diff --git a/.gitignore b/.gitignore
index bdcb067fc26d2a18ed88034ab616c08095794e17..fdc61ee8251a63953e5e92cff602e7ace9653700 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,8 +5,6 @@ node_modules
 /.tf_configure.bazelrc
 /bazel-*
 /bazel_pip
-/third_party/eigen3/mkl_include
-/third_party/mkl/*
 /tools/python_bin_path.sh
 /tools/git/gen
 /pip_test
diff --git a/CODEOWNERS b/CODEOWNERS
new file mode 100644
index 0000000000000000000000000000000000000000..69393c377589cc707d6c079e575346564b9c3fbf
--- /dev/null
+++ b/CODEOWNERS
@@ -0,0 +1,52 @@
+# Where component owners are known, add them here.
+
+tensorflow/core/platform/windows/* @mrry
+tensorflow/java/* @asimshankar
+tensorflow/tensorboard/* @jart @dandelionmane
+tensorflow/tools/docs/* @markdaoust
+
+# contrib
+
+# NEED OWNER: tensorflow/contrib/avro/*
+tensorflow/contrib/batching/* @alextp @chrisolston
+tensorflow/contrib/bayesflow/* @ebrevdo @rsepassi @jvdillon
+tensorflow/contrib/cmake/* @mrry @benoitsteiner
+tensorflow/contrib/copy_graph/* @tucker @poxvoculi
+tensorflow/contrib/crf/* @kentonl
+tensorflow/contrib/data/* @mrry
+tensorflow/contrib/distributions/* @jvdillon @langmore @rsepassi
+tensorflow/contrib/factorization/* @agarwal-ashish @xavigonzalvo
+tensorflow/contrib/ffmpeg/* @fredbertsch
+# NEED OWNERT: tensorflow/contrib/framework/*
+tensorflow/contrib/graph_editor/* @purpledog
+# NEED OWNER: tensorflow/contrib/grid_rnn/*
+tensorflow/contrib/hvx/* @satok16
+tensorflow/contrib/imperative/* @keveman
+tensorflow/contrib/integrate/* @shoyer
+tensorflow/contrib/kernel_methods/* @petrosmol
+tensorflow/contrib/ios_examples/* @petewarden
+tensorflow/contrib/labeled_tensor/* @shoyer
+tensorflow/contrib/layers/* @fchollet @martinwicke
+tensorflow/contrib/learn/* @martinwicke @ispirmustafa @alextp
+tensorflow/contrib/linalg/* @langmore
+tensorflow/contrib/linear_optimizer/* @petrosmol @andreasst @katsiapis
+tensorflow/contrib/lookup/* @ysuematsu @andreasst
+tensorflow/contrib/losses/* @alextp @ispirmustafa
+tensorflow/contrib/makefile/* @petewarden @satok16 @wolffg
+tensorflow/contrib/metrics/* @alextp @honkentuber @ispirmustafa
+tensorflow/contrib/nccl/* @cwhipkey @zheng-xq
+tensorflow/contrib/opt/* @strategist333
+tensorflow/contrib/pi_examples/* @maciekcc
+tensorflow/contrib/quantization/* @petewarden @cwhipkey @keveman
+tensorflow/contrib/rnn/* @ebrevdo
+tensorflow/contrib/saved_model/* @nfiedel @sukritiramesh
+tensorflow/contrib/seq2seq/* @lukaszkaiser
+tensorflow/contrib/session_bundle/* @nfiedel @sukritiramesh
+tensorflow/contrib/slim/* @sguada @thenbasilmanran
+tensorflow/contrib/stateless/* @girving
+tensorflow/contrib/tensor_forest/* @gilberthendry @thomascolthurst
+tensorflow/contrib/testing/* @dandelionmane
+tensorflow/contrib/timeseries/* @allenlavoie
+tensorflow/contrib/tpu/* @frankchn @saeta @jhseu
+tensorflow/contrib/training/* @joel-shor @ebrevdo
+tensorflow/contrib/util/* @sherrym
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000000000000000000000000000000000..10fd595fec7f240c3fdc871e1f32cc83f2ffd46d
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,70 @@
+# TensorFlow Code of Conduct
+
+In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
+
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic address, without explicit permission
+* Conduct which could reasonably be considered inappropriate for the forum in which it occurs. 
+
+All TensorFlow forums and spaces are meant for professional interactions, and any behavior which could reasonably be considered inappropriate in a professional setting is unacceptable.
+
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
+
+
+## Scope
+
+This Code of Conduct applies to all content on tensorflow.org, TensorFlow’s GitHub organization, or any other official TensorFlow web presence allowing for community interactions, as well as at all official TensorFlow events, whether offline or online.
+
+The Code of Conduct also applies within project spaces and in public spaces whenever an individual is representing TensorFlow or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed or de facto representative at an online or offline event. 
+
+
+## Conflict Resolution
+
+Conflicts in an open source project can take many forms, from someone having a bad day and using harsh and hurtful language in the issue queue, to more serious instances such as sexist/racist statements or threats of violence, and everything in between.
+
+If the behaviour is threatening or harassing, or for other reasons requires immediate escalation, please see below.
+
+However, for the vast majority of issues, we aim to empower individuals to first resolve conflicts themselves, asking for help when needed, and only after that fails to escalate further. This approach gives people more control over the outcome of their dispute. 
+
+If you are experiencing or witnessing conflict, we ask you to use the following escalation strategy to address the conflict:
+
+1. Address the perceived conflict directly with those involved, preferably in a real-time medium. 
+2. If this fails, get a third party (e.g. a mutual friend, and/or someone with background on the issue, but not involved in conflict) to intercede.
+3. If you are still unable to resolve the conflict, and you believe it rises to harassment or another code of conduct violation, report it.
+
+
+## Reporting Violations
+
+Violations of the Code of Conduct can be reported to TensorFlow’s Project Steward at conduct@tensorflow.org. The Project Steward will determine whether the Code of Conduct was violated, and will issue an appropriate sanction, possibly including a written warning or expulsion from the project, project sponsored spaces, or project forums. We ask that you make a good-faith effort to resolve your conflict via the conflict resolution policy before submitting a report.
+
+Violations of the Code of Conduct can occur in any setting, even those unrelated to the project. We will only consider complaints about conduct that has occurred within one year of the report.
+
+
+## Enforcement
+
+If the Project Steward receives a report alleging a violation of the Code of Conduct, the Project Steward will notify the accused of the report, and provide them an opportunity to discuss the report before a sanction is issued. The Project Steward will do their utmost to keep the reporter anonymous. If the act is ongoing (such as someone engaging in harassment), or involves a threat to anyone's safety (e.g. threats of violence), the Project Steward may issue sanctions without notice.
+
+
+## Attribution
+
+This Code of Conduct is adapted from the Contributor Covenant, version 1.4, available at http://contributor-covenant.org/version/1/4, and includes some aspects of the Geek Feminism Code of Conduct and the Drupal Code of Conduct.
diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
index 6f4c048ce83fb47a611b5dfe08e0fde0779994c0..2bf2c754cf64ec3bac22a22fbafcebbd4dc54bf4 100644
--- a/ISSUE_TEMPLATE.md
+++ b/ISSUE_TEMPLATE.md
@@ -1,11 +1,12 @@
 Please go to Stack Overflow for help and support:
 
-http://stackoverflow.com/questions/tagged/tensorflow
+https://stackoverflow.com/questions/tagged/tensorflow
 
 If you open a GitHub issue, here is our policy:
 
 1. It must be a bug or a feature request.
 2. The form below must be filled out.
+3. It shouldn't be a TensorBoard issue. Those go [here](https://github.com/tensorflow/tensorboard/issues).
 
 **Here's why we have that policy**: TensorFlow developers respond to issues. We want to focus on work that benefits the whole community, e.g., fixing bugs and adding features. Support only helps individuals. GitHub also notifies thousands of people when issues are filed. We want them to see you communicating an interesting problem, rather than being redirected to Stack Overflow.
 
@@ -16,6 +17,7 @@ If you open a GitHub issue, here is our policy:
 - **OS Platform and Distribution (e.g., Linux Ubuntu 16.04)**:
 - **TensorFlow installed from (source or binary)**:
 - **TensorFlow version (use command below)**:
+- **Python version**: 
 - **Bazel version (if compiling from source)**:
 - **CUDA/cuDNN version**:
 - **GPU model and memory**:
diff --git a/README.md b/README.md
index e7dbf57b25a6276498ce26f1df41e2a54d1fc159..4e17182f8117f86fbdfc96dd0926804fda0c310d 100644
--- a/README.md
+++ b/README.md
@@ -9,37 +9,38 @@
 | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) |
 
 **TensorFlow** is an open source software library for numerical computation using
-data flow graphs.  Nodes in the graph represent mathematical operations, while
+data flow graphs.  The graph nodes represent mathematical operations, while
 the graph edges represent the multidimensional data arrays (tensors) that flow
 between them.  This flexible architecture lets you deploy computation to one
 or more CPUs or GPUs in a desktop, server, or mobile device without rewriting
 code.  TensorFlow also includes TensorBoard, a data visualization toolkit.
 
 TensorFlow was originally developed by researchers and engineers
-working on the Google Brain team within Google's Machine Intelligence research
+working on the Google Brain team within Google's Machine Intelligence Research
 organization for the purposes of conducting machine learning and deep neural
 networks research.  The system is general enough to be applicable in a wide
 variety of other domains, as well.
 
-**If you'd like to contribute to TensorFlow, be sure to review the [contribution
+**If you want to contribute to TensorFlow, be sure to review the [contribution
 guidelines](CONTRIBUTING.md).**
 
 **We use [GitHub issues](https://github.com/tensorflow/tensorflow/issues) for
-tracking requests and bugs, but please see
-[Community](https://www.tensorflow.org/community/) for general questions
-and discussion.**
+tracking requests and bugs. So please see 
+[TensorFlow Discuss](https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss) for general questions
+and discussion, and please direct specific questions to [Stack Overflow](https://stackoverflow.com/questions/tagged/tensorflow).**
 
 ## Installation
-*See [Installing TensorFlow](https://www.tensorflow.org/install/) for instructions on how to install our release binaries or how to build from source.*
+*See [Installing TensorFlow](https://www.tensorflow.org/install) for instructions on how to install our release binaries or how to build from source.*
 
 People who are a little more adventurous can also try our nightly binaries:
 
-* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
-* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
-* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
-* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc2-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc2-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/))
-* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc2-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc2-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/))
+
+* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.3.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.3.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.3.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
+* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.3.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.3.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.3.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
+* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.3.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.3.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
+* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.3.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.3.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
+* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.3.0rc0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.3.0rc0-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/))
+* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.3.0rc0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.3.0rc0-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/))
 * Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
 ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))
 
@@ -55,16 +56,17 @@ $ python
 'Hello, TensorFlow!'
 >>> a = tf.constant(10)
 >>> b = tf.constant(32)
->>> sess.run(a+b)
+>>> sess.run(a + b)
 42
 >>>
 ```
 
 ## For more information
 
-* [TensorFlow website](https://tensorflow.org)
-* [TensorFlow whitepaper](http://download.tensorflow.org/paper/whitepaper2015.pdf)
+* [TensorFlow website](https://www.tensorflow.org)
+* [TensorFlow White Papers](https://www.tensorflow.org/about/bib)
 * [TensorFlow Model Zoo](https://github.com/tensorflow/models)
 * [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730)
+* [TensorFlow course at Stanford](https://web.stanford.edu/class/cs20si)
 
-The TensorFlow community has created amazing things with TensorFlow, please see the [resources section of tensorflow.org](https://www.tensorflow.org/about/#community) for an incomplete list.
+Learn more about the TensorFlow community at the [community page of tensorflow.org](https://www.tensorflow.org/community) for a few ways to participate.
diff --git a/RELEASE.md b/RELEASE.md
index d22c5c62fe01e5d3e2bc0cd4657aff692ee734bf..e7c086164a261b1169446738abe3b5e390d2d798 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,112 @@
+# Release 1.3.0
+
+## Major Features and Improvements
+* Added canned estimators to Tensorflow library. List of added estimators: `DNNClassifier`, `DNNRegressor`, `LinearClassifer`, `LinearRegressor`, `DNNLinearCombinedClassifier`, `DNNLinearCombinedRegressor`.
+* All our prebuilt binaries have been built with cuDNN 6.
+* Adds a file cache to the GCS filesystem with configurable max staleness for file contents. This permits caching of file contents across close/open boundaries.
+* Added an axis parameter to `tf.gather`.
+* Added a `constant_values` keyword argument to `tf.pad`.
+* Adds `Dataset.interleave` transformation.
+* Add `ConcatenateDataset` to concatenate two datasets.
+* Added Mobilenet support to TensorFlow for Poets training script.
+* Adds a block cache to the GCS filesystem with configurable block size and count.
+* SinhArcSinh bijector added.
+* Added `Dataset.list_files` API.
+* Introduces new operations and Python bindings for the Cloud TPU.
+* Adding TensorFlow-iOS CocoaPod for symmetry with tensorflow-android.
+* Introduces base implementations of ClusterResolvers.
+* Unify memory representations of TensorShape and PartialTensorShape. As a consequence, tensors now have a maximum of 254 dimensions, not 255.
+* Changed references to LIBXSMM to use version 1.8.1.
+* TensorFlow Debugger (tfdbg): Display summaries of numeric tensor values with the `-s` flag to command `print_tensor` or `pt`.
+* Initial release of the statistical distribution library `tf.distributions`.
+* GPU kernels and speed improvements for for unary `tf.where` and `tf.nn.top_k`.
+* Monotonic Attention wrappers added to `tf.contrib.seq2seq`.
+
+## Breaking Changes to the API
+* `tf.RewriterConfig` was removed from the Python API after being available in 1.2 release candidates (it was never in an actual release). Graph rewriting is still available, just not as `tf.RewriterConfig`. Instead add an explicit import.
+* Breaking change to `tf.contrib.data.Dataset` APIs that expect a nested structure. Lists are now converted to `tf.Tensor` implicitly. You may need to change uses of lists to tuples in existing code. In addition, dicts are now supported as a nested structure.
+
+## Changes to contrib APIs
+* Adds tf.contrib.nn.rank_sampled_softmax_loss, a sampled-softmax variant that can improve rank loss.
+* `tf.contrib.metrics`.{streaming_covariance,streaming_pearson_correlation} modified to return nan when they have seen less or equal to 1 unit of weight.
+* Adds time series models to contrib. See contrib/timeseries/README.md for details.
+* Adds FULLY_CONNECTED Op to tensorflow/contrib/lite/schema.fbs
+
+## Bug Fixes and Other Changes
+* Fixes 'strides' and 'begin' dtype mismatch when slicing using int64 Tensor index in python.
+* Improved convolution padding documentation.
+* Add a tag constant, gpu, to present graph with GPU support.
+* `saved_model.utils` now support SparseTensors transparently.
+* A more efficient implementation of non-max suppression.
+* Add support for the shrinkage-type L2 to FtrlOptimizer in addition to the online L2 it already supports.
+* Fix negative variance in moments calculation.
+* Expand UniqueOp Benchmark Tests to cover more collision cases.
+* Improves stability of GCS filesystem on Mac.
+* Add time estimation to HloCostAnalysis.
+* Fixed the bug in Estimator that params in constructor was not a deepcopy of the user provided one. This bugs inadvertently enabled user to mutate the params after the creation of Estimator, leading to potentially undefined behavior.
+* Added None check for save_path in `saver.restore`.
+* Register devices under their legacy names in device_mgr to ease the transition to clusterspec-propagated configurations.
+* VectorExponential added to distributions.
+* Add a bitwise module with bitwise_and, bitwise_or, bitwise_xor, and invert functions.
+* Add fixed-grid ODE integration routines.
+* Allow passing bounds to ScipyOptimizerInterface.
+* Correctness fixes for fft_length parameter to `tf.spectral.rfft` & `tf.spectral.irfft`.
+* Exported model signatures using the 'predict' method will no longer have their input and output keys silently ignored and rewritten to 'inputs' and 'outputs'. If a model was exported with different names before 1.2, and is now served with tensorflow/serving, it will accept requests using 'inputs' and 'outputs'. Starting at 1.2, such a model will accept the keys specified during export. Therefore, inference requests using 'inputs' and 'outputs' may start to fail. To fix this, either update any inference clients to send requests with the actual input and output keys used by the trainer code, or conversely, update the trainer code to name the input and output Tensors 'inputs' and 'outputs', respectively. Signatures using the 'classify' and 'regress' methods are not affected by this change; they will continue to standardize their input and output keys as before.
+* Add in-memory caching to the Dataset API.
+* Set default end_of_sequence variable in datasets iterators to false.
+* [Performance] Increase performance of `tf.layers.con2d` when setting use_bias=True by 2x by using nn.bias_add.
+* Update iOS examples to use CocoaPods, and moved to tensorflow/examples/ios.
+* Adds a family= attribute in `tf.summary` ops to allow controlling the tab name used in Tensorboard for organizing summaries.
+* When GPU is configured, do not require --config=cuda, instead, automatically build for GPU if this is requested in the configure script.
+* Fix incorrect sampling of small probabilities in CPU/GPU multinomial.
+* Add a list_devices() API on sessions to list devices within a cluster. Additionally, this change augment the ListDevices master API to support specifying a session.
+* Allow uses of over-parameterized separable convolution.
+* TensorForest multi-regression bug fix.
+* Framework now supports armv7, cocoapods.org now displays correct page.
+* Script to create iOS framework for CocoaPods.
+* Android releases of TensorFlow are now pushed to jcenter for easier integration into apps. See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/android/README.md for more details.
+* Fixed a bug that prevented tfdbg from functioning with multi-GPU setups.
+* Fixed a bug that prevented tfdbg from working with `tf.Session.make_callable`.
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+4F2E4A2E, Adriano Carmezim, Adrià Arrufat, Alan Yee, Alex Lattas, Alex Rothberg,
+Alexandr Baranezky, Ali Siddiqui, Andreas Solleder, Andrei Costinescu, Andrew Hundt,
+Androbin, Andy Kernahan, Anish Shah, Anthony Platanios, Arvinds-Ds, b1rd, Baptiste
+Arnaud, Ben Mabey, Benedikt Linse, Beomsu Kim, Bo Wang, Boyuan Deng, Brett Koonce,
+Bruno Rosa, Carl Thomé, Changming Sun, Chase Roberts, Chirag Bhatia, Chris Antaki,
+Chris Hoyean Song, Chris Tava, Christos Nikolaou, Croath Liu, cxx, Czxck001, Daniel
+Ylitalo, Danny Goodman, Darren Garvey, David Brailovsky, David Norman, DavidNorman,
+davidpham87, ddurham2, Dhruv, DimanNe, Drew Hintz, Dustin Tran, Earthson Lu, ethiraj,
+Fabian Winnen, Fei Sun, Freedom" Koan-Sin Tan, Fritz Obermeyer, Gao, Xiang, Gautam,
+Guenther Schmuelling, Gyu-Ho Lee, Hauke Brammer, horance, Humanity123, J Alammar,
+Jayeol Chun, Jeroen BéDorf, Jianfei Wang, jiefangxuanyan, Jing Jun Yin, Joan Puigcerver,
+Joel Hestness, Johannes Mayer, John Lawson, Johnson145, Jon Malmaud, Jonathan Alvarez-Gutierrez,
+Juang, Yi-Lin, Julian Viereck, Kaarthik Sivashanmugam, Karl Lessard, karl@kubx.ca, Kevin
+Carbone, Kevin Van Der Burgt, Kongsea, ksellesk, lanhin, Lef Ioannidis, Liangliang He,
+Louis Tiao, Luke Iwanski, LáSzló Csomor, magixsno, Mahmoud Abuzaina, Marcel Hlopko, Mark
+Neumann, Maxwell Paul Brickner, mdfaijul, MichaëL Defferrard, Michał JastrzęBski, Michele
+Colombo, Mike Brodie, Mosnoi Ion, mouradmourafiq, myPrecious, Nayana Thorat,
+Neeraj Kashyap, Nelson Liu, Niranjan Hasabnis, Olivier Moindrot, orome, Pankaj Gupta, Paul
+Van Eck, peeyush18, Peng Yu, Pierre, preciousdp11, qjivy, Raingo, raoqiyu, ribx, Richard S.
+Imaoka, Rishabh Patel, Robert Walecki, Rockford Wei, Ryan Kung, Sahil Dua, Sandip Giri, Sayed
+Hadi Hashemi, sgt101, Shitian Ni, Shuolongbj, Siim PõDer, Simon Perkins, sj6077, SOLARIS,
+Spotlight0xff, Steffen Eberbach, Stephen Fox, superryanguo, Sven Mayer, Tapan Prakash,
+Tiago Morais Morgado, Till Hoffmann, Tj Rana, Vadim Markovtsev, vhasanov, Wei Wu,
+windead, Yan (Asta) Li, Yan Chen, Yann Henon, Yi Wang, Yong Tang, yorkie, Yuan (Terry)
+Tang, Yuxin Wu, zhengjiajin, zhongzyd, 黄璞
+
+We are also grateful to all who filed issues or helped resolve them, asked and
+answered questions, and were part of inspiring discussions.
+
+# Release 1.2.1
+
+## Bug Fixes and Other Changes
+* Updating markdown version required to >= 2.6.8.
+* Support tensors as dropout rates again, by removing the min(max(..))
+
 # Release 1.2.0
 
 ## Major Features and Improvements
@@ -59,37 +168,6 @@
   integration into apps. See
   https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/android/README.md
   for more details.
-* RNNCells' variable names have been renamed for consistency with Keras layers.
-  Specifically, the previous variable names "weights" and "biases" have
-  been changed to "kernel" and "bias", respectively.
-  This may cause backward incompatibility with regard to your old
-  checkpoints containing such RNN cells, in which case you can use the tool
-  [checkpoint_convert script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py)
-  to convert the variable names in your old checkpoints.
-* Many of the RNN functions and classes that were in the `tf.nn` namespace
-  before the 1.0 release and which were moved to `tf.contrib.rnn` have now
-  been moved back to the core namespace.  This includes
-  `RNNCell`, `LSTMCell`, `GRUCell`, and a number of other cells.  These
-  now reside in `tf.nn.rnn_cell` (with aliases in `tf.contrib.rnn` for backwards
-  compatibility).  The original `tf.nn.rnn` function is now `tf.nn.static_rnn`,
-  and the bidirectional static and state saving static rnn functions are also
-  now back in the `tf.nn` namespace.
-
-  Notable exceptions are the `EmbeddingWrapper`, `InputProjectionWrapper` and
-  `OutputProjectionWrapper`,  which will slowly be moved to deprecation
-  in `tf.contrib.rnn`.  These are inefficient wrappers that should often
-  be replaced by calling `embedding_lookup` or `layers.dense` as pre- or post-
-  processing of the rnn.  For RNN decoding, this functionality has been replaced
-  with an alternative API in `tf.contrib.seq2seq`.
-* Intel MKL Integration (https://software.intel.com/en-us/articles/tensorflow-optimizations-on-modern-intel-architecture). Intel developed a number of
-  optimized deep learning primitives: In addition to matrix multiplication and
-  convolution, these building blocks include:
-  Direct batched convolution
-  Pooling: maximum, minimum, average
-  Normalization: LRN, batch normalization
-  Activation: rectified linear unit (ReLU)
-  Data manipulation: multi-dimensional transposition (conversion), split,
-  concat, sum and scale.
 
 ## Deprecations
 
@@ -113,6 +191,8 @@
   checkpoints containing such RNN cells, in which case you can use the
   [checkpoint_convert script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py)
   to convert the variable names in your old checkpoints.
+* Added `tf.contrib.kernel_methods` module with Ops and estimators for primal
+  (explicit) kernel methods in TensorFlow.
 
 ## Bug Fixes and Other Changes
 * In python, `Operation.get_attr` on type attributes returns the Python DType
diff --git a/WORKSPACE b/WORKSPACE
index 74ce13f4e88710050ac3f5aa22e6de0375da9694..6b5d24560ca416bcff10355cf760e6c4af928137 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -32,6 +32,9 @@ load("//tensorflow:workspace.bzl", "tf_workspace")
 #    name="androidndk",
 #    path="<PATH_TO_NDK>",
 #    # This needs to be 14 or higher to compile TensorFlow.
+#    # Please specify API level to >= 21 to build for 64-bit
+#    # archtectures or the Android NDK will automatically select biggest
+#    # API level that it supports without notice.
 #    # Note that the NDK version is not the API level.
 #    api_level=14)
 
diff --git a/arm_compiler.BUILD b/arm_compiler.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b231d0180e3953e72c350a53544419ae634a355a
--- /dev/null
+++ b/arm_compiler.BUILD
@@ -0,0 +1,81 @@
+package(default_visibility = ['//visibility:public'])
+
+filegroup(
+  name = 'gcc',
+  srcs = [
+    'bin/arm-linux-gnueabihf-gcc',
+  ],
+)
+
+filegroup(
+  name = 'ar',
+  srcs = [
+    'bin/arm-linux-gnueabihf-ar',
+  ],
+)
+
+filegroup(
+  name = 'ld',
+  srcs = [
+    'bin/arm-linux-gnueabihf-ld',
+  ],
+)
+
+filegroup(
+  name = 'nm',
+  srcs = [
+    'bin/arm-linux-gnueabihf-nm',
+  ],
+)
+
+filegroup(
+  name = 'objcopy',
+  srcs = [
+    'bin/arm-linux-gnueabihf-objcopy',
+  ],
+)
+
+filegroup(
+  name = 'objdump',
+  srcs = [
+    'bin/arm-linux-gnueabihf-objdump',
+  ],
+)
+
+filegroup(
+  name = 'strip',
+  srcs = [
+    'bin/arm-linux-gnueabihf-strip',
+  ],
+)
+
+filegroup(
+  name = 'as',
+  srcs = [
+    'bin/arm-linux-gnueabihf-as',
+  ],
+)
+
+filegroup(
+  name = 'compiler_pieces',
+  srcs = glob([
+    'arm-linux-gnueabihf/**',
+    'libexec/**',
+    'lib/gcc/arm-linux-gnueabihf/**',
+    'include/**',
+  ]),
+)
+
+filegroup(
+  name = 'compiler_components',
+  srcs = [
+    ':gcc',
+    ':ar',
+    ':ld',
+    ':nm',
+    ':objcopy',
+    ':objdump',
+    ':strip',
+    ':as',
+  ],
+)
diff --git a/configure b/configure
index 602124225fe0712135798a779e509a16fe2ccc79..9c21d2b03a27714f05094667691e74c16fa89f35 100755
--- a/configure
+++ b/configure
@@ -3,879 +3,12 @@
 set -e
 set -o pipefail
 
-MIN_BAZEL_VERSION=0.4.5
-
-# Find out the absolute path to where ./configure resides
-pushd `dirname $0` > /dev/null
-SOURCE_BASE_DIR=`pwd -P`
-popd > /dev/null
-
-PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
-
-function is_linux() {
-  [[ "${PLATFORM}" == "linux" ]]
-}
-
-function is_macos() {
-  [[ "${PLATFORM}" == "darwin" ]]
-}
-
-function is_windows() {
-  # On windows, the shell script is actually running in msys
-  [[ "${PLATFORM}" =~ msys_nt*|mingw*|cygwin*|uwin* ]]
-}
-
-function sed_in_place() {
-  sed -e $1 $2 > "$2.bak"
-  mv "$2.bak" $2
-}
-
-function write_to_bazelrc() {
-  echo "$1" >> .tf_configure.bazelrc
-}
-
-function write_action_env_to_bazelrc() {
-  write_to_bazelrc "build --action_env $1=\"$2\""
-}
-
-function python_path {
-  "$PYTHON_BIN_PATH" - <<END
-from __future__ import print_function
-import site
-import os
-
-try:
-  input = raw_input
-except NameError:
-  pass
-
-python_paths = []
-if os.getenv('PYTHONPATH') is not None:
-  python_paths = os.getenv('PYTHONPATH').split(':')
-try:
-  library_paths = site.getsitepackages()
-except AttributeError:
- from distutils.sysconfig import get_python_lib
- library_paths = [get_python_lib()]
-all_paths = set(python_paths + library_paths)
-
-paths = []
-for path in all_paths:
-  if os.path.isdir(path):
-    paths.append(path)
-
-print(",".join(paths))
-END
-}
-
-function setup_python {
-  ## Set up python-related environment settings:
-  while true; do
-    fromuser=""
-    if [ -z "$PYTHON_BIN_PATH" ]; then
-      default_python_bin_path=$(which python || which python3 || true)
-      read -p "Please specify the location of python. [Default is $default_python_bin_path]: " PYTHON_BIN_PATH
-      fromuser="1"
-      if [ -z "$PYTHON_BIN_PATH" ]; then
-        PYTHON_BIN_PATH=$default_python_bin_path
-      fi
-    fi
-    if [ -e "$PYTHON_BIN_PATH" ]; then
-      break
-    fi
-    echo "Invalid python path. ${PYTHON_BIN_PATH} cannot be found" 1>&2
-    if [ -z "$fromuser" ]; then
-      exit 1
-    fi
-    PYTHON_BIN_PATH=""
-    # Retry
-  done
-
-  if [ -z "$PYTHON_LIB_PATH" ]; then
-    # Split python_path into an array of paths, this allows path containing spaces
-    IFS=',' read -r -a python_lib_path <<< "$(python_path)"
-
-    if [ 1 = "$USE_DEFAULT_PYTHON_LIB_PATH" ]; then
-      PYTHON_LIB_PATH=${python_lib_path[0]}
-      echo "Using python library path: $PYTHON_LIB_PATH"
-
-    else
-      echo "Found possible Python library paths:"
-      for x in "${python_lib_path[@]}"; do
-        echo "  $x"
-      done
-      set -- "${python_lib_path[@]}"
-      echo "Please input the desired Python library path to use.  Default is [$1]"
-      read b || true
-      if [ "$b" == "" ]; then
-        PYTHON_LIB_PATH=${python_lib_path[0]}
-        echo "Using python library path: $PYTHON_LIB_PATH"
-      else
-        PYTHON_LIB_PATH="$b"
-      fi
-    fi
-  fi
-
-  if [ ! -x "$PYTHON_BIN_PATH" ]  || [ -d "$PYTHON_BIN_PATH" ]; then
-    echo "PYTHON_BIN_PATH is not executable.  Is it the python binary?"
-    exit 1
-  fi
-
-  local python_major_version
-  python_major_version=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; import sys; print(sys.version_info[0]);' | head -c1)
-  if [ -z "$python_major_version" ]; then
-    echo -e "\n\nERROR: Problem getting python version.  Is $PYTHON_BIN_PATH the correct python binary?"
-    exit 1
-  fi
-
-  # Convert python path to Windows style before writing into bazel.rc
-  if is_windows; then
-    PYTHON_BIN_PATH="$(cygpath -m "$PYTHON_BIN_PATH")"
-    PYTHON_LIB_PATH="$(cygpath -m "$PYTHON_LIB_PATH")"
-  fi
-
-  # Set-up env variables used by python_configure.bzl
-  write_action_env_to_bazelrc "PYTHON_BIN_PATH" "$PYTHON_BIN_PATH"
-  write_action_env_to_bazelrc "PYTHON_LIB_PATH" "$PYTHON_LIB_PATH"
-  write_to_bazelrc "build --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\""
-  write_to_bazelrc "build --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\""
-  write_to_bazelrc "build --force_python=py$python_major_version"
-  write_to_bazelrc "build --host_force_python=py$python_major_version"
-  write_to_bazelrc "build --python${python_major_version}_path=\"$PYTHON_BIN_PATH\""
-  write_to_bazelrc "test --force_python=py$python_major_version"
-  write_to_bazelrc "test --host_force_python=py$python_major_version"
-  write_to_bazelrc "test --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\""
-  write_to_bazelrc "test --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\""
-  write_to_bazelrc "run --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\""
-  write_to_bazelrc "run --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\""
-
-  # Write tools/python_bin_path.sh
-  echo "export PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" > tools/python_bin_path.sh
-}
-
-function version {
-  echo "$@" | awk -F. '{ printf("%03d%03d%03d\n", $1,$2,$3); }';
-}
-
-
-bazel version > bazel.version
-curr_bazel_version=$(head -n 1 bazel.version | cut -d ' ' -f3)
-rm -f bazel.version
-
-
-echo "You have bazel $curr_bazel_version installed."
-if [ -z "$curr_bazel_version" ]; then
-  echo "WARNING: current bazel installation is not a release version."
-  echo "Make sure you are running at least bazel $MIN_BAZEL_VERSION."
-elif [ "$(version "$MIN_BAZEL_VERSION")" -gt "$(version "$curr_bazel_version")" ]; then
-  echo "Please upgrade your bazel installation to version $MIN_BAZEL_VERSION or higher to build TensorFlow!"
-  echo "Exiting..."
-  exit 1
-fi
-
-# This file contains customized config settings.
-rm -f .tf_configure.bazelrc
-touch .tf_configure.bazelrc
-if [[ ! -e .bazelrc ]]; then
-  if [[ -e "${HOME}/.bazelrc" ]]; then
-    echo "import ${HOME}/.bazelrc" >.bazelrc
-  else
-    touch .bazelrc
-  fi
-fi
-sed_in_place "/tf_configure/d" .bazelrc
-echo "import %workspace%/.tf_configure.bazelrc" >> .bazelrc
-
-# Delete any leftover BUILD files from the Makefile build, which would interfere
-# with Bazel parsing.
-MAKEFILE_DOWNLOAD_DIR=tensorflow/contrib/makefile/downloads
-if [ -d "${MAKEFILE_DOWNLOAD_DIR}" ]; then
-  find ${MAKEFILE_DOWNLOAD_DIR} -type f -name '*BUILD' -delete
-fi
-
-setup_python
-
-## Set up MKL related environment settings
-while [ "$TF_NEED_MKL" == "" ]; do
-  fromuser=""
-  read -p "Do you wish to build TensorFlow with MKL support? [y/N] " INPUT
-  fromuser="1"
-  case $INPUT in
-    [Yy]* ) echo "MKL support will be enabled for TensorFlow"; TF_NEED_MKL=1;;
-    [Nn]* ) echo "No MKL support will be enabled for TensorFlow"; TF_NEED_MKL=0;;
-    "" ) echo "No MKL support will be enabled for TensorFlow"; TF_NEED_MKL=0;;
-    * ) echo "Invalid selection: " $INPUT;;
-  esac
-done
-
-OSNAME=`uname -s`
-
-if [ "$TF_NEED_MKL" == "1" ]; then # TF_NEED_MKL
-  while [ "$TF_DOWNLOAD_MKL" == "" ]; do
-    fromuser=""
-    read -p "Do you wish to download MKL LIB from the web? [Y/n] " INPUT
-    fromuser="1"
-    case $INPUT in
-      [Yy]* ) TF_DOWNLOAD_MKL=1;;
-      [Nn]* ) TF_DOWNLOAD_MKL=0;;
-      "" )    TF_DOWNLOAD_MKL=1;;
-      * )     echo "Invalid selection: " $INPUT; exit 1;;
-    esac
-  done
-
-  if [[ "$TF_DOWNLOAD_MKL" == "1" ]]; then
-    DST=`dirname $0`
-    ARCHIVE_BASENAME=mklml_lnx_2018.0.20170425.tgz
-    GITHUB_RELEASE_TAG=v0.7
-    MKLURL="https://github.com/01org/mkl-dnn/releases/download/$GITHUB_RELEASE_TAG/$ARCHIVE_BASENAME"
-    if ! [ -e "${DST}/third_party/mkl/${ARCHIVE_BASENAME}" ]; then
-      curl -fSsL -o "${DST}/third_party/mkl/${ARCHIVE_BASENAME}" "${MKLURL}"
-    fi
-    tar -xzf $DST/third_party/mkl/$ARCHIVE_BASENAME -C $DST/third_party/mkl/
-    extracted_dir_name="${ARCHIVE_BASENAME%.*}"
-    MKL_INSTALL_PATH=$DST/third_party/mkl/$extracted_dir_name
-    MKL_INSTALL_PATH=`${PYTHON_BIN_PATH} -c "import os; print(os.path.realpath(os.path.expanduser('${MKL_INSTALL_PATH}')))"`
-
-  else
-    default_mkl_path=/opt/intel/mklml
-    fromuser=""
-    if [ -z "$MKL_INSTALL_PATH" ]; then
-      read -p "Please specify the location where MKL is installed. [Default is $default_mkl_path]: " MKL_INSTALL_PATH
-      fromuser="1"
-    fi
-    if [ -z "$MKL_INSTALL_PATH" ]; then
-      MKL_INSTALL_PATH=$default_mkl_path
-    fi
-    # Result returned from "read" will be used unexpanded. That make "~" unusable.
-    # Going through one more level of expansion to handle that.
-    MKL_INSTALL_PATH=`${PYTHON_BIN_PATH} -c "import os; print(os.path.realpath(os.path.expanduser('${MKL_INSTALL_PATH}')))"`
-  fi
-
-  if [ "$OSNAME" == "Linux" ]; then
-      # Full MKL configuration
-      MKL_RT_LIB_PATH="lib/intel64/libmkl_rt.so" #${TF_MKL_EXT}#TODO version?
-      MKL_RT_OMP_LIB_PATH="../compiler/lib/intel64/libiomp5.so" #TODO VERSION?
-
-      # MKL-ML configuration
-      MKL_ML_LIB_PATH="lib/libmklml_intel.so" #${TF_MKL_EXT}#TODO version?
-      MKL_ML_OMP_LIB_PATH="lib/libiomp5.so" #TODO VERSION?
-  elif [ "$OSNAME" == "Darwin" ]; then
-      echo "Darwin is unsupported yet";
-      exit 1
-  fi
-
-  if [ -e "$MKL_INSTALL_PATH/${MKL_ML_LIB_PATH}" ]; then
-      ln -sf $MKL_INSTALL_PATH/${MKL_ML_LIB_PATH} third_party/mkl/
-      ln -sf $MKL_INSTALL_PATH/${MKL_ML_OMP_LIB_PATH} third_party/mkl/
-      ln -sf $MKL_INSTALL_PATH/include third_party/mkl/
-      ln -sf $MKL_INSTALL_PATH/include third_party/eigen3/mkl_include
-      loc=$(locate -e libdl.so.2 | sed -n 1p)
-      ln -sf $loc third_party/mkl/libdl.so.2
-  elif [ -e "$MKL_INSTALL_PATH/${MKL_RT_LIB_PATH}" ]; then
-      ln -sf $MKL_INSTALL_PATH/${MKL_RT_LIB_PATH} third_party/mkl/
-      ln -sf $MKL_INSTALL_PATH/${MKL_RT_OMP_LIB_PATH} third_party/mkl/
-      ln -sf $MKL_INSTALL_PATH/include third_party/mkl/
-      ln -sf $MKL_INSTALL_PATH/include third_party/eigen3/mkl_include
-      loc=$(locate -e libdl.so.2 | sed -n 1p)
-      ln -sf $loc third_party/mkl/libdl.so.2
-  else
-      echo "ERROR: $MKL_INSTALL_PATH/${MKL_ML_LIB_PATH} nor $MKL_INSTALL_PATH/${MKL_RT_LIB_PATH} exists";
-      exit 1
-  fi
-
-cat > third_party/mkl/mkl.config <<EOF
-# MKL_INSTALL_PATH refers to the location of MKL root folder. The MKL header and library
-# files can be either in this directory, or under include/ and lib64/
-MKL_INSTALL_PATH=$MKL_INSTALL_PATH
-EOF
-
-fi # TF_NEED_MKL
-## End MKL setup
-
-## Set up architecture-dependent optimization flags.
-if [ -z "$CC_OPT_FLAGS" ]; then
-  default_cc_opt_flags="-march=native"
-  read -p "Please specify optimization flags to use during compilation when bazel option "\
-"\"--config=opt\" is specified [Default is $default_cc_opt_flags]: " CC_OPT_FLAGS
-  if [ -z "$CC_OPT_FLAGS" ]; then
-    CC_OPT_FLAGS=$default_cc_opt_flags
-  fi
-fi
-
-if is_windows; then
-  TF_NEED_GCP=0
-  TF_NEED_HDFS=0
-  TF_NEED_JEMALLOC=0
-  TF_NEED_OPENCL=0
-  TF_CUDA_CLANG=0
-fi
-
-if is_linux; then
-  while [ "$TF_NEED_JEMALLOC" == "" ]; do
-    read -p "Do you wish to use jemalloc as the malloc implementation? [Y/n] "\
-      INPUT
-    case $INPUT in
-      [Yy]* ) echo "jemalloc enabled"; TF_NEED_JEMALLOC=1;;
-      [Nn]* ) echo "jemalloc disabled"; TF_NEED_JEMALLOC=0;;
-      "" ) echo "jemalloc enabled"; TF_NEED_JEMALLOC=1;;
-      * ) echo "Invalid selection: " $INPUT;;
-    esac
-  done
-else
-  TF_NEED_JEMALLOC=0
-fi
-
-if [[ "$TF_NEED_JEMALLOC" == "1" ]]; then
-  write_to_bazelrc 'build --define with_jemalloc=true'
-fi
-
-while [[ "$TF_NEED_GCP" == "" ]]; do
-  read -p "Do you wish to build TensorFlow with "\
-"Google Cloud Platform support? [y/N] " INPUT
-  case $INPUT in
-    [Yy]* ) echo "Google Cloud Platform support will be enabled for "\
-"TensorFlow"; TF_NEED_GCP=1;;
-    [Nn]* ) echo "No Google Cloud Platform support will be enabled for "\
-"TensorFlow"; TF_NEED_GCP=0;;
-    "" ) echo "No Google Cloud Platform support will be enabled for "\
-"TensorFlow"; TF_NEED_GCP=0;;
-    * ) echo "Invalid selection: " $INPUT;;
-  esac
-done
-
-if [[ "$TF_NEED_GCP" == "1" ]]; then
-  write_to_bazelrc 'build --define with_gcp_support=true'
-fi
-
-while [[ "$TF_NEED_HDFS" == "" ]]; do
-  read -p "Do you wish to build TensorFlow with "\
-"Hadoop File System support? [y/N] " INPUT
-  case $INPUT in
-    [Yy]* ) echo "Hadoop File System support will be enabled for "\
-"TensorFlow"; TF_NEED_HDFS=1;;
-    [Nn]* ) echo "No Hadoop File System support will be enabled for "\
-"TensorFlow"; TF_NEED_HDFS=0;;
-    "" ) echo "No Hadoop File System support will be enabled for "\
-"TensorFlow"; TF_NEED_HDFS=0;;
-    * ) echo "Invalid selection: " $INPUT;;
-  esac
-done
-
-if [[ "$TF_NEED_HDFS" == "1" ]]; then
-  write_to_bazelrc 'build --define with_hdfs_support=true'
-fi
-
-## Enable XLA.
-while [[ "$TF_ENABLE_XLA" == "" ]]; do
-  read -p "Do you wish to build TensorFlow with the XLA just-in-time compiler (experimental)? [y/N] " INPUT
-  case $INPUT in
-    [Yy]* ) echo "XLA JIT support will be enabled for TensorFlow"; TF_ENABLE_XLA=1;;
-    [Nn]* ) echo "No XLA JIT support will be enabled for TensorFlow"; TF_ENABLE_XLA=0;;
-    "" ) echo "No XLA support will be enabled for TensorFlow"; TF_ENABLE_XLA=0;;
-    * ) echo "Invalid selection: " $INPUT;;
-  esac
-done
-
-if [[ "$TF_ENABLE_XLA" == "1" ]]; then
-  write_to_bazelrc 'build --define with_xla_support=true'
-fi
-
-# Verbs configuration
-while [ "$TF_NEED_VERBS" == "" ]; do
-  read -p "Do you wish to build TensorFlow with "\
-"VERBS support? [y/N] " INPUT
-  case $INPUT in
-    [Yy]* ) echo "VERBS support will be enabled for "\
-"TensorFlow"; TF_NEED_VERBS=1;;
-    [Nn]* ) echo "No VERBS support will be enabled for "\
-"TensorFlow"; TF_NEED_VERBS=0;;
-    "" ) echo "No VERBS support will be enabled for "\
-"TensorFlow"; TF_NEED_VERBS=0;;
-    * ) echo "Invalid selection: " $INPUT;;
-  esac
-done
-
-if [[ "$TF_NEED_VERBS" == "1" ]]; then
-  write_to_bazelrc 'build --define with_verbs_support=true'
-fi
-
-# Append CC optimization flags to bazel.rc
-for opt in $CC_OPT_FLAGS; do
-  write_to_bazelrc "build:opt --cxxopt=$opt --copt=$opt"
-done
-
-# Run the gen_git_source to create links where bazel can track dependencies for
-# git hash propagation
-GEN_GIT_SOURCE=tensorflow/tools/git/gen_git_source.py
-chmod a+x ${GEN_GIT_SOURCE}
-"${PYTHON_BIN_PATH}" ${GEN_GIT_SOURCE} --configure "${SOURCE_BASE_DIR}"
-
-## Set up SYCL-related environment settings
-while [ "$TF_NEED_OPENCL" == "" ]; do
-  read -p "Do you wish to build TensorFlow with OpenCL support? [y/N] " INPUT
-  case $INPUT in
-    [Yy]* ) echo "OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=1;;
-    [Nn]* ) echo "No OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=0;;
-    "" ) echo "No OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=0;;
-    * ) echo "Invalid selection: " $INPUT;;
-  esac
-done
-
-## Set up Cuda-related environment settings
-
-while [ "$TF_NEED_CUDA" == "" ]; do
-  read -p "Do you wish to build TensorFlow with CUDA support? [y/N] " INPUT
-  case $INPUT in
-    [Yy]* ) echo "CUDA support will be enabled for TensorFlow"; TF_NEED_CUDA=1;;
-    [Nn]* ) echo "No CUDA support will be enabled for TensorFlow"; TF_NEED_CUDA=0;;
-    "" ) echo "No CUDA support will be enabled for TensorFlow"; TF_NEED_CUDA=0;;
-    * ) echo "Invalid selection: " $INPUT;;
-  esac
-done
-
-export TF_NEED_CUDA
-write_action_env_to_bazelrc "TF_NEED_CUDA" "$TF_NEED_CUDA"
-
-export TF_NEED_OPENCL
-write_action_env_to_bazelrc "TF_NEED_OPENCL" "$TF_NEED_OPENCL"
-
-if [ "$TF_NEED_CUDA" == "1" ]; then
-while [[ "$TF_CUDA_CLANG" == "" ]]; do
-  read -p "Do you want to use clang as CUDA compiler? [y/N] " INPUT
-  case $INPUT in
-    [Yy]* ) echo "Clang will be used as CUDA compiler"; TF_CUDA_CLANG=1;;
-    [Nn]* ) echo "nvcc will be used as CUDA compiler"; TF_CUDA_CLANG=0;;
-    "" ) echo "nvcc will be used as CUDA compiler"; TF_CUDA_CLANG=0;;
-    * ) echo "Invalid selection: " $INPUT;;
-  esac
-done
-
-export TF_CUDA_CLANG
-write_action_env_to_bazelrc "TF_CUDA_CLANG" "$TF_CUDA_CLANG"
-
-# Set up which clang we should use as the cuda / host compiler.
-while [[ "$TF_CUDA_CLANG" == "1" ]] && true; do
-  fromuser=""
-  if [ -z "$CLANG_CUDA_COMPILER_PATH" ]; then
-    default_clang_host_compiler_path=$(which clang || true)
-    read -p "Please specify which clang should be used as device and host compiler. [Default is $default_clang_host_compiler_path]: " CLANG_CUDA_COMPILER_PATH
-    fromuser="1"
-    if [ -z "$CLANG_CUDA_COMPILER_PATH" ]; then
-      CLANG_CUDA_COMPILER_PATH="$default_clang_host_compiler_path"
-    fi
-  fi
-  if [ -e "$CLANG_CUDA_COMPILER_PATH" ]; then
-    export CLANG_CUDA_COMPILER_PATH
-    write_action_env_to_bazelrc "CLANG_CUDA_COMPILER_PATH" "$CLANG_CUDA_COMPILER_PATH"
-    break
-  fi
-  echo "Invalid clang path. ${CLANG_CUDA_COMPILER_PATH} cannot be found" 1>&2
-  if [ -z "$fromuser" ]; then
-    exit 1
-  fi
-  CLANG_CUDA_COMPILER_PATH=""
-  # Retry
-done
-
-# Find out where the CUDA toolkit is installed
-while true; do
-  # Configure the Cuda SDK version to use.
-  if [ -z "$TF_CUDA_VERSION" ]; then
-    read -p "Please specify the CUDA SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 8.0]: " TF_CUDA_VERSION
-  fi
-
-  fromuser=""
-  if [ -z "$CUDA_TOOLKIT_PATH" ]; then
-    default_cuda_path=/usr/local/cuda
-    if is_windows; then
-      if [ -z "$CUDA_PATH" ]; then
-        default_cuda_path="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0"
-      else
-        default_cuda_path="$(cygpath -m "$CUDA_PATH")"
-      fi
-    elif is_linux; then
-      # If the default doesn't exist, try an alternative default.
-      if [ ! -d $default_cuda_path ] && [ -d /opt/cuda ]; then
-        default_cuda_path=/opt/cuda
-      fi
-    fi
-    read -p "Please specify the location where CUDA $TF_CUDA_VERSION toolkit is installed. Refer to README.md for more details. [Default is $default_cuda_path]: " CUDA_TOOLKIT_PATH
-    fromuser="1"
-    if [ -z "$CUDA_TOOLKIT_PATH" ]; then
-      CUDA_TOOLKIT_PATH="$default_cuda_path"
-    fi
-  fi
-
-  if [[ -z "$TF_CUDA_VERSION" ]]; then
-    TF_CUDA_EXT=""
-  else
-    TF_CUDA_EXT=".$TF_CUDA_VERSION"
-  fi
-
-  if is_windows; then
-    CUDA_RT_LIB_PATH="lib/x64/cudart.lib"
-  elif is_linux; then
-    CUDA_RT_LIB_PATH="lib64/libcudart.so${TF_CUDA_EXT}"
-  elif is_macos; then
-    CUDA_RT_LIB_PATH="lib/libcudart${TF_CUDA_EXT}.dylib"
-  fi
-
-  if [ -e "${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH}" ]; then
-    export CUDA_TOOLKIT_PATH
-    write_action_env_to_bazelrc "CUDA_TOOLKIT_PATH" "$CUDA_TOOLKIT_PATH"
-    export TF_CUDA_VERSION
-    break
-  fi
-  echo "Invalid path to CUDA $TF_CUDA_VERSION toolkit. ${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH} cannot be found"
-
-  if [ -z "$fromuser" ]; then
-    exit 1
-  fi
-  # Retry
-  TF_CUDA_VERSION=""
-  CUDA_TOOLKIT_PATH=""
-done
-
-# Set default CUDA version if not set
-if [ -z "$TF_CUDA_VERSION" ]; then
-  TF_CUDA_VERSION="8.0"
-  export TF_CUDA_VERSION 
-fi
-write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION" 
-
-# Set up which gcc nvcc should use as the host compiler
-# No need to set this on Windows
-while [[ "$TF_CUDA_CLANG" != "1" ]] && ! is_windows && true; do
-  fromuser=""
-  if [ -z "$GCC_HOST_COMPILER_PATH" ]; then
-    default_gcc_host_compiler_path=$(which gcc || true)
-    cuda_bin_symlink="$CUDA_TOOLKIT_PATH/bin/gcc"
-    if [ -L "$cuda_bin_symlink" ]; then
-      default_gcc_host_compiler_path=$(readlink $cuda_bin_symlink)
-    fi
-    read -p "Please specify which gcc should be used by nvcc as the host compiler. [Default is $default_gcc_host_compiler_path]: " GCC_HOST_COMPILER_PATH
-    fromuser="1"
-    if [ -z "$GCC_HOST_COMPILER_PATH" ]; then
-      GCC_HOST_COMPILER_PATH="$default_gcc_host_compiler_path"
-    fi
-  fi
-  if [ -e "$GCC_HOST_COMPILER_PATH" ]; then
-    export GCC_HOST_COMPILER_PATH
-    write_action_env_to_bazelrc "GCC_HOST_COMPILER_PATH" "$GCC_HOST_COMPILER_PATH"
-    break
-  fi
-  echo "Invalid gcc path. ${GCC_HOST_COMPILER_PATH} cannot be found" 1>&2
-  if [ -z "$fromuser" ]; then
-    exit 1
-  fi
-  GCC_HOST_COMPILER_PATH=""
-  # Retry
-done
-
-# Find out where the cuDNN library is installed
-while true; do
-  # Configure the cuDNN version to use.
-  if [ -z "$TF_CUDNN_VERSION" ]; then
-    read -p "Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 6.0]: " TF_CUDNN_VERSION
-  fi
-
-  fromuser=""
-  if [ -z "$CUDNN_INSTALL_PATH" ]; then
-    default_cudnn_path=${CUDA_TOOLKIT_PATH}
-    read -p "Please specify the location where cuDNN $TF_CUDNN_VERSION library is installed. Refer to README.md for more details. [Default is $default_cudnn_path]: " CUDNN_INSTALL_PATH
-    fromuser="1"
-    if [ -z "$CUDNN_INSTALL_PATH" ]; then
-      CUDNN_INSTALL_PATH=$default_cudnn_path
-    fi
-    # Result returned from "read" will be used unexpanded. That make "~" unusable.
-    # Going through one more level of expansion to handle that.
-    CUDNN_INSTALL_PATH=`"${PYTHON_BIN_PATH}" -c "import os; print(os.path.realpath(os.path.expanduser('${CUDNN_INSTALL_PATH}')))"`
-  fi
-
-  if [[ -z "$TF_CUDNN_VERSION" ]]; then
-    TF_CUDNN_EXT=""
-  else
-    TF_CUDNN_EXT=".$TF_CUDNN_VERSION"
-  fi
-
-  if is_windows; then
-    CUDA_DNN_LIB_PATH="lib/x64/cudnn.lib"
-    CUDA_DNN_LIB_ALT_PATH="lib/x64/cudnn.lib"
-  elif is_linux; then
-    CUDA_DNN_LIB_PATH="lib64/libcudnn.so${TF_CUDNN_EXT}"
-    CUDA_DNN_LIB_ALT_PATH="libcudnn.so${TF_CUDNN_EXT}"
-  elif is_macos; then
-    CUDA_DNN_LIB_PATH="lib/libcudnn${TF_CUDNN_EXT}.dylib"
-    CUDA_DNN_LIB_ALT_PATH="libcudnn${TF_CUDNN_EXT}.dylib"
-  fi
-
-  if [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_ALT_PATH}" ] || [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_PATH}" ]; then
-    export TF_CUDNN_VERSION
-    write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION"
-    export CUDNN_INSTALL_PATH
-    write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "$CUDNN_INSTALL_PATH"
-    break
-  fi
-
-  if is_linux; then
-    if ! type ldconfig > /dev/null 2>&1; then
-        LDCONFIG_BIN=/sbin/ldconfig
-    else
-        LDCONFIG_BIN=ldconfig
-    fi
-    CUDNN_PATH_FROM_LDCONFIG="$($LDCONFIG_BIN -p | sed -n 's/.*libcudnn.so .* => \(.*\)/\1/p')"
-    if [ -e "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" ]; then
-      export TF_CUDNN_VERSION
-      export CUDNN_INSTALL_PATH
-      CUDNN_INSTALL_PATH="$(dirname ${CUDNN_PATH_FROM_LDCONFIG})"
-      write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "$CUDNN_INSTALL_PATH"
-      break
-    fi
-  fi
-  echo "Invalid path to cuDNN ${CUDNN_VERSION} toolkit. Neither of the following two files can be found:"
-  echo "${CUDNN_INSTALL_PATH}/${CUDA_DNN_LIB_PATH}"
-  echo "${CUDNN_INSTALL_PATH}/${CUDA_DNN_LIB_ALT_PATH}"
-  if is_linux; then
-    echo "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}"
-  fi
-
-  if [ -z "$fromuser" ]; then
-    exit 1
-  fi
-  # Retry
-  TF_CUDNN_VERSION=""
-  CUDNN_INSTALL_PATH=""
-done
-
-# Set default CUDNN version if not set
-if [ -z "$TF_CUDNN_VERSION" ]; then
-  TF_CUDNN_VERSION="6"
-  export TF_CUDNN_VERSION
-fi
-write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION"
-
-# Configure the compute capabilities that TensorFlow builds for.
-# Since Cuda toolkit is not backward-compatible, this is not guaranteed to work.
-while true; do
-  fromuser=""
-  default_cuda_compute_capabilities="3.5,5.2"
-  if [ -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then
-cat << EOF
-Please specify a list of comma-separated Cuda compute capabilities you want to build with.
-You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus.
-Please note that each additional compute capability significantly increases your build time and binary size.
-EOF
-    read -p "[Default is: \"3.5,5.2\"]: " TF_CUDA_COMPUTE_CAPABILITIES
-    fromuser=1
-  fi
-  if [ -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then
-    TF_CUDA_COMPUTE_CAPABILITIES=$default_cuda_compute_capabilities
-  fi
-  # Check whether all capabilities from the input is valid
-  COMPUTE_CAPABILITIES=${TF_CUDA_COMPUTE_CAPABILITIES//,/ }
-  ALL_VALID=1
-  for CAPABILITY in $COMPUTE_CAPABILITIES; do
-    if [[ ! "$CAPABILITY" =~ [0-9]+.[0-9]+ ]]; then
-      echo "Invalid compute capability: " $CAPABILITY
-      ALL_VALID=0
-      break
-    fi
-  done
-  if [ "$ALL_VALID" == "0" ]; then
-    if [ -z "$fromuser" ]; then
-      exit 1
-    fi
-  else
-    export TF_CUDA_COMPUTE_CAPABILITIES
-    write_action_env_to_bazelrc "TF_CUDA_COMPUTE_CAPABILITIES" "$TF_CUDA_COMPUTE_CAPABILITIES"
-    break
-  fi
-  TF_CUDA_COMPUTE_CAPABILITIES=""
-done
-
-if is_windows; then
-  # The following three variables are needed for MSVC toolchain configuration in Bazel
-  export CUDA_PATH="$CUDA_TOOLKIT_PATH"
-  export CUDA_COMPUTE_CAPABILITIES="$TF_CUDA_COMPUTE_CAPABILITIES"
-  export NO_WHOLE_ARCHIVE_OPTION=1
-  write_action_env_to_bazelrc "CUDA_PATH" "$CUDA_PATH"
-  write_action_env_to_bazelrc "CUDA_COMPUTE_CAPABILITIES" "$CUDA_COMPUTE_CAPABILITIES"
-  write_action_env_to_bazelrc "NO_WHOLE_ARCHIVE_OPTION" "1"
-  write_to_bazelrc "build --config=win-cuda"
-  write_to_bazelrc "test --config=win-cuda"
-else
-  # If CUDA is enabled, always use GPU during build and test.
-  write_to_bazelrc "build --config=cuda"
-  write_to_bazelrc "test --config=cuda"
-fi
-
-# end of if "$TF_NEED_CUDA" == "1"
-fi
-
-# OpenCL configuration
-
-if [ "$TF_NEED_OPENCL" == "1" ]; then
-
-# Determine which C++ compiler should be used as the host compiler
-while true; do
-  fromuser=""
-  if [ -z "$HOST_CXX_COMPILER" ]; then
-    default_cxx_host_compiler=$(which g++ || true)
-    read -p "Please specify which C++ compiler should be used as the host C++ compiler. [Default is $default_cxx_host_compiler]: " HOST_CXX_COMPILER
-    fromuser="1"
-    if [ -z "$HOST_CXX_COMPILER" ]; then
-      HOST_CXX_COMPILER=$default_cxx_host_compiler
-    fi
-  fi
-  if [ -e "$HOST_CXX_COMPILER" ]; then
-    export HOST_CXX_COMPILER
-    write_action_env_to_bazelrc "HOST_CXX_COMPILER" "$HOST_CXX_COMPILER"
-    break
-  fi
-  echo "Invalid C++ compiler path. ${HOST_CXX_COMPILER} cannot be found" 1>&2
-  if [ -z "$fromuser" ]; then
-    exit 1
-  fi
-  HOST_CXX_COMPILER=""
-  # Retry
-done
-
-# Determine which C compiler should be used as the host compiler
-while true; do
-  fromuser=""
-  if [ -z "$HOST_C_COMPILER" ]; then
-    default_c_host_compiler=$(which gcc || true)
-    read -p "Please specify which C compiler should be used as the host C compiler. [Default is $default_c_host_compiler]: " HOST_C_COMPILER
-    fromuser="1"
-    if [ -z "$HOST_C_COMPILER" ]; then
-      HOST_C_COMPILER=$default_c_host_compiler
-    fi
-  fi
-  if [ -e "$HOST_C_COMPILER" ]; then
-    export HOST_C_COMPILER
-    write_action_env_to_bazelrc "HOST_C_COMPILER" "$HOST_C_COMPILER"
-    break
-  fi
-  echo "Invalid C compiler path. ${HOST_C_COMPILER} cannot be found" 1>&2
-  if [ -z "$fromuser" ]; then
-    exit 1
-  fi
-  HOST_C_COMPILER=""
-  # Retry
-done
-
-while true; do
-  # Configure the OPENCL version to use.
-  TF_OPENCL_VERSION="1.2"
-
-  # Point to ComputeCpp root
-  if [ -z "$COMPUTECPP_TOOLKIT_PATH" ]; then
-    default_computecpp_toolkit_path=/usr/local/computecpp
-    read -p "Please specify the location where ComputeCpp for SYCL $TF_OPENCL_VERSION is installed. [Default is $default_computecpp_toolkit_path]: " COMPUTECPP_TOOLKIT_PATH
-    fromuser="1"
-    if [ -z "$COMPUTECPP_TOOLKIT_PATH" ]; then
-      COMPUTECPP_TOOLKIT_PATH=$default_computecpp_toolkit_path
-    fi
-  fi
-
-  if is_linux; then
-    SYCL_RT_LIB_PATH="lib/libComputeCpp.so"
-  fi
-
-  if [ -e "${COMPUTECPP_TOOLKIT_PATH}/${SYCL_RT_LIB_PATH}" ]; then
-    export COMPUTECPP_TOOLKIT_PATH
-    write_action_env_to_bazelrc "COMPUTECPP_TOOLKIT_PATH" "$COMPUTECPP_TOOLKIT_PATH"
-    break
-  fi
-  echo "Invalid SYCL $TF_OPENCL_VERSION library path. ${COMPUTECPP_TOOLKIT_PATH}/${SYCL_RT_LIB_PATH} cannot be found"
-
-  if [ -z "$fromuser" ]; then
-    exit 1
-  fi
-  # Retry
-  TF_OPENCL_VERSION=""
-  COMPUTECPP_TOOLKIT_PATH=""
-done
-
-# end of if "$TF_NEED_OPENCL" == "1"
-fi
-
-
-while [ "$TF_NEED_MPI" == "" ]; do
-  read -p "Do you wish to build TensorFlow with "\
-"MPI support? [y/N] " INPUT
-  case $INPUT in
-    [Yy]* ) echo "MPI support will be enabled for "\
-"TensorFlow"; TF_NEED_MPI=1;;
-    [Nn]* ) echo "MPI support will not be enabled for "\
-"TensorFlow"; TF_NEED_MPI=0;;
-    "" ) echo "MPI support will not be enabled for "\
-"TensorFlow"; TF_NEED_MPI=0;;
-    * ) echo "Invalid selection: " $INPUT;;
-  esac
-done
-
-# Find out where the MPI toolkit is installed
-while true; do
-    if [ "$TF_NEED_MPI" == "0" ]; then
-        break;
-    fi
-
-    fromuser=""
-    if [ -z "$MPI_HOME" ]; then
-        #Get the base folder by removing the bin path
-        default_mpi_path=$(dirname $(dirname $(which mpirun)) || dirname $(dirname $(which mpiexec))  || true)
-        read -p "Please specify the MPI toolkit folder. [Default is $default_mpi_path]: " MPI_HOME
-        fromuser="1"
-        if [ -z "$MPI_HOME" ]; then
-            MPI_HOME=$default_mpi_path
-        fi
-    fi
-
-    #Check that the include and library folders are where we expect them to be
-    if [ -e "$MPI_HOME/include" ] && [ -e "$MPI_HOME/lib" ]; then
-        break
-    fi
- 
-    echo "Invalid path to the MPI Toolkit. ${MPI_HOME}/include or ${MPI_HOME}/lib cannot be found."
-    if [ -z "$fromuser" ]; then
-        exit 1
-    fi
-
-    # Retry
-    MPI_HOME="" 
-done
-    
-    
-if [ "$TF_NEED_MPI" == "1" ]; then
-  write_to_bazelrc 'build --define with_mpi_support=true'
-
-  #Link the MPI header files
-  ln -sf "${MPI_HOME}/include/mpi.h" third_party/mpi/mpi.h
-
-
-  #Determine if we use OpenMPI or MVAPICH, these require different header files 
-  #to be included here to make bazel dependency checker happy
-
-  if [ -e "${MPI_HOME}/include/mpi_portable_platform.h" ]; then
-        #OpenMPI 
-        ln -sf "${MPI_HOME}/include/mpi_portable_platform.h" third_party/mpi/
-        sed -i -e "s/MPI_LIB_IS_OPENMPI=False/MPI_LIB_IS_OPENMPI=True/" third_party/mpi/mpi.bzl
- else
-        #MVAPICH / MPICH
-        ln -sf "${MPI_HOME}/include/mpio.h" third_party/mpi/
-        ln -sf "${MPI_HOME}/include/mpicxx.h" third_party/mpi/
-        sed -i -e "s/MPI_LIB_IS_OPENMPI=True/MPI_LIB_IS_OPENMPI=False/" third_party/mpi/mpi.bzl
- fi
-
-  
-  if [ -e "${MPI_HOME}/lib/libmpi.so" ]; then
-    ln -sf "${MPI_HOME}/lib/libmpi.so" third_party/mpi/
-  else
-    echo "Cannot find the MPI library file in ${MPI_HOME}/lib "
-    exit 1
-  fi
+if [ -z "$PYTHON_BIN_PATH" ]; then
+  PYTHON_BIN_PATH=$(which python || which python3 || true)
 fi
 
+# Set all env variables
+"$PYTHON_BIN_PATH" configure.py
 
 echo "Configuration finished"
+
diff --git a/configure.py b/configure.py
new file mode 100644
index 0000000000000000000000000000000000000000..edb0a47ee611d3f355a1dd1622847d738bfe0122
--- /dev/null
+++ b/configure.py
@@ -0,0 +1,950 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""configure script to get build parameters from user."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import errno
+import os
+import platform
+import re
+import site
+import subprocess
+import sys
+
+_TF_BAZELRC = '.tf_configure.bazelrc'
+_DEFAULT_CUDA_VERSION = '8.0'
+_DEFAULT_CUDNN_VERSION = '6'
+_DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,5.2'
+_DEFAULT_CUDA_PATH = '/usr/local/cuda'
+_DEFAULT_CUDA_PATH_LINUX = '/opt/cuda'
+_DEFAULT_CUDA_PATH_WIN = ('C:/Program Files/NVIDIA GPU Computing '
+                          'Toolkit/CUDA/v%s' % _DEFAULT_CUDA_VERSION)
+_TF_OPENCL_VERSION = '1.2'
+_DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp'
+
+
+def is_windows():
+  return platform.system() == 'Windows'
+
+
+def is_linux():
+  return platform.system() == 'Linux'
+
+
+def is_macos():
+  return platform.system() == 'Darwin'
+
+
+def is_ppc64le():
+  return platform.machine() == 'ppc64le'
+
+
+def get_input(question):
+  try:
+    try:
+      answer = raw_input(question)
+    except NameError:
+      answer = input(question)  # pylint: disable=bad-builtin
+  except EOFError:
+    answer = ''
+  return answer
+
+
+def symlink_force(target, link_name):
+  """Force symlink, equivalent of 'ln -sf'.
+
+  Args:
+    target: items to link to.
+    link_name: name of the link.
+  """
+  try:
+    os.symlink(target, link_name)
+  except OSError as e:
+    if e.errno == errno.EEXIST:
+      os.remove(link_name)
+      os.symlink(target, link_name)
+    else:
+      raise e
+
+
+def sed_in_place(filename, old, new):
+  """Replace old string with new string in file.
+
+  Args:
+    filename: string for filename.
+    old: string to replace.
+    new: new string to replace to.
+  """
+  with open(filename, 'r') as f:
+    filedata = f.read()
+  newdata = filedata.replace(old, new)
+  with open(filename, 'w') as f:
+    f.write(newdata)
+
+
+def remove_line_with(filename, token):
+  """Remove lines that contain token from file.
+
+  Args:
+    filename: string for filename.
+    token: string token to check if to remove a line from file or not.
+  """
+  with open(filename, 'r') as f:
+    filedata = f.read()
+
+  with open(filename, 'w') as f:
+    for line in filedata.strip().split('\n'):
+      if token not in line:
+        f.write(line + '\n')
+
+
+def write_to_bazelrc(line):
+  with open(_TF_BAZELRC, 'a') as f:
+    f.write(line + '\n')
+
+
+def write_action_env_to_bazelrc(var_name, var):
+  write_to_bazelrc('build --action_env %s="%s"' % (var_name, str(var)))
+
+
+def run_shell(cmd):
+  return subprocess.check_output(cmd, shell=True).decode('UTF-8').strip()
+
+
+def cygpath(path):
+  """Convert path from posix to windows."""
+  return run_shell('cygpath  -m "%s"' % path)
+
+
+def get_python_path(environ_cp):
+  """Get the python site package paths."""
+  python_paths = []
+  if environ_cp.get('PYTHONPATH'):
+    python_paths = environ_cp.get('PYTHONPATH').split(':')
+  try:
+    library_paths = site.getsitepackages()
+  except AttributeError:
+    from distutils.sysconfig import get_python_lib  # pylint: disable=g-import-not-at-top
+    library_paths = [get_python_lib()]
+  all_paths = set(python_paths + library_paths)
+
+  paths = []
+  for path in all_paths:
+    if os.path.isdir(path):
+      paths.append(path)
+  return paths
+
+
+def setup_python(environ_cp):
+  """Setup python related env variables."""
+  # Get PYTHON_BIN_PATH, default is the current running python.
+  default_python_bin_path = sys.executable
+  ask_python_bin_path = ('Please specify the location of python. [Default is '
+                         '%s]: ') % default_python_bin_path
+  while True:
+    python_bin_path = get_from_env_or_user_or_default(
+        environ_cp, 'PYTHON_BIN_PATH', ask_python_bin_path,
+        default_python_bin_path)
+    # Check if the path is valid
+    if (os.path.isfile(python_bin_path) and os.access(
+        python_bin_path, os.X_OK)) or (os.path.isdir(python_bin_path)):
+      break
+    elif not os.path.exists(python_bin_path):
+      print('Invalid python path: %s cannot be found.' % python_bin_path)
+    else:
+      print('%s is not executable.  Is it the python binary?' % python_bin_path)
+    environ_cp['PYTHON_BIN_PATH'] = ''
+
+  # Get PYTHON_LIB_PATH
+  python_lib_path = environ_cp.get('PYTHON_LIB_PATH')
+  if not python_lib_path:
+    python_lib_paths = get_python_path(environ_cp)
+    if environ_cp.get('USE_DEFAULT_PYTHON_LIB_PATH') == '1':
+      python_lib_path = python_lib_paths[0]
+    else:
+      print('Found possible Python library paths:\n%s' %
+            '\n'.join(python_lib_paths))
+      default_python_lib_path = python_lib_paths[0]
+      python_lib_path = get_input(
+          'Please input the desired Python library path to use.  Default is %s'
+          % python_lib_paths[0])
+      if not python_lib_path:
+        python_lib_path = default_python_lib_path
+    environ_cp['PYTHON_LIB_PATH'] = python_lib_path
+
+  python_major_version = sys.version_info[0]
+  # Convert python path to Windows style before writing into bazel.rc
+  if is_windows():
+    python_bin_path = cygpath(python_bin_path)
+    python_lib_path = cygpath(python_lib_path)
+
+  # Set-up env variables used by python_configure.bzl
+  write_action_env_to_bazelrc('PYTHON_BIN_PATH', python_bin_path)
+  write_action_env_to_bazelrc('PYTHON_LIB_PATH', python_lib_path)
+  write_to_bazelrc('build --define PYTHON_BIN_PATH="%s"' % python_bin_path)
+  write_to_bazelrc('build --define PYTHON_LIB_PATH="%s"' % python_lib_path)
+  write_to_bazelrc('build --force_python=py%s' % python_major_version)
+  write_to_bazelrc('build --host_force_python=py%s' % python_major_version)
+  write_to_bazelrc('build --python%s_path=\"%s"' % (python_major_version,
+                                                    python_bin_path))
+  write_to_bazelrc('test --force_python=py%s' % python_major_version)
+  write_to_bazelrc('test --host_force_python=py%s' % python_major_version)
+  write_to_bazelrc('test --define PYTHON_BIN_PATH="%s"' % python_bin_path)
+  write_to_bazelrc('test --define PYTHON_LIB_PATH="%s"' % python_lib_path)
+  write_to_bazelrc('run --define PYTHON_BIN_PATH="%s"' % python_bin_path)
+  write_to_bazelrc('run --define PYTHON_LIB_PATH="%s"' % python_lib_path)
+  environ_cp['PYTHON_BIN_PATH'] = python_bin_path
+
+  # Write tools/python_bin_path.sh
+  with open('tools/python_bin_path.sh', 'w') as f:
+    f.write('export PYTHON_BIN_PATH="%s"' % python_bin_path)
+
+
+def reset_tf_configure_bazelrc():
+  """Reset file that contains customized config settings."""
+  open(_TF_BAZELRC, 'w').close()
+
+  home = os.path.expanduser('~')
+  if not os.path.exists('.bazelrc'):
+    if os.path.exists(os.path.join(home, '.bazelrc')):
+      with open('.bazelrc', 'a') as f:
+        f.write('import %s/.bazelrc\n' % home)
+    else:
+      open('.bazelrc', 'w').close()
+
+  remove_line_with('.bazelrc', 'tf_configure')
+  with open('.bazelrc', 'a') as f:
+    f.write('import %workspace%/.tf_configure.bazelrc\n')
+
+
+def run_gen_git_source(environ_cp):
+  """Run the gen_git_source to create links.
+
+  The links are for bazel to track dependencies for git hash propagation.
+
+  Args:
+    environ_cp: copy of the os.environ.
+  """
+  cmd = '"%s" tensorflow/tools/git/gen_git_source.py --configure %s' % (
+      environ_cp.get('PYTHON_BIN_PATH'), os.getcwd())
+  os.system(cmd)
+
+
+def cleanup_makefile():
+  """Delete any leftover BUILD files from the Makefile build.
+
+  These files could interfere with Bazel parsing.
+  """
+  makefile_download_dir = 'tensorflow/contrib/makefile/downloads'
+  if os.path.isdir(makefile_download_dir):
+    for root, _, filenames in os.walk(makefile_download_dir):
+      for f in filenames:
+        if f.endswith('BUILD'):
+          os.remove(os.path.join(root, f))
+
+
+def get_var(environ_cp,
+            var_name,
+            query_item,
+            enabled_by_default,
+            question=None,
+            yes_reply=None,
+            no_reply=None):
+  """Get boolean input from user.
+
+  If var_name is not set in env, ask user to enable query_item or not. If the
+  response is empty, use the default.
+
+  Args:
+    environ_cp: copy of the os.environ.
+    var_name: string for name of environment variable, e.g. "TF_NEED_HDFS".
+    query_item: string for feature related to the variable, e.g. "Hadoop File
+      System".
+    enabled_by_default: boolean for default behavior.
+    question: optional string for how to ask for user input.
+    yes_reply: optionanl string for reply when feature is enabled.
+    no_reply: optional string for reply when feature is disabled.
+
+  Returns:
+    boolean value of the variable.
+  """
+  if not question:
+    question = 'Do you wish to build TensorFlow with %s support?' % query_item
+  if not yes_reply:
+    yes_reply = '%s support will be enabled for TensorFlow.' % query_item
+  if not no_reply:
+    no_reply = 'No %s' % yes_reply
+
+  yes_reply += '\n'
+  no_reply += '\n'
+
+  if enabled_by_default:
+    question += ' [Y/n]: '
+  else:
+    question += ' [y/N]: '
+
+  var = environ_cp.get(var_name)
+  while var is None:
+    user_input_origin = get_input(question)
+    user_input = user_input_origin.strip().lower()
+    if user_input == 'y':
+      print(yes_reply)
+      var = True
+    elif user_input == 'n':
+      print(no_reply)
+      var = False
+    elif not user_input:
+      if enabled_by_default:
+        print(yes_reply)
+        var = True
+      else:
+        print(no_reply)
+        var = False
+    else:
+      print('Invalid selection: %s' % user_input_origin)
+  return var
+
+
+def set_build_var(environ_cp, var_name, query_item, option_name,
+                  enabled_by_default):
+  """Set if query_item will be enabled for the build.
+
+  Ask user if query_item will be enabled. Default is used if no input is given.
+  Set subprocess environment variable and write to .bazelrc if enabled.
+
+  Args:
+    environ_cp: copy of the os.environ.
+    var_name: string for name of environment variable, e.g. "TF_NEED_HDFS".
+    query_item: string for feature related to the variable, e.g. "Hadoop File
+      System".
+    option_name: string for option to define in .bazelrc.
+    enabled_by_default: boolean for default behavior.
+  """
+
+  var = str(int(get_var(environ_cp, var_name, query_item, enabled_by_default)))
+  environ_cp[var_name] = var
+  if var == '1':
+    write_to_bazelrc('build --define %s=true' % option_name)
+
+
+def set_action_env_var(environ_cp,
+                       var_name,
+                       query_item,
+                       enabled_by_default,
+                       question=None,
+                       yes_reply=None,
+                       no_reply=None):
+  """Set boolean action_env variable.
+
+  Ask user if query_item will be enabled. Default is used if no input is given.
+  Set environment variable and write to .bazelrc.
+
+  Args:
+    environ_cp: copy of the os.environ.
+    var_name: string for name of environment variable, e.g. "TF_NEED_HDFS".
+    query_item: string for feature related to the variable, e.g. "Hadoop File
+      System".
+    enabled_by_default: boolean for default behavior.
+    question: optional string for how to ask for user input.
+    yes_reply: optionanl string for reply when feature is enabled.
+    no_reply: optional string for reply when feature is disabled.
+  """
+  var = int(
+      get_var(environ_cp, var_name, query_item, enabled_by_default, question,
+              yes_reply, no_reply))
+
+  write_action_env_to_bazelrc(var_name, var)
+  environ_cp[var_name] = str(var)
+
+
+def check_bazel_version(min_version):
+  """Check installed bezel version is at least min_version.
+
+  Args:
+    min_version: string for minimum bazel version.
+  """
+  try:
+    curr_version = run_shell('bazel --batch version')
+  except subprocess.CalledProcessError:
+    print('Cannot find bazel. Please install bazel.')
+    sys.exit(0)
+
+  for line in curr_version.split('\n'):
+    if 'Build label: ' in line:
+      curr_version = line.split('Build label: ')[1]
+      break
+
+  min_version_segments = min_version.split('.')
+  curr_version_segments = curr_version.split('.')
+
+  # Check if current bazel version can be detected properly.
+  for seg in curr_version_segments:
+    if not seg.isdigit():
+      print('WARNING: current bazel installation is not a release version.')
+      print('Make sure you are running at least bazel %s' % min_version)
+      return
+
+  min_version_str = ''.join(['%03d' % int(seg) for seg in min_version_segments])
+  curr_version_str = ''.join(
+      ['%03d' % int(seg) for seg in curr_version_segments])
+  if int(curr_version_str) < int(min_version_str):
+    print('Please upgrade your bazel installation to version %s or higher to '
+          'build TensorFlow!' % min_version)
+    sys.exit(0)
+
+
+def set_cc_opt_flags(environ_cp):
+  """Set up architecture-dependent optimization flags.
+
+  Also append CC optimization flags to bazel.rc..
+
+  Args:
+    environ_cp: copy of the os.environ.
+  """
+  if is_ppc64le():
+    # gcc on ppc64le does not support -march, use mcpu instead
+    default_cc_opt_flags = '-mcpu=native'
+  else:
+    default_cc_opt_flags = '-march=native'
+  question = ('Please specify optimization flags to use during compilation when'
+              ' bazel option "--config=opt" is specified [Default is %s]: '
+             ) % default_cc_opt_flags
+  cc_opt_flags = get_from_env_or_user_or_default(environ_cp, 'CC_OPT_FLAGS',
+                                                 question, default_cc_opt_flags)
+  for opt in cc_opt_flags.split():
+    write_to_bazelrc('build:opt --cxxopt=%s --copt=%s' % (opt, opt))
+
+
+def set_tf_cuda_clang(environ_cp):
+  """set TF_CUDA_CLANG action_env.
+
+  Args:
+    environ_cp: copy of the os.environ.
+  """
+  question = 'Do you want to use clang as CUDA compiler?'
+  yes_reply = 'Clang will be used as CUDA compiler.'
+  no_reply = 'nvcc will be used as CUDA compiler.'
+  set_action_env_var(
+      environ_cp,
+      'TF_CUDA_CLANG',
+      None,
+      False,
+      question=question,
+      yes_reply=yes_reply,
+      no_reply=no_reply)
+
+
+def get_from_env_or_user_or_default(environ_cp, var_name, ask_for_var,
+                                    var_default):
+  """Get var_name either from env, or user or default.
+
+  If var_name has been set as environment variable, use the preset value, else
+  ask for user input. If no input is provided, the default is used.
+
+  Args:
+    environ_cp: copy of the os.environ.
+    var_name: string for name of environment variable, e.g. "TF_NEED_HDFS".
+    ask_for_var: string for how to ask for user input.
+    var_default: default value string.
+
+  Returns:
+    string value for var_name
+  """
+  var = environ_cp.get(var_name)
+  if not var:
+    var = get_input(ask_for_var)
+  if not var:
+    var = var_default
+  return var
+
+
+def set_clang_cuda_compiler_path(environ_cp):
+  """Set CLANG_CUDA_COMPILER_PATH."""
+  default_clang_path = run_shell('which clang || true')
+  ask_clang_path = ('Please specify which clang should be used as device and '
+                    'host compiler. [Default is %s]: ') % default_clang_path
+
+  while True:
+    clang_cuda_compiler_path = get_from_env_or_user_or_default(
+        environ_cp, 'CLANG_CUDA_COMPILER_PATH', ask_clang_path,
+        default_clang_path)
+    if os.path.exists(clang_cuda_compiler_path):
+      break
+
+    # Reset and retry
+    print('Invalid clang path: %s cannot be found.' % clang_cuda_compiler_path)
+    environ_cp['CLANG_CUDA_COMPILER_PATH'] = ''
+
+  # Set CLANG_CUDA_COMPILER_PATH
+  environ_cp['CLANG_CUDA_COMPILER_PATH'] = clang_cuda_compiler_path
+  write_action_env_to_bazelrc('CLANG_CUDA_COMPILER_PATH',
+                              clang_cuda_compiler_path)
+
+
+def set_gcc_host_compiler_path(environ_cp):
+  """Set GCC_HOST_COMPILER_PATH."""
+  default_gcc_host_compiler_path = run_shell('which gcc || true')
+  cuda_bin_symlink = '%s/bin/gcc' % environ_cp.get('CUDA_TOOLKIT_PATH')
+
+  if os.path.islink(cuda_bin_symlink):
+    # os.readlink is only available in linux
+    default_gcc_host_compiler_path = run_shell('readlink %s' % cuda_bin_symlink)
+
+  ask_gcc_path = (
+      'Please specify which gcc should be used by nvcc as the '
+      'host compiler. [Default is %s]: ') % default_gcc_host_compiler_path
+  while True:
+    gcc_host_compiler_path = get_from_env_or_user_or_default(
+        environ_cp, 'GCC_HOST_COMPILER_PATH', ask_gcc_path,
+        default_gcc_host_compiler_path)
+
+    if os.path.exists(gcc_host_compiler_path):
+      break
+
+    # Reset and retry
+    print('Invalid gcc path. %s cannot be found' % gcc_host_compiler_path)
+    environ_cp['GCC_HOST_COMPILER_PATH'] = ''
+
+  # Set GCC_HOST_COMPILER_PATH
+  environ_cp['GCC_HOST_COMPILER_PATH'] = gcc_host_compiler_path
+  write_action_env_to_bazelrc('GCC_HOST_COMPILER_PATH', gcc_host_compiler_path)
+
+
+def set_tf_cuda_version(environ_cp):
+  """Set CUDA_TOOLKIT_PATH and TF_CUDA_VERSION."""
+  ask_cuda_version = (
+      'Please specify the CUDA SDK version you want to use, '
+      'e.g. 7.0. [Leave empty to default to CUDA %s]: ') % _DEFAULT_CUDA_VERSION
+
+  while True:
+    # Configure the Cuda SDK version to use.
+    tf_cuda_version = get_from_env_or_user_or_default(
+        environ_cp, 'TF_CUDA_VERSION', ask_cuda_version, _DEFAULT_CUDA_VERSION)
+
+    # Find out where the CUDA toolkit is installed
+    default_cuda_path = _DEFAULT_CUDA_PATH
+    if is_windows():
+      default_cuda_path = cygpath(
+          environ_cp.get('CUDA_PATH', _DEFAULT_CUDA_PATH_WIN))
+    elif is_linux():
+      # If the default doesn't exist, try an alternative default.
+      if (not os.path.exists(default_cuda_path)
+         ) and os.path.exists(_DEFAULT_CUDA_PATH_LINUX):
+        default_cuda_path = _DEFAULT_CUDA_PATH_LINUX
+    ask_cuda_path = ('Please specify the location where CUDA %s toolkit is'
+                     ' installed. Refer to README.md for more details. '
+                     '[Default is %s]: ') % (tf_cuda_version, default_cuda_path)
+    cuda_toolkit_path = get_from_env_or_user_or_default(
+        environ_cp, 'CUDA_TOOLKIT_PATH', ask_cuda_path, default_cuda_path)
+
+    if is_windows():
+      cuda_rt_lib_path = 'lib/x64/cudart.lib'
+    elif is_linux():
+      cuda_rt_lib_path = 'lib64/libcudart.so.%s' % tf_cuda_version
+    elif is_macos():
+      cuda_rt_lib_path = 'lib/libcudart.%s.dylib' % tf_cuda_version
+
+    cuda_toolkit_path_full = os.path.join(cuda_toolkit_path, cuda_rt_lib_path)
+    if os.path.exists(cuda_toolkit_path_full):
+      break
+
+    # Reset and retry
+    print('Invalid path to CUDA %s toolkit. %s cannot be found' %
+          (tf_cuda_version, cuda_toolkit_path_full))
+    environ_cp['TF_CUDA_VERSION'] = ''
+    environ_cp['CUDA_TOOLKIT_PATH'] = ''
+
+  # Set CUDA_TOOLKIT_PATH and TF_CUDA_VERSION
+  environ_cp['CUDA_TOOLKIT_PATH'] = cuda_toolkit_path
+  write_action_env_to_bazelrc('CUDA_TOOLKIT_PATH', cuda_toolkit_path)
+  environ_cp['TF_CUDA_VERSION'] = tf_cuda_version
+  write_action_env_to_bazelrc('TF_CUDA_VERSION', tf_cuda_version)
+
+
+def set_tf_cunn_version(environ_cp):
+  """Set CUDNN_INSTALL_PATH and TF_CUDNN_VERSION."""
+  ask_cudnn_version = (
+      '"Please specify the cuDNN version you want to use. '
+      '[Leave empty to default to cuDNN %s.0]: ') % _DEFAULT_CUDNN_VERSION
+
+  while True:
+    tf_cudnn_version = get_from_env_or_user_or_default(
+        environ_cp, 'TF_CUDNN_VERSION', ask_cudnn_version,
+        _DEFAULT_CUDNN_VERSION)
+
+    default_cudnn_path = environ_cp.get('CUDA_TOOLKIT_PATH')
+    ask_cudnn_path = (r'Please specify the location where cuDNN %s library is '
+                      'installed. Refer to README.md for more details. [Default'
+                      ' is %s]:') % (tf_cudnn_version, default_cudnn_path)
+    cudnn_install_path = get_from_env_or_user_or_default(
+        environ_cp, 'CUDNN_INSTALL_PATH', ask_cudnn_path, default_cudnn_path)
+
+    # Result returned from "read" will be used unexpanded. That make "~"
+    # unusable. Going through one more level of expansion to handle that.
+    cudnn_install_path = os.path.realpath(
+        os.path.expanduser(cudnn_install_path))
+    if is_windows():
+      cudnn_install_path = cygpath(cudnn_install_path)
+
+    if is_windows():
+      cuda_dnn_lib_path = 'lib/x64/cudnn.lib'
+      cuda_dnn_lib_alt_path = 'lib/x64/cudnn.lib'
+    elif is_linux():
+      cuda_dnn_lib_path = 'lib64/libcudnn.so.%s' % tf_cudnn_version
+      cuda_dnn_lib_alt_path = 'libcudnn.so.%s' % tf_cudnn_version
+    elif is_macos():
+      cuda_dnn_lib_path = 'lib/libcudnn.%s.dylib' % tf_cudnn_version
+      cuda_dnn_lib_alt_path = 'libcudnn.%s.dylib' % tf_cudnn_version
+
+    cuda_dnn_lib_path_full = os.path.join(cudnn_install_path, cuda_dnn_lib_path)
+    cuda_dnn_lib_alt_path_full = os.path.join(cudnn_install_path,
+                                              cuda_dnn_lib_alt_path)
+    if os.path.exists(cuda_dnn_lib_path_full) or os.path.exists(
+        cuda_dnn_lib_alt_path_full):
+      break
+
+    # Try another alternative for Linux
+    if is_linux():
+      if subprocess.call(['which', 'ldconfig']):
+        ldconfig_bin = '/sbin/ldconfig'
+      else:
+        ldconfig_bin = 'ldconfig'
+      cudnn_path_from_ldconfig = run_shell(
+          r'%s -p | sed -n "s/.*libcudnn.so .* => \(.*\)/\\1/p"' % ldconfig_bin)
+      if os.path.exists('%s.%s' % (cudnn_path_from_ldconfig, tf_cudnn_version)):
+        cudnn_install_path = os.path.dirname(cudnn_path_from_ldconfig)
+        break
+
+    # Reset and Retry
+    print(
+        'Invalid path to cuDNN %s toolkit. None of the following files can be '
+        'found:' % tf_cudnn_version)
+    print(cuda_dnn_lib_path_full)
+    print(cuda_dnn_lib_alt_path_full)
+    if is_linux():
+      print('%s.%s' % (cudnn_path_from_ldconfig, tf_cudnn_version))
+
+    environ_cp['TF_CUDNN_VERSION'] = ''
+
+  # Set CUDNN_INSTALL_PATH and TF_CUDNN_VERSION
+  environ_cp['CUDNN_INSTALL_PATH'] = cudnn_install_path
+  write_action_env_to_bazelrc('CUDNN_INSTALL_PATH', cudnn_install_path)
+  environ_cp['TF_CUDNN_VERSION'] = tf_cudnn_version
+  write_action_env_to_bazelrc('TF_CUDNN_VERSION', tf_cudnn_version)
+
+
+def get_native_cuda_compute_capabilities(environ_cp):
+  """Get native cuda compute capabilities.
+
+  Args:
+    environ_cp: copy of the os.environ.
+  Returns:
+    string of native cuda compute capabilities, separated by comma.
+  """
+  device_query_bin = os.path.join(
+      environ_cp.get('CUDA_TOOLKIT_PATH'), 'extras/demo_suite/deviceQuery')
+  cmd = (r'"%s" | grep "Capability" | grep -o "[0-9]*\.[0-9]*" | sed '
+         '":a;{N;s/\\n/,/};ba"') % device_query_bin
+  try:
+    output = run_shell(cmd)
+  except subprocess.CalledProcessError:
+    output = ''
+  return output
+
+
+def set_tf_cuda_compute_capabilities(environ_cp):
+  """Set TF_CUDA_COMPUTE_CAPABILITIES."""
+  while True:
+    native_cuda_compute_capabilities = get_native_cuda_compute_capabilities(
+        environ_cp)
+    if not native_cuda_compute_capabilities:
+      default_cuda_compute_capabilities = _DEFAULT_CUDA_COMPUTE_CAPABILITIES
+    else:
+      default_cuda_compute_capabilities = native_cuda_compute_capabilities
+
+    ask_cuda_compute_capabilities = (
+        'Please specify a list of comma-separated '
+        'Cuda compute capabilities you want to '
+        'build with.\nYou can find the compute '
+        'capability of your device at: '
+        'https://developer.nvidia.com/cuda-gpus.\nPlease'
+        ' note that each additional compute '
+        'capability significantly increases your '
+        'build time and binary size. [Default is: %s]' %
+        default_cuda_compute_capabilities)
+    tf_cuda_compute_capabilities = get_from_env_or_user_or_default(
+        environ_cp, 'TF_CUDA_COMPUTE_CAPABILITIES',
+        ask_cuda_compute_capabilities, default_cuda_compute_capabilities)
+    # Check whether all capabilities from the input is valid
+    all_valid = True
+    for compute_capability in tf_cuda_compute_capabilities.split(','):
+      if not re.match('[0-9]+.[0-9]+', compute_capability):
+        print('Invalid compute capability: ' % compute_capability)
+        all_valid = False
+
+    if all_valid:
+      break
+
+    # Reset and Retry
+    environ_cp['TF_CUDA_COMPUTE_CAPABILITIES'] = ''
+
+  # Set TF_CUDA_COMPUTE_CAPABILITIES
+  environ_cp['TF_CUDA_COMPUTE_CAPABILITIES'] = tf_cuda_compute_capabilities
+  write_action_env_to_bazelrc('TF_CUDA_COMPUTE_CAPABILITIES',
+                              tf_cuda_compute_capabilities)
+
+
+def set_other_cuda_vars(environ_cp):
+  """Set other CUDA related variables."""
+  if is_windows():
+    # The following three variables are needed for MSVC toolchain configuration
+    # in Bazel
+    environ_cp['CUDA_PATH'] = environ_cp.get('CUDA_TOOLKIT_PATH')
+    environ_cp['CUDA_COMPUTE_CAPABILITIES'] = environ_cp.get(
+        'TF_CUDA_COMPUTE_CAPABILITIES')
+    environ_cp['NO_WHOLE_ARCHIVE_OPTION'] = 1
+    write_action_env_to_bazelrc('CUDA_PATH', environ_cp.get('CUDA_PATH'))
+    write_action_env_to_bazelrc('CUDA_COMPUTE_CAPABILITIE',
+                                environ_cp.get('CUDA_COMPUTE_CAPABILITIE'))
+    write_action_env_to_bazelrc('NO_WHOLE_ARCHIVE_OPTION',
+                                environ_cp.get('NO_WHOLE_ARCHIVE_OPTION'))
+    write_to_bazelrc('build --config=win-cuda')
+    write_to_bazelrc('test --config=win-cuda')
+  else:
+    # If CUDA is enabled, always use GPU during build and test.
+    if environ_cp.get('TF_CUDA_CLANG') == '1':
+      write_to_bazelrc('build --config=cuda_clang')
+      write_to_bazelrc('test --config=cuda_clang')
+    else:
+      write_to_bazelrc('build --config=cuda')
+      write_to_bazelrc('test --config=cuda')
+
+
+def set_host_cxx_compiler(environ_cp):
+  """Set HOST_CXX_COMPILER."""
+  default_cxx_host_compiler = run_shell('which g++ || true')
+  ask_cxx_host_compiler = (
+      'Please specify which C++ compiler should be used as'
+      ' the host C++ compiler. [Default is %s]: ') % default_cxx_host_compiler
+
+  while True:
+    host_cxx_compiler = get_from_env_or_user_or_default(
+        environ_cp, 'HOST_CXX_COMPILER', ask_cxx_host_compiler,
+        default_cxx_host_compiler)
+    if os.path.exists(host_cxx_compiler):
+      break
+
+    # Reset and retry
+    print('Invalid C++ compiler path. %s cannot be found' % host_cxx_compiler)
+    environ_cp['HOST_CXX_COMPILER'] = ''
+
+  # Set HOST_CXX_COMPILER
+  environ_cp['HOST_CXX_COMPILER'] = host_cxx_compiler
+  write_action_env_to_bazelrc('HOST_CXX_COMPILER', host_cxx_compiler)
+
+
+def set_host_c_compiler(environ_cp):
+  """Set HOST_C_COMPILER."""
+  default_c_host_compiler = run_shell('which gcc || true')
+  ask_c_host_compiler = (
+      'Please specify which C compiler should be used as the'
+      ' host C compiler. [Default is %s]: ') % default_c_host_compiler
+
+  while True:
+    host_c_compiler = get_from_env_or_user_or_default(
+        environ_cp, 'HOST_C_COMPILER', ask_c_host_compiler,
+        default_c_host_compiler)
+    if os.path.exists(host_c_compiler):
+      break
+
+    # Reset and retry
+    print('Invalid C compiler path. %s cannot be found' % host_c_compiler)
+    environ_cp['HOST_C_COMPILER'] = ''
+
+  # Set HOST_C_COMPILER
+  environ_cp['HOST_C_COMPILER'] = host_c_compiler
+  write_action_env_to_bazelrc('HOST_C_COMPILER', host_c_compiler)
+
+
+def set_computecpp_toolkit_path(environ_cp):
+  """Set COMPUTECPP_TOOLKIT_PATH."""
+  ask_computecpp_toolkit_path = ('Please specify the location where ComputeCpp '
+                                 'for SYCL %s is installed. [Default is %s]: '
+                                ) % (_TF_OPENCL_VERSION,
+                                     _DEFAULT_COMPUTECPP_TOOLKIT_PATH)
+
+  while True:
+    computecpp_toolkit_path = get_from_env_or_user_or_default(
+        environ_cp, 'COMPUTECPP_TOOLKIT_PATH', ask_computecpp_toolkit_path,
+        _DEFAULT_COMPUTECPP_TOOLKIT_PATH)
+    if is_linux():
+      sycl_rt_lib_path = 'lib/libComputeCpp.so'
+    else:
+      sycl_rt_lib_path = ''
+
+    sycl_rt_lib_path_full = os.path.join(computecpp_toolkit_path,
+                                         sycl_rt_lib_path)
+    if os.path.exists(sycl_rt_lib_path_full):
+      break
+
+    print('Invalid SYCL %s library path. %s cannot be found' %
+          (_TF_OPENCL_VERSION, sycl_rt_lib_path_full))
+    environ_cp['COMPUTECPP_TOOLKIT_PATH'] = ''
+
+  # Set COMPUTECPP_TOOLKIT_PATH
+  environ_cp['COMPUTECPP_TOOLKIT_PATH'] = computecpp_toolkit_path
+  write_action_env_to_bazelrc('COMPUTECPP_TOOLKIT_PATH',
+                              computecpp_toolkit_path)
+
+
+def set_mpi_home(environ_cp):
+  """Set MPI_HOME."""
+  cmd = ('dirname $(dirname $(which mpirun)) || dirname $(dirname $(which '
+         'mpiexec))  || true')
+  default_mpi_home = run_shell(cmd)
+  ask_mpi_home = ('Please specify the MPI toolkit folder. [Default is %s]: '
+                 ) % default_mpi_home
+  while True:
+    mpi_home = get_from_env_or_user_or_default(environ_cp, 'MPI_HOME',
+                                               ask_mpi_home, default_mpi_home)
+
+    if os.path.exists(os.path.join(mpi_home, 'include')) and os.path.exists(
+        os.path.join(mpi_home, 'lib')):
+      break
+
+    print('Invalid path to the MPI Toolkit. %s or %s cannot be found' %
+          (os.path.join(mpi_home, 'include'),
+           os.path.exists(os.path.join(mpi_home, 'lib'))))
+    environ_cp['MPI_HOME'] = ''
+
+  # Set MPI_HOME
+  environ_cp['MPI_HOME'] = str(mpi_home)
+
+
+def set_other_mpi_vars(environ_cp):
+  """Set other MPI related variables."""
+  # Link the MPI header files
+  mpi_home = environ_cp.get('MPI_HOME')
+  symlink_force('%s/include/mpi.h' % mpi_home, 'third_party/mpi/mpi.h')
+
+  # Determine if we use OpenMPI or MVAPICH, these require different header files
+  # to be included here to make bazel dependency checker happy
+  if os.path.exists(os.path.join(mpi_home, 'include/mpi_portable_platform.h')):
+    symlink_force(
+        os.path.join(mpi_home, 'include/mpi_portable_platform.h'),
+        'third_party/mpi/mpi_portable_platform.h')
+    # TODO(gunan): avoid editing files in configure
+    sed_in_place('third_party/mpi/mpi.bzl', 'MPI_LIB_IS_OPENMPI=False',
+                 'MPI_LIB_IS_OPENMPI=True')
+  else:
+    # MVAPICH / MPICH
+    symlink_force(
+        os.path.join(mpi_home, 'include/mpio.h'), 'third_party/mpi/mpio.h')
+    symlink_force(
+        os.path.join(mpi_home, 'include/mpicxx.h'), 'third_party/mpi/mpicxx.h')
+    # TODO(gunan): avoid editing files in configure
+    sed_in_place('third_party/mpi/mpi.bzl', 'MPI_LIB_IS_OPENMPI=True',
+                 'MPI_LIB_IS_OPENMPI=False')
+
+  if os.path.exists(os.path.join(mpi_home, 'lib/libmpi.so')):
+    symlink_force(
+        os.path.join(mpi_home, 'lib/libmpi.so'), 'third_party/mpi/libmpi.so')
+  else:
+    raise ValueError('Cannot find the MPI library file in %s/lib' % mpi_home)
+
+
+def set_mkl():
+  write_to_bazelrc('build:mkl --define with_mkl_support=true')
+  write_to_bazelrc('build:mkl --define using_mkl=true')
+  write_to_bazelrc('build:mkl -c opt')
+  write_to_bazelrc('build:mkl --copt="-DEIGEN_USE_VML"')
+  print(
+      'Add "--config=mkl" to your bazel command to build with MKL '
+      'support.\nPlease note that MKL on MacOS or windows is still not '
+      'supported.\nIf you would like to use a local MKL instead of '
+      'downloading, please set the environment variable \"TF_MKL_ROOT\" every '
+      'time before build.')
+
+
+def main():
+  # Make a copy of os.environ to be clear when functions and getting and setting
+  # environment variables.
+  environ_cp = dict(os.environ)
+
+  check_bazel_version('0.4.5')
+
+  reset_tf_configure_bazelrc()
+  cleanup_makefile()
+  setup_python(environ_cp)
+  run_gen_git_source(environ_cp)
+
+  if is_windows():
+    environ_cp['TF_NEED_GCP'] = '0'
+    environ_cp['TF_NEED_HDFS'] = '0'
+    environ_cp['TF_NEED_JEMALLOC'] = '0'
+    environ_cp['TF_NEED_OPENCL'] = '0'
+    environ_cp['TF_CUDA_CLANG'] = '0'
+
+  if is_macos():
+    environ_cp['TF_NEED_JEMALLOC'] = '0'
+
+  set_build_var(environ_cp, 'TF_NEED_JEMALLOC', 'jemalloc as malloc',
+                'with_jemalloc', True)
+  set_build_var(environ_cp, 'TF_NEED_GCP', 'Google Cloud Platform',
+                'with_gcp_support', False)
+  set_build_var(environ_cp, 'TF_NEED_HDFS', 'Hadoop File System',
+                'with_hdfs_support', False)
+  set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support',
+                False)
+  set_build_var(environ_cp, 'TF_NEED_VERBS', 'VERBS', 'with_verbs_support',
+                False)
+
+  set_action_env_var(environ_cp, 'TF_NEED_OPENCL', 'OpenCL', False)
+  if environ_cp.get('TF_NEED_OPENCL') == '1':
+    set_host_cxx_compiler(environ_cp)
+    set_host_c_compiler(environ_cp)
+    set_computecpp_toolkit_path(environ_cp)
+
+  set_action_env_var(environ_cp, 'TF_NEED_CUDA', 'CUDA', False)
+  if environ_cp.get('TF_NEED_CUDA') == '1':
+    set_tf_cuda_version(environ_cp)
+    set_tf_cunn_version(environ_cp)
+    set_tf_cuda_compute_capabilities(environ_cp)
+
+    set_tf_cuda_clang(environ_cp)
+    if environ_cp.get('TF_CUDA_CLANG') == '1':
+      # Set up which clang we should use as the cuda / host compiler.
+      set_clang_cuda_compiler_path(environ_cp)
+    else:
+      # Set up which gcc nvcc should use as the host compiler
+      # No need to set this on Windows
+      if not is_windows():
+        set_gcc_host_compiler_path(environ_cp)
+    set_other_cuda_vars(environ_cp)
+
+  set_build_var(environ_cp, 'TF_NEED_MPI', 'MPI', 'with_mpi_support', False)
+  if environ_cp.get('TF_NEED_MPI') == '1':
+    set_mpi_home(environ_cp)
+    set_other_mpi_vars(environ_cp)
+
+  set_cc_opt_flags(environ_cp)
+  set_mkl()
+
+
+if __name__ == '__main__':
+  main()
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 6450b2ad878b57191ae3b12e7e39213ac168eef6..a162bcf452515315c621c6c72740413bf21f849d 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -39,7 +39,7 @@ config_setting(
 config_setting(
     name = "android_armeabi",
     values = {
-        "cc_target_os": "android",
+        "crosstool_top": "//external:android/crosstool",
         "cpu": "armeabi",
     },
     visibility = ["//visibility:public"],
@@ -63,6 +63,24 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "android_mips",
+    values = {
+        "crosstool_top": "//external:android/crosstool",
+        "cpu": "mips",
+    },
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "android_mips64",
+    values = {
+        "crosstool_top": "//external:android/crosstool",
+        "cpu": "mips64",
+    },
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
     name = "darwin",
     values = {"cpu": "darwin"},
@@ -178,7 +196,10 @@ config_setting(
 
 package_group(
     name = "internal",
-    packages = ["//tensorflow/..."],
+    packages = [
+        "//learning/protonn/llgtm/...",
+        "//tensorflow/...",
+    ],
 )
 
 filegroup(
@@ -216,9 +237,12 @@ filegroup(
         "//tensorflow/compiler/jit/kernels:all_files",
         "//tensorflow/compiler/jit/legacy_flags:all_files",
         "//tensorflow/compiler/jit/ops:all_files",
+        "//tensorflow/compiler/plugin/executor:all_files",
         "//tensorflow/compiler/tests:all_files",
         "//tensorflow/compiler/tf2xla:all_files",
+        "//tensorflow/compiler/tf2xla/cc:all_files",
         "//tensorflow/compiler/tf2xla/kernels:all_files",
+        "//tensorflow/compiler/tf2xla/ops:all_files",
         "//tensorflow/compiler/xla:all_files",
         "//tensorflow/compiler/xla/client:all_files",
         "//tensorflow/compiler/xla/client/lib:all_files",
@@ -238,6 +262,7 @@ filegroup(
         "//tensorflow/contrib/batching/util:all_files",
         "//tensorflow/contrib/bayesflow:all_files",
         "//tensorflow/contrib/boosted_trees:all_files",
+        "//tensorflow/contrib/boosted_trees/estimator_batch:all_files",
         "//tensorflow/contrib/boosted_trees/lib:all_files",
         "//tensorflow/contrib/boosted_trees/proto:all_files",
         "//tensorflow/contrib/boosted_trees/resources:all_files",
@@ -253,13 +278,14 @@ filegroup(
         "//tensorflow/contrib/data/python/kernel_tests:all_files",
         "//tensorflow/contrib/data/python/ops:all_files",
         "//tensorflow/contrib/data/python/util:all_files",
-        "//tensorflow/contrib/decision_trees:all_files",
+        "//tensorflow/contrib/decision_trees/proto:all_files",
         "//tensorflow/contrib/distributions:all_files",
         "//tensorflow/contrib/factorization:all_files",
         "//tensorflow/contrib/factorization/kernels:all_files",
         "//tensorflow/contrib/ffmpeg:all_files",
         "//tensorflow/contrib/ffmpeg/default:all_files",
         "//tensorflow/contrib/framework:all_files",
+        "//tensorflow/contrib/fused_conv:all_files",
         "//tensorflow/contrib/graph_editor:all_files",
         "//tensorflow/contrib/grid_rnn:all_files",
         "//tensorflow/contrib/hooks:all_files",
@@ -284,6 +310,9 @@ filegroup(
         "//tensorflow/contrib/ndlstm:all_files",
         "//tensorflow/contrib/nn:all_files",
         "//tensorflow/contrib/opt:all_files",
+        "//tensorflow/contrib/predictor:all_files",
+        "//tensorflow/contrib/remote_fused_graph/pylib:all_files",
+        "//tensorflow/contrib/resampler:all_files",
         "//tensorflow/contrib/rnn:all_files",
         "//tensorflow/contrib/saved_model:all_files",
         "//tensorflow/contrib/saved_model/cc/saved_model:all_files",
@@ -302,10 +331,17 @@ filegroup(
         "//tensorflow/contrib/stateless:all_files",
         "//tensorflow/contrib/tensor_forest:all_files",
         "//tensorflow/contrib/tensor_forest/hybrid:all_files",
+        "//tensorflow/contrib/tensor_forest/kernels/v4:all_files",
+        "//tensorflow/contrib/tensor_forest/proto:all_files",
         "//tensorflow/contrib/tensorboard:all_files",
         "//tensorflow/contrib/testing:all_files",
         "//tensorflow/contrib/text:all_files",
-        "//tensorflow/contrib/tfprof/python/tools/tfprof:all_files",
+        "//tensorflow/contrib/tfprof:all_files",
+        "//tensorflow/contrib/timeseries:all_files",
+        "//tensorflow/contrib/timeseries/examples:all_files",
+        "//tensorflow/contrib/timeseries/python/timeseries:all_files",
+        "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:all_files",
+        "//tensorflow/contrib/tpu:all_files",
         "//tensorflow/contrib/training:all_files",
         "//tensorflow/contrib/util:all_files",
         "//tensorflow/contrib/verbs:all_files",
@@ -321,12 +357,16 @@ filegroup(
         "//tensorflow/core/grappler/optimizers:all_files",
         "//tensorflow/core/grappler/utils:all_files",
         "//tensorflow/core/kernels:all_files",
+        "//tensorflow/core/kernels/fuzzing:all_files",
         "//tensorflow/core/kernels/hexagon:all_files",
         "//tensorflow/core/kernels/neon:all_files",
         "//tensorflow/core/ops/compat:all_files",
         "//tensorflow/core/platform/cloud:all_files",
         "//tensorflow/core/platform/default/build_config:all_files",
         "//tensorflow/core/platform/hadoop:all_files",
+        "//tensorflow/core/profiler:all_files",
+        "//tensorflow/core/profiler/internal:all_files",
+        "//tensorflow/core/profiler/internal/advisor:all_files",
         "//tensorflow/core/util/ctc:all_files",
         "//tensorflow/core/util/tensor_bundle:all_files",
         "//tensorflow/examples/android:all_files",
@@ -351,72 +391,10 @@ filegroup(
         "//tensorflow/python/kernel_tests:all_files",
         "//tensorflow/python/kernel_tests/distributions:all_files",
         "//tensorflow/python/ops/distributions:all_files",
+        "//tensorflow/python/profiler:all_files",
+        "//tensorflow/python/profiler/internal:all_files",
         "//tensorflow/python/saved_model:all_files",
         "//tensorflow/python/tools:all_files",
-        "//tensorflow/tensorboard:all_files",
-        "//tensorflow/tensorboard/backend:all_files",
-        "//tensorflow/tensorboard/backend/event_processing:all_files",
-        "//tensorflow/tensorboard/components:all_files",
-        "//tensorflow/tensorboard/components/tf_audio_dashboard:all_files",
-        "//tensorflow/tensorboard/components/tf_audio_dashboard/test:all_files",
-        "//tensorflow/tensorboard/components/tf_backend:all_files",
-        "//tensorflow/tensorboard/components/tf_backend/test:all_files",
-        "//tensorflow/tensorboard/components/tf_color_scale:all_files",
-        "//tensorflow/tensorboard/components/tf_color_scale/test:all_files",
-        "//tensorflow/tensorboard/components/tf_dashboard_common:all_files",
-        "//tensorflow/tensorboard/components/tf_dashboard_common/test:all_files",
-        "//tensorflow/tensorboard/components/tf_distribution_dashboard:all_files",
-        "//tensorflow/tensorboard/components/tf_globals:all_files",
-        "//tensorflow/tensorboard/components/tf_graph:all_files",
-        "//tensorflow/tensorboard/components/tf_graph/demo:all_files",
-        "//tensorflow/tensorboard/components/tf_graph_app:all_files",
-        "//tensorflow/tensorboard/components/tf_graph_app/demo:all_files",
-        "//tensorflow/tensorboard/components/tf_graph_board:all_files",
-        "//tensorflow/tensorboard/components/tf_graph_board/demo:all_files",
-        "//tensorflow/tensorboard/components/tf_graph_common:all_files",
-        "//tensorflow/tensorboard/components/tf_graph_controls:all_files",
-        "//tensorflow/tensorboard/components/tf_graph_controls/demo:all_files",
-        "//tensorflow/tensorboard/components/tf_graph_dashboard:all_files",
-        "//tensorflow/tensorboard/components/tf_graph_dashboard/demo:all_files",
-        "//tensorflow/tensorboard/components/tf_graph_debugger_data_card:all_files",
-        "//tensorflow/tensorboard/components/tf_graph_debugger_data_card/demo:all_files",
-        "//tensorflow/tensorboard/components/tf_graph_info:all_files",
-        "//tensorflow/tensorboard/components/tf_graph_info/demo:all_files",
-        "//tensorflow/tensorboard/components/tf_graph_loader:all_files",
-        "//tensorflow/tensorboard/components/tf_graph_loader/demo:all_files",
-        "//tensorflow/tensorboard/components/tf_histogram_dashboard:all_files",
-        "//tensorflow/tensorboard/components/tf_image_dashboard:all_files",
-        "//tensorflow/tensorboard/components/tf_imports:all_files",
-        "//tensorflow/tensorboard/components/tf_option_selector:all_files",
-        "//tensorflow/tensorboard/components/tf_profile_dashboard:all_files",
-        "//tensorflow/tensorboard/components/tf_profile_dashboard/demo:all_files",
-        "//tensorflow/tensorboard/components/tf_runs_selector:all_files",
-        "//tensorflow/tensorboard/components/tf_scalar_dashboard:all_files",
-        "//tensorflow/tensorboard/components/tf_scalar_dashboard/demo:all_files",
-        "//tensorflow/tensorboard/components/tf_storage:all_files",
-        "//tensorflow/tensorboard/components/tf_storage/test:all_files",
-        "//tensorflow/tensorboard/components/tf_tensorboard:all_files",
-        "//tensorflow/tensorboard/components/tf_text_dashboard:all_files",
-        "//tensorflow/tensorboard/components/tf_trace_viewer:all_files",
-        "//tensorflow/tensorboard/components/vz_distribution_chart:all_files",
-        "//tensorflow/tensorboard/components/vz_histogram_timeseries:all_files",
-        "//tensorflow/tensorboard/components/vz_line_chart:all_files",
-        "//tensorflow/tensorboard/components/vz_projector:all_files",
-        "//tensorflow/tensorboard/components/vz_projector/test:all_files",
-        "//tensorflow/tensorboard/components/vz_sorting:all_files",
-        "//tensorflow/tensorboard/components/vz_sorting/test:all_files",
-        "//tensorflow/tensorboard/demo:all_files",
-        "//tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize:all_files",
-        "//tensorflow/tensorboard/plugins:all_files",
-        "//tensorflow/tensorboard/plugins/audio:all_files",
-        "//tensorflow/tensorboard/plugins/distributions:all_files",
-        "//tensorflow/tensorboard/plugins/graphs:all_files",
-        "//tensorflow/tensorboard/plugins/histograms:all_files",
-        "//tensorflow/tensorboard/plugins/images:all_files",
-        "//tensorflow/tensorboard/plugins/projector:all_files",
-        "//tensorflow/tensorboard/plugins/scalars:all_files",
-        "//tensorflow/tensorboard/plugins/text:all_files",
-        "//tensorflow/tensorboard/scripts:all_files",
         "//tensorflow/tools/api/golden:all_files",
         "//tensorflow/tools/api/lib:all_files",
         "//tensorflow/tools/api/tests:all_files",
@@ -427,12 +405,10 @@ filegroup(
         "//tensorflow/tools/docker/notebooks:all_files",
         "//tensorflow/tools/docs:all_files",
         "//tensorflow/tools/git:all_files",
+        "//tensorflow/tools/mlpbtxt:all_files",
         "//tensorflow/tools/proto_text:all_files",
         "//tensorflow/tools/quantization:all_files",
         "//tensorflow/tools/test:all_files",
-        "//tensorflow/tools/tfprof:all_files",
-        "//tensorflow/tools/tfprof/internal:all_files",
-        "//tensorflow/tools/tfprof/internal/advisor:all_files",
         "//tensorflow/user_ops:all_files",
         "//third_party/hadoop:all_files",
         "//third_party/sycl:all_files",
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index 3ab4e8efcdb5b05cf8922edd302e7cbf3a3597f1..507b2fe1f1420661585b573da541e33e39e5adf5 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -62,6 +62,7 @@ tf_cuda_library(
             "//tensorflow/cc:scope_internal",
             "//tensorflow/core:core_cpu",
             "//tensorflow/core:framework",
+            "//tensorflow/core:protos_all_cc",
             "//tensorflow/core:lib",
         ],
     }),
@@ -102,6 +103,19 @@ tf_cuda_library(
 # -----------------------------------------------------------------------------
 # Tests
 
+tf_cuda_library(
+    name = "c_test_util",
+    testonly = 1,
+    srcs = ["c_test_util.cc"],
+    hdrs = ["c_test_util.h"],
+    deps = [
+        ":c_api",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+    ],
+)
+
 tf_cc_test(
     name = "c_api_test",
     size = "small",
@@ -119,6 +133,7 @@ tf_cc_test(
     # linkstatic = tf_kernel_tests_linkstatic(),
     deps = [
         ":c_api",
+        ":c_test_util",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:grad_ops",
         "//tensorflow/cc/saved_model:signature_constants",
@@ -138,11 +153,38 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "while_loop_test",
+    size = "small",
+    srcs = ["while_loop_test.cc"],
+    deps = [
+        ":c_api",
+        ":c_test_util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 tf_custom_op_library(
     name = "test_op.so",
     srcs = ["test_op.cc"],
 )
 
+# -----------------------------------------------------------------------------
+# Python API target
+
+tf_cuda_library(
+    name = "python_api",
+    srcs = ["python_api.cc"],
+    hdrs = ["python_api.h"],
+    visibility = ["//tensorflow/python:__pkg__"],
+    deps = [
+        ":c_api",
+        ":c_api_internal",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 # Google-internal targets.
 
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 77faa475ed47990a4dcee0e1ca0925af0c1643f9..371264ef6c20dbaa8263668eb526d49bb25c50c0 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -28,12 +28,15 @@ limitations under the License.
 #endif
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
@@ -53,21 +56,16 @@ limitations under the License.
 
 // The implementation below is at the top level instead of the
 // brain namespace because we are defining 'extern "C"' functions.
-using tensorflow::error::Code;
-using tensorflow::errors::InvalidArgument;
-using tensorflow::gtl::ArraySlice;
-using tensorflow::strings::StrCat;
 using tensorflow::AllocationDescription;
 using tensorflow::DataType;
 using tensorflow::Graph;
 using tensorflow::GraphDef;
-using tensorflow::mutex_lock;
 using tensorflow::NameRangeMap;
 using tensorflow::NameRangesForNode;
 using tensorflow::NewSession;
 using tensorflow::Node;
-using tensorflow::NodeDef;
 using tensorflow::NodeBuilder;
+using tensorflow::NodeDef;
 using tensorflow::OpDef;
 using tensorflow::OpRegistry;
 using tensorflow::PartialTensorShape;
@@ -80,6 +78,11 @@ using tensorflow::TensorBuffer;
 using tensorflow::TensorId;
 using tensorflow::TensorShape;
 using tensorflow::TensorShapeProto;
+using tensorflow::error::Code;
+using tensorflow::errors::InvalidArgument;
+using tensorflow::gtl::ArraySlice;
+using tensorflow::mutex_lock;
+using tensorflow::strings::StrCat;
 
 extern "C" {
 
@@ -163,7 +166,7 @@ Status MessageToBuffer(const tensorflow::protobuf::Message& in,
   if (out->data != nullptr) {
     return InvalidArgument("Passing non-empty TF_Buffer is invalid.");
   }
-  const auto proto_size = in.ByteSize();
+  const auto proto_size = in.ByteSizeLong();
   void* buf = tensorflow::port::Malloc(proto_size);
   in.SerializeToArray(buf, proto_size);
   out->data = buf;
@@ -255,24 +258,27 @@ size_t TF_StringEncode(const char* src, size_t src_len, char* dst,
   return sz;
 }
 
-size_t TF_StringDecode(const char* src, size_t src_len, const char** dst,
-                       size_t* dst_len, TF_Status* status) {
+static Status TF_StringDecode_Impl(const char* src, size_t src_len,
+                                   const char** dst, size_t* dst_len) {
   tensorflow::uint64 len64 = 0;
   const char* p = tensorflow::core::GetVarint64Ptr(src, src + src_len, &len64);
   if (p == nullptr) {
-    status->status =
-        InvalidArgument("invalid string encoding or truncated src buffer");
-    return 0;
+    return InvalidArgument("invalid string encoding or truncated src buffer");
   }
   if (len64 > std::numeric_limits<size_t>::max()) {
-    status->status =
-        InvalidArgument("encoded string is ", len64,
-                        "-bytes, which is too large for this architecture");
-    return 0;
+    return InvalidArgument("encoded string is ", len64,
+                           "-bytes, which is too large for this architecture");
   }
   *dst = p;
   *dst_len = static_cast<size_t>(len64);
-  return static_cast<size_t>(p - src) + *dst_len;
+  return Status::OK();
+}
+
+size_t TF_StringDecode(const char* src, size_t src_len, const char** dst,
+                       size_t* dst_len, TF_Status* status) {
+  status->status = TF_StringDecode_Impl(src, src_len, dst, dst_len);
+  if (!status->status.ok()) return 0;
+  return static_cast<size_t>(*dst - src) + *dst_len;
 }
 
 size_t TF_StringEncodedSize(size_t len) {
@@ -388,16 +394,20 @@ void TF_Reset(const TF_SessionOptions* opt, const char** containers,
 
 namespace tensorflow {
 
-// Non-static for testing.
-bool TF_Tensor_DecodeStrings(TF_Tensor* src, Tensor* dst, TF_Status* status) {
+Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst) {
+  if (src->dtype != TF_STRING) {
+    *dst = TensorCApi::MakeTensor(src->dtype, src->shape, src->buffer);
+    return Status::OK();
+  }
+  // TF_STRING tensors require copying since Tensor class expects a sequence of
+  // string objects.
   const tensorflow::int64 num_elements = src->shape.num_elements();
   const char* input = reinterpret_cast<const char*>(TF_TensorData(src));
   const size_t src_size = TF_TensorByteSize(src);
   if (static_cast<tensorflow::int64>(src_size / sizeof(tensorflow::uint64)) <
       num_elements) {
-    status->status = InvalidArgument(
+    return InvalidArgument(
         "Malformed TF_STRING tensor; too short to hold number of elements");
-    return false;
   }
   const char* data_start = input + sizeof(tensorflow::uint64) * num_elements;
   const char* limit = input + src_size;
@@ -408,24 +418,30 @@ bool TF_Tensor_DecodeStrings(TF_Tensor* src, Tensor* dst, TF_Status* status) {
     tensorflow::uint64 offset =
         reinterpret_cast<const tensorflow::uint64*>(input)[i];
     if (static_cast<ptrdiff_t>(offset) >= (limit - data_start)) {
-      status->status = InvalidArgument("Malformed TF_STRING tensor; element ",
-                                       i, " out of range");
-      return false;
+      return InvalidArgument("Malformed TF_STRING tensor; element ", i,
+                             " out of range");
     }
     size_t len;
     const char* p;
     const char* srcp = data_start + offset;
-    TF_StringDecode(srcp, limit - srcp, &p, &len, status);
-    if (!status->status.ok()) {
-      return false;
-    }
+    Status status = TF_StringDecode_Impl(srcp, limit - srcp, &p, &len);
+    if (!status.ok()) return status;
     dstarray(i).assign(p, len);
   }
-  return true;
+  return Status::OK();
 }
 
 // Non-static for testing.
-TF_Tensor* TF_Tensor_EncodeStrings(const Tensor& src) {
+TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src) {
+  if (src.dtype() != DT_STRING) {
+    TensorBuffer* buf = TensorCApi::Buffer(src);
+    buf->Ref();
+    return new TF_Tensor{static_cast<TF_DataType>(src.dtype()), src.shape(),
+                         buf};
+  }
+  // DT_STRING tensors require a copying since TF_Tensor.buffer expects a flatly
+  // encoded sequence of strings.
+
   // Compute bytes needed for encoding.
   size_t size = 0;
   const auto& srcarray = src.flat<tensorflow::string>();
@@ -466,15 +482,6 @@ TF_Tensor* TF_Tensor_EncodeStrings(const Tensor& src) {
                       dimvec.size(), base, size, DeleteArray, base);
 }
 
-class TensorCApi {
- public:
-  static TensorBuffer* Buffer(const Tensor& tensor) { return tensor.buf_; }
-  static Tensor MakeTensor(TF_DataType type, const TensorShape& shape,
-                           TensorBuffer* buf) {
-    return Tensor(static_cast<DataType>(type), shape, buf);
-  }
-};
-
 // Create an empty tensor of type 'dtype'. 'shape' can be arbitrary, but has to
 // result in a zero-sized tensor.
 static TF_Tensor* EmptyTensor(TF_DataType dtype, const TensorShape& shape) {
@@ -513,16 +520,8 @@ static bool TF_Run_Inputs(
     TF_Status* status) {
   const int ninputs = input_pairs->size();
   for (int i = 0; i < ninputs; ++i) {
-    TF_Tensor* src = c_inputs[i];
-    if (c_inputs[i]->dtype != TF_STRING) {
-      (*input_pairs)[i].second = tensorflow::TensorCApi::MakeTensor(
-          src->dtype, src->shape, src->buffer);
-    } else if (!tensorflow::TF_Tensor_DecodeStrings(
-                   src, &(*input_pairs)[i].second, status)) {
-      // TF_STRING tensors require copying since Tensor class expects
-      // a sequence of string objects.
-      return false;
-    }
+    status->status = TF_TensorToTensor(c_inputs[i], &(*input_pairs)[i].second);
+    if (!status->status.ok()) return false;
   }
   return true;
 }
@@ -580,15 +579,7 @@ static void TF_Run_Helper(
           static_cast<TF_DataType>(src.dtype()), src.shape());
       continue;
     }
-    if (src.dtype() != tensorflow::DT_STRING) {
-      // Share the underlying buffer.
-      TensorBuffer* buf = tensorflow::TensorCApi::Buffer(src);
-      buf->Ref();
-      c_outputs[i] = new TF_Tensor{static_cast<TF_DataType>(src.dtype()),
-                                   src.shape(), buf};
-    } else {
-      c_outputs[i] = tensorflow::TF_Tensor_EncodeStrings(src);
-    }
+    c_outputs[i] = TF_TensorFromTensor(src);
   }
 }
 
@@ -628,7 +619,7 @@ void TF_PRunSetup(TF_DeprecatedSession* s,
                   // Target nodes
                   const char** c_target_oper_names, int ntargets,
                   const char** handle, TF_Status* status) {
-  status->status = Status::OK();
+  *handle = nullptr;
 
   std::vector<tensorflow::string> input_names(ninputs);
   std::vector<tensorflow::string> output_names(noutputs);
@@ -643,16 +634,12 @@ void TF_PRunSetup(TF_DeprecatedSession* s,
     target_oper_names[i] = c_target_oper_names[i];
   }
   tensorflow::string new_handle;
-  Status result;
-  result = s->session->PRunSetup(input_names, output_names, target_oper_names,
-                                 &new_handle);
-  if (result.ok()) {
+  status->status = s->session->PRunSetup(input_names, output_names,
+                                         target_oper_names, &new_handle);
+  if (status->status.ok()) {
     char* buf = new char[new_handle.size() + 1];
     memcpy(buf, new_handle.c_str(), new_handle.size() + 1);
     *handle = buf;
-  } else {
-    *handle = nullptr;
-    status->status = result;
   }
 }
 
@@ -1072,20 +1059,9 @@ void TF_SetAttrTensorShapeProtoList(TF_OperationDescription* desc,
 
 void TF_SetAttrTensor(TF_OperationDescription* desc, const char* attr_name,
                       TF_Tensor* value, TF_Status* status) {
-  status->status = Status::OK();
   Tensor t;
-  bool ok = true;
-
-  if (value->dtype != TF_STRING) {
-    t = tensorflow::TensorCApi::MakeTensor(value->dtype, value->shape,
-                                           value->buffer);
-  } else {
-    // TF_STRING tensors require copying since Tensor class expects
-    // a sequence of string objects.
-    ok = tensorflow::TF_Tensor_DecodeStrings(value, &t, status);
-  }
-
-  if (ok) desc->node_builder.Attr(attr_name, t);
+  status->status = TF_TensorToTensor(value, &t);
+  if (status->status.ok()) desc->node_builder.Attr(attr_name, t);
 }
 
 void TF_SetAttrTensorList(TF_OperationDescription* desc, const char* attr_name,
@@ -1094,21 +1070,14 @@ void TF_SetAttrTensorList(TF_OperationDescription* desc, const char* attr_name,
   status->status = Status::OK();
   std::vector<Tensor> t;
   t.reserve(num_values);
-  bool ok = true;
 
-  for (int i = 0; i < num_values && ok; ++i) {
-    if (values[i]->dtype != TF_STRING) {
-      t.emplace_back(tensorflow::TensorCApi::MakeTensor(
-          values[i]->dtype, values[i]->shape, values[i]->buffer));
-    } else {
-      t.emplace_back(::tensorflow::DT_STRING);
-      // TF_STRING tensors require copying since Tensor class expects
-      // a sequence of string objects.
-      ok = tensorflow::TF_Tensor_DecodeStrings(values[i], &t.back(), status);
-    }
+  for (int i = 0; i < num_values && status->status.ok(); ++i) {
+    Tensor v;
+    status->status = TF_TensorToTensor(values[i], &v);
+    t.emplace_back(v);
   }
 
-  if (ok) desc->node_builder.Attr(attr_name, t);
+  if (status->status.ok()) desc->node_builder.Attr(attr_name, t);
 }
 
 void TF_SetAttrValueProto(TF_OperationDescription* desc, const char* attr_name,
@@ -1565,9 +1534,7 @@ void TF_OperationGetAttrTensor(TF_Operation* oper, const char* attr_name,
   Tensor t;
   status->status = tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &t);
   if (!status->status.ok()) return;
-  *value = new TF_Tensor{static_cast<TF_DataType>(t.dtype()), t.shape(),
-                         tensorflow::TensorCApi::Buffer(t)};
-  (*value)->buffer->Ref();
+  *value = TF_TensorFromTensor(t);
 }
 
 void TF_OperationGetAttrTensorList(TF_Operation* oper, const char* attr_name,
@@ -1578,10 +1545,7 @@ void TF_OperationGetAttrTensorList(TF_Operation* oper, const char* attr_name,
   if (!status->status.ok()) return;
   const auto len = std::min(max_values, static_cast<int>(ts.size()));
   for (int i = 0; i < len; ++i) {
-    const Tensor& t = ts[i];
-    values[i] = new TF_Tensor{static_cast<TF_DataType>(t.dtype()), t.shape(),
-                              tensorflow::TensorCApi::Buffer(t)};
-    values[i]->buffer->Ref();
+    values[i] = TF_TensorFromTensor(ts[i]);
   }
 }
 
@@ -1600,6 +1564,14 @@ void TF_OperationToNodeDef(TF_Operation* oper, TF_Buffer* output_node_def,
 
 // TF_Graph functions ---------------------------------------------------------
 
+TF_Graph::TF_Graph()
+    : graph(tensorflow::OpRegistry::Global()),
+      refiner(graph.versions().producer(), graph.op_registry()),
+      num_sessions(0),
+      delete_requested(false),
+      parent(nullptr),
+      parent_inputs(nullptr) {}
+
 TF_Graph* TF_NewGraph() { return new TF_Graph; }
 
 void TF_DeleteGraph(TF_Graph* g) {
@@ -2119,7 +2091,8 @@ void TF_AddGradients(TF_Graph* g, TF_Output* y, int ny, TF_Output* x, int nx,
     const int max_node_id_before = g->graph.num_node_ids();
 
     tensorflow::Scope scope =
-        NewInternalScope(&g->graph, &status->status, &g->refiner);
+        NewInternalScope(&g->graph, &status->status, &g->refiner)
+            .NewSubScope("gradients");
 
     if (dx != nullptr) {
       std::vector<tensorflow::Output> dx_arg;
@@ -2326,6 +2299,8 @@ void TF_SessionPRunSetup(TF_Session* session, const TF_Output* inputs,
                          int ninputs, const TF_Output* outputs, int noutputs,
                          const TF_Operation* const* target_opers, int ntargets,
                          const char** handle, TF_Status* status) {
+  *handle = nullptr;
+
   if (!ExtendSessionGraphHelper(session, status)) {
     return;
   }
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index 15139a47acf4b5bcf7a6b6fd77de5834f3f9189c..46758408c44ea4170abd4282294b77b07c762389 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -117,6 +117,7 @@ typedef enum TF_DataType {
   TF_COMPLEX128 = 18,  // Double-precision complex
   TF_HALF = 19,
   TF_RESOURCE = 20,
+  TF_VARIANT = 21,
 } TF_DataType;
 
 // TF_DataTypeSize returns the sizeof() for the underlying type corresponding
@@ -1101,8 +1102,7 @@ TF_CAPI_EXPORT extern void TF_SessionRun(
 // needed.
 //
 // On failure, out_status contains a tensorflow::Status with an error
-// message.
-// NOTE: This is EXPERIMENTAL and subject to change.
+// message. *handle is set to nullptr.
 TF_CAPI_EXPORT extern void TF_SessionPRunSetup(
     TF_Session*,
     // Input names
@@ -1118,7 +1118,6 @@ TF_CAPI_EXPORT extern void TF_SessionPRunSetup(
 
 // Continue to run the graph with additional feeds and fetches. The
 // execution state is uniquely identified by the handle.
-// NOTE: This is EXPERIMENTAL and subject to change.
 TF_CAPI_EXPORT extern void TF_SessionPRun(
     TF_Session*, const char* handle,
     // Input tensors
diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h
index f17ac26ad9665d7ea8cc1ef566cad81bba712b62..d077ad264b198d7f4b29dbd58808b09c8239a28e 100644
--- a/tensorflow/c/c_api_internal.h
+++ b/tensorflow/c/c_api_internal.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_C_C_API_INTERNAL_H_
+#define TENSORFLOW_C_C_API_INTERNAL_H_
+
 #include "tensorflow/c/c_api.h"
 
 #include <vector>
@@ -56,13 +59,8 @@ struct TF_Library {
 };
 
 struct TF_Graph {
-  TF_Graph()
-      : graph(tensorflow::OpRegistry::Global()),
-        refiner(graph.versions().producer(), graph.op_registry()),
-        num_sessions(0),
-        delete_requested(false),
-        parent(nullptr),
-        parent_inputs(nullptr) {}
+  TF_Graph();
+
   tensorflow::mutex mu;
   tensorflow::Graph graph GUARDED_BY(mu);
 
@@ -117,3 +115,18 @@ struct TF_ImportGraphDefOptions {
 struct TF_DeviceList {
   std::vector<tensorflow::DeviceAttributes> response;
 };
+
+namespace tensorflow {
+
+class TensorCApi {
+ public:
+  static TensorBuffer* Buffer(const Tensor& tensor) { return tensor.buf_; }
+  static Tensor MakeTensor(TF_DataType type, const TensorShape& shape,
+                           TensorBuffer* buf) {
+    return Tensor(static_cast<DataType>(type), shape, buf);
+  }
+};
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_C_C_API_INTERNAL_H_
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index 04540bd793dab34c2f707e9e995defe7b4e15858..25b6cbd8e7ad4b92b8ecbafe87c040190ad18b58 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -16,9 +16,12 @@ limitations under the License.
 #include "tensorflow/c/c_api.h"
 
 #include <algorithm>
+#include <cstddef>
 #include <iterator>
 #include <memory>
 #include <vector>
+
+#include "tensorflow/c/c_test_util.h"
 #include "tensorflow/cc/saved_model/signature_constants.h"
 #include "tensorflow/cc/saved_model/tag_constants.h"
 #include "tensorflow/core/example/example.pb.h"
@@ -41,24 +44,13 @@ limitations under the License.
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
 #include "tensorflow/core/util/equal_graph_def.h"
 
-using tensorflow::int32;
-using tensorflow::string;
-using tensorflow::GraphDef;
-using tensorflow::NodeDef;
-using tensorflow::Tensor;
-using tensorflow::TensorShape;
-
 namespace tensorflow {
-bool TF_Tensor_DecodeStrings(TF_Tensor* src, Tensor* dst, TF_Status* status);
-TF_Tensor* TF_Tensor_EncodeStrings(const Tensor& src);
-}  // namespace tensorflow
+TF_Tensor* TF_TensorFromTensor(const Tensor& src);
+Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst);
 
 namespace {
 
-typedef std::unique_ptr<TF_Tensor, decltype(&TF_DeleteTensor)>
-    unique_tensor_ptr;
-
-TEST(CAPI, Version) { EXPECT_NE("", string(TF_Version())); }
+TEST(CAPI, Version) { EXPECT_STRNE("", TF_Version()); }
 
 TEST(CAPI, Status) {
   TF_Status* s = TF_NewStatus();
@@ -70,7 +62,7 @@ TEST(CAPI, Status) {
   TF_DeleteStatus(s);
 }
 
-static void Deallocator(void* data, size_t, void* arg) {
+void Deallocator(void* data, size_t, void* arg) {
   tensorflow::cpu_allocator()->DeallocateRaw(data);
   *reinterpret_cast<bool*>(arg) = true;
 }
@@ -143,7 +135,7 @@ TEST(CAPI, LibraryLoadFunctions) {
   TF_DeleteLibraryHandle(lib);
 }
 
-static void TestEncodeDecode(int line, const std::vector<string>& data) {
+void TestEncodeDecode(int line, const std::vector<string>& data) {
   const tensorflow::int64 n = data.size();
   for (const std::vector<tensorflow::int64>& dims :
        std::vector<std::vector<tensorflow::int64>>{
@@ -153,19 +145,16 @@ static void TestEncodeDecode(int line, const std::vector<string>& data) {
     for (tensorflow::int64 i = 0; i < src.NumElements(); ++i) {
       src.flat<string>()(i) = data[i];
     }
-    TF_Tensor* dst = TF_Tensor_EncodeStrings(src);
+    TF_Tensor* dst = TF_TensorFromTensor(src);
 
     // Convert back to a C++ Tensor and ensure we get expected output.
-    TF_Status* status = TF_NewStatus();
     Tensor output;
-    ASSERT_TRUE(TF_Tensor_DecodeStrings(dst, &output, status)) << line;
-    ASSERT_EQ(TF_OK, TF_GetCode(status)) << line;
+    ASSERT_EQ(Status::OK(), TF_TensorToTensor(dst, &output)) << line;
     ASSERT_EQ(src.NumElements(), output.NumElements()) << line;
     for (tensorflow::int64 i = 0; i < src.NumElements(); ++i) {
       ASSERT_EQ(data[i], output.flat<string>()(i)) << line;
     }
 
-    TF_DeleteStatus(status);
     TF_DeleteTensor(dst);
   }
 }
@@ -275,194 +264,6 @@ TEST(CAPI, GetAllOpList) {
   TF_DeleteBuffer(buf);
 }
 
-static void Int32Deallocator(void* data, size_t, void* arg) {
-  delete[] static_cast<int32*>(data);
-}
-
-// Create a tensor with values of type TF_INT8 provided by `values`.
-static TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims,
-                             const char* values) {
-  int64_t num_values = 1;
-  for (int i = 0; i < num_dims; ++i) {
-    num_values *= dims[i];
-  }
-  TF_Tensor* t =
-      TF_AllocateTensor(TF_INT8, dims, num_dims, sizeof(char) * num_values);
-  memcpy(TF_TensorData(t), values, sizeof(char) * num_values);
-  return t;
-}
-
-static TF_Tensor* Int32Tensor(int32 v) {
-  const int num_bytes = sizeof(int32);
-  int32* values = new int32[1];
-  values[0] = v;
-  return TF_NewTensor(TF_INT32, nullptr, 0, values, num_bytes,
-                      &Int32Deallocator, nullptr);
-}
-
-TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s,
-                          const char* name = "feed") {
-  TF_OperationDescription* desc = TF_NewOperation(graph, "Placeholder", name);
-  TF_SetAttrType(desc, "dtype", TF_INT32);
-  return TF_FinishOperation(desc, s);
-}
-
-TF_Operation* Const(TF_Tensor* t, TF_Graph* graph, TF_Status* s,
-                    const char* name = "const") {
-  TF_OperationDescription* desc = TF_NewOperation(graph, "Const", name);
-  TF_SetAttrTensor(desc, "value", t, s);
-  if (TF_GetCode(s) != TF_OK) return nullptr;
-  TF_SetAttrType(desc, "dtype", TF_TensorType(t));
-  return TF_FinishOperation(desc, s);
-}
-
-TF_Operation* ScalarConst(int32 v, TF_Graph* graph, TF_Status* s,
-                          const char* name = "scalar") {
-  unique_tensor_ptr tensor(Int32Tensor(v), TF_DeleteTensor);
-  return Const(tensor.get(), graph, s, name);
-}
-
-TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
-                  TF_Status* s, const char* name = "add") {
-  TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name);
-  TF_Output add_inputs[2] = {{l, 0}, {r, 0}};
-  TF_AddInputList(desc, add_inputs, 2);
-  return TF_FinishOperation(desc, s);
-}
-
-TF_Operation* Add(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s,
-                  const char* name = "add") {
-  TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name);
-  TF_Output inputs[2] = {l, r};
-  TF_AddInputList(desc, inputs, 2);
-  return TF_FinishOperation(desc, s);
-}
-
-TF_Operation* Neg(TF_Operation* n, TF_Graph* graph, TF_Status* s) {
-  TF_OperationDescription* desc = TF_NewOperation(graph, "Neg", "neg");
-  TF_Output neg_input = {n, 0};
-  TF_AddInput(desc, neg_input);
-  return TF_FinishOperation(desc, s);
-}
-
-TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph,
-                       TF_Status* s) {
-  TF_OperationDescription* desc = TF_NewOperation(graph, "Less", "less_than");
-  TF_AddInput(desc, l);
-  TF_AddInput(desc, r);
-  return TF_FinishOperation(desc, s);
-}
-
-bool IsPlaceholder(const NodeDef& node_def) {
-  if (node_def.op() != "Placeholder" || node_def.name() != "feed") {
-    return false;
-  }
-  bool found_dtype = false;
-  bool found_shape = false;
-  for (const auto& attr : node_def.attr()) {
-    if (attr.first == "dtype") {
-      if (attr.second.type() == tensorflow::DT_INT32) {
-        found_dtype = true;
-      } else {
-        return false;
-      }
-    } else if (attr.first == "shape") {
-      found_shape = true;
-    }
-  }
-  return found_dtype && found_shape;
-}
-
-bool IsScalarConst(const NodeDef& node_def, int v) {
-  if (node_def.op() != "Const" || node_def.name() != "scalar") {
-    return false;
-  }
-  bool found_dtype = false;
-  bool found_value = false;
-  for (const auto& attr : node_def.attr()) {
-    if (attr.first == "dtype") {
-      if (attr.second.type() == tensorflow::DT_INT32) {
-        found_dtype = true;
-      } else {
-        return false;
-      }
-    } else if (attr.first == "value") {
-      if (attr.second.has_tensor() &&
-          attr.second.tensor().int_val_size() == 1 &&
-          attr.second.tensor().int_val(0) == v) {
-        found_value = true;
-      } else {
-        return false;
-      }
-    }
-  }
-  return found_dtype && found_value;
-}
-
-bool IsAddN(const NodeDef& node_def, int n) {
-  if (node_def.op() != "AddN" || node_def.name() != "add" ||
-      node_def.input_size() != n) {
-    return false;
-  }
-  bool found_t = false;
-  bool found_n = false;
-  for (const auto& attr : node_def.attr()) {
-    if (attr.first == "T") {
-      if (attr.second.type() == tensorflow::DT_INT32) {
-        found_t = true;
-      } else {
-        return false;
-      }
-    } else if (attr.first == "N") {
-      if (attr.second.i() == n) {
-        found_n = true;
-      } else {
-        return false;
-      }
-    }
-  }
-  return found_t && found_n;
-}
-
-bool IsNeg(const NodeDef& node_def, const string& input) {
-  return node_def.op() == "Neg" && node_def.name() == "neg" &&
-         node_def.input_size() == 1 && node_def.input(0) == input;
-}
-
-bool GetGraphDef(TF_Graph* graph, GraphDef* graph_def) {
-  TF_Status* s = TF_NewStatus();
-  TF_Buffer* buffer = TF_NewBuffer();
-  TF_GraphToGraphDef(graph, buffer, s);
-  bool ret = TF_GetCode(s) == TF_OK;
-  EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  if (ret) ret = graph_def->ParseFromArray(buffer->data, buffer->length);
-  TF_DeleteBuffer(buffer);
-  TF_DeleteStatus(s);
-  return ret;
-}
-
-bool GetNodeDef(TF_Operation* oper, NodeDef* node_def) {
-  TF_Status* s = TF_NewStatus();
-  TF_Buffer* buffer = TF_NewBuffer();
-  TF_OperationToNodeDef(oper, buffer, s);
-  bool ret = TF_GetCode(s) == TF_OK;
-  EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  if (ret) ret = node_def->ParseFromArray(buffer->data, buffer->length);
-  TF_DeleteBuffer(buffer);
-  TF_DeleteStatus(s);
-  return ret;
-}
-
-bool GetAttrValue(TF_Operation* oper, const char* attr_name,
-                  tensorflow::AttrValue* attr_value, TF_Status* s) {
-  TF_Buffer* buffer = TF_NewBuffer();
-  TF_OperationGetAttrValueProto(oper, attr_name, buffer, s);
-  bool ret = TF_GetCode(s) == TF_OK;
-  if (ret) ret = attr_value->ParseFromArray(buffer->data, buffer->length);
-  TF_DeleteBuffer(buffer);
-  return ret;
-}
-
 TEST(CAPI, SetShape) {
   TF_Status* s = TF_NewStatus();
   TF_Graph* graph = TF_NewGraph();
@@ -879,114 +680,6 @@ TEST(CAPI, ImportGraphDef) {
   TF_DeleteStatus(s);
 }
 
-class CSession {
- public:
-  CSession(TF_Graph* graph, TF_Status* s) {
-    TF_SessionOptions* opts = TF_NewSessionOptions();
-    session_ = TF_NewSession(graph, opts, s);
-    TF_DeleteSessionOptions(opts);
-  }
-
-  explicit CSession(TF_Session* session) : session_(session) {}
-
-  ~CSession() {
-    TF_Status* s = TF_NewStatus();
-    CloseAndDelete(s);
-    EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-    TF_DeleteStatus(s);
-  }
-
-  void SetInputs(std::vector<std::pair<TF_Operation*, TF_Tensor*>> inputs) {
-    DeleteInputValues();
-    inputs_.clear();
-    for (const auto& p : inputs) {
-      inputs_.emplace_back(TF_Output{p.first, 0});
-      input_values_.emplace_back(p.second);
-    }
-  }
-
-  void SetOutputs(std::initializer_list<TF_Operation*> outputs) {
-    ResetOutputValues();
-    outputs_.clear();
-    for (TF_Operation* o : outputs) {
-      outputs_.emplace_back(TF_Output{o, 0});
-    }
-  }
-
-  void SetOutputs(const std::vector<TF_Output>& outputs) {
-    ResetOutputValues();
-    outputs_ = outputs;
-  }
-
-  void SetTargets(std::initializer_list<TF_Operation*> targets) {
-    targets_.clear();
-    for (TF_Operation* t : targets) {
-      targets_.emplace_back(t);
-    }
-  }
-
-  void Run(TF_Status* s) {
-    if (inputs_.size() != input_values_.size()) {
-      ADD_FAILURE() << "Call SetInputs() before Run()";
-      return;
-    }
-    ResetOutputValues();
-    output_values_.resize(outputs_.size(), nullptr);
-
-    const TF_Output* inputs_ptr = inputs_.empty() ? nullptr : &inputs_[0];
-    TF_Tensor* const* input_values_ptr =
-        input_values_.empty() ? nullptr : &input_values_[0];
-
-    const TF_Output* outputs_ptr = outputs_.empty() ? nullptr : &outputs_[0];
-    TF_Tensor** output_values_ptr =
-        output_values_.empty() ? nullptr : &output_values_[0];
-
-    TF_Operation* const* targets_ptr =
-        targets_.empty() ? nullptr : &targets_[0];
-
-    TF_SessionRun(session_, nullptr, inputs_ptr, input_values_ptr,
-                  inputs_.size(), outputs_ptr, output_values_ptr,
-                  outputs_.size(), targets_ptr, targets_.size(), nullptr, s);
-
-    DeleteInputValues();
-  }
-
-  void CloseAndDelete(TF_Status* s) {
-    DeleteInputValues();
-    ResetOutputValues();
-    if (session_ != nullptr) {
-      TF_CloseSession(session_, s);
-      EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-      TF_DeleteSession(session_, s);
-      session_ = nullptr;
-    }
-  }
-
-  TF_Tensor* output_tensor(int i) { return output_values_[i]; }
-
- private:
-  void DeleteInputValues() {
-    for (int i = 0; i < input_values_.size(); ++i) {
-      TF_DeleteTensor(input_values_[i]);
-    }
-    input_values_.clear();
-  }
-
-  void ResetOutputValues() {
-    for (int i = 0; i < output_values_.size(); ++i) {
-      if (output_values_[i] != nullptr) TF_DeleteTensor(output_values_[i]);
-    }
-    output_values_.clear();
-  }
-
-  TF_Session* session_;
-  std::vector<TF_Output> inputs_;
-  std::vector<TF_Tensor*> input_values_;
-  std::vector<TF_Output> outputs_;
-  std::vector<TF_Tensor*> output_values_;
-  std::vector<TF_Operation*> targets_;
-};
-
 TEST(CAPI, Session) {
   TF_Status* s = TF_NewStatus();
   TF_Graph* graph = TF_NewGraph();
@@ -1221,7 +914,7 @@ TEST(CAPI, SavedModel) {
   TF_Operation* input_op =
       TF_GraphOperationByName(graph, input_op_name.c_str());
   ASSERT_TRUE(input_op != nullptr);
-  csession.SetInputs({{input_op, TF_Tensor_EncodeStrings(input)}});
+  csession.SetInputs({{input_op, TF_TensorFromTensor(input)}});
 
   const tensorflow::string output_op_name =
       tensorflow::ParseTensorName(output_name).first.ToString();
@@ -1272,308 +965,6 @@ TEST(CAPI, SavedModelNullArgsAreValid) {
   TF_DeleteStatus(s);
 }
 
-class CApiWhileLoopTest : public ::testing::Test {
- protected:
-  CApiWhileLoopTest() : s_(TF_NewStatus()), graph_(TF_NewGraph()) {}
-
-  ~CApiWhileLoopTest() override {
-    TF_DeleteGraph(graph_);
-    TF_DeleteStatus(s_);
-  }
-
-  void Init(int ninputs) {
-    DCHECK(inputs_.empty());
-    DCHECK_GT(ninputs, 0);
-
-    for (int i = 0; i < ninputs; ++i) {
-      TF_Operation* placeholder = Placeholder(
-          graph_, s_, ::tensorflow::strings::StrCat("p", i).c_str());
-      DCHECK_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-      inputs_.push_back({placeholder, 0});
-    }
-
-    original_graph_description_ = GraphDebugString();
-
-    params_.reset(new TF_WhileParams(
-        TF_NewWhile(graph_, &inputs_[0], inputs_.size(), s_)));
-    ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-    ASSERT_EQ(original_graph_description_, GraphDebugString())
-        << "TF_NewWhile() altered graph";
-
-    params_->name = "test_loop";
-
-    // Initialize outputs_ so we can easily detect errors/bugs
-    outputs_.resize(ninputs, {nullptr, -1});
-  }
-
-  void ExpectOK() {
-    TF_FinishWhile(params_.get(), s_, &outputs_[0]);
-    EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-  }
-
-  void ExpectError(TF_Code expected_code, const string& expected_msg) {
-    TF_FinishWhile(params_.get(), s_, &outputs_[0]);
-    EXPECT_EQ(expected_code, TF_GetCode(s_));
-    EXPECT_EQ(expected_msg, TF_Message(s_));
-    // TODO(skyewm): this assert is currently broken. Fix or remove guarantee.
-    // ASSERT_EQ(original_graph_description_, GraphDebugString()) <<
-    //     "TF_FinishWhile() altered graph on error";
-  }
-
-  void Run(std::initializer_list<int> input_values) {
-    DCHECK_EQ(inputs_.size(), input_values.size());
-    std::vector<std::pair<TF_Operation*, TF_Tensor*>> inputs(inputs_.size());
-    int i = 0;
-    for (int v : input_values) {
-      inputs[i] = {inputs_[i].oper, Int32Tensor(v)};
-      ++i;
-    }
-    csession_.reset(new CSession(graph_, s_));
-    csession_->SetInputs(inputs);
-    csession_->SetOutputs(outputs_);
-    csession_->Run(s_);
-    ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-  }
-
-  void ExpectOutputValue(int idx, int expected_value) {
-    TF_Tensor* out = csession_->output_tensor(idx);
-    ASSERT_TRUE(out != nullptr);
-    EXPECT_EQ(TF_INT32, TF_TensorType(out));
-    EXPECT_EQ(0, TF_NumDims(out));
-    ASSERT_EQ(sizeof(int32), TF_TensorByteSize(out));
-    int32* data = static_cast<int32*>(TF_TensorData(out));
-    EXPECT_EQ(expected_value, *data);
-  }
-
-  // Create a valid conditional graph. Useful for testing unrelated errors.
-  void CreateCondGraph() {
-    TF_Operation* one = ScalarConst(1, params_->cond_graph, s_);
-    TF_Operation* less_than =
-        LessThan(params_->cond_inputs[0], {one, 0}, params_->cond_graph, s_);
-    DCHECK_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-    params_->cond_output = {less_than, 0};
-  }
-
-  string GraphDebugString() const {
-    TF_Buffer* buf = TF_NewBuffer();
-    TF_GraphToGraphDef(graph_, buf, s_);
-    DCHECK_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-    GraphDef def;
-    bool success = def.ParseFromArray(buf->data, buf->length);
-    DCHECK(success);
-    TF_DeleteBuffer(buf);
-    return def.DebugString();
-  }
-
-  TF_Status* s_;
-  TF_Graph* graph_;
-  std::vector<TF_Output> inputs_;   // The inputs to the while loop
-  std::vector<TF_Output> outputs_;  // The final outputs of the while loop
-  std::unique_ptr<TF_WhileParams> params_;
-  std::unique_ptr<CSession> csession_;
-
- private:
-  // Used to verify that errors don't change graph_
-  string original_graph_description_;
-};
-
-TEST_F(CApiWhileLoopTest, BasicLoop) {
-  Init(2);
-
-  // Validate TF_WhileParams returned by TF_NewWhile()
-  EXPECT_TRUE(params_->body_graph != nullptr);
-  EXPECT_TRUE(params_->cond_graph != nullptr);
-
-  EXPECT_EQ(params_->ninputs, 2);
-
-  ASSERT_TRUE(params_->cond_inputs != nullptr);
-  ASSERT_TRUE(params_->cond_inputs[0].oper != nullptr);
-  EXPECT_TRUE(params_->cond_inputs[1].oper != nullptr);
-
-  ASSERT_TRUE(params_->body_inputs != nullptr);
-  EXPECT_TRUE(params_->body_inputs[0].oper != nullptr);
-  EXPECT_TRUE(params_->body_inputs[1].oper != nullptr);
-
-  ASSERT_TRUE(params_->body_outputs != nullptr);
-
-  // Create loop: while (input1 < input2) input1 += input2 + 1
-  TF_Operation* less_than =
-      LessThan(params_->cond_inputs[0], params_->cond_inputs[1],
-               params_->cond_graph, s_);
-  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-  params_->cond_output = {less_than, 0};
-
-  TF_Operation* add1 = Add(params_->body_inputs[0], params_->body_inputs[1],
-                           params_->body_graph, s_, "add1");
-  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-  TF_Operation* one = ScalarConst(1, params_->body_graph, s_);
-  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-  TF_Operation* add2 = Add(add1, one, params_->body_graph, s_, "add2");
-  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-  params_->body_outputs[0] = {add2, 0};
-  params_->body_outputs[1] = params_->body_inputs[1];
-
-  // Finalize while loop
-  ExpectOK();
-
-  // Validate while loop outputs returned by TF_FinishWhile()
-  EXPECT_TRUE(outputs_[0].oper != nullptr);
-  EXPECT_GE(outputs_[0].index, 0);
-  EXPECT_TRUE(outputs_[1].oper != nullptr);
-  EXPECT_GE(outputs_[1].index, 0);
-
-  // Run the graph
-  Run({-9, 2});
-  ExpectOutputValue(0, 3);
-  ExpectOutputValue(1, 2);
-}
-
-TEST_F(CApiWhileLoopTest, NestedLoop) {
-  Init(2);
-  // Create nested loop:
-  //  while (input1 < 6) {
-  //    inner_input1 = input1
-  //    while (inner_input1 < 3) {
-  //      input2 += 1
-  //      inner_input1 += 2
-  //    }
-  //    input1 += input2
-  //  }
-  //
-  // Expected execution with initial values input1 = input2 = 0:
-  //
-  // outer inner               inner_
-  // step# step# input1 input2 input1
-  // ------------------------------------
-  //   0     0     0      0      0
-  //   0     1     0      1      2
-  //   0     2     0      2      4
-  //   0     -     2      2      -
-  //   1     0     2      2      2
-  //   1     1     2      3      4
-  //   1     -     5      3      -
-  //   2     0     5      3      5
-  //   2     -     8      3      -
-
-  // Create outer cond graph
-  TF_Operation* six = ScalarConst(6, params_->cond_graph, s_);
-  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-  TF_Operation* less_than =
-      LessThan(params_->cond_inputs[0], {six, 0}, params_->cond_graph, s_);
-  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-  params_->cond_output = {less_than, 0};
-
-  // Create outer body graph
-  // Init inner graph
-  TF_Output inner_inputs[] = {params_->body_inputs[0], params_->body_inputs[1]};
-  TF_WhileParams inner_params =
-      TF_NewWhile(params_->body_graph, inner_inputs, 2, s_);
-  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-  inner_params.name = "inner_loop";
-
-  // Create inner cond graph
-  TF_Operation* three = ScalarConst(3, inner_params.cond_graph, s_);
-  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-  TF_Operation* inner_less_than = LessThan(
-      inner_params.cond_inputs[0], {three, 0}, inner_params.cond_graph, s_);
-  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-  inner_params.cond_output = {inner_less_than, 0};
-
-  // Create inner body graph
-  TF_Operation* one = ScalarConst(1, inner_params.body_graph, s_, "one");
-  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-  TF_Operation* two = ScalarConst(2, inner_params.body_graph, s_, "two");
-  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-
-  TF_Operation* input2_add =
-      Add(inner_params.body_inputs[1].oper, one, inner_params.body_graph, s_);
-  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-  inner_params.body_outputs[1] = {input2_add, 0};
-
-  TF_Operation* inner_input1_add = Add(inner_params.body_inputs[0].oper, two,
-                                       inner_params.body_graph, s_, "add2");
-  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-  inner_params.body_outputs[0] = {inner_input1_add, 0};
-
-  // Finalize inner graph
-  TF_Output inner_outputs[2] = {{nullptr, -1}};
-  TF_FinishWhile(&inner_params, s_, inner_outputs);
-  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-
-  TF_Operation* input1_add =
-      Add(params_->body_inputs[0], inner_outputs[1], params_->body_graph, s_);
-  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-  params_->body_outputs[0] = {input1_add, 0};
-
-  params_->body_outputs[1] = inner_outputs[1];
-
-  // Finalize outer graph
-  ExpectOK();
-
-  // Check for a few expected nodes
-  const char* node_name = "test_loop/cond/scalar";
-  EXPECT_TRUE(TF_GraphOperationByName(graph_, node_name) != nullptr);
-  node_name = "test_loop/body/add";
-  EXPECT_TRUE(TF_GraphOperationByName(graph_, node_name) != nullptr);
-  node_name = "test_loop/body/inner_loop/body/one";
-  EXPECT_TRUE(TF_GraphOperationByName(graph_, node_name) != nullptr);
-  node_name = "test_loop/body/inner_loop/cond/less_than";
-  EXPECT_TRUE(TF_GraphOperationByName(graph_, node_name) != nullptr);
-
-  // Run the graph
-  Run({0, 0});
-  ExpectOutputValue(0, 8);
-  ExpectOutputValue(1, 3);
-}
-
-TEST_F(CApiWhileLoopTest, BadCondOutput) {
-  Init(1);
-  params_->body_outputs[0] = params_->body_inputs[0];
-  ExpectError(TF_INVALID_ARGUMENT,
-              "TF_WhileParams `cond_output` field isn't set");
-}
-
-TEST_F(CApiWhileLoopTest, BadBodyOutput) {
-  Init(1);
-  CreateCondGraph();
-  ExpectError(TF_INVALID_ARGUMENT,
-              "TF_WhileParams `body_outputs[0]` field isn't set");
-}
-
-TEST_F(CApiWhileLoopTest, NullName) {
-  Init(1);
-  CreateCondGraph();
-  params_->body_outputs[0] = params_->body_inputs[0];
-  params_->name = nullptr;
-  ExpectError(TF_INVALID_ARGUMENT, "TF_WhileParams `name` field is null");
-}
-
-TEST_F(CApiWhileLoopTest, WrongGraph) {
-  Init(1);
-  CreateCondGraph();
-  // Set body output to output from outer graph
-  params_->body_outputs[0] = inputs_[0];
-  // TODO(skyewm): improve error message
-  ExpectError(TF_INVALID_ARGUMENT,
-              "Requested return node 'p0' not found in graph def");
-}
-
-TEST_F(CApiWhileLoopTest, BadTypes) {
-  Init(1);
-  CreateCondGraph();
-  // Op that has a float input + output
-  TF_OperationDescription* desc = TF_NewOperation(
-      params_->body_graph, "FakeQuantWithMinMaxArgs", "float_op");
-  TF_AddInput(desc, params_->body_inputs[0]);
-  TF_FinishOperation(desc, s_);
-  ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_));
-  string msg(TF_Message(s_));
-  EXPECT_NE(msg.find("Input 'inputs' passed int32 expected float while "
-                     "building NodeDef 'float_op'"),
-            msg.npos);
-  TF_AbortWhile(params_.get());
-}
-
 REGISTER_OP("TestOpWithNoGradient")
     .Input("x: T")
     .Output("y: T")
@@ -1765,13 +1156,13 @@ class CApiGradientsTest : public ::testing::Test {
       const float const3_val[] = {1.0, 1.0, 1.0, 1.0};
       const3 = FloatConst2x2(expected_graph_, s_, const3_val, "GradInputs");
     } else {
-      const3 = OnesLike(expected_graph_, s_, matmul, "OnesLike");
+      const3 = OnesLike(expected_graph_, s_, matmul, "gradients/OnesLike");
     }
 
-    TF_Operation* matmul1 =
-        MatMul(expected_graph_, s_, const3, const1, "MatMul_1", false, true);
-    TF_Operation* matmul2 =
-        MatMul(expected_graph_, s_, const0, const3, "MatMul_2", true, false);
+    TF_Operation* matmul1 = MatMul(expected_graph_, s_, const3, const1,
+                                   "gradients/MatMul", false, true);
+    TF_Operation* matmul2 = MatMul(expected_graph_, s_, const0, const3,
+                                   "gradients/MatMul_1", true, false);
     expected_grad_outputs[0] = {matmul1, 0};
     expected_grad_outputs[1] = {matmul2, 0};
   }
@@ -2241,6 +1632,39 @@ TEST_F(CApiAttributesTest, Tensor) {
   TF_DeleteTensor(value);
 }
 
+TEST_F(CApiAttributesTest, StringTensor) {
+  // Create the string-Tensor "atttribute" value.
+  char encoded[] = {
+      0,   0, 0, 0, 0, 0, 0, 0,  // array[uint64] offsets
+      1,                         // varint encoded string length
+      'A',
+  };
+  auto deallocator = [](void* data, size_t len, void* arg) {};
+  unique_tensor_ptr t_in(TF_NewTensor(TF_STRING, nullptr, 0, &encoded[0],
+                                      sizeof(encoded), deallocator, nullptr),
+                         TF_DeleteTensor);
+
+  // Create a TF_Operation with the attribute t_in
+  auto desc = init("tensor");
+  TF_SetAttrTensor(desc, "v", t_in.get(), s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  auto oper = TF_FinishOperation(desc, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  // Fetch the attribute back.
+  EXPECT_TF_META("v", -1, TF_ATTR_TENSOR, -1);
+  TF_Tensor* t_out = nullptr;
+  TF_OperationGetAttrTensor(oper, "v", &t_out, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  EXPECT_EQ(TF_STRING, TF_TensorType(t_out));
+  EXPECT_EQ(0, TF_NumDims(t_out));
+  ASSERT_EQ(TF_TensorByteSize(t_in.get()), TF_TensorByteSize(t_out));
+  EXPECT_EQ(0, memcmp(TF_TensorData(t_in.get()), TF_TensorData(t_out),
+                      TF_TensorByteSize(t_out)));
+  TF_DeleteTensor(t_out);
+}
+
 TEST_F(CApiAttributesTest, TensorList) {
   const char tensor1[] = {5, 7};
   const int64_t dims1[] = {1, 2};
@@ -2252,7 +1676,8 @@ TEST_F(CApiAttributesTest, TensorList) {
 
   auto desc = init("list(tensor)");
   TF_Tensor* tmp[] = {
-      Int8Tensor(dims1, ndims1, tensor1), Int8Tensor(dims2, ndims2, tensor2),
+      Int8Tensor(dims1, ndims1, tensor1),
+      Int8Tensor(dims2, ndims2, tensor2),
   };
   TF_SetAttrTensorList(desc, "v", tmp, TF_ARRAYSIZE(tmp), s_);
   for (int i = 0; i < TF_ARRAYSIZE(tmp); ++i) {
@@ -2304,12 +1729,14 @@ TEST_F(CApiAttributesTest, Errors) {
   TF_OperationGetAttrString(oper, "v", nullptr, 0, s_);
   EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)) << TF_Message(s_);
 }
+
 #undef EXPECT_TF_META
 
+}  // namespace
+}  // namespace tensorflow
+
 // TODO(josh11b): Test:
 // * TF_SetDevice(desc, "/job:worker");
 // * control inputs / outputs
 // * targets
 // * TF_DeleteGraph() before TF_DeleteSession()
-
-}  // namespace
diff --git a/tensorflow/c/c_test_util.cc b/tensorflow/c/c_test_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..21603c1a07caf9e9fdcd53561a94fdf7756ec84d
--- /dev/null
+++ b/tensorflow/c/c_test_util.cc
@@ -0,0 +1,304 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/c_test_util.h"
+
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/platform/logging.h"
+
+using tensorflow::GraphDef;
+using tensorflow::NodeDef;
+
+static void Int32Deallocator(void* data, size_t, void* arg) {
+  delete[] static_cast<int32_t*>(data);
+}
+
+TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, const char* values) {
+  int64_t num_values = 1;
+  for (int i = 0; i < num_dims; ++i) {
+    num_values *= dims[i];
+  }
+  TF_Tensor* t =
+      TF_AllocateTensor(TF_INT8, dims, num_dims, sizeof(char) * num_values);
+  memcpy(TF_TensorData(t), values, sizeof(char) * num_values);
+  return t;
+}
+
+TF_Tensor* Int32Tensor(int32_t v) {
+  const int num_bytes = sizeof(int32_t);
+  int32_t* values = new int32_t[1];
+  values[0] = v;
+  return TF_NewTensor(TF_INT32, nullptr, 0, values, num_bytes,
+                      &Int32Deallocator, nullptr);
+}
+
+TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s, const char* name) {
+  TF_OperationDescription* desc = TF_NewOperation(graph, "Placeholder", name);
+  TF_SetAttrType(desc, "dtype", TF_INT32);
+  return TF_FinishOperation(desc, s);
+}
+
+TF_Operation* Const(TF_Tensor* t, TF_Graph* graph, TF_Status* s,
+                    const char* name) {
+  TF_OperationDescription* desc = TF_NewOperation(graph, "Const", name);
+  TF_SetAttrTensor(desc, "value", t, s);
+  if (TF_GetCode(s) != TF_OK) return nullptr;
+  TF_SetAttrType(desc, "dtype", TF_TensorType(t));
+  return TF_FinishOperation(desc, s);
+}
+
+TF_Operation* ScalarConst(int32_t v, TF_Graph* graph, TF_Status* s,
+                          const char* name) {
+  unique_tensor_ptr tensor(Int32Tensor(v), TF_DeleteTensor);
+  return Const(tensor.get(), graph, s, name);
+}
+
+TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
+                  TF_Status* s, const char* name) {
+  TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name);
+  TF_Output add_inputs[2] = {{l, 0}, {r, 0}};
+  TF_AddInputList(desc, add_inputs, 2);
+  return TF_FinishOperation(desc, s);
+}
+
+TF_Operation* Add(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s,
+                  const char* name) {
+  TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name);
+  TF_Output inputs[2] = {l, r};
+  TF_AddInputList(desc, inputs, 2);
+  return TF_FinishOperation(desc, s);
+}
+
+TF_Operation* Neg(TF_Operation* n, TF_Graph* graph, TF_Status* s) {
+  TF_OperationDescription* desc = TF_NewOperation(graph, "Neg", "neg");
+  TF_Output neg_input = {n, 0};
+  TF_AddInput(desc, neg_input);
+  return TF_FinishOperation(desc, s);
+}
+
+TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph,
+                       TF_Status* s) {
+  TF_OperationDescription* desc = TF_NewOperation(graph, "Less", "less_than");
+  TF_AddInput(desc, l);
+  TF_AddInput(desc, r);
+  return TF_FinishOperation(desc, s);
+}
+
+bool IsPlaceholder(const tensorflow::NodeDef& node_def) {
+  if (node_def.op() != "Placeholder" || node_def.name() != "feed") {
+    return false;
+  }
+  bool found_dtype = false;
+  bool found_shape = false;
+  for (const auto& attr : node_def.attr()) {
+    if (attr.first == "dtype") {
+      if (attr.second.type() == tensorflow::DT_INT32) {
+        found_dtype = true;
+      } else {
+        return false;
+      }
+    } else if (attr.first == "shape") {
+      found_shape = true;
+    }
+  }
+  return found_dtype && found_shape;
+}
+
+bool IsScalarConst(const tensorflow::NodeDef& node_def, int v) {
+  if (node_def.op() != "Const" || node_def.name() != "scalar") {
+    return false;
+  }
+  bool found_dtype = false;
+  bool found_value = false;
+  for (const auto& attr : node_def.attr()) {
+    if (attr.first == "dtype") {
+      if (attr.second.type() == tensorflow::DT_INT32) {
+        found_dtype = true;
+      } else {
+        return false;
+      }
+    } else if (attr.first == "value") {
+      if (attr.second.has_tensor() &&
+          attr.second.tensor().int_val_size() == 1 &&
+          attr.second.tensor().int_val(0) == v) {
+        found_value = true;
+      } else {
+        return false;
+      }
+    }
+  }
+  return found_dtype && found_value;
+}
+
+bool IsAddN(const tensorflow::NodeDef& node_def, int n) {
+  if (node_def.op() != "AddN" || node_def.name() != "add" ||
+      node_def.input_size() != n) {
+    return false;
+  }
+  bool found_t = false;
+  bool found_n = false;
+  for (const auto& attr : node_def.attr()) {
+    if (attr.first == "T") {
+      if (attr.second.type() == tensorflow::DT_INT32) {
+        found_t = true;
+      } else {
+        return false;
+      }
+    } else if (attr.first == "N") {
+      if (attr.second.i() == n) {
+        found_n = true;
+      } else {
+        return false;
+      }
+    }
+  }
+  return found_t && found_n;
+}
+
+bool IsNeg(const tensorflow::NodeDef& node_def, const string& input) {
+  return node_def.op() == "Neg" && node_def.name() == "neg" &&
+         node_def.input_size() == 1 && node_def.input(0) == input;
+}
+
+bool GetGraphDef(TF_Graph* graph, tensorflow::GraphDef* graph_def) {
+  TF_Status* s = TF_NewStatus();
+  TF_Buffer* buffer = TF_NewBuffer();
+  TF_GraphToGraphDef(graph, buffer, s);
+  bool ret = TF_GetCode(s) == TF_OK;
+  EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  if (ret) ret = graph_def->ParseFromArray(buffer->data, buffer->length);
+  TF_DeleteBuffer(buffer);
+  TF_DeleteStatus(s);
+  return ret;
+}
+
+bool GetNodeDef(TF_Operation* oper, tensorflow::NodeDef* node_def) {
+  TF_Status* s = TF_NewStatus();
+  TF_Buffer* buffer = TF_NewBuffer();
+  TF_OperationToNodeDef(oper, buffer, s);
+  bool ret = TF_GetCode(s) == TF_OK;
+  EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  if (ret) ret = node_def->ParseFromArray(buffer->data, buffer->length);
+  TF_DeleteBuffer(buffer);
+  TF_DeleteStatus(s);
+  return ret;
+}
+
+bool GetAttrValue(TF_Operation* oper, const char* attr_name,
+                  tensorflow::AttrValue* attr_value, TF_Status* s) {
+  TF_Buffer* buffer = TF_NewBuffer();
+  TF_OperationGetAttrValueProto(oper, attr_name, buffer, s);
+  bool ret = TF_GetCode(s) == TF_OK;
+  if (ret) ret = attr_value->ParseFromArray(buffer->data, buffer->length);
+  TF_DeleteBuffer(buffer);
+  return ret;
+}
+
+CSession::CSession(TF_Graph* graph, TF_Status* s) {
+  TF_SessionOptions* opts = TF_NewSessionOptions();
+  session_ = TF_NewSession(graph, opts, s);
+  TF_DeleteSessionOptions(opts);
+}
+
+CSession::CSession(TF_Session* session) : session_(session) {}
+
+CSession::~CSession() {
+  TF_Status* s = TF_NewStatus();
+  CloseAndDelete(s);
+  EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_DeleteStatus(s);
+}
+
+void CSession::SetInputs(
+    std::vector<std::pair<TF_Operation*, TF_Tensor*>> inputs) {
+  DeleteInputValues();
+  inputs_.clear();
+  for (const auto& p : inputs) {
+    inputs_.emplace_back(TF_Output{p.first, 0});
+    input_values_.emplace_back(p.second);
+  }
+}
+
+void CSession::SetOutputs(std::initializer_list<TF_Operation*> outputs) {
+  ResetOutputValues();
+  outputs_.clear();
+  for (TF_Operation* o : outputs) {
+    outputs_.emplace_back(TF_Output{o, 0});
+  }
+  output_values_.resize(outputs_.size());
+}
+
+void CSession::SetOutputs(const std::vector<TF_Output>& outputs) {
+  ResetOutputValues();
+  outputs_ = outputs;
+  output_values_.resize(outputs_.size());
+}
+
+void CSession::SetTargets(std::initializer_list<TF_Operation*> targets) {
+  targets_.clear();
+  for (TF_Operation* t : targets) {
+    targets_.emplace_back(t);
+  }
+}
+
+void CSession::Run(TF_Status* s) {
+  if (inputs_.size() != input_values_.size()) {
+    ADD_FAILURE() << "Call SetInputs() before Run()";
+    return;
+  }
+  ResetOutputValues();
+  output_values_.resize(outputs_.size(), nullptr);
+
+  const TF_Output* inputs_ptr = inputs_.empty() ? nullptr : &inputs_[0];
+  TF_Tensor* const* input_values_ptr =
+      input_values_.empty() ? nullptr : &input_values_[0];
+
+  const TF_Output* outputs_ptr = outputs_.empty() ? nullptr : &outputs_[0];
+  TF_Tensor** output_values_ptr =
+      output_values_.empty() ? nullptr : &output_values_[0];
+
+  TF_Operation* const* targets_ptr = targets_.empty() ? nullptr : &targets_[0];
+
+  TF_SessionRun(session_, nullptr, inputs_ptr, input_values_ptr, inputs_.size(),
+                outputs_ptr, output_values_ptr, outputs_.size(), targets_ptr,
+                targets_.size(), nullptr, s);
+
+  DeleteInputValues();
+}
+
+void CSession::CloseAndDelete(TF_Status* s) {
+  DeleteInputValues();
+  ResetOutputValues();
+  if (session_ != nullptr) {
+    TF_CloseSession(session_, s);
+    EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+    TF_DeleteSession(session_, s);
+    session_ = nullptr;
+  }
+}
+
+void CSession::DeleteInputValues() {
+  for (size_t i = 0; i < input_values_.size(); ++i) {
+    TF_DeleteTensor(input_values_[i]);
+  }
+  input_values_.clear();
+}
+
+void CSession::ResetOutputValues() {
+  for (size_t i = 0; i < output_values_.size(); ++i) {
+    if (output_values_[i] != nullptr) TF_DeleteTensor(output_values_[i]);
+  }
+  output_values_.clear();
+}
diff --git a/tensorflow/c/c_test_util.h b/tensorflow/c/c_test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c0ba667bd0c3014efc6f0bd48ad0e63ccf4ee6e
--- /dev/null
+++ b/tensorflow/c/c_test_util.h
@@ -0,0 +1,102 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_C_C_TEST_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_C_C_TEST_UTIL_H_
+
+#include "tensorflow/c/c_api.h"
+
+#include <vector>
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+using ::tensorflow::string;
+
+typedef std::unique_ptr<TF_Tensor, decltype(&TF_DeleteTensor)>
+    unique_tensor_ptr;
+
+// Create a tensor with values of type TF_INT8 provided by `values`.
+TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, const char* values);
+
+TF_Tensor* Int32Tensor(int32_t v);
+
+TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s,
+                          const char* name = "feed");
+
+TF_Operation* Const(TF_Tensor* t, TF_Graph* graph, TF_Status* s,
+                    const char* name = "const");
+
+TF_Operation* ScalarConst(int32_t v, TF_Graph* graph, TF_Status* s,
+                          const char* name = "scalar");
+
+TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
+                  TF_Status* s, const char* name = "add");
+
+TF_Operation* Add(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s,
+                  const char* name = "add");
+
+TF_Operation* Neg(TF_Operation* n, TF_Graph* graph, TF_Status* s);
+
+TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s);
+
+bool IsPlaceholder(const tensorflow::NodeDef& node_def);
+
+bool IsScalarConst(const tensorflow::NodeDef& node_def, int v);
+
+bool IsAddN(const tensorflow::NodeDef& node_def, int n);
+
+bool IsNeg(const tensorflow::NodeDef& node_def, const string& input);
+
+bool GetGraphDef(TF_Graph* graph, tensorflow::GraphDef* graph_def);
+
+bool GetNodeDef(TF_Operation* oper, tensorflow::NodeDef* node_def);
+
+bool GetAttrValue(TF_Operation* oper, const char* attr_name,
+                  tensorflow::AttrValue* attr_value, TF_Status* s);
+
+class CSession {
+ public:
+  CSession(TF_Graph* graph, TF_Status* s);
+  explicit CSession(TF_Session* session);
+
+  ~CSession();
+
+  void SetInputs(std::vector<std::pair<TF_Operation*, TF_Tensor*>> inputs);
+  void SetOutputs(std::initializer_list<TF_Operation*> outputs);
+  void SetOutputs(const std::vector<TF_Output>& outputs);
+  void SetTargets(std::initializer_list<TF_Operation*> targets);
+
+  void Run(TF_Status* s);
+
+  void CloseAndDelete(TF_Status* s);
+
+  TF_Tensor* output_tensor(int i) { return output_values_[i]; }
+
+ private:
+  void DeleteInputValues();
+  void ResetOutputValues();
+
+  TF_Session* session_;
+  std::vector<TF_Output> inputs_;
+  std::vector<TF_Tensor*> input_values_;
+  std::vector<TF_Output> outputs_;
+  std::vector<TF_Tensor*> output_values_;
+  std::vector<TF_Operation*> targets_;
+};
+
+#endif  // THIRD_PARTY_TENSORFLOW_C_C_TEST_UTIL_H_
diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc
new file mode 100644
index 0000000000000000000000000000000000000000..adca6c762526a85f015560efb22d3de185e2ae6c
--- /dev/null
+++ b/tensorflow/c/python_api.cc
@@ -0,0 +1,33 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/python_api.h"
+
+#include "tensorflow/c/c_api_internal.h"
+
+namespace tensorflow {
+
+void AddControlInput(TF_Graph* graph, TF_Operation* op, TF_Operation* input) {
+  // TODO(skyewm): make sure cycles are prevented
+  mutex_lock l(graph->mu);
+  graph->graph.AddControlEdge(&input->node, &op->node);
+}
+
+void SetRequestedDevice(TF_Graph* graph, TF_Operation* op, const char* device) {
+  mutex_lock l(graph->mu);
+  op->node.set_requested_device(device);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/c/python_api.h b/tensorflow/c/python_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1a55d7755a76c778bf6a8120a8cf81adb6941dc
--- /dev/null
+++ b/tensorflow/c/python_api.h
@@ -0,0 +1,32 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_C_PYTHON_API_H_
+#define THIRD_PARTY_TENSORFLOW_C_PYTHON_API_H_
+
+#include "tensorflow/c/c_api.h"
+
+// These functions can be removed without notice. They exist to facilitate some
+// refactoring of graph construction code in the Python API.
+
+namespace tensorflow {
+
+void AddControlInput(TF_Graph* graph, TF_Operation* op, TF_Operation* input);
+
+void SetRequestedDevice(TF_Graph* graph, TF_Operation* op, const char* device);
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_C_PYTHON_API_H_
diff --git a/tensorflow/c/while_loop_test.cc b/tensorflow/c/while_loop_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7f7e36226903411dc46c958fe1a096277d653ba4
--- /dev/null
+++ b/tensorflow/c/while_loop_test.cc
@@ -0,0 +1,329 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/c_api.h"
+
+#include "tensorflow/c/c_test_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+
+using tensorflow::GraphDef;
+
+namespace {
+
+class CApiWhileLoopTest : public ::testing::Test {
+ protected:
+  CApiWhileLoopTest() : s_(TF_NewStatus()), graph_(TF_NewGraph()) {}
+
+  ~CApiWhileLoopTest() override {
+    TF_DeleteGraph(graph_);
+    TF_DeleteStatus(s_);
+  }
+
+  void Init(int ninputs) {
+    DCHECK(inputs_.empty());
+    DCHECK_GT(ninputs, 0);
+
+    for (int i = 0; i < ninputs; ++i) {
+      TF_Operation* placeholder = Placeholder(
+          graph_, s_, ::tensorflow::strings::StrCat("p", i).c_str());
+      DCHECK_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+      inputs_.push_back({placeholder, 0});
+    }
+
+    original_graph_description_ = GraphDebugString();
+
+    params_.reset(new TF_WhileParams(
+        TF_NewWhile(graph_, &inputs_[0], inputs_.size(), s_)));
+    ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+    ASSERT_EQ(original_graph_description_, GraphDebugString())
+        << "TF_NewWhile() altered graph";
+
+    params_->name = "test_loop";
+
+    // Initialize outputs_ so we can easily detect errors/bugs
+    outputs_.resize(ninputs, {nullptr, -1});
+  }
+
+  void ExpectOK() {
+    TF_FinishWhile(params_.get(), s_, &outputs_[0]);
+    EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  }
+
+  void ExpectError(TF_Code expected_code, const string& expected_msg) {
+    TF_FinishWhile(params_.get(), s_, &outputs_[0]);
+    EXPECT_EQ(expected_code, TF_GetCode(s_));
+    EXPECT_EQ(expected_msg, TF_Message(s_));
+    // TODO(skyewm): this assert is currently broken. Fix or remove guarantee.
+    // ASSERT_EQ(original_graph_description_, GraphDebugString()) <<
+    //     "TF_FinishWhile() altered graph on error";
+  }
+
+  void Run(std::initializer_list<int> input_values) {
+    DCHECK_EQ(inputs_.size(), input_values.size());
+    std::vector<std::pair<TF_Operation*, TF_Tensor*>> inputs(inputs_.size());
+    int i = 0;
+    for (int v : input_values) {
+      inputs[i] = {inputs_[i].oper, Int32Tensor(v)};
+      ++i;
+    }
+    csession_.reset(new CSession(graph_, s_));
+    csession_->SetInputs(inputs);
+    csession_->SetOutputs(outputs_);
+    csession_->Run(s_);
+    ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  }
+
+  void ExpectOutputValue(int idx, int expected_value) {
+    TF_Tensor* out = csession_->output_tensor(idx);
+    ASSERT_TRUE(out != nullptr);
+    EXPECT_EQ(TF_INT32, TF_TensorType(out));
+    EXPECT_EQ(0, TF_NumDims(out));
+    ASSERT_EQ(sizeof(int32_t), TF_TensorByteSize(out));
+    int32_t* data = static_cast<int32_t*>(TF_TensorData(out));
+    EXPECT_EQ(expected_value, *data);
+  }
+
+  // Create a valid conditional graph. Useful for testing unrelated errors.
+  void CreateCondGraph() {
+    TF_Operation* one = ScalarConst(1, params_->cond_graph, s_);
+    TF_Operation* less_than =
+        LessThan(params_->cond_inputs[0], {one, 0}, params_->cond_graph, s_);
+    DCHECK_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+    params_->cond_output = {less_than, 0};
+  }
+
+  string GraphDebugString() const {
+    TF_Buffer* buf = TF_NewBuffer();
+    TF_GraphToGraphDef(graph_, buf, s_);
+    DCHECK_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+    GraphDef def;
+    bool success = def.ParseFromArray(buf->data, buf->length);
+    DCHECK(success);
+    TF_DeleteBuffer(buf);
+    return def.DebugString();
+  }
+
+  TF_Status* s_;
+  TF_Graph* graph_;
+  std::vector<TF_Output> inputs_;   // The inputs to the while loop
+  std::vector<TF_Output> outputs_;  // The final outputs of the while loop
+  std::unique_ptr<TF_WhileParams> params_;
+  std::unique_ptr<CSession> csession_;
+
+ private:
+  // Used to verify that errors don't change graph_
+  string original_graph_description_;
+};
+
+TEST_F(CApiWhileLoopTest, BasicLoop) {
+  Init(2);
+
+  // Validate TF_WhileParams returned by TF_NewWhile()
+  EXPECT_TRUE(params_->body_graph != nullptr);
+  EXPECT_TRUE(params_->cond_graph != nullptr);
+
+  EXPECT_EQ(params_->ninputs, 2);
+
+  ASSERT_TRUE(params_->cond_inputs != nullptr);
+  ASSERT_TRUE(params_->cond_inputs[0].oper != nullptr);
+  EXPECT_TRUE(params_->cond_inputs[1].oper != nullptr);
+
+  ASSERT_TRUE(params_->body_inputs != nullptr);
+  EXPECT_TRUE(params_->body_inputs[0].oper != nullptr);
+  EXPECT_TRUE(params_->body_inputs[1].oper != nullptr);
+
+  ASSERT_TRUE(params_->body_outputs != nullptr);
+
+  // Create loop: while (input1 < input2) input1 += input2 + 1
+  TF_Operation* less_than =
+      LessThan(params_->cond_inputs[0], params_->cond_inputs[1],
+               params_->cond_graph, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  params_->cond_output = {less_than, 0};
+
+  TF_Operation* add1 = Add(params_->body_inputs[0], params_->body_inputs[1],
+                           params_->body_graph, s_, "add1");
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  TF_Operation* one = ScalarConst(1, params_->body_graph, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  TF_Operation* add2 = Add(add1, one, params_->body_graph, s_, "add2");
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  params_->body_outputs[0] = {add2, 0};
+  params_->body_outputs[1] = params_->body_inputs[1];
+
+  // Finalize while loop
+  ExpectOK();
+
+  // Validate while loop outputs returned by TF_FinishWhile()
+  EXPECT_TRUE(outputs_[0].oper != nullptr);
+  EXPECT_GE(outputs_[0].index, 0);
+  EXPECT_TRUE(outputs_[1].oper != nullptr);
+  EXPECT_GE(outputs_[1].index, 0);
+
+  // Run the graph
+  Run({-9, 2});
+  ExpectOutputValue(0, 3);
+  ExpectOutputValue(1, 2);
+}
+
+TEST_F(CApiWhileLoopTest, NestedLoop) {
+  Init(2);
+  // Create nested loop:
+  //  while (input1 < 6) {
+  //    inner_input1 = input1
+  //    while (inner_input1 < 3) {
+  //      input2 += 1
+  //      inner_input1 += 2
+  //    }
+  //    input1 += input2
+  //  }
+  //
+  // Expected execution with initial values input1 = input2 = 0:
+  //
+  // outer inner               inner_
+  // step# step# input1 input2 input1
+  // ------------------------------------
+  //   0     0     0      0      0
+  //   0     1     0      1      2
+  //   0     2     0      2      4
+  //   0     -     2      2      -
+  //   1     0     2      2      2
+  //   1     1     2      3      4
+  //   1     -     5      3      -
+  //   2     0     5      3      5
+  //   2     -     8      3      -
+
+  // Create outer cond graph
+  TF_Operation* six = ScalarConst(6, params_->cond_graph, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  TF_Operation* less_than =
+      LessThan(params_->cond_inputs[0], {six, 0}, params_->cond_graph, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  params_->cond_output = {less_than, 0};
+
+  // Create outer body graph
+  // Init inner graph
+  TF_Output inner_inputs[] = {params_->body_inputs[0], params_->body_inputs[1]};
+  TF_WhileParams inner_params =
+      TF_NewWhile(params_->body_graph, inner_inputs, 2, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  inner_params.name = "inner_loop";
+
+  // Create inner cond graph
+  TF_Operation* three = ScalarConst(3, inner_params.cond_graph, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  TF_Operation* inner_less_than = LessThan(
+      inner_params.cond_inputs[0], {three, 0}, inner_params.cond_graph, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  inner_params.cond_output = {inner_less_than, 0};
+
+  // Create inner body graph
+  TF_Operation* one = ScalarConst(1, inner_params.body_graph, s_, "one");
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  TF_Operation* two = ScalarConst(2, inner_params.body_graph, s_, "two");
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  TF_Operation* input2_add =
+      Add(inner_params.body_inputs[1].oper, one, inner_params.body_graph, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  inner_params.body_outputs[1] = {input2_add, 0};
+
+  TF_Operation* inner_input1_add = Add(inner_params.body_inputs[0].oper, two,
+                                       inner_params.body_graph, s_, "add2");
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  inner_params.body_outputs[0] = {inner_input1_add, 0};
+
+  // Finalize inner graph
+  TF_Output inner_outputs[2] = {{nullptr, -1}};
+  TF_FinishWhile(&inner_params, s_, inner_outputs);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  TF_Operation* input1_add =
+      Add(params_->body_inputs[0], inner_outputs[1], params_->body_graph, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  params_->body_outputs[0] = {input1_add, 0};
+
+  params_->body_outputs[1] = inner_outputs[1];
+
+  // Finalize outer graph
+  ExpectOK();
+
+  // Check for a few expected nodes
+  const char* node_name = "test_loop/cond/scalar";
+  EXPECT_TRUE(TF_GraphOperationByName(graph_, node_name) != nullptr);
+  node_name = "test_loop/body/add";
+  EXPECT_TRUE(TF_GraphOperationByName(graph_, node_name) != nullptr);
+  node_name = "test_loop/body/inner_loop/body/one";
+  EXPECT_TRUE(TF_GraphOperationByName(graph_, node_name) != nullptr);
+  node_name = "test_loop/body/inner_loop/cond/less_than";
+  EXPECT_TRUE(TF_GraphOperationByName(graph_, node_name) != nullptr);
+
+  // Run the graph
+  Run({0, 0});
+  ExpectOutputValue(0, 8);
+  ExpectOutputValue(1, 3);
+}
+
+TEST_F(CApiWhileLoopTest, BadCondOutput) {
+  Init(1);
+  params_->body_outputs[0] = params_->body_inputs[0];
+  ExpectError(TF_INVALID_ARGUMENT,
+              "TF_WhileParams `cond_output` field isn't set");
+}
+
+TEST_F(CApiWhileLoopTest, BadBodyOutput) {
+  Init(1);
+  CreateCondGraph();
+  ExpectError(TF_INVALID_ARGUMENT,
+              "TF_WhileParams `body_outputs[0]` field isn't set");
+}
+
+TEST_F(CApiWhileLoopTest, NullName) {
+  Init(1);
+  CreateCondGraph();
+  params_->body_outputs[0] = params_->body_inputs[0];
+  params_->name = nullptr;
+  ExpectError(TF_INVALID_ARGUMENT, "TF_WhileParams `name` field is null");
+}
+
+TEST_F(CApiWhileLoopTest, WrongGraph) {
+  Init(1);
+  CreateCondGraph();
+  // Set body output to output from outer graph
+  params_->body_outputs[0] = inputs_[0];
+  // TODO(skyewm): improve error message
+  ExpectError(TF_INVALID_ARGUMENT,
+              "Requested return node 'p0' not found in graph def");
+}
+
+TEST_F(CApiWhileLoopTest, BadTypes) {
+  Init(1);
+  CreateCondGraph();
+  // Op that has a float input + output
+  TF_OperationDescription* desc = TF_NewOperation(
+      params_->body_graph, "FakeQuantWithMinMaxArgs", "float_op");
+  TF_AddInput(desc, params_->body_inputs[0]);
+  TF_FinishOperation(desc, s_);
+  ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_));
+  string msg(TF_Message(s_));
+  EXPECT_NE(msg.find("Input 'inputs' passed int32 expected float while "
+                     "building NodeDef 'float_op'"),
+            msg.npos);
+  TF_AbortWhile(params_.get());
+}
+
+}  // namespace
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index b86731b1183fd59875d470aa48ba12c269789f0f..c65170dfe85b847259f7c59d437f62aa32ce1178 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -45,6 +45,7 @@ tf_cc_test(
         "//tensorflow/core:all_kernels",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -61,7 +62,6 @@ cc_library(
         ":gradients",
         ":ops",
         ":scope",
-        "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -248,6 +248,7 @@ cc_library(
         ":gradients",
         "//tensorflow/core:lib_proto_parsing",
     ],
+    alwayslink = 1,
 )
 
 tf_cc_test(
@@ -274,11 +275,8 @@ cc_library(
     deps = [
         ":cc_ops",
         ":grad_op_registry",
-        ":ops",
-        ":scope",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
     ],
+    alwayslink = 1,
 )
 
 tf_cc_test(
@@ -305,11 +303,8 @@ cc_library(
         ":cc_ops",
         ":cc_ops_internal",
         ":grad_op_registry",
-        ":ops",
-        ":scope",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
     ],
+    alwayslink = 1,
 )
 
 tf_cc_test(
@@ -441,6 +436,7 @@ cc_library_with_android_deps(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:op_gen_lib",
+        "//tensorflow/core:op_gen_overrides_proto_cc",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:protos_all_cc",
     ],
@@ -481,10 +477,23 @@ cc_binary(
     name = "tutorials_example_trainer",
     srcs = ["tutorials/example_trainer.cc"],
     copts = tf_copts(),
-    linkopts = [
-        "-lpthread",
-        "-lm",
-    ],
+    linkopts = select({
+        "//tensorflow:windows": [],
+        "//tensorflow:windows_msvc": [],
+        "//tensorflow:darwin": [
+            "-lm",
+            "-lpthread",
+        ],
+        "//tensorflow:ios": [
+            "-lm",
+            "-lpthread",
+        ],
+        "//conditions:default": [
+            "-lm",
+            "-lpthread",
+            "-lrt",
+        ],
+    }),
     deps = [
         ":cc_ops",
         "//tensorflow/core:core_cpu",
@@ -514,7 +523,6 @@ cc_library(
     deps = [
         ":coordinator",
         "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
@@ -547,8 +555,6 @@ cc_library(
     srcs = ["training/coordinator.cc"],
     hdrs = ["training/coordinator.h"],
     deps = [
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc
index 71aa986f918de68822d457422f6c7a73d6253819..80dd272f6f9dd5eecf5d7002bdf1c7c98e4c3ba3 100644
--- a/tensorflow/cc/framework/cc_op_gen.cc
+++ b/tensorflow/cc/framework/cc_op_gen.cc
@@ -18,8 +18,12 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/cc/framework/cc_op_gen.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
+#include "tensorflow/core/framework/op_gen_overrides.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.pb_text.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
diff --git a/tensorflow/cc/framework/gradients.cc b/tensorflow/cc/framework/gradients.cc
index 8c00a6f70497df2c70f266a747197e50c98375bb..66a943410e2757ea5a5c55351c1fc20d5a5e3154 100644
--- a/tensorflow/cc/framework/gradients.cc
+++ b/tensorflow/cc/framework/gradients.cc
@@ -65,7 +65,7 @@ class SymbolicGradientBuilder {
   // gradients for the node associated with `src`.
   Status BackpropAlongEdge(const Output& dst_grad, const Output& src);
 
-  // Adds a node to the graph (returned in`grad`) that sums the in-bound
+  // Adds a node to the graph (returned in `grad`) that sums the in-bound
   // gradients to `src` (if there are more than one).
   Status SumGradients(const Output& src, Output* grad);
 
@@ -152,12 +152,12 @@ Status SymbolicGradientBuilder::Initialize() {
   grad_outputs_->resize(inputs_.size());
   // Populate `output_nodes_` from node ids in `outputs_`.
   output_nodes_.reserve(outputs_.size());
-  for (int i = 0; i < outputs_.size(); ++i) {
+  for (size_t i = 0; i < outputs_.size(); ++i) {
     output_nodes_.insert(outputs_[i].node()->id());
   }
   // Populate `input_nodes_` from Outputs in `inputs_`.
   input_nodes_.reserve(inputs_.size());
-  for (int i = 0; i < inputs_.size(); ++i) {
+  for (size_t i = 0; i < inputs_.size(); ++i) {
     input_nodes_.insert({inputs_[i], i});
   }
 
@@ -341,7 +341,7 @@ Status SymbolicGradientBuilder::AddGradients() {
     // gradient function to the src node/output to which it should be
     // backproped. Maybe grad functions can return a vector of Output pairs to
     // make this association explicit.
-    int dx_index = 0;
+    size_t dx_index = 0;
     for (const Edge* e : n->in_edges()) {
       if (e->IsControlEdge()) continue;
       if (dx_index == dx.size()) {
@@ -352,6 +352,23 @@ Status SymbolicGradientBuilder::AddGradients() {
           BackpropAlongEdge(dx[dx_index++], {e->src(), e->src_output()}));
     }
   }
+
+  // Check if any input nodes still have pending gradients and have not been
+  // processed yet. This happens if not all outputs of a node are in 'inputs_'.
+  std::unordered_map<Node*, int> requested_grads;
+  for (const Output& nout : inputs_) {
+    if (pending_[nout.node()->id()] > 0) {
+      DCHECK_GT(nout.node()->num_outputs(), 1);
+      int idx = input_nodes_[nout];
+      DCHECK(((*grad_outputs_)[idx].node() == nullptr));
+      TF_RETURN_IF_ERROR(SumGradients(nout, &(*grad_outputs_)[idx]));
+      ++requested_grads[nout.node()];
+    }
+  }
+  for (const auto& p : requested_grads) {
+    int num_requested_inputs = p.first->num_outputs() - pending_[p.first->id()];
+    CHECK_EQ(num_requested_inputs, p.second);
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/cc/framework/gradients_test.cc b/tensorflow/cc/framework/gradients_test.cc
index 6a249825812b4d39b55f7170a35436b6ae88c020..24af7d567b267332610eba2c8c8c57681fa0559b 100644
--- a/tensorflow/cc/framework/gradients_test.cc
+++ b/tensorflow/cc/framework/gradients_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/cc/framework/grad_op_registry.h"
 #include "tensorflow/cc/framework/testutil.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -259,6 +260,42 @@ TEST_F(GradientsTest, StackUnstack_StopBackprop) {
   CompareTestAndExpectedGraphs();
 }
 
+TEST_F(GradientsTest, StackUnstack_SubsetOfUnstackOutputs) {
+  // Constructs an unstack with three outputs, and takes the gradient with
+  // respect to only two of the outputs. Tests that the output gradients are
+  // computed.
+  for (const bool expected : {false, true}) {
+    const Scope& scope = expected ? scope_expected_ : scope_test_;
+    // Construct forward graph.
+    auto c = Const(scope, 1, {3, 4, 2});
+    auto unpack = Unstack(scope, c, 3);
+    auto x = Identity(scope, unpack.output[0]);
+    auto y = Identity(scope, unpack.output[1]);
+    auto z = Identity(scope, unpack.output[2]);
+    TF_ASSERT_OK(scope.status());
+
+    // Construct grad inputs.
+    auto dy = Const(scope, 4, {4, 2});
+    auto dz = Const(scope, 5, {4, 2});
+
+    if (expected) {
+      // Construct backward graph.
+      auto g1 = Identity(scope, dy);
+      auto g2 = Identity(scope, dz);
+    } else {
+      // Call AddSymbolicGradients.
+      std::vector<Output> grad_outputs;
+      TF_ASSERT_OK(AddSymbolicGradients(scope, {y, z},
+                                        {unpack.output[1], unpack.output[2]},
+                                        {dy, dz}, &grad_outputs));
+      ASSERT_EQ(grad_outputs.size(), 2);
+      EXPECT_TRUE(grad_outputs[0].node() != nullptr);
+      EXPECT_TRUE(grad_outputs[1].node() != nullptr);
+    }
+  }
+  CompareTestAndExpectedGraphs();
+}
+
 TEST_F(GradientsTest, DependentGradOutputs) {
   // Tests that dependent gradients (in this case the gradients w.r.t to the
   // output and one input of MatMul) are computed properly.
diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc
index 32c0822de69da7989ceaa4028539db928b6fcea3..1948dd4e46b932775fdb5cbbdad7b66338b0fcf4 100644
--- a/tensorflow/cc/framework/scope.cc
+++ b/tensorflow/cc/framework/scope.cc
@@ -136,7 +136,7 @@ Scope::Impl::Impl(const std::shared_ptr<Graph>& graph,
 Scope Scope::NewRootScope() {
   Graph* graph = new Graph(OpRegistry::Global());
   ShapeRefiner* refiner =
-      new ShapeRefiner(graph->versions().producer(), graph->op_registry());
+      new ShapeRefiner(graph->versions(), graph->op_registry());
   return Scope(new Impl(graph, new Status, new Impl::NameMap, refiner));
 }
 
diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc
index 37f07e71a0dff9144f193679bbcfcf581c1538cf..6545e4ee3eb406436937a43ddac66d017af8e108 100644
--- a/tensorflow/cc/gradients/array_grad.cc
+++ b/tensorflow/cc/gradients/array_grad.cc
@@ -100,6 +100,17 @@ Status QuantizeAndDequantizeV2Grad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("QuantizeAndDequantizeV2", QuantizeAndDequantizeV2Grad);
 
+Status QuantizeAndDequantizeV3Grad(const Scope& scope, const Operation& op,
+                                   const std::vector<Output>& grad_inputs,
+                                   std::vector<Output>* grad_outputs) {
+  grad_outputs->push_back(Identity(scope, grad_inputs[0]));
+  grad_outputs->push_back(NoGradient());
+  grad_outputs->push_back(NoGradient());
+  grad_outputs->push_back(NoGradient());
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("QuantizeAndDequantizeV3", QuantizeAndDequantizeV3Grad);
+
 Status SplitGrad(const Scope& scope, const Operation& op,
                  const std::vector<Output>& grad_inputs,
                  std::vector<Output>* grad_outputs) {
@@ -247,6 +258,18 @@ Status ScatterNdGrad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("ScatterNd", ScatterNdGrad);
 
+Status ScatterNdNonAliasingAddGrad(const Scope& scope, const Operation& op,
+                                   const std::vector<Output>& grad_inputs,
+                                   std::vector<Output>* grad_outputs) {
+  auto indices = op.input(1);
+  grad_outputs->push_back(Identity(scope, grad_inputs[0]));
+  grad_outputs->push_back(NoGradient());
+  grad_outputs->push_back(GatherNd(scope, grad_inputs[0], indices));
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("ScatterNdNonAliasingAdd", ScatterNdNonAliasingAddGrad);
+
+template <bool IsPadV2>
 Status PadGrad(const Scope& scope, const Operation& op,
                const std::vector<Output>& grad_inputs,
                std::vector<Output>* grad_outputs) {
@@ -259,9 +282,14 @@ Status PadGrad(const Scope& scope, const Operation& op,
   auto begin = Reshape(scope, pad_before, {-1});
   grad_outputs->push_back(Slice(scope, grad_inputs[0], begin, Shape(scope, x)));
   grad_outputs->push_back(NoGradient());
+  // PadV2 adds a "constant_values" input.
+  if (IsPadV2) {
+    grad_outputs->push_back(NoGradient());
+  }
   return scope.status();
 }
-REGISTER_GRADIENT_OP("Pad", PadGrad);
+REGISTER_GRADIENT_OP("Pad", PadGrad<false>);
+REGISTER_GRADIENT_OP("PadV2", PadGrad<true>);
 
 Status SpaceToBatchGrad(const Scope& scope, const Operation& op,
                         const std::vector<Output>& grad_inputs,
diff --git a/tensorflow/cc/gradients/array_grad_test.cc b/tensorflow/cc/gradients/array_grad_test.cc
index 5798b5b509fc14e6c9d95d4fd42aca893254f775..1777e181451b267f52a418888912ed1393bdf8b1 100644
--- a/tensorflow/cc/gradients/array_grad_test.cc
+++ b/tensorflow/cc/gradients/array_grad_test.cc
@@ -233,6 +233,28 @@ TEST_F(ArrayGradTest, ScatterNdGrad_SliceIndexing) {
   RunTest(updates, updates_shape, y, y_shape);
 }
 
+TEST_F(ArrayGradTest, ScatterNdNonAliasingAddGrad_SimpleIndexing) {
+  TensorShape updates_shape({4});
+  TensorShape input_shape({8});
+  auto input = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(input_shape));
+  auto updates =
+      Placeholder(scope_, DT_FLOAT, Placeholder::Shape(updates_shape));
+  auto indices = Const(scope_, {{4}, {3}, {1}, {7}});
+  auto y = ScatterNdNonAliasingAdd(scope_, input, indices, updates);
+  RunTest({input, updates}, {input_shape, updates_shape}, {y}, {input_shape});
+}
+
+TEST_F(ArrayGradTest, ScatterNdNonAliasingAddGrad_SliceIndexing) {
+  TensorShape updates_shape({2, 4, 4});
+  TensorShape input_shape({4, 4, 4});
+  auto input = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(input_shape));
+  auto updates =
+      Placeholder(scope_, DT_FLOAT, Placeholder::Shape(updates_shape));
+  auto indices = Const(scope_, {{0}, {2}});
+  auto y = ScatterNdNonAliasingAdd(scope_, input, indices, updates);
+  RunTest({input, updates}, {input_shape, updates_shape}, {y}, {input_shape});
+}
+
 TEST_F(ArrayGradTest, PadGrad) {
   TensorShape x_shape({2, 3});
   auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc
index 71d9a8ed7be5ea75a3b26224df871b955f05c132..0b9b665b1eb4420827b152a88d9023ceab4d932d 100644
--- a/tensorflow/cc/gradients/math_grad.cc
+++ b/tensorflow/cc/gradients/math_grad.cc
@@ -203,6 +203,46 @@ Status TanhGrad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("Tanh", TanhGrad);
 
+Status AsinhGrad(const Scope& scope, const Operation& op,
+                 const std::vector<Output>& grad_inputs,
+                 std::vector<Output>* grad_outputs) {
+  // y = asinh(x)
+  // dy/dx = 1 / cosh(y)
+  auto dydx = Reciprocal(scope, Cosh(scope, op.output(0)));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("Asinh", AsinhGrad);
+
+Status AcoshGrad(const Scope& scope, const Operation& op,
+                 const std::vector<Output>& grad_inputs,
+                 std::vector<Output>* grad_outputs) {
+  // y = acosh(x)
+  // dy/dx = 1 / sinh(y)
+  auto dydx = Reciprocal(scope, Sinh(scope, op.output(0)));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("Acosh", AcoshGrad);
+
+Status AtanhGrad(const Scope& scope, const Operation& op,
+                 const std::vector<Output>& grad_inputs,
+                 std::vector<Output>* grad_outputs) {
+  // y = atanh(x)
+  // dy/dx = 1 / (1 - x^2)
+  auto one = Cast(scope, Const(scope, 1.0), op.input(0).type());
+  auto dydx = Reciprocal(scope, Sub(scope, one, Square(scope, op.input(0))));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("Atanh", AtanhGrad);
+
 Status SigmoidGrad(const Scope& scope, const Operation& op,
                    const std::vector<Output>& grad_inputs,
                    std::vector<Output>* grad_outputs) {
diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc
index 1653b04378f30bd788d549da04d4140ac7d6317e..48b3ddbe90c2313ec0aa50729f277a1c258de52c 100644
--- a/tensorflow/cc/gradients/math_grad_test.cc
+++ b/tensorflow/cc/gradients/math_grad_test.cc
@@ -48,6 +48,9 @@ class CWiseUnaryGradTest : public ::testing::Test {
     SINH,
     COSH,
     TANH,
+    ASINH,
+    ACOSH,
+    ATANH,
     SIGMOID,
     SIGN,
     SIN,
@@ -122,6 +125,15 @@ class CWiseUnaryGradTest : public ::testing::Test {
       case TANH:
         y = Tanh(scope_, x);
         break;
+      case ASINH:
+        y = Asinh(scope_, x);
+        break;
+      case ACOSH:
+        y = Acosh(scope_, x);
+        break;
+      case ATANH:
+        y = Atanh(scope_, x);
+        break;
       case SIGMOID:
         y = Sigmoid(scope_, x);
         break;
@@ -413,6 +425,76 @@ TEST_F(CWiseUnaryGradTest, Tanh_Complex) {
   TestCWiseGrad<complex64>(TANH, x_fn, dy_fn, dx_fn);
 }
 
+TEST_F(CWiseUnaryGradTest, Asinh) {
+  auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
+  auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
+  auto dx_fn = [this](const float x, const float dy) {
+    auto y = std::asinh(x);
+    return dy / std::cosh(y);
+  };
+  TestCWiseGrad<float>(ASINH, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Asinh_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    auto y = std::asinh(x);
+    return dy / conjugate(std::cosh(y));
+  };
+  TestCWiseGrad<complex64>(ASINH, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Acosh) {
+  auto x_fn = [this](const int i) { return RV({1, 2, 3, 4, 5, 6, 7}); };
+  auto dy_fn = [this](const float x) { return x + RV({8, 9, 10, 11, 12, 13, 14}); };
+  auto dx_fn = [this](const float x, const float dy) {
+    auto y = std::acosh(x);
+    return dy / std::sinh(y);
+  };
+  TestCWiseGrad<float>(ACOSH, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Acosh_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 1}, {2, 1}, {1, 4}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{2, 2}, {3, 3}, {1, 4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    auto y = std::acosh(x);
+    return dy / conjugate(std::sinh(y));
+  };
+  TestCWiseGrad<complex64>(ACOSH, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Atanh) {
+  auto x_fn = [this](const int i) { return RV({0, -0.5, 0.5, -0.1, 0.1}); };
+  auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
+  auto dx_fn = [this](const float x, const float dy) {
+    return dy * (1. / (1. - x * x));
+  };
+  TestCWiseGrad<float>(ATANH, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Atanh_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{0.1, 0}, {0, 0.1}, {0.2, -0.1}, {0.1, 0.2}, {0.3, 0.4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy / conjugate(one_ - x * x);
+  };
+  TestCWiseGrad<complex64>(ATANH, x_fn, dy_fn, dx_fn);
+}
+
 TEST_F(CWiseUnaryGradTest, Sigmoid) {
   auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
   auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index 5e5203d09055d65cb1dcc16e091f6e5028ee7ae1..f9d69ff8967e7c7d56f5771a8ccbd4091f7bc8c0 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -46,6 +46,19 @@ Status SoftmaxGrad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("Softmax", SoftmaxGrad);
 
+Status LogSoftmaxGrad(const Scope& scope, const Operation& op,
+                   const std::vector<Output>& grad_inputs,
+                   std::vector<Output>* grad_outputs) {
+
+  auto softmax = Exp(scope, op.output(0));
+  auto sum = Sum(scope, grad_inputs[0], {1}, Sum::KeepDims(true));
+  auto mul = Mul(scope, sum, softmax);
+  auto dx = Sub(scope, grad_inputs[0], mul);
+  grad_outputs->push_back(dx);
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("LogSoftmax", LogSoftmaxGrad);
+
 Status ReluGradHelper(const Scope& scope, const Operation& op,
                       const std::vector<Output>& grad_inputs,
                       std::vector<Output>* grad_outputs) {
@@ -73,6 +86,15 @@ Status EluGradHelper(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("Elu", EluGradHelper);
 
+Status SeluGradHelper(const Scope& scope, const Operation& op,
+                      const std::vector<Output>& grad_inputs,
+                      std::vector<Output>* grad_outputs) {
+  auto dx = internal::SeluGrad(scope, grad_inputs[0], op.output(0));
+  grad_outputs->push_back(dx);
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("Selu", SeluGradHelper);
+
 }  // anonymous namespace
 }  // namespace ops
 }  // namespace tensorflow
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index 70c9bd4e08b2b46866a44becc8fe1305fec48ea9..eab5b446261cc7c69a4aa3b26a2debd402c9bdd9 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -57,6 +57,19 @@ TEST_F(NNGradTest, SoftmaxGrad) {
   RunTest(x, shape, y, shape);
 }
 
+TEST_F(NNGradTest, LogSoftmaxGrad) {
+  TensorShape shape({5, 3});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+  auto y = LogSoftmax(scope_, x);
+  // Avoid numerical instability when computing finite differences.
+  Tensor x_init_value = test::AsTensor<float>(
+          {-0.9f, -0.7f, -0.5f, -0.3f, -0.1f,
+           0.1f, 0.3f, 0.5f, 0.7f, 0.8f,
+           -0.1f, 0.1f, 0.1f, 0.1f, 1.2f},
+          {5, 3});
+  RunTest(x, x_init_value, y, shape);
+}
+
 TEST_F(NNGradTest, ReluGrad) {
   TensorShape shape({5, 2});
   auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
@@ -90,5 +103,15 @@ TEST_F(NNGradTest, EluGrad) {
   RunTest(x, x_init_value, y, shape);
 }
 
+TEST_F(NNGradTest, SeluGrad) {
+  TensorShape shape({5, 2});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+  auto y = Selu(scope_, x);
+  Tensor x_init_value = test::AsTensor<float>(
+      {-0.9f, -0.7f, -0.5f, -0.3f, -0.1f, 0.1f, 0.3f, 0.5f, 0.7f, 0.9f},
+      {5, 2});
+  RunTest(x, x_init_value, y, shape);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/cc/ops/op_gen_overrides.pbtxt b/tensorflow/cc/ops/op_gen_overrides.pbtxt
index 1dffb10c03379571907e921c1add98d1f11625c3..2252cbb2892af9b0d9938a7864235d3d6b4ec005 100644
--- a/tensorflow/cc/ops/op_gen_overrides.pbtxt
+++ b/tensorflow/cc/ops/op_gen_overrides.pbtxt
@@ -100,6 +100,10 @@ op { name: "Stack" skip: true }
 op { name: "StackClose" skip: true }
 op { name: "StackPop" skip: true }
 op { name: "StackPush" skip: true }
+op { name: "StackV2" skip: true }
+op { name: "StackCloseV2" skip: true }
+op { name: "StackPopV2" skip: true }
+op { name: "StackPushV2" skip: true }
 
 op { name: "TensorArrayCloseV2" skip: true }
 op { name: "TensorArrayCloseV3" rename_to: "TensorArrayClose" }
@@ -173,6 +177,7 @@ op { name: "MaxPoolGradWithArgmax" hide: true }
 op { name: "ReluGrad" hide: true }
 op { name: "Relu6Grad" hide: true }
 op { name: "EluGrad" hide: true }
+op { name: "SeluGrad" hide: true }
 op { name: "SoftplusGrad" hide: true }
 op { name: "SoftsignGrad" hide: true }
 op { name: "FractionalAvgPoolGrad" hide: true }
diff --git a/tensorflow/cc/saved_model/constants.h b/tensorflow/cc/saved_model/constants.h
index 94a3b3cf465a279e3bb44344739499ad670119c3..c940df8a8761d97a859be3af30980ff79ca3577a 100644
--- a/tensorflow/cc/saved_model/constants.h
+++ b/tensorflow/cc/saved_model/constants.h
@@ -21,6 +21,9 @@ namespace tensorflow {
 /// SavedModel assets directory.
 constexpr char kSavedModelAssetsDirectory[] = "assets";
 
+/// SavedModel assets.extra directory.
+constexpr char kSavedModelAssetsExtraDirectory[] = "assets.extra";
+
 /// SavedModel assets key for graph collection-def.
 constexpr char kSavedModelAssetsKey[] = "saved_model_assets";
 
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index 807f5904afcf36890f4bd02f0d811a3ebe0cceba..f98abc8a817eca7bc129bb03a2ad31b97d957065 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/cc/saved_model/constants.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/monitoring/counter.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/protobuf_internal.h"
 #include "tensorflow/core/protobuf/saved_model.pb.h"
@@ -76,8 +77,16 @@ Status FindMetaGraphDefToLoad(const SavedModel& saved_model_proto,
       return Status::OK();
     }
   }
+  string tags_as_string = "{ ";
+  for (const string& tag : tags) {
+    tags_as_string = strings::StrCat(tags_as_string, tag, " ");
+  }
+  tags_as_string = strings::StrCat(tags_as_string, "}");
   return Status(error::Code::NOT_FOUND,
-                "Could not find meta graph def matching supplied tags.");
+                "Could not find meta graph def matching supplied tags: " +
+                    tags_as_string +
+                    ". To inspect available tag-sets in the SavedModel, please "
+                    "use the SavedModel CLI: `saved_model_cli`");
 }
 
 Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def,
diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc
index cef29e7b071e538a60193fd998acc0fb29c2cea3..0ad6b33bba5fcceaca68e2f179cef2232c689a80 100644
--- a/tensorflow/cc/saved_model/loader_test.cc
+++ b/tensorflow/cc/saved_model/loader_test.cc
@@ -133,9 +133,9 @@ TEST_F(LoaderTest, NoTagMatch) {
   Status st = LoadSavedModel(session_options, run_options, export_dir,
                              {"missing-tag"}, &bundle);
   EXPECT_FALSE(st.ok());
-  EXPECT_TRUE(
-      StringPiece(st.error_message())
-          .contains("Could not find meta graph def matching supplied tags."))
+  EXPECT_TRUE(StringPiece(st.error_message())
+                  .contains("Could not find meta graph def matching supplied "
+                            "tags: { missing-tag }"))
       << st.error_message();
 }
 
@@ -151,7 +151,7 @@ TEST_F(LoaderTest, NoTagMatchMultiple) {
   EXPECT_FALSE(st.ok());
   EXPECT_TRUE(
       StringPiece(st.error_message())
-          .contains("Could not find meta graph def matching supplied tags."))
+          .contains("Could not find meta graph def matching supplied tags: "))
       << st.error_message();
 }
 
diff --git a/tensorflow/cc/saved_model/tag_constants.h b/tensorflow/cc/saved_model/tag_constants.h
index 48ab1158e462af25c27a728e404a041516e82057..2b0b2d5c7fb33768494c1781669c1adcb875a579 100644
--- a/tensorflow/cc/saved_model/tag_constants.h
+++ b/tensorflow/cc/saved_model/tag_constants.h
@@ -18,10 +18,13 @@ limitations under the License.
 
 namespace tensorflow {
 
+/// Tag for the `gpu` graph.
+constexpr char kSavedModelTagGpu[] = "gpu";
+
 /// Tag for the `serving` graph.
 constexpr char kSavedModelTagServe[] = "serve";
 
-/// Tag for the `training` graph.`
+/// Tag for the `training` graph.
 constexpr char kSavedModelTagTrain[] = "train";
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index 1f6fe28188cfbb6a64935e4a3f70cf8e0f6eb9ad..f956602ba221bbbb3c2fc9c7df7d452da833c002 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -108,6 +108,7 @@ cc_test(
     deps = [
         ":tfcompile_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
@@ -126,16 +127,7 @@ cc_library(
     deps = [
         ":tfcompile_lib",
         ":tfcompile_proto",
-        "//tensorflow/compiler/xla/legacy_flags:alias_analysis_flags",
-        "//tensorflow/compiler/xla/legacy_flags:buffer_assignment_flags",
-        "//tensorflow/compiler/xla/legacy_flags:compiler_functor_flags",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_runtime_flags",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
-        "//tensorflow/compiler/xla/legacy_flags:hlo_graph_dumper_flags",
-        "//tensorflow/compiler/xla/legacy_flags:llvm_util_flags",
-        "//tensorflow/compiler/xla/legacy_flags:service_flags",
-        "//tensorflow/compiler/xla/legacy_flags:util_flags",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
@@ -161,6 +153,40 @@ tf_library(
     tags = ["manual"],
 )
 
+# A test of tf_library that includes a graph with an unknown op, but where
+# the compilation works because the unknown op is not needed for the fetches.
+tf_library(
+    name = "test_graph_tfunknownop",
+    testonly = 1,
+    config = "test_graph_tfunknownop.config.pbtxt",
+    cpp_class = "UnknownOpAddComp",
+    graph = "test_graph_tfunknownop.pbtxt",
+    tags = ["manual"],
+)
+
+# A test of tf_library that includes a graph with an unknown op, but where
+# the compilation works because the op between the unknown op and the
+# fetches is a feed.
+tf_library(
+    name = "test_graph_tfunknownop2",
+    testonly = 1,
+    config = "test_graph_tfunknownop2.config.pbtxt",
+    cpp_class = "UnknownOpAddComp",
+    graph = "test_graph_tfunknownop.pbtxt",
+    tags = ["manual"],
+)
+
+# A test of tf_library that includes a graph with an unknown op, but where
+# the compilation works because the unknown op is fed.
+tf_library(
+    name = "test_graph_tfunknownop3",
+    testonly = 1,
+    config = "test_graph_tfunknownop3.config.pbtxt",
+    cpp_class = "UnknownOpAddComp",
+    graph = "test_graph_tfunknownop.pbtxt",
+    tags = ["manual"],
+)
+
 # Utility library for benchmark binaries, used by the *_benchmark rules that are
 # added by the tfcompile bazel macro.
 cc_library(
@@ -204,6 +230,7 @@ test_suite(
     tests = [
         ":benchmark_test",
         ":test_graph_tfadd_test",
+        ":test_graph_tfunknownop_test",
         "//tensorflow/compiler/aot/tests:all_tests",
     ],
 )
diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index ca17c5ab690f606bd531638fece8b0a74cdd8c18..03bdd63623dcd1176c4598107281db9ad72e1947 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -24,6 +24,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/aot/flags.h"
 #include "tensorflow/compiler/aot/tfcompile_util.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
@@ -40,6 +41,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -77,66 +79,51 @@ Status DumpGraph(const MainFlags& flags, const string& name,
   return WriteTextProto(Env::Default(), file, graph_def);
 }
 
-string TensorIdToString(const TensorId& id) {
-  return strings::StrCat(id.node_name(), ":", id.output_index());
-}
-
 typedef std::unordered_map<string, Node*> NodeMap;
 
 // Each feed id identifies the positional output of some node, which may consist
-// of multiple edges.  For each feed node, replaces all matching edges so that
-// they point from a new _Arg node instead.
+// of multiple edges. AddPlaceholdersForFeeds has already replaced each fed
+// tensor with a placeholder.  For each feed tensor, replaces all edges so they
+// point from a new _Arg node instead.
 Status AddArgNodes(Graph* graph, const NodeMap& node_map,
-                   const protobuf::RepeatedPtrField<Feed>& feeds) {
+                   const protobuf::RepeatedPtrField<Feed>& feeds,
+                   const std::unordered_map<string, string>& feed_remapping) {
   for (int arg_index = 0; arg_index < feeds.size(); ++arg_index) {
     const Feed& feed = feeds[arg_index];
-    const TensorId& id = feed.id();
-    auto it = node_map.find(id.node_name());
-    if (it == node_map.end()) {
-      return errors::NotFound("Can't find feed id: ", TensorIdToString(id));
-    }
-    const Node* feed_node = it->second;
-    if (id.output_index() >= feed_node->num_outputs()) {
-      return errors::InvalidArgument("Invalid feed id: ", TensorIdToString(id),
-                                     ", output index should be < ",
-                                     feed_node->num_outputs());
-    }
-    // TODO(toddw): Invoke shape inference on the graph and add a "_shape" attr
-    // if we can determine it.  That way the graph will be initialized with
-    // whatever shapes we can infer, while the user can still explicitly specify
-    // or override them.
+    // All feeds have been replaced by placeholders.
+    const int output_index = 0;
+
+    const auto remap_it = feed_remapping.find(TensorIdToString(feed.id()));
+    auto node_it = node_map.find(remap_it->second);
+    const Node* feed_node = node_it->second;
+
+    // TODO(toddw): Invoke shape inference in AddPlaceholdersForFeeds and add a
+    // "_shape" attr if we can determine it.  That way the graph will be
+    // initialized with whatever shapes we can infer, while the user can still
+    // explicitly specify or override them.
     Node* arg_node = nullptr;
     TF_RETURN_IF_ERROR(
         NodeBuilder(strings::StrCat("_arg_", arg_index), kArgOp)
-            .Attr("T", BaseType(feed_node->output_type(id.output_index())))
+            .Attr("T", BaseType(feed_node->output_type(output_index)))
             .Attr("index", arg_index)
-            .Attr(kFeedIdAttr, TensorIdToString(id))
+            .Attr(kFeedIdAttr, TensorIdToString(feed.id()))
             .Attr(kShapeAttr, TensorShape(feed.shape()))
             .Attr(kDebugNameAttr, feed.name())
             .Finalize(graph, &arg_node));
+
     // Collects out-edges from the feed node that have a matching edge index;
-    // these will be replaced with edges from the arg node instead.  Also
-    // replaces all control edges from Placeholder feed nodes; similar code
-    // exists in subgraph::RewriteGraphForExecution.
-    // TODO(toddw): Why only replace control edges from Placeholder?
+    // these will be replaced with edges from the arg node instead.
     //
     // We must collect the edges first and process them in a second pass, since
     // removing the edge from the graph invalidates feed_node->out_edges.
     std::vector<const Edge*> feed_edges;
     for (const Edge* edge : feed_node->out_edges()) {
-      if (edge->src_output() == id.output_index() ||
-          (edge->src_output() == Graph::kControlSlot &&
-           feed_node->type_string() == "Placeholder")) {
+      if (edge->src_output() == output_index) {
         feed_edges.push_back(edge);
       }
     }
     for (const Edge* edge : feed_edges) {
-      if (edge->src_output() == id.output_index()) {
-        graph->AddEdge(arg_node, 0, edge->dst(), edge->dst_input());
-      } else {
-        CHECK_EQ(edge->src_output(), Graph::kControlSlot);
-        graph->AddControlEdge(arg_node, edge->dst());
-      }
+      graph->AddEdge(arg_node, 0, edge->dst(), edge->dst_input());
       graph->RemoveEdge(edge);
     }
   }
@@ -178,13 +165,16 @@ Status AddRetvalNodes(Graph* graph, const NodeMap& node_map,
 // fetch ids respectively), and rewrites the edges so that inputs flow from _Arg
 // nodes, and outputs flow to _Retval nodes.  This allows the symbolic graph
 // execution to know the input and output args for the generated function.
-Status RewriteAndPruneGraph(Graph* graph, const Config& config,
-                            const MainFlags& flags) {
+Status RewriteAndPruneGraph(
+    Graph* graph, const Config& config,
+    const std::unordered_map<string, string>& feed_remapping,
+    const MainFlags& flags) {
   NodeMap node_map;
   for (Node* n : graph->nodes()) {
     node_map[n->name()] = n;
   }
-  TF_RETURN_IF_ERROR(AddArgNodes(graph, node_map, config.feed()));
+  TF_RETURN_IF_ERROR(
+      AddArgNodes(graph, node_map, config.feed(), feed_remapping));
   std::unordered_set<const Node*> retval_nodes;
   TF_RETURN_IF_ERROR(
       AddRetvalNodes(graph, node_map, config.fetch(), &retval_nodes));
@@ -265,7 +255,9 @@ Status CreateXlaArgs(const Graph& graph,
     XlaCompiler::Argument arg;
     arg.kind = XlaCompiler::Argument::kParameter;
     TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "T", &arg.type));
-    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &arg.shape));
+    TensorShape shape;
+    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &shape));
+    TF_RETURN_IF_ERROR(TensorShapeToXLAShape(arg.type, shape, &arg.shape));
     TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kDebugNameAttr, &arg.name));
     xla_args->push_back(arg);
   }
@@ -378,14 +370,32 @@ Status CompileXla(xla::CompileOnlyClient* client,
 Status InitGraph(const GraphDef& graph_def, const Config& config,
                  const MainFlags& flags, std::unique_ptr<Graph>* graph) {
   TF_RETURN_IF_ERROR(ValidateConfig(config));
+
   FunctionLibraryDefinition flib_def(OpRegistry::Global(), graph_def.library());
   std::unique_ptr<Graph> g(new Graph(flib_def));
-  GraphDef copy_def(graph_def);
-  TF_RETURN_IF_ERROR(AddDefaultAttrsToGraphDef(&copy_def, *g->op_registry(),
-                                               0 /*node_offset*/));
+
+  // Replace references to fed tensors with references to newly added
+  // placeholders.
+  GraphDef first_copy_def = graph_def;
+
+  // Maps from name:port of a feed to the name:port of the placeholder to use.
+  std::unordered_map<string, string> feed_remapping;
+  TF_RETURN_IF_ERROR(AddPlaceholdersForFeeds(config, g->op_registry(),
+                                             &feed_remapping, &first_copy_def));
+
+  // Prune the GraphDef first so that unknown ops that we aren't compiling get
+  // filtered out.
+  GraphDef second_copy_def;
+  TF_RETURN_IF_ERROR(
+      PruneGraphDefInto(config, first_copy_def, &second_copy_def));
+
+  TF_RETURN_IF_ERROR(AddDefaultAttrsToGraphDef(
+      &second_copy_def, *g->op_registry(), 0 /*node_offset*/));
+
+  TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(GraphConstructorOptions(),
+                                            second_copy_def, g.get()));
   TF_RETURN_IF_ERROR(
-      ConvertGraphDefToGraph(GraphConstructorOptions(), copy_def, g.get()));
-  TF_RETURN_IF_ERROR(RewriteAndPruneGraph(g.get(), config, flags));
+      RewriteAndPruneGraph(g.get(), config, feed_remapping, flags));
   *graph = std::move(g);
   return Status::OK();
 }
diff --git a/tensorflow/compiler/aot/test_graph_tfadd.config.pbtxt b/tensorflow/compiler/aot/test_graph_tfadd.config.pbtxt
index 5625c0ab03893c997245a6449d145b9149b48627..f2d9c34b2d1d68aa80245a6f3379b3759bb9f4b9 100644
--- a/tensorflow/compiler/aot/test_graph_tfadd.config.pbtxt
+++ b/tensorflow/compiler/aot/test_graph_tfadd.config.pbtxt
@@ -6,7 +6,7 @@ feed {
   }
 }
 feed {
-  id { node_name: "y_const" }
+  id { node_name: "y_reshape" }
   shape {
     dim { size: 1 }
   }
diff --git a/tensorflow/compiler/aot/test_graph_tfadd.pbtxt b/tensorflow/compiler/aot/test_graph_tfadd.pbtxt
index 91c900e06d7547fe9a377a427b6ca56b9e46942d..665c9fe28721b25c544c30ecd1b4dfc399934314 100644
--- a/tensorflow/compiler/aot/test_graph_tfadd.pbtxt
+++ b/tensorflow/compiler/aot/test_graph_tfadd.pbtxt
@@ -4,15 +4,7 @@ node {
   attr {
     key: "value"
     value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 1
-      }
+      tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } int_val: 1 }
     }
   }
   attr {
@@ -28,15 +20,7 @@ node {
   attr {
     key: "value"
     value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 2
-      }
+      tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } int_val: 2 }
     }
   }
   attr {
@@ -46,11 +30,20 @@ node {
     }
   }
 }
+node {
+  name  : "y_reshape"
+  op    : "Reshape"
+  input : "y_const"
+  input : "y_shape"
+  attr { key: "T" value { type: DT_INT32 } }
+  # Attribute TShape not specified; needs to be set to its default
+  # by tfcompile.
+}
 node {
   name  : "x_y_sum"
   op    : "Add"
   input : "x_const"
-  input : "y_const"
+  input : "y_reshape"
   attr {
     key  : "T"
     value {
diff --git a/tensorflow/compiler/aot/test_graph_tfunknownop.config.pbtxt b/tensorflow/compiler/aot/test_graph_tfunknownop.config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5625c0ab03893c997245a6449d145b9149b48627
--- /dev/null
+++ b/tensorflow/compiler/aot/test_graph_tfunknownop.config.pbtxt
@@ -0,0 +1,16 @@
+# Text form of tensorflow.tfcompile.Config proto.
+feed {
+  id { node_name: "x_const" }
+  shape {
+    dim { size: 1 }
+  }
+}
+feed {
+  id { node_name: "y_const" }
+  shape {
+    dim { size: 1 }
+  }
+}
+fetch {
+  id { node_name: "x_y_sum" }
+}
diff --git a/tensorflow/compiler/aot/test_graph_tfunknownop.pbtxt b/tensorflow/compiler/aot/test_graph_tfunknownop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..48b881bb9462dc30944a1377d4d2a2c58b9dfe43
--- /dev/null
+++ b/tensorflow/compiler/aot/test_graph_tfunknownop.pbtxt
@@ -0,0 +1,58 @@
+node {
+  name  : "x_const"
+  op    : "Const"
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape { dim { size: 1 } }
+        int_val: 1
+      }
+    }
+  }
+  attr { key  : "dtype" value { type: DT_INT32 } }
+}
+node {
+  name  : "y_const"
+  op    : "Const"
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape { dim { size: 1 } }
+        int_val: 2
+      }
+    }
+  }
+  attr { key: "dtype" value { type: DT_INT32 } }
+}
+node {
+  name  : "x_y_sum"
+  op    : "Add"
+  input : "x_const"
+  input : "y_const"
+  attr { key  : "T" value { type: DT_INT32 } }
+}
+node {
+  name  : "z"
+  op    : "SomeUnknownOp"
+  input : "x_const"
+}
+node {
+  name  : "z_identity"
+  op    : "Identity"
+  input : "z:1"
+  attr { key  : "T" value { type: DT_INT32 } }
+}
+node {
+  name  : "x_z_sum"
+  op    : "Add"
+  input : "x_const"
+  input : "z_identity"
+  attr { key  : "T" value { type: DT_INT32 } }
+}
+versions {
+  producer: 15
+}
diff --git a/tensorflow/compiler/aot/test_graph_tfunknownop2.config.pbtxt b/tensorflow/compiler/aot/test_graph_tfunknownop2.config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7370ed370d314052ed23d4ceca22cab7def65485
--- /dev/null
+++ b/tensorflow/compiler/aot/test_graph_tfunknownop2.config.pbtxt
@@ -0,0 +1,25 @@
+# Text form of tensorflow.tfcompile.Config proto.
+feed {
+  id { node_name: "x_const" }
+  shape {
+    dim { size: 1 }
+  }
+}
+feed {
+  id { node_name: "y_const" }
+  shape {
+    dim { size: 1 }
+  }
+}
+feed {
+  id { node_name: "z_identity"}
+  shape {
+    dim { size: 1 }
+  }
+}
+fetch {
+  id { node_name: "x_y_sum" }
+}
+fetch {
+  id { node_name: "x_z_sum" }
+}
diff --git a/tensorflow/compiler/aot/test_graph_tfunknownop3.config.pbtxt b/tensorflow/compiler/aot/test_graph_tfunknownop3.config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b2d7d5457427775fe2f00e079ced6b23c3308230
--- /dev/null
+++ b/tensorflow/compiler/aot/test_graph_tfunknownop3.config.pbtxt
@@ -0,0 +1,26 @@
+# Text form of tensorflow.tfcompile.Config proto.
+feed {
+  id { node_name: "x_const" }
+  shape {
+    dim { size: 1 }
+  }
+}
+feed {
+  id { node_name: "y_const" }
+  shape {
+    dim { size: 1 }
+  }
+}
+feed {
+  id { node_name: "z" output_index: 1}
+  shape {
+    dim { size: 1 }
+  }
+  type: DT_INT32
+}
+fetch {
+  id { node_name: "x_y_sum" }
+}
+fetch {
+  id { node_name: "x_z_sum" }
+}
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl
index 4be4e0fbb39c710e64478ee6b98a8dd1fc0441b9..12e1485b484d6cb9f3f896db567e9a6fae719943 100644
--- a/tensorflow/compiler/aot/tfcompile.bzl
+++ b/tensorflow/compiler/aot/tfcompile.bzl
@@ -27,6 +27,15 @@ def tf_library(name, graph, config,
                deps=None, tags=None):
   """Runs tfcompile to compile a TensorFlow graph into executable code.
 
+  Given an invocation of tf_library(name="foo", ...), generates the following
+  build targets:
+    foo:           A cc_library containing the generated header and computation.
+    foo_test:      A cc_test with simple tests and benchmarks. Only created if
+                   gen_test=True.
+    foo_benchmark: A cc_binary that runs a minimal-dependency benchmark, useful
+                   for mobile devices or other platforms that can't compile the
+                   full test libraries. Only created if gen_benchmark=True.
+
   Args:
     name: The name of the build rule.
     graph: The TensorFlow GraphDef to compile.  If the file ends in '.pbtxt' it
diff --git a/tensorflow/compiler/aot/tfcompile.proto b/tensorflow/compiler/aot/tfcompile.proto
index be3f5043501c71c844a00b5a5b23fa4285c00ec6..cd83840d894f2a28ca70c54f3320a6287b4a0a20 100644
--- a/tensorflow/compiler/aot/tfcompile.proto
+++ b/tensorflow/compiler/aot/tfcompile.proto
@@ -7,6 +7,7 @@ option java_multiple_files = true;
 option java_package = "org.tensorflow.tfcompile";
 
 import "tensorflow/core/framework/tensor_shape.proto";
+import "tensorflow/core/framework/types.proto";
 
 // TensorId identifies a tensor in a TensorFlow graph, by specifying the output
 // index of a particular node in the graph.  If the output of the named node
@@ -23,6 +24,12 @@ message Feed {
   TensorId id = 1;
   TensorShapeProto shape = 2;
   string name = 3;  // Optional name for generated code.
+
+  // Optional data type. This is not normally required, as the graph itself
+  // contains this information. However, if the node being fed is an op that
+  // is not linked into the tfcompile binary, then the type cannot be inferred
+  // from the node; in this case, the type should be set here.
+  DataType type = 4;
 };
 
 // Fetch represents a single fetch tensor in the graph, which corresponds to an
diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc
index 6fed46b4329606baeed21dd9ee4d34849a7c50a0..be2cfe4734e0493ba41a1bda23606a65d2cb4af4 100644
--- a/tensorflow/compiler/aot/tfcompile_main.cc
+++ b/tensorflow/compiler/aot/tfcompile_main.cc
@@ -23,16 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/aot/flags.h"
 #include "tensorflow/compiler/aot/tfcompile.pb.h"
 #include "tensorflow/compiler/aot/tfcompile_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/alias_analysis_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/buffer_assignment_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/compiler_functor_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_runtime_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/llvm_util_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/service_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/util_flags.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph.pb.h"
@@ -133,19 +124,11 @@ int main(int argc, char** argv) {
   flags.target_triple = "x86_64-pc-linux";
   flags.out_object = "out.o";
   flags.out_header = "out.h";
+  flags.entry_point = "entry";
 
   std::vector<tensorflow::Flag> flag_list;
   AppendMainFlags(&flag_list, &flags);
-  xla::legacy_flags::AppendAliasAnalysisFlags(&flag_list);
-  xla::legacy_flags::AppendBufferAssignmentFlags(&flag_list);
-  xla::legacy_flags::AppendCompilerFunctorFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
-  xla::legacy_flags::AppendCpuRuntimeFlags(&flag_list);
-  xla::legacy_flags::AppendHloGraphDumperFlags(&flag_list);
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendLlvmUtilFlags(&flag_list);
-  xla::legacy_flags::AppendServiceFlags(&flag_list);
-  xla::legacy_flags::AppendUtilFlags(&flag_list);
 
   tensorflow::string usage = tensorflow::tfcompile::kUsageHeader;
   usage += tensorflow::Flags::Usage(argv[0], flag_list);
diff --git a/tensorflow/compiler/aot/tfcompile_util.cc b/tensorflow/compiler/aot/tfcompile_util.cc
index fd073a2e2623b4b24ddc58360525886f3fc1b3ac..e6a4705b6c24eccac6528c64d030f9e37eb5c3f4 100644
--- a/tensorflow/compiler/aot/tfcompile_util.cc
+++ b/tensorflow/compiler/aot/tfcompile_util.cc
@@ -15,13 +15,19 @@ limitations under the License.
 
 #include "tensorflow/compiler/aot/tfcompile_util.h"
 
+#include <queue>
 #include <set>
+#include <unordered_map>
 
 #include "tensorflow/compiler/aot/tfcompile.pb.h"
+#include "tensorflow/core/framework/graph_def_util.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
 namespace tfcompile {
@@ -115,5 +121,164 @@ Status ValidateConfig(const Config& config) {
   return Status::OK();
 }
 
+Status AddPlaceholdersForFeeds(
+    const Config& config, const OpRegistryInterface* op_registry,
+    std::unordered_map<string, string>* feed_remapping, GraphDef* graph_def) {
+  struct PlaceholderInfo {
+    const Feed* feed = nullptr;  // point to Feed in <config>.
+    string placeholder_name;
+    DataType data_type = DT_INVALID;
+  };
+
+  // Put each fed tensor into a map by name:port. A map is used for determinism
+  // when creating placeholders (genrules want deterministic output).
+  std::map<string, PlaceholderInfo> placeholder_info;
+  for (int i = 0; i < config.feed_size(); ++i) {
+    const Feed* feed = &config.feed(i);
+    const string name_port = TensorIdToString(feed->id());
+    auto& info = placeholder_info[name_port];
+    info.feed = feed;
+    info.placeholder_name = strings::StrCat(
+        "aot_feed_", feed->id().output_index(), "/", feed->id().node_name());
+    (*feed_remapping)[name_port] = info.placeholder_name;
+  }
+
+  // Verify node exists and determine data type.
+  std::unordered_map<string, const NodeDef*> name_to_node;
+  for (int i = 0; i < graph_def->node_size(); ++i) {
+    name_to_node[graph_def->node(i).name()] = &graph_def->node(i);
+  }
+  for (auto it = placeholder_info.begin(); it != placeholder_info.end(); ++it) {
+    PlaceholderInfo& info = it->second;
+    const TensorId& feed_id = info.feed->id();
+
+    // Find the existing node and determine data type.
+    auto node_it = name_to_node.find(feed_id.node_name());
+    if (node_it == name_to_node.end()) {
+      return errors::NotFound("Can't find feed node: ",
+                              TensorIdToString(feed_id));
+    }
+    const NodeDef* existing = node_it->second;
+
+    if (info.feed->type() != DT_INVALID) {
+      info.data_type = info.feed->type();
+    } else {
+      // Build the node in order to infer its type.
+
+      // Must first add default attrs as well, so do this in a copied GraphDef.
+      GraphDef gd;
+      *gd.mutable_versions() = graph_def->versions();
+      *gd.add_node() = *existing;
+      TF_RETURN_IF_ERROR(
+          AddDefaultAttrsToGraphDef(&gd, *op_registry, 0 /*node_offset*/));
+
+      // Now build the node from the copied node def.
+      Graph g(op_registry);
+      g.set_versions(graph_def->versions());
+      Status status;
+      Node* feed_node = g.AddNode(gd.node(0), &status);
+      TF_RETURN_IF_ERROR(status);
+      info.data_type =
+          BaseType(feed_node->output_type(info.feed->id().output_index()));
+    }
+  }
+
+  // Create placeholders. Note that we could avoid creating a placeholder for
+  // feeds which are already placeholders, but we omit that to avoid more cases
+  // in this code.
+  for (auto it = placeholder_info.begin(); it != placeholder_info.end(); ++it) {
+    const PlaceholderInfo& info = it->second;
+    NodeDef* d = graph_def->add_node();
+    d->set_name(info.placeholder_name);
+    d->set_op("PlaceholderV2");
+    auto& attr_map = *d->mutable_attr();
+    attr_map["dtype"].set_type(info.data_type);
+    *attr_map["shape"].mutable_shape() = info.feed->shape();
+  }
+
+  // Rewrite references to the fed tensors to refer to the placeholder.
+  for (int i = 0; i < graph_def->node_size(); ++i) {
+    NodeDef* node_def = graph_def->mutable_node(i);
+    for (int j = 0; j < node_def->input_size(); ++j) {
+      auto id = ParseTensorName(node_def->input(j));
+      auto it = placeholder_info.find(id.ToString());
+      if (it != placeholder_info.end()) {
+        node_def->set_input(j, it->second.placeholder_name);
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status PruneGraphDefInto(const Config& config, const GraphDef& in,
+                         GraphDef* out) {
+  *out = in;
+  out->clear_node();
+
+  // Tensors needed for feeding.
+  std::set<std::pair<string, int>> feed_tensors;
+  for (const auto& feed_config : config.feed()) {
+    feed_tensors.insert(std::make_pair(feed_config.id().node_name(),
+                                       feed_config.id().output_index()));
+  }
+
+  // Maps node name to reachability.
+  std::unordered_map<string, std::pair<bool, const NodeDef*>> node_by_name;
+  for (const NodeDef& node : in.node()) {
+    node_by_name[node.name()] = std::pair<bool, const NodeDef*>(false, &node);
+  }
+
+  // Traverse.
+  std::queue<string> name_queue;
+  for (int i = 0; i < config.fetch_size(); ++i) {
+    name_queue.push(config.fetch(i).id().node_name());
+  }
+  while (!name_queue.empty()) {
+    const string name = name_queue.front();
+    name_queue.pop();
+
+    auto find_it = node_by_name.find(name);
+    if (find_it == node_by_name.end()) {
+      return errors::InvalidArgument("While pruning graph, node ", name,
+                                     " needed but not found in the graph.");
+    }
+    auto& map_entry = find_it->second;
+    if (map_entry.first) {
+      continue;
+    }
+    map_entry.first = true;
+
+    // Push input nodes of the currently visited node to name_queue.
+    for (const string& in_edge : map_entry.second->input()) {
+      auto id = ParseTensorName(in_edge);
+      const string node_name = id.first.ToString();
+      if (feed_tensors.find(std::make_pair(node_name, id.second)) ==
+          feed_tensors.end()) {
+        name_queue.push(node_name);
+      } else {
+        // The input tensor is from an edge that is being fed. Therefore,
+        // we skip recursing down that edge, to avoid requiring nodes that
+        // may not be needed (note that the input node may still be added
+        // to name_queue later if one of its output edges is not being fed).
+      }
+    }
+  }
+
+  // Copy over, preserving order of original and only nodes that are reachable
+  // from the fetches.
+  out->mutable_node()->Reserve(in.node_size());
+  for (const NodeDef& node : in.node()) {
+    if (node_by_name[node.name()].first) {
+      *out->add_node() = node;
+    }
+  }
+  return Status::OK();
+}
+
+string TensorIdToString(const TensorId& id) {
+  return strings::StrCat(id.node_name(), ":", id.output_index());
+}
+
 }  // namespace tfcompile
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/aot/tfcompile_util.h b/tensorflow/compiler/aot/tfcompile_util.h
index 651d75d0d02bdac110159996498778d2c57ddf78..365f7b0e7b19a495ade13a7cff4140cdae68cad2 100644
--- a/tensorflow/compiler/aot/tfcompile_util.h
+++ b/tensorflow/compiler/aot/tfcompile_util.h
@@ -16,7 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_AOT_TFCOMPILE_UTIL_H_
 #define TENSORFLOW_COMPILER_AOT_TFCOMPILE_UTIL_H_
 
+#include <unordered_map>
+
 #include "tensorflow/compiler/aot/tfcompile.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 
@@ -30,6 +34,23 @@ Status ValidateCppIdent(StringPiece ident, StringPiece msg);
 // ValidateConfig returns OK iff config is valid.
 Status ValidateConfig(const Config& config);
 
+// Modifies <graph_def> to include placeholders for each fed tensor, and
+// update references to the fed tensors to refer to the placeholders.
+// The existing nodes referenced by the feeds are not removed or modified
+// (except where their input edges are modified by the replacement of other
+// feeds).
+Status AddPlaceholdersForFeeds(
+    const Config& config, const OpRegistryInterface* op_registry,
+    std::unordered_map<string, string>* feed_remapping, GraphDef* graph_def);
+
+// Returns in <out> a copy of <in>, pruned to only include fetches from
+// <config>.
+Status PruneGraphDefInto(const Config& config, const GraphDef& in,
+                         GraphDef* out);
+
+// Returns node:port for the given <id>.
+string TensorIdToString(const TensorId& id);
+
 }  // namespace tfcompile
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/aot/tfcompile_util_test.cc b/tensorflow/compiler/aot/tfcompile_util_test.cc
index c321d3ff4c779fbd2e9c67dfc1eb24c734a9103f..5a92851ceb972ca63a8a3845eb4730fe198762dd 100644
--- a/tensorflow/compiler/aot/tfcompile_util_test.cc
+++ b/tensorflow/compiler/aot/tfcompile_util_test.cc
@@ -15,9 +15,11 @@ limitations under the License.
 
 #include "tensorflow/compiler/aot/tfcompile_util.h"
 
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -180,6 +182,65 @@ TEST(ValidateConfig, ConflictingFetchName) {
   ExpectErrorContains(ValidateConfig(config), "conflicting fetch name");
 }
 
+static Config FetchesConfig(std::vector<string> fetches) {
+  Config config;
+  for (const auto& fetch_node_name : fetches) {
+    auto* fetch = config.add_fetch();
+    fetch->set_name(strings::StrCat("fetch_", fetch_node_name));
+    fetch->mutable_id()->set_node_name(fetch_node_name);
+  }
+  return config;
+}
+
+TEST(PruneGraphDefInto, Basic) {
+  GraphDef def;
+  auto* n = def.add_node();
+  n->set_name("a");
+  n->add_input("b:0");
+  n->add_input("^c");
+
+  GraphDef copy;
+  ExpectErrorContains(PruneGraphDefInto(FetchesConfig({"missing"}), def, &copy),
+                      "node missing needed");
+  ExpectErrorContains(PruneGraphDefInto(FetchesConfig({"a"}), def, &copy),
+                      "node b needed");
+
+  n = def.add_node();
+  n->set_name("b");
+  ExpectErrorContains(PruneGraphDefInto(FetchesConfig({"a"}), def, &copy),
+                      "node c needed");
+  n->add_input("d:1");
+
+  n = def.add_node();
+  n->set_name("c");
+  n->add_input("d:1");
+
+  n = def.add_node();
+  n->set_name("d");
+
+  // Graph is full, no pruning done.
+  // Graph right now has diamond from d:
+  //   d --> b --> a
+  //   d --> c --> a
+  TF_EXPECT_OK(PruneGraphDefInto(FetchesConfig({"a"}), def, &copy));
+  EXPECT_EQ(def.DebugString(), copy.DebugString());
+  GraphDef pruned_a = copy;
+
+  // Add some unrelated fields that use b and c, but are not needed for a.
+  n = def.add_node();
+  n->set_name("e");
+  n->add_input("^d");
+  n->add_input("b:2");
+  copy.Clear();
+  TF_EXPECT_OK(PruneGraphDefInto(FetchesConfig({"a"}), def, &copy));
+  EXPECT_EQ(pruned_a.DebugString(), copy.DebugString());
+
+  // Fetch "a" and "e" to get the original graph.
+  copy.Clear();
+  TF_EXPECT_OK(PruneGraphDefInto(FetchesConfig({"a", "e"}), def, &copy));
+  EXPECT_EQ(def.DebugString(), copy.DebugString());
+}
+
 }  // namespace
 }  // namespace tfcompile
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 5f857191da78ddd68c5689f9c4f467c01300ca7c..625eb08f1b5a334b0b5b44324c27cab93772a177 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -15,27 +15,16 @@ package_group(
 )
 
 package(
-    default_visibility = [":internal"],
+    default_visibility = [
+        ":internal",
+        "//tensorflow/compiler/plugin/executor:__pkg__",
+    ],
 )
 
 load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 
-# This target can be used by XLA device plugins to prevent circular
-# dependencies, and provides access to all of the required headers
-# for building a device library.
-cc_header_only_library(
-    name = "xla_jit_headers_lib",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":xla_cpu_device",
-        ":xla_cpu_jit",
-        ":xla_gpu_device",
-        ":xla_gpu_jit",
-    ],
-)
-
 # Target that bundles up the XLA CPU and GPU JIT devices.
 cc_library(
     name = "jit",
@@ -150,6 +139,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:tensorflow_opensource",
         "//tensorflow/core/kernels:constant_op",
@@ -243,18 +233,38 @@ cc_library(
     hdrs = ["union_find.h"],
 )
 
+cc_test(
+    name = "graph_to_functiondef_test",
+    size = "small",
+    srcs = [
+        "graph_to_functiondef_test.cc",
+    ],
+    deps = [
+        ":graph_to_functiondef",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:ops",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/tf2xla/kernels:xla_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 cc_test(
     name = "compilation_passes_test",
     size = "small",
     srcs = [
         "encapsulate_subgraphs_pass_test.cc",
-        "graph_to_functiondef_test.cc",
         "mark_for_compilation_pass_test.cc",
     ],
     deps = [
         ":common",
         ":compilation_passes",
-        ":graph_to_functiondef",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
         "//tensorflow/cc:function_ops",
@@ -283,3 +293,15 @@ filegroup(
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
+
+# This target can be used by XLA device plugins to prevent circular dependencies, and provides access to all of the required headers for building a device library.
+cc_header_only_library(
+    name = "xla_jit_headers_lib",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":xla_cpu_device",
+        ":xla_cpu_jit",
+        ":xla_gpu_device",
+        ":xla_gpu_jit",
+    ],
+)
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 14d8f2ab351bd99dd3fe42a9ac6e31062d552ff0..a1ddad3e9b8191ee4d783136d2b509ec15d993d1 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def_builder.h"
diff --git a/tensorflow/compiler/jit/graph_to_functiondef.cc b/tensorflow/compiler/jit/graph_to_functiondef.cc
index 5cdbebd88ee458e5ff332c7a3fe5d736af112ca9..6fa21fa6204dcc9446081d07e2a59ccace216713 100644
--- a/tensorflow/compiler/jit/graph_to_functiondef.cc
+++ b/tensorflow/compiler/jit/graph_to_functiondef.cc
@@ -151,8 +151,8 @@ Status GraphToFunctionDef(const Graph& graph, const string& name,
       argdef->set_type(type);
       const string normalized = node_names.Normalize(node->name());
       argdef->set_name(normalized);
-      CHECK_EQ(node->in_edges().size(), 1) << node->DebugString();
-      Edge const* edge = *node->in_edges().begin();
+      Edge const* edge;
+      TF_CHECK_OK(node->input_edge(0, &edge));
       return_values[normalized] =
           strings::StrCat(edge->src()->name(), ":", edge->src_output());
       continue;
diff --git a/tensorflow/compiler/jit/graph_to_functiondef_test.cc b/tensorflow/compiler/jit/graph_to_functiondef_test.cc
index 5c09e96a4c2817e5a871a91ca6c68de87dc3b762..676db7c4dd2fd7047e8ae9bb190daf18af6ac7cf 100644
--- a/tensorflow/compiler/jit/graph_to_functiondef_test.cc
+++ b/tensorflow/compiler/jit/graph_to_functiondef_test.cc
@@ -82,5 +82,38 @@ TEST(GraphToFunctionDefTest, Basics) {
   EXPECT_TRUE(fdefs_equal) << diff;
 }
 
+// Regression test for a crash if there was a control edge to a _Retval node.
+TEST(GraphToFunctionDefTest, ControlDependencies) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(root.WithOpName("a"), DT_FLOAT, 0);
+  auto b = ops::Neg(root.WithOpName("b").WithControlDependencies(a), a);
+  auto c = ops::_Retval(root.WithOpName("c").WithControlDependencies(b), b, 0);
+
+  GraphDef graph_def;
+  TF_EXPECT_OK(root.ToGraphDef(&graph_def));
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  GraphConstructorOptions options;
+  TF_EXPECT_OK(ConvertGraphDefToGraph(options, graph_def, graph.get()));
+
+  FunctionDef fdef;
+  TF_EXPECT_OK(GraphToFunctionDef(*graph, "test_fn", &fdef));
+
+  FunctionDef fdef_expected = FunctionDefHelper::Create(
+      "test_fn",     // function name
+      {"a: float"},  // inputs
+      {"c: float"},  // outputs
+      {},            // attrs
+      {
+          // nodes in the function body
+          {{"b"}, "Neg", {"a", "^a"}, {{"T", DT_FLOAT}}},
+      },
+      {{"c", "b:y:0"}});  // return values
+
+  string diff;
+  bool fdefs_equal = EqualFunctionDef(fdef_expected, fdef, &diff);
+  EXPECT_TRUE(fdefs_equal) << diff;
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD
index c4116cb8b52adc191e9f695bc9a6e0cf413b4b5c..354c0fabfc78bcb9f5d63e84edc224fc33650ea9 100644
--- a/tensorflow/compiler/jit/kernels/BUILD
+++ b/tensorflow/compiler/jit/kernels/BUILD
@@ -2,6 +2,7 @@ licenses(["notice"])  # Apache 2.0
 
 package(
     default_visibility = [
+        "//tensorflow/compiler/plugin/executor:__pkg__",
         "//tensorflow/compiler/tf2xla:internal",
     ],
 )
@@ -35,9 +36,11 @@ cc_library(
         "//tensorflow/compiler/jit:common",
         "//tensorflow/compiler/jit:xla_compilation_cache",
         "//tensorflow/compiler/jit:xla_device",
+        "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/jit/kernels/parallel_check_op.cc b/tensorflow/compiler/jit/kernels/parallel_check_op.cc
index c86e03118b53ddf4865b7995b1d197c3ef07ba29..bd4eefbc0bb960f8ddc1d238057e73a29a098f26 100644
--- a/tensorflow/compiler/jit/kernels/parallel_check_op.cc
+++ b/tensorflow/compiler/jit/kernels/parallel_check_op.cc
@@ -64,7 +64,7 @@ class ParallelCheckOp : public OpKernel {
           ok = (diff <= tolerance);
         }
         if (ok) continue;
-        LOG(ERROR) << "Op " << def().name() << " fails equality at output "
+        LOG(ERROR) << "Op " << name() << " fails equality at output "
                    << input_idx << " type " << DataTypeString(dtype)
                    << " element " << i << ": std_val=" << p0[i]
                    << " test_val=" << p1[i] << " diff=" << (p0[i] - p1[i]);
@@ -75,7 +75,7 @@ class ParallelCheckOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    VLOG(1) << "Compute " << def().name();
+    VLOG(1) << "Compute " << name();
     const int num_pairs = ctx->num_inputs() / 2;
     for (int i = 0; i < num_pairs; ++i) {
       CHECK_EQ(ctx->input_dtype(i), ctx->input_dtype(i + num_pairs));
@@ -113,7 +113,7 @@ class ParallelCheckOp : public OpKernel {
           LOG(FATAL) << "unimpl: " << ctx->input_dtype(i);
       }
       if (failed > 0) {
-        LOG(ERROR) << "check failed for " << def().name() << " output " << i
+        LOG(ERROR) << "check failed for " << name() << " output " << i
                    << " num_elts: " << num_elts;
         legacy_flags::ParallelCheckOpFlags* flags =
             legacy_flags::GetParallelCheckOpFlags();
@@ -121,7 +121,7 @@ class ParallelCheckOp : public OpKernel {
           LOG(QFATAL) << "failfast on first parallel-check failure";
         }
       } else {
-        VLOG(1) << "check passed for " << def().name() << " output " << i
+        VLOG(1) << "check passed for " << name() << " output " << i
                 << " num_elts: " << num_elts;
       }
 
diff --git a/tensorflow/compiler/jit/kernels/xla_device_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_device_launch_op.cc
index 29c5ff724299ec84d31268c4227259ec02d10742..2b77e5aaf4e0983354c14a4e20656af0e0e4f84b 100644
--- a/tensorflow/compiler/jit/kernels/xla_device_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_device_launch_op.cc
@@ -18,7 +18,9 @@ limitations under the License.
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/jit/xla_device_context.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -149,6 +151,8 @@ void XlaDeviceLaunchOp::Compute(OpKernelContext* ctx) {
     xla::ExecutionOptions execution_options;
     *execution_options.mutable_shape_with_output_layout() =
         kernel->xla_output_shape;
+    *execution_options.mutable_debug_options() =
+        xla::legacy_flags::GetDebugOptionsFromFlags();
     Env* env = Env::Default();
     auto start_time = env->NowMicros();
     VLOG(1) << "Executing XLA Computation...";
@@ -202,11 +206,14 @@ void XlaDeviceLaunchOp::Compute(OpKernelContext* ctx) {
 
   // Apply variable updates, if any.
   VLOG(2) << "Applying variable updates";
-  for (int i = 0; i < kernel->variable_updates.size(); ++i) {
-    const XlaCompiler::VariableUpdate& write = kernel->variable_updates[i];
+  for (int i = 0; i < kernel->resource_updates.size(); ++i) {
+    const XlaCompiler::ResourceUpdate& write = kernel->resource_updates[i];
     OP_REQUIRES(ctx,
                 write.input_index >= 0 && write.input_index < ctx->num_inputs(),
                 errors::Internal("Invalid input index for variable write."));
+    TensorShape write_shape;
+    OP_REQUIRES_OK(ctx, XLAShapeToTensorShape(write.shape, &write_shape));
+
     // This code is very close to being a clone of AssignVariableOp, but the
     // key difference is that the contents of an XLA device tensor cannot be
     // copied safely; instead we must use
@@ -214,26 +221,27 @@ void XlaDeviceLaunchOp::Compute(OpKernelContext* ctx) {
     Var* variable = nullptr;
     // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor, not
     // a Tensor.
-    OP_REQUIRES_OK(ctx, LookupOrCreateResource<Var>(
-                            ctx, HandleFromInput(ctx, write.input_index),
-                            &variable, [this, ctx, &write](Var** ptr) {
-                              *ptr = new Var(write.type);
-                              PersistentTensor unused;
-                              Tensor* tmp;
-                              TF_RETURN_IF_ERROR(ctx->allocate_persistent(
-                                  write.type, write.shape, &unused, &tmp));
-                              *(*ptr)->tensor() = *tmp;
-                              return Status::OK();
-                            }));
+    OP_REQUIRES_OK(ctx,
+                   LookupOrCreateResource<Var>(
+                       ctx, HandleFromInput(ctx, write.input_index), &variable,
+                       [this, ctx, &write, &write_shape](Var** ptr) {
+                         *ptr = new Var(write.type);
+                         PersistentTensor unused;
+                         Tensor* tmp;
+                         TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+                             write.type, write_shape, &unused, &tmp));
+                         *(*ptr)->tensor() = *tmp;
+                         return Status::OK();
+                       }));
     core::ScopedUnref s(variable);
 
     mutex_lock ml(*variable->mu());
     OP_REQUIRES(ctx, variable->tensor()->dtype() == write.type,
                 errors::Internal("Mismatched type in variable write"));
-    if (!variable->tensor()->shape().IsSameSize(write.shape)) {
+    if (!variable->tensor()->shape().IsSameSize(write_shape)) {
       PersistentTensor unused;
       Tensor* tmp;
-      OP_REQUIRES_OK(ctx, ctx->allocate_persistent(write.type, write.shape,
+      OP_REQUIRES_OK(ctx, ctx->allocate_persistent(write.type, write_shape,
                                                    &unused, &tmp));
       *variable->tensor() = *tmp;
     }
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index f1fef85f994a5f1f7514a5cb8b8b339706c7d998..77b45aa11e2e71f206bea4fbf08ed686ec6bb649 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/memory_types.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -162,10 +163,12 @@ Status DeviceTypeOfDevice(const string& device, DeviceType* device_type) {
   return Status::OK();
 }
 
-// Does `node` have a DT_RESOURCE typed argument?
-bool HasResourceArgument(const Node& node) {
+// Tests whether `node` has a DT_RESOURCE typed input or output.
+bool HasResourceInputOrOutput(const Node& node) {
   return std::find(node.input_types().begin(), node.input_types().end(),
-                   DT_RESOURCE) != node.input_types().end();
+                   DT_RESOURCE) != node.input_types().end() ||
+         std::find(node.output_types().begin(), node.output_types().end(),
+                   DT_RESOURCE) != node.output_types().end();
 }
 
 Status FindCompilationCandidates(
@@ -193,9 +196,10 @@ Status FindCompilationCandidates(
               << ": " << node->type_string();
       continue;
     }
-    if (!registration->compile_resource_ops && HasResourceArgument(*node)) {
-      VLOG(2) << "Compilation rejected node: resource argument " << node->name()
-              << ": " << node->type_string();
+    if (!registration->compile_resource_ops &&
+        HasResourceInputOrOutput(*node)) {
+      VLOG(2) << "Compilation rejected node: resource input/output "
+              << node->name() << ": " << node->type_string();
       continue;
     }
     if (node->type_string() == "While" &&
@@ -253,6 +257,11 @@ Status MarkForCompilationPass::Run(
                                              &registration)) {
       return false;
     }
+
+    // Don't compile control trigger nodes. We won't preserve their deadness
+    // semantics correctly, so it's safest not to compile them.
+    if (node->IsControlTrigger()) return false;
+
     // If this device requires a JIT, we must say yes.
     if (registration->requires_compilation) return true;
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 9f30e12e0e30fef6b4bcd0ea3c091842b008c29a..4b88da27a188ed4fa6125b3e7a84034efb1a0ec1 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -14,11 +14,13 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
-#include "tensorflow/compiler/jit/defs.h"
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/control_flow_ops_internal.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -455,5 +457,39 @@ TEST(XlaCompilationTest, CyclesWithDifferentScopesAndBridge) {
   EXPECT_EQ(clusters["B"], clusters["C"]);
 }
 
+REGISTER_OP("ResourceInput").Input("a: resource").Output("o: float");
+REGISTER_OP("ResourceOutput").Input("a: float").Output("o: resource");
+
+namespace {
+
+class DummyOp : public XlaOpKernel {
+  using XlaOpKernel::XlaOpKernel;
+  void Compile(XlaOpKernelContext* ctx) override {}
+};
+
+REGISTER_XLA_OP(Name("ResourceInput"), DummyOp);
+REGISTER_XLA_OP(Name("ResourceOutput"), DummyOp);
+
+}  // namespace
+
+TEST(XlaCompilationTest, Resources) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  GraphDef graphdef;
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* a =
+        ops::SourceOp("UncompilableNullary", builder.opts().WithName("A"));
+    Node* b = ops::UnaryOp("Relu", a, builder.opts().WithName("B"));
+    // We should not form clusters with resource ops by default.
+    Node* c = ops::UnaryOp("ResourceOutput", b, builder.opts().WithName("C"));
+    Node* d = ops::UnaryOp("ResourceInput", c, builder.opts().WithName("D"));
+    ops::UnaryOp("Relu", d, builder.opts().WithName("E"));
+    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+  }
+  MarkForCompilation(&graph);
+  auto clusters = GetClusters(*graph);
+  EXPECT_EQ(0, clusters.size());  // Nothing should be compiled.
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/ops/BUILD b/tensorflow/compiler/jit/ops/BUILD
index 8d1fa03cc0d74f3a61b3e2e1d6f2af07c0bcd23f..e5787ca4c8cff436e4404b8488970248b24a5eda 100644
--- a/tensorflow/compiler/jit/ops/BUILD
+++ b/tensorflow/compiler/jit/ops/BUILD
@@ -1,32 +1,20 @@
 licenses(["notice"])  # Apache 2.0
 
 package(
-    default_visibility = [
-        "//tensorflow/compiler/tf2xla:internal",
-    ],
+    default_visibility = ["//tensorflow/compiler/tf2xla:internal"],
 )
 
 cc_library(
     name = "xla_ops",
-    srcs = [
-        "xla_ops.cc",
-    ],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
+    srcs = ["xla_ops.cc"],
+    deps = ["//tensorflow/core:framework"],
     alwayslink = 1,
 )
 
 cc_library(
     name = "parallel_check_op",
     srcs = ["parallel_check_op.cc"],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
+    deps = ["//tensorflow/core:framework"],
     alwayslink = 1,
 )
 
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 63ca77f9a912acce2078f3da43d64f2e10049380..3c52316ccef0023472b2e888e0c31b07fc00e694 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -148,7 +148,8 @@ Status BuildArguments(int num_constant_args,
     XlaCompiler::Argument& arg = (*args)[input_num];
     arg.kind = XlaCompiler::Argument::kConstant;
     arg.type = input.dtype();
-    arg.shape = input.shape();
+    TF_RETURN_IF_ERROR(
+        TensorShapeToXLAShape(input.dtype(), input.shape(), &arg.shape));
     arg.constant_value = input;
     ++input_num;
   }
@@ -169,7 +170,8 @@ Status BuildArguments(int num_constant_args,
       arg.constant_value = input;
     }
     arg.type = input.dtype();
-    arg.shape = input.shape();
+    TF_RETURN_IF_ERROR(
+        TensorShapeToXLAShape(input.dtype(), input.shape(), &arg.shape));
     ++input_num;
   }
 
@@ -182,19 +184,21 @@ Status BuildArguments(int num_constant_args,
     XlaCompiler::Argument& arg = (*args)[input_num];
 
     arg.name = variable_args[variable_id].name;
+    arg.kind = XlaCompiler::Argument::kVariable;
     if (variable_args[variable_id].present) {
       const Tensor& value = variable_args[variable_id].value;
-      arg.kind = XlaCompiler::Argument::kVariable;
       arg.type = value.dtype();
-      arg.shape = value.shape();
+      TF_RETURN_IF_ERROR(
+          TensorShapeToXLAShape(value.dtype(), value.shape(), &arg.shape));
+      arg.initialized = true;
     } else {
       // The values of uninitialized variables are not passed as inputs, since
       // they are meaningless. However, it is legal to assign to a resource
       // variable for the first time inside the XLA computation, so we do permit
       // uninitialized variables.
-      arg.kind = XlaCompiler::Argument::kUninitializedVariable;
+      arg.initialized = false;
       arg.type = DT_INVALID;
-      arg.shape = TensorShape();
+      arg.shape = xla::Shape();
     }
     ++input_num;
   }
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 5e336c5287bd9e2067e93cd8db8a5a1b62b62bd2..615e2230f42f63f893ad645e1ab9513d6c30abf5 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -31,9 +31,11 @@ limitations under the License.
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/notification.h"
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index f329e83e14dfce68eff3feb720c1603bd36fa7d6..0ab81ebd5ffec0b3dd6aee509a6d4d2b41d156db 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -137,7 +137,7 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
       done(result.status());
       return;
     }
-    const void* src_ptr = xla::LiteralUtil::InternalData(*result.ValueOrDie());
+    const void* src_ptr = result.ValueOrDie()->InternalData();
     void* dst_ptr = DMAHelper::base(cpu_tensor);
     size_t total_bytes = cpu_tensor->TotalBytes();
     memcpy(dst_ptr, src_ptr, total_bytes);
diff --git a/tensorflow/compiler/plugin/executor/BUILD b/tensorflow/compiler/plugin/executor/BUILD
index 9bc706abdf646a32da734906cada727d949eee21..bc7c25c12056332a8b74077d9f73ea551e8bbbee 100644
--- a/tensorflow/compiler/plugin/executor/BUILD
+++ b/tensorflow/compiler/plugin/executor/BUILD
@@ -11,12 +11,14 @@ cc_library(
         "*.h",
     ]),
     deps = [
+        "//tensorflow/compiler/jit:xla_device",
         "//tensorflow/compiler/jit:xla_jit_headers_lib",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/xla:xla_headers_lib",
-        "//tensorflow/compiler/xla/service:hlo_evaluator",
+        "//tensorflow/compiler/xla/service",
         "//third_party/eigen3",
         "@local_config_cuda//cuda:cuda_headers",
-        "@protobuf//:protobuf_headers",
+        "@protobuf_archive//:protobuf_headers",
     ],
 )
 
diff --git a/tensorflow/compiler/plugin/executor/compiler.cc b/tensorflow/compiler/plugin/executor/compiler.cc
index 893ff152f0c77c354be178818eaf9e8fc75feaa4..72fe7ba4519833e17314f8fef803ad0230713780 100644
--- a/tensorflow/compiler/plugin/executor/compiler.cc
+++ b/tensorflow/compiler/plugin/executor/compiler.cc
@@ -18,7 +18,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/plugin/executor/compiler.h"
 #include "tensorflow/compiler/plugin/executor/executable.h"
-
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
@@ -30,27 +29,23 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/inliner.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 #include "tensorflow/compiler/xla/status_macros.h"
-
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/strcat.h"
 
-#include "tensorflow/core/lib/core/errors.h"
+namespace xla {
+namespace executorplugin {
 
 namespace se = ::perftools::gputools;
 namespace sep = ::perftools::gputools::executorplugin;
-namespace port = ::perftools::gputools::port;
-
-namespace xla {
-namespace executorplugin {
 
 /*
  * Run optimization passes on the module.  The graph is transformed by
  * each pass in the optimization pipeline.  The service subdirectory
  * contains useful optimization passes.
  */
-Status ExecutorCompiler::RunHloOptimization(HloModule* hlo_module,
-                                            HloDumper dump_hlo) {
-  HloPassPipeline pipeline("Executor", dump_hlo);
+Status ExecutorCompiler::RunHloOptimization(HloModule* hlo_module) {
+  HloPassPipeline pipeline("Executor");
   pipeline.AddPass<Inliner>();
   pipeline.AddPass<HloSubcomputationUnification>();
   pipeline.AddPass<HloCSE>(false);
@@ -67,13 +62,13 @@ Status ExecutorCompiler::RunHloOptimization(HloModule* hlo_module,
 }
 
 StatusOr<std::unique_ptr<Executable>> ExecutorCompiler::Compile(
-        std::unique_ptr<HloModule> hlo_module, HloDumper dump_hlo,
+        std::unique_ptr<HloModule> hlo_module,
         se::StreamExecutor* stream_exec) {
   TF_RET_CHECK(stream_exec != nullptr);
 
   VLOG(1) << "Generate graph " << hlo_module->name();
 
-  TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get(), dump_hlo));
+  TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get()));
 
   // Typically you would visit the HLO graph, building up a compiled equivalent
   // In this case we are using an Hlo evaluator at execution time, so we don't
@@ -88,7 +83,7 @@ StatusOr<std::unique_ptr<Executable>> ExecutorCompiler::Compile(
 
 StatusOr<std::vector<std::unique_ptr<Executable>>> ExecutorCompiler::Compile(
         std::vector<std::unique_ptr<HloModule>> hlo_modules,
-        HloDumper dump_hlos, std::vector<se::StreamExecutor*> stream_execs) {
+        std::vector<se::StreamExecutor*> stream_execs) {
 
   return tensorflow::errors::Unimplemented(
       "Compilation of multiple HLO modules is not supported on Executor.");
@@ -97,7 +92,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> ExecutorCompiler::Compile(
 StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
 ExecutorCompiler::CompileAheadOfTime(
     std::vector<std::unique_ptr<HloModule>> hlo_modules,
-    HloDumper dump_hlo, const AotCompilationOptions& aot_options) {
+    const AotCompilationOptions& aot_options) {
 
   return tensorflow::errors::InvalidArgument(
       "AOT compilation not supported on Executor");
@@ -112,12 +107,11 @@ ExecutorCompiler::ShapeSizeBytesFunction() const {
   return ExecutorExecutable::ShapeSizeBytes;
 }
 
-
-}  // namespace executorplugin
-}  // namespace xla
-
 REGISTER_MODULE_INITIALIZER(executor_compiler, {
   xla::Compiler::RegisterCompilerFactory(sep::kExecutorPlatformId, []() {
     return xla::MakeUnique<xla::executorplugin::ExecutorCompiler>();
   });
 });
+
+}  // namespace executorplugin
+}  // namespace xla
diff --git a/tensorflow/compiler/plugin/executor/compiler.h b/tensorflow/compiler/plugin/executor/compiler.h
index 8fe591c8abd57933aafa6c82159b49aad45a42d5..d318eefc49f0f1983cf58802d56e71b799944b11 100644
--- a/tensorflow/compiler/plugin/executor/compiler.h
+++ b/tensorflow/compiler/plugin/executor/compiler.h
@@ -35,25 +35,23 @@ class ExecutorCompiler : public Compiler {
 
   StatusOr<std::unique_ptr<Executable>> Compile(
       std::unique_ptr<HloModule> hlo_module,
-      HloDumper dump_hlo,
       perftools::gputools::StreamExecutor* stream_exec) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> hlo_module,
-      HloDumper dump_hlo,
       std::vector<perftools::gputools::StreamExecutor*> stream_exec) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(
       std::vector<std::unique_ptr<HloModule>> module,
-      HloDumper dump_hlo, const AotCompilationOptions& options) override;
+      const AotCompilationOptions& options) override;
 
   HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;
 
   perftools::gputools::Platform::Id PlatformId() const override;
 
  private:
-  Status RunHloOptimization(HloModule* hlo_module, HloDumper dump_hlo);
+  Status RunHloOptimization(HloModule* hlo_module);
 
   TF_DISALLOW_COPY_AND_ASSIGN(ExecutorCompiler);
 };
diff --git a/tensorflow/compiler/plugin/executor/device.cc b/tensorflow/compiler/plugin/executor/device.cc
index bbc39dc03f866c0b10c0e4ac46eddebda4bec87f..d902f9df6a50161dacf12a5b234c1304ead353d5 100644
--- a/tensorflow/compiler/plugin/executor/device.cc
+++ b/tensorflow/compiler/plugin/executor/device.cc
@@ -47,7 +47,12 @@ Status XlaExaDeviceFactory::CreateDevices(const SessionOptions& options,
   return Status::OK();
 }
 
-REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_EXEC, XlaExaDeviceFactory, 110);
+// Set priority to be below the default priority (50), so that Executor is not
+// selected as a high priority device over other default devices.
+// See constructor comments for Registrar in
+// tensorflow/core/common_runtime/device_factory.h for a list of priority for
+// devices.
+REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_EXEC, XlaExaDeviceFactory, 40);
 
 // Kernel registrations
 
diff --git a/tensorflow/compiler/plugin/executor/executable.cc b/tensorflow/compiler/plugin/executor/executable.cc
index 92a517ba533cb073dac9b37179825d089e29f3ab..4f1f0d99f9730443f64bc58c16453b195b388ca1 100644
--- a/tensorflow/compiler/plugin/executor/executable.cc
+++ b/tensorflow/compiler/plugin/executor/executable.cc
@@ -15,18 +15,16 @@ limitations under the License.
 
 #include "tensorflow/compiler/plugin/executor/executable.h"
 #include "tensorflow/compiler/plugin/executor/executor.h"
-
-#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
-
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 
-namespace se = ::perftools::gputools;
-namespace sep = ::perftools::gputools::executorplugin;
-
 namespace xla {
 namespace executorplugin {
 
+namespace se = ::perftools::gputools;
+namespace sep = ::perftools::gputools::executorplugin;
+
 ExecutorExecutable::ExecutorExecutable(std::unique_ptr<HloModule> hlo_module)
     : Executable(std::move(hlo_module), ShapeSizeBytes) {}
 
@@ -36,7 +34,7 @@ static se::DeviceMemoryBase AllocateSingleOutput(sep::ExecutorExecutor* executor
                                                  const Literal& literal) {
   int64 size(xla::ShapeUtil::ByteSizeOf(literal.shape()));
   void* buf = executor->Allocate(size);
-  const void* src = LiteralUtil::InternalData(literal);
+  const void* src = literal.InternalData();
   memcpy(buf, src, size);
   return se::DeviceMemoryBase(buf, size);
 }
@@ -49,13 +47,14 @@ static se::DeviceMemoryBase AllocateOutputBuffer(sep::ExecutorExecutor* executor
   } else {
     int64 size(xla::ShapeUtil::ByteSizeOf(shape, sizeof(void*)));
     void** buf = reinterpret_cast<void**>(executor->Allocate(size));
+    void** buf_rc = buf;
     for (int64 n = 0; n < xla::ShapeUtil::TupleElementCount(shape); n++) {
       se::DeviceMemoryBase out =
           AllocateSingleOutput(executor, literal.tuple_literals(n));
       *buf++ = out.opaque();
     }
 
-    return se::DeviceMemoryBase(buf, size);
+    return se::DeviceMemoryBase(buf_rc, size);
   }
 }
 
@@ -86,19 +85,18 @@ StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteOnStream(
   for (int64 p = 0; p < computation->num_parameters(); p++) {
     // Create the input literal for the parameter
     HloInstruction* param = computation->parameter_instruction(p);
-    arg_literals.emplace_back(LiteralUtil::CreateFromShape(param->shape()));
+    arg_literals.emplace_back(Literal::CreateFromShape(param->shape()));
     arg_literals_ptrs.push_back(arg_literals.back().get());
 
     // Copy in the data from the stream_executor buffers
-    void* buffer = LiteralUtil::MutableInternalData(arg_literals.back().get());
+    void* buffer = arg_literals.back()->MutableInternalData();
     memcpy(buffer, arguments[p].opaque(),
            ShapeUtil::ByteSizeOf(param->shape()));
   }
 
   // Execute the graph using the evaluator
   HloEvaluator evaluator;
-  std::unique_ptr<Literal> output;
-  TF_ASSIGN_OR_RETURN(output,
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> output,
                       evaluator.Evaluate(computation, arg_literals_ptrs));
 
   // Copy the result into the return buffer
diff --git a/tensorflow/compiler/plugin/executor/executor.cc b/tensorflow/compiler/plugin/executor/executor.cc
index e72c2711f794792fd4d7834b07eee5d983dff0a0..908b996bc95ac8d36f6c5577857b1a3a3826c3d4 100644
--- a/tensorflow/compiler/plugin/executor/executor.cc
+++ b/tensorflow/compiler/plugin/executor/executor.cc
@@ -14,14 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/plugin/executor/executor.h"
-#include "tensorflow/compiler/plugin/executor/platform_id.h"
-
-#include "tensorflow/compiler/xla/status_macros.h"
 
 #include <stdlib.h>
 #include <string.h>
 
-namespace se = ::perftools::gputools;
+#include "tensorflow/compiler/plugin/executor/platform_id.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 
 namespace perftools {
 namespace gputools {
@@ -37,10 +35,7 @@ ExecutorExecutor::ExecutorExecutor(const PluginConfig &plugin_config)
 
 ExecutorExecutor::~ExecutorExecutor() {}
 
-void *ExecutorExecutor::Allocate(uint64 size) {
-  void *buf = new char[size];
-  return buf;
-}
+void *ExecutorExecutor::Allocate(uint64 size) { return new char[size]; }
 
 void *ExecutorExecutor::AllocateSubBuffer(DeviceMemoryBase *parent,
                                          uint64 offset_bytes,
@@ -126,8 +121,7 @@ DeviceDescription *ExecutorExecutor::PopulateDeviceDescription() const {
   builder.set_device_memory_size(static_cast<uint64>(4) * 1024 * 1024 * 1024);
   builder.set_clock_rate_ghz(static_cast<float>(CLOCKS_PER_SEC) / 1e9);
 
-  auto built = builder.Build();
-  return built.release();
+  return builder.Build().release();
 }
 
 }  // namespace executorplugin
diff --git a/tensorflow/compiler/plugin/executor/transfer_manager.cc b/tensorflow/compiler/plugin/executor/transfer_manager.cc
index b59d20a7791f1ed2df2f35c6186e34e64fe4b248..51c5deeea5d5fd03d0fb99d4f33413c7bf4abe0f 100644
--- a/tensorflow/compiler/plugin/executor/transfer_manager.cc
+++ b/tensorflow/compiler/plugin/executor/transfer_manager.cc
@@ -70,13 +70,13 @@ Status ExecutorTransferManager::TransferLiteralFromDevice(
   }
 
   *literal->mutable_shape() = device_shape;
-  LiteralUtil::Reserve(ShapeUtil::ElementsIn(device_shape), literal);
+  literal->Reserve(ShapeUtil::ElementsIn(device_shape));
   TF_RETURN_IF_ERROR(TransferBufferFromDevice(
       executor, source, ShapeUtil::ByteSizeOf(device_shape),
-      LiteralUtil::MutableInternalData(literal)));
+      literal->MutableInternalData()));
   if (!ShapeUtil::Equal(literal_shape, device_shape)) {
     literal->Swap(
-        LiteralUtil::Relayout(*literal, literal_shape.layout()).get());
+        literal->Relayout(literal_shape.layout()).get());
   }
   TF_RET_CHECK(ShapeUtil::Equal(literal_shape, literal->shape()));
   return Status::OK();
@@ -134,7 +134,7 @@ Status ExecutorTransferManager::TransferLiteralToDevice(
   }
 
   return TransferBufferToDevice(executor, GetByteSizeRequirement(shape),
-                                LiteralUtil::InternalData(literal),
+                                literal.InternalData(),
                                 destination);
 }
 
@@ -147,6 +147,11 @@ Status ExecutorTransferManager::TransferLiteralToInfeed(
   return Status::OK();
 }
 
+Status ExecutorTransferManager::TransferBufferToInfeed(
+    se::StreamExecutor* executor, int64 size, const void* source) {
+  return Unimplemented("Transfer to Infeed");
+}
+
 Status ExecutorTransferManager::TransferLiteralFromOutfeed(
     perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
     Literal* literal) {
diff --git a/tensorflow/compiler/plugin/executor/transfer_manager.h b/tensorflow/compiler/plugin/executor/transfer_manager.h
index 22142cd778a0aeccb6c393bdc1593e6213de858a..7a42e5a2d7542eaad7f8f90f011c65a9c526cc11 100644
--- a/tensorflow/compiler/plugin/executor/transfer_manager.h
+++ b/tensorflow/compiler/plugin/executor/transfer_manager.h
@@ -55,6 +55,9 @@ class ExecutorTransferManager : public TransferManager {
   Status TransferLiteralToInfeed(se::StreamExecutor* executor,
                                  const Literal& literal) override;
 
+  Status TransferBufferToInfeed(se::StreamExecutor* executor,
+                                 int64 size, const void* source) override;
+
   Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
                                     const Shape& literal_shape,
                                     Literal* literal) override;
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 4bbb2767ac033dd9995cad37886d476fc87618da..c693f58f8bddb7703d10e41afb4b666d92c25823 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -40,7 +40,9 @@ py_library(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:platform",
+        "//tensorflow/python:random_seed",
         "//tensorflow/python:variables",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -174,6 +176,11 @@ tf_xla_py_test(
     name = "slice_ops_test",
     size = "small",
     srcs = ["slice_ops_test.py"],
+    # TODO(b/62962492): Test fails with assertion error.
+    tags = [
+        "manual",
+        "notap",
+    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -323,7 +330,7 @@ tf_xla_py_test(
 
 tf_xla_py_test(
     name = "reverse_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["reverse_ops_test.py"],
     deps = [
         ":xla_test",
@@ -346,6 +353,20 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "segment_reduction_ops_test",
+    size = "small",
+    srcs = ["segment_reduction_ops_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:math_ops_gen",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "spacetobatch_op_test",
     size = "medium",
@@ -360,6 +381,19 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "stack_ops_test",
+    size = "small",
+    srcs = ["stack_ops_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "tensor_array_ops_test",
     size = "small",
@@ -455,6 +489,11 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
     ],
+    # TODO(b/62961789): Test fails with SIGABRT
+    tags = [
+        "manual",
+        "notap",
+    ],
 )
 
 cc_library(
diff --git a/tensorflow/compiler/tests/adagrad_test.py b/tensorflow/compiler/tests/adagrad_test.py
index a5c5885b4284aee167ae4cb18f7e42820c6d251d..9a93b3216404d8ed21fd6c57757bec1730c119b4 100644
--- a/tensorflow/compiler/tests/adagrad_test.py
+++ b/tensorflow/compiler/tests/adagrad_test.py
@@ -49,9 +49,11 @@ class AdagradOptimizerTest(XLATestCase):
           ada_update.run()
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(),
+            float_rtol=1e-5)
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval(),
+            float_rtol=1e-5)
 
   def testTensorLearningRate(self):
     for dtype in self.float_types:
@@ -73,9 +75,11 @@ class AdagradOptimizerTest(XLATestCase):
           ada_update.run()
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(),
+            float_rtol=1e-5)
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval(),
+            float_rtol=1e-5)
 
   def testSharing(self):
     for dtype in self.float_types:
@@ -107,9 +111,11 @@ class AdagradOptimizerTest(XLATestCase):
         ada_update1.run()
         # Validate updated params (the same as with only 1 Adagrad).
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(),
+            float_rtol=1e-5)
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval(),
+            float_rtol=1e-5)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 7221a0a3c745f939b88cae0f66af2421922dcd68..0bdbf53c39f0bf35943646d9f11a11bbcfa2d6fe 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -113,6 +113,14 @@ class BinaryOpsTest(XLATestCase):
           np.array([-.6, -.4, -.2, 0, .2, .4], dtype=dtype),
           expected=np.array([0.4, 1.2, 2.4, 4, 5, 6], dtype=dtype))
 
+      self._testBinary(
+          gen_nn_ops._selu_grad,
+          np.array([1, 2, 3, 4, 5, 6], dtype=dtype),
+          np.array([-.6, -.4, -.2, .2, .4, .6], dtype=dtype),
+          expected=np.array(
+              [1.158099340847, 2.7161986816948, 4.67429802254,
+               4.202803949422, 5.2535049367774, 6.30420592413], dtype=dtype))
+
       self._testBinary(
           gen_nn_ops._relu_grad,
           np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=dtype),
@@ -555,17 +563,18 @@ class BinaryOpsTest(XLATestCase):
     self._testBinary(
         math_ops.matmul,
         np.array(
-            [[[[1000, 100], [10, 1]], [[2000, 200], [20, 2]]],
-             [[[3000, 300], [30, 3]], [[4000, 400], [40, 4]]]],
+            [[[[7, 13], [10, 1]], [[2, 0.25], [20, 2]]],
+             [[[3, 5], [30, 3]], [[0.75, 1], [40, 4]]]],
             dtype=np.float32),
         np.array(
             [[[[1, 2], [3, 4]], [[5, 6], [7, 8]]], [[[11, 22], [33, 44]],
                                                     [[55, 66], [77, 88]]]],
             dtype=np.float32),
         expected=np.array(
-            [[[[1300, 2400], [13, 24]], [[11400, 13600], [114, 136]]],
-             [[[42900, 79200], [429, 792]], [[250800, 299200], [2508, 2992]]]],
+            [[[[46, 66], [13, 24]], [[11.75, 14], [114, 136]]],
+             [[[198, 286], [429, 792]], [[118.25, 137.5], [2508, 2992]]]],
             dtype=np.float32))
+
     self._testBinary(
         math_ops.matmul,
         np.array([], dtype=np.float32).reshape((2, 2, 0)),
@@ -581,7 +590,7 @@ class BinaryOpsTest(XLATestCase):
 
     # Regression test for b/31472796.
     if hasattr(np, "matmul"):
-      x = np.arange(0, 3 * 5 * 16 * 7, dtype=np.float32).reshape((3, 5, 16, 7))
+      x = np.arange(0, 3 * 5 * 2 * 7, dtype=np.float32).reshape((3, 5, 2, 7))
       self._testBinary(
           lambda x, y: math_ops.matmul(x, y, adjoint_b=True),
           x, x,
diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py
index 6b328fb618bf8b9174dce756487494994b8aea04..7e3871312c86530b6d3cb0bbacc16c25d3469832 100644
--- a/tensorflow/compiler/tests/ftrl_test.py
+++ b/tensorflow/compiler/tests/ftrl_test.py
@@ -134,9 +134,9 @@ class FtrlOptimizerTest(XLATestCase):
 
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-2.60260963, -4.29698515]), var0.eval())
+            np.array([-2.60260963, -4.29698515]), var0.eval(), float_rtol=1e-5)
         self.assertAllCloseAccordingToType(
-            np.array([-0.28432083, -0.56694895]), var1.eval())
+            np.array([-0.28432083, -0.56694895]), var1.eval(), float_rtol=1e-5)
 
   def testFtrlwithoutRegularization2(self):
     for dtype in self.float_types:
@@ -189,8 +189,10 @@ class FtrlOptimizerTest(XLATestCase):
           ftrl_update.run()
 
         # Validate updated params
-        self.assertAllClose(np.array([-7.66718769, -10.91273689]), var0.eval())
-        self.assertAllClose(np.array([-0.93460727, -1.86147261]), var1.eval())
+        self.assertAllClose(np.array([-7.66718769, -10.91273689]), var0.eval(),
+                            rtol=1e-4)
+        self.assertAllClose(np.array([-0.93460727, -1.86147261]), var1.eval(),
+                            rtol=1e-4)
 
   def testFtrlWithL1_L2(self):
     for dtype in self.float_types:
@@ -215,10 +217,47 @@ class FtrlOptimizerTest(XLATestCase):
           ftrl_update.run()
 
         # Validate updated params
-        self.assertAllClose(np.array([-0.24059935, -0.46829352]), var0.eval())
-        self.assertAllClose(np.array([-0.02406147, -0.04830509]), var1.eval())
+        self.assertAllClose(np.array([-0.24059935, -0.46829352]), var0.eval(),
+                            rtol=1e-5)
+        self.assertAllClose(np.array([-0.02406147, -0.04830509]), var1.eval(),
+                            rtol=1e-5)
+
+  def testFtrlWithL1_L2_L2Shrinkage(self):
+    """Test the new FTRL op with support for l2 shrinkage.
+
+    The addition of this parameter which places a constant pressure on weights
+    towards the origin causes the gradient descent trajectory to differ. The
+    weights will tend to have smaller magnitudes with this parameter set.
+    """
+    for dtype in self.float_types:
+      with self.test_session(), self.test_scope():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([4.0, 3.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+        opt = ftrl.FtrlOptimizer(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=2.0,
+            l2_shrinkage_regularization_strength=0.1)
+        ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([4.0, 3.0], var1.eval())
+
+        # Run 10 steps FTRL
+        for _ in range(10):
+          ftrl_update.run()
+
+        # Validate updated params
+        self.assertAllClose(np.array([-0.21931979, -0.40642974]), var0.eval(),
+                            rtol=1e-4)
+        self.assertAllClose(np.array([-0.0282721, -0.07188385]), var1.eval(),
+                            rtol=1e-4)
 
-  # When variables are intialized with Zero, FTRL-Proximal has two properties:
+  # When variables are initialized with Zero, FTRL-Proximal has two properties:
   # 1. Without L1&L2 but with fixed learning rate, FTRL-Proximal is identical
   # with GradientDescent.
   # 2. Without L1&L2 but with adaptive learning rate, FTRL-Proximal is idential
@@ -233,8 +272,8 @@ class FtrlOptimizerTest(XLATestCase):
       with self.test_session(), self.test_scope():
         val2, val3 = self.equivAdagradTest_AdagradPart(steps, dtype)
 
-    self.assertAllClose(val0, val2)
-    self.assertAllClose(val1, val3)
+    self.assertAllClose(val0, val2, rtol=1e-4)
+    self.assertAllClose(val1, val3, rtol=1e-4)
 
   def testEquivGradientDescentwithoutRegularization(self):
     steps = 5
@@ -245,8 +284,8 @@ class FtrlOptimizerTest(XLATestCase):
         val2, val3 = self.equivGradientDescentTest_GradientDescentPart(
             steps, dtype)
 
-    self.assertAllClose(val0, val2)
-    self.assertAllClose(val1, val3)
+    self.assertAllClose(val0, val2, rtol=1e-5)
+    self.assertAllClose(val1, val3, rtol=1e-5)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/pooling_ops_test.py b/tensorflow/compiler/tests/pooling_ops_test.py
index 52290e63548910309a9b6b75b7a4642ebeed1efa..7c19a99c4eb4be3ca34b3ce949216e557b0a681d 100644
--- a/tensorflow/compiler/tests/pooling_ops_test.py
+++ b/tensorflow/compiler/tests/pooling_ops_test.py
@@ -376,7 +376,7 @@ class PoolGradTest(XLATestCase):
       self.assertAllClose(
           expected_input_gradient_vals.flatten(),
           actual.flatten(),
-          rtol=1e-5,
+          rtol=1e-4,
           atol=1e-6)
       self.assertShapeEqual(actual, inputs)
 
diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index d3821ad02e5aa9cbec6ba7fb940ee2246d38c81e..825fd9de2eb306234da36c691e0c7ca2e724dd5a 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -1434,6 +1434,23 @@ TEST_F(OpTest, EluGrad) {
   });
 }
 
+TEST_F(OpTest, Selu) {
+  Repeatedly([this]() {
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("Selu").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT));
+  });
+}
+
+TEST_F(OpTest, SeluGrad) {
+  Repeatedly([this]() {
+    auto dims = RandomDims();
+    return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("SeluGrad")
+                                             .RandomInput(DT_FLOAT, dims)
+                                             .RandomInput(DT_FLOAT, dims)
+                                             .Attr("T", DT_FLOAT));
+  });
+}
+
 TEST_F(OpTest, Equal) {
   Repeatedly([this]() {
     DataType type = Choose<DataType>({DT_INT32, DT_FLOAT});
diff --git a/tensorflow/compiler/tests/segment_reduction_ops_test.py b/tensorflow/compiler/tests/segment_reduction_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..260a04421b62310c109d8f0ea72875a50c234bb0
--- /dev/null
+++ b/tensorflow/compiler/tests/segment_reduction_ops_test.py
@@ -0,0 +1,139 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test cases for segment reduction ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import googletest
+
+
+class SegmentReductionOpsTest(XLATestCase):
+  """Test cases for segment reduction ops."""
+
+  def UnsortedSegmentSum(self, data, indices, num_segments):
+    with self.test_session() as sess, self.test_scope():
+      d = array_ops.placeholder(data.dtype, shape=data.shape)
+      if isinstance(indices, int):
+        i = array_ops.placeholder(np.int32, shape=[])
+      else:
+        i = array_ops.placeholder(indices.dtype, shape=indices.shape)
+      return sess.run(
+          math_ops.unsorted_segment_sum(d, i, num_segments),
+          {d: data,
+           i: indices})
+
+  def testUnsortedSegmentSum0DIndices1DData(self):
+    for dtype in self.numeric_types:
+      self.assertAllClose(
+          np.array(
+              [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5],
+               [0, 0, 0, 0, 0, 0]],
+              dtype=dtype),
+          self.UnsortedSegmentSum(
+              np.array([0, 1, 2, 3, 4, 5], dtype=dtype), 2, 4))
+
+  def testUnsortedSegmentSum1DIndices1DData(self):
+    for dtype in self.numeric_types:
+      self.assertAllClose(
+          np.array([1, 3, 2, 9], dtype=dtype),
+          self.UnsortedSegmentSum(
+              np.array([0, 1, 2, 3, 4, 5], dtype=dtype),
+              np.array([3, 0, 2, 1, 3, 3], dtype=np.int32), 4))
+
+  def testUnsortedSegmentSum1DIndices2DDataDisjoint(self):
+    for dtype in self.numeric_types:
+      data = np.array(
+          [[0, 1, 2, 3], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43],
+           [50, 51, 52, 53]],
+          dtype=dtype)
+      indices = np.array([8, 1, 0, 3, 7], dtype=np.int32)
+      num_segments = 10
+      y = self.UnsortedSegmentSum(data, indices, num_segments)
+      self.assertAllClose(
+          np.array(
+              [[30, 31, 32, 33], [20, 21, 22, 23], [0, 0, 0, 0],
+               [40, 41, 42, 43], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0],
+               [50, 51, 52, 53], [0, 1, 2, 3], [0, 0, 0, 0]],
+              dtype=dtype), y)
+
+  def testUnsortedSegmentSum1DIndices2DDataNonDisjoint(self):
+    for dtype in self.numeric_types:
+      data = np.array(
+          [[0, 1, 2, 3], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43],
+           [50, 51, 52, 53]],
+          dtype=dtype)
+      indices = np.array([0, 1, 2, 0, 1], dtype=np.int32)
+      num_segments = 4
+      y = self.UnsortedSegmentSum(data, indices, num_segments)
+      self.assertAllClose(
+          np.array(
+              [[40, 42, 44, 46], [70, 72, 74, 76], [30, 31, 32, 33],
+               [0, 0, 0, 0]],
+              dtype=dtype), y)
+
+  def testUnsortedSegmentSum2DIndices3DData(self):
+    for dtype in self.numeric_types:
+      data = np.array(
+          [[[0, 1, 2], [10, 11, 12]], [[100, 101, 102], [110, 111, 112]],
+           [[200, 201, 202], [210, 211, 212]], [[300, 301, 302],
+                                                [310, 311, 312]]],
+          dtype=dtype)
+      indices = np.array([[3, 5], [3, 1], [5, 0], [6, 2]], dtype=np.int32)
+      num_segments = 8
+      y = self.UnsortedSegmentSum(data, indices, num_segments)
+      self.assertAllClose(
+          np.array(
+              [[210, 211, 212], [110, 111, 112], [310, 311, 312],
+               [100, 102, 104], [0, 0, 0.], [210, 212, 214], [300, 301,
+                                                              302], [0, 0, 0]],
+              dtype=dtype), y)
+
+  def testUnsortedSegmentSum1DIndices3DData(self):
+    for dtype in self.numeric_types:
+      data = np.array(
+          [[[0, 1, 2], [10, 11, 12]], [[100, 101, 102], [110, 111, 112]],
+           [[200, 201, 202], [210, 211, 212]], [[300, 301, 302],
+                                                [310, 311, 312]]],
+          dtype=dtype)
+      indices = np.array([3, 0, 2, 5], dtype=np.int32)
+      num_segments = 6
+      y = self.UnsortedSegmentSum(data, indices, num_segments)
+      self.assertAllClose(
+          np.array(
+              [[[100, 101, 102.], [110, 111, 112]], [[0, 0, 0], [0, 0, 0]],
+               [[200, 201, 202], [210, 211, 212]], [[0, 1, 2.], [10, 11, 12]],
+               [[0, 0, 0], [0, 0, 0]], [[300, 301, 302], [310, 311, 312]]],
+              dtype=dtype), y)
+
+  def testUnsortedSegmentSumShapeError(self):
+    for dtype in self.numeric_types:
+      data = np.ones((4, 8, 7), dtype=dtype)
+      indices = np.ones((3, 2), dtype=np.int32)
+      num_segments = 4
+      self.assertRaises(ValueError,
+                        functools.partial(self.UnsortedSegmentSum, data,
+                                          indices, num_segments))
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/compiler/tests/spacetobatch_op_test.py b/tensorflow/compiler/tests/spacetobatch_op_test.py
index 9c3b86c84b2b92089da0dfc0070a4a7b8a03c81a..c013f4b50a4cf95be8028248c52b10b1c3be2bd3 100644
--- a/tensorflow/compiler/tests/spacetobatch_op_test.py
+++ b/tensorflow/compiler/tests/spacetobatch_op_test.py
@@ -228,34 +228,40 @@ class SpaceToBatchNDTest(XLATestCase):
         outputs=[[[0, 0], [2, 21]], [[0, 0], [5, 51]], [[1, 11], [3, 31]],
                  [[4, 41], [6, 61]]])
 
-  def testDirect(self):
+  def testDirect0(self):
     # Test with zero-size remaining dimension.
     self._testDirect(
         input_shape=[3, 1, 2, 0], block_shape=[3], paddings=[[0, 2]])
 
+  def testDirect1(self):
     # Test with zero-size blocked dimension.
     self._testDirect(
         input_shape=[3, 0, 2, 5], block_shape=[3], paddings=[[0, 0]])
 
+  def testDirect2(self):
     # Test with padding up from zero size.
     self._testDirect(
         input_shape=[3, 0, 2, 5], block_shape=[3], paddings=[[1, 2]])
 
+  def testDirect3(self):
     self._testDirect(
         input_shape=[3, 3, 4, 5, 2],
         block_shape=[3, 4, 2],
         paddings=[[1, 2], [0, 0], [3, 0]])
 
+  def testDirect4(self):
     self._testDirect(
         input_shape=[3, 3, 4, 5, 2],
         block_shape=[3, 4, 2, 2],
         paddings=[[1, 2], [0, 0], [3, 0], [0, 0]])
 
+  def testDirect5(self):
     self._testDirect(
         input_shape=[3, 2, 2, 3, 4, 5, 2, 5],
         block_shape=[1, 1, 3, 4, 2, 2],
         paddings=[[0, 0], [0, 0], [1, 2], [0, 0], [3, 0], [0, 0]])
 
+  def testDirect6(self):
     self._testDirect(
         input_shape=[3, 2, 2, 3, 4, 5, 2, 5],
         block_shape=[1, 1, 3, 4, 2, 2, 1],
diff --git a/tensorflow/compiler/tests/stack_ops_test.py b/tensorflow/compiler/tests/stack_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b9c2279737ccee531d488d27ccdb0cafa1dc8fc
--- /dev/null
+++ b/tensorflow/compiler/tests/stack_ops_test.py
@@ -0,0 +1,104 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.stack_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_data_flow_ops
+from tensorflow.python.platform import test
+
+
+class StackOpTest(XLATestCase):
+
+  def testStackPushPop(self):
+    with self.test_session(), self.test_scope():
+      size = array_ops.placeholder(dtypes.int32)
+      v = array_ops.placeholder(dtypes.float32)
+      h = gen_data_flow_ops._stack_v2(size, dtypes.float32, stack_name="foo")
+      c = gen_data_flow_ops._stack_push_v2(h, v)
+      with ops.control_dependencies([c]):
+        c1 = gen_data_flow_ops._stack_pop_v2(h, dtypes.float32)
+      self.assertAllClose([[4.0, 5.0]], c1.eval({size: 5, v: [[4.0, 5.0]]}))
+
+  def testStackPushPopSwap(self):
+    with self.test_session(), self.test_scope():
+      a = np.arange(2000)
+      x = array_ops.placeholder(dtypes.float32)
+      h = gen_data_flow_ops._stack_v2(5, dtypes.float32, stack_name="foo")
+      c = gen_data_flow_ops._stack_push_v2(h, x, swap_memory=True)
+      with ops.control_dependencies([c]):
+        c1 = gen_data_flow_ops._stack_pop_v2(h, dtypes.float32)
+      self.assertAllClose(a, c1.eval({x: a}))
+
+  def testMultiStack(self):
+    with self.test_session(), self.test_scope():
+      v = array_ops.placeholder(dtypes.float32)
+      h1 = gen_data_flow_ops._stack_v2(5, dtypes.float32, stack_name="foo")
+      c1 = gen_data_flow_ops._stack_push_v2(h1, v)
+      with ops.control_dependencies([c1]):
+        c1 = gen_data_flow_ops._stack_pop_v2(h1, dtypes.float32)
+      h2 = gen_data_flow_ops._stack_v2(5, dtypes.float32, stack_name="bar")
+      c2 = gen_data_flow_ops._stack_push_v2(h2, 5.0)
+      with ops.control_dependencies([c2]):
+        c2 = gen_data_flow_ops._stack_pop_v2(h2, dtypes.float32)
+      r = c1 + c2
+      self.assertAllClose(9.0, r.eval({v: 4.0}))
+
+  def testSameNameStacks(self):
+    """Different stacks with the same name do not interfere."""
+    with self.test_session() as sess, self.test_scope():
+      v1 = array_ops.placeholder(dtypes.float32)
+      v2 = array_ops.placeholder(dtypes.float32)
+      h1 = gen_data_flow_ops._stack_v2(5, dtypes.float32, stack_name="foo")
+      h2 = gen_data_flow_ops._stack_v2(5, dtypes.float32, stack_name="foo")
+
+      c1 = gen_data_flow_ops._stack_push_v2(h1, v1)
+      with ops.control_dependencies([c1]):
+        c2 = gen_data_flow_ops._stack_push_v2(h2, v2)
+      with ops.control_dependencies([c2]):
+        pop1 = gen_data_flow_ops._stack_pop_v2(h1, dtypes.float32)
+        pop2 = gen_data_flow_ops._stack_pop_v2(h2, dtypes.float32)
+
+      out1, out2 = sess.run([pop1, pop2], {v1: 4.0, v2: 5.0})
+      self.assertAllClose(out1, 4.0)
+      self.assertAllClose(out2, 5.0)
+
+  def testCloseStack(self):
+    with self.test_session() as sess, self.test_scope():
+      size = array_ops.placeholder(dtypes.int32)
+      h = gen_data_flow_ops._stack_v2(size, dtypes.float32, stack_name="foo")
+      c1 = gen_data_flow_ops._stack_close_v2(h)
+      sess.run(c1, {size: 5})
+
+  def testPushCloseStack(self):
+    with self.test_session() as sess, self.test_scope():
+      v = array_ops.placeholder(dtypes.float32)
+      h = gen_data_flow_ops._stack_v2(5, dtypes.float32, stack_name="foo")
+      c = gen_data_flow_ops._stack_push_v2(h, v)
+      with ops.control_dependencies([c]):
+        c1 = gen_data_flow_ops._stack_close_v2(h)
+      sess.run(c1, {v: [[4.0, 5.0]]})
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py
index 27a29773053e08c755afce23c3257d96ce27a929..ac039e01623b954e291760fb9b50ef8eae3da7c1 100644
--- a/tensorflow/compiler/tests/tensor_array_ops_test.py
+++ b/tensorflow/compiler/tests/tensor_array_ops_test.py
@@ -57,11 +57,13 @@ class TensorArrayTest(xla_test.XLATestCase):
       r0 = w2.read(0)
       r1 = w2.read(1)
       r2 = w2.read(2)
+      flow = w2.flow
 
-      d0, d1, d2 = session.run([r0, r1, r2])
+      d0, d1, d2, flow_val = session.run([r0, r1, r2, flow])
       self.assertAllEqual([[4.0, 5.0]], d0)
       self.assertAllEqual([[1.0, 3.0]], d1)
       self.assertAllEqual([[7.0, -8.5]], d2)
+      self.assertAllEqual([], flow_val.shape)
 
   def _testTensorArrayWritePack(self, tf_dtype):
     with self.test_session(), self.test_scope():
@@ -139,7 +141,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       ta = tensor_array_ops.TensorArray(
           dtype=tf_dtype, tensor_array_name="foo", size=3)
 
-      # Unpack a matrix into vectors
+      # Unpack a matrix into vectors.
       w1 = ta.unstack(convert([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1]]))
       r0 = w1.read(0)
       r1 = w1.read(1)
@@ -180,7 +182,7 @@ class TensorArrayTest(xla_test.XLATestCase):
 
       convert = _make_converter(tf_dtype)
 
-      # Split an empty vector
+      # Split an empty vector.
       lengths = constant_op.constant([0, 0, 0])
       w0 = ta.split(convert([]), lengths=lengths)
       r0 = w0.read(0)
@@ -192,7 +194,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual(convert([]), d1)
       self.assertAllEqual(convert([]), d2)
 
-      # Split a vector
+      # Split a vector.
       ta = tensor_array_ops.TensorArray(
           dtype=tf_dtype, tensor_array_name="foo", size=3)
       lengths = constant_op.constant([1, 1, 1])
@@ -206,7 +208,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual(convert([2.0]), d1)
       self.assertAllEqual(convert([3.0]), d2)
 
-      # Split a matrix
+      # Split a matrix.
       ta = tensor_array_ops.TensorArray(
           dtype=tf_dtype, tensor_array_name="foo", size=3)
       lengths = constant_op.constant([1, 1, 1])
@@ -319,27 +321,31 @@ class TensorArrayTest(xla_test.XLATestCase):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
 
-      # Test writing the wrong datatype
+      # Test writing the wrong datatype.
       with self.assertRaisesOpError(
           "TensorArray dtype is float but op has dtype int32"):
         ta.write(-1, np.int32(7)).flow.eval()
 
   def testTensorArrayReadWrongIndexOrDataTypeFails(self):
-    with self.test_session(), self.test_scope():
-      ta = tensor_array_ops.TensorArray(
-          dtype=dtypes.float32, tensor_array_name="foo", size=3)
-
-      w0 = ta.write(0, [[4.0, 5.0]])
-
-      # Test reading wrong datatype
-      r0_bad = gen_data_flow_ops._tensor_array_read_v3(
-          handle=w0.handle, index=0, dtype=dtypes.float64, flow_in=w0.flow)
-      with self.assertRaisesOpError(
-          "TensorArray dtype is float but Op requested dtype double."):
-        r0_bad.eval()
-
-      # Test reading from a different index than the one we wrote to
-      w0.read(1)
+    # Find two different floating point types, create an array of
+    # the first type, but try to read the other type.
+    if len(self.float_types) > 1:
+      dtype1 = self.float_types[0]
+      dtype2 = self.float_types[1]
+      with self.test_session(), self.test_scope():
+        ta = tensor_array_ops.TensorArray(
+            dtype=dtype1, tensor_array_name="foo", size=3)
+
+        w0 = ta.write(0, [[4.0, 5.0]])
+
+        # Test reading wrong datatype.
+        r0_bad = gen_data_flow_ops._tensor_array_read_v3(
+            handle=w0.handle, index=0, dtype=dtype2, flow_in=w0.flow)
+        with self.assertRaisesOpError("TensorArray dtype is "):
+          r0_bad.eval()
+
+        # Test reading from a different index than the one we wrote to
+        w0.read(1)
 
   def testTensorArraySplitIncompatibleShapesFails(self):
     with self.test_session(), self.test_scope():
@@ -487,7 +493,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       r0 = w1.read(0)
       s0 = w1.concat()
 
-      # Test gradient accumulation between read(0), pack(), and concat()
+      # Test gradient accumulation between read(0), pack(), and concat().
       with ops.control_dependencies([p0, r0, s0]):
         grad_r = gradients_impl.gradients(
             ys=[p0, r0, s0],
@@ -536,7 +542,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       r0_1 = w.read(0)
       r1 = w.read(1)
 
-      # Test combined gradients + aggregation of read(0)
+      # Test combined gradients + aggregation of read(0).
       grad = gradients_impl.gradients(
           ys=[r0, r0_1, r1],
           xs=[value],
@@ -573,13 +579,12 @@ class TensorArrayTest(xla_test.XLATestCase):
                            [2000.0, -2000.0]],
                           grad_vals[0])
 
-  # TODO(phawkins): implement TensorArrayClose
-  # def testCloseTensorArray(self):
-  #   with self.test_session() as session, self.test_scope():
-  #     ta = tensor_array_ops.TensorArray(
-  #         dtype=dtypes.float32, tensor_array_name="foo", size=3)
-  #     c1 = ta.close()
-  #     session.run(c1)
+  def testCloseTensorArray(self):
+    with self.test_session() as session, self.test_scope():
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32, tensor_array_name="foo", size=3)
+      c1 = ta.close()
+      session.run(c1)
 
   def testSizeTensorArray(self):
     with self.test_session(), self.test_scope():
@@ -588,17 +593,16 @@ class TensorArrayTest(xla_test.XLATestCase):
       s = ta.size()
       self.assertAllEqual(3, s.eval())
 
-  # TODO(phawkins): implement TensorArrayClose
-  # def testWriteCloseTensorArray(self):
-  #   with self.test_session(), self.test_scope():
-  #     ta = tensor_array_ops.TensorArray(
-  #         dtype=dtypes.float32,
-  #         tensor_array_name="foo",
-  #         size=3,
-  #         infer_shape=False)
-  #     w0 = ta.write(0, [[4.0, 5.0]])
-  #     w1 = w0.write(1, [3.0])
-  #     w1.close().run()  # Expected to run without problems
+  def testWriteCloseTensorArray(self):
+    with self.test_session(), self.test_scope():
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32,
+          tensor_array_name="foo",
+          size=3,
+          infer_shape=False)
+      w0 = ta.write(0, [[4.0, 5.0]])
+      w1 = w0.write(1, [3.0])
+      w1.close().run()  # Expected to run without problems
 
   # TODO(phawkins): implement while loops.
   # def _testWhileLoopWritePackGradients(self, dynamic_size, dtype):
@@ -746,7 +750,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       grad_b_t, = session.run([grad_b])
       self.assertAllEqual(grad_b_t, g0)
 
-      # Test gradients calculated jointly
+      # Test gradients calculated jointly.
       joint_grad_a_t, joint_grad_b_t = session.run([grad_a, grad_b])
       self.assertAllEqual(joint_grad_a_t, g0)
       self.assertAllEqual(joint_grad_b_t, g0)
@@ -879,7 +883,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       x = constant_op.constant([2.0, 3.0])
       w = ta.unstack(x)
       r0 = w.read(0)
-      # calculate (dr0/dx0, dr0/dx1).  since r0 = x0, gradients are (1, 0).
+      # Calculate (dr0/dx0, dr0/dx1).  since r0 = x0, gradients are (1, 0).
       grad_r0 = gradients_impl.gradients(ys=[r0], xs=[x], grad_ys=[1.0])
       grad_r0_vals = session.run(grad_r0)[0]
       self.assertAllEqual(grad_r0_vals, [1.0, 0.0])
@@ -929,7 +933,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       r0 = w.read(1)
       r1 = w.read(8)
 
-      # Test combined gradients + aggregation of read(0)
+      # Test combined gradients + aggregation of read(0).
       grad = gradients_impl.gradients(
           ys=[r0, r1], xs=[value], grad_ys=[[2.0, 3.0], [4.0, 5.0]])
       read_vals, grad_vals = session.run([[r0, r1], grad])
@@ -953,7 +957,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       w = ta.unstack(values)
       g = w.gather(indices)
 
-      # Test combined gradients + aggregation of read(0)
+      # Test combined gradients + aggregation of read(0).
       grad = gradients_impl.gradients(
           ys=[g], xs=[values], grad_ys=[[[2.0, 3.0], [4.0, 5.0]]])
       g_vals, grad_vals = session.run([[g], grad])
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 51d8786ce3d7148e6863be7e1557a8bb23153d63..81ff18f3023c17f722632962dfa1cac60a7dfdc1 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -152,6 +152,16 @@ class UnaryOpsTest(XLATestCase):
           np.array([[1, 2]], dtype=dtype),
           expected=np.array([[0, 0.69314718]], dtype=dtype))
 
+      self._assertOpOutputMatchesExpected(
+          math_ops.sin,
+          np.array([[1, 2]], dtype=dtype),
+          expected=np.array([[0.841478, 0.909302]], dtype=dtype))
+
+      self._assertOpOutputMatchesExpected(
+          math_ops.cos,
+          np.array([[1, 2]], dtype=dtype),
+          expected=np.array([[0.540297, -0.41614]], dtype=dtype))
+
       # TODO(b/34703906): improve log1p implementation and make tolerance
       # tighter.
       self._assertOpOutputMatchesExpected(
@@ -219,6 +229,11 @@ class UnaryOpsTest(XLATestCase):
           np.array([[-1, 0, 1]], dtype=dtype),
           expected=np.array([[-0.63212056, 0, 1]], dtype=dtype))
 
+      self._assertOpOutputMatchesExpected(
+          nn_ops.selu,
+          np.array([[-1, 0, 1]], dtype=dtype),
+          expected=np.array([[-1.11133074, 0., 1.05070099]], dtype=dtype))
+
       self._assertOpOutputMatchesExpected(
           nn_ops.relu,
           np.array([[-1, 1]], dtype=dtype),
diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py
index 70dacd9de4b95dfb77986dfaf177c16b758406f1..a6b59fc731e7556cbfa6e0c2c4f889b58568e622 100644
--- a/tensorflow/compiler/tests/variable_ops_test.py
+++ b/tensorflow/compiler/tests/variable_ops_test.py
@@ -54,6 +54,54 @@ class VariableOpsTest(XLATestCase):
         self.assertAllClose(np.array([[2, 3], [4, 5]], dtype=dtype),
                             sess.run(y, {p: 1}))
 
+  def testSparseRead0DIndices(self):
+    for dtype in self.numeric_types:
+      init = np.array([[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], dtype=dtype)
+      with self.test_session() as sess, self.test_scope():
+        v = resource_variable_ops.ResourceVariable(init)
+        sess.run(variables.variables_initializer([v]))
+        x = v.sparse_read(2)
+        self.assertAllClose(np.array([8, 9, 10, 11], dtype=dtype), sess.run(x))
+
+  def testSparseRead1DIndices(self):
+    for dtype in self.numeric_types:
+      init = np.array([[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], dtype=dtype)
+      with self.test_session() as sess, self.test_scope():
+        v = resource_variable_ops.ResourceVariable(init)
+        sess.run(variables.variables_initializer([v]))
+        x = v.sparse_read([2, 1])
+        self.assertAllClose(
+            np.array([[8, 9, 10, 11], [4, 5, 6, 7]], dtype=dtype), sess.run(x))
+
+  def testSparseRead2DIndices(self):
+    for dtype in self.numeric_types:
+      init = np.array([[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], dtype=dtype)
+      with self.test_session() as sess, self.test_scope():
+        v = resource_variable_ops.ResourceVariable(init)
+        sess.run(variables.variables_initializer([v]))
+        x = v.sparse_read([[2, 1], [0, 2]])
+        self.assertAllClose(
+            np.array(
+                [[[8, 9, 10, 11], [4, 5, 6, 7]], [[0, 1, 2, 3], [8, 9, 10,
+                                                                 11]]],
+                dtype=dtype), sess.run(x))
+
+  def testSparseRead2DIndices3DTensor(self):
+    for dtype in self.numeric_types:
+      init = np.array(
+          [[[0, 1, 2], [3, 4, 5]], [[10, 11, 12], [13, 14, 15]],
+           [[20, 21, 22], [23, 24, 25]], [[30, 31, 32], [33, 34, 35]]],
+          dtype=dtype)
+      with self.test_session() as sess, self.test_scope():
+        v = resource_variable_ops.ResourceVariable(init)
+        sess.run(variables.variables_initializer([v]))
+        x = v.sparse_read([[2, 1], [3, 0]])
+        self.assertAllClose(
+            np.array(
+                [[[[20, 21, 22], [23, 24, 25]], [[10, 11, 12], [13, 14, 15]]],
+                 [[[30, 31, 32], [33, 34, 35]], [[0, 1, 2], [3, 4, 5]]]],
+                dtype=dtype), sess.run(x))
+
   def testReadWrite(self):
     """Tests initialization, reading, and writing a resource variable."""
     with self.test_session() as session:
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 93c484ca7a0d04654371724aac905eb055c82b05..60e68db2d689f502481c45a748f6e6abac2b69e8 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -42,6 +42,7 @@ cc_library(
     deps = [
         ":common",
         ":dump_graph",
+        ":functionalize_control_flow",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
@@ -101,6 +102,7 @@ cc_test(
         "//tensorflow/cc:ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -152,7 +154,6 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
     ],
 )
 
@@ -165,13 +166,10 @@ cc_test(
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:function_ops",
         "//tensorflow/cc:ops",
-        "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:framework",
         "//tensorflow/core:ops",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
     ],
 )
 
@@ -203,6 +201,59 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "functionalize_control_flow",
+    srcs = ["functionalize_control_flow.cc"],
+    hdrs = ["functionalize_control_flow.h"],
+    deps = [
+        "//tensorflow/compiler/jit:graph_to_functiondef",
+        "//tensorflow/compiler/tf2xla:dump_graph",
+        "//tensorflow/compiler/tf2xla/ops:functional_ops",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+    ],
+)
+
+cc_test(
+    name = "functionalize_control_flow_test",
+    srcs = ["functionalize_control_flow_test.cc"],
+    deps = [
+        ":functionalize_control_flow",
+        ":test_util",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:resource_variable_ops",
+        "//tensorflow/compiler/tf2xla/cc:functional_ops",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:resource_variable_ops_op_lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "test_util",
+    testonly = 1,
+    srcs = ["test_util.cc"],
+    hdrs = ["test_util.h"],
+    deps = [
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/tf2xla/cc/BUILD b/tensorflow/compiler/tf2xla/cc/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..599265ba449c88baef1671b1c81d96d1715ce5f2
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/cc/BUILD
@@ -0,0 +1,44 @@
+package(
+    default_visibility = ["//tensorflow/compiler/tf2xla:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_cc")
+
+tf_gen_op_wrapper_cc(
+    name = "functional_ops_gen",
+    include_internal_ops = 1,
+    out_ops_file = "ops/functional_ops",
+    deps = ["//tensorflow/compiler/tf2xla/ops:functional_ops"],
+)
+
+cc_library(
+    name = "functional_ops",
+    srcs = ["ops/functional_ops.cc"],
+    hdrs = ["ops/functional_ops.h"],
+    deps = [
+        "//tensorflow/cc:const_op",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/compiler/tf2xla/ops:functional_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+# -----------------------------------------------------------------------------
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/compiler/tf2xla/const_analysis.cc b/tensorflow/compiler/tf2xla/const_analysis.cc
index 36a6c90af4f4e8e0618ea5a5432365d4d90e51e4..d98cf829bb6819ea2efc3217a9539a88b570bc4b 100644
--- a/tensorflow/compiler/tf2xla/const_analysis.cc
+++ b/tensorflow/compiler/tf2xla/const_analysis.cc
@@ -81,6 +81,7 @@ Status BackwardsConstAnalysis(const Graph& g,
       {"Split", "split_dim"},
       {"SplitV", "split_dim"},
       {"SplitV", "size_splits"},
+      {"StackV2", "max_size"},
       {"StridedSlice", "begin"},
       {"StridedSlice", "end"},
       {"StridedSlice", "strides"},
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1c7a2046aa549beb2de58d21f517363d4fe8aea7
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -0,0 +1,583 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/functionalize_control_flow.h"
+
+#include <algorithm>
+#include <deque>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/compiler/jit/graph_to_functiondef.h"
+#include "tensorflow/compiler/tf2xla/dump_graph.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/graph/control_flow.h"
+
+namespace tensorflow {
+
+namespace {
+
+const char* const kArgOp = "_Arg";
+const char* const kRetValOp = "_Retval";
+
+// Information about a loop argument.
+struct Arg {
+  // Every loop argument has an Enter node.
+  Node* enter;
+
+  // Is the loop argument a loop-invariant value? Taken from the `is_constant`
+  // attribute on the Enter node.
+  bool is_loop_invariant;
+
+  // If 'is_loop_invariant' is true, the following are all nullptr. Non-constant
+  // arguments must have all of the following nodes:
+  Node* merge = nullptr;
+  Node* switch_node = nullptr;
+  Node* next_iteration = nullptr;
+  Node* exit = nullptr;
+};
+
+// Information about a loop frame.
+struct Frame {
+  string name;
+
+  // Pointer to the parent frame. The root frame has a pointer to itself.
+  Frame* parent = nullptr;
+  int num_children = 0;
+
+  // Arguments to this loop.
+  std::vector<Arg> args;
+
+  // The loop condition of the loop. There should be exactly one loop condition
+  // in every loop.
+  Node* loop_cond = nullptr;
+
+  // Set of nodes that belong to the loop frame.
+  std::unordered_set<Node*> nodes;
+};
+
+// Copies a subgraph from `graph` to `output` by performing a reverse DFS
+// starting at nodes in vector `stack`.
+// `node_map` is a vector indexed by source node ID to dest nodes.
+// Does not traverse into nodes in `node_map`, so by adding nodes to `node_map`
+// before the traversal clients can cut the graph. Returns an error if the
+// traversal leaves 'frame'; the client must add enough nodes to `node_map` to
+// cut the graph and prevent the traversal from escaping.
+//
+// `squash_src_outputs` contains a bool for each source node ID. If true, then
+// the source output on that node will be replaced by zero when copied. This is
+// used when replacing a Switch node with an _Arg node. The output we are
+// taking from the Switch node was not necessarily the first output, but _Arg
+// nodes only have one output. By adding the Switch node to `squash_src_outputs`
+// we rewrite the src_output of the corresponding edge to be 0.
+Status CopySubgraph(const Graph& graph, const Frame& frame,
+                    std::vector<Node*> stack,
+                    const std::vector<bool>& squash_src_outputs,
+                    std::vector<Node*>* node_map, Graph* output) {
+  std::vector<bool> visited(graph.num_node_ids(), false);
+  while (!stack.empty()) {
+    Node* n = stack.back();
+    stack.pop_back();
+
+    VLOG(3) << "Copying node " << n->name();
+
+    if (visited[n->id()]) continue;
+    visited[n->id()] = true;
+
+    for (const Edge* e : n->in_edges()) {
+      Node* src = e->src();
+      if (frame.nodes.find(src) == frame.nodes.end()) {
+        // We traversed out of the loop frame, without encountering a cut node.
+        return errors::Internal("Graph traversal of loop frame ", frame.name,
+                                " escaped frame at ", src->name(),
+                                " without encountering an argument node.");
+      }
+      if ((*node_map)[src->id()] == nullptr) {
+        (*node_map)[src->id()] = output->CopyNode(src);
+        stack.push_back(src);
+      }
+      Node* src_copy = (*node_map)[e->src()->id()];
+      int src_output = squash_src_outputs[e->src()->id()] ? 0 : e->src_output();
+      Node* dst_copy = (*node_map)[e->dst()->id()];
+      output->AddEdge(src_copy, src_output, dst_copy, e->dst_input());
+    }
+  }
+  return Status::OK();
+}
+
+Status BuildArgNode(Graph* graph, DataType type, int index, Node** arg_node) {
+  NodeDef arg_def;
+  NodeDefBuilder builder(strings::StrCat("_Arg", index), kArgOp);
+  builder.Attr("T", type);
+  builder.Attr("index", index);
+  TF_RETURN_IF_ERROR(builder.Finalize(&arg_def));
+  Status status;
+  *arg_node = graph->AddNode(arg_def, &status);
+  return status;
+}
+
+Status BuildRetvalNode(Graph* graph, DataType type, int index,
+                       Node** retval_node) {
+  NodeDef ret_def;
+  ret_def.set_op(kRetValOp);
+  ret_def.set_name(strings::StrCat("_Retval", index));
+  AddNodeAttr("T", type, &ret_def);
+  AddNodeAttr("index", index, &ret_def);
+  Status status;
+  *retval_node = graph->AddNode(ret_def, &status);
+  return status;
+}
+
+// Builds a graph for the loop condition.
+Status BuildLoopCondition(const Graph& graph, Frame* frame,
+                          std::unique_ptr<Graph>* cond_output) {
+  VLOG(2) << "Building loop condition for " << frame->name;
+  *cond_output = xla::MakeUnique<Graph>(graph.op_registry());
+  Graph* output = cond_output->get();
+
+  // Map from nodes in the original graph to the condition graph.
+  std::vector<Node*> node_map(graph.num_node_ids(), nullptr);
+  std::vector<bool> squash_src_outputs(graph.num_node_ids(), false);
+
+  // Build one _Arg node for each Enter node.
+  for (int i = 0; i < frame->args.size(); ++i) {
+    const Arg& arg = frame->args[i];
+
+    Node* arg_node;
+    TF_RETURN_IF_ERROR(
+        BuildArgNode(output, arg.enter->input_type(0), i, &arg_node));
+    if (arg.is_loop_invariant) {
+      node_map[arg.enter->id()] = arg_node;
+    } else {
+      node_map[arg.merge->id()] = arg_node;
+    }
+  }
+
+  // Build a Retval node for the loop condition. The LoopCond nodes are always
+  // boolean because of the type constraints on the LoopCond op.
+  TF_RETURN_IF_ERROR(
+      BuildRetvalNode(output, DT_BOOL, 0, &node_map[frame->loop_cond->id()]));
+
+  // Performs a reverse DFS, copying nodes and edges to the output graph.
+  // The _Arg and _Retval nodes were added unconditionally above, so we are
+  // guaranteed to get the correct function signature.
+  TF_RETURN_IF_ERROR(CopySubgraph(graph, *frame, {frame->loop_cond},
+                                  squash_src_outputs, &node_map, output));
+
+  return Status::OK();
+}
+
+// Builds a graph for the loop body.
+Status BuildLoopBody(const Graph& graph, Frame* frame,
+                     DataTypeVector* arg_types,
+                     std::unique_ptr<Graph>* body_output) {
+  VLOG(2) << "Building loop body for " << frame->name;
+  *body_output = xla::MakeUnique<Graph>(graph.op_registry());
+  Graph* output = body_output->get();
+
+  // Map from nodes in the original graph to the condition graph.
+  std::vector<Node*> node_map(graph.num_node_ids(), nullptr);
+  std::vector<bool> squash_src_outputs(graph.num_node_ids(), false);
+
+  // Build one _Arg node for each Enter node.
+  std::vector<Node*> next_iterations;
+  next_iterations.reserve(frame->args.size());
+  arg_types->reserve(frame->args.size());
+  for (int i = 0; i < frame->args.size(); ++i) {
+    const Arg& arg = frame->args[i];
+
+    DataType dtype = arg.enter->input_type(0);
+    arg_types->push_back(dtype);
+    Node* arg_node;
+    TF_RETURN_IF_ERROR(BuildArgNode(output, dtype, i, &arg_node));
+
+    if (dtype == DT_RESOURCE) {
+      // The convention of the XLA bridge is that resource variable arguments
+      // are only inputs to the loop body and have no corresponding output.
+      // TODO(b/37741920): change the convention so that DT_RESOURCE variables
+      // are both inputs and outputs, and then remove this case.
+      TF_RET_CHECK(arg.is_loop_invariant);
+      node_map[arg.enter->id()] = arg_node;
+    } else {
+      Node* retval_node;
+      TF_RETURN_IF_ERROR(BuildRetvalNode(output, dtype, i, &retval_node));
+
+      if (arg.is_loop_invariant) {
+        // Argument is loop-invariant. Forward it from the Arg to the Retval.
+        node_map[arg.enter->id()] = arg_node;
+        output->AddEdge(arg_node, 0, retval_node, 0);
+      } else {
+        // Argument is loop-varying.
+        node_map[arg.switch_node->id()] = arg_node;
+        // The Switch node has two outputs, but _Arg only has one. This tells
+        // the CopySubgraph function to rewrite the output number of edges from
+        // the _Arg node to be 0 rather than copying the output number from the
+        // Switch node.
+        squash_src_outputs[arg.switch_node->id()] = true;
+        node_map[arg.next_iteration->id()] = retval_node;
+        next_iterations.push_back(arg.next_iteration);
+      }
+    }
+  }
+
+  // Performs a reverse DFS, copying nodes and edges to the output graph.
+  // The _Arg and _Retval nodes were added unconditionally above, so we are
+  // guaranteed to get the correct function signature.
+  TF_RETURN_IF_ERROR(CopySubgraph(graph, *frame, std::move(next_iterations),
+                                  squash_src_outputs, &node_map, output));
+
+  return Status::OK();
+}
+
+Status FunctionalizeLoop(Graph* graph, Frame* frame,
+                         FunctionLibraryDefinition* library) {
+  VLOG(2) << "Frame " << frame->name << " before: "
+          << dump_graph::DumpGraphToFile("functionalize_before", *graph);
+
+  // Split loop-varying Enter nodes with multiple successors. If the same
+  // Tensor is fed as input to multiple loop arguments, we may end up with a
+  // shared Enter node. We clone Enter nodes with multiple successors to
+  // maintain the invariant of a unique Enter node per argument of the final
+  // loop.
+  std::vector<Arg> args;
+  for (const Arg& arg : frame->args) {
+    if (arg.is_loop_invariant) {
+      args.push_back(arg);
+    } else {
+      std::vector<const Edge*> edges(arg.enter->out_edges().begin(),
+                                     arg.enter->out_edges().end());
+      for (int i = 0; i < edges.size(); ++i) {
+        if (edges[i]->IsControlEdge() && edges[i]->dst()->IsSink()) {
+          continue;
+        }
+        TF_RET_CHECK(!edges[i]->IsControlEdge()) << edges[i]->src()->name();
+        Arg new_arg;
+        new_arg.is_loop_invariant = false;
+        if (i == 0) {
+          new_arg.enter = arg.enter;
+        } else {
+          new_arg.enter = graph->CopyNode(arg.enter);
+          frame->nodes.insert(new_arg.enter);
+          for (Edge const* e : arg.enter->in_edges()) {
+            graph->AddEdge(e->src(), e->src_output(), new_arg.enter,
+                           e->IsControlEdge() ? Graph::kControlSlot : 0);
+          }
+          Node* dst = edges[i]->dst();
+          int dst_input = edges[i]->dst_input();
+          graph->RemoveEdge(edges[i]);
+          graph->AddEdge(new_arg.enter, 0, dst, dst_input);
+        }
+        args.push_back(new_arg);
+      }
+    }
+  }
+  frame->args = std::move(args);
+
+  // Order the arguments so that:
+  // a) resource variables are last, and
+  // b) sort lexicographically by name (for deterministic output).
+  std::sort(frame->args.begin(), frame->args.end(),
+            [](const Arg& a, const Arg& b) {
+              bool a_is_resource = (a.enter->input_type(0) == DT_RESOURCE);
+              bool b_is_resource = (b.enter->input_type(0) == DT_RESOURCE);
+              return std::tie(a_is_resource, a.enter->name()) <
+                     std::tie(b_is_resource, b.enter->name());
+            });
+
+  if (frame->loop_cond == nullptr) {
+    return errors::InvalidArgument("Loop ", frame->name,
+                                   " has no LoopCond node");
+  }
+
+  // Find the set of Switch nodes that are successors of the LoopCond.
+  std::unordered_set<Node*> switches;
+  for (const Edge* edge : frame->loop_cond->out_edges()) {
+    if (!edge->IsControlEdge() && IsSwitch(edge->dst()) &&
+        edge->dst_input() == 1) {
+      switches.insert(edge->dst());
+    }
+  }
+
+  // For each non-constant argument, looks for the following pattern of nodes:
+  // Enter ----> Merge  -------->  Switch  --> Exit
+  //               ^                  ^
+  //               |                  |
+  //         NextIteration         LoopCond
+  //               ^                  ^
+  //               |                  |
+  //              ...                ...
+  for (Arg& arg : frame->args) {
+    if (!arg.is_loop_invariant) {
+      // Follow the edge from the Enter to Merge.
+      const Edge* enter_merge = nullptr;
+      for (const Edge* e : arg.enter->out_edges()) {
+        // Ignore control-edges to the sink node. These are allowed by the
+        // graph invariants, although probably they should have been stripped
+        // off earlier.
+        if (e->IsControlEdge() && e->dst()->IsSink()) {
+          continue;
+        }
+        if (enter_merge != nullptr) {
+          return errors::Internal(
+              "Enter node for loop-varying argument ", arg.enter->name(),
+              " has multiple successors: ", enter_merge->dst()->name(), " and ",
+              e->dst()->name());
+        }
+        enter_merge = e;
+      }
+      if (enter_merge == nullptr) {
+        return errors::Internal("Enter node for loop-varying argument ",
+                                arg.enter->name(), " has zero successors");
+      }
+      arg.merge = enter_merge->dst();
+      if (!IsMerge(arg.merge)) {
+        return errors::InvalidArgument(
+            "Successor of Enter node for loop-varying argument ",
+            arg.merge->name(),
+            " is not a Merge node; got: ", arg.merge->type_string());
+      }
+
+      // Find the NextIteration from the merge. There should be two inputs to
+      // the Merge and the NextIteration should be the other input.
+      if (arg.merge->input_types().size() != 2) {
+        return errors::InvalidArgument(
+            "Unexpected number of inputs to Merge node for loop-varying "
+            "argument ",
+            arg.merge->name(), "; expected 2, got ",
+            arg.merge->input_types().size());
+      }
+      TF_RETURN_IF_ERROR(arg.merge->input_node(1 - enter_merge->dst_input(),
+                                               &arg.next_iteration));
+      if (!IsNextIteration(arg.next_iteration)) {
+        return errors::InvalidArgument(
+            "Expected NextIteration node as input to Merge node; got node ",
+            arg.next_iteration->name(), " with kind ",
+            arg.next_iteration->type_string());
+      }
+
+      // Find the Switch successor of the Merge. There should be exactly one
+      // Switch node that is a successor of both the Merge and the LoopCond.
+      for (const Edge* edge : arg.merge->out_edges()) {
+        if (edge->dst_input() == 0 && IsSwitch(edge->dst()) &&
+            switches.find(edge->dst()) != switches.end()) {
+          if (arg.switch_node != nullptr) {
+            return errors::InvalidArgument("Duplicate Switch successors to ",
+                                           arg.merge->name());
+          }
+          arg.switch_node = edge->dst();
+        }
+      }
+      if (arg.switch_node == nullptr) {
+        return errors::InvalidArgument("Missing Switch successor to ",
+                                       arg.merge->name());
+      }
+
+      // Find the Exit successor of the Switch.
+      for (const Edge* edge : arg.switch_node->out_edges()) {
+        if (edge->src_output() == 0 && IsExit(edge->dst())) {
+          if (arg.exit != nullptr) {
+            return errors::InvalidArgument("Duplicate Exit successors to ",
+                                           arg.switch_node->name());
+          }
+          arg.exit = edge->dst();
+        }
+      }
+      if (arg.exit == nullptr) {
+        return errors::InvalidArgument("Missing Exit successor to ",
+                                       arg.switch_node->name());
+      }
+    }
+  }
+
+  // Builds the condition and body functions.
+  std::unique_ptr<Graph> cond_graph;
+  TF_RETURN_IF_ERROR(BuildLoopCondition(*graph, frame, &cond_graph));
+  DataTypeVector arg_types;
+  std::unique_ptr<Graph> body_graph;
+  TF_RETURN_IF_ERROR(BuildLoopBody(*graph, frame, &arg_types, &body_graph));
+
+  VLOG(2) << "Frame " << frame->name << " condition: "
+          << dump_graph::DumpGraphToFile("loop_condition", *cond_graph)
+          << " body: " << dump_graph::DumpGraphToFile("loop_body", *body_graph);
+
+  static std::atomic<int64> sequence_num(0LL);
+  int64 id = ++sequence_num;
+  NameAttrList cond_name;
+  cond_name.set_name(strings::StrCat("_functionalize_cond_", id));
+  NameAttrList body_name;
+  body_name.set_name(strings::StrCat("_functionalize_body_", id));
+  FunctionDef cond_fdef;
+  TF_RETURN_IF_ERROR(
+      GraphToFunctionDef(*cond_graph, cond_name.name(), &cond_fdef));
+  FunctionDef body_fdef;
+  TF_RETURN_IF_ERROR(
+      GraphToFunctionDef(*body_graph, body_name.name(), &body_fdef));
+
+  TF_RETURN_IF_ERROR(library->AddFunctionDef(cond_fdef));
+  TF_RETURN_IF_ERROR(library->AddFunctionDef(body_fdef));
+
+  // Builds a While operator.
+  NodeDef while_def;
+  NodeDefBuilder builder(frame->loop_cond->name(), "XlaWhile");
+  builder.Attr("T", arg_types);
+  builder.Attr("cond", cond_name);
+  builder.Attr("body", body_name);
+  std::vector<NodeDefBuilder::NodeOut> inputs;
+  for (int i = 0; i < frame->args.size(); ++i) {
+    const Arg& arg = frame->args[i];
+    const Edge* in_edge;
+    TF_RETURN_IF_ERROR(arg.enter->input_edge(0, &in_edge));
+    if (in_edge->IsControlEdge()) {
+      builder.ControlInput(in_edge->src()->name());
+    } else {
+      inputs.push_back(NodeDefBuilder::NodeOut(
+          in_edge->src()->name(), in_edge->src_output(), arg_types[i]));
+    }
+  }
+  builder.Input(inputs);
+  TF_RETURN_IF_ERROR(builder.Finalize(&while_def));
+
+  Status status;
+  Node* while_node = graph->AddNode(while_def, &status);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Copies edges to the Enter nodes and from the Exit nodes onto the While.
+  for (int i = 0; i < frame->args.size(); ++i) {
+    const Arg& arg = frame->args[i];
+    const Edge* in_edge;
+    TF_RETURN_IF_ERROR(arg.enter->input_edge(0, &in_edge));
+    if (in_edge->IsControlEdge()) {
+      graph->AddControlEdge(in_edge->src(), while_node);
+    } else {
+      graph->AddEdge(in_edge->src(), in_edge->src_output(), while_node, i);
+    }
+
+    if (!arg.is_loop_invariant) {
+      std::vector<const Edge*> edges(arg.exit->out_edges().begin(),
+                                     arg.exit->out_edges().end());
+      for (const Edge* edge : edges) {
+        Node* dst = edge->dst();
+        int dst_input = edge->dst_input();
+        graph->RemoveEdge(edge);
+
+        int src_output =
+            dst_input == Graph::kControlSlot ? Graph::kControlSlot : i;
+        graph->AddEdge(while_node, src_output, dst, dst_input);
+      }
+    }
+  }
+
+  // Remove the old nodes from the graph, and add the while node to the parent
+  // frame.
+  for (Node* node : frame->nodes) {
+    graph->RemoveNode(node);
+  }
+  frame->parent->nodes.insert(while_node);
+
+  VLOG(2) << "Frame " << frame->name << " after: "
+          << dump_graph::DumpGraphToFile("functionalize_after", *graph);
+
+  return Status::OK();
+}
+
+}  // namespace
+
+// Transformation that converts Tensorflow's graph control flow constructs into
+// functional equivalents.
+Status FunctionalizeControlFlow(Graph* graph,
+                                FunctionLibraryDefinition* library) {
+  VLOG(2) << "FunctionalizeControlFlow: "
+          << dump_graph::DumpGraphToFile("functionalize_initial", *graph);
+  // Note: BuildControlFlowInfo() requires that the graph's source node is
+  // connected to all source nodes in the graph. Many graphs violate this
+  // invariant.
+  std::vector<ControlFlowInfo> cf_info;
+  TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph, &cf_info));
+
+  // Builds Frames, indexed by name.
+  std::unordered_map<string, Frame> frames;
+  for (Node* node : graph->op_nodes()) {
+    const ControlFlowInfo& cf = cf_info[node->id()];
+
+    VLOG(2) << "node: " << node->name() << " frame_name: " << cf.frame_name
+            << " frame: " << (cf.frame ? cf.frame->name() : "---")
+            << " parent_frame: "
+            << (cf.parent_frame ? cf.parent_frame->name() : "---");
+    TF_RET_CHECK(cf.frame != nullptr && cf.parent_frame != nullptr);
+
+    Frame& frame = frames[cf.frame_name];
+    Frame* parent = &frames[cf_info[cf.parent_frame->id()].frame_name];
+    if (frame.parent == nullptr) {
+      frame.parent = parent;
+      frame.name = cf.frame_name;
+      ++parent->num_children;
+    } else if (frame.parent != parent) {
+      return errors::InvalidArgument("Mismatched parent frames for ",
+                                     cf.frame->id(), ": ", parent->name, " vs ",
+                                     frame.parent->name);
+    }
+
+    if (IsEnter(node)) {
+      Arg arg;
+      arg.enter = node;
+      TF_RETURN_IF_ERROR(GetNodeAttr(arg.enter->attrs(), "is_constant",
+                                     &arg.is_loop_invariant));
+      frame.args.push_back(arg);
+    } else if (IsLoopCond(node)) {
+      if (frame.loop_cond) {
+        return errors::InvalidArgument(
+            "Loop ", cf.frame_name,
+            " has more than one LoopCond node: ", node->name(), " and ",
+            frame.loop_cond->name());
+      }
+      frame.loop_cond = node;
+    }
+    frame.nodes.insert(node);
+  }
+
+  // Adds frames with no children (i.e., the innermost frames) to a worklist.
+  std::deque<Frame*> worklist;
+  for (auto& frame : frames) {
+    if (frame.second.num_children == 0) {
+      worklist.push_back(&frame.second);
+    }
+  }
+
+  // Eliminate loops from innermost to outermost.
+  while (!worklist.empty()) {
+    Frame* frame = worklist.front();
+    worklist.pop_front();
+    if (frame->parent == frame) {
+      // Skip the root frame.
+      continue;
+    }
+
+    TF_RETURN_IF_ERROR(FunctionalizeLoop(graph, frame, library));
+
+    // If the parent has no remaining children, add it to the worklist.
+    --frame->parent->num_children;
+    if (frame->parent->num_children == 0) {
+      worklist.push_back(frame->parent);
+    }
+  }
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.h b/tensorflow/compiler/tf2xla/functionalize_control_flow.h
new file mode 100644
index 0000000000000000000000000000000000000000..1535dc80b0ccdba38c57b534ed7473fc8632e33f
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.h
@@ -0,0 +1,32 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_H_
+#define TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_H_
+
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+// Transformation that converts tf.while_loop() loops into functional While
+// operators, suitable for XLA compilation.
+// TODO(b/36470387): add support for conditionals.
+Status FunctionalizeControlFlow(Graph* graph,
+                                FunctionLibraryDefinition* library);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_H_
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..914c8999a6f13f5f2dc4e3cecc38c91afd432131
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
@@ -0,0 +1,658 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/functionalize_control_flow.h"
+
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/ops/control_flow_ops_internal.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/resource_variable_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/tf2xla/cc/ops/functional_ops.h"
+#include "tensorflow/compiler/tf2xla/test_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/equal_graph_def.h"
+
+namespace tensorflow {
+namespace {
+
+// Returns the names of the "cond" and "body" functions for the While node
+// in a graph.
+Status FindWhileCondAndBody(const GraphDef& graph, NameAttrList* cond,
+                            NameAttrList* body) {
+  for (const NodeDef& node : graph.node()) {
+    if (node.op() == "XlaWhile") {
+      const NameAttrList* result;
+      TF_RETURN_IF_ERROR(GetNodeAttr(node, "cond", &result));
+      *cond = *result;
+      TF_RETURN_IF_ERROR(GetNodeAttr(node, "body", &result));
+      *body = *result;
+      return Status::OK();
+    }
+  }
+  return errors::NotFound("No XlaWhile node found in graph");
+}
+
+// Graph:
+// x = array_ops.placeholder(dtypes.int32)
+// y = control_flow_ops.while_loop(lambda i: i < 10, lambda i: i + 1, [x])
+TEST(FunctionalizeControlFlow, OneLoopVar) {
+  Graph graph(OpRegistry::Global());
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+
+    auto dummy = ops::Placeholder(scope.WithOpName("Dummy"), DT_INT32);
+
+    auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
+    auto enter =
+        ops::internal::Enter(scope.WithOpName("while/Enter"), source, "aloop");
+    // Add an unused Enter node. These should be ignored.
+    auto enter2 =
+        ops::internal::Enter(scope.WithOpName("while/Enter2"), source, "aloop");
+    auto merge = ops::Merge(scope.WithOpName("while/Merge"),
+                            std::initializer_list<Input>{enter, dummy});
+    auto ten = ops::Const<int32>(
+        scope.WithOpName("while/Less/y").WithControlDependencies(merge.output),
+        10);
+    auto less = ops::Less(scope.WithOpName("while/Less"), merge.output, ten);
+    auto loop_cond = ops::LoopCond(scope.WithOpName("while/LoopCond"), less);
+    auto switch_ =
+        ops::Switch(scope.WithOpName("while/Switch"), merge.output, loop_cond);
+    auto exit = ops::internal::Exit(scope.WithOpName("while/Exit"),
+                                    switch_.output_false);
+    auto identity =
+        ops::Identity(scope.WithOpName("while/Identity"), switch_.output_true);
+    auto one = ops::Const<int32>(
+        scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
+    auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
+    auto next_iteration =
+        ops::NextIteration(scope.WithOpName("while/NextIteration"), add);
+
+    auto sink = ops::Identity(scope.WithOpName("sink"), exit);
+
+    // Remove the dummy node and add the loop backedge.
+    scope.graph()->RemoveNode(dummy.node());
+    scope.graph()->AddEdge(next_iteration.node(), 0, merge.output.node(), 1);
+
+    TF_EXPECT_OK(scope.ToGraph(&graph));
+  }
+
+  // Regression test: control edges from an Enter node to the graph sink should
+  // be ignored.
+  for (Node* n : graph.nodes()) {
+    if (n->name() == "while/Enter") {
+      graph.AddControlEdge(n, graph.sink_node());
+    }
+  }
+
+  FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+
+  GraphDef graph_def;
+  graph.ToGraphDef(&graph_def);
+
+  NameAttrList cond_fn, body_fn;
+  TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+
+  // Outer graph
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
+    auto while_op =
+        ops::XlaWhile(scope.WithOpName("while/LoopCond"),
+                      std::initializer_list<Input>{source}, cond_fn, body_fn);
+    auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]);
+    GraphDef expected;
+    TF_EXPECT_OK(scope.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected, graph_def);
+  }
+
+  // Condition graph
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+    auto ten = ops::Const<int32>(
+        scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10);
+    auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten);
+    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
+
+    GraphDef expected;
+    TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+    InstantiationResultForTest result;
+    TF_EXPECT_OK(InstantiateFunctionForTest(cond_fn.name(), library, &result));
+
+    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+  }
+
+  // Body graph.
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+    auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
+    auto one = ops::Const<int32>(
+        scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
+    auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
+    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
+
+    GraphDef expected;
+    TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+    InstantiationResultForTest result;
+    TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result));
+
+    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+    EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
+    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+  }
+}
+
+// Graph:
+// x = array_ops.placeholder(dtypes.int32)
+// y = array_ops.placeholder(dtypes.int32)
+// cond = lambda (i, j): i + 3 < 10
+// body = lambda (i, j): (i < 10, j * 2)
+// z = control_flow_ops.while_loop(cond, body, [x, y])
+TEST(FunctionalizeControlFlow, TwoLoopVars) {
+  Graph graph(OpRegistry::Global());
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+
+    auto dummy = ops::Placeholder(scope.WithOpName("Dummy"), DT_INT32);
+
+    auto x = ops::Placeholder(scope.WithOpName("Placeholder/x"), DT_INT32);
+    auto y = ops::Placeholder(scope.WithOpName("Placeholder/y"), DT_INT32);
+    auto enter_x =
+        ops::internal::Enter(scope.WithOpName("while/Enter/x"), x, "aloop");
+    auto enter_y =
+        ops::internal::Enter(scope.WithOpName("while/Enter/y"), y, "aloop");
+    auto merge_x = ops::Merge(scope.WithOpName("while/Merge/x"),
+                              std::initializer_list<Input>{enter_x, dummy});
+    auto merge_y = ops::Merge(scope.WithOpName("while/Merge/y"),
+                              std::initializer_list<Input>{enter_y, dummy});
+
+    // Loop condition
+    auto three = ops::Const<int32>(scope.WithOpName("while/cond/three")
+                                       .WithControlDependencies(merge_x.output),
+                                   3);
+    auto cond_add =
+        ops::Add(scope.WithOpName("while/cond/Add"), merge_x.output, three);
+    auto ten = ops::Const<int32>(scope.WithOpName("while/cond/ten")
+                                     .WithControlDependencies(merge_x.output),
+                                 10);
+    auto less = ops::Less(scope.WithOpName("while/cond/Less"), cond_add, ten);
+    auto loop_cond = ops::LoopCond(scope.WithOpName("while/LoopCond"), less);
+
+    auto switch_x = ops::Switch(scope.WithOpName("while/Switch/x"),
+                                merge_x.output, loop_cond);
+    auto switch_y = ops::Switch(scope.WithOpName("while/Switch/y"),
+                                merge_y.output, loop_cond);
+
+    auto exit_x = ops::internal::Exit(scope.WithOpName("while/Exit/x"),
+                                      switch_x.output_false);
+    auto exit_y = ops::internal::Exit(scope.WithOpName("while/Exit/y"),
+                                      switch_y.output_false);
+
+    auto identity_x = ops::Identity(scope.WithOpName("while/Identity/x"),
+                                    switch_x.output_true);
+    auto identity_y = ops::Identity(scope.WithOpName("while/Identity/y"),
+                                    switch_y.output_true);
+
+    auto one = ops::Const<int32>(
+        scope.WithOpName("while/add/one").WithControlDependencies(identity_x),
+        1);
+    auto two = ops::Const<int32>(
+        scope.WithOpName("while/mul/two").WithControlDependencies(identity_x),
+        2);
+
+    auto add = ops::Add(scope.WithOpName("while/add"), identity_x, one);
+    auto mul = ops::Add(scope.WithOpName("while/mul"), identity_y, two);
+    auto next_iteration_x =
+        ops::NextIteration(scope.WithOpName("while/NextIteration/x"), add);
+    auto next_iteration_y =
+        ops::NextIteration(scope.WithOpName("while/NextIteration/y"), mul);
+
+    auto sink_x = ops::Identity(scope.WithOpName("sink_x"), exit_x);
+    auto sink_y = ops::Identity(scope.WithOpName("sink_y"), exit_y);
+
+    // Remove the dummy node and add the loop backedges.
+    scope.graph()->RemoveNode(dummy.node());
+    scope.graph()->AddEdge(next_iteration_x.node(), 0, merge_x.output.node(),
+                           1);
+    scope.graph()->AddEdge(next_iteration_y.node(), 0, merge_y.output.node(),
+                           1);
+
+    TF_EXPECT_OK(scope.ToGraph(&graph));
+  }
+
+  FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+
+  GraphDef graph_def;
+  graph.ToGraphDef(&graph_def);
+
+  NameAttrList cond_fn, body_fn;
+  TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+
+  // Outer graph.
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto x = ops::Placeholder(scope.WithOpName("Placeholder/x"), DT_INT32);
+    auto y = ops::Placeholder(scope.WithOpName("Placeholder/y"), DT_INT32);
+    auto while_op =
+        ops::XlaWhile(scope.WithOpName("while/LoopCond"),
+                      std::initializer_list<Input>{x, y}, cond_fn, body_fn);
+    auto sink_x = ops::Identity(scope.WithOpName("sink_x"), while_op[0]);
+    auto sink_y = ops::Identity(scope.WithOpName("sink_y"), while_op[1]);
+    GraphDef expected;
+    TF_EXPECT_OK(scope.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected, graph_def);
+  }
+
+  // Condition graph.
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+    auto three = ops::Const<int32>(scope.WithOpName("while/cond/three")
+                                       .WithControlDependencies(arg0.output),
+                                   3);
+    auto cond_add =
+        ops::Add(scope.WithOpName("while/cond/Add"), arg0.output, three);
+    auto ten = ops::Const<int32>(
+        scope.WithOpName("while/cond/ten").WithControlDependencies(arg0.output),
+        10);
+    auto less = ops::Less(scope.WithOpName("while/cond/Less"), cond_add, ten);
+    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
+
+    GraphDef expected;
+    TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+    InstantiationResultForTest result;
+    TF_EXPECT_OK(InstantiateFunctionForTest(cond_fn.name(), library, &result));
+
+    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types);
+    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+  }
+
+  // Body graph.
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+
+    auto identity_x = ops::Identity(scope.WithOpName("while/Identity/x"), arg0);
+    auto identity_y = ops::Identity(scope.WithOpName("while/Identity/y"), arg1);
+
+    auto one = ops::Const<int32>(
+        scope.WithOpName("while/add/one").WithControlDependencies(identity_x),
+        1);
+    auto two = ops::Const<int32>(
+        scope.WithOpName("while/mul/two").WithControlDependencies(identity_x),
+        2);
+
+    auto add = ops::Add(scope.WithOpName("while/add"), identity_x, one);
+    auto mul = ops::Add(scope.WithOpName("while/mul"), identity_y, two);
+    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
+    auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), mul, 1);
+
+    GraphDef expected;
+    TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+    InstantiationResultForTest result;
+    TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result));
+
+    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types);
+    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.ret_types);
+    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+  }
+}
+
+// Example with nesting, loop-invariant arguments, and resource variables.
+//
+// accum = resource_variable_ops.ResourceVariable(1)
+// x = array_ops.placeholder(2, dtype=dtypes.int32)
+// y = 3 + x
+//
+// def inner_body(j, k):
+//   add = state_ops.assign_add(accum, k * j + x)
+//   with ops.control_dependencies([add]):
+//     return [j + 1, k]
+//
+// def body(i):
+//   m = control_flow_ops.while_loop(lambda j, k: j < 5, inner_body,
+//                                   [1, y], name="inner")
+//   with ops.control_dependencies(m):
+//     return [i + 1]
+//
+// z = control_flow_ops.while_loop(lambda i: i < 10, body, [0], name="outer")
+TEST(FunctionalizeControlFlow, Complex) {
+  Graph graph(OpRegistry::Global());
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+
+    auto dummy = ops::Placeholder(scope.WithOpName("Dummy"), DT_INT32);
+
+    auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
+    auto three = ops::Const<int32>(scope.WithOpName("three"), 3);
+    auto y = ops::Add(scope.WithOpName("y"), x, three);
+
+    auto var = ops::VarHandleOp(scope.WithOpName("Variable"), DT_INT32,
+                                TensorShape({}));
+
+    // Outer loop
+    auto zero = ops::Const<int32>(scope.WithOpName("outer/Const"), 0);
+    auto enter_i =
+        ops::internal::Enter(scope.WithOpName("outer/Enter_i"), zero, "outer");
+    auto merge_i = ops::Merge(scope.WithOpName("outer/Merge_i"),
+                              std::initializer_list<Input>{enter_i, dummy});
+    auto ten = ops::Const<int32>(scope.WithOpName("outer/Less/y")
+                                     .WithControlDependencies(merge_i.output),
+                                 10);
+    auto less_i =
+        ops::Less(scope.WithOpName("outer/Less_i"), merge_i.output, ten);
+    auto outer_loop_cond =
+        ops::LoopCond(scope.WithOpName("outer/LoopCond"), less_i);
+    auto switch_i = ops::Switch(scope.WithOpName("outer/Switch"),
+                                merge_i.output, outer_loop_cond);
+    auto exit_i = ops::internal::Exit(scope.WithOpName("outer/Exit"),
+                                      switch_i.output_false);
+    auto identity_i =
+        ops::Identity(scope.WithOpName("outer/Identity"), switch_i.output_true);
+
+    auto enter_x_outer =
+        ops::internal::Enter(scope.WithOpName("outer/Enter_x"), x, "outer",
+                             ops::internal::Enter::Attrs().IsConstant(true));
+    auto enter_k_outer =
+        ops::internal::Enter(scope.WithOpName("outer/Enter_k"), y, "outer",
+                             ops::internal::Enter::Attrs().IsConstant(true));
+    auto enter_var_outer =
+        ops::internal::Enter(scope.WithOpName("outer/Enter_var"), var, "outer",
+                             ops::internal::Enter::Attrs().IsConstant(true));
+
+    // Inner loop
+    auto one_j = ops::Const<int32>(
+        scope.WithOpName("outer/j").WithControlDependencies(identity_i), 1);
+    auto enter_j = ops::internal::Enter(scope.WithOpName("outer/inner/Enter_j"),
+                                        one_j, "inner");
+    auto enter_k =
+        ops::internal::Enter(scope.WithOpName("outer/inner/Enter_k")
+                                 .WithControlDependencies(identity_i),
+                             enter_k_outer, "inner");
+    auto enter_x = ops::internal::Enter(
+        scope.WithOpName("outer/inner/Enter_x"), enter_x_outer, "inner",
+        ops::internal::Enter::Attrs().IsConstant(true));
+    auto enter_var = ops::internal::Enter(
+        scope.WithOpName("outer/inner/Enter_var"), enter_var_outer, "inner",
+        ops::internal::Enter::Attrs().IsConstant(true));
+
+    auto merge_j = ops::Merge(scope.WithOpName("outer/inner/Merge_j"),
+                              std::initializer_list<Input>{enter_j, dummy});
+    auto merge_k = ops::Merge(scope.WithOpName("outer/inner/Merge_k"),
+                              std::initializer_list<Input>{enter_k, dummy});
+
+    auto five = ops::Const<int32>(scope.WithOpName("outer/inner/Five")
+                                      .WithControlDependencies(merge_j.output),
+                                  5);
+    auto less_j =
+        ops::Less(scope.WithOpName("outer/inner/Less_j"), merge_j.output, five);
+    auto loop_cond = ops::LoopCond(scope.WithOpName("outer/LoopCond"), less_j);
+
+    auto switch_j = ops::Switch(scope.WithOpName("outer/inner/Switch_j"),
+                                merge_j.output, loop_cond);
+    auto switch_k = ops::Switch(scope.WithOpName("outer/inner/Switch_k"),
+                                merge_k.output, loop_cond);
+    auto exit_j = ops::internal::Exit(scope.WithOpName("outer/inner/Exit_j"),
+                                      switch_j.output_false);
+    auto exit_k = ops::internal::Exit(scope.WithOpName("outer/inner/Exit_k"),
+                                      switch_k.output_false);
+    auto identity_j = ops::Identity(scope.WithOpName("outer/inner/Identity_j"),
+                                    switch_j.output_true);
+    auto identity_k = ops::Identity(scope.WithOpName("outer/inner/Identity_k"),
+                                    switch_k.output_true);
+
+    // Variable update
+    auto mul_jk =
+        ops::Mul(scope.WithOpName("outer/inner/mul"), identity_j, identity_k);
+    auto add_jkx =
+        ops::Add(scope.WithOpName("outer/inner/add"), mul_jk, enter_x);
+    auto assign = ops::AssignAddVariableOp(
+        scope.WithOpName("outer/inner/assign_add"), enter_var, add_jkx);
+
+    auto one =
+        ops::Const<int32>(scope.WithOpName("outer/inner/One")
+                              .WithControlDependencies(
+                                  gtl::ArraySlice<Operation>{assign.operation}),
+                          1);
+    auto add_j =
+        ops::Add(scope.WithOpName("outer/inner/add_j"), identity_j, one);
+
+    auto next_iteration_j = ops::NextIteration(
+        scope.WithOpName("outer/inner/NextIteration_j"), add_j);
+    auto next_iteration_k = ops::NextIteration(
+        scope.WithOpName("outer/inner/NextIteration_k"), identity_k);
+
+    // Body and backedge for outer loop.
+    auto one_outer = ops::Const<int32>(
+        scope.WithOpName("outer/add/y").WithControlDependencies(identity_i), 1);
+    auto add_i =
+        ops::Add(scope.WithOpName("outer/add")
+                     .WithControlDependencies(gtl::ArraySlice<Operation>{
+                         exit_j.output.op(), exit_k.output.op()}),
+                 identity_i, one_outer);
+    auto next_iteration_i =
+        ops::NextIteration(scope.WithOpName("outer/NextIteration"), add_i);
+
+    auto sink = ops::Identity(scope.WithOpName("sink"), exit_i);
+
+    // Remove the dummy node and add the loop backedge.
+    scope.graph()->RemoveNode(dummy.node());
+    scope.graph()->AddEdge(next_iteration_i.node(), 0, merge_i.output.node(),
+                           1);
+    scope.graph()->AddEdge(next_iteration_j.node(), 0, merge_j.output.node(),
+                           1);
+    scope.graph()->AddEdge(next_iteration_k.node(), 0, merge_k.output.node(),
+                           1);
+
+    TF_EXPECT_OK(scope.ToGraph(&graph));
+  }
+
+  FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+
+  GraphDef graph_def;
+  graph.ToGraphDef(&graph_def);
+
+  NameAttrList outer_cond_fn, outer_body_fn;
+  TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &outer_cond_fn, &outer_body_fn));
+
+  // Outer graph.
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
+    auto three = ops::Const<int32>(scope.WithOpName("three"), 3);
+    auto y = ops::Add(scope.WithOpName("y"), x, three);
+
+    auto var = ops::VarHandleOp(scope.WithOpName("Variable"), DT_INT32,
+                                TensorShape({}));
+
+    auto zero = ops::Const<int32>(scope.WithOpName("outer/Const"), 0);
+
+    auto while_op = ops::XlaWhile(scope.WithOpName("outer/LoopCond"),
+                                  std::initializer_list<Input>{zero, y, x, var},
+                                  outer_cond_fn, outer_body_fn);
+    auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]);
+    GraphDef expected;
+    TF_EXPECT_OK(scope.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected, graph_def);
+  }
+
+  // Outer condition graph.
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+    auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+    auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
+
+    auto ten = ops::Const<int32>(
+        scope.WithOpName("outer/Less/y").WithControlDependencies(arg0.output),
+        10);
+    auto less = ops::Less(scope.WithOpName("outer/Less_i"), arg0, ten);
+    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
+
+    GraphDef expected;
+    TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+    InstantiationResultForTest result;
+    TF_EXPECT_OK(
+        InstantiateFunctionForTest(outer_cond_fn.name(), library, &result));
+
+    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
+              result.arg_types);
+    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+  }
+
+  // Outer body graph.
+  NameAttrList inner_cond_fn, inner_body_fn;
+  {
+    InstantiationResultForTest result;
+    TF_EXPECT_OK(
+        InstantiateFunctionForTest(outer_body_fn.name(), library, &result));
+
+    // Find the inner condition and body names.
+    TF_EXPECT_OK(
+        FindWhileCondAndBody(result.gdef, &inner_cond_fn, &inner_body_fn));
+
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+    auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+    auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
+
+    auto identity_i = ops::Identity(scope.WithOpName("outer/Identity"), arg0);
+    auto one_j = ops::Const<int32>(
+        scope.WithOpName("outer/j").WithControlDependencies(identity_i), 1);
+    auto while_op =
+        ops::XlaWhile(scope.WithOpName("outer/LoopCond_1"),
+                      std::initializer_list<Input>{one_j, arg1, arg2, arg3},
+                      inner_cond_fn, inner_body_fn);
+
+    auto one_outer = ops::Const<int32>(
+        scope.WithOpName("outer/add/y").WithControlDependencies(identity_i), 1);
+    auto add_i =
+        ops::Add(scope.WithOpName("outer/add")
+                     .WithControlDependencies(gtl::ArraySlice<Operation>{
+                         while_op[0].op(), while_op[1].op()}),
+                 identity_i, one_outer);
+
+    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_i, 0);
+    auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), arg1, 1);
+    auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2);
+
+    GraphDef expected;
+    TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
+              result.arg_types);
+    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}), result.ret_types);
+    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+  }
+
+  // Inner condition graph.
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+    auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+    auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
+
+    auto five = ops::Const<int32>(
+        scope.WithOpName("outer/inner/Five").WithControlDependencies(arg0), 5);
+    auto less_j = ops::Less(scope.WithOpName("outer/inner/Less_j"), arg0, five);
+    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less_j, 0);
+
+    GraphDef expected;
+    TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+    InstantiationResultForTest result;
+    TF_EXPECT_OK(
+        InstantiateFunctionForTest(inner_cond_fn.name(), library, &result));
+
+    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
+              result.arg_types);
+    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+  }
+
+  // Inner body graph.
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+    auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+    auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
+
+    auto identity_j =
+        ops::Identity(scope.WithOpName("outer/inner/Identity_j"), arg0);
+    auto identity_k =
+        ops::Identity(scope.WithOpName("outer/inner/Identity_k"), arg1);
+
+    auto mul_jk =
+        ops::Mul(scope.WithOpName("outer/inner/mul"), identity_j, identity_k);
+    auto add_jkx = ops::Add(scope.WithOpName("outer/inner/add"), mul_jk, arg2);
+    auto assign = ops::AssignAddVariableOp(
+        scope.WithOpName("outer/inner/assign_add"), arg3, add_jkx);
+
+    auto one =
+        ops::Const<int32>(scope.WithOpName("outer/inner/One")
+                              .WithControlDependencies(
+                                  gtl::ArraySlice<Operation>{assign.operation}),
+                          1);
+    auto add_j =
+        ops::Add(scope.WithOpName("outer/inner/add_j"), identity_j, one);
+
+    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_j, 0);
+    auto retval1 =
+        ops::_Retval(scope.WithOpName("_retval1_RetVal"), identity_k, 1);
+    auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2);
+
+    GraphDef expected;
+    TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+    InstantiationResultForTest result;
+    TF_EXPECT_OK(
+        InstantiateFunctionForTest(inner_body_fn.name(), library, &result));
+
+    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
+              result.arg_types);
+    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}), result.ret_types);
+    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+  }
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index a434c7468095a05ee6da31826d44379a735b51f7..546e9be8647587991de5d0d0c232827ad84fba94 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -47,6 +47,7 @@ tf_kernel_library(
         "reshape_op.cc",
         "retval_op.cc",
         "reverse_op.cc",
+        "segment_reduction_ops.cc",
         "select_op.cc",
         "sequence_ops.cc",
         "shape_op.cc",
@@ -54,6 +55,7 @@ tf_kernel_library(
         "softmax_op.cc",
         "spacetobatch_op.cc",
         "split_op.cc",
+        "stack_ops.cc",
         "strided_slice_op.cc",
         "tensor_array_ops.cc",
         "tile_ops.cc",
@@ -68,6 +70,7 @@ tf_kernel_library(
         "reduction_ops.h",
     ],
     deps = [
+        ":while_op",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/xla:literal_util",
@@ -91,6 +94,21 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "while_op",
+    srcs = ["while_op.cc"],
+    hdrs = ["while_op.h"],
+    deps = [
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/tf2xla/ops:functional_ops",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:tensorflow_opensource",
+    ],
+)
+
 # Kernels that only work on CPU, because they use XLA custom calls.
 # Only link this when using the CPU backend for XLA.
 #
diff --git a/tensorflow/compiler/tf2xla/kernels/arg_op.cc b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
index 620fc8443785388781caf5121da53c4d908d4cb4..1156546512952871fafe93e4b5a42308322671df 100644
--- a/tensorflow/compiler/tf2xla/kernels/arg_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
@@ -51,13 +51,29 @@ class ArgOp : public XlaOpKernel {
 
     XlaContext& xc = XlaContext::Get(ctx);
     const XlaContext::Argument& arg = xc.args()[index_];
-    if (arg.is_variable) {
+    if (arg.is_resource) {
+      XlaResource::Kind kind;
+      switch (arg.kind) {
+        case XlaCompiler::Argument::kVariable:
+          kind = XlaResource::kVariable;
+          break;
+        case XlaCompiler::Argument::kTensorArray:
+          kind = XlaResource::kTensorArray;
+          break;
+        case XlaCompiler::Argument::kStack:
+          kind = XlaResource::kStack;
+          break;
+        default:
+          CHECK(false);
+      }
+
       // TODO(phawkins): this code assumes that variables do not alias.
-      XlaVariable* var;
-      OP_REQUIRES_OK(ctx, xc.CreateVariable(index_, arg.name, arg.value.type,
-                                            arg.value.handle, &var));
-      var->tensor_array_size = arg.tensor_array_size;
-      ctx->SetVariableOutput(0, var);
+      XlaResource* resource;
+      OP_REQUIRES_OK(ctx,
+                     xc.CreateResource(kind, index_, arg.name, arg.value.type,
+                                       arg.value.handle, &resource));
+      resource->tensor_array_size = arg.tensor_array_size;
+      ctx->SetResourceOutput(0, resource);
     } else if (arg.value.is_constant) {
       ctx->SetConstantOutput(0, arg.value.constant_value);
     } else {
diff --git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
index 8642cbf2a924e3c82c80bff8f5122e62ce12082d..21d3e64872e19109852297838043975cea6d7921 100644
--- a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
@@ -127,8 +127,8 @@ void BatchToSpace(XlaOpKernelContext* ctx,
   std::vector<int64> end_indices = reshaped_permuted_shape;
   std::vector<int64> strides(input_rank, 1);
   for (int i = 0; i < block_rank; ++i) {
-    int64 crop_start = xla::LiteralUtil::Get<int64>(crops, {i, 0});
-    int64 crop_end = xla::LiteralUtil::Get<int64>(crops, {i, 1});
+    int64 crop_start = crops.Get<int64>({i, 0});
+    int64 crop_end = crops.Get<int64>({i, 1});
     OP_REQUIRES(ctx, crop_start >= 0 && crop_end >= 0,
                 errors::InvalidArgument("Crops must be non-negative"));
     start_indices[1 + i] = crop_start;
diff --git a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
index b0fee5e4bca502a7abb4613b58ecdd2ffca2206d..bc2cd31230dfe9ca35540341d225dcb768fa34f6 100644
--- a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
@@ -55,7 +55,7 @@ class BCastGradArgsOp : public XlaOpKernel {
 
       BCast::Vec vec;
       for (int64 i = 0; i < in_shape.num_elements(); ++i) {
-        vec.push_back(xla::LiteralUtil::Get<int>(literal, {i}));
+        vec.push_back(literal.Get<int>({i}));
       }
       shapes.push_back(vec);
     }
diff --git a/tensorflow/compiler/tf2xla/kernels/cast_op.cc b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
index 124e33d7935ce19ced72d1c84521ffda1090bc86..2331520230176fce7646d89140851fe37aee5fda 100644
--- a/tensorflow/compiler/tf2xla/kernels/cast_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
@@ -38,17 +38,6 @@ class CastOp : public XlaOpKernel {
 
     if (src_dtype_ == dst_dtype_) {
       output = input;
-    } else if (src_dtype_ == DT_BOOL) {
-      // XLA's ConvertElementType doesn't support casting to/from
-      // bools. So we need to handle those cases separately.
-      // Builds the equivalent of (input ? 1 : 0)
-      xla::ComputationBuilder l(builder->client(), "PredCast");
-      xla::ComputationDataHandle x =
-          l.Parameter(0, xla::ShapeUtil::MakeShape(src_type_, {}), "x");
-      l.Select(x, XlaHelpers::One(&l, dst_dtype_),
-               XlaHelpers::Zero(&l, dst_dtype_));
-      xla::Computation computation = l.Build().ConsumeValueOrDie();
-      output = builder->Map({input}, computation);
     } else if (dst_dtype_ == DT_BOOL) {
       output = builder->Ne(input, XlaHelpers::Zero(builder, src_dtype_));
     } else {
diff --git a/tensorflow/compiler/tf2xla/kernels/concat_op.cc b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
index e2eacb3839d39e6fa41192e8aa0f31d878d96aea..73a4740e29af7fa57e71ef42a342f46b0e24231d 100644
--- a/tensorflow/compiler/tf2xla/kernels/concat_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
@@ -52,7 +52,7 @@ class ConcatBaseOp : public XlaOpKernel {
     xla::Literal literal;
     OP_REQUIRES_OK(ctx, ctx->ConstantInput(axis_index_, &literal));
     // TODO(annarev): add a helper to support int64 input.
-    const int32 concat_dim = xla::LiteralUtil::Get<int>(literal, {});
+    const int32 concat_dim = literal.Get<int>({});
 
     std::vector<xla::ComputationDataHandle> values;
     std::vector<TensorShape> shapes;
@@ -163,7 +163,7 @@ class ConcatOffsetOp : public XlaOpKernel {
 
     xla::Literal concat_dim_literal;
     OP_REQUIRES_OK(ctx, ctx->ConstantInput(0, &concat_dim_literal));
-    const int64 cdim = xla::LiteralUtil::Get<int>(concat_dim_literal, {});
+    const int64 cdim = concat_dim_literal.Get<int>({});
 
     VLOG(1) << "ConcatOffset " << cdim << "," << dims;
     int32 axis = cdim < 0 ? cdim + dims : cdim;
@@ -185,12 +185,10 @@ class ConcatOffsetOp : public XlaOpKernel {
       for (int64 j = 0; j < dims; ++j) {
         if (j == axis) {
           out_vec(j) = offset;
-          offset += xla::LiteralUtil::Get<int>(inp_literal, {j});
+          offset += inp_literal.Get<int>({j});
         } else {
-          const int32 inp0_element =
-              xla::LiteralUtil::Get<int>(inp0_literal, {j});
-          const int32 inp_element =
-              xla::LiteralUtil::Get<int>(inp_literal, {j});
+          const int32 inp0_element = inp0_literal.Get<int>({j});
+          const int32 inp_element = inp_literal.Get<int>({j});
           OP_REQUIRES(
               ctx, (inp0_element == inp_element),
               errors::InvalidArgument("input[", i, ",", j, "] mismatch: ",
diff --git a/tensorflow/compiler/tf2xla/kernels/const_op.cc b/tensorflow/compiler/tf2xla/kernels/const_op.cc
index ad676e7a2bb3d3f28ecb98164323cbf1e32f61a9..9833323d851e00e7ca76d0b39cd2b216748a17fa 100644
--- a/tensorflow/compiler/tf2xla/kernels/const_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/const_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
index 107c673f4a7d62f8b760b137aeda2864e156b7f7..dde7898015e73190c96fa6effddfd3fc892264ea 100644
--- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
@@ -63,11 +63,14 @@ class DynamicStitchOp : public XlaOpKernel {
     std::vector<xla::Literal> indices(indices_input.size());
 
     const TensorShape& data0_shape = data_shapes[0];
-    const TensorShape indices0_shape =
-        XLAShapeToTensorShape(indices_input[0].shape());
+    TensorShape indices0_shape;
+    OP_REQUIRES_OK(
+        ctx, XLAShapeToTensorShape(indices_input[0].shape(), &indices0_shape));
     for (int input_num = 0; input_num < indices_input.size(); input_num++) {
-      const TensorShape indices_shape =
-          XLAShapeToTensorShape(indices_input[input_num].shape());
+      TensorShape indices_shape;
+      OP_REQUIRES_OK(ctx,
+                     XLAShapeToTensorShape(indices_input[input_num].shape(),
+                                           &indices_shape));
       const TensorShape& data_shape = data_shapes[input_num];
       OP_REQUIRES(ctx, TensorShapeUtils::StartsWith(data_shape, indices_shape),
                   errors::InvalidArgument(
@@ -103,8 +106,7 @@ class DynamicStitchOp : public XlaOpKernel {
     int max_index = -1;
     for (int input_num = 0; input_num < indices.size(); input_num++) {
       for (int i = 0; i < indices[input_num].shape().dimensions(0); ++i) {
-        max_index = std::max(
-            max_index, xla::LiteralUtil::Get<int>(indices[input_num], {i}));
+        max_index = std::max(max_index, indices[input_num].Get<int>({i}));
       }
     }
     int number_of_indices = max_index + 1;
@@ -118,7 +120,7 @@ class DynamicStitchOp : public XlaOpKernel {
     int index_used_count = 0;
     for (int input_num = 0; input_num < indices.size(); input_num++) {
       for (int i = 0; i < indices[input_num].shape().dimensions(0); ++i) {
-        int index = xla::LiteralUtil::Get<int>(indices[input_num], {i});
+        int index = indices[input_num].Get<int>({i});
         src_input_vector[index] = input_num;
         src_slice_vector[index] = i;
         if (!src_index_used[index]) {
diff --git a/tensorflow/compiler/tf2xla/kernels/elu_op.cc b/tensorflow/compiler/tf2xla/kernels/elu_op.cc
index 62a5e1bd421a75fb0a8fa6eacd58e4aaa2f02236..2fd27c5ca7e87c8b387d9d0854b787d30e7f7b6f 100644
--- a/tensorflow/compiler/tf2xla/kernels/elu_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/elu_op.cc
@@ -61,5 +61,49 @@ class EluGradOp : public XlaOpKernel {
 REGISTER_XLA_OP(Name("Elu"), EluOp);
 REGISTER_XLA_OP(Name("EluGrad"), EluGradOp);
 
+class SeluOp : public XlaOpKernel {
+ public:
+  explicit SeluOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  // Computes the max of the scalar input x and 0.
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* b = ctx->builder();
+    const auto zero = XlaHelpers::Zero(b, input_type(0));
+    const auto one = XlaHelpers::One(b, input_type(0));
+    const auto scale = XlaHelpers::FloatLiteral(b, input_type(0),
+            1.0507009873554804934193349852946);
+    const auto scale_alpha = XlaHelpers::FloatLiteral(b, input_type(0),
+            1.7580993408473768599402175208123);
+    const auto pred = b->Gt(ctx->Input(0), zero);
+    const auto expm1 = b->Sub(b->Exp(ctx->Input(0)), one);
+    ctx->SetOutput(0, b->Select(pred, b->Mul(scale, ctx->Input(0)),
+                                      b->Mul(scale_alpha, expm1)));
+  }
+};
+
+class SeluGradOp : public XlaOpKernel {
+ public:
+  explicit SeluGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  // Return the lhs (incoming gradient) if the rhs (input feature) > 0,
+  // otherwise return lhs * (1 + rhs).
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* b = ctx->builder();
+    const auto zero = XlaHelpers::Zero(b, input_type(0));
+    const auto one = XlaHelpers::One(b, input_type(0));
+    const auto scale = XlaHelpers::FloatLiteral(b, input_type(0),
+            1.0507009873554804934193349852946);
+    const auto scale_alpha = XlaHelpers::FloatLiteral(b, input_type(0),
+            1.7580993408473768599402175208123);
+    const auto grad = ctx->Input(0);
+    const auto activation = ctx->Input(1);
+    const auto lin_grad = b->Mul(grad, scale);
+    const auto exp_grad = b->Mul(grad, b->Add(activation, scale_alpha));
+    const auto pred = b->Gt(activation, zero);
+    ctx->SetOutput(0, b->Select(pred, lin_grad, exp_grad));
+  }
+};
+
+REGISTER_XLA_OP(Name("Selu"), SeluOp);
+REGISTER_XLA_OP(Name("SeluGrad"), SeluGradOp);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/fill_op.cc b/tensorflow/compiler/tf2xla/kernels/fill_op.cc
index 1e1d2a1b4b3fa281adc96b76ade5ce7b07b2b41c..9e090fe01cbfd4dab81b0de21e3a44e42c2ef18e 100644
--- a/tensorflow/compiler/tf2xla/kernels/fill_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fill_op.cc
@@ -52,7 +52,7 @@ class FillOp : public XlaOpKernel {
     std::vector<int64> broadcast;
     broadcast.reserve(dims_literal.shape().dimensions(0));
     for (int i = 0; i < dims_literal.shape().dimensions(0); ++i) {
-      broadcast.push_back(xla::LiteralUtil::Get<int>(dims_literal, {i}));
+      broadcast.push_back(dims_literal.Get<int>({i}));
     }
     // Look up the value input, reshaping to a scalar if it was a
     // 'legacy' scalar (secretly a vector).
diff --git a/tensorflow/compiler/tf2xla/kernels/function_ops.cc b/tensorflow/compiler/tf2xla/kernels/function_ops.cc
index 8dacb6627bde516c92cb07b747207adbe85ada5b..af1085d5b35077b7ebd144bfb2473485e3b3de6b 100644
--- a/tensorflow/compiler/tf2xla/kernels/function_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/function_ops.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index 49eadaf9d1f0ff1dbfa2321f20f9f833a0d4eb9a..184b5119f83d35e91d76685701c61fe712ac91ed 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -29,6 +29,7 @@ class GatherOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape params_shape = ctx->InputShape(0);
+    const auto params_dims = params_shape.dims();
     const TensorShape indices_shape = ctx->InputShape(1);
     OP_REQUIRES(
         ctx, TensorShapeUtils::IsVectorOrHigher(params_shape),
@@ -38,20 +39,51 @@ class GatherOp : public XlaOpKernel {
     OP_REQUIRES(ctx, index_type == DT_INT32 || index_type == DT_INT64,
                 errors::InvalidArgument("index must be int32 or int64"));
 
+    // GatherV2 added an axis argument. We support both Gather and GatherV2 in
+    // this kernel by defaulting axis to 0 if there are 2 inputs.
+    int64 axis = 0;
+    if (ctx->num_inputs() == 3) {
+      const TensorShape axis_shape = ctx->InputShape(2);
+      OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(axis_shape),
+                  errors::InvalidArgument("axis must be scalar"));
+      DataType axis_type = input_type(2);
+      OP_REQUIRES(ctx, axis_type == DT_INT32 || axis_type == DT_INT64,
+                  errors::InvalidArgument("axis must be int32 or int64"));
+
+      xla::Literal literal;
+      OP_REQUIRES_OK(ctx, ctx->ConstantInput(2, &literal));
+      int64 axis_input = axis_type == DT_INT32 ? literal.Get<int32>({})
+                                               : literal.Get<int64>({});
+      axis = axis_input < 0 ? axis_input + params_dims : axis_input;
+      OP_REQUIRES(ctx, 0 <= axis && axis < params_dims,
+                  errors::InvalidArgument("Expected axis in the range [",
+                                          -params_dims, ", ", params_dims,
+                                          "), but got ", axis_input));
+    }
+
     // Check that we have enough index space.
     const int64 limit = index_type == DT_INT32
                             ? std::numeric_limits<int32>::max()
                             : std::numeric_limits<int64>::max();
-    OP_REQUIRES(
-        ctx, params_shape.dim_size(0) <= limit,
-        errors::InvalidArgument("params.shape[0] too large for ",
-                                DataTypeString(index_type), " indexing: ",
-                                params_shape.dim_size(0), " > ", limit));
-
-    // The result shape is indices.shape + params.shape[1:].
-    TensorShape result_shape = indices_shape;
-    for (int i = 1; i < params_shape.dims(); i++) {
+    OP_REQUIRES(ctx, params_shape.dim_size(axis) <= limit,
+                errors::InvalidArgument(
+                    "params.shape[", axis, "] too large for ",
+                    DataTypeString(index_type),
+                    " indexing: ", params_shape.dim_size(axis), " > ", limit));
+
+    // The result shape is params.shape[0:axis] + indices.shape +
+    // params.shape[axis + 1:].
+    TensorShape result_shape;
+    int64 outer_size = 1;
+    int64 inner_size = 1;
+    for (int i = 0; i < axis; i++) {
+      result_shape.AddDim(params_shape.dim_size(i));
+      outer_size *= params_shape.dim_size(i);
+    }
+    result_shape.AppendShape(indices_shape);
+    for (int i = axis + 1; i < params_dims; i++) {
       result_shape.AddDim(params_shape.dim_size(i));
+      inner_size *= params_shape.dim_size(i);
     }
 
     XlaContext& tc = XlaContext::Get(ctx);
@@ -66,11 +98,13 @@ class GatherOp : public XlaOpKernel {
     std::vector<xla::ComputationDataHandle> args;
     args.push_back(tc.GetOrCreateRuntimeContextParameter());
     args.push_back(b.ConstantLiteral(
-        *xla::LiteralUtil::CreateR0<int64>(indices_shape.num_elements())));
+        *xla::Literal::CreateR0<int64>(indices_shape.num_elements())));
+    args.push_back(
+        b.ConstantLiteral(*xla::Literal::CreateR0<int64>(outer_size)));
     args.push_back(b.ConstantLiteral(
-        *xla::LiteralUtil::CreateR0<int64>(params_shape.dim_size(0))));
-    args.push_back(b.ConstantLiteral(*xla::LiteralUtil::CreateR0<int64>(
-        params_shape.num_elements() / params_shape.dim_size(0))));
+        *xla::Literal::CreateR0<int64>(params_shape.dim_size(axis))));
+    args.push_back(
+        b.ConstantLiteral(*xla::Literal::CreateR0<int64>(inner_size)));
     args.push_back(ctx->Input(0));
     args.push_back(ctx->Input(1));
 
@@ -97,6 +131,10 @@ REGISTER_XLA_OP(Name("Gather")
                     .TypeConstraint("Tparams", DT_FLOAT)
                     .Device(DEVICE_CPU_XLA_JIT),
                 GatherOp);
+REGISTER_XLA_OP(Name("GatherV2")
+                    .TypeConstraint("Tparams", DT_FLOAT)
+                    .Device(DEVICE_CPU_XLA_JIT),
+                GatherOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc b/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc
index 691a0b972d5c09ad632d706d72a1b60988730986..33b1b087d00d8263cd80f7d5d879401e4ed6c0fb 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc
@@ -26,28 +26,31 @@ namespace tensorflow {
 
 EIGEN_STRONG_INLINE void gather_float_int32_xla_impl(float* out, void** data) {
   // data is managed by the JIT code so msan can't tell it's initialized.
-  TF_ANNOTATE_MEMORY_IS_INITIALIZED(data, 6 * sizeof(void*));
+  TF_ANNOTATE_MEMORY_IS_INITIALIZED(data, 7 * sizeof(void*));
 
   int64 indices_size = *static_cast<int64*>(data[1]);
   int64 params_x = *static_cast<int64*>(data[2]);
   int64 params_y = *static_cast<int64*>(data[3]);
+  int64 params_z = *static_cast<int64*>(data[4]);
 
-  float* in = static_cast<float*>(data[4]);
+  float* in = static_cast<float*>(data[5]);
 
-  int32* indices = static_cast<int32*>(data[5]);
-  Eigen::DSizes<Eigen::DenseIndex, 2> in_eig_sizes;
+  int32* indices = static_cast<int32*>(data[6]);
+  Eigen::DSizes<Eigen::DenseIndex, 3> in_eig_sizes;
   in_eig_sizes[0] = params_x;
   in_eig_sizes[1] = params_y;
-  tensorflow::TTypes<float, 2>::ConstMatrix in_eig(in, in_eig_sizes);
+  in_eig_sizes[2] = params_z;
+  tensorflow::TTypes<float, 3>::ConstTensor in_eig(in, in_eig_sizes);
 
   Eigen::DSizes<Eigen::DenseIndex, 1> indices_eig_sizes;
   indices_eig_sizes[0] = indices_size;
   tensorflow::TTypes<int32>::ConstFlat indices_eig(indices, indices_eig_sizes);
 
-  Eigen::DSizes<Eigen::DenseIndex, 2> out_eig_sizes;
-  out_eig_sizes[0] = indices_size;
-  out_eig_sizes[1] = params_y;
-  tensorflow::TTypes<float>::Matrix out_eig(out, out_eig_sizes);
+  Eigen::DSizes<Eigen::DenseIndex, 3> out_eig_sizes;
+  out_eig_sizes[0] = params_x;
+  out_eig_sizes[1] = indices_size;
+  out_eig_sizes[2] = params_z;
+  tensorflow::TTypes<float, 3>::Tensor out_eig(out, out_eig_sizes);
 
   tensorflow::functor::GatherFunctorCPU<float, int32> f;
   const int64 bad_i = f(in_eig, indices_eig, out_eig);
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc b/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc
index 3dff6e2737bf1af7f5d646928e740fa895692a03..5e2d872ce0b28ab479c73ed1fea5f32804c21e22 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc
@@ -26,28 +26,31 @@ namespace tensorflow {
 
 EIGEN_STRONG_INLINE void gather_float_int64_xla_impl(float* out, void** data) {
   // data is managed by the JIT code so msan can't tell it's initialized.
-  TF_ANNOTATE_MEMORY_IS_INITIALIZED(data, 6 * sizeof(void*));
+  TF_ANNOTATE_MEMORY_IS_INITIALIZED(data, 7 * sizeof(void*));
 
   int64 indices_size = *static_cast<int64*>(data[1]);
   int64 params_x = *static_cast<int64*>(data[2]);
   int64 params_y = *static_cast<int64*>(data[3]);
+  int64 params_z = *static_cast<int64*>(data[4]);
 
-  float* in = static_cast<float*>(data[4]);
+  float* in = static_cast<float*>(data[5]);
 
-  int64* indices = static_cast<int64*>(data[5]);
-  Eigen::DSizes<Eigen::DenseIndex, 2> in_eig_sizes;
+  int64* indices = static_cast<int64*>(data[6]);
+  Eigen::DSizes<Eigen::DenseIndex, 3> in_eig_sizes;
   in_eig_sizes[0] = params_x;
   in_eig_sizes[1] = params_y;
-  tensorflow::TTypes<float, 2>::ConstMatrix in_eig(in, in_eig_sizes);
+  in_eig_sizes[2] = params_z;
+  tensorflow::TTypes<float, 3>::ConstTensor in_eig(in, in_eig_sizes);
 
   Eigen::DSizes<Eigen::DenseIndex, 1> indices_eig_sizes;
   indices_eig_sizes[0] = indices_size;
   tensorflow::TTypes<int64>::ConstFlat indices_eig(indices, indices_eig_sizes);
 
-  Eigen::DSizes<Eigen::DenseIndex, 2> out_eig_sizes;
-  out_eig_sizes[0] = indices_size;
-  out_eig_sizes[1] = params_y;
-  tensorflow::TTypes<float>::Matrix out_eig(out, out_eig_sizes);
+  Eigen::DSizes<Eigen::DenseIndex, 3> out_eig_sizes;
+  out_eig_sizes[0] = params_x;
+  out_eig_sizes[1] = indices_size;
+  out_eig_sizes[2] = params_z;
+  tensorflow::TTypes<float, 3>::Tensor out_eig(out, out_eig_sizes);
 
   tensorflow::functor::GatherFunctorCPU<float, int64> f;
   const int64 bad_i = f(in_eig, indices_eig, out_eig);
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops.cc b/tensorflow/compiler/tf2xla/kernels/index_ops.cc
index df002dddd043c6795481436586a31c74b20d33d1..6be66cf66ec19cad33858f36a3239048efce9de3 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops.cc
@@ -69,7 +69,7 @@ class ArgMaxOp : public XlaOpKernel {
     // XLA op would have the same requirement.
     xla::Literal literal;
     OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &literal));
-    const int32 dim = xla::LiteralUtil::Get<int32>(literal, {});
+    const int32 dim = literal.Get<int32>({});
     OP_REQUIRES(ctx, dim >= 0, errors::InvalidArgument("dim must be >= 0"));
     OP_REQUIRES(
         ctx, dim < input_shape.dims(),
@@ -97,14 +97,13 @@ class ArgMaxOp : public XlaOpKernel {
     std::vector<xla::ComputationDataHandle> args;
     args.push_back(ctx->Input(0));
     args.push_back(b.ConstantLiteral(
-        *xla::LiteralUtil::CreateR1<int64>(input_shape.dim_sizes())));
+        *xla::Literal::CreateR1<int64>(input_shape.dim_sizes())));
     if (input_shape.dims() > 1) {
       // Don't bother passing the output shape and dim for the 1d case, since
       // the shape is always a scalar and the dim is always 0.
       args.push_back(b.ConstantLiteral(
-          *xla::LiteralUtil::CreateR1<int64>(output_shape.dim_sizes())));
-      args.push_back(
-          b.ConstantLiteral(*xla::LiteralUtil::CreateR0<int32>(dim)));
+          *xla::Literal::CreateR1<int64>(output_shape.dim_sizes())));
+      args.push_back(b.ConstantLiteral(*xla::Literal::CreateR0<int32>(dim)));
     }
 
     xla::Shape xla_shape =
diff --git a/tensorflow/compiler/tf2xla/kernels/no_op.cc b/tensorflow/compiler/tf2xla/kernels/no_op.cc
index b8f0c0b9fe6087a7719a689628ca4738cc13aab9..8c8a9bbe787f3224e7444b62dcf8ad99130cf37f 100644
--- a/tensorflow/compiler/tf2xla/kernels/no_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/no_op.cc
@@ -23,4 +23,9 @@ namespace tensorflow {
 // dummy operator using CompilationOnly().
 REGISTER_XLA_OP(Name("NoOp").CompilationOnly(), NoOp);
 
+// We register ControlTrigger as a no-op. This is correct since nodes seen
+// by the XLA compiler are never dead. This may need rethinking when we add
+// support for conditionals to XLA.
+REGISTER_XLA_OP(Name("ControlTrigger"), NoOp);
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/pad_op.cc b/tensorflow/compiler/tf2xla/kernels/pad_op.cc
index 22476f4a0c51930cabf146313347e5e3bd2eaebe..d841bd37b33c31dbc156fa824ff62a58169a99cb 100644
--- a/tensorflow/compiler/tf2xla/kernels/pad_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pad_op.cc
@@ -60,8 +60,8 @@ class PadOp : public XlaOpKernel {
     xla::PaddingConfig config;
     for (int i = 0; i < fixed_dims; ++i) {
       auto* dim = config.add_dimensions();
-      int before = xla::LiteralUtil::Get<int32>(pad_literal, {i, 0});
-      int after = xla::LiteralUtil::Get<int32>(pad_literal, {i, 1});
+      int before = pad_literal.Get<int32>({i, 0});
+      int after = pad_literal.Get<int32>({i, 1});
       OP_REQUIRES(ctx, before >= 0 && after >= 0,
                   errors::InvalidArgument("Paddings must be non-negative: ",
                                           before, " ", after));
@@ -69,12 +69,22 @@ class PadOp : public XlaOpKernel {
       dim->set_edge_padding_high(after);
     }
 
-    auto zero = XlaHelpers::Zero(ctx->builder(), input_type(0));
-    ctx->SetOutput(0, ctx->builder()->Pad(ctx->Input(0), zero, config));
+    // PadV2 added a "constant_values" input that indicates the pad value.
+    xla::ComputationDataHandle constant_values;
+    if (ctx->num_inputs() == 3) {
+      OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ctx->InputShape(2)),
+                  errors::InvalidArgument("constant_values must be a scalar."));
+      ctx->SetOutput(0,
+                     ctx->builder()->Pad(ctx->Input(0), ctx->Input(2), config));
+    } else {
+      auto zero = XlaHelpers::Zero(ctx->builder(), input_type(0));
+      ctx->SetOutput(0, ctx->builder()->Pad(ctx->Input(0), zero, config));
+    }
   }
 };
 
 REGISTER_XLA_OP(Name("Pad"), PadOp);
+REGISTER_XLA_OP(Name("PadV2"), PadOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
index 518a9372c4fa3f195ff7c77e8ef0de1ba0a8807b..dae2eb9d2a92ef8d4eabb8d6f9a79758c42d446d 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
@@ -63,7 +63,7 @@ class MinOp : public XlaReductionOp {
       xla::ComputationBuilder* builder) override {
     xla::PrimitiveType type;
     TF_CHECK_OK(DataTypeToPrimitiveType(input_type(0), &type));
-    return builder->ConstantLiteral(xla::LiteralUtil::MaxValue(type));
+    return builder->ConstantLiteral(xla::Literal::MaxValue(type));
   }
 
   void BuildReducer(xla::ComputationBuilder* builder,
@@ -83,7 +83,7 @@ class MaxOp : public XlaReductionOp {
       xla::ComputationBuilder* builder) override {
     xla::PrimitiveType type;
     TF_CHECK_OK(DataTypeToPrimitiveType(input_type(0), &type));
-    return builder->ConstantLiteral(xla::LiteralUtil::MinValue(type));
+    return builder->ConstantLiteral(xla::Literal::MinValue(type));
   }
 
   void BuildReducer(xla::ComputationBuilder* builder,
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
index 8798c80ad5354c76a9b4061ad8913b76ae0629b0..4b5d09eb9fd4110cdc4221099ff55767e9132540 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
@@ -66,13 +66,13 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
                      1, {axes_tensor_shape.num_elements()}, &axes_literal));
 
   VLOG(1) << "data shape: " << data_shape.DebugString();
-  VLOG(1) << "axes      : " << xla::LiteralUtil::ToString(axes_literal);
+  VLOG(1) << "axes      : " << axes_literal.ToString();
 
   gtl::InlinedVector<bool, 4> bitmap(data_shape.dims(), false);
   std::vector<int64> xla_axes;
   int64 num_elements_reduced = 1LL;
   for (int64 i = 0; i < axes_tensor_shape.num_elements(); ++i) {
-    int32 index = xla::LiteralUtil::Get<int>(axes_literal, {i});
+    int32 index = axes_literal.Get<int>({i});
     OP_REQUIRES(ctx,
                 !(index < -data_shape.dims() || index >= data_shape.dims()),
                 errors::InvalidArgument("Invalid reduction dimension (", index,
diff --git a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
index df542350b443b765a1ab35be9632cf61a38be49c..5952e752724d1e6953dd4dbb6a8099b847c64d08 100644
--- a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
@@ -50,7 +50,7 @@ class ReshapeOp : public XlaOpKernel {
     int64 product = 1;
     int unknown_index = -1;
     for (int d = 0; d < num_dims; ++d) {
-      const int32 size = xla::LiteralUtil::Get<int>(literal, {d});
+      const int32 size = literal.Get<int>({d});
       if (size == -1) {
         OP_REQUIRES(
             ctx, unknown_index == -1,
diff --git a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6a0ce775dc69e1b87041bad31b13cdaff676e20f
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
@@ -0,0 +1,155 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <sstream>
+#include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace tensorflow {
+namespace {
+
+class UnsortedSegmentSum : public XlaOpKernel {
+ public:
+  explicit UnsortedSegmentSum(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    // output = unsorted_segment_sum(data, indices, num_segments)
+    // Compute a tensor such that:
+    //    output[i] = sum over {j where indices[j] == i} of data[j]
+    //    output[i] == 0 if i does not appear in indices
+    //
+    // Contrast with segment_sum(), which assumes indices are sorted and that
+    // max(indices)+1 is the desired size of the output.
+    //
+    // The returned output tensor has the same type as data, and the same shape
+    // as data with the first indices.rank dimensions are replaced
+    // by a single dimension with size num_segments.
+
+    xla::ComputationBuilder* builder = ctx->builder();
+
+    auto data = ctx->Input(0);
+    auto data_shape = ctx->InputShape(0);
+
+    auto indices = ctx->Input(1);
+    auto indices_shape = ctx->InputShape(1);
+
+    OP_REQUIRES(ctx, data_shape.dims() >= indices_shape.dims(),
+                errors::InvalidArgument(
+                    "UnsortedSegmentSum requires that indices' rank be"
+                    " less than or equal to data's rank."));
+    // Validate that indices.shape is a prefix of data.shape.
+    for (int d = 0; d < indices_shape.dims(); ++d) {
+      OP_REQUIRES(ctx, (data_shape.dim_size(d) == indices_shape.dim_size(d)),
+                  errors::InvalidArgument(
+                      "UnsortedSegmentSum requires indices shape to be prefix"
+                      " of data_shape, but dimension ",
+                      d, " differs ", data_shape.dim_size(d), " vs. ",
+                      indices_shape.dim_size(d)));
+    }
+
+    int64 num_segments;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(2, &num_segments));
+
+    // Flatten the indices into 1-D.
+    auto indices_1d = builder->Reshape(indices, {indices_shape.num_elements()});
+
+    // flatten data for dynamic indexing.
+    int64 out_tensor_dims = data_shape.dims() - indices_shape.dims();
+    std::vector<int64> flat_shape(1 + out_tensor_dims);
+    flat_shape[0] = indices_shape.num_elements();
+    for (int64 k = 0; k < out_tensor_dims; ++k) {
+      flat_shape[1 + k] = data_shape.dim_size(indices_shape.dims() + k);
+    }
+    auto data_flat = builder->Reshape(data, flat_shape);
+
+    // output shape; same as data_shape, but dimension 0 is num_segments.
+    std::vector<int64> out_shape(flat_shape);
+    out_shape[0] = num_segments;
+
+    // Pad the output array dims to rank >= 3 to work around lowering issues.
+    // TODO(b/37575001) This is awkward, and could be improved.
+    int64 extra_dims = 0;
+    if (out_shape.size() < 3) {
+      extra_dims = 3u - out_shape.size();
+    }
+    std::vector<int64> rshape(extra_dims + out_shape.size(), 1);
+    for (unsigned k = 0; k < out_shape.size(); ++k) {
+      rshape[extra_dims + k] = out_shape[k];
+    }
+    auto output = builder->Broadcast(XlaHelpers::Zero(builder, dtype_), rshape);
+
+    auto zero = builder->ConstantR1<int32>({0});
+
+    for (int64 i = 0; i < indices_shape.num_elements(); ++i) {
+      // output[indices[i]] += data[i]
+
+      std::vector<int64> data_start_indices(flat_shape.size());
+      data_start_indices[0] = i;
+      for (unsigned d = 1; d < flat_shape.size(); ++d) {
+        data_start_indices[d] = 0;
+      }
+      std::vector<int64> data_limit_indices(flat_shape);
+      data_limit_indices[0] = i + 1;
+      std::vector<int64> stride(flat_shape.size(), 1);
+
+      auto data_slice = builder->Slice(data_flat, data_start_indices,
+                                       data_limit_indices, stride);
+
+      // Reshape the sliced data into the R3+ shape to match output array.
+      std::vector<int64> rdata_shape(extra_dims + flat_shape.size());
+      for (int64 k = 0; k <= extra_dims; ++k) {
+        rdata_shape[k] = 1;
+      }
+      for (unsigned k = 1; k < data_limit_indices.size(); ++k) {
+        rdata_shape[extra_dims + k] = data_limit_indices[k];
+      }
+      auto rdata_slice = builder->Reshape(data_slice, rdata_shape);
+
+      auto index = builder->Slice(indices_1d, {i}, {i + 1}, {1});
+
+      // Construct the index into the R3+ output array 0, ..., <index>, 0, ...
+      std::vector<xla::ComputationDataHandle> out_start_index_parts(
+          extra_dims + flat_shape.size(), zero);
+      out_start_index_parts[extra_dims] = builder->Reshape(index, {1});
+      auto out_start_indices = builder->ConcatInDim(out_start_index_parts, 0);
+
+      std::vector<int64> slice_size(rshape);
+      slice_size[extra_dims] = 1;
+
+      auto out_slice =
+          builder->DynamicSlice(output, out_start_indices, slice_size);
+      auto sumval = builder->Add(out_slice, rdata_slice);
+      output = builder->DynamicUpdateSlice(output, sumval, out_start_indices);
+    }
+    auto reshaped_output = builder->Reshape(output, out_shape);
+    ctx->SetOutput(0, reshaped_output);
+  }
+
+ private:
+  DataType dtype_;
+};
+
+REGISTER_XLA_OP(Name("UnsortedSegmentSum"), UnsortedSegmentSum);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
index 5b6fa64fa825894b5d7bf938c5892d30f4fc11b0..c2b0e1bb4c1a141d0ab3f5b3ff5397d9da620bd8 100644
--- a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
@@ -32,7 +32,7 @@ template <typename T>
 Status GetValue(int index, XlaOpKernelContext* ctx, T* value) {
   xla::Literal literal;
   TF_RETURN_IF_ERROR(ctx->ConstantInput(index, &literal));
-  *value = xla::LiteralUtil::Get<T>(literal, {});
+  *value = literal.Get<T>({});
   return Status::OK();
 }
 
@@ -41,10 +41,10 @@ Status GetIntValue(int index, XlaOpKernelContext* ctx, int64* value) {
   TF_RETURN_IF_ERROR(ctx->ConstantInput(index, &literal));
   switch (literal.shape().element_type()) {
     case xla::S32:
-      *value = xla::LiteralUtil::Get<int32>(literal, {});
+      *value = literal.Get<int32>({});
       break;
     case xla::S64:
-      *value = xla::LiteralUtil::Get<int64>(literal, {});
+      *value = literal.Get<int64>({});
       break;
     default:
       return errors::InvalidArgument("Invalid argument type for argument",
@@ -58,9 +58,9 @@ template <typename T>
 Status CreateRangeTensor(const xla::Literal& start_literal,
                          const xla::Literal& limit_literal,
                          const xla::Literal& delta_literal, Tensor* output) {
-  T start = xla::LiteralUtil::Get<T>(start_literal, {});
-  T limit = xla::LiteralUtil::Get<T>(limit_literal, {});
-  T delta = xla::LiteralUtil::Get<T>(delta_literal, {});
+  T start = start_literal.Get<T>({});
+  T limit = limit_literal.Get<T>({});
+  T delta = delta_literal.Get<T>({});
 
   if (delta == 0) {
     return errors::InvalidArgument("Requires delta != 0: ", delta);
diff --git a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
index f15b354cb26d390352d866a8e827970f7c8b0c7f..83a87f19a718ce86a105e3c33ab9eaf0faff3a76 100644
--- a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
@@ -56,8 +56,8 @@ void SpaceToBatch(XlaOpKernelContext* ctx,
   padding_config.add_dimensions();  // Don't pad the batch dimension.
   for (int i = 0; i < block_rank; ++i) {
     auto* dim = padding_config.add_dimensions();
-    int64 pad_start = xla::LiteralUtil::Get<int64>(paddings, {i, 0});
-    int64 pad_end = xla::LiteralUtil::Get<int64>(paddings, {i, 1});
+    int64 pad_start = paddings.Get<int64>({i, 0});
+    int64 pad_end = paddings.Get<int64>({i, 1});
     OP_REQUIRES(ctx, pad_start >= 0 && pad_end >= 0,
                 errors::InvalidArgument("Paddings must be non-negative"));
     dim->set_edge_padding_low(pad_start);
diff --git a/tensorflow/compiler/tf2xla/kernels/split_op.cc b/tensorflow/compiler/tf2xla/kernels/split_op.cc
index 42bde90042218b3a36f50e32d4f458d31c82d5da..44ee81461e5b31f15594c0dfb86f7219f9875768 100644
--- a/tensorflow/compiler/tf2xla/kernels/split_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/split_op.cc
@@ -39,7 +39,7 @@ class SplitOp : public XlaOpKernel {
 
     int32 split_dim;
     if (index_shape.dims() == 0) {
-      split_dim = xla::LiteralUtil::Get<int>(literal_index, {});
+      split_dim = literal_index.Get<int>({});
     } else {
       OP_REQUIRES(
           ctx, index_shape.dims() == 1,
@@ -49,7 +49,7 @@ class SplitOp : public XlaOpKernel {
           ctx, index_shape.dim_size(0) == 1,
           errors::InvalidArgument("split_index input to Split Op must be a "
                                   "scalar or a vector with 1 element"));
-      split_dim = xla::LiteralUtil::Get<int>(literal_index, {0});
+      split_dim = literal_index.Get<int>({0});
     }
     const int32 num_split = num_outputs();
     const TensorShape input_shape = ctx->InputShape(1);
@@ -115,7 +115,7 @@ class SplitVOp : public XlaOpKernel {
     OP_REQUIRES(ctx, index_shape.dims() == 0,
                 errors::InvalidArgument("split_dim input to Split Op must be a "
                                         "scalar"));
-    split_dim = xla::LiteralUtil::Get<int>(literal_index, {});
+    split_dim = literal_index.Get<int>({});
 
     xla::ComputationDataHandle input = ctx->Input(0);
     const TensorShape input_shape = ctx->InputShape(0);
@@ -152,7 +152,7 @@ class SplitVOp : public XlaOpKernel {
 
     for (int i = 0; i < num_split; ++i) {
       int slice_size;
-      slice_size = xla::LiteralUtil::Get<int>(split_size_literal, {i});
+      slice_size = split_size_literal.Get<int>({i});
       if (slice_size == -1) {
         OP_REQUIRES(
             ctx, neg_one_dim == -1,
diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5d1394c280383b7e9b9be39da4ed028e15a005fd
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
@@ -0,0 +1,250 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// XLA Stack operators.
+
+#include <limits>
+#include <vector>
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace {
+
+Status GetStackShape(xla::ComputationBuilder* builder, XlaResource* resource,
+                     TensorShape* stack_shape) {
+  auto shape_or_status = builder->GetShape(resource->value);
+  if (!shape_or_status.ok()) {
+    return shape_or_status.status();
+  }
+  xla::Shape shape = *shape_or_status.ValueOrDie();
+  TF_RET_CHECK(xla::ShapeUtil::IsTuple(shape));
+  return XLAShapeToTensorShape(xla::ShapeUtil::GetTupleElementShape(shape, 0),
+                               stack_shape);
+}
+
+// Since the element shape is not provided to the Stack operator,
+// we lazily initialize the Stack at the time of the first write.
+//
+// If a Stack `resource` has not been initialized, constructs storage for the
+// Stack with elements of `elem_shape`. For both initialized and
+// uninitialized Stacks, checks that the tensor has a type compatible with
+// 'dtype' and shape compatible with 'elem_shape'.
+//
+// TODO(phawkins): consider changing the API of the stack operators to
+// allow an optional element shape at stack construction time.
+Status MaybeInitializeStack(xla::ComputationBuilder* builder,
+                            XlaResource* resource, DataType dtype,
+                            const TensorShape& elem_shape) {
+  if (resource->type != dtype) {
+    return errors::InvalidArgument(
+        "Stack dtype is ", DataTypeString(resource->type), " but op has dtype ",
+        DataTypeString(dtype), ".");
+  }
+
+  TensorShape stack_shape;
+  stack_shape.AddDim(resource->tensor_array_size);
+  stack_shape.AppendShape(elem_shape);
+
+  if (resource->value.handle() == 0) {
+    // Stack has not been initialized.
+    xla::ComputationDataHandle zero = XlaHelpers::Zero(builder, resource->type);
+    resource->value =
+        builder->Tuple({builder->Broadcast(zero, stack_shape.dim_sizes()),
+                        builder->ConstantR0<int32>(0)});
+  } else {
+    // Checks the expected shape matches the actual shape.
+    TensorShape actual_shape;
+    TF_RETURN_IF_ERROR(GetStackShape(builder, resource, &actual_shape));
+    if (stack_shape != actual_shape) {
+      return errors::InvalidArgument(
+          "Mismatched Stack shapes: ", stack_shape.DebugString(), " vs ",
+          actual_shape.DebugString());
+    }
+  }
+  return Status::OK();
+}
+
+class StackOp : public XlaOpKernel {
+ public:
+  explicit StackOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("elem_type", &dtype_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("stack_name", &stack_name_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    int64 size;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(0, &size));
+    OP_REQUIRES(
+        ctx, size >= 0,
+        errors::InvalidArgument(
+            "XLA compilation requires a fixed stack size upper bound."));
+
+    // We defer initializing the Stack resource until we see the first push.
+    // Otherwise we do not know the shape of the stack elements.
+    xla::ComputationDataHandle value;
+    XlaContext& xc = XlaContext::Get(ctx);
+    XlaResource* resource;
+    string name = strings::StrCat("Stack: ", stack_name_);
+    OP_REQUIRES_OK(
+        ctx, xc.CreateResource(XlaResource::kStack, -1, std::move(name), dtype_,
+                               value, &resource));
+    resource->tensor_array_size = size;
+    ctx->SetResourceOutput(0, resource);
+  }
+
+ private:
+  DataType dtype_;
+  string stack_name_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(StackOp);
+};
+
+REGISTER_XLA_OP(Name("StackV2"), StackOp);
+
+class StackPushOp : public XlaOpKernel {
+ public:
+  explicit StackPushOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* b = ctx->builder();
+    TensorShape elem_shape = ctx->InputShape(1);
+
+    XlaResource* resource;
+    OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource));
+
+    // Initializes the Stack, if the element shape was not already known.
+    OP_REQUIRES_OK(ctx, MaybeInitializeStack(b, resource, dtype_, elem_shape));
+
+    xla::ComputationDataHandle ta = b->GetTupleElement(resource->value, 0);
+    xla::ComputationDataHandle index = b->GetTupleElement(resource->value, 1);
+    xla::ComputationDataHandle value = ctx->Input(1);
+
+    // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
+    auto start_indices = XlaHelpers::PadWithZeros(b, index, elem_shape.dims());
+
+    TensorShape slice_shape = elem_shape;
+    slice_shape.InsertDim(0, 1LL);
+    auto update = b->Reshape(value, slice_shape.dim_sizes());
+
+    // TODO(phawkins): We don't check the index is in bounds --- there is no
+    // error mechanism in XLA.
+    resource->value =
+        b->Tuple({b->DynamicUpdateSlice(ta, update, start_indices),
+                  b->Add(index, b->ConstantR0<int32>(1))});
+
+    ctx->SetOutput(0, value);
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(StackPushOp);
+};
+
+REGISTER_XLA_OP(Name("StackPushV2"), StackPushOp);
+
+class StackPopOp : public XlaOpKernel {
+ public:
+  explicit StackPopOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("elem_type", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* b = ctx->builder();
+
+    XlaResource* resource;
+    OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource));
+
+    OP_REQUIRES(ctx, resource->type == dtype_,
+                errors::InvalidArgument(
+                    "Stack dtype is ", DataTypeString(resource->type),
+                    " but Op requested dtype ", DataTypeString(dtype_), "."));
+
+    // There is a somewhat subtle issue here: here "uninitialized" means we have
+    // not yet seen a pop in the order that we compile operators, not the order
+    // that we run them. However, in practice the two orders should be the same
+    // for the sole user of the stack operators (loop gradients).
+    OP_REQUIRES(ctx, resource->value.handle() != 0,
+                errors::InvalidArgument("Stack pop on uninitialized stack"));
+
+    TensorShape stack_shape;
+    OP_REQUIRES_OK(ctx, GetStackShape(b, resource, &stack_shape));
+
+    xla::ComputationDataHandle state = resource->value;
+    xla::ComputationDataHandle ta = b->GetTupleElement(state, 0);
+    xla::ComputationDataHandle index = b->GetTupleElement(state, 1);
+
+    index = b->Sub(index, b->ConstantR0<int32>(1));
+    resource->value = b->Tuple({ta, index});
+
+    // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
+    auto start_indices =
+        XlaHelpers::PadWithZeros(b, index, stack_shape.dims() - 1);
+
+    auto slice_shape = stack_shape.dim_sizes();
+    slice_shape[0] = 1LL;
+
+    // TODO(phawkins): We don't check the index is in bounds --- there is no
+    // error mechanism in XLA.
+    xla::ComputationDataHandle read =
+        b->DynamicSlice(ta, start_indices, slice_shape);
+
+    // Remove the leading '1' dimension.
+    std::vector<int64> value_shape(slice_shape.begin() + 1, slice_shape.end());
+    ctx->SetOutput(0, b->Reshape(read, value_shape));
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(StackPopOp);
+};
+
+REGISTER_XLA_OP(Name("StackPopV2"), StackPopOp);
+
+class StackCloseOp : public XlaOpKernel {
+ public:
+  explicit StackCloseOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    // Do nothing.
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(StackCloseOp);
+};
+
+REGISTER_XLA_OP(Name("StackCloseV2"), StackCloseOp);
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
index 9eb689983105eff05555bbe454f97149eb8f14a2..6af4bd0496e0da926726e3f74376281f539e925a 100644
--- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
@@ -63,17 +63,13 @@ class StridedSliceOp : public XlaOpKernel {
                                             &strides_tensor));
 
     TensorShape dummy_processing_shape;
-    ShapeReadWriteFromTensorShape wrapped_final_shape(&final_shape);
-    ShapeReadWriteFromTensorShape wrapped_dummy_processing_shape(
-        &dummy_processing_shape);
     bool dummy = false;
-    OP_REQUIRES_OK(
-        ctx, ValidateStridedSliceOp(
-                 &begin_tensor, &end_tensor, strides_tensor,
-                 ShapeReadWriteFromTensorShape(&input_shape), begin_mask_,
-                 end_mask_, ellipsis_mask_, new_axis_mask_, shrink_axis_mask_,
-                 &wrapped_dummy_processing_shape, &wrapped_final_shape, &dummy,
-                 &dummy, &dummy, &begin, &end, &strides));
+    OP_REQUIRES_OK(ctx,
+                   ValidateStridedSliceOp(
+                       &begin_tensor, &end_tensor, strides_tensor, input_shape,
+                       begin_mask_, end_mask_, ellipsis_mask_, new_axis_mask_,
+                       shrink_axis_mask_, &dummy_processing_shape, &final_shape,
+                       &dummy, &dummy, &dummy, &begin, &end, &strides));
 
     gtl::InlinedVector<int64, 4> dimensions_to_reverse;
     gtl::InlinedVector<int64, 4> slice_begin, slice_end, slice_strides;
@@ -146,14 +142,11 @@ class StridedSliceGradOp : public XlaOpKernel {
                                             &strides_tensor));
 
     bool dummy = false;
-    ShapeReadWriteFromTensorShape wrapped_final_shape(&final_shape);
-    ShapeReadWriteFromTensorShape wrapped_processing_shape(&processing_shape);
     OP_REQUIRES_OK(
         ctx, ValidateStridedSliceOp(
-                 &begin_tensor, &end_tensor, strides_tensor,
-                 ShapeReadWriteFromTensorShape(&input_shape), begin_mask_,
-                 end_mask_, ellipsis_mask_, new_axis_mask_, shrink_axis_mask_,
-                 &wrapped_processing_shape, &wrapped_final_shape, &dummy,
+                 &begin_tensor, &end_tensor, strides_tensor, input_shape,
+                 begin_mask_, end_mask_, ellipsis_mask_, new_axis_mask_,
+                 shrink_axis_mask_, &processing_shape, &final_shape, &dummy,
                  &dummy, &dummy, &begin, &end, &strides));
 
     // Check to make sure dy is consistent with the original slice
@@ -257,17 +250,13 @@ class StridedSliceAssignOp : public XlaOpKernel {
     const TensorShape rhs_shape = ctx->InputShape(4);
 
     TensorShape dummy_processing_shape;
-    ShapeReadWriteFromTensorShape wrapped_final_shape(&final_shape);
-    ShapeReadWriteFromTensorShape wrapped_dummy_processing_shape(
-        &dummy_processing_shape);
     bool dummy = false;
-    OP_REQUIRES_OK(
-        ctx, ValidateStridedSliceOp(
-                 &begin_tensor, &end_tensor, strides_tensor,
-                 ShapeReadWriteFromTensorShape(&lhs_shape), begin_mask_,
-                 end_mask_, ellipsis_mask_, new_axis_mask_, shrink_axis_mask_,
-                 &wrapped_dummy_processing_shape, &wrapped_final_shape, &dummy,
-                 &dummy, &dummy, &begin, &end, &strides));
+    OP_REQUIRES_OK(ctx,
+                   ValidateStridedSliceOp(
+                       &begin_tensor, &end_tensor, strides_tensor, lhs_shape,
+                       begin_mask_, end_mask_, ellipsis_mask_, new_axis_mask_,
+                       shrink_axis_mask_, &dummy_processing_shape, &final_shape,
+                       &dummy, &dummy, &dummy, &begin, &end, &strides));
 
     if (final_shape.num_elements() == 0 && rhs_shape.num_elements() == 0) {
       // DynamicUpdateSlice does not allow 0-element updates. We should probably
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
index deee7dd44dbf80f83ded3f09819365f7b6c1c7bd..34cc8b23159a0c20166c28d21911d4f3e7a43693 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
@@ -41,36 +41,42 @@ namespace {
 // Since the element shape is not always provided to the TensorArrayV3 operator,
 // we must support lazily initialization of the TensorArray at the time of the
 // first write.
-// If a TensorArray `var` has not been initialized, constructs storage for the
-// TensorArray with elements of `elem_shape`. For both initialized and
+// If a TensorArray `resource` has not been initialized, constructs storage for
+// the TensorArray with elements of `elem_shape`. For both initialized and
 // uninitialized TensorArrays, checks that the tensor has a type compatible with
 // 'dtype' and shape compatible with 'elem_shape'.
 Status MaybeInitializeTensorArray(xla::ComputationBuilder* builder,
-                                  XlaVariable* var, DataType dtype,
+                                  XlaResource* resource, DataType dtype,
                                   const TensorShape& elem_shape) {
-  if (var->type != dtype) {
+  if (resource->kind != XlaResource::kTensorArray) {
+    return errors::InvalidArgument("Unexpected non-TensorArray resource");
+  }
+
+  if (resource->type != dtype) {
     return errors::InvalidArgument(
-        "TensorArray dtype is ", DataTypeString(var->type),
+        "TensorArray dtype is ", DataTypeString(resource->type),
         " but op has dtype ", DataTypeString(dtype), ".");
   }
 
-  TF_RET_CHECK(var->tensor_array_size >= 0)
-      << var->name << " size " << var->tensor_array_size;
+  TF_RET_CHECK(resource->tensor_array_size >= 0)
+      << resource->name << " size " << resource->tensor_array_size;
   TensorShape ta_shape;
-  ta_shape.AddDim(var->tensor_array_size);
+  ta_shape.AddDim(resource->tensor_array_size);
   ta_shape.AppendShape(elem_shape);
 
-  if (var->value.handle() == 0) {
+  if (resource->value.handle() == 0) {
     // TensorArray has not been initialized.
-    xla::ComputationDataHandle zero = XlaHelpers::Zero(builder, var->type);
-    var->value = builder->Broadcast(zero, ta_shape.dim_sizes());
+    xla::ComputationDataHandle zero = XlaHelpers::Zero(builder, resource->type);
+    resource->value = builder->Broadcast(zero, ta_shape.dim_sizes());
   } else {
     // Checks the elem_shape matches the TensorArray shape.
-    auto shape_or_status = builder->GetShape(var->value);
+    auto shape_or_status = builder->GetShape(resource->value);
     if (!shape_or_status.ok()) {
       return shape_or_status.status();
     }
-    TensorShape shape = XLAShapeToTensorShape(*shape_or_status.ValueOrDie());
+    TensorShape shape;
+    TF_RETURN_IF_ERROR(
+        XLAShapeToTensorShape(*shape_or_status.ValueOrDie(), &shape));
     if (ta_shape != shape) {
       return errors::InvalidArgument(
           "Mismatched TensorArray sizes: ", ta_shape.DebugString(), " vs ",
@@ -80,14 +86,43 @@ Status MaybeInitializeTensorArray(xla::ComputationBuilder* builder,
   return Status::OK();
 }
 
-// Pads 'x' with 'count' zero indices. 'x' must have 1 element.
-xla::ComputationDataHandle PadIndexWithZeros(
-    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
-    int count) {
-  xla::ComputationDataHandle zero = builder->ConstantR1<int32>({0});
-  std::vector<xla::ComputationDataHandle> xs(count + 1, zero);
-  xs[0] = builder->Reshape(x, {1});
-  return builder->ConcatInDim(xs, 0);
+// Checks that the TensorArray 'resource' has been initialized, and has type
+// 'dtype'. Sets 'shape' to the shape
+Status CheckTensorArrayIsInitialized(const string& op_name,
+                                     const XlaResource* resource,
+                                     DataType dtype) {
+  if (resource->kind != XlaResource::kTensorArray) {
+    return errors::InvalidArgument(
+        "Unexpected non-TensorArray resource passed "
+        "to ",
+        op_name);
+  }
+  if (resource->value.handle() == 0) {
+    return errors::InvalidArgument("Uninitialized TensorArray passed to ",
+                                   op_name);
+  }
+  if (resource->type != dtype) {
+    return errors::InvalidArgument(
+        "TensorArray dtype is ", DataTypeString(resource->type),
+        " but op has dtype ", DataTypeString(dtype), ".");
+  }
+
+  return Status::OK();
+}
+
+Status GetTensorArrayShape(const XlaResource* resource,
+                           xla::ComputationBuilder* builder,
+                           TensorShape* shape) {
+  auto shape_or_status = builder->GetShape(resource->value);
+  if (!shape_or_status.ok()) {
+    return shape_or_status.status();
+  }
+  TF_RETURN_IF_ERROR(
+      XLAShapeToTensorShape(*shape_or_status.ValueOrDie(), shape));
+  if (shape->dims() < 1) {
+    return errors::InvalidArgument("TensorArray rank must be >= 1");
+  }
+  return Status::OK();
 }
 
 // Like ComputationBuilder::DynamicUpdateSlice, but adds 'update' to the
@@ -125,7 +160,6 @@ class TensorArrayOp : public XlaOpKernel {
                 errors::InvalidArgument("TensorArray size must be >= 0"));
 
     xla::ComputationBuilder* b = ctx->builder();
-    b->set_die_immediately_on_error(true);
 
     // Initializes the TensorArray value if we know the element shape.
     // Otherwise, defer initialization to the first write.
@@ -141,13 +175,17 @@ class TensorArrayOp : public XlaOpKernel {
     }
 
     XlaContext& xc = XlaContext::Get(ctx);
-    XlaVariable* var;
+    XlaResource* var;
     string name = strings::StrCat("TensorArray: ", tensor_array_name_);
-    OP_REQUIRES_OK(ctx,
-                   xc.CreateVariable(-1, std::move(name), dtype_, value, &var));
+    OP_REQUIRES_OK(
+        ctx, xc.CreateResource(XlaResource::kTensorArray, -1, std::move(name),
+                               dtype_, value, &var));
     var->tensor_array_size = size;
-    ctx->SetVariableOutput(0, var);
-    ctx->SetConstantOutput(1, Tensor(DT_FLOAT));
+    ctx->SetResourceOutput(0, var);
+
+    Tensor flow(DT_FLOAT, TensorShape({}));
+    flow.scalar<float>()() = 0.0f;
+    ctx->SetConstantOutput(1, flow);
   }
 
  private:
@@ -173,16 +211,18 @@ class TensorArrayWriteOp : public XlaOpKernel {
 
     // Initializes the TensorArray, if the element shape was not known at
     // construction time.
-    XlaVariable* var;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableInput(0, &var));
-    OP_REQUIRES_OK(ctx, MaybeInitializeTensorArray(b, var, dtype_, elem_shape));
+    XlaResource* resource;
+    OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource));
+    OP_REQUIRES_OK(ctx,
+                   MaybeInitializeTensorArray(b, resource, dtype_, elem_shape));
 
-    xla::ComputationDataHandle ta = var->value;
+    xla::ComputationDataHandle ta = resource->value;
     xla::ComputationDataHandle index = ctx->Input(1);
     xla::ComputationDataHandle value = ctx->Input(2);
+    xla::ComputationDataHandle flow = ctx->Input(3);
 
     // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
-    auto start_indices = PadIndexWithZeros(b, index, elem_shape.dims());
+    auto start_indices = XlaHelpers::PadWithZeros(b, index, elem_shape.dims());
 
     TensorShape slice_shape = elem_shape;
     slice_shape.InsertDim(0, 1LL);
@@ -191,8 +231,8 @@ class TensorArrayWriteOp : public XlaOpKernel {
     xla::ComputationDataHandle written =
         DynamicAddSlice(b, ta, update, slice_shape.dim_sizes(), start_indices);
 
-    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, written));
-    ctx->SetConstantOutput(0, Tensor(DT_FLOAT));
+    resource->value = written;
+    ctx->SetOutput(0, flow);
   }
 
  private:
@@ -210,24 +250,22 @@ class TensorArrayReadOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    DataType ta_type;
-    TensorShape ta_shape;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &ta_type, &ta_shape));
-    OP_REQUIRES(ctx, ta_type == dtype_,
-                errors::InvalidArgument(
-                    "TensorArray dtype is ", DataTypeString(ta_type),
-                    " but Op requested dtype ", DataTypeString(dtype_), "."));
-    OP_REQUIRES(ctx, ta_shape.dims() >= 1,
-                errors::InvalidArgument("TensorArray rank must be >= 1"));
-
     xla::ComputationBuilder* b = ctx->builder();
 
-    xla::ComputationDataHandle ta;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &ta));
+    XlaResource* resource;
+    OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource));
+
+    OP_REQUIRES_OK(ctx,
+                   CheckTensorArrayIsInitialized(name(), resource, dtype_));
+    TensorShape ta_shape;
+    OP_REQUIRES_OK(ctx, GetTensorArrayShape(resource, b, &ta_shape));
+
+    xla::ComputationDataHandle ta = resource->value;
     xla::ComputationDataHandle index = ctx->Input(1);
 
     // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
-    auto start_indices = PadIndexWithZeros(b, index, ta_shape.dims() - 1);
+    auto start_indices =
+        XlaHelpers::PadWithZeros(b, index, ta_shape.dims() - 1);
 
     auto slice_shape = ta_shape.dim_sizes();
     slice_shape[0] = 1LL;
@@ -255,24 +293,23 @@ class TensorArrayGatherOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    DataType ta_type;
+    xla::ComputationBuilder* b = ctx->builder();
+
+    XlaResource* resource;
+    OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource));
+
+    OP_REQUIRES_OK(ctx,
+                   CheckTensorArrayIsInitialized(name(), resource, dtype_));
     TensorShape ta_shape;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &ta_type, &ta_shape));
-    OP_REQUIRES(ctx, ta_type == dtype_,
-                errors::InvalidArgument("TensorArray type mismatch"));
-    OP_REQUIRES(ctx, ta_shape.dims() >= 1,
-                errors::InvalidArgument("TensorArray rank must be >= 1"));
+    OP_REQUIRES_OK(ctx, GetTensorArrayShape(resource, b, &ta_shape));
 
     const TensorShape indices_shape = ctx->InputShape(1);
-    OP_REQUIRES(ctx, indices_shape.dims() >= 1,
+    OP_REQUIRES(ctx, indices_shape.dims() == 1,
                 errors::InvalidArgument("indices must be rank 1"));
     const int num_indices = indices_shape.dim_size(0);
     auto indices = ctx->Input(1);
 
-    xla::ComputationBuilder* b = ctx->builder();
-
-    xla::ComputationDataHandle ta;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &ta));
+    xla::ComputationDataHandle ta = resource->value;
 
     // For each index in `indices`, add the corresponding slice to `slices`.
     std::vector<xla::ComputationDataHandle> slices(num_indices);
@@ -282,7 +319,8 @@ class TensorArrayGatherOp : public XlaOpKernel {
       auto index = b->Slice(indices, {i}, {i + 1}, {1});
 
       // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
-      auto start_indices = PadIndexWithZeros(b, index, ta_shape.dims() - 1);
+      auto start_indices =
+          XlaHelpers::PadWithZeros(b, index, ta_shape.dims() - 1);
 
       auto slice_shape = ta_shape.dim_sizes();
       slice_shape[0] = 1LL;
@@ -320,11 +358,12 @@ class TensorArrayScatterOp : public XlaOpKernel {
 
     const TensorShape value_shape = ctx->InputShape(2);
 
-    XlaVariable* var;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableInput(0, &var));
+    XlaResource* resource;
+    OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource));
     TensorShape elem_shape = value_shape;
     elem_shape.RemoveDim(0);
-    OP_REQUIRES_OK(ctx, MaybeInitializeTensorArray(b, var, dtype_, elem_shape));
+    OP_REQUIRES_OK(ctx,
+                   MaybeInitializeTensorArray(b, resource, dtype_, elem_shape));
 
     const TensorShape indices_shape = ctx->InputShape(1);
     OP_REQUIRES(ctx, indices_shape.dims() >= 1,
@@ -332,8 +371,9 @@ class TensorArrayScatterOp : public XlaOpKernel {
     const int num_indices = indices_shape.dim_size(0);
     const xla::ComputationDataHandle indices = ctx->Input(1);
 
-    xla::ComputationDataHandle ta = var->value;
+    xla::ComputationDataHandle ta = resource->value;
     const xla::ComputationDataHandle value = ctx->Input(2);
+    const xla::ComputationDataHandle flow = ctx->Input(3);
 
     auto slice_dims = value_shape.dim_sizes();
     slice_dims[0] = 1LL;
@@ -353,12 +393,13 @@ class TensorArrayScatterOp : public XlaOpKernel {
 
       // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
       auto index = b->Slice(indices, {i}, {i + 1}, {1});
-      auto start_indices = PadIndexWithZeros(b, index, elem_shape.dims());
+      auto start_indices =
+          XlaHelpers::PadWithZeros(b, index, elem_shape.dims());
       ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices);
     }
 
-    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, ta));
-    ctx->SetConstantOutput(0, Tensor(DT_FLOAT));
+    resource->value = ta;
+    ctx->SetOutput(0, flow);
   }
 
  private:
@@ -376,18 +417,17 @@ class TensorArrayConcatOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    DataType ta_type;
-    TensorShape ta_shape;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &ta_type, &ta_shape));
-    OP_REQUIRES(ctx, ta_type == dtype_,
-                errors::InvalidArgument("TensorArray type mismatch"));
-    OP_REQUIRES(ctx, ta_shape.dims() >= 1,
-                errors::InvalidArgument("TensorArray rank must be >= 1"));
-
     xla::ComputationBuilder* b = ctx->builder();
 
-    xla::ComputationDataHandle ta;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &ta));
+    XlaResource* resource;
+    OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource));
+
+    OP_REQUIRES_OK(ctx,
+                   CheckTensorArrayIsInitialized(name(), resource, dtype_));
+    TensorShape ta_shape;
+    OP_REQUIRES_OK(ctx, GetTensorArrayShape(resource, b, &ta_shape));
+
+    xla::ComputationDataHandle ta = resource->value;
 
     auto ta_dims = ta_shape.dim_sizes();
     std::vector<int64> shape(ta_dims.begin() + 1, ta_dims.end());
@@ -438,31 +478,32 @@ class TensorArraySplitOp : public XlaOpKernel {
     elem_shape.set_dim(0, length);
 
     xla::ComputationBuilder* b = ctx->builder();
-    XlaVariable* var;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableInput(0, &var));
-    OP_REQUIRES_OK(ctx, MaybeInitializeTensorArray(b, var, dtype_, elem_shape));
-    xla::ComputationDataHandle ta = var->value;
+    XlaResource* resource;
+    OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource));
+    OP_REQUIRES_OK(ctx,
+                   MaybeInitializeTensorArray(b, resource, dtype_, elem_shape));
+    xla::ComputationDataHandle ta = resource->value;
 
     TensorShape ta_shape;
-    ta_shape.AddDim(var->tensor_array_size);
+    ta_shape.AddDim(resource->tensor_array_size);
     ta_shape.AppendShape(elem_shape);
 
-    OP_REQUIRES(ctx, lengths.size() == var->tensor_array_size,
+    OP_REQUIRES(ctx, lengths.size() == resource->tensor_array_size,
                 errors::InvalidArgument(
                     "TensorArray's size is not equal to the size of lengths (",
-                    lengths.size(), " vs. ", var->tensor_array_size, ")"));
+                    lengths.size(), " vs. ", resource->tensor_array_size, ")"));
 
     const xla::ComputationDataHandle value = ctx->Input(1);
+    const xla::ComputationDataHandle flow = ctx->Input(3);
 
     OP_REQUIRES(ctx, value_shape.num_elements() == ta_shape.num_elements(),
                 errors::InvalidArgument("mismatched element count ",
                                         value_shape.DebugString(), " vs. ",
                                         ta_shape.DebugString()));
 
-    ta = b->Add(ta, b->Reshape(value, ta_shape.dim_sizes()));
-    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, ta));
+    resource->value = b->Add(ta, b->Reshape(value, ta_shape.dim_sizes()));
 
-    ctx->SetConstantOutput(0, Tensor(DT_FLOAT));
+    ctx->SetOutput(0, flow);
   }
 
  private:
@@ -478,8 +519,8 @@ class TensorArraySizeOp : public XlaOpKernel {
   explicit TensorArraySizeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    XlaVariable* var;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableInput(0, &var));
+    XlaResource* var;
+    OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &var));
     Tensor size_tensor(DT_INT32, {});
     size_tensor.scalar<int32>()() = static_cast<int32>(var->tensor_array_size);
     ctx->SetConstantOutput(0, size_tensor);
@@ -500,31 +541,31 @@ class TensorArrayGradOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     xla::ComputationBuilder* b = ctx->builder();
 
-    XlaVariable* var;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableInput(0, &var));
+    XlaResource* resource;
+    OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource));
 
-    DataType ta_type;
+    OP_REQUIRES_OK(
+        ctx, CheckTensorArrayIsInitialized(name(), resource, resource->type));
     TensorShape ta_shape;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &ta_type, &ta_shape));
-    OP_REQUIRES(ctx, ta_shape.dims() >= 1,
-                errors::InvalidArgument("TensorArray rank must be >= 1"));
+    OP_REQUIRES_OK(ctx, GetTensorArrayShape(resource, b, &ta_shape));
 
     // Finds or looks up the corresponding gradient TensorArray, which stores
     // gradients computed during backpropagation.
-    XlaVariable*& gradient = var->tensor_array_gradient[source_];
+    XlaResource*& gradient = resource->tensor_array_gradient[source_];
     if (!gradient) {
-      xla::ComputationDataHandle zero = XlaHelpers::Zero(b, ta_type);
+      xla::ComputationDataHandle zero = XlaHelpers::Zero(b, resource->type);
       xla::ComputationDataHandle value =
           b->Broadcast(zero, ta_shape.dim_sizes());
 
       XlaContext& xc = XlaContext::Get(ctx);
-      string name = strings::StrCat("TensorArrayGrad: ", var->name);
-      OP_REQUIRES_OK(ctx, xc.CreateVariable(-1, std::move(name), var->type,
-                                            value, &gradient));
-      gradient->tensor_array_size = var->tensor_array_size;
+      string name = strings::StrCat("TensorArrayGrad: ", resource->name);
+      OP_REQUIRES_OK(
+          ctx, xc.CreateResource(XlaResource::kTensorArray, -1, std::move(name),
+                                 resource->type, value, &gradient));
+      gradient->tensor_array_size = resource->tensor_array_size;
     }
 
-    ctx->SetVariableOutput(0, gradient);
+    ctx->SetResourceOutput(0, gradient);
     ctx->SetConstantOutput(1, Tensor(DT_FLOAT));
   }
 
@@ -536,5 +577,19 @@ class TensorArrayGradOp : public XlaOpKernel {
 
 REGISTER_XLA_OP(Name("TensorArrayGradV3"), TensorArrayGradOp);
 
+class TensorArrayCloseOp : public XlaOpKernel {
+ public:
+  explicit TensorArrayCloseOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    // Do nothing; XLA handles resource management.
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayCloseOp);
+};
+
+REGISTER_XLA_OP(Name("TensorArrayCloseV3"), TensorArrayCloseOp);
+
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
index 4cc2eb8f877a873593f0460346e3379e851e8e08..9ee6bd892504e683a191484fb09259619759f36d 100644
--- a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
@@ -68,7 +68,7 @@ class TileOp : public XlaOpKernel {
     bool all_multiples_are_one = true;
     bool one_dimension_is_broadcasted_without_multiple = true;
     for (int i = 0; i < input_dims; ++i) {
-      int multiple = xla::LiteralUtil::Get<int>(literal, {i});
+      int multiple = literal.Get<int>({i});
       OP_REQUIRES(ctx, multiple,
                   errors::InvalidArgument("Expected multiples[", i,
                                           "] >= 0, but got ", multiple));
diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
index e9ac1ee91b8e86a7154f42b8c51dcbb5c8a32a83..a2ecbca124c28574560afea17e13889506869e36 100644
--- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
@@ -352,9 +352,9 @@ class ResourceApplyRMSProp : public XlaOpKernel {
                b->Sub(XlaHelpers::FloatLiteral(b, type, 1.0), rho)));
     xla::ComputationDataHandle new_mom =
         b->Add(b->Mul(mom, momentum),
-               b->Div(b->Mul(grad, lr),
+               b->Mul(b->Mul(grad, lr),
                       b->Pow(b->Add(new_ms, epsilon),
-                             XlaHelpers::FloatLiteral(b, type, 0.5))));
+                             XlaHelpers::FloatLiteral(b, type, -0.5))));
     xla::ComputationDataHandle new_var = b->Sub(var, new_mom);
 
     OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, new_var));
@@ -364,112 +364,160 @@ class ResourceApplyRMSProp : public XlaOpKernel {
 };
 REGISTER_XLA_OP(Name("ResourceApplyRMSProp"), ResourceApplyRMSProp);
 
-class ResourceApplyFtrl : public XlaOpKernel {
- public:
-  explicit ResourceApplyFtrl(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
+void CompileFtrl(XlaOpKernelContext* ctx, DataType dtype,
+                 bool has_l2_shrinkage) {
+  xla::ComputationBuilder* b = ctx->builder();
+
+  DataType var_type, accum_type, linear_type;
+  TensorShape var_shape, accum_shape, linear_shape;
+  OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape));
+  OP_REQUIRES_OK(ctx,
+                 ctx->GetVariableTypeAndShape(1, &accum_type, &accum_shape));
+  OP_REQUIRES_OK(ctx,
+                 ctx->GetVariableTypeAndShape(2, &linear_type, &linear_shape));
+
+  OP_REQUIRES(
+      ctx, dtype == var_type && dtype == accum_type && dtype == linear_type,
+      errors::InvalidArgument(
+          "Types of variable arguments to ResourceApplyFtrlV2 must match: ",
+          DataTypeString(dtype), " vs. ", DataTypeString(var_type), " and ",
+          DataTypeString(accum_type), " and ", DataTypeString(linear_type)));
+
+  OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape),
+              errors::InvalidArgument(
+                  "var and accum do not have the same shape",
+                  var_shape.DebugString(), " ", accum_shape.DebugString()));
+
+  OP_REQUIRES(ctx, var_shape.IsSameSize(linear_shape),
+              errors::InvalidArgument(
+                  "var and linear do not have the same shape",
+                  var_shape.DebugString(), " ", linear_shape.DebugString()));
+
+  TensorShape grad_shape = ctx->InputShape(3);
+  TensorShape lr_shape = ctx->InputShape(4);
+  TensorShape l1_shape = ctx->InputShape(5);
+  TensorShape l2_shape = ctx->InputShape(6);
+  TensorShape l2_shrinkage_shape;
+  TensorShape lr_power_shape;
+  if (has_l2_shrinkage) {
+    l2_shrinkage_shape = ctx->InputShape(7);
+    lr_power_shape = ctx->InputShape(8);
+  } else {
+    lr_power_shape = ctx->InputShape(7);
   }
 
-  void Compile(XlaOpKernelContext* ctx) override {
-    xla::ComputationBuilder* b = ctx->builder();
+  OP_REQUIRES(ctx, var_shape.IsSameSize(grad_shape),
+              errors::InvalidArgument("var and grad do not have the same shape",
+                                      var_shape.DebugString(), " ",
+                                      grad_shape.DebugString()));
 
-    DataType var_type, accum_type, linear_type;
-    TensorShape var_shape, accum_shape, linear_shape;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape));
-    OP_REQUIRES_OK(ctx,
-                   ctx->GetVariableTypeAndShape(1, &accum_type, &accum_shape));
-    OP_REQUIRES_OK(
-        ctx, ctx->GetVariableTypeAndShape(2, &linear_type, &linear_shape));
+  OP_REQUIRES(
+      ctx, TensorShapeUtils::IsScalar(lr_shape),
+      errors::InvalidArgument("lr is not a scalar: ", lr_shape.DebugString()));
 
-    OP_REQUIRES(
-        ctx,
-        dtype_ == var_type && dtype_ == accum_type && dtype_ == linear_type,
-        errors::InvalidArgument(
-            "Types of variable arguments to ResourceApplyFtrl must match: ",
-            DataTypeString(dtype_), " vs. ", DataTypeString(var_type), " and ",
-            DataTypeString(accum_type), " and ", DataTypeString(linear_type)));
+  OP_REQUIRES(
+      ctx, TensorShapeUtils::IsScalar(l1_shape),
+      errors::InvalidArgument("l1 is not a scalar: ", l1_shape.DebugString()));
 
-    OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape),
-                errors::InvalidArgument(
-                    "var and accum do not have the same shape",
-                    var_shape.DebugString(), " ", accum_shape.DebugString()));
-
-    OP_REQUIRES(ctx, var_shape.IsSameSize(linear_shape),
-                errors::InvalidArgument(
-                    "var and linear do not have the same shape",
-                    var_shape.DebugString(), " ", linear_shape.DebugString()));
+  OP_REQUIRES(
+      ctx, TensorShapeUtils::IsScalar(l2_shape),
+      errors::InvalidArgument("l2 is not a scalar: ", l2_shape.DebugString()));
 
-    TensorShape grad_shape = ctx->InputShape(3);
-    TensorShape lr_shape = ctx->InputShape(4);
-    TensorShape l1_shape = ctx->InputShape(5);
-    TensorShape l2_shape = ctx->InputShape(6);
-    TensorShape lr_power_shape = ctx->InputShape(7);
+  if (has_l2_shrinkage) {
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(l2_shrinkage_shape),
+                errors::InvalidArgument("l2_shrinkage is not a scalar: ",
+                                        l2_shrinkage_shape.DebugString()));
+  }
 
-    OP_REQUIRES(ctx, var_shape.IsSameSize(grad_shape),
-                errors::InvalidArgument(
-                    "var and grad do not have the same shape",
-                    var_shape.DebugString(), " ", grad_shape.DebugString()));
+  OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_power_shape),
+              errors::InvalidArgument("lr_power is not a scalar: ",
+                                      lr_power_shape.DebugString()));
+
+  xla::ComputationDataHandle var, accum, linear;
+  OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var));
+  OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &accum));
+  OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &linear));
+  xla::ComputationDataHandle grad = ctx->Input(3);
+  xla::ComputationDataHandle lr = ctx->Input(4);
+  xla::ComputationDataHandle l1 = ctx->Input(5);
+  xla::ComputationDataHandle l2 = ctx->Input(6);
+  xla::ComputationDataHandle l2_shrinkage;
+  xla::ComputationDataHandle lr_power;
+  if (has_l2_shrinkage) {
+    l2_shrinkage = ctx->Input(7);
+    lr_power = ctx->Input(8);
+  } else {
+    lr_power = ctx->Input(7);
+  }
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape),
-                errors::InvalidArgument("lr is not a scalar: ",
-                                        lr_shape.DebugString()));
+  // grad_to_use = grad + 2 * l2_shrinkage * var
+  // new_accum = accum + grad_to_use * grad_to_use
+  // linear += grad_to_use -
+  //     (new_accum^(-lr_power) - accum^(-lr_power)) / lr * var
+  // quadratic = (new_accum^(-lr_power) / lr) + 2 * l2
+  // var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+  // accum = new_accum
+
+  xla::ComputationDataHandle zero_broadcast = b->Broadcast(
+      XlaHelpers::FloatLiteral(b, dtype, 0.0), var_shape.dim_sizes());
+  xla::ComputationDataHandle two = XlaHelpers::FloatLiteral(b, dtype, 2.0);
+  xla::ComputationDataHandle grad_to_use;
+  if (has_l2_shrinkage) {
+    grad_to_use = b->Add(grad, b->Mul(two, b->Mul(l2_shrinkage, var)));
+  } else {
+    grad_to_use = grad;
+  }
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(l1_shape),
-                errors::InvalidArgument("l1 is not a scalar: ",
-                                        l1_shape.DebugString()));
+  xla::ComputationDataHandle new_accum =
+      b->Add(accum, b->Pow(grad_to_use, two));
+  xla::ComputationDataHandle new_accum_lr_pow =
+      b->Pow(new_accum, b->Neg(lr_power));
+  xla::ComputationDataHandle accum_lr_pow = b->Pow(accum, b->Neg(lr_power));
+  linear = b->Add(
+      linear,
+      b->Sub(grad_to_use,
+             b->Mul(b->Div(b->Sub(new_accum_lr_pow, accum_lr_pow), lr), var)));
+  xla::ComputationDataHandle quadratic =
+      b->Add(b->Div(new_accum_lr_pow, lr), b->Mul(two, l2));
+  xla::ComputationDataHandle pre_shrink =
+      b->Div(b->Sub(b->Mul(l1, b->Sign(linear)), linear), quadratic);
+  var = b->Select(b->Gt(b->Abs(linear), l1), pre_shrink, zero_broadcast);
+  accum = new_accum;
+
+  OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype, var));
+  OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, dtype, accum));
+  OP_REQUIRES_OK(ctx, ctx->AssignVariable(2, dtype, linear));
+}
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(l2_shape),
-                errors::InvalidArgument("l2 is not a scalar: ",
-                                        l2_shape.DebugString()));
+class ResourceApplyFtrl : public XlaOpKernel {
+ public:
+  explicit ResourceApplyFtrl(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
+  }
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_power_shape),
-                errors::InvalidArgument("lr_power is not a scalar: ",
-                                        lr_power_shape.DebugString()));
+  void Compile(XlaOpKernelContext* ctx) override {
+    CompileFtrl(ctx, dtype_, /*has_l2_shrinkage=*/false);
+  }
 
-    xla::ComputationDataHandle var, accum, linear;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var));
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &accum));
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &linear));
-    xla::ComputationDataHandle grad = ctx->Input(3);
-    xla::ComputationDataHandle lr = ctx->Input(4);
-    xla::ComputationDataHandle l1 = ctx->Input(5);
-    xla::ComputationDataHandle l2 = ctx->Input(6);
-    xla::ComputationDataHandle lr_power = ctx->Input(7);
-
-    // new_accum = accum + grad * grad
-    // linear += grad - (new_accum^(-lr_power) - accum^(-lr_power)) / lr * var
-    // quadratic = (new_accum^(-lr_power) / lr) + 2 * l2
-    // var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-    // accum = new_accum
-
-    xla::ComputationDataHandle zero_broadcast = b->Broadcast(
-        XlaHelpers::FloatLiteral(b, dtype_, 0.0), var_shape.dim_sizes());
-    xla::ComputationDataHandle two = XlaHelpers::FloatLiteral(b, dtype_, 2.0);
+ private:
+  DataType dtype_;
+};
+REGISTER_XLA_OP(Name("ResourceApplyFtrl"), ResourceApplyFtrl);
 
-    xla::ComputationDataHandle new_accum = b->Add(accum, b->Pow(grad, two));
-    xla::ComputationDataHandle new_accum_lr_pow =
-        b->Pow(new_accum, b->Neg(lr_power));
-    xla::ComputationDataHandle accum_lr_pow = b->Pow(accum, b->Neg(lr_power));
-    linear = b->Add(
-        linear,
-        b->Sub(grad, b->Mul(b->Div(b->Sub(new_accum_lr_pow, accum_lr_pow), lr),
-                            var)));
-    xla::ComputationDataHandle quadratic =
-        b->Add(b->Div(new_accum_lr_pow, lr), b->Mul(two, l2));
-    xla::ComputationDataHandle pre_shrink =
-        b->Div(b->Sub(b->Mul(l1, b->Sign(linear)), linear), quadratic);
-    var = b->Select(b->Gt(b->Abs(linear), l1), pre_shrink, zero_broadcast);
-    accum = new_accum;
+class ResourceApplyFtrlV2 : public XlaOpKernel {
+ public:
+  explicit ResourceApplyFtrlV2(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
+  }
 
-    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, var));
-    OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, dtype_, accum));
-    OP_REQUIRES_OK(ctx, ctx->AssignVariable(2, dtype_, linear));
+  void Compile(XlaOpKernelContext* ctx) override {
+    CompileFtrl(ctx, dtype_, /*has_l2_shrinkage=*/true);
   }
 
  private:
   DataType dtype_;
 };
-REGISTER_XLA_OP(Name("ResourceApplyFtrl"), ResourceApplyFtrl);
+REGISTER_XLA_OP(Name("ResourceApplyFtrlV2"), ResourceApplyFtrlV2);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
index abe4949f5dbc8034fa46828e3ff872cae7591d90..626ddd17d394d4a2e1c014c3a280949a415dce94 100644
--- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
@@ -44,6 +44,8 @@ namespace {
 // Return x if x>0, otherwise -x.
 XLAJIT_MAKE_UNARY(Abs, b->Abs(x));
 XLAJIT_MAKE_UNARY(Ceil, b->Ceil(x));
+XLAJIT_MAKE_UNARY(Cos, b->Cos(x));
+XLAJIT_MAKE_UNARY(Sin, b->Sin(x));
 XLAJIT_MAKE_UNARY(Exp, b->Exp(x));
 XLAJIT_MAKE_UNARY(Floor, b->Floor(x));
 // Returns 0 if x is 0, -1 if x < 0 and 1 if x > 0.
@@ -77,12 +79,19 @@ static xla::ComputationDataHandle Round(xla::ComputationBuilder* b,
                                 b->LogicalAnd(b->Eq(fraction, half), is_odd)),
                    b->Add(round_val, one), round_val);
 }
-XLAJIT_MAKE_UNARY(Round, Round(b, input_type(0), x));
 
+// Expresses sigmoid as a rescaled tanh: sigmoid(x) == (tanh(x/2) + 1) / 2.
+static xla::ComputationDataHandle Sigmoid(xla::ComputationBuilder* b,
+                                          DataType dtype,
+                                          const xla::ComputationDataHandle& x) {
+  auto half = XlaHelpers::FloatLiteral(b, dtype, 0.5);
+  return b->Add(half, b->Mul(half, b->Tanh(b->Mul(half, x))));
+}
+
+XLAJIT_MAKE_UNARY(Round, Round(b, input_type(0), x));
 XLAJIT_MAKE_UNARY(Rsqrt,
                   b->Pow(x, XlaHelpers::FloatLiteral(b, input_type(0), -0.5)));
-XLAJIT_MAKE_UNARY(Sigmoid,
-                  b->Map({x}, *ctx->GetOrCreateSigmoid(input_type(0))));
+XLAJIT_MAKE_UNARY(Sigmoid, Sigmoid(b, input_type(0), x));
 XLAJIT_MAKE_UNARY(Softplus,
                   b->Log(b->Add(b->Exp(x), XlaHelpers::One(b, input_type(0)))));
 XLAJIT_MAKE_UNARY(Sqrt,
diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
index 1b04b8b802c5c6e9da337933a7c4cd99233ebe8d..0eea81b308bbee26cc607bcb95ffbf2d3f6abe0f 100644
--- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
@@ -86,5 +87,69 @@ REGISTER_XLA_OP(
     Name("AssignSubVariableOp").TypeConstraint("dtype", kNumericTypes),
     AssignSubVariableOp);
 
+class ResourceGatherOp : public XlaOpKernel {
+ public:
+  explicit ResourceGatherOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* builder = ctx->builder();
+
+    // Get the shape of the resource tensor.
+    TensorShape resource_shape;
+    DataType resource_dtype;
+    OP_REQUIRES_OK(
+        ctx, ctx->GetVariableTypeAndShape(0, &resource_dtype, &resource_shape));
+
+    DataType expected_output_dtype = ctx->expected_output_dtype(0);
+    OP_REQUIRES(ctx, resource_dtype == expected_output_dtype,
+                errors::InvalidArgument(
+                    "Variable dtype is ", DataTypeString(resource_dtype),
+                    " but expected output dtype is ",
+                    DataTypeString(expected_output_dtype), "."));
+
+    xla::ComputationDataHandle resource_handle;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &resource_handle));
+
+    auto indices = ctx->Input(1);
+    auto indices_shape = ctx->InputShape(1);
+    const int num_indices = indices_shape.num_elements();
+
+    // Flatten the indices into 1-D.
+    auto indices_1d = builder->Reshape(indices, {num_indices});
+
+    // Compute the slice for each of these indices separately.
+    std::vector<xla::ComputationDataHandle> slices(num_indices);
+    for (int i = 0; i < num_indices; ++i) {
+      auto index = builder->Slice(indices_1d, {i}, {i + 1}, {1});
+
+      auto start_indices =
+          XlaHelpers::PadWithZeros(builder, index, resource_shape.dims() - 1);
+
+      auto slice_shape = resource_shape.dim_sizes();
+      slice_shape[0] = 1LL;
+
+      slices[i] =
+          builder->DynamicSlice(resource_handle, start_indices, slice_shape);
+    }
+
+    // Concatenate the slices into one tensor.
+    xla::ComputationDataHandle concat = builder->ConcatInDim(slices, 0);
+
+    // Compute the shape of the result tensor, which is:
+    //    indices.shape + resource.shape[1:]
+    TensorShape gather_shape = indices_shape;
+    gather_shape.AppendShape(resource_shape);
+    gather_shape.RemoveDim(indices_shape.dims());
+
+    // Reshape the concatenated slices into the shape expected of the result
+    // tensor.
+    xla::ComputationDataHandle gather =
+        builder->Reshape(concat, gather_shape.dim_sizes());
+
+    ctx->SetOutput(0, gather);
+  }
+};
+REGISTER_XLA_OP(Name("ResourceGather").TypeConstraint("dtype", kNumericTypes),
+                ResourceGatherOp);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c2031fc761e55ddb08a19dbc1b34a4d60e19562
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc
@@ -0,0 +1,277 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/kernels/while_op.h"
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+namespace {
+
+// Builds XlaCompiler argument descriptions `args` from `ctx`.
+Status MakeXlaCompilerArgumentsFromInputs(
+    XlaOpKernelContext* ctx, std::vector<XlaCompiler::Argument>* args,
+    bool* has_uninitialized_vars) {
+  VLOG(2) << "Num inputs " << ctx->num_inputs();
+  args->resize(ctx->num_inputs());
+  *has_uninitialized_vars = false;
+  for (int i = 0; i < ctx->num_inputs(); ++i) {
+    VLOG(2) << "  Input " << i
+            << " type: " << DataTypeString(ctx->input_type(i))
+            << " shape: " << ctx->InputShape(i).DebugString();
+    XlaCompiler::Argument& arg = (*args)[i];
+    DataType type = ctx->input_type(i);
+    // When reading a resource input, use the type and shape of the resource's
+    // current value.
+    if (type == DT_RESOURCE) {
+      XlaResource* resource;
+      TF_RETURN_IF_ERROR(ctx->GetResourceInput(i, &resource));
+
+      arg.initialized = resource->value.handle() > 0;
+      switch (resource->kind) {
+        case XlaResource::kVariable:
+          arg.kind = XlaCompiler::Argument::kVariable;
+          break;
+        case XlaResource::kTensorArray:
+          arg.kind = XlaCompiler::Argument::kTensorArray;
+          break;
+        case XlaResource::kStack:
+          arg.kind = XlaCompiler::Argument::kStack;
+          break;
+        case XlaResource::kInvalid:
+          CHECK(false);
+      }
+      arg.type = resource->type;
+      if (arg.initialized) {
+        auto shape = ctx->builder()->GetShape(resource->value);
+        TF_RETURN_IF_ERROR(shape.status());
+        arg.shape = *shape.ValueOrDie();
+      } else {
+        *has_uninitialized_vars = true;
+      }
+      arg.tensor_array_size = resource->tensor_array_size;
+      arg.name = resource->name;
+      // TODO(phawkins): propagate TensorArray gradients into loops.
+      VLOG(2) << "    resource " << resource->name
+              << " type: " << DataTypeString(arg.type)
+              << " shape: " << arg.shape.DebugString()
+              << " initialized: " << arg.initialized;
+
+    } else {
+      arg.kind = XlaCompiler::Argument::kParameter;
+      arg.type = ctx->input_type(i);
+      TF_RETURN_IF_ERROR(
+          TensorShapeToXLAShape(arg.type, ctx->InputShape(i), &arg.shape));
+    }
+  }
+  return Status::OK();
+}
+
+}  // anonymous namespace
+
+XlaWhileOp::XlaWhileOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+  const NameAttrList* name_attr;
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("cond", &name_attr));
+  cond_name_attr_ = *name_attr;
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("body", &name_attr));
+  body_name_attr_ = *name_attr;
+}
+
+void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
+  VLOG(1) << "WhileOp::Compile";
+
+  std::vector<XlaCompiler::Argument> arguments;
+  bool has_uninitialized_vars;
+  OP_REQUIRES_OK(ctx, MakeXlaCompilerArgumentsFromInputs(
+                          ctx, &arguments, &has_uninitialized_vars));
+
+  const bool use_tuple_arg = (arguments.size() != 1);
+
+  xla::ComputationBuilder* builder = ctx->builder();
+  XlaCompiler* compiler = ctx->compiler();
+
+  VLOG(1) << "Compiling body";
+
+  // All resource that are inputs to the loop's body must also be
+  // present as loop body outputs; the signature of the loop's input and
+  // output must match. We ensure this by asking the compiler to include the
+  // current values of all resources, even if they haven't been updated by the
+  // computation. We must also ask the compiler to keep compile-time constant
+  // outputs as part of the generated computation, for the same reason.
+  // TODO(phawkins): consider adding loop-invariant inputs to XLA's While()
+  // operator.
+  XlaCompiler::CompileOptions body_options;
+  body_options.use_tuple_arg = use_tuple_arg;
+  body_options.return_updated_values_for_all_resources = true;
+  body_options.resolve_compile_time_constants = false;
+  XlaCompiler::CompilationResult body;
+  OP_REQUIRES_OK(ctx, compiler->CompileFunction(body_options, body_name_attr_,
+                                                arguments, &body));
+
+  // We must use a static shape for parameters to an XLA compilation. However,
+  // we may not know the shape of a TensorArray if it is first written inside
+  // the loop. Ideally we would require the user to provide a static shape,
+  // but this is not always easy.
+  // So if uninitialized resource are used by the loop body, we compile the
+  // body function twice:
+  // 1) once with uninitialized resource inputs. We discard the computation
+  //    but we assume resource shapes reach a fixpoint after one iteration.
+  //    So we can use the output shapes of the resource as the "true" shapes.
+  // 2) again with the "correct" input shapes determined by (1).
+  if (has_uninitialized_vars) {
+    // Initializes any uninitialized resource with zero values of the
+    // shape determined by the first compilation.
+    for (int i = 0; i < body.resource_updates.size(); ++i) {
+      const XlaCompiler::ResourceUpdate& update = body.resource_updates[i];
+      XlaCompiler::Argument& arg = arguments[update.input_index];
+      if (!arg.initialized) {
+        VLOG(2) << "Update shape for argument " << update.input_index << " "
+                << xla::ShapeUtil::HumanString(update.shape);
+        arg.initialized = true;
+        arg.shape = update.shape;
+
+        XlaResource* resource;
+        OP_REQUIRES_OK(ctx,
+                       ctx->GetResourceInput(update.input_index, &resource));
+
+        std::unique_ptr<xla::Literal> zero =
+            xla::Literal::CreateFromShape(update.shape);
+        resource->value = builder->ConstantLiteral(*zero);
+      }
+    }
+    // Recompile the body with the "correct" shapes.
+    VLOG(1) << "Recompiling body with non-placeholder shapes";
+    body = {};
+    OP_REQUIRES_OK(ctx, compiler->CompileFunction(body_options, body_name_attr_,
+                                                  arguments, &body));
+  }
+
+  VLOG(1) << "Compiling condition";
+
+  XlaCompiler::CompileOptions cond_options;
+  cond_options.use_tuple_arg = use_tuple_arg;
+  cond_options.resolve_compile_time_constants = false;
+  XlaCompiler::CompilationResult cond;
+  OP_REQUIRES_OK(ctx, compiler->CompileFunction(cond_options, cond_name_attr_,
+                                                arguments, &cond));
+
+  xla::Shape body_input_shape, cond_input_shape;
+  if (use_tuple_arg) {
+    body_input_shape = xla::ShapeUtil::MakeTupleShape(body.xla_input_shapes);
+    cond_input_shape = xla::ShapeUtil::MakeTupleShape(cond.xla_input_shapes);
+  } else {
+    CHECK(!body.xla_input_shapes.empty());
+    body_input_shape = body.xla_input_shapes[0];
+    CHECK(!cond.xla_input_shapes.empty());
+    cond_input_shape = cond.xla_input_shapes[0];
+  }
+
+  VLOG(2) << "Body shape: " << xla::ShapeUtil::HumanString(body_input_shape)
+          << " -> " << xla::ShapeUtil::HumanString(body.xla_output_shape);
+  VLOG(2) << "Cond shape: " << xla::ShapeUtil::HumanString(cond_input_shape)
+          << " -> " << xla::ShapeUtil::HumanString(cond.xla_output_shape);
+
+  OP_REQUIRES(ctx,
+              xla::ShapeUtil::Compatible(body_input_shape, cond_input_shape),
+              errors::InvalidArgument(
+                  "Input shapes of loop body and condition do not match: ",
+                  xla::ShapeUtil::HumanString(body_input_shape), " vs. ",
+                  xla::ShapeUtil::HumanString(cond_input_shape)));
+  OP_REQUIRES(
+      ctx, xla::ShapeUtil::Compatible(body_input_shape, body.xla_output_shape),
+      errors::InvalidArgument(
+          "Input and output shapes of loop body do not match: ",
+          xla::ShapeUtil::HumanString(body_input_shape), " vs. ",
+          xla::ShapeUtil::HumanString(body.xla_output_shape)));
+
+  xla::ComputationDataHandle data;
+
+  int num_inputs = body.input_mapping.size();
+
+  std::vector<xla::ComputationDataHandle> inputs(num_inputs);
+  for (int i = 0; i < num_inputs; ++i) {
+    int input_num = body.input_mapping[i];
+    if (ctx->input_type(input_num) == DT_RESOURCE) {
+      XlaResource* resource;
+      OP_REQUIRES_OK(ctx, ctx->GetResourceInput(input_num, &resource));
+      inputs[i] = resource->value;
+    } else {
+      inputs[i] = ctx->Input(i);
+    }
+  }
+
+  xla::ComputationDataHandle init;
+  if (use_tuple_arg) {
+    init = builder->Tuple(inputs);
+  } else {
+    init = inputs[0];
+  }
+
+  VLOG(1) << "Building while loop";
+
+  xla::ComputationDataHandle while_result =
+      builder->While(*cond.computation, *body.computation, init);
+
+  auto get_loop_output = [&](int i) {
+    if (use_tuple_arg) {
+      return builder->GetTupleElement(while_result, i);
+    } else {
+      return while_result;
+    }
+  };
+
+  // Sets non-variable outputs.
+  for (int i = 0; i < ctx->num_outputs(); ++i) {
+    if (ctx->input_type(i) != DT_RESOURCE) {
+      ctx->SetOutput(body.input_mapping[i], get_loop_output(i));
+    }
+  }
+
+  // Updates the values of any resource variables modified by the loop.
+  for (int i = 0; i < body.resource_updates.size(); ++i) {
+    const XlaCompiler::ResourceUpdate& update = body.resource_updates[i];
+    XlaResource* resource;
+    OP_REQUIRES_OK(ctx, ctx->GetResourceInput(update.input_index, &resource));
+    if (update.modified) {
+      int pos = body.outputs.size() + i;
+      resource->value = get_loop_output(pos);
+    }
+    VLOG(2) << "Loop-carried variable: pos: " << update.input_index
+            << " name: " << resource->name << " modified: " << update.modified
+            << " type: " << DataTypeString(update.type)
+            << " shape: " << update.shape.DebugString();
+    // Copies the identity of the resource variable from input to output
+    // unchanged, even if the variable was not modified.
+    ctx->op_kernel_context()->set_output(
+        update.input_index,
+        ctx->op_kernel_context()->input(update.input_index));
+  }
+
+  VLOG(1) << "Done building while loop";
+}
+
+REGISTER_XLA_OP(Name("XlaWhile").AllowResourceTypes(), XlaWhileOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.h b/tensorflow/compiler/tf2xla/kernels/while_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..67edebabf9f643a919d0f06c228e2d224a49a2af
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/while_op.h
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_WHILE_OP_H_
+#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_WHILE_OP_H_
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+
+namespace tensorflow {
+
+// This TensorFlow op provides a functional iteration primitive.
+//
+// The inputs and outputs of the loop body must agree on the number, types, and
+// shapes of the Tensors carried around the loop body.
+//
+// Computations in while loops may read from and write to resource variables.
+// Resource variables may be passed as arguments to a function's body and
+// condition functions. The XlaCompiler converts resource variable arguments
+// into parameters to the XLA computation and moves them to the end of the
+// parameter list, and by using the `return_updated_values_for_all_variables`
+// we ensure that all variables that appear in the input also appear at the
+// end of the body's output. This ensures the loop body's input and output
+// signatures match.
+//
+// It is the user's responsibility to ensure that each non-variable _Arg matches
+// the corresponding _Retval.
+//
+// For example, suppose we have a loop body with arguments:
+// DT_INT32, DT_RESOURCE (pointing to a DT_BOOL var), DT_FLOAT
+// and return values
+// DT_INT32, DT_FLOAT
+// It is an error for the body to return DT_RESOURCE values.
+//
+// The body will be lowered into an XLA computation that takes and returns a
+// tuple with XLA type (I32, F32, PRED). Note the resource variable appears at
+// the end of both the loop body's input and output argument lists.
+class XlaWhileOp : public XlaOpKernel {
+ public:
+  explicit XlaWhileOp(OpKernelConstruction* ctx);
+
+  void Compile(XlaOpKernelContext* ctx) override;
+
+ private:
+  NameAttrList cond_name_attr_;
+  NameAttrList body_name_attr_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaWhileOp);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_KERNELS_WHILE_OP_H_
diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc
index 1f2bc01cf4a48b37de585c55b781c239ee4b8f2a..576cd9bf9abb43e29d9eb8f706e0f42ac2d038e9 100644
--- a/tensorflow/compiler/tf2xla/literal_util.cc
+++ b/tensorflow/compiler/tf2xla/literal_util.cc
@@ -27,13 +27,13 @@ Status HostTensorToLiteral(const Tensor& host_tensor, xla::Literal* literal) {
   TF_RETURN_IF_ERROR(TensorShapeToXLAShape(
       host_tensor.dtype(), host_tensor.shape(), literal->mutable_shape()));
 
-  xla::LiteralUtil::Reserve(host_tensor.NumElements(), literal);
+  literal->Reserve(host_tensor.NumElements());
 
   // memcpy over the payload ...
   // TODO(phawkins): handle string types.
   size_t total_bytes = host_tensor.TotalBytes();
   if (total_bytes > 0) {
-    void* dst_ptr = xla::LiteralUtil::MutableInternalData(literal);
+    void* dst_ptr = literal->MutableInternalData();
     const void* src_ptr = DMAHelper::base(&host_tensor);
     memcpy(dst_ptr, src_ptr, total_bytes);
   }
@@ -51,11 +51,12 @@ Status LiteralToHostTensor(const xla::Literal& literal, DataType target_type,
         " to tensor of type ", DataTypeString(target_type));
   }
 
-  TensorShape shape = XLAShapeToTensorShape(literal.shape());
+  TensorShape shape;
+  TF_RETURN_IF_ERROR(XLAShapeToTensorShape(literal.shape(), &shape));
   *host_tensor = Tensor(target_type, shape);
   size_t total_bytes = host_tensor->TotalBytes();
   if (total_bytes > 0) {
-    const void* src_ptr = xla::LiteralUtil::InternalData(literal);
+    const void* src_ptr = literal.InternalData();
     void* dst_ptr = DMAHelper::base(host_tensor);
     memcpy(dst_ptr, src_ptr, total_bytes);
   }
diff --git a/tensorflow/compiler/tf2xla/literal_util_test.cc b/tensorflow/compiler/tf2xla/literal_util_test.cc
index 56993bc58534d1225f9177719804a69f561b3a06..f3d6787daaa1165b28ce63dfd501533fa0963edd 100644
--- a/tensorflow/compiler/tf2xla/literal_util_test.cc
+++ b/tensorflow/compiler/tf2xla/literal_util_test.cc
@@ -27,7 +27,7 @@ TEST(LiteralUtil, LiteralToHostTensor) {
   {
     std::vector<int64> int64_values = {1, 2, 3};
     std::unique_ptr<xla::Literal> int64_values_literal =
-        xla::LiteralUtil::CreateR1(gtl::ArraySlice<int64>(int64_values));
+        xla::Literal::CreateR1(gtl::ArraySlice<int64>(int64_values));
     Tensor host_tensor;
     EXPECT_EQ("Cannot convert literal of type S64 to tensor of type int32",
               LiteralToHostTensor(*int64_values_literal, DT_INT32, &host_tensor)
@@ -48,7 +48,7 @@ TEST(LiteralUtil, LiteralToHostTensor) {
     Tensor host_tensor;
     std::vector<int32> int32_values = {10, 11};
     std::unique_ptr<xla::Literal> int32_values_literal =
-        xla::LiteralUtil::CreateR1(gtl::ArraySlice<int32>(int32_values));
+        xla::Literal::CreateR1(gtl::ArraySlice<int32>(int32_values));
     EXPECT_TRUE(
         LiteralToHostTensor(*int32_values_literal, DT_INT32, &host_tensor)
             .ok());
diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a2bd06861d5f383e3497a386b42a2e5a4035f1ea
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/ops/BUILD
@@ -0,0 +1,38 @@
+package(
+    default_visibility = ["//tensorflow/compiler/tf2xla:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+
+cc_library(
+    name = "functional_ops",
+    srcs = ["functional_ops.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+    ],
+    alwayslink = 1,
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_functional_ops",
+    out = "gen_functional_ops.py",
+    deps = [
+        ":functional_ops",
+    ],
+)
+
+# -----------------------------------------------------------------------------
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/compiler/tf2xla/ops/functional_ops.cc b/tensorflow/compiler/tf2xla/ops/functional_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..38bcaa32278c4acf212881b10d66bb67b807a21c
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/ops/functional_ops.cc
@@ -0,0 +1,45 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+
+namespace tensorflow {
+
+// TODO(b/37549631) setting the While Op to always be stateful is too
+// conservative.
+REGISTER_OP("XlaWhile")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: list(type) >= 0")
+    .Attr("cond: func")
+    .Attr("body: func")
+    .SetIsStateful()
+    .Doc(R"doc(
+output = input; While (Cond(output)) { output = Body(output) }
+
+input: A list of input tensors whose types are T.
+output: A list of output tensors whose types are T.
+cond: A function takes 'input' and returns a tensor.  If the tensor is
+    a scalar of non-boolean, the scalar is converted to a boolean
+    according to the following rule: if the scalar is a numerical
+    value, non-zero means True and zero means False; if the scalar is
+    a string, non-empty means True and empty means False. If the
+    tensor is not a scalar, non-emptiness means True and False
+    otherwise.
+body: A function that takes a list of tensors and returns another
+      list of tensors. Both lists have the same types as specified by T.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/shape_util.cc b/tensorflow/compiler/tf2xla/shape_util.cc
index f5ecb51a5b77e36e606ed1c48b8e2dbe76de0074..9d1992205b02665b99b1bd15b7b65a1fb8c35a51 100644
--- a/tensorflow/compiler/tf2xla/shape_util.cc
+++ b/tensorflow/compiler/tf2xla/shape_util.cc
@@ -24,12 +24,18 @@ limitations under the License.
 namespace tensorflow {
 
 // Convert an XLA Shape into the equivalent TensorFlow shape.
-TensorShape XLAShapeToTensorShape(const xla::Shape& shape) {
-  TensorShape tensor_shape;
+Status XLAShapeToTensorShape(const xla::Shape& shape,
+                             TensorShape* tensor_shape) {
+  if (xla::ShapeUtil::IsTuple(shape)) {
+    return errors::InvalidArgument("XLA shape ",
+                                   xla::ShapeUtil::HumanString(shape),
+                                   " cannot be converted to a TensorShape");
+  }
+  *tensor_shape = TensorShape();
   for (int i = 0; i < xla::ShapeUtil::Rank(shape); ++i) {
-    tensor_shape.AddDim(shape.dimensions(i));
+    tensor_shape->AddDim(shape.dimensions(i));
   }
-  return tensor_shape;
+  return Status::OK();
 }
 
 // Convert a TensorShape into the equivalent XLA Shape proto.
diff --git a/tensorflow/compiler/tf2xla/shape_util.h b/tensorflow/compiler/tf2xla/shape_util.h
index 516dd636a970f78fda363a0b13961b8244dc2cd9..58240b9c965a194b9380ac7cd477ce7344e5ebe3 100644
--- a/tensorflow/compiler/tf2xla/shape_util.h
+++ b/tensorflow/compiler/tf2xla/shape_util.h
@@ -24,8 +24,10 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Convert an XLA Shape into the equivalent TensorFlow shape.
-TensorShape XLAShapeToTensorShape(const xla::Shape& shape);
+// Convert an XLA Shape into the equivalent TensorFlow shape. May fail since
+// not all XLA shapes can be represented as TensorShapes.
+Status XLAShapeToTensorShape(const xla::Shape& shape,
+                             TensorShape* tensor_shape);
 
 // Convert a TensorShape into the equivalent XLA Shape proto. Unlike Tensorflow,
 // XLA shapes include the type. Not all `dtype` values can be represented by
diff --git a/tensorflow/compiler/tf2xla/test_util.cc b/tensorflow/compiler/tf2xla/test_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3c6c9a91b6d2fb47f6dee1c347e9b852f1eea3ec
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/test_util.cc
@@ -0,0 +1,43 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/test_util.h"
+
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+
+namespace tensorflow {
+
+Status InstantiateFunctionForTest(const string& name,
+                                  const FunctionLibraryDefinition& library,
+                                  InstantiationResultForTest* result) {
+  const FunctionDef* fdef = library.Find(name);
+  TF_RET_CHECK(fdef != nullptr);
+
+  auto get_func_sig = [&library](const string& op, const OpDef** sig) {
+    return library.LookUpOpDef(op, sig);
+  };
+  InstantiationResult inst;
+  TF_RETURN_IF_ERROR(
+      InstantiateFunction(*fdef, AttrSlice(), get_func_sig, &inst));
+  result->arg_types = inst.arg_types;
+  result->ret_types = inst.ret_types;
+  for (NodeDef& n : inst.nodes) {
+    *result->gdef.add_node() = std::move(n);
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/test_util.h b/tensorflow/compiler/tf2xla/test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6e4ae92ed23f3fca0f59b131dc73152e0947b72
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/test_util.h
@@ -0,0 +1,47 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Helper functions for tests.
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_TEST_UTIL_H_
+#define TENSORFLOW_COMPILER_TF2XLA_TEST_UTIL_H_
+
+#include <map>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// Same as InstantiationResult, but has a GraphDef instead of just nodes.
+struct InstantiationResultForTest {
+  DataTypeVector arg_types;
+  DataTypeVector ret_types;
+  GraphDef gdef;
+};
+
+// Instantiates a function, producing a GraphDef to compare against the
+// expected graph.
+Status InstantiateFunctionForTest(const string& name,
+                                  const FunctionLibraryDefinition& library,
+                                  InstantiationResultForTest* result);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_TEST_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.h b/tensorflow/compiler/tf2xla/xla_compilation_device.h
index 75630bee3961243b2389274f0f98200ee3a0a7eb..ec28bdccda47a326a0f60f2f73e8837b68e668cb 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.h
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.h
@@ -64,26 +64,36 @@ class XlaCompilationDevice : public LocalDevice {
   std::unique_ptr<XlaCompilationAllocator> allocator_;
 };
 
-struct XlaVariable {
-  // If this variable is visible externally, what was its argument number?
+// Represents a resource, such as a Variable or TensorArray.
+struct XlaResource {
+  enum Kind {
+    kInvalid,
+    kVariable,
+    kTensorArray,
+    kStack,
+  };
+
+  Kind kind = kInvalid;
+
+  // If this resource is visible externally, what was its argument number?
   int arg_num = -1;
 
-  // A descriptive name for the variable, used in error messages.
+  // A descriptive name for the resource, used in error messages.
   string name;
 
-  // Current type and value of the variable. Uninitialized variables are
+  // Current type and value of the resource. Uninitialized resources are
   // represented by a default (zero) handle and type DT_INVALID.
-  // While the type of a variable is notionally fixed during execution, when
-  // a variable is first initialized we do not yet know its type, so we keep
+  // While the type of a resource is notionally fixed during execution, when
+  // a resource is first initialized we do not yet know its type, so we keep
   // track of its type dynamically.
   DataType type = DT_INVALID;
   xla::ComputationDataHandle value;
 
-  // Value of the variable at computation entry. Used to detect which
+  // Value of the resource at computation entry. Used to detect which
   // variables have new values that need to be written back.
   xla::ComputationDataHandle initial_value;
 
-  // We treat TensorArrays as a Variable with some extra metadata.
+  // TensorArray-specific fields
 
   // 'tensor_array_size' stores the expected size of the TensorArray. We need
   // to store this since sometimes TensorArrays must be initialized lazily since
@@ -91,10 +101,10 @@ struct XlaVariable {
   int64 tensor_array_size = -1;
 
   // 'tensor_array_gradient' is a map from TensorArrayGradV3 'source' attributes
-  // to an XlaVariable containing the gradient TensorArrays. We store a pointer
+  // to an XlaResource containing the gradient TensorArrays. We store a pointer
   // here since there should only be one gradient TensorArray per 'source'
   // string, irrespective of the number of calls to TensorArrayGrad.
-  std::unordered_map<string, XlaVariable*> tensor_array_gradient;
+  std::unordered_map<string, XlaResource*> tensor_array_gradient;
 };
 
 // A XlaExpression wraps an XLA computation. Each Tensor on an
@@ -115,8 +125,8 @@ class XlaExpression {
   bool has_constant_value() const { return has_constant_value_; }
   const Tensor& constant_value() const { return constant_value_; }
 
-  void set_variable(XlaVariable* variable) { variable_ = variable; }
-  XlaVariable* variable() const { return variable_; }
+  void set_resource(XlaResource* resource) { resource_ = resource; }
+  XlaResource* resource() const { return resource_; }
 
  private:
   // The XLA handle of the expression's computation.
@@ -128,7 +138,7 @@ class XlaExpression {
   bool has_constant_value_ = false;
   Tensor constant_value_;
 
-  XlaVariable* variable_ = nullptr;  // Not owned.
+  XlaResource* resource_ = nullptr;  // Not owned.
 
   TF_DISALLOW_COPY_AND_ASSIGN(XlaExpression);
 };
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 580ce3d802e71ef99903321fff2bc7374d0a9470..11a62b23aeac5028ef3384b0ec6a07018b7a3cbf 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <numeric>
 
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
+#include "tensorflow/compiler/tf2xla/functionalize_control_flow.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
@@ -59,9 +60,11 @@ Status CheckSignature(const DataTypeVector& types,
 
 bool XlaCompiler::Argument::operator==(
     const XlaCompiler::Argument& other) const {
-  if (std::tie(kind, type, shape, name, tensor_array_size) !=
-      std::tie(other.kind, other.type, other.shape, other.name,
-               other.tensor_array_size)) {
+  if (std::tie(kind, type, name, tensor_array_size) !=
+      std::tie(other.kind, other.type, other.name, other.tensor_array_size)) {
+    return false;
+  }
+  if (!xla::ShapeUtil::Equal(shape, other.shape)) {
     return false;
   }
   if (constant_value.shape() != other.constant_value.shape()) {
@@ -85,6 +88,12 @@ XlaCompiler::XlaCompiler(XlaCompiler::Options options)
         (*options_.populate_resource_manager)(device_->resource_manager());
   }
 
+  local_flib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(),
+                                                      FunctionDefLibrary{}));
+  local_flib_runtime_.reset(NewFunctionLibraryRuntime(
+      &device_mgr_, Env::Default(), device_, options.graph_def_version,
+      local_flib_def_.get(), OptimizerOptions(),
+      nullptr /* custom_kernel_creator */));
   flib_runtime_.reset(NewFunctionLibraryRuntime(
       &device_mgr_, Env::Default(), device_, options.graph_def_version,
       options.flib_def, OptimizerOptions(),
@@ -103,6 +112,18 @@ uint64 XlaCompiler::SignatureHash::operator()(
   return std::hash<string>()(signature.first);
 }
 
+static Status GetFunctionBody(const NameAttrList& function,
+                              FunctionLibraryRuntime* flib_runtime,
+                              const FunctionBody** fbody) {
+  FunctionLibraryRuntime::Handle handle;
+  TF_RETURN_IF_ERROR(flib_runtime->Instantiate(
+      function.name(), AttrSlice(&function.attr()), &handle));
+
+  *fbody = flib_runtime->GetFunctionBody(handle);
+  TF_RET_CHECK(*fbody);
+  return Status::OK();
+}
+
 Status XlaCompiler::CompileFunction(
     const XlaCompiler::CompileOptions& options, const NameAttrList& function,
     const std::vector<XlaCompiler::Argument>& args,
@@ -117,21 +138,21 @@ Status XlaCompiler::CompileFunction(
     return Status::OK();
   }
 
-  FunctionLibraryRuntime::Handle handle;
-  TF_RETURN_IF_ERROR(flib_runtime_->Instantiate(
-      function.name(), AttrSlice(&function.attr()), &handle));
-
-  const FunctionBody* fbody = flib_runtime_->GetFunctionBody(handle);
-  CHECK(fbody);
+  const FunctionBody* fbody;
+  if (!GetFunctionBody(function, local_flib_runtime_.get(), &fbody).ok()) {
+    TF_RETURN_IF_ERROR(GetFunctionBody(function, flib_runtime_.get(), &fbody));
+  }
 
   TF_RETURN_IF_ERROR(CheckSignature(fbody->arg_types, args));
 
   std::unique_ptr<Graph> graph(new Graph(options_.flib_def));
   CopyGraph(*fbody->graph, graph.get());
 
-  if (VLOG_IS_ON(1)) {
-    dump_graph::DumpGraphToFile(
-        strings::StrCat("xla_compile_function_input_", function_id), *graph);
+  if (VLOG_IS_ON(2)) {
+    VLOG(2) << "XlaCompiler::CompileFunction: "
+            << dump_graph::DumpGraphToFile(
+                   strings::StrCat("xla_compile_function_", function_id),
+                   *graph);
   }
 
   // Optimize the graph before running the compiler.
@@ -143,12 +164,6 @@ Status XlaCompiler::CompileFunction(
   optimizer.Optimize(flib_runtime_.get(), flib_runtime_->env(),
                      /*device=*/nullptr, &graph);
 
-  if (VLOG_IS_ON(1)) {
-    dump_graph::DumpGraphToFile(
-        strings::StrCat("xla_compile_function_optimized_", function_id),
-        *graph);
-  }
-
   VLOG(1) << "====================================================";
   TF_RETURN_IF_ERROR(
       CompileGraph(options, function_id, std::move(graph), args, result));
@@ -249,35 +264,37 @@ Status BuildArguments(const std::vector<XlaCompiler::Argument>& args,
                       std::vector<xla::Shape>* input_shapes) {
   context_args->resize(args.size());
 
-  // Argument numbers of arguments and variables that are to be passed to the
+  // Argument numbers of arguments and resources that are to be passed to the
   // XLA computation as runtime parameters.
-  std::vector<int> parameters, variables;
+  std::vector<int> parameters, resources;
   parameters.reserve(args.size());
-  variables.reserve(args.size());
+  resources.reserve(args.size());
 
   for (std::vector<XlaCompiler::Argument>::size_type i = 0; i < args.size();
        ++i) {
     XlaContext::Argument& context_arg = (*context_args)[i];
+    context_arg.kind = args[i].kind;
     context_arg.name = args[i].name;
     context_arg.value.constant_value = args[i].constant_value;
     context_arg.value.type = args[i].type;
 
     switch (args[i].kind) {
       case XlaCompiler::Argument::kVariable:
-        variables.push_back(i);
-        context_arg.is_variable = true;
-        context_arg.value.is_constant = false;
+      case XlaCompiler::Argument::kTensorArray:
+      case XlaCompiler::Argument::kStack:
+        context_arg.is_resource = true;
+        if (args[i].initialized) {
+          resources.push_back(i);
+          context_arg.value.is_constant = false;
+        } else {
+          context_arg.value.is_constant = true;
+        }
         context_arg.tensor_array_size = args[i].tensor_array_size;
         break;
       case XlaCompiler::Argument::kParameter:
         parameters.push_back(i);
         context_arg.value.is_constant = false;
         break;
-      case XlaCompiler::Argument::kUninitializedVariable:
-        context_arg.is_variable = true;
-        context_arg.value.is_constant = true;
-        context_arg.tensor_array_size = args[i].tensor_array_size;
-        break;
       case XlaCompiler::Argument::kConstant:
         context_arg.value.is_constant = true;
         break;
@@ -288,7 +305,7 @@ Status BuildArguments(const std::vector<XlaCompiler::Argument>& args,
 
   // Append parameters containing variable values after the other runtime
   // parameters.
-  parameters.insert(parameters.end(), variables.begin(), variables.end());
+  parameters.insert(parameters.end(), resources.begin(), resources.end());
   if (parameters.empty()) {
     return Status::OK();
   }
@@ -298,10 +315,7 @@ Status BuildArguments(const std::vector<XlaCompiler::Argument>& args,
   for (std::vector<int>::size_type i = 0; i < input_shapes->size(); ++i) {
     const XlaCompiler::Argument& arg = args[parameters[i]];
     // Computes the shapes of non-constant arguments.
-    xla::PrimitiveType type;
-    TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(arg.type, &type));
-    xla::ShapeUtil::PopulateShape(type, arg.shape.dim_sizes(),
-                                  &(*input_shapes)[i]);
+    (*input_shapes)[i] = arg.shape;
     (*input_mapping)[i] = parameters[i];
   }
 
@@ -329,22 +343,22 @@ Status BuildArguments(const std::vector<XlaCompiler::Argument>& args,
 // variable states, generated by the symbolic evaluation.
 // If `has_side_effects` is true, the computation has side effects and should be
 // built even if it has no outputs.
-// If `return_updated_values_for_all_variables` is true, all variables will be
-// included in `variable_updates`, regardless of whether their value changed.
+// If `return_updated_values_for_all_resources` is true, all resources will be
+// included in `resource_updates`, regardless of whether their value changed.
 // Sets `*num_nonconst_outputs` to the number of outputs of the `computation`.
-// Sets `*variable_updates` to a description of variables whose values are
+// Sets `*resource_updates` to a description of resources whose values are
 // written by the computation; the variable writes are the last
-// `variable_updates.size()` return values from the computation. Each entry in
-// `variable_updates` is a (input_index, type) pair, where `input_index` is the
+// `resource_updates.size()` return values from the computation. Each entry in
+// `resource_updates` is a (input_index, type) pair, where `input_index` is the
 // index of a resource variable argument to the computation, and `type` is the
 // type of the final output.
 Status BuildComputation(
     const std::vector<XlaContext::HandleOrConstant>& retvals,
-    const std::vector<std::unique_ptr<XlaVariable>>& variables,
-    bool has_side_effects, bool return_updated_values_for_all_variables,
+    const std::vector<std::unique_ptr<XlaResource>>& resources,
+    bool has_side_effects, bool return_updated_values_for_all_resources,
     xla::ComputationBuilder* builder, xla::Computation* computation,
-    int* num_nonconst_outputs,
-    std::vector<XlaCompiler::VariableUpdate>* variable_updates) {
+    int* num_computation_outputs, int* num_nonconst_outputs,
+    std::vector<XlaCompiler::ResourceUpdate>* resource_updates) {
   std::vector<xla::ComputationDataHandle> elems;
   elems.reserve(retvals.size());
   for (const XlaContext::HandleOrConstant& retval : retvals) {
@@ -354,24 +368,24 @@ Status BuildComputation(
   }
   *num_nonconst_outputs = elems.size();
 
-  // Add return values for variables whose values have changed.
-  std::vector<const XlaVariable*> arg_vars;
-  arg_vars.reserve(variables.size());
-  for (const auto& var : variables) {
+  // Add return values for resources whose values have changed.
+  std::vector<const XlaResource*> arg_vars;
+  arg_vars.reserve(resources.size());
+  for (const auto& var : resources) {
     if (var->arg_num >= 0) {
       arg_vars.push_back(var.get());
     }
   }
   std::sort(arg_vars.begin(), arg_vars.end(),
-            [](const XlaVariable* a, const XlaVariable* b) {
+            [](const XlaResource* a, const XlaResource* b) {
               return a->arg_num < b->arg_num;
             });
 
-  for (const XlaVariable* var : arg_vars) {
+  for (const XlaResource* var : arg_vars) {
     bool modified = var->value.handle() != var->initial_value.handle();
-    if (return_updated_values_for_all_variables || modified) {
-      variable_updates->emplace_back();
-      XlaCompiler::VariableUpdate& update = variable_updates->back();
+    if (return_updated_values_for_all_resources || modified) {
+      resource_updates->emplace_back();
+      XlaCompiler::ResourceUpdate& update = resource_updates->back();
       update.input_index = var->arg_num;
       update.type = var->type;
       update.modified = modified;
@@ -379,6 +393,7 @@ Status BuildComputation(
     }
   }
 
+  *num_computation_outputs = elems.size();
   if (!elems.empty() || has_side_effects) {
     // Builds a empty tuple return value for computations that have side effects
     // but have no return values.
@@ -401,6 +416,18 @@ Status BuildComputation(
   return Status::OK();
 }
 
+void AssignMajorToMinorLayout(xla::Shape* shape) {
+  if (xla::ShapeUtil::IsTuple(*shape)) {
+    for (xla::Shape& elem_shape : *shape->mutable_tuple_shapes()) {
+      AssignMajorToMinorLayout(&elem_shape);
+    }
+  } else {
+    auto& minor_to_major = *shape->mutable_layout()->mutable_minor_to_major();
+    minor_to_major.Resize(xla::ShapeUtil::Rank(*shape), 0);
+    std::iota(minor_to_major.rbegin(), minor_to_major.rend(), 0);
+  }
+}
+
 }  // namespace
 
 Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
@@ -410,13 +437,24 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
                                  CompilationResult* result) {
   VLOG(1) << "Executing graph symbolically to populate ComputationBuilder.";
 
+  if (VLOG_IS_ON(2)) {
+    VLOG(2) << "XlaCompiler::CompileGraph: "
+            << dump_graph::DumpGraphToFile(
+                   strings::StrCat("xla_compile_graph_", name), *graph);
+  }
+
   // Report the error here if initialization failed.
   TF_RETURN_IF_ERROR(initialization_status_);
 
+  // Converts Tensorflow's graph control-flow constructs into functional
+  // control-flow that can be compiled into XLA code.
+  TF_RETURN_IF_ERROR(
+      FunctionalizeControlFlow(graph.get(), local_flib_def_.get()));
+
   xla::ComputationBuilder builder(client(), name);
   XlaContext* context =
       new XlaContext(this, &builder, options_.allow_cpu_custom_calls,
-                     options_.resolve_compile_time_constants);
+                     options.resolve_compile_time_constants);
   core::ScopedUnref context_unref(context);
 
   result->tuple_arg = options.use_tuple_arg;
@@ -431,12 +469,13 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
                                   flib_runtime_.get(), NextStepId()));
 
   int num_nonconst_outputs;
+  int num_computation_outputs;
   result->computation = std::make_shared<xla::Computation>();
   TF_RETURN_IF_ERROR(BuildComputation(
-      context->retvals(), context->variables(), context->has_side_effects(),
-      options.return_updated_values_for_all_variables, &builder,
-      result->computation.get(), &num_nonconst_outputs,
-      &result->variable_updates));
+      context->retvals(), context->resources(), context->has_side_effects(),
+      options.return_updated_values_for_all_resources, &builder,
+      result->computation.get(), &num_computation_outputs,
+      &num_nonconst_outputs, &result->resource_updates));
 
   result->requires_runtime_context = context->has_context_parameter();
 
@@ -473,23 +512,8 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
   VLOG(2) << "XLA output shape: "
           << xla::ShapeUtil::HumanString(result->xla_output_shape);
 
-  auto num_computation_outputs =
-      (xla::ShapeUtil::IsTuple(result->xla_output_shape))
-          ? xla::ShapeUtil::TupleElementCount(result->xla_output_shape)
-          : 1;
   // Tensorflow expects a major-to-minor order of results.
-  if (1 == num_computation_outputs) {
-    xla::Shape& s = result->xla_output_shape;
-    auto& minor_to_major = *s.mutable_layout()->mutable_minor_to_major();
-    minor_to_major.Resize(xla::ShapeUtil::Rank(s), 0);
-    std::iota(minor_to_major.rbegin(), minor_to_major.rend(), 0);
-  } else {
-    for (xla::Shape& s : *result->xla_output_shape.mutable_tuple_shapes()) {
-      auto& minor_to_major = *s.mutable_layout()->mutable_minor_to_major();
-      minor_to_major.Resize(xla::ShapeUtil::Rank(s), 0);
-      std::iota(minor_to_major.rbegin(), minor_to_major.rend(), 0);
-    }
-  }
+  AssignMajorToMinorLayout(&result->xla_output_shape);
 
   // Converts the output shapes to TensorShapes.
   int computation_output = 0;
@@ -501,26 +525,26 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
       OutputDescription& output = result->outputs[i];
       output.is_constant = false;
       if (num_computation_outputs > 1) {
-        output.shape =
-            XLAShapeToTensorShape(xla::ShapeUtil::GetTupleElementShape(
-                result->xla_output_shape, computation_output));
+        TF_RETURN_IF_ERROR(XLAShapeToTensorShape(
+            xla::ShapeUtil::GetTupleElementShape(result->xla_output_shape,
+                                                 computation_output),
+            &output.shape));
       } else {
-        output.shape = XLAShapeToTensorShape(result->xla_output_shape);
+        TF_RETURN_IF_ERROR(
+            XLAShapeToTensorShape(result->xla_output_shape, &output.shape));
       }
       ++computation_output;
     }
   }
 
-  for (std::vector<VariableUpdate>::size_type i = 0;
-       i < result->variable_updates.size(); ++i) {
+  for (std::vector<ResourceUpdate>::size_type i = 0;
+       i < result->resource_updates.size(); ++i) {
     if (num_computation_outputs > 1) {
-      result->variable_updates[i].shape =
-          XLAShapeToTensorShape(xla::ShapeUtil::GetTupleElementShape(
-              result->xla_output_shape, computation_output));
+      result->resource_updates[i].shape = xla::ShapeUtil::GetTupleElementShape(
+          result->xla_output_shape, computation_output);
     } else {
       CHECK_EQ(0, computation_output);
-      result->variable_updates[i].shape =
-          XLAShapeToTensorShape(result->xla_output_shape);
+      result->resource_updates[i].shape = result->xla_output_shape;
     }
     ++computation_output;
   }
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 131430553252e2b62315c6388a53058bdf20eb7f..7251c92edb2b56e2d738abc4570e74e4c9dc6c62 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -85,27 +85,31 @@ class XlaCompiler {
       // Argument is a compile-time constant. No associated runtime parameter.
       kConstant,
 
-      // Argument is a variable that has not been initialized yet. No associated
-      // runtime parameter.
-      kUninitializedVariable,
-
-      // Argument is a variable that already has a value set. Expects a runtime
-      // parameter containing the current value.
+      // Argument is a Variable resource. Has an associated runtime parameter
+      // iff `initialized` is true.
       kVariable,
 
+      // Argument is a TensorArray resource. Has an associated runtime parameter
+      // iff `initialized` is true.
+      kTensorArray,
+
+      // Argument is a Stack resource. Has an associated runtime parameter
+      // iff `initialized` is true.
+      kStack,
+
       // Argument is a run-time parameter.
       kParameter,
     };
 
     Kind kind = kInvalid;
 
-    // The type of the argument. If the argument is a resource variable, this
+    // The type of the argument. If the argument is a resource, this
     // is the type of the variable's value, not DT_RESOURCE.
     DataType type;
 
-    // The shape of the argument. If the argument is a resource variable, this
-    // is the shape of the variable's value.
-    TensorShape shape;
+    // The shape of the argument. If the argument is a resource, this is the
+    // shape of the resource's value.
+    xla::Shape shape;
 
     // The value of the argument, if it is a compile-time constant. Must be a
     // host-memory tensor.
@@ -114,8 +118,11 @@ class XlaCompiler {
     // The name of this argument, used for debugging.
     string name;
 
-    // For a kVariable or kUninitializedVariable corresponding to a TensorArray,
-    // what is the tensor array's declared size?
+    // For a kVariable or kTensorArray, has this resource been initialized?
+    bool initialized = false;
+
+    // For a kTensorArray, what is the array's declared size? (Used for lazy
+    // initialization.)
     int64 tensor_array_size = -1;
 
     bool operator==(const Argument& other) const;
@@ -133,23 +140,23 @@ class XlaCompiler {
   };
 
   // Describes a variable write side effect of the computation.
-  struct VariableUpdate {
+  struct ResourceUpdate {
     // Index of the input that contains the variable resource to write to.
     int input_index;
 
     // Type and shape of the tensor to be written back.
     DataType type;
-    TensorShape shape;
+    xla::Shape shape;
 
     // Was the value of the variable modified by the computation?
-    // (Always true, unless `return_updated_values_for_all_variables` is true.)
+    // (Always true, unless `return_updated_values_for_all_resources` is true.)
     bool modified;
   };
 
   struct CompilationResult {
     // Vector that maps from the parameters of the XLA computation to their
     // original argument positions. To handle compile-time constant inputs and
-    // variables, the parameters to the XLA computation may be a subset of the
+    // resources, the parameters to the XLA computation may be a subset of the
     // original arguments, and are not necessarily in the same order.)
     std::vector<int> input_mapping;
 
@@ -172,10 +179,10 @@ class XlaCompiler {
     // containing both constant and non-constant results.
     std::vector<OutputDescription> outputs;
 
-    // Variables whose values were updated by the computation, ordered
-    // by return value position. Variable updates follow the non-constant
+    // Resources whose values were updated by the computation, ordered
+    // by return value position. Resource updates follow the non-constant
     // results in the outputs of XLA computation.
-    std::vector<VariableUpdate> variable_updates;
+    std::vector<ResourceUpdate> resource_updates;
 
     // The XLA computation built from the tensorflow subgraph. May be null
     // if the output consists solely of compile-time constants.
@@ -206,12 +213,6 @@ class XlaCompiler {
     // stored in device memory.
     bool local_executable_has_hybrid_result = false;
 
-    // If 'resolve_compile_time_constants' is true, then outputs of a
-    // computation that are known to be compile-time constants will be returned
-    // as Tensors at compile-time, rather than as run-time outputs of the
-    // computation.
-    bool resolve_compile_time_constants = true;
-
     // If not nullptr, populate_resource_manager is called with the
     // compilation device's resource manager when the compilation
     // device is created, and can be used to create metadata objects
@@ -229,12 +230,18 @@ class XlaCompiler {
     // arguments; if false, each argument gets its own parameter.
     bool use_tuple_arg = false;
 
-    // If 'return_updated_values_for_all_variables' is true, then updated
-    // values of all resource variables arguments will be included in the
-    // 'variable_updates' of the computation, even if the variable was not
+    // If 'return_updated_values_for_all_resources' is true, then updated
+    // values of all resource resources arguments will be included in the
+    // 'resource_updates' of the computation, even if the resource was not
     // modified by the computation. Used when compiling loop bodies to ensure
     // the input and output signatures match.
-    bool return_updated_values_for_all_variables = false;
+    bool return_updated_values_for_all_resources = false;
+
+    // If 'resolve_compile_time_constants' is true, then outputs of a
+    // computation that are known to be compile-time constants will be returned
+    // as Tensors at compile-time, rather than as run-time outputs of the
+    // computation.
+    bool resolve_compile_time_constants = true;
   };
 
   // Compiles a Tensorflow function `fn_name_attrs` into an XLA computation.
@@ -294,6 +301,12 @@ class XlaCompiler {
   XlaCompilationDevice* device_;  // Owned by device_mgr_
   DeviceMgr device_mgr_;
 
+  // To avoid copying the client's function library, use a local function
+  // library and runtime for functions created as part of the functionalize
+  // control flow transformation.
+  std::unique_ptr<FunctionLibraryDefinition> local_flib_def_;
+  std::unique_ptr<FunctionLibraryRuntime> local_flib_runtime_;
+
   std::unique_ptr<FunctionLibraryRuntime> flib_runtime_;
 
   struct SignatureHash {
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index 58d74057d101cdef89fca24ec6c0858291d825fa..42bbccd1d365f10d8d8d1bd839b5b4de57fb1656 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/resource_mgr.h"
@@ -149,10 +150,10 @@ TEST_F(XlaCompilerTest, Simple) {
   std::vector<XlaCompiler::Argument> args(2);
   args[0].kind = XlaCompiler::Argument::kParameter;
   args[0].type = DT_INT32;
-  args[0].shape = TensorShape({2});
+  args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2});
   args[1].kind = XlaCompiler::Argument::kParameter;
   args[1].type = DT_INT32;
-  args[1].shape = TensorShape({2});
+  args[1].shape = xla::ShapeUtil::MakeShape(xla::S32, {2});
 
   // Compiles the graph.
   XlaCompiler compiler(DefaultOptions());
@@ -163,9 +164,9 @@ TEST_F(XlaCompilerTest, Simple) {
 
   // Tests that the generated computation works.
   std::unique_ptr<xla::Literal> param0_literal =
-      xla::LiteralUtil::CreateR1<int32>({7, 42});
+      xla::Literal::CreateR1<int32>({7, 42});
   std::unique_ptr<xla::Literal> param1_literal =
-      xla::LiteralUtil::CreateR1<int32>({-3, 101});
+      xla::Literal::CreateR1<int32>({-3, 101});
   std::unique_ptr<xla::GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
   std::unique_ptr<xla::GlobalData> param1_data =
@@ -179,7 +180,7 @@ TEST_F(XlaCompilerTest, Simple) {
       client_->Transfer(*actual).ConsumeValueOrDie();
 
   std::unique_ptr<xla::Literal> expected_literal =
-      xla::LiteralUtil::CreateR1<int32>({4, 143});
+      xla::Literal::CreateR1<int32>({4, 143});
   xla::LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal);
 }
 
@@ -201,21 +202,21 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
   std::vector<XlaCompiler::Argument> args(1);
   args[0].kind = XlaCompiler::Argument::kParameter;
   args[0].type = DT_INT32;
-  args[0].shape = TensorShape({2});
+  args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2});
 
+  XlaCompiler::Options options = DefaultOptions();
+  XlaCompiler compiler(options);
   {
     // Compiles the graph, with resolve_compile_time_constants enabled.
-    XlaCompiler::Options options = DefaultOptions();
-    options.resolve_compile_time_constants = true;
-    XlaCompiler compiler(options);
 
     std::unique_ptr<Graph> graph_copy(new Graph(OpRegistry::Global()));
     CopyGraph(*graph, graph_copy.get());
 
+    XlaCompiler::CompileOptions compile_options;
+    compile_options.resolve_compile_time_constants = true;
     XlaCompiler::CompilationResult result;
-    TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(),
-                                       "constants", std::move(graph_copy), args,
-                                       &result));
+    TF_ASSERT_OK(compiler.CompileGraph(compile_options, "constants",
+                                       std::move(graph_copy), args, &result));
 
     ASSERT_EQ(2, result.outputs.size());
     EXPECT_TRUE(result.outputs[0].is_constant);
@@ -225,7 +226,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
 
     // Tests that the generated computation works.
     std::unique_ptr<xla::Literal> param0_literal =
-        xla::LiteralUtil::CreateR1<int32>({7, 42});
+        xla::Literal::CreateR1<int32>({7, 42});
     std::unique_ptr<xla::GlobalData> param0_data =
         client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -236,23 +237,20 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
         client_->Transfer(*actual).ConsumeValueOrDie();
 
     std::unique_ptr<xla::Literal> expected_literal =
-        xla::LiteralUtil::CreateR1<int32>({-7, -42});
+        xla::Literal::CreateR1<int32>({-7, -42});
     xla::LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal);
   }
 
   {
     // Compiles the graph, with resolve_compile_time_constants disabled.
-    XlaCompiler::Options options = DefaultOptions();
-    options.resolve_compile_time_constants = false;
-    XlaCompiler compiler(options);
-
     std::unique_ptr<Graph> graph_copy(new Graph(OpRegistry::Global()));
     CopyGraph(*graph, graph_copy.get());
 
+    XlaCompiler::CompileOptions compile_options;
+    compile_options.resolve_compile_time_constants = false;
     XlaCompiler::CompilationResult result;
-    TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(),
-                                       "constants", std::move(graph_copy), args,
-                                       &result));
+    TF_ASSERT_OK(compiler.CompileGraph(compile_options, "constants",
+                                       std::move(graph_copy), args, &result));
 
     ASSERT_EQ(2, result.outputs.size());
     EXPECT_FALSE(result.outputs[0].is_constant);
@@ -260,7 +258,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
 
     // Tests that the generated computation works.
     std::unique_ptr<xla::Literal> param0_literal =
-        xla::LiteralUtil::CreateR1<int32>({7, 42});
+        xla::Literal::CreateR1<int32>({7, 42});
     std::unique_ptr<xla::GlobalData> param0_data =
         client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -270,12 +268,11 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
     std::unique_ptr<xla::Literal> actual_literal =
         client_->Transfer(*actual).ConsumeValueOrDie();
 
-    std::unique_ptr<xla::Literal> expected0 =
-        xla::LiteralUtil::CreateR0<int32>(7);
+    std::unique_ptr<xla::Literal> expected0 = xla::Literal::CreateR0<int32>(7);
     std::unique_ptr<xla::Literal> expected1 =
-        xla::LiteralUtil::CreateR1<int32>({-7, -42});
+        xla::Literal::CreateR1<int32>({-7, -42});
     std::unique_ptr<xla::Literal> expected =
-        xla::LiteralUtil::MakeTuple({expected0.get(), expected1.get()});
+        xla::Literal::MakeTuple({expected0.get(), expected1.get()});
     xla::LiteralTestUtil::ExpectEqual(*expected, *actual_literal);
   }
 }
@@ -294,7 +291,7 @@ TEST_F(XlaCompilerTest, ResourceManager) {
   std::vector<XlaCompiler::Argument> args(1);
   args[0].kind = XlaCompiler::Argument::kParameter;
   args[0].type = DT_INT32;
-  args[0].shape = TensorShape({2});
+  args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2});
 
   DummyResourceForTest* resource = new DummyResourceForTest();
 
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index 4440b530696db6125e0af0606be49e2d834dbd9f..d4d493b456f668ecfbdd0164c573b9ae2aa810e9 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -129,16 +129,18 @@ void XlaContext::AddSideEffects() {
 
 xla::ComputationBuilder* XlaContext::builder() { return builder_; }
 
-Status XlaContext::CreateVariable(int arg_num, string name, DataType type,
+Status XlaContext::CreateResource(XlaResource::Kind kind, int arg_num,
+                                  string name, DataType type,
                                   const xla::ComputationDataHandle& handle,
-                                  XlaVariable** variable) {
-  variables_.emplace_back(new XlaVariable);
-  *variable = variables_.back().get();
-  XlaVariable& var = **variable;
-  var.arg_num = arg_num;
-  var.name = std::move(name);
-  var.type = type;
-  var.initial_value = var.value = handle;
+                                  XlaResource** resource) {
+  resources_.emplace_back(new XlaResource);
+  *resource = resources_.back().get();
+  XlaResource& r = **resource;
+  r.kind = kind;
+  r.arg_num = arg_num;
+  r.name = std::move(name);
+  r.type = type;
+  r.initial_value = r.value = handle;
   return Status::OK();
 }
 
@@ -170,27 +172,6 @@ const xla::Computation* XlaContext::GetOrCreateAdd(const DataType type) {
   });
 }
 
-const xla::Computation* XlaContext::GetOrCreateSigmoid(const DataType type) {
-  return LookupOrCreate(type, &sigmoid_func_, [this, type] {
-    const string type_string = DataTypeString(type);
-    VLOG(1) << "Building Sigmoid() for " << type_string;
-    xla::ComputationBuilder b(builder()->client(),
-                              "sigmoid<" + type_string + ">");
-    xla::PrimitiveType xla_type;
-    TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
-    auto x = b.Parameter(0, xla::ShapeUtil::MakeShape(xla_type, {}), "x");
-    // Clamp the inputs to the range [-18, 18] since anything outside
-    // this range is 0.0f or 1.0f in single-precision. We must clamp the range
-    // of x to avoid incorrect outputs due to fast-math optimizations for large
-    // negative x.
-    x = b.Clamp(XlaHelpers::IntegerLiteral(&b, type, -18), x,
-                XlaHelpers::IntegerLiteral(&b, type, 18));
-    auto one = XlaHelpers::One(&b, type);
-    b.Div(one, b.Add(b.Exp(b.Neg(x)), one));
-    return b.Build().ConsumeValueOrDie();
-  });
-}
-
 const xla::Computation* XlaContext::LookupOrCreate(
     DataType type, ComputationMap* out,
     const std::function<xla::Computation()>& create) {
diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h
index 3978baaf637b4948510eafe37de94a383a87ddc3..544921b9e38fb52e70b9f67ba10f7c79dc53c657 100644
--- a/tensorflow/compiler/tf2xla/xla_context.h
+++ b/tensorflow/compiler/tf2xla/xla_context.h
@@ -52,11 +52,13 @@ class XlaContext : public ResourceBase {
   };
 
   struct Argument {
-    // Descriptive name for the variable, for use in error messages.
+    XlaCompiler::Argument::Kind kind;
+
+    // Descriptive name for the resource, for use in error messages.
     string name;
 
-    // Is this a variable?
-    bool is_variable = false;
+    // Is this a resource?
+    bool is_resource = false;
 
     HandleOrConstant value;
 
@@ -106,15 +108,15 @@ class XlaContext : public ResourceBase {
 
   bool has_side_effects() const { return has_side_effects_; }
 
-  // Creates a variable with variable `variable_id` and initial type `type` and
+  // Creates a resource with resource `kind` and initial type `type` and
   // value `handle`. `name` is a descriptive name for use in error messages.
-  // Fails if the variable already exists.
-  Status CreateVariable(int arg_num, string name, DataType type,
-                        const xla::ComputationDataHandle& handle,
-                        XlaVariable** variable);
+  // Fails if the resource already exists.
+  Status CreateResource(XlaResource::Kind kind, int arg_num, string name,
+                        DataType type, const xla::ComputationDataHandle& handle,
+                        XlaResource** resource);
 
-  const std::vector<std::unique_ptr<XlaVariable>>& variables() {
-    return variables_;
+  const std::vector<std::unique_ptr<XlaResource>>& resources() {
+    return resources_;
   }
 
   // Get an XLA lambda to compute Max. This is cached in the
@@ -127,11 +129,6 @@ class XlaContext : public ResourceBase {
   // separate specialization of the computation for each DataType.
   const xla::Computation* GetOrCreateAdd(const DataType type);
 
-  // Get an XLA lambda to compute Sigmoid. This is cached in the
-  // XlaContext since it may be used by multiple Ops. There is a
-  // separate specialization of the computation for each DataType.
-  const xla::Computation* GetOrCreateSigmoid(const DataType type);
-
   // The name of the XlaContext resource during symbolic graph execution.
   static const char kXlaContextResourceName[];
 
@@ -166,8 +163,8 @@ class XlaContext : public ResourceBase {
   // Does the computation have side effects, i.e., Send() calls?
   bool has_side_effects_ = false;
 
-  // Holds ownership of variables. The variables are not ordered.
-  std::vector<std::unique_ptr<XlaVariable>> variables_;
+  // Holds ownership of resources. The resources are not ordered.
+  std::vector<std::unique_ptr<XlaResource>> resources_;
 
   // Cache of prebuilt computations indexed by their type.
   using ComputationMap = std::map<DataType, xla::Computation>;
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index f060f8f2f178b2bc56caf7a3df9df32c8a407473..3af866f9be516beae7e6fa64b5a4cf1fef843f67 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -30,28 +30,28 @@ xla::ComputationDataHandle XlaHelpers::MinValue(xla::ComputationBuilder* b,
                                                 DataType data_type) {
   xla::PrimitiveType type;
   TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
-  return b->ConstantLiteral(xla::LiteralUtil::MinValue(type));
+  return b->ConstantLiteral(xla::Literal::MinValue(type));
 }
 
 xla::ComputationDataHandle XlaHelpers::MaxValue(xla::ComputationBuilder* b,
                                                 DataType data_type) {
   xla::PrimitiveType type;
   TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
-  return b->ConstantLiteral(xla::LiteralUtil::MaxValue(type));
+  return b->ConstantLiteral(xla::Literal::MaxValue(type));
 }
 
 xla::ComputationDataHandle XlaHelpers::Zero(xla::ComputationBuilder* b,
                                             DataType data_type) {
   xla::PrimitiveType type;
   TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
-  return b->ConstantLiteral(xla::LiteralUtil::Zero(type));
+  return b->ConstantLiteral(xla::Literal::Zero(type));
 }
 
 xla::ComputationDataHandle XlaHelpers::One(xla::ComputationBuilder* b,
                                            DataType data_type) {
   xla::PrimitiveType type;
   TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
-  return b->ConstantLiteral(xla::LiteralUtil::One(type));
+  return b->ConstantLiteral(xla::Literal::One(type));
 }
 
 xla::ComputationDataHandle XlaHelpers::IntegerLiteral(
@@ -61,28 +61,28 @@ xla::ComputationDataHandle XlaHelpers::IntegerLiteral(
   TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
   switch (type) {
     case xla::U8:
-      literal = *xla::LiteralUtil::CreateR0<uint8>(value);
+      literal = *xla::Literal::CreateR0<uint8>(value);
       break;
     case xla::U32:
-      literal = *xla::LiteralUtil::CreateR0<uint32>(value);
+      literal = *xla::Literal::CreateR0<uint32>(value);
       break;
     case xla::U64:
-      literal = *xla::LiteralUtil::CreateR0<uint64>(value);
+      literal = *xla::Literal::CreateR0<uint64>(value);
       break;
     case xla::S8:
-      literal = *xla::LiteralUtil::CreateR0<int8>(value);
+      literal = *xla::Literal::CreateR0<int8>(value);
       break;
     case xla::S32:
-      literal = *xla::LiteralUtil::CreateR0<int32>(value);
+      literal = *xla::Literal::CreateR0<int32>(value);
       break;
     case xla::S64:
-      literal = *xla::LiteralUtil::CreateR0<int64>(value);
+      literal = *xla::Literal::CreateR0<int64>(value);
       break;
     case xla::F32:
-      literal = *xla::LiteralUtil::CreateR0<float>(value);
+      literal = *xla::Literal::CreateR0<float>(value);
       break;
     case xla::F64:
-      literal = *xla::LiteralUtil::CreateR0<double>(value);
+      literal = *xla::Literal::CreateR0<double>(value);
       break;
     case xla::PRED:
       LOG(FATAL) << "pred element type is not integral";
@@ -91,7 +91,7 @@ xla::ComputationDataHandle XlaHelpers::IntegerLiteral(
       LOG(FATAL) << "u16/s16 literals not yet implemented";
     case xla::F16:
       literal =
-          *xla::LiteralUtil::CreateR0<xla::half>(static_cast<xla::half>(value));
+          *xla::Literal::CreateR0<xla::half>(static_cast<xla::half>(value));
       break;
     case xla::TUPLE:
       LOG(FATAL) << "tuple element type is not integral";
@@ -205,4 +205,13 @@ Status XlaHelpers::OneHot(xla::ComputationBuilder* builder, int64 depth,
   return Status::OK();
 }
 
+xla::ComputationDataHandle XlaHelpers::PadWithZeros(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    int count) {
+  xla::ComputationDataHandle zero = builder->ConstantR1<int32>({0});
+  std::vector<xla::ComputationDataHandle> xs(count + 1, zero);
+  xs[0] = builder->Reshape(x, {1});
+  return builder->ConcatInDim(xs, 0);
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h
index a141ee05c13ed2e09fab69946ba400ab6cd628a9..2166ce363608ea65ba8cd9db856aff9ee2715005 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.h
+++ b/tensorflow/compiler/tf2xla/xla_helpers.h
@@ -77,6 +77,11 @@ class XlaHelpers {
                        const xla::ComputationDataHandle& on_value,
                        const xla::ComputationDataHandle& off_value,
                        xla::ComputationDataHandle* one_hot);
+
+  // Pads 'x' with 'count' zeros. 'x' must have 1 element.
+  static xla::ComputationDataHandle PadWithZeros(
+      xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+      int count);
 };
 
 }  // end namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index 3272b1efa153c0ecab720583277175b81fe59509..c5a68e05d9e1dfa3ed1c648e95d3690fadef8b51 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -39,7 +39,7 @@ static const XlaExpression* CastExpressionFromTensor(const Tensor& tensor) {
   const XlaExpression* expression =
       reinterpret_cast<const XlaExpression*>(tensor.tensor_data().data());
   CHECK(expression->handle().handle() != 0 ||
-        expression->variable() != nullptr);
+        expression->resource() != nullptr);
   VLOG(1) << "Fetched T" << expression->handle().handle();
   return expression;
 }
@@ -144,9 +144,9 @@ static Status LiteralToInt64Scalar(const xla::Literal& literal, int64* out) {
     return errors::InvalidArgument("value is not a scalar");
   }
   if (literal.shape().element_type() == xla::S32) {
-    *out = xla::LiteralUtil::Get<int32>(literal, {});
+    *out = literal.Get<int32>({});
   } else if (literal.shape().element_type() == xla::S64) {
-    *out = xla::LiteralUtil::Get<int64>(literal, {});
+    *out = literal.Get<int64>({});
   } else {
     return errors::InvalidArgument("value must be either int32 or int64");
   }
@@ -168,11 +168,11 @@ static Status LiteralToInt64Vector(const xla::Literal& literal,
   int64 size = xla::ShapeUtil::ElementsIn(literal.shape());
   if (literal.shape().element_type() == xla::S32) {
     for (int64 i = 0; i < size; ++i) {
-      out->push_back(xla::LiteralUtil::Get<int32>(literal, {i}));
+      out->push_back(literal.Get<int32>({i}));
     }
   } else if (literal.shape().element_type() == xla::S64) {
     for (int64 i = 0; i < size; ++i) {
-      out->push_back(xla::LiteralUtil::Get<int64>(literal, {i}));
+      out->push_back(literal.Get<int64>({i}));
     }
   } else {
     return errors::InvalidArgument("value must be either int32 or int64");
@@ -252,8 +252,9 @@ Status XlaOpKernelContext::ReadVariableInput(
     int index, xla::ComputationDataHandle* value) {
   const Tensor& tensor = context_->input(index);
   const XlaExpression* expression = CastExpressionFromTensor(tensor);
-  XlaVariable* variable = expression->variable();
+  XlaResource* variable = expression->resource();
   TF_RET_CHECK(variable != nullptr);
+  TF_RET_CHECK(variable->kind == XlaResource::kVariable);
   if (variable->value.handle() == 0) {
     return errors::InvalidArgument("Read of uninitialized variable ",
                                    variable->name);
@@ -262,22 +263,13 @@ Status XlaOpKernelContext::ReadVariableInput(
   return Status::OK();
 }
 
-string XlaOpKernelContext::VariableDebugString(int index) {
-  const Tensor& tensor = context_->input(index);
-  const XlaExpression* expression = CastExpressionFromTensor(tensor);
-  XlaVariable* variable = expression->variable();
-  if (!variable) {
-    return "<invalid variable ID>";
-  }
-  return variable->name;
-}
-
 Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type,
                                                    TensorShape* shape) const {
   const Tensor& tensor = context_->input(index);
   const XlaExpression* expression = CastExpressionFromTensor(tensor);
-  XlaVariable* variable = expression->variable();
+  XlaResource* variable = expression->resource();
   TF_RET_CHECK(variable != nullptr);
+  TF_RET_CHECK(variable->kind == XlaResource::kVariable);
   if (variable->value.handle() == 0) {
     return errors::InvalidArgument("Read of uninitialized variable ",
                                    variable->name);
@@ -287,7 +279,8 @@ Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type,
   if (!shape_or_status.ok()) {
     return shape_or_status.status();
   }
-  *shape = XLAShapeToTensorShape(*shape_or_status.ValueOrDie());
+  TF_RETURN_IF_ERROR(
+      XLAShapeToTensorShape(*shape_or_status.ValueOrDie(), shape));
   return Status::OK();
 }
 
@@ -304,10 +297,11 @@ void XlaOpKernelContext::SetOutput(int index,
   // The step's default allocator is the dummy XlaCompilationAllocator which
   // simply allocates a metadata buffer to hold the expression to which it
   // corresponds.
-  OP_REQUIRES_OK(
-      context_,
-      context_->allocate_output(
-          index, XLAShapeToTensorShape(*shape.ValueOrDie()), &output));
+  TensorShape tensor_shape;
+  OP_REQUIRES_OK(context_,
+                 XLAShapeToTensorShape(*shape.ValueOrDie(), &tensor_shape));
+  OP_REQUIRES_OK(context_,
+                 context_->allocate_output(index, tensor_shape, &output));
 
   // The expression is stored in the tensor's data buffer. Fill in the
   // fields now.
@@ -337,33 +331,34 @@ void XlaOpKernelContext::SetConstantOutput(int index, const Tensor& constant) {
   expression->set_constant_value(constant);
 }
 
-void XlaOpKernelContext::SetVariableOutput(int index, XlaVariable* variable) {
+void XlaOpKernelContext::SetResourceOutput(int index, XlaResource* resource) {
   Tensor* output = nullptr;
-  // The shape of the output tensor is the shape of the variable resource
-  // (i.e., a scalar), not the shape of the variable's value.
+  // The shape of the output tensor is the shape of the resource itself
+  // (i.e., a scalar), not the shape of the resource's value.
   OP_REQUIRES_OK(context_,
                  context_->allocate_output(index, TensorShape(), &output));
   XlaExpression* expression = CastExpressionFromUninitializedTensor(output);
-  expression->set_variable(variable);
+  expression->set_resource(resource);
 }
 
-Status XlaOpKernelContext::GetVariableInput(int index, XlaVariable** variable) {
+Status XlaOpKernelContext::GetResourceInput(int index, XlaResource** resource) {
   const XlaExpression* expression =
       CastExpressionFromTensor(context_->input(index));
-  TF_RET_CHECK(expression->variable() != nullptr);
-  *variable = expression->variable();
+  TF_RET_CHECK(expression->resource() != nullptr);
+  *resource = expression->resource();
   return Status::OK();
 }
 
 Status XlaOpKernelContext::AssignVariable(
-    int index, DataType type, const xla::ComputationDataHandle& handle) {
+    int input_index, DataType type, const xla::ComputationDataHandle& handle) {
   TF_RET_CHECK(handle.handle() != 0);
   SetOpHasSideEffects();
 
   const XlaExpression* expression =
-      CastExpressionFromTensor(context_->input(index));
-  XlaVariable* variable = expression->variable();
+      CastExpressionFromTensor(context_->input(input_index));
+  XlaResource* variable = expression->resource();
   TF_RET_CHECK(variable != nullptr);
+  TF_RET_CHECK(variable->kind == XlaResource::kVariable);
   if (!((variable->type == DT_INVALID && type != DT_INVALID) ||
         (variable->type == type))) {
     return errors::InvalidArgument(
@@ -398,11 +393,6 @@ const xla::Computation* XlaOpKernelContext::GetOrCreateAdd(
   return XlaContext::Get(context_).GetOrCreateAdd(type);
 }
 
-const xla::Computation* XlaOpKernelContext::GetOrCreateSigmoid(
-    const DataType type) {
-  return XlaContext::Get(context_).GetOrCreateSigmoid(type);
-}
-
 XlaOpKernel::XlaOpKernel(OpKernelConstruction* context) : OpKernel(context) {}
 
 void XlaOpKernel::Compute(OpKernelContext* context) {
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index a25774c3a6a4a7212d157766a23e73063c2deab8..30b794c8c198cae6bf3b11794b35049b729063e1 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -148,6 +148,12 @@ class XlaOpKernelContext {
 
   // Variables
 
+  // Sets '*resource' to the resource associated with input `index`.
+  Status GetResourceInput(int index, XlaResource** resource);
+
+  // Sets output 'index' to be a reference to resource 'resource'.
+  void SetResourceOutput(int index, XlaResource* resource);
+
   // Sets `*type` and `*shape` to the current type and shape of a variable's
   // value.
   Status GetVariableTypeAndShape(int index, DataType* type,
@@ -158,20 +164,10 @@ class XlaOpKernelContext {
   Status ReadVariableInput(int index, xla::ComputationDataHandle* value);
 
   // Assigns the value `handle` to the variable referenced by input
-  // `variable_index`. Marks the operator as having side effects.
-  Status AssignVariable(int variable_index, DataType type,
+  // `input_index`. Marks the operator as having side effects.
+  Status AssignVariable(int input_index, DataType type,
                         const xla::ComputationDataHandle& handle);
 
-  // Sets '*variable' to the variable associated with input `index`.
-  Status GetVariableInput(int index, XlaVariable** variable);
-
-  // Sets output 'index' to be a reference to variable 'variable'. Used
-  // to propagate resource variables through the compilation.
-  void SetVariableOutput(int index, XlaVariable* variable);
-
-  // Returns a human-readable debug string describing 'variable_index'.
-  string VariableDebugString(int variable_index);
-
   // Helper routines for the OP_REQUIRES macros
   void CtxFailure(Status s);
   void CtxFailureWithWarning(Status s);
@@ -205,11 +201,6 @@ class XlaOpKernelContext {
   // separate specialization of the computation for each DataType.
   const xla::Computation* GetOrCreateAdd(const DataType type);
 
-  // Get an XLA lambda to compute Sigmoid. This is cached in the
-  // XlaContext since it may be used by multiple Ops. There is a
-  // separate specialization of the computation for each DataType.
-  const xla::Computation* GetOrCreateSigmoid(const DataType type);
-
  private:
   OpKernelContext* const context_;
 };
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc
index 1bb0d8528994b957ccebeabce8bc48227122e366..d059c7a23ef2955cdd1280d1ceff7fc39b625631 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc
@@ -24,6 +24,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
@@ -34,11 +36,18 @@ const char* const DEVICE_GPU_XLA_JIT = "XLA_GPU_JIT";
 const char* const DEVICE_XLA_CPU = "XLA_CPU";
 const char* const DEVICE_XLA_GPU = "XLA_GPU";
 
-// Is platform 'id' supported by XLA?
-static bool IsPlatformSupported(perftools::gputools::Platform::Id id) {
-  auto platform = perftools::gputools::MultiPlatformManager::PlatformWithId(id);
-  if (!platform.ok()) return false;
-  return xla::ClientLibrary::GetOrCreateLocalClient(platform.ValueOrDie()).ok();
+static Status LaunchOpHasKernelForDevice(const DeviceType& device_type) {
+  const OpDef* op_def;
+  TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUpOpDef("_XlaLaunch", &op_def));
+  NodeDef node_def;
+  node_def.set_name("_XlaLaunch-op");
+  node_def.set_op("_XlaLaunch");
+  string kernel_class_name;
+  TF_RETURN_IF_ERROR(FindKernelDef(device_type, node_def, /*KernelDef*/ nullptr,
+                                   &kernel_class_name));
+  VLOG(1) << "LaunchOpHasKernelForDevice"
+          << " kernel_class_name: " << kernel_class_name;
+  return Status::OK();
 }
 
 XlaOpRegistry::XlaOpRegistry() = default;
@@ -75,7 +84,7 @@ XlaOpRegistry::~XlaOpRegistry() = default;
   // GetCompilationDevice is called.
   static void* registration_init = [&registry]() {
     mutex_lock lock(registry.mutex_);
-    if (IsPlatformSupported(perftools::gputools::host::kHostPlatformId)) {
+    if (LaunchOpHasKernelForDevice(DeviceType(DEVICE_CPU)).ok()) {
       DeviceRegistration& registration =
           registry.compilation_devices_[DEVICE_CPU];
       registration.compilation_device_name = DEVICE_CPU_XLA_JIT;
@@ -83,7 +92,7 @@ XlaOpRegistry::~XlaOpRegistry() = default;
       registration.enable_jit_by_default = false;
       registration.compile_resource_ops = false;
     }
-    if (IsPlatformSupported(perftools::gputools::cuda::kCudaPlatformId)) {
+    if (LaunchOpHasKernelForDevice(DeviceType(DEVICE_GPU)).ok()) {
       DeviceRegistration& registration =
           registry.compilation_devices_[DEVICE_GPU];
       registration.compilation_device_name = DEVICE_GPU_XLA_JIT;
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index 9a39cc96754fe8fef2c52e5de9626bcad30bf483..47d61a21a13b35050dff3d95c3856ee3f356f3c7 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -45,9 +45,10 @@ extern const char* const DEVICE_XLA_CPU;
 extern const char* const DEVICE_XLA_GPU;
 
 constexpr std::array<DataType, 2> kIntTypes = {{DT_INT32, DT_INT64}};
-constexpr std::array<DataType, 2> kFloatTypes = {{DT_FLOAT, DT_DOUBLE}};
-constexpr std::array<DataType, 4> kNumericTypes = {
-    {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE}};
+constexpr std::array<DataType, 3> kFloatTypes = {
+    {DT_HALF, DT_FLOAT, DT_DOUBLE}};
+constexpr std::array<DataType, 5> kNumericTypes = {
+    {DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE}};
 
 constexpr std::array<DataType, 5> kCpuAllTypes = {
     {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE, DT_BOOL}};
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 2491cc3f7a2011827f4e093f287b525155153b71..e0a03a78f1d847ee03e136d46bdb28b0a085dc4c 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -46,21 +46,18 @@ xla_proto_library(
     ],
 )
 
-# This is a headers target that extra XLA devices can use to prevent
-# circular dependencies.  Devices that are compiled as separate shared
-# objects can also use it to prevent linking of library code.
-cc_header_only_library(
-    name = "xla_headers_lib",
-    visibility = ["//visibility:public"],
+cc_library(
+    name = "execution_options_util",
+    srcs = [
+        "execution_options_util.cc",
+    ],
+    hdrs = [
+        "execution_options_util.h",
+    ],
+    visibility = [":friends"],
     deps = [
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/legacy_flags:layout_util_flags",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_evaluator",
-        "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/core:stream_executor_headers_lib",
+        ":xla_proto",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
     ],
 )
 
@@ -135,7 +132,10 @@ cc_library(
 cc_library(
     name = "statusor",
     srcs = ["statusor.cc"],
-    hdrs = ["statusor.h"],
+    hdrs = [
+        "statusor.h",
+        "statusor_internals.h",
+    ],
     visibility = ["//visibility:public"],
     deps = [
         ":status",
@@ -171,7 +171,6 @@ cc_library(
         ":status",
         ":types",
         ":xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:util_flags",
         "//tensorflow/core:lib",
     ],
 )
@@ -226,7 +225,6 @@ cc_library(
         ":types",
         ":util",
         ":xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:layout_util_flags",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:regexp_internal",
@@ -256,7 +254,6 @@ cc_test(
         ":shape_util",
         ":test",
         ":test_helpers",
-        "//tensorflow/compiler/xla/legacy_flags:layout_util_flags",
         "//tensorflow/core:test_main",
     ],
 )
@@ -577,6 +574,7 @@ cc_test(
     srcs = ["reference_util_test.cc"],
     deps = [
         ":array2d",
+        ":array3d",
         ":array4d",
         ":literal_util",
         ":reference_util",
@@ -602,3 +600,17 @@ filegroup(
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
+
+# This is a headers target that extra XLA devices can use to prevent circular dependencies.  Devices that are compiled as separate shared objects can also use it to prevent linking of library code.
+cc_header_only_library(
+    name = "xla_headers_lib",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":xla_data_proto",
+        ":xla_proto",
+        "//tensorflow/compiler/xla/client:client_library",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:stream_executor_headers_lib",
+    ],
+)
diff --git a/tensorflow/compiler/xla/array4d.h b/tensorflow/compiler/xla/array4d.h
index d93f968f4d7a8c30129f4e14c4db06c25187cb45..4c7fce1aaf1faf4bd08bca38bc8eb2b47303b575 100644
--- a/tensorflow/compiler/xla/array4d.h
+++ b/tensorflow/compiler/xla/array4d.h
@@ -207,6 +207,18 @@ class Array4D {
     }
   }
 
+  // Invokes a callback with the (indices, value) for each cell in the 4D array.
+  void Each(
+      std::function<void(tensorflow::gtl::ArraySlice<int64>, T)> f) const {
+    // We const_cast to be able to use the common non-const implementation,
+    // but prevent modification of the data by passing it by-value to the
+    // caller.
+    const_cast<Array4D*>(this)->Each(
+        [&f](tensorflow::gtl::ArraySlice<int64> indices, T* value) {
+          f(indices, *value);
+        });
+  }
+
   // Fills all of the {p,z} with the array provided, which specifies {y,x}.
   void FillWithYX(const Array2D<T>& value) {
     CHECK_EQ(value.height(), height());
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index 63c6d9ddaca5e9e336e29cd3b23cfd921d4ce9e7..a998b91c89d79ac5e354d2a3edf5fb78695d73cb 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -62,6 +62,7 @@ cc_library(
     deps = [
         ":computation",
         ":global_data",
+        "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:service_interface",
         "//tensorflow/compiler/xla:status_macros",
@@ -70,6 +71,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:session_proto",
         "//tensorflow/core:lib",
     ],
@@ -114,7 +116,6 @@ cc_library(
         "//tensorflow/compiler/xla/service:compile_only_service",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
-        "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "@llvm//:support",
     ],
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index 454d0fbd9650c4d77a62b4c25a5407e36bd191f8..1799bbd3480daacc204b42f168a7f8e9149db58b 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -197,7 +199,10 @@ StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
     ExecutionProfile* execution_profile) {
   ExecuteRequest request;
   *request.mutable_computation() = computation.handle();
-  if (execution_options != nullptr) {
+
+  if (execution_options == nullptr) {
+    *request.mutable_execution_options() = CreateDefaultExecutionOptions();
+  } else {
     *request.mutable_execution_options() = *execution_options;
   }
   for (GlobalData* argument : arguments) {
@@ -298,7 +303,9 @@ StatusOr<ExecutionHandle> Client::ExecuteAsync(
   for (GlobalData* argument : arguments) {
     *request.add_arguments() = argument->handle();
   }
-  if (execution_options != nullptr) {
+  if (execution_options == nullptr) {
+    *request.mutable_execution_options() = CreateDefaultExecutionOptions();
+  } else {
     *request.mutable_execution_options() = *execution_options;
   }
 
@@ -376,9 +383,10 @@ StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::DeconstructTuple(
 }
 
 StatusOr<ComputationStats> Client::GetComputationStats(
-    const Computation& computation) const {
+    const Computation& computation, const DebugOptions& debug_options) const {
   ComputationStatsRequest request;
   *request.mutable_computation() = computation.handle();
+  *request.mutable_debug_options() = debug_options;
   ComputationStatsResponse response;
 
   VLOG(1) << "making computation stats request";
@@ -427,7 +435,10 @@ StatusOr<Shape> Client::GetShape(const GlobalData& data) {
 
 StatusOr<string> Client::ExecutionStatsAsString(
     const Computation& computation, const ExecutionProfile& profile) {
-  TF_ASSIGN_OR_RETURN(auto computation_stats, GetComputationStats(computation));
+  TF_ASSIGN_OR_RETURN(
+      auto computation_stats,
+      GetComputationStats(computation,
+                          legacy_flags::GetDebugOptionsFromFlags()));
   int64 total_flops =
       computation_stats.flop_count() + computation_stats.transcendental_count();
   if (profile.compute_time_ns() > 0) {
diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h
index 797835160fa2850f108e85ff3147abffd9f86ad8..69d3642911fa8fe87ceb347d929e95ffd972615b 100644
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@@ -150,7 +150,7 @@ class Client {
 
   // Retrieves the statistics of the given computation.
   StatusOr<ComputationStats> GetComputationStats(
-      const Computation& computation) const;
+      const Computation& computation, const DebugOptions& debug_options) const;
 
   // Returns the Shape of the given array specified by 'data'. The shape
   // includes the Layout of the array as it is stored on the service.
diff --git a/tensorflow/compiler/xla/client/client_library.cc b/tensorflow/compiler/xla/client/client_library.cc
index 8238261e1c90cadeda9005e437d684d3770bd67b..b1663bc815719c3da75b37593ac665b1f3493db8 100644
--- a/tensorflow/compiler/xla/client/client_library.cc
+++ b/tensorflow/compiler/xla/client/client_library.cc
@@ -23,6 +23,13 @@ limitations under the License.
 
 namespace xla {
 
+LocalClientOptions::LocalClientOptions(perftools::gputools::Platform* platform,
+                                       int number_of_replicas,
+                                       int intra_op_parallelism_threads)
+    : platform_(platform),
+      number_of_replicas_(number_of_replicas),
+      intra_op_parallelism_threads_(intra_op_parallelism_threads) {}
+
 LocalClientOptions& LocalClientOptions::set_platform(
     perftools::gputools::Platform* platform) {
   platform_ = platform;
@@ -142,4 +149,12 @@ ClientLibrary::GetOrCreateCompileOnlyClient(
   return cl;
 }
 
+/* static */ void ClientLibrary::DestroyLocalInstances() {
+  ClientLibrary& client_library = Singleton();
+  tensorflow::mutex_lock lock(client_library.service_mutex_);
+
+  client_library.local_instances_.clear();
+  client_library.compile_only_instances_.clear();
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/client_library.h b/tensorflow/compiler/xla/client/client_library.h
index 3ddd235d0efeeb78f49eafbf670d7c74a88960dd..a6f30d82e43587135697e76e8bc7d122edc0f602 100644
--- a/tensorflow/compiler/xla/client/client_library.h
+++ b/tensorflow/compiler/xla/client/client_library.h
@@ -43,13 +43,16 @@ namespace xla {
 // Options to configure the local client when it is created.
 class LocalClientOptions {
  public:
+  LocalClientOptions(perftools::gputools::Platform* platform = nullptr,
+                     int number_of_replicas = 1,
+                     int intra_op_parallelism_threads = -1);
+
   // Set the platform backing the service, or nullptr for the default platform.
   LocalClientOptions& set_platform(perftools::gputools::Platform* platform);
   perftools::gputools::Platform* platform() const;
 
   // Set the number of replicas to use when compiling replicated
-  // programs. The default is -1 meaning that the value is read from
-  // the xla_replicas flag.
+  // programs.
   LocalClientOptions& set_number_of_replicas(int number_of_replicas);
   int number_of_replicas() const;
 
@@ -58,9 +61,9 @@ class LocalClientOptions {
   int intra_op_parallelism_threads() const;
 
  private:
-  perftools::gputools::Platform* platform_ = nullptr;
-  int number_of_replicas_ = -1;
-  int intra_op_parallelism_threads_ = -1;
+  perftools::gputools::Platform* platform_;
+  int number_of_replicas_;
+  int intra_op_parallelism_threads_;
 };
 
 class ClientLibrary {
@@ -90,6 +93,11 @@ class ClientLibrary {
   static StatusOr<CompileOnlyClient*> GetOrCreateCompileOnlyClient(
       perftools::gputools::Platform* platform = nullptr);
 
+  // Clears the local instance and compile only instance caches. The client
+  // pointers returned by the previous GetOrCreateLocalClient() or
+  // GetOrCreateCompileOnlyClient() invocations are not valid anymore.
+  static void DestroyLocalInstances();
+
  private:
   // Returns the singleton instance of ClientLibrary.
   static ClientLibrary& Singleton();
diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc
index 37bf697683b0f5f61a1b915628920b0752116a32..212bcd27d29d6e3c06362344bd370d5ef24d6f56 100644
--- a/tensorflow/compiler/xla/client/computation_builder.cc
+++ b/tensorflow/compiler/xla/client/computation_builder.cc
@@ -971,6 +971,16 @@ ComputationDataHandle ComputationBuilder::Sign(
   return UnaryOp(UNOP_SIGN, operand);
 }
 
+ComputationDataHandle ComputationBuilder::Cos(
+    const ComputationDataHandle& operand) {
+  return UnaryOp(UNOP_COS, operand);
+}
+
+ComputationDataHandle ComputationBuilder::Sin(
+    const ComputationDataHandle& operand) {
+  return UnaryOp(UNOP_SIN, operand);
+}
+
 ComputationDataHandle ComputationBuilder::Tanh(
     const ComputationDataHandle& operand) {
   return UnaryOp(UNOP_TANH, operand);
@@ -1411,6 +1421,72 @@ ComputationDataHandle ComputationBuilder::ReduceWindowWithGeneralPadding(
   return ParseOpResponse(s, &response);
 }
 
+ComputationDataHandle ComputationBuilder::BatchNormTraining(
+    const ComputationDataHandle& operand, const ComputationDataHandle& scale,
+    const ComputationDataHandle& offset, float epsilon, int64 feature_index) {
+  if (!first_error_.ok() || !PrepareComputation().ok()) {
+    return ComputationDataHandle();
+  }
+  BatchNormTrainingRequest request;
+  *request.mutable_operand() = operand;
+  *request.mutable_scale() = scale;
+  *request.mutable_offset() = offset;
+  request.set_epsilon(epsilon);
+  request.set_feature_index(feature_index);
+
+  OpRequest op_request;
+  *op_request.mutable_batch_norm_training_request() = request;
+  *op_request.mutable_computation() = computation_.handle();
+  AddOpMetadata(&op_request);
+
+  OpResponse response;
+
+  VLOG(2) << "making BatchNormTraining request";
+
+  Status s = client_->stub()->Op(&op_request, &response);
+  return ParseOpResponse(s, &response);
+}
+
+ComputationDataHandle ComputationBuilder::BatchNormInference(
+    const ComputationDataHandle& operand, const ComputationDataHandle& scale,
+    const ComputationDataHandle& offset, const ComputationDataHandle& mean,
+    const ComputationDataHandle& variance, float epsilon, int64 feature_index) {
+  // TODO(b/62843645): Implement BatchNormInference.
+  NoteError(Unimplemented("BatchNormInference is not implemented yet."));
+  return ComputationDataHandle();
+}
+
+ComputationDataHandle ComputationBuilder::BatchNormGrad(
+    const ComputationDataHandle& operand, const ComputationDataHandle& scale,
+    const ComputationDataHandle& mean, const ComputationDataHandle& var,
+    const ComputationDataHandle& grad_output, float epsilon,
+    int64 feature_index) {
+  if (!first_error_.ok() || !PrepareComputation().ok()) {
+    return ComputationDataHandle();
+  }
+  BatchNormGradRequest request;
+  *request.mutable_operand() = operand;
+  *request.mutable_scale() = scale;
+  *request.mutable_mean() = mean;
+  *request.mutable_variance() = var;
+  *request.mutable_grad_output() = grad_output;
+  request.set_epsilon(epsilon);
+  request.set_feature_index(feature_index);
+
+  OpRequest op_request;
+  *op_request.mutable_batch_norm_grad_request() = request;
+  *op_request.mutable_computation() = computation_.handle();
+  AddOpMetadata(&op_request);
+
+  OpResponse response;
+
+  VLOG(2) << "making BatchNormGrad request";
+
+  Status s = client_->stub()->Op(&op_request, &response);
+
+  return ParseOpResponse(s, &response);
+}
+
 ComputationDataHandle ComputationBuilder::CrossReplicaSum(
     const ComputationDataHandle& operand) {
   if (!first_error_.ok() || !PrepareComputation().ok()) {
@@ -1487,6 +1563,28 @@ ComputationDataHandle ComputationBuilder::SelectAndScatterWithGeneralPadding(
   return ParseOpResponse(s, &response);
 }
 
+ComputationDataHandle ComputationBuilder::ReducePrecision(
+    const ComputationDataHandle& operand, const int exponent_bits,
+    const int mantissa_bits) {
+  if (!first_error_.ok() || !PrepareComputation().ok()) {
+    return ComputationDataHandle();
+  }
+
+  ReducePrecisionRequest request;
+  *request.mutable_operand() = operand;
+  request.set_exponent_bits(exponent_bits);
+  request.set_mantissa_bits(mantissa_bits);
+  OpRequest op_request;
+  *op_request.mutable_computation() = computation_.handle();
+  *op_request.mutable_reduce_precision_request() = request;
+  AddOpMetadata(&op_request);
+  OpResponse response;
+
+  VLOG(2) << "making reduce-precision request";
+  Status s = client_->stub()->Op(&op_request, &response);
+  return ParseOpResponse(s, &response);
+}
+
 void ComputationBuilder::Send(const ComputationDataHandle& operand,
                               const ChannelHandle& handle) {
   if (!first_error_.ok() || !PrepareComputation().ok()) {
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index 5cc73c28d03a097a4fd5b8d3a549ffdc43c6fcd3..94602bd473ffb138d29ca8df86388fe88cf5f312 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -510,6 +510,12 @@ class ComputationBuilder {
   // Enqueues a sign instruction onto the computation.
   ComputationDataHandle Sign(const ComputationDataHandle& operand);
 
+  // Enqueues a cosine instruction onto the computation.
+  ComputationDataHandle Cos(const ComputationDataHandle& operand);
+
+  // Enqueues a sine instruction onto the computation.
+  ComputationDataHandle Sin(const ComputationDataHandle& operand);
+
   // Enqueues a tanh instruction onto the computation.
   ComputationDataHandle Tanh(const ComputationDataHandle& operand);
 
@@ -597,6 +603,11 @@ class ComputationBuilder {
                               const Computation& body,
                               const ComputationDataHandle& init);
 
+  // Enqueues a ReducePrecision node onto the computation.
+  ComputationDataHandle ReducePrecision(const ComputationDataHandle& operand,
+                                        const int exponent_bits,
+                                        const int mantissa_bits);
+
   // Enqueues a Send node onto the computation, to send the given operand to
   // a Recv instruction that shares the same channel handle.
   void Send(const ComputationDataHandle& operand, const ChannelHandle& handle);
@@ -820,87 +831,80 @@ class ComputationBuilder {
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantR0(NativeT value) {
-  return ConstantOp(
-      [value](Literal* literal) { LiteralUtil::PopulateR0(value, literal); });
+  return ConstantOp([value](Literal* literal) { literal->PopulateR0(value); });
 }
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantR1(
     tensorflow::gtl::ArraySlice<NativeT> values) {
-  return ConstantOp([&values](Literal* literal) {
-    LiteralUtil::PopulateR1(values, literal);
-  });
+  return ConstantOp(
+      [&values](Literal* literal) { literal->PopulateR1(values); });
 }
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantR1(int64 length,
                                                      NativeT value) {
   return ConstantOp([length, value](Literal* literal) {
-    LiteralUtil::PopulateWithValue(value, {length}, literal);
+    literal->PopulateWithValue(value, {length});
   });
 }
 
 inline ComputationDataHandle ComputationBuilder::ConstantR1(
     const tensorflow::core::Bitmap& values) {
-  return ConstantOp([&values](Literal* literal) {
-    LiteralUtil::PopulateR1(values, literal);
-  });
+  return ConstantOp(
+      [&values](Literal* literal) { literal->PopulateR1(values); });
 }
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantR2(
     std::initializer_list<std::initializer_list<NativeT>> values) {
-  return ConstantOp([&values](Literal* literal) {
-    LiteralUtil::PopulateR2(values, literal);
-  });
+  return ConstantOp(
+      [&values](Literal* literal) { literal->PopulateR2(values); });
 }
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantR2FromArray2DWithLayout(
     const Array2D<NativeT>& values, const Layout& layout) {
   return ConstantOp([&values, &layout](Literal* literal) {
-    LiteralUtil::PopulateR2FromArray2DWithLayout(values, layout, literal);
+    literal->PopulateR2FromArray2DWithLayout(values, layout);
   });
 }
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantR2FromArray2D(
     const Array2D<NativeT>& values) {
-  return ConstantOp([&values](Literal* literal) {
-    LiteralUtil::PopulateR2FromArray2D(values, literal);
-  });
+  return ConstantOp(
+      [&values](Literal* literal) { literal->PopulateR2FromArray2D(values); });
 }
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantR3FromArray3DWithLayout(
     const Array3D<NativeT>& values, const Layout& layout) {
   return ConstantOp([&values, &layout](Literal* literal) {
-    LiteralUtil::PopulateR3FromArray3DWithLayout(values, layout, literal);
+    literal->PopulateR3FromArray3DWithLayout(values, layout);
   });
 }
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantR3FromArray3D(
     const Array3D<NativeT>& values) {
-  return ConstantOp([&values](Literal* literal) {
-    LiteralUtil::PopulateR3FromArray3D(values, literal);
-  });
+  return ConstantOp(
+      [&values](Literal* literal) { literal->PopulateR3FromArray3D(values); });
 }
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantR4FromArray4DWithLayout(
     const Array4D<NativeT>& values, const Layout& layout) {
   return ConstantOp([&values, &layout](Literal* literal) {
-    LiteralUtil::PopulateR4FromArray4DWithLayout(values, layout, literal);
+    literal->PopulateR4FromArray4DWithLayout(values, layout);
   });
 }
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantR4FromArray4D(
     const Array4D<NativeT>& values) {
-  return ConstantOp([&values](Literal* literal) {
-    LiteralUtil::PopulateR4FromArray4D(values, literal);
-  });
+  return ConstantOp(
+      [&values](Literal* literal) { literal->PopulateR4FromArray4D(values); });
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index 86b16be62f041ae3e96591627501592b34203e16..ee3468208792879c3fe4ff5860e434ef5a0c0155 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -24,6 +24,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/core:lib",
     ],
 )
 
@@ -32,6 +33,7 @@ cc_library(
     srcs = ["testing.cc"],
     hdrs = ["testing.h"],
     deps = [
+        "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc
index a45974b86b67c14868fcfe9c5f8a43445a35807e..969b0eee1d195a36728f16a598add4b3b850ed60 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.cc
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc
@@ -22,65 +22,85 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
+namespace {
+using InstructionGenerator =
+    ComputationDataHandle (*)(ComputationBuilder*, const ComputationDataHandle&,
+                              const ComputationDataHandle&);
+
+Computation CreateScalarComputation(const string& name, PrimitiveType type,
+                                    ComputationBuilder* builder,
+                                    InstructionGenerator generator) {
+  std::unique_ptr<ComputationBuilder> b;
+  if (type == PRED) {
+    b = builder->CreateSubBuilder(name);
+  } else {
+    b = builder->CreateSubBuilder(
+        tensorflow::strings::StrCat(name, "_", PrimitiveType_Name(type)));
+  }
 
-Computation CreateScalarAddComputation(PrimitiveType type,
-                                       ComputationBuilder* builder) {
   const Shape scalar = ShapeUtil::MakeShape(type, {});
-  auto b = builder->CreateSubBuilder("add_" + PrimitiveType_Name(type));
   auto lhs = b->Parameter(0, scalar, "lhs");
   auto rhs = b->Parameter(1, scalar, "rhs");
-  b->Add(lhs, rhs);
+  generator(b.get(), lhs, rhs);
   return b->BuildAndNoteError();
 }
+}  // namespace
+
+Computation CreateScalarAddComputation(PrimitiveType type,
+                                       ComputationBuilder* builder) {
+  return CreateScalarComputation(
+      "add", type, builder,
+      [](ComputationBuilder* b, const ComputationDataHandle& lhs,
+         const ComputationDataHandle& rhs) { return b->Add(lhs, rhs); });
+}
+
+Computation CreateScalarMultiplyComputation(PrimitiveType type,
+                                            ComputationBuilder* builder) {
+  return CreateScalarComputation(
+      "add", type, builder,
+      [](ComputationBuilder* b, const ComputationDataHandle& lhs,
+         const ComputationDataHandle& rhs) { return b->Mul(lhs, rhs); });
+}
 
 Computation CreateScalarGeComputation(PrimitiveType type,
                                       ComputationBuilder* builder) {
-  const Shape scalar = ShapeUtil::MakeShape(type, {});
-  auto b = builder->CreateSubBuilder("ge_" + PrimitiveType_Name(type));
-  auto lhs = b->Parameter(0, scalar, "lhs");
-  auto rhs = b->Parameter(1, scalar, "rhs");
-  b->Ge(lhs, rhs);
-  return b->BuildAndNoteError();
+  return CreateScalarComputation(
+      "ge", type, builder,
+      [](ComputationBuilder* b, const ComputationDataHandle& lhs,
+         const ComputationDataHandle& rhs) { return b->Ge(lhs, rhs); });
 }
 
 Computation CreateScalarMaxComputation(PrimitiveType type,
                                        ComputationBuilder* builder) {
-  const Shape scalar = ShapeUtil::MakeShape(type, {});
-  auto b = builder->CreateSubBuilder("max_" + PrimitiveType_Name(type));
-  auto lhs = b->Parameter(0, scalar, "lhs");
-  auto rhs = b->Parameter(1, scalar, "rhs");
-  b->Max(lhs, rhs);
-  return b->BuildAndNoteError();
+  return CreateScalarComputation(
+      "max", type, builder,
+      [](ComputationBuilder* b, const ComputationDataHandle& lhs,
+         const ComputationDataHandle& rhs) { return b->Max(lhs, rhs); });
 }
 
 Computation CreateScalarMinComputation(PrimitiveType type,
                                        ComputationBuilder* builder) {
-  const Shape scalar = ShapeUtil::MakeShape(type, {});
-  auto b = builder->CreateSubBuilder("min_" + PrimitiveType_Name(type));
-  auto lhs = b->Parameter(0, scalar, "lhs");
-  auto rhs = b->Parameter(1, scalar, "rhs");
-  b->Min(lhs, rhs);
-  return b->BuildAndNoteError();
+  return CreateScalarComputation(
+      "min", type, builder,
+      [](ComputationBuilder* b, const ComputationDataHandle& lhs,
+         const ComputationDataHandle& rhs) { return b->Min(lhs, rhs); });
 }
 
 Computation CreateScalarLogicalAndComputation(ComputationBuilder* builder) {
-  const Shape scalar = ShapeUtil::MakeShape(PRED, {});
-  auto b = builder->CreateSubBuilder("logical_and");
-  auto lhs = b->Parameter(0, scalar, "lhs");
-  auto rhs = b->Parameter(1, scalar, "rhs");
-  b->LogicalAnd(lhs, rhs);
-  return b->BuildAndNoteError();
+  return CreateScalarComputation(
+      "logical_and", PRED, builder,
+      [](ComputationBuilder* b, const ComputationDataHandle& lhs,
+         const ComputationDataHandle& rhs) { return b->LogicalAnd(lhs, rhs); });
 }
 
 Computation CreateScalarLogicalOrComputation(ComputationBuilder* builder) {
-  const Shape scalar = ShapeUtil::MakeShape(PRED, {});
-  auto b = builder->CreateSubBuilder("logical_or");
-  auto lhs = b->Parameter(0, scalar, "lhs");
-  auto rhs = b->Parameter(1, scalar, "rhs");
-  b->LogicalOr(lhs, rhs);
-  return b->BuildAndNoteError();
+  return CreateScalarComputation(
+      "logical_or", PRED, builder,
+      [](ComputationBuilder* b, const ComputationDataHandle& lhs,
+         const ComputationDataHandle& rhs) { return b->LogicalOr(lhs, rhs); });
 }
 
 StatusOr<ComputationDataHandle> Any(const ComputationDataHandle& predicates,
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.h b/tensorflow/compiler/xla/client/lib/arithmetic.h
index 633086a2e7e4609543c465c9f52dc452ce3fabb3..f43d35fe4a52016d4054af28835d6b66a35217d4 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.h
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.h
@@ -28,6 +28,10 @@ namespace xla {
 Computation CreateScalarAddComputation(PrimitiveType type,
                                        ComputationBuilder* builder);
 
+// Creates a scalar multiply computation and returns it.
+Computation CreateScalarMultiplyComputation(PrimitiveType type,
+                                            ComputationBuilder* builder);
+
 // Creates a scalar ge computation and returns it.
 Computation CreateScalarGeComputation(PrimitiveType type,
                                       ComputationBuilder* builder);
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index daa1557df0b97ee20679f45b8d54164ca93555fa..d8bfc945807d061234c1bc5999ea377a72e85a62 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -34,11 +35,11 @@ std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
       client,
       tensorflow::strings::StrCat("make_fake_", ShapeUtil::HumanString(shape)));
   // TODO(b/26811613): Replace this when RNG is supported on all backends.
-  b.Broadcast(b.ConstantLiteral(LiteralUtil::One(shape.element_type())),
+  b.Broadcast(b.ConstantLiteral(Literal::One(shape.element_type())),
               AsInt64Slice(shape.dimensions()));
   Computation computation = b.Build().ConsumeValueOrDie();
 
-  ExecutionOptions execution_options;
+  auto execution_options = CreateDefaultExecutionOptions();
   *execution_options.mutable_shape_with_output_layout() = shape;
   return client->Execute(computation, /*arguments=*/{}, &execution_options)
       .ConsumeValueOrDie();
diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc
index 67f3a6c1df4d74e5ef714dcaa56bae1e81f8276a..33d5b6f1d4d15d5143a3421c87eab9b7a7d11345 100644
--- a/tensorflow/compiler/xla/executable_run_options.cc
+++ b/tensorflow/compiler/xla/executable_run_options.cc
@@ -77,4 +77,14 @@ ExecutionProfile* ExecutableRunOptions::execution_profile() const {
   return execution_profile_;
 }
 
+ExecutableRunOptions& ExecutableRunOptions::set_device_assignment(
+    DeviceAssignment* device_assignment) {
+  device_assignment_ = device_assignment;
+  return *this;
+}
+
+DeviceAssignment* ExecutableRunOptions::device_assignment() const {
+  return device_assignment_;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index 03f2d016ad07b63e6b7d9681c86885ce947f5319..deb3ddb203d263d25bef0499a8a53a6098d0de0c 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -40,6 +40,7 @@ struct ThreadPoolDevice;
 namespace xla {
 
 class DeviceMemoryAllocator;
+class DeviceAssignment;
 class ExecutionProfile;
 
 // Class containing options for running a LocalExecutable.
@@ -79,9 +80,14 @@ class ExecutableRunOptions {
   ExecutionProfile* execution_profile() const;
   ExecutableRunOptions& set_execution_profile(ExecutionProfile* profile);
 
+  ExecutableRunOptions& set_device_assignment(
+      DeviceAssignment* device_assignment);
+  DeviceAssignment* device_assignment() const;
+
  private:
   DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
+  DeviceAssignment* device_assignment_ = nullptr;
   perftools::gputools::Stream* stream_ = nullptr;
   tensorflow::thread::ThreadPool* inter_op_thread_pool_ = nullptr;
   const Eigen::ThreadPoolDevice* intra_op_thread_pool_ = nullptr;
diff --git a/tensorflow/tensorboard/components/tf_color_scale/tf-color-scale.html b/tensorflow/compiler/xla/execution_options_util.cc
similarity index 50%
rename from tensorflow/tensorboard/components/tf_color_scale/tf-color-scale.html
rename to tensorflow/compiler/xla/execution_options_util.cc
index a325f0a04cd033dd89b870a2fc6eca9a7a6f0020..e83ff7cddd675197c7f6d7018257edb4c25b6228 100644
--- a/tensorflow/tensorboard/components/tf_color_scale/tf-color-scale.html
+++ b/tensorflow/compiler/xla/execution_options_util.cc
@@ -1,6 +1,4 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,19 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
--->
+==============================================================================*/
+#include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-imports/d3.html">
+namespace xla {
 
-<!--
-tf-color-scale is a plumbing component that takes in an array of runs, and produces
-an upward-bindable outColorScale, which is a color scale mapping from those runs to
-a set of colors.
+ExecutionOptions CreateDefaultExecutionOptions() {
+  ExecutionOptions execution_options;
+  *(execution_options.mutable_debug_options()) =
+      legacy_flags::GetDebugOptionsFromFlags();
+  return execution_options;
+}
 
-@element tf-color-scale
--->
-<dom-module id="tf-color-scale">
-  <script src="palettes.js"></script>
-  <script src="colorScale.js"></script>
-</dom-module>
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/execution_options_util.h b/tensorflow/compiler/xla/execution_options_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..562da78e837ea6c4a01f0d1170797340fd421ad8
--- /dev/null
+++ b/tensorflow/compiler/xla/execution_options_util.h
@@ -0,0 +1,29 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_EXECUTION_OPTIONS_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_EXECUTION_OPTIONS_UTIL_H_
+
+#include "tensorflow/compiler/xla/xla.pb.h"
+
+namespace xla {
+
+// Create a default ExecutionOptions proto; this proto has its debug options
+// popupated to the default values taken from flags.
+ExecutionOptions CreateDefaultExecutionOptions();
+
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_EXECUTION_OPTIONS_UTIL_H_
diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc
index 119c4e373f7c52993f6dbbdfe1554d818746ed1d..35a563bf22701b50c6bfed9193f8b17ffcb1ca90 100644
--- a/tensorflow/compiler/xla/layout_util.cc
+++ b/tensorflow/compiler/xla/layout_util.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/compiler/xla/legacy_flags/layout_util_flags.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -39,35 +38,17 @@ limitations under the License.
 namespace xla {
 namespace {
 
-using DimensionOrder = legacy_flags::DefaultLayout::DimensionOrder;
-
 // Internal helper for GetDefaultLayoutForShape and SetToDefaultLayout. Sets
 // minor_to_major to the value that represents the default layout.
 void SetDefaultLayoutToContainer(
     tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
         minor_to_major) {
+  // The default XLA layout is major-to-minor (dim 0 is major).
+  // For more information on XLA layouts, see:
+  // https://www.tensorflow.org/performance/xla/shapes
   const int64 size = minor_to_major->size();
-  legacy_flags::LayoutUtilFlags* flags = legacy_flags::GetLayoutUtilFlags();
-  auto default_layout = flags->xla_default_layout;
-  switch (default_layout.dimension_order) {
-    case DimensionOrder::kMajorToMinor:
-      for (int64 i = 0; i < size; ++i) {
-        minor_to_major->Set(i, size - 1 - i);
-      }
-      break;
-    case DimensionOrder::kMinorToMajor:
-      for (int64 i = 0; i < size; ++i) {
-        minor_to_major->Set(i, i);
-      }
-      break;
-    case DimensionOrder::kRandom:
-      for (int64 i = 0; i < size; ++i) {
-        minor_to_major->Set(i, i);
-      }
-      std::shuffle(
-          minor_to_major->begin(), minor_to_major->end(),
-          std::mt19937(default_layout.seed != 0 ? default_layout.seed
-                                                : std::random_device()()));
+  for (int64 i = 0; i < size; ++i) {
+    minor_to_major->Set(i, size - 1 - i);
   }
 }
 
diff --git a/tensorflow/compiler/xla/layout_util_test.cc b/tensorflow/compiler/xla/layout_util_test.cc
index d3fcccff654fbbafa0b3c6a3d900123691f059fb..331bb9afa94e9e7c97d9c880dbac31c60ac0da18 100644
--- a/tensorflow/compiler/xla/layout_util_test.cc
+++ b/tensorflow/compiler/xla/layout_util_test.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/layout_util_flags.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
@@ -210,13 +209,6 @@ TEST_F(LayoutUtilTest, IsPadded) {
 }
 
 TEST_F(LayoutUtilTest, DefaultLayoutGettersMajorToMinor) {
-  // Test that LayoutUtil returns expected layouts when the xla_default_layout
-  // flag is set to kMajorToMinor.
-  legacy_flags::LayoutUtilFlags* flags = legacy_flags::GetLayoutUtilFlags();
-  flags->xla_default_layout = xla::legacy_flags::DefaultLayout{
-      .dimension_order =
-          legacy_flags::DefaultLayout::DimensionOrder::kMajorToMinor};
-
   EXPECT_TRUE(LayoutUtil::Equal(LayoutUtil::MakeLayout({1, 0}),
                                 LayoutUtil::GetDefaultLayoutForR2()));
   EXPECT_TRUE(LayoutUtil::Equal(LayoutUtil::MakeLayout({2, 1, 0}),
@@ -229,25 +221,5 @@ TEST_F(LayoutUtilTest, DefaultLayoutGettersMajorToMinor) {
                             ShapeUtil::MakeShape(F32, {10, 20, 30, 15, 25}))));
 }
 
-TEST_F(LayoutUtilTest, DefaultLayoutGettersMinorToMajor) {
-  // Test that LayoutUtil returns expected layouts when the xla_default_layout
-  // flag is set to kMinorToMajor.
-  legacy_flags::LayoutUtilFlags* flags = legacy_flags::GetLayoutUtilFlags();
-  flags->xla_default_layout = xla::legacy_flags::DefaultLayout{
-      .dimension_order =
-          legacy_flags::DefaultLayout::DimensionOrder::kMinorToMajor};
-
-  EXPECT_TRUE(LayoutUtil::Equal(LayoutUtil::MakeLayout({0, 1}),
-                                LayoutUtil::GetDefaultLayoutForR2()));
-  EXPECT_TRUE(LayoutUtil::Equal(LayoutUtil::MakeLayout({0, 1, 2}),
-                                LayoutUtil::GetDefaultLayoutForR3()));
-  EXPECT_TRUE(LayoutUtil::Equal(LayoutUtil::MakeLayout({0, 1, 2, 3}),
-                                LayoutUtil::GetDefaultLayoutForR4()));
-  EXPECT_TRUE(
-      LayoutUtil::Equal(LayoutUtil::MakeLayout({0, 1, 2, 3, 4}),
-                        LayoutUtil::GetDefaultLayoutForShape(
-                            ShapeUtil::MakeShape(F32, {10, 20, 30, 15, 25}))));
-}
-
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/BUILD b/tensorflow/compiler/xla/legacy_flags/BUILD
index a147ce67a28884d485280b4d811875d569fad879..b47c82f075a1b71dd355bd86ae7200360ab0f388 100644
--- a/tensorflow/compiler/xla/legacy_flags/BUILD
+++ b/tensorflow/compiler/xla/legacy_flags/BUILD
@@ -41,31 +41,6 @@ cc_test(
         ],
 )
 
-cc_library(
-    name = "layout_util_flags",
-    srcs = ["layout_util_flags.cc"],
-    hdrs = ["layout_util_flags.h"],
-    deps =
-        [
-            ":parse_flags_from_env",
-            "//tensorflow/compiler/xla:types",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-        ],
-)
-
-cc_library(
-    name = "util_flags",
-    srcs = ["util_flags.cc"],
-    hdrs = ["util_flags.h"],
-    deps =
-        [
-            ":parse_flags_from_env",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-        ],
-)
-
 cc_library(
     name = "debug_options_flags",
     srcs = ["debug_options_flags.cc"],
@@ -73,188 +48,12 @@ cc_library(
     deps =
         [
             ":parse_flags_from_env",
-            "//tensorflow/compiler/xla:types",
             "//tensorflow/compiler/xla:xla_proto",
             "//tensorflow/core:framework_internal",
             "//tensorflow/core:lib",
         ],
 )
 
-cc_library(
-    name = "cpu_compiler_flags",
-    srcs = ["cpu_compiler_flags.cc"],
-    hdrs = ["cpu_compiler_flags.h"],
-    deps =
-        [
-            ":parse_flags_from_env",
-            "//tensorflow/compiler/xla:types",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-        ],
-)
-
-cc_library(
-    name = "cpu_runtime_flags",
-    srcs = ["cpu_runtime_flags.cc"],
-    hdrs = ["cpu_runtime_flags.h"],
-    deps =
-        [
-            ":parse_flags_from_env",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-        ],
-)
-
-cc_library(
-    name = "compiler_functor_flags",
-    srcs = ["compiler_functor_flags.cc"],
-    hdrs = ["compiler_functor_flags.h"],
-    deps = [
-        ":parse_flags_from_env",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-    ],
-)
-
-cc_library(
-    name = "convolution_thunk_flags",
-    srcs = ["convolution_thunk_flags.cc"],
-    hdrs = ["convolution_thunk_flags.h"],
-    deps = [
-        ":parse_flags_from_env",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-    ],
-)
-
-cc_library(
-    name = "gpu_compiler_flags",
-    srcs = ["gpu_compiler_flags.cc"],
-    hdrs = ["gpu_compiler_flags.h"],
-    deps = [
-        ":parse_flags_from_env",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-    ],
-)
-
-cc_library(
-    name = "gpu_backend_lib_flags",
-    srcs = ["gpu_backend_lib_flags.cc"],
-    hdrs = ["gpu_backend_lib_flags.h"],
-    deps = [
-        ":parse_flags_from_env",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-    ],
-)
-
-cc_library(
-    name = "stream_assignment_flags",
-    srcs = ["stream_assignment_flags.cc"],
-    hdrs = ["stream_assignment_flags.h"],
-    deps = [
-        ":parse_flags_from_env",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-    ],
-)
-
-cc_library(
-    name = "hlo_graph_dumper_flags",
-    srcs = ["hlo_graph_dumper_flags.cc"],
-    hdrs = ["hlo_graph_dumper_flags.h"],
-    deps = [
-        ":parse_flags_from_env",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-    ],
-)
-
-cc_library(
-    name = "alias_analysis_flags",
-    srcs = ["alias_analysis_flags.cc"],
-    hdrs = ["alias_analysis_flags.h"],
-    deps = [
-        ":parse_flags_from_env",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-    ],
-)
-
-cc_library(
-    name = "llvm_util_flags",
-    srcs = ["llvm_util_flags.cc"],
-    hdrs = ["llvm_util_flags.h"],
-    deps = [
-        ":parse_flags_from_env",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-    ],
-)
-
-cc_library(
-    name = "service_flags",
-    srcs = ["service_flags.cc"],
-    hdrs = ["service_flags.h"],
-    deps = [
-        ":parse_flags_from_env",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-    ],
-)
-
-cc_library(
-    name = "buffer_assignment_flags",
-    srcs = ["buffer_assignment_flags.cc"],
-    hdrs = ["buffer_assignment_flags.h"],
-    deps = [
-        ":parse_flags_from_env",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-    ],
-)
-
-cc_library(
-    name = "hlo_test_base_flags",
-    srcs = ["hlo_test_base_flags.cc"],
-    hdrs = ["hlo_test_base_flags.h"],
-    deps = [
-        ":parse_flags_from_env",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-    ],
-)
-
-cc_library(
-    name = "backend_flags",
-    srcs = ["backend_flags.cc"],
-    hdrs = ["backend_flags.h"],
-    deps = [
-        ":parse_flags_from_env",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-    ],
-)
-
-cc_library(
-    name = "user_computation_flags",
-    srcs = ["user_computation_flags.cc"],
-    hdrs = ["user_computation_flags.h"],
-    deps = [
-        ":parse_flags_from_env",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-    ],
-)
-
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/legacy_flags/alias_analysis_flags.cc b/tensorflow/compiler/xla/legacy_flags/alias_analysis_flags.cc
deleted file mode 100644
index 474753c10ad7ed5eb4a9a446c3f877280c5ad302..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/alias_analysis_flags.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for XLA's alias_analysis module.
-
-#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/alias_analysis_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static AliasAnalysisFlags* flags;
-static std::vector<tensorflow::Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new AliasAnalysisFlags;
-  flags->xla_emit_alias_scope = true;
-  flag_list = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag("xla_emit_alias_scope", &flags->xla_emit_alias_scope,
-                       "Use buffer analysis to refine alias-analysis."),
-  });
-  ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with XLA's alias_analysis
-// module.
-void AppendAliasAnalysisFlags(std::vector<tensorflow::Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the AliasAnalysisFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-AliasAnalysisFlags* GetAliasAnalysisFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/alias_analysis_flags.h b/tensorflow/compiler/xla/legacy_flags/alias_analysis_flags.h
deleted file mode 100644
index 369f8cd7caa6f42273cd405ca5f43d325e457128..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/alias_analysis_flags.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_ALIAS_ANALYSIS_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_ALIAS_ANALYSIS_FLAGS_H_
-
-// Legacy flags for XLA's alias_analysis module.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with XLA's alias_analysis
-// module.
-void AppendAliasAnalysisFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with XLA's alias_analysis module.
-typedef struct {
-  bool xla_emit_alias_scope;  // Use buffer analysis to refine alias-analysis.
-} AliasAnalysisFlags;
-
-// Return a pointer to the AliasAnalysisFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-AliasAnalysisFlags* GetAliasAnalysisFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_ALIAS_ANALYSIS_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/backend_flags.cc b/tensorflow/compiler/xla/legacy_flags/backend_flags.cc
deleted file mode 100644
index 7c007f4435c088b35bffce40372f88f37af6ed5b..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/backend_flags.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for XLA's backend module.
-
-#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/backend_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static BackendFlags* flags;
-static std::vector<tensorflow::Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new BackendFlags;
-  // TODO(b/32648682): Decide if this should continue to be a flag longer term.
-  flags->xla_replicas = 1;
-  flag_list = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag(
-          "xla_replicas", &flags->xla_replicas,
-          "The number of replicas to use. 1 means no replication."),
-  });
-  ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with XLA's backend module.
-void AppendBackendFlags(std::vector<tensorflow::Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the BackendFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-BackendFlags* GetBackendFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/backend_flags.h b/tensorflow/compiler/xla/legacy_flags/backend_flags.h
deleted file mode 100644
index 061238b7e690257f4eb681558dcd59b1f8ba2653..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/backend_flags.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_BACKEND_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_BACKEND_FLAGS_H_
-
-// Legacy flags for XLA's backend module.
-
-#include <vector>
-
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with XLA's backend module.
-void AppendBackendFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with XLA's backend module.
-typedef struct {
-  int64 xla_replicas;  // The number of replicas to use.  1 means no
-                       // replication.
-} BackendFlags;
-
-// Return a pointer to the BackendFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-BackendFlags* GetBackendFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_BACKEND_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/buffer_assignment_flags.cc b/tensorflow/compiler/xla/legacy_flags/buffer_assignment_flags.cc
deleted file mode 100644
index 71873f73afd5bb8c59832a4c82f87f4e51c31180..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/buffer_assignment_flags.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for XLA's buffer_assignment module.
-
-#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/buffer_assignment_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static BufferAssignmentFlags* flags;
-static std::vector<tensorflow::Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new BufferAssignmentFlags;
-  flags->xla_enable_buffer_reuse = true;
-  flag_list = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag("xla_enable_buffer_reuse",
-                       &flags->xla_enable_buffer_reuse,
-                       "Enable reuse of buffers."),
-  });
-  ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with XLA's buffer_assignment
-// module.
-void AppendBufferAssignmentFlags(std::vector<tensorflow::Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the BufferAssignmentFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-BufferAssignmentFlags* GetBufferAssignmentFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/buffer_assignment_flags.h b/tensorflow/compiler/xla/legacy_flags/buffer_assignment_flags.h
deleted file mode 100644
index 5f098c2663f638940aead45b74332edcf3fcc37f..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/buffer_assignment_flags.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_BUFFER_ASSIGNMENT_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_BUFFER_ASSIGNMENT_FLAGS_H_
-
-// Legacy flags for XLA's buffer_assignment module.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with XLA's buffer_assignment
-// module.
-void AppendBufferAssignmentFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with XLA's buffer_assignment module.
-typedef struct {
-  bool xla_enable_buffer_reuse;  // Enable reuse of buffers.
-} BufferAssignmentFlags;
-
-// Return a pointer to the BufferAssignmentFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-BufferAssignmentFlags* GetBufferAssignmentFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_BUFFER_ASSIGNMENT_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/compiler_functor_flags.cc b/tensorflow/compiler/xla/legacy_flags/compiler_functor_flags.cc
deleted file mode 100644
index 617a9b712ed99d343dc28b6e6c0de4b54e271096..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/compiler_functor_flags.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for XLA's compiler_functor module.
-
-#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/compiler_functor_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static CompilerFunctorFlags* flags;
-static std::vector<tensorflow::Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new CompilerFunctorFlags;
-  flag_list = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag("xla_debug_cpu_dump_ir", &flags->xla_debug_cpu_dump_ir,
-                       "Dump IR, before optimizations to a path"),
-  });
-  ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with XLA's compiler_functor
-// module.
-void AppendCompilerFunctorFlags(std::vector<tensorflow::Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the CompilerFunctorFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-CompilerFunctorFlags* GetCompilerFunctorFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/compiler_functor_flags.h b/tensorflow/compiler/xla/legacy_flags/compiler_functor_flags.h
deleted file mode 100644
index 28b505ec5eac2d74879a22779137c6982a7c9ce8..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/compiler_functor_flags.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_COMPILER_FUNCTOR_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_COMPILER_FUNCTOR_FLAGS_H_
-
-// Legacy flags for the XLA's compiler_functor module.
-
-#include <vector>
-
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with XLA's compiler_functor
-// module.
-void AppendCompilerFunctorFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with XLA's compiler_functor module.
-typedef struct {
-  string xla_debug_cpu_dump_ir;  // Dump IR, before optimizations to a path
-} CompilerFunctorFlags;
-
-// Return a pointer to the CompilerFunctorFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-CompilerFunctorFlags* GetCompilerFunctorFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_COMPILER_FUNCTOR_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/convolution_thunk_flags.cc b/tensorflow/compiler/xla/legacy_flags/convolution_thunk_flags.cc
deleted file mode 100644
index fe5d19147f09557817fee5c670f52058f21f5cdc..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/convolution_thunk_flags.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for XLA's convolution_thunk module.
-
-#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/convolution_thunk_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static ConvolutionThunkFlags* flags;
-static std::vector<tensorflow::Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new ConvolutionThunkFlags;
-  flags->xla_gpu_autotune_convolution_algorithm = true;
-  flag_list = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag("xla_gpu_autotune_convolution_algorithm",
-                       &flags->xla_gpu_autotune_convolution_algorithm,
-                       "Auto-tune the algorithm used by convolution"),
-  });
-  ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with XLA's convolution_thunk
-// module.
-void AppendConvolutionThunkFlags(std::vector<tensorflow::Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the ConvolutionThunkFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-ConvolutionThunkFlags* GetConvolutionThunkFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/convolution_thunk_flags.h b/tensorflow/compiler/xla/legacy_flags/convolution_thunk_flags.h
deleted file mode 100644
index 53d6806a71902d1227728f74bd45f12f9d11421d..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/convolution_thunk_flags.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_CONVOLUTION_THUNK_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_CONVOLUTION_THUNK_FLAGS_H_
-
-// Legacy flags for XLA's convolution_thunk module.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with XLA's convolution_thunk
-// module.
-void AppendConvolutionThunkFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with XLA's convolution_thunk module.
-typedef struct {
-  // Auto-tune the algorithm used by convolution
-  bool xla_gpu_autotune_convolution_algorithm;
-} ConvolutionThunkFlags;
-
-// Return a pointer to the ConvolutionThunkFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-ConvolutionThunkFlags* GetConvolutionThunkFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_CONVOLUTION_THUNK_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.cc b/tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.cc
deleted file mode 100644
index 13d41a8636b6ba3aa88545523e93dffe4b0c12f5..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for XLA's cpu_compiler module.
-
-#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static CpuCompilerFlags* flags;
-static std::vector<tensorflow::Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new CpuCompilerFlags;
-  flags->xla_cpu_embed_ir = false;
-  flags->xla_cpu_dump_debug_json_to = "";
-  flag_list = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag(
-          "xla_cpu_embed_ir", &flags->xla_cpu_embed_ir,
-          "Embed the LLVM IR module string in the resultant CpuExecutable."),
-      tensorflow::Flag("xla_cpu_dump_debug_json_to",
-                       &flags->xla_cpu_dump_debug_json_to,
-                       "Dump debug JSON to this directory."),
-  });
-  ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with XLA's cpu_compiler
-// module.
-void AppendCpuCompilerFlags(std::vector<tensorflow::Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the CpuCompilerFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-CpuCompilerFlags* GetCpuCompilerFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h b/tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h
deleted file mode 100644
index bac498e18eb241d3b3044f14c88ac2b3aaaa322f..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_CPU_COMPILER_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_CPU_COMPILER_FLAGS_H_
-
-// Legacy flags for the XLA's cpu_compiler module.
-
-#include <vector>
-
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with XLA's cpu_compiler
-// module.
-void AppendCpuCompilerFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with XLA's cpu_compiler module.
-typedef struct {
-  bool xla_cpu_embed_ir;  // Embed the LLVM IR module string in the resultant
-                          // CpuExecutable
-  string xla_cpu_dump_debug_json_to;  // Dump debug JSON to this directory.
-} CpuCompilerFlags;
-
-// Return a pointer to the CpuCompilerFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-CpuCompilerFlags* GetCpuCompilerFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_CPU_COMPILER_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/cpu_runtime_flags.cc b/tensorflow/compiler/xla/legacy_flags/cpu_runtime_flags.cc
deleted file mode 100644
index d7817c5d54a047b1987a19dfbde9f48081ae6413..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/cpu_runtime_flags.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for XLA's cpu_runtime module.
-
-#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/cpu_runtime_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static CpuRuntimeFlags* flags;
-static std::vector<tensorflow::Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new CpuRuntimeFlags;
-  flags->xla_cpu_use_eigen = true;
-  flags->xla_cpu_multi_thread_eigen = true;
-  flag_list = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag(
-          "xla_cpu_use_eigen", &flags->xla_cpu_use_eigen,
-          "Use Eigen for matrix multiply on the CPU platform. This "
-          "is a useful hack for performance comparisons against "
-          "XLA's implementation."),
-      tensorflow::Flag(
-          "xla_cpu_multi_thread_eigen", &flags->xla_cpu_multi_thread_eigen,
-          "When generating calls to Eigen for matmul and conv, should "
-          "single or multi-threaded eigen be used? "
-          "Only used when --xla_cpu_use_eigen is true."),
-  });
-  ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with XLA's cpu_runtime
-// module.
-void AppendCpuRuntimeFlags(std::vector<tensorflow::Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the CpuRuntimeFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-CpuRuntimeFlags* GetCpuRuntimeFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/cpu_runtime_flags.h b/tensorflow/compiler/xla/legacy_flags/cpu_runtime_flags.h
deleted file mode 100644
index e3ff30da36a5fabd7d7798fd636cb3955a91b09f..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/cpu_runtime_flags.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_CPU_RUNTIME_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_CPU_RUNTIME_FLAGS_H_
-
-// Legacy flags for the XLA's cpu_runtime module.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with XLA's cpu_runtime
-// module.
-void AppendCpuRuntimeFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with XLA's cpu_runtime module.
-typedef struct {
-  // Use Eigen for matrix multiply on the CPU platform. This is a useful hack
-  // for performance comparisons against XLA's implementation.
-  bool xla_cpu_use_eigen;
-  // When generating calls to Eigen for matmul and conv, should single or
-  // multi-threaded eigen be used?  Only used when --xla_cpu_use_eigen is true.
-  bool xla_cpu_multi_thread_eigen;
-} CpuRuntimeFlags;
-
-// Return a pointer to the CpuRuntimeFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-CpuRuntimeFlags* GetCpuRuntimeFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_CPU_RUNTIME_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
index 5e3c4f912bf6073e89a66633c44a7e052ca43ade..87c6215e6badc9f7e4c99f78fb23c8d621b9dbd2 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
@@ -23,56 +23,233 @@ limitations under the License.
 namespace xla {
 namespace legacy_flags {
 
-struct DebugOptionsFlags {
-  string xla_generate_hlo_graph;
-  string xla_disable_hlo_passes;
-  bool xla_enable_fast_math;
-  int32 xla_backend_optimization_level;
-  string xla_backend_extra_options;
-};
-
 namespace {
 
-DebugOptionsFlags* flag_values;
+DebugOptions* flag_values;
 std::vector<tensorflow::Flag>* flag_objects;
 std::once_flag flags_init;
 
+namespace {
+void SetDebugOptionsDefaults(DebugOptions* flags) {
+  flags->set_xla_hlo_graph_path("/tmp/");
+  flags->set_xla_enable_fast_math(true);
+  flags->set_xla_llvm_enable_alias_scope_metadata(true);
+  flags->set_xla_llvm_enable_noalias_metadata(true);
+  flags->set_xla_llvm_enable_invariant_load_metadata(true);
+  flags->set_xla_backend_optimization_level(3);
+  flags->set_xla_cpu_multi_thread_eigen(true);
+  flags->set_xla_gpu_cuda_data_dir("./cuda_sdk_lib");
+}
+}  // namespace
+
 // Allocates flag_values and flag_objects; this function must not be called more
 // than once - its call done via call_once.
 void AllocateFlags() {
-  flag_values = new DebugOptionsFlags;
-  flag_values->xla_generate_hlo_graph = "";
-  flag_values->xla_disable_hlo_passes = "";
-  flag_values->xla_enable_fast_math = true;
-  flag_values->xla_backend_optimization_level = 2;
-  flag_values->xla_backend_extra_options = "";
+  flag_values = new DebugOptions;
+
+  SetDebugOptionsDefaults(flag_values);
+
+  // Returns a lambda that calls "member_setter" on "flag_values" with the
+  // argument passed in to the lambda.
+  auto bool_setter_for = [](void (DebugOptions::*member_setter)(bool)) {
+    return [member_setter](bool value) {
+      (flag_values->*member_setter)(value);
+      return true;
+    };
+  };
+
+  // Returns a lambda that calls "member_setter" on "flag_values" with the
+  // argument passed in to the lambda.
+  auto int32_setter_for = [](void (DebugOptions::*member_setter)(int32)) {
+    return [member_setter](int32 value) {
+      (flag_values->*member_setter)(value);
+      return true;
+    };
+  };
+
+  // Returns a lambda that is a custom "sub-parser" for xla_disable_hlo_passes.
+  auto setter_for_xla_disable_hlo_passes = [](string comma_separated_values) {
+    std::vector<string> disabled_passes =
+        tensorflow::str_util::Split(comma_separated_values, ',');
+    for (const auto& passname : disabled_passes) {
+      flag_values->add_xla_disable_hlo_passes(passname);
+    }
+    return true;
+  };
+
+  // Returns a lambda that is a custom "sub-parser" for
+  // xla_backend_extra_options.
+  auto setter_for_xla_backend_extra_options =
+      [](string comma_separated_values) {
+        std::vector<string> extra_options_parts =
+            tensorflow::str_util::Split(comma_separated_values, ',');
+        auto* extra_options_map =
+            flag_values->mutable_xla_backend_extra_options();
+
+        // The flag contains a comma-separated list of options; some options
+        // have arguments following "=", some don't.
+        for (const auto& part : extra_options_parts) {
+          size_t eq_pos = part.find_first_of('=');
+          if (eq_pos == string::npos) {
+            (*extra_options_map)[part] = "";
+          } else {
+            string value = "";
+            if (eq_pos + 1 < part.size()) {
+              value = part.substr(eq_pos + 1);
+            }
+            (*extra_options_map)[part.substr(0, eq_pos)] = value;
+          }
+        }
+
+        return true;
+      };
 
   flag_objects = new std::vector<tensorflow::Flag>(
       {tensorflow::Flag(
-           "xla_generate_hlo_graph", &flag_values->xla_generate_hlo_graph,
+           "xla_generate_hlo_graph",
+           flag_values->mutable_xla_generate_hlo_graph(),
            "HLO modules matching this regex will be dumped to a .dot file "
            "throughout various stages in compilation."),
-
        tensorflow::Flag(
-           "xla_enable_fast_math", &flag_values->xla_enable_fast_math,
+           "xla_hlo_graph_addresses",
+           bool_setter_for(&DebugOptions::set_xla_hlo_graph_addresses),
+           flag_values->xla_hlo_graph_addresses(),
+           "With xla_generate_hlo_graph, show addresses of HLO ops in "
+           "graph dump."),
+       tensorflow::Flag(
+           "xla_hlo_graph_layout",
+           bool_setter_for(&DebugOptions::set_xla_hlo_graph_layout),
+           flag_values->xla_hlo_graph_layout(),
+           "With xla_generate_hlo_graph, show layout of HLO ops in "
+           "graph dump."),
+       tensorflow::Flag(
+           "xla_hlo_graph_path", flag_values->mutable_xla_hlo_graph_path(),
+           "With xla_generate_hlo_graph, dump the graphs into this path."),
+       tensorflow::Flag(
+           "xla_hlo_dump_as_graphdef",
+           bool_setter_for(&DebugOptions::set_xla_hlo_dump_as_graphdef),
+           flag_values->xla_hlo_dump_as_graphdef(),
+           "Dump HLO graphs as TensorFlow GraphDefs."),
+       tensorflow::Flag(
+           "xla_log_hlo_text", flag_values->mutable_xla_log_hlo_text(),
+           "HLO modules matching this regex will be dumped to LOG(INFO). "),
+       tensorflow::Flag(
+           "xla_generate_hlo_text_to",
+           flag_values->mutable_xla_generate_hlo_text_to(),
+           "Dump all HLO modules as text into the provided directory path."),
+       tensorflow::Flag(
+           "xla_enable_fast_math",
+           bool_setter_for(&DebugOptions::set_xla_enable_fast_math),
+           flag_values->xla_enable_fast_math(),
            "Enable unsafe fast-math optimizations in the compiler; "
            "this may produce faster code at the expense of some accuracy."),
+       tensorflow::Flag(
+           "xla_llvm_enable_alias_scope_metadata",
+           bool_setter_for(
+               &DebugOptions::set_xla_llvm_enable_alias_scope_metadata),
+           flag_values->xla_llvm_enable_alias_scope_metadata(),
+           "In LLVM-based backends, enable the emission of "
+           "!alias.scope metadata in the generated IR."),
+       tensorflow::Flag(
+           "xla_llvm_enable_noalias_metadata",
+           bool_setter_for(&DebugOptions::set_xla_llvm_enable_noalias_metadata),
+           flag_values->xla_llvm_enable_noalias_metadata(),
+           "In LLVM-based backends, enable the emission of "
+           "!noalias metadata in the generated IR."),
+       tensorflow::Flag(
+           "xla_llvm_enable_invariant_load_metadata",
+           bool_setter_for(
+               &DebugOptions::set_xla_llvm_enable_invariant_load_metadata),
+           flag_values->xla_llvm_enable_invariant_load_metadata(),
+           "In LLVM-based backends, enable the emission of "
+           "!invariant.load metadata in "
+           "the generated IR."),
        tensorflow::Flag(
            "xla_backend_optimization_level",
-           &flag_values->xla_backend_optimization_level,
+           int32_setter_for(&DebugOptions::set_xla_backend_optimization_level),
+           flag_values->xla_backend_optimization_level(),
            "Numerical optimization level for the XLA compiler backend."),
-
+       tensorflow::Flag(
+           "xla_disable_hlo_passes", setter_for_xla_disable_hlo_passes, "",
+           "Comma-separated list of hlo passes to be disabled. These names "
+           "must exactly match the passes' names; no whitespace around "
+           "commas."),
+       tensorflow::Flag(
+           "xla_embed_ir_in_executable",
+           bool_setter_for(&DebugOptions::set_xla_embed_ir_in_executable),
+           flag_values->xla_embed_ir_in_executable(),
+           "Embed the compiler IR as a string in the executable."),
+       tensorflow::Flag(
+           "xla_dump_ir_to", flag_values->mutable_xla_dump_ir_to(),
+           "Dump the compiler IR into this directory as individual files."),
+       tensorflow::Flag(
+           "xla_eliminate_hlo_implicit_broadcast",
+           bool_setter_for(
+               &DebugOptions::set_xla_eliminate_hlo_implicit_broadcast),
+           flag_values->xla_eliminate_hlo_implicit_broadcast(),
+           "Eliminate implicit broadcasts when lowering user "
+           "computations to HLO instructions; use explicit "
+           "broadcast instead."),
+       tensorflow::Flag(
+           "xla_cpu_multi_thread_eigen",
+           bool_setter_for(&DebugOptions::set_xla_cpu_multi_thread_eigen),
+           flag_values->xla_cpu_multi_thread_eigen(),
+           "When generating calls to Eigen in the CPU backend, "
+           "use multi-threaded Eigen mode."),
+       tensorflow::Flag("xla_gpu_cuda_data_dir",
+                        flag_values->mutable_xla_gpu_cuda_data_dir(),
+                        "If non-empty, speficies a local directory containing "
+                        "ptxas and nvvm libdevice files; otherwise we use "
+                        "those from runfile directories."),
+       tensorflow::Flag("xla_gpu_ftz",
+                        bool_setter_for(&DebugOptions::set_xla_gpu_ftz),
+                        flag_values->xla_gpu_ftz(),
+                        "If true, flush-to-zero semantics are enabled in the "
+                        "code generated for GPUs."),
+       tensorflow::Flag(
+           "xla_gpu_disable_multi_streaming",
+           bool_setter_for(&DebugOptions::set_xla_gpu_disable_multi_streaming),
+           flag_values->xla_gpu_disable_multi_streaming(),
+           "If true, multi-streaming in the GPU backend is disabled."),
+       tensorflow::Flag(
+           "xla_dump_debug_json_to",
+           flag_values->mutable_xla_dump_debug_json_to(),
+           "Dump compilation artifacts as JSON into this directory."),
+       tensorflow::Flag(
+           "xla_test_all_output_layouts",
+           bool_setter_for(&DebugOptions::set_xla_test_all_output_layouts),
+           flag_values->xla_test_all_output_layouts(),
+           "Let ClientLibraryTestBase::ComputeAndCompare* test "
+           "all permutations of output layouts. For example, with "
+           "a 3D shape, all permutations of the set {0, 1, 2} are "
+           "tried."),
+       tensorflow::Flag(
+           "xla_test_all_input_layouts",
+           bool_setter_for(&DebugOptions::set_xla_test_all_input_layouts),
+           flag_values->xla_test_all_input_layouts(),
+           "Let ClientLibraryTestBase::ComputeAndCompare* test "
+           "all permutations of *input* layouts. For example, for "
+           "2 input arguments with 2D shape and 4D shape, the "
+           "computation will run 2! * 4! times for every possible "
+           "layouts"),
+       tensorflow::Flag(
+           "xla_hlo_profile",
+           bool_setter_for(&DebugOptions::set_xla_hlo_profile),
+           flag_values->xla_hlo_profile(),
+           "Instrument the computation to collect per-HLO cycle counts"),
+       tensorflow::Flag("xla_dump_computations_to",
+                        flag_values->mutable_xla_dump_computations_to(),
+                        "Dump computations that XLA executes into the provided "
+                        "directory path"),
+       tensorflow::Flag("xla_dump_executions_to",
+                        flag_values->mutable_xla_dump_executions_to(),
+                        "Dump parameters and results of computations that XLA "
+                        "executes into the provided directory path"),
        tensorflow::Flag("xla_backend_extra_options",
-                        &flag_values->xla_backend_extra_options,
+                        setter_for_xla_backend_extra_options, "",
                         "Extra options to pass to a backend; "
                         "comma-separated list of 'key=val' strings (=val "
-                        "may be omitted); no whitespace around commas."),
-
-       tensorflow::Flag(
-           "xla_disable_hlo_passes", &flag_values->xla_disable_hlo_passes,
-           "Comma-separated list of HLO passes to be disabled. These names "
-           "must exactly match the passes' names; "
-           "no whitespace around commas.")});
+                        "may be omitted); no whitespace around commas.")});
   ParseFlagsFromEnv(*flag_objects);
 }
 
@@ -86,40 +263,7 @@ void AppendDebugOptionsFlags(std::vector<tensorflow::Flag>* flag_list) {
 
 xla::DebugOptions GetDebugOptionsFromFlags() {
   std::call_once(flags_init, &AllocateFlags);
-
-  DebugOptions options;
-  options.set_xla_generate_hlo_graph(flag_values->xla_generate_hlo_graph);
-
-  std::vector<string> disabled_passes =
-      tensorflow::str_util::Split(flag_values->xla_disable_hlo_passes, ',');
-  for (const auto& passname : disabled_passes) {
-    options.add_xla_disable_hlo_passes(passname);
-  }
-
-  options.set_xla_enable_fast_math(flag_values->xla_enable_fast_math);
-  options.set_xla_backend_optimization_level(
-      flag_values->xla_backend_optimization_level);
-
-  std::vector<string> extra_options_parts =
-      tensorflow::str_util::Split(flag_values->xla_backend_extra_options, ',');
-  auto* extra_options_map = options.mutable_xla_backend_extra_options();
-
-  // The flag contains a comma-separated list of options; some options have
-  // arguments following "=", some don't.
-  for (const auto& part : extra_options_parts) {
-    size_t eq_pos = part.find_first_of('=');
-    if (eq_pos == string::npos) {
-      (*extra_options_map)[part] = "";
-    } else {
-      string value = "";
-      if (eq_pos + 1 < part.size()) {
-        value = part.substr(eq_pos + 1);
-      }
-      (*extra_options_map)[part.substr(0, eq_pos)] = value;
-    }
-  }
-
-  return options;
+  return *flag_values;
 }
 
 }  // namespace legacy_flags
diff --git a/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.cc b/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.cc
deleted file mode 100644
index f8f6ea26b1d0df67b934616fe60aa29199fc2eb9..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for XLA's gpu_backend_lib module.
-
-#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static GpuBackendLibFlags* flags;
-static std::vector<tensorflow::Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new GpuBackendLibFlags;
-  flags->dump_temp_products_to = "";
-  flags->ftz = false;
-  flags->fma = true;
-  flags->verbose_ptx_asm = false;
-  flags->kernel = "";
-  flags->llvm_dump_passes = false;
-  flags->llvm_cl_opts = "";
-  flags->dump_ir_before_passes = false;
-  flags->opt_level = 3;
-  flag_list = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag("dump_temp_products_to", &flags->dump_temp_products_to,
-                       "dump temporary compilation products to this directory. "
-                       "If empty, no dump is produced"),
-      tensorflow::Flag("ftz", &flags->ftz, "flush to zero semantics"),
-      tensorflow::Flag("fma", &flags->fma, "use FMA synthesis"),
-      tensorflow::Flag("verbose_ptx_asm", &flags->verbose_ptx_asm,
-                       "emit PTX assembly with extra comments"),
-      tensorflow::Flag("kernel", &flags->kernel,
-                       "only emit the IR and PTX for this kernel"),
-      tensorflow::Flag("llvm_dump_passes", &flags->llvm_dump_passes,
-                       "dump the passes LLVM runs to stderr"),
-      tensorflow::Flag(
-          "llvm_cl_opts", &flags->llvm_cl_opts,
-          "comma-separated list of command line options to pass to "
-          "LLVM.  For example, --llvm_cl_opts=--print-before=loop-unroll"),
-      tensorflow::Flag("dump_ir_before_passes", &flags->dump_ir_before_passes,
-                       "dump the IR before each optimization pass in "
-                       "sequentially-named files."),
-      tensorflow::Flag("opt_level", &flags->opt_level,
-                       "optimization level (default to 3)"),
-  });
-  ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with XLA's gpu_backend_lib
-// module.
-void AppendGpuBackendLibFlags(std::vector<tensorflow::Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the GpuBackendLibFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-GpuBackendLibFlags* GetGpuBackendLibFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.h b/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.h
deleted file mode 100644
index 31cb50e9da986b5bad3e71439a4976ec84e17be7..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_GPU_BACKEND_LIB_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_GPU_BACKEND_LIB_FLAGS_H_
-
-// Legacy flags for XLA's gpu_backend_lib module.
-
-#include <vector>
-
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with XLA's gpu_backend_lib
-// module.
-void AppendGpuBackendLibFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with XLA's gpu_backend_lib module.
-typedef struct {
-  string dump_temp_products_to;  // temporary compilation products dir
-  bool ftz;                      // flush to zero semantics
-  bool fma;                      // use FMA synthesis
-  bool verbose_ptx_asm;          // emit PTX assembly with extra comments
-  string kernel;                 // only emit the IR and PTX for this kernel
-  bool llvm_dump_passes;         // dump the passes LLVM runs to stderr
-  string llvm_cl_opts;           // comma-separated list of LLVM options
-  bool dump_ir_before_passes;    // dump IR before each pass
-  int32 opt_level;               // optimization level
-} GpuBackendLibFlags;
-
-// Return a pointer to the GpuBackendLibFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-GpuBackendLibFlags* GetGpuBackendLibFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_GPU_BACKEND_LIB_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.cc b/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.cc
deleted file mode 100644
index 131e3ce70ac9e7fc2f6f233ffd93e8757d0bc725..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for XLA's gpu_compiler module.
-
-#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static GpuCompilerFlags* flags;
-static std::vector<tensorflow::Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new GpuCompilerFlags;
-  flags->xla_gpu_embed_ir = false;
-  flags->xla_cuda_data_dir = "./cuda_sdk_lib";
-  flags->xla_gpu_dump_debug_json_to = "";
-  flag_list = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag(
-          "xla_gpu_embed_ir", &flags->xla_gpu_embed_ir,
-          "Embed the LLVM IR module string in the resultant GpuExecutable."),
-      tensorflow::Flag(
-          "xla_cuda_data_dir", &flags->xla_cuda_data_dir,
-          "If non-empty, specifies a local directory containing ptxas and "
-          "nvvm libdevice files. Otherwise, by default, we use those from "
-          "runfile directories."),
-      tensorflow::Flag("xla_ptxas_path", &flags->xla_ptxas_path,
-                       "The path to ptxas. Required to log stats of the ptx."),
-      tensorflow::Flag("xla_gpu_dump_debug_json_to",
-                       &flags->xla_gpu_dump_debug_json_to,
-                       "Dump debug JSON to this directory."),
-  });
-  ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with XLA's gpu_compiler
-// module.
-void AppendGpuCompilerFlags(std::vector<tensorflow::Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the GpuCompilerFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-GpuCompilerFlags* GetGpuCompilerFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.h b/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.h
deleted file mode 100644
index 0cf39e0ab35e663c7abc14980daa8b92d15489d6..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_GPU_COMPILER_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_GPU_COMPILER_FLAGS_H_
-
-// Legacy flags for XLA's gpu_compiler module.
-
-#include <vector>
-
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with XLA's gpu_compiler
-// module.
-void AppendGpuCompilerFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with XLA's gpu_compiler module.
-typedef struct {
-  bool xla_gpu_embed_ir;     // Embed the LLVM IR module string in the resultant
-                             // GpuExecutable.
-  string xla_cuda_data_dir;  // If non-empty, specifies a local directory
-                             // containing ptxas and nvvm libdevice files.
-                             // Otherwise, by default, we use those from runfile
-                             // directories.
-  string xla_ptxas_path;     // The path to ptxas.  Required to log stats of
-                             // the ptx.
-  string xla_gpu_dump_debug_json_to;  // Dump debug JSON to this directory.
-} GpuCompilerFlags;
-
-// Return a pointer to the GpuCompilerFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-GpuCompilerFlags* GetGpuCompilerFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_GPU_COMPILER_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.cc b/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.cc
deleted file mode 100644
index ba43a5919522ff783f450481c629d64613e1f8ab..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for XLA's hlo_graph_dumper module.
-
-#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static HloGraphDumperFlags* flags;
-static std::vector<tensorflow::Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new HloGraphDumperFlags;
-  flags->xla_hlo_dump_graph_path = "/tmp/";
-  flags->xla_hlo_dump_as_graphdef = false;
-  flag_list = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag("xla_hlo_dump_graph_path",
-                       &flags->xla_hlo_dump_graph_path,
-                       "Path to write dumped HLO graphs to"),
-      tensorflow::Flag("xla_hlo_dump_as_graphdef",
-                       &flags->xla_hlo_dump_as_graphdef,
-                       "Dumps HLO graphs as tensorflow GraphDefs"),
-  });
-  ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with XLA's hlo_graph_dumper
-// module.
-void AppendHloGraphDumperFlags(std::vector<tensorflow::Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the HloGraphDumperFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-HloGraphDumperFlags* GetHloGraphDumperFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h b/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h
deleted file mode 100644
index d0b4d092ff1003bc1df90c3d878feacf71a5aa21..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_HLO_GRAPH_DUMPER_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_HLO_GRAPH_DUMPER_FLAGS_H_
-
-// Legacy flags for XLA's hlo_graph_dumper module.
-
-#include <vector>
-
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with XLA's hlo_graph_dumper
-// module.
-void AppendHloGraphDumperFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with XLA's hlo_graph_dumper module.
-typedef struct {
-  string xla_hlo_dump_graph_path;  // Path to write dumped HLO graphs to
-  // If set, dumps HLO graphs as tensorflow GraphDef; otherwise, dumps HLO
-  // graphs as DOT graph.
-  bool xla_hlo_dump_as_graphdef;
-} HloGraphDumperFlags;
-
-// Return a pointer to the HloGraphDumperFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-HloGraphDumperFlags* GetHloGraphDumperFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_HLO_GRAPH_DUMPER_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/hlo_test_base_flags.cc b/tensorflow/compiler/xla/legacy_flags/hlo_test_base_flags.cc
deleted file mode 100644
index c7893c138596b034dbb83df9fda2d4c5edd8e32b..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/hlo_test_base_flags.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for XLA's hlo_test_base module.
-
-#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/hlo_test_base_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static HloTestBaseFlags* flags;
-static std::vector<tensorflow::Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new HloTestBaseFlags;
-  flags->xla_hlo_test_generate_hlo_graph = false;
-  flag_list = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag("xla_hlo_test_generate_hlo_graph",
-                       &flags->xla_hlo_test_generate_hlo_graph,
-                       "Generate graph output of HLO instructions"),
-  });
-  ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with XLA's hlo_test_base
-// module.
-void AppendHloTestBaseFlags(std::vector<tensorflow::Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the HloTestBaseFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-HloTestBaseFlags* GetHloTestBaseFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/hlo_test_base_flags.h b/tensorflow/compiler/xla/legacy_flags/hlo_test_base_flags.h
deleted file mode 100644
index 23b808cecb7e5eaf480292f5207a4b87ebd4a2d5..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/hlo_test_base_flags.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_HLO_TEST_BASE_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_HLO_TEST_BASE_FLAGS_H_
-
-// Legacy flags for XLA's hlo_test_base module.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with XLA's hlo_test_base
-// module.
-void AppendHloTestBaseFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with XLA's hlo_test_base module.
-typedef struct {
-  bool xla_hlo_test_generate_hlo_graph;  // Generate graph output of HLO
-                                         // instructions
-} HloTestBaseFlags;
-
-// Return a pointer to the HloTestBaseFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-HloTestBaseFlags* GetHloTestBaseFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_HLO_TEST_BASE_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/layout_util_flags.cc b/tensorflow/compiler/xla/legacy_flags/layout_util_flags.cc
deleted file mode 100644
index f838861898ddd08b56a13f9b8f722f3c1e4da5eb..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/layout_util_flags.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for XLA's layout_util module.
-
-#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/layout_util_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/strings/numbers.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Pointers to the string value of the xla_default_layout flag and the flag
-// descriptor, initialized via raw_flags_init.
-static string* raw_flag;
-static std::vector<tensorflow::Flag>* flag_list;
-static std::once_flag raw_flags_init;
-
-// Allocate *raw_flag.  Called via call_once(&raw_flags_init,...).
-static void AllocateRawFlag() {
-  raw_flag = new string;
-  flag_list = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag(
-          "xla_default_layout", raw_flag,
-          "Default layout for Shapes in XLA. Valid values are: "
-          "'minor2major', 'major2minor', 'random', 'random:<seed>'. "
-          "For debugging purposes. If no seed (or 0) is given, a seed from "
-          "random_device is used."),
-  });
-  ParseFlagsFromEnv(*flag_list);
-}
-
-// Parse text into *layout.
-static bool ParseDefaultLayout(const string& text, DefaultLayout* layout) {
-  bool result = true;
-  std::vector<string> field = tensorflow::str_util::Split(text, ':');
-  if (!field.empty()) {
-    if (field[0] == "random") {
-      layout->dimension_order = DefaultLayout::DimensionOrder::kRandom;
-      if (field.size() > 1) {
-        uint64 seed = 0;
-        result = tensorflow::strings::safe_strtou64(field[1], &seed);
-        layout->seed = seed;
-      }
-    } else if (field[0] == "minor2major") {
-      layout->dimension_order = DefaultLayout::DimensionOrder::kMinorToMajor;
-    } else if (field[0] == "major2minor") {
-      layout->dimension_order = DefaultLayout::DimensionOrder::kMajorToMinor;
-    } else {
-      result = false;
-    }
-  }
-  return result;
-}
-
-// Pointer to the parsed value of the flags, initialized via flags_init.
-static LayoutUtilFlags* flags;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  std::call_once(raw_flags_init, &AllocateRawFlag);
-  flags = new LayoutUtilFlags;
-  flags->xla_default_layout.dimension_order =
-      DefaultLayout::DimensionOrder::kMajorToMinor;
-  flags->xla_default_layout.seed = 0;
-  if (!ParseDefaultLayout(*raw_flag, &flags->xla_default_layout)) {
-    flags = nullptr;
-  }
-}
-
-// Append to *append_to the flag definitions associated with XLA's layout_util
-// module.
-void AppendLayoutUtilFlags(std::vector<tensorflow::Flag>* append_to) {
-  std::call_once(raw_flags_init, &AllocateRawFlag);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the LayoutUtilFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-LayoutUtilFlags* GetLayoutUtilFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/layout_util_flags.h b/tensorflow/compiler/xla/legacy_flags/layout_util_flags.h
deleted file mode 100644
index 177f428b734dcdf703472f3e240aef9792f988d7..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/layout_util_flags.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_LAYOUT_UTIL_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_LAYOUT_UTIL_FLAGS_H_
-
-// Legacy flags for the XLA's layout_util module.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// The default layout for all newly created shapes. Specified by the flag
-// --xla_default_layout.
-struct DefaultLayout {
-  enum class DimensionOrder {
-    kRandom,
-    kMinorToMajor,
-    kMajorToMinor,
-  };
-
-  DimensionOrder dimension_order;
-  size_t seed;
-};
-
-// Append to *flag_list the flag definitions associated with XLA's layout_util
-// module.
-void AppendLayoutUtilFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with XLA's layout_util module.
-typedef struct {
-  // Default layout for Shapes in XLA.  Valid values are:  'minor2major',
-  // 'major2minor', 'random', 'random:<seed>'.  For debugging purposes.  If no
-  // seed (or 0) is given, a seed from random_device is used.
-  DefaultLayout xla_default_layout;
-} LayoutUtilFlags;
-
-// Return a pointer to the LayoutFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-LayoutUtilFlags* GetLayoutUtilFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_LAYOUT_UTIL_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/llvm_util_flags.cc b/tensorflow/compiler/xla/legacy_flags/llvm_util_flags.cc
deleted file mode 100644
index 3c53729a67049fdac6b358149e06f39858ebd98f..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/llvm_util_flags.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for XLA's llvm_util module.
-
-#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/llvm_util_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static LlvmUtilFlags* flags;
-static std::vector<tensorflow::Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new LlvmUtilFlags;
-  flags->xla_emit_tbaa = true;
-  flag_list = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag("xla_emit_tbaa", &flags->xla_emit_tbaa,
-                       "Perform type-based alias analysis optimizations for "
-                       "LLVM-based backends."),
-  });
-  ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with XLA's llvm_util
-// module.
-void AppendLlvmUtilFlags(std::vector<tensorflow::Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the LlvmUtilFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-LlvmUtilFlags* GetLlvmUtilFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/llvm_util_flags.h b/tensorflow/compiler/xla/legacy_flags/llvm_util_flags.h
deleted file mode 100644
index 98da26b4b806dd83c7baf6bdcf60cbf5297457a6..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/llvm_util_flags.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_LLVM_UTIL_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_LLVM_UTIL_FLAGS_H_
-
-// Legacy flags for XLA's llvm_util module.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with XLA's llvm_util module.
-void AppendLlvmUtilFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with XLA's llvm_util module.
-typedef struct {
-  bool xla_emit_tbaa;  // Perform type-based alias analysis optimizations for
-                       // LLVM-based backends.
-} LlvmUtilFlags;
-
-// Return a pointer to the LlvmUtilFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-LlvmUtilFlags* GetLlvmUtilFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_LLVM_UTIL_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/service_flags.cc b/tensorflow/compiler/xla/legacy_flags/service_flags.cc
deleted file mode 100644
index 41cb8d8bdfc51de1d8fe77906317b4b4a0804802..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/service_flags.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for XLA's service module.
-
-#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/compiler/xla/legacy_flags/service_flags.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static ServiceFlags* flags;
-static std::vector<tensorflow::Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new ServiceFlags;
-  flags->xla_hlo_profile = false;
-  flags->xla_log_hlo_text = "";
-  flags->xla_generate_hlo_graph = "";
-  flags->xla_hlo_graph_addresses = false;
-  flags->xla_hlo_graph_layout = false;
-  flags->xla_hlo_graph_for_compute_constant = false;
-  flags->xla_dump_computations_to = "";
-  flags->xla_dump_hlo_text_to = "";
-  flags->xla_dump_executions_to = "";
-  flag_list = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag(
-          "xla_hlo_profile", &flags->xla_hlo_profile,
-          "Instrument the computation to collect per-HLO cycle counts"),
-      tensorflow::Flag(
-          "xla_log_hlo_text", &flags->xla_log_hlo_text,
-          "If non-empty, print the text format of "
-          "HLO modules whose name partially matches this regex. E.g. "
-          "xla_log_hlo_text=.* will dump the text for every module."),
-      tensorflow::Flag(
-          "xla_generate_hlo_graph", &flags->xla_generate_hlo_graph,
-          "If non-empty, dump graph of HLO modules whose name partially "
-          "matches this regex. E.g. --xla_generate_hlo_graph=.* will dump "
-          "the graph of every module."),
-      tensorflow::Flag("xla_hlo_graph_addresses",
-                       &flags->xla_hlo_graph_addresses,
-                       "Show addresses of HLO ops in graph"),
-      tensorflow::Flag("xla_hlo_graph_layout", &flags->xla_hlo_graph_layout,
-                       "Show layout of HLO ops in graph"),
-      tensorflow::Flag(
-          "xla_hlo_graph_for_compute_constant",
-          &flags->xla_hlo_graph_for_compute_constant,
-          "If true, include hlo dumps of graphs from ComputeConstant."
-          "Such graphs still need to be matched via xla_generate_hlo_graph."),
-      tensorflow::Flag("xla_dump_computations_to",
-                       &flags->xla_dump_computations_to,
-                       "Dumps computations that XLA executes into the provided "
-                       "directory path"),
-      tensorflow::Flag("xla_dump_hlo_text_to", &flags->xla_dump_hlo_text_to,
-                       "Dumps HLO modules that XLA executes into the provided "
-                       "directory path"),
-      tensorflow::Flag("xla_dump_executions_to", &flags->xla_dump_executions_to,
-                       "Dumps parameters and results of computations that XLA "
-                       "executes into the provided directory path"),
-  });
-  ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with XLA's service module.
-void AppendServiceFlags(std::vector<tensorflow::Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the ServiceFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-ServiceFlags* GetServiceFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/service_flags.h b/tensorflow/compiler/xla/legacy_flags/service_flags.h
deleted file mode 100644
index d982506944daed41eb6e7c4a238d540b38cf8be3..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/service_flags.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_SERVICE_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_SERVICE_FLAGS_H_
-
-// Legacy flags for XLA's service module.
-
-#include <vector>
-
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with XLA's service module.
-void AppendServiceFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with XLA's service module.
-typedef struct {
-  bool xla_hlo_profile;  // Instrument the computation to collect per-HLO cycle
-                         // counts
-  string xla_log_hlo_text;  // If non-empty, print the text format of the HLO
-                            // modules whose name partially
-                            // matches this regex.  E.g. xla_log_hlo_text=.*
-                            // will dump the text for every module.
-  string xla_generate_hlo_graph;  // If non-empty, dump graph of HLO modules
-                                  // whose name partially matches this regex.
-                                  // E.g. --xla_generate_hlo_graph=.* will dump
-                                  // the graph of every module.
-  bool xla_hlo_graph_addresses;   // Show addresses of HLO ops in graph
-  bool xla_hlo_graph_layout;      // Show layout of HLO ops in graph
-  bool xla_hlo_graph_for_compute_constant;  // If true, include hlo dumps of
-                                            // graphs from ComputeConstant.
-                                            // Such graphs still need to be
-                                            // matched via
-                                            // xla_generate_hlo_graph.
-  string xla_dump_hlo_text_to;  // Dumps HLO text for each HLO module that is
-                                // executed into the provided directory path
-  string xla_dump_computations_to;  // Dumps computations that XLA executes
-                                    // into the provided directory path
-  // Dumps parameters and results of computations that XLA executes into
-  // the provided directory path
-  string xla_dump_executions_to;
-} ServiceFlags;
-
-// Return a pointer to the ServiceFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-ServiceFlags* GetServiceFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_SERVICE_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/stream_assignment_flags.cc b/tensorflow/compiler/xla/legacy_flags/stream_assignment_flags.cc
deleted file mode 100644
index 6506175777ccd262b6467f8fbe6de8bb24eff945..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/stream_assignment_flags.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for XLA's stream_assignment module.
-
-#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/compiler/xla/legacy_flags/stream_assignment_flags.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static StreamAssignmentFlags* flags;
-static std::vector<tensorflow::Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new StreamAssignmentFlags;
-  flags->xla_gpu_disable_multi_streaming = false;
-  flag_list = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag("xla_gpu_disable_multi_streaming",
-                       &flags->xla_gpu_disable_multi_streaming,
-                       "Disable multi-streaming in XLA's GPU backend"),
-  });
-  ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with XLA's stream_assignment
-// module.
-void AppendStreamAssignmentFlags(std::vector<tensorflow::Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the StreamAssignmentFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-StreamAssignmentFlags* GetStreamAssignmentFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/stream_assignment_flags.h b/tensorflow/compiler/xla/legacy_flags/stream_assignment_flags.h
deleted file mode 100644
index a98f9b34584b43161aa8e3248c28d520403f3f3a..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/stream_assignment_flags.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_STREAM_ASSIGNMENT_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_STREAM_ASSIGNMENT_FLAGS_H_
-
-// Legacy flags for XLA's stream_assignment module.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with XLA's stream_assignment
-// module.
-void AppendStreamAssignmentFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with XLA's stream_assignment module.
-typedef struct {
-  bool xla_gpu_disable_multi_streaming;  // Disable multi-streaming in XLA's GPU
-                                         // backend
-} StreamAssignmentFlags;
-
-// Return a pointer to the StreamAssignmentFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-StreamAssignmentFlags* GetStreamAssignmentFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_STREAM_ASSIGNMENT_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/user_computation_flags.cc b/tensorflow/compiler/xla/legacy_flags/user_computation_flags.cc
deleted file mode 100644
index a9597d0cd8f89d7d664c38b79d225b0aa6b6b13b..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/user_computation_flags.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/compiler/xla/legacy_flags/user_computation_flags.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static UserComputationFlags* flags;
-static std::vector<tensorflow::Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new UserComputationFlags;
-  flags->xla_eliminate_hlo_implicit_broadcast = false;
-  flag_list = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag("xla_eliminate_hlo_implicit_broadcast",
-                       &flags->xla_eliminate_hlo_implicit_broadcast,
-                       "Eliminate implicit broadcast on when lowering user "
-                       "computation to HLO instructions, use explicit "
-                       "broadcast instead."),
-  });
-  ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with XLA's hlo_pass_pipeline
-// module.
-void AppendUserComputationFlags(std::vector<tensorflow::Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the UserComputationFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-UserComputationFlags* GetUserComputationFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/user_computation_flags.h b/tensorflow/compiler/xla/legacy_flags/user_computation_flags.h
deleted file mode 100644
index f5222c927cb203b901fb3bc6ea3d2e7d30cb658a..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/user_computation_flags.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_USER_COMPUTATION_FLAGS_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_USER_COMPUTATION_FLAGS_H_
-
-// Legacy flags for XLA's user_computation module.
-
-#include <vector>
-
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Append to *flag_list flags definitions associated with XLA's user_computation
-// module.
-void AppendUserComputationFlags(std::vector<tensorflow::Flag>* flag_list);
-
-typedef struct {
-  // Eliminate implicit broadcast on when lowering user computation to HLO
-  // instructions, use explicit broadcast instead.
-  bool xla_eliminate_hlo_implicit_broadcast;
-} UserComputationFlags;
-
-// Return a pointer to the UserComputationFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-UserComputationFlags* GetUserComputationFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_USER_COMPUTATION_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/util_flags.cc b/tensorflow/compiler/xla/legacy_flags/util_flags.cc
deleted file mode 100644
index e6df19ddd2afbbf14149d77a1e0652df209f58fe..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/util_flags.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for XLA's util module.
-
-#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/compiler/xla/legacy_flags/util_flags.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static UtilFlags* flags;
-static std::vector<tensorflow::Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new UtilFlags;
-  flags->xla_status_add_backtrace = false;
-  flag_list = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag("xla_status_add_backtrace",
-                       &flags->xla_status_add_backtrace,
-                       "add backtraces to XLA-produced status values"),
-  });
-  ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with XLA's util module.
-void AppendUtilFlags(std::vector<tensorflow::Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the UtilFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-UtilFlags* GetUtilFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/util_flags.h b/tensorflow/compiler/xla/legacy_flags/util_flags.h
deleted file mode 100644
index 03bffcd726f0544a185f5e8403ad2c45318bd0ad..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/util_flags.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_UTIL_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_UTIL_FLAGS_H_
-
-// Legacy flags for the XLA's util module.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with XLA's util module.
-void AppendUtilFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with XLA's util module.
-typedef struct {
-  bool xla_status_add_backtrace;  // add backtraces to XLA-produced statuses
-} UtilFlags;
-
-// Return a pointer to the UtilFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-UtilFlags* GetUtilFlags();
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_UTIL_FLAGS_H_
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index caef3a3869f4bcde7a6982ce3dfc0db9d36cbc5e..0db9bd757d420d8ecf281b6ec936c3f34ee23617 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -62,7 +62,17 @@ Literal::StrideConfig::StrideConfig(
 std::unique_ptr<Literal> Literal::CreateFromShape(const Shape& shape) {
   auto literal = MakeUnique<Literal>();
   *literal->mutable_shape() = shape;
-  literal->Reserve(ShapeUtil::ElementsIn(literal->shape()));
+  if (ShapeUtil::IsTuple(shape)) {
+    int64 num_elements = ShapeUtil::TupleElementCount(shape);
+    literal->tuple_literals_.resize(num_elements);
+    for (int i = 0; i < num_elements; ++i) {
+      std::unique_ptr<Literal> elem =
+          CreateFromShape(ShapeUtil::GetTupleElementShape(shape, i));
+      literal->tuple_literals_[i] = std::move(*elem);
+    }
+  } else {
+    literal->Reserve(ShapeUtil::ElementsIn(literal->shape()));
+  }
   return literal;
 }
 
@@ -321,6 +331,7 @@ Status Literal::Copy(const Literal& src_literal,
 }
 
 std::unique_ptr<Literal> Literal::Relayout(const Layout& layout) const {
+  CHECK(ShapeUtil::IsArray(shape()));
   std::unique_ptr<Literal> result = CloneToUnique();
   *result->mutable_shape()->mutable_layout() = layout;
 
@@ -620,6 +631,18 @@ string Literal::ToString() const {
   return literal;
 }
 
+/* static */ std::unique_ptr<Literal> Literal::MakeTupleOwned(
+    std::vector<std::unique_ptr<Literal>> elements) {
+  auto literal = MakeUnique<Literal>();
+  std::vector<Shape> shape;
+  for (auto& tuple_element : elements) {
+    shape.push_back(tuple_element->shape());
+    literal->add_tuple_literals()->Swap(tuple_element.get());
+  }
+  *literal->mutable_shape() = ShapeUtil::MakeTupleShape(shape);
+  return literal;
+}
+
 const void* Literal::InternalData() const {
   return const_cast<const void*>(
       const_cast<Literal*>(this)->MutableInternalData());
@@ -630,7 +653,6 @@ void* Literal::MutableInternalData() {
   // created by the accessor functions.
   switch (shape().element_type()) {
     case PRED:
-      return reinterpret_cast<void*>(preds_.data());
     case U8:
       return reinterpret_cast<void*>(u8s_.data());
     case S32:
@@ -698,8 +720,6 @@ tensorflow::Status Literal::ValidateLiteral() const {
   int64 actual = -1;
   switch (shape().element_type()) {
     case PRED:
-      actual = preds_size();
-      break;
     case U8:
       actual = u8s_size();
       break;
@@ -754,10 +774,30 @@ void Literal::EachCellAsString(
 }
 
 namespace {
+template <typename NativeSrcT, typename NativeDestT>
+std::unique_ptr<Literal> ConvertBetweenNativeTypes(const Literal& src_literal) {
+  auto result_literal = MakeUnique<Literal>();
+  Shape* result_shape = result_literal->mutable_shape();
+  *result_shape = src_literal.shape();
+  result_shape->set_element_type(
+      primitive_util::NativeToPrimitiveType<NativeDestT>());
+  result_literal->Reserve(ShapeUtil::ElementsIn(*result_shape));
+  tensorflow::gtl::ArraySlice<NativeSrcT> src_data =
+      src_literal.GetArraySlice<NativeSrcT>();
+  tensorflow::gtl::MutableArraySlice<NativeDestT> dest_data =
+      result_literal->GetMutableArraySlice<NativeDestT>();
+  int64 num_elements = ShapeUtil::ElementsIn(src_literal.shape());
+
+  for (int64 i = 0; i < num_elements; ++i) {
+    dest_data[i] = static_cast<NativeDestT>(src_data[i]);
+  }
+  return result_literal;
+}
+
 template <PrimitiveType primitive_src_type, PrimitiveType primitive_dest_type>
 std::unique_ptr<Literal> ConvertIfTypesMatch(const Literal& src_literal) {
   CHECK_EQ(primitive_src_type, src_literal.shape().element_type());
-  return LiteralUtil::Convert<
+  return ConvertBetweenNativeTypes<
       typename primitive_util::PrimitiveTypeToNative<primitive_src_type>::type,
       typename primitive_util::PrimitiveTypeToNative<
           primitive_dest_type>::type>(src_literal);
@@ -782,19 +822,20 @@ StatusOr<std::unique_ptr<Literal>> ConvertIfDestTypeMatches(
 #undef CONVERT_IF_TYPES_MATCH
     // Other types are not yet supported.
     default:
-      return tensorflow::errors::InvalidArgument(
-          "Unimplemented: ConvertIfDestTypeMatches for type " +
-          PrimitiveType_Name(src_literal.shape().element_type()));
+      return InvalidArgument(
+          "Unimplemented: Convert from type %s to type %s",
+          PrimitiveType_Name(src_literal.shape().element_type()).c_str(),
+          PrimitiveType_Name(primitive_dest_type).c_str());
   }
 }
-}
+}  // namespace
 
-StatusOr<std::unique_ptr<Literal>> LiteralUtil::ConvertIfSrcTypeMatches(
-    const Literal& src_literal, PrimitiveType primitive_dest_type) {
-  switch (src_literal.shape().element_type()) {
+StatusOr<std::unique_ptr<Literal>> Literal::Convert(
+    PrimitiveType primitive_dest_type) const {
+  switch (shape().element_type()) {
 #define CONVERT_IF_DEST_TYPE_MATCHES(type) \
   case (type):                             \
-    return ConvertIfDestTypeMatches<(type)>(src_literal, primitive_dest_type);
+    return ConvertIfDestTypeMatches<(type)>(*this, primitive_dest_type);
     CONVERT_IF_DEST_TYPE_MATCHES(PRED)
     CONVERT_IF_DEST_TYPE_MATCHES(S8)
     CONVERT_IF_DEST_TYPE_MATCHES(S32)
@@ -807,9 +848,9 @@ StatusOr<std::unique_ptr<Literal>> LiteralUtil::ConvertIfSrcTypeMatches(
 #undef CONVERT_IF_DEST_TYPE_MATCHES
     // Other types are not yet supported.
     default:
-      return tensorflow::errors::InvalidArgument(
-          "Unimplemented: ConvertIfSrcTypeMatches for type " +
-          PrimitiveType_Name(src_literal.shape().element_type()));
+      return InvalidArgument("Unimplemented: Convert from type %s to type %s",
+                             PrimitiveType_Name(shape().element_type()).c_str(),
+                             PrimitiveType_Name(primitive_dest_type).c_str());
   }
 }
 
@@ -884,26 +925,22 @@ bool Literal::Equal(const Literal& literal2) const {
 template <>
 tensorflow::gtl::MutableArraySlice<bool> Literal::GetMutableArraySlice() {
   auto values = mutable_preds();
-  return tensorflow::gtl::MutableArraySlice<bool>(values->data(),
-                                                  values->size());
+  return tensorflow::gtl::MutableArraySlice<bool>(
+      reinterpret_cast<bool*>(values->data()), values->size());
 }
 
 template <>
 tensorflow::gtl::MutableArraySlice<int8> Literal::GetMutableArraySlice() {
-  // C++11 standard, basic_string 21.4.1.5, values should be stored
-  // contiguously. From C++17 a mutable data() member will be provided.
   auto values = mutable_u8s();
   return tensorflow::gtl::MutableArraySlice<int8>(
-      reinterpret_cast<int8*>(&(*values)[0]), values->size());
+      reinterpret_cast<int8*>(values->data()), values->size());
 }
 
 template <>
 tensorflow::gtl::MutableArraySlice<uint8> Literal::GetMutableArraySlice() {
-  // C++11 standard, basic_string 21.4.1.5, values should be stored
-  // contiguously. From C++17 a mutable data() member will be provided.
   auto values = mutable_u8s();
-  return tensorflow::gtl::MutableArraySlice<uint8>(
-      reinterpret_cast<uint8*>(&(*values)[0]), values->size());
+  return tensorflow::gtl::MutableArraySlice<uint8>(values->data(),
+                                                   values->size());
 }
 
 template <>
@@ -965,19 +1002,18 @@ tensorflow::gtl::MutableArraySlice<double> Literal::GetMutableArraySlice() {
 
 template <>
 tensorflow::gtl::MutableArraySlice<half> Literal::GetMutableArraySlice<half>() {
-  // C++11 standard, basic_string 21.4.1.5, values should be stored
-  // contiguously. From C++17 a mutable data() member will be provided.
   // TODO - there is an endianess problem here. fix it, or wait for uint16
   //        support in protobuf
   auto values = mutable_f16s();
-  return tensorflow::gtl::MutableArraySlice<half>(
-      reinterpret_cast<half*>(&(*values)[0]), values->size() / sizeof(half));
+  return tensorflow::gtl::MutableArraySlice<half>(values->data(),
+                                                  values->size());
 }
 
 template <>
 tensorflow::gtl::ArraySlice<bool> Literal::GetArraySlice<bool>() const {
   CHECK_EQ(shape().element_type(), PRED);
-  return tensorflow::gtl::ArraySlice<bool>(preds().data(), preds().size());
+  return tensorflow::gtl::ArraySlice<bool>(
+      reinterpret_cast<const bool*>(preds().data()), preds().size());
 }
 
 template <>
@@ -1027,9 +1063,8 @@ tensorflow::gtl::ArraySlice<double> Literal::GetArraySlice<double>() const {
 template <>
 tensorflow::gtl::ArraySlice<half> Literal::GetArraySlice<half>() const {
   CHECK_EQ(shape().element_type(), F16);
-  return tensorflow::gtl::ArraySlice<half>(
-      reinterpret_cast<const half*>(f16s().data()),
-      f16s().size() / sizeof(half));
+  return tensorflow::gtl::ArraySlice<half>(f16s().data(),
+                                           f16s().size() / sizeof(half));
 }
 
 template <typename NativeT>
@@ -1192,21 +1227,13 @@ static void CopyToRepeatedField(RepeatedFieldT* dest,
   *dest = RepeatedFieldT(src.begin(), src.end());
 }
 
-template <typename RepeatedFieldT>
-static void CopyToRepeatedBoolField(RepeatedFieldT* dest,
-                                    const BoolVector& src) {
-  *dest = RepeatedFieldT(src.begin(), src.end());
-}
-
 LiteralProto Literal::ToProto() const {
   LiteralProto proto;
   proto.Clear();
   *proto.mutable_shape() = shape();
   switch (shape().element_type()) {
     case PRED:
-      if (preds().begin()) {
-        CopyToRepeatedBoolField(proto.mutable_preds(), preds());
-      }
+      CopyToRepeatedField(proto.mutable_preds(), preds());
       break;
     case U8:
       *proto.mutable_u8s() = u8s_string();
@@ -1260,8 +1287,7 @@ void Literal::CopyFromProto(const LiteralProto& literal_proto) {
   *mutable_shape() = literal_proto.shape();
   switch (shape().element_type()) {
     case PRED:
-      *mutable_preds() = BoolVector(literal_proto.preds().begin(),
-                                    literal_proto.preds().end());
+      CopyFromRepeatedField(mutable_preds(), literal_proto.preds());
       break;
     case U8:
       set_u8s(literal_proto.u8s());
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index 42c8b61acec8f4dc661111affc17773b1aa71583..125c268573becad622d880aab9a7f3dd18ab68df 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -49,94 +49,6 @@ limitations under the License.
 
 namespace xla {
 
-// This class is a simple vector of boolean values. It's used to workaround some
-// implementations of std::vector<bool> that use a bitset which does not have
-// the semantics expected by Literal::preds().
-class BoolVector {
- public:
-  typedef bool* iterator;
-  typedef const bool* const_iterator;
-
-  BoolVector() : bits_(nullptr), size_(0), capacity_(0) {}
-
-  BoolVector(const_iterator other_begin, const_iterator other_end)
-      : bits_(nullptr), size_(0), capacity_(0) {
-    if (other_begin && other_end) {
-      resize(other_end - other_begin);
-      memcpy(begin(), other_begin, size());
-    }
-  }
-
-  BoolVector(const BoolVector& other) { CopyFrom(other); }
-
-  BoolVector& operator=(const BoolVector& other) {
-    CopyFrom(other);
-    return *this;
-  }
-
-  void push_back(const bool& value) {
-    resize(size_ + 1);
-    bits_[size_ - 1] = value;
-  }
-
-  bool* data() const { return bits_.get(); }
-
-  size_t size() const { return size_; }
-
-  size_t capacity() const { return capacity_; }
-
-  void resize(size_t new_size, bool val = false) {
-    if (new_size == 0) {
-      bits_.reset(nullptr);
-      size_ = 0;
-      capacity_ = 0;
-    } else {
-      size_t old_size = size();
-      if (new_size > old_size) {
-        grow(new_size);
-      }
-      if (old_size < new_size) {
-        memset(&bits_[old_size], val, new_size - old_size);
-      }
-      size_ = new_size;
-    }
-  }
-
-  void clear() {
-    bits_.reset(nullptr);
-    size_ = 0;
-    capacity_ = 0;
-  }
-
-  iterator begin() { return &bits_[0]; }
-  iterator end() { return &bits_[size()]; }
-  const_iterator begin() const { return &bits_[0]; }
-  const_iterator end() const { return &bits_[size()]; }
-
- private:
-  void grow(size_t n) {
-    if (capacity_ < n) {
-      capacity_ = 2 * n;
-      bool* new_bits = new bool[capacity_]();
-      if (size_ > 0) {
-        memcpy(new_bits, bits_.get(), size_);
-      }
-      bits_.reset(new_bits);
-    }
-  }
-
-  void CopyFrom(const BoolVector& other) {
-    bits_ = MakeUnique<bool[]>(other.capacity());
-    memcpy(begin(), other.begin(), other.size());
-    size_ = other.size();
-    capacity_ = other.capacity();
-  }
-
-  std::unique_ptr<bool[]> bits_;
-  size_t size_;
-  size_t capacity_;
-};
-
 // Utility class for dealing with XLA literal values.  Most methods are
 // templated by native (host) type which corresponds to a unique XLA
 // PrimitiveType. See ComputationBuilder for details.  Not all primitive types
@@ -147,10 +59,12 @@ class Literal {
   Literal() {}
 
   Literal(const Literal& other) = default;
+  Literal(Literal&&) = default;
 
   explicit Literal(const LiteralProto& other) { CopyFromProto(other); }
 
   Literal& operator=(const Literal& other) = default;
+  Literal& operator=(Literal&&) = default;
 
   LiteralProto ToProto() const;
 
@@ -165,7 +79,6 @@ class Literal {
 
   void Clear() {
     shape_.Clear();
-    preds_.clear();
     u8s_.clear();
     s32s_.clear();
     s64s_.clear();
@@ -177,9 +90,17 @@ class Literal {
     tuple_literals_.clear();
   }
 
-  int preds_size() const { return preds().size(); }
-  const BoolVector& preds() const { return preds_; }
-  BoolVector* mutable_preds() { return &preds_; }
+  int preds_size() const { return u8s().size(); }
+  const std::vector<uint8>& preds() const {
+    static_assert(sizeof(uint8) == sizeof(bool),
+                  "The uint8 and bool types should be the same size");
+    return u8s_;
+  }
+  std::vector<uint8>* mutable_preds() {
+    static_assert(sizeof(uint8) == sizeof(bool),
+                  "The uint8 and bool types should be the same size");
+    return &u8s_;
+  }
 
   int s32s_size() const { return s32s().size(); }
   int32 s32s(int i) const { return s32s_[i]; }
@@ -251,7 +172,7 @@ class Literal {
     *other = temp;
   }
 
-  // CreatesCreate new literal of a given rank. To minimize ambiguity (for users
+  // Creates a new literal of a given rank. To minimize ambiguity (for users
   // and the compiler) these CreateR[0-2] methods should explicitly specify the
   // native type. For example:
   //
@@ -362,10 +283,10 @@ class Literal {
   template <typename NativeT>
   std::unique_ptr<Literal> Replicate(int64 times) const;
 
-  // Creates a literal by converting each element in this literal to a new
-  // type.
-  template <typename NativeSrcT, typename NativeDestT>
-  std::unique_ptr<Literal> Convert() const;
+  // Converts this literal to another primitive type. Returns an error if the
+  // conversion is not possible.
+  StatusOr<std::unique_ptr<Literal>> Convert(
+      PrimitiveType primitive_dest_type) const;
 
   // Creates a literal value zero of the given primitive type.
   static Literal Zero(PrimitiveType primitive_type);
@@ -444,10 +365,21 @@ class Literal {
   template <typename NativeT>
   void Set(tensorflow::gtl::ArraySlice<int64> multi_index, NativeT value);
 
-  // Retrieves the mutable array slice interface which can be used to manipulate
-  // pre-allocated literal values.
+  // Returns a (Mutable)ArraySlice view of the array for this literal for the
+  // given NativeT (e.g., float). These functions map native type to XLA
+  // PrimitiveType via template specialization. The unspecialized forms below
+  // aborts to handle the error case where the given native type does not map to
+  // an XLA primitive type.
   template <typename NativeT>
-  tensorflow::gtl::MutableArraySlice<NativeT> GetMutableArraySlice();
+  tensorflow::gtl::ArraySlice<NativeT> GetArraySlice() const {
+    static_assert(!std::is_same<NativeT, NativeT>::value,
+                  "Cannot map native type to primitive type.");
+  }
+  template <typename NativeT>
+  tensorflow::gtl::MutableArraySlice<NativeT> GetMutableArraySlice() {
+    static_assert(!std::is_same<NativeT, NativeT>::value,
+                  "Cannot map native type to primitive type.");
+  }
 
   // Returns the element value at index (0, ..., 0), however many zeroes are
   // required for that index.
@@ -466,6 +398,16 @@ class Literal {
   static std::unique_ptr<Literal> MakeTuple(
       tensorflow::gtl::ArraySlice<const Literal*> elements);
 
+  // As above, but intended to be invoked with move semantics; i.e.
+  //
+  //  std::vector<std::unique_ptr<Literal>> elements = ...;
+  //  auto result = Literal::MakeTupleOwned(std::move(elements));
+  //
+  // This would have been declared as an overload, but there is ambiguity
+  // in invocation between the above signature and this one.
+  static std::unique_ptr<Literal> MakeTupleOwned(
+      std::vector<std::unique_ptr<Literal>> elements);
+
   // Validates that the data payload of the literal matches the literal shape;
   // if it does not, an appropriate status is returned.
   tensorflow::Status ValidateLiteral() const;
@@ -588,17 +530,6 @@ class Literal {
   bool IsZero(tensorflow::gtl::ArraySlice<int64> indices) const;
 
  private:
-  // Returns an ArraySlice view of the array for this literal for the given
-  // NativeT (e.g., float). These functions map native type to XLA PrimitiveType
-  // via template specialization. The unspecialized forms below aborts to handle
-  // the error case where the given native type does not map to an XLA primitive
-  // type.
-  template <typename NativeT>
-  tensorflow::gtl::ArraySlice<NativeT> GetArraySlice() const {
-    static_assert(!std::is_same<NativeT, NativeT>::value,
-                  "Cannot map native type to primitive type.");
-  }
-
   // Copy from a LiteralProto instance.
   void CopyFromProto(const LiteralProto& literal_proto);
 
@@ -634,7 +565,6 @@ class Literal {
   };
 
   Shape shape_;
-  BoolVector preds_;
   std::vector<uint8> u8s_;
   std::vector<int32> s32s_;
   std::vector<int64> s64s_;
@@ -646,544 +576,6 @@ class Literal {
   std::vector<Literal> tuple_literals_;
 };
 
-// Utility class for dealing with XLA literal values.  Most methods are
-// templated by native (host) type which corresponds to a unique XLA
-// PrimitiveType. See ComputationBuilder for details.  Not all primitive types
-// defined in xla_data.proto have a corresponding native type or even have a
-// storage location in the Literal proto yet (for example, primitive type F16).
-//
-// TODO(dnovillo) - All functions in this class simply redirect to the
-// corresponding function in class Literal. Remove this class after converting
-// all user code to use Literal directly.
-class LiteralUtil {
- public:
-  // Creates new literal of a given rank. To minimize ambiguity (for users and
-  // the compiler) these CreateR[0-2] methods should explicitly specify the
-  // native type. For example:
-  //
-  //  CreateR1<float>({1.0, 42.0});
-  //  CreateR2<uint32>({{1, 2}, {3, 4}});
-  //
-  // The variants not ending with WithLayout use the default XLA layout for the
-  // literal's linear representation in memory.
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateR0(NativeT value) {
-    return Literal::CreateR0(value);
-  }
-
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateR1(
-      tensorflow::gtl::ArraySlice<NativeT> values) {
-    return Literal::CreateR1(values);
-  }
-
-  static std::unique_ptr<Literal> CreateR1(
-      const tensorflow::core::Bitmap& values) {
-    return Literal::CreateR1(values);
-  }
-
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateR2(
-      std::initializer_list<std::initializer_list<NativeT>> values) {
-    return Literal::CreateR2(values);
-  }
-
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateR2WithLayout(
-      std::initializer_list<std::initializer_list<NativeT>> values,
-      const Layout& layout) {
-    return Literal::CreateR2WithLayout(values, layout);
-  }
-
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateR3(
-      std::initializer_list<
-          std::initializer_list<std::initializer_list<NativeT>>>
-          values) {
-    return Literal::CreateR3(values);
-  }
-
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateR3WithLayout(
-      std::initializer_list<
-          std::initializer_list<std::initializer_list<NativeT>>>
-          values,
-      const Layout& layout) {
-    return Literal::CreateR3WithLayout(values, layout);
-  }
-
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateR4(
-      std::initializer_list<std::initializer_list<
-          std::initializer_list<std::initializer_list<NativeT>>>>
-          values) {
-    return Literal::CreateR4(values);
-  }
-
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateR4WithLayout(
-      std::initializer_list<std::initializer_list<
-          std::initializer_list<std::initializer_list<NativeT>>>>
-          values,
-      const Layout& layout) {
-    return Literal::CreateR4WithLayout(values, layout);
-  }
-
-  // Creates a new Literal object with the shape specified as parameter.
-  // The content of the literal values is the default value of the primitive
-  // type of literal itself (0 for numeric types, and false for predicates).
-  static std::unique_ptr<Literal> CreateFromShape(const Shape& shape) {
-    return Literal::CreateFromShape(shape);
-  }
-
-  // Creates a new Literal object with its values havings the primitive_type
-  // type, and with dimensions defined by the dimensions parameter.
-  // The content of the literal values is the default value of the primitive
-  // type of literal itself (0 for numeric types, and false for predicates).
-  static std::unique_ptr<Literal> CreateFromDimensions(
-      PrimitiveType primitive_type,
-      tensorflow::gtl::ArraySlice<int64> dimensions) {
-    return Literal::CreateFromDimensions(primitive_type, dimensions);
-  }
-
-  // Copies the values from src_literal, starting at src_base shape indexes,
-  // to dest_literal, starting at dest_base, where the copy size in each
-  // dimension is specified by copy_size.
-  //
-  // The src_literal and dest_literal must have the same primitive type,
-  // src_base+copy_size must fit the source literal dimensions, as well as
-  // dest_base+copy_size must fit the destination literal dimensions.
-  static Status Copy(const Literal& src_literal,
-                     tensorflow::gtl::ArraySlice<int64> src_base,
-                     Literal* dest_literal,
-                     tensorflow::gtl::ArraySlice<int64> dest_base,
-                     tensorflow::gtl::ArraySlice<int64> copy_size) {
-    return dest_literal->Copy(src_literal, src_base, dest_base, copy_size);
-  }
-
-  // Creates a new value that has the equivalent value as literal, but conforms
-  // to new_layout; e.g. a literal matrix that was in {0, 1} minor-to-major
-  // dimension layout can be re-laid-out as {1, 0} minor-to-major dimension
-  // layout and the value in the cell at any given logical index (i0, i1) will
-  // be the same.
-  //
-  // Note: this is useful when the client wants to ensure that a value placed in
-  // the XLA allocation tracker has a particular layout; for efficiency
-  // purposes or avoiding unimplemented operation/layout combinations.
-  static std::unique_ptr<Literal> Relayout(const Literal& literal,
-                                           const Layout& new_layout) {
-    return literal.Relayout(new_layout);
-  }
-
-  // Reshapes literal 'input' to have 'shape'. Both the original shape and
-  // 'shape' must contain the same number of elements. The implementation
-  // currently only supports monotonic dim0-major layouts.
-  static StatusOr<std::unique_ptr<Literal>> Reshape(
-      const xla::Literal& input, tensorflow::gtl::ArraySlice<int64> shape) {
-    return input.Reshape(shape);
-  }
-
-  // Creates a new literal by reordering the dimensions of the original literal.
-  // The given `permutation` must be a permutation of the dimension numbers
-  // in the original literal, and it specifies the order of the new dimensions
-  // in the result literal (i.e., new_order[i] = old_order[permutation[i]]).
-  // For example, a transpose call on a literal of shape [3 x 8 x 4] and
-  // `permutation` = {2, 0, 1} returns a new literal of shape [4 x 3 x 8].
-  static std::unique_ptr<Literal> Transpose(
-      const Literal& literal, tensorflow::gtl::ArraySlice<int64> permutation) {
-    return literal.Transpose(permutation);
-  }
-
-  // Creates a sub-array from the given literal by extracting the indices
-  // [start_index, limit_index) of each dimension. The result literal has the
-  // same rank and layout as for the given literal. The number of indices in
-  // start_indices and limit_indices must be the rank of the literal, and the
-  // indices follow the order of the dimensions.
-  static std::unique_ptr<Literal> Slice(
-      const Literal& literal, tensorflow::gtl::ArraySlice<int64> start_indices,
-      tensorflow::gtl::ArraySlice<int64> limit_indices) {
-    return literal.Slice(start_indices, limit_indices);
-  }
-
-  // Creates a literal with a prepended dimension with bound "times"; e.g. a
-  // f32[3x2] with times=4 will produce a f32[4x3x2] with the 3x2 from the input
-  // literal replicated four times.
-  template <typename NativeT>
-  static std::unique_ptr<Literal> Replicate(const Literal& input, int64 times) {
-    return input.Replicate<NativeT>(times);
-  }
-
-  // Creates a literal by converting each element in an original literal to a
-  // new type.
-  template <typename NativeSrcT, typename NativeDestT>
-  static std::unique_ptr<Literal> Convert(const Literal& literal) {
-    return literal.Convert<NativeSrcT, NativeDestT>();
-  }
-
-  // Convert a literal to another primitive type, but only if the literal
-  // type is connvertable into the destination type
-  static StatusOr<std::unique_ptr<Literal>> ConvertIfSrcTypeMatches(
-      const Literal& src_literal, PrimitiveType primitive_dest_type);
-
-  // Creates a literal value zero of the given primitive type.
-  static Literal Zero(PrimitiveType primitive_type) {
-    return Literal::Zero(primitive_type);
-  }
-
-  // Creates a literal value one of the given primitive type.
-  static Literal One(PrimitiveType primitive_type) {
-    return Literal::One(primitive_type);
-  }
-
-  // Creates a literal value containing the minimum value of the given
-  // primitive type. For floating-point types, returns -inf.
-  static Literal MinValue(PrimitiveType primitive_type) {
-    return Literal::MinValue(primitive_type);
-  }
-
-  // Creates a literal value containing the maximum value of the given
-  // primitive type. For floating-point types, returns inf.
-  static Literal MaxValue(PrimitiveType primitive_type) {
-    return Literal::MaxValue(primitive_type);
-  }
-
-  // Creates a literal of the given shape where each element is `value`.
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateFullWithMonotonicDim0MajorLayout(
-      tensorflow::gtl::ArraySlice<int64> dimensions, NativeT value) {
-    return Literal::CreateFullWithMonotonicDim0MajorLayout(dimensions, value);
-  }
-
-  // Creates a new literal from an array. The variants not ending with
-  // WithLayout use the default XLA layout for the literal's linear
-  // representation in memory.
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateR2FromArray2D(
-      const Array2D<NativeT>& values) {
-    return Literal::CreateR2FromArray2D(values);
-  }
-
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateR2FromArray2DWithLayout(
-      const Array2D<NativeT>& values, const Layout& layout) {
-    return Literal::CreateR2FromArray2DWithLayout(values, layout);
-  }
-
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateR3FromArray3D(
-      const Array3D<NativeT>& values) {
-    return Literal::CreateR3FromArray3D(values);
-  }
-
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateR3FromArray3DWithLayout(
-      const Array3D<NativeT>& values, const Layout& layout) {
-    return Literal::CreateR3FromArray3DWithLayout(values, layout);
-  }
-
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateR4FromArray4D(
-      const Array4D<NativeT>& values) {
-    return Literal::CreateR4FromArray4D(values);
-  }
-
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateR4FromArray4DWithLayout(
-      const Array4D<NativeT>& values, const Layout& layout) {
-    return Literal::CreateR4FromArray4DWithLayout(values, layout);
-  }
-
-  // Creates a new vector of U8s literal value from a string.
-  static std::unique_ptr<Literal> CreateR1U8(tensorflow::StringPiece value) {
-    return Literal::CreateR1U8(value);
-  }
-
-  // Creates a linspace-populated literal with the given number of rows and
-  // columns.
-  static std::unique_ptr<Literal> CreateR2F32Linspace(float from, float to,
-                                                      int64 rows, int64 cols) {
-    return Literal::CreateR2F32Linspace(from, to, rows, cols);
-  }
-
-  // Creates a literal that projects the (x, y) dimensions given in values into
-  // the z dimension given by "projection".
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateR3Projected(
-      std::initializer_list<std::initializer_list<NativeT>> values,
-      int64 projection) {
-    return Literal::CreateR3Projected(values, projection);
-  }
-
-  // Creates a literal that projects the (x, y) dimensions given in values into
-  // the z and p dimensions given.
-  template <typename NativeT>
-  static std::unique_ptr<Literal> CreateR4Projected(
-      std::initializer_list<std::initializer_list<NativeT>> values,
-      int64 projection_p, int64 projection_z) {
-    return Literal::CreateR4Projected(values, projection_p, projection_z);
-  }
-
-  // Clones literal into an owned unique_ptr version.
-  static std::unique_ptr<Literal> CloneToUnique(const Literal& literal) {
-    return literal.CloneToUnique();
-  }
-
-  // Returns the linear index of the given index within the literal's
-  // element_type repeated field.
-  static int64 LinearIndex(const Literal& literal,
-                           tensorflow::gtl::ArraySlice<int64> multi_index) {
-    return literal.LinearIndex(multi_index);
-  }
-
-  // Gets or sets an element in the literal at the given index. The index is
-  // CHECKed against the dimension sizes.
-  template <typename NativeT>
-  static NativeT Get(const Literal& literal,
-                     tensorflow::gtl::ArraySlice<int64> multi_index) {
-    return literal.Get<NativeT>(multi_index);
-  }
-
-  template <typename NativeT>
-  static void Set(Literal* literal,
-                  tensorflow::gtl::ArraySlice<int64> multi_index,
-                  NativeT value) {
-    literal->Set(multi_index, value);
-  }
-
-  // Retrieves the mutable array slice interface which can be used to manipulate
-  // pre-allocated literal values.
-  template <typename NativeT>
-  static tensorflow::gtl::MutableArraySlice<NativeT> GetMutableArraySlice(
-      Literal* literal) {
-    return literal->GetMutableArraySlice<NativeT>();
-  }
-
-  // Returns the element value at index (0, ..., 0), however many zeroes are
-  // required for that index.
-  template <typename NativeT>
-  static NativeT GetFirstElement(const Literal& literal) {
-    return literal.GetFirstElement<NativeT>();
-  }
-
-  // As Get(), but determines the correct type and converts the value
-  // into text.
-  static string GetAsString(const Literal& literal,
-                            tensorflow::gtl::ArraySlice<int64> multi_index) {
-    return literal.GetAsString(multi_index);
-  }
-
-  // Returns an identity matrix (rank 2) with the given row and column count.
-  template <typename NativeT>
-  static std::unique_ptr<Literal> MakeIdentityR2(int64 size) {
-    return Literal::MakeIdentityR2<NativeT>(size);
-  }
-
-  // Returns a tuple literal composed of given literals.
-  static std::unique_ptr<Literal> MakeTuple(
-      tensorflow::gtl::ArraySlice<const Literal*> elements) {
-    return Literal::MakeTuple(elements);
-  }
-
-  // Validates that the data payload of the literal matches the literal shape;
-  // if it does not, an appropriate status is returned.
-  static tensorflow::Status ValidateLiteral(const Literal& literal) {
-    return literal.ValidateLiteral();
-  }
-
-  // Returns a string representation of the literal value.
-  static string ToString(const Literal& literal) { return literal.ToString(); }
-
-  // Invokes the "per cell" callback for each element in the provided
-  // literal with the element's indices and a string representation of
-  // the element's value.
-  //
-  // This function is useful if you want a polymorphic representation
-  // of the tensor's elements (turning it to a string for something
-  // like representation in a protobuf).
-  static void EachCellAsString(
-      const Literal& literal,
-      const std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
-                               const string& value)>& per_cell) {
-    literal.EachCellAsString(per_cell);
-  }
-
-  template <typename NativeT>
-  static void EachCell(
-      const Literal& literal,
-      std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
-                         NativeT value)>
-          per_cell) {
-    literal.EachCell<NativeT>(per_cell);
-  }
-
-  // Templated methods which populate the given repeated field in the Literal
-  // proto with the given value(s). The Shape field of the Literal proto is set
-  // to match the array dimensions and type. Examples:
-  //
-  //   // Populate with floats.
-  //   Array2D<float> float_values = ...
-  //   PopulateR2FromArray2D(values, literal);
-  //
-  //   // Populate with int32s.
-  //   PopulateR2({{1, 2}, {3, 4}}, literal);
-  //
-  template <typename NativeT>
-  static void PopulateR0(NativeT values, Literal* literal) {
-    literal->PopulateR0(values);
-  }
-
-  template <typename NativeT>
-  static void PopulateR1(tensorflow::gtl::ArraySlice<NativeT> values,
-                         Literal* literal) {
-    literal->PopulateR1(values);
-  }
-
-  static void PopulateR1(const tensorflow::core::Bitmap& values,
-                         Literal* literal) {
-    literal->PopulateR1(values);
-  }
-
-  template <typename NativeT>
-  static void PopulateR2(
-      std::initializer_list<std::initializer_list<NativeT>> values,
-      Literal* literal) {
-    literal->PopulateR2(values);
-  }
-
-  template <typename NativeT>
-  static void PopulateR2WithLayout(
-      std::initializer_list<std::initializer_list<NativeT>> values,
-      const Layout& layout, Literal* literal) {
-    literal->PopulateR2WithLayout(values, layout);
-  }
-
-  template <typename NativeT>
-  static void PopulateR2FromArray2D(const Array2D<NativeT>& values,
-                                    Literal* literal) {
-    literal->PopulateR2FromArray2D(values);
-  }
-
-  template <typename NativeT>
-  static void PopulateR2FromArray2DWithLayout(const Array2D<NativeT>& values,
-                                              const Layout& layout,
-                                              Literal* literal) {
-    literal->PopulateR2FromArray2DWithLayout(values, layout);
-  }
-
-  template <typename NativeT>
-  static void PopulateR3FromArray3D(const Array3D<NativeT>& values,
-                                    Literal* literal) {
-    literal->PopulateR3FromArray3D(values);
-  }
-
-  template <typename NativeT>
-  static void PopulateR3FromArray3DWithLayout(const Array3D<NativeT>& values,
-                                              const Layout& layout,
-                                              Literal* literal) {
-    literal->PopulateR3FromArray3DWithLayout(values, layout);
-  }
-
-  template <typename NativeT>
-  static void PopulateR4FromArray4D(const Array4D<NativeT>& values,
-                                    Literal* literal) {
-    literal->PopulateR4FromArray4D(values);
-  }
-
-  template <typename NativeT>
-  static void PopulateR4FromArray4DWithLayout(const Array4D<NativeT>& values,
-                                              const Layout& layout,
-                                              Literal* literal) {
-    literal->PopulateR4FromArray4DWithLayout(values, layout);
-  }
-
-  // Populates literal values by calling the generator function for every cell
-  // in the literal object.
-  template <typename NativeT>
-  static Status Populate(
-      Literal* literal,
-      const std::function<NativeT(tensorflow::gtl::ArraySlice<int64> indexes)>&
-          generator) {
-    return literal->Populate(generator);
-  }
-
-  // Creates a Literal of the given dimensions with all elements set to the
-  // given value.
-  template <typename NativeT>
-  static void PopulateWithValue(NativeT value,
-                                tensorflow::gtl::ArraySlice<int64> dimensions,
-                                Literal* literal) {
-    return literal->PopulateWithValue(value, dimensions);
-  }
-
-  // Returns a pointer to the underlying vector containing the array data. Use
-  // with care.
-  static const void* InternalData(const Literal& literal) {
-    return literal.InternalData();
-  }
-
-  static void* MutableInternalData(Literal* literal) {
-    return literal->MutableInternalData();
-  }
-
-  // Allocates space in the underlying vector of the literal sufficient to hold
-  // num_elements of the literal's primitive type. Values in the vector are set
-  // to zero. num_elements must equal the number of elements in the literals
-  // shape.
-  static void Reserve(int64 num_elements, Literal* literal) {
-    literal->Reserve(num_elements);
-  }
-
-  // Allocates space in the underlying vector of the literal sufficient to hold
-  // num_elements of the literal's primitive type and sets each element in the
-  // literal to the given value. num_elements must equal the number of elements
-  // in the literals shape.
-  template <typename NativeT>
-  static void Resize(int64 num_elements, NativeT value, Literal* literal) {
-    literal->Resize(num_elements, value);
-  }
-
-  // Returns true if the two given literals have the same shape and
-  // values. Layout is not considered in the comparison.
-  static bool Equal(const Literal& literal1, const Literal& literal2) {
-    return literal1.Equal(literal2);
-  }
-
-  // Returns whether every element in the given literal is equal to value.
-  //
-  // value is an int8 because we expect this to be called with small
-  // compile-time constants (0, -1, etc.) and so that whatever value you pass
-  // can be represented exactly by floating-point types as small as 16 bits.
-  //
-  // If value doesn't fit in literal's type, returns false.  Values of 1/0 are
-  // considered equal to true/false; other values are not considered equal to
-  // true.
-  static bool IsAll(const Literal& literal, int8 value) {
-    return literal.IsAll(value);
-  }
-
-  // Like IsAll(const Literal&, int8), except we check whether the literal is
-  // equal to a particular floating-point number.
-  //
-  // If the literal is not a floating-point value, this always returns false.
-  //
-  // This casts value to the type of literal, then compares using ==.  The usual
-  // admonishments about floating-point equality checks apply.  We expect you to
-  // use this to check for values that can be expressed precisely as a float,
-  // e.g. -0.5.
-  static bool IsAllFloat(const Literal& literal, float value) {
-    return literal.IsAllFloat(value);
-  }
-
-  // Returns whether the literal is zero at the specified index. The literal
-  // must be an array.
-  static bool IsZero(const Literal& literal,
-                     tensorflow::gtl::ArraySlice<int64> indices) {
-    return literal.IsZero(indices);
-  }
-
-  TF_DISALLOW_COPY_AND_ASSIGN(LiteralUtil);
-};
-
 // Declarations of template specializations for GetArraySlice and
 // GetMutableArraySlice. The specializations map native type to XLA primitive
 // type.
@@ -1759,27 +1151,6 @@ void Literal::PopulateWithValue(NativeT value,
   Resize<NativeT>(ShapeUtil::ElementsIn(shape()), value);
 }
 
-template <typename NativeSrcT, typename NativeDestT>
-std::unique_ptr<Literal> Literal::Convert() const {
-  const Shape& this_shape = shape();
-  auto result_literal = MakeUnique<Literal>();
-  Shape* result_shape = result_literal->mutable_shape();
-  *result_shape = this_shape;
-  result_shape->set_element_type(
-      primitive_util::NativeToPrimitiveType<NativeDestT>());
-  result_literal->Reserve(ShapeUtil::ElementsIn(*result_shape));
-  tensorflow::gtl::ArraySlice<NativeSrcT> src_data =
-      GetArraySlice<NativeSrcT>();
-  tensorflow::gtl::MutableArraySlice<NativeDestT> dest_data =
-      result_literal->GetMutableArraySlice<NativeDestT>();
-  int64 num_elements = ShapeUtil::ElementsIn(this_shape);
-
-  for (int64 i = 0; i < num_elements; ++i) {
-    dest_data[i] = static_cast<NativeDestT>(src_data[i]);
-  }
-  return result_literal;
-}
-
 template <typename NativeT>
 /* static */ std::unique_ptr<Literal>
 Literal::CreateFullWithMonotonicDim0MajorLayout(
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index 8d4a75d7affebd3ee39702cb1226ee52aff09691..b50e741b8ad55173d932231836abd5996cf1a068 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -72,11 +72,11 @@ class LiteralUtilTest : public ::testing::Test {
     layout_r4_dim0minor_ = LayoutUtil::MakeLayout({0, 1, 2, 3});
 
     literal_r4_2x2x3x3_dim0major_ =
-        LiteralUtil::CreateR4FromArray4DWithLayout<float>(arr4d,
-                                                          layout_r4_dim0major_);
+        Literal::CreateR4FromArray4DWithLayout<float>(arr4d,
+                                                      layout_r4_dim0major_);
     literal_r4_2x2x3x3_dim0minor_ =
-        LiteralUtil::CreateR4FromArray4DWithLayout<float>(arr4d,
-                                                          layout_r4_dim0minor_);
+        Literal::CreateR4FromArray4DWithLayout<float>(arr4d,
+                                                      layout_r4_dim0minor_);
   }
 
   Layout layout_r2_dim0major_;
@@ -90,43 +90,42 @@ class LiteralUtilTest : public ::testing::Test {
 };
 
 TEST_F(LiteralUtilTest, LiteralScalarToString) {
-  auto true_lit = LiteralUtil::CreateR0<bool>(true);
-  ASSERT_EQ("true", LiteralUtil::ToString(*true_lit));
+  auto true_lit = Literal::CreateR0<bool>(true);
+  ASSERT_EQ("true", true_lit->ToString());
 
-  auto false_lit = LiteralUtil::CreateR0<bool>(false);
-  ASSERT_EQ("false", LiteralUtil::ToString(*false_lit));
+  auto false_lit = Literal::CreateR0<bool>(false);
+  ASSERT_EQ("false", false_lit->ToString());
 
-  auto u32_lit = LiteralUtil::CreateR0<uint32>(42);
-  ASSERT_EQ("42", LiteralUtil::ToString(*u32_lit));
+  auto u32_lit = Literal::CreateR0<uint32>(42);
+  ASSERT_EQ("42", u32_lit->ToString());
 
-  auto s32_lit = LiteralUtil::CreateR0<int32>(-999);
-  ASSERT_EQ("-999", LiteralUtil::ToString(*s32_lit));
+  auto s32_lit = Literal::CreateR0<int32>(-999);
+  ASSERT_EQ("-999", s32_lit->ToString());
 
-  auto f32_lit = LiteralUtil::CreateR0<float>(3.14f);
-  ASSERT_EQ("3.14", LiteralUtil::ToString(*f32_lit));
+  auto f32_lit = Literal::CreateR0<float>(3.14f);
+  ASSERT_EQ("3.14", f32_lit->ToString());
 
-  auto f16_lit = LiteralUtil::CreateR0<half>(static_cast<half>(0.5f));
-  ASSERT_EQ("0.5", LiteralUtil::ToString(*f16_lit));
+  auto f16_lit = Literal::CreateR0<half>(static_cast<half>(0.5f));
+  ASSERT_EQ("0.5", f16_lit->ToString());
 }
 
 TEST_F(LiteralUtilTest, LiteralVectorToString) {
-  auto pred_vec = LiteralUtil::CreateR1<bool>({true, false, true});
-  ASSERT_EQ("{101}", LiteralUtil::ToString(*pred_vec));
+  auto pred_vec = Literal::CreateR1<bool>({true, false, true});
+  ASSERT_EQ("{101}", pred_vec->ToString());
 }
 
 TEST_F(LiteralUtilTest, R2ToString) {
-  const auto literal = LiteralUtil::CreateR2({{1, 2}, {3, 4}, {5, 6}});
+  const auto literal = Literal::CreateR2({{1, 2}, {3, 4}, {5, 6}});
   const string expected = R"(s32[3,2] {
   { 1, 2 },
   { 3, 4 },
   { 5, 6 },
 })";
-  ASSERT_EQ(expected, LiteralUtil::ToString(*literal));
+  ASSERT_EQ(expected, literal->ToString());
 }
 
 TEST_F(LiteralUtilTest, R3ToString) {
-  const auto literal =
-      LiteralUtil::CreateR3({{{1}, {2}}, {{3}, {4}}, {{5}, {6}}});
+  const auto literal = Literal::CreateR3({{{1}, {2}}, {{3}, {4}}, {{5}, {6}}});
   const string expected = R"(s32[3,2,1] {
 { { 1 },
   { 2 } },
@@ -135,13 +134,13 @@ TEST_F(LiteralUtilTest, R3ToString) {
 { { 5 },
   { 6 } }
 })";
-  ASSERT_EQ(expected, LiteralUtil::ToString(*literal));
+  ASSERT_EQ(expected, literal->ToString());
 }
 
 TEST_F(LiteralUtilTest, TupleToString) {
-  auto scalar = LiteralUtil::CreateR0<float>(1.0);
-  auto matrix = LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
-  auto tuple = LiteralUtil::MakeTuple({scalar.get(), matrix.get()});
+  auto scalar = Literal::CreateR0<float>(1.0);
+  auto matrix = Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
+  auto tuple = Literal::MakeTuple({scalar.get(), matrix.get()});
   const string expected = R"((f32[], f32[2,2]) (
 1,
 f32[2,2] {
@@ -149,7 +148,7 @@ f32[2,2] {
   { 3, 4 },
 },
 ))";
-  ASSERT_EQ(expected, LiteralUtil::ToString(*tuple));
+  ASSERT_EQ(expected, tuple->ToString());
 }
 
 TEST_F(LiteralUtilTest, CreateR3FromArray3d) {
@@ -164,9 +163,9 @@ TEST_F(LiteralUtilTest, CreateR3FromArray3d) {
   });
   // clang-format on
 
-  auto literal = LiteralUtil::CreateR3FromArray3D(array_3d);
+  auto literal = Literal::CreateR3FromArray3D(array_3d);
   EXPECT_THAT(literal->shape().dimensions(), ElementsAre(2, 3, 2));
-  string result = LiteralUtil::ToString(*literal);
+  string result = literal->ToString();
   const string expected = R"(f32[2,3,2] {
 { { 1, 2 },
   { 3, 4 },
@@ -180,14 +179,14 @@ TEST_F(LiteralUtilTest, CreateR3FromArray3d) {
 
 TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) {
   // clang-format off
-  auto literal = LiteralUtil::CreateR4Projected<float>({
+  auto literal = Literal::CreateR4Projected<float>({
     {1, 2},
     {1001, 1002},
     {2001, 2002},
   }, /*projection_p=*/1, /*projection_z=*/2);
   // clang-format on
   EXPECT_THAT(literal->shape().dimensions(), ElementsAre(1, 2, 3, 2));
-  string result = LiteralUtil::ToString(*literal);
+  string result = literal->ToString();
   const string expected = R"(f32[1,2,3,2] {
   {  // i0=0
     {  // i1=0
@@ -208,7 +207,7 @@ TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) {
 TEST_F(LiteralUtilTest, LiteralR4F32Stringifies) {
   EXPECT_THAT(literal_r4_2x2x3x3_dim0major_->shape().dimensions(),
               ElementsAre(2, 2, 3, 3));
-  string result = LiteralUtil::ToString(*literal_r4_2x2x3x3_dim0major_);
+  string result = literal_r4_2x2x3x3_dim0major_->ToString();
   const string expected = R"(f32[2,2,3,3] {
   {  // i0=0
     {  // i1=0
@@ -240,14 +239,13 @@ TEST_F(LiteralUtilTest, LiteralR4F32Stringifies) {
 
 TEST_F(LiteralUtilTest, EachCellR2F32) {
   // clang-format off
-  auto literal = LiteralUtil::CreateR2<float>({
+  auto literal = Literal::CreateR2<float>({
     {3.1f, 4.2f},
     {9.3f, 12.4f},
   });
   // clang-format on
   std::vector<std::tuple<int64, int64, string>> seen;
-  LiteralUtil::EachCellAsString(
-      *literal,
+  literal->EachCellAsString(
       [&seen](tensorflow::gtl::ArraySlice<int64> indices, const string& value) {
         seen.emplace_back(indices[0], indices[1], value);
       });
@@ -259,176 +257,171 @@ TEST_F(LiteralUtilTest, EachCellR2F32) {
 }
 
 TEST_F(LiteralUtilTest, ScalarEquality) {
-  // Test LiteralUtil::Equal with scalars.
-  auto f32_42 = LiteralUtil::CreateR0<float>(42.0);
-  auto f32_42_clone = LiteralUtil::CreateR0<float>(42.0);
+  // Test Literal::Equal with scalars.
+  auto f32_42 = Literal::CreateR0<float>(42.0);
+  auto f32_42_clone = Literal::CreateR0<float>(42.0);
 
-  EXPECT_TRUE(LiteralUtil::Equal(*f32_42, *f32_42));
-  EXPECT_TRUE(LiteralUtil::Equal(*f32_42, *f32_42_clone));
+  EXPECT_TRUE(f32_42->Equal(*f32_42));
+  EXPECT_TRUE(f32_42->Equal(*f32_42_clone));
 
-  auto f32_123 = LiteralUtil::CreateR0<float>(123.0);
-  EXPECT_FALSE(LiteralUtil::Equal(*f32_42, *f32_123));
+  auto f32_123 = Literal::CreateR0<float>(123.0);
+  EXPECT_FALSE(f32_42->Equal(*f32_123));
 
-  auto f64_42 = LiteralUtil::CreateR0<double>(42.0);
-  EXPECT_FALSE(LiteralUtil::Equal(*f32_42, *f64_42));
+  auto f64_42 = Literal::CreateR0<double>(42.0);
+  EXPECT_FALSE(f32_42->Equal(*f64_42));
 }
 
 TEST_F(LiteralUtilTest, NonScalarEquality) {
-  // Test LiteralUtil::Equal with nonscalars.
-  auto matrix = LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
-  auto matrix_clone = LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
-  auto matrix_different =
-      LiteralUtil::CreateR2<float>({{4.0, 3.0}, {1.0, 2.0}});
-  auto vector_literal = LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0, 4.0});
-  auto scalar = LiteralUtil::CreateR0<float>(1.0);
-
-  EXPECT_TRUE(LiteralUtil::Equal(*matrix, *matrix));
-  EXPECT_TRUE(LiteralUtil::Equal(*matrix, *matrix_clone));
-  EXPECT_FALSE(LiteralUtil::Equal(*matrix, *matrix_different));
-  EXPECT_FALSE(LiteralUtil::Equal(*matrix, *vector_literal));
-  EXPECT_FALSE(LiteralUtil::Equal(*matrix, *scalar));
+  // Test Literal::Equal with nonscalars.
+  auto matrix = Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
+  auto matrix_clone = Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
+  auto matrix_different = Literal::CreateR2<float>({{4.0, 3.0}, {1.0, 2.0}});
+  auto vector_literal = Literal::CreateR1<float>({1.0, 2.0, 3.0, 4.0});
+  auto scalar = Literal::CreateR0<float>(1.0);
+
+  EXPECT_TRUE(matrix->Equal(*matrix));
+  EXPECT_TRUE(matrix->Equal(*matrix_clone));
+  EXPECT_FALSE(matrix->Equal(*matrix_different));
+  EXPECT_FALSE(matrix->Equal(*vector_literal));
+  EXPECT_FALSE(matrix->Equal(*scalar));
 }
 
 TEST_F(LiteralUtilTest, DifferentLayoutEquality) {
-  // Test LiteralUtil::Equal with literals which have different layouts.
+  // Test Literal::Equal with literals which have different layouts.
   auto colmajor = MakeUnique<Literal>();
   *colmajor->mutable_shape() = ShapeUtil::MakeShape(F32, {2, 2});
   *colmajor->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
-  LiteralUtil::Reserve(4, colmajor.get());
-  LiteralUtil::Set<float>(colmajor.get(), {0, 0}, 1.0);
-  LiteralUtil::Set<float>(colmajor.get(), {0, 1}, 2.0);
-  LiteralUtil::Set<float>(colmajor.get(), {1, 0}, 3.0);
-  LiteralUtil::Set<float>(colmajor.get(), {1, 1}, 4.0);
+  colmajor->Reserve(4);
+  colmajor->Set<float>({0, 0}, 1.0);
+  colmajor->Set<float>({0, 1}, 2.0);
+  colmajor->Set<float>({1, 0}, 3.0);
+  colmajor->Set<float>({1, 1}, 4.0);
 
   auto rowmajor = MakeUnique<Literal>();
   *rowmajor->mutable_shape() = ShapeUtil::MakeShape(F32, {2, 2});
   *rowmajor->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({1, 0});
-  LiteralUtil::Reserve(4, rowmajor.get());
-  LiteralUtil::Set<float>(rowmajor.get(), {0, 0}, 1.0);
-  LiteralUtil::Set<float>(rowmajor.get(), {0, 1}, 2.0);
-  LiteralUtil::Set<float>(rowmajor.get(), {1, 0}, 3.0);
-  LiteralUtil::Set<float>(rowmajor.get(), {1, 1}, 4.0);
+  rowmajor->Reserve(4);
+  rowmajor->Set<float>({0, 0}, 1.0);
+  rowmajor->Set<float>({0, 1}, 2.0);
+  rowmajor->Set<float>({1, 0}, 3.0);
+  rowmajor->Set<float>({1, 1}, 4.0);
 
-  EXPECT_TRUE(LiteralUtil::Equal(*rowmajor, *colmajor));
+  EXPECT_TRUE(rowmajor->Equal(*colmajor));
 }
 
 TEST_F(LiteralUtilTest, TupleEquality) {
-  // Test LiteralUtil::Equal with tuples.
-  auto scalar = LiteralUtil::CreateR0<float>(1.0);
-  auto matrix = LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
-  auto tuple1 = LiteralUtil::MakeTuple({scalar.get(), matrix.get()});
+  // Test Literal::Equal with tuples.
+  auto scalar = Literal::CreateR0<float>(1.0);
+  auto matrix = Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
+  auto tuple1 = Literal::MakeTuple({scalar.get(), matrix.get()});
 
   // Tuple with the same elements. One element is shared with the original
   // tuple, the other is a clone of the element in the original tuple.
-  auto scalar_clone = LiteralUtil::CreateR0<float>(1.0);
-  auto tuple2 = LiteralUtil::MakeTuple({scalar_clone.get(), matrix.get()});
-  EXPECT_TRUE(LiteralUtil::Equal(*tuple1, *tuple2));
+  auto scalar_clone = Literal::CreateR0<float>(1.0);
+  auto tuple2 = Literal::MakeTuple({scalar_clone.get(), matrix.get()});
+  EXPECT_TRUE(tuple1->Equal(*tuple2));
 
   // Tuple with elements reversed.
-  auto reversed_tuple = LiteralUtil::MakeTuple({matrix.get(), scalar.get()});
-  EXPECT_FALSE(LiteralUtil::Equal(*tuple1, *reversed_tuple));
+  auto reversed_tuple = Literal::MakeTuple({matrix.get(), scalar.get()});
+  EXPECT_FALSE(tuple1->Equal(*reversed_tuple));
 
   // Tuple with different value.
-  auto scalar_42 = LiteralUtil::CreateR0<float>(42.0);
-  auto different_tuple =
-      LiteralUtil::MakeTuple({scalar_42.get(), matrix.get()});
-  EXPECT_FALSE(LiteralUtil::Equal(*tuple1, *different_tuple));
+  auto scalar_42 = Literal::CreateR0<float>(42.0);
+  auto different_tuple = Literal::MakeTuple({scalar_42.get(), matrix.get()});
+  EXPECT_FALSE(tuple1->Equal(*different_tuple));
 }
 
 TEST_F(LiteralUtilTest, IsAllTuple) {
-  auto element1 = LiteralUtil::CreateR0<float>(0.0);
-  auto element2 = LiteralUtil::CreateR2<float>({{0.0, 0.0}, {0.0, 0.0}});
-  auto tuple = LiteralUtil::MakeTuple({element1.get(), element1.get()});
+  auto element1 = Literal::CreateR0<float>(0.0);
+  auto element2 = Literal::CreateR2<float>({{0.0, 0.0}, {0.0, 0.0}});
+  auto tuple = Literal::MakeTuple({element1.get(), element1.get()});
 
   // Tuples should always return false for IsAll.
-  EXPECT_FALSE(LiteralUtil::IsAll(*tuple, 0));
-  EXPECT_FALSE(LiteralUtil::IsAll(*tuple, 1));
+  EXPECT_FALSE(tuple->IsAll(0));
+  EXPECT_FALSE(tuple->IsAll(1));
+}
+
+// Verifies that CreateFromShape works for tuples.
+TEST_F(LiteralUtilTest, CreateFromShapeTuple) {
+  auto scalar = Literal::CreateR0<float>(0.0);
+  auto matrix = Literal::CreateR2<int32>({{0, 0}, {0, 0}});
+  auto tuple = Literal::MakeTuple({scalar.get(), matrix.get()});
+
+  auto x = Literal::CreateFromShape(tuple->shape());
+  EXPECT_TRUE(tuple->Equal(*x));
 }
 
 TEST_F(LiteralUtilTest, IsAll) {
-  EXPECT_TRUE(LiteralUtil::IsAll(*LiteralUtil::CreateR0<bool>(false), 0));
-  EXPECT_TRUE(LiteralUtil::IsAll(*LiteralUtil::CreateR0<bool>(true), 1));
-  EXPECT_FALSE(LiteralUtil::IsAll(*LiteralUtil::CreateR0<bool>(false), 1));
-  EXPECT_FALSE(LiteralUtil::IsAll(*LiteralUtil::CreateR0<bool>(false), 2));
-  EXPECT_FALSE(LiteralUtil::IsAll(*LiteralUtil::CreateR0<bool>(true), 0));
-  EXPECT_FALSE(LiteralUtil::IsAll(*LiteralUtil::CreateR0<bool>(true), 2));
-  EXPECT_FALSE(LiteralUtil::IsAll(*LiteralUtil::CreateR0<bool>(true), -1));
+  EXPECT_TRUE(Literal::CreateR0<bool>(false)->IsAll(0));
+  EXPECT_TRUE(Literal::CreateR0<bool>(true)->IsAll(1));
+  EXPECT_FALSE(Literal::CreateR0<bool>(false)->IsAll(1));
+  EXPECT_FALSE(Literal::CreateR0<bool>(false)->IsAll(2));
+  EXPECT_FALSE(Literal::CreateR0<bool>(true)->IsAll(0));
+  EXPECT_FALSE(Literal::CreateR0<bool>(true)->IsAll(2));
+  EXPECT_FALSE(Literal::CreateR0<bool>(true)->IsAll(-1));
 
   // We shouldn't reinterpret int8_min as an unsigned type and then decide that
   // it is equal to 255.
   auto int8_min = std::numeric_limits<int8>::min();
-  EXPECT_FALSE(
-      LiteralUtil::IsAll(*LiteralUtil::CreateR0<uint8>(255), int8_min));
+  EXPECT_FALSE(Literal::CreateR0<uint8>(255)->IsAll(int8_min));
 
-  EXPECT_TRUE(LiteralUtil::IsAll(*LiteralUtil::CreateR0<float>(42.0), 42));
-  EXPECT_FALSE(LiteralUtil::IsAll(*LiteralUtil::CreateR0<float>(42.0001), 42));
+  EXPECT_TRUE(Literal::CreateR0<float>(42.0)->IsAll(42));
+  EXPECT_FALSE(Literal::CreateR0<float>(42.0001)->IsAll(42));
 
-  EXPECT_TRUE(
-      LiteralUtil::IsAll(*LiteralUtil::CreateR1<int>({100, 100, 100}), 100));
-  EXPECT_FALSE(LiteralUtil::IsAll(
-      *LiteralUtil::CreateR1<double>({100, 100, 100.001}), 100));
+  EXPECT_TRUE(Literal::CreateR1<int>({100, 100, 100})->IsAll(100));
+  EXPECT_FALSE(Literal::CreateR1<double>({100, 100, 100.001})->IsAll(100));
 
-  EXPECT_TRUE(
-      LiteralUtil::IsAll(*LiteralUtil::CreateR2<uint64>({{8, 8}, {8, 8}}), 8));
-  EXPECT_FALSE(
-      LiteralUtil::IsAll(*LiteralUtil::CreateR2<uint64>({{8, 8}, {8, 9}}), 8));
-  EXPECT_FALSE(
-      LiteralUtil::IsAll(*LiteralUtil::CreateR2<uint64>({{9, 8}, {8, 8}}), 8));
+  EXPECT_TRUE(Literal::CreateR2<uint64>({{8, 8}, {8, 8}})->IsAll(8));
+  EXPECT_FALSE(Literal::CreateR2<uint64>({{8, 8}, {8, 9}})->IsAll(8));
+  EXPECT_FALSE(Literal::CreateR2<uint64>({{9, 8}, {8, 8}})->IsAll(8));
 
   half h8(8.0f);
   half h9(9.0f);
-  EXPECT_TRUE(
-      LiteralUtil::IsAll(*LiteralUtil::CreateR2<half>({{h8}, {h8}}), 8));
-  EXPECT_FALSE(
-      LiteralUtil::IsAll(*LiteralUtil::CreateR2<half>({{h8}, {h9}}), 8));
-  EXPECT_FALSE(
-      LiteralUtil::IsAll(*LiteralUtil::CreateR2<half>({{h9}, {h8}}), 8));
+  EXPECT_TRUE(Literal::CreateR2<half>({{h8}, {h8}})->IsAll(8));
+  EXPECT_FALSE(Literal::CreateR2<half>({{h8}, {h9}})->IsAll(8));
+  EXPECT_FALSE(Literal::CreateR2<half>({{h9}, {h8}})->IsAll(8));
 
   auto uint64_max = std::numeric_limits<uint64>::max();
-  EXPECT_FALSE(LiteralUtil::IsAll(
-      *LiteralUtil::CreateR2<uint64>(
-          {{uint64_max, uint64_max}, {uint64_max, uint64_max}}),
-      -1));
+  EXPECT_FALSE(Literal::CreateR2<uint64>(
+                   {{uint64_max, uint64_max}, {uint64_max, uint64_max}})
+                   ->IsAll(-1));
 }
 
 TEST_F(LiteralUtilTest, IsAllFloat) {
   // IsAllFloat always returns false when the literal is not floating-point.
-  EXPECT_FALSE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<bool>(false), 0));
-  EXPECT_FALSE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<int8>(0), 0));
-  EXPECT_FALSE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<uint8>(0), 0));
-  EXPECT_FALSE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<int>(0), 0));
-
-  EXPECT_TRUE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<float>(0), 0));
-  EXPECT_TRUE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<float>(.5), .5));
-  EXPECT_TRUE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<float>(-.5), -.5));
+  EXPECT_FALSE(Literal::CreateR0<bool>(false)->IsAllFloat(0));
+  EXPECT_FALSE(Literal::CreateR0<int8>(0)->IsAllFloat(0));
+  EXPECT_FALSE(Literal::CreateR0<uint8>(0)->IsAllFloat(0));
+  EXPECT_FALSE(Literal::CreateR0<int>(0)->IsAllFloat(0));
+
+  EXPECT_TRUE(Literal::CreateR0<float>(0)->IsAllFloat(0));
+  EXPECT_TRUE(Literal::CreateR0<float>(.5)->IsAllFloat(.5));
+  EXPECT_TRUE(Literal::CreateR0<float>(-.5)->IsAllFloat(-.5));
+  EXPECT_FALSE(Literal::CreateR0<float>(-.5)->IsAllFloat(-.49));
   EXPECT_FALSE(
-      LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<float>(-.5), -.49));
-  EXPECT_FALSE(LiteralUtil::IsAllFloat(
-      *LiteralUtil::CreateR2<float>({{0, 0, 0}, {0, .1, 0}}), 0));
-  EXPECT_TRUE(LiteralUtil::IsAllFloat(
-      *LiteralUtil::CreateR2<float>({{.5, .5, .5}, {.5, .5, .5}}), .5));
-
-  EXPECT_TRUE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<double>(0), 0));
-  EXPECT_TRUE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<double>(.5), .5));
+      Literal::CreateR2<float>({{0, 0, 0}, {0, .1, 0}})->IsAllFloat(0));
   EXPECT_TRUE(
-      LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<double>(-.5), -.5));
+      Literal::CreateR2<float>({{.5, .5, .5}, {.5, .5, .5}})->IsAllFloat(.5));
+
+  EXPECT_TRUE(Literal::CreateR0<double>(0)->IsAllFloat(0));
+  EXPECT_TRUE(Literal::CreateR0<double>(.5)->IsAllFloat(.5));
+  EXPECT_TRUE(Literal::CreateR0<double>(-.5)->IsAllFloat(-.5));
+  EXPECT_FALSE(Literal::CreateR0<double>(-.5)->IsAllFloat(-.49));
   EXPECT_FALSE(
-      LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<double>(-.5), -.49));
-  EXPECT_FALSE(LiteralUtil::IsAllFloat(
-      *LiteralUtil::CreateR2<double>({{0, 0, 0}, {0, .1, 0}}), 0));
+      Literal::CreateR2<double>({{0, 0, 0}, {0, .1, 0}})->IsAllFloat(0));
 }
 
 TEST_F(LiteralUtilTest, IsZero) {
-  auto scalar_zero = LiteralUtil::CreateR0<float>(0.0f);
-  auto scalar_one = LiteralUtil::CreateR0<float>(1.0f);
-  EXPECT_TRUE(LiteralUtil::IsZero(*scalar_zero, {}));
-  EXPECT_FALSE(LiteralUtil::IsZero(*scalar_one, {}));
-
-  auto array = LiteralUtil::CreateR2<uint32>({{1, 2, 0, 3}, {1, 0, 1, 2}});
-  EXPECT_FALSE(LiteralUtil::IsZero(*array, {0, 1}));
-  EXPECT_TRUE(LiteralUtil::IsZero(*array, {0, 2}));
-  EXPECT_TRUE(LiteralUtil::IsZero(*array, {1, 1}));
-  EXPECT_FALSE(LiteralUtil::IsZero(*array, {1, 2}));
+  auto scalar_zero = Literal::CreateR0<float>(0.0f);
+  auto scalar_one = Literal::CreateR0<float>(1.0f);
+  EXPECT_TRUE(scalar_zero->IsZero({}));
+  EXPECT_FALSE(scalar_one->IsZero({}));
+
+  auto array = Literal::CreateR2<uint32>({{1, 2, 0, 3}, {1, 0, 1, 2}});
+  EXPECT_FALSE(array->IsZero({0, 1}));
+  EXPECT_TRUE(array->IsZero({0, 2}));
+  EXPECT_TRUE(array->IsZero({1, 1}));
+  EXPECT_FALSE(array->IsZero({1, 2}));
 }
 
 template <typename T>
@@ -440,127 +433,122 @@ TYPED_TEST_CASE(LiteralUtilTestTemplated, TestedTypes);
 TYPED_TEST(LiteralUtilTestTemplated, Relayout2x2) {
   // Make a non-integer for floating point types.
   TypeParam half = TypeParam(1) / TypeParam(2);
-  auto data = LiteralUtil::CreateR2<TypeParam>({{half, 2}, {3, 4}});
+  auto data = Literal::CreateR2<TypeParam>({{half, 2}, {3, 4}});
   const Layout layout01 = LayoutUtil::MakeLayout({0, 1});
   const Layout layout10 = LayoutUtil::MakeLayout({1, 0});
 
-  auto data01 = LiteralUtil::Relayout(*data, layout01);
+  auto data01 = data->Relayout(layout01);
   EXPECT_TRUE(LayoutUtil::Equal(data01->shape().layout(), layout01));
-  EXPECT_TRUE(LiteralUtil::Equal(*data, *data01));
+  EXPECT_TRUE(data->Equal(*data01));
 
-  auto data10 = LiteralUtil::Relayout(*data, layout10);
+  auto data10 = data->Relayout(layout10);
   EXPECT_TRUE(LayoutUtil::Equal(data10->shape().layout(), layout10));
-  EXPECT_TRUE(LiteralUtil::Equal(*data, *data10));
+  EXPECT_TRUE(data->Equal(*data10));
 }
 
 TEST_F(LiteralUtilTest, ReshapeR0) {
-  auto original = LiteralUtil::CreateR0<float>(1.7f);
-  auto reshape =
-      LiteralUtil::Reshape(*original, /*shape=*/{}).ConsumeValueOrDie();
-  EXPECT_TRUE(LiteralUtil::Equal(*original, *reshape));
+  auto original = Literal::CreateR0<float>(1.7f);
+  auto reshape = original->Reshape(/*shape=*/{}).ConsumeValueOrDie();
+  EXPECT_TRUE(original->Equal(*reshape));
 }
 
 TEST_F(LiteralUtilTest, ReshapeR4) {
   // clang-format off
   // F32[1x3x2x4]
-  auto original = LiteralUtil::CreateR4WithLayout<float>({{
+  auto original = Literal::CreateR4WithLayout<float>({{
      {{10, 11, 12, 13}, {14, 15, 16, 17}},
      {{18, 19, 20, 21}, {22, 23, 24, 25}},
      {{26, 27, 28, 29}, {30, 31, 32, 33}},
   }}, layout_r4_dim0major_);
   // F32[1x3x4x2]
-  auto expected = LiteralUtil::CreateR3WithLayout<float>({
+  auto expected = Literal::CreateR3WithLayout<float>({
     {{10, 11}, {12, 13}, {14, 15}, {16, 17}},
     {{18, 19}, {20, 21}, {22, 23}, {24, 25}},
     {{26, 27}, {28, 29}, {30, 31}, {32, 33}},
   }, layout_r3_dim0major_);
   // clang-format on
-  auto reshape = LiteralUtil::Reshape(*original, {3, 4, 2}).ConsumeValueOrDie();
+  auto reshape = original->Reshape({3, 4, 2}).ConsumeValueOrDie();
 
-  EXPECT_TRUE(LiteralUtil::Equal(*expected, *reshape));
+  EXPECT_TRUE(expected->Equal(*reshape));
 }
 
 TEST_F(LiteralUtilTest, ReshapeR4Dim0Minor) {
   // clang-format off
   // F32[1x3x2x4]
-  auto original = LiteralUtil::CreateR4WithLayout<float>({{
+  auto original = Literal::CreateR4WithLayout<float>({{
      {{10, 11, 12, 13}, {14, 15, 16, 17}},
      {{18, 19, 20, 21}, {22, 23, 24, 25}},
      {{26, 27, 28, 29}, {30, 31, 32, 33}},
   }}, layout_r4_dim0minor_);
   // F32[1x3x4x2]
-  auto expected = LiteralUtil::CreateR3WithLayout<float>({
+  auto expected = Literal::CreateR3WithLayout<float>({
     {{10, 11}, {12, 13}, {14, 15}, {16, 17}},
     {{18, 19}, {20, 21}, {22, 23}, {24, 25}},
     {{26, 27}, {28, 29}, {30, 31}, {32, 33}},
   }, layout_r3_dim0major_);
   // clang-format on
-  auto reshape = LiteralUtil::Reshape(*original, {3, 4, 2}).ConsumeValueOrDie();
+  auto reshape = original->Reshape({3, 4, 2}).ConsumeValueOrDie();
 
-  EXPECT_TRUE(LiteralUtil::Equal(*expected, *reshape));
+  EXPECT_TRUE(expected->Equal(*reshape));
 }
 
 TEST_F(LiteralUtilTest, TransposeR0) {
-  auto original = LiteralUtil::CreateR0<float>(1.7f);
-  auto reshape = LiteralUtil::Transpose(*original, /*permutation=*/{});
-  EXPECT_TRUE(LiteralUtil::Equal(*original, *reshape));
+  auto original = Literal::CreateR0<float>(1.7f);
+  auto reshape = original->Transpose(/*permutation=*/{});
+  EXPECT_TRUE(original->Equal(*reshape));
 }
 
 TEST_F(LiteralUtilTest, TransposeR4) {
   // clang-format off
   // F32[1x3x2x4]
-  auto original = LiteralUtil::CreateR4<float>({{
+  auto original = Literal::CreateR4<float>({{
      {{10, 11, 12, 13}, {14, 15, 16, 17}},
      {{18, 19, 20, 21}, {22, 23, 24, 25}},
      {{26, 27, 28, 29}, {30, 31, 32, 33}},
   }});
   // clang-format on
-  auto reshape =
-      LiteralUtil::Transpose(*original, /*permutation=*/{2, 3, 0, 1});
-
-  LiteralUtil::EachCell<float>(
-      *reshape, [&](tensorflow::gtl::ArraySlice<int64> indices, float value) {
-        EXPECT_EQ(value,
-                  LiteralUtil::Get<float>(*original, {indices[2], indices[3],
-                                                      indices[0], indices[1]}));
+  auto reshape = original->Transpose(/*permutation=*/{2, 3, 0, 1});
+
+  reshape->EachCell<float>(
+      [&](tensorflow::gtl::ArraySlice<int64> indices, float value) {
+        EXPECT_EQ(value, original->Get<float>(
+                             {indices[2], indices[3], indices[0], indices[1]}));
       });
 }
 
 TEST_F(LiteralUtilTest, TestR4RelayoutEquivalence) {
   // Tests that using Relayout on an array is equivalent to creating it in the
   // target layout in the first place.
-  auto dim0minor_relaid_to_dim0major = LiteralUtil::Relayout(
-      *literal_r4_2x2x3x3_dim0minor_, layout_r4_dim0major_);
-  EXPECT_TRUE(LiteralUtil::Equal(*literal_r4_2x2x3x3_dim0major_,
-                                 *dim0minor_relaid_to_dim0major));
-
-  auto dim0major_relaid_to_dim0minor = LiteralUtil::Relayout(
-      *literal_r4_2x2x3x3_dim0major_, layout_r4_dim0minor_);
-  EXPECT_TRUE(LiteralUtil::Equal(*literal_r4_2x2x3x3_dim0minor_,
-                                 *dim0major_relaid_to_dim0minor));
+  auto dim0minor_relaid_to_dim0major =
+      literal_r4_2x2x3x3_dim0minor_->Relayout(layout_r4_dim0major_);
+  EXPECT_TRUE(
+      literal_r4_2x2x3x3_dim0major_->Equal(*dim0minor_relaid_to_dim0major));
+
+  auto dim0major_relaid_to_dim0minor =
+      literal_r4_2x2x3x3_dim0major_->Relayout(layout_r4_dim0minor_);
+  EXPECT_TRUE(
+      literal_r4_2x2x3x3_dim0minor_->Equal(*dim0major_relaid_to_dim0minor));
 }
 
 TEST_F(LiteralUtilTest, TestR2LinearLayout) {
   // Test expected memory layout of R2 dim0-minor (column-major) literal.
-  auto mat_dim0minor = LiteralUtil::CreateR2WithLayout<int>(
-      {{1, 2, 3}, {4, 5, 6}}, layout_r2_dim0minor_);
+  auto mat_dim0minor = Literal::CreateR2WithLayout<int>({{1, 2, 3}, {4, 5, 6}},
+                                                        layout_r2_dim0minor_);
   EXPECT_EQ(mat_dim0minor->s32s_size(), 6);
   EXPECT_THAT(mat_dim0minor->s32s(), ElementsAre(1, 4, 2, 5, 3, 6));
 
   // Test expected memory layout when using Relayout to row major.
-  auto relaid_mat_to_dim0major =
-      LiteralUtil::Relayout(*mat_dim0minor, layout_r2_dim0major_);
+  auto relaid_mat_to_dim0major = mat_dim0minor->Relayout(layout_r2_dim0major_);
   EXPECT_THAT(relaid_mat_to_dim0major->s32s(), ElementsAre(1, 2, 3, 4, 5, 6));
 
   // Test expected memory layout of R2 created with dim0-major (row-major).
-  auto mat_dim0major = LiteralUtil::CreateR2WithLayout<int>(
-      {{1, 2, 3}, {4, 5, 6}}, layout_r2_dim0major_);
+  auto mat_dim0major = Literal::CreateR2WithLayout<int>({{1, 2, 3}, {4, 5, 6}},
+                                                        layout_r2_dim0major_);
   EXPECT_EQ(mat_dim0major->s32s_size(), 6);
   EXPECT_THAT(mat_dim0major->s32s(), ElementsAre(1, 2, 3, 4, 5, 6));
 
   // Test expected memory layout when using Relayout to column major.
-  auto relaid_mat_to_dim0minor =
-      LiteralUtil::Relayout(*mat_dim0major, layout_r2_dim0minor_);
+  auto relaid_mat_to_dim0minor = mat_dim0major->Relayout(layout_r2_dim0minor_);
   EXPECT_THAT(relaid_mat_to_dim0minor->s32s(), ElementsAre(1, 4, 2, 5, 3, 6));
 }
 
@@ -578,8 +566,8 @@ TEST_F(LiteralUtilTest, TestR3LinearLayout) {
             {10, 11, 12},
           },
       });  // clang-format on
-  auto lit_dim0minor = LiteralUtil::CreateR3FromArray3DWithLayout<int>(
-      arr3d, layout_r3_dim0minor_);
+  auto lit_dim0minor =
+      Literal::CreateR3FromArray3DWithLayout<int>(arr3d, layout_r3_dim0minor_);
 
   EXPECT_EQ(lit_dim0minor->s32s_size(), 12);
   std::vector<int> expected_dim0minor{1, 7, 4, 10, 2, 8, 5, 11, 3, 9, 6, 12};
@@ -587,122 +575,120 @@ TEST_F(LiteralUtilTest, TestR3LinearLayout) {
               testing::ElementsAreArray(expected_dim0minor));
 
   // Test expected memory layout when using Relayout to row major.
-  auto relaid_lit_to_dim0major =
-      LiteralUtil::Relayout(*lit_dim0minor, layout_r3_dim0major_);
+  auto relaid_lit_to_dim0major = lit_dim0minor->Relayout(layout_r3_dim0major_);
   std::vector<int> expected_dim0major{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
   EXPECT_THAT(relaid_lit_to_dim0major->s32s(),
               testing::ElementsAreArray(expected_dim0major));
 
   // Test expected memory layout of R3 created with dim0-major (row-major).
-  auto lit_dim0major = LiteralUtil::CreateR3FromArray3DWithLayout<int>(
-      arr3d, layout_r3_dim0major_);
+  auto lit_dim0major =
+      Literal::CreateR3FromArray3DWithLayout<int>(arr3d, layout_r3_dim0major_);
   EXPECT_EQ(lit_dim0major->s32s_size(), 12);
   EXPECT_THAT(lit_dim0major->s32s(),
               testing::ElementsAreArray(expected_dim0major));
 
   // Test expected memory layout when using Relayout to column major.
-  auto relaid_lit_to_dim0minor =
-      LiteralUtil::Relayout(*lit_dim0major, layout_r3_dim0minor_);
+  auto relaid_lit_to_dim0minor = lit_dim0major->Relayout(layout_r3_dim0minor_);
   EXPECT_THAT(relaid_lit_to_dim0minor->s32s(),
               testing::ElementsAreArray(expected_dim0minor));
 }
 
 TEST_F(LiteralUtilTest, SliceR0S32) {
-  auto input = LiteralUtil::CreateR0<int32>(1);
-  auto result = LiteralUtil::Slice(*input, {}, {});
-  EXPECT_TRUE(LiteralUtil::Equal(*input, *result));
+  auto input = Literal::CreateR0<int32>(1);
+  auto result = input->Slice({}, {});
+  EXPECT_TRUE(input->Equal(*result));
 }
 
 TEST_F(LiteralUtilTest, SliceR1F32) {
-  auto input = LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0, 4.0, 5.0});
-  auto result = LiteralUtil::Slice(*input, {3}, {4});
-  auto expected = LiteralUtil::CreateR1<float>({4.0});
-  EXPECT_TRUE(LiteralUtil::Equal(*expected, *result));
+  auto input = Literal::CreateR1<float>({1.0, 2.0, 3.0, 4.0, 5.0});
+  auto result = input->Slice({3}, {4});
+  auto expected = Literal::CreateR1<float>({4.0});
+  EXPECT_TRUE(expected->Equal(*result));
 }
 
 TEST_F(LiteralUtilTest, SliceR2U32) {
-  auto input_3x4 = LiteralUtil::CreateR2<uint32>(
-      {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}});
-  auto result = LiteralUtil::Slice(*input_3x4, {0, 2}, {2, 4});
-  auto expected = LiteralUtil::CreateR2<uint32>({{3, 4}, {7, 8}});
-  EXPECT_TRUE(LiteralUtil::Equal(*expected, *result));
+  auto input_3x4 =
+      Literal::CreateR2<uint32>({{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}});
+  auto result = input_3x4->Slice({0, 2}, {2, 4});
+  auto expected = Literal::CreateR2<uint32>({{3, 4}, {7, 8}});
+  EXPECT_TRUE(expected->Equal(*result));
 }
 
 TEST_F(LiteralUtilTest, SliceR3U32Full) {
-  auto input_2x3x2 = LiteralUtil::CreateR3<uint32>(
+  auto input_2x3x2 = Literal::CreateR3<uint32>(
       {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}});
-  auto result = LiteralUtil::Slice(*input_2x3x2, {0, 0, 0}, {2, 3, 2});
-  EXPECT_TRUE(LiteralUtil::Equal(*input_2x3x2, *result));
+  auto result = input_2x3x2->Slice({0, 0, 0}, {2, 3, 2});
+  EXPECT_TRUE(input_2x3x2->Equal(*result));
 }
 
 TEST_F(LiteralUtilTest, PopulateR1S64) {
   Literal output;
-  LiteralUtil::PopulateR1<int64>({77}, &output);
-  auto expected = LiteralUtil::CreateR1<int64>({77});
-  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
+  output.PopulateR1<int64>({77});
+  auto expected = Literal::CreateR1<int64>({77});
+  EXPECT_TRUE(output.Equal(*expected));
 }
 
 TEST_F(LiteralUtilTest, PopulateR2U64) {
   Literal output;
-  LiteralUtil::PopulateR1<uint64>({{77, 88}}, &output);
-  auto expected = LiteralUtil::CreateR1<uint64>({{77, 88}});
-  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
+  output.PopulateR1<uint64>({{77, 88}});
+  auto expected = Literal::CreateR1<uint64>({{77, 88}});
+  EXPECT_TRUE(output.Equal(*expected));
 }
 
 TEST_F(LiteralUtilTest, PopulateWithValueR0F32) {
   Literal output;
-  LiteralUtil::PopulateWithValue<float>(2.5f, {}, &output);
-  auto expected = LiteralUtil::CreateR0<float>(2.5f);
-  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
+  output.PopulateWithValue<float>(2.5f, {});
+  auto expected = Literal::CreateR0<float>(2.5f);
+  EXPECT_TRUE(output.Equal(*expected));
 }
 
 TEST_F(LiteralUtilTest, PopulateWithValueR1S64) {
   Literal output;
-  LiteralUtil::PopulateWithValue<int64>(-7, {3}, &output);
-  auto expected = LiteralUtil::CreateR1<int64>({-7, -7, -7});
-  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
+  output.PopulateWithValue<int64>(-7, {3});
+  auto expected = Literal::CreateR1<int64>({-7, -7, -7});
+  EXPECT_TRUE(output.Equal(*expected));
 }
 
 TEST_F(LiteralUtilTest, PopulateWithValueR2U64) {
   Literal output;
-  LiteralUtil::PopulateWithValue<uint64>(42, {2, 2}, &output);
-  auto expected = LiteralUtil::CreateR2<uint64>({{42, 42}, {42, 42}});
-  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
+  output.PopulateWithValue<uint64>(42, {2, 2});
+  auto expected = Literal::CreateR2<uint64>({{42, 42}, {42, 42}});
+  EXPECT_TRUE(output.Equal(*expected));
 }
 
 TEST_F(LiteralUtilTest, PopulateWithValueR0F16) {
   Literal output;
   half h(0.25f);
-  LiteralUtil::PopulateWithValue<half>(h, {}, &output);
-  auto expected = LiteralUtil::CreateR0<half>(h);
-  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
+  output.PopulateWithValue<half>(h, {});
+  auto expected = Literal::CreateR0<half>(h);
+  EXPECT_TRUE(output.Equal(*expected));
 }
 
 TEST_F(LiteralUtilTest, PopulateWithValueR1F16) {
   Literal output;
   half h(0.5f);
-  LiteralUtil::PopulateWithValue<half>(h, {3}, &output);
-  auto expected = LiteralUtil::CreateR1<half>({h, h, h});
-  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
+  output.PopulateWithValue<half>(h, {3});
+  auto expected = Literal::CreateR1<half>({h, h, h});
+  EXPECT_TRUE(output.Equal(*expected));
 }
 
 TEST_F(LiteralUtilTest, PopulateWithValueR2F16) {
   Literal output;
   half h(2.0f);
-  LiteralUtil::PopulateWithValue<half>(h, {2, 2}, &output);
-  auto expected = LiteralUtil::CreateR2<half>({{h, h}, {h, h}});
-  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
+  output.PopulateWithValue<half>(h, {2, 2});
+  auto expected = Literal::CreateR2<half>({{h, h}, {h, h}});
+  EXPECT_TRUE(output.Equal(*expected));
 }
 
 TEST_F(LiteralUtilTest, ReplicateR2U32) {
-  auto input = LiteralUtil::CreateR2<uint32>(
-      {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}});
-  auto output = LiteralUtil::Replicate<uint32>(*input, 3);
-  auto expected = LiteralUtil::CreateR3<uint32>(
+  auto input =
+      Literal::CreateR2<uint32>({{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}});
+  auto output = input->Replicate<uint32>(3);
+  auto expected = Literal::CreateR3<uint32>(
       {{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
        {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
        {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}});
-  EXPECT_TRUE(LiteralUtil::Equal(*output, *expected));
+  EXPECT_TRUE(output->Equal(*expected));
 }
 
 TEST_F(LiteralUtilTest, Copy) {
@@ -712,13 +698,13 @@ TEST_F(LiteralUtilTest, Copy) {
   for (const auto& layout : layouts) {
     Shape shape = ShapeUtil::MakeShapeWithLayout(
         primitive_util::NativeToPrimitiveType<uint32>(), dimensions, layout);
-    auto blank = LiteralUtil::CreateFromShape(shape);
-    auto source = LiteralUtil::CreateFromShape(shape);
+    auto blank = Literal::CreateFromShape(shape);
+    auto source = Literal::CreateFromShape(shape);
     const int64 zero_base[] = {0, 0, 0, 0};
     const int64 step[] = {1, 1, 1, 1};
     uint32 seqnr = 0;
     auto init_proc = [&](const std::vector<int64>& indexes) {
-      LiteralUtil::Set(source.get(), indexes, ++seqnr);
+      source->Set(indexes, ++seqnr);
       return true;
     };
 
@@ -729,8 +715,7 @@ TEST_F(LiteralUtilTest, Copy) {
     const int64 dest_base[] = {6, 4, 12, 2};
     const int64 copy_size[] = {7, 8, 11, 9};
 
-    TF_EXPECT_OK(LiteralUtil::Copy(*source, src_base, blank.get(), dest_base,
-                                   copy_size));
+    TF_EXPECT_OK(blank->Copy(*source, src_base, dest_base, copy_size));
     std::vector<int64> source_indexes(TF_ARRAYSIZE(dimensions), 0);
     std::vector<int64> blank_indexes(TF_ARRAYSIZE(dimensions), 0);
     bool matched = true;
@@ -741,9 +726,8 @@ TEST_F(LiteralUtilTest, Copy) {
       std::copy(indexes.begin(), indexes.end(), blank_indexes.begin());
       std::transform(blank_indexes.begin(), blank_indexes.end(), dest_base,
                      blank_indexes.begin(), std::plus<int64>());
-      auto bval = LiteralUtil::Get<uint32>(*blank, blank_indexes);
-      matched = (bval != 0 &&
-                 bval == LiteralUtil::Get<uint32>(*source, source_indexes));
+      auto bval = blank->Get<uint32>(blank_indexes);
+      matched = (bval != 0 && bval == source->Get<uint32>(source_indexes));
       return matched;
     };
     ShapeUtil::ForEachIndex(source->shape(), zero_base, copy_size, step,
@@ -753,25 +737,25 @@ TEST_F(LiteralUtilTest, Copy) {
 }
 
 TEST_F(LiteralUtilTest, CopyScalars) {
-  auto zero = LiteralUtil::CreateR0<uint32>(0);
-  auto nine = LiteralUtil::CreateR0<uint32>(9);
-  TF_EXPECT_OK(LiteralUtil::Copy(*nine, {}, zero.get(), {}, {}));
-  EXPECT_TRUE(LiteralUtil::Equal(*zero, *nine));
-
-  auto vect = LiteralUtil::CreateR1<uint32>({3, 4, 9, 12, 5, 17, 21});
-  TF_EXPECT_OK(LiteralUtil::Copy(*vect, {5}, zero.get(), {}, {}));
-  EXPECT_EQ(LiteralUtil::Get<uint32>(*zero, {}), 17);
-  TF_EXPECT_OK(LiteralUtil::Copy(*zero, {}, vect.get(), {4}, {}));
-  EXPECT_EQ(LiteralUtil::Get<uint32>(*vect, {4}), 17);
+  auto zero = Literal::CreateR0<uint32>(0);
+  auto nine = Literal::CreateR0<uint32>(9);
+  TF_EXPECT_OK(zero->Copy(*nine, {}, {}, {}));
+  EXPECT_TRUE(zero->Equal(*nine));
+
+  auto vect = Literal::CreateR1<uint32>({3, 4, 9, 12, 5, 17, 21});
+  TF_EXPECT_OK(zero->Copy(*vect, {5}, {}, {}));
+  EXPECT_EQ(zero->Get<uint32>({}), 17);
+  TF_EXPECT_OK(vect->Copy(*zero, {}, {4}, {}));
+  EXPECT_EQ(vect->Get<uint32>({4}), 17);
 }
 
 TEST_F(LiteralUtilTest, F16) {
   // Verify that the internal data views are consistent and that they
   // are in little endian format
   // TODO - modify if we make the data format machine endianess dependent
-  auto m1 = LiteralUtil::CreateFromShape(ShapeUtil::MakeShape(F16, {2, 2}));
+  auto m1 = Literal::CreateFromShape(ShapeUtil::MakeShape(F16, {2, 2}));
   Literal* l1 = m1.get();
-  const char* d1 = static_cast<const char*>(LiteralUtil::InternalData(*l1));
+  const char* d1 = static_cast<const char*>(l1->InternalData());
   EXPECT_EQ(d1[0], 0);
   EXPECT_EQ(d1[1], 0);
   EXPECT_EQ(d1[2], 0);
@@ -780,14 +764,13 @@ TEST_F(LiteralUtilTest, F16) {
   EXPECT_EQ(d1[5], 0);
   EXPECT_EQ(d1[6], 0);
   EXPECT_EQ(d1[7], 0);
-  EXPECT_EQ(LiteralUtil::InternalData(*l1),
-            LiteralUtil::MutableInternalData(l1));
+  EXPECT_EQ(l1->InternalData(), l1->MutableInternalData());
 
   half h1(1.0f);
   half h2(2.0f);
-  auto m2 = LiteralUtil::CreateR2<half>({{h1, h2}, {h2, h1}});
+  auto m2 = Literal::CreateR2<half>({{h1, h2}, {h2, h1}});
   Literal* l2 = m2.get();
-  const char* d2 = static_cast<const char*>(LiteralUtil::InternalData(*l2));
+  const char* d2 = static_cast<const char*>(l2->InternalData());
   EXPECT_EQ(d2[0], 0);
   EXPECT_EQ(d2[1], 0x3C);
   EXPECT_EQ(d2[2], 0);
@@ -796,8 +779,7 @@ TEST_F(LiteralUtilTest, F16) {
   EXPECT_EQ(d2[5], 0x40);
   EXPECT_EQ(d2[6], 0);
   EXPECT_EQ(d2[7], 0x3C);
-  EXPECT_EQ(LiteralUtil::InternalData(*l2),
-            LiteralUtil::MutableInternalData(l2));
+  EXPECT_EQ(l2->InternalData(), l2->MutableInternalData());
 }
 
 TEST_F(LiteralUtilTest, Populate) {
@@ -818,19 +800,19 @@ TEST_F(LiteralUtilTest, Populate) {
     Shape shape = ShapeUtil::MakeShapeWithLayout(
         primitive_util::NativeToPrimitiveType<uint32>(), data.dimensions,
         data.layout);
-    auto literal = LiteralUtil::CreateFromShape(shape);
+    auto literal = Literal::CreateFromShape(shape);
     auto generator = [&](tensorflow::gtl::ArraySlice<int64> indexes) -> uint32 {
       // Offsets from linear index just to avoid R0 literals to be initialized
       // with zero.
-      return LiteralUtil::LinearIndex(*literal, indexes) + 17;
+      return literal->LinearIndex(indexes) + 17;
     };
-    TF_EXPECT_OK(LiteralUtil::Populate<uint32>(literal.get(), generator));
+    TF_EXPECT_OK(literal->Populate<uint32>(generator));
 
     std::vector<int64> zero_base(data.dimensions.size(), 0);
     std::vector<int64> step(data.dimensions.size(), 1);
     bool matched = true;
     auto check_function = [&](const std::vector<int64>& indexes) {
-      auto value = LiteralUtil::Get<uint32>(*literal, indexes);
+      auto value = literal->Get<uint32>(indexes);
       matched = matched && (value == generator(indexes));
       return matched;
     };
@@ -842,65 +824,66 @@ TEST_F(LiteralUtilTest, Populate) {
 
 TEST_F(LiteralUtilTest, ConvertR4) {
   // clang-format off
-  auto original = LiteralUtil::CreateR4WithLayout<int8>({{
+  auto original = Literal::CreateR4WithLayout<int8>({{
      {{10, 11, 12, 13}, {14, 15, 16, 17}},
      {{18, 19, 20, 21}, {22, 23, 24, 25}},
      {{26, 27, 28, 29}, {30, 31, 32, 33}},
   }}, layout_r4_dim0major_);
-  auto expected = LiteralUtil::CreateR4WithLayout<uint32>({{
+  auto expected = Literal::CreateR4WithLayout<uint32>({{
      {{10, 11, 12, 13}, {14, 15, 16, 17}},
      {{18, 19, 20, 21}, {22, 23, 24, 25}},
      {{26, 27, 28, 29}, {30, 31, 32, 33}},
   }}, layout_r4_dim0major_);
   // clang-format on
-  auto converted = LiteralUtil::Convert<int8, uint32>(*original);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> converted,
+                          original->Convert(U32));
 
-  EXPECT_TRUE(LiteralUtil::Equal(*expected, *converted));
+  EXPECT_TRUE(expected->Equal(*converted));
 }
 
 TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
   // clang-format off
-  auto s8 = LiteralUtil::CreateR4WithLayout<int8>({{
+  auto s8 = Literal::CreateR4WithLayout<int8>({{
     {{10, 0, 12, 0}, {0, 15, 0, 17}},
     {{0, 19, 0, 21}, {22, 0, 24, 0}},
     {{26, 0, 28, 0}, {0, 31, 0, 33}},
   }}, layout_r4_dim0major_);
-  auto s32 = LiteralUtil::CreateR4WithLayout<int32>({{
+  auto s32 = Literal::CreateR4WithLayout<int32>({{
     {{10, 0, 12, 0}, {0, 15, 0, 17}},
     {{0, 19, 0, 21}, {22, 0, 24, 0}},
     {{26, 0, 28, 0}, {0, 31, 0, 33}},
   }}, layout_r4_dim0major_);
-  auto u32 = LiteralUtil::CreateR4WithLayout<uint32>({{
+  auto u32 = Literal::CreateR4WithLayout<uint32>({{
     {{10, 0, 12, 0}, {0, 15, 0, 17}},
     {{0, 19, 0, 21}, {22, 0, 24, 0}},
     {{26, 0, 28, 0}, {0, 31, 0, 33}},
   }}, layout_r4_dim0major_);
-  auto s64 = LiteralUtil::CreateR4WithLayout<int64>({{
+  auto s64 = Literal::CreateR4WithLayout<int64>({{
     {{10, 0, 12, 0}, {0, 15, 0, 17}},
     {{0, 19, 0, 21}, {22, 0, 24, 0}},
     {{26, 0, 28, 0}, {0, 31, 0, 33}},
   }}, layout_r4_dim0major_);
-  auto u64 = LiteralUtil::CreateR4WithLayout<uint64>({{
+  auto u64 = Literal::CreateR4WithLayout<uint64>({{
     {{10, 0, 12, 0}, {0, 15, 0, 17}},
     {{0, 19, 0, 21}, {22, 0, 24, 0}},
     {{26, 0, 28, 0}, {0, 31, 0, 33}},
   }}, layout_r4_dim0major_);
-  auto pred = LiteralUtil::CreateR4WithLayout<bool>({{
+  auto pred = Literal::CreateR4WithLayout<bool>({{
     {{true, false, true, false}, {false, true, false, true}},
     {{false, true, false, true}, {true, false, true, false}},
     {{true, false, true, false}, {false, true, false, true}},
   }}, layout_r4_dim0major_);
-  auto int32_pred = LiteralUtil::CreateR4WithLayout<int32>({{
+  auto int32_pred = Literal::CreateR4WithLayout<int32>({{
     {{1, 0, 1, 0}, {0, 1, 0, 1}},
     {{0, 1, 0, 1}, {1, 0, 1, 0}},
     {{1, 0, 1, 0}, {0, 1, 0, 1}},
   }}, layout_r4_dim0major_);
-  auto f32 = LiteralUtil::CreateR4WithLayout<float>({{
+  auto f32 = Literal::CreateR4WithLayout<float>({{
     {{10.0f, 0.0f, 12.0f, 0.0f}, {0.0f, 15.0f, 0.0f, 17.0f}},
     {{0.0f, 19.0f, 0.0f, 21.0f}, {22.0f, 0.0f, 24.0f, 0.0f}},
     {{26.0f, 0.0f, 28.0f, 0.0f}, {0.0f, 31.0f, 0.0f, 33.0f}},
   }}, layout_r4_dim0major_);
-  auto f64 = LiteralUtil::CreateR4WithLayout<double>({{
+  auto f64 = Literal::CreateR4WithLayout<double>({{
     {{10.0, 0.0, 12.0, 0.0}, {0.0, 15.0, 0.0, 17.0}},
     {{0.0, 19.0, 0.0, 21.0}, {22.0, 0.0, 24.0, 0.0}},
     {{26.0, 0.0, 28.0, 0.0}, {0.0, 31.0, 0.0, 33.0}},
@@ -908,40 +891,40 @@ TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
   // clang-format on
   std::unique_ptr<Literal> conv;
 
-  conv = LiteralUtil::ConvertIfSrcTypeMatches(*s8, U32).ConsumeValueOrDie();
-  EXPECT_TRUE(LiteralUtil::Equal(*conv, *u32));
+  conv = s8->Convert(U32).ConsumeValueOrDie();
+  EXPECT_TRUE(conv->Equal(*u32));
 
-  conv = LiteralUtil::ConvertIfSrcTypeMatches(*s8, S32).ConsumeValueOrDie();
-  EXPECT_TRUE(LiteralUtil::Equal(*conv, *s32));
+  conv = s8->Convert(S32).ConsumeValueOrDie();
+  EXPECT_TRUE(conv->Equal(*s32));
 
-  conv = LiteralUtil::ConvertIfSrcTypeMatches(*s8, U64).ConsumeValueOrDie();
-  EXPECT_TRUE(LiteralUtil::Equal(*conv, *u64));
+  conv = s8->Convert(U64).ConsumeValueOrDie();
+  EXPECT_TRUE(conv->Equal(*u64));
 
-  conv = LiteralUtil::ConvertIfSrcTypeMatches(*s8, S64).ConsumeValueOrDie();
-  EXPECT_TRUE(LiteralUtil::Equal(*conv, *s64));
+  conv = s8->Convert(S64).ConsumeValueOrDie();
+  EXPECT_TRUE(conv->Equal(*s64));
 
-  conv = LiteralUtil::ConvertIfSrcTypeMatches(*s8, PRED).ConsumeValueOrDie();
-  EXPECT_TRUE(LiteralUtil::Equal(*conv, *pred));
+  conv = s8->Convert(PRED).ConsumeValueOrDie();
+  EXPECT_TRUE(conv->Equal(*pred));
 
-  conv = LiteralUtil::ConvertIfSrcTypeMatches(*pred, S32).ConsumeValueOrDie();
-  EXPECT_TRUE(LiteralUtil::Equal(*conv, *int32_pred));
+  conv = pred->Convert(S32).ConsumeValueOrDie();
+  EXPECT_TRUE(conv->Equal(*int32_pred));
 
-  conv = LiteralUtil::ConvertIfSrcTypeMatches(*f32, S32).ConsumeValueOrDie();
-  EXPECT_TRUE(LiteralUtil::Equal(*conv, *s32));
+  conv = f32->Convert(S32).ConsumeValueOrDie();
+  EXPECT_TRUE(conv->Equal(*s32));
 
-  conv = LiteralUtil::ConvertIfSrcTypeMatches(*f64, S32).ConsumeValueOrDie();
-  EXPECT_TRUE(LiteralUtil::Equal(*conv, *s32));
+  conv = f64->Convert(S32).ConsumeValueOrDie();
+  EXPECT_TRUE(conv->Equal(*s32));
 
-  conv = LiteralUtil::ConvertIfSrcTypeMatches(*s32, F32).ConsumeValueOrDie();
-  EXPECT_TRUE(LiteralUtil::Equal(*conv, *f32));
+  conv = s32->Convert(F32).ConsumeValueOrDie();
+  EXPECT_TRUE(conv->Equal(*f32));
 
-  EXPECT_EQ(LiteralUtil::ConvertIfSrcTypeMatches(*s32, TUPLE).status().code(),
+  EXPECT_EQ(s32->Convert(TUPLE).status().code(),
             tensorflow::error::INVALID_ARGUMENT);
-  EXPECT_EQ(LiteralUtil::ConvertIfSrcTypeMatches(*s32, F16).status().code(),
+  EXPECT_EQ(s32->Convert(F16).status().code(),
             tensorflow::error::INVALID_ARGUMENT);
-  EXPECT_EQ(LiteralUtil::ConvertIfSrcTypeMatches(*s32, S16).status().code(),
+  EXPECT_EQ(s32->Convert(S16).status().code(),
             tensorflow::error::INVALID_ARGUMENT);
-  EXPECT_EQ(LiteralUtil::ConvertIfSrcTypeMatches(*s32, U16).status().code(),
+  EXPECT_EQ(s32->Convert(U16).status().code(),
             tensorflow::error::INVALID_ARGUMENT);
 }
 
@@ -996,9 +979,7 @@ TEST_F(LiteralUtilTest, CopyFromProto_f16) {
   half h1(1.0f);
   half h2(2.0f);
 
-  const char half_vals[8] = {
-    0x00, 0x3C, 0x00, 0x40, 0x00, 0x40, 0x00, 0x3C
-  };
+  const char half_vals[8] = {0x00, 0x3C, 0x00, 0x40, 0x00, 0x40, 0x00, 0x3C};
   LiteralProto p;
   p.mutable_shape()->set_element_type(F16);
   p.mutable_shape()->clear_dimensions();
@@ -1006,7 +987,6 @@ TEST_F(LiteralUtilTest, CopyFromProto_f16) {
   p.clear_f16s();
   p.set_f16s(half_vals, 8);
 
-
   Literal literal(p);
   ASSERT_EQ(4, literal.f16s_size());
   ASSERT_EQ(h1, literal.f16s(0));
@@ -1022,6 +1002,5 @@ TEST_F(LiteralUtilTest, CopyFromProto_f16) {
   ASSERT_EQ(h1, r[3]);
 }
 
-
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/packed_literal_reader.cc b/tensorflow/compiler/xla/packed_literal_reader.cc
index d488830a6cd7b07ccb8de237121ab0693bd73a0f..70e0f5a74711c8ceef1b6d4225141aa1cc9c6219 100644
--- a/tensorflow/compiler/xla/packed_literal_reader.cc
+++ b/tensorflow/compiler/xla/packed_literal_reader.cc
@@ -58,8 +58,7 @@ StatusOr<std::unique_ptr<Literal>> PackedLiteralReader::Read(
   }
 
   int64 elements = ShapeUtil::ElementsIn(shape);
-  LiteralUtil::Resize(elements, std::numeric_limits<float>::quiet_NaN(),
-                      result.get());
+  result->Resize(elements, std::numeric_limits<float>::quiet_NaN());
   std::vector<float>* field = result->mutable_f32s();
   char* data = tensorflow::bit_cast<char*>(field->data());
   uint64 bytes = elements * sizeof(float);
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index e8de559a5ef9e69864abab21c99887d40cfd378a..7ef5c6d916f52f89a58e107c9526ee312f7369d3 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -86,6 +86,53 @@ namespace xla {
   return result;
 }
 
+/*  static */ std::unique_ptr<Array3D<float>> ReferenceUtil::ConvArray3D(
+    const Array3D<float>& lhs, const Array3D<float>& rhs, int64 kernel_stride,
+    Padding padding) {
+  return ConvArray3DGeneralDimensionsDilated(
+      lhs, rhs, kernel_stride, padding, 1, 1,
+      ComputationBuilder::CreateDefaultConvDimensionNumbers(1));
+}
+
+/*static*/ std::unique_ptr<Array3D<float>>
+ReferenceUtil::ConvArray3DGeneralDimensionsDilated(
+    const Array3D<float>& lhs, const Array3D<float>& rhs, int64 kernel_stride,
+    Padding padding, int64 lhs_dilation, int64 rhs_dilation,
+    const ConvolutionDimensionNumbers& dnums) {
+  CHECK_EQ(dnums.spatial_dimensions_size(), 1);
+  // Reuse the code for Array4D-convolution by extending the 3D input into a 4D
+  // array by adding a fourth dummy dimension of size 1 without stride, padding
+  // and dilation.
+  Array4D<float> a4dlhs(lhs.n1(), lhs.n2(), lhs.n3(), 1);
+  a4dlhs.Each(
+      [&](tensorflow::gtl::ArraySlice<int64> indices, float* value_ptr) {
+        CHECK_EQ(indices[3], 0);
+        *value_ptr = lhs.operator()(indices[0], indices[1], indices[2]);
+      });
+  Array4D<float> a4drhs(rhs.n1(), rhs.n2(), rhs.n3(), 1);
+  a4drhs.Each(
+      [&](tensorflow::gtl::ArraySlice<int64> indices, float* value_ptr) {
+        CHECK_EQ(indices[3], 0);
+        *value_ptr = rhs.operator()(indices[0], indices[1], indices[2]);
+      });
+  // Add a second dummy spatial dimensions.
+  ConvolutionDimensionNumbers dnums2d = dnums;
+  dnums2d.add_spatial_dimensions(3);
+  dnums2d.add_kernel_spatial_dimensions(3);
+  std::unique_ptr<Array4D<float>> convr4 = ConvArray4DGeneralDimensionsDilated(
+      a4dlhs, a4drhs, {kernel_stride, 1}, padding, {lhs_dilation, 1},
+      {rhs_dilation, 1}, dnums2d);
+
+  auto convr3 = MakeUnique<Array3D<float>>(convr4->planes(), convr4->depth(),
+                                           convr4->height());
+  convr4->Each(
+      [&](tensorflow::gtl::ArraySlice<int64> indices, float* value_ptr) {
+        CHECK_EQ(indices[3], 0);
+        convr3->operator()(indices[0], indices[1], indices[2]) = *value_ptr;
+      });
+  return convr3;
+}
+
 /* static */ std::unique_ptr<Array4D<float>> ReferenceUtil::ConvArray4D(
     const Array4D<float>& lhs, const Array4D<float>& rhs,
     std::pair<int64, int64> kernel_stride, Padding padding) {
@@ -135,6 +182,49 @@ ReferenceUtil::SeparableConvArray4D(const Array4D<float>& input,
   return tensorflow::MathUtil::CeilOfRatio(unpadded_width, stride);
 }
 
+/* static  */ std::unique_ptr<std::vector<float>>
+ReferenceUtil::ReduceWindow1DGeneric(
+    const tensorflow::gtl::ArraySlice<float>& operand, float init,
+    const std::function<float(float, float)>& reduce_func,
+    const tensorflow::gtl::ArraySlice<int64>& window,
+    const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding) {
+  std::vector<int64> dim_lengths{static_cast<int64>(operand.size())};
+  auto padding_both = xla::MakePadding(dim_lengths, window, stride, padding);
+
+  std::vector<int64> window_counts(window.size(), 0);
+  std::vector<int64> pad_low(window.size(), 0);
+  for (int64 i = 0; i < window.size(); ++i) {
+    window_counts[i] =
+        WindowCount(dim_lengths[i], window[i], stride[i], padding);
+    pad_low[i] = padding_both[i].first;
+  }
+  auto result = MakeUnique<std::vector<float>>(window_counts[0]);
+
+  // Do a full 1D reduce window.
+  for (int64 i0 = 0; i0 < window_counts[0]; ++i0) {
+    int64 i0_base = i0 * stride[0] - pad_low[0];
+
+    float val = init;
+    for (int64 i0_win = 0; i0_win < window[0]; ++i0_win) {
+      if (i0_base + i0_win >= 0 && i0_base + i0_win < dim_lengths[0]) {
+        val = reduce_func(val, operand[i0_base + i0_win]);
+      }
+    }
+    (*result)[i0] = val;
+  }
+  return result;
+}
+
+/* static  */ std::unique_ptr<std::vector<float>>
+ReferenceUtil::ReduceWindow1DAdd(
+    const tensorflow::gtl::ArraySlice<float>& operand, float init,
+    const tensorflow::gtl::ArraySlice<int64>& window,
+    const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding) {
+  const auto add_reduce = [](float arg1, float arg2) { return arg1 + arg2; };
+  return ReduceWindow1DGeneric(operand, init, add_reduce, window, stride,
+                               padding);
+}
+
 /* static  */ std::unique_ptr<Array2D<float>> ReferenceUtil::ReduceWindow2DAdd(
     const Array2D<float>& operand, float init,
     const tensorflow::gtl::ArraySlice<int64>& window,
@@ -252,6 +342,20 @@ ReferenceUtil::ReduceWindow4DGeneric(
                                padding);
 }
 
+/* static */ std::unique_ptr<Array4D<float>> ReferenceUtil::BatchNorm4D(
+    const Array4D<float>& input, const Array4D<float>& mean,
+    const Array4D<float>& var, const Array4D<float>& scale,
+    const Array4D<float>& offset, float epsilon) {
+  auto normalized =
+      *MapArray4D(input, mean, [](float a, float b) { return a - b; });
+  normalized = *MapArray4D(normalized, var, [&](float a, float b) {
+    return a / std::sqrt(b + epsilon);
+  });
+  normalized =
+      *MapArray4D(normalized, scale, [](float a, float b) { return a * b; });
+  return MapArray4D(normalized, offset, [](float a, float b) { return a + b; });
+}
+
 /* static  */ std::unique_ptr<Array4D<float>>
 ReferenceUtil::SelectAndScatter4DGePlus(
     const Array4D<float>& operand, const Array4D<float>& source, float init,
@@ -439,21 +543,21 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated(
   // Lambda to access the rhs operand at the given 4D index.  height_over_dky
   // should be equal to height / dky, and width_over_dkx should be equal to
   // width / dkx.  (This is an optimization to avoid doing divisions.)
-  const auto rhs_element = [&](
-      int64 kernel_output_feature, int64 kernel_input_feature, int64 height,
-      int64 width, int64 height_over_dky, int64 width_over_dkx) {
-    DCHECK_EQ(height % dky, 0);
-    DCHECK_EQ(width % dkx, 0);
-    DCHECK_EQ(height / dky, height_over_dky);
-    DCHECK_EQ(width / dkx, width_over_dkx);
-
-    std::array<int64, 4> index;
-    index[dnums.kernel_output_feature_dimension()] = kernel_output_feature;
-    index[dnums.kernel_input_feature_dimension()] = kernel_input_feature;
-    index[dnums.kernel_spatial_dimensions(0)] = height_over_dky;
-    index[dnums.kernel_spatial_dimensions(1)] = width_over_dkx;
-    return rhs(index[0], index[1], index[2], index[3]);
-  };
+  const auto rhs_element =
+      [&](int64 kernel_output_feature, int64 kernel_input_feature, int64 height,
+          int64 width, int64 height_over_dky, int64 width_over_dkx) {
+        DCHECK_EQ(height % dky, 0);
+        DCHECK_EQ(width % dkx, 0);
+        DCHECK_EQ(height / dky, height_over_dky);
+        DCHECK_EQ(width / dkx, width_over_dkx);
+
+        std::array<int64, 4> index;
+        index[dnums.kernel_output_feature_dimension()] = kernel_output_feature;
+        index[dnums.kernel_input_feature_dimension()] = kernel_input_feature;
+        index[dnums.kernel_spatial_dimensions(0)] = height_over_dky;
+        index[dnums.kernel_spatial_dimensions(1)] = width_over_dkx;
+        return rhs(index[0], index[1], index[2], index[3]);
+      };
 
   // Lambda to access the result data at the given 4D index.
   const auto result_element = [&](int64 batch, int64 kernel_output_feature,
@@ -491,13 +595,37 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated(
       }
     }
   }
+  if (samples == 0 || kx == 0 || ky == 0 || ox == 0 || oy == 0 || oz == 0 ||
+      iz == 0) {
+    LOG(INFO) << "Output will be trivially empty because one of these "
+                 "dimensions is 0: samples: "
+              << samples << " kx: " << kx << " ky: " << ky << " ox: " << ox
+              << " oy: " << oy << " oz: " << oz << " iz: " << iz;
+    return result;
+  }
+  bool trivial = true;
+  auto check_trivial = [&trivial](tensorflow::gtl::ArraySlice<int64> indices,
+                                  float value) {
+    if (value != 0.0) {
+      trivial = false;
+    }
+  };
+  lhs.Each(check_trivial);
+  if (trivial) {
+    LOG(FATAL) << "LHS is all 0.0.";
+  }
+  trivial = true;
+  rhs.Each(check_trivial);
+  if (trivial) {
+    LOG(FATAL) << "RHS is all 0.0.";
+  }
   return result;
 }
 
 /* static */ std::unique_ptr<std::vector<float>>
 ReferenceUtil::ReduceToColArray2D(
     const Array2D<float>& matrix, float init,
-    std::function<float(float, float)> reduce_function) {
+    const std::function<float(float, float)>& reduce_function) {
   int64 rows = matrix.height();
   int64 cols = matrix.width();
   auto result = MakeUnique<std::vector<float>>();
@@ -514,7 +642,7 @@ ReferenceUtil::ReduceToColArray2D(
 /* static */ std::unique_ptr<std::vector<float>>
 ReferenceUtil::ReduceToRowArray2D(
     const Array2D<float>& matrix, float init,
-    std::function<float(float, float)> reduce_function) {
+    const std::function<float(float, float)>& reduce_function) {
   int64 rows = matrix.height();
   int64 cols = matrix.width();
   auto result = MakeUnique<std::vector<float>>();
@@ -531,7 +659,7 @@ ReferenceUtil::ReduceToRowArray2D(
 /*static*/ std::vector<float> ReferenceUtil::Reduce4DTo1D(
     const Array4D<float>& array, float init,
     tensorflow::gtl::ArraySlice<int64> dims,
-    std::function<float(float, float)> reduce_function) {
+    const std::function<float(float, float)>& reduce_function) {
   std::vector<float> result;
   CHECK_EQ(dims.size(), 3);
   const std::set<int64> dim_set(dims.begin(), dims.end());
@@ -566,10 +694,42 @@ ReferenceUtil::ReduceToRowArray2D(
   return result;
 }
 
+/* static */ std::unique_ptr<Array4D<float>> ReferenceUtil::Broadcast1DTo4D(
+    const std::vector<float>& array, const std::vector<int64>& bounds,
+    int64 broadcast_from_dim) {
+  auto result =
+      MakeUnique<Array4D<float>>(bounds[0], bounds[1], bounds[2], bounds[3]);
+  for (int64 i = 0; i < result->n1(); ++i) {
+    for (int64 j = 0; j < result->n2(); ++j) {
+      for (int64 k = 0; k < result->n3(); ++k) {
+        for (int64 l = 0; l < result->n4(); ++l) {
+          switch (broadcast_from_dim) {
+            case 0:
+              (*result)(i, j, k, l) = array[i];
+              break;
+            case 1:
+              (*result)(i, j, k, l) = array[j];
+              break;
+            case 2:
+              (*result)(i, j, k, l) = array[k];
+              break;
+            case 3:
+              (*result)(i, j, k, l) = array[l];
+              break;
+            default:
+              break;
+          }
+        }
+      }
+    }
+  }
+  return result;
+}
+
 /* static */ std::unique_ptr<Array2D<float>> ReferenceUtil::Reduce3DTo2D(
     const Array3D<float>& array, float init,
     tensorflow::gtl::ArraySlice<int64> dims,
-    std::function<float(float, float)> reduce_function) {
+    const std::function<float(float, float)>& reduce_function) {
   CHECK_EQ(dims.size(), 1);
   int64 rows = dims[0] == 0 ? array.n2() : array.n1();
   int64 cols = dims[0] == 2 ? array.n2() : array.n3();
@@ -665,6 +825,61 @@ ReferenceUtil::ReduceToRowArray2D(
   return result;
 }
 
+/* static */ Array3D<float> ReferenceUtil::PadArray3D(
+    const Array3D<float>& operand, const PaddingConfig& padding,
+    const float pad) {
+  CHECK_EQ(padding.dimensions_size(), 3);
+
+  const std::vector<int64> input_bounds = {operand.n1(), operand.n2(),
+                                           operand.n3()};
+  std::vector<int64> pad_low(3);
+  std::vector<int64> pad_high(3);
+  std::vector<int64> pad_interior(3);
+  std::vector<int64> output_bounds(3);
+  for (int64 i = 0; i < 3; ++i) {
+    pad_low[i] = padding.dimensions(i).edge_padding_low();
+    pad_high[i] = padding.dimensions(i).edge_padding_high();
+    CHECK_LE(0, pad_low[i]);
+    CHECK_LE(0, pad_high[i]);
+    CHECK_LE(0, padding.dimensions(i).interior_padding()) << "not implemented";
+    pad_interior[i] = padding.dimensions(i).interior_padding();
+
+    output_bounds[i] = pad_low[i] + input_bounds[i] + pad_high[i] +
+                       (input_bounds[i] - 1) * pad_interior[i];
+  }
+
+  Array3D<float> result(output_bounds[0], output_bounds[1], output_bounds[2]);
+  std::vector<int> indices = {0, 0, 0};
+  for (indices[0] = 0; indices[0] < output_bounds[0]; ++indices[0]) {
+    for (indices[1] = 0; indices[1] < output_bounds[1]; ++indices[1]) {
+      for (indices[2] = 0; indices[2] < output_bounds[2]; ++indices[2]) {
+        float* value = &result(indices[0], indices[1], indices[2]);
+        bool value_padded = false;
+        for (int i = 0; i < 3; ++i) {
+          bool in_low_padding = indices[i] < pad_low[i];
+          bool in_high_padding = indices[i] >= output_bounds[i] - pad_high[i];
+          if (in_low_padding || in_high_padding) {
+            *value = pad;
+            value_padded = true;
+          }
+          if (pad_interior[i] &&
+              (indices[i] - pad_low[i]) % (pad_interior[i] + 1)) {
+            *value = pad;
+            value_padded = true;
+          }
+        }
+        if (value_padded) {
+          continue;
+        }
+        *value = operand((indices[0] - pad_low[0]) / (pad_interior[0] + 1),
+                         (indices[1] - pad_low[1]) / (pad_interior[1] + 1),
+                         (indices[2] - pad_low[2]) / (pad_interior[2] + 1));
+      }
+    }
+  }
+  return result;
+}
+
 /* static */ Array4D<float> ReferenceUtil::PadArray4D(
     const Array4D<float>& operand, const PaddingConfig& padding,
     const float pad) {
diff --git a/tensorflow/compiler/xla/reference_util.h b/tensorflow/compiler/xla/reference_util.h
index f58f0bdc9f51dff62c10dda4aba7aac03e689ce7..2da17307817858eea60e868f4be1ab8138784385 100644
--- a/tensorflow/compiler/xla/reference_util.h
+++ b/tensorflow/compiler/xla/reference_util.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/padding.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/macros.h"
@@ -73,6 +74,20 @@ class ReferenceUtil {
       std::pair<int64, int64> lhs_dilation,
       std::pair<int64, int64> rhs_dilation, ConvolutionDimensionNumbers dnums);
 
+  // Returns the result of a convolution `lhs <conv> rhs`, with the default
+  // convolution dimension numbers returned from
+  // ComputationBuilder::CreateDefaultConvDimensionNumbers().
+  static std::unique_ptr<Array3D<float>> ConvArray3D(const Array3D<float>& lhs,
+                                                     const Array3D<float>& rhs,
+                                                     int64 kernel_stride,
+                                                     Padding padding);
+
+  // Returns the result of a convolution `lhs <conv> rhs`.
+  static std::unique_ptr<Array3D<float>> ConvArray3DGeneralDimensionsDilated(
+      const Array3D<float>& lhs, const Array3D<float>& rhs, int64 kernel_stride,
+      Padding padding, int64 lhs_dilation, int64 rhs_dilation,
+      const ConvolutionDimensionNumbers& dnums);
+
   // Returns the result of a separable  convolution with the given parameters.
   // kernel_stride and padding applies to the depthwise convolution during
   // the separable convolution. pointwise_weights.depth() must be equal to
@@ -87,21 +102,21 @@ class ReferenceUtil {
   // to apply for each reduction step.
   static std::unique_ptr<std::vector<float>> ReduceToColArray2D(
       const Array2D<float>& matrix, float init,
-      std::function<float(float, float)> reduce_function);
+      const std::function<float(float, float)>& reduce_function);
 
   // Returns the result of reducing a matrix to a row vector. init is the
   // initial value for the reduce operation, and reduce_function is the function
   // to apply for each reduction step.
   static std::unique_ptr<std::vector<float>> ReduceToRowArray2D(
       const Array2D<float>& matrix, float init,
-      std::function<float(float, float)> reduce_function);
+      const std::function<float(float, float)>& reduce_function);
 
   // Performs a R2=>R1 reduction by reducing away the dimension specified in
   // 'dimension_to_reduce'.
   template <typename T>
   static std::vector<T> ReduceR2ToR1(const Array2D<T>& input,
                                      int dimension_to_reduce, T init,
-                                     std::function<T(T, T)> freduce) {
+                                     const std::function<T(T, T)>& freduce) {
     std::vector<T> result(dimension_to_reduce == 0 ? input.n2() : input.n1(),
                           init);
     for (int i0 = 0; i0 < input.n1(); ++i0) {
@@ -118,14 +133,19 @@ class ReferenceUtil {
   static std::vector<float> Reduce4DTo1D(
       const Array4D<float>& array, float init,
       tensorflow::gtl::ArraySlice<int64> dims,
-      std::function<float(float, float)> reduce_function);
+      const std::function<float(float, float)>& reduce_function);
+
+  // Broadcast 1D dimension to 4D, from the dimension `broadcast_from_dim`.
+  static std::unique_ptr<Array4D<float>> Broadcast1DTo4D(
+      const std::vector<float>& array, const std::vector<int64>& bounds,
+      int64 broadcast_from_dim);
 
   // Returns the result of reducing the 3D array to a 2D array, reducing away
   // the dimensions specified in dims.
   static std::unique_ptr<Array2D<float>> Reduce3DTo2D(
       const Array3D<float>& array, float init,
       tensorflow::gtl::ArraySlice<int64> dims,
-      std::function<float(float, float)> reduce_function);
+      const std::function<float(float, float)>& reduce_function);
 
   // Applies map_function to each element in the input (2D array) and returns
   // the result.
@@ -144,19 +164,26 @@ class ReferenceUtil {
   static int64 WindowCount(int64 unpadded_width, int64 window_len, int64 stride,
                            Padding padding);
 
-  // Performs a 2D window reduction with Add as the function to apply.
+  // Windowed reductions with Add as the function to apply.
+  static std::unique_ptr<std::vector<float>> ReduceWindow1DAdd(
+      const tensorflow::gtl::ArraySlice<float>& operand, float init,
+      const tensorflow::gtl::ArraySlice<int64>& window,
+      const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding);
   static std::unique_ptr<Array2D<float>> ReduceWindow2DAdd(
       const Array2D<float>& operand, float init,
       const tensorflow::gtl::ArraySlice<int64>& window,
       const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding);
-
-  // Performs a 4D window reduction with Add as the function to apply.
   static std::unique_ptr<Array4D<float>> ReduceWindow4DAdd(
       const Array4D<float>& operand, float init,
       const tensorflow::gtl::ArraySlice<int64>& window,
       const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding);
 
-  // Performs a 4D window reduction with a generic reduce function.
+  // Windowed reductions with a generic reduce function.
+  static std::unique_ptr<std::vector<float>> ReduceWindow1DGeneric(
+      const tensorflow::gtl::ArraySlice<float>& operand, float init,
+      const std::function<float(float, float)>& reduce_func,
+      const tensorflow::gtl::ArraySlice<int64>& window,
+      const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding);
   static std::unique_ptr<Array4D<float>> ReduceWindow4DGeneric(
       const Array4D<float>& operand, float init,
       const std::function<float(float, float)>& reduce_func,
@@ -169,6 +196,12 @@ class ReferenceUtil {
       const tensorflow::gtl::ArraySlice<int64>& stride,
       const tensorflow::gtl::ArraySlice<std::pair<int64, int64>>& padding);
 
+  // Batch normalize data.
+  static std::unique_ptr<Array4D<float>> BatchNorm4D(
+      const Array4D<float>& input, const Array4D<float>& mean,
+      const Array4D<float>& var, const Array4D<float>& scale,
+      const Array4D<float>& offset, float epsilon);
+
   // Performs select and scatter with Greater Than or equal as the select, plus
   // as the scatter, and Same Padding.
   static std::unique_ptr<Array4D<float>> SelectAndScatter4DGePlus(
@@ -283,48 +316,56 @@ class ReferenceUtil {
     return result;
   }
 
-  // Slices the input array given starting indices in each dimension and limit
-  // indices in each dimension.
+  // Slices the input array given starting indices, limit indices, and strides
+  // in each dimension.
   template <typename T>
   static std::unique_ptr<Array2D<T>> Slice2D(const Array2D<T>& input,
                                              std::array<int64, 2> starts,
-                                             std::array<int64, 2> limits) {
+                                             std::array<int64, 2> limits,
+                                             std::array<int64, 2> strides) {
     CHECK_LE(starts[0], input.n1());
     CHECK_LE(starts[1], input.n2());
     CHECK_LE(limits[0], input.n1());
     CHECK_LE(limits[1], input.n2());
+    CHECK_GE(strides[0], 1);
+    CHECK_GE(strides[1], 1);
     auto result =
-        MakeUnique<Array2D<T>>(limits[0] - starts[0], limits[1] - starts[1]);
+        MakeUnique<Array2D<T>>(CeilOfRatio(limits[0] - starts[0], strides[0]),
+                               CeilOfRatio(limits[1] - starts[1], strides[1]));
     for (int64 i0 = 0; i0 < result->n1(); ++i0) {
       for (int64 i1 = 0; i1 < result->n2(); ++i1) {
-        (*result)(i0, i1) = input(starts[0] + i0, starts[1] + i1);
+        (*result)(i0, i1) =
+            input(starts[0] + i0 * strides[0], starts[1] + i1 * strides[1]);
       }
     }
     return result;
   }
 
   template <typename T>
-  static std::unique_ptr<Array4D<T>> Slice4D(const Array4D<T>& input,
-                                             std::array<int64, 4> starts,
-                                             std::array<int64, 4> limits) {
+  static std::unique_ptr<Array3D<T>> Slice3D(const Array3D<T>& input,
+                                             std::array<int64, 3> starts,
+                                             std::array<int64, 3> limits,
+                                             std::array<int64, 3> strides) {
     CHECK_LE(starts[0], input.n1());
     CHECK_LE(starts[1], input.n2());
     CHECK_LE(starts[2], input.n3());
-    CHECK_LE(starts[3], input.n4());
     CHECK_LE(limits[0], input.n1());
     CHECK_LE(limits[1], input.n2());
     CHECK_LE(limits[2], input.n3());
-    CHECK_LE(limits[3], input.n4());
+    CHECK_GE(strides[0], 1);
+    CHECK_GE(strides[1], 1);
+    CHECK_GE(strides[2], 1);
     auto result =
-        MakeUnique<Array4D<T>>(limits[0] - starts[0], limits[1] - starts[1],
-                               limits[2] - starts[2], limits[3] - starts[3]);
+        MakeUnique<Array3D<T>>(CeilOfRatio(limits[0] - starts[0], strides[0]),
+                               CeilOfRatio(limits[1] - starts[1], strides[1]),
+                               CeilOfRatio(limits[2] - starts[2], strides[2]));
+
     for (int64 i0 = 0; i0 < result->n1(); ++i0) {
       for (int64 i1 = 0; i1 < result->n2(); ++i1) {
         for (int64 i2 = 0; i2 < result->n3(); ++i2) {
-          for (int64 i3 = 0; i3 < result->n4(); ++i3) {
-            (*result)(i0, i1, i2, i3) = input(starts[0] + i0, starts[1] + i1,
-                                              starts[2] + i2, starts[3] + i3);
-          }
+          (*result)(i0, i1, i2) =
+              input(starts[0] + i0 * strides[0], starts[1] + i1 * strides[1],
+                    starts[2] + i2 * strides[2]);
         }
       }
     }
@@ -332,22 +373,35 @@ class ReferenceUtil {
   }
 
   template <typename T>
-  static std::unique_ptr<Array3D<T>> Slice3D(const Array3D<T>& input,
-                                             std::array<int64, 3> starts,
-                                             std::array<int64, 3> limits) {
+  static std::unique_ptr<Array4D<T>> Slice4D(const Array4D<T>& input,
+                                             std::array<int64, 4> starts,
+                                             std::array<int64, 4> limits,
+                                             std::array<int64, 4> strides) {
     CHECK_LE(starts[0], input.n1());
     CHECK_LE(starts[1], input.n2());
     CHECK_LE(starts[2], input.n3());
+    CHECK_LE(starts[3], input.n4());
     CHECK_LE(limits[0], input.n1());
     CHECK_LE(limits[1], input.n2());
     CHECK_LE(limits[2], input.n3());
-    auto result = MakeUnique<Array3D<T>>(
-        limits[0] - starts[0], limits[1] - starts[1], limits[2] - starts[2]);
+    CHECK_LE(limits[3], input.n4());
+    CHECK_GE(strides[0], 1);
+    CHECK_GE(strides[1], 1);
+    CHECK_GE(strides[2], 1);
+    CHECK_GE(strides[3], 1);
+    auto result =
+        MakeUnique<Array4D<T>>(CeilOfRatio(limits[0] - starts[0], strides[0]),
+                               CeilOfRatio(limits[1] - starts[1], strides[1]),
+                               CeilOfRatio(limits[2] - starts[2], strides[2]),
+                               CeilOfRatio(limits[3] - starts[3], strides[3]));
     for (int64 i0 = 0; i0 < result->n1(); ++i0) {
       for (int64 i1 = 0; i1 < result->n2(); ++i1) {
         for (int64 i2 = 0; i2 < result->n3(); ++i2) {
-          (*result)(i0, i1, i2) =
-              input(starts[0] + i0, starts[1] + i1, starts[2] + i2);
+          for (int64 i3 = 0; i3 < result->n4(); ++i3) {
+            (*result)(i0, i1, i2, i3) =
+                input(starts[0] + i0 * strides[0], starts[1] + i1 * strides[1],
+                      starts[2] + i2 * strides[2], starts[3] + i3 * strides[3]);
+          }
         }
       }
     }
@@ -396,11 +450,51 @@ class ReferenceUtil {
     return result;
   }
 
+  // Applies map_function to each pair of elements in the input lhs and rhs
+  // (4D array) and returns the result.
+  template <typename F>
+  static std::unique_ptr<Array4D<float>> MapArray4D(const Array4D<float>& lhs,
+                                                    const Array4D<float>& rhs,
+                                                    F&& map_function) {
+    return MapWithIndexArray4D(
+        lhs, rhs, [&](float lhs, float rhs, int64, int64, int64, int64) {
+          return map_function(lhs, rhs);
+        });
+  }
+
+  // Applies map_function to each pair of element in lhs and rhs (4D array) and
+  // returns the result.
+  // (plane, depth, height, width) index of each element is also provided as
+  // arguments to map_function.
+  template <typename F>
+  static std::unique_ptr<Array4D<float>> MapWithIndexArray4D(
+      const Array4D<float>& lhs, const Array4D<float>& rhs, F&& map_function) {
+    auto result = MakeUnique<Array4D<float>>(lhs.planes(), lhs.depth(),
+                                             lhs.height(), lhs.width());
+    for (int64 plane = 0; plane < lhs.planes(); ++plane) {
+      for (int64 depth = 0; depth < lhs.depth(); ++depth) {
+        for (int64 height = 0; height < lhs.height(); ++height) {
+          for (int64 width = 0; width < lhs.width(); ++width) {
+            (*result)(plane, depth, height, width) = map_function(
+                lhs(plane, depth, height, width),
+                rhs(plane, depth, height, width), plane, depth, height, width);
+          }
+        }
+      }
+    }
+    return result;
+  }
+
   // Returns the result of a 2D pad on an input matrix.
   static std::unique_ptr<Array2D<float>> PadArray2D(
       const Array2D<float>& operand, const PaddingConfig& padding,
       const float pad);
 
+  // Returns the result of a 3D pad on an input matrix.
+  static Array3D<float> PadArray3D(const Array3D<float>& operand,
+                                   const PaddingConfig& padding,
+                                   const float pad);
+
   // Returns the result of a 4D pad on an input array.
   static Array4D<float> PadArray4D(const Array4D<float>& operand,
                                    const PaddingConfig& padding,
diff --git a/tensorflow/compiler/xla/reference_util_test.cc b/tensorflow/compiler/xla/reference_util_test.cc
index f839ac019df07c5c5e07eed856ea55463bb3efae..35b5e8cd52ab0ec21a4bd2df3e9fa0538ae60816 100644
--- a/tensorflow/compiler/xla/reference_util_test.cc
+++ b/tensorflow/compiler/xla/reference_util_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/padding.h"
 #include "tensorflow/compiler/xla/literal_util.h"
@@ -52,7 +53,7 @@ class ReferenceUtilTest : public ::testing::Test {
 
 TEST_F(ReferenceUtilTest, TransposeArray2D) {
   auto result = ReferenceUtil::TransposeArray2D(*matrix_);
-  auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result);
+  auto actual_literal = Literal::CreateR2FromArray2D(*result);
   LiteralTestUtil::ExpectR2Near<float>({{1.f, 4.f}, {2.f, 5.f}, {3.f, 6.f}},
                                        *actual_literal, ErrorSpec(0.0001));
 }
@@ -62,7 +63,7 @@ TEST_F(ReferenceUtilTest, MatmulArray2D) {
       {7.f, 8.f}, {9.f, 10.f}, {11.f, 12.f},
   });
   auto result = ReferenceUtil::MatmulArray2D(*matrix_, rhs);
-  auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result);
+  auto actual_literal = Literal::CreateR2FromArray2D(*result);
   LiteralTestUtil::ExpectR2Near<float>({{58.f, 64.f}, {139.f, 154.f}},
                                        *actual_literal, ErrorSpec(0.0001));
 }
@@ -70,7 +71,7 @@ TEST_F(ReferenceUtilTest, MatmulArray2D) {
 TEST_F(ReferenceUtilTest, ReduceToColArray2D) {
   auto add = [](float lhs, float rhs) { return lhs + rhs; };
   auto result = ReferenceUtil::ReduceToColArray2D(*matrix_, 0.0f, add);
-  auto actual_literal = LiteralUtil::CreateR1<float>(*result);
+  auto actual_literal = Literal::CreateR1<float>(*result);
   LiteralTestUtil::ExpectR1Near<float>({6.f, 15.f}, *actual_literal,
                                        ErrorSpec(0.0001));
 }
@@ -78,7 +79,7 @@ TEST_F(ReferenceUtilTest, ReduceToColArray2D) {
 TEST_F(ReferenceUtilTest, ReduceToRowArray2D) {
   auto add = [](float lhs, float rhs) { return lhs + rhs; };
   auto result = ReferenceUtil::ReduceToRowArray2D(*matrix_, 0.0f, add);
-  auto actual_literal = LiteralUtil::CreateR1<float>(*result);
+  auto actual_literal = Literal::CreateR1<float>(*result);
   LiteralTestUtil::ExpectR1Near<float>({5.f, 7.f, 9.f}, *actual_literal,
                                        ErrorSpec(0.0001));
 }
@@ -86,7 +87,7 @@ TEST_F(ReferenceUtilTest, ReduceToRowArray2D) {
 TEST_F(ReferenceUtilTest, MapArray2D) {
   auto identity = [](float value) { return log(exp(value)); };
   auto result = ReferenceUtil::MapArray2D(*matrix_, identity);
-  auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result);
+  auto actual_literal = Literal::CreateR2FromArray2D(*result);
   LiteralTestUtil::ExpectR2NearArray2D(*matrix_, *actual_literal,
                                        ErrorSpec(0.0001));
 }
@@ -96,7 +97,7 @@ TEST_F(ReferenceUtilTest, MapWithIndexArray2D) {
     return value + row + col;
   };
   auto result = ReferenceUtil::MapWithIndexArray2D(*matrix_, add_index);
-  auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result);
+  auto actual_literal = Literal::CreateR2FromArray2D(*result);
   LiteralTestUtil::ExpectR2Near<float>({{1.f, 3.f, 5.f}, {5.f, 7.f, 9.f}},
                                        *actual_literal, ErrorSpec(0.0001));
 }
@@ -107,7 +108,7 @@ TEST_F(ReferenceUtilTest, MapArray4D) {
   input->FillWithMultiples(1.0f);
   auto multiply_by_two = [](float value) { return 2 * value; };
   auto result = ReferenceUtil::MapArray4D(*input, multiply_by_two);
-  auto actual_literal = LiteralUtil::CreateR4FromArray4D(*result);
+  auto actual_literal = Literal::CreateR4FromArray4D(*result);
 
   Array4D<float> expected(/*planes=*/2, /*depth=*/3, /*height=*/4, /*width=*/5);
   expected.FillWithMultiples(2.0f);
@@ -124,7 +125,7 @@ TEST_F(ReferenceUtilTest, MapWithIndexArray4D) {
     return value - (3 * 4 * 5 * plane + 4 * 5 * depth + 5 * height + width);
   };
   auto result = ReferenceUtil::MapWithIndexArray4D(*input, subtract_index);
-  auto actual_literal = LiteralUtil::CreateR4FromArray4D(*result);
+  auto actual_literal = Literal::CreateR4FromArray4D(*result);
 
   Array4D<float> expected(/*planes=*/2, /*depth=*/3, /*height=*/4, /*width=*/5);
   expected.Fill(0.0f);
@@ -132,6 +133,101 @@ TEST_F(ReferenceUtilTest, MapWithIndexArray4D) {
                                        ErrorSpec(0.0001));
 }
 
+TEST_F(ReferenceUtilTest, SliceArray2D) {
+  auto result = ReferenceUtil::Slice2D(*matrix_, {{0, 0}}, {{2, 2}}, {{1, 1}});
+  auto actual_literal = Literal::CreateR2FromArray2D(*result);
+
+  LiteralTestUtil::ExpectR2Near<float>({{1.f, 2.f}, {4.f, 5.f}},
+                                       *actual_literal, ErrorSpec(0.0001));
+}
+
+TEST_F(ReferenceUtilTest, SliceStridedArray2D) {
+  auto result = ReferenceUtil::Slice2D(*matrix_, {{0, 0}}, {{2, 3}}, {{1, 2}});
+  auto actual_literal = Literal::CreateR2FromArray2D(*result);
+
+  LiteralTestUtil::ExpectR2Near<float>({{1.f, 3.f}, {4.f, 6.f}},
+                                       *actual_literal, ErrorSpec(0.0001));
+}
+
+TEST_F(ReferenceUtilTest, SliceArray3D) {
+  Array3D<float> input(2, 3, 4);
+  input.FillIota(0);
+
+  auto result =
+      ReferenceUtil::Slice3D(input, {{0, 0, 0}}, {{2, 2, 2}}, {{1, 1, 1}});
+  auto actual_literal = Literal::CreateR3FromArray3D(*result);
+
+  LiteralTestUtil::ExpectR3Near<float>(
+      {{{0.f, 1.f}, {4.f, 5.f}}, {{12.f, 13.f}, {16.f, 17.f}}}, *actual_literal,
+      ErrorSpec(0.0001));
+}
+
+TEST_F(ReferenceUtilTest, SliceStridedArray3D) {
+  Array3D<float> input(2, 3, 4);
+  input.FillIota(0);
+
+  auto result =
+      ReferenceUtil::Slice3D(input, {{0, 0, 0}}, {{2, 3, 4}}, {{1, 2, 2}});
+  auto actual_literal = Literal::CreateR3FromArray3D(*result);
+
+  LiteralTestUtil::ExpectR3Near<float>(
+      {{{0.f, 2.f}, {8.f, 10.f}}, {{12.f, 14.f}, {20.f, 22.f}}},
+      *actual_literal, ErrorSpec(0.0001));
+}
+
+TEST_F(ReferenceUtilTest, SliceArray4D) {
+  Array4D<float> input(2, 3, 4, 5);
+  input.FillIota(0);
+
+  auto result = ReferenceUtil::Slice4D(input, {{1, 0, 0, 0}}, {{2, 2, 2, 2}},
+                                       {{1, 1, 1, 1}});
+  auto actual_literal = Literal::CreateR4FromArray4D(*result);
+
+  LiteralTestUtil::ExpectR4Near<float>(
+      {{{{60.f, 61.f}, {65.f, 66.f}}, {{80.f, 81.f}, {85.f, 86.f}}}},
+      *actual_literal, ErrorSpec(0.0001));
+}
+
+TEST_F(ReferenceUtilTest, SliceStridedArray4D) {
+  Array4D<float> input(2, 3, 4, 5);
+  input.FillIota(0);
+
+  auto result = ReferenceUtil::Slice4D(input, {{1, 0, 0, 0}}, {{2, 3, 4, 5}},
+                                       {{1, 2, 2, 2}});
+  auto actual_literal = Literal::CreateR4FromArray4D(*result);
+
+  LiteralTestUtil::ExpectR4Near<float>(
+      {{{{60.f, 62.f, 64.f}, {70.f, 72.f, 74.f}},
+        {{100.f, 102.f, 104.f}, {110.f, 112.f, 114.f}}}},
+      *actual_literal, ErrorSpec(0.0001));
+}
+
+TEST_F(ReferenceUtilTest, ConvArray3DWithSamePadding) {
+  Array3D<float> input = {{{1, 2, 3, 4}}};
+  Array3D<float> weights = {{{5, 6}}};
+  std::unique_ptr<Array3D<float>> actual =
+      ReferenceUtil::ConvArray3D(input, weights, 1, Padding::kSame);
+  Array3D<float> expected = {{{17, 28, 39, 20}}};
+
+  auto actual_literal = Literal::CreateR3FromArray3D(*actual);
+
+  LiteralTestUtil::ExpectR3NearArray3D<float>(expected, *actual_literal,
+                                              ErrorSpec(0.0001));
+}
+
+TEST_F(ReferenceUtilTest, ConvArray3DWithValidPadding) {
+  Array3D<float> input = {{{1, 2, 3, 4}}};
+  Array3D<float> weights = {{{5, 6}}};
+  std::unique_ptr<Array3D<float>> actual =
+      ReferenceUtil::ConvArray3D(input, weights, 1, Padding::kValid);
+  Array3D<float> expected = {{{17, 28, 39}}};
+
+  auto actual_literal = Literal::CreateR3FromArray3D(*actual);
+
+  LiteralTestUtil::ExpectR3NearArray3D<float>(expected, *actual_literal,
+                                              ErrorSpec(0.0001));
+}
+
 TEST_F(ReferenceUtilTest, ConvWithSamePadding) {
   Array4D<float> input(1, 1, 4, 4);
   // clang-format off
@@ -161,7 +257,7 @@ TEST_F(ReferenceUtilTest, ConvWithSamePadding) {
   }));
   // clang-format on
 
-  auto actual_literal = LiteralUtil::CreateR4FromArray4D(*actual);
+  auto actual_literal = Literal::CreateR4FromArray4D(*actual);
 
   LiteralTestUtil::ExpectR4NearArray4D<float>(expected, *actual_literal,
                                               ErrorSpec(0.0001));
@@ -195,7 +291,7 @@ TEST_F(ReferenceUtilTest, ConvWithValidPadding) {
   }));
   // clang-format on
 
-  auto actual_literal = LiteralUtil::CreateR4FromArray4D(*actual);
+  auto actual_literal = Literal::CreateR4FromArray4D(*actual);
 
   LiteralTestUtil::ExpectR4NearArray4D<float>(expected, *actual_literal,
                                               ErrorSpec(0.0001));
@@ -247,7 +343,7 @@ TEST_F(ReferenceUtilTest, ConvGeneralDimensionsWithSamePadding) {
   }});
   // clang-format on
 
-  auto actual_literal = LiteralUtil::CreateR4FromArray4D(*actual);
+  auto actual_literal = Literal::CreateR4FromArray4D(*actual);
 
   LiteralTestUtil::ExpectR4NearArray4D<float>(expected, *actual_literal,
                                               ErrorSpec(0.0001));
@@ -296,7 +392,7 @@ TEST_F(ReferenceUtilTest, ConvGeneralDimensionsWithValidPadding) {
   Array4D<float> expected({{{{2514, 2685}}}});
   // clang-format on
 
-  auto actual_literal = LiteralUtil::CreateR4FromArray4D(*actual);
+  auto actual_literal = Literal::CreateR4FromArray4D(*actual);
 
   LiteralTestUtil::ExpectR4NearArray4D<float>(expected, *actual_literal,
                                               ErrorSpec(0.0001));
@@ -309,7 +405,7 @@ TEST_F(ReferenceUtilTest, ApplyElementwise2D) {
 
   auto actual = ReferenceUtil::ApplyElementwise2D(
       [](float x, float y, float z) { return 100 * x + 10 * y + z; }, a, b, c);
-  auto actual_literal = LiteralUtil::CreateR2FromArray2D(*actual);
+  auto actual_literal = Literal::CreateR2FromArray2D(*actual);
   LiteralTestUtil::ExpectR2Near({{300.f, 600.f}, {900.f, 1200.f}},
                                 *actual_literal, ErrorSpec(0.0001));
 }
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 0687368b83db343cfa15da969b9f4d9d1a821078..89ebdb0e26a4c03440d771c5867c5dea880311cf 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -24,9 +24,7 @@ xla_proto_library(
 xla_proto_library(
     name = "hlo_proto",
     srcs = ["hlo.proto"],
-    deps = [
-        "//tensorflow/compiler/xla:xla_data_proto",
-    ],
+    deps = ["//tensorflow/compiler/xla:xla_data_proto"],
 )
 
 # Filegroup used to collect source files for dependency checking.
@@ -88,11 +86,13 @@ cc_library(
     deps = [
         ":hlo",
         ":hlo_query",
+        ":shape_inference",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
     ],
@@ -106,12 +106,16 @@ cc_test(
         ":hlo",
         ":hlo_evaluator",
         "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:test_main",
@@ -138,6 +142,7 @@ cc_library(
     deps = [
         ":hlo_module_config",
         ":hlo_proto",
+        ":hlo_reachability",
         ":name_uniquer",
         ":versioned_computation_handle",
         "//tensorflow/compiler/xla:literal_util",
@@ -155,6 +160,31 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "hlo_reachability",
+    srcs = ["hlo_reachability.cc"],
+    hdrs = ["hlo_reachability.h"],
+    deps = [
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+cc_test(
+    name = "hlo_reachability_test",
+    srcs = ["hlo_reachability_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_reachability",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "hlo_matchers",
     testonly = 1,
@@ -285,7 +315,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:user_computation_flags",
+        "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/core:lib",
     ],
 )
@@ -303,7 +333,7 @@ cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:user_computation_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/core:test",
     ],
@@ -330,6 +360,7 @@ cc_library(
     hdrs = ["backend.h"],
     deps = [
         ":compiler",
+        ":computation_placer",
         ":device_memory_allocator",
         ":platform_util",
         ":pool",
@@ -338,7 +369,6 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/legacy_flags:backend_flags",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
@@ -382,7 +412,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/compiler/xla/legacy_flags:service_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
@@ -416,7 +446,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:service_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
     ],
@@ -506,9 +536,8 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:service_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/core:lib",
-        "//tensorflow/core:regexp_internal",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/stream_executor",
     ],
@@ -707,9 +736,10 @@ cc_library(
     ],
     deps = [
         ":buffer_liveness",
+        ":heap_simulator",
         ":hlo",
-        ":hlo_ordering",
         ":hlo_proto",
+        ":hlo_scheduling",
         ":logical_buffer",
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:shape_util",
@@ -718,7 +748,6 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:buffer_assignment_flags",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
     ],
@@ -736,6 +765,7 @@ cc_test(
         ":flatten_call_graph",
         ":hlo",
         ":hlo_ordering",
+        ":hlo_scheduling",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
@@ -748,11 +778,61 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "hlo_ordering",
+    srcs = ["hlo_ordering.cc"],
+    hdrs = ["hlo_ordering.h"],
+    deps = [
+        ":call_graph",
+        ":hlo",
+        ":hlo_proto",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_test(
+    name = "hlo_ordering_test",
+    size = "small",
+    srcs = ["hlo_ordering_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_ordering",
+        ":hlo_scheduling",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+    ],
+)
+
+cc_library(
+    name = "heap_simulator",
+    srcs = ["heap_simulator.cc"],
+    hdrs = ["heap_simulator.h"],
+    deps = [
+        ":hlo",
+        ":hlo_ordering",
+        ":hlo_proto",
+        ":liveness_util",
+        ":logical_buffer",
+        ":tuple_points_to_analysis",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_test(
     name = "heap_simulator_test",
     size = "small",
     srcs = ["heap_simulator_test.cc"],
     deps = [
+        ":heap_simulator",
         ":hlo",
         ":hlo_ordering",
         ":logical_buffer",
@@ -765,23 +845,14 @@ cc_test(
     ],
 )
 
-# The hlo_ordering library contains both hlo_ordering and heap_simulator because
-# they are mutually dependent.
 cc_library(
-    name = "hlo_ordering",
-    srcs = [
-        "heap_simulator.cc",
-        "hlo_ordering.cc",
-    ],
-    hdrs = [
-        "heap_simulator.h",
-        "hlo_ordering.h",
-    ],
+    name = "hlo_scheduling",
+    srcs = ["hlo_scheduling.cc"],
+    hdrs = ["hlo_scheduling.h"],
     deps = [
-        ":call_graph",
+        ":heap_simulator",
         ":hlo",
-        ":hlo_proto",
-        ":liveness_util",
+        ":hlo_ordering",
         ":logical_buffer",
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:shape_util",
@@ -794,12 +865,13 @@ cc_library(
 )
 
 cc_test(
-    name = "hlo_ordering_test",
+    name = "hlo_scheduling_test",
     size = "small",
-    srcs = ["hlo_ordering_test.cc"],
+    srcs = ["hlo_scheduling_test.cc"],
     deps = [
         ":hlo",
         ":hlo_ordering",
+        ":hlo_scheduling",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -841,6 +913,46 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "batchnorm_rewriter",
+    srcs = ["batchnorm_rewriter.cc"],
+    hdrs = ["batchnorm_rewriter.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        ":hlo_query",
+        ":shape_inference",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:window_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_test(
+    name = "batchnorm_rewriter_test",
+    size = "small",
+    srcs = ["batchnorm_rewriter_test.cc"],
+    deps = [
+        ":batchnorm_rewriter",
+        ":hlo",
+        ":hlo_matchers",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "algebraic_simplifier",
     srcs = ["algebraic_simplifier.cc"],
@@ -948,6 +1060,38 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "computation_placer",
+    srcs = ["computation_placer.cc"],
+    hdrs = ["computation_placer.h"],
+    deps = [
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+    ],
+    alwayslink = True,  # Contains per-platform computation placer registration
+)
+
+cc_library(
+    name = "human_readable_profile_builder",
+    srcs = ["human_readable_profile_builder.cc"],
+    hdrs = ["human_readable_profile_builder.h"],
+    deps = [
+        "//tensorflow/compiler/xla:metric_table_report",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "generic_transfer_manager",
     srcs = ["generic_transfer_manager.cc"],
@@ -1030,12 +1174,8 @@ cc_test(
 
 cc_library(
     name = "hlo_cost_analysis",
-    srcs = [
-        "hlo_cost_analysis.cc",
-    ],
-    hdrs = [
-        "hlo_cost_analysis.h",
-    ],
+    srcs = ["hlo_cost_analysis.cc"],
+    hdrs = ["hlo_cost_analysis.h"],
     deps = [
         ":hlo",
         "//tensorflow/compiler/xla:shape_util",
@@ -1068,6 +1208,7 @@ cc_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:lib",
         "//tensorflow/core:test_main",
     ],
@@ -1080,6 +1221,7 @@ cc_library(
     deps = [
         ":hlo",
         ":hlo_cost_analysis",
+        ":human_readable_profile_builder",
         "//tensorflow/compiler/xla:metric_table_report",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
@@ -1137,12 +1279,8 @@ cc_test(
 
 cc_library(
     name = "logical_buffer",
-    srcs = [
-        "logical_buffer.cc",
-    ],
-    hdrs = [
-        "logical_buffer.h",
-    ],
+    srcs = ["logical_buffer.cc"],
+    hdrs = ["logical_buffer.h"],
     deps = [
         ":hlo",
         ":hlo_proto",
@@ -1155,18 +1293,31 @@ cc_library(
 )
 
 cc_library(
-    name = "hlo_dataflow_analysis",
-    srcs = [
-        "hlo_dataflow_analysis.cc",
-    ],
-    hdrs = [
-        "hlo_dataflow_analysis.h",
+    name = "hlo_value",
+    srcs = ["hlo_value.cc"],
+    hdrs = ["hlo_value.h"],
+    deps = [
+        ":hlo",
+        "//tensorflow/compiler/xla:shape_tree",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
     ],
+)
+
+cc_library(
+    name = "hlo_dataflow_analysis",
+    srcs = ["hlo_dataflow_analysis.cc"],
+    hdrs = ["hlo_dataflow_analysis.h"],
     deps = [
         ":call_graph",
         ":hlo",
+        ":hlo_ordering",
+        ":hlo_value",
         ":liveness_util",
-        "//tensorflow/compiler/xla:shape_tree",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:statusor",
@@ -1174,7 +1325,6 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
     ],
 )
 
@@ -1201,20 +1351,32 @@ cc_test(
 )
 
 cc_library(
-    name = "hlo_alias_analysis",
-    srcs = [
-        "hlo_alias_analysis.cc",
-    ],
-    hdrs = [
-        "hlo_alias_analysis.h",
+    name = "hlo_buffer",
+    srcs = ["hlo_buffer.cc"],
+    hdrs = ["hlo_buffer.h"],
+    deps = [
+        ":hlo",
+        ":hlo_value",
+        "//tensorflow/compiler/xla:shape_tree",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
     ],
+)
+
+cc_library(
+    name = "hlo_alias_analysis",
+    srcs = ["hlo_alias_analysis.cc"],
+    hdrs = ["hlo_alias_analysis.h"],
     deps = [
-        ":call_graph",
         ":hlo",
+        ":hlo_buffer",
         ":hlo_dataflow_analysis",
-        ":logical_buffer",
-        "//tensorflow/compiler/xla:shape_tree",
+        ":hlo_value",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
@@ -1245,12 +1407,8 @@ cc_test(
 
 cc_library(
     name = "tuple_points_to_analysis",
-    srcs = [
-        "tuple_points_to_analysis.cc",
-    ],
-    hdrs = [
-        "tuple_points_to_analysis.h",
-    ],
+    srcs = ["tuple_points_to_analysis.cc"],
+    hdrs = ["tuple_points_to_analysis.h"],
     deps = [
         ":hlo",
         ":logical_buffer",
@@ -1287,12 +1445,8 @@ cc_test(
 
 cc_library(
     name = "compilation_cache",
-    srcs = [
-        "compilation_cache.cc",
-    ],
-    hdrs = [
-        "compilation_cache.h",
-    ],
+    srcs = ["compilation_cache.cc"],
+    hdrs = ["compilation_cache.h"],
     deps = [
         ":executable",
         ":hlo_module_config",
@@ -1386,7 +1540,10 @@ cc_library(
     name = "hlo_verifier",
     srcs = ["hlo_verifier.cc"],
     hdrs = ["hlo_verifier.h"],
-    deps = [":hlo_pass"],
+    deps = [
+        ":hlo_pass",
+        "//tensorflow/core:lib",
+    ],
 )
 
 cc_library(
@@ -1398,9 +1555,9 @@ cc_library(
         ":call_graph",
         ":flatten_call_graph",
         ":hlo",
-        ":hlo_cost_analysis",
         ":hlo_dce",
         ":hlo_ordering",
+        ":hlo_scheduling",
         ":liveness_util",
         ":logical_buffer",
         ":tuple_points_to_analysis",
@@ -1497,8 +1654,8 @@ cc_library(
         "hlo_pass_pipeline.h",
     ],
     deps = [
-        ":compiler",
         ":hlo",
+        ":hlo_graph_dumper",
         ":hlo_pass",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
@@ -1572,10 +1729,8 @@ cc_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
-        "//tensorflow/core:lib",
     ],
 )
 
@@ -1707,8 +1862,9 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:window_util",
-        "//tensorflow/compiler/xla/legacy_flags:hlo_graph_dumper_flags",
+        "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/core:lib",
+        "//tensorflow/core:regexp_internal",
     ],
     alwayslink = 1,
 )
@@ -1777,10 +1933,39 @@ cc_library(
         ":hlo",
         ":hlo_proto",
         "//tensorflow/compiler/xla:status",
+    ],
+)
+
+cc_library(
+    name = "reduce_precision_insertion",
+    srcs = ["reduce_precision_insertion.cc"],
+    hdrs = ["reduce_precision_insertion.h"],
+    deps = [
+        ":buffer_liveness",
+        ":hlo",
+        ":hlo_pass",
+        ":hlo_pass_pipeline",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/core:lib",
     ],
 )
 
+cc_test(
+    name = "reduce_precision_insertion_test",
+    size = "small",
+    srcs = ["reduce_precision_insertion_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_matchers",
+        ":reduce_precision_insertion",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 754ac0c68dc025c6d2bde4b40e148e6043f0cf6d..691f9f22964841c1163d161a7c02c2215ba6f066 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -48,7 +48,7 @@ namespace {
 // Returns whether operand is a literal with the given value.
 bool IsLiteralWithValue(const HloInstruction* operand, int8 value) {
   return operand->opcode() == HloOpcode::kConstant &&
-         LiteralUtil::IsAll(operand->literal(), value);
+         operand->literal().IsAll(value);
 }
 
 bool IsAll(const HloInstruction* op, int8 value) {
@@ -126,10 +126,12 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
       HloInstruction* concatenate,
       tensorflow::gtl::ArraySlice<HloInstruction*> operands) override;
 
-  Status HandleCopy(HloInstruction* copy, HloInstruction* operand) override;
+  Status HandleConstant(HloInstruction* constant,
+                        const Literal& literal) override;
 
-  Status HandleConvert(HloInstruction* convert,
-                       HloInstruction* operand) override;
+  Status HandleCopy(HloInstruction* copy) override;
+
+  Status HandleConvert(HloInstruction* convert) override;
 
   Status HandleConvolution(HloInstruction* convolution, HloInstruction* lhs,
                            HloInstruction* rhs, const Window& window) override;
@@ -179,11 +181,8 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   Status HandleSubtract(HloInstruction* sub, HloInstruction* lhs,
                         HloInstruction* rhs) override;
 
-  Status HandleMaximum(HloInstruction* maximum, HloInstruction* lhs,
-                       HloInstruction* rhs) override;
-
-  Status HandleMinimum(HloInstruction* minimum, HloInstruction* lhs,
-                       HloInstruction* rhs) override;
+  Status HandleMaximum(HloInstruction* maximum) override;
+  Status HandleMinimum(HloInstruction* minimum) override;
 
   // Returns whether algebraic simplification has occurred.
   const bool changed() const { return changed_; }
@@ -334,16 +333,16 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add,
   return Status::OK();
 }
 
-Status AlgebraicSimplifierVisitor::HandleCopy(HloInstruction* copy,
-                                              HloInstruction* operand) {
+Status AlgebraicSimplifierVisitor::HandleCopy(HloInstruction* copy) {
   // If a copy feeds a copy, make it a single copy.
-  if (operand->opcode() == HloOpcode::kCopy) {
+  if (copy->operand(0)->opcode() == HloOpcode::kCopy) {
     return ReplaceWithNewInstruction(
-        copy, HloInstruction::CreateUnary(copy->shape(), HloOpcode::kCopy,
-                                          operand->operands()[0]));
+        copy, HloInstruction::CreateUnary(
+                  copy->shape(), HloOpcode::kCopy,
+                  copy->mutable_operand(0)->mutable_operand(0)));
   }
   // All copies can be eliminated (assuming layout constraints are satisified).
-  ReplaceInstructionIfSameShape(copy, operand);
+  ReplaceInstructionIfSameShape(copy, copy->mutable_operand(0));
   return Status::OK();
 }
 
@@ -415,6 +414,32 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate(
   return Status::OK();
 }
 
+static HloInstruction* BuildTupleConstant(HloComputation* computation,
+                                          const Literal& literal) {
+  if (ShapeUtil::IsTuple(literal.shape())) {
+    std::vector<HloInstruction*> elems;
+    elems.reserve(ShapeUtil::TupleElementCount(literal.shape()));
+    for (const Literal& child : literal.tuple_literals()) {
+      elems.push_back(BuildTupleConstant(computation, child));
+    }
+    return computation->AddInstruction(HloInstruction::CreateTuple(elems));
+  } else {
+    return computation->AddInstruction(
+        HloInstruction::CreateConstant(MakeUnique<Literal>(literal)));
+  }
+}
+
+Status AlgebraicSimplifierVisitor::HandleConstant(HloInstruction* constant,
+                                                  const Literal& literal) {
+  // Tuple constants aren't directly supported by any backend. Expand them into
+  // explicit Tuple instructions.
+  if (ShapeUtil::IsTuple(constant->shape())) {
+    return ReplaceInstruction(constant,
+                              BuildTupleConstant(computation_, literal));
+  }
+  return Status::OK();
+}
+
 Status AlgebraicSimplifierVisitor::HandleSubtract(HloInstruction* sub,
                                                   HloInstruction* lhs,
                                                   HloInstruction* rhs) {
@@ -448,6 +473,72 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide,
                                             subtract));
   }
 
+  // A/exp(B) => A*exp(-B)
+  if (rhs->opcode() == HloOpcode::kExp) {
+    VLOG(10) << "transform [A/exp(B) => A*exp(-B)]: " << divide->ToString();
+    HloInstruction* negate =
+        computation_->AddInstruction(HloInstruction::CreateUnary(
+            divide->shape(), HloOpcode::kNegate, rhs->mutable_operand(0)));
+    HloInstruction* new_exp = computation_->AddInstruction(
+        HloInstruction::CreateUnary(divide->shape(), HloOpcode::kExp, negate));
+    return ReplaceWithNewInstruction(
+        divide, HloInstruction::CreateBinary(
+                    divide->shape(), HloOpcode::kMultiply, lhs, new_exp));
+  }
+
+  // A/pow(B,C) => A*pow(B,-C)
+  if (rhs->opcode() == HloOpcode::kPower) {
+    VLOG(10) << "transform [A/pow(B,C) => A*pow(B,-C)]: " << divide->ToString();
+    HloInstruction* negate =
+        computation_->AddInstruction(HloInstruction::CreateUnary(
+            divide->shape(), HloOpcode::kNegate, rhs->mutable_operand(1)));
+    HloInstruction* new_power = computation_->AddInstruction(
+        HloInstruction::CreateBinary(divide->shape(), HloOpcode::kPower,
+                                     rhs->mutable_operand(0), negate));
+    return ReplaceWithNewInstruction(
+        divide, HloInstruction::CreateBinary(
+                    divide->shape(), HloOpcode::kMultiply, lhs, new_power));
+  }
+
+  // Simplifying integral division would produce unexpected results.
+  if (ShapeUtil::ElementIsIntegral(divide->shape())) {
+    return Status::OK();
+  }
+
+  // (A / B) / (C / D)  =>  (A / B)*(D / C) => (A * D) / (B * C)
+  if (lhs->opcode() == HloOpcode::kDivide &&
+      rhs->opcode() == HloOpcode::kDivide) {
+    auto a_times_d = computation_->AddInstruction(HloInstruction::CreateBinary(
+        divide->shape(), HloOpcode::kMultiply, lhs->mutable_operand(0),
+        rhs->mutable_operand(1)));
+    auto b_times_c = computation_->AddInstruction(HloInstruction::CreateBinary(
+        divide->shape(), HloOpcode::kMultiply, lhs->mutable_operand(1),
+        rhs->mutable_operand(0)));
+    return ReplaceWithNewInstruction(
+        divide, HloInstruction::CreateBinary(
+                    divide->shape(), HloOpcode::kDivide, a_times_d, b_times_c));
+  }
+
+  // (A / B) / C => A / (B * C)
+  if (lhs->opcode() == HloOpcode::kDivide) {
+    auto b_times_c = computation_->AddInstruction(HloInstruction::CreateBinary(
+        divide->shape(), HloOpcode::kMultiply, lhs->mutable_operand(1), rhs));
+    return ReplaceWithNewInstruction(
+        divide,
+        HloInstruction::CreateBinary(divide->shape(), HloOpcode::kDivide,
+                                     lhs->mutable_operand(0), b_times_c));
+  }
+
+  // A / (B / C) => (A*C) / B
+  if (rhs->opcode() == HloOpcode::kDivide) {
+    auto a_times_c = computation_->AddInstruction(HloInstruction::CreateBinary(
+        divide->shape(), HloOpcode::kMultiply, lhs, rhs->mutable_operand(1)));
+    return ReplaceWithNewInstruction(
+        divide,
+        HloInstruction::CreateBinary(divide->shape(), HloOpcode::kDivide,
+                                     a_times_c, rhs->mutable_operand(0)));
+  }
+
   return Status::OK();
 }
 
@@ -469,7 +560,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot,
       ShapeUtil::HasZeroElements(lhs->shape()) ||
       ShapeUtil::HasZeroElements(rhs->shape())) {
     auto zero = computation_->AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f)));
+        HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
     return ReplaceWithNewInstruction(
         dot, HloInstruction::CreateBroadcast(dot->shape(), zero, {}));
   }
@@ -507,7 +598,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot,
     HloComputation* add_reduce_computation = CreateScalarBinaryComputation(
         computation_->parent(), F32, HloOpcode::kAdd);
     auto zero = computation_->AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f)));
+        HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
     auto reduce = computation_->AddInstruction(HloInstruction::CreateReduce(
         ShapeUtil::MakeShape(dot->shape().element_type(), {}), multiply, zero,
         {0}, add_reduce_computation));
@@ -531,7 +622,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot,
     HloComputation* add_reduce_computation = CreateScalarBinaryComputation(
         computation_->parent(), F32, HloOpcode::kAdd);
     auto zero = computation_->AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f)));
+        HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
     HloInstruction* reduce;
     if (ShapeUtil::Rank(rhs->shape()) == 1) {
       auto multiply = computation_->AddInstruction(HloInstruction::CreateBinary(
@@ -571,7 +662,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot,
     HloComputation* add_reduce_computation = CreateScalarBinaryComputation(
         computation_->parent(), F32, HloOpcode::kAdd);
     auto zero = computation_->AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f)));
+        HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
     auto reduce = computation_->AddInstruction(HloInstruction::CreateReduce(
         ShapeUtil::MakeShape(dot->shape().element_type(),
                              {lhs->shape().dimensions(0)}),
@@ -595,6 +686,16 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply,
   if (IsAll(lhs, 1) && ReplaceInstructionIfSameShape(multiply, rhs)) {
     return Status::OK();
   }
+
+  // exp(A) * exp(B) => exp(A+B)
+  if (lhs->opcode() == HloOpcode::kExp && rhs->opcode() == HloOpcode::kExp) {
+    auto add = computation_->AddInstruction(HloInstruction::CreateBinary(
+        multiply->shape(), HloOpcode::kAdd, lhs->mutable_operand(0),
+        rhs->mutable_operand(0)));
+    return ReplaceWithNewInstruction(
+        multiply,
+        HloInstruction::CreateUnary(multiply->shape(), HloOpcode::kExp, add));
+  }
   return Status::OK();
 }
 
@@ -606,6 +707,17 @@ Status AlgebraicSimplifierVisitor::HandleLog(HloInstruction* log,
       ReplaceInstructionIfSameShape(log, operand->mutable_operand(0))) {
     return Status::OK();
   }
+
+  // ln(pow(A,B)) => B*ln(A)
+  if (operand->opcode() == HloOpcode::kPower) {
+    auto new_log = computation_->AddInstruction(HloInstruction::CreateUnary(
+        log->shape(), HloOpcode::kLog, operand->mutable_operand(0)));
+    return ReplaceWithNewInstruction(
+        log,
+        HloInstruction::CreateBinary(log->shape(), HloOpcode::kMultiply,
+                                     new_log, operand->mutable_operand(1)));
+  }
+
   return Status::OK();
 }
 
@@ -792,12 +904,11 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
 // A conversion to the same element type as the operand is a nop and can be
 // removed.  A conversion of a constant can be simplified by making a new
 // constant.
-Status AlgebraicSimplifierVisitor::HandleConvert(HloInstruction* convert,
-                                                 HloInstruction* operand) {
-  PrimitiveType src_type = operand->shape().element_type();
+Status AlgebraicSimplifierVisitor::HandleConvert(HloInstruction* convert) {
+  PrimitiveType src_type = convert->operand(0)->shape().element_type();
   PrimitiveType dest_type = convert->shape().element_type();
   if (src_type == dest_type) {
-    return ReplaceInstruction(convert, operand);
+    return ReplaceInstruction(convert, convert->mutable_operand(0));
   }
   return Status::OK();
 }
@@ -878,10 +989,10 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
     }
 
     // Verify that the slice shape matches the pad shape.
-    TF_ASSIGN_OR_RETURN(Shape inferred_slice_shape,
-                        ShapeInference::InferSliceShape(
-                            nonzero_pad_shape, start_indices, end_indices,
-                            strides));
+    TF_ASSIGN_OR_RETURN(
+        Shape inferred_slice_shape,
+        ShapeInference::InferSliceShape(nonzero_pad_shape, start_indices,
+                                        end_indices, strides));
     TF_RET_CHECK(ShapeUtil::Compatible(inferred_slice_shape, pad->shape()));
 
     std::unique_ptr<HloInstruction> slice = HloInstruction::CreateSlice(
@@ -897,8 +1008,8 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power,
                                                HloInstruction* rhs) {
   VLOG(10) << "trying transform [pow(A, 0) => 1]: " << power->ToString();
   if (IsAll(rhs, 0)) {
-    auto one = HloInstruction::CreateConstant(LiteralUtil::CloneToUnique(
-        LiteralUtil::One(power->shape().element_type())));
+    auto one = HloInstruction::CreateConstant(
+        Literal::One(power->shape().element_type()).CloneToUnique());
     std::unique_ptr<HloInstruction> ones;
     if (ShapeUtil::IsScalar(power->shape())) {
       ones = std::move(one);
@@ -914,6 +1025,14 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power,
     return Status::OK();
   }
 
+  // pow(exp(A),B) => exp(A*B)
+  if (lhs->opcode() == HloOpcode::kExp) {
+    auto a_times_b = computation_->AddInstruction(HloInstruction::CreateBinary(
+        power->shape(), HloOpcode::kMultiply, lhs->operands()[0], rhs));
+    return ReplaceWithNewInstruction(
+        power, HloInstruction::CreateUnary(power->shape(), HloOpcode::kExp,
+                                           a_times_b));
+  }
   VLOG(10) << "trying transform [pow(A, 2) => A*A]: " << power->ToString();
   if (IsAll(rhs, 2)) {
     return ReplaceWithNewInstruction(
@@ -923,9 +1042,8 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power,
 
   VLOG(10) << "trying transform [pow(A, -1) => 1/A]: " << power->ToString();
   if (IsAll(rhs, -1)) {
-    auto* one = computation_->AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CloneToUnique(
-            LiteralUtil::One(rhs->shape().element_type()))));
+    auto* one = computation_->AddInstruction(HloInstruction::CreateConstant(
+        Literal::One(rhs->shape().element_type()).CloneToUnique()));
     return ReplaceWithNewInstruction(
         power, HloInstruction::CreateBinary(power->shape(), HloOpcode::kDivide,
                                             one, lhs));
@@ -937,6 +1055,9 @@ StatusOr<bool> AlgebraicSimplifierVisitor::
     TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(
         HloInstruction* reshape_or_broadcast) {
   bool changed = false;
+  if (ShapeUtil::IsScalar(reshape_or_broadcast->shape())) {
+    return false;
+  }
   HloInstruction* operand = reshape_or_broadcast->mutable_operand(0);
   for (HloInstruction* user : reshape_or_broadcast->users()) {
     if (user->user_count() == 0 && user != computation_->root_instruction()) {
@@ -1008,7 +1129,7 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
   // dimension.
   if (ShapeUtil::HasZeroElements(reshape->shape())) {
     auto empty_constant = HloInstruction::CreateConstant(
-        LiteralUtil::CreateFromShape(reshape->shape()));
+        Literal::CreateFromShape(reshape->shape()));
 
     return ReplaceWithNewInstruction(reshape, std::move(empty_constant));
   }
@@ -1208,8 +1329,7 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
     // try to get more fancy about proving equivalence in cases beyond that.
     if (pad_value->opcode() != HloOpcode::kConstant ||
         reduce_init_value->opcode() != HloOpcode::kConstant ||
-        !LiteralUtil::Equal(pad_value->literal(),
-                            reduce_init_value->literal())) {
+        !pad_value->literal().Equal(reduce_init_value->literal())) {
       VLOG(10) << "Not folding pad into reduce-window due to different pad "
                   "values.";
       return Status::OK();
@@ -1368,9 +1488,9 @@ Status AlgebraicSimplifierVisitor::HandleConvolution(
   // We cannot insert bitcasts if the layouts will not be compatible.
   // TODO(b/33178038): Consider inserting a transpose if a bitcast would be
   // invalid.
-  if (!valid_bitcast_callback_(lhs->shape(), input_shape) ||
-      !valid_bitcast_callback_(rhs->shape(), new_filter_shape) ||
-      !valid_bitcast_callback_(dot_output_shape, convolution_shape)) {
+  if (!valid_bitcast_callback_(input_shape, lhs->shape()) ||
+      !valid_bitcast_callback_(new_filter_shape, rhs->shape()) ||
+      !valid_bitcast_callback_(convolution_shape, dot_output_shape)) {
     return Status::OK();
   }
 
@@ -1396,9 +1516,7 @@ bool AlgebraicSimplifierVisitor::TransformToClampIfSameShape(
   return true;
 }
 
-Status AlgebraicSimplifierVisitor::HandleMaximum(HloInstruction* maximum,
-                                                 HloInstruction* lhs,
-                                                 HloInstruction* rhs) {
+Status AlgebraicSimplifierVisitor::HandleMaximum(HloInstruction* maximum) {
   // Match the following tree:
   //          min_operand     operand
   //                     \   /
@@ -1429,9 +1547,7 @@ Status AlgebraicSimplifierVisitor::HandleMaximum(HloInstruction* maximum,
   return Status::OK();
 }
 
-Status AlgebraicSimplifierVisitor::HandleMinimum(HloInstruction* minimum,
-                                                 HloInstruction* lhs,
-                                                 HloInstruction* rhs) {
+Status AlgebraicSimplifierVisitor::HandleMinimum(HloInstruction* minimum) {
   // Match the following tree:
   //          max_operand     operand
   //                     \   /
@@ -1470,6 +1586,9 @@ StatusOr<bool> AlgebraicSimplifier::Run(HloModule* module) {
   // module, invalidating iteration.
   std::vector<HloComputation*> computations;
   for (auto& comp : module->computations()) {
+    if (comp->IsFusionComputation()) {
+      continue;
+    }
     computations.push_back(comp.get());
   }
   for (auto& comp : computations) {
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h
index f8919f0caad6d7009d371d8a1893ba5c91110122..4295a3227a837ffc8483b3be59994c9e6ac96aec 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.h
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h
@@ -26,12 +26,13 @@ namespace xla {
 // A pass which performs AlgebraicSimplications.
 class AlgebraicSimplifier : public HloPassInterface {
  public:
-  // Given two shapes, determines if it is valid to bitcast between them after
-  // considering platform dependent effects on layout like alignment
-  // restrictions.
-  // Precondition: the two shapes have layouts, the same number of
-  // elements and ShapeUtil::ReshapeIsBitcast returns true.
-  using ValidBitcastCallback = std::function<bool(const Shape&, const Shape&)>;
+  // Given shapes 'from_shape' and 'to_shape', determines if it is valid to
+  // bitcast from 'from_shape' to 'to_shape' after considering platform
+  // dependent effects on layout like alignment restrictions. Precondition: the
+  // two shapes have layouts, the same number of elements and
+  // ShapeUtil::ReshapeIsBitcast returns true.
+  using ValidBitcastCallback =
+      std::function<bool(const Shape& from_shape, const Shape& to_shape)>;
 
   // If is_layout_sensitive is true, then the simplifier preserves layout during
   // transformation. Otherwise, layout is ignored. If valid_bitcast_callback
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index e4368a7bb25093f70bf78288db2021d36fa7f25a..be71e03e985a285abafc2adf7219b6aca2a775b6 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -55,7 +55,7 @@ TEST_F(AlgebraicSimplifierTest, AddZero) {
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r0f32, "param0"));
   HloInstruction* zero = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param0, zero));
 
@@ -76,7 +76,7 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) {
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r2f32, "param0"));
   HloInstruction* zero = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
   HloInstruction* bcast = builder.AddInstruction(
       HloInstruction::CreateBroadcast(r2f32, zero, {0, 1}));
   builder.AddInstruction(
@@ -99,7 +99,7 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) {
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r2f32, "param0"));
   HloInstruction* zero = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({0, 0, 0})));
+      HloInstruction::CreateConstant(Literal::CreateR1<float>({0, 0, 0})));
   HloInstruction* bcast =
       builder.AddInstruction(HloInstruction::CreateBroadcast(r2f32, zero, {1}));
   builder.AddInstruction(
@@ -123,7 +123,7 @@ TEST_F(AlgebraicSimplifierTest, SubZero) {
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r0f32, "param0"));
   HloInstruction* zero = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kSubtract, param0, zero));
 
@@ -138,6 +138,155 @@ TEST_F(AlgebraicSimplifierTest, SubZero) {
   EXPECT_EQ(root, param0);
 }
 
+// Test that (A/B)/C is simplified to A/(B*C).
+TEST_F(AlgebraicSimplifierTest, LhsDivOfDiv) {
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32, "param1"));
+  HloInstruction* param2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, r0f32, "param2"));
+  HloInstruction* div = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, param1));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, div, param2));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Divide(op::Divide(param0, param1), param2));
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Divide(param0, op::Multiply(param1, param2)));
+}
+
+// Test that A/(B/C) is simplified to (A*C)/B.
+TEST_F(AlgebraicSimplifierTest, RhsDivOfDiv) {
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32, "param1"));
+  HloInstruction* param2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, r0f32, "param2"));
+  HloInstruction* div = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param1, param2));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, div));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Divide(param0, op::Divide(param1, param2)));
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Divide(op::Multiply(param0, param2), param1));
+}
+
+// Test that (A/B)/(C/D) is simplified to (A*D)/(B*C).
+TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) {
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32, "param1"));
+  HloInstruction* param2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, r0f32, "param2"));
+  HloInstruction* param3 = builder.AddInstruction(
+      HloInstruction::CreateParameter(3, r0f32, "param3"));
+  HloInstruction* div0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, param1));
+  HloInstruction* div1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param2, param3));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, div0, div1));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(
+      computation->root_instruction(),
+      op::Divide(op::Divide(param0, param1), op::Divide(param2, param3)));
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(
+      computation->root_instruction(),
+      op::Divide(op::Multiply(param0, param3), op::Multiply(param1, param2)));
+}
+
+// Test that A/exp(B) is simplified to A*exp(-B).
+TEST_F(AlgebraicSimplifierTest, DivOfExp) {
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32, "param1"));
+  HloInstruction* exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32, HloOpcode::kExp, param1));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, exp));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Divide(param0, op::Exp(param1)));
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Multiply(param0, op::Exp(op::Negate(param1))));
+}
+
+// Test that A/pow(B,C) is simplified to A*pow(B,-C).
+TEST_F(AlgebraicSimplifierTest, DivOfPower) {
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32, "param1"));
+  HloInstruction* param2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, r0f32, "param2"));
+  HloInstruction* power = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param1, param2));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, power));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Divide(param0, op::Power(param1, param2)));
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Multiply(param0, op::Power(param1, op::Negate(param2))));
+}
+
 // Test that A/1 is simplified to A for a scalar.
 TEST_F(AlgebraicSimplifierTest, DivOneScalar) {
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
@@ -145,7 +294,7 @@ TEST_F(AlgebraicSimplifierTest, DivOneScalar) {
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r0f32, "param0"));
   HloInstruction* one = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0f)));
   HloInstruction* div = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, one));
 
@@ -167,7 +316,7 @@ TEST_F(AlgebraicSimplifierTest, DivOneArray) {
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r2f32, "param0"));
   HloInstruction* one = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1.0, 1.0}, {1.0, 1.0}})));
+      Literal::CreateR2<float>({{1.0, 1.0}, {1.0, 1.0}})));
   HloInstruction* div = builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kDivide, param0, one));
 
@@ -239,6 +388,89 @@ TEST_F(AlgebraicSimplifierTest, ExpDiv) {
               op::Exp(op::Subtract(param0, param1)));
 }
 
+// Test that exp(A)*exp(B) is simplified to exp(A+B)
+TEST_F(AlgebraicSimplifierTest, ExpMul) {
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32, "param1"));
+  HloInstruction* exp0 = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32, HloOpcode::kExp, param0));
+  HloInstruction* exp1 = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32, HloOpcode::kExp, param1));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kMultiply, exp0, exp1));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Multiply(op::Exp(param0), op::Exp(param1)));
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Exp(op::Add(param0, param1)));
+}
+
+// Test that pow(exp(A), B) is simplified to exp(A*B)
+TEST_F(AlgebraicSimplifierTest, PowExp) {
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32, "param1"));
+  HloInstruction* exp0 = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32, HloOpcode::kExp, param0));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, exp0, param1));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Power(op::Exp(param0), param1));
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Exp(op::Multiply(param0, param1)));
+}
+
+// Test that ln(pow(A, B)) is simplified to ln(A)*B
+TEST_F(AlgebraicSimplifierTest, LnPow) {
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32, "param1"));
+  HloInstruction* pow = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, param1));
+  builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, pow));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Log(op::Power(param0, param1)));
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Multiply(op::Log(param0), param1));
+}
+
 // Test that ln(exp(A)) is simplified to A
 TEST_F(AlgebraicSimplifierTest, LnExp) {
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
@@ -300,7 +532,7 @@ TEST_F(AlgebraicSimplifierTest, Pow0Scalar) {
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r0f32, "param0"));
   HloInstruction* zero = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0)));
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, zero));
 
@@ -315,7 +547,7 @@ TEST_F(AlgebraicSimplifierTest, Pow0Scalar) {
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Constant());
-  EXPECT_EQ(LiteralUtil::GetFirstElement<float>(root->literal()), 1);
+  EXPECT_EQ(root->literal().GetFirstElement<float>(), 1);
 }
 
 // Test that pow(A, 0) where A is not a scalar is simplified to broadcast(1).
@@ -325,7 +557,7 @@ TEST_F(AlgebraicSimplifierTest, Pow0Vector) {
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r1f32, "param0"));
   HloInstruction* zero = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0)));
   builder.AddInstruction(
       HloInstruction::CreateBinary(r1f32, HloOpcode::kPower, param0, zero));
 
@@ -344,8 +576,7 @@ TEST_F(AlgebraicSimplifierTest, Pow0Vector) {
       << ShapeUtil::HumanString(root->shape());
   EXPECT_EQ(root->dimensions().size(), 0);
   EXPECT_TRUE(ShapeUtil::IsScalar(root->operand(0)->shape()));
-  EXPECT_EQ(LiteralUtil::GetFirstElement<float>(root->operand(0)->literal()),
-            1);
+  EXPECT_EQ(root->operand(0)->literal().GetFirstElement<float>(), 1);
 }
 
 // Test that pow(A, 1) is simplified to A.
@@ -355,7 +586,7 @@ TEST_F(AlgebraicSimplifierTest, Pow1) {
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r0f32, "param0"));
   HloInstruction* one = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1)));
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, one));
 
@@ -378,7 +609,7 @@ TEST_F(AlgebraicSimplifierTest, Pow2) {
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r0f32, "param0"));
   HloInstruction* two = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2)));
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, two));
 
@@ -401,7 +632,7 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) {
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r0f32, "param0"));
   HloInstruction* negative_one = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(-1)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(-1)));
   builder.AddInstruction(HloInstruction::CreateBinary(r0f32, HloOpcode::kPower,
                                                       param0, negative_one));
 
@@ -416,8 +647,7 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) {
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Divide(op::Constant(), param0));
-  EXPECT_EQ(LiteralUtil::GetFirstElement<float>(root->operand(0)->literal()),
-            1);
+  EXPECT_EQ(root->operand(0)->literal().GetFirstElement<float>(), 1);
 }
 
 TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) {
@@ -451,7 +681,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) {
 TEST_F(AlgebraicSimplifierTest, ConvertBetweenSameType) {
   HloComputation::Builder builder(TestName());
   HloInstruction* input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   builder.AddInstruction(
       HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input));
 
@@ -519,7 +749,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
   HloInstruction* param1 = builder.AddInstruction(
       HloInstruction::CreateParameter(1, r1f32, "param1"));
   HloInstruction* empty_literal = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({})));
+      HloInstruction::CreateConstant(Literal::CreateR1<float>({})));
   HloInstruction* empty_slice =
       builder.AddInstruction(HloInstruction::CreateSlice(
           ShapeUtil::MakeShape(F32, {0}), param1, {42}, {42}, {1}));
@@ -550,7 +780,7 @@ TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) {
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r1f32, "param0"));
   HloInstruction* empty_literal = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({})));
+      HloInstruction::CreateConstant(Literal::CreateR1<float>({})));
   HloInstruction* empty_slice =
       builder.AddInstruction(HloInstruction::CreateSlice(
           ShapeUtil::MakeShape(F32, {0}), param0, {42}, {42}, {1}));
@@ -735,7 +965,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAfterEffectiveUnary) {
       builder.AddInstruction(HloInstruction::CreateReshape(
           ShapeUtil::MakeShape(F32, {1, 2, 3, 4, 5}), param));
   HloInstruction* zero = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
   builder.AddInstruction(
       HloInstruction::CreateBinary(ShapeUtil::MakeShape(F32, {1, 2, 3, 4, 5}),
                                    HloOpcode::kMaximum, movable_reshape, zero));
@@ -753,6 +983,34 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAfterEffectiveUnary) {
               op::Reshape(op::Maximum(param, zero)));
 }
 
+// Regression test for a bug in the reshape sinking transformation, where
+// moving a reshape to a scalar led to a crash.
+TEST_F(AlgebraicSimplifierTest, ReshapeToScalarNotHoistedAfterEffectiveUnary) {
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {1, 1}), "param"));
+  HloInstruction* reshape = builder.AddInstruction(
+      HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {}), param));
+  HloInstruction* zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<float>({1., 2., 3.})));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(F32, {}), HloOpcode::kMaximum, reshape, zero));
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Maximum(op::Reshape(param), zero));
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 bitcasting_callback());
+
+  simplifier.Run(module.get()).ValueOrDie();
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Maximum(op::Reshape(param), zero));
+}
+
 TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
   HloComputation::Builder builder(TestName());
   HloInstruction* param =
@@ -1035,7 +1293,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) {
       builder.AddInstruction(HloInstruction::CreateParameter(
           0, ShapeUtil::MakeShape(F32, {2, 2}), "param"));
   HloInstruction* zero = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
   PaddingConfig no_padding;
   for (int i = 0; i < 2; ++i) {
     auto dimension = no_padding.add_dimensions();
@@ -1066,7 +1324,7 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) {
       builder.AddInstruction(HloInstruction::CreateParameter(
           0, ShapeUtil::MakeShape(F32, {10, 10}), "param"));
   HloInstruction* zero = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
   PaddingConfig padding;
   int64 low_padding[2] = {-1, -2};
   int64 high_padding[2] = {2, -3};
@@ -1134,7 +1392,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) {
           0, ShapeUtil::MakeShape(F32, {dim0, dim1}), "param"));
   builder.AddInstruction(HloInstruction::CreateSlice(
       ShapeUtil::MakeShape(F32, {dim0, dim1}), param, /*start_indices=*/{0, 0},
-      /*limit_indices=*/{dim0, dim1}, /*slices=*/{1, 1}));
+      /*limit_indices=*/{dim0, dim1}, /*strides=*/{1, 1}));
 
   HloModule module(TestName());
   HloComputation* computation = module.AddEntryComputation(builder.Build());
@@ -1376,9 +1634,9 @@ TEST_F(AlgebraicSimplifierTest, MaxMinToClamp) {
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r0f32, "param0"));
   HloInstruction* min_value = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
   HloInstruction* max_value = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0f)));
   HloInstruction* min = builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32, HloOpcode::kMinimum, param0, min_value));
   builder.AddInstruction(
@@ -1406,9 +1664,9 @@ TEST_F(AlgebraicSimplifierTest, MinMaxToClamp) {
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r0f32, "param0"));
   HloInstruction* min_value = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
   HloInstruction* max_value = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0f)));
   HloInstruction* max = builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32, HloOpcode::kMaximum, param0, max_value));
   builder.AddInstruction(
@@ -1437,9 +1695,9 @@ TEST_F(AlgebraicSimplifierTest, MinMaxWithBroadcastToClamp) {
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r1f32, "param0"));
   HloInstruction* min_value = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
   HloInstruction* max_value = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0f)));
   HloInstruction* max = builder.AddInstruction(HloInstruction::CreateBinary(
       r1f32, HloOpcode::kMaximum, param0, max_value));
   builder.AddInstruction(
@@ -1497,9 +1755,9 @@ TEST_F(AlgebraicSimplifierTest, MinEquationWithMaxNotToClamp) {
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, r0f32, "param0"));
   HloInstruction* min_value = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
   HloInstruction* max_value = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0f)));
   HloInstruction* max = builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32, HloOpcode::kMaximum, param0, max_value));
   HloInstruction* fmax = builder.AddInstruction(
@@ -1566,7 +1824,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
 TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
   HloComputation::Builder builder(TestName());
   HloInstruction* forty_two = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
 
   Shape broadcast_shape = ShapeUtil::MakeShape(F32, {4, 5, 6});
   HloInstruction* broadcast =
@@ -1614,7 +1872,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
   padding.mutable_dimensions(3)->set_edge_padding_high(2);
 
   HloInstruction* pad_value = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(5.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(5.0f)));
   HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad(
       ShapeUtil::MakeShape(F32, {1, 3, 3, 5}), operand, pad_value, padding));
 
@@ -1645,7 +1903,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
   const Shape reduce_window_shape =
       ShapeUtil::MakeShape(F32, {111, 113, 113, 115});
   HloInstruction* reduce_init_value = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(5.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(5.0f)));
   HloInstruction* reduce_window =
       builder.AddInstruction(HloInstruction::CreateReduceWindow(
           reduce_window_shape, pad, reduce_init_value, window,
@@ -1714,9 +1972,9 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) {
 
   HloComputation::Builder call_builder(TestName() + ".Call");
   HloInstruction* zero = call_builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({0.0f})));
+      HloInstruction::CreateConstant(Literal::CreateR1<float>({0.0f})));
   HloInstruction* one = call_builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({1.0f})));
+      HloInstruction::CreateConstant(Literal::CreateR1<float>({1.0f})));
   builder.AddInstruction(
       HloInstruction::CreateCall(r1f32, {zero, one}, dot_computation.get()));
 
@@ -1728,6 +1986,26 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) {
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 }
 
+// Test that a constant with tuple shape becomes a tuple of constants.
+TEST_F(AlgebraicSimplifierTest, ConstantTupleBecomesTupleOfConstants) {
+  HloComputation::Builder builder(TestName());
+  const float constant_scalar = 7.3f;
+  std::initializer_list<float> constant_vector = {1.1f, 2.0f, 3.3f};
+  std::unique_ptr<Literal> value =
+      Literal::MakeTuple({Literal::CreateR0<float>(constant_scalar).get(),
+                          Literal::CreateR1<float>(constant_vector).get()});
+  builder.AddInstruction(HloInstruction::CreateConstant(std::move(value)));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_THAT(computation->root_instruction(),
+              op::Tuple(op::Constant(), op::Constant()));
+}
+
 }  // namespace
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 66d54ad3802fe442decd11335eddf74bdd1cf950..9abe30e3f371cc294c36c1dcd743224b11b0c4f5 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/compiler/xla/legacy_flags/backend_flags.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -51,13 +50,6 @@ perftools::gputools::Platform* BackendOptions::platform() const {
   return platform_;
 }
 
-BackendOptions& BackendOptions::set_number_of_replicas(int number_of_replicas) {
-  number_of_replicas_ = number_of_replicas;
-  return *this;
-}
-
-int BackendOptions::number_of_replicas() const { return number_of_replicas_; }
-
 BackendOptions& BackendOptions::set_intra_op_parallelism_threads(
     int num_threads) {
   intra_op_parallelism_threads_ = num_threads;
@@ -85,20 +77,17 @@ struct Backend::EigenThreadPoolWrapper {
 
 /* static */ StatusOr<std::unique_ptr<Backend>> Backend::CreateBackend(
     const BackendOptions& options) {
-  int64 replica_count = options.number_of_replicas();
-  if (replica_count == -1) {
-    legacy_flags::BackendFlags* flags = legacy_flags::GetBackendFlags();
-    replica_count = flags->xla_replicas;
-  }
   perftools::gputools::Platform* platform = options.platform();
   TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform));
   TF_ASSIGN_OR_RETURN(auto stream_executors,
                       PlatformUtil::GetStreamExecutors(platform));
   TF_ASSIGN_OR_RETURN(auto transfer_manager,
                       TransferManager::GetForPlatform(platform));
+  TF_ASSIGN_OR_RETURN(auto computation_placer,
+                      ComputationPlacer::GetForPlatform(platform));
   std::unique_ptr<Backend> backend(
-      new Backend(replica_count, platform, compiler, stream_executors,
-                  transfer_manager, options.intra_op_parallelism_threads()));
+      new Backend(platform, compiler, stream_executors, transfer_manager,
+                  computation_placer, options.intra_op_parallelism_threads()));
   return std::move(backend);
 }
 
@@ -132,34 +121,25 @@ StatusOr<Backend::StreamPtr> Backend::BorrowStream(
 }
 
 Backend::Backend(
-    int64 replica_count, perftools::gputools::Platform* platform,
-    Compiler* compiler,
+    perftools::gputools::Platform* platform, Compiler* compiler,
     tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors,
-    TransferManager* transfer_manager, int intra_op_parallelism_threads)
+    TransferManager* transfer_manager, ComputationPlacer* computation_placer,
+    int intra_op_parallelism_threads)
     : platform_(platform),
       compiler_(compiler),
       transfer_manager_(transfer_manager),
-      replica_count_(replica_count) {
+      computation_placer_(computation_placer) {
   // The given set of stream executors set may include invalid executors.
   for (se::StreamExecutor* exec : stream_executors) {
     if (exec != nullptr) {
       stream_executors_.push_back(exec);
     }
   }
-  CHECK_GE(replica_count, 1) << "Must request at least 1 replica.";
-
   // Create a memory allocator for the valid stream executors.
   memory_allocator_ =
       MakeUnique<StreamExecutorMemoryAllocator>(platform, stream_executors);
-
-  // First check that there are some non-null stream executors to avoid issuing
-  // an error mentioning replicas in the common case of requesting just 1
-  // replica, which means no replication.
   CHECK(!stream_executors_.empty())
       << "Service found no devices for backend " << platform_->Name() << '.';
-  CHECK_GE(stream_executors_.size(), replica_count)
-      << "Requested more replicas than there are devices for backend "
-      << platform_->Name() << '.';
 
   if (platform->id() == se::host::kHostPlatformId) {
     inter_op_thread_pool_.reset(new tensorflow::thread::ThreadPool(
@@ -179,36 +159,6 @@ int Backend::default_device_ordinal() const {
   return default_stream_executor()->device_ordinal();
 }
 
-StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Backend::Replicas(
-    int device_ordinal) const {
-  if (stream_executors_[device_ordinal] == nullptr) {
-    return InvalidArgument("device %s not supported by XLA service",
-                           device_name(device_ordinal).c_str());
-  }
-
-  // Find replica_count_ stream executors starting from the given device
-  // ordinal.
-  std::vector<perftools::gputools::StreamExecutor*> replicas;
-  for (se::StreamExecutor* exec : stream_executors_) {
-    CHECK(exec != nullptr);
-    if (exec->device_ordinal() >= device_ordinal) {
-      replicas.push_back(exec);
-      if (replicas.size() >= replica_count_) {
-        return replicas;
-      }
-    }
-  }
-
-  return InvalidArgument(
-      "Not enough devices for replicas for the device ordinal %d",
-      device_ordinal);
-}
-
-std::vector<perftools::gputools::StreamExecutor*> Backend::Replicas() const {
-  CHECK_GE(stream_executors_.size(), replica_count_);
-  return Replicas(default_device_ordinal()).ValueOrDie();
-}
-
 tensorflow::thread::ThreadPool* Backend::inter_op_thread_pool() const {
   return inter_op_thread_pool_.get();
 }
diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h
index e0b15dc43f25244bc1a3e3c5cdc45877d4d11804..b5ca483b7274d20c31e932d748b6a4c9dea926f9 100644
--- a/tensorflow/compiler/xla/service/backend.h
+++ b/tensorflow/compiler/xla/service/backend.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/service/computation_placer.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/pool.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
@@ -46,12 +47,6 @@ class BackendOptions {
   BackendOptions& set_platform(perftools::gputools::Platform* platform);
   perftools::gputools::Platform* platform() const;
 
-  // Set the number of replicas to use when compiling replicated
-  // programs. The default is -1 meaning that the value is read from
-  // the xla_replicas flag.
-  BackendOptions& set_number_of_replicas(int number_of_replicas);
-  int number_of_replicas() const;
-
   // Sets the thread pool size for parallel execution of an individual operator.
   // The default value of -1 will result in initializing the thread pool with
   // the number of threads equal to the number of cores in the system.
@@ -60,7 +55,6 @@ class BackendOptions {
 
  private:
   perftools::gputools::Platform* platform_ = nullptr;
-  int number_of_replicas_ = -1;
   int intra_op_parallelism_threads_ = -1;
 };
 
@@ -74,8 +68,7 @@ class Backend {
  public:
   using StreamPtr = Pool<perftools::gputools::Stream>::SmartPtr;
 
-  // Creates a new backend for the given platform with the given number of
-  // replicas.
+  // Creates a new backend.
   static StatusOr<std::unique_ptr<Backend>> CreateBackend(
       const BackendOptions& options);
 
@@ -92,6 +85,7 @@ class Backend {
     return memory_allocator_.get();
   }
   TransferManager* transfer_manager() const { return transfer_manager_; }
+  ComputationPlacer* computation_placer() const { return computation_placer_; }
 
   // Returns the number of devices of the platform type which are visible. Not
   // all of these devices may be usable by XLA.
@@ -107,24 +101,13 @@ class Backend {
     return stream_executors_;
   }
 
-  // Returns the replicas for the default stream executor.
-  //
-  // When the number of replicas is R, the first R stream executors are assigned
-  // to the replicas of the default stream executor.
-  std::vector<perftools::gputools::StreamExecutor*> Replicas() const;
-
-  // Returns the replicas for the given device_ordinal. The given device ordinal
-  // is considered to be the first device ordinal among the replicas. Returns an
-  // error status if the stream executor for the given given device ordinal does
-  // not exist or if there are not enough stream executors for the replicas.
-  StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Replicas(
-      int device_ordinal) const;
-
-  // Return the stream executor for the given device ordinal.
+  // Returns the stream executor for the given device ordinal.
   StatusOr<perftools::gputools::StreamExecutor*> stream_executor(
       int device_ordinal) const;
 
-  // Return the stream executor for the default device ordinal.
+  // Returns the stream executor for the default device ordinal. This stream
+  // executor can only be used when the number of computations is 1 (replication
+  // can be > 1).
   perftools::gputools::StreamExecutor* default_stream_executor() const {
     CHECK(!stream_executors_.empty());
     return stream_executors_[0];
@@ -174,18 +157,19 @@ class Backend {
 
  private:
   struct EigenThreadPoolWrapper;
-  Backend(int64 replica_count, perftools::gputools::Platform* platform,
-          Compiler* compiler,
+  Backend(perftools::gputools::Platform* platform, Compiler* compiler,
           tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
               stream_executors,
-          TransferManager* transfer_manager, int intra_op_parallelism_threads);
+          TransferManager* transfer_manager,
+          ComputationPlacer* computation_placer,
+          int intra_op_parallelism_threads);
   Backend(const Backend&) = delete;
   Backend& operator=(const Backend&) = delete;
 
   perftools::gputools::Platform* platform_;
   Compiler* compiler_;
   TransferManager* transfer_manager_;
-  int64 replica_count_ = -1;
+  ComputationPlacer* computation_placer_;
 
   // Vector of stream executors. stream_executors_[0] is the default executor.
   std::vector<perftools::gputools::StreamExecutor*> stream_executors_;
diff --git a/tensorflow/compiler/xla/service/batchnorm_rewriter.cc b/tensorflow/compiler/xla/service/batchnorm_rewriter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ca2d413e11d3ad12bb3cac7695386c3089a21b1b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/batchnorm_rewriter.cc
@@ -0,0 +1,286 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/batchnorm_rewriter.h"
+
+#include <algorithm>
+#include <memory>
+#include <numeric>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_query.h"
+#include "tensorflow/compiler/xla/service/shape_inference.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/window_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+// BatchNormRewriterVisitor traverses the HLO computation and rewrites BatchNorm
+// operations into smaller operations.
+class BatchNormRewriterVisitor : public DfsHloVisitorWithDefault {
+ public:
+  // Default visitor action is to do nothing and return OK.
+  Status DefaultAction(HloInstruction* /*hlo_instruction*/) override {
+    return Status::OK();
+  }
+
+  Status HandleBatchNormTraining(HloInstruction* batch_norm) override;
+
+  // Runs the visitor on a computation.
+  static bool Run(HloComputation* computation, bool rewrite_training_op,
+                  bool rewrite_grad_op);
+
+  // Returns whether any batch norm ops were rewritten.
+  const bool changed() const { return changed_; }
+
+  ~BatchNormRewriterVisitor() override = default;
+
+ private:
+  explicit BatchNormRewriterVisitor(HloComputation* computation,
+                                    bool rewrite_training_op,
+                                    bool rewrite_grad_op)
+      : computation_(computation),
+        rewrite_training_op_(rewrite_training_op),
+        rewrite_grad_op_(rewrite_grad_op) {}
+
+  HloComputation* GetScalarBinaryComputation(PrimitiveType primitive_type,
+                                             HloOpcode opcode) {
+    HloComputation::Builder b("scalar computation");
+    auto scalar_lhs = b.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeShape(F32, {}), "scalar lhs"));
+    auto scalar_rhs = b.AddInstruction(HloInstruction::CreateParameter(
+        1, ShapeUtil::MakeShape(F32, {}), "scalar rhs"));
+    auto scalar_op = b.AddInstruction(
+        HloInstruction::CreateBinary(ShapeUtil::MakeShape(primitive_type, {}),
+                                     opcode, scalar_lhs, scalar_rhs));
+    return computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op));
+  }
+
+  // Current HloComputation instance the BatchNormRewriter is
+  // traversing.
+  HloComputation* computation_;
+
+  bool rewrite_training_op_;
+  bool rewrite_grad_op_;
+
+  // Whether rewrite has occurred.
+  bool changed_ = false;
+
+  // Replaces the existing HLO instruction old_instruction, with
+  // new_instruction, and marks the optimizer status as changed.
+  // Returns the Status representing the result of the replace operation.
+  Status ReplaceWithNewInstruction(
+      HloInstruction* old_instruction,
+      std::unique_ptr<HloInstruction> new_instruction) {
+    TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
+        old_instruction, std::move(new_instruction)));
+    changed_ = true;
+    return Status::OK();
+  }
+
+  // Replaces the existing HLO instruction old_instruction, with
+  // new_instruction, and marks the optimizer status as changed.
+  // Returns the Status representing the result of the replace operation.
+  Status ReplaceInstruction(HloInstruction* old_instruction,
+                            HloInstruction* new_instruction) {
+    TF_RETURN_IF_ERROR(
+        computation_->ReplaceInstruction(old_instruction, new_instruction));
+    changed_ = true;
+    return Status::OK();
+  }
+};
+
+bool BatchNormRewriterVisitor::Run(HloComputation* computation,
+                                   bool rewrite_training_op,
+                                   bool rewrite_grad_op) {
+  BatchNormRewriterVisitor visitor(computation,
+                                   /*rewrite_training_op=*/rewrite_training_op,
+                                   /*rewrite_grad_op=*/rewrite_grad_op);
+  TF_CHECK_OK(computation->Accept(&visitor));
+  return visitor.changed_;
+}
+
+Status BatchNormRewriterVisitor::HandleBatchNormTraining(
+    HloInstruction* batch_norm) {
+  if (!rewrite_training_op_) {
+    return Status::OK();
+  }
+  // Expand batch norm training into smaller HLO ops.
+  HloInstruction* operand = batch_norm->mutable_operand(0);
+  const Shape operand_shape = operand->shape();
+  int64 feature_index = batch_norm->feature_index();
+  const int64 feature_count = operand_shape.dimensions(feature_index);
+  const int64 size_in_elements = ShapeUtil::ElementsIn(operand_shape);
+  auto elements_per_feature =
+      computation_->AddInstruction(HloInstruction::CreateConstant(
+          Literal::CreateR0<float>(size_in_elements / feature_count)));
+
+  HloInstruction* scale = batch_norm->mutable_operand(1);
+  HloInstruction* offset = batch_norm->mutable_operand(2);
+  const Shape feature_shape = scale->shape();
+
+  auto zero = computation_->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
+
+  auto epsilon = computation_->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0(batch_norm->epsilon())));
+
+  std::vector<int64> dimensions_without_feature;
+
+  for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) {
+    if (i != feature_index) {
+      dimensions_without_feature.push_back(i);
+    }
+  }
+
+  auto scale_broadcasted = computation_->AddInstruction(
+      HloInstruction::CreateBroadcast(operand_shape, scale, {feature_index}));
+
+  auto offset_broadcasted = computation_->AddInstruction(
+      HloInstruction::CreateBroadcast(operand_shape, offset, {feature_index}));
+
+  HloComputation* add_reduce_computation =
+      GetScalarBinaryComputation(F32, HloOpcode::kAdd);
+
+  // X^2.
+  auto operand_squared =
+      computation_->AddInstruction(HloInstruction::CreateBinary(
+          operand_shape, HloOpcode::kMultiply, operand, operand));
+  // Sum[X].
+  auto sum = computation_->AddInstruction(HloInstruction::CreateReduce(
+      feature_shape, operand, zero, dimensions_without_feature,
+      add_reduce_computation));
+
+  // Sum[X^2].
+  auto squared_sum = computation_->AddInstruction(HloInstruction::CreateReduce(
+      feature_shape, operand_squared, zero, dimensions_without_feature,
+      add_reduce_computation));
+
+  // Fuse two parallel reduces together to improve performance.
+  auto tuple = computation_->AddInstruction(
+      HloInstruction::CreateTuple({sum, squared_sum}));
+
+  auto fused = computation_->CreateFusionInstruction(
+      {tuple, sum, squared_sum, operand_squared},
+      HloInstruction::FusionKind::kInput);
+
+  sum = computation_->AddInstruction(
+      HloInstruction::CreateGetTupleElement(feature_shape, fused, 0));
+
+  squared_sum = computation_->AddInstruction(
+      HloInstruction::CreateGetTupleElement(feature_shape, fused, 1));
+
+  // E[X].
+  auto mean = computation_->AddInstruction(HloInstruction::CreateBinary(
+      feature_shape, HloOpcode::kDivide, sum, elements_per_feature));
+
+  auto mean_broadcasted = computation_->AddInstruction(
+      HloInstruction::CreateBroadcast(operand_shape, mean, {feature_index}));
+
+  // E[X^2].
+  auto square_mean = computation_->AddInstruction(HloInstruction::CreateBinary(
+      feature_shape, HloOpcode::kDivide, squared_sum, elements_per_feature));
+
+  // E^2[X].
+  auto mean_square = computation_->AddInstruction(HloInstruction::CreateBinary(
+      feature_shape, HloOpcode::kMultiply, mean, mean));
+
+  // Var[X].
+  auto var = computation_->AddInstruction(HloInstruction::CreateBinary(
+      feature_shape, HloOpcode::kSubtract, square_mean, mean_square));
+
+  auto var_broadcasted = computation_->AddInstruction(
+      HloInstruction::CreateBroadcast(operand_shape, var, {feature_index}));
+
+  // Var[X] + epsilon.
+  auto var_add_epsilon =
+      computation_->AddInstruction(HloInstruction::CreateBinary(
+          operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon));
+
+  auto neg_half = computation_->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0(-0.5f)));
+
+  // 1 / Sqrt[Var[X] + epsilon].
+  auto rsqrt_var_add_epsilon =
+      computation_->AddInstruction(HloInstruction::CreateBinary(
+          operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half));
+
+  // X - E[X].
+  auto operand_minus_mean =
+      computation_->AddInstruction(HloInstruction::CreateBinary(
+          operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted));
+
+  // (X - E[X]) / Sqrt[Var[X] + epsilon].
+  auto normalized = computation_->AddInstruction(
+      HloInstruction::CreateBinary(operand_shape, HloOpcode::kMultiply,
+                                   operand_minus_mean, rsqrt_var_add_epsilon));
+
+  // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale.
+  auto scaled_normalized =
+      computation_->AddInstruction(HloInstruction::CreateBinary(
+          operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted));
+
+  // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale + offset.
+  auto shifted_normalized = computation_->AddInstruction(
+      HloInstruction::CreateBinary(operand_shape, HloOpcode::kAdd,
+                                   scaled_normalized, offset_broadcasted));
+
+  TF_CHECK_OK(ReplaceWithNewInstruction(
+      batch_norm,
+      HloInstruction::CreateTuple({shifted_normalized, mean, var})));
+  return Status::OK();
+}
+
+StatusOr<bool> BatchNormRewriter::Run(HloModule* module) {
+  XLA_VLOG_LINES(2, "BatchNormRewriter::Run(), before:\n" + module->ToString());
+  bool changed = false;
+  // Make a copy of the computations because we may add computations to the
+  // module, invalidating iteration.
+  std::vector<HloComputation*> computations;
+  for (auto& comp : module->computations()) {
+    if (comp->IsFusionComputation()) {
+      continue;
+    }
+    computations.push_back(comp.get());
+  }
+  for (auto& comp : computations) {
+    if (BatchNormRewriterVisitor::Run(comp, rewrite_training_op_,
+                                      rewrite_grad_op_)) {
+      changed = true;
+    }
+  }
+  XLA_VLOG_LINES(2, "BatchNormRewriter::Run(), after:\n" + module->ToString());
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/batchnorm_rewriter.h b/tensorflow/compiler/xla/service/batchnorm_rewriter.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d176f4849a786e8650013c430527959bdd004a4
--- /dev/null
+++ b/tensorflow/compiler/xla/service/batchnorm_rewriter.h
@@ -0,0 +1,49 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_BATCHNORM_REWRITER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_BATCHNORM_REWRITER_H_
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// A pass which rewrites batch norm operations into more operations. Breaking a
+// big operation into smaller operations helps leverage our generic fusion
+// logic.
+class BatchNormRewriter : public HloPassInterface {
+ public:
+  BatchNormRewriter(bool rewrite_training_op = false,
+                    bool rewrite_grad_op = false)
+      : rewrite_training_op_(rewrite_training_op),
+        rewrite_grad_op_(rewrite_grad_op) {}
+  ~BatchNormRewriter() = default;
+  tensorflow::StringPiece name() const override { return "batchnorm_rewriter"; }
+
+  // Run operation expander on the given computation. Returns whether the
+  // computation was changed.
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  bool rewrite_training_op_;
+  bool rewrite_grad_op_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_BATCHNORM_REWRITER_H_
diff --git a/tensorflow/compiler/xla/service/batchnorm_rewriter_test.cc b/tensorflow/compiler/xla/service/batchnorm_rewriter_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..301f31b51ceb9700b71a86ceddf0065dee93b121
--- /dev/null
+++ b/tensorflow/compiler/xla/service/batchnorm_rewriter_test.cc
@@ -0,0 +1,78 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/batchnorm_rewriter.h"
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace xla {
+namespace {
+
+using BatchNormRewriterTest = HloTestBase;
+
+// Test that we expand BatchNormTraining.
+TEST_F(BatchNormRewriterTest, BatchNormTraining) {
+  Shape input_shape = ShapeUtil::MakeShape(F32, {2, 2, 2, 2});
+  Shape scale_shape = ShapeUtil::MakeShape(F32, {2});
+  Shape offset_shape = ShapeUtil::MakeShape(F32, {2});
+
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "activiation"));
+
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scale_shape, "scale"));
+
+  HloInstruction* param2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, offset_shape, "offset"));
+
+  builder.AddInstruction(HloInstruction::CreateBatchNormTraining(
+      ShapeUtil::MakeTupleShape({input_shape, scale_shape, offset_shape}),
+      param0, param1, param2,
+      /*epsilon=*/0.001, /*feature_index=*/3));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kBatchNormTraining);
+  BatchNormRewriter rewriter(/*rewrite_training_op=*/true);
+  ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie());
+  root = computation->root_instruction();
+  // Make sure this operation is expanded.
+  EXPECT_EQ(root->opcode(), HloOpcode::kTuple);
+}
+
+}  // namespace
+}  // namespace xla
+
+int main(int argc, char** argv) {
+  return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv);
+}
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index f91eb0207a23fe55394d59ed99a0d08cf16aa285..ae31135a1aeb2807649aceb6e77d6050525ce5a6 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -22,12 +22,12 @@ limitations under the License.
 #include <ostream>
 #include <utility>
 
-#include "tensorflow/compiler/xla/legacy_flags/buffer_assignment_flags.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/heap_simulator.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_scheduling.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -66,6 +66,7 @@ BufferAllocation::Slice BufferAllocation::GetSlice(
 
 void BufferAllocation::AddAssignment(const LogicalBuffer& buffer, int64 offset,
                                      int64 size) {
+  VLOG(4) << "Trying to add " << buffer << " to " << this;
   CHECK(assigned_buffers_.count(&buffer) == 0)
       << "LogicalBuffer " << buffer << " already assigned to allocation "
       << index_;
@@ -212,10 +213,14 @@ bool BufferAssignment::HasTopLevelAllocation(
 
 StatusOr<BufferAllocation::Slice> BufferAssignment::GetUniqueSlice(
     const HloInstruction* instruction, const ShapeIndex& index) const {
+  VLOG(3) << "Trying to find unique slice for " << instruction->name() << " ["
+          << index << "]";
   BufferAllocation::Slice result;
   for (const LogicalBuffer* buffer :
        GetPointsToSet(instruction).element(index)) {
+    VLOG(3) << "Examining buffer " << *buffer;
     if (HasAllocation(*buffer)) {
+      VLOG(3) << "Has allocation";
       const BufferAllocation::Slice slice =
           GetAssignedAllocation(*buffer).GetSlice(*buffer);
       if (result.allocation() == nullptr) {
@@ -226,6 +231,8 @@ StatusOr<BufferAllocation::Slice> BufferAssignment::GetUniqueSlice(
             "be determined at compile-time.",
             instruction->name().c_str(), index.ToString().c_str());
       }
+    } else {
+      VLOG(3) << "No allocation";
     }
   }
   if (result.allocation() == nullptr) {
@@ -320,8 +327,9 @@ void BufferAssignment::CombineTempAllocations() {
       // Each temp allocation is placed end-to-end, accounting for alignment.
       // The offset of each buffer in the combined allocation is computed from
       // the base offset of the allocation.
+      int64 alignment = color_alignment_(color);
       const int64 base =
-          RoundUpToNearest(combined_allocation->size(), alignment_);
+          RoundUpToNearest(combined_allocation->size(), alignment);
       combined_allocation->set_size(base + temp_allocation.size());
       for (const auto& buffer_offset_size : temp_allocation.assigned_buffers_) {
         const LogicalBuffer* buffer = buffer_offset_size.first;
@@ -575,12 +583,13 @@ Status GatherComputationsByAllocationType(
 /* static */
 StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::Run(
     const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
-    LogicalBuffer::SizeFunction buffer_size, int64 alignment,
-    bool allow_input_output_aliasing, TuplePointsToAnalysis::Colorer colorer) {
-  BufferAssigner assigner(alignment, allow_input_output_aliasing,
-                          std::move(colorer));
+    LogicalBuffer::SizeFunction buffer_size,
+    LogicalBuffer::AlignmentFunction color_alignment,
+    bool allow_input_output_aliasing, BufferLiveness::Colorer colorer) {
+  BufferAssigner assigner(allow_input_output_aliasing, std::move(colorer));
   return assigner.CreateAssignment(module, std::move(hlo_ordering),
-                                   std::move(buffer_size));
+                                   std::move(buffer_size),
+                                   std::move(color_alignment));
 }
 
 bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
@@ -662,7 +671,8 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
 }
 
 Status BufferAssigner::AssignBuffersForComputation(
-    const HloComputation* computation, bool is_thread_local,
+    const HloComputation* computation, const DebugOptions& debug_options,
+    bool is_thread_local,
     const FlatSet<const LogicalBuffer*>& colocated_buffers,
     const FlatSet<BufferAllocation::Index>& colocated_allocations,
     FlatMap<const HloComputation*, FlatSet<const LogicalBuffer*>>*
@@ -786,10 +796,7 @@ Status BufferAssigner::AssignBuffersForComputation(
       continue;
     }
 
-    legacy_flags::BufferAssignmentFlags* flags =
-        legacy_flags::GetBufferAssignmentFlags();
-    if (!flags->xla_enable_buffer_reuse || is_thread_local ||
-        instruction->opcode() == HloOpcode::kCustomCall) {
+    if (is_thread_local || instruction->opcode() == HloOpcode::kCustomCall) {
       // Custom call operations never have reusable buffers. Also we do not
       // reuse thread-local buffers for now, because they are dynamically
       // allocated and their lifetimes are hard to compute.
@@ -938,11 +945,13 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
     }
     auto color_map = SplitBuffersByColor(all_buffers_to_assign);
     for (auto& single_colored_set : color_map) {
-      VLOG(2) << "Simulating heap for color " << single_colored_set.first;
+      auto color = single_colored_set.first;
+      VLOG(2) << "Simulating heap for color " << color;
+      int64 alignment = assignment->color_alignment_(color);
       TF_ASSIGN_OR_RETURN(
           const HeapSimulator::Result result,
           HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
-                                 MakeUnique<LazyBestFitHeap>(alignment_)),
+                                 MakeUnique<LazyBestFitHeap>(alignment)),
                              assignment->module(), module_sequence,
                              assignment->points_to_analysis(),
                              assignment->buffer_size_,
@@ -963,11 +972,13 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
       CHECK(instruction_sequence != nullptr) << computation->name();
       auto color_map = SplitBuffersByColor(buffers_to_assign);
       for (auto& single_colored_set : color_map) {
-        VLOG(2) << "Simulating heap for color " << single_colored_set.first;
+        auto color = single_colored_set.first;
+        VLOG(2) << "Simulating heap for color " << color;
+        int64 alignment = assignment->color_alignment_(color);
         TF_ASSIGN_OR_RETURN(
             const HeapSimulator::Result result,
             HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
-                                   MakeUnique<LazyBestFitHeap>(alignment_)),
+                                   MakeUnique<LazyBestFitHeap>(alignment)),
                                *computation, *instruction_sequence,
                                assignment->points_to_analysis(),
                                assignment->buffer_size_,
@@ -1074,7 +1085,8 @@ void BufferAssigner::AddSetToColocatedBufferSets(
 // different while instructions.
 void BufferAssigner::AddWhileSetToColocatedBufferSets(
     const std::vector<const LogicalBuffer*>& colocated_set,
-    const LogicalBuffer* while_init_buffer, const HloInstruction* while_hlo,
+    const LogicalBuffer* while_init_buffer,
+    const LogicalBuffer* while_result_buffer, const HloInstruction* while_hlo,
     const HloComputation& computation, const BufferLiveness& buffer_liveness,
     const LogicalBuffer::SizeFunction& buffer_size,
     std::vector<ColocatedBufferSet>* colocated_buffer_sets) {
@@ -1137,16 +1149,30 @@ void BufferAssigner::AddWhileSetToColocatedBufferSets(
       continue;
     }
 
-    // Skip predecessor set if the live range of any predecessor buffers
-    // overlaps with 'while_init_buffer'. Note that tuple element buffer
-    // forwarding can cause the same buffer to appear on both sides of the
-    // interference comparison below.
-    if (std::any_of(
-            predecessor_while_buffers.begin(), predecessor_while_buffers.end(),
-            [while_init_buffer, &buffer_liveness](const LogicalBuffer* buffer) {
-              return while_init_buffer->id() != buffer->id() &&
-                     buffer_liveness.MayInterfere(*while_init_buffer, *buffer);
-            })) {
+    // Skip predecessor set if the live range of any predecessor
+    // buffers overlaps with 'while_init_buffer' or
+    // 'while_result_buffer' (we need to check both since they're
+    // aliased together, but the points-to analysis is unaware of this
+    // aliasing). Note that tuple element buffer forwarding can cause
+    // the same buffer to appear on both sides of the interference
+    // comparison below.
+    auto may_interfere_with_init_or_result = [&](const LogicalBuffer* buffer) {
+      if (while_init_buffer->id() != buffer->id() &&
+          buffer_liveness.MayInterfere(*while_init_buffer, *buffer)) {
+        return true;
+      }
+
+      if (while_result_buffer->id() != buffer->id() &&
+          buffer_liveness.MayInterfere(*while_result_buffer, *buffer)) {
+        return true;
+      }
+
+      return false;
+    };
+
+    if (std::any_of(predecessor_while_buffers.begin(),
+                    predecessor_while_buffers.end(),
+                    may_interfere_with_init_or_result)) {
       continue;
     }
 
@@ -1193,6 +1219,9 @@ void BufferAssigner::BuildColocatedBufferSets(
   const TuplePointsToAnalysis& points_to_analysis =
       buffer_liveness.points_to_analysis();
   for (const HloComputation* computation : module->MakeComputationPostOrder()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
     for (const HloInstruction* instruction :
          computation->MakeInstructionPostOrder()) {
       const HloOpcode opcode = instruction->opcode();
@@ -1209,8 +1238,8 @@ void BufferAssigner::BuildColocatedBufferSets(
                   AddBufferToColocatedSet(while_hlo->operand(0), index,
                                           points_to_analysis, &colocated_set);
               // Add while.result.
-              AddBufferToColocatedSet(while_hlo, index, points_to_analysis,
-                                      &colocated_set);
+              auto* result_buffer = AddBufferToColocatedSet(
+                  while_hlo, index, points_to_analysis, &colocated_set);
               // Add while.cond.parameter.
               AddBufferToColocatedSet(
                   while_hlo->while_condition()->parameter_instruction(0), index,
@@ -1224,8 +1253,9 @@ void BufferAssigner::BuildColocatedBufferSets(
                   while_hlo->while_body()->root_instruction(), index,
                   points_to_analysis, &colocated_set);
               AddWhileSetToColocatedBufferSets(
-                  colocated_set, init_buffer, while_hlo, *computation,
-                  buffer_liveness, buffer_size, colocated_buffer_sets);
+                  colocated_set, init_buffer, result_buffer, while_hlo,
+                  *computation, buffer_liveness, buffer_size,
+                  colocated_buffer_sets);
             });
       } else if (opcode == HloOpcode::kCall) {
         const HloInstruction* call_hlo = instruction;
@@ -1300,10 +1330,10 @@ void BufferAssigner::AssignColocatedBufferSets(
 
 StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
     const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
-    LogicalBuffer::SizeFunction buffer_size) {
+    LogicalBuffer::SizeFunction buffer_size,
+    LogicalBuffer::AlignmentFunction color_alignment) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<BufferLiveness> liveness,
-                      BufferLiveness::Run(module, std::move(hlo_ordering),
-                                          std::move(colorer_)));
+                      BufferLiveness::Run(module, std::move(hlo_ordering)));
 
   VLOG(1) << "Assigning buffers to module " << module->name();
   XLA_VLOG_LINES(2, module->ToString());
@@ -1311,8 +1341,9 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
   XLA_VLOG_LINES(3, liveness->points_to_analysis().ToString());
 
   // Can't use MakeUnique because BufferAssignment constructor is private.
-  std::unique_ptr<BufferAssignment> assignment(new BufferAssignment(
-      module, std::move(liveness), alignment_, std::move(buffer_size)));
+  std::unique_ptr<BufferAssignment> assignment(
+      new BufferAssignment(module, std::move(liveness), std::move(buffer_size),
+                           std::move(color_alignment)));
 
   // Assign buffers with the tightest constraints first (colocated buffer sets).
   // Once b/32491382 enables module-level liveness analysis, we may be able
@@ -1323,6 +1354,10 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
   std::vector<ColocatedBufferSet> colocated_buffer_sets;
   BuildColocatedBufferSets(module, assignment->liveness(),
                            assignment->buffer_size_, &colocated_buffer_sets);
+  TF_RETURN_IF_ERROR(colorer_(assignment->liveness()));
+  VLOG(3) << "After coloring:";
+  XLA_VLOG_LINES(3, assignment->points_to_analysis().ToString());
+
   AssignColocatedBufferSets(colocated_buffer_sets, assignment.get(),
                             &colocated_buffers, &colocated_allocations);
 
@@ -1337,9 +1372,9 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
       buffers_to_assign_sequentially;
   for (auto* computation : global_computations) {
     TF_RETURN_IF_ERROR(AssignBuffersForComputation(
-        computation, /*is_thread_local=*/false, colocated_buffers,
-        colocated_allocations, &buffers_to_assign_sequentially,
-        assignment.get()));
+        computation, module->config().debug_options(),
+        /*is_thread_local=*/false, colocated_buffers, colocated_allocations,
+        &buffers_to_assign_sequentially, assignment.get()));
   }
   // Assign buffers with sequential ordering, if any. If all global computations
   // are sequential, we can run heap simuation on the whole module, which
@@ -1354,10 +1389,13 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
   // their own BufferAllocation.
   for (auto* computation : thread_local_computations) {
     TF_RET_CHECK(computation != module->entry_computation());
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
     TF_RETURN_IF_ERROR(AssignBuffersForComputation(
-        computation, /*is_thread_local=*/true, colocated_buffers,
-        colocated_allocations, /*buffers_to_assign_sequentially=*/nullptr,
-        assignment.get()));
+        computation, module->config().debug_options(),
+        /*is_thread_local=*/true, colocated_buffers, colocated_allocations,
+        /*buffers_to_assign_sequentially=*/nullptr, assignment.get()));
   }
 
   // Mark all buffers which may be live out of the entry computation as
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index b3933f11c1e6ae3e7ffcc990442183338788caf4..35c904df130564a4848d3cb2db21ed8fa209e7e8 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -351,12 +351,12 @@ class BufferAssignment {
 
   explicit BufferAssignment(const HloModule* module,
                             std::unique_ptr<BufferLiveness> liveness,
-                            int64 alignment,
-                            LogicalBuffer::SizeFunction buffer_size)
+                            LogicalBuffer::SizeFunction buffer_size,
+                            LogicalBuffer::AlignmentFunction color_alignment)
       : module_(module),
         liveness_(std::move(liveness)),
-        alignment_(alignment),
-        buffer_size_(std::move(buffer_size)) {}
+        buffer_size_(std::move(buffer_size)),
+        color_alignment_(std::move(color_alignment)) {}
 
   // Creates and returns a new BufferAllocation, with no assigned
   // LogicalBuffers. Ownership is maintained internally.
@@ -402,11 +402,13 @@ class BufferAssignment {
 
   const HloModule* module_;
   const std::unique_ptr<BufferLiveness> liveness_;
-  const int64 alignment_;
 
   // Function which returns the buffer size for a given logical buffer (shape).
   LogicalBuffer::SizeFunction buffer_size_;
 
+  // Function which returns the alignment for a given logical buffer color.
+  LogicalBuffer::AlignmentFunction color_alignment_;
+
   Stats stats_;
   std::vector<HeapSimulatorTrace> heap_simulator_traces_;
 
@@ -417,36 +419,37 @@ class BufferAssignment {
 class BufferAssigner {
  public:
   // Build and return a BufferAssignment for the given module. The given
-  // HloOrdering is used to determine buffer liveness. buffer_size is a function
-  // which returns the size of a LogicalBuffer. Alignment is the minimum
-  // alignment of any buffer. allow_input_output_aliasing specifies whether
-  // input buffer are allowed to be reused as outbut buffers by the client code.
+  // HloOrdering is used to determine buffer liveness. buffer_size and
+  // color_alignment are functions which returns the size and alignment of a
+  // LogicalBuffer.  allow_input_output_aliasing specifies whether input buffer
+  // are allowed to be reused as outbut buffers by the client code.
   static StatusOr<std::unique_ptr<BufferAssignment>> Run(
       const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
-      LogicalBuffer::SizeFunction buffer_size, int64 alignment,
+      LogicalBuffer::SizeFunction buffer_size,
+      LogicalBuffer::AlignmentFunction color_alignment,
       bool allow_input_output_aliasing = false,
-      TuplePointsToAnalysis::Colorer colorer =
-          TuplePointsToAnalysis::DefaultColorer());
+      BufferLiveness::Colorer colorer = BufferLiveness::DefaultColorer());
 
  private:
-  BufferAssigner(int64 alignment, bool allow_input_output_aliasing,
-                 TuplePointsToAnalysis::Colorer colorer)
-      : alignment_(alignment),
-        allow_input_output_aliasing_(allow_input_output_aliasing),
+  BufferAssigner(bool allow_input_output_aliasing,
+                 BufferLiveness::Colorer colorer)
+      : allow_input_output_aliasing_(allow_input_output_aliasing),
         colorer_(colorer) {}
   virtual ~BufferAssigner() = default;
 
   // Create a buffer assignment.
   StatusOr<std::unique_ptr<BufferAssignment>> CreateAssignment(
       const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
-      LogicalBuffer::SizeFunction buffer_size);
+      LogicalBuffer::SizeFunction buffer_size,
+      LogicalBuffer::AlignmentFunction color_alignment);
 
   // Assigns buffers to the instructions in the given computation. "assignment"
   // is modified to reflect the new buffer assignments. If is_thread_local is
   // true, then all assigned buffers have the is_thread_local flag set to
   // true.
   Status AssignBuffersForComputation(
-      const HloComputation* computation, bool is_thread_local,
+      const HloComputation* computation, const DebugOptions& debug_options,
+      bool is_thread_local,
       const tensorflow::gtl::FlatSet<const LogicalBuffer*>& colocated_buffers,
       const tensorflow::gtl::FlatSet<BufferAllocation::Index>&
           colocated_allocations,
@@ -511,7 +514,8 @@ class BufferAssigner {
   // colocated buffers for while instructions.
   void AddWhileSetToColocatedBufferSets(
       const std::vector<const LogicalBuffer*>& colocated_set,
-      const LogicalBuffer* while_init_buffer, const HloInstruction* while_hlo,
+      const LogicalBuffer* while_init_buffer,
+      const LogicalBuffer* while_result_buffer, const HloInstruction* while_hlo,
       const HloComputation& computation, const BufferLiveness& buffer_liveness,
       const LogicalBuffer::SizeFunction& buffer_size,
       std::vector<ColocatedBufferSet>* colocated_buffer_sets);
@@ -524,15 +528,12 @@ class BufferAssigner {
   SplitBuffersByColor(
       const tensorflow::gtl::FlatSet<const LogicalBuffer*>& buffers);
 
-  // Minimum alignment of any buffer.
-  int64 alignment_;
-
   // If true, buffer assignments assumes that input parameter buffers and output
   // buffers can be shared if their sizes match.
   bool allow_input_output_aliasing_;
 
   // Functor used to assign colors to newly allocated logical buffers.
-  TuplePointsToAnalysis::Colorer colorer_;
+  BufferLiveness::Colorer colorer_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(BufferAssigner);
 };
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index 892f67a8812823a6f156dc6098bf6b39fa800d3c..18acd4f3ae47882bf629c090c510db92049e215a 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
+#include "tensorflow/compiler/xla/service/hlo_scheduling.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
@@ -85,17 +86,18 @@ class BufferAssignmentTest : public HloTestBase {
                                                         int64 alignment = 1) {
     return BufferAssigner::Run(
                module, MakeUnique<DependencyHloOrdering>(module),
-               backend_->compiler()->BufferSizeBytesFunction(), alignment)
+               backend_->compiler()->BufferSizeBytesFunction(),
+               [alignment](LogicalBuffer::Color) { return alignment; })
         .ConsumeValueOrDie();
   }
 
   std::unique_ptr<BufferAssignment> RunColoredBufferAssignment(
-      HloModule* module, TuplePointsToAnalysis::Colorer colorer,
-      int64 alignment = 1) {
-    return BufferAssigner::Run(module,
-                               MakeUnique<DependencyHloOrdering>(module),
-                               backend_->compiler()->BufferSizeBytesFunction(),
-                               alignment, false, std::move(colorer))
+      HloModule* module, BufferLiveness::Colorer colorer, int64 alignment = 1) {
+    return BufferAssigner::Run(
+               module, MakeUnique<DependencyHloOrdering>(module),
+               backend_->compiler()->BufferSizeBytesFunction(),
+               [alignment](LogicalBuffer::Color) { return alignment; }, false,
+               std::move(colorer))
         .ConsumeValueOrDie();
   }
 
@@ -105,7 +107,7 @@ class BufferAssignmentTest : public HloTestBase {
     auto param =
         builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "x"));
     auto value = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+        HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
     builder.AddInstruction(
         HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, param, value));
     return builder.Build();
@@ -122,7 +124,7 @@ class BufferAssignmentTest : public HloTestBase {
       const string& name) {
     auto builder = HloComputation::Builder(name);
     auto const4 = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int>(4)));
+        HloInstruction::CreateConstant(Literal::CreateR0<int>(4)));
     auto param = builder.AddInstruction(
         HloInstruction::CreateParameter(0, t_s32_f32v4_, "x"));
     auto index = builder.AddInstruction(
@@ -147,9 +149,9 @@ class BufferAssignmentTest : public HloTestBase {
       const string& name) {
     auto builder = HloComputation::Builder(name);
     auto const1 = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int>(1)));
+        HloInstruction::CreateConstant(Literal::CreateR0<int>(1)));
     auto constv = builder.AddInstruction(HloInstruction::CreateConstant(
-        LiteralUtil::CreateR1<float>({1.1f, 2.2f, 3.3f, 4.4f})));
+        Literal::CreateR1<float>({1.1f, 2.2f, 3.3f, 4.4f})));
     auto param = builder.AddInstruction(
         HloInstruction::CreateParameter(0, t_s32_f32v4_, "x"));
     auto indexc = builder.AddInstruction(
@@ -264,7 +266,7 @@ static bool BuffersDistinct(const std::vector<const HloInstruction*>& a,
 TEST_F(BufferAssignmentTest, ScalarConstant) {
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
@@ -278,9 +280,9 @@ TEST_F(BufferAssignmentTest, BufferForConst) {
   // no buffers assigned, and their consumer has a buffer.
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({1.1f, 2.2f, 3.3f, 4.4f})));
+      Literal::CreateR1<float>({1.1f, 2.2f, 3.3f, 4.4f})));
   auto const1 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({4.1f, 4.2f, 4.3f, 4.4f})));
+      Literal::CreateR1<float>({4.1f, 4.2f, 4.3f, 4.4f})));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, const0, const1));
   auto module = CreateNewModule();
@@ -298,7 +300,7 @@ TEST_F(BufferAssignmentTest, BufferForOutputConst) {
   // This computation copies a constant to output.
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({1.1f, 2.2f, 3.3f, 4.4f})));
+      Literal::CreateR1<float>({1.1f, 2.2f, 3.3f, 4.4f})));
   auto copy = builder.AddInstruction(
       HloInstruction::CreateUnary(const0->shape(), HloOpcode::kCopy, const0));
   auto module = CreateNewModule();
@@ -378,12 +380,16 @@ TEST_F(BufferAssignmentTest, BasicUniquelyColored) {
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunColoredBufferAssignment(
-      module.get(),
-      [](const HloInstruction* instruction, const ShapeIndex& index) {
-        static int64 serial = 0;
-        return LogicalBuffer::Color(serial++);
-      });
+  auto colorer = [](const BufferLiveness& buffer_liveness) {
+    int color = 0;
+    for (auto& buffer :
+         buffer_liveness.points_to_analysis().logical_buffers()) {
+      buffer->set_color(LogicalBuffer::Color(color++));
+    }
+    return Status::OK();
+  };
+
+  auto buffers = RunColoredBufferAssignment(module.get(), colorer);
 
   // Distinct input buffers were assigned for parameters.
   BufferAllocation paramscalar_buffer =
@@ -430,14 +436,25 @@ TEST_F(BufferAssignmentTest, BasicPartiallyColored) {
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunColoredBufferAssignment(
-      module.get(),
-      [](const HloInstruction* instruction, const ShapeIndex& index) {
-        return (instruction->opcode() == HloOpcode::kAdd ||
-                instruction->opcode() == HloOpcode::kMultiply)
-                   ? LogicalBuffer::Color(1)
-                   : LogicalBuffer::Color(0);
-      });
+  auto colorer = [](const BufferLiveness& buffer_liveness) {
+    for (auto& buffer :
+         buffer_liveness.points_to_analysis().logical_buffers()) {
+      const auto& aliases =
+          buffer_liveness.points_to_analysis().GetBufferAliases(*buffer);
+      for (const auto& alias : aliases) {
+        if (alias.instruction()->opcode() == HloOpcode::kAdd ||
+            alias.instruction()->opcode() == HloOpcode::kMultiply) {
+          buffer->set_color(LogicalBuffer::Color(1));
+        }
+      }
+      if (!buffer->has_color()) {
+        buffer->set_color(LogicalBuffer::Color(0));
+      }
+    }
+    return Status::OK();
+  };
+
+  auto buffers = RunColoredBufferAssignment(module.get(), colorer);
 
   // Distinct input buffers were assigned for parameters.
   BufferAllocation paramscalar_buffer =
@@ -586,7 +603,7 @@ TEST_F(BufferAssignmentTest, CannotReuseInputBufferOfReduce) {
   auto exp2 = builder.AddInstruction(
       HloInstruction::CreateUnary(f32a100x10_, HloOpcode::kExp, exp1));
   auto const0 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
   auto reduce = builder.AddInstruction(HloInstruction::CreateReduce(
       /*shape=*/f32vec10_,
       /*operand=*/exp2,
@@ -634,9 +651,9 @@ TEST_F(BufferAssignmentTest, ExampleWhile) {
   // Creates the main kernel and verifies instruction counts.
   auto builder = HloComputation::Builder(TestName());
   auto const3 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int>(0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<int>(0)));
   auto const4 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({1.1f, 2.2f, 3.3f, 4.4f})));
+      Literal::CreateR1<float>({1.1f, 2.2f, 3.3f, 4.4f})));
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({const3, const4}));
   auto while_op = builder.AddInstruction(HloInstruction::CreateWhile(
@@ -996,9 +1013,10 @@ TEST_F(BufferAssignmentTest, TupleParameterAsOutput) {
   // Test a computation that returns a tuple parameter.
   auto builder = HloComputation::Builder(TestName());
   auto tuple_param = builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(PRED, {1, 2, 3, 4}),
-                                    ShapeUtil::MakeShape(F32, {}),
-                                    ShapeUtil::MakeShape(S32, {42})}),
+      0,
+      ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(PRED, {1, 2, 3, 4}),
+                                 ShapeUtil::MakeShape(F32, {}),
+                                 ShapeUtil::MakeShape(S32, {42})}),
       "param0"));
 
   auto module = CreateNewModule();
@@ -1027,10 +1045,11 @@ TEST_F(BufferAssignmentTest, ElementOfNestedTupleParameterAsOutput) {
   // parameter.
   auto builder = HloComputation::Builder(TestName());
   auto tuple_param = builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeTupleShape(
-             {ShapeUtil::MakeShape(PRED, {1, 2, 3, 4}),
-              ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S32, {42}),
-                                         ShapeUtil::MakeShape(S32, {101})})}),
+      0,
+      ShapeUtil::MakeTupleShape(
+          {ShapeUtil::MakeShape(PRED, {1, 2, 3, 4}),
+           ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S32, {42}),
+                                      ShapeUtil::MakeShape(S32, {101})})}),
       "param0"));
   auto tuple_element =
       builder.AddInstruction(HloInstruction::CreateGetTupleElement(
@@ -1075,9 +1094,8 @@ TEST_F(BufferAssignmentTest, DISABLED_TupleConstantAsOutput) {
   // Test that a tuple constant which is forwarded to the computation output is
   // properly handled.
   auto builder = HloComputation::Builder(TestName());
-  builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::MakeTuple({LiteralUtil::CreateR0<int64>(0).get(),
-                              LiteralUtil::CreateR0<int64>(1).get()})));
+  builder.AddInstruction(HloInstruction::CreateConstant(Literal::MakeTuple(
+      {Literal::CreateR0<int64>(0).get(), Literal::CreateR0<int64>(1).get()})));
 
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
@@ -1369,9 +1387,9 @@ class WhileBufferAssignmentTest : public HloTestBase {
     builder.AddInstruction(
         HloInstruction::CreateParameter(0, loop_state_shape_, "loop_state"));
     auto zero = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int>(0)));
+        HloInstruction::CreateConstant(Literal::CreateR0<int>(0)));
     auto ten = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int>(10)));
+        HloInstruction::CreateConstant(Literal::CreateR0<int>(10)));
     builder.AddInstruction(HloInstruction::CreateBinary(
         ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, zero, ten));
     return builder.Build();
@@ -1399,7 +1417,8 @@ class WhileBufferAssignmentTest : public HloTestBase {
         CreateMemoryMinimizingSequence(*module, ByteSizeOf).ConsumeValueOrDie();
     return BufferAssigner::Run(
                module, MakeUnique<SequentialHloOrdering>(module, sequence),
-               ByteSizeOf, alignment)
+               ByteSizeOf,
+               [alignment](LogicalBuffer::Color) { return alignment; })
         .ConsumeValueOrDie();
   }
 
@@ -1429,7 +1448,7 @@ TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) {
       HloInstruction::CreateParameter(2, data_shape_, "weights1"));
 
   auto zero = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0)));
   auto output0 = builder.AddInstruction(
       HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
   auto output1 = builder.AddInstruction(
@@ -1484,7 +1503,7 @@ TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
       HloInstruction::CreateParameter(1, data_shape_, "weights0"));
 
   auto zero = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0)));
   auto output0 = builder.AddInstruction(
       HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
   auto output1 = builder.AddInstruction(
@@ -1532,16 +1551,16 @@ TEST_F(BufferAssignmentTest, TwoCalls) {
     auto param = builder.AddInstruction(
         HloInstruction::CreateParameter(0, r0f32, "param"));
     auto constant1 = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+        HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
     auto add = builder.AddInstruction(
         HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param, constant1));
     sub_computation = module->AddEmbeddedComputation(builder.Build(add));
   }
   auto builder = HloComputation::Builder(TestName());
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto constant3 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(3.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(3.0)));
   auto call1 = builder.AddInstruction(
       HloInstruction::CreateCall(r0f32, {constant2}, sub_computation));
   auto call2 = builder.AddInstruction(
@@ -1554,7 +1573,7 @@ TEST_F(BufferAssignmentTest, TwoCalls) {
 
   {
     FlattenCallGraph flatten;
-    TF_ASSIGN_OR_ASSERT_OK(bool result, flatten.Run(module.get()));
+    TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module.get()));
     EXPECT_TRUE(result);
     std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   }
@@ -1565,6 +1584,105 @@ TEST_F(BufferAssignmentTest, TwoCalls) {
   EXPECT_TRUE(BuffersDistinct({call1}, {call2}, *assignment));
 }
 
+static bool IsPostOrderTraversal(
+    const std::vector<const HloInstruction*>& sequence) {
+  tensorflow::gtl::FlatSet<const HloInstruction*> seen_so_far;
+  auto has_not_been_seen_yet = [&](const HloInstruction* instruction) {
+    return seen_so_far.count(instruction) == 0;
+  };
+
+  for (auto instruction : sequence) {
+    if (std::any_of(instruction->operands().begin(),
+                    instruction->operands().end(), has_not_been_seen_yet) ||
+        std::any_of(instruction->control_predecessors().begin(),
+                    instruction->control_predecessors().end(),
+                    has_not_been_seen_yet)) {
+      return false;  // Not a post order.
+    }
+    if (!seen_so_far.insert(instruction).second) {
+      return false;  // Not a "traversal".
+    }
+  }
+
+  return true;
+}
+
+TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
+  auto module = MakeUnique<HloModule>(TestName());
+  auto builder = HloComputation::Builder(TestName());
+
+  auto zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0)));
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+
+  auto input0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, data_shape_, "input0"));
+  auto weights0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, data_shape_, "weights0"));
+  auto output0 = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
+
+  auto input1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, data_shape_, "input1"));
+  auto weights1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(3, data_shape_, "weights1"));
+  auto output1 = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape_, one, {1}));
+
+  auto cond =
+      module->AddEmbeddedComputation(BuildWhileConditionComputation("cond"));
+  auto body = module->AddEmbeddedComputation(BuildWhileBodyComputation("body"));
+
+  auto tuple0 = builder.AddInstruction(
+      HloInstruction::CreateTuple({input0, weights0, output0}));
+  auto tuple1 = builder.AddInstruction(
+      HloInstruction::CreateTuple({input1, weights1, output1}));
+
+  auto while0 = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape_, cond, body, tuple0));
+  auto while1 = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape_, cond, body, tuple1));
+
+  auto root_add = builder.AddInstruction(HloInstruction::CreateBinary(
+      while0->shape(), HloOpcode::kAdd, while0, while1));
+  module->AddEntryComputation(builder.Build());
+
+  RunCopyInsertion(module.get());
+
+  {
+    FlattenCallGraph flatten;
+    TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module.get()));
+    EXPECT_TRUE(result);
+  }
+
+  auto sequence =
+      CreateMemoryMinimizingSequence(*module, ByteSizeOf).ConsumeValueOrDie();
+
+  // To trigger b/38494731, we want a specific Hlo sequence for the
+  // root computation, so we overwrite that entry with a manually
+  // crafted sequence.
+  std::vector<const HloInstruction*> sequence_for_buffer_assigment = {
+      input1,   weights1, one,     output1, tuple1, while1,  input0,
+      weights0, zero,     output0, tuple0,  while0, root_add};
+
+  // If this ASSERT_TRUE fails, we constructed a bogus sequence above
+  // and this test itself is buggy.
+  ASSERT_TRUE(IsPostOrderTraversal(sequence_for_buffer_assigment));
+
+  sequence[module->entry_computation()] =
+      std::move(sequence_for_buffer_assigment);
+
+  auto assignment =
+      BufferAssigner::Run(
+          module.get(),
+          MakeUnique<SequentialHloOrdering>(module.get(), sequence), ByteSizeOf,
+          [](LogicalBuffer::Color) { return 1; })
+          .ConsumeValueOrDie();
+
+  EXPECT_TRUE(BuffersDistinct({while0}, {while1}, *assignment));
+}
+
 // Test buffer assignment for while nodes with multiple uses.
 // TODO(b/37245345): Fix buffer assignment for this case.
 TEST_F(WhileBufferAssignmentTest, DISABLED_TwoWhiles) {
@@ -1577,7 +1695,7 @@ TEST_F(WhileBufferAssignmentTest, DISABLED_TwoWhiles) {
       HloInstruction::CreateParameter(1, data_shape_, "weights0"));
 
   auto zero = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0)));
   auto output0 = builder.AddInstruction(
       HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
 
@@ -1605,7 +1723,7 @@ TEST_F(WhileBufferAssignmentTest, DISABLED_TwoWhiles) {
 
   {
     FlattenCallGraph flatten;
-    TF_ASSIGN_OR_ASSERT_OK(bool result, flatten.Run(module.get()));
+    TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module.get()));
     EXPECT_TRUE(result);
   }
 
diff --git a/tensorflow/compiler/xla/service/buffer_liveness.cc b/tensorflow/compiler/xla/service/buffer_liveness.cc
index 1b14c26340f6c1922bf35457fe7f1367ed953df0..f085ffa6bc40b212339a97604455a07c1e662952 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness.cc
@@ -37,18 +37,19 @@ namespace xla {
 
 /* static */
 StatusOr<std::unique_ptr<BufferLiveness>> BufferLiveness::Run(
-    const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
-    TuplePointsToAnalysis::Colorer colorer) {
+    const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering) {
   std::unique_ptr<BufferLiveness> liveness(
-      new BufferLiveness(module, std::move(hlo_ordering), std::move(colorer)));
+      new BufferLiveness(module, std::move(hlo_ordering)));
   TF_RETURN_IF_ERROR(liveness->Analyze());
   return std::move(liveness);
 }
 
 tensorflow::Status BufferLiveness::Analyze() {
-  TF_ASSIGN_OR_RETURN(points_to_analysis_,
-                      TuplePointsToAnalysis::Run(module_, colorer_));
+  TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module_));
   for (auto& computation : module_->computations()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
     // Gather all instructions whose buffers might alias other instructions into
     // the set aliased_buffers_.  This includes those contained as a tuple
     // element in other instruction's output.
@@ -122,7 +123,7 @@ bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a,
     if (b.instruction()->IsUserOf(alias.instruction()) &&
         !CanShareOperandBufferWithUser(alias.instruction(), alias.index(),
                                        b.instruction(), b.index(),
-                                       points_to_analysis())) {
+                                       &points_to_analysis())) {
       return false;
     }
   }
diff --git a/tensorflow/compiler/xla/service/buffer_liveness.h b/tensorflow/compiler/xla/service/buffer_liveness.h
index 9bb2564a8312f0d80e01f40cb18f99d5ad0e1771..70d642b40c8e4f51748f736c69795a94ccc30de2 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness.h
+++ b/tensorflow/compiler/xla/service/buffer_liveness.h
@@ -36,12 +36,12 @@ namespace xla {
 // interference.
 class BufferLiveness {
  public:
+  using Colorer = std::function<Status(const BufferLiveness& buffer_liveness)>;
+
   // Constructs a buffer liveness object for the given module assuming the given
   // HLO instruction ordering.
   static StatusOr<std::unique_ptr<BufferLiveness>> Run(
-      const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
-      TuplePointsToAnalysis::Colorer colorer =
-          TuplePointsToAnalysis::DefaultColorer());
+      const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering);
 
   // Returns true if the live range of the buffer containing the output of 'a'
   // may overlap with the live range of the buffer of 'b'. If instruction 'a'
@@ -67,15 +67,24 @@ class BufferLiveness {
   // Returns the underlying hlo ordering used for this liveness analysis.
   const HloOrdering& hlo_ordering() const { return *hlo_ordering_; }
 
+  const HloModule& module() const { return *module_; }
+
   string ToString() const;
 
+  static Colorer DefaultColorer() {
+    return [](const BufferLiveness& buffer_liveness) {
+      for (auto& buffer :
+           buffer_liveness.points_to_analysis().logical_buffers()) {
+        buffer->set_color(LogicalBuffer::Color(0));
+      }
+      return Status::OK();
+    };
+  }
+
  private:
   explicit BufferLiveness(const HloModule* module,
-                          std::unique_ptr<HloOrdering> hlo_ordering,
-                          TuplePointsToAnalysis::Colorer colorer)
-      : module_(module),
-        hlo_ordering_(std::move(hlo_ordering)),
-        colorer_(colorer) {}
+                          std::unique_ptr<HloOrdering> hlo_ordering)
+      : module_(module), hlo_ordering_(std::move(hlo_ordering)) {}
 
   // Perform buffer liveness analysis. This method must be called prior to
   // MayInterfere or MaybeLiveOut.
@@ -98,8 +107,6 @@ class BufferLiveness {
   tensorflow::gtl::FlatSet<const LogicalBuffer*> maybe_live_out_buffers_;
 
   std::unique_ptr<TuplePointsToAnalysis> points_to_analysis_;
-
-  TuplePointsToAnalysis::Colorer colorer_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
index fda44ff4d2df18b90d308617cf845c9946227249..a5f7cc0aebe856931a122eb4bf56f87666ee38a0 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
@@ -397,13 +397,11 @@ TEST_F(BufferLivenessTest, TupleConstantLiveOut) {
   // computation. The buffer containing {0, 1} is copied by GetTupleElement, and
   // the buffers containing {3} and 3 are dead.
   auto builder = HloComputation::Builder(TestName());
-  auto inner_tuple0 =
-      LiteralUtil::MakeTuple({LiteralUtil::CreateR0<int64>(0).get(),
-                              LiteralUtil::CreateR0<int64>(1).get()});
-  auto inner_tuple1 =
-      LiteralUtil::MakeTuple({LiteralUtil::CreateR0<int64>(3).get()});
+  auto inner_tuple0 = Literal::MakeTuple(
+      {Literal::CreateR0<int64>(0).get(), Literal::CreateR0<int64>(1).get()});
+  auto inner_tuple1 = Literal::MakeTuple({Literal::CreateR0<int64>(3).get()});
   auto tuple_constant = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::MakeTuple({inner_tuple0.get(), inner_tuple1.get()})));
+      Literal::MakeTuple({inner_tuple0.get(), inner_tuple1.get()})));
   builder.AddInstruction(HloInstruction::CreateGetTupleElement(
       inner_tuple0->shape(), tuple_constant, 0));
 
@@ -450,7 +448,7 @@ TEST_F(BufferLivenessTest, IndependentTupleElements) {
       builder.AddInstruction(HloInstruction::CreateGetTupleElement(
           tuple_element0_shape, tuple_param0, 0));
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f})));
+      Literal::CreateR1<float>({1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f})));
   auto add0 = builder.AddInstruction(HloInstruction::CreateBinary(
       tuple_element0_shape, HloOpcode::kAdd, tuple_element0, const0));
 
@@ -462,7 +460,7 @@ TEST_F(BufferLivenessTest, IndependentTupleElements) {
       builder.AddInstruction(HloInstruction::CreateGetTupleElement(
           tuple_element1_shape, tuple_param0, 1));
   auto const1 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f})));
+      Literal::CreateR1<float>({2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f})));
   auto add1 = builder.AddInstruction(HloInstruction::CreateBinary(
       tuple_element1_shape, HloOpcode::kAdd, tuple_element1, const1));
 
@@ -513,7 +511,7 @@ TEST_F(BufferLivenessTest, DependentTupleElements) {
       builder.AddInstruction(HloInstruction::CreateGetTupleElement(
           tuple_element0_shape, tuple_param0, 0));
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f})));
+      Literal::CreateR1<float>({1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f})));
   auto add0 = builder.AddInstruction(HloInstruction::CreateBinary(
       tuple_element0_shape, HloOpcode::kAdd, tuple_element0, const0));
 
@@ -585,7 +583,7 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
         HloInstruction::CreateGetTupleElement(data_shape, tuple_param0, 1));
 
     auto update = builder.AddInstruction(HloInstruction::CreateConstant(
-        LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
+        Literal::CreateR1<float>({2.f, 2.f, 2.f})));
     HloInstruction* slice = nullptr;
     if (update_uses_tuple_element1) {
       // Create a slice instruction as an additional user of 'gte1'.
@@ -596,7 +594,7 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     }
     // Create a DynamicUpdateSlice instruction of tuple element 1 with 'update'.
     auto starts = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+        HloInstruction::CreateConstant(Literal::CreateR1<int32>({2})));
     auto dynamic_update_slice =
         builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
             data_shape, gte1, update, starts));
@@ -715,7 +713,7 @@ class DynamicUpdateSliceLivenessTest : public BufferLivenessTest {
         HloInstruction::CreateGetTupleElement(data_shape, tuple_param0, 1));
 
     auto update = builder.AddInstruction(HloInstruction::CreateConstant(
-        LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
+        Literal::CreateR1<float>({2.f, 2.f, 2.f})));
 
     if (tuple_element1_has_two_uses) {
       // Add 'gte0' and 'gte1' to create another user of 'gte1'.
@@ -724,7 +722,7 @@ class DynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     }
     // Create a DynamicUpdateSlice instruction of tuple element 1 with 'update'.
     auto starts = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+        HloInstruction::CreateConstant(Literal::CreateR1<int32>({2})));
     auto dynamic_update_slice =
         builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
             data_shape, gte1, update, starts));
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index fa7b2a309525dd80d655e10474c5d49f9da14ea8..b450e0c40074344778109ed2ba8b2238cff7940e 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -133,6 +133,37 @@ CallGraphNode& CallGraph::GetNode(const HloComputation* computation) {
   return nodes_[it->second];
 }
 
+bool CallGraph::DominatesHelper(
+    const HloComputation* a, const HloComputation* b,
+    tensorflow::gtl::FlatSet<const HloComputation*>* visited) const {
+  if (a == b || ContainsKey(*visited, b)) {
+    // The call graph is guaranteed to be acyclic so any previously visited node
+    // we encounter was already determined to be dominated.
+    return true;
+  }
+
+  const CallGraphNode& b_node = GetNode(b);
+  if (b_node.callers().empty()) {
+    // We reached a root node without hitting 'a'. 'a' does not dominate 'b'.
+    return false;
+  }
+
+  // Walk up the callers of 'b' until we hit 'a' or a root node (no callers).
+  visited->insert(b);
+  for (const HloComputation* b_caller : b_node.callers()) {
+    if (!DominatesHelper(a, b_caller, visited)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool CallGraph::Dominates(const HloComputation* a,
+                          const HloComputation* b) const {
+  tensorflow::gtl::FlatSet<const HloComputation*> visited;
+  return DominatesHelper(a, b, &visited);
+}
+
 namespace {
 
 // Returns the call context of a computation which is called from contexts 'a'
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
index 7f9990f06d4fee4c52fa516fc2f6031f5dab2bb9..a3297ff534f429279fd4674517db545f289af627 100644
--- a/tensorflow/compiler/xla/service/call_graph.h
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -189,6 +189,20 @@ class CallGraph {
   Status VisitNodes(const VisitorFunction& visitor_func,
                     bool visit_unreachable_nodes = true) const;
 
+  // Returns true if 'a' dominates 'b' in the call graph. Computation 'a'
+  // dominates computation 'b' iff all callgraph paths in the caller-to-callee
+  // direction from a root computation to 'b' pass through computation
+  // 'a'. Trivially, a computation dominates itself.
+  bool Dominates(const HloComputation* a, const HloComputation* b) const;
+
+  // Returns whether 'instruction' is contained in 'computation' either directly
+  // ('instruction->parent' is 'computation') or indirectly ('computation'
+  // dominates 'instruction->parent' in the call graph).
+  bool InstructionIsNestedIn(const HloInstruction* instruction,
+                             const HloComputation* computation) const {
+    return Dominates(computation, instruction->parent());
+  }
+
   string ToString() const;
 
  private:
@@ -205,6 +219,13 @@ class CallGraph {
       const VisitorFunction& visitor_func, const CallGraphNode& node,
       tensorflow::gtl::FlatSet<const CallGraphNode*>* visited) const;
 
+  // Recursive helper for computing whether 'a' dominates 'b' in the call
+  // graph. 'b_ancestor' is the currently visited node (which starts at 'b'),
+  // and 'visited' is the set of computations which have been visited.
+  bool DominatesHelper(
+      const HloComputation* a, const HloComputation* b,
+      tensorflow::gtl::FlatSet<const HloComputation*>* visited) const;
+
   // The HLO module represented by this call graph.
   const HloModule* module_ = nullptr;
 
diff --git a/tensorflow/compiler/xla/service/call_graph_test.cc b/tensorflow/compiler/xla/service/call_graph_test.cc
index e276473c90aa3fcc6b494537db6bceb841ade91e..3c22871b3bff193c27ee2eb639fe72306d532b97 100644
--- a/tensorflow/compiler/xla/service/call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/call_graph_test.cc
@@ -81,7 +81,7 @@ class CallGraphTest : public HloTestBase {
     HloInstruction* param0 = builder.AddInstruction(
         HloInstruction::CreateParameter(0, kScalarShape, "param0"));
     HloInstruction* zero = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+        HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
     builder.AddInstruction(HloInstruction::CreateBinary(
         ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt, param0, zero));
     return builder.Build();
@@ -314,6 +314,37 @@ TEST_F(CallGraphTest, ComplexGraph) {
   EXPECT_LT(index_of(cond_computation), index_of(a_computation));
   EXPECT_LT(index_of(c_computation), index_of(b_computation));
   EXPECT_LT(index_of(b_computation), index_of(a_computation));
+
+  // Verify dominance relations between computation in the graph.
+
+  // Entry dominates everybody, and is dominated by no one except itself.
+  EXPECT_TRUE(call_graph->Dominates(entry_computation, entry_computation));
+  EXPECT_TRUE(call_graph->Dominates(entry_computation, a_computation));
+  EXPECT_TRUE(call_graph->Dominates(entry_computation, b_computation));
+  EXPECT_TRUE(call_graph->Dominates(entry_computation, c_computation));
+  EXPECT_TRUE(call_graph->Dominates(entry_computation, cond_computation));
+  EXPECT_FALSE(call_graph->Dominates(a_computation, entry_computation));
+  EXPECT_FALSE(call_graph->Dominates(b_computation, entry_computation));
+  EXPECT_FALSE(call_graph->Dominates(c_computation, entry_computation));
+  EXPECT_FALSE(call_graph->Dominates(cond_computation, entry_computation));
+
+  // 'a' only dominates 'b' and 'c'.
+  EXPECT_TRUE(call_graph->Dominates(a_computation, a_computation));
+  EXPECT_TRUE(call_graph->Dominates(a_computation, b_computation));
+  EXPECT_TRUE(call_graph->Dominates(a_computation, c_computation));
+  EXPECT_FALSE(call_graph->Dominates(b_computation, a_computation));
+  EXPECT_FALSE(call_graph->Dominates(c_computation, a_computation));
+  EXPECT_FALSE(call_graph->Dominates(a_computation, cond_computation));
+
+  EXPECT_TRUE(call_graph->Dominates(b_computation, b_computation));
+  EXPECT_FALSE(call_graph->Dominates(b_computation, c_computation));
+  EXPECT_FALSE(call_graph->Dominates(b_computation, cond_computation));
+
+  EXPECT_TRUE(call_graph->Dominates(c_computation, c_computation));
+  EXPECT_FALSE(call_graph->Dominates(c_computation, cond_computation));
+  EXPECT_FALSE(call_graph->Dominates(cond_computation, c_computation));
+
+  EXPECT_TRUE(call_graph->Dominates(cond_computation, cond_computation));
 }
 
 TEST_F(CallGraphTest, VisitSingletonComputation) {
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index 0d1a439724a95231240227cfdf089cb2d74b3dd2..d43dc5b214a95edf3be726b318fd379164edbd9f 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -52,17 +52,17 @@ CompileOnlyService::NewService(const ServiceOptions& options) {
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<Backend> compute_constant_backend,
                       CreateComputeConstantBackend());
-  std::unique_ptr<CompileOnlyService> service(
-      new CompileOnlyService(compiler, std::move(compute_constant_backend)));
+  std::unique_ptr<CompileOnlyService> service(new CompileOnlyService(
+      options, compiler, std::move(compute_constant_backend)));
   return std::move(service);
 }
 
 CompileOnlyService::CompileOnlyService(
-    Compiler* compiler, std::unique_ptr<Backend> compute_constant_backend)
-    : Service(/*backend=*/nullptr, std::move(compute_constant_backend)),
-      compiler_(compiler) {
-  runs_in_client_process_ = true;
-}
+    const ServiceOptions& options, Compiler* compiler,
+    std::unique_ptr<Backend> compute_constant_backend)
+    : Service(options, /*backend=*/nullptr,
+              std::move(compute_constant_backend)),
+      compiler_(compiler) {}
 
 StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
 CompileOnlyService::CompileAheadOfTime(
@@ -75,9 +75,11 @@ CompileOnlyService::CompileAheadOfTime(
     VersionedComputationHandle versioned_handle =
         user_computation->GetVersionedHandle();
 
+    // TODO(b/63773457): Track DebugOptions in AotCompilationOptions.
+    DebugOptions debug_options = legacy_flags::GetDebugOptionsFromFlags();
+
     // Dump computation proto state if flag is set.
-    legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
-    const string& directory_path = flags->xla_dump_computations_to;
+    const string& directory_path = debug_options.xla_dump_computations_to();
     if (!directory_path.empty()) {
       TF_ASSIGN_OR_RETURN(
           std::unique_ptr<SessionModule> session_module,
@@ -95,11 +97,10 @@ CompileOnlyService::CompileAheadOfTime(
         user_computation->ComputeProgramShape(versioned_handle.version));
 
     HloModuleConfig hlo_module_config(*program_shape);
-    hlo_module_config.set_debug_options(
-        legacy_flags::GetDebugOptionsFromFlags());
+    hlo_module_config.set_debug_options(debug_options);
     auto* computation_layout =
         hlo_module_config.mutable_entry_computation_layout();
-    if (flags->xla_hlo_profile) {
+    if (debug_options.xla_hlo_profile()) {
       hlo_module_config.enable_hlo_profiling(true);
     }
     for (int i = 0; i < instance.argument_layouts.size(); ++i) {
@@ -122,8 +123,7 @@ CompileOnlyService::CompileAheadOfTime(
     hlo_modules.push_back(std::move(hlo_module));
   }
 
-  return compiler_->CompileAheadOfTime(std::move(hlo_modules),
-                                       MakeHloDumper(), options);
+  return compiler_->CompileAheadOfTime(std::move(hlo_modules), options);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h
index b19f4bd592162045a41e2ec82266826ce84096ef..0a1911cbd15b0278ec2c3ccc944ce4df80a683ed 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.h
+++ b/tensorflow/compiler/xla/service/compile_only_service.h
@@ -55,7 +55,7 @@ class CompileOnlyService : public Service {
 
   // Override Service methods that require or imply the existence of an
   // execute backend.  Note that this does not include TransferToClient, as
-  // computing contants produces global data that we may wish to transfer.
+  // computing constants produces global data that we may wish to transfer.
   tensorflow::Status Execute(const ExecuteRequest* arg,
                              ExecuteResponse* result) override {
     return Unimplemented("CompileOnlyService does not support execution.");
@@ -103,7 +103,8 @@ class CompileOnlyService : public Service {
 
  private:
   explicit CompileOnlyService(
-      Compiler* compiler, std::unique_ptr<Backend> compute_constant_backend);
+      const ServiceOptions& options, Compiler* compiler,
+      std::unique_ptr<Backend> compute_constant_backend);
   CompileOnlyService(const CompileOnlyService&) = delete;
   void operator=(const CompileOnlyService&) = delete;
 
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index 7ae285170e4b99ecf036eeb81eaee49ef34034ea..d5bd9214be44f4abd5f672168335ae1a259c9118 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -92,13 +92,6 @@ class AotCompilationOptions {
 // platform.
 class Compiler {
  public:
-  // Callback signature used to dump the HLO graph during compilation.
-  // Different compiler backends will call this as they please, providing
-  // a view of the HLO at different points in compilation -- context for the
-  // dump is indicated by the label string.
-  using HloDumper =
-      std::function<void(const HloModule& module, const string& label)>;
-
   virtual ~Compiler() {}
 
   // Returns the ID of the platform that this compiler targets.
@@ -113,21 +106,20 @@ class Compiler {
   //
   // Use the overload below to compile computations that run in parallel.
   virtual StatusOr<std::unique_ptr<Executable>> Compile(
-      std::unique_ptr<HloModule> module, HloDumper dump_hlo,
+      std::unique_ptr<HloModule> module,
       perftools::gputools::StreamExecutor* executor) = 0;
 
   // Compiles a set of HLO modules that can run in parallel, potentially
   // communicating data between the modules, and returns a corresponding
   // sequence of executable objects.
   virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
-      std::vector<std::unique_ptr<HloModule>> modules, HloDumper dump_hlo,
+      std::vector<std::unique_ptr<HloModule>> modules,
       std::vector<perftools::gputools::StreamExecutor*> stream_exec) = 0;
 
   // Compiles the HLO module for ahead-of-time execution.  This is intended for
   // use in static compilation.
   virtual StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
-                     HloDumper dump_hlo,
                      const AotCompilationOptions& options) = 0;
 
   /////
diff --git a/tensorflow/compiler/xla/service/computation_placer.cc b/tensorflow/compiler/xla/service/computation_placer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cdfa30dd9a7b6a5b9e58087491a9d99caaa1b998
--- /dev/null
+++ b/tensorflow/compiler/xla/service/computation_placer.cc
@@ -0,0 +1,152 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/computation_placer.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace se = ::perftools::gputools;
+
+namespace xla {
+
+Status DeviceAssignment::Serialize(DeviceAssignmentProto* proto) const {
+  proto->set_replica_count(replica_count());
+  proto->set_computation_count(computation_count());
+  for (int computation = 0; computation < computation_count(); ++computation) {
+    DeviceAssignmentProto::ComputationDevice* computation_device =
+        proto->add_computation_devices();
+    for (int replica = 0; replica < replica_count(); ++replica) {
+      computation_device->add_replica_device_ids((*this)(replica, computation));
+    }
+  }
+  return Status::OK();
+}
+
+/* static */ StatusOr<std::unique_ptr<DeviceAssignment>>
+DeviceAssignment::Deserialize(const DeviceAssignmentProto& proto) {
+  TF_RET_CHECK(proto.computation_devices_size() == proto.computation_count());
+  auto assignment = MakeUnique<DeviceAssignment>(proto.replica_count(),
+                                                 proto.computation_count());
+  for (int computation = 0; computation < proto.computation_count();
+       ++computation) {
+    const auto& computation_device = proto.computation_devices(computation);
+    TF_RET_CHECK(computation_device.replica_device_ids_size() ==
+                 proto.replica_count());
+    for (int replica = 0; replica < proto.replica_count(); ++replica) {
+      (*assignment)(replica, computation) =
+          computation_device.replica_device_ids(replica);
+    }
+  }
+  return std::move(assignment);
+}
+
+StatusOr<int> ComputationPlacer::DeviceId(int replica, int computation,
+                                          int replica_count,
+                                          int computation_count) {
+  TF_RET_CHECK(replica < replica_count);
+  TF_RET_CHECK(computation < computation_count);
+
+  return computation * replica_count + replica;
+}
+
+StatusOr<DeviceAssignment> ComputationPlacer::AssignDevices(
+    int replica_count, int computation_count) {
+  DeviceAssignment assignment(replica_count, computation_count);
+  for (int replica = 0; replica < replica_count; ++replica) {
+    for (int computation = 0; computation < computation_count; ++computation) {
+      TF_ASSIGN_OR_RETURN(
+          int device_id,
+          DeviceId(replica, computation, replica_count, computation_count));
+      assignment(replica, computation) = device_id;
+    }
+  }
+  return std::move(assignment);
+}
+
+/* static */ void ComputationPlacer::RegisterComputationPlacer(
+    se::Platform::Id platform_id,
+    ComputationPlacerCreationFunction creation_function) {
+  tensorflow::mutex_lock lock(
+      *ComputationPlacer::platform_computation_placer_mutex());
+  auto* computation_placers = GetPlatformComputationPlacers();
+  CHECK(computation_placers->find(platform_id) == computation_placers->end());
+  (*computation_placers)[platform_id].creation_function = creation_function;
+}
+
+/* static */ StatusOr<ComputationPlacer*> ComputationPlacer::GetForPlatform(
+    const se::Platform* platform) {
+  tensorflow::mutex_lock lock(
+      *ComputationPlacer::platform_computation_placer_mutex());
+  auto* computation_placers = GetPlatformComputationPlacers();
+
+  auto it = computation_placers->find(platform->id());
+  if (it == computation_placers->end()) {
+    return NotFound(
+        "could not find registered computation placer for platform %s -- check "
+        "target linkage",
+        platform->Name().c_str());
+  }
+
+  if (it->second.placer == nullptr) {
+    // Lazily create the computation placer the first time it is needed.
+    it->second.placer = (*it->second.creation_function)();
+  }
+
+  return it->second.placer.get();
+}
+
+/* static */ tensorflow::mutex*
+ComputationPlacer::platform_computation_placer_mutex() {
+  static tensorflow::mutex* m = new tensorflow::mutex;
+  return m;
+}
+
+/* static */ std::map<perftools::gputools::Platform::Id,
+                      ComputationPlacer::State>*
+ComputationPlacer::GetPlatformComputationPlacers() {
+  static auto* r =
+      new std::map<perftools::gputools::Platform::Id, ComputationPlacer::State>;
+  return r;
+}
+
+}  // namespace xla
+
+static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
+  return xla::MakeUnique<xla::ComputationPlacer>();
+}
+
+static bool InitModule() {
+  xla::ComputationPlacer::RegisterComputationPlacer(se::host::kHostPlatformId,
+                                                    &CreateComputationPlacer);
+  xla::ComputationPlacer::RegisterComputationPlacer(se::cuda::kCudaPlatformId,
+                                                    &CreateComputationPlacer);
+  return true;
+}
+static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/computation_placer.h b/tensorflow/compiler/xla/service/computation_placer.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d9abcd100dd9e878da885110bc1bd1ac65e3f84
--- /dev/null
+++ b/tensorflow/compiler/xla/service/computation_placer.h
@@ -0,0 +1,117 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPUTATION_PLACER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPUTATION_PLACER_H_
+
+#include <map>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+// Class that represents the device assignment for a set of XLA replicated
+// computations. For R replicas and C computations, R * C devices are required
+// execute the computation in parallel. The assigned device ids can be accessed
+// by assignment(replica, computation).
+class DeviceAssignment : public Array2D<int> {
+ public:
+  DeviceAssignment() {}
+  DeviceAssignment(int replica_count, int computation_count)
+      : Array2D<int>(replica_count, computation_count, -1) {
+    CHECK_GT(replica_count, 0);
+    CHECK_GT(computation_count, 0);
+  }
+
+  int replica_count() const { return height(); }
+  int computation_count() const { return width(); }
+
+  // Protocol buffer serialization and deserialization.
+  Status Serialize(DeviceAssignmentProto* proto) const;
+
+  // Return a std::unique_ptr<DeviceAssignment> instead of a DeviceAssignment
+  // directly because one of the supported TF platforms (mac) does not compile
+  // due to a StatusOr of an incomplete type (DeviceAssignment).
+  static StatusOr<std::unique_ptr<DeviceAssignment>> Deserialize(
+      const DeviceAssignmentProto& proto);
+};
+
+// A generic implementation of the XLA computation placer, which assigns device
+// ids to a set of replicated computations.
+class ComputationPlacer {
+ public:
+  ComputationPlacer() {}
+  virtual ~ComputationPlacer() {}
+
+  // Returns the device id assigned to the given replica and computation
+  // instance for [replica_count x computation_count] setup. The returned device
+  // id must match the assignement from PlaceReplicatedComputation().
+  virtual StatusOr<int> DeviceId(int replica, int computation,
+                                 int replica_count, int computation_count);
+
+  // Returns the device ids assigned to a set of replicated computations, given
+  // the number of replicas and the number of computations.
+  virtual StatusOr<DeviceAssignment> AssignDevices(int replica_count,
+                                                   int computation_count);
+
+  using ComputationPlacerCreationFunction =
+      std::unique_ptr<ComputationPlacer> (*)();
+
+  // Registers a computation placer creation function for a particular platform.
+  static void RegisterComputationPlacer(
+      perftools::gputools::Platform::Id platform_id,
+      ComputationPlacerCreationFunction creation_function);
+
+  // Returns the computation placer singleton pointer if it is available for the
+  // given platform, or an error status if it is not.
+  static StatusOr<ComputationPlacer*> GetForPlatform(
+      const perftools::gputools::Platform* platform);
+
+ private:
+  // Routine that returns the mutex that guards the platform-to-computation
+  // placer map. Done as a routine to ensure correct initialization ordering,
+  // since RegisterComputationPlacer can be called during program initialization
+  // time.
+  static tensorflow::mutex* platform_computation_placer_mutex();
+
+  // State kept for each kind of ComputationPlacer. Registration functions set
+  // up creation_function, and then we use that to lazily create "placer" the
+  // first time GetForPlatform is invoked for a particular id.
+  struct State {
+    std::unique_ptr<ComputationPlacer> placer;
+    ComputationPlacerCreationFunction creation_function = nullptr;
+  };
+
+  // Map from platform kind to computation placer singleton.
+  static std::map<perftools::gputools::Platform::Id, State>*
+  GetPlatformComputationPlacers();
+
+  perftools::gputools::Platform::Id platform_id_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ComputationPlacer);
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_COMPUTATION_PLACER_H_
diff --git a/tensorflow/compiler/xla/service/computation_tracker.cc b/tensorflow/compiler/xla/service/computation_tracker.cc
index 9aa32a1fb76616e6c81043fabb053570a86d2619..70e25eebdb068db893e24aec0f72d09090ac7027 100644
--- a/tensorflow/compiler/xla/service/computation_tracker.cc
+++ b/tensorflow/compiler/xla/service/computation_tracker.cc
@@ -216,6 +216,7 @@ StatusOr<std::unique_ptr<HloModule>> ComputationTracker::BuildHloModule(
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloComputation> hlo_computation,
         computation->BuildHloComputation(versioned_handle.version, resolver,
+                                         config.debug_options(),
                                          include_unreachable_instructions));
 
     // Add the newly created computation to VersionedHandle-to-HloComputation
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index a3803c34ba7db99db7139b53cad22d5bce7fe5e6..c47abe9c62a40716eb03fbd2213b941b5e0abbc3 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -551,6 +551,9 @@ StatusOr<bool> CopyInsertion::Run(HloModule* module) {
   // Add copies of computation root instructions, if needed.
   FlatMap<const HloComputation*, ShapeTree<bool>> while_body_read_only_indices;
   for (auto& computation : module->computations()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
     VLOG(2) << "computation " << computation->name();
     InstructionCopier root_copier(computation->root_instruction(),
                                   /*copy_users=*/{});
diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index cc77339bb63220d8c9da0500ee818c7b9fb02a4b..026be75757a9129c94e2c1c3083f226790d482f4 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -87,7 +87,7 @@ TEST_F(CopyInsertionTest, SingleParameter) {
 TEST_F(CopyInsertionTest, SingleConstant) {
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   HloInstruction* tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({constant}));
 
@@ -110,9 +110,9 @@ TEST_F(CopyInsertionTest, MultipleConstantsAndParameters) {
   auto builder = HloComputation::Builder(TestName());
 
   HloInstruction* constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   HloInstruction* constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
 
   HloInstruction* x = builder.AddInstruction(
       HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "x"));
@@ -140,11 +140,11 @@ TEST_F(CopyInsertionTest, AmbiguousPointsToSet) {
   // the computation result. Verify that copies are added properly.
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   HloInstruction* constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   HloInstruction* constant3 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(3.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(3.0)));
 
   HloInstruction* tuple1 = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
@@ -152,7 +152,7 @@ TEST_F(CopyInsertionTest, AmbiguousPointsToSet) {
       HloInstruction::CreateTuple({constant3, constant2}));
 
   HloInstruction* pred = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   builder.AddInstruction(HloInstruction::CreateTernary(
       tuple1->shape(), HloOpcode::kSelect, pred, tuple1, tuple2));
 
@@ -196,9 +196,8 @@ TEST_F(CopyInsertionTest, BitcastConstant) {
   // The output of a bitcast is its operand (same buffer), so a bitcast
   // constant feeding the result must have a copy added.
   auto builder = HloComputation::Builder(TestName());
-  HloInstruction* constant =
-      builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR1<float>({1.0, 42.0})));
+  HloInstruction* constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<float>({1.0, 42.0})));
   HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, constant));
 
@@ -308,9 +307,9 @@ TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) {
   // copy is added.
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   HloInstruction* constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
 
   HloInstruction* tuple1 = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
@@ -318,7 +317,7 @@ TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) {
       HloInstruction::CreateTuple({constant2, constant1}));
 
   HloInstruction* pred = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   HloInstruction* select = builder.AddInstruction(HloInstruction::CreateTernary(
       tuple1->shape(), HloOpcode::kSelect, pred, tuple1, tuple2));
   HloInstruction* gte =
@@ -350,7 +349,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
       bool nested = false) {
     auto builder = HloComputation::Builder(TestName() + ".Condition");
     auto limit_const = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(10)));
+        HloInstruction::CreateConstant(Literal::CreateR0<int32>(10)));
     const Shape& loop_state_shape =
         nested ? nested_loop_state_shape_ : loop_state_shape_;
     auto loop_state = builder.AddInstruction(
@@ -381,7 +380,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
         builder.AddInstruction(HloInstruction::CreateGetTupleElement(
             induction_variable_shape_, loop_state, 0));
     auto inc = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
+        HloInstruction::CreateConstant(Literal::CreateR0<int32>(1)));
     auto add0 = builder.AddInstruction(HloInstruction::CreateBinary(
         induction_variable->shape(), HloOpcode::kAdd, induction_variable, inc));
     // Update data GTE(1).
@@ -419,7 +418,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
         builder.AddInstruction(HloInstruction::CreateGetTupleElement(
             induction_variable_shape_, loop_state, 0));
     auto inc = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
+        HloInstruction::CreateConstant(Literal::CreateR0<int32>(1)));
 
     // add0 = Add(in0, 1)
     auto add0 = builder.AddInstruction(HloInstruction::CreateBinary(
@@ -488,7 +487,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
         builder.AddInstruction(HloInstruction::CreateGetTupleElement(
             induction_variable_shape_, loop_state, 0));
     auto inc = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
+        HloInstruction::CreateConstant(Literal::CreateR0<int32>(1)));
     // add0 = Add(in0, 1)
     auto add0 = builder.AddInstruction(HloInstruction::CreateBinary(
         induction_variable->shape(), HloOpcode::kAdd, induction_variable, inc));
@@ -503,9 +502,8 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
       data = builder.AddInstruction(
           HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 1));
     }
-    auto update = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>(
-            {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f})));
+    auto update = builder.AddInstruction(HloInstruction::CreateConstant(
+        Literal::CreateR1<float>({1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f})));
     // add1 = Add(in1, {1, 1, 1, 1, 1, 1, 1, 1})
     auto add1 = builder.AddInstruction(HloInstruction::CreateBinary(
         data_shape_, HloOpcode::kAdd, data, update));
@@ -538,7 +536,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
     auto gte0 = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
         induction_variable_shape_, loop_state, 0));
     auto inc = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
+        HloInstruction::CreateConstant(Literal::CreateR0<int32>(1)));
     auto add0 = builder.AddInstruction(HloInstruction::CreateBinary(
         gte0->shape(), HloOpcode::kAdd, gte0, inc));
 
@@ -548,9 +546,8 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
     // GTE(GTE(loop_state, 1), 0) -> Add
     auto gte10 = builder.AddInstruction(
         HloInstruction::CreateGetTupleElement(data_shape_, gte1, 0));
-    auto update10 = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>(
-            {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f})));
+    auto update10 = builder.AddInstruction(HloInstruction::CreateConstant(
+        Literal::CreateR1<float>({1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f})));
     auto add10 = builder.AddInstruction(HloInstruction::CreateBinary(
         data_shape_, HloOpcode::kAdd, gte10, update10));
 
@@ -574,11 +571,10 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
                                         bool nested = false) {
     auto builder = HloComputation::Builder(TestName() + ".While");
     auto induction_var_init = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(0)));
+        HloInstruction::CreateConstant(Literal::CreateR0<int32>(0)));
 
-    auto data_init = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>(
-            {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f})));
+    auto data_init = builder.AddInstruction(HloInstruction::CreateConstant(
+        Literal::CreateR1<float>({0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f})));
 
     if (nested) {
       auto inner_init = builder.AddInstruction(
@@ -601,9 +597,8 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
 
   HloInstruction* BuildWhileInstruction_InitPointsToConstant() {
     auto builder = HloComputation::Builder(TestName() + ".While");
-    auto data_init = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>(
-            {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f})));
+    auto data_init = builder.AddInstruction(HloInstruction::CreateConstant(
+        Literal::CreateR1<float>({0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f})));
     return BuildWhileInstructionWithCustomInit(loop_state_shape_, data_init,
                                                &builder);
   }
@@ -620,11 +615,11 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
     auto builder = HloComputation::Builder(TestName() + ".While");
 
     auto one = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+        HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
     auto v1 = builder.AddInstruction(
         HloInstruction::CreateBroadcast(data_shape_, one, {1}));
     auto zero = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+        HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
     auto v2 = builder.AddInstruction(
         HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
 
@@ -632,7 +627,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
     auto tuple2 = builder.AddInstruction(HloInstruction::CreateTuple({v2, v1}));
 
     auto pred = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+        HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
     auto data_init = builder.AddInstruction(HloInstruction::CreateTernary(
         nested_tuple_shape_, HloOpcode::kSelect, pred, tuple1, tuple2));
 
@@ -644,7 +639,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
     auto builder = HloComputation::Builder(TestName() + ".While");
 
     auto one = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+        HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
     auto one_vec = builder.AddInstruction(
         HloInstruction::CreateBroadcast(data_shape_, one, {1}));
     auto data_init =
@@ -657,12 +652,11 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
   HloInstruction* BuildWhileInstruction_InitPointsToInterfering() {
     auto builder = HloComputation::Builder(TestName() + ".While");
     auto one = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+        HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
     auto data_init = builder.AddInstruction(
         HloInstruction::CreateBroadcast(data_shape_, one, {1}));
-    auto one_vec = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>(
-            {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f})));
+    auto one_vec = builder.AddInstruction(HloInstruction::CreateConstant(
+        Literal::CreateR1<float>({1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f})));
     // Take a reference to 'data_init' to make it interfere with while result.
     builder.AddInstruction(HloInstruction::CreateBinary(
         data_shape_, HloOpcode::kAdd, data_init, one_vec));
@@ -677,7 +671,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
     const bool nested =
         ShapeUtil::Equal(loop_state_shape, nested_loop_state_shape_);
     auto induction_var_init = builder->AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(0)));
+        HloInstruction::CreateConstant(Literal::CreateR0<int32>(0)));
     auto condition =
         module_->AddEmbeddedComputation(BuildConditionComputation(nested));
     auto body = module_->AddEmbeddedComputation(
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 51ecbccd494fced68d5e92eda752f5292580a190..2ca4af67cd55cfd01e952cf2306d5e475d7f4944 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -52,7 +52,6 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
         "//tensorflow/compiler/xla/service:algebraic_simplifier",
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:buffer_liveness",
@@ -69,9 +68,11 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_pass_pipeline",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service:hlo_proto_util",
+        "//tensorflow/compiler/xla/service:hlo_scheduling",
         "//tensorflow/compiler/xla/service:hlo_subcomputation_unification",
         "//tensorflow/compiler/xla/service:hlo_verifier",
         "//tensorflow/compiler/xla/service:inliner",
+        "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
         "//tensorflow/compiler/xla/service:transpose_folding",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",  # fixdeps: keep
@@ -151,9 +152,12 @@ cc_library(
 cc_library(
     name = "parallel_cpu_executable",
     srcs = ["parallel_cpu_executable.cc"],
-    hdrs = ["parallel_cpu_executable.h"],
+    hdrs = [
+        "parallel_cpu_executable.h",
+    ],
     deps = [
         ":cpu_runtime",
+        ":shape_partition",
         ":simple_orc_jit",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -177,7 +181,9 @@ cc_library(
 cc_library(
     name = "ir_emitter",
     srcs = ["ir_emitter.cc"],
-    hdrs = ["ir_emitter.h"],
+    hdrs = [
+        "ir_emitter.h",
+    ],
     deps = [
         ":cpu_runtime",
         ":dot_op_emitter",
@@ -191,7 +197,6 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_runtime_flags",
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:elemental_ir_emitter",
         "//tensorflow/compiler/xla/service:hlo",
@@ -222,8 +227,8 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_runtime_flags",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/service/llvm_ir:ir_array",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
@@ -283,8 +288,6 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/legacy_flags:compiler_functor_flags",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_runtime_flags",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
         "@llvm//:analysis",
@@ -325,15 +328,20 @@ cc_library(
     name = "cpu_runtime",
     srcs = [
         "cpu_runtime.cc",
-        "infeed_manager.cc",
+        "xfeed_manager.cc",
     ],
     hdrs = [
         "cpu_runtime.h",
-        "infeed_manager.h",
+        "xfeed_manager.h",
     ],
     copts = runtime_copts(),
     deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/core:lib",
     ],
 )
 
@@ -405,6 +413,7 @@ cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -413,9 +422,9 @@ cc_test(
 )
 
 cc_test(
-    name = "infeed_manager_test",
+    name = "xfeed_manager_test",
     size = "small",
-    srcs = ["infeed_manager_test.cc"],
+    srcs = ["xfeed_manager_test.cc"],
     deps = [
         ":cpu_runtime",
         "//tensorflow/core:lib",
@@ -437,10 +446,16 @@ cc_library(
 cc_library(
     name = "cpu_parallelization_preparation",
     srcs = ["cpu_parallelization_preparation.cc"],
-    hdrs = ["cpu_parallelization_preparation.h"],
+    hdrs = [
+        "cpu_parallelization_preparation.h",
+    ],
     deps = [
+        ":ir_emission_utils",
+        ":shape_partition",
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_cost_analysis",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/compiler/xla/service:logical_buffer",
         "//tensorflow/compiler/xla/service:tuple_points_to_analysis",
@@ -472,7 +487,6 @@ cc_library(
         ":cpu_runtime",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:window_util",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_runtime_flags",
         "//tensorflow/compiler/xla/service:hlo",
     ],
 )
@@ -499,9 +513,9 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_runtime_flags",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_pass",
+        "//tensorflow/core:lib",
     ],
 )
 
@@ -511,6 +525,7 @@ cc_test(
     srcs = ["conv_canonicalization_test.cc"],
     deps = [
         ":conv_canonicalization",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:hlo",
@@ -518,6 +533,26 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "shape_partition",
+    srcs = ["shape_partition.cc"],
+    hdrs = ["shape_partition.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+    ],
+)
+
+cc_test(
+    name = "shape_partition_test",
+    srcs = ["shape_partition_test.cc"],
+    deps = [
+        ":shape_partition",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test_main",
+    ],
+)
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
index 8ebf9ab110d080a017abb2077ac588672c8099bb..d86881c282488356f5c146467e5e41ecc5038511 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
@@ -35,8 +35,6 @@ limitations under the License.
 #include "external/llvm/include/llvm/Transforms/IPO.h"
 #include "external/llvm/include/llvm/Transforms/IPO/AlwaysInliner.h"
 #include "external/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h"
-#include "tensorflow/compiler/xla/legacy_flags/compiler_functor_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_runtime_flags.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h"
@@ -45,7 +43,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -66,14 +63,9 @@ operator()(llvm::Module& module) const {
 
   VLOG(2) << "IR before optimizations";
   XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(module));
-  legacy_flags::CompilerFunctorFlags* flags =
-      legacy_flags::GetCompilerFunctorFlags();
-  string dump_path = flags->xla_debug_cpu_dump_ir;
-  if (!dump_path.empty()) {
-    std::unique_ptr<tensorflow::WritableFile> f;
-    TF_CHECK_OK(tensorflow::Env::Default()->NewAppendableFile(dump_path, &f));
-    TF_CHECK_OK(f->Append(llvm_ir::DumpModuleToString(module)));
-    TF_CHECK_OK(f->Close());
+
+  if (pre_optimization_hook_) {
+    TF_CHECK_OK(pre_optimization_hook_(module));
   }
 
   // Build up optimization pipeline.
@@ -99,6 +91,10 @@ operator()(llvm::Module& module) const {
   VLOG(2) << "IR after optimizations";
   XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(module));
 
+  if (post_optimization_hook_) {
+    TF_CHECK_OK(post_optimization_hook_(module));
+  }
+
   // Generate code.
   llvm::MCContext* mc_context;
   llvm::legacy::PassManager codegen_passes;
@@ -135,33 +131,28 @@ std::vector<llvm::VecDesc> VectorFunctionsForTargetLibraryInfoImpl(
   std::vector<llvm::VecDesc> vector_functions;
 
   const llvm::VecDesc four_wide_vector_functions[] = {
-      {"expf", runtime::kExpV4F32, 4},
-      {"llvm.exp.f32", runtime::kExpV4F32, 4},
+      {"expf", runtime::kExpV4F32SymbolName, 4},
+      {"llvm.exp.f32", runtime::kExpV4F32SymbolName, 4},
 
-      {"logf", runtime::kLogV4F32, 4},
-      {"llvm.log.f32", runtime::kLogV4F32, 4},
+      {"logf", runtime::kLogV4F32SymbolName, 4},
+      {"llvm.log.f32", runtime::kLogV4F32SymbolName, 4},
 
-      {"tanhf", runtime::kTanhV4F32, 4},
-      {"llvm.tanh.f32", runtime::kTanhV4F32, 4},
+      {"tanhf", runtime::kTanhV4F32SymbolName, 4},
+      {"llvm.tanh.f32", runtime::kTanhV4F32SymbolName, 4},
   };
 
   const llvm::VecDesc eight_wide_vector_functions[] = {
-      {"expf", runtime::kExpV8F32, 8},
-      {"llvm.exp.f32", runtime::kExpV8F32, 8},
+      {"expf", runtime::kExpV8F32SymbolName, 8},
+      {"llvm.exp.f32", runtime::kExpV8F32SymbolName, 8},
 
-      {"logf", runtime::kLogV8F32, 8},
-      {"llvm.log.f32", runtime::kLogV8F32, 8},
+      {"logf", runtime::kLogV8F32SymbolName, 8},
+      {"llvm.log.f32", runtime::kLogV8F32SymbolName, 8},
 
-      {"tanhf", runtime::kTanhV8F32, 8},
-      {"llvm.tanh.f32", runtime::kTanhV8F32, 8},
+      {"tanhf", runtime::kTanhV8F32SymbolName, 8},
+      {"llvm.tanh.f32", runtime::kTanhV8F32SymbolName, 8},
   };
 
-  // Our vectorized library calls are currently implement by calling into Eigen.
-  // As such, only emit calls to these routines if --xla_cpu_use_eigen is
-  // enabled.
-  legacy_flags::CpuRuntimeFlags* flags = legacy_flags::GetCpuRuntimeFlags();
-  if (flags->xla_cpu_use_eigen &&
-      (arch == llvm::Triple::x86 || llvm::Triple::x86_64)) {
+  if (arch == llvm::Triple::x86 || llvm::Triple::x86_64) {
     llvm::SmallVector<llvm::StringRef, 32> features;
     feature_string.split(features, ',', -1, /*KeepEmpty=*/false);
     if (std::find(features.begin(), features.end(), "+sse4.1") !=
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.h b/tensorflow/compiler/xla/service/cpu/compiler_functor.h
index 17dadebe975b936b7d5d7a78ac69b890d9c8e7ac..a5358076b7f543948e0957767dfda1be43e07611 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.h
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.h
@@ -39,13 +39,22 @@ class CompilerFunctor {
   // Returns a VectorIntrinsics where all intrinsics are available.
   static VectorIntrinsics AllIntrinsics();
 
+  // A callback of this type can be run before and/or after IR-level
+  // optimization to e.g. dump out the generated IR to disk or gather some
+  // statistics.
+  using ModuleHook = std::function<Status(const llvm::Module&)>;
+
   explicit CompilerFunctor(llvm::TargetMachine* target_machine,
                            const Disassembler* disassembler, int opt_level,
-                           const VectorIntrinsics& available_intrinsics)
+                           const VectorIntrinsics& available_intrinsics,
+                           ModuleHook pre_optimization_hook = nullptr,
+                           ModuleHook post_optimization_hook = nullptr)
       : target_machine_(target_machine),
         disassembler_(CHECK_NOTNULL(disassembler)),
         opt_level_(opt_level),
-        available_intrinsics_(available_intrinsics) {}
+        available_intrinsics_(available_intrinsics),
+        pre_optimization_hook_(pre_optimization_hook),
+        post_optimization_hook_(post_optimization_hook) {}
 
   // Compile a Module to an ObjectFile.
   llvm::object::OwningBinary<llvm::object::ObjectFile> operator()(
@@ -61,6 +70,8 @@ class CompilerFunctor {
   const Disassembler* disassembler_;
   const unsigned opt_level_;
   const VectorIntrinsics available_intrinsics_;
+  ModuleHook pre_optimization_hook_;
+  ModuleHook post_optimization_hook_;
 };
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
index cdf43587b683e4e22d14d4fc08fa3705bc636de8..069979c6611e90ed2d95cbbe341198577cdf56cf 100644
--- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
+++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/conv_canonicalization.h"
 
-#include "tensorflow/compiler/xla/legacy_flags/cpu_runtime_flags.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -30,11 +29,6 @@ namespace xla {
 namespace cpu {
 
 StatusOr<bool> ConvCanonicalization::Run(HloModule* module) {
-  legacy_flags::CpuRuntimeFlags* flags = legacy_flags::GetCpuRuntimeFlags();
-  if (!flags->xla_cpu_use_eigen) {
-    return false;
-  }
-
   bool changed = false;
   for (HloInstruction* hlo :
        module->entry_computation()->MakeInstructionPostOrder()) {
diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
index f5ad431277d94039cd20cf51e0932413e87a0436..ec992f15e63b29ee67d16b6d841fedffd9c90f5b 100644
--- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
@@ -59,11 +59,11 @@ TEST_F(ConvCanonicalizationTest, NonCanonicalToCanonical) {
   auto builder = HloComputation::Builder(TestName());
   // The input dimensions are in CNHW order.
   auto input = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR4FromArray4D(Array4D<float>(
+      Literal::CreateR4FromArray4D(Array4D<float>(
           kInputFeatureCount, kBatchSize, kInputSize, kInputSize))));
   // The kernel dimensions are in OIHW order.
   auto kernel = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR4FromArray4D(Array4D<float>(
+      Literal::CreateR4FromArray4D(Array4D<float>(
           kOutputFeatureCount, kInputFeatureCount, kWindowSize, kWindowSize))));
 
   ConvolutionDimensionNumbers dnums;
@@ -113,11 +113,11 @@ TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) {
   auto builder = HloComputation::Builder(TestName());
   // The input dimensions are in NHWC order.
   auto input = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR4FromArray4D(Array4D<float>(
+      Literal::CreateR4FromArray4D(Array4D<float>(
           kBatchSize, kInputSize, kInputSize, kInputFeatureCount))));
   // The kernel dimensions are in HWIO order.
   auto kernel = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR4FromArray4D(Array4D<float>(
+      Literal::CreateR4FromArray4D(Array4D<float>(
           kWindowSize, kWindowSize, kInputFeatureCount, kOutputFeatureCount))));
 
   ConvolutionDimensionNumbers dnums;
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 34b99f2440b935402283d76d4a09475f4bfcb315..f115236ee71125e23341c905c01eb39fd77cb210 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -37,7 +37,6 @@ limitations under the License.
 #include "external/llvm/include/llvm/Support/TargetSelect.h"
 #include "external/llvm/include/llvm/Target/TargetMachine.h"
 #include "external/llvm/include/llvm/Target/TargetOptions.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
@@ -70,10 +69,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
 #include "tensorflow/compiler/xla/service/hlo_proto_util.h"
+#include "tensorflow/compiler/xla/service/hlo_scheduling.h"
 #include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/service/inliner.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -81,7 +82,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
 
 namespace se = ::perftools::gputools;
 
@@ -166,7 +170,7 @@ void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) {
     fake_argv_storage.push_back("");
     for (const auto& it : options) {
       // Skip options the XLA backend itself consumes.
-      if (it.first != kXlaParallelCpuOption) {
+      if (!tensorflow::StringPiece(it.first).starts_with("xla_")) {
         if (it.second.empty()) {
           fake_argv_storage.push_back(it.first);
         } else {
@@ -245,19 +249,23 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault {
 };
 }  // namespace
 
-Status CpuCompiler::RunHloPasses(HloModule* module, HloDumper dump_hlo) {
+Status CpuCompiler::RunHloPasses(HloModule* module) {
   // Optimization pipeline.
-  HloPassPipeline pipeline("CPU", dump_hlo);
+  HloPassPipeline pipeline("CPU");
   pipeline.AddInvariantChecker<HloVerifier>();
 
+  ReducePrecisionInsertion::AddPasses(
+      &pipeline, module->config().debug_options(),
+      HloReducePrecisionOptions::BEFORE_OP_FUSION);
+
   // TODO(b/35786417): Re-enable inliner pass after fixing the bug and deciding
   // where we will take this pass in future.
   // pipeline.AddPass<Inliner>();
 
   pipeline.AddPass<ConvCanonicalization>();
   {
-    auto& pass = pipeline.AddPass<HloPassFix<HloPassPipeline>>("simplification",
-                                                               dump_hlo);
+    auto& pass =
+        pipeline.AddPass<HloPassFix<HloPassPipeline>>("simplification");
     pass.AddPass<AlgebraicSimplifier>(
         /*is_layout_sensitive=*/false,
         [](const Shape&, const Shape&) { return false; },
@@ -275,6 +283,11 @@ Status CpuCompiler::RunHloPasses(HloModule* module, HloDumper dump_hlo) {
       TransposeFolding::NeverFoldTranspose);
   pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/false);
   pipeline.AddPass<CpuInstructionFusion>();
+
+  ReducePrecisionInsertion::AddPasses(
+      &pipeline, module->config().debug_options(),
+      HloReducePrecisionOptions::AFTER_OP_FUSION);
+
   pipeline.AddPass<CpuLayoutAssignment>(
       module->mutable_entry_computation_layout());
   // The LayoutAssignment pass may leave behind kCopy instructions which are
@@ -285,8 +298,13 @@ Status CpuCompiler::RunHloPasses(HloModule* module, HloDumper dump_hlo) {
       /*enable_dot_simplification=*/false);
   pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/true);
   // Outline ops in the entry computation into calls to subcomputations.
+  const int max_parallelism =
+      module->config().intra_op_parallelism_threads() > 0
+          ? module->config().intra_op_parallelism_threads()
+          : tensorflow::port::NumSchedulableCPUs();
   if (CpuParallelBackendRequested(module->config())) {
-    pipeline.AddPass<ParallelizationPreparation>();
+    pipeline.AddPass<ParallelizationPreparation>(max_parallelism,
+                                                 ShapeSizeBytesFunction());
   }
   // Copy insertion should be performed immediately before IR emission to avoid
   // inserting unnecessary copies (later pass adds an instruction which
@@ -299,7 +317,8 @@ Status CpuCompiler::RunHloPasses(HloModule* module, HloDumper dump_hlo) {
   if (CpuParallelBackendRequested(module->config())) {
     // Re-run the outlining, in case any copies were inserted into the entry
     // computation.
-    pipeline.AddPass<ParallelizationPreparation>();
+    pipeline.AddPass<ParallelizationPreparation>(max_parallelism,
+                                                 ShapeSizeBytesFunction());
   }
   pipeline.AddPass<HloDCE>();
   pipeline.AddPass<FlattenCallGraph>();
@@ -310,6 +329,7 @@ namespace {
 
 // Align buffers to 16-byte boundaries.
 constexpr int64 kMemoryAlignment = 16;
+auto memory_alignment = [](LogicalBuffer::Color) { return kMemoryAlignment; };
 
 llvm::TargetOptions CompilerTargetOptions(
     const HloModuleConfig& module_config) {
@@ -338,25 +358,83 @@ llvm::CodeGenOpt::Level CodeGenOptLevel(const HloModuleConfig& module_config) {
   }
 }
 
+Status AppendIRToFile(const string& file_name, const string& ir_module_string) {
+  std::unique_ptr<tensorflow::WritableFile> f;
+  TF_RETURN_IF_ERROR(
+      tensorflow::Env::Default()->NewAppendableFile(file_name, &f));
+  TF_RETURN_IF_ERROR(f->Append(ir_module_string));
+  TF_RETURN_IF_ERROR(f->Close());
+  return Status::OK();
+}
+
+Status InitializeIRDumpHooks(
+    const HloModule& module,
+    CompilerFunctor::ModuleHook* pre_optimization_ir_dump_hook,
+    CompilerFunctor::ModuleHook* post_optimization_ir_dump_hook) {
+  const string& dump_ir_to = module.config().debug_options().xla_dump_ir_to();
+  if (dump_ir_to.empty()) {
+    return Status::OK();
+  }
+
+  TF_RETURN_IF_ERROR(
+      tensorflow::Env::Default()->RecursivelyCreateDir(dump_ir_to));
+
+  string safe_file_name_base = module.name();
+  std::replace_if(safe_file_name_base.begin(), safe_file_name_base.end(),
+                  [](char c) { return c == '/' || c == '\\'; }, '_');
+
+  string unoptimized_ir_file_name = tensorflow::io::JoinPath(
+      dump_ir_to,
+      tensorflow::strings::StrCat("ir-", safe_file_name_base, "-no-opt.ll"));
+  string optimized_ir_file_name = tensorflow::io::JoinPath(
+      dump_ir_to,
+      tensorflow::strings::StrCat("ir-", safe_file_name_base, "-opt.ll"));
+
+  // We still want to append to avoid overwriting possibly important information
+  // due to operator error.
+
+  *pre_optimization_ir_dump_hook =
+      [unoptimized_ir_file_name](const llvm::Module& module) {
+        return AppendIRToFile(unoptimized_ir_file_name,
+                              llvm_ir::DumpModuleToString(module));
+      };
+
+  *post_optimization_ir_dump_hook =
+      [optimized_ir_file_name](const llvm::Module& module) {
+        return AppendIRToFile(optimized_ir_file_name,
+                              llvm_ir::DumpModuleToString(module));
+      };
+
+  return Status::OK();
+}
+
 }  // namespace
 
 StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
-    std::unique_ptr<HloModule> module, HloDumper dump_hlo,
-    se::StreamExecutor* stream_exec) {
+    std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec) {
+  VLOG(1) << "Compiling: " << module->name();
   TF_RET_CHECK(stream_exec != nullptr);
   std::call_once(llvm_command_line_options_initialized,
                  &InitializeLLVMCommandLineOptions, module->config());
 
+  CompilerFunctor::ModuleHook pre_optimization_ir_dump_hook;
+  CompilerFunctor::ModuleHook post_optimization_ir_dump_hook;
+  TF_RETURN_IF_ERROR(InitializeIRDumpHooks(*module,
+                                           &pre_optimization_ir_dump_hook,
+                                           &post_optimization_ir_dump_hook));
+
   // Compile must be thread-safe so create a new LLVM context for the module.
   auto llvm_context = MakeUnique<llvm::LLVMContext>();
   auto llvm_module =
       MakeUnique<llvm::Module>("__compute_module", *llvm_context);
   auto jit = MakeUnique<SimpleOrcJIT>(CompilerTargetOptions(module->config()),
-                                      CodeGenOptLevel(module->config()));
+                                      CodeGenOptLevel(module->config()),
+                                      pre_optimization_ir_dump_hook,
+                                      post_optimization_ir_dump_hook);
   llvm_module->setDataLayout(jit->data_layout());
   llvm_module->setTargetTriple(jit->target_triple().getTriple());
 
-  TF_RETURN_IF_ERROR(RunHloPasses(module.get(), dump_hlo));
+  TF_RETURN_IF_ERROR(RunHloPasses(module.get()));
 
   HloComputation* computation = module->entry_computation();
   std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx;
@@ -367,8 +445,17 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
   }
 
   std::unique_ptr<Executable> cpu_executable;
-  legacy_flags::CpuCompilerFlags* flags = legacy_flags::GetCpuCompilerFlags();
+
+  // Cache these flags here since we'll want to access them after the module's
+  // ownership is std::moved.
+  const bool embed_ir_in_executable =
+      module->config().debug_options().xla_embed_ir_in_executable();
+  const string dump_debug_json_to =
+      module->config().debug_options().xla_dump_debug_json_to();
+
   if (CpuParallelBackendRequested(module->config())) {
+    VLOG(1) << "Using parallel cpu backend";
+
     // Run buffer analysis on the HLO graph. This analysis figures out which
     // temporary buffers are required to run the computation.
     // DependencyHloOrdering is used for the parallel emitter because the order
@@ -379,12 +466,12 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
         std::unique_ptr<BufferAssignment> assignment,
         BufferAssigner::Run(module.get(),
                             MakeUnique<DependencyHloOrdering>(module.get()),
-                            BufferSizeBytesFunction(), kMemoryAlignment));
+                            BufferSizeBytesFunction(), memory_alignment));
 
-    if (!flags->xla_cpu_dump_debug_json_to.empty()) {
+    if (!dump_debug_json_to.empty()) {
       HloProto proto = MakeHloProto(*module, *assignment);
       TF_RETURN_IF_ERROR(protobuf_util::DumpJsonToDirectory(
-          proto, flags->xla_cpu_dump_debug_json_to, module->name()));
+          proto, dump_debug_json_to, module->name()));
     }
 
     // If we are using the parallel CPU backend, we need to create map from
@@ -400,7 +487,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
       if (instruction->opcode() == HloOpcode::kConstant) {
         // Copy the constant out of the ProtocolBuffer so that we can give it a
         // higher alignment.
-        const void* data = LiteralUtil::InternalData(instruction->literal());
+        const void* data = instruction->literal().InternalData();
         int64 size = CpuExecutable::ShapeSizeBytes(instruction->shape());
         auto iter = aligned_constants.emplace(
             instruction, MakeUnique<unsigned char[]>(size));
@@ -418,11 +505,15 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     }
 
     IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
-                         &hlo_to_profile_idx);
+                         &hlo_to_profile_idx, jit->target_machine());
+
     std::unique_ptr<std::map<HloInstruction*, string>> function_names(
         new std::map<HloInstruction*, string>());
     for (auto embedded_computation :
          computation->MakeEmbeddedComputationsList()) {
+      if (embedded_computation->IsFusionComputation()) {
+        continue;
+      }
       auto parallel_computation_iter =
           parallel_computations.find(embedded_computation);
       // All parallel computations are considered to be an entry computation for
@@ -446,7 +537,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     }
 
     string ir_module_string;
-    if (flags->xla_cpu_embed_ir) {
+    if (embed_ir_in_executable) {
       ir_module_string = llvm_ir::DumpModuleToString(*llvm_module);
     }
 
@@ -457,11 +548,13 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
         std::move(function_names), std::move(hlo_to_profile_idx),
         std::move(aligned_constants)));
 
-    if (flags->xla_cpu_embed_ir) {
+    if (embed_ir_in_executable) {
       static_cast<CpuExecutable&>(*cpu_executable)
           .set_ir_module_string(ir_module_string);
     }
   } else {
+    VLOG(1) << "Using sequential cpu backend";
+
     // Select an order for emitting the HLO instructions for each
     // computation. Using this sequence enables tighter buffer liveness analysis
     // and reduced memory usage (as compared to using DependencyHloOrdering).
@@ -476,12 +569,12 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
         BufferAssigner::Run(
             module.get(),
             MakeUnique<SequentialHloOrdering>(module.get(), module_sequence),
-            BufferSizeBytesFunction(), kMemoryAlignment));
+            BufferSizeBytesFunction(), memory_alignment));
 
-    if (!flags->xla_cpu_dump_debug_json_to.empty()) {
+    if (!dump_debug_json_to.empty()) {
       HloProto proto = MakeHloProto(*module, *assignment);
       TF_RETURN_IF_ERROR(protobuf_util::DumpJsonToDirectory(
-          proto, flags->xla_cpu_dump_debug_json_to, module->name()));
+          proto, dump_debug_json_to, module->name()));
     }
 
     // Each computation is a single function.  Emit all embedded computations
@@ -489,9 +582,13 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     // GetEmbeddedComputations guarantees that a called computation occurs
     // before a caller computation.
     IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
-                         &hlo_to_profile_idx);
+                         &hlo_to_profile_idx, jit->target_machine());
+
     for (auto embedded_computation :
          computation->MakeEmbeddedComputationsList()) {
+      if (embedded_computation->IsFusionComputation()) {
+        continue;
+      }
       TF_RETURN_IF_ERROR(
           ir_emitter
               .EmitComputation(embedded_computation,
@@ -510,7 +607,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
 
     string function_name = llvm_ir::AsString(entry_function->getName());
     string ir_module_string;
-    if (flags->xla_cpu_embed_ir) {
+    if (embed_ir_in_executable) {
       ir_module_string = llvm_ir::DumpModuleToString(*llvm_module);
     }
 
@@ -520,17 +617,18 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
         std::move(jit), std::move(assignment), std::move(module), function_name,
         std::move(hlo_to_profile_idx)));
 
-    if (flags->xla_cpu_embed_ir) {
+    if (embed_ir_in_executable) {
       static_cast<CpuExecutable&>(*cpu_executable)
           .set_ir_module_string(ir_module_string);
     }
   }
 
+  VLOG(1) << "Compilation finished";
   return std::move(cpu_executable);
 }
 
 StatusOr<std::vector<std::unique_ptr<Executable>>> CpuCompiler::Compile(
-    std::vector<std::unique_ptr<HloModule>> modules, HloDumper dump_hlos,
+    std::vector<std::unique_ptr<HloModule>> modules,
     std::vector<se::StreamExecutor*> stream_execs) {
   return Unimplemented(
       "Compilation of multiple HLO modules is not yet supported on CPU.");
@@ -538,7 +636,6 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> CpuCompiler::Compile(
 
 StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
 CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
-                                HloDumper dump_hlo,
                                 const AotCompilationOptions& aot_options) {
   TF_RET_CHECK(!modules.empty());
   std::call_once(llvm_command_line_options_initialized,
@@ -627,8 +724,9 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
   std::vector<std::unique_ptr<AotCompilationResult>> results;
   for (size_t i = 0; i < modules.size(); ++i) {
     HloModule* module = modules[i].get();
+    VLOG(1) << "Compiling ahead-of-time: " << module->name();
 
-    TF_RETURN_IF_ERROR(RunHloPasses(module, dump_hlo));
+    TF_RETURN_IF_ERROR(RunHloPasses(module));
 
     TF_ASSIGN_OR_RETURN(
         SequentialHloOrdering::HloModuleSequence module_sequence,
@@ -640,20 +738,24 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
         std::unique_ptr<BufferAssignment> assignment,
         BufferAssigner::Run(
             module, MakeUnique<SequentialHloOrdering>(module, module_sequence),
-            BufferSizeBytesFunction(), kMemoryAlignment));
+            BufferSizeBytesFunction(), memory_alignment));
 
-    legacy_flags::CpuCompilerFlags* flags = legacy_flags::GetCpuCompilerFlags();
-    if (!flags->xla_cpu_dump_debug_json_to.empty()) {
+    const string dump_debug_json_to =
+        module->config().debug_options().xla_dump_debug_json_to();
+    if (!dump_debug_json_to.empty()) {
       HloProto proto = MakeHloProto(*module, *assignment);
       TF_RETURN_IF_ERROR(protobuf_util::DumpJsonToDirectory(
-          proto, flags->xla_cpu_dump_debug_json_to, module->name()));
+          proto, dump_debug_json_to, module->name()));
     }
 
     IrEmitter ir_emitter(*module, *assignment, &llvm_module,
-                         /*hlo_to_profile_idx=*/nullptr);
+                         /*hlo_to_profile_idx=*/nullptr, target_machine.get());
     HloComputation* computation = module->entry_computation();
     for (auto embedded_computation :
          computation->MakeEmbeddedComputationsList()) {
+      if (embedded_computation->IsFusionComputation()) {
+        continue;
+      }
       TF_RETURN_IF_ERROR(
           ir_emitter
               .EmitComputation(embedded_computation,
@@ -671,10 +773,17 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
 
     entry_function->setName(llvm_ir::AsStringRef(entry_point_name));
 
+    CompilerFunctor::ModuleHook pre_optimization_ir_dump_hook;
+    CompilerFunctor::ModuleHook post_optimization_ir_dump_hook;
+    TF_RETURN_IF_ERROR(InitializeIRDumpHooks(*module,
+                                             &pre_optimization_ir_dump_hook,
+                                             &post_optimization_ir_dump_hook));
+
     Disassembler disassembler(*target_machine);
-    CompilerFunctor compiler_functor(target_machine.get(), &disassembler,
-                                     opt_level,
-                                     CompilerFunctor::AllIntrinsics());
+    CompilerFunctor compiler_functor(
+        target_machine.get(), &disassembler, opt_level,
+        CompilerFunctor::AllIntrinsics(), pre_optimization_ir_dump_hook,
+        post_optimization_ir_dump_hook);
     llvm::object::OwningBinary<llvm::object::ObjectFile> object_file =
         compiler_functor(llvm_module);
     llvm::StringRef object_file_data_ref = object_file.getBinary()->getData();
@@ -704,6 +813,8 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
         std::move(object_file_data), std::move(buffer_sizes),
         result_slice.index()));
   }
+
+  VLOG(1) << "Compilation finished";
   return std::move(results);
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
index 29fa4eac61beaa25e1662b1be5afa9757ab077ea..b82e181df2b883ddac7e7d39212fb28b07ca7b0c 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
@@ -110,16 +110,15 @@ class CpuCompiler : public Compiler {
   ~CpuCompiler() override {}
 
   StatusOr<std::unique_ptr<Executable>> Compile(
-      std::unique_ptr<HloModule> module, HloDumper dump_hlo,
+      std::unique_ptr<HloModule> module,
       perftools::gputools::StreamExecutor* stream_exec) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
-      std::vector<std::unique_ptr<HloModule>> modules, HloDumper dump_hlo,
+      std::vector<std::unique_ptr<HloModule>> modules,
       std::vector<perftools::gputools::StreamExecutor*> stream_exec) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
-                     HloDumper dump_hlo,
                      const AotCompilationOptions& options) override;
 
   perftools::gputools::Platform::Id PlatformId() const override;
@@ -132,7 +131,7 @@ class CpuCompiler : public Compiler {
 
   // Runs the HLO passes which are necessary for both optimizations and
   // correctness.
-  Status RunHloPasses(HloModule* hlo_module, HloDumper dump_hlo);
+  Status RunHloPasses(HloModule* module);
 
   TF_DISALLOW_COPY_AND_ASSIGN(CpuCompiler);
 };
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 671d6957a39c068c416cd4fa3739f05c9ddb3baa..8787336ed0755647dd9ffbc68484d4cb9cef4790 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -66,7 +66,8 @@ CpuExecutable::CpuExecutable(
   CHECK(sym) << "Symbol " << entry_function_name << " not found.";
   // getAddress can do work under the hood in the jit, so it needs to be
   // guarded by the mutex.
-  compute_function_ = reinterpret_cast<ComputeFunctionType>(sym.getAddress());
+  compute_function_ =
+      reinterpret_cast<ComputeFunctionType>(cantFail(sym.getAddress()));
 }
 
 // Given a pointer to an output buffer (following the CPU JIT calling
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index b5746769ba7e4bb2593bab7abc24f1a75a083d80..4b42530c09dbf2ff4aa767e398535e4d55cc4673 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -78,6 +78,11 @@ class CpuExecutable : public Executable {
     ir_module_string_ = ir_module_string;
   }
 
+  const Status EqualOrFail(const Executable& executable) {
+    // TODO(b/62952745) Implement equality test on CPU executable.
+    return Unimplemented("Equality test on CPU executable is not implemented.");
+  }
+
   static int64 ShapeSizeBytes(const Shape& shape);
 
  private:
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc b/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc
index f6b1dcae75a773811f8c652dea36b7f3ca36e901..4d0e0f744ac4b02f7c4a74c5a341d6b9ce937967 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc
@@ -15,19 +15,28 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h"
 
+#include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 namespace cpu {
 
 StatusOr<bool> ParallelizationPreparation::Run(HloModule* module) {
+  XLA_VLOG_LINES(2, "ParallelizationPreparation ENTRY");
+  XLA_VLOG_LINES(2, module->ToString());
+
   bool changed = false;
+  TF_ASSIGN_OR_RETURN(changed, RunParallelTaskAssignment(module));
+
   HloComputation* entry_computation = module->entry_computation();
   std::unordered_set<HloInstruction*> outlined;
   std::vector<HloInstruction*> instructions_to_outline;
@@ -44,13 +53,21 @@ StatusOr<bool> ParallelizationPreparation::Run(HloModule* module) {
         instruction->opcode() == HloOpcode::kConstant) {
       continue;
     }
+
+    // Outline 'instruction' in isolation if it was assigned parallel tasks.
+    if (OutlineParallelizableInstruction(instruction)) {
+      outlined.insert(instruction);
+      changed = true;
+      continue;
+    }
+
     instructions_to_outline.clear();
     HloInstruction* outline_candidate = instruction;
     instructions_to_outline.push_back(outline_candidate);
     bool all_bitcasts = outline_candidate->opcode() == HloOpcode::kBitcast;
 
     // Outline sole users with the current instruction.
-    while (outline_candidate->users().size() == 1) {
+    while (CanOutlineWithUser(outline_candidate)) {
       HloInstruction* prior_candidate = outline_candidate;
       outline_candidate = *outline_candidate->users().begin();
       all_bitcasts |= outline_candidate->opcode() == HloOpcode::kBitcast;
@@ -108,6 +125,9 @@ StatusOr<bool> ParallelizationPreparation::Run(HloModule* module) {
   TF_ASSIGN_OR_RETURN(auto points_to_analysis,
                       TuplePointsToAnalysis::Run(module));
   for (auto& computation : module->computations()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
     HloInstruction* root = computation->root_instruction();
     // Copy root instruction if it does not define its own top-level buffer.
     // TODO(b/32885001) Remove these copies (at least for the unambiguous case).
@@ -120,8 +140,136 @@ StatusOr<bool> ParallelizationPreparation::Run(HloModule* module) {
       changed = true;
     }
   }
+
+  XLA_VLOG_LINES(2, "ParallelizationPreparation EXIT");
+  XLA_VLOG_LINES(2, module->ToString());
   return changed;
 }
 
+StatusOr<bool> ParallelizationPreparation::RunParallelTaskAssignment(
+    HloModule* module) {
+  VLOG(1) << "RunParallelTaskAssignment max_parallelism_: " << max_parallelism_;
+  bool changed = false;
+  // Run cost analysis on entry computation.
+  HloCostAnalysis cost_analysis(shape_size_);
+  HloComputation* computation = module->entry_computation();
+  Status cost_status = computation->root_instruction()->Accept(&cost_analysis);
+  for (auto& instruction : computation->instructions()) {
+    // Currently, we do not assign parallel tasks to instructions with at least
+    // one of the following properties:
+    // *) Internal threading (library calls to kConv, kDot, and kCustomCall).
+    // *) Emit custom loops (kSelectAndScatter, FusionKind::kTransposeDot).
+    // *) Tuple-shaped.
+    // TODO(b/27458679) Parallelize instructions which are skipped here.
+    if (instruction->opcode() == HloOpcode::kParameter ||
+        instruction->opcode() == HloOpcode::kConstant ||
+        instruction->opcode() == HloOpcode::kCall ||
+        instruction->opcode() == HloOpcode::kCustomCall ||
+        instruction->opcode() == HloOpcode::kSelectAndScatter ||
+        (instruction->opcode() == HloOpcode::kConvolution &&
+         PotentiallyImplementedAsEigenConvolution(*instruction)) ||
+        PotentiallyImplementedAsEigenDot(*instruction) ||
+        (instruction->opcode() == HloOpcode::kFusion &&
+         instruction->fusion_kind() != HloInstruction::FusionKind::kLoop) ||
+        ShapeUtil::IsTuple(instruction->shape())) {
+      continue;
+    }
+
+    // Calculate target parallel task count in [1, max_parallelism_].
+    const int64 target_parallel_task_count = GetTargetParallelTaskCount(
+        cost_status.ok() ? &cost_analysis : nullptr, instruction.get());
+    if (target_parallel_task_count == 1) {
+      continue;
+    }
+
+    // Assign feasible dimension partitions (based on actual dimension sizes).
+    auto dim_partition_counts = ShapePartitionAssigner(instruction->shape())
+                                    .Run(target_parallel_task_count);
+    const int64 total_partition_count =
+        ShapePartitionAssigner::GetTotalPartitionCount(dim_partition_counts);
+    if (total_partition_count <= 1) {
+      // Feasible partition calculation resulting in no partitioning, so skip.
+      continue;
+    }
+    VLOG(2) << "Assigning parallel task count: " << total_partition_count
+            << " to instruction: " << instruction->name();
+    // Map 'instruction' to assigned dimension partitioning.
+    instruction->set_outer_dimension_partitions(dim_partition_counts);
+  }
+
+  return changed;
+}
+
+int64 ParallelizationPreparation::GetTargetParallelTaskCount(
+    const HloCostAnalysis* cost_analysis, HloInstruction* instruction) {
+  // Default to a simple cost model based on hlo size and typical L2 cache size.
+  // Note that 'cost_analysis' can be 'nullptr' if HloCostAnalysis returns an
+  // error status (likely because HLOs like CustomCall are not yet implemented
+  // in the HloCostAnalysis).
+  int64 instruction_cost = shape_size_(instruction->shape());
+  int64 min_cost_per_thread = 256LL << 10;  // 256KB L2 Cache size.
+  if (cost_analysis != nullptr) {
+    // Calculate the instruction cost in cycles.
+    // TODO(29630486) Improve on this linear cost model.
+    // Consider making 'min_cost_per_thread' be a function of the target
+    // bandwidth limit for instructions with low arithmetic complexity.
+    instruction_cost = 1 * cost_analysis->flop_count(*instruction) +
+                       2 * cost_analysis->transcendental_count(*instruction) +
+                       10 * cost_analysis->bytes_accessed(*instruction);
+    // Minimum per-thread cost is 100us of work on a 2GHz core.
+    min_cost_per_thread = 100000;
+  }
+  // Return target parallel task count in [1, max_parallelism_].
+  return std::min(max_parallelism_,
+                  std::max(1LL, instruction_cost / min_cost_per_thread));
+}
+
+bool ParallelizationPreparation::OutlineParallelizableInstruction(
+    HloInstruction* instruction) {
+  if (instruction->outer_dimension_partitions().empty()) {
+    return false;
+  }
+  // Store dimension partition counts before outlining (which clones
+  // 'instruction').
+  std::vector<int64> dim_partition_counts =
+      instruction->outer_dimension_partitions();
+  // Outline 'instruction' in its own sub-computation.
+  HloModule* module = instruction->parent()->parent();
+  auto* call = module->OutlineExpressionFromComputation(
+      {instruction}, tensorflow::strings::StrCat("pp_", instruction->name()),
+      module->entry_computation());
+  // Map previously assigned 'dim_partition_counts' to cloned root instruction.
+  VLOG(1) << "Outlining parallelizable"
+          << " caller: " << call->name()
+          << " callee: " << call->to_apply()->root_instruction()->name();
+  call->to_apply()->root_instruction()->set_outer_dimension_partitions(
+      dim_partition_counts);
+  return true;
+}
+
+bool ParallelizationPreparation::CanOutlineWithUser(
+    HloInstruction* instruction) {
+  if (instruction->users().size() != 1) {
+    // Do not outline 'instruction' with multiple users.
+    return false;
+  }
+  if (AssignedParallelTasks(instruction) ||
+      AssignedParallelTasks(*instruction->users().begin())) {
+    // Do not outline if 'instruction' (or user) were assigned parallel tasks.
+    return false;
+  }
+  return true;
+}
+
+bool ParallelizationPreparation::AssignedParallelTasks(
+    HloInstruction* instruction) {
+  return !instruction->outer_dimension_partitions().empty() ||
+         (instruction->opcode() == HloOpcode::kCall &&
+          !instruction->to_apply()
+               ->root_instruction()
+               ->outer_dimension_partitions()
+               .empty());
+}
+
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h b/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h
index 62999f5686db2e4db3ace0c5580bd156edbfa994..d53fc461509cad51778dba37922212731236952f 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_PARALLELIZATION_PREPARATION_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_PARALLELIZATION_PREPARATION_H_
 
+#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 
@@ -32,14 +33,51 @@ namespace cpu {
 // handle While constructs.
 class ParallelizationPreparation : public HloPassInterface {
  public:
+  // 'max_parallelism': the maximum parallel task count per instruction.
+  // 'shape_size': shape size function used by HloCostAnalysis during parallel
+  //               task assignment.
+  ParallelizationPreparation(
+      const int64 max_parallelism,
+      const HloCostAnalysis::ShapeSizeFunction& shape_size)
+      : max_parallelism_(max_parallelism), shape_size_(shape_size) {}
   ~ParallelizationPreparation() override {}
+
   tensorflow::StringPiece name() const override {
     return "cpu-parallel-prepare";
   }
 
-  // Run instruction fusion on the given computation. Returns whether the
+  // Run parallel preparation on the given computation. Returns whether the
   // computation was changed.
   StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  // Assigns parallel task partitions to conformant instructions in 'module'.
+  // Returns true on success or error status otherwise.
+  StatusOr<bool> RunParallelTaskAssignment(HloModule* module);
+
+  // Returns the target parallel task count for 'instruction'.
+  // Utilizes 'cost_analysis' if non-null.
+  // Otherwise defaults to a simple HLO output size-based cost model.
+  int64 GetTargetParallelTaskCount(const HloCostAnalysis* cost_analysis,
+                                   HloInstruction* instruction);
+
+  // Outlines 'instruction' from entry computation, if it had
+  // been assigned parallel tasks in an earlier pass through the computation.
+  // Returns true if 'instruction' was successfully outlined, false otherwise.
+  bool OutlineParallelizableInstruction(HloInstruction* instruction);
+
+  // Returns true if 'instruction' can be outlined into the same sub-computation
+  // with its single user (parallelizable instructions are not outlined with
+  // each other). Returns false otherwise.
+  bool CanOutlineWithUser(HloInstruction* instruction);
+
+  // Returns true if 'instruction' (or the root of the sub-computation that
+  // 'instruction' calls) has had parallel tasks assigned in earlier pass.
+  // Returns false otherwise.
+  bool AssignedParallelTasks(HloInstruction* instruction);
+
+  const int64 max_parallelism_;
+  const HloCostAnalysis::ShapeSizeFunction shape_size_;
 };
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
index 253de20f25127bf0ac23d5969e0f16c143396e47..5d6efa535958a2757a22f633aa41d08ca712cb5d 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -17,35 +17,118 @@ limitations under the License.
 
 #include <functional>
 
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace xla {
 namespace cpu {
 namespace runtime {
 
-InfeedManager* GetInfeedManager() {
-  static InfeedManager* manager = new InfeedManager;
+XfeedManager* GetXfeedManager() {
+  static XfeedManager* manager = new XfeedManager;
   return manager;
 }
 
+extern const char* const kEigenMatMulF32SymbolName =
+    "__xla_cpu_runtime_EigenMatMulF32";
+extern const char* const kEigenMatMulF64SymbolName =
+    "__xla_cpu_runtime_EigenMatMulF64";
+extern const char* const kEigenConvF32SymbolName =
+    "__xla_cpu_runtime_EigenConvF32";
+extern const char* const kEigenSingleThreadedMatMulF32SymbolName =
+    "__xla_cpu_runtime_EigenSingleThreadedMatMulF32";
+extern const char* const kEigenSingleThreadedMatMulF64SymbolName =
+    "__xla_cpu_runtime_EigenSingleThreadedMatMulF64";
+extern const char* const kEigenSingleThreadedConvF32SymbolName =
+    "__xla_cpu_runtime_EigenSingleThreadedConvF32";
+extern const char* const kAcquireInfeedBufferForDequeueSymbolName =
+    "__xla_cpu_runtime_AcquireInfeedBufferForDequeue";
+extern const char* const kReleaseInfeedBufferAfterDequeueSymbolName =
+    "__xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue";
+extern const char* const kAcquireOutfeedBufferForPopulationSymbolName =
+    "__xla_cpu_runtime_AcquireOutfeedBufferForPopulation";
+extern const char* const kReleaseOutfeedBufferAfterPopulationSymbolName =
+    "__xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation";
+extern const char* const kXlaCpuRuntimeSymbolNamePrefix = "__xla_cpu_runtime_";
 }  // namespace runtime
 }  // namespace cpu
 }  // namespace xla
 
-void* __xla_cpu_runtime_AcquireInfeedBufferForDequeue(
-    xla::int32 buffer_length) {
-  xla::cpu::runtime::InfeedManager* infeed =
-      xla::cpu::runtime::GetInfeedManager();
+namespace {
+
+tensorflow::string ShapeString(const void* shape_ptr, xla::int32 shape_length) {
+  xla::StatusOr<xla::Shape> shape =
+      xla::llvm_ir::DecodeSelfDescribingShapeConstant(shape_ptr, shape_length);
+  if (shape.ok()) {
+    return xla::ShapeUtil::HumanStringWithLayout(shape.ValueOrDie());
+  }
+  return "<invalid shape>";
+}
+
+}  // namespace
+
+void* __xla_cpu_runtime_AcquireInfeedBufferForDequeue(xla::int32 buffer_length,
+                                                      const void* shape,
+                                                      xla::int32 shape_length) {
+  if (VLOG_IS_ON(2)) {
+    LOG(INFO) << "AcquireInfeedBufferForDequeue: "
+              << ShapeString(shape, shape_length);
+  }
+  xla::cpu::runtime::XfeedManager* xfeed = xla::cpu::runtime::GetXfeedManager();
+  // Wait until there's a buffer to dequeue.
+  xla::cpu::runtime::XfeedBuffer* buffer =
+      xfeed->infeed()->BlockingDequeueBuffer();
+  CHECK_EQ(buffer->length(), buffer_length)
+      << "XLA program infeed request buffer size " << buffer_length
+      << " did not match the runtime's infed buffer length " << buffer->length()
+      << "; program reports desired shape: "
+      << ShapeString(shape, shape_length);
+  return buffer->data();
+}
+
+void __xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue(
+    xla::int32 buffer_length, void* buffer_ptr, const void* shape_ptr,
+    xla::int32 shape_length) {
+  if (VLOG_IS_ON(2)) {
+    LOG(INFO) << "ReleaseInfeedBufferAfterDeque: "
+              << ShapeString(shape_ptr, shape_length);
+  }
+  xla::cpu::runtime::XfeedManager* xfeed = xla::cpu::runtime::GetXfeedManager();
+  xla::StatusOr<xla::Shape> shape =
+      xla::llvm_ir::DecodeSelfDescribingShapeConstant(shape_ptr, shape_length);
+  xfeed->infeed()->ReleaseCurrentBuffer(buffer_length, buffer_ptr,
+                                        std::move(shape));
+}
+
+void* __xla_cpu_runtime_AcquireOutfeedBufferForPopulation(
+    xla::int32 buffer_length, const void* shape_ptr, xla::int32 shape_length) {
+  if (VLOG_IS_ON(2)) {
+    LOG(INFO) << "AcquireOutfeedBufferForPopulation: "
+              << ShapeString(shape_ptr, shape_length);
+  }
+  xla::cpu::runtime::XfeedManager* xfeed = xla::cpu::runtime::GetXfeedManager();
   // Wait until there's a buffer to dequeue.
-  xla::cpu::runtime::InfeedBuffer* buffer = infeed->BlockingDequeueBuffer();
-  CHECK_EQ(buffer->length(), buffer_length);
+  xla::cpu::runtime::XfeedBuffer* buffer =
+      xfeed->outfeed()->BlockingDequeueBuffer();
+  CHECK_EQ(buffer->length(), buffer_length)
+      << "XLA program outfeed request buffer size " << buffer_length
+      << " did not match the runtime's outfeed buffer length "
+      << buffer->length() << "; program reports outfed shape: "
+      << ShapeString(shape_ptr, shape_length);
   return buffer->data();
 }
 
-void __xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue(xla::int32 buffer_length,
-                                                       void* buffer_ptr) {
-  xla::cpu::runtime::InfeedManager* infeed =
-      xla::cpu::runtime::GetInfeedManager();
-  infeed->ReleaseCurrentBuffer(buffer_length, buffer_ptr);
+void __xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation(
+    xla::int32 buffer_length, void* buffer_ptr, const void* shape_ptr,
+    xla::int32 shape_length) {
+  if (VLOG_IS_ON(2)) {
+    LOG(INFO) << "ReleaseOutfeedBufferAfterPopulation: "
+              << ShapeString(shape_ptr, shape_length);
+  }
+  xla::cpu::runtime::XfeedManager* xfeed = xla::cpu::runtime::GetXfeedManager();
+  xla::StatusOr<xla::Shape> shape =
+      xla::llvm_ir::DecodeSelfDescribingShapeConstant(shape_ptr, shape_length);
+  xfeed->outfeed()->ReleaseCurrentBuffer(buffer_length, buffer_ptr, shape);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
index 8eae2102305a3898c244a356d383184139e9208e..29feb7267fe97f6876827b6cbfa6217a0cecf238 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
@@ -26,7 +26,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_H_
 
-#include "tensorflow/compiler/xla/service/cpu/infeed_manager.h"
+#include "tensorflow/compiler/xla/service/cpu/xfeed_manager.h"
 #include "tensorflow/compiler/xla/types.h"
 
 namespace xla {
@@ -41,22 +41,23 @@ namespace runtime {
 //    the actual symbol.
 // 2. When using ahead-of-time compilation, the linker can resolve the name
 //    because it is a symbol in the cpu_runtime library.
-constexpr char kEigenMatmulF32SymbolName[] = "__xla_cpu_runtime_EigenMatMulF32";
-constexpr char kEigenMatmulF64SymbolName[] = "__xla_cpu_runtime_EigenMatMulF64";
-constexpr char kEigenConvF32SymbolName[] = "__xla_cpu_runtime_EigenConvF32";
-constexpr char kEigenSingleThreadedMatmulF32SymbolName[] =
-    "__xla_cpu_runtime_EigenSingleThreadedMatMulF32";
-constexpr char kEigenSingleThreadedMatmulF64SymbolName[] =
-    "__xla_cpu_runtime_EigenSingleThreadedMatMulF64";
-constexpr char kEigenSingleThreadedConvF32SymbolName[] =
-    "__xla_cpu_runtime_EigenSingleThreadedConvF32";
-constexpr char kAcquireInfeedBufferForDequeueSymbolName[] =
-    "__xla_cpu_runtime_AcquireInfeedBufferForDequeue";
-constexpr char kReleaseInfeedBufferAfterDequeueSymbolName[] =
-    "__xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue";
+extern const char* const kEigenMatMulF32SymbolName;
+extern const char* const kEigenMatMulF64SymbolName;
+extern const char* const kEigenConvF32SymbolName;
+extern const char* const kEigenSingleThreadedMatMulF32SymbolName;
+extern const char* const kEigenSingleThreadedMatMulF64SymbolName;
+extern const char* const kEigenSingleThreadedConvF32SymbolName;
+extern const char* const kAcquireInfeedBufferForDequeueSymbolName;
+extern const char* const kReleaseInfeedBufferAfterDequeueSymbolName;
+extern const char* const kAcquireOutfeedBufferForPopulationSymbolName;
+extern const char* const kReleaseOutfeedBufferAfterPopulationSymbolName;
+
+// All symbol names for XLA CPU runtime functions need to start with this
+// prefix.
+extern const char* const kXlaCpuRuntimeSymbolNamePrefix;
 
 // Returns the infeed manager used by the CPU runtime.
-InfeedManager* GetInfeedManager();
+XfeedManager* GetXfeedManager();
 
 }  // namespace runtime
 }  // namespace cpu
@@ -64,13 +65,19 @@ InfeedManager* GetInfeedManager();
 
 extern "C" {
 
+// Note: in the runtime entry points below, the shape pointer and shape_length
+// reflect values that can be deserialized via
+// llvm_ir::DecodeSelfDescribingShapeConstant. This is the way we pass reified
+// type information from the generated program to the runtime, which helps check
+// the type safety and contract for the emitted-code/runtime communication.
+
 // Blocks until the next infeed buffer is ready to be dequeued, then
 // returns it. Fails catastrophically if the next enqueued buffer is
 // not of the correct length in bytes. Checking the shape rather than
 // the length would be more exact, but the length check is chosen as a
 // tradeoff between error checking and speed/simplicity.
 extern void* __xla_cpu_runtime_AcquireInfeedBufferForDequeue(
-    xla::int32 buffer_length);
+    xla::int32 buffer_length, const void* shape, xla::int32 shape_length);
 
 // Relinquishes the next infeed buffer that was returned by
 // __xla_cpu_runtime_AcquireInfeedBufferForDequeue. Once this call
@@ -85,7 +92,27 @@ extern void* __xla_cpu_runtime_AcquireInfeedBufferForDequeue(
 // implemented we will add support for multiple outstanding buffers
 // that can be returned out of order.
 extern void __xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue(
-    xla::int32 buffer_length, void* buffer_ptr);
-}
+    xla::int32 buffer_length, void* buffer_ptr, const void* shape_ptr,
+    xla::int32 shape_length);
+
+// Blocks until the next outfeed buffer is available to be populated, then
+// returns it.
+extern void* __xla_cpu_runtime_AcquireOutfeedBufferForPopulation(
+    xla::int32 buffer_length, const void* shape_ptr, xla::int32 shape_length);
+
+// Relinquishes the outfeed buffer after it has been populated.
+// buffer_ptr must have been previously returned by
+// __xla_cpu_runtime_AcquireOutfeedBufferForPopulation.
+// Once this call completes, buffer_ptr may no longer be accessed.
+// buffer_length must match the length passed to the call to
+// __xla_cpu_runtime_AcquireInfeedBufferForDequeue that returned
+// buffer_ptr. This function must be called before the next buffer is
+// acquired, i.e., there may only be one outstanding outfeed buffer in
+// use by the runtime.
+extern void __xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation(
+    xla::int32 buffer_length, void* buffer_ptr, const void* shape_ptr,
+    xla::int32 shape_length);
+
+}  // extern "C"
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc
index 646254887c83fcaff8fd5def9fafc8ff17d03d32..f6664bb854e2dda4c199d6f716e6dc7173447cea 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc
@@ -19,17 +19,30 @@ limitations under the License.
 
 #include "third_party/eigen3/Eigen/Core"
 
+#ifdef __AVX__
+xla::cpu::runtime::V8F32 __xla_cpu_runtime_ExpV8F32(
+    xla::cpu::runtime::V8F32 x) {
+  return Eigen::internal::pexp(x);
+}
+
+xla::cpu::runtime::V8F32 __xla_cpu_runtime_LogV8F32(
+    xla::cpu::runtime::V8F32 x) {
+  return Eigen::internal::plog(x);
+}
+
+xla::cpu::runtime::V8F32 __xla_cpu_runtime_TanhV8F32(
+    xla::cpu::runtime::V8F32 x) {
+  return Eigen::internal::ptanh(x);
+}
+#endif  // __AVX__
+
 namespace xla {
 namespace cpu {
 namespace runtime {
 
-#ifdef __AVX__
-V8F32 ExpV8F32(V8F32 x) { return Eigen::internal::pexp(x); }
-
-V8F32 LogV8F32(V8F32 x) { return Eigen::internal::plog(x); }
-
-V8F32 TanhV8F32(V8F32 x) { return Eigen::internal::ptanh(x); }
-#endif  // __AVX__
+const char *const kExpV8F32SymbolName = "__xla_cpu_runtime_ExpV8F32";
+const char *const kLogV8F32SymbolName = "__xla_cpu_runtime_LogV8F32";
+const char *const kTanhV8F32SymbolName = "__xla_cpu_runtime_TanhV8F32";
 
 }  // namespace runtime
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
index 89721aaf835eec5e4a8be0fbabb310b084065825..c15710fb00197d41c1047d3e8ade0165f18cf0fb 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
@@ -28,23 +28,28 @@ namespace xla {
 namespace cpu {
 namespace runtime {
 
-constexpr char kExpV8F32[] = "__xla_cpu_runtime_ExpV8F32";
-constexpr char kLogV8F32[] = "__xla_cpu_runtime_LogV8F32";
-constexpr char kTanhV8F32[] = "__xla_cpu_runtime_TanhV8F32";
+extern const char *const kExpV8F32SymbolName;
+extern const char *const kLogV8F32SymbolName;
+extern const char *const kTanhV8F32SymbolName;
 
 typedef float V8F32 __attribute__((__vector_size__(32)));
+}  // namespace runtime
+}  // namespace cpu
+}  // namespace xla
+
+extern "C" {
 
 // The following functions are vectorized versions of a selection of libm
 // library functions.
 // References to these functions are created by the LLVM vectorizer.
-V8F32 ExpV8F32(V8F32 x) TF_ATTRIBUTE_WEAK;
+xla::cpu::runtime::V8F32 __xla_cpu_runtime_ExpV8F32(xla::cpu::runtime::V8F32 x)
+    TF_ATTRIBUTE_WEAK;
 
-V8F32 LogV8F32(V8F32 x) TF_ATTRIBUTE_WEAK;
+xla::cpu::runtime::V8F32 __xla_cpu_runtime_LogV8F32(xla::cpu::runtime::V8F32 x)
+    TF_ATTRIBUTE_WEAK;
 
-V8F32 TanhV8F32(V8F32 x) TF_ATTRIBUTE_WEAK;
-
-}  // namespace runtime
-}  // namespace cpu
-}  // namespace xla
+xla::cpu::runtime::V8F32 __xla_cpu_runtime_TanhV8F32(xla::cpu::runtime::V8F32 x)
+    TF_ATTRIBUTE_WEAK;
+}
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_AVX_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc
index 69d04427c60b0d8db8a8859b4abff9bfa7e93260..58ec9fc6e8ee7329c5dc1624cca2f0f0f4b68f59 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc
@@ -19,29 +19,36 @@ limitations under the License.
 
 #include "third_party/eigen3/Eigen/Core"
 
-namespace xla {
-namespace cpu {
-namespace runtime {
-
 #ifdef __SSE4_1__
 
-V4F32 ExpV4F32(V4F32 x) {
+xla::cpu::runtime::V4F32 __xla_cpu_runtime_ExpV4F32(
+    xla::cpu::runtime::V4F32 x) {
   Eigen::internal::Packet4f p = x;
   return Eigen::internal::pexp(p);
 }
 
-V4F32 LogV4F32(V4F32 x) {
+xla::cpu::runtime::V4F32 __xla_cpu_runtime_LogV4F32(
+    xla::cpu::runtime::V4F32 x) {
   Eigen::internal::Packet4f p = x;
   return Eigen::internal::plog(p);
 }
 
-V4F32 TanhV4F32(V4F32 x) {
+xla::cpu::runtime::V4F32 __xla_cpu_runtime_TanhV4F32(
+    xla::cpu::runtime::V4F32 x) {
   Eigen::internal::Packet4f p = x;
   return Eigen::internal::ptanh(p);
 }
 
 #endif  // __SSE4_1__
 
+namespace xla {
+namespace cpu {
+namespace runtime {
+
+const char *const kExpV4F32SymbolName = "__xla_cpu_runtime_ExpV4F32";
+const char *const kLogV4F32SymbolName = "__xla_cpu_runtime_LogV4F32";
+const char *const kTanhV4F32SymbolName = "__xla_cpu_runtime_TanhV4F32";
+
 }  // namespace runtime
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
index ded206f90a076ba81643799c07e3f3a7d481eaf2..7ab9a52d00848891b73415fdb5cb49c515243c05 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
@@ -28,23 +28,29 @@ namespace xla {
 namespace cpu {
 namespace runtime {
 
-constexpr char kExpV4F32[] = "__xla_cpu_runtime_ExpV4F32";
-constexpr char kLogV4F32[] = "__xla_cpu_runtime_LogV4F32";
-constexpr char kTanhV4F32[] = "__xla_cpu_runtime_TanhV4F32";
+extern const char *const kExpV4F32SymbolName;
+extern const char *const kLogV4F32SymbolName;
+extern const char *const kTanhV4F32SymbolName;
 
 typedef float V4F32 __attribute__((__vector_size__(16)));
 
+}  // namespace runtime
+}  // namespace cpu
+}  // namespace xla
+
+extern "C" {
+
 // The following functions are vectorized versions of a selection of libm
 // library functions.
 // References to these functions are created by the LLVM vectorizer.
-V4F32 ExpV4F32(V4F32 x) TF_ATTRIBUTE_WEAK;
+xla::cpu::runtime::V4F32 __xla_cpu_runtime_ExpV4F32(xla::cpu::runtime::V4F32 x)
+    TF_ATTRIBUTE_WEAK;
 
-V4F32 LogV4F32(V4F32 x) TF_ATTRIBUTE_WEAK;
+xla::cpu::runtime::V4F32 __xla_cpu_runtime_LogV4F32(xla::cpu::runtime::V4F32 x)
+    TF_ATTRIBUTE_WEAK;
 
-V4F32 TanhV4F32(V4F32 x) TF_ATTRIBUTE_WEAK;
-
-}  // namespace runtime
-}  // namespace cpu
-}  // namespace xla
+xla::cpu::runtime::V4F32 __xla_cpu_runtime_TanhV4F32(xla::cpu::runtime::V4F32 x)
+    TF_ATTRIBUTE_WEAK;
+}
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_SSE4_1_H_
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 420f9cebc5b1ded365c20079589ebc79a03b3164..f45c28ef74c7ef716e7f0330a1c10abc528a90ee 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -22,9 +22,9 @@ limitations under the License.
 #include "external/llvm/include/llvm/IR/Instructions.h"
 #include "external/llvm/include/llvm/IR/Module.h"
 #include "external/llvm/include/llvm/IR/Value.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_runtime_flags.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -44,7 +44,8 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot, bool transpose_lhs,
                            const llvm_ir::IrArray& lhs_array,
                            const llvm_ir::IrArray& rhs_array,
                            llvm::Value* executable_run_options_value,
-                           llvm::IRBuilder<>* ir_builder)
+                           llvm::IRBuilder<>* ir_builder,
+                           const HloModuleConfig& hlo_module_config)
     : dot_(dot),
       transpose_lhs_(transpose_lhs),
       transpose_rhs_(transpose_rhs),
@@ -52,18 +53,20 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot, bool transpose_lhs,
       lhs_array_(lhs_array),
       rhs_array_(rhs_array),
       executable_run_options_value_(executable_run_options_value),
-      ir_builder_(ir_builder) {}
+      ir_builder_(ir_builder),
+      hlo_module_config_(hlo_module_config) {}
 
 /* static */ tensorflow::Status DotOpEmitter::EmitDotOperation(
     const HloInstruction& dot, bool transpose_lhs, bool transpose_rhs,
     const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array,
     const llvm_ir::IrArray& rhs_array,
-    llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder) {
+    llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder,
+    const HloModuleConfig& hlo_module_config) {
   PrimitiveType type = target_array.GetShape().element_type();
   TF_RET_CHECK(F32 == type || F64 == type);
   DotOpEmitter dot_emitter(dot, transpose_lhs, transpose_rhs, target_array,
                            lhs_array, rhs_array, executable_run_options_value,
-                           ir_builder);
+                           ir_builder, hlo_module_config);
   return dot_emitter.Emit();
 }
 
@@ -233,22 +236,22 @@ tensorflow::Status DotOpEmitter::EmitCallToRuntime() {
   // The two transpose_... parameters are actually booleans, but we use int32
   // to avoid target-dependent calling convention details.
 
-  legacy_flags::CpuRuntimeFlags* flags = legacy_flags::GetCpuRuntimeFlags();
-  bool multi_threaded = flags->xla_cpu_multi_thread_eigen;
+  bool multi_threaded_eigen =
+      hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
   PrimitiveType type = target_array_.GetShape().element_type();
   llvm::Type* float_type;
   const char* fn_name;
   switch (type) {
     case F32:
-      fn_name = multi_threaded
-                    ? runtime::kEigenMatmulF32SymbolName
-                    : runtime::kEigenSingleThreadedMatmulF32SymbolName;
+      fn_name = multi_threaded_eigen
+                    ? runtime::kEigenMatMulF32SymbolName
+                    : runtime::kEigenSingleThreadedMatMulF32SymbolName;
       float_type = ir_builder_->getFloatTy();
       break;
     case F64:
-      fn_name = multi_threaded
-                    ? runtime::kEigenMatmulF64SymbolName
-                    : runtime::kEigenSingleThreadedMatmulF64SymbolName;
+      fn_name = multi_threaded_eigen
+                    ? runtime::kEigenMatMulF64SymbolName
+                    : runtime::kEigenSingleThreadedMatMulF64SymbolName;
       float_type = ir_builder_->getDoubleTy();
       break;
     default:
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
index 44dfe5f2a91222d99907e31062fb1d8f74aed3ff..b6147163802dde12a8bf7dde91ac8dad45ba1990 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "external/llvm/include/llvm/IR/IRBuilder.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -39,7 +40,8 @@ class DotOpEmitter {
       const HloInstruction& dot, bool transpose_lhs, bool transpose_rhs,
       const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array,
       const llvm_ir::IrArray& rhs_array,
-      llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder);
+      llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder,
+      const HloModuleConfig& hlo_module_config);
 
  private:
   DotOpEmitter(const HloInstruction& dot, bool transpose_lhs,
@@ -47,7 +49,8 @@ class DotOpEmitter {
                const llvm_ir::IrArray& lhs_array,
                const llvm_ir::IrArray& rhs_array,
                llvm::Value* executable_run_options_value,
-               llvm::IRBuilder<>* ir_builder);
+               llvm::IRBuilder<>* ir_builder,
+               const HloModuleConfig& hlo_module_config);
 
   // Emits the IR to perform the dot operation.
   tensorflow::Status Emit();
@@ -82,6 +85,7 @@ class DotOpEmitter {
   const llvm_ir::IrArray& rhs_array_;
   llvm::Value* executable_run_options_value_;
   llvm::IRBuilder<>* ir_builder_;
+  const HloModuleConfig& hlo_module_config_;
 };
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/infeed_manager_test.cc b/tensorflow/compiler/xla/service/cpu/infeed_manager_test.cc
deleted file mode 100644
index c65d8216606a1caa561adea5a83c8f1aa2c82906..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/cpu/infeed_manager_test.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/cpu/infeed_manager.h"
-
-#include <memory>
-
-#include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace xla {
-namespace {
-
-class InfeedManagerTest : public ::testing::Test {};
-
-class TestInfeedBuffer : public cpu::runtime::InfeedBuffer {
- public:
-  explicit TestInfeedBuffer(int32 length)
-      : done_called_(false), length_(length) {}
-  ~TestInfeedBuffer() override { EXPECT_TRUE(done_called_); }
-
-  int32 length() override { return length_; }
-  void* data() override { return nullptr; }
-  void Done() override {
-    CHECK(!done_called_);
-    done_called_ = true;
-  }
-
- private:
-  bool done_called_;
-  int32 length_;
-};
-
-void ProcessNextBuffer(int32 length) {
-  void* buffer = __xla_cpu_runtime_AcquireInfeedBufferForDequeue(length);
-  __xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue(length, buffer);
-}
-
-TEST_F(InfeedManagerTest, SingleThreadedSequential) {
-  TestInfeedBuffer* a = new TestInfeedBuffer(64);
-  TestInfeedBuffer* b = new TestInfeedBuffer(32);
-
-  cpu::runtime::InfeedManager* infeed = cpu::runtime::GetInfeedManager();
-
-  infeed->EnqueueBuffer(a);
-  infeed->EnqueueBuffer(b);
-  ProcessNextBuffer(a->length());
-  ProcessNextBuffer(b->length());
-}
-
-TEST_F(InfeedManagerTest, SingleThreadedInterleaved) {
-  TestInfeedBuffer* a = new TestInfeedBuffer(64);
-  TestInfeedBuffer* b = new TestInfeedBuffer(32);
-
-  cpu::runtime::InfeedManager* infeed = cpu::runtime::GetInfeedManager();
-
-  infeed->EnqueueBuffer(a);
-  ProcessNextBuffer(a->length());
-  infeed->EnqueueBuffer(b);
-  ProcessNextBuffer(b->length());
-}
-
-TEST_F(InfeedManagerTest, MultiThreaded) {
-  tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), "test", 2);
-
-  cpu::runtime::InfeedManager* infeed = cpu::runtime::GetInfeedManager();
-
-  const int32 length = 64;
-
-  pool.Schedule([infeed]() {
-    // Spin for 100 milliseconds
-    int64 start_micros = tensorflow::Env::Default()->NowMicros();
-    while (true) {
-      int64 end_micros = tensorflow::Env::Default()->NowMicros();
-      if ((end_micros - start_micros) >= 100000) {  // 100 ms
-        break;
-      }
-    }
-    TestInfeedBuffer* a = new TestInfeedBuffer(length);
-    infeed->EnqueueBuffer(a);
-  });
-
-  ProcessNextBuffer(length);
-}
-
-}  // namespace
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
index 2d855d0eb1e9448707b3916d20803cebf2ebabe4..859329e2c1ddca9dbea14c16b67f63d4803b6acd 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_runtime_flags.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/window_util.h"
@@ -26,11 +25,6 @@ namespace cpu {
 
 bool PotentiallyImplementedAsEigenConvolution(
     const HloInstruction& convolution) {
-  legacy_flags::CpuRuntimeFlags* flags = legacy_flags::GetCpuRuntimeFlags();
-  if (!flags->xla_cpu_use_eigen) {
-    return false;
-  }
-
   // The following conditions are necessary (but not sufficient) for
   // implementing `convolution` with Eigen convolution:
   // - the input and kernel have a non-zero number of elements.
@@ -82,11 +76,6 @@ bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape,
 }  // namespace
 
 bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo) {
-  legacy_flags::CpuRuntimeFlags* flags = legacy_flags::GetCpuRuntimeFlags();
-  if (!flags->xla_cpu_use_eigen) {
-    return false;
-  }
-
   // For certain types of Dot, we can call Eigen
   if (hlo.opcode() == HloOpcode::kDot) {
     const Shape& lhs_shape = hlo.operand(0)->shape();
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 7ad497ff1a27ff083517de6a82a8c4b903800cce..3ee417191d3def7e3f0e44155c6c308378c30b96 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -32,8 +32,9 @@ limitations under the License.
 #include "external/llvm/include/llvm/IR/Instructions.h"
 #include "external/llvm/include/llvm/IR/Intrinsics.h"
 #include "external/llvm/include/llvm/IR/LLVMContext.h"
+#include "external/llvm/include/llvm/Target/TargetRegisterInfo.h"
+#include "external/llvm/include/llvm/Target/TargetSubtargetInfo.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_runtime_flags.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
@@ -52,9 +53,20 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
+#include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace {
+const char* kXlaDisableVectorizedReduce = "xla_disable_vectorized_reduce";
+bool VectorizedReduceDisabled(const xla::HloModuleConfig& config) {
+  return config.debug_options().xla_backend_extra_options().count(
+      kXlaDisableVectorizedReduce);
+}
+}  // namespace
 
 namespace xla {
 
@@ -65,14 +77,16 @@ namespace cpu {
 IrEmitter::IrEmitter(
     const HloModule& hlo_module, const BufferAssignment& assignment,
     llvm::Module* llvm_module,
-    const std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx)
+    const std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx,
+    llvm::TargetMachine* target_machine)
     : assignment_(assignment),
       module_(llvm_module),
       arch_type_(llvm::Triple(llvm_module->getTargetTriple()).getArch()),
       ir_builder_(llvm_module->getContext()),
       hlo_to_profile_idx_(hlo_to_profile_idx),
       alias_analysis_(hlo_module, assignment, &llvm_module->getContext()),
-      hlo_module_config_(hlo_module.config()) {
+      hlo_module_config_(hlo_module.config()),
+      target_machine_features_(target_machine) {
   ir_builder_.setFastMathFlags(llvm_ir::GetFastMathFlags(
       /*fast_math_enabled=*/hlo_module_config_.debug_options()
           .xla_enable_fast_math()));
@@ -83,7 +97,14 @@ StatusOr<llvm::Function*> IrEmitter::EmitComputation(
     bool is_entry_computation,
     std::vector<const HloInstruction*>* instruction_order) {
   string function_name = name_uniquer_.GetUniqueName(function_name_prefix);
-  VLOG(2) << "Emitting IR for CPU function [" << function_name_prefix << "]";
+  VLOG(2) << "Emitting IR for CPU function [" << function_name_prefix
+          << "]; ordered? " << (instruction_order != nullptr);
+  num_dynamic_loop_bounds_ = 0;
+  if (!computation->root_instruction()->outer_dimension_partitions().empty()) {
+    num_dynamic_loop_bounds_ =
+        computation->root_instruction()->outer_dimension_partitions().size();
+  }
+
   InitializeIrFunction(function_name, is_entry_computation);
   // The rdtscp instruction is x86 specific.  We will fallback to LLVM's generic
   // readcyclecounter if it is unavailable.
@@ -91,11 +112,10 @@ StatusOr<llvm::Function*> IrEmitter::EmitComputation(
                     arch_type_ == llvm::Triple::ArchType::x86_64;
   profiling_state_ = ProfilingState(is_entry_computation, use_rdtscp,
                                     GetProfileCountersArgument());
-  if (instruction_order != nullptr) {
-    TF_RETURN_IF_ERROR(computation->root_instruction()->AcceptOrdered(
-        this, *instruction_order));
+  if (instruction_order == nullptr) {
+    TF_RETURN_IF_ERROR(computation->Accept(this));
   } else {
-    TF_RETURN_IF_ERROR(computation->root_instruction()->Accept(this));
+    TF_RETURN_IF_ERROR(computation->AcceptOrdered(this, *instruction_order));
   }
   InsertOrDie(&emitted_functions_, computation, compute_function_);
 
@@ -112,7 +132,7 @@ void IrEmitter::InitializeIrFunction(const string& function_name,
                                      bool is_entry_computation) {
   // The function signature is:
   //   void function(i8* retval, i8* run_options, i8** params, i8** temps,
-  //                 i64* prof_counters)
+  //                 i64* dynamic_loop_bounds, i64* prof_counters)
   //
   // retval: points to the returned value.
   // params: address of an array with pointers to parameters.
@@ -152,6 +172,10 @@ void IrEmitter::InitializeIrFunction(const string& function_name,
   //                     | temp  0 |  | temp  1 |         | temp  N-1 |
   //                     \---------/  \---------/         \-----------/
   //
+  //                        /--------------------------------------------\
+  // dynamic loop bounds -> | outer_dim0_start | outer_dim0_limit | .....|
+  //  (elided for aot)      \--------------------------------------------/
+  //
   //                     /---------------------------------------------\
   //   prof counters ->  | counter 0 | counter 1 | ..... | counter N-1 |
   //  (elided for aot)   \---------------------------------------------/
@@ -164,6 +188,9 @@ void IrEmitter::InitializeIrFunction(const string& function_name,
   llvm::Type* i64_ptr_type = llvm::Type::getInt64PtrTy(module_->getContext());
   std::vector<llvm::Type*> compute_function_params(
       {i8_ptr_type, i8_ptr_type, i8_ptr_ptr_type, i8_ptr_ptr_type});
+  if (num_dynamic_loop_bounds_ > 0) {
+    compute_function_params.push_back(i64_ptr_type);
+  }
   if (hlo_to_profile_idx_) {
     compute_function_params.push_back(i64_ptr_type);
   }
@@ -190,6 +217,9 @@ void IrEmitter::InitializeIrFunction(const string& function_name,
   (++arg_iter)->setName("run_options");
   (++arg_iter)->setName("params");
   (++arg_iter)->setName("temps");
+  if (num_dynamic_loop_bounds_ > 0) {
+    (++arg_iter)->setName("dynamic_loop_bounds");
+  }
   if (hlo_to_profile_idx_) {
     (++arg_iter)->setName("prof_counters");
   }
@@ -242,12 +272,12 @@ Status IrEmitter::HandleConstant(HloInstruction* constant,
   return Status::OK();
 }
 
-Status IrEmitter::HandleCopy(HloInstruction* copy, HloInstruction* operand) {
+Status IrEmitter::HandleCopy(HloInstruction* copy) {
   if (ShapeUtil::IsTuple(copy->shape())) {
     // kCopy shallow copies a tuple so just memcpy the top-level buffer.
     TF_ASSIGN_OR_RETURN(llvm::Value * copy_value, EmitTargetAddressForOp(copy));
     emitted_value_[copy] = copy_value;
-    return EmitMemcpy(*operand, *copy);
+    return EmitMemcpy(*(copy->operand(0)), *copy);
   } else {
     // Use the elemental emitter for non-tuple shapes.
     return DefaultAction(copy);
@@ -358,63 +388,158 @@ Status IrEmitter::HandleSelect(HloInstruction* select, HloInstruction* pred,
 Status IrEmitter::HandleInfeed(HloInstruction* infeed) {
   VLOG(2) << "HandleInfeed: " << infeed->ToString();
 
+  const Shape& shape = infeed->shape();
+
+  // The infeed operation produces data (dequeued from the infeed queue) at this
+  // address, which has been provided by buffer assignment.
+  TF_ASSIGN_OR_RETURN(llvm::Value * target_address,
+                      EmitTargetAddressForOp(infeed));
+
+  if (ShapeUtil::IsTuple(shape)) {
+    TF_RET_CHECK(!ShapeUtil::IsNestedTuple(shape));
+
+    // For a tuple, we first copy each of the internal elements to
+    // their corresponding target locations. We then construct the
+    // tuple outer buffer containing pointers to the internal
+    // elements.
+    std::vector<llvm::Value*> tuple_element_addresses;
+    for (int64 i = 0; i < shape.tuple_shapes_size(); ++i) {
+      TF_ASSIGN_OR_RETURN(BufferAllocation::Slice buffer,
+                          assignment_.GetUniqueSlice(infeed, {i}));
+
+      const Shape& tuple_element_shape =
+          ShapeUtil::GetTupleElementShape(shape, i);
+
+      // Only the outer tuple buffer's target address is obtained from
+      // EmitTargetAddressForOp to handle the case when Infeed is the
+      // root instruction. Target addresses for internal elements can
+      // be obtained from EmitTempBufferPointer.
+      llvm::Value* tuple_element_address =
+          EmitTempBufferPointer(buffer, tuple_element_shape);
+
+      TF_RETURN_IF_ERROR(EmitXfeedTransfer(
+          XfeedKind::kInfeed, tuple_element_shape, tuple_element_address));
+
+      tuple_element_addresses.push_back(tuple_element_address);
+    }
+
+    llvm_ir::EmitTuple(llvm_ir::IrArray(target_address, shape),
+                       tuple_element_addresses, &ir_builder_);
+  } else {
+    TF_RETURN_IF_ERROR(
+        EmitXfeedTransfer(XfeedKind::kInfeed, shape, target_address));
+  }
+
+  emitted_value_[infeed] = target_address;
+
+  return Status::OK();
+}
+
+Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape,
+                                    llvm::Value* program_buffer_address) {
+  int64 length = ByteSizeOf(shape);
+  if (length <= 0 || length > std::numeric_limits<int32>::max()) {
+    return InvalidArgument(
+        "xfeed (infeed or outfeed) buffer length %lld is outside the valid "
+        "size range",
+        length);
+  }
+  int32 length_32 = static_cast<int32>(length);
+
+  int32 shape_length;
+  TF_ASSIGN_OR_RETURN(llvm::Value * shape_ptr,
+                      llvm_ir::EncodeSelfDescribingShapeConstant(
+                          shape, &shape_length, &ir_builder_));
+
   // The signature of the acquire infeed buffer function is:
   //
   //   (void*)(int32 length);
-  llvm::Type* i8_ptr_type = llvm::Type::getInt8PtrTy(module_->getContext());
   llvm::Type* int32_type = ir_builder_.getInt32Ty();
-  llvm::FunctionType* acquire_type =
-      llvm::FunctionType::get(i8_ptr_type, {int32_type},
-                              /*isVarArg=*/false);
+  llvm::Type* i8_ptr_type = llvm::Type::getInt8PtrTy(module_->getContext());
+  llvm::FunctionType* acquire_type = llvm::FunctionType::get(
+      i8_ptr_type, {int32_type, i8_ptr_type, int32_type},
+      /*isVarArg=*/false);
 
-  llvm::Function* acquire_func =
-      llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-          runtime::kAcquireInfeedBufferForDequeueSymbolName, acquire_type));
+  llvm::Function* acquire_func;
+  if (kind == XfeedKind::kInfeed) {
+    acquire_func = llvm::cast<llvm::Function>(module_->getOrInsertFunction(
+        runtime::kAcquireInfeedBufferForDequeueSymbolName, acquire_type));
+  } else {
+    acquire_func = llvm::cast<llvm::Function>(module_->getOrInsertFunction(
+        runtime::kAcquireOutfeedBufferForPopulationSymbolName, acquire_type));
+  }
   acquire_func->setCallingConv(llvm::CallingConv::C);
 
   // The signature of the release infeed buffer function is:
   //
   //   (void)(int32 length, void* buffer);
   llvm::FunctionType* release_type = llvm::FunctionType::get(
-      ir_builder_.getVoidTy(), {int32_type, i8_ptr_type},
+      ir_builder_.getVoidTy(),
+      {int32_type, i8_ptr_type, i8_ptr_type, int32_type},
       /*isVarArg=*/false);
 
-  llvm::Function* release_func =
-      llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-          runtime::kReleaseInfeedBufferAfterDequeueSymbolName, release_type));
+  llvm::Function* release_func;
+  if (kind == XfeedKind::kInfeed) {
+    release_func = llvm::cast<llvm::Function>(module_->getOrInsertFunction(
+        runtime::kReleaseInfeedBufferAfterDequeueSymbolName, release_type));
+  } else {
+    release_func = llvm::cast<llvm::Function>(module_->getOrInsertFunction(
+        runtime::kReleaseOutfeedBufferAfterPopulationSymbolName, release_type));
+  }
   release_func->setCallingConv(llvm::CallingConv::C);
 
-  const Shape& shape = infeed->shape();
-  int64 length = ByteSizeOf(shape);
-  if (length > std::numeric_limits<int32>::max()) {
-    return InvalidArgument("infeed buffer length %lld is too large", length);
+  // Implementation note: this call informs the runtime that it wants a buffer
+  // of size exactly 'length_32', and the runtime is responsible for
+  // check-failing the process if there is a mismatch, versus passing us back a
+  // buffer that we might overrun.
+  llvm::Value* acquired_pointer = ir_builder_.CreateCall(
+      acquire_func, {ir_builder_.getInt32(length_32), shape_ptr,
+                     ir_builder_.getInt32(shape_length)});
+
+  if (kind == XfeedKind::kInfeed) {
+    // Copy to the program buffer address from the acquired buffer.
+    ir_builder_.CreateMemCpy(program_buffer_address, acquired_pointer,
+                             length_32, 1);
+  } else {
+    // Outfeed -- copy from the in-program address to the acquired buffer.
+    ir_builder_.CreateMemCpy(acquired_pointer, program_buffer_address,
+                             length_32, 1);
   }
-  int32 length_32 = static_cast<int32>(length);
-
-  llvm::Value* acquired_pointer =
-      ir_builder_.CreateCall(acquire_func, {ir_builder_.getInt32(length_32)});
-
-  TF_ASSIGN_OR_RETURN(llvm::Value * target_address,
-                      EmitTargetAddressForOp(infeed));
-
-  ir_builder_.CreateMemCpy(target_address, acquired_pointer, length_32, 1);
 
   ir_builder_.CreateCall(release_func,
-                         {ir_builder_.getInt32(length_32), acquired_pointer});
-
-  emitted_value_[infeed] = target_address;
+                         {ir_builder_.getInt32(length_32), acquired_pointer,
+                          shape_ptr, ir_builder_.getInt32(shape_length)});
 
   return Status::OK();
 }
 
 Status IrEmitter::HandleOutfeed(HloInstruction* outfeed) {
-  // TODO(b/34359662): Implement outfeed on CPU.
-  return Unimplemented("Outfeed is not supported on CPU (b/34359662).");
+  HloInstruction* operand = outfeed->operands()[0];
+  const Shape& operand_shape = operand->shape();
+
+  llvm::Value* value = GetEmittedValueFor(operand);
+  if (!ShapeUtil::IsTuple(operand_shape)) {
+    return EmitXfeedTransfer(XfeedKind::kOutfeed, operand_shape, value);
+  }
+
+  TF_RET_CHECK(!ShapeUtil::IsNestedTuple(operand_shape));
+
+  for (int64 i = 0; i < operand_shape.tuple_shapes_size(); ++i) {
+    const Shape& tuple_element_shape =
+        ShapeUtil::GetTupleElementShape(operand_shape, i);
+    llvm::Value* tuple_element = llvm_ir::EmitGetTupleElement(
+        tuple_element_shape, i, MinimumAlignmentForShape(tuple_element_shape),
+        value, &ir_builder_);
+    TF_RETURN_IF_ERROR(EmitXfeedTransfer(XfeedKind::kOutfeed,
+                                         tuple_element_shape, tuple_element));
+  }
+
+  return Status::OK();
 }
 
 Status IrEmitter::HandleSort(HloInstruction* sort, HloInstruction* operand) {
   // TODO(b/26783907): Implement sort on CPU.
-  return Unimplemented("Sort is not supported on GPU (b/26783907).");
+  return Unimplemented("Sort is not supported on CPU (b/26783907).");
 }
 
 Status IrEmitter::HandleTuple(
@@ -760,7 +885,8 @@ Status IrEmitter::HandleDot(HloInstruction* dot, HloInstruction* lhs,
   // Dot operation is complicated so we delegate to a helper class.
   TF_RETURN_IF_ERROR(DotOpEmitter::EmitDotOperation(
       *dot, /*transpose_lhs=*/false, /*transpose_rhs=*/false, target_array,
-      lhs_array, rhs_array, GetExecutableRunOptionsArgument(), &ir_builder_));
+      lhs_array, rhs_array, GetExecutableRunOptionsArgument(), &ir_builder_,
+      hlo_module_config_));
 
   emitted_value_[dot] = target_address;
   return Status::OK();
@@ -845,9 +971,10 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution,
            int64_type,    int64_type,     int64_type,     int64_type,
            int64_type,    int64_type,     int64_type,     int64_type},
           /*isVarArg=*/false);
-      legacy_flags::CpuRuntimeFlags* flags = legacy_flags::GetCpuRuntimeFlags();
+      bool multi_threaded_eigen =
+          hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
       const char* fn_name =
-          (flags->xla_cpu_multi_thread_eigen
+          (multi_threaded_eigen
                ? runtime::kEigenConvF32SymbolName
                : runtime::kEigenSingleThreadedConvF32SymbolName);
       llvm::Function* conv_func = llvm::cast<llvm::Function>(
@@ -1039,6 +1166,237 @@ Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) {
       "Cross replica sum not implemented on CPU. See b/33011107.");
 }
 
+// Fills up the free variables in 'index_with_free_var' with values from
+// 'filler_index'. The size of free variables must be the same as the
+// size of 'filler_index'.
+//
+// This is often used after dimension reduction, where
+// 'index_with_free_var' has one or more dimensions reduced, which serves as
+// free variables (represented as nullptr). For example, if we have a 4
+// dimensional input and index for the dimension being reduced is
+// 2 (third dimension), we will have an index like [i, j, NULL, k]
+// after reduced dimension.
+//
+// Here we fill up that free variable by 'filler_index', which contains
+// the value in the reduced dimension.
+static llvm_ir::IrArray::Index FillReducedDimensionIndex(
+    llvm_ir::IrArray::Index index_with_free_var,
+    llvm_ir::IrArray::Index filler_index) {
+  llvm_ir::IrArray::Index::const_iterator it = filler_index.begin();
+
+  for (size_t i = 0; i < index_with_free_var.size(); ++i) {
+    if (index_with_free_var[i] == nullptr) {
+      index_with_free_var[i] = *it++;
+    }
+  }
+  CHECK(filler_index.end() == it);
+  return index_with_free_var;
+}
+
+Status IrEmitter::HandleBatchNormTraining(HloInstruction* batch_norm_training) {
+  // The output of BatchNormTraining is a tuple of three element:
+  //   - An N-dimensional array containing normalized values.
+  //   - A 1 dimensional array containing the mean value for each feature.
+  //   - A 1 dimensional array containing the variance value for each feature.
+  HloInstruction* operand = batch_norm_training->operands()[0];
+  HloInstruction* scale = batch_norm_training->operands()[1];
+  HloInstruction* offset = batch_norm_training->operands()[2];
+  float epsilon = batch_norm_training->epsilon();
+  int64 feature_index = batch_norm_training->feature_index();
+  TF_RET_CHECK(ShapeUtil::IsTuple(batch_norm_training->shape()) &&
+               ShapeUtil::TupleElementCount(batch_norm_training->shape()) == 3);
+
+  const Shape& output_shape =
+      ShapeUtil::GetTupleElementShape(batch_norm_training->shape(), 0);
+  const Shape& feature_shape =
+      ShapeUtil::GetTupleElementShape(batch_norm_training->shape(), 1);
+
+  // Reduce vector of the non-feature dimensions.
+  std::vector<int64> dimensions_to_reduce;
+
+  for (int64 i = 0; i < operand->shape().dimensions_size(); ++i) {
+    if (i != feature_index) {
+      dimensions_to_reduce.push_back(i);
+    }
+  }
+
+  // Get the second and third allocations in the output tuple, which should be
+  // used to store the result of mean and variance value calculation.
+  TF_ASSIGN_OR_RETURN(
+      const BufferAllocation::Slice slice_mean,
+      assignment_.GetUniqueSlice(batch_norm_training, /*index=*/{1}));
+  TF_ASSIGN_OR_RETURN(
+      const BufferAllocation::Slice slice_var,
+      assignment_.GetUniqueSlice(batch_norm_training, /*index=*/{2}));
+  const int feature_count = output_shape.dimensions(feature_index);
+  const int size_in_elements = ShapeUtil::ElementsIn(output_shape);
+  TF_RET_CHECK(ShapeUtil::ElementsIn(operand->shape()) == size_in_elements);
+  const int elements_per_feature = size_in_elements / feature_count;
+
+  llvm::Value* mean = EmitTempBufferPointer(slice_mean, feature_shape);
+  llvm_ir::IrArray mean_array(mean, feature_shape);
+
+  llvm::Value* var = EmitTempBufferPointer(slice_var, feature_shape);
+  llvm_ir::IrArray var_array(var, feature_shape);
+
+  // This loop calculates mean and variance for each feature.
+  //
+  // In theory this could be swapped by multi-output fusion. We will evaluate
+  // this when it's ready.
+  //
+  // For variance calculation, we use a simplified formula so we can fuse the
+  // computation into the same loop to calculate mean: Var=E(X^2) - E(X)^2.
+  TF_RETURN_IF_ERROR(
+      llvm_ir::LoopEmitter(
+          [this, operand, dimensions_to_reduce, feature_shape, var_array,
+           elements_per_feature](const llvm_ir::IrArray::Index& index) {
+            PrimitiveType element_type = operand->shape().element_type();
+            // Used to calculate E(X).
+            llvm::Value* sum_address = llvm_ir::EmitAllocaAtFunctionEntry(
+                llvm_ir::PrimitiveTypeToIrType(element_type, &ir_builder_),
+                "sum_address", &ir_builder_,
+                MinimumAlignmentForPrimitiveType(element_type));
+
+            // Used to calculate E(X^2).
+            llvm::Value* sum_square_address =
+                llvm_ir::EmitAllocaAtFunctionEntry(
+                    llvm_ir::PrimitiveTypeToIrType(element_type, &ir_builder_),
+                    "sum_square_address", &ir_builder_,
+                    MinimumAlignmentForPrimitiveType(element_type));
+
+            ir_builder_.CreateStore(
+                llvm::ConstantFP::get(ir_builder_.getFloatTy(), 0.0),
+                sum_address);
+
+            ir_builder_.CreateStore(
+                llvm::ConstantFP::get(ir_builder_.getFloatTy(), 0.0),
+                sum_square_address);
+
+            llvm_ir::ForLoopNest loops(&ir_builder_);
+
+            const llvm_ir::IrArray::Index reduced_dims_index =
+                loops.AddLoopsForShapeOnDimensions(
+                    operand->shape(), dimensions_to_reduce, "reduction_dim");
+
+            SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(),
+                                  &ir_builder_);
+
+            llvm_ir::IrArray operand_array(GetIrArrayForOp(operand));
+            llvm_ir::IrArray::Index input_index =
+                FillReducedDimensionIndex(reduced_dims_index, index);
+            llvm::Value* new_value =
+                operand_array.EmitReadArrayElement(input_index, &ir_builder_);
+
+            llvm::Value* new_value_square =
+                ir_builder_.CreateFMul(new_value, new_value);
+
+            llvm::Value* current_sum = ir_builder_.CreateLoad(sum_address);
+            llvm::Value* current_sum_square =
+                ir_builder_.CreateLoad(sum_square_address);
+            // Update sum.
+            ir_builder_.CreateStore(
+                ir_builder_.CreateFAdd(current_sum, new_value), sum_address);
+
+            // Update sum square.
+            ir_builder_.CreateStore(
+                ir_builder_.CreateFAdd(current_sum_square, new_value_square),
+                sum_square_address);
+
+            SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(),
+                                  &ir_builder_);
+
+            llvm::Value* sum = ir_builder_.CreateLoad(sum_address);
+            llvm::Value* elements_per_feature_value = llvm::ConstantFP::get(
+                ir_builder_.getFloatTy(), elements_per_feature);
+            llvm::Value* mean =
+                ir_builder_.CreateFDiv(sum, elements_per_feature_value);
+            llvm::Value* mean_square = ir_builder_.CreateFMul(mean, mean);
+            llvm::Value* sum_square =
+                ir_builder_.CreateLoad(sum_square_address);
+
+            // Var=E(X^2) - E(X)^2.
+            llvm::Value* var = ir_builder_.CreateFSub(
+                ir_builder_.CreateFDiv(sum_square, elements_per_feature_value),
+                mean_square);
+
+            var_array.EmitWriteArrayElement(index, var, &ir_builder_);
+            return mean;
+          },
+          mean_array, &ir_builder_)
+          .EmitLoop());
+
+  TF_ASSIGN_OR_RETURN(llvm::Value * target_address,
+                      EmitTargetAddressForOp(batch_norm_training));
+
+  TF_ASSIGN_OR_RETURN(
+      const BufferAllocation::Slice slice,
+      assignment_.GetUniqueSlice(batch_norm_training, /*index=*/{0}));
+
+  llvm::Value* normalized = EmitTempBufferPointer(slice, output_shape);
+
+  llvm_ir::IrArray target_array(normalized, output_shape);
+
+  AddAliasingInformationToIrArray(*batch_norm_training, &target_array);
+
+  TF_RETURN_IF_ERROR(
+      llvm_ir::LoopEmitter(
+          [this, mean_array, var_array, epsilon, operand, dimensions_to_reduce,
+           feature_index, offset, scale](const llvm_ir::IrArray::Index& index) {
+            // The following logic normalizes the input value, scales and shifts
+            // it:
+            //
+            // normalized = (input - mean) / sqrt(variance + epsilon)
+            // result = normalized * scale + offset
+
+            // Current index in the feature dimension.
+            llvm_ir::IrArray::Index feature_index_value(1,
+                                                        index[feature_index]);
+
+            llvm::Value* mean = mean_array.EmitReadArrayElement(
+                feature_index_value, &ir_builder_);
+            llvm::Value* var = var_array.EmitReadArrayElement(
+                feature_index_value, &ir_builder_);
+
+            llvm_ir::IrArray operand_array(GetIrArrayForOp(operand));
+            llvm::Value* input =
+                operand_array.EmitReadArrayElement(index, &ir_builder_);
+
+            llvm::Value* variance_with_epsilon = ir_builder_.CreateFAdd(
+                var, llvm::ConstantFP::get(ir_builder_.getFloatTy(), epsilon));
+            llvm::Function* func_llvm_sqrt = llvm::Intrinsic::getDeclaration(
+                module_, llvm::Intrinsic::sqrt, {ir_builder_.getFloatTy()});
+            llvm::Value* variance_sqrt =
+                ir_builder_.CreateCall(func_llvm_sqrt, {variance_with_epsilon});
+            llvm::Value* normalized = ir_builder_.CreateFDiv(
+                ir_builder_.CreateFSub(input, mean), variance_sqrt);
+            llvm_ir::IrArray offset_array(GetIrArrayForOp(offset));
+            llvm::Value* offset = offset_array.EmitReadArrayElement(
+                feature_index_value, &ir_builder_);
+            llvm_ir::IrArray scale_array(GetIrArrayForOp(scale));
+            llvm::Value* scale = scale_array.EmitReadArrayElement(
+                feature_index_value, &ir_builder_);
+            llvm::Value* result = ir_builder_.CreateFAdd(
+                ir_builder_.CreateFMul(normalized, scale), offset);
+
+            return result;
+          },
+          target_array, &ir_builder_)
+          .EmitLoop());
+
+  llvm_ir::EmitTuple(
+      llvm_ir::IrArray(target_address, batch_norm_training->shape()),
+      {normalized, mean, var}, &ir_builder_);
+  emitted_value_[batch_norm_training] = target_address;
+
+  return Status::OK();
+}
+
+Status IrEmitter::HandleBatchNormGrad(HloInstruction* batch_norm_grad) {
+  // TODO(b/62843645) Implement BatchNormGrad on CPU backend.
+  return Unimplemented(
+      "BatchNormGrad is not implemented on CPU. See b/62843645.");
+}
+
 Status IrEmitter::HandleParameter(HloInstruction* parameter) {
   VLOG(2) << "HandleParameter: " << parameter->ToString();
   auto param_number = parameter->parameter_number();
@@ -1073,10 +1431,450 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) {
   return Status::OK();
 }
 
+IrEmitter::ReductionGenerator IrEmitter::MatchReductionGenerator(
+    HloComputation* function, string* failure_reason) const {
+  CHECK_EQ(function->num_parameters(), 2);
+
+  auto root_instruction = function->root_instruction();
+  CHECK(ShapeUtil::IsScalar(root_instruction->shape()));
+
+  if (root_instruction->operand_count() != 2) {
+    *failure_reason = "root instruction is not a binary operation";
+    return nullptr;
+  }
+
+  const Shape& root_shape = root_instruction->shape();
+  bool root_is_floating_point = ShapeUtil::ElementIsFloating(root_shape);
+  bool root_is_integral = ShapeUtil::ElementIsIntegral(root_shape);
+  bool root_is_signed = ShapeUtil::ElementIsSigned(root_shape);
+
+  auto lhs = root_instruction->operand(0);
+  auto rhs = root_instruction->operand(1);
+
+  auto param_0 = function->parameter_instruction(0);
+  auto param_1 = function->parameter_instruction(1);
+  if (!(lhs == param_0 && rhs == param_1) &&
+      !(rhs == param_0 && lhs == param_1)) {
+    *failure_reason =
+        "root instruction is not a binary operation on the incoming arguments";
+    return nullptr;
+  }
+
+  CHECK(ShapeUtil::IsScalar(lhs->shape()) && ShapeUtil::IsScalar(rhs->shape()));
+
+  // This is visually similar to ElementalIrEmitter, though conceptually we're
+  // doing something different here.  ElementalIrEmitter emits scalar operations
+  // while these emit scalar or vector operations depending on the type of the
+  // operands.
+  switch (root_instruction->opcode()) {
+    default:
+      *failure_reason = "did not recognize root instruction opcode";
+      return nullptr;
+
+    case HloOpcode::kAdd:
+      return [root_is_integral](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
+                                llvm::Value* rhs) {
+        return root_is_integral ? ir_builder->CreateAdd(lhs, rhs)
+                                : ir_builder->CreateFAdd(lhs, rhs);
+      };
+
+    case HloOpcode::kMultiply:
+      return [root_is_integral](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
+                                llvm::Value* rhs) {
+        return root_is_integral ? ir_builder->CreateMul(lhs, rhs)
+                                : ir_builder->CreateFMul(lhs, rhs);
+      };
+
+    case HloOpcode::kLogicalAnd:
+      return [](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
+                llvm::Value* rhs) { return ir_builder->CreateAnd(lhs, rhs); };
+
+    case HloOpcode::kLogicalOr:
+      return [](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
+                llvm::Value* rhs) { return ir_builder->CreateOr(lhs, rhs); };
+
+    case HloOpcode::kMaximum:
+      return [root_is_floating_point, root_is_signed](
+                 llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
+                 llvm::Value* rhs) {
+        if (root_is_floating_point) {
+          return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::maxnum,
+                                              {lhs, rhs}, {lhs->getType()},
+                                              ir_builder);
+        }
+
+        return ir_builder->CreateSelect(
+            ir_builder->CreateICmp(root_is_signed ? llvm::ICmpInst::ICMP_SGE
+                                                  : llvm::ICmpInst::ICMP_UGE,
+                                   lhs, rhs),
+            lhs, rhs);
+      };
+
+    case HloOpcode::kMinimum:
+      return [root_is_floating_point, root_is_signed](
+                 llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
+                 llvm::Value* rhs) {
+        if (root_is_floating_point) {
+          return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::minnum,
+                                              {lhs, rhs}, {lhs->getType()},
+                                              ir_builder);
+        }
+
+        return ir_builder->CreateSelect(
+            ir_builder->CreateICmp(root_is_signed ? llvm::ICmpInst::ICMP_SLE
+                                                  : llvm::ICmpInst::ICMP_ULE,
+                                   lhs, rhs),
+            lhs, rhs);
+      };
+  }
+}
+
+IrEmitter::ShardedVectorType IrEmitter::CreateShardedVectorType(
+    PrimitiveType element_type, unsigned element_count) {
+  // Here we assume that the largest register is a vector register.
+  int max_vector_register_size_in_bytes =
+      target_machine_features_.largest_register_size_in_bytes(
+          compute_function_);
+
+  int vector_register_size_in_elements =
+      max_vector_register_size_in_bytes /
+      ShapeUtil::ByteSizeOfPrimitiveType(element_type);
+
+  ShardedVectorType sharded_vector_type;
+  llvm::Type* element_ir_type =
+      llvm_ir::PrimitiveTypeToIrType(element_type, &ir_builder_);
+
+  for (int i = 0, e = 1 + tensorflow::Log2Ceiling(element_count); i < e; i++) {
+    // For every power of two present in element_count, we generate one or more
+    // vector or scalar types.
+    const unsigned current_size_fragment = 1u << i;
+    if (!(element_count & current_size_fragment)) {
+      // Power of two not present in element_count.
+      continue;
+    }
+
+    if (current_size_fragment == 1) {
+      // Single element, use a scalar type.
+      sharded_vector_type.push_back(element_ir_type);
+      continue;
+    }
+
+    // Lower "current_size_fragment" number of elements using (as few as
+    // possible) vector registers.
+
+    if (current_size_fragment >= vector_register_size_in_elements) {
+      auto vector_type = llvm::VectorType::get(
+          element_ir_type, vector_register_size_in_elements);
+      sharded_vector_type.insert(
+          sharded_vector_type.end(),
+          current_size_fragment / vector_register_size_in_elements,
+          vector_type);
+
+      // Both current_size_fragment and vector_register_size_in_elements are
+      // powers of two.
+      CHECK_EQ(current_size_fragment % vector_register_size_in_elements, 0);
+      continue;
+    }
+
+    // For now we assume that vector_register_size_in_elements and lower powers
+    // of two are all legal vector sizes (or at least can be lowered easily by
+    // LLVM).
+    sharded_vector_type.push_back(
+        llvm::VectorType::get(element_ir_type, current_size_fragment));
+  }
+  return sharded_vector_type;
+}
+
+StatusOr<IrEmitter::ShardedVector>
+IrEmitter::EmitInnerLoopForVectorizedReduction(
+    const ReductionGenerator& reduction_generator,
+    const llvm_ir::IrArray::Index& output_index,
+    const ShardedVectorType& accumulator_type, HloInstruction* init_value,
+    HloInstruction* arg, tensorflow::gtl::ArraySlice<int64> dimensions,
+    unsigned element_alignment) {
+  ShardedVector accumulator;
+  accumulator.reserve(accumulator_type.size());
+  for (auto accumulator_shard_type : accumulator_type) {
+    accumulator.push_back(llvm_ir::EmitAllocaAtFunctionEntry(
+        accumulator_shard_type, "accumulator", &ir_builder_, 0));
+  }
+
+  llvm::Value* init_value_ssa =
+      ir_builder_.CreateLoad(GetEmittedValueFor(init_value));
+
+  for (llvm::Value* accumulator_shard : accumulator) {
+    llvm::Value* initial_value;
+    auto shard_type = accumulator_shard->getType()->getPointerElementType();
+    if (auto vector_type = llvm::dyn_cast<llvm::VectorType>(shard_type)) {
+      initial_value = ir_builder_.CreateVectorSplat(
+          vector_type->getNumElements(), init_value_ssa);
+    } else {
+      initial_value = init_value_ssa;
+    }
+
+    ir_builder_.CreateAlignedStore(initial_value, accumulator_shard,
+                                   element_alignment);
+  }
+
+  llvm_ir::ForLoopNest reduction_loop_nest(&ir_builder_);
+  llvm_ir::IrArray::Index reduced_dims_index =
+      reduction_loop_nest.AddLoopsForShapeOnDimensions(arg->shape(), dimensions,
+                                                       "reduction_dim");
+
+  SetToFirstInsertPoint(reduction_loop_nest.GetInnerLoopBodyBasicBlock(),
+                        &ir_builder_);
+
+  llvm_ir::IrArray arg_array(GetIrArrayForOp(arg));
+  llvm_ir::IrArray::Index input_index = reduced_dims_index;
+  llvm_ir::IrArray::Index::const_iterator it = output_index.begin();
+
+  for (size_t i = 0; i < input_index.size(); ++i) {
+    if (input_index[i] == nullptr) {
+      input_index[i] = *it++;
+    }
+  }
+  CHECK(output_index.end() == it);
+
+  llvm::Value* input_address = ir_builder_.CreateBitCast(
+      arg_array.EmitArrayElementAddress(input_index, &ir_builder_),
+      ir_builder_.getInt8PtrTy());
+
+  for (int i = 0; i < accumulator.size(); i++) {
+    auto input_address_typed =
+        ir_builder_.CreateBitCast(input_address, accumulator[i]->getType());
+    auto current_accumulator_value =
+        ir_builder_.CreateAlignedLoad(accumulator[i], element_alignment);
+    auto addend =
+        ir_builder_.CreateAlignedLoad(input_address_typed, element_alignment);
+    arg_array.AnnotateLoadStoreInstructionWithMetadata(addend);
+
+    auto reduced_result =
+        reduction_generator(&ir_builder_, current_accumulator_value, addend);
+    ir_builder_.CreateAlignedStore(reduced_result, accumulator[i],
+                                   element_alignment);
+
+    if (i != (accumulator.size() - 1)) {
+      input_address = ir_builder_.CreateConstInBoundsGEP1_32(
+          reduced_result->getType(), input_address_typed, 1);
+    }
+  }
+
+  SetToFirstInsertPoint(reduction_loop_nest.GetOuterLoopExitBasicBlock(),
+                        &ir_builder_);
+
+  ShardedVector result_ssa;
+  result_ssa.reserve(accumulator.size());
+  for (auto accumulator_shard : accumulator) {
+    result_ssa.push_back(
+        ir_builder_.CreateAlignedLoad(accumulator_shard, element_alignment));
+  }
+  return result_ssa;
+}
+
+void IrEmitter::EmitShardedVectorStore(
+    llvm::Value* store_address, const std::vector<llvm::Value*>& value_to_store,
+    const int alignment, const llvm_ir::IrArray& containing_array) {
+  for (int i = 0; i < value_to_store.size(); i++) {
+    auto store_address_typed = ir_builder_.CreateBitCast(
+        store_address,
+        llvm::PointerType::getUnqual(value_to_store[i]->getType()));
+
+    auto store_instruction = ir_builder_.CreateAlignedStore(
+        value_to_store[i], store_address_typed, alignment);
+    containing_array.AnnotateLoadStoreInstructionWithMetadata(
+        store_instruction);
+
+    if (i != (value_to_store.size() - 1)) {
+      store_address = ir_builder_.CreateConstInBoundsGEP1_32(
+          value_to_store[i]->getType(), store_address_typed, 1);
+    }
+  }
+}
+
+namespace {
+// TODO(sanjoy): This is duplicated in tensorflow/core/lib/core/arena.cc.
+// Extract out a common implementation to tensorflow/core/lib/math/math_util.h
+uint32 GCD(uint32 x, uint32 y) {
+  while (y != 0) {
+    uint32 r = x % y;
+    x = y;
+    y = r;
+  }
+  return x;
+}
+}  // namespace
+
+StatusOr<bool> IrEmitter::EmitVectorizedReduce(
+    HloInstruction* reduce, HloInstruction* arg, HloInstruction* init_value,
+    tensorflow::gtl::ArraySlice<int64> dimensions, HloComputation* function,
+    string* failure_reason) {
+  ReductionGenerator reduction_generator =
+      MatchReductionGenerator(function, failure_reason);
+  if (!reduction_generator) {
+    return false;
+  }
+
+  int vectorization_factor_in_bytes =
+      target_machine_features_.vectorization_factor_in_bytes();
+
+  // We try to process vectorization_factor elements at the same time.
+  const int vectorization_factor =
+      vectorization_factor_in_bytes /
+      ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type());
+
+  bool is_reduction_over_minor_dimension =
+      std::find(dimensions.begin(), dimensions.end(),
+                arg->shape().layout().minor_to_major(0)) != dimensions.end();
+
+  unsigned element_alignment =
+      GCD(ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type()),
+          MinimumAlignmentForPrimitiveType(reduce->shape().element_type()));
+
+  if (is_reduction_over_minor_dimension) {
+    // TODO(sanjoy): Implement vectorized reduction over the minor dimension.
+    *failure_reason = "reduction over minor dimension not implemented";
+    return false;
+  }
+
+  CHECK(!ShapeUtil::IsTuple(reduce->shape()));
+
+  // We know we're not reducing over the most minor dimension, which means we
+  // can lower the reduction loop as:
+  //
+  //  1. We're reducing over dimensions R0, R1.
+  //  2. D0 is the most minor dimension.
+  //  3. VS is the vectorization stride (we want to reduce this many elements at
+  //     once)
+  //
+  //  for (d1 in D1) {
+  //    for (d0 in D0 with stride VS) {
+  //      vector_acc = init
+  //      for (r1 in R1) {
+  //        for (r0 in R0) {
+  //          vector_acc = elementwise_reduce(vector_acc, input[d1, d0, r1, r0]
+  //        }
+  //      }
+  //      output[d1, d0] = vector_acc
+  //    }
+  //  }
+
+  llvm_ir::ForLoopNest loop_nest(&ir_builder_);
+  llvm_ir::IrArray::Index array_index(reduce->shape().dimensions_size());
+  for (int i = reduce->shape().layout().minor_to_major_size() - 1; i > 0; --i) {
+    int64 dimension = reduce->shape().layout().minor_to_major(i);
+    int64 start_index = 0;
+    int64 end_index = reduce->shape().dimensions(dimension);
+    std::unique_ptr<llvm_ir::ForLoop> loop =
+        loop_nest.AddLoop(start_index, end_index,
+                          tensorflow::strings::Printf("dim.%lld", dimension));
+    array_index[dimension] = loop->GetIndVarValue();
+  }
+
+  int64 innermost_dimension = reduce->shape().layout().minor_to_major(0);
+  int64 innermost_dimension_size =
+      reduce->shape().dimensions(innermost_dimension);
+
+  if (llvm::BasicBlock* innermost_body_bb =
+          loop_nest.GetInnerLoopBodyBasicBlock()) {
+    SetToFirstInsertPoint(innermost_body_bb, &ir_builder_);
+  }
+
+  auto outermost_loop_exit_block = loop_nest.GetOuterLoopExitBasicBlock();
+
+  if (innermost_dimension_size >= vectorization_factor) {
+    int64 start_index = 0;
+    int64 end_index = (innermost_dimension_size / vectorization_factor) *
+                      vectorization_factor;
+    std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
+        start_index, end_index, vectorization_factor,
+        tensorflow::strings::Printf("dim.%lld", innermost_dimension));
+    array_index[innermost_dimension] = loop->GetIndVarValue();
+
+    SetToFirstInsertPoint(loop->GetBodyBasicBlock(), &ir_builder_);
+
+    ShardedVectorType vector_type = CreateShardedVectorType(
+        reduce->shape().element_type(), vectorization_factor);
+    TF_ASSIGN_OR_RETURN(std::vector<llvm::Value*> accumulator,
+                        EmitInnerLoopForVectorizedReduction(
+                            reduction_generator, array_index, vector_type,
+                            init_value, arg, dimensions, element_alignment));
+
+    TF_ASSIGN_OR_RETURN(llvm::Value * target_address,
+                        EmitTargetAddressForOp(reduce));
+    llvm_ir::IrArray target_array(target_address, reduce->shape());
+    AddAliasingInformationToIrArray(*reduce, &target_array);
+    llvm::Value* output_address =
+        target_array.EmitArrayElementAddress(array_index, &ir_builder_);
+    EmitShardedVectorStore(output_address, accumulator, element_alignment,
+                           target_array);
+
+    if (auto exit_terminator = loop->GetExitBasicBlock()->getTerminator()) {
+      CHECK_GT(reduce->shape().layout().minor_to_major_size(), 1);
+      ir_builder_.SetInsertPoint(exit_terminator);
+    } else {
+      CHECK_EQ(reduce->shape().layout().minor_to_major_size(), 1);
+      ir_builder_.SetInsertPoint(loop->GetExitBasicBlock());
+    }
+  }
+
+  // Since we increment the stride for the inner dimension by more than 1, we
+  // may need to peel out an "epilogue" iteration to get the remaining elements
+  // in the following case:
+  if (innermost_dimension_size % vectorization_factor) {
+    // TODO(b/63775531): Consider using a scalar loop here to save on code size.
+    array_index[innermost_dimension] =
+        ir_builder_.getInt64(innermost_dimension_size -
+                             (innermost_dimension_size % vectorization_factor));
+
+    ShardedVectorType vector_type = CreateShardedVectorType(
+        reduce->shape().element_type(),
+        innermost_dimension_size % vectorization_factor);
+    TF_ASSIGN_OR_RETURN(std::vector<llvm::Value*> accumulator,
+                        EmitInnerLoopForVectorizedReduction(
+                            reduction_generator, array_index, vector_type,
+                            init_value, arg, dimensions, element_alignment));
+
+    TF_ASSIGN_OR_RETURN(llvm::Value * target_address,
+                        EmitTargetAddressForOp(reduce));
+    llvm_ir::IrArray target_array(target_address, reduce->shape());
+    AddAliasingInformationToIrArray(*reduce, &target_array);
+    llvm::Value* output_address =
+        target_array.EmitArrayElementAddress(array_index, &ir_builder_);
+    EmitShardedVectorStore(output_address, accumulator, element_alignment,
+                           target_array);
+  }
+
+  if (outermost_loop_exit_block) {
+    ir_builder_.SetInsertPoint(outermost_loop_exit_block);
+  }
+
+  TF_ASSIGN_OR_RETURN(llvm::Value * target_address,
+                      EmitTargetAddressForOp(reduce));
+
+  emitted_value_[reduce] = target_address;
+  return true;
+}
+
 Status IrEmitter::HandleReduce(HloInstruction* reduce, HloInstruction* arg,
                                HloInstruction* init_value,
                                tensorflow::gtl::ArraySlice<int64> dimensions,
                                HloComputation* function) {
+  if (!VectorizedReduceDisabled(hlo_module_config_)) {
+    string vectorization_failure_reason;
+    TF_ASSIGN_OR_RETURN(
+        bool vectorization_successful,
+        EmitVectorizedReduce(reduce, arg, init_value, dimensions, function,
+                             &vectorization_failure_reason));
+    if (vectorization_successful) {
+      VLOG(1) << "Successfully vectorized reduction " << reduce->ToString()
+              << "\n";
+      return Status::OK();
+    } else {
+      VLOG(1) << "Could not vectorize reduction " << reduce->ToString() << ": "
+              << vectorization_failure_reason;
+    }
+  }
+
   // The called computation should have been emitted previously.
   llvm::Function* reducer_function = FindOrDie(emitted_functions_, function);
   return EmitTargetElementLoop(
@@ -1140,13 +1938,143 @@ Status IrEmitter::HandleSend(HloInstruction* send) {
 }
 
 Status IrEmitter::HandleSlice(HloInstruction* slice, HloInstruction* operand) {
-  if (ShapeUtil::IsScalar(slice->shape())) {
-    TF_ASSIGN_OR_RETURN(llvm::Value * target_address,
-                        EmitTargetAddressForOp(slice));
-    emitted_value_[slice] = target_address;
-    return EmitMemcpy(*operand, *slice);
+  VLOG(2) << "HandleSlice: " << slice->ToString();
+
+  // The code below emits a sequential loop nest. For the parallel backend, use
+  // EmitParallelTargetElementLoop() which respects dynamic loop bounds.
+  if (ShouldEmitParallelLoopFor(*slice)) {
+    return DefaultAction(slice);
+  }
+
+  // The code below assumes the layouts are equal.
+  if (!LayoutUtil::Equal(operand->shape().layout(), slice->shape().layout())) {
+    return DefaultAction(slice);
+  }
+
+  TF_ASSIGN_OR_RETURN(llvm::Value * target_address,
+                      EmitTargetAddressForOp(slice));
+  emitted_value_[slice] = target_address;
+
+  if (ShapeUtil::HasZeroElements(slice->shape())) {
+    return Status::OK();
+  }
+
+  const Layout& layout = operand->shape().layout();
+  const int64 num_dims = operand->shape().dimensions_size();
+
+  // The slice lowering finds maximal contiguous blocks of memory that can be
+  // copied from the source to the target. This is done by looking at the
+  // source/target layout in minor to major order and do the following:
+  //
+  // * Find an initial segment of dimensions along which the slice uses the
+  //   whole dimension. These are the "inner" dimensions and can be folded into
+  //   the memcpy.
+  //
+  // * Of the remaining dimensions decide which ones require loops.
+  //
+  // * Implement the memcpy within the innermost loop.
+
+  tensorflow::gtl::FlatSet<int64> inner_dims;
+  for (int64 dim : layout.minor_to_major()) {
+    if (operand->shape().dimensions(dim) != slice->shape().dimensions(dim)) {
+      break;
+    }
+    inner_dims.insert(dim);
+  }
+
+  const bool is_trivial_copy = (inner_dims.size() == num_dims);
+  if (is_trivial_copy) {
+    if (ShapeUtil::IsEffectiveScalar(slice->shape())) {
+      return DefaultAction(slice);
+    } else {
+      return EmitMemcpy(*slice, *operand);
+    }
   }
-  return DefaultAction(slice);
+
+  // The memcpy will copy elements that are logically this shape (allowed to be
+  // scalar).
+  const Shape element_shape = ShapeUtil::FilterDimensions(
+      [&inner_dims](int64 dim) -> bool { return inner_dims.count(dim); },
+      operand->shape());
+
+  // memcpy_dim is the innermost (in terms of layout) dimension for which the
+  // slice does *not* just copy all the elements along the dimension.
+  const int64 memcpy_dim = layout.minor_to_major(inner_dims.size());
+
+  const bool memcpy_is_contiguous = slice->slice_strides(memcpy_dim) == 1;
+  // The number of logical elements that can be copied in a single call
+  // to memcpy. We can only copy 1 element at a time if there is a non-trivial
+  // stride.
+  const int64 memcpy_elements =
+      memcpy_is_contiguous
+          ? slice->slice_limits(memcpy_dim) - slice->slice_starts(memcpy_dim)
+          : 1;
+
+  if (memcpy_elements == 1 && ShapeUtil::IsEffectiveScalar(element_shape)) {
+    // Avoid using memcpy for copying element by element at a time. This does
+    // not buy us anything and may actually cause LLVM's load/store optimization
+    // to be less effective.
+    return DefaultAction(slice);
+  }
+
+  // Determine the dimensions that get lowered as loops.
+  std::vector<int64> outer_dims;
+  for (int64 i = 0; i < num_dims - inner_dims.size() - 1; ++i) {
+    outer_dims.push_back(LayoutUtil::Major(layout, i));
+  }
+
+  // Is the slice along the memcpy dimension contiguous? If not, then memcpy_dim
+  // needs to be wrapped around a loop as well.
+  if (!memcpy_is_contiguous) {
+    outer_dims.push_back(memcpy_dim);
+  }
+
+  llvm_ir::IrArray target_array(target_address, slice->shape());
+  AddAliasingInformationToIrArray(*slice, &target_array);
+
+  const int64 num_outer_loops = outer_dims.size();
+  llvm_ir::ForLoopNest loops(&ir_builder_);
+  llvm_ir::IrArray::Index target_index =
+      loops.AddLoopsForShapeOnDimensions(slice->shape(), outer_dims, "slice");
+
+  // Only the indices for the outer dimensions have been initialized in
+  // target_index. The rest of the indices should get initialized to 0, since
+  // for the rest of the dimensions the copy writes to the full dimension.
+  for (llvm::Value*& index : target_index) {
+    if (index == nullptr) {
+      index = ir_builder_.getInt64(0);
+    }
+  }
+
+  if (num_outer_loops > 0) {
+    SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_);
+  }
+
+  llvm_ir::IrArray source_array(GetEmittedValueFor(operand), operand->shape());
+
+  const llvm_ir::IrArray::Index source_index = target_index.SourceIndexOfSlice(
+      /*shape=*/slice->shape(), /*starts=*/slice->slice_starts(),
+      /*strides=*/slice->slice_strides(), /*builder=*/&ir_builder_);
+
+  llvm::Value* memcpy_dest = target_array.EmitArrayElementAddress(
+      target_index, &ir_builder_, "slice.dest");
+  llvm::Value* memcpy_source = source_array.EmitArrayElementAddress(
+      source_index, &ir_builder_, "slice.source");
+  const int64 memcpy_bytes =
+      ShapeUtil::ByteSizeOf(element_shape) * memcpy_elements;
+  // TODO(b/63762267): Be more aggressive with `align` by using the GCD of the
+  // element size and buffer alignment.
+  ir_builder_.CreateMemCpy(memcpy_dest, memcpy_source, memcpy_bytes,
+                           /*align=*/1);
+
+  VLOG(2) << "  emitted memcpy of " << memcpy_bytes << " bytes inside "
+          << num_outer_loops << " loops";
+
+  if (num_outer_loops > 0) {
+    SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_);
+  }
+
+  return Status::OK();
 }
 
 Status IrEmitter::HandleDynamicSlice(HloInstruction* dynamic_slice,
@@ -1283,7 +2211,7 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
     TF_RETURN_IF_ERROR(DotOpEmitter::EmitDotOperation(
         *dot, dot->operand(0)->IsRank2Transpose(),
         dot->operand(1)->IsRank2Transpose(), target_array, lhs_array, rhs_array,
-        GetExecutableRunOptionsArgument(), &ir_builder_));
+        GetExecutableRunOptionsArgument(), &ir_builder_, hlo_module_config_));
 
     emitted_value_[fusion] = target_address;
     return Status::OK();
@@ -1568,6 +2496,7 @@ void IrEmitter::ProfilingState::RecordCompleteComputation(
 }
 
 Status IrEmitter::Preprocess(HloInstruction* hlo) {
+  VLOG(3) << "Visiting: " << hlo->ToString();
   if (hlo_to_profile_idx_ && hlo_to_profile_idx_->count(hlo)) {
     profiling_state_.RecordCycleStart(&ir_builder_, hlo);
   }
@@ -1606,13 +2535,24 @@ llvm::Argument* IrEmitter::GetResultArgument() {
 }
 
 llvm::Argument* IrEmitter::GetProfileCountersArgument() {
-  return hlo_to_profile_idx_ ? GetArg(compute_function_, 4) : nullptr;
+  const int64 arg_index = num_dynamic_loop_bounds_ > 0 ? 5 : 4;
+  return hlo_to_profile_idx_ ? GetArg(compute_function_, arg_index) : nullptr;
 }
 
 llvm::Value* IrEmitter::GetTempBuffersArgument() {
   return GetArg(compute_function_, 3);
 }
 
+llvm::Value* IrEmitter::GetDynamicLoopBound(const int64 offset) {
+  CHECK_GT(num_dynamic_loop_bounds_, 0);
+  CHECK_LT(offset, num_dynamic_loop_bounds_ * 2);
+  llvm::Argument* loop_bounds_arg = GetArg(compute_function_, 4);
+  string name = tensorflow::strings::StrCat("dynamic_loop_bound_", offset);
+  return ir_builder_.CreateLoad(
+      ir_builder_.CreateGEP(loop_bounds_arg, ir_builder_.getInt64(offset),
+                            llvm_ir::AsStringRef(name)));
+}
+
 llvm::Value* IrEmitter::GetExecutableRunOptionsArgument() {
   return GetArg(compute_function_, 1);
 }
@@ -1645,11 +2585,14 @@ llvm::Value* IrEmitter::EmitTempBufferPointer(
       GetTempBuffersArgument(), slice.index(), &ir_builder_);
   llvm::LoadInst* tempbuf_address_base =
       ir_builder_.CreateLoad(tempbuf_address_ptr);
-  //  Loading the address of a buffer is invariant of the point at which the
-  //  load is executed in the program because we never reassign buffers.
-  tempbuf_address_base->setMetadata(
-      llvm::LLVMContext::MD_invariant_load,
-      llvm::MDNode::get(tempbuf_address_base->getContext(), /*MDs=*/{}));
+  if (hlo_module_config_.debug_options()
+          .xla_llvm_enable_invariant_load_metadata()) {
+    //  Loading the address of a buffer is invariant of the point at which the
+    //  load is executed in the program because we never reassign buffers.
+    tempbuf_address_base->setMetadata(
+        llvm::LLVMContext::MD_invariant_load,
+        llvm::MDNode::get(tempbuf_address_base->getContext(), /*MDs=*/{}));
+  }
   llvm_ir::SetTbaaForInstruction(tempbuf_address_base, target_shape,
                                  /*is_pointer_to=*/true);
   AttachAlignmentMetadataForLoad(tempbuf_address_base, allocation.size());
@@ -1739,13 +2682,13 @@ llvm::Value* IrEmitter::EmitArrayFunctionCall(
 }
 
 StatusOr<llvm::Value*> IrEmitter::EmitTargetAddressForOp(
-    const HloInstruction* op) {
-  const Shape& target_shape = op->shape();
-  if (op == op->parent()->root_instruction()) {
+    const HloInstruction* op, const ShapeIndex& shape_index) {
+  const Shape& target_shape = ShapeUtil::GetSubshape(op->shape(), shape_index);
+  if (op == op->parent()->root_instruction() && shape_index.empty()) {
     // For the root node, we write directly to the output buffer of the
     // function.
     llvm::Argument* retval = GetResultArgument();
-    if (!ShapeUtil::HasZeroElements(target_shape)) {
+    if (!ShapeUtil::IsNil(target_shape)) {
       llvm::AttrBuilder attr_builder;
       attr_builder.addAlignmentAttr(MinimumAlignmentForShape(target_shape));
       attr_builder.addDereferenceableAttr(ByteSizeOf(target_shape));
@@ -1773,16 +2716,103 @@ Status IrEmitter::EmitTargetElementLoop(
   TF_ASSIGN_OR_RETURN(llvm::Value * target_address,
                       EmitTargetAddressForOp(target_op));
   VLOG(2) << "  target address: " << llvm_ir::DumpToString(*target_address);
-  llvm_ir::IrArray target_array(target_address, target_shape);
-  AddAliasingInformationToIrArray(*target_op, &target_array);
 
-  TF_RETURN_IF_ERROR(
-      llvm_ir::LoopEmitter(element_generator, target_array, &ir_builder_)
-          .EmitLoop());
+  if (target_op->IsMultiOutputFusion()) {
+    // For multiple outputs fusion, we need to emit each operand and the root.
+    TF_RET_CHECK(num_dynamic_loop_bounds_ == 0);
+    std::vector<llvm_ir::IrArray> output_arrays;
+    for (int64 i = 0; i < ShapeUtil::TupleElementCount(target_shape); ++i) {
+      TF_ASSIGN_OR_RETURN(BufferAllocation::Slice slice,
+                          assignment_.GetUniqueSlice(target_op, {i}));
+      const Shape& element_shape = ShapeUtil::GetSubshape(target_shape, {i});
+      llvm::Value* op_target_address =
+          EmitTempBufferPointer(slice, element_shape);
+      output_arrays.push_back(
+          llvm_ir::IrArray(op_target_address, element_shape));
+    }
+    TF_RETURN_IF_ERROR(
+        llvm_ir::LoopEmitter(element_generator, output_arrays, &ir_builder_)
+            .EmitLoop());
+
+    std::vector<llvm::Value*> tuple_operand_ptrs;
+    for (int64 i = 0; i < output_arrays.size(); ++i) {
+      tuple_operand_ptrs.push_back(output_arrays[i].GetBasePointer());
+    }
+    llvm_ir::EmitTuple(llvm_ir::IrArray(target_address, target_shape),
+                       tuple_operand_ptrs, &ir_builder_);
+
+  } else {
+    llvm_ir::IrArray target_array(target_address, target_shape);
+    AddAliasingInformationToIrArray(*target_op, &target_array);
+
+    if (ShouldEmitParallelLoopFor(*target_op)) {
+      TF_RETURN_IF_ERROR(EmitParallelTargetElementLoop(
+          target_shape, element_generator, &target_array));
+    } else {
+      TF_RETURN_IF_ERROR(
+          llvm_ir::LoopEmitter(element_generator, target_array, &ir_builder_)
+              .EmitLoop());
+    }
+  }
+
   emitted_value_[target_op] = target_address;
   return Status::OK();
 }
 
+Status IrEmitter::EmitParallelTargetElementLoop(
+    const Shape& target_shape,
+    const llvm_ir::ElementGenerator& element_generator,
+    llvm_ir::IrArray* target_array) {
+  CHECK(!ShapeUtil::IsTuple(target_shape));
+  CHECK(!ShapeUtil::IsScalar(target_shape));
+
+  // Emit code to read dynamic loop bounds from function argument 4.
+  std::vector<llvm::Value*> dynamic_loop_bounds(2 * num_dynamic_loop_bounds_);
+  for (int i = 0; i < 2 * num_dynamic_loop_bounds_; ++i) {
+    dynamic_loop_bounds[i] = GetDynamicLoopBound(i);
+  }
+
+  llvm_ir::ForLoopNest loop_nest(&ir_builder_);
+  const int64 num_dims = target_shape.dimensions_size();
+  llvm_ir::IrArray::Index array_index(num_dims);
+
+  // Add loops from outer-most to inner-most dimensions.
+  for (int i = target_shape.layout().minor_to_major_size() - 1; i >= 0; --i) {
+    const int64 dimension = target_shape.layout().minor_to_major(i);
+    const int bounds_index = num_dims - 1 - i;
+    if (bounds_index < num_dynamic_loop_bounds_) {
+      // Emit dynamic loop bounds for this dimension. Dynamic loop bounds
+      // are read from ir function dynamic loop bounds argument.
+      llvm::Value* start_index = dynamic_loop_bounds[bounds_index * 2 + 0];
+      llvm::Value* end_index = dynamic_loop_bounds[bounds_index * 2 + 1];
+
+      std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
+          /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension),
+          start_index, end_index);
+      array_index[dimension] = loop->GetIndVarValue();
+    } else {
+      // Emit static loop bounds for this dimension.
+      std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
+          /*start_index=*/0,
+          /*end_index=*/target_shape.dimensions(dimension),
+          /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension));
+      array_index[dimension] = loop->GetIndVarValue();
+    }
+  }
+  // Point IR builder at inner loop BB.
+  SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(), &ir_builder_);
+
+  // Emit loop body.
+  TF_ASSIGN_OR_RETURN(llvm::Value * target_element,
+                      element_generator(array_index));
+  target_array->EmitWriteArrayElement(array_index, target_element,
+                                      &ir_builder_);
+  // Point IR builder at outer loop exit BB.
+  SetToFirstInsertPoint(loop_nest.GetOuterLoopExitBasicBlock(), &ir_builder_);
+
+  return Status::OK();
+}
+
 Status IrEmitter::EmitMemcpy(const HloInstruction& source,
                              const HloInstruction& destination) {
   llvm::Value* source_value = GetEmittedValueFor(&source);
@@ -1825,5 +2855,36 @@ Status IrEmitter::DefaultAction(HloInstruction* hlo) {
       hlo, elemental_emitter.MakeElementGenerator(hlo, operand_to_generator));
 }
 
+unsigned TargetMachineFeatures::largest_register_size_in_bytes(
+    llvm::Function* function) {
+  auto itr = largest_register_size_in_bytes_.find(function);
+  if (itr != largest_register_size_in_bytes_.end()) {
+    return itr->second;
+  }
+
+  int result = largest_register_size_in_bytes_impl(function);
+
+  InsertOrDie(&largest_register_size_in_bytes_, function, result);
+  DCHECK_EQ(result, largest_register_size_in_bytes_.begin()->second);
+  return result;
+}
+
+unsigned TargetMachineFeatures::largest_register_size_in_bytes_impl(
+    llvm::Function* function) const {
+  auto register_info =
+      target_machine_->getSubtargetImpl(*function)->getRegisterInfo();
+
+  unsigned largest_register_size = 0;
+  for (const llvm::TargetRegisterClass* register_class :
+       register_info->regclasses()) {
+    if (register_class->isAllocatable()) {
+      largest_register_size =
+          std::max(largest_register_size,
+                   register_info->getRegSizeInBits(*register_class));
+    }
+  }
+
+  return largest_register_size / 8;
+}
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index ebb7296a075f266870fa179a0791dd6d0f77e29f..1a77f695809d471f5c7d03d01ec291093326a9ef 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -41,12 +41,55 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 namespace cpu {
 
+// Wraps an llvm::TargetMachine and parses out some information that feeds into
+// code LLVM IR generation decisions.
+//
+// Ideally we'd be able to use llvm::TargetTransformInfo here (since its
+// interface is pretty much a perfect fit for our use case), but obtaining an
+// instance of llvm::TargetTransformInfo outside an LLVM pass pipeline without
+// super-ugly hacks is difficult.
+//
+// TODO(b/27457097): See if the LLVM community will be receptive to exposing an
+// API that lets us directly create and use llvm::TargetTransformInfo instances
+// outside of a pass manager.
+class TargetMachineFeatures {
+ public:
+  TargetMachineFeatures(llvm::TargetMachine* target_machine)
+      : target_machine_(target_machine) {}
+
+  // Return the vectorization factor, which is the number of bytes of data
+  // explicitly vectorized routines will try to process at once.
+  int vectorization_factor_in_bytes() const {
+    // Ideally this should be a function of the cache line size (which we can
+    // get from llvm::TargetTransformInfo::getCacheLineSize) of the target
+    // machine.  Guess a value of 128 bytes for now.
+    return 128;
+  }
+
+  // Return the size of the largest register size in bytes.  We need to pass in
+  // "function" since llvm functions can contain annotations for specializing
+  // them to specific micro-architectures (though currently XLA does not use
+  // this functionality).
+  //
+  // Ideally we should have been able to use
+  // llvm::TargetTransformInfo::getRegisterBitWidth(true) here.
+  unsigned largest_register_size_in_bytes(llvm::Function* function);
+
+ private:
+  unsigned largest_register_size_in_bytes_impl(llvm::Function* function) const;
+
+  tensorflow::gtl::FlatMap<llvm::Function*, int>
+      largest_register_size_in_bytes_;
+  llvm::TargetMachine* target_machine_;
+};
+
 // This class is the top-level API for the XLA HLO --> LLVM IR compiler.  It
 // implements the DfsHloVisitor interface and emits HLO computations as LLVM IR
 // functions.
@@ -63,7 +106,8 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   IrEmitter(const HloModule& hlo_module, const BufferAssignment& assignment,
             llvm::Module* llvm_module,
             const std::unordered_map<const HloInstruction*, size_t>*
-                hlo_to_profile_idx);
+                hlo_to_profile_idx,
+            llvm::TargetMachine* target_machine);
   ~IrEmitter() override;
 
   // Emit and return the given HLO computation as an LLVM IR
@@ -96,7 +140,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   Status HandleBitcast(HloInstruction* bitcast) override;
   Status HandleConstant(HloInstruction* constant,
                         const Literal& literal) override;
-  Status HandleCopy(HloInstruction* copy, HloInstruction* operand) override;
+  Status HandleCopy(HloInstruction* copy) override;
   Status HandleGetTupleElement(HloInstruction* get_tuple_element,
                                HloInstruction* operand) override;
   Status HandleSelect(HloInstruction* select, HloInstruction* pred,
@@ -106,9 +150,11 @@ class IrEmitter : public DfsHloVisitorWithDefault {
                    HloInstruction* rhs) override;
   Status HandleConvolution(HloInstruction* convolution, HloInstruction* lhs,
                            HloInstruction* rhs, const Window& window) override;
+  Status HandleBatchNormTraining(HloInstruction* batch_norm_training) override;
+  Status HandleBatchNormGrad(HloInstruction* batch_norm_grad) override;
   Status HandleCrossReplicaSum(HloInstruction* crs) override;
   Status HandleInfeed(HloInstruction* infeed) override;
-  Status HandleOutfeed(HloInstruction* infeed) override;
+  Status HandleOutfeed(HloInstruction* outfeed) override;
   Status HandleSort(HloInstruction* sort, HloInstruction* operand) override;
   Status HandleParameter(HloInstruction* parameter) override;
   Status HandleReduce(HloInstruction* reduce, HloInstruction* arg,
@@ -192,6 +238,11 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // computation function being emitted by this emitter.
   llvm::Value* GetTempBuffersArgument();
 
+  // Emit ir to read and return the ir value for the dynamic loop bound at
+  // 'offset' from the "dynamic_loop_bounds" argument of the computation
+  // function being emitted by this emitter.
+  llvm::Value* GetDynamicLoopBound(const int64 offset);
+
   // Emits code that computes the address of the given temporary buffer to the
   // function. target_shape is the shape of this temporary buffer.
   // The returned Value's type is a pointer to element_type.
@@ -262,6 +313,15 @@ class IrEmitter : public DfsHloVisitorWithDefault {
       HloInstruction* target_op,
       const llvm_ir::ElementGenerator& element_generator);
 
+  // Emit IR to perform a computation for every element in a partition/slice of
+  // 'target_shape'. The loop bounds for the outer-dimension partitions are
+  // passed into the compute function as a runtime argument (accessible from
+  // GetDynamicLoopBound).
+  Status EmitParallelTargetElementLoop(
+      const Shape& target_shape,
+      const llvm_ir::ElementGenerator& element_generator,
+      llvm_ir::IrArray* target_array);
+
   // Emits a memcpy from the source instruction's result value to the
   // destination's.  Both source and destination must have an entry in the
   // emitted_value_ table.
@@ -271,7 +331,8 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // Emit IR to compute the target address of the buffer for the given op.
   // The returned Value is a pointer to a IR type that represents the op's
   // element type.
-  StatusOr<llvm::Value*> EmitTargetAddressForOp(const HloInstruction* op);
+  StatusOr<llvm::Value*> EmitTargetAddressForOp(
+      const HloInstruction* op, const ShapeIndex& shape_index = {});
 
   // Structurizes "array_elements" into an MD array that represents "shape".
   // This is a recursive function, and "dimension_index" indicates the index of
@@ -281,6 +342,71 @@ class IrEmitter : public DfsHloVisitorWithDefault {
       const std::vector<llvm::Constant*>& array_elements, const Shape& shape,
       int64 dimension_index);
 
+  // Tries to codegen a reduction operation using vectorized instructions.
+  // Returns true if successful, and false on failure.  On failure, sets
+  // "failure_reason" to a string describing why it could not vectorize the
+  // reduction.
+  //
+  // TODO(sanjoy): Some of the things we do here can be abstracted out into
+  // concepts that generalize over other vectorizable operations.  We should
+  // consider pulling out these abstractions into a VectorizingIrEmitter or
+  // something similar.
+  StatusOr<bool> EmitVectorizedReduce(
+      HloInstruction* reduce, HloInstruction* arg, HloInstruction* init_value,
+      tensorflow::gtl::ArraySlice<int64> dimensions, HloComputation* function,
+      string* failure_reason);
+
+  // We'd like to keep one or two one cache-line's worth of data in registers
+  // without generating IR with illegal (e.g. excessively large or
+  // non-power-of-two) vector types.  We do this by introducing a layer of
+  // abstraction: we introduce a high level vector-like concept called a
+  // "sharded vector" that models data paralleism, and is mapped to a sequence
+  // scalar and vector llvm::Value s.
+  //
+  // For example, we can represent 29 f32 elements by a sharded vector mapped to
+  // a sequence of LLVM values of types [<16 x f32>, <8 x f32>, <4 x f32>, f32].
+  // Note that the last element is scalar.
+  //
+  // There is no requirement on the ordering or the uniqueness of the elements
+  // mapped to sharded vectors -- we allow repeated elements, and we allow
+  // elements to appear in any order.
+  using ShardedVector = std::vector<llvm::Value*>;
+
+  // A sharded vector type is the element-wise llvm::Type's of some
+  // ShardedVector.
+  using ShardedVectorType = std::vector<llvm::Type*>;
+
+  // Create a sharded vector type corresponding to a "element_count" long
+  // sequence of "element_type" values.
+  ShardedVectorType CreateShardedVectorType(PrimitiveType element_type,
+                                            unsigned element_count);
+
+  // Emit LLVM IR to store the sharded vector "value_to_store" to
+  // "store_address".
+  void EmitShardedVectorStore(llvm::Value* store_address,
+                              const ShardedVector& value_to_store,
+                              const int alignment,
+                              const llvm_ir::IrArray& containing_array);
+
+  using ReductionGenerator = std ::function<llvm::Value*(
+      llvm::IRBuilder<>*, llvm::Value*, llvm::Value*)>;
+
+  // Tries to match the reduction function "function" to a known reduction
+  // pattern.  Returns a non-null ReductionGenerator on a successful match,
+  // which can be used to generate the LLVM IR corresponding to said reduction.
+  // On failure, this stores a reason string into "failure_reason".
+  ReductionGenerator MatchReductionGenerator(HloComputation* function,
+                                             string* failure_reason) const;
+
+  // Emits the inner loop nest that runs the reduction.  Helper function for
+  // EmitVectorizedReduce.
+  StatusOr<ShardedVector> EmitInnerLoopForVectorizedReduction(
+      const ReductionGenerator& reduction_generator,
+      const llvm_ir::IrArray::Index& output_index,
+      const ShardedVectorType& accumulator_type, HloInstruction* init_value,
+      HloInstruction* arg, tensorflow::gtl::ArraySlice<int64> dimensions,
+      unsigned element_alignment);
+
   // Name of the computation entry function. This function serves as the
   // top-level "main" of the computation and will be invoked by the JIT.
   string entry_function_name_;
@@ -319,6 +445,18 @@ class IrEmitter : public DfsHloVisitorWithDefault {
 
   llvm_ir::AliasAnalysis alias_analysis_;
 
+  // The number of root instruction outer dimensions used in parallel loop
+  // emission (EmitParallelTargetElementLoop).
+  int64 num_dynamic_loop_bounds_ = 0;
+
+  // Returns whether the given instruction should be emitted as a parallel loop.
+  bool ShouldEmitParallelLoopFor(const HloInstruction& op) const {
+    // Emit parallel loop for root instruction if dynamic outer-dimension loop
+    // bounds were specified.
+    return num_dynamic_loop_bounds_ > 0 &&
+           op.parent()->root_instruction() == &op;
+  }
+
   // This struct contains all the state needed to emit instructions for
   // profiling a computation.
   class ProfilingState {
@@ -404,8 +542,20 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // Returns the number of bytes within the shape.
   int64 ByteSizeOf(const Shape& shape) const;
 
+  enum class XfeedKind {
+    kInfeed,
+    kOutfeed,
+  };
+
+  // Emit IR to transfer between a {infeed,outfeed} buffer and an in-program
+  // address.
+  Status EmitXfeedTransfer(XfeedKind kind, const Shape& shape,
+                           llvm::Value* program_buffer_address);
+
   const HloModuleConfig& hlo_module_config_;
 
+  TargetMachineFeatures target_machine_features_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(IrEmitter);
 };
 
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
index bdddca99c2f50c47ab112eda92ab1509f5448849..f0af3e7b894af875c222b184873dcc4cc9e79b8f 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
+#include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -70,7 +71,7 @@ ParallelCpuExecutable::ParallelCpuExecutable(
 
 // Type of the computation function we expect in the JIT.
 using ComputeFunctionType = void (*)(void*, const void*, const void**, void**,
-                                     uint64*);
+                                     int64*, uint64*);
 
 // Given a pointer to an output buffer (following the CPU JIT calling
 // conventions), mark addresses that are "live". The initial pointer itself is
@@ -95,6 +96,232 @@ static void MarkLiveAddressesInOutput(
   }
 }
 
+namespace {
+
+// Executor manages the concurrent execution of 'functions' for instructions
+// in 'pending' on 'thread_pool' (storing resulting data in 'results').
+class Executor {
+ public:
+  Executor(const std::map<HloInstruction*, ComputeFunctionType>& functions,
+           const ServiceExecutableRunOptions* run_options,
+           std::list<HloInstruction*>* pending,
+           std::map<HloInstruction*, const void*>* results, void** temps_array,
+           uint64* profile_counters_array, BufferAssignment* assignment)
+      : functions_(functions),
+        run_options_(run_options),
+        pending_(pending),
+        results_(results),
+        temps_array_(temps_array),
+        profile_counters_array_(profile_counters_array),
+        thread_pool_(CHECK_NOTNULL(run_options_->xla_intra_op_thread_pool())),
+        assignment_(assignment) {}
+
+  // Executes pending list of instructions on thread pool.
+  // Returns OK status on success, error status otherwise.
+  Status Run();
+
+ private:
+  // Schedules a parallel invocation of compute function for 'instruction' on
+  // 'thread_pool_', storing result in 'result_buffer'.
+  // If 'partition_buffers' is non-null, parallel task will be invoked on
+  // per-dimension partition [start, limit) values stored in
+  // 'partition_buffers'.
+  void Schedule(HloInstruction* instruction, int64* partition_buffers,
+                void* result_buffer);
+
+  // Returns true if 'instruction' has been assigned parallel tasks (returns
+  // false otherwise).
+  bool HasParallelTasks(HloInstruction* instruction);
+
+  // Returns in 'partition_buffers' the partition [size, limit) for each
+  // dimension.
+  int64* GetPartitionBuffers(
+      const std::vector<std::pair<int64, int64>>& partition);
+
+  // Returns array of result buffers for all operands in 'instruction'.
+  const void** GetOperandBuffers(HloInstruction* instruction);
+
+  // Arguments passed into Executor.
+  const std::map<HloInstruction*, ComputeFunctionType>& functions_;
+  const ServiceExecutableRunOptions* run_options_;
+  std::list<HloInstruction*>* pending_;
+  std::map<HloInstruction*, const void*>* results_;
+  void** temps_array_;
+  uint64* profile_counters_array_;
+  tensorflow::thread::ThreadPool* thread_pool_;
+  BufferAssignment* assignment_;
+
+  // Members used to manage instruction execution.
+  tensorflow::mutex completion_queue_lock_;
+  tensorflow::condition_variable completion_queue_cv_;
+  std::deque<HloInstruction*> completion_queue_;
+  int64 instructions_in_flight_ = 0;
+  std::unordered_map<const HloInstruction*, int64> tasks_in_flight_;
+};
+
+Status Executor::Run() {
+  while (!pending_->empty() || instructions_in_flight_ > 0) {
+    auto pending_it = pending_->begin();
+    while (pending_it != pending_->end()) {
+      HloInstruction* instruction = *pending_it;
+      // Skip pending instructions whose operands aren't ready.
+      if (std::any_of(instruction->operands().begin(),
+                      instruction->operands().end(),
+                      [&](HloInstruction* operand) {
+                        return !ContainsKey(*results_, operand);
+                      })) {
+        ++pending_it;
+        continue;
+      }
+
+      // Get 'result_buffer' reference to result buffer for 'instruction'.
+      TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
+                          assignment_->GetUniqueTopLevelSlice(instruction));
+      void* result_buffer =
+          static_cast<char*>(temps_array_[result_slice.index()]) +
+          result_slice.offset();
+
+      if (HasParallelTasks(instruction)) {
+        // 'instruction' has been assigned parallel task partitions.
+        CHECK_EQ(HloOpcode::kCall, instruction->opcode());
+        HloInstruction* root = instruction->to_apply()->root_instruction();
+
+        // Create ShapePartitionIterator to iterate through all outer dimension
+        // partitions of 'instruction'.
+        ShapePartitionIterator partition_iterator(
+            root->shape(), root->outer_dimension_partitions());
+
+        const int64 partition_count =
+            partition_iterator.GetTotalPartitionCount();
+
+        // Record total parallel task count for 'instruction' before dispatch.
+        {
+          tensorflow::mutex_lock l(completion_queue_lock_);
+          tasks_in_flight_.insert(std::make_pair(instruction, partition_count));
+          VLOG(2) << "Schedule PARALLEL"
+                  << " instruction: " << instruction->name()
+                  << " instruction.callee: "
+                  << instruction->to_apply()->root_instruction()->name()
+                  << " partition_count: " << partition_count;
+        }
+
+        for (int64 i = 0; i < partition_count; ++i) {
+          // Get partition [start, limit) for each dimension.
+          auto partition_buffers =
+              GetPartitionBuffers(partition_iterator.GetPartition(i));
+          Schedule(instruction, partition_buffers, result_buffer);
+        }
+
+      } else {
+        // Set tasks in-flight to '1' for sequential instruction execution.
+        {
+          tensorflow::mutex_lock l(completion_queue_lock_);
+          tasks_in_flight_.insert(std::make_pair(instruction, 1));
+          VLOG(2) << "Schedule SEQUENTIAL"
+                  << " instruction: " << instruction->name()
+                  << " instruction.callee: "
+                  << instruction->to_apply()->root_instruction()->name();
+        }
+        Schedule(instruction, nullptr, result_buffer);
+      }
+
+      ++instructions_in_flight_;
+      pending_it = pending_->erase(pending_it);
+    }
+    // Wait for a completed HLO instruction to be present in the queue.  We will
+    // pop it out of the queue and make the result available to its users.
+    HloInstruction* instruction;
+    do {
+      tensorflow::mutex_lock l(completion_queue_lock_);
+      if (completion_queue_.empty()) {
+        completion_queue_cv_.wait(l);
+      }
+      if (!completion_queue_.empty()) {
+        instruction = completion_queue_.front();
+        completion_queue_.pop_front();
+        break;
+      }
+    } while (1);
+    TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
+                        assignment_->GetUniqueTopLevelSlice(instruction));
+    void* result_buffer =
+        static_cast<char*>(temps_array_[result_slice.index()]) +
+        result_slice.offset();
+    InsertOrDie(results_, instruction, result_buffer);
+    --instructions_in_flight_;
+  }
+  return Status::OK();
+}
+
+void Executor::Schedule(HloInstruction* instruction, int64* partition_buffers,
+                        void* result_buffer) {
+  // The thread pool entry takes ownership of |operand_buffers|.
+  auto operand_buffers = GetOperandBuffers(instruction);
+
+  auto function = FindOrDie(functions_, instruction);
+  const auto* exec_run_options = &run_options_->run_options();
+  thread_pool_->Schedule([this, instruction, result_buffer, operand_buffers,
+                          partition_buffers, exec_run_options, function]() {
+    function(result_buffer, exec_run_options, operand_buffers, temps_array_,
+             partition_buffers, profile_counters_array_);
+
+    delete[] operand_buffers;
+    delete[] partition_buffers;
+    // Push the completed HLO instruction on the queue, the main
+    // thread will pop it off and potentially launch more work which
+    // uses the result.
+    // TODO(b/27458679) Consider alternative task scheduling and synchronization
+    // schemes. For example, we could avoid the overhead associate with the
+    // condvar here if the thread just dequed the next instruction to execute
+    // on completion.
+    {
+      tensorflow::mutex_lock l(completion_queue_lock_);
+      // Decrement in-flight task count for this completion.
+      if (--FindOrDie(tasks_in_flight_, instruction) == 0) {
+        completion_queue_.push_back(instruction);
+        completion_queue_cv_.notify_all();
+        tasks_in_flight_.erase(instruction);
+      }
+    }
+  });
+}
+
+int64* Executor::GetPartitionBuffers(
+    const std::vector<std::pair<int64, int64>>& partition) {
+  // Return in 'partition_buffers' partition [size, limit) for each dimension.
+  auto partition_buffers = new int64[partition.size() * 2];
+  for (int i = 0; i < partition.size(); ++i) {
+    partition_buffers[2 * i + 0] = partition[i].first;
+    partition_buffers[2 * i + 1] = partition[i].first + partition[i].second;
+  }
+  return partition_buffers;
+}
+
+bool Executor::HasParallelTasks(HloInstruction* instruction) {
+  return instruction->opcode() == HloOpcode::kCall &&
+         !instruction->to_apply()
+              ->root_instruction()
+              ->outer_dimension_partitions()
+              .empty();
+}
+
+const void** Executor::GetOperandBuffers(HloInstruction* instruction) {
+  // We cannot use a move-only RAII type like std::unique_ptr because the
+  // list of operands is allocated on the main thread and transferred to the
+  // worker via the lambda passed to enqueue_function.  In order for the
+  // lambda to take ownership, we would need to use generalized lambda
+  // capture which is a feature new to C++14.
+  // TODO(b/27458679) Avoid dynamic allocations in Executor.
+  auto operand_buffers = new const void*[instruction->operand_count()];
+  std::transform(instruction->operands().begin(), instruction->operands().end(),
+                 operand_buffers, [this](HloInstruction* operand) {
+                   return FindOrDie(*results_, operand);
+                 });
+  return operand_buffers;
+}
+
+}  // namespace
+
 Status ParallelCpuExecutable::AllocateBuffers(
     DeviceMemoryAllocator* memory_allocator, int device_ordinal,
     std::vector<perftools::gputools::DeviceMemoryBase>* buffers) {
@@ -180,8 +407,9 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
     HloInstruction* instruction = entry.first;
     llvm::JITSymbol sym = jit_->FindSymbol(entry.second);
     TF_RET_CHECK(sym);
-    InsertOrDie(&functions, instruction,
-                reinterpret_cast<ComputeFunctionType>(sym.getAddress()));
+    InsertOrDie(
+        &functions, instruction,
+        reinterpret_cast<ComputeFunctionType>(cantFail(sym.getAddress())));
   }
 
   // Map containing pointers to result buffers for each instruction.
@@ -210,88 +438,16 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
     }
   }
 
-  void** temps_array = buffer_pointers.data();
-  uint64* profile_counters_array = profile_counters.data();
-  auto* thread_pool = CHECK_NOTNULL(run_options->xla_intra_op_thread_pool());
-  tensorflow::mutex completion_queue_lock;
-  tensorflow::condition_variable completion_queue_cv;
-  std::deque<HloInstruction*> completion_queue;
-  int64 instructions_in_flight = 0;
-  while (!pending.empty() || instructions_in_flight > 0) {
-    auto pending_it = pending.begin();
-    while (pending_it != pending.end()) {
-      HloInstruction* instruction = *pending_it;
-      // Skip pending instructions whose operands aren't ready.
-      if (std::any_of(instruction->operands().begin(),
-                      instruction->operands().end(),
-                      [&](HloInstruction* operand) {
-                        return !ContainsKey(results, operand);
-                      })) {
-        ++pending_it;
-        continue;
-      }
+  // TODO(b/27458679) Manage scheduling based on in-flight concurrency limits.
+  // For example, if we expect a library conv/matmul call to run at max
+  // concurrency, we should not dispatch runnable instructions until the
+  // library call is finished (to avoid expensive cache invalidation).
+  Executor executor(functions, run_options, &pending, &results,
+                    buffer_pointers.data(), profile_counters.data(),
+                    assignment_.get());
 
-      TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
-                          assignment_->GetUniqueTopLevelSlice(instruction));
-      void* result_buffer =
-          static_cast<char*>(temps_array[result_slice.index()]) +
-          result_slice.offset();
-      // We cannot use a move-only RAII type like std::unique_ptr because the
-      // list of operands is allocated on the main thread and transferred to the
-      // worker via the lambda passed to enqueue_function.  In order for the
-      // lambda to take ownership, we would need to use generalized lambda
-      // capture which is a feature new to C++14.
-      auto operand_buffers = new const void*[instruction->operand_count()];
-      std::transform(instruction->operands().begin(),
-                     instruction->operands().end(), operand_buffers,
-                     [&results](HloInstruction* operand) {
-                       return FindOrDie(results, operand);
-                     });
-      auto function = FindOrDie(functions, instruction);
-      // The thread pool entry takes ownership of |operand_buffers|.
-      const auto* exec_run_options = &run_options->run_options();
-      thread_pool->Schedule([instruction, &completion_queue,
-                             &completion_queue_lock, &completion_queue_cv,
-                             result_buffer, exec_run_options, operand_buffers,
-                             temps_array, profile_counters_array, function] {
-        function(result_buffer, exec_run_options, operand_buffers, temps_array,
-                 profile_counters_array);
-        delete[] operand_buffers;
-        // Push the completed HLO instruction on the queue, the main thread
-        // will pop it off and potentially launch more work which uses the
-        // result.
-        {
-          tensorflow::mutex_lock l(completion_queue_lock);
-          completion_queue.push_back(instruction);
-          completion_queue_cv.notify_all();
-        }
-      });
+  TF_RETURN_IF_ERROR(executor.Run());
 
-      ++instructions_in_flight;
-      pending_it = pending.erase(pending_it);
-    }
-    // Wait for a completed HLO instruction to be present in the queue.  We will
-    // pop it out of the queue and make the result available to its users.
-    HloInstruction* instruction;
-    do {
-      tensorflow::mutex_lock l(completion_queue_lock);
-      if (completion_queue.empty()) {
-        completion_queue_cv.wait(l);
-      }
-      if (!completion_queue.empty()) {
-        instruction = completion_queue.front();
-        completion_queue.pop_front();
-        break;
-      }
-    } while (1);
-    TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
-                        assignment_->GetUniqueTopLevelSlice(instruction));
-    void* result_buffer =
-        static_cast<char*>(temps_array[result_slice.index()]) +
-        result_slice.offset();
-    InsertOrDie(&results, instruction, result_buffer);
-    --instructions_in_flight;
-  }
   uint64 end_micros = tensorflow::Env::Default()->NowMicros();
 
   {
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index 6d5f790c3941af5cac098fd39c1dace5564cee5b..a3fe2657989ef2a7bd001e49d1baab57b3def839 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -89,6 +89,12 @@ class ParallelCpuExecutable : public Executable {
     return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
   }
 
+  const Status EqualOrFail(const Executable& executable) {
+    // TODO(b/62952745) Implement equality test on CPU parallel executable.
+    return Unimplemented(
+        "Equality test on CPU parallel executable is not implemented.");
+  }
+
  private:
   // Allocate buffers required for execution and assign them to the elements of
   // "buffers". "buffers" should be sized to the number of buffers in buffer
diff --git a/tensorflow/compiler/xla/service/cpu/sample_harness.cc b/tensorflow/compiler/xla/service/cpu/sample_harness.cc
index 8f1ce82d49a1c7cabfb62bf30e69faedc0318138..b3f4609d465efb4df8921abb684bafd263fe040f 100644
--- a/tensorflow/compiler/xla/service/cpu/sample_harness.cc
+++ b/tensorflow/compiler/xla/service/cpu/sample_harness.cc
@@ -38,13 +38,12 @@ int main(int argc, char** argv) {
 
   // Transfer parameters.
   std::unique_ptr<xla::Literal> param0_literal =
-      xla::LiteralUtil::CreateR1<float>({1.1f, 2.2f, 3.3f, 5.5f});
+      xla::Literal::CreateR1<float>({1.1f, 2.2f, 3.3f, 5.5f});
   std::unique_ptr<xla::GlobalData> param0_data =
       client->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
-  std::unique_ptr<xla::Literal> param1_literal =
-      xla::LiteralUtil::CreateR2<float>(
-          {{3.1f, 4.2f, 7.3f, 9.5f}, {1.1f, 2.2f, 3.3f, 4.4f}});
+  std::unique_ptr<xla::Literal> param1_literal = xla::Literal::CreateR2<float>(
+      {{3.1f, 4.2f, 7.3f, 9.5f}, {1.1f, 2.2f, 3.3f, 4.4f}});
   std::unique_ptr<xla::GlobalData> param1_data =
       client->TransferToServer(*param1_literal).ConsumeValueOrDie();
 
@@ -69,7 +68,7 @@ int main(int argc, char** argv) {
 
   LOG(INFO) << tensorflow::strings::Printf("computation took %lldns",
                                            profile.compute_time_ns());
-  LOG(INFO) << xla::LiteralUtil::ToString(*actual);
+  LOG(INFO) << actual->ToString();
 
   return 0;
 }
diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition.cc b/tensorflow/compiler/xla/service/cpu/shape_partition.cc
new file mode 100644
index 0000000000000000000000000000000000000000..61b408b8c24dded134218110d4e219c31f1685a8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/shape_partition.cc
@@ -0,0 +1,160 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
+
+namespace xla {
+namespace cpu {
+
+std::vector<int64> ShapePartitionAssigner::Run(int64 target_partition_count) {
+  // Gather outer-most dims where dim_size >= 'target_partition_count'.
+  // Note: always leave inner-dim static for vectorization/optimizations.
+  std::vector<int64> outer_dims;
+  int64 outer_dim_size = 1;
+  // TODO(b/27458679) Consider reserving enough minor dimensions (based on
+  // target vector register width) to enable vector instructions.
+  for (int i = shape_.layout().minor_to_major_size() - 1; i >= 1; --i) {
+    const int64 dimension = shape_.layout().minor_to_major(i);
+    outer_dims.push_back(dimension);
+    outer_dim_size *= shape_.dimensions(dimension);
+    if (outer_dim_size >= target_partition_count) {
+      break;
+    }
+  }
+
+  // Clip target partition count if outer dim size is insufficient to cover.
+  target_partition_count = std::min(outer_dim_size, target_partition_count);
+
+  // Calculate the target number of partitions per-dimension, by factoring
+  // 'target_partition_count' into 'num_outer_dims' equal terms.
+  // EX:
+  // *) target_partition_count = 16
+  // *) out_dim_count = 2
+  // *) target_dim_partition_count = 16 ^ (1.0 / 2) == 4
+  const int64 target_dim_partition_count = std::pow(
+      static_cast<double>(target_partition_count), 1.0 / outer_dims.size());
+
+  // Assign feasible dimension partitions based on 'target_dim_partition_count'
+  // and actual dimension sizes from 'shape_'.
+  std::vector<int64> dimension_partition_counts(outer_dims.size());
+  for (int64 i = 0; i < outer_dims.size(); ++i) {
+    dimension_partition_counts[i] =
+        std::min(static_cast<int64>(shape_.dimensions(outer_dims[i])),
+                 target_dim_partition_count);
+  }
+
+  // Check if total partition count is below 'target_partition_count'.
+  // This can occur if some dimensions in 'shape_' are below the
+  // 'target_dim_partition_count' threshold.
+  if (GetTotalPartitionCount(dimension_partition_counts) <
+      target_partition_count) {
+    // Assign additional partitions (greedily to outer dimensions), if doing
+    // so would keep the total number of partitions <= 'target_partition_count',
+    // using one pass over 'dimension_partition_counts'.
+    for (int64 i = 0; i < dimension_partition_counts.size(); ++i) {
+      const int64 current_dim_partition_count = dimension_partition_counts[i];
+      const int64 other_dims_partition_count =
+          GetTotalPartitionCount(dimension_partition_counts) /
+          current_dim_partition_count;
+      // Constraint: (current + additional) * other <= target
+      // Calculate: additional = target / other - current
+      int64 additional_partition_count =
+          target_partition_count / other_dims_partition_count -
+          current_dim_partition_count;
+      // Clip 'additional_partition_count' by current dimension size.
+      additional_partition_count = std::min(
+          shape_.dimensions(outer_dims[i]) - dimension_partition_counts[i],
+          additional_partition_count);
+      if (additional_partition_count > 0) {
+        dimension_partition_counts[i] += additional_partition_count;
+      }
+    }
+  }
+
+  return dimension_partition_counts;
+}
+
+int64 ShapePartitionAssigner::GetTotalPartitionCount(
+    const std::vector<int64>& dimension_partition_counts) {
+  int64 total_partition_count = 1;
+  for (int64 dim_partition_count : dimension_partition_counts) {
+    total_partition_count *= dim_partition_count;
+  }
+  return total_partition_count;
+}
+
+ShapePartitionIterator::ShapePartitionIterator(
+    const Shape& shape, const std::vector<int64>& dimension_partition_counts)
+    : shape_(shape),
+      dimension_partition_counts_(dimension_partition_counts),
+      dimensions_(dimension_partition_counts_.size()),
+      dimension_partition_sizes_(dimension_partition_counts_.size()),
+      dimension_partition_strides_(dimension_partition_counts_.size()) {
+  // Store partitioned outer dimensions from 'shape_'.
+  for (int i = 0; i < dimensions_.size(); ++i) {
+    dimensions_[i] = shape_.layout().minor_to_major(
+        shape_.layout().minor_to_major_size() - 1 - i);
+  }
+
+  // Calculate partition size for each dimension (note that the size of
+  // the last partition in each dimension may be different if the dimension
+  // size is not a multiple of partition size).
+  for (int i = 0; i < dimension_partition_sizes_.size(); ++i) {
+    const int64 dim_size = shape_.dimensions(dimensions_[i]);
+    dimension_partition_sizes_[i] =
+        std::max(1LL, dim_size / dimension_partition_counts_[i]);
+  }
+
+  // Calculate the partition strides for each dimension.
+  dimension_partition_strides_[dimension_partition_strides_.size() - 1] = 1;
+  for (int i = dimension_partition_strides_.size() - 2; i >= 0; --i) {
+    dimension_partition_strides_[i] = dimension_partition_strides_[i + 1] *
+                                      dimension_partition_counts_[i + 1];
+  }
+}
+
+std::vector<std::pair<int64, int64>> ShapePartitionIterator::GetPartition(
+    int64 index) const {
+  // Calculate and return the partition for 'index'.
+  // Returns for each dimension: (partition_start, partition_size).
+  std::vector<std::pair<int64, int64>> partition(dimensions_.size());
+  for (int64 i = 0; i < partition.size(); ++i) {
+    // Calculate the index for dimension 'i'.
+    const int64 partition_index = index / dimension_partition_strides_[i];
+    // Calculate dimension partition start at 'partition_index'.
+    partition[i].first = partition_index * dimension_partition_sizes_[i];
+    // Calculate dimension partition size (note that the last partition size
+    // may be adjusted if dimension size is not a multiple of partition size).
+    if (partition_index == dimension_partition_counts_[i] - 1) {
+      // Last partition in this dimension.
+      partition[i].second =
+          shape_.dimensions(dimensions_[i]) - partition[i].first;
+    } else {
+      partition[i].second = dimension_partition_sizes_[i];
+    }
+    CHECK_GT(partition[i].second, 0);
+    // Update index to remove conribution from current dimension.
+    index -= partition_index * dimension_partition_strides_[i];
+  }
+  return partition;
+}
+
+int64 ShapePartitionIterator::GetTotalPartitionCount() const {
+  return ShapePartitionAssigner::GetTotalPartitionCount(
+      dimension_partition_counts_);
+}
+
+}  // namespace cpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition.h b/tensorflow/compiler/xla/service/cpu/shape_partition.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a2d00421cfdc8e41ec48698a16665621de16bda
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/shape_partition.h
@@ -0,0 +1,105 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_SHAPE_PARTITION_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_SHAPE_PARTITION_H_
+
+#include <vector>
+
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace xla {
+namespace cpu {
+
+// ShapePartitionAssigner partitions the most-major dimensions of 'shape' such
+// that the total partition count <= 'target_partition_count'.
+//
+// Example 1:
+//
+//   Let 'shape' = [8, 16, 32] and 'target_partition_count' = 6.
+//
+//   Because the most-major dimension size is <= 'target_partition_count', we
+//   can generate our target number of partitions by partition the most-major
+//   dimensions.
+//
+//   This will result in the following partitions of the most-major dimension:
+//
+//     [0, 1), [1, 2), [2, 3), [3, 4), [4, 5) [5, 8)
+//
+//   Note that the last partition has residule because the dimension size is
+//   not a multiple of the partition count.
+//
+//
+// Example 2:
+//
+//   Let 'shape' = [8, 16, 32] and 'target_partition_count' = 16.
+//
+//   Because the most-major dimension only has size 8, we must also partition
+//   the next most-major dimension to generate the target of 16 partitions.
+//   We factor 'target_partition_count' by the number of most-major dimensions
+//   we need to partition, to get a per-dimension target partition count:
+//
+//     target_dimension_partition_count = 16 ^ (1 / 2) == 4
+//
+//   This will result in the following partitions of the most-major dimension:
+//
+//     [0, 2), [2, 4), [4, 6), [6, 8)
+//
+//   This will result in the following partitions of the second most-major
+//   dimension:
+//
+//     [0, 4), [4, 8), [8, 12), [12, 16)
+//
+class ShapePartitionAssigner {
+ public:
+  ShapePartitionAssigner(const Shape& shape) : shape_(shape) {}
+
+  // Returns dimension partition counts (starting at outer-most dimension).
+  std::vector<int64> Run(int64 target_partition_count);
+
+  // Returns the total partition count based on 'dimension_partition_counts'.
+  static int64 GetTotalPartitionCount(
+      const std::vector<int64>& dimension_partition_counts);
+
+ private:
+  const Shape& shape_;
+};
+
+// ShapePartitionIterator iterates through outer-dimension partitions of
+// 'shape' as specified by 'dimension_partition_counts'.
+class ShapePartitionIterator {
+ public:
+  ShapePartitionIterator(const Shape& shape,
+                         const std::vector<int64>& dimension_partition_counts);
+
+  // Returns a partition [start, size] for each dimension.
+  // Partitions are listed starting from outer-most dimension first.
+  std::vector<std::pair<int64, int64>> GetPartition(int64 index) const;
+
+  int64 GetTotalPartitionCount() const;
+
+ private:
+  const Shape& shape_;
+  const std::vector<int64> dimension_partition_counts_;
+
+  std::vector<int64> dimensions_;
+  std::vector<int64> dimension_partition_sizes_;
+  std::vector<int64> dimension_partition_strides_;
+};
+
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_SHAPE_PARTITION_H_
diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ee0c53fa6d7c41481a53350e57e5844dea2644c1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
@@ -0,0 +1,248 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
+
+#include <algorithm>
+#include <random>
+
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+namespace cpu {
+namespace {
+
+class ShapePartitionAssignerTest : public HloTestBase {
+ protected:
+  typedef std::vector<int64> Vec;
+
+  void RunR2Test(const Shape& shape, const int64 expected_max_partition_count) {
+    ShapePartitionAssigner assigner(shape);
+    // Check all partitions of outer dimension.
+    for (int64 i = 1; i <= expected_max_partition_count; ++i) {
+      EXPECT_TRUE(ContainersEqual(Vec({i}),
+                                  assigner.Run(/*target_partition_count=*/i)));
+    }
+    // Check target_partition_count > outer dimension size.
+    EXPECT_TRUE(ContainersEqual(
+        Vec({expected_max_partition_count}),
+        assigner.Run(
+            /*target_partition_count=*/expected_max_partition_count + 1)));
+  }
+};
+
+TEST_F(ShapePartitionAssignerTest, Shape13WithLayout10) {
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {1, 3}, {1, 0}), 1);
+}
+
+TEST_F(ShapePartitionAssignerTest, Shape31WithLayout01) {
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {3, 1}, {0, 1}), 1);
+}
+
+TEST_F(ShapePartitionAssignerTest, Shape53WithLayout10) {
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {1, 0}), 5);
+}
+
+TEST_F(ShapePartitionAssignerTest, Shape53WithLayout01) {
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {0, 1}), 3);
+}
+
+TEST_F(ShapePartitionAssignerTest, Shape532WithLayout210) {
+  Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 1, 0});
+  ShapePartitionAssigner assigner(shape);
+
+  for (int64 i = 1; i <= 5; ++i) {
+    EXPECT_TRUE(ContainersEqual(Vec({i}), assigner.Run(
+                                              /*target_partition_count=*/i)));
+  }
+
+  EXPECT_TRUE(
+      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/6)));
+  EXPECT_TRUE(
+      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/7)));
+  EXPECT_TRUE(
+      ContainersEqual(Vec({4, 2}), assigner.Run(/*target_partition_count=*/8)));
+  EXPECT_TRUE(
+      ContainersEqual(Vec({3, 3}), assigner.Run(/*target_partition_count=*/9)));
+  EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
+                              assigner.Run(/*target_partition_count=*/10)));
+  EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
+                              assigner.Run(/*target_partition_count=*/11)));
+  EXPECT_TRUE(ContainersEqual(Vec({4, 3}),
+                              assigner.Run(/*target_partition_count=*/12)));
+  EXPECT_TRUE(ContainersEqual(Vec({4, 3}),
+                              assigner.Run(/*target_partition_count=*/13)));
+  EXPECT_TRUE(ContainersEqual(Vec({4, 3}),
+                              assigner.Run(/*target_partition_count=*/14)));
+  EXPECT_TRUE(ContainersEqual(Vec({5, 3}),
+                              assigner.Run(/*target_partition_count=*/15)));
+  EXPECT_TRUE(ContainersEqual(Vec({5, 3}),
+                              assigner.Run(/*target_partition_count=*/16)));
+}
+
+TEST_F(ShapePartitionAssignerTest, Shape532WithLayout201) {
+  Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 0, 1});
+  ShapePartitionAssigner assigner(shape);
+
+  for (int64 i = 1; i <= 3; ++i) {
+    EXPECT_TRUE(ContainersEqual(Vec({i}), assigner.Run(
+                                              /*target_partition_count=*/i)));
+  }
+
+  EXPECT_TRUE(
+      ContainersEqual(Vec({2, 2}), assigner.Run(/*target_partition_count=*/4)));
+  EXPECT_TRUE(
+      ContainersEqual(Vec({2, 2}), assigner.Run(/*target_partition_count=*/5)));
+  EXPECT_TRUE(
+      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/6)));
+  EXPECT_TRUE(
+      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/7)));
+  EXPECT_TRUE(
+      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/8)));
+  EXPECT_TRUE(
+      ContainersEqual(Vec({3, 3}), assigner.Run(/*target_partition_count=*/9)));
+  EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
+                              assigner.Run(/*target_partition_count=*/10)));
+  EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
+                              assigner.Run(/*target_partition_count=*/11)));
+  EXPECT_TRUE(ContainersEqual(Vec({3, 4}),
+                              assigner.Run(/*target_partition_count=*/12)));
+  EXPECT_TRUE(ContainersEqual(Vec({3, 4}),
+                              assigner.Run(/*target_partition_count=*/13)));
+  EXPECT_TRUE(ContainersEqual(Vec({3, 4}),
+                              assigner.Run(/*target_partition_count=*/14)));
+  EXPECT_TRUE(ContainersEqual(Vec({3, 5}),
+                              assigner.Run(/*target_partition_count=*/15)));
+  EXPECT_TRUE(ContainersEqual(Vec({3, 5}),
+                              assigner.Run(/*target_partition_count=*/16)));
+}
+
+class ShapePartitionIteratorTest : public HloTestBase {
+ protected:
+  typedef std::vector<std::pair<int64, int64>> Partition;
+};
+
+TEST_F(ShapePartitionIteratorTest, Shape53WithLayout10) {
+  Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {1, 0});
+
+  {
+    ShapePartitionIterator iterator(shape, {1});
+    EXPECT_EQ(1, iterator.GetTotalPartitionCount());
+    EXPECT_TRUE(ContainersEqual(Partition({{0, 5}}), iterator.GetPartition(0)));
+  }
+
+  {
+    ShapePartitionIterator iterator(shape, {2});
+    EXPECT_EQ(2, iterator.GetTotalPartitionCount());
+    EXPECT_TRUE(ContainersEqual(Partition({{0, 2}}), iterator.GetPartition(0)));
+    EXPECT_TRUE(ContainersEqual(Partition({{2, 3}}), iterator.GetPartition(1)));
+  }
+
+  {
+    ShapePartitionIterator iterator(shape, {3});
+    EXPECT_EQ(3, iterator.GetTotalPartitionCount());
+    EXPECT_TRUE(ContainersEqual(Partition({{0, 1}}), iterator.GetPartition(0)));
+    EXPECT_TRUE(ContainersEqual(Partition({{1, 1}}), iterator.GetPartition(1)));
+    EXPECT_TRUE(ContainersEqual(Partition({{2, 3}}), iterator.GetPartition(2)));
+  }
+}
+
+TEST_F(ShapePartitionIteratorTest, Shape532WithLayout210) {
+  Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 1, 0});
+
+  {
+    ShapePartitionIterator iterator(shape, {1, 1});
+    EXPECT_EQ(1, iterator.GetTotalPartitionCount());
+    EXPECT_TRUE(
+        ContainersEqual(Partition({{0, 5}, {0, 3}}), iterator.GetPartition(0)));
+  }
+
+  {
+    ShapePartitionIterator iterator(shape, {2, 2});
+    EXPECT_EQ(4, iterator.GetTotalPartitionCount());
+    EXPECT_TRUE(
+        ContainersEqual(Partition({{0, 2}, {0, 1}}), iterator.GetPartition(0)));
+    EXPECT_TRUE(
+        ContainersEqual(Partition({{0, 2}, {1, 2}}), iterator.GetPartition(1)));
+    EXPECT_TRUE(
+        ContainersEqual(Partition({{2, 3}, {0, 1}}), iterator.GetPartition(2)));
+    EXPECT_TRUE(
+        ContainersEqual(Partition({{2, 3}, {1, 2}}), iterator.GetPartition(3)));
+  }
+}
+
+class RandomShapePartitionIteratorTest : public HloTestBase {
+ protected:
+  typedef std::vector<std::pair<int64, int64>> Partition;
+  RandomShapePartitionIteratorTest()
+      : generator_(rd_()), distribution_(1, 10) {}
+
+  std::vector<int64> RandR4Dims() { return {Rand(), Rand(), Rand(), Rand()}; }
+
+  int64 Rand() { return distribution_(generator_); }
+
+  std::random_device rd_;
+  std::mt19937 generator_;
+  std::uniform_int_distribution<int> distribution_;
+};
+
+TEST_F(RandomShapePartitionIteratorTest, RandomShapeAndPartitions) {
+  // Choose random dimensions for R4 shape.
+  Shape shape = ShapeUtil::MakeShapeWithLayout(F32, RandR4Dims(), {3, 2, 1, 0});
+  // Choose random number of outer dimensions to partition.
+  const int num_outer_dims_to_partition = 1 + (Rand() % 3);
+  // Choose random outer dimension partition counts.
+  std::vector<int64> dim_sizes(num_outer_dims_to_partition);
+  std::vector<int64> dim_partition_counts(num_outer_dims_to_partition);
+  int64 total_dim_size = 1;
+  for (int i = 0; i < num_outer_dims_to_partition; ++i) {
+    const int64 dimension = shape.layout().minor_to_major(
+        shape.layout().minor_to_major_size() - 1 - i);
+    dim_sizes[i] = shape.dimensions(dimension);
+    total_dim_size *= dim_sizes[i];
+    // Choose dimension partition count in [1, dim_size]
+    const int64 dim_partition_count = 1 + Rand() % dim_sizes[i];
+    dim_partition_counts[i] = dim_partition_count;
+  }
+  // Iterate through all partition: for each partition record covered
+  // index ranges by dimension.
+  std::vector<std::map<int64, int64>> ranges(num_outer_dims_to_partition);
+  ShapePartitionIterator partition_iterator(shape, dim_partition_counts);
+  const int64 partition_count = partition_iterator.GetTotalPartitionCount();
+  for (int64 i = 0; i < partition_count; ++i) {
+    const auto& dim_partition = partition_iterator.GetPartition(i);
+    for (int dim = 0; dim < dim_partition.size(); ++dim) {
+      ranges[dim].insert(
+          std::make_pair(dim_partition[dim].first,
+                         dim_partition[dim].first + dim_partition[dim].second));
+    }
+  }
+  // Check that partitions cover entire dimension size range (for each
+  // partitioned dimension).
+  for (int i = 0; i < ranges.size(); ++i) {
+    int64 expected_index = 0;
+    for (auto& r : ranges[i]) {
+      EXPECT_EQ(expected_index, r.first);
+      expected_index = r.second;
+    }
+    EXPECT_EQ(expected_index, dim_sizes[i]);
+  }
+}
+
+}  // namespace
+}  // namespace cpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 7c74912a7ab9c388c9911fe8194f268623f0abd1..262c471b4079b92daee98095b2fa61834cb2f243 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -21,11 +21,12 @@ limitations under the License.
 #include <list>
 #include <utility>
 
+#include "external/llvm/include/llvm/ExecutionEngine/ExecutionEngine.h"
+#include "external/llvm/include/llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "external/llvm/include/llvm/IR/Mangler.h"
 #include "external/llvm/include/llvm/Support/CodeGen.h"
 #include "external/llvm/include/llvm/Support/Host.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
-#include "tensorflow/compiler/xla/service/cpu/compiler_functor.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h"
@@ -41,7 +42,7 @@ namespace cpu {
 namespace {
 
 // Converts a symbol 'name' into the form expected by dlsym().
-std::string CanonicalizeSymbol(const std::string &name) {
+std::string CanonicalizeSymbol(const std::string& name) {
 #if defined(__APPLE__)
   // On Mac OS X, dlsym() expects names not to be prefixed with a leading
   // underscore.
@@ -52,47 +53,77 @@ std::string CanonicalizeSymbol(const std::string &name) {
   return name;
 }
 
+class JITSymbolTable {
+ public:
+  JITSymbolTable() { Populate(); }
+
+  void* Lookup(llvm::StringRef jit_symbol_name) const {
+    auto it = jit_symbol_table_.find(jit_symbol_name);
+    return it == jit_symbol_table_.end() ? nullptr : it->getValue();
+  }
+
+  static bool MustBeInTable(llvm::StringRef name) {
+    // In particular, names starting with
+    // runtime::kXlaCpuRuntimeSymbolNamePrefix should not be dlsym'ed.
+    return name.startswith(runtime::kXlaCpuRuntimeSymbolNamePrefix);
+  }
+
+ private:
+  void AddJITSymbolToTable(llvm::StringRef jit_symbol_name,
+                           llvm::StringRef cpp_symbol_name,
+                           void* jit_symbol_value) {
+    // The JIT symbol name and the C++ symbol name (with an extern "C" linkage)
+    // need to match, otherwise AOT links will fail.
+    CHECK(jit_symbol_name == cpp_symbol_name);
+    CHECK(jit_symbol_table_.insert({jit_symbol_name, jit_symbol_value}).second);
+  }
+
+  void Populate() {
+#define ADD_JIT_SYMBOL_TO_TABLE(base_name)                       \
+  do {                                                           \
+    AddJITSymbolToTable(                                         \
+        xla::cpu::runtime::k##base_name##SymbolName,             \
+        "__xla_cpu_runtime_" #base_name,                         \
+        reinterpret_cast<void*>(__xla_cpu_runtime_##base_name)); \
+  } while (false)
+
+    ADD_JIT_SYMBOL_TO_TABLE(AcquireInfeedBufferForDequeue);
+    ADD_JIT_SYMBOL_TO_TABLE(ReleaseInfeedBufferAfterDequeue);
+    ADD_JIT_SYMBOL_TO_TABLE(AcquireOutfeedBufferForPopulation);
+    ADD_JIT_SYMBOL_TO_TABLE(ReleaseOutfeedBufferAfterPopulation);
+    ADD_JIT_SYMBOL_TO_TABLE(ExpV8F32);
+    ADD_JIT_SYMBOL_TO_TABLE(LogV8F32);
+    ADD_JIT_SYMBOL_TO_TABLE(TanhV8F32);
+    ADD_JIT_SYMBOL_TO_TABLE(ExpV4F32);
+    ADD_JIT_SYMBOL_TO_TABLE(LogV4F32);
+    ADD_JIT_SYMBOL_TO_TABLE(TanhV4F32);
+    ADD_JIT_SYMBOL_TO_TABLE(EigenConvF32);
+    ADD_JIT_SYMBOL_TO_TABLE(EigenMatMulF32);
+    ADD_JIT_SYMBOL_TO_TABLE(EigenMatMulF64);
+    ADD_JIT_SYMBOL_TO_TABLE(EigenSingleThreadedConvF32);
+    ADD_JIT_SYMBOL_TO_TABLE(EigenSingleThreadedMatMulF32);
+    ADD_JIT_SYMBOL_TO_TABLE(EigenSingleThreadedMatMulF64);
+
+#undef ADD_JIT_SYMBOL_TO_TABLE
+  }
+
+  llvm::StringMap<void*> jit_symbol_table_;
+};
+
+const JITSymbolTable& GetJITSymbolTable() {
+  static JITSymbolTable* symbol_table = new JITSymbolTable;
+  return *symbol_table;
+}
+
 // A simple SymbolResolver that delegates to the host dynamic linker.
 struct SimpleResolver : public llvm::JITSymbolResolver {
-  llvm::JITSymbol findSymbol(const std::string &name) override {
-    void *func_addr = nullptr;
-
+  llvm::JITSymbol findSymbol(const std::string& name) override {
     std::string canonical_name = CanonicalizeSymbol(name);
-    if (canonical_name == runtime::kEigenMatmulF32SymbolName) {
-      func_addr = reinterpret_cast<void *>(__xla_cpu_runtime_EigenMatMulF32);
-    } else if (canonical_name ==
-               runtime::kEigenSingleThreadedMatmulF32SymbolName) {
-      func_addr = reinterpret_cast<void *>(
-          __xla_cpu_runtime_EigenSingleThreadedMatMulF32);
-    } else if (canonical_name == runtime::kEigenConvF32SymbolName) {
-      func_addr = reinterpret_cast<void *>(__xla_cpu_runtime_EigenConvF32);
-    } else if (canonical_name ==
-               runtime::kEigenSingleThreadedConvF32SymbolName) {
-      func_addr = reinterpret_cast<void *>(
-          __xla_cpu_runtime_EigenSingleThreadedConvF32);
-    } else if (canonical_name ==
-               runtime::kAcquireInfeedBufferForDequeueSymbolName) {
-      func_addr = reinterpret_cast<void *>(
-          __xla_cpu_runtime_AcquireInfeedBufferForDequeue);
-    } else if (canonical_name ==
-               runtime::kReleaseInfeedBufferAfterDequeueSymbolName) {
-      func_addr = reinterpret_cast<void *>(
-          __xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue);
-    } else if (canonical_name == runtime::kExpV4F32) {
-      func_addr = reinterpret_cast<void *>(runtime::ExpV4F32);
-    } else if (canonical_name == runtime::kExpV8F32) {
-      func_addr = reinterpret_cast<void *>(runtime::ExpV8F32);
-    } else if (canonical_name == runtime::kLogV4F32) {
-      func_addr = reinterpret_cast<void *>(runtime::LogV4F32);
-    } else if (canonical_name == runtime::kLogV8F32) {
-      func_addr = reinterpret_cast<void *>(runtime::LogV8F32);
-    } else if (canonical_name == runtime::kTanhV4F32) {
-      func_addr = reinterpret_cast<void *>(runtime::TanhV4F32);
-    } else if (canonical_name == runtime::kTanhV8F32) {
-      func_addr = reinterpret_cast<void *>(runtime::TanhV8F32);
-    } else {
-      func_addr = dlsym(RTLD_DEFAULT, canonical_name.c_str());
-    }
+    const JITSymbolTable& jit_symbol_table = GetJITSymbolTable();
+
+    void* func_addr = JITSymbolTable::MustBeInTable(canonical_name)
+                          ? jit_symbol_table.Lookup(canonical_name)
+                          : dlsym(RTLD_DEFAULT, canonical_name.c_str());
 
     if (func_addr == nullptr) {
       return nullptr;
@@ -101,7 +132,7 @@ struct SimpleResolver : public llvm::JITSymbolResolver {
                                          llvm::JITSymbolFlags::None);
     return symbol_info;
   }
-  llvm::JITSymbol findSymbolInLogicalDylib(const std::string &name) override {
+  llvm::JITSymbol findSymbolInLogicalDylib(const std::string& name) override {
     return nullptr;
   }
 };
@@ -110,7 +141,7 @@ llvm::SmallVector<std::string, 0> DetectMachineAttributes() {
   llvm::SmallVector<std::string, 0> result;
   llvm::StringMap<bool> host_features;
   if (llvm::sys::getHostCPUFeatures(host_features)) {
-    for (auto &feature : host_features) {
+    for (auto& feature : host_features) {
       if (feature.second) {
         llvm::StringRef feature_name = feature.first();
         // Skip avx512 for now, it isn't quite ready in LLVM.
@@ -133,15 +164,17 @@ llvm::StringRef GetHostCpuName() {
 
 CompilerFunctor::VectorIntrinsics GetAvailableIntrinsics() {
   CompilerFunctor::VectorIntrinsics intrinsics;
-  intrinsics.sse_intrinsics = (&runtime::ExpV4F32 != nullptr);
-  intrinsics.avx_intrinsics = (&runtime::ExpV8F32 != nullptr);
+  intrinsics.sse_intrinsics = (&__xla_cpu_runtime_ExpV4F32 != nullptr);
+  intrinsics.avx_intrinsics = (&__xla_cpu_runtime_ExpV8F32 != nullptr);
   return intrinsics;
 }
 
 }  // namespace
 
-SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions &target_options,
-                           llvm::CodeGenOpt::Level opt_level)
+SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
+                           llvm::CodeGenOpt::Level opt_level,
+                           CompilerFunctor::ModuleHook pre_optimization_hook,
+                           CompilerFunctor::ModuleHook post_optimization_hook)
     : target_machine_(
           CHECK_NOTNULL(llvm::EngineBuilder()
                             .setTargetOptions(target_options)
@@ -152,33 +185,33 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions &target_options,
                                 /*MAttrs=*/DetectMachineAttributes()))),
       disassembler_(*target_machine_),
       data_layout_(target_machine_->createDataLayout()),
+      object_layer_(
+          [] { return std::make_shared<llvm::SectionMemoryManager>(); }),
       compile_layer_(object_layer_,
                      CompilerFunctor(target_machine_.get(), &disassembler_,
-                                     opt_level, GetAvailableIntrinsics())) {
+                                     opt_level, GetAvailableIntrinsics(),
+                                     std::move(pre_optimization_hook),
+                                     std::move(post_optimization_hook))) {
   VLOG(1) << "CPU target: " << target_machine_->getTargetCPU().str()
           << " features: " << target_machine_->getTargetFeatureString().str();
 }
 
 SimpleOrcJIT::ModuleHandleT SimpleOrcJIT::AddModule(
     std::unique_ptr<llvm::Module> module) {
-  // The Orc API adds a whole iterable "set" of modules, so we wrap the module
-  // in a vector.
-  std::vector<std::unique_ptr<llvm::Module>> module_set;
-  module_set.push_back(std::move(module));
-  auto handle = compile_layer_.addModuleSet(
-      std::move(module_set), MakeUnique<llvm::SectionMemoryManager>(),
-      MakeUnique<SimpleResolver>());
+  auto handle = cantFail(compile_layer_.addModule(
+      std::move(module), MakeUnique<SimpleResolver>()));
   module_handles_.push_back(handle);
   return handle;
 }
 
 void SimpleOrcJIT::RemoveModule(SimpleOrcJIT::ModuleHandleT handle) {
   module_handles_.erase(
-      std::remove(module_handles_.begin(), module_handles_.end(), handle));
-  compile_layer_.removeModuleSet(handle);
+      std::remove(module_handles_.begin(), module_handles_.end(), handle),
+      module_handles_.end());
+  cantFail(compile_layer_.removeModule(handle));
 }
 
-llvm::JITSymbol SimpleOrcJIT::FindSymbol(const std::string &name) {
+llvm::JITSymbol SimpleOrcJIT::FindSymbol(const std::string& name) {
   std::string mangled_name;
   {
     llvm::raw_string_ostream mangled_name_stream(mangled_name);
@@ -187,7 +220,7 @@ llvm::JITSymbol SimpleOrcJIT::FindSymbol(const std::string &name) {
 
   // Resolve symbol from last module to first, allowing later redefinitions of
   // symbols shadow earlier ones.
-  for (auto &handle :
+  for (auto& handle :
        llvm::make_range(module_handles_.rbegin(), module_handles_.rend())) {
     if (auto symbol =
             compile_layer_.findSymbolIn(handle, mangled_name,
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
index 4d8653484a037a345321dbe11c384f650e0142d0..f57049c9dde23c7ac540d22dde0b7be9be38e2e0 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "external/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "external/llvm/include/llvm/IR/Module.h"
 #include "external/llvm/include/llvm/Target/TargetMachine.h"
+#include "tensorflow/compiler/xla/service/cpu/compiler_functor.h"
 #include "tensorflow/compiler/xla/service/cpu/disassembler.h"
 #include "tensorflow/compiler/xla/types.h"
 
@@ -41,9 +42,12 @@ namespace cpu {
 // it's added to the JIT.
 class SimpleOrcJIT {
  public:
-  using ObjLayerT = llvm::orc::RTDyldObjectLinkingLayer<>;
-  using CompileLayerT = llvm::orc::IRCompileLayer<ObjLayerT>;
-  using ModuleHandleT = CompileLayerT::ModuleSetHandleT;
+  using ObjLayerT = llvm::orc::RTDyldObjectLinkingLayer;
+  using CompileFtor =
+      std::function<llvm::object::OwningBinary<llvm::object::ObjectFile>(
+          llvm::Module&)>;
+  using CompileLayerT = llvm::orc::IRCompileLayer<ObjLayerT, CompileFtor>;
+  using ModuleHandleT = CompileLayerT::ModuleHandleT;
 
   // Create a new JIT, targeting the host architecture.
   // The |target_options| parameter allows customization of certain code
@@ -51,8 +55,14 @@ class SimpleOrcJIT {
   // can be reassociated, etc.).
   // The |opt_level| parameter controls the optimization level of the code
   // generator.
+  // The |pre_optimization_hook| is invoked on the module before any IR
+  // level optimizations are applied.
+  // The |post_optimization_hook| is invoked on the module after all IR
+  // level optimizations are applied.
   SimpleOrcJIT(const llvm::TargetOptions& target_options,
-               llvm::CodeGenOpt::Level opt_level);
+               llvm::CodeGenOpt::Level opt_level,
+               CompilerFunctor::ModuleHook pre_optimization_hook,
+               CompilerFunctor::ModuleHook post_optimization_hook);
 
   // Data layout this JIT was created with.
   const llvm::DataLayout& data_layout() const { return data_layout_; }
@@ -73,6 +83,8 @@ class SimpleOrcJIT {
   // nullptr if the symbol cannot be found.
   llvm::JITSymbol FindSymbol(const std::string& name);
 
+  llvm::TargetMachine* target_machine() const { return target_machine_.get(); }
+
  private:
   std::vector<ModuleHandleT> module_handles_;
   std::unique_ptr<llvm::TargetMachine> target_machine_;
diff --git a/tensorflow/compiler/xla/service/cpu/infeed_manager.cc b/tensorflow/compiler/xla/service/cpu/xfeed_manager.cc
similarity index 56%
rename from tensorflow/compiler/xla/service/cpu/infeed_manager.cc
rename to tensorflow/compiler/xla/service/cpu/xfeed_manager.cc
index 14c882a06ee9fdfc66f3d6db55146431634dd85e..2160c3cd01df0b359f986d5e5ba16c71e2676584 100644
--- a/tensorflow/compiler/xla/service/cpu/infeed_manager.cc
+++ b/tensorflow/compiler/xla/service/cpu/xfeed_manager.cc
@@ -13,32 +13,37 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/cpu/infeed_manager.h"
+#include "tensorflow/compiler/xla/service/cpu/xfeed_manager.h"
 
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 namespace cpu {
 namespace runtime {
 
-InfeedBuffer::~InfeedBuffer() = default;
-
-InfeedManager::InfeedManager() : current_buffer_(nullptr) {}
+void XfeedManager::Reset() {
+  infeed()->Reset();
+  outfeed()->Reset();
+}
 
-void InfeedManager::Reset() {
+void XfeedQueueManager::Reset() {
   tensorflow::mutex_lock l(mu_);
-  CHECK(!current_buffer_);
-  for (auto buffer : enqueued_buffer_) {
-    buffer->Done();
+  CHECK(current_buffer_ == nullptr);
+  for (auto buffer : enqueued_buffers_) {
+    buffer->Done(ShapeUtil::MakeNil());
   }
-  enqueued_buffer_.clear();
+  enqueued_buffers_.clear();
 }
 
-void InfeedManager::EnqueueBuffer(InfeedBuffer* buffer) {
+void XfeedQueueManager::EnqueueBuffers(
+    tensorflow::gtl::ArraySlice<XfeedBuffer*> buffers) {
   tensorflow::mutex_lock l(mu_);
-  bool was_empty = enqueued_buffer_.empty();
-  enqueued_buffer_.push_back(buffer);
-  if (was_empty) {
+  bool was_empty = enqueued_buffers_.empty();
+  for (XfeedBuffer* b : buffers) {
+    enqueued_buffers_.push_back(b);
+  }
+  if (was_empty && !buffers.empty()) {
     // This has the potential to suffer from the notified thread
     // immediately trying and failing to acquire mu_, but seems
     // preferable to the alternative of notifying outside the lock
@@ -47,23 +52,24 @@ void InfeedManager::EnqueueBuffer(InfeedBuffer* buffer) {
   }
 }
 
-InfeedBuffer* InfeedManager::BlockingDequeueBuffer() {
+XfeedBuffer* XfeedQueueManager::BlockingDequeueBuffer() {
   tensorflow::mutex_lock l(mu_);
-  while (enqueued_buffer_.empty()) {
+  while (enqueued_buffers_.empty()) {
     cv_.wait(l);
   }
-  CHECK(!current_buffer_);
-  current_buffer_ = enqueued_buffer_.front();
-  enqueued_buffer_.pop_front();
+  CHECK(current_buffer_ == nullptr);
+  current_buffer_ = enqueued_buffers_.front();
+  enqueued_buffers_.pop_front();
   return current_buffer_;
 }
 
-void InfeedManager::ReleaseCurrentBuffer(int32 length, void* data) {
+void XfeedQueueManager::ReleaseCurrentBuffer(int32 length, void* data,
+                                             StatusOr<Shape> shape) {
   tensorflow::mutex_lock l(mu_);
-  CHECK(current_buffer_);
+  CHECK(current_buffer_ != nullptr);
   CHECK_EQ(length, current_buffer_->length());
   CHECK_EQ(data, current_buffer_->data());
-  current_buffer_->Done();
+  current_buffer_->Done(std::move(shape));
   current_buffer_ = nullptr;
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/infeed_manager.h b/tensorflow/compiler/xla/service/cpu/xfeed_manager.h
similarity index 50%
rename from tensorflow/compiler/xla/service/cpu/infeed_manager.h
rename to tensorflow/compiler/xla/service/cpu/xfeed_manager.h
index 77472746e659b2ddbd9b54a036775ebdd0084fdd..86af789384e0a926b2e469daac68b6e1521875bc 100644
--- a/tensorflow/compiler/xla/service/cpu/infeed_manager.h
+++ b/tensorflow/compiler/xla/service/cpu/xfeed_manager.h
@@ -17,12 +17,15 @@ limitations under the License.
 // is used by the CPU runtime to transfer buffers into an executing
 // CPU computation, e.g., to feed data into a while loop.
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_INFEED_MANAGER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_INFEED_MANAGER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_XFEED_MANAGER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_XFEED_MANAGER_H_
 
 #include <deque>
 
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/mutex.h"
 
 namespace xla {
@@ -31,62 +34,89 @@ namespace runtime {
 
 // Abstract class defining an infeed buffer that is passed to the
 // runtime by a client. The client manages the storage of the buffer.
-class InfeedBuffer {
+class XfeedBuffer {
  public:
-  virtual ~InfeedBuffer();
+  virtual ~XfeedBuffer() = default;
 
   virtual int32 length() = 0;
   virtual void* data() = 0;
-  virtual void Done() = 0;
+
+  // The 'shape' parameter reflects what shape the embedded program was
+  // expecting / producing with respect to this XfeedBuffer. E.g. this will
+  // contain information about the layout of an outfed buffer.
+  virtual void Done(StatusOr<Shape> shape) = 0;
 };
 
-// Client-side class used to enqueue infeed buffers.
-class InfeedManager {
+// Reusable component for managing the infeed and outfeed queue state.
+class XfeedQueueManager {
  public:
-  InfeedManager();
+  XfeedQueueManager() = default;
 
   // Calls the completion callback for any enqueued buffers that have
-  // not been dequeued by the runtime, and empties the infeed
+  // not been dequeued by the runtime, and empties the
   // queue. Reset may not be called while a runtime computation is
   // processing a dequeued buffer. The only safe way to ensure this
   // condition is to call Reset when no computation is taking place.
   void Reset();
 
-  // Adds buffer to the infeed queue. buffer->Done will be called when
-  // the buffer will no longer be accessed by the InfeedManager,
-  // either as a result of a call to Reset or because the runtime has
-  // dequeued and used the buffer.
-  void EnqueueBuffer(InfeedBuffer* buffer);
-
-  // Blocks until the infeed queue is non-empty, then returns the
-  // buffer at the head of the queue. Sets the current buffer to be
-  // the returned buffer. It is an error to call BlockingDequeueBuffer
-  // if there is an unreleased current buffer, i.e.,
-  // ReleaseCurrentBuffer must be called between calls to
+  // Adds a sequence of buffers to the queue atomically. buffer->Done will be
+  // called when the buffer will no longer be accessed by the XfeedManager,
+  // either as a result of a call to Reset or because the runtime has dequeued
+  // and used the buffer.
+  void EnqueueBuffers(tensorflow::gtl::ArraySlice<XfeedBuffer*> buffers);
+
+  // Blocks until the queue is non-empty, then returns the buffer at the head of
+  // the queue. Sets the current buffer to be the returned buffer. It is an
+  // error to call BlockingDequeueBuffer if there is an unreleased current
+  // buffer, i.e., ReleaseCurrentBuffer must be called between calls to
   // BlockingDequeueBuffer.
-  InfeedBuffer* BlockingDequeueBuffer();
+  XfeedBuffer* BlockingDequeueBuffer();
 
   // Releases the current buffer, which is the last buffer returned by
   // BlockingDequeuBuffer and not yet released. length and data must
   // match the buffer->length() and buffer->data() for the current
   // buffer.
-  void ReleaseCurrentBuffer(int32 length, void* data);
+  //
+  // 'shape' communicates the shape of the buffer being released. If the program
+  // passed a value that could not be decoded as a shape, 'shape' will be an
+  // error status. In the case of outfeed, this indicates the layout of the
+  // shape that has been outfed. In the case of infeed, this can be used for
+  // sanity checking purposes.
+  void ReleaseCurrentBuffer(int32 length, void* data, StatusOr<Shape> shape);
 
  private:
   tensorflow::mutex mu_;
+
   // Condition variable that is signaled every time a buffer is
   // enqueued to an empty queue.
   tensorflow::condition_variable cv_;
-  // InfeedBuffer* queue contents are not owned, but buffer->Done must
+
+  // XfeedBuffer* queue contents are not owned, but buffer->Done must
   // be called when the buffer is no longer needed by the runtime.
-  std::deque<InfeedBuffer*> enqueued_buffer_;
+  std::deque<XfeedBuffer*> enqueued_buffers_;
+
   // If non-NULL, the buffer that is currently being processed by the
   // runtime. Not owned.
-  InfeedBuffer* current_buffer_;
+  XfeedBuffer* current_buffer_ = nullptr;
+};
+
+// Client-side class used to enqueue infeed buffers.
+class XfeedManager {
+ public:
+  XfeedManager() = default;
+
+  void Reset();
+
+  XfeedQueueManager* infeed() { return &infeed_; }
+  XfeedQueueManager* outfeed() { return &outfeed_; }
+
+ private:
+  XfeedQueueManager infeed_;
+  XfeedQueueManager outfeed_;
 };
 
 }  // namespace runtime
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_INFEED_MANAGER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_XFEED_MANAGER_H_
diff --git a/tensorflow/compiler/xla/service/cpu/xfeed_manager_test.cc b/tensorflow/compiler/xla/service/cpu/xfeed_manager_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8defd28b013512a0b6ace0c23dff4a38fe505385
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/xfeed_manager_test.cc
@@ -0,0 +1,140 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/xfeed_manager.h"
+
+#include <memory>
+
+#include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+class InfeedManagerTest : public ::testing::Test {};
+
+class TestInfeedBuffer : public cpu::runtime::XfeedBuffer {
+ public:
+  explicit TestInfeedBuffer(int32 length, bool expect_shape_match = true)
+      : shape_(ShapeUtil::MakeShape(U8, {length})),
+        done_called_(false),
+        length_(length),
+        expect_shape_match_(expect_shape_match) {}
+  ~TestInfeedBuffer() override { EXPECT_TRUE(done_called_); }
+
+  int32 length() override { return length_; }
+  void* data() override { return nullptr; }
+  void Done(StatusOr<Shape> shape) override {
+    CHECK(!done_called_);
+    done_called_ = true;
+    TF_ASSERT_OK(shape.status());
+    EXPECT_EQ(expect_shape_match_, ShapeUtil::Equal(shape_, shape.ValueOrDie()))
+        << "want " << ShapeUtil::HumanString(shape_) << " "
+        << (expect_shape_match_ ? "==" : "!=") << " "
+        << ShapeUtil::HumanString(shape.ValueOrDie());
+  }
+
+  const Shape& shape() const { return shape_; }
+
+ private:
+  Shape shape_;
+  bool done_called_;
+  int32 length_;
+  bool expect_shape_match_;
+};
+
+// Performs the acquire/release sequence on the infeed, as the generated CPU
+// code would in the process of executing the infeed operation.
+void ProcessNextBuffer(int32 length) {
+  auto shape = ShapeUtil::MakeShape(U8, {length});
+  string bytes = shape.SerializeAsString();
+  void* buffer = __xla_cpu_runtime_AcquireInfeedBufferForDequeue(
+      length, bytes.data(), bytes.size());
+  __xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue(length, buffer,
+                                                    bytes.data(), bytes.size());
+}
+
+// Performs the acquire/release sequence on the outfeed, as the generated CPU
+// code would in the process of executing the outfeed operation.
+void ProcessNextOutfeedBuffer(int32 length, const Shape& shape) {
+  string bytes = shape.SerializeAsString();
+  void* buffer = __xla_cpu_runtime_AcquireOutfeedBufferForPopulation(
+      length, bytes.data(), bytes.size());
+  __xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation(
+      length, buffer, bytes.data(), bytes.size());
+}
+
+TEST_F(InfeedManagerTest, SingleThreadedSequential) {
+  TestInfeedBuffer* a = new TestInfeedBuffer(64);
+  TestInfeedBuffer* b = new TestInfeedBuffer(32);
+
+  cpu::runtime::XfeedManager* xfeed = cpu::runtime::GetXfeedManager();
+
+  xfeed->infeed()->EnqueueBuffers({a});
+  xfeed->infeed()->EnqueueBuffers({b});
+  ProcessNextBuffer(a->length());
+  ProcessNextBuffer(b->length());
+}
+
+TEST_F(InfeedManagerTest, SingleThreadedInterleaved) {
+  TestInfeedBuffer* a = new TestInfeedBuffer(64);
+  TestInfeedBuffer* b = new TestInfeedBuffer(32);
+
+  cpu::runtime::XfeedManager* xfeed = cpu::runtime::GetXfeedManager();
+
+  xfeed->infeed()->EnqueueBuffers({a});
+  ProcessNextBuffer(a->length());
+  xfeed->infeed()->EnqueueBuffers({b});
+  ProcessNextBuffer(b->length());
+}
+
+TEST_F(InfeedManagerTest, MultiThreaded) {
+  tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), "test", 2);
+
+  cpu::runtime::XfeedManager* xfeed = cpu::runtime::GetXfeedManager();
+
+  const int32 length = 64;
+
+  pool.Schedule([xfeed]() {
+    // Spin for 100 milliseconds
+    int64 start_micros = tensorflow::Env::Default()->NowMicros();
+    while (true) {
+      int64 end_micros = tensorflow::Env::Default()->NowMicros();
+      if ((end_micros - start_micros) >= 100000) {  // 100 ms
+        break;
+      }
+    }
+    TestInfeedBuffer* a = new TestInfeedBuffer(length);
+    xfeed->infeed()->EnqueueBuffers({a});
+  });
+
+  ProcessNextBuffer(length);
+}
+
+TEST_F(InfeedManagerTest, OutfeedWrongShape) {
+  TestInfeedBuffer* b = new TestInfeedBuffer(32, /*expect_shape_match=*/false);
+  cpu::runtime::XfeedManager* xfeed = cpu::runtime::GetXfeedManager();
+  xfeed->outfeed()->EnqueueBuffers({b});
+
+  ProcessNextOutfeedBuffer(32, ShapeUtil::MakeShape(U8, {33}));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu_transfer_manager.cc
index 2d9d9c7de62a34e4d18ef1d7f5552a85ad1c49cb..d8a76443a66c9234aa0b93d1d21e213fd3ba67ab 100644
--- a/tensorflow/compiler/xla/service/cpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/cpu_transfer_manager.cc
@@ -21,15 +21,17 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
-#include "tensorflow/compiler/xla/service/cpu/infeed_manager.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/notification.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
 namespace se = ::perftools::gputools;
@@ -38,7 +40,7 @@ namespace xla {
 
 namespace {
 
-class CpuInfeedBuffer : public cpu::runtime::InfeedBuffer {
+class CpuInfeedBuffer : public cpu::runtime::XfeedBuffer {
  public:
   explicit CpuInfeedBuffer(int32 length)
       : length_(length),
@@ -48,7 +50,7 @@ class CpuInfeedBuffer : public cpu::runtime::InfeedBuffer {
 
   int32 length() override { return length_; }
   void* data() override { return buffer_; }
-  void Done() override { delete this; }
+  void Done(StatusOr<Shape> /*shape*/) override { delete this; }
 
   se::DeviceMemoryBase* device_memory() { return &device_memory_; }
 
@@ -58,6 +60,30 @@ class CpuInfeedBuffer : public cpu::runtime::InfeedBuffer {
   se::DeviceMemoryBase device_memory_;
 };
 
+class CpuOutfeedBuffer : public cpu::runtime::XfeedBuffer {
+ public:
+  CpuOutfeedBuffer(void* destination, int32 length)
+      : destination_(destination), length_(length) {}
+
+  StatusOr<Shape> WaitForNotification() {
+    done_.WaitForNotification();
+    return status_;
+  }
+
+  int32 length() override { return length_; }
+  void* data() override { return destination_; }
+  void Done(StatusOr<Shape> shape) override {
+    status_ = std::move(shape);
+    done_.Notify();
+  }
+
+ private:
+  void* destination_;
+  int32 length_;
+  StatusOr<Shape> status_;
+  tensorflow::Notification done_;
+};
+
 }  // namespace
 
 CpuTransferManager::CpuTransferManager()
@@ -66,34 +92,173 @@ CpuTransferManager::CpuTransferManager()
 Status CpuTransferManager::TransferLiteralToInfeed(se::StreamExecutor* executor,
                                                    const Literal& literal) {
   const Shape& shape = literal.shape();
-  VLOG(2) << "transferring literal shape to infeed: "
+  VLOG(2) << "Transferring literal to infeed with shape: "
           << ShapeUtil::HumanString(shape);
 
-  // TODO(b/31381668) handle tuples.
-  if (ShapeUtil::IsTuple(shape)) {
-    return Unimplemented("Infeed with a tuple shape is not supported: %s",
-                         ShapeUtil::HumanString(literal.shape()).c_str());
+  if (!ShapeUtil::IsTuple(shape)) {
+    int64 size = GetByteSizeRequirement(shape);
+    return TransferBufferToInfeed(executor, size, literal.InternalData());
   }
 
-  cpu::runtime::InfeedManager* infeed_manager =
-      cpu::runtime::GetInfeedManager();
+  if (ShapeUtil::IsNestedTuple(shape)) {
+    return Unimplemented(
+        "Infeed with a nested tuple shape is not supported: %s",
+        ShapeUtil::HumanString(literal.shape()).c_str());
+  }
+
+  // For a tuple, we transfer each of its elements to the device and
+  // enqueue the resulting destination device addresses with the
+  // infeed manager.
+  std::vector<cpu::runtime::XfeedBuffer*> buffers;
+  buffers.reserve(literal.tuple_literals_size());
+  auto cleanup = tensorflow::gtl::MakeCleanup([buffers]() {
+    for (cpu::runtime::XfeedBuffer* b : buffers) {
+      b->Done(ShapeUtil::MakeNil());
+    }
+  });
+
+  for (const auto& tuple_element : literal.tuple_literals()) {
+    const Shape& tuple_element_shape = tuple_element.shape();
+    int64 tuple_element_size = GetByteSizeRequirement(tuple_element_shape);
+    TF_ASSIGN_OR_RETURN(
+        cpu::runtime::XfeedBuffer * buffer,
+        TransferBufferToInfeedInternal(executor, tuple_element_size,
+                                       tuple_element.InternalData()));
+    buffers.push_back(buffer);
+  }
+
+  cpu::runtime::XfeedManager* xfeed_manager = cpu::runtime::GetXfeedManager();
+  xfeed_manager->infeed()->EnqueueBuffers(buffers);
+
+  cleanup.release();
+  return Status::OK();
+}
+
+Status CpuTransferManager::TransferBufferToInfeed(se::StreamExecutor* executor,
+                                                  int64 size,
+                                                  const void* source) {
+  TF_ASSIGN_OR_RETURN(cpu::runtime::XfeedBuffer * buffer,
+                      TransferBufferToInfeedInternal(executor, size, source));
+
+  cpu::runtime::XfeedManager* xfeed_manager = cpu::runtime::GetXfeedManager();
+  xfeed_manager->infeed()->EnqueueBuffers({buffer});
 
-  int64 size = GetByteSizeRequirement(shape);
+  return Status::OK();
+}
+
+StatusOr<cpu::runtime::XfeedBuffer*>
+CpuTransferManager::TransferBufferToInfeedInternal(se::StreamExecutor* executor,
+                                                   int64 size,
+                                                   const void* source) {
   if (size > std::numeric_limits<int32>::max()) {
-    return Unimplemented("Infeed shape is too large: %s needs %lld bytes",
-                         ShapeUtil::HumanString(literal.shape()).c_str(), size);
+    return InvalidArgument("Infeed shape is too large: needs %lld bytes", size);
   }
+
+  if (size <= 0) {
+    return InvalidArgument("Infeed shape must have positive size; got %lld",
+                           size);
+  }
+
   int32 size_32 = static_cast<int32>(size);
   CpuInfeedBuffer* queued_buffer = new CpuInfeedBuffer(size_32);
-  TF_RETURN_IF_ERROR(TransferBufferToDevice(
-      executor, /*size=*/size, /*source=*/LiteralUtil::InternalData(literal),
-      queued_buffer->device_memory()));
+  Status s =
+      TransferBufferToDevice(executor, /*size=*/size,
+                             /*source=*/source, queued_buffer->device_memory());
 
-  infeed_manager->EnqueueBuffer(queued_buffer);
+  if (!s.ok()) {
+    queued_buffer->Done(ShapeUtil::MakeNil());
+    return s;
+  }
+  return queued_buffer;
+}
 
+Status CpuTransferManager::TransferLiteralFromOutfeed(
+    se::StreamExecutor* executor, const Shape& literal_shape,
+    Literal* literal) {
+  if (!ShapeUtil::IsTuple(literal_shape)) {
+    int64 size = GetByteSizeRequirement(literal_shape);
+    // Note: OSS build didn't like implicit conversion from
+    // literal_shape.dimensions() to the array slice on 2017-07-10.
+    tensorflow::gtl::ArraySlice<int64> dimensions(
+        tensorflow::bit_cast<const int64*>(literal_shape.dimensions().data()),
+        literal_shape.dimensions().size());
+    auto empty =
+        Literal::CreateFromDimensions(literal_shape.element_type(), dimensions);
+    literal->Swap(empty.get());
+    TF_ASSIGN_OR_RETURN(Shape received_shape,
+                        TransferBufferFromOutfeed(
+                            executor, size, literal->MutableInternalData()));
+    TF_RET_CHECK(ShapeUtil::Compatible(received_shape, literal->shape()))
+        << "Shape received from outfeed "
+        << ShapeUtil::HumanString(received_shape)
+        << " did not match the shape that was requested for outfeed: "
+        << ShapeUtil::HumanString(literal_shape);
+    TF_RET_CHECK(size == GetByteSizeRequirement(received_shape));
+    *literal->mutable_shape() = received_shape;
+    return Status::OK();
+  }
+
+  if (ShapeUtil::IsNestedTuple(literal_shape)) {
+    return Unimplemented(
+        "Nested tuple outfeeds are not yet implemented on CPU.");
+  }
+
+  std::vector<std::unique_ptr<Literal>> elements;
+  for (int64 i = 0; i < literal_shape.tuple_shapes_size(); ++i) {
+    const Shape& tuple_element_shape =
+        ShapeUtil::GetTupleElementShape(literal_shape, i);
+    // Note: OSS build didn't like implicit conversion from
+    // literal_shape.dimensions() to the array slice on 2017-07-10.
+    tensorflow::gtl::ArraySlice<int64> dimensions(
+        tensorflow::bit_cast<const int64*>(
+            tuple_element_shape.dimensions().data()),
+        tuple_element_shape.dimensions().size());
+    auto empty = Literal::CreateFromDimensions(
+        tuple_element_shape.element_type(), dimensions);
+    TF_ASSIGN_OR_RETURN(
+        Shape received_shape,
+        TransferBufferFromOutfeed(executor,
+                                  GetByteSizeRequirement(tuple_element_shape),
+                                  empty->MutableInternalData()));
+    TF_RET_CHECK(ShapeUtil::Compatible(received_shape, tuple_element_shape))
+        << "Shape received from outfeed "
+        << ShapeUtil::HumanString(received_shape)
+        << " did not match the shape that was requested for outfeed: "
+        << ShapeUtil::HumanString(tuple_element_shape);
+    TF_RET_CHECK(GetByteSizeRequirement(tuple_element_shape) ==
+                 GetByteSizeRequirement(received_shape));
+    *empty->mutable_shape() = received_shape;
+    elements.push_back(std::move(empty));
+  }
+  auto result = Literal::MakeTupleOwned(std::move(elements));
+  literal->Swap(result.get());
+  TF_RET_CHECK(ShapeUtil::Equal(literal->shape(), literal_shape));
   return Status::OK();
 }
 
+StatusOr<Shape> CpuTransferManager::TransferBufferFromOutfeed(
+    perftools::gputools::StreamExecutor* executor, int64 size,
+    void* destination) {
+  if (size > std::numeric_limits<int32>::max()) {
+    return InvalidArgument("Outfeed shape is too large: needs %lld bytes",
+                           size);
+  }
+
+  if (size <= 0) {
+    return InvalidArgument("Outfeed shape must have positive size; got %lld",
+                           size);
+  }
+
+  int32 size_32 = static_cast<int32>(size);
+  cpu::runtime::XfeedManager* xfeed_manager = cpu::runtime::GetXfeedManager();
+  CpuOutfeedBuffer buffer(destination, size_32);
+  VLOG(2) << "Enqueueing outfeed buffer (for the device to populate) of length "
+          << size_32 << "B";
+  xfeed_manager->outfeed()->EnqueueBuffers({&buffer});
+  VLOG(2) << "Waiting for buffer to be notified as populated.";
+  return buffer.WaitForNotification();
+}
+
 }  // namespace xla
 
 static std::unique_ptr<xla::TransferManager> CreateCpuTransferManager() {
diff --git a/tensorflow/compiler/xla/service/cpu_transfer_manager.h b/tensorflow/compiler/xla/service/cpu_transfer_manager.h
index 727462252d7291959fd09c05c87e36411eb3ddab..30dc2d90623fb20656874d40a25b6a8449a7486c 100644
--- a/tensorflow/compiler/xla/service/cpu_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/cpu_transfer_manager.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "tensorflow/compiler/xla/service/cpu/xfeed_manager.h"
 #include "tensorflow/compiler/xla/service/generic_transfer_manager.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -37,8 +38,24 @@ class CpuTransferManager : public GenericTransferManager {
 
   Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
                                  const Literal& literal) override;
+  Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
+                                int64 size, const void* source) override;
+  Status TransferLiteralFromOutfeed(
+      perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
+      Literal* literal) override;
 
  private:
+  // Transfers infeed data to device. InfeedBuffer->Done() must be
+  // called to clean up the memory allocated for InfeedBuffer.
+  StatusOr<cpu::runtime::XfeedBuffer*> TransferBufferToInfeedInternal(
+      perftools::gputools::StreamExecutor* executor, int64 size,
+      const void* source);
+
+  // On success, returns the shape that was transferred from the outfeed.
+  StatusOr<Shape> TransferBufferFromOutfeed(
+      perftools::gputools::StreamExecutor* executor, int64 size,
+      void* destination);
+
   TF_DISALLOW_COPY_AND_ASSIGN(CpuTransferManager);
 };
 
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc
index c13c86741cc4291d5ae76cb4b3d7913927c565ea..2e4b0a5230516b5308aeed892de9a49565a09f2e 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.cc
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.cc
@@ -35,7 +35,15 @@ StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size,
                                         bool retry_on_failure) {
   TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor,
                       GetStreamExecutor(device_ordinal));
-  return stream_executor->AllocateArray<uint8>(size);
+  perftools::gputools::DeviceMemoryBase result =
+      stream_executor->AllocateArray<uint8>(size);
+  if (size > 0 && result == nullptr) {
+    return ResourceExhausted(
+        "Failed to allocate request for %s (%lluB) on device ordinal %d",
+        tensorflow::strings::HumanReadableNumBytes(size).c_str(), size,
+        device_ordinal);
+  }
+  return result;
 }
 
 tensorflow::Status StreamExecutorMemoryAllocator::Deallocate(
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc b/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc
index 5b296861006923f438df1ad4fb5898f82f11b9e0..0f7ab111170a3152cbe86c1a4fa8d592a14d6241 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc
@@ -24,51 +24,29 @@ limitations under the License.
 namespace xla {
 
 Status DfsHloVisitor::HandleElementwiseUnary(HloInstruction* hlo,
-                                             HloOpcode opcode,
-                                             HloInstruction* operand) {
+                                             HloOpcode opcode) {
   return Unimplemented("DfsHloVisitor::HandleElementwiseUnary: %s",
                        HloOpcodeString(opcode).c_str());
 }
 
 Status DfsHloVisitor::HandleElementwiseBinary(HloInstruction* hlo,
-                                              HloOpcode opcode,
-                                              HloInstruction* lhs,
-                                              HloInstruction* rhs) {
+                                              HloOpcode opcode) {
   return Unimplemented("DfsHloVisitor::HandleElementwiseBinary: %s",
                        HloOpcodeString(opcode).c_str());
 }
 
 void DfsHloVisitor::SetVisiting(const HloInstruction& instruction) {
   VLOG(3) << "marking HLO " << &instruction << " as visiting: ";
-  CHECK(NotVisited(instruction));
+  DCHECK(NotVisited(instruction));
   visit_state_[&instruction] = VisitState::kVisiting;
 }
 
 void DfsHloVisitor::SetVisited(const HloInstruction& instruction) {
   VLOG(3) << "marking HLO " << &instruction << " as visited: ";
-  CHECK(NotVisited(instruction) || IsVisiting(instruction));
+  DCHECK(NotVisited(instruction) || IsVisiting(instruction));
   visit_state_[&instruction] = VisitState::kVisited;
 }
 
-bool DfsHloVisitor::IsVisiting(const HloInstruction& instruction) {
-  if (visit_state_.count(&instruction) == 0) {
-    return false;
-  }
-  return visit_state_[&instruction] == VisitState::kVisiting;
-}
-
-bool DfsHloVisitor::DidVisit(const HloInstruction& instruction) {
-  if (visit_state_.count(&instruction) == 0) {
-    return false;
-  }
-  return visit_state_[&instruction] == VisitState::kVisited;
-}
-
-bool DfsHloVisitor::NotVisited(const HloInstruction& instruction) {
-  return visit_state_.count(&instruction) == 0 ||
-         visit_state_[&instruction] == VisitState::kNotVisited;
-}
-
 Status DfsHloVisitor::Preprocess(HloInstruction* hlo) { return Status::OK(); }
 
 Status DfsHloVisitor::Postprocess(HloInstruction* visited) {
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 78a398f8efa870fcfbda78a769b3f6878a8a429b..e6067ae9ea244e58920d0a163078ab302327b871 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -65,43 +65,37 @@ class DfsHloVisitor {
   // These routines are self-descriptive, see class comment for usage
   // information.
 
-  virtual Status HandleElementwiseUnary(HloInstruction* hlo, HloOpcode opcode,
-                                        HloInstruction* operand);
-  virtual Status HandleElementwiseBinary(HloInstruction* hlo, HloOpcode opcode,
-                                         HloInstruction* lhs,
-                                         HloInstruction* rhs);
+  virtual Status HandleElementwiseUnary(HloInstruction* hlo, HloOpcode opcode);
+  virtual Status HandleElementwiseBinary(HloInstruction* hlo, HloOpcode opcode);
   virtual Status HandleClamp(HloInstruction* clamp, HloInstruction* min,
                              HloInstruction* arg, HloInstruction* max) = 0;
   virtual Status HandleSelect(HloInstruction* select, HloInstruction* pred,
                               HloInstruction* on_true,
                               HloInstruction* on_false) = 0;
-  virtual Status HandleMaximum(HloInstruction* maximum, HloInstruction* lhs,
-                               HloInstruction* rhs) {
-    return HandleElementwiseBinary(maximum, HloOpcode::kMaximum, lhs, rhs);
+  virtual Status HandleMaximum(HloInstruction* maximum) {
+    return HandleElementwiseBinary(maximum, HloOpcode::kMaximum);
   }
-  virtual Status HandleMinimum(HloInstruction* minimum, HloInstruction* lhs,
-                               HloInstruction* rhs) {
-    return HandleElementwiseBinary(minimum, HloOpcode::kMinimum, lhs, rhs);
+  virtual Status HandleMinimum(HloInstruction* minimum) {
+    return HandleElementwiseBinary(minimum, HloOpcode::kMinimum);
   }
   virtual Status HandleConcatenate(
       HloInstruction* concatenate,
       tensorflow::gtl::ArraySlice<HloInstruction*> operands) = 0;
-  virtual Status HandleConvert(HloInstruction* convert,
-                               HloInstruction* operand) {
-    return HandleElementwiseUnary(convert, HloOpcode::kConvert, operand);
+  virtual Status HandleConvert(HloInstruction* convert) {
+    return HandleElementwiseUnary(convert, HloOpcode::kConvert);
   }
-  virtual Status HandleCopy(HloInstruction* copy, HloInstruction* operand) {
-    return HandleElementwiseUnary(copy, HloOpcode::kCopy, operand);
+  virtual Status HandleCopy(HloInstruction* copy) {
+    return HandleElementwiseUnary(copy, HloOpcode::kCopy);
   }
   virtual Status HandleMultiply(HloInstruction* multiply, HloInstruction* lhs,
                                 HloInstruction* rhs) {
-    return HandleElementwiseBinary(multiply, HloOpcode::kMultiply, lhs, rhs);
+    return HandleElementwiseBinary(multiply, HloOpcode::kMultiply);
   }
   virtual Status HandleDot(HloInstruction* dot, HloInstruction* lhs,
                            HloInstruction* rhs) = 0;
   virtual Status HandlePower(HloInstruction* power, HloInstruction* lhs,
                              HloInstruction* rhs) {
-    return HandleElementwiseBinary(power, HloOpcode::kPower, lhs, rhs);
+    return HandleElementwiseBinary(power, HloOpcode::kPower);
   }
   virtual Status HandleConvolution(HloInstruction* convolution,
                                    HloInstruction* lhs, HloInstruction* rhs,
@@ -109,64 +103,73 @@ class DfsHloVisitor {
   virtual Status HandleCrossReplicaSum(HloInstruction* crs) = 0;
   virtual Status HandleCompare(HloInstruction* compare, HloOpcode opcode,
                                HloInstruction* lhs, HloInstruction* rhs) {
-    return HandleElementwiseBinary(compare, opcode, lhs, rhs);
+    return HandleElementwiseBinary(compare, opcode);
   }
   virtual Status HandleAdd(HloInstruction* add, HloInstruction* lhs,
                            HloInstruction* rhs) {
-    return HandleElementwiseBinary(add, HloOpcode::kAdd, lhs, rhs);
+    return HandleElementwiseBinary(add, HloOpcode::kAdd);
   }
   virtual Status HandleDivide(HloInstruction* divide, HloInstruction* lhs,
                               HloInstruction* rhs) {
-    return HandleElementwiseBinary(divide, HloOpcode::kDivide, lhs, rhs);
+    return HandleElementwiseBinary(divide, HloOpcode::kDivide);
   }
   virtual Status HandleRemainder(HloInstruction* remainder, HloInstruction* lhs,
                                  HloInstruction* rhs) {
-    return HandleElementwiseBinary(remainder, HloOpcode::kRemainder, lhs, rhs);
+    return HandleElementwiseBinary(remainder, HloOpcode::kRemainder);
   }
   virtual Status HandleSubtract(HloInstruction* subtract, HloInstruction* lhs,
                                 HloInstruction* rhs) {
-    return HandleElementwiseBinary(subtract, HloOpcode::kSubtract, lhs, rhs);
+    return HandleElementwiseBinary(subtract, HloOpcode::kSubtract);
   }
   virtual Status HandleAbs(HloInstruction* abs, HloInstruction* operand) {
-    return HandleElementwiseUnary(abs, HloOpcode::kAbs, operand);
+    return HandleElementwiseUnary(abs, HloOpcode::kAbs);
   }
   virtual Status HandleSign(HloInstruction* sign, HloInstruction* operand) {
-    return HandleElementwiseUnary(sign, HloOpcode::kSign, operand);
+    return HandleElementwiseUnary(sign, HloOpcode::kSign);
   }
   virtual Status HandleNegate(HloInstruction* negate, HloInstruction* operand) {
-    return HandleElementwiseUnary(negate, HloOpcode::kNegate, operand);
+    return HandleElementwiseUnary(negate, HloOpcode::kNegate);
   }
   virtual Status HandleExp(HloInstruction* exp, HloInstruction* operand) {
-    return HandleElementwiseUnary(exp, HloOpcode::kExp, operand);
+    return HandleElementwiseUnary(exp, HloOpcode::kExp);
   }
   virtual Status HandleFloor(HloInstruction* floor, HloInstruction* operand) {
-    return HandleElementwiseUnary(floor, HloOpcode::kFloor, operand);
+    return HandleElementwiseUnary(floor, HloOpcode::kFloor);
   }
   virtual Status HandleCeil(HloInstruction* ceil, HloInstruction* operand) {
-    return HandleElementwiseUnary(ceil, HloOpcode::kCeil, operand);
+    return HandleElementwiseUnary(ceil, HloOpcode::kCeil);
   }
   virtual Status HandleLog(HloInstruction* log, HloInstruction* operand) {
-    return HandleElementwiseUnary(log, HloOpcode::kLog, operand);
+    return HandleElementwiseUnary(log, HloOpcode::kLog);
+  }
+  virtual Status HandleCos(HloInstruction* cos, HloInstruction* operand) {
+    return HandleElementwiseUnary(cos, HloOpcode::kCos);
+  }
+  virtual Status HandleSin(HloInstruction* sin, HloInstruction* operand) {
+    return HandleElementwiseUnary(sin, HloOpcode::kSin);
   }
   virtual Status HandleTanh(HloInstruction* tanh, HloInstruction* operand) {
-    return HandleElementwiseUnary(tanh, HloOpcode::kTanh, operand);
+    return HandleElementwiseUnary(tanh, HloOpcode::kTanh);
   }
   virtual Status HandleIsFinite(HloInstruction* is_finite,
                                 HloInstruction* operand) {
-    return HandleElementwiseUnary(is_finite, HloOpcode::kIsFinite, operand);
+    return HandleElementwiseUnary(is_finite, HloOpcode::kIsFinite);
   }
   virtual Status HandleLogicalAnd(HloInstruction* logical_and,
                                   HloInstruction* lhs, HloInstruction* rhs) {
-    return HandleElementwiseBinary(logical_and, HloOpcode::kLogicalAnd, lhs,
-                                   rhs);
+    return HandleElementwiseBinary(logical_and, HloOpcode::kLogicalAnd);
   }
   virtual Status HandleLogicalNot(HloInstruction* logical_not,
                                   HloInstruction* operand) {
-    return HandleElementwiseUnary(logical_not, HloOpcode::kLogicalNot, operand);
+    return HandleElementwiseUnary(logical_not, HloOpcode::kLogicalNot);
   }
   virtual Status HandleLogicalOr(HloInstruction* logical_or,
                                  HloInstruction* lhs, HloInstruction* rhs) {
-    return HandleElementwiseBinary(logical_or, HloOpcode::kLogicalOr, lhs, rhs);
+    return HandleElementwiseBinary(logical_or, HloOpcode::kLogicalOr);
+  }
+  virtual Status HandleReducePrecision(HloInstruction* reduce_precision) {
+    return HandleElementwiseUnary(reduce_precision,
+                                  HloOpcode::kReducePrecision);
   }
 
   virtual Status HandleInfeed(HloInstruction* infeed) = 0;
@@ -225,6 +228,10 @@ class DfsHloVisitor {
 
   virtual Status HandleRecv(HloInstruction* recv) = 0;
 
+  virtual Status HandleBatchNormTraining(HloInstruction* batchNormTraining) = 0;
+
+  virtual Status HandleBatchNormGrad(HloInstruction* batchNormGrad) = 0;
+
   // Invoked to inform the visitor that the traversal has completed, and that
   // the root was "root".
   virtual Status FinishVisit(HloInstruction* root) = 0;
@@ -237,6 +244,14 @@ class DfsHloVisitor {
     kVisited,
   };
 
+  VisitState GetVisitState(const HloInstruction& instruction) {
+    auto it = visit_state_.find(&instruction);
+    if (it == visit_state_.end()) {
+      return kNotVisited;
+    }
+    return it->second;
+  }
+
   // Sets the visitation state of the given instruction as kVisiting.
   //
   // Precondition: current state must be kNotVisited.
@@ -248,13 +263,19 @@ class DfsHloVisitor {
   void SetVisited(const HloInstruction& instruction);
 
   // Returns whether the state of the given instruction is kVisiting.
-  bool IsVisiting(const HloInstruction& instruction);
+  bool IsVisiting(const HloInstruction& instruction) {
+    return GetVisitState(instruction) == kVisiting;
+  }
 
   // Returns whether the state of the given instruction is kVisited.
-  bool DidVisit(const HloInstruction& instruction);
+  bool DidVisit(const HloInstruction& instruction) {
+    return GetVisitState(instruction) == kVisited;
+  }
 
   // Returns whether the state of the given instruction is kNotVisited.
-  bool NotVisited(const HloInstruction& instruction);
+  bool NotVisited(const HloInstruction& instruction) {
+    return GetVisitState(instruction) == kNotVisited;
+  }
 
   // This method should be overridden by subclasses that wish to run some
   // operation on an op before its Handle* visitor method is called.
@@ -279,7 +300,7 @@ class DfsHloVisitor {
 
  private:
   // Tracks the visitation state of each instruction. Any instructions that are
-  // not found from the map are considered as VisitState::kNotVisited.
+  // not found in the map are considered as VisitState::kNotVisited.
   tensorflow::gtl::FlatMap<const HloInstruction*, VisitState> visit_state_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(DfsHloVisitor);
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index 6557c3aa8e6b8356887432c6dd91d326603fc1e7..c447165ceccc3d55088cafda24d90fedea9994ae 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -41,15 +41,23 @@ class DfsHloVisitorWithDefault : public DfsHloVisitor {
   // Default action performed on HloInstruction.
   virtual Status DefaultAction(HloInstruction* hlo_instruction) = 0;
 
-  Status HandleElementwiseUnary(HloInstruction* hlo, HloOpcode opcode,
-                                HloInstruction* operand) override {
+  Status HandleElementwiseUnary(HloInstruction* hlo,
+                                HloOpcode opcode) override {
     return DefaultAction(hlo);
   }
-  Status HandleElementwiseBinary(HloInstruction* hlo, HloOpcode opcode,
-                                 HloInstruction* lhs,
-                                 HloInstruction* rhs) override {
+  Status HandleElementwiseBinary(HloInstruction* hlo,
+                                 HloOpcode opcode) override {
     return DefaultAction(hlo);
   }
+
+  Status HandleBatchNormTraining(HloInstruction* hlo) override {
+    return DefaultAction(hlo);
+  }
+
+  Status HandleBatchNormGrad(HloInstruction* hlo) override {
+    return DefaultAction(hlo);
+  }
+
   Status HandleClamp(HloInstruction* clamp, HloInstruction* /*min*/,
                      HloInstruction* /*arg*/,
                      HloInstruction* /*max*/) override {
@@ -60,12 +68,10 @@ class DfsHloVisitorWithDefault : public DfsHloVisitor {
       tensorflow::gtl::ArraySlice<HloInstruction*> /*operands*/) override {
     return DefaultAction(concatenate);
   }
-  Status HandleConvert(HloInstruction* convert,
-                       HloInstruction* /*operand*/) override {
+  Status HandleConvert(HloInstruction* convert) override {
     return DefaultAction(convert);
   }
-  Status HandleCopy(HloInstruction* copy,
-                    HloInstruction* /*operand*/) override {
+  Status HandleCopy(HloInstruction* copy) override {
     return DefaultAction(copy);
   }
   Status HandleSelect(HloInstruction* select, HloInstruction* /*pred*/,
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index be4aadb6522b8d6ad9d6425df56c1746c3849f11..81092e42d5c841546ad49cbb8cfaf501fa8cd55e 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -63,7 +63,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
     case HloOpcode::kConvert: {
       PrimitiveType from_type = op->operand(0)->shape().element_type();
       PrimitiveType to_type = op->shape().element_type();
-      CHECK(primitive_util::IsIntegralType(from_type));
+      CHECK(primitive_util::IsIntegralType(from_type) || from_type == PRED);
       if (from_type == to_type) {
         return operand_value;
       }
@@ -78,7 +78,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
               operand_value,
               llvm_ir::PrimitiveTypeToIrType(to_type, ir_builder_));
         }
-        if (primitive_util::IsUnsignedIntegralType(from_type)) {
+        if (primitive_util::IsUnsignedIntegralType(from_type) ||
+            from_type == PRED) {
           return ir_builder_->CreateUIToFP(
               operand_value,
               llvm_ir::PrimitiveTypeToIrType(to_type, ir_builder_));
@@ -172,6 +173,14 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
       return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::log, {operand_value},
                                           {operand_value->getType()},
                                           ir_builder_);
+    case HloOpcode::kCos:
+      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::cos, {operand_value},
+                                          {operand_value->getType()},
+                                          ir_builder_);
+    case HloOpcode::kSin:
+      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {operand_value},
+                                          {operand_value->getType()},
+                                          ir_builder_);
     case HloOpcode::kFloor:
       return llvm_ir::EmitCallToIntrinsic(
           llvm::Intrinsic::floor, {operand_value}, {operand_value->getType()},
@@ -381,6 +390,118 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfcInv(
   return EmitErfInv(prim_type, ir_builder_->CreateFSub(one, value));
 }
 
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitReducePrecision(
+    const HloInstruction* hlo, llvm::Value* x) const {
+  if (hlo->operand(0)->shape().element_type() != F32) {
+    return Unimplemented("reduce-precision only implemented for F32");
+  }
+
+  // Integer and float types for casting and constant generation.
+  llvm::Type* float_type = x->getType();
+  llvm::IntegerType* int_type = ir_builder_->getInt32Ty();
+
+  // Cast the input value to an integer for bitwise manipulation.
+  llvm::Value* x_as_int = ir_builder_->CreateBitCast(x, int_type);
+
+  if (hlo->mantissa_bits() < 23) {
+    // Last remaining mantissa bit.
+    const uint32_t last_mantissa_bit_mask = 1u << (23 - hlo->mantissa_bits());
+
+    // Compute rounding bias for round-to-nearest with ties to even.  This is
+    // equal to a base value of 0111... plus one bit if the last remaining
+    // mantissa bit is 1.
+    const uint32_t base_rounding_bias = (last_mantissa_bit_mask >> 1) - 1;
+    llvm::Value* x_last_mantissa_bit = ir_builder_->CreateLShr(
+        ir_builder_->CreateAnd(
+            x_as_int, llvm::ConstantInt::get(int_type, last_mantissa_bit_mask)),
+        (23 - hlo->mantissa_bits()));
+    llvm::Value* x_rounding_bias = ir_builder_->CreateAdd(
+        x_last_mantissa_bit,
+        llvm::ConstantInt::get(int_type, base_rounding_bias));
+
+    // Add rounding bias, and mask out truncated bits.  Note that the case
+    // where adding the rounding bias overflows into the exponent bits is
+    // correct; the non-masked mantissa bits will all be zero, and the
+    // exponent will be incremented by one.
+    const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1);
+    x_as_int = ir_builder_->CreateAdd(x_as_int, x_rounding_bias);
+    x_as_int = ir_builder_->CreateAnd(
+        x_as_int, llvm::ConstantInt::get(int_type, truncation_mask));
+  }
+
+  if (hlo->exponent_bits() < 8) {
+    // Masks for f32 values.
+    const uint32_t f32_sign_bit_mask = 1u << 31;
+    const uint32_t f32_exp_bits_mask = 0xffu << 23;
+
+    // An exponent of 2^(n-1)-1 -- that is, 0111... with the zero in the most-
+    // significant bit -- is equal to 1.0f for all exponent sizes.  Adding
+    // 2^(n-1)-1 to this gives us the highest non-infinite exponent for a bit-
+    // size of n, and subtracting 2^(n-1)-1 from this gives us the lowest'
+    // exponent (corresponding to 0.0f).
+    //
+    // Thus, the f32 exponent corresponding to the highest non-infinite
+    // exponent for a bit size of n is (2^7-1) + 2^(n-1)-1, and the f32
+    // exponent corresponding to the lowest exponent for a bit size of n is
+    // (2^7-1) - 2^(n-1)-1.
+    //
+    // Note that we have already checked that exponents_bits >= 1.
+    const uint32_t f32_exponent_bias = (1 << 7) - 1;
+    const uint32_t reduced_exponent_bias =
+        (1 << (hlo->exponent_bits() - 1)) - 1;
+    const uint32_t reduced_max_exponent =
+        f32_exponent_bias + reduced_exponent_bias;
+    const uint32_t reduced_min_exponent =
+        f32_exponent_bias - reduced_exponent_bias;
+
+    // Do we overflow or underflow?
+    llvm::Value* x_exponent = ir_builder_->CreateAnd(
+        x_as_int, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
+    llvm::Value* x_overflows = ir_builder_->CreateICmpUGT(
+        x_exponent,
+        llvm::ConstantInt::get(int_type, reduced_max_exponent << 23));
+    llvm::Value* x_underflows = ir_builder_->CreateICmpULE(
+        x_exponent,
+        llvm::ConstantInt::get(int_type, reduced_min_exponent << 23));
+
+    // Compute appropriately-signed values of zero and infinity.
+    llvm::Value* x_signed_zero = ir_builder_->CreateAnd(
+        x_as_int, llvm::ConstantInt::get(int_type, f32_sign_bit_mask));
+    llvm::Value* x_signed_inf = ir_builder_->CreateOr(
+        x_signed_zero, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
+
+    // Force to zero or infinity if overflow or underflow.  (Note that this
+    // truncates all denormal values to zero, rather than rounding them.)
+    x_as_int = ir_builder_->CreateSelect(x_overflows, x_signed_inf, x_as_int);
+    x_as_int = ir_builder_->CreateSelect(x_underflows, x_signed_zero, x_as_int);
+  }
+
+  // Cast the result back to a floating-point type.
+  llvm::Value* result = ir_builder_->CreateBitCast(x_as_int, float_type);
+
+  // Correct result for NaN inputs.
+  //
+  // The exponent handling will "normalize" NaN values to infinities, which is
+  // undesirable (except in the case with no mantissa bits, in which case it
+  // is mandatory).  This logic also handles cases where mantissa-rounding
+  // causes a NaN's mantissa to overflow into the exponent bits, which would
+  // otherwise create an erroneous zero value.
+  //
+  // If the fast-math flags are set to assume no NaNs, the comparison is likely
+  // to be optimized away, so there's no point in even emitting it.
+  if (!ir_builder_->getFastMathFlags().noNaNs()) {
+    llvm::Value* x_is_nan = ir_builder_->CreateFCmpUNO(x, x);
+
+    if (hlo->mantissa_bits() > 0) {
+      result = ir_builder_->CreateSelect(x_is_nan, x, result);
+    } else {
+      result = ir_builder_->CreateSelect(
+          x_is_nan, llvm::ConstantFP::getInfinity(float_type), result);
+    }
+  }
+  return result;
+}
+
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
     const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value,
     bool is_signed) const {
@@ -588,20 +709,37 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator(
               llvm::Intrinsic::ctlz, {r, ir_builder_->getInt1(1)},
               {param_ir_type}, ir_builder_);
           auto in_block = ir_builder_->GetInsertBlock();
-          auto body_block = in_block->splitBasicBlock(
-              ir_builder_->GetInsertPoint(), "rng_body");
-          SetToFirstInsertPoint(body_block, ir_builder_);
-          auto out_block = body_block->splitBasicBlock(
-              ir_builder_->GetInsertPoint(), "rng_out");
+
+          // A terminator should be present iff we're emitting code
+          // into the middle (as opposed to the end) of a basic block.
+          CHECK_EQ(ir_builder_->GetInsertPoint() == in_block->end(),
+                   in_block->getTerminator() == nullptr);
+
+          llvm::BasicBlock* body_block;
+          llvm::BasicBlock* out_block;
+
+          if (ir_builder_->GetInsertPoint() == in_block->end()) {
+            body_block =
+                llvm_ir::CreateBasicBlock(nullptr, "rng_body", ir_builder_);
+            out_block =
+                llvm_ir::CreateBasicBlock(nullptr, "rng_out", ir_builder_);
+            llvm::BranchInst::Create(body_block, in_block);
+          } else {
+            body_block = in_block->splitBasicBlock(
+                ir_builder_->GetInsertPoint(), "rng_body");
+            out_block = body_block->splitBasicBlock(
+                ir_builder_->GetInsertPoint(), "rng_out");
+            body_block->getTerminator()->eraseFromParent();
+          }
+
           SetToFirstInsertPoint(body_block, ir_builder_);
           auto random = ir_builder_->CreateAnd(
               ir_builder_->CreateZExtOrTrunc(get_next_i64(), param_ir_type),
               ir_builder_->CreateLShr(llvm::ConstantInt::get(param_ir_type, ~0),
                                       leading_zeros));
-          llvm::ReplaceInstWithInst(
-              body_block->getTerminator(),
-              llvm::BranchInst::Create(out_block, body_block,
-                                       ir_builder_->CreateICmpULT(random, r)));
+          llvm::BranchInst::Create(out_block, body_block,
+                                   ir_builder_->CreateICmpULT(random, r),
+                                   body_block);
           SetToFirstInsertPoint(out_block, ir_builder_);
           return ir_builder_->CreateAdd(
               p, ir_builder_->CreateSelect(
@@ -647,12 +785,14 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kCeil:
     case HloOpcode::kConvert:
     case HloOpcode::kCopy:
+    case HloOpcode::kCos:
     case HloOpcode::kExp:
     case HloOpcode::kFloor:
     case HloOpcode::kIsFinite:
     case HloOpcode::kLog:
     case HloOpcode::kNegate:
     case HloOpcode::kSign:
+    case HloOpcode::kSin:
     case HloOpcode::kTanh:
     case HloOpcode::kLogicalNot:
       return [this, hlo, &operand_to_generator](
@@ -720,6 +860,14 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
                                 ElementwiseSourceIndex(index, *hlo, 2)));
         return EmitFloatMin(max_value, EmitFloatMax(min_value, arg_value));
       };
+    case HloOpcode::kReducePrecision:
+      return [this, hlo, &operand_to_generator](
+                 const IrArray::Index& index) -> StatusOr<llvm::Value*> {
+        TF_ASSIGN_OR_RETURN(llvm::Value * operand_value,
+                            operand_to_generator.at(hlo->operand(0))(
+                                ElementwiseSourceIndex(index, *hlo, 0)));
+        return EmitReducePrecision(hlo, operand_value);
+      };
     case HloOpcode::kConcatenate:
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index target_index) -> StatusOr<llvm::Value*> {
@@ -805,23 +953,9 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kSlice:
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index& index) -> StatusOr<llvm::Value*> {
-        IrArray::Index sliced_index(index.size());
-        for (int i = 0; i < index.size(); ++i) {
-          int64 stride = hlo->slice_stride(i);
-          if (stride != 1) {
-            sliced_index[i] = ir_builder_->CreateAdd(
-                ir_builder_->CreateMul(
-                    index[i], llvm::ConstantInt::get(index[i]->getType(),
-                                                     stride)),
-                llvm::ConstantInt::get(index[i]->getType(),
-                                       hlo->slice_starts(i)));
-          } else {
-            sliced_index[i] = ir_builder_->CreateAdd(
-                    index[i],
-                    llvm::ConstantInt::get(index[i]->getType(),
-                                           hlo->slice_starts(i)));
-          }
-        }
+        IrArray::Index sliced_index = index.SourceIndexOfSlice(
+            /*shape=*/hlo->shape(), /*starts=*/hlo->slice_starts(),
+            /*strides=*/hlo->slice_strides(), /*builder=*/ir_builder_);
         return operand_to_generator.at(hlo->operand(0))(sliced_index);
       };
     case HloOpcode::kDynamicSlice:
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
index 2576d3823e06ed3050554b38766dbd6c6a48ca5c..bb9117ca61e3b6ccb7f1fcecb62b0be5f984e6d1 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
@@ -84,6 +84,9 @@ class ElementalIrEmitter {
   virtual StatusOr<llvm::Value*> EmitErfcInv(PrimitiveType prim_type,
                                              llvm::Value* value) const;
 
+  virtual StatusOr<llvm::Value*> EmitReducePrecision(const HloInstruction* hlo,
+                                                     llvm::Value* x) const;
+
   // A helper method for MakeElementGenerator. Given an elementwise op `hlo` and
   // the target array index, computes the source array index of its
   // `operand_no`-th operand.
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 3a9f8dc79ee0589f27fe5aabf9592a73f34c4a0e..cbc02b84627d992179e88c107840a14c104c01c8 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -15,45 +15,15 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/executable.h"
 
-#include "tensorflow/compiler/xla/legacy_flags/service_flags.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/regexp.h"
 
 namespace xla {
 
-/* static */ void Executable::DumpExecutedHlo(
-    const HloModule& module, const string& label,
-    const HloExecutionProfile* profile) {
-  VLOG(2) << "module name = " << module.name();
-  legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
-  string generate_hlo_graph_regex;
-  if (!flags->xla_generate_hlo_graph.empty()) {
-    generate_hlo_graph_regex = flags->xla_generate_hlo_graph;
-  } else {
-    generate_hlo_graph_regex =
-        module.config().debug_options().xla_generate_hlo_graph();
-  }
-  if (!generate_hlo_graph_regex.empty() &&
-      RE2::PartialMatch(module.name(), generate_hlo_graph_regex)) {
-    hlo_graph_dumper::DumpGraph(*module.entry_computation(), label,
-                                flags->xla_hlo_graph_addresses,
-                                flags->xla_hlo_graph_layout, profile);
-  }
-  if (!flags->xla_log_hlo_text.empty() &&
-      RE2::PartialMatch(module.name(), flags->xla_log_hlo_text)) {
-    LOG(INFO) << "HLO for module " << module.name();
-    LOG(INFO) << "Label: " << label;
-    XLA_LOG_LINES(2, module.ToString());
-  }
-  if (!flags->xla_dump_hlo_text_to.empty()) {
-    hlo_graph_dumper::DumpText(module, label, flags->xla_dump_hlo_text_to);
-  }
-}
-
 StatusOr<std::vector<perftools::gputools::DeviceMemoryBase>>
 Executable::ExecuteOnStreams(
     tensorflow::gtl::ArraySlice<const ServiceExecutableRunOptions> run_options,
@@ -87,8 +57,8 @@ Executable::ExecuteOnStreams(
 
 Status Executable::DumpSessionModule() {
   TF_RET_CHECK(dumping());
-  legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
-  const string& directory_path = flags->xla_dump_executions_to;
+  const string& directory_path =
+      module_config().debug_options().xla_dump_executions_to();
   VersionedComputationHandle versioned_handle = entry_computation_handle();
   // This filename does not include the version number because the computation
   // is only ever executed at one version.
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 291916cd9f7acb0c136dc0834b28f57a83736ec6..5388c9efa4b795b3861586030c407b6864b9382e 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -19,11 +19,12 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
-#include "tensorflow/compiler/xla/legacy_flags/service_flags.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
+#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/service_executable_run_options.h"
 #include "tensorflow/compiler/xla/service/session.pb.h"
@@ -49,10 +50,6 @@ class Executable {
         shape_size_function_(std::move(shape_size_function)) {}
   virtual ~Executable() {}
 
-  // Dumps the executed HLO according to service-associated flags.
-  static void DumpExecutedHlo(const HloModule& module, const string& label,
-                              const HloExecutionProfile* profile);
-
   // Enqueues the compilation result on the provided stream, passing the given
   // arguments. This call is blocking and returns after the execution is done.
   //
@@ -110,6 +107,14 @@ class Executable {
     return execution_profile_;
   }
 
+  // Returns Status::ok() if the two executables are equal to each other.
+  //
+  // An error status is returned otherwise.
+  virtual const Status EqualOrFail(const Executable& executable) {
+    return Unimplemented(
+        "Equality test on this executable is not implemented.");
+  }
+
   // Returns whether this executable was compiled with HLO profilings support
   // enabled. If not, the caller should not expect an hlo_execution_profile
   // passed to ExecuteOnStream above to be populated during execution.
@@ -191,10 +196,11 @@ StatusOr<ReturnT> Executable::ExecuteOnStreamWrapper(
   // If the profiling flag isn't enabled, we pass nullptr as the profile to
   // indicate profiling is not requested.
   HloExecutionProfile hlo_execution_profile;
-  legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
   HloExecutionProfile* profile_ptr =
-      flags->xla_hlo_profile && hlo_profiling_enabled() ? &hlo_execution_profile
-                                                        : nullptr;
+      module_config().debug_options().xla_hlo_profile() &&
+              hlo_profiling_enabled()
+          ? &hlo_execution_profile
+          : nullptr;
 
   auto return_value = ExecuteOnStream(run_options, arguments, profile_ptr);
 
@@ -240,7 +246,8 @@ StatusOr<ReturnT> Executable::ExecuteOnStreamWrapper(
         }
       }
     }
-    DumpExecutedHlo(module(), "Service::Execute", profile_ptr);
+    hlo_graph_dumper::MaybeDumpHloModule(module(), "Service::Execute",
+                                         profile_ptr);
   }
 
   return return_value;
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
index bb4712c86f6d649a9ec8f1450d90735de9ec43c3..12a6794ac177deb54dd66822a5f830ff213c7b40 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
@@ -80,7 +80,7 @@ class FlattenCallGraphTest : public HloTestBase {
     HloInstruction* param0 = builder.AddInstruction(
         HloInstruction::CreateParameter(0, kScalarShape, "param0"));
     HloInstruction* zero = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+        HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
     builder.AddInstruction(HloInstruction::CreateBinary(
         ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt, param0, zero));
     return builder.Build();
@@ -139,7 +139,7 @@ TEST_F(FlattenCallGraphTest, ComplexGraph) {
   }
 
   {
-    TF_ASSIGN_OR_ASSERT_OK(bool result, RunFlattenCallGraph(module.get()));
+    TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get()));
     EXPECT_TRUE(result);
     std::unique_ptr<CallGraph> flat_call_graph = CallGraph::Build(module.get());
     const CallGraphNode& c_node = flat_call_graph->GetNode(c_computation);
@@ -157,7 +157,7 @@ TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) {
         builder.AddInstruction(HloInstruction::CreateParameter(
             0, ShapeUtil::MakeShape(PRED, {}), "param0"));
     HloInstruction* false_constant = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+        HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
     builder.AddInstruction(
         HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}),
                                      HloOpcode::kEq, param0, false_constant));
@@ -168,7 +168,7 @@ TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) {
   {
     HloComputation::Builder builder(TestName() + ".entry");
     HloInstruction* false_constant = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+        HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
     builder.AddInstruction(HloInstruction::CreateWhile(
         ShapeUtil::MakeShape(PRED, {}), cond_computation, cond_computation,
         false_constant));
@@ -182,7 +182,7 @@ TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) {
   }
 
   {
-    TF_ASSIGN_OR_ASSERT_OK(bool result, RunFlattenCallGraph(module.get()));
+    TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get()));
     EXPECT_TRUE(result);
     std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
     const CallGraphNode& cond_node = call_graph->GetNode(cond_computation);
@@ -211,7 +211,7 @@ TEST_F(FlattenCallGraphTest, FlattenCalls) {
   module->AddEntryComputation(
       MakeCallingComputation(b_computation, /*callsites=*/2, ".Entry"));
 
-  TF_ASSIGN_OR_ASSERT_OK(bool result, RunFlattenCallGraph(module.get()));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get()));
   EXPECT_TRUE(result);
   std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   EXPECT_EQ(7, module->computations().size());
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
index eb8b93330fbc7b786c66a07f8009b4676358421b..69195c45ed33bbb689a0633471686a03bb6d2654 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@@ -82,13 +82,12 @@ Status GenericTransferManager::TransferLiteralFromDevice(
   }
 
   *literal->mutable_shape() = device_shape;
-  LiteralUtil::Reserve(ShapeUtil::ElementsIn(device_shape), literal);
+  literal->Reserve(ShapeUtil::ElementsIn(device_shape));
   TF_RETURN_IF_ERROR(TransferBufferFromDevice(
       executor, source, /*size=*/ShapeUtil::ByteSizeOf(device_shape),
-      /*destination=*/LiteralUtil::MutableInternalData(literal)));
+      /*destination=*/literal->MutableInternalData()));
   if (!ShapeUtil::Equal(literal_shape, device_shape)) {
-    literal->Swap(
-        LiteralUtil::Relayout(*literal, literal_shape.layout()).get());
+    literal->Swap(literal->Relayout(literal_shape.layout()).get());
   }
   TF_RET_CHECK(ShapeUtil::Equal(literal_shape, literal->shape()));
   return Status::OK();
@@ -152,27 +151,34 @@ Status GenericTransferManager::TransferLiteralToDevice(
         tuple_elements_on_device.data(), destination);
   }
 
-  return TransferBufferToDevice(
-      executor, /*size=*/GetByteSizeRequirement(shape),
-      /*source=*/LiteralUtil::InternalData(literal), destination);
+  return TransferBufferToDevice(executor,
+                                /*size=*/GetByteSizeRequirement(shape),
+                                /*source=*/literal.InternalData(), destination);
 }
 
 Status GenericTransferManager::TransferLiteralToInfeed(
     se::StreamExecutor* executor, const Literal& literal) {
-  return Unimplemented("Infeed is not supported on GPU (b/30467474)");
+  return Unimplemented("Generic transfer to Infeed");
+}
+
+Status GenericTransferManager::TransferBufferToInfeed(
+    perftools::gputools::StreamExecutor* executor, int64 size,
+    const void* source) {
+  return Unimplemented("Generic transfer to Infeed");
 }
 
 Status GenericTransferManager::TransferLiteralFromOutfeed(
     perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
     Literal* literal) {
-  return Unimplemented("Outfeed is not supported on CPU/GPU (b/30467474)");
+  return Unimplemented(
+      "Outfeed is not supported on this platform (b/30467474)");
 }
 
 Status GenericTransferManager::ResetDevices(
     tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-        executors) {
+    /*executors*/) {
   return Unimplemented(
-      "Device reset is not yet supported on CPU and GPU (b/30481585)");
+      "Device reset is not yet supported on this platform (b/30481585)");
 }
 
 int64 GenericTransferManager::GetByteSizeRequirement(const Shape& shape) {
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h
index 2fbdb94f06f1b12763571dc2aa9b0d770f420406..48c061d28e5967f903e9ea665fdaeb02fab7e02e 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h
@@ -54,6 +54,8 @@ class GenericTransferManager : public TransferManager {
 
   Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
                                  const Literal& literal) override;
+  Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
+                                int64 size, const void* source) override;
 
   Status TransferLiteralFromOutfeed(
       perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 869869341179822aa8d9e9675211be92f733077d..cdd7c8187c94231c3889dc9135030268a861b3da 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -68,8 +68,8 @@ cc_library(
     deps = [
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/legacy_flags:stream_assignment_flags",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_reachability",
         "//tensorflow/core:lib",
     ],
 )
@@ -253,7 +253,6 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:convolution_thunk_flags",
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:executable",
@@ -267,7 +266,7 @@ cc_library(
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core/platform/default/build_config:cublas_plugin",
         "//tensorflow/core/platform/default/build_config:cudnn_plugin",
-        "//tensorflow/core/platform/default/build_config:stream_executor_cuda",
+        "//tensorflow/core/platform/default/build_config:stream_executor_cuda",  # build_cleaner: keep
     ],
 )
 
@@ -376,7 +375,6 @@ cc_test(
         ":fusion_merger",
         ":instruction_fusion",
         "//tensorflow/compiler/xla:test_helpers",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
     ],
 )
@@ -418,7 +416,6 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/legacy_flags:gpu_compiler_flags",
         "//tensorflow/compiler/xla/service:algebraic_simplifier",
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:buffer_liveness",
@@ -435,6 +432,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_proto_util",
         "//tensorflow/compiler/xla/service:hlo_subcomputation_unification",
         "//tensorflow/compiler/xla/service:hlo_verifier",
+        "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
         "//tensorflow/compiler/xla/service:transpose_folding",
         "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend",
@@ -500,8 +498,10 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_ordering",
+        "//tensorflow/compiler/xla/service:hlo_reachability",
+        "//tensorflow/compiler/xla/service:hlo_scheduling",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
index 9a0b14eb7332358d0e68e6a40b47c94b88666eb6..20e0d8eb785daa07b3fcc5339efe950aac0dacad 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include <string>
 
-#include "tensorflow/compiler/xla/legacy_flags/convolution_thunk_flags.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -287,10 +286,7 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
     const ConvolutionDescriptor& convolution_descriptor,
     const BufferAllocations& buffer_allocations, se::Stream* stream) {
   // TODO(b/29126320): Try cudnn v5's new auto-tuner when it's rolled out.
-  legacy_flags::ConvolutionThunkFlags* flags =
-      legacy_flags::GetConvolutionThunkFlags();
-  if (flags->xla_gpu_autotune_convolution_algorithm &&
-      best_algorithm_.algorithm() == se::dnn::kDefaultAlgorithm) {
+  if (best_algorithm_.algorithm() == se::dnn::kDefaultAlgorithm) {
     // Auto-tuning either is disabled or only happens in the first run of this
     // function.
     VLOG(2) << "Profiling for best convolution algorithm used for "
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
index aaf72935e61ee8b8da7df410ba3aaed63800cfd9..91d6df299da2686d6d836445d391c4b0eaf4ed00 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
@@ -81,9 +81,8 @@ class ConvolutionThunk : public Thunk {
   ConvolutionThunk(const ConvolutionThunk&) = delete;
   ConvolutionThunk& operator=(const ConvolutionThunk&) = delete;
 
-  // Does the convolution for the thunk on "stream". If the
-  // xla_gpu_autotune_convolution_algorithm is turned on, auto-tuning happens on
-  // the first run of this function.
+  // Does the convolution for the thunk on "stream". Auto-tuning happens on the
+  // first run of this function.
   tensorflow::Status ExecuteOnStream(
       const BufferAllocations& buffer_allocations,
       perftools::gputools::Stream* stream) override;
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index 2987c8913d7cdd93d57bfcca40d6c56ae4dd30f0..c03213ab6d61df56dc3c826aac90271075e6db4a 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -55,7 +55,7 @@ using tensorflow::strings::StrAppend;
 // Returns whether operand is a floating-point literal with the given value.
 bool IsFPLiteralWithValue(const HloInstruction* operand, float value) {
   return operand->opcode() == HloOpcode::kConstant &&
-         LiteralUtil::IsAllFloat(operand->literal(), value);
+         operand->literal().IsAllFloat(value);
 }
 
 GpuElementalIrEmitter::GpuElementalIrEmitter(
@@ -211,6 +211,12 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatUnaryOp(
     case HloOpcode::kLog:
       return EmitLibdeviceMathCall("__nv_log", {operand_value}, {input_type},
                                    output_type);
+    case HloOpcode::kCos:
+      return EmitLibdeviceMathCall("__nv_cos", {operand_value}, {input_type},
+                                   output_type);
+    case HloOpcode::kSin:
+      return EmitLibdeviceMathCall("__nv_sin", {operand_value}, {input_type},
+                                   output_type);
     case HloOpcode::kTanh:
       return EmitLibdeviceMathCall("__nv_tanh", {operand_value}, {input_type},
                                    output_type);
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
index afb78b8300b457ba9384bd66f789d333630b51e4..a9ef204b46facafabcf16d1d38d69c14e6aab497 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
@@ -98,7 +98,13 @@ double CalculateFlopsToBytesRatio(HloInstruction* fusion) {
   // Calculate total bytes transferred in/out.
   double bytes = CalculateBytesReadByFusionInstruction(fusion);
   // Add bytes written to root instructions buffer.
-  bytes += ShapeUtil::ByteSizeOf(fusion->fused_expression_root()->shape());
+  if (fusion->IsMultiOutputFusion()) {
+    for (auto& operand : fusion->fused_expression_root()->operands()) {
+      bytes += ShapeUtil::ByteSizeOf(operand->shape());
+    }
+  } else {
+    bytes += ShapeUtil::ByteSizeOf(fusion->fused_expression_root()->shape());
+  }
   // Calculate flops for all fused instructions. Use a null shape size function
   // because we don't care about bytes accessed by the ops.
   HloCostAnalysis analysis([](const Shape& shape) { return 0; });
@@ -112,8 +118,15 @@ double CalculateFlopsToBytesRatio(HloInstruction* fusion) {
 double GetCurrentBytesTransferred(HloInstruction* fusion) {
   CHECK_EQ(HloOpcode::kFusion, fusion->opcode());
   const double bytes_read = CalculateBytesReadByFusionInstruction(fusion);
-  const double bytes_written =
-      ShapeUtil::ByteSizeOf(fusion->fused_expression_root()->shape());
+  double bytes_written = 0;
+  if (fusion->IsMultiOutputFusion()) {
+    for (auto& operand : fusion->fused_expression_root()->operands()) {
+      bytes_written += ShapeUtil::ByteSizeOf(operand->shape());
+    }
+  } else {
+    bytes_written =
+        ShapeUtil::ByteSizeOf(fusion->fused_expression_root()->shape());
+  }
   // Current bytes transferred (ignoring non 'fusion' user operands) is bytes
   // read and written by 'fusion', plus reads of size 'bytes_written' for each
   // user.
@@ -198,6 +211,12 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
     ++num_fail_not_loop_fusion_;
     return Status::OK();
   }
+
+  // Skip multiple output fusion. It's not yet supported.
+  if (fusion->IsMultiOutputFusion()) {
+    ++num_fail_not_loop_fusion_;
+    return Status::OK();
+  }
   // Skip 'fusion' instruction if we cannot merge into all of its users.
   // Merging into all users enables the removal of 'fusion' from the
   // computation.
@@ -274,12 +293,19 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
 StatusOr<bool> FusionMerger::Run(HloModule* module) {
   bool changed = false;
   VLOG(2) << "FusionMerger for module: " << module->name();
+  std::vector<HloComputation*> computations;
   for (auto& computation : module->computations()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
+    computations.push_back(computation.get());
+  }
+  for (auto& computation : computations) {
     VLOG(1) << "Before running FusionInstructionMerger for computation: "
             << computation->name();
     XLA_VLOG_LINES(3, computation->ToString());
 
-    FusionInstructionMerger fusion_merger(computation.get());
+    FusionInstructionMerger fusion_merger(computation);
     TF_RETURN_IF_ERROR(fusion_merger.Run());
     changed |= fusion_merger.changed();
 
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc
index 8afc32dea97ea00442d2f094c8d6de0b510482fd..242c32936d31d0cb578825cade5f35979077a44e 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc
@@ -59,7 +59,7 @@ class FusionMergerTest : public HloTestBase {
 
     // Create const vector of ones to be used in element-wise computations.
     auto one_vec = builder.AddInstruction(HloInstruction::CreateConstant(
-        LiteralUtil::CreateR1<float>({1.f, 1.f, 1.f, 1.f})));
+        Literal::CreateR1<float>({1.f, 1.f, 1.f, 1.f})));
 
     // Create simple fusable computation for tuple element 0 (wont get merged).
     auto out0 = builder.AddInstruction(HloInstruction::CreateBinary(
@@ -138,7 +138,7 @@ class FusionMergerTest : public HloTestBase {
 
     // Create two sub-computations, both of which are users of 'mul0'.
     auto one_vec = builder.AddInstruction(HloInstruction::CreateConstant(
-        LiteralUtil::CreateR1<float>({1.f, 1.f, 1.f, 1.f})));
+        Literal::CreateR1<float>({1.f, 1.f, 1.f, 1.f})));
 
     // First sub-computation: out0 = Mul(Add(mul0, one_vec), one_vec)
     auto add0 = builder.AddInstruction(HloInstruction::CreateBinary(
@@ -209,7 +209,7 @@ class FusionMergerTest : public HloTestBase {
     // Create two fusable sub-computations which are dependent on shared
     // computation 'reduce_out'.
     auto one_vec = builder.AddInstruction(HloInstruction::CreateConstant(
-        LiteralUtil::CreateR1<float>({1.f, 1.f, 1.f, 1.f})));
+        Literal::CreateR1<float>({1.f, 1.f, 1.f, 1.f})));
 
     // First sub-computation: out0 = Mul(Add(reduce_out, one_vec), one_vec)
     auto add2 = builder.AddInstruction(HloInstruction::CreateBinary(
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 86137a569f9b199782462582ba11683ff9884d7b..031ecbd3aedfb3e531d8e10c9a7b381bb97037e3 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "external/llvm/include/llvm/IR/DiagnosticPrinter.h"
 #include "external/llvm/include/llvm/IR/LLVMContext.h"
 #include "external/llvm/include/llvm/IR/Module.h"
-#include "tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
@@ -57,6 +56,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -82,7 +82,7 @@ const char* kTargetTriple = "nvptx64-nvidia-cuda";
 
 // The data layout of the emitted module. Copied from computeDataLayout in
 // NVPTXTargetMachine.cpp.
-const char* kDataLayout = "e-i64:64-v16:16-v32:32-n16:32:64";
+const char* kDataLayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64";
 
 // Any address of a variable residing in global memory or returned by one of the
 // memory allocation routines from the driver or runtime API is always aligned
@@ -95,11 +95,9 @@ constexpr int64 kMemoryAlignment = 256;
 // called in GpuCompiler's constructor, so can't return an error. But
 // GpuCompiler::Compile will return an error when the wanted libdevice file
 // doesn't exist in the folder this function returns.
-string GetLibdeviceDir() {
+string GetLibdeviceDir(const HloModuleConfig& config) {
   std::vector<string> potential_libdevice_dirs;
-  // Flag xla_cuda_data_dir specified by the user.
-  legacy_flags::GpuCompilerFlags* flags = legacy_flags::GetGpuCompilerFlags();
-  const string datadir = flags->xla_cuda_data_dir;
+  const string datadir = config.debug_options().xla_gpu_cuda_data_dir();
   if (!datadir.empty()) {
     potential_libdevice_dirs.push_back(datadir);
   }
@@ -122,14 +120,16 @@ string GetLibdeviceDir() {
 
 // Runs optimization passes on the given HLO module.
 tensorflow::Status OptimizeHloModule(HloModule* hlo_module,
-                                     const Compiler::HloDumper& dump_hlo,
                                      const se::DeviceDescription& device_desc) {
   {
-    HloPassPipeline pipeline("optimization", dump_hlo);
+    HloPassPipeline pipeline("optimization");
     pipeline.AddInvariantChecker<HloVerifier>();
+    ReducePrecisionInsertion::AddPasses(
+        &pipeline, hlo_module->config().debug_options(),
+        HloReducePrecisionOptions::BEFORE_OP_FUSION);
     {
-      auto& pass = pipeline.AddPass<HloPassFix<HloPassPipeline>>(
-          "simplification", dump_hlo);
+      auto& pass =
+          pipeline.AddPass<HloPassFix<HloPassPipeline>>("simplification");
       pass.AddPass<AlgebraicSimplifier>(
           /*is_layout_sensitive=*/false,
           [](const Shape&, const Shape&) { return false; });
@@ -149,24 +149,37 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module,
     TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
   }
   {
-    HloPassFix<HloPassPipeline> fusion("fusion", dump_hlo);
+    HloPassFix<HloPassPipeline> fusion("fusion");
     fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/false);
     fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/true);
     fusion.AddPass<FusionMerger>();
-    return fusion.Run(hlo_module).status();
+    TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status());
+
+    HloPassPipeline reduce_pipeline("reduce-precision");
+    ReducePrecisionInsertion::AddPasses(
+        &reduce_pipeline, hlo_module->config().debug_options(),
+        HloReducePrecisionOptions::AFTER_OP_FUSION);
+    StatusOr<bool> reduce_result = reduce_pipeline.Run(hlo_module);
+    TF_RETURN_IF_ERROR(reduce_result.status());
+
+    if (reduce_result.ValueOrDie()) {
+      // Do another fusion pass, with the expectation that we may be able to
+      // fuse the new ReducePrecision operations.
+      TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status());
+    }
   }
+  return tensorflow::Status::OK();
 }
 
 // Modifies the given HLO module so that it will be accepted by IrEmitter.
 // Unlike optimization passes, the passes are necessary for correctness.
-tensorflow::Status PrepareHloModuleForIrEmitting(
-    const Compiler::HloDumper& dump_hlo, HloModule* hlo_module) {
+tensorflow::Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) {
   // In some cases, we have to place the result of an instruction in a temporary
   // buffer. For instance, the buffer that holds an external parameter is
   // assumed immutable at this point, and should not be reused for output
   // (b/27180329). Therefore, in that case, we set the output to be a copy of
   // the parameter.
-  HloPassPipeline pipeline("GPU-ir-emit-prepare", dump_hlo);
+  HloPassPipeline pipeline("GPU-ir-emit-prepare");
   pipeline.AddInvariantChecker<HloVerifier>();
   pipeline.AddPass<PadInsertion>();
   pipeline.AddPass<GpuLayoutAssignment>(
@@ -230,17 +243,15 @@ void DumpPtxasInfo(const string& ptx) {
 }  // namespace
 
 GpuCompiler::GpuCompiler()
-    : libdevice_dir_(GetLibdeviceDir()),
-      pointer_size_(llvm::DataLayout(kDataLayout).getPointerSize()) {}
+    : pointer_size_(llvm::DataLayout(kDataLayout).getPointerSize()) {}
 
 StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
-    std::unique_ptr<HloModule> module, HloDumper dump_hlo,
-    se::StreamExecutor* stream_exec) {
+    std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec) {
   TF_RET_CHECK(stream_exec != nullptr);
 
-  TF_RETURN_IF_ERROR(OptimizeHloModule(module.get(), dump_hlo,
-                                       stream_exec->GetDeviceDescription()));
-  TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(dump_hlo, module.get()));
+  TF_RETURN_IF_ERROR(
+      OptimizeHloModule(module.get(), stream_exec->GetDeviceDescription()));
+  TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(module.get()));
 
   llvm::LLVMContext llvm_context;
   std::string buffer;
@@ -271,13 +282,16 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<BufferAssignment> buffer_assignment,
       BufferAssigner::Run(module.get(), hlo_schedule->ConsumeHloOrdering(),
-                          BufferSizeBytesFunction(), kMemoryAlignment));
+                          BufferSizeBytesFunction(), [](LogicalBuffer::Color) {
+                            return kMemoryAlignment;
+                          }));
 
-  legacy_flags::GpuCompilerFlags* flags = legacy_flags::GetGpuCompilerFlags();
-  if (!flags->xla_gpu_dump_debug_json_to.empty()) {
+  const string dump_debug_json_to =
+      module->config().debug_options().xla_dump_debug_json_to();
+  if (!dump_debug_json_to.empty()) {
     HloProto proto = MakeHloProto(*module, *buffer_assignment);
     TF_RETURN_IF_ERROR(protobuf_util::DumpJsonToDirectory(
-        proto, flags->xla_gpu_dump_debug_json_to, module->name()));
+        proto, dump_debug_json_to, module->name()));
   }
 
   IrEmitterContext ir_emitter_context(module.get(), buffer_assignment.get(),
@@ -292,7 +306,9 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
       entry_computation->root_instruction()->Accept(&ir_emitter));
 
   string ir_module_string_before_opt;
-  if (VLOG_IS_ON(2) || flags->xla_gpu_embed_ir) {
+  const bool embed_ir_in_executable =
+      module->config().debug_options().xla_embed_ir_in_executable();
+  if (VLOG_IS_ON(2) || embed_ir_in_executable) {
     ir_module_string_before_opt = llvm_ir::DumpModuleToString(llvm_module);
     VLOG(2) << "LLVM module before optimizations:";
     XLA_VLOG_LINES(2, ir_module_string_before_opt);
@@ -313,6 +329,10 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
     cc_major = 2;
     cc_minor = 0;
   }
+  if (libdevice_dir_.empty()) {
+    // Compute libdevice_dir_ just once and cache it in this member.
+    libdevice_dir_ = GetLibdeviceDir(module->config());
+  }
   TF_ASSIGN_OR_RETURN(*ptx, CompileToPtx(&llvm_module, {cc_major, cc_minor},
                                          module->config(), libdevice_dir_));
 
@@ -333,7 +353,7 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   auto* gpu_executable =
       new GpuExecutable(*ptx, std::move(thunk_schedule), std::move(module),
                         std::move(buffer_assignment), ShapeSizeBytesFunction());
-  if (flags->xla_gpu_embed_ir) {
+  if (embed_ir_in_executable) {
     DCHECK_NE("", ir_module_string_before_opt);
     gpu_executable->set_ir_module_string(ir_module_string_before_opt);
   }
@@ -341,16 +361,15 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
 }
 
 StatusOr<std::vector<std::unique_ptr<Executable>>> GpuCompiler::Compile(
-    std::vector<std::unique_ptr<HloModule>> modules, HloDumper dump_hlos,
+    std::vector<std::unique_ptr<HloModule>> modules,
     std::vector<se::StreamExecutor*> stream_execs) {
   return Unimplemented(
       "Compilation of multiple HLO modules is not yet supported on GPU.");
 }
 
 StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-GpuCompiler::CompileAheadOfTime(
-    std::vector<std::unique_ptr<HloModule>> module,
-    HloDumper dump_hlo, const AotCompilationOptions& options) {
+GpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> module,
+                                const AotCompilationOptions& options) {
   return Unimplemented("not yet implemented: GpuCompiler::CompileAheadOfTime");
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
index da52f5ab1f8e5bf8c2fa3c33948ccf8a0f647f7a..b87555b931f1d73de8bcaf84aea80305c9d585bf 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
@@ -41,17 +41,16 @@ class GpuCompiler : public Compiler {
   ~GpuCompiler() override {}
 
   StatusOr<std::unique_ptr<Executable>> Compile(
-      std::unique_ptr<HloModule> module, HloDumper dump_hlo,
+      std::unique_ptr<HloModule> module,
       perftools::gputools::StreamExecutor* stream_exec) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
-      std::vector<std::unique_ptr<HloModule>> modules, HloDumper dump_hlo,
+      std::vector<std::unique_ptr<HloModule>> modules,
       std::vector<perftools::gputools::StreamExecutor*> stream_exec) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(
-      std::vector<std::unique_ptr<HloModule>> module,
-      HloDumper dump_hlo, AotCompilationOptions const& options) override;
+  CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> module,
+                     AotCompilationOptions const& options) override;
 
   perftools::gputools::Platform::Id PlatformId() const override;
 
@@ -65,7 +64,7 @@ class GpuCompiler : public Compiler {
 
  private:
   // The parent directory of libdevice IR libraries.
-  const string libdevice_dir_;
+  string libdevice_dir_;
 
   // The list of PTX strings generated by this GpuCompiler. We let GpuCompiler
   // to own them because they need to be alive across the life span of the
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index e1a55118fc7a962cbc77b8214f01451e6f155ca0..8558e150e06e31fc36c60cf8564f0d22cba020e8 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -80,6 +80,11 @@ class GpuExecutable : public Executable {
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
           arguments) override;
 
+  const Status EqualOrFail(const Executable& executable) {
+    // TODO(b/62952745) Implement equality test on GPU executable.
+    return Unimplemented("Equality test on GPU executable is not implemented.");
+  }
+
  private:
   // If `block_host_until_done` is false, execution will not block the host
   // until the kernels have completed. This is used as an optimization for
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc b/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc
index d16a1d4ee5be00e685fc181f19c1a3cfda253f6a..81e905a06665436875b17991a8635e7bb31600de 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/hlo_schedule.h"
 
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/hlo_reachability.h"
+#include "tensorflow/compiler/xla/service/hlo_scheduling.h"
 #include "tensorflow/compiler/xla/types.h"
 
 namespace xla {
@@ -67,38 +69,38 @@ GpuHloOrdering::GpuHloOrdering(
   // waits for its operands before executing.
   //
   // The predecessor map is built incrementally, in thunk launch order. We
-  // record the instructions already visited per stream in
-  // 'instructions_per_stream'. This lets us quickly determine the same-stream
-  // predecessors of each instruction. To capture cross-stream dependency edges,
-  // we use the predecessor map to insert each operand as well as its transitive
-  // closure of dependencies.
-
-  // Compute the set of all instructions we will want to set reachability on
-  auto predecessor_map = MakeUnique<HloComputation::ReachabilityMap>(
+  // record the most-recently seen instructions per stream in
+  // 'last_instruction_per_stream'. This lets us quickly determine the
+  // same-stream predecessors of each instruction.
+
+  // Compute the set of all instructions we will want to set reachability on.
+  auto predecessor_map = MakeUnique<HloReachabilityMap>(
       module->entry_computation()->MakeInstructionPostOrder());
 
-  std::vector<std::vector<const HloInstruction*>> instructions_per_stream(
-      stream_assignment.StreamCount());
+  // The most recently visited instruction per stream.
+  std::vector<const HloInstruction*> last_instruction_per_stream(
+      stream_assignment.StreamCount(), nullptr);
 
   for (const HloInstruction* hlo : thunk_launch_order) {
+    predecessor_map->SetReachable(hlo, hlo);
     if (stream_assignment.HasStreamAssigned(*hlo)) {
+      // Gather all instruction which are immediate predecessors of 'hlo' in the
+      // reachability graph.
+      std::vector<const HloInstruction*> immediate_preds;
+      immediate_preds.insert(immediate_preds.end(), hlo->operands().begin(),
+                             hlo->operands().end());
+      immediate_preds.insert(immediate_preds.end(),
+                             hlo->control_predecessors().begin(),
+                             hlo->control_predecessors().end());
+
       // All ops already queued on the same instruction stream, and their
-      // transitive predecessors, are predecessors. Since the relation is
-      // transitive, we just set the transitive closure of the previous op.
+      // transitive predecessors, are predecessors.
       const int stream_no = stream_assignment.StreamNumberForHlo(*hlo);
-      std::vector<const HloInstruction*>* instructions =
-          &instructions_per_stream[stream_no];
-      if (!instructions->empty()) {
-        const HloInstruction* back = instructions->back();
-        predecessor_map->SetReachableAndTransitiveClosure(hlo, back);
-      }
-      // All operands and their transitive predecessors are predecessors. Each
-      // operand must already exist in 'predecessor_map', since we're iterating
-      // in thunk launch order.
-      for (const HloInstruction* operand : hlo->operands()) {
-        predecessor_map->SetReachableAndTransitiveClosure(hlo, operand);
+      if (last_instruction_per_stream[stream_no] != nullptr) {
+        immediate_preds.push_back(last_instruction_per_stream[stream_no]);
       }
-      instructions->push_back(hlo);
+      predecessor_map->SetReachabilityToUnion(immediate_preds, hlo);
+      last_instruction_per_stream[stream_no] = hlo;
     } else {
       // Only parameters and constants don't have an assigned stream, since they
       // don't require a thunk. These ops don't have any predecessors.
@@ -107,21 +109,21 @@ GpuHloOrdering::GpuHloOrdering(
       CHECK_EQ(hlo->operand_count(), 0);
     }
   }
-  strict_predecessors_.emplace(module->entry_computation(),
-                               std::move(predecessor_map));
+  predecessors_.emplace(module->entry_computation(),
+                        std::move(predecessor_map));
 
-  // The ordering of instructions in subcomputations is based solely on data
-  // dependencies. I.e. the strict predecessors of each subcomputation
-  // instruction is its transitive operands.
+  // The ordering of instructions in subcomputations is based solely on control
+  // and data dependencies.
   //
   // TODO(toddw): Each subcomputation is actually emitted as a function in DFS
   // postorder, so we can do better and establish the total order here. We don't
   // do that yet since it's hard to ensure that the order here is the order used
   // by IrEmitterNested. And mismatched ordering bugs would be hard to find.
   for (auto& computation : module->computations()) {
-    if (computation.get() != module->entry_computation()) {
-      strict_predecessors_.emplace(computation.get(),
-                                   computation->ComputeTransitiveOperands());
+    if (computation.get() != module->entry_computation() &&
+        !computation->IsFusionComputation()) {
+      predecessors_.emplace(computation.get(),
+                            computation->ComputeReachability());
     }
   }
 }
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_schedule.h b/tensorflow/compiler/xla/service/gpu/hlo_schedule.h
index 773973010a46bb4a2af1f536c43201ba8c0be5d8..1ce7a48ac8fcbbad0b3697845681582fe806b322 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_schedule.h
+++ b/tensorflow/compiler/xla/service/gpu/hlo_schedule.h
@@ -19,9 +19,9 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
 #include "tensorflow/compiler/xla/statusor.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
index 1a61eec353740202065c1ce98e8c91274facfd19..a04214930dfc95b82ca4c702d12648381a4c8135 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
@@ -86,23 +86,35 @@ void HloToIrBindings::EmitBasePointersForHlos(
       continue;
     }
 
-    // A non-IO HLO with a buffer is bound to
-    // (1) an alloca if it is thread-local, or
-    // (2) an internal pointer in temp_buffer_base according to its offset.
-    const BufferAllocation::Slice slice =
-        buffer_assignment_->GetUniqueTopLevelSlice(non_io_hlo)
-            .ConsumeValueOrDie();
-    if (slice.allocation()->is_thread_local()) {
-      llvm::Type* pointee_type =
-          llvm_ir::ShapeToIrType(non_io_hlo->shape(), ir_builder_);
-      BindHloToIrValue(*non_io_hlo, ir_builder_->CreateAlloca(pointee_type));
-    } else {
-      const int64 offset = slice.offset();
-      CHECK_NE(nullptr, temp_buffer_base_);
-      BindHloToIrValue(*non_io_hlo,
-                       ir_builder_->CreateInBoundsGEP(
-                           temp_buffer_base_, ir_builder_->getInt64(offset)));
-    }
+    ShapeUtil::ForEachSubshape(
+        non_io_hlo->shape(),
+        [&](const Shape& /*subshape*/, const ShapeIndex& index) {
+          // A non-IO HLO with a buffer is bound to
+          // (1) an alloca if it is thread-local, or
+          // (2) an internal pointer in temp_buffer_base according to its
+          // offset.
+          auto slice_result =
+              buffer_assignment_->GetUniqueSlice(non_io_hlo, index);
+          if (!slice_result.ok()) {
+            return;
+          }
+          const BufferAllocation::Slice slice =
+              slice_result.ConsumeValueOrDie();
+          if (slice.allocation()->is_thread_local()) {
+            llvm::Type* pointee_type =
+                llvm_ir::ShapeToIrType(non_io_hlo->shape(), ir_builder_);
+            BindHloToIrValue(*non_io_hlo,
+                             ir_builder_->CreateAlloca(pointee_type), index);
+          } else {
+            const int64 offset = slice.offset();
+            CHECK_NE(nullptr, temp_buffer_base_);
+            BindHloToIrValue(
+                *non_io_hlo,
+                ir_builder_->CreateInBoundsGEP(temp_buffer_base_,
+                                               ir_builder_->getInt64(offset)),
+                index);
+          }
+        });
   }
 }
 
@@ -112,7 +124,7 @@ llvm::Value* HloToIrBindings::EmitGetTupleElement(const HloInstruction* gte,
   if (gte->operand(0)->opcode() != HloOpcode::kGetTupleElement) {
     return llvm_ir::EmitGetTupleElement(
         gte->shape(), gte->tuple_index(), /*alignment=*/1,
-        GetTypedIrValue(*gte->operand(0), base_ptr), ir_builder_);
+        GetTypedIrValue(*gte->operand(0), {}, base_ptr), ir_builder_);
   }
   return llvm_ir::EmitGetTupleElement(
       gte->shape(), gte->tuple_index(), /*alignment=*/1,
@@ -120,8 +132,10 @@ llvm::Value* HloToIrBindings::EmitGetTupleElement(const HloInstruction* gte,
 }
 
 llvm::Value* HloToIrBindings::GetTypedIrValue(const HloInstruction& hlo,
+                                              const ShapeIndex& shape_index,
                                               llvm::Value* ir_value) {
-  llvm::Type* pointee_type = llvm_ir::ShapeToIrType(hlo.shape(), ir_builder_);
+  llvm::Type* pointee_type = llvm_ir::ShapeToIrType(
+      ShapeUtil::GetSubshape(hlo.shape(), shape_index), ir_builder_);
   llvm::Type* dest_type = pointee_type->getPointerTo();
 
   llvm::Value* typed_ir_value;
@@ -139,13 +153,24 @@ llvm::Value* HloToIrBindings::GetTypedIrValue(const HloInstruction& hlo,
 }
 
 void HloToIrBindings::BindHloToIrValue(const HloInstruction& hlo,
-                                       llvm::Value* ir_value) {
+                                       llvm::Value* ir_value,
+                                       const ShapeIndex& shape_index) {
   VLOG(2) << "Binding " << hlo.ToString();
-  InsertOrDie(&base_ptrs_, &hlo, GetTypedIrValue(hlo, ir_value));
+
+  const Shape& hlo_shape = hlo.shape();
+  llvm::Value* typed_ir_value = GetTypedIrValue(hlo, shape_index, ir_value);
+
+  if (!BoundToIrValue(hlo)) {
+    // Set the root of ShapeTree first before assigning the element ir value.
+    InsertOrDie(&base_ptrs_, &hlo, ShapeTree<llvm::Value*>(hlo_shape, nullptr));
+  }
+  *(base_ptrs_[&hlo].mutable_element(shape_index)) = typed_ir_value;
 }
 
-llvm_ir::IrArray HloToIrBindings::GetIrArray(const HloInstruction& hlo) {
-  llvm_ir::IrArray ir_array(GetBasePointer(hlo), hlo.shape());
+llvm_ir::IrArray HloToIrBindings::GetIrArray(const HloInstruction& hlo,
+                                             const ShapeIndex& shape_index) {
+  llvm_ir::IrArray ir_array(GetBasePointer(hlo, shape_index),
+                            ShapeUtil::GetSubshape(hlo.shape(), shape_index));
   alias_analysis_.AddAliasingInformationToIrArray(hlo, &ir_array);
   return ir_array;
 }
@@ -154,7 +179,7 @@ void HloToIrBindings::UnbindAllLocalIrValues() {
   std::vector<const HloInstruction*> hlos_to_unbind;
   for (auto& key_value : base_ptrs_) {
     if (!llvm::isa<llvm::GlobalVariable>(
-            key_value.second->stripPointerCasts())) {
+            (key_value.second.element({}))->stripPointerCasts())) {
       hlos_to_unbind.push_back(key_value.first);
     }
   }
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
index 5be2150801fbd2a3a624d9c87513d5cee7288bbd..2c59886e9ae410b6a6a1dd9973c75c061c8db808 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
@@ -48,7 +48,8 @@ class HloToIrBindings {
       tensorflow::gtl::ArraySlice<const HloInstruction*> non_io_hlos);
 
   // Rebinds the given HLO to the LLVM IR value that represent its address.
-  void BindHloToIrValue(const HloInstruction& hlo, llvm::Value* ir_value);
+  void BindHloToIrValue(const HloInstruction& hlo, llvm::Value* ir_value,
+                        const ShapeIndex& shape_index = {});
 
   // Unbinds all IR values that's defined in an LLVM function, e.g., function
   // arguments and stack variables. Global variables will be kept in bindings_.
@@ -64,15 +65,18 @@ class HloToIrBindings {
 
   llvm::Value* GetTempBufferBase() const { return temp_buffer_base_; }
 
-  // A helper method that returns the base pointer of the IrArray for "inst".
-  llvm::Value* GetBasePointer(const HloInstruction& hlo) const {
+  // A helper method that returns the base pointer of the IrArray containing the
+  // output of "inst".at the given ShapeIndex.
+  llvm::Value* GetBasePointer(const HloInstruction& hlo,
+                              const ShapeIndex& shape_index = {}) const {
     auto it = base_ptrs_.find(&hlo);
     CHECK(it != base_ptrs_.end());
-    return it->second;
+    return it->second.element(shape_index);
   }
 
   // Return the underlying IrArray of the output of the given instruction.
-  llvm_ir::IrArray GetIrArray(const HloInstruction& hlo);
+  llvm_ir::IrArray GetIrArray(const HloInstruction& hlo,
+                              const ShapeIndex& shape_index = {});
 
  private:
   // Emits IR to resolve (possibly) recursive GetTupleElement instructions.
@@ -81,6 +85,7 @@ class HloToIrBindings {
 
   // Returns an llvm typed ir representation of 'ir_value' based on 'hlo' shape.
   llvm::Value* GetTypedIrValue(const HloInstruction& hlo,
+                               const ShapeIndex& shape_index,
                                llvm::Value* ir_value);
 
   const BufferAssignment* buffer_assignment_;
@@ -90,7 +95,10 @@ class HloToIrBindings {
   llvm::IRBuilder<>* ir_builder_;
 
   // Stores the underlying llvm::IrArray for each HloInstruction.
-  std::unordered_map<const HloInstruction*, llvm::Value*> base_ptrs_;
+  // For an instruction that generates multiple outputs, the root will be a
+  // tuple shape. The IrArray for each element output is stored in the subnode
+  // in the ShapeTree.
+  std::unordered_map<const HloInstruction*, ShapeTree<llvm::Value*>> base_ptrs_;
 
   // The address of the memory block that contains all temporary buffers.
   llvm::Value* temp_buffer_base_;
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc
index 120a3f7fba2101ce64da1e8135fb5f862e603fe4..ee5b447c9cd0b1fde4d3a0943d5d4cb8cc5b3376 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/gpu/infeed_manager.h"
+
+#include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace se = ::perftools::gputools;
@@ -22,23 +24,23 @@ namespace se = ::perftools::gputools;
 namespace xla {
 namespace gpu {
 
-InfeedManager::InfeedManager()
-    : current_buffer_(nullptr),
-      host_to_device_executor_(nullptr) {}
+InfeedManager::InfeedManager() : host_to_device_executor_(nullptr) {}
 
 void InfeedManager::Reset() {
   tensorflow::mutex_lock l(mu_);
-  CHECK(!current_buffer_);
+  CHECK(dequeued_buffer_.empty());
   for (auto buffer : enqueued_buffer_) {
     buffer->Done();
   }
   enqueued_buffer_.clear();
 }
 
-void InfeedManager::EnqueueBuffer(InfeedBuffer* buffer) {
+void InfeedManager::EnqueueBuffers(const std::vector<InfeedBuffer*>& buffers) {
   tensorflow::mutex_lock l(mu_);
   bool was_empty = enqueued_buffer_.empty();
-  enqueued_buffer_.push_back(buffer);
+  for (gpu::InfeedBuffer* b : buffers) {
+    enqueued_buffer_.push_back(b);
+  }
   if (was_empty) {
     // This has the potential to suffer from the notified thread
     // immediately trying and failing to acquire mu_, but seems
@@ -53,18 +55,23 @@ InfeedBuffer* InfeedManager::BlockingDequeueBuffer() {
   while (enqueued_buffer_.empty()) {
     cv_.wait(l);
   }
-  CHECK(!current_buffer_);
-  current_buffer_ = enqueued_buffer_.front();
+  InfeedBuffer* current_buffer = enqueued_buffer_.front();
   enqueued_buffer_.pop_front();
-  return current_buffer_;
+  dequeued_buffer_.insert(current_buffer);
+  return current_buffer;
 }
 
-void InfeedManager::ReleaseCurrentBuffer(se::DeviceMemoryBase* device_memory) {
-  tensorflow::mutex_lock l(mu_);
-  CHECK(current_buffer_);
-  CHECK(device_memory->IsSameAs(*current_buffer_->device_memory()));
-  current_buffer_->Done();
-  current_buffer_ = nullptr;
+void InfeedManager::ReleaseBuffers(const std::vector<InfeedBuffer*>& buffers) {
+  {
+    tensorflow::mutex_lock l(mu_);
+    for (gpu::InfeedBuffer* b : buffers) {
+      CHECK(ContainsKey(dequeued_buffer_, b));
+      dequeued_buffer_.erase(b);
+    }
+  }
+  for (gpu::InfeedBuffer* b : buffers) {
+    b->Done();
+  }
 }
 
 se::Stream* InfeedManager::GetStream(se::StreamExecutor* executor) {
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.h b/tensorflow/compiler/xla/service/gpu/infeed_manager.h
index 50d0ce340f3d85c2c46f111dba3e316ff0f4df1a..73d5a5ce35497f156a181371bfb97fc37a8eb09e 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_manager.h
+++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <deque>
 
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
@@ -81,25 +82,19 @@ class InfeedManager {
   // condition is to call Reset when no computation is taking place.
   void Reset();
 
-  // Adds buffer to the infeed queue. buffer->Done will be called when
-  // the buffer will no longer be accessed by the InfeedManager,
-  // either as a result of a call to Reset or because the runtime has
-  // dequeued and used the buffer.
-  void EnqueueBuffer(InfeedBuffer* buffer);
+  // Adds a set of buffers to the infeed queue atomically. buffer->Done
+  // will be called when the buffer will no longer be accessed by the
+  // InfeedManager, either as a result of a call to Reset or because the
+  // runtime has dequeued and used the buffer.
+  void EnqueueBuffers(const std::vector<InfeedBuffer*>& buffers);
 
   // Blocks until the infeed queue is non-empty, then returns the
-  // buffer at the head of the queue. Sets the current buffer to be
-  // the returned buffer. It is an error to call BlockingDequeueBuffer
-  // if there is an unreleased current buffer, i.e.,
-  // ReleaseCurrentBuffer must be called between calls to
-  // BlockingDequeueBuffer.
+  // buffer at the head of the queue. Adds the current buffer to the
+  // to-be released set.
   InfeedBuffer* BlockingDequeueBuffer();
 
-  // Releases the current buffer, which is the last buffer returned by
-  // BlockingDequeueBuffer and not yet released. device_memory must
-  // match that of the current buffer.
-  void ReleaseCurrentBuffer(
-      perftools::gputools::DeviceMemoryBase* device_memory);
+  // Releases a set of buffers from the to-be released set.
+  void ReleaseBuffers(const std::vector<InfeedBuffer*>& buffers);
 
   // Returns a cached stream associated with an executor. Allocates a
   // new stream on the first invocation. On subsequent invocations, if
@@ -109,18 +104,25 @@ class InfeedManager {
       perftools::gputools::StreamExecutor* executor);
 
  private:
+  // TODO(b/30467474): Revisit if this mutex becomes a point of
+  // contention.
   tensorflow::mutex mu_;
+
   // Condition variable that is signaled every time a buffer is
   // enqueued to an empty queue.
   tensorflow::condition_variable cv_;
+
   // InfeedBuffer* queue contents are not owned, but buffer->Done must
   // be called when the buffer is no longer needed by the runtime.
   std::deque<InfeedBuffer*> enqueued_buffer_;
-  // If non-NULL, the buffer that is currently being processed by the
+
+  // Buffers that are dequeued and currently being processed by the
   // runtime. Not owned.
-  InfeedBuffer* current_buffer_;
+  tensorflow::gtl::FlatSet<const InfeedBuffer*> dequeued_buffer_;
+
   // Cached host to device stream for queuing infeed data.
   std::unique_ptr<perftools::gputools::Stream> host_to_device_stream_;
+
   // Executor that the host_to_device_stream belongs to. Not owned.
   perftools::gputools::StreamExecutor* host_to_device_executor_;
 };
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
index 6f144c7273e69beedeb143c395ce37414ce99139..e33e904692ca5ad41e17d2e165dbb40b6bd4aa33 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
@@ -21,31 +21,59 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-InfeedThunk::InfeedThunk(const BufferAllocation::Slice& destination_buffer,
-                         uint64 mem_size, const HloInstruction* hlo_instruction)
+InfeedThunk::InfeedThunk(
+    tensorflow::gtl::ArraySlice<BufferAllocation::Slice> tuple_element_buffers,
+    const BufferAllocation::Slice& destination_buffer,
+    const HloInstruction* hlo_instruction)
     : Thunk(Kind::kInfeed, hlo_instruction),
-      destination_buffer_(destination_buffer),
-      mem_size_(mem_size) {}
+      tuple_element_buffers_(tuple_element_buffers.begin(),
+                             tuple_element_buffers.end()),
+      destination_buffer_(destination_buffer) {}
 
 tensorflow::Status InfeedThunk::ExecuteOnStream(
     const BufferAllocations& buffer_allocations,
     perftools::gputools::Stream* stream) {
   VLOG(2) << "Infeeding to GPU ";
-  perftools::gputools::DeviceMemoryBase destination_data =
+
+  perftools::gputools::DeviceMemoryBase destination_address =
       buffer_allocations.GetDeviceAddress(destination_buffer_);
 
   InfeedManager* infeed_manager = GetOrCreateInfeedManager();
-  InfeedBuffer* buffer = infeed_manager->BlockingDequeueBuffer();
-  CHECK_EQ(buffer->length(), mem_size_);
-  stream->ThenMemcpy(&destination_data, *(buffer->device_memory()),
-                     buffer->length());
+  std::vector<InfeedBuffer*> infeed_buffers;
+  if (ShapeUtil::IsTuple(hlo_instruction()->shape())) {
+    CHECK(!ShapeUtil::IsNestedTuple(hlo_instruction()->shape()));
+    // Transfer the tuple elements first.
+    std::vector<void*> tuple_element_addresses;
+    for (BufferAllocation::Slice tuple_element_buffer :
+         tuple_element_buffers_) {
+      perftools::gputools::DeviceMemoryBase tuple_element_address =
+          buffer_allocations.GetDeviceAddress(tuple_element_buffer);
+
+      InfeedBuffer* buffer = infeed_manager->BlockingDequeueBuffer();
+      infeed_buffers.push_back(buffer);
+      stream->ThenMemcpy(&tuple_element_address, *(buffer->device_memory()),
+                         buffer->length());
+      tuple_element_addresses.push_back(tuple_element_address.opaque());
+    }
+    // Transfer the tuple outer buffer.
+    auto host_size = tuple_element_addresses.size() * sizeof(void*);
+    stream->ThenMemcpy(&destination_address, tuple_element_addresses.data(),
+                       host_size);
+  } else {
+    InfeedBuffer* buffer = infeed_manager->BlockingDequeueBuffer();
+    infeed_buffers.push_back(buffer);
+    stream->ThenMemcpy(&destination_address, *(buffer->device_memory()),
+                       buffer->length());
+  }
+
   if (!stream->BlockHostUntilDone()) {
     return InternalError("Failed to complete data transfer on stream %p",
                          stream);
   }
-  // Since Infeeds are totally ordered, no other infeed should sneak
-  // in and we should be able to release the same buffer we dequeued.
-  infeed_manager->ReleaseCurrentBuffer(buffer->device_memory());
+
+  infeed_manager->ReleaseBuffers(infeed_buffers);
+
+  VLOG(2) << "Infeeding to GPU complete";
   return tensorflow::Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
index 0a808186c212660e4be3905456d29cb2fed0f511..371d71f9dbdd21cb5f36cc3108c8f398a4a91c29 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
@@ -35,8 +35,10 @@ class InfeedThunk : public Thunk {
   // infeed queue to the device buffer
   // `destination_buffer`. `mem_size` is the size of the data in
   // bytes.
-  InfeedThunk(const BufferAllocation::Slice& destination_buffer,
-              uint64 mem_size, const HloInstruction* hlo_instruction);
+  InfeedThunk(tensorflow::gtl::ArraySlice<BufferAllocation::Slice>
+                  tuple_element_buffers,
+              const BufferAllocation::Slice& destination_buffer,
+              const HloInstruction* hlo_instruction);
 
   InfeedThunk(const InfeedThunk&) = delete;
   InfeedThunk& operator=(const InfeedThunk&) = delete;
@@ -46,8 +48,8 @@ class InfeedThunk : public Thunk {
       perftools::gputools::Stream* stream) override;
 
  private:
+  const std::vector<BufferAllocation::Slice> tuple_element_buffers_;
   const BufferAllocation::Slice destination_buffer_;
-  const uint64 mem_size_;
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 7d5b6ed5cfabcd429cc25f63b8fa14e2e20e387f..80f91e5daed30567ff66476ff9066dc36b01ee3c 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -202,18 +202,22 @@ bool IrEmitter::MaybeEmitSpecialAtomicOperation(
   // NVPTX supports atomicMax and atomicMin on only integer types.
   if (root_opcode == HloOpcode::kMaximum &&
       primitive_util::IsIntegralType(element_type)) {
-    // min(integral, integral)
-    ir_builder_.CreateAtomicRMW(llvm::AtomicRMWInst::Max, output_address,
-                                source,
+    // max(integral, integral)
+    auto opcode = primitive_util::IsSignedIntegralType(element_type)
+                      ? llvm::AtomicRMWInst::Max
+                      : llvm::AtomicRMWInst::UMax;
+    ir_builder_.CreateAtomicRMW(opcode, output_address, source,
                                 llvm::AtomicOrdering::SequentiallyConsistent);
     return true;
   }
 
   if (root_opcode == HloOpcode::kMinimum &&
       primitive_util::IsIntegralType(element_type)) {
-    // max(integral, integral)
-    ir_builder_.CreateAtomicRMW(llvm::AtomicRMWInst::Min, output_address,
-                                source,
+    // min(integral, integral)
+    auto opcode = primitive_util::IsSignedIntegralType(element_type)
+                      ? llvm::AtomicRMWInst::Min
+                      : llvm::AtomicRMWInst::UMin;
+    ir_builder_.CreateAtomicRMW(opcode, output_address, source,
                                 llvm::AtomicOrdering::SequentiallyConsistent);
     return true;
   }
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
index 607a366ac67d98d11c5141b390420aef00539dcd..718e27101e0dc2bfb1338f17979d452b08a2a376 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
@@ -118,8 +118,9 @@ class IrEmitter : public DfsHloVisitorWithDefault {
                      IrEmitterContext* ir_emitter_context, bool is_nested);
 
   // A convenient helper for calling HloToIrBindings::GetIrArray.
-  llvm_ir::IrArray GetIrArray(const HloInstruction& inst) {
-    return bindings_.GetIrArray(inst);
+  llvm_ir::IrArray GetIrArray(const HloInstruction& inst,
+                              const ShapeIndex& shape_index = {}) {
+    return bindings_.GetIrArray(inst, shape_index);
   }
   // A convenient helper for calling HloToIrBindings::GetBasePointer.
   llvm::Value* GetBasePointer(const HloInstruction& inst) const {
@@ -231,7 +232,7 @@ class IrEmitterUnnested : public IrEmitter {
 
   // IrEmitterUnnested handles the following instructions differently from
   // IrEmitter.
-  Status HandleCopy(HloInstruction* copy, HloInstruction* operand) override;
+  Status HandleCopy(HloInstruction* copy) override;
   Status HandleConvolution(HloInstruction* convolution, HloInstruction* lhs,
                            HloInstruction* rhs, const Window& window) override;
   Status HandleDot(HloInstruction* dot, HloInstruction* lhs_instruction,
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 5fa2bfdd7e4301144054e0d4f41d1161e798176b..484de369675fb0188754d4bc2d187cbc6c92259b 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -722,8 +722,7 @@ int64 EmitTranspose021Tiled(llvm_ir::IrArray input, llvm_ir::IrArray output,
 
 }  // namespace
 
-Status IrEmitterUnnested::HandleCopy(HloInstruction* copy,
-                                     HloInstruction* operand) {
+Status IrEmitterUnnested::HandleCopy(HloInstruction* copy) {
   if (ImplementedAsMemcpy(*copy)) {
     thunk_sequence_->emplace_back(BuildCopyThunk(copy));
     return Status::OK();
@@ -731,7 +730,7 @@ Status IrEmitterUnnested::HandleCopy(HloInstruction* copy,
   bool is_transpose_021;
   Shape reduced_input_shape, reduced_output_shape;
   std::tie(is_transpose_021, reduced_input_shape, reduced_output_shape) =
-      IsTranspose021(operand->shape(), copy->shape());
+      IsTranspose021(copy->operand(0)->shape(), copy->shape());
   if (is_transpose_021 &&
       reduced_input_shape.dimensions(1) >= kMinDimensionToTransposeTiled &&
       reduced_input_shape.dimensions(2) >= kMinDimensionToTransposeTiled) {
@@ -739,7 +738,8 @@ Status IrEmitterUnnested::HandleCopy(HloInstruction* copy,
     VLOG(3) << "Emitting tiled 0-2-1 transposition";
     constexpr int64 tile_size = 32;
     int64 num_tiles = EmitTranspose021Tiled(
-        GetIrArray(*operand).CastToShape(reduced_input_shape, &ir_builder_),
+        GetIrArray(*(copy->operand(0)))
+            .CastToShape(reduced_input_shape, &ir_builder_),
         GetIrArray(*copy).CastToShape(reduced_output_shape, &ir_builder_),
         tile_size, &ir_builder_);
     UpdateLaunchDimensions(LaunchDimensions(num_tiles, tile_size), LastThunk(),
@@ -747,7 +747,7 @@ Status IrEmitterUnnested::HandleCopy(HloInstruction* copy,
     return Status::OK();
   }
 
-  return IrEmitter::HandleCopy(copy, operand);
+  return IrEmitter::HandleCopy(copy);
 }
 
 Status IrEmitterUnnested::EmitColumnReduction(
@@ -1648,7 +1648,7 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildCopyThunk(
   const HloInstruction* operand = inst->operand(0);
   CHECK_EQ(HloOpcode::kConstant, operand->opcode());
   return MakeUnique<CopyThunk>(
-      /*source_address=*/LiteralUtil::InternalData(operand->literal()),
+      /*source_address=*/operand->literal().InternalData(),
       /*destination_buffer=*/GetAllocationSlice(*inst),
       /*mem_size=*/
       llvm_ir::ByteSizeOf(operand->shape(),
@@ -1659,12 +1659,18 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildCopyThunk(
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildInfeedThunk(
     const HloInstruction* inst) {
   CHECK_EQ(HloOpcode::kInfeed, inst->opcode());
+
+  std::vector<BufferAllocation::Slice> tuple_element_buffers;
+  for (int64 i = 0; i < inst->shape().tuple_shapes_size(); ++i) {
+    BufferAllocation::Slice buffer = ir_emitter_context_->buffer_assignment()
+                                         .GetUniqueSlice(inst, {i})
+                                         .ConsumeValueOrDie();
+    tuple_element_buffers.push_back(buffer);
+  }
+
   return MakeUnique<InfeedThunk>(
-      /*destination_buffer=*/GetAllocationSlice(*inst),
-      /*mem_size=*/
-      llvm_ir::ByteSizeOf(inst->shape(),
-                          ir_emitter_context_->llvm_module()->getDataLayout()),
-      inst);
+      tuple_element_buffers,
+      /*destination_buffer=*/GetAllocationSlice(*inst), inst);
 }
 
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildGemmThunk(
@@ -1880,15 +1886,38 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildForThunk(
 Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
     const HloInstruction& hlo,
     const llvm_ir::ElementGenerator& element_generator, KernelThunk* thunk) {
+  const Shape& element_shape = hlo.IsMultiOutputFusion()
+                                   ? ShapeUtil::GetSubshape(hlo.shape(), {0})
+                                   : hlo.shape();
   LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-      hlo.shape(), ir_emitter_context_->device_description());
+      element_shape, ir_emitter_context_->device_description());
   UpdateLaunchDimensions(launch_dimensions, thunk,
                          ir_emitter_context_->llvm_module());
-  // Otherwise, emit a parallel loop that computes the partition that each
-  // thread is in charge of.
-  return ParallelLoopEmitter(element_generator, GetIrArray(hlo),
-                             launch_dimensions, &ir_builder_)
-      .EmitLoop();
+  if (!hlo.IsMultiOutputFusion()) {
+    return ParallelLoopEmitter(element_generator, GetIrArray(hlo),
+                               launch_dimensions, &ir_builder_)
+        .EmitLoop();
+  }
+
+  // For multiple outputs fusion, we need to emit each operand and the root.
+  std::vector<llvm_ir::IrArray> output_arrays;
+  for (int64 i = 0; i < ShapeUtil::TupleElementCount(hlo.shape()); ++i) {
+    output_arrays.push_back(GetIrArray(hlo, {i}));
+  }
+  TF_RETURN_IF_ERROR(ParallelLoopEmitter(element_generator, output_arrays,
+                                         launch_dimensions, &ir_builder_)
+                         .EmitLoop());
+
+  std::vector<llvm::Value*> tuple_operand_ptrs;
+  for (int64 i = 0; i < output_arrays.size(); ++i) {
+    tuple_operand_ptrs.push_back(output_arrays[i].GetBasePointer());
+  }
+  ir_builder_.SetInsertPoint(ir_builder_.GetInsertBlock()->getTerminator());
+  //  const HloInstruction* root = hlo.fused_expression_root();
+  llvm_ir::EmitTuple(
+      GetIrArray(*hlo.fused_expression_root()->fusion_instruction()),
+      tuple_operand_ptrs, &ir_builder_);
+  return Status::OK();
 }
 
 Status IrEmitterUnnested::EmitTargetElementLoop(
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
index 724549c0c4ef46e7526953f41439ea8eff71a779..1d1e5bee542c1c682fa74121934348e7e7a1b026 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
@@ -28,10 +28,10 @@ cc_library(
         "utils.h",
     ],
     deps = [
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/legacy_flags:gpu_backend_lib_flags",
         "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index e03571a9672df62593318766fcecf414e0899ea1..881522a0298a8c8cd45d03a4863ad5e995bd4b13 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include <string>
 #include <utility>
 
-#include "tensorflow/compiler/xla/legacy_flags/gpu_backend_lib_flags.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h"
 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h"
@@ -134,13 +133,8 @@ static string GetSmName(std::pair<int, int> compute_capability) {
 // from the input filename.
 string MakeNameForTempProduct(const std::string& input_filename,
                               tensorflow::StringPiece extension) {
-  legacy_flags::GpuBackendLibFlags* flags =
-      legacy_flags::GetGpuBackendLibFlags();
-  return tensorflow::io::JoinPath(
-      flags->dump_temp_products_to,
-      ReplaceFilenameExtension(
-          tensorflow::io::Basename(llvm_ir::AsString(input_filename)),
-          extension));
+  return ReplaceFilenameExtension(
+      tensorflow::io::Basename(llvm_ir::AsString(input_filename)), extension);
 }
 
 // Initializes LLVM passes. Uses the PassRegistry mechanism.
@@ -177,20 +171,16 @@ std::unique_ptr<llvm::TargetMachine> GetTargetMachine(
           .xla_enable_fast_math(),
       &target_options);
 
-  // Enable FMA synthesis if desired.
-  legacy_flags::GpuBackendLibFlags* flags =
-      legacy_flags::GetGpuBackendLibFlags();
-  if (flags->fma) {
-    target_options.AllowFPOpFusion = FPOpFusion::Fast;
-  }
+  // Enable FMA synthesis.
+  target_options.AllowFPOpFusion = FPOpFusion::Fast;
 
   // Set the verbose assembly options.
-  target_options.MCOptions.AsmVerbose = flags->verbose_ptx_asm;
+  target_options.MCOptions.AsmVerbose = false;
 
   // The selection of codegen optimization level is copied from function
   // GetCodeGenOptLevel in //external/llvm/tools/opt/opt.cpp.
   CodeGenOpt::Level codegen_opt_level;
-  switch (flags->opt_level) {
+  switch (hlo_module_config.debug_options().xla_backend_optimization_level()) {
     case 1:
       codegen_opt_level = CodeGenOpt::Less;
       break;
@@ -262,12 +252,10 @@ string EmitModuleToPTX(Module* module, llvm::TargetMachine* target_machine) {
     // The extension is stripped by IrDumpingPassManager, so we need to
     // get creative to add a suffix.
     string module_id(llvm_ir::AsString(module->getModuleIdentifier()));
-    legacy_flags::GpuBackendLibFlags* flags =
-        legacy_flags::GetGpuBackendLibFlags();
     IrDumpingPassManager codegen_passes(
         ReplaceFilenameExtension(tensorflow::io::Basename(module_id),
                                  "-nvptx.dummy"),
-        flags->dump_temp_products_to, flags->dump_ir_before_passes);
+        "", false);
     codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass(
         llvm::Triple(module->getTargetTriple())));
 
@@ -345,36 +333,19 @@ StatusOr<string> CompileModuleToPtx(llvm::Module* module,
   TF_RETURN_IF_ERROR(
       LinkLibdeviceIfNecessary(module, compute_capability, libdevice_dir_path));
 
-  legacy_flags::GpuBackendLibFlags* flags =
-      legacy_flags::GetGpuBackendLibFlags();
-  if (!flags->dump_temp_products_to.empty()) {
-    string linked_filename =
-        MakeNameForTempProduct(module->getModuleIdentifier(), "linked.bc");
-    LOG(INFO) << "dumping bitcode after linking libdevice to: "
-              << linked_filename;
-    EmitBitcodeToFile(*module, linked_filename);
-  }
-
   // Set the flush-denormals-to-zero flag on the module so the NVVM reflect pass
   // can access it.
-  module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", flags->ftz);
+  module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
+                        hlo_module_config.debug_options().xla_gpu_ftz());
 
   // If ftz is enabled, set it as an attribute on every function in the module.
-  if (flags->ftz) {
+  if (hlo_module_config.debug_options().xla_gpu_ftz()) {
     for (llvm::Function& fn : *module) {
       fn.addFnAttr("nvptx-f32ftz", "true");
     }
   }
 
-  // Run IR-level optimizations.
-  if (flags->dump_ir_before_passes && flags->dump_temp_products_to.empty()) {
-    LOG(FATAL) << "--dump_ir_before_passes must be specified with "
-                  "--dump_temp_products_to";
-  }
-
-  IrDumpingPassManager module_passes(module->getModuleIdentifier(),
-                                     flags->dump_temp_products_to,
-                                     flags->dump_ir_before_passes);
+  IrDumpingPassManager module_passes(module->getModuleIdentifier(), "", false);
 
   // Add an appropriate TargetLibraryInfo pass for the module's triple.
   llvm::TargetLibraryInfoWrapperPass* tliwp =
@@ -406,8 +377,16 @@ StatusOr<string> CompileModuleToPtx(llvm::Module* module,
   // too.
   llvm::legacy::FunctionPassManager function_passes(module);
 
-  AddOptimizationPasses(flags->opt_level, /*size_level=*/0,
-                        target_machine.get(), &module_passes, &function_passes);
+  int32 opt_level =
+      hlo_module_config.debug_options().xla_backend_optimization_level();
+
+  CHECK_GE(opt_level, 2)
+      << "The XLA GPU backend doesn't support unoptimized code generation";
+
+  AddOptimizationPasses(opt_level,
+                        /*size_level=*/0, target_machine.get(), &module_passes,
+                        &function_passes);
+
   // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA
   // again after the standard optimization passes [http://b/13329423].
   // TODO(jingyue): SROA may further expose more optimization opportunities, such
@@ -415,7 +394,7 @@ StatusOr<string> CompileModuleToPtx(llvm::Module* module,
   // the inlining cost of a function). For now, running SROA already emits good
   // enough code for the evaluated benchmarks. We may want to run more
   // optimizations later.
-  if (flags->opt_level > 0) {
+  if (opt_level > 0) {
     // LLVM's optimizer turns on SROA when the optimization level is greater
     // than 0. We mimic this behavior here.
     module_passes.add(llvm::createSROAPass());
@@ -433,14 +412,6 @@ StatusOr<string> CompileModuleToPtx(llvm::Module* module,
   function_passes.doFinalization();
   module_passes.run(*module);
 
-  if (!flags->dump_temp_products_to.empty()) {
-    string optimized_filename =
-        MakeNameForTempProduct(module->getModuleIdentifier(), "optimized.bc");
-    LOG(INFO) << "dumping bitcode after optimizations to: "
-              << optimized_filename;
-    EmitBitcodeToFile(*module, optimized_filename);
-  }
-
   // Finally, produce PTX.
   return EmitModuleToPTX(module, target_machine.get());
 }
@@ -473,22 +444,6 @@ void GPUBackendInit() {
   // between those loads.
   FeedLLVMWithFlags({"-memdep-block-scan-limit=500"});
 
-  legacy_flags::GpuBackendLibFlags* flags =
-      legacy_flags::GetGpuBackendLibFlags();
-  if (!flags->llvm_cl_opts.empty()) {
-    std::vector<string> opts =
-        tensorflow::str_util::Split(flags->llvm_cl_opts, ',');
-    FeedLLVMWithFlags(opts);
-  }
-
-  if (flags->llvm_dump_passes) {
-    // Enable LLVM pass debugging dump. LLVM dumps this information when a pass
-    // manager is initialized for execution. It's done to stderr (this is
-    // hardcoded within LLVM to the dbgs() stream, we can't change it from the
-    // outside).
-    FeedLLVMWithFlags({"-debug-pass=Arguments"});
-  }
-
   // Initialize the NVPTX target; it's the only target we link with, so call its
   // specific initialization functions instead of the catch-all InitializeAll*.
   LLVMInitializeNVPTXTarget();
diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
index a12a9a716829fbcf5b6348037fa723d5ddcc6930..b8c61620845a1434cc79dc9a8b00f89944e2ae95 100644
--- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
@@ -61,7 +61,7 @@ HloInstruction* MaybePaddedAndSlicedInput(
     PrimitiveType element_type = input->shape().element_type();
     HloInstruction* padding =
         computation->AddInstruction(HloInstruction::CreateConstant(
-            MakeUnique<Literal>(LiteralUtil::Zero(element_type))));
+            MakeUnique<Literal>(Literal::Zero(element_type))));
     input = computation->AddInstruction(HloInstruction::CreatePad(
         ShapeInference::InferPadShape(
             /*operand_shape=*/input->shape(),
@@ -127,7 +127,7 @@ HloInstruction* MaybePaddedKernel(const Window& conv_window,
   PrimitiveType element_type = kernel->shape().element_type();
   HloInstruction* padding =
       computation->AddInstruction(HloInstruction::CreateConstant(
-          MakeUnique<Literal>(LiteralUtil::Zero(element_type))));
+          MakeUnique<Literal>(Literal::Zero(element_type))));
   return computation->AddInstruction(HloInstruction::CreatePad(
       ShapeInference::InferPadShape(
           /*operand_shape=*/kernel->shape(),
@@ -242,9 +242,9 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
   // Create a new backward convolution replacing the old one.
   HloComputation* computation = backward_conv->parent();
   HloInstruction* output = backward_conv->mutable_operand(1);
-  HloInstruction* padding = computation->AddInstruction(
-      HloInstruction::CreateConstant(MakeUnique<Literal>(
-          LiteralUtil::Zero(input->shape().element_type()))));
+  HloInstruction* padding =
+      computation->AddInstruction(HloInstruction::CreateConstant(
+          MakeUnique<Literal>(Literal::Zero(input->shape().element_type()))));
   HloInstruction* padded_input =
       computation->AddInstruction(HloInstruction::CreatePad(
           ShapeInference::InferPadShape(input->shape(), padding->shape(),
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
index 65610b0995c512cc4a611ac650c581d0180d258d..d5543d296b3f0f6b19de90c42bea4f162057802a 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
@@ -36,6 +36,13 @@ ParallelLoopEmitter::ParallelLoopEmitter(
     : LoopEmitter(body_emitter, shape, ir_builder),
       launch_dimensions_(launch_dimensions) {}
 
+ParallelLoopEmitter::ParallelLoopEmitter(
+    const llvm_ir::ElementGenerator& target_element_generator,
+    tensorflow::gtl::ArraySlice<llvm_ir::IrArray> target_arrays,
+    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder)
+    : LoopEmitter(target_element_generator, target_arrays, ir_builder),
+      launch_dimensions_(launch_dimensions) {}
+
 ParallelLoopEmitter::ParallelLoopEmitter(
     const llvm_ir::ElementGenerator& target_element_generator,
     const llvm_ir::IrArray& target_array,
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
index 73ca28cd842fe350ecd10885d983907e7288a350..d324a50698ea0d3e5e196347bd69c29b2ad27e3e 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
@@ -41,6 +41,12 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
                       const llvm_ir::IrArray& target_array,
                       const LaunchDimensions& launch_dimensions,
                       llvm::IRBuilder<>* ir_builder);
+
+  ParallelLoopEmitter(
+      const llvm_ir::ElementGenerator& target_element_generator,
+      tensorflow::gtl::ArraySlice<llvm_ir::IrArray> target_arrays,
+      const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder);
+
   ParallelLoopEmitter(const ParallelLoopEmitter&) = delete;
   ParallelLoopEmitter& operator=(const ParallelLoopEmitter&) = delete;
   ~ParallelLoopEmitter() override = default;
diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
index 5065e7aedd08c591f33c152c6709823948db54f0..e4cfc6999f2da04dd7e7a34d854fdb3d75b8bfc6 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
@@ -15,11 +15,11 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
 
-#include "tensorflow/compiler/xla/legacy_flags/stream_assignment_flags.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_reachability.h"
 
 namespace xla {
 namespace gpu {
@@ -46,10 +46,9 @@ namespace {
 
 // Returns whether the two HLOs can run concurrently, i.e., neither is a
 // transitive consumer of the other.
-bool CanRunConcurrently(
-    const HloInstruction& a, const HloInstruction& b,
-    const HloComputation::ReachabilityMap& transitive_operands) {
-  return !transitive_operands.IsConnected(&a, &b);
+bool CanRunConcurrently(const HloInstruction& a, const HloInstruction& b,
+                        const HloReachabilityMap& reachability) {
+  return !reachability.IsConnected(&a, &b);
 }
 
 // Returns which existing stream to assign to `hlo`, or -1 if a stream is not
@@ -58,7 +57,7 @@ bool CanRunConcurrently(
 // are topologically before `hlo`.
 int ComputeStreamToAssign(
     const HloInstruction& hlo, const StreamAssignment& stream_assignment,
-    const HloComputation::ReachabilityMap& transitive_operands,
+    const HloReachabilityMap& reachability,
     const std::vector<const HloInstruction*>& seen_gemms) {
   if (hlo.opcode() == HloOpcode::kParameter ||
       hlo.opcode() == HloOpcode::kConstant) {
@@ -66,9 +65,10 @@ int ComputeStreamToAssign(
     return -1;
   }
 
-  legacy_flags::StreamAssignmentFlags* flags =
-      legacy_flags::GetStreamAssignmentFlags();
-  if (flags->xla_gpu_disable_multi_streaming) {
+  if (hlo.GetModule()
+          ->config()
+          .debug_options()
+          .xla_gpu_disable_multi_streaming()) {
     return 0;
   }
 
@@ -96,7 +96,7 @@ int ComputeStreamToAssign(
   for (const auto* seen_gemm : seen_gemms) {
     int stream_no = stream_assignment.StreamNumberForHlo(*seen_gemm);
     if (!forbidden_stream_numbers.count(stream_no) &&
-        CanRunConcurrently(*seen_gemm, hlo, transitive_operands)) {
+        CanRunConcurrently(*seen_gemm, hlo, reachability)) {
       forbidden_stream_numbers.insert(stream_no);
     }
   }
@@ -115,12 +115,12 @@ int ComputeStreamToAssign(
 std::unique_ptr<StreamAssignment> AssignStreams(const HloModule& module) {
   auto stream_assignment = MakeUnique<StreamAssignment>();
   const HloComputation& computation = *module.entry_computation();
-  std::unique_ptr<HloComputation::ReachabilityMap> transitive_operands =
-      computation.ComputeTransitiveOperands();
+  std::unique_ptr<HloReachabilityMap> reachability =
+      computation.ComputeReachability();
   std::vector<const HloInstruction*> seen_gemms;
   for (const auto* hlo : computation.MakeInstructionPostOrder()) {
     int stream_no = ComputeStreamToAssign(*hlo, *stream_assignment,
-                                          *transitive_operands, seen_gemms);
+                                          *reachability, seen_gemms);
     if (stream_no != -1) {
       stream_assignment->AssignStreamToHlo(hlo, stream_no);
     }
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.cc b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
index 06b01d311dac5a6be78d7b8b16e7fcb39c189647..3034ed06b7eaff46a923b19cedb39f02d276c9f8 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
@@ -37,8 +37,8 @@ namespace {
 // patterns to match.
 //
 // Each ExprTree node is comprised of an HloOpcode, and a set of operands (each
-// of type ExprTree). Operands can be added by specifying the index and HloOpcode
-// of the operand.
+// of type ExprTree). Operands can be added by specifying the index and
+// HloOpcode of the operand.
 //
 // For example, the following computation:
 //
@@ -197,10 +197,9 @@ class MatcherBase {
       return InvalidArgument("Must use S32 or S64 integral types.");
     }
     if (type == S32) {
-      *const_value =
-          static_cast<int64>(LiteralUtil::GetFirstElement<int32>(literal));
+      *const_value = static_cast<int64>(literal.GetFirstElement<int32>());
     } else if (type == S64) {
-      *const_value = LiteralUtil::GetFirstElement<int64>(literal);
+      *const_value = literal.GetFirstElement<int64>();
     }
     return tensorflow::Status::OK();
   }
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
index e82491fd6f9f1158fc5b9e5bd475ef6ff97f2a7c..51d38f84212b01c08c33f1b648c579c5672769ba 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
@@ -41,7 +41,7 @@ class WhileTransformerTest : public HloTestBase {
       const int64 tuple_index, const int64 limit) {
     auto builder = HloComputation::Builder(TestName() + ".Condition");
     auto limit_const = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(limit)));
+        HloInstruction::CreateConstant(Literal::CreateR0<int32>(limit)));
     auto loop_state = builder.AddInstruction(
         HloInstruction::CreateParameter(0, loop_state_shape_, "loop_state"));
     auto induction_variable =
@@ -64,8 +64,8 @@ class WhileTransformerTest : public HloTestBase {
     auto induction_variable =
         builder.AddInstruction(HloInstruction::CreateGetTupleElement(
             induction_variable_shape_, loop_state, ind_var_tuple_index));
-    auto inc = builder.AddInstruction(HloInstruction::CreateConstant(
-        LiteralUtil::CreateR0<int32>(increment)));
+    auto inc = builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<int32>(increment)));
     auto add0 = builder.AddInstruction(HloInstruction::CreateBinary(
         induction_variable->shape(), HloOpcode::kAdd, induction_variable, inc));
     // Update data GTE(data_tuple_index).
@@ -88,12 +88,10 @@ class WhileTransformerTest : public HloTestBase {
                                         const int64 ind_var_tuple_index,
                                         const int64 ind_var_init) {
     auto builder = HloComputation::Builder(TestName() + ".While");
-    auto induction_var_init =
-        builder.AddInstruction(HloInstruction::CreateConstant(
-            LiteralUtil::CreateR0<int32>(ind_var_init)));
-    auto data_init = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>(
-            {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f})));
+    auto induction_var_init = builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<int32>(ind_var_init)));
+    auto data_init = builder.AddInstruction(HloInstruction::CreateConstant(
+        Literal::CreateR1<float>({0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f})));
     auto loop_state_init =
         ind_var_tuple_index == 0
             ? builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu_transfer_manager.cc
index 4b8d190a463ceb155f4fc8d3d22b47b9cbc8f23f..74f0bdb7db1847119c5bd75cc9fd9d921c6e162a 100644
--- a/tensorflow/compiler/xla/service/gpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu_transfer_manager.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/gpu/infeed_manager.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -28,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
@@ -44,24 +44,85 @@ GpuTransferManager::GpuTransferManager()
 Status GpuTransferManager::TransferLiteralToInfeed(se::StreamExecutor* executor,
                                                    const Literal& literal) {
   const Shape& shape = literal.shape();
-  VLOG(2) << "Transferring literal shape to infeed: "
+  VLOG(2) << "Transferring literal to infeed with shape: "
           << ShapeUtil::HumanString(shape);
 
-  // TODO(b/30467474) handle tuples.
-  if (ShapeUtil::IsTuple(shape)) {
-    return Unimplemented("Infeed with a tuple shape is not supported: %s",
-                         ShapeUtil::HumanString(literal.shape()).c_str());
+  if (!ShapeUtil::IsTuple(shape)) {
+    int64 size = GetByteSizeRequirement(shape);
+    return TransferBufferToInfeed(executor, size, literal.InternalData());
   }
 
-  int64 size = GetByteSizeRequirement(shape);
+  if (ShapeUtil::IsNestedTuple(shape)) {
+    return Unimplemented(
+        "Infeed with a nested tuple shape is not supported: %s",
+        ShapeUtil::HumanString(literal.shape()).c_str());
+  }
+
+  // For a tuple, we transfer each of its elements to the device and
+  // enqueue the resulting destination device addresses with the
+  // infeed manager.
+  std::vector<gpu::InfeedBuffer*> buffers;
+  buffers.reserve(literal.tuple_literals_size());
+  auto cleanup = tensorflow::gtl::MakeCleanup([buffers]() {
+    for (gpu::InfeedBuffer* b : buffers) {
+      b->Done();
+    }
+  });
+
+  for (const auto& tuple_element : literal.tuple_literals()) {
+    const Shape& tuple_element_shape = tuple_element.shape();
+    int64 tuple_element_size = GetByteSizeRequirement(tuple_element_shape);
+    TF_ASSIGN_OR_RETURN(
+        gpu::InfeedBuffer * buffer,
+        TransferBufferToInfeedInternal(executor, tuple_element_size,
+                                       tuple_element.InternalData()));
+    buffers.push_back(buffer);
+  }
+
+  cleanup.release();
+  return EnqueueBuffersToInfeed(executor, buffers);
+}
+
+Status GpuTransferManager::TransferBufferToInfeed(se::StreamExecutor* executor,
+                                                  int64 size,
+                                                  const void* source) {
+  TF_ASSIGN_OR_RETURN(gpu::InfeedBuffer * buffer,
+                      TransferBufferToInfeedInternal(executor, size, source));
+  return EnqueueBuffersToInfeed(executor, {buffer});
+}
+
+Status GpuTransferManager::EnqueueBuffersToInfeed(
+    se::StreamExecutor* executor, std::vector<gpu::InfeedBuffer*> buffers) {
+  gpu::InfeedManager* infeed_manager = gpu::GetOrCreateInfeedManager();
+  se::Stream* stream = infeed_manager->GetStream(executor);
+
+  // TODO(b/30467474): Since this stream is shared across different
+  // infeed requests, blocking on the stream might be
+  // heavy-handed. Figure out if finer-grained acknowledgement is
+  // possible.
+  if (!stream->BlockHostUntilDone()) {
+    for (gpu::InfeedBuffer* b : buffers) {
+      b->Done();
+    }
+    return InternalError("Failed to complete data transfer on stream %p",
+                         stream);
+  }
+
+  infeed_manager->EnqueueBuffers(buffers);
+
+  VLOG(2) << "Infeed data transferred";
+
+  return Status::OK();
+}
+
+StatusOr<gpu::InfeedBuffer*> GpuTransferManager::TransferBufferToInfeedInternal(
+    se::StreamExecutor* executor, int64 size, const void* source) {
   if (size > std::numeric_limits<int32>::max()) {
-    return Unimplemented("Infeed shape is too large: %s needs %lld bytes",
-                         ShapeUtil::HumanString(literal.shape()).c_str(), size);
+    return InvalidArgument("Infeed shape is too large: needs %lld bytes", size);
   }
 
   if (size == 0) {
-    return Unimplemented("Infeed shape %s needs 0 bytes",
-                         ShapeUtil::HumanString(literal.shape()).c_str());
+    return InvalidArgument("Infeed shape needs 0 bytes");
   }
 
   gpu::InfeedManager* infeed_manager = gpu::GetOrCreateInfeedManager();
@@ -71,21 +132,11 @@ Status GpuTransferManager::TransferLiteralToInfeed(se::StreamExecutor* executor,
   }
 
   gpu::InfeedBuffer* buffer = new gpu::InfeedBuffer(executor, size);
-  stream->ThenMemcpy(buffer->device_memory(),
-                     LiteralUtil::InternalData(literal), size);
+  stream->ThenMemcpy(buffer->device_memory(), source, size);
 
   VLOG(2) << "Queued infeed data on stream " << stream;
 
-  if (!stream->BlockHostUntilDone()) {
-    buffer->Done();
-    return InternalError("Failed to complete data transfer on stream %p",
-                         stream);
-  }
-
-  infeed_manager->EnqueueBuffer(buffer);
-
-  VLOG(2) << "Infeed data transferred";
-  return Status::OK();
+  return buffer;
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu_transfer_manager.h b/tensorflow/compiler/xla/service/gpu_transfer_manager.h
index 6dfe7ba0295aea699ca737e9dd47123b17cae3dc..9aa369c668364079504ead3491903e2590a142cc 100644
--- a/tensorflow/compiler/xla/service/gpu_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/gpu_transfer_manager.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/service/generic_transfer_manager.h"
+#include "tensorflow/compiler/xla/service/gpu/infeed_manager.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -37,8 +38,21 @@ class GpuTransferManager : public GenericTransferManager {
 
   Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
                                  const Literal& literal) override;
+  Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
+                                int64 size, const void* source) override;
 
  private:
+  // Initiates the infeed data transfers. InfeedBuffer->Done() must be
+  // called to clean up the memory allocated for InfeedBuffer.
+  StatusOr<gpu::InfeedBuffer*> TransferBufferToInfeedInternal(
+      perftools::gputools::StreamExecutor* executor, int64 size,
+      const void* source);
+
+  // Enqueues infeed data buffers with the infeed manager after their
+  // transfer completes.
+  Status EnqueueBuffersToInfeed(perftools::gputools::StreamExecutor* executor,
+                                std::vector<gpu::InfeedBuffer*> buffers);
+
   TF_DISALLOW_COPY_AND_ASSIGN(GpuTransferManager);
 };
 
diff --git a/tensorflow/compiler/xla/service/graphviz_example.cc b/tensorflow/compiler/xla/service/graphviz_example.cc
index cd00a41a03718502fcfa63e035639390b6fe6e07..049e8d80d80c835bca4a4d38592564ba82a3ecf9 100644
--- a/tensorflow/compiler/xla/service/graphviz_example.cc
+++ b/tensorflow/compiler/xla/service/graphviz_example.cc
@@ -47,7 +47,7 @@ HloComputation* AddScalarConstantComputation(int64 addend, HloModule* module) {
   auto x_value = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {}), "x_value"));
   auto half = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.5)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.5)));
   builder.AddInstruction(HloInstruction::CreateBinary(
       half->shape(), HloOpcode::kAdd, x_value, half));
   return module->AddEmbeddedComputation(builder.Build());
@@ -118,7 +118,7 @@ std::unique_ptr<HloModule> MakeBigGraph() {
   auto rng = builder.AddInstruction(
       HloInstruction::CreateRng(vshape, RNG_UNIFORM, {param_m, param_m}));
   auto one = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto add_computation = ScalarSumComputation(module.get());
   builder.AddInstruction(
       HloInstruction::CreateReduce(vshape, rng, one, {1}, add_computation));
@@ -156,10 +156,9 @@ int main(int argc, char** argv) {
 
   auto module = xla::MakeBigGraph();
 
-  printf("Graph URL: %s\n",
-         xla::hlo_graph_dumper::DumpGraph(
-             *module->entry_computation(), "Example computation",
-             /*show_addresses=*/false, /*show_layouts=*/false)
-             .c_str());
+  printf("Graph URL: %s\n", xla::hlo_graph_dumper::DumpGraph(
+                                *module->entry_computation(),
+                                "Example computation", xla::DebugOptions())
+                                .c_str());
   return 0;
 }
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index 86f62accd3b524c3aa39c256a982bcf21edc1b25..840be603bf997f6f84e4c372c178fdf96f928f23 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -35,18 +35,26 @@ namespace {
 std::vector<const LogicalBuffer*> UniqueOperandSourceBuffers(
     const HloInstruction* instruction,
     const TuplePointsToAnalysis& points_to_analysis) {
-  FlatSet<const LogicalBuffer*> buffers;
+  std::vector<const LogicalBuffer*> buffers;
   for (const HloInstruction* operand : instruction->operands()) {
-    FlatSet<const LogicalBuffer*> sources =
-        points_to_analysis.GetPointsToSet(operand).CreateFlattenedSet();
-    buffers.insert(sources.begin(), sources.end());
+    points_to_analysis.GetPointsToSet(operand).ForEachElement(
+        [&](const ShapeIndex& /*index*/,
+            const std::vector<const LogicalBuffer*>& points_to) {
+          buffers.insert(buffers.end(), points_to.begin(), points_to.end());
+        });
   }
-  std::vector<const LogicalBuffer*> sorted(buffers.begin(), buffers.end());
-  std::sort(sorted.begin(), sorted.end(),
+
+  // Sort and then remove duplicates from buffers.
+  std::sort(buffers.begin(), buffers.end(),
             [](const LogicalBuffer* a, const LogicalBuffer* b) {
               return a->id() < b->id();
             });
-  return sorted;
+  buffers.erase(std::unique(buffers.begin(), buffers.end(),
+                            [](const LogicalBuffer* a, const LogicalBuffer* b) {
+                              return a->id() == b->id();
+                            }),
+                buffers.end());
+  return buffers;
 }
 
 }  // namespace
@@ -187,7 +195,7 @@ Status HeapSimulator::RunComputation(
             buffer->instruction()->opcode() != HloOpcode::kCopy &&
             CanShareOperandBufferWithUser(
                 operand_buffer->instruction(), operand_buffer->index(),
-                buffer->instruction(), buffer->index(), points_to_analysis)) {
+                buffer->instruction(), buffer->index(), &points_to_analysis)) {
           ShareBuffer(buffer, operand_buffer, instruction);
           shared = true;
           break;
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index 60a0768a86b30ad5e8810a6f289008a9ee8c8a2e..ef9db8ba236f9923420c1f8b1a7423e0c036fb0f 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -173,7 +173,7 @@ class HeapSimulatorTest : public HloTestBase {
 TEST_F(HeapSimulatorTest, ScalarConstant) {
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
 
   // Constants aren't assigned.  See b/32248867
   HeapSimulatorTracker tracker(TestName(), builder.Build(), {const0});
@@ -510,8 +510,7 @@ class HeapAlgorithmTestBase : public ::testing::Test {
   // other than the id and color.
   const LogicalBuffer* DummyLogicalBuffer() {
     const LogicalBuffer::Id id = buffers_.size();
-    buffers_.emplace_back(MakeUnique<LogicalBuffer>(nullptr, ShapeIndex{}, id,
-                                                    LogicalBuffer::Color(0)));
+    buffers_.emplace_back(MakeUnique<LogicalBuffer>(nullptr, ShapeIndex{}, id));
     return buffers_.back().get();
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
index 3b37f4a4b892497135c4dccc0082d244c1d8a27e..0c03d72752f97c201f2e209f99a4915ec97257ac 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
@@ -15,21 +15,21 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_alias_analysis.h"
 
-#include <ostream>
-#include <queue>
+#include <algorithm>
+#include <memory>
 #include <utility>
 #include <vector>
 
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_buffer.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_value.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -38,115 +38,16 @@ using ::tensorflow::str_util::Join;
 using ::tensorflow::strings::StrAppend;
 using ::tensorflow::strings::StrCat;
 
-void HloBuffer::AddValue(const HloValue& value) {
-  // If the value is already contained in this buffer, just return.
-  if (std::find(value_ids_.begin(), value_ids_.end(), value.id()) !=
-      value_ids_.end()) {
-    return;
-  }
-
-  value_ids_.push_back(value.id());
-
-  // Add all of the locations of the HloValue to this buffer.
-  for (const HloLocation& location : value.locations()) {
-    if (std::find(locations_.begin(), locations_.end(), location) ==
-        locations_.end()) {
-      locations_.push_back(location);
-    }
-  }
-}
-
-bool HloBuffer::operator==(const HloBuffer& other) const {
-  bool equal = id() == other.id();
-  if (equal) {
-    // DCHECK because these comparisons are expensive (linear time).
-    DCHECK(value_ids() == other.value_ids());
-    DCHECK(locations() == other.locations());
-  }
-  return equal;
-}
-
-string HloBuffer::ToString() const {
-  return StrCat("HloBuffer ", id_, ", values: ", Join(value_ids_, ", "));
-}
-
-std::ostream& operator<<(std::ostream& out, const HloBuffer& buffer) {
-  out << buffer.ToString();
-  return out;
-}
-
-void HloBufferSet::AddBuffer(HloBuffer::Id buffer_id) {
-  if (std::find(buffer_ids_.begin(), buffer_ids_.end(), buffer_id) ==
-      buffer_ids_.end()) {
-    buffer_ids_.push_back(buffer_id);
-  }
-}
-
-void HloBufferSet::RemoveBufferOrDie(HloBuffer::Id buffer_id) {
-  auto it = std::find(buffer_ids_.begin(), buffer_ids_.end(), buffer_id);
-  CHECK(it != buffer_ids_.end());
-  buffer_ids_.erase(it);
-}
-
-string HloBufferSet::ToString() const {
-  return StrCat("HloBufferSet, buffers: ", Join(buffer_ids_, ", "));
-}
-
-std::ostream& operator<<(std::ostream& out, const HloBufferSet& buffer_set) {
-  out << buffer_set.ToString();
-  return out;
-}
-
-bool InstructionBufferSet::IsAmbiguous() const {
-  bool is_ambiguous = false;
-  ForEachElement(
-      [&is_ambiguous](const ShapeIndex& index, const HloBufferSet& buffer_set) {
-        is_ambiguous |= buffer_set.buffer_ids().size() > 1;
-      });
-  return is_ambiguous;
-}
-
-bool InstructionBufferSet::IsDistinct() const {
-  bool is_distinct = true;
-  tensorflow::gtl::FlatSet<HloBuffer::Id> seen_ids;
-  ForEachElement([&is_distinct, &seen_ids](const ShapeIndex& index,
-                                           const HloBufferSet& buffer_set) {
-    for (HloBuffer::Id buffer_id : buffer_set.buffer_ids()) {
-      auto pair = seen_ids.insert(buffer_id);
-      if (!pair.second) {
-        is_distinct = false;
-      }
-    }
-  });
-  return is_distinct;
-}
-
-string InstructionBufferSet::ToString() const {
-  string out =
-      StrCat("InstructionBufferSet(", ShapeUtil::HumanString(shape()), ")\n");
-  ForEachElement([this, &out](const ShapeIndex& index,
-                              const HloBufferSet& value_set) {
-    StrAppend(&out, "  ", index.ToString(), " : ", value_set.ToString(), "\n");
-  });
-  return out;
-}
-
-std::ostream& operator<<(std::ostream& out,
-                         const InstructionBufferSet& buffer_set) {
-  out << buffer_set.ToString();
-  return out;
-}
-
 HloAliasAnalysis::HloAliasAnalysis(HloModule* module) : module_(module) {}
 
 void HloAliasAnalysis::InitializeBufferSets() {
-  std::unordered_map<HloValue::Id, HloBuffer::Id> value_to_buffer;
+  std::unordered_map<HloValue::Id, const HloBuffer*> value_to_buffer;
 
   // Initially define a buffer for every HloValue in the module.
   for (const HloValue* value : dataflow_analysis_->values()) {
-    HloBuffer& buffer = NewHloBuffer();
-    buffer.AddValue(*value);
-    value_to_buffer[value->id()] = buffer.id();
+    HloBuffer* buffer = NewHloBuffer();
+    buffer->AddValue(*value);
+    value_to_buffer[value->id()] = buffer;
   }
 
   // Construct the Instruction buffer set to contain the HloBuffers for each
@@ -160,9 +61,9 @@ void HloAliasAnalysis::InitializeBufferSets() {
           .ForEachElement(
               [this, &instruction, &value_to_buffer](
                   const ShapeIndex& index, const HloValueSet& value_set) {
-                for (HloValue::Id value_id : value_set.value_ids()) {
-                  HloBuffer::Id buffer_id = value_to_buffer.at(value_id);
-                  GetBufferSet(instruction.get(), index).AddBuffer(buffer_id);
+                for (const HloValue* value : value_set.values()) {
+                  const HloBuffer* buffer = value_to_buffer.at(value->id());
+                  GetBufferSet(instruction.get(), index).AddBuffer(buffer);
                 }
               });
     }
@@ -189,18 +90,18 @@ void HloAliasAnalysis::CombineBuffers(
     VLOG(4) << "Eliminating buffer: " << buffer_id;
 
     // Add all values held by the buffer-to-eliminate to the unified buffer.
-    for (HloValue::Id value_id : buffer.value_ids()) {
-      unified_buffer.AddValue(dataflow_analysis_->GetValue(value_id));
+    for (const HloValue* value : buffer.values()) {
+      unified_buffer.AddValue(*value);
     }
 
-    // Iterate through all locations where the buffer-to-eliminate exists and
+    // Iterate through all positions where the buffer-to-eliminate exists and
     // replace it with the unified buffer.
-    for (const HloLocation& location : buffer.locations()) {
-      VLOG(4) << "Replacing in " << location;
-      GetBufferSet(location.instruction, location.index)
+    for (const HloPosition& position : buffer.positions()) {
+      VLOG(4) << "Replacing in " << position;
+      GetBufferSet(position.instruction, position.index)
           .RemoveBufferOrDie(buffer_id);
-      GetBufferSet(location.instruction, location.index)
-          .AddBuffer(unified_buffer.id());
+      GetBufferSet(position.instruction, position.index)
+          .AddBuffer(&unified_buffer);
     }
 
     buffers_.erase(buffer_id);
@@ -219,9 +120,9 @@ Status HloAliasAnalysis::Verify() const {
     TF_RETURN_IF_ERROR(instruction_buffer_set.ForEachElementWithStatus(
         [this, &buffers_in_sets](const ShapeIndex& index,
                                  const HloBufferSet& buffer_set) -> Status {
-          for (HloBuffer::Id buffer_id : buffer_set.buffer_ids()) {
-            TF_RET_CHECK(ContainsKey(buffers_, buffer_id));
-            buffers_in_sets.insert(buffer_id);
+          for (const HloBuffer* buffer : buffer_set.buffers()) {
+            TF_RET_CHECK(ContainsKey(buffers_, buffer->id()));
+            buffers_in_sets.insert(buffer->id());
           }
           return Status::OK();
         }));
@@ -240,7 +141,7 @@ void HloAliasAnalysis::FlattenInstructionBufferSets(
   VLOG(4) << "Flattening buffer sets of instructions: "
           << Join(instructions, ", ",
                   [this](string* out, const HloInstruction* instruction) {
-                    StrAppend(out, instruction->FullyQualifiedName());
+                    StrAppend(out, instruction->name());
                   });
   if (instructions.size() < 2) {
     return;
@@ -253,10 +154,11 @@ void HloAliasAnalysis::FlattenInstructionBufferSets(
         std::vector<HloBuffer::Id> to_unify;
         for (const HloInstruction* instruction : instructions) {
           const HloBufferSet& buffer_set = GetBufferSet(instruction, index);
-          to_unify.insert(to_unify.end(), buffer_set.buffer_ids().begin(),
-                          buffer_set.buffer_ids().end());
+          for (const HloBuffer* buffer : buffer_set.buffers()) {
+            to_unify.push_back(buffer->id());
+          }
         }
-        // Sort and uniquify buffers to combine.
+        // Sort and uniquify buffer ids to combine.
         std::sort(to_unify.begin(), to_unify.end());
         to_unify.erase(std::unique(to_unify.begin(), to_unify.end()),
                        to_unify.end());
@@ -265,14 +167,13 @@ void HloAliasAnalysis::FlattenInstructionBufferSets(
       });
 }
 
-HloBuffer& HloAliasAnalysis::NewHloBuffer() {
+HloBuffer* HloAliasAnalysis::NewHloBuffer() {
   HloBuffer::Id buffer_id = next_buffer_id_++;
-  auto it_added = buffers_.emplace(std::piecewise_construct,
+  auto emplaced = buffers_.emplace(std::piecewise_construct,
                                    std::forward_as_tuple(buffer_id),
                                    std::forward_as_tuple(buffer_id));
-  CHECK(it_added.second);
-
-  return it_added.first->second;
+  CHECK(emplaced.second);
+  return &emplaced.first->second;
 }
 
 string HloAliasAnalysis::ToString() const {
@@ -282,34 +183,18 @@ string HloAliasAnalysis::ToString() const {
        module_->computations()) {
     for (const std::unique_ptr<HloInstruction>& instruction :
          computation->instructions()) {
-      StrAppend(&out, "    ", instruction->FullyQualifiedName(), ":\n");
-      auto buffer_str = [this](const HloBuffer& buffer) {
-        return StrCat(
-            "Buffer ", buffer.id(), ", values: ",
-            Join(buffer.value_ids(), ", ",
-                 [this](string* out, HloValue::Id value_id) {
-                   StrAppend(
-                       out,
-                       dataflow_analysis_->GetValue(value_id).ToShortString());
-                 }));
-      };
+      StrAppend(&out, "    ", instruction->name(), ":\n");
       if (ShapeUtil::IsTuple(instruction->shape())) {
         GetInstructionBufferSet(instruction.get())
-            .ForEachElement([this, &out, &buffer_str](
-                                const ShapeIndex& index,
-                                const HloBufferSet& buffer_set) {
+            .ForEachElement([this, &out](const ShapeIndex& index,
+                                         const HloBufferSet& buffer_set) {
               StrAppend(&out, "      tuple index ", index.ToString(), ":\n");
-              for (HloBuffer::Id buffer_id : buffer_set.buffer_ids()) {
-                StrAppend(&out, "        ", buffer_str(GetBuffer(buffer_id)),
-                          "\n");
-              }
+              StrAppend(&out, "        ", buffer_set.ToString(), "\n");
             });
       } else {
         const HloBufferSet top_level_buffer_set =
             GetBufferSet(instruction.get());
-        for (HloBuffer::Id buffer_id : top_level_buffer_set.buffer_ids()) {
-          StrAppend(&out, "      ", buffer_str(GetBuffer(buffer_id)), "\n");
-        }
+        StrAppend(&out, "      ", top_level_buffer_set.ToString(), "\n");
       }
     }
   }
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.h b/tensorflow/compiler/xla/service/hlo_alias_analysis.h
index 0fa35827b5ecbfd3987a17e60c3b395b36b16b2e..c70ec38c990ff5ea863f737b48cdcbcd49d513c2 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.h
@@ -16,182 +16,23 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_ALIAS_ANALYSIS_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_ALIAS_ANALYSIS_H_
 
-#include <stddef.h>
-#include <iosfwd>
 #include <memory>
-#include <set>
 #include <string>
 #include <vector>
 
-#include "tensorflow/compiler/xla/service/call_graph.h"
-#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_buffer.h"
 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/logical_buffer.h"
-#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
-// A container which can hold one or more HloValues. An HLO buffer abstractly
-// represents the allocation which HLO instructions write into and read
-// from. Generally there is a one-to-one correspondence between HloBuffers and
-// HloValue where each HloValue in the module is held in a unique HloBuffer. An
-// exception is the while instruction which updates the loop state in-place. In
-// this case, we have a single HloBuffer for each HloLocation in the loop state,
-// but multiple HloValues. For example:
-//
-//   %init = ...
-//   %while = While(%init, body, condition)
-//
-//  body:
-//   %body_param = Param(0)
-//     ...
-//   %body_root = ...
-//
-//  condition:
-//   %cond_param = Param(0)
-//     ...
-//
-// For simplicity, assume that %while is array-shaped. In this case, we have a
-// single HloBuffer which holds the following HloValues: HloValue{%init},
-// HloValue{%while}, HloValue{%body_param}, HloValue{%body_root}, and
-// HloValue{%cond_param}.
-//
-// HloBuffers may appear at different HloLocations in the module mirroring the
-// same propery of HloValues. For example:
-//
-//   %sub = Sub(...)
-//   %add = Add(...)
-//   %tuple = Tuple(%add, %sub)
-//   %gte = GetTupleElement(%tuple, 0)
-//
-// In this case, the HloBuffer containing %add appears at the following
-// locations: HloLocation{%add, {}}, HloLocation{%tuple, {0}}, and
-// HloLocation{%gte, {}}.
-//
-// Different HloLocations which share the same HloBuffer indicate mandatory
-// aliasing in the HLO module. These locations must share the same memory
-// allocation for correctness (the backends rely on this property). This differs
-// from incidental aliasing introduced by memory reuse in BufferAssignment where
-// different instructions may happen to get the same allocation.
-class HloBuffer {
- public:
-  using Id = int64;
-
-  HloBuffer(int64 id) : id_(id) {}
-
-  // Return the unique identifier for this HloBuffer.
-  int64 id() const { return id_; }
-
-  // Add a value to the set of values held by this buffer. Also adds the
-  // HloLocations of the value to the locations vector of the buffer. If the
-  // buffer already contains this value, then this method is a nop.
-  void AddValue(const HloValue& value);
-
-  // Return the IDs of all values contained in this buffer.
-  const std::vector<HloValue::Id>& value_ids() const { return value_ids_; }
-
-  // Return the locations (output of which instruction and at what index) where
-  // the buffer is used. This is exactly the union of the locations of the
-  // HloValues contained by the buffer.
-  const std::vector<HloLocation>& locations() const { return locations_; }
-
-  string ToString() const;
-
-  bool operator==(const HloBuffer& other) const;
-  bool operator!=(const HloBuffer& other) const { return !(*this == other); }
-
- private:
-  // Unique identifier for this HloBuffer.
-  const Id id_;
-
-  // The set of values contained in the this buffer.
-  std::vector<HloValue::Id> value_ids_;
-
-  // The set of locations where this buffer is used.
-  std::vector<HloLocation> locations_;
-};
-
-std::ostream& operator<<(std::ostream& out, const HloBuffer& buffer);
-
-// A class representing the set of possible HloBuffers at a particular
-// HloLocation (shape index in the output of an instruction) in the XLA
-// graph. In most cases, the buffer set will have a single HloBuffer indicating
-// that the HloBuffer which appears at that particular location is known
-// unambiguously at compile-time.  However, tuple-shaped Select instructions can
-// introduce ambiguity as the tuple elements of the operands are passed by
-// reference into the output of the Select. For example:
-//
-//   %pred = ...
-//   %tuple0 = Tuple(%a, %b)
-//   %tuple1 = Tuple(%x, %y)
-//   %select = Select(%pred, %tuple0, %tuple1)
-//
-// In this case the HloBufferSet at HloLocation{%select, {0}} contains the
-// HloBuffer holding %a and the HloBuffer holding %x.
-class HloBufferSet {
- public:
-  HloBufferSet() = default;
-
-  // Add the given buffer to this buffer set. If the buffer already exists in
-  // the set, then this is a NOP.
-  void AddBuffer(HloBuffer::Id buffer_id);
-
-  // Removes the given buffer from this buffer set. CHECK fails in the buffer is
-  // not contained in this set.
-  void RemoveBufferOrDie(HloBuffer::Id buffer_id);
-
-  // Returns the unique buffer in this set. CHECK fails if the set does not
-  // contain exactly one buffer.
-  HloBuffer::Id GetUniqueBufferId() const {
-    CHECK_EQ(buffer_ids().size(), 1);
-    return buffer_ids()[0];
-  }
-
-  // Returns the IDs of the HloBuffers contained in this buffer set.
-  const std::vector<HloBuffer::Id>& buffer_ids() const { return buffer_ids_; }
-
-  string ToString() const;
-
- private:
-  // The IDs of the HloBuffers containted in this buffer set.
-  std::vector<HloBuffer::Id> buffer_ids_;
-};
-
-std::ostream& operator<<(std::ostream& out, const HloBufferSet& buffer_set);
-
-// A class collecting the HloBuffers in the output of an HLO instruction. For
-// array-shaped instructions, an InstructionBufferSet trivially holds a single
-// HloBufferSet. Tuple-shaped InstructionBufferSets hold multiple
-// HloBufferSets.
-class InstructionBufferSet : public ShapeTree<HloBufferSet> {
- public:
-  InstructionBufferSet(const Shape& shape) : ShapeTree<HloBufferSet>(shape) {}
-
-  // Returns true if any HloBufferSet contained in this InstructionBufferSet
-  // is not a singleton.
-  bool IsAmbiguous() const;
-
-  // Returns true if any HloBuffer appears in more than one HloBufferSet
-  // contained in this InstructionBufferSet.
-  bool IsDistinct() const;
-
-  string ToString() const;
-};
-
-std::ostream& operator<<(std::ostream& out,
-                         const InstructionBufferSet& buffer_set);
-
 class HloAliasAnalysis {
  public:
   static StatusOr<std::unique_ptr<HloAliasAnalysis>> Run(HloModule* module);
@@ -204,7 +45,7 @@ class HloAliasAnalysis {
   InstructionBufferSet& GetInstructionBufferSet(
       const HloInstruction* instruction);
 
-  // Return the HloBufferSet for the given location.
+  // Return the HloBufferSet for the given position.
   const HloBufferSet& GetBufferSet(const HloInstruction* instruction,
                                    const ShapeIndex& index = {}) const;
   HloBufferSet& GetBufferSet(const HloInstruction* instruction,
@@ -218,15 +59,15 @@ class HloAliasAnalysis {
     return buffers_.at(buffer_id);
   }
 
-  // Returns the unique buffer at the given location. CHECK fails if the buffer
-  // set at that location does not contain exactly one buffer.
+  // Returns the unique buffer at the given position. CHECK fails if the buffer
+  // set at that position does not contain exactly one buffer.
   const HloBuffer& GetUniqueBufferAt(const HloInstruction* instruction,
                                      const ShapeIndex& index = {}) const {
-    return GetBuffer(GetBufferSet(instruction, index).GetUniqueBufferId());
+    return GetBufferSet(instruction, index).GetUniqueBuffer();
   }
   HloBuffer& GetUniqueBufferAt(const HloInstruction* instruction,
                                const ShapeIndex& index = {}) {
-    return GetBuffer(GetBufferSet(instruction, index).GetUniqueBufferId());
+    return GetBuffer(GetBufferSet(instruction, index).GetUniqueBuffer().id());
   }
 
   // Return a vector of all HloBuffers stabily sorted by HloBuffer::Id. This
@@ -242,8 +83,8 @@ class HloAliasAnalysis {
  protected:
   HloAliasAnalysis(HloModule* module);
 
-  // Creates a new HloBuffer and returns a reference to it.
-  HloBuffer& NewHloBuffer();
+  // Returns a new HloBuffer.
+  HloBuffer* NewHloBuffer();
 
   // Construct the initial set of buffer sets where an HloBuffer is created for
   // each HloValue in the module.
@@ -282,7 +123,9 @@ class HloAliasAnalysis {
   // The underlying dataflow analysis used by this alias analysis.
   std::unique_ptr<HloDataflowAnalysis> dataflow_analysis_;
 
-  // The map of all HloBuffers in the module.
+  // The map of all HloBuffers in the module. We pass around pointers to the
+  // mapped HloBuffers, so the underlying container must keep them valid despite
+  // mutations touching other map entries.
   std::unordered_map<HloBuffer::Id, HloBuffer> buffers_;
 
   // A map from instruction to its InstructionBufferSet.
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
index 24c467d411b93be32bd884a8bb92ef288d9c2f10..3c5b2e03b762be2247a5c58b13915ae883c93622 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
@@ -37,22 +37,22 @@ using ::testing::UnorderedElementsAre;
 
 class HloAliasAnalysisTest : public HloTestBase {
  protected:
-  HloAliasAnalysisTest() : module_(TestName()) {}
+  HloAliasAnalysisTest() : module_(CreateNewModule()) {}
 
   // Run alias analysis on the member module. For convenience returns a
   // reference to the generated analysis stored in analysis_.
   const HloAliasAnalysis& RunAnalysis() {
-    analysis_ = HloAliasAnalysis::Run(&module_).ConsumeValueOrDie();
+    analysis_ = HloAliasAnalysis::Run(module_.get()).ConsumeValueOrDie();
     return *analysis_;
   }
 
-  // Return a vector of the buffers in the buffer set at the current location.
+  // Return a vector of the buffers in the buffer set at the current position.
   std::vector<HloBuffer> GetBuffersAt(const HloInstruction* instruction,
                                       const ShapeIndex& index = {}) const {
     std::vector<HloBuffer> buffers;
-    for (HloBuffer::Id buffer_id :
-         analysis_->GetBufferSet(instruction, index).buffer_ids()) {
-      buffers.push_back(analysis_->GetBuffer(buffer_id));
+    for (const HloBuffer* buffer :
+         analysis_->GetBufferSet(instruction, index).buffers()) {
+      buffers.push_back(*buffer);
     }
     return buffers;
   }
@@ -60,24 +60,41 @@ class HloAliasAnalysisTest : public HloTestBase {
   // Return a vector containing all of the HloValues in the given buffer.
   std::vector<HloValue> GetValuesInBuffer(const HloBuffer& buffer) {
     std::vector<HloValue> values;
-    for (HloValue::Id value_id : buffer.value_ids()) {
-      values.push_back(analysis_->dataflow_analysis().GetValue(value_id));
+    for (const HloValue* value : buffer.values()) {
+      values.push_back(*value);
     }
     return values;
   }
 
-  // Return the HloValue defined at the given location.
+  // Return the HloValue defined at the given position.
   const HloValue& GetValueDefinedAt(const HloInstruction* instruction,
                                     const ShapeIndex& index = {}) const {
     return analysis_->dataflow_analysis().GetValueDefinedAt(instruction, index);
   }
 
-  const HloValue& GetUniqueValueInBuffer(const HloBuffer& buffer) const {
-    CHECK_EQ(buffer.value_ids().size(), 1);
-    return analysis_->dataflow_analysis().GetValue(buffer.value_ids()[0]);
+  // Returns true if any values held in the same buffer interfere. Generally, in
+  // the compiler pipeline copy-insertion will guarantee that this interference
+  // never occurs, but HLO graphs with interference can be explicitly
+  // constructed.
+  bool AnyValuesInSameBufferInterfere() {
+    DependencyHloOrdering ordering(module_.get());
+    for (const HloBuffer* buffer : analysis_->buffers()) {
+      for (const HloValue* value_a : buffer->values()) {
+        for (const HloValue* value_b : buffer->values()) {
+          if (*value_a != *value_b &&
+              analysis_->dataflow_analysis().MayInterfere(*value_a, *value_b,
+                                                          ordering)) {
+            VLOG(1) << *value_a << " interferes with " << *value_b
+                    << " in buffer: " << *buffer;
+            return true;
+          }
+        }
+      }
+    }
+    return false;
   }
 
-  HloModule module_;
+  std::unique_ptr<HloModule> module_;
   std::unique_ptr<HloAliasAnalysis> analysis_;
 
   const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {});
@@ -87,12 +104,12 @@ TEST_F(HloAliasAnalysisTest, BinaryOperation) {
   // Test the analysis on a single binary operation (Add).
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape_, HloOpcode::kAdd, constant1, constant2));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -101,12 +118,14 @@ TEST_F(HloAliasAnalysisTest, BinaryOperation) {
   // All of the buffer sets should trivially contain a single buffer containing
   // a single value.
   for (const HloInstruction* instruction : {constant1, constant2, add}) {
-    EXPECT_EQ(GetUniqueValueInBuffer(analysis.GetUniqueBufferAt(instruction)),
+    EXPECT_EQ(analysis.GetUniqueBufferAt(instruction).GetUniqueValue(),
               GetValueDefinedAt(instruction));
   }
 
   EXPECT_FALSE(analysis.GetInstructionBufferSet(add).IsAmbiguous());
   EXPECT_TRUE(analysis.GetInstructionBufferSet(add).IsDistinct());
+
+  EXPECT_FALSE(AnyValuesInSameBufferInterfere());
 }
 
 TEST_F(HloAliasAnalysisTest, TupleAndGtes) {
@@ -124,22 +143,19 @@ TEST_F(HloAliasAnalysisTest, TupleAndGtes) {
       HloInstruction::CreateGetTupleElement(scalar_shape_, tuple, 1));
   builder.AddInstruction(
       HloInstruction::CreateBinary(scalar_shape_, HloOpcode::kAdd, gte0, gte1));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
   EXPECT_EQ(analysis.buffers().size(), 4);
 
   // Verify the expected aliasing of the tuple elements.
-  EXPECT_EQ(
-      GetUniqueValueInBuffer(analysis.GetUniqueBufferAt(tuple, /*index=*/{})),
-      GetValueDefinedAt(tuple, /*index=*/{}));
-  EXPECT_EQ(
-      GetUniqueValueInBuffer(analysis.GetUniqueBufferAt(tuple, /*index=*/{0})),
-      GetValueDefinedAt(param0));
-  EXPECT_EQ(
-      GetUniqueValueInBuffer(analysis.GetUniqueBufferAt(tuple, /*index=*/{1})),
-      GetValueDefinedAt(param1));
+  EXPECT_EQ(analysis.GetUniqueBufferAt(tuple, /*index=*/{}).GetUniqueValue(),
+            GetValueDefinedAt(tuple, /*index=*/{}));
+  EXPECT_EQ(analysis.GetUniqueBufferAt(tuple, /*index=*/{0}).GetUniqueValue(),
+            GetValueDefinedAt(param0));
+  EXPECT_EQ(analysis.GetUniqueBufferAt(tuple, /*index=*/{1}).GetUniqueValue(),
+            GetValueDefinedAt(param1));
 
   // The tuple operand, tuple element, and result of the GTE instruction should
   // all be the same buffer.
@@ -148,14 +164,16 @@ TEST_F(HloAliasAnalysisTest, TupleAndGtes) {
   EXPECT_EQ(analysis.GetUniqueBufferAt(param0),
             analysis.GetUniqueBufferAt(gte0));
 
-  // Verify the locations of an aliased buffer.
+  // Verify the positions of an aliased buffer.
   EXPECT_THAT(
-      analysis.GetUniqueBufferAt(param0).locations(),
-      UnorderedElementsAre(HloLocation{param0, {}}, HloLocation{tuple, {0}},
-                           HloLocation{gte0, {}}));
+      analysis.GetUniqueBufferAt(param0).positions(),
+      UnorderedElementsAre(HloPosition{param0, {}}, HloPosition{tuple, {0}},
+                           HloPosition{gte0, {}}));
 
   EXPECT_FALSE(analysis.GetInstructionBufferSet(tuple).IsAmbiguous());
   EXPECT_TRUE(analysis.GetInstructionBufferSet(tuple).IsDistinct());
+
+  EXPECT_FALSE(AnyValuesInSameBufferInterfere());
 }
 
 TEST_F(HloAliasAnalysisTest, NondistinctTuple) {
@@ -168,17 +186,19 @@ TEST_F(HloAliasAnalysisTest, NondistinctTuple) {
   // param0 is included twice in the tuple.
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({param0, param1, param0}));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
   EXPECT_THAT(
-      analysis.GetUniqueBufferAt(param0).locations(),
-      UnorderedElementsAre(HloLocation{param0, {}}, HloLocation{tuple, {0}},
-                           HloLocation{tuple, {2}}));
+      analysis.GetUniqueBufferAt(param0).positions(),
+      UnorderedElementsAre(HloPosition{param0, {}}, HloPosition{tuple, {0}},
+                           HloPosition{tuple, {2}}));
 
   EXPECT_FALSE(analysis.GetInstructionBufferSet(tuple).IsAmbiguous());
   EXPECT_FALSE(analysis.GetInstructionBufferSet(tuple).IsDistinct());
+
+  EXPECT_FALSE(AnyValuesInSameBufferInterfere());
 }
 
 TEST_F(HloAliasAnalysisTest, SingleCall) {
@@ -192,31 +212,33 @@ TEST_F(HloAliasAnalysisTest, SingleCall) {
   auto add = subbuilder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape_, HloOpcode::kAdd, subparam0, subparam1));
   HloComputation* called_computation =
-      module_.AddEmbeddedComputation(subbuilder.Build());
+      module_->AddEmbeddedComputation(subbuilder.Build());
 
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto call = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {constant1, constant2}, called_computation));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
   // Verify aliasing of the kCall operands and the subcomputation parameters.
-  EXPECT_THAT(analysis.GetUniqueBufferAt(constant1).locations(),
-              UnorderedElementsAre(HloLocation{constant1, {}},
-                                   HloLocation{subparam0, {}}));
-  EXPECT_THAT(analysis.GetUniqueBufferAt(constant2).locations(),
-              UnorderedElementsAre(HloLocation{constant2, {}},
-                                   HloLocation{subparam1, {}}));
+  EXPECT_THAT(analysis.GetUniqueBufferAt(constant1).positions(),
+              UnorderedElementsAre(HloPosition{constant1, {}},
+                                   HloPosition{subparam0, {}}));
+  EXPECT_THAT(analysis.GetUniqueBufferAt(constant2).positions(),
+              UnorderedElementsAre(HloPosition{constant2, {}},
+                                   HloPosition{subparam1, {}}));
 
   // The subcomputation root and the kCall itself should alias.
   EXPECT_THAT(
-      analysis.GetUniqueBufferAt(add).locations(),
-      UnorderedElementsAre(HloLocation{add, {}}, HloLocation{call, {}}));
+      analysis.GetUniqueBufferAt(add).positions(),
+      UnorderedElementsAre(HloPosition{add, {}}, HloPosition{call, {}}));
+
+  EXPECT_FALSE(AnyValuesInSameBufferInterfere());
 }
 
 TEST_F(HloAliasAnalysisTest, ComputationCalledTwice) {
@@ -229,35 +251,35 @@ TEST_F(HloAliasAnalysisTest, ComputationCalledTwice) {
   auto add = subbuilder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape_, HloOpcode::kAdd, subparam0, subparam1));
   HloComputation* called_computation =
-      module_.AddEmbeddedComputation(subbuilder.Build());
+      module_->AddEmbeddedComputation(subbuilder.Build());
 
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto call1 = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {constant1, constant2}, called_computation));
   auto call2 = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {call1, constant2}, called_computation));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
-  EXPECT_THAT(analysis.GetUniqueBufferAt(constant1).locations(),
-              UnorderedElementsAre(HloLocation{constant1, {}},
-                                   HloLocation{subparam0, {}}));
-  EXPECT_THAT(analysis.GetUniqueBufferAt(constant2).locations(),
-              UnorderedElementsAre(HloLocation{constant2, {}},
-                                   HloLocation{subparam1, {}}));
+  EXPECT_THAT(analysis.GetUniqueBufferAt(constant1).positions(),
+              UnorderedElementsAre(HloPosition{constant1, {}},
+                                   HloPosition{subparam0, {}}));
+  EXPECT_THAT(analysis.GetUniqueBufferAt(constant2).positions(),
+              UnorderedElementsAre(HloPosition{constant2, {}},
+                                   HloPosition{subparam1, {}}));
 
   // The 'add' (root of the subcomputation) aliases the two call instruction,
   // and the first parameter of the subcomputation because 'call1' it is passed
   // as an argument to the subcomputation in 'call2'.
   EXPECT_THAT(
-      analysis.GetUniqueBufferAt(add).locations(),
-      UnorderedElementsAre(HloLocation{add, {}}, HloLocation{call1, {}},
-                           HloLocation{subparam0, {}}, HloLocation{call2, {}}));
+      analysis.GetUniqueBufferAt(add).positions(),
+      UnorderedElementsAre(HloPosition{add, {}}, HloPosition{call1, {}},
+                           HloPosition{subparam0, {}}, HloPosition{call2, {}}));
 
   EXPECT_THAT(GetBuffersAt(subparam0),
               UnorderedElementsAre(analysis.GetUniqueBufferAt(constant1),
@@ -269,6 +291,8 @@ TEST_F(HloAliasAnalysisTest, ComputationCalledTwice) {
   EXPECT_FALSE(analysis.GetInstructionBufferSet(subparam1).IsAmbiguous());
   EXPECT_TRUE(analysis.GetInstructionBufferSet(subparam0).IsDistinct());
   EXPECT_TRUE(analysis.GetInstructionBufferSet(subparam1).IsDistinct());
+
+  EXPECT_FALSE(AnyValuesInSameBufferInterfere());
 }
 
 TEST_F(HloAliasAnalysisTest, SingleWhile) {
@@ -303,48 +327,48 @@ TEST_F(HloAliasAnalysisTest, SingleWhile) {
       scalar_shape_, HloOpcode::kAdd, body_element_0, body_element_1));
   auto body_tuple = body_builder.AddInstruction(
       HloInstruction::CreateTuple({body_element_0, add}));
-  HloComputation* body = module_.AddEmbeddedComputation(body_builder.Build());
+  HloComputation* body = module_->AddEmbeddedComputation(body_builder.Build());
 
   // Condition computation trivially returns a constant "false".
   auto cond_builder = HloComputation::Builder("condition");
   auto cond_param = cond_builder.AddInstruction(
       HloInstruction::CreateParameter(0, tuple_shape, "param"));
   cond_builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   HloComputation* condition =
-      module_.AddEmbeddedComputation(cond_builder.Build());
+      module_->AddEmbeddedComputation(cond_builder.Build());
 
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
   auto xla_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, body, tuple));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
-  // Verify the locations of the aliased while buffers.
-  EXPECT_THAT(analysis.GetUniqueBufferAt(xla_while, /*index=*/{}).locations(),
+  // Verify the positions of the aliased while buffers.
+  EXPECT_THAT(analysis.GetUniqueBufferAt(xla_while, /*index=*/{}).positions(),
               UnorderedElementsAre(
-                  HloLocation{tuple, {}}, HloLocation{xla_while, {}},
-                  HloLocation{body_param, {}}, HloLocation{body_tuple, {}},
-                  HloLocation{cond_param, {}}));
-  EXPECT_THAT(analysis.GetUniqueBufferAt(xla_while, /*index=*/{0}).locations(),
+                  HloPosition{tuple, {}}, HloPosition{xla_while, {}},
+                  HloPosition{body_param, {}}, HloPosition{body_tuple, {}},
+                  HloPosition{cond_param, {}}));
+  EXPECT_THAT(analysis.GetUniqueBufferAt(xla_while, /*index=*/{0}).positions(),
               UnorderedElementsAre(
-                  HloLocation{constant1, {}}, HloLocation{tuple, {0}},
-                  HloLocation{xla_while, {0}}, HloLocation{body_param, {0}},
-                  HloLocation{body_element_0, {}}, HloLocation{body_tuple, {0}},
-                  HloLocation{cond_param, {0}}));
-  EXPECT_THAT(analysis.GetUniqueBufferAt(xla_while, /*index=*/{1}).locations(),
+                  HloPosition{constant1, {}}, HloPosition{tuple, {0}},
+                  HloPosition{xla_while, {0}}, HloPosition{body_param, {0}},
+                  HloPosition{body_element_0, {}}, HloPosition{body_tuple, {0}},
+                  HloPosition{cond_param, {0}}));
+  EXPECT_THAT(analysis.GetUniqueBufferAt(xla_while, /*index=*/{1}).positions(),
               UnorderedElementsAre(
-                  HloLocation{constant2, {}}, HloLocation{tuple, {1}},
-                  HloLocation{xla_while, {1}}, HloLocation{body_param, {1}},
-                  HloLocation{body_element_1, {}}, HloLocation{add, {}},
-                  HloLocation{body_tuple, {1}}, HloLocation{cond_param, {1}}));
+                  HloPosition{constant2, {}}, HloPosition{tuple, {1}},
+                  HloPosition{xla_while, {1}}, HloPosition{body_param, {1}},
+                  HloPosition{body_element_1, {}}, HloPosition{add, {}},
+                  HloPosition{body_tuple, {1}}, HloPosition{cond_param, {1}}));
 
   EXPECT_THAT(
       GetValuesInBuffer(analysis.GetUniqueBufferAt(xla_while, /*index=*/{0})),
@@ -356,6 +380,8 @@ TEST_F(HloAliasAnalysisTest, SingleWhile) {
                            GetValueDefinedAt(body_param, {1}),
                            GetValueDefinedAt(cond_param, {1}),
                            GetValueDefinedAt(add)));
+
+  EXPECT_FALSE(AnyValuesInSameBufferInterfere());
 }
 
 TEST_F(HloAliasAnalysisTest, SequentialWhiles) {
@@ -392,21 +418,21 @@ TEST_F(HloAliasAnalysisTest, SequentialWhiles) {
       scalar_shape_, HloOpcode::kAdd, body_element_0, body_element_1));
   body_builder.AddInstruction(
       HloInstruction::CreateTuple({body_element_0, add}));
-  HloComputation* body = module_.AddEmbeddedComputation(body_builder.Build());
+  HloComputation* body = module_->AddEmbeddedComputation(body_builder.Build());
 
   auto cond_builder = HloComputation::Builder("condition");
   cond_builder.AddInstruction(
       HloInstruction::CreateParameter(0, tuple_shape, "param"));
   cond_builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   HloComputation* condition =
-      module_.AddEmbeddedComputation(cond_builder.Build());
+      module_->AddEmbeddedComputation(cond_builder.Build());
 
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
   auto xla_while0 = builder.AddInstruction(
@@ -415,7 +441,7 @@ TEST_F(HloAliasAnalysisTest, SequentialWhiles) {
       HloInstruction::CreateWhile(tuple_shape, condition, body, xla_while0));
   auto xla_while2 = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, body, xla_while1));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -449,13 +475,21 @@ TEST_F(HloAliasAnalysisTest, NestedWhiles) {
   const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
-  auto cond_builder = HloComputation::Builder("condition");
-  cond_builder.AddInstruction(
-      HloInstruction::CreateParameter(0, tuple_shape, "param"));
-  cond_builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
-  HloComputation* condition =
-      module_.AddEmbeddedComputation(cond_builder.Build());
+  auto build_cond_computation = [&tuple_shape]() {
+    auto cond_builder = HloComputation::Builder("condition");
+    cond_builder.AddInstruction(
+        HloInstruction::CreateParameter(0, tuple_shape, "param"));
+    cond_builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+    return cond_builder.Build();
+  };
+  // Build separate condition computations so the call graph is flat. The
+  // callgraph is always flattened in the compiler pipeline, and the flattened
+  // callgraph enables representative interference analysis.
+  HloComputation* condition1 =
+      module_->AddEmbeddedComputation(build_cond_computation());
+  HloComputation* condition2 =
+      module_->AddEmbeddedComputation(build_cond_computation());
 
   // Element 0 passes transparently through the body.
   auto inner_builder = HloComputation::Builder("inner_body");
@@ -470,7 +504,7 @@ TEST_F(HloAliasAnalysisTest, NestedWhiles) {
   inner_builder.AddInstruction(
       HloInstruction::CreateTuple({inner_element_0, add}));
   HloComputation* inner_body =
-      module_.AddEmbeddedComputation(inner_builder.Build());
+      module_->AddEmbeddedComputation(inner_builder.Build());
 
   // Element 1 passes transparently through the body.
   auto outer_builder = HloComputation::Builder("outer_body");
@@ -485,20 +519,20 @@ TEST_F(HloAliasAnalysisTest, NestedWhiles) {
   auto outer_tuple = outer_builder.AddInstruction(
       HloInstruction::CreateTuple({negate, outer_element_1}));
   auto nested_while = outer_builder.AddInstruction(HloInstruction::CreateWhile(
-      tuple_shape, condition, inner_body, outer_tuple));
+      tuple_shape, condition1, inner_body, outer_tuple));
   HloComputation* outer_body =
-      module_.AddEmbeddedComputation(outer_builder.Build());
+      module_->AddEmbeddedComputation(outer_builder.Build());
 
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
   auto entry_while = builder.AddInstruction(
-      HloInstruction::CreateWhile(tuple_shape, condition, outer_body, tuple));
-  module_.AddEntryComputation(builder.Build());
+      HloInstruction::CreateWhile(tuple_shape, condition2, outer_body, tuple));
+  module_->AddEntryComputation(builder.Build());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -515,6 +549,8 @@ TEST_F(HloAliasAnalysisTest, NestedWhiles) {
             analysis.GetUniqueBufferAt(nested_while, /*index=*/{1}));
   EXPECT_EQ(analysis.GetUniqueBufferAt(constant2),
             analysis.GetUniqueBufferAt(inner_element_1));
+
+  EXPECT_FALSE(AnyValuesInSameBufferInterfere());
 }
 
 TEST_F(HloAliasAnalysisTest, SwizzlingWhile) {
@@ -548,32 +584,32 @@ TEST_F(HloAliasAnalysisTest, SwizzlingWhile) {
       HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 2));
   body_builder.AddInstruction(HloInstruction::CreateTuple(
       {body_element_1, body_element_2, body_element_0}));
-  HloComputation* body = module_.AddEmbeddedComputation(body_builder.Build());
+  HloComputation* body = module_->AddEmbeddedComputation(body_builder.Build());
 
   auto cond_builder = HloComputation::Builder("condition");
   cond_builder.AddInstruction(
       HloInstruction::CreateParameter(0, tuple_shape, "param"));
   auto cond_constant = cond_builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   HloComputation* condition =
-      module_.AddEmbeddedComputation(cond_builder.Build());
+      module_->AddEmbeddedComputation(cond_builder.Build());
 
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto constant3 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(3.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(3.0)));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2, constant3}));
   auto xla_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, body, tuple));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
-  // The swizzling while makes most locations in the module alias leaving only 3
+  // The swizzling while makes most positions in the module alias leaving only 3
   // HloBuffers.
   EXPECT_THAT(
       analysis.buffers(),
@@ -593,6 +629,10 @@ TEST_F(HloAliasAnalysisTest, SwizzlingWhile) {
             analysis.GetUniqueBufferAt(constant2));
   EXPECT_EQ(analysis.GetUniqueBufferAt(constant1),
             analysis.GetUniqueBufferAt(constant3));
+
+  // All elements in of the loop state tuple are forced into the same buffer
+  // resulting liveness interference.
+  EXPECT_TRUE(AnyValuesInSameBufferInterfere());
 }
 
 TEST_F(HloAliasAnalysisTest, TupleSelect) {
@@ -600,15 +640,15 @@ TEST_F(HloAliasAnalysisTest, TupleSelect) {
   // instruction.
   auto builder = HloComputation::Builder(TestName());
   auto pred = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto constant3 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(3.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(3.0)));
   auto constant4 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(4.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(4.0)));
   auto tuple1 =
       builder.AddInstruction(HloInstruction::CreateTuple({constant1}));
   auto tuple2 =
@@ -627,24 +667,24 @@ TEST_F(HloAliasAnalysisTest, TupleSelect) {
   auto select1234 = builder.AddInstruction(HloInstruction::CreateTernary(
       tuple_shape, HloOpcode::kSelect, pred, select12, select34));
 
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
   // Verify the buffer sets of each select.
-  EXPECT_THAT(analysis.GetBufferSet(select11, /*index=*/{0}).buffer_ids(),
-              UnorderedElementsAre(analysis.GetUniqueBufferAt(constant1).id()));
-  EXPECT_THAT(analysis.GetBufferSet(select12, /*index=*/{0}).buffer_ids(),
-              UnorderedElementsAre(analysis.GetUniqueBufferAt(constant1).id(),
-                                   analysis.GetUniqueBufferAt(constant2).id()));
-  EXPECT_THAT(analysis.GetBufferSet(select34, /*index=*/{0}).buffer_ids(),
-              UnorderedElementsAre(analysis.GetUniqueBufferAt(constant3).id(),
-                                   analysis.GetUniqueBufferAt(constant4).id()));
-  EXPECT_THAT(analysis.GetBufferSet(select1234, /*index=*/{0}).buffer_ids(),
-              UnorderedElementsAre(analysis.GetUniqueBufferAt(constant1).id(),
-                                   analysis.GetUniqueBufferAt(constant2).id(),
-                                   analysis.GetUniqueBufferAt(constant3).id(),
-                                   analysis.GetUniqueBufferAt(constant4).id()));
+  EXPECT_THAT(GetBuffersAt(select11, /*index=*/{0}),
+              UnorderedElementsAre(analysis.GetUniqueBufferAt(constant1)));
+  EXPECT_THAT(GetBuffersAt(select12, /*index=*/{0}),
+              UnorderedElementsAre(analysis.GetUniqueBufferAt(constant1),
+                                   analysis.GetUniqueBufferAt(constant2)));
+  EXPECT_THAT(GetBuffersAt(select34, /*index=*/{0}),
+              UnorderedElementsAre(analysis.GetUniqueBufferAt(constant3),
+                                   analysis.GetUniqueBufferAt(constant4)));
+  EXPECT_THAT(GetBuffersAt(select1234, /*index=*/{0}),
+              UnorderedElementsAre(analysis.GetUniqueBufferAt(constant1),
+                                   analysis.GetUniqueBufferAt(constant2),
+                                   analysis.GetUniqueBufferAt(constant3),
+                                   analysis.GetUniqueBufferAt(constant4)));
 
   EXPECT_FALSE(analysis.GetInstructionBufferSet(select11).IsAmbiguous());
   EXPECT_TRUE(analysis.GetInstructionBufferSet(select12).IsAmbiguous());
@@ -655,6 +695,8 @@ TEST_F(HloAliasAnalysisTest, TupleSelect) {
   EXPECT_TRUE(analysis.GetInstructionBufferSet(select12).IsDistinct());
   EXPECT_TRUE(analysis.GetInstructionBufferSet(select34).IsDistinct());
   EXPECT_TRUE(analysis.GetInstructionBufferSet(select1234).IsDistinct());
+
+  EXPECT_FALSE(AnyValuesInSameBufferInterfere());
 }
 
 TEST_F(HloAliasAnalysisTest, TupleSelectToWhile) {
@@ -688,22 +730,22 @@ TEST_F(HloAliasAnalysisTest, TupleSelectToWhile) {
   auto negate = body_builder.AddInstruction(HloInstruction::CreateUnary(
       scalar_shape_, HloOpcode::kNegate, body_element));
   body_builder.AddInstruction(HloInstruction::CreateTuple({negate}));
-  HloComputation* body = module_.AddEmbeddedComputation(body_builder.Build());
+  HloComputation* body = module_->AddEmbeddedComputation(body_builder.Build());
 
   auto cond_builder = HloComputation::Builder("condition");
   auto cond_param = cond_builder.AddInstruction(
       HloInstruction::CreateParameter(0, tuple_shape, "param"));
   cond_builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   HloComputation* condition =
-      module_.AddEmbeddedComputation(cond_builder.Build());
+      module_->AddEmbeddedComputation(cond_builder.Build());
 
   auto pred = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto tuple1 =
       builder.AddInstruction(HloInstruction::CreateTuple({constant1}));
   auto tuple2 =
@@ -713,7 +755,7 @@ TEST_F(HloAliasAnalysisTest, TupleSelectToWhile) {
   auto xla_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, body, select));
 
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -736,17 +778,21 @@ TEST_F(HloAliasAnalysisTest, TupleSelectToWhile) {
 
   EXPECT_TRUE(analysis.GetInstructionBufferSet(select).IsDistinct());
   EXPECT_TRUE(analysis.GetInstructionBufferSet(xla_while).IsDistinct());
+
+  // The two operands of the select get flattened into the same buffer resulting
+  // in liveness interference.
+  EXPECT_TRUE(AnyValuesInSameBufferInterfere());
 }
 
 TEST_F(HloAliasAnalysisTest, Bitcast) {
   // Bitcasting a value should not produce a new buffer.
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto bitcast = builder.AddInstruction(HloInstruction::CreateUnary(
       scalar_shape_, HloOpcode::kBitcast, constant));
 
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
diff --git a/tensorflow/compiler/xla/service/hlo_buffer.cc b/tensorflow/compiler/xla/service/hlo_buffer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f1ad5daf79ce9e71652b8b6cba6e36ba57a838bc
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_buffer.cc
@@ -0,0 +1,145 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_buffer.h"
+
+#include <algorithm>
+#include <ostream>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+using ::tensorflow::str_util::Join;
+using ::tensorflow::strings::StrAppend;
+using ::tensorflow::strings::StrCat;
+
+void HloBuffer::AddValue(const HloValue& value) {
+  // If the value is already contained in this buffer, just return.
+  if (!values_.AddValue(&value)) {
+    return;
+  }
+
+  // Add all of the positions of the HloValue to this buffer.
+  for (const HloPosition& position : value.positions()) {
+    if (std::find(positions_.begin(), positions_.end(), position) ==
+        positions_.end()) {
+      positions_.push_back(position);
+    }
+  }
+}
+
+bool HloBuffer::operator==(const HloBuffer& other) const {
+  bool equal = id() == other.id();
+  if (equal) {
+    // DCHECK because these comparisons are expensive (linear time).
+    DCHECK(values_ == other.values_);
+    DCHECK(positions() == other.positions());
+  }
+  return equal;
+}
+
+string HloBuffer::ToString() const {
+  return StrCat("HloBuffer ", id_, ", values: ", values_.ToString());
+}
+
+std::ostream& operator<<(std::ostream& out, const HloBuffer& buffer) {
+  out << buffer.ToString();
+  return out;
+}
+
+void HloBufferSet::AddBuffer(const HloBuffer* buffer) {
+  auto it = std::lower_bound(buffers_.begin(), buffers_.end(), buffer,
+                             HloBuffer::IdLessThan);
+  if (it == buffers_.end() || (*it)->id() != buffer->id()) {
+    buffers_.insert(it, buffer);
+  }
+}
+
+void HloBufferSet::RemoveBufferOrDie(HloBuffer::Id buffer_id) {
+  auto it = std::lower_bound(buffers_.begin(), buffers_.end(), buffer_id,
+                             [](const HloBuffer* buffer, HloBuffer::Id id) {
+                               return buffer->id() < id;
+                             });
+  CHECK(it != buffers_.end() && (*it)->id() == buffer_id)
+      << "HloBuffer " << buffer_id << " doesn't exist in set: " << ToString();
+  buffers_.erase(it);
+}
+
+string HloBufferSet::ToString() const {
+  return StrCat(
+      "HloBufferSet, buffers: ",
+      Join(buffers_, ", ", [](string* result, const HloBuffer* buffer) {
+        result->append(buffer->ToString());
+      }));
+}
+
+std::ostream& operator<<(std::ostream& out, const HloBufferSet& buffer_set) {
+  out << buffer_set.ToString();
+  return out;
+}
+
+bool InstructionBufferSet::IsAmbiguous() const {
+  bool is_ambiguous = false;
+  ForEachElement(
+      [&is_ambiguous](const ShapeIndex& index, const HloBufferSet& buffer_set) {
+        is_ambiguous |= buffer_set.buffers().size() > 1;
+      });
+  return is_ambiguous;
+}
+
+bool InstructionBufferSet::IsDistinct() const {
+  bool is_distinct = true;
+  tensorflow::gtl::FlatSet<HloBuffer::Id> seen_ids;
+  ForEachElement([&is_distinct, &seen_ids](const ShapeIndex& /*index*/,
+                                           const HloBufferSet& buffer_set) {
+    for (const HloBuffer* buffer : buffer_set.buffers()) {
+      auto pair = seen_ids.insert(buffer->id());
+      if (!pair.second) {
+        is_distinct = false;
+      }
+    }
+  });
+  return is_distinct;
+}
+
+string InstructionBufferSet::ToString() const {
+  string out =
+      StrCat("InstructionBufferSet(", ShapeUtil::HumanString(shape()), ")\n");
+  ForEachElement([this, &out](const ShapeIndex& index,
+                              const HloBufferSet& value_set) {
+    StrAppend(&out, "  ", index.ToString(), " : ", value_set.ToString(), "\n");
+  });
+  return out;
+}
+
+std::ostream& operator<<(std::ostream& out,
+                         const InstructionBufferSet& buffer_set) {
+  out << buffer_set.ToString();
+  return out;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_buffer.h b/tensorflow/compiler/xla/service/hlo_buffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..f42d2f7720e44978fdbac8783e1b4b70e3bf3a01
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_buffer.h
@@ -0,0 +1,199 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_BUFFER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_BUFFER_H_
+
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/hlo_value.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace xla {
+
+// A container which can hold one or more HloValues. An HLO buffer abstractly
+// represents the allocation which HLO instructions write into and read
+// from. Generally there is a one-to-one correspondence between HloBuffers and
+// HloValue where each HloValue in the module is held in a unique HloBuffer. An
+// exception is the while instruction which updates the loop state in-place. In
+// this case, we have a single HloBuffer for each HloPosition in the loop state,
+// but multiple HloValues. For example:
+//
+//   %init = ...
+//   %while = While(%init, body, condition)
+//
+//  body:
+//   %body_param = Param(0)
+//     ...
+//   %body_root = ...
+//
+//  condition:
+//   %cond_param = Param(0)
+//     ...
+//
+// For simplicity, assume that %while is array-shaped. In this case, we have a
+// single HloBuffer which holds the following HloValues: HloValue{%init},
+// HloValue{%while}, HloValue{%body_param}, HloValue{%body_root}, and
+// HloValue{%cond_param}.
+//
+// HloBuffers may appear at different HloPositions in the module mirroring the
+// same propery of HloValues. For example:
+//
+//   %sub = Sub(...)
+//   %add = Add(...)
+//   %tuple = Tuple(%add, %sub)
+//   %gte = GetTupleElement(%tuple, 0)
+//
+// In this case, the HloBuffer containing %add appears at the following
+// positions: HloPosition{%add, {}}, HloPosition{%tuple, {0}}, and
+// HloPosition{%gte, {}}.
+//
+// Different HloPositions which share the same HloBuffer indicate mandatory
+// aliasing in the HLO module. These positions must share the same memory
+// allocation for correctness (the backends rely on this property). This differs
+// from incidental aliasing introduced by memory reuse in BufferAssignment where
+// different instructions may happen to get the same allocation.
+class HloBuffer {
+ public:
+  using Id = int64;
+
+  // Predicate comparing HloBuffers by increasing id, useful for std::sort.
+  static bool IdLessThan(const HloBuffer* a, const HloBuffer* b) {
+    return a->id() < b->id();
+  }
+
+  // Predicate comparing HloBuffers by equal id, useful for std::unique.
+  static bool IdEqual(const HloBuffer* a, const HloBuffer* b) {
+    return a->id() == b->id();
+  }
+
+  HloBuffer(Id id) : id_(id) {}
+
+  // Return the unique identifier for this HloBuffer.
+  Id id() const { return id_; }
+
+  // Add a value to the set of values held by this buffer. Also adds the
+  // HloPositions of the value to the positions vector of the buffer. If the
+  // buffer already contains this value, then this method is a nop.
+  void AddValue(const HloValue& value);
+
+  // Return all values contained in this buffer.
+  const std::vector<const HloValue*>& values() const {
+    return values_.values();
+  }
+
+  // Return the unique HLO value in the buffer. CHECK fails if the buffer does
+  // not contain exactly one value.
+  const HloValue& GetUniqueValue() const { return values_.GetUniqueValue(); }
+
+  // Return the positions (output of which instruction and at what index) where
+  // the buffer is used. This is exactly the union of the positions of the
+  // HloValues contained by the buffer.
+  const std::vector<HloPosition>& positions() const { return positions_; }
+
+  string ToString() const;
+
+  bool operator==(const HloBuffer& other) const;
+  bool operator!=(const HloBuffer& other) const { return !(*this == other); }
+
+ private:
+  // Unique identifier for this HloBuffer.
+  const Id id_;
+
+  // The set of values contained in this buffer.
+  HloValueSet values_;
+
+  // The set of positions where this buffer is used.
+  std::vector<HloPosition> positions_;
+};
+
+std::ostream& operator<<(std::ostream& out, const HloBuffer& buffer);
+
+// A class representing the set of possible HloBuffers at a particular
+// HloPosition (shape index in the output of an instruction) in the XLA
+// graph. In most cases, the buffer set will have a single HloBuffer indicating
+// that the HloBuffer which appears at that particular position is known
+// unambiguously at compile-time.  However, tuple-shaped Select instructions can
+// introduce ambiguity as the tuple elements of the operands are passed by
+// reference into the output of the Select. For example:
+//
+//   %pred = ...
+//   %tuple0 = Tuple(%a, %b)
+//   %tuple1 = Tuple(%x, %y)
+//   %select = Select(%pred, %tuple0, %tuple1)
+//
+// In this case the HloBufferSet at HloPosition{%select, {0}} contains the
+// HloBuffer holding %a and the HloBuffer holding %x.
+class HloBufferSet {
+ public:
+  HloBufferSet() = default;
+
+  // Add the given buffer to this buffer set. If the buffer already exists in
+  // the set, then this is a NOP.
+  void AddBuffer(const HloBuffer* buffer);
+
+  // Removes the given buffer from this buffer set. CHECK fails in the buffer is
+  // not contained in this set.
+  void RemoveBufferOrDie(HloBuffer::Id buffer_id);
+
+  // Returns the unique buffer in this set. CHECK fails if the set does not
+  // contain exactly one buffer.
+  const HloBuffer& GetUniqueBuffer() const {
+    CHECK_EQ(buffers_.size(), 1);
+    return *buffers_[0];
+  }
+
+  // Returns the vector of HloBuffers in the set, sorted by HloBuffer::Id.
+  const std::vector<const HloBuffer*>& buffers() const { return buffers_; }
+
+  string ToString() const;
+
+ private:
+  // HloBuffers sorted by HloBuffer::Id.
+  std::vector<const HloBuffer*> buffers_;
+};
+
+std::ostream& operator<<(std::ostream& out, const HloBufferSet& buffer_set);
+
+// A class collecting the HloBuffers in the output of an HLO instruction. For
+// array-shaped instructions, an InstructionBufferSet trivially holds a single
+// HloBufferSet. Tuple-shaped InstructionBufferSets hold multiple
+// HloBufferSets.
+class InstructionBufferSet : public ShapeTree<HloBufferSet> {
+ public:
+  InstructionBufferSet(const Shape& shape) : ShapeTree<HloBufferSet>(shape) {}
+
+  // Returns true if any HloBufferSet contained in this InstructionBufferSet
+  // is not a singleton.
+  bool IsAmbiguous() const;
+
+  // Returns true if any HloBuffer appears in more than one HloBufferSet
+  // contained in this InstructionBufferSet.
+  bool IsDistinct() const;
+
+  string ToString() const;
+};
+
+std::ostream& operator<<(std::ostream& out,
+                         const InstructionBufferSet& buffer_set);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_BUFFER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index ff76cc7bf67e29d489f9b32e4fce94ce28b59992..119cf7dde5a79add498c94ea6f0cf385f4363764 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -66,22 +67,25 @@ HloComputation::HloComputation(
     HloInstruction* root_instruction, bool is_fusion_computation)
     : name_(name),
       root_instruction_(root_instruction),
-      is_fusion_computation_(is_fusion_computation),
-      instruction_name_uniquer_(/*separator=*/".") {
+      is_fusion_computation_(is_fusion_computation) {
   param_instructions_.resize(parameter_count, nullptr);
   bool root_found = false;
   for (auto& instruction : *instructions) {
     if (instruction->opcode() == HloOpcode::kParameter) {
       int64 param_no = instruction->parameter_number();
-      CHECK_GE(param_no, 0);
-      CHECK_LT(param_no, param_instructions_.size());
-      CHECK_EQ(nullptr, param_instructions_[param_no]);
+      CHECK(param_no >= 0 && param_no < parameter_count)
+          << "\nERROR: invalid parameter number.  Expected [0, "
+          << parameter_count << "), got " << param_no;
+      CHECK(param_instructions_[param_no] == nullptr)
+          << "\nERROR: parameter number " << param_no
+          << " already allocated in this computation";
       param_instructions_[param_no] = instruction.get();
     }
     root_found |= instruction.get() == root_instruction_;
     AddInstructionInternal(std::move(instruction));
   }
-  CHECK(root_found);
+  CHECK(root_found)
+      << "\nERROR: root instruction is not present in computation.";
 }
 
 HloInstruction* HloComputation::AddInstruction(
@@ -94,8 +98,9 @@ HloInstruction* HloComputation::AddInstruction(
 
 HloInstruction* HloComputation::AddInstructionInternal(
     std::unique_ptr<HloInstruction> instruction) {
-  // Generate a unique name for the instruction.
-  instruction->UniquifyName(&instruction_name_uniquer_);
+  if (parent() != nullptr) {
+    instruction->UniquifyName(&parent()->instruction_name_uniquer());
+  }
   Reparent(instruction.get());
   HloInstruction* pinst = instruction.get();
   instruction_iterators_[pinst] =
@@ -206,7 +211,8 @@ Status HloComputation::RemoveInstructionAndUnusedOperands(
 Status HloComputation::RemoveInstruction(HloInstruction* instruction) {
   VLOG(2) << "Removing instruction " << instruction->name()
           << " from computation " << name();
-  TF_RET_CHECK(IsRemovable(instruction));
+  TF_RET_CHECK(IsRemovable(instruction))
+      << "cannot remove instruction: " << instruction->ToString();
   TF_RET_CHECK(root_instruction() != instruction)
       << "cannot remove root instruction " << instruction->name();
   TF_RET_CHECK(instruction->user_count() == 0)
@@ -537,67 +543,46 @@ Status HloComputation::ReplaceInstruction(HloInstruction* old_instruction,
   return RemoveInstructionAndUnusedOperands(old_instruction);
 }
 
-HloComputation::ReachabilityMap::ReachabilityMap(
-    const std::list<HloInstruction*>& all_instructions) {
-  const int n = all_instructions.size();
-  int next_id = 0;
-  for (const auto* hlo : all_instructions) {
-    ids_[hlo] = next_id;
-    next_id++;
-  }
-  DCHECK_EQ(n, ids_.size());  // instructions should be unique
-  matrix_.Reset(n * n);
-}
-
-void HloComputation::ReachabilityMap::SetReachable(const HloInstruction* a,
-                                                   const HloInstruction* b) {
-  const int id_a = FindOrDie(ids_, a);
-  const int id_b = FindOrDie(ids_, b);
-  matrix_.set(id_a * ids_.size() + id_b);
-}
+std::unique_ptr<HloReachabilityMap> HloComputation::ComputeReachability()
+    const {
+  const std::list<HloInstruction*> all = MakeInstructionPostOrder();
+  auto result = MakeUnique<HloReachabilityMap>(all);
 
-bool HloComputation::ReachabilityMap::IsReachable(
-    const HloInstruction* a, const HloInstruction* b) const {
-  const int id_a = FindOrDie(ids_, a);
-  const int id_b = FindOrDie(ids_, b);
-  return matrix_.get(id_a * ids_.size() + id_b);
+  std::vector<HloInstruction*> inputs;
+  for (const HloInstruction* hlo : all) {
+    inputs.assign(hlo->operands().begin(), hlo->operands().end());
+    inputs.insert(inputs.end(), hlo->control_predecessors().begin(),
+                  hlo->control_predecessors().end());
+    result->SetReachabilityToUnion(inputs, hlo);
+  }
+  return result;
 }
 
-bool HloComputation::ReachabilityMap::IsConnected(
-    const HloInstruction* a, const HloInstruction* b) const {
-  const int id_a = FindOrDie(ids_, a);
-  const int id_b = FindOrDie(ids_, b);
-  return matrix_.get(id_a * ids_.size() + id_b) ||
-         matrix_.get(id_b * ids_.size() + id_a);
-}
+void HloComputation::UpdateReachabilityThroughInstruction(
+    const HloInstruction* instruction, HloReachabilityMap* reachability_map) {
+  std::queue<const HloInstruction*> worklist;
+  worklist.push(instruction);
 
-void HloComputation::ReachabilityMap::SetReachableAndTransitiveClosure(
-    const HloInstruction* a, const HloInstruction* b) {
-  const int id_a = FindOrDie(ids_, a);
-  const int id_b = FindOrDie(ids_, b);
-  const int n = ids_.size();
-  matrix_.set(id_a * n + id_b);
+  std::vector<HloInstruction*> inputs;
 
-  // Copy transitive set for b into entries for a
-  for (int i = 0; i < n; i++) {
-    if (matrix_.get(id_b * n + i)) {
-      matrix_.set(id_a * n + i);
-    }
-  }
-}
+  while (!worklist.empty()) {
+    const HloInstruction* item = worklist.front();
+    worklist.pop();
 
-std::unique_ptr<HloComputation::ReachabilityMap>
-HloComputation::ComputeTransitiveOperands() const {
-  const auto all = MakeInstructionPostOrder();
-  auto result = MakeUnique<HloComputation::ReachabilityMap>(all);
+    inputs.assign(item->operands().begin(), item->operands().end());
+    inputs.insert(inputs.end(), item->control_predecessors().begin(),
+                  item->control_predecessors().end());
 
-  // Fill in the dependency bit matrix
-  for (const auto* hlo : all) {
-    for (const HloInstruction* operand : hlo->operands()) {
-      result->SetReachableAndTransitiveClosure(hlo, operand);
+    if (reachability_map->SetReachabilityToUnion(inputs, item)) {
+      // Add immediate successors to worklist.
+      for (const HloInstruction* user : item->users()) {
+        worklist.push(user);
+      }
+      for (const HloInstruction* succ : item->control_successors()) {
+        worklist.push(succ);
+      }
     }
   }
-  return result;
 }
 
 std::vector<HloInstruction*> HloComputation::CollectUnreachableRoots() const {
@@ -609,6 +594,12 @@ std::vector<HloInstruction*> HloComputation::CollectUnreachableRoots() const {
       unreachable_roots.push_back(instruction.get());
     }
   }
+  VLOG(3) << "Unreachable roots:"
+          << tensorflow::str_util::Join(
+                 unreachable_roots, "\n\t",
+                 [](string* out, const HloInstruction* hlo) {
+                   tensorflow::strings::StrAppend(out, hlo->ToString());
+                 });
   return unreachable_roots;
 }
 
@@ -617,6 +608,7 @@ Status HloComputation::Accept(DfsHloVisitor* visitor) const {
   // visited root, which would invalidate iterators if the unreachable roots
   // weren't computed ahead of time.
   for (HloInstruction* root : CollectUnreachableRoots()) {
+    VLOG(3) << "Traversing unreachable root: " << root->ToString();
     // Call FinishVisit only at the end.
     TF_RETURN_IF_ERROR(root->Accept(visitor, /*call_finish_visit=*/false));
   }
@@ -643,9 +635,15 @@ Status HloComputation::AcceptWithOperandOrder(
 Status HloComputation::AcceptOrdered(
     DfsHloVisitor* visitor,
     const std::vector<const HloInstruction*>& order) const {
+  VLOG(3) << "Accepting visitor with order.";
+  for (HloInstruction* root : CollectUnreachableRoots()) {
+    TF_RET_CHECK(std::find(order.begin(), order.end(), root) != order.end())
+        << root->ToString();
+  }
   TF_RET_CHECK(order.size() == instruction_count());
   std::unordered_set<const HloInstruction*> visited;
   for (const HloInstruction* instruction : order) {
+    VLOG(3) << "Visiting ordered: " << instruction->ToString();
     TF_RET_CHECK(instruction_iterators_.count(instruction) == 1)
         << "Instruction " << instruction->name() << " is not in computation "
         << name();
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index 39074b24e41f073b6b5b60880cbd1f6e2e9b399d..cf6df3c94f885816d20530161822f7cc948a30be 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -29,11 +29,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_reachability.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/bitmap.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
@@ -153,9 +153,18 @@ class HloComputation {
   // this order, definitions of values always appear before their uses.
   std::list<HloInstruction*> MakeInstructionPostOrder() const;
 
-  // Computes and returns the mapping from HLO to its transitive operands.
-  class ReachabilityMap;
-  std::unique_ptr<ReachabilityMap> ComputeTransitiveOperands() const;
+  // Computes and returns the reachability between HLO instructions in the
+  // computation. The returned HloReachabilityMap is constructed such that
+  // HloReachabilityMap::IsReachable(a, b) returns true iff there exists a
+  // directed path (from producer to consumer) from 'a' to 'b'. Both data
+  // dependencies (operands) and control dependencies are considered for
+  // reachability. Trivially an instruction is reachable from itself.
+  std::unique_ptr<HloReachabilityMap> ComputeReachability() const;
+
+  // Updates the given reachabilty map after the immediate predecessor set
+  // (operands and control predecessors) of 'instruction' has changed.
+  void UpdateReachabilityThroughInstruction(
+      const HloInstruction* instruction, HloReachabilityMap* reachability_map);
 
   int64 instruction_count() const { return instructions_.size(); }
 
@@ -308,34 +317,6 @@ class HloComputation {
   TF_DISALLOW_COPY_AND_ASSIGN(HloComputation);
 };
 
-class HloComputation::ReachabilityMap {
- public:
-  // Sets up an empty reachable matrix for the full set of
-  // instructions specified in "all_instructions"
-  explicit ReachabilityMap(const std::list<HloInstruction*>& all_instructions);
-  // Sets entry so that IsReachable(a, b) will return true
-  void SetReachable(const HloInstruction* a, const HloInstruction* b);
-
-  // Sets IsReachable(a_inst, b_inst) as well as IsReachable(a_inst, trans)
-  // for all "trans" s.t. "IsReachable(b_inst, trans)" is true
-  void SetReachableAndTransitiveClosure(const HloInstruction* a_inst,
-                                        const HloInstruction* b_inst);
-
-  // Returns true if "b" is reachable from "a"
-  bool IsReachable(const HloInstruction* a, const HloInstruction* b) const;
-
-  // Returns true if "b" is reachable from "a" or "a" is reachable from "b"
-  bool IsConnected(const HloInstruction* a, const HloInstruction* b) const;
-
- private:
-  friend class HloComputation;
-
-  // dense id assignment from HloInstruction* to number
-  tensorflow::gtl::FlatMap<const HloInstruction*, int> ids_;
-  // matrix_(a,b) is true iff b is reachable from a
-  tensorflow::core::Bitmap matrix_;
-};
-
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_COMPUTATION_H_
diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc
index 5d49c83e2d070cb9e5409a62983940225b903b2b..4a4a8556692b3da6f92f8333397a9537ade2f8ef 100644
--- a/tensorflow/compiler/xla/service/hlo_computation_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc
@@ -110,7 +110,7 @@ TEST_F(HloComputationTest, PostOrderSingleton) {
   // Test GetInstructionPostOrder for a computation with one instruction.
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   auto computation = builder.Build();
 
   EXPECT_THAT(computation->MakeInstructionPostOrder(), ElementsAre(constant));
@@ -121,7 +121,7 @@ TEST_F(HloComputationTest, PostOrderSimple) {
   // instructions.
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   auto negate1 = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant));
   auto negate2 = builder.AddInstruction(
@@ -136,7 +136,7 @@ TEST_F(HloComputationTest, PostOrderTrace) {
   // Test GetInstructionPostOrder for a computation with a trace instruction.
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   auto negate1 = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant));
   auto trace =
@@ -155,13 +155,13 @@ TEST_F(HloComputationTest, PostOrderDisconnectedInstructions) {
   // which are not connected.
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   auto constant3 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   auto constant4 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   auto computation = builder.Build();
 
   EXPECT_THAT(computation->MakeInstructionPostOrder(),
@@ -173,11 +173,11 @@ TEST_F(HloComputationTest, PostOrderWithMultipleRoots) {
   // which are not connected.
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   auto constant3 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   auto add1 = builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32_, HloOpcode::kAdd, constant1, constant2));
   auto add2 = builder.AddInstruction(HloInstruction::CreateBinary(
@@ -197,11 +197,11 @@ TEST_F(HloComputationTest, VisitWithMultipleRoots) {
   // computation has multiple roots (dead code).
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   auto constant3 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   // Add three disconnected add expressions.
   builder.AddInstruction(HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd,
                                                       constant1, constant2));
@@ -248,7 +248,7 @@ TEST_F(HloComputationTest, DeepCopyArray) {
   // Test that DeepCopyInstruction properly copies an array.
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0})));
+      Literal::CreateR1<float>({1.0, 2.0, 3.0})));
   auto computation = builder.Build();
 
   auto copy = computation->DeepCopyInstruction(constant).ValueOrDie();
@@ -260,9 +260,9 @@ TEST_F(HloComputationTest, DeepCopyTuple) {
   // Test that DeepCopyInstruction properly copies a tuple.
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0})));
+      Literal::CreateR1<float>({1.0, 2.0, 3.0})));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0)));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
 
@@ -280,7 +280,7 @@ TEST_F(HloComputationTest, CycleDetection) {
   // Test whether the visitor can detect cycles in the graph.
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   auto negate = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant));
   auto add = builder.AddInstruction(
@@ -303,7 +303,7 @@ TEST_F(HloComputationTest, RemoveInstructionWithDuplicateOperand) {
   // twice.
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   auto dead_negate = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant));
   auto dead_add = builder.AddInstruction(HloInstruction::CreateBinary(
@@ -326,9 +326,9 @@ TEST_F(HloComputationTest, RemoveInstructionWithDuplicateOperand) {
 TEST_F(HloComputationTest, CloneWithControlDependency) {
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0f)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0f)));
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32_, HloOpcode::kAdd, constant1, constant2));
 
@@ -352,6 +352,105 @@ TEST_F(HloComputationTest, CloneWithControlDependency) {
   EXPECT_THAT(successors, ::testing::ElementsAre(cloned_add));
 }
 
+TEST_F(HloComputationTest, Reachability) {
+  // Test reachability of a non-trivial computation:
+  //
+  // const1    const2
+  //    |         |
+  //    | +-------+
+  //    | |       |
+  //    add ..   negate
+  //     |   .     |
+  //     |   .... exp
+  //     |         |
+  //     +---+   +-+---+
+  //         |   |     |
+  //       multiply   copy
+  //
+  // There is a control dependency from 'add' to 'exp'.
+  auto builder = HloComputation::Builder(TestName());
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0f)));
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      r0f32_, HloOpcode::kAdd, constant1, constant2));
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant2));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, negate));
+  auto mul = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32_, HloOpcode::kMultiply, add, exp));
+  auto copy = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32_, HloOpcode::kCopy, exp));
+
+  auto computation = builder.Build(/*root_instruction=*/mul);
+
+  TF_CHECK_OK(add->AddControlDependencyTo(exp));
+  auto reachability = computation->ComputeReachability();
+
+  EXPECT_TRUE(reachability->IsReachable(constant1, constant1));
+  EXPECT_FALSE(reachability->IsReachable(constant1, constant2));
+  EXPECT_TRUE(reachability->IsReachable(constant1, add));
+  EXPECT_FALSE(reachability->IsReachable(constant1, negate));
+  EXPECT_TRUE(reachability->IsReachable(constant1, exp));
+  EXPECT_TRUE(reachability->IsReachable(constant1, mul));
+  EXPECT_TRUE(reachability->IsReachable(constant1, copy));
+
+  EXPECT_FALSE(reachability->IsReachable(constant2, constant1));
+  EXPECT_TRUE(reachability->IsReachable(constant2, constant2));
+  EXPECT_TRUE(reachability->IsReachable(constant2, add));
+  EXPECT_TRUE(reachability->IsReachable(constant2, negate));
+  EXPECT_TRUE(reachability->IsReachable(constant2, exp));
+  EXPECT_TRUE(reachability->IsReachable(constant2, mul));
+  EXPECT_TRUE(reachability->IsReachable(constant2, copy));
+
+  EXPECT_FALSE(reachability->IsReachable(exp, constant1));
+  EXPECT_FALSE(reachability->IsReachable(exp, constant2));
+  EXPECT_FALSE(reachability->IsReachable(exp, add));
+  EXPECT_FALSE(reachability->IsReachable(exp, negate));
+  EXPECT_TRUE(reachability->IsReachable(exp, exp));
+  EXPECT_TRUE(reachability->IsReachable(exp, mul));
+  EXPECT_TRUE(reachability->IsReachable(exp, copy));
+
+  EXPECT_FALSE(reachability->IsReachable(mul, constant1));
+  EXPECT_FALSE(reachability->IsReachable(mul, constant2));
+  EXPECT_FALSE(reachability->IsReachable(mul, add));
+  EXPECT_FALSE(reachability->IsReachable(mul, negate));
+  EXPECT_FALSE(reachability->IsReachable(mul, exp));
+  EXPECT_TRUE(reachability->IsReachable(mul, mul));
+  EXPECT_FALSE(reachability->IsReachable(mul, copy));
+
+  EXPECT_TRUE(reachability->IsConnected(constant1, copy));
+  EXPECT_TRUE(reachability->IsConnected(copy, constant1));
+  EXPECT_FALSE(reachability->IsConnected(negate, add));
+  EXPECT_FALSE(reachability->IsConnected(add, negate));
+
+  // Remove the control dependency then update and verify the reachability map
+  ASSERT_IS_OK(add->RemoveControlDependencyTo(exp));
+  computation->UpdateReachabilityThroughInstruction(exp, reachability.get());
+
+  EXPECT_TRUE(reachability->IsReachable(constant1, constant1));
+  EXPECT_FALSE(reachability->IsReachable(constant1, constant2));
+  EXPECT_TRUE(reachability->IsReachable(constant1, add));
+  EXPECT_FALSE(reachability->IsReachable(constant1, negate));
+  EXPECT_FALSE(reachability->IsReachable(constant1, exp));
+  EXPECT_TRUE(reachability->IsReachable(constant1, mul));
+  EXPECT_FALSE(reachability->IsReachable(constant1, copy));
+
+  // Change a use within the graph then update and verify the reachability map
+  ASSERT_IS_OK(constant2->ReplaceUseWith(negate, constant1));
+  computation->UpdateReachabilityThroughInstruction(negate, reachability.get());
+
+  EXPECT_FALSE(reachability->IsReachable(constant2, constant1));
+  EXPECT_TRUE(reachability->IsReachable(constant2, constant2));
+  EXPECT_TRUE(reachability->IsReachable(constant2, add));
+  EXPECT_FALSE(reachability->IsReachable(constant2, negate));
+  EXPECT_FALSE(reachability->IsReachable(constant2, exp));
+  EXPECT_TRUE(reachability->IsReachable(constant2, mul));
+  EXPECT_FALSE(reachability->IsReachable(constant2, copy));
+}
+
 }  // namespace
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index 93f448e701853e271646c9f8fb0d42f49489b756..1a2eed5f6026dc6a27e4879e63ecc378d2064d47 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -42,6 +42,9 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
   bool changed = false;
 
   for (auto& computation : module->computations()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
     for (auto instruction : computation->MakeInstructionPostOrder()) {
       // Skip dead code.
       if (instruction->user_count() == 0 &&
@@ -58,6 +61,13 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
         continue;
       }
 
+      // Broadcasts dramatically increase the size of constants with is often
+      // detrimental to performance and memory capacity so do not fold
+      // broadcasts.
+      if (instruction->opcode() == HloOpcode::kBroadcast) {
+        continue;
+      }
+
       std::unique_ptr<Literal> result = evaluator->TryEvaluate(instruction);
       // Currently we skip unimplemented operations.
       // TODO(b/35975797): Fold constant computations for more operations.
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
index 31b81052cb2b00e602b94b9d84525a623caa741e..3ae499d5e0c37532ae0a83a4a247cab85fd2c84e 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
@@ -41,7 +41,7 @@ using HloConstantFoldingTest = HloTestBase;
 TEST_F(HloConstantFoldingTest, ConvertF32ToS64) {
   HloComputation::Builder builder(TestName());
   HloInstruction* input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   builder.AddInstruction(
       HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {}), input));
 
@@ -51,19 +51,18 @@ TEST_F(HloConstantFoldingTest, ConvertF32ToS64) {
   EXPECT_THAT(computation->root_instruction(), op::Convert(input));
 
   HloConstantFolding const_folder;
-  TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
   EXPECT_THAT(computation->root_instruction(), op::Constant());
-  EXPECT_EQ(LiteralUtil::GetFirstElement<int64>(
-                computation->root_instruction()->literal()),
+  EXPECT_EQ(computation->root_instruction()->literal().GetFirstElement<int64>(),
             42);
 }
 
 TEST_F(HloConstantFoldingTest, ConvertS64ToF32) {
   HloComputation::Builder builder(TestName());
   HloInstruction* input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int64>(42)));
+      HloInstruction::CreateConstant(Literal::CreateR0<int64>(42)));
   builder.AddInstruction(
       HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input));
 
@@ -73,19 +72,18 @@ TEST_F(HloConstantFoldingTest, ConvertS64ToF32) {
   EXPECT_THAT(computation->root_instruction(), op::Convert(input));
 
   HloConstantFolding const_folder;
-  TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
   EXPECT_THAT(computation->root_instruction(), op::Constant());
-  EXPECT_EQ(LiteralUtil::GetFirstElement<float>(
-                computation->root_instruction()->literal()),
+  EXPECT_EQ(computation->root_instruction()->literal().GetFirstElement<float>(),
             42.0f);
 }
 
 TEST_F(HloConstantFoldingTest, ConvertF32ArrayToS64Array) {
   HloComputation::Builder builder(TestName());
-  HloInstruction* input = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({42.0f, 19.0f})));
+  HloInstruction* input = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<float>({42.0f, 19.0f})));
   builder.AddInstruction(
       HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {2}), input));
 
@@ -95,16 +93,12 @@ TEST_F(HloConstantFoldingTest, ConvertF32ArrayToS64Array) {
   EXPECT_THAT(computation->root_instruction(), op::Convert(input));
 
   HloConstantFolding const_folder;
-  TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
   EXPECT_THAT(computation->root_instruction(), op::Constant());
-  EXPECT_EQ(
-      LiteralUtil::Get<int64>(computation->root_instruction()->literal(), {0}),
-      42);
-  EXPECT_EQ(
-      LiteralUtil::Get<int64>(computation->root_instruction()->literal(), {1}),
-      19);
+  EXPECT_EQ(computation->root_instruction()->literal().Get<int64>({0}), 42);
+  EXPECT_EQ(computation->root_instruction()->literal().Get<int64>({1}), 19);
 }
 
 TEST_F(HloConstantFoldingTest, Concatenate) {
@@ -126,7 +120,7 @@ TEST_F(HloConstantFoldingTest, Concatenate) {
     for (auto csize : test_config.concat_sizes) {
       dimensions[test_config.concat_dimension] = csize;
       concat_size += csize;
-      auto literal = LiteralUtil::CreateFromDimensions(F32, dimensions);
+      auto literal = Literal::CreateFromDimensions(F32, dimensions);
       HloInstruction* insn = builder.AddInstruction(
           HloInstruction::CreateConstant(std::move(literal)));
       operands.push_back(insn);
@@ -139,7 +133,7 @@ TEST_F(HloConstantFoldingTest, Concatenate) {
     auto computation = module->AddEntryComputation(builder.Build());
 
     HloConstantFolding const_folder;
-    TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+    TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
     EXPECT_TRUE(result);
 
     HloInstruction* root = computation->root_instruction();
@@ -154,9 +148,9 @@ TEST_F(HloConstantFoldingTest, Slice) {
   const int64 slice_start[] = {4, 2, 3, 1, 5};
   const int64 slice_limits[] = {10, 8, 6, 5, 9};
   const int64 slice_strides[] = {1, 1, 1, 1, 1};
-  TF_ASSIGN_OR_ASSERT_OK(auto literal,
-                         LiteralTestUtil::CreateRandomLiteral<F32>(
-                             ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0));
+  TF_ASSERT_OK_AND_ASSIGN(auto literal,
+                          LiteralTestUtil::CreateRandomLiteral<F32>(
+                              ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0));
   HloInstruction* literal_instruction = builder.AddInstruction(
       HloInstruction::CreateConstant(std::move(literal)));
   Shape shape = ShapeUtil::MakeShape(F32, {6, 6, 3, 4, 4});
@@ -166,7 +160,7 @@ TEST_F(HloConstantFoldingTest, Slice) {
   auto computation = module->AddEntryComputation(builder.Build());
 
   HloConstantFolding const_folder;
-  TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
   HloInstruction* root = computation->root_instruction();
@@ -177,10 +171,10 @@ TEST_F(HloConstantFoldingTest, Slice) {
 TEST_F(HloConstantFoldingTest, TransposeConstantFold) {
   HloComputation::Builder builder(TestName());
   const int64 dimensions[] = {11, 8, 7, 5, 9};
-  TF_ASSIGN_OR_ASSERT_OK(auto literal,
-                         LiteralTestUtil::CreateRandomLiteral<F32>(
-                             ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0));
-  auto literal_clone = LiteralUtil::CloneToUnique(*literal);
+  TF_ASSERT_OK_AND_ASSIGN(auto literal,
+                          LiteralTestUtil::CreateRandomLiteral<F32>(
+                              ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0));
+  auto literal_clone = literal->Literal::CloneToUnique();
   HloInstruction* literal_instruction = builder.AddInstruction(
       HloInstruction::CreateConstant(std::move(literal)));
   Shape shape = ShapeUtil::MakeShape(F32, {8, 7, 11, 9, 5});
@@ -191,7 +185,7 @@ TEST_F(HloConstantFoldingTest, TransposeConstantFold) {
   auto computation = module->AddEntryComputation(builder.Build());
 
   HloConstantFolding const_folder;
-  TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
   HloInstruction* root = computation->root_instruction();
@@ -200,12 +194,10 @@ TEST_F(HloConstantFoldingTest, TransposeConstantFold) {
 
   using NativeT = typename primitive_util::PrimitiveTypeToNative<F32>::type;
   bool matched = true;
-  LiteralUtil::EachCell<NativeT>(
-      root->literal(),
+  root->literal().EachCell<NativeT>(
       [&](tensorflow::gtl::ArraySlice<int64> indices, NativeT value) {
         std::vector<int64> rindexes = Permute(permutation, indices);
-        matched = matched && (value == LiteralUtil::Get<NativeT>(*literal_clone,
-                                                                 rindexes));
+        matched = matched && (value == literal_clone->Get<NativeT>(rindexes));
       });
   EXPECT_TRUE(matched);
 }
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index 38cc74b0f1e640d4e72188416258d9b262053152..efc3b1c49c6ddca15a3615dc12551c4557ec841c 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -25,34 +25,56 @@ limitations under the License.
 
 namespace xla {
 
+constexpr char HloCostAnalysis::kFlopsKey[];
+constexpr char HloCostAnalysis::kTranscendentalsKey[];
+constexpr char HloCostAnalysis::kBytesAccessedKey[];
+constexpr char HloCostAnalysis::kSecondsKey[];
+
+HloCostAnalysis::HloCostAnalysis(const ShapeSizeFunction& shape_size)
+    : HloCostAnalysis(shape_size, {}) {}
+
+HloCostAnalysis::HloCostAnalysis(const ShapeSizeFunction& shape_size,
+                                 const Properties& per_second_rates)
+    : shape_size_(shape_size), per_second_rates_(per_second_rates) {}
+
 Status HloCostAnalysis::Preprocess(HloInstruction* hlo) {
   // Set current instruction cost values to reasonable default values. Each
-  // handler can overwrite these values. In Postprocess, these value are
+  // handler can overwrite these values. In Postprocess, these values are
   // accumulated and written to the per-instruction maps.
-  current_flop_count_ = 0;
-  current_transcendental_count_ = 0;
+  current_properties_.clear();
+  current_should_compute_bottleneck_time_ = true;
 
-  // The default element count for an instruction is the sum of elements in the
-  // operands and output. The default ShapeUtil::ByteSizeOf does not handle
-  // opaque types.
-  current_bytes_accessed_ = shape_size_(hlo->shape());
+  // The default number of bytes accessed for an instruction is the sum of the
+  // sizes of the inputs and outputs. The default ShapeUtil::ByteSizeOf does not
+  // handle opaque types.
+  float bytes_accessed = shape_size_(hlo->shape());
   for (const HloInstruction* operand : hlo->operands()) {
-    current_bytes_accessed_ += shape_size_(operand->shape());
+    bytes_accessed += shape_size_(operand->shape());
   }
+  current_properties_[kBytesAccessedKey] = bytes_accessed;
 
   return Status::OK();
 }
 
 Status HloCostAnalysis::Postprocess(HloInstruction* hlo) {
-  // Accumulate cost values and write into per-instruction maps.
-  flop_count_ += current_flop_count_;
-  hlo_to_flop_count_[hlo] = current_flop_count_;
-
-  transcendental_count_ += current_transcendental_count_;
-  hlo_to_transcendental_count_[hlo] = current_transcendental_count_;
+  if (current_should_compute_bottleneck_time_) {
+    // Compute the time as the time of the bottleneck, i.e. the slowest property
+    // given the per-second rate of each property.
+    float max_seconds = 0.0f;
+    for (const auto& property : current_properties_) {
+      if (property.first != kSecondsKey) {
+        max_seconds = std::max(
+            max_seconds,
+            property.second / GetProperty(property.first, per_second_rates_));
+      }
+    }
+    current_properties_[kSecondsKey] = max_seconds;
+  }
 
-  bytes_accessed_ += current_bytes_accessed_;
-  hlo_to_bytes_accessed_[hlo] = current_bytes_accessed_;
+  TF_RET_CHECK(hlo_properties_.emplace(hlo, current_properties_).second);
+  for (const auto& property : current_properties_) {
+    properties_sum_[property.first] += property.second;
+  }
 
   return Status::OK();
 }
@@ -65,25 +87,39 @@ Status HloCostAnalysis::HandleElementwiseOp(HloInstruction* hlo_instruction) {
   auto opcode = hlo_instruction->opcode();
   // We treat the two opcodes (kExp, kPower) as transcendental operations.
   if (opcode == HloOpcode::kExp || opcode == HloOpcode::kPower) {
-    current_transcendental_count_ = computation_count;
+    current_properties_[kTranscendentalsKey] = computation_count;
   } else {
     // Note: transcendental operations are considered a separate category from
     // FLOPs.
-    current_flop_count_ = computation_count;
+    current_properties_[kFlopsKey] = computation_count;
   }
   return Status::OK();
 }
 
+/*static*/ float HloCostAnalysis::GetProperty(const string& key,
+                                              const Properties& properties) {
+  auto key_value = properties.find(key);
+  return key_value == properties.end() ? 0.0f : key_value->second;
+}
+
+/*static*/ float HloCostAnalysis::GetPropertyForHlo(
+    const HloInstruction& hlo, const string& key,
+    const HloToProperties& hlo_to_properties) {
+  auto it = hlo_to_properties.find(&hlo);
+  if (it == hlo_to_properties.end()) {
+    return 0.0f;
+  } else {
+    return GetProperty(key, it->second);
+  }
+}
+
 Status HloCostAnalysis::HandleElementwiseUnary(HloInstruction* hlo,
-                                               HloOpcode opcode,
-                                               HloInstruction* operand) {
+                                               HloOpcode opcode) {
   return HandleElementwiseOp(hlo);
 }
 
 Status HloCostAnalysis::HandleElementwiseBinary(HloInstruction* hlo,
-                                                HloOpcode opcode,
-                                                HloInstruction* lhs,
-                                                HloInstruction* rhs) {
+                                                HloOpcode opcode) {
   return HandleElementwiseOp(hlo);
 }
 
@@ -100,14 +136,18 @@ Status HloCostAnalysis::HandleClamp(HloInstruction* clamp,
   return HandleElementwiseOp(clamp);
 }
 
+Status HloCostAnalysis::HandleReducePrecision(HloInstruction* hlo) {
+  return HandleElementwiseOp(hlo);
+}
+
 Status HloCostAnalysis::HandleParameter(HloInstruction* parameter) {
-  current_bytes_accessed_ = 0;
+  current_properties_[kBytesAccessedKey] = 0;
   return Status::OK();
 }
 
 Status HloCostAnalysis::HandleConstant(HloInstruction* constant,
                                        const Literal& literal) {
-  current_bytes_accessed_ = 0;
+  current_properties_[kBytesAccessedKey] = 0;
   return Status::OK();
 }
 
@@ -115,7 +155,7 @@ Status HloCostAnalysis::HandleGetTupleElement(HloInstruction* get_tuple_element,
                                               HloInstruction* operand) {
   // GetTupleElement forwards a pointer and does not touch each element in the
   // output.
-  current_bytes_accessed_ = 0;
+  current_properties_[kBytesAccessedKey] = 0;
   return Status::OK();
 }
 
@@ -153,8 +193,9 @@ Status HloCostAnalysis::HandleTuple(
     tensorflow::gtl::ArraySlice<HloInstruction*> operands) {
   // The tuple instruction only gathers pointers from inputs (it doesn't iterate
   // through them). The memory touched is then only the size of the output
-  // buffer.
-  current_bytes_accessed_ = shape_size_(tuple->shape());
+  // index table of the tuple.
+
+  current_properties_[kBytesAccessedKey] = shape_size_(tuple->shape());
   return Status::OK();
 }
 
@@ -164,13 +205,11 @@ Status HloCostAnalysis::HandleConcatenate(
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleConvert(HloInstruction* convert,
-                                      HloInstruction* operand) {
+Status HloCostAnalysis::HandleConvert(HloInstruction* convert) {
   return HandleElementwiseOp(convert);
 }
 
-Status HloCostAnalysis::HandleCopy(HloInstruction* copy,
-                                   HloInstruction* operand) {
+Status HloCostAnalysis::HandleCopy(HloInstruction* copy) {
   return Status::OK();
 }
 
@@ -194,7 +233,7 @@ Status HloCostAnalysis::HandleDot(HloInstruction* dot,
   }
 
   // We count an FMA operation as 2 floating point operations.
-  current_flop_count_ = kFmaFlops * fma_count;
+  current_properties_[kFlopsKey] = kFmaFlops * fma_count;
   return Status::OK();
 }
 
@@ -210,16 +249,17 @@ Status HloCostAnalysis::HandleMap(
     HloInstruction* map, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
     HloComputation* function,
     tensorflow::gtl::ArraySlice<HloInstruction*> /*static_operands*/) {
-  // Compute the cost of the user function.
-  HloInstruction* function_instruction = function->root_instruction();
-  HloCostAnalysis visitor(shape_size_);
-  TF_RETURN_IF_ERROR(function_instruction->Accept(&visitor));
+  // Compute properties of the mapped function.
+  TF_ASSIGN_OR_RETURN(const Properties sub_properties,
+                      ProcessSubcomputation(function));
 
   // Compute the cost of all elements for this Map operation.
-  int64 element_count = ShapeUtil::ElementsIn(map->shape());
-  current_transcendental_count_ =
-      element_count * visitor.transcendental_count();
-  current_flop_count_ = element_count * visitor.flop_count();
+  const int64 element_count = ShapeUtil::ElementsIn(map->shape());
+  for (const auto& property : sub_properties) {
+    if (property.first != kBytesAccessedKey) {
+      current_properties_[property.first] = property.second * element_count;
+    }
+  }
   return Status::OK();
 }
 
@@ -227,16 +267,17 @@ Status HloCostAnalysis::HandleReduce(
     HloInstruction* reduce, HloInstruction* arg, HloInstruction* init_value,
     tensorflow::gtl::ArraySlice<int64> dimensions, HloComputation* function) {
   // Compute the cost of the user function.
-  HloInstruction* function_instruction = function->root_instruction();
-  HloCostAnalysis visitor(shape_size_);
-  TF_RETURN_IF_ERROR(function_instruction->Accept(&visitor));
+  TF_ASSIGN_OR_RETURN(const Properties sub_properties,
+                      ProcessSubcomputation(function));
 
   // Compute the cost of all elements for this Reduce operation.
   int64 reduction_count = ShapeUtil::ElementsIn(arg->shape()) -
                           ShapeUtil::ElementsIn(reduce->shape());
-  current_flop_count_ = reduction_count * visitor.flop_count();
-  current_transcendental_count_ =
-      reduction_count * visitor.transcendental_count();
+  for (const auto& property : sub_properties) {
+    if (property.first != kBytesAccessedKey) {
+      current_properties_[property.first] = property.second * reduction_count;
+    }
+  }
   return Status::OK();
 }
 
@@ -244,55 +285,63 @@ Status HloCostAnalysis::HandleReduceWindow(HloInstruction* reduce_window,
                                            HloInstruction* operand,
                                            const Window& window,
                                            HloComputation* function) {
-  // Compute the cost of the user function.
-  HloInstruction* function_instruction = function->root_instruction();
-  HloCostAnalysis visitor(shape_size_);
-  TF_RETURN_IF_ERROR(function_instruction->Accept(&visitor));
+  // Compute the properties of the reduction function.
+  TF_ASSIGN_OR_RETURN(const Properties sub_properties,
+                      ProcessSubcomputation(function));
 
   // Compute the cost of all elements for this ReduceWindow operation. For each
-  // output element, (window_size - 1) number of user computations are applied.
-  auto output_size = ShapeUtil::ElementsIn(reduce_window->shape());
-  int64 window_size = 1;
+  // output element there are window_size - 1 reductions to perform.
+  int64 window_element_count = 1;
   for (const auto& dimension : window.dimensions()) {
-    window_size *= dimension.size();
+    window_element_count *= dimension.size();
+  }
+  const int64 output_element_count =
+      ShapeUtil::ElementsIn(reduce_window->shape());
+  const int64 reduction_count =
+      (window_element_count - 1) * output_element_count;
+  for (const auto& property : sub_properties) {
+    if (property.first != kBytesAccessedKey) {
+      current_properties_[property.first] = property.second * reduction_count;
+    }
   }
-  current_flop_count_ = output_size * (window_size - 1) * visitor.flop_count();
-  current_transcendental_count_ =
-      output_size * (window_size - 1) * visitor.transcendental_count();
   return Status::OK();
 }
 
 Status HloCostAnalysis::HandleSelectAndScatter(HloInstruction* instruction) {
-  // Compute the cost of the select and scatter function.
-  HloInstruction* select = instruction->select()->root_instruction();
-  HloCostAnalysis select_visitor(shape_size_);
-  TF_RETURN_IF_ERROR(select->Accept(&select_visitor));
-  HloInstruction* scatter = instruction->scatter()->root_instruction();
-  HloCostAnalysis scatter_visitor(shape_size_);
-  TF_RETURN_IF_ERROR(scatter->Accept(&scatter_visitor));
+  // Compute the properties of the select and scatter function.
+  // Compute the properties of the reduction function.
+  TF_ASSIGN_OR_RETURN(const Properties select_properties,
+                      ProcessSubcomputation(instruction->select()));
+  TF_ASSIGN_OR_RETURN(const Properties scatter_properties,
+                      ProcessSubcomputation(instruction->scatter()));
 
   // Compute the cost of all elements for this operation. For each scatter
-  // source element, (window_size - 1) number of select computations and 1
-  // scatter computation are applied.
+  // source element there are window_size - 1 select computations to perform and
+  // 1 scatter computation to perform.
   const auto source = instruction->operand(1);
   const auto source_element_count = ShapeUtil::ElementsIn(source->shape());
-  int64 window_size = 1;
+  int64 window_element_count = 1;
   for (const auto& dimension : instruction->window().dimensions()) {
-    window_size *= dimension.size();
+    window_element_count *= dimension.size();
+  }
+  const int64 select_count = source_element_count * (window_element_count - 1);
+  for (const auto& property : select_properties) {
+    if (property.first != kBytesAccessedKey) {
+      current_properties_[property.first] += property.second * select_count;
+    }
+  }
+  for (const auto& property : scatter_properties) {
+    if (property.first != kBytesAccessedKey) {
+      current_properties_[property.first] +=
+          property.second * source_element_count;
+    }
   }
-  current_flop_count_ =
-      source_element_count * ((window_size - 1) * select_visitor.flop_count() +
-                              scatter_visitor.flop_count());
-  current_transcendental_count_ =
-      source_element_count *
-      ((window_size - 1) * select_visitor.transcendental_count() +
-       scatter_visitor.transcendental_count());
   return Status::OK();
 }
 
 Status HloCostAnalysis::HandleBitcast(HloInstruction* bitcast) {
   // A bitcast does no computation and touches no memory.
-  current_bytes_accessed_ = 0;
+  current_properties_[kBytesAccessedKey] = 0;
   return Status::OK();
 }
 
@@ -314,6 +363,17 @@ Status HloCostAnalysis::HandleReshape(HloInstruction* reshape) {
   return Status::OK();
 }
 
+Status HloCostAnalysis::HandleBatchNormTraining(
+    HloInstruction* batchNormTraining) {
+  // TODO(b/62294698): Implement cost analysis for batch-norm-training.
+  return Status::OK();
+}
+
+Status HloCostAnalysis::HandleBatchNormGrad(HloInstruction* batchNormGrad) {
+  // TODO(b/62294698): Implement cost analysis for batch-norm-grad.
+  return Status::OK();
+}
+
 Status HloCostAnalysis::HandleTranspose(HloInstruction* transpose) {
   return Status::OK();
 }
@@ -326,12 +386,13 @@ Status HloCostAnalysis::HandleConvolution(HloInstruction* convolution,
   const int64 output_features =
       convolution->shape().dimensions(dnums.feature_dimension());
 
-  // For each output element, we do one fma per element in the
-  // kernel at some given output feature index.
+  // For each output element, we do one fma per element in the kernel at some
+  // given output feature index.
   const int64 fmas_per_output_element =
       ShapeUtil::ElementsIn(rhs_instruction->shape()) / output_features;
   const int64 output_elements = ShapeUtil::ElementsIn(convolution->shape());
-  current_flop_count_ = output_elements * fmas_per_output_element * kFmaFlops;
+  current_properties_[kFlopsKey] =
+      output_elements * fmas_per_output_element * kFmaFlops;
   return Status::OK();
 }
 
@@ -341,7 +402,7 @@ Status HloCostAnalysis::HandleCrossReplicaSum(HloInstruction* crs) {
   //
   // TODO(b/33004697): Compute correct cost here, taking the actual number of
   // replicas into account.
-  current_flop_count_ = ShapeUtil::ElementsIn(crs->shape());
+  current_properties_[kFlopsKey] = ShapeUtil::ElementsIn(crs->shape());
   return Status::OK();
 }
 
@@ -350,31 +411,43 @@ Status HloCostAnalysis::HandleRng(HloInstruction* random,
   // TODO(b/26346211): Implement better estimates for the RNG cost, since the
   // cost changes with the implementation and the distribution. For now, assume
   // the cost of each RNG is same as a transcendental operation.
-  current_transcendental_count_ = ShapeUtil::ElementsIn(random->shape());
+  current_properties_[kTranscendentalsKey] =
+      ShapeUtil::ElementsIn(random->shape());
   return Status::OK();
 }
 
 Status HloCostAnalysis::HandleFusion(HloInstruction* fusion) {
-  // Compute the cost of the fused expression.
-  HloInstruction* fused_expression_root = fusion->fused_expression_root();
-  // Don't compute sizes inside of fused ops. We don't use the size here and the
-  // operations inside might not have a layout.
-  HloCostAnalysis visitor([](const Shape&) { return 0; });
-  TF_RETURN_IF_ERROR(fused_expression_root->Accept(&visitor));
+  // Compute the properties of the fused expression and attribute them to the
+  // fusion node. Use a dummy shape_size to avoid any errors from trying to
+  // calculate the size of a shape that does not have a layout, since nodes
+  // inside fusion nodes do not necessarily have a layout assigned.
+  ShapeSizeFunction shape_size = [](const Shape& shape) { return 0; };
+  TF_ASSIGN_OR_RETURN(
+      current_properties_,
+      ProcessSubcomputation(fusion->fused_instructions_computation(),
+                            &shape_size));
+
+  // Fusion nodes that produce a tuple also produce the entries in the tuple.
+  // Ignore the memory accessed inside fused ops, since fusion is supposed to
+  // prevent intermediate data from touching slow memory.
+  current_properties_[kBytesAccessedKey] = 0;
+  ShapeUtil::ForEachSubshape(
+      fusion->shape(),
+      [this](const Shape& subshape, const ShapeIndex& /*shape_index*/) {
+        current_properties_[kBytesAccessedKey] += shape_size_(subshape);
+      });
+
+  for (const HloInstruction* operand : fusion->operands()) {
+    current_properties_[kBytesAccessedKey] += shape_size_(operand->shape());
+  }
 
-  // Attribute the cost of the fused expression to the fusion node.
-  current_transcendental_count_ = visitor.transcendental_count();
-  current_flop_count_ = visitor.flop_count();
   return Status::OK();
 }
 
 Status HloCostAnalysis::HandleCall(HloInstruction* call) {
-  HloCostAnalysis computation_visitor(shape_size_);
-  TF_RETURN_IF_ERROR(call->to_apply()->Accept(&computation_visitor));
-
-  current_flop_count_ = computation_visitor.flop_count();
-  current_transcendental_count_ = computation_visitor.transcendental_count();
-  current_bytes_accessed_ = computation_visitor.bytes_accessed();
+  TF_ASSIGN_OR_RETURN(current_properties_,
+                      ProcessSubcomputation(call->to_apply()));
+  current_should_compute_bottleneck_time_ = false;
   return Status::OK();
 }
 
@@ -382,34 +455,38 @@ Status HloCostAnalysis::HandleCustomCall(
     HloInstruction* custom_call,
     tensorflow::gtl::ArraySlice<HloInstruction*> operands,
     tensorflow::StringPiece custom_call_target) {
-  return Unimplemented("custom-call");
+  return Unimplemented("Custom-call is not implemented for HLO cost analysis.");
 }
 
 Status HloCostAnalysis::HandleSort(HloInstruction* sort,
                                    HloInstruction* operand_instruction) {
-  // The cost of sort is implementation dependent, so cannot determine at HLO
-  // level. Assume comparison based N*log(N) sorting.
+  // This assumes a comparison based N*log(N) algorithm. As for all ops, the
+  // actual properties of the op depend on the backend implementation.
   int64 elements = ShapeUtil::ElementsIn(operand_instruction->shape());
-  current_flop_count_ = elements * tensorflow::Log2Ceiling(elements);
+  current_properties_[kFlopsKey] = elements * tensorflow::Log2Ceiling(elements);
   return Status::OK();
 }
 
 Status HloCostAnalysis::HandleWhile(HloInstruction* xla_while) {
-  // Since the number of iterations of the while node is not statically
-  // determined, we cannot precisely compute the cost of a while node. For now
-  // compute the cost of a single iteration.
-  // TODO(b/26346211): Improve the cost analysis for while node.
-  HloCostAnalysis body_visitor(shape_size_);
-  TF_RETURN_IF_ERROR(xla_while->while_body()->Accept(&body_visitor));
-  HloCostAnalysis condition_visitor(shape_size_);
-  TF_RETURN_IF_ERROR(xla_while->while_condition()->Accept(&condition_visitor));
+  // Since the number of iterations of the while node will not always be
+  // something that we can statically analyze, we cannot precisely compute the
+  // cost of a while node. For now compute the cost of a single iteration.
+  //
+  // TODO(b/26346211): Improve the cost analysis for while nodes.
+  TF_ASSIGN_OR_RETURN(const Properties body_properties,
+                      ProcessSubcomputation(xla_while->while_body()));
 
-  current_flop_count_ =
-      body_visitor.flop_count() + condition_visitor.flop_count();
-  current_transcendental_count_ = body_visitor.transcendental_count() +
-                                  condition_visitor.transcendental_count();
-  current_bytes_accessed_ =
-      body_visitor.bytes_accessed() + condition_visitor.bytes_accessed();
+  TF_ASSIGN_OR_RETURN(const Properties condition_properties,
+                      ProcessSubcomputation(xla_while->while_condition()));
+
+  current_properties_.clear();
+  for (const auto& property : body_properties) {
+    current_properties_[property.first] += property.second;
+  }
+  for (const auto& property : condition_properties) {
+    current_properties_[property.first] += property.second;
+  }
+  current_should_compute_bottleneck_time_ = false;
 
   return Status::OK();
 }
@@ -418,19 +495,42 @@ Status HloCostAnalysis::FinishVisit(HloInstruction* root) {
   return Status::OK();
 }
 
+float HloCostAnalysis::flop_count() const {
+  return GetProperty(kFlopsKey, properties_sum_);
+}
+
+float HloCostAnalysis::transcendental_count() const {
+  return GetProperty(kTranscendentalsKey, properties_sum_);
+}
+
+float HloCostAnalysis::bytes_accessed() const {
+  return GetProperty(kBytesAccessedKey, properties_sum_);
+}
+
+float HloCostAnalysis::seconds() const {
+  return GetProperty(kSecondsKey, properties_sum_);
+}
+
 int64 HloCostAnalysis::flop_count(const HloInstruction& hlo) const {
-  auto it = hlo_to_flop_count_.find(&hlo);
-  return it == hlo_to_flop_count_.end() ? 0 : it->second;
+  return GetPropertyForHlo(hlo, kFlopsKey, hlo_properties_);
 }
 
 int64 HloCostAnalysis::transcendental_count(const HloInstruction& hlo) const {
-  auto it = hlo_to_transcendental_count_.find(&hlo);
-  return it == hlo_to_transcendental_count_.end() ? 0 : it->second;
+  return GetPropertyForHlo(hlo, kTranscendentalsKey, hlo_properties_);
 }
 
 int64 HloCostAnalysis::bytes_accessed(const HloInstruction& hlo) const {
-  auto it = hlo_to_bytes_accessed_.find(&hlo);
-  return it == hlo_to_bytes_accessed_.end() ? 0 : it->second;
+  return GetPropertyForHlo(hlo, kBytesAccessedKey, hlo_properties_);
+}
+
+StatusOr<HloCostAnalysis::Properties> HloCostAnalysis::ProcessSubcomputation(
+    HloComputation* computation, const ShapeSizeFunction* shape_size) {
+  if (shape_size == nullptr) {
+    shape_size = &shape_size_;
+  }
+  HloCostAnalysis visitor(*shape_size, per_second_rates_);
+  TF_RETURN_IF_ERROR(computation->Accept(&visitor));
+  return visitor.properties();
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index b2c40f75ca4e833f1f5529977564b0e3a7ca25b1..3c2e9503aa626d9b9777d6650f219458a915f57d 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -36,17 +36,22 @@ namespace xla {
 // operations separately from transcendental operations.
 class HloCostAnalysis : public DfsHloVisitor {
  public:
+  // Each HLO is associated to a vector of properties with the indices given
+  // below. Sub-classes can add further properties.
+  typedef std::map<string, float> Properties;
+  static constexpr char kFlopsKey[] = "flops";
+  static constexpr char kTranscendentalsKey[] = "transcendentals";
+  static constexpr char kBytesAccessedKey[] = "bytes accessed";
+  static constexpr char kSecondsKey[] = "seconds";
+
   // shape_size is a function which returns the size in bytes of the top-level
   // buffer of a shape.
   using ShapeSizeFunction = std::function<int64(const Shape&)>;
-  explicit HloCostAnalysis(const ShapeSizeFunction& shape_size)
-      : shape_size_(shape_size) {}
-
-  Status HandleElementwiseUnary(HloInstruction* hlo, HloOpcode opcode,
-                                HloInstruction* operand) override;
-  Status HandleElementwiseBinary(HloInstruction* hlo, HloOpcode opcode,
-                                 HloInstruction* lhs,
-                                 HloInstruction* rhs) override;
+  explicit HloCostAnalysis(const ShapeSizeFunction& shape_size);
+
+  Status HandleElementwiseUnary(HloInstruction* hlo, HloOpcode opcode) override;
+  Status HandleElementwiseBinary(HloInstruction* hlo,
+                                 HloOpcode opcode) override;
   Status HandleConstant(HloInstruction* constant,
                         const Literal& literal) override;
   Status HandleGetTupleElement(HloInstruction* get_tuple_element,
@@ -58,14 +63,14 @@ class HloCostAnalysis : public DfsHloVisitor {
                        HloInstruction* lhs, HloInstruction* rhs) override;
   Status HandleClamp(HloInstruction* clamp, HloInstruction* min,
                      HloInstruction* arg, HloInstruction* max) override;
+  Status HandleReducePrecision(HloInstruction* hlo) override;
   Status HandleConcatenate(
       HloInstruction* concatenate,
       tensorflow::gtl::ArraySlice<HloInstruction*> operands) override;
   Status HandleSend(HloInstruction* send) override;
   Status HandleRecv(HloInstruction* recv) override;
-  Status HandleConvert(HloInstruction* convert,
-                       HloInstruction* operand) override;
-  Status HandleCopy(HloInstruction* copy, HloInstruction* operand) override;
+  Status HandleConvert(HloInstruction* convert) override;
+  Status HandleCopy(HloInstruction* copy) override;
   Status HandleDot(HloInstruction* dot, HloInstruction* lhs,
                    HloInstruction* rhs) override;
   Status HandleConvolution(HloInstruction* convolution, HloInstruction* lhs,
@@ -83,6 +88,8 @@ class HloCostAnalysis : public DfsHloVisitor {
                       HloInstruction* init_value,
                       tensorflow::gtl::ArraySlice<int64> dimensions,
                       HloComputation* function_handle) override;
+  Status HandleBatchNormTraining(HloInstruction* batchNormTraining) override;
+  Status HandleBatchNormGrad(HloInstruction* batchNormGrad) override;
   Status HandleFusion(HloInstruction* fusion) override;
   Status HandleCall(HloInstruction* call) override;
   Status HandleCustomCall(HloInstruction* custom_call,
@@ -119,48 +126,88 @@ class HloCostAnalysis : public DfsHloVisitor {
   Status Preprocess(HloInstruction* hlo) override;
   Status Postprocess(HloInstruction* hlo) override;
 
-  // Returns the amount of computations in the graph.
-  int64 flop_count() const { return flop_count_; }
-  int64 transcendental_count() const { return transcendental_count_; }
+  // Set the rates used to calculate the time taken by the computation. These
+  // need to be set before visiting starts.
+  void set_flops_per_second(float value) {
+    per_second_rates_[kFlopsKey] = value;
+  }
+  void set_transcendentals_per_second(float value) {
+    per_second_rates_[kTranscendentalsKey] = value;
+  }
+  void set_bytes_per_second(float value) {
+    per_second_rates_[kBytesAccessedKey] = value;
+  }
+
+  // Returns properties for the computation.
+  float flop_count() const;
+  float transcendental_count() const;
+  float bytes_accessed() const;
+  float seconds() const;
 
   // Returns the respective cost computed for a particular HLO instruction, or 0
   // if the HLO was not found to have a cost in the analysis.
   int64 flop_count(const HloInstruction& hlo) const;
   int64 transcendental_count(const HloInstruction& hlo) const;
-
-  // Returns the number of bytes read/written.
   int64 bytes_accessed(const HloInstruction& hlo) const;
-  int64 bytes_accessed() const { return bytes_accessed_; }
+  float seconds(const HloInstruction& hlo) const;
+
+  const Properties& properties() const { return properties_sum_; }
+  const float property(const string& key) const {
+    return GetProperty(key, properties());
+  }
+
+ protected:
+  typedef std::unordered_map<const HloInstruction*, Properties> HloToProperties;
 
- private:
   // An FMA counts as two floating point operations in these analyzes.
   static constexpr int64 kFmaFlops = 2;
 
+  HloCostAnalysis(const ShapeSizeFunction& shape_size,
+                  const Properties& per_second_rates);
+
+  // Returns the properties computed from visiting the computation rooted at the
+  // given hlo. Uses shape_size_ to calculate shape sizes if shape_size is null,
+  // otherwise uses shape_size_.
+  StatusOr<Properties> ProcessSubcomputation(
+      HloComputation* computation,
+      const ShapeSizeFunction* shape_size = nullptr);
+
   // Utility function to handle all element-wise operations.
   Status HandleElementwiseOp(HloInstruction* hlo_instruction);
 
+  // Returns 0.0f if the key is not present in the properties. Otherwise,
+  // returns the value that the key maps to from the properties parameter.
+  static float GetProperty(const string& key, const Properties& properties);
+
+  // Returns 0.0f if the hlo is not present in hlo_to_properties or if the key
+  // is not present in hlo_to_properties[hlo]. Otherwise, returns the value that
+  // the key maps to in the properties of the given hlo.
+  static float GetPropertyForHlo(const HloInstruction& hlo, const string& key,
+                                 const HloToProperties& hlo_to_properties);
+
   // Function which computes the size of the top-level of a given shape (not
   // including nested elements, if any). If null then bytes_accessed methods
   // return an error.
   const ShapeSizeFunction shape_size_;
 
-  // The total number of floating point operations, transcendental operations,
-  // and bytes accesses (read or written) in the computation.
-  int64 flop_count_ = 0;
-  int64 transcendental_count_ = 0;
-  int64 bytes_accessed_ = 0;
-
-  // Cost counts of the current instruction. These should be set by each
-  // handlers if different from the default values computed in Preprocess.
-  int64 current_flop_count_;
-  int64 current_transcendental_count_;
-  int64 current_bytes_accessed_;
-
-  // Mapping from HLO instructions to the cost we computed for them in the
-  // course of the graph analysis.
-  std::map<const HloInstruction*, int64> hlo_to_flop_count_;
-  std::map<const HloInstruction*, int64> hlo_to_transcendental_count_;
-  std::map<const HloInstruction*, int64> hlo_to_bytes_accessed_;
+  HloToProperties hlo_properties_;
+
+  // If true, the time taken will be computed from the rates for each property
+  // and the total time will be the maximum time, which is the time of the
+  // bottleneck.
+  bool current_should_compute_bottleneck_time_;
+
+  // The properties of the currently visited instruction. A HandleFoo method can
+  // modify these to change the default values computed in Preprocess.
+  Properties current_properties_;
+
+  // The sum of the properties of all HLOs in the computation.
+  Properties properties_sum_;
+
+  // How much of each property can be processed per second. E.g. if the property
+  // is bytes accessed, this is the number of bytes that can be processed per
+  // second. Is empty if no rates have been set.
+  Properties per_second_rates_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(HloCostAnalysis);
 };
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
index b74c7eb4e074bd8f340137066b6d9675bb32cee1..0a288a77ada840451915561b4b0865785b39ade7 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/user_computation.h"
 #include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/platform/logging.h"
 
 #include "tensorflow/compiler/xla/statusor.h"
@@ -329,51 +330,67 @@ TEST_F(HloCostAnalysisTest, MatmulAndConvolutionCanBeTheSameComputation) {
   EXPECT_EQ(conv_analysis.flop_count(), matmul_analysis.flop_count());
 }
 
-using FusionCostAnalysis = ::testing::Test;
+using FusionCostAnalysis = HloTestBase;
 
 TEST_F(FusionCostAnalysis, LoopFusion) {
-  Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
-
-  // Fuse all instructions in complicated expression:
-  //
-  //   add = Add(C1, C2)
-  //   clamp = Clamp(C2, add, add)
-  //   exp = Exp(add)
-  //   mul = Mul(exp, C3)
-  //   sub = Sub(mul, clamp)
-  //   tuple = Tuple({sub, sub, mul, C1})
-  auto c1 = HloInstruction::CreateConstant(LiteralUtil::CreateR2F32Linspace(
-      /*from=*/0.0f, /*to=*/1.0f, /*rows=*/2, /*cols=*/2));
-  auto c2 = HloInstruction::CreateConstant(LiteralUtil::CreateR2F32Linspace(
-      /*from=*/1.0f, /*to=*/2.0f, /*rows=*/2, /*cols=*/2));
-  auto c3 = HloInstruction::CreateConstant(LiteralUtil::CreateR2F32Linspace(
-      /*from=*/2.0f, /*to=*/3.0f, /*rows=*/2, /*cols=*/2));
-
-  auto add =
-      HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, c1.get(), c2.get());
-  auto clamp = HloInstruction::CreateTernary(r2f32, HloOpcode::kClamp, c2.get(),
-                                             add.get(), add.get());
-  auto exp = HloInstruction::CreateUnary(r2f32, HloOpcode::kExp, add.get());
-  auto mul = HloInstruction::CreateBinary(r2f32, HloOpcode::kMultiply,
-                                          exp.get(), c3.get());
-  auto sub = HloInstruction::CreateBinary(r2f32, HloOpcode::kSubtract,
-                                          mul.get(), clamp.get());
-  auto tuple =
-      HloInstruction::CreateTuple({sub.get(), sub.get(), mul.get(), c1.get()});
-
-  auto fusion = HloInstruction::CreateFusion(
-      r2f32, HloInstruction::FusionKind::kLoop, tuple.get());
-  fusion->FuseInstruction(sub.get());
-  fusion->FuseInstruction(mul.get());
-  fusion->FuseInstruction(exp.get());
-  fusion->FuseInstruction(clamp.get());
-  fusion->FuseInstruction(add.get());
-
-  HloCostAnalysis fusion_analysis(ShapeSize);
-  ASSERT_IS_OK(fusion->Accept(&fusion_analysis));
-
-  EXPECT_EQ(fusion_analysis.flop_count(), 16);
-  EXPECT_EQ(fusion_analysis.transcendental_count(), 4);
+  // Do this 4 times with different per-second rates to test the computation of
+  // bottleneck time on fusion nodes.
+  for (int i = 0; i < 4; ++i) {
+    Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
+
+    // Fuse all instructions in complicated expression:
+    //
+    //   add = Add(C1, C2)
+    //   clamp = Clamp(C2, add, add)
+    //   exp = Exp(add)
+    //   mul = Mul(exp, C3)
+    //   sub = Sub(mul, clamp)
+    //   tuple = Tuple({sub, sub, mul, C1})
+    HloComputation::Builder builder(TestName());
+    auto c1 = builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR2F32Linspace(
+            /*from=*/0.0f, /*to=*/1.0f, /*rows=*/2, /*cols=*/2)));
+    auto c2 = builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR2F32Linspace(
+            /*from=*/1.0f, /*to=*/2.0f, /*rows=*/2, /*cols=*/2)));
+    auto c3 = builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR2F32Linspace(
+            /*from=*/2.0f, /*to=*/3.0f, /*rows=*/2, /*cols=*/2)));
+    auto add = builder.AddInstruction(
+        HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, c1, c2));
+    auto clamp = builder.AddInstruction(
+        HloInstruction::CreateTernary(r2f32, HloOpcode::kClamp, c2, add, add));
+    auto exp = builder.AddInstruction(
+        HloInstruction::CreateUnary(r2f32, HloOpcode::kExp, add));
+    auto mul = builder.AddInstruction(
+        HloInstruction::CreateBinary(r2f32, HloOpcode::kMultiply, exp, c3));
+    auto sub = builder.AddInstruction(
+        HloInstruction::CreateBinary(r2f32, HloOpcode::kSubtract, mul, clamp));
+    auto tuple = HloInstruction::CreateTuple({sub, sub, mul, c1});
+
+    HloModule module(TestName());
+    auto* computation = module.AddEntryComputation(builder.Build());
+    auto* fusion = computation->CreateFusionInstruction(
+        {sub, mul, exp, clamp, add}, HloInstruction::FusionKind::kLoop);
+
+    // The time given these rates at i == 0 is exactly even among the properties
+    // at 1.0 seconds. For other values, one of the rates is slower so that it
+    // becomes the bottleneck.
+    HloCostAnalysis fusion_analysis(ShapeSize);
+    fusion_analysis.set_flops_per_second(16 * (i == 1 ? 1 / 2.0 : 1.0));
+    fusion_analysis.set_transcendentals_per_second(4 *
+                                                   (i == 2 ? 1 / 4.0 : 1.0));
+    fusion_analysis.set_bytes_per_second(64 * (i == 3 ? 1 / 8.0 : 1.0));
+    ASSERT_IS_OK(fusion->Accept(&fusion_analysis));
+
+    EXPECT_EQ(fusion_analysis.flop_count(), 16);
+    EXPECT_EQ(fusion_analysis.transcendental_count(), 4);
+    constexpr int64 bytes_accessed = sizeof(float) * 4 * 2 * 2;
+    static_assert(bytes_accessed == 64, "");
+    EXPECT_EQ(fusion_analysis.bytes_accessed(), bytes_accessed);
+
+    EXPECT_EQ(fusion_analysis.seconds(), 1 << i);
+  }
 }
 
 TEST_F(FusionCostAnalysis, NoLayout) {
@@ -382,19 +399,21 @@ TEST_F(FusionCostAnalysis, NoLayout) {
   Shape shape_without_layout = shape_with_layout;
   shape_without_layout.clear_layout();
 
-  auto c1 = HloInstruction::CreateConstant(
-      LiteralUtil::CreateR4FromArray4D(Array4D<float>(2, 3, 4, 5)));
-  auto c2 =
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({1, 2, 3}));
-
-  auto broadcast =
-      HloInstruction::CreateBroadcast(shape_without_layout, c2.get(), {1});
-  auto add = HloInstruction::CreateBinary(shape_with_layout, HloOpcode::kAdd,
-                                          c1.get(), broadcast.get());
-
-  auto fusion = HloInstruction::CreateFusion(
-      shape_with_layout, HloInstruction::FusionKind::kLoop, add.get());
-  fusion->FuseInstruction(broadcast.get());
+  HloComputation::Builder builder(TestName());
+  auto c1 = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR4FromArray4D(Array4D<float>(2, 3, 4, 5))));
+  auto c2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<float>({1, 2, 3})));
+
+  auto broadcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(shape_without_layout, c2, {1}));
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      shape_with_layout, HloOpcode::kAdd, c1, broadcast));
+
+  HloModule module(TestName());
+  auto* computation = module.AddEntryComputation(builder.Build());
+  auto* fusion = computation->CreateFusionInstruction(
+      {add, broadcast}, HloInstruction::FusionKind::kLoop);
 
   HloCostAnalysis fusion_analysis(ShapeSize);
   ASSERT_IS_OK(fusion->Accept(&fusion_analysis));
diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc
index 4c6af5c40fa563d1c656eb152819e454aae5fb69..690c084efb131e9b075ced17bfcd0b23a23218f1 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse.cc
@@ -68,7 +68,7 @@ bool CombineConstants(HloComputation* computation, bool is_layout_sensitive) {
       auto range = constants.equal_range(shape_string);
       HloInstruction* match = nullptr;
       for (auto it = range.first; it != range.second; ++it) {
-        if (LiteralUtil::Equal(instruction->literal(), it->second->literal())) {
+        if (instruction->literal().Equal(it->second->literal())) {
           match = it->second;
           break;
         }
@@ -92,6 +92,9 @@ bool CombineConstants(HloComputation* computation, bool is_layout_sensitive) {
 StatusOr<bool> HloCSE::Run(HloModule* module) {
   bool changed = false;
   for (auto& computation : module->computations()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
     changed |= CombineConstants(computation.get(), is_layout_sensitive_);
 
     std::list<HloInstruction*> post_order =
diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc
index cc39c3ac20396f9648b5d325933aad819275b2a6..8b0b9c8bbd0cf442149b32a4539277b2daeed90e 100644
--- a/tensorflow/compiler/xla/service/hlo_cse_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc
@@ -51,9 +51,9 @@ TEST_F(HloCseTest, CombineTwoConstants) {
   // Test that two identical constants are commoned.
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, constant1, constant2));
 
@@ -67,10 +67,10 @@ TEST_F(HloCseTest, CombineTwoConstants) {
 
   EXPECT_EQ(2, computation->instruction_count());
   HloInstruction* constant = computation->instructions().begin()->get();
-  EXPECT_EQ(42.0f, LiteralUtil::Get<float>(constant->literal(), {}));
+  EXPECT_EQ(42.0f, constant->literal().Get<float>({}));
 
   auto result = ExecuteAndTransfer(std::move(module), {});
-  auto expected = LiteralUtil::CreateR0<float>(84.0);
+  auto expected = Literal::CreateR0<float>(84.0);
   LiteralTestUtil::ExpectNear(*expected, *result, ErrorSpec(1e-4));
 }
 
@@ -102,7 +102,7 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndInsensitive) {
   EXPECT_THAT(add, op::Add(first_operand, first_operand));
 
   auto result = ExecuteAndTransfer(std::move(module), {});
-  auto expected = LiteralUtil::CreateR2<float>({{2.0, 4.0}, {6.0, 8.0}});
+  auto expected = Literal::CreateR2<float>({{2.0, 4.0}, {6.0, 8.0}});
   LiteralTestUtil::ExpectNear(*expected, *result, ErrorSpec(1e-4));
 }
 
@@ -132,7 +132,7 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndSensitive) {
   EXPECT_THAT(add, op::Add(constant1, constant2));
 
   auto result = ExecuteAndTransfer(std::move(module), {});
-  auto expected = LiteralUtil::CreateR2<float>({{2.0, 4.0}, {6.0, 8.0}});
+  auto expected = Literal::CreateR2<float>({{2.0, 4.0}, {6.0, 8.0}});
   LiteralTestUtil::ExpectNear(*expected, *result, ErrorSpec(1e-4));
 }
 
@@ -141,20 +141,20 @@ TEST_F(HloCseTest, ConstantsSameValueDifferentType) {
   // commoned.
   auto builder = HloComputation::Builder(TestName());
   builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(42)));
+      HloInstruction::CreateConstant(Literal::CreateR0<uint32>(42)));
   builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(42)));
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(42)));
   builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint64>(42.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<uint64>(42.0)));
   builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int64>(42.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<int64>(42.0)));
   builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<double>(42.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<double>(42.0)));
   builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   // Duplicate the float constant to verify something happens.
   builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -171,13 +171,13 @@ TEST_F(HloCseTest, NonscalarConstants) {
   // Test that identical nonscalar constants are merged.
   auto builder = HloComputation::Builder(TestName());
   auto common_constant1 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}})));
+      Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}})));
   auto common_constant2 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}})));
+      Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}})));
   // Create a constant which has the same shape but a different value.
   auto uncommon_constant =
       builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR2<float>({{2.0, 4.0}, {6.0, 8.0}})));
+          Literal::CreateR2<float>({{2.0, 4.0}, {6.0, 8.0}})));
 
   // Tie the constants together with a tuple. This makes it easier to refer to
   // the constant instructions via their use.
@@ -206,7 +206,7 @@ TEST_F(HloCseTest, IdenticalInstructions) {
   // Test that three identical instructions are commoned.
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0)));
   auto exp1 = builder.AddInstruction(HloInstruction::CreateUnary(
       constant->shape(), HloOpcode::kExp, constant));
   auto exp2 = builder.AddInstruction(HloInstruction::CreateUnary(
@@ -236,7 +236,7 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsSensitive) {
   // commoned if the pass is layout sensitive.
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}})));
+      Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}})));
 
   auto exp1 = builder.AddInstruction(HloInstruction::CreateUnary(
       constant->shape(), HloOpcode::kExp, constant));
@@ -267,7 +267,7 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsInsensitive) {
   // the pass is layout insensitive.
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}})));
+      Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}})));
 
   auto exp1 = builder.AddInstruction(HloInstruction::CreateUnary(
       constant->shape(), HloOpcode::kExp, constant));
@@ -311,7 +311,7 @@ TEST_F(HloCseTest, IdenticalExpressions) {
   // The *1 instructions should be merged with the *2 instructions.
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0)));
 
   auto negate1 = builder.AddInstruction(HloInstruction::CreateUnary(
       constant->shape(), HloOpcode::kNegate, constant));
@@ -349,9 +349,9 @@ TEST_F(HloCseTest, DoNotCombineRng) {
   // Test that two RNG ops are not commoned.
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0f)));
   auto rng1 = builder.AddInstruction(HloInstruction::CreateRng(
       ShapeUtil::MakeShape(F32, {}), RandomDistribution::RNG_UNIFORM,
       {constant1, constant2}));
@@ -392,9 +392,9 @@ TEST_F(HloCseTest, DISABLED_DoNotCombineCallsToImpureFunctions) {
     Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
     auto builder = HloComputation::Builder(TestName() + "_rng_fun");
     auto constant1 = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+        HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
     auto constant2 = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
+        HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0f)));
     auto rng = builder.AddInstruction(HloInstruction::CreateRng(
         scalar_shape, RandomDistribution::RNG_UNIFORM, {constant1, constant2}));
     auto param = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -409,7 +409,7 @@ TEST_F(HloCseTest, DISABLED_DoNotCombineCallsToImpureFunctions) {
   {
     auto builder = HloComputation::Builder(TestName());
     auto constant = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({5.0f})));
+        HloInstruction::CreateConstant(Literal::CreateR1<float>({5.0f})));
     auto rng1 = builder.AddInstruction(
         HloInstruction::CreateMap(constant->shape(), {constant}, rng_function));
     auto rng2 = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index d1b87256445e4fd51134a66666e5736baf272c71..92548dfaf0bf12755053bfe26d4cb2ae0459dd37 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -16,14 +16,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
 
 #include <algorithm>
-#include <iosfwd>
 #include <queue>
-#include <set>
 #include <vector>
 
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
-#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -35,7 +32,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -43,209 +39,6 @@ namespace xla {
 using ::tensorflow::strings::StrAppend;
 using ::tensorflow::strings::StrCat;
 
-string HloLocation::ToString() const {
-  string index_str =
-      ShapeUtil::IsTuple(instruction->shape()) ? (" " + index.ToString()) : "";
-  return StrCat(instruction->FullyQualifiedName(), index_str);
-}
-
-std::ostream& operator<<(std::ostream& out, const HloLocation& location) {
-  out << location.ToString();
-  return out;
-}
-
-string HloUse::ToString() const {
-  string index_str =
-      ShapeUtil::IsTuple(instruction->operand(operand_number)->shape())
-          ? (" " + operand_index.ToString())
-          : "";
-  return StrCat(instruction->FullyQualifiedName(), ", operand ", operand_number,
-                index_str);
-}
-
-std::ostream& operator<<(std::ostream& out, const HloUse& use) {
-  out << use.ToString();
-  return out;
-}
-
-HloValue::HloValue(HloValue::Id id, HloInstruction* instruction,
-                   const ShapeIndex& index, bool is_phi)
-    : id_(id), is_phi_(is_phi) {
-  // The defining location is always the first element in the locations_ vector.
-  AddLocation(instruction, index);
-}
-
-bool HloValue::operator==(const HloValue& other) const {
-  bool equal = instruction() == other.instruction() && index() == other.index();
-  // If the values are equal they most both be phi (or non phi).
-  CHECK(!(equal && is_phi() != other.is_phi()));
-  return equal;
-}
-
-bool HloValue::operator!=(const HloValue& other) const {
-  return !(*this == other);
-}
-
-string HloValue::ToShortString() const {
-  string index_str =
-      ShapeUtil::IsTuple(instruction()->shape()) ? index().ToString() : "";
-  return StrCat(is_phi_ ? "PHI " : "", instruction()->FullyQualifiedName(),
-                index_str);
-}
-
-string HloValue::ToString(int indent) const {
-  string indentation(indent, ' ');
-  string out = StrCat(indentation, ToShortString(), ", locations:\n");
-  for (const HloLocation& location : locations()) {
-    StrAppend(&out, indentation, "  ", location.ToString(), "\n");
-  }
-  StrAppend(&out, indentation, " uses:\n");
-  for (const HloUse& use : uses()) {
-    StrAppend(&out, indentation, "  ", use.ToString(), "\n");
-  }
-  return out;
-}
-
-void HloValue::AddLocation(HloInstruction* instruction,
-                           const ShapeIndex& index) {
-  // The given location should not already exist in locations_.
-  for (const HloLocation& location : locations_) {
-    DCHECK(!(location.instruction == instruction && location.index == index));
-  }
-
-  locations_.push_back(HloLocation{instruction, index});
-
-  //  Update uses.
-  for (HloInstruction* user : instruction->users()) {
-    for (int64 operand_number : user->OperandIndices(instruction)) {
-      if (!DoesNotUseOperandBuffer(instruction, index, user)) {
-        for (const HloUse& use : uses_) {
-          // Verify that this use does not already exist.
-          DCHECK(!(use.instruction == user &&
-                   use.operand_number == operand_number &&
-                   use.operand_index == index));
-        }
-
-        uses_.push_back(HloUse{user, operand_number, index});
-      }
-    }
-  }
-
-  // Update liveout status of this HloValue.
-  const HloModule& module = *instruction->parent()->parent();
-  if (instruction == module.entry_computation()->root_instruction()) {
-    live_out_of_module_ = true;
-  }
-}
-
-void HloValue::RemoveLocation(HloInstruction* instruction,
-                              const ShapeIndex& index) {
-  // The defining location cannot be removed.
-  CHECK(!(instruction == this->instruction() && index == this->index()));
-
-  int64 size_before = locations_.size();
-  locations_.erase(
-      std::remove_if(locations_.begin(), locations_.end(),
-                     [instruction, &index](const HloLocation& location) {
-                       return location.instruction == instruction &&
-                              location.index == index;
-                     }),
-      locations_.end());
-  // Only a single location should have been removed.
-  CHECK_EQ(locations_.size(), size_before - 1);
-
-  //  Update uses which referred to this location.
-  uses_.erase(std::remove_if(uses_.begin(), uses_.end(),
-                             [instruction, &index](const HloUse& use) {
-                               return use.instruction->operand(
-                                          use.operand_number) == instruction &&
-                                      use.operand_index == index;
-                             }),
-              uses_.end());
-
-  const HloModule& module = *instruction->parent()->parent();
-  if (instruction == module.entry_computation()->root_instruction()) {
-    // Value has been removed from a location in the entry root instruction.
-    // Check if the value is still live out of the module by walking all
-    // remaining locations.
-    live_out_of_module_ = false;
-    for (const HloLocation& location : locations()) {
-      if (location.instruction ==
-          module.entry_computation()->root_instruction()) {
-        live_out_of_module_ = true;
-        break;
-      }
-    }
-  }
-}
-
-std::ostream& operator<<(std::ostream& out, const HloValue& value) {
-  out << value.ToShortString();
-  return out;
-}
-
-void HloValueSet::SortAndUniquifyValues() {
-  std::sort(value_ids_.begin(), value_ids_.end());
-  value_ids_.erase(std::unique(value_ids_.begin(), value_ids_.end()),
-                   value_ids_.end());
-}
-
-string HloValueSet::ToString() const {
-  return StrCat("HloValueSet: ", tensorflow::str_util::Join(value_ids_, ", "));
-}
-
-/*static */
-HloValueSet HloValueSet::Union(
-    tensorflow::gtl::ArraySlice<const HloValueSet*> inputs) {
-  HloValueSet union_set;
-  for (const HloValueSet* input : inputs) {
-    for (HloValue::Id value_id : input->value_ids()) {
-      union_set.value_ids_.push_back(value_id);
-    }
-  }
-  union_set.SortAndUniquifyValues();
-  return union_set;
-}
-
-std::ostream& operator<<(std::ostream& out, const HloValueSet& value_set) {
-  out << value_set.ToString();
-  return out;
-}
-
-InstructionValueSet InstructionValueSet::Union(
-    tensorflow::gtl::ArraySlice<const InstructionValueSet*> inputs) {
-  CHECK_GT(inputs.size(), 0);
-  for (int i = 1; i < inputs.size(); ++i) {
-    CHECK(ShapeUtil::Compatible(inputs[0]->shape(), inputs[i]->shape()));
-  }
-  InstructionValueSet union_set(inputs[0]->shape());
-  union_set.ForEachMutableElement(
-      [&inputs](const ShapeIndex& index, HloValueSet* value_set) {
-        std::vector<const HloValueSet*> input_sets;
-        for (const InstructionValueSet* input : inputs) {
-          input_sets.push_back(&input->element(index));
-        }
-        *value_set = HloValueSet::Union(input_sets);
-      });
-  return union_set;
-}
-
-std::ostream& operator<<(std::ostream& out,
-                         const InstructionValueSet& instruction_value_set) {
-  out << instruction_value_set.ToString();
-  return out;
-}
-
-string InstructionValueSet::ToString() const {
-  string out =
-      StrCat("InstructionValueSet(", ShapeUtil::HumanString(shape()), ")\n");
-  ForEachElement([this, &out](const ShapeIndex& index,
-                              const HloValueSet& value_set) {
-    StrAppend(&out, "  ", index.ToString(), " : ", value_set.ToString(), "\n");
-  });
-  return out;
-}
-
 HloDataflowAnalysis::HloDataflowAnalysis(HloModule* module, bool ssa_form,
                                          bool bitcast_defines_value)
     : module_(module),
@@ -256,10 +49,10 @@ HloDataflowAnalysis::HloDataflowAnalysis(HloModule* module, bool ssa_form,
 bool HloDataflowAnalysis::ValueIsDefinedAt(const HloInstruction* instruction,
                                            const ShapeIndex& index) const {
   const HloValueSet& value_set = GetValueSet(instruction, index);
-  if (value_set.value_ids().size() != 1) {
+  if (value_set.values().size() != 1) {
     return false;
   }
-  return GetValue(value_set.GetUniqueValueId()).instruction() == instruction;
+  return value_set.GetUniqueValue().defining_instruction() == instruction;
 }
 
 const HloValue& HloDataflowAnalysis::GetValueDefinedAt(
@@ -274,20 +67,20 @@ HloValue& HloDataflowAnalysis::GetValueDefinedAt(
   return GetUniqueValueAt(instruction, index);
 }
 
-HloValue::Id HloDataflowAnalysis::NewHloValue(HloInstruction* instruction,
-                                              const ShapeIndex& index,
-                                              bool is_phi) {
-  int64 value_id = next_value_id_++;
-  auto it_added = values_.emplace(
+HloValue* HloDataflowAnalysis::NewHloValue(HloInstruction* instruction,
+                                           const ShapeIndex& index,
+                                           bool is_phi) {
+  const int64 value_id = next_value_id_++;
+  auto emplaced = values_.emplace(
       std::piecewise_construct, std::forward_as_tuple(value_id),
       std::forward_as_tuple(value_id, instruction, index, is_phi));
-  CHECK(it_added.second);
+  CHECK(emplaced.second);
 
   // Clear the vector of values as it is now stale. It will be lazily
   // reconstructed if needed when HloDataflowAnalysis::values() is called.
   values_vector_.clear();
 
-  return value_id;
+  return &emplaced.first->second;
 }
 
 void HloDataflowAnalysis::DeleteHloValue(HloValue::Id value_id) {
@@ -305,16 +98,16 @@ string HloDataflowAnalysis::ToString() const {
        module_->computations()) {
     for (const std::unique_ptr<HloInstruction>& instruction :
          computation->instructions()) {
-      StrAppend(&out, "    ", instruction->FullyQualifiedName(), ":\n");
+      StrAppend(&out, "    ", instruction->name(), ":\n");
       if (ShapeUtil::IsTuple(instruction->shape())) {
         GetInstructionValueSet(instruction.get())
             .ForEachElement([this, &instruction, &out](
                                 const ShapeIndex& index,
                                 const HloValueSet& value_set) {
               StrAppend(&out, "      tuple index ", index.ToString(), ":\n");
-              for (HloValue::Id value_id : value_set.value_ids()) {
+              for (const HloValue* value : value_set.values()) {
                 StrAppend(
-                    &out, "        ", GetValue(value_id).ToShortString(),
+                    &out, "        ", value->ToShortString(),
                     ValueIsDefinedAt(instruction.get(), index) ? " (def)" : "",
                     "\n");
               }
@@ -322,8 +115,8 @@ string HloDataflowAnalysis::ToString() const {
       } else {
         const HloValueSet& top_level_value_set =
             GetValueSet(instruction.get(), /*index=*/{});
-        for (HloValue::Id value_id : top_level_value_set.value_ids()) {
-          StrAppend(&out, "      ", GetValue(value_id).ToShortString(),
+        for (const HloValue* value : top_level_value_set.values()) {
+          StrAppend(&out, "      ", value->ToShortString(),
                     ValueIsDefinedAt(instruction.get()) ? " (def)" : "", "\n");
         }
       }
@@ -361,9 +154,8 @@ const std::vector<const HloValue*>& HloDataflowAnalysis::values() const {
     for (auto& pair : values_) {
       values_vector_.push_back(&pair.second);
     }
-    std::sort(
-        values_vector_.begin(), values_vector_.end(),
-        [](const HloValue* a, const HloValue* b) { return a->id() < b->id(); });
+    std::sort(values_vector_.begin(), values_vector_.end(),
+              HloValue::IdLessThan);
   } else {
     CHECK_EQ(values_vector_.size(), values_.size());
     for (const HloValue* value : values_vector_) {
@@ -405,8 +197,8 @@ InstructionValueSet HloDataflowAnalysis::Phi(
         // Construct a vector of unique value IDs of the inputs.
         std::vector<HloValue::Id> input_value_ids;
         for (const InstructionValueSet* input : inputs) {
-          for (HloValue::Id value_id : input->element(index).value_ids()) {
-            input_value_ids.push_back(value_id);
+          for (const HloValue* value : input->element(index).values()) {
+            input_value_ids.push_back(value->id());
           }
         }
         std::sort(input_value_ids.begin(), input_value_ids.end());
@@ -427,7 +219,7 @@ InstructionValueSet HloDataflowAnalysis::Phi(
 
         if (input_value_ids.size() <= 1) {
           if (input_value_ids.size() == 1) {
-            *value_set = HloValueSet({input_value_ids[0]});
+            *value_set = HloValueSet({&GetValue(input_value_ids[0])});
           }
           if (existing_phi_value) {
             // The merge point does not have multiple distinct inputs (which are
@@ -442,7 +234,7 @@ InstructionValueSet HloDataflowAnalysis::Phi(
           if (existing_phi_value) {
             // A phi value already exists so reuse it in the new
             // InstructionValueSet.
-            *value_set = HloValueSet({existing_phi_value->id()});
+            *value_set = HloValueSet({existing_phi_value});
           } else {
             // Create a new phi value.
             *value_set =
@@ -453,39 +245,37 @@ InstructionValueSet HloDataflowAnalysis::Phi(
   return new_value_set;
 }
 
-void HloDataflowAnalysis::UpdateLocationsOfValuesAt(
+void HloDataflowAnalysis::UpdatePositionsOfValuesAt(
     HloInstruction* instruction, const InstructionValueSet& new_value_set,
     const InstructionValueSet* prev_value_set) {
   if (prev_value_set != nullptr) {
-    // Remove locations from the old value set.
+    // Remove positions from the old value set.
     prev_value_set->ForEachElement(
         [this, instruction](const ShapeIndex& index,
                             const HloValueSet& value_set) {
-          for (HloValue::Id value_id : value_set.value_ids()) {
+          for (const HloValue* value : value_set.values()) {
             // HloValues in the previous value set may have been deleted.
-            if (!ContainsKey(values_, value_id)) {
+            if (!ContainsKey(values_, value->id())) {
               continue;
             }
-            // Don't remove the defining location of the value.
-            HloValue& value = GetValue(value_id);
-            if (instruction == value.instruction()) {
-              CHECK_EQ(index, value.index());
+            // Don't remove the defining position of the value.
+            if (instruction == value->defining_instruction()) {
+              CHECK_EQ(index, value->defining_index());
             } else {
-              value.RemoveLocation(instruction, index);
+              GetValue(value->id()).RemovePosition(instruction, index);
             }
           }
         });
   }
-  // Add locations in the new value set.
+  // Add positions in the new value set.
   new_value_set.ForEachElement(
       [this, instruction](const ShapeIndex& index,
                           const HloValueSet& value_set) {
-        for (HloValue::Id value_id : value_set.value_ids()) {
-          HloValue& value = GetValue(value_id);
-          if (instruction == value.instruction()) {
-            CHECK_EQ(index, value.index());
+        for (const HloValue* value : value_set.values()) {
+          if (instruction == value->defining_instruction()) {
+            CHECK_EQ(index, value->defining_index());
           } else {
-            value.AddLocation(instruction, index);
+            GetValue(value->id()).AddPosition(instruction, index);
           }
         }
       });
@@ -672,7 +462,7 @@ void HloDataflowAnalysis::UpdateInstructionsAndPropagate(
     // Update uses. First clear all of the old uses at the particular
     // operands. Then add the new uses. There may be overlap between the old
     // uses and new uses.
-    UpdateLocationsOfValuesAt(instruction, GetInstructionValueSet(instruction),
+    UpdatePositionsOfValuesAt(instruction, GetInstructionValueSet(instruction),
                               &old_value);
   }
 }
@@ -694,15 +484,24 @@ InstructionValueSet HloDataflowAnalysis::RecomputeParameterValueSet(
   std::vector<const InstructionValueSet*> inputs;
   bool called_from_while = false;
   for (const CallSite& callsite : call_graph_node.caller_callsites()) {
-    inputs.push_back(&GetInstructionValueSet(
-        callsite.instruction()->operand(parameter->parameter_number())));
-    if (callsite.instruction()->opcode() == HloOpcode::kWhile) {
-      // In a while instruction, the backedge is also a dataflow input to the
-      // parameter instruction. This code covers the case where the parameter is
-      // in the while body or the parameter is in the while condition.
+    if (callsite.instruction()->opcode() == HloOpcode::kCall) {
+      // The operand values of a call instruction are forwarded to the
+      // respective parameter instruction of the subcomputation.
+      inputs.push_back(&GetInstructionValueSet(
+          callsite.instruction()->operand(parameter->parameter_number())));
+    } else if (callsite.instruction()->opcode() == HloOpcode::kWhile) {
+      // In a while instruction, the while operand (ie, the init value) and the
+      // backedge are dataflow inputs to the parameter instruction. This is the
+      // case for parameters of both the body and condition computations.
+      CHECK_EQ(parameter->parameter_number(), 0);
+      inputs.push_back(
+          &GetInstructionValueSet(callsite.instruction()->operand(0)));
       inputs.push_back(&GetInstructionValueSet(
           callsite.instruction()->while_body()->root_instruction()));
       called_from_while = true;
+    } else {
+      LOG(FATAL) << "CallContext::kSequential computations should only be "
+                    "called from call or while instructions";
     }
   }
 
@@ -797,13 +596,156 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() {
           define_all_values();
           break;
       }
-      UpdateLocationsOfValuesAt(instruction.get(),
+      UpdatePositionsOfValuesAt(instruction.get(),
                                 GetInstructionValueSet(instruction.get()));
     }
   }
   return Status::OK();
 }
 
+bool HloDataflowAnalysis::IsDefinedBefore(const HloValue& a, const HloValue& b,
+                                          const HloOrdering& ordering) const {
+  // If 'b' is an entry param then 'a' cannot be defined before 'b' because 'b'
+  // is live into the module.
+  if (b.defining_instruction()->parent() == module_->entry_computation() &&
+      b.defining_instruction()->opcode() == HloOpcode::kParameter) {
+    return false;
+  }
+
+  // Phi values require special handling. Because XLA does not have a phi
+  // instruction, the definition instruction of the phis values are
+  // placeholders: either the subcomputation parameter (body or condition) or
+  // the while instruction. However, the program point where these values are
+  // logically defined does not necessarily coincide exactly with program point
+  // of these place-holder instructions. So we explicitly define the following
+  // order for phi values:
+  //
+  //   body/condition parameter phi:
+  //     Defined before all values defined in its computation excepting other
+  //     phis.
+  //
+  //   while phi:
+  //     defined after all values defined in the condition or body.
+  //
+  auto is_body_or_condition_phi = [](const HloValue& v) {
+    return v.is_phi() &&
+           v.defining_instruction()->opcode() == HloOpcode::kParameter;
+  };
+  if (is_body_or_condition_phi(a) && !is_body_or_condition_phi(b) &&
+      call_graph_->InstructionIsNestedIn(b.defining_instruction(),
+                                         a.defining_instruction()->parent())) {
+    return true;
+  }
+  if (is_body_or_condition_phi(b) &&
+      call_graph_->InstructionIsNestedIn(a.defining_instruction(),
+                                         b.defining_instruction()->parent())) {
+    return false;
+  }
+
+  // If 'b' is a while phi and 'a' is in the body or condition, then 'a'
+  // executes before 'b'.
+  if (b.is_phi() && b.defining_instruction()->opcode() == HloOpcode::kWhile &&
+      (call_graph_->InstructionIsNestedIn(
+           a.defining_instruction(), b.defining_instruction()->while_body()) ||
+       call_graph_->InstructionIsNestedIn(
+           a.defining_instruction(),
+           b.defining_instruction()->while_condition()))) {
+    return true;
+  }
+
+  return ordering.ExecutesBefore(a.defining_instruction(),
+                                 b.defining_instruction());
+}
+
+bool HloDataflowAnalysis::UseIsBeforeValueDefinition(
+    const HloUse& use, const HloValue& value,
+    const HloOrdering& ordering) const {
+  if (ordering.ExecutesBefore(use.instruction, value.defining_instruction())) {
+    return true;
+  }
+
+  // If the use is at the instruction where the value is defined, then the use
+  // is before the def if the instruction allows buffer sharing (in place
+  // computation).
+  if (use.instruction == value.defining_instruction() &&
+      CanShareOperandBufferWithUser(
+          use.instruction->mutable_operand(use.operand_number),
+          use.operand_index, value.defining_instruction(),
+          value.defining_index())) {
+    return true;
+  }
+
+  // The use at a while is an input to a phi, and logically occurs before values
+  // are defined in the body or condition computations.
+  if (use.instruction->opcode() == HloOpcode::kWhile) {
+    const HloInstruction* xla_while = use.instruction;
+    if (call_graph_->InstructionIsNestedIn(value.defining_instruction(),
+                                           xla_while->while_body()) ||
+        call_graph_->InstructionIsNestedIn(value.defining_instruction(),
+                                           xla_while->while_condition())) {
+      return true;
+    }
+  }
+
+  // Similarly if the value is defined at a while, it logically occurs after any
+  // uses in the body or condition computations.
+  if (value.defining_instruction()->opcode() == HloOpcode::kWhile) {
+    CHECK(ssa_form_);
+    const HloInstruction* xla_while = value.defining_instruction();
+    if (call_graph_->InstructionIsNestedIn(use.instruction,
+                                           xla_while->while_body()) ||
+        call_graph_->InstructionIsNestedIn(use.instruction,
+                                           xla_while->while_condition())) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool HloDataflowAnalysis::LiveRangeStrictlyBefore(
+    const HloValue& a, const HloValue& b, const HloOrdering& ordering) const {
+  VLOG(4) << "LiveRangeStrictlyBefore(a = " << a.ToShortString()
+          << ", b = " << b.ToShortString() << ")";
+  if (!IsDefinedBefore(a, b, ordering)) {
+    VLOG(4) << "a not defined before b";
+    return false;
+  }
+
+  // Live-out values from the module can never have ranges strictly before any
+  // other value.
+  if (a.live_out_of_module()) {
+    VLOG(4) << "a is live out of module";
+    return false;
+  }
+
+  // Live-out values of computations can never have ranges strictly before any
+  // other value in the computation (including values nested in
+  // subcomputations).
+  if (a.live_out_of_computation() &&
+      call_graph_->InstructionIsNestedIn(b.defining_instruction(),
+                                         a.defining_instruction()->parent())) {
+    VLOG(4) << "a is live out of computation containing b";
+    return false;
+  }
+
+  // All uses of 'a' must be before 'b' is defined.
+  for (const HloUse& use : a.uses()) {
+    if (!UseIsBeforeValueDefinition(use, b, ordering)) {
+      VLOG(4) << "use of a (" << use << ") not before b is defined";
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool HloDataflowAnalysis::MayInterfere(const HloValue& a, const HloValue& b,
+                                       const HloOrdering& ordering) const {
+  // Buffers without disjoint liveness may interfere.
+  return !LiveRangeStrictlyBefore(a, b, ordering) &&
+         !LiveRangeStrictlyBefore(b, a, ordering);
+}
+
 /* static */
 StatusOr<std::unique_ptr<HloDataflowAnalysis>> HloDataflowAnalysis::Run(
     HloModule* module, bool ssa_form, bool bitcast_defines_value) {
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
index 2f9b0a64be5a00f490e5fc678ac5589e374f80d7..4eb4f0bb16768bee9eaae8d19f578dad242dbb2e 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Analysis for determining the possible set of values for all locations
+// Analysis for determining the possible set of values for all positions
 // (instructions and ShapeIndexes) in the HLO module. Analysis is module-scoped
 // tracking values across computation boundaries.
 
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DATAFLOW_ANALYSIS_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DATAFLOW_ANALYSIS_H_
 
-#include <functional>
+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -28,222 +28,18 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
+#include "tensorflow/compiler/xla/service/hlo_value.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
-// Abstraction which identifies a specific point in the XLA graph. An
-// HloLocation specifies a ShapeIndex within the output of a specific
-// instruction.
-struct HloLocation {
-  HloInstruction* instruction;
-  ShapeIndex index;
-
-  string ToString() const;
-
-  bool operator==(const HloLocation& other) const {
-    return instruction == other.instruction && index == other.index;
-  }
-  bool operator!=(const HloLocation& other) const { return !(*this == other); }
-};
-
-std::ostream& operator<<(std::ostream& out, const HloLocation& location);
-
-// Defines a single use of an HLO value.
-struct HloUse {
-  // Instruction at which the value is used.
-  HloInstruction* instruction;
-
-  // The operand number in which the value is appears.
-  int64 operand_number;
-
-  // The shape index within the operand in which the value appears.
-  ShapeIndex operand_index;
-
-  string ToString() const;
-
-  bool operator==(const HloUse& other) const {
-    return instruction == other.instruction &&
-           operand_number == other.operand_number &&
-           operand_index == other.operand_index;
-  }
-
-  bool operator!=(const HloUse& other) const { return !(*this == other); }
-};
-
-std::ostream& operator<<(std::ostream& out, const HloUse& use);
-
-// Class describing a value used by the dataflow analysis. XLA arrays are
-// trivially a single HloValue. Tuples are made up of more than one HloValue: an
-// HloValue for the pointer vector, and an HloValue for each child element.
-//
-// Every HloValue is defined by a particular instruction and most instructions
-// define only a single HloValue. Instructions which define a single HloValue
-// include array-shaped instructions such as Add but also includes Tuple-shaped
-// instructions such as Tuple. The Tuple instruction defines a single HloValue
-// which is a vector of pointers to the values containing the Tuple
-// instruction's operands. Though the result of the Tuple instruction includes
-// multiple values only the top-level HloValue (the vector of pointers) is
-// defined by the Tuple instruction. The values containing the tuple elements
-// are defined by earlier instructions, usually the operands of the Tuple
-// instruction.
-//
-// Instructions which construct both the tuple *and* the tuple elements define
-// more than one HloValue. This includes (at least) tuple-shaped Constant,
-// Parameter, Infeed and While instructions. These tuple-shaped instructions do
-// not assemble a tuple from existing HloValues like the Tuple instruction does,
-// but rather define all the HloValues in the tuple.
-class HloValue {
- public:
-  using Id = int64;
-
-  // Construct an HloValue defined by 'instruction' at shape index 'index'. If
-  // is_phi is true, then this value is a phi value, for example, at the
-  // parameter of a while body computation. Phi values are only used in the SSA
-  // dataflow analysis (HloDataflowAnalysis::ssa_form_ is true).
-  HloValue(HloValue::Id id, HloInstruction* instruction,
-           const ShapeIndex& index, bool is_phi = false);
-
-  // Return a unique identifier for this HloValue. This value is used for stable
-  // sorting and iteration
-  Id id() const { return id_; }
-
-  // Returns whether this value is a phi value.
-  bool is_phi() const { return is_phi_; }
-
-  // Return the location where this value is defined.
-  const HloLocation& DefinitionLocation() const { return locations_[0]; }
-
-  // Return the instruction which defines this HloValue.
-  HloInstruction* instruction() const {
-    return DefinitionLocation().instruction;
-  }
-
-  // Return the shape index at which this HloValue is defined in the output of
-  // instruction().
-  const ShapeIndex& index() const { return DefinitionLocation().index; }
-
-  // Add or remove a location at which the HloValue appears. The definition
-  // location can not be removed. The uses of the HloValue are updated.
-  void AddLocation(HloInstruction* instruction, const ShapeIndex& index);
-  void RemoveLocation(HloInstruction* instruction, const ShapeIndex& index);
-
-  // Return all locations of the HloValue in the module.
-  const std::vector<HloLocation>& locations() const { return locations_; }
-
-  // Return all uses of the HloValue.
-  const std::vector<HloUse>& uses() const { return uses_; }
-
-  // Set/get whether this HloValue is live out of the module.
-  bool live_out_of_module() const { return live_out_of_module_; }
-
-  bool operator==(const HloValue& other) const;
-  bool operator!=(const HloValue& other) const;
-
-  // Return a single-line string representation of the value.
-  string ToShortString() const;
-
-  string ToString(int indent = 0) const;
-
- private:
-  // Unique identifier for this HloValue. Used for stable sorting and iteration.
-  const Id id_;
-
-  // Whether this instruction is a phi value.
-  const bool is_phi_;
-
-  // The set of locations of this HloValue. The first element is always the
-  // location of the definition.
-  std::vector<HloLocation> locations_;
-
-  // The set of uses of this HloValue.
-  std::vector<HloUse> uses_;
-
-  // Whether this value is live out of the HLO module.
-  bool live_out_of_module_ = false;
-};
-
-std::ostream& operator<<(std::ostream& out, const HloValue& hlo_value);
-
-// A class representing the possible set of HloValues at a particular point
-// (shape index in the output of an instruction) in the XLA graph. This set
-// contains the set of reaching HloValue definitions. For a simple array-shaped
-// instruction like Add, the HloValueSet of the top-level of the instruction's
-// output trivially contains only the HloValue defined by the instruction. For
-// instructions which have non-trivial dataflow such as Tuple or Select, the
-// HloValueSets of the instruction's output contains one or more HloValues
-// defined by the instruction's operands or defined further up in the XLA graph.
-class HloValueSet {
- public:
-  HloValueSet() = default;
-
-  explicit HloValueSet(tensorflow::gtl::ArraySlice<HloValue::Id> value_ids)
-      : value_ids_(value_ids.begin(), value_ids.end()) {
-    SortAndUniquifyValues();
-  }
-
-  // Return the union of the given HloValueSets.
-  static HloValueSet Union(
-      tensorflow::gtl::ArraySlice<const HloValueSet*> inputs);
-
-  // Return the vector of the IDs of all HloValues in the set. Values in the
-  // vector are unique and sorted.
-  const std::vector<HloValue::Id>& value_ids() const { return value_ids_; }
-
-  // Return the unique HLO value in the set. CHECKs if the set does not contain
-  // exactly one value.
-  HloValue::Id GetUniqueValueId() const {
-    CHECK_EQ(value_ids().size(), 1);
-    return value_ids()[0];
-  }
-
-  bool operator==(const HloValueSet& other) const {
-    return value_ids() == other.value_ids();
-  }
-  bool operator!=(const HloValueSet& other) const { return !(*this == other); }
-
-  string ToString() const;
-
- private:
-  // Sorts value_ and removes duplicates. This should be called after adding any
-  // elements to values_.
-  void SortAndUniquifyValues();
-
-  // HloValues sorted by HloValue::Id.
-  std::vector<HloValue::Id> value_ids_;
-};
-
-std::ostream& operator<<(std::ostream& out, const HloValueSet& hlo_value);
-
-// A class collecting the HloValues which might be contained in the output of
-// an HLO instruction. For array-shaped instructions, an InstructionValueSet
-// trivially holds a single HloValueSet. Tuple-shaped InstructionValueSets
-// hold multiple HloValueSets.
-class InstructionValueSet : public ShapeTree<HloValueSet> {
- public:
-  InstructionValueSet(const Shape& shape) : ShapeTree<HloValueSet>(shape) {}
-
-  // Return the union of the given InstructionValueSets.
-  static InstructionValueSet Union(
-      tensorflow::gtl::ArraySlice<const InstructionValueSet*> inputs);
-
-  string ToString() const;
-};
-
-std::ostream& operator<<(std::ostream& out,
-                         const InstructionValueSet& instruction_value_set);
-
 // Analysis which identifies all HLO values and their uses in an HLO module.
 class HloDataflowAnalysis {
  public:
@@ -298,17 +94,28 @@ class HloDataflowAnalysis {
   // shape index. CHECKs if the value set does not contain a exactly one value.
   const HloValue& GetUniqueValueAt(const HloInstruction* instruction,
                                    const ShapeIndex& index = {}) const {
-    return GetValue(GetValueSet(instruction, index).GetUniqueValueId());
+    return GetValueSet(instruction, index).GetUniqueValue();
   }
   HloValue& GetUniqueValueAt(const HloInstruction* instruction,
                              const ShapeIndex& index = {}) {
-    return GetValue(GetValueSet(instruction, index).GetUniqueValueId());
+    return GetValue(GetValueSet(instruction, index).GetUniqueValue().id());
   }
 
   // Return the HloValue with the given Id.
   const HloValue& GetValue(HloValue::Id value_id) const;
   HloValue& GetValue(HloValue::Id value_id);
 
+  // Returns whether the given values interfere assuming the given HLO
+  // ordering. Two values interfere if they may both be simultaneously live.
+  bool MayInterfere(const HloValue& a, const HloValue& b,
+                    const HloOrdering& ordering) const;
+
+  // Overload which takes HloValue:Ids.
+  bool MayInterfere(HloValue::Id a, HloValue::Id b,
+                    const HloOrdering& ordering) const {
+    return MayInterfere(GetValue(a), GetValue(b), ordering);
+  }
+
   // Return the total number of HloValues.
   int64 value_count() const { return values_.size(); }
 
@@ -323,10 +130,9 @@ class HloDataflowAnalysis {
   HloDataflowAnalysis(HloModule* module, bool ssa_form,
                       bool bitcast_defines_value = false);
 
-  // Creates a new HloValue defined at the given instruction and shape index and
-  // return its ID.
-  HloValue::Id NewHloValue(HloInstruction* instruction, const ShapeIndex& index,
-                           bool is_phi = false);
+  // Returns a new HloValue defined at the given instruction and shape index.
+  HloValue* NewHloValue(HloInstruction* instruction, const ShapeIndex& index,
+                        bool is_phi = false);
 
   // Delete the HloValue with the given ID.
   void DeleteHloValue(HloValue::Id value_id);
@@ -363,24 +169,40 @@ class HloDataflowAnalysis {
       tensorflow::gtl::ArraySlice<const InstructionValueSet*> inputs,
       bool skip_top_level = false);
 
-  // Updates the locations of the HloValues in the output of the given
+  // Updates the positions of the HloValues in the output of the given
   // instruction. This should be called after the instruction value set of
   // 'instruction' has been changed. 'prev_value_set' must point to the previous
   // state of the value set prior to the change. 'prev_value_set' may be null if
-  // this is the first time locations are being computed. The previous state is
-  // necessary to efficiently remove locations which have been eliminated due to
+  // this is the first time positions are being computed. The previous state is
+  // necessary to efficiently remove positions which have been eliminated due to
   // changes in the instructions' InstructionValueSet.
-  void UpdateLocationsOfValuesAt(
+  void UpdatePositionsOfValuesAt(
       HloInstruction* instruction, const InstructionValueSet& new_value_set,
       const InstructionValueSet* prev_value_set = nullptr);
 
+  // Returns true if the live range of the given value 'a' is strictly before
+  // the live range of value 'b' using the given HLO ordering.
+  bool LiveRangeStrictlyBefore(const HloValue& a, const HloValue& b,
+                               const HloOrdering& ordering) const;
+
+  // Returns whether the value 'a' is defined before the value 'b' under the
+  // given ordering.
+  bool IsDefinedBefore(const HloValue& a, const HloValue& b,
+                       const HloOrdering& ordering) const;
+
+  // Returns whether the given use is before the given value definition.
+  bool UseIsBeforeValueDefinition(const HloUse& use, const HloValue& value,
+                                  const HloOrdering& ordering) const;
+
   HloModule* const module_;
   const bool ssa_form_;
   const bool bitcast_defines_value_;
 
   std::unique_ptr<CallGraph> call_graph_;
 
-  // The map of all HloValues in the module.
+  // The map of all HloValues in the module. We pass around pointers to the
+  // mapped HloValues, so the underlying container must keep them valid despite
+  // mutations touching other map entries.
   std::unordered_map<HloValue::Id, HloValue> values_;
 
   // A map from instruction to InstructionValueSet.
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index 21344af5f224843a857984162a36b8a09915e607..2b685e355f0ce4d856639c29c3b1b254b068ef7b 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -39,46 +39,58 @@ using ::testing::UnorderedElementsAre;
 class HloDataflowAnalysisTest : public HloTestBase,
                                 public ::testing::WithParamInterface<bool> {
  protected:
-  HloDataflowAnalysisTest() : module_(TestName()) {}
+  HloDataflowAnalysisTest() : module_(CreateNewModule()) {}
 
   // Run dataflow analysis on the member module. For convenience returns a
   // reference to the generated analysis stored in analysis_.
   const HloDataflowAnalysis& RunAnalysis(bool ssa_form,
                                          bool bitcast_defines_value = false) {
     analysis_ =
-        HloDataflowAnalysis::Run(&module_, ssa_form, bitcast_defines_value)
+        HloDataflowAnalysis::Run(module_.get(), ssa_form, bitcast_defines_value)
             .ConsumeValueOrDie();
     return *analysis_;
   }
 
-  // Return a vector of the HloValues at the given program location.
+  // Return a vector of the HloValues at the given program position.
   std::vector<HloValue> HloValuesAt(const HloInstruction* instruction,
                                     const ShapeIndex& index = {}) {
     CHECK(analysis_ != nullptr);
     std::vector<HloValue> values;
-    for (HloValue::Id value_id :
-         analysis_->GetValueSet(instruction, index).value_ids()) {
-      values.push_back(analysis_->GetValue(value_id));
+    for (const HloValue* value :
+         analysis_->GetValueSet(instruction, index).values()) {
+      values.push_back(*value);
     }
     return values;
   }
 
-  HloModule module_;
+  // Returns true if the top-level values for instructions 'a' and 'b' may
+  // interfere. Precondition: 'a' and 'b' define array-shaped values.
+  bool InstructionsMayInterfere(const HloOrdering& ordering,
+                                const HloInstruction* a,
+                                const HloInstruction* b) {
+    EXPECT_FALSE(ShapeUtil::IsTuple(a->shape()));
+    EXPECT_FALSE(ShapeUtil::IsTuple(b->shape()));
+    return analysis_->MayInterfere(analysis_->GetValueDefinedAt(a),
+                                   analysis_->GetValueDefinedAt(b), ordering);
+  }
+
+  std::unique_ptr<HloModule> module_;
   std::unique_ptr<HloDataflowAnalysis> analysis_;
 
   const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {});
+  const Shape vector_shape_ = ShapeUtil::MakeShape(F32, {42});
 };
 
 TEST_P(HloDataflowAnalysisTest, BinaryOperation) {
   // Test the dataflow for a simple binary operation (Add).
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape_, HloOpcode::kAdd, constant1, constant2));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -89,14 +101,14 @@ TEST_P(HloDataflowAnalysisTest, BinaryOperation) {
   EXPECT_TRUE(analysis.ValueIsDefinedAt(constant2));
   EXPECT_TRUE(analysis.ValueIsDefinedAt(add));
 
-  // Verify the locations of the values. These locations are all trivial because
+  // Verify the positions of the values. These positions are all trivial because
   // there are no instructions which forward values.
-  EXPECT_THAT(analysis.GetValueDefinedAt(constant1).locations(),
-              UnorderedElementsAre(HloLocation{constant1, {}}));
-  EXPECT_THAT(analysis.GetValueDefinedAt(constant2).locations(),
-              UnorderedElementsAre(HloLocation{constant2, {}}));
-  EXPECT_THAT(analysis.GetValueDefinedAt(add).locations(),
-              UnorderedElementsAre(HloLocation{add, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant1).positions(),
+              UnorderedElementsAre(HloPosition{constant1, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant2).positions(),
+              UnorderedElementsAre(HloPosition{constant2, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(add).positions(),
+              UnorderedElementsAre(HloPosition{add, {}}));
 
   // Verify the uses of the values.
   EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(),
@@ -126,7 +138,7 @@ TEST_P(HloDataflowAnalysisTest, TupleAndGtes) {
       HloInstruction::CreateGetTupleElement(scalar_shape_, tuple, 1));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(scalar_shape_, HloOpcode::kAdd, gte0, gte1));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -143,42 +155,36 @@ TEST_P(HloDataflowAnalysisTest, TupleAndGtes) {
   EXPECT_FALSE(analysis.ValueIsDefinedAt(gte1));
   EXPECT_TRUE(analysis.ValueIsDefinedAt(add));
 
-  // Verify the locations of the values.
+  // Verify the positions of the values.
   EXPECT_THAT(
-      analysis.GetValueDefinedAt(param0).locations(),
-      UnorderedElementsAre(HloLocation{param0, {}}, HloLocation{tuple, {0}},
-                           HloLocation{gte0, {}}));
+      analysis.GetValueDefinedAt(param0).positions(),
+      UnorderedElementsAre(HloPosition{param0, {}}, HloPosition{tuple, {0}},
+                           HloPosition{gte0, {}}));
   EXPECT_THAT(
-      analysis.GetValueDefinedAt(param1).locations(),
-      UnorderedElementsAre(HloLocation{param1, {}}, HloLocation{tuple, {1}},
-                           HloLocation{gte1, {}}));
-  EXPECT_THAT(analysis.GetValueDefinedAt(tuple).locations(),
-              UnorderedElementsAre(HloLocation{tuple, {}}));
+      analysis.GetValueDefinedAt(param1).positions(),
+      UnorderedElementsAre(HloPosition{param1, {}}, HloPosition{tuple, {1}},
+                           HloPosition{gte1, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(tuple).positions(),
+              UnorderedElementsAre(HloPosition{tuple, {}}));
 
   // Verify uses. Of interest is that a GetTupleElement instruction is only a
   // use of the top-level value in the tuple operand.
   EXPECT_THAT(analysis.GetValueDefinedAt(param0).uses(),
-              UnorderedElementsAre(HloUse{tuple, 0, {}}, HloUse{add, 0, {}}));
+              UnorderedElementsAre(HloUse{add, 0, {}}));
   EXPECT_THAT(analysis.GetValueDefinedAt(param1).uses(),
-              UnorderedElementsAre(HloUse{tuple, 1, {}}, HloUse{add, 1, {}}));
+              UnorderedElementsAre(HloUse{add, 1, {}}));
   EXPECT_THAT(analysis.GetValueDefinedAt(tuple, /*index=*/{}).uses(),
               UnorderedElementsAre(HloUse{gte0, 0, {}}, HloUse{gte1, 0, {}}));
   EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_module());
 }
 
 TEST_P(HloDataflowAnalysisTest, NestedTuple) {
-  // Verify the dataflow through a nested tuple of the following form for two
-  // constants %constant1 and %constant2:
-  //
-  // %nested_tuple = {{%constant1, %constant2},
-  //                  {%constant1, %constant2},
-  //                  %constant1}
-  //
+  // Verify the dataflow through a nested tuple.
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
   auto nested_tuple = builder.AddInstruction(
@@ -187,33 +193,30 @@ TEST_P(HloDataflowAnalysisTest, NestedTuple) {
       HloInstruction::CreateGetTupleElement(tuple->shape(), nested_tuple, 1));
   auto gte_out = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(scalar_shape_, gte_tuple, 0));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
 
   EXPECT_EQ(analysis.values().size(), 4);
 
-  // Verify locations and uses.
+  // Verify positions and uses.
   EXPECT_THAT(
-      analysis.GetValueDefinedAt(constant1).locations(),
+      analysis.GetValueDefinedAt(constant1).positions(),
       UnorderedElementsAre(
-          HloLocation{constant1, {}}, HloLocation{tuple, {0}},
-          HloLocation{nested_tuple, {0, 0}}, HloLocation{nested_tuple, {1, 0}},
-          HloLocation{nested_tuple, {2}}, HloLocation{gte_tuple, {0}},
-          HloLocation{gte_out, {}}));
-  EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(),
-              UnorderedElementsAre(
-                  HloUse{tuple, 0, {}}, HloUse{nested_tuple, 0, {0}},
-                  HloUse{nested_tuple, 1, {0}}, HloUse{nested_tuple, 2, {}}));
-  EXPECT_THAT(
-      analysis.GetValueDefinedAt(constant2).uses(),
-      UnorderedElementsAre(HloUse{tuple, 1, {}}, HloUse{nested_tuple, 0, {1}},
-                           HloUse{nested_tuple, 1, {1}}));
+          HloPosition{constant1, {}}, HloPosition{tuple, {0}},
+          HloPosition{nested_tuple, {0, 0}}, HloPosition{nested_tuple, {1, 0}},
+          HloPosition{nested_tuple, {2}}, HloPosition{gte_tuple, {0}},
+          HloPosition{gte_out, {}}));
+  // Constant values should have no uses though one is live out. The positions
+  // where they appear as operands are on instructions which do not use the
+  // values (eg, Tuple).
+  EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).uses().empty());
+  EXPECT_TRUE(analysis.GetValueDefinedAt(constant2).uses().empty());
+
+  // The top-level tuple values are used in GTE instructions.
   EXPECT_THAT(analysis.GetValueDefinedAt(tuple, /*index=*/{}).uses(),
-              UnorderedElementsAre(HloUse{nested_tuple, 0, {}},
-                                   HloUse{nested_tuple, 1, {}},
-                                   HloUse{gte_out, 0, {}}));
+              UnorderedElementsAre(HloUse{gte_out, 0, {}}));
   EXPECT_THAT(analysis.GetValueDefinedAt(nested_tuple, /*index=*/{}).uses(),
               UnorderedElementsAre(HloUse{gte_tuple, 0, {}}));
 
@@ -236,16 +239,16 @@ TEST_P(HloDataflowAnalysisTest, SingleCall) {
   auto add = subbuilder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape_, HloOpcode::kAdd, subparam0, subparam1));
   HloComputation* called_computation =
-      module_.AddEmbeddedComputation(subbuilder.Build());
+      module_->AddEmbeddedComputation(subbuilder.Build());
 
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto call = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {constant1, constant2}, called_computation));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -268,11 +271,12 @@ TEST_P(HloDataflowAnalysisTest, SingleCall) {
   EXPECT_EQ(analysis.GetUniqueValueAt(call), analysis.GetValueDefinedAt(add));
 
   EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(),
-              UnorderedElementsAre(HloUse{add, 0, {}}, HloUse{call, 0, {}}));
+              UnorderedElementsAre(HloUse{add, 0, {}}));
   EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(),
-              UnorderedElementsAre(HloUse{add, 1, {}}, HloUse{call, 1, {}}));
+              UnorderedElementsAre(HloUse{add, 1, {}}));
 
   EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_module());
+  EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_computation());
 }
 
 TEST_P(HloDataflowAnalysisTest, ComputationCalledTwiceWithSameArguments) {
@@ -285,20 +289,20 @@ TEST_P(HloDataflowAnalysisTest, ComputationCalledTwiceWithSameArguments) {
   auto add = subbuilder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape_, HloOpcode::kAdd, subparam0, subparam1));
   HloComputation* called_computation =
-      module_.AddEmbeddedComputation(subbuilder.Build());
+      module_->AddEmbeddedComputation(subbuilder.Build());
 
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto call1 = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {constant1, constant2}, called_computation));
   auto call2 = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {constant1, constant2}, called_computation));
   auto sub = builder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape_, HloOpcode::kSubtract, call1, call2));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -316,17 +320,18 @@ TEST_P(HloDataflowAnalysisTest, ComputationCalledTwiceWithSameArguments) {
   EXPECT_TRUE(analysis.ValueIsDefinedAt(sub));
 
   EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(),
-              UnorderedElementsAre(HloUse{add, 0, {}}, HloUse{call1, 0, {}},
-                                   HloUse{call2, 0, {}}));
+              UnorderedElementsAre(HloUse{add, 0, {}}));
   EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(),
-              UnorderedElementsAre(HloUse{add, 1, {}}, HloUse{call1, 1, {}},
-                                   HloUse{call2, 1, {}}));
+              UnorderedElementsAre(HloUse{add, 1, {}}));
   // The Add from the subcomputation is used as both operands of the Subtract.
   EXPECT_THAT(analysis.GetValueDefinedAt(add).uses(),
               UnorderedElementsAre(HloUse{sub, 0, {}}, HloUse{sub, 1, {}}));
 
   EXPECT_FALSE(analysis.GetValueDefinedAt(add).live_out_of_module());
+  EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_computation());
+
   EXPECT_TRUE(analysis.GetValueDefinedAt(sub).live_out_of_module());
+  EXPECT_TRUE(analysis.GetValueDefinedAt(sub).live_out_of_computation());
 }
 
 TEST_P(HloDataflowAnalysisTest, ComputationCalledTwiceWithDifferentArguments) {
@@ -339,18 +344,18 @@ TEST_P(HloDataflowAnalysisTest, ComputationCalledTwiceWithDifferentArguments) {
   auto add = subbuilder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape_, HloOpcode::kAdd, subparam0, subparam1));
   HloComputation* called_computation =
-      module_.AddEmbeddedComputation(subbuilder.Build());
+      module_->AddEmbeddedComputation(subbuilder.Build());
 
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto call1 = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {constant1, constant2}, called_computation));
   auto call2 = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {call1, constant2}, called_computation));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -392,7 +397,7 @@ TEST_P(HloDataflowAnalysisTest, NestedCalls) {
   auto add = inner_builder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape_, HloOpcode::kAdd, inner_param0, inner_param1));
   HloComputation* inner_computation =
-      module_.AddEmbeddedComputation(inner_builder.Build());
+      module_->AddEmbeddedComputation(inner_builder.Build());
 
   auto outer_builder = HloComputation::Builder("OuterComputation");
   auto outer_param0 = outer_builder.AddInstruction(
@@ -400,19 +405,19 @@ TEST_P(HloDataflowAnalysisTest, NestedCalls) {
   auto outer_param1 = outer_builder.AddInstruction(
       HloInstruction::CreateParameter(1, scalar_shape_, "param1"));
   // Swizzle parameters.
-  auto nested_call = outer_builder.AddInstruction(HloInstruction::CreateCall(
+  outer_builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {outer_param1, outer_param0}, inner_computation));
   HloComputation* outer_computation =
-      module_.AddEmbeddedComputation(outer_builder.Build());
+      module_->AddEmbeddedComputation(outer_builder.Build());
 
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
-  auto call = builder.AddInstruction(HloInstruction::CreateCall(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
+  builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {constant1, constant2}, outer_computation));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -423,14 +428,10 @@ TEST_P(HloDataflowAnalysisTest, NestedCalls) {
 
   // Verify that the uses of the constants are properly swizzled by parameter
   // permutation in nested_call.
-  EXPECT_THAT(
-      analysis.GetValueDefinedAt(constant1).uses(),
-      UnorderedElementsAre(HloUse{call, 0, {}}, HloUse{nested_call, 1, {}},
-                           HloUse{add, 1, {}}));
-  EXPECT_THAT(
-      analysis.GetValueDefinedAt(constant2).uses(),
-      UnorderedElementsAre(HloUse{call, 1, {}}, HloUse{nested_call, 0, {}},
-                           HloUse{add, 0, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(),
+              UnorderedElementsAre(HloUse{add, 1, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(),
+              UnorderedElementsAre(HloUse{add, 0, {}}));
 
   EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_module());
 }
@@ -465,33 +466,37 @@ TEST_P(HloDataflowAnalysisTest, SingleWhile) {
       HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1));
   auto add = body_builder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape_, HloOpcode::kAdd, body_element_0, body_element_1));
-  auto body_tuple = body_builder.AddInstruction(
+  body_builder.AddInstruction(
       HloInstruction::CreateTuple({body_element_0, add}));
-  HloComputation* body = module_.AddEmbeddedComputation(body_builder.Build());
+  HloComputation* body = module_->AddEmbeddedComputation(body_builder.Build());
 
   // Condition computation trivially returns a constant "false".
   auto cond_builder = HloComputation::Builder("condition");
   auto cond_param = cond_builder.AddInstruction(
       HloInstruction::CreateParameter(0, tuple_shape, "param"));
-  cond_builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  auto cond_constant = cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   HloComputation* condition =
-      module_.AddEmbeddedComputation(cond_builder.Build());
+      module_->AddEmbeddedComputation(cond_builder.Build());
 
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
   auto xla_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, body, tuple));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
 
+  EXPECT_TRUE(
+      analysis.GetValueDefinedAt(cond_constant).live_out_of_computation());
+  EXPECT_FALSE(analysis.GetValueDefinedAt(cond_constant).live_out_of_module());
+
   if (ssa_form) {
     // Element 0 of the tuple passed through the body so no phi value is
     // defined.
@@ -507,15 +512,17 @@ TEST_P(HloDataflowAnalysisTest, SingleWhile) {
     EXPECT_TRUE(analysis.ValueIsDefinedAt(cond_param, /*index=*/{1}));
     EXPECT_TRUE(analysis.GetValueDefinedAt(cond_param, /*index=*/{1}).is_phi());
 
-    EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(),
-                UnorderedElementsAre(HloUse{add, 0, {}}, HloUse{tuple, 0, {}},
-                                     HloUse{xla_while, 0, {0}},
-                                     HloUse{body_tuple, 0, {}}));
+    EXPECT_THAT(
+        analysis.GetValueDefinedAt(constant1).uses(),
+        UnorderedElementsAre(HloUse{add, 0, {}}, HloUse{xla_while, 0, {0}}));
 
     // Constant1 passes through the body and out of the module.
     EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module());
     EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{1})
                     .live_out_of_module());
+
+    EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_computation());
+    EXPECT_FALSE(analysis.GetValueDefinedAt(add).live_out_of_module());
   } else {
     // While instruction and subcomputation parameters should not define values
     // in non-ssa form.
@@ -528,6 +535,7 @@ TEST_P(HloDataflowAnalysisTest, SingleWhile) {
 
     EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module());
     EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_module());
+    EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_computation());
   }
 }
 
@@ -565,21 +573,21 @@ TEST_P(HloDataflowAnalysisTest, SequentialWhiles) {
       scalar_shape_, HloOpcode::kAdd, body_element_0, body_element_1));
   body_builder.AddInstruction(
       HloInstruction::CreateTuple({body_element_0, add}));
-  HloComputation* body = module_.AddEmbeddedComputation(body_builder.Build());
+  HloComputation* body = module_->AddEmbeddedComputation(body_builder.Build());
 
   auto cond_builder = HloComputation::Builder("condition");
   cond_builder.AddInstruction(
       HloInstruction::CreateParameter(0, tuple_shape, "param"));
   cond_builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   HloComputation* condition =
-      module_.AddEmbeddedComputation(cond_builder.Build());
+      module_->AddEmbeddedComputation(cond_builder.Build());
 
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
   auto xla_while0 = builder.AddInstruction(
@@ -588,7 +596,7 @@ TEST_P(HloDataflowAnalysisTest, SequentialWhiles) {
       HloInstruction::CreateWhile(tuple_shape, condition, body, xla_while0));
   auto xla_while2 = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, body, xla_while1));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -630,9 +638,9 @@ TEST_P(HloDataflowAnalysisTest, NestedWhiles) {
   cond_builder.AddInstruction(
       HloInstruction::CreateParameter(0, tuple_shape, "param"));
   cond_builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   HloComputation* condition =
-      module_.AddEmbeddedComputation(cond_builder.Build());
+      module_->AddEmbeddedComputation(cond_builder.Build());
 
   // Element 0 passes transparently through the body.
   auto inner_builder = HloComputation::Builder("inner_body");
@@ -647,7 +655,7 @@ TEST_P(HloDataflowAnalysisTest, NestedWhiles) {
   inner_builder.AddInstruction(
       HloInstruction::CreateTuple({inner_element_0, add}));
   HloComputation* inner_body =
-      module_.AddEmbeddedComputation(inner_builder.Build());
+      module_->AddEmbeddedComputation(inner_builder.Build());
 
   // Element 1 passes transparently through the body.
   auto outer_builder = HloComputation::Builder("outer_body");
@@ -664,18 +672,18 @@ TEST_P(HloDataflowAnalysisTest, NestedWhiles) {
   auto nested_while = outer_builder.AddInstruction(HloInstruction::CreateWhile(
       tuple_shape, condition, inner_body, outer_tuple));
   HloComputation* outer_body =
-      module_.AddEmbeddedComputation(outer_builder.Build());
+      module_->AddEmbeddedComputation(outer_builder.Build());
 
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
   auto entry_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, outer_body, tuple));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -751,26 +759,26 @@ TEST_P(HloDataflowAnalysisTest, SwizzlingWhile) {
       HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1));
   body_builder.AddInstruction(
       HloInstruction::CreateTuple({body_element_1, body_element_0}));
-  HloComputation* body = module_.AddEmbeddedComputation(body_builder.Build());
+  HloComputation* body = module_->AddEmbeddedComputation(body_builder.Build());
 
   auto cond_builder = HloComputation::Builder("condition");
   auto cond_param = cond_builder.AddInstruction(
       HloInstruction::CreateParameter(0, tuple_shape, "param"));
   cond_builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   HloComputation* condition =
-      module_.AddEmbeddedComputation(cond_builder.Build());
+      module_->AddEmbeddedComputation(cond_builder.Build());
 
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
   auto xla_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, body, tuple));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -817,15 +825,15 @@ TEST_P(HloDataflowAnalysisTest, ArraySelect) {
   // Test a kSelect of an array value.
   auto builder = HloComputation::Builder(TestName());
   auto pred = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto select = builder.AddInstruction(HloInstruction::CreateTernary(
       scalar_shape_, HloOpcode::kSelect, pred, constant1, constant2));
 
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -841,15 +849,15 @@ TEST_P(HloDataflowAnalysisTest, TupleSelect) {
   // instruction.
   auto builder = HloComputation::Builder(TestName());
   auto pred = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto constant3 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(3.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(3.0)));
   auto constant4 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(4.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(4.0)));
   auto tuple1 =
       builder.AddInstruction(HloInstruction::CreateTuple({constant1}));
   auto tuple2 =
@@ -868,7 +876,7 @@ TEST_P(HloDataflowAnalysisTest, TupleSelect) {
   auto select1234 = builder.AddInstruction(HloInstruction::CreateTernary(
       tuple_shape, HloOpcode::kSelect, pred, select12, select34));
 
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -899,31 +907,33 @@ TEST_P(HloDataflowAnalysisTest, TupleSelect) {
                                    analysis.GetValueDefinedAt(constant4)));
 
   EXPECT_THAT(
-      analysis.GetValueDefinedAt(constant1).uses(),
-      UnorderedElementsAre(HloUse{tuple1, 0, {}}, HloUse{select11, 1, {0}},
-                           HloUse{select11, 2, {0}}, HloUse{select12, 1, {0}},
-                           HloUse{select1234, 1, {0}}));
-  EXPECT_THAT(
-      analysis.GetValueDefinedAt(constant2).uses(),
-      UnorderedElementsAre(HloUse{tuple2, 0, {}}, HloUse{select12, 2, {0}},
-                           HloUse{select1234, 1, {0}}));
+      analysis.GetValueDefinedAt(tuple1, /*index=*/{}).uses(),
+      UnorderedElementsAre(HloUse{select11, 1, {}}, HloUse{select11, 2, {}},
+                           HloUse{select12, 1, {}}));
+
+  // The two constant values just pass through the Selects and are not
+  // used. They are live out however.
+  EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).uses().empty());
+  EXPECT_TRUE(analysis.GetValueDefinedAt(constant2).uses().empty());
+  EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module());
+  EXPECT_TRUE(analysis.GetValueDefinedAt(constant2).live_out_of_module());
 }
 
 TEST_P(HloDataflowAnalysisTest, NestedTupleSelect) {
   // Test kSelect of a nested tuple.
   auto builder = HloComputation::Builder(TestName());
   auto pred = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto constant3 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(3.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(3.0)));
   auto constant4 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(4.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(4.0)));
   auto constant5 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(5.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(5.0)));
   auto inner_tuple1 = builder.AddInstruction(
       HloInstruction::CreateTuple({constant2, constant3}));
   auto tuple1 = builder.AddInstruction(
@@ -935,7 +945,7 @@ TEST_P(HloDataflowAnalysisTest, NestedTupleSelect) {
   auto select = builder.AddInstruction(HloInstruction::CreateTernary(
       tuple1->shape(), HloOpcode::kSelect, pred, tuple1, tuple2));
 
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -993,24 +1003,24 @@ TEST_P(HloDataflowAnalysisTest, TupleSelectToWhile) {
       scalar_shape_, HloOpcode::kAdd, body_element_0, body_element_1));
   body_builder.AddInstruction(
       HloInstruction::CreateTuple({body_element_0, add}));
-  HloComputation* body = module_.AddEmbeddedComputation(body_builder.Build());
+  HloComputation* body = module_->AddEmbeddedComputation(body_builder.Build());
 
   auto cond_builder = HloComputation::Builder("condition");
   cond_builder.AddInstruction(
       HloInstruction::CreateParameter(0, tuple_shape, "param"));
   cond_builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   HloComputation* condition =
-      module_.AddEmbeddedComputation(cond_builder.Build());
+      module_->AddEmbeddedComputation(cond_builder.Build());
 
   auto pred = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto constant3 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(3.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(3.0)));
   auto tuple1 =
       builder.AddInstruction(HloInstruction::CreateTuple({constant1}));
   auto tuple2 =
@@ -1024,7 +1034,7 @@ TEST_P(HloDataflowAnalysisTest, TupleSelectToWhile) {
   auto xla_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple->shape(), condition, body, tuple));
 
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -1062,11 +1072,11 @@ TEST_P(HloDataflowAnalysisTest, BitcastDefinesValue) {
   // Test the bitcast_defines_value flag to the dataflow analysis.
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto bitcast = builder.AddInstruction(HloInstruction::CreateUnary(
       scalar_shape_, HloOpcode::kBitcast, constant));
 
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   bool ssa_form = GetParam();
   {
@@ -1102,7 +1112,7 @@ TEST_P(HloDataflowAnalysisTest, TupleCopy) {
       builder.AddInstruction(HloInstruction::CreateTuple({param0, param1}));
   auto copy = builder.AddInstruction(
       HloInstruction::CreateUnary(tuple->shape(), HloOpcode::kCopy, tuple));
-  module_.AddEntryComputation(builder.Build());
+  module_->AddEntryComputation(builder.Build());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -1126,6 +1136,352 @@ TEST_P(HloDataflowAnalysisTest, TupleCopy) {
       analysis.GetValueDefinedAt(copy, /*index=*/{}).live_out_of_module());
 }
 
+TEST_P(HloDataflowAnalysisTest, ElementwiseChainInterference) {
+  // A simple chain of elementwise operations. No values should interfere.
+  //
+  // param --> negate -> exp -> log
+  //
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, vector_shape_, "param"));
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(vector_shape_, HloOpcode::kNegate, param));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(vector_shape_, HloOpcode::kExp, negate));
+  auto log = builder.AddInstruction(
+      HloInstruction::CreateUnary(vector_shape_, HloOpcode::kLog, exp));
+
+  module_->AddEntryComputation(builder.Build());
+  RunAnalysis(GetParam());
+
+  DependencyHloOrdering ordering(module_.get());
+
+  // No values should interfere.
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, param, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, param, exp));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, param, log));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, negate, exp));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, negate, log));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, exp, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, exp, log));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, log, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, log, exp));
+
+  // Values should interfere with itself.
+  EXPECT_TRUE(InstructionsMayInterfere(ordering, exp, exp));
+}
+
+TEST_P(HloDataflowAnalysisTest, MultipleEntryParameters_Sequential) {
+  // Two entry params, which interfere with each other.
+  //
+  // param0 --> negate ---------------\
+  //                param1 --> exp --> add
+  auto builder = HloComputation::Builder(TestName());
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, vector_shape_, "param0"));
+  auto param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, vector_shape_, "param1"));
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(vector_shape_, HloOpcode::kNegate, param0));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(vector_shape_, HloOpcode::kExp, param1));
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      vector_shape_, HloOpcode::kAdd, negate, exp));
+
+  auto entry = module_->AddEntryComputation(builder.Build());
+  RunAnalysis(GetParam());
+
+  SequentialHloOrdering::HloModuleSequence sequence;
+  sequence.insert({entry, {param0, negate, param1, exp, add}});
+  SequentialHloOrdering ordering(module_.get(), sequence);
+
+  // Entry parameters interfere as if they are defined simultaneously at
+  // the very beginning.
+  EXPECT_TRUE(InstructionsMayInterfere(ordering, param0, param1));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, param0, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, param0, exp));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, param0, add));
+  EXPECT_TRUE(InstructionsMayInterfere(ordering, param1, param0));
+  EXPECT_TRUE(InstructionsMayInterfere(ordering, param1, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, param1, exp));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, param1, add));
+
+  // Negate and exp still interfere.
+  EXPECT_TRUE(InstructionsMayInterfere(ordering, negate, exp));
+  EXPECT_TRUE(InstructionsMayInterfere(ordering, exp, negate));
+
+  // But {negate, add} and {exp, add} don't interfere.
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, negate, add));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, add, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, exp, add));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, add, exp));
+}
+
+TEST_P(HloDataflowAnalysisTest, WhileParameters_Sequential) {
+  // Similar to MultipleEntryParameters_Sequential, but the parameter is of
+  // while body computation. Body computation in the sequential order:
+  //
+  //  %constant = Constant(...)
+  //  %exp = Exp(%constant)
+  //  %param = Param(0)
+  //  %add = Add(%param, %exp)  ;; Root of body
+  //  %dead_constant = Constant(...)
+  //  %dead_negate = Negate(%dead_constant)
+  //
+  // %constant and its only use %exp are ordered before 'param'. However, the
+  // %constant and %param values still interfere because the parameter is
+  // considered live into the while body.
+  //
+  // Similarly, %dead_constant and %dead_negate are ordered after the root of
+  // the body computation %add. However, %add is liveout of the computation so
+  // %dead_constant and %add interfere.
+  auto body_builder = HloComputation::Builder(TestName());
+  auto body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "body_param"));
+  auto constant = body_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto exp = body_builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kExp, constant));
+  auto add = body_builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kAdd, exp, body_param));
+  auto dead_constant = body_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto dead_negate = body_builder.AddInstruction(HloInstruction::CreateUnary(
+      scalar_shape_, HloOpcode::kNegate, dead_constant));
+  HloComputation* body = module_->AddEmbeddedComputation(
+      body_builder.Build(/*root_instruction=*/add));
+
+  auto cond_builder = HloComputation::Builder("condition");
+  auto cond_param = cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "cond_param"));
+  auto cond_constant = cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  HloComputation* condition =
+      module_->AddEmbeddedComputation(cond_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  auto xla_while = builder.AddInstruction(
+      HloInstruction::CreateWhile(scalar_shape_, condition, body, param));
+
+  auto entry = module_->AddEntryComputation(builder.Build());
+  bool ssa_form = GetParam();
+  const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
+
+  SequentialHloOrdering::HloModuleSequence sequence;
+  sequence.insert({entry, {param, xla_while}});
+  sequence.insert({condition, {cond_param, cond_constant}});
+  // Construct the order such that 'constant' and its use 'exp' are before
+  // body_param.
+  sequence.insert({body, {constant, exp, body_param, add}});
+
+  SequentialHloOrdering ordering(module_.get(), sequence);
+
+  // 'add' is the body root even though later instructions follow in the order
+  // like 'dead_negate'. Only 'add' should be live out of the computation.
+  EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_computation());
+  EXPECT_FALSE(
+      analysis.GetValueDefinedAt(dead_negate).live_out_of_computation());
+
+  // 'add' is live out of the body and will interfere with an later instructions
+  // such as 'dead_constant' and 'dead_negate'.
+  EXPECT_TRUE(InstructionsMayInterfere(ordering, add, dead_constant));
+  EXPECT_TRUE(InstructionsMayInterfere(ordering, add, dead_negate));
+
+  // The remaining checks test phi values defined by body and condition
+  // parameters which only occur in the SSA form of the analysis.
+  if (ssa_form) {
+    // Though the ordering suggests 'constant' and 'param' should not interfere,
+    // 'param' is live in and thus interferes with any earlier instruction of
+    // the computation in the order (eg 'constant')'
+    EXPECT_TRUE(InstructionsMayInterfere(ordering, body_param, constant));
+    EXPECT_TRUE(InstructionsMayInterfere(ordering, body_param, exp));
+    EXPECT_FALSE(InstructionsMayInterfere(ordering, body_param, add));
+
+    // The following values end up in the same buffer:
+    //  (1) the init value: 'param'
+    //  (2) the body parameter: 'body_param'
+    //  (3) the condition parameter: 'cond_param'
+    //  (4) the root value of the while body: 'add'
+    //  (5) the while value: 'xla_while'
+    // None should interfere.
+    EXPECT_FALSE(InstructionsMayInterfere(ordering, param, body_param));
+    EXPECT_FALSE(InstructionsMayInterfere(ordering, param, cond_param));
+    EXPECT_FALSE(InstructionsMayInterfere(ordering, param, add));
+    EXPECT_FALSE(InstructionsMayInterfere(ordering, param, xla_while));
+
+    EXPECT_FALSE(InstructionsMayInterfere(ordering, body_param, cond_param));
+    EXPECT_FALSE(InstructionsMayInterfere(ordering, body_param, add));
+    EXPECT_FALSE(InstructionsMayInterfere(ordering, body_param, xla_while));
+
+    EXPECT_FALSE(InstructionsMayInterfere(ordering, cond_param, add));
+    EXPECT_FALSE(InstructionsMayInterfere(ordering, cond_param, xla_while));
+
+    EXPECT_FALSE(InstructionsMayInterfere(ordering, add, xla_while));
+  }
+}
+
+TEST_P(HloDataflowAnalysisTest, NonElementwiseOperand) {
+  // A chain of operations with two elementwise and one non-elementwise. The
+  // elementwise op should not interfere with its operand, while the
+  // non-elementwise op should interfere. Entry params always interfere.
+  //
+  // param --> exp -> negate -> reverse
+  //
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, vector_shape_, "param"));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(vector_shape_, HloOpcode::kExp, param));
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(vector_shape_, HloOpcode::kNegate, exp));
+  auto reverse = builder.AddInstruction(
+      HloInstruction::CreateReverse(vector_shape_, negate, {0}));
+
+  module_->AddEntryComputation(builder.Build());
+  RunAnalysis(GetParam());
+
+  DependencyHloOrdering ordering(module_.get());
+
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, param, exp));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, param, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, param, reverse));
+
+  // Negate is elementwise, so doesn't interfere with its operand.
+  // Reverse is non-elementwise, so does interfere with its operand.
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, exp, negate));
+  EXPECT_TRUE(InstructionsMayInterfere(ordering, negate, reverse));
+}
+
+TEST_P(HloDataflowAnalysisTest, OverlappedValues) {
+  // Verify simultaneously live values interfere (exp and negate).
+  //
+  // param --> negate -> add
+  //     \---> exp -----/
+  //
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, vector_shape_, "param"));
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(vector_shape_, HloOpcode::kNegate, param));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(vector_shape_, HloOpcode::kExp, param));
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      vector_shape_, HloOpcode::kAdd, negate, exp));
+
+  module_->AddEntryComputation(builder.Build());
+  RunAnalysis(GetParam());
+
+  DependencyHloOrdering ordering(module_.get());
+
+  EXPECT_TRUE(InstructionsMayInterfere(ordering, param, negate));
+  EXPECT_TRUE(InstructionsMayInterfere(ordering, param, exp));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, param, add));
+
+  // Negate and exp interfere with each other, but not with add.
+  EXPECT_TRUE(InstructionsMayInterfere(ordering, negate, exp));
+  EXPECT_TRUE(InstructionsMayInterfere(ordering, exp, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, negate, add));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, add, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, exp, add));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, add, exp));
+}
+
+TEST_P(HloDataflowAnalysisTest, OverlappedValuesSequentialOrder) {
+  // Identical to the test OverlappedValue but using a sequential ordering of
+  // HLO instructions.
+  //
+  // param --> negate -> add
+  //     \---> exp -----/
+  //
+  // Sequential order:
+  //  param, negate, exp, add
+  //
+  // Liveness is identical to the DependencyHloOrdering.
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, vector_shape_, "param"));
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(vector_shape_, HloOpcode::kNegate, param));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(vector_shape_, HloOpcode::kExp, param));
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      vector_shape_, HloOpcode::kAdd, negate, exp));
+
+  auto entry = module_->AddEntryComputation(builder.Build());
+  RunAnalysis(GetParam());
+
+  SequentialHloOrdering::HloModuleSequence sequence;
+  std::vector<const HloInstruction*> order = {param, negate, exp, add};
+  sequence.emplace(entry, order);
+
+  SequentialHloOrdering ordering(module_.get(), sequence);
+
+  EXPECT_TRUE(InstructionsMayInterfere(ordering, param, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, param, exp));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, param, add));
+
+  // Negate and exp interfere with each other, but not with add.
+  EXPECT_TRUE(InstructionsMayInterfere(ordering, negate, exp));
+  EXPECT_TRUE(InstructionsMayInterfere(ordering, exp, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, negate, add));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, add, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, exp, add));
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, add, exp));
+}
+
+TEST_P(HloDataflowAnalysisTest, EmbeddedComputationInterference) {
+  // Test MayInterfere() for embedded computation, specifically the interference
+  // of values in different computations.
+  //
+  // embedded_computation:
+  //   %embedded_param = Param(0)
+  //   %embedded_log = Log(%embedded_param)
+  //
+  // entry computation:
+  //   %param = Param(0)
+  //   %negate = Negate(%param)
+  //   %exp = Negate(%exp)
+  //   %call = Call(embedded_computation, {%exp})
+  //   %add = Add(%negate, %call)
+  //
+  // Note %negate is live across the call and should interfere with all values
+  // in the embedded computation.
+  auto embedded_builder = HloComputation::Builder(TestName() + "_embedded");
+  auto embedded_param = embedded_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, vector_shape_, "embedded_param"));
+  auto embedded_log =
+      embedded_builder.AddInstruction(HloInstruction::CreateUnary(
+          vector_shape_, HloOpcode::kLog, embedded_param));
+  auto embedded_computation =
+      module_->AddEmbeddedComputation(embedded_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, vector_shape_, "param"));
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(vector_shape_, HloOpcode::kNegate, param));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(vector_shape_, HloOpcode::kExp, param));
+  auto call = builder.AddInstruction(
+      HloInstruction::CreateCall(vector_shape_, {exp}, embedded_computation));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      vector_shape_, HloOpcode::kAdd, negate, call));
+  module_->AddEntryComputation(builder.Build());
+  RunAnalysis(GetParam());
+
+  DependencyHloOrdering ordering(module_.get());
+
+  // Exp only use is the call so it should not interfere with values inside the
+  // embedded computation.
+  EXPECT_FALSE(InstructionsMayInterfere(ordering, exp, embedded_log));
+
+  // Negate is live across the call and should interfere with values in the
+  // embedded computation
+  EXPECT_TRUE(InstructionsMayInterfere(ordering, negate, embedded_log));
+}
+
 INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation,
                         HloDataflowAnalysisTest,
                         ::testing::Values(false, true));
diff --git a/tensorflow/compiler/xla/service/hlo_dce.cc b/tensorflow/compiler/xla/service/hlo_dce.cc
index 3755b9e4c005c5e50b149d8dc8c51363eb111868..5b2c57da4ff3a1f887f777c3304893d950b3d3a9 100644
--- a/tensorflow/compiler/xla/service/hlo_dce.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce.cc
@@ -38,6 +38,9 @@ StatusOr<bool> HloDCE::Run(HloModule* module) {
   bool changed = false;
 
   for (auto& computation : module->computations()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
     std::unordered_set<HloInstruction*> live_instructions;
     TF_RETURN_IF_ERROR(computation->root_instruction()->Accept(
         [&live_instructions](HloInstruction* instruction) {
diff --git a/tensorflow/compiler/xla/service/hlo_dce_test.cc b/tensorflow/compiler/xla/service/hlo_dce_test.cc
index 10cd7ca7c0990ab553c865da01b00475382316e2..704b8dfca700f7c4a00689593aea9743de1f817c 100644
--- a/tensorflow/compiler/xla/service/hlo_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce_test.cc
@@ -45,9 +45,9 @@ TEST_F(HloDceTest, NoDeadCode) {
   // Verify that no dead code is removed from a computation with no dead code.
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(123.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(123.0f)));
   builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, constant1, constant2));
 
@@ -98,9 +98,9 @@ TEST_F(HloDceTest, ControlDependencies) {
   // Verify that instructions with control dependencies are not removed.
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(123.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(123.0f)));
 
   // Create two dead instructions: a negate and an add.
   auto dead_negate = builder.AddInstruction(HloInstruction::CreateUnary(
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 3e7f5b1f3d97ace48fbc22b224667acebcc52093..a0c5cbe916050a8aa7849c3e37daad70bc8d6190 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -31,11 +31,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_query.h"
+#include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/bitmap.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -89,11 +91,11 @@ StatusOr<std::unique_ptr<Literal>> Compare(const Shape& shape, HloOpcode opcode,
                  << HloOpcodeString(opcode);
   }
 
-  auto result = LiteralUtil::CreateFromShape(shape);
-  TF_RETURN_IF_ERROR(LiteralUtil::Populate<bool>(
-      result.get(), [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
-        return compare_op(LiteralUtil::Get<OperandT>(lhs_literal, multi_index),
-                          LiteralUtil::Get<OperandT>(rhs_literal, multi_index));
+  auto result = Literal::CreateFromShape(shape);
+  TF_RETURN_IF_ERROR(result->Populate<bool>(
+      [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+        return compare_op(lhs_literal.Get<OperandT>(multi_index),
+                          rhs_literal.Get<OperandT>(multi_index));
       }));
 
   return std::move(result);
@@ -117,12 +119,11 @@ StatusOr<std::unique_ptr<Literal>> ElementWiseUnaryOpImpl(
         ShapeUtil::HumanString(operand->shape()).c_str());
   }
 
-  auto result = LiteralUtil::CreateFromShape(shape);
+  auto result = Literal::CreateFromShape(shape);
 
-  TF_RETURN_IF_ERROR(LiteralUtil::Populate<ReturnT>(
-      result.get(), [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
-        return unary_op(
-            LiteralUtil::Get<NativeT>(operand_literal, multi_index));
+  TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+      [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+        return unary_op(operand_literal.Get<NativeT>(multi_index));
       }));
   return std::move(result);
 }
@@ -168,6 +169,23 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return HandleAbs<ReturnT>(abs, operand);
   };
 
+  Status HandleBroadcast(HloInstruction* broadcast) override {
+    parent_->evaluated_[broadcast] =
+        Literal::CreateFromShape(broadcast->shape());
+    auto output = parent_->evaluated_[broadcast].get();
+    auto operand_to_broadcast =
+        parent_->GetEvaluatedLiteralFor(broadcast->operand(0));
+    std::vector<int64> broadcast_indices(
+        ShapeUtil::Rank(broadcast->operand(0)->shape()), 0);
+    return output->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+          for (int64 i = 0; i < broadcast->dimensions().size(); ++i) {
+            broadcast_indices[i] = multi_index[broadcast->dimensions(i)];
+          }
+          return operand_to_broadcast.Get<ReturnT>(broadcast_indices);
+        });
+  }
+
   Status HandleCeil(HloInstruction* ceil, HloInstruction* operand) override {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[ceil],
                         ElementWiseUnaryOp(ceil, [](ReturnT elem_operand) {
@@ -176,7 +194,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   };
 
-  Status HandleCopy(HloInstruction* copy, HloInstruction* operand) override {
+  Status HandleCopy(HloInstruction* copy) override {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[copy],
                         ElementWiseUnaryOp(copy, [](ReturnT elem_operand) {
                           return elem_operand;
@@ -184,42 +202,19 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   };
 
-  template <PrimitiveType src_type, PrimitiveType dest_type>
-  std::unique_ptr<Literal> ConvertIfTypesMatch(const Literal& src_literal) {
-    DCHECK_EQ(src_type, src_literal.shape().element_type());
-    return LiteralUtil::Convert<
-        typename primitive_util::PrimitiveTypeToNative<src_type>::type,
-        typename primitive_util::PrimitiveTypeToNative<dest_type>::type>(
-        src_literal);
-  }
-
-  Status HandleConvert(HloInstruction* convert,
-                       HloInstruction* operand) override {
-    auto operand_literal = parent_->GetEvaluatedLiteralFor(operand);
-
-    switch (operand->shape().element_type()) {
-#define CONVERT_IF_TYPES_MATCH(src_type)                                \
-  case (src_type):                                                      \
-    parent_->evaluated_[convert] = LiteralUtil::Convert<                \
-        typename primitive_util::PrimitiveTypeToNative<src_type>::type, \
-        ReturnT>(operand_literal);                                      \
-    break;
-      CONVERT_IF_TYPES_MATCH(PRED)
-      CONVERT_IF_TYPES_MATCH(S8)
-      CONVERT_IF_TYPES_MATCH(S32)
-      CONVERT_IF_TYPES_MATCH(S64)
-      CONVERT_IF_TYPES_MATCH(U8)
-      CONVERT_IF_TYPES_MATCH(U32)
-      CONVERT_IF_TYPES_MATCH(U64)
-      CONVERT_IF_TYPES_MATCH(F32)
-      CONVERT_IF_TYPES_MATCH(F64)
-#undef CONVERT_IF_TYPES_MATCH
-      // Other types are not yet supported.
-      default:
-        LOG(FATAL) << "unimplemented operand type for HandleCovert: "
-                   << PrimitiveType_Name(operand->shape().element_type());
+  Status HandleConvert(HloInstruction* convert) override {
+    const HloInstruction* operand = convert->operand(0);
+    TF_RET_CHECK(ShapeUtil::SameDimensions(operand->shape(), convert->shape()));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> result,
+                        parent_->GetEvaluatedLiteralFor(operand).Convert(
+                            convert->shape().element_type()));
+
+    if (LayoutUtil::LayoutsInShapesEqual(result->shape(), convert->shape())) {
+      parent_->evaluated_[convert] = std::move(result);
+    } else {
+      parent_->evaluated_[convert] =
+          result->Relayout(convert->shape().layout());
     }
-
     return Status::OK();
   }
 
@@ -322,8 +317,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   };
 
-  Status HandleMaximum(HloInstruction* maximum, HloInstruction* lhs,
-                       HloInstruction* rhs) override {
+  Status HandleMaximum(HloInstruction* maximum) override {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[maximum],
         ElementWiseBinaryOp(maximum, [](ReturnT lhs, ReturnT rhs) {
@@ -332,8 +326,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   };
 
-  Status HandleMinimum(HloInstruction* minimum, HloInstruction* lhs,
-                       HloInstruction* rhs) override {
+  Status HandleMinimum(HloInstruction* minimum) override {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[minimum],
         ElementWiseBinaryOp(minimum, [](ReturnT lhs_el, ReturnT rhs_el) {
@@ -409,6 +402,258 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   };
 
+  Status HandleConvolution(HloInstruction* conv, HloInstruction* lhs,
+                           HloInstruction* rhs, const Window& window) override {
+    CHECK(ShapeUtil::IsArray(lhs->shape()));
+    CHECK(ShapeUtil::IsArray(rhs->shape()));
+    CHECK(ShapeUtil::SameElementType(lhs->shape(), rhs->shape()));
+    CHECK(ShapeUtil::SameElementType(lhs->shape(), conv->shape()));
+    TF_CHECK_OK(ShapeUtil::ValidateShape(lhs->shape()));
+    TF_CHECK_OK(ShapeUtil::ValidateShape(rhs->shape()));
+
+    const auto& dnums = conv->convolution_dimension_numbers();
+    const int64 num_spatial_dims = dnums.spatial_dimensions_size();
+    CHECK_EQ(num_spatial_dims, dnums.kernel_spatial_dimensions_size());
+    CHECK_GE(num_spatial_dims, 1);
+    CHECK_EQ(window.dimensions_size(), num_spatial_dims);
+
+    CHECK_EQ(num_spatial_dims + 2, ShapeUtil::Rank(lhs->shape()));
+    CHECK_EQ(num_spatial_dims + 2, ShapeUtil::Rank(rhs->shape()));
+
+    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
+                        ShapeInference::InferConvolveShape(
+                            lhs->shape(), rhs->shape(), window, dnums));
+    CHECK(ShapeUtil::Compatible(conv->shape(), inferred_return_shape))
+        << "return shape set to: " << ShapeUtil::HumanString(conv->shape())
+        << " but is inferred to be: "
+        << ShapeUtil::HumanString(inferred_return_shape);
+
+    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
+    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
+
+    const auto lhs_rank = ShapeUtil::Rank(lhs->shape());
+    const auto rhs_rank = ShapeUtil::Rank(rhs->shape());
+
+    // Dimension number applicable for both input (lhs), and output.
+    const int64 batch_dim = dnums.batch_dimension();
+    const int64 z_dim = dnums.feature_dimension();
+    // Dimension number applicable for kernel (rhs).
+    const int64 kernel_input_z_dim = dnums.kernel_input_feature_dimension();
+    const int64 kernel_output_z_dim = dnums.kernel_output_feature_dimension();
+
+    const int64 z_size = ShapeUtil::GetDimension(lhs->shape(), z_dim);
+
+    std::vector<int64> window_dimension_sizes;
+    for (auto i : dnums.kernel_spatial_dimensions()) {
+      window_dimension_sizes.push_back(
+          ShapeUtil::GetDimension(rhs->shape(), i));
+    }
+
+    const Shape& window_shape = ShapeUtil::MakeShape(
+        rhs->shape().element_type(), window_dimension_sizes);
+
+    auto result = Literal::CreateFromShape(conv->shape());
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> out_index) {
+          ReturnT result_val = static_cast<ReturnT>(0);
+
+          std::vector<int64> lhs_index(lhs_rank, 0);
+          std::vector<int64> rhs_index(rhs_rank, 0);
+
+          lhs_index[batch_dim] = out_index[batch_dim];
+          rhs_index[kernel_output_z_dim] = out_index[z_dim];
+
+          std::vector<int64> rhs_spatial_index(
+              dnums.kernel_spatial_dimensions_size(), 0);
+
+          // Convolve input feature with kernel.
+          do {
+            for (int64 iz = 0; iz < z_size; ++iz) {
+              lhs_index[z_dim] = iz;
+              rhs_index[kernel_input_z_dim] = iz;
+
+              // Find corresponding spatial dimension index for input (lhs).
+              for (int64 ki = 0; ki < rhs_spatial_index.size(); ++ki) {
+                // Spatial dimension number for input (lhs) and output.
+                const int64 spatial_dim = dnums.spatial_dimensions(ki);
+
+                // Calculate lhs (input) index without taking base dilation into
+                // account.
+                const int64 undilated_index =
+                    out_index[spatial_dim] * window.dimensions(ki).stride() -
+                    window.dimensions(ki).padding_low() +
+                    rhs_spatial_index[ki] *
+                        window.dimensions(ki).window_dilation();
+                // Skip if the lhs (input) index is to be dilated.
+                if (undilated_index % window.dimensions(ki).base_dilation() !=
+                    0) {
+                  goto cnt;
+                }
+
+                // Calculate the actual lhs (input) index after dilation.
+                lhs_index[spatial_dim] =
+                    undilated_index / window.dimensions(ki).base_dilation();
+
+                // Skip if input index is not in bound.
+                if (!(lhs_index[spatial_dim] >= 0 &&
+                      lhs_index[spatial_dim] <
+                          lhs->shape().dimensions(spatial_dim))) {
+                  goto cnt;
+                }
+
+                rhs_index[dnums.kernel_spatial_dimensions(ki)] =
+                    rhs_spatial_index[ki];
+              }
+
+              result_val += lhs_literal.Get<ReturnT>(lhs_index) *
+                            rhs_literal.Get<ReturnT>(rhs_index);
+            }
+          cnt:;
+          } while (IndexUtil::BumpIndices(window_shape, &rhs_spatial_index));
+
+          return result_val;
+        }));
+
+    parent_->evaluated_[conv] = std::move(result);
+    return Status::OK();
+  };
+
+  Status HandleDot(HloInstruction* dot, HloInstruction* lhs,
+                   HloInstruction* rhs) override {
+    CHECK(ShapeUtil::IsArray(dot->shape()));
+    CHECK(ShapeUtil::IsArray(lhs->shape()));
+    CHECK(ShapeUtil::IsArray(rhs->shape()));
+
+    // Dot only supports operands of rank 1 and 2.
+    const auto dot_rank = ShapeUtil::Rank(dot->shape());
+    const auto lhs_rank = ShapeUtil::Rank(lhs->shape());
+    const auto rhs_rank = ShapeUtil::Rank(rhs->shape());
+    CHECK(lhs_rank > 0 && lhs_rank <= 2);
+    CHECK(rhs_rank > 0 && rhs_rank <= 2);
+    CHECK_EQ(dot_rank, lhs_rank + rhs_rank - 2);
+
+    CHECK(ShapeUtil::SameElementType(lhs->shape(), rhs->shape()));
+    CHECK(ShapeUtil::SameElementType(lhs->shape(), dot->shape()));
+
+    // Check contracted dimensions are the same.
+    //
+    // Determine the index of the contracted dimensions for input tensors.
+    // dimensions -1 of lhs and dimension 0 of rhs are contracted.
+    const int64 lhs_contracted_dimension =
+        ShapeUtil::GetDimensionNumber(lhs->shape(), -1);
+    const int64 rhs_contracted_dimension = 0;
+    CHECK_EQ(lhs->shape().dimensions(lhs_contracted_dimension),
+             rhs->shape().dimensions(rhs_contracted_dimension))
+        << "lhs contracted dimension: "
+        << lhs->shape().dimensions(lhs_contracted_dimension)
+        << " rhs contracted dimension: "
+        << rhs->shape().dimensions(rhs_contracted_dimension);
+    const int64 contracted_dimension_size =
+        lhs->shape().dimensions(lhs_contracted_dimension);
+
+    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
+    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
+
+    auto result = Literal::CreateFromShape(dot->shape());
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+          ReturnT result_val = static_cast<ReturnT>(0);
+
+          std::vector<int64> lhs_index(lhs_rank, 0);
+          std::vector<int64> rhs_index(rhs_rank, 0);
+          // Set index for non-contracted dimension for lhs and rhs.
+          if (lhs_rank > 1) {
+            lhs_index[0] = multi_index[0];
+          }
+          if (rhs_rank > 1) {
+            rhs_index[1] = multi_index[multi_index.size() - 1];
+          }
+
+          // Accumulates resulting product along the contracted dimension.
+          for (int64 i = 0; i < contracted_dimension_size; ++i) {
+            lhs_index[lhs_contracted_dimension] = i;
+            rhs_index[rhs_contracted_dimension] = i;
+
+            result_val += lhs_literal.Get<ReturnT>(lhs_index) *
+                          rhs_literal.Get<ReturnT>(rhs_index);
+          }
+
+          return result_val;
+        }));
+
+    parent_->evaluated_[dot] = std::move(result);
+    return Status::OK();
+  };
+
+  Status HandlePad(HloInstruction* pad) override {
+    CHECK(!ShapeUtil::IsTuple(pad->operand(0)->shape()));
+    // Padding value must be scalar.
+    CHECK(ShapeUtil::IsScalar(pad->operand(1)->shape()));
+    CHECK_EQ(ShapeUtil::Rank(pad->operand(0)->shape()),
+             pad->padding_config().dimensions_size());
+
+    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
+                        ShapeInference::InferPadShape(
+                            /*operand_shape=*/pad->operand(0)->shape(),
+                            /*padding_value_shape=*/pad->operand(1)->shape(),
+                            /*padding_config=*/pad->padding_config()));
+    CHECK(ShapeUtil::Compatible(pad->shape(), inferred_return_shape))
+        << "return shape is set to: " << ShapeUtil::HumanString(pad->shape())
+        << "but is inferred to be: "
+        << ShapeUtil::HumanString(inferred_return_shape);
+
+    // Create new HLO of padded shape with padding value.
+    ReturnT scalar =
+        parent_->GetEvaluatedLiteralFor(pad->operand(1)).Get<ReturnT>({});
+    auto result = Literal::CreateFromShape(pad->shape());
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&scalar](tensorflow::gtl::ArraySlice<int64> multi_index) {
+          return scalar;
+        }));
+
+    auto evaluated_operand = parent_->GetEvaluatedLiteralFor(pad->operand(0));
+
+    std::vector<int64> input_index(ShapeUtil::Rank(evaluated_operand.shape()),
+                                   0);
+    std::vector<int64> target_index(ShapeUtil::Rank(result->shape()), 0);
+
+    // Loop through each element of the operand, assign them to the
+    // corresponding index of the resulting padded literal.
+    const PaddingConfig& pad_config = pad->padding_config();
+
+    auto func = [&](const std::vector<int64>& input_index) {
+      for (auto i = 0; i < input_index.size(); ++i) {
+        // Interior padding occurs logically before edge padding, so in the case
+        // of negative edge padding elements are removed from the
+        // interior-padded operand.
+        target_index[i] =
+            pad_config.dimensions(i).edge_padding_low() +
+            input_index[i] * (pad_config.dimensions(i).interior_padding() + 1);
+
+        // Account for negative low and high padding: skip assignment if the
+        // any target index is out of range.
+        if (!(target_index[i] >= 0 &&
+              target_index[i] < pad->shape().dimensions(i))) {
+          return true;
+        }
+      }
+      result->Set<ReturnT>(target_index,
+                           evaluated_operand.Get<ReturnT>(input_index));
+      return true;
+    };
+
+    std::vector<int64> zero_base(evaluated_operand.shape().dimensions_size(),
+                                 0);
+    std::vector<int64> step(evaluated_operand.shape().dimensions_size(), 1);
+
+    ShapeUtil::ForEachIndex(
+        evaluated_operand.shape(), zero_base,
+        AsInt64Slice(evaluated_operand.shape().dimensions()), step, func);
+
+    parent_->evaluated_[pad] = std::move(result);
+    return Status::OK();
+  };
+
   Status Preprocess(HloInstruction* hlo) override {
     VLOG(2) << hlo->ToString();
     return Status::OK();
@@ -446,12 +691,12 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
 
-    auto result = LiteralUtil::CreateFromShape(shape);
+    auto result = Literal::CreateFromShape(shape);
 
-    TF_RETURN_IF_ERROR(LiteralUtil::Populate<ReturnT>(
-        result.get(), [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
-          return binary_op(LiteralUtil::Get<ReturnT>(lhs_literal, multi_index),
-                           LiteralUtil::Get<ReturnT>(rhs_literal, multi_index));
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+          return binary_op(lhs_literal.Get<ReturnT>(multi_index),
+                           rhs_literal.Get<ReturnT>(multi_index));
         }));
     return std::move(result);
   }
@@ -483,14 +728,13 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
     const Literal& ehs_literal = parent_->GetEvaluatedLiteralFor(ehs);
 
-    auto result = LiteralUtil::CreateFromShape(shape);
+    auto result = Literal::CreateFromShape(shape);
 
-    TF_RETURN_IF_ERROR(LiteralUtil::Populate<ReturnT>(
-        result.get(), [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
-          return ternary_op(
-              LiteralUtil::Get<LhsType>(lhs_literal, multi_index),
-              LiteralUtil::Get<RhsType>(rhs_literal, multi_index),
-              LiteralUtil::Get<EhsType>(ehs_literal, multi_index));
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+          return ternary_op(lhs_literal.Get<LhsType>(multi_index),
+                            rhs_literal.Get<RhsType>(multi_index),
+                            ehs_literal.Get<EhsType>(multi_index));
         }));
 
     return std::move(result);
@@ -552,7 +796,7 @@ StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
     if (operand->opcode() == HloOpcode::kParameter) {
       const Literal* input_literal = arg_literals_[operand->parameter_number()];
       VLOG(2) << "Parameter operand evaluated to: "
-              << LiteralUtil::ToString(*input_literal);
+              << input_literal->ToString();
       TF_RET_CHECK(ShapeUtil::Equal(operand->shape(), input_literal->shape()));
 
       evaluated_[operand] = MakeUnique<Literal>(*input_literal);
@@ -589,8 +833,7 @@ std::unique_ptr<Literal> HloEvaluator::TryEvaluate(
 Status HloEvaluator::HandleParameter(HloInstruction* parameter) {
   VLOG(2) << "HandleParameter: " << parameter->ToString();
   const Literal* input_literal = arg_literals_[parameter->parameter_number()];
-  VLOG(2) << "Parameter evaluated to: "
-          << LiteralUtil::ToString(*input_literal);
+  VLOG(2) << "Parameter evaluated to: " << input_literal->ToString();
   DCHECK(ShapeUtil::Equal(parameter->shape(), input_literal->shape()));
 
   evaluated_[parameter] = MakeUnique<Literal>(*input_literal);
@@ -606,14 +849,14 @@ Status HloEvaluator::HandleConstant(HloInstruction* constant,
 Status HloEvaluator::HandleReshape(HloInstruction* reshape) {
   TF_ASSIGN_OR_RETURN(
       evaluated_[reshape],
-      LiteralUtil::Reshape(GetEvaluatedLiteralFor(reshape->operand(0)),
-                           AsInt64Slice(reshape->shape().dimensions())));
+      GetEvaluatedLiteralFor(reshape->operand(0))
+          .Reshape(AsInt64Slice(reshape->shape().dimensions())));
   return Status::OK();
 }
 
 Status HloEvaluator::HandleTranspose(HloInstruction* transpose) {
-  evaluated_[transpose] = LiteralUtil::Transpose(
-      GetEvaluatedLiteralFor(transpose->operand(0)), transpose->dimensions());
+  evaluated_[transpose] = GetEvaluatedLiteralFor(transpose->operand(0))
+                              .Transpose(transpose->dimensions());
   return Status::OK();
 }
 
@@ -641,16 +884,16 @@ Status HloEvaluator::HandleConcatenate(
         ShapeUtil::GetDimension(operand_shape, concat_dim);
   }
 
-  auto result_literal = LiteralUtil::CreateFromDimensions(
+  auto result_literal = Literal::CreateFromDimensions(
       reference_shape.element_type(), concat_dimensions);
   DimensionVector source_indices(rank, 0);
   DimensionVector dest_indices(concat_dimensions.size(), 0);
 
   for (auto operand : operands) {
     const Shape& operand_shape = operand->shape();
-    TF_RETURN_IF_ERROR(LiteralUtil::Copy(
-        GetEvaluatedLiteralFor(operand), source_indices, result_literal.get(),
-        dest_indices, AsInt64Slice(operand_shape.dimensions())));
+    TF_RETURN_IF_ERROR(result_literal->Copy(
+        GetEvaluatedLiteralFor(operand), source_indices, dest_indices,
+        AsInt64Slice(operand_shape.dimensions())));
     dest_indices[concat_dim] +=
         ShapeUtil::GetDimension(operand_shape, concat_dim);
   }
@@ -775,14 +1018,14 @@ Status HloEvaluator::HandleCompare(HloInstruction* compare, HloOpcode opcode,
 Status HloEvaluator::HandleSlice(HloInstruction* slice,
                                  HloInstruction* operand) {
   const Shape& shape = slice->shape();
-  auto literal = LiteralUtil::CreateFromDimensions(
+  auto literal = Literal::CreateFromDimensions(
       shape.element_type(), AsInt64Slice(shape.dimensions()));
 
   DimensionVector dest_indices(slice->slice_starts().size(), 0);
 
-  TF_RETURN_IF_ERROR(LiteralUtil::Copy(
-      GetEvaluatedLiteralFor(operand), slice->slice_starts(), literal.get(),
-      dest_indices, AsInt64Slice(shape.dimensions())));
+  TF_RETURN_IF_ERROR(literal->Copy(GetEvaluatedLiteralFor(operand),
+                                   slice->slice_starts(), dest_indices,
+                                   AsInt64Slice(shape.dimensions())));
 
   evaluated_[slice] = std::move(literal);
   return Status::OK();
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index 91fd56f54c592b8bbe68f6b38e761e1f10a20c8b..976a2325ea970f570748a6872d7bf2459f8ffa4a 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -92,7 +92,8 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
     return hlo->Visit(typed_visitors_.at(hlo->shape().element_type()).get());
   }
 
-  // Operations that are type-agnostic.
+  // Operations that are type-agnostic or always return a specific type, such as
+  // HandleIsFinite where boolean is always returned.
   //
   Status HandleParameter(HloInstruction* parameter) override;
 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index b26ece28b756097b06b4a04d4873775e13760014..7269fbeffc51c39af43f2cfd8e5468da54f12855 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -14,27 +14,33 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 
+#include <initializer_list>
 #include <memory>
 #include <string>
+#include <tuple>
 #include <utility>
 #include <vector>
 
+#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 namespace {
 
-class HloEvaluatorTest : public ::testing::Test {
+class HloEvaluatorTest : public HloTestBase {
  protected:
   HloEvaluatorTest() { evaluator_ = MakeUnique<HloEvaluator>(); }
 
@@ -44,9 +50,9 @@ class HloEvaluatorTest : public ::testing::Test {
 // Verifies that HloEvaluator evaluates a HLO instruction that performs clamp
 // with 3 operands.
 TEST_F(HloEvaluatorTest, DoesClamp) {
-  auto low = LiteralUtil::CreateR2<float>({{0.f, 2.f}, {2.f, 4.f}});
-  auto high = LiteralUtil::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
-  auto value = LiteralUtil::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
+  auto low = Literal::CreateR2<float>({{0.f, 2.f}, {2.f, 4.f}});
+  auto high = Literal::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
+  auto value = Literal::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
 
   Shape shape = low->shape();
   auto c1 = HloInstruction::CreateConstant(std::move(low));
@@ -58,17 +64,17 @@ TEST_F(HloEvaluatorTest, DoesClamp) {
   std::unique_ptr<Literal> result =
       evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
-  auto expected = LiteralUtil::CreateR2<float>({{0, 4}, {2, 4}});
+  auto expected = Literal::CreateR2<float>({{0, 4}, {2, 4}});
 
-  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+  LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs select
 // with 3 operands.
 TEST_F(HloEvaluatorTest, DoesSelect) {
-  auto pred = LiteralUtil::CreateR2<bool>({{true, false}, {false, true}});
-  auto on_true = LiteralUtil::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
-  auto on_false = LiteralUtil::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
+  auto pred = Literal::CreateR2<bool>({{true, false}, {false, true}});
+  auto on_true = Literal::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
+  auto on_false = Literal::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
 
   Shape shape = on_true->shape();
   auto c1 = HloInstruction::CreateConstant(std::move(pred));
@@ -80,16 +86,16 @@ TEST_F(HloEvaluatorTest, DoesSelect) {
   std::unique_ptr<Literal> result =
       evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
-  auto expected = LiteralUtil::CreateR2<float>({{2, 5}, {0, 4}});
+  auto expected = Literal::CreateR2<float>({{2, 5}, {0, 4}});
 
-  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+  LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise addition with 2 operands.
 TEST_F(HloEvaluatorTest, DoesAdd) {
-  auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
-  auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
+  auto lhs = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
+  auto rhs = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
 
   Shape shape = ShapeUtil::MakeShape(S64, {2, 2});
   auto c1 = HloInstruction::CreateConstant(std::move(lhs));
@@ -100,16 +106,16 @@ TEST_F(HloEvaluatorTest, DoesAdd) {
   std::unique_ptr<Literal> result =
       evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
-  auto expected = LiteralUtil::CreateR2<int64>({{3, 4}, {-96, 8}});
+  auto expected = Literal::CreateR2<int64>({{3, 4}, {-96, 8}});
 
-  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+  LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise divide with 2 operands.
 TEST_F(HloEvaluatorTest, DoesDivide) {
-  auto lhs_s64 = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
-  auto rhs_s64 = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
+  auto lhs_s64 = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
+  auto rhs_s64 = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
 
   Shape shape_s64 = ShapeUtil::MakeShape(S64, {2, 2});
   auto c1_s64 = HloInstruction::CreateConstant(std::move(lhs_s64));
@@ -120,12 +126,12 @@ TEST_F(HloEvaluatorTest, DoesDivide) {
   std::unique_ptr<Literal> result =
       evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
-  auto expected = LiteralUtil::CreateR2<int64>({{0, 0}, {-25, 1}});
+  auto expected = Literal::CreateR2<int64>({{0, 0}, {-25, 1}});
 
-  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+  LiteralTestUtil::ExpectEqual(*expected, *result);
 
-  auto lhs_f64 = LiteralUtil::CreateR2<double>({{1.0, 0.0}, {-100.0, 4.0}});
-  auto rhs_f64 = LiteralUtil::CreateR2<double>({{2.2, 4.0}, {4.0, 4.0}});
+  auto lhs_f64 = Literal::CreateR2<double>({{1.0, 0.0}, {-100.0, 4.0}});
+  auto rhs_f64 = Literal::CreateR2<double>({{2.2, 4.0}, {4.0, 4.0}});
 
   Shape shape_f64 = ShapeUtil::MakeShape(F64, {2, 2});
   auto c1_f64 = HloInstruction::CreateConstant(std::move(lhs_f64));
@@ -135,16 +141,15 @@ TEST_F(HloEvaluatorTest, DoesDivide) {
 
   result = evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
-  expected =
-      LiteralUtil::CreateR2<double>({{0.45454545454545453, 0}, {-25, 1}});
+  expected = Literal::CreateR2<double>({{0.45454545454545453, 0}, {-25, 1}});
 
-  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+  LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise abs op with 1 operand.
 TEST_F(HloEvaluatorTest, DoesAbs) {
-  auto operand = LiteralUtil::CreateR2<int64>({{1, -20}, {-100, 4}});
+  auto operand = Literal::CreateR2<int64>({{1, -20}, {-100, 4}});
   const Shape& shape = ShapeUtil::MakeShape(S64, {2, 2});
   auto c1 = HloInstruction::CreateConstant(std::move(operand));
   auto instruction =
@@ -153,42 +158,40 @@ TEST_F(HloEvaluatorTest, DoesAbs) {
   std::unique_ptr<Literal> result =
       evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
-  auto expected = LiteralUtil::CreateR2<int64>({{1, 20}, {100, 4}});
+  auto expected = Literal::CreateR2<int64>({{1, 20}, {100, 4}});
 
-  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+  LiteralTestUtil::ExpectEqual(*expected, *result);
 
   // For R0 literal.
   const Shape& r0 = ShapeUtil::MakeShape(F32, {});
-  operand = LiteralUtil::CreateR0<float>(-1.0f);
+  operand = Literal::CreateR0<float>(-1.0f);
   c1 = HloInstruction::CreateConstant(std::move(operand));
   instruction = HloInstruction::CreateUnary(r0, HloOpcode::kAbs, c1.get());
   result = evaluator_->Evaluate(instruction.get()).ConsumeValueOrDie();
-  expected = LiteralUtil::CreateR0<float>(1.0f);
+  expected = Literal::CreateR0<float>(1.0f);
 
-  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+  LiteralTestUtil::ExpectEqual(*expected, *result);
 
   // For R1 literal with dimension of size 0.
   Shape empty_r1 = ShapeUtil::MakeShape(F32, {0});
-  operand = LiteralUtil::CreateR1<float>({});
+  operand = Literal::CreateR1<float>({});
   c1 = HloInstruction::CreateConstant(std::move(operand));
   instruction =
       HloInstruction::CreateUnary(empty_r1, HloOpcode::kAbs, c1.get());
 
   result = evaluator_->Evaluate(instruction.get()).ConsumeValueOrDie();
-  expected = LiteralUtil::CreateR1<float>({});
+  expected = Literal::CreateR1<float>({});
 
-  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+  LiteralTestUtil::ExpectEqual(*expected, *result);
 }  // namespace
 
 // Verifies that HloEvaluator evaluates a HLO Computation with non-parameter nor
 // constant operands.
-TEST_F(HloEvaluatorTest, DoesTraveseInstructions) {
-  HloComputation::Builder builder(
-      ::testing::UnitTest::GetInstance()->current_test_info()->name());
-
-  auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
-  auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
-  auto rhs2 = LiteralUtil::CreateR2<int64>({{1, -20}, {-100, 4}});
+TEST_F(HloEvaluatorTest, DoesTraverseInstructions) {
+  HloComputation::Builder builder(TestName());
+  auto lhs = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
+  auto rhs = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
+  auto rhs2 = Literal::CreateR2<int64>({{1, -20}, {-100, 4}});
   std::vector<const Literal*> args = {lhs.get(), rhs.get(), rhs2.get()};
 
   Shape shape = ShapeUtil::MakeShape(S64, {2, 2});
@@ -206,21 +209,19 @@ TEST_F(HloEvaluatorTest, DoesTraveseInstructions) {
   std::unique_ptr<Literal> result =
       evaluator_->Evaluate(builder.Build().get(), args).ConsumeValueOrDie();
 
-  auto expected = LiteralUtil::CreateR2<int64>({{4, -16}, {-196, 12}});
+  auto expected = Literal::CreateR2<int64>({{4, -16}, {-196, 12}});
 
-  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+  LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
 // Verifies Reshape operation is correctly evaluated.
 TEST_F(HloEvaluatorTest, DoesReshape) {
-  HloComputation::Builder builder(
-      ::testing::UnitTest::GetInstance()->current_test_info()->name());
-
+  HloComputation::Builder builder(TestName());
   const int64 dimensions[] = {11, 8, 7, 5, 9};
-  TF_ASSIGN_OR_ASSERT_OK(auto literal,
-                         LiteralTestUtil::CreateRandomLiteral<F32>(
-                             ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0));
-  auto literal_clone = LiteralUtil::CloneToUnique(*literal);
+  TF_ASSERT_OK_AND_ASSIGN(auto literal,
+                          LiteralTestUtil::CreateRandomLiteral<F32>(
+                              ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0));
+  auto literal_clone = literal->CloneToUnique();
   HloInstruction* literal_instruction = builder.AddInstruction(
       HloInstruction::CreateConstant(std::move(literal)));
 
@@ -233,13 +234,717 @@ TEST_F(HloEvaluatorTest, DoesReshape) {
       evaluator_->Evaluate(builder.Build().get(), {}).ConsumeValueOrDie();
 
   using NativeT = typename primitive_util::PrimitiveTypeToNative<F32>::type;
-  LiteralUtil::EachCell<NativeT>(
-      *result, [&](tensorflow::gtl::ArraySlice<int64> indices, NativeT value) {
+  result->EachCell<NativeT>(
+      [&](tensorflow::gtl::ArraySlice<int64> indices, NativeT value) {
         std::vector<int64> rindexes = Permute(permutation, indices);
-        EXPECT_TRUE(value ==
-                    LiteralUtil::Get<NativeT>(*literal_clone, rindexes));
+        EXPECT_TRUE(value == literal_clone->Get<NativeT>(rindexes));
       });
 }
 
+// Verifies Broadcast operation is correctly evaluated.
+TEST_F(HloEvaluatorTest, DoesBroadcast) {
+  HloComputation::Builder builder(TestName());
+  auto input_literal = Literal::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}});
+  auto output_literal = Literal::CreateR3<int32>(
+      {{{1, 2}, {3, 4}, {5, 6}}, {{1, 2}, {3, 4}, {5, 6}}});
+  HloInstruction* literal_instruction = builder.AddInstruction(
+      HloInstruction::CreateConstant(std::move(input_literal)));
+
+  builder.AddInstruction(HloInstruction::CreateBroadcast(
+      output_literal->shape(), literal_instruction, {1, 2}));
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(builder.Build().get(), {}).ConsumeValueOrDie();
+
+  LiteralTestUtil::ExpectEqual(*result, *output_literal);
+}
+
+TEST_F(HloEvaluatorTest, ConvertWithSameLayout) {
+  HloComputation::Builder builder(TestName());
+
+  auto input_literal = Literal::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}});
+  auto expected =
+      Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}});
+  ASSERT_TRUE(LayoutUtil::LayoutsInShapesEqual(input_literal->shape(),
+                                               expected->shape()));
+
+  HloInstruction* constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(std::move(input_literal)));
+  builder.AddInstruction(
+      HloInstruction::CreateConvert(expected->shape(), constant));
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(builder.Build().get(), {}).ConsumeValueOrDie();
+
+  LiteralTestUtil::ExpectEqual(*result, *expected);
+}
+
+TEST_F(HloEvaluatorTest, ConvertWithDifferentLayout) {
+  HloComputation::Builder builder(TestName());
+
+  auto input_literal = Literal::CreateR2WithLayout<int32>(
+      {{1, 2}, {3, 4}, {5, 6}}, LayoutUtil::MakeLayout({0, 1}));
+  auto expected = Literal::CreateR2WithLayout<float>(
+      {{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}, LayoutUtil::MakeLayout({1, 0}));
+  ASSERT_FALSE(LayoutUtil::LayoutsInShapesEqual(input_literal->shape(),
+                                                expected->shape()));
+
+  HloInstruction* constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(std::move(input_literal)));
+  builder.AddInstruction(
+      HloInstruction::CreateConvert(expected->shape(), constant));
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(builder.Build().get(), {}).ConsumeValueOrDie();
+
+  LiteralTestUtil::ExpectEqual(*result, *expected);
+}
+
+PaddingConfig CreatePaddingConfig(
+    std::initializer_list<std::array<int64, 3>> padding_dimensions) {
+  PaddingConfig padding_config;
+
+  for (auto& paddings_per_dim : padding_dimensions) {
+    auto dimension = padding_config.add_dimensions();
+    dimension->set_edge_padding_low(paddings_per_dim[0]);
+    dimension->set_edge_padding_high(paddings_per_dim[1]);
+    dimension->set_interior_padding(paddings_per_dim[2]);
+  }
+  return padding_config;
+}
+
+TEST_F(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
+  auto operand = Literal::CreateR2<int32>({{}, {}});
+  auto operand_instruction = HloInstruction::CreateConstant(std::move(operand));
+
+  constexpr int32 kPadValue = 10;
+  auto pad_value = Literal::CreateR0<int32>(kPadValue);
+  auto padding_value_instruction =
+      HloInstruction::CreateConstant(std::move(pad_value));
+
+  auto padding_config = CreatePaddingConfig({{{1, 0, 2}}, {{0, 2, 1}}});
+  Shape shape = ShapeUtil::MakeShape(S32, {5, 2});
+  auto pad_instruction = HloInstruction::CreatePad(
+      shape, operand_instruction.get(), padding_value_instruction.get(),
+      padding_config);
+
+  auto result = evaluator_->Evaluate(pad_instruction.get()).ConsumeValueOrDie();
+
+  auto expected = Literal::CreateR2<int32>(
+      {{10, 10}, {10, 10}, {10, 10}, {10, 10}, {10, 10}});
+
+  LiteralTestUtil::ExpectEqual(*expected, *result);
+}
+
+TEST_F(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
+  HloComputation::Builder b(TestName());
+
+  Array4D<float> input_array(3, 2, 1, 1, {1, 2, 3, 4, 5, 6});
+  auto input = Literal::CreateR4FromArray4D<float>(input_array);
+  HloInstruction* input_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(input)));
+  constexpr float kPadValue = 1.5;
+  auto pad_value = Literal::CreateR0<float>(kPadValue);
+  HloInstruction* pad_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(pad_value)));
+
+  Shape shape = ShapeUtil::MakeShape(F32, {8, 5, 1, 1});
+  auto r4_padding_on_dim0_dim1 =
+      CreatePaddingConfig({{{1, 0, 2}}, {{0, 2, 1}}, {{0, 0, 0}}, {{0, 0, 0}}});
+  b.AddInstruction(HloInstruction::CreatePad(
+      shape, input_instruction, pad_instruction, r4_padding_on_dim0_dim1));
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(b.Build().get(), {}).ConsumeValueOrDie();
+
+  auto expected_array = MakeUnique<Array4D<float>>(8, 5, 1, 1);
+  expected_array->Fill(kPadValue);
+  (*expected_array)(1, 0, 0, 0) = 1.0f;
+  (*expected_array)(1, 2, 0, 0) = 2.0f;
+  (*expected_array)(4, 0, 0, 0) = 3.0f;
+  (*expected_array)(4, 2, 0, 0) = 4.0f;
+  (*expected_array)(7, 0, 0, 0) = 5.0f;
+  (*expected_array)(7, 2, 0, 0) = 6.0f;
+
+  auto expected = Literal::CreateR4FromArray4D<float>(*expected_array);
+
+  LiteralTestUtil::ExpectEqual(*expected, *result);
+}
+
+TEST_F(HloEvaluatorTest, NegativePadding2D) {
+  HloComputation::Builder b(TestName());
+
+  // input_array:
+  // f32[4,3] {
+  //  { 1, 2, 3 },
+  //  { 5, 6, 7 },
+  //  { 9, 10, 11 },
+  //  { 13, 14, 15 },
+  // }
+  auto input_array = MakeUnique<Array2D<float>>(4, 3);
+  input_array->FillUnique(1.0f);
+  auto input = Literal::CreateR2FromArray2D<float>(*input_array);
+  HloInstruction* input_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(input)));
+
+  auto pad_value_instruction = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.718f)));
+
+  auto r2_padding_on_dim0_dim1 =
+      CreatePaddingConfig({{{-1, -2, 0}}, {{-2, 4, 0}}});
+  Shape shape = ShapeUtil::MakeShape(F32, {1, 5});
+  b.AddInstruction(HloInstruction::CreatePad(shape, input_instruction,
+                                             pad_value_instruction,
+                                             r2_padding_on_dim0_dim1));
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(b.Build().get(), {}).ConsumeValueOrDie();
+
+  // f32[1,5] { 7.0, 2.718, 2.718, 2.718, 2.718 }
+  auto expected_array = MakeUnique<Array2D<float>>(1, 5);
+  (*expected_array)(0, 0) = 7.0f;
+  (*expected_array)(0, 1) = 2.718f;
+  (*expected_array)(0, 2) = 2.718f;
+  (*expected_array)(0, 3) = 2.718f;
+  (*expected_array)(0, 4) = 2.718f;
+  auto expected = Literal::CreateR2FromArray2D<float>(*expected_array);
+
+  LiteralTestUtil::ExpectEqual(*expected, *result);
+}
+
+TEST_F(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
+  HloComputation::Builder b(TestName());
+
+  // f32[4,3] {
+  //  { 1, 2, 3 },
+  //  { 5, 6, 7 },
+  //  { 9, 10, 11 },
+  //  { 13, 14, 15 },
+  // }
+  auto input_array = MakeUnique<Array2D<float>>(4, 3);
+  input_array->FillUnique(1.0f);
+  auto input = Literal::CreateR2FromArray2D<float>(*input_array);
+  HloInstruction* input_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(input)));
+
+  auto pad_value_instruction = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.718f)));
+
+  PaddingConfig padding_config = MakeNoPaddingConfig(2);
+
+  // Negative padding that results in zero dimensions.
+  auto r2_padding_on_dim0_dim1 =
+      CreatePaddingConfig({{{-2, -5, 1}}, {{-2, 4, 2}}});
+
+  Shape shape = ShapeUtil::MakeShape(F32, {0, 9});
+  b.AddInstruction(HloInstruction::CreatePad(shape, input_instruction,
+                                             pad_value_instruction,
+                                             r2_padding_on_dim0_dim1));
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(b.Build().get(), {}).ConsumeValueOrDie();
+
+  auto expected_array = MakeUnique<Array2D<float>>(0, 9);
+  auto expected = Literal::CreateR2FromArray2D<float>(*expected_array);
+
+  LiteralTestUtil::ExpectEqual(*expected, *result);
+}
+
+TEST_F(HloEvaluatorTest, DotRank2AndRank1) {
+  HloComputation::Builder b(TestName());
+
+  // lhs:
+  // f32[4,1] {
+  //  { 1 },
+  //  { 2 },
+  //  { 3 },
+  //  { 4 },
+  // }
+  auto lhs_array = MakeUnique<Array2D<float>>(4, 1);
+  lhs_array->FillUnique(1.0f);
+  auto lhs_literal = Literal::CreateR2FromArray2D<float>(*lhs_array);
+  HloInstruction* lhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal)));
+
+  // rhs:
+  // f32[2] { 1, 2 },
+  auto rhs_literal = Literal::CreateR2<float>({{1, 2}});
+  HloInstruction* rhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
+
+  Shape shape = ShapeUtil::MakeShape(F32, {4, 2});
+  b.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot, lhs_instruction, rhs_instruction));
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(b.Build().get(), {}).ConsumeValueOrDie();
+
+  // clang-format off
+  auto expected_array = Array2D<float>({
+      {1.f, 2.f},
+      {2.f, 4.f},
+      {3.f, 6.f},
+      {4.f, 8.f},
+  });
+  // clang-format on
+  auto expected = Literal::CreateR2FromArray2D<float>(expected_array);
+
+  LiteralTestUtil::ExpectEqual(*expected, *result);
+}
+
+TEST_F(HloEvaluatorTest, DotRank1AndRank2) {
+  HloComputation::Builder b(TestName());
+
+  // lhs:
+  // f32[3]
+  //  { 1, 2, 3 },
+  auto lhs_literal = Literal::CreateR1<float>({1, 2, 3});
+  HloInstruction* lhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal)));
+
+  // rhs:
+  // f32[3,2] {
+  //  { 1, 2 },
+  //  { 3, 4 },
+  //  { 5, 6 },
+  // }
+  auto rhs_array = MakeUnique<Array2D<float>>(3, 2);
+  rhs_array->FillUnique(1.0f);
+  auto rhs_literal = Literal::CreateR2FromArray2D<float>(*rhs_array);
+  HloInstruction* rhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
+
+  Shape shape = ShapeUtil::MakeShape(F32, {2});
+  b.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot, lhs_instruction, rhs_instruction));
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(b.Build().get(), {}).ConsumeValueOrDie();
+
+  auto expected = Literal::CreateR1<float>({22.f, 28.f});
+
+  LiteralTestUtil::ExpectEqual(*expected, *result);
+}
+
+TEST_F(HloEvaluatorTest, DotRank2AndRank2) {
+  HloComputation::Builder b(TestName());
+
+  // lhs:
+  // f32[4,3] {
+  //  { 1, 2, 3 },
+  //  { 5, 6, 7 },
+  //  { 9, 10, 11 },
+  //  { 13, 14, 15 },
+  // }
+  auto lhs_array = MakeUnique<Array2D<float>>(4, 3);
+  lhs_array->FillUnique(1.0f);
+  auto lhs_literal = Literal::CreateR2FromArray2D<float>(*lhs_array);
+  HloInstruction* lhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal)));
+
+  // rhs:
+  // f32[3,2] {
+  //  { 1, 2 },
+  //  { 3, 4 },
+  //  { 5, 6 },
+  // }
+  auto rhs_array = MakeUnique<Array2D<float>>(3, 2);
+  rhs_array->FillUnique(1.0f);
+  auto rhs_literal = Literal::CreateR2FromArray2D<float>(*rhs_array);
+  HloInstruction* rhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
+
+  Shape shape = ShapeUtil::MakeShape(F32, {4, 2});
+  b.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot, lhs_instruction, rhs_instruction));
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(b.Build().get(), {}).ConsumeValueOrDie();
+
+  auto expected_array = Array2D<float>({
+      {22.f, 28.f}, {58.f, 76.f}, {94.f, 124.f}, {130.f, 172.f},
+  });
+  auto expected = Literal::CreateR2FromArray2D<float>(expected_array);
+
+  LiteralTestUtil::ExpectEqual(*expected, *result);
+}
+
+TEST_F(HloEvaluatorTest, SimpleConv1D) {
+  HloComputation::Builder b(TestName());
+
+  Array3D<float> lhs_array = {{{1, 2, 3}}};
+  auto lhs_literal = Literal::CreateR3FromArray3D<float>(lhs_array);
+  HloInstruction* lhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal)));
+
+  Array3D<float> rhs_array = {{{3.f, 4.f}}};
+  auto rhs_literal = Literal::CreateR3FromArray3D<float>(rhs_array);
+  HloInstruction* rhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
+
+  Window window;
+  WindowDimension dim;
+  dim.set_size(2);
+  dim.set_stride(1);
+  dim.set_padding_low(0);
+  dim.set_padding_high(1);
+  dim.set_window_dilation(1);
+  dim.set_base_dilation(1);
+  *window.add_dimensions() = dim;
+
+  ConvolutionDimensionNumbers dnums;
+  dnums.set_batch_dimension(0);
+  dnums.set_feature_dimension(1);
+  dnums.add_spatial_dimensions(2);
+
+  dnums.set_kernel_output_feature_dimension(0);
+  dnums.set_kernel_input_feature_dimension(1);
+  dnums.add_kernel_spatial_dimensions(2);
+
+  const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 3});
+  b.AddInstruction(HloInstruction::CreateConvolve(
+      shape, lhs_instruction, rhs_instruction, window, dnums));
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(b.Build().get(), {}).ConsumeValueOrDie();
+
+  Array3D<float> expected_array = {{{11.f, 18.f, 9.f}}};
+  auto expected = Literal::CreateR3FromArray3D<float>(expected_array);
+
+  LiteralTestUtil::ExpectEqual(*expected, *result);
+}
+
+TEST_F(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
+  HloComputation::Builder b(TestName());
+
+  Array4D<float> lhs_array(1, 1, 4, 4);
+  // clang-format off
+  lhs_array.FillWithYX(Array2D<float>({
+    {1,  2,  3,  4 },
+    {5,  6,  7,  8 },
+    {9,  10, 11, 12},
+    {13, 14, 15, 16},
+  }));
+  // clang-format on
+  auto lhs_literal = Literal::CreateR4FromArray4D<float>(lhs_array);
+  HloInstruction* lhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal)));
+
+  Array4D<float> rhs_array(1, 1, 2, 2);
+  // clang-format off
+  rhs_array.FillWithYX(Array2D<float>({
+    {5, 6},
+    {7, 8},
+  }));
+  // clang-format on
+  auto rhs_literal = Literal::CreateR4FromArray4D<float>(rhs_array);
+  HloInstruction* rhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
+
+  Window window;
+  WindowDimension dim;
+  dim.set_size(2);
+  dim.set_stride(1);
+  dim.set_padding_low(0);
+  dim.set_padding_high(1);
+  dim.set_window_dilation(1);
+  dim.set_base_dilation(1);
+  *window.add_dimensions() = dim;
+  *window.add_dimensions() = dim;
+
+  ConvolutionDimensionNumbers dnums =
+      ComputationBuilder::CreateDefaultConvDimensionNumbers(2);
+
+  const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 4, 4});
+  b.AddInstruction(HloInstruction::CreateConvolve(
+      shape, lhs_instruction, rhs_instruction, window, dnums));
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(b.Build().get(), {}).ConsumeValueOrDie();
+
+  Array4D<float> expected_array(1, 1, 4, 4);
+  // clang-format off
+  expected_array.FillWithYX(Array2D<float>({
+    {100, 126, 152,  76},
+    {204, 230, 256, 124},
+    {308, 334, 360, 172},
+    {149, 160, 171,  80},
+  }));
+  // clang-format on
+  auto expected = Literal::CreateR4FromArray4D<float>(expected_array);
+
+  LiteralTestUtil::ExpectEqual(*expected, *result);
+}
+
+TEST_F(HloEvaluatorTest, Conv2DGeneralDimensions) {
+  HloComputation::Builder b(TestName());
+
+  // clang-format off
+  // Input dimensions: [feature=2, height=3, batch=1, width=4]
+  Array4D<float> input({
+    {{{1, 2, 3, 4}},
+     {{5, 6, 7, 8}},
+     {{9, 10, 11, 12}}},
+    {{{13, 14, 15, 16}},
+     {{17, 18, 19, 20}},
+     {{21, 22, 23, 24}}}
+  });
+  // Weight dimensions:
+  // [kernel_output_feature=1, width=3, kernel_input_feature=2, height=3]
+  Array4D<float> weight({{
+    {{1, 7, 13},
+     {4, 10, 16}},
+    {{2, 8, 14},
+     {5, 11, 17}},
+    {{3, 9, 15},
+     {6, 12, 18}}
+  }});
+  // clang-format on
+
+  auto lhs_literal = Literal::CreateR4FromArray4D<float>(input);
+  HloInstruction* lhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal)));
+
+  auto rhs_literal = Literal::CreateR4FromArray4D<float>(weight);
+  HloInstruction* rhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
+
+  Window window;
+  WindowDimension dim;
+  dim.set_size(3);
+  dim.set_stride(1);
+  dim.set_padding_low(0);
+  dim.set_padding_high(0);
+  dim.set_window_dilation(1);
+  dim.set_base_dilation(1);
+  *window.add_dimensions() = dim;
+  *window.add_dimensions() = dim;
+
+  ConvolutionDimensionNumbers dnums;
+  dnums.set_batch_dimension(2);
+  dnums.set_feature_dimension(0);
+  dnums.add_spatial_dimensions(1);
+  dnums.add_spatial_dimensions(3);
+
+  dnums.set_kernel_output_feature_dimension(0);
+  dnums.set_kernel_input_feature_dimension(2);
+  dnums.add_kernel_spatial_dimensions(3);
+  dnums.add_kernel_spatial_dimensions(1);
+
+  const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
+  b.AddInstruction(HloInstruction::CreateConvolve(
+      shape, lhs_instruction, rhs_instruction, window, dnums));
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(b.Build().get(), {}).ConsumeValueOrDie();
+
+  // clang-format off
+  // Result dimensions: [feature=1, height=1, batch=1, width=2]
+  Array4D<float> expected_array({{{{2514, 2685}}}});
+  // clang-format on
+  auto expected = Literal::CreateR4FromArray4D<float>(expected_array);
+
+  LiteralTestUtil::ExpectEqual(*expected, *result);
+}
+
+TEST_F(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
+  HloComputation::Builder b(TestName());
+
+  Array4D<float> lhs_array(1, 1, 4, 4);
+  // clang-format off
+  lhs_array.FillWithYX(Array2D<float>({
+    {1,  2,  3,  4 },
+    {5,  6,  7,  8 },
+    {9,  10, 11, 12},
+    {13, 14, 15, 16},
+  }));
+  // clang-format on
+  auto lhs_literal = Literal::CreateR4FromArray4D<float>(lhs_array);
+  HloInstruction* lhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal)));
+
+  Array4D<float> rhs_array(1, 1, 2, 2);
+  // clang-format off
+  rhs_array.FillWithYX(Array2D<float>({
+    {5, 6},
+    {7, 8},
+  }));
+  // clang-format on
+  auto rhs_literal = Literal::CreateR4FromArray4D<float>(rhs_array);
+  HloInstruction* rhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
+
+  Window window;
+  WindowDimension dim;
+  dim.set_size(2);
+  dim.set_stride(1);
+  dim.set_padding_low(0);
+  dim.set_padding_high(1);
+  dim.set_window_dilation(1);
+  dim.set_base_dilation(2);
+  *window.add_dimensions() = dim;
+  *window.add_dimensions() = dim;
+
+  ConvolutionDimensionNumbers dnums =
+      ComputationBuilder::CreateDefaultConvDimensionNumbers(2);
+
+  const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 7, 7});
+  b.AddInstruction(HloInstruction::CreateConvolve(
+      shape, lhs_instruction, rhs_instruction, window, dnums));
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(b.Build().get(), {}).ConsumeValueOrDie();
+
+  Array4D<float> expected_array(1, 1, 7, 7);
+  expected_array.FillWithYX(Array2D<float>({
+      {5, 12, 10, 18, 15, 24, 20},
+      {35, 48, 42, 56, 49, 64, 56},
+      {25, 36, 30, 42, 35, 48, 40},
+      {63, 80, 70, 88, 77, 96, 84},
+      {45, 60, 50, 66, 55, 72, 60},
+      {91, 112, 98, 120, 105, 128, 112},
+      {65, 84, 70, 90, 75, 96, 80},
+  }));
+  auto expected = Literal::CreateR4FromArray4D<float>(expected_array);
+
+  LiteralTestUtil::ExpectEqual(*expected, *result);
+}
+
+TEST_F(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
+  HloComputation::Builder b(TestName());
+
+  Array4D<float> lhs_array(1, 1, 4, 4);
+  // clang-format off
+  lhs_array.FillWithYX(Array2D<float>({
+    {1,  2,  3,  4 },
+    {5,  6,  7,  8 },
+    {9,  10, 11, 12},
+    {13, 14, 15, 16},
+  }));
+  // clang-format on
+  auto lhs_literal = Literal::CreateR4FromArray4D<float>(lhs_array);
+  HloInstruction* lhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal)));
+
+  Array4D<float> rhs_array(1, 1, 2, 2);
+  // clang-format off
+  rhs_array.FillWithYX(Array2D<float>({
+    {5, 6},
+    {7, 8},
+  }));
+  // clang-format on
+  auto rhs_literal = Literal::CreateR4FromArray4D<float>(rhs_array);
+  HloInstruction* rhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
+
+  Window window;
+  WindowDimension dim;
+  dim.set_size(2);
+  dim.set_stride(1);
+  dim.set_padding_low(1);
+  dim.set_padding_high(1);
+  dim.set_window_dilation(1);
+  dim.set_base_dilation(2);
+  *window.add_dimensions() = dim;
+  *window.add_dimensions() = dim;
+
+  ConvolutionDimensionNumbers dnums =
+      ComputationBuilder::CreateDefaultConvDimensionNumbers(2);
+
+  const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 8, 8});
+  b.AddInstruction(HloInstruction::CreateConvolve(
+      shape, lhs_instruction, rhs_instruction, window, dnums));
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(b.Build().get(), {}).ConsumeValueOrDie();
+
+  Array4D<float> expected_array(1, 1, 8, 8);
+  expected_array.FillWithYX(Array2D<float>({
+      {8, 7, 16, 14, 24, 21, 32, 28},
+      {6, 5, 12, 10, 18, 15, 24, 20},
+      {40, 35, 48, 42, 56, 49, 64, 56},
+      {30, 25, 36, 30, 42, 35, 48, 40},
+      {72, 63, 80, 70, 88, 77, 96, 84},
+      {54, 45, 60, 50, 66, 55, 72, 60},
+      {104, 91, 112, 98, 120, 105, 128, 112},
+      {78, 65, 84, 70, 90, 75, 96, 80},
+  }));
+  auto expected = Literal::CreateR4FromArray4D<float>(expected_array);
+
+  LiteralTestUtil::ExpectEqual(*expected, *result);
+}
+
+TEST_F(HloEvaluatorTest,
+       DilatedWindowAndBaseConv2DWithDifferentLowAndHighPaddingAndStrides) {
+  HloComputation::Builder b(TestName());
+
+  Array4D<float> lhs_array(1, 1, 4, 4);
+  // clang-format off
+  lhs_array.FillWithYX(Array2D<float>({
+    {1,  2,  3,  4 },
+    {5,  6,  7,  8 },
+    {9,  10, 11, 12},
+    {13, 14, 15, 16},
+  }));
+  // clang-format on
+  auto lhs_literal = Literal::CreateR4FromArray4D<float>(lhs_array);
+  HloInstruction* lhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal)));
+
+  Array4D<float> rhs_array(1, 1, 2, 3);
+  // clang-format off
+  rhs_array.FillWithYX(Array2D<float>({
+    {5, 6, 7},
+    {8, 9, 10},
+  }));
+  // clang-format on
+  auto rhs_literal = Literal::CreateR4FromArray4D<float>(rhs_array);
+  HloInstruction* rhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
+
+  Window window;
+  WindowDimension dim;
+  dim.set_size(2);
+  dim.set_stride(1);
+  dim.set_padding_low(2);
+  dim.set_padding_high(2);
+  dim.set_window_dilation(2);
+  dim.set_base_dilation(2);
+  *window.add_dimensions() = dim;
+  dim.set_size(3);
+  dim.set_stride(3);
+  dim.set_padding_low(2);
+  dim.set_padding_high(-1);
+  dim.set_window_dilation(1);
+  dim.set_base_dilation(3);
+  *window.add_dimensions() = dim;
+
+  ConvolutionDimensionNumbers dnums =
+      ComputationBuilder::CreateDefaultConvDimensionNumbers(2);
+
+  const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 9, 3});
+  b.AddInstruction(HloInstruction::CreateConvolve(
+      shape, lhs_instruction, rhs_instruction, window, dnums));
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(b.Build().get(), {}).ConsumeValueOrDie();
+
+  Array4D<float> expected_array(1, 1, 9, 3);
+  expected_array.FillWithYX(Array2D<float>({
+      {10, 20, 30},
+      {0, 0, 0},
+      {57, 74, 91},
+      {0, 0, 0},
+      {125, 142, 159},
+      {0, 0, 0},
+      {193, 210, 227},
+      {0, 0, 0},
+      {91, 98, 105},
+  }));
+  auto expected = Literal::CreateR4FromArray4D<float>(expected_array);
+
+  LiteralTestUtil::ExpectEqual(*expected, *result);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index 9e25f1aceb1595b89aee601b294792e9e801c6f3..7a83a92404e3cd88f3075322111880cc95637c23 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -19,14 +19,11 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/compiler/xla/metric_table_report.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/human_readable_profile_builder.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/strings/numbers.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
 
 namespace xla {
 
@@ -55,96 +52,19 @@ string HloExecutionProfile::ToString(
     return "";
   }
 
-  using Item = std::pair<const HloInstruction*, uint64>;
-  std::vector<Item> items;
-  for (Item item : hlo_to_cycles_taken_) {
-    // Only include the HLOs which are part of the desired computation.
-    if (item.first->parent() == &computation) {
-      items.push_back(item);
-    }
-  }
-  auto custom_less = [](const Item& lhs, const Item& rhs) {
-    return lhs.second > rhs.second;
-  };
-  std::sort(items.begin(), items.end(), custom_less);
-  string result;
-  const int64 total_cycles = total_cycles_executed(computation);
-  double clock_rate_ghz = device_description.clock_rate_ghz();
-  CHECK_GE(clock_rate_ghz, 1e-9);
-
-  const auto cycles_to_microseconds = [&](double cycles) {
-    return cycles / clock_rate_ghz / 1000.0;
-  };
-
-  auto append_item = [&](int64 cycles, int64 flops, int64 bytes_accessed,
-                         const string& name) {
-    double nsecs = cycles / clock_rate_ghz;
-    string bytes_per_sec;
-    string bytes_per_cycle;
-    if (cycles <= 0 || bytes_accessed < 0) {
-      bytes_per_sec = "<unknown>";
-      bytes_per_cycle = "<unknown>";
-    } else {
-      bytes_per_sec = tensorflow::strings::HumanReadableNumBytes(
-          bytes_accessed / (nsecs / 1e9));
-      bytes_per_cycle =
-          tensorflow::strings::HumanReadableNumBytes(bytes_accessed / cycles);
-    }
-
-    double cycles_percent = 0;
-    if (total_cycles > 0) {
-      cycles_percent = cycles / static_cast<double>(total_cycles) * 100;
-    }
-
-    tensorflow::strings::StrAppend(
-        &result,
-        tensorflow::strings::Printf(
-            "%15lld cycles (%6.2f%%) :: %12.1f usec @ f_nom :: %18s :: %12s/s "
-            ":: "
-            "%12s/cycle :: "
-            "%s",
-            cycles, cycles_percent, cycles_to_microseconds(cycles),
-            flops <= 0 ? "<none>" : HumanReadableNumFlops(flops, nsecs).c_str(),
-            bytes_per_sec.c_str(), bytes_per_cycle.c_str(), name.c_str()));
-  };
-  tensorflow::strings::StrAppend(
-      &result, tensorflow::strings::Printf(
-                   "HLO execution profile for %s: (%s @ f_nom)\n\t",
-                   computation.name().c_str(),
-                   tensorflow::strings::HumanReadableElapsedTime(
-                       total_cycles / clock_rate_ghz / 1e9)
-                       .c_str()));
-
-  append_item(total_cycles, -1, -1, "[total]");
-  for (const auto& item : items) {
+  HumanReadableProfileBuilder builder(computation.name(),
+                                      total_cycles_executed(computation),
+                                      device_description.clock_rate_ghz());
+  for (const auto& item : hlo_to_cycles_taken_) {
     const HloInstruction* hlo = item.first;
-    tensorflow::strings::StrAppend(&result, "\n\t");
-    const int64 flops = (hlo == nullptr) ? -1 : cost_analysis.flop_count(*hlo);
-    const int64 bytes_accessed =
-        (hlo == nullptr) ? -1 : cost_analysis.bytes_accessed(*hlo);
-    const string display = (hlo == nullptr) ? "<none>" : hlo->ToString();
-    append_item(item.second, flops, bytes_accessed, display);
-  }
+    int64 cycles = item.second;
 
-  if (total_cycles <= 0) {
-    result += "****** 0 total cycles ******\n";
-  } else {
-    MetricTableReport table;
-    table.SetMetricName("microseconds");
-    table.SetEntryName("ops");
-    table.SetShowCategoryTable();
-    for (const auto& item : items) {
-      MetricTableReport::Entry entry;
-      entry.text = item.first->ToString();
-      entry.short_text = item.first->ToString(/*compact_operands=*/true);
-      entry.category_text = item.first->ToCategory();
-      entry.metric = cycles_to_microseconds(item.second);
-      table.AddEntry(std::move(entry));
-    }
-    result += table.MakeReport(cycles_to_microseconds(total_cycles));
+    builder.AddOp(/*op_name=*/hlo->ToString(),
+                  /*short_name=*/hlo->ToString(/*compact_operands=*/true),
+                  hlo->ToCategory(), cycles, cost_analysis.flop_count(*hlo),
+                  cost_analysis.bytes_accessed(*hlo));
   }
-
-  return result;
+  return builder.ToString();
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index eb2e5dfb37f33fd138e20ee930a2242cb1db89ea..c6c06658316e28d6c40b8d6ce371e3accdd42fcb 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -16,10 +16,17 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 
 #include <unistd.h>
+#include <algorithm>
+#include <atomic>
+#include <deque>
+#include <map>
+#include <memory>
 #include <string>
+#include <tuple>
+#include <unordered_map>
+#include <vector>
 
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
@@ -27,6 +34,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -34,20 +42,100 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/regexp.h"
 
 using ::tensorflow::Env;
-using ::tensorflow::WriteStringToFile;
+using ::tensorflow::gtl::nullopt;
+using ::tensorflow::gtl::optional;
 using ::tensorflow::io::JoinPath;
-using ::tensorflow::strings::Appendf;
-using ::tensorflow::strings::Printf;
 using ::tensorflow::strings::StrAppend;
 using ::tensorflow::strings::StrCat;
 using ::tensorflow::str_util::Join;
+using ::tensorflow::str_util::StringReplace;
+using ::tensorflow::WriteStringToFile;
 
 namespace xla {
 namespace hlo_graph_dumper {
 namespace {
 
+// Helpers for Printf and Appendf.
+template <typename T>
+struct PrintfConvert {
+  const T& operator()(const T& t) const { return t; }
+};
+template <>
+struct PrintfConvert<string> {
+  const char* operator()(const string& s) const { return s.c_str(); }
+};
+
+// Like tensorflow::strings::Printf/Appendf, but you don't need to call c_str()
+// on strings.
+template <typename... Ts>
+string Printf(const char* fmt, const Ts&... ts) {
+  return tensorflow::strings::Printf(fmt, PrintfConvert<Ts>()(ts)...);
+}
+template <typename... Ts>
+void Appendf(string* s, const char* fmt, const Ts&... ts) {
+  tensorflow::strings::Appendf(s, fmt, PrintfConvert<Ts>()(ts)...);
+}
+
+// Used to indicate how we should treat a given HLOInstruction in the graph.
+// should we treat it like normal, hide it, and so on?
+enum NodeFilterResult {
+  kNormalNode,
+  kHideNode,
+  // Make the node easy to find in the final graph.
+  kHighlightNode,
+  // "Gray out" the node to indicate that some of its operands have been
+  // omitted.
+  kSomeOperandsOmitted,
+  // Style the node the same as kSomeOperandsOmitted, but also don't connect it
+  // to its operands, even if they're present in the graph.
+  kOmitNodeOperands,
+  // Same style as kSomeOperandsOmitted, but used to indicate that some of the
+  // node's *users* have been omitted.
+  kSomeUsersOmitted,
+};
+
+// NodeFilter is essentially a map from HloInstruction*s to NodeFilterResult.
+// It lets callers tell the graph-drawing routines which nodes they want to be
+// shown, hidden, or highlighted.
+class NodeFilter {
+ public:
+  NodeFilter() : filter_([](const HloInstruction*) { return kNormalNode; }) {}
+
+  explicit NodeFilter(
+      std::function<NodeFilterResult(const HloInstruction* instr)> filter)
+      : filter_(std::move(filter)) {}
+
+  bool Show(const HloInstruction* instr) const {
+    return filter_(instr) != kHideNode;
+  }
+  bool Highlight(const HloInstruction* instr) const {
+    return filter_(instr) == kHighlightNode;
+  }
+  bool OmitOperands(const HloInstruction* instr) const {
+    return filter_(instr) == kOmitNodeOperands;
+  }
+  bool SomeOrAllOperandsOmitted(const HloInstruction* instr) const {
+    auto result = filter_(instr);
+    return result == kOmitNodeOperands || result == kSomeOperandsOmitted;
+  }
+  bool Deemphasized(const HloInstruction* instr) const {
+    auto result = filter_(instr);
+    return result == kOmitNodeOperands || result == kSomeOperandsOmitted ||
+           result == kSomeUsersOmitted;
+  }
+
+  bool ShowFusionSubcomputation(const HloInstruction* instr) const {
+    CHECK_EQ(instr->opcode(), HloOpcode::kFusion);
+    return Show(instr) && !SomeOrAllOperandsOmitted(instr);
+  }
+
+ private:
+  std::function<NodeFilterResult(const HloInstruction* instr)> filter_;
+};
+
 // Node color schemes, used by NodeColorAttributes.
 enum ColorScheme {
   kBlue,
@@ -62,420 +150,780 @@ enum ColorScheme {
   kRed,
   kWhite,
   kYellow,
+
+  // Causes the node's border to be a dashed line, and its content to be gray
+  // text on a white background, suggesting that this is an "unimportant" node.
+  kDashedBorder,
 };
 
 // Given a ColorScheme, returns an attribute string for a node of that color.
-// Sets the node's fill, stroke, and text colors.
+// Sets the node's style and fill/stroke/text colors.
 //
 // Colors are from https://material.io/color.
 string NodeColorAttributes(ColorScheme color) {
   using std::make_tuple;
 
-  const char *fill_color, *stroke_color, *font_color;
-  std::tie(fill_color, stroke_color, font_color) =
-      [color]() -> std::tuple<const char*, const char*, const char*> {
+  const char *style, *fill_color, *stroke_color, *font_color;
+  std::tie(style, fill_color, stroke_color, font_color) = [color] {
     switch (color) {
       case kBlue:
-        return make_tuple("#bbdefb", "#8aacc8", "black");
+        return make_tuple("filled", "#bbdefb", "#8aacc8", "black");
       case kBrown:
-        return make_tuple("#bcaaa4", "#8c7b75", "black");
+        return make_tuple("filled", "#bcaaa4", "#8c7b75", "black");
       case kDarkBlue:
-        return make_tuple("#1565c0", "#003c8f", "white");
+        return make_tuple("filled", "#1565c0", "#003c8f", "white");
       case kDarkGreen:
-        return make_tuple("#2e7d32", "#005005", "white");
+        return make_tuple("filled", "#2e7d32", "#005005", "white");
       case kDarkRed:
-        return make_tuple("#b71c1c", "#7f0000", "white");
+        return make_tuple("filled", "#b71c1c", "#7f0000", "white");
       case kGray:
-        return make_tuple("#cfd8dc", "#9ea7aa", "black");
+        return make_tuple("filled", "#cfd8dc", "#9ea7aa", "black");
       case kGreen:
-        return make_tuple("#c8e6c9", "#97b498", "black");
+        return make_tuple("filled", "#c8e6c9", "#97b498", "black");
       case kOrange:
-        return make_tuple("#ffe0b2", "#cbae82", "black");
+        return make_tuple("filled", "#ffe0b2", "#cbae82", "black");
       case kPurple:
-        return make_tuple("#e1bee7", "#af8eb5", "black");
+        return make_tuple("filled", "#e1bee7", "#af8eb5", "black");
       case kRed:
-        return make_tuple("#ffcdd2", "#cb9ca1", "black");
+        return make_tuple("filled", "#ffcdd2", "#cb9ca1", "black");
       case kWhite:
-        return make_tuple("white", "black", "black");
+        return make_tuple("filled", "white", "black", "black");
       case kYellow:
-        return make_tuple("#fff9c4", "#cbc693", "black");
+        return make_tuple("filled", "#fff9c4", "#cbc693", "black");
+      case kDashedBorder:
+        // "filled,dashed" looks the same as "dashed", since we have a white
+        // background.  But we use "filled,dashed" so that when you hover over
+        // any part of the node (not just the text inside the node), our css
+        // :hover rule is triggered.
+        return make_tuple("filled,dashed", "white", "#757575", "#757575");
     }
   }();
 
   return Printf(
-      "style=filled, fontcolor=\"%s\", color=\"%s\", fillcolor=\"%s\"",
+      R"(style="%s", fontcolor="%s", color="%s", fillcolor="%s")", style,
       font_color, stroke_color, fill_color);
 }
 
 // Replaces <> with &lt;&gt;, so that this string is safe(er) for use in a
 // graphviz HTML-like string.
 string HtmlLikeStringSanitize(tensorflow::StringPiece s) {
-  return tensorflow::str_util::StringReplace(
-      tensorflow::str_util::StringReplace(s, "<", "&lt;", /*replace_all=*/true),
-      ">", "&gt;", /*replace_all=*/true);
+  return StringReplace(StringReplace(s, "<", "&lt;", /*replace_all=*/true), ">",
+                       "&gt;", /*replace_all=*/true);
 }
 
-// Returns the dot graph identifier for the given instruction.
-string InstructionId(const HloInstruction* instruction) {
-  return Printf("%lld", reinterpret_cast<uint64>(instruction));
-}
+// Tries to generates a human-readable one-word description of the given
+// computation.
+//
+// Currently we support:
+//
+//   "return param0 + param1;"      --> "add"
+//   "return param0 * param1;"      --> "multiply"
+//   "return min(param0, param1);"  --> "min"
+//   "return max(param0, param1);"  --> "max"
+//   "return param0 <= param1;"     --> "less-or-equal"
+//   "return param0 >= param1;"     --> "greater-or-equal"
+//   "return param0 >  param1;"     --> "greater-than"
+//   "return param0 <  param1;"     --> "less-than"
+//   "return param0 == param1;"     --> "equal-to"
+//   "return param0 != param1;"     --> "not-equal-to"
+//
+// where param0 and param1 are effective scalars.  For the ops that are
+// commutative, we also support them with param0 and param1 swapped.
+//
+// This is useful primarily for reduce and map nodes.  These take a
+// subcomputation which is almost always one of the four above, and pattern
+// matching it to a short string lets us tell the user what the subcomputation
+// is without drawing it as a graph.
+optional<string> MatchTrivialComputation(const HloComputation* computation) {
+  if (computation->instruction_count() != 3) {
+    return nullopt;
+  }
 
-// Returns the dot graph identifier for the given computation.
-string ComputationId(const HloComputation* computation) {
-  return Printf("%lld", reinterpret_cast<uint64>(computation));
-}
+  HloInstruction* root = computation->root_instruction();
+  if (root->operand_count() != 2) {
+    return nullopt;
+  }
 
-// Returns the dot graph edges and nodes for the given instruction sequence.
-// Edges which extend between computations are added to the vector
-// intercomputation_edges. This is necessary because graphviz does not render
-// the graph properly unless these inter-computation edges appear after all
-// subgraph statements.
-string InstructionSequenceGraph(
-    const std::list<std::unique_ptr<HloInstruction>>& instructions,
-    bool show_addresses, bool show_layouts,
-    std::vector<string>* intercomputation_edges,
-    const HloExecutionProfile* hlo_execution_profile) {
-  string graph_body;
-
-  // Create a single "record" node for the parameters. This node is a
-  // partitioned rectangle with one partition per parameter node. The keeps
-  // all the parameter instructions together.
-  std::vector<HloInstruction*> param_instructions;
-  for (auto& instruction : instructions) {
-    if (instruction->opcode() == HloOpcode::kParameter) {
-      size_t param_number = instruction->parameter_number();
-
-      if (param_instructions.size() < param_number + 1) {
-        param_instructions.resize(param_number + 1, nullptr);
-      }
-      param_instructions[param_number] = instruction.get();
-    }
-  }
-  string param_node_name;
-  if (!param_instructions.empty()) {
-    std::vector<string> param_ports;
-    param_node_name =
-        StrCat("parameters_", InstructionId(param_instructions[0]));
-    for (auto& param : param_instructions) {
-      string label = StrCat(param->parameter_name(), "\\n",
-                            ShapeUtil::HumanString(param->shape()));
-      if (show_addresses) {
-        Appendf(&label, "\\n[%p]", param);
-      }
-      if (show_layouts) {
-        StrAppend(&label, "\\nlayout=\\{",
-                  Join(param->shape().layout().minor_to_major(), ","), "\\}");
-      }
-      param_ports.push_back(
-          Printf("<%s> %s", InstructionId(param).c_str(), label.c_str()));
-    }
-    // (If we wanted the word "parameters" to be bold like the other op names,
-    // we'd have to make this into an HTML-like table.  It is possible but
-    // complicated; see http://www.graphviz.org/doc/info/shapes.html#html.)
-    StrAppend(&graph_body, param_node_name, " [shape=record ",
-              NodeColorAttributes(kOrange), "label=\"{parameters | {",
-              Join(param_ports, "|"), "}}\"];\n");
-  }
-
-  for (auto& instruction : instructions) {
-    ColorScheme color = kYellow;
-    string shape = "box";
-    string name =
-        StrCat("<b>", HtmlLikeStringSanitize(instruction->ExtendedOpcodeStr()),
-               "</b> ", HtmlLikeStringSanitize(instruction->name()));
-    if (HloOpcode::kConvolution == instruction->opcode()) {
-      StrAppend(
-          &name, "<br/>",
-          HtmlLikeStringSanitize(
-              instruction->ConvolutionDimensionNumbersToString()),
-          "<br/>",
-          HtmlLikeStringSanitize(window_util::ToString(instruction->window())));
-    }
-
-    if (!instruction->metadata().op_name().empty()) {
-      StrAppend(&name, "<br/>",
-                HtmlLikeStringSanitize(instruction->metadata().op_name()));
-    }
-    if (!instruction->metadata().source_file().empty() &&
-        instruction->metadata().source_line() != 0) {
-      StrAppend(&name, "<br/>", instruction->metadata().source_file(), ":",
-                instruction->metadata().source_line());
-    }
-
-    // Pick different colors or shapes for instructions which are particularly
-    // expensive (eg, dot) and those which are unusual in some way or unique
-    // (eg, parameter).
-    switch (instruction->opcode()) {
-      // "Normal" instructions. Mostly cheap and elementwise. No call to
-      // embedded computations. In this case, use default color, shape and
-      // label.
-      case HloOpcode::kAbs:
-      case HloOpcode::kAdd:
-      case HloOpcode::kCeil:
-      case HloOpcode::kClamp:
-      case HloOpcode::kConvert:
-      case HloOpcode::kDivide:
-      case HloOpcode::kEq:
-      case HloOpcode::kExp:
-      case HloOpcode::kFloor:
+  // Check that both of the operands to the root are parameters.
+  const HloInstruction* operand0 = root->operand(0);
+  const HloInstruction* operand1 = root->operand(1);
+  if (operand0->opcode() != HloOpcode::kParameter ||
+      operand1->opcode() != HloOpcode::kParameter) {
+    return nullopt;
+  }
+
+  // Check that the two operands of root are param0 and param1.  All of the
+  // opcodes we recognize are commutative, so we're OK with either order.
+  auto n0 = operand0->parameter_number();
+  auto n1 = operand1->parameter_number();
+  if (!(n0 == 0 && n1 == 1) && !(n1 == 0 && n0 == 1)) {
+    return nullopt;
+  }
+
+  // If the params are reversed, check that the operation being performed is
+  // commutative.
+  if (n0 == 1) {
+    switch (root->opcode()) {
+      case HloOpcode::kLe:
       case HloOpcode::kGe:
       case HloOpcode::kGt:
-      case HloOpcode::kIndex:
-      case HloOpcode::kIsFinite:
-      case HloOpcode::kLe:
-      case HloOpcode::kLog:
-      case HloOpcode::kLogicalAnd:
-      case HloOpcode::kLogicalNot:
-      case HloOpcode::kLogicalOr:
       case HloOpcode::kLt:
-      case HloOpcode::kMaximum:
-      case HloOpcode::kMinimum:
-      case HloOpcode::kMultiply:
-      case HloOpcode::kNe:
-      case HloOpcode::kNegate:
-      case HloOpcode::kPower:
-      case HloOpcode::kRemainder:
-      case HloOpcode::kSelect:
-      case HloOpcode::kSign:
-      case HloOpcode::kSlice:
-      case HloOpcode::kSort:
-      case HloOpcode::kSubtract:
-      case HloOpcode::kTanh:
+        return nullopt;
+      default:
         break;
+    }
+  }
+
+  // Check that the root and params are all effective scalars.
+  if (!ShapeUtil::IsEffectiveScalar(root->shape()) ||
+      !ShapeUtil::IsEffectiveScalar(operand0->shape()) ||
+      !ShapeUtil::IsEffectiveScalar(operand1->shape())) {
+    return nullopt;
+  }
+
+  // If we recognize the root's opcode, we've successfully pattern-matched!
+  switch (root->opcode()) {
+    case HloOpcode::kAdd:
+      return "add";
+    case HloOpcode::kMultiply:
+      return "multiply";
+    case HloOpcode::kMinimum:
+      return "min";
+    case HloOpcode::kMaximum:
+      return "max";
+    case HloOpcode::kLe:
+      return "less-or-equal";
+    case HloOpcode::kGe:
+      return "greater-or-equal";
+    case HloOpcode::kGt:
+      return "greater-than";
+    case HloOpcode::kLt:
+      return "less-than";
+    case HloOpcode::kEq:
+      return "equal-to";
+    case HloOpcode::kNe:
+      return "not-equal-to";
+    default:
+      return nullopt;
+  }
+}
+
+// Encapsulates logic for dumping an HLO module to DOT (i.e. graphviz syntax).
+class HloDotDumper {
+ public:
+  HloDotDumper(const HloComputation* computation, tensorflow::StringPiece label,
+               bool show_addresses, bool show_layouts,
+               const HloExecutionProfile* profile, NodeFilter filter)
+      : computation_(computation),
+        label_(label.ToString()),
+        show_addresses_(show_addresses),
+        show_layouts_(show_layouts),
+        profile_(profile),
+        filter_(std::move(filter)) {}
+
+  string Dump();
+
+ private:
+  // Returns the dot graph identifier for the given instruction.
+  string InstructionId(const HloInstruction* instruction) {
+    return StrCat(reinterpret_cast<uint64>(instruction));
+  }
+
+  // Returns the dot graph identifier for the given computation.
+  string SubcomputationId(const HloComputation* computation) {
+    return StrCat("cluster_", reinterpret_cast<uint64>(computation));
+  }
+
+  // Generates graph header/footer.  These should be called *after* dumping all
+  // of the instructions and subcomputations for the graph, as they both use
+  // data generated while dumping the graph.
+  string Header();
+  string Footer();
+
+  // Maps HloComputations we should dump to their parent instruction in the
+  // outer computation.
+  std::unordered_map<const HloComputation*, const HloInstruction*>
+  SubcomputationsToDump();
+
+  string DumpSubcomputation(const HloComputation* subcomp,
+                            const HloInstruction* parent_instr);
+  string DumpComputation(const HloComputation* comp);
+  string DumpInstruction(const HloInstruction* instr);
+  ColorScheme GetInstructionColor(const HloInstruction* instr);
+  string GetInstructionNodeShape(const HloInstruction* instr);
+  string GetInstructionNodeLabel(const HloInstruction* instr);
+  string GetInstructionNodeExtraInfo(const HloInstruction* instr);
+  string GetInstructionNodeInlinedConstants(const HloInstruction* instr);
+  void AddInstructionIncomingEdges(const HloInstruction* instr);
+
+  // If instr has just one computation and it's trivial (e.g. "return param0 +
+  // param1"), returns a string you can put into the node's body that names the
+  // subcomputation, e.g. "Subcomputation: <b>add</b>".
+  string GetInstructionTrivialComputationStr(const HloInstruction* instr);
+
+  const HloComputation* computation_;  // never null
+  const string label_;                 // overall name for the graph
+  const bool show_addresses_;
+  const bool show_layouts_;
+  const HloExecutionProfile* profile_;  // may be null
+  const NodeFilter filter_;
+
+  // Each HloInstruction dumped gets a monotically-increasing node ID.  This
+  // must start at 1, because that's where graphviz's accounting starts.
+  int64 next_node_id_ = 1;
+  std::unordered_map<const HloInstruction*, int64> node_ids_;
+
+  // Each (from, to) edge gets a monotonically-increasing ID.  This is a
+  // multimap because it's possible for the same edge to appear multiple times
+  // in the graph (e.g. x^2 may be represented as mul(x, x)).
+  int64 next_edge_id_ = 1;
+  std::unordered_multimap<
+      std::pair<const HloInstruction*, const HloInstruction*>, int64,
+      tensorflow::hash<std::pair<const HloInstruction*, const HloInstruction*>>>
+      edge_ids_;
+
+  // Each HloComputation that's emitted gets a monotonically-increasing ID.
+  int64 next_cluster_id_ = 1;
+  std::unordered_map<const HloComputation*, int64> cluster_ids_;
+
+  // Edges to print from Footer().  Edges come at the end because graphviz is
+  // unhappy if an edge from a subcomputation to a node in the outer computation
+  // appears before both the inner computation and the destination node are
+  // defined.
+  std::vector<string> edges_;
+};
+
+string HloDotDumper::Dump() {
+  string body;
+  for (const auto& kv : SubcomputationsToDump()) {
+    const HloComputation* subcomp = kv.first;
+    const HloInstruction* parent = kv.second;
+    StrAppend(&body, DumpSubcomputation(subcomp, parent));
+  }
+  StrAppend(&body, DumpComputation(computation_));
+
+  // By contract, Header() and Footer() have to be called after we've dumped all
+  // our instructions, because they use state generated during that process.
+  string g = Header();
+  StrAppend(&g, body);
+  StrAppend(&g, Footer());
+  return g;
+}
+
+string HloDotDumper::Header() {
+  const char* fmt = R"(digraph G {
+rankdir = TB;
+compound = true;
+label = <<b>%s</b>>;
+labelloc = t;
+// Disable the tooltip.  Interestingly, "" doesn't work!
+tooltip = " ";
+// DOT graphs accept a stylesheet as a URI.  So naturally, an inline
+// stylesheet is a data URI!
+stylesheet="
+  data:text/css,
+  @import url(https://fonts.googleapis.com/css?family=Roboto:400,700);
+  svg text {
+    font-family: 'Roboto';
+    font-size: 12px;
+  }
+
+  %s
+"
+
+)";
+
+  string graph_label = StrCat(label_, "<br/>", computation_->name());
+  if (profile_ != nullptr) {
+    auto cycles = profile_->total_cycles_executed(*computation_);
+    Appendf(&graph_label, "<br/>total cycles = %lld (%s)", cycles,
+            tensorflow::strings::HumanReadableNum(cycles));
+  }
+
+  // Create CSS rules that say, when you hover over the given node or cluster,
+  // turn the given edge the given color.
+  //
+  // We rely on a few properties of how graphviz generates SVGs:
+  //
+  //  - Nodes are named "nodeN", where N corresponds to the 1-based index of
+  //    the node in our DOT (i.e. the first node in the DOT is "node1", etc.).
+  //    Edges are similarly named "edgeN", and clusters are named "clustN".
+  //  - Nodes come before their in- and out-edges in the SVG.  We need this
+  //    because the "X ~ Y" CSS selector finds a sibling of X that *comes
+  //    after X in the DOM* and matches Y.
+  std::vector<string> edge_css_rules;
+  const char* kBlue = "#1976d2";
+  const char* kRed = "#d32f2f";
+  for (const auto& kv : edge_ids_) {
+    const HloInstruction* from_node = kv.first.first;
+    const HloInstruction* to_node = kv.first.second;
+    int64 edge_id = kv.second;
+
+    auto add_hover_css_rule = [&](string elem_type, int64 elem_id,
+                                  const char* color) {
+      // One could imagine other ways of writing this CSS rule that involve less
+      // duplication, but this way seems to be relatively performant.
+      edge_css_rules.push_back(Printf(
+          "  #%s%d:hover ~ #edge%lld text { fill: %s; }\n"
+          "  #%s%d:hover ~ #edge%lld path { stroke: %s; stroke-width: .2em; }\n"
+          "  #%s%d:hover ~ #edge%lld polygon { "
+          "fill: %s; stroke: %s; stroke-width: .2em; }\n",
+          elem_type, elem_id, edge_id, color,  //
+          elem_type, elem_id, edge_id, color,  //
+          elem_type, elem_id, edge_id, color, color));
+    };
+
+    int64 from_node_id = node_ids_.at(from_node);
+    int64 to_node_id = node_ids_.at(to_node);
+    add_hover_css_rule("node", from_node_id, kBlue);
+    add_hover_css_rule("node", to_node_id, kRed);
+
+    // If this edge crosses a fusion cluster boundary, highlight it when the
+    // cluster is hovered over.
+    if (from_node->IsFused() &&
+        from_node->fusion_instruction()->fused_expression_root() == from_node) {
+      int64 cluster_id = cluster_ids_.at(from_node->parent());
+      add_hover_css_rule("clust", cluster_id, kBlue);
+    }
+    if (to_node->IsFused() && to_node->opcode() == HloOpcode::kParameter) {
+      int64 cluster_id = cluster_ids_.at(to_node->parent());
+      add_hover_css_rule("clust", cluster_id, kRed);
+    }
+  }
+
+  return Printf(fmt, graph_label, Join(edge_css_rules, "\n"));
+}
+
+string HloDotDumper::Footer() { return StrCat(Join(edges_, "\n"), "\n}"); }
+
+std::unordered_map<const HloComputation*, const HloInstruction*>
+HloDotDumper::SubcomputationsToDump() {
+  // Dump the subcomputations of each instruction that's shown and doesn't have
+  // its operands omitted.  If an instruction has just one subcomputation and
+  // it's trivial, omit it: We'll display that subcomputation inlined into the
+  // instruction's node when we draw it.
+  std::unordered_map<const HloComputation*, const HloInstruction*> to_dump;
+  for (const auto& instr : computation_->instructions()) {
+    if (!filter_.Show(instr.get()) ||
+        filter_.SomeOrAllOperandsOmitted(instr.get())) {
+      continue;
+    }
+    if (instr->opcode() == HloOpcode::kFusion) {
+      to_dump[instr->fused_instructions_computation()] = instr.get();
+    }
+
+    for (const HloComputation* comp : instr->called_computations()) {
+      if (!MatchTrivialComputation(comp)) {
+        to_dump[comp] = instr.get();
+      }
+    }
+  }
+  return to_dump;
+}
+
+string HloDotDumper::DumpSubcomputation(const HloComputation* subcomp,
+                                        const HloInstruction* parent_instr) {
+  const char* computation_fmt = R"(subgraph %s {
+%s;
+label = <%s>;
+labelloc = t;
+tooltip = " ";
+%s
+}  // %s
+
+)";
+
+  cluster_ids_[subcomp] = next_cluster_id_++;
+
+  string id = SubcomputationId(subcomp);
+
+  string subcomp_label, style;
+  if (parent_instr->opcode() == HloOpcode::kFusion) {
+    subcomp_label = Printf("Fused expression for <b>%s</b><br/>%s",
+                           HtmlLikeStringSanitize(parent_instr->name()),
+                           HtmlLikeStringSanitize(parent_instr->ToCategory()));
+
+    // Subcomputation's fill/stroke color is light/dark red/gray, depending on
+    // whether or not the subcomputation's fusion node is highlighted.
+    bool highlight = filter_.Highlight(parent_instr);
+    const char* fillcolor = highlight ? "#ffcdd2" : "#f5f5f5";
+    const char* strokecolor = highlight ? "#b71c1c" : "#c2c2c2";
+    style = Printf(R"(style="rounded,filled,bold"; fillcolor="%s"; color="%s")",
+                   fillcolor, strokecolor);
+  } else {
+    subcomp_label = Printf("Subcomputation for <b>%s</b><br/>%s",
+                           HtmlLikeStringSanitize(parent_instr->name()),
+                           HtmlLikeStringSanitize(subcomp->name()));
+    style = "style=rounded; color=black;";
+  }
+
+  string comp_body = DumpComputation(subcomp);
+  string computation =
+      Printf(computation_fmt, id, style, subcomp_label, comp_body, id);
+
+  // Add an edge from the subcomputation to its parent node.  If subcomp
+  // belongs to a fusion node, it's drawn in place of the fusion instruction, so
+  // there's no need to link those.
+  if (parent_instr->opcode() != HloOpcode::kFusion) {
+    edge_ids_.insert(
+        {{subcomp->root_instruction(), parent_instr}, next_edge_id_++});
+    const char* edge_fmt =
+        R"(%s -> %s [ltail="%s", style="dashed" tooltip="%s -> %s"];)";
+    edges_.push_back(
+        Printf(edge_fmt, InstructionId(subcomp->root_instruction()),
+               InstructionId(parent_instr), SubcomputationId(subcomp),
+               subcomp->name(), parent_instr->name()));
+  }
+
+  return computation;
+}
+
+string HloDotDumper::DumpComputation(const HloComputation* comp) {
+  string g;
+  for (const auto& instr : comp->instructions()) {
+    if (!filter_.Show(instr.get())) {
+      continue;
+    }
+    StrAppend(&g, DumpInstruction(instr.get()));
+  }
+  return g;
+}
+
+string HloDotDumper::DumpInstruction(const HloInstruction* instr) {
+  // We don't display constants as separate nodes; they're merged into their
+  // users.
+  if (instr->opcode() == HloOpcode::kConstant) {
+    return "";
+  }
+  // Omit the fusion node if its subcomputation is drawn, since the
+  // subcomputation will be drawn inline.
+  if (instr->opcode() == HloOpcode::kFusion &&
+      filter_.ShowFusionSubcomputation(instr)) {
+    return "";
+  }
+
+  node_ids_[instr] = next_node_id_++;
+
+  ColorScheme color = GetInstructionColor(instr);
+  string node_shape = GetInstructionNodeShape(instr);
+  string node_label = GetInstructionNodeLabel(instr);
+  string extra_info = GetInstructionNodeExtraInfo(instr);
+  string inlined_constants = GetInstructionNodeInlinedConstants(instr);
+  string trivial_subcomputation = GetInstructionTrivialComputationStr(instr);
+  AddInstructionIncomingEdges(instr);
+
+  // Override the node's styling if it should be (de-)emphasized.
+  if (filter_.Deemphasized(instr)) {
+    color = kDashedBorder;
+  }
+  if (filter_.Highlight(instr)) {
+    node_shape = "diamond";
+    color = kDarkRed;
+  }
+
+  // Build the text that will be displayed inside the node.
+  string node_body = node_label;
+  for (const string& s :
+       {trivial_subcomputation, extra_info, inlined_constants}) {
+    if (!s.empty()) {
+      StrAppend(&node_body, "<br/>", s);
+    }
+  }
+
+  return Printf(R"(%s [label=<%s>, shape=%s, tooltip=" ", %s];)"
+                "\n",
+                InstructionId(instr), node_body, node_shape,
+                NodeColorAttributes(color));
+}
+
+string HloDotDumper::GetInstructionNodeInlinedConstants(
+    const HloInstruction* instr) {
+  auto stringify_constant = [](const HloInstruction* constant) {
+    if (ShapeUtil::IsEffectiveScalar(constant->shape())) {
+      auto elem_idx = IndexUtil::LinearIndexToMultidimensionalIndex(
+          constant->shape(), /*linear_index=*/0);
+      return Printf("%s{%s}", ShapeUtil::HumanString(constant->shape()),
+                    constant->literal().GetAsString(elem_idx));
+    }
+    if (tensorflow::StringPiece(constant->name()).starts_with("%constant")) {
+      return constant->name();
+    }
+    return StrCat("constant ", constant->name());
+  };
+
+  // Special case: If instr is a parameter to a fusion node, check whether the
+  // corresponding operand to the fusion node is a constant.
+  if (instr->opcode() == HloOpcode::kParameter && instr->IsFused()) {
+    const HloInstruction* fusion = instr->fusion_instruction();
+    const HloInstruction* operand = fusion->operand(instr->parameter_number());
+    if (operand->opcode() != HloOpcode::kConstant) {
+      return "";
+    }
+    return stringify_constant(operand);
+  }
+
+  std::vector<string> lines;
+  for (int64 i = 0; i < instr->operand_count(); ++i) {
+    const HloInstruction* operand = instr->operand(i);
+    if (operand->opcode() != HloOpcode::kConstant) {
+      continue;
+    }
+    lines.push_back(
+        Printf("<b>operand %lld</b> = %s", i, stringify_constant(operand)));
+  }
+  return Join(lines, "<br/>");
+}
+
+ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
+  // Pick different colors or shapes for instructions which are particularly
+  // expensive (eg, dot) and those which are unusual in some way or unique
+  // (eg, parameter).
+  switch (instr->opcode()) {
+    case HloOpcode::kAbs:
+    case HloOpcode::kAdd:
+    case HloOpcode::kCeil:
+    case HloOpcode::kClamp:
+    case HloOpcode::kConvert:
+    case HloOpcode::kCos:
+    case HloOpcode::kDivide:
+    case HloOpcode::kEq:
+    case HloOpcode::kExp:
+    case HloOpcode::kFloor:
+    case HloOpcode::kGe:
+    case HloOpcode::kGt:
+    case HloOpcode::kIndex:
+    case HloOpcode::kIsFinite:
+    case HloOpcode::kLe:
+    case HloOpcode::kLog:
+    case HloOpcode::kLogicalAnd:
+    case HloOpcode::kLogicalNot:
+    case HloOpcode::kLogicalOr:
+    case HloOpcode::kLt:
+    case HloOpcode::kMaximum:
+    case HloOpcode::kMinimum:
+    case HloOpcode::kMultiply:
+    case HloOpcode::kNe:
+    case HloOpcode::kNegate:
+    case HloOpcode::kPower:
+    case HloOpcode::kRemainder:
+    case HloOpcode::kSelect:
+    case HloOpcode::kSign:
+    case HloOpcode::kSin:
+    case HloOpcode::kSlice:
+    case HloOpcode::kSort:
+    case HloOpcode::kSubtract:
+    case HloOpcode::kTanh:
+    case HloOpcode::kRng:
+    case HloOpcode::kBroadcast:
+    case HloOpcode::kTranspose:
+      return kYellow;
+    case HloOpcode::kBitcast:
+    case HloOpcode::kTuple:
+    case HloOpcode::kTrace:
+    case HloOpcode::kGetTupleElement:
+      return kWhite;
+    case HloOpcode::kConcatenate:
+    case HloOpcode::kCopy:
+    case HloOpcode::kDynamicSlice:
+    case HloOpcode::kDynamicUpdateSlice:
+    case HloOpcode::kPad:
+    case HloOpcode::kReshape:
+    case HloOpcode::kReverse:
+    case HloOpcode::kUpdate:
+      return kGreen;
+    case HloOpcode::kConvolution:
+    case HloOpcode::kDot:
+      return kDarkBlue;
+    case HloOpcode::kReducePrecision:
+      return kRed;
+    case HloOpcode::kParameter:
+      return kOrange;
+    case HloOpcode::kBatchNormTraining:
+    case HloOpcode::kBatchNormGrad:
+    case HloOpcode::kReduce:
+    case HloOpcode::kSelectAndScatter:
+    case HloOpcode::kReduceWindow:
+      return kPurple;
+    case HloOpcode::kMap:
+    case HloOpcode::kFusion:
+      return kGray;
+    case HloOpcode::kSend:
+    case HloOpcode::kRecv:
+    case HloOpcode::kInfeed:
+    case HloOpcode::kOutfeed:
+    case HloOpcode::kCrossReplicaSum:
+      return kBrown;
+    case HloOpcode::kCustomCall:
+    case HloOpcode::kWhile:
+    case HloOpcode::kCall:
+      return kDarkGreen;
+    case HloOpcode::kConstant:
+      LOG(FATAL) << "Constants don't get their own nodes in the graph.";
+  }
+}
+
+string HloDotDumper::GetInstructionNodeShape(const HloInstruction* instr) {
+  // Give while loops a different shape so they're easier to pick out.
+  switch (instr->opcode()) {
+    case HloOpcode::kWhile:
+      return "ellipse";
+    default:
+      return "rect";
+  }
+}
+
+string HloDotDumper::GetInstructionNodeLabel(const HloInstruction* instr) {
+  // If we have a parameter, put the param number in the name.
+  if (instr->opcode() == HloOpcode::kParameter) {
+    return Printf("<b>Parameter %lld</b>", instr->parameter_number());
+  }
+
+  // The HLO instruction name contains usually the opcode, e.g. "%add.42" is
+  // an add instruction.  In this case we render just the name.
+  if (tensorflow::StringPiece(instr->name())
+          .starts_with(StrCat("%", HloOpcodeString(instr->opcode())))) {
+    return Printf("<b>%s</b>", HtmlLikeStringSanitize(instr->name()));
+  }
+
+  // If the name does not contain the opcode, render both.
+  return Printf("<b>%s</b><br/>%s",
+                HtmlLikeStringSanitize(instr->ExtendedOpcodeStr()),
+                HtmlLikeStringSanitize(instr->name()));
+}
+
+string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) {
+  string opcode_specific_info = [&]() -> string {
+    switch (instr->opcode()) {
       case HloOpcode::kRng:
-        StrAppend(&name, "<br/>",
-                  RandomDistribution_Name(instruction->random_distribution()));
-        break;
+        return RandomDistribution_Name(instr->random_distribution());
+      case HloOpcode::kConvolution:
+        return StrCat(
+            HtmlLikeStringSanitize(
+                instr->ConvolutionDimensionNumbersToString()),
+            "<br/>",
+            HtmlLikeStringSanitize(window_util::ToString(instr->window())));
       case HloOpcode::kBroadcast:
       case HloOpcode::kTranspose:
-        StrAppend(&name, "<br/>", "dims={",
-                  Join(instruction->dimensions(), ","), "}");
-        break;
-      case HloOpcode::kBitcast:
-      case HloOpcode::kTuple:
-      case HloOpcode::kTrace:
-        color = kWhite;
-        break;
-      case HloOpcode::kGetTupleElement:
-        color = kWhite;
-        StrAppend(&name, "<br/>index=", instruction->tuple_index());
-        break;
-      case HloOpcode::kConcatenate:
-      case HloOpcode::kCopy:
-      case HloOpcode::kDynamicSlice:
-      case HloOpcode::kDynamicUpdateSlice:
-      case HloOpcode::kPad:
-      case HloOpcode::kReshape:
-      case HloOpcode::kReverse:
-      case HloOpcode::kUpdate:
-        color = kGreen;
-        break;
-      case HloOpcode::kConstant:
-        color = kBlue;
-        break;
-      case HloOpcode::kConvolution:
-      case HloOpcode::kDot:
-        color = kDarkBlue;
-        break;
-      case HloOpcode::kParameter:
-        // A single record node is created for all the parameter nodes with a
-        // port for each parameter instruction. No need to emit anything in this
-        // case.
-        continue;
       case HloOpcode::kReduce:
-        StrAppend(&name, " dims=", Join(instruction->dimensions(), ","));
-        color = kPurple;
-        break;
-      case HloOpcode::kSelectAndScatter:
-      case HloOpcode::kReduceWindow:
-        color = kPurple;
-        break;
-      case HloOpcode::kWhile:
-        shape = "ellipse";
-        color = kDarkGreen;
-        break;
-      case HloOpcode::kMap:
-      case HloOpcode::kFusion:
-        color = kGray;
-        break;
-      case HloOpcode::kSend:
-      case HloOpcode::kRecv:
-      case HloOpcode::kInfeed:
-      case HloOpcode::kOutfeed:
-      case HloOpcode::kCrossReplicaSum:
-        color = kBrown;
-        break;
-      case HloOpcode::kCall:
-        color = kDarkGreen;
-        break;
+        return Printf("dims={%s}", Join(instr->dimensions(), ","));
+      case HloOpcode::kGetTupleElement:
+        return Printf("index=%lld", instr->tuple_index());
+      case HloOpcode::kBatchNormTraining:
+      case HloOpcode::kBatchNormGrad:
+        return Printf("feature_index=%lld", instr->feature_index());
       case HloOpcode::kCustomCall:
-        color = kDarkGreen;
-        StrAppend(&name, "<br/>",
-                  "custom_call_target=", instruction->custom_call_target());
-        break;
+        return Printf("custom_call_target=%s", instr->custom_call_target());
+      default:
+        return "";
     }
+  }();
 
-    // Create instruction node with appropriate label, shape, and color.
-    // label is interpreted as an HTML-like string, so newlines must be
-    // delimited with <br/>, rather than \n.
-    string label =
-        StrCat(name, "<br/>", ShapeUtil::HumanString(instruction->shape()));
+  std::vector<string> lines;
+  if (!opcode_specific_info.empty()) {
+    lines.push_back(opcode_specific_info);
+  }
 
-    if (instruction->opcode() == HloOpcode::kConstant &&
-        ShapeUtil::IsEffectiveScalar(instruction->shape())) {
-      auto elem_idx = IndexUtil::LinearIndexToMultidimensionalIndex(
-          instruction->shape(), /*linear_index=*/0);
-      StrAppend(&label, " = {",
-                LiteralUtil::GetAsString(instruction->literal(), elem_idx),
-                "}");
-    }
-
-    if (show_addresses) {
-      Appendf(&label, "<br/>[%p]", instruction.get());
-    }
-    if (show_layouts && LayoutUtil::HasLayout(instruction->shape())) {
-      string layout_string;
-      if (ShapeUtil::IsTuple(instruction->shape())) {
-        // For tuples, emit the full shape because the layout of a tuple is not
-        // represented in a single Layout field.
-        layout_string = ShapeUtil::HumanStringWithLayout(instruction->shape());
-      } else {
-        layout_string =
-            Join(instruction->shape().layout().minor_to_major(), ",");
-      }
-      StrAppend(&label, "<br/>layout={", layout_string, "}");
-    }
-    if (hlo_execution_profile != nullptr) {
-      auto hlo_cycles_executed =
-          hlo_execution_profile->GetProfileResult(*instruction);
-      auto total_cycles_executed =
-          hlo_execution_profile->total_cycles_executed(*instruction->parent());
-      if (hlo_cycles_executed > 0 && total_cycles_executed > 0) {
-        Appendf(&label, "<br/>%% of cycles executed=%.2f",
-                (static_cast<double>(hlo_cycles_executed) /
-                 static_cast<double>(total_cycles_executed)) *
-                    100);
-      }
-    }
+  // Some instructions have giant tuples as their shapes, so truncate the HLO's
+  // shape to kMaxShapeLen characters.
+  constexpr int kMaxShapeLen = 64;
+  string instr_shape = ShapeUtil::HumanString(instr->shape());
+  if (instr_shape.length() > kMaxShapeLen) {
+    instr_shape =
+        StrCat(tensorflow::StringPiece(instr_shape).substr(0, kMaxShapeLen - 3),
+               "...");
+  }
+  lines.push_back(instr_shape);
 
-    Appendf(&graph_body, "%s [label=<%s>, shape=%s, %s];\n",
-            InstructionId(instruction.get()).c_str(), label.c_str(),
-            shape.c_str(), NodeColorAttributes(color).c_str());
-
-    // Create edges from the instruction's operands to the instruction.
-    int64 operand_number = 0;
-    for (auto* operand : instruction->operands()) {
-      string src;
-      if (operand->opcode() == HloOpcode::kParameter) {
-        // If operand is a parameter, then select the proper partition (port) in
-        // the unified parameter node.
-        src = param_node_name + ":" + InstructionId(operand);
-      } else {
-        src = InstructionId(operand);
-      }
-      Appendf(&graph_body, "%s -> %s", src.c_str(),
-              InstructionId(instruction.get()).c_str());
-      if (instruction->operand_count() > 1) {
-        Appendf(&graph_body, " [headlabel=\"%lld\",labeldistance=2]",
-                operand_number);
-      }
-      StrAppend(&graph_body, ";\n");
-      ++operand_number;
-    }
-
-    // Fusion nodes are handled specially because they contain nested
-    // expressions.
-    if (instruction->opcode() == HloOpcode::kFusion) {
-      string cluster_name =
-          StrCat("cluster_", InstructionId(instruction.get()));
-      StrAppend(&graph_body, "subgraph ", cluster_name, " {\n");
-      StrAppend(&graph_body,
-                "label=<<b>fused expression</b>>;\nstyle=\"rounded,filled\";\n"
-                "color=lightgrey;\n");
-      StrAppend(&graph_body, InstructionSequenceGraph(
-                                 instruction->fused_instructions(),
-                                 show_addresses, show_layouts,
-                                 intercomputation_edges, hlo_execution_profile),
-                "}\n");
-      string fusion_edge =
-          StrCat(InstructionId(instruction->fused_expression_root()), " -> ",
-                 InstructionId(instruction.get()),
-                 "  [ style = \"dotted\", arrowsize=0.0, ltail=", cluster_name,
-                 " ];\n");
-      intercomputation_edges->push_back(fusion_edge);
+  if (show_addresses_) {
+    lines.push_back(Printf("[%p]", instr));
+  }
+  if (show_layouts_ && LayoutUtil::HasLayout(instr->shape())) {
+    string layout_str;
+    if (ShapeUtil::IsTuple(instr->shape())) {
+      // For tuples, emit the full shape because the layout of a tuple is not
+      // represented in a single Layout field.
+      layout_str = ShapeUtil::HumanStringWithLayout(instr->shape());
     } else {
-      // Add a dotted edge between the instruction and any computations that the
-      // instruction calls.
-      for (const HloComputation* computation :
-           instruction->called_computations()) {
-        string cluster_name = StrCat("cluster_", ComputationId(computation));
-        string call_edge = Printf(
-            "%s -> %s [ style=dashed; ltail=%s ];\n",
-            InstructionId(computation->root_instruction()).c_str(),
-            InstructionId(instruction.get()).c_str(), cluster_name.c_str());
-        intercomputation_edges->push_back(call_edge);
-      }
+      layout_str = Join(instr->shape().layout().minor_to_major(), ",");
+    }
+    lines.push_back(Printf("layout={%s}", layout_str));
+  }
+  if (profile_ != nullptr) {
+    double hlo_cycles_executed = profile_->GetProfileResult(*instr);
+    double total_cycles_executed =
+        profile_->total_cycles_executed(*instr->parent());
+    if (hlo_cycles_executed > 0 && total_cycles_executed > 0) {
+      lines.push_back(
+          Printf("%% of cycles executed=%.2f",
+                 100 * hlo_cycles_executed / total_cycles_executed));
     }
   }
-  return graph_body;
+  return Join(lines, "<br/>");
 }
 
-// DOT graphs accept a stylesheet as a URL.  So naturally, an inline stylesheet
-// is a data URI!
-//
-// We don't perform any escaping on this string, so be careful not to use double
-// quotes inside.
-static const char* dot_stylesheet = R"(
-data:text/css,
-@import url(https://fonts.googleapis.com/css?family=Roboto:400,700);
-svg text {
-  font-family: 'Roboto';
-  font-size: 12px;
+void HloDotDumper::AddInstructionIncomingEdges(const HloInstruction* instr) {
+  auto add_edge = [&](const HloInstruction* from, const HloInstruction* to,
+                      int64 operand_num) {
+    // Fusion nodes' subcomputations are displayed inline, so if 'from' is a
+    // fusion node and the node's subcomputation is shown, we draw our edge
+    // starting at the fusion node's root instead of at the fusion node itself.
+    if (from->opcode() == HloOpcode::kFusion &&
+        filter_.ShowFusionSubcomputation(from)) {
+      from = from->fused_expression_root();
+    }
+    if (!filter_.Show(from) || from->opcode() == HloOpcode::kConstant) {
+      return;
+    }
+    edge_ids_.insert({{from, to}, next_edge_id_++});
+
+    string edge_label;
+    if (instr->operand_count() > 1) {
+      edge_label = Printf(R"( headlabel="%lld", labeldistance=2)", operand_num);
+    }
+    const char* kEdgeFmt = R"(%s -> %s [tooltip="%s -> %s" %s];)";
+    edges_.push_back(Printf(kEdgeFmt, InstructionId(from), InstructionId(to),
+                            from->name(), to->name(), edge_label));
+  };
+
+  // Add edges from instr's operands to instr.  Parameters within fusion
+  // expressions are handled specially -- we draw an edge from the corresponding
+  // operand on the fusion node itself to the parameter.
+  if (instr->opcode() == HloOpcode::kParameter && instr->IsFused()) {
+    const HloInstruction* fusion = instr->fusion_instruction();
+    add_edge(fusion->operand(instr->parameter_number()), instr,
+             /*operand_num=*/0);
+  } else {
+    for (int64 i = 0; i < instr->operand_count(); ++i) {
+      add_edge(instr->operand(i), instr, i);
+    }
+  }
 }
-)";
 
-string ComputationToDotGraph(const HloComputation& computation,
-                             const string& label, bool show_addresses,
-                             bool show_layouts,
-                             const HloExecutionProfile* hlo_execution_profile) {
-  string graph_label = StrCat(label, "<br/>", computation.name());
-  if (hlo_execution_profile != nullptr) {
-    auto cycles = hlo_execution_profile->total_cycles_executed(computation);
-    Appendf(&graph_label, "<br/>total cycles = %lld (%s)", cycles,
-            tensorflow::strings::HumanReadableNum(cycles).c_str());
-  }
-  string graph = Printf(
-      R"(digraph G {
-rankdir=TB;
-compound=true;
-label=<<b>%s</b>>;
-labelloc=t;
-stylesheet="%s"
-)",
-      graph_label.c_str(), dot_stylesheet);
-
-  // Emit embedded computations as subgraph clusters.
-  std::vector<string> intercomputation_edges;
-  for (auto embedded : computation.MakeEmbeddedComputationsList()) {
-    string graph_body = InstructionSequenceGraph(
-        embedded->instructions(), show_addresses, show_layouts,
-        &intercomputation_edges, hlo_execution_profile);
-    Appendf(&graph,
-            "subgraph cluster_%s "
-            "{\nstyle=rounded;label=<<b>%s</b>>;labelloc=t;\n%s}\n",
-            ComputationId(embedded).c_str(), embedded->name().c_str(),
-            graph_body.c_str());
-  }
-  StrAppend(&graph,
-            InstructionSequenceGraph(computation.instructions(), show_addresses,
-                                     show_layouts, &intercomputation_edges,
-                                     hlo_execution_profile));
-
-  // Edges between computations (subgraph clusters) must be emitted last for the
-  // graph to be rendered properly for some reason.
-  StrAppend(&graph, Join(intercomputation_edges, "\n"), "}\n");
-
-  return graph;
+string HloDotDumper::GetInstructionTrivialComputationStr(
+    const HloInstruction* instr) {
+  // called_computations() on a fusion node "inherits" any called computations
+  // of the fused root, which isn't what we want.  Just ignore fusion nodes
+  // here; they're handled separately.
+  if (instr->opcode() == HloOpcode::kFusion) {
+    return "";
+  }
+
+  std::vector<string> lines;
+  for (int64 i = 0; i < instr->called_computations().size(); ++i) {
+    optional<string> computation_type =
+        MatchTrivialComputation(instr->called_computations()[i]);
+    if (!computation_type) {
+      continue;
+    }
+    if (instr->called_computations().size() == 1) {
+      lines.push_back(Printf("Subcomputation: <b>%s</b>",
+                             HtmlLikeStringSanitize(*computation_type)));
+    } else {
+      lines.push_back(Printf("Subcomputation %lld: <b>%s</b>", i,
+                             HtmlLikeStringSanitize(*computation_type)));
+    }
+  }
+  return Join(lines, "<br/>");
 }
 
 tensorflow::mutex& RendererMutex() {
@@ -508,10 +956,9 @@ namespace {
 
 class FileGraphRenderer : public GraphRendererInterface {
  public:
-  string RenderGraph(const string& graph, GraphKind graph_kind) override {
+  string RenderGraph(const string& graph, GraphKind graph_kind,
+                     const DebugOptions& debug_options) override {
     static std::atomic<int> output_num(0);
-    legacy_flags::HloGraphDumperFlags* flags =
-        legacy_flags::GetHloGraphDumperFlags();
     string file_extension;
     switch (graph_kind) {
       case DOT_GRAPH:
@@ -522,7 +969,7 @@ class FileGraphRenderer : public GraphRendererInterface {
         break;
     }
     string path =
-        JoinPath(flags->xla_hlo_dump_graph_path,
+        JoinPath(debug_options.xla_hlo_graph_path(),
                  StrCat("hlo_graph_", output_num++, ".XXXXXX", file_extension));
     auto status = Status::OK();
     int fd = mkstemps(&path[0], file_extension.length());
@@ -543,18 +990,118 @@ class FileGraphRenderer : public GraphRendererInterface {
   }
 };
 
+// Gets a NodeFilter that includes roughly all instructions whose distance from
+// root is <= radius.
+NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) {
+  // First, find the neighborhood of nodes with distance from root <= radius.
+  // These nodes are our initial set of "normal" nodes.
+  std::unordered_map<const HloInstruction*, NodeFilterResult> nodes;
+  std::deque<std::pair<const HloInstruction*, /*depth*/ int64>> worklist;
+  worklist.push_back({root, 0});
+  while (!worklist.empty()) {
+    const HloInstruction* instr;
+    int64 depth;
+    std::tie(instr, depth) = worklist.front();
+    worklist.pop_front();
+
+    nodes[instr] = kNormalNode;
+    if (depth == radius) {
+      continue;
+    }
+
+    // Traverse into instr's operands.
+    //
+    // Don't traverse into tuples' operands unless the tuple is the root.
+    // Usually a tuple is the bottommost node in the graph, and so its operands
+    // are not interesting to the graph at hand.
+    if (instr == root || instr->opcode() != HloOpcode::kTuple) {
+      for (const HloInstruction* operand : instr->operands()) {
+        if (!nodes.count(operand)) {
+          worklist.push_back({operand, depth + 1});
+        }
+      }
+    }
+
+    // Traverse into instr's users, unless:
+    //
+    //  - there are a ton of them, in which case they're probably not
+    //    interesting (and anyway, rendering them all would make the graph
+    //    unreadable), or
+    //  - instr is a constant, in which case its users are probably not
+    //    interesting.
+    if (instr->opcode() == HloOpcode::kConstant) {
+      continue;
+    }
+    constexpr int kMaxUsersToRender = 16;
+    if (instr->user_count() > kMaxUsersToRender) {
+      // If we're going to skip this node's users, style it as such.
+      nodes[instr] = kSomeUsersOmitted;
+      continue;
+    }
+    for (const HloInstruction* user : instr->users()) {
+      if (!nodes.count(user)) {
+        worklist.push_back({user, depth + 1});
+      }
+    }
+  }
+
+  auto is_displayed = [&](const HloInstruction* instr) {
+    // Constants are displayed inline with their users; they're never omitted.
+    return nodes.count(instr) > 0 || instr->opcode() == HloOpcode::kConstant;
+  };
+
+  // Make a second pass over 'nodes' to fix up the NodeFilterResults now that we
+  // know which nodes will be included in the graph.
+  for (auto& kv : nodes) {
+    const HloInstruction* instr = kv.first;
+    NodeFilterResult& filter_result = kv.second;
+    const auto& operands = instr->operands();
+
+    if (std::any_of(operands.begin(), operands.end(), is_displayed) &&
+        !std::all_of(operands.begin(), operands.end(), is_displayed)) {
+      // Mark nodes with some operands omitted appropriately.
+      filter_result = kSomeOperandsOmitted;
+    } else if (!operands.empty() &&
+               std::none_of(operands.begin(), operands.end(), is_displayed)) {
+      // Mark nodes with *all* operands omitted appropriately.
+      filter_result = kOmitNodeOperands;
+    }
+
+    // Promote nodes with type kSomeUsersOmitted to kNormalNode if all of their
+    // users made it into the graph.
+    if (filter_result == kSomeUsersOmitted &&
+        std::all_of(instr->users().begin(), instr->users().end(),
+                    is_displayed)) {
+      filter_result = kNormalNode;
+    }
+  }
+
+  // Highlight the root node.
+  nodes[root] = kHighlightNode;
+
+  return NodeFilter([=](const HloInstruction* instr) {
+    auto it = nodes.find(instr);
+    if (it != nodes.end()) {
+      return it->second;
+    }
+    // Show all nodes in subcomputations.
+    if (instr->parent() != root->parent()) {
+      return kNormalNode;
+    }
+    return kHideNode;
+  });
+}
+
 XLA_REGISTER_GRAPH_RENDERER(FileGraphRenderer, 0);
 
 }  // namespace
 
 string DumpGraph(const HloComputation& computation, const string& label,
-                 bool show_addresses, bool show_layouts,
+                 const DebugOptions& debug_options,
                  const HloExecutionProfile* hlo_execution_profile) {
   string graph;
   string graph_url;
-  legacy_flags::HloGraphDumperFlags* flags =
-      legacy_flags::GetHloGraphDumperFlags();
-  if (flags->xla_hlo_dump_as_graphdef) {
+  if (debug_options.xla_hlo_dump_as_graphdef()) {
     HloTfGraphBuilder builder;
     TF_CHECK_OK(builder.AddComputation(computation));
     CHECK(tensorflow::protobuf::TextFormat::PrintToString(builder.GetGraphDef(),
@@ -563,18 +1110,37 @@ string DumpGraph(const HloComputation& computation, const string& label,
     // renderers support rendering GraphDefs. Always dump GraphDefs to files
     // for now.
     graph_url = FileGraphRenderer().RenderGraph(
-        graph, GraphRendererInterface::TF_GRAPHDEF);
+        graph, GraphRendererInterface::TF_GRAPHDEF, debug_options);
   } else {
-    graph = ComputationToDotGraph(computation, label, show_addresses,
-                                  show_layouts, hlo_execution_profile);
+    graph =
+        HloDotDumper(&computation, label,
+                     /*show_addresses=*/debug_options.xla_hlo_graph_addresses(),
+                     /*show_layouts=*/debug_options.xla_hlo_graph_layout(),
+                     hlo_execution_profile, NodeFilter())
+            .Dump();
     graph_url = GetGraphRenderer()->RenderGraph(
-        graph, GraphRendererInterface::DOT_GRAPH);
+        graph, GraphRendererInterface::DOT_GRAPH, debug_options);
   }
   LOG(INFO) << "computation " << computation.name() << " [" << label
             << "]: " << graph_url;
   return graph_url;
 }
 
+string DumpNeighborhoodAround(const HloInstruction& node, int radius) {
+  auto debug_options = node.GetModule()->config().debug_options();
+  string label =
+      StrCat("Neighborhood of ", radius, " nodes around ", node.name());
+  NodeFilter filter = MakeNodeFilter(&node, radius);
+  string graph =
+      HloDotDumper(node.parent(), label,
+                   /*show_addresses=*/debug_options.xla_hlo_graph_addresses(),
+                   /*show_layouts=*/debug_options.xla_hlo_graph_layout(),
+                   /*profile=*/nullptr, filter)
+          .Dump();
+  return GetGraphRenderer()->RenderGraph(
+      graph, GraphRendererInterface::DOT_GRAPH, debug_options);
+}
+
 void DumpText(const HloModule& module, const string& label,
               const string& directory_path, bool do_prefix) {
   Env* env = Env::Default();
@@ -584,6 +1150,30 @@ void DumpText(const HloModule& module, const string& label,
       do_prefix ? StrCat(prefix, "-", label, ".txt") : StrCat(label, ".txt");
   string path = JoinPath(directory_path, filename);
   TF_CHECK_OK(WriteStringToFile(env, path, module.ToString()));
+  LOG(INFO) << "dumping module '" << module.name() << "' to " << path;
+}
+
+string MaybeDumpHloModule(const HloModule& module, const string& label,
+                          const HloExecutionProfile* profile) {
+  VLOG(2) << "MaybeDumpHloModule called on module " << module.name();
+  string graph_url;
+  const DebugOptions& debug_options = module.config().debug_options();
+  if (!debug_options.xla_generate_hlo_graph().empty() &&
+      RE2::PartialMatch(module.name(),
+                        debug_options.xla_generate_hlo_graph())) {
+    graph_url =
+        DumpGraph(*module.entry_computation(), label, debug_options, profile);
+  }
+  if (!debug_options.xla_log_hlo_text().empty() &&
+      RE2::PartialMatch(module.name(), debug_options.xla_log_hlo_text())) {
+    LOG(INFO) << "HLO for module " << module.name();
+    LOG(INFO) << "Label: " << label;
+    XLA_LOG_LINES(2, module.ToString());
+  }
+  if (!debug_options.xla_generate_hlo_text_to().empty()) {
+    DumpText(module, label, debug_options.xla_generate_hlo_text_to());
+  }
+  return graph_url;
 }
 
 }  // namespace hlo_graph_dumper
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
index 8ed50c38473a6f6dd36603e155285e855ff0c5be..0100d50c050a30a2464b912fcf3688426618513e 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla.pb.h"
 
 namespace xla {
 namespace hlo_graph_dumper {
@@ -38,16 +39,31 @@ class GraphRendererInterface {
 
   // Renders a DOT graph, returning a description of the rendered output
   // (e.g., a URL)
-  virtual string RenderGraph(const string& graph, GraphKind graph_kind) = 0;
+  virtual string RenderGraph(const string& graph, GraphKind graph_kind,
+                             const DebugOptions& debug_options) = 0;
 };
 
+// Dump the given HLO module if a dump is requested in its debug options. Based
+// on the debug options, either a graph dump, a text dump or both may be
+// generated. If a graph dump is generated, the description (e.g. an URL) is
+// returned; otherwise an empty string is returned.
+string MaybeDumpHloModule(const HloModule& module, const string& label,
+                          const HloExecutionProfile* profile = nullptr);
+
 // Dumps a graph of the computation and returns a description of the rendered
 // graph (e.g., a URL) based on the renderer. The "best" renderer in the
 // registry is used.
 string DumpGraph(const HloComputation& computation, const string& label,
-                 bool show_addresses, bool show_layouts,
+                 const DebugOptions& debug_options,
                  const HloExecutionProfile* hlo_execution_profile = nullptr);
 
+// Like DumpGraph, but renders only nodes "near" the given node in the graph.
+//
+// The number of nodes dumped is controlled by the radius parameter, which
+// (roughly) corresponds to the max distance a node may be from the primary node
+// before it's omitted from the graph.
+string DumpNeighborhoodAround(const HloInstruction& node, int radius);
+
 // Dumps the HloModule::ToString() as a file into the provided directory path
 // suffixed with the provided label.
 //
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index ea813c98743f7c34a891a3b648a2818f5dada8ec..c11fea09d145815d2142e634d93d44dee6601edc 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -122,6 +122,7 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
     case HloOpcode::kBitcast:
     case HloOpcode::kCeil:
     case HloOpcode::kCopy:
+    case HloOpcode::kCos:
     case HloOpcode::kExp:
     case HloOpcode::kFloor:
     case HloOpcode::kIsFinite:
@@ -129,6 +130,7 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
     case HloOpcode::kLogicalNot:
     case HloOpcode::kNegate:
     case HloOpcode::kSign:
+    case HloOpcode::kSin:
     case HloOpcode::kSort:
     case HloOpcode::kTanh:
       break;
@@ -226,6 +228,19 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
   return instruction;
 }
 
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateReducePrecision(const Shape& shape,
+                                      HloInstruction* operand,
+                                      const int exponent_bits,
+                                      const int mantissa_bits) {
+  auto instruction =
+      WrapUnique(new HloInstruction(HloOpcode::kReducePrecision, shape));
+  instruction->AppendOperand(operand);
+  instruction->exponent_bits_ = exponent_bits;
+  instruction->mantissa_bits_ = mantissa_bits;
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateCrossReplicaSum(const Shape& shape,
                                       HloInstruction* operand) {
@@ -299,6 +314,12 @@ HloInstruction::CreateCrossReplicaSum(const Shape& shape,
   instruction->slice_starts_.assign(start_indices.begin(), start_indices.end());
   instruction->slice_limits_.assign(limit_indices.begin(), limit_indices.end());
   instruction->slice_strides_.assign(strides.begin(), strides.end());
+  // For backward compatibility with old serialized computations: if there are
+  // no strides, assume all strides are 1.
+  // TODO(b/63317920): remove this code.
+  if (instruction->slice_strides_.empty()) {
+    instruction->slice_strides_ = std::vector<int64>(start_indices.size(), 1LL);
+  }
   return instruction;
 }
 
@@ -371,6 +392,40 @@ HloInstruction::CreateDynamicUpdateSlice(const Shape& shape,
   return instruction;
 }
 
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateBatchNormTraining(const Shape& shape,
+                                        HloInstruction* operand,
+                                        HloInstruction* scale,
+                                        HloInstruction* offset, float epsilon,
+                                        int64 feature_index) {
+  auto instruction =
+      WrapUnique(new HloInstruction(HloOpcode::kBatchNormTraining, shape));
+  instruction->AppendOperand(operand);
+  instruction->AppendOperand(scale);
+  instruction->AppendOperand(offset);
+  instruction->epsilon_ = epsilon;
+  instruction->feature_index_ = feature_index;
+  return instruction;
+}
+
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateBatchNormGrad(const Shape& shape, HloInstruction* operand,
+                                    HloInstruction* scale, HloInstruction* mean,
+                                    HloInstruction* variance,
+                                    HloInstruction* grad_output, float epsilon,
+                                    int64 feature_index) {
+  auto instruction =
+      WrapUnique(new HloInstruction(HloOpcode::kBatchNormGrad, shape));
+  instruction->AppendOperand(operand);
+  instruction->AppendOperand(scale);
+  instruction->AppendOperand(mean);
+  instruction->AppendOperand(variance);
+  instruction->AppendOperand(grad_output);
+  instruction->epsilon_ = epsilon;
+  instruction->feature_index_ = feature_index;
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateSelectAndScatter(
     const Shape& shape, HloInstruction* operand, HloComputation* select,
@@ -505,19 +560,20 @@ HloInstruction* HloInstruction::CloneAndFuseInternal(
     HloInstruction* instruction_to_fuse) {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
   CHECK(instruction_to_fuse->IsFusable());
-
+  if (GetModule()) {
+    XLA_VLOG_LINES(1, GetModule()->ToString());
+  }
   HloInstruction* clone = nullptr;
-  if (fused_instructions_computation_ == nullptr) {
+  if (called_computations_.empty()) {
     // New fusion instruction.
     auto builder = HloComputation::Builder("fused_computation", true);
     builder.AddInstruction(instruction_to_fuse->Clone(/*suffix=*/""));
-    fused_instructions_computation_ = builder.Build();
+    called_computations_.push_back(
+        CHECK_NOTNULL(GetModule())->AddEmbeddedComputation(builder.Build()));
     clone = fused_expression_root();
     clone->parent_fusion_instruction_ = this;
   } else {
-    CHECK(fused_instructions_computation_ != nullptr &&
-          fused_instructions_computation_->IsFusionComputation());
-    clone = fused_instructions_computation_->AddInstruction(
+    clone = fused_instructions_computation()->AddInstruction(
         instruction_to_fuse->Clone(/*suffix=*/""));
     clone->parent_fusion_instruction_ = this;
     // instruction_to_fuse is necessarily an operand of the fusion instruction.
@@ -528,7 +584,7 @@ HloInstruction* HloInstruction::CloneAndFuseInternal(
     CHECK(std::find(operands_.begin(), operands_.end(), instruction_to_fuse) !=
           operands_.end());
     const std::vector<HloInstruction*>& fused_parameters_ =
-        fused_instructions_computation_->parameter_instructions();
+        fused_instructions_computation()->parameter_instructions();
     for (int64 operand_num = 0; operand_num < operand_count(); ++operand_num) {
       if (instruction_to_fuse == operands_[operand_num]) {
         // replace the fused parameter instruction's uses with the clone.
@@ -538,7 +594,7 @@ HloInstruction* HloInstruction::CloneAndFuseInternal(
         // Remove the corresponding fused parameter and operand from their
         // respective vectors.
         TF_CHECK_OK(
-            fused_instructions_computation_->RemoveParameter(operand_num));
+            fused_instructions_computation()->RemoveParameter(operand_num));
         operands_.erase(operands_.begin() + operand_num);
         break;
       }
@@ -550,7 +606,7 @@ HloInstruction* HloInstruction::CloneAndFuseInternal(
 
   // Reread the parameters in the computation.
   const std::vector<HloInstruction*>& fused_parameters_ =
-      fused_instructions_computation_->parameter_instructions();
+      fused_instructions_computation()->parameter_instructions();
 
   // Add each operand of the clone as an operand of the fusion instruction. A
   // complication is that some clone operands may already be operands of the
@@ -583,7 +639,7 @@ HloInstruction* HloInstruction::CloneAndFuseInternal(
           CreateParameter(param_no, operand->shape(), param_name);
 
       param_instruction->parent_fusion_instruction_ = this;
-      fused_param = fused_instructions_computation_->AddParameter(
+      fused_param = fused_instructions_computation()->AddParameter(
           std::move(param_instruction));
       AppendOperand(operand);
     }
@@ -597,7 +653,6 @@ HloInstruction* HloInstruction::CloneAndFuseInternal(
       called_computations_.push_back(computation);
     }
   }
-
   return clone;
 }
 
@@ -608,17 +663,15 @@ RandomDistribution HloInstruction::random_distribution() const {
 
 void HloInstruction::CheckFusionInstruction() const {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
-  CHECK(fused_instructions_computation_ != nullptr &&
-        fused_instructions_computation_->IsFusionComputation());
 
   const std::list<std::unique_ptr<HloInstruction>>& fused_instructions_ =
-      fused_instructions_computation_->instructions();
+      fused_instructions_computation()->instructions();
   // All instructions owned by this fusion instruction must be fused, and the
   // parent fusion instruction of the fused instructions must be 'this'.
   for (auto& instruction : fused_instructions_) {
     CHECK(instruction->IsFused());
     CHECK_EQ(this, instruction->fusion_instruction());
-    CHECK_EQ(fused_instructions_computation_.get(), instruction->parent())
+    CHECK_EQ(fused_instructions_computation(), instruction->parent())
         << instruction->ToString();
   }
 
@@ -730,6 +783,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kBitcast:
     case HloOpcode::kCeil:
     case HloOpcode::kCopy:
+    case HloOpcode::kCos:
     case HloOpcode::kExp:
     case HloOpcode::kIsFinite:
     case HloOpcode::kFloor:
@@ -737,6 +791,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kLogicalNot:
     case HloOpcode::kNegate:
     case HloOpcode::kSign:
+    case HloOpcode::kSin:
     case HloOpcode::kSort:
     case HloOpcode::kTanh:
       CHECK_EQ(new_operands.size(), 1);
@@ -780,6 +835,10 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kConvert:
       CHECK_EQ(new_operands.size(), 1);
       return CreateConvert(shape, new_operands[0]);
+    case HloOpcode::kReducePrecision:
+      CHECK_EQ(new_operands.size(), 1);
+      return CreateReducePrecision(shape, new_operands[0], exponent_bits_,
+                                   mantissa_bits_);
     case HloOpcode::kConvolution:
       CHECK_EQ(new_operands.size(), 2);
       return CreateConvolve(shape, new_operands[0], new_operands[1], *window_,
@@ -838,18 +897,31 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       return CreateWhile(shape, while_condition(), while_body(),
                          new_operands[0]);
     case HloOpcode::kConstant:
-      return CreateConstant(LiteralUtil::CloneToUnique(*literal_));
+      return CreateConstant(literal_->CloneToUnique());
     case HloOpcode::kFusion:
       return CloneFusionWithNewOperands(shape, new_operands);
     case HloOpcode::kParameter:
       return CreateParameter(parameter_number_, shape, parameter_name_);
-    // Unsupported ops for cloning.
+    case HloOpcode::kBatchNormTraining:
+      CHECK_EQ(new_operands.size(), 3);
+      return CreateBatchNormTraining(shape, new_operands[0], new_operands[1],
+                                     new_operands[2], epsilon(),
+                                     feature_index());
+    case HloOpcode::kInfeed:
+      CHECK_EQ(new_operands.size(), 0);
+      return CreateInfeed(shape, infeed_config());
+    case HloOpcode::kOutfeed:
+      CHECK_EQ(new_operands.size(), 1);
+      return CreateOutfeed(outfeed_shape_, new_operands[0], outfeed_config());
+    case HloOpcode::kBatchNormGrad:
+      CHECK_EQ(new_operands.size(), 5);
+      return CreateBatchNormGrad(shape, new_operands[0], new_operands[1],
+                                 new_operands[2], new_operands[3],
+                                 new_operands[4], epsilon(), feature_index());
     case HloOpcode::kRecv:
     case HloOpcode::kSend:
     case HloOpcode::kUpdate:
     case HloOpcode::kIndex:
-    case HloOpcode::kInfeed:
-    case HloOpcode::kOutfeed:
     case HloOpcode::kTrace:
       LOG(FATAL) << "Not yet implemented, clone: " << HloOpcodeString(opcode_);
   }
@@ -902,8 +974,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneFusionWithNewOperands(
     const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands) {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
   CHECK(parent() != nullptr);
-  CHECK(fused_instructions_computation_ != nullptr &&
-        fused_instructions_computation_->IsFusionComputation());
 
   auto new_instruction =
       WrapUnique(new HloInstruction(HloOpcode::kFusion, shape));
@@ -918,9 +988,9 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneFusionWithNewOperands(
   // fused instructions.
   std::vector<HloInstruction*> new_fused_parameters;
   const std::vector<HloInstruction*>& fused_parameters_ =
-      fused_instructions_computation_->parameter_instructions();
+      fused_instructions_computation()->parameter_instructions();
   const std::list<std::unique_ptr<HloInstruction>>& fused_instructions_ =
-      fused_instructions_computation_->instructions();
+      fused_instructions_computation()->instructions();
 
   for (HloInstruction* old_fused_parameter : fused_parameters_) {
     new_fused_instructions.push_back(old_fused_parameter->Clone());
@@ -954,7 +1024,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneFusionWithNewOperands(
   }
   new_instruction->fusion_kind_ = fusion_kind_;
   auto computation_builder = HloComputation::Builder(
-      fused_instructions_computation_->name() + ".clone", true);
+      fused_instructions_computation()->name() + ".clone", true);
   // We iterated the fusion instructions in reverse post order which means
   // that we must reverse our new list of fusion instructions.
   for (auto new_fused_instruction_iter = new_fused_instructions.rbegin();
@@ -963,8 +1033,10 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneFusionWithNewOperands(
     computation_builder.AddInstruction(std::move(*new_fused_instruction_iter));
   }
   auto fused_root_ = fused_expression_root();
-  new_instruction->fused_instructions_computation_ =
-      computation_builder.Build(FindOrDie(old_to_new, fused_root_));
+  new_instruction->called_computations_.push_back(
+      CHECK_NOTNULL(GetModule())
+          ->AddEmbeddedComputation(
+              computation_builder.Build(FindOrDie(old_to_new, fused_root_))));
   new_instruction->set_parent(parent());
   new_instruction->CheckFusionInstruction();
   return new_instruction;
@@ -1041,7 +1113,7 @@ Status HloInstruction::RemoveControlDependencyTo(HloInstruction* instruction) {
   auto pred_it = std::find(instruction->control_predecessors_.begin(),
                            instruction->control_predecessors_.end(), this);
   TF_RET_CHECK(pred_it != instruction->control_predecessors_.end());
-  instruction->control_predecessors_.erase(succ_it);
+  instruction->control_predecessors_.erase(pred_it);
 
   return Status::OK();
 }
@@ -1099,6 +1171,7 @@ bool HloInstruction::Identical(
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
     case HloOpcode::kCopy:
+    case HloOpcode::kCos:
     case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kDivide:
     case HloOpcode::kDot:
@@ -1123,6 +1196,7 @@ bool HloInstruction::Identical(
     case HloOpcode::kRemainder:
     case HloOpcode::kSelect:
     case HloOpcode::kSign:
+    case HloOpcode::kSin:
     case HloOpcode::kSubtract:
     case HloOpcode::kTanh:
     case HloOpcode::kTuple:
@@ -1141,15 +1215,25 @@ bool HloInstruction::Identical(
              // different HloComputations.
              ShapeUtil::Compatible(shape(), other.shape());
 
+    case HloOpcode::kBatchNormTraining:
+    case HloOpcode::kBatchNormGrad:
+      return feature_index() == other.feature_index() &&
+             epsilon() == other.epsilon();
+
     // A constant is defined by the value in the literal.
     case HloOpcode::kConstant:
-      return LiteralUtil::Equal(literal(), other.literal());
+      return literal().Equal(other.literal());
 
     // A convert result is determined by the primitive type that the operand is
     // converted into.
     case HloOpcode::kConvert:
       return shape().element_type() == other.shape().element_type();
 
+    // A reduce-precision operation is determined by the bit sizes.
+    case HloOpcode::kReducePrecision:
+      return exponent_bits() == other.exponent_bits() &&
+             mantissa_bits() == other.mantissa_bits();
+
     // Convolution has a window and dimensions.
     case HloOpcode::kConvolution:
       return protobuf_util::ProtobufEquals(window(), other.window()) &&
@@ -1438,10 +1522,10 @@ string HloInstruction::ToString(bool compact_operands,
   string operands;
   if (opcode() == HloOpcode::kConstant) {
     // For constants, show the actual value in place of an empty operand list.
-    if (ShapeUtil::ElementsIn(shape()) <= 10) {
-      // LiteralUtil::ToString emits multidimensional arrays over multiple
+    if (!ShapeUtil::IsTuple(shape()) && ShapeUtil::ElementsIn(shape()) <= 10) {
+      // Literal::ToString emits multidimensional arrays over multiple
       // lines. Compact this into one line by stripping out white space.
-      string tmp = LiteralUtil::ToString(literal());
+      string tmp = literal().ToString();
       std::replace(tmp.begin(), tmp.end(), '\n', ' ');
       std::vector<string> v = tensorflow::str_util::Split(tmp, ' ');
       bool first = true;
@@ -1455,7 +1539,7 @@ string HloInstruction::ToString(bool compact_operands,
         first = false;
       }
     } else {
-      // Do not show large constants.
+      // Do not show large constants or tuples.
       operands = "{...}";
     }
   } else if (opcode() == HloOpcode::kParameter) {
@@ -1565,7 +1649,7 @@ HloInstructionProto HloInstruction::ToProto() const {
     case HloOpcode::kFusion: {
       HloComputationProto* proto_fused_computation =
           proto.mutable_fused_instructions_computation();
-      proto_fused_computation->set_name(FullyQualifiedName());
+      proto_fused_computation->set_name(name());
 
       // Fill in fused instructions. Note that fused_instructions() returns in
       // reverse post-order (i.e. root first), so we reverse to get post-order.
@@ -1629,6 +1713,8 @@ string HloInstruction::ToCategory() const {
       case FusionKind::kConvBackwardFilter:
       case FusionKind::kConvBackwardInput:
         return "convolution fusion";
+      case FusionKind::kCustom:
+        return "custom fusion";
     }
   }
 
@@ -1639,14 +1725,6 @@ string HloInstruction::ToCategory() const {
   return HloOpcodeString(opcode());
 }
 
-string HloInstruction::FullyQualifiedName() const {
-  if (IsFused()) {
-    return StrCat(fusion_instruction()->parent()->name(),
-                  "::", fusion_instruction()->name(), "::", name_);
-  }
-  return StrCat(parent_->name(), "::", name_);
-}
-
 HloInstruction* HloInstruction::tracing() const { return trace_instruction_; }
 
 void HloInstruction::set_tracing(HloInstruction* trace_instruction) {
@@ -1689,7 +1767,10 @@ bool HloInstruction::IsFusable() const {
 
 HloComputation* HloInstruction::fused_instructions_computation() const {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
-  return fused_instructions_computation_.get();
+  CHECK(!called_computations_.empty());
+  auto* fused_instructions_computation = called_computations_.front();
+  CHECK(fused_instructions_computation->IsFusionComputation());
+  return fused_instructions_computation;
 }
 
 HloInstruction* HloInstruction::fusion_instruction() const {
@@ -1699,32 +1780,24 @@ HloInstruction* HloInstruction::fusion_instruction() const {
 
 HloInstruction* HloInstruction::fused_expression_root() const {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
-  CHECK(fused_instructions_computation_ != nullptr &&
-        fused_instructions_computation_->IsFusionComputation());
-  return fused_instructions_computation_->root_instruction();
+  return fused_instructions_computation()->root_instruction();
 }
 
 HloInstruction* HloInstruction::fused_parameter(int64 parameter_number) const {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
-  CHECK(fused_instructions_computation_ != nullptr &&
-        fused_instructions_computation_->IsFusionComputation());
-  return fused_instructions_computation_->parameter_instruction(
+  return fused_instructions_computation()->parameter_instruction(
       parameter_number);
 }
 
 const std::vector<HloInstruction*>& HloInstruction::fused_parameters() const {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
-  CHECK(fused_instructions_computation_ != nullptr &&
-        fused_instructions_computation_->IsFusionComputation());
-  return fused_instructions_computation_->parameter_instructions();
+  return fused_instructions_computation()->parameter_instructions();
 }
 
 const std::list<std::unique_ptr<HloInstruction>>&
 HloInstruction::fused_instructions() const {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
-  CHECK(fused_instructions_computation_ != nullptr &&
-        fused_instructions_computation_->IsFusionComputation());
-  return fused_instructions_computation_->instructions();
+  return fused_instructions_computation()->instructions();
 }
 
 HloInstruction::HloInstruction(HloOpcode opcode, const Shape& shape)
@@ -1736,6 +1809,10 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) {
   switch (opcode_) {
     case HloOpcode::kAbs:
       return visitor->HandleAbs(this, operands_[0]);
+    case HloOpcode::kBatchNormTraining:
+      return visitor->HandleBatchNormTraining(this);
+    case HloOpcode::kBatchNormGrad:
+      return visitor->HandleBatchNormGrad(this);
     case HloOpcode::kSign:
       return visitor->HandleSign(this, operands_[0]);
     case HloOpcode::kConstant:
@@ -1758,9 +1835,9 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) {
     case HloOpcode::kSubtract:
       return visitor->HandleSubtract(this, operands_[0], operands_[1]);
     case HloOpcode::kMaximum:
-      return visitor->HandleMaximum(this, operands_[0], operands_[1]);
+      return visitor->HandleMaximum(this);
     case HloOpcode::kMinimum:
-      return visitor->HandleMinimum(this, operands_[0], operands_[1]);
+      return visitor->HandleMinimum(this);
     case HloOpcode::kLogicalAnd:
       return visitor->HandleLogicalAnd(this, operands_[0], operands_[1]);
     case HloOpcode::kLogicalOr:
@@ -1768,9 +1845,9 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) {
     case HloOpcode::kConcatenate:
       return visitor->HandleConcatenate(this, operands_);
     case HloOpcode::kConvert:
-      return visitor->HandleConvert(this, operands_[0]);
+      return visitor->HandleConvert(this);
     case HloOpcode::kCopy:
-      return visitor->HandleCopy(this, operands_[0]);
+      return visitor->HandleCopy(this);
     case HloOpcode::kMultiply:
       return visitor->HandleMultiply(this, operands_[0], operands_[1]);
     case HloOpcode::kDot:
@@ -1814,6 +1891,10 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) {
       return visitor->HandleLog(this, operands_[0]);
     case HloOpcode::kTanh:
       return visitor->HandleTanh(this, operands_[0]);
+    case HloOpcode::kCos:
+      return visitor->HandleCos(this, operands_[0]);
+    case HloOpcode::kSin:
+      return visitor->HandleSin(this, operands_[0]);
     case HloOpcode::kIsFinite:
       return visitor->HandleIsFinite(this, operands_[0]);
     case HloOpcode::kLogicalNot:
@@ -1830,6 +1911,8 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) {
       return visitor->HandleTranspose(this);
     case HloOpcode::kReverse:
       return visitor->HandleReverse(this, operands_[0]);
+    case HloOpcode::kReducePrecision:
+      return visitor->HandleReducePrecision(this);
     case HloOpcode::kSlice:
       return visitor->HandleSlice(this, operands_[0]);
     case HloOpcode::kDynamicSlice:
@@ -1868,72 +1951,90 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) {
                        HloOpcodeString(opcode_).c_str());
 }
 
-Status HloInstruction::AcceptInternal(DfsHloVisitor* visitor,
-                                      const CompareFunction* operand_order,
-                                      bool ignore_control_predecessors) {
-  // Do not visit this HLO node again if it is already visited.
-  if (visitor->DidVisit(*this)) {
-    VLOG(3) << "Not visiting HLO " << name() << " as it was already visited.";
-    return Status::OK();
+static Status PushDFSChild(DfsHloVisitor* visitor,
+                           std::vector<HloInstruction*>* dfs_stack,
+                           HloInstruction* parent, HloInstruction* child) {
+  switch (visitor->GetVisitState(*child)) {
+    case DfsHloVisitor::kVisiting:
+      return FailedPrecondition(
+          "A cycle is detected while visiting instruction %s",
+          parent->ToString().c_str());
+
+    case DfsHloVisitor::kVisited:
+      VLOG(3) << "Not visiting HLO " << child->name()
+              << " as it was already visited.";
+      return Status::OK();
+
+    case DfsHloVisitor::kNotVisited:
+      dfs_stack->push_back(child);
+      return Status::OK();
   }
+}
 
-  // If the instruction is in the visiting state, it means a cycle.
-  if (visitor->IsVisiting(*this)) {
-    return FailedPrecondition(
-        "A cycle is detected while visiting instruction %s",
-        ToString().c_str());
-  }
-  visitor->SetVisiting(*this);
-
-  // Sort operands, if an ordering was provided. 'temp_sorted_operands' must
-  // live at this scope, since 'operands' will point to it if the operands are
-  // sorted.  The purpose of the 'operands' pointer is to avoid copying the
-  // operands in the common case where the operands are not sorted.
-  std::vector<HloInstruction*>* operands = &operands_;
-  std::vector<HloInstruction*> temp_sorted_operands;
-  if (operand_order != nullptr) {
-    temp_sorted_operands = operands_;
-    std::sort(temp_sorted_operands.begin(), temp_sorted_operands.end(),
-              *operand_order);
-    operands = &temp_sorted_operands;
-  }
-  for (HloInstruction* operand : *operands) {
-    VLOG(3) << "Going to visit HLO " << operand->name() << " as operand of HLO "
-            << name();
-    TF_RETURN_IF_ERROR(operand->AcceptInternal(visitor, operand_order,
-                                               ignore_control_predecessors));
-  }
-
-  if (!ignore_control_predecessors) {
-    // This uses the same pointer/vector sorting to avoid extra copies as above.
-    std::vector<HloInstruction*>* predecessors = &control_predecessors_;
-    std::vector<HloInstruction*> temp_sorted_predecessors;
-    if (operand_order != nullptr) {
-      temp_sorted_predecessors = control_predecessors_;
-      std::sort(temp_sorted_predecessors.begin(),
-                temp_sorted_predecessors.end(), *operand_order);
-      predecessors = &temp_sorted_predecessors;
+static Status PostOrderDFS(HloInstruction* root, DfsHloVisitor* visitor,
+                           const HloInstruction::CompareFunction* operand_order,
+                           bool ignore_control_predecessors) {
+  std::vector<HloInstruction*> dfs_stack;
+  dfs_stack.push_back(root);
+
+  do {
+    DCHECK(!dfs_stack.empty());
+
+    HloInstruction* current_node = dfs_stack.back();
+    DfsHloVisitor::VisitState visit_state =
+        visitor->GetVisitState(*current_node);
+    if (visit_state == DfsHloVisitor::kVisited) {
+      dfs_stack.pop_back();
+      VLOG(3) << "Not visiting HLO " << current_node->name()
+              << " as it was already visited.";
+      continue;
     }
-    for (HloInstruction* control_predecessor : *predecessors) {
-      VLOG(3) << "Going to visit HLO " << control_predecessor->name()
-              << " as a control predecessor of HLO " << name();
-      TF_RETURN_IF_ERROR(control_predecessor->AcceptInternal(
-          visitor, operand_order, ignore_control_predecessors));
+
+    if (visit_state == DfsHloVisitor::kVisiting) {
+      dfs_stack.pop_back();
+
+      TF_RETURN_IF_ERROR(visitor->Preprocess(current_node));
+      VLOG(2) << "Visiting HLO " << current_node->name();
+      TF_RETURN_IF_ERROR(current_node->Visit(visitor));
+      visitor->SetVisited(*current_node);
+      TF_RETURN_IF_ERROR(visitor->Postprocess(current_node));
+      continue;
+    }
+
+    visitor->SetVisiting(*current_node);
+
+    const size_t old_dfs_stack_size = dfs_stack.size();
+
+    for (HloInstruction* child : current_node->operands()) {
+      TF_RETURN_IF_ERROR(
+          PushDFSChild(visitor, &dfs_stack, current_node, child));
+    }
+
+    if (!ignore_control_predecessors) {
+      for (HloInstruction* child : current_node->control_predecessors()) {
+        TF_RETURN_IF_ERROR(
+            PushDFSChild(visitor, &dfs_stack, current_node, child));
+      }
     }
-  }
 
-  TF_RETURN_IF_ERROR(visitor->Preprocess(this));
-  VLOG(2) << "Visiting HLO " << name();
-  TF_RETURN_IF_ERROR(Visit(visitor));
-  visitor->SetVisited(*this);
-  return visitor->Postprocess(this);
+    if (operand_order != nullptr) {
+      std::sort(dfs_stack.begin() + old_dfs_stack_size, dfs_stack.end(),
+                *operand_order);
+    }
+
+    // This makes the traversal order the same as what you'd expect
+    // out of a recursive algorithm.
+    std::reverse(dfs_stack.begin() + old_dfs_stack_size, dfs_stack.end());
+  } while (!dfs_stack.empty());
+
+  return Status::OK();
 }
 
 Status HloInstruction::Accept(DfsHloVisitor* visitor, bool call_finish_visit,
                               bool ignore_control_predecessors) {
-  VLOG(2) << "HloInstruction::Accept(" << name() << ")";
+  VLOG(3) << "HloInstruction::Accept(" << name() << ")";
   TF_RETURN_IF_ERROR(
-      AcceptInternal(visitor, nullptr, ignore_control_predecessors));
+      PostOrderDFS(this, visitor, nullptr, ignore_control_predecessors));
   if (call_finish_visit) {
     TF_RETURN_IF_ERROR(visitor->FinishVisit(this));
   }
@@ -1944,11 +2045,14 @@ Status HloInstruction::AcceptWithOperandOrder(
     DfsHloVisitor* visitor, const CompareFunction& operand_order,
     bool call_finish_visit) {
   VLOG(2) << "HloInstruction::AcceptWithOperandOrder(" << name() << ")";
-  TF_RETURN_IF_ERROR(AcceptInternal(visitor, &operand_order,
-                                    /*ignore_control_predecessors=*/false));
+  TF_RETURN_IF_ERROR(PostOrderDFS(this, visitor, &operand_order,
+                                  /*ignore_control_predecessors=*/false));
   if (call_finish_visit) {
+    VLOG(3) << "HloInstruction::AcceptWithOperandOrder BEFORE FINISH VISIT";
     TF_RETURN_IF_ERROR(visitor->FinishVisit(this));
+    VLOG(3) << "HloInstruction::AcceptWithOperandOrder AFTER FINISH VISIT";
   }
+  VLOG(2) << "HloInstruction::AcceptWithOperandOrder EXIT";
   return Status::OK();
 }
 
@@ -2060,13 +2164,16 @@ bool HloInstruction::IsElementwise() const {
     case HloOpcode::kCeil:
     case HloOpcode::kConvert:
     case HloOpcode::kCopy:
+    case HloOpcode::kCos:
     case HloOpcode::kExp:
     case HloOpcode::kFloor:
     case HloOpcode::kIsFinite:
     case HloOpcode::kLog:
     case HloOpcode::kLogicalNot:
     case HloOpcode::kNegate:
+    case HloOpcode::kReducePrecision:
     case HloOpcode::kSign:
+    case HloOpcode::kSin:
     case HloOpcode::kTanh:
       return true;
 
@@ -2274,6 +2381,8 @@ string ToString(HloInstruction::FusionKind kind) {
       return "kConvBackwardFilter";
     case HloInstruction::FusionKind::kConvBackwardInput:
       return "kConvBackwardInput";
+    case HloInstruction::FusionKind::kCustom:
+      return "kCustom";
   }
 }
 
@@ -2345,7 +2454,13 @@ HloModule* HloInstruction::GetModule() const {
 }
 
 void HloInstruction::UniquifyName(NameUniquer* name_uniquer) {
+  string parent_str = parent() == nullptr ? "noparent" : parent()->name();
   name_ = name_uniquer->GetUniqueName(name_);
 }
 
+void HloInstruction::set_outer_dimension_partitions(
+    const std::vector<int64>& outer_dimension_partitions) {
+  outer_dimension_partitions_ = outer_dimension_partitions;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index c7cd729934b2a52d95b32b4ba5f5c84dc087cfd4..3c188ec83f3bdcec7c40835794d1694f883388a0 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -63,6 +63,9 @@ class HloInstruction {
     kTransposeDot,        // Fused into a dot with transposed operands.
     kConvBackwardFilter,  // Fused into a backward filter convolution.
     kConvBackwardInput,   // Fused into a backward input convolution.
+
+    kCustom,              // Custom category for backend-specific fusions that
+                          // do not match any of the more specific ones.
   };
 
   ~HloInstruction();
@@ -131,6 +134,13 @@ class HloInstruction {
       const Window& window,
       const ConvolutionDimensionNumbers& dimension_numbers);
 
+  // Creates a reduce-precision op, where operand is the data to reduce in
+  // precision, and exponent_bits and mantissa_bits describe the precision to
+  // reduce it to.
+  static std::unique_ptr<HloInstruction> CreateReducePrecision(
+      const Shape& shape, HloInstruction* operand, const int exponent_bits,
+      const int mantissa_bits);
+
   // Creates a cross replica sum op.
   static std::unique_ptr<HloInstruction> CreateCrossReplicaSum(
       const Shape& shape, HloInstruction* operand);
@@ -209,6 +219,17 @@ class HloInstruction {
       const Shape& shape, HloInstruction* operand, HloInstruction* init_value,
       const Window& window, HloComputation* reduce_computation);
 
+  // Creates a batch-norm-training instruction.
+  static std::unique_ptr<HloInstruction> CreateBatchNormTraining(
+      const Shape& shape, HloInstruction* operand, HloInstruction* scale,
+      HloInstruction* offset, float epsilon, int64 feature_index);
+
+  // Creates a batch-norm-grad instruction.
+  static std::unique_ptr<HloInstruction> CreateBatchNormGrad(
+      const Shape& shape, HloInstruction* operand, HloInstruction* scale,
+      HloInstruction* mean, HloInstruction* variance,
+      HloInstruction* grad_output, float epsilon, int64 feature_index);
+
   // Creates a scatter computation that scatters the `source` array to the
   // selected indices of each window.
   static std::unique_ptr<HloInstruction> CreateSelectAndScatter(
@@ -510,11 +531,6 @@ class HloInstruction {
   // or "elementwise".
   string ToCategory() const;
 
-  // Returns the string concatenation of parent name and this instructions
-  // name. This name is guaranteed to be unique among all instructions in the
-  // HloModule.
-  string FullyQualifiedName() const;
-
   // Returns a logging instruction, if the output of this instruction is logged.
   //
   // Postcondition: retval == nullptr || retval->opcode() == HloOpcode::kTrace
@@ -528,6 +544,18 @@ class HloInstruction {
   // Precondition: opcode() == HloOpcode::kSend or HloOpcode::kRecv
   int64 channel_id() const { return channel_id_; }
 
+  // Returns feature_index field associated with the instruction. The index
+  // represents the index of the feature dimension.
+  //
+  // Precondition: opcode() == HloOpcode::kBatchNormTraining
+  int64 feature_index() const { return feature_index_; }
+
+  // Returns a epsilon value associated with the instruction. The is a small
+  // number added to the variance to avoid divide-by-zero error.
+  //
+  // Precondition: opcode() == HloOpcode::kBatchNormTraining
+  float epsilon() const { return epsilon_; }
+
   // Returns the infeed configuration string. The infeed configuration includes
   // any metadata needed for the backend compiler (e.g., infeed buffer address)
   // and is target-dependent.
@@ -642,7 +670,7 @@ class HloInstruction {
   // Returns the stride in the given dimension for a slice node.
   //
   // Precondition: opcode() == HloOpcode::kSlice
-  int64 slice_stride(int64 dimension) const {
+  int64 slice_strides(int64 dimension) const {
     CHECK_EQ(HloOpcode::kSlice, opcode_);
     return slice_strides_[dimension];
   }
@@ -661,6 +689,22 @@ class HloInstruction {
     return dynamic_slice_sizes_;
   }
 
+  // Returns the number of exponent bits for a reduce-precision node.
+  //
+  // Precondition: opcode() == HloOpcode::kReducePrecision
+  int32 exponent_bits() const {
+    CHECK_EQ(HloOpcode::kReducePrecision, opcode_);
+    return exponent_bits_;
+  }
+
+  // Returns the number of mantissa bits for a reduce-precision node.
+  //
+  // Precondition: opcode() == HloOpcode::kReducePrecision
+  int32 mantissa_bits() const {
+    CHECK_EQ(HloOpcode::kReducePrecision, opcode_);
+    return mantissa_bits_;
+  }
+
   // Returns data on the window in a windowed operation such as
   // convolution.
   const Window& window() const {
@@ -708,6 +752,16 @@ class HloInstruction {
     return called_computations_;
   }
 
+  // Replaces all called computations based on a map function. This is needed
+  // when we clone hlo_computations and want to let the instructions to point
+  // to the newly cloned nodes.
+  void ReplaceCalledComputations(
+      std::function<HloComputation*(HloComputation*)> map_function) {
+    for (int64 i = 0; i < called_computations_.size(); ++i) {
+      called_computations_[i] = map_function(called_computations_[i]);
+    }
+  }
+
   // Returns true if this instruction performs an elementwise operation on
   // `operand_idx`-th operand. An instruction is elementwise on an operand iff,
   // after performing necessary implicit broadcast
@@ -742,9 +796,9 @@ class HloInstruction {
   std::tuple<bool, std::vector<int64>, std::vector<int64>>
   ReshapeMerelyInsertsOrDeletes1SizedDimensions() const;
 
-  // Returns the opcode string for this instruction. Compared with
-  // HloOpcodeString method, this wrapper dumps additional information
-  // such as fusion kind.
+  // Returns the opcode string for this instruction. This is the result from
+  // HloOpcodeString plus, for fusion nodes, the fusion kind, separated by a
+  // ':'.
   string ExtendedOpcodeStr() const;
 
   // Returns a string identifier for this instruction. If no string identifier
@@ -782,6 +836,17 @@ class HloInstruction {
     parent_fusion_instruction_ = fusion_instruction;
   }
 
+  // Get/Set the number of partitions per outer dimension (in order, starting
+  // with outer-most dimension first). Currently used by the parallel cpu
+  // backend to partition HLOs into parallel tasks.
+  // TODO(b/62783254) Replace these methods with a more general way to
+  // annotate HLOs with backend-specific information.
+  const std::vector<int64>& outer_dimension_partitions() const {
+    return outer_dimension_partitions_;
+  }
+  void set_outer_dimension_partitions(
+      const std::vector<int64>& outer_dimension_partitions);
+
  private:
   enum class UseKind { kNoUse, kReuse, kUsePermutingElements, kUse };
 
@@ -818,12 +883,6 @@ class HloInstruction {
       const Shape& shape,
       tensorflow::gtl::ArraySlice<HloInstruction*> operands);
 
-  // Inner DFS traversal function -- this function being called (rather than
-  // Accept above) allows us to distinguish the root of the traversal.
-  Status AcceptInternal(DfsHloVisitor* visitor,
-                        const CompareFunction* operand_order,
-                        bool ignore_control_predecessors);
-
   // CHECKs various invariants of a fusion instruction.
   void CheckFusionInstruction() const;
 
@@ -864,6 +923,10 @@ class HloInstruction {
   std::vector<int64> slice_limits_;
   std::vector<int64> slice_strides_;
 
+  // The bit sizes for a reduce-precision operation.
+  int32 exponent_bits_;
+  int32 mantissa_bits_;
+
   // Describes the [start, start + size) range size for a dynamic slice
   // ('start' is specified dynamically in the second operand of the operation).
   std::vector<int64> dynamic_slice_sizes_;
@@ -872,10 +935,6 @@ class HloInstruction {
   // padding of this pad instruction. Only set for pad instructions.
   std::unique_ptr<PaddingConfig> padding_config_;
 
-  // The computation that stores of instructions fused into this fusion
-  // instruction. Only set for fusion instructions.
-  std::unique_ptr<HloComputation> fused_instructions_computation_;
-
   // If this instruction is fused into a fusion instruction, this field points
   // to the fusion instruction.
   HloInstruction* parent_fusion_instruction_ = nullptr;
@@ -934,6 +993,14 @@ class HloInstruction {
   // Only present for kRng.
   RandomDistribution distribution_;
 
+  // A small float number added to the variance to avoid divide-by-zero error.
+  // Only present for kBatchNormTraining.
+  float epsilon_;
+
+  // An integer value representing the index of the feature dimension.
+  // Only present for kBatchNormTraining.
+  int64 feature_index_;
+
   // Represents a unique identifier for each Send/Recv instruction pair.
   // Only present for kSend or kRecv.
   int64 channel_id_ = -1;
@@ -950,6 +1017,10 @@ class HloInstruction {
   // Metadata for debugging.
   OpMetadata metadata_;
 
+  // The number of partitions per outer dimension (listed in order from
+  // outer-most dimension first).
+  std::vector<int64> outer_dimension_partitions_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(HloInstruction);
 };
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index bcf81cd8ddf63eff2f1df9c6c797588eee42f6b5..ced8417fcef9c009f8f3706ef4707bf0835faec2 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -232,7 +232,7 @@ TEST_F(HloInstructionTest, MultipleUsersAndOperands) {
   //                 -------
   auto param0 = HloInstruction::CreateParameter(0, r0f32_, "param0");
   auto param1 = HloInstruction::CreateParameter(1, r0f32_, "param1");
-  auto c0 = HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.1f));
+  auto c0 = HloInstruction::CreateConstant(Literal::CreateR0<float>(1.1f));
   auto addleft = HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd,
                                               param0.get(), c0.get());
   auto addright = HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd,
@@ -271,7 +271,7 @@ TEST_F(HloInstructionTest, MultipleUsersAndOperandsWithUnaryOps) {
   //                 -------
   auto param0 = HloInstruction::CreateParameter(0, r0f32_, "param0");
   auto param1 = HloInstruction::CreateParameter(1, r0f32_, "param1");
-  auto c0 = HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.1f));
+  auto c0 = HloInstruction::CreateConstant(Literal::CreateR0<float>(1.1f));
   auto neg1 = HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, c0.get());
   auto addleft = HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd,
                                               param0.get(), neg1.get());
@@ -307,7 +307,7 @@ TEST_F(HloInstructionTest, TrivialMap) {
   auto param =
       builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32, "x"));
   auto value = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param, value));
   auto add_f32 = builder.Build();
@@ -349,9 +349,8 @@ TEST_F(HloInstructionTest, TrivialReduce) {
 
   // Builds a parameter and an initial value and feeds them to the reduce.
   auto param0 = HloInstruction::CreateParameter(0, f32a100x10, "");
-  auto const0 =
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f));
-  auto c0 = HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.1f));
+  auto const0 = HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f));
+  auto c0 = HloInstruction::CreateConstant(Literal::CreateR0<float>(1.1f));
   auto reduce =
       HloInstruction::CreateReduce(f32v100, param0.get(), const0.get(),
                                    /*dimensions_to_reduce=*/{1}, add_f32.get());
@@ -558,78 +557,110 @@ TEST_F(HloInstructionTest, PostProcessAllVisitedNodes) {
 }
 
 TEST_F(HloInstructionTest, SingletonFusionOp) {
+  HloComputation::Builder builder(TestName());
   // Create a fusion instruction containing a single unary operation.
-  auto constant =
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.1f));
-  auto exp =
-      HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant.get());
-
-  auto fusion = HloInstruction::CreateFusion(
-      r0f32_, HloInstruction::FusionKind::kLoop, exp.get());
-
-  EXPECT_THAT(fusion->operands(), ElementsAre(constant.get()));
-  EXPECT_THAT(constant->users(), UnorderedElementsAre(fusion.get(), exp.get()));
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.1f)));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant));
+  HloModule module(TestName());
+  auto* computation = module.AddEntryComputation(builder.Build());
+  auto* fusion = computation->CreateFusionInstruction(
+      {exp}, HloInstruction::FusionKind::kLoop);
+
+  EXPECT_THAT(fusion->operands(), ElementsAre(constant));
+  EXPECT_THAT(constant->users(), ElementsAre(fusion));
 }
 
 TEST_F(HloInstructionTest, BinaryFusionOp) {
+  HloComputation::Builder builder(TestName());
   // Create a fusion instruction containing a single binary operation.
-  auto constant1 =
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.1f));
-  auto constant2 =
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.1f));
-  auto add = HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd,
-                                          constant1.get(), constant2.get());
-
-  auto fusion = HloInstruction::CreateFusion(
-      r0f32_, HloInstruction::FusionKind::kLoop, add.get());
-
-  EXPECT_THAT(fusion->operands(),
-              ElementsAre(constant1.get(), constant2.get()));
-  EXPECT_THAT(constant1->users(),
-              UnorderedElementsAre(fusion.get(), add.get()));
-  EXPECT_THAT(constant2->users(),
-              UnorderedElementsAre(fusion.get(), add.get()));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.1f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.1f)));
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      r0f32_, HloOpcode::kAdd, constant1, constant2));
+  HloModule module(TestName());
+  auto* computation = module.AddEntryComputation(builder.Build());
+  auto* fusion = computation->CreateFusionInstruction(
+      {add}, HloInstruction::FusionKind::kLoop);
+
+  EXPECT_THAT(fusion->operands(), ElementsAre(constant1, constant2));
+  EXPECT_THAT(constant1->users(), ElementsAre(fusion));
+  EXPECT_THAT(constant2->users(), ElementsAre(fusion));
 }
 
 TEST_F(HloInstructionTest, ChainFusionOp) {
+  HloComputation::Builder builder(TestName());
   // Create a chain of fused unary ops.
-  auto constant =
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.1f));
-  auto exp1 =
-      HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant.get());
-  auto exp2 = HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp1.get());
-  auto exp3 = HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp2.get());
-
-  auto fusion = HloInstruction::CreateFusion(
-      r0f32_, HloInstruction::FusionKind::kLoop, exp3.get());
-  fusion->FuseInstruction(exp2.get());
-  fusion->FuseInstruction(exp1.get());
-
-  EXPECT_THAT(fusion->operands(), ElementsAre(constant.get()));
-  EXPECT_THAT(constant->users(),
-              UnorderedElementsAre(fusion.get(), exp1.get()));
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.1f)));
+  auto exp1 = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant));
+  auto exp2 = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp1));
+  auto exp3 = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp2));
+
+  HloModule module(TestName());
+  auto* computation = module.AddEntryComputation(builder.Build());
+  auto* fusion = computation->CreateFusionInstruction(
+      {exp3, exp2, exp1}, HloInstruction::FusionKind::kLoop);
+
+  EXPECT_THAT(fusion->operands(), ElementsAre(constant));
+  EXPECT_THAT(constant->users(), ElementsAre(fusion));
 }
 
 TEST_F(HloInstructionTest, PreserveMetadataInFusionAndClone) {
+  HloComputation::Builder builder(TestName());
   // Create a chain of fused unary ops.
-  auto constant =
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.1f));
-  auto exp1 =
-      HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant.get());
-  auto exp2 = HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp1.get());
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.1f)));
+  auto exp1 = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant));
+  auto exp2 = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp1));
   OpMetadata metadata;
   metadata.set_op_name("tf_op");
   exp1->set_metadata(metadata);
   exp2->set_metadata(metadata);
 
-  auto fusion = HloInstruction::CreateFusion(
-      r0f32_, HloInstruction::FusionKind::kLoop, exp2.get());
-  auto* fused = fusion->FuseInstruction(exp1.get());
+  HloModule module(TestName());
+  auto* computation = module.AddEntryComputation(builder.Build());
+  auto* fusion = computation->CreateFusionInstruction(
+      {exp2, exp1}, HloInstruction::FusionKind::kLoop);
+
   EXPECT_TRUE(protobuf_util::ProtobufEquals(metadata, fusion->metadata()));
-  EXPECT_TRUE(protobuf_util::ProtobufEquals(metadata, fused->metadata()));
+  EXPECT_TRUE(protobuf_util::ProtobufEquals(
+      metadata, fusion->fused_expression_root()->metadata()));
+  EXPECT_TRUE(protobuf_util::ProtobufEquals(
+      metadata, fusion->fused_expression_root()->operand(0)->metadata()));
+}
+
+TEST_F(HloInstructionTest, PreserveOutfeedShapeThroughClone) {
+  HloComputation::Builder builder(TestName());
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2<float>({
+          {1, 2},
+          {3, 4},
+      })));
+  auto shape10 = ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {1, 0});
+  auto shape01 = ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {0, 1});
+  auto outfeed10 = builder.AddInstruction(
+      HloInstruction::CreateOutfeed(shape10, constant, ""));
+  auto outfeed01 = builder.AddInstruction(
+      HloInstruction::CreateOutfeed(shape01, constant, ""));
+
+  auto clone01 = builder.AddInstruction(outfeed01->Clone());
+  auto clone10 = builder.AddInstruction(outfeed10->Clone());
+
+  EXPECT_TRUE(ShapeUtil::Equal(clone01->outfeed_shape(), shape01));
+  EXPECT_TRUE(ShapeUtil::Equal(clone10->outfeed_shape(), shape10));
 }
 
 TEST_F(HloInstructionTest, FusionOpWithCalledComputations) {
+  HloComputation::Builder builder(TestName());
   // Create a fusion instruction containing a single unary operation.
   const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
 
@@ -643,33 +674,36 @@ TEST_F(HloInstructionTest, FusionOpWithCalledComputations) {
   std::unique_ptr<HloComputation> computation_x = make_map_computation();
   std::unique_ptr<HloComputation> computation_y = make_map_computation();
 
-  auto constant =
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.1f));
-  auto map_1_x =
-      HloInstruction::CreateMap(scalar_shape, {constant.get()},
-                                computation_x.get(), /*static_operands=*/{});
-  auto map_2_x =
-      HloInstruction::CreateMap(scalar_shape, {map_1_x.get()},
-                                computation_x.get(), /*static_operands=*/{});
-  auto map_3_y =
-      HloInstruction::CreateMap(scalar_shape, {map_2_x.get()},
-                                computation_y.get(), /*static_operands=*/{});
-
-  auto fusion = HloInstruction::CreateFusion(
-      scalar_shape, HloInstruction::FusionKind::kLoop, map_3_y.get());
-
-  EXPECT_THAT(fusion->called_computations(), ElementsAre(computation_y.get()));
-
-  fusion->FuseInstruction(map_2_x.get());
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.1f)));
+  auto map_1_x = builder.AddInstruction(HloInstruction::CreateMap(
+      scalar_shape, {constant}, computation_x.get(), /*static_operands=*/{}));
+  auto map_2_x = builder.AddInstruction(HloInstruction::CreateMap(
+      scalar_shape, {map_1_x}, computation_x.get(), /*static_operands=*/{}));
+  auto map_3_y = builder.AddInstruction(HloInstruction::CreateMap(
+      scalar_shape, {map_2_x}, computation_y.get(), /*static_operands=*/{}));
+
+  HloModule module(TestName());
+  auto* computation = module.AddEntryComputation(builder.Build());
+  auto* fusion = computation->CreateFusionInstruction(
+      {map_3_y}, HloInstruction::FusionKind::kLoop);
+  auto* fused_computation = fusion->fused_instructions_computation();
   EXPECT_THAT(fusion->called_computations(),
-              ElementsAre(computation_y.get(), computation_x.get()));
+              ElementsAre(fused_computation, computation_y.get()));
 
-  fusion->FuseInstruction(map_1_x.get());
-  EXPECT_THAT(fusion->called_computations(),
-              ElementsAre(computation_y.get(), computation_x.get()));
+  fusion->FuseInstruction(map_2_x);
+  EXPECT_THAT(
+      fusion->called_computations(),
+      ElementsAre(fused_computation, computation_y.get(), computation_x.get()));
+
+  fusion->FuseInstruction(map_1_x);
+  EXPECT_THAT(
+      fusion->called_computations(),
+      ElementsAre(fused_computation, computation_y.get(), computation_x.get()));
 }
 
 TEST_F(HloInstructionTest, ComplexFusionOp) {
+  HloComputation::Builder builder(TestName());
   // Fuse all instructions in complicated expression:
   //
   //   add = Add(C1, C2)
@@ -681,35 +715,35 @@ TEST_F(HloInstructionTest, ComplexFusionOp) {
   //
   // Notable complexities are repeated operands in a same instruction, different
   // shapes, use of value in different expressions.
-  auto c1 = HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.1f));
-  auto c2 = HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.1f));
-  auto c3 = HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(9.0f));
-
-  auto add =
-      HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, c1.get(), c2.get());
-  auto clamp = HloInstruction::CreateTernary(r0f32_, HloOpcode::kClamp,
-                                             c2.get(), add.get(), add.get());
-  auto exp = HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, add.get());
-  auto mul = HloInstruction::CreateBinary(r0f32_, HloOpcode::kMultiply,
-                                          exp.get(), c3.get());
-  auto sub = HloInstruction::CreateBinary(r0f32_, HloOpcode::kSubtract,
-                                          mul.get(), clamp.get());
+  auto c1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.1f)));
+  auto c2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.1f)));
+  auto c3 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(9.0f)));
+
+  auto add = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, c1, c2));
+  auto clamp = builder.AddInstruction(
+      HloInstruction::CreateTernary(r0f32_, HloOpcode::kClamp, c2, add, add));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, add));
+  auto mul = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32_, HloOpcode::kMultiply, exp, c3));
+  auto sub = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32_, HloOpcode::kSubtract, mul, clamp));
   auto tuple =
-      HloInstruction::CreateTuple({sub.get(), sub.get(), mul.get(), c1.get()});
+      builder.AddInstruction(HloInstruction::CreateTuple({sub, sub, mul, c1}));
 
-  auto fusion = HloInstruction::CreateFusion(
-      r0f32_, HloInstruction::FusionKind::kLoop, tuple.get());
-  fusion->FuseInstruction(sub.get());
-  fusion->FuseInstruction(mul.get());
-  fusion->FuseInstruction(exp.get());
-  fusion->FuseInstruction(clamp.get());
-  fusion->FuseInstruction(add.get());
+  HloModule module(TestName());
+  auto* computation = module.AddEntryComputation(builder.Build());
+  auto* fusion = computation->CreateFusionInstruction(
+      {tuple, sub, mul, exp, clamp, add}, HloInstruction::FusionKind::kLoop);
 
   // Operands in the fusion instruction's operands() vector should be in the
   // order in which their users were added fused.
-  EXPECT_THAT(fusion->operands(), ElementsAre(c1.get(), c3.get(), c2.get()));
-  EXPECT_THAT(c1->users(),
-              UnorderedElementsAre(add.get(), tuple.get(), fusion.get()));
+  EXPECT_THAT(fusion->operands(), ElementsAre(c1, c3, c2));
+  EXPECT_THAT(c1->users(), ElementsAre(fusion));
 }
 
 // Convenience function for comparing two HloInstructions inside of
@@ -732,11 +766,11 @@ TEST_F(HloInstructionTest, IdenticalInstructions) {
   // Create a set of random constant operands to use below. Make them matrices
   // so dimensions are interesting.
   auto operand1 = HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}}));
+      Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}}));
   auto operand2 = HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{10.0, 20.0}, {30.0, 40.0}}));
-  auto vector_operand = HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({42.0, 123.0}));
+      Literal::CreateR2<float>({{10.0, 20.0}, {30.0, 40.0}}));
+  auto vector_operand =
+      HloInstruction::CreateConstant(Literal::CreateR1<float>({42.0, 123.0}));
   Shape shape = operand1->shape();
 
   // Convenient short names for the operands.
@@ -865,7 +899,8 @@ TEST_F(HloInstructionTest, PartiallyElementwise) {
   HloInstruction* max = builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kMaximum, div, broadcast));
 
-  auto computation = builder.Build();
+  HloModule module(TestName());
+  auto* computation = module.AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {max, broadcast, div, mul}, HloInstruction::FusionKind::kLoop);
   EXPECT_FALSE(fusion->IsElementwise());
@@ -907,7 +942,8 @@ TEST_F(HloInstructionTest, PartiallyElementwiseWithReuse) {
   HloInstruction* sub = builder.AddInstruction(HloInstruction::CreateBinary(
       r1f32, HloOpcode::kSubtract, min, broadcast));
 
-  auto computation = builder.Build();
+  HloModule module(TestName());
+  auto* computation = module.AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {sub, broadcast, min}, HloInstruction::FusionKind::kLoop);
   EXPECT_FALSE(fusion->IsElementwise());
@@ -946,7 +982,8 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) {
   HloInstruction* dot = builder.AddInstruction(
       HloInstruction::CreateBinary(sout, HloOpcode::kDot, x, reshape));
 
-  auto computation = builder.Build();
+  HloModule module(TestName());
+  auto* computation = module.AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {dot, reshape}, HloInstruction::FusionKind::kTransposeDot);
 
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 141251011cc0b4205b6069ff90415492ead9f7a9..79f17bbb6bd9bfc0c6ed48c68599ef51fbd27af8 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -95,6 +95,7 @@ HLO_MATCHER(Parameter);
 HLO_MATCHER(Power);
 HLO_MATCHER(Recv);
 HLO_MATCHER(Reduce);
+HLO_MATCHER(ReducePrecision);
 HLO_MATCHER(ReduceWindow);
 HLO_MATCHER(Remainder);
 HLO_MATCHER(Reshape);
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index 22ef9c590bcf63a4e0c60931f771455601b0c019..da6f1d77ecb82ddbce11ca43c184ce0552b757fa 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -37,19 +37,17 @@ HloModule::HloModule(const string& name,
                      const HloModuleConfig& config)
     : name_(name),
       config_(config),
-      entry_computation_(nullptr),
       has_entry_computation_handle_(true),
-      entry_computation_handle_(entry_computation_handle),
-      computation_name_uniquer_(/*separator=*/".") {}
+      entry_computation_handle_(entry_computation_handle) {}
 
-HloModule::HloModule(const string& name)
-    : name_(name),
-      entry_computation_(nullptr),
-      computation_name_uniquer_(/*separator=*/".") {}
+HloModule::HloModule(const string& name) : name_(name) {}
 
 HloComputation* HloModule::AddComputationInternal(
     std::unique_ptr<HloComputation> computation) {
   computation->UniquifyName(&computation_name_uniquer_);
+  for (auto& instruction : computation->instructions()) {
+    instruction->UniquifyName(&instruction_name_uniquer_);
+  }
   computation->set_parent(this);
   computations_.push_back(std::move(computation));
   return computations_.back().get();
@@ -301,6 +299,36 @@ std::list<HloComputation*> HloModule::MakeComputationPostOrder() const {
   return post_order;
 }
 
+std::unique_ptr<HloModule> HloModule::Clone(const string& suffix) {
+  VLOG(1) << "Cloning module :" << name_ << " --> " << suffix << "\n";
+  auto module = MakeUnique<HloModule>(name_ + "-" + suffix);
+  module->config_ = config_;
+  module->entry_computation_handle_ = entry_computation_handle_;
+  module->has_entry_computation_handle_ = has_entry_computation_handle_;
+
+  std::unordered_map<HloComputation*, HloComputation*> clone_map;
+  for (auto& computation : computations_) {
+    auto cloned_computation = computation->Clone(suffix);
+    InsertOrDie(&clone_map, computation.get(), cloned_computation.get());
+
+    if (entry_computation_ == computation.get()) {
+      module->AddEntryComputation(std::move(cloned_computation));
+    } else {
+      module->AddEmbeddedComputation(std::move(cloned_computation));
+    }
+  }
+
+  for (auto& cloned_computation : module->computations_) {
+    for (auto& instruction : cloned_computation->instructions()) {
+      // Rewrite instruction's called_computation to point to the cloned
+      // computations.
+      instruction->ReplaceCalledComputations(
+          [&](HloComputation* hlo) { return FindOrDie(clone_map, hlo); });
+    }
+  }
+  return module;
+}
+
 uint64 HloModule::RandomNew64() const {
   tensorflow::mutex_lock l(rng_mutex_);
   return rng_();
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index 4b14b4fd62a460ede0639e4417507ff2af02abd6..ae8ec02fbd1a59fa1f4a4a6160de6db0c033c4b1 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -75,6 +75,9 @@ class HloModule {
 
   const string& name() const { return name_; }
 
+  // Returns a deep copy of this module including all computations.
+  std::unique_ptr<HloModule> Clone(const string& suffix = "clone");
+
   // Return a pointer to the entry computation of the module..
   HloComputation* entry_computation() const {
     CHECK_NE(nullptr, entry_computation_);
@@ -121,13 +124,16 @@ class HloModule {
     return computation_name_uniquer_.GetUniqueName(prefix);
   }
 
+  // Returns the NameUniquer for uniquing instruction names in this module.
+  NameUniquer& instruction_name_uniquer() { return instruction_name_uniquer_; }
+
  private:
   HloComputation* AddComputationInternal(
       std::unique_ptr<HloComputation> computation);
 
   const string name_;
   HloModuleConfig config_;
-  HloComputation* entry_computation_;
+  HloComputation* entry_computation_ = nullptr;
   std::vector<std::unique_ptr<HloComputation>> computations_;
 
   // Random number generator engine to use when generating random numbers per
@@ -141,8 +147,10 @@ class HloModule {
   bool has_entry_computation_handle_ = false;
   VersionedComputationHandle entry_computation_handle_;
 
-  // Unique name generator for computation names, which are unique per module.
-  NameUniquer computation_name_uniquer_;
+  // Unique name generator for computation and instruction names, which are
+  // unique per module.
+  NameUniquer computation_name_uniquer_{/*separator=*/"."};
+  NameUniquer instruction_name_uniquer_{/*separator=*/"."};
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module_config.cc b/tensorflow/compiler/xla/service/hlo_module_config.cc
index a2235a268235860a633fdc5f26c5127574a9487c..8974deb530c2e4561b5ab57f43c65fd525db3617 100644
--- a/tensorflow/compiler/xla/service/hlo_module_config.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_config.cc
@@ -58,6 +58,10 @@ string HloModuleConfig::compilation_cache_key() const {
     StrAppend(&key, "::replica_count=", replica_count());
   }
   StrAppend(&key, debug_options_.DebugString());
+  if (intra_op_parallelism_threads() > 0) {
+    StrAppend(&key, "::intra_op_parallelism_threads=",
+              intra_op_parallelism_threads());
+  }
   return key;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h
index ee32ab9bc4b5dd406d0dd9b6dfff52f852883dd9..2299200b5be969c065fded840709a3d6034efe47 100644
--- a/tensorflow/compiler/xla/service/hlo_module_config.h
+++ b/tensorflow/compiler/xla/service/hlo_module_config.h
@@ -92,6 +92,15 @@ class HloModuleConfig {
     debug_options_ = debug_options;
   }
 
+  // Sets/returns the number of intra op threads for this module.
+  void set_intra_op_parallelism_threads(
+      const int intra_op_parallelism_threads) {
+    intra_op_parallelism_threads_ = intra_op_parallelism_threads;
+  }
+  int64 intra_op_parallelism_threads() const {
+    return intra_op_parallelism_threads_;
+  }
+
  private:
   // If you add new members, be sure to update compilation_cache_key.
 
@@ -116,6 +125,10 @@ class HloModuleConfig {
   // The number of replicas to compile this binary for.
   int64 replica_count_ = 1;
 
+  // The target maximum parallelism at which to partition HLOs for parallel
+  // execution on the CPU backend.
+  int64 intra_op_parallelism_threads_ = -1;
+
   DebugOptions debug_options_;
 };
 
diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc
index 870bc729aec98a2959de5aa322850898502394ad..56dc5632035c625445018becfd25d69557e6232a 100644
--- a/tensorflow/compiler/xla/service/hlo_module_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_test.cc
@@ -38,7 +38,7 @@ class HloModuleTest : public HloTestBase {
   std::unique_ptr<HloComputation> CreateConstantComputation() {
     auto builder = HloComputation::Builder("Constant");
     builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+        HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
     return builder.Build();
   }
 
@@ -81,6 +81,30 @@ TEST_F(HloModuleTest, TwoComputationsPostOrder) {
   EXPECT_EQ(computation2->name(), "Constant.1");
 }
 
+TEST_F(HloModuleTest, CloneTest) {
+  // Create and copy a module with a diamond call graph of computations.
+  auto module = CreateNewModule();
+  auto computation1 =
+      module->AddEmbeddedComputation(CreateConstantComputation());
+  auto computation2 =
+      module->AddEmbeddedComputation(CreateCallComputation({computation1}));
+  auto computation3 =
+      module->AddEmbeddedComputation(CreateCallComputation({computation1}));
+  module->AddEntryComputation(
+      CreateCallComputation({computation2, computation3}));
+
+  auto post_order = module->MakeComputationPostOrder();
+  auto cloned_module = module->Clone("copy");
+  auto post_order_copied = cloned_module->MakeComputationPostOrder();
+
+  EXPECT_EQ(post_order.size(), post_order_copied.size());
+  for (auto origin = post_order.begin(), copied = post_order_copied.begin();
+       origin != post_order.end() && copied != post_order_copied.end();
+       ++origin, ++copied) {
+    EXPECT_EQ((*origin)->name() + "copy", (*copied)->name());
+  }
+}
+
 TEST_F(HloModuleTest, DiamondComputationsPostOrder) {
   // Create a module with a diamond call graph of computations.
   auto module = CreateNewModule();
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.cc b/tensorflow/compiler/xla/service/hlo_opcode.cc
index ceb0cdaa3169bb57e4ebb61ac1b2ea41f1ef7995..3888f757adaf2e51c598a08f7464688d162595a4 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.cc
+++ b/tensorflow/compiler/xla/service/hlo_opcode.cc
@@ -19,11 +19,22 @@ limitations under the License.
 namespace xla {
 
 string HloOpcodeString(HloOpcode opcode) {
+  // Note: Do not use ':' in opcode strings. It is used as a special character
+  // in these places:
+  // - In extended opcode strings (HloInstruction::ExtendedOpcodeString()), to
+  //   separate the opcode from the fusion kind
+  // - In fully qualified names (HloInstruction::FullyQualifiedName()), to
+  //   separate the qualifiers (name of the computation and potentially the
+  //   fusion instruction) from the name
   switch (opcode) {
     case HloOpcode::kAbs:
       return "abs";
     case HloOpcode::kAdd:
       return "add";
+    case HloOpcode::kBatchNormTraining:
+      return "batch-norm-training";
+    case HloOpcode::kBatchNormGrad:
+      return "batch-norm-grad";
     case HloOpcode::kBitcast:
       return "bitcast";
     case HloOpcode::kBroadcast:
@@ -40,6 +51,8 @@ string HloOpcodeString(HloOpcode opcode) {
       return "convert";
     case HloOpcode::kConvolution:
       return "convolution";
+    case HloOpcode::kCos:
+      return "cosine";
     case HloOpcode::kCrossReplicaSum:
       return "cross-replica-sum";
     case HloOpcode::kCustomCall:
@@ -112,6 +125,8 @@ string HloOpcodeString(HloOpcode opcode) {
       return "recv";
     case HloOpcode::kReduce:
       return "reduce";
+    case HloOpcode::kReducePrecision:
+      return "reduce-precision";
     case HloOpcode::kReduceWindow:
       return "reduce-window";
     case HloOpcode::kRemainder:
@@ -130,6 +145,8 @@ string HloOpcodeString(HloOpcode opcode) {
       return "send";
     case HloOpcode::kSign:
       return "sign";
+    case HloOpcode::kSin:
+      return "sine";
     case HloOpcode::kSlice:
       return "slice";
     case HloOpcode::kSort:
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index e2cdbfdfa7a4b5509dccf9a83ffbd799f9ab1374..8a6376b2d1c3d4fcdb4cbcb40cd56c1f9db9ec8e 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -30,6 +30,8 @@ namespace xla {
 enum class HloOpcode {
   kAbs,
   kAdd,
+  kBatchNormTraining,
+  kBatchNormGrad,
   kBitcast,
   kBroadcast,
   kCall,
@@ -40,6 +42,7 @@ enum class HloOpcode {
   kConvert,
   kConvolution,
   kCopy,
+  kCos,
   kCrossReplicaSum,
   kCustomCall,
   kDivide,
@@ -74,6 +77,7 @@ enum class HloOpcode {
   kPower,
   kRecv,
   kReduce,
+  kReducePrecision,
   kReduceWindow,
   kRemainder,
   kReshape,
@@ -83,6 +87,7 @@ enum class HloOpcode {
   kSelectAndScatter,
   kSend,
   kSign,
+  kSin,
   kSlice,
   kSort,
   kSubtract,
@@ -107,6 +112,11 @@ bool HloOpcodeIsComparison(HloOpcode opcode);
 // Returns true iff the given opcode has variadic operands.
 bool HloOpcodeIsVariadic(HloOpcode opcode);
 
+// Returns the number of HloOpcode values.
+inline const uint32_t HloOpcodeCount() {
+  return static_cast<uint32_t>(HloOpcode::kWhile) + 1;
+}
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_OPCODE_H_
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index 72911ae9f91c175d729c3136959cf47029e8a695..4c3ff3bdafc0e5184b715b938b317c3ff85fbfa8 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -15,13 +15,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
 
-#include <set>
 #include <utility>
 #include <vector>
 
-#include "tensorflow/compiler/xla/service/heap_simulator.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -113,6 +110,20 @@ bool HloOrdering::ExecutesBefore(const HloInstruction* a,
   // a_ancestor and b_ancestor must be either both null or both non-null.
   CHECK_NE(b_ancestor, nullptr);
   CHECK_EQ(a_ancestor->parent(), b_ancestor->parent());
+
+  // If the common ancestor is a while instruction there is an additional
+  // ordering criteria which may apply. The condition computation is considered
+  // to execute before the body computation so if 'a' is in the condition and
+  // 'b' is in the body, then 'a' executes before 'b'.
+  if (a_ancestor == b_ancestor && a_ancestor->opcode() == HloOpcode::kWhile) {
+    const HloComputation* body = a_ancestor->while_body();
+    const HloComputation* condition = a_ancestor->while_condition();
+    if (call_graph_->InstructionIsNestedIn(a, condition) &&
+        call_graph_->InstructionIsNestedIn(b, body)) {
+      return true;
+    }
+  }
+
   return ExecutesBeforeInSameComputation(a_ancestor, b_ancestor);
 }
 
@@ -141,7 +152,7 @@ bool PredecessorHloOrdering::ExecutesBeforeInSameComputation(
   CHECK_EQ(a->parent(), b->parent());
 
   // 'a' executes before 'b' if 'a' is in the strict predecessor set of 'b'.
-  return strict_predecessors_.at(b->parent())->IsReachable(b, a);
+  return a != b && predecessors_.at(a->parent())->IsReachable(a, b);
 }
 
 string PredecessorHloOrdering::ToStringHelper(const string& name) const {
@@ -153,10 +164,10 @@ string PredecessorHloOrdering::ToStringHelper(const string& name) const {
     const auto all = computation->MakeInstructionPostOrder();
     for (auto instruction : all) {
       pieces.push_back(tensorflow::strings::Printf(
-          "  %s strict predecessors:", instruction->name().c_str()));
+          "  %s predecessors:", instruction->name().c_str()));
       for (auto predecessor : all) {
-        if (strict_predecessors_.at(computation.get())
-                ->IsReachable(instruction, predecessor)) {
+        if (predecessors_.at(computation.get())
+                ->IsReachable(predecessor, instruction)) {
           pieces.push_back(
               tensorflow::strings::Printf("  %s", predecessor->name().c_str()));
         }
@@ -172,8 +183,11 @@ DependencyHloOrdering::DependencyHloOrdering(const HloModule* module)
   // ordering based on dependencies. ExecutesBefore will return true iff there
   // exists a path in the HLO computation graph from 'a' to 'b'.
   for (auto& computation : module->computations()) {
-    strict_predecessors_.emplace(computation.get(),
-                                 computation->ComputeTransitiveOperands());
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
+    predecessors_.emplace(computation.get(),
+                          computation->ComputeReachability());
   }
 }
 
@@ -238,358 +252,6 @@ string SequentialHloOrdering::ToString() const {
   return tensorflow::str_util::Join(pieces, "\n");
 }
 
-StatusOr<int64> MinimumMemoryForSequence(
-    const SequentialHloOrdering::HloModuleSequence& module_sequence,
-    const LogicalBuffer::SizeFunction& size_function) {
-  if (module_sequence.empty()) {
-    return 0;
-  }
-
-  const HloModule* module = module_sequence.begin()->first->parent();
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
-                      TuplePointsToAnalysis::Run(module));
-
-  // The absolute minimum memory required for a given sequence of instructions
-  // is determined by the sequence of Alloc and Free calls on a simulated heap,
-  // ignoring fragmentation. We run the heap simulation on the whole module,
-  // rather than summing each computation, since it gives us a better lower
-  // bound, by minimizing the liveness of sub-computations.
-  TF_ASSIGN_OR_RETURN(
-      HeapSimulator::Result result,
-      HeapSimulator::Run(MakeUnique<NoFragmentationStatsHeap>(), *module,
-                         module_sequence, *points_to_analysis, size_function));
-  return result.heap_size;
-}
-
-namespace {
-
-// Class implementing a list scheduler of HLO instructions which produces a
-// sequence which minimizes memory usage.
-class ListScheduler {
- public:
-  // Construct and return a memory-minimizing sequence of HLO instructions
-  // containing the given HLO computation.
-  static StatusOr<std::vector<const HloInstruction*>> Run(
-      const HloComputation& computation,
-      const TuplePointsToAnalysis& points_to_analysis,
-      const LogicalBuffer::SizeFunction& size_function) {
-    ListScheduler scheduler(computation, points_to_analysis, size_function);
-    return scheduler.CreateSchedule();
-  }
-
- private:
-  // The scheduling priority of an instruction is first the number of bytes
-  // freed by scheduling the instruction, and second (tie-breaker) by the number
-  // of users. This is represented as a std::pair containing these two values
-  // (first element is the bytes freed). std::pair provides the necessary
-  // comparison operators.
-  using Priority = std::pair<int64, int64>;
-
-  ListScheduler(const HloComputation& computation,
-                const TuplePointsToAnalysis& points_to_analysis,
-                const LogicalBuffer::SizeFunction& size_function)
-      : computation_(computation),
-        points_to_analysis_(points_to_analysis),
-        size_function_(size_function) {
-    // Create a map containing the LogicalBuffer uses for each HLO
-    // instruction. An HLO instruction "uses" a LogicalBuffer if the
-    // LogicalBuffer is in an operand of the instruction as indicated by
-    // points-to analysis.
-    for (auto& instruction : computation.instructions()) {
-      buffer_uses_.insert(
-          {instruction.get(), std::unordered_set<const LogicalBuffer*>()});
-      for (auto* operand : instruction->operands()) {
-        for (const LogicalBuffer* buffer :
-             points_to_analysis.GetBuffersDefinedByInstruction(operand)) {
-          buffer_uses_[instruction.get()].insert(buffer);
-        }
-      }
-    }
-
-    // Create map containing the number of unscheduled uses (hlo instructions)
-    // of each logical buffer.
-    for (auto& instruction : computation.instructions()) {
-      for (auto* buffer : points_to_analysis.GetBuffersDefinedByInstruction(
-               instruction.get())) {
-        unscheduled_use_count_[buffer] = 0;
-      }
-    }
-    for (auto& instruction : computation.instructions()) {
-      for (const LogicalBuffer* buffer : buffer_uses_.at(instruction.get())) {
-        ++unscheduled_use_count_[buffer];
-      }
-    }
-
-    // Buffers live out of the computation have an implicit use at the end of
-    // the computation.
-    for (const LogicalBuffer* live_out_buffer :
-         points_to_analysis.GetPointsToSet(computation.root_instruction())
-             .CreateFlattenedSet()) {
-      ++unscheduled_use_count_[live_out_buffer];
-    }
-  }
-
-  // Returns whether the memory used by the given buffer should be ignored by
-  // the scheduling heuristic.
-  bool IgnoreBuffer(const LogicalBuffer& buffer) {
-    return buffer.instruction()->opcode() == HloOpcode::kParameter ||
-           buffer.instruction()->opcode() == HloOpcode::kConstant;
-  }
-
-  // Return the number of bytes freed if the HLO instruction is scheduled.
-  int64 BytesFreedIfScheduled(const HloInstruction* instruction) {
-    int64 freed_bytes = 0;
-    // Sum the total size of the values last used by this instruction.
-    for (auto* buffer : buffer_uses_.at(instruction)) {
-      if (IgnoreBuffer(*buffer)) {
-        continue;
-      }
-      CHECK_GE(unscheduled_use_count_.at(buffer), 1);
-      if (unscheduled_use_count_.at(buffer) == 1) {
-        // This is the last use of the logical buffer.
-        freed_bytes += size_function_(*buffer);
-      }
-    }
-    // Then subtract the size of the value(s) defined by this instruction.
-    for (auto* buffer :
-         points_to_analysis_.GetBuffersDefinedByInstruction(instruction)) {
-      if (!IgnoreBuffer(*buffer)) {
-        freed_bytes -= size_function_(*buffer);
-      }
-    }
-    return freed_bytes;
-  }
-
-  // Construct the scheduling priority of the given instruction.
-  Priority GetPriority(const HloInstruction* instruction) {
-    return {BytesFreedIfScheduled(instruction), instruction->user_count()};
-  }
-
-  std::vector<const HloInstruction*> CreateSchedule() {
-    std::vector<const HloInstruction*> schedule;
-
-    // Populate the ready list with instructions which have no operands or
-    // control predecessors.
-    std::unordered_map<const HloInstruction*, int64> unscheduled_pred_count;
-    std::list<const HloInstruction*> ready_list;
-    for (auto& instruction : computation_.instructions()) {
-      // TODO(b/34466113): Replace this and above with successors() or
-      // predecessors() when these methods are added to HloInstruction.
-      for (const HloInstruction* user : instruction->users()) {
-        unscheduled_pred_count[user]++;
-      }
-      for (const HloInstruction* succ : instruction->control_successors()) {
-        unscheduled_pred_count[succ]++;
-      }
-    }
-    for (auto& instruction : computation_.instructions()) {
-      // Instruction with no operands or control predecessors will
-      // not be in the map.
-      if (unscheduled_pred_count.count(instruction.get()) == 0) {
-        ready_list.push_back(instruction.get());
-      }
-    }
-
-    while (!ready_list.empty()) {
-      // Select the highest priority HLO instruction from the ready list.
-      auto best_it = ready_list.begin();
-      Priority best_priority = GetPriority(*best_it);
-      for (auto ready_it = std::next(ready_list.begin());
-           ready_it != ready_list.end(); ++ready_it) {
-        Priority priority = GetPriority(*ready_it);
-        if (priority > best_priority) {
-          best_it = ready_it;
-          best_priority = priority;
-        }
-      }
-
-      // Remove the selected instruction from the ready list and add it to the
-      // schedule.
-      const HloInstruction* best = *best_it;
-      ready_list.erase(best_it);
-      schedule.push_back(best);
-      scheduled_instructions_.insert(best);
-
-      // Update the unscheduled uses of the logical buffers.
-      for (const LogicalBuffer* buffer : buffer_uses_.at(best)) {
-        CHECK_GT(unscheduled_use_count_.at(buffer), 0);
-        --unscheduled_use_count_[buffer];
-      }
-
-      // Add new instructions to ready list.
-      auto update_pred_count = [&unscheduled_pred_count,
-                                &ready_list](HloInstruction* inst) {
-        int64 pred_count = --unscheduled_pred_count.at(inst);
-        CHECK_GE(pred_count, 0);
-        if (pred_count == 0) {
-          ready_list.push_back(inst);
-        }
-      };
-      // TODO(b/34466113): Replace this and above with successors() or
-      // predecessors() when these methods are added to HloInstruction.
-      for (HloInstruction* user : best->users()) {
-        update_pred_count(user);
-      }
-      for (HloInstruction* succ : best->control_successors()) {
-        update_pred_count(succ);
-      }
-    }
-    CHECK_EQ(schedule.size(), computation_.instructions().size());
-    CHECK_EQ(scheduled_instructions_.size(),
-             computation_.instructions().size());
-
-    return schedule;
-  }
-
-  const HloComputation& computation_;
-  const TuplePointsToAnalysis& points_to_analysis_;
-  const LogicalBuffer::SizeFunction& size_function_;
-
-  // A map containing the LogicalBuffers that each instruction uses.
-  std::unordered_map<const HloInstruction*,
-                     std::unordered_set<const LogicalBuffer*>>
-      buffer_uses_;
-
-  // A map containing the count of unscheduled HLOs which using a particular
-  // LogicalBuffer.
-  std::unordered_map<const LogicalBuffer*, int64> unscheduled_use_count_;
-
-  // Set of instructions which have been scheduled.
-  std::unordered_set<const HloInstruction*> scheduled_instructions_;
-};
-
-int64 SumLogicalBufferSizes(const std::vector<const LogicalBuffer*>& buffers,
-                            const LogicalBuffer::SizeFunction& size_function) {
-  int64 size = 0;
-  for (const LogicalBuffer* buffer : buffers) {
-    size += size_function(*buffer);
-  }
-  return size;
-}
-
-StatusOr<std::vector<const HloInstruction*>> RunDFSMemoryScheduler(
-    const HloComputation& computation,
-    const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_function) {
-  // This ordering is based on DFS post-order, with a heuristic to decide which
-  // operand to visit first.  The heuristic is based on 'extra_users', which is
-  // simply users-1 for each instruction.  By subtracting 1, we're saying that
-  // instructions with no users or a single user don't count; instructions with
-  // lots of fan-out will be visited earlier.
-  tensorflow::gtl::FlatMap<const HloInstruction*, int64> extra_users;
-  tensorflow::gtl::FlatMap<const HloInstruction*, int64> total_sizes;
-  for (const HloInstruction* hlo : computation.MakeInstructionPostOrder()) {
-    extra_users[hlo] = hlo->users().empty() ? 0 : hlo->users().size() - 1;
-    total_sizes[hlo] = SumLogicalBufferSizes(
-        points_to_analysis.GetBuffersDefinedByInstruction(hlo), size_function);
-    tensorflow::gtl::FlatSet<const HloInstruction*> unique_operands(
-        hlo->operands().begin(), hlo->operands().end());
-    for (const HloInstruction* operand : unique_operands) {
-      extra_users[hlo] += extra_users[operand];
-      total_sizes[hlo] += total_sizes[operand];
-    }
-  }
-  CHECK_EQ(extra_users.size(), computation.instructions().size());
-  CHECK_EQ(total_sizes.size(), computation.instructions().size());
-
-  // Construct a total order based on DFS post-order, visiting operands in
-  // decreasing cumulative extra user order, and next by cumulative size, with a
-  // tiebreaker by name for determinism.
-  std::vector<const HloInstruction*> sequence;
-  FunctionVisitor visitor([&sequence](HloInstruction* hlo) {
-    sequence.push_back(hlo);
-    return Status::OK();
-  });
-  TF_RETURN_IF_ERROR(computation.AcceptWithOperandOrder(
-      &visitor, [&extra_users, &total_sizes](const HloInstruction* a,
-                                             const HloInstruction* b) {
-        if (extra_users[a] != extra_users[b]) {
-          return extra_users[a] > extra_users[b];
-        }
-        if (total_sizes[a] != total_sizes[b]) {
-          return total_sizes[a] > total_sizes[b];
-        }
-        return a->name() < b->name();
-      }));
-  CHECK_EQ(sequence.size(), computation.instructions().size());
-  return sequence;
-}
-
-StatusOr<int64> MinimumMemoryForComputation(
-    const HloComputation& computation,
-    const std::vector<const HloInstruction*>& sequence,
-    const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_function) {
-  TF_ASSIGN_OR_RETURN(
-      HeapSimulator::Result result,
-      HeapSimulator::Run(MakeUnique<NoFragmentationStatsHeap>(), computation,
-                         sequence, points_to_analysis, size_function));
-  return result.heap_size;
-}
-
-StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
-    const HloComputation& computation,
-    const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_function) {
-  // We try both a list-scheduler based ordering and a DFS based ordering, and
-  // choose whichever returns a lower min-memory, not accounting for
-  // fragmentation.
-  //
-  // Note that this is just a heuristic. One obvious inaccuracy is that the
-  // memory required for sub-computations might be different when considered
-  // within the caller's context. But it's good enough for now.
-  TF_ASSIGN_OR_RETURN(
-      std::vector<const HloInstruction*> list_sequence,
-      ListScheduler::Run(computation, points_to_analysis, size_function));
-  TF_ASSIGN_OR_RETURN(
-      const int64 list_memory,
-      MinimumMemoryForComputation(computation, list_sequence,
-                                  points_to_analysis, size_function));
-  VLOG(2) << "Min-memory list sequence: " << list_memory << " bytes";
-
-  TF_ASSIGN_OR_RETURN(
-      std::vector<const HloInstruction*> dfs_sequence,
-      RunDFSMemoryScheduler(computation, points_to_analysis, size_function));
-  TF_ASSIGN_OR_RETURN(
-      const int64 dfs_memory,
-      MinimumMemoryForComputation(computation, dfs_sequence, points_to_analysis,
-                                  size_function));
-  VLOG(2) << "Min-memory dfs sequence: " << dfs_memory << " bytes";
-
-  if (list_memory <= dfs_memory) {
-    VLOG(2) << "Chose min-memory list sequence: " << list_memory << " bytes";
-    return list_sequence;
-  } else {
-    VLOG(2) << "Chose min-memory dfs sequence: " << dfs_memory << " bytes";
-    return dfs_sequence;
-  }
-}
-
-}  // namespace
-
-StatusOr<SequentialHloOrdering::HloModuleSequence>
-CreateMemoryMinimizingSequence(
-    const HloModule& module, const LogicalBuffer::SizeFunction& size_function) {
-  SequentialHloOrdering::HloModuleSequence sequence;
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
-                      TuplePointsToAnalysis::Run(&module));
-  for (const auto& computation : module.computations()) {
-    TF_ASSIGN_OR_RETURN(sequence[computation.get()],
-                        CreateMemoryMinimizingSequence(
-                            *computation, *points_to_analysis, size_function));
-  }
-  return sequence;
-}
-
-StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
-    const HloComputation& computation,
-    const LogicalBuffer::SizeFunction& size_function) {
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
-                      TuplePointsToAnalysis::Run(computation.parent()));
-  return CreateMemoryMinimizingSequence(computation, *points_to_analysis,
-                                        size_function);
-}
-
 std::ostream& operator<<(
     std::ostream& out,
     const SequentialHloOrdering::HloModuleSequence& module_sequence) {
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.h b/tensorflow/compiler/xla/service/hlo_ordering.h
index b59e1ea5eb0ad4882d4c2b96ee6ab6d1bc973993..130431f28070d52c3a76befa0d5272a3cc295711 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.h
+++ b/tensorflow/compiler/xla/service/hlo_ordering.h
@@ -24,12 +24,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
-#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
 
@@ -72,8 +68,8 @@ class HloOrdering {
   std::unique_ptr<CallGraph> call_graph_;
 };
 
-// Base class for partial orderings implemented by a map of strict predecessors
-// for each instruction. Subclasses should fill in strict_predecessors_.
+// Base class for partial orderings implemented by a map of predecessors for
+// each instruction. Subclasses should fill in predecessors_.
 class PredecessorHloOrdering : public HloOrdering {
  public:
   ~PredecessorHloOrdering() override = default;
@@ -93,13 +89,12 @@ class PredecessorHloOrdering : public HloOrdering {
                                        const HloInstruction* b) const override;
 
   // For each computation in the module, this is the set of the instruction's
-  // strict predecessors. An instruction is not an element of its own strict
-  // predecessor set.
+  // predecessors. An instruction is an element of its own predecessor set.
   //
   // Subclasses should fill this in to define the desired ordering.
   tensorflow::gtl::FlatMap<const HloComputation*,
-                           std::unique_ptr<HloComputation::ReachabilityMap>>
-      strict_predecessors_;
+                           std::unique_ptr<HloReachabilityMap>>
+      predecessors_;
 };
 
 // An HLO ordering based on data dependencies in the HLO graph. In this partial
@@ -191,24 +186,6 @@ std::ostream& operator<<(
     std::ostream& out,
     const SequentialHloOrdering::HloModuleSequence& module_sequence);
 
-// Returns the minimum memory required to compute the given module sequence,
-// assuming no fragmentation.
-StatusOr<int64> MinimumMemoryForSequence(
-    const SequentialHloOrdering::HloModuleSequence& module_sequence,
-    const LogicalBuffer::SizeFunction& size_function);
-
-// Returns an HloModuleSequence which seeks to minimize the memory required for
-// the computation. size_function is the function returning the number of bytes
-// required for a LogicalBuffer.
-StatusOr<SequentialHloOrdering::HloModuleSequence>
-CreateMemoryMinimizingSequence(
-    const HloModule& module, const LogicalBuffer::SizeFunction& size_function);
-
-// Overload of above that computes the sequence for a single computation.
-StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
-    const HloComputation& computation,
-    const LogicalBuffer::SizeFunction& size_function);
-
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_ORDERING_H_
diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
index 21d852a51d67b2aadc0edea144f60a037a004614..ad6070a9c1b45afd418c9210a2d1b3def3eaf4d5 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_scheduling.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -61,7 +62,7 @@ TEST_F(HloOrderingTest, LastUseScheduledFirst) {
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
-  TF_ASSIGN_OR_ASSERT_OK(
+  TF_ASSERT_OK_AND_ASSIGN(
       SequentialHloOrdering::HloModuleSequence sequence,
       CreateMemoryMinimizingSequence(*module, [](const LogicalBuffer& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape());
@@ -101,7 +102,7 @@ TEST_F(HloOrderingTest, InstructionsInDifferentComputations) {
 
   auto builder_c = HloComputation::Builder("C");
   HloInstruction* c = builder_c.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   HloComputation* computation_c =
       module->AddEmbeddedComputation(builder_c.Build());
 
@@ -155,67 +156,69 @@ TEST_F(HloOrderingTest, InstructionsInDifferentComputations) {
   EXPECT_FALSE(ordering.ExecutesBefore(y, c));
 }
 
-class MinimumMemoryForSequenceTest : public HloTestBase {};
-
-TEST_F(MinimumMemoryForSequenceTest, MultiComputation) {
+TEST_F(HloOrderingTest, InstructionsInWhileComputations) {
+  // Tests the ordering of instructions in the body and condition of a while
+  // instruction. HLO code:
+  //
+  // body(F32[]) %param):
+  //   %negate = Negate(%param)
+  //
+  // condition(F32[] %param):
+  //   %convert = Convert<PRED>(%param)
+  //
+  // entry:
+  //   %constant = Constant(1.0)
+  //   return While(%constant, body, condition)
+  //
   auto module = CreateNewModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
-  const Shape tuple_shape =
-      ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape});
-
-  auto cond_builder = HloComputation::Builder("WhileCond");
-  // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element)
-  HloInstruction* cond_param = cond_builder.AddInstruction(
-      HloInstruction::CreateParameter(0, tuple_shape, "cond_param"));
-  HloInstruction* cond_iter = cond_builder.AddInstruction(
-      HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 0));
-  HloInstruction* cond_data = cond_builder.AddInstruction(
-      HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 1));
-  // Free cond_param[] (16 bytes), Alloc PRED[] (1 byte)
-  HloInstruction* cond_lt = cond_builder.AddInstruction(
-      HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}),
-                                   HloOpcode::kLt, cond_iter, cond_data));
-  HloComputation* cond_computation =
-      module->AddEmbeddedComputation(cond_builder.Build());
 
-  auto body_builder = HloComputation::Builder("WhileBody");
-  // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element)
-  HloInstruction* body_param = body_builder.AddInstruction(
-      HloInstruction::CreateParameter(0, tuple_shape, "body_param"));
-  HloComputation* body_computation =
-      module->AddEmbeddedComputation(body_builder.Build());
+  auto body_builder = HloComputation::Builder("body");
+  auto body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "body_param"));
+  auto negate = body_builder.AddInstruction(HloInstruction::CreateUnary(
+      scalar_shape, HloOpcode::kNegate, body_param));
+  HloComputation* body = module->AddEmbeddedComputation(body_builder.Build());
+
+  auto cond_builder = HloComputation::Builder("condition");
+  auto cond_param = cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "cond_param"));
+  auto convert = cond_builder.AddInstruction(HloInstruction::CreateConvert(
+      ShapeUtil::MakeShape(xla::PRED, {}), cond_param));
+  HloComputation* condition =
+      module->AddEmbeddedComputation(cond_builder.Build());
 
   auto builder = HloComputation::Builder(TestName());
-  // Entry params: 8 bytes (4 bytes per param), TOTAL=8
-  HloInstruction* iter = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, scalar_shape, "param_iter"));
-  HloInstruction* data = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, scalar_shape, "param_data"));
-  // Tuple: 16 bytes (8 bytes per pointer), TOTAL=24
-  HloInstruction* tuple =
-      builder.AddInstruction(HloInstruction::CreateTuple({iter, data}));
-  // While: 8 bytes (4 bytes per element), TOTAL=32
-  // Both cond and body use a max of 24 bytes, TOTAL=56
-  HloInstruction* while_op = builder.AddInstruction(HloInstruction::CreateWhile(
-      tuple_shape, cond_computation, body_computation, tuple));
-  HloComputation* entry_computation =
-      module->AddEntryComputation(builder.Build());
-
-  auto size_fn = [](const LogicalBuffer& buffer) {
-    return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
-  };
-
-  SequentialHloOrdering::HloModuleSequence module_sequence;
-  module_sequence[cond_computation] = {cond_param, cond_iter, cond_data,
-                                       cond_lt};
-  module_sequence[body_computation] = {body_param};
-  module_sequence[entry_computation] = {iter, data, tuple, while_op};
-  EXPECT_EQ(56,
-            MinimumMemoryForSequence(module_sequence, size_fn).ValueOrDie());
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto xla_while = builder.AddInstruction(
+      HloInstruction::CreateWhile(scalar_shape, condition, body, constant));
+  module->AddEntryComputation(builder.Build());
+
+  DependencyHloOrdering ordering(module.get());
+  EXPECT_TRUE(ordering.ExecutesBefore(constant, xla_while));
+  EXPECT_TRUE(ordering.ExecutesBefore(constant, cond_param));
+  EXPECT_TRUE(ordering.ExecutesBefore(constant, convert));
+  EXPECT_TRUE(ordering.ExecutesBefore(constant, body_param));
+  EXPECT_TRUE(ordering.ExecutesBefore(constant, negate));
+
+  // The while should be unordered relative to the body and condition
+  // instructions.
+  EXPECT_FALSE(ordering.ExecutesBefore(xla_while, body_param));
+  EXPECT_FALSE(ordering.ExecutesBefore(xla_while, cond_param));
+  EXPECT_FALSE(ordering.ExecutesBefore(body_param, xla_while));
+  EXPECT_FALSE(ordering.ExecutesBefore(cond_param, xla_while));
+
+  // Condition instructions should be ordered before body instructions.
+  EXPECT_TRUE(ordering.ExecutesBefore(cond_param, body_param));
+  EXPECT_TRUE(ordering.ExecutesBefore(convert, body_param));
+  EXPECT_TRUE(ordering.ExecutesBefore(cond_param, negate));
+  EXPECT_TRUE(ordering.ExecutesBefore(convert, negate));
+
+  EXPECT_FALSE(ordering.ExecutesBefore(body_param, cond_param));
 }
 
 }  // namespace
-
 }  // namespace xla
 
 int main(int argc, char** argv) {
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index 119e2d79022dca094147348d83c59b9a04cb339f..4b824f8240074e7ae70b9d9fa82dfa0706d5b355 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <functional>
 
+#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -30,9 +31,10 @@ using ::tensorflow::strings::StrAppend;
 namespace xla {
 
 namespace {
-void DumpModule(const Compiler::HloDumper& dumper_, const HloModule& module,
+void DumpModule(const HloModule& module,
+
                 const string& message) {
-  dumper_(module, message);
+  hlo_graph_dumper::MaybeDumpHloModule(module, message);
   VLOG(2) << "HLO " << message << ":";
   XLA_VLOG_LINES(2, module.ToString());
 }
@@ -75,7 +77,7 @@ StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
     // Emit label containing: "after foo-pass, before bar-pass".
     message.clear();
     StrAppend(&message, prefix, ", before ", pass->name());
-    DumpModule(dumper_, *module, message);
+    DumpModule(*module, message);
 
     TF_RETURN_IF_ERROR(run_invariant_checkers());
     TF_ASSIGN_OR_RETURN(bool changed_this_pass, pass->Run(module));
@@ -85,7 +87,7 @@ StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
     StrAppend(&prefix, name(), ": after ", pass->name());
   }
   TF_RETURN_IF_ERROR(run_invariant_checkers());
-  DumpModule(dumper_, *module, prefix + ", pipeline end");
+  DumpModule(*module, prefix + ", pipeline end");
   return changed;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.h b/tensorflow/compiler/xla/service/hlo_pass_pipeline.h
index 682c4b952df6aae8cb933c222772dbd823070ecc..a42d7e59fed2d838dfe3cb7f99e6b946edfdb0b4 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.h
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.h
@@ -22,7 +22,6 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/ptr_util.h"
-#include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -34,9 +33,7 @@ namespace xla {
 // Pipeline of HLO passes.
 class HloPassPipeline : public HloPassInterface {
  public:
-  explicit HloPassPipeline(const string& name,
-                           const Compiler::HloDumper& dumper)
-      : name_(name), dumper_(dumper) {}
+  explicit HloPassPipeline(const string& name) : name_(name) {}
   tensorflow::StringPiece name() const override { return name_; }
 
   // Add a pass to the pipeline. It should be called with the arguments for the
@@ -69,7 +66,6 @@ class HloPassPipeline : public HloPassInterface {
 
  private:
   const string name_;
-  Compiler::HloDumper dumper_;
   std::vector<std::unique_ptr<HloPassInterface>> passes_;
   std::vector<std::unique_ptr<HloPassInterface>> invariant_checkers_;
   bool run_called_ = false;
diff --git a/tensorflow/compiler/xla/service/hlo_query.cc b/tensorflow/compiler/xla/service/hlo_query.cc
index a153d73dbd838663c0d7e0d72ad54668f243f2c2..d45038f1f4a2e4aa19234eec93fdc9a068a902e1 100644
--- a/tensorflow/compiler/xla/service/hlo_query.cc
+++ b/tensorflow/compiler/xla/service/hlo_query.cc
@@ -25,7 +25,7 @@ namespace hlo_query {
 bool IsConstantR0F32(HloInstruction* instruction, float* out) {
   if (instruction->opcode() == HloOpcode::kConstant &&
       ShapeUtil::IsScalarF32(instruction->shape())) {
-    *out = LiteralUtil::Get<float>(instruction->literal(), {});
+    *out = instruction->literal().Get<float>({});
     return true;
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_reachability.cc b/tensorflow/compiler/xla/service/hlo_reachability.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fb7ecbdc2a09e6e797d283675ccf2c26f9c1a34c
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_reachability.cc
@@ -0,0 +1,61 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_reachability.h"
+
+namespace xla {
+
+HloReachabilityMap::HloReachabilityMap(
+    const std::list<HloInstruction*>& instructions)
+    : size_(instructions.size()) {
+  bit_vectors_.reserve(size_);
+  for (const HloInstruction* hlo : instructions) {
+    indices_[hlo] = bit_vectors_.size();
+    bit_vectors_.emplace_back(size_);
+  }
+  CHECK_EQ(size_, indices_.size());  // instructions should be unique
+}
+
+bool HloReachabilityMap::SetReachabilityToUnion(
+    tensorflow::gtl::ArraySlice<const HloInstruction*> inputs,
+    const HloInstruction* instruction) {
+  BitVector& bit_vector = GetBitVector(instruction);
+  tmp_bit_vector_ = bit_vector;
+
+  bit_vector.SetToZero();
+  bit_vector.Set(GetIndex(instruction));
+  for (const HloInstruction* input : inputs) {
+    bit_vector.OrWith(GetBitVector(input));
+  }
+
+  return bit_vector != tmp_bit_vector_;
+}
+
+void HloReachabilityMap::SetReachable(const HloInstruction* a,
+                                      const HloInstruction* b) {
+  GetBitVector(b).Set(GetIndex(a));
+}
+
+bool HloReachabilityMap::IsReachable(const HloInstruction* a,
+                                     const HloInstruction* b) const {
+  return GetBitVector(b).Get(GetIndex(a));
+}
+
+bool HloReachabilityMap::IsConnected(const HloInstruction* a,
+                                     const HloInstruction* b) const {
+  return IsReachable(a, b) || IsReachable(b, a);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_reachability.h b/tensorflow/compiler/xla/service/hlo_reachability.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7bdac9c86579f19afbba133772c2c50894853d1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_reachability.h
@@ -0,0 +1,138 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_REACHABILITY_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_REACHABILITY_H_
+
+#include <list>
+#include <vector>
+
+#include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+class HloInstruction;
+
+// A class for computing and representing reachability between HloInstructions.
+class HloReachabilityMap {
+ public:
+  // Sets up an empty reachable matrix for the full set of instructions
+  // specified in 'instructions'.
+  explicit HloReachabilityMap(const std::list<HloInstruction*>& instructions);
+
+  // Set the reachability set of 'instruction' to the union of the reachability
+  // sets of 'inputs'. Upon return, IsReachable(x, instruction) where
+  // 'x' is not 'instruction' will return true iff IsReachable(x, input) is true
+  // for some 'input' in 'inputs'. Also sets 'instruction' to be reachable from
+  // itself. Returns whether the reachability set of 'instruction' changed.
+  bool SetReachabilityToUnion(
+      tensorflow::gtl::ArraySlice<const HloInstruction*> inputs,
+      const HloInstruction* instruction);
+
+  // Sets entry so that IsReachable(a, b) will return true
+  void SetReachable(const HloInstruction* a, const HloInstruction* b);
+
+  // Returns true if "b" is reachable from "a"
+  bool IsReachable(const HloInstruction* a, const HloInstruction* b) const;
+
+  // Returns true if "b" is reachable from "a" or "a" is reachable from "b"
+  bool IsConnected(const HloInstruction* a, const HloInstruction* b) const;
+
+ private:
+  // A bit-vector implementation specialized for this use case which provides a
+  // fast bitwise OR operation not available in tensorflow::gtl::BitMap.
+  class BitVector {
+   public:
+    BitVector() = default;
+    BitVector(size_t size)
+        : size_(size), vector_((size + kBits - 1) / kBits, 0) {}
+
+    // Return the bit at the given index.
+    bool Get(size_t index) const {
+      DCHECK(index >= 0 && index < size_);
+      return vector_[index / kBits] & (1ull << (index % kBits));
+    }
+
+    // Set the bit at the given index.
+    void Set(size_t index) {
+      DCHECK(index >= 0 && index < size_);
+      vector_[index / kBits] |= 1ull << (index % kBits);
+    }
+
+    // Set this bitvector to the Logical OR of this bitvector and 'other'.
+    void OrWith(const BitVector& other) {
+      for (size_t i = 0; i < vector_.size(); ++i) {
+        vector_[i] |= other.vector_[i];
+      }
+    }
+
+    // Set the bitvector to all zeros.
+    void SetToZero() { std::fill(vector_.begin(), vector_.end(), 0); }
+
+    bool operator==(const BitVector& other) const {
+      return vector_ == other.vector_;
+    }
+    bool operator!=(const BitVector& other) const {
+      return vector_ != other.vector_;
+    }
+
+   private:
+    using Word = uint64;
+    static const size_t kBits = 64;
+
+    // Number of bits in the bitvector.
+    size_t size_;
+
+    std::vector<Word> vector_;
+  };
+
+  // Return the bitvector storing the reachability-to of the given instruction.
+  const BitVector& GetBitVector(const HloInstruction* instruction) const {
+    return bit_vectors_[GetIndex(instruction)];
+  }
+  BitVector& GetBitVector(const HloInstruction* instruction) {
+    return bit_vectors_[GetIndex(instruction)];
+  }
+
+  // Return the index of the given instruction. The value is used to index into
+  // the vector of BitVectors and the BitVectors themselves.
+  int GetIndex(const HloInstruction* instruction) const {
+    return FindOrDie(indices_, instruction);
+  }
+
+  // The number of instructions in the reachability map.
+  const size_t size_;
+
+  // Dense assignment from HloInstruction* to number. These numbers index
+  // into the bit_vectors_ vector and into the bits within a BitVector.
+  tensorflow::gtl::FlatMap<const HloInstruction*, int> indices_;
+
+  // Bitvectors holding the reachability to each instruction. The bit vector for
+  // instruction X includes ones for each instruction which X is reachable from.
+  std::vector<BitVector> bit_vectors_;
+
+  // A temporary used by SetReachabilityToUnion to avoid an allocation with each
+  // call to the method.
+  BitVector tmp_bit_vector_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_REACHABILITY_H_
diff --git a/tensorflow/compiler/xla/service/hlo_reachability_test.cc b/tensorflow/compiler/xla/service/hlo_reachability_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..657a9ee83d29e72b95660325f9139f44159d6508
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_reachability_test.cc
@@ -0,0 +1,86 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_reachability.h"
+
+#include <set>
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+
+namespace xla {
+
+namespace {
+
+class HloReachabilityTest : public HloTestBase {};
+
+TEST_F(HloReachabilityTest, Reachability) {
+  // Construct and test a reachability graph of the following form:
+  /*
+       a
+      / \
+     b   c
+      \ / \
+       d   e
+  */
+  auto builder = HloComputation::Builder(TestName());
+  auto a = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
+  auto b = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
+  auto c = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
+  auto d = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
+  auto e = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
+  builder.Build();
+
+  HloReachabilityMap reachability({a, b, c, d, e});
+  reachability.SetReachable(a, a);
+  EXPECT_TRUE(reachability.SetReachabilityToUnion({a}, b));
+  EXPECT_TRUE(reachability.SetReachabilityToUnion({a}, c));
+  EXPECT_TRUE(reachability.SetReachabilityToUnion({b, c}, d));
+  EXPECT_TRUE(reachability.SetReachabilityToUnion({c}, e));
+
+  EXPECT_TRUE(reachability.IsReachable(a, a));
+  EXPECT_TRUE(reachability.IsReachable(a, b));
+  EXPECT_TRUE(reachability.IsReachable(a, c));
+  EXPECT_TRUE(reachability.IsReachable(a, d));
+  EXPECT_TRUE(reachability.IsReachable(a, e));
+
+  EXPECT_FALSE(reachability.IsReachable(b, a));
+  EXPECT_TRUE(reachability.IsReachable(b, b));
+  EXPECT_FALSE(reachability.IsReachable(b, c));
+  EXPECT_TRUE(reachability.IsReachable(b, d));
+  EXPECT_FALSE(reachability.IsReachable(b, e));
+
+  EXPECT_FALSE(reachability.IsReachable(e, a));
+  EXPECT_FALSE(reachability.IsReachable(e, b));
+  EXPECT_FALSE(reachability.IsReachable(e, c));
+  EXPECT_FALSE(reachability.IsReachable(e, d));
+  EXPECT_TRUE(reachability.IsReachable(e, e));
+
+  // Recomputing the same reachability for a previously computed instruction
+  // should return false (no change).
+  EXPECT_FALSE(reachability.SetReachabilityToUnion({a}, b));
+  EXPECT_FALSE(reachability.SetReachabilityToUnion({b, c}, d));
+}
+
+}  // namespace
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index 2c1b0fff4e602a172cfa54d4eaa626198a426873..fd08796e50383ab9ad1aff4f19e8c67fd72a9a63 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
+#include "tensorflow/compiler/xla/service/hlo_scheduling.h"
 #include "tensorflow/compiler/xla/service/liveness_util.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -58,9 +59,8 @@ bool IsRematerializable(const HloInstruction* instruction) {
     return false;
   }
 
-  // Don't rematerialize instructions with side effects, those with a cost that
-  // might not be captured by HloCostAnalysis, or instructions which cannot be
-  // cloned safely.
+  // Don't rematerialize instructions with side effects or instructions which
+  // cannot be cloned safely.
   switch (instruction->opcode()) {
     case HloOpcode::kCall:
     case HloOpcode::kConstant:
@@ -802,23 +802,14 @@ bool MemoryUsageTracker::Check() const {
 // Computes and returns the cost of rematerializing the given instruction.
 // Cost per rematerialized instruction is defined as:
 //
-// (flop_count + transcendental_count + element_count) / memory_reduced
+// memory_limit_bytes / memory_reduced
 //
-//   flop_count: from HloCostAnalysis
-//   transcendental_count: from HloCostAnalysis
-//   element_count: number of elements accessed in operands and output of
-//     instruction
-//   memory_reduced: The memory usage reduced by rematerializing the
-//     instruction.
-//
-// This is a rough estimate of the extra execution time per byte saved by
-// rematerializing this instruction for its remaining uses. In general, we
-// want the most memory saving for the least latency penalty which is captured
-// by this heuristic.
+// The idea is to choose the operation that will save the most memory for
+// rematerialization and do not worry about how much the compute costs since
+// running out of memory is more harmful than taking longer to get the answer.
 int64 RematerializationCost(const HloInstruction* instruction,
                             const MemoryUsageTracker& memory_tracker,
-                            const HloCostAnalysis& cost_analysis,
-                            int64 memory_reduced) {
+                            int64 memory_reduced, int64 memory_limit_bytes) {
   // If none of the users of 'instruction' have been placed in the sequence (as
   // tracked by memory_tracker), then rematerialization of 'instruction' is a
   // zero-cost move of 'instruction' in the sequence.
@@ -830,22 +821,8 @@ int64 RematerializationCost(const HloInstruction* instruction,
   }
 
   CHECK_GT(memory_reduced, 0);
-  const int64 bytes_accessed = cost_analysis.bytes_accessed(*instruction);
-  const int64 elements_accessed =
-      ShapeUtil::IsTuple(instruction->shape())
-          ? bytes_accessed
-          : bytes_accessed / ShapeUtil::ByteSizeOfPrimitiveType(
-                                 instruction->shape().element_type());
-
-  // Multiply by 256 to improve precision of cost. Without this factor,
-  // many instructions such as many elementwise instructions would have
-  // zero cost because the bytes reduced can be several times greater than
-  // the element count.
-  return 256 *
-         (cost_analysis.flop_count(*instruction) +
-          cost_analysis.transcendental_count(*instruction) +
-          elements_accessed) /
-         memory_reduced;
+  // Return the inverse of the benefit of rematerialization.
+  return memory_limit_bytes / memory_reduced;
 }
 
 // Selects and returns the best candidate instruction for rematerialization.
@@ -856,8 +833,8 @@ int64 RematerializationCost(const HloInstruction* instruction,
 HloInstruction* PickRematerializationCandidate(
     const MemoryUsageTracker& memory_tracker,
     const InstructionList& instruction_list,
-    const HloCostAnalysis& cost_analysis,
-    const tensorflow::gtl::FlatSet<const HloInstruction*>& blacklist) {
+    const tensorflow::gtl::FlatSet<const HloInstruction*>& blacklist,
+    int64 memory_limit_bytes) {
   HloInstruction* best = nullptr;
   int64 best_cost = 0;
 
@@ -891,12 +868,12 @@ HloInstruction* PickRematerializationCandidate(
 
     if (memory_reduced <= 0) {
       VLOG(5) << "candidate " << candidate->name()
-              << " memory reduced = " << memory_reduced << " <= 0";
+              << " memory reduced = " << memory_reduced << " <=  0";
       continue;
     }
 
     const int cost = RematerializationCost(candidate, memory_tracker,
-                                           cost_analysis, memory_reduced);
+                                           memory_reduced, memory_limit_bytes);
 
     VLOG(5) << "candidate " << candidate->name() << ", memory reduced "
             << memory_reduced << ", cost per byte " << cost;
@@ -1011,7 +988,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
               << ", limit is " << HumanReadableNumBytes(memory_limit_bytes);
 
       HloInstruction* best = PickRematerializationCandidate(
-          memory_tracker, instruction_list, cost_analysis_, blacklist);
+          memory_tracker, instruction_list, blacklist, memory_limit_bytes);
 
       if (best == nullptr) {
         VLOG(3) << "Unable to find rematerialization candidate at program "
@@ -1211,11 +1188,6 @@ StatusOr<bool> HloRematerialization::Run(
   VLOG(1) << "Peak memory usage of module (before): "
           << HumanReadableNumBytes(before_peak_memory);
 
-  // Run cost analysis. Operation cost is used in the heuristic for selecting
-  // instructions for rematerialization.
-  TF_RETURN_IF_ERROR(
-      module->entry_computation()->root_instruction()->Accept(&cost_analysis_));
-
   // Subcomputations called by the entry computation will also be
   // rematerialized.
   TF_ASSIGN_OR_RETURN(bool changed, RematerializeComputation(
@@ -1230,6 +1202,9 @@ StatusOr<bool> HloRematerialization::Run(
   // After DCE, the module sequence may include instructions which no longer
   // exist.
   for (const auto& computation : module->computations()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
     if (sequence->at(computation.get()).size() !=
         computation->instruction_count()) {
       // A size mismatch between the computation instruction count and the size
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h
index 1693f93183bc59c343e3c765cb4051566d4377ef..42c279d440b78d90b9f19b92155c52787156e4b7 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.h
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h
@@ -18,7 +18,6 @@
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
@@ -61,7 +60,7 @@ class HloRematerialization {
 
  protected:
   HloRematerialization(const ShapeSizeFunction& size_function)
-      : size_function_(size_function), cost_analysis_(size_function_) {}
+      : size_function_(size_function) {}
   ~HloRematerialization() {}
 
   // Runs rematerialization on the given module. Returns whether the module was
@@ -100,9 +99,6 @@ class HloRematerialization {
   // Call graph of the hlo_module.
   std::unique_ptr<CallGraph> call_graph_;
 
-  // Analysis used for computing the rematerialization cost of instructions.
-  HloCostAnalysis cost_analysis_;
-
   // The peak memory usage of each computation. The map contains only those
   // computations called from sequential context
   // (CallContext::kSequential). These values are updated as rematerialization
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
index f306bcc309c6c5e57a311496ee0370741de8a6ab..2358969f38ee66e3eb024215cba4c62da3d6a32f 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -126,7 +126,7 @@ class HloRematerializationTest : public HloTestBase {
     builder.AddInstruction(
         HloInstruction::CreateParameter(0, vec1_shape_, "param"));
     builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+        HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
     return builder.Build();
   }
 
@@ -158,7 +158,7 @@ TEST_F(HloRematerializationTest, SingleComputation) {
   SequentialHloOrdering::HloModuleSequence sequence;
   // Computation requires 16KB without rematerialization, but uses only 12KB
   // with rematerialization so pick a memory limit between these values (14KB).
-  TF_ASSIGN_OR_ASSERT_OK(
+  TF_ASSERT_OK_AND_ASSIGN(
       bool changed,
       HloRematerialization::RematerializeAndSchedule(
           ByteSizeOf,
@@ -191,7 +191,7 @@ TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) {
   EXPECT_EQ(computation->instruction_count(), 7);
 
   SequentialHloOrdering::HloModuleSequence sequence;
-  TF_ASSIGN_OR_ASSERT_OK(
+  TF_ASSERT_OK_AND_ASSIGN(
       bool changed,
       HloRematerialization::RematerializeAndSchedule(
           ByteSizeOf,
@@ -215,7 +215,7 @@ TEST_F(HloRematerializationTest, RematerializeAroundWhile) {
   cond_builder.AddInstruction(
       HloInstruction::CreateParameter(0, vec1_shape_, "param"));
   cond_builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
   HloComputation* while_cond =
       module->AddEmbeddedComputation(cond_builder.Build());
 
@@ -232,7 +232,7 @@ TEST_F(HloRematerializationTest, RematerializeAroundWhile) {
   // while so the peak memory use of the module is 18KB. Set the memory limit a
   // bit lower (17KB) to force rematerialization of the entry computation.
   SequentialHloOrdering::HloModuleSequence sequence;
-  TF_ASSIGN_OR_ASSERT_OK(
+  TF_ASSERT_OK_AND_ASSIGN(
       bool changed,
       HloRematerialization::RematerializeAndSchedule(
           ByteSizeOf,
@@ -254,7 +254,7 @@ TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) {
   cond_builder.AddInstruction(
       HloInstruction::CreateParameter(0, vec1_shape_, "param"));
   cond_builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
   HloComputation* while_cond =
       module->AddEmbeddedComputation(cond_builder.Build());
 
@@ -268,7 +268,7 @@ TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) {
   EXPECT_EQ(body_computation->instruction_count(), 7);
 
   SequentialHloOrdering::HloModuleSequence sequence;
-  TF_ASSIGN_OR_ASSERT_OK(
+  TF_ASSERT_OK_AND_ASSIGN(
       bool changed,
       HloRematerialization::RematerializeAndSchedule(
           ByteSizeOf,
@@ -289,7 +289,7 @@ TEST_F(HloRematerializationTest, RematerializeNestedComputations) {
   cond_builder.AddInstruction(
       HloInstruction::CreateParameter(0, vec1_shape_, "param"));
   cond_builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
   HloComputation* while_cond =
       module->AddEmbeddedComputation(cond_builder.Build());
 
@@ -310,7 +310,7 @@ TEST_F(HloRematerializationTest, RematerializeNestedComputations) {
   // If all computations are maximally rematerialized then peak memory usage is
   // ~12K so pick something slightly larger.
   SequentialHloOrdering::HloModuleSequence sequence;
-  TF_ASSIGN_OR_ASSERT_OK(
+  TF_ASSERT_OK_AND_ASSIGN(
       bool changed,
       HloRematerialization::RematerializeAndSchedule(
           ByteSizeOf,
@@ -357,7 +357,7 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
         /*dimension=*/0));
     builder.AddInstruction(HloInstruction::CreateSlice(
         vec1024_shape_, concat, /*start_indices=*/{0},
-        /*limit_indices=*/{1024}, /*slices=*/{1}));
+        /*limit_indices=*/{1024}, /*strides=*/{1}));
     subcomputation = module->AddEmbeddedComputation(builder.Build());
   }
 
@@ -406,7 +406,7 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
   // Pick a memory limit some where between 24KB (initial peak memory including
   // parameter and output) and 20KB (peak memory possible with
   // rematerialization).
-  TF_ASSIGN_OR_ASSERT_OK(
+  TF_ASSERT_OK_AND_ASSIGN(
       bool changed,
       HloRematerialization::RematerializeAndSchedule(
           ByteSizeOf,
@@ -473,7 +473,7 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
         /*dimension=*/0));
     builder.AddInstruction(HloInstruction::CreateSlice(
         vec1024_shape_, concat, /*start_indices=*/{0},
-        /*limit_indices=*/{1024}, /*slices=*/{1}));
+        /*limit_indices=*/{1024}, /*strides=*/{1}));
     subcomputation = module->AddEmbeddedComputation(builder.Build());
   }
 
@@ -503,7 +503,7 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
   // Pick a memory limit some where between 24KB (initial peak memory including
   // parameter and output) and 20KB (peak memory possible with
   // rematerialization).
-  TF_ASSIGN_OR_ASSERT_OK(
+  TF_ASSERT_OK_AND_ASSIGN(
       bool changed,
       HloRematerialization::RematerializeAndSchedule(
           ByteSizeOf,
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.cc b/tensorflow/compiler/xla/service/hlo_scheduling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..922236ee1e79c65719f128c598a5de65d7fc1ab7
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_scheduling.cc
@@ -0,0 +1,423 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_scheduling.h"
+
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/heap_simulator.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+StatusOr<int64> MinimumMemoryForSequence(
+    const SequentialHloOrdering::HloModuleSequence& module_sequence,
+    const LogicalBuffer::SizeFunction& size_function) {
+  if (module_sequence.empty()) {
+    return 0;
+  }
+
+  const HloModule* module = module_sequence.begin()->first->parent();
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
+                      TuplePointsToAnalysis::Run(module));
+
+  // The absolute minimum memory required for a given sequence of instructions
+  // is determined by the sequence of Alloc and Free calls on a simulated heap,
+  // ignoring fragmentation. We run the heap simulation on the whole module,
+  // rather than summing each computation, since it gives us a better lower
+  // bound, by minimizing the liveness of sub-computations.
+  TF_ASSIGN_OR_RETURN(
+      HeapSimulator::Result result,
+      HeapSimulator::Run(MakeUnique<NoFragmentationStatsHeap>(), *module,
+                         module_sequence, *points_to_analysis, size_function));
+  return result.heap_size;
+}
+
+namespace {
+
+// Class implementing a list scheduler of HLO instructions which produces a
+// sequence which minimizes memory usage.
+class ListScheduler {
+ public:
+  // Construct and return a memory-minimizing sequence of HLO instructions
+  // containing the given HLO computation.
+  static StatusOr<std::vector<const HloInstruction*>> Run(
+      const HloComputation& computation,
+      const TuplePointsToAnalysis& points_to_analysis,
+      const LogicalBuffer::SizeFunction& size_function) {
+    ListScheduler scheduler(computation, points_to_analysis, size_function);
+    return scheduler.CreateSchedule();
+  }
+
+ private:
+  // The scheduling priority of an instruction is first the number of bytes
+  // freed by scheduling the instruction, and second (tie-breaker) by the number
+  // of users. This is represented as a std::pair containing these two values
+  // (first element is the bytes freed). std::pair provides the necessary
+  // comparison operators.
+  using Priority = std::pair<int64, int64>;
+
+  ListScheduler(const HloComputation& computation,
+                const TuplePointsToAnalysis& points_to_analysis,
+                const LogicalBuffer::SizeFunction& size_function)
+      : computation_(computation),
+        points_to_analysis_(points_to_analysis),
+        size_function_(size_function) {
+    // Create a map containing the LogicalBuffer uses for each HLO
+    // instruction. An HLO instruction "uses" a LogicalBuffer if the
+    // LogicalBuffer is in an operand of the instruction as indicated by
+    // points-to analysis.
+    for (auto& instruction : computation.instructions()) {
+      std::unordered_set<const LogicalBuffer*> instr_uses;
+      for (auto* operand : instruction->operands()) {
+        for (const LogicalBuffer* buffer :
+             points_to_analysis.GetBuffersDefinedByInstruction(operand)) {
+          instr_uses.insert(buffer);
+        }
+      }
+      buffer_uses_[instruction.get()] = std::vector<const LogicalBuffer*>(
+          instr_uses.begin(), instr_uses.end());
+    }
+
+    // Create map containing the number of unscheduled uses (hlo instructions)
+    // of each logical buffer.
+    for (auto& instruction : computation.instructions()) {
+      for (auto* buffer : points_to_analysis.GetBuffersDefinedByInstruction(
+               instruction.get())) {
+        unscheduled_use_count_[buffer] = 0;
+      }
+    }
+    for (auto& instruction : computation.instructions()) {
+      for (const LogicalBuffer* buffer : buffer_uses_.at(instruction.get())) {
+        ++unscheduled_use_count_[buffer];
+      }
+    }
+
+    // Buffers live out of the computation have an implicit use at the end of
+    // the computation.
+    for (const LogicalBuffer* live_out_buffer :
+         points_to_analysis.GetPointsToSet(computation.root_instruction())
+             .CreateFlattenedSet()) {
+      ++unscheduled_use_count_[live_out_buffer];
+    }
+  }
+
+  // Returns whether the memory used by the given buffer should be ignored by
+  // the scheduling heuristic.
+  bool IgnoreBuffer(const LogicalBuffer& buffer) {
+    return buffer.instruction()->opcode() == HloOpcode::kParameter ||
+           buffer.instruction()->opcode() == HloOpcode::kConstant;
+  }
+
+  // An entry in the worklist used by CreateSchedule.  Corresponds to one
+  // HloInstruction, plus some cached metadata, saved for the purposes of making
+  // BytesFreedIfScheduled fast.
+  struct ReadyListEntry {
+    const HloInstruction* instruction;
+
+    // The total size of all buffers defined by this instruction.
+    int64 bytes_defined;
+
+    // For each buffer B used by this instruction, we keep a pair (B, U), where
+    // U is the number of uses of B that have not yet been scheduled.  This pair
+    // is a pointer into the unscheduled_use_count_ map, so it gets updated for
+    // free when we update counts in the map.
+    std::vector<const std::pair<const LogicalBuffer* const, int64>*>
+        used_buffer_unscheduled_use_counts;
+  };
+
+  // Creates a ReadyListEntry for the given instruction.
+  ReadyListEntry MakeReadyListEntry(const HloInstruction* instruction) {
+    ReadyListEntry entry;
+    entry.instruction = instruction;
+
+    entry.bytes_defined = 0;
+    for (auto* buffer :
+         points_to_analysis_.GetBuffersDefinedByInstruction(instruction)) {
+      if (!IgnoreBuffer(*buffer)) {
+        entry.bytes_defined += size_function_(*buffer);
+      }
+    }
+
+    for (auto* buffer : buffer_uses_.at(instruction)) {
+      if (IgnoreBuffer(*buffer)) {
+        continue;
+      }
+      auto unscheduled_use_count_it = unscheduled_use_count_.find(buffer);
+      CHECK(unscheduled_use_count_it != unscheduled_use_count_.end());
+      entry.used_buffer_unscheduled_use_counts.push_back(
+          &*unscheduled_use_count_it);
+    }
+    return entry;
+  }
+
+  // Returns the number of bytes freed if the HLO instruction is scheduled.
+  int64 BytesFreedIfScheduled(const ReadyListEntry& entry) {
+    int64 freed_bytes = 0;
+    for (const auto& kv : entry.used_buffer_unscheduled_use_counts) {
+      auto buffer = kv->first;
+      auto use_count = kv->second;
+      if (use_count == 1) {
+        freed_bytes += size_function_(*buffer);
+      }
+    }
+    return freed_bytes - entry.bytes_defined;
+  }
+
+  // Constructs the scheduling priority of the given instruction.
+  Priority GetPriority(const ReadyListEntry& entry) {
+    return {BytesFreedIfScheduled(entry), entry.instruction->user_count()};
+  }
+
+  std::vector<const HloInstruction*> CreateSchedule() {
+    std::vector<const HloInstruction*> schedule;
+
+    // Populate the ready list with instructions which have no operands or
+    // control predecessors.
+    std::unordered_map<const HloInstruction*, int64> unscheduled_pred_count;
+    for (auto& instruction : computation_.instructions()) {
+      // TODO(b/34466113): Replace this and above with successors() or
+      // predecessors() when these methods are added to HloInstruction.
+      for (const HloInstruction* user : instruction->users()) {
+        unscheduled_pred_count[user]++;
+      }
+      for (const HloInstruction* succ : instruction->control_successors()) {
+        unscheduled_pred_count[succ]++;
+      }
+    }
+
+    std::list<ReadyListEntry> ready_list;
+    for (auto& instruction : computation_.instructions()) {
+      // Instruction with no operands or control predecessors will
+      // not be in the map.
+      if (unscheduled_pred_count.count(instruction.get()) == 0) {
+        ready_list.push_back(MakeReadyListEntry(instruction.get()));
+      }
+    }
+
+    while (!ready_list.empty()) {
+      // Select the highest priority HLO instruction from the ready list.
+      auto best_it = ready_list.begin();
+      Priority best_priority = GetPriority(*best_it);
+      for (auto ready_it = std::next(ready_list.begin());
+           ready_it != ready_list.end(); ++ready_it) {
+        Priority priority = GetPriority(*ready_it);
+        if (priority > best_priority) {
+          best_it = ready_it;
+          best_priority = priority;
+        }
+      }
+
+      // Remove the selected instruction from the ready list and add it to the
+      // schedule.
+      const HloInstruction* best = best_it->instruction;
+      ready_list.erase(best_it);
+      schedule.push_back(best);
+      scheduled_instructions_.insert(best);
+
+      // Update the unscheduled uses of the logical buffers.
+      for (const LogicalBuffer* buffer : buffer_uses_.at(best)) {
+        CHECK_GT(unscheduled_use_count_.at(buffer), 0);
+        --unscheduled_use_count_[buffer];
+      }
+
+      // Add new instructions to ready list.
+      auto update_pred_count = [&](HloInstruction* inst) {
+        int64 pred_count = --unscheduled_pred_count.at(inst);
+        CHECK_GE(pred_count, 0);
+        if (pred_count == 0) {
+          ready_list.push_back(MakeReadyListEntry(inst));
+        }
+      };
+      // TODO(b/34466113): Replace this and above with successors() or
+      // predecessors() when these methods are added to HloInstruction.
+      for (HloInstruction* user : best->users()) {
+        update_pred_count(user);
+      }
+      for (HloInstruction* succ : best->control_successors()) {
+        update_pred_count(succ);
+      }
+    }
+    CHECK_EQ(schedule.size(), computation_.instructions().size());
+    CHECK_EQ(scheduled_instructions_.size(),
+             computation_.instructions().size());
+
+    return schedule;
+  }
+
+  const HloComputation& computation_;
+  const TuplePointsToAnalysis& points_to_analysis_;
+  const LogicalBuffer::SizeFunction& size_function_;
+
+  // A map containing the LogicalBuffers that each instruction uses.
+  std::unordered_map<const HloInstruction*, std::vector<const LogicalBuffer*>>
+      buffer_uses_;
+
+  // A map containing the count of unscheduled HLOs which using a particular
+  // LogicalBuffer.  We rely on iterator stability in this map.
+  std::unordered_map<const LogicalBuffer*, int64> unscheduled_use_count_;
+
+  // Set of instructions which have been scheduled.
+  std::unordered_set<const HloInstruction*> scheduled_instructions_;
+};
+
+int64 SumLogicalBufferSizes(const std::vector<const LogicalBuffer*>& buffers,
+                            const LogicalBuffer::SizeFunction& size_function) {
+  int64 size = 0;
+  for (const LogicalBuffer* buffer : buffers) {
+    size += size_function(*buffer);
+  }
+  return size;
+}
+
+StatusOr<std::vector<const HloInstruction*>> RunDFSMemoryScheduler(
+    const HloComputation& computation,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function) {
+  // This ordering is based on DFS post-order, with a heuristic to decide which
+  // operand to visit first.  The heuristic is based on 'extra_users', which is
+  // simply users-1 for each instruction.  By subtracting 1, we're saying that
+  // instructions with no users or a single user don't count; instructions with
+  // lots of fan-out will be visited earlier.
+  tensorflow::gtl::FlatMap<const HloInstruction*, int64> extra_users;
+  tensorflow::gtl::FlatMap<const HloInstruction*, int64> total_sizes;
+  for (const HloInstruction* hlo : computation.MakeInstructionPostOrder()) {
+    extra_users[hlo] = hlo->users().empty() ? 0 : hlo->users().size() - 1;
+    total_sizes[hlo] = SumLogicalBufferSizes(
+        points_to_analysis.GetBuffersDefinedByInstruction(hlo), size_function);
+    tensorflow::gtl::FlatSet<const HloInstruction*> unique_operands(
+        hlo->operands().begin(), hlo->operands().end());
+    for (const HloInstruction* operand : unique_operands) {
+      extra_users[hlo] += extra_users[operand];
+      total_sizes[hlo] += total_sizes[operand];
+    }
+  }
+  CHECK_EQ(extra_users.size(), computation.instructions().size());
+  CHECK_EQ(total_sizes.size(), computation.instructions().size());
+
+  // Construct a total order based on DFS post-order, visiting operands in
+  // decreasing cumulative extra user order, and next by cumulative size, with a
+  // tiebreaker by name for determinism.
+  std::vector<const HloInstruction*> sequence;
+  FunctionVisitor visitor([&sequence](HloInstruction* hlo) {
+    sequence.push_back(hlo);
+    return Status::OK();
+  });
+  TF_RETURN_IF_ERROR(computation.AcceptWithOperandOrder(
+      &visitor, [&extra_users, &total_sizes](const HloInstruction* a,
+                                             const HloInstruction* b) {
+        if (extra_users[a] != extra_users[b]) {
+          return extra_users[a] > extra_users[b];
+        }
+        if (total_sizes[a] != total_sizes[b]) {
+          return total_sizes[a] > total_sizes[b];
+        }
+        return a->name() < b->name();
+      }));
+  CHECK_EQ(sequence.size(), computation.instructions().size());
+  return sequence;
+}
+
+StatusOr<int64> MinimumMemoryForComputation(
+    const HloComputation& computation,
+    const std::vector<const HloInstruction*>& sequence,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function) {
+  TF_ASSIGN_OR_RETURN(
+      HeapSimulator::Result result,
+      HeapSimulator::Run(MakeUnique<NoFragmentationStatsHeap>(), computation,
+                         sequence, points_to_analysis, size_function));
+  return result.heap_size;
+}
+
+StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
+    const HloComputation& computation,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function) {
+  // We try both a list-scheduler based ordering and a DFS based ordering, and
+  // choose whichever returns a lower min-memory, not accounting for
+  // fragmentation.
+  //
+  // Note that this is just a heuristic. One obvious inaccuracy is that the
+  // memory required for sub-computations might be different when considered
+  // within the caller's context. But it's good enough for now.
+  TF_ASSIGN_OR_RETURN(
+      std::vector<const HloInstruction*> list_sequence,
+      ListScheduler::Run(computation, points_to_analysis, size_function));
+  TF_ASSIGN_OR_RETURN(
+      const int64 list_memory,
+      MinimumMemoryForComputation(computation, list_sequence,
+                                  points_to_analysis, size_function));
+  VLOG(2) << "Min-memory list sequence: " << list_memory << " bytes";
+
+  TF_ASSIGN_OR_RETURN(
+      std::vector<const HloInstruction*> dfs_sequence,
+      RunDFSMemoryScheduler(computation, points_to_analysis, size_function));
+  TF_ASSIGN_OR_RETURN(
+      const int64 dfs_memory,
+      MinimumMemoryForComputation(computation, dfs_sequence, points_to_analysis,
+                                  size_function));
+  VLOG(2) << "Min-memory dfs sequence: " << dfs_memory << " bytes";
+
+  if (list_memory <= dfs_memory) {
+    VLOG(2) << "Chose min-memory list sequence: " << list_memory << " bytes";
+    return list_sequence;
+  } else {
+    VLOG(2) << "Chose min-memory dfs sequence: " << dfs_memory << " bytes";
+    return dfs_sequence;
+  }
+}
+
+}  // namespace
+
+StatusOr<SequentialHloOrdering::HloModuleSequence>
+CreateMemoryMinimizingSequence(
+    const HloModule& module, const LogicalBuffer::SizeFunction& size_function) {
+  SequentialHloOrdering::HloModuleSequence sequence;
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
+                      TuplePointsToAnalysis::Run(&module));
+  for (const auto& computation : module.computations()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
+    TF_ASSIGN_OR_RETURN(sequence[computation.get()],
+                        CreateMemoryMinimizingSequence(
+                            *computation, *points_to_analysis, size_function));
+  }
+  return sequence;
+}
+
+StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
+    const HloComputation& computation,
+    const LogicalBuffer::SizeFunction& size_function) {
+  CHECK(!computation.IsFusionComputation());
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
+                      TuplePointsToAnalysis::Run(computation.parent()));
+  return CreateMemoryMinimizingSequence(computation, *points_to_analysis,
+                                        size_function);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.h b/tensorflow/compiler/xla/service/hlo_scheduling.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec92a56b962152b15981f868369683144aa7c76a
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_scheduling.h
@@ -0,0 +1,50 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SCHEDULING_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SCHEDULING_H_
+
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
+#include "tensorflow/compiler/xla/service/logical_buffer.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+
+namespace xla {
+
+// Returns the minimum memory required to compute the given module sequence,
+// assuming no fragmentation.
+StatusOr<int64> MinimumMemoryForSequence(
+    const SequentialHloOrdering::HloModuleSequence& module_sequence,
+    const LogicalBuffer::SizeFunction& size_function);
+
+// Returns an HloModuleSequence which seeks to minimize the memory required for
+// the computation. size_function is the function returning the number of bytes
+// required for a LogicalBuffer.
+StatusOr<SequentialHloOrdering::HloModuleSequence>
+CreateMemoryMinimizingSequence(
+    const HloModule& module, const LogicalBuffer::SizeFunction& size_function);
+
+// Overload of above that computes the sequence for a single computation.
+StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
+    const HloComputation& computation,
+    const LogicalBuffer::SizeFunction& size_function);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SCHEDULING_H_
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d09d22ee40638c5beed3f4eaf3723be0f6b6bf96
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc
@@ -0,0 +1,97 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_scheduling.h"
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace {
+
+class MinimumMemoryForSequenceTest : public HloTestBase {};
+
+TEST_F(MinimumMemoryForSequenceTest, MultiComputation) {
+  auto module = CreateNewModule();
+  const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape});
+
+  auto cond_builder = HloComputation::Builder("WhileCond");
+  // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element)
+  HloInstruction* cond_param = cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "cond_param"));
+  HloInstruction* cond_iter = cond_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 0));
+  HloInstruction* cond_data = cond_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 1));
+  // Free cond_param[] (16 bytes), Alloc PRED[] (1 byte)
+  HloInstruction* cond_lt = cond_builder.AddInstruction(
+      HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}),
+                                   HloOpcode::kLt, cond_iter, cond_data));
+  HloComputation* cond_computation =
+      module->AddEmbeddedComputation(cond_builder.Build());
+
+  auto body_builder = HloComputation::Builder("WhileBody");
+  // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element)
+  HloInstruction* body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "body_param"));
+  HloComputation* body_computation =
+      module->AddEmbeddedComputation(body_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  // Entry params: 8 bytes (4 bytes per param), TOTAL=8
+  HloInstruction* iter = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "param_iter"));
+  HloInstruction* data = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "param_data"));
+  // Tuple: 16 bytes (8 bytes per pointer), TOTAL=24
+  HloInstruction* tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({iter, data}));
+  // While: 8 bytes (4 bytes per element), TOTAL=32
+  // Both cond and body use a max of 24 bytes, TOTAL=56
+  HloInstruction* while_op = builder.AddInstruction(HloInstruction::CreateWhile(
+      tuple_shape, cond_computation, body_computation, tuple));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+
+  auto size_fn = [](const LogicalBuffer& buffer) {
+    return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
+  };
+
+  SequentialHloOrdering::HloModuleSequence module_sequence;
+  module_sequence[cond_computation] = {cond_param, cond_iter, cond_data,
+                                       cond_lt};
+  module_sequence[body_computation] = {body_param};
+  module_sequence[entry_computation] = {iter, data, tuple, while_op};
+  EXPECT_EQ(56,
+            MinimumMemoryForSequence(module_sequence, size_fn).ValueOrDie());
+}
+
+}  // namespace
+}  // namespace xla
+
+int main(int argc, char** argv) {
+  return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv);
+}
diff --git a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
index 867ebc7f61aab1483622d1560d951c053e95f135..e3d287d4c91708577b712261842b6ae231fb188b 100644
--- a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
@@ -75,7 +75,7 @@ TEST_F(HloSubcomputationUnificationTest, UnifyIdentities) {
       module->AddEmbeddedComputation(CreateR0S32IdentityComputation());
 
   auto constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(5)));
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(5)));
   auto x = builder.AddInstruction(
       HloInstruction::CreateCall(r0s32_, {constant}, callee1));
   auto y = builder.AddInstruction(
@@ -89,12 +89,14 @@ TEST_F(HloSubcomputationUnificationTest, UnifyIdentities) {
   EXPECT_NE(x->to_apply(), y->to_apply());
   if (VLOG_IS_ON(1)) {
     hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "before unification", false, false, nullptr);
+                                "before unification",
+                                module->config().debug_options());
   }
   EXPECT_TRUE(HloSubcomputationUnification().Run(module.get()).ValueOrDie());
   if (VLOG_IS_ON(1)) {
     hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "after unification", false, false, nullptr);
+                                "after unification",
+                                module->config().debug_options());
   }
   EXPECT_EQ(2, module->computations().size());
   EXPECT_EQ(x->to_apply(), y->to_apply());
@@ -110,9 +112,9 @@ TEST_F(HloSubcomputationUnificationTest, UnifyAdditions) {
       module->AddEmbeddedComputation(CreateR0S32AdditionComputation());
 
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(5)));
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(5)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(3)));
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(3)));
   auto x = builder.AddInstruction(
       HloInstruction::CreateCall(r0s32_, {constant1, constant2}, callee1));
   auto y = builder.AddInstruction(
@@ -126,12 +128,14 @@ TEST_F(HloSubcomputationUnificationTest, UnifyAdditions) {
   EXPECT_NE(x->to_apply(), y->to_apply());
   if (VLOG_IS_ON(1)) {
     hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "before unification", false, false, nullptr);
+                                "before unification",
+                                module->config().debug_options());
   }
   EXPECT_TRUE(HloSubcomputationUnification().Run(module.get()).ValueOrDie());
   if (VLOG_IS_ON(1)) {
     hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "after unification", false, false, nullptr);
+                                "after unification",
+                                module->config().debug_options());
   }
   EXPECT_EQ(2, module->computations().size());
   EXPECT_EQ(x->to_apply(), y->to_apply());
@@ -164,12 +168,14 @@ TEST_F(HloSubcomputationUnificationTest, DifferentParameterShapes) {
   EXPECT_NE(x->to_apply(), y->to_apply());
   if (VLOG_IS_ON(1)) {
     hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "before unification", false, false, nullptr);
+                                "before unification",
+                                module->config().debug_options());
   }
   EXPECT_FALSE(HloSubcomputationUnification().Run(module.get()).ValueOrDie());
   if (VLOG_IS_ON(1)) {
     hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "after unification", false, false, nullptr);
+                                "after unification",
+                                module->config().debug_options());
   }
   EXPECT_EQ(3, module->computations().size());
   EXPECT_NE(x->to_apply(), y->to_apply());
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
index 6707b02c5c57262b0154ae6b23fdd61a198a8d70..76177462aa4959261483045296d2388acabe46a5 100644
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
@@ -171,8 +171,7 @@ void HloTfGraphBuilder::SetNodeAttrs(const HloInstruction* instruction,
       break;
     case HloOpcode::kConstant:
       if (ShapeUtil::IsScalar(instruction->shape())) {
-        attrs["value"].set_s(
-            LiteralUtil::GetAsString(instruction->literal(), {}));
+        attrs["value"].set_s(instruction->literal().GetAsString({}));
       }
       break;
     case HloOpcode::kCustomCall:
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
index c2718ea8003c9d2a8e3d65773b439aae915a30d0..8e9d93e367e51cb69f0a38ae7aa8d9539e78ad8a 100644
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
@@ -91,7 +91,7 @@ TEST_F(HloTfGraphBuilderTest, CheckConcatenateDimsAndShapes) {
 TEST_F(HloTfGraphBuilderTest, CheckScalarValue) {
   auto builder = HloComputation::Builder("Const");
   HloInstruction *instruction = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0(123)));
+      HloInstruction::CreateConstant(Literal::CreateR0(123)));
   OpMetadata metadata;
   metadata.set_op_name("x");
   metadata.set_op_type("y");
diff --git a/tensorflow/compiler/xla/service/hlo_value.cc b/tensorflow/compiler/xla/service/hlo_value.cc
new file mode 100644
index 0000000000000000000000000000000000000000..221f67b0c1cd280d88c408f69deab12ed51a8b93
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_value.cc
@@ -0,0 +1,327 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_value.h"
+
+#include <algorithm>
+#include <utility>
+
+#include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+using ::tensorflow::str_util::Join;
+using ::tensorflow::strings::StrAppend;
+using ::tensorflow::strings::StrCat;
+
+const Shape& HloPosition::shape() const {
+  return ShapeUtil::GetSubshape(instruction->shape(), index);
+}
+
+string HloPosition::ToString() const {
+  string index_str =
+      ShapeUtil::IsTuple(instruction->shape()) ? (" " + index.ToString()) : "";
+  return StrCat(instruction->name(), index_str);
+}
+
+std::ostream& operator<<(std::ostream& out, const HloPosition& position) {
+  out << position.ToString();
+  return out;
+}
+
+string HloUse::ToString() const {
+  string index_str =
+      ShapeUtil::IsTuple(instruction->operand(operand_number)->shape())
+          ? (" " + operand_index.ToString())
+          : "";
+  return StrCat(instruction->name(), ", operand ", operand_number, index_str);
+}
+
+std::ostream& operator<<(std::ostream& out, const HloUse& use) {
+  out << use.ToString();
+  return out;
+}
+
+HloValue::HloValue(HloValue::Id id, HloInstruction* instruction,
+                   const ShapeIndex& index, bool is_phi)
+    : id_(id), is_phi_(is_phi) {
+  // The defining position is always the first element in the positions_ vector.
+  AddPosition(instruction, index);
+}
+
+bool HloValue::operator==(const HloValue& other) const {
+  bool equal = defining_instruction() == other.defining_instruction() &&
+               defining_index() == other.defining_index();
+  // If the values are equal they most both be phi (or non phi).
+  CHECK(!(equal && is_phi() != other.is_phi()));
+  return equal;
+}
+
+bool HloValue::operator!=(const HloValue& other) const {
+  return !(*this == other);
+}
+
+string HloValue::ToShortString() const {
+  string index_str = ShapeUtil::IsTuple(defining_instruction()->shape())
+                         ? defining_index().ToString()
+                         : "";
+  return StrCat(id_, " ", is_phi_ ? "PHI " : "", defining_instruction()->name(),
+                index_str);
+}
+
+string HloValue::ToString(int indent) const {
+  string indentation(indent, ' ');
+  string out = StrCat(indentation, ToShortString(), ", positions:\n");
+  for (const HloPosition& position : positions()) {
+    StrAppend(&out, indentation, "  ", position.ToString(), "\n");
+  }
+  StrAppend(&out, indentation, " uses:\n");
+  for (const HloUse& use : uses()) {
+    StrAppend(&out, indentation, "  ", use.ToString(), "\n");
+  }
+  return out;
+}
+
+namespace {
+
+// Returns true if the instruction 'user' may use the value at the given
+// ShapeIndex in the given operand. Generally, instruction which pass through
+// values transparently without reading the value are not considered to use the
+// value.
+bool MayUseOperandValue(int64 operand_number, const ShapeIndex& index,
+                        const HloInstruction* user) {
+  switch (user->opcode()) {
+    case HloOpcode::kGetTupleElement:
+    case HloOpcode::kCopy:
+      // These instructions only access the top-level values of their
+      // operand. Non-top-level (nested) values are passed through
+      // transparently.
+      CHECK_EQ(operand_number, 0);
+      return index.empty();
+    case HloOpcode::kSelect:
+      // Select does not use any nested elements of its selected-from operands
+      // (operand 1 and 2)
+      CHECK_GE(operand_number, 0);
+      CHECK_LE(operand_number, 2);
+      return operand_number == 0 || index.empty();
+
+    case HloOpcode::kCall:
+    case HloOpcode::kTuple:
+      // These instructions always pass through their operands transparently.
+      return false;
+
+    case HloOpcode::kWhile:
+      // Though the while instructions passes through its operands, we return
+      // true because in SSA form there may be a Phi at the parameter of the
+      // while which is considered a use of its incoming value because the Phi
+      // input values are not passed through into the body computation. Because
+      // this function is used in both SSA and non-SSA forms of the analysis
+      // conservatively return true.
+      return true;
+
+    default:
+      return true;
+  }
+}
+
+}  // namespace
+
+void HloValue::AddPosition(HloInstruction* instruction,
+                           const ShapeIndex& index) {
+  HloPosition new_position{instruction, index};
+
+  // The new position must not already exist in positions_.
+  for (const HloPosition& position : positions_) {
+    DCHECK_NE(position, new_position);
+  }
+  // The shape of the new position must match existing positions.
+  if (!positions_.empty()) {
+    CHECK(
+        ShapeUtil::Compatible(positions_.front().shape(), new_position.shape()))
+        << "front: " << positions_.front() << " new: " << new_position;
+  }
+
+  positions_.push_back(std::move(new_position));
+
+  // Update uses.
+  for (HloInstruction* user : instruction->users()) {
+    for (int64 operand_number : user->OperandIndices(instruction)) {
+      if (MayUseOperandValue(operand_number, index, user)) {
+        HloUse new_use{user, operand_number, index};
+
+        // The new use must not already exist in uses_.
+        for (const HloUse& use : uses_) {
+          DCHECK_NE(use, new_use);
+        }
+
+        uses_.push_back(std::move(new_use));
+      }
+    }
+  }
+
+  // Update liveout status of this HloValue.
+  const HloModule& module = *instruction->parent()->parent();
+  if (instruction == module.entry_computation()->root_instruction()) {
+    live_out_of_module_ = true;
+  }
+
+  if (instruction == instruction->parent()->root_instruction()) {
+    live_out_of_computation_ = true;
+  }
+}
+
+void HloValue::RemovePosition(HloInstruction* instruction,
+                              const ShapeIndex& index) {
+  // The defining position cannot be removed.
+  CHECK(!(instruction == defining_instruction() && index == defining_index()));
+
+  int64 size_before = positions_.size();
+  positions_.erase(
+      std::remove_if(positions_.begin(), positions_.end(),
+                     [instruction, &index](const HloPosition& position) {
+                       return position.instruction == instruction &&
+                              position.index == index;
+                     }),
+      positions_.end());
+  // Only a single position should have been removed.
+  CHECK_EQ(positions_.size(), size_before - 1);
+
+  //  Update uses which referred to this position.
+  uses_.erase(std::remove_if(uses_.begin(), uses_.end(),
+                             [instruction, &index](const HloUse& use) {
+                               return use.instruction->operand(
+                                          use.operand_number) == instruction &&
+                                      use.operand_index == index;
+                             }),
+              uses_.end());
+
+  // Returns whether this value is contained in the given instruction's output.
+  auto is_contained_in = [this](const HloInstruction* instruction) {
+    for (const HloPosition& position : positions()) {
+      if (position.instruction == instruction) {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  const HloModule& module = *instruction->parent()->parent();
+  if (instruction == module.entry_computation()->root_instruction()) {
+    // Value has been removed from a position in the entry root instruction.
+    live_out_of_module_ =
+        is_contained_in(module.entry_computation()->root_instruction());
+  }
+  if (instruction == defining_instruction()->parent()->root_instruction()) {
+    // Value has been removed from the root of the computation the value has
+    // been defined in.
+    live_out_of_computation_ =
+        is_contained_in(defining_instruction()->parent()->root_instruction());
+  }
+}
+
+std::ostream& operator<<(std::ostream& out, const HloValue& value) {
+  out << value.ToShortString();
+  return out;
+}
+
+void HloValueSet::SortAndUniquifyValues() {
+  std::sort(values_.begin(), values_.end(), HloValue::IdLessThan);
+  values_.erase(std::unique(values_.begin(), values_.end(), HloValue::IdEqual),
+                values_.end());
+}
+
+string HloValueSet::ToString() const {
+  return StrCat("HloValueSet: ",
+                Join(values_, ", ", [](string* result, const HloValue* value) {
+                  result->append(value->ToShortString());
+                }));
+}
+
+/*static */
+HloValueSet HloValueSet::Union(
+    tensorflow::gtl::ArraySlice<const HloValueSet*> inputs) {
+  HloValueSet union_set;
+  for (const HloValueSet* input : inputs) {
+    for (const HloValue* value : input->values()) {
+      union_set.values_.push_back(value);
+    }
+  }
+  union_set.SortAndUniquifyValues();
+  return union_set;
+}
+
+bool HloValueSet::AddValue(const HloValue* value) {
+  auto it = std::lower_bound(values_.begin(), values_.end(), value,
+                             HloValue::IdLessThan);
+  if (it == values_.end() || (*it)->id() != value->id()) {
+    values_.insert(it, value);
+    return true;
+  }
+  return false;  // already exists
+}
+
+std::ostream& operator<<(std::ostream& out, const HloValueSet& value_set) {
+  out << value_set.ToString();
+  return out;
+}
+
+InstructionValueSet InstructionValueSet::Union(
+    tensorflow::gtl::ArraySlice<const InstructionValueSet*> inputs) {
+  CHECK_GT(inputs.size(), 0);
+  for (int i = 1; i < inputs.size(); ++i) {
+    CHECK(ShapeUtil::Compatible(inputs[0]->shape(), inputs[i]->shape()));
+  }
+  InstructionValueSet union_set(inputs[0]->shape());
+  union_set.ForEachMutableElement(
+      [&inputs](const ShapeIndex& index, HloValueSet* value_set) {
+        std::vector<const HloValueSet*> input_sets;
+        for (const InstructionValueSet* input : inputs) {
+          input_sets.push_back(&input->element(index));
+        }
+        *value_set = HloValueSet::Union(input_sets);
+      });
+  return union_set;
+}
+
+std::ostream& operator<<(std::ostream& out,
+                         const InstructionValueSet& instruction_value_set) {
+  out << instruction_value_set.ToString();
+  return out;
+}
+
+string InstructionValueSet::ToString() const {
+  string out =
+      StrCat("InstructionValueSet(", ShapeUtil::HumanString(shape()), ")\n");
+  ForEachElement([this, &out](const ShapeIndex& index,
+                              const HloValueSet& value_set) {
+    StrAppend(&out, "  ", index.ToString(), " : ", value_set.ToString(), "\n");
+  });
+  return out;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_value.h b/tensorflow/compiler/xla/service/hlo_value.h
new file mode 100644
index 0000000000000000000000000000000000000000..a21e34821748e5077ba19c29057d85f7c12088c3
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_value.h
@@ -0,0 +1,267 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VALUE_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VALUE_H_
+
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace xla {
+
+// Abstraction which identifies a specific point in the XLA graph. An
+// HloPosition specifies a ShapeIndex within the output of a specific
+// instruction.
+struct HloPosition {
+  HloInstruction* instruction;
+  ShapeIndex index;
+
+  // Returns the shape at this position.
+  const Shape& shape() const;
+
+  string ToString() const;
+
+  bool operator==(const HloPosition& other) const {
+    return instruction == other.instruction && index == other.index;
+  }
+  bool operator!=(const HloPosition& other) const { return !(*this == other); }
+};
+
+std::ostream& operator<<(std::ostream& out, const HloPosition& position);
+
+// Defines a single use of an HLO value.
+struct HloUse {
+  // Instruction at which the value is used.
+  HloInstruction* instruction;
+
+  // The operand number in which the value is appears.
+  int64 operand_number;
+
+  // The shape index within the operand in which the value appears.
+  ShapeIndex operand_index;
+
+  string ToString() const;
+
+  bool operator==(const HloUse& other) const {
+    return instruction == other.instruction &&
+           operand_number == other.operand_number &&
+           operand_index == other.operand_index;
+  }
+
+  bool operator!=(const HloUse& other) const { return !(*this == other); }
+};
+
+std::ostream& operator<<(std::ostream& out, const HloUse& use);
+
+// Class describing a value used by the dataflow analysis. XLA arrays are
+// trivially a single HloValue. Tuples are made up of more than one HloValue: an
+// HloValue for the pointer vector, and an HloValue for each child element.
+//
+// Every HloValue is defined by a particular instruction and most instructions
+// define only a single HloValue. Instructions which define a single HloValue
+// include array-shaped instructions such as Add but also includes Tuple-shaped
+// instructions such as Tuple. The Tuple instruction defines a single HloValue
+// which is a vector of pointers to the values containing the Tuple
+// instruction's operands. Though the result of the Tuple instruction includes
+// multiple values only the top-level HloValue (the vector of pointers) is
+// defined by the Tuple instruction. The values containing the tuple elements
+// are defined by earlier instructions, usually the operands of the Tuple
+// instruction.
+//
+// Instructions which construct both the tuple *and* the tuple elements define
+// more than one HloValue. This includes (at least) tuple-shaped Constant,
+// Parameter, Infeed and While instructions. These tuple-shaped instructions do
+// not assemble a tuple from existing HloValues like the Tuple instruction does,
+// but rather define all the HloValues in the tuple.
+class HloValue {
+ public:
+  using Id = int64;
+
+  // Predicate comparing HloValues by increasing id, useful for std::sort.
+  static bool IdLessThan(const HloValue* a, const HloValue* b) {
+    return a->id() < b->id();
+  }
+
+  // Predicate comparing HloValues by equal id, useful for std::unique.
+  static bool IdEqual(const HloValue* a, const HloValue* b) {
+    return a->id() == b->id();
+  }
+
+  // Construct an HloValue defined by 'instruction' at shape index 'index'. If
+  // is_phi is true, then this value is a phi value, for example, at the
+  // parameter of a while body computation. Phi values are only used in the SSA
+  // dataflow analysis (HloDataflowAnalysis::ssa_form_ is true).
+  HloValue(Id id, HloInstruction* instruction, const ShapeIndex& index,
+           bool is_phi = false);
+
+  // Return a unique identifier for this HloValue. This value is used for stable
+  // sorting and iteration
+  Id id() const { return id_; }
+
+  // Returns whether this value is a phi value.
+  bool is_phi() const { return is_phi_; }
+
+  // Return the position where this value is defined.
+  const HloPosition& defining_position() const { return positions_[0]; }
+
+  // Return the instruction which defines this HloValue.
+  HloInstruction* defining_instruction() const {
+    return defining_position().instruction;
+  }
+
+  // Return the shape index at which this HloValue is defined in the output of
+  // its defining instruction.
+  const ShapeIndex& defining_index() const { return defining_position().index; }
+
+  // Return the shape of this HloValue.
+  const Shape& shape() const { return defining_position().shape(); }
+
+  // Add or remove a position at which the HloValue appears. The definition
+  // position can not be removed. The uses of the HloValue are updated.
+  void AddPosition(HloInstruction* instruction, const ShapeIndex& index);
+  void RemovePosition(HloInstruction* instruction, const ShapeIndex& index);
+
+  // Return all positions of the HloValue in the module.
+  const std::vector<HloPosition>& positions() const { return positions_; }
+
+  // Return all uses of the HloValue.
+  const std::vector<HloUse>& uses() const { return uses_; }
+
+  // Get whether this HloValue is live out of the module.
+  bool live_out_of_module() const { return live_out_of_module_; }
+
+  // Get whether this HloValue is live out of the computation it is defined in.
+  bool live_out_of_computation() const { return live_out_of_computation_; }
+
+  bool operator==(const HloValue& other) const;
+  bool operator!=(const HloValue& other) const;
+
+  // Return a single-line string representation of the value.
+  string ToShortString() const;
+
+  string ToString(int indent = 0) const;
+
+ private:
+  // Unique identifier for this HloValue. Used for stable sorting and iteration.
+  const Id id_;
+
+  // Whether this instruction is a phi value.
+  const bool is_phi_;
+
+  // The set of positions of this HloValue. The first element is always the
+  // position of the definition.
+  std::vector<HloPosition> positions_;
+
+  // The set of uses of this HloValue.
+  std::vector<HloUse> uses_;
+
+  // Whether this value is live out of the HLO module.
+  bool live_out_of_module_ = false;
+
+  // Whether this value is live out of its computation.
+  bool live_out_of_computation_ = false;
+};
+
+std::ostream& operator<<(std::ostream& out, const HloValue& hlo_value);
+
+// A class representing the possible set of HloValues at a particular point
+// (shape index in the output of an instruction) in the XLA graph. This set
+// contains the set of reaching HloValue definitions. For a simple array-shaped
+// instruction like Add, the HloValueSet of the top-level of the instruction's
+// output trivially contains only the HloValue defined by the instruction. For
+// instructions which have non-trivial dataflow such as Tuple or Select, the
+// HloValueSets of the instruction's output contains one or more HloValues
+// defined by the instruction's operands or defined further up in the XLA graph.
+class HloValueSet {
+ public:
+  HloValueSet() = default;
+
+  explicit HloValueSet(tensorflow::gtl::ArraySlice<const HloValue*> values)
+      : values_(values.begin(), values.end()) {
+    SortAndUniquifyValues();
+  }
+
+  // Return the union of the given HloValueSets.
+  static HloValueSet Union(
+      tensorflow::gtl::ArraySlice<const HloValueSet*> inputs);
+
+  // Return the vector of HloValues in the set. Values in the vector are unique
+  // and sorted.
+  const std::vector<const HloValue*>& values() const { return values_; }
+
+  // Adds the value to the set.  Returns true iff the value was added and didn't
+  // already exist in the set.
+  bool AddValue(const HloValue* value);
+
+  // Return the unique HLO value in the set. CHECKs if the set does not contain
+  // exactly one value.
+  const HloValue& GetUniqueValue() const {
+    CHECK_EQ(values_.size(), 1);
+    return *values_[0];
+  }
+
+  bool operator==(const HloValueSet& other) const {
+    if (values_.size() != other.values_.size()) return false;
+    for (size_t i = 0; i < values_.size(); ++i) {
+      if (values_[i]->id() != other.values_[i]->id()) {
+        return false;
+      }
+    }
+    return true;
+  }
+  bool operator!=(const HloValueSet& other) const { return !(*this == other); }
+
+  string ToString() const;
+
+ private:
+  // Sorts value_ and removes duplicates. This should be called after adding any
+  // elements to values_.
+  void SortAndUniquifyValues();
+
+  // HloValues sorted by HloValue::Id.
+  std::vector<const HloValue*> values_;
+};
+
+std::ostream& operator<<(std::ostream& out, const HloValueSet& hlo_value);
+
+// A class collecting the HloValues which might be contained in the output of
+// an HLO instruction. For array-shaped instructions, an InstructionValueSet
+// trivially holds a single HloValueSet. Tuple-shaped InstructionValueSets
+// hold multiple HloValueSets.
+class InstructionValueSet : public ShapeTree<HloValueSet> {
+ public:
+  InstructionValueSet(const Shape& shape) : ShapeTree<HloValueSet>(shape) {}
+
+  // Return the union of the given InstructionValueSets.
+  static InstructionValueSet Union(
+      tensorflow::gtl::ArraySlice<const InstructionValueSet*> inputs);
+
+  string ToString() const;
+};
+
+std::ostream& operator<<(std::ostream& out,
+                         const InstructionValueSet& instruction_value_set);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VALUE_H_
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index de6081e57e7f27a07b314692c6935ecf3e3c54a9..01fba49bc567900418f9e4622351373abe7b2e18 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -14,10 +14,13 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 
 StatusOr<bool> HloVerifier::Run(HloModule* module) {
+  tensorflow::gtl::FlatMap<string, const HloInstruction*> instructions;
+
   for (auto& computation : module->computations()) {
     for (const auto& instruction : computation->instructions()) {
       TF_RET_CHECK(instruction->parent() == computation.get());
@@ -30,6 +33,16 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
               << " computation: " << computation.get();
         }
       }
+
+      auto previous = instructions.find(instruction->name());
+      TF_RET_CHECK(previous == instructions.end())
+          << "HLO has name that is not unique within module:\n"
+          << instruction->ToString()
+          << " in computation: " << computation->name()
+          << "\nPrevious HLO with same name:\n"
+          << previous->second->ToString()
+          << " in computation: " << previous->second->parent()->name();
+      instructions[instruction->name()] = instruction.get();
     }
   }
 
diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1b9a7a297f80cd249fb3dd7a513d785ed3a444d3
--- /dev/null
+++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
@@ -0,0 +1,96 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/human_readable_profile_builder.h"
+#include "tensorflow/compiler/xla/metric_table_report.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace xla {
+
+using tensorflow::strings::Appendf;
+using tensorflow::strings::HumanReadableElapsedTime;
+using tensorflow::strings::HumanReadableNumBytes;
+using tensorflow::strings::StrAppend;
+
+string HumanReadableProfileBuilder::ToString() const {
+  string s;
+
+  Appendf(&s, "Execution profile for %s: (%s @ f_nom)\n",
+          computation_name_.c_str(),
+          HumanReadableElapsedTime(CyclesToSeconds(total_cycles_)).c_str());
+
+  auto append_op = [&](const OpInfo& op) {
+    string bytes_per_sec;
+    string bytes_per_cycle;
+    if (op.cycles <= 0 || op.bytes_accessed < 0) {
+      bytes_per_sec = "<unknown>";
+      bytes_per_cycle = "<unknown>";
+    } else {
+      bytes_per_sec =
+          HumanReadableNumBytes(op.bytes_accessed / CyclesToSeconds(op.cycles));
+      bytes_per_cycle = HumanReadableNumBytes(op.bytes_accessed / op.cycles);
+    }
+
+    double cycles_percent = 0;
+    if (total_cycles_ > 0) {
+      cycles_percent = op.cycles / static_cast<double>(total_cycles_) * 100;
+    }
+
+    double nsecs = op.cycles / clock_rate_ghz_;
+    Appendf(&s,
+            "\t%15lld cycles (%6.2f%%) :: %12.1f usec @ f_nom :: %18s "
+            ":: %12s/s :: %12s/cycle :: %s\n",
+            op.cycles, cycles_percent, CyclesToMicroseconds(op.cycles),
+            op.flop_count <= 0
+                ? "<none>"
+                : HumanReadableNumFlops(op.flop_count, nsecs).c_str(),
+            bytes_per_sec.c_str(), bytes_per_cycle.c_str(), op.name.c_str());
+  };
+
+  append_op({"[total]", "[total]", /*category=*/"", total_cycles_, -1, -1});
+
+  // Sort ops in decreasing order of cycles.
+  std::vector<OpInfo> sorted_ops(op_infos_);
+  std::sort(
+      sorted_ops.begin(), sorted_ops.end(),
+      [](const OpInfo& a, const OpInfo& b) { return a.cycles > b.cycles; });
+  for (const auto& op : sorted_ops) {
+    append_op(op);
+  }
+
+  if (total_cycles_ <= 0) {
+    StrAppend(&s, "****** 0 total cycles ******\n");
+  } else {
+    MetricTableReport table;
+    table.SetMetricName("microseconds");
+    table.SetEntryName("ops");
+    table.SetShowCategoryTable();
+    for (const auto& op : sorted_ops) {
+      MetricTableReport::Entry entry;
+      entry.text = op.name;
+      entry.short_text = op.short_name;
+      entry.category_text = op.category;
+      entry.metric = CyclesToMicroseconds(op.cycles);
+      table.AddEntry(std::move(entry));
+    }
+    StrAppend(&s, table.MakeReport(CyclesToMicroseconds(total_cycles_)));
+  }
+  return s;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.h b/tensorflow/compiler/xla/service/human_readable_profile_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a69cbf8bf3e2f850eb6a284844b2c95678c92f2
--- /dev/null
+++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.h
@@ -0,0 +1,82 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HUMAN_READABLE_PROFILE_BUILDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HUMAN_READABLE_PROFILE_BUILDER_H_
+
+#include <vector>
+
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+// HumanReadableProfileBuilder helps you create a textual profile of a
+// computation, suitable for consumption by humans.
+class HumanReadableProfileBuilder {
+ public:
+  explicit HumanReadableProfileBuilder(tensorflow::StringPiece computation_name,
+                                       int64 total_cycles,
+                                       double clock_rate_ghz)
+      : computation_name_(computation_name.ToString()),
+        total_cycles_(total_cycles),
+        clock_rate_ghz_(clock_rate_ghz) {
+    CHECK_GE(clock_rate_ghz, 1e-9);
+  }
+
+  int64 total_cycles() const { return total_cycles_; }
+
+  // Adds an operation to the profile.  If you don't know the number of
+  // floating-point ops or bytes touched by the op, pass -1 for that param.
+  void AddOp(tensorflow::StringPiece op_name,
+             tensorflow::StringPiece short_name,
+             tensorflow::StringPiece category, int64 cycles, int64 flop_count,
+             int64 bytes_accessed) {
+    op_infos_.push_back({op_name.ToString(), short_name.ToString(),
+                         category.ToString(), cycles, flop_count,
+                         bytes_accessed});
+  }
+
+  // Gets the human-readable profile.
+  string ToString() const;
+
+ private:
+  struct OpInfo {
+    string name;
+    string short_name;
+    string category;
+    int64 cycles;
+    int64 flop_count;
+    int64 bytes_accessed;
+  };
+
+  double CyclesToSeconds(int64 cycles) const {
+    return cycles / clock_rate_ghz_ / 1e9;
+  }
+  double CyclesToMicroseconds(int64 cycles) const {
+    return cycles / clock_rate_ghz_ / 1000.0;
+  }
+
+  string computation_name_;
+  int64 total_cycles_;
+  double clock_rate_ghz_;
+  std::vector<OpInfo> op_infos_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HUMAN_READABLE_PROFILE_BUILDER_H_
diff --git a/tensorflow/compiler/xla/service/inliner_test.cc b/tensorflow/compiler/xla/service/inliner_test.cc
index 2887a8a0a097c9dcb3d490f0845547f104aa1bdf..84bfbb30c30d84a6a233a60fb420b43c3fe3454c 100644
--- a/tensorflow/compiler/xla/service/inliner_test.cc
+++ b/tensorflow/compiler/xla/service/inliner_test.cc
@@ -51,10 +51,10 @@ TEST_F(InlinerTest, MapMax) {
   auto max_f32 = max_builder.Build();
 
   auto builder = HloComputation::Builder("MapMaxFunction");
-  auto lhs = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({1, 2, 3, 4})));
-  auto rhs = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({4, 3, 2, 1})));
+  auto lhs = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<float>({1, 2, 3, 4})));
+  auto rhs = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<float>({4, 3, 2, 1})));
   builder.AddInstruction(
       HloInstruction::CreateMap(lhs->shape(), {lhs, rhs}, max_f32.get()));
 
@@ -70,7 +70,7 @@ TEST_F(InlinerTest, MapMax) {
 
   // Verify execution on CPU.
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
-  auto expected = LiteralUtil::CreateR1<float>({4, 3, 3, 4});
+  auto expected = Literal::CreateR1<float>({4, 3, 3, 4});
   LiteralTestUtil::ExpectEqual(*result, *expected);
 }
 
@@ -83,12 +83,12 @@ TEST_F(InlinerTest, MapConstant) {
       HloInstruction::CreateParameter(0, r0f32, "x"));
   (void)param1;
   const2_builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0f)));
   auto const2_f32 = const2_builder.Build();
 
   auto builder = HloComputation::Builder("MapConstFunction");
   auto lhs = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1, 2, 3, 4}, {5, 6, 7, 8}})));
+      Literal::CreateR2<float>({{1, 2, 3, 4}, {5, 6, 7, 8}})));
   builder.AddInstruction(
       HloInstruction::CreateMap(lhs->shape(), {lhs}, const2_f32.get()));
 
@@ -104,7 +104,7 @@ TEST_F(InlinerTest, MapConstant) {
 
   // Verify execution on CPU.
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
-  auto expected = LiteralUtil::CreateR2<float>({{2, 2, 2, 2}, {2, 2, 2, 2}});
+  auto expected = Literal::CreateR2<float>({{2, 2, 2, 2}, {2, 2, 2, 2}});
   LiteralTestUtil::ExpectEqual(*result, *expected);
 }
 
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index 721640cdbd8133f621f65a2505cdf3b84590e740..24af07bd4bf8d5a61a6092c8eadc5151c09921b4 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -28,7 +28,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
-
 /*static*/ bool InstructionFusion::IsExpensive(
     const HloInstruction& instruction) {
   switch (instruction.opcode()) {
@@ -43,6 +42,7 @@ namespace xla {
     case HloOpcode::kConstant:
     case HloOpcode::kConvert:
     case HloOpcode::kCopy:
+    case HloOpcode::kCos:
     case HloOpcode::kDynamicSlice:
     case HloOpcode::kDynamicUpdateSlice:
     case HloOpcode::kEq:
@@ -64,10 +64,12 @@ namespace xla {
     case HloOpcode::kNegate:
     case HloOpcode::kOutfeed:
     case HloOpcode::kPad:
+    case HloOpcode::kReducePrecision:
     case HloOpcode::kReshape:
     case HloOpcode::kReverse:
     case HloOpcode::kSelect:
     case HloOpcode::kSign:
+    case HloOpcode::kSin:
     case HloOpcode::kSlice:
     case HloOpcode::kSubtract:
     case HloOpcode::kTranspose:
@@ -75,6 +77,8 @@ namespace xla {
       return false;
 
     // Expensive instructions.
+    case HloOpcode::kBatchNormTraining:
+    case HloOpcode::kBatchNormGrad:
     case HloOpcode::kCall:
     case HloOpcode::kConvolution:
     case HloOpcode::kCrossReplicaSum:
@@ -113,12 +117,111 @@ bool FusionWouldDuplicate(const HloInstruction& producer,
                           const HloInstruction& consumer) {
   return !(producer.users().size() == 1 && consumer.IsUserOf(&producer));
 }
+
+// An "effectively unary" operation is one that has one "large"
+// input with the others being negligible in terms of memory usage.
+// We use "has a smaller true rank than the output" as a heuristic
+// for "negligible" memory usage.
+bool EffectivelyUnary(HloInstruction* hlo) {
+  int64 output_rank = 0;
+  ShapeUtil::ForEachSubshape(
+      hlo->shape(),
+      [&output_rank](const Shape& subshape, const ShapeIndex& shape_index) {
+        if (ShapeUtil::IsArray(subshape)) {
+          output_rank = std::max(output_rank, ShapeUtil::TrueRank(subshape));
+        }
+      });
+  return std::count_if(hlo->operands().begin(), hlo->operands().end(),
+                       [output_rank](HloInstruction* operand) {
+                         if (operand->opcode() == HloOpcode::kBroadcast) {
+                           return false;
+                         }
+                         if (operand->opcode() == HloOpcode::kConstant &&
+                             ShapeUtil::IsEffectiveScalar(operand->shape())) {
+                           return false;
+                         }
+                         return ShapeUtil::TrueRank(operand->shape()) >=
+                                output_rank;
+                       }) <= 1;
+}
 }  // namespace
 
+bool InstructionFusion::CanFuseOnAllPaths(
+    const HloReachabilityMap& reachability_map, HloInstruction* producer,
+    HloInstruction* consumer, DoNotFuseSet* do_not_fuse) {
+  auto could_fuse_on_all_paths = [&] {
+    // First check to see if we have already marked this producer as infeasible
+    // to fuse into consumer.
+    if (do_not_fuse->count(producer) > 0) {
+      return false;
+    }
+    // Make sure it is possible for producer and consumer to exist in a fusion
+    // node.
+    if (!producer->IsFusable() || !consumer->IsFusable()) {
+      return false;
+    }
+    // We do an upward walk of the graph from consumer towards all paths which
+    // lead to producer to find any unfusable paths.
+    for (int64 i = 0, e = consumer->operand_count(); i < e; ++i) {
+      auto* consumer_operand = consumer->mutable_operand(i);
+      if (consumer_operand == producer) {
+        // This is the base case: our upward crawl ends but we need to make sure
+        // that fusion from consumer can happen.
+        if (!ShouldFuse(consumer, i)) {
+          return false;
+        }
+      } else if (reachability_map.IsReachable(producer, consumer_operand)) {
+        // The reachability map told us that consumer_operand is a node on the
+        // path to producer. We need to further investigate from
+        // consumer_operand.
+
+        // First check if we have already ruled out fusing producer into
+        // consumer_operand.
+        if (do_not_fuse->count(consumer_operand) > 0) {
+          return false;
+        }
+        // Make sure it is possible for consumer_operand to exist in a fusion
+        // node.
+        if (!consumer_operand->IsFusable()) {
+          return false;
+        }
+        // The producer is reachable from consumer_operand which means we need
+        // to be able to fuse consumer_operand into consumer in order for
+        // producer to be fusable into consumer on all paths.
+        if (!ShouldFuse(consumer, i)) {
+          return false;
+        }
+        // Perform the recursive step: make sure producer can be fused into
+        // consumer_operand on all paths.
+        if (!CanFuseOnAllPaths(reachability_map, producer, consumer_operand,
+                               do_not_fuse)) {
+          return false;
+        }
+      }
+    }
+    return true;
+  };
+  if (could_fuse_on_all_paths()) {
+    return true;
+  }
+  // We couldn't fuse on all paths, record this result.
+  do_not_fuse->insert(producer);
+  return false;
+}
+
 StatusOr<bool> InstructionFusion::Run(HloModule* module) {
   bool changed = false;
+
+  std::vector<HloComputation*> computations;
   for (auto& computation : module->computations()) {
-    computation_ = computation.get();
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
+    computations.push_back(computation.get());
+  }
+  for (auto& computation : computations) {
+    CHECK(!computation->IsFusionComputation());
+    computation_ = computation;
 
     // We want to be able to remove arbitrary instructions from the post order
     // and also compare positions of instructions in the post order. To make
@@ -131,56 +234,42 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
     std::vector<HloInstruction*> post_order(post_order_list.begin(),
                                             post_order_list.end());
 
-    std::set<HloInstruction*> all_consumers_fusable;
-    // Find which ops can be fused into all of their operands. We would rather
-    // not fuse an op into only some of its users, as that offers no benefit in
-    // terms of memory bandwidth, but forces us to keep more live values around.
-    for (auto* hlo : post_order) {
-      auto user_fusable_into_hlo = [this, &hlo](HloInstruction* consumer) {
-        if (!consumer->IsFusable()) {
-          return false;
-        }
-        for (int operand_number = 0;
-             operand_number < consumer->operands().size(); ++operand_number) {
-          if (consumer->operand(operand_number) == hlo) {
-            if (!ShouldFuse(consumer, operand_number)) {
-              return false;
-            }
-          }
-        }
-        return true;
-      };
-
-      // An "effectively unary" operation is one that has one "large"
-      // input with the others being negligible in terms of memory usage.
-      // We use "has a smaller true rank than the output" as a heuristic
-      // for "negligible" memory usage.
-      auto effectively_unary = [](HloInstruction* hlo) {
-        if (hlo->operands().size() == 1) {
-          return true;
-        }
-        auto output_rank = ShapeUtil::TrueRank(hlo->shape());
-        return std::count_if(
-                   hlo->operands().begin(), hlo->operands().end(),
-                   [output_rank](HloInstruction* operand) {
-                     return ((operand->opcode() != HloOpcode::kBroadcast) &&
-                             ShapeUtil::TrueRank(operand->shape()) >=
-                                 output_rank);
-                   }) <= 1;
-      };
-
-      if (effectively_unary(hlo) ||
-          std::all_of(hlo->users().begin(), hlo->users().end(),
-                      user_fusable_into_hlo)) {
-        all_consumers_fusable.insert(hlo);
-      }
-    }
-
     tensorflow::gtl::FlatMap<HloInstruction*, int> post_order_index;
     for (size_t i = 0; i < post_order.size(); ++i) {
       InsertOrDie(&post_order_index, post_order[i], i);
     }
 
+    DoNotFuseSet do_not_fuse;
+    auto reachability = computation->ComputeReachability();
+
+    auto cheap_to_duplicate = [](HloInstruction* producer) {
+      if (producer->opcode() == HloOpcode::kBroadcast) {
+        return true;
+      }
+      if (producer->opcode() == HloOpcode::kConstant &&
+          ShapeUtil::IsEffectiveScalar(producer->shape())) {
+        return true;
+      }
+      if (EffectivelyUnary(producer)) {
+        return true;
+      }
+      return false;
+    };
+
+    for (HloInstruction* consumer : post_order) {
+      for (HloInstruction* producer : consumer->operands()) {
+        if (cheap_to_duplicate(producer)) {
+          continue;
+        }
+        if (CanFuseOnAllPaths(*reachability, producer, consumer,
+                              &do_not_fuse)) {
+          CHECK_EQ(do_not_fuse.count(producer), 0);
+        } else {
+          CHECK_GT(do_not_fuse.count(producer), 0);
+        }
+      }
+    }
+
     // Instruction fusion effectively fuses edges in the computation graph
     // (producer instruction -> consumer instruction) so we iterate over all
     // edges. When we fuse an edge, we create a copy of the producer inside the
@@ -263,34 +352,36 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
       for (int64 i : sorted_operand_numbers) {
         HloInstruction* operand = instruction->mutable_operand(i);
 
-        if (FusionWouldDuplicate(*operand, *instruction) &&
-            (all_consumers_fusable.count(operand) == 0)) {
+        if (!operand->IsFusable()) {
           continue;
         }
-
-        if (operand->IsFusable() && ShouldFuse(instruction, i)) {
-          HloInstruction* fusion_instruction = Fuse(operand, instruction);
-
-          // Fusing an instruction into a fusion instruction can change the
-          // operand set of the fusion instruction. For simplicity just push the
-          // instruction to the top of the post_order and reconsider it for
-          // further fusion in the next iteration of the outer loop.
-          post_order.push_back(fusion_instruction);
-          InsertOrDie(&post_order_index, fusion_instruction,
-                      post_order.size() - 1);
-          changed = true;
-
-          if (operand->user_count() == 0) {
-            // Operand is now dead. Remove from post order by setting it's
-            // location to nullptr.
-            post_order[FindOrDie(post_order_index, operand)] = nullptr;
-            post_order_index.erase(operand);
-
-            // Remove from computation.
-            TF_RETURN_IF_ERROR(computation_->RemoveInstruction(operand));
-          }
-          break;
+        if (!ShouldFuse(instruction, i)) {
+          continue;
+        }
+        if (do_not_fuse.count(operand) > 0) {
+          continue;
+        }
+        HloInstruction* fusion_instruction = Fuse(operand, instruction);
+
+        // Fusing an instruction into a fusion instruction can change the
+        // operand set of the fusion instruction. For simplicity just push the
+        // instruction to the top of the post_order and reconsider it for
+        // further fusion in the next iteration of the outer loop.
+        post_order.push_back(fusion_instruction);
+        InsertOrDie(&post_order_index, fusion_instruction,
+                    post_order.size() - 1);
+        changed = true;
+
+        if (operand->user_count() == 0) {
+          // Operand is now dead. Remove from post order by setting it's
+          // location to nullptr.
+          post_order[FindOrDie(post_order_index, operand)] = nullptr;
+          post_order_index.erase(operand);
+
+          // Remove from computation.
+          TF_RETURN_IF_ERROR(computation_->RemoveInstruction(operand));
         }
+        break;
       }
     }
   }
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h
index a9f3723f2dfcc1b3b697d34eb9510f5857a443f0..f6f37bb79b9fe1480db61b10b9810347960f9a72 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/instruction_fusion.h
@@ -72,6 +72,15 @@ class InstructionFusion : public HloPassInterface {
  private:
   HloInstruction* Fuse(HloInstruction* producer, HloInstruction* consumer);
 
+  // The set of producers whose consumers we cannot fuse into.
+  using DoNotFuseSet = std::unordered_set<HloInstruction*>;
+
+  // Whether or not we can fuse consumer into original_producer on all paths
+  // from the producer to the consumer where nodes are HLOs and edges are uses.
+  bool CanFuseOnAllPaths(const HloReachabilityMap& reachability_map,
+                         HloInstruction* producer, HloInstruction* consumer,
+                         DoNotFuseSet* do_not_fuse);
+
   // Used to determine if an HLO is expensive. Expensive operations will not be
   // duplicated.
   std::function<bool(const HloInstruction& instruction)> is_expensive_;
diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
index a2e6c2ae00bd65b1d3aeca49f26448d8a07670a8..b3e0007dcc2d43028b49cc48477a0a69153b13c8 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
@@ -28,7 +28,7 @@ TEST_F(InstructionFusionTest,
        CostlyProducerAndOperandElementReusingConsumerNotFused) {
   HloComputation::Builder builder(TestName());
   HloInstruction* const0 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0(5)));
+      HloInstruction::CreateConstant(Literal::CreateR0(5)));
   HloInstruction* exp1 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {}), HloOpcode::kExp, const0));
   HloInstruction* broadcast2 =
@@ -49,7 +49,7 @@ TEST_F(InstructionFusionTest,
        NonCostlyProducerAndOperandElementReusingConsumerFused) {
   HloComputation::Builder builder(TestName());
   HloInstruction* const0 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0(5)));
+      HloInstruction::CreateConstant(Literal::CreateR0(5)));
   HloInstruction* negate1 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {}), HloOpcode::kNegate, const0));
   HloInstruction* broadcast2 =
@@ -70,7 +70,7 @@ TEST_F(InstructionFusionTest,
        CostlyProducerAndNonOperandElementReusingConsumerFused_Reshape) {
   HloComputation::Builder builder(TestName());
   HloInstruction* const0 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0(5)));
+      HloInstruction::CreateConstant(Literal::CreateR0(5)));
   HloInstruction* exp1 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {}), HloOpcode::kExp, const0));
   HloInstruction* reshape2 = builder.AddInstruction(
@@ -90,7 +90,7 @@ TEST_F(InstructionFusionTest,
        CostlyProducerAndNonOperandElementReusingConsumerFused_Transpose) {
   HloComputation::Builder builder(TestName());
   HloInstruction* const0 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0(5)));
+      HloInstruction::CreateConstant(Literal::CreateR0(5)));
   HloInstruction* exp1 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {}), HloOpcode::kExp, const0));
   HloInstruction* transpose2 = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index e9e199226a6db7a0547bda4b069e917f2a41295b..7d41be94ce92f0b23c8ef414ea6f4fd9fba7d1a4 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -382,7 +382,11 @@ Status LayoutAssignment::AddMandatoryConstraints(
       // instruction.
       // TODO(b/31425034): Change infeeds to be more like parameters, with
       // shapes in the ComputationLayout.
-      shape_with_layout = &instruction->shape();
+      // TODO(b/62477016): When the infeed does not set padding anymore, the
+      // call to ShapeWithoutPadding can be removed.
+      Shape infeed_shape = ShapeUtil::ShapeWithoutPadding(instruction->shape());
+      TF_RETURN_IF_ERROR(
+          constraints->SetInstructionLayout(infeed_shape, instruction.get()));
     } else if (instruction->opcode() == HloOpcode::kOutfeed) {
       // Constrain the input to the Outfeed instruction to be the expected
       // layout of the Outfeed.
@@ -607,6 +611,9 @@ Status CheckLayouts(
   TF_ASSIGN_OR_RETURN(auto points_to_analysis,
                       TuplePointsToAnalysis::Run(module));
   for (auto& computation : module->computations()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
     for (auto& instruction : computation->instructions()) {
       // Verify every instruction has a layout and the layout is valid for the
       // shape.
@@ -729,23 +736,18 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
   if (instruction->opcode() == HloOpcode::kReshape) {
     // Prefer the operand layout that makes the reshape an bitcast. If any
     // dimension bound is 1 in the operand shape, there may be several such
-    // layouts. So if 'output_layout' is a MajorToMinor layout, try if the
+    // layouts. So if 'output_layout' is the default layout, try if the
     // reshape is a bitcast when using the same layout. This may avoid copy
     // operations.
     const Shape& output_shape = instruction->shape();
     Shape output_shape_with_layout = ShapeUtil::MakeShapeWithLayout(
         output_shape.element_type(), AsInt64Slice(output_shape.dimensions()),
         AsInt64Slice(output_layout.minor_to_major()));
-    const Shape& operand_shape = operand->shape();
-    if (LayoutUtil::IsMonotonicWithDim0Major(output_layout)) {
-      Shape operand_shape_with_layout =
-          ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
-              operand_shape.element_type(),
-              AsInt64Slice(operand_shape.dimensions()));
-      if (ShapeUtil::ReshapeIsBitcast(operand_shape_with_layout,
-                                      output_shape_with_layout)) {
-        return MakeUnique<Layout>(operand_shape_with_layout.layout());
-      }
+    Shape operand_shape = operand->shape();
+    *operand_shape.mutable_layout() =
+        LayoutUtil::GetDefaultLayoutForShape(operand_shape);
+    if (ShapeUtil::ReshapeIsBitcast(operand_shape, output_shape_with_layout)) {
+      return MakeUnique<Layout>(operand_shape.layout());
     }
     auto aligned_operand_shape =
         ShapeUtil::AlignLayouts(output_shape_with_layout, operand_shape);
@@ -759,10 +761,14 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
 
   if (instruction->opcode() == HloOpcode::kTranspose) {
     // Pick the operand layout that makes the transpose a bitcast.
-    std::vector<int64> perm =
-        ComposePermutations(instruction->dimensions(),
-                            AsInt64Slice(output_layout.minor_to_major()));
-    Layout operand_layout = LayoutUtil::MakeLayout(perm);
+    int64 rank = ShapeUtil::Rank(instruction->shape());
+    std::vector<int64> new_minor_to_major(rank);
+    for (int64 i = 0; i < rank; ++i) {
+      int64 output_dim = output_layout.minor_to_major(i);
+      int64 operand_dim = instruction->dimensions(output_dim);
+      new_minor_to_major[i] = operand_dim;
+    }
+    Layout operand_layout = LayoutUtil::MakeLayout(new_minor_to_major);
     TF_CHECK_OK(
         LayoutUtil::ValidateLayoutForShape(operand_layout, operand->shape()));
     return MakeUnique<Layout>(operand_layout);
@@ -789,23 +795,18 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
   if (user->opcode() == HloOpcode::kReshape) {
     // Prefer the user layout that makes the reshape an bitcast. If any
     // dimension bound is 1 in the user shape, there may be several such
-    // layouts. So if 'operand_layout' is a MajorToMinor layout, try if the
+    // layouts. So if 'operand_layout' is the default layout, try if the
     // reshape is a bitcast when using the same layout. This may avoid copy
     // operations.
     Shape operand_shape_with_layout = ShapeUtil::MakeShapeWithLayout(
         operand->shape().element_type(),
         AsInt64Slice(operand->shape().dimensions()),
         AsInt64Slice(operand_layout.minor_to_major()));
-    const Shape& output_shape = user->shape();
-    if (LayoutUtil::IsMonotonicWithDim0Major(operand_layout)) {
-      Shape output_shape_with_layout =
-          ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
-              output_shape.element_type(),
-              AsInt64Slice(output_shape.dimensions()));
-      if (ShapeUtil::ReshapeIsBitcast(output_shape_with_layout,
-                                      operand_shape_with_layout)) {
-        return MakeUnique<Layout>(output_shape_with_layout.layout());
-      }
+    Shape output_shape = user->shape();
+    *output_shape.mutable_layout() =
+        LayoutUtil::GetDefaultLayoutForShape(output_shape);
+    if (ShapeUtil::ReshapeIsBitcast(output_shape, operand_shape_with_layout)) {
+      return MakeUnique<Layout>(output_shape.layout());
     }
     auto aligned_user_shape =
         ShapeUtil::AlignLayouts(operand_shape_with_layout, output_shape);
@@ -818,14 +819,16 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
   }
 
   if (user->opcode() == HloOpcode::kTranspose) {
-    // Pick the user layout that makes the reshape a bitcast.
-    // To become a bitcast, the layouts need to satisfy
-    //   collapsing_order * output_layout = input_layout
-    // so output_layout = inverse(collapsing_order) * input_layout
-    std::vector<int64> perm =
-        Permute(InversePermutation(user->dimensions()),
-                AsInt64Slice(operand_layout.minor_to_major()));
-    Layout user_layout = LayoutUtil::MakeLayout(perm);
+    // Pick the user layout that makes the transpose a bitcast.
+    int64 rank = ShapeUtil::Rank(user->shape());
+    std::vector<int64> new_minor_to_major(rank);
+    auto inverse_dimensions = InversePermutation(user->dimensions());
+    for (int64 i = 0; i < rank; ++i) {
+      int64 operand_dim = operand_layout.minor_to_major(i);
+      int64 user_dim = inverse_dimensions[operand_dim];
+      new_minor_to_major[i] = user_dim;
+    }
+    Layout user_layout = LayoutUtil::MakeLayout(new_minor_to_major);
     TF_CHECK_OK(LayoutUtil::ValidateLayoutForShape(user_layout, user->shape()));
     return MakeUnique<Layout>(user_layout);
   }
@@ -926,7 +929,7 @@ Status LayoutAssignment::PropagateUseConstraintToDefs(
                 ShapeUtil::IsArray(buffer->shape())) {
               TF_RETURN_IF_ERROR(constraints->SetBufferLayout(
                   ShapeUtil::GetSubshape(shape_layout.shape(), index).layout(),
-                  *buffer));
+                  *buffer, /*mandatory=*/true));
             }
           }
         }
@@ -1346,8 +1349,7 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
   if (VLOG_IS_ON(10)) {
     hlo_graph_dumper::DumpGraph(*module->entry_computation(),
                                 "before layout assignment",
-                                /*show_addresses=*/false,
-                                /*show_layouts=*/true);
+                                module->config().debug_options());
   }
 
   // Assign layouts to computations in an order such that a callee computation
@@ -1357,6 +1359,8 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
     if (computation == module->entry_computation()) {
       TF_RETURN_IF_ERROR(RunOnComputation(*entry_computation_layout_,
                                           module->entry_computation()));
+    } else if (computation->IsFusionComputation()) {
+      continue;
     } else {
       ComputationLayout computation_layout(computation->ComputeProgramShape());
       // Setting all embedded computations to the default layout is potentially
@@ -1373,8 +1377,7 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
   if (VLOG_IS_ON(10)) {
     hlo_graph_dumper::DumpGraph(*module->entry_computation(),
                                 "after layout assignment",
-                                /*show_addresses=*/false,
-                                /*show_layouts=*/true);
+                                module->config().debug_options());
   }
 
   // All layouts are reset then reassigned by this pass.
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index ccfc17da4c412c23945630a52fc21cbac87e0727..256d6aa8aa64e3585cb21b3fb2a11c7416c705f1 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -315,6 +315,7 @@ class LayoutAssignment : public HloPassInterface {
 
   ComputationLayout* entry_computation_layout_;
 
+ protected:
   // Map containing the layouts of all computations assigned so
   // far. Computations are handled in a topological sort where computations are
   // handled before their caller instructions so the layouts of caller
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index 6d818cdea0c30701adf83f6265a6d7b554fb91cc..f69c043f32b4e688a543d277164eb91b364b51dc 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -230,7 +230,7 @@ TEST_F(LayoutAssignmentTest, TupleSelect) {
       HloInstruction::CreateTuple({constant0, constant1}));
 
   auto pred = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
 
   auto select = builder.AddInstruction(HloInstruction::CreateTernary(
       tuple0->shape(), HloOpcode::kSelect, pred, tuple0, tuple1));
@@ -264,7 +264,7 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) {
   // tuple and assigning the layouts of the copied arrays as needed.
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}})));
+      Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}})));
   auto inner_tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({constant}));
   auto nested_tuple = builder.AddInstruction(
@@ -552,6 +552,41 @@ TEST_F(LayoutAssignmentTest, MakeOperandsTheSame) {
               ElementsAre(1, 0));
 }
 
+// Test layout assignment of a transpose into a bitcast based on its operand.
+TEST_F(LayoutAssignmentTest, TransposeToBitcastFromOperand) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape input_shape_with_layout =
+      ShapeUtil::MakeShapeWithLayout(F32, {3, 5, 6, 7}, {2, 0, 3, 1});
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape_with_layout, "param"));
+  auto transpose = builder.AddInstruction(HloInstruction::CreateTranspose(
+      ShapeUtil::MakeShape(F32, {6, 7, 3, 5}), param, {2, 3, 0, 1}));
+  auto module = CreateNewModule();
+  HloComputation* computation =
+      module->AddEntryComputation(builder.Build(transpose));
+  ComputationLayout computation_layout(computation->ComputeProgramShape());
+  AssignLayouts(module.get(), &computation_layout);
+  EXPECT_TRUE(ShapeUtil::TransposeIsBitcast(transpose->operand(0)->shape(),
+                                            transpose->shape(), {2, 3, 0, 1}));
+}
+// Test layout assignment of a transpose into a bitcast based on its user.
+TEST_F(LayoutAssignmentTest, TransposeToBitcastToUser) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape input_shape = ShapeUtil::MakeShape(F32, {3, 5, 6, 7});
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0f)));
+  auto broadcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(input_shape, constant, {}));
+  auto transpose = builder.AddInstruction(HloInstruction::CreateTranspose(
+      ShapeUtil::MakeShape(F32, {6, 7, 3, 5}), broadcast, {2, 3, 0, 1}));
+  auto module = CreateNewModule();
+  HloComputation* computation =
+      module->AddEntryComputation(builder.Build(transpose));
+  ComputationLayout computation_layout(computation->ComputeProgramShape());
+  AssignLayouts(module.get(), &computation_layout);
+  EXPECT_TRUE(ShapeUtil::TransposeIsBitcast(transpose->operand(0)->shape(),
+                                            transpose->shape(), {2, 3, 0, 1}));
+}
 }  // namespace
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/liveness_util.cc b/tensorflow/compiler/xla/service/liveness_util.cc
index 682bf19807b4b5d4e8a66c6c5e2e01c80a026594..9c80fb3adbc99b2e5cd3efc20deaf602c5ebc526 100644
--- a/tensorflow/compiler/xla/service/liveness_util.cc
+++ b/tensorflow/compiler/xla/service/liveness_util.cc
@@ -28,17 +28,6 @@ limitations under the License.
 
 namespace xla {
 
-bool DoesNotUseOperandBuffer(const HloInstruction* operand,
-                             const ShapeIndex& index,
-                             const HloInstruction* user) {
-  CHECK(user->IsUserOf(operand))
-      << "user: " << user->ToString() << " operand: " << operand->ToString();
-
-  // GetTupleElement instructions only access the top-level buffer of their
-  // operand.
-  return (user->opcode() == HloOpcode::kGetTupleElement && !index.empty());
-}
-
 bool DoesNotUseOperandBuffer(const HloInstruction* operand,
                              const ShapeIndex& index,
                              const HloInstruction* user,
@@ -149,18 +138,22 @@ bool HasUniqueFusedUseOfOperandAt(
 
 // User and operand can share buffers iff both instructions emit the same shape
 // and layout, and 'user' meets one of the following qualifications:
-// *) Is element-wise. Or...
-// *) Is a loop fusion instruction where the only use of 'operand' at 'index'
-//    in the set 'user.fused_instructions' is a DynamicUpdateSlice fused root
-//    at operand 0. Or...
-// *) Is a kDot -> kAdd (or fused kTransposeDot -> kAdd) output fusion
-//    instruction where the only use of 'operand' at 'index' in the set
-//    'user.fused_instructions' is a kAdd fused root at operand 0 or 1. Or...
-// *) The 'user' of 'operand' is DynamicUpdateSlice or While at operand index 0.
+//
+// (1) Is element-wise. Or...
+// (2) Is a loop fusion instruction where the only use of 'operand' at 'index'
+//     in the set 'user.fused_instructions' is a DynamicUpdateSlice fused root
+//     at operand 0. Or...
+// (3) Is a kDot -> kAdd (or fused kTransposeDot -> kAdd) output fusion
+//     instruction where the only use of 'operand' at 'index' in the set
+//     'user.fused_instructions' is a kAdd fused root at operand 0 or 1. Or...
+// (4) The 'user' of 'operand' is DynamicUpdateSlice or While at operand index
+//     0.
+//
+// (2) and (3) can only be determined if points-to analysis is available.
 bool CanShareOperandBufferWithUser(
     HloInstruction* operand, const ShapeIndex& operand_index,
     HloInstruction* user, const ShapeIndex& user_index,
-    const TuplePointsToAnalysis& points_to_analysis) {
+    const TuplePointsToAnalysis* points_to_analysis) {
   CHECK(user->IsUserOf(operand))
       << "user: " << user->ToString() << " operand: " << operand->ToString();
   Shape operand_subshape =
@@ -170,7 +163,7 @@ bool CanShareOperandBufferWithUser(
   if (!ShapeUtil::Equal(operand_subshape, user_subshape)) {
     return false;
   }
-  if (user->opcode() == HloOpcode::kFusion) {
+  if (points_to_analysis != nullptr && user->opcode() == HloOpcode::kFusion) {
     if (user->fusion_kind() == HloInstruction::FusionKind::kLoop &&
         user->fused_expression_root()->opcode() ==
             HloOpcode::kDynamicUpdateSlice) {
@@ -180,7 +173,7 @@ bool CanShareOperandBufferWithUser(
       // 'operand_index', and this singleton use is the fused root at operand
       // index 0.
       return HasUniqueFusedUseOfOperandAt(operand, operand_index, user, 0,
-                                          points_to_analysis);
+                                          *points_to_analysis);
     } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput &&
                user->fused_expression_root()->opcode() == HloOpcode::kAdd) {
       // Output fusion with kAdd fused root.
@@ -208,7 +201,7 @@ bool CanShareOperandBufferWithUser(
       // index 'other_add_operand_index').
       return HasUniqueFusedUseOfOperandAt(operand, operand_index, user,
                                           other_add_operand_index,
-                                          points_to_analysis);
+                                          *points_to_analysis);
     }
   }
   if (user->opcode() == HloOpcode::kDynamicUpdateSlice ||
diff --git a/tensorflow/compiler/xla/service/liveness_util.h b/tensorflow/compiler/xla/service/liveness_util.h
index 0b01223db73d49ad3ee127dd9076e37f5fac8ec5..c7799e5ab5d0c0d0477c09fa7e6a36c67312a72b 100644
--- a/tensorflow/compiler/xla/service/liveness_util.h
+++ b/tensorflow/compiler/xla/service/liveness_util.h
@@ -34,21 +34,16 @@ bool DoesNotUseOperandBuffer(const HloInstruction* operand,
                              const HloInstruction* user,
                              const TuplePointsToAnalysis& points_to_analysis);
 
-// Overload which does not require points-to analysis. The result is more
-// conservative (returns false more often).
-bool DoesNotUseOperandBuffer(const HloInstruction* operand,
-                             const ShapeIndex& index,
-                             const HloInstruction* user);
-
 // Returns true if 'user' (at 'user_index') can share a buffer with its operand
-// 'operand' (at 'operand_index').
-// Returns false otherwise.
+// 'operand' (at 'operand_index'). Returns false otherwise. Optionally takes a
+// points-to analysis argument. Without the analysis, the result is more
+// conservative (returns false more often).
 //
 // REQUIRES: 'operand' is an operand of 'user'.
 bool CanShareOperandBufferWithUser(
     HloInstruction* operand, const ShapeIndex& operand_index,
     HloInstruction* user, const ShapeIndex& user_index,
-    const TuplePointsToAnalysis& points_to_analysis);
+    const TuplePointsToAnalysis* points_to_analysis = nullptr);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/liveness_util_test.cc b/tensorflow/compiler/xla/service/liveness_util_test.cc
index bad4be149a68bdc07a1f7e4ac0668728d10d152e..6a4fde87614750d21cf9572e7f447bba924379c4 100644
--- a/tensorflow/compiler/xla/service/liveness_util_test.cc
+++ b/tensorflow/compiler/xla/service/liveness_util_test.cc
@@ -85,9 +85,9 @@ TEST_F(DoesNotUseOperandBufferTest, FusedDynamicUpdateSlice) {
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(Literal::CreateR1<int32>({2})));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
+      Literal::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
           data_shape, gte1, update, starts));
@@ -122,10 +122,10 @@ TEST_F(CanShareOperandBufferWithUserTest, ElementWiseSameShape) {
 
   BuildModuleAndRunAnalysis(builder.Build());
 
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(param, {}, exp, {}, *points_to_analysis_));
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(exp, {}, log, {}, *points_to_analysis_));
+  EXPECT_TRUE(CanShareOperandBufferWithUser(param, {}, exp, {},
+                                            points_to_analysis_.get()));
+  EXPECT_TRUE(CanShareOperandBufferWithUser(exp, {}, log, {},
+                                            points_to_analysis_.get()));
 }
 
 TEST_F(CanShareOperandBufferWithUserTest, ElementWiseDifferentShape) {
@@ -143,9 +143,9 @@ TEST_F(CanShareOperandBufferWithUserTest, ElementWiseDifferentShape) {
   BuildModuleAndRunAnalysis(builder.Build());
 
   EXPECT_FALSE(CanShareOperandBufferWithUser(param0, {}, result, {},
-                                             *points_to_analysis_));
+                                             points_to_analysis_.get()));
   EXPECT_FALSE(CanShareOperandBufferWithUser(param1, {}, result, {},
-                                             *points_to_analysis_));
+                                             points_to_analysis_.get()));
 }
 
 TEST_F(CanShareOperandBufferWithUserTest, CopyShares) {
@@ -161,10 +161,10 @@ TEST_F(CanShareOperandBufferWithUserTest, CopyShares) {
 
   BuildModuleAndRunAnalysis(builder.Build());
 
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(param, {}, exp, {}, *points_to_analysis_));
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(exp, {}, copy, {}, *points_to_analysis_));
+  EXPECT_TRUE(CanShareOperandBufferWithUser(param, {}, exp, {},
+                                            points_to_analysis_.get()));
+  EXPECT_TRUE(CanShareOperandBufferWithUser(exp, {}, copy, {},
+                                            points_to_analysis_.get()));
 }
 
 TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) {
@@ -180,9 +180,9 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) {
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(Literal::CreateR1<int32>({2})));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
+      Literal::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
           data_shape, gte1, update, starts));
@@ -197,9 +197,9 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) {
 
   // The fusion instruction can share with tuple element 1.
   EXPECT_FALSE(CanShareOperandBufferWithUser(tuple, {0}, fusion, {},
-                                             *points_to_analysis_));
+                                             points_to_analysis_.get()));
   EXPECT_TRUE(CanShareOperandBufferWithUser(tuple, {1}, fusion, {},
-                                            *points_to_analysis_));
+                                            points_to_analysis_.get()));
 }
 
 TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
@@ -221,12 +221,12 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
 
   // The DynamicUpdateSlice instruction can share with the data operand, but not
   // with update or starts.
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(data, {}, dus, {}, *points_to_analysis_));
-  EXPECT_FALSE(
-      CanShareOperandBufferWithUser(update, {}, dus, {}, *points_to_analysis_));
-  EXPECT_FALSE(
-      CanShareOperandBufferWithUser(starts, {}, dus, {}, *points_to_analysis_));
+  EXPECT_TRUE(CanShareOperandBufferWithUser(data, {}, dus, {},
+                                            points_to_analysis_.get()));
+  EXPECT_FALSE(CanShareOperandBufferWithUser(update, {}, dus, {},
+                                             points_to_analysis_.get()));
+  EXPECT_FALSE(CanShareOperandBufferWithUser(starts, {}, dus, {},
+                                             points_to_analysis_.get()));
 }
 
 TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) {
@@ -234,15 +234,15 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) {
   Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
 
   auto a = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}})));
+      Literal::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}})));
   auto b = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
+      Literal::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
 
   auto dot = builder.AddInstruction(
       HloInstruction::CreateBinary(data_shape, HloOpcode::kDot, a, b));
 
   auto one = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto add_operand = builder.AddInstruction(
       HloInstruction::CreateBroadcast(data_shape, one, {1}));
 
@@ -256,7 +256,7 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) {
 
   // Output fused dot add should be able to share buffer with 'add_operand'.
   EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {},
-                                            *points_to_analysis_));
+                                            points_to_analysis_.get()));
 }
 
 TEST_F(CanShareOperandBufferWithUserTest, FusedTransposeDotAdd) {
@@ -264,9 +264,9 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedTransposeDotAdd) {
   Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
 
   auto a = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}})));
+      Literal::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}})));
   auto b = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
+      Literal::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
   auto b_t = builder.AddInstruction(
       HloInstruction::CreateTranspose(data_shape, b, {1, 0}));
 
@@ -274,7 +274,7 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedTransposeDotAdd) {
       HloInstruction::CreateBinary(data_shape, HloOpcode::kDot, a, b_t));
 
   auto one = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto add_operand = builder.AddInstruction(
       HloInstruction::CreateBroadcast(data_shape, one, {1}));
 
@@ -292,7 +292,7 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedTransposeDotAdd) {
 
   // Output fused transpose-dot-add should be share buffer with 'add_operand'.
   EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {},
-                                            *points_to_analysis_));
+                                            points_to_analysis_.get()));
 }
 
 TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) {
@@ -300,7 +300,7 @@ TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) {
   Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
 
   auto one = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto operand = builder.AddInstruction(
       HloInstruction::CreateBroadcast(data_shape, one, {1}));
 
@@ -308,7 +308,7 @@ TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) {
       HloInstruction::CreateReverse(data_shape, operand, {0, 1}));
 
   auto two = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
+      Literal::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
 
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, reverse, two));
@@ -320,7 +320,7 @@ TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) {
 
   // Output fused operand->reverse->add cannot alias operand buffer 'operand'.
   EXPECT_FALSE(CanShareOperandBufferWithUser(operand, {}, fusion, {},
-                                             *points_to_analysis_));
+                                             points_to_analysis_.get()));
 }
 
 TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) {
@@ -360,8 +360,8 @@ TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) {
   RunAnalysis();
 
   // The While instruction can share with the data operand.
-  EXPECT_TRUE(
-      CanShareOperandBufferWithUser(data, {}, whil, {}, *points_to_analysis_));
+  EXPECT_TRUE(CanShareOperandBufferWithUser(data, {}, whil, {},
+                                            points_to_analysis_.get()));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index 12b2762f0ed7eb9acce8a60d4501ab6ce53c3b57..61945bd128e68b59bd0a1156882c5b29d6be2a27 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -29,7 +29,6 @@ cc_library(
         ":ir_array",
         ":llvm_util",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla/legacy_flags:alias_analysis_flags",
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:logical_buffer",
@@ -47,7 +46,6 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:llvm_util_flags",
         "//tensorflow/core:lib",
         "@llvm//:core",
         "@llvm//:support",
@@ -93,6 +91,7 @@ cc_library(
     deps = [
         ":ir_array",
         ":llvm_loop",
+        ":ops",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
index 02710ff57f6f75fe6aa1c32670cc7196ae4c402f..1f6932bcc3fb76adb874b963ecf5fb1b16d8a9f4 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <unordered_set>
 
 #include "external/llvm/include/llvm/IR/MDBuilder.h"
-#include "tensorflow/compiler/xla/legacy_flags/alias_analysis_flags.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -51,28 +50,37 @@ void AliasAnalysis::AddAliasingInformationToIrArray(const HloInstruction& hlo,
     buffer_slice = *slices.begin();
   }
 
-  llvm::MDNode*& alias_scope_md = alias_scope_metadata_[buffer_slice];
-  if (alias_scope_md == nullptr) {
-    alias_scope_md =
-        GetAliasScopeMetadataForBuffer(buffer_slice, GetAliasDomain());
+  if (module_.config().debug_options().xla_llvm_enable_alias_scope_metadata()) {
+    llvm::MDNode*& alias_scope_md = alias_scope_metadata_[buffer_slice];
+    if (alias_scope_md == nullptr) {
+      alias_scope_md =
+          GetAliasScopeMetadataForBuffer(buffer_slice, GetAliasDomain());
+    }
+    array->AddAliasScopeMetadata(alias_scope_md);
   }
-  array->AddAliasScopeMetadata(alias_scope_md);
 
-  llvm::MDNode*& noalias_md = noalias_metadata_[buffer_slice];
-  if (noalias_md == nullptr) {
-    noalias_md = GetNoaliasMetadataForBuffer(buffer_slice, GetAliasDomain(),
-                                             assignment_, hlo);
+  if (module_.config().debug_options().xla_llvm_enable_noalias_metadata()) {
+    llvm::MDNode*& noalias_md = noalias_metadata_[buffer_slice];
+    if (noalias_md == nullptr) {
+      noalias_md = GetNoaliasMetadataForBuffer(buffer_slice, GetAliasDomain(),
+                                               assignment_, hlo);
+    }
+    array->AddNoaliasMetadata(noalias_md);
   }
-  array->AddNoaliasMetadata(noalias_md);
 
-  // Parameters of the entry computation are never stored to, loading from a
-  // parameter pointer should always return the same result within a loop.
-  if (hlo.opcode() == HloOpcode::kParameter) {
-    const std::vector<HloInstruction*>& parameter_instructions =
-        module_.entry_computation()->parameter_instructions();
-    if (std::find(parameter_instructions.begin(), parameter_instructions.end(),
-                  &hlo) != parameter_instructions.end()) {
-      array->AddInvariantLoad(llvm::MDNode::get(*context_, /*MDs=*/{}));
+  if (module_.config()
+          .debug_options()
+          .xla_llvm_enable_invariant_load_metadata()) {
+    // Parameters of the entry computation are never stored to, loading from a
+    // parameter pointer should always return the same result within a loop.
+    if (hlo.opcode() == HloOpcode::kParameter) {
+      const std::vector<HloInstruction*>& parameter_instructions =
+          module_.entry_computation()->parameter_instructions();
+      if (std::find(parameter_instructions.begin(),
+                    parameter_instructions.end(),
+                    &hlo) != parameter_instructions.end()) {
+        array->AddInvariantLoad(llvm::MDNode::get(*context_, /*MDs=*/{}));
+      }
     }
   }
 }
@@ -87,12 +95,6 @@ llvm::MDNode* AliasAnalysis::GetAliasDomain() {
 
 llvm::MDNode* AliasAnalysis::GetAliasScopeMetadataForBuffer(
     const BufferAllocation::Slice& buffer_slice, llvm::MDNode* domain) {
-  legacy_flags::AliasAnalysisFlags* flags =
-      legacy_flags::GetAliasAnalysisFlags();
-  if (!flags->xla_emit_alias_scope) {
-    return nullptr;
-  }
-
   // While we could synthesize an alias.scope, doing so is not more profitable
   // than LLVM's default behavior.
   if (buffer_slice.allocation() == kParameterAllocation) {
@@ -109,12 +111,6 @@ llvm::MDNode* AliasAnalysis::GetAliasScopeMetadataForBuffer(
 llvm::MDNode* AliasAnalysis::GetNoaliasMetadataForBuffer(
     const BufferAllocation::Slice& buffer_slice, llvm::MDNode* domain,
     const BufferAssignment& assignment, const HloInstruction& hlo) {
-  legacy_flags::AliasAnalysisFlags* flags =
-      legacy_flags::GetAliasAnalysisFlags();
-  if (!flags->xla_emit_alias_scope) {
-    return nullptr;
-  }
-
   // We want to construct a list of buffers which:
   //
   // 1. Do not alias the given buffer.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
index b259d348708c227a3e580fd352422e457284129d..26e73a6ec390c5823c2a0315480a427ea0a7b373 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
@@ -128,6 +128,27 @@ Status FusedIrEmitter::HandleParameter(HloInstruction* parameter) {
   return Status::OK();
 }
 
+Status FusedIrEmitter::HandleTuple(
+    HloInstruction* tuple,
+    tensorflow::gtl::ArraySlice<HloInstruction*> operands) {
+  std::vector<llvm::Type*> operand_elemental_ir_types;
+  for (HloInstruction* operand : operands) {
+    operand_elemental_ir_types.push_back(llvm_ir::PrimitiveTypeToIrType(
+        operand->shape().element_type(), ir_builder_));
+  }
+  generators_[tuple] =
+      [=](const IrArray::Index& index) -> StatusOr<llvm::Value*> {
+    llvm::Value* ret = llvm::UndefValue::get(llvm::StructType::get(
+        ir_builder_->getContext(), operand_elemental_ir_types));
+    for (size_t i = 0; i < ShapeUtil::TupleElementCount(tuple->shape()); ++i) {
+      TF_ASSIGN_OR_RETURN(llvm::Value * val_i, generators_[operands[i]](index));
+      ret = ir_builder_->CreateInsertValue(ret, val_i, i);
+    }
+    return ret;
+  };
+  return Status::OK();
+}
+
 Status FusedIrEmitter::FinishVisit(HloInstruction* root) {
   fused_root_ = root;
   return tensorflow::Status::OK();
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
index 79007b7099a32973cada7a9986ff95c5e4aabec6..1cd8d1194686236dd11f71c56d668708ad113f03 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
@@ -54,6 +54,11 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault {
 
   Status HandleParameter(HloInstruction* parameter) override;
 
+  // Emits the ir value for each element in the tuple.
+  Status HandleTuple(
+      HloInstruction* tuple,
+      tensorflow::gtl::ArraySlice<HloInstruction*> operands) override;
+
   Status FinishVisit(HloInstruction* root) override;
 
   // Returns the generator function for the root of the fused computation.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
index e401305ae7342a9db09499c9b3846f5a0a705fa7..75b7856800d2f3e6f279d2ac2bdcf3021bbf4049 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
@@ -85,7 +85,7 @@ IrArray::IrArray(llvm::Value* base_ptr, const Shape& shape)
     ++depth;
   }
 
-  if (ShapeUtil::Rank(*shape_) == 0) {
+  if (!ShapeUtil::IsArray(*shape_) || ShapeUtil::IsScalar(*shape_)) {
     DCHECK(depth == 1 || depth == 0) << depth;
   } else {
     DCHECK_EQ(depth, ShapeUtil::Rank(*shape_)) << shape.ShortDebugString();
@@ -153,6 +153,28 @@ IrArray::Index IrArray::Index::SourceIndexOfReshape(
   return Index(source_multidim_index);
 }
 
+IrArray::Index IrArray::Index::SourceIndexOfSlice(
+    const Shape& shape, tensorflow::gtl::ArraySlice<int64> starts,
+    tensorflow::gtl::ArraySlice<int64> strides,
+    llvm::IRBuilder<>* builder) const {
+  Index source_index(multidim_.size());
+  for (int i = 0; i < multidim_.size(); ++i) {
+    int64 stride = strides[i];
+    auto type = multidim_[i]->getType();
+
+    if (stride != 1) {
+      source_index[i] = builder->CreateAdd(
+          builder->CreateMul(multidim_[i],
+                             llvm::ConstantInt::get(type, stride)),
+          llvm::ConstantInt::get(type, starts[i]));
+    } else {
+      source_index[i] = builder->CreateAdd(
+          multidim_[i], llvm::ConstantInt::get(type, starts[i]));
+    }
+  }
+  return source_index;
+}
+
 IrArray::Index IrArray::Index::SourceIndexOfTranspose(
     const Shape& shape, const Shape& operand_shape,
     tensorflow::gtl::ArraySlice<int64> dimension_mapping,
@@ -228,6 +250,18 @@ llvm::Value* IrArray::EmitArrayElementAddress(
                                        llvm_ir::AsStringRef(name));
 }
 
+void IrArray::AnnotateLoadStoreInstructionWithMetadata(
+    llvm::Instruction* instruction) const {
+  CHECK(llvm::isa<llvm::LoadInst>(instruction) ||
+        llvm::isa<llvm::StoreInst>(instruction));
+
+  for (const auto& kind_md_pair : metadata_) {
+    CHECK(kind_md_pair.first != llvm::LLVMContext::MD_invariant_load ||
+          llvm::isa<llvm::LoadInst>(instruction));
+    instruction->setMetadata(kind_md_pair.first, kind_md_pair.second);
+  }
+}
+
 llvm::Value* IrArray::EmitReadArrayElement(const Index& index,
                                            llvm::IRBuilder<>* ir_builder,
                                            tensorflow::StringPiece name) const {
@@ -236,9 +270,7 @@ llvm::Value* IrArray::EmitReadArrayElement(const Index& index,
   llvm::LoadInst* load = ir_builder->CreateLoad(element_address);
   llvm_ir::SetTbaaForInstruction(load, GetShape(),
                                  /*is_pointer_to=*/false);
-  for (const auto& kind_md_pair : metadata_) {
-    load->setMetadata(kind_md_pair.first, kind_md_pair.second);
-  }
+  AnnotateLoadStoreInstructionWithMetadata(load);
   return load;
 }
 
@@ -248,10 +280,7 @@ void IrArray::EmitWriteArrayElement(const Index& index, llvm::Value* value,
   llvm::StoreInst* store = ir_builder->CreateStore(value, element_address);
   llvm_ir::SetTbaaForInstruction(store, GetShape(),
                                  /*is_pointer_to=*/false);
-  for (const auto& kind_md_pair : metadata_) {
-    CHECK_NE(kind_md_pair.first, llvm::LLVMContext::MD_invariant_load);
-    store->setMetadata(kind_md_pair.first, kind_md_pair.second);
-  }
+  AnnotateLoadStoreInstructionWithMetadata(store);
 }
 
 IrArray IrArray::CastToShape(const Shape& new_shape,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
index 91cb3a679fd67fffb29f8a935cc3c65e9442136b..5fabb1e2433248c0a2fb2a14fb6cb5dacb0dfb39 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
@@ -115,6 +115,16 @@ class IrArray {
     Index SourceIndexOfReshape(const Shape& shape, const Shape& operand_shape,
                                llvm::IRBuilder<>* builder) const;
 
+    // Returns the index into the source operand from which a slice operation
+    // selects a value to be placed into index "this". The slice is described
+    // by starting indices `starts` and stride values `strides`.
+    //
+    // Precondition: "this" is an index into a slice whose shape is `shape`.
+    Index SourceIndexOfSlice(const Shape& shape,
+                             tensorflow::gtl::ArraySlice<int64> starts,
+                             tensorflow::gtl::ArraySlice<int64> strides,
+                             llvm::IRBuilder<>* builder) const;
+
     // Given that "this" is the target index of a transpose from `operand_shape`
     // to `shape` with the given dimension mapping, returns the source index.
     Index SourceIndexOfTranspose(
@@ -183,6 +193,10 @@ class IrArray {
                                        llvm::IRBuilder<>* ir_builder,
                                        tensorflow::StringPiece name = "") const;
 
+  // Attach metadata this IrArray instance knows about to "instruction".
+  void AnnotateLoadStoreInstructionWithMetadata(
+      llvm::Instruction* instruction) const;
+
   // Emit IR to read an array element at the given index. Returns the read
   // result (effectively, a Value loaded from memory). This method seamlessly
   // handles scalar shapes by broadcasting their value to all indices (index is
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
index 97f1b8ce30818eaf7465933a28f30959b5e2b90a..0995ed6ff51763e7fcb281c48bd288c44a1f739f 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
@@ -144,12 +144,19 @@ llvm::BasicBlock* ForLoop::CreateBasicBlockWithSuffix(
 std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
                                               llvm::Value* start_index,
                                               llvm::Value* end_index) {
+  return AddLoop(suffix, start_index, end_index, ir_builder_->getInt64(1));
+}
+
+std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
+                                              llvm::Value* start_index,
+                                              llvm::Value* end_index,
+                                              llvm::Value* stride) {
   if (inner_loop_body_bb_ != nullptr) {
     // Create this loop inside the previous one.
     ir_builder_->SetInsertPoint(&*inner_loop_body_bb_->getFirstInsertionPt());
   }
-  std::unique_ptr<ForLoop> loop = ForLoop::EmitForLoop(
-      suffix, start_index, end_index, ir_builder_->getInt64(1), ir_builder_);
+  std::unique_ptr<ForLoop> loop =
+      ForLoop::EmitForLoop(suffix, start_index, end_index, stride, ir_builder_);
 
   if (outer_loop_preheader_bb_ == nullptr) {
     outer_loop_preheader_bb_ = loop->GetPreheaderBasicBlock();
@@ -172,6 +179,15 @@ std::unique_ptr<ForLoop> ForLoopNest::AddLoop(int64 start_index,
                  ir_builder_->getInt64(end_index));
 }
 
+std::unique_ptr<ForLoop> ForLoopNest::AddLoop(int64 start_index,
+                                              int64 end_index, int64 stride,
+                                              tensorflow::StringPiece suffix) {
+  CHECK_LE(start_index, end_index);
+  return AddLoop(suffix, ir_builder_->getInt64(start_index),
+                 ir_builder_->getInt64(end_index),
+                 ir_builder_->getInt64(stride));
+}
+
 IrArray::Index ForLoopNest::AddLoopsForShape(const Shape& shape,
                                              tensorflow::StringPiece suffix) {
   std::vector<int64> dimensions(ShapeUtil::Rank(shape));
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
index 60ac0444bcde002db6fd6bfa2630c9b78157e491..a66bf80959f6579811fb8b5885d6cd209a48dc7a 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
@@ -167,12 +167,22 @@ class ForLoopNest {
   // Adds a loop to the nest. If no loop has been added yet then emit a loop at
   // the current insert point of the given builder. If one or more loops have
   // been added then emit loop inside the body of the last added loop.
+  std::unique_ptr<ForLoop> AddLoop(tensorflow::StringPiece suffix,
+                                   llvm::Value* start_index,
+                                   llvm::Value* end_index, llvm::Value* stride);
+
+  // Like the above, except that it defaults to a stride of one.
   std::unique_ptr<ForLoop> AddLoop(tensorflow::StringPiece suffix,
                                    llvm::Value* start_index,
                                    llvm::Value* end_index);
 
   // A convenient wrapper of the other flavor of AddLoop. The given start and
   // end index are constant.
+  std::unique_ptr<ForLoop> AddLoop(int64 start_index, int64 end_index,
+                                   int64 stride,
+                                   tensorflow::StringPiece suffix);
+
+  // Like the above, except that it defaults to a stride of one.
   std::unique_ptr<ForLoop> AddLoop(int64 start_index, int64 end_index,
                                    tensorflow::StringPiece suffix);
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index ff2f4cd693ca76c0e4d20522f50a302fb3ae2c40..a8c17a67f159adec94e0f16052c74e53768decc5 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -22,11 +22,11 @@ limitations under the License.
 #include "external/llvm/include/llvm/IR/Operator.h"
 #include "external/llvm/include/llvm/Target/TargetOptions.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/llvm_util_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/casts.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -137,6 +137,24 @@ llvm::Type* ShapeToIrType(const Shape& shape, llvm::IRBuilder<>* ir_builder) {
   return result_type;
 }
 
+StatusOr<llvm::Value*> EncodeSelfDescribingShapeConstant(
+    const Shape& shape, int32* shape_size, llvm::IRBuilder<>* ir_builder) {
+  string encoded_shape = shape.SerializeAsString();
+  if (encoded_shape.size() > std::numeric_limits<int32>::max()) {
+    return InternalError("Encoded shape size exceeded int32 size limit.");
+  }
+  *shape_size = static_cast<int32>(encoded_shape.size());
+  return ir_builder->CreateGlobalStringPtr(llvm_ir::AsStringRef(encoded_shape));
+}
+
+StatusOr<Shape> DecodeSelfDescribingShapeConstant(const void* shape_ptr,
+                                                  int32 size_bytes) {
+  Shape shape;
+  TF_RET_CHECK(shape.ParseFromArray(shape_ptr, size_bytes));
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(shape));
+  return shape;
+}
+
 namespace {
 
 // Recursively construct a multidimensional LLVM constant which represents the
@@ -163,36 +181,36 @@ llvm::Constant* LiteralToConstant(const Literal& literal, int64 dimension_index,
     llvm::Constant* value;
     switch (shape.element_type()) {
       case PRED:
-        value = llvm::ConstantInt::get(
-            ir_element_type, LiteralUtil::Get<bool>(literal, *multi_index));
+        value = llvm::ConstantInt::get(ir_element_type,
+                                       literal.Get<bool>(*multi_index));
         break;
       case U8:
-        value = llvm::ConstantInt::get(
-            ir_element_type, LiteralUtil::Get<uint8>(literal, *multi_index));
+        value = llvm::ConstantInt::get(ir_element_type,
+                                       literal.Get<uint8>(*multi_index));
         break;
       case S32:
-        value = llvm::ConstantInt::get(
-            ir_element_type, LiteralUtil::Get<int32>(literal, *multi_index));
+        value = llvm::ConstantInt::get(ir_element_type,
+                                       literal.Get<int32>(*multi_index));
         break;
       case U32:
-        value = llvm::ConstantInt::get(
-            ir_element_type, LiteralUtil::Get<uint32>(literal, *multi_index));
+        value = llvm::ConstantInt::get(ir_element_type,
+                                       literal.Get<uint32>(*multi_index));
         break;
       case S64:
-        value = llvm::ConstantInt::get(
-            ir_element_type, LiteralUtil::Get<int64>(literal, *multi_index));
+        value = llvm::ConstantInt::get(ir_element_type,
+                                       literal.Get<int64>(*multi_index));
         break;
       case U64:
-        value = llvm::ConstantInt::get(
-            ir_element_type, LiteralUtil::Get<uint64>(literal, *multi_index));
+        value = llvm::ConstantInt::get(ir_element_type,
+                                       literal.Get<uint64>(*multi_index));
         break;
       case F32:
-        value = llvm::ConstantFP::get(
-            ir_element_type, LiteralUtil::Get<float>(literal, *multi_index));
+        value = llvm::ConstantFP::get(ir_element_type,
+                                      literal.Get<float>(*multi_index));
         break;
       case F64:
-        value = llvm::ConstantFP::get(
-            ir_element_type, LiteralUtil::Get<double>(literal, *multi_index));
+        value = llvm::ConstantFP::get(ir_element_type,
+                                      literal.Get<double>(*multi_index));
         break;
       default:
         LOG(FATAL) << "unsupported type " << shape.element_type();
@@ -357,31 +375,9 @@ void EmitLogging(const char* tag, llvm::Value* value,
 
 void SetTbaaForInstruction(llvm::Instruction* instruction, Shape shape,
                            bool is_pointer_to) {
-  legacy_flags::LlvmUtilFlags* flags = legacy_flags::GetLlvmUtilFlags();
-  if (!flags->xla_emit_tbaa) {
-    return;
-  }
-
-  llvm::MDBuilder metadata_builder(instruction->getContext());
-  llvm::MDNode* root = metadata_builder.createTBAARoot("XLA TBAA");
-  string type_name;
-  if (is_pointer_to) {
-    type_name += "pointer-to ";
-  }
-  // Scalars do not have layout which makes it permissible to omit an explicit
-  // layout.  To make sure that equivalent scalar shapes have the same TBAA,
-  // remove the (meaningless) explicit layout if one is present.
-  if (ShapeUtil::Rank(shape) == 0) {
-    LayoutUtil::ClearLayout(&shape);
-  } else {
-    CHECK(shape.has_layout());
-  }
-  type_name += shape.ShortDebugString();
-  llvm::MDNode* tbaa_node =
-      metadata_builder.createTBAANode(llvm_ir::AsStringRef(type_name), root);
-  instruction->setMetadata(llvm::LLVMContext::MD_tbaa,
-                           metadata_builder.createTBAAStructTagNode(
-                               tbaa_node, tbaa_node, /*Offset=*/0));
+  // TODO(b/62903316): TBAA metadata causes LLVM to miscompile generated code,
+  // most likely because the generated metadata is incorrect.  Disable TBAA
+  // metadata while we resolve this.
 }
 
 void SetAlignmentMetadataForLoad(llvm::LoadInst* load, uint64_t alignment) {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
index 7b09c1f83145c2994c381686c2e6343d353becf7..d940c3fcbcfd08bd0e2a44b6721d75273c2aae5e 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
@@ -106,6 +106,19 @@ llvm::Type* PrimitiveTypeToIrType(PrimitiveType element_type,
 // if "shape" is [5 x [10 x f32]], the function returns [5 x [10 x float]].
 llvm::Type* ShapeToIrType(const Shape& shape, llvm::IRBuilder<>* ir_builder);
 
+// Returns a value that represents a pointer to a global string constant that
+// encodes the shape as a serialized protobuf.
+StatusOr<llvm::Value*> EncodeSelfDescribingShapeConstant(
+    const Shape& shape, int32* shape_size, llvm::IRBuilder<>* ir_builder);
+
+// Inverses the encoding of a Shape protobuf into an LLVM global variable.
+//
+// This is intended to be called from the runtime to decode the llvm::Constants
+// that are created via ConvertShapeToSelfDescribingConstant and subsequently
+// embedded into the program.
+StatusOr<Shape> DecodeSelfDescribingShapeConstant(const void* shape_ptr,
+                                                  int32 size_bytes);
+
 // Converts a given literal to an IR Constant. Literals have known constant
 // values at IR emission time.
 llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
index 9a128b2aa6f2d5e5650624f103c573e671335f7b..8839ec582df844f46f060e26917f15aa297cba3d 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/ops.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -51,8 +52,41 @@ LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
       shape_(target_array.GetShape()),
       ir_builder_(ir_builder) {}
 
+LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
+                         tensorflow::gtl::ArraySlice<IrArray> target_arrays,
+                         llvm::IRBuilder<>* ir_builder)
+    : body_emitter_([=](const llvm_ir::IrArray::Index array_index)
+                        -> ::tensorflow::Status {
+        // Convert target_element_generator to a BodyEmitter.
+        TF_ASSIGN_OR_RETURN(llvm::Value * target_element,
+                            target_element_generator(array_index));
+        if (target_arrays.size() == 1) {
+          target_arrays[0].EmitWriteArrayElement(array_index, target_element,
+                                                 ir_builder);
+          return tensorflow::Status::OK();
+        }
+
+        for (int64 i = 0; i < target_arrays.size(); ++i) {
+          target_arrays[i].EmitWriteArrayElement(
+              array_index, ir_builder_->CreateExtractValue(target_element, i),
+              ir_builder);
+        }
+        return tensorflow::Status::OK();
+      }),
+      ir_builder_(ir_builder) {
+  if (target_arrays.size() > 1) {
+    // The sanity check for multiple outputs.
+    shape_ = target_arrays[0].GetShape();
+    for (int64 i = 1; i < target_arrays.size(); ++i) {
+      const Shape& element_shape = target_arrays[i].GetShape();
+      CHECK(ShapeUtil::SameDimensions(shape_, element_shape));
+    }
+  } else {
+    shape_ = target_arrays[0].GetShape();
+  }
+}
+
 IrArray::Index LoopEmitter::EmitIndexAndSetExitBasicBlock() {
-  CHECK(!ShapeUtil::IsTuple(shape_));
   if (ShapeUtil::IsScalar(shape_)) {
     // No loop needed, so set exit_bb_ to nullptr.
     exit_bb_ = nullptr;
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
index 08171e9e9de294339359f86059f89dcf4939ddea..ab6b702c441e04f2c7988a3dcb9880a86ff95355 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
@@ -47,6 +47,10 @@ class LoopEmitter {
   // element of the given target array.
   LoopEmitter(const ElementGenerator& target_element_generator,
               const IrArray& target_array, llvm::IRBuilder<>* ir_builder);
+  // Same as previous method except emits multiple targets in an array.
+  LoopEmitter(const ElementGenerator& target_element_generator,
+              tensorflow::gtl::ArraySlice<IrArray> target_arrays,
+              llvm::IRBuilder<>* ir_builder);
   LoopEmitter(const LoopEmitter&) = delete;
   LoopEmitter& operator=(const LoopEmitter&) = delete;
   virtual ~LoopEmitter() = default;
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 131c2ee87b0e78a4f7e315bfbb2b2793c0a91fa1..45e37c6f65efcff81cbc72737348015ce43a944f 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/compiler/xla/legacy_flags/service_flags.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
@@ -46,13 +46,6 @@ namespace se = ::perftools::gputools;
 
 namespace xla {
 
-/* static */ StatusOr<std::unique_ptr<LocalService>> LocalService::NewService(
-    perftools::gputools::Platform* platform) {
-  ServiceOptions default_options;
-  default_options.set_platform(platform);
-  return NewService(default_options);
-}
-
 /* static */ StatusOr<std::unique_ptr<LocalService>> LocalService::NewService(
     const ServiceOptions& options) {
   perftools::gputools::Platform* platform = options.platform();
@@ -62,7 +55,6 @@ namespace xla {
 
   BackendOptions backend_options;
   backend_options.set_platform(platform)
-      .set_number_of_replicas(options.number_of_replicas())
       .set_intra_op_parallelism_threads(options.intra_op_parallelism_threads());
   TF_ASSIGN_OR_RETURN(std::unique_ptr<Backend> backend,
                       Backend::CreateBackend(backend_options));
@@ -70,15 +62,15 @@ namespace xla {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<Backend> compute_constant_backend,
                       CreateComputeConstantBackend());
   std::unique_ptr<LocalService> service(new LocalService(
-      std::move(backend), std::move(compute_constant_backend)));
+      options, std::move(backend), std::move(compute_constant_backend)));
   return std::move(service);
 }
 
-LocalService::LocalService(std::unique_ptr<Backend> execute_backend,
+LocalService::LocalService(const ServiceOptions& options,
+                           std::unique_ptr<Backend> execute_backend,
                            std::unique_ptr<Backend> compute_constant_backend)
-    : Service(std::move(execute_backend), std::move(compute_constant_backend)) {
-  runs_in_client_process_ = true;
-}
+    : Service(options, std::move(execute_backend),
+              std::move(compute_constant_backend)) {}
 
 namespace {
 // Returns the space required to allocate a shape. If
@@ -152,9 +144,13 @@ StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
   // Construct computation layout from the argument layouts.
   auto module_config = MakeUnique<HloModuleConfig>(*program_shape);
   module_config->set_has_hybrid_result(has_hybrid_result);
-  module_config->set_replica_count(execute_backend_->Replicas().size());
-  legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
-  if (flags->xla_hlo_profile) {
+  module_config->set_replica_count(options_.number_of_replicas());
+  module_config->set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+  if (execute_backend_->eigen_intra_op_thread_pool() != nullptr) {
+    module_config->set_intra_op_parallelism_threads(
+        execute_backend_->eigen_intra_op_thread_pool()->NumThreads());
+  }
+  if (module_config->debug_options().xla_hlo_profile()) {
     module_config->enable_hlo_profiling(true);
   }
   auto* computation_layout = module_config->mutable_entry_computation_layout();
diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h
index 767a3ab697febb283af448b25369445152381a5e..13797ec0450bd0eb2030b111464c42e966792266 100644
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@@ -35,11 +35,7 @@ namespace xla {
 // in the same process as the client.
 class LocalService : public Service {
  public:
-  // Factory for creating a LocalService. The parameter platform is the platform
-  // that the service should target. If platform is null then the default
-  // platform is used.
-  static StatusOr<std::unique_ptr<LocalService>> NewService(
-      perftools::gputools::Platform* platform);
+  // Factory for creating a LocalService.
   static StatusOr<std::unique_ptr<LocalService>> NewService(
       const ServiceOptions& options);
 
@@ -60,7 +56,8 @@ class LocalService : public Service {
       const Shape* result_layout, int device_ordinal, bool has_hybrid_result);
 
  private:
-  explicit LocalService(std::unique_ptr<Backend> backend,
+  explicit LocalService(const ServiceOptions& options,
+                        std::unique_ptr<Backend> backend,
                         std::unique_ptr<Backend> compute_constant_backend);
   LocalService(const LocalService&) = delete;
   void operator=(const LocalService&) = delete;
diff --git a/tensorflow/compiler/xla/service/logical_buffer.cc b/tensorflow/compiler/xla/service/logical_buffer.cc
index d24a592f46ed2dd8fd9c927e8ed9816771a7396c..3e843b202997a09f76993acd4d02f4de9aae9854 100644
--- a/tensorflow/compiler/xla/service/logical_buffer.cc
+++ b/tensorflow/compiler/xla/service/logical_buffer.cc
@@ -27,7 +27,7 @@ limitations under the License.
 namespace xla {
 
 string LogicalBuffer::ToString() const {
-  return tensorflow::strings::StrCat(instruction_->FullyQualifiedName(), "[",
+  return tensorflow::strings::StrCat(instruction_->name(), "[",
                                      tensorflow::str_util::Join(index_, ","),
                                      "](#", id_, " @", color_.value(), ")");
 }
diff --git a/tensorflow/compiler/xla/service/logical_buffer.h b/tensorflow/compiler/xla/service/logical_buffer.h
index 566cd01ea437433e5e328ad523090e682a799233..a9f6688612002f320541b7c1d20df4dd41ea971a 100644
--- a/tensorflow/compiler/xla/service/logical_buffer.h
+++ b/tensorflow/compiler/xla/service/logical_buffer.h
@@ -95,11 +95,13 @@ class LogicalBuffer {
 
   // Functions which return the size and alignment of a logical buffer in bytes.
   using SizeFunction = std::function<int64(const LogicalBuffer&)>;
-  using AlignmentFunction = std::function<int64(const LogicalBuffer&)>;
+  using AlignmentFunction = std::function<int64(LogicalBuffer::Color)>;
 
-  LogicalBuffer(HloInstruction* instruction, const ShapeIndex& index, Id id,
-                Color color)
-      : instruction_(instruction), index_(index), id_(id), color_(color) {}
+  LogicalBuffer(HloInstruction* instruction, const ShapeIndex& index, Id id)
+      : instruction_(instruction),
+        index_(index),
+        id_(id),
+        color_(kInvalidColor) {}
 
   Id id() const { return id_; }
 
@@ -112,8 +114,19 @@ class LogicalBuffer {
 
   // Return the color of the logical buffer. Differently colored buffers can
   // not be parts of the same allocation.
-  Color color() const { return color_; }
-  void set_color(Color color) { color_ = color; }
+  Color color() const {
+    CHECK_NE(color_, kInvalidColor)
+        << "Should not query the color of a buffer that was never colored";
+    return color_;
+  }
+
+  void set_color(Color color) {
+    CHECK_NE(color, kInvalidColor)
+        << "Should not set the color of a buffer to the invalid color";
+    color_ = color;
+  }
+
+  bool has_color() const { return color_ != kInvalidColor; }
 
   // Return the shape of the buffer. This reference points into the shape field
   // of the instruction defining the buffer.  Therefore, the returned shape will
@@ -143,6 +156,8 @@ class LogicalBuffer {
   static LogicalBufferProto::Location ToLocationProto(
       const HloInstruction& instruction, const ShapeIndex& index);
 
+  const Color kInvalidColor = Color(-1);
+
  private:
   HloInstruction* instruction_;
   ShapeIndex index_;
diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc
index 4014856b9b243831a087962484128a121680eb1b..069f85af721228c8f5d40cf243eea7f1e5173c62 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer.cc
@@ -29,7 +29,11 @@ string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) {
     return root;
   } else {
     tensorflow::strings::StrAppend(&root, separator_, *count);
+    // Increment lookup under old 'root' name.
     (*count)++;
+    // Initialize count under new 'root' name.
+    count = &(generated_names_[root]);
+    *count = 1;
     return root;
   }
 }
diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a3079a0c033844666eeaa3771b467f738af7fb74
--- /dev/null
+++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc
@@ -0,0 +1,109 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+StatusOr<bool> ReducePrecisionInsertion::Run(HloModule* module) {
+  bool changed = false;
+  VLOG(1) << "Running ReducePrecisionInsertion pass on " << module->name();
+
+  for (auto& computation : module->computations()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
+    std::vector<HloInstruction*> instructions_to_suffix;
+
+    for (auto& instruction : computation->instructions()) {
+      VLOG(3) << "Visited instruction: " << instruction->ToString();
+
+      // For now, ReducePrecision is only implemented for F32 arrays, so this
+      // ignore instructions that produce other data.  In particular, this
+      // currently ignores instructions producing tuples, even if those tuples
+      // contain F32 arrays inside them.  The assumption is that in most cases
+      // equivalent behavior can be obtained by adding ReducePrecision
+      // instructions after the instructions that pull the F32 arrays out of
+      // the tuples.
+      if (instruction->shape().element_type() == PrimitiveType::F32 &&
+          !ShapeUtil::IsScalar(instruction->shape()) &&
+          should_reduce_output_precision_(instruction->opcode())) {
+        instructions_to_suffix.push_back(instruction.get());
+      }
+    }
+
+    for (auto& instruction : instructions_to_suffix) {
+      HloInstruction* reduced =
+          computation->AddInstruction(HloInstruction::CreateReducePrecision(
+              instruction->shape(), instruction, exponent_bits_,
+              mantissa_bits_));
+      TF_RETURN_IF_ERROR(
+          computation->ReplaceUsesOfInstruction(instruction, reduced));
+      VLOG(2) << "Inserted new op after instruction: "
+              << instruction->ToString();
+      changed = true;
+    }
+  }
+  return changed;
+}
+
+ReducePrecisionInsertion::OpcodeFilterFunction
+ReducePrecisionInsertion::make_filter_function(
+    const HloReducePrecisionOptions& reduce_precision_options) {
+  // Implement the filter function with a lookup table.
+  std::vector<bool> filter(HloOpcodeCount(), false);
+  for (const auto& opcode : reduce_precision_options.opcodes_to_suffix()) {
+    filter[opcode] = true;
+  }
+  return [filter](const HloOpcode opcode) {
+    return filter[static_cast<unsigned int>(opcode)];
+  };
+}
+
+HloReducePrecisionOptions ReducePrecisionInsertion::make_options_proto(
+    const HloReducePrecisionOptions::PassTiming pass_timing,
+    const int exponent_bits, const int mantissa_bits,
+    const OpcodeFilterFunction& should_reduce_output_precision) {
+  HloReducePrecisionOptions options;
+  options.set_pass_timing(pass_timing);
+  options.set_exponent_bits(exponent_bits);
+  options.set_mantissa_bits(mantissa_bits);
+  for (uint32_t opcode = 0; opcode < HloOpcodeCount(); opcode++) {
+    if (should_reduce_output_precision(static_cast<HloOpcode>(opcode))) {
+      options.add_opcodes_to_suffix(opcode);
+    }
+  }
+  return options;
+}
+
+bool ReducePrecisionInsertion::AddPasses(
+    HloPassPipeline* pipeline, const DebugOptions& debug_options,
+    const HloReducePrecisionOptions::PassTiming pass_timing) {
+  bool passes_added = false;
+  for (const auto& pass_options :
+       debug_options.hlo_reduce_precision_options()) {
+    if (pass_options.pass_timing() == pass_timing) {
+      pipeline->AddPass<ReducePrecisionInsertion>(pass_options);
+      passes_added = true;
+    }
+  }
+  return passes_added;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.h b/tensorflow/compiler/xla/service/reduce_precision_insertion.h
new file mode 100644
index 0000000000000000000000000000000000000000..a6fcee0039b449ad265b26fa6acfc912e3ab5731
--- /dev/null
+++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.h
@@ -0,0 +1,95 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_REDUCE_PRECISION_INSERTION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_REDUCE_PRECISION_INSERTION_H_
+
+#include "tensorflow/compiler/xla/service/buffer_liveness.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+
+namespace xla {
+
+// HLO pass which inserts reduce-precision instructions into the HLO graph, for
+// purposes of experimenting with the effects of reduced-precision storage of
+// intermediate values.
+class ReducePrecisionInsertion : public HloPassInterface {
+  using OpcodeFilterFunction = std::function<bool(HloOpcode)>;
+
+ public:
+  // The exponent_bits and mantissa_bits arguments specify the parameters of
+  // the instructions to insert.  The instructions will be inserted after each
+  // instruction with an opcode for which the should_reduce_output_precision
+  // function returns true and the output type is F32.
+  explicit ReducePrecisionInsertion(
+      const int exponent_bits, const int mantissa_bits,
+      const OpcodeFilterFunction& should_reduce_output_precision)
+      : exponent_bits_(exponent_bits),
+        mantissa_bits_(mantissa_bits),
+        should_reduce_output_precision_(should_reduce_output_precision) {}
+
+  // Version of the constructor that takes an HloReducePrecisionOptions proto
+  // rather than explicitly-enumerated parameters, for convenience when
+  // creating passes based on DebugOptions.
+  explicit ReducePrecisionInsertion(
+      const HloReducePrecisionOptions& reduce_precision_options)
+      : exponent_bits_(reduce_precision_options.exponent_bits()),
+        mantissa_bits_(reduce_precision_options.mantissa_bits()),
+        should_reduce_output_precision_(
+            make_filter_function(reduce_precision_options)) {}
+
+  ~ReducePrecisionInsertion() override{};
+
+  tensorflow::StringPiece name() const override {
+    return "reduce-precision-insertion";
+  }
+
+  // Run the pass on the given module. Returns whether the module was changed
+  // (reduce-precision instructions were inserted).
+  StatusOr<bool> Run(HloModule* module) override;
+
+  // Convert between the (inconvenient) xla.proto HloReducePrecisionOptions
+  // representation and OpcodeFilterFunction functions.
+  static OpcodeFilterFunction make_filter_function(
+      const HloReducePrecisionOptions& reduce_precision_options);
+  static HloReducePrecisionOptions make_options_proto(
+      const HloReducePrecisionOptions::PassTiming pass_timing,
+      const int exponent_bits, const int mantissa_bits,
+      const OpcodeFilterFunction& should_reduce_output_precision);
+
+  // Add ReducePrecisionInsertion passes to an HloPassPipeline based on the list
+  // of HloReducePrecisionOptions in a DebugOptions proto.  Returns true if any
+  // passes were added.
+  static bool AddPasses(
+      HloPassPipeline* pipeline, const DebugOptions& debug_options,
+      const HloReducePrecisionOptions::PassTiming pass_timing);
+
+ private:
+  // Parameters for the precision reduction to be added.
+  const int exponent_bits_;
+  const int mantissa_bits_;
+
+  // Function to determine (from the opcode) whether a given instruction should
+  // have a reduce-precision instruction inserted in its output stream.
+  const OpcodeFilterFunction should_reduce_output_precision_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_REDUCE_PRECISION_INSERTION_H_
diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..80717ec2e3f43a968b04dae1367cb7f78fa08b25
--- /dev/null
+++ b/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc
@@ -0,0 +1,186 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace op = xla::testing::opcode_matchers;
+
+namespace xla {
+
+using ::testing::UnorderedElementsAre;
+
+class ReducePrecisionInsertionTest : public HloTestBase {
+ protected:
+  bool InsertOps(HloModule* module,
+                 const std::function<bool(HloOpcode)>& filter) {
+    ReducePrecisionInsertion op_insertion(5, 10, filter);
+    StatusOr<bool> result = op_insertion.Run(module);
+    EXPECT_IS_OK(result.status());
+    return result.ValueOrDie();
+  }
+};
+
+TEST_F(ReducePrecisionInsertionTest, RootInstruction) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {4});
+
+  // Create a simple graph with a parameter feeding a unary cosine function.
+  HloInstruction* a =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+  HloInstruction* b = builder.AddInstruction(
+      HloInstruction::CreateUnary(shape, HloOpcode::kCos, a));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  // Confirm expected state before adding ops.
+  EXPECT_EQ(computation->root_instruction(), b);
+
+  EXPECT_TRUE(InsertOps(module.get(),
+                        [](HloOpcode h) { return h == HloOpcode::kCos; }));
+
+  // Confirm expected graph after adding ops.
+  EXPECT_THAT(computation->root_instruction(), op::ReducePrecision());
+  EXPECT_EQ(computation->root_instruction()->operand(0), b);
+}
+
+TEST_F(ReducePrecisionInsertionTest, NonRootInstruction) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {4});
+
+  // Create a graph with two parameters feeding into unary cosine functions,
+  // and the output of those feeds into an add function.  Feeding the outputs
+  // from the suffixed cosine functions into a binary add function allows us to
+  // confirm that the separate operand streams are not crossed when the new
+  // instructions are inserted.
+  HloInstruction* a =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+  HloInstruction* a_cos = builder.AddInstruction(
+      HloInstruction::CreateUnary(shape, HloOpcode::kCos, a));
+
+  HloInstruction* b =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "b"));
+  HloInstruction* b_cos = builder.AddInstruction(
+      HloInstruction::CreateUnary(shape, HloOpcode::kCos, b));
+
+  HloInstruction* c = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a_cos, b_cos));
+
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  // Confirm expected graph before adding ops.
+  EXPECT_EQ(c->operand(0), a_cos);
+  EXPECT_EQ(c->operand(1), b_cos);
+
+  EXPECT_TRUE(InsertOps(module.get(),
+                        [](HloOpcode h) { return h == HloOpcode::kCos; }));
+
+  // Confirm expected graph after adding ops.
+  EXPECT_THAT(c->operand(0), op::ReducePrecision());
+  EXPECT_EQ(c->operand(0)->operand(0), a_cos);
+  EXPECT_THAT(c->operand(1), op::ReducePrecision());
+  EXPECT_EQ(c->operand(1)->operand(0), b_cos);
+}
+
+TEST_F(ReducePrecisionInsertionTest, OutputIsNotFloat) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(S32, {4});
+  HloInstruction* x =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
+  HloInstruction* y = builder.AddInstruction(
+      HloInstruction::CreateUnary(shape, HloOpcode::kCos, x));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  // Confirm expected graph before adding ops.
+  EXPECT_THAT(x->users(), UnorderedElementsAre(y));
+  EXPECT_EQ(computation->root_instruction(), y);
+
+  // Since none of the instructions produce F32 data, this should not change
+  // the graph.
+  EXPECT_FALSE(InsertOps(module.get(), [](HloOpcode) { return true; }));
+
+  // Confirm that graph has not changed.
+  EXPECT_THAT(x->users(), UnorderedElementsAre(y));
+  EXPECT_EQ(computation->root_instruction(), y);
+}
+
+TEST_F(ReducePrecisionInsertionTest, ShouldReduceOutputPrecisionIsFalse) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {4});
+  HloInstruction* x =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
+  HloInstruction* y = builder.AddInstruction(
+      HloInstruction::CreateUnary(shape, HloOpcode::kCos, x));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  // Confirm expected graph before adding ops.
+  EXPECT_THAT(x->users(), UnorderedElementsAre(y));
+  EXPECT_EQ(computation->root_instruction(), y);
+
+  // Since none of the instructions match the should_reduce_output_precision
+  // function, this should not change the graph.
+  EXPECT_FALSE(InsertOps(module.get(), [](HloOpcode h) { return false; }));
+
+  // Confirm that graph has not changed.
+  EXPECT_THAT(x->users(), UnorderedElementsAre(y));
+  EXPECT_EQ(computation->root_instruction(), y);
+}
+
+TEST_F(ReducePrecisionInsertionTest, InsertionIsNotRecursive) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {4});
+  HloInstruction* a =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+  HloInstruction* b = builder.AddInstruction(
+      HloInstruction::CreateReducePrecision(shape, a, 9, 23));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  // Confirm expected state before adding ops.
+  EXPECT_EQ(computation->root_instruction(), b);
+
+  // This should insert a new ReducePrecision after the existing one, but
+  // should not then recurse by adding another after the just-inserted one.
+  EXPECT_TRUE(InsertOps(module.get(), [](HloOpcode h) {
+    return h == HloOpcode::kReducePrecision;
+  }));
+
+  // Confirm expected graph after adding ops.
+  EXPECT_THAT(computation->root_instruction(), op::ReducePrecision());
+  EXPECT_EQ(computation->root_instruction()->operand(0), b);
+}
+
+}  // namespace xla
+
+int main(int argc, char** argv) {
+  return xla::ParseDebugOptionsFlagsAndRunTests(argc, argv);
+}
diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc
index 2d35ba5e5480511d93a85d8e54ad8983551a329c..1c648d58c7fca25f2cc9069b12532007083cc02d 100644
--- a/tensorflow/compiler/xla/service/reshape_mover.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover.cc
@@ -312,10 +312,17 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
 
 StatusOr<bool> ReshapeMover::Run(HloModule* module) {
   bool changed = false;
-  for (const auto& comp : module->computations()) {
+  std::vector<HloComputation*> computations;
+  for (auto& computation : module->computations()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
+    computations.push_back(computation.get());
+  }
+  for (const auto& comp : computations) {
     for (HloInstruction* instruction : comp->MakeInstructionPostOrder()) {
       TF_ASSIGN_OR_RETURN(bool did_change,
-                          TrySinkReshapeOrTranspose(comp.get(), instruction));
+                          TrySinkReshapeOrTranspose(comp, instruction));
       changed |= did_change;
     }
   }
diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc
index 9becdb2bed480d610e658303ee7deff4cf7d2743..1589d52a256df1914201c866859008c0f1df8a8f 100644
--- a/tensorflow/compiler/xla/service/reshape_mover_test.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc
@@ -84,7 +84,7 @@ TEST_F(ReshapeMoverTest, 1ConstantAnd1ReshapesOnRngNotMoved) {
       builder.AddInstruction(HloInstruction::CreateReshape(root_shape, rng0));
 
   auto const1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateFromShape(root_shape)));
+      HloInstruction::CreateConstant(Literal::CreateFromShape(root_shape)));
 
   builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, const1));
@@ -179,9 +179,8 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMoved) {
 TEST_F(ReshapeMoverTest, 1ConstantAnd2ReshapesMoved) {
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {2, 3});
-  auto const0 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR2<bool>(
-          {{true, true, false}, {false, false, true}})));
+  auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR2<bool>({{true, true, false}, {false, false, true}})));
 
   auto param1 = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {1, 3, 1, 2}), "param1"));
@@ -263,12 +262,12 @@ TEST_F(ReshapeMoverTest, 2TrivialConstantReshapeNotMoved) {
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {2, 3});
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1, 2, 3}, {4, 5, 6}})));
+      Literal::CreateR2<float>({{1, 2, 3}, {4, 5, 6}})));
   auto reshape0 =
       builder.AddInstruction(HloInstruction::CreateReshape(root_shape, const0));
 
   auto const1 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1, 2, 3}, {4, 5, 6}})));
+      Literal::CreateR2<float>({{1, 2, 3}, {4, 5, 6}})));
   auto reshape1 =
       builder.AddInstruction(HloInstruction::CreateReshape(root_shape, const1));
 
@@ -318,7 +317,7 @@ TEST_F(ReshapeMoverTest, 1NonTrivialReshapeMoved) {
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {1, 3, 1, 2}), "param0"));
   auto const1 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1, 2, 3}, {4, 5, 6}})));
+      Literal::CreateR2<float>({{1, 2, 3}, {4, 5, 6}})));
   auto reshape0 =
       builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param0));
   builder.AddInstruction(HloInstruction::CreateBinary(
@@ -352,16 +351,15 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossFusion) {
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, reshape1));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
-  auto fusion = computation->AddInstruction(HloInstruction::CreateFusion(
-      add->shape(), HloInstruction::FusionKind::kLoop, add));
-  TF_CHECK_OK(computation->ReplaceInstruction(add, fusion));
+  HloModule module(TestName());
+  auto computation = module.AddEntryComputation(builder.Build());
+  computation->CreateFusionInstruction({add},
+                                       HloInstruction::FusionKind::kLoop);
 
   EXPECT_THAT(computation->root_instruction(),
               op::Fusion(op::Reshape(param0), op::Reshape(param1)));
 
-  EXPECT_TRUE(ReshapeMover().Run(module.get()).ValueOrDie());
+  EXPECT_TRUE(ReshapeMover().Run(&module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Fusion(param0, param1)));
@@ -464,7 +462,7 @@ TEST_F(ReshapeMoverTest, ImplicitlyBroadcastReshapeIsNotMovedBug37787999) {
   auto reshape = builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {128, 1}), param0));
   Array2D<float> a(128, 1024);
-  auto literal = LiteralUtil::CreateR2FromArray2D<float>(a);
+  auto literal = Literal::CreateR2FromArray2D<float>(a);
   auto constant = builder.AddInstruction(
       HloInstruction::CreateConstant(std::move(literal)));
   auto multiply = builder.AddInstruction(HloInstruction::CreateBinary(
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 85ca7e4e59ce9e69a671b829f3c2c3a4834a99ce..25e3f57dfb1c994bd6c96ed6ce18190a0088e963 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/service_flags.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
@@ -141,12 +141,13 @@ int ServiceOptions::intra_op_parallelism_threads() const {
   }
   BackendOptions backend_options;
   backend_options.set_platform(platform);
-  backend_options.set_number_of_replicas(options.number_of_replicas());
   TF_ASSIGN_OR_RETURN(execute_backend, Backend::CreateBackend(backend_options));
+
   TF_ASSIGN_OR_RETURN(std::unique_ptr<Backend> compute_constant_backend,
                       CreateComputeConstantBackend());
-  std::unique_ptr<Service> service(new Service(
-      std::move(execute_backend), std::move(compute_constant_backend)));
+  std::unique_ptr<Service> service(
+      new Service(options, std::move(execute_backend),
+                  std::move(compute_constant_backend)));
   return std::move(service);
 }
 
@@ -158,24 +159,25 @@ Service::CreateComputeConstantBackend() {
     if (platform->id() == se::host::kHostPlatformId) {
       BackendOptions backend_options;
       backend_options.set_platform(platform);
-      backend_options.set_number_of_replicas(1);
       return Backend::CreateBackend(backend_options);
     }
   }
   return NotFound("CPU platform not found");
 }
 
-/* static */ Compiler::HloDumper Service::MakeHloDumper() {
-  return [](const HloModule& module, const string& label) {
-    return Executable::DumpExecutedHlo(module, label, /*profile=*/nullptr);
-  };
-}
-
-Service::Service(std::unique_ptr<Backend> execute_backend,
+Service::Service(const ServiceOptions& options,
+                 std::unique_ptr<Backend> execute_backend,
                  std::unique_ptr<Backend> compute_constant_backend)
-    : execute_backend_(std::move(execute_backend)),
+    : options_(options),
+      execute_backend_(std::move(execute_backend)),
       compute_constant_backend_(std::move(compute_constant_backend)) {
+  CHECK(options_.number_of_replicas() > 0);
+
   if (execute_backend_) {
+    if (execute_backend_->device_count() > 0) {
+      CHECK_GE(execute_backend_->device_count(), options_.number_of_replicas())
+          << "Requested more replicas than there are devices.";
+    }
     LOG(INFO) << Printf(
         "XLA service %p executing computations on platform %s. Devices:", this,
         execute_backend_->platform()->Name().c_str());
@@ -285,7 +287,7 @@ StatusOr<std::vector<const Allocation*>> Service::ResolveAndValidateArguments(
 StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
     tensorflow::gtl::ArraySlice<const Allocation*> arguments,
-    const ExecutionOptions& execution_options, Backend* backend) {
+    const ExecutionOptions& execution_options) {
   auto module_config = MakeUnique<HloModuleConfig>(program_shape);
   auto* computation_layout = module_config->mutable_entry_computation_layout();
 
@@ -320,12 +322,11 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
             shape_with_output_layout));
   }
 
-  legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
-  if (flags->xla_hlo_profile) {
+  if (execution_options.debug_options().xla_hlo_profile()) {
     module_config->enable_hlo_profiling(true);
   }
 
-  module_config->set_replica_count(backend->Replicas().size());
+  module_config->set_replica_count(options_.number_of_replicas());
   module_config->set_seed(execution_options.seed());
   module_config->set_debug_options(execution_options.debug_options());
 
@@ -341,23 +342,25 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
 
   // Dump computation proto state if flag is set.
   std::vector<std::unique_ptr<SessionModule>> session_modules;
-  legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
-  const string& directory_path = flags->xla_dump_computations_to;
-  const string& other_directory_path = flags->xla_dump_executions_to;
-  if ((!directory_path.empty() || !other_directory_path.empty())) {
-    for (int64 i = 0; i < versioned_handles.size(); ++i) {
-      TF_ASSIGN_OR_RETURN(std::unique_ptr<SessionModule> session_module,
-                          computation_tracker_.SnapshotComputation(
-                              versioned_handles[i].handle));
-      if (!directory_path.empty()) {
-        string filename = Printf("computation_%lld__%s__version_%lld",
-                                 versioned_handles[i].handle.handle(),
-                                 session_module->entry().name().c_str(),
-                                 versioned_handles[i].version);
-        TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename,
-                                                       *session_module));
-        session_modules.push_back(std::move(session_module));
-      }
+  for (int64 i = 0; i < versioned_handles.size(); ++i) {
+    const string& directory_path =
+        module_configs[i]->debug_options().xla_dump_computations_to();
+    const string& other_directory_path =
+        module_configs[i]->debug_options().xla_dump_executions_to();
+    if (directory_path.empty() && other_directory_path.empty()) {
+      continue;
+    }
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<SessionModule> session_module,
+        computation_tracker_.SnapshotComputation(versioned_handles[i].handle));
+    if (!directory_path.empty()) {
+      string filename = Printf("computation_%lld__%s__version_%lld",
+                               versioned_handles[i].handle.handle(),
+                               session_module->entry().name().c_str(),
+                               versioned_handles[i].version);
+      TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename,
+                                                     *session_module));
+      session_modules.push_back(std::move(session_module));
     }
   }
 
@@ -378,14 +381,12 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
     modules.push_back(std::move(module));
   }
 
-  Compiler::HloDumper hlo_dumper = MakeHloDumper();
   TF_ASSIGN_OR_RETURN(
       std::vector<std::unique_ptr<Executable>> executables,
-      backend->compiler()->Compile(std::move(modules), hlo_dumper,
-                                   std::move(executors)));
+      backend->compiler()->Compile(std::move(modules), std::move(executors)));
 
-  if (!other_directory_path.empty()) {
-    for (size_t i = 0; i < versioned_handles.size(); ++i) {
+  for (size_t i = 0; i < versioned_handles.size(); ++i) {
+    if (!module_configs[i]->debug_options().xla_dump_executions_to().empty()) {
       executables[i]->set_session_module(std::move(session_modules[i]));
     }
   }
@@ -405,9 +406,10 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
 
   // Dump computation proto state if flag is set.
   std::unique_ptr<SessionModule> session_module;
-  legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
-  const string& directory_path = flags->xla_dump_computations_to;
-  const string& other_directory_path = flags->xla_dump_executions_to;
+  const string& directory_path =
+      module_config->debug_options().xla_dump_computations_to();
+  const string& other_directory_path =
+      module_config->debug_options().xla_dump_executions_to();
   if (!executable_for_compute_constant &&
       (!directory_path.empty() || !other_directory_path.empty())) {
     TF_ASSIGN_OR_RETURN(
@@ -429,15 +431,9 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
                                           /*include_unreachable_instructions=*/
                                           !executable_for_compute_constant));
 
-  Compiler::HloDumper hlo_dumper = MakeHloDumper();
-  if (executable_for_compute_constant &&
-      !flags->xla_hlo_graph_for_compute_constant) {
-    hlo_dumper = [](const HloModule&, const string&) {};
-  }
-
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
-      backend->compiler()->Compile(std::move(module), hlo_dumper, executor));
+      backend->compiler()->Compile(std::move(module), executor));
 
   if (!other_directory_path.empty()) {
     executable->set_session_module(std::move(session_module));
@@ -495,47 +491,55 @@ Service::ExecuteParallelAndRegisterResult(
     tensorflow::gtl::ArraySlice<
         std::vector<perftools::gputools::DeviceMemoryBase>>
         arguments,
-    Backend* backend,
-    tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*> executors,
+    Backend* backend, tensorflow::gtl::ArraySlice<DeviceHandle> device_handles,
     tensorflow::gtl::ArraySlice<string> result_tags) {
-  // TODO(b/33943292): Support for replication when using multiple computations.
-  TF_RET_CHECK(backend->Replicas().size() == 1);
-
-  // Set up streams.
+  // Streams where the computation are launched, so we can wait on the streams
+  // to complete.
   std::vector<Pool<se::Stream>::SmartPtr> streams;
 
-  for (se::StreamExecutor* executor : executors) {
-    TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
-                        backend->BorrowStream(executor));
-    streams.push_back(std::move(stream));
-  }
-
-  // Set up run options.
-  std::vector<ServiceExecutableRunOptions> run_options;
-  for (const Pool<se::Stream>::SmartPtr& stream : streams) {
-    ExecutableRunOptions options;
-    options.set_stream(stream.get());
-    options.set_allocator(backend->memory_allocator());
-    options.set_inter_op_thread_pool(backend->inter_op_thread_pool());
-    options.set_intra_op_thread_pool(
-        backend->eigen_intra_op_thread_pool_device());
-    run_options.emplace_back(options, backend->StreamBorrower());
-  }
-
-  // Asynchronously launch all executables.
+  // Global data handles for the computation results, one for each computation.
   std::vector<GlobalDataHandle> result_handles;
-  for (tensorflow::gtl::ArraySlice<Executable*>::size_type i = 0;
-       i < executables.size(); i++) {
-    TF_ASSIGN_OR_RETURN(
-        perftools::gputools::DeviceMemoryBase result,
-        executables[i]->ExecuteAsyncOnStream(&run_options[i], arguments[i]));
-    result_handles.push_back(allocation_tracker_.Register(
-        backend, executors[i]->device_ordinal(), result,
-        executables[i]->result_shape(), result_tags[i]));
+
+  TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment,
+                      backend->computation_placer()->AssignDevices(
+                          options_.number_of_replicas(), executables.size()));
+
+  for (int64 i = 0; i < executables.size(); i++) {
+    // Stream executors for the replicas of the current computation.
+    TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*backend, device_handles[i]));
+    for (int64 replica = 0; replica < replicas.size(); ++replica) {
+      TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
+                          backend->BorrowStream(replicas[replica]));
+      streams.push_back(std::move(stream));
+
+      // Set up run options.
+      ExecutableRunOptions options;
+      options.set_stream(streams.back().get());
+      options.set_allocator(backend->memory_allocator());
+      options.set_inter_op_thread_pool(backend->inter_op_thread_pool());
+      options.set_intra_op_thread_pool(
+          backend->eigen_intra_op_thread_pool_device());
+      options.set_device_assignment(&device_assignment);
+      ServiceExecutableRunOptions run_options(options,
+                                              backend->StreamBorrower());
+
+      // Asynchronously launch the computation.
+      TF_ASSIGN_OR_RETURN(
+          perftools::gputools::DeviceMemoryBase result,
+          executables[i]->ExecuteAsyncOnStream(&run_options, arguments[i]));
+
+      // All replicas share the same device address for the result allocation,
+      // so only one of the replicas need to register the result handle.
+      if (replica == 0) {
+        result_handles.push_back(allocation_tracker_.Register(
+            backend, replicas[0]->device_ordinal(), result,
+            executables[i]->result_shape(), result_tags[i]));
+      }
+    }
   }
 
   // Wait for all executions to complete.
-  for (int64 i = 0; i < result_handles.size(); ++i) {
+  for (int64 i = 0; i < streams.size(); ++i) {
     if (!streams[i]->BlockHostUntilDone()) {
       return InternalError("failed to complete execution for stream %lld", i);
     }
@@ -550,17 +554,23 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
         arguments,
     Backend* backend, perftools::gputools::StreamExecutor* executor,
     const string& result_tag, ExecutionProfile* profile) {
-  TF_RET_CHECK(!backend->Replicas().empty());
-
   // Set up streams.
   std::vector<Pool<se::Stream>::SmartPtr> streams;
 
-  for (se::StreamExecutor* executor : backend->Replicas()) {
+  TF_ASSIGN_OR_RETURN(auto replicas,
+                      Replicas(*backend, SingleComputationDeviceHandle()));
+  TF_RET_CHECK(!replicas.empty());
+  for (se::StreamExecutor* executor : replicas) {
     TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
                         backend->BorrowStream(executor));
     streams.push_back(std::move(stream));
   }
 
+  TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment,
+                      backend->computation_placer()->AssignDevices(
+                          options_.number_of_replicas(),
+                          /*computation_count=*/1));
+
   // Set up run options.
   std::vector<ServiceExecutableRunOptions> run_options;
   for (const Pool<se::Stream>::SmartPtr& stream : streams) {
@@ -570,19 +580,20 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
     options.set_inter_op_thread_pool(backend->inter_op_thread_pool());
     options.set_intra_op_thread_pool(
         backend->eigen_intra_op_thread_pool_device());
+    options.set_device_assignment(&device_assignment);
     run_options.emplace_back(options, backend->StreamBorrower(),
                              backend->inter_op_thread_pool());
   }
 
   perftools::gputools::DeviceMemoryBase result;
-  if (backend->Replicas().size() == 1) {
+  if (options_.number_of_replicas() == 1) {
     TF_ASSIGN_OR_RETURN(
         result, executable->ExecuteOnStreamWrapper<se::DeviceMemoryBase>(
                     &run_options[0], profile, arguments));
   } else {
     std::vector<
         tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>>
-        repeated_arguments(backend->Replicas().size(), arguments);
+        repeated_arguments(options_.number_of_replicas(), arguments);
 
     TF_ASSIGN_OR_RETURN(auto results, executable->ExecuteOnStreams(
                                           run_options, repeated_arguments));
@@ -610,25 +621,26 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
   std::vector<VersionedComputationHandle> versioned_handles;
   std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
   std::vector<string> computation_names;
+  std::vector<DeviceHandle> device_handles;
 
-  if (arg->requests_size() > execute_backend_->stream_executors().size()) {
+  if (arg->requests_size() * options_.number_of_replicas() >
+      execute_backend_->device_count()) {
     return FailedPrecondition(
         "there are not enough stream executors to execute %d computations",
         arg->requests_size());
   }
 
   for (int64 i = 0; i < arg->requests_size(); ++i) {
-    // Get the stream executor on which the computation will run. Select the
-    // specific device if requested, otherwise select the i'th device from the
-    // list of available stream executors.
-    se::StreamExecutor* executor;
-    if (arg->requests(i).has_device_handle()) {
-      executor =
-          execute_backend_
-              ->stream_executors()[arg->requests(i).device_handle().handle()];
-    } else {
-      executor = execute_backend_->stream_executors()[i];
+    // Get the stream executor for the i'th computation. This stream executor
+    // is one of the executors to run the replicated computation.
+    if (!arg->requests(i).has_device_handle()) {
+      return FailedPrecondition(
+          "device handles must be given to execute parallel computations");
     }
+    TF_ASSIGN_OR_RETURN(
+        auto replicas,
+        Replicas(*execute_backend_, arg->requests(i).device_handle()));
+    se::StreamExecutor* executor = replicas[0];
     CHECK(executor != nullptr);
 
     // Resolve the UserComputation object associated with the requested
@@ -662,8 +674,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
     // the program and the argument allocations.
     TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
                         CreateModuleConfig(*program_shape, arg_allocations,
-                                           request.execution_options(),
-                                           execute_backend_.get()));
+                                           request.execution_options()));
     VLOG(3) << "ExecuteParallel created HloModuleConfig computation layout: "
             << module_config->entry_computation_layout().ToString();
 
@@ -673,6 +684,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
     module_configs.push_back(std::move(module_config));
     computation_names.push_back(user_computation->name());
     executors.push_back(executor);
+    device_handles.push_back(arg->requests(i).device_handle());
   }
 
   // Build the user computations into HloModules and compile to generate the
@@ -692,7 +704,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
   TF_ASSIGN_OR_RETURN(
       std::vector<GlobalDataHandle> outputs,
       ExecuteParallelAndRegisterResult(executable_ptrs, all_arguments,
-                                       execute_backend_.get(), executors,
+                                       execute_backend_.get(), device_handles,
                                        computation_names));
   for (const GlobalDataHandle& output : outputs) {
     ExecuteResponse response;
@@ -706,10 +718,12 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
 
 tensorflow::Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
                                              GetDeviceHandlesResponse* result) {
-  const int64 available_device_count =
-      execute_backend_->stream_executors().size();
-  const int64 replicas = execute_backend_->Replicas().size();
-  if (available_device_count < arg->device_count() * replicas) {
+  const int64 available_device_count = execute_backend_->device_count();
+  const int64 replica_count = options_.number_of_replicas();
+  if (replica_count <= 0) {
+    return FailedPrecondition("Replica count must be a positive integer");
+  }
+  if (available_device_count < arg->device_count() * replica_count) {
     return ResourceExhausted(
         "Requested device count (%lld) exceeds the number of available devices "
         "on the target (%lld)",
@@ -718,8 +732,8 @@ tensorflow::Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
 
   for (int64 i = 0; i < arg->device_count(); ++i) {
     DeviceHandle device_handle;
-    device_handle.set_handle(
-        execute_backend_->stream_executors()[i * replicas]->device_ordinal());
+    device_handle.set_handle(i);
+    device_handle.set_device_count(arg->device_count());
     *result->add_device_handles() = device_handle;
   }
 
@@ -749,10 +763,9 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
       ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
                                   execute_backend_->default_device_ordinal()));
 
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<HloModuleConfig> module_config,
-      CreateModuleConfig(*program_shape, arg_allocations,
-                         arg->execution_options(), execute_backend_.get()));
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
+                      CreateModuleConfig(*program_shape, arg_allocations,
+                                         arg->execution_options()));
 
   VLOG(3) << "Execute created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
@@ -818,10 +831,9 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
       ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
                                   execute_backend_->default_device_ordinal()));
 
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<HloModuleConfig> module_config,
-      CreateModuleConfig(*program_shape, arg_allocations,
-                         arg->execution_options(), execute_backend_.get()));
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
+                      CreateModuleConfig(*program_shape, arg_allocations,
+                                         arg->execution_options()));
 
   VLOG(3) << "ExecuteAsync created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
@@ -841,11 +853,14 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
                               execute_backend_->default_stream_executor(),
                               &profile));
 
-  TF_RET_CHECK(!execute_backend_->Replicas().empty());
+  TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_,
+                                              SingleComputationDeviceHandle()));
+  TF_RET_CHECK(!replicas.empty());
+
   // Set up streams.
   std::vector<Pool<se::Stream>::SmartPtr> streams;
 
-  for (se::StreamExecutor* executor : execute_backend_->Replicas()) {
+  for (se::StreamExecutor* executor : replicas) {
     TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
                         execute_backend_->BorrowStream(executor));
     streams.push_back(std::move(stream));
@@ -927,19 +942,20 @@ tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
   Literal literal = Literal(arg->literal());
   const Shape& shape = literal.shape();
 
-  if (ShapeUtil::IsTuple(shape) && execute_backend_->Replicas().size() > 1) {
+  if (ShapeUtil::IsTuple(shape) && options_.number_of_replicas() > 1) {
     // TODO(b/32990684): Tuple transfers to host end up allocating further
     // buffers - implement that correctly.
     return Unimplemented(
         "Tuple transfers to the device not supported with replication.");
   }
 
-  se::StreamExecutor* stream_executor;
+  std::vector<se::StreamExecutor*> replicas;
   if (arg->has_device_handle()) {
-    TF_ASSIGN_OR_RETURN(stream_executor, execute_backend_->stream_executor(
-                                             arg->device_handle().handle()));
+    TF_ASSIGN_OR_RETURN(replicas,
+                        Replicas(*execute_backend_, arg->device_handle()));
   } else {
-    stream_executor = execute_backend_->default_stream_executor();
+    TF_ASSIGN_OR_RETURN(
+        replicas, Replicas(*execute_backend_, SingleComputationDeviceHandle()));
   }
 
   // Allocate memory on the device, using the stream executor. The size of the
@@ -950,14 +966,12 @@ tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
 
   TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase allocation,
                       execute_backend_->memory_allocator()->Allocate(
-                          stream_executor->device_ordinal(), allocation_size));
+                          replicas[0]->device_ordinal(), allocation_size));
 
   *result->mutable_data() = allocation_tracker_.Register(
-      execute_backend_.get(), stream_executor->device_ordinal(), allocation,
-      shape, StrCat("TransferToServer literal of size ", allocation_size));
+      execute_backend_.get(), replicas[0]->device_ordinal(), allocation, shape,
+      StrCat("TransferToServer literal of size ", allocation_size));
 
-  TF_ASSIGN_OR_RETURN(auto replicas, execute_backend_->Replicas(
-                                         stream_executor->device_ordinal()));
   for (se::StreamExecutor* executor : replicas) {
     TF_RETURN_IF_ERROR(
         execute_backend_->transfer_manager()->TransferLiteralToDevice(
@@ -968,7 +982,7 @@ tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
 
 tensorflow::Status Service::TransferToInfeed(const TransferToInfeedRequest* arg,
                                              TransferToInfeedResponse* result) {
-  const int64 replica_count = execute_backend_->Replicas().size();
+  const int64 replica_count = options_.number_of_replicas();
   if (arg->replica_id() < 0 || arg->replica_id() >= replica_count) {
     return FailedPrecondition(
         "%s",
@@ -980,11 +994,14 @@ tensorflow::Status Service::TransferToInfeed(const TransferToInfeedRequest* arg,
 
   se::StreamExecutor* executor;
   if (arg->has_device_handle()) {
-    TF_ASSIGN_OR_RETURN(auto replicas, execute_backend_->Replicas(
-                                           arg->device_handle().handle()));
+    TF_ASSIGN_OR_RETURN(auto replicas,
+                        Replicas(*execute_backend_, arg->device_handle()));
     executor = replicas[arg->replica_id()];
   } else {
-    executor = execute_backend_->Replicas()[arg->replica_id()];
+    TF_ASSIGN_OR_RETURN(
+        auto replicas,
+        Replicas(*execute_backend_, SingleComputationDeviceHandle()));
+    executor = replicas[arg->replica_id()];
   }
 
   return execute_backend_->transfer_manager()->TransferLiteralToInfeed(
@@ -994,7 +1011,7 @@ tensorflow::Status Service::TransferToInfeed(const TransferToInfeedRequest* arg,
 tensorflow::Status Service::TransferFromOutfeed(
     const TransferFromOutfeedRequest* arg,
     TransferFromOutfeedResponse* result) {
-  const int64 replica_count = execute_backend_->Replicas().size();
+  const int64 replica_count = options_.number_of_replicas();
   if (arg->replica_id() < 0 || arg->replica_id() >= replica_count) {
     return FailedPrecondition(
         "The replica_id=%lld on TransferFromOutfeedRequest not in range [0, "
@@ -1004,11 +1021,14 @@ tensorflow::Status Service::TransferFromOutfeed(
 
   se::StreamExecutor* executor;
   if (arg->has_device_handle()) {
-    TF_ASSIGN_OR_RETURN(auto replicas, execute_backend_->Replicas(
-                                           arg->device_handle().handle()));
+    TF_ASSIGN_OR_RETURN(auto replicas,
+                        Replicas(*execute_backend_, arg->device_handle()));
     executor = replicas[arg->replica_id()];
   } else {
-    executor = execute_backend_->Replicas()[arg->replica_id()];
+    TF_ASSIGN_OR_RETURN(
+        auto replicas,
+        Replicas(*execute_backend_, SingleComputationDeviceHandle()));
+    executor = replicas[arg->replica_id()];
   }
 
   Literal literal;
@@ -1085,8 +1105,7 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
   }
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(program_shape, {}, execution_options,
-                                         compute_constant_backend_.get()));
+                      CreateModuleConfig(program_shape, {}, execution_options));
 
   TF_ASSIGN_OR_RETURN(
       std::shared_ptr<Executable> executable,
@@ -1146,11 +1165,14 @@ tensorflow::Status Service::GetComputationStats(
   VersionedComputationHandle versioned_handle =
       user_computation->GetVersionedHandle();
 
+  HloModuleConfig config;
+  config.set_debug_options(arg->debug_options());
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModule> module,
-      computation_tracker_.BuildHloModule(versioned_handle, HloModuleConfig()));
+      computation_tracker_.BuildHloModule(versioned_handle, config));
 
-  MakeHloDumper()(*module, "computation statistics subject");
+  hlo_graph_dumper::MaybeDumpHloModule(*module,
+                                       "computation statistics subject");
 
   // Run HLO analysis to get the computation statistics.
   HloCostAnalysis analysis(
@@ -1166,17 +1188,6 @@ tensorflow::Status Service::GetComputationStats(
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status Service::CheckRunsInClientProcess(
-    const string& method_name) const {
-  if (runs_in_client_process_) {
-    return tensorflow::Status::OK();
-  } else {
-    return FailedPrecondition(
-        "%s only supported if service runs in the same process as the client",
-        method_name.c_str());
-  }
-}
-
 template <typename RequestT, typename ResponseT>
 tensorflow::Status Service::AddInstruction(
     const RequestT* arg, ResponseT* result,
@@ -1195,6 +1206,14 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
   StatusOr<ComputationDataHandle> handle_status;
 
   switch (arg->op_case()) {
+    case OpRequest::kBatchNormTrainingRequest:
+      handle_status = computation->AddBatchNormTrainingInstruction(
+          arg->batch_norm_training_request());
+      break;
+    case OpRequest::kBatchNormGradRequest:
+      handle_status = computation->AddBatchNormGradInstruction(
+          arg->batch_norm_grad_request());
+      break;
     case OpRequest::kBinaryOpRequest:
       handle_status =
           computation->AddBinaryInstruction(arg->binary_op_request());
@@ -1277,6 +1296,11 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
           computation->AddReduceInstruction(arg->reduce_request(), *to_apply);
       break;
     }
+    case OpRequest::kReducePrecisionRequest: {
+      handle_status = computation->AddReducePrecisionInstruction(
+          arg->reduce_precision_request());
+      break;
+    }
     case OpRequest::kReduceWindowRequest: {
       TF_ASSIGN_OR_RETURN(UserComputation * to_apply,
                           computation_tracker_.Resolve(
@@ -1383,4 +1407,28 @@ tensorflow::Status Service::LoadComputationSnapshot(
   return tensorflow::Status::OK();
 }
 
+DeviceHandle Service::SingleComputationDeviceHandle() const {
+  DeviceHandle device_handle;
+  device_handle.set_handle(0);
+  device_handle.set_device_count(1);
+  return device_handle;
+}
+
+StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Service::Replicas(
+    const Backend& backend, const DeviceHandle& device_handle) const {
+  std::vector<perftools::gputools::StreamExecutor*> replicas;
+  for (int replica = 0; replica < options_.number_of_replicas(); ++replica) {
+    // From the computation placer, find out the device ids of the replicas for
+    // the given device handle.
+    TF_ASSIGN_OR_RETURN(
+        int device_ordinal,
+        backend.computation_placer()->DeviceId(replica, device_handle.handle(),
+                                               options_.number_of_replicas(),
+                                               device_handle.device_count()));
+    TF_ASSIGN_OR_RETURN(auto executor, backend.stream_executor(device_ordinal));
+    replicas.push_back(executor);
+  }
+  return replicas;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index abd1281bdd0ab76297bc64493ec77bbc35fb552b..ccd699516e1b874546340e1650a31067aecb6886 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -22,12 +22,11 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/executable_run_options.h"
-#include "tensorflow/compiler/xla/legacy_flags/service_flags.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/allocation_tracker.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/channel_tracker.h"
 #include "tensorflow/compiler/xla/service/compilation_cache.h"
-#include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/computation_tracker.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
@@ -58,8 +57,7 @@ class ServiceOptions {
   perftools::gputools::Platform* platform() const;
 
   // Set the number of replicas to use when compiling replicated
-  // programs. The default is -1 meaning that the value is read from
-  // the xla_replicas flag.
+  // programs.
   ServiceOptions& set_number_of_replicas(int number_of_replicas);
   int number_of_replicas() const;
 
@@ -69,7 +67,7 @@ class ServiceOptions {
 
  private:
   perftools::gputools::Platform* platform_ = nullptr;
-  int number_of_replicas_ = -1;
+  int number_of_replicas_ = 1;
   int intra_op_parallelism_threads_ = -1;
 };
 
@@ -126,7 +124,7 @@ class Service : public ServiceInterface {
   // least N * R devices must be available. The devices are assigned based on
   // the device ordinals such that the first R available devices are assigned to
   // the first set of replicas, and the next R devices to the second set of
-  // replicas, etc. Each returned device handles represent the device with the
+  // replicas, etc. Each returned device handle represents the device with the
   // replica id 0.
   tensorflow::Status GetDeviceHandles(
       const GetDeviceHandlesRequest* arg,
@@ -248,7 +246,7 @@ class Service : public ServiceInterface {
 
   // The constructor is private. Use the NewService factory to create new
   // service objects.
-  Service(std::unique_ptr<Backend> backend,
+  Service(const ServiceOptions& options, std::unique_ptr<Backend> backend,
           std::unique_ptr<Backend> compute_constant_backend);
 
   static StatusOr<std::unique_ptr<Backend>> CreateComputeConstantBackend();
@@ -264,7 +262,7 @@ class Service : public ServiceInterface {
   StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
       const ProgramShape& program_shape,
       tensorflow::gtl::ArraySlice<const Allocation*> arguments,
-      const ExecutionOptions& execution_options, Backend* backend);
+      const ExecutionOptions& execution_options);
 
   // Builds an Executable for the given parameters. If
   // executable_for_compute_constant is true, then the executable is intended to
@@ -319,14 +317,9 @@ class Service : public ServiceInterface {
           std::vector<perftools::gputools::DeviceMemoryBase>>
           arguments,
       Backend* backend,
-      tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
-          executors,
+      tensorflow::gtl::ArraySlice<DeviceHandle> device_handles,
       tensorflow::gtl::ArraySlice<string> result_tags);
 
-  // Returns an HLO dumper for use in the compiler (it refers to flags
-  // associated with the service).
-  static Compiler::HloDumper MakeHloDumper();
-
   // Convenience function for adding a function to a user computation.
   template <typename RequestT, typename ResponseT>
   tensorflow::Status AddInstruction(
@@ -334,18 +327,24 @@ class Service : public ServiceInterface {
       const std::function<StatusOr<ComputationDataHandle>(UserComputation*)>&
           adder);
 
-  // If the service is running in the client process
-  // (runs_in_client_process_ is true) then return
-  // tensorflow::Status::OK. Otherwise return an appropriate error
-  // status with the given method name. Used for "InProcess" methods.
-  tensorflow::Status CheckRunsInClientProcess(const string& method_name) const;
-
   // Convenience function which checks whether the given shape_with_layout
   // (presumably passed by the client to set the result layout) is valid for the
   // given computation result shape.
   tensorflow::Status ValidateResultShapeWithLayout(
       const Shape& shape_with_layout, const Shape& result_shape) const;
 
+  // Returns the stream executors assigned to the replicas represented by the
+  // given device handle. Each device_handle is a virtual replicated device that
+  // represents a set of physical devices for the replicas.
+  StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Replicas(
+      const Backend& backend, const DeviceHandle& device_handle) const;
+
+  // Returns the device handle that represents the replicated device for a
+  // single computation that is not model-parallelized.
+  DeviceHandle SingleComputationDeviceHandle() const;
+
+  ServiceOptions options_;
+
   // Tracks computations built via the API.
   ComputationTracker computation_tracker_;
 
@@ -369,9 +368,6 @@ class Service : public ServiceInterface {
   // Backend to use when executing ComputeConstant.
   std::unique_ptr<Backend> compute_constant_backend_;
 
-  // Whether the service runs in the same process as the client.
-  bool runs_in_client_process_ = false;
-
   TF_DISALLOW_COPY_AND_ASSIGN(Service);
 };
 
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index d6436cf988db7632ecf89f1a1e274a0fbab00ce2..40206145c8987083e0b00ceb48ad7a6e7c6cd926 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -184,6 +184,8 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
   switch (operation) {
     case UNOP_FLOOR:
     case UNOP_CEIL:
+    case UNOP_COS:
+    case UNOP_SIN:
     case UNOP_EXP:
     case UNOP_LOG:
     case UNOP_TANH:
@@ -297,6 +299,30 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
   return ShapeUtil::ChangeElementType(operand_shape, new_element_type);
 }
 
+/* static */ StatusOr<Shape> ShapeInference::InferReducePrecisionShape(
+    const Shape& operand_shape, const int exponent_bits,
+    const int mantissa_bits) {
+  if (!ShapeUtil::ElementIsFloating(operand_shape)) {
+    return InvalidArgument(
+        "expected element type in shape to be floating point for "
+        "ReducePrecision operation; got %s",
+        PrimitiveType_Name(operand_shape.element_type()).c_str());
+  }
+  if (exponent_bits < 1) {
+    // One exponent bit is necessary to distinguish 0 from infinity.  Having
+    // no exponent bits doesn't produce a sensible number, so we require at
+    // least one.
+    return InvalidArgument("expected exponent_bits >= 1; got %d",
+                           exponent_bits);
+  }
+  if (mantissa_bits < 0) {
+    // A number with no mantissa bits is still meaningful, however.
+    return InvalidArgument("expected non-negative mantissa_bits; got %d",
+                           mantissa_bits);
+  }
+  return operand_shape;
+}
+
 /* static */ StatusOr<Shape> ShapeInference::InferPadShape(
     const Shape& operand_shape, const Shape& padding_value_shape,
     const PaddingConfig& padding_config) {
@@ -525,9 +551,11 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       ExpectNotTupleOrOpaque(rhs, "rhs of elementwise binary operation"));
 
   if (!ShapeUtil::SameElementType(lhs, rhs)) {
-    return InvalidArgument("binary op with different element types: %s and %s",
-                           ShapeUtil::HumanString(lhs).c_str(),
-                           ShapeUtil::HumanString(rhs).c_str());
+    return InvalidArgument(
+        "binary op %s with different element types: %s and %s",
+        BinaryOperation_Name(operation).c_str(),
+        ShapeUtil::HumanString(lhs).c_str(),
+        ShapeUtil::HumanString(rhs).c_str());
   }
 
   if (ShapeUtil::Rank(lhs) == ShapeUtil::Rank(rhs) &&
@@ -754,6 +782,263 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
                               AsInt64Slice(arg_shape->dimensions()));
 }
 
+/* static */ StatusOr<Shape> ShapeInference::InferBatchNormTrainingShape(
+    const Shape& operand_shape, const Shape& offset_shape,
+    const Shape& scale_shape, int64 feature_index) {
+  TF_RETURN_IF_ERROR(
+      ExpectNotTupleOrOpaque(operand_shape, "operand of batch norm training"));
+  TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(
+      offset_shape, "offset input of batch norm training"));
+  TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(
+      scale_shape, "scale input of batch norm training"));
+
+  TF_RET_CHECK(ShapeUtil::ValidateShape(operand_shape) ==
+               tensorflow::Status::OK());
+  TF_RET_CHECK(ShapeUtil::ValidateShape(offset_shape) ==
+               tensorflow::Status::OK());
+  TF_RET_CHECK(ShapeUtil::ValidateShape(scale_shape) ==
+               tensorflow::Status::OK());
+
+  if (feature_index >= ShapeUtil::Rank(operand_shape)) {
+    return InvalidArgument(
+        "Expected feature_index of batch-norm-training to be "
+        "smaller than the rank of operand_shape; "
+        "got feature_index %lld, and rank %lld",
+        feature_index, ShapeUtil::Rank(operand_shape));
+  }
+
+  if (feature_index < 0) {
+    return InvalidArgument(
+        "Expected feature_index of batch-norm-training to "
+        "be a non-negative number, got %lld",
+        feature_index);
+  }
+
+  if (ShapeUtil::Rank(operand_shape) < 1) {
+    return InvalidArgument(
+        "Expected the rank of operand to "
+        "batch-norm-training to be at least 1; got %lld",
+        ShapeUtil::Rank(operand_shape));
+  }
+
+  if (ShapeUtil::Rank(offset_shape) != 1) {
+    return InvalidArgument(
+        "Offset input of batch-norm-training must have"
+        " rank 1, but has rank %lld.",
+        ShapeUtil::Rank(offset_shape));
+  }
+
+  if (ShapeUtil::Rank(scale_shape) != 1) {
+    return InvalidArgument(
+        "Scale input of batch-norm-training must have"
+        " rank 1, but has rank %lld.",
+        ShapeUtil::Rank(scale_shape));
+  }
+
+  if (!ShapeUtil::ElementIsFloating(operand_shape)) {
+    return InvalidArgument(
+        "The operand to batch-norm-training must have a floating point "
+        "element type, but the shape is %s",
+        PrimitiveType_Name(operand_shape.element_type()).c_str());
+  }
+
+  if (!ShapeUtil::SameElementType(offset_shape, operand_shape)) {
+    return InvalidArgument(
+        "The inputs should have the same element type for batch-norm-training, "
+        "but the shape of offset factor is %s "
+        "and the shape of operand is %s",
+        PrimitiveType_Name(offset_shape.element_type()).c_str(),
+        PrimitiveType_Name(operand_shape.element_type()).c_str());
+  }
+
+  if (!ShapeUtil::SameElementType(scale_shape, operand_shape)) {
+    return InvalidArgument(
+        "The inputs should have the same element type for batch-norm-training, "
+        "but the shape of scale factor is %s "
+        "and the shape of operand is %s",
+        PrimitiveType_Name(scale_shape.element_type()).c_str(),
+        PrimitiveType_Name(operand_shape.element_type()).c_str());
+  }
+
+  const int64 feature_count = operand_shape.dimensions(feature_index);
+  Shape output_shape_for_mean_and_var =
+      ShapeUtil::MakeShape(operand_shape.element_type(), {feature_count});
+
+  if (ShapeUtil::GetDimension(offset_shape, 0) != feature_count) {
+    return InvalidArgument(
+        "The size of offset factor should be the same as feature count,"
+        "but the size of offset factor is %lld "
+        "and the feature count is %lld",
+        ShapeUtil::GetDimension(offset_shape, 0), feature_count);
+  }
+
+  if (ShapeUtil::GetDimension(scale_shape, 0) != feature_count) {
+    return InvalidArgument(
+        "The size of scale factor should be the same as feature count,"
+        "but the size of scale factor is %lld "
+        "and the feature count is %lld",
+        ShapeUtil::GetDimension(scale_shape, 0), feature_count);
+  }
+
+  return ShapeUtil::MakeTupleShape({operand_shape,
+                                    output_shape_for_mean_and_var,
+                                    output_shape_for_mean_and_var});
+}
+
+/* static */ StatusOr<Shape> ShapeInference::InferBatchNormGradShape(
+    const Shape& operand_shape, const Shape& scale_shape,
+    const Shape& mean_shape, const Shape& var_shape,
+    const Shape& output_grad_shape, int64 feature_index) {
+  TF_RETURN_IF_ERROR(
+      ExpectNotTupleOrOpaque(operand_shape, "operand of batch norm grad"));
+  TF_RETURN_IF_ERROR(
+      ExpectNotTupleOrOpaque(scale_shape, "scale input of batch norm grad"));
+  TF_RETURN_IF_ERROR(
+      ExpectNotTupleOrOpaque(mean_shape, "mean input of batch norm grad"));
+  TF_RETURN_IF_ERROR(
+      ExpectNotTupleOrOpaque(var_shape, "var input of batch norm grad"));
+  TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(
+      output_grad_shape, "output_grad input of batch norm grad"));
+
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(operand_shape));
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(mean_shape));
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(scale_shape));
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(var_shape));
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(output_grad_shape));
+
+  if (feature_index >= ShapeUtil::Rank(operand_shape)) {
+    return InvalidArgument(
+        "Expected feature_index of batch-norm-grad to be "
+        "smaller than the rank of operand_shape; "
+        "got feature_index %lld, and rank %lld",
+        feature_index, ShapeUtil::Rank(operand_shape));
+  }
+
+  if (ShapeUtil::Rank(operand_shape) != ShapeUtil::Rank(output_grad_shape)) {
+    return InvalidArgument(
+        "Expected operand_shape of batch-norm-grad to have the same rank as"
+        " output_grad_shape; got rank(oprand_shape) %lld, and"
+        " rank(output_grad_shape) %lld",
+        ShapeUtil::Rank(operand_shape), ShapeUtil::Rank(output_grad_shape));
+  }
+
+  if (ShapeUtil::Rank(mean_shape) != 1) {
+    return InvalidArgument(
+        "Mean input of batch-norm-grad must have"
+        " rank 1, but has rank %lld.",
+        ShapeUtil::Rank(mean_shape));
+  }
+
+  if (ShapeUtil::Rank(scale_shape) != 1) {
+    return InvalidArgument(
+        "Scale input of batch-norm-grad must have"
+        " rank 1, but has rank %lld.",
+        ShapeUtil::Rank(scale_shape));
+  }
+
+  if (ShapeUtil::Rank(var_shape) != 1) {
+    return InvalidArgument(
+        "Var input of batch-norm-grad must have"
+        " rank 1, but has rank %lld.",
+        ShapeUtil::Rank(var_shape));
+  }
+
+  if (!ShapeUtil::ElementIsFloating(operand_shape)) {
+    return InvalidArgument(
+        "The operand to batch-norm-grad must have a floating point "
+        "element type, but the shape is %s",
+        PrimitiveType_Name(operand_shape.element_type()).c_str());
+  }
+
+  if (!ShapeUtil::ElementIsFloating(output_grad_shape)) {
+    return InvalidArgument(
+        "The output_grad to batch-norm-grad must have a floating point "
+        "element type, but the shape is %s",
+        PrimitiveType_Name(output_grad_shape.element_type()).c_str());
+  }
+
+  if (!ShapeUtil::SameElementType(output_grad_shape, operand_shape)) {
+    return InvalidArgument(
+        "The inputs should have the same element type for batch-norm-grad, "
+        "but the element type of output_grad is %s "
+        "and the element type of operand is %s",
+        PrimitiveType_Name(output_grad_shape.element_type()).c_str(),
+        PrimitiveType_Name(operand_shape.element_type()).c_str());
+  }
+
+  if (!ShapeUtil::SameElementType(scale_shape, operand_shape)) {
+    return InvalidArgument(
+        "The inputs should have the same element type for batch-norm-grad, "
+        "but the element type of scale factor is %s "
+        "and the element type of operand is %s",
+        PrimitiveType_Name(scale_shape.element_type()).c_str(),
+        PrimitiveType_Name(operand_shape.element_type()).c_str());
+  }
+
+  if (!ShapeUtil::SameElementType(mean_shape, operand_shape)) {
+    return InvalidArgument(
+        "The inputs should have the same element type for batch-norm-grad, "
+        "but the element type of mean is %s "
+        "and the element type of operand is %s",
+        PrimitiveType_Name(mean_shape.element_type()).c_str(),
+        PrimitiveType_Name(operand_shape.element_type()).c_str());
+  }
+
+  if (!ShapeUtil::SameElementType(var_shape, operand_shape)) {
+    return InvalidArgument(
+        "The inputs should have the same element type for batch-norm-grad, "
+        "but the element type of mean is %s "
+        "and the element type of operand is %s",
+        PrimitiveType_Name(mean_shape.element_type()).c_str(),
+        PrimitiveType_Name(operand_shape.element_type()).c_str());
+  }
+
+  const int64 feature_count = operand_shape.dimensions(feature_index);
+
+  Shape feature_shape =
+      ShapeUtil::MakeShape(operand_shape.element_type(), {feature_count});
+
+  if (ShapeUtil::GetDimension(mean_shape, 0) != feature_count) {
+    return InvalidArgument(
+        "The size of mean should be the same as feature count,"
+        "but the size of offset factor is %lld "
+        "and the feature count is %lld",
+        ShapeUtil::GetDimension(mean_shape, 0), feature_count);
+  }
+
+  if (ShapeUtil::GetDimension(scale_shape, 0) != feature_count) {
+    return InvalidArgument(
+        "The size of scale factor should be the same as feature count,"
+        "but the size of scale factor is %lld "
+        "and the feature count is %lld",
+        ShapeUtil::GetDimension(scale_shape, 0), feature_count);
+  }
+
+  if (ShapeUtil::GetDimension(var_shape, 0) != feature_count) {
+    return InvalidArgument(
+        "The size of variance should be the same as feature count,"
+        "but the size of variance is %lld "
+        "and the feature count is %lld",
+        ShapeUtil::GetDimension(var_shape, 0), feature_count);
+  }
+
+  // Verify operand_shape and output_grad_shape have same bounds.
+  for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) {
+    if (ShapeUtil::GetDimension(operand_shape, i) !=
+        ShapeUtil::GetDimension(output_grad_shape, i)) {
+      return InvalidArgument(
+          "The bounds of operand shape should be the same as output_grad's,"
+          "but the bound of operand_shape at dimension %lld is %lld "
+          "and the bound of output_grad_shape is %lld",
+          i, ShapeUtil::GetDimension(operand_shape, i),
+          ShapeUtil::GetDimension(output_grad_shape, i));
+    }
+  }
+
+  return ShapeUtil::MakeTupleShape(
+      {operand_shape, feature_shape, feature_shape});
+}
+
 /* static */ StatusOr<Shape> ShapeInference::InferConvolveShape(
     const Shape& lhs, const Shape& rhs, const Window& window,
     const ConvolutionDimensionNumbers& dnums) {
@@ -1019,6 +1304,11 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
                            starts.size(), limits.size());
   }
 
+  if (starts.size() != strides.size()) {
+    return InvalidArgument("slice start and strides sizes differ: %zu vs %zu",
+                           starts.size(), strides.size());
+  }
+
   if (starts.size() != ShapeUtil::Rank(arg)) {
     return InvalidArgument(
         "slice index count does not match argument rank: %zu vs %lld",
@@ -1034,9 +1324,6 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       return InvalidArgument("negative start index to slice: %lld",
                              start_index);
     }
-    if (stride == 0) {
-      return InvalidArgument("Zero stride");
-    }
     if (limit_index > arg.dimensions(dimension)) {
       return InvalidArgument(
           "limit index (%lld) must be less than or equal to dimension "
@@ -1047,17 +1334,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
                                            start_index);
     VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension,
                                            limit_index);
-    if (stride > 0) {
-      if (start_index > limit_index) {
-        return InvalidArgument(
-            "limit index (%lld) must be greater or equal to "
-            "start index (%lld) in slice with positive stride",
-            limit_index, start_index);
-      }
-      sizes.push_back((limit_index - start_index + stride - 1) / stride);
-    } else {
-      return InvalidArgument("Negative strides not supported");
+    if (start_index > limit_index) {
+      return InvalidArgument(
+          "limit index (%lld) must be greater or equal to "
+          "start index (%lld) in slice with positive stride",
+          limit_index, start_index);
     }
+    if (stride <= 0) {
+      return InvalidArgument("stride (%lld) must be positive", stride);
+    }
+    sizes.push_back((limit_index - start_index + stride - 1) / stride);
   }
 
   return ShapeUtil::MakeShape(arg.element_type(), sizes);
@@ -1394,10 +1680,17 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     const ProgramShape& to_apply) {
   // The applied function's arity equals the number of arguments.
   if (arg_shapes.size() != to_apply.parameters_size()) {
+    string computation_signature = ShapeUtil::HumanString(to_apply);
+    string argument_shapes = tensorflow::str_util::Join(
+        arg_shapes, ", ", [](string* out, const Shape* shape) {
+          tensorflow::strings::StrAppend(out, ShapeUtil::HumanString(*shape));
+        });
     return InvalidArgument(
         "Call applied function arity must match number of arguments; got: "
-        "arity: %d, arguments: %zu",
-        to_apply.parameters_size(), arg_shapes.size());
+        "arity: %d, arguments: %zu; computation signature: %s; argument "
+        "shapes: [%s]",
+        to_apply.parameters_size(), arg_shapes.size(),
+        computation_signature.c_str(), argument_shapes.c_str());
   }
 
   // All arguments must be compatible with the program shape.
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index 0d270f99794bd7a17a1df555b9b666a50d4b7e17..f3f0176a434e350cd2be9d3b8c1fe0aa72972433 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -64,6 +64,21 @@ class ShapeInference {
       tensorflow::gtl::ArraySlice<const Shape*> arg_shapes,
       const ProgramShape& to_apply);
 
+  // Infers the shape produced by InferBatchNormTraining with the given
+  // operands.
+  static StatusOr<Shape> InferBatchNormTrainingShape(const Shape& operand_shape,
+                                                     const Shape& offset_shape,
+                                                     const Shape& scale_shape,
+                                                     int64 feature_index);
+
+  // Infers the shape produced by InferBatchNormGrad with the given operands.
+  static StatusOr<Shape> InferBatchNormGradShape(const Shape& operand_shape,
+                                                 const Shape& scale_shape,
+                                                 const Shape& mean_shape,
+                                                 const Shape& var_shape,
+                                                 const Shape& output_grad_shape,
+                                                 int64 feature_index);
+
   // Infers the shape produced by applying the given convolutional
   // filter (rhs) to lhs in the way specified by the fields on window.
   static StatusOr<Shape> InferConvolveShape(
@@ -165,6 +180,12 @@ class ShapeInference {
   static StatusOr<Shape> InferConvertShape(const Shape& operand_shape,
                                            PrimitiveType new_element_type);
 
+  // Helper that validates the input data type for a reduce-precision operation,
+  // and returns the result shape.
+  static StatusOr<Shape> InferReducePrecisionShape(const Shape& operand_shape,
+                                                   const int exponent_bits,
+                                                   const int mantissa_bits);
+
   // Helper that infers the shape produced by a pad operation based on the
   // padding configuration.
   static StatusOr<Shape> InferPadShape(const Shape& operand_shape,
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index 15f6b7bfb4a7f507272471c406bd2ade3ab27b20..c79ffa9cd73950b1653f72b1c6286346f76c10fb 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -65,6 +65,17 @@ class TransferManager {
       perftools::gputools::StreamExecutor* executor,
       const Literal& literal) = 0;
 
+  // Transfer a memory block of the given size from 'source' buffer to the
+  // Infeed interface of the device using the given executor.
+  //
+  // size is the size to transfer from source in bytes.
+  //
+  // source is the source data that must be in the target-dependent layout that
+  // the Infeed HLO used in the computation expects.
+  virtual Status TransferBufferToInfeed(
+      perftools::gputools::StreamExecutor* executor, int64 size,
+      const void* source) = 0;
+
   // Transfers the given literal from the Outfeed interface of the device,
   // using the given executor.
   virtual Status TransferLiteralFromOutfeed(
diff --git a/tensorflow/compiler/xla/service/transfer_manager_test.cc b/tensorflow/compiler/xla/service/transfer_manager_test.cc
index ca38601d919adfdfd637dab44796ffa4969cc8f2..29ecef9510cfe6b8764c2e5fe1216255ca1dc983 100644
--- a/tensorflow/compiler/xla/service/transfer_manager_test.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager_test.cc
@@ -55,7 +55,7 @@ class CpuTransferManagerTest : public ::testing::Test {
 TEST_F(CpuTransferManagerTest, TransferR0U32ToDevice) {
   std::vector<uint8> storage(sizeof(uint32), '\x00');
   se::DeviceMemoryBase memptr(storage.data(), storage.size());
-  std::unique_ptr<Literal> literal = LiteralUtil::CreateR0<uint32>(42);
+  std::unique_ptr<Literal> literal = Literal::CreateR0<uint32>(42);
   TF_CHECK_OK(transfer_manager_.TransferLiteralToDevice(stream_exec_, *literal,
                                                         &memptr));
 
@@ -66,7 +66,7 @@ TEST_F(CpuTransferManagerTest, TransferR1F32ToDevice) {
   std::vector<uint8> storage(4 * sizeof(float), '\x00');
   se::DeviceMemoryBase memptr(storage.data(), storage.size());
   std::unique_ptr<Literal> literal =
-      LiteralUtil::CreateR1<float>({1.25f, 2.5f, -17.0f, -20.125f});
+      Literal::CreateR1<float>({1.25f, 2.5f, -17.0f, -20.125f});
   TF_CHECK_OK(transfer_manager_.TransferLiteralToDevice(stream_exec_, *literal,
                                                         &memptr));
 
@@ -80,7 +80,7 @@ TEST_F(CpuTransferManagerTest, TransferR1U8ToDevice) {
   std::vector<uint8> storage(16, '\x00');
   se::DeviceMemoryBase memptr(storage.data(), storage.size());
   const char* str = "0123456789abcdef";
-  std::unique_ptr<Literal> literal = LiteralUtil::CreateR1U8(str);
+  std::unique_ptr<Literal> literal = Literal::CreateR1U8(str);
   TF_CHECK_OK(transfer_manager_.TransferLiteralToDevice(stream_exec_, *literal,
                                                         &memptr));
 
diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc
index a0c88c6bbc23972bb6a0f3729e51ee0eaee72bc7..585833573606058514d20fa396b433497ec65bd6 100644
--- a/tensorflow/compiler/xla/service/transpose_folding.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding.cc
@@ -172,7 +172,14 @@ StatusOr<bool> TransposeFolding::Run(HloModule* module) {
     return tensorflow::Status::OK();
   };
 
-  for (auto& comp : module->computations()) {
+  std::vector<HloComputation*> computations;
+  for (auto& computation : module->computations()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
+    computations.push_back(computation.get());
+  }
+  for (auto& comp : computations) {
     TF_RETURN_IF_ERROR(comp->Accept(visit_fn));
   }
 
diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc
index c72d127ea86e4e9daf99dff4335c538c081f0605..9520c42d280968e3f21a110089583c94277ef1a6 100644
--- a/tensorflow/compiler/xla/service/transpose_folding_test.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc
@@ -92,11 +92,11 @@ TEST_F(TransposeFoldingTest, FoldDotTransposeConstant) {
   auto builder = HloComputation::Builder("entry_computation");
   // 2x1
   HloInstruction* const0 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR2<float>({{1}, {2}})));
+      HloInstruction::CreateConstant(Literal::CreateR2<float>({{1}, {2}})));
   // 3x2
   HloInstruction* const1 =
       builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR2<float>({{1, 2}, {3, 4}, {5, 6}})));
+          Literal::CreateR2<float>({{1, 2}, {3, 4}, {5, 6}})));
   HloInstruction* transpose0 =
       builder.AddInstruction(HloInstruction::CreateTranspose(
           ShapeUtil::MakeShape(F32, {1, 2}), const0, {1, 0}));
@@ -130,11 +130,11 @@ TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) {
   auto builder = HloComputation::Builder("entry");
   // (1.0 + 2.0) * (2.0 - 3.0)
   HloInstruction* const1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   HloInstruction* const2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   HloInstruction* const3 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(3.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(3.0)));
   HloInstruction* add = builder.AddInstruction(HloInstruction::CreateBinary(
       const1->shape(), HloOpcode::kAdd, const1, const2));
   HloInstruction* sub = builder.AddInstruction(HloInstruction::CreateBinary(
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index ad6f015c70e7241af815246b732fa02768cf0a10..3c4dc19aefa9cb80a25abd916f417e0535ab5171 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -33,9 +33,9 @@ limitations under the License.
 namespace xla {
 
 string BufferAlias::ToString() const {
-  return tensorflow::strings::StrCat(
-      "BufferAlias(", instruction_->FullyQualifiedName(), "[",
-      tensorflow::str_util::Join(index_, ","), "])");
+  return tensorflow::strings::StrCat("BufferAlias(", instruction_->name(), "[",
+                                     tensorflow::str_util::Join(index_, ","),
+                                     "])");
 }
 
 std::ostream& operator<<(std::ostream& out, const BufferAlias& buffer_alias) {
@@ -125,21 +125,19 @@ void PointsToSet::add_tuple_source(const ShapeIndex& index,
 }
 
 /* static */ StatusOr<std::unique_ptr<TuplePointsToAnalysis>>
-TuplePointsToAnalysis::Run(const HloModule* module, Colorer colorer) {
+TuplePointsToAnalysis::Run(const HloModule* module) {
   std::unique_ptr<TuplePointsToAnalysis> analysis(
-      new TuplePointsToAnalysis(module, std::move(colorer)));
+      new TuplePointsToAnalysis(module));
   TF_RETURN_IF_ERROR(analysis->Analyze());
   return std::move(analysis);
 }
 
-/* static */ StatusOr<std::unique_ptr<TuplePointsToAnalysis>>
-TuplePointsToAnalysis::Run(const HloModule* module) {
-  return Run(module, DefaultColorer());
-}
-
 Status TuplePointsToAnalysis::Analyze() {
   points_to_.clear();
   for (auto& computation : module_->computations()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
     TF_RETURN_IF_ERROR(computation->Accept(this));
     TF_RETURN_IF_ERROR(
         PopulateDefinedBuffersAndAliases(computation->instructions()));
@@ -171,9 +169,6 @@ Status TuplePointsToAnalysis::PopulateDefinedBuffersAndAliases(
             const ShapeIndex& index,
             const std::vector<const LogicalBuffer*>& pointed_to_buffers) {
           for (const LogicalBuffer* buffer : pointed_to_buffers) {
-            if (buffer_aliases_.count(buffer) == 0) {
-              buffer_aliases_.insert({buffer, std::vector<BufferAlias>()});
-            }
             buffer_aliases_[buffer].emplace_back(instruction.get(), index);
           }
         });
@@ -184,8 +179,8 @@ Status TuplePointsToAnalysis::PopulateDefinedBuffersAndAliases(
 const LogicalBuffer& TuplePointsToAnalysis::NewLogicalBuffer(
     HloInstruction* instruction, const ShapeIndex& index) {
   CHECK_EQ(logical_buffers_.size(), next_buffer_id_);
-  logical_buffers_.push_back(MakeUnique<LogicalBuffer>(
-      instruction, index, next_buffer_id_, colorer_(instruction, index)));
+  logical_buffers_.push_back(
+      MakeUnique<LogicalBuffer>(instruction, index, next_buffer_id_));
   ++next_buffer_id_;
   return *logical_buffers_.back();
 }
@@ -243,12 +238,11 @@ Status TuplePointsToAnalysis::HandleGetTupleElement(
   return Status::OK();
 }
 
-Status TuplePointsToAnalysis::HandleCopy(HloInstruction* copy,
-                                         HloInstruction* operand) {
+Status TuplePointsToAnalysis::HandleCopy(HloInstruction* copy) {
   // A kCopy instruction performs a shallow copy of the operand. The top-level
   // buffer (index={}) is newly created, but all other buffers (in the case of a
   // tuple shape) come from the operand
-  PointsToSet& points_to_set = CreateCopiedPointsToSet(copy, operand);
+  PointsToSet& points_to_set = CreateCopiedPointsToSet(copy, copy->operand(0));
   points_to_set.mutable_element(/*index=*/{})->clear();
   points_to_set.AddPointedToBuffer(NewLogicalBuffer(copy, /*index=*/{}),
                                    /*index=*/{});
@@ -343,9 +337,11 @@ const PointsToSet& TuplePointsToAnalysis::GetPointsToSet(
 
 PointsToSet& TuplePointsToAnalysis::CreateEmptyPointsToSet(
     const HloInstruction* instruction) {
-  CHECK_EQ(0, points_to_.count(instruction));
-  points_to_[instruction] = MakeUnique<PointsToSet>(instruction->shape());
-  return *FindOrDie(points_to_, instruction);
+  auto set = MakeUnique<PointsToSet>(&instruction->shape());
+  auto res = points_to_.emplace(instruction, std::move(set));
+  CHECK(res.second) << "instruction should not have been present in the map.";
+  // Return *set using the iterator returned by emplace.
+  return *res.first->second;
 }
 
 bool TuplePointsToAnalysis::InstructionDefinesBufferAtIndex(
@@ -458,6 +454,9 @@ string TuplePointsToAnalysis::ToString() const {
   string output = tensorflow::strings::Printf(
       "TuplePointsToSet for module %s:\n", module_->name().c_str());
   for (const auto& computation : module_->computations()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
     const char* entry =
         computation.get() == module_->entry_computation() ? "entry " : "";
     tensorflow::strings::StrAppend(&output, entry, "computation ",
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
index 4d7fc7cbc9e5ba2ac87dc6fd10691ce308b827f6..099713d671dec21019d9fb3af767b81603570999 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
@@ -48,7 +48,10 @@ namespace xla {
 // the corresponding buffer.
 class PointsToSet : public ShapeTree<std::vector<const LogicalBuffer*>> {
  public:
-  explicit PointsToSet(const Shape& shape)
+  // Construct our ShapeTree with a pointer rather than a reference to a Shape
+  // because this is very hot code, and copying (and then destroying) all these
+  // Shapes is slow.
+  explicit PointsToSet(const Shape* shape)
       : ShapeTree<std::vector<const LogicalBuffer*>>(shape),
         tuple_sources_(shape) {}
 
@@ -142,15 +145,7 @@ std::ostream& operator<<(std::ostream& out, const BufferAlias& buffer_alias);
 // the potential sources of each buffer in each instruction's output.
 class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
  public:
-  using Colorer = std::function<LogicalBuffer::Color(
-      const HloInstruction* instruction, const ShapeIndex& index)>;
-
-  // Runs points-to analysis on 'module' with the provided buffer color
-  // assigner.
-  static StatusOr<std::unique_ptr<TuplePointsToAnalysis>> Run(
-      const HloModule* module, Colorer colorer);
-
-  // Runs points-to analysis on 'module' with the default color assigner.
+  // Runs points-to analysis on 'module'.
   static StatusOr<std::unique_ptr<TuplePointsToAnalysis>> Run(
       const HloModule* module);
 
@@ -208,23 +203,15 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   Status HandleGetTupleElement(HloInstruction* get_tuple_element,
                                HloInstruction* operand) override;
   Status HandleBitcast(HloInstruction* bitcast) override;
-  Status HandleCopy(HloInstruction* copy, HloInstruction* operand) override;
+  Status HandleCopy(HloInstruction* copy) override;
   Status HandleSelect(HloInstruction* select, HloInstruction* pred,
                       HloInstruction* on_true,
                       HloInstruction* on_false) override;
 
   string ToString() const;
 
-  static Colorer DefaultColorer() {
-    return [](const HloInstruction* instruction, const ShapeIndex& index) {
-      return LogicalBuffer::Color(0);
-    };
-  }
-
  private:
-  explicit TuplePointsToAnalysis(const HloModule* module,
-                                 Colorer colorer = DefaultColorer())
-      : module_(module), colorer_(colorer) {}
+  explicit TuplePointsToAnalysis(const HloModule* module) : module_(module) {}
 
   // Perform the analysis. Should be called immediately after constructing the
   // object and before calling GetPointsToSet.
@@ -283,9 +270,6 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   // The ID of the next logical buffer created.
   LogicalBuffer::Id next_buffer_id_ = 0;
 
-  // Used to color the created logical buffers.
-  Colorer colorer_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(TuplePointsToAnalysis);
 };
 
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index 9909c11929d4b2ecf632ab644981a039446bdfc8..cd79e63cafcfecce71cf3380aba9e409da0e72c8 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -124,9 +124,9 @@ class TuplePointsToAnalysisTest : public HloTestBase {
 TEST_F(TuplePointsToAnalysisTest, SimpleTuple) {
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
 
@@ -177,14 +177,14 @@ TEST_F(TuplePointsToAnalysisTest, NestedTuple) {
   // tuple.
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto inner_tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
 
   auto constant3 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(3.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(3.0)));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({inner_tuple, constant3}));
 
@@ -238,14 +238,14 @@ TEST_F(TuplePointsToAnalysisTest, GetTupleElement) {
   // tuple.
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto inner_tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
 
   auto constant3 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(3.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(3.0)));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({inner_tuple, constant3}));
 
@@ -270,7 +270,7 @@ TEST_F(TuplePointsToAnalysisTest, DuplicatedElement) {
   // Create a tuple which contains duplicate elements.
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant, constant, constant}));
 
@@ -291,9 +291,9 @@ TEST_F(TuplePointsToAnalysisTest, TupleCopy) {
   // the same.
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
   auto copy = builder.AddInstruction(
@@ -318,16 +318,16 @@ TEST_F(TuplePointsToAnalysisTest, TupleSelect) {
   // set containing the union of both sides.
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto tuple1 = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
   auto tuple2 = builder.AddInstruction(
       HloInstruction::CreateTuple({constant2, constant2}));
 
   auto pred = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   auto select = builder.AddInstruction(HloInstruction::CreateTernary(
       tuple1->shape(), HloOpcode::kSelect, pred, tuple1, tuple2));
 
@@ -356,7 +356,7 @@ TEST_F(TuplePointsToAnalysisTest, SelectTupleParameters) {
   auto param1 = builder.AddInstruction(
       HloInstruction::CreateParameter(1, tuple_shape, "param1"));
   auto pred = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   auto select = builder.AddInstruction(HloInstruction::CreateTernary(
       tuple_shape, HloOpcode::kSelect, pred, param0, param1));
   auto copy = builder.AddInstruction(
@@ -396,16 +396,16 @@ TEST_F(TuplePointsToAnalysisTest, UnambiguousTupleSelect) {
   // Select from two identical tuples. The result should not be ambiguous.
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto tuple1 = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
   auto tuple2 = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
 
   auto pred = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   auto select = builder.AddInstruction(HloInstruction::CreateTernary(
       tuple1->shape(), HloOpcode::kSelect, pred, tuple1, tuple2));
 
@@ -427,9 +427,9 @@ TEST_F(TuplePointsToAnalysisTest, NestedTupleSelect) {
   // the right values.
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto inner_tuple1 = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
   auto inner_tuple2 = builder.AddInstruction(
@@ -441,7 +441,7 @@ TEST_F(TuplePointsToAnalysisTest, NestedTupleSelect) {
       builder.AddInstruction(HloInstruction::CreateTuple({inner_tuple2}));
 
   auto pred = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
   auto select = builder.AddInstruction(HloInstruction::CreateTernary(
       tuple1->shape(), HloOpcode::kSelect, pred, tuple1, tuple2));
 
@@ -474,9 +474,9 @@ TEST_F(TuplePointsToAnalysisTest, TupleWithBitcast) {
   // have the operand of the bitcast in its points-to set.
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto bitcast = builder.AddInstruction(HloInstruction::CreateUnary(
       constant2->shape(), HloOpcode::kBitcast, constant2));
   auto tuple =
@@ -510,10 +510,9 @@ TEST_F(TuplePointsToAnalysisTest, PointsToTupleConstantElements) {
   // Construct a tuple constant and kCopy it. Verify the points-to set of the
   // copy correctly correctly points into the nested elements of the constant.
   auto builder = HloComputation::Builder(TestName());
-  auto tuple_constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::MakeTuple(
-          {LiteralUtil::CreateR2<float>({{1.0}, {2.0}}).get(),
-           LiteralUtil::CreateR1<float>({2.0, 42}).get()})));
+  auto tuple_constant = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::MakeTuple({Literal::CreateR2<float>({{1.0}, {2.0}}).get(),
+                          Literal::CreateR1<float>({2.0, 42}).get()})));
   auto copy = builder.AddInstruction(HloInstruction::CreateUnary(
       tuple_constant->shape(), HloOpcode::kCopy, tuple_constant));
 
@@ -533,9 +532,9 @@ TEST_F(TuplePointsToAnalysisTest, BufferAliases) {
   // times. Verify buffer alias sets.
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
   auto inner_tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
   auto tuple = builder.AddInstruction(
@@ -574,7 +573,7 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest {
     auto tuple_element1 = builder.AddInstruction(
         HloInstruction::CreateGetTupleElement(update_shape, tuple_param0, 1));
     auto ones = builder.AddInstruction(HloInstruction::CreateConstant(
-        LiteralUtil::CreateR1<float>({1.f, 1.f, 1.f, 1.f})));
+        Literal::CreateR1<float>({1.f, 1.f, 1.f, 1.f})));
     // Create 'update' = Add(GetTupleElement(tuple_param0, 1), ones)
     auto update = builder.AddInstruction(HloInstruction::CreateBinary(
         update_shape, HloOpcode::kAdd, tuple_element1, ones));
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index 4aba8875161c9a2d12668d57ea55ded066d38da0..3ab780e7d0b5f5c0af482d5d452d9a97641e1b54 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/user_computation_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -49,6 +48,8 @@ HloOpcode UnaryOperationToHloOpcode(UnaryOperation unop) {
       return HloOpcode::kAbs;
     case UNOP_CEIL:
       return HloOpcode::kCeil;
+    case UNOP_COS:
+      return HloOpcode::kCos;
     case UNOP_EXP:
       return HloOpcode::kExp;
     case UNOP_FLOOR:
@@ -63,6 +64,8 @@ HloOpcode UnaryOperationToHloOpcode(UnaryOperation unop) {
       return HloOpcode::kNegate;
     case UNOP_SIGN:
       return HloOpcode::kSign;
+    case UNOP_SIN:
+      return HloOpcode::kSin;
     case UNOP_SORT:
       return HloOpcode::kSort;
     case UNOP_TANH:
@@ -465,6 +468,90 @@ StatusOr<ComputationDataHandle> UserComputation::AddReduceInstruction(
   return handle;
 }
 
+StatusOr<ComputationDataHandle>
+UserComputation::AddBatchNormTrainingInstruction(
+    const BatchNormTrainingRequest& batch_norm_training_request) {
+  tensorflow::mutex_lock lock(mutex_);
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
+                      LookUpRequest(batch_norm_training_request.operand()));
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* scale,
+                      LookUpRequest(batch_norm_training_request.scale()));
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* offset,
+                      LookUpRequest(batch_norm_training_request.offset()));
+
+  ComputationDataHandle handle = CreateComputationDataHandle();
+
+  OperationRequest& request =
+      (*session_computation_.mutable_requests())[handle.handle()];
+
+  TF_ASSIGN_OR_RETURN(
+      Shape inferred_shape,
+      ShapeInference::InferBatchNormTrainingShape(
+          operand->output_shape(), scale->output_shape(),
+          offset->output_shape(), batch_norm_training_request.feature_index()));
+
+  *request.mutable_output_shape() = inferred_shape;
+
+  *request.mutable_output_handle() = handle;
+
+  *request.mutable_request()->mutable_batch_norm_training_request() =
+      batch_norm_training_request;
+
+  VLOG(1) << "AddBatchNormTrainingInstruction (" << GetVersionedHandleInternal()
+          << "), data handle " << handle.handle() << ": "
+          << batch_norm_training_request.ShortDebugString();
+
+  return handle;
+}
+
+StatusOr<ComputationDataHandle> UserComputation::AddBatchNormGradInstruction(
+    const BatchNormGradRequest& batch_norm_grad_request) {
+  tensorflow::mutex_lock lock(mutex_);
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
+                      LookUpRequest(batch_norm_grad_request.operand()));
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* scale,
+                      LookUpRequest(batch_norm_grad_request.scale()));
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* mean,
+                      LookUpRequest(batch_norm_grad_request.mean()));
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* variance,
+                      LookUpRequest(batch_norm_grad_request.variance()));
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* grad_output,
+                      LookUpRequest(batch_norm_grad_request.grad_output()));
+
+  ComputationDataHandle handle = CreateComputationDataHandle();
+
+  OperationRequest& request =
+      (*session_computation_.mutable_requests())[handle.handle()];
+
+  TF_ASSIGN_OR_RETURN(
+      Shape inferred_shape,
+      ShapeInference::InferBatchNormGradShape(
+          operand->output_shape(), scale->output_shape(), mean->output_shape(),
+          variance->output_shape(), grad_output->output_shape(),
+          batch_norm_grad_request.feature_index()));
+
+  *request.mutable_output_shape() = inferred_shape;
+
+  *request.mutable_output_handle() = handle;
+
+  *request.mutable_request()->mutable_batch_norm_grad_request() =
+      batch_norm_grad_request;
+
+  VLOG(1) << "AddBatchNormGradInstruction (" << GetVersionedHandleInternal()
+          << "), data handle " << handle.handle() << ": "
+          << batch_norm_grad_request.ShortDebugString();
+
+  return handle;
+}
+
 StatusOr<ComputationDataHandle> UserComputation::AddReduceWindowInstruction(
     const ReduceWindowRequest& reduce_window_request,
     const UserComputation& to_apply_computation) {
@@ -841,6 +928,34 @@ StatusOr<ComputationDataHandle> UserComputation::AddConvertInstruction(
   return handle;
 }
 
+StatusOr<ComputationDataHandle> UserComputation::AddReducePrecisionInstruction(
+    const ReducePrecisionRequest& reduce_precision_request) {
+  tensorflow::mutex_lock lock(mutex_);
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
+                      LookUpRequest(reduce_precision_request.operand()));
+
+  TF_ASSIGN_OR_RETURN(
+      Shape new_shape,
+      ShapeInference::InferReducePrecisionShape(
+          operand->output_shape(), reduce_precision_request.exponent_bits(),
+          reduce_precision_request.mantissa_bits()));
+
+  ComputationDataHandle handle = CreateComputationDataHandle();
+
+  OperationRequest& request =
+      (*session_computation_.mutable_requests())[handle.handle()];
+  *request.mutable_output_handle() = handle;
+  *request.mutable_output_shape() = new_shape;
+  *request.mutable_request()->mutable_reduce_precision_request() =
+      reduce_precision_request;
+
+  VLOG(1) << "AddReducePrecisionInstruction (" << GetVersionedHandleInternal()
+          << "), data handle " << handle.handle() << ": "
+          << reduce_precision_request.ShortDebugString();
+  return handle;
+}
+
 StatusOr<ComputationDataHandle> UserComputation::AddConvolveInstruction(
     const ConvolveRequest& convolve_request) {
   tensorflow::mutex_lock lock(mutex_);
@@ -897,9 +1012,6 @@ StatusOr<ComputationDataHandle> UserComputation::AddInfeedInstruction(
   tensorflow::mutex_lock lock(mutex_);
 
   const Shape& shape = infeed_request.shape();
-  if (ShapeUtil::IsNestedTuple(shape)) {
-    return InvalidArgument("Infeed does not support nested tuple shapes");
-  }
   if (!LayoutUtil::HasLayout(shape)) {
     return InvalidArgument("Given shape to Infeed must have a layout");
   }
@@ -923,9 +1035,6 @@ Status UserComputation::AddOutfeedInstruction(
   tensorflow::mutex_lock lock(mutex_);
 
   const Shape& shape = outfeed_request.shape();
-  if (ShapeUtil::IsNestedTuple(shape)) {
-    return InvalidArgument("Outfeed does not support nested tuple shapes");
-  }
   if (!LayoutUtil::HasLayout(shape)) {
     return InvalidArgument("Given shape to Outfeed must have a layout");
   }
@@ -1556,6 +1665,36 @@ void ConstantVisitor(const SessionComputation& session_computation,
       break;
     }
 
+    case OpRequest::kBatchNormTrainingRequest: {
+      const BatchNormTrainingRequest& batch_norm_training_request =
+          request.request().batch_norm_training_request();
+      ConstantVisitor(session_computation,
+                      batch_norm_training_request.operand(), visited,
+                      is_constant);
+      ConstantVisitor(session_computation, batch_norm_training_request.scale(),
+                      visited, is_constant);
+      ConstantVisitor(session_computation, batch_norm_training_request.offset(),
+                      visited, is_constant);
+      break;
+    }
+
+    case OpRequest::kBatchNormGradRequest: {
+      const BatchNormGradRequest& batch_norm_grad_request =
+          request.request().batch_norm_grad_request();
+      ConstantVisitor(session_computation, batch_norm_grad_request.operand(),
+                      visited, is_constant);
+      ConstantVisitor(session_computation, batch_norm_grad_request.scale(),
+                      visited, is_constant);
+      ConstantVisitor(session_computation, batch_norm_grad_request.mean(),
+                      visited, is_constant);
+      ConstantVisitor(session_computation, batch_norm_grad_request.variance(),
+                      visited, is_constant);
+      ConstantVisitor(session_computation,
+                      batch_norm_grad_request.grad_output(), visited,
+                      is_constant);
+      break;
+    }
+
     case OpRequest::kBinaryOpRequest: {
       const BinaryOpRequest& binary_op_request =
           request.request().binary_op_request();
@@ -1824,7 +1963,6 @@ Status UserComputation::CheckParametersAreContiguous(
     }
   }
 
-  auto program_shape = MakeUnique<ProgramShape>();
   for (int64 i = 0; i < parameter_requests.size(); ++i) {
     auto it = parameter_requests.find(i);
     if (it == parameter_requests.end()) {
@@ -1850,26 +1988,31 @@ class ComputationLowerer {
       const SessionComputation& session_computation,
       VersionedComputationHandle::Version version,
       UserComputation::HloComputationResolver hlo_resolver,
+      const DebugOptions& debug_options,
       bool include_unreachable_instructions) {
     ComputationLowerer lowerer(computation_name, session_computation, version,
-                               std::move(hlo_resolver));
-    return lowerer.Lower(include_unreachable_instructions);
+                               std::move(hlo_resolver), debug_options,
+                               include_unreachable_instructions);
+    return lowerer.Lower();
   }
 
  private:
   ComputationLowerer(const string& computation_name,
                      const SessionComputation& session_computation,
                      VersionedComputationHandle::Version version,
-                     UserComputation::HloComputationResolver hlo_resolver)
+                     UserComputation::HloComputationResolver hlo_resolver,
+                     const DebugOptions& debug_options,
+                     bool include_unreachable_instructions)
       : hlo_builder_(computation_name),
         session_computation_(session_computation),
         version_(version),
-        hlo_resolver_(std::move(hlo_resolver)) {}
+        hlo_resolver_(std::move(hlo_resolver)),
+        debug_options_(debug_options),
+        include_unreachable_instructions_(include_unreachable_instructions) {}
 
   // Build an HLO computation from the SessionComputation at the given
   // version.
-  StatusOr<std::unique_ptr<HloComputation>> Lower(
-      bool include_unreachable_instructions);
+  StatusOr<std::unique_ptr<HloComputation>> Lower();
 
  private:
   // Traverses the computation 'root' using a DFS, calling 'visit' in postorder.
@@ -1899,6 +2042,8 @@ class ComputationLowerer {
   const SessionComputation& session_computation_;
   const VersionedComputationHandle::Version version_;
   const UserComputation::HloComputationResolver hlo_resolver_;
+  const DebugOptions& debug_options_;
+  const bool include_unreachable_instructions_;
 };
 
 // Calls 'apply' on each operand of 'request'.
@@ -1964,6 +2109,28 @@ static void ForEachOperand(
       break;
     }
 
+    case OpRequest::kBatchNormTrainingRequest: {
+      const BatchNormTrainingRequest& batch_norm_training_request =
+          request.request().batch_norm_training_request();
+
+      apply(batch_norm_training_request.operand());
+      apply(batch_norm_training_request.scale());
+      apply(batch_norm_training_request.offset());
+      break;
+    }
+
+    case OpRequest::kBatchNormGradRequest: {
+      const BatchNormGradRequest& batch_norm_grad_request =
+          request.request().batch_norm_grad_request();
+
+      apply(batch_norm_grad_request.operand());
+      apply(batch_norm_grad_request.scale());
+      apply(batch_norm_grad_request.mean());
+      apply(batch_norm_grad_request.variance());
+      apply(batch_norm_grad_request.grad_output());
+      break;
+    }
+
     case OpRequest::kCrossReplicaSumRequest: {
       const CrossReplicaSumRequest& cross_replica_sum_request =
           request.request().cross_replica_sum_request();
@@ -2117,6 +2284,13 @@ static void ForEachOperand(
       break;
     }
 
+    case OpRequest::kReducePrecisionRequest: {
+      const ReducePrecisionRequest& reduce_precision_request =
+          request.request().reduce_precision_request();
+      apply(reduce_precision_request.operand());
+      break;
+    }
+
     case OpRequest::kTraceRequest: {
       const TraceRequest& trace_request = request.request().trace_request();
       apply(trace_request.operand());
@@ -2175,8 +2349,7 @@ void ComputationLowerer::TraversePostorder(
   }
 }
 
-StatusOr<std::unique_ptr<HloComputation>> ComputationLowerer::Lower(
-    bool include_unreachable_instructions) {
+StatusOr<std::unique_ptr<HloComputation>> ComputationLowerer::Lower() {
   // Map from ComputationDataHandle to HLO instruction. Serves as a record of
   // which operations have been visited as well as a cache for looking up
   // ComputationDataHandles as HloInstructions.
@@ -2192,7 +2365,7 @@ StatusOr<std::unique_ptr<HloComputation>> ComputationLowerer::Lower(
   HloInstruction* hlo_root =
       instructions.at(root_request->output_handle().handle());
 
-  if (include_unreachable_instructions) {
+  if (include_unreachable_instructions_) {
     // Iterate through all computation data handles, and visit any unvisited
     // operations.
     for (int64 request_num = 1; request_num <= version_; ++request_num) {
@@ -2276,7 +2449,7 @@ void ComputationLowerer::Visit(
       const ConstantRequest& constant_request =
           request.request().constant_request();
       hlo_instruction = add_instruction(HloInstruction::CreateConstant(
-          LiteralUtil::CloneToUnique(Literal(constant_request.literal()))));
+          Literal(constant_request.literal()).CloneToUnique()));
       break;
     }
 
@@ -2457,6 +2630,44 @@ void ComputationLowerer::Visit(
       break;
     }
 
+    case OpRequest::kBatchNormTrainingRequest: {
+      const BatchNormTrainingRequest& batch_norm_training_request =
+          request.request().batch_norm_training_request();
+      HloInstruction* operand =
+          lookup_instruction(batch_norm_training_request.operand());
+      HloInstruction* scale =
+          lookup_instruction(batch_norm_training_request.scale());
+      HloInstruction* offset =
+          lookup_instruction(batch_norm_training_request.offset());
+
+      hlo_instruction = add_instruction(HloInstruction::CreateBatchNormTraining(
+          request.output_shape(), operand, scale, offset,
+          batch_norm_training_request.epsilon(),
+          batch_norm_training_request.feature_index()));
+      break;
+    }
+
+    case OpRequest::kBatchNormGradRequest: {
+      const BatchNormGradRequest& batch_norm_grad_request =
+          request.request().batch_norm_grad_request();
+
+      HloInstruction* operand =
+          lookup_instruction(batch_norm_grad_request.operand());
+      HloInstruction* scale =
+          lookup_instruction(batch_norm_grad_request.scale());
+      HloInstruction* mean = lookup_instruction(batch_norm_grad_request.mean());
+      HloInstruction* variance =
+          lookup_instruction(batch_norm_grad_request.variance());
+      HloInstruction* grad_output =
+          lookup_instruction(batch_norm_grad_request.grad_output());
+
+      hlo_instruction = add_instruction(HloInstruction::CreateBatchNormGrad(
+          request.output_shape(), operand, scale, mean, variance, grad_output,
+          batch_norm_grad_request.epsilon(),
+          batch_norm_grad_request.feature_index()));
+      break;
+    }
+
     case OpRequest::kBroadcastRequest: {
       const BroadcastRequest& broadcast_request =
           request.request().broadcast_request();
@@ -2670,8 +2881,7 @@ void ComputationLowerer::Visit(
         lhs = (lhs == operand_to_broadcast) ? broadcasted_operand : lhs;
         rhs = (rhs == operand_to_broadcast) ? broadcasted_operand : rhs;
       }
-      if (legacy_flags::GetUserComputationFlags()
-              ->xla_eliminate_hlo_implicit_broadcast) {
+      if (debug_options_.xla_eliminate_hlo_implicit_broadcast()) {
         if (!ShapeUtil::SameDimensions(request.output_shape(), lhs->shape())) {
           // lhs side is being implicitly broadcast. Change to explicit.
           lhs =
@@ -2688,6 +2898,18 @@ void ComputationLowerer::Visit(
       break;
     }
 
+    case OpRequest::kReducePrecisionRequest: {
+      const ReducePrecisionRequest& reduce_precision_request =
+          request.request().reduce_precision_request();
+      HloInstruction* operand =
+          lookup_instruction(reduce_precision_request.operand());
+      auto exponent_bits = reduce_precision_request.exponent_bits();
+      auto mantissa_bits = reduce_precision_request.mantissa_bits();
+      hlo_instruction = add_instruction(HloInstruction::CreateReducePrecision(
+          request.output_shape(), operand, exponent_bits, mantissa_bits));
+      break;
+    }
+
     case OpRequest::kTraceRequest: {
       const TraceRequest& trace_request = request.request().trace_request();
       HloInstruction* operand = lookup_instruction(trace_request.operand());
@@ -2718,7 +2940,7 @@ void ComputationLowerer::Visit(
 
 StatusOr<std::unique_ptr<HloComputation>> UserComputation::BuildHloComputation(
     VersionedComputationHandle::Version version,
-    HloComputationResolver hlo_resolver,
+    HloComputationResolver hlo_resolver, const DebugOptions& debug_options,
     bool include_unreachable_instructions) const {
   tensorflow::mutex_lock lock(mutex_);
 
@@ -2730,7 +2952,7 @@ StatusOr<std::unique_ptr<HloComputation>> UserComputation::BuildHloComputation(
       std::unique_ptr<HloComputation> hlo_computation,
       ComputationLowerer::Lower(
           tensorflow::strings::StrCat(name(), ".v", version),
-          session_computation_, version, std::move(hlo_resolver),
+          session_computation_, version, std::move(hlo_resolver), debug_options,
           include_unreachable_instructions));
 
   XLA_VLOG_LINES(2, hlo_computation->ToString());
diff --git a/tensorflow/compiler/xla/service/user_computation.h b/tensorflow/compiler/xla/service/user_computation.h
index fb5425ae61ab1edcd00aac493c9e2ac3c430cb72..36b1d34e05d7ef4d9d6b5d0f76822b6813d117e8 100644
--- a/tensorflow/compiler/xla/service/user_computation.h
+++ b/tensorflow/compiler/xla/service/user_computation.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla.pb.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -84,6 +85,14 @@ class UserComputation {
   StatusOr<ComputationDataHandle> AddUnaryInstruction(
       const UnaryOpRequest& unary_request);
 
+  // Enqueues a batch norm training instruction onto this user computation.
+  StatusOr<ComputationDataHandle> AddBatchNormTrainingInstruction(
+      const BatchNormTrainingRequest& batch_norm_training_request);
+
+  // Enqueues a batch norm grad instruction onto this user computation.
+  StatusOr<ComputationDataHandle> AddBatchNormGradInstruction(
+      const BatchNormGradRequest& batch_norm_grad_request);
+
   // Enqueues a binary instruction onto this user computation.
   // Returns an error status if the operand indices are out of bounds.
   StatusOr<ComputationDataHandle> AddBinaryInstruction(
@@ -112,6 +121,10 @@ class UserComputation {
       const MapRequest& map_request,
       const UserComputation& to_apply_computation);
 
+  // Enqueues a reduce-precision instruction onto this user computation.
+  StatusOr<ComputationDataHandle> AddReducePrecisionInstruction(
+      const ReducePrecisionRequest& reduce_precision_request);
+
   // Enqueues a convolution instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddConvolveInstruction(
       const ConvolveRequest& convolve_request);
@@ -256,7 +269,7 @@ class UserComputation {
       std::function<HloComputation*(const VersionedComputationHandle& handle)>;
   StatusOr<std::unique_ptr<HloComputation>> BuildHloComputation(
       VersionedComputationHandle::Version version,
-      HloComputationResolver hlo_resolver,
+      HloComputationResolver hlo_resolver, const DebugOptions& debug_options,
       bool include_unreachable_instructions = true) const;
 
   // Return a vector containing the embedded computations used by this
diff --git a/tensorflow/compiler/xla/service/user_computation_test.cc b/tensorflow/compiler/xla/service/user_computation_test.cc
index ea691201263e4935afbc29bcb8624a73c6715f83..07739f241aa01eacf83630c72aec6199b66b49d4 100644
--- a/tensorflow/compiler/xla/service/user_computation_test.cc
+++ b/tensorflow/compiler/xla/service/user_computation_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/user_computation.h"
 
-#include "tensorflow/compiler/xla/legacy_flags/user_computation_flags.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
@@ -50,16 +50,16 @@ TEST_F(UserComputationTest, SimpleComputation) {
 
   ConstantRequest constant_request;
   *constant_request.mutable_literal() =
-      LiteralUtil::CreateR1<float>({123.0f, 42.0f})->ToProto();
-  TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle constant_handle,
-                         computation.AddConstantInstruction(constant_request));
+      Literal::CreateR1<float>({123.0f, 42.0f})->ToProto();
+  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle constant_handle,
+                          computation.AddConstantInstruction(constant_request));
 
   ParameterRequest param_request;
   *param_request.mutable_shape() = kScalarShape;
   param_request.set_parameter(0);
   param_request.set_name("param0");
-  TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle param_handle,
-                         computation.AddParameterInstruction(param_request));
+  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle param_handle,
+                          computation.AddParameterInstruction(param_request));
   OpMetadata metadata;
   metadata.set_op_name("meta");
   TF_ASSERT_OK(computation.SetOpMetadata(param_handle, metadata));
@@ -81,7 +81,7 @@ TEST_F(UserComputationTest, SimpleComputation) {
 
     // Program shape should have a single scalar parameter and scalar
     // result. The outfeed instruction should not affect the program shape.
-    TF_ASSIGN_OR_ASSERT_OK(
+    TF_ASSERT_OK_AND_ASSIGN(
         std::shared_ptr<const ProgramShape> program_shape,
         computation.ComputeProgramShape(latest_version.version));
     ASSERT_EQ(1, program_shape->parameters_size());
@@ -90,9 +90,10 @@ TEST_F(UserComputationTest, SimpleComputation) {
     EXPECT_TRUE(ShapeUtil::Compatible(kScalarShape, program_shape->result()));
 
     // Build the HLO computation.
-    TF_ASSIGN_OR_ASSERT_OK(
+    TF_ASSERT_OK_AND_ASSIGN(
         std::unique_ptr<HloComputation> hlo_computation,
-        computation.BuildHloComputation(latest_version.version, hlo_resolver));
+        computation.BuildHloComputation(latest_version.version, hlo_resolver,
+                                        DebugOptions()));
     // There should be one HloInstruction per UserComputation operation.
     EXPECT_EQ(3, hlo_computation->instruction_count());
     // The root of the instruction should be the parameter instruction (not the
@@ -107,7 +108,7 @@ TEST_F(UserComputationTest, SimpleComputation) {
         computation.GetVersionedHandleAtOperation(param_handle);
 
     // Program shape should have a single scalar parameter, and scalar result.
-    TF_ASSIGN_OR_ASSERT_OK(
+    TF_ASSERT_OK_AND_ASSIGN(
         std::shared_ptr<const ProgramShape> program_shape,
         computation.ComputeProgramShape(version_at_param.version));
     ASSERT_EQ(1, program_shape->parameters_size());
@@ -117,9 +118,10 @@ TEST_F(UserComputationTest, SimpleComputation) {
 
     // There should be two instructions, one for the constant and one for the
     // parameter. The outfeed instruction should not be included.
-    TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<HloComputation> hlo_computation,
-                           computation.BuildHloComputation(
-                               version_at_param.version, hlo_resolver));
+    TF_ASSERT_OK_AND_ASSIGN(
+        std::unique_ptr<HloComputation> hlo_computation,
+        computation.BuildHloComputation(version_at_param.version, hlo_resolver,
+                                        DebugOptions()));
     EXPECT_EQ(2, hlo_computation->instruction_count());
     EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter());
   }
@@ -130,10 +132,11 @@ TEST_F(UserComputationTest, SimpleComputation) {
         computation.GetVersionedHandle();
 
     // Build the HLO computation.
-    TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<HloComputation> hlo_computation,
-                           computation.BuildHloComputation(
-                               latest_version.version, hlo_resolver,
-                               /*include_unreachable_instructions=*/false));
+    TF_ASSERT_OK_AND_ASSIGN(
+        std::unique_ptr<HloComputation> hlo_computation,
+        computation.BuildHloComputation(
+            latest_version.version, hlo_resolver, DebugOptions(),
+            /*include_unreachable_instructions=*/false));
     // There is only one reachable instruction, the parameter.
     EXPECT_EQ(1, hlo_computation->instruction_count());
     // The root of the instruction should be the parameter instruction (not the
@@ -145,8 +148,8 @@ TEST_F(UserComputationTest, SimpleComputation) {
 }
 
 TEST_F(UserComputationTest, EliminateScalarBroadcast) {
-  if (!legacy_flags::GetUserComputationFlags()
-           ->xla_eliminate_hlo_implicit_broadcast) {
+  if (!legacy_flags::GetDebugOptionsFromFlags()
+           .xla_eliminate_hlo_implicit_broadcast()) {
     return;
   }
 
@@ -161,14 +164,14 @@ TEST_F(UserComputationTest, EliminateScalarBroadcast) {
 
   ConstantRequest a_request;
   *a_request.mutable_literal() =
-      LiteralUtil::CreateR1<float>({123.0f, 42.0f})->ToProto();
-  TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle a_handle,
-                         computation.AddConstantInstruction(a_request));
+      Literal::CreateR1<float>({123.0f, 42.0f})->ToProto();
+  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle a_handle,
+                          computation.AddConstantInstruction(a_request));
 
   ConstantRequest b_request;
-  *b_request.mutable_literal() = LiteralUtil::CreateR0<float>(1.0f)->ToProto();
-  TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle b_handle,
-                         computation.AddConstantInstruction(b_request));
+  *b_request.mutable_literal() = Literal::CreateR0<float>(1.0f)->ToProto();
+  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle b_handle,
+                          computation.AddConstantInstruction(b_request));
 
   BinaryOpRequest add;
   add.set_binop(BINOP_ADD);
@@ -182,9 +185,10 @@ TEST_F(UserComputationTest, EliminateScalarBroadcast) {
   VersionedComputationHandle latest_version = computation.GetVersionedHandle();
 
   // Build the HLO computation.
-  TF_ASSIGN_OR_ASSERT_OK(
+  TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<HloComputation> hlo_computation,
-      computation.BuildHloComputation(latest_version.version, hlo_resolver));
+      computation.BuildHloComputation(latest_version.version, hlo_resolver,
+                                      DebugOptions()));
   // The binary operation has implicit scalar broadcast, should be converted
   // to an explicit broadcast intruction and a binary instruction.
   EXPECT_EQ(4, hlo_computation->instruction_count());
@@ -196,8 +200,8 @@ TEST_F(UserComputationTest, EliminateScalarBroadcast) {
 }
 
 TEST_F(UserComputationTest, EliminateDegenerateBroadcastAfterIndimBroadcast) {
-  if (!legacy_flags::GetUserComputationFlags()
-           ->xla_eliminate_hlo_implicit_broadcast) {
+  if (!legacy_flags::GetDebugOptionsFromFlags()
+           .xla_eliminate_hlo_implicit_broadcast()) {
     return;
   }
 
@@ -214,15 +218,15 @@ TEST_F(UserComputationTest, EliminateDegenerateBroadcastAfterIndimBroadcast) {
   *a_request.mutable_shape() = ShapeUtil::MakeShape(F32, {2, 3});
   a_request.set_name("a");
   a_request.set_parameter(0);
-  TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle a_handle,
-                         computation.AddParameterInstruction(a_request));
+  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle a_handle,
+                          computation.AddParameterInstruction(a_request));
 
   ParameterRequest b_request;
   *b_request.mutable_shape() = ShapeUtil::MakeShape(F32, {2, 1, 4});
   b_request.set_name("b");
   b_request.set_parameter(1);
-  TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle b_handle,
-                         computation.AddParameterInstruction(b_request));
+  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle b_handle,
+                          computation.AddParameterInstruction(b_request));
 
   BinaryOpRequest add;
   add.set_binop(BINOP_ADD);
@@ -238,9 +242,10 @@ TEST_F(UserComputationTest, EliminateDegenerateBroadcastAfterIndimBroadcast) {
   VersionedComputationHandle latest_version = computation.GetVersionedHandle();
 
   // Build the HLO computation.
-  TF_ASSIGN_OR_ASSERT_OK(
+  TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<HloComputation> hlo_computation,
-      computation.BuildHloComputation(latest_version.version, hlo_resolver));
+      computation.BuildHloComputation(latest_version.version, hlo_resolver,
+                                      DebugOptions()));
 
   // The binary operation has in-dim broadcast and degenerate broadcast, should
   // first do the in-dim broadcast then convert the degnerate broadcast into a
@@ -266,7 +271,7 @@ TEST_F(UserComputationTest, EliminateDegenerateBroadcastAfterIndimBroadcast) {
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendUserComputationFlags(&flag_list);
+  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index cc456df4fce5c78162c41ed36f6c69c0f5ab459b..81cdbf5117f2d16e5a871849a7875b1746baf42a 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -81,19 +82,56 @@ struct ShapeTreeNode {
 // Like the Shape data structure, this is a tree and tuple elements cannot be
 // duplicated. That is, every distinct ShapeIndex in the Shape has a unique T
 // object.
+//
+// Normally a ShapeTree owns its Shape, but for efficiency reasons, sometimes
+// it's helpful not to copy a Shape just to make a ShapeTree.  In these cases,
+// you can pass a Shape* instead of a Shape& to the ShapeTree constructor.  It's
+// then up to you to ensure that the pointed-to Shape doesn't die or mutate
+// before its ShapeTree goes away.
 template <typename T>
 class ShapeTree {
  public:
   // Default constructor creates a tree with a nil shape (i.e. an empty tuple).
   ShapeTree() : ShapeTree(ShapeUtil::MakeNil()) {}
+
   // Create ShapeTree with the given shape, and default-constructed T values for
   // all nodes.
-  explicit ShapeTree(const Shape& shape);
+  //
+  // The version that takes a pointer may be cheaper because it doesn't require
+  // any Shape copies, but then it's up to you to ensure that the pointer stays
+  // alive longer than this ShapeTree.
+  explicit ShapeTree(Shape shape);
+  explicit ShapeTree(const Shape* shape);
+
   // Create ShapeTree with the given shape, and init_value for all nodes.
-  ShapeTree(const Shape& shape, const T& init_value);
+  ShapeTree(Shape shape, const T& init_value);
+  ShapeTree(const Shape* shape, const T& init_value);
+
+  ShapeTree(const ShapeTree& other)
+      : root_(other.root_), shape_storage_(other.shape_storage_) {
+    // Fix up internal pointer if necessary.
+    if (shape_storage_) {
+      CHECK_EQ(other.shape_, &*other.shape_storage_);
+      shape_ = &*shape_storage_;
+    } else {
+      shape_ = other.shape_;
+    }
+  }
 
-  ShapeTree(const ShapeTree& other) = default;
-  ShapeTree& operator=(const ShapeTree& other) = default;
+  ShapeTree& operator=(const ShapeTree& other) {
+    root_ = other.root_;
+    shape_storage_ = other.shape_storage_;
+
+    // Fix up internal pointer if necessary.
+    if (shape_storage_) {
+      CHECK_EQ(other.shape_, &*other.shape_storage_);
+      shape_ = &*shape_storage_;
+    } else {
+      shape_ = other.shape_;
+    }
+
+    return *this;
+  }
 
   // Returns the data element associated with the array in the shape at the
   // given index (see ShapeUtil::GetSubshape for how indexes are defined).
@@ -101,7 +139,7 @@ class ShapeTree {
   T* mutable_element(const ShapeIndex& index);
 
   // Return the shape represented with this ShapeTree.
-  const Shape& shape() const { return shape_; }
+  const Shape& shape() const { return *shape_; }
 
   // Returns true if the node at the given index is a leaf node (an array
   // shape).
@@ -112,27 +150,27 @@ class ShapeTree {
   // Recursively traverses the shape and calls the given function at each
   // element. The function has the following arguments:
   //
+  //   Fn :    A callable of type void(const ShapeIndex& index, const T& data)
+  //           (or compatible).
   //   index : the index of the element in the shape. See ShapeUtil::GetSubshape
   //           for definition of index.
   //   data : The data value at this elemnt.
-  using VisitorFunction =
-      std::function<void(const ShapeIndex& /*index*/, const T& /*data*/)>;
-  void ForEachElement(const VisitorFunction& func) const;
-
-  using MutableVisitorFunction =
-      std::function<void(const ShapeIndex& /*index*/, T* /*data*/)>;
-  void ForEachMutableElement(const MutableVisitorFunction& func);
+  template <typename Fn>
+  void ForEachElement(const Fn& func) const;
 
-  // Variants of ForEach(Mutable)Element which propagate a Status value from the
-  // visitor.
-  using StatusVisitorFunction =
-      std::function<Status(const ShapeIndex& /*index*/, const T& /*data*/)>;
-  Status ForEachElementWithStatus(const StatusVisitorFunction& func) const;
+  // Like ForEachElement, but the callable has type
+  //
+  //   void (const ShapeIndex& index, T* data).
+  //
+  template <typename Fn>
+  void ForEachMutableElement(const Fn& func);
 
-  using MutableStatusVisitorFunction =
-      std::function<Status(const ShapeIndex& /*index*/, T* /*data*/)>;
-  Status ForEachMutableElementWithStatus(
-      const MutableStatusVisitorFunction& func);
+  // Like ForEach(Mutable)Element, but the callable returns a Status instead of
+  // void.  The first non-OK return value is returned by the ForEach* function.
+  template <typename Fn>
+  Status ForEachElementWithStatus(const Fn& func) const;
+  template <typename Fn>
+  Status ForEachMutableElementWithStatus(const Fn& func);
 
   // Copy the subtree of values from 'other' rooted at ShapeIndex
   // 'source_base_index' into the subtree of value in this ShapeTree rooted at
@@ -161,10 +199,12 @@ class ShapeTree {
   // Helpers for traversing the shape via ForEachElement. The helpers
   // recursively traverse the subtree rooted at "index" (defined as in
   // ShapeUtil::GetSubshape).
-  static Status ForEachHelper(const StatusVisitorFunction& func,
-                              const Node& node, ShapeIndex* index);
-  static Status ForEachMutableHelper(const MutableStatusVisitorFunction& func,
-                                     Node* node, ShapeIndex* index);
+  template <typename Fn>
+  static Status ForEachHelper(const Fn& func, const Node& node,
+                              ShapeIndex* index);
+  template <typename Fn>
+  static Status ForEachMutableHelper(const Fn& func, Node* node,
+                                     ShapeIndex* index);
 
   // Return the tree node at the given index.
   Node* Lookup(const ShapeIndex& index);
@@ -173,8 +213,13 @@ class ShapeTree {
   // The root node, which contains all other nodes.
   Node root_;
 
-  // The XLA shape mirrored in this ShapeTree.
-  Shape shape_;
+  // If we own our Shape, this field contains it, and shape_ is a pointer into
+  // here.  Otherwise if we don't own our shape, this is nullopt.
+  tensorflow::gtl::optional<Shape> shape_storage_;
+
+  // The XLA shape mirrored in this ShapeTree.  This is either a pointer into
+  // shape_storage_ or the Shape pointer passed to our constructor.
+  const Shape* shape_;
 };
 
 template <typename T>
@@ -200,20 +245,34 @@ void ShapeTree<T>::InitChildren(const Shape& shape, Node* node) {
 }
 
 template <typename T>
-ShapeTree<T>::ShapeTree(const Shape& shape) : root_(), shape_(shape) {
+ShapeTree<T>::ShapeTree(Shape shape)
+    : root_(), shape_storage_(std::move(shape)), shape_(&*shape_storage_) {
   // The shape_ field is just used to hold the structure of the shape.
   // It should not be relied upon to store layout information.
-  LayoutUtil::ClearLayout(&shape_);
-  InitChildren(shape_, &root_);
+  LayoutUtil::ClearLayout(&*shape_storage_);
+  InitChildren(*shape_, &root_);
 }
 
 template <typename T>
-ShapeTree<T>::ShapeTree(const Shape& shape, const T& init_value)
-    : root_(init_value), shape_(shape) {
+ShapeTree<T>::ShapeTree(const Shape* shape) : root_(), shape_(shape) {
+  InitChildren(*shape_, &root_);
+}
+
+template <typename T>
+ShapeTree<T>::ShapeTree(Shape shape, const T& init_value)
+    : root_(init_value),
+      shape_storage_(std::move(shape)),
+      shape_(&*shape_storage_) {
   // The shape_ field is just used to hold the structure of the shape.
   // It should not be relied upon to store layout information.
-  LayoutUtil::ClearLayout(&shape_);
-  InitChildren(shape_, init_value, &root_);
+  LayoutUtil::ClearLayout(&*shape_storage_);
+  InitChildren(*shape_, init_value, &root_);
+}
+
+template <typename T>
+ShapeTree<T>::ShapeTree(const Shape* shape, const T& init_value)
+    : root_(init_value), shape_(shape) {
+  InitChildren(*shape_, init_value, &root_);
 }
 
 template <typename T>
@@ -245,8 +304,9 @@ const internal::ShapeTreeNode<T>* ShapeTree<T>::Lookup(
 
 /* static */
 template <typename T>
-Status ShapeTree<T>::ForEachHelper(const StatusVisitorFunction& func,
-                                   const Node& node, ShapeIndex* index) {
+template <typename Fn>
+Status ShapeTree<T>::ForEachHelper(const Fn& func, const Node& node,
+                                   ShapeIndex* index) {
   TF_RETURN_IF_ERROR(func(*index, node.data));
   for (int64 i = 0; i < node.children.size(); ++i) {
     index->push_back(i);
@@ -258,8 +318,9 @@ Status ShapeTree<T>::ForEachHelper(const StatusVisitorFunction& func,
 
 /* static */
 template <typename T>
-Status ShapeTree<T>::ForEachMutableHelper(
-    const MutableStatusVisitorFunction& func, Node* node, ShapeIndex* index) {
+template <typename Fn>
+Status ShapeTree<T>::ForEachMutableHelper(const Fn& func, Node* node,
+                                          ShapeIndex* index) {
   TF_RETURN_IF_ERROR(func(*index, &node->data));
   for (int64 i = 0; i < node->children.size(); ++i) {
     index->push_back(i);
@@ -271,21 +332,22 @@ Status ShapeTree<T>::ForEachMutableHelper(
 }
 
 template <typename T>
-Status ShapeTree<T>::ForEachElementWithStatus(
-    const StatusVisitorFunction& func) const {
+template <typename Fn>
+Status ShapeTree<T>::ForEachElementWithStatus(const Fn& func) const {
   ShapeIndex index;
   return ForEachHelper(func, root_, &index);
 }
 
 template <typename T>
-Status ShapeTree<T>::ForEachMutableElementWithStatus(
-    const MutableStatusVisitorFunction& func) {
+template <typename Fn>
+Status ShapeTree<T>::ForEachMutableElementWithStatus(const Fn& func) {
   ShapeIndex index;
   return ForEachMutableHelper(func, &root_, &index);
 }
 
 template <typename T>
-void ShapeTree<T>::ForEachElement(const VisitorFunction& func) const {
+template <typename Fn>
+void ShapeTree<T>::ForEachElement(const Fn& func) const {
   ShapeIndex index;
   return ForEachHelper(
              [&func](const ShapeIndex& index, const T& data) {
@@ -297,7 +359,8 @@ void ShapeTree<T>::ForEachElement(const VisitorFunction& func) const {
 }
 
 template <typename T>
-void ShapeTree<T>::ForEachMutableElement(const MutableVisitorFunction& func) {
+template <typename Fn>
+void ShapeTree<T>::ForEachMutableElement(const Fn& func) {
   ShapeIndex index;
   return ForEachMutableHelper(
              [&func](const ShapeIndex& index, T* data) {
diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc
index afc3a2b2a34777780ec66d2325011390879fe693..3a5db1b3a651e2d353741c6bf4f6962da4e54ba1 100644
--- a/tensorflow/compiler/xla/shape_tree_test.cc
+++ b/tensorflow/compiler/xla/shape_tree_test.cc
@@ -365,5 +365,31 @@ TEST_F(ShapeTreeTest, OperatorEquals) {
   }
 }
 
+TEST_F(ShapeTreeTest, ConstructWithPointerToShape) {
+  // Construct a ShapeTree using a pointer to a shape, rather than a reference
+  // to a shape.  This constructor is an optimization to let us avoid
+  // constructing and destroying temporary shapes when we have many ShapeTrees.
+  ShapeTree<int> t(&nested_tuple_shape_, 42);
+  int num_nodes = 0;
+  t.ForEachElement([&num_nodes](const ShapeIndex& /*index*/, int data) {
+    EXPECT_EQ(42, data);
+    ++num_nodes;
+  });
+  EXPECT_EQ(10, num_nodes);
+}
+
+TEST_F(ShapeTreeTest, CopyWithPointerToShape) {
+  ShapeTree<int> source(&nested_tuple_shape_, 0);
+  ShapeTree<int> dest(source);
+  EXPECT_EQ(&dest.shape(), &nested_tuple_shape_);
+}
+
+TEST_F(ShapeTreeTest, CopyAssignWithPointerToShape) {
+  ShapeTree<int> source(&nested_tuple_shape_, 0);
+  ShapeTree<int> dest;
+  dest = source;
+  EXPECT_EQ(&dest.shape(), &nested_tuple_shape_);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index ee49a9ae5f5ff442284f2c4bd620425f815fb08d..057905a4311edc246eeea55019821e834605ae78 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -105,6 +105,11 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
   return equal;
 }
 
+/* static */ int64 ShapeUtil::Rank(const Shape& shape) {
+  CHECK(!ShapeUtil::IsTuple(shape)) << "Tuples do not have a rank";
+  return shape.dimensions_size();
+}
+
 /* static */ int64 ShapeUtil::TrueRank(const Shape& shape) {
   int64 accum = 0;
   for (int64 dimension : shape.dimensions()) {
@@ -165,6 +170,17 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
   }
   return MakeShapeWithMonotonicDim0MajorLayout(shape.element_type(), dims);
 }
+
+/* static */ Shape ShapeUtil::ShapeWithoutPadding(const Shape& shape) {
+  Shape result = shape;
+  ForEachMutableSubshape(&result, [](Shape* subshape, const ShapeIndex& index) {
+    auto layout = subshape->mutable_layout();
+    layout->clear_padding_value();
+    layout->clear_padded_dimensions();
+  });
+  return result;
+}
+
 /* static */ void ShapeUtil::PopulateShape(
     PrimitiveType element_type, tensorflow::gtl::ArraySlice<int64> dimensions,
     Shape* shape) {
@@ -270,7 +286,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
 }
 
 /* static */ bool ShapeUtil::IsNil(const Shape& shape) {
-  return IsEmptyTuple(shape) || HasZeroElements(shape);
+  return IsTuple(shape) ? IsEmptyTuple(shape) : HasZeroElements(shape);
 }
 
 /* static */ int64 ShapeUtil::TupleElementCount(const Shape& shape) {
@@ -323,6 +339,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
 }
 
 /* static */ int64 ShapeUtil::ElementsIn(const Shape& shape) {
+  CHECK(!IsTuple(shape));
   CHECK_EQ(shape.dimensions_size(), Rank(shape));
   return std::accumulate<decltype(shape.dimensions().begin()), int64>(
       shape.dimensions().begin(), shape.dimensions().end(), 1LL,
@@ -534,11 +551,6 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
 /* static */ Status ShapeUtil::ValidateShapeWithOptionalLayoutInternal(
     const Shape& shape) {
   if (shape.element_type() == TUPLE) {
-    // Tuple shape.
-    if (Rank(shape) != 0) {
-      return InvalidArgument("tuples must be rank-0; got rank %lld",
-                             Rank(shape));
-    }
     if (shape.dimensions_size() != 0) {
       return InvalidArgument("tuples must not have dimensions specified");
     }
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 853be6b4cb81881f3f03dbb119dee533aa27634f..fa34bfc951d58d252b4381e10a01b39698eb9015 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -93,6 +93,7 @@ class ShapeUtil {
  public:
   // Returns the number of elements are contained within the provided shape;
   // e.g. for rank 0 (scalars) the result is always 1.
+  // Precondition: !IsTuple(shape)
   static int64 ElementsIn(const Shape& shape);
 
   // Returns true if 'shape' has zero elements.
@@ -144,7 +145,8 @@ class ShapeUtil {
   static bool Equal(const Shape& lhs, const Shape& rhs);
 
   // Returns the rank (number of dimensions) of the given shape.
-  static int64 Rank(const Shape& shape) { return shape.dimensions_size(); }
+  // Precondition: !IsTuple(shape)
+  static int64 Rank(const Shape& shape);
 
   // Returns the number of dimensions for which the dimension is not (trivially)
   // 1. e.g., f32[2x1x1] has a true rank of 1D, the other dimensions are just
@@ -220,6 +222,9 @@ class ShapeUtil {
   // elements with a different shape.
   static Shape NormalizeShapeToMonotonicDim0MajorLayout(const Shape& shape);
 
+  // Returns a new shape that has all padding values cleared.
+  static Shape ShapeWithoutPadding(const Shape& shape);
+
   // As MakeShape, but the object to write to is passed in.
   static void PopulateShape(PrimitiveType element_type,
                             tensorflow::gtl::ArraySlice<int64> dimensions,
diff --git a/tensorflow/compiler/xla/status_macros.h b/tensorflow/compiler/xla/status_macros.h
index aa12cda666c4abfbf7ec38f0aa640df3b51ea106..5e5550563d02de99ddefbeb8ee8e1bf98afdcdbf 100644
--- a/tensorflow/compiler/xla/status_macros.h
+++ b/tensorflow/compiler/xla/status_macros.h
@@ -183,15 +183,15 @@ class StatusAdaptorForMacros {
       .with_log_stack_trace()                                             \
       .add_ret_check_failure(#condition)
 
-#define TF_ASSIGN_OR_ASSERT_OK(lhs, rexpr)                              \
-  TF_ASSIGN_OR_ASSERT_OK_IMPL(                                          \
+#define TF_ASSERT_OK_AND_ASSIGN(lhs, rexpr)                             \
+  TF_ASSERT_OK_AND_ASSIGN_IMPL(                                         \
       TF_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), lhs, \
       rexpr);
 
-#define TF_ASSIGN_OR_ASSERT_OK_IMPL(statusor, lhs, rexpr)   \
+#define TF_ASSERT_OK_AND_ASSIGN_IMPL(statusor, lhs, rexpr)  \
   auto statusor = (rexpr);                                  \
   ASSERT_TRUE(statusor.status().ok()) << statusor.status(); \
-  lhs = statusor.ConsumeValueOrDie()
+  lhs = std::move(statusor.ValueOrDie())
 
 #define TF_STATUS_MACROS_CONCAT_NAME(x, y) TF_STATUS_MACROS_CONCAT_IMPL(x, y)
 #define TF_STATUS_MACROS_CONCAT_IMPL(x, y) x##y
diff --git a/tensorflow/compiler/xla/status_macros_test.cc b/tensorflow/compiler/xla/status_macros_test.cc
index dead17cdfa1e9f19e0ecfbc071e74e159ae82b5f..4b0740dad72f5d96e5ae153abf9232553ff834c2 100644
--- a/tensorflow/compiler/xla/status_macros_test.cc
+++ b/tensorflow/compiler/xla/status_macros_test.cc
@@ -63,7 +63,7 @@ StatusOr<int> CreateIntUnsuccessfully() {
 }
 
 TEST(StatusMacros, AssignOrAssertOnOK) {
-  TF_ASSIGN_OR_ASSERT_OK(int result, CreateIntSuccessfully());
+  TF_ASSERT_OK_AND_ASSIGN(int result, CreateIntSuccessfully());
   EXPECT_EQ(42, result);
 }
 
diff --git a/tensorflow/compiler/xla/statusor.cc b/tensorflow/compiler/xla/statusor.cc
index 36f08fc99f45a7c82f086d04fa60014343d574da..72ab67ff810e0ec384a22da092363cc7446435bb 100644
--- a/tensorflow/compiler/xla/statusor.cc
+++ b/tensorflow/compiler/xla/statusor.cc
@@ -19,28 +19,20 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
-namespace internal {
+namespace internal_statusor {
 
-Status StatusOrHelper::HandleInvalidStatusCtorArg() {
+void Helper::HandleInvalidStatusCtorArg(Status* status) {
   const char* kMessage =
-      "Status::OK is not a valid constructor argument to StatusOr<T>";
+      "An OK status is not a valid constructor argument to StatusOr<T>";
   LOG(ERROR) << kMessage;
-  // In optimized builds, we will fall back to tensorflow::error::INTERNAL.
-  return Status(tensorflow::error::INTERNAL, kMessage);
+  // Fall back to tensorflow::error::INTERNAL.
+  *status = ::tensorflow::errors::Internal(kMessage);
 }
 
-Status StatusOrHelper::HandleNullObjectCtorArg() {
-  const char* kMessage =
-      "NULL is not a valid constructor argument to StatusOr<T*>";
-  LOG(ERROR) << kMessage;
-  // In optimized builds, we will fall back to tensorflow::error::INTERNAL.
-  return Status(tensorflow::error::INTERNAL, kMessage);
-}
-
-void StatusOrHelper::Crash(const Status& status) {
+void Helper::Crash(const Status& status) {
   LOG(FATAL) << "Attempting to fetch value instead of handling error "
              << status;
 }
 
-}  // namespace internal
+}  // namespace internal_statusor
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/statusor.h b/tensorflow/compiler/xla/statusor.h
index d8cd736238c19cc00d0302daa54fc7417740001a..92bcfa0f44d524c1652ec3d2493a3ebb48b95423 100644
--- a/tensorflow/compiler/xla/statusor.h
+++ b/tensorflow/compiler/xla/statusor.h
@@ -72,216 +72,233 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_STATUSOR_H_
 
 #include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/statusor_internals.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace xla {
 
 #if defined(__clang__)
 // Only clang supports warn_unused_result as a type annotation.
-template <typename T, bool CopyConstructible>
+template <typename T>
 class TF_MUST_USE_RESULT StatusOr;
 #endif
 
-template <typename T,
-          bool CopyConstructible = std::is_copy_constructible<T>::value>
-class StatusOr {
-  template <typename U, bool UC>
+template <typename T>
+class StatusOr : private internal_statusor::StatusOrData<T>,
+                 private internal_statusor::TraitsBase<
+                     std::is_copy_constructible<T>::value,
+                     std::is_move_constructible<T>::value> {
+  template <typename U>
   friend class StatusOr;
 
+  typedef internal_statusor::StatusOrData<T> Base;
+
  public:
   typedef T element_type;
 
-  // Construct a new StatusOr with Status::UNKNOWN status
-  StatusOr();
+  // Constructs a new StatusOr with Status::UNKNOWN status.  This is marked
+  // 'explicit' to try to catch cases like 'return {};', where people think
+  // StatusOr<std::vector<int>> will be initialized with an empty vector,
+  // instead of a Status::UNKNOWN status.
+  explicit StatusOr();
+
+  // StatusOr<T> will be copy constructuble/assignable if T is copy
+  // constructible.
+  StatusOr(const StatusOr&) = default;
+  StatusOr& operator=(const StatusOr&) = default;
+
+  // StatusOr<T> will be move constructuble/assignable if T is move
+  // constructible.
+  StatusOr(StatusOr&&) = default;
+  StatusOr& operator=(StatusOr&&) = default;
+
+  // Conversion copy/move constructor, T must be convertible from U.
+  // TODO(b/62186717): These should not participate in overload resolution if U
+  // is not convertible to T.
+  template <typename U>
+  StatusOr(const StatusOr<U>& other);
+  template <typename U>
+  StatusOr(StatusOr<U>&& other);
 
-  // Construct a new StatusOr with the given non-ok status. After calling
-  // this constructor, calls to ValueOrDie() will CHECK-fail.
-  //
-  // NOTE: Not explicit - we want to use StatusOr<T> as a return
-  // value, so it is convenient and sensible to be able to do 'return
-  // Status()' when the return type is StatusOr<T>.
-  //
-  // REQUIRES: status != Status::OK. This requirement is DCHECKed.
-  // In optimized builds, passing Status::OK here will have the effect
-  // of passing tensorflow::error::INTERNAL as a fallback.
-  StatusOr(Status status);              // NOLINT
+  // Conversion copy/move assignment operator, T must be convertible from U.
+  template <typename U>
+  StatusOr& operator=(const StatusOr<U>& other);
+  template <typename U>
+  StatusOr& operator=(StatusOr<U>&& other);
 
-  // Construct a new StatusOr with the given value. If T is a plain pointer,
-  // value must not be NULL. After calling this constructor, calls to
-  // ValueOrDie() will succeed, and calls to status() will return OK.
+  // Constructs a new StatusOr with the given value. After calling this
+  // constructor, calls to ValueOrDie() will succeed, and calls to status() will
+  // return OK.
   //
   // NOTE: Not explicit - we want to use StatusOr<T> as a return type
   // so it is convenient and sensible to be able to do 'return T()'
   // when the return type is StatusOr<T>.
   //
-  // REQUIRES: if T is a plain pointer, value != NULL. This requirement is
-  // DCHECKed. In optimized builds, passing a NULL pointer here will have
-  // the effect of passing tensorflow::error::INTERNAL as a fallback.
-  StatusOr(const T& value);  // NOLINT
-
-  // Copy constructor.
-  StatusOr(const StatusOr& other) = default;
-
-  // Conversion copy constructor, T must be copy constructible from U
-  template <typename U>
-  StatusOr(const StatusOr<U>& other);
-
-  // Assignment operator.
-  StatusOr& operator=(const StatusOr& other) = default;
+  // REQUIRES: T is copy constructible.
+  StatusOr(const T& value);
 
-  // Conversion assignment operator, T must be assignable from U
-  template <typename U>
-  StatusOr& operator=(const StatusOr<U>& other);
+  // Constructs a new StatusOr with the given non-ok status. After calling
+  // this constructor, calls to ValueOrDie() will CHECK-fail.
+  //
+  // NOTE: Not explicit - we want to use StatusOr<T> as a return
+  // value, so it is convenient and sensible to be able to do 'return
+  // Status()' when the return type is StatusOr<T>.
+  //
+  // REQUIRES: !status.ok(). This requirement is DCHECKed.
+  // In optimized builds, passing Status::OK() here will have the effect
+  // of passing tensorflow::error::INTERNAL as a fallback.
+  StatusOr(const Status& status);
+  StatusOr& operator=(const Status& status);
 
-  // Move constructor and move-assignment operator.
-  StatusOr(StatusOr&& other) = default;
-  StatusOr& operator=(StatusOr&& other) = default;
+  // TODO(b/62186997): Add operator=(T) overloads.
 
-  // Rvalue-reference overloads of the other constructors and assignment
-  // operators, to support move-only types and avoid unnecessary copying.
+  // Similar to the `const T&` overload.
   //
-  // Implementation note: we could avoid all these rvalue-reference overloads
-  // if the existing lvalue-reference overloads took their arguments by value
-  // instead. I think this would also let us omit the conversion assignment
-  // operator altogether, since we'd get the same functionality for free
-  // from the implicit conversion constructor and ordinary assignment.
-  // However, this could result in extra copy operations unless we use
-  // std::move to avoid them, and we can't use std::move because this code
-  // needs to be portable to C++03.
-  StatusOr(T&& value);  // NOLINT
-  template <typename U>
-  StatusOr(StatusOr<U>&& other);
+  // REQUIRES: T is move constructible.
+  StatusOr(T&& value);
 
-  // Returns a reference to our status. If this contains a T, then
-  // returns Status::OK.
-  const Status& status() const { return status_; }
+  // RValue versions of the operations declared above.
+  StatusOr(Status&& status);
+  StatusOr& operator=(Status&& status);
 
   // Returns this->status().ok()
-  bool ok() const { return status_.ok(); }
+  bool ok() const { return this->status_.ok(); }
+
+  // Returns a reference to our status. If this contains a T, then
+  // returns Status::OK().
+  const Status& status() const &;
+  Status status() &&;
 
   // Returns a reference to our current value, or CHECK-fails if !this->ok().
-  const T& ValueOrDie() const;
-  T& ValueOrDie();
+  //
+  // Note: for value types that are cheap to copy, prefer simple code:
+  //
+  //   T value = statusor.ValueOrDie();
+  //
+  // Otherwise, if the value type is expensive to copy, but can be left
+  // in the StatusOr, simply assign to a reference:
+  //
+  //   T& value = statusor.ValueOrDie();  // or `const T&`
+  //
+  // Otherwise, if the value type supports an efficient move, it can be
+  // used as follows:
+  //
+  //   T value = std::move(statusor).ValueOrDie();
+  //
+  // The std::move on statusor instead of on the whole expression enables
+  // warnings about possible uses of the statusor object after the move.
+  // C++ style guide waiver for ref-qualified overloads granted in cl/143176389
+  // See go/ref-qualifiers for more details on such overloads.
+  const T& ValueOrDie() const &;
+  T& ValueOrDie() &;
+  const T&& ValueOrDie() const &&;
+  T&& ValueOrDie() &&;
 
-  // Moves our current value out of this object and returns it, or CHECK-fails
-  // if !this->ok().
-  // Use of this method is discouraged; prefer std::move(statusor.ValueOrDie())
-  // instead.
   T ConsumeValueOrDie() { return std::move(ValueOrDie()); }
 
- private:
-  Status status_;
-  T value_;
-};
-
-// Partial specialization for when T is not copy-constructible. This uses all
-// methods from the core implementation, but removes copy assignment and copy
-// construction.
-template <typename T>
-class StatusOr<T, false> : public StatusOr<T, true> {
- public:
-  // Remove copies.
-  StatusOr(const StatusOr& other) = delete;
-  StatusOr& operator=(const StatusOr& other) = delete;
-  template <typename U>
-  StatusOr(const StatusOr<U>& other) = delete;
-  StatusOr(const T& value) = delete;
-
-  // Use the superclass version for other constructors and operators.
-  StatusOr() = default;
-  StatusOr(StatusOr&& other) = default;
-  StatusOr& operator=(StatusOr&& other) = default;
-  StatusOr(T&& value)  // NOLINT
-      : StatusOr<T, true>::StatusOr(std::move(value)) {}
-  StatusOr(Status status)  // NOLINT
-      : StatusOr<T, true>::StatusOr(std::move(status)) {}
-  template <typename U>
-  StatusOr(StatusOr<U>&& other)  // NOLINT
-      : StatusOr<T, true>::StatusOr(std::move(other)) {}
+  // Ignores any errors. This method does nothing except potentially suppress
+  // complaints from any tools that are checking that errors are not dropped on
+  // the floor.
+  void IgnoreError() const;
 };
 
 ////////////////////////////////////////////////////////////////////////////////
 // Implementation details for StatusOr<T>
 
-namespace internal {
+template <typename T>
+StatusOr<T>::StatusOr() : Base(Status(tensorflow::error::UNKNOWN, "")) {}
 
-class StatusOrHelper {
- public:
-  // Move type-agnostic error handling to the .cc.
-  static Status HandleInvalidStatusCtorArg();
-  static Status HandleNullObjectCtorArg();
-  static void Crash(const Status& status);
-
-  // Customized behavior for StatusOr<T> vs. StatusOr<T*>
-  template <typename T>
-  struct Specialize;
-};
+template <typename T>
+StatusOr<T>::StatusOr(const T& value) : Base(value) {}
 
 template <typename T>
-struct StatusOrHelper::Specialize {
-  // For non-pointer T, a reference can never be NULL.
-  static inline bool IsValueNull(const T& t) { return false; }
-};
+StatusOr<T>::StatusOr(const Status& status) : Base(status) {}
 
 template <typename T>
-struct StatusOrHelper::Specialize<T*> {
-  static inline bool IsValueNull(const T* t) { return t == NULL; }
-};
+StatusOr<T>& StatusOr<T>::operator=(const Status& status) {
+  this->Assign(status);
+  return *this;
+}
 
-}  // namespace internal
+template <typename T>
+StatusOr<T>::StatusOr(T&& value) : Base(std::move(value)) {}
 
-template <typename T, bool CopyConstructible>
-inline StatusOr<T, CopyConstructible>::StatusOr()
-    : status_(tensorflow::error::UNKNOWN, "") {}
+template <typename T>
+StatusOr<T>::StatusOr(Status&& status) : Base(std::move(status)) {}
 
-template <typename T, bool CopyConstructible>
-inline StatusOr<T, CopyConstructible>::StatusOr(Status status)
-    : status_(std::move(status)) {
-  if (status_.ok()) {
-    status_ = internal::StatusOrHelper::HandleInvalidStatusCtorArg();
-  }
+template <typename T>
+StatusOr<T>& StatusOr<T>::operator=(Status&& status) {
+  this->Assign(std::move(status));
+  return *this;
 }
 
-template <typename T, bool CopyConstructible>
-inline StatusOr<T, CopyConstructible>::StatusOr(const T& value)
-    : value_(value) {
-  if (internal::StatusOrHelper::Specialize<T>::IsValueNull(value)) {
-    status_ = internal::StatusOrHelper::HandleNullObjectCtorArg();
-  }
-}
+template <typename T>
+template <typename U>
+inline StatusOr<T>::StatusOr(const StatusOr<U>& other)
+    : Base(static_cast<const typename StatusOr<U>::Base&>(other)) {}
 
-template <typename T, bool CopyConstructible>
+template <typename T>
 template <typename U>
-inline StatusOr<T, CopyConstructible>::StatusOr(const StatusOr<U>& other)
-    : status_(other.status_), value_(other.value_) {}
-
-template <typename T, bool CopyConstructible>
-inline StatusOr<T, CopyConstructible>::StatusOr(T&& value)
-    : value_(std::move(value)) {
-  if (internal::StatusOrHelper::Specialize<T>::IsValueNull(value_)) {
-    status_ = internal::StatusOrHelper::HandleNullObjectCtorArg();
-  }
+inline StatusOr<T>& StatusOr<T>::operator=(const StatusOr<U>& other) {
+  if (other.ok())
+    this->Assign(other.ValueOrDie());
+  else
+    this->Assign(other.status());
+  return *this;
 }
 
-template <typename T, bool CopyConstructible>
+template <typename T>
 template <typename U>
-inline StatusOr<T, CopyConstructible>::StatusOr(StatusOr<U>&& other)
-    : status_(std::move(other.status_)), value_(std::move(other.value_)) {}
+inline StatusOr<T>::StatusOr(StatusOr<U>&& other)
+    : Base(static_cast<typename StatusOr<U>::Base&&>(other)) {}
 
-template <typename T, bool CopyConstructible>
-inline const T& StatusOr<T, CopyConstructible>::ValueOrDie() const {
-  if (!ok()) {
-    internal::StatusOrHelper::Crash(status());
+template <typename T>
+template <typename U>
+inline StatusOr<T>& StatusOr<T>::operator=(StatusOr<U>&& other) {
+  if (other.ok()) {
+    this->Assign(std::move(other).ValueOrDie());
+  } else {
+    this->Assign(std::move(other).status());
   }
-  return value_;
+  return *this;
 }
 
-template <typename T, bool CopyConstructible>
-inline T& StatusOr<T, CopyConstructible>::ValueOrDie() {
-  if (!status_.ok()) {
-    internal::StatusOrHelper::Crash(status());
-  }
-  return value_;
+template <typename T>
+const Status& StatusOr<T>::status() const & {
+  return this->status_;
+}
+template <typename T>
+Status StatusOr<T>::status() && {
+  return ok() ? Status::OK() : std::move(this->status_);
+}
+
+template <typename T>
+const T& StatusOr<T>::ValueOrDie() const & {
+  this->EnsureOk();
+  return this->data_;
+}
+
+template <typename T>
+T& StatusOr<T>::ValueOrDie() & {
+  this->EnsureOk();
+  return this->data_;
+}
+
+template <typename T>
+const T&& StatusOr<T>::ValueOrDie() const && {
+  this->EnsureOk();
+  return std::move(this->data_);
+}
+
+template <typename T>
+T&& StatusOr<T>::ValueOrDie() && {
+  this->EnsureOk();
+  return std::move(this->data_);
+}
+
+template <typename T>
+void StatusOr<T>::IgnoreError() const {
+  // no-op
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/statusor_internals.h b/tensorflow/compiler/xla/statusor_internals.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2fda5bb3c6f11c20fc45c57885b1ce7523db81d
--- /dev/null
+++ b/tensorflow/compiler/xla/statusor_internals.h
@@ -0,0 +1,245 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_
+
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace xla {
+namespace internal_statusor {
+
+class Helper {
+ public:
+  // Move type-agnostic error handling to the .cc.
+  static void HandleInvalidStatusCtorArg(Status*);
+  TF_ATTRIBUTE_NORETURN static void Crash(const Status& status);
+};
+
+// Construct an instance of T in `p` through placement new, passing Args... to
+// the constructor.
+// This abstraction is here mostly for the gcc performance fix.
+template <typename T, typename... Args>
+void PlacementNew(void* p, Args&&... args) {
+#if defined(__GNUC__) && !defined(__clang__)
+  // Teach gcc that 'p' cannot be null, fixing code size issues.
+  if (p == nullptr) __builtin_unreachable();
+#endif
+  new (p) T(std::forward<Args>(args)...);
+}
+
+// Helper base class to hold the data and all operations.
+// We move all this to a base class to allow mixing with the appropriate
+// TraitsBase specialization.
+template <typename T>
+class StatusOrData {
+  template <typename U>
+  friend class StatusOrData;
+
+ public:
+  StatusOrData() = delete;
+
+  StatusOrData(const StatusOrData& other) {
+    if (other.ok()) {
+      MakeValue(other.data_);
+      MakeStatus();
+    } else {
+      MakeStatus(other.status_);
+    }
+  }
+
+  StatusOrData(StatusOrData&& other) noexcept {
+    if (other.ok()) {
+      MakeValue(std::move(other.data_));
+      MakeStatus();
+    } else {
+      MakeStatus(std::move(other.status_));
+    }
+  }
+
+  template <typename U>
+  StatusOrData(const StatusOrData<U>& other) {
+    if (other.ok()) {
+      MakeValue(other.data_);
+      MakeStatus();
+    } else {
+      MakeStatus(other.status_);
+    }
+  }
+
+  template <typename U>
+  StatusOrData(StatusOrData<U>&& other) {
+    if (other.ok()) {
+      MakeValue(std::move(other.data_));
+      MakeStatus();
+    } else {
+      MakeStatus(std::move(other.status_));
+    }
+  }
+
+  explicit StatusOrData(const T& value) : data_(value) { MakeStatus(); }
+  explicit StatusOrData(T&& value) : data_(std::move(value)) { MakeStatus(); }
+
+  explicit StatusOrData(const Status& status) : status_(status) {
+    EnsureNotOk();
+  }
+  explicit StatusOrData(Status&& status) : status_(std::move(status)) {
+    EnsureNotOk();
+  }
+
+  StatusOrData& operator=(const StatusOrData& other) {
+    if (this == &other) return *this;
+    if (other.ok())
+      Assign(other.data_);
+    else
+      Assign(other.status_);
+    return *this;
+  }
+
+  StatusOrData& operator=(StatusOrData&& other) {
+    if (this == &other) return *this;
+    if (other.ok())
+      Assign(std::move(other.data_));
+    else
+      Assign(std::move(other.status_));
+    return *this;
+  }
+
+  ~StatusOrData() {
+    if (ok()) {
+      status_.~Status();
+      data_.~T();
+    } else {
+      status_.~Status();
+    }
+  }
+
+  void Assign(const T& value) {
+    if (ok()) {
+      data_.~T();
+      MakeValue(value);
+    } else {
+      MakeValue(value);
+      status_ = Status::OK();
+    }
+  }
+
+  void Assign(T&& value) {
+    if (ok()) {
+      data_.~T();
+      MakeValue(std::move(value));
+    } else {
+      MakeValue(std::move(value));
+      status_ = Status::OK();
+    }
+  }
+
+  void Assign(const Status& status) {
+    Clear();
+    status_ = status;
+    EnsureNotOk();
+  }
+
+  void Assign(Status&& status) {
+    Clear();
+    status_ = std::move(status);
+    EnsureNotOk();
+  }
+
+  bool ok() const { return status_.ok(); }
+
+ protected:
+  // status_ will always be active after the constructor.
+  // We make it a union to be able to initialize exactly how we need without
+  // waste.
+  // Eg. in the copy constructor we use the default constructor of Status in
+  // the ok() path to avoid an extra Ref call.
+  union {
+    Status status_;
+  };
+
+  // data_ is active iff status_.ok()==true
+  struct Dummy {};
+  union {
+    // When T is const, we need some non-const object we can cast to void* for
+    // the placement new. dummy_ is that object.
+    Dummy dummy_;
+    T data_;
+  };
+
+  void Clear() {
+    if (ok()) data_.~T();
+  }
+
+  void EnsureOk() const {
+    if (!ok()) Helper::Crash(status_);
+  }
+
+  void EnsureNotOk() {
+    if (ok()) Helper::HandleInvalidStatusCtorArg(&status_);
+  }
+
+  // Construct the value (ie. data_) through placement new with the passed
+  // argument.
+  template <typename Arg>
+  void MakeValue(Arg&& arg) {
+    internal_statusor::PlacementNew<T>(&dummy_, std::forward<Arg>(arg));
+  }
+
+  // Construct the status (ie. status_) through placement new with the passed
+  // argument.
+  template <typename... Args>
+  void MakeStatus(Args&&... args) {
+    internal_statusor::PlacementNew<Status>(&status_,
+                                            std::forward<Args>(args)...);
+  }
+};
+
+// Helper base class to allow implicitly deleted constructors and assignment
+// operations in StatusOr.
+// TraitsBase will explicitly delete what it can't support and StatusOr will
+// inherit that behavior implicitly.
+template <bool Copy, bool Move>
+struct TraitsBase {
+  TraitsBase() = default;
+  TraitsBase(const TraitsBase&) = default;
+  TraitsBase(TraitsBase&&) = default;
+  TraitsBase& operator=(const TraitsBase&) = default;
+  TraitsBase& operator=(TraitsBase&&) = default;
+};
+
+template <>
+struct TraitsBase<false, true> {
+  TraitsBase() = default;
+  TraitsBase(const TraitsBase&) = delete;
+  TraitsBase(TraitsBase&&) = default;
+  TraitsBase& operator=(const TraitsBase&) = delete;
+  TraitsBase& operator=(TraitsBase&&) = default;
+};
+
+template <>
+struct TraitsBase<false, false> {
+  TraitsBase() = default;
+  TraitsBase(const TraitsBase&) = delete;
+  TraitsBase(TraitsBase&&) = delete;
+  TraitsBase& operator=(const TraitsBase&) = delete;
+  TraitsBase& operator=(TraitsBase&&) = delete;
+};
+
+}  // namespace internal_statusor
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_
diff --git a/tensorflow/compiler/xla/statusor_test.cc b/tensorflow/compiler/xla/statusor_test.cc
index f8555113f816d933423bdf38741d18a574ddd9ce..5fa2211ac66177514ac8ecabfa8791e7c8c014a2 100644
--- a/tensorflow/compiler/xla/statusor_test.cc
+++ b/tensorflow/compiler/xla/statusor_test.cc
@@ -29,8 +29,6 @@ limitations under the License.
 namespace xla {
 namespace {
 
-using tensorflow::Status;
-
 class Base1 {
  public:
   virtual ~Base1() {}
@@ -59,6 +57,14 @@ class CopyNoAssign {
   const CopyNoAssign& operator=(const CopyNoAssign&);
 };
 
+class NoDefaultConstructor {
+ public:
+  explicit NoDefaultConstructor(int foo);
+};
+
+static_assert(!std::is_default_constructible<NoDefaultConstructor>(),
+              "Should not be default-constructible.");
+
 StatusOr<std::unique_ptr<int>> ReturnUniquePtr() {
   // Uses implicit constructor from T&&
   return std::unique_ptr<int>(new int(0));
@@ -69,6 +75,18 @@ TEST(StatusOr, ElementType) {
   static_assert(std::is_same<StatusOr<char>::element_type, char>(), "");
 }
 
+TEST(StatusOr, TestNoDefaultConstructorInitialization) {
+  // Explicitly initialize it with an error code.
+  StatusOr<NoDefaultConstructor> statusor(tensorflow::errors::Cancelled(""));
+  EXPECT_FALSE(statusor.ok());
+  EXPECT_EQ(statusor.status().code(), tensorflow::error::CANCELLED);
+
+  // Default construction of StatusOr initializes it with an UNKNOWN error code.
+  StatusOr<NoDefaultConstructor> statusor2;
+  EXPECT_FALSE(statusor2.ok());
+  EXPECT_EQ(statusor2.status().code(), tensorflow::error::UNKNOWN);
+}
+
 TEST(StatusOr, TestMoveOnlyInitialization) {
   StatusOr<std::unique_ptr<int>> thing(ReturnUniquePtr());
   ASSERT_TRUE(thing.ok());
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 13dd1a30b60a64171425f2a7d872da9bb2ca5380..a94ff9db899c52f1005cdae84ede2209467bcb8f 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -26,6 +26,7 @@ filegroup(
 
 load("//tensorflow/compiler/xla:xla.bzl", "export_dynamic_linkopts")
 load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
+load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test_library")
 load("//tensorflow/compiler/xla/tests:build_defs.bzl", "generate_backend_suites")
 load("//tensorflow/compiler/xla/tests:build_defs.bzl", "generate_backend_test_macros")
 
@@ -94,11 +95,11 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
-        "//tensorflow/compiler/xla/legacy_flags:hlo_test_base_flags",
         "//tensorflow/compiler/xla/service",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:computation_layout",
+        "//tensorflow/compiler/xla/service:computation_placer",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_execution_profile",
@@ -116,8 +117,12 @@ cc_binary(
     name = "local_client_aot_test_helper",
     srcs = ["local_client_aot_test_helper.cc"],
     deps = [
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
     ],
 )
@@ -139,6 +144,7 @@ cc_library(
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
         "//tensorflow/compiler/xla:array4d",
+        "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -151,7 +157,6 @@ cc_library(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
@@ -171,6 +176,7 @@ cc_library(
     deps = [
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:executable",
@@ -196,12 +202,14 @@ cc_library(
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/service:computation_placer",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/service:transfer_manager",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//third_party/eigen3",
@@ -213,12 +221,13 @@ xla_test(
     srcs = ["bad_rng_shape_validation_test.cc"],
     deps = [
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/core:lib",
@@ -233,12 +242,12 @@ xla_test(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/core:test",
@@ -255,7 +264,6 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/core:test",
@@ -268,6 +276,7 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
@@ -275,7 +284,6 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
@@ -291,7 +299,6 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -307,6 +314,7 @@ xla_test(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
@@ -315,7 +323,6 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -339,7 +346,6 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -356,7 +362,6 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/core:test",
@@ -371,7 +376,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -388,7 +393,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -409,7 +414,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -422,12 +427,13 @@ xla_test(
     srcs = ["deallocation_test.cc"],
     deps = [
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -441,13 +447,14 @@ xla_test(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -471,9 +478,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
-        "//tensorflow/compiler/xla/legacy_flags:user_computation_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -481,36 +486,29 @@ xla_test(
 )
 
 xla_test(
-    name = "dot_operation_test",
-    srcs = ["dot_operation_test.cc"],
+    name = "reduce_precision_test",
+    srcs = ["reduce_precision_test.cc"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
-        "//tensorflow/compiler/xla:array3d",
-        "//tensorflow/compiler/xla:reference_util",
+        "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_runtime_flags",
-        "//tensorflow/compiler/xla/legacy_flags:layout_util_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
-        "//tensorflow/compiler/xla/tests:test_utils",
-        "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
-        "//tensorflow/core:test",
     ],
 )
 
-# Tests the dot operation in some cases that can be performed via a
-# runtime call on some backends - e.g. a runtime call to to Eigen.
 xla_test(
-    name = "dot_operation_runtime_test",
+    name = "dot_operation_test",
     srcs = ["dot_operation_test.cc"],
-    backend_args = {
-        "cpu": ["--xla_cpu_use_eigen"],
-        "cpu_parallel": ["--xla_cpu_use_eigen"],
-    },
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -518,9 +516,7 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_runtime_flags",
-        "//tensorflow/compiler/xla/legacy_flags:layout_util_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -530,20 +526,11 @@ xla_test(
     ],
 )
 
-# Repeat dot_operation_runtime_test with single-threded eigen.
+# Tests the dot operation in some cases that can be performed via a
+# runtime call on some backends - e.g. a runtime call to Eigen.
 xla_test(
-    name = "dot_operation_single_threaded_runtime_test",
+    name = "dot_operation_runtime_test",
     srcs = ["dot_operation_test.cc"],
-    backend_args = {
-        "cpu": [
-            "--xla_cpu_use_eigen",
-            "--xla_cpu_multi_thread_eigen=false",
-        ],
-        "cpu_parallel": [
-            "--xla_cpu_use_eigen",
-            "--xla_cpu_multi_thread_eigen=false",
-        ],
-    },
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -551,9 +538,7 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_runtime_flags",
-        "//tensorflow/compiler/xla/legacy_flags:layout_util_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -563,17 +548,16 @@ xla_test(
     ],
 )
 
+# Repeat dot_operation_runtime_test with single-threded eigen.
 xla_test(
-    name = "dot_operation_rowmajor_runtime_test",
+    name = "dot_operation_single_threaded_runtime_test",
     srcs = ["dot_operation_test.cc"],
     backend_args = {
         "cpu": [
-            "--xla_cpu_use_eigen",
-            "--xla_default_layout=major2minor",
+            "--xla_cpu_multi_thread_eigen=false",
         ],
         "cpu_parallel": [
-            "--xla_cpu_use_eigen",
-            "--xla_default_layout=major2minor",
+            "--xla_cpu_multi_thread_eigen=false",
         ],
     },
     deps = [
@@ -583,9 +567,7 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_runtime_flags",
-        "//tensorflow/compiler/xla/legacy_flags:layout_util_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -605,7 +587,7 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -624,7 +606,7 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -650,7 +632,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -677,7 +659,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -694,12 +676,13 @@ xla_test(
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -710,19 +693,29 @@ xla_test(
 xla_test(
     name = "batch_normalization_test",
     srcs = ["batch_normalization_test.cc"],
+    shard_count = 40,
     deps = [
+        ":test_utils",
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -738,7 +731,7 @@ xla_test(
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -754,7 +747,7 @@ xla_test(
         "//tensorflow/compiler/xla:array3d",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:test",
@@ -768,11 +761,13 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:reference_util",
+        "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla/service:computation_placer",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
@@ -799,7 +794,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:test",
@@ -816,7 +811,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:test",
@@ -842,7 +837,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -850,11 +845,14 @@ xla_test(
     ],
 )
 
-xla_test(
-    name = "reduce_window_test",
-    timeout = "long",
+# External xla_test targets can add "reduce_window_test_library" to xla_test_library_deps, in order
+# to refer to the cc_library compiled with the correct backend macros. The following test target
+# "reduce_window_test" is an example.
+xla_test_library(
+    name = "reduce_window_test_library",
     srcs = ["reduce_window_test.cc"],
     deps = [
+        ":test_macros_header",
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
         "//tensorflow/compiler/xla:array4d",
@@ -865,7 +863,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -873,6 +871,14 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "reduce_window_test",
+    timeout = "long",
+    srcs = [],
+    xla_test_library_deps = [":reduce_window_test_library"],
+    deps = [],
+)
+
 xla_test(
     name = "select_and_scatter_test",
     timeout = "long",
@@ -889,7 +895,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -906,7 +912,7 @@ xla_test(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -921,10 +927,11 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -941,11 +948,12 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
+        "//tensorflow/core:test",
     ],
 )
 
@@ -958,7 +966,7 @@ xla_test(
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:test",
@@ -976,8 +984,7 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
-        "//tensorflow/compiler/xla/legacy_flags:user_computation_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
     ],
@@ -995,7 +1002,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -1009,7 +1016,7 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:test",
@@ -1022,7 +1029,7 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:test",
@@ -1044,7 +1051,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -1058,11 +1065,12 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -1081,13 +1089,14 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -1103,7 +1112,7 @@ xla_test(
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -1125,7 +1134,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -1142,11 +1151,12 @@ xla_test(
         "//tensorflow/compiler/xla:array3d",
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:test",
@@ -1161,7 +1171,6 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1184,7 +1193,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -1199,7 +1208,7 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -1215,13 +1224,14 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
@@ -1240,7 +1250,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -1262,7 +1272,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:session_proto",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1279,7 +1289,7 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1298,7 +1308,7 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -1315,8 +1325,14 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/client:client_library",
+        "//tensorflow/compiler/xla/client:computation",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -1324,6 +1340,31 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "multioutput_fusion_test",
+    srcs = ["multioutput_fusion_test.cc"],
+    deps = [
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:client_library",
+        "//tensorflow/compiler/xla/client:computation",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_test(
     name = "local_client_aot_test",
     srcs = [
@@ -1333,6 +1374,7 @@ cc_test(
     linkstatic = 1,
     deps = [
         "//tensorflow/compiler/xla:executable_run_options",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
@@ -1347,9 +1389,9 @@ cc_test(
         ":local_client_test_base",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:computation_tracker",
         "//tensorflow/compiler/xla/service:local_service",
-        "//tensorflow/core:lib",
         "//tensorflow/core:test_main",
     ],
 )
@@ -1365,7 +1407,7 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1381,7 +1423,7 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1407,7 +1449,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:lib",
@@ -1415,6 +1457,15 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "deep_graph_test",
+    srcs = ["deep_graph_test.cc"],
+    deps = [
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+    ],
+)
+
 cc_test(
     name = "literal_test_util_test",
     srcs = ["literal_test_util_test.cc"],
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index c07f2745fe9e67898148bf0026ac32534eac506c..fb913e200ffa2ea64cb4014fe3d62efafcfb2bfa 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -26,9 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/user_computation_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -45,7 +43,7 @@ namespace {
 
 class ArrayElementwiseOpTest : public ClientLibraryTestBase {
  public:
-  ErrorSpec error_spec_{0.0001};
+  ErrorSpec error_spec_{0.0001, 0.0001};
 };
 
 class ArrayElementwiseOpTestParamCount
@@ -158,13 +156,13 @@ TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) {
     b_values.push_back(2 * i / static_cast<float>(count + 2));
   }
 
-  std::unique_ptr<Literal> a_literal = LiteralUtil::CreateR1<float>({a_values});
+  std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({a_values});
   std::unique_ptr<GlobalData> a_data =
       client_->TransferToServer(*a_literal).ConsumeValueOrDie();
   auto a_constant = builder.ConstantR1<float>(a_values);
   auto a_param = builder.Parameter(0, a_literal->shape(), "a_param");
 
-  std::unique_ptr<Literal> b_literal = LiteralUtil::CreateR1<float>({b_values});
+  std::unique_ptr<Literal> b_literal = Literal::CreateR1<float>({b_values});
   std::unique_ptr<GlobalData> b_data =
       client_->TransferToServer(*b_literal).ConsumeValueOrDie();
   auto b_constant = builder.Parameter(1, a_literal->shape(), "b_param");
@@ -804,7 +802,7 @@ TEST_F(ArrayElementwiseOpTest, PowSpecialF32) {
   std::vector<float> values = {1.0f, 2.0f, 3.2f, -4.0f};
   std::vector<float> exponents = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
 
-  std::unique_ptr<Literal> param_literal = LiteralUtil::CreateR1<float>(values);
+  std::unique_ptr<Literal> param_literal = Literal::CreateR1<float>(values);
   std::unique_ptr<GlobalData> param_data =
       client_->TransferToServer(*param_literal).ConsumeValueOrDie();
 
@@ -826,6 +824,244 @@ TEST_F(ArrayElementwiseOpTest, PowSpecialF32) {
   ComputeAndCompareR1<float>(&b, expected, {param_data.get()}, error_spec_);
 }
 
+TEST_F(ArrayElementwiseOpTest, PowOfExpF32) {
+  ComputationBuilder b(client_, TestName());
+
+  std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f};
+  std::vector<float> values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
+
+  std::unique_ptr<Literal> literal0 = Literal::CreateR1<float>(values0);
+  std::unique_ptr<GlobalData> data0 =
+      client_->TransferToServer(*literal0).ConsumeValueOrDie();
+  std::unique_ptr<Literal> literal1 = Literal::CreateR1<float>(values1);
+  std::unique_ptr<GlobalData> data1 =
+      client_->TransferToServer(*literal1).ConsumeValueOrDie();
+  auto param0 = b.Parameter(0, literal0->shape(), "param0");
+  auto param1 = b.Parameter(1, literal1->shape(), "param1");
+  b.Pow(b.Exp(param0), param1);
+
+  std::vector<float> expected(values0.size());
+  for (int64 i = 0; i < values0.size(); ++i) {
+    expected[i] = std::pow(std::exp(values0[i]), values1[i]);
+  }
+
+  ComputeAndCompareR1<float>(&b, expected, {data0.get(), data1.get()},
+                             error_spec_);
+}
+
+TEST_F(ArrayElementwiseOpTest, LogOfPowerF32) {
+  ComputationBuilder b(client_, TestName());
+
+  std::vector<float> values0 = {1.0f, 2.0f, 3.2f, 4.0f, 0.5f, 5.7f};
+  std::vector<float> values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
+
+  std::unique_ptr<Literal> literal0 = Literal::CreateR1<float>(values0);
+  std::unique_ptr<GlobalData> data0 =
+      client_->TransferToServer(*literal0).ConsumeValueOrDie();
+  std::unique_ptr<Literal> literal1 = Literal::CreateR1<float>(values1);
+  std::unique_ptr<GlobalData> data1 =
+      client_->TransferToServer(*literal1).ConsumeValueOrDie();
+  auto param0 = b.Parameter(0, literal0->shape(), "param0");
+  auto param1 = b.Parameter(1, literal1->shape(), "param1");
+  b.Log(b.Pow(param0, param1));
+
+  std::vector<float> expected(values0.size());
+  for (int64 i = 0; i < values0.size(); ++i) {
+    expected[i] = std::log(std::pow(values0[i], values1[i]));
+  }
+
+  ComputeAndCompareR1<float>(&b, expected, {data0.get(), data1.get()},
+                             error_spec_);
+}
+
+TEST_F(ArrayElementwiseOpTest, MulOfExpF32) {
+  ComputationBuilder b(client_, TestName());
+
+  std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f};
+  std::vector<float> values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
+
+  std::unique_ptr<Literal> literal0 = Literal::CreateR1<float>(values0);
+  std::unique_ptr<GlobalData> data0 =
+      client_->TransferToServer(*literal0).ConsumeValueOrDie();
+  std::unique_ptr<Literal> literal1 = Literal::CreateR1<float>(values1);
+  std::unique_ptr<GlobalData> data1 =
+      client_->TransferToServer(*literal1).ConsumeValueOrDie();
+  auto param0 = b.Parameter(0, literal0->shape(), "param0");
+  auto param1 = b.Parameter(1, literal1->shape(), "param1");
+  b.Mul(b.Exp(param0), b.Exp(param1));
+
+  std::vector<float> expected(values0.size());
+  for (int64 i = 0; i < values0.size(); ++i) {
+    expected[i] = std::exp(values0[i]) * std::exp(values1[i]);
+  }
+
+  ComputeAndCompareR1<float>(&b, expected, {data0.get(), data1.get()},
+                             error_spec_);
+}
+
+TEST_F(ArrayElementwiseOpTest, DivOfExpF32) {
+  ComputationBuilder b(client_, TestName());
+
+  std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f};
+  std::vector<float> values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
+
+  std::unique_ptr<Literal> literal0 = Literal::CreateR1<float>(values0);
+  std::unique_ptr<GlobalData> data0 =
+      client_->TransferToServer(*literal0).ConsumeValueOrDie();
+  std::unique_ptr<Literal> literal1 = Literal::CreateR1<float>(values1);
+  std::unique_ptr<GlobalData> data1 =
+      client_->TransferToServer(*literal1).ConsumeValueOrDie();
+  auto param0 = b.Parameter(0, literal0->shape(), "param0");
+  auto param1 = b.Parameter(1, literal1->shape(), "param1");
+  b.Div(param0, b.Exp(param1));
+
+  std::vector<float> expected(values0.size());
+  for (int64 i = 0; i < values0.size(); ++i) {
+    expected[i] = values0[i] / std::exp(values1[i]);
+  }
+
+  ComputeAndCompareR1<float>(&b, expected, {data0.get(), data1.get()},
+                             error_spec_);
+}
+
+TEST_F(ArrayElementwiseOpTest, Div3_lhs_F32) {
+  ComputationBuilder b(client_, TestName());
+
+  std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
+  std::vector<float> values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
+  std::vector<float> values2 = {0.1f, 1.1f, 6.9f, 12.5f, -15.0f, -0.5f};
+
+  std::unique_ptr<Literal> literal0 = Literal::CreateR1<float>(values0);
+  std::unique_ptr<GlobalData> data0 =
+      client_->TransferToServer(*literal0).ConsumeValueOrDie();
+
+  std::unique_ptr<Literal> literal1 = Literal::CreateR1<float>(values1);
+  std::unique_ptr<GlobalData> data1 =
+      client_->TransferToServer(*literal1).ConsumeValueOrDie();
+
+  std::unique_ptr<Literal> literal2 = Literal::CreateR1<float>(values2);
+  std::unique_ptr<GlobalData> data2 =
+      client_->TransferToServer(*literal2).ConsumeValueOrDie();
+  auto param0 = b.Parameter(0, literal0->shape(), "param0");
+  auto param1 = b.Parameter(1, literal1->shape(), "param1");
+  auto param2 = b.Parameter(2, literal2->shape(), "param2");
+  b.Div(b.Div(param0, param1), param2);
+
+  std::vector<float> expected(values0.size());
+  for (int64 i = 0; i < values0.size(); ++i) {
+    expected[i] = (values0[i] / values1[i]) / values2[i];
+  }
+
+  ComputeAndCompareR1<float>(
+      &b, expected, {data0.get(), data1.get(), data2.get()}, error_spec_);
+}
+
+TEST_F(ArrayElementwiseOpTest, Div3_rhs_F32) {
+  ComputationBuilder b(client_, TestName());
+
+  std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
+  std::vector<float> values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
+  std::vector<float> values2 = {0.1f, 1.1f, 6.9f, 12.5f, -15.0f, -0.5f};
+
+  std::unique_ptr<Literal> literal0 = Literal::CreateR1<float>(values0);
+  std::unique_ptr<GlobalData> data0 =
+      client_->TransferToServer(*literal0).ConsumeValueOrDie();
+
+  std::unique_ptr<Literal> literal1 = Literal::CreateR1<float>(values1);
+  std::unique_ptr<GlobalData> data1 =
+      client_->TransferToServer(*literal1).ConsumeValueOrDie();
+
+  std::unique_ptr<Literal> literal2 = Literal::CreateR1<float>(values2);
+  std::unique_ptr<GlobalData> data2 =
+      client_->TransferToServer(*literal2).ConsumeValueOrDie();
+
+  auto param0 = b.Parameter(0, literal0->shape(), "param0");
+  auto param1 = b.Parameter(1, literal1->shape(), "param1");
+  auto param2 = b.Parameter(2, literal2->shape(), "param2");
+  b.Div(param0, b.Div(param1, param2));
+
+  std::vector<float> expected(values0.size());
+  for (int64 i = 0; i < values0.size(); ++i) {
+    expected[i] = values0[i] / (values1[i] / values2[i]);
+  }
+
+  ComputeAndCompareR1<float>(
+      &b, expected, {data0.get(), data1.get(), data2.get()}, error_spec_);
+}
+
+TEST_F(ArrayElementwiseOpTest, DivOfPowerF32) {
+  ComputationBuilder b(client_, TestName());
+
+  std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
+  std::vector<float> values1 = {0.1f, 1.0f, 2.0f, 0.5f, 1.0f, 0.5f};
+  std::vector<float> values2 = {0.1f, 1.1f, 6.9f, 9.5f, -11.0f, -0.5f};
+
+  std::unique_ptr<Literal> literal0 = Literal::CreateR1<float>(values0);
+  std::unique_ptr<GlobalData> data0 =
+      client_->TransferToServer(*literal0).ConsumeValueOrDie();
+
+  std::unique_ptr<Literal> literal1 = Literal::CreateR1<float>(values1);
+  std::unique_ptr<GlobalData> data1 =
+      client_->TransferToServer(*literal1).ConsumeValueOrDie();
+
+  std::unique_ptr<Literal> literal2 = Literal::CreateR1<float>(values2);
+  std::unique_ptr<GlobalData> data2 =
+      client_->TransferToServer(*literal2).ConsumeValueOrDie();
+
+  auto param0 = b.Parameter(0, literal0->shape(), "param0");
+  auto param1 = b.Parameter(1, literal1->shape(), "param1");
+  auto param2 = b.Parameter(2, literal2->shape(), "param2");
+  b.Div(param0, b.Pow(param1, param2));
+
+  std::vector<float> expected(values0.size());
+  for (int64 i = 0; i < values0.size(); ++i) {
+    expected[i] = values0[i] / std::pow(values1[i], values2[i]);
+  }
+
+  ComputeAndCompareR1<float>(
+      &b, expected, {data0.get(), data1.get(), data2.get()}, error_spec_);
+}
+
+TEST_F(ArrayElementwiseOpTest, Div4F32) {
+  ComputationBuilder b(client_, TestName());
+
+  std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
+  std::vector<float> values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
+  std::vector<float> values2 = {0.1f, 1.1f, 6.9f, 12.5f, -15.0f, -0.5f};
+  std::vector<float> values3 = {2.1f, 3.1f, 9.9f, -4.5f, -11.0f, -21.5f};
+
+  std::unique_ptr<Literal> literal0 = Literal::CreateR1<float>(values0);
+  std::unique_ptr<GlobalData> data0 =
+      client_->TransferToServer(*literal0).ConsumeValueOrDie();
+
+  std::unique_ptr<Literal> literal1 = Literal::CreateR1<float>(values1);
+  std::unique_ptr<GlobalData> data1 =
+      client_->TransferToServer(*literal1).ConsumeValueOrDie();
+
+  std::unique_ptr<Literal> literal2 = Literal::CreateR1<float>(values2);
+  std::unique_ptr<GlobalData> data2 =
+      client_->TransferToServer(*literal2).ConsumeValueOrDie();
+
+  std::unique_ptr<Literal> literal3 = Literal::CreateR1<float>(values3);
+  std::unique_ptr<GlobalData> data3 =
+      client_->TransferToServer(*literal3).ConsumeValueOrDie();
+
+  auto param0 = b.Parameter(0, literal0->shape(), "param0");
+  auto param1 = b.Parameter(1, literal1->shape(), "param1");
+  auto param2 = b.Parameter(2, literal2->shape(), "param2");
+  auto param3 = b.Parameter(3, literal3->shape(), "param2");
+  b.Div(b.Div(param0, param1), b.Div(param2, param3));
+
+  std::vector<float> expected(values0.size());
+  for (int64 i = 0; i < values0.size(); ++i) {
+    expected[i] = (values0[i] / values1[i]) / (values2[i] / values3[i]);
+  }
+
+  ComputeAndCompareR1<float>(
+      &b, expected, {data0.get(), data1.get(), data2.get(), data3.get()},
+      error_spec_);
+}
+
 TEST_P(ArrayElementwiseOpTestParamCount, SquareManyValues) {
   const int count = GetParam();
   ComputationBuilder builder(client_, TestName());
@@ -1241,12 +1477,12 @@ TEST_F(ArrayElementwiseOpTest, AddTwoParametersF32s) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR1<float>({1.1f, 2.2f, 3.3f, 5.5f});
+      Literal::CreateR1<float>({1.1f, 2.2f, 3.3f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   std::unique_ptr<Literal> param1_literal =
-      LiteralUtil::CreateR1<float>({7.2f, 2.3f, 3.4f, 5.6f});
+      Literal::CreateR1<float>({7.2f, 2.3f, 3.4f, 5.6f});
   std::unique_ptr<GlobalData> param1_data =
       client_->TransferToServer(*param1_literal).ConsumeValueOrDie();
 
@@ -1263,12 +1499,12 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersZeroElementF32s) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR3FromArray3D<float>(Array3D<float>(0, 7, 0));
+      Literal::CreateR3FromArray3D<float>(Array3D<float>(0, 7, 0));
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   std::unique_ptr<Literal> param1_literal =
-      LiteralUtil::CreateR3FromArray3D<float>(Array3D<float>(0, 7, 0));
+      Literal::CreateR3FromArray3D<float>(Array3D<float>(0, 7, 0));
   std::unique_ptr<GlobalData> param1_data =
       client_->TransferToServer(*param1_literal).ConsumeValueOrDie();
 
@@ -1285,7 +1521,7 @@ TEST_F(ArrayElementwiseOpTest, AddParameterToConstantF32s) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR1<float>({1.1f, 2.2f, 3.3f, 5.5f});
+      Literal::CreateR1<float>({1.1f, 2.2f, 3.3f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -1297,6 +1533,24 @@ TEST_F(ArrayElementwiseOpTest, AddParameterToConstantF32s) {
                              {param0_data.get()}, error_spec_);
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, CosF32s) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<float>({3.14159f, 0.0f, 1.570796f, -0.78539f});
+  auto result = builder.Cos(a);
+
+  ComputeAndCompareR1<float>(&builder, {-1.0f, 1.0f, 0.0f, 0.707107f}, {},
+                             error_spec_);
+}
+
+XLA_TEST_F(ArrayElementwiseOpTest, SinF32s) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<float>({3.14159f, 0.0f, 1.570796f, -0.78539f});
+  auto result = builder.Sin(a);
+
+  ComputeAndCompareR1<float>(&builder, {0.0f, 0.0f, 1.0f, -0.707107f}, {},
+                             error_spec_);
+}
+
 TEST_F(ArrayElementwiseOpTest, TanhF32s) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 3.14f, 2.25f});
@@ -1447,9 +1701,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Eq) {
   auto cmp_dim_1 = builder.Eq(v, m, /*broadcast_dimensions=*/{0});
   auto result = builder.Tuple({cmp_dim_0, cmp_dim_1});
 
-  auto expected = LiteralUtil::MakeTuple(
-      {LiteralUtil::CreateR2<bool>({{true, true}, {true, false}}).get(),
-       LiteralUtil::CreateR2<bool>({{true, false}, {false, false}}).get()});
+  auto expected = Literal::MakeTuple(
+      {Literal::CreateR2<bool>({{true, true}, {true, false}}).get(),
+       Literal::CreateR2<bool>({{true, false}, {false, false}}).get()});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
@@ -1802,7 +2056,7 @@ TEST_F(ArrayElementwiseOpTest, R4_16x16x2x2_Plus_R1_16) {
   std::iota(r1.begin(), r1.end(), 1.0);
 
   ComputationBuilder builder(client_, TestName());
-  std::unique_ptr<Literal> a_literal = LiteralUtil::CreateR4FromArray4D(r4);
+  std::unique_ptr<Literal> a_literal = Literal::CreateR4FromArray4D(r4);
   *a_literal->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({0, 1, 2, 3});
   auto a = builder.ConstantLiteral(*a_literal);
@@ -1838,8 +2092,8 @@ TEST_F(ArrayElementwiseOpTest, CannotAddOpaques) {
 // broadcast.
 TEST_F(ArrayElementwiseOpTest, ImplictBroadcastInFusedExpressions) {
   ComputationBuilder builder(client_, TestName());
-  auto x_literal = LiteralUtil::CreateR1<float>({1, 2, 3});
-  auto y_literal = LiteralUtil::CreateR1<float>({4, 5});
+  auto x_literal = Literal::CreateR1<float>({1, 2, 3});
+  auto y_literal = Literal::CreateR1<float>({4, 5});
   auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie();
   auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie();
 
@@ -1862,8 +2116,6 @@ INSTANTIATE_TEST_CASE_P(ArrayElementwiseOpTestParamCount,
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
-  xla::legacy_flags::AppendUserComputationFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/axpy_simple_test.cc b/tensorflow/compiler/xla/tests/axpy_simple_test.cc
index a1ca1de584f8be808d19a43680f7c093d4f94def..67dbc913b42c89bf5a8fb5b91da13a29e5e248f5 100644
--- a/tensorflow/compiler/xla/tests/axpy_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/axpy_simple_test.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -76,7 +75,6 @@ TEST_F(AxpySimpleTest, AxpyTenValues) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
index ea58491038c1dfcc8069b3c14833ade554be0d8a..02be0b5ab83c23fda36c5ccc65a598fc8e4a1600 100644
--- a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
+++ b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -70,7 +69,6 @@ TEST_F(BadRngShapeValidationTest, ShapeWithoutLayoutIsOk) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
index 6a47f1b718a1734de731ec50d7094ac529eca9df..d692a810325eae1ebe50e1ad84caf279d51666f1 100644
--- a/tensorflow/compiler/xla/tests/batch_normalization_test.cc
+++ b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
@@ -23,13 +23,22 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/reference_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
@@ -48,7 +57,7 @@ class BatchNormalizationTest : public ClientLibraryTestBase {
         {5.0f, 4.4f},   // p2
     });
     input_array_.FillWithPZ(pz);
-    input_literal_ = *LiteralUtil::CreateR4FromArray4D(input_array_);
+    input_literal_ = *Literal::CreateR4FromArray4D(input_array_);
     CHECK_EQ(kSamples, input_array_.planes());
     CHECK_EQ(kZ, input_array_.depth());
     CHECK_EQ(kY, input_array_.height());
@@ -190,13 +199,422 @@ TEST_F(BatchNormalizationTest, SpecComparisonForward) {
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
+struct BatchNormTestParam {
+  std::vector<int64> bounds;
+  int64 feature_index;
+  float random_value_mean;
+  float random_value_var;
+};
+
+// Tests to test the fused operation of BatchNorm.
+class BatchNormTest : public ClientLibraryTestBase,
+                      public ::testing::WithParamInterface<BatchNormTestParam> {
+};
+
+// TODO(b/62764704): Implement on GPU. Disabled on 2017-06-20.
+XLA_TEST_P(BatchNormTest, DISABLED_ON_GPU(RandomizedTests)) {
+  float epsilon = 0.001;
+  ComputationBuilder builder(client_, TestName());
+  const std::vector<int64>& bounds = GetParam().bounds;
+  Array4D<float> input_array(bounds[0], bounds[1], bounds[2], bounds[3]);
+  input_array.FillRandom(GetParam().random_value_var,
+                         GetParam().random_value_mean);
+
+  const int64 feature_index = GetParam().feature_index;
+  const int64 num_elements_per_feature =
+      Product(bounds) / bounds[feature_index];
+  const int64 feature_bound = bounds[feature_index];
+  std::vector<float> offset(feature_bound, 1);
+  std::vector<float> scale(feature_bound, 2);
+
+  auto input_squared =
+      ReferenceUtil::MapArray4D(input_array, [](float a) { return a * a; });
+  std::vector<int64> reduce_dims;
+  for (int64 i = 0; i < bounds.size(); ++i) {
+    if (i != feature_index) {
+      reduce_dims.push_back(i);
+    }
+  }
+
+  auto sum =
+      ReferenceUtil::Reduce4DTo1D(input_array, /*init=*/0.0f, reduce_dims,
+                                  [](float a, float b) { return a + b; });
+
+  auto sum_squared =
+      ReferenceUtil::Reduce4DTo1D(*input_squared, /*init=*/0.0f, reduce_dims,
+                                  [](float a, float b) { return a + b; });
+
+  std::vector<float> mean(feature_bound);
+
+  for (int64 i = 0; i < feature_bound; ++i) {
+    mean[i] = sum[i] / num_elements_per_feature;
+  }
+
+  std::vector<float> mean_square(feature_bound);
+  for (int64 i = 0; i < feature_bound; ++i) {
+    mean_square[i] = mean[i] * mean[i];
+  }
+
+  std::vector<float> square_mean(feature_bound);
+  for (int64 i = 0; i < feature_bound; ++i) {
+    square_mean[i] = sum_squared[i] / num_elements_per_feature;
+  }
+
+  std::vector<float> var(feature_bound);
+  for (int64 i = 0; i < feature_bound; ++i) {
+    var[i] = square_mean[i] - mean_square[i];
+  }
+
+  Array4D<float> mean_4D =
+      *ReferenceUtil::Broadcast1DTo4D(mean, bounds, feature_index);
+  auto var_4D = *ReferenceUtil::Broadcast1DTo4D(var, bounds, feature_index);
+  auto scale_4D = *ReferenceUtil::Broadcast1DTo4D(scale, bounds, feature_index);
+  auto offset_4D =
+      *ReferenceUtil::Broadcast1DTo4D(offset, bounds, feature_index);
+
+  auto normalized = *ReferenceUtil::BatchNorm4D(input_array, mean_4D, var_4D,
+                                                scale_4D, offset_4D, epsilon);
+
+  auto expected_normalized = Literal::CreateR4FromArray4D<float>(normalized);
+
+  auto offset_literal = Literal::CreateR1<float>(offset);
+  auto scale_literal = Literal::CreateR1<float>(scale);
+  auto input_literal = Literal::CreateR4FromArray4D<float>(input_array);
+
+  auto input_activations =
+      builder.Parameter(0, input_literal->shape(), "input");
+  auto scale_activations =
+      builder.Parameter(1, scale_literal->shape(), "offset");
+  auto offset_activations =
+      builder.Parameter(2, offset_literal->shape(), "scale");
+
+  auto expected = *Literal::MakeTuple({expected_normalized.get(),
+                                       Literal::CreateR1<float>(mean).get(),
+                                       Literal::CreateR1<float>(var).get()});
+
+  std::unique_ptr<GlobalData> input_data =
+      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+  std::unique_ptr<GlobalData> scale_data =
+      client_->TransferToServer(*scale_literal).ConsumeValueOrDie();
+  std::unique_ptr<GlobalData> offset_data =
+      client_->TransferToServer(*offset_literal).ConsumeValueOrDie();
+
+  builder.BatchNormTraining(input_activations, scale_activations,
+                            offset_activations, epsilon, feature_index);
+
+  ComputeAndCompareTuple(
+      &builder, expected,
+      {input_data.get(), scale_data.get(), offset_data.get()},
+      ErrorSpec(0.01, 1));
+}
+
+// TODO(b/62764704): Implement on GPU. Disabled on 2017-06-20.
+XLA_TEST_P(BatchNormTest, DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU(
+                              DISABLED_ON_GPU(RandomizedGradTests)))) {
+  float epsilon = 0.001;
+  ComputationBuilder builder(client_, TestName());
+  const std::vector<int64>& bounds = GetParam().bounds;
+  Array4D<float> input_array(bounds[0], bounds[1], bounds[2], bounds[3]);
+  input_array.FillRandom(GetParam().random_value_var,
+                         GetParam().random_value_mean);
+
+  Array4D<float> grad_output_array(bounds[0], bounds[1], bounds[2], bounds[3]);
+  grad_output_array.FillRandom(GetParam().random_value_var,
+                               GetParam().random_value_mean);
+
+  const int64 feature_index = GetParam().feature_index;
+  const int64 num_elements_per_feature =
+      Product(bounds) / bounds[feature_index];
+  const int64 feature_bound = bounds[feature_index];
+  std::vector<float> scale(feature_bound, 2);
+
+  auto input_squared =
+      ReferenceUtil::MapArray4D(input_array, [](float a) { return a * a; });
+  std::vector<int64> reduce_dims;
+  for (int64 i = 0; i < bounds.size(); ++i) {
+    if (i != feature_index) {
+      reduce_dims.push_back(i);
+    }
+  }
+
+  auto sum =
+      ReferenceUtil::Reduce4DTo1D(input_array, /*init=*/0.0f, reduce_dims,
+                                  [](float a, float b) { return a + b; });
+
+  auto sum_squared =
+      ReferenceUtil::Reduce4DTo1D(*input_squared, /*init=*/0.0f, reduce_dims,
+                                  [](float a, float b) { return a + b; });
+
+  std::vector<float> mean(feature_bound);
+
+  for (int64 i = 0; i < feature_bound; ++i) {
+    mean[i] = sum[i] / num_elements_per_feature;
+  }
+
+  std::vector<float> mean_square(feature_bound);
+  for (int64 i = 0; i < feature_bound; ++i) {
+    mean_square[i] = mean[i] * mean[i];
+  }
+
+  std::vector<float> square_mean(feature_bound);
+  for (int64 i = 0; i < feature_bound; ++i) {
+    square_mean[i] = sum_squared[i] / num_elements_per_feature;
+  }
+
+  std::vector<float> var(feature_bound);
+  for (int64 i = 0; i < feature_bound; ++i) {
+    var[i] = square_mean[i] - mean_square[i];
+  }
+
+  Array4D<float> mean_4D =
+      *ReferenceUtil::Broadcast1DTo4D(mean, bounds, feature_index);
+  auto var_4D = *ReferenceUtil::Broadcast1DTo4D(var, bounds, feature_index);
+  auto scale_4D = *ReferenceUtil::Broadcast1DTo4D(scale, bounds, feature_index);
+
+  auto var_add_epsilon = *ReferenceUtil::MapArray4D(
+      var_4D, [epsilon](float a) { return std::sqrt(a + epsilon); });
+
+  auto grad_output_times_var =
+      *ReferenceUtil::MapArray4D(grad_output_array, var_add_epsilon,
+                                 [](float a, float b) { return a * b; });
+
+  auto grad_activation = *ReferenceUtil::MapArray4D(
+      grad_output_times_var, scale_4D, [](float a, float b) { return a * b; });
+
+  auto activation_shifted = *ReferenceUtil::MapArray4D(
+      input_array, mean_4D, [](float a, float b) { return a - b; });
+
+  auto grad_scale_before_reduction =
+      *ReferenceUtil::MapArray4D(grad_output_times_var, activation_shifted,
+                                 [](float a, float b) { return a * b; });
+
+  auto grad_scale = ReferenceUtil::Reduce4DTo1D(
+      grad_scale_before_reduction, /*init=*/0.0f, reduce_dims,
+      [](float a, float b) { return a + b; });
+
+  auto grad_offset =
+      ReferenceUtil::Reduce4DTo1D(grad_output_array, /*init=*/0.0f, reduce_dims,
+                                  [](float a, float b) { return a + b; });
+
+  auto expected_grad_activation =
+      Literal::CreateR4FromArray4D<float>(grad_activation);
+
+  auto input_literal = Literal::CreateR4FromArray4D<float>(input_array);
+  auto scale_literal = Literal::CreateR1<float>(scale);
+  auto mean_literal = Literal::CreateR1<float>(mean);
+  auto var_literal = Literal::CreateR1<float>(var);
+  auto grad_output_literal =
+      Literal::CreateR4FromArray4D<float>(grad_output_array);
+
+  auto input_parameter = builder.Parameter(0, input_literal->shape(), "input");
+  auto scale_parameter = builder.Parameter(1, scale_literal->shape(), "scale");
+  auto mean_parameter = builder.Parameter(2, mean_literal->shape(), "mean");
+  auto var_parameter = builder.Parameter(3, var_literal->shape(), "variance");
+  auto grad_output_parameter =
+      builder.Parameter(4, grad_output_literal->shape(), "grad_output");
+
+  std::unique_ptr<GlobalData> input_data =
+      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+  std::unique_ptr<GlobalData> scale_data =
+      client_->TransferToServer(*scale_literal).ConsumeValueOrDie();
+  std::unique_ptr<GlobalData> mean_data =
+      client_->TransferToServer(*mean_literal).ConsumeValueOrDie();
+  std::unique_ptr<GlobalData> var_data =
+      client_->TransferToServer(*var_literal).ConsumeValueOrDie();
+  std::unique_ptr<GlobalData> grad_output_data =
+      client_->TransferToServer(*grad_output_literal).ConsumeValueOrDie();
+
+  auto t = builder.BatchNormGrad(input_parameter, scale_parameter,
+                                 mean_parameter, var_parameter,
+                                 grad_output_parameter, epsilon, feature_index);
+
+  auto expected =
+      *Literal::MakeTuple({expected_grad_activation.get(),
+                           Literal::CreateR1<float>(grad_scale).get(),
+                           Literal::CreateR1<float>(grad_offset).get()});
+
+  ComputeAndCompareTuple(&builder, expected,
+                         {input_data.get(), scale_data.get(), mean_data.get(),
+                          var_data.get(), grad_output_data.get()},
+                         ErrorSpec(0.01, 1));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    BatchNormTest_Instantiation, BatchNormTest,
+    ::testing::Values(BatchNormTestParam{{2, 2, 2, 2}, 0, 100.2f, 200.0f},
+                      BatchNormTestParam{{2, 2, 2, 2}, 3, 300.f, 400.0f},
+
+                      BatchNormTestParam{{1, 10, 1, 1}, 0, 10.1f, 20.1f},
+                      BatchNormTestParam{{10, 10, 10, 10}, 1, 3.14f, 314.15f},
+                      BatchNormTestParam{{10, 10, 10, 10}, 2, 666.6f, 777.7f},
+                      BatchNormTestParam{{10, 10, 10, 10}, 1, -666.6f, 777.7f},
+                      BatchNormTestParam{{10, 10, 10, 10}, 2, 0.f, 777.7f},
+                      BatchNormTestParam{{1, 1, 10, 130}, 2, 0.f, 777.7f},
+                      BatchNormTestParam{{1, 1, 130, 11}, 2, 0.f, 777.7f},
+                      BatchNormTestParam{{1, 1, 10, 1}, 3, 888.8f, 9.9f},
+
+                      BatchNormTestParam{{24, 129, 1, 2}, 2, 10000, 10000},
+                      BatchNormTestParam{{24, 129, 1, 2}, 3, 10000, 10000},
+
+                      // Feature on low dimension to trigger relayout, test
+                      // internal logical to physical dimension calculation
+                      // is correct after relayout.
+                      BatchNormTestParam{{1, 2, 3, 4}, 0, 100, 100}));
+
+// TODO(b/62764704): Implement on GPU. Disabled on 2017-06-20.
+XLA_TEST_F(BatchNormTest, DISABLED_ON_GPU(BasicTraining)) {
+  const int kFeatureIndex = 3;
+  ComputationBuilder builder(client_, TestName());
+
+  auto operand = builder.ConstantR4FromArray4D<float>(
+      {{{{1.f, 2.f}}, {{3.f, 4.f}}}, {{{5.f, 6.f}}, {{7.f, 8.f}}}});
+
+  auto scale = builder.ConstantR1<float>({2.0f, 3.0f});
+
+  auto offset = builder.ConstantR1<float>({1.0f, 2.0f});
+
+  auto tuple = builder.BatchNormTraining(operand, scale, offset,
+                                         /*epsilon=*/0.001, kFeatureIndex);
+
+  auto expected = *Literal::MakeTuple(
+      {Literal::CreateR4<float>({{{{-1.6f, -2.0f}}, {{0.1f, 0.6f}}},
+                                 {{{1.9f, 3.3f}}, {{3.7f, 6.0f}}}})
+           .get(),
+       Literal::CreateR1<float>({4, 5}).get(),
+       Literal::CreateR1<float>({5, 5}).get()});
+
+  ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.1));
+}
+
+// TODO(b/62764704): Implement on GPU. Disabled on 2017-06-20.
+XLA_TEST_F(BatchNormTest, DISABLED_ON_GPU(BasicTrainingOnSublane)) {
+  const int kFeatureIndex = 2;
+  ComputationBuilder builder(client_, TestName());
+
+  auto operand = builder.ConstantR4FromArray4D<float>(
+      {{{{1.f}, {2.f}}, {{3.f}, {4.f}}}, {{{5.f}, {6.f}}, {{7.f}, {8.f}}}});
+
+  auto scale = builder.ConstantR1<float>({2.0f, 3.0f});
+
+  auto offset = builder.ConstantR1<float>({1.0f, 2.0f});
+
+  auto tuple = builder.BatchNormTraining(operand, scale, offset,
+                                         /*epsilon=*/0.001, kFeatureIndex);
+
+  auto expected = *Literal::MakeTuple(
+      {Literal::CreateR4<float>({{{{-1.6f}, {-2.0f}}, {{0.1f}, {0.6f}}},
+                                 {{{1.9f}, {3.3f}}, {{3.7f}, {6.0f}}}})
+           .get(),
+       Literal::CreateR1<float>({4, 5}).get(),
+       Literal::CreateR1<float>({5, 5}).get()});
+
+  ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.1));
+}
+
+// TODO(b/62764704): Implement on GPU. Disabled on 2017-06-20.
+XLA_TEST_F(BatchNormTest, DISABLED_ON_GPU(TrainingWithFeatureOnLowDimension)) {
+  // Use 0 dimension as feature, tests layout analyzer.
+  const int kFeatureIndex = 0;
+  ComputationBuilder builder(client_, TestName());
+
+  ComputationDataHandle h0;
+  auto operand = CreateR3Parameter<float>(Array3D<float>(260, 2, 2, 1.0f),
+                                          /*parameter_number=*/0, "operand",
+                                          &builder, &h0);
+  ComputationDataHandle h1;
+  auto scale =
+      CreateR1Parameter<float>(std::vector<float>(260, 1.0f),
+                               /*parameter_number=*/1, "scale", &builder, &h1);
+  ComputationDataHandle h2;
+  auto offset =
+      CreateR1Parameter<float>(std::vector<float>(260, 1.0f),
+                               /*parameter_number=*/2, "offset", &builder, &h2);
+
+  auto tuple = builder.BatchNormTraining(h0, h1, h2,
+                                         /*epsilon=*/1, kFeatureIndex);
+
+  auto expected = *Literal::MakeTuple(
+      {Literal::CreateR3FromArray3D<float>(Array3D<float>(260, 2, 2, 1.0f))
+           .get(),
+       Literal::CreateR1<float>(std::vector<float>(260, 1.0f)).get(),
+       Literal::CreateR1<float>(std::vector<float>(260, 0.0f)).get()});
+
+  ComputeAndCompareTuple(&builder, expected,
+                         {operand.get(), scale.get(), offset.get()},
+                         ErrorSpec(0.1));
+}
+
+// TODO(b/62764704): Implement on GPU. Disabled on 2017-06-20.
+XLA_TEST_F(BatchNormTest, DISABLED_ON_GPU(LargeEpsilonTest)) {
+  // Test the correctness of choosing a large epsilon value.
+  const int kFeatureIndex = 2;
+  ComputationBuilder builder(client_, TestName());
+
+  ComputationDataHandle h0;
+  auto operand = CreateR3Parameter<float>({{{0.0f}, {10.0f}, {20.0f}, {30.0f}}},
+                                          /*parameter_number=*/0, "operand",
+                                          &builder, &h0);
+  ComputationDataHandle h1;
+  auto scale =
+      CreateR1Parameter<float>(std::vector<float>(1, 1.0f),
+                               /*parameter_number=*/1, "scale", &builder, &h1);
+  ComputationDataHandle h2;
+  auto offset =
+      CreateR1Parameter<float>(std::vector<float>(1, 0.0f),
+                               /*parameter_number=*/2, "offset", &builder, &h2);
+
+  // var = 125, mean = 15, epsilon = -100
+  auto tuple = builder.BatchNormTraining(h0, h1, h2,
+                                         /*epsilon=*/-100, kFeatureIndex);
+
+  auto expected = *Literal::MakeTuple(
+      {Literal::CreateR3FromArray3D<float>({{{-3.0f}, {-1.0f}, {1.0f}, {3.0f}}})
+           .get(),
+       Literal::CreateR1<float>(std::vector<float>(1, 15.0f)).get(),
+       Literal::CreateR1<float>(std::vector<float>(1, 125.0f)).get()});
+
+  ComputeAndCompareTuple(&builder, expected,
+                         {operand.get(), scale.get(), offset.get()},
+                         ErrorSpec(0.1));
+}
+
+// TODO(b/62764704): Implement on CPU and GPU. Disabled on 2017-07-11.
+XLA_TEST_F(BatchNormTest, DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU(
+                              DISABLED_ON_GPU(BatchNormGradBasic)))) {
+  const int kFeatureIndex = 2;
+  ComputationBuilder builder(client_, TestName());
+
+  auto operand =
+      builder.ConstantR4FromArray4D<float>(Array4D<float>(2, 2, 2, 1, 0.0f));
+
+  auto scale = builder.ConstantR1<float>({1.0f, 1.0f});
+
+  auto mean = builder.ConstantR1<float>({0.0f, 0.0f});
+
+  auto var = builder.ConstantR1<float>({1.0f, 1.0f});
+
+  auto grad_output = builder.ConstantR4FromArray4D<float>(
+      {{{{1.f}, {2.f}}, {{3.f}, {4.f}}}, {{{5.f}, {6.f}}, {{7.f}, {8.f}}}});
+
+  builder.BatchNormGrad(operand, scale, mean, var, grad_output,
+                        /*epsilon=*/0.0, kFeatureIndex);
+
+  auto expected = *Literal::MakeTuple(
+      {Literal::CreateR4<float>(
+           {{{{1.f}, {2.f}}, {{3.f}, {4.f}}}, {{{5.f}, {6.f}}, {{7.f}, {8.f}}}})
+           .get(),
+       Literal::CreateR1<float>({0, 0}).get(),
+       Literal::CreateR1<float>({16, 20}).get()});
+
+  ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.1));
+}
+
 }  // namespace
 }  // namespace xla
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/binop_scaling_test.cc b/tensorflow/compiler/xla/tests/binop_scaling_test.cc
index 5e3b70702dd482e6b278386d70fef60b1bacb926..e6b853c2e4e4a08174012c1eb8be3739a2c9dba9 100644
--- a/tensorflow/compiler/xla/tests/binop_scaling_test.cc
+++ b/tensorflow/compiler/xla/tests/binop_scaling_test.cc
@@ -17,7 +17,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -143,7 +142,6 @@ TEST_F(BinopScalingTest, R4PlusR0S32) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
index 25fe04a930e3783ff6024a0bb3bddc430c4fafdd..2a57835ca93cd2b367fe0402aee1f986ae2d4ff3 100644
--- a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
@@ -21,9 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/user_computation_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -63,9 +61,8 @@ class BroadcastSimpleTest : public ClientLibraryTestBase {
       Array3D<float>* r3_array, float start, float end, int seed) {
     *r3_shape = ShapeUtil::MakeShapeWithLayout(F32, bounds, minor_to_major);
     r3_array->FillRandom(start, end, seed);
-    auto r3_data =
-        LiteralUtil::Relayout(*LiteralUtil::CreateR3FromArray3D(*r3_array),
-                              LayoutUtil::MakeLayout(minor_to_major));
+    auto r3_data = Literal::CreateR3FromArray3D(*r3_array)->Relayout(
+        LayoutUtil::MakeLayout(minor_to_major));
     std::unique_ptr<GlobalData> r3_global_data =
         client_->TransferToServer(*r3_data).ConsumeValueOrDie();
     return r3_global_data;
@@ -77,9 +74,8 @@ class BroadcastSimpleTest : public ClientLibraryTestBase {
       Array2D<float>* r2_array, float start, float end, int seed) {
     *r2_shape = ShapeUtil::MakeShapeWithLayout(F32, bounds, minor_to_major);
     r2_array->FillRandom(start, end, seed);
-    auto r2_data =
-        LiteralUtil::Relayout(*LiteralUtil::CreateR2FromArray2D(*r2_array),
-                              LayoutUtil::MakeLayout(minor_to_major));
+    auto r2_data = Literal::CreateR2FromArray2D(*r2_array)->Relayout(
+        LayoutUtil::MakeLayout(minor_to_major));
     std::unique_ptr<GlobalData> r2_global_data =
         client_->TransferToServer(*r2_data).ConsumeValueOrDie();
     return r2_global_data;
@@ -217,13 +213,13 @@ XLA_TEST_F(BroadcastSimpleTest, InDimensionAndDegenerateBroadcasting) {
   ComputationBuilder b(client_, TestName());
 
   b.Add(b.ConstantR2<float>({{1.0, 5.0}}),
-        b.ConstantLiteral(*LiteralUtil::CreateR3<float>(
+        b.ConstantLiteral(*Literal::CreateR3<float>(
             {{{2.0}, {3.0}, {4.0}}, {{5.0}, {6.0}, {7.0}}})),
         /*broadcast_dimensions=*/{1, 2});
 
   auto expected =
-      LiteralUtil::CreateR3<float>({{{3.0, 7.0}, {4.0, 8.0}, {5.0, 9.0}},
-                                    {{6.0, 10.0}, {7.0, 11.0}, {8.0, 12.0}}});
+      Literal::CreateR3<float>({{{3.0, 7.0}, {4.0, 8.0}, {5.0, 9.0}},
+                                {{6.0, 10.0}, {7.0, 11.0}, {8.0, 12.0}}});
 
   ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
 }
@@ -292,7 +288,7 @@ XLA_TEST_P(BroadcastR3ImplicitTest, Doit) {
       }
     }
   }
-  auto expected = LiteralUtil::CreateR3FromArray3D(expected_array);
+  auto expected = Literal::CreateR3FromArray3D(expected_array);
   ComputeAndCompareLiteral(
       &builder, *expected,
       {r3_implicit_global_data.get(), r3_global_data.get()},
@@ -317,7 +313,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1_2) {
   b.Add(r3h, r1h);
 
   auto expected =
-      LiteralUtil::CreateR3<float>({{{2, 3}, {4, 5}}, {{7, 8}, {9, 10}}});
+      Literal::CreateR3<float>({{{2, 3}, {4, 5}}, {{7, 8}, {9, 10}}});
 
   ComputeAndCompareLiteral(&b, *expected, {r3.get(), r1.get()},
                            ErrorSpec(0.0001));
@@ -325,81 +321,79 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1_2) {
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1) {
   ComputationBuilder b(client_, TestName());
-  auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR3<float>({{{1, 2}}}));
+  auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1, 2}}}));
   auto r3 = b.ConstantLiteral(
-      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+      *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
   b.Add(r3, r1);
 
   auto expected =
-      LiteralUtil::CreateR3<float>({{{2, 4}, {4, 6}}, {{6, 8}, {8, 10}}});
+      Literal::CreateR3<float>({{{2, 4}, {4, 6}}, {{6, 8}, {8, 10}}});
 
   ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_2) {
   ComputationBuilder b(client_, TestName());
-  auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR3<float>({{{1}, {2}}}));
+  auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1}, {2}}}));
   auto r3 = b.ConstantLiteral(
-      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+      *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
   b.Add(r3, r1);
 
   auto expected =
-      LiteralUtil::CreateR3<float>({{{2, 3}, {5, 6}}, {{6, 7}, {9, 10}}});
+      Literal::CreateR3<float>({{{2, 3}, {5, 6}}, {{6, 7}, {9, 10}}});
 
   ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0) {
   ComputationBuilder b(client_, TestName());
-  auto r1 =
-      b.ConstantLiteral(*LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}}));
+  auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1, 2}, {3, 4}}}));
   auto r3 = b.ConstantLiteral(
-      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+      *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
   b.Add(r3, r1);
 
   auto expected =
-      LiteralUtil::CreateR3<float>({{{2, 4}, {6, 8}}, {{6, 8}, {10, 12}}});
+      Literal::CreateR3<float>({{{2, 4}, {6, 8}}, {{6, 8}, {10, 12}}});
 
   ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1) {
   ComputationBuilder b(client_, TestName());
-  auto r1 =
-      b.ConstantLiteral(*LiteralUtil::CreateR3<float>({{{1, 2}}, {{3, 4}}}));
+  auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1, 2}}, {{3, 4}}}));
   auto r3 = b.ConstantLiteral(
-      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+      *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
   b.Add(r3, r1);
 
   auto expected =
-      LiteralUtil::CreateR3<float>({{{2, 4}, {4, 6}}, {{8, 10}, {10, 12}}});
+      Literal::CreateR3<float>({{{2, 4}, {4, 6}}, {{8, 10}, {10, 12}}});
 
   ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_2) {
   ComputationBuilder b(client_, TestName());
-  auto r1 = b.ConstantLiteral(
-      *LiteralUtil::CreateR3<float>({{{1}, {2}}, {{3}, {4}}}));
+  auto r1 =
+      b.ConstantLiteral(*Literal::CreateR3<float>({{{1}, {2}}, {{3}, {4}}}));
   auto r3 = b.ConstantLiteral(
-      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+      *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
   b.Add(r3, r1);
 
   auto expected =
-      LiteralUtil::CreateR3<float>({{{2, 3}, {5, 6}}, {{8, 9}, {11, 12}}});
+      Literal::CreateR3<float>({{{2, 3}, {5, 6}}, {{8, 9}, {11, 12}}});
 
   ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1_2) {
   ComputationBuilder b(client_, TestName());
-  auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR3<float>({{{1}}}));
+  auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1}}}));
   auto r3 = b.ConstantLiteral(
-      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+      *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
   b.Add(r3, r1);
 
   auto expected =
-      LiteralUtil::CreateR3<float>({{{2, 3}, {4, 5}}, {{6, 7}, {8, 9}}});
+      Literal::CreateR3<float>({{{2, 3}, {4, 5}}, {{6, 7}, {8, 9}}});
 
   ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
 }
@@ -541,7 +535,7 @@ XLA_TEST_P(BroadcastR2ImplicitTest, Doit) {
     *v = ApplyOpToFloats(spec.op2, tmp, v3);
   });
 
-  auto expected = LiteralUtil::CreateR2FromArray2D(expected_array);
+  auto expected = Literal::CreateR2FromArray2D(expected_array);
   ComputeAndCompareLiteral(
       &builder, *expected,
       {r2_implicit_global_data1.get(), r2_global_data.get(),
@@ -555,22 +549,22 @@ INSTANTIATE_TEST_CASE_P(BroadcastR2ImplicitTestInstances,
 
 XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_0) {
   ComputationBuilder b(client_, TestName());
-  auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR2<float>({{1, 2}}));
-  auto r2 = b.ConstantLiteral(*LiteralUtil::CreateR2<float>({{1, 2}, {3, 4}}));
+  auto r1 = b.ConstantLiteral(*Literal::CreateR2<float>({{1, 2}}));
+  auto r2 = b.ConstantLiteral(*Literal::CreateR2<float>({{1, 2}, {3, 4}}));
   b.Add(r2, r1);
 
-  auto expected = LiteralUtil::CreateR2<float>({{2, 4}, {4, 6}});
+  auto expected = Literal::CreateR2<float>({{2, 4}, {4, 6}});
 
   ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_1) {
   ComputationBuilder b(client_, TestName());
-  auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR2<float>({{1}, {2}}));
-  auto r2 = b.ConstantLiteral(*LiteralUtil::CreateR2<float>({{1, 2}, {3, 4}}));
+  auto r1 = b.ConstantLiteral(*Literal::CreateR2<float>({{1}, {2}}));
+  auto r2 = b.ConstantLiteral(*Literal::CreateR2<float>({{1, 2}, {3, 4}}));
   b.Add(r2, r1);
 
-  auto expected = LiteralUtil::CreateR2<float>({{2, 3}, {5, 6}});
+  auto expected = Literal::CreateR2<float>({{2, 3}, {5, 6}});
 
   ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
 }
@@ -579,11 +573,11 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim0) {
   ComputationBuilder b(client_, TestName());
   auto r1 = b.ConstantR1<float>({10, 20});
   auto r3 = b.ConstantLiteral(
-      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+      *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
   b.Add(r3, r1, {0});
 
-  auto expected = LiteralUtil::CreateR3<float>(
-      {{{11, 12}, {13, 14}}, {{25, 26}, {27, 28}}});
+  auto expected =
+      Literal::CreateR3<float>({{{11, 12}, {13, 14}}, {{25, 26}, {27, 28}}});
 
   ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
 }
@@ -592,11 +586,11 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim1) {
   ComputationBuilder b(client_, TestName());
   auto r1 = b.ConstantR1<float>({10, 20});
   auto r3 = b.ConstantLiteral(
-      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+      *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
   b.Add(r1, r3, {1});
 
-  auto expected = LiteralUtil::CreateR3<float>(
-      {{{11, 12}, {23, 24}}, {{15, 16}, {27, 28}}});
+  auto expected =
+      Literal::CreateR3<float>({{{11, 12}, {23, 24}}, {{15, 16}, {27, 28}}});
 
   ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
 }
@@ -605,11 +599,11 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim2) {
   ComputationBuilder b(client_, TestName());
   auto r1 = b.ConstantR1<float>({10, 20});
   auto r3 = b.ConstantLiteral(
-      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+      *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
   b.Add(r1, r3, {2});
 
-  auto expected = LiteralUtil::CreateR3<float>(
-      {{{11, 22}, {13, 24}}, {{15, 26}, {17, 28}}});
+  auto expected =
+      Literal::CreateR3<float>({{{11, 22}, {13, 24}}, {{15, 26}, {17, 28}}});
 
   ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
 }
@@ -620,7 +614,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAll) {
   auto r1_1 = b.ConstantR1<float>({100, 200});
   auto r1_2 = b.ConstantR1<float>({10, 20});
   auto r3 = b.ConstantLiteral(
-      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+      *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
   for (int i = 0; i < 3; ++i) {
     r3 = b.Add(r1_0, r3, {0});
     r3 = b.Add(r3, r1_1, {1});
@@ -628,7 +622,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAll) {
   }
   r3 = b.Mul(r3, b.ConstantR0<float>(-2));
 
-  auto expected = LiteralUtil::CreateR3<float>(
+  auto expected = Literal::CreateR3<float>(
       {{{-6 * 1110 - 2, -6 * 1120 - 4}, {-6 * 1210 - 6, -6 * 1220 - 8}},
        {{-6 * 2110 - 10, -6 * 2120 - 12}, {-6 * 2210 - 14, -6 * 2220 - 16}}});
 
@@ -649,7 +643,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAllWithScalarBroadcast) {
   }
   r3 = b.Mul(r3, b.ConstantR0<float>(-1));
 
-  auto expected = LiteralUtil::CreateR3<float>(
+  auto expected = Literal::CreateR3<float>(
       {{{-3 * 1110 - 3, -3 * 1120 - 3}, {-3 * 1210 - 3, -3 * 1220 - 3}},
        {{-3 * 2110 - 3, -3 * 2120 - 3}, {-3 * 2210 - 3, -3 * 2220 - 3}}});
 
@@ -662,7 +656,7 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidBinaryAndDegenerateBroadcasting) {
   ComputationBuilder b(client_, TestName());
 
   b.Add(b.ConstantR2<float>({{1.0, 5.0}, {1.0, 5.0}}),
-        b.ConstantLiteral(*LiteralUtil::CreateR3<float>(
+        b.ConstantLiteral(*Literal::CreateR3<float>(
             {{{2.0}, {3.0}, {4.0}}, {{5.0}, {6.0}, {7.0}}})),
         /*broadcast_dimensions=*/{1, 2});
 
@@ -704,8 +698,6 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidDegenerateBroadcasting) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
-  xla::legacy_flags::AppendUserComputationFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/broadcast_test.cc b/tensorflow/compiler/xla/tests/broadcast_test.cc
index 96a329a9bd8296a11a3e22e8dea31d71dd973d76..dc1443f5363aab1e6166984a3f2f3fccefad908e 100644
--- a/tensorflow/compiler/xla/tests/broadcast_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
@@ -39,7 +38,7 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarToScalar) {
   // Test degenerate case of broadcasting a scalar into a scalar.
   auto builder = HloComputation::Builder(TestName());
   auto input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0)));
   builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {}), input, {}));
 
@@ -48,14 +47,14 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarToScalar) {
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
-  LiteralTestUtil::ExpectNear(*LiteralUtil::CreateR0<float>(42.0), *result,
+  LiteralTestUtil::ExpectNear(*Literal::CreateR0<float>(42.0), *result,
                               error_spec_);
 }
 
 XLA_TEST_F(BroadcastTest, BroadcastScalarTo2D) {
   auto builder = HloComputation::Builder(TestName());
   auto input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0)));
   builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {2, 2}), input, {}));
 
@@ -65,14 +64,14 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarTo2D) {
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
   LiteralTestUtil::ExpectNear(
-      *LiteralUtil::CreateR2<float>({{42.0, 42.0}, {42.0, 42.0}}), *result,
+      *Literal::CreateR2<float>({{42.0, 42.0}, {42.0, 42.0}}), *result,
       error_spec_);
 }
 
 XLA_TEST_F(BroadcastTest, BroadcastVectorTo2D) {
   auto builder = HloComputation::Builder(TestName());
   auto input = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0})));
+      Literal::CreateR1<float>({1.0, 2.0, 3.0})));
 
   // Broadcast vector in both dimension 0 and dimension 1. Join them in a tuple
   // to enable testing of the results.
@@ -88,18 +87,18 @@ XLA_TEST_F(BroadcastTest, BroadcastVectorTo2D) {
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
   LiteralTestUtil::ExpectNear(
-      *LiteralUtil::CreateR2<float>({{1.0, 1.0}, {2.0, 2.0}, {3.0, 3.0}}),
+      *Literal::CreateR2<float>({{1.0, 1.0}, {2.0, 2.0}, {3.0, 3.0}}),
       result->tuple_literals(0), error_spec_);
 
   LiteralTestUtil::ExpectNear(
-      *LiteralUtil::CreateR2<float>({{1.0, 2.0, 3.0}, {1.0, 2.0, 3.0}}),
+      *Literal::CreateR2<float>({{1.0, 2.0, 3.0}, {1.0, 2.0, 3.0}}),
       result->tuple_literals(1), error_spec_);
 }
 
 XLA_TEST_F(BroadcastTest, Broadcast2DTo2D) {
   auto builder = HloComputation::Builder(TestName());
   auto input = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}})));
+      Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}})));
   builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {2, 2}), input, {0, 1}));
 
@@ -109,7 +108,7 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2D) {
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
   LiteralTestUtil::ExpectNear(
-      *LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}}), *result,
+      *Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}}), *result,
       error_spec_);
 }
 
@@ -118,7 +117,7 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2DTranspose) {
   // the dimensions, ie transpose.
   auto builder = HloComputation::Builder(TestName());
   auto input = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}})));
+      Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}})));
   builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {2, 2}), input, {1, 0}));
 
@@ -128,14 +127,14 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2DTranspose) {
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
   LiteralTestUtil::ExpectNear(
-      *LiteralUtil::CreateR2<float>({{1.0, 3.0}, {2.0, 4.0}}), *result,
+      *Literal::CreateR2<float>({{1.0, 3.0}, {2.0, 4.0}}), *result,
       error_spec_);
 }
 
 XLA_TEST_F(BroadcastTest, Broadcast2DTo3D) {
   auto builder = HloComputation::Builder(TestName());
   auto input = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}})));
+      Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}})));
   builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {2, 3, 2}), input, {0, 2}));
 
@@ -145,15 +144,15 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo3D) {
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
   LiteralTestUtil::ExpectNear(
-      *LiteralUtil::CreateR3<float>({{{1.0, 2.0}, {1.0, 2.0}, {1.0, 2.0}},
-                                     {{3.0, 4.0}, {3.0, 4.0}, {3.0, 4.0}}}),
+      *Literal::CreateR3<float>({{{1.0, 2.0}, {1.0, 2.0}, {1.0, 2.0}},
+                                 {{3.0, 4.0}, {3.0, 4.0}, {3.0, 4.0}}}),
       *result, error_spec_);
 }
 
 TEST_F(BroadcastTest, Broadcast_R1_2_To_R4_2x2x3x3) {
   auto builder = HloComputation::Builder(TestName());
   auto input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({1.0, 2.0})));
+      HloInstruction::CreateConstant(Literal::CreateR1<float>({1.0, 2.0})));
 
   // Broadcast vector in dimension 1.
   builder.AddInstruction(HloInstruction::CreateBroadcast(
@@ -168,8 +167,8 @@ TEST_F(BroadcastTest, Broadcast_R1_2_To_R4_2x2x3x3) {
   Array2D<float> pz({{1, 2}, {1, 2}});
   expected.FillWithPZ(pz);
 
-  LiteralTestUtil::ExpectNear(
-      *LiteralUtil::CreateR4FromArray4D<float>(expected), *result, error_spec_);
+  LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D<float>(expected),
+                              *result, error_spec_);
 }
 
 TEST_F(BroadcastTest, Broadcast_R1_1025_To_R4_3x3x3x1025) {
@@ -178,7 +177,7 @@ TEST_F(BroadcastTest, Broadcast_R1_1025_To_R4_3x3x3x1025) {
   int64 r1_size = input_data.size();
   std::iota(input_data.begin(), input_data.end(), 0.0f);
   auto input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>(input_data)));
+      HloInstruction::CreateConstant(Literal::CreateR1<float>(input_data)));
 
   // Broadcast vector in dimension 3.
   builder.AddInstruction(HloInstruction::CreateBroadcast(
@@ -198,8 +197,8 @@ TEST_F(BroadcastTest, Broadcast_R1_1025_To_R4_3x3x3x1025) {
   }
   expected.FillWithYX(yx);
 
-  LiteralTestUtil::ExpectNear(
-      *LiteralUtil::CreateR4FromArray4D<float>(expected), *result, error_spec_);
+  LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D<float>(expected),
+                              *result, error_spec_);
 }
 
 XLA_TEST_F(BroadcastTest, Broadcast_R1_64_To_R4_32x64x7x7) {
@@ -209,7 +208,7 @@ XLA_TEST_F(BroadcastTest, Broadcast_R1_64_To_R4_32x64x7x7) {
   std::vector<float> r1_array(64, 42.0);
 
   auto input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>(r1_array)));
+      HloInstruction::CreateConstant(Literal::CreateR1<float>(r1_array)));
 
   // Broadcast vector in dimension 1.
   builder.AddInstruction(HloInstruction::CreateBroadcast(
@@ -220,14 +219,14 @@ XLA_TEST_F(BroadcastTest, Broadcast_R1_64_To_R4_32x64x7x7) {
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
-  LiteralTestUtil::ExpectNear(*LiteralUtil::CreateR4FromArray4D(r4_array),
-                              *result, error_spec_);
+  LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D(r4_array), *result,
+                              error_spec_);
 }
 
 TEST_F(BroadcastTest, Broadcast_R0_to_R4_64x64x3x3) {
   auto builder = HloComputation::Builder(TestName());
   auto input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0f)));
   builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {64, 64, 3, 3}), input, {}));
 
@@ -240,15 +239,15 @@ TEST_F(BroadcastTest, Broadcast_R0_to_R4_64x64x3x3) {
   Array4D<float> expected(64, 64, 3, 3);
   expected.Fill(1.0f);
 
-  LiteralTestUtil::ExpectNear(
-      *LiteralUtil::CreateR4FromArray4D<float>(expected), *result, error_spec_);
+  LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D<float>(expected),
+                              *result, error_spec_);
 }
 
 TEST_F(BroadcastTest, Broadcast_R2_2x2_To_R4_3x3x2x2) {
   auto builder = HloComputation::Builder(TestName());
   Array2D<float> to_broadcast({{1.0f, 2.0f}, {3.0f, 4.0f}});
   auto input = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2FromArray2D<float>(to_broadcast)));
+      Literal::CreateR2FromArray2D<float>(to_broadcast)));
 
   // Broadcast vector in dimensions 2 and 3.
   builder.AddInstruction(HloInstruction::CreateBroadcast(
@@ -262,8 +261,8 @@ TEST_F(BroadcastTest, Broadcast_R2_2x2_To_R4_3x3x2x2) {
   Array4D<float> expected(3, 3, 2, 2);
   expected.FillWithYX(to_broadcast);
 
-  LiteralTestUtil::ExpectNear(
-      *LiteralUtil::CreateR4FromArray4D<float>(expected), *result, error_spec_);
+  LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D<float>(expected),
+                              *result, error_spec_);
 }
 
 TEST_F(BroadcastTest, Broadcast_R3_2x3x4_to_R4_2x3x4x5) {
@@ -282,7 +281,7 @@ TEST_F(BroadcastTest, Broadcast_R3_2x3x4_to_R4_2x3x4x5) {
     }
   }
   auto input = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR3FromArray3D<float>(input_vals)));
+      Literal::CreateR3FromArray3D<float>(input_vals)));
 
   // Broadcast vector in dimensions 2 and 3.
   builder.AddInstruction(HloInstruction::CreateBroadcast(
@@ -293,8 +292,8 @@ TEST_F(BroadcastTest, Broadcast_R3_2x3x4_to_R4_2x3x4x5) {
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
-  LiteralTestUtil::ExpectNear(
-      *LiteralUtil::CreateR4FromArray4D<float>(expected), *result, error_spec_);
+  LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D<float>(expected),
+                              *result, error_spec_);
 }
 
 }  // namespace
@@ -302,7 +301,6 @@ TEST_F(BroadcastTest, Broadcast_R3_2x3x4_to_R4_2x3x4x5) {
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl
index 1f61743451a79a062205708d9ba6014f8a8591e9..dae0956f0a9f3d6fe172dc13b7ce3877a760e161 100644
--- a/tensorflow/compiler/xla/tests/build_defs.bzl
+++ b/tensorflow/compiler/xla/tests/build_defs.bzl
@@ -1,16 +1,32 @@
 """Build rules for XLA testing."""
 
 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_is_configured")
+load("//tensorflow/compiler/xla/tests:plugin.bzl", "plugins")
 
-def all_backends():
+all_backends = ["cpu", "cpu_parallel", "gpu"] + plugins.keys()
+
+def filter_backends(backends):
+  """Removes "gpu" from a backend list if CUDA is not enabled.
+
+  This allows us to simply hardcode lists including "gpu" here and in the
+  BUILD file, without causing failures when CUDA isn't enabled.'
+
+  Args:
+    backends: A list of backends to filter.
+
+  Returns:
+    The filtered list of backends.
+  """
   if cuda_is_configured():
-    return ["cpu", "cpu_parallel", "gpu"]
+    return backends
   else:
-    return ["cpu", "cpu_parallel"]
+    return [backend for backend in backends if backend != "gpu"]
+
 
 def xla_test(name,
              srcs,
              deps,
+             xla_test_library_deps=[],
              backends=[],
              args=[],
              tags=[],
@@ -69,6 +85,8 @@ def xla_test(name,
     name: Name of the target.
     srcs: Sources for the target.
     deps: Dependencies of the target.
+    xla_test_library_deps: If set, the generated test targets will depend on the
+      respective cc_libraries generated by the xla_test_library rule.
     backends: A list of backends to generate tests for. Supported
       values: "cpu", "cpu_parallel", "gpu". If this list is empty, the test will
       be generated for all supported backends.
@@ -81,7 +99,7 @@ def xla_test(name,
   """
   test_names = []
   if not backends:
-    backends = all_backends()
+    backends = all_backends
 
   native.cc_library(
       name="%s_lib" % name,
@@ -91,7 +109,7 @@ def xla_test(name,
       deps=deps + ["//tensorflow/compiler/xla/tests:test_macros_header"],
   )
 
-  for backend in backends:
+  for backend in filter_backends(backends):
     test_name = "%s_%s" % (name, backend)
     this_backend_tags = ["xla_%s" % backend]
     this_backend_copts = []
@@ -107,9 +125,18 @@ def xla_test(name,
       backend_deps = ["//tensorflow/compiler/xla/service:gpu_plugin"]
       backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_gpu"]
       this_backend_tags += ["requires-gpu-sm35"]
+    elif backend in plugins:
+      backend_deps = plugins[backend]["deps"]
+      this_backend_copts += plugins[backend]["copts"]
+      this_backend_tags += plugins[backend]["tags"]
+      this_backend_args += plugins[backend]["args"]
     else:
       fail("Unknown backend %s" % backend)
 
+    if xla_test_library_deps:
+      for lib_dep in xla_test_library_deps:
+        backend_deps += ["%s_%s" % (lib_dep, backend)]
+
     native.cc_test(
         name=test_name,
         srcs=srcs,
@@ -124,19 +151,82 @@ def xla_test(name,
 
   native.test_suite(name=name, tests=test_names)
 
+def xla_test_library(name,
+                     srcs,
+                     hdrs=[],
+                     deps=[],
+                     backends=[]):
+  """Generates cc_library targets for the given XLA backends.
+
+  This rule forces the sources to be compiled for each backend so that the
+  backend specific macros could expand correctly. It's useful when test targets
+  in different directories referring to the same sources but test with different
+  arguments.
+
+  Examples:
+
+    # Generates the targets: foo_test_library_cpu and foo_test_gpu.
+    xla_test_library(
+        name = "foo_test_library",
+        srcs = ["foo_test.cc"],
+        backends = ["cpu", "gpu"],
+        deps = [...],
+    )
+    # Then use the xla_test rule to generate test targets:
+    xla_test(
+        name = "foo_test",
+        srcs = [],
+        backends = ["cpu", "gpu"],
+        deps = [...],
+        xla_test_library_deps = [":foo_test_library"],
+    )
+
+  Args:
+    name: Name of the target.
+    srcs: Sources for the target.
+    hdrs: Headers for the target.
+    deps: Dependencies of the target.
+    backends: A list of backends to generate libraries for.
+      Supported values: "cpu", "cpu_parallel", "gpu". If this list is empty, the
+      library will be generated for all supported backends.
+  """
+
+  if not backends:
+    backends = all_backends
+
+  for backend in filter_backends(backends):
+    this_backend_copts = []
+    if backend in ["cpu", "cpu_parallel", "gpu"]:
+      backend_deps = ["//tensorflow/compiler/xla/tests:test_macros_%s" % backend]
+    elif backend in plugins:
+      backend_deps = plugins[backend]["deps"]
+      this_backend_copts += plugins[backend]["copts"]
+    else:
+      fail("Unknown backend %s" % backend)
+
+    native.cc_library(
+        name = "%s_%s" % (name, backend),
+        srcs = srcs,
+        testonly = True,
+        hdrs = hdrs,
+        copts = ["-DXLA_TEST_BACKEND_%s=1" % backend.upper()]
+        + this_backend_copts,
+        deps = deps + backend_deps,
+    )
+
 
 def generate_backend_suites(backends=[]):
   if not backends:
-    backends = all_backends()
-  for backend in backends:
+    backends = all_backends
+  for backend in filter_backends(backends):
     native.test_suite(name="%s_tests" % backend,
                       tags = ["xla_%s" % backend])
 
 
 def generate_backend_test_macros(backends=[]):
   if not backends:
-    backends = all_backends()
-  for backend in backends:
+    backends = all_backends
+  for backend in filter_backends(backends):
     native.cc_library(
         name="test_macros_%s" % backend,
         testonly = True,
diff --git a/tensorflow/compiler/xla/tests/call_test.cc b/tensorflow/compiler/xla/tests/call_test.cc
index 55701c62db22f0fff6f4fdeabf0c72d600239969..086199fda1445c966917cff6849373e4474d16f7 100644
--- a/tensorflow/compiler/xla/tests/call_test.cc
+++ b/tensorflow/compiler/xla/tests/call_test.cc
@@ -18,7 +18,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -78,7 +77,7 @@ class CallOpTest : public ClientLibraryTestBase {
 XLA_TEST_F(CallOpTest, DISABLED_ON_GPU(CallR0F32IdentityScalar)) {
   ComputationBuilder builder(client_, TestName());
   Computation callee = CreateR0F32IdentityComputation();
-  auto constant = builder.ConstantLiteral(*LiteralUtil::CreateR0<float>(42.0));
+  auto constant = builder.ConstantLiteral(*Literal::CreateR0<float>(42.0));
   builder.Call(callee, {constant});
 
   ComputeAndCompareR0<float>(&builder, 42.0, {}, ErrorSpec(0.01f));
@@ -87,8 +86,8 @@ XLA_TEST_F(CallOpTest, DISABLED_ON_GPU(CallR0F32IdentityScalar)) {
 XLA_TEST_F(CallOpTest, DISABLED_ON_GPU(CallR1S0F32AddArray)) {
   ComputationBuilder builder(client_, TestName());
   Computation callee = CreateR1S0F32AdditionComputation();
-  auto x = builder.ConstantLiteral(*LiteralUtil::CreateR1<float>({}));
-  auto y = builder.ConstantLiteral(*LiteralUtil::CreateR1<float>({}));
+  auto x = builder.ConstantLiteral(*Literal::CreateR1<float>({}));
+  auto y = builder.ConstantLiteral(*Literal::CreateR1<float>({}));
   builder.Call(callee, {x, y});
 
   ComputeAndCompareR1<float>(&builder, {}, {}, ErrorSpec(0.01f));
@@ -97,8 +96,8 @@ XLA_TEST_F(CallOpTest, DISABLED_ON_GPU(CallR1S0F32AddArray)) {
 XLA_TEST_F(CallOpTest, DISABLED_ON_GPU(CallR1S2F32AddArray)) {
   ComputationBuilder builder(client_, TestName());
   Computation callee = CreateR1S2F32AdditionComputation();
-  auto x = builder.ConstantLiteral(*LiteralUtil::CreateR1<float>({1.0f, 2.0f}));
-  auto y = builder.ConstantLiteral(*LiteralUtil::CreateR1<float>({2.0f, 3.0f}));
+  auto x = builder.ConstantLiteral(*Literal::CreateR1<float>({1.0f, 2.0f}));
+  auto y = builder.ConstantLiteral(*Literal::CreateR1<float>({2.0f, 3.0f}));
   builder.Call(callee, {x, y});
 
   ComputeAndCompareR1<float>(&builder, {3.0f, 5.0f}, {}, ErrorSpec(0.01f));
@@ -107,8 +106,8 @@ XLA_TEST_F(CallOpTest, DISABLED_ON_GPU(CallR1S2F32AddArray)) {
 XLA_TEST_F(CallOpTest, DISABLED_ON_GPU(CallR0F32Tuple)) {
   ComputationBuilder builder(client_, TestName());
   Computation callee = CreateR0F32TupleComputation();
-  auto elem = LiteralUtil::CreateR0<float>(42.0);
-  auto tuple = LiteralUtil::MakeTuple({elem.get()});
+  auto elem = Literal::CreateR0<float>(42.0);
+  auto tuple = Literal::MakeTuple({elem.get()});
   builder.Call(callee, {builder.ConstantLiteral(*elem)});
 
   ComputeAndCompareTuple(&builder, *tuple, {}, ErrorSpec(0.01f));
@@ -120,7 +119,6 @@ XLA_TEST_F(CallOpTest, DISABLED_ON_GPU(CallR0F32Tuple)) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
index 4825eaf19dc28fd78a5d91a3c1e722c3916f6c20..2f4ad22f5bf0573ba97e6d28a3a207480fcdae18 100644
--- a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
+++ b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -38,7 +37,7 @@ class CheckExecutionArityTest : public ClientLibraryTestBase {};
 
 TEST_F(CheckExecutionArityTest, TwoParamComputationNumArguments) {
   ComputationBuilder builder(client_, "add_two_params");
-  auto param_literal = LiteralUtil::CreateR1<float>({1.1f, 2.2f});
+  auto param_literal = Literal::CreateR1<float>({1.1f, 2.2f});
 
   auto p0 = builder.Parameter(0, param_literal->shape(), "param0");
   auto p1 = builder.Parameter(1, param_literal->shape(), "param1");
@@ -55,18 +54,20 @@ TEST_F(CheckExecutionArityTest, TwoParamComputationNumArguments) {
 
   // The arity of the UserComputation is 2 arguments. Execution will succeed
   // with 2 arguments, but fail with a different number.
-  auto result_two_args =
-      client_->Execute(computation, {param0_data.get(), param1_data.get()});
+  auto result_two_args = client_->Execute(
+      computation, {param0_data.get(), param1_data.get()}, &execution_options_);
   ASSERT_IS_OK(result_two_args.status());
 
-  auto result_one_arg = client_->Execute(computation, {param0_data.get()});
+  auto result_one_arg =
+      client_->Execute(computation, {param0_data.get()}, &execution_options_);
   ASSERT_FALSE(result_one_arg.ok());
   ASSERT_EQ(result_one_arg.status().code(),
             tensorflow::error::INVALID_ARGUMENT);
   ASSERT_THAT(result_one_arg.status().error_message(),
               ContainsRegex("takes 2"));
 
-  auto result_zero_args = client_->Execute(computation, {});
+  auto result_zero_args =
+      client_->Execute(computation, {}, &execution_options_);
   ASSERT_FALSE(result_zero_args.ok());
   ASSERT_EQ(result_zero_args.status().code(),
             tensorflow::error::INVALID_ARGUMENT);
@@ -85,35 +86,38 @@ XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) {
   ASSERT_IS_OK(computation_status.status());
   auto computation = computation_status.ConsumeValueOrDie();
 
-  auto f32_literal = LiteralUtil::CreateR0<float>(1.1f);
+  auto f32_literal = Literal::CreateR0<float>(1.1f);
   auto f32_data = client_->TransferToServer(*f32_literal).ConsumeValueOrDie();
-  auto f32_4_literal = LiteralUtil::CreateR1<float>({1.0f, 2.0f, 3.0f, 4.0f});
+  auto f32_4_literal = Literal::CreateR1<float>({1.0f, 2.0f, 3.0f, 4.0f});
   auto f32_4_data =
       client_->TransferToServer(*f32_4_literal).ConsumeValueOrDie();
-  auto u8_4_literal = LiteralUtil::CreateR1U8("hola");
+  auto u8_4_literal = Literal::CreateR1U8("hola");
   auto u8_4_data = client_->TransferToServer(*u8_4_literal).ConsumeValueOrDie();
 
   // Match
-  auto status =
-      client_->Execute(computation, {f32_data.get(), f32_4_data.get()});
+  auto status = client_->Execute(
+      computation, {f32_data.get(), f32_4_data.get()}, &execution_options_);
   ASSERT_IS_OK(status.status());
 
   // Shape mismatch in parameter 0
-  status = client_->Execute(computation, {f32_4_data.get(), f32_4_data.get()});
+  status = client_->Execute(computation, {f32_4_data.get(), f32_4_data.get()},
+                            &execution_options_);
   ASSERT_FALSE(status.ok());
   ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT);
   ASSERT_THAT(status.status().error_message(),
               ContainsRegex("expects parameter 0"));
 
   // Shape mismatch in parameter 1 (rank)
-  status = client_->Execute(computation, {f32_data.get(), f32_data.get()});
+  status = client_->Execute(computation, {f32_data.get(), f32_data.get()},
+                            &execution_options_);
   ASSERT_FALSE(status.ok());
   ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT);
   ASSERT_THAT(status.status().error_message(),
               ContainsRegex("expects parameter 1"));
 
   // Shape mismatch in parameter 1 (element type)
-  status = client_->Execute(computation, {f32_data.get(), u8_4_data.get()});
+  status = client_->Execute(computation, {f32_data.get(), u8_4_data.get()},
+                            &execution_options_);
   ASSERT_FALSE(status.ok());
   ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT);
   ASSERT_THAT(status.status().error_message(),
@@ -126,7 +130,6 @@ XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index b96bb8f846909589a52269f0d314dbfd0af2be09..3082630505fe9aea9222ed478a1e6504e18231b6 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -37,18 +37,20 @@ namespace xla {
 namespace {
 // Wrapper function that creates a nicer error message (than a bare
 // ValueOrDie()) if the platform we intend to test is not available.
-Client* GetOrCreateLocalClientOrDie(se::Platform* platform) {
-  StatusOr<Client*> result = ClientLibrary::GetOrCreateLocalClient(platform);
+Client* GetOrCreateLocalClientOrDie(const LocalClientOptions& client_options) {
+  StatusOr<Client*> result =
+      ClientLibrary::GetOrCreateLocalClient(client_options);
   TF_CHECK_OK(result.status()) << "could not create local client for testing";
   return result.ValueOrDie();
 }
 }  // namespace
 
-ClientLibraryTestBase::ClientLibraryTestBase(se::Platform* platform)
-    : client_(GetOrCreateLocalClientOrDie(platform)) {
-  *(execution_options_.mutable_debug_options()) =
-      legacy_flags::GetDebugOptionsFromFlags();
-
+ClientLibraryTestBase::ClientLibraryTestBase(
+    perftools::gputools::Platform* platform,
+    const LocalClientOptions& client_options)
+    : client_(GetOrCreateLocalClientOrDie(client_options)),
+      execution_options_(CreateDefaultExecutionOptions()) {
+  CHECK_EQ(platform, client_options.platform());
   // Disabling constant_folding so that tests (usually written using Constants)
   // will exercise the intended code paths, instead of being constant folded.
   //
@@ -59,6 +61,15 @@ ClientLibraryTestBase::ClientLibraryTestBase(se::Platform* platform)
       "constant_folding");
 }
 
+ClientLibraryTestBase::ClientLibraryTestBase(se::Platform* platform)
+    : execution_options_(CreateDefaultExecutionOptions()) {
+  LocalClientOptions default_options;
+  default_options.set_platform(platform);
+  client_ = GetOrCreateLocalClientOrDie(default_options);
+  execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
+      "constant_folding");
+}
+
 string ClientLibraryTestBase::TestName() const {
   return ::testing::UnitTest::GetInstance()->current_test_info()->name();
 }
@@ -71,13 +82,16 @@ StatusOr<std::unique_ptr<GlobalData>> ClientLibraryTestBase::Execute(
   return client_->Execute(computation, arguments, &execution_options_);
 }
 
+StatusOr<ExecutionHandle> ClientLibraryTestBase::ExecuteAsync(
+    const Computation& computation,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
+  return client_->ExecuteAsync(computation, arguments, &execution_options_);
+}
+
 StatusOr<std::unique_ptr<Literal>> ClientLibraryTestBase::ExecuteAndTransfer(
-    ComputationBuilder* builder,
+    const Computation& computation,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments,
     const Shape* shape_with_output_layout) {
-  // Build the computation, as a convenience.
-  TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
-
   ExecutionOptions execution_options = execution_options_;
   if (shape_with_output_layout != nullptr) {
     *execution_options.mutable_shape_with_output_layout() =
@@ -87,6 +101,15 @@ StatusOr<std::unique_ptr<Literal>> ClientLibraryTestBase::ExecuteAndTransfer(
                                      &execution_options);
 }
 
+StatusOr<std::unique_ptr<Literal>> ClientLibraryTestBase::ExecuteAndTransfer(
+    ComputationBuilder* builder,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const Shape* shape_with_output_layout) {
+  // Build the computation, as a convenience.
+  TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
+  return ExecuteAndTransfer(computation, arguments, shape_with_output_layout);
+}
+
 std::unique_ptr<GlobalData> ClientLibraryTestBase::ExecuteOrDie(
     ComputationBuilder* builder,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
@@ -113,14 +136,14 @@ string ClientLibraryTestBase::ExecuteToString(
   if (!result.ok()) {
     return result.status().ToString();
   } else {
-    return LiteralUtil::ToString(*result.ValueOrDie());
+    return result.ValueOrDie()->ToString();
   }
 }
 
 void ClientLibraryTestBase::ComputeAndCompareR1(
     ComputationBuilder* builder, const tensorflow::core::Bitmap& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
-  std::unique_ptr<Literal> expected_literal = LiteralUtil::CreateR1(expected);
+  std::unique_ptr<Literal> expected_literal = Literal::CreateR1(expected);
   ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal,
                                                   arguments);
 }
@@ -141,18 +164,121 @@ void ClientLibraryTestBase::ComputeAndCompareLiteral(
                                                   error, shape_with_layout));
 }
 
+tensorflow::Status
+ClientLibraryTestBase::ComputeAndCompareLiteralWithAllOutputLayouts(
+    const xla::Computation& computation, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const std::function<void(const Literal& actual,
+                             const string& error_message)>& verify_output) {
+  // Try with no layout requirement.
+  TF_ASSIGN_OR_RETURN(auto actual, ExecuteAndTransfer(computation, arguments));
+  verify_output(*actual, "");
+
+  // Try with all output layouts.
+  std::vector<int64> minor_to_major(ShapeUtil::Rank(expected.shape()));
+  std::iota(minor_to_major.begin(), minor_to_major.end(), 0);
+  do {
+    auto layout = ShapeUtil::MakeShapeWithLayout(
+        expected.shape().element_type(),
+        AsInt64Slice(expected.shape().dimensions()), minor_to_major);
+    TF_ASSIGN_OR_RETURN(auto actual,
+                        ExecuteAndTransfer(computation, arguments, &layout));
+    verify_output(*actual, tensorflow::strings::StrCat(
+                               "Test with output layout: ",
+                               ShapeUtil::HumanStringWithLayout(layout)));
+  } while (std::next_permutation(minor_to_major.begin(), minor_to_major.end()));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status
+ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
+    const xla::Computation& computation, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const std::function<void(const Literal& actual,
+                             const string& error_message)>& verify_output,
+    const Shape* output_with_layout) {
+  std::vector<GlobalData*> arguments_with_layout;
+  std::vector<string> layout_strings;
+  // This is a recursive function. It's an std::function instead of a lambda
+  // because it needs to capture itself. The index is the index of the argument
+  // to try all layouts for.
+  std::function<tensorflow::Status(int64)> choose;
+  choose = [&, this](int64 index) -> tensorflow::Status {
+    if (index < arguments.size()) {
+      // Try out all layouts for the operand.
+      TF_ASSIGN_OR_RETURN(auto literal,
+                          client_->Transfer(*arguments[index], nullptr));
+      // Skip tuples because they don't have a rank.
+      if (ShapeUtil::IsTuple(literal->shape())) {
+        layout_strings.push_back(
+            ShapeUtil::HumanStringWithLayout(literal->shape()));
+        arguments_with_layout.push_back(arguments[index]);
+        TF_RETURN_IF_ERROR(choose(index + 1));
+        arguments_with_layout.pop_back();
+        layout_strings.pop_back();
+        return tensorflow::Status::OK();
+      }
+
+      std::vector<int64> minor_to_major(ShapeUtil::Rank(literal->shape()));
+      std::iota(minor_to_major.begin(), minor_to_major.end(), 0);
+      do {
+        auto literal_relayout =
+            literal->Relayout(LayoutUtil::MakeLayout(minor_to_major));
+        layout_strings.push_back(
+            ShapeUtil::HumanStringWithLayout(literal_relayout->shape()));
+        TF_ASSIGN_OR_RETURN(auto data,
+                            client_->TransferToServer(*literal_relayout));
+        arguments_with_layout.push_back(data.get());
+        TF_RETURN_IF_ERROR(choose(index + 1));
+        arguments_with_layout.pop_back();
+        layout_strings.pop_back();
+      } while (
+          std::next_permutation(minor_to_major.begin(), minor_to_major.end()));
+      return tensorflow::Status::OK();
+    }
+
+    // Every argument has an assigned layout.
+    TF_ASSIGN_OR_RETURN(
+        auto actual,
+        ExecuteAndTransfer(
+            computation,
+            tensorflow::gtl::ArraySlice<GlobalData*>(arguments_with_layout),
+            output_with_layout));
+    string error_message = "Test with input layouts: ";
+    for (const auto& str : layout_strings) {
+      tensorflow::strings::StrAppend(&error_message, str, " ");
+    }
+    verify_output(*actual, error_message);
+    return tensorflow::Status::OK();
+  };
+
+  return choose(0);
+}
+
 tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
     ComputationBuilder* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments,
     const Shape* shape_with_layout) {
-  TF_ASSIGN_OR_RETURN(
-      auto actual, ExecuteAndTransfer(builder, arguments, shape_with_layout));
+  TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
   if (ShapeUtil::ElementIsFloating(expected.shape())) {
     LOG(WARNING) << "performing exact comparison of floating point numbers";
   } else {
     TF_RET_CHECK(ShapeUtil::ElementIsIntegral(expected.shape()) ||
                  expected.shape().element_type() == PRED);
   }
+  auto expect_equal = [&](const Literal& actual, const string& error_message) {
+    LiteralTestUtil::ExpectEqual(expected, actual, error_message);
+  };
+  if (execution_options_.debug_options().xla_test_all_output_layouts()) {
+    return ComputeAndCompareLiteralWithAllOutputLayouts(
+        computation, expected, arguments, expect_equal);
+  }
+  if (execution_options_.debug_options().xla_test_all_input_layouts()) {
+    return ComputeAndCompareLiteralWithAllInputLayouts(
+        computation, expected, arguments, expect_equal, shape_with_layout);
+  }
+  TF_ASSIGN_OR_RETURN(auto actual, ExecuteAndTransfer(computation, arguments,
+                                                      shape_with_layout));
   LiteralTestUtil::ExpectEqual(expected, *actual);
   return tensorflow::Status::OK();
 }
@@ -161,9 +287,21 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
     ComputationBuilder* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error,
     const Shape* shape_with_layout) {
-  TF_ASSIGN_OR_RETURN(
-      auto actual, ExecuteAndTransfer(builder, arguments, shape_with_layout));
   TF_RET_CHECK(ShapeUtil::ElementIsFloating(expected.shape()));
+  TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
+  auto expect_near = [&](const Literal& actual, const string& error_message) {
+    LiteralTestUtil::ExpectNear(expected, actual, error, error_message);
+  };
+  if (execution_options_.debug_options().xla_test_all_output_layouts()) {
+    return ComputeAndCompareLiteralWithAllOutputLayouts(computation, expected,
+                                                        arguments, expect_near);
+  }
+  if (execution_options_.debug_options().xla_test_all_input_layouts()) {
+    return ComputeAndCompareLiteralWithAllInputLayouts(
+        computation, expected, arguments, expect_near, shape_with_layout);
+  }
+  TF_ASSIGN_OR_RETURN(auto actual, ExecuteAndTransfer(computation, arguments,
+                                                      shape_with_layout));
   LiteralTestUtil::ExpectNear(expected, *actual, error);
   return tensorflow::Status::OK();
 }
@@ -179,10 +317,10 @@ void ClientLibraryTestBase::ComputeAndCompareR1U8(
   auto actual = actual_status.ConsumeValueOrDie();
 
   // Turn the expected value into a literal.
-  std::unique_ptr<Literal> expected_literal = LiteralUtil::CreateR1U8(expected);
+  std::unique_ptr<Literal> expected_literal = Literal::CreateR1U8(expected);
 
-  VLOG(1) << "expected: " << LiteralUtil::ToString(*expected_literal);
-  VLOG(1) << "actual:   " << LiteralUtil::ToString(*actual);
+  VLOG(1) << "expected: " << expected_literal->ToString();
+  VLOG(1) << "actual:   " << actual->ToString();
 
   EXPECT_EQ(expected, actual->u8s_string());
 }
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index f9e1082ebb43ae112c417ff9a71ef8d38b5de900..19c179c4ba250e055912899db42a3e64cbfa9001 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
+#include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
@@ -48,6 +49,10 @@ class ClientLibraryTestBase : public ::testing::Test {
   explicit ClientLibraryTestBase(
       perftools::gputools::Platform* platform = nullptr);
 
+  // Creates a new ClientLibraryTestBase with custom client options.
+  ClientLibraryTestBase(perftools::gputools::Platform* platform,
+                        const LocalClientOptions& client_options);
+
   // Returns the name of the test currently being run.
   string TestName() const;
 
@@ -66,14 +71,23 @@ class ClientLibraryTestBase : public ::testing::Test {
 
   // TODO(b/25566808): Add helper that populates a literal from a testdata file.
 
-  // Convenience methods for building and running a computation from a builder.
+  // Convenience methods for building and running a computation with the member
+  // execution options. Modify execution_options_ in your test if you want to
+  // customize the options.
   StatusOr<std::unique_ptr<GlobalData>> Execute(
       ComputationBuilder* builder,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+  StatusOr<ExecutionHandle> ExecuteAsync(
+      const Computation& computation,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments);
   StatusOr<std::unique_ptr<Literal>> ExecuteAndTransfer(
       ComputationBuilder* builder,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments,
       const Shape* shape_with_output_layout = nullptr);
+  StatusOr<std::unique_ptr<Literal>> ExecuteAndTransfer(
+      const Computation& computation,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      const Shape* shape_with_output_layout = nullptr);
 
   // Convenience OrDie variants of above methods.
   std::unique_ptr<GlobalData> ExecuteOrDie(
@@ -271,6 +285,22 @@ class ClientLibraryTestBase : public ::testing::Test {
 
   Client* client_;
   ExecutionOptions execution_options_;
+
+ private:
+  // Build and run the computation with all permutations of output layouts.
+  tensorflow::Status ComputeAndCompareLiteralWithAllOutputLayouts(
+      const xla::Computation& computation, const Literal& expected,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      const std::function<void(const Literal& actual,
+                               const string& error_message)>& verify_output);
+  // Build and run the computation with all permutations of layouts of all input
+  // arguments.
+  tensorflow::Status ComputeAndCompareLiteralWithAllInputLayouts(
+      const xla::Computation& computation, const Literal& expected,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      const std::function<void(const Literal& actual,
+                               const string& error_message)>& verify_output,
+      const Shape* output_with_layout = nullptr);
 };
 
 template <typename NativeT>
@@ -278,7 +308,7 @@ void ClientLibraryTestBase::ComputeAndCompareR0(
     ComputationBuilder* builder, NativeT expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   std::unique_ptr<Literal> expected_literal =
-      LiteralUtil::CreateR0<NativeT>(expected);
+      Literal::CreateR0<NativeT>(expected);
   ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal,
                                                   arguments);
 }
@@ -291,7 +321,7 @@ void ClientLibraryTestBase::ComputeAndCompareR0(
                     std::is_same<NativeT, double>::value,
                 "Floating point type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
-      LiteralUtil::CreateR0<NativeT>(expected);
+      Literal::CreateR0<NativeT>(expected);
   ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal,
                                                   arguments, error);
 }
@@ -301,7 +331,7 @@ void ClientLibraryTestBase::ComputeAndCompareR1(
     ComputationBuilder* builder, tensorflow::gtl::ArraySlice<NativeT> expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   std::unique_ptr<Literal> expected_literal =
-      LiteralUtil::CreateR1<NativeT>(expected);
+      Literal::CreateR1<NativeT>(expected);
   ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal,
                                                   arguments);
 }
@@ -314,7 +344,7 @@ void ClientLibraryTestBase::ComputeAndCompareR1(
                     std::is_same<NativeT, double>::value,
                 "Floating point type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
-      LiteralUtil::CreateR1<NativeT>(expected);
+      Literal::CreateR1<NativeT>(expected);
   ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal,
                                                   arguments, error);
 }
@@ -324,7 +354,7 @@ void ClientLibraryTestBase::ComputeAndCompareR2(
     ComputationBuilder* builder, const Array2D<NativeT>& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   std::unique_ptr<Literal> expected_literal =
-      LiteralUtil::CreateR2FromArray2D<NativeT>(expected);
+      Literal::CreateR2FromArray2D<NativeT>(expected);
   ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal,
                                                   arguments);
 }
@@ -337,7 +367,7 @@ void ClientLibraryTestBase::ComputeAndCompareR2(
                     std::is_same<NativeT, double>::value,
                 "Floating point type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
-      LiteralUtil::CreateR2FromArray2D<NativeT>(expected);
+      Literal::CreateR2FromArray2D<NativeT>(expected);
   ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal,
                                                   arguments, error);
 }
@@ -347,7 +377,7 @@ void ClientLibraryTestBase::ComputeAndCompareR3(
     ComputationBuilder* builder, const Array3D<NativeT>& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   std::unique_ptr<Literal> expected_literal =
-      LiteralUtil::CreateR3FromArray3D<NativeT>(expected);
+      Literal::CreateR3FromArray3D<NativeT>(expected);
   ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal,
                                                   arguments);
 }
@@ -360,7 +390,7 @@ void ClientLibraryTestBase::ComputeAndCompareR3(
                     std::is_same<NativeT, double>::value,
                 "Floating point type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
-      LiteralUtil::CreateR3FromArray3D<NativeT>(expected);
+      Literal::CreateR3FromArray3D<NativeT>(expected);
   ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal,
                                                   arguments, error);
 }
@@ -370,7 +400,7 @@ void ClientLibraryTestBase::ComputeAndCompareR4(
     ComputationBuilder* builder, const Array4D<NativeT>& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   std::unique_ptr<Literal> expected_literal =
-      LiteralUtil::CreateR4FromArray4D<NativeT>(expected);
+      Literal::CreateR4FromArray4D<NativeT>(expected);
   ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal,
                                                   arguments);
 }
@@ -383,7 +413,7 @@ void ClientLibraryTestBase::ComputeAndCompareR4(
                     std::is_same<NativeT, double>::value,
                 "Floating point type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
-      LiteralUtil::CreateR4FromArray4D<NativeT>(expected);
+      Literal::CreateR4FromArray4D<NativeT>(expected);
   ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal,
                                                   arguments, error);
 }
@@ -392,7 +422,7 @@ template <typename NativeT>
 std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR0Parameter(
     NativeT value, int64 parameter_number, const string& name,
     ComputationBuilder* builder, ComputationDataHandle* data_handle) {
-  std::unique_ptr<Literal> literal = LiteralUtil::CreateR0(value);
+  std::unique_ptr<Literal> literal = Literal::CreateR0(value);
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
   *data_handle = builder->Parameter(parameter_number, literal->shape(), name);
@@ -404,7 +434,7 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR1Parameter(
     tensorflow::gtl::ArraySlice<NativeT> values, int64 parameter_number,
     const string& name, ComputationBuilder* builder,
     ComputationDataHandle* data_handle) {
-  std::unique_ptr<Literal> literal = LiteralUtil::CreateR1(values);
+  std::unique_ptr<Literal> literal = Literal::CreateR1(values);
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
   *data_handle = builder->Parameter(parameter_number, literal->shape(), name);
@@ -416,7 +446,7 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR2Parameter(
     const Array2D<NativeT>& array_2d, int64 parameter_number,
     const string& name, ComputationBuilder* builder,
     ComputationDataHandle* data_handle) {
-  std::unique_ptr<Literal> literal = LiteralUtil::CreateR2FromArray2D(array_2d);
+  std::unique_ptr<Literal> literal = Literal::CreateR2FromArray2D(array_2d);
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
   *data_handle = builder->Parameter(parameter_number, literal->shape(), name);
@@ -428,7 +458,7 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR3Parameter(
     const Array3D<NativeT>& array_3d, int64 parameter_number,
     const string& name, ComputationBuilder* builder,
     ComputationDataHandle* data_handle) {
-  std::unique_ptr<Literal> literal = LiteralUtil::CreateR3FromArray3D(array_3d);
+  std::unique_ptr<Literal> literal = Literal::CreateR3FromArray3D(array_3d);
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
   *data_handle = builder->Parameter(parameter_number, literal->shape(), name);
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index 1247804dae0effd387d5f276a3d64667bc69e18b..e84a6ce710229043c903c5e50daf33e2f93fa6da 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -47,7 +46,7 @@ TEST_F(ClientTest, ExecuteWithLayout) {
       auto computation = b.Build();
       ASSERT_TRUE(computation.ok()) << computation.status();
 
-      ExecutionOptions execution_options;
+      ExecutionOptions execution_options = execution_options_;
       *execution_options.mutable_shape_with_output_layout() =
           ShapeUtil::MakeShapeWithLayout(S32, /*dimensions=*/{2, 2},
                                          execute_layout);
@@ -77,7 +76,7 @@ TEST_F(ClientTest, ExecuteWithTupleLayout) {
   auto computation = b.Build();
   ASSERT_TRUE(computation.ok()) << computation.status();
 
-  ExecutionOptions execution_options;
+  ExecutionOptions execution_options = execution_options_;
   // Create a result shape with one element column major and the other row
   // major.
   *execution_options.mutable_shape_with_output_layout() =
@@ -115,7 +114,6 @@ TEST_F(ClientTest, ExecuteWithTupleLayout) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/codegen_test_base.cc b/tensorflow/compiler/xla/tests/codegen_test_base.cc
index cc3eb0e8d46a8ab13553cb78f58bfc48b16ee862..90767c4a17478d4e7edd6202a8629db5b115381d 100644
--- a/tensorflow/compiler/xla/tests/codegen_test_base.cc
+++ b/tensorflow/compiler/xla/tests/codegen_test_base.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <stdlib.h>
 #include <utility>
 
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
@@ -32,6 +33,20 @@ limitations under the License.
 
 namespace xla {
 
+std::unique_ptr<HloModule> CodegenTestBase::CreateNewModuleWithEmbeddedIr(
+    bool ftz) {
+  HloModuleConfig config;
+  auto debug_options = legacy_flags::GetDebugOptionsFromFlags();
+  debug_options.set_xla_embed_ir_in_executable(true);
+  debug_options.set_xla_gpu_ftz(ftz);
+  // TODO(b/38354253): Change tests to use Parameters instead of Constants.
+  debug_options.add_xla_disable_hlo_passes("constant_folding");
+  config.set_debug_options(debug_options);
+
+  return MakeUnique<HloModule>(TestName(), VersionedComputationHandle(),
+                               config);
+}
+
 void CodegenTestBase::CompileAndVerifyIr(std::unique_ptr<HloModule> hlo_module,
                                          const string& pattern) {
   std::unique_ptr<Executable> executable =
@@ -43,8 +58,7 @@ void CodegenTestBase::CompileAndVerifyIr(std::unique_ptr<HloModule> hlo_module,
 std::unique_ptr<Executable> CodegenTestBase::CompileToExecutable(
     std::unique_ptr<HloModule> hlo_module) {
   return backend_->compiler()
-      ->Compile(std::move(hlo_module), test_hlo_dumper_,
-                backend_->default_stream_executor())
+      ->Compile(std::move(hlo_module), backend_->default_stream_executor())
       .ConsumeValueOrDie();
 }
 
diff --git a/tensorflow/compiler/xla/tests/codegen_test_base.h b/tensorflow/compiler/xla/tests/codegen_test_base.h
index 50c0453107095c5fdb6238c88a17b31728b6bf22..fa073cd91ee07462d7aaf40789e87dbc831da95e 100644
--- a/tensorflow/compiler/xla/tests/codegen_test_base.h
+++ b/tensorflow/compiler/xla/tests/codegen_test_base.h
@@ -28,7 +28,11 @@ namespace xla {
 // Tests that verify IR emitted by the CPU/GPU backend is as expected.
 class CodegenTestBase : public HloTestBase {
  protected:
-  CodegenTestBase() {}
+  // Like HloTestBase::CreateNewModule, but also sets the "embed ir in
+  // executable" flag to true, since this is needed for codegen tests.
+  // The optional ftz flags configures whether these modules have their ftz
+  // option turned on.
+  std::unique_ptr<HloModule> CreateNewModuleWithEmbeddedIr(bool ftz = false);
 
   // Returns the embedded LLVM IR from the given executable. Codegen tests must
   // override this method, but execution tests do not have to because they do
diff --git a/tensorflow/compiler/xla/tests/compilation_cache_test.cc b/tensorflow/compiler/xla/tests/compilation_cache_test.cc
index 18ea9714d1a8f5f5b127881f657e948d65003ab1..7038afc5b1f5dd388731ae82586fe24ac5476e8b 100644
--- a/tensorflow/compiler/xla/tests/compilation_cache_test.cc
+++ b/tensorflow/compiler/xla/tests/compilation_cache_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -48,10 +47,10 @@ class CompilationCacheTest : public ClientLibraryTestBase {
     std::unique_ptr<Literal> result =
         client_
             ->ExecuteAndTransfer(computation, arguments,
-                                 /*execution_options=*/nullptr,
+                                 /*execution_options=*/&execution_options_,
                                  &execution_profile)
             .ConsumeValueOrDie();
-    LiteralTestUtil::ExpectNear(*LiteralUtil::CreateR0<float>(expected_result),
+    LiteralTestUtil::ExpectNear(*Literal::CreateR0<float>(expected_result),
                                 *result, error_spec_);
     EXPECT_EQ(expect_cache_hit, execution_profile.compilation_cache_hit());
   }
@@ -62,14 +61,13 @@ class CompilationCacheTest : public ClientLibraryTestBase {
       std::initializer_list<std::initializer_list<float>> expected_result,
       bool expect_cache_hit) {
     ExecutionProfile execution_profile;
-    auto data_handle =
-        client_
-            ->Execute(computation, arguments, /*execution_options=*/nullptr,
-                      &execution_profile)
-            .ConsumeValueOrDie();
+    auto data_handle = client_
+                           ->Execute(computation, arguments,
+                                     &execution_options_, &execution_profile)
+                           .ConsumeValueOrDie();
     std::unique_ptr<Literal> result =
         client_->Transfer(*data_handle).ConsumeValueOrDie();
-    LiteralTestUtil::ExpectNear(*LiteralUtil::CreateR2<float>(expected_result),
+    LiteralTestUtil::ExpectNear(*Literal::CreateR2<float>(expected_result),
                                 *result, error_spec_);
     EXPECT_EQ(expect_cache_hit, execution_profile.compilation_cache_hit());
   }
@@ -89,13 +87,13 @@ XLA_TEST_F(CompilationCacheTest, ComputationCalledMultipleTimes) {
 
 XLA_TEST_F(CompilationCacheTest, ComputationCalledWithDifferentParameters) {
   std::unique_ptr<GlobalData> data_42 =
-      client_->TransferToServer(*LiteralUtil::CreateR0<float>(42.0f))
+      client_->TransferToServer(*Literal::CreateR0<float>(42.0f))
           .ConsumeValueOrDie();
   std::unique_ptr<GlobalData> data_123 =
-      client_->TransferToServer(*LiteralUtil::CreateR0<float>(123.0f))
+      client_->TransferToServer(*Literal::CreateR0<float>(123.0f))
           .ConsumeValueOrDie();
   std::unique_ptr<GlobalData> data_456 =
-      client_->TransferToServer(*LiteralUtil::CreateR0<float>(456.0f))
+      client_->TransferToServer(*Literal::CreateR0<float>(456.0f))
           .ConsumeValueOrDie();
 
   ComputationBuilder builder(client_, TestName());
@@ -205,7 +203,6 @@ XLA_TEST_F(CompilationCacheTest, MutatedComputation) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index 13c78fb16331340ae9b3586ac47a071230b73a83..4384c9b31495437db10744ea2b98b5b0b05b7ae4 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -86,7 +85,7 @@ class ComputeConstantTest : public ::testing::Test {
                                          ComputationBuilder* builder) {
     TF_ASSIGN_OR_RETURN(auto literal,
                         ComputeConstantLiteral(client, operand, builder));
-    return LiteralUtil::Get<Scalar>(*literal, {});
+    return literal->Get<Scalar>({});
   }
 
   bool IsConstant(const ComputationDataHandle& operand,
@@ -211,7 +210,7 @@ TEST_F(ComputeConstantTest, NonScalarAdd) {
     auto computed = ComputeConstantLiteral(client, computation, &b);
     ASSERT_TRUE(computed.ok()) << computed.status();
     std::unique_ptr<Literal> expected_literal =
-        LiteralUtil::CreateR1<int32>({4, 6});
+        Literal::CreateR1<int32>({4, 6});
     LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
   }
 }
@@ -225,7 +224,7 @@ TEST_F(ComputeConstantTest, IntegerDivide) {
 
     auto computed = ComputeConstantLiteral(client, computation, &b);
     ASSERT_TRUE(computed.ok()) << computed.status();
-    std::unique_ptr<Literal> expected_literal = LiteralUtil::CreateR0<int32>(5);
+    std::unique_ptr<Literal> expected_literal = Literal::CreateR0<int32>(5);
     LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
   }
 }
@@ -291,7 +290,6 @@ TEST_F(ComputeConstantTest, DISABLED_ON_CPU(ReuseComputedConstant)) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/concat_test.cc b/tensorflow/compiler/xla/tests/concat_test.cc
index a7034930bc9493dfc4931a77c05cf87e4d138173..c5d88ad6a08476731b5b09cb4ae16a3e76bbaf98 100644
--- a/tensorflow/compiler/xla/tests/concat_test.cc
+++ b/tensorflow/compiler/xla/tests/concat_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -518,8 +517,8 @@ TEST_P(ConcatR2BinaryTest, DoIt) {
 //     concat
 XLA_TEST_F(ConcatTest, ConcatOperandsOfSameOperand) {
   auto f32_scalar = ShapeUtil::MakeShape(xla::F32, {});
-  auto x_literal = LiteralUtil::CreateR0<float>(2.f);
-  auto y_literal = LiteralUtil::CreateR0<float>(3.f);
+  auto x_literal = Literal::CreateR0<float>(2.f);
+  auto y_literal = Literal::CreateR0<float>(3.f);
   auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie();
   auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie();
 
@@ -540,9 +539,9 @@ XLA_TEST_F(ConcatTest, ConcatOperandsOfSameOperand) {
 // produces the correct result in rank 1.
 XLA_TEST_F(ConcatTest, ConcatBroadcastArgument) {
   auto f32_scalar = ShapeUtil::MakeShape(xla::F32, {});
-  auto x_literal = LiteralUtil::CreateR1<float>({2.0f, 3.0f, 5.0f, 6.0f});
-  auto y_literal = LiteralUtil::CreateR0<float>(1.5f);
-  auto z_literal = LiteralUtil::CreateR0<float>(5.5f);
+  auto x_literal = Literal::CreateR1<float>({2.0f, 3.0f, 5.0f, 6.0f});
+  auto y_literal = Literal::CreateR0<float>(1.5f);
+  auto z_literal = Literal::CreateR0<float>(5.5f);
   auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie();
   auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie();
   auto z_data = client_->TransferToServer(*z_literal).ConsumeValueOrDie();
@@ -568,9 +567,9 @@ XLA_TEST_F(ConcatTest, ConcatBroadcastArgument) {
 XLA_TEST_F(ConcatTest, ConcatBroadcastArgumentR3) {
   auto f32_scalar = ShapeUtil::MakeShape(xla::F32, {});
   Array3D<float> x3d(3, 5, 7, 3.14f);
-  auto x_literal = LiteralUtil::CreateR3FromArray3D<float>(x3d);
-  auto y_literal = LiteralUtil::CreateR0<float>(1.5f);
-  auto z_literal = LiteralUtil::CreateR0<float>(5.5f);
+  auto x_literal = Literal::CreateR3FromArray3D<float>(x3d);
+  auto y_literal = Literal::CreateR0<float>(1.5f);
+  auto z_literal = Literal::CreateR0<float>(5.5f);
   auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie();
   auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie();
   auto z_data = client_->TransferToServer(*z_literal).ConsumeValueOrDie();
@@ -607,7 +606,6 @@ INSTANTIATE_TEST_CASE_P(ConcatR2BinaryTestInstantiation, ConcatR2BinaryTest,
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/constants_test.cc b/tensorflow/compiler/xla/tests/constants_test.cc
index 1c065de8ba7663ac2e7b3dcd52298e6587d993f0..7c276c8c8d0c0e97b0dfba7a5d6a6165386e5261 100644
--- a/tensorflow/compiler/xla/tests/constants_test.cc
+++ b/tensorflow/compiler/xla/tests/constants_test.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -113,7 +112,7 @@ TEST_F(ConstantsTest, Small_2x2) {
 TEST_F(ConstantsTest, Empty_3x0x2) {
   ComputationBuilder builder(client_, TestName());
   auto constant = builder.ConstantLiteral(
-      *LiteralUtil::CreateR3FromArray3D<float>(Array3D<float>(3, 0, 2)));
+      *Literal::CreateR3FromArray3D<float>(Array3D<float>(3, 0, 2)));
 
   ComputeAndCompareR3<float>(&builder, Array3D<float>(3, 0, 2), {});
 }
@@ -128,8 +127,8 @@ TEST_F(ConstantsTest, Small_2x2x2) {
       {{5.f, 6.f},   // y0
        {7.f, 8.f}},  // y1
   });
-  auto constant = builder.ConstantLiteral(
-      *LiteralUtil::CreateR3FromArray3D<float>(array3d));
+  auto constant =
+      builder.ConstantLiteral(*Literal::CreateR3FromArray3D<float>(array3d));
 
   ComputeAndCompareR3<float>(&builder, array3d, {});
 }
@@ -143,7 +142,7 @@ TEST_F(ConstantsTest, Small_3x2x1x1) {
       {5.0f, 4.4f},   // p2
   });
   input_array.FillWithPZ(pz);
-  Literal input_literal = *LiteralUtil::CreateR4FromArray4D(input_array);
+  Literal input_literal = *Literal::CreateR4FromArray4D(input_array);
 
   {
     ComputationBuilder builder(client_, TestName());
@@ -161,9 +160,9 @@ TEST_F(ConstantsTest, Small_3x2x1x1) {
 // TODO(b/29263943): Support tuple constants.
 TEST_F(ConstantsTest, DISABLED_TupleConstant) {
   ComputationBuilder builder(client_, TestName());
-  builder.ConstantLiteral(*LiteralUtil::MakeTuple(
-      {LiteralUtil::CreateR2<float>({{1.0}, {2.0}}).get(),
-       LiteralUtil::CreateR1<float>({2.0, 42}).get()}));
+  builder.ConstantLiteral(
+      *Literal::MakeTuple({Literal::CreateR2<float>({{1.0}, {2.0}}).get(),
+                           Literal::CreateR1<float>({2.0, 42}).get()}));
 
   std::unique_ptr<Literal> result = ExecuteAndTransferOrDie(&builder, {});
 
@@ -179,7 +178,6 @@ TEST_F(ConstantsTest, DISABLED_TupleConstant) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/convert_test.cc b/tensorflow/compiler/xla/tests/convert_test.cc
index 6d3797972507f2c17b545c612c0dd839212e5ae5..2d181938ded0804776847772d4bb58bbc5e334f4 100644
--- a/tensorflow/compiler/xla/tests/convert_test.cc
+++ b/tensorflow/compiler/xla/tests/convert_test.cc
@@ -20,7 +20,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -70,6 +69,24 @@ TEST_F(ConvertTest, ConvertR1S32ToR1F32) {
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
+TEST_F(ConvertTest, ConvertR1PREDToR1S32) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<bool>({true, false, true});
+  builder.ConvertElementType(a, S32);
+
+  std::vector<int32> expected = {1, 0, 1};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+TEST_F(ConvertTest, ConvertR1PREDToR1F32) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<bool>({true, false, true});
+  builder.ConvertElementType(a, F32);
+
+  std::vector<float> expected = {1., 0., 1.};
+  ComputeAndCompareR1<float>(&builder, expected, {});
+}
+
 XLA_TEST_F(ConvertTest, ConvertR1S0S32ToR1S0F32) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<int32>({});
@@ -197,7 +214,6 @@ TEST_F(ConvertTest, ConvertReshape) {
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
diff --git a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
index 0b09416a74771a8a9df804dcae783dc220420fc2..fb50d9b0ebf5b4a6c9d244f699620e2dcb74acaf 100644
--- a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -63,8 +62,7 @@ XLA_TEST_F(ConvolutionDimensionNumbersTest,
   auto weight_array = MakeUnique<Array4D<float>>(4, 3, 1, 1);
   weight_array->FillWithMultiples(0.2);
   auto weight_data =
-      client_
-          ->TransferToServer(*LiteralUtil::CreateR4FromArray4D(*weight_array))
+      client_->TransferToServer(*Literal::CreateR4FromArray4D(*weight_array))
           .ConsumeValueOrDie();
 
   ComputationBuilder builder(client_, TestName());
@@ -102,7 +100,6 @@ XLA_TEST_F(ConvolutionDimensionNumbersTest,
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index ec19469fa66c16cff3d1349b7ccc1d0de94d0b54..a110082f9a52ded5e836fa835e82f790e05df0e0 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
@@ -115,10 +114,10 @@ TEST_F(ConvolutionTest, Convolve_1x1x1x2_1x1x1x2_Valid) {
       ReferenceUtil::ConvArray4D(input, filter, {1, 1}, Padding::kValid);
 
   auto input_literal =
-      client_->TransferToServer(*LiteralUtil::CreateR4FromArray4D(input))
+      client_->TransferToServer(*Literal::CreateR4FromArray4D(input))
           .ConsumeValueOrDie();
   auto filter_literal =
-      client_->TransferToServer(*LiteralUtil::CreateR4FromArray4D(filter))
+      client_->TransferToServer(*Literal::CreateR4FromArray4D(filter))
           .ConsumeValueOrDie();
 
   ComputeAndCompareR4<float>(&builder, *aexpected,
@@ -158,10 +157,10 @@ TEST_F(ConvolutionTest, Convolve_1x1x4x4_1x1x2x2_Valid) {
       ReferenceUtil::ConvArray4D(input, filter, {1, 1}, Padding::kValid);
 
   auto input_literal =
-      client_->TransferToServer(*LiteralUtil::CreateR4FromArray4D(input))
+      client_->TransferToServer(*Literal::CreateR4FromArray4D(input))
           .ConsumeValueOrDie();
   auto filter_literal =
-      client_->TransferToServer(*LiteralUtil::CreateR4FromArray4D(filter))
+      client_->TransferToServer(*Literal::CreateR4FromArray4D(filter))
           .ConsumeValueOrDie();
 
   ComputeAndCompareR4<float>(&builder, *aexpected,
@@ -201,10 +200,10 @@ TEST_F(ConvolutionTest, Convolve_1x1x4x4_1x1x2x2_Same) {
       ReferenceUtil::ConvArray4D(input, filter, {1, 1}, Padding::kSame);
 
   auto input_literal =
-      client_->TransferToServer(*LiteralUtil::CreateR4FromArray4D(input))
+      client_->TransferToServer(*Literal::CreateR4FromArray4D(input))
           .ConsumeValueOrDie();
   auto filter_literal =
-      client_->TransferToServer(*LiteralUtil::CreateR4FromArray4D(filter))
+      client_->TransferToServer(*Literal::CreateR4FromArray4D(filter))
           .ConsumeValueOrDie();
 
   ComputeAndCompareR4<float>(&builder, *aexpected,
@@ -246,10 +245,10 @@ TEST_F(ConvolutionTest, Convolve_1x1x4x4_1x1x3x3_Same) {
       ReferenceUtil::ConvArray4D(input, filter, {1, 1}, Padding::kSame);
 
   auto input_literal =
-      client_->TransferToServer(*LiteralUtil::CreateR4FromArray4D(input))
+      client_->TransferToServer(*Literal::CreateR4FromArray4D(input))
           .ConsumeValueOrDie();
   auto filter_literal =
-      client_->TransferToServer(*LiteralUtil::CreateR4FromArray4D(filter))
+      client_->TransferToServer(*Literal::CreateR4FromArray4D(filter))
           .ConsumeValueOrDie();
 
   ComputeAndCompareR4<float>(&builder, *aexpected,
@@ -273,10 +272,10 @@ XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_Valid) {
   Array3D<float> expected({{{510, 610, 710, 810}}});
 
   auto input_literal =
-      client_->TransferToServer(*LiteralUtil::CreateR3FromArray3D(input))
+      client_->TransferToServer(*Literal::CreateR3FromArray3D(input))
           .ConsumeValueOrDie();
   auto filter_literal =
-      client_->TransferToServer(*LiteralUtil::CreateR3FromArray3D(filter))
+      client_->TransferToServer(*Literal::CreateR3FromArray3D(filter))
           .ConsumeValueOrDie();
 
   ComputeAndCompareR3<float>(&builder, expected,
@@ -313,21 +312,18 @@ XLA_TEST_F(ConvolutionTest, Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid) {
 
   std::vector<float> input_elems(ShapeUtil::ElementsIn(input_shape));
   std::iota(input_elems.begin(), input_elems.end(), 1.0f);
-  auto input_r1 = LiteralUtil::CreateR1<float>(input_elems);
-  auto input_r5 =
-      LiteralUtil::Reshape(*input_r1, input_dims).ConsumeValueOrDie();
+  auto input_r1 = Literal::CreateR1<float>(input_elems);
+  auto input_r5 = input_r1->Reshape(input_dims).ConsumeValueOrDie();
 
   std::vector<float> filter_elems(ShapeUtil::ElementsIn(filter_shape));
   std::iota(filter_elems.begin(), filter_elems.end(), 1.0f);
-  auto filter_r1 = LiteralUtil::CreateR1<float>(filter_elems);
-  auto filter_r5 =
-      LiteralUtil::Reshape(*filter_r1, filter_dims).ConsumeValueOrDie();
+  auto filter_r1 = Literal::CreateR1<float>(filter_elems);
+  auto filter_r5 = filter_r1->Reshape(filter_dims).ConsumeValueOrDie();
 
-  auto expected_r1 = LiteralUtil::CreateR1<float>(
+  auto expected_r1 = Literal::CreateR1<float>(
       {19554, 19962, 20370, 22110, 22590, 23070, 34890, 35730, 36570, 37446,
        38358, 39270, 50226, 51498, 52770, 52782, 54126, 55470});
-  auto expected_r5 =
-      LiteralUtil::Reshape(*expected_r1, {1, 3, 1, 2, 3}).ConsumeValueOrDie();
+  auto expected_r5 = expected_r1->Reshape({1, 3, 1, 2, 3}).ConsumeValueOrDie();
 
   auto input_literal = client_->TransferToServer(*input_r5).ConsumeValueOrDie();
   auto filter_literal =
@@ -344,7 +340,6 @@ XLA_TEST_F(ConvolutionTest, Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/convolution_variants_test.cc b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
index b5afc2498dace11c57a7099e9a3d32eb2a387984..c8e74aa01a50042b1e5297920cc184b1eeb51fd3 100644
--- a/tensorflow/compiler/xla/tests/convolution_variants_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
@@ -28,7 +28,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -1312,20 +1311,19 @@ TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding1D) {
 TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding3D) {
   ComputationBuilder builder(client_, TestName());
 
-  auto gradients_flat = LiteralUtil::CreateR1<float>({1});
+  auto gradients_flat = Literal::CreateR1<float>({1});
   auto gradients_literal =
-      LiteralUtil::Reshape(*gradients_flat, {1, 1, 1, 1, 1})
-          .ConsumeValueOrDie();
+      gradients_flat->Reshape({1, 1, 1, 1, 1}).ConsumeValueOrDie();
   auto gradients = builder.ConstantLiteral(*gradients_literal);
 
-  auto weights_flat = LiteralUtil::CreateR1<float>({1, 10, 100});
+  auto weights_flat = Literal::CreateR1<float>({1, 10, 100});
   auto weights_literal =
-      LiteralUtil::Reshape(*weights_flat, {1, 1, 1, 1, 3}).ConsumeValueOrDie();
+      weights_flat->Reshape({1, 1, 1, 1, 3}).ConsumeValueOrDie();
   auto weights = builder.ConstantLiteral(*weights_literal);
 
-  auto expected_flat = LiteralUtil::CreateR1<float>({10});
+  auto expected_flat = Literal::CreateR1<float>({10});
   auto expected_literal =
-      LiteralUtil::Reshape(*expected_flat, {1, 1, 1, 1, 1}).ConsumeValueOrDie();
+      expected_flat->Reshape({1, 1, 1, 1, 1}).ConsumeValueOrDie();
 
   auto mirrored_weights = builder.Rev(weights, {2, 3, 4});
   builder.ConvWithGeneralPadding(gradients, mirrored_weights,
@@ -1337,21 +1335,19 @@ TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding3D) {
 TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding3D) {
   ComputationBuilder builder(client_, TestName());
 
-  auto activations_flat = LiteralUtil::CreateR1<float>({1, 2, 3, 4});
+  auto activations_flat = Literal::CreateR1<float>({1, 2, 3, 4});
   auto activations_literal =
-      LiteralUtil::Reshape(*activations_flat, {1, 1, 1, 1, 4})
-          .ConsumeValueOrDie();
+      activations_flat->Reshape({1, 1, 1, 1, 4}).ConsumeValueOrDie();
   auto activations = builder.ConstantLiteral(*activations_literal);
 
-  auto gradients_flat = LiteralUtil::CreateR1<float>({100, 10, 1});
+  auto gradients_flat = Literal::CreateR1<float>({100, 10, 1});
   auto gradients_literal =
-      LiteralUtil::Reshape(*gradients_flat, {1, 1, 1, 1, 3})
-          .ConsumeValueOrDie();
+      gradients_flat->Reshape({1, 1, 1, 1, 3}).ConsumeValueOrDie();
   auto gradients = builder.ConstantLiteral(*gradients_literal);
 
-  auto expected_flat = LiteralUtil::CreateR1<float>({13, 24, 130});
+  auto expected_flat = Literal::CreateR1<float>({13, 24, 130});
   auto expected_literal =
-      LiteralUtil::Reshape(*expected_flat, {1, 1, 1, 1, 3}).ConsumeValueOrDie();
+      expected_flat->Reshape({1, 1, 1, 1, 3}).ConsumeValueOrDie();
 
   auto forward_conv = builder.ConvGeneralDilated(
       activations, gradients,
@@ -1370,7 +1366,6 @@ TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding3D) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc
index 4c2413d0fe43d486ebf306fc51601467d6ebf7fd..76ae280f1a0f309d9aa159079827a7e2c7e833d7 100644
--- a/tensorflow/compiler/xla/tests/copy_test.cc
+++ b/tensorflow/compiler/xla/tests/copy_test.cc
@@ -17,7 +17,6 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
@@ -58,39 +57,34 @@ class CopyOpTest : public HloTestBase {
                                 tensorflow::gtl::ArraySlice<int64> permutation);
 };
 
-TEST_F(CopyOpTest, CopyR0Bool) {
-  TestCopyOp(*LiteralUtil::CreateR0<bool>(true));
-}
+TEST_F(CopyOpTest, CopyR0Bool) { TestCopyOp(*Literal::CreateR0<bool>(true)); }
 
-TEST_F(CopyOpTest, CopyR1S0U32) {
-  TestCopyOp(*LiteralUtil::CreateR1<uint32>({}));
-}
+TEST_F(CopyOpTest, CopyR1S0U32) { TestCopyOp(*Literal::CreateR1<uint32>({})); }
 
 TEST_F(CopyOpTest, CopyR1S3U32) {
-  TestCopyOp(*LiteralUtil::CreateR1<uint32>({1, 2, 3}));
+  TestCopyOp(*Literal::CreateR1<uint32>({1, 2, 3}));
 }
 
 TEST_F(CopyOpTest, CopyR3F32_2x2x3) {
-  TestCopyOp(
-      *LiteralUtil::CreateR3({{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}},
-                              {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}}));
+  TestCopyOp(*Literal::CreateR3({{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}},
+                                 {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}}));
 }
 
 TEST_F(CopyOpTest, CopyR4S32_2x2x3x2) {
-  TestCopyOp(*LiteralUtil::CreateR4(
+  TestCopyOp(*Literal::CreateR4(
       {{{{1, -2}, {-4, 5}, {6, 7}}, {{8, 9}, {10, 11}, {12, 13}}},
        {{{10, 3}, {7, -2}, {3, 6}}, {{2, 5}, {-11, 5}, {-2, -5}}}}));
 }
 
 TEST_F(CopyOpTest, CopyR4S32_0x2x3x2) {
-  TestCopyOp(*LiteralUtil::CreateR4FromArray4D(Array4D<int32>(0, 2, 3, 2)));
+  TestCopyOp(*Literal::CreateR4FromArray4D(Array4D<int32>(0, 2, 3, 2)));
 }
 
 TEST_F(CopyOpTest, CopyParameterScalar) {
   auto builder = HloComputation::Builder(TestName());
 
   // Copy literal to device to use as parameter.
-  auto literal = LiteralUtil::CreateR0<float>(42.0);
+  auto literal = Literal::CreateR0<float>(42.0);
   Shape shape = literal->shape();
   auto constant_device_base = TransferToDevice(*literal);
 
@@ -112,7 +106,7 @@ TEST_F(CopyOpTest, CopyParameterScalar) {
 TEST_F(CopyOpTest, CopyConstantR2Twice) {
   auto builder = HloComputation::Builder(TestName());
 
-  auto literal = LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
+  auto literal = Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
   auto constant = builder.AddInstruction(
       HloInstruction::CreateConstant(std::move(literal)));
 
@@ -134,7 +128,7 @@ TEST_F(CopyOpTest, CopyConstantR2DifferentLayouts) {
   HloComputation::Builder builder(TestName());
 
   std::unique_ptr<Literal> literal =
-      LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
+      Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
   // Reverse the minor-to-major order of the literal.
   Layout* literal_layout = literal->mutable_shape()->mutable_layout();
   ASSERT_EQ(2, literal_layout->minor_to_major_size());
@@ -170,7 +164,7 @@ void CopyOpTest::TestCopyConstantLayout021(size_t n1, size_t n2, size_t n3) {
 
   HloComputation::Builder builder(TestName());
 
-  std::unique_ptr<Literal> literal = LiteralUtil::CreateR3FromArray3D(a);
+  std::unique_ptr<Literal> literal = Literal::CreateR3FromArray3D(a);
 
   HloInstruction* constant = builder.AddInstruction(
       HloInstruction::CreateConstant(std::move(literal)));
@@ -204,7 +198,7 @@ void CopyOpTest::TestCopyConstantLayoutR4(
 
   HloComputation::Builder builder(TestName());
 
-  std::unique_ptr<Literal> literal = LiteralUtil::CreateR4FromArray4D(a);
+  std::unique_ptr<Literal> literal = Literal::CreateR4FromArray4D(a);
 
   HloInstruction* constant = builder.AddInstruction(
       HloInstruction::CreateConstant(std::move(literal)));
@@ -247,7 +241,7 @@ using CopyOpClientTest = ClientLibraryTestBase;
 XLA_TEST_F(CopyOpClientTest, Copy0x0) {
   Shape in_shape = ShapeUtil::MakeShapeWithLayout(F32, {0, 0}, {0, 1});
   Shape out_shape = ShapeUtil::MakeShapeWithLayout(F32, {0, 0}, {1, 0});
-  auto empty = LiteralUtil::CreateFromShape(in_shape);
+  auto empty = Literal::CreateFromShape(in_shape);
 
   ComputationBuilder builder(client_, TestName());
   auto param0 = builder.Parameter(0, in_shape, "input");
@@ -263,7 +257,6 @@ XLA_TEST_F(CopyOpClientTest, Copy0x0) {
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index 32232acf6e34517587b80d5091dbb9d603223184..73772fdec02fc95cb6c8e0685037515183478e85 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
@@ -68,7 +67,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR0F32Add2)) {
   auto builder = HloComputation::Builder(TestName());
 
   auto constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
   builder.AddInstruction(
       HloInstruction::CreateCustomCall(r0f32_, {constant}, "R0F32Add2"));
 
@@ -89,7 +88,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR2F32Reduce)) {
   array(1, 1) = 4.0f;
 
   auto constant = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR2FromArray2D(array)));
+      HloInstruction::CreateConstant(Literal::CreateR2FromArray2D(array)));
   builder.AddInstruction(
       HloInstruction::CreateCustomCall(r0f32_, {constant}, "R2F32ReduceSum"));
 
@@ -105,7 +104,7 @@ XLA_TEST_F(CustomCallTest,
   auto b = HloComputation::Builder(TestName());
 
   auto input = b.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR2FromArray2D(
+      HloInstruction::CreateConstant(Literal::CreateR2FromArray2D(
           Array2D<float>{{1.0f, 2.0f}, {3.0f, 4.0f}})));
   auto incremented = b.AddInstruction(HloInstruction::CreateCustomCall(
       ShapeUtil::MakeShape(F32, {1, 2, 2}), {input}, "Add1ToValues"));
@@ -129,7 +128,6 @@ XLA_TEST_F(CustomCallTest,
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
diff --git a/tensorflow/compiler/xla/tests/deallocation_test.cc b/tensorflow/compiler/xla/tests/deallocation_test.cc
index 074753bf6f8f9e64626b9ed2015b94b58dfebc87..0c7c3a8ff6656b05041e672cca97b285a4420446 100644
--- a/tensorflow/compiler/xla/tests/deallocation_test.cc
+++ b/tensorflow/compiler/xla/tests/deallocation_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -42,7 +41,8 @@ class DeallocationTest : public ClientLibraryTestBase {
       tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
     Computation computation = builder->Build().ConsumeValueOrDie();
     auto global_data =
-        client_->Execute(computation, arguments).ConsumeValueOrDie();
+        client_->Execute(computation, arguments, &execution_options_)
+            .ConsumeValueOrDie();
     TF_CHECK_OK(client_->Transfer(*global_data).status());
     return global_data;
   }
@@ -143,7 +143,6 @@ XLA_TEST_F(DeallocationTest, DeallocateNestedTuple) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
index fcddffc1e1340028f11b67cbe14537a240120de7..c65f8c0f08bb8a096b020e73a35cdbb70e517b1f 100644
--- a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -48,7 +47,8 @@ class DeconstructTupleTest : public ClientLibraryTestBase {
       tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
     Computation computation = builder->Build().ConsumeValueOrDie();
     auto global_data =
-        client_->Execute(computation, arguments).ConsumeValueOrDie();
+        client_->Execute(computation, arguments, &execution_options_)
+            .ConsumeValueOrDie();
     TF_CHECK_OK(client_->Transfer(*global_data).status());
     return global_data;
   }
@@ -67,9 +67,9 @@ TEST_F(DeconstructTupleTest, DeconstructTuple) {
   // Try copying the elements back and comparing it
   auto handles = result_status.ConsumeValueOrDie();
   std::unique_ptr<Literal> literal;
-  TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[0]));
+  TF_ASSERT_OK_AND_ASSIGN(literal, client_->Transfer(*handles[0]));
   LiteralTestUtil::ExpectR1Equal<float>({1.0, 2.0, 3.0, 4.0}, *literal);
-  TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[1]));
+  TF_ASSERT_OK_AND_ASSIGN(literal, client_->Transfer(*handles[1]));
   LiteralTestUtil::ExpectR1Equal<float>({2.0, 4.0, 6.0, 8.0}, *literal);
 }
 
@@ -89,17 +89,17 @@ TEST_F(DeconstructTupleTest, DeconstructTupleTwice) {
   auto handles2 = result_status2.ConsumeValueOrDie();
 
   std::unique_ptr<Literal> literal;
-  TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles1[0]));
+  TF_ASSERT_OK_AND_ASSIGN(literal, client_->Transfer(*handles1[0]));
   LiteralTestUtil::ExpectR1Equal<float>({1.0, 2.0, 3.0, 4.0}, *literal);
-  TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles1[1]));
+  TF_ASSERT_OK_AND_ASSIGN(literal, client_->Transfer(*handles1[1]));
   LiteralTestUtil::ExpectR1Equal<float>({2.0, 4.0, 6.0, 8.0}, *literal);
 
   handles1[0].reset();
   handles1[1].reset();
 
-  TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles2[0]));
+  TF_ASSERT_OK_AND_ASSIGN(literal, client_->Transfer(*handles2[0]));
   LiteralTestUtil::ExpectR1Equal<float>({1.0, 2.0, 3.0, 4.0}, *literal);
-  TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles2[1]));
+  TF_ASSERT_OK_AND_ASSIGN(literal, client_->Transfer(*handles2[1]));
   LiteralTestUtil::ExpectR1Equal<float>({2.0, 4.0, 6.0, 8.0}, *literal);
 }
 
@@ -119,13 +119,13 @@ XLA_TEST_F(DeconstructTupleTest, DeconstructTupleRepeatedElement) {
   auto handles = result_status.ConsumeValueOrDie();
 
   std::unique_ptr<Literal> literal;
-  TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[0]));
+  TF_ASSERT_OK_AND_ASSIGN(literal, client_->Transfer(*handles[0]));
   LiteralTestUtil::ExpectR1Equal<float>({1.0, 2.0, 3.0, 4.0}, *literal);
-  TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[1]));
+  TF_ASSERT_OK_AND_ASSIGN(literal, client_->Transfer(*handles[1]));
   LiteralTestUtil::ExpectR1Equal<float>({2.0, 4.0, 6.0, 8.0}, *literal);
-  TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[2]));
+  TF_ASSERT_OK_AND_ASSIGN(literal, client_->Transfer(*handles[2]));
   LiteralTestUtil::ExpectR1Equal<float>({2.0, 4.0, 6.0, 8.0}, *literal);
-  TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[3]));
+  TF_ASSERT_OK_AND_ASSIGN(literal, client_->Transfer(*handles[3]));
   LiteralTestUtil::ExpectR1Equal<float>({1.0, 2.0, 3.0, 4.0}, *literal);
 }
 
@@ -145,17 +145,17 @@ TEST_F(DeconstructTupleTest, DeconstructTupleThenDeallocate) {
   global_data.reset();
 
   std::unique_ptr<Literal> literal;
-  TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[0]));
+  TF_ASSERT_OK_AND_ASSIGN(literal, client_->Transfer(*handles[0]));
   LiteralTestUtil::ExpectR1Equal<float>({1.0, 2.0, 3.0, 4.0}, *literal);
-  TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[1]));
+  TF_ASSERT_OK_AND_ASSIGN(literal, client_->Transfer(*handles[1]));
   LiteralTestUtil::ExpectR1Equal<float>({2.0, 4.0, 6.0, 8.0}, *literal);
-  TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[2]));
+  TF_ASSERT_OK_AND_ASSIGN(literal, client_->Transfer(*handles[2]));
   LiteralTestUtil::ExpectR1Equal<float>({1.0, 2.0, 3.0, 4.0}, *literal);
 
   /// Try deallocating one of the repeated elements, then copy
   handles[0].reset();
 
-  TF_ASSIGN_OR_ASSERT_OK(literal, client_->Transfer(*handles[2]));
+  TF_ASSERT_OK_AND_ASSIGN(literal, client_->Transfer(*handles[2]));
   LiteralTestUtil::ExpectR1Equal<float>({1.0, 2.0, 3.0, 4.0}, *literal);
 }
 
@@ -173,7 +173,7 @@ TEST_F(DeconstructTupleTest, DeconstructNonTuple) {
 XLA_TEST_F(DeconstructTupleTest, DeconstructTupleFromParam) {
   ComputationBuilder builder(client_, TestName());
   std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR1<float>({3.14f, -100.25f});
+      Literal::CreateR1<float>({3.14f, -100.25f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
   auto p = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2}), "param0");
@@ -205,7 +205,6 @@ XLA_TEST_F(DeconstructTupleTest, DeconstructNestedTuple) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/deep_graph_test.cc b/tensorflow/compiler/xla/tests/deep_graph_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..60953a7421d410722b499625b4ce4b9ca90aa874
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/deep_graph_test.cc
@@ -0,0 +1,56 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+
+namespace xla {
+namespace {
+TEST_F(ClientLibraryTestBase, DeepGraph) {
+  // TODO(b/62624812): To trigger the stack overflow this test is
+  // intended to track, we need to set kDepth to 20000.
+  // Unfortunately, setting it that high causes the test to time out.
+  const int kDepth = 200;
+  ComputationBuilder b(client_, TestName());
+  ComputationDataHandle x;
+  ComputationDataHandle y;
+  auto x_data = CreateR0Parameter<int32>(3, 0, "x", &b, &x);
+  auto y_data = CreateR0Parameter<int32>(1, 1, "y", &b, &y);
+  ComputationDataHandle z = x;
+  for (int i = 0; i < kDepth; ++i) {
+    z = b.Add(z, y);
+  }
+  ComputeAndCompareR0<int32>(&b, /*expected=*/kDepth + 3,
+                             {x_data.get(), y_data.get()});
+}
+}  // namespace
+}  // namespace xla
+
+int main(int argc, char** argv) {
+  std::vector<tensorflow::Flag> flag_list;
+  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
+  xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    LOG(ERROR) << "\n" << usage;
+    return 2;
+  }
+  testing::InitGoogleTest(&argc, argv);
+  if (argc > 1) {
+    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
+    return 2;
+  }
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 754eec1b1edc286b98d02f70c8e5661523bd85de..59ee0073388fe824ee9bc92819c9d10eca624473 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -20,10 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_runtime_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/layout_util_flags.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -186,14 +183,14 @@ void DotOperationTest::TestMatrixDot(int M, int K, int N, bool lhs_row_major,
                                      bool rhs_row_major) {
   std::unique_ptr<Array2D<float>> lhs_data =
       MakeLinspaceArray2D(0.0, 1.0, M, K);
-  std::unique_ptr<Literal> lhs_lit = LiteralUtil::CreateR2FromArray2DWithLayout(
+  std::unique_ptr<Literal> lhs_lit = Literal::CreateR2FromArray2DWithLayout(
       *lhs_data,
       LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(lhs_row_major)));
   auto lhs_handle = client_->TransferToServer(*lhs_lit).ConsumeValueOrDie();
 
   std::unique_ptr<Array2D<float>> rhs_data =
       MakeLinspaceArray2D(0.0, 1.0, K, N);
-  std::unique_ptr<Literal> rhs_lit = LiteralUtil::CreateR2FromArray2DWithLayout(
+  std::unique_ptr<Literal> rhs_lit = Literal::CreateR2FromArray2DWithLayout(
       *rhs_data,
       LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(rhs_row_major)));
   auto rhs_handle = client_->TransferToServer(*rhs_lit).ConsumeValueOrDie();
@@ -380,12 +377,12 @@ XLA_TEST_F(DotOperationTest, BatchMatMul) {
   builder.Reshape(out_flat, {0, 1, 2}, {2, 2, 2, 2});
 
   auto x_data = client_
-                    ->TransferToServer(*LiteralUtil::CreateR4<float>(
+                    ->TransferToServer(*Literal::CreateR4<float>(
                         {{{{1000, 100}, {10, 1}}, {{2000, 200}, {20, 2}}},
                          {{{3000, 300}, {30, 3}}, {{4000, 400}, {40, 4}}}}))
                     .ConsumeValueOrDie();
   auto y_data = client_
-                    ->TransferToServer(*LiteralUtil::CreateR4<float>(
+                    ->TransferToServer(*Literal::CreateR4<float>(
                         {{{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}},
                          {{{11, 22}, {33, 44}}, {{55, 66}, {77, 88}}}}))
                     .ConsumeValueOrDie();
@@ -416,14 +413,14 @@ TEST_F(DotOperationTest, TransposeFolding) {
         auto lhs_handle =
             client_
                 ->TransferToServer(
-                    *LiteralUtil::CreateR2FromArray2DWithLayout<float>(
+                    *Literal::CreateR2FromArray2DWithLayout<float>(
                         *lhs, LayoutUtil::MakeLayout(
                                   MinorToMajorForIsRowMajor(row_major))))
                 .ConsumeValueOrDie();
         auto rhs_handle =
             client_
                 ->TransferToServer(
-                    *LiteralUtil::CreateR2FromArray2DWithLayout<float>(
+                    *Literal::CreateR2FromArray2DWithLayout<float>(
                         *rhs, LayoutUtil::MakeLayout(
                                   MinorToMajorForIsRowMajor(row_major))))
                 .ConsumeValueOrDie();
@@ -460,10 +457,7 @@ TEST_F(DotOperationTest, TransposeFolding) {
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendLayoutUtilFlags(&flag_list);
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuRuntimeFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index b7bb1792f3b9b96fea5f446c787eb55e2577b01b..9e85e357070c8c7a32bdc8b16b139ceb848114d9 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -45,295 +44,310 @@ namespace {
 
 class DynamicSliceTest : public ClientLibraryTestBase {
  protected:
-  template <typename IndexT>
+  template <typename IndexT, typename DataT>
   void TestR1() {
     // Slice at dimension start.
-    RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {0}, {5},
-                  {0.0, 1.0, 2.0, 3.0, 4.0});
+    RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {0}, {5}, {0, 1, 2, 3, 4});
     // Slice in the middle.
-    RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {2}, {3},
-                  {2.0, 3.0, 4.0});
+    RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {2}, {3}, {2, 3, 4});
     // Slice at dimension boundaries.
-    RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {5}, {3},
-                  {5.0, 6.0, 7.0});
+    RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {5}, {3}, {5, 6, 7});
     // Slice at dimension boundaries, but with sizes that cause indices to wrap.
-    RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {6}, {4},
-                  {6.0, 7.0, 0.0, 1.0});
+    RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {6}, {4}, {6, 7, 0, 1});
     // Zero element slice.
-    RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {2}, {0}, {});
+    RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {2}, {0}, {});
   }
 
-  template <typename IndexT>
+  template <typename IndexT, typename DataT>
   void TestR2() {
     // Slice at dimension start.
-    RunR2<IndexT>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
-                  {0, 0}, {2, 2}, {{1.0f, 2.0f}, {4.0f, 5.0f}});
+    RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {0, 0}, {2, 2},
+                         {{1, 2}, {4, 5}});
     // Slice in the middle.
-    RunR2<IndexT>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
-                  {1, 1}, {2, 1}, {{5.0f}, {8.0f}});
+    RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {1, 1}, {2, 1},
+                         {{5}, {8}});
     // Slice at dimension boundaries.
-    RunR2<IndexT>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
-                  {1, 1}, {2, 1}, {{5.0f}, {8.0f}});
+    RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {1, 1}, {2, 1},
+                         {{5}, {8}});
     // Slice at dimension boundaries, but with sizes that cause indices to wrap.
-    RunR2<IndexT>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
-                  {1, 1}, {3, 3},
-                  {{5.0f, 6.0f, 4.0f}, {8.0f, 9.0f, 7.0f}, {2.0f, 3.0f, 1.0f}});
+    RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {1, 1}, {3, 3},
+                         {{5, 6, 4}, {8, 9, 7}, {2, 3, 1}});
     // Zero element slice: 2x0.
-    RunR2<IndexT>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
-                  {0, 0}, {2, 0}, {{}, {}});
+    RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {0, 0}, {2, 0},
+                         {{}, {}});
     // Zero element slice: 0x2.
-    RunR2<IndexT>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
-                  {0, 0}, {0, 2}, Array2D<float>(0, 2));
+    RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {0, 0}, {0, 2},
+                         Array2D<DataT>(0, 2));
   }
 
-  template <typename IndexT>
+  template <typename IndexT, typename DataT>
   void TestR3() {
     // R3 Shape: [2, 3, 2]
     // clang-format off
 
     // Slice at dimension start.
-    RunR3<IndexT>(
-      {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}},
-       {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}},
-        {0, 0, 0}, {2, 1, 2},
-      {{{1.0f, 2.0f}}, {{7.0f, 8.0f}}});
+    RunR3<IndexT, DataT>(
+      {{{1, 2}, {3, 4}, {5, 6}},
+       {{7, 8}, {9, 10}, {11, 12}}},
+      {0, 0, 0}, {2, 1, 2},
+      {{{1, 2}}, {{7, 8}}});
 
     // Slice in the middle.
-    RunR3<IndexT>(
-      {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}},
-       {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}},
-        {0, 1, 1}, {2, 2, 1},
-      {{{4.0f}, {6.0f}}, {{10.0f}, {12.0f}}});
+    RunR3<IndexT, DataT>(
+      {{{1, 2}, {3, 4}, {5, 6}},
+       {{7, 8}, {9, 10}, {11, 12}}},
+      {0, 1, 1}, {2, 2, 1},
+      {{{4}, {6}}, {{10}, {12}}});
 
     // Slice at dimension boundaries, but with sizes that cause indices to wrap.
-    RunR3<IndexT>(
-      {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}},
-       {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}},
-        {0, 2, 1}, {2, 2, 1},
-      {{{6.0f}, {2.0f}}, {{12.0f}, {8.0f}}});
+    RunR3<IndexT, DataT>(
+      {{{1, 2}, {3, 4}, {5, 6}},
+       {{7, 8}, {9, 10}, {11, 12}}},
+      {0, 2, 1}, {2, 1, 2},
+      {{{6, 5}}, {{12, 11}}});
 
     // clang-format on
   }
 
-  template <typename IndexT>
-  void RunR1(const std::vector<float>& input_values,
+  template <typename IndexT, typename DataT>
+  void RunR1(tensorflow::gtl::ArraySlice<DataT> input_values,
              const std::vector<IndexT> slice_starts,
              const std::vector<int64>& slice_sizes,
-             const std::vector<float>& expected_values) {
+             tensorflow::gtl::ArraySlice<DataT> expected_values) {
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
     ComputationDataHandle starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
-    auto input = builder.ConstantR1<float>(input_values);
+    auto input = builder.ConstantR1<DataT>(input_values);
     builder.DynamicSlice(input, starts, slice_sizes);
     // Run computation and compare against expected values.
-    ComputeAndCompareR1<float>(&builder, expected_values, {start_data.get()},
-                               ErrorSpec(0.000001));
+    ComputeAndCompareR1<DataT>(&builder, expected_values, {start_data.get()});
   }
 
-  template <typename IndexT>
-  void RunR2(const Array2D<float>& input_values,
+  template <typename IndexT, typename DataT>
+  void RunR2(const Array2D<DataT>& input_values,
              const std::vector<IndexT> slice_starts,
              const std::vector<int64>& slice_sizes,
-             const Array2D<float>& expected_values) {
+             const Array2D<DataT>& expected_values) {
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
     ComputationDataHandle starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
-    auto input = builder.ConstantR2FromArray2D<float>(input_values);
+    auto input = builder.ConstantR2FromArray2D<DataT>(input_values);
     builder.DynamicSlice(input, starts, slice_sizes);
     // Run computation and compare against expected values.
-    ComputeAndCompareR2<float>(&builder, expected_values, {start_data.get()},
-                               ErrorSpec(0.000001));
+    ComputeAndCompareR2<DataT>(&builder, expected_values, {start_data.get()});
   }
 
-  template <typename IndexT>
-  void RunR3(const Array3D<float>& input_values,
+  template <typename IndexT, typename DataT>
+  void RunR3(const Array3D<DataT>& input_values,
              const std::vector<IndexT> slice_starts,
              const std::vector<int64>& slice_sizes,
-             const Array3D<float>& expected_values) {
+             const Array3D<DataT>& expected_values) {
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
     ComputationDataHandle starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
-    auto input = builder.ConstantR3FromArray3D<float>(input_values);
+    auto input = builder.ConstantR3FromArray3D<DataT>(input_values);
     builder.DynamicSlice(input, starts, slice_sizes);
     // Run computation and compare against expected values.
-    ComputeAndCompareR3<float>(&builder, expected_values, {start_data.get()},
-                               ErrorSpec(0.000001));
+    ComputeAndCompareR3<DataT>(&builder, expected_values, {start_data.get()});
   }
 };
 
-XLA_TEST_F(DynamicSliceTest, Int32R1) { TestR1<int32>(); }
+XLA_TEST_F(DynamicSliceTest, Int32R1) { TestR1<int32, int32>(); }
+
+XLA_TEST_F(DynamicSliceTest, Int64R1) { TestR1<int64, float>(); }
+
+XLA_TEST_F(DynamicSliceTest, UInt64R1) { TestR1<uint64, double>(); }
+
+XLA_TEST_F(DynamicSliceTest, Int32R2) { TestR2<int32, float>(); }
 
-XLA_TEST_F(DynamicSliceTest, Int64R1) { TestR1<int64>(); }
+XLA_TEST_F(DynamicSliceTest, Int64R2) { TestR2<int64, double>(); }
 
-XLA_TEST_F(DynamicSliceTest, UInt64R1) { TestR1<uint64>(); }
+XLA_TEST_F(DynamicSliceTest, UInt64R2) { TestR2<uint64, int32>(); }
 
-XLA_TEST_F(DynamicSliceTest, Int32R2) { TestR2<int32>(); }
+XLA_TEST_F(DynamicSliceTest, Int32R3) { TestR3<int32, int32>(); }
 
-XLA_TEST_F(DynamicSliceTest, Int64R2) { TestR2<int64>(); }
+XLA_TEST_F(DynamicSliceTest, Int64R3) { TestR3<int64, float>(); }
 
-XLA_TEST_F(DynamicSliceTest, UInt64R2) { TestR2<uint64>(); }
+XLA_TEST_F(DynamicSliceTest, UInt64R3) { TestR3<uint64, double>(); }
 
-XLA_TEST_F(DynamicSliceTest, Int32R3) { TestR3<int32>(); }
+XLA_TEST_F(DynamicSliceTest, Int32R1Pred) {
+  // Slice at dimension start.
+  RunR1<int32, bool>({true, false, false, true, false, true, true, false}, {0},
+                     {5}, {true, false, false, true, false});
+  // Slice in the middle.
+  RunR1<int32, bool>({true, false, false, true, false, true, true, false}, {2},
+                     {3}, {false, true, false});
+  // Slice at dimension boundaries.
+  RunR1<int32, bool>({true, false, false, true, false, true, true, false}, {5},
+                     {3}, {true, true, false});
+  // Zero element slice.
+  RunR1<int32, bool>({true, false, false, true, false, true, true, false}, {2},
+                     {0}, {});
+}
 
-XLA_TEST_F(DynamicSliceTest, Int64R3) { TestR3<int64>(); }
+XLA_TEST_F(DynamicSliceTest, Int32R2Pred) {
+  // Slice at dimension start.
+  RunR2<int32, bool>(
+      {{true, false, true}, {false, false, true}, {true, true, false}}, {0, 0},
+      {2, 2}, {{true, false}, {false, false}});
+  // Slice in the middle.
+  RunR2<int32, bool>(
+      {{true, false, true}, {false, false, true}, {true, true, false}}, {1, 1},
+      {2, 1}, {{false}, {true}});
+  // Slice at dimension boundaries.
+  RunR2<int32, bool>(
+      {{true, false, true}, {false, false, true}, {true, true, false}}, {1, 1},
+      {2, 1}, {{false}, {true}});
+  // Zero element slice: 2x0.
+  RunR2<int32, bool>(
+      {{true, false, true}, {false, false, true}, {true, true, false}}, {0, 0},
+      {2, 0}, {{}, {}});
+  // Zero element slice: 0x2.
+  RunR2<int32, bool>(
+      {{true, false, true}, {false, false, true}, {true, true, false}}, {0, 0},
+      {0, 2}, Array2D<bool>(0, 2));
+}
 
-XLA_TEST_F(DynamicSliceTest, UInt64R3) { TestR3<uint64>(); }
+XLA_TEST_F(DynamicSliceTest, Int32R3Pred) {
+  // R3 Shape: [2, 3, 2]
+  // clang-format off
+
+  // Slice at dimension start.
+  RunR3<int32, bool>(
+    {{{true, false}, {false, true}, {true, true}},
+     {{false, true}, {true, false}, {false, false}}},
+    {0, 0, 0}, {2, 1, 2},
+    {{{true, false}}, {{false, true}}});
+
+  // Slice in the middle.
+  RunR3<int32, bool>(
+    {{{true, false}, {false, true}, {true, true}},
+     {{false, true}, {true, false}, {false, false}}},
+    {0, 1, 1}, {2, 2, 1},
+    {{{true}, {true}}, {{false}, {false}}});
+
+  // clang-format on
+}
 
 class DynamicUpdateSliceTest : public ClientLibraryTestBase {
  protected:
-  template <typename IndexT>
+  template <typename IndexT, typename DataT>
   void TestR1() {
-    // clang-format off
     // Slice at dimension start.
-    RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0},
-                  {8.0, 9.0, 10.0}, {0},
-                  {8.0, 9.0, 10.0, 3.0, 4.0, 5.0, 6.0, 7.0});
+    RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10}, {0},
+                         {8, 9, 10, 3, 4, 5, 6, 7});
     // Slice in the middle.
-    RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0},
-                  {8.0, 9.0, 10.0}, {2},
-                  {0.0, 1.0, 8.0, 9.0, 10.0, 5.0, 6.0, 7.0});
+    RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10}, {2},
+                         {0, 1, 8, 9, 10, 5, 6, 7});
     // Slice at dimension boundaries.
-    RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0},
-                  {8.0, 9.0, 10.0}, {5},
-                  {0.0, 1.0, 2.0, 3.0, 4.0, 8.0, 9.0, 10.0});
+    RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10}, {5},
+                         {0, 1, 2, 3, 4, 8, 9, 10});
     // Slice at dimension boundaries, but with sizes that cause indices to wrap.
-    RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0},
-                  {8.0, 9.0, 10.0}, {6},
-                  {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 8.0, 9.0});
+    RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10}, {6},
+                         {0, 1, 2, 3, 4, 5, 8, 9});
     // Zero-sized update.
-    RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0},
-                  {}, {2},
-                  {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0});
-    // clang-format on
+    RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {}, {2},
+                         {0, 1, 2, 3, 4, 5, 6, 7});
   }
 
-  template <typename IndexT>
+  template <typename IndexT, typename DataT>
   void TestR2() {
-    // clang-format off
     // Slice at dimension start.
-    RunR2<IndexT>(
-        {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
-        {{10.0f, 11.0f}}, {0, 0},
-        {{10.0f, 11.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}});
+    RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{10, 11}}, {0, 0},
+                         {{10, 11, 3}, {4, 5, 6}, {7, 8, 9}});
     // Slice in the middle.
-    RunR2<IndexT>(
-        {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
-        {{10.0f, 11.0f}}, {1, 1},
-        {{1.0f, 2.0f, 3.0f}, {4.0f, 10.0f, 11.0f}, {7.0f, 8.0f, 9.0f}});
+    RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{10, 11}}, {1, 1},
+                         {{1, 2, 3}, {4, 10, 11}, {7, 8, 9}});
     // Slice at dimension boundaries.
-    RunR2<IndexT>(
-        {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
-        {{10.0f, 11.0f}}, {2, 1},
-        {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 10.0f, 11.0f}});
+    RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{10, 11}}, {2, 1},
+                         {{1, 2, 3}, {4, 5, 6}, {7, 10, 11}});
     // Slice at dimension boundaries, but with sizes that cause indices to wrap.
-    RunR2<IndexT>(
-        {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
-        {{10.0f, 11.0f}}, {2, 2},
-        {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 10.0f}});
+    RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{10, 11}}, {2, 2},
+                         {{1, 2, 3}, {4, 5, 6}, {7, 8, 10}});
     // Zero-sized update.
-    RunR2<IndexT>(
-        {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
-        {{}}, {2, 1},
-        {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}});
-    // clang-format on
+    RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{}}, {2, 1},
+                         {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   }
 
-  template <typename IndexT>
+  template <typename IndexT, typename DataT>
   void TestR3() {
     // R3 Shape: [2, 3, 2]
-    // clang-format off
     // Slice at dimension start.
-    RunR3<IndexT>(
-      {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}},
-       {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}},
-      {{{13.0f, 14.0f}, {15.0f, 16.0f}},
-       {{17.0f, 18.0f}, {19.0f, 20.0f}}},
-        {0, 0, 0},
-      {{{13.0f, 14.0f}, {15.0f, 16.0f}, {5.0f, 6.0f}},
-       {{17.0f, 18.0f}, {19.0f, 20.0f}, {11.0f, 12.0f}}});
+    RunR3<IndexT, DataT>(
+        {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}},
+        {{{13, 14}, {15, 16}}, {{17, 18}, {19, 20}}}, {0, 0, 0},
+        {{{13, 14}, {15, 16}, {5, 6}}, {{17, 18}, {19, 20}, {11, 12}}});
     // Slice in the middle.
-    RunR3<IndexT>(
-      {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}},
-       {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}},
-      {{{13.0f}, {15.0f}}},
-        {1, 1, 1},
-      {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}},
-        {{7.0f, 8.0f}, {9.0f, 13.0f}, {11.0f, 15.0f}}});
+    RunR3<IndexT, DataT>(
+        {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, {{{13}, {15}}},
+        {1, 1, 1}, {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 13}, {11, 15}}});
     // Slice at dimension boundaries, but with sizes that cause indices to wrap.
-    RunR3<IndexT>(
-      {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}},
-       {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}},
-      {{{13.0f}, {15.0f}}},
-        {1, 2, 1},
-      {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}},
-        {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 13.0f}}});
-    // clang-format on
+    RunR3<IndexT, DataT>(
+        {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, {{{13}, {15}}},
+        {1, 2, 1}, {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 13}}});
   }
 
-  template <typename IndexT>
-  void RunR1(const std::vector<float>& input_values,
-             const std::vector<float>& update_values,
+  template <typename IndexT, typename DataT>
+  void RunR1(tensorflow::gtl::ArraySlice<DataT> input_values,
+             tensorflow::gtl::ArraySlice<DataT> update_values,
              const std::vector<IndexT> slice_starts,
-             const std::vector<float>& expected_values) {
+             tensorflow::gtl::ArraySlice<DataT> expected_values) {
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
     ComputationDataHandle starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
-    auto input = builder.ConstantR1<float>(input_values);
-    auto update = builder.ConstantR1<float>(update_values);
+    auto input = builder.ConstantR1<DataT>(input_values);
+    auto update = builder.ConstantR1<DataT>(update_values);
     builder.DynamicUpdateSlice(input, update, starts);
     // Run computation and compare against expected values.
-    ComputeAndCompareR1<float>(&builder, expected_values, {start_data.get()},
-                               ErrorSpec(0.000001));
+    ComputeAndCompareR1<DataT>(&builder, expected_values, {start_data.get()});
   }
 
-  template <typename IndexT>
-  void RunR2(const Array2D<float>& input_values,
-             const Array2D<float>& update_values,
+  template <typename IndexT, typename DataT>
+  void RunR2(const Array2D<DataT>& input_values,
+             const Array2D<DataT>& update_values,
              const std::vector<IndexT> slice_starts,
-             const Array2D<float>& expected_values) {
+             const Array2D<DataT>& expected_values) {
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
     ComputationDataHandle starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
-    auto input = builder.ConstantR2FromArray2D<float>(input_values);
-    auto update = builder.ConstantR2FromArray2D<float>(update_values);
+    auto input = builder.ConstantR2FromArray2D<DataT>(input_values);
+    auto update = builder.ConstantR2FromArray2D<DataT>(update_values);
     builder.DynamicUpdateSlice(input, update, starts);
     // Run computation and compare against expected values.
-    ComputeAndCompareR2<float>(&builder, expected_values, {start_data.get()},
-                               ErrorSpec(0.000001));
+    ComputeAndCompareR2<DataT>(&builder, expected_values, {start_data.get()});
   }
 
-  template <typename IndexT>
-  void RunR3(const Array3D<float>& input_values,
-             const Array3D<float>& update_values,
+  template <typename IndexT, typename DataT>
+  void RunR3(const Array3D<DataT>& input_values,
+             const Array3D<DataT>& update_values,
              const std::vector<IndexT> slice_starts,
-             const Array3D<float>& expected_values) {
+             const Array3D<DataT>& expected_values) {
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
     ComputationDataHandle starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
-    auto input = builder.ConstantR3FromArray3D<float>(input_values);
-    auto update = builder.ConstantR3FromArray3D<float>(update_values);
+    auto input = builder.ConstantR3FromArray3D<DataT>(input_values);
+    auto update = builder.ConstantR3FromArray3D<DataT>(update_values);
     builder.DynamicUpdateSlice(input, update, starts);
     // Run computation and compare against expected values.
-    ComputeAndCompareR3<float>(&builder, expected_values, {start_data.get()},
-                               ErrorSpec(0.000001));
+    ComputeAndCompareR3<DataT>(&builder, expected_values, {start_data.get()});
   }
 
   void RunR3Contiguous(std::vector<int32> operand_shape, int32 index,
@@ -389,28 +403,86 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
   template <typename NativeT>
   void DumpArray(const string& name, const Array3D<NativeT> values) {
     std::unique_ptr<Literal> literal =
-        LiteralUtil::CreateR3FromArray3D<NativeT>(values);
-    LOG(INFO) << name << ":" << LiteralUtil::ToString(*literal);
+        Literal::CreateR3FromArray3D<NativeT>(values);
+    LOG(INFO) << name << ":" << literal->ToString();
   }
 };
 
-XLA_TEST_F(DynamicUpdateSliceTest, Int32R1) { TestR1<int32>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R1) { TestR1<int32, float>(); }
 
-XLA_TEST_F(DynamicUpdateSliceTest, Int64R1) { TestR1<int64>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, Int64R1) { TestR1<int64, float>(); }
 
-XLA_TEST_F(DynamicUpdateSliceTest, UInt64R1) { TestR1<uint64>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, UInt64R1) { TestR1<uint64, double>(); }
 
-XLA_TEST_F(DynamicUpdateSliceTest, Int32R2) { TestR2<int32>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R2) { TestR2<int32, float>(); }
 
-XLA_TEST_F(DynamicUpdateSliceTest, Int64R2) { TestR2<int64>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, Int64R2) { TestR2<int64, int64>(); }
 
-XLA_TEST_F(DynamicUpdateSliceTest, UInt64R2) { TestR2<uint64>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, UInt64R2) { TestR2<uint64, int32>(); }
 
-XLA_TEST_F(DynamicUpdateSliceTest, Int32R3) { TestR3<int32>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R3) { TestR3<int32, float>(); }
 
-XLA_TEST_F(DynamicUpdateSliceTest, Int64R3) { TestR3<int64>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, Int64R3) { TestR3<int64, int64>(); }
 
-XLA_TEST_F(DynamicUpdateSliceTest, UInt64R3) { TestR3<uint64>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, UInt64R3) { TestR3<uint64, uint64>(); }
+
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R1Pred) {
+  // Slice at dimension start.
+  RunR1<int32, bool>({false, false, true, true, false, true, true, false},
+                     {true, true, false}, {0},
+                     {true, true, false, true, false, true, true, false});
+  // Slice in the middle.
+  RunR1<int32, bool>({false, false, true, true, false, true, true, false},
+                     {false, true, true}, {2},
+                     {false, false, false, true, true, true, true, false});
+  // Slice at dimension boundaries.
+  RunR1<int32, bool>({false, false, true, true, false, true, true, false},
+                     {false, true, true}, {5},
+                     {false, false, true, true, false, false, true, true});
+  // Zero-sized update.
+  RunR1<int32, bool>({false, false, true, true, false, true, true, false}, {},
+                     {2}, {false, false, true, true, false, true, true, false});
+}
+
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R2Pred) {
+  // Slice at dimension start.
+  RunR2<int32, bool>(
+      {{false, true, false}, {true, false, true}, {false, true, true}},
+      {{true, false}}, {0, 0},
+      {{true, false, false}, {true, false, true}, {false, true, true}});
+  // Slice in the middle.
+  RunR2<int32, bool>(
+      {{false, true, false}, {true, false, true}, {false, true, true}},
+      {{true, false}}, {1, 1},
+      {{false, true, false}, {true, true, false}, {false, true, true}});
+  // Slice at dimension boundaries.
+  RunR2<int32, bool>(
+      {{false, true, false}, {true, false, true}, {false, true, true}},
+      {{true, false}}, {2, 1},
+      {{false, true, false}, {true, false, true}, {false, true, false}});
+  // Zero-sized update.
+  RunR2<int32, bool>(
+      {{false, true, false}, {true, false, true}, {false, true, true}}, {{}},
+      {2, 1}, {{false, true, false}, {true, false, true}, {false, true, true}});
+}
+
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R3Pred) {
+  // R3 Shape: [2, 3, 2]
+  // Slice at dimension start.
+  RunR3<int32, bool>(
+      {{{true, false}, {false, true}, {true, true}},
+       {{false, false}, {false, true}, {true, false}}},
+      {{{false, true}, {true, false}}, {{true, true}, {false, true}}},
+      {0, 0, 0},
+      {{{false, true}, {true, false}, {true, true}},
+       {{true, true}, {false, true}, {true, false}}});
+  // Slice in the middle.
+  RunR3<int32, bool>({{{true, false}, {false, true}, {true, true}},
+                      {{false, false}, {false, true}, {true, false}}},
+                     {{{false}, {true}}}, {1, 1, 1},
+                     {{{true, false}, {false, true}, {true, true}},
+                      {{false, false}, {false, false}, {true, true}}});
+}
 
 // Tests for simple R3 case where the update is contiguous (i.e. the minor
 // two dimensions are not sliced).
@@ -470,7 +542,7 @@ void BM_DynamicSlice(int num_iters) {
   ComputationBuilder builder(client, "DynamicSlice");
 
   // Create input as a constant: shape [1, 2, 3, 4]
-  auto input_literal = LiteralUtil::CreateR4(
+  auto input_literal = Literal::CreateR4(
       {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
   auto input = builder.ConstantLiteral(*input_literal);
@@ -488,7 +560,7 @@ void BM_DynamicSlice(int num_iters) {
                                                            &allocator, 0)
                     .ConsumeValueOrDie();
 
-  auto start_indices_literal = LiteralUtil::CreateR1<int32>({0, 1, 2, 3});
+  auto start_indices_literal = Literal::CreateR1<int32>({0, 1, 2, 3});
   ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
       executors[device_ordinal], *start_indices_literal,
       buffer->mutable_buffer({})));
@@ -521,7 +593,6 @@ BENCHMARK(BM_DynamicSlice);
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/floor_ceil_test.cc b/tensorflow/compiler/xla/tests/floor_ceil_test.cc
index 80267e5459d2ab12e3530110c0def699b7695351..90c5aa65592302e076821aaaeaa701ae40c07a6c 100644
--- a/tensorflow/compiler/xla/tests/floor_ceil_test.cc
+++ b/tensorflow/compiler/xla/tests/floor_ceil_test.cc
@@ -18,7 +18,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -114,7 +113,6 @@ TEST_F(FloorCeilTest, R0Ceil) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/fmax_test.cc b/tensorflow/compiler/xla/tests/fmax_test.cc
index ee4e92505d9dd1f880473f1e76e5be3f01a1cfb3..9c86c65e5bb5b90072f79f5dee1923fa92b36e21 100644
--- a/tensorflow/compiler/xla/tests/fmax_test.cc
+++ b/tensorflow/compiler/xla/tests/fmax_test.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -47,7 +46,6 @@ TEST_F(FmaxSimpleTest, FmaxTenValues) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index fa36381267e80e3afe693a4d85152d2367956be3..df52f168a8764e2a14e47230cb2a9095d60ddc0f 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -20,7 +20,9 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
+#include "tensorflow/compiler/xla/client/client_library.h"
+#include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
@@ -29,7 +31,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -37,10 +41,13 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 
 using tensorflow::gtl::ArraySlice;
 
+namespace se = ::perftools::gputools;
+
 namespace xla {
 namespace {
 
@@ -81,7 +88,7 @@ class FusionTest : public HloTestBase {
     HloInstruction* hlos[4];
     for (int i = 0; i < Arity; ++i) {
       hlos[i + 1] = builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR2FromArray2D(operand_data[i])));
+          Literal::CreateR2FromArray2D(operand_data[i])));
     }
     auto answer_shape =
         ShapeUtil::MakeShape(prim_type, {test_width, test_height});
@@ -107,7 +114,7 @@ class FusionTest : public HloTestBase {
             ArraySlice<HloInstruction*>(hlos, 0, Arity + 1),
             HloInstruction::FusionKind::kLoop);
 
-    auto expected = LiteralUtil::CreateR2FromArray2D(answer_data);
+    auto expected = Literal::CreateR2FromArray2D(answer_data);
     auto actual = ExecuteAndTransfer(std::move(hlo_module), {});
     if (primitive_util::IsFloatingPointType(prim_type)) {
       LiteralTestUtil::ExpectNear(*expected, *actual, ErrorSpec(1e-4));
@@ -178,28 +185,27 @@ XLA_TEST_F(FusionTest, Test) {
   auto builder = HloComputation::Builder(TestName());
   auto hlo_module = CreateNewModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1.0}, {2.0}, {3.0}})));
+      Literal::CreateR2<float>({{1.0}, {2.0}, {3.0}})));
   auto const1 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{-1.0}, {-1.0}, {-1.0}})));
+      Literal::CreateR2<float>({{-1.0}, {-1.0}, {-1.0}})));
   auto add2 = builder.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeShape(F32, {3, 1}), HloOpcode::kAdd, const0, const1));
   auto reshape3 = builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(F32, {1, 3}), add2, {1, 0}));
   auto const4 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1.62, 2.72, 3.14}})));
+      Literal::CreateR2<float>({{1.62, 2.72, 3.14}})));
   auto concat5 = builder.AddInstruction(HloInstruction::CreateConcatenate(
       ShapeUtil::MakeShape(F32, {2, 3}), {reshape3, const4}, 0));
   auto const6 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1.0, 1.0, 1.0}, {0.0, 0.0, 0.0}})));
+      Literal::CreateR2<float>({{1.0, 1.0, 1.0}, {0.0, 0.0, 0.0}})));
   auto negate7 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(F32, {2, 3}), HloOpcode::kNegate, const6));
   auto add8 = builder.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeShape(F32, {2, 3}), HloOpcode::kAdd, concat5, negate7));
   auto const9 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{0.5, 0.5, 0.5}, {0.5, 0.5, 0.5}})));
-  auto const10 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR2<bool>(
-          {{true, false, true}, {false, true, false}})));
+      Literal::CreateR2<float>({{0.5, 0.5, 0.5}, {0.5, 0.5, 0.5}})));
+  auto const10 = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR2<bool>({{true, false, true}, {false, true, false}})));
   auto select11 = builder.AddInstruction(
       HloInstruction::CreateTernary(ShapeUtil::MakeShape(F32, {2, 3}),
                                     HloOpcode::kSelect, const10, add8, const9));
@@ -214,7 +220,7 @@ XLA_TEST_F(FusionTest, Test) {
            const4, reshape3, add2, const1, const0},
           HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectNear(*LiteralUtil::CreateR2<float>({{0.5}, {2.72}}),
+  LiteralTestUtil::ExpectNear(*Literal::CreateR2<float>({{0.5}, {2.72}}),
                               *ExecuteAndTransfer(std::move(hlo_module), {}),
                               ErrorSpec(1e-4));
 }
@@ -226,11 +232,11 @@ XLA_TEST_F(FusionTest, Parameter) {
   auto builder = HloComputation::Builder(TestName());
   auto hlo_module = CreateNewModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{1.0, 2.0, 3.0}})));
+      Literal::CreateR2<float>({{1.0, 2.0, 3.0}})));
   auto copy1 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(F32, {1, 3}), HloOpcode::kCopy, const0));
   auto const2 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{-2.0, -2.0, -2.0}})));
+      Literal::CreateR2<float>({{-2.0, -2.0, -2.0}})));
   // add3 = copy1 + const2 = const0 + const2 = {1,2,3} + {-2,-2,-2} = {-1,0,+1}
   auto add3 = builder.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeShape(F32, {1, 3}), HloOpcode::kAdd, copy1, const2));
@@ -240,7 +246,7 @@ XLA_TEST_F(FusionTest, Parameter) {
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{add3, const2},
                                 HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectNear(*LiteralUtil::CreateR2<float>({{-1.0, 0.0, 1.0}}),
+  LiteralTestUtil::ExpectNear(*Literal::CreateR2<float>({{-1.0, 0.0, 1.0}}),
                               *ExecuteAndTransfer(std::move(hlo_module), {}),
                               ErrorSpec(1e-4));
 }
@@ -249,9 +255,9 @@ XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) {
   auto builder = HloComputation::Builder(TestName());
   auto hlo_module = CreateNewModule();
   auto const_vector = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0})));
+      Literal::CreateR1<float>({1.0, 2.0, 3.0})));
   auto const_array = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<float>({{-1.0, -2.0, -4.0}, {10.0, 20.0, 30.0}})));
+      Literal::CreateR2<float>({{-1.0, -2.0, -4.0}, {10.0, 20.0, 30.0}})));
   auto broadcast = builder.AddInstruction(
       HloInstruction::CreateBroadcast(const_array->shape(), const_vector, {1}));
   // add2 = broadcast(const_vector) + const_array
@@ -265,7 +271,7 @@ XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) {
                                 HloInstruction::FusionKind::kLoop);
 
   LiteralTestUtil::ExpectNear(
-      *LiteralUtil::CreateR2<float>({{0.0, 0.0, -1.0}, {11.0, 22.0, 33.0}}),
+      *Literal::CreateR2<float>({{0.0, 0.0, -1.0}, {11.0, 22.0, 33.0}}),
       *ExecuteAndTransfer(std::move(hlo_module), {}), ErrorSpec(1e-4));
 }
 
@@ -273,13 +279,13 @@ XLA_TEST_F(FusionTest, ReshapeToScalar) {
   auto builder = HloComputation::Builder(TestName());
   auto hlo_module = CreateNewModule();
   auto single_element_array = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR2<int32>({{5}})));
+      HloInstruction::CreateConstant(Literal::CreateR2<int32>({{5}})));
   auto reshape = builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(S32, {}), single_element_array));
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape},
                                 HloInstruction::FusionKind::kLoop);
-  LiteralTestUtil::ExpectEqual(*LiteralUtil::CreateR0<int32>(5),
+  LiteralTestUtil::ExpectEqual(*Literal::CreateR0<int32>(5),
                                *ExecuteAndTransfer(std::move(hlo_module), {}));
 }
 
@@ -287,14 +293,14 @@ XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) {
   auto builder = HloComputation::Builder(TestName());
   auto hlo_module = CreateNewModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}})));
+      Literal::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}})));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(S32, {1, 2, 3}), const0));
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
                                 HloInstruction::FusionKind::kLoop);
   LiteralTestUtil::ExpectEqual(
-      *LiteralUtil::CreateR3<int32>({{{1, 2, 3}, {4, 5, 6}}}),
+      *Literal::CreateR3<int32>({{{1, 2, 3}, {4, 5, 6}}}),
       *ExecuteAndTransfer(std::move(hlo_module), {}));
 }
 
@@ -302,14 +308,14 @@ XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) {
   auto builder = HloComputation::Builder(TestName());
   auto hlo_module = CreateNewModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR3<int32>({{{1, 2, 3}, {4, 5, 6}}})));
+      Literal::CreateR3<int32>({{{1, 2, 3}, {4, 5, 6}}})));
   auto reshape1 = builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {3, 2}), const0));
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
                                 HloInstruction::FusionKind::kLoop);
   LiteralTestUtil::ExpectEqual(
-      *LiteralUtil::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}}),
+      *Literal::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}}),
       *ExecuteAndTransfer(std::move(hlo_module), {}));
 }
 
@@ -317,13 +323,13 @@ XLA_TEST_F(FusionTest, Reshape_1by1by1_) {
   auto builder = HloComputation::Builder(TestName());
   auto hlo_module = CreateNewModule();
   auto const0 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR3<int32>({{{7}}})));
+      HloInstruction::CreateConstant(Literal::CreateR3<int32>({{{7}}})));
   auto reshape1 = builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {}), const0));
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
                                 HloInstruction::FusionKind::kLoop);
-  LiteralTestUtil::ExpectEqual(*LiteralUtil::CreateR0<int32>(7),
+  LiteralTestUtil::ExpectEqual(*Literal::CreateR0<int32>(7),
                                *ExecuteAndTransfer(std::move(hlo_module), {}));
 }
 
@@ -331,13 +337,13 @@ XLA_TEST_F(FusionTest, Reshape__1by1by1) {
   auto builder = HloComputation::Builder(TestName());
   auto hlo_module = CreateNewModule();
   auto const0 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(7)));
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(7)));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(S32, {1, 1, 1}), const0));
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
                                 HloInstruction::FusionKind::kLoop);
-  LiteralTestUtil::ExpectEqual(*LiteralUtil::CreateR3<int32>({{{7}}}),
+  LiteralTestUtil::ExpectEqual(*Literal::CreateR3<int32>({{{7}}}),
                                *ExecuteAndTransfer(std::move(hlo_module), {}));
 }
 
@@ -345,13 +351,13 @@ XLA_TEST_F(FusionTest, Reshape__) {
   auto builder = HloComputation::Builder(TestName());
   auto hlo_module = CreateNewModule();
   auto const0 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(7)));
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(7)));
   auto reshape1 = builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {}), const0));
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
                                 HloInstruction::FusionKind::kLoop);
-  LiteralTestUtil::ExpectEqual(*LiteralUtil::CreateR0<int32>(7),
+  LiteralTestUtil::ExpectEqual(*Literal::CreateR0<int32>(7),
                                *ExecuteAndTransfer(std::move(hlo_module), {}));
 }
 
@@ -359,14 +365,14 @@ XLA_TEST_F(FusionTest, Reshape_3by3_3by3) {
   auto builder = HloComputation::Builder(TestName());
   auto hlo_module = CreateNewModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}})));
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}})));
   auto reshape1 = builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {3, 3}), const0));
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
                                 HloInstruction::FusionKind::kLoop);
   LiteralTestUtil::ExpectEqual(
-      *LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}),
+      *Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}),
       *ExecuteAndTransfer(std::move(hlo_module), {}));
 }
 
@@ -374,14 +380,14 @@ XLA_TEST_F(FusionTest, Transpose_2by3) {
   auto builder = HloComputation::Builder(TestName());
   auto hlo_module = CreateNewModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}})));
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}})));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(S32, {3, 2}), const0, {1, 0}));
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
                                 HloInstruction::FusionKind::kLoop);
   LiteralTestUtil::ExpectEqual(
-      *LiteralUtil::CreateR2<int32>({{1, 4}, {2, 5}, {3, 6}}),
+      *Literal::CreateR2<int32>({{1, 4}, {2, 5}, {3, 6}}),
       *ExecuteAndTransfer(std::move(hlo_module), {}));
 }
 
@@ -389,14 +395,14 @@ XLA_TEST_F(FusionTest, Transpose_3by3) {
   auto builder = HloComputation::Builder(TestName());
   auto hlo_module = CreateNewModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}})));
+      Literal::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}})));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(S32, {3, 3}), const0, {1, 0}));
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
                                 HloInstruction::FusionKind::kLoop);
   LiteralTestUtil::ExpectEqual(
-      *LiteralUtil::CreateR2<int32>({{1, 4, 7}, {2, 5, 8}, {3, 6, 9}}),
+      *Literal::CreateR2<int32>({{1, 4, 7}, {2, 5, 8}, {3, 6, 9}}),
       *ExecuteAndTransfer(std::move(hlo_module), {}));
 }
 
@@ -404,14 +410,14 @@ XLA_TEST_F(FusionTest, Reverse) {
   auto builder = HloComputation::Builder(TestName());
   auto hlo_module = CreateNewModule();
   auto const0 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({1, 2, 3})));
+      HloInstruction::CreateConstant(Literal::CreateR1<int32>({1, 2, 3})));
   auto reverse1 = builder.AddInstruction(HloInstruction::CreateReverse(
       ShapeUtil::MakeShape(S32, {3}), const0, {0}));
   hlo_module->AddEntryComputation(builder.Build())
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reverse1},
                                 HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectEqual(*LiteralUtil::CreateR1<int32>({3, 2, 1}),
+  LiteralTestUtil::ExpectEqual(*Literal::CreateR1<int32>({3, 2, 1}),
                                *ExecuteAndTransfer(std::move(hlo_module), {}));
 }
 
@@ -430,10 +436,10 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(Reduce)) {
   auto hlo_module = CreateNewModule();
 
   auto builder = HloComputation::Builder(TestName());
-  auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<int32>({1, 2, 4, 8})));
+  auto const0 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<int32>({1, 2, 4, 8})));
   auto const1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(0)));
   auto reduce2 = builder.AddInstruction(HloInstruction::CreateReduce(
       ShapeUtil::MakeShape(S32, {}), const0, const1, {0},
       hlo_module->AddEmbeddedComputation(MakeReduceTestComputation())));
@@ -441,7 +447,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(Reduce)) {
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{reduce2},
                                 HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectEqual(*LiteralUtil::CreateR0<int32>(15),
+  LiteralTestUtil::ExpectEqual(*Literal::CreateR0<int32>(15),
                                *ExecuteAndTransfer(std::move(hlo_module), {}));
 }
 
@@ -449,10 +455,10 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) {
   auto hlo_module = CreateNewModule();
 
   auto builder = HloComputation::Builder(TestName());
-  auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<int32>({1, 2, 4, 8})));
+  auto const0 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<int32>({1, 2, 4, 8})));
   auto const1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(0)));
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(0)));
   auto reduce2 = builder.AddInstruction(HloInstruction::CreateReduce(
       ShapeUtil::MakeShape(S32, {}), const0, const1, {0},
       hlo_module->AddEmbeddedComputation(MakeReduceTestComputation())));
@@ -462,7 +468,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) {
       ->CreateFusionInstruction(/*instructions_to_fuse=*/{negate3, reduce2},
                                 HloInstruction::FusionKind::kLoop);
 
-  LiteralTestUtil::ExpectEqual(*LiteralUtil::CreateR1<int32>({-15}),
+  LiteralTestUtil::ExpectEqual(*Literal::CreateR1<int32>({-15}),
                                *ExecuteAndTransfer(std::move(hlo_module), {}));
 }
 
@@ -470,9 +476,9 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) {
   auto builder = HloComputation::Builder(TestName());
   auto hlo_module = CreateNewModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR2<int32>({{2, 3, 5}, {7, 11, 13}, {17, 19, 23}})));
+      Literal::CreateR2<int32>({{2, 3, 5}, {7, 11, 13}, {17, 19, 23}})));
   auto const1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(1)));
   Window window;
   ASSERT_TRUE(
       tensorflow::protobuf::TextFormat::ParseFromString("dimensions:{\n"
@@ -512,10 +518,46 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) {
                                 HloInstruction::FusionKind::kLoop);
 
   LiteralTestUtil::ExpectEqual(
-      *LiteralUtil::CreateR2<int32>({{462, 2145}, {24871, 62491}}),
+      *Literal::CreateR2<int32>({{462, 2145}, {24871, 62491}}),
       *ExecuteAndTransfer(std::move(hlo_module), {}));
 }
 
+// When a constant (or other op) which has multiple users is imported
+// into a fusion, it should remain shared, rather than being duplicated
+// within the fusion.
+XLA_TEST_F(FusionTest, SharedConstant) {
+  auto hlo_module = CreateNewModule();
+
+  auto builder = HloComputation::Builder(TestName());
+  auto const0 = builder.AddInstruction(
+          HloInstruction::CreateConstant(Literal::CreateR1<int32>({0})));
+  auto const1 = builder.AddInstruction(
+          HloInstruction::CreateConstant(Literal::CreateR1<int32>({2})));
+  auto add1 = builder.AddInstruction(HloInstruction::CreateBinary(
+          ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, const0));
+  auto add2 = builder.AddInstruction(HloInstruction::CreateBinary(
+          ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add1));
+  auto add3 = builder.AddInstruction(HloInstruction::CreateBinary(
+          ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add2));
+  auto add4 = builder.AddInstruction(HloInstruction::CreateBinary(
+          ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add3));
+  hlo_module->AddEntryComputation(builder.Build())
+      ->CreateFusionInstruction(
+        {add4, add3, add2, add1, const1},
+        HloInstruction::FusionKind::kLoop);
+
+  HloComputation* entry_comp = hlo_module->entry_computation();
+
+  // entry computation contains the constant(0) and the fusion
+  EXPECT_EQ(entry_comp->instructions().size(), 2);
+
+  // fused instruction contains the constant(2), the parameter, and 4 adds
+  EXPECT_EQ(entry_comp->root_instruction()->fused_instructions().size(), 6);
+
+  LiteralTestUtil::ExpectEqual(*Literal::CreateR1<int32>({8}),
+          *ExecuteAndTransfer(std::move(hlo_module), {}));
+}
+
 XLA_TEST_F(FusionTest, Add2D) { TestElementwise2D<float, 2>(HloOpcode::kAdd); }
 
 XLA_TEST_F(FusionTest, Subtract2D) {
@@ -568,12 +610,66 @@ XLA_TEST_F(FusionTest, Clamp2D) {
   TestElementwise2D<float, 3>(HloOpcode::kClamp);
 }
 
+void BM_ParallelFusion(int num_iters) {
+  // Simple element-wise computation to benchmark parallel task partitioning.
+  tensorflow::testing::StopTiming();
+
+  se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
+  auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
+  StreamExecutorMemoryAllocator allocator(platform, executors);
+
+  const int64 intra_op_parallelism_threads = 16;
+  xla::LocalClientOptions client_options;
+  client_options.set_platform(platform);
+  client_options.set_intra_op_parallelism_threads(intra_op_parallelism_threads);
+  auto client =
+      ClientLibrary::GetOrCreateLocalClient(client_options).ValueOrDie();
+
+  const int64 dim_size = 1024;
+  // Create a simple fusable elementwise computation.
+  ComputationBuilder builder(client, "ParallelFusion");
+  Shape input_shape = ShapeUtil::MakeShape(F32, {dim_size, dim_size});
+  auto input0 = builder.Broadcast(builder.ConstantR0<float>(1.5f),
+                                  AsInt64Slice(input_shape.dimensions()));
+  auto input1 = builder.Broadcast(builder.ConstantR0<float>(2.0f),
+                                  AsInt64Slice(input_shape.dimensions()));
+  auto input2 = builder.Broadcast(builder.ConstantR0<float>(3.0f),
+                                  AsInt64Slice(input_shape.dimensions()));
+  auto x = builder.Mul(input0, input1);
+  auto y = builder.Add(x, input2);
+  auto computation = builder.Build().ConsumeValueOrDie();
+
+  std::unique_ptr<LocalExecutable> executable =
+      client->Compile(computation, {}, ExecutableBuildOptions())
+          .ConsumeValueOrDie();
+
+  // Run some warm-up executions.
+  ExecutableRunOptions options;
+  options.set_allocator(&allocator);
+  const int kWarmups = 2;
+  for (int i = 0; i < kWarmups; ++i) {
+    auto result = executable->Run({}, options);
+    ASSERT_TRUE(result.ok());
+  }
+
+  // Run benchmark.
+  tensorflow::testing::BytesProcessed(static_cast<int64>(num_iters) * dim_size *
+                                      dim_size * sizeof(float));
+  tensorflow::testing::UseRealTime();
+  tensorflow::testing::StartTiming();
+  for (int i = 0; i < num_iters; ++i) {
+    auto result = executable->Run({}, options);
+    ASSERT_TRUE(result.ok());
+  }
+}
+
+BENCHMARK(BM_ParallelFusion);
+
 }  // namespace
 }  // namespace xla
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
@@ -586,5 +682,6 @@ int main(int argc, char** argv) {
     LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
     return 2;
   }
+  tensorflow::testing::RunBenchmarks();
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/compiler/xla/tests/hlo_metadata_test.cc b/tensorflow/compiler/xla/tests/hlo_metadata_test.cc
index f54fa2256e217e9aa954a10470cd461023be631d..eded2077fce965ab1c729c610764afa2228ca128 100644
--- a/tensorflow/compiler/xla/tests/hlo_metadata_test.cc
+++ b/tensorflow/compiler/xla/tests/hlo_metadata_test.cc
@@ -46,7 +46,7 @@ TEST_F(HloMetadataTest, MetadataPropagation) {
   builder.ClearOpMetadata();
 
   Shape argument_layout = ShapeUtil::MakeShape(F32, {});
-  TF_ASSIGN_OR_ASSERT_OK(
+  TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LocalExecutable> executable,
       local_client_->Compile(builder.Build().ValueOrDie(),
                              {&argument_layout, &argument_layout},
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 5f7b7aa434e29980a7d813dfb57f3b7988ed6e6d..8149e2b7cc72018ef8deb61305bb61ceb77200f9 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -24,14 +24,12 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/hlo_test_base_flags.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
-#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
@@ -56,17 +54,6 @@ struct HloTestBase::EigenThreadPoolWrapper {
 
 HloTestBase::HloTestBase()
     : backend_(Backend::CreateDefaultBackend().ConsumeValueOrDie()) {
-  // TODO(b/62411181): get rid of this flag entirely when the usual debug flags
-  // are piped to all HLO tests.
-  test_hlo_dumper_ = [](const HloModule& module, const string& label) {
-    legacy_flags::HloTestBaseFlags* flags = legacy_flags::GetHloTestBaseFlags();
-    if (flags->xla_hlo_test_generate_hlo_graph) {
-      const bool show_addresses = true;
-      const bool show_layouts = true;
-      hlo_graph_dumper::DumpGraph(*module.entry_computation(), label,
-                                  show_addresses, show_layouts);
-    }
-  };
   VLOG(1) << "executing on platform " << backend_->platform()->Name();
 }
 
@@ -77,9 +64,16 @@ HloTestBase::~HloTestBase() {
   }
 }
 
+/* static */
 std::unique_ptr<HloModule> HloTestBase::CreateNewModule() {
   HloModuleConfig config;
-  config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+
+  auto debug_options = legacy_flags::GetDebugOptionsFromFlags();
+  // TODO(b/38354253): Change tests to use Parameters instead of Constants.
+  debug_options.add_xla_disable_hlo_passes("constant_folding");
+
+  config.set_debug_options(debug_options);
+
   return MakeUnique<HloModule>(TestName(), VersionedComputationHandle(),
                                config);
 }
@@ -91,7 +85,7 @@ StatusOr<perftools::gputools::DeviceMemoryBase> HloTestBase::Execute(
     Shape* result_shape) {
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
-      backend_->compiler()->Compile(std::move(module), test_hlo_dumper_,
+      backend_->compiler()->Compile(std::move(module),
                                     backend_->default_stream_executor()));
 
   se::Stream stream(backend_->default_stream_executor());
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 98bc35ae528970e262740631b283b7dbb6d01538..7f3d163290aba3cfcea1b3204e6c88134e172ed7 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -48,7 +48,7 @@ class HloTestBase : public ::testing::Test {
   // TestName() for its name; it will also automatically populate its debug
   // options from command-line flags. It's recommended to use this method to
   // create all HloModules for tests.
-  std::unique_ptr<HloModule> CreateNewModule();
+  static std::unique_ptr<HloModule> CreateNewModule();
 
   // Executes the given module and returns a global data handle.
   StatusOr<perftools::gputools::DeviceMemoryBase> Execute(
@@ -104,8 +104,6 @@ class HloTestBase : public ::testing::Test {
 
   std::unique_ptr<Backend> backend_;
 
-  Compiler::HloDumper test_hlo_dumper_;
-
   // This vector contains handles of all the device memory allocations performed
   // by the test. These are deallocated on destruction of the test object.
   std::vector<perftools::gputools::DeviceMemoryBase> allocations_;
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index eb979ad189db7b238ae6cc393d84d0c6c9fc27d1..0a8208332837545db27bff4e135feb586fc6429a 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -41,20 +41,25 @@ namespace xla {
 
 /* static */ void LiteralTestUtil::AssertEqualShapes(const Shape& expected,
                                                      const Shape& actual) {
-  ASSERT_EQ(ShapeUtil::Rank(expected), ShapeUtil::Rank(actual));
-  ASSERT_EQ(expected.element_type(), actual.element_type())
-      << PrimitiveType_Name(expected.element_type()) << " vs "
-      << PrimitiveType_Name(actual.element_type());
-  ASSERT_EQ(expected.dimensions_size(), actual.dimensions_size());
-  for (int i = 0; i < expected.dimensions_size(); ++i) {
-    ASSERT_EQ(expected.dimensions(i), actual.dimensions(i))
-        << "mismatch in dimension #" << i
-        << " expected: " << ShapeUtil::HumanString(expected)
-        << " actual: " << ShapeUtil::HumanString(actual);
-  }
-  ASSERT_EQ(expected.tuple_shapes_size(), actual.tuple_shapes_size());
-  for (int i = 0; i < expected.tuple_shapes_size(); ++i) {
-    AssertEqualShapes(expected.tuple_shapes(i), actual.tuple_shapes(i));
+  ASSERT_EQ(ShapeUtil::IsTuple(expected), ShapeUtil::IsTuple(actual));
+  if (ShapeUtil::IsTuple(expected)) {
+    ASSERT_EQ(ShapeUtil::TupleElementCount(expected),
+              ShapeUtil::TupleElementCount(actual));
+    for (int i = 0; i < expected.tuple_shapes_size(); ++i) {
+      AssertEqualShapes(expected.tuple_shapes(i), actual.tuple_shapes(i));
+    }
+  } else {
+    ASSERT_EQ(ShapeUtil::Rank(expected), ShapeUtil::Rank(actual));
+    ASSERT_EQ(expected.element_type(), actual.element_type())
+        << PrimitiveType_Name(expected.element_type()) << " vs "
+        << PrimitiveType_Name(actual.element_type());
+    ASSERT_EQ(expected.dimensions_size(), actual.dimensions_size());
+    for (int i = 0; i < expected.dimensions_size(); ++i) {
+      ASSERT_EQ(expected.dimensions(i), actual.dimensions(i))
+          << "mismatch in dimension #" << i
+          << " expected: " << ShapeUtil::HumanString(expected)
+          << " actual: " << ShapeUtil::HumanString(actual);
+    }
   }
 }
 
@@ -128,8 +133,8 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
                          tensorflow::gtl::MutableArraySlice<int64> multi_index,
                          int64 dimension) {
   if (dimension == expected.shape().dimensions_size()) {
-    NativeT expected_value = LiteralUtil::Get<NativeT>(expected, multi_index);
-    NativeT actual_value = LiteralUtil::Get<NativeT>(actual, multi_index);
+    NativeT expected_value = expected.Get<NativeT>(multi_index);
+    NativeT actual_value = actual.Get<NativeT>(multi_index);
     ::testing::AssertionResult result =
         CompareEqual<NativeT>(expected_value, actual_value);
     return result;  // Defines implicit coersion to bool.
@@ -147,11 +152,15 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
 }  // namespace
 
 /* static */ void LiteralTestUtil::ExpectEqual(const Literal& expected,
-                                               const Literal& actual) {
-  EXPECT_TRUE(Equal(expected, actual)) << "expected:\n"
-                                       << LiteralUtil::ToString(expected)
-                                       << "\n\tvs actual:\n"
-                                       << LiteralUtil::ToString(actual);
+                                               const Literal& actual,
+                                               const string& message) {
+  EXPECT_TRUE(Equal(expected, actual))
+      << "expected:\n"
+      << expected.ToString() << "\n\tvs actual:\n"
+      << actual.ToString()
+      << (message.empty()
+              ? ""
+              : tensorflow::strings::StrCat("\nmessage: ", message));
 }
 
 /* static */ void LiteralTestUtil::ExpectNotEqual(const Literal& expected,
@@ -161,8 +170,8 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
 
 /* static */ ::testing::AssertionResult LiteralTestUtil::Equal(
     const Literal& expected, const Literal& actual) {
-  VLOG(1) << "expected: " << LiteralUtil::ToString(expected);
-  VLOG(1) << "actual:   " << LiteralUtil::ToString(actual);
+  VLOG(1) << "expected: " << expected.ToString();
+  VLOG(1) << "actual:   " << actual.ToString();
 
   AssertEqualShapes(expected.shape(), actual.shape());
   std::vector<int64> multi_index(expected.shape().dimensions_size(), 0);
@@ -210,8 +219,8 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
   ::testing::AssertionResult result = ::testing::AssertionSuccess();
   if (!match) {
     result = ::testing::AssertionFailure()
-             << "expected: " << LiteralUtil::ToString(expected)
-             << "\nactual:   " << LiteralUtil::ToString(actual);
+             << "expected: " << expected.ToString()
+             << "\nactual:   " << actual.ToString();
     VLOG(1) << result.message();
   }
   return result;
@@ -219,8 +228,8 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
 
 /* static */ void LiteralTestUtil::ExpectEqualTuple(const Literal& expected,
                                                     const Literal& actual) {
-  VLOG(1) << "expected: " << LiteralUtil::ToString(expected);
-  VLOG(1) << "actual:   " << LiteralUtil::ToString(actual);
+  VLOG(1) << "expected: " << expected.ToString();
+  VLOG(1) << "actual:   " << actual.ToString();
 
   ASSERT_TRUE(ShapeUtil::IsTuple(expected.shape()));
   ASSERT_TRUE(ShapeUtil::IsTuple(actual.shape()));
@@ -247,8 +256,8 @@ class NearComparator {
   // within the error bound. Emits useful log messages and dumps literals to
   // temporary files on failure. Returns true if  literals match.
   bool ExpectNear(const Literal& expected, const Literal& actual) {
-    VLOG(1) << "expected: " << LiteralUtil::ToString(expected);
-    VLOG(1) << "actual:   " << LiteralUtil::ToString(actual);
+    VLOG(1) << "expected: " << expected.ToString();
+    VLOG(1) << "actual:   " << actual.ToString();
 
     LiteralTestUtil::AssertEqualShapes(expected.shape(), actual.shape());
 
@@ -282,9 +291,9 @@ class NearComparator {
     if (num_miscompares_ > 0) {
       if (!VLOG_IS_ON(1)) {
         LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected.shape())
-                  << " " << LiteralUtil::ToString(expected);
+                  << " " << expected.ToString();
         LOG(INFO) << "actual:   " << ShapeUtil::HumanString(actual.shape())
-                  << " " << LiteralUtil::ToString(actual);
+                  << " " << actual.ToString();
       }
       EXPECT_TRUE(num_miscompares_ == 0)
           << "\nmax relative mismatch at index "
@@ -369,10 +378,9 @@ class NearComparator {
   void ExpectLiteralsNear(const Literal& expected, const Literal& actual,
                           int64 dimension) {
     if (dimension == expected.shape().dimensions_size()) {
-      bool near =
-          ExpectValuesNear(LiteralUtil::Get<NativeT>(expected, multi_index_),
-                           LiteralUtil::Get<NativeT>(actual, multi_index_));
-      LiteralUtil::Set<bool>(&miscompares_, multi_index_, !near);
+      bool near = ExpectValuesNear(expected.Get<NativeT>(multi_index_),
+                                   actual.Get<NativeT>(multi_index_));
+      miscompares_.Set<bool>(multi_index_, !near);
     } else {
       for (int64 i = 0; i < expected.shape().dimensions(dimension); ++i) {
         multi_index_[dimension] = i;
@@ -431,14 +439,18 @@ class NearComparator {
 
 /* static */ void LiteralTestUtil::ExpectNear(const Literal& expected,
                                               const Literal& actual,
-                                              const ErrorSpec& error) {
-  EXPECT_TRUE(Near(expected, actual, error));
+                                              const ErrorSpec& error,
+                                              const string& message) {
+  EXPECT_TRUE(Near(expected, actual, error))
+      << (message.empty()
+              ? ""
+              : tensorflow::strings::StrCat("\nmessage: ", message));
 }
 
 /* static */ ::testing::AssertionResult LiteralTestUtil::NearTuple(
     const Literal& expected, const Literal& actual, const ErrorSpec& error) {
-  VLOG(1) << "expected: " << LiteralUtil::ToString(expected);
-  VLOG(1) << "actual:   " << LiteralUtil::ToString(actual);
+  VLOG(1) << "expected: " << expected.ToString();
+  VLOG(1) << "actual:   " << actual.ToString();
 
   if (!ShapeUtil::IsTuple(expected.shape()) ||
       !ShapeUtil::IsTuple(actual.shape())) {
@@ -504,8 +516,7 @@ class NearComparator {
   *shape_with_layout.mutable_layout() = LayoutUtil::MakeLayout(minor_to_major);
 
   // Allocate space in the new literal.
-  LiteralUtil::Reserve(ShapeUtil::ElementsIn(literal.shape()),
-                       new_literal.get());
+  new_literal->Reserve(ShapeUtil::ElementsIn(literal.shape()));
 
   // Copy data into new literal, element-by-element.
   for (int64 i = 0; i < ShapeUtil::ElementsIn(literal.shape()); ++i) {
@@ -515,44 +526,36 @@ class NearComparator {
         IndexUtil::LinearIndexToMultidimensionalIndex(shape_with_layout, i);
     switch (literal.shape().element_type()) {
       case PRED:
-        LiteralUtil::Set<bool>(
-            new_literal.get(), to_multi_index,
-            LiteralUtil::Get<bool>(literal, from_multi_index));
+        new_literal->Set<bool>(to_multi_index,
+                               literal.Get<bool>(from_multi_index));
         break;
       case U8:
-        LiteralUtil::Set<uint8>(
-            new_literal.get(), to_multi_index,
-            LiteralUtil::Get<uint8>(literal, from_multi_index));
+        new_literal->Set<uint8>(to_multi_index,
+                                literal.Get<uint8>(from_multi_index));
         break;
       case U32:
-        LiteralUtil::Set<uint32>(
-            new_literal.get(), to_multi_index,
-            LiteralUtil::Get<uint32>(literal, from_multi_index));
+        new_literal->Set<uint32>(to_multi_index,
+                                 literal.Get<uint32>(from_multi_index));
         break;
       case S32:
-        LiteralUtil::Set<int32>(
-            new_literal.get(), to_multi_index,
-            LiteralUtil::Get<int32>(literal, from_multi_index));
+        new_literal->Set<int32>(to_multi_index,
+                                literal.Get<int32>(from_multi_index));
         break;
       case U64:
-        LiteralUtil::Set<uint64>(
-            new_literal.get(), to_multi_index,
-            LiteralUtil::Get<uint64>(literal, from_multi_index));
+        new_literal->Set<uint64>(to_multi_index,
+                                 literal.Get<uint64>(from_multi_index));
         break;
       case S64:
-        LiteralUtil::Set<int64>(
-            new_literal.get(), to_multi_index,
-            LiteralUtil::Get<int64>(literal, from_multi_index));
+        new_literal->Set<int64>(to_multi_index,
+                                literal.Get<int64>(from_multi_index));
         break;
       case F32:
-        LiteralUtil::Set<float>(
-            new_literal.get(), to_multi_index,
-            LiteralUtil::Get<float>(literal, from_multi_index));
+        new_literal->Set<float>(to_multi_index,
+                                literal.Get<float>(from_multi_index));
         break;
       case F64:
-        LiteralUtil::Set<double>(
-            new_literal.get(), to_multi_index,
-            LiteralUtil::Get<double>(literal, from_multi_index));
+        new_literal->Set<double>(to_multi_index,
+                                 literal.Get<double>(from_multi_index));
         break;
       default:
         LOG(FATAL) << "Unhandled primitive element type: "
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h
index a8b07a2c5d13e93d068cd475cb96a727c8346cc5..f645c4e8dcda73806a4204876716b93aa5fb7185 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.h
+++ b/tensorflow/compiler/xla/tests/literal_test_util.h
@@ -64,7 +64,8 @@ class LiteralTestUtil {
       const Literal& expected, const Literal& actual) TF_MUST_USE_RESULT;
 
   // Expects that expected and actual are Equal.
-  static void ExpectEqual(const Literal& expected, const Literal& actual);
+  static void ExpectEqual(const Literal& expected, const Literal& actual,
+                          const string& message = "");
 
   // Expects that expected and actual are Not Equal.
   static void ExpectNotEqual(const Literal& expected, const Literal& actual);
@@ -110,7 +111,7 @@ class LiteralTestUtil {
 
   // Expects expected and actual to be Near with the given error.
   static void ExpectNear(const Literal& expected, const Literal& actual,
-                         const ErrorSpec& error);
+                         const ErrorSpec& error, const string& message = "");
 
   // Asserts the given literal are within the given error bound of the given
   // expected values. Only supported for floating point values.
@@ -130,6 +131,12 @@ class LiteralTestUtil {
           std::initializer_list<std::initializer_list<NativeT>>>
           expected,
       const Literal& actual, const ErrorSpec& error);
+  template <typename NativeT>
+  static void ExpectR4Near(
+      std::initializer_list<std::initializer_list<
+          std::initializer_list<std::initializer_list<NativeT>>>>
+          expected,
+      const Literal& actual, const ErrorSpec& error);
 
   // Asserts the given literal are within the given error bound to the given
   // array. Only supported for floating point values.
@@ -210,20 +217,20 @@ class LiteralTestUtil {
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR0Equal(NativeT expected,
                                                  const Literal& actual) {
-  ExpectEqual(*LiteralUtil::CreateR0<NativeT>(expected), actual);
+  ExpectEqual(*Literal::CreateR0<NativeT>(expected), actual);
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR1Equal(
     tensorflow::gtl::ArraySlice<NativeT> expected, const Literal& actual) {
-  ExpectEqual(*LiteralUtil::CreateR1<NativeT>(expected), actual);
+  ExpectEqual(*Literal::CreateR1<NativeT>(expected), actual);
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR2Equal(
     std::initializer_list<std::initializer_list<NativeT>> expected,
     const Literal& actual) {
-  ExpectEqual(*LiteralUtil::CreateR2<NativeT>(expected), actual);
+  ExpectEqual(*Literal::CreateR2<NativeT>(expected), actual);
 }
 
 template <typename NativeT>
@@ -231,46 +238,46 @@ template <typename NativeT>
     std::initializer_list<std::initializer_list<std::initializer_list<NativeT>>>
         expected,
     const Literal& actual) {
-  ExpectEqual(*LiteralUtil::CreateR3<NativeT>(expected), actual);
+  ExpectEqual(*Literal::CreateR3<NativeT>(expected), actual);
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR2EqualArray2D(
     const Array2D<NativeT>& expected, const Literal& actual) {
-  ExpectEqual(*LiteralUtil::CreateR2FromArray2D(expected), actual);
+  ExpectEqual(*Literal::CreateR2FromArray2D(expected), actual);
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR3EqualArray3D(
     const Array3D<NativeT>& expected, const Literal& actual) {
-  ExpectEqual(*LiteralUtil::CreateR3FromArray3D(expected), actual);
+  ExpectEqual(*Literal::CreateR3FromArray3D(expected), actual);
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR4EqualArray4D(
     const Array4D<NativeT>& expected, const Literal& actual) {
-  ExpectEqual(*LiteralUtil::CreateR4FromArray4D(expected), actual);
+  ExpectEqual(*Literal::CreateR4FromArray4D(expected), actual);
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR0Near(NativeT expected,
                                                 const Literal& actual,
                                                 const ErrorSpec& error) {
-  ExpectNear(*LiteralUtil::CreateR0<NativeT>(expected), actual, error);
+  ExpectNear(*Literal::CreateR0<NativeT>(expected), actual, error);
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR1Near(
     tensorflow::gtl::ArraySlice<NativeT> expected, const Literal& actual,
     const ErrorSpec& error) {
-  ExpectNear(*LiteralUtil::CreateR1<NativeT>(expected), actual, error);
+  ExpectNear(*Literal::CreateR1<NativeT>(expected), actual, error);
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR2Near(
     std::initializer_list<std::initializer_list<NativeT>> expected,
     const Literal& actual, const ErrorSpec& error) {
-  ExpectNear(*LiteralUtil::CreateR2<NativeT>(expected), actual, error);
+  ExpectNear(*Literal::CreateR2<NativeT>(expected), actual, error);
 }
 
 template <typename NativeT>
@@ -278,28 +285,37 @@ template <typename NativeT>
     std::initializer_list<std::initializer_list<std::initializer_list<NativeT>>>
         expected,
     const Literal& actual, const ErrorSpec& error) {
-  ExpectNear(*LiteralUtil::CreateR3<NativeT>(expected), actual, error);
+  ExpectNear(*Literal::CreateR3<NativeT>(expected), actual, error);
+}
+
+template <typename NativeT>
+/* static */ void LiteralTestUtil::ExpectR4Near(
+    std::initializer_list<std::initializer_list<
+        std::initializer_list<std::initializer_list<NativeT>>>>
+        expected,
+    const Literal& actual, const ErrorSpec& error) {
+  ExpectNear(*Literal::CreateR4<NativeT>(expected), actual, error);
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR2NearArray2D(
     const Array2D<NativeT>& expected, const Literal& actual,
     const ErrorSpec& error) {
-  ExpectNear(*LiteralUtil::CreateR2FromArray2D(expected), actual, error);
+  ExpectNear(*Literal::CreateR2FromArray2D(expected), actual, error);
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR3NearArray3D(
     const Array3D<NativeT>& expected, const Literal& actual,
     const ErrorSpec& error) {
-  ExpectNear(*LiteralUtil::CreateR3FromArray3D(expected), actual, error);
+  ExpectNear(*Literal::CreateR3FromArray3D(expected), actual, error);
 }
 
 template <typename NativeT>
 /* static */ void LiteralTestUtil::ExpectR4NearArray4D(
     const Array4D<NativeT>& expected, const Literal& actual,
     const ErrorSpec& error) {
-  ExpectNear(*LiteralUtil::CreateR4FromArray4D(expected), actual, error);
+  ExpectNear(*Literal::CreateR4FromArray4D(expected), actual, error);
 }
 
 template <PrimitiveType type, typename T>
@@ -309,9 +325,9 @@ LiteralTestUtil::CreateRandomLiteral(
     const std::function<T(tensorflow::gtl::ArraySlice<int64>)>& generator) {
   using NativeT = typename primitive_util::PrimitiveTypeToNative<type>::type;
   TF_RET_CHECK(shape.element_type() == type);
-  std::unique_ptr<Literal> literal = LiteralUtil::CreateFromShape(shape);
-  TF_RETURN_IF_ERROR(LiteralUtil::Populate<NativeT>(
-      literal.get(), [&](tensorflow::gtl::ArraySlice<int64> indexes) {
+  std::unique_ptr<Literal> literal = Literal::CreateFromShape(shape);
+  TF_RETURN_IF_ERROR(literal.get()->Populate<NativeT>(
+      [&](tensorflow::gtl::ArraySlice<int64> indexes) {
         return generator(indexes);
       }));
   return std::move(literal);
diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
index a94f45f73b7d058d6b82f91967f61624a28fea3d..2acf27ed390b0732ba40fcf505c746bd7d8b651e 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
@@ -31,9 +31,8 @@ namespace xla {
 namespace {
 
 TEST(LiteralTestUtilTest, ComparesEqualTuplesEqual) {
-  std::unique_ptr<Literal> literal = LiteralUtil::MakeTuple({
-      LiteralUtil::CreateR0<int32>(42).get(),
-      LiteralUtil::CreateR0<int32>(64).get(),
+  std::unique_ptr<Literal> literal = Literal::MakeTuple({
+      Literal::CreateR0<int32>(42).get(), Literal::CreateR0<int32>(64).get(),
   });
   LiteralTestUtil::ExpectEqual(*literal, *literal);
 }
@@ -43,13 +42,11 @@ TEST(LiteralTestUtilTest, ComparesUnequalTuplesUnequal) {
   // un-fail an assertion failure. The CHECK-failure is death, so we can make a
   // death assertion.
   auto unequal_things_are_equal = [] {
-    std::unique_ptr<Literal> lhs = LiteralUtil::MakeTuple({
-        LiteralUtil::CreateR0<int32>(42).get(),
-        LiteralUtil::CreateR0<int32>(64).get(),
+    std::unique_ptr<Literal> lhs = Literal::MakeTuple({
+        Literal::CreateR0<int32>(42).get(), Literal::CreateR0<int32>(64).get(),
     });
-    std::unique_ptr<Literal> rhs = LiteralUtil::MakeTuple({
-        LiteralUtil::CreateR0<int32>(64).get(),
-        LiteralUtil::CreateR0<int32>(42).get(),
+    std::unique_ptr<Literal> rhs = Literal::MakeTuple({
+        Literal::CreateR0<int32>(64).get(), Literal::CreateR0<int32>(42).get(),
     });
     CHECK(LiteralTestUtil::Equal(*lhs, *rhs)) << "LHS and RHS are unequal";
   };
@@ -58,8 +55,8 @@ TEST(LiteralTestUtilTest, ComparesUnequalTuplesUnequal) {
 
 TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) {
   auto dummy_lambda = [] {
-    auto two = LiteralUtil::CreateR0<float>(2);
-    auto four = LiteralUtil::CreateR0<float>(4);
+    auto two = Literal::CreateR0<float>(2);
+    auto four = Literal::CreateR0<float>(4);
     ErrorSpec error(0.001);
     CHECK(LiteralTestUtil::Near(*two, *four, error)) << "two is not near four";
   };
@@ -88,11 +85,11 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) {
                                             &literal_proto));
     Literal literal(literal_proto);
     if (result.find("expected") != string::npos) {
-      EXPECT_EQ("2", LiteralUtil::ToString(literal));
+      EXPECT_EQ("2", literal.ToString());
     } else if (result.find("actual") != string::npos) {
-      EXPECT_EQ("4", LiteralUtil::ToString(literal));
+      EXPECT_EQ("4", literal.ToString());
     } else if (result.find("miscompares") != string::npos) {
-      EXPECT_EQ("true", LiteralUtil::ToString(literal));
+      EXPECT_EQ("true", literal.ToString());
     } else {
       FAIL() << "unknown file in temporary directory: " << result;
     }
diff --git a/tensorflow/compiler/xla/tests/log_test.cc b/tensorflow/compiler/xla/tests/log_test.cc
index 796f43ea4edc2c4858eb85c7fa8a16bbe8401a4b..4cb383a78dfed8a4867f4b589c6c32db345dfc9f 100644
--- a/tensorflow/compiler/xla/tests/log_test.cc
+++ b/tensorflow/compiler/xla/tests/log_test.cc
@@ -18,7 +18,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -62,7 +61,6 @@ TEST_F(LogTest, LogTenValues) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc
index e4dbd6864a325546fabd88b56acf341b99cb73c8..47a8acbf4ab76758d8387e84eb271c130aba5a64 100644
--- a/tensorflow/compiler/xla/tests/map_test.cc
+++ b/tensorflow/compiler/xla/tests/map_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -170,7 +169,7 @@ class MapTest : public ClientLibraryTestBase {
 TEST_F(MapTest, MapEachElemPlusOneR0) {
   // Applies lambda (x) (+ x 1)) to an input scalar.
   ComputationBuilder builder(client_, TestName());
-  std::unique_ptr<Literal> param0_literal = LiteralUtil::CreateR0<float>(42.0);
+  std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(42.0);
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -184,7 +183,7 @@ TEST_F(MapTest, MapEachElemPlusOneR0) {
 XLA_TEST_F(MapTest, MapEachElemPlusOneR1S0) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 0.
   ComputationBuilder builder(client_, TestName());
-  std::unique_ptr<Literal> param0_literal = LiteralUtil::CreateR1<float>({});
+  std::unique_ptr<Literal> param0_literal = Literal::CreateR1<float>({});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -199,7 +198,7 @@ TEST_F(MapTest, MapEachElemPlusOneR1S4) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 4.
   ComputationBuilder builder(client_, TestName());
   std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
+      Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -213,7 +212,7 @@ TEST_F(MapTest, MapEachElemPlusOneR1S4) {
 TEST_F(MapTest, MapEachF32ElementToS32Constant) {
   ComputationBuilder builder(client_, TestName());
   std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
+      Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -226,7 +225,7 @@ TEST_F(MapTest, MapEachF32ElementToS32Constant) {
 TEST_F(MapTest, MapEachF32ElementToU32Constant) {
   ComputationBuilder builder(client_, TestName());
   std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
+      Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -240,7 +239,7 @@ TEST_F(MapTest, MapEachElemLongerChainR1) {
   // Maps (lambda (x) (* (+ x 1) x)) onto an input R1F32 vector.
   ComputationBuilder builder(client_, TestName());
   std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR1<float>({2.6f, -5.1f, 0.1f, 0.2f, 999.0f, 255.5f});
+      Literal::CreateR1<float>({2.6f, -5.1f, 0.1f, 0.2f, 999.0f, 255.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -256,7 +255,7 @@ XLA_TEST_F(MapTest, MapMultipleMapsR1S0) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 0, and then
   // maps (lambda (x) (* x 2)) on the result.
   ComputationBuilder builder(client_, TestName());
-  std::unique_ptr<Literal> param0_literal = LiteralUtil::CreateR1<float>({});
+  std::unique_ptr<Literal> param0_literal = Literal::CreateR1<float>({});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -273,7 +272,7 @@ TEST_F(MapTest, MapMultipleMapsR1S4) {
   // maps (lambda (x) (* x 2)) on the result.
   ComputationBuilder builder(client_, TestName());
   std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
+      Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -288,7 +287,7 @@ TEST_F(MapTest, MapMultipleMapsR1S4) {
 TEST_F(MapTest, MapEachElemPlusOneR2) {
   // Maps (lambda (x) (+ x 1)) onto an input R2F32 vector.
   ComputationBuilder builder(client_, TestName());
-  std::unique_ptr<Literal> param0_literal = LiteralUtil::CreateR2<float>(
+  std::unique_ptr<Literal> param0_literal = Literal::CreateR2<float>(
       {{13.25f, 14.0f}, {-7.1f, -7.2f}, {-8.8f, 8.8f}});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
@@ -385,11 +384,11 @@ TEST_F(MapTest, MapBinaryAdder) {
   // Maps (lambda (x y) (+ x y)) onto two R1F32 vectors.
   ComputationBuilder builder(client_, TestName());
   std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
+      Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
   std::unique_ptr<Literal> param1_literal =
-      LiteralUtil::CreateR1<float>({5.1f, 4.4f, -0.1f, -5.5f});
+      Literal::CreateR1<float>({5.1f, 4.4f, -0.1f, -5.5f});
   std::unique_ptr<GlobalData> param1_data =
       client_->TransferToServer(*param1_literal).ConsumeValueOrDie();
 
@@ -434,12 +433,12 @@ XLA_TEST_F(MapTest, AddWithMixedLayouts) {
 XLA_TEST_F(MapTest, AddR3_3x0x2) {
   ComputationBuilder builder(client_, TestName());
   std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR3FromArray3D<int32>(Array3D<int32>(3, 0, 2));
+      Literal::CreateR3FromArray3D<int32>(Array3D<int32>(3, 0, 2));
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   std::unique_ptr<Literal> param1_literal =
-      LiteralUtil::CreateR3FromArray3D<int32>(Array3D<int32>(3, 0, 2));
+      Literal::CreateR3FromArray3D<int32>(Array3D<int32>(3, 0, 2));
   std::unique_ptr<GlobalData> param1_data =
       client_->TransferToServer(*param1_literal).ConsumeValueOrDie();
 
@@ -456,15 +455,15 @@ TEST_F(MapTest, MapTernaryAdder) {
   // Maps (lambda (x y z) (+ x y z)) onto three R1F32 vectors.
   ComputationBuilder builder(client_, TestName());
   std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
+      Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
   std::unique_ptr<Literal> param1_literal =
-      LiteralUtil::CreateR1<float>({5.1f, 4.4f, -0.1f, -5.5f});
+      Literal::CreateR1<float>({5.1f, 4.4f, -0.1f, -5.5f});
   std::unique_ptr<GlobalData> param1_data =
       client_->TransferToServer(*param1_literal).ConsumeValueOrDie();
   std::unique_ptr<Literal> param2_literal =
-      LiteralUtil::CreateR1<float>({-10.0f, -100.0f, -900.0f, -400.0f});
+      Literal::CreateR1<float>({-10.0f, -100.0f, -900.0f, -400.0f});
   std::unique_ptr<GlobalData> param2_data =
       client_->TransferToServer(*param2_literal).ConsumeValueOrDie();
 
@@ -517,11 +516,11 @@ TEST_F(MapTest, MapOperantionWithBuildError) {
   auto error_add = sub_builder->BuildAndNoteError();
 
   std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
+      Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
   std::unique_ptr<Literal> param1_literal =
-      LiteralUtil::CreateR1<float>({5.1f, 4.4f, -0.1f, -5.5f});
+      Literal::CreateR1<float>({5.1f, 4.4f, -0.1f, -5.5f});
   std::unique_ptr<GlobalData> param1_data =
       client_->TransferToServer(*param1_literal).ConsumeValueOrDie();
 
@@ -531,9 +530,10 @@ TEST_F(MapTest, MapOperantionWithBuildError) {
 
   StatusOr<Computation> computation_status = builder.Build();
   ASSERT_TRUE(!computation_status.ok());
-  EXPECT_THAT(computation_status.status().ToString(),
-              ::testing::HasSubstr("error from: ErrorAdd: binary op with "
-                                   "different element types: f32[] and u16[]"));
+  EXPECT_THAT(
+      computation_status.status().ToString(),
+      ::testing::HasSubstr("error from: ErrorAdd: binary op BINOP_ADD with "
+                           "different element types: f32[] and u16[]"));
 }
 
 // MapTest disables inline and algsimp. MapTestWithFullOpt runs all
@@ -554,8 +554,8 @@ TEST_F(MapTestWithFullOpt, MapScalarPower) {
   sub_builder->Pow(x, y);
   auto power = sub_builder->BuildAndNoteError();
 
-  std::unique_ptr<Literal> param0_literal = LiteralUtil::CreateR0<float>(2.0f);
-  std::unique_ptr<Literal> param1_literal = LiteralUtil::CreateR0<float>(5.0f);
+  std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(2.0f);
+  std::unique_ptr<Literal> param1_literal = Literal::CreateR0<float>(5.0f);
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
   std::unique_ptr<GlobalData> param1_data =
@@ -581,8 +581,8 @@ TEST_F(MapTestWithFullOpt, MapSubtractOppositeOrder) {
   sub_builder->Sub(y, x);  // note that this is y - x, not x - y
   auto sub_opposite = sub_builder->BuildAndNoteError();
 
-  std::unique_ptr<Literal> param0_literal = LiteralUtil::CreateR0<float>(2.0f);
-  std::unique_ptr<Literal> param1_literal = LiteralUtil::CreateR0<float>(5.0f);
+  std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(2.0f);
+  std::unique_ptr<Literal> param1_literal = Literal::CreateR0<float>(5.0f);
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
   std::unique_ptr<GlobalData> param1_data =
@@ -606,7 +606,7 @@ TEST_F(MapTestWithFullOpt, MapSquare) {
   sub_builder->Mul(x, x);
   auto square = sub_builder->BuildAndNoteError();
 
-  std::unique_ptr<Literal> param0_literal = LiteralUtil::CreateR0<float>(10.0f);
+  std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(10.0f);
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -622,7 +622,6 @@ TEST_F(MapTestWithFullOpt, MapSquare) {
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
diff --git a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
index 51261f0ac1c15ee96dd0f749fec35971d73b34f2..9ad9b33176691f361e03af35ede8030d5417592a 100644
--- a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
@@ -88,8 +87,8 @@ TEST_F(MatOpsSimpleTest, ExpTwoByTwoValues) {
   builder.Exp(data);
 
   std::unique_ptr<Literal> expected =
-      LiteralUtil::CreateR2<float>({{2.71828, 1.00000},    // row 0
-                                    {0.36788, 1.64872}});  // row 1
+      Literal::CreateR2<float>({{2.71828, 1.00000},    // row 0
+                                {0.36788, 1.64872}});  // row 1
 
   ComputeAndCompareLiteral(&builder, *expected, {}, ErrorSpec(1e-5));
 }
@@ -116,8 +115,8 @@ TEST_F(MatOpsSimpleTest, MapTwoByTwo) {
   auto map = builder.Map({data}, add_half);
 
   std::unique_ptr<Literal> expected =
-      LiteralUtil::CreateR2<float>({{1.5, 0.5},     // row 0
-                                    {-0.5, 1.0}});  // row 1
+      Literal::CreateR2<float>({{1.5, 0.5},     // row 0
+                                {-0.5, 1.0}});  // row 1
   ComputeAndCompareLiteral(&builder, *expected, {}, ErrorSpec(1e-5));
 }
 
@@ -134,8 +133,8 @@ TEST_F(MatOpsSimpleTest, MaxTwoByTwoValues) {
   auto max = builder.Max(lhs, rhs);
 
   std::unique_ptr<Literal> expected =
-      LiteralUtil::CreateR2<float>({{7.0, 6.0},     // row 0
-                                    {3.0, -4.0}});  // row 1
+      Literal::CreateR2<float>({{7.0, 6.0},     // row 0
+                                {3.0, -4.0}});  // row 1
   ComputeAndCompareLiteral(&builder, *expected, {}, ErrorSpec(1e-6));
 }
 
@@ -179,16 +178,14 @@ TEST_P(MatOpsDotAddTest, Dot_Add_2x2_2x2) {
   Shape rhs_shape =
       ShapeUtil::MakeShape(prim_type, {rhs.height(), rhs.width()});
 
-  TF_ASSIGN_OR_ASSERT_OK(
+  TF_ASSERT_OK_AND_ASSIGN(
       auto lhs_handle,
-      client_->TransferToServer(
-          *LiteralUtil::CreateR2FromArray2DWithLayout<float>(
-              lhs, LayoutUtil::MakeLayout(minor_to_major(row_major)))));
-  TF_ASSIGN_OR_ASSERT_OK(
+      client_->TransferToServer(*Literal::CreateR2FromArray2DWithLayout<float>(
+          lhs, LayoutUtil::MakeLayout(minor_to_major(row_major)))));
+  TF_ASSERT_OK_AND_ASSIGN(
       auto rhs_handle,
-      client_->TransferToServer(
-          *LiteralUtil::CreateR2FromArray2DWithLayout<float>(
-              rhs, LayoutUtil::MakeLayout(minor_to_major(row_major)))));
+      client_->TransferToServer(*Literal::CreateR2FromArray2DWithLayout<float>(
+          rhs, LayoutUtil::MakeLayout(minor_to_major(row_major)))));
 
   ComputationBuilder builder(client_, TestName());
   auto lhs_arg = builder.Parameter(0, lhs_shape, "lhs");
@@ -218,7 +215,6 @@ INSTANTIATE_TEST_CASE_P(MatOpsDotAddTestInstances, MatOpsDotAddTest,
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
index 4929e25c580c427a3f034ccf82e7821222be0d8a..56c15e5ff7256cc75a10733e5934894cc88a34da 100644
--- a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
+++ b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -60,7 +59,6 @@ XLA_TEST_F(SliceTest, Slice3D) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b34e1d7db24fbbc5927102bce94f576f3e6d4947
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
@@ -0,0 +1,197 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <math.h>
+#include <algorithm>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/platform/types.h"
+
+using tensorflow::gtl::ArraySlice;
+
+namespace xla {
+namespace {
+
+class MultiOutputFusionTest : public HloTestBase {
+ public:
+  ErrorSpec error_spec_{0.0001, 1e-2};
+
+ protected:
+  MultiOutputFusionTest() {}
+  void RunTest2D(bool manual_fusion, int64 size) {
+    auto builder = HloComputation::Builder(TestName());
+    auto hlo_module = CreateNewModule();
+
+    const Shape elem_shape0 = ShapeUtil::MakeShape(F32, {});
+    const Shape elem_shape2 = ShapeUtil::MakeShape(F32, {size, size});
+
+    auto const0 = builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<float>(8.0f)));
+    auto param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, elem_shape0, "0"));
+
+    auto add1 = builder.AddInstruction(HloInstruction::CreateBinary(
+        elem_shape0, HloOpcode::kAdd, param0, const0));
+
+    HloInstruction* broadcast = builder.AddInstruction(
+        HloInstruction::CreateBroadcast(elem_shape2, add1, {0, 1}));
+
+    auto param1 = builder.AddInstruction(
+        HloInstruction::CreateParameter(1, elem_shape2, "1"));
+
+    HloInstruction* add2 = builder.AddInstruction(HloInstruction::CreateBinary(
+        elem_shape2, HloOpcode::kAdd, broadcast, param1));
+    HloInstruction* sub = builder.AddInstruction(HloInstruction::CreateBinary(
+        elem_shape2, HloOpcode::kSubtract, param1, broadcast));
+    HloInstruction* dot = builder.AddInstruction(
+        HloInstruction::CreateBinary(elem_shape2, HloOpcode::kDot, sub, add2));
+    auto computation = hlo_module->AddEntryComputation(builder.Build(dot));
+
+    if (manual_fusion) {
+      auto tuple = computation->AddInstruction(HloInstruction::CreateTuple(
+          ArraySlice<HloInstruction*>({sub, add2}, 0, 2)));
+      auto gte0 = computation->AddInstruction(
+          HloInstruction::CreateGetTupleElement(elem_shape2, tuple, 0));
+      auto gte1 = computation->AddInstruction(
+          HloInstruction::CreateGetTupleElement(elem_shape2, tuple, 1));
+      TF_CHECK_OK(dot->ReplaceOperandWith(0, gte0));
+      TF_CHECK_OK(dot->ReplaceOperandWith(1, gte1));
+
+      CHECK_NE(
+          computation->CreateFusionInstruction(
+              {tuple, sub, add2, broadcast}, HloInstruction::FusionKind::kLoop),
+          nullptr);
+    }
+
+    Literal input;
+    input.PopulateWithValue<float>(2.5f, {size, size});
+    auto p1 = TransferToDevice(input);
+    auto p0 = TransferToDevice(*Literal::CreateR0<float>(-9.0f));
+
+    Literal expect;
+    expect.PopulateWithValue<float>(size * 1.5f * 3.5f, {size, size});
+    auto actual = ExecuteAndTransfer(std::move(hlo_module), {p0, p1});
+    LiteralTestUtil::ExpectNear(expect, *actual, error_spec_);
+  }
+
+  void RunTest1D(bool manual_fusion, int size) {
+    auto builder = HloComputation::Builder(TestName());
+    auto hlo_module = CreateNewModule();
+
+    const Shape elem_shape_F32 = ShapeUtil::MakeShape(F32, {size});
+    const Shape elem_shape_U8 = ShapeUtil::MakeShape(F64, {size});
+    auto param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, elem_shape_F32, "0"));
+    auto param1 = builder.AddInstruction(
+        HloInstruction::CreateParameter(1, elem_shape_U8, "1"));
+
+    HloInstruction* param0_U8 = builder.AddInstruction(
+        HloInstruction::CreateConvert(elem_shape_U8, param0));
+    HloInstruction* param1_F32 = builder.AddInstruction(
+        HloInstruction::CreateConvert(elem_shape_F32, param1));
+    HloInstruction* add = builder.AddInstruction(HloInstruction::CreateBinary(
+        elem_shape_F32, HloOpcode::kAdd, param0, param1_F32));
+    HloInstruction* sub_U8 =
+        builder.AddInstruction(HloInstruction::CreateBinary(
+            elem_shape_U8, HloOpcode::kSubtract, param0_U8, param1));
+    HloInstruction* sub = builder.AddInstruction(
+        HloInstruction::CreateConvert(elem_shape_F32, sub_U8));
+
+    HloInstruction* reshape =
+        builder.AddInstruction(HloInstruction::CreateReshape(
+            ShapeUtil::MakeShape(F32, {size, 1}), add));
+    HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(F32, {}), HloOpcode::kDot, sub, reshape));
+    auto computation = hlo_module->AddEntryComputation(builder.Build(dot));
+
+    if (manual_fusion) {
+      auto tuple = computation->AddInstruction(HloInstruction::CreateTuple(
+          ArraySlice<HloInstruction*>({sub_U8, add}, 0, 2)));
+
+      auto gte0 = computation->AddInstruction(
+          HloInstruction::CreateGetTupleElement(elem_shape_U8, tuple, 0));
+      auto gte1 = computation->AddInstruction(
+          HloInstruction::CreateGetTupleElement(elem_shape_F32, tuple, 1));
+      TF_CHECK_OK(sub->ReplaceOperandWith(0, gte0));
+      TF_CHECK_OK(reshape->ReplaceOperandWith(0, gte1));
+
+      CHECK_NE(computation->CreateFusionInstruction(
+                   {tuple, sub_U8, add, param0_U8, param1_F32},
+                   HloInstruction::FusionKind::kLoop),
+               nullptr);
+    }
+
+    Literal input0, input1;
+    input0.PopulateWithValue<float>(2.5f, {size});
+    input1.PopulateWithValue<double>(1, {size});
+    auto p0 = TransferToDevice(input0);
+    auto p1 = TransferToDevice(input1);
+
+    Literal expect = *Literal::CreateR0<float>(size * 1.5f * 3.5f);
+    auto actual = ExecuteAndTransfer(std::move(hlo_module), {p0, p1});
+    LiteralTestUtil::ExpectNear(expect, *actual, error_spec_);
+  }
+};
+
+XLA_TEST_F(MultiOutputFusionTest, 2DNofusion) { RunTest2D(false, 5); }
+XLA_TEST_F(MultiOutputFusionTest, 2DFusion) { RunTest2D(true, 5); }
+XLA_TEST_F(MultiOutputFusionTest, 2DFusionSize129) { RunTest2D(true, 129); }
+XLA_TEST_F(MultiOutputFusionTest, DiffentTypesNoFusion) { RunTest1D(false, 8); }
+XLA_TEST_F(MultiOutputFusionTest, DiffentTypesFusion) { RunTest1D(true, 8); }
+
+}  // namespace
+}  // namespace xla
+
+int main(int argc, char** argv) {
+  std::vector<tensorflow::Flag> flag_list;
+  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
+  xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    LOG(ERROR) << "\n" << usage;
+    return 2;
+  }
+  testing::InitGoogleTest(&argc, argv);
+  if (argc > 1) {
+    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
+    return 2;
+  }
+
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/compiler/xla/tests/pad_test.cc b/tensorflow/compiler/xla/tests/pad_test.cc
index 4922bbf21c447e4db193e63919d4df5f8079e3be..e270a0477fe140b75b6d4ddffb5d4d98ced2171d 100644
--- a/tensorflow/compiler/xla/tests/pad_test.cc
+++ b/tensorflow/compiler/xla/tests/pad_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -183,8 +182,8 @@ TEST_F(PadTest, Pad4DFloatArrayMinorFirstSmall) {
 
   const float pad_value = -5.123f;
   Array4D<float> input_array(1, 1, 2, 3, {1, 2, 3, 4, 5, 6});
-  auto input = LiteralUtil::CreateR4FromArray4D<float>(input_array);
-  input = LiteralUtil::Relayout(*input, layout);
+  auto input = Literal::CreateR4FromArray4D<float>(input_array);
+  input = input->Relayout(layout);
 
   b.Pad(b.ConstantLiteral(*input), b.ConstantR0(pad_value), padding_config);
 
@@ -228,8 +227,8 @@ XLA_TEST_F(PadTest, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
   input_array(0, 0, 0, 0) = 1.0f;
   input_array(0, 24, 6, 6) = 2.0f;
   input_array(0, 17, 2, 5) = 3.0f;
-  auto input = LiteralUtil::CreateR4FromArray4D<float>(input_array);
-  input = LiteralUtil::Relayout(*input, layout);
+  auto input = Literal::CreateR4FromArray4D<float>(input_array);
+  input = input->Relayout(layout);
 
   b.Pad(b.ConstantLiteral(*input), b.ConstantR0(pad_value), padding_config);
 
@@ -308,7 +307,7 @@ XLA_TEST_F(PadTest, Large2DPad) {
 
   auto ones = MakeUnique<Array2D<float>>(4, 4);
   ones->Fill(1.0f);
-  auto input_literal = LiteralUtil::CreateR2FromArray2D<float>(*ones);
+  auto input_literal = Literal::CreateR2FromArray2D<float>(*ones);
   std::unique_ptr<GlobalData> input_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
@@ -334,7 +333,7 @@ XLA_TEST_F(PadTest, AllTypes2DPad) {
 
   auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
   operand->FillUnique(0.0f);
-  auto input_literal = LiteralUtil::CreateR2FromArray2D<float>(*operand);
+  auto input_literal = Literal::CreateR2FromArray2D<float>(*operand);
   std::unique_ptr<GlobalData> input_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
@@ -365,7 +364,7 @@ XLA_TEST_F(PadTest, High2DPad) {
 
   auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
   operand->FillUnique(1.0f);
-  auto input_literal = LiteralUtil::CreateR2FromArray2D<float>(*operand);
+  auto input_literal = Literal::CreateR2FromArray2D<float>(*operand);
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
   std::unique_ptr<GlobalData> input_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
@@ -397,7 +396,7 @@ XLA_TEST_F(PadTest, NegativePadding2D) {
 
   auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
   operand->FillUnique(1.0f);
-  auto input_literal = LiteralUtil::CreateR2FromArray2D<float>(*operand);
+  auto input_literal = Literal::CreateR2FromArray2D<float>(*operand);
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
   std::unique_ptr<GlobalData> input_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
@@ -429,7 +428,7 @@ XLA_TEST_F(PadTest, NegativeAndInteriorPadding2D) {
 
   auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
   operand->FillUnique(1.0f);
-  auto input_literal = LiteralUtil::CreateR2FromArray2D<float>(*operand);
+  auto input_literal = Literal::CreateR2FromArray2D<float>(*operand);
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
   std::unique_ptr<GlobalData> input_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
@@ -453,7 +452,7 @@ XLA_TEST_F(PadTest, ReducePad) {
 
   auto ones = MakeUnique<Array4D<float>>(2, 2, 2, 2);
   ones->Fill(1.0);
-  auto input_literal = LiteralUtil::CreateR4FromArray4D<float>(*ones);
+  auto input_literal = Literal::CreateR4FromArray4D<float>(*ones);
   std::unique_ptr<GlobalData> input_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
@@ -470,7 +469,6 @@ XLA_TEST_F(PadTest, ReducePad) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/params_test.cc b/tensorflow/compiler/xla/tests/params_test.cc
index 3e1bfcd3090df6df69e344c157390a41476f17a4..a7692fceb4751a4e81851c382be0371efbff8dc8 100644
--- a/tensorflow/compiler/xla/tests/params_test.cc
+++ b/tensorflow/compiler/xla/tests/params_test.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -44,8 +43,7 @@ class ParamsTest : public ClientLibraryTestBase {};
 
 XLA_TEST_F(ParamsTest, ConstantR0F32Param) {
   ComputationBuilder builder(client_, TestName());
-  std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR0<float>(3.14159f);
+  std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(3.14159f);
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -57,7 +55,7 @@ XLA_TEST_F(ParamsTest, ConstantR0F32Param) {
 
 XLA_TEST_F(ParamsTest, ConstantR1S0F32Param) {
   ComputationBuilder builder(client_, TestName());
-  std::unique_ptr<Literal> param0_literal = LiteralUtil::CreateR1<float>({});
+  std::unique_ptr<Literal> param0_literal = Literal::CreateR1<float>({});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -70,7 +68,7 @@ XLA_TEST_F(ParamsTest, ConstantR1S0F32Param) {
 XLA_TEST_F(ParamsTest, ConstantR1S2F32Param) {
   ComputationBuilder builder(client_, TestName());
   std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR1<float>({3.14f, -100.25f});
+      Literal::CreateR1<float>({3.14f, -100.25f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -83,7 +81,7 @@ XLA_TEST_F(ParamsTest, ConstantR1S2F32Param) {
 XLA_TEST_F(ParamsTest, ConstantR1U8Param) {
   ComputationBuilder builder(client_, TestName());
   string str("hello world");
-  std::unique_ptr<Literal> param0_literal = LiteralUtil::CreateR1U8(str);
+  std::unique_ptr<Literal> param0_literal = Literal::CreateR1U8(str);
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -96,7 +94,7 @@ XLA_TEST_F(ParamsTest, ConstantR1U8Param) {
 XLA_TEST_F(ParamsTest, ConstantR2_3x0_F32Param) {
   ComputationBuilder builder(client_, TestName());
   std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR2FromArray2D<float>(Array2D<float>(3, 0));
+      Literal::CreateR2FromArray2D<float>(Array2D<float>(3, 0));
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -108,7 +106,7 @@ XLA_TEST_F(ParamsTest, ConstantR2_3x0_F32Param) {
 
 XLA_TEST_F(ParamsTest, ConstantR2F32Param) {
   ComputationBuilder builder(client_, TestName());
-  std::unique_ptr<Literal> param0_literal = LiteralUtil::CreateR2<float>(
+  std::unique_ptr<Literal> param0_literal = Literal::CreateR2<float>(
       {{3.14f, -100.25f}, {7e8f, 7e-9f}, {30.3f, -100.0f}});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
@@ -124,12 +122,12 @@ XLA_TEST_F(ParamsTest, ConstantR2F32Param) {
 XLA_TEST_F(ParamsTest, TwoParameters) {
   ComputationBuilder builder(client_, TestName());
 
-  std::unique_ptr<Literal> literal0 = LiteralUtil::CreateR1<float>({1, 2});
+  std::unique_ptr<Literal> literal0 = Literal::CreateR1<float>({1, 2});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*literal0).ConsumeValueOrDie();
   auto param0 = builder.Parameter(0, literal0->shape(), "param0");
 
-  std::unique_ptr<Literal> literal1 = LiteralUtil::CreateR1<float>({10, 20});
+  std::unique_ptr<Literal> literal1 = Literal::CreateR1<float>({10, 20});
   std::unique_ptr<GlobalData> param1_data =
       client_->TransferToServer(*literal1).ConsumeValueOrDie();
   auto param1 = builder.Parameter(1, literal1->shape(), "param1");
@@ -155,7 +153,7 @@ XLA_TEST_F(ParamsTest, TwoParameters) {
 XLA_TEST_F(ParamsTest, MissingParameter) {
   // Test that an error is returned when a computation with an incomplete set of
   // parameters (parameter numbers not contiguous from 0) is executed.
-  std::unique_ptr<Literal> literal = LiteralUtil::CreateR0<float>(3.14159f);
+  std::unique_ptr<Literal> literal = Literal::CreateR0<float>(3.14159f);
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
 
@@ -173,12 +171,12 @@ XLA_TEST_F(ParamsTest, MissingParameter) {
 XLA_TEST_F(ParamsTest, UnusedParameter) {
   ComputationBuilder builder(client_, TestName());
 
-  std::unique_ptr<Literal> literal0 = LiteralUtil::CreateR1<float>({1, 2});
+  std::unique_ptr<Literal> literal0 = Literal::CreateR1<float>({1, 2});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*literal0).ConsumeValueOrDie();
   auto param0 = builder.Parameter(0, literal0->shape(), "param0");
 
-  std::unique_ptr<Literal> literal1 = LiteralUtil::CreateR1<float>({10, 20});
+  std::unique_ptr<Literal> literal1 = Literal::CreateR1<float>({10, 20});
   std::unique_ptr<GlobalData> param1_data =
       client_->TransferToServer(*literal1).ConsumeValueOrDie();
   auto param1 = builder.Parameter(1, literal1->shape(), "param1");
@@ -193,12 +191,11 @@ XLA_TEST_F(ParamsTest, UnusedParametersInUnusedExpression) {
   // unused expression.
   ComputationBuilder builder(client_, TestName());
 
-  std::unique_ptr<Literal> literal0 = LiteralUtil::CreateR1<float>({1, 2});
+  std::unique_ptr<Literal> literal0 = Literal::CreateR1<float>({1, 2});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*literal0).ConsumeValueOrDie();
 
-  std::unique_ptr<Literal> literal1 =
-      LiteralUtil::CreateR1<float>({10, 20, 30});
+  std::unique_ptr<Literal> literal1 = Literal::CreateR1<float>({10, 20, 30});
   std::unique_ptr<GlobalData> param1_data =
       client_->TransferToServer(*literal1).ConsumeValueOrDie();
 
@@ -238,7 +235,7 @@ XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) {
 
     std::vector<float> sum_value = {{entry0, entry1}};
     sum_value.resize(size);
-    std::unique_ptr<Literal> literal = LiteralUtil::CreateR1<float>(sum_value);
+    std::unique_ptr<Literal> literal = Literal::CreateR1<float>(sum_value);
     param_data_owner.push_back(
         client_->TransferToServer(*literal).ConsumeValueOrDie());
     ComputationDataHandle param =
@@ -268,9 +265,9 @@ XLA_TEST_F(ParamsTest,
 
   std::unique_ptr<GlobalData> data =
       client_
-          ->TransferToServer(*LiteralUtil::MakeTuple({
-              LiteralUtil::CreateR1<float>({1, 2, 3}).get(),
-              LiteralUtil::CreateR1<float>({4, 5, 6}).get(),
+          ->TransferToServer(*Literal::MakeTuple({
+              Literal::CreateR1<float>({1, 2, 3}).get(),
+              Literal::CreateR1<float>({4, 5, 6}).get(),
           }))
           .ConsumeValueOrDie();
 
@@ -282,7 +279,7 @@ XLA_TEST_F(ParamsTest,
 // Verifies that passing a 2x2 with {0, 1} layout returns the same value back
 // when (transferred to the server and) passed through a parameter.
 XLA_TEST_F(ParamsTest, R2_2x2_Layout_01) {
-  std::unique_ptr<Literal> literal = LiteralUtil::CreateR2<float>({
+  std::unique_ptr<Literal> literal = Literal::CreateR2<float>({
       {1, 2}, {3, 4},
   });
   *literal->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
@@ -296,7 +293,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_Layout_01) {
 
 // As above, but for {1, 0} layout.
 XLA_TEST_F(ParamsTest, R2_2x2_Layout_10) {
-  std::unique_ptr<Literal> literal = LiteralUtil::CreateR2<float>({
+  std::unique_ptr<Literal> literal = Literal::CreateR2<float>({
       {1, 3}, {2, 4},
   });
   *literal->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({1, 0});
@@ -309,7 +306,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_Layout_10) {
 }
 
 XLA_TEST_F(ParamsTest, R2_2x2_TryToPassReverseLayoutToParameter) {
-  std::unique_ptr<Literal> literal = LiteralUtil::CreateR2<float>({
+  std::unique_ptr<Literal> literal = Literal::CreateR2<float>({
       {1, 3}, {2, 4},
   });
   const Shape original = literal->shape();
@@ -322,7 +319,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_TryToPassReverseLayoutToParameter) {
     std::reverse(original_layout.begin(), original_layout.end());
     *literal->mutable_shape()->mutable_layout() =
         LayoutUtil::MakeLayout(original_layout);
-    ASSERT_EQ(2, LiteralUtil::Get<float>(*literal, {0, 1}));
+    ASSERT_EQ(2, literal->Get<float>({0, 1}));
   }
   // Use the original shape in building the computation.
   ComputationBuilder builder(client_, TestName());
@@ -344,7 +341,6 @@ XLA_TEST_F(ParamsTest, R2_2x2_TryToPassReverseLayoutToParameter) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/tools/ci_build/builds/tensorboard.sh b/tensorflow/compiler/xla/tests/plugin.bzl
old mode 100755
new mode 100644
similarity index 60%
rename from tensorflow/tools/ci_build/builds/tensorboard.sh
rename to tensorflow/compiler/xla/tests/plugin.bzl
index 77bd29c09f8a1009708ed2bd95987df954fd4a77..1b10c778ce3587d9b3f345a92abbb4da92bcad9b
--- a/tensorflow/tools/ci_build/builds/tensorboard.sh
+++ b/tensorflow/compiler/xla/tests/plugin.bzl
@@ -1,5 +1,4 @@
-#!/usr/bin/env bash
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,17 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""Additional XLA devices to be included in the unit test suite."""
 
-set -e
-
-export LAUNCHPAD_CHROME=${LAUNCHPAD_CHROME:-$(which chromium-browser)}
-
-cd tensorflow/tensorboard
-
-# Install all js dependencies (tooling via npm, frontend assets via bower)
-npm run prepare
+# Example:
+#
+# plugins = {
+#   "foo": {
+#     "deps": [
+#       "//tensorflow/compiler/plugin/foo:foo_lib",
+#       "//tensorflow/compiler/plugin/foo:test_macros",
+#     ],
+#     "copts": [],
+#     "tags": [],
+#     "args": []
+#   },
+# }
 
-npm run compile
+plugins = {}
 
-# Run wct in headless chrome using xvfb
-xvfb-run ./node_modules/web-component-tester/bin/wct --skip-plugin=sauce
diff --git a/tensorflow/compiler/xla/tests/pred_test.cc b/tensorflow/compiler/xla/tests/pred_test.cc
index b031725d8abd897c83e40a3514bcccb7d7d76acf..d865297ae612f614f45aa6b4b226e15ee154ed2f 100644
--- a/tensorflow/compiler/xla/tests/pred_test.cc
+++ b/tensorflow/compiler/xla/tests/pred_test.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -142,7 +141,6 @@ TEST_F(PredTest, AnyR2False) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc
index 5117478bfd55093a82a5fa361feb5cf59fd68fd1..0a2d4c763d204478683520f339574ca7738d8650 100644
--- a/tensorflow/compiler/xla/tests/prng_test.cc
+++ b/tensorflow/compiler/xla/tests/prng_test.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
@@ -58,11 +57,10 @@ void PrngTest::UniformTest(T a, T b, tensorflow::gtl::ArraySlice<int64> dims) {
   SetSeed(42);
   auto actual = ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
   EXPECT_THAT(dims, ::testing::ElementsAreArray(actual->shape().dimensions()));
-  LiteralUtil::EachCell<T>(*actual,
-                           [=](tensorflow::gtl::ArraySlice<int64>, T value) {
-                             EXPECT_LE(a, value);
-                             EXPECT_LT(value, b);
-                           });
+  actual->EachCell<T>([=](tensorflow::gtl::ArraySlice<int64>, T value) {
+    EXPECT_LE(a, value);
+    EXPECT_LT(value, b);
+  });
 }
 
 void PrngTest::BernoulliTest(float p, tensorflow::gtl::ArraySlice<int64> dims) {
@@ -70,17 +68,16 @@ void PrngTest::BernoulliTest(float p, tensorflow::gtl::ArraySlice<int64> dims) {
   auto shape = ShapeUtil::MakeShape(U32, dims);
   builder.RngBernoulli(builder.ConstantR0<float>(p), shape);
 
-  TF_ASSIGN_OR_ASSERT_OK(auto computation, builder.Build());
-  ExecutionOptions execution_options;
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build());
+  ExecutionOptions execution_options = execution_options_;
   execution_options.set_seed(42);
-  TF_ASSIGN_OR_ASSERT_OK(
-      auto actual,
-      client_->ExecuteAndTransfer(computation, /*arguments=*/{},
-                                  &execution_options));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto actual, client_->ExecuteAndTransfer(computation, /*arguments=*/{},
+                                               &execution_options));
   EXPECT_THAT(dims, ::testing::ElementsAreArray(actual->shape().dimensions()));
   int32 sum = 0;
-  LiteralUtil::EachCell<uint32>(
-      *actual, [&sum](tensorflow::gtl::ArraySlice<int64>, uint32 value) {
+  actual->EachCell<uint32>(
+      [&sum](tensorflow::gtl::ArraySlice<int64>, uint32 value) {
         EXPECT_TRUE(value == 0 || value == 1);
         sum += value;
       });
@@ -124,10 +121,8 @@ double PrngTest::UniformChiSquared(int32 range_size, int32 expected_count) {
   SetSeed(42);
   auto actual = ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
   std::vector<int32> counts(range_size, 0);
-  LiteralUtil::EachCell<int32>(
-      *actual, [&counts](tensorflow::gtl::ArraySlice<int64>, int32 value) {
-        ++counts[value];
-      });
+  actual->EachCell<int32>([&counts](tensorflow::gtl::ArraySlice<int64>,
+                                    int32 value) { ++counts[value]; });
   int64 sum = 0;
   for (int32 i = 0; i < range_size; ++i) {
     sum += Square(static_cast<int64>(counts[i] - expected_count));
@@ -170,23 +165,22 @@ XLA_TEST_F(PrngTest, MapUsingRng) {
 
   ComputationBuilder builder(client_, TestName());
   std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR1<float>({2.2f, 5.3f, 4.4f, 5.5f});
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<GlobalData> param0_data,
-                         client_->TransferToServer(*param0_literal));
+      Literal::CreateR1<float>({2.2f, 5.3f, 4.4f, 5.5f});
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> param0_data,
+                          client_->TransferToServer(*param0_literal));
 
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto fn = build_sum_rng(builder);
   builder.Map({param0}, fn);
 
-  TF_ASSIGN_OR_ASSERT_OK(auto computation, builder.Build());
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build());
 
-  ExecutionOptions execution_options;
+  ExecutionOptions execution_options = execution_options_;
   execution_options.set_seed(125);
-  TF_ASSIGN_OR_ASSERT_OK(
-      auto actual,
-      client_->ExecuteAndTransfer(computation,
-                                  /*arguments=*/{param0_data.get()},
-                                  &execution_options));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto actual, client_->ExecuteAndTransfer(
+                       computation,
+                       /*arguments=*/{param0_data.get()}, &execution_options));
 
   EXPECT_EQ(actual->f32s_size(), param0_literal->f32s_size());
   for (int i = 0; i < param0_literal->f32s_size(); ++i) {
@@ -209,47 +203,45 @@ XLA_TEST_F(PrngTest, PassInGlobalRngSeed) {
     return builder.Build();
   };
 
-  ExecutionOptions execution_options1;
+  ExecutionOptions execution_options1 = execution_options_;
   execution_options1.set_seed(42);
 
-  ExecutionOptions execution_options2;
+  ExecutionOptions execution_options2 = execution_options_;
   execution_options2.set_seed(65);
 
   std::unique_ptr<Literal> result1;
   {
-    TF_ASSIGN_OR_ASSERT_OK(auto computation, build_computation());
-    TF_ASSIGN_OR_ASSERT_OK(
-        result1,
-        client_->ExecuteAndTransfer(computation, /*arguments=*/{},
-                                    &execution_options1));
+    TF_ASSERT_OK_AND_ASSIGN(auto computation, build_computation());
+    TF_ASSERT_OK_AND_ASSIGN(
+        result1, client_->ExecuteAndTransfer(computation, /*arguments=*/{},
+                                             &execution_options1));
   }
   std::unique_ptr<Literal> result2;
   std::unique_ptr<Literal> result3;
   {
-    TF_ASSIGN_OR_ASSERT_OK(auto computation, build_computation());
-    TF_ASSIGN_OR_ASSERT_OK(
-        result2,
-        client_->ExecuteAndTransfer(computation, /*arguments=*/{},
-                                    &execution_options1));
-    TF_ASSIGN_OR_ASSERT_OK(
-        result3,
-        client_->ExecuteAndTransfer(computation, /*arguments=*/{},
-                                    &execution_options1));
+    TF_ASSERT_OK_AND_ASSIGN(auto computation, build_computation());
+    TF_ASSERT_OK_AND_ASSIGN(
+        result2, client_->ExecuteAndTransfer(computation, /*arguments=*/{},
+                                             &execution_options1));
+    TF_ASSERT_OK_AND_ASSIGN(
+        result3, client_->ExecuteAndTransfer(computation, /*arguments=*/{},
+                                             &execution_options1));
   }
 
   std::unique_ptr<Literal> result4;
   std::unique_ptr<Literal> result5;
   std::unique_ptr<Literal> result6;
   {
-    TF_ASSIGN_OR_ASSERT_OK(auto computation, build_computation());
-    TF_ASSIGN_OR_ASSERT_OK(
-        result4,
-        client_->ExecuteAndTransfer(computation, /*arguments=*/{},
-                                    &execution_options2));
-    TF_ASSIGN_OR_ASSERT_OK(
-        result5, client_->ExecuteAndTransfer(computation, /*arguments=*/{}));
-    TF_ASSIGN_OR_ASSERT_OK(
-        result6, client_->ExecuteAndTransfer(computation, /*arguments=*/{}));
+    TF_ASSERT_OK_AND_ASSIGN(auto computation, build_computation());
+    TF_ASSERT_OK_AND_ASSIGN(
+        result4, client_->ExecuteAndTransfer(computation, /*arguments=*/{},
+                                             &execution_options2));
+    TF_ASSERT_OK_AND_ASSIGN(
+        result5, client_->ExecuteAndTransfer(computation, /*arguments=*/{},
+                                             &execution_options_));
+    TF_ASSERT_OK_AND_ASSIGN(
+        result6, client_->ExecuteAndTransfer(computation, /*arguments=*/{},
+                                             &execution_options_));
   }
 
   LiteralTestUtil::ExpectEqual(*result1, *result2);
@@ -273,13 +265,23 @@ XLA_TEST_F(PrngTest, TenValuesN01) {
   // TODO(b/25995601): Test that resultant values are reasonable
 }
 
+XLA_TEST_F(PrngTest, RngUniformCrash) {
+  ComputationBuilder builder(client_, TestName());
+
+  // This used to crash XLA during LLVM IR generation for CPUs.
+  auto rng_uniform = builder.RngUniform(builder.ConstantR0<int32>(0),
+                                        builder.ConstantR0<int32>(1000 * 1000),
+                                        ShapeUtil::MakeShape(S32, {}));
+  SetSeed(0);
+  ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
+}
+
 }  // namespace
 }  // namespace xla
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
index 4a02567a1a2ea8014cceca085c3d3d8589d6500f..0078733e197685fea575e78b8435485ea9de4926 100644
--- a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
+++ b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -46,7 +45,6 @@ TEST_F(QueryInferredShapeTest, OnePlusOneShape) {
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..527205bbb0d8d6069ec1450a3cade1663b85616e
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
@@ -0,0 +1,344 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cmath>
+#include <limits>
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/casts.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+// Tests to confirm that the ReducePrecision operation produces the expected
+// numerical values.
+class ReducePrecisionAccuracyTest : public ClientLibraryTestBase,
+                                    public ::testing::WithParamInterface<int> {
+};
+
+// For reduction to IEEE-f16, we want to test the following cases, in both
+// positive and negative variants.  (Note: IEEE-f16 is 5 exponent bits and 10
+// mantissa bits.)
+//
+// Vectors of exponent and mantissa sizes to test.  We want to test IEEE-f32 (a
+// no-op), IEEE-f16, and exponent-reduction-only and mantissa-reduction-only
+// variants of IEEE-f16.
+static const int exponent_sizes[] = {8, 5, 5, 8};
+static const int mantissa_sizes[] = {23, 10, 23, 10};
+
+string TestDataToString(const ::testing::TestParamInfo<int> data) {
+  int i = data.param;
+  return tensorflow::strings::StrCat(exponent_sizes[i], "_exponent_bits_",
+                                     mantissa_sizes[i], "_mantissa_bits");
+}
+
+// The FPVAL macro allows us to write out the binary representation of the
+// input and expected values in a more readable manner.  The mantissa bits
+// are separated into the "high" bits (retained with reduction to IEEE-f16)
+// and the "low" bits (truncated with reduction to IEEE-f16).
+#define FPVAL(EXPONENT, HIGH_MANTISSA, LOW_MANTISSA) \
+  ((0b##EXPONENT << 23) + (0b##HIGH_MANTISSA << 13) + (0b##LOW_MANTISSA))
+
+// Each element in the test-value array consists of four numbers.  The first is
+// the input value and the following are the expected output values for the
+// various precision-reduction cases.
+static const uint32_t test_values[][4] = {
+    // True zero.
+    {
+        FPVAL(00000000, 0000000000, 0000000000000),  // 0.0
+        FPVAL(00000000, 0000000000, 0000000000000),  // 0.0
+        FPVAL(00000000, 0000000000, 0000000000000),  // 0.0
+        FPVAL(00000000, 0000000000, 0000000000000)   // 0.0
+    },
+    // Largest exponent that underflows to zero.
+    {
+        FPVAL(01110000, 0000000000, 0000000000000),  // 3.05176e-05
+        FPVAL(00000000, 0000000000, 0000000000000),  // 0.0
+        FPVAL(00000000, 0000000000, 0000000000000),  // 0.0
+        FPVAL(01110000, 0000000000, 0000000000000)   // 3.05176e-05
+    },
+    // Largest value that rounds to a denormal and thus clamps to zero.
+    {
+        FPVAL(01110000, 1111111111, 0111111111111),  // 6.10203e-05
+        FPVAL(00000000, 0000000000, 0000000000000),  // 0.0
+        FPVAL(00000000, 0000000000, 0000000000000),  // 0.0
+        FPVAL(01110000, 1111111111, 0000000000000)   // 6.10054e-05
+    },
+    // Smallest value that doesn't underflow to zero, due to mantissa rounding
+    // up and incrementing the exponent out of the denormal range.
+    {
+        FPVAL(01110000, 1111111111, 1000000000000),  // 6.10203e-05
+        FPVAL(01110001, 0000000000, 0000000000000),  // 6.10352e-05
+        FPVAL(00000000, 0000000000, 0000000000000),  // 0.0
+        FPVAL(01110001, 0000000000, 0000000000000)   // 6.10352e-05
+    },
+    // Smallest value that doesn't underflow to zero even without mantissa
+    // rounding.
+    {
+        FPVAL(01110001, 0000000000, 0000000000000),  // 6.10352e-05
+        FPVAL(01110001, 0000000000, 0000000000000),  // 6.10352e-05
+        FPVAL(01110001, 0000000000, 0000000000000),  // 6.10352e-05
+        FPVAL(01110001, 0000000000, 0000000000000)   // 6.10352e-05
+    },
+    // One (to make sure bias-handling is done correctly.
+    {
+        FPVAL(01111111, 0000000000, 0000000000000),  // 1.0
+        FPVAL(01111111, 0000000000, 0000000000000),  // 1.0
+        FPVAL(01111111, 0000000000, 0000000000000),  // 1.0
+        FPVAL(01111111, 0000000000, 0000000000000)   // 1.0
+    },
+    // Values in a space where ties round down due to ties-to-even:
+    //   Value with highest mantissa that rounds down.
+    {
+        FPVAL(01111111, 0000000000, 1000000000000),  // 1.00049
+        FPVAL(01111111, 0000000000, 0000000000000),  // 1.0
+        FPVAL(01111111, 0000000000, 1000000000000),  // 1.00049
+        FPVAL(01111111, 0000000000, 0000000000000)   // 1.0
+    },
+    //   Value with lowest mantissa that rounds up.
+    {
+        FPVAL(01111111, 0000000000, 1000000000001),  // 1.00049
+        FPVAL(01111111, 0000000001, 0000000000000),  // 1.00098
+        FPVAL(01111111, 0000000000, 1000000000001),  // 1.00049
+        FPVAL(01111111, 0000000001, 0000000000000)   // 1.00098
+    },
+    // Values in a space where ties round up due to ties-to-even:
+    //   Value with highest mantissa that rounds down.
+    {
+        FPVAL(01111111, 0000000001, 0111111111111),  // 1.00146
+        FPVAL(01111111, 0000000001, 0000000000000),  // 1.00098
+        FPVAL(01111111, 0000000001, 0111111111111),  // 1.00146
+        FPVAL(01111111, 0000000001, 0000000000000)   // 1.00098
+    },
+    //   Value with a mantissa that rounds up.
+    {
+        FPVAL(01111111, 0000000001, 1000000000000),  // 1.00146
+        FPVAL(01111111, 0000000010, 0000000000000),  // 1.00195
+        FPVAL(01111111, 0000000001, 1000000000000),  // 1.00146
+        FPVAL(01111111, 0000000010, 0000000000000)   // 1.00195
+    },
+    // Largest value that does not overflow to infinity.
+    {
+        FPVAL(10001110, 1111111111, 0111111111111),  // 65520.0
+        FPVAL(10001110, 1111111111, 0000000000000),  // 65504.0
+        FPVAL(10001110, 1111111111, 0111111111111),  // 65520.0
+        FPVAL(10001110, 1111111111, 0000000000000)   // 65504.0
+    },
+    // Smallest value that overflows to infinity due to mantissa rounding up.
+    {
+        FPVAL(10001110, 1111111111, 1000000000000),  // 65520.0
+        FPVAL(11111111, 0000000000, 0000000000000),  // Inf
+        FPVAL(10001110, 1111111111, 1000000000000),  // 65520.0
+        FPVAL(10001111, 0000000000, 0000000000000)   // 65536.0
+    },
+    // Smallest value that overflows to infinity, without mantissa rounding.
+    {
+        FPVAL(10001111, 0000000000, 0000000000000),  // 65536.0
+        FPVAL(11111111, 0000000000, 0000000000000),  // Inf
+        FPVAL(11111111, 0000000000, 0000000000000),  // Inf
+        FPVAL(10001111, 0000000000, 0000000000000)   // 65536.0
+    },
+    // Smallest value that overflows to infinity due to mantissa rounding up,
+    // even when exponent bits aren't reduced.
+    {
+        FPVAL(11111110, 1111111111, 1000000000000),  // 3.40199e+38
+        FPVAL(11111111, 0000000000, 0000000000000),  // Inf
+        FPVAL(11111111, 0000000000, 0000000000000),  // Inf
+        FPVAL(11111111, 0000000000, 0000000000000)   // Inf
+    },
+    // True infinity.
+    {
+        FPVAL(11111111, 0000000000, 0000000000000),  // Inf
+        FPVAL(11111111, 0000000000, 0000000000000),  // Inf
+        FPVAL(11111111, 0000000000, 0000000000000),  // Inf
+        FPVAL(11111111, 0000000000, 0000000000000)   // Inf
+    },
+    // NAN with a 1 in the preserved bits.
+    {
+        FPVAL(11111111, 1000000000, 0000000000000),  // NaN
+        FPVAL(11111111, 1000000000, 0000000000000),  // NaN
+        FPVAL(11111111, 1000000000, 0000000000000),  // NaN
+        FPVAL(11111111, 1000000000, 0000000000000)   // NaN
+    },
+    // NAN with a 1 in the truncated bits.
+    {
+        FPVAL(11111111, 0000000000, 0000000000001),  // NaN
+        FPVAL(11111111, 0000000000, 0000000000001),  // NaN
+        FPVAL(11111111, 0000000000, 0000000000001),  // NaN
+        FPVAL(11111111, 0000000000, 0000000000001)   // NaN
+    },
+    // NAN with all ones, causing rounding overflow.
+    {
+        FPVAL(11111111, 1111111111, 1111111111111),  // NaN
+        FPVAL(11111111, 1111111111, 1111111111111),  // NaN
+        FPVAL(11111111, 1111111111, 1111111111111),  // NaN
+        FPVAL(11111111, 1111111111, 1111111111111)   // NaN
+    }};
+
+XLA_TEST_P(ReducePrecisionAccuracyTest, ReducePrecisionF32) {
+  int index = GetParam();
+  int exponent_bits = exponent_sizes[index];
+  int mantissa_bits = mantissa_sizes[index];
+
+  std::vector<float> input_values;
+  std::vector<float> expected_values;
+
+  const uint32_t sign_bit = 1u << 31;
+  for (const auto& test_value : test_values) {
+    // Add positive values.
+    input_values.push_back(tensorflow::bit_cast<float>(test_value[0]));
+    expected_values.push_back(tensorflow::bit_cast<float>(test_value[index]));
+    // Add negative values.  We do this in the bitwise representation so as to
+    // avoid problems with NaN handling.
+    input_values.push_back(
+        tensorflow::bit_cast<float>(test_value[0] ^ sign_bit));
+    expected_values.push_back(
+        tensorflow::bit_cast<float>(test_value[index] ^ sign_bit));
+  }
+
+  // This is required for proper handling of NaN values.
+  SetFastMathDisabled(true);
+
+  ComputationBuilder builder(client_, TestName());
+
+  std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({input_values});
+  std::unique_ptr<GlobalData> a_data =
+      client_->TransferToServer(*a_literal).ConsumeValueOrDie();
+  auto a = builder.Parameter(0, a_literal->shape(), "a");
+
+  auto reduce_precision =
+      builder.ReducePrecision(a, exponent_bits, mantissa_bits);
+
+  ComputeAndCompareR1<float>(&builder, expected_values, {a_data.get()});
+}
+
+INSTANTIATE_TEST_CASE_P(ReducePrecisionAccuracyTest,
+                        ReducePrecisionAccuracyTest,
+                        ::testing::Values(0, 1, 2, 3), TestDataToString);
+
+// Tests to confirm that the compiler optimization functions add the expected
+// ReducePrecisionInsertion passes.
+class ReducePrecisionInsertionTest : public ClientLibraryTestBase {};
+
+XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionBeforeFusion) {
+  ComputationBuilder builder(client_, TestName());
+
+  std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
+  std::unique_ptr<GlobalData> a_data =
+      client_->TransferToServer(*a_literal).ConsumeValueOrDie();
+  auto a = builder.Parameter(0, a_literal->shape(), "a");
+
+  // Abs doesn't affect resolution.
+  auto abs = builder.Abs(a);
+
+  // Near 1.0, Log(x) approximates x - 1; this lets us confirm that the
+  // reduce-precision operation showed up in the correct place in the
+  // graph.
+  auto log = builder.Log(abs);
+
+  // Insert precision-reduction after the Abs(x) operation, rounding that
+  // result to exactly 1.0f.
+  auto reduce_precision_pass = execution_options_.mutable_debug_options()
+                                   ->add_hlo_reduce_precision_options();
+  *reduce_precision_pass = ReducePrecisionInsertion::make_options_proto(
+      HloReducePrecisionOptions::BEFORE_OP_FUSION, 5, 10,
+      [](const HloOpcode opcode) { return opcode == HloOpcode::kAbs; });
+
+  ComputeAndCompareR1<float>(&builder, {0.0f}, {a_data.get()});
+}
+
+XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedAfterFusion) {
+  ComputationBuilder builder(client_, TestName());
+
+  std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
+  std::unique_ptr<GlobalData> a_data =
+      client_->TransferToServer(*a_literal).ConsumeValueOrDie();
+  auto a = builder.Parameter(0, a_literal->shape(), "a");
+
+  // These two operations should be fused by any reasonable backend.
+  auto abs = builder.Abs(a);
+  auto neg = builder.Neg(abs);
+
+  // Add a pass after operation fusion, suffixing kAbs operations.  This
+  // should not see into the fusion nodes and thus should not affect the
+  // result.
+  auto reduce_precision_pass = execution_options_.mutable_debug_options()
+                                   ->add_hlo_reduce_precision_options();
+  *reduce_precision_pass = ReducePrecisionInsertion::make_options_proto(
+      HloReducePrecisionOptions::AFTER_OP_FUSION, 5, 10,
+      [](const HloOpcode opcode) { return opcode == HloOpcode::kAbs; });
+
+  ComputeAndCompareR1<float>(&builder, {-1.00001f}, {a_data.get()});
+}
+
+XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedAfterFusion) {
+  ComputationBuilder builder(client_, TestName());
+
+  std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
+  std::unique_ptr<GlobalData> a_data =
+      client_->TransferToServer(*a_literal).ConsumeValueOrDie();
+  auto a = builder.Parameter(0, a_literal->shape(), "a");
+
+  // These two operations should be fused by any reasonable backend.
+  auto abs = builder.Abs(a);
+  auto neg = builder.Neg(abs);
+
+  // Add a pass after operation fusion, suffixing kFusion operations.
+  auto reduce_precision_pass = execution_options_.mutable_debug_options()
+                                   ->add_hlo_reduce_precision_options();
+  *reduce_precision_pass = ReducePrecisionInsertion::make_options_proto(
+      HloReducePrecisionOptions::AFTER_OP_FUSION, 5, 10,
+      [](const HloOpcode opcode) { return opcode == HloOpcode::kFusion; });
+
+  ComputeAndCompareR1<float>(&builder, {-1.0f}, {a_data.get()});
+}
+
+}  // namespace
+}  // namespace xla
+
+int main(int argc, char** argv) {
+  std::vector<tensorflow::Flag> flag_list;
+  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
+  xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    LOG(ERROR) << "\n" << usage;
+    return 2;
+  }
+  testing::InitGoogleTest(&argc, argv);
+  if (argc > 1) {
+    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
+    return 2;
+  }
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index ff24177520eab5c6c2061d01223530249050448c..b22866fc84bec6e9e802f18fdea4c17c6f92e40f 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -40,7 +40,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -64,12 +63,12 @@ class ReduceTest : public ClientLibraryTestBase {
   ReduceTest() {
     // Implementation note: laid out z >> y >> x by default.
     // clang-format off
-    literal_2d_ = LiteralUtil::CreateR2<float>({
+    literal_2d_ = Literal::CreateR2<float>({
       // x0   x1   x2
       { 1.f, 2.f, 3.f},  // y0
       { 4.f, 5.f, 6.f},  // y1
     });
-    literal_3d_ = LiteralUtil::CreateR3Projected<float>({
+    literal_3d_ = Literal::CreateR3Projected<float>({
       // x0   x1   x2
       { 1.f, 2.f, 3.f},  // y0
       { 4.f, 5.f, 6.f},  // y1
@@ -98,7 +97,7 @@ class ReduceTest : public ClientLibraryTestBase {
       }
     }
     std::unique_ptr<Literal> input_literal =
-        LiteralUtil::CreateR1(AsSlice(input_data));
+        Literal::CreateR1(AsSlice(input_data));
     std::unique_ptr<GlobalData> input_global_data =
         client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
@@ -130,7 +129,7 @@ class ReduceTest : public ClientLibraryTestBase {
     builder.Reduce(pred_values, init_value, reduce,
                    /*dimensions_to_reduce=*/{0});
 
-    std::unique_ptr<Literal> input_literal = LiteralUtil::CreateR1(input_data);
+    std::unique_ptr<Literal> input_literal = Literal::CreateR1(input_data);
     std::unique_ptr<GlobalData> input_global_data =
         client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
@@ -157,9 +156,9 @@ class ReduceTest : public ClientLibraryTestBase {
     Array2D<float> input_data(rows, cols);
     input_data.FillRandom(3.14f, 0.04);
     std::unique_ptr<Literal> input_literal =
-        LiteralUtil::CreateR2FromArray2D(input_data);
-    input_literal = LiteralUtil::Relayout(
-        *input_literal, LayoutUtil::MakeLayout({minor, major}));
+        Literal::CreateR2FromArray2D(input_data);
+    input_literal =
+        input_literal->Relayout(LayoutUtil::MakeLayout({minor, major}));
     std::unique_ptr<GlobalData> input_global_data =
         client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
@@ -185,9 +184,9 @@ class ReduceTest : public ClientLibraryTestBase {
     Array2D<float> input_data(rows, cols);
     input_data.FillRandom(3.14f, 0.04);
     std::unique_ptr<Literal> input_literal =
-        LiteralUtil::CreateR2FromArray2D(input_data);
-    input_literal = LiteralUtil::Relayout(
-        *input_literal, LayoutUtil::MakeLayout({minor, major}));
+        Literal::CreateR2FromArray2D(input_data);
+    input_literal =
+        input_literal->Relayout(LayoutUtil::MakeLayout({minor, major}));
     std::unique_ptr<GlobalData> input_global_data =
         client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
@@ -203,6 +202,102 @@ class ReduceTest : public ClientLibraryTestBase {
                                ErrorSpec(0.01, 1e-4));
   }
 
+  template <typename NativeT>
+  void ComputeAndCompareGeneric(
+      typename std::enable_if<std::is_floating_point<NativeT>::value,
+                              ComputationBuilder>::type* builder,
+      tensorflow::gtl::ArraySlice<NativeT> expected,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
+    ComputeAndCompareR1<NativeT>(builder, expected, arguments,
+                                 ErrorSpec(0.01, 1e-4));
+  }
+
+  template <typename NativeT>
+  void ComputeAndCompareGeneric(
+      typename std::enable_if<std::is_integral<NativeT>::value,
+                              ComputationBuilder>::type* builder,
+      tensorflow::gtl::ArraySlice<NativeT> expected,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
+    ComputeAndCompareR1<NativeT>(builder, expected, arguments);
+  }
+
+  template <typename NativeT>
+  void RunVectorizedReduceTestForType(
+      const std::function<Computation(ComputationBuilder*)>&
+          reduction_function_generator,
+      const std::function<NativeT(NativeT, NativeT)>&
+          reference_reduction_function,
+      const NativeT& initial_value) {
+    const int rows = 64, cols = 128;
+    const int minor = 1, major = 0;
+    ComputationBuilder builder(client_, TestName());
+    Computation reduction_function = reduction_function_generator(&builder);
+    const Shape input_shape = ShapeUtil::MakeShape(
+        xla::primitive_util::NativeToPrimitiveType<NativeT>(), {rows, cols});
+    auto input = builder.Parameter(0, input_shape, "input");
+    auto zero = builder.ConstantR0<NativeT>(initial_value);
+    builder.Reduce(input, zero, reduction_function,
+                   /*dimensions_to_reduce=*/{0});
+
+    Array2D<NativeT> input_data(rows, cols);
+    input_data.FillUnique(initial_value);
+    std::unique_ptr<Literal> input_literal =
+        Literal::CreateR2FromArray2D(input_data);
+    input_literal =
+        input_literal->Relayout(LayoutUtil::MakeLayout({minor, major}));
+    std::unique_ptr<GlobalData> input_global_data =
+        client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+
+    // NativeT can be bool, and std::vector<bool> does not convert to
+    // ArraySlice.
+    std::unique_ptr<NativeT[]> expected(new NativeT[cols]);
+    for (int64 colno = 0; colno < cols; ++colno) {
+      NativeT column_result = initial_value;
+      for (int64 rowno = 0; rowno < rows; ++rowno) {
+        column_result = reference_reduction_function(column_result,
+                                                     input_data(rowno, colno));
+      }
+      expected[colno] = column_result;
+    }
+
+    ComputeAndCompareGeneric<NativeT>(
+        &builder, tensorflow::gtl::ArraySlice<NativeT>(expected.get(), cols),
+        {input_global_data.get()});
+  }
+
+  void RunVectorizedReduceTest(
+      const std::function<Computation(PrimitiveType, ComputationBuilder*)>&
+          reduction_function_generator_for_type,
+      const std::function<float(float, float)>&
+          reference_reduction_function_for_floats,
+      const std::function<int32(int32, int32)>&
+          reference_reduction_function_for_ints,
+      const std::function<uint32(uint32, uint32)>&
+          reference_reduction_function_for_uints,
+      float floating_point_identity, int32 signed_int_identity,
+      uint32 unsigned_int_identity) {
+    // Float version
+    RunVectorizedReduceTestForType<float>(
+        [&](ComputationBuilder* builder) {
+          return reduction_function_generator_for_type(F32, builder);
+        },
+        reference_reduction_function_for_floats, floating_point_identity);
+
+    // Signed int version
+    RunVectorizedReduceTestForType<int32>(
+        [&](ComputationBuilder* builder) {
+          return reduction_function_generator_for_type(S32, builder);
+        },
+        reference_reduction_function_for_ints, signed_int_identity);
+
+    // Unsigned int version
+    RunVectorizedReduceTestForType<uint32>(
+        [&](ComputationBuilder* builder) {
+          return reduction_function_generator_for_type(U32, builder);
+        },
+        reference_reduction_function_for_uints, unsigned_int_identity);
+  }
+
   std::unique_ptr<Literal> literal_2d_;
   std::unique_ptr<Literal> literal_3d_;
   uint32 seed_ = 0xdeadbeef;
@@ -306,9 +401,8 @@ XLA_TEST_F(ReduceTest, ReduceElementwiseR2_111x50_To_R1) {
   Array2D<float> input_data(rows, cols);
   input_data.FillRandom(3.14f, 0.04);
   std::unique_ptr<Literal> input_literal =
-      LiteralUtil::CreateR2FromArray2D(input_data);
-  input_literal =
-      LiteralUtil::Relayout(*input_literal, LayoutUtil::MakeLayout({0, 1}));
+      Literal::CreateR2FromArray2D(input_data);
+  input_literal = input_literal->Relayout(LayoutUtil::MakeLayout({0, 1}));
   std::unique_ptr<GlobalData> input_global_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
@@ -339,9 +433,8 @@ XLA_TEST_F(ReduceTest, TransposeAndReduceElementwiseR2_111x50_To_R1) {
   Array2D<float> input_data(rows, cols);
   input_data.FillRandom(3.14f, 0.04);
   std::unique_ptr<Literal> input_literal =
-      LiteralUtil::CreateR2FromArray2D(input_data);
-  input_literal =
-      LiteralUtil::Relayout(*input_literal, LayoutUtil::MakeLayout({0, 1}));
+      Literal::CreateR2FromArray2D(input_data);
+  input_literal = input_literal->Relayout(LayoutUtil::MakeLayout({0, 1}));
   std::unique_ptr<GlobalData> input_global_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
@@ -372,7 +465,7 @@ XLA_TEST_F(ReduceTest, Reshape_111x2x25Reduce_111x50_To_R1) {
   Array3D<float> input_data(rows, 2, cols / 2);
   input_data.FillRandom(3.14f, 0.04);
   std::unique_ptr<Literal> input_literal =
-      LiteralUtil::CreateR3FromArray3D(input_data);
+      Literal::CreateR3FromArray3D(input_data);
   std::unique_ptr<GlobalData> input_global_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
@@ -435,7 +528,7 @@ XLA_TEST_F(ReduceTest, MaxReduce2DToR0) {
   auto max = CreateScalarMaxComputation(F32, &builder);
   Array2D<float> input(300, 250);
   input.FillRandom(214.0f);
-  auto input_literal = LiteralUtil::CreateR2FromArray2D(input);
+  auto input_literal = Literal::CreateR2FromArray2D(input);
   builder.Reduce(builder.ConstantLiteral(*input_literal),
                  builder.ConstantR0<float>(FLT_MIN), max, {0, 1});
   auto input_max = FLT_MIN;
@@ -450,7 +543,7 @@ XLA_TEST_F(ReduceTest, MinReduce2DToR0) {
   auto min = CreateScalarMinComputation(F32, &builder);
   Array2D<float> input(150, 130);
   input.FillRandom(214.0f);
-  auto input_literal = LiteralUtil::CreateR2FromArray2D(input);
+  auto input_literal = Literal::CreateR2FromArray2D(input);
   builder.Reduce(builder.ConstantLiteral(*input_literal),
                  builder.ConstantR0<float>(FLT_MAX), min, {0, 1});
 
@@ -460,6 +553,32 @@ XLA_TEST_F(ReduceTest, MinReduce2DToR0) {
   ComputeAndCompareR0<float>(&builder, input_min, {}, ErrorSpec(0.0001));
 }
 
+XLA_TEST_F(ReduceTest, UnsignedInt_MinReduce) {
+  ComputationBuilder builder(client_, TestName());
+  Array2D<uint32> input({{1}, {2}});
+  auto min = CreateScalarMinComputation(U32, &builder);
+  auto input_literal = Literal::CreateR2FromArray2D(input);
+  auto initial_value =
+      builder.ConstantR0<uint32>(std::numeric_limits<uint32>::max());
+
+  builder.Reduce(builder.ConstantLiteral(*input_literal), initial_value, min,
+                 {0, 1});
+  ComputeAndCompareR0<uint32>(&builder, 1, {});
+}
+
+XLA_TEST_F(ReduceTest, UnsignedInt_MaxReduce) {
+  ComputationBuilder builder(client_, TestName());
+  Array2D<uint32> input({{1}, {2}});
+  auto max = CreateScalarMaxComputation(U32, &builder);
+  auto input_literal = Literal::CreateR2FromArray2D(input);
+  auto initial_value =
+      builder.ConstantR0<uint32>(std::numeric_limits<uint32>::min());
+
+  builder.Reduce(builder.ConstantLiteral(*input_literal), initial_value, max,
+                 {0, 1});
+  ComputeAndCompareR0<uint32>(&builder, 2, {});
+}
+
 // Reduces a matrix among dimension 1.
 XLA_TEST_F(ReduceTest, Reduce2DAmong1) {
   ComputationBuilder builder(client_, TestName());
@@ -571,6 +690,58 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDim2) {
   ComputeAndCompareR2<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
+XLA_TEST_F(ReduceTest, VectorizedReduce_Add) {
+  RunVectorizedReduceTest(CreateScalarAddComputation,
+                          [](float a, float b) { return a + b; },
+                          [](int32 a, int32 b) {
+                            return static_cast<int32>(static_cast<uint32>(a) +
+                                                      static_cast<uint32>(b));
+                          },
+                          [](uint32 a, uint32 b) { return a + b; }, 0.0, 0, 0);
+}
+
+XLA_TEST_F(ReduceTest, VectorizedReduce_Multiply) {
+  RunVectorizedReduceTest(CreateScalarMultiplyComputation,
+                          [](float a, float b) { return a * b; },
+                          [](int32 a, int32 b) {
+                            return static_cast<int32>(static_cast<uint32>(a) *
+                                                      static_cast<uint32>(b));
+                          },
+                          [](uint32 a, uint32 b) { return a * b; }, 1.0, 1, 1);
+}
+
+XLA_TEST_F(ReduceTest, VectorizedReduce_Max) {
+  RunVectorizedReduceTest(CreateScalarMaxComputation,
+                          [](float a, float b) { return std::max(a, b); },
+                          [](int32 a, int32 b) { return std::max(a, b); },
+                          [](uint32 a, uint32 b) { return std::max(a, b); },
+                          std::numeric_limits<float>::min(),
+                          std::numeric_limits<int32>::min(),
+                          std::numeric_limits<uint32>::min());
+}
+
+XLA_TEST_F(ReduceTest, VectorizedReduce_Min) {
+  RunVectorizedReduceTest(CreateScalarMinComputation,
+                          [](float a, float b) { return std::min(a, b); },
+                          [](int32 a, int32 b) { return std::min(a, b); },
+                          [](uint32 a, uint32 b) { return std::min(a, b); },
+                          std::numeric_limits<float>::max(),
+                          std::numeric_limits<int32>::max(),
+                          std::numeric_limits<uint32>::max());
+}
+
+XLA_TEST_F(ReduceTest, VectorizedReduce_LogicalAnd) {
+  RunVectorizedReduceTestForType<bool>(CreateScalarLogicalAndComputation,
+                                       [](bool a, bool b) { return a && b; },
+                                       true);
+}
+
+XLA_TEST_F(ReduceTest, VectorizedReduce_LogicalOr) {
+  RunVectorizedReduceTestForType<bool>(CreateScalarLogicalOrComputation,
+                                       [](bool a, bool b) { return a || b; },
+                                       false);
+}
+
 class ReduceR3ToR2Test : public ReduceTest,
                          public ::testing::WithParamInterface<BoundsLayout> {};
 
@@ -580,9 +751,9 @@ XLA_TEST_P(ReduceR3ToR2Test, ReduceR3ToR2) {
   Array3D<float> input_array(bounds[0], bounds[1], bounds[2]);
   input_array.FillRandom(3.14f, 0.05);
 
-  auto input_literal = LiteralUtil::CreateR3FromArray3D(input_array);
-  input_literal = LiteralUtil::Relayout(
-      *input_literal, LayoutUtil::MakeLayout(GetParam().layout));
+  auto input_literal = Literal::CreateR3FromArray3D(input_array);
+  input_literal =
+      input_literal->Relayout(LayoutUtil::MakeLayout(GetParam().layout));
   std::unique_ptr<GlobalData> input_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
@@ -630,7 +801,6 @@ INSTANTIATE_TEST_CASE_P(
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index ec7b47bc283538d7d9219610e4297fee8028d07f..9774e409411cd9726c5955be62b166bf4dc3712d 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -58,7 +57,7 @@ class ReduceWindowTest : public ClientLibraryTestBase {
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
     builder_.ReduceWindow(
-        input, builder_.ConstantLiteral(LiteralUtil::MinValue(F32)),
+        input, builder_.ConstantLiteral(Literal::MinValue(F32)),
         CreateScalarMax(), window_dimensions, window_strides, padding);
   }
 
@@ -67,7 +66,7 @@ class ReduceWindowTest : public ClientLibraryTestBase {
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
     builder_.ReduceWindow(input,
-                          builder_.ConstantLiteral(LiteralUtil::MaxValue(F32)),
+                          builder_.ConstantLiteral(Literal::MaxValue(F32)),
                           CreateScalarMinComputation(F32, &builder_),
                           window_dimensions, window_strides, padding);
   }
@@ -75,6 +74,12 @@ class ReduceWindowTest : public ClientLibraryTestBase {
   ComputationBuilder builder_;
 };
 
+TEST_F(ReduceWindowTest, Min3In5Stride2) {
+  const auto input = builder_.ConstantR1<float>({10000, 1000, 100, 10, 1});
+  ReduceWindowMin(input, {3}, {2}, Padding::kValid);
+  ComputeAndCompareR1<float>(&builder_, {100, 1}, {}, ErrorSpec(0.0001));
+}
+
 XLA_TEST_F(ReduceWindowTest, ZeroElementSmall) {
   Array4D<float> input_array(1, 0, 2, 1);
 
@@ -132,6 +137,26 @@ TEST_F(ReduceWindowTest, Along2ndMinorDim) {
   ComputeAndCompareR4<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
 }
 
+TEST_F(ReduceWindowTest, AmongMajor2Dims) {
+  Array4D<float> input_array(4, 4, 6, 8);
+  input_array.FillWithMinorDimNum();
+
+  int win_len = 3;
+  int win_stride = 1;
+
+  Padding padding = Padding::kSame;
+  const auto input_data_handle =
+      builder_.ConstantR4FromArray4D<float>(input_array);
+  // Reduce only along the x and y dimensions, according to the win_len.
+  ReduceWindowAdd(input_data_handle, {win_len, win_len, 1, 1},
+                  {win_stride, win_stride, 1, 1}, padding);
+
+  auto result = ReferenceUtil::ReduceWindow4DAdd(
+      input_array, 0.0f, {win_len, win_len, 1, 1},
+      {win_stride, win_stride, 1, 1}, padding);
+  ComputeAndCompareR4<float>(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3));
+}
+
 TEST_F(ReduceWindowTest, AmongMajor2DimsMediumSize) {
   Array4D<float> input_array(9, 12, 4, 89);
   input_array.FillRandom(2.0f);
@@ -184,202 +209,6 @@ TEST_F(ReduceWindowTest, DISABLED_AmongMajor2DimsMediumSizeLargePadding) {
   ComputeAndCompareR4<float>(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3));
 }
 
-// TODO(b/31809540): Implement minor dim reduction to reduce num of reshapes.
-TEST_F(ReduceWindowTest, ReduceR4AmongXYMinorSmall) {
-  Array4D<float> input_array(2, 2, 4, 16);
-
-  Array2D<float> yx({{0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f,
-                      11.f, 12.f, 13.f, 14.f, 15.f},
-                     {16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f,
-                      25.f, 26.f, 27.f, 28.f, 29.f, 30.f, 31.f},
-                     {32.f, 33.f, 34.f, 35.f, 36.f, 37.f, 38.f, 39.f, 40.f,
-                      41.f, 42.f, 43.f, 44.f, 45.f, 46.f, 47.f},
-                     {48.f, 49.f, 50.f, 51.f, 52.f, 53.f, 54.f, 55.f, 56.f,
-                      57.f, 58.f, 59.f, 60.f, 61.f, 62.f, 63.f}});
-  input_array.FillWithYX(yx);
-
-  int win_len = 2;
-  int win_stride = 2;
-  const auto input = builder_.ConstantR4FromArray4D<float>(input_array);
-  Padding padding = Padding::kValid;
-  ReduceWindowAdd(input, {1, 1, win_len, win_len},
-                  {1, 1, win_stride, win_stride}, padding);
-
-  auto res = ReferenceUtil::ReduceWindow4DAdd(
-      input_array, 0.0f, {1, 1, win_len, win_len},
-      {1, 1, win_stride, win_stride}, padding);
-  ComputeAndCompareR4<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
-}
-
-// TODO(b/31809540): Implement minor dim reduction to reduce num of reshapes.
-TEST_F(ReduceWindowTest, ReduceR4AmongXYMinorSmallOverlapped) {
-  constexpr int64 p = 2;
-  constexpr int64 z = 2;
-  constexpr int64 y = 4;
-  constexpr int64 x = 16;
-  Array4D<float> input_array(p, z, y, x);
-
-  Array2D<float> yx({{0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f,
-                      11.f, 12.f, 13.f, 14.f, 15.f},
-                     {16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f,
-                      25.f, 26.f, 27.f, 28.f, 29.f, 30.f, 31.f},
-                     {32.f, 33.f, 34.f, 35.f, 36.f, 37.f, 38.f, 39.f, 40.f,
-                      41.f, 42.f, 43.f, 44.f, 45.f, 46.f, 47.f},
-                     {48.f, 49.f, 50.f, 51.f, 52.f, 53.f, 54.f, 55.f, 56.f,
-                      57.f, 58.f, 59.f, 60.f, 61.f, 62.f, 63.f}});
-  input_array.FillWithYX(yx);
-
-  int win_len = 4;
-  int win_stride = 2;
-  const auto input = builder_.ConstantR4FromArray4D<float>(input_array);
-  ReduceWindowAdd(input, {1, 1, win_len, win_len},
-                  {1, 1, win_stride, win_stride}, Padding::kValid);
-
-  // Expected result
-  Array2D<float> yx_result({{408.f, 440.f, 472.f, 504.f, 536.f, 568.f, 600.f}});
-  Array4D<float> expected(p, z, 1, 7);
-  expected.FillWithYX(yx_result);
-  ComputeAndCompareR4<float>(&builder_, expected, {}, ErrorSpec(1e-3, 1e-3));
-}
-
-TEST_F(ReduceWindowTest, MaxTrivial) {
-  const auto input = builder_.ConstantR1<float>({42});
-  ReduceWindowMax(input, {1}, {1}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {42}, {}, ErrorSpec(0.0001));
-}
-
-TEST_F(ReduceWindowTest, Add3In3) {
-  const auto input = builder_.ConstantR1<float>({20, 100, 3});
-  ReduceWindowAdd(input, {3}, {1}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {123}, {}, ErrorSpec(0.0001));
-}
-
-TEST_F(ReduceWindowTest, Add4In16Stride4) {
-  const auto input = builder_.ConstantR1<float>(
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
-  ReduceWindowAdd(input, {4}, {4}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {10, 26, 42, 58}, {},
-                             ErrorSpec(0.0001));
-}
-
-TEST_F(ReduceWindowTest, DISABLED_ON_CPU(DISABLED_ON_GPU(Min3In5Stride2))) {
-  const auto input = builder_.ConstantR1<float>({10000, 1000, 100, 10, 1});
-  ReduceWindowMin(input, {3}, {2}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {100, 1}, {}, ErrorSpec(0.0001));
-}
-
-TEST_F(ReduceWindowTest, Max3In3) {
-  const auto input = builder_.ConstantR1<float>({20, 100, 3});
-  ReduceWindowMax(input, {3}, {1}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {100}, {}, ErrorSpec(0.0001));
-}
-
-TEST_F(ReduceWindowTest, Add2In3) {
-  const auto input = builder_.ConstantR1<float>({100, 10, 1});
-  ReduceWindowAdd(input, {2}, {1}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {110, 11}, {}, ErrorSpec(0.0001));
-}
-
-TEST_F(ReduceWindowTest, Add3In5Stride2) {
-  const auto input = builder_.ConstantR1<float>({10000, 1000, 100, 10, 1});
-  ReduceWindowAdd(input, {3}, {2}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {11100, 111}, {}, ErrorSpec(0.0001));
-}
-
-TEST_F(ReduceWindowTest, Max4In16Stride4) {
-  const auto input = builder_.ConstantR1<float>(
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
-  ReduceWindowMax(input, {4}, {4}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {4, 8, 12, 16}, {}, ErrorSpec(0.0001));
-}
-
-TEST_F(ReduceWindowTest, Max4In16Stride3) {
-  const auto input = builder_.ConstantR1<float>(
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
-  ReduceWindowMax(input, {4}, {3}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {4, 7, 10, 13, 16}, {},
-                             ErrorSpec(0.0001));
-}
-
-TEST_F(ReduceWindowTest, Max4In16Stride8) {
-  const auto input = builder_.ConstantR1<float>(
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
-  ReduceWindowMax(input, {4}, {8}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {4, 12}, {}, ErrorSpec(0.0001));
-}
-
-TEST_F(ReduceWindowTest, Max3In5Stride2) {
-  const auto input = builder_.ConstantR1<float>({10000, 1000, 100, 10, 1});
-  ReduceWindowMax(input, {3}, {2}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {10000, 100}, {}, ErrorSpec(0.0001));
-}
-
-TEST_F(ReduceWindowTest, Max3In5Stride1) {
-  const auto input = builder_.ConstantR1<float>({10000, 1000, 100, 10, 101});
-  ReduceWindowMax(input, {3}, {1}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {10000, 1000, 101}, {},
-                             ErrorSpec(0.0001));
-}
-
-TEST_F(ReduceWindowTest, Add3In4Stride2) {
-  const auto input = builder_.ConstantR1<float>({1000, 100, 10, 1});
-  ReduceWindowAdd(input, {3}, {2}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {1110}, {}, ErrorSpec(0.0001));
-}
-
-XLA_TEST_F(ReduceWindowTest, Add2In3SamePad) {
-  const auto input = builder_.ConstantR1<float>({100, 10, 1});
-  ReduceWindowAdd(input, {2}, {1}, Padding::kSame);
-  ComputeAndCompareR1<float>(&builder_, {110, 11, 1}, {}, ErrorSpec(0.0001));
-}
-
-XLA_TEST_F(ReduceWindowTest, Add3In3SamePad) {
-  const auto input = builder_.ConstantR1<float>({100, 10, 1});
-  ReduceWindowAdd(input, {3}, {1}, Padding::kSame);
-  ComputeAndCompareR1<float>(&builder_, {110, 111, 11}, {}, ErrorSpec(0.0001));
-}
-
-XLA_TEST_F(ReduceWindowTest, Add3In3Stride3SamePad) {
-  const auto input = builder_.ConstantR1<float>({100, 10, 1});
-  ReduceWindowAdd(input, {3}, {2}, Padding::kSame);
-  ComputeAndCompareR1<float>(&builder_, {110, 11}, {}, ErrorSpec(0.0001));
-}
-
-TEST_F(ReduceWindowTest, Add2x2In2x2Overlapped) {
-  Array2D<float> input_array({{1.2f, -2.5f, 0.9f, 1.0f},
-                              {3.7f, 0.2f, -1.0f, -0.2f},
-                              {-0.4f, 2.7f, 1.1f, 2.2f},
-                              {0.6f, 1.7f, 1.4f, -0.2f}});
-  auto input = builder_.ConstantR2FromArray2D<float>(input_array);
-  ReduceWindowAdd(input, {2, 2}, {1, 1}, Padding::kValid);
-  Array2D<float> expected(
-      {{2.6f, -2.4f, 0.7f}, {6.2f, 3.0f, 2.1f}, {4.6f, 6.9f, 4.5f}});
-  ComputeAndCompareR2<float>(&builder_, expected, {}, ErrorSpec(0.0001));
-}
-
-TEST_F(ReduceWindowTest, Add2x2In2x2Disjoint) {
-  Array2D<float> input_array({{1.2f, -2.5f, 0.9f, 1.0f},
-                              {3.7f, 0.2f, -1.0f, -0.2f},
-                              {-0.4f, 2.7f, 1.1f, 2.2f},
-                              {0.6f, 1.7f, 1.4f, -0.2f}});
-  auto input = builder_.ConstantR2FromArray2D<float>(input_array);
-  ReduceWindowAdd(input, {2, 2}, {2, 2}, Padding::kValid);
-  Array2D<float> expected({
-      {2.6f, 0.7f}, {4.6f, 4.5f},
-  });
-  ComputeAndCompareR2<float>(&builder_, expected, {}, ErrorSpec(0.0001));
-}
-
-TEST_F(ReduceWindowTest, Add1x2In2x2Same) {
-  Array2D<float> input_array({{1.0f, 2.0f}, {3.0f, 4.0f}});
-  auto input = builder_.ConstantR2FromArray2D<float>(input_array);
-  ReduceWindowAdd(input, {1, 2}, {1, 1}, Padding::kSame);
-  Array2D<float> expected({
-      {3.0f, 2.0f}, {7.0f, 4.0f},
-  });
-  ComputeAndCompareR2<float>(&builder_, expected, {}, ErrorSpec(0.0001));
-}
-
 XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x2) {
   Array3D<float> input_array(2, 1, 2);
   input_array(0, 0, 0) = 1000;
@@ -470,13 +299,621 @@ XLA_TEST_F(ReduceWindowTest, NonstandardReduceFunction) {
   ComputeAndCompareR4<float>(&builder_, *expected, {}, ErrorSpec(1e-3, 1e-3));
 }
 
+TEST_F(ReduceWindowTest, R4UnitWindow) {
+  Array4D<float> input_array(13, 12, 8, 15);
+  input_array.Fill(1.0f);
+  std::unique_ptr<Literal> input_literal =
+      Literal::CreateR4FromArray4DWithLayout(
+          input_array, LayoutUtil::MakeLayout({0, 3, 2, 1}));
+  ComputationDataHandle input =
+      builder_.Parameter(0, input_literal->shape(), "operand");
+
+  Padding padding = Padding::kSame;
+  ReduceWindowAdd(input, {1, 1, 7, 1}, {1, 4, 1, 1}, padding);
+
+  auto res = ReferenceUtil::ReduceWindow4DAdd(input_array, 0.0f, {1, 1, 7, 1},
+                                              {1, 4, 1, 1}, padding);
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
+                          client_->TransferToServer(*input_literal));
+  ComputeAndCompareR4<float>(&builder_, *res, {input_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(ReduceWindowTest, R4SecondMinorStride) {
+  Array4D<float> input_array(2, 1, 27, 119);
+  input_array.FillRandom(2.0f);
+  std::unique_ptr<Literal> input_literal =
+      Literal::CreateR4FromArray4DWithLayout(
+          input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
+  ComputationDataHandle input =
+      builder_.Parameter(0, input_literal->shape(), "operand");
+
+  int win_len = 1;
+  int stride = 8;
+  Padding padding = Padding::kSame;
+  ReduceWindowAdd(input, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding);
+
+  auto res = ReferenceUtil::ReduceWindow4DAdd(
+      input_array, 0.0f, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding);
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
+                          client_->TransferToServer(*input_literal));
+  ComputeAndCompareR4<float>(&builder_, *res, {input_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(ReduceWindowTest, R4SecondMinorUnitStride) {
+  Array4D<float> input_array(3, 2, 4, 64);
+  input_array.FillRandom(2.0f);
+  std::unique_ptr<Literal> input_literal =
+      Literal::CreateR4FromArray4DWithLayout(
+          input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
+  ComputationDataHandle input =
+      builder_.Parameter(0, input_literal->shape(), "operand");
+
+  int win_len = 3;
+  int stride = 1;
+  Padding padding = Padding::kSame;
+  ReduceWindowAdd(input, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding);
+
+  auto res = ReferenceUtil::ReduceWindow4DAdd(
+      input_array, 0.0f, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding);
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
+                          client_->TransferToServer(*input_literal));
+  ComputeAndCompareR4<float>(&builder_, *res, {input_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(ReduceWindowTest, R4SecondMinorWin) {
+  Array4D<float> input_array(1, 3, 12, 200);
+  input_array.FillRandom(2.0f);
+  std::unique_ptr<Literal> input_literal =
+      Literal::CreateR4FromArray4DWithLayout(
+          input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
+  ComputationDataHandle input =
+      builder_.Parameter(0, input_literal->shape(), "operand");
+
+  int win_len = 8;
+  int stride = 5;
+  Padding padding = Padding::kSame;
+  ReduceWindowAdd(input, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding);
+
+  auto res = ReferenceUtil::ReduceWindow4DAdd(
+      input_array, 0.0f, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding);
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
+                          client_->TransferToServer(*input_literal));
+  ComputeAndCompareR4<float>(&builder_, *res, {input_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+TEST_F(ReduceWindowTest, AmongMajor2DimsMultipleMinor) {
+  Array4D<float> input_array(6, 4, 10, 130);
+  input_array.FillRandom(2.0f);
+
+  int win_len = 3;
+  int win_stride = 2;
+
+  Padding padding = Padding::kSame;
+  const auto input_data_handle =
+      builder_.ConstantR4FromArray4D<float>(input_array);
+  // Reduce only along the x and y dimensions, according to the win_len.
+  ReduceWindowAdd(input_data_handle, {win_len, win_len, 1, 1},
+                  {win_stride, win_stride, 1, 1}, padding);
+
+  auto result = ReferenceUtil::ReduceWindow4DAdd(
+      input_array, 0.0f, {win_len, win_len, 1, 1},
+      {win_stride, win_stride, 1, 1}, padding);
+  ComputeAndCompareR4<float>(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(ReduceWindowTest, Add24In1152_NoOverlap) {
+  std::vector<float> input_vector(128 * 9, 1);
+  const auto input = builder_.ConstantR1<float>(input_vector);
+  ReduceWindowAdd(input, {32}, {128}, Padding::kValid);
+  ComputeAndCompareR1<float>(&builder_, {32, 32, 32, 32, 32, 32, 32, 32, 32},
+                             {}, ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(ReduceWindowTest, Add128In128Stride128) {
+  const auto input = builder_.ConstantR1<float>(
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  ReduceWindowAdd(input, {128}, {128}, Padding::kValid);
+  ComputeAndCompareR1<float>(&builder_, {1088}, {}, ErrorSpec(0.0001));
+}
+
+// Regression test for a bug that appeared in Inception (b/34784899).
+TEST_F(ReduceWindowTest, R2ReduceWindowInceptionFromBroadcast) {
+  Array2D<float> input_array(14, 14, 1.0f);
+  ComputationDataHandle input =
+      builder_.Broadcast(builder_.ConstantLiteral(Literal::One(F32)), {14, 14});
+
+  int win_len = 3;
+  int stride = 1;
+  Padding padding = Padding::kSame;
+  ReduceWindowAdd(input, {win_len, win_len}, {stride, stride}, padding);
+
+  auto res = ReferenceUtil::ReduceWindow2DAdd(
+      input_array, 0.0f, {win_len, win_len}, {stride, stride}, padding);
+
+  ComputeAndCompareR2<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+}
+
+TEST_F(ReduceWindowTest, R2ReduceWindowNonOverlappingFromBroadcast) {
+  Array2D<float> input_array(6, 4, 1.0f);
+  ComputationDataHandle input =
+      builder_.Broadcast(builder_.ConstantLiteral(Literal::One(F32)), {6, 4});
+
+  Padding padding = Padding::kSame;
+  ReduceWindowAdd(input, {4, 2}, {3, 3}, padding);
+
+  auto res = ReferenceUtil::ReduceWindow2DAdd(input_array, 0.0f, {4, 2}, {3, 3},
+                                              padding);
+
+  ComputeAndCompareR2<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+}
+
+enum Reducer { kAdd, kMax };
+
+struct R4ReduceWindowTestData {
+  int64 base_bounds[4];
+  int64 window_bounds[4];
+  int64 strides[4];
+  int64 pad_low[4];
+  int64 pad_high[4];
+
+  Reducer reducer;
+};
+
+string R4ReduceWindowTestDataToString(
+    const ::testing::TestParamInfo<R4ReduceWindowTestData>& data) {
+  string str = tensorflow::strings::StrCat(
+      "base_bounds_",
+      tensorflow::str_util::Join(data.param.base_bounds, "x"),  //
+      "__window_bounds_",
+      tensorflow::str_util::Join(data.param.window_bounds, "x"),            //
+      "__strides_", tensorflow::str_util::Join(data.param.strides, "x"),    //
+      "__pad_low_", tensorflow::str_util::Join(data.param.pad_low, "x"),    //
+      "__pad_high_", tensorflow::str_util::Join(data.param.pad_high, "x"),  //
+      (data.param.reducer == kAdd) ? "add" : "max");
+  CHECK(data.param.reducer == kAdd || data.param.reducer == kMax);
+
+  // Test names are not allowed to contain the '-' character.
+  std::replace(str.begin(), str.end(), '-', 'n');
+  return str;
+}
+
+class R4ReduceWindowTest
+    : public ClientLibraryTestBase,
+      public ::testing::WithParamInterface<R4ReduceWindowTestData> {
+ protected:
+  void DoIt() {
+    ComputationBuilder b(client_, TestName());
+    const auto& param = GetParam();
+
+    const float kInitValue = 0.0f;
+
+    Array4D<float> input(param.base_bounds[0], param.base_bounds[1],
+                         param.base_bounds[2], param.base_bounds[3]);
+    input.FillIota(1);
+    std::unique_ptr<Literal> input_literal =
+        Literal::CreateR4FromArray4D(input);
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_arg,
+                            client_->TransferToServer(*input_literal));
+
+    std::vector<std::pair<int64, int64>> padding(4);
+    for (int i = 0; i < 4; ++i) {
+      padding[i] = {param.pad_low[i], param.pad_high[i]};
+    }
+
+    auto parameter = b.Parameter(0, input_literal->shape(), "p0");
+    auto pad_value = b.ConstantR0<float>(kInitValue);
+    CHECK(param.reducer == kAdd || param.reducer == kMax);
+    auto computation = param.reducer == kAdd
+                           ? CreateScalarAddComputation(F32, &b)
+                           : CreateScalarMaxComputation(F32, &b);
+    b.ReduceWindowWithGeneralPadding(
+        /*operand=*/parameter,
+        /*init_value=*/pad_value,
+        /*computation=*/computation,
+        /*window_dimensions=*/param.window_bounds,
+        /*window_strides=*/param.strides,
+        /*padding=*/padding);
+
+    CHECK(param.reducer == kAdd || param.reducer == kMax);
+    auto reduce_func = param.reducer == kAdd
+                           ? +[](float a, float b) { return a + b; }
+                           : +[](float a, float b) { return std::max(a, b); };
+    std::unique_ptr<Array4D<float>> expected =
+        ReferenceUtil::ReduceWindow4DGeneric(
+            /*operand=*/input,
+            /*init=*/kInitValue,
+            /*reduce_func=*/reduce_func,
+            /*window=*/param.window_bounds,
+            /*stride=*/param.strides,
+            /*padding=*/padding);
+    ComputeAndCompareR4<float>(&b, *expected, {input_arg.get()},
+                               ErrorSpec(1e-3, 1e-3));
+  }
+};
+
+TEST_P(R4ReduceWindowTest, DoIt) { DoIt(); }
+
+// base_bounds, window_bounds, strides, pad_low, pad_high
+const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
+    // Minimal edge case.
+    R4ReduceWindowTestData{/*base_bounds=*/{1, 1, 1, 1},
+                           /*window_bounds=*/{1, 1, 1, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*reducer=*/kAdd},
+
+    // Zero base bound edge case.
+    R4ReduceWindowTestData{/*base_bounds=*/{1, 0, 1, 1},
+                           /*window_bounds=*/{1, 1, 1, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*reducer=*/kAdd},
+
+    // With non-1x1 window.
+    R4ReduceWindowTestData{/*base_bounds=*/{4, 6, 17, 140},
+                           /*window_bounds=*/{2, 3, 1, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*reducer=*/kAdd},
+
+    // With max instead of add.
+    R4ReduceWindowTestData{/*base_bounds=*/{4, 6, 17, 140},
+                           /*window_bounds=*/{2, 3, 1, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*reducer=*/kMax},
+
+    // With stride.
+    R4ReduceWindowTestData{/*base_bounds=*/{4, 10, 17, 140},
+                           /*window_bounds=*/{3, 2, 1, 1},
+                           /*strides=*/{2, 4, 1, 1},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*reducer=*/kAdd},
+
+    // With low padding.
+    R4ReduceWindowTestData{/*base_bounds=*/{4, 6, 17, 140},
+                           /*window_bounds=*/{3, 2, 1, 1},
+                           /*strides=*/{2, 2, 1, 1},
+                           /*pad_low=*/{3, 2, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*reducer=*/kAdd},
+
+    // With high padding.
+    R4ReduceWindowTestData{/*base_bounds=*/{4, 6, 17, 140},
+                           /*window_bounds=*/{3, 2, 1, 1},
+                           /*strides=*/{2, 2, 1, 1},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{2, 3, 0, 0},
+                           /*reducer=*/kAdd},
+
+    // Window touches both sides of the padding simultaneously.
+    R4ReduceWindowTestData{/*base_bounds=*/{1, 1, 17, 140},
+                           /*window_bounds=*/{3, 3, 1, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{1, 1, 0, 0},
+                           /*pad_high=*/{1, 1, 0, 0},
+                           /*reducer=*/kAdd},
+
+    // Window is entirely in the padding for some positions.
+    R4ReduceWindowTestData{/*base_bounds=*/{1, 1, 17, 140},
+                           /*window_bounds=*/{3, 3, 1, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{4, 4, 0, 0},
+                           /*pad_high=*/{4, 4, 0, 0},
+                           /*reducer=*/kAdd},
+
+    // Zero base bound with padding edge case.
+    R4ReduceWindowTestData{/*base_bounds=*/{2, 0, 3, 4},
+                           /*window_bounds=*/{1, 1, 1, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{0, 1, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*reducer=*/kAdd},
+
+    // With stride, low padding and high padding.
+    R4ReduceWindowTestData{/*base_bounds=*/{4, 3, 17, 140},
+                           /*window_bounds=*/{3, 4, 1, 1},
+                           /*strides=*/{3, 1, 1, 1},
+                           /*pad_low=*/{10, 1, 0, 0},
+                           /*pad_high=*/{2, 3, 0, 0},
+                           /*reducer=*/kAdd},
+
+    // With second minor dimension == 9.
+    R4ReduceWindowTestData{/*base_bounds=*/{2, 3, 9, 127},
+                           /*window_bounds=*/{1, 1, 1, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*reducer=*/kAdd},
+
+    // With minor dimension == 129.
+    R4ReduceWindowTestData{/*base_bounds=*/{3, 2, 7, 129},
+                           /*window_bounds=*/{1, 1, 1, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*reducer=*/kAdd},
+
+    // With minor dims reduction and non-overlapped stride.
+    R4ReduceWindowTestData{/*base_bounds=*/{2, 2, 4, 16},
+                           /*window_bounds=*/{1, 1, 2, 2},
+                           /*strides=*/{1, 1, 2, 2},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*reducer=*/kAdd},
+
+    // With minor dims reduction and overlapped stride.
+    R4ReduceWindowTestData{/*base_bounds=*/{2, 2, 4, 16},
+                           /*window_bounds=*/{1, 1, 4, 4},
+                           /*strides=*/{1, 1, 2, 2},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*reducer=*/kAdd},
+};
+
+INSTANTIATE_TEST_CASE_P(R4ReduceWindowTestInstantiation, R4ReduceWindowTest,
+                        ::testing::ValuesIn(kR4ReduceWindowTestValues),
+                        R4ReduceWindowTestDataToString);
+
+class R4ReduceWindowLargeTest : public R4ReduceWindowTest {};
+
+XLA_TEST_P(R4ReduceWindowLargeTest, DoIt) { DoIt(); }
+
+// Test cases that are large/slow/failed.
+const R4ReduceWindowTestData kR4ReduceWindowLargeTestValues[] = {
+    R4ReduceWindowTestData{/*base_bounds=*/{28, 28, 256, 128},
+                           /*window_bounds=*/{3, 3, 1, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{1, 1, 0, 0},
+                           /*pad_high=*/{1, 1, 0, 0},
+                           /*reducer=*/kMax},
+
+    R4ReduceWindowTestData{/*base_bounds=*/{112, 112, 64, 128},
+                           /*window_bounds=*/{3, 3, 1, 1},
+                           /*strides=*/{2, 2, 1, 1},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{1, 1, 0, 0},
+                           /*reducer=*/kAdd},
+};
+
+INSTANTIATE_TEST_CASE_P(R4ReduceWindowLargeTestInstantiation,
+                        R4ReduceWindowLargeTest,
+                        ::testing::ValuesIn(kR4ReduceWindowLargeTestValues),
+                        R4ReduceWindowTestDataToString);
+
+struct R2ReduceWindowTestData {
+  int64 base_bounds[2];
+  int64 window_bounds[2];
+  int64 strides[2];
+  int64 layout[2];
+  Padding padding;
+  Reducer reducer;
+} kR2TestCases[] = {
+    {/*base_bounds=*/{4, 18}, /*window_bounds=*/{2, 4},
+     /*strides=*/{1, 2}, /*layout=*/{0, 1},
+     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{2, 5}, /*window_bounds=*/{2, 4},
+     /*strides=*/{1, 1}, /*layout=*/{0, 1},
+     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{1, 3}, /*window_bounds=*/{2, 3},
+     /*strides=*/{1, 1}, /*layout=*/{0, 1},
+     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{3, 129}, /*window_bounds=*/{1, 100},
+     /*strides=*/{2, 99}, /*layout=*/{0, 1},
+     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{6, 152}, /*window_bounds=*/{2, 25},
+     /*strides=*/{5, 4}, /*layout=*/{0, 1},
+     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{6, 4}, /*window_bounds=*/{4, 2},
+     /*strides=*/{3, 3}, /*layout=*/{0, 1},
+     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{5, 147}, /*window_bounds=*/{1, 36},
+     /*strides=*/{4, 5}, /*layout=*/{1, 0},
+     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{4, 153}, /*window_bounds=*/{2, 93},
+     /*strides=*/{1, 1}, /*layout=*/{1, 0},
+     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+    // Regression test for a bug that appeared in Inception (b/34784899).
+    {/*base_bounds=*/{28, 28}, /*window_bounds=*/{3, 3},
+     /*strides=*/{1, 1}, /*layout=*/{1, 0},
+     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+    // Regression test for a bug that appeared in Inception (b/34784899).
+    {/*base_bounds=*/{4, 32}, /*window_bounds=*/{2, 2},
+     /*strides=*/{2, 2}, /*layout=*/{1, 0},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{4, 4}, /*window_bounds=*/{2, 2},
+     /*strides=*/{1, 1}, /*layout=*/{1, 0},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+};
+
+string R2ReduceWindowTestDataToString(
+    const ::testing::TestParamInfo<R2ReduceWindowTestData>& data) {
+  string str = tensorflow::strings::StrCat(
+      "base_bounds_",
+      tensorflow::str_util::Join(data.param.base_bounds, "x"),  //
+      "__window_bounds_",
+      tensorflow::str_util::Join(data.param.window_bounds, "x"),              //
+      "__strides_", tensorflow::str_util::Join(data.param.strides, "x"),      //
+      "__padding_", data.param.padding == Padding::kSame ? "same" : "valid",  //
+      "__layout_", data.param.layout[0], "_", data.param.layout[1],           //
+      "__reducer_", data.param.reducer == kAdd ? "add" : "max");
+  return str;
+}
+
+class R2ReduceWindowTest
+    : public ClientLibraryTestBase,
+      public ::testing::WithParamInterface<R2ReduceWindowTestData> {};
+
+TEST_P(R2ReduceWindowTest, Add) {
+  ComputationBuilder b(client_, TestName());
+  const auto& param = GetParam();
+  CHECK(param.reducer == kAdd);
+
+  const float kInitValue = 0.0f;
+  Array2D<float> input(param.base_bounds[0], param.base_bounds[1], 1.0f);
+  std::unique_ptr<Literal> input_literal =
+      Literal::CreateR2FromArray2DWithLayout(
+          input, LayoutUtil::MakeLayout(param.layout));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_arg,
+                          client_->TransferToServer(*input_literal));
+  b.ReduceWindow(/*operand=*/
+                 b.Parameter(0, input_literal->shape(), "p0"),
+                 /*init_value=*/b.ConstantR0<float>(kInitValue),
+                 /*computation=*/CreateScalarAddComputation(F32, &b),
+                 /*window_dimensions=*/param.window_bounds,
+                 /*window_strides=*/param.strides, /*padding=*/param.padding);
+
+  auto expected = ReferenceUtil::ReduceWindow2DAdd(
+      /*operand=*/input, /*init=*/kInitValue, /*window=*/param.window_bounds,
+      /*stride=*/param.strides, /*padding=*/param.padding);
+
+  ComputeAndCompareR2<float>(&b, *expected, {input_arg.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+INSTANTIATE_TEST_CASE_P(R2ReduceWindowTestInstantiation, R2ReduceWindowTest,
+                        ::testing::ValuesIn(kR2TestCases),
+                        R2ReduceWindowTestDataToString);
+
+struct R1ReduceWindowTestData {
+  int64 base_bounds[1];
+  int64 window_bounds[1];
+  int64 strides[1];
+  Padding padding;
+  Reducer reducer;
+} kR1TestCases[] = {
+    {/*base_bounds=*/{1}, /*window_bounds=*/{1},
+     /*strides=*/{1},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{3}, /*window_bounds=*/{3},
+     /*strides=*/{1},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{3}, /*window_bounds=*/{2},
+     /*strides=*/{1},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{5}, /*window_bounds=*/{1},
+     /*strides=*/{1},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kMax},
+
+    {/*base_bounds=*/{16}, /*window_bounds=*/{4},
+     /*strides=*/{4},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kMax},
+
+    {/*base_bounds=*/{16}, /*window_bounds=*/{4},
+     /*strides=*/{3},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{128 * 2}, /*window_bounds=*/{30},
+     /*strides=*/{27},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{128 * 17}, /*window_bounds=*/{7},
+     /*strides=*/{64},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{128 * 2}, /*window_bounds=*/{32},
+     /*strides=*/{56},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{3}, /*window_bounds=*/{2},
+     /*strides=*/{1},
+     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{5}, /*window_bounds=*/{3},
+     /*strides=*/{2},
+     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{16}, /*window_bounds=*/{4},
+     /*strides=*/{3},
+     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+};
+
+string R1ReduceWindowTestDataToString(
+    const ::testing::TestParamInfo<R1ReduceWindowTestData>& data) {
+  string str = tensorflow::strings::StrCat(
+      "base_bounds_",
+      tensorflow::str_util::Join(data.param.base_bounds, "x"),  //
+      "__window_bounds_",
+      tensorflow::str_util::Join(data.param.window_bounds, "x"),              //
+      "__strides_", tensorflow::str_util::Join(data.param.strides, "x"),      //
+      "__padding_", data.param.padding == Padding::kSame ? "same" : "valid",  //
+      "__reducer_", data.param.reducer == kAdd ? "add" : "max");
+  return str;
+}
+
+class R1ReduceWindowTest
+    : public ClientLibraryTestBase,
+      public ::testing::WithParamInterface<R1ReduceWindowTestData> {};
+
+TEST_P(R1ReduceWindowTest, DoIt) {
+  ComputationBuilder b(client_, TestName());
+  const auto& param = GetParam();
+  CHECK(param.reducer == kAdd || param.reducer == kMax);
+
+  const float kInitValue = 0.0f;
+  std::vector<float> input_vector(param.base_bounds[0]);
+  std::iota(std::begin(input_vector), std::end(input_vector), 0);
+  std::unique_ptr<Literal> input_literal =
+      Literal::CreateR1(tensorflow::gtl::ArraySlice<float>(input_vector));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_arg,
+                          client_->TransferToServer(*input_literal));
+
+  auto computation = param.reducer == kAdd
+                         ? CreateScalarAddComputation(F32, &b)
+                         : CreateScalarMaxComputation(F32, &b);
+  b.ReduceWindow(/*operand=*/
+                 b.Parameter(0, input_literal->shape(), "p0"),
+                 /*init_value=*/b.ConstantR0<float>(kInitValue),
+                 /*computation=*/computation,
+                 /*window_dimensions=*/param.window_bounds,
+                 /*window_strides=*/param.strides, /*padding=*/param.padding);
+
+  auto reduce_func = param.reducer == kAdd
+                         ? +[](float a, float b) { return a + b; }
+                         : +[](float a, float b) { return std::max(a, b); };
+  auto expected = ReferenceUtil::ReduceWindow1DGeneric(
+      /*operand=*/tensorflow::gtl::ArraySlice<float>(input_vector),
+      /*init=*/kInitValue,
+      /*reduce_func=*/reduce_func,
+      /*window=*/param.window_bounds,
+      /*stride=*/param.strides, /*padding=*/param.padding);
+
+  ComputeAndCompareR1<float>(&b, tensorflow::gtl::ArraySlice<float>(*expected),
+                             {input_arg.get()}, ErrorSpec(1e-3, 1e-3));
+}
+
+INSTANTIATE_TEST_CASE_P(R1ReduceWindowTestInstantiation, R1ReduceWindowTest,
+                        ::testing::ValuesIn(kR1TestCases),
+                        R1ReduceWindowTestDataToString);
 }  // namespace
 }  // namespace xla
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/replay_test.cc b/tensorflow/compiler/xla/tests/replay_test.cc
index 7c6700feef846242cc49e573fee01c0101b05335..cb7f54ea01c2f063db1575bd498634f5107a39c5 100644
--- a/tensorflow/compiler/xla/tests/replay_test.cc
+++ b/tensorflow/compiler/xla/tests/replay_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
@@ -61,7 +60,8 @@ TEST_F(ReplayTest, TwoPlusTwoReplay) {
 
   // Run it.
   std::unique_ptr<Literal> literal =
-      client_->ExecuteAndTransfer(replayed, /*arguments=*/{})
+      client_
+          ->ExecuteAndTransfer(replayed, /*arguments=*/{}, &execution_options_)
           .ConsumeValueOrDie();
 
   // Expect 4.
@@ -92,15 +92,16 @@ XLA_TEST_F(ReplayTest, XPlusYReplayWithParameters) {
 
   // Run it.
   std::unique_ptr<GlobalData> x_data =
-      client_->TransferToServer(*LiteralUtil::CreateR0<int32>(2))
+      client_->TransferToServer(*Literal::CreateR0<int32>(2))
           .ConsumeValueOrDie();
   std::unique_ptr<GlobalData> y_data =
-      client_->TransferToServer(*LiteralUtil::CreateR0<int32>(3))
+      client_->TransferToServer(*Literal::CreateR0<int32>(3))
           .ConsumeValueOrDie();
   std::unique_ptr<Literal> literal =
       client_
           ->ExecuteAndTransfer(replayed,
-                               /*arguments=*/{x_data.get(), y_data.get()})
+                               /*arguments=*/{x_data.get(), y_data.get()},
+                               &execution_options_)
           .ConsumeValueOrDie();
 
   // Expect 5.
@@ -141,7 +142,8 @@ TEST_F(ReplayTest, MapPlusTwoOverR1) {
 
   // Run it.
   std::unique_ptr<Literal> literal =
-      client_->ExecuteAndTransfer(replayed, /*arguments=*/{})
+      client_
+          ->ExecuteAndTransfer(replayed, /*arguments=*/{}, &execution_options_)
           .ConsumeValueOrDie();
 
   // Expect result.
@@ -154,7 +156,6 @@ TEST_F(ReplayTest, MapPlusTwoOverR1) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/reshape_motion_test.cc b/tensorflow/compiler/xla/tests/reshape_motion_test.cc
index c9817bc23d821d95e660b359ce72ae6f4dec6c85..3051562455f48625def2840913314b16e8de2b72 100644
--- a/tensorflow/compiler/xla/tests/reshape_motion_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_motion_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -63,7 +62,6 @@ TEST_F(ReshapeMotionTest, ElementwiseOfReshapesWithNonSameInputShapes) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc
index ae7d07727b1e2c20d629f2abc5e58036060f0cef..6748d196c1a6305cc6e3ff87191d2c96a45bf0e7 100644
--- a/tensorflow/compiler/xla/tests/reshape_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -71,7 +70,7 @@ XLA_TEST_F(ReshapeTest, SingleElementArrayToScalar) {
 XLA_TEST_F(ReshapeTest, ScalarToSingleElementArray) {
   ComputationBuilder builder(client_, TestName());
 
-  std::unique_ptr<Literal> param0_literal = LiteralUtil::CreateR0<float>(1.0f);
+  std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(1.0f);
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -99,7 +98,7 @@ XLA_TEST_F(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> param0_literal =
-      LiteralUtil::CreateR2FromArray2D<float>(Array2D<float>(0, 3));
+      Literal::CreateR2FromArray2D<float>(Array2D<float>(0, 3));
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
@@ -403,7 +402,7 @@ XLA_TEST_F(ReshapeTest, FullyConnectedCollapseDesugared) {
 XLA_TEST_F(ReshapeTest, ToScalar) {
   for (int rank = 0; rank < 8; ++rank) {
     ComputationBuilder b(client_, TestName());
-    auto input = LiteralUtil::CreateR1<float>({83.0f});
+    auto input = Literal::CreateR1<float>({83.0f});
     std::vector<int64> ones(rank, 1);  // this is {1, ..., 1}.
     std::vector<int64> dimensions(rank);
     std::iota(dimensions.begin(), dimensions.end(), 0);
@@ -435,7 +434,7 @@ XLA_TEST_F(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   builder.Reshape(a, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 8});
 
   // clang-format off
-  auto literal = LiteralUtil::CreateR4FromArray4DWithLayout(Array4D<float>{
+  auto literal = Literal::CreateR4FromArray4DWithLayout(Array4D<float>{
     {
       {
         {0, 1},
@@ -467,7 +466,7 @@ XLA_TEST_F(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   });
 
   Computation computation = builder.Build().ConsumeValueOrDie();
-  ExecutionOptions execution_options;
+  ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
       ShapeUtil::MakeShapeWithLayout(F32, {2, 8}, {1, 0});
   std::unique_ptr<Literal> actual =
@@ -475,12 +474,12 @@ XLA_TEST_F(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
           ->ExecuteAndTransfer(computation, {input.get()}, &execution_options)
           .ConsumeValueOrDie();
   std::unique_ptr<Literal> expected =
-      LiteralUtil::CreateR2FromArray2D<float>(expected_array);
+      Literal::CreateR2FromArray2D<float>(expected_array);
   LiteralTestUtil::ExpectEqual(*expected, *actual);
 }
 
 XLA_TEST_F(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) {
-  std::unique_ptr<Literal> input = LiteralUtil::CreateR2<float>({
+  std::unique_ptr<Literal> input = Literal::CreateR2<float>({
       {0, 1, 2, 3, 4, 5, 6, 7},
       {100, 101, 102, 103, 104, 105, 106, 107},
       {200, 201, 202, 203, 204, 205, 206, 207},
@@ -508,7 +507,7 @@ XLA_TEST_F(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) {
 
 // Tests R2->R4 reshape with the reshape dimensions {1, 0}.
 XLA_TEST_F(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) {
-  std::unique_ptr<Literal> input = LiteralUtil::CreateR2<float>({
+  std::unique_ptr<Literal> input = Literal::CreateR2<float>({
       {0, 1, 2, 3, 4, 5, 6, 7},
       {100, 101, 102, 103, 104, 105, 106, 107},
       {200, 201, 202, 203, 204, 205, 206, 207},
@@ -542,7 +541,7 @@ XLA_TEST_F(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
       [&rng, &distribution](tensorflow::gtl::ArraySlice<int64> /* indices */,
                             float* cell) { *cell = distribution(rng); });
   std::unique_ptr<Literal> input_literal =
-      LiteralUtil::CreateR4FromArray4DWithLayout(
+      Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
   std::unique_ptr<GlobalData> input_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
@@ -565,7 +564,7 @@ XLA_TEST_F(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
       [&rng, &distribution](tensorflow::gtl::ArraySlice<int64> /* indices */,
                             float* cell) { *cell = distribution(rng); });
   std::unique_ptr<Literal> input_literal =
-      LiteralUtil::CreateR4FromArray4DWithLayout(
+      Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
   std::unique_ptr<GlobalData> input_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
@@ -589,7 +588,7 @@ XLA_TEST_F(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
       [&rng, &distribution](tensorflow::gtl::ArraySlice<int64> /* indices */,
                             float* cell) { *cell = distribution(rng); });
   std::unique_ptr<Literal> input_literal =
-      LiteralUtil::CreateR4FromArray4DWithLayout(
+      Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
   std::unique_ptr<GlobalData> input_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
@@ -603,7 +602,7 @@ XLA_TEST_F(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
     expected_array(indices[0], indices[2] * 30 + indices[1] * 3 + indices[3]) =
         *cell;
   });
-  auto expected = LiteralUtil::CreateR2FromArray2D(expected_array);
+  auto expected = Literal::CreateR2FromArray2D(expected_array);
   ComputeAndCompareLiteral(&builder, *expected, {input_data.get()});
 }
 
@@ -615,7 +614,7 @@ XLA_TEST_F(ReshapeTest, NoopReshape) {
       [&rng, &distribution](tensorflow::gtl::ArraySlice<int64> /* indices */,
                             float* cell) { *cell = distribution(rng); });
   std::unique_ptr<Literal> input_literal =
-      LiteralUtil::CreateR4FromArray4DWithLayout(
+      Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({1, 2, 3, 0}));
   std::unique_ptr<GlobalData> input_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
@@ -626,7 +625,7 @@ XLA_TEST_F(ReshapeTest, NoopReshape) {
                   /*new_sizes=*/{7, 2, 3, 5});
   Computation computation = builder.Build().ConsumeValueOrDie();
 
-  ExecutionOptions execution_options;
+  ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
       ShapeUtil::MakeShapeWithLayout(F32, {7, 2, 3, 5}, {2, 3, 0, 1});
   std::unique_ptr<Literal> output_literal =
@@ -642,7 +641,7 @@ XLA_TEST_F(ReshapeTest, NoopReshape) {
 }
 
 XLA_TEST_F(ReshapeTest, R4ToR4Reshape_Trivial) {
-  auto literal_1x2x3x4 = LiteralUtil::CreateR4(
+  auto literal_1x2x3x4 = Literal::CreateR4(
       {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
 
@@ -655,7 +654,7 @@ XLA_TEST_F(ReshapeTest, R4ToR4Reshape_Trivial) {
 }
 
 XLA_TEST_F(ReshapeTest, R4ToR4Reshape) {
-  auto literal_1x2x3x4 = LiteralUtil::CreateR4(
+  auto literal_1x2x3x4 = Literal::CreateR4(
       {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
 
@@ -665,7 +664,7 @@ XLA_TEST_F(ReshapeTest, R4ToR4Reshape) {
                   /*new_sizes=*/{2, 4, 3, 1});
 
   // clang-format off
-  auto expected_2x4x3x1 = LiteralUtil::CreateR4(
+  auto expected_2x4x3x1 = Literal::CreateR4(
       {{{{1}, {5}, {9}},
         {{2}, {6}, {10}},
         {{3}, {7}, {11}},
@@ -689,7 +688,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeSimple) {
       [&rng, &distribution](tensorflow::gtl::ArraySlice<int64> /* indices */,
                             float* cell) { *cell = distribution(rng); });
   std::unique_ptr<Literal> input_literal =
-      LiteralUtil::CreateR4FromArray4DWithLayout(
+      Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
   std::unique_ptr<GlobalData> input_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
@@ -698,9 +697,9 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeSimple) {
   auto a = builder.Parameter(0, input_literal->shape(), "a");
   builder.Reshape(a, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds);
 
-  std::unique_ptr<Literal> expected = LiteralUtil::Relayout(
-      *LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal),
-      LayoutUtil::MakeLayout({3, 2, 1, 0}));
+  std::unique_ptr<Literal> expected =
+      LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
+          ->Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0}));
 
   // Specify the requested output shape explicitly to ensure that this reshape
   // actually corresponds to a two minor transpose.
@@ -718,7 +717,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
       [&rng, &distribution](tensorflow::gtl::ArraySlice<int64> /* indices */,
                             float* cell) { *cell = distribution(rng); });
   std::unique_ptr<Literal> input_literal =
-      LiteralUtil::CreateR4FromArray4DWithLayout(
+      Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
   std::unique_ptr<GlobalData> input_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
@@ -727,9 +726,9 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
   auto a = builder.Parameter(0, input_literal->shape(), "a");
   builder.Reshape(a, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds);
 
-  std::unique_ptr<Literal> expected = LiteralUtil::Relayout(
-      *LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal),
-      LayoutUtil::MakeLayout({3, 2, 1, 0}));
+  std::unique_ptr<Literal> expected =
+      LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
+          ->Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0}));
 
   // Specify the requested output shape explicitly to ensure that this reshape
   // actually corresponds to a two minor transpose.
@@ -747,7 +746,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
       [&rng, &distribution](tensorflow::gtl::ArraySlice<int64> /* indices */,
                             float* cell) { *cell = distribution(rng); });
   std::unique_ptr<Literal> input_literal =
-      LiteralUtil::CreateR4FromArray4DWithLayout(
+      Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
   std::unique_ptr<GlobalData> input_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
@@ -756,9 +755,9 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
   auto a = builder.Parameter(0, input_literal->shape(), "a");
   builder.Reshape(a, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds);
 
-  std::unique_ptr<Literal> expected = LiteralUtil::Relayout(
-      *LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal),
-      LayoutUtil::MakeLayout({3, 2, 1, 0}));
+  std::unique_ptr<Literal> expected =
+      LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
+          ->Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0}));
 
   // Specify the requested output shape explicitly to ensure that this reshape
   // actually corresponds to a two minor transpose.
@@ -777,7 +776,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
       [&rng, &distribution](tensorflow::gtl::ArraySlice<int64> /* indices */,
                             float* cell) { *cell = distribution(rng); });
   std::unique_ptr<Literal> input_literal =
-      LiteralUtil::CreateR4FromArray4DWithLayout(
+      Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
   std::unique_ptr<GlobalData> input_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
@@ -786,9 +785,9 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
   auto a = builder.Parameter(0, input_literal->shape(), "a");
   builder.Reshape(a, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds);
 
-  std::unique_ptr<Literal> expected = LiteralUtil::Relayout(
-      *LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal),
-      LayoutUtil::MakeLayout({3, 2, 1, 0}));
+  std::unique_ptr<Literal> expected =
+      LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
+          ->Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0}));
 
   // Specify the requested output shape explicitly to ensure that this reshape
   // actually corresponds to a two minor transpose.
@@ -806,7 +805,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
       [&rng, &distribution](tensorflow::gtl::ArraySlice<int64> /* indices */,
                             float* cell) { *cell = distribution(rng); });
   std::unique_ptr<Literal> input_literal =
-      LiteralUtil::CreateR4FromArray4DWithLayout(
+      Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({0, 1, 2, 3}));
   std::unique_ptr<GlobalData> input_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
@@ -815,9 +814,9 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
   auto a = builder.Parameter(0, input_literal->shape(), "a");
   builder.Reshape(a, /*dimensions=*/{1, 0, 2, 3}, /*new_sizes=*/new_bounds);
 
-  std::unique_ptr<Literal> expected = LiteralUtil::Relayout(
-      *LiteralTestUtil::Reshape(new_bounds, {1, 0, 2, 3}, *input_literal),
-      input_literal->shape().layout());
+  std::unique_ptr<Literal> expected =
+      LiteralTestUtil::Reshape(new_bounds, {1, 0, 2, 3}, *input_literal)
+          ->Relayout(input_literal->shape().layout());
 
   // Specify the requested output shape explicitly to ensure that this reshape
   // actually corresponds to a two minor transpose.
@@ -831,7 +830,6 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/reverse_test.cc b/tensorflow/compiler/xla/tests/reverse_test.cc
index 5ca9702380f4e37b6ba90459222faf832472bbf7..2f72fc0729a8634456986f294bd26de2c37a5212 100644
--- a/tensorflow/compiler/xla/tests/reverse_test.cc
+++ b/tensorflow/compiler/xla/tests/reverse_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -159,7 +158,6 @@ TEST_F(ReverseTest, Reverse4DFloatArrayOnDim01) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc b/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc
index 05ce22fc359d5c805840e0f07f645cfb8ffb7786..5b4c05c673339a455c9e58d81c73ede182e0f110 100644
--- a/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc
+++ b/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/packed_literal_reader.h"
@@ -66,8 +65,8 @@ TEST_F(RoundTripPackedLiteralTest, RoundTripsR1F32Length2) {
       reader.Read(ShapeUtil::MakeShape(F32, {2})).ConsumeValueOrDie();
   EXPECT_TRUE(reader.IsExhausted());
 
-  EXPECT_EQ(42.0, LiteralUtil::Get<float>(*actual, {0}));
-  EXPECT_EQ(24.0, LiteralUtil::Get<float>(*actual, {1}));
+  EXPECT_EQ(42.0, actual->Get<float>({0}));
+  EXPECT_EQ(24.0, actual->Get<float>({1}));
 }
 
 TEST_F(RoundTripPackedLiteralTest, RoundTripsR2F32Size2x2Dim0Minor) {
@@ -96,10 +95,10 @@ TEST_F(RoundTripPackedLiteralTest, RoundTripsR2F32Size2x2Dim0Minor) {
           .ConsumeValueOrDie();
   EXPECT_TRUE(reader.IsExhausted());
 
-  EXPECT_EQ(42.0f, LiteralUtil::Get<float>(*actual, {0, 0}));
-  EXPECT_EQ(24.0f, LiteralUtil::Get<float>(*actual, {0, 1}));
-  EXPECT_EQ(64.0f, LiteralUtil::Get<float>(*actual, {1, 0}));
-  EXPECT_EQ(46.0f, LiteralUtil::Get<float>(*actual, {1, 1}));
+  EXPECT_EQ(42.0f, actual->Get<float>({0, 0}));
+  EXPECT_EQ(24.0f, actual->Get<float>({0, 1}));
+  EXPECT_EQ(64.0f, actual->Get<float>({1, 0}));
+  EXPECT_EQ(46.0f, actual->Get<float>({1, 1}));
 
   std::unique_ptr<Literal> round_tripped = RoundTripToServer(*actual);
   LiteralTestUtil::ExpectEqual(*round_tripped, *actual);
@@ -131,10 +130,10 @@ TEST_F(RoundTripPackedLiteralTest, RoundTripsR2F32Size2x2Dim1Minor) {
           .ConsumeValueOrDie();
   EXPECT_TRUE(reader.IsExhausted());
 
-  EXPECT_EQ(42.0f, LiteralUtil::Get<float>(*actual, {0, 0}));
-  EXPECT_EQ(24.0f, LiteralUtil::Get<float>(*actual, {1, 0}));
-  EXPECT_EQ(64.0f, LiteralUtil::Get<float>(*actual, {0, 1}));
-  EXPECT_EQ(46.0f, LiteralUtil::Get<float>(*actual, {1, 1}));
+  EXPECT_EQ(42.0f, actual->Get<float>({0, 0}));
+  EXPECT_EQ(24.0f, actual->Get<float>({1, 0}));
+  EXPECT_EQ(64.0f, actual->Get<float>({0, 1}));
+  EXPECT_EQ(46.0f, actual->Get<float>({1, 1}));
 
   std::unique_ptr<Literal> round_tripped = RoundTripToServer(*actual);
   LiteralTestUtil::ExpectEqual(*round_tripped, *actual);
@@ -146,7 +145,6 @@ TEST_F(RoundTripPackedLiteralTest, RoundTripsR2F32Size2x2Dim1Minor) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc b/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc
index f0760241cdb4e555f3536d024d278c87376bb4d3..e6a6b7b37a4308f2c00f35ae8d3013a59f6c05e7 100644
--- a/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc
+++ b/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -48,62 +47,61 @@ class RoundTripTransferTest : public ClientLibraryTestBase {
 };
 
 TEST_F(RoundTripTransferTest, R0S32) {
-  RoundTripTest(*LiteralUtil::CreateR0<int32>(42));
+  RoundTripTest(*Literal::CreateR0<int32>(42));
 }
 
 TEST_F(RoundTripTransferTest, R0F32) {
-  RoundTripTest(*LiteralUtil::CreateR0<float>(42.0));
+  RoundTripTest(*Literal::CreateR0<float>(42.0));
 }
 
 TEST_F(RoundTripTransferTest, R1F32_Len0) {
-  RoundTripTest(*LiteralUtil::CreateR1<float>({}));
+  RoundTripTest(*Literal::CreateR1<float>({}));
 }
 
 TEST_F(RoundTripTransferTest, R1F32_Len2) {
-  RoundTripTest(*LiteralUtil::CreateR1<float>({42.0, 64.0}));
+  RoundTripTest(*Literal::CreateR1<float>({42.0, 64.0}));
 }
 
 TEST_F(RoundTripTransferTest, R1F32_Len256) {
   std::vector<float> values(256);
   std::iota(values.begin(), values.end(), 1.0);
-  RoundTripTest(*LiteralUtil::CreateR1<float>(values));
+  RoundTripTest(*Literal::CreateR1<float>(values));
 }
 
 TEST_F(RoundTripTransferTest, R1F32_Len1024) {
   std::vector<float> values(1024);
   std::iota(values.begin(), values.end(), 1.0);
-  RoundTripTest(*LiteralUtil::CreateR1<float>(values));
+  RoundTripTest(*Literal::CreateR1<float>(values));
 }
 
 TEST_F(RoundTripTransferTest, R1F32_Len1025) {
   std::vector<float> values(1025);
   std::iota(values.begin(), values.end(), 1.0);
-  RoundTripTest(*LiteralUtil::CreateR1<float>(values));
+  RoundTripTest(*Literal::CreateR1<float>(values));
 }
 
 TEST_F(RoundTripTransferTest, R1F32_Len4096) {
   std::vector<float> values(4096);
   std::iota(values.begin(), values.end(), 1.0);
-  RoundTripTest(*LiteralUtil::CreateR1<float>(values));
+  RoundTripTest(*Literal::CreateR1<float>(values));
 }
 
 TEST_F(RoundTripTransferTest, R2F32_Len10x0) {
-  RoundTripTest(
-      *LiteralUtil::CreateR2FromArray2D<float>(Array2D<float>(10, 0)));
+  RoundTripTest(*Literal::CreateR2FromArray2D<float>(Array2D<float>(10, 0)));
 }
 
 TEST_F(RoundTripTransferTest, R2F32_Len2x2) {
-  RoundTripTest(*LiteralUtil::CreateR2<float>({{42.0, 64.0}, {77.0, 88.0}}));
+  RoundTripTest(*Literal::CreateR2<float>({{42.0, 64.0}, {77.0, 88.0}}));
 }
 
 TEST_F(RoundTripTransferTest, R3F32) {
   RoundTripTest(
-      *LiteralUtil::CreateR3<float>({{{1.0, 2.0}, {1.0, 2.0}, {1.0, 2.0}},
-                                     {{3.0, 4.0}, {3.0, 4.0}, {3.0, 4.0}}}));
+      *Literal::CreateR3<float>({{{1.0, 2.0}, {1.0, 2.0}, {1.0, 2.0}},
+                                 {{3.0, 4.0}, {3.0, 4.0}, {3.0, 4.0}}}));
 }
 
 TEST_F(RoundTripTransferTest, R4F32) {
-  RoundTripTest(*LiteralUtil::CreateR4<float>({{
+  RoundTripTest(*Literal::CreateR4<float>({{
       {{10, 11, 12, 13}, {14, 15, 16, 17}},
       {{18, 19, 20, 21}, {22, 23, 24, 25}},
       {{26, 27, 28, 29}, {30, 31, 32, 33}},
@@ -111,36 +109,33 @@ TEST_F(RoundTripTransferTest, R4F32) {
 }
 
 TEST_F(RoundTripTransferTest, EmptyTuple) {
-  RoundTripTest(*LiteralUtil::MakeTuple({}));
+  RoundTripTest(*Literal::MakeTuple({}));
 }
 
 TEST_F(RoundTripTransferTest, TupleOfR1F32) {
-  RoundTripTest(
-      *LiteralUtil::MakeTuple({LiteralUtil::CreateR1<float>({1, 2}).get(),
-                               LiteralUtil::CreateR1<float>({3, 4}).get()}));
+  RoundTripTest(*Literal::MakeTuple({Literal::CreateR1<float>({1, 2}).get(),
+                                     Literal::CreateR1<float>({3, 4}).get()}));
 }
 
 TEST_F(RoundTripTransferTest, TupleOfR1F32_Len0_Len2) {
-  RoundTripTest(
-      *LiteralUtil::MakeTuple({LiteralUtil::CreateR1<float>({}).get(),
-                               LiteralUtil::CreateR1<float>({3, 4}).get()}));
+  RoundTripTest(*Literal::MakeTuple({Literal::CreateR1<float>({}).get(),
+                                     Literal::CreateR1<float>({3, 4}).get()}));
 }
 
 TEST_F(RoundTripTransferTest, TupleOfR0F32AndR1S32) {
-  RoundTripTest(
-      *LiteralUtil::MakeTuple({LiteralUtil::CreateR0<float>(1.0).get(),
-                               LiteralUtil::CreateR1<int>({2, 3}).get()}));
+  RoundTripTest(*Literal::MakeTuple({Literal::CreateR0<float>(1.0).get(),
+                                     Literal::CreateR1<int>({2, 3}).get()}));
 }
 
 // Below two tests are added to identify the cost of large data transfers.
 TEST_F(RoundTripTransferTest, R2F32_Large) {
-  RoundTripTest(*LiteralUtil::CreateR2F32Linspace(-1.0f, 1.0f, 512, 512));
+  RoundTripTest(*Literal::CreateR2F32Linspace(-1.0f, 1.0f, 512, 512));
 }
 
 TEST_F(RoundTripTransferTest, R4F32_Large) {
   Array4D<float> array4d(2, 2, 256, 256);
   array4d.FillWithMultiples(1.0f);
-  RoundTripTest(*LiteralUtil::CreateR4FromArray4D<float>(array4d));
+  RoundTripTest(*Literal::CreateR4FromArray4D<float>(array4d));
 }
 
 }  // namespace
@@ -149,7 +144,6 @@ TEST_F(RoundTripTransferTest, R4F32_Large) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/scalar_computations_test.cc b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
index 47a39ffbbc42dedccc98694a23372cb064da752a..6ebd11584ff21abd05effe094b7ffbd7964c865e 100644
--- a/tensorflow/compiler/xla/tests/scalar_computations_test.cc
+++ b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -212,9 +211,9 @@ TEST_F(ScalarComputationsTest, MulThreeScalarsS32) {
 
 TEST_F(ScalarComputationsTest, MulThreeScalarsF32Params) {
   ComputationBuilder builder(client_, TestName());
-  std::unique_ptr<Literal> a_literal = LiteralUtil::CreateR0<float>(2.1f);
-  std::unique_ptr<Literal> b_literal = LiteralUtil::CreateR0<float>(5.5f);
-  std::unique_ptr<Literal> c_literal = LiteralUtil::CreateR0<float>(0.5f);
+  std::unique_ptr<Literal> a_literal = Literal::CreateR0<float>(2.1f);
+  std::unique_ptr<Literal> b_literal = Literal::CreateR0<float>(5.5f);
+  std::unique_ptr<Literal> c_literal = Literal::CreateR0<float>(0.5f);
 
   std::unique_ptr<GlobalData> a_data =
       client_->TransferToServer(*a_literal).ConsumeValueOrDie();
@@ -355,26 +354,25 @@ TEST_F(ScalarComputationsTest, DivU32s) {
     ComputationDataHandle divisor =
         builder.Parameter(1, ShapeUtil::MakeShape(U32, {}), "divisor");
     builder.Div(dividend, divisor);
-    TF_ASSIGN_OR_ASSERT_OK(div_computation, builder.Build());
+    TF_ASSERT_OK_AND_ASSIGN(div_computation, builder.Build());
   }
 
   for (uint32 divisor : vals) {
     if (divisor != 0) {
       for (uint32 dividend : vals) {
-        auto dividend_literal = LiteralUtil::CreateR0<uint32>(dividend);
-        auto divisor_literal = LiteralUtil::CreateR0<uint32>(divisor);
-        TF_ASSIGN_OR_ASSERT_OK(auto dividend_data,
-                               client_->TransferToServer(*dividend_literal));
-        TF_ASSIGN_OR_ASSERT_OK(auto divisor_data,
-                               client_->TransferToServer(*divisor_literal));
+        auto dividend_literal = Literal::CreateR0<uint32>(dividend);
+        auto divisor_literal = Literal::CreateR0<uint32>(divisor);
+        TF_ASSERT_OK_AND_ASSIGN(auto dividend_data,
+                                client_->TransferToServer(*dividend_literal));
+        TF_ASSERT_OK_AND_ASSIGN(auto divisor_data,
+                                client_->TransferToServer(*divisor_literal));
         auto actual_literal =
             client_
                 ->ExecuteAndTransfer(div_computation,
                                      {dividend_data.get(), divisor_data.get()},
                                      &execution_options_)
                 .ConsumeValueOrDie();
-        auto expected_literal =
-            LiteralUtil::CreateR0<uint32>(dividend / divisor);
+        auto expected_literal = Literal::CreateR0<uint32>(dividend / divisor);
         LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal);
       }
     }
@@ -397,26 +395,25 @@ TEST_F(ScalarComputationsTest, RemU32s) {
     ComputationDataHandle divisor =
         builder.Parameter(1, ShapeUtil::MakeShape(U32, {}), "divisor");
     builder.Rem(dividend, divisor);
-    TF_ASSIGN_OR_ASSERT_OK(rem_computation, builder.Build());
+    TF_ASSERT_OK_AND_ASSIGN(rem_computation, builder.Build());
   }
 
   for (uint32 divisor : vals) {
     if (divisor != 0) {
       for (uint32 dividend : vals) {
-        auto dividend_literal = LiteralUtil::CreateR0<uint32>(dividend);
-        auto divisor_literal = LiteralUtil::CreateR0<uint32>(divisor);
-        TF_ASSIGN_OR_ASSERT_OK(auto dividend_data,
-                               client_->TransferToServer(*dividend_literal));
-        TF_ASSIGN_OR_ASSERT_OK(auto divisor_data,
-                               client_->TransferToServer(*divisor_literal));
+        auto dividend_literal = Literal::CreateR0<uint32>(dividend);
+        auto divisor_literal = Literal::CreateR0<uint32>(divisor);
+        TF_ASSERT_OK_AND_ASSIGN(auto dividend_data,
+                                client_->TransferToServer(*dividend_literal));
+        TF_ASSERT_OK_AND_ASSIGN(auto divisor_data,
+                                client_->TransferToServer(*divisor_literal));
         auto actual_literal =
             client_
                 ->ExecuteAndTransfer(rem_computation,
                                      {dividend_data.get(), divisor_data.get()},
                                      &execution_options_)
                 .ConsumeValueOrDie();
-        auto expected_literal =
-            LiteralUtil::CreateR0<uint32>(dividend % divisor);
+        auto expected_literal = Literal::CreateR0<uint32>(dividend % divisor);
         LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal);
       }
     }
@@ -428,8 +425,8 @@ TEST_F(ScalarComputationsTest, RemainderTwoScalarsNonConstDividendS32) {
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(S32, {}), "x");
   builder.Rem(x, builder.ConstantR0<int32>(80000));
 
-  std::unique_ptr<Literal> literal = LiteralUtil::CreateR0<int32>(87919);
-  TF_ASSIGN_OR_ASSERT_OK(auto input_data, client_->TransferToServer(*literal));
+  std::unique_ptr<Literal> literal = Literal::CreateR0<int32>(87919);
+  TF_ASSERT_OK_AND_ASSIGN(auto input_data, client_->TransferToServer(*literal));
   ComputeAndCompareR0<int32>(&builder, 7919, {input_data.get()});
 }
 
@@ -764,7 +761,7 @@ TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionS32) {
 
 TEST_F(ScalarComputationsTest, SqrtF320) {
   ComputationBuilder builder(client_, TestName());
-  Literal zero_literal = LiteralUtil::Zero(PrimitiveType::F32);
+  Literal zero_literal = Literal::Zero(PrimitiveType::F32);
 
   std::unique_ptr<GlobalData> zero_data =
       client_->TransferToServer(zero_literal).ConsumeValueOrDie();
@@ -782,7 +779,6 @@ TEST_F(ScalarComputationsTest, SqrtF320) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
index 36110da2478083a45d5d378935278de42d55d221..de89588042ec097180906f49fb5b0c4b1fe16edd 100644
--- a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
+++ b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -381,7 +380,6 @@ XLA_TEST_F(SelectAndScatterTest, R1F32OverlappingWindowMinScatter) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/select_test.cc b/tensorflow/compiler/xla/tests/select_test.cc
index 5eb4fee8ed28192a238efe2e6c9e1cad49a5f836..6b48116b6e1317eb23624242f1de656c3e7d48ca 100644
--- a/tensorflow/compiler/xla/tests/select_test.cc
+++ b/tensorflow/compiler/xla/tests/select_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -262,7 +261,6 @@ TEST_F(SelectTest, SelectR1F32WithScalarPredicateFalse) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/set_return_value_test.cc b/tensorflow/compiler/xla/tests/set_return_value_test.cc
index 25bb915be56560e9a4eb0ebce990f488fe074241..38fc27f200ce823c2385d9456f8754dfccb1525e 100644
--- a/tensorflow/compiler/xla/tests/set_return_value_test.cc
+++ b/tensorflow/compiler/xla/tests/set_return_value_test.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -102,7 +101,6 @@ TEST_F(SetReturnValueTest, SetValueMultipleTimesAndModify) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index 70345c300cc778d9a52ffb857b8a1df2531e8d30..c77e892665b2254bba16c57382d07e38e50a9be7 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -33,91 +32,45 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class SliceTest : public ClientLibraryTestBase {
- protected:
-  template <typename NativeT>
-  void RunSliceTenToTwo() {
-    std::vector<NativeT> constant;
-    constant.reserve(10);
-    for (int i = 0; i < 10; ++i) {
-      constant.push_back(static_cast<NativeT>(i));
-    }
-
-    ComputationBuilder builder(client_, TestName());
-    auto original = builder.ConstantR1<NativeT>(constant);
-    builder.Slice(original, {2}, {4}, {1});
-
-    const std::vector<NativeT> expected = {static_cast<NativeT>(2),
-                                           static_cast<NativeT>(3)};
-    ComputeAndCompareR1<NativeT>(&builder, expected, {});
-  }
-};
-
-XLA_TEST_F(SliceTest, SliceZeroToZeroF32) {
-  ComputationBuilder builder(client_, TestName());
-  auto original = builder.ConstantR1<float>({});
-  builder.Slice(original, {0}, {0}, {1});
-
-  ComputeAndCompareR1<float>(&builder, {}, {});
-}
-
-XLA_TEST_F(SliceTest, SliceTenToZeroF32) {
-  ComputationBuilder builder(client_, TestName());
-  std::vector<float> constant(10, 0.3);
-  auto original = builder.ConstantR1<float>(constant);
-  builder.Slice(original, {7}, {7}, {1});
-
-  ComputeAndCompareR1<float>(&builder, {}, {});
-}
-
-TEST_F(SliceTest, SliceTenToTwoF32) { RunSliceTenToTwo<float>(); }
-
-XLA_TEST_F(SliceTest, SliceTenToTwoF64) { RunSliceTenToTwo<double>(); }
-
-TEST_F(SliceTest, SliceTenToTwoU32) { RunSliceTenToTwo<uint32>(); }
-
-TEST_F(SliceTest, SliceTenToTwoS32) { RunSliceTenToTwo<int32>(); }
-
-XLA_TEST_F(SliceTest, SliceTenToTwoU64) { RunSliceTenToTwo<uint64>(); }
-
-XLA_TEST_F(SliceTest, SliceTenToTwoS64) { RunSliceTenToTwo<int64>(); }
+class SliceTest : public ClientLibraryTestBase {};
 
-TEST_F(SliceTest, SliceTenToTen) {
-  const std::vector<float> values = {0.0, 1.0, 2.0, 3.0, 4.0,
-                                     5.0, 6.0, 7.0, 8.0, 9.0};
+TEST_F(SliceTest, Slice3x3x3_To_3x3x1_F32) {
+  Array3D<float> values(3, 3, 3);
+  values.FillIota(0);
 
   ComputationBuilder builder(client_, TestName());
-  auto original = builder.ConstantR1<float>(values);
-  builder.Slice(original, {0}, {10}, {1});
+  auto original = builder.ConstantR3FromArray3D<float>(values);
+  builder.Slice(original, {0, 0, 0}, {3, 3, 1}, {1, 1, 1});
 
-  ComputeAndCompareR1<float>(&builder, values, {}, ErrorSpec(0.000001));
+  Array3D<float> expected{
+      {{0.0}, {3.0}, {6.0}}, {{9.0}, {12.0}, {15.0}}, {{18.0}, {21.0}, {24.0}}};
+  ComputeAndCompareR3<float>(&builder, expected, {}, ErrorSpec(0.000001));
 }
 
-TEST_F(SliceTest, SliceLastFourOf1024) {
-  std::vector<float> values(1024);
-  std::iota(values.begin(), values.end(), 0.0);
+TEST_F(SliceTest, Slice3x3x3_To_3x1x3_F32) {
+  Array3D<float> values(3, 3, 3);
+  values.FillIota(0);
 
   ComputationBuilder builder(client_, TestName());
-  auto original = builder.ConstantR1<float>(values);
-  builder.Slice(original, {1024 - 4}, {1024}, {1});
+  auto original = builder.ConstantR3FromArray3D<float>(values);
+  builder.Slice(original, {0, 0, 0}, {3, 1, 3}, {1, 1, 1});
 
-  const std::vector<float> expected = {1020, 1021, 1022, 1023};
-  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.000001));
+  Array3D<float> expected{
+      {{0.0, 1.0, 2.0}}, {{9.0, 10.0, 11.0}}, {{18.0, 19.0, 20.0}}};
+  ComputeAndCompareR3<float>(&builder, expected, {}, ErrorSpec(0.000001));
 }
 
-// TODO(b/28491443): Fix wrong result on CPU and GPU. Failed on
-// 2016-05-01. Also b/28508652
-TEST_F(SliceTest, DISABLED_SliceUnaligned1024In4096Values) {
-  std::vector<float> values(4096);
-  std::iota(values.begin(), values.end(), 0.0);
+TEST_F(SliceTest, Slice3x3x3_To_1x3x3_F32) {
+  Array3D<float> values(3, 3, 3);
+  values.FillIota(0);
 
   ComputationBuilder builder(client_, TestName());
-  auto original = builder.ConstantR1<float>(values);
-  builder.Slice(original, {7}, {7 + 1024}, {1});
+  auto original = builder.ConstantR3FromArray3D<float>(values);
+  builder.Slice(original, {0, 0, 0}, {1, 3, 3}, {1, 1, 1});
 
-  std::vector<float> expected(1024);
-  std::iota(values.begin(), values.end(), 7.0);
-  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.000001));
+  Array3D<float> expected{
+      {{{0.0, 1.0, 2.0}, {3.0, 4.0, 5.0}, {6.0, 7.0, 8.0}}}};
+  ComputeAndCompareR3<float>(&builder, expected, {}, ErrorSpec(0.000001));
 }
 
 XLA_TEST_F(SliceTest, Slice0x0to0x0F32) {
@@ -201,14 +154,78 @@ TEST_F(SliceTest, Slice_16x4_To_16x2) {
 TEST_F(SliceTest, SliceR4ThreeDimsMiddleMinor) {
   Array4D<float> values(2, 2, 24, 256);
   values.FillRandom(3.14f);
-  auto expected =
-      ReferenceUtil::Slice4D(values, {{1, 0, 8, 0}}, {{2, 2, 16, 128}});
+  auto expected = ReferenceUtil::Slice4D(
+      values, {{1, 0, 8, 0}}, {{2, 2, 16, 128}}, /*strides=*/{{1, 1, 1, 1}});
   ComputationBuilder builder(client_, TestName());
   auto original = builder.ConstantR4FromArray4D(values);
   builder.Slice(original, {1, 0, 8, 0}, {2, 2, 16, 128}, {1, 1, 1, 1});
   ComputeAndCompareR4(&builder, *expected, {}, ErrorSpec(0.000001));
 }
 
+struct R1Spec {
+  int64 input_dim0;
+  int64 slice_start;
+  int64 slice_limit;
+  int64 slice_stride;
+};
+
+// Parameterized test that generates R1 values, slices them according
+// to the R1Spec, and compares the result with a computed version.
+class SliceR1Test : public ClientLibraryTestBase,
+                    public ::testing::WithParamInterface<R1Spec> {
+ protected:
+  template <typename NativeT>
+  void Run(const R1Spec& spec) {
+    std::vector<NativeT> input(spec.input_dim0);
+    std::iota(input.begin(), input.end(), NativeT());
+
+    ComputationBuilder builder(client_, TestName());
+    auto original = builder.ConstantR1<NativeT>(input);
+    builder.Slice(original, {spec.slice_start}, {spec.slice_limit},
+                  {spec.slice_stride});
+
+    std::vector<NativeT> expected;
+    for (int i = spec.slice_start; i < spec.slice_limit;
+         i += spec.slice_stride) {
+      expected.push_back(i);
+    }
+
+    ComputeAndCompareR1<NativeT>(&builder, expected, {});
+  }
+};
+
+XLA_TEST_P(SliceR1Test, DoIt) {
+  Run<float>(GetParam());
+  Run<double>(GetParam());
+  Run<uint32>(GetParam());
+  Run<int32>(GetParam());
+  Run<uint64>(GetParam());
+  Run<int64>(GetParam());
+}
+
+INSTANTIATE_TEST_CASE_P(                  //
+    SliceR1TestInstantiation,             //
+    SliceR1Test,                          //
+    ::testing::Values(                    //
+        R1Spec{10, 0, 0, 1},              //
+        R1Spec{10, 7, 7, 1},              //
+        R1Spec{10, 2, 4, 1},              //
+        R1Spec{10, 2, 4, 1},              //
+        R1Spec{10, 2, 4, 1},              //
+        R1Spec{10, 2, 4, 1},              //
+        R1Spec{10, 2, 4, 1},              //
+        R1Spec{10, 2, 4, 1},              //
+        R1Spec{10, 0, 10, 1},             //
+        R1Spec{1024, 1024 - 4, 1024, 1},  //
+        R1Spec{4096, 7, 7 + 1024, 1},     //
+        R1Spec{10, 0, 10, 2},             //
+        R1Spec{10, 0, 10, 3},             //
+        R1Spec{10, 0, 10, 4},             //
+        R1Spec{10, 0, 10, 5},             //
+        R1Spec{10, 0, 10, 10}             //
+        )                                 //
+);
+
 struct R2Spec {
   int64 input_dim0;
   int64 input_dim1;
@@ -223,17 +240,17 @@ struct R2Spec {
 class SliceR2Test : public ClientLibraryTestBase,
                     public ::testing::WithParamInterface<R2Spec> {};
 
-TEST_P(SliceR2Test, DoIt) {
+XLA_TEST_P(SliceR2Test, DoIt) {
   const R2Spec& spec = GetParam();
   Array2D<int32> input(spec.input_dim0, spec.input_dim1);
   input.FillUnique();
 
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<int32>(input);
+  auto a = builder.ConstantR2FromArray2DWithLayout<int32>(input, spec.layout);
   builder.Slice(a, spec.slice_starts, spec.slice_limits, spec.slice_strides);
 
-  std::unique_ptr<Array2D<int32>> expected =
-      ReferenceUtil::Slice2D(input, spec.slice_starts, spec.slice_limits);
+  std::unique_ptr<Array2D<int32>> expected = ReferenceUtil::Slice2D(
+      input, spec.slice_starts, spec.slice_limits, spec.slice_strides);
   ComputeAndCompareR2<int32>(&builder, *expected, {});
 }
 
@@ -258,6 +275,18 @@ INSTANTIATE_TEST_CASE_P(
         R2Spec {384, 512, {{128, 256}}, {{256, 384}}, {{1, 1}},
           LayoutUtil::MakeLayout({1, 0})},
         R2Spec {357, 512, {{111, 256}}, {{301, 384}}, {{1, 1}},
+          LayoutUtil::MakeLayout({1, 0})},
+        R2Spec {10, 10, {{0, 0}}, {{10, 10}}, {{1, 2}},
+          LayoutUtil::MakeLayout({0, 1})},
+        R2Spec {10, 10, {{0, 0}}, {{10, 10}}, {{1, 2}},
+          LayoutUtil::MakeLayout({1, 0})},
+        R2Spec {10, 10, {{0, 0}}, {{10, 10}}, {{2, 1}},
+          LayoutUtil::MakeLayout({0, 1})},
+        R2Spec {10, 10, {{0, 0}}, {{10, 10}}, {{2, 1}},
+          LayoutUtil::MakeLayout({1, 0})},
+        R2Spec {10, 10, {{0, 0}}, {{10, 10}}, {{2, 2}},
+          LayoutUtil::MakeLayout({0, 1})},
+        R2Spec {10, 10, {{0, 0}}, {{10, 10}}, {{2, 2}},
           LayoutUtil::MakeLayout({1, 0})}
     )
 );
@@ -269,7 +298,6 @@ INSTANTIATE_TEST_CASE_P(
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/test_utils.h b/tensorflow/compiler/xla/tests/test_utils.h
index 6a23df4d3c35a17a56b4ce816f79eaa642831f90..f3a522b05ebae4f1f86d6d7ddbac6e1749d3e286 100644
--- a/tensorflow/compiler/xla/tests/test_utils.h
+++ b/tensorflow/compiler/xla/tests/test_utils.h
@@ -61,7 +61,7 @@ std::unique_ptr<Literal> CreateR2LiteralWithLayout(
   auto literal = MakeUnique<Literal>();
   const int64 d0 = values.size();
   const int64 d1 = values.begin()->size();
-  LiteralUtil::PopulateWithValue<NativeT>(0, {d0, d1}, literal.get());
+  literal.get()->PopulateWithValue<NativeT>(0, {d0, d1});
   *literal->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout(minor_to_major);
   TF_CHECK_OK(ShapeUtil::ValidateShape(literal->shape()));
@@ -70,7 +70,7 @@ std::unique_ptr<Literal> CreateR2LiteralWithLayout(
   for (auto inner_list : values) {
     int64 dim1 = 0;
     for (auto value : inner_list) {
-      LiteralUtil::Set(literal.get(), {dim0, dim1}, value);
+      literal.get()->Set({dim0, dim1}, value);
       ++dim1;
     }
     ++dim0;
@@ -88,7 +88,7 @@ std::unique_ptr<Literal> CreateR3LiteralWithLayout(
   const int64 d0 = values.size();
   const int64 d1 = values.begin()->size();
   const int64 d2 = values.begin()->begin()->size();
-  LiteralUtil::PopulateWithValue<NativeT>(0, {d0, d1, d2}, literal.get());
+  literal.get()->PopulateWithValue<NativeT>(0, {d0, d1, d2});
   *literal->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout(minor_to_major);
   TF_CHECK_OK(ShapeUtil::ValidateShape(literal->shape()));
@@ -99,7 +99,7 @@ std::unique_ptr<Literal> CreateR3LiteralWithLayout(
     for (auto inner_inner_list : inner_list) {
       int64 dim2 = 0;
       for (auto value : inner_inner_list) {
-        LiteralUtil::Set(literal.get(), {dim0, dim1, dim2}, value);
+        literal.get()->Set({dim0, dim1, dim2}, value);
         ++dim2;
       }
       ++dim1;
diff --git a/tensorflow/compiler/xla/tests/transpose_test.cc b/tensorflow/compiler/xla/tests/transpose_test.cc
index e4951c4201060ae01d48f438bc462191de372f0e..07c0f073e86ee204a90b1f138c8c6d90a5c6936a 100644
--- a/tensorflow/compiler/xla/tests/transpose_test.cc
+++ b/tensorflow/compiler/xla/tests/transpose_test.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -189,7 +188,6 @@ TEST_F(TransposeTest, TransposeConstant021_MultipleTilesPerLayer) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 6309e7129735aaaa81a14974ea52bd4cba219dc3..4a1c3fe9629218a0c3c8f5ccacd5500cedf73b61 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -40,6 +39,25 @@ class TupleTest : public ClientLibraryTestBase {
   ErrorSpec error_spec_{0.0001};
 };
 
+// Tests a tuple-shaped constant.
+XLA_TEST_F(TupleTest, TupleConstant) {
+  ComputationBuilder builder(client_, TestName());
+
+  const float constant_scalar = 7.3f;
+  std::initializer_list<float> constant_vector = {1.1f, 2.0f, 3.3f};
+  std::initializer_list<std::initializer_list<float>> constant_matrix = {
+      {1.1f, 2.2f, 3.5f},  // row 0
+      {4.8f, 5.0f, 6.7f},  // row 1
+  };
+  auto value =
+      Literal::MakeTuple({Literal::CreateR0<float>(constant_scalar).get(),
+                          Literal::CreateR1<float>(constant_vector).get(),
+                          Literal::CreateR2<float>(constant_matrix).get()});
+
+  auto result = builder.ConstantLiteral(*value);
+  ComputeAndCompareTuple(&builder, *value, {}, error_spec_);
+}
+
 // Tests the creation of tuple data.
 XLA_TEST_F(TupleTest, TupleCreate) {
   ComputationBuilder builder(client_, TestName());
@@ -54,10 +72,10 @@ XLA_TEST_F(TupleTest, TupleCreate) {
                                builder.ConstantR1<float>(constant_vector),
                                builder.ConstantR2<float>(constant_matrix)});
 
-  auto expected = LiteralUtil::MakeTuple(
-      {LiteralUtil::CreateR0<float>(constant_scalar).get(),
-       LiteralUtil::CreateR1<float>(constant_vector).get(),
-       LiteralUtil::CreateR2<float>(constant_matrix).get()});
+  auto expected =
+      Literal::MakeTuple({Literal::CreateR0<float>(constant_scalar).get(),
+                          Literal::CreateR1<float>(constant_vector).get(),
+                          Literal::CreateR2<float>(constant_matrix).get()});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
@@ -68,9 +86,8 @@ XLA_TEST_F(TupleTest, TupleCreateWithZeroElementEntry) {
   auto result = builder.Tuple(
       {builder.ConstantR0<float>(7.0), builder.ConstantR1<float>({})});
 
-  auto expected =
-      LiteralUtil::MakeTuple({LiteralUtil::CreateR0<float>(7.0).get(),
-                              LiteralUtil::CreateR1<float>({}).get()});
+  auto expected = Literal::MakeTuple({Literal::CreateR0<float>(7.0).get(),
+                                      Literal::CreateR1<float>({}).get()});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
@@ -78,7 +95,7 @@ XLA_TEST_F(TupleTest, TupleCreateWithZeroElementEntry) {
 XLA_TEST_F(TupleTest, EmptyTupleCreate) {
   ComputationBuilder builder(client_, TestName());
   auto result = builder.Tuple({});
-  auto expected = LiteralUtil::MakeTuple({});
+  auto expected = Literal::MakeTuple({});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
@@ -147,12 +164,37 @@ XLA_TEST_F(TupleTest, TupleGTEToTuple) {
                                    builder.ConstantR2<float>(constant_matrix)});
   auto new_tuple = builder.Tuple({builder.GetTupleElement(tuple_data, 1),
                                   builder.GetTupleElement(tuple_data, 0)});
-  auto expected = LiteralUtil::MakeTuple(
-      {LiteralUtil::CreateR2<float>(constant_matrix).get(),
-       LiteralUtil::CreateR1<float>(constant_vector).get()});
+  auto expected =
+      Literal::MakeTuple({Literal::CreateR2<float>(constant_matrix).get(),
+                          Literal::CreateR1<float>(constant_vector).get()});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
+XLA_TEST_F(TupleTest, SelectBetweenPredTuples) {
+  ComputationBuilder b(client_, TestName());
+  ComputationDataHandle v1, v2;
+
+  for (bool direction : {false, true}) {
+    std::unique_ptr<GlobalData> v1_data =
+        CreateR0Parameter<float>(0.0f, /*parameter_number=*/0, /*name=*/"v1",
+                                 /*builder=*/&b, /*data_handle=*/&v1);
+    std::unique_ptr<GlobalData> v2_data =
+        CreateR0Parameter<float>(1.0f, /*parameter_number=*/1, /*name=*/"v2",
+                                 /*builder=*/&b, /*data_handle=*/&v2);
+    auto v1_gt = b.Gt(v1, v2);             // false
+    auto v2_gt = b.Gt(v2, v1);             // true
+    auto v1_v2 = b.Tuple({v1_gt, v2_gt});  // {false, true}
+    auto v2_v1 = b.Tuple({v2_gt, v1_gt});  // {true, false}
+    auto select = b.Select(direction ? v1_gt : v2_gt, v1_v2, v2_v1);
+    auto expected =
+        Literal::MakeTuple({Literal::CreateR0<bool>(direction).get(),
+                            Literal::CreateR0<bool>(!direction).get()});
+
+    ComputeAndCompareTuple(&b, *expected, {v1_data.get(), v2_data.get()},
+                           error_spec_);
+  }
+}
+
 // Builds two new tuples from an existing tuple (by means of GetTupleElement),
 // then adds up the components of the new tuples.
 XLA_TEST_F(TupleTest, TupleGTEToTupleToGTEAdd) {
@@ -213,9 +255,8 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnFalse)) {
 
   auto select =
       builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
-  auto expected =
-      LiteralUtil::MakeTuple({LiteralUtil::CreateR1<float>(vec2).get(),
-                              LiteralUtil::CreateR1<float>(vec1).get()});
+  auto expected = Literal::MakeTuple({Literal::CreateR1<float>(vec2).get(),
+                                      Literal::CreateR1<float>(vec1).get()});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
@@ -259,9 +300,8 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnTrue)) {
 
   auto select =
       builder.Select(builder.ConstantR0<bool>(true), tuple12, tuple21);
-  auto expected =
-      LiteralUtil::MakeTuple({LiteralUtil::CreateR1<float>(vec1).get(),
-                              LiteralUtil::CreateR1<float>(vec2).get()});
+  auto expected = Literal::MakeTuple({Literal::CreateR1<float>(vec1).get(),
+                                      Literal::CreateR1<float>(vec2).get()});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
@@ -340,9 +380,8 @@ XLA_TEST_F(TupleTest,
 
   auto select =
       builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
-  auto expected =
-      LiteralUtil::MakeTuple({LiteralUtil::CreateR1<float>(vec2).get(),
-                              LiteralUtil::CreateR1<float>(vec1).get()});
+  auto expected = Literal::MakeTuple({Literal::CreateR1<float>(vec2).get(),
+                                      Literal::CreateR1<float>(vec1).get()});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
@@ -353,13 +392,13 @@ XLA_TEST_F(TupleTest, NestedTuples) {
   auto outer_tuple =
       builder.Tuple({inner_tuple, builder.ConstantR1<float>({22.0, 44.0})});
 
-  auto expected_v1 = LiteralUtil::CreateR1<float>({1.0, 2.0});
-  auto expected_s = LiteralUtil::CreateR0<float>(42.0);
+  auto expected_v1 = Literal::CreateR1<float>({1.0, 2.0});
+  auto expected_s = Literal::CreateR0<float>(42.0);
   auto expected_inner_tuple =
-      LiteralUtil::MakeTuple({expected_v1.get(), expected_s.get()});
-  auto expected_v2 = LiteralUtil::CreateR1<float>({22.0, 44.0});
+      Literal::MakeTuple({expected_v1.get(), expected_s.get()});
+  auto expected_v2 = Literal::CreateR1<float>({22.0, 44.0});
   auto expected =
-      LiteralUtil::MakeTuple({expected_inner_tuple.get(), expected_v2.get()});
+      Literal::MakeTuple({expected_inner_tuple.get(), expected_v2.get()});
 
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
@@ -379,14 +418,14 @@ XLA_TEST_F(TupleTest, GetTupleElementOfNestedTuple) {
 
   std::unique_ptr<GlobalData> data =
       client_
-          ->TransferToServer(*LiteralUtil::MakeTuple({
-              LiteralUtil::MakeTuple(
+          ->TransferToServer(*Literal::MakeTuple({
+              Literal::MakeTuple(
                   {
-                      LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0}).get(),
-                      LiteralUtil::CreateR1<float>({4.0, 5.0, 6.0}).get(),
+                      Literal::CreateR1<float>({1.0, 2.0, 3.0}).get(),
+                      Literal::CreateR1<float>({4.0, 5.0, 6.0}).get(),
                   })
                   .get(),
-              LiteralUtil::CreateR1<float>({7.0, 8.0, 9.0}).get(),
+              Literal::CreateR1<float>({7.0, 8.0, 9.0}).get(),
           }))
           .ConsumeValueOrDie();
 
@@ -401,7 +440,6 @@ XLA_TEST_F(TupleTest, GetTupleElementOfNestedTuple) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/unary_op_test.cc b/tensorflow/compiler/xla/tests/unary_op_test.cc
index 61110d5b4cdaea62aa9844a195ee95698bf1632e..d35d9ecdeb6661ff5d5c8940a0e9dcc609aeb9a2 100644
--- a/tensorflow/compiler/xla/tests/unary_op_test.cc
+++ b/tensorflow/compiler/xla/tests/unary_op_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -165,7 +164,6 @@ TEST_F(UnaryOpTest, SignAbsTestR2) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc
index 26a08953b1534044058a001a8c9a66e6ab6461b0..079dbb06117949c870f89e1a3258e31463aa28ec 100644
--- a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -221,7 +220,6 @@ TEST_F(VecOpsReduceTest, AddReduceR3F32AllDims) {
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
index efde45375fdbe8c0abbba0817f9d3062a118ab3c..b2e0c796bde46bac357635a0ab35dc521da7fde4 100644
--- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -441,7 +440,6 @@ XLA_TEST_F(VecOpsSimpleTest, VectorPredicateNotEqual) {
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 5f9177977449561acaec6f480937833ea0de3dd1..8a6c40a0f570d9d979beaa2c1e915004d742675e 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
@@ -82,6 +81,70 @@ TEST_F(WhileTest, WhileWithScalarResult) {
   ComputeAndCompareR0<int32>(&builder, 5, {});
 }
 
+TEST_F(WhileTest, WhileWithScalarResultNonConstInit) {
+  auto result_shape = ShapeUtil::MakeShape(S32, {});
+  auto orig_shape = ShapeUtil::MakeShape(S32, {2});
+
+  // Create a computation for the condition: repeat for 5 iterations.
+  Computation condition;
+  {
+    ComputationBuilder builder(client_, "condition");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    builder.Gt(builder.ConstantR0<int32>(5), prev);
+    condition = builder.Build().ConsumeValueOrDie();
+  }
+
+  // Create a computation for the body: add 1 to the result variable.
+  Computation body;
+  {
+    ComputationBuilder builder(client_, "body");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto input = builder.ConstantR0<int32>(1);
+    auto result = builder.Add(input, prev);
+    body = builder.Build().ConsumeValueOrDie();
+  }
+
+  // Create a While node with computations for the condition and the body.
+  ComputationBuilder builder(client_, TestName());
+  auto init = builder.Reduce(builder.ConstantR1<int32>(2, 1),
+                             builder.ConstantR0<int32>(0),
+                             CreateScalarAddComputation(S32, &builder), {0});
+  auto result = builder.While(condition, body, init);
+  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+
+  ComputeAndCompareR0<int32>(&builder, 5, {});
+}
+
+TEST_F(WhileTest, WhileWithPredicateResult) {
+  auto result_shape = ShapeUtil::MakeShape(PRED, {});
+
+  // Create a computation for the condition: run until condition is true.
+  Computation condition;
+  {
+    ComputationBuilder builder(client_, "condition");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    builder.Ne(builder.ConstantR0<bool>(true), prev);
+    condition = builder.Build().ConsumeValueOrDie();
+  }
+
+  // Create a computation for the body: or condition with true.
+  Computation body;
+  {
+    ComputationBuilder builder(client_, "body");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto result = builder.LogicalOr(prev, builder.ConstantR0<bool>(true));
+    body = builder.Build().ConsumeValueOrDie();
+  }
+
+  // Create a While node with computations for the condition and the body.
+  ComputationBuilder builder(client_, TestName());
+  auto init = builder.Ne(builder.ConstantR0<bool>(false),
+                         builder.ConstantR0<bool>(true));
+  auto result = builder.While(condition, body, init);
+
+  ComputeAndCompareR0<bool>(&builder, true, {});
+}
+
 // Tests a while node when the result type T is a vector.
 //
 // All constants are chosen to produce exact results.
@@ -240,15 +303,62 @@ TEST_F(WhileTest, WhileWithTupleResult) {
   VLOG(2) << "while = " << ShapeUtil::HumanString(
                                *builder.GetShape(result).ConsumeValueOrDie());
 
-  auto expected_counter = LiteralUtil::CreateR0<int32>(5);
-  auto expected_data = LiteralUtil::CreateR1<float>(
+  auto expected_counter = Literal::CreateR0<int32>(5);
+  auto expected_data = Literal::CreateR1<float>(
       {5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f});
   auto expected =
-      LiteralUtil::MakeTuple({expected_counter.get(), expected_data.get()});
+      Literal::MakeTuple({expected_counter.get(), expected_data.get()});
   VLOG(2) << "expected = " << ShapeUtil::HumanString(expected->shape());
   ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.0001));
 }
 
+TEST_F(WhileTest, WhileWithPredicateTupleResult) {
+  std::vector<Shape> shape_elements = {ShapeUtil::MakeShape(S32, {}),
+                                       ShapeUtil::MakeShape(PRED, {})};
+  Shape result_shape = ShapeUtil::MakeTupleShape(shape_elements);
+
+  // Create a computation for the condition.
+  // Repeat for 5 iterations.
+  Computation condition;
+  {
+    ComputationBuilder builder(client_, "condition");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    builder.Gt(builder.ConstantR0<int32>(5), iteration);
+    condition = builder.Build().ConsumeValueOrDie();
+  }
+
+  // Create a computation for the body.
+  // Add 1 to the iteration variable and or the predicate with true
+  Computation body;
+  {
+    ComputationBuilder builder(client_, "body");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    auto pred = builder.GetTupleElement(prev, 1);
+    auto new_pred = builder.LogicalOr(pred, builder.ConstantR0<bool>(true));
+    auto result = builder.Tuple(
+        {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_pred});
+    body = builder.Build().ConsumeValueOrDie();
+  }
+
+  // Create a While node with computations for the condition and the body.
+  ComputationBuilder builder(client_, "while");
+  auto init = builder.Tuple({builder.ConstantR0<int32>(0),
+                             builder.Ne(builder.ConstantR0<bool>(false),
+                                        builder.ConstantR0<bool>(true))});
+  auto result = builder.While(condition, body, init);
+  VLOG(2) << "while = "
+          << ShapeUtil::HumanString(
+                 *builder.GetShape(result).ConsumeValueOrDie());
+
+  auto expected_counter = Literal::CreateR0<int32>(5);
+  auto expected_predicate = Literal::CreateR0<bool>(true);
+  auto expected =
+      Literal::MakeTuple({expected_counter.get(), expected_predicate.get()});
+  ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0));
+}
+
 // Tests two while nodes when the result type T is a Tuple and the second
 // while node uses the result of the first while node which is used in two
 // nodes.
@@ -277,7 +387,7 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) {
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c1));
-    TF_ASSIGN_OR_ASSERT_OK(condition, builder.Build());
+    TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
   Computation condition2;
@@ -287,7 +397,7 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) {
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c2));
-    TF_ASSIGN_OR_ASSERT_OK(condition2, builder.Build());
+    TF_ASSERT_OK_AND_ASSIGN(condition2, builder.Build());
   }
 
   // Create a computation for the body.
@@ -303,7 +413,7 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) {
     auto new_weights = builder.Add(weights, input);
     auto result = builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
-    TF_ASSIGN_OR_ASSERT_OK(body, builder.Build());
+    TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
   Computation body2;
@@ -316,7 +426,7 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) {
     auto new_weights = builder.Add(weights, input);
     auto result = builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
-    TF_ASSIGN_OR_ASSERT_OK(body2, builder.Build());
+    TF_ASSERT_OK_AND_ASSIGN(body2, builder.Build());
   }
 
   // Create a While node with computations for the condition and the body.
@@ -356,7 +466,7 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c1));
-    TF_ASSIGN_OR_ASSERT_OK(condition, builder.Build());
+    TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
   Computation condition2;
@@ -366,7 +476,7 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c2));
-    TF_ASSIGN_OR_ASSERT_OK(condition2, builder.Build());
+    TF_ASSERT_OK_AND_ASSIGN(condition2, builder.Build());
   }
 
   // Create a computation for the body.
@@ -382,7 +492,7 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
     auto new_weights = builder.Add(weights, input);
     auto result = builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
-    TF_ASSIGN_OR_ASSERT_OK(body, builder.Build());
+    TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
   // Create a While node with computations for the condition and the body.
@@ -423,7 +533,7 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) {
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c1));
-    TF_ASSIGN_OR_ASSERT_OK(condition, builder.Build());
+    TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
   Computation condition2;
@@ -433,7 +543,7 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) {
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c2));
-    TF_ASSIGN_OR_ASSERT_OK(condition2, builder.Build());
+    TF_ASSERT_OK_AND_ASSIGN(condition2, builder.Build());
   }
 
   // Create a computation for the body.
@@ -449,7 +559,7 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) {
     auto new_weights = builder.Add(weights, input);
     auto result = builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
-    TF_ASSIGN_OR_ASSERT_OK(body, builder.Build());
+    TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
   // Create a While node with computations for the condition and the body.
@@ -525,11 +635,11 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
           << ShapeUtil::HumanString(
                  *builder.GetShape(result).ConsumeValueOrDie());
 
-  auto expected_counter = LiteralUtil::CreateR0<int32>(5);
-  auto expected_data = LiteralUtil::CreateR1<float>(
+  auto expected_counter = Literal::CreateR0<int32>(5);
+  auto expected_data = Literal::CreateR1<float>(
       {1.0f, 1.0f, 2.0f, 2.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f});
   auto expected =
-      LiteralUtil::MakeTuple({expected_counter.get(), expected_data.get()});
+      Literal::MakeTuple({expected_counter.get(), expected_data.get()});
   VLOG(2) << "expected = " << ShapeUtil::HumanString(expected->shape());
   ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.0001));
 }
@@ -587,11 +697,11 @@ TEST_F(WhileTest, WhileWithPrngScalarResult) {
   };
 
   for (int i = 1; i < 4; ++i) {
-    TF_ASSIGN_OR_ASSERT_OK(auto computation, while_loop(i));
+    TF_ASSERT_OK_AND_ASSIGN(auto computation, while_loop(i));
 
-    ExecutionOptions execution_options;
+    ExecutionOptions execution_options = execution_options_;
     execution_options.set_seed(65);
-    TF_ASSIGN_OR_ASSERT_OK(
+    TF_ASSERT_OK_AND_ASSIGN(
         auto result,
         client_->ExecuteAndTransfer(computation, {}, &execution_options));
   }
@@ -743,7 +853,6 @@ BENCHMARK(BM_WhileLoop);
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
   xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
-  xla::legacy_flags::AppendCpuCompilerFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/text_literal_reader.cc b/tensorflow/compiler/xla/text_literal_reader.cc
index 7876272467890b56c2cca71f64e66303eb8ac632..4d060895d357493327ec50b38016478c65fef94d 100644
--- a/tensorflow/compiler/xla/text_literal_reader.cc
+++ b/tensorflow/compiler/xla/text_literal_reader.cc
@@ -104,8 +104,7 @@ StatusOr<std::unique_ptr<Literal>> TextLiteralReader::ReadAllLines() {
 
   auto result = MakeUnique<Literal>();
   const float fill = std::numeric_limits<float>::quiet_NaN();
-  LiteralUtil::PopulateWithValue<float>(fill, AsInt64Slice(shape.dimensions()),
-                                        result.get());
+  result->PopulateWithValue<float>(fill, AsInt64Slice(shape.dimensions()));
   std::vector<tensorflow::StringPiece> pieces;
   std::vector<tensorflow::StringPiece> coordinates;
   std::vector<int64> coordinate_values;
@@ -147,7 +146,7 @@ StatusOr<std::unique_ptr<Literal>> TextLiteralReader::ReadAllLines() {
           "\"%s\"",
           shape.dimensions_size(), coordinate_values.size(), line.c_str());
     }
-    LiteralUtil::Set<float>(result.get(), coordinate_values, value);
+    result->Set<float>(coordinate_values, value);
   }
   return std::move(result);
 }
diff --git a/tensorflow/compiler/xla/text_literal_reader_test.cc b/tensorflow/compiler/xla/text_literal_reader_test.cc
index a167d80f73b0273739e22d94be8d90ab00839dc9..23070b663870a2b78b38663e09a32fcb28d9c2dc 100644
--- a/tensorflow/compiler/xla/text_literal_reader_test.cc
+++ b/tensorflow/compiler/xla/text_literal_reader_test.cc
@@ -46,12 +46,12 @@ TEST(TextLiteralReaderTest, ReadsR3File) {
       TextLiteralReader::ReadPath(fname).ConsumeValueOrDie();
   EXPECT_TRUE(
       ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {1, 2, 3}), literal->shape()));
-  EXPECT_EQ(42.5, LiteralUtil::Get<float>(*literal, {0, 0, 0}));
-  EXPECT_EQ(43.5, LiteralUtil::Get<float>(*literal, {0, 0, 1}));
-  EXPECT_EQ(44.5, LiteralUtil::Get<float>(*literal, {0, 0, 2}));
-  EXPECT_EQ(45.5, LiteralUtil::Get<float>(*literal, {0, 1, 0}));
-  EXPECT_EQ(46.5, LiteralUtil::Get<float>(*literal, {0, 1, 1}));
-  EXPECT_EQ(47.5, LiteralUtil::Get<float>(*literal, {0, 1, 2}));
+  EXPECT_EQ(42.5, literal->Get<float>({0, 0, 0}));
+  EXPECT_EQ(43.5, literal->Get<float>({0, 0, 1}));
+  EXPECT_EQ(44.5, literal->Get<float>({0, 0, 2}));
+  EXPECT_EQ(45.5, literal->Get<float>({0, 1, 0}));
+  EXPECT_EQ(46.5, literal->Get<float>({0, 1, 1}));
+  EXPECT_EQ(47.5, literal->Get<float>({0, 1, 2}));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/text_literal_writer.cc b/tensorflow/compiler/xla/text_literal_writer.cc
index a5097e41cb3cb3fe1c10e3c21c00c2242087deba..3fee467594d8423c707abf07a0622a738437830a 100644
--- a/tensorflow/compiler/xla/text_literal_writer.cc
+++ b/tensorflow/compiler/xla/text_literal_writer.cc
@@ -45,9 +45,9 @@ namespace xla {
 
   tensorflow::Status status;
   tensorflow::WritableFile* f_ptr = f.get();
-  LiteralUtil::EachCellAsString(
-      literal, [f_ptr, &status](tensorflow::gtl::ArraySlice<int64> indices,
-                                const string& value) {
+  literal.EachCellAsString(
+      [f_ptr, &status](tensorflow::gtl::ArraySlice<int64> indices,
+                       const string& value) {
         if (!status.ok()) {
           return;
         }
diff --git a/tensorflow/compiler/xla/text_literal_writer_test.cc b/tensorflow/compiler/xla/text_literal_writer_test.cc
index 177ae4ea036af660b7a2be1d4082b30ca8fb9fac..70cf2fb1b8a1b4f2ecfdaeaef3a00ddc974e2652 100644
--- a/tensorflow/compiler/xla/text_literal_writer_test.cc
+++ b/tensorflow/compiler/xla/text_literal_writer_test.cc
@@ -30,7 +30,7 @@ namespace xla {
 namespace {
 
 TEST(TextLiteralWriterTest, WritesFloatLiteral) {
-  auto literal = LiteralUtil::CreateR2<float>({
+  auto literal = Literal::CreateR2<float>({
       {3.14, 2.17}, {1.23, 4.56},
   });
   string path =
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 535e5b605b4f68671c9b6a8af4a12732f88e744e..4bbe0ba0ddd93b59557d3a4c6007ed9d2f8b7c11 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -36,7 +36,7 @@ cc_library(
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:service_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service",
         "//tensorflow/compiler/xla/service:session_proto",
         "//tensorflow/core:lib",
@@ -187,7 +187,7 @@ cc_binary(
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/legacy_flags:hlo_graph_dumper_flags",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service",
         "//tensorflow/compiler/xla/service:hlo_graph_dumper",
         "//tensorflow/compiler/xla/service:session_proto",
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
index 10efa9f3e8d856493b2db23195188da6fba65244..7861c3a9b72e85cba8907c82a9d36d0fe39889c2 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
@@ -32,7 +32,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/service_flags.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -53,8 +53,12 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
     TF_CHECK_OK(
         tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module));
     Computation computation = client->LoadSnapshot(module).ConsumeValueOrDie();
+    DebugOptions debug_options = legacy_flags::GetDebugOptionsFromFlags();
+    debug_options.set_xla_generate_hlo_graph(".*");
+    debug_options.set_xla_hlo_graph_layout(true);
     ComputationStats stats =
-        client->GetComputationStats(computation).ConsumeValueOrDie();
+        client->GetComputationStats(computation, debug_options)
+            .ConsumeValueOrDie();
     fprintf(stdout, ">>> %s :: %s\n", arg, stats.DebugString().c_str());
   }
 }
@@ -63,12 +67,16 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
 }  // namespace xla
 
 int main(int argc, char** argv) {
+  std::vector<tensorflow::Flag> flag_list;
+  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
+  xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    LOG(ERROR) << "\n" << usage;
+    return 2;
+  }
   tensorflow::port::InitMain(argv[0], &argc, &argv);
 
-  xla::legacy_flags::ServiceFlags* flags = xla::legacy_flags::GetServiceFlags();
-  flags->xla_generate_hlo_graph = ".*";
-  flags->xla_hlo_graph_layout = true;
-
   tensorflow::gtl::ArraySlice<char*> args(argv, argc);
   args.pop_front();  // Pop off the binary name, argv[0]
   xla::tools::RealMain(args);
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
index 850267d3195785a96bf8d2c80fe64fdb8aae0a91..51f90b07c66f7d839f587350726333b9dbe6a9f0 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
@@ -30,7 +30,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -52,8 +52,12 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
     TF_CHECK_OK(
         tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module));
     Computation computation = client->LoadSnapshot(module).ConsumeValueOrDie();
+    DebugOptions debug_options = legacy_flags::GetDebugOptionsFromFlags();
+    debug_options.set_xla_generate_hlo_graph(".*");
+    debug_options.set_xla_hlo_dump_as_graphdef(true);
     ComputationStats stats =
-        client->GetComputationStats(computation).ConsumeValueOrDie();
+        client->GetComputationStats(computation, debug_options)
+            .ConsumeValueOrDie();
     fprintf(stdout, ">>> %s :: %s\n", arg, stats.DebugString().c_str());
   }
 }
@@ -62,14 +66,16 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
 }  // namespace xla
 
 int main(int argc, char** argv) {
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
-
-  xla::legacy_flags::ServiceFlags* flags = xla::legacy_flags::GetServiceFlags();
-  flags->xla_generate_hlo_graph = ".*";
+  std::vector<tensorflow::Flag> flag_list;
+  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
+  xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    LOG(ERROR) << "\n" << usage;
+    return 2;
+  }
 
-  xla::legacy_flags::HloGraphDumperFlags* dumper_flags =
-      xla::legacy_flags::GetHloGraphDumperFlags();
-  dumper_flags->xla_hlo_dump_as_graphdef = true;
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
 
   tensorflow::gtl::ArraySlice<char*> args(argv, argc);
   args.pop_front();  // Pop off the binary name, argv[0]
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index 3a75bf6495415e569aafce1eccc843cc95f9f7fa..6228ca34c0835a7476e45037c9bb6373ee1750dd 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -98,11 +98,11 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool use_fake_data) {
     std::unique_ptr<Literal> result = result_status.ConsumeValueOrDie();
     fprintf(stdout, "%s: %s :: %s:%s\n", arg, module.entry().name().c_str(),
             ShapeUtil::HumanString(result->shape()).c_str(),
-            LiteralUtil::ToString(*result).c_str());
+            result->ToString().c_str());
     if (module.has_result()) {
       fprintf(stdout, "was %s:%s\n",
               ShapeUtil::HumanString(module.result().shape()).c_str(),
-              LiteralUtil::ToString(Literal(module.result())).c_str());
+              Literal(module.result()).ToString().c_str());
     }
   }
 }
diff --git a/tensorflow/compiler/xla/tools/show_literal.cc b/tensorflow/compiler/xla/tools/show_literal.cc
index b6538f5de07743ef7320343d6b23119e919d114f..b50cb5e28eac14ed99af566939f8bd64e393ff64 100644
--- a/tensorflow/compiler/xla/tools/show_literal.cc
+++ b/tensorflow/compiler/xla/tools/show_literal.cc
@@ -42,5 +42,5 @@ int main(int argc, char **argv) {
                                           &literal_proto));
   xla::Literal literal(literal_proto);
   LOG(INFO) << "literal: " << literal_proto.ShortDebugString();
-  fprintf(stderr, "%s\n", xla::LiteralUtil::ToString(literal).c_str());
+  fprintf(stderr, "%s\n", literal.ToString().c_str());
 }
diff --git a/tensorflow/compiler/xla/tools/show_text_literal.cc b/tensorflow/compiler/xla/tools/show_text_literal.cc
index 2d983b407c64ab5547d722abcc2c564a7963f730..bbe9902aa17a585c4bad5b732330305dfdd45302 100644
--- a/tensorflow/compiler/xla/tools/show_text_literal.cc
+++ b/tensorflow/compiler/xla/tools/show_text_literal.cc
@@ -40,7 +40,7 @@ int main(int argc, char **argv) {
       xla::TextLiteralReader::ReadPath(argv[1]).ConsumeValueOrDie();
 
   LOG(INFO) << "literal: " << literal->ShortDebugString();
-  fprintf(stderr, "%s\n", xla::LiteralUtil::ToString(*literal).c_str());
+  fprintf(stderr, "%s\n", literal->ToString().c_str());
   if (literal->shape().element_type() == xla::F32) {
     float min =
         *std::min_element(literal->f32s().begin(), literal->f32s().end());
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index d467178cb528a93b2c1030fc72d054cc0edf95b6..1ecdb9852d84175dbe30878022519cd62f54747c 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -15,9 +15,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/util.h"
 
+#include <numeric>
 #include <stdarg.h>
+#include <numeric>
 
-#include "tensorflow/compiler/xla/legacy_flags/util_flags.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -30,18 +31,12 @@ limitations under the License.
 namespace xla {
 namespace {
 
-// Adds a backtrace to the provided status iff the xla_status_add_backtrace flag
-// is set. This is useful for quickly tracing status errors observed coming out
-// of the service.
-Status MaybeAddBacktrace(const Status& prior) {
-  DCHECK(!prior.ok());
-  if (legacy_flags::GetUtilFlags()->xla_status_add_backtrace) {
-    return Status{prior.code(),
-                  tensorflow::strings::StrCat(prior.error_message(), " :: ",
-                                              tensorflow::CurrentStackTrace())};
-  } else {
-    return prior;
-  }
+// Logs the provided status message with a backtrace.
+Status WithLogBacktrace(const Status& status) {
+  CHECK(!status.ok());
+  VLOG(1) << status.ToString();
+  VLOG(1) << tensorflow::CurrentStackTrace();
+  return status;
 }
 
 }  // namespace
@@ -84,7 +79,7 @@ Status InvalidArgument(const char* format, ...) {
   va_start(args, format);
   tensorflow::strings::Appendv(&message, format, args);
   va_end(args);
-  return MaybeAddBacktrace(tensorflow::errors::InvalidArgument(message));
+  return WithLogBacktrace(tensorflow::errors::InvalidArgument(message));
 }
 
 Status Unimplemented(const char* format, ...) {
@@ -93,7 +88,7 @@ Status Unimplemented(const char* format, ...) {
   va_start(args, format);
   tensorflow::strings::Appendv(&message, format, args);
   va_end(args);
-  return MaybeAddBacktrace(tensorflow::errors::Unimplemented(message));
+  return WithLogBacktrace(tensorflow::errors::Unimplemented(message));
 }
 
 Status InternalError(const char* format, ...) {
@@ -102,7 +97,7 @@ Status InternalError(const char* format, ...) {
   va_start(args, format);
   tensorflow::strings::Appendv(&message, format, args);
   va_end(args);
-  return MaybeAddBacktrace(tensorflow::errors::Internal(message));
+  return WithLogBacktrace(tensorflow::errors::Internal(message));
 }
 
 Status FailedPrecondition(const char* format, ...) {
@@ -111,7 +106,7 @@ Status FailedPrecondition(const char* format, ...) {
   va_start(args, format);
   tensorflow::strings::Appendv(&message, format, args);
   va_end(args);
-  return MaybeAddBacktrace(tensorflow::errors::FailedPrecondition(message));
+  return WithLogBacktrace(tensorflow::errors::FailedPrecondition(message));
 }
 
 Status ResourceExhausted(const char* format, ...) {
@@ -120,7 +115,7 @@ Status ResourceExhausted(const char* format, ...) {
   va_start(args, format);
   tensorflow::strings::Appendv(&message, format, args);
   va_end(args);
-  return MaybeAddBacktrace(tensorflow::errors::ResourceExhausted(message));
+  return WithLogBacktrace(tensorflow::errors::ResourceExhausted(message));
 }
 
 Status NotFound(const char* format, ...) {
@@ -129,7 +124,7 @@ Status NotFound(const char* format, ...) {
   va_start(args, format);
   tensorflow::strings::Appendv(&message, format, args);
   va_end(args);
-  return MaybeAddBacktrace(tensorflow::errors::NotFound(message));
+  return WithLogBacktrace(tensorflow::errors::NotFound(message));
 }
 
 Status Unavailable(const char* format, ...) {
@@ -138,7 +133,7 @@ Status Unavailable(const char* format, ...) {
   va_start(args, format);
   tensorflow::strings::Appendv(&message, format, args);
   va_end(args);
-  return MaybeAddBacktrace(tensorflow::errors::Unavailable(message));
+  return WithLogBacktrace(tensorflow::errors::Unavailable(message));
 }
 
 string Reindent(tensorflow::StringPiece original,
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 42d5c1d15501fb912551a044414e6fa0c83283b8..00151e5da6b7aae79793ccb0f3df49531b417aa9 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -195,16 +195,24 @@ bool IsPermutation(tensorflow::gtl::ArraySlice<int64> permutation, int64 rank);
 // 2. permutation.size() == input.size().
 template <template <typename...> class C, typename T>
 std::vector<T> Permute(tensorflow::gtl::ArraySlice<int64> permutation,
-                       C<T> input_) {
-  tensorflow::gtl::ArraySlice<T> input(input_);
-  CHECK(IsPermutation(permutation, input.size()));
-  std::vector<T> output(input.size());
+                       C<T> input) {
+  tensorflow::gtl::ArraySlice<T> data(input);
+  CHECK(IsPermutation(permutation, data.size()));
+  std::vector<T> output(data.size());
   for (size_t i = 0; i < permutation.size(); ++i) {
-    output[permutation[i]] = input[i];
+    output[permutation[i]] = data[i];
   }
   return output;
 }
 
+// Override of the above that works around compile failures with gcc 7.1.1.
+// For details see https://github.com/tensorflow/tensorflow/issues/10843
+template <typename T>
+std::vector<T> Permute(tensorflow::gtl::ArraySlice<int64> permutation,
+                       const std::vector<T>& input) {
+  return Permute<std::vector, T>(permutation, input);
+}
+
 // Inverts a permutation, i.e., output_permutation[input_permutation[i]] = i.
 std::vector<int64> InversePermutation(
     tensorflow::gtl::ArraySlice<int64> input_permutation);
@@ -336,18 +344,19 @@ std::vector<std::pair<int64, int64>> CommonFactors(
 
 }  // namespace xla
 
-#define XLA_LOG_LINES(SEV, STRING) LogLines(SEV, STRING, __FILE__, __LINE__)
+#define XLA_LOG_LINES(SEV, STRING) \
+  ::xla::LogLines(SEV, STRING, __FILE__, __LINE__)
 
-#define XLA_VLOG_LINES(LEVEL, STRING)                               \
-  do {                                                              \
-    if (VLOG_IS_ON(LEVEL)) XLA_LOG_LINES(tensorflow::INFO, STRING); \
+#define XLA_VLOG_LINES(LEVEL, STRING)                                 \
+  do {                                                                \
+    if (VLOG_IS_ON(LEVEL)) XLA_LOG_LINES(::tensorflow::INFO, STRING); \
   } while (false);
 
 // Utility macro that performs the equivalent of what one would expect
 // LOG_LINES(FATAL, X) to do but can be used at the end of a function that
 // returns a value without getting a compiler warning that no value is returned.
-#define XLA_FATAL_LOG(X)               \
-  XLA_LOG_LINES(tensorflow::ERROR, X); \
+#define XLA_FATAL_LOG(X)                 \
+  XLA_LOG_LINES(::tensorflow::ERROR, X); \
   LOG(FATAL) << "Aborting in " << __FUNCTION__ << " due to previous errors.";
 
 #endif  // TENSORFLOW_COMPILER_XLA_UTIL_H_
diff --git a/tensorflow/compiler/xla/xla.bzl b/tensorflow/compiler/xla/xla.bzl
index bdd3dfe82d1546da96cbcd946f3521517a253dc1..f44cbefe228de952b8dabc4ae3e7b8cfc6265105 100644
--- a/tensorflow/compiler/xla/xla.bzl
+++ b/tensorflow/compiler/xla/xla.bzl
@@ -1,15 +1,15 @@
 """Wrapper around cc_proto_library used inside the XLA codebase."""
 
-load("@protobuf//:protobuf.bzl", "cc_proto_library")
+load("@protobuf_archive//:protobuf.bzl", "cc_proto_library")
 
 # xla_proto_library() is a convenience wrapper around cc_proto_library.
 def xla_proto_library(name, srcs=[], deps=[], visibility=None, testonly=0):
   cc_proto_library(name=name,
                    srcs=srcs,
                    deps=deps,
-                   cc_libs = ["@protobuf//:protobuf"],
-                   protoc="@protobuf//:protoc",
-                   default_runtime="@protobuf//:protobuf",
+                   cc_libs = ["@protobuf_archive//:protobuf"],
+                   protoc="@protobuf_archive//:protoc",
+                   default_runtime="@protobuf_archive//:protobuf",
                    testonly=testonly,
                    visibility=visibility,)
 
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 4c3cd321f680f36ac3b4edb546fb1f0415a0f9bd..be4e00f63cc9991fcae233d44614e1cc44b68873 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -20,6 +20,24 @@ import "tensorflow/compiler/xla/service/session.proto";
 
 package xla;
 
+// Options for the HLO insert-reduce-precision-operations pass.
+message HloReducePrecisionOptions {
+  // When to run the pass.
+  enum PassTiming {
+    BEFORE_OP_FUSION = 0;
+    AFTER_OP_FUSION = 1;
+  }
+  PassTiming pass_timing = 1;
+
+  // Exponent and mantissa bit counts for the reduced precision.
+  uint32 exponent_bits = 2;
+  uint32 mantissa_bits = 3;
+
+  // Opcodes for operations that should be suffixed with reduced-precision
+  // operations.
+  repeated uint32 opcodes_to_suffix = 4;
+}
+
 // Debugging options for XLA. These options may change at any time - there are
 // no guarantees about backward or forward compatibility for these fields.
 message DebugOptions {
@@ -28,13 +46,45 @@ message DebugOptions {
   // dump *all* HLO modules.
   string xla_generate_hlo_graph = 1;
 
+  // Show addresses of HLO ops in graph dump.
+  bool xla_hlo_graph_addresses = 2;
+
+  // Show layout of HLO ops in graph dump.
+  bool xla_hlo_graph_layout = 3;
+
+  // Path to dump HLO graphs to.
+  string xla_hlo_graph_path = 4;
+
+  // Dump HLO graphs as TensorFlow GraphDefs.
+  bool xla_hlo_dump_as_graphdef = 5;
+
+  // HLO modules matching this regex will be dumped to LOG(INFO). Set to ".*" to
+  // dump *all* HLO modules.
+  string xla_log_hlo_text = 6;
+
+  // Dump all HLO modules as text into the provided directory path.
+  string xla_generate_hlo_text_to = 7;
+
+  // Dump compilation artifacts as JSON into this directory.
+  string xla_dump_debug_json_to = 8;
+
+  // Instrument the computation to collect per-HLO cycle counts.
+  bool xla_hlo_profile = 9;
+
+  // Dumps computations that XLA executes into the provided directory path.
+  string xla_dump_computations_to = 10;
+
+  // Dumps parameters and results of computations that XLA executes into the
+  // provided directory path.
+  string xla_dump_executions_to = 11;
+
   // List of HLO passes to disable. These names must exactly match the pass
   // names as specified by the HloPassInterface::name() method.
-  repeated string xla_disable_hlo_passes = 2;
+  repeated string xla_disable_hlo_passes = 30;
 
   // Numerical optimization level for the XLA compiler backend; the specific
   // interpretation of this value is left to the backends.
-  int32 xla_backend_optimization_level = 3;
+  int32 xla_backend_optimization_level = 31;
 
   // When true, "unsafe" mathematical optimizations are enabled. These
   // transformations include but are not limited to:
@@ -43,11 +93,63 @@ message DebugOptions {
   //    function, or transforming x/y into x * (1/y)).
   //  - Assuming that operations never produce or consume NaN or +/- Inf.
   //  - Assuming that +0 and -0 are indistinguishable.
-  bool xla_enable_fast_math = 4;
+  bool xla_enable_fast_math = 32;
+
+  // Embed the compiler IR as a string in the executable.
+  bool xla_embed_ir_in_executable = 33;
+
+  // Dump the compiler IR into this directory as individual files.
+  string xla_dump_ir_to = 34;
+
+  // Eliminate implicit broadcasts when lowering user computations to HLO
+  // instructions; use explicit broadcast instead.
+  bool xla_eliminate_hlo_implicit_broadcast = 35;
+
+  // When generating calls to Eigen in the CPU backend, use multi-threaded Eigen
+  // mode.
+  bool xla_cpu_multi_thread_eigen = 60;
+
+  // Path to directory with cuda/ptx tools and libraries.
+  string xla_gpu_cuda_data_dir = 61;
+
+  // Enable flush-to-zero semantics in the GPU backend.
+  bool xla_gpu_ftz = 62;
+
+  // Disable multi-streaming in the GPU backend.
+  bool xla_gpu_disable_multi_streaming = 63;
+
+  // If true, in LLVM-based backends, emit !alias.scope metadata in
+  // generated IR.
+  bool xla_llvm_enable_alias_scope_metadata = 70;
+
+  // If true, in LLVM-based backends, emit !noalias metadata in the
+  // generated IR.
+  bool xla_llvm_enable_noalias_metadata = 71;
+
+  // If true, in LLVM-based backends, emit !invariant.load metadata in
+  // the generated IR.
+  bool xla_llvm_enable_invariant_load_metadata = 72;
+
+  // Options for inserting reduce-precision operations for numerical
+  // experimentation.  This is a repeated field, as we may want to have
+  // multiple passes with different parameters.
+  repeated HloReducePrecisionOptions hlo_reduce_precision_options = 80;
+
+  // This is used by ClientLibraryTestBase::ComputeAndCompare*. If true, the
+  // computation will run n! times with all permunations of layouts for the
+  // output shape in rank n. For example, with a 3D shape, all permutations of
+  // the set {0, 1, 2} are tried.
+  bool xla_test_all_output_layouts = 90;
+
+  // This is used by ClientLibraryTestBase::ComputeAndCompare*. If true, the
+  // computation will run for all permunations of layouts of all input
+  // arguments. For example, with 2 input arguments in 2D and 4D shapes, the
+  // computation will run 2! * 4! times.
+  bool xla_test_all_input_layouts = 91;
 
   // Extra options to pass to the compilation backend; specific interpretation
   // of these values is left to the backend.
-  map<string, string> xla_backend_extra_options = 5;
+  map<string, string> xla_backend_extra_options = 500;
 }
 
 // These settings control how XLA compiles and/or runs code.  Not all settings
@@ -147,6 +249,7 @@ message ResetDeviceResponse {
 
 message ComputationStatsRequest {
   ComputationHandle computation = 1;
+  DebugOptions debug_options = 2;
 }
 
 message ComputationStatsResponse {
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 9470e6c3b261e4cdd22578ef4515d92b55b8b5b7..166f31c9fb7205dd9600381dc5d69e430b545587 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -53,7 +53,8 @@ enum PrimitiveType {
   // computation; e.g. a computation that returns weights and biases may have a
   // signature that results in a tuple like (f32[784x2000], f32[2000])
   //
-  // Tuples are currently special in that they may only be rank 0.
+  // If a shape proto has the tuple element type, it may not have any entries
+  // in the dimensions field.
   TUPLE = 13;
 
   // An opaque type used for passing context specific data to a custom
@@ -199,7 +200,7 @@ message OpMetadata {
   string op_name = 2;
   // Indicate a file and line that this op is associated to in a user's program.
   //
-  // e.g. it could be be the file and line of user code that generated the op.
+  // e.g. it could be the file and line of user code that generated the op.
   string source_file = 3;
   int32 source_line = 4;
 }
@@ -255,11 +256,15 @@ message ComputationDataHandle {
   int64 handle = 1;
 }
 
-// Handle given to a user that represents a device to execute a computation.
-// When replication is enabled, the device handle represents the device for the
-// replica id 0.
+// Handle given to a user that represents a replicated virtual device. Each
+// replicated device represents N physical devices for execution where N is the
+// number of replicas.
 message DeviceHandle {
   int64 handle = 1;
+
+  // The number of model-parallel virtual devices that communicate via XLA
+  // Send/Recv instructions.
+  int64 device_count = 2;
 }
 
 // Handle given to a user to represent a channel between two computations
@@ -269,6 +274,21 @@ message ChannelHandle {
   int64 handle = 1;
 }
 
+// DeviceAssignmentProto is a serialized form of DeviceAssignment class, which
+// represents the device ids assigned to a set of replicated computations.
+// See xla::DeviceAssignment class comment for more details.
+message DeviceAssignmentProto {
+  int32 replica_count = 1;
+  int32 computation_count = 2;
+
+  // Each logical computation runs on replica_count physical devices.
+  // ComputationDevice represents the device ids assinged to the replicas.
+  message ComputationDevice {
+    repeated int32 replica_device_ids = 1;
+  }
+  repeated ComputationDevice computation_devices = 3;
+}
+
 // Literals are used when the server and client need to exchange materialized
 // data / results. Literals are also used to describe constants used in
 // computations.
@@ -463,6 +483,24 @@ message ReduceWindowRequest {
   ComputationHandle to_apply = 5;
 }
 
+message BatchNormTrainingRequest {
+  ComputationDataHandle operand = 1;
+  ComputationDataHandle scale = 2;
+  ComputationDataHandle offset = 3;
+  float epsilon = 4;
+  int64 feature_index = 5;
+}
+
+message BatchNormGradRequest {
+  ComputationDataHandle operand = 1;
+  ComputationDataHandle scale = 2;
+  ComputationDataHandle mean = 3;
+  ComputationDataHandle variance = 4;
+  ComputationDataHandle grad_output = 5;
+  float epsilon = 6;
+  int64 feature_index = 7;
+}
+
 message CrossReplicaSumRequest {
   ComputationDataHandle operand = 2;
 }
@@ -596,6 +634,12 @@ enum UnaryOperation {
 
   // Elementwise, tests if values are finite (not NaN or inf)
   UNOP_IS_FINITE = 11;
+
+  // Elementwise, computes the cosine of x.
+  UNOP_COS = 12;
+
+  // Elementwise, computes the sine of x.
+  UNOP_SIN = 13;
 }
 
 message UnaryOpRequest {
@@ -713,6 +757,12 @@ message VariadicOpRequest {
   repeated ComputationDataHandle operands = 3;
 }
 
+message ReducePrecisionRequest {
+  ComputationDataHandle operand = 1;
+  int32 exponent_bits = 2;
+  int32 mantissa_bits = 3;
+}
+
 message SendRequest {
   ComputationDataHandle operand = 1;
   ChannelHandle channel_handle = 2;
@@ -744,6 +794,7 @@ message OpRequest {
     MapRequest map_request = 15;
     PadRequest pad_request = 16;
     ParameterRequest parameter_request = 17;
+    ReducePrecisionRequest reduce_precision_request = 36;
     ReduceRequest reduce_request = 18;
     ReduceWindowRequest reduce_window_request = 19;
     ReshapeRequest reshape_request = 20;
@@ -760,7 +811,9 @@ message OpRequest {
     SendRequest send_request = 30;
     RecvRequest recv_request = 31;
     OutfeedRequest outfeed_request = 32;
-    // Next: 35
+    BatchNormTrainingRequest batch_norm_training_request = 35;
+    BatchNormGradRequest batch_norm_grad_request = 37;
+    // Next: 38
   }
 }
 
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index b99933ff9b561212450f9d8c69f6b0e602123082..5cd47536645eb65f094448a0808bdc861625b639 100755
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -3,8 +3,6 @@
 
 licenses(["notice"])  # Apache 2.0
 
-exports_files(["LICENSE"])
-
 package(default_visibility = ["//tensorflow:__subpackages__"])
 
 py_library(
@@ -15,6 +13,7 @@ py_library(
     deps = [
         "//tensorflow/contrib/batching:batch_py",
         "//tensorflow/contrib/bayesflow:bayesflow_py",
+        "//tensorflow/contrib/boosted_trees:init_py",
         "//tensorflow/contrib/cloud:cloud_py",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
         "//tensorflow/contrib/compiler:compiler_py",
@@ -27,6 +26,7 @@ py_library(
         "//tensorflow/contrib/factorization:factorization_py",
         "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
         "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/fused_conv:fused_conv_py",
         "//tensorflow/contrib/graph_editor:graph_editor_py",
         "//tensorflow/contrib/grid_rnn:grid_rnn_py",
         "//tensorflow/contrib/hooks",
@@ -52,7 +52,10 @@ py_library(
         "//tensorflow/contrib/ndlstm",
         "//tensorflow/contrib/nn:nn_py",
         "//tensorflow/contrib/opt:opt_py",
+        "//tensorflow/contrib/predictor",
         "//tensorflow/contrib/quantization:quantization_py",
+        "//tensorflow/contrib/remote_fused_graph/pylib:remote_fused_graph_ops_py",
+        "//tensorflow/contrib/resampler:resampler_py",
         "//tensorflow/contrib/rnn:rnn_py",
         "//tensorflow/contrib/saved_model:saved_model_py",
         "//tensorflow/contrib/seq2seq:seq2seq_py",
@@ -70,6 +73,10 @@ py_library(
         "//tensorflow/contrib/testing:testing_py",
         "//tensorflow/contrib/text:text_py",
         "//tensorflow/contrib/tfprof",
+        "//tensorflow/contrib/timeseries",
+        "//tensorflow/contrib/tpu:tpu_estimator",
+        "//tensorflow/contrib/tpu:tpu_helper_library",
+        "//tensorflow/contrib/tpu:tpu_py",
         "//tensorflow/contrib/training:training_py",
         "//tensorflow/contrib/util:util_py",
     ],
@@ -80,6 +87,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/contrib/batching:batch_ops_kernels",
+        "//tensorflow/contrib/boosted_trees:boosted_trees_kernels",
         "//tensorflow/contrib/factorization/kernels:all_kernels",
         "//tensorflow/contrib/framework:generate_vocab_remapping_kernel",
         "//tensorflow/contrib/framework:load_and_remap_matrix_kernel",
@@ -87,6 +95,8 @@ cc_library(
         "//tensorflow/contrib/layers:sparse_feature_cross_op_kernel",
         "//tensorflow/contrib/nccl:nccl_kernels",
         "//tensorflow/contrib/seq2seq:beam_search_ops_kernels",
+        "//tensorflow/contrib/tensor_forest:model_ops_kernels",
+        "//tensorflow/contrib/tensor_forest:stats_ops_kernels",
         "//tensorflow/contrib/tensor_forest:tensor_forest_kernels",
         "//tensorflow/contrib/text:all_kernels",
     ],
@@ -97,14 +107,18 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/contrib/batching:batch_ops_op_lib",
+        "//tensorflow/contrib/boosted_trees:boosted_trees_ops_op_lib",
         "//tensorflow/contrib/factorization:all_ops",
         "//tensorflow/contrib/framework:all_ops",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib",
         "//tensorflow/contrib/nccl:nccl_ops_op_lib",
         "//tensorflow/contrib/seq2seq:beam_search_ops_op_lib",
+        "//tensorflow/contrib/tensor_forest:model_ops_op_lib",
+        "//tensorflow/contrib/tensor_forest:stats_ops_op_lib",
         "//tensorflow/contrib/tensor_forest:tensor_forest_ops_op_lib",
         "//tensorflow/contrib/text:all_ops",
+        "//tensorflow/contrib/tpu:all_ops",
     ],
 )
 
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index a94e809c1390a41b71fc2868f0add0bbcec34268..513e657a333f17625fd6fff68e7793b02795339a 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -50,7 +50,9 @@ from tensorflow.contrib import metrics
 from tensorflow.contrib import nccl
 from tensorflow.contrib import nn
 from tensorflow.contrib import opt
+from tensorflow.contrib import predictor
 from tensorflow.contrib import quantization
+from tensorflow.contrib import resampler
 from tensorflow.contrib import rnn
 from tensorflow.contrib import saved_model
 from tensorflow.contrib import seq2seq
@@ -65,14 +67,17 @@ from tensorflow.contrib import tensor_forest
 from tensorflow.contrib import tensorboard
 from tensorflow.contrib import testing
 from tensorflow.contrib import tfprof
+from tensorflow.contrib import timeseries
+from tensorflow.contrib import tpu
 from tensorflow.contrib import training
 from tensorflow.contrib import util
 from tensorflow.contrib.ndlstm import python as ndlstm
+from tensorflow.contrib.remote_fused_graph import pylib as remote_fused_graph
 from tensorflow.contrib.specs import python as specs
 
 from tensorflow.python.util.lazy_loader import LazyLoader
-ffmpeg = LazyLoader("ffmpeg", globals(),
-                    "tensorflow.contrib.ffmpeg")
+ffmpeg = LazyLoader("ffmpeg",
+                    globals(), "tensorflow.contrib.ffmpeg")
 del LazyLoader
 
 del absolute_import
diff --git a/tensorflow/contrib/android/BUILD b/tensorflow/contrib/android/BUILD
index 952f24f34b28e87104676c00afe43143fecd8ac2..0833790e991fb6706feac2acdf2f93a26966405f 100644
--- a/tensorflow/contrib/android/BUILD
+++ b/tensorflow/contrib/android/BUILD
@@ -11,6 +11,7 @@ load(
     "//tensorflow:tensorflow.bzl",
     "tf_copts",
     "if_android",
+    "if_android_mips",
 )
 
 exports_files([
@@ -85,7 +86,7 @@ cc_binary(
         "-Wl,--gc-sections",
         "-Wl,--version-script",  # This line must be directly followed by LINKER_SCRIPT.
         LINKER_SCRIPT,
-    ]),
+    ]) + if_android_mips(["-latomic"]),
     linkshared = 1,
     linkstatic = 1,
     tags = [
diff --git a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
index 1f180429b23a2aa2c2eb83f4c09ab86162b7c641..587f2941e5f0efb9ec6677929fac6d32ce0a977c 100644
--- a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
+++ b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
@@ -17,6 +17,7 @@ package org.tensorflow.contrib.android;
 
 import android.content.res.AssetManager;
 import android.os.Trace;
+import android.os.Build.VERSION;
 import android.text.TextUtils;
 import android.util.Log;
 import java.io.FileInputStream;
@@ -26,6 +27,7 @@ import java.nio.ByteBuffer;
 import java.nio.DoubleBuffer;
 import java.nio.FloatBuffer;
 import java.nio.IntBuffer;
+import java.nio.LongBuffer;
 import java.util.ArrayList;
 import java.util.List;
 import org.tensorflow.DataType;
@@ -225,6 +227,16 @@ public class TensorFlowInferenceInterface {
     addFeed(inputName, Tensor.create(dims, IntBuffer.wrap(src)));
   }
 
+  /**
+   * Given a source array with shape {@link dims} and content {@link src}, copy the contents into
+   * the input Tensor with name {@link inputName}. The source array {@link src} must have at least
+   * as many elements as that of the destination Tensor. If {@link src} has more elements than the
+   * destination has capacity, the copy is truncated.
+   */
+  public void feed(String inputName, long[] src, long... dims) {
+    addFeed(inputName, Tensor.create(dims, LongBuffer.wrap(src)));
+  }
+
   /**
    * Given a source array with shape {@link dims} and content {@link src}, copy the contents into
    * the input Tensor with name {@link inputName}. The source array {@link src} must have at least
@@ -269,6 +281,17 @@ public class TensorFlowInferenceInterface {
     addFeed(inputName, Tensor.create(dims, src));
   }
 
+  /**
+   * Given a source buffer with shape {@link dims} and content {@link src}, both stored as
+   * <b>direct</b> and <b>native ordered</b> java.nio buffers, copy the contents into the input
+   * Tensor with name {@link inputName}. The source buffer {@link src} must have at least as many
+   * elements as that of the destination Tensor. If {@link src} has more elements than the
+   * destination has capacity, the copy is truncated.
+   */
+  public void feed(String inputName, LongBuffer src, long... dims) {
+    addFeed(inputName, Tensor.create(dims, src));
+  }
+
   /**
    * Given a source buffer with shape {@link dims} and content {@link src}, both stored as
    * <b>direct</b> and <b>native ordered</b> java.nio buffers, copy the contents into the input
@@ -309,6 +332,15 @@ public class TensorFlowInferenceInterface {
     fetch(outputName, IntBuffer.wrap(dst));
   }
 
+  /**
+   * Read from a Tensor named {@link outputName} and copy the contents into a Java array. {@link
+   * dst} must have length greater than or equal to that of the source Tensor. This operation will
+   * not affect dst's content past the source Tensor's size.
+   */
+  public void fetch(String outputName, long[] dst) {
+    fetch(outputName, LongBuffer.wrap(dst));
+  }
+
   /**
    * Read from a Tensor named {@link outputName} and copy the contents into a Java array. {@link
    * dst} must have length greater than or equal to that of the source Tensor. This operation will
@@ -347,6 +379,16 @@ public class TensorFlowInferenceInterface {
     getTensor(outputName).writeTo(dst);
   }
 
+  /**
+   * Read from a Tensor named {@link outputName} and copy the contents into the <b>direct</b> and
+   * <b>native ordered</b> java.nio buffer {@link dst}. {@link dst} must have capacity greater than
+   * or equal to that of the source Tensor. This operation will not affect dst's content past the
+   * source Tensor's size.
+   */
+  public void fetch(String outputName, LongBuffer dst) {
+    getTensor(outputName).writeTo(dst);
+  }
+
   /**
    * Read from a Tensor named {@link outputName} and copy the contents into the <b>direct</b> and
    * <b>native ordered</b> java.nio buffer {@link dst}. {@link dst} must have capacity greater than
@@ -370,9 +412,11 @@ public class TensorFlowInferenceInterface {
   private void loadGraph(InputStream is, Graph g) throws IOException {
     final long startMs = System.currentTimeMillis();
 
-    Trace.beginSection("initializeTensorFlow");
+    if (VERSION.SDK_INT >= 18) {
+      Trace.beginSection("initializeTensorFlow");
+      Trace.beginSection("readGraphDef");
+    }
 
-    Trace.beginSection("readGraphDef");
     // TODO(ashankar): Can we somehow mmap the contents instead of copying them?
     byte[] graphDef = new byte[is.available()];
     final int numBytesRead = is.read(graphDef);
@@ -383,17 +427,22 @@ public class TensorFlowInferenceInterface {
               + " of the graph, expected to read "
               + graphDef.length);
     }
-    Trace.endSection();
 
-    Trace.beginSection("importGraphDef");
+    if (VERSION.SDK_INT >= 18) {
+      Trace.endSection(); // readGraphDef.
+      Trace.beginSection("importGraphDef");
+    }
+
     try {
       g.importGraphDef(graphDef);
     } catch (IllegalArgumentException e) {
       throw new IOException("Not a valid TensorFlow Graph serialization: " + e.getMessage());
     }
-    Trace.endSection();
 
-    Trace.endSection(); // initializeTensorFlow.
+    if (VERSION.SDK_INT >= 18) {
+      Trace.endSection(); // importGraphDef.
+      Trace.endSection(); // initializeTensorFlow.
+    }
 
     final long endMs = System.currentTimeMillis();
     Log.i(
diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD
index 6478ad2ea775ad3cee4b62bb07b88bb672c8ce5b..fac4c41d83cbeedc25e5232dfbd5a5bbaa395f04 100644
--- a/tensorflow/contrib/batching/BUILD
+++ b/tensorflow/contrib/batching/BUILD
@@ -7,8 +7,6 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-exports_files(["LICENSE"])
-
 cc_library(
     name = "batch_scheduler_hdrs",
     hdrs = ["batch_scheduler.h"],
@@ -28,12 +26,11 @@ cc_library(
 cc_test(
     name = "batch_scheduler_test",
     size = "small",
-    srcs = [
-        "batch_scheduler_test.cc",
-    ],
+    srcs = ["batch_scheduler_test.cc"],
     deps = [
         ":batch_scheduler",
         "//tensorflow/core:lib",
+        "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
 )
@@ -53,7 +50,7 @@ cc_library(
     hdrs = ["shared_batch_scheduler.h"],
     deps = [
         ":batch_scheduler",
-        ":shared_batch_scheduler_hdrs",
+        "//tensorflow/contrib/batching/util:periodic_function_dynamic",
         "//tensorflow/core:lib",
     ],
     alwayslink = 1,
@@ -62,9 +59,7 @@ cc_library(
 cc_test(
     name = "shared_batch_scheduler_test",
     size = "small",
-    srcs = [
-        "shared_batch_scheduler_test.cc",
-    ],
+    srcs = ["shared_batch_scheduler_test.cc"],
     deps = [
         ":shared_batch_scheduler",
         "//tensorflow/contrib/batching/test_util:fake_clock_env",
@@ -86,9 +81,7 @@ cc_library(
 cc_test(
     name = "basic_batch_scheduler_test",
     size = "small",
-    srcs = [
-        "basic_batch_scheduler_test.cc",
-    ],
+    srcs = ["basic_batch_scheduler_test.cc"],
     deps = [
         ":basic_batch_scheduler",
         ":batch_scheduler",
@@ -125,9 +118,7 @@ load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 tf_custom_op_library(
     name = "python/ops/_batch_ops.so",
-    srcs = [
-        "ops/batch_ops.cc",
-    ],
+    srcs = ["ops/batch_ops.cc"],
     deps = [
         "//tensorflow/contrib/batching/kernels:batch_kernels",
     ],
@@ -189,12 +180,11 @@ py_test(
     ],
     deps = [
         ":batch_py",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:gradients",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:script_ops",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:variables",
     ],
 )
 
diff --git a/tensorflow/contrib/batching/kernels/batch_kernels.cc b/tensorflow/contrib/batching/kernels/batch_kernels.cc
index 3c06325651f3f418686e7da1c629acf3b37085db..3b7c538fcc42b2e8f100d374c273ee3ca3d6056b 100644
--- a/tensorflow/contrib/batching/kernels/batch_kernels.cc
+++ b/tensorflow/contrib/batching/kernels/batch_kernels.cc
@@ -51,7 +51,7 @@ Status Concat(OpKernelContext* context, const gtl::ArraySlice<Tensor>& inputs,
   std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>> inputs_flat;
   inputs_flat.reserve(inputs.size());
   int64 output_dim0 = 0;
-  for (int i = 0; i < inputs.size(); ++i) {
+  for (size_t i = 0; i < inputs.size(); ++i) {
     const Tensor& input = inputs[i];
     if (input.dims() != input_dims) {
       return errors::InvalidArgument(
@@ -548,7 +548,7 @@ class BatchKernel : public AsyncOpKernel {
       return Status::OK();
     }
     int32 last_size = 0;
-    for (int i = 0; i < allowed_batch_sizes_.size(); ++i) {
+    for (size_t i = 0; i < allowed_batch_sizes_.size(); ++i) {
       const int32 size = allowed_batch_sizes_.at(i);
       if (i > 0 && size <= last_size) {
         return errors::InvalidArgument(
@@ -675,7 +675,7 @@ class UnbatchResource : public ResourceBase {
       // If we have a non-empty tensor, finish the waitlisted runs,
       // and store any remaining pieces.
       if (nonempty_input) {
-        for (int i = 0; i < batch_keys.size(); ++i) {
+        for (size_t i = 0; i < batch_keys.size(); ++i) {
           auto runs_it = waiting_callbacks_.find(batch_keys[i]);
           if (runs_it != waiting_callbacks_.end()) {
             runs_it->second.context->set_output(0, split_inputs[i]);
diff --git a/tensorflow/contrib/batching/util/BUILD b/tensorflow/contrib/batching/util/BUILD
index 48d03746bbe45d5e594b04a0d209b6966da59b6b..21dd26722a59634867f424a8d3dc4f04db9ddc90 100644
--- a/tensorflow/contrib/batching/util/BUILD
+++ b/tensorflow/contrib/batching/util/BUILD
@@ -7,8 +7,6 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-exports_files(["LICENSE"])
-
 filegroup(
     name = "all_files",
     srcs = glob(
@@ -48,6 +46,7 @@ cc_test(
     deps = [
         ":periodic_function",
         "//tensorflow/contrib/batching/test_util:fake_clock_env",
+        "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
 )
diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD
index 04288a1934d999ecd2a484b0059d7b1c54c4b289..d324c7d0d096c33cd9e70ba28194a4807d4f9913 100644
--- a/tensorflow/contrib/bayesflow/BUILD
+++ b/tensorflow/contrib/bayesflow/BUILD
@@ -16,21 +16,62 @@ py_library(
     srcs = ["__init__.py"] + glob(["python/ops/*.py"]),
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/distributions:distributions_py",
+        "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
+        "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python/ops/distributions",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
 )
 
+cuda_py_test(
+    name = "csiszar_divergence_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/csiszar_divergence_test.py"],
+    additional_deps = [
+        ":bayesflow_py",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/distributions:distributions_py",
+        "//tensorflow/python/ops/distributions",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn_ops",
+    ],
+)
+
+cuda_py_test(
+    name = "custom_grad_test",
+    size = "small",
+    srcs = ["python/kernel_tests/custom_grad_test.py"],
+    additional_deps = [
+        ":bayesflow_py",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
 cuda_py_test(
     name = "entropy_test",
     size = "medium",
@@ -40,12 +81,11 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/python/ops/distributions",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:variables",
     ],
 )
@@ -76,14 +116,18 @@ cuda_py_test(
     srcs = ["python/kernel_tests/monte_carlo_test.py"],
     additional_deps = [
         ":bayesflow_py",
+        "//third_party/py/numpy",
         "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/python/ops/distributions",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_seed",
     ],
 )
 
diff --git a/tensorflow/contrib/bayesflow/__init__.py b/tensorflow/contrib/bayesflow/__init__.py
index dcda73770026e5a6b90397dd07285a2faf7454bb..15c1614a671894f75831f822f6880df1e277ccbc 100644
--- a/tensorflow/contrib/bayesflow/__init__.py
+++ b/tensorflow/contrib/bayesflow/__init__.py
@@ -21,6 +21,8 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long
+from tensorflow.contrib.bayesflow.python.ops import csiszar_divergence
+from tensorflow.contrib.bayesflow.python.ops import custom_grad
 from tensorflow.contrib.bayesflow.python.ops import entropy
 from tensorflow.contrib.bayesflow.python.ops import monte_carlo
 from tensorflow.contrib.bayesflow.python.ops import stochastic_gradient_estimators
@@ -33,9 +35,10 @@ from tensorflow.contrib.bayesflow.python.ops import variational_inference
 from tensorflow.python.util.all_util import remove_undocumented
 
 
-_allowed_symbols = ['entropy', 'monte_carlo',
-                    'special_math', 'stochastic_gradient_estimators',
-                    'stochastic_graph', 'stochastic_tensor',
-                    'stochastic_variables', 'variational_inference']
+_allowed_symbols = ['csiszar_divergence', 'custom_grad', 'entropy',
+                    'monte_carlo', 'special_math',
+                    'stochastic_gradient_estimators', 'stochastic_graph',
+                    'stochastic_tensor', 'stochastic_variables',
+                    'variational_inference']
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d06b69885ce39973dad2c6f7f788e7667b75e097
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py
@@ -0,0 +1,878 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Csiszar Divergence Ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.bayesflow.python.ops import csiszar_divergence_impl
+from tensorflow.contrib.distributions.python.ops import mvn_diag as mvn_diag_lib
+from tensorflow.contrib.distributions.python.ops import mvn_full_covariance as mvn_full_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.platform import test
+
+
+cd = csiszar_divergence_impl
+
+
+def tridiag(d, diag_value, offdiag_value):
+  """d x d matrix with given value on diag, and one super/sub diag."""
+  diag_mat = linalg_ops.eye(d) * (diag_value - offdiag_value)
+  three_bands = array_ops.matrix_band_part(
+      array_ops.fill([d, d], offdiag_value), 1, 1)
+  return diag_mat + three_bands
+
+
+class AmariAlphaTest(test.TestCase):
+
+  def setUp(self):
+    self._logu = np.linspace(-10., 10, 100)
+    self._u = np.exp(self._logu)
+
+  def test_at_zero(self):
+    for alpha in [-1., 0., 1., 2.]:
+      for normalized in [True, False]:
+        with self.test_session(graph=ops.Graph()):
+          self.assertAllClose(
+              cd.amari_alpha(0., alpha=alpha,
+                             self_normalized=normalized).eval(),
+              0.)
+
+  def test_correct_when_alpha0(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.amari_alpha(self._logu, alpha=0.).eval(),
+          -self._logu)
+
+      self.assertAllClose(
+          cd.amari_alpha(self._logu, alpha=0., self_normalized=True).eval(),
+          -self._logu + (self._u - 1.))
+
+  def test_correct_when_alpha1(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.amari_alpha(self._logu, alpha=1.).eval(),
+          self._u * self._logu)
+
+      self.assertAllClose(
+          cd.amari_alpha(self._logu, alpha=1., self_normalized=True).eval(),
+          self._u * self._logu - (self._u - 1.))
+
+  def test_correct_when_alpha_not_01(self):
+    for alpha in [-2, -1., -0.5, 0.5, 2.]:
+      with self.test_session(graph=ops.Graph()):
+        self.assertAllClose(
+            cd.amari_alpha(self._logu,
+                           alpha=alpha,
+                           self_normalized=False).eval(),
+            ((self._u**alpha - 1)) / (alpha * (alpha - 1.)))
+
+        self.assertAllClose(
+            cd.amari_alpha(self._logu,
+                           alpha=alpha,
+                           self_normalized=True).eval(),
+            ((self._u**alpha - 1.)
+             - alpha * (self._u - 1)) / (alpha * (alpha - 1.)))
+
+
+class KLReverseTest(test.TestCase):
+
+  def setUp(self):
+    self._logu = np.linspace(-10., 10, 100)
+    self._u = np.exp(self._logu)
+
+  def test_at_zero(self):
+    for normalized in [True, False]:
+      with self.test_session(graph=ops.Graph()):
+        self.assertAllClose(
+            cd.kl_reverse(0., self_normalized=normalized).eval(),
+            0.)
+
+  def test_correct(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.kl_reverse(self._logu).eval(),
+          -self._logu)
+
+      self.assertAllClose(
+          cd.kl_reverse(self._logu, self_normalized=True).eval(),
+          -self._logu + (self._u - 1.))
+
+
+class KLForwardTest(test.TestCase):
+
+  def setUp(self):
+    self._logu = np.linspace(-10., 10, 100)
+    self._u = np.exp(self._logu)
+
+  def test_at_zero(self):
+    for normalized in [True, False]:
+      with self.test_session(graph=ops.Graph()):
+        self.assertAllClose(
+            cd.kl_forward(0., self_normalized=normalized).eval(),
+            0.)
+
+  def test_correct(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.kl_forward(self._logu).eval(),
+          self._u * self._logu)
+
+      self.assertAllClose(
+          cd.kl_forward(self._logu, self_normalized=True).eval(),
+          self._u * self._logu - (self._u - 1.))
+
+
+class JensenShannonTest(test.TestCase):
+
+  def setUp(self):
+    self._logu = np.linspace(-10., 10, 100)
+    self._u = np.exp(self._logu)
+
+  def test_at_zero(self):
+    with self.test_session():
+      self.assertAllClose(cd.jensen_shannon(0.).eval(), np.log(0.25))
+
+  def test_symmetric(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.jensen_shannon(self._logu).eval(),
+          cd.symmetrized_csiszar_function(
+              self._logu, cd.jensen_shannon).eval())
+
+      self.assertAllClose(
+          cd.jensen_shannon(self._logu, self_normalized=True).eval(),
+          cd.symmetrized_csiszar_function(
+              self._logu,
+              lambda x: cd.jensen_shannon(x, self_normalized=True)).eval())
+
+  def test_correct(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.jensen_shannon(self._logu).eval(),
+          (self._u * self._logu
+           - (1 + self._u) * np.log1p(self._u)))
+
+      self.assertAllClose(
+          cd.jensen_shannon(self._logu, self_normalized=True).eval(),
+          (self._u * self._logu
+           - (1 + self._u) * np.log((1 + self._u) / 2)))
+
+
+class ArithmeticGeometricMeanTest(test.TestCase):
+
+  def setUp(self):
+    self._logu = np.linspace(-10., 10, 100)
+    self._u = np.exp(self._logu)
+
+  def test_at_zero(self):
+    with self.test_session():
+      self.assertAllClose(cd.arithmetic_geometric(0.).eval(), np.log(4))
+      self.assertAllClose(
+          cd.arithmetic_geometric(0., self_normalized=True).eval(), 0.)
+
+  def test_symmetric(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.arithmetic_geometric(self._logu).eval(),
+          cd.symmetrized_csiszar_function(
+              self._logu, cd.arithmetic_geometric).eval())
+
+  def test_correct(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.arithmetic_geometric(self._logu).eval(),
+          (1. + self._u) * np.log((1. + self._u) / np.sqrt(self._u)))
+
+      self.assertAllClose(
+          cd.arithmetic_geometric(self._logu, self_normalized=True).eval(),
+          (1. + self._u) * np.log(0.5 * (1. + self._u) / np.sqrt(self._u)))
+
+
+class TotalVariationTest(test.TestCase):
+
+  def setUp(self):
+    self._logu = np.linspace(-10., 10, 100)
+    self._u = np.exp(self._logu)
+
+  def test_at_zero(self):
+    with self.test_session():
+      self.assertAllClose(cd.total_variation(0.).eval(), 0.)
+
+  def test_correct(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.total_variation(self._logu).eval(),
+          0.5 * np.abs(self._u - 1))
+
+
+class PearsonTest(test.TestCase):
+
+  def setUp(self):
+    self._logu = np.linspace(-10., 10, 100)
+    self._u = np.exp(self._logu)
+
+  def test_at_zero(self):
+    with self.test_session():
+      self.assertAllClose(cd.pearson(0.).eval(), 0.)
+
+  def test_correct(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.pearson(self._logu).eval(),
+          np.square(self._u - 1))
+
+
+class SquaredHellingerTest(test.TestCase):
+
+  def setUp(self):
+    self._logu = np.linspace(-10., 10, 100)
+    self._u = np.exp(self._logu)
+
+  def test_at_zero(self):
+    with self.test_session():
+      self.assertAllClose(cd.squared_hellinger(0.).eval(), 0.)
+
+  def test_symmetric(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.squared_hellinger(self._logu).eval(),
+          cd.symmetrized_csiszar_function(
+              self._logu, cd.squared_hellinger).eval())
+
+  def test_correct(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.squared_hellinger(self._logu).eval(),
+          np.square(np.sqrt(self._u) - 1))
+
+
+class TriangularTest(test.TestCase):
+
+  def setUp(self):
+    self._logu = np.linspace(-10., 10, 100)
+    self._u = np.exp(self._logu)
+
+  def test_at_zero(self):
+    with self.test_session():
+      self.assertAllClose(cd.triangular(0.).eval(), 0.)
+
+  def test_symmetric(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.triangular(self._logu).eval(),
+          cd.symmetrized_csiszar_function(
+              self._logu, cd.triangular).eval())
+
+  def test_correct(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.triangular(self._logu).eval(),
+          np.square(self._u - 1) / (1 + self._u))
+
+
+class TPowerTest(test.TestCase):
+
+  def setUp(self):
+    self._logu = np.linspace(-10., 10, 100)
+    self._u = np.exp(self._logu)
+
+  def test_at_zero(self):
+    with self.test_session():
+      self.assertAllClose(cd.t_power(0., t=-0.1).eval(), 0.)
+      self.assertAllClose(cd.t_power(0., t=0.5).eval(), 0.)
+      self.assertAllClose(cd.t_power(0., t=1.1).eval(), 0.)
+      self.assertAllClose(
+          cd.t_power(0., t=-0.1, self_normalized=True).eval(), 0.)
+      self.assertAllClose(
+          cd.t_power(0., t=0.5, self_normalized=True).eval(), 0.)
+      self.assertAllClose(
+          cd.t_power(0., t=1.1, self_normalized=True).eval(), 0.)
+
+  def test_correct(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.t_power(self._logu, t=np.float64(-0.1)).eval(),
+          self._u ** -0.1 - 1.)
+      self.assertAllClose(
+          cd.t_power(self._logu, t=np.float64(0.5)).eval(),
+          -self._u ** 0.5 + 1.)
+      self.assertAllClose(
+          cd.t_power(self._logu, t=np.float64(1.1)).eval(),
+          self._u ** 1.1 - 1.)
+
+  def test_correct_self_normalized(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.t_power(self._logu, t=np.float64(-0.1),
+                     self_normalized=True).eval(),
+          self._u ** -0.1 - 1. + 0.1 * (self._u - 1.))
+      self.assertAllClose(
+          cd.t_power(self._logu, t=np.float64(0.5),
+                     self_normalized=True).eval(),
+          -self._u ** 0.5 + 1. + 0.5 * (self._u - 1.))
+      self.assertAllClose(
+          cd.t_power(self._logu, t=np.float64(1.1),
+                     self_normalized=True).eval(),
+          self._u ** 1.1 - 1. - 1.1 * (self._u - 1.))
+
+
+class Log1pAbsTest(test.TestCase):
+
+  def setUp(self):
+    self._logu = np.linspace(-10., 10, 100)
+    self._u = np.exp(self._logu)
+
+  def test_at_zero(self):
+    with self.test_session():
+      self.assertAllClose(cd.log1p_abs(0.).eval(), 0.)
+
+  def test_correct(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.log1p_abs(self._logu).eval(),
+          self._u**(np.sign(self._u - 1)) - 1)
+
+
+class JeffreysTest(test.TestCase):
+
+  def setUp(self):
+    self._logu = np.linspace(-10., 10, 100)
+    self._u = np.exp(self._logu)
+
+  def test_at_zero(self):
+    with self.test_session():
+      self.assertAllClose(cd.jeffreys(0.).eval(), 0.)
+
+  def test_symmetric(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.jeffreys(self._logu).eval(),
+          cd.symmetrized_csiszar_function(
+              self._logu, cd.jeffreys).eval())
+
+  def test_correct(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.jeffreys(self._logu).eval(),
+          0.5 * (self._u * self._logu - self._logu))
+
+
+class ChiSquareTest(test.TestCase):
+
+  def setUp(self):
+    self._logu = np.linspace(-10., 10, 100)
+    self._u = np.exp(self._logu)
+
+  def test_at_zero(self):
+    with self.test_session():
+      self.assertAllClose(cd.chi_square(0.).eval(), 0.)
+
+  def test_correct(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.chi_square(self._logu).eval(),
+          self._u**2 - 1)
+
+
+class ModifiedGanTest(test.TestCase):
+
+  def setUp(self):
+    self._logu = np.linspace(-10., 10, 100)
+    self._u = np.exp(self._logu)
+
+  def test_at_zero(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.modified_gan(0.).eval(), np.log(2))
+      self.assertAllClose(
+          cd.modified_gan(0., self_normalized=True).eval(), np.log(2))
+
+  def test_correct(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.modified_gan(self._logu).eval(),
+          np.log1p(self._u) - self._logu)
+
+      self.assertAllClose(
+          cd.modified_gan(self._logu, self_normalized=True).eval(),
+          np.log1p(self._u) - self._logu + 0.5 * (self._u - 1))
+
+
+class SymmetrizedCsiszarFunctionTest(test.TestCase):
+
+  def setUp(self):
+    self._logu = np.linspace(-10., 10., 100)
+    self._u = np.exp(self._logu)
+
+  def test_jensen_shannon(self):
+    with self.test_session():
+
+      # The following functions come from the claim made in the
+      # symmetrized_csiszar_function docstring.
+      def js1(logu):
+        return (-logu
+                - (1. + math_ops.exp(logu)) * (
+                    nn_ops.softplus(logu)))
+
+      def js2(logu):
+        return 2. * (math_ops.exp(logu) * (
+            logu - nn_ops.softplus(logu)))
+
+      self.assertAllClose(
+          cd.symmetrized_csiszar_function(self._logu, js1).eval(),
+          cd.jensen_shannon(self._logu).eval())
+
+      self.assertAllClose(
+          cd.symmetrized_csiszar_function(self._logu, js2).eval(),
+          cd.jensen_shannon(self._logu).eval())
+
+  def test_jeffreys(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.symmetrized_csiszar_function(self._logu, cd.kl_reverse).eval(),
+          cd.jeffreys(self._logu).eval())
+
+      self.assertAllClose(
+          cd.symmetrized_csiszar_function(self._logu, cd.kl_forward).eval(),
+          cd.jeffreys(self._logu).eval())
+
+
+class DualCsiszarFunctionTest(test.TestCase):
+
+  def setUp(self):
+    self._logu = np.linspace(-10., 10., 100)
+    self._u = np.exp(self._logu)
+
+  def test_kl_forward(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.dual_csiszar_function(self._logu, cd.kl_forward).eval(),
+          cd.kl_reverse(self._logu).eval())
+
+  def test_kl_reverse(self):
+    with self.test_session():
+      self.assertAllClose(
+          cd.dual_csiszar_function(self._logu, cd.kl_reverse).eval(),
+          cd.kl_forward(self._logu).eval())
+
+
+class MonteCarloCsiszarFDivergenceTest(test.TestCase):
+
+  def test_kl_forward(self):
+    with self.test_session() as sess:
+      q = normal_lib.Normal(
+          loc=np.ones(6),
+          scale=np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0]))
+
+      p = normal_lib.Normal(loc=q.loc + 0.1, scale=q.scale - 0.2)
+
+      approx_kl = cd.monte_carlo_csiszar_f_divergence(
+          f=cd.kl_forward,
+          p_log_prob=p.log_prob,
+          q=q,
+          num_draws=int(1e5),
+          seed=1)
+
+      approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
+          f=lambda logu: cd.kl_forward(logu, self_normalized=True),
+          p_log_prob=p.log_prob,
+          q=q,
+          num_draws=int(1e5),
+          seed=1)
+
+      exact_kl = kullback_leibler.kl_divergence(p, q)
+
+      [approx_kl_, approx_kl_self_normalized_, exact_kl_] = sess.run([
+          approx_kl, approx_kl_self_normalized, exact_kl])
+
+      self.assertAllClose(approx_kl_, exact_kl_,
+                          rtol=0.08, atol=0.)
+
+      self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
+                          rtol=0.02, atol=0.)
+
+  def test_kl_reverse(self):
+    with self.test_session() as sess:
+
+      q = normal_lib.Normal(
+          loc=np.ones(6),
+          scale=np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0]))
+
+      p = normal_lib.Normal(loc=q.loc + 0.1, scale=q.scale - 0.2)
+
+      approx_kl = cd.monte_carlo_csiszar_f_divergence(
+          f=cd.kl_reverse,
+          p_log_prob=p.log_prob,
+          q=q,
+          num_draws=int(1e5),
+          seed=1)
+
+      approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
+          f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
+          p_log_prob=p.log_prob,
+          q=q,
+          num_draws=int(1e5),
+          seed=1)
+
+      exact_kl = kullback_leibler.kl_divergence(q, p)
+
+      [approx_kl_, approx_kl_self_normalized_, exact_kl_] = sess.run([
+          approx_kl, approx_kl_self_normalized, exact_kl])
+
+      self.assertAllClose(approx_kl_, exact_kl_,
+                          rtol=0.07, atol=0.)
+
+      self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
+                          rtol=0.02, atol=0.)
+
+  def test_kl_reverse_multidim(self):
+
+    with self.test_session() as sess:
+      d = 5  # Dimension
+
+      p = mvn_full_lib.MultivariateNormalFullCovariance(
+          covariance_matrix=tridiag(d, diag_value=1, offdiag_value=0.5))
+
+      q = mvn_diag_lib.MultivariateNormalDiag(scale_diag=[0.5]*d)
+
+      approx_kl = cd.monte_carlo_csiszar_f_divergence(
+          f=cd.kl_reverse,
+          p_log_prob=p.log_prob,
+          q=q,
+          num_draws=int(1e5),
+          seed=1)
+
+      approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
+          f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
+          p_log_prob=p.log_prob,
+          q=q,
+          num_draws=int(1e5),
+          seed=1)
+
+      exact_kl = kullback_leibler.kl_divergence(q, p)
+
+      [approx_kl_, approx_kl_self_normalized_, exact_kl_] = sess.run([
+          approx_kl, approx_kl_self_normalized, exact_kl])
+
+      self.assertAllClose(approx_kl_, exact_kl_,
+                          rtol=0.02, atol=0.)
+
+      self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
+                          rtol=0.08, atol=0.)
+
+  def test_kl_forward_multidim(self):
+
+    with self.test_session() as sess:
+      d = 5  # Dimension
+
+      p = mvn_full_lib.MultivariateNormalFullCovariance(
+          covariance_matrix=tridiag(d, diag_value=1, offdiag_value=0.5))
+
+      # Variance is very high when approximating Forward KL, so we make
+      # scale_diag larger than in test_kl_reverse_multidim. This ensures q
+      # "covers" p and thus Var_q[p/q] is smaller.
+      q = mvn_diag_lib.MultivariateNormalDiag(scale_diag=[1.]*d)
+
+      approx_kl = cd.monte_carlo_csiszar_f_divergence(
+          f=cd.kl_forward,
+          p_log_prob=p.log_prob,
+          q=q,
+          num_draws=int(1e5),
+          seed=1)
+
+      approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
+          f=lambda logu: cd.kl_forward(logu, self_normalized=True),
+          p_log_prob=p.log_prob,
+          q=q,
+          num_draws=int(1e5),
+          seed=1)
+
+      exact_kl = kullback_leibler.kl_divergence(p, q)
+
+      [approx_kl_, approx_kl_self_normalized_, exact_kl_] = sess.run([
+          approx_kl, approx_kl_self_normalized, exact_kl])
+
+      self.assertAllClose(approx_kl_, exact_kl_,
+                          rtol=0.06, atol=0.)
+
+      self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
+                          rtol=0.05, atol=0.)
+
+  def test_score_trick(self):
+
+    with self.test_session() as sess:
+      d = 5  # Dimension
+      num_draws = int(1e5)
+      seed = 1
+
+      p = mvn_full_lib.MultivariateNormalFullCovariance(
+          covariance_matrix=tridiag(d, diag_value=1, offdiag_value=0.5))
+
+      # Variance is very high when approximating Forward KL, so we make
+      # scale_diag larger than in test_kl_reverse_multidim. This ensures q
+      # "covers" p and thus Var_q[p/q] is smaller.
+      s = array_ops.constant(1.)
+      q = mvn_diag_lib.MultivariateNormalDiag(
+          scale_diag=array_ops.tile([s], [d]))
+
+      approx_kl = cd.monte_carlo_csiszar_f_divergence(
+          f=cd.kl_reverse,
+          p_log_prob=p.log_prob,
+          q=q,
+          num_draws=num_draws,
+          seed=seed)
+
+      approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
+          f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
+          p_log_prob=p.log_prob,
+          q=q,
+          num_draws=num_draws,
+          seed=seed)
+
+      approx_kl_score_trick = cd.monte_carlo_csiszar_f_divergence(
+          f=cd.kl_reverse,
+          p_log_prob=p.log_prob,
+          q=q,
+          num_draws=num_draws,
+          use_reparametrization=False,
+          seed=seed)
+
+      approx_kl_self_normalized_score_trick = (
+          cd.monte_carlo_csiszar_f_divergence(
+              f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
+              p_log_prob=p.log_prob,
+              q=q,
+              num_draws=num_draws,
+              use_reparametrization=False,
+              seed=seed))
+
+      exact_kl = kullback_leibler.kl_divergence(q, p)
+
+      grad_sum = lambda fs: gradients_impl.gradients(fs, s)[0]
+
+      [
+          approx_kl_grad_,
+          approx_kl_self_normalized_grad_,
+          approx_kl_score_trick_grad_,
+          approx_kl_self_normalized_score_trick_grad_,
+          exact_kl_grad_,
+          approx_kl_,
+          approx_kl_self_normalized_,
+          approx_kl_score_trick_,
+          approx_kl_self_normalized_score_trick_,
+          exact_kl_,
+      ] = sess.run([
+          grad_sum(approx_kl),
+          grad_sum(approx_kl_self_normalized),
+          grad_sum(approx_kl_score_trick),
+          grad_sum(approx_kl_self_normalized_score_trick),
+          grad_sum(exact_kl),
+          approx_kl,
+          approx_kl_self_normalized,
+          approx_kl_score_trick,
+          approx_kl_self_normalized_score_trick,
+          exact_kl,
+      ])
+
+      # Test average divergence.
+      self.assertAllClose(approx_kl_, exact_kl_,
+                          rtol=0.02, atol=0.)
+
+      self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
+                          rtol=0.08, atol=0.)
+
+      self.assertAllClose(approx_kl_score_trick_, exact_kl_,
+                          rtol=0.02, atol=0.)
+
+      self.assertAllClose(approx_kl_self_normalized_score_trick_, exact_kl_,
+                          rtol=0.08, atol=0.)
+
+      # Test average gradient-divergence.
+      self.assertAllClose(approx_kl_grad_, exact_kl_grad_,
+                          rtol=0.007, atol=0.)
+
+      self.assertAllClose(approx_kl_self_normalized_grad_, exact_kl_grad_,
+                          rtol=0.011, atol=0.)
+
+      self.assertAllClose(approx_kl_score_trick_grad_, exact_kl_grad_,
+                          rtol=0.018, atol=0.)
+
+      self.assertAllClose(
+          approx_kl_self_normalized_score_trick_grad_, exact_kl_grad_,
+          rtol=0.017, atol=0.)
+
+
+class CsiszarVIMCOTest(test.TestCase):
+
+  def _numpy_csiszar_vimco_helper(self, logu):
+    """Numpy implementation of `csiszar_vimco_helper`."""
+    n = logu.shape[0]
+    u = np.exp(logu)
+    loogeoavg_u = []  # Leave-one-out geometric-average of exp(logu).
+    for j in range(n):
+      loogeoavg_u.append(np.exp(np.mean(
+          [logu[i, ...] for i in range(n) if i != j],
+          axis=0)))
+    loogeoavg_u = np.array(loogeoavg_u)
+
+    loosum_u = []  # Leave-one-out sum of exp(logu).
+    for j in range(n):
+      loosum_u.append(np.sum(
+          [u[i, ...] for i in range(n) if i != j],
+          axis=0))
+    loosum_u = np.array(loosum_u)
+
+    # Natural log of the average u except each is swapped-out for its
+    # leave-`i`-th-out Geometric average.
+    log_sooavg_u = np.log(loosum_u + loogeoavg_u) - np.log(n)
+
+    log_avg_u = np.log(np.mean(u, axis=0))
+    return log_avg_u, log_sooavg_u
+
+  def test_vimco_helper(self):
+
+    with self.test_session() as sess:
+      logu = np.linspace(-20, 20, 100)
+      np_log_avg_u, np_log_sooavg_u = self._numpy_csiszar_vimco_helper(logu)
+      [log_avg_u, log_sooavg_u] = sess.run(cd.csiszar_vimco_helper(logu))
+      self.assertAllClose(np_log_avg_u, log_avg_u,
+                          rtol=1e-2, atol=0.)
+      self.assertAllClose(np_log_sooavg_u, log_sooavg_u,
+                          rtol=1e-2, atol=0.)
+
+  def test_vimco_helper_gradient(self):
+
+    with self.test_session():
+      logu = array_ops.constant(
+          np.linspace(-1e2, 100., 100).reshape([50, 2]))
+      log_avg_u, log_sooavg_u = cd.csiszar_vimco_helper(logu)
+      g = gradients_impl.gradients(log_avg_u - log_sooavg_u, logu)[0].eval()
+      self.assertAllEqual(np.ones_like(g, dtype=np.bool), np.isfinite(g))
+      self.assertAllEqual(np.ones_like(g, dtype=np.bool), g != 0.)
+
+  def test_vimco_and_gradient(self):
+
+    with self.test_session() as sess:
+      dims = 5  # Dimension
+      num_draws = int(20)
+      num_batch_draws = int(3)
+      seed = 1
+
+      f = lambda logu: cd.kl_reverse(logu, self_normalized=False)
+      np_f = lambda logu: -logu
+
+      p = mvn_full_lib.MultivariateNormalFullCovariance(
+          covariance_matrix=tridiag(dims, diag_value=1, offdiag_value=0.5))
+
+      # Variance is very high when approximating Forward KL, so we make
+      # scale_diag larger than in test_kl_reverse_multidim. This ensures q
+      # "covers" p and thus Var_q[p/q] is smaller.
+      s = array_ops.constant(1.)
+      q = mvn_diag_lib.MultivariateNormalDiag(
+          scale_diag=array_ops.tile([s], [dims]))
+
+      vimco = cd.csiszar_vimco(
+          f=f,
+          p_log_prob=p.log_prob,
+          q=q,
+          num_draws=num_draws,
+          num_batch_draws=num_batch_draws,
+          seed=seed)
+
+      x = q.sample(sample_shape=[num_draws, num_batch_draws],
+                   seed=seed)
+      x = array_ops.stop_gradient(x)
+      logu = p.log_prob(x) - q.log_prob(x)
+      f_log_sum_u = f(cd.csiszar_vimco_helper(logu)[0])
+
+      grad_sum = lambda fs: gradients_impl.gradients(fs, s)[0]
+
+      def jacobian(x):
+        # Warning: this function is slow and may not even finish if prod(shape)
+        # is larger than, say, 100.
+        shape = x.shape.as_list()
+        assert all(s is not None for s in shape)
+        x = array_ops.reshape(x, shape=[-1])
+        r = [grad_sum(x[i]) for i in range(np.prod(shape))]
+        return array_ops.reshape(array_ops.stack(r), shape=shape)
+
+      [
+          logu_,
+          jacobian_logqx_,
+          vimco_,
+          grad_vimco_,
+          f_log_sum_u_,
+          grad_mean_f_log_sum_u_,
+      ] = sess.run([
+          logu,
+          jacobian(q.log_prob(x)),
+          vimco,
+          grad_sum(vimco),
+          f_log_sum_u,
+          grad_sum(f_log_sum_u) / num_batch_draws,
+      ])
+
+      np_log_avg_u, np_log_sooavg_u = self._numpy_csiszar_vimco_helper(logu_)
+
+      # Test VIMCO loss is correct.
+      self.assertAllClose(np_f(np_log_avg_u).mean(axis=0), vimco_,
+                          rtol=1e-5, atol=0.)
+
+      # Test gradient of VIMCO loss is correct.
+      #
+      # To make this computation we'll inject two gradients from TF:
+      # - grad[mean(f(log(sum(p(x)/q(x)))))]
+      # - jacobian[log(q(x))].
+      #
+      # We now justify why using these (and only these) TF values for
+      # ground-truth does not undermine the completeness of this test.
+      #
+      # Regarding `grad_mean_f_log_sum_u_`, note that we validate the
+      # correctness of the zero-th order derivative (for each batch member).
+      # Since `cd.csiszar_vimco_helper` itself does not manipulate any gradient
+      # information, we can safely rely on TF.
+      self.assertAllClose(np_f(np_log_avg_u), f_log_sum_u_, rtol=1e-4, atol=0.)
+      #
+      # Regarding `jacobian_logqx_`, note that testing the gradient of
+      # `q.log_prob` is outside the scope of this unit-test thus we may safely
+      # use TF to find it.
+
+      # The `mean` is across batches and the `sum` is across iid samples.
+      np_grad_vimco = (
+          grad_mean_f_log_sum_u_
+          + np.mean(
+              np.sum(
+                  jacobian_logqx_ * (np_f(np_log_avg_u)
+                                     - np_f(np_log_sooavg_u)),
+                  axis=0),
+              axis=0))
+
+      self.assertAllClose(np_grad_vimco, grad_vimco_,
+                          rtol=1e-5, atol=0.)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/custom_grad_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/custom_grad_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a95df31ac1fd9f5038abe779391ccba5f7fe408d
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/custom_grad_test.py
@@ -0,0 +1,157 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Custom Gradient Ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.bayesflow.python.ops import custom_grad_impl
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+cg = custom_grad_impl
+
+
+class CustomGradientTest(test.TestCase):
+
+  def test_works_correctly(self):
+    with self.test_session() as sess:
+      f = lambda x: x**2 / 2
+      g = lambda x: (x - 1)**3 / 3
+      x_ = np.linspace(-100, 100, int(1e4)) + [0.]
+
+      x = constant_op.constant(x_)
+      fx = cg.custom_gradient(f(x), g(x), x)
+      gx = gradients_impl.gradients(fx, x)[0]
+      [fx_, gx_] = sess.run([fx, gx])
+
+      self.assertAllClose(f(x_), fx_)
+      self.assertAllClose(g(x_), gx_)
+
+  def test_works_correctly_both_f_g_zero(self):
+    with self.test_session() as sess:
+      f = lambda x: x**2 / 2
+      g = lambda x: x**3 / 3
+      x_ = np.linspace(-100, 100, int(1e4)) + [0.]
+
+      x = constant_op.constant(x_)
+      fx = cg.custom_gradient(f(x), g(x), x)
+      gx = gradients_impl.gradients(fx, x)[0]
+      [fx_, gx_] = sess.run([fx, gx])
+
+      self.assertAllClose(f(x_), fx_)
+      self.assertAllClose(g(x_), gx_)
+
+  def test_works_correctly_vector_of_vars(self):
+    with self.test_session() as sess:
+      x = variable_scope.get_variable(
+          name="x",
+          shape=[],
+          dtype=dtypes.float32,
+          initializer=init_ops.constant_initializer(2))
+      y = variable_scope.get_variable(
+          name="y",
+          shape=[],
+          dtype=dtypes.float32,
+          initializer=init_ops.constant_initializer(3))
+      sess.run([variables.global_variables_initializer()])
+
+      f = lambda z: z[0] * z[1]
+      g = lambda z: z[0]**2 * z[1]**2 / 2
+
+      z = array_ops.stack([x, y])
+      fz = cg.custom_gradient(f(z), g(z), z, axis=0)
+      gz = gradients_impl.gradients(fz, variables.trainable_variables())
+      [z_, fz_, gx_, gy_] = sess.run([z, fz, gz[0], gz[1]])
+
+      self.assertEqual(f(z_), fz_)
+      self.assertEqual(g(z_), gx_)
+      self.assertEqual(g(z_), gy_)
+
+  def test_works_correctly_side_vars(self):
+    with self.test_session() as sess:
+      x_ = np.float32(2.1)  # Adding extra tenth to force imprecision.
+      y_ = np.float32(3.1)
+      x = variable_scope.get_variable(
+          name="x",
+          shape=[],
+          dtype=dtypes.float32,
+          initializer=init_ops.constant_initializer(x_))
+      y = variable_scope.get_variable(
+          name="y",
+          shape=[],
+          dtype=dtypes.float32,
+          initializer=init_ops.constant_initializer(y_))
+      sess.run([variables.global_variables_initializer()])
+
+      f = lambda x: x * y
+      g = lambda z: math_ops.square(x) * y
+
+      fx = cg.custom_gradient(f(x), g(x), x)
+      gx = gradients_impl.gradients(fx, variables.trainable_variables())
+      [x_, fx_, gx_] = sess.run([x, fx, gx[0]])
+      gy_ = gx[1]
+
+      self.assertEqual(x_ * y_, fx_)
+      self.assertEqual(np.square(x_) * y_, gx_)
+      self.assertEqual(None, gy_)
+
+  def test_works_correctly_fx_gx_manually_stopped(self):
+    with self.test_session() as sess:
+      x_ = np.float32(2.1)  # Adding extra tenth to force imprecision.
+      y_ = np.float32(3.1)
+      x = variable_scope.get_variable(
+          name="x",
+          shape=[],
+          dtype=dtypes.float32,
+          initializer=init_ops.constant_initializer(x_))
+      y = variable_scope.get_variable(
+          name="y",
+          shape=[],
+          dtype=dtypes.float32,
+          initializer=init_ops.constant_initializer(y_))
+      sess.run([variables.global_variables_initializer()])
+
+      stop = array_ops.stop_gradient  # For readability.
+
+      # Basically we need to stop the `x` portion of `f`. And when we supply the
+      # arg to `custom_gradient` we need to stop the complement, i.e., the `y`
+      # part.
+      f = lambda x: stop(x) * y
+      g = lambda x: stop(math_ops.square(x)) * y
+      fx = cg.custom_gradient(f(x), g(x), x + stop(y),
+                              fx_gx_manually_stopped=True)
+
+      gx = gradients_impl.gradients(fx, variables.trainable_variables())
+      [x_, fx_, gx_, gy_] = sess.run([x, fx, gx[0], gx[1]])
+
+      self.assertEqual(x_ * y_, fx_)
+      self.assertEqual(np.square(x_) * y_, gx_)
+      self.assertEqual(x_, gy_)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py
index 6cdaa3187054daa278dc7342626b089f9655457b..0bd12b84d12a9c3219f6b24830b1b82db9716043 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py
@@ -20,22 +20,24 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib import distributions as distributions_lib
 from tensorflow.contrib import layers as layers_lib
-from tensorflow.contrib.bayesflow.python.ops import entropy_impl as entropy_lib
+from tensorflow.contrib.bayesflow.python.ops import entropy_impl as entropy
+from tensorflow.contrib.distributions.python.ops import mvn_diag as mvn_diag_lib
+from tensorflow.contrib.distributions.python.ops import mvn_tril as mvn_tril_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.ops.distributions import kullback_leibler as kullback_leibler_lib
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.ops.distributions import util as distribution_util
 from tensorflow.python.platform import test
 
-distributions = distributions_lib
 layers = layers_lib
-entropy = entropy_lib
 
 
-class NormalNoEntropy(distributions.Normal):  # pylint: disable=no-init
+class NormalNoEntropy(normal_lib.Normal):  # pylint: disable=no-init
   """Normal distribution without a `.entropy` method."""
 
   def entropy(self):
@@ -81,10 +83,10 @@ class ElboRatioTest(test.TestCase):
     n_samples = 5000
 
     with self.test_session():
-      q = distributions.MultivariateNormalDiag(
+      q = mvn_diag_lib.MultivariateNormalDiag(
           loc=self._rng.rand(*vector_shape),
           scale_diag=self._rng.rand(*vector_shape))
-      p = distributions.MultivariateNormalDiag(
+      p = mvn_diag_lib.MultivariateNormalDiag(
           loc=self._rng.rand(*vector_shape),
           scale_diag=self._rng.rand(*vector_shape))
 
@@ -95,7 +97,7 @@ class ElboRatioTest(test.TestCase):
           n=n_samples,
           form=entropy.ELBOForms.sample,
           seed=42)
-      actual_kl = distributions.kl_divergence(q, p)
+      actual_kl = kullback_leibler_lib.kl_divergence(q, p)
 
       # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
       # pass.
@@ -109,10 +111,10 @@ class ElboRatioTest(test.TestCase):
 
     vector_shape = (2, 3)
     with self.test_session():
-      q = distributions.MultivariateNormalDiag(
+      q = mvn_diag_lib.MultivariateNormalDiag(
           loc=self._rng.rand(*vector_shape),
           scale_diag=self._rng.rand(*vector_shape))
-      p = distributions.MultivariateNormalDiag(
+      p = mvn_diag_lib.MultivariateNormalDiag(
           loc=self._rng.rand(*vector_shape),
           scale_diag=self._rng.rand(*vector_shape))
 
@@ -123,7 +125,7 @@ class ElboRatioTest(test.TestCase):
           n=n_samples,
           form=entropy.ELBOForms.analytic_entropy,
           seed=42)
-      actual_kl = distributions.kl_divergence(q, p)
+      actual_kl = kullback_leibler_lib.kl_divergence(q, p)
 
       # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
       # pass.
@@ -135,7 +137,7 @@ class ElboRatioTest(test.TestCase):
 
     vector_shape = (2, 3)
     with self.test_session():
-      q = distributions.MultivariateNormalDiag(
+      q = mvn_diag_lib.MultivariateNormalDiag(
           loc=self._rng.rand(*vector_shape),
           scale_diag=self._rng.rand(*vector_shape))
 
@@ -155,7 +157,7 @@ class EntropyShannonTest(test.TestCase):
 
   def test_normal_entropy_default_form_uses_exact_entropy(self):
     with self.test_session():
-      dist = distributions.Normal(loc=1.11, scale=2.22)
+      dist = normal_lib.Normal(loc=1.11, scale=2.22)
       mc_entropy = entropy.entropy_shannon(dist, n=11)
       exact_entropy = dist.entropy()
       self.assertEqual(exact_entropy.get_shape(), mc_entropy.get_shape())
@@ -163,7 +165,7 @@ class EntropyShannonTest(test.TestCase):
 
   def test_normal_entropy_analytic_form_uses_exact_entropy(self):
     with self.test_session():
-      dist = distributions.Normal(loc=1.11, scale=2.22)
+      dist = normal_lib.Normal(loc=1.11, scale=2.22)
       mc_entropy = entropy.entropy_shannon(
           dist, form=entropy.ELBOForms.analytic_entropy)
       exact_entropy = dist.entropy()
@@ -173,7 +175,7 @@ class EntropyShannonTest(test.TestCase):
   def test_normal_entropy_sample_form_gets_approximate_answer(self):
     # Tested by showing we get a good answer that is not exact.
     with self.test_session():
-      dist = distributions.Normal(loc=1.11, scale=2.22)
+      dist = normal_lib.Normal(loc=1.11, scale=2.22)
       mc_entropy = entropy.entropy_shannon(
           dist, n=1000, form=entropy.ELBOForms.sample, seed=0)
       exact_entropy = dist.entropy()
@@ -193,7 +195,7 @@ class EntropyShannonTest(test.TestCase):
       # NormalNoEntropy is like a Normal, but does not have .entropy method, so
       # we are forced to fall back on sample entropy.
       dist_no_entropy = NormalNoEntropy(loc=1.11, scale=2.22)
-      dist_yes_entropy = distributions.Normal(loc=1.11, scale=2.22)
+      dist_yes_entropy = normal_lib.Normal(loc=1.11, scale=2.22)
 
       mc_entropy = entropy.entropy_shannon(
           dist_no_entropy, n=1000, form=entropy.ELBOForms.sample, seed=0)
@@ -222,15 +224,16 @@ class RenyiRatioTest(test.TestCase):
     mu_true = np.array([1.0, -1.0], dtype=np.float64)
     chol_true = np.array([[2.0, 0.0], [0.5, 1.0]], dtype=np.float64)
     with self.test_session() as sess:
-      target = distributions.MultivariateNormalTriL(mu_true, chol_true)
+      target = mvn_tril_lib.MultivariateNormalTriL(mu_true, chol_true)
 
       # Set up q distribution by defining mean/covariance as Variables
       mu = variables.Variable(
           np.zeros(mu_true.shape), dtype=mu_true.dtype, name='mu')
       mat = variables.Variable(
           np.zeros(chol_true.shape), dtype=chol_true.dtype, name='mat')
-      chol = distributions.matrix_diag_transform(mat, transform=nn_ops.softplus)
-      q = distributions.MultivariateNormalTriL(mu, chol)
+      chol = distribution_util.matrix_diag_transform(
+          mat, transform=nn_ops.softplus)
+      q = mvn_tril_lib.MultivariateNormalTriL(mu, chol)
       for alpha in [0.25, 0.75]:
 
         negative_renyi_divergence = entropy.renyi_ratio(
@@ -262,7 +265,7 @@ class RenyiRatioTest(test.TestCase):
     n = 1000
     vector_shape = (2, 3)
     with self.test_session():
-      q = distributions.MultivariateNormalDiag(
+      q = mvn_diag_lib.MultivariateNormalDiag(
           loc=self._rng.rand(*vector_shape),
           scale_diag=self._rng.rand(*vector_shape))
       for alpha in [0.25, 0.75]:
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py
index fd3c79976a53590751df8ca1df5d589573ba961f..d9e23646d8334014f1bef0d0744df9310b59909f 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py
@@ -18,19 +18,25 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib import distributions as distributions_lib
+import numpy as np
+
 from tensorflow.contrib import layers as layers_lib
 from tensorflow.contrib.bayesflow.python.ops import monte_carlo_impl as monte_carlo_lib
 from tensorflow.contrib.bayesflow.python.ops.monte_carlo_impl import _get_samples
+from tensorflow.contrib.distributions.python.ops import mvn_diag as mvn_diag_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import random_seed
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import distribution as distribution_lib
+from tensorflow.python.ops.distributions import gamma as gamma_lib
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import normal as normal_lib
 from tensorflow.python.platform import test
 
-distributions = distributions_lib
+
 layers = layers_lib
-monte_carlo = monte_carlo_lib
+mc = monte_carlo_lib
 
 
 class ExpectationImportanceSampleTest(test.TestCase):
@@ -42,15 +48,15 @@ class ExpectationImportanceSampleTest(test.TestCase):
       mu_q = constant_op.constant([0.0, 0.0], dtype=dtypes.float64)
       sigma_p = constant_op.constant([0.5, 0.5], dtype=dtypes.float64)
       sigma_q = constant_op.constant([1.0, 1.0], dtype=dtypes.float64)
-      p = distributions.Normal(loc=mu_p, scale=sigma_p)
-      q = distributions.Normal(loc=mu_q, scale=sigma_q)
+      p = normal_lib.Normal(loc=mu_p, scale=sigma_p)
+      q = normal_lib.Normal(loc=mu_q, scale=sigma_q)
 
       # Compute E_p[X].
-      e_x = monte_carlo.expectation_importance_sampler(
+      e_x = mc.expectation_importance_sampler(
           f=lambda x: x, log_p=p.log_prob, sampling_dist_q=q, n=n, seed=42)
 
       # Compute E_p[X^2].
-      e_x2 = monte_carlo.expectation_importance_sampler(
+      e_x2 = mc.expectation_importance_sampler(
           f=math_ops.square, log_p=p.log_prob, sampling_dist_q=q, n=n, seed=42)
 
       stddev = math_ops.sqrt(e_x2 - math_ops.square(e_x))
@@ -68,10 +74,10 @@ class ExpectationImportanceSampleTest(test.TestCase):
     # the product of components in a MultivariateNormal are > 0.
     n = 1000
     with self.test_session():
-      p = distributions.MultivariateNormalDiag(
-          loc=[0.0, 0.0], scale_diag=[1.0, 1.0])
-      q = distributions.MultivariateNormalDiag(
-          loc=[0.5, 0.5], scale_diag=[3., 3.])
+      p = mvn_diag_lib.MultivariateNormalDiag(
+          loc=[0.], scale_diag=[1.0, 1.0])
+      q = mvn_diag_lib.MultivariateNormalDiag(
+          loc=[0.5], scale_diag=[3., 3.])
 
       # Compute E_p[X_1 * X_2 > 0], with X_i the ith component of X ~ p(x).
       # Should equal 1/2 because p is a spherical Gaussian centered at (0, 0).
@@ -79,7 +85,7 @@ class ExpectationImportanceSampleTest(test.TestCase):
         x1_times_x2 = math_ops.reduce_prod(x, reduction_indices=[-1])
         return 0.5 * (math_ops.sign(x1_times_x2) + 1.0)
 
-      prob = monte_carlo.expectation_importance_sampler(
+      prob = mc.expectation_importance_sampler(
           f=indicator, log_p=p.log_prob, sampling_dist_q=q, n=n, seed=42)
 
       # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
@@ -99,12 +105,12 @@ class ExpectationImportanceSampleLogspaceTest(test.TestCase):
       mu_q = constant_op.constant([-1.0, 1.0], dtype=dtypes.float64)
       sigma_p = constant_op.constant([1.0, 2 / 3.], dtype=dtypes.float64)
       sigma_q = constant_op.constant([1.0, 1.0], dtype=dtypes.float64)
-      p = distributions.Normal(loc=mu_p, scale=sigma_p)
-      q = distributions.Normal(loc=mu_q, scale=sigma_q)
+      p = normal_lib.Normal(loc=mu_p, scale=sigma_p)
+      q = normal_lib.Normal(loc=mu_q, scale=sigma_q)
 
       # Compute E_p[X^2].
       # Should equal [1, (2/3)^2]
-      log_e_x2 = monte_carlo.expectation_importance_sampler_logspace(
+      log_e_x2 = mc.expectation_importance_sampler_logspace(
           log_f=lambda x: math_ops.log(math_ops.square(x)),
           log_p=p.log_prob,
           sampling_dist_q=q,
@@ -118,34 +124,12 @@ class ExpectationImportanceSampleLogspaceTest(test.TestCase):
       self.assertAllClose([1., (2 / 3.)**2], e_x2.eval(), rtol=0.02)
 
 
-class ExpectationTest(test.TestCase):
-
-  def test_mc_estimate_of_normal_mean_and_variance_is_correct_vs_analytic(self):
-    random_seed.set_random_seed(0)
-    n = 20000
-    with self.test_session():
-      p = distributions.Normal(loc=[1.0, -1.0], scale=[0.3, 0.5])
-      # Compute E_p[X] and E_p[X^2].
-      z = p.sample(n, seed=42)
-      e_x = monte_carlo.expectation(lambda x: x, p, z=z, seed=42)
-      e_x2 = monte_carlo.expectation(math_ops.square, p, z=z, seed=0)
-      var = e_x2 - math_ops.square(e_x)
-
-      self.assertEqual(p.batch_shape, e_x.get_shape())
-      self.assertEqual(p.batch_shape, e_x2.get_shape())
-
-      # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
-      # pass.
-      self.assertAllClose(p.mean().eval(), e_x.eval(), rtol=0.01)
-      self.assertAllClose(p.variance().eval(), var.eval(), rtol=0.02)
-
-
 class GetSamplesTest(test.TestCase):
   """Test the private method 'get_samples'."""
 
   def test_raises_if_both_z_and_n_are_none(self):
     with self.test_session():
-      dist = distributions.Normal(loc=0., scale=1.)
+      dist = normal_lib.Normal(loc=0., scale=1.)
       z = None
       n = None
       seed = None
@@ -154,7 +138,7 @@ class GetSamplesTest(test.TestCase):
 
   def test_raises_if_both_z_and_n_are_not_none(self):
     with self.test_session():
-      dist = distributions.Normal(loc=0., scale=1.)
+      dist = normal_lib.Normal(loc=0., scale=1.)
       z = dist.sample(seed=42)
       n = 1
       seed = None
@@ -163,7 +147,7 @@ class GetSamplesTest(test.TestCase):
 
   def test_returns_n_samples_if_n_provided(self):
     with self.test_session():
-      dist = distributions.Normal(loc=0., scale=1.)
+      dist = normal_lib.Normal(loc=0., scale=1.)
       z = None
       n = 10
       seed = None
@@ -172,7 +156,7 @@ class GetSamplesTest(test.TestCase):
 
   def test_returns_z_if_z_provided(self):
     with self.test_session():
-      dist = distributions.Normal(loc=0., scale=1.)
+      dist = normal_lib.Normal(loc=0., scale=1.)
       z = dist.sample(10, seed=42)
       n = None
       seed = None
@@ -180,5 +164,142 @@ class GetSamplesTest(test.TestCase):
       self.assertEqual((10,), z.get_shape())
 
 
+class ExpectationTest(test.TestCase):
+
+  def test_works_correctly(self):
+    with self.test_session() as sess:
+      x = constant_op.constant([-1e6, -100, -10, -1, 1, 10, 100, 1e6])
+      p = normal_lib.Normal(loc=x, scale=1.)
+
+      # We use the prefex "efx" to mean "E_p[f(X)]".
+      f = lambda u: u
+      efx_true = x
+      samples = p.sample(int(1e5), seed=1)
+      efx_reparam = mc.expectation(f, samples, p.log_prob)
+      efx_score = mc.expectation(f, samples, p.log_prob,
+                                 use_reparametrization=False)
+
+      [
+          efx_true_,
+          efx_reparam_,
+          efx_score_,
+          efx_true_grad_,
+          efx_reparam_grad_,
+          efx_score_grad_,
+      ] = sess.run([
+          efx_true,
+          efx_reparam,
+          efx_score,
+          gradients_impl.gradients(efx_true, x)[0],
+          gradients_impl.gradients(efx_reparam, x)[0],
+          gradients_impl.gradients(efx_score, x)[0],
+      ])
+
+      self.assertAllEqual(np.ones_like(efx_true_grad_), efx_true_grad_)
+
+      self.assertAllClose(efx_true_, efx_reparam_, rtol=0.005, atol=0.)
+      self.assertAllClose(efx_true_, efx_score_, rtol=0.005, atol=0.)
+
+      self.assertAllEqual(np.ones_like(efx_true_grad_, dtype=np.bool),
+                          np.isfinite(efx_reparam_grad_))
+      self.assertAllEqual(np.ones_like(efx_true_grad_, dtype=np.bool),
+                          np.isfinite(efx_score_grad_))
+
+      self.assertAllClose(efx_true_grad_, efx_reparam_grad_,
+                          rtol=0.03, atol=0.)
+      # Variance is too high to be meaningful, so we'll only check those which
+      # converge.
+      self.assertAllClose(efx_true_grad_[2:-2],
+                          efx_score_grad_[2:-2],
+                          rtol=0.05, atol=0.)
+
+  def test_docstring_example_normal(self):
+    with self.test_session() as sess:
+      num_draws = int(1e5)
+      mu_p = constant_op.constant(0.)
+      mu_q = constant_op.constant(1.)
+      p = normal_lib.Normal(loc=mu_p, scale=1.)
+      q = normal_lib.Normal(loc=mu_q, scale=2.)
+      exact_kl_normal_normal = kullback_leibler.kl_divergence(p, q)
+      approx_kl_normal_normal = monte_carlo_lib.expectation(
+          f=lambda x: p.log_prob(x) - q.log_prob(x),
+          samples=p.sample(num_draws, seed=42),
+          log_prob=p.log_prob,
+          use_reparametrization=(p.reparameterization_type
+                                 == distribution_lib.FULLY_REPARAMETERIZED))
+      [exact_kl_normal_normal_, approx_kl_normal_normal_] = sess.run([
+          exact_kl_normal_normal, approx_kl_normal_normal])
+      self.assertEqual(
+          True,
+          p.reparameterization_type == distribution_lib.FULLY_REPARAMETERIZED)
+      self.assertAllClose(exact_kl_normal_normal_, approx_kl_normal_normal_,
+                          rtol=0.01, atol=0.)
+
+      # Compare gradients. (Not present in `docstring`.)
+      gradp = lambda fp: gradients_impl.gradients(fp, mu_p)[0]
+      gradq = lambda fq: gradients_impl.gradients(fq, mu_q)[0]
+      [
+          gradp_exact_kl_normal_normal_,
+          gradq_exact_kl_normal_normal_,
+          gradp_approx_kl_normal_normal_,
+          gradq_approx_kl_normal_normal_,
+      ] = sess.run([
+          gradp(exact_kl_normal_normal),
+          gradq(exact_kl_normal_normal),
+          gradp(approx_kl_normal_normal),
+          gradq(approx_kl_normal_normal),
+      ])
+      self.assertAllClose(gradp_exact_kl_normal_normal_,
+                          gradp_approx_kl_normal_normal_,
+                          rtol=0.01, atol=0.)
+      self.assertAllClose(gradq_exact_kl_normal_normal_,
+                          gradq_approx_kl_normal_normal_,
+                          rtol=0.01, atol=0.)
+
+  def test_docstring_example_gamma(self):
+    with self.test_session() as sess:
+      num_draws = int(1e5)
+      concentration_p = constant_op.constant(1.)
+      concentration_q = constant_op.constant(2.)
+      p = gamma_lib.Gamma(concentration=concentration_p, rate=1.)
+      q = gamma_lib.Gamma(concentration=concentration_q, rate=3.)
+      approx_kl_gamma_gamma = monte_carlo_lib.expectation(
+          f=lambda x: p.log_prob(x) - q.log_prob(x),
+          samples=p.sample(num_draws, seed=42),
+          log_prob=p.log_prob,
+          use_reparametrization=(p.reparameterization_type
+                                 == distribution_lib.FULLY_REPARAMETERIZED))
+      exact_kl_gamma_gamma = kullback_leibler.kl_divergence(p, q)
+      [exact_kl_gamma_gamma_, approx_kl_gamma_gamma_] = sess.run([
+          exact_kl_gamma_gamma, approx_kl_gamma_gamma])
+      self.assertEqual(
+          False,
+          p.reparameterization_type == distribution_lib.FULLY_REPARAMETERIZED)
+      self.assertAllClose(exact_kl_gamma_gamma_, approx_kl_gamma_gamma_,
+                          rtol=0.01, atol=0.)
+
+      # Compare gradients. (Not present in `docstring`.)
+      gradp = lambda fp: gradients_impl.gradients(fp, concentration_p)[0]
+      gradq = lambda fq: gradients_impl.gradients(fq, concentration_q)[0]
+      [
+          gradp_exact_kl_gamma_gamma_,
+          gradq_exact_kl_gamma_gamma_,
+          gradp_approx_kl_gamma_gamma_,
+          gradq_approx_kl_gamma_gamma_,
+      ] = sess.run([
+          gradp(exact_kl_gamma_gamma),
+          gradq(exact_kl_gamma_gamma),
+          gradp(approx_kl_gamma_gamma),
+          gradq(approx_kl_gamma_gamma),
+      ])
+      # Notice that variance (i.e., `rtol`) is higher when using score-trick.
+      self.assertAllClose(gradp_exact_kl_gamma_gamma_,
+                          gradp_approx_kl_gamma_gamma_,
+                          rtol=0.05, atol=0.)
+      self.assertAllClose(gradq_exact_kl_gamma_gamma_,
+                          gradq_approx_kl_gamma_gamma_,
+                          rtol=0.03, atol=0.)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/bayesflow/python/ops/csiszar_divergence.py b/tensorflow/contrib/bayesflow/python/ops/csiszar_divergence.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f7a95f138f7fd3e726f095dc16f41abb6182e17
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/csiszar_divergence.py
@@ -0,0 +1,51 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Csiszar f-Divergence and helpers.
+
+See ${python/contrib.bayesflow.csiszar_divergence}.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import
+from tensorflow.contrib.bayesflow.python.ops.csiszar_divergence_impl import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'amari_alpha',
+    'arithmetic_geometric',
+    'chi_square',
+    'csiszar_vimco',
+    'dual_csiszar_function',
+    'jeffreys',
+    'jensen_shannon',
+    'kl_forward',
+    'kl_reverse',
+    'log1p_abs',
+    'modified_gan',
+    'monte_carlo_csiszar_f_divergence',
+    'pearson',
+    'squared_hellinger',
+    'symmetrized_csiszar_function',
+    'total_variation',
+    't_power',
+    'triangular',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/csiszar_divergence_impl.py b/tensorflow/contrib/bayesflow/python/ops/csiszar_divergence_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..54900ab893714166e72b928d4c17eaf494f3a4b7
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/csiszar_divergence_impl.py
@@ -0,0 +1,1059 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Csiszar f-Divergence and helpers.
+
+@@amari_alpha
+@@arithmetic_geometric
+@@chi_square
+@@csiszar_vimco
+@@dual_csiszar_function
+@@jeffreys
+@@jensen_shannon
+@@kl_forward
+@@kl_reverse
+@@log1p_abs
+@@modified_gan
+@@monte_carlo_csiszar_f_divergence
+@@pearson
+@@squared_hellinger
+@@symmetrized_csiszar_function
+@@total_variation
+@@triangular
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib import framework as contrib_framework
+from tensorflow.contrib.bayesflow.python.ops import monte_carlo_impl as monte_carlo
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
+
+
+def amari_alpha(logu, alpha=1., self_normalized=False, name=None):
+  """The Amari-alpha Csiszar-function in log-space.
+
+  A Csiszar-function is a member of,
+
+  ```none
+  F = { f:R_+ to R : f convex }.
+  ```
+
+  When `self_normalized = True`, the Amari-alpha Csiszar-function is:
+
+  ```none
+  f(u) = { -log(u) + (u - 1),     alpha = 0
+         { u log(u) - (u - 1),    alpha = 1
+         { [(u**alpha - 1) - alpha (u - 1)] / (alpha (alpha - 1)),    otherwise
+  ```
+
+  When `self_normalized = False` the `(u - 1)` terms are omitted.
+
+  Warning: when `alpha != 0` and/or `self_normalized = True` this function makes
+  non-log-space calculations and may therefore be numerically unstable for
+  `|logu| >> 0`.
+
+  For more information, see:
+    A. Cichocki and S. Amari. "Families of Alpha-Beta-and GammaDivergences:
+    Flexible and Robust Measures of Similarities." Entropy, vol. 12, no. 6, pp.
+    1532-1568, 2010.
+
+  Args:
+    logu: `float`-like `Tensor` representing `log(u)` from above.
+    alpha: `float`-like Python scalar. (See Mathematical Details for meaning.)
+    self_normalized: Python `bool` indicating whether `f'(u=1)=0`. When
+      `f'(u=1)=0` the implied Csiszar f-Divergence remains non-negative even
+      when `p, q` are unnormalized measures.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    amari_alpha_of_u: `float`-like `Tensor` of the Csiszar-function evaluated
+      at `u = exp(logu)`.
+
+  Raises:
+    TypeError: if `alpha` is `None` or a `Tensor`.
+    TypeError: if `self_normalized` is `None` or a `Tensor`.
+  """
+  with ops.name_scope(name, "amari_alpha", [logu]):
+    if alpha is None or contrib_framework.is_tensor(alpha):
+      raise TypeError("`alpha` cannot be `None` or `Tensor` type.")
+    if self_normalized is None or contrib_framework.is_tensor(self_normalized):
+      raise TypeError("`self_normalized` cannot be `None` or `Tensor` type.")
+
+    logu = ops.convert_to_tensor(logu, name="logu")
+
+    if alpha == 0.:
+      f = -logu
+    elif alpha == 1.:
+      f = math_ops.exp(logu) * logu
+    else:
+      f = math_ops.expm1(alpha * logu) / (alpha * (alpha - 1.))
+
+    if not self_normalized:
+      return f
+
+    if alpha == 0.:
+      return f + math_ops.expm1(logu)
+    elif alpha == 1.:
+      return f - math_ops.expm1(logu)
+    else:
+      return f - math_ops.expm1(logu) / (alpha - 1.)
+
+
+def kl_reverse(logu, self_normalized=False, name=None):
+  """The reverse Kullback-Leibler Csiszar-function in log-space.
+
+  A Csiszar-function is a member of,
+
+  ```none
+  F = { f:R_+ to R : f convex }.
+  ```
+
+  When `self_normalized = True`, the KL-reverse Csiszar-function is:
+
+  ```none
+  f(u) = -log(u) + (u - 1)
+  ```
+
+  When `self_normalized = False` the `(u - 1)` term is omitted.
+
+  Observe that as an f-Divergence, this Csiszar-function implies:
+
+  ```none
+  D_f[p, q] = KL[q, p]
+  ```
+
+  The KL is "reverse" because in maximum likelihood we think of minimizing `q`
+  as in `KL[p, q]`.
+
+  Warning: when self_normalized = True` this function makes non-log-space
+  calculations and may therefore be numerically unstable for `|logu| >> 0`.
+
+  Args:
+    logu: `float`-like `Tensor` representing `log(u)` from above.
+    self_normalized: Python `bool` indicating whether `f'(u=1)=0`. When
+      `f'(u=1)=0` the implied Csiszar f-Divergence remains non-negative even
+      when `p, q` are unnormalized measures.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    kl_reverse_of_u: `float`-like `Tensor` of the Csiszar-function evaluated at
+      `u = exp(logu)`.
+
+  Raises:
+    TypeError: if `self_normalized` is `None` or a `Tensor`.
+  """
+
+  with ops.name_scope(name, "kl_reverse", [logu]):
+    return amari_alpha(logu, alpha=0., self_normalized=self_normalized)
+
+
+def kl_forward(logu, self_normalized=False, name=None):
+  """The forward Kullback-Leibler Csiszar-function in log-space.
+
+  A Csiszar-function is a member of,
+
+  ```none
+  F = { f:R_+ to R : f convex }.
+  ```
+
+  When `self_normalized = True`, the KL-forward Csiszar-function is:
+
+  ```none
+  f(u) = u log(u) - (u - 1)
+  ```
+
+  When `self_normalized = False` the `(u - 1)` term is omitted.
+
+  Observe that as an f-Divergence, this Csiszar-function implies:
+
+  ```none
+  D_f[p, q] = KL[p, q]
+  ```
+
+  The KL is "forward" because in maximum likelihood we think of minimizing `q`
+  as in `KL[p, q]`.
+
+  Warning: this function makes non-log-space calculations and may therefore be
+  numerically unstable for `|logu| >> 0`.
+
+  Args:
+    logu: `float`-like `Tensor` representing `log(u)` from above.
+    self_normalized: Python `bool` indicating whether `f'(u=1)=0`. When
+      `f'(u=1)=0` the implied Csiszar f-Divergence remains non-negative even
+      when `p, q` are unnormalized measures.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    kl_forward_of_u: `float`-like `Tensor` of the Csiszar-function evaluated at
+      `u = exp(logu)`.
+
+  Raises:
+    TypeError: if `self_normalized` is `None` or a `Tensor`.
+  """
+
+  with ops.name_scope(name, "kl_forward", [logu]):
+    return amari_alpha(logu, alpha=1., self_normalized=self_normalized)
+
+
+def jensen_shannon(logu, self_normalized=False, name=None):
+  """The Jensen-Shannon Csiszar-function in log-space.
+
+  A Csiszar-function is a member of,
+
+  ```none
+  F = { f:R_+ to R : f convex }.
+  ```
+
+  When `self_normalized = True`, the Jensen-Shannon Csiszar-function is:
+
+  ```none
+  f(u) = u log(u) - (1 + u) log(1 + u) + (u + 1) log(2)
+  ```
+
+  When `self_normalized = False` the `(u + 1) log(2)` term is omitted.
+
+  Observe that as an f-Divergence, this Csiszar-function implies:
+
+  ```none
+  D_f[p, q] = KL[p, m] + KL[q, m]
+  m(x) = 0.5 p(x) + 0.5 q(x)
+  ```
+
+  In a sense, this divergence is the "reverse" of the Arithmetic-Geometric
+  f-Divergence.
+
+  This Csiszar-function induces a symmetric f-Divergence, i.e.,
+  `D_f[p, q] = D_f[q, p]`.
+
+  Warning: this function makes non-log-space calculations and may therefore be
+  numerically unstable for `|logu| >> 0`.
+
+  For more information, see:
+    Lin, J. "Divergence measures based on the Shannon entropy." IEEE Trans.
+    Inf. Th., 37, 145-151, 1991.
+
+  Args:
+    logu: `float`-like `Tensor` representing `log(u)` from above.
+    self_normalized: Python `bool` indicating whether `f'(u=1)=0`. When
+      `f'(u=1)=0` the implied Csiszar f-Divergence remains non-negative even
+      when `p, q` are unnormalized measures.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    jensen_shannon_of_u: `float`-like `Tensor` of the Csiszar-function
+      evaluated at `u = exp(logu)`.
+  """
+
+  with ops.name_scope(name, "jensen_shannon", [logu]):
+    logu = ops.convert_to_tensor(logu, name="logu")
+    npdt = logu.dtype.as_numpy_dtype
+    y = nn_ops.softplus(logu)
+    if self_normalized:
+      y -= np.log(2).astype(npdt)
+    return math_ops.exp(logu) * logu - (1. + math_ops.exp(logu)) * y
+
+
+def arithmetic_geometric(logu, self_normalized=False, name=None):
+  """The Arithmetic-Geometric Csiszar-function in log-space.
+
+  A Csiszar-function is a member of,
+
+  ```none
+  F = { f:R_+ to R : f convex }.
+  ```
+
+  When `self_normalized = True` the Arithmetic-Geometric Csiszar-function is:
+
+  ```none
+  f(u) = (1 + u) log( (1 + u) / sqrt(u) ) - (1 + u) log(2)
+  ```
+
+  When `self_normalized = False` the `(1 + u) log(2)` term is omitted.
+
+  Observe that as an f-Divergence, this Csiszar-function implies:
+
+  ```none
+  D_f[p, q] = KL[m, p] + KL[m, q]
+  m(x) = 0.5 p(x) + 0.5 q(x)
+  ```
+
+  In a sense, this divergence is the "reverse" of the Jensen-Shannon
+  f-Divergence.
+
+  This Csiszar-function induces a symmetric f-Divergence, i.e.,
+  `D_f[p, q] = D_f[q, p]`.
+
+  Warning: this function makes non-log-space calculations and may therefore be
+  numerically unstable for `|logu| >> 0`.
+
+  Args:
+    logu: `float`-like `Tensor` representing `log(u)` from above.
+    self_normalized: Python `bool` indicating whether `f'(u=1)=0`. When
+      `f'(u=1)=0` the implied Csiszar f-Divergence remains non-negative even
+      when `p, q` are unnormalized measures.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    arithmetic_geometric_of_u: `float`-like `Tensor` of the
+      Csiszar-function evaluated at `u = exp(logu)`.
+  """
+
+  with ops.name_scope(name, "arithmetic_geometric", [logu]):
+    logu = ops.convert_to_tensor(logu, name="logu")
+    y = nn_ops.softplus(logu) - 0.5 * logu
+    if self_normalized:
+      y -= np.log(2.).astype(logu.dtype.as_numpy_dtype)
+    return (1. + math_ops.exp(logu)) * y
+
+
+def total_variation(logu, name=None):
+  """The Total Variation Csiszar-function in log-space.
+
+  A Csiszar-function is a member of,
+
+  ```none
+  F = { f:R_+ to R : f convex }.
+  ```
+
+  The Total-Variation Csiszar-function is:
+
+  ```none
+  f(u) = 0.5 |u - 1|
+  ```
+
+  Warning: this function makes non-log-space calculations and may therefore be
+  numerically unstable for `|logu| >> 0`.
+
+  Args:
+    logu: `float`-like `Tensor` representing `log(u)` from above.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    total_variation_of_u: `float`-like `Tensor` of the Csiszar-function
+      evaluated at `u = exp(logu)`.
+  """
+
+  with ops.name_scope(name, "total_variation", [logu]):
+    logu = ops.convert_to_tensor(logu, name="logu")
+    return 0.5 * math_ops.abs(math_ops.expm1(logu))
+
+
+def pearson(logu, name=None):
+  """The Pearson Csiszar-function in log-space.
+
+  A Csiszar-function is a member of,
+
+  ```none
+  F = { f:R_+ to R : f convex }.
+  ```
+
+  The Pearson Csiszar-function is:
+
+  ```none
+  f(u) = (u - 1)**2
+  ```
+
+  Warning: this function makes non-log-space calculations and may therefore be
+  numerically unstable for `|logu| >> 0`.
+
+  Args:
+    logu: `float`-like `Tensor` representing `log(u)` from above.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    pearson_of_u: `float`-like `Tensor` of the Csiszar-function evaluated at
+      `u = exp(logu)`.
+  """
+
+  with ops.name_scope(name, "pearson", [logu]):
+    logu = ops.convert_to_tensor(logu, name="logu")
+    return math_ops.square(math_ops.expm1(logu))
+
+
+def squared_hellinger(logu, name=None):
+  """The Squared-Hellinger Csiszar-function in log-space.
+
+  A Csiszar-function is a member of,
+
+  ```none
+  F = { f:R_+ to R : f convex }.
+  ```
+
+  The Squared-Hellinger Csiszar-function is:
+
+  ```none
+  f(u) = (sqrt(u) - 1)**2
+  ```
+
+  This Csiszar-function induces a symmetric f-Divergence, i.e.,
+  `D_f[p, q] = D_f[q, p]`.
+
+  Warning: this function makes non-log-space calculations and may therefore be
+  numerically unstable for `|logu| >> 0`.
+
+  Args:
+    logu: `float`-like `Tensor` representing `log(u)` from above.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    squared_hellinger_of_u: `float`-like `Tensor` of the Csiszar-function
+      evaluated at `u = exp(logu)`.
+  """
+
+  with ops.name_scope(name, "squared_hellinger", [logu]):
+    logu = ops.convert_to_tensor(logu, name="logu")
+    return pearson(0.5 * logu)
+
+
+def triangular(logu, name=None):
+  """The Triangular Csiszar-function in log-space.
+
+  A Csiszar-function is a member of,
+
+  ```none
+  F = { f:R_+ to R : f convex }.
+  ```
+
+  The Triangular Csiszar-function is:
+
+  ```none
+  f(u) = (u - 1)**2 / (1 + u)
+  ```
+
+  This Csiszar-function induces a symmetric f-Divergence, i.e.,
+  `D_f[p, q] = D_f[q, p]`.
+
+  Warning: this function makes non-log-space calculations and may therefore be
+  numerically unstable for `|logu| >> 0`.
+
+  Args:
+    logu: `float`-like `Tensor` representing `log(u)` from above.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    triangular_of_u: `float`-like `Tensor` of the Csiszar-function evaluated
+      at `u = exp(logu)`.
+  """
+
+  with ops.name_scope(name, "triangular", [logu]):
+    logu = ops.convert_to_tensor(logu, name="logu")
+    return pearson(logu) / (1. + math_ops.exp(logu))
+
+
+def t_power(logu, t, self_normalized=False, name=None):
+  """The T-Power Csiszar-function in log-space.
+
+  A Csiszar-function is a member of,
+
+  ```none
+  F = { f:R_+ to R : f convex }.
+  ```
+
+  When `self_normalized = True` the T-Power Csiszar-function is:
+
+  ```none
+  f(u) = s [ u**t - 1 - t(u - 1) ]
+  s = { -1   0 < t < 1
+      { +1   otherwise
+  ```
+
+  When `self_normalized = False` the `- t(u - 1)` term is omitted.
+
+  This is similar to the `amari_alpha` Csiszar-function, with the associated
+  divergence being the same up to factors depending only on `t`.
+
+  Args:
+    logu: `float`-like `Tensor` representing `log(u)` from above.
+    t:  `Tensor` of same `dtype` as `logu` and broadcastable shape.
+    self_normalized: Python `bool` indicating whether `f'(u=1)=0`.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    t_power_of_u: `float`-like `Tensor` of the Csiszar-function evaluated
+      at `u = exp(logu)`.
+  """
+  with ops.name_scope(name, "t_power", [logu, t]):
+    logu = ops.convert_to_tensor(logu, name="logu")
+    t = ops.convert_to_tensor(t, dtype=logu.dtype.base_dtype, name="t")
+    fu = math_ops.expm1(t * logu)
+    if self_normalized:
+      fu -= t * math_ops.expm1(logu)
+    fu *= array_ops.where(math_ops.logical_and(0. < t, t < 1.),
+                          -array_ops.ones_like(t),
+                          array_ops.ones_like(t))
+    return fu
+
+
+def log1p_abs(logu, name=None):
+  """The log1p-abs Csiszar-function in log-space.
+
+  A Csiszar-function is a member of,
+
+  ```none
+  F = { f:R_+ to R : f convex }.
+  ```
+
+  The Log1p-Abs Csiszar-function is:
+
+  ```none
+  f(u) = u**(sign(u-1)) - 1
+  ```
+
+  This function is so-named because it was invented from the following recipe.
+  Choose a convex function g such that g(0)=0 and solve for f:
+
+  ```none
+  log(1 + f(u)) = g(log(u)).
+    <=>
+  f(u) = exp(g(log(u))) - 1
+  ```
+
+  That is, the graph is identically `g` when y-axis is `log1p`-domain and x-axis
+  is `log`-domain.
+
+  Warning: this function makes non-log-space calculations and may therefore be
+  numerically unstable for `|logu| >> 0`.
+
+  Args:
+    logu: `float`-like `Tensor` representing `log(u)` from above.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    log1p_abs_of_u: `float`-like `Tensor` of the Csiszar-function evaluated
+      at `u = exp(logu)`.
+  """
+
+  with ops.name_scope(name, "log1p_abs", [logu]):
+    logu = ops.convert_to_tensor(logu, name="logu")
+    return math_ops.expm1(math_ops.abs(logu))
+
+
+def jeffreys(logu, name=None):
+  """The Jeffreys Csiszar-function in log-space.
+
+  A Csiszar-function is a member of,
+
+  ```none
+  F = { f:R_+ to R : f convex }.
+  ```
+
+  The Jeffreys Csiszar-function is:
+
+  ```none
+  f(u) = 0.5 ( u log(u) - log(u) )
+       = 0.5 kl_forward + 0.5 kl_reverse
+       = symmetrized_csiszar_function(kl_reverse)
+       = symmetrized_csiszar_function(kl_forward)
+  ```
+
+  This Csiszar-function induces a symmetric f-Divergence, i.e.,
+  `D_f[p, q] = D_f[q, p]`.
+
+  Warning: this function makes non-log-space calculations and may therefore be
+  numerically unstable for `|logu| >> 0`.
+
+  Args:
+    logu: `float`-like `Tensor` representing `log(u)` from above.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    jeffreys_of_u: `float`-like `Tensor` of the Csiszar-function evaluated
+      at `u = exp(logu)`.
+  """
+
+  with ops.name_scope(name, "jeffreys", [logu]):
+    logu = ops.convert_to_tensor(logu, name="logu")
+    return 0.5 * math_ops.expm1(logu) * logu
+
+
+def chi_square(logu, name=None):
+  """The chi-Square Csiszar-function in log-space.
+
+  A Csiszar-function is a member of,
+
+  ```none
+  F = { f:R_+ to R : f convex }.
+  ```
+
+  The Chi-square Csiszar-function is:
+
+  ```none
+  f(u) = u**2 - 1
+  ```
+
+  Warning: this function makes non-log-space calculations and may therefore be
+  numerically unstable for `|logu| >> 0`.
+
+  Args:
+    logu: `float`-like `Tensor` representing `log(u)` from above.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    chi_square_of_u: `float`-like `Tensor` of the Csiszar-function evaluated
+      at `u = exp(logu)`.
+  """
+
+  with ops.name_scope(name, "chi_square", [logu]):
+    logu = ops.convert_to_tensor(logu, name="logu")
+    return math_ops.expm1(2. * logu)
+
+
+def modified_gan(logu, self_normalized=False, name=None):
+  """The Modified-GAN Csiszar-function in log-space.
+
+  A Csiszar-function is a member of,
+
+  ```none
+  F = { f:R_+ to R : f convex }.
+  ```
+
+  When `self_normalized = True` the modified-GAN (Generative/Adversarial
+  Network) Csiszar-function is:
+
+  ```none
+  f(u) = log(1 + u) - log(u) + 0.5 (u - 1)
+  ```
+
+  When `self_normalized = False` the `0.5 (u - 1)` is omitted.
+
+  The unmodified GAN Csiszar-function is identical to Jensen-Shannon (with
+  `self_normalized = False`).
+
+  Warning: this function makes non-log-space calculations and may therefore be
+  numerically unstable for `|logu| >> 0`.
+
+  Args:
+    logu: `float`-like `Tensor` representing `log(u)` from above.
+    self_normalized: Python `bool` indicating whether `f'(u=1)=0`. When
+      `f'(u=1)=0` the implied Csiszar f-Divergence remains non-negative even
+      when `p, q` are unnormalized measures.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    chi_square_of_u: `float`-like `Tensor` of the Csiszar-function evaluated
+      at `u = exp(logu)`.
+  """
+
+  with ops.name_scope(name, "chi_square", [logu]):
+    logu = ops.convert_to_tensor(logu, name="logu")
+    y = nn_ops.softplus(logu) - logu
+    if self_normalized:
+      y += 0.5 * math_ops.expm1(logu)
+    return y
+
+
+def dual_csiszar_function(logu, csiszar_function, name=None):
+  """Calculates the dual Csiszar-function in log-space.
+
+  A Csiszar-function is a member of,
+
+  ```none
+  F = { f:R_+ to R : f convex }.
+  ```
+
+  The Csiszar-dual is defined as:
+
+  ```none
+  f^*(u) = u f(1 / u)
+  ```
+
+  where `f` is some other Csiszar-function.
+
+  For example, the dual of `kl_reverse` is `kl_forward`, i.e.,
+
+  ```none
+  f(u) = -log(u)
+  f^*(u) = u f(1 / u) = -u log(1 / u) = u log(u)
+  ```
+
+  The dual of the dual is the original function:
+
+  ```none
+  f^**(u) = {u f(1/u)}^*(u) = u (1/u) f(1/(1/u)) = f(u)
+  ```
+
+  Warning: this function makes non-log-space calculations and may therefore be
+  numerically unstable for `|logu| >> 0`.
+
+  Args:
+    logu: `float`-like `Tensor` representing `log(u)` from above.
+    csiszar_function: Python `callable` representing a Csiszar-function over
+      log-domain.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    dual_f_of_u: `float`-like `Tensor` of the result of calculating the dual of
+      `f` at `u = exp(logu)`.
+  """
+
+  with ops.name_scope(name, "dual_csiszar_function", [logu]):
+    return math_ops.exp(logu) * csiszar_function(-logu)
+
+
+def symmetrized_csiszar_function(logu, csiszar_function, name=None):
+  """Symmetrizes a Csiszar-function in log-space.
+
+  A Csiszar-function is a member of,
+
+  ```none
+  F = { f:R_+ to R : f convex }.
+  ```
+
+  The symmetrized Csiszar-function is defined as:
+
+  ```none
+  f_g(u) = 0.5 g(u) + 0.5 u g (1 / u)
+  ```
+
+  where `g` is some other Csiszar-function.
+
+  We say the function is "symmetrized" because:
+
+  ```none
+  D_{f_g}[p, q] = D_{f_g}[q, p]
+  ```
+
+  for all `p << >> q` (i.e., `support(p) = support(q)`).
+
+  There exists alternatives for symmetrizing a Csiszar-function. For example,
+
+  ```none
+  f_g(u) = max(f(u), f^*(u)),
+  ```
+
+  where `f^*` is the dual Csiszar-function, also implies a symmetric
+  f-Divergence.
+
+  Example:
+
+  When either of the following functions are symmetrized, we obtain the
+  Jensen-Shannon Csiszar-function, i.e.,
+
+  ```none
+  g(u) = -log(u) - (1 + u) log((1 + u) / 2) + u - 1
+  h(u) = log(4) + 2 u log(u / (1 + u))
+  ```
+
+  implies,
+
+  ```none
+  f_g(u) = f_h(u) = u log(u) - (1 + u) log((1 + u) / 2)
+         = jensen_shannon(log(u)).
+  ```
+
+  Warning: this function makes non-log-space calculations and may therefore be
+  numerically unstable for `|logu| >> 0`.
+
+  Args:
+    logu: `float`-like `Tensor` representing `log(u)` from above.
+    csiszar_function: Python `callable` representing a Csiszar-function over
+      log-domain.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    symmetrized_g_of_u: `float`-like `Tensor` of the result of applying the
+      symmetrization of `g` evaluated at `u = exp(logu)`.
+  """
+
+  with ops.name_scope(name, "symmetrized_csiszar_function", [logu]):
+    logu = ops.convert_to_tensor(logu, name="logu")
+    return 0.5 * (csiszar_function(logu)
+                  + dual_csiszar_function(logu, csiszar_function))
+
+
+def monte_carlo_csiszar_f_divergence(
+    f,
+    p_log_prob,
+    q,
+    num_draws,
+    use_reparametrization=None,
+    seed=None,
+    name=None):
+  """Monte-Carlo approximation of the Csiszar f-Divergence.
+
+  A Csiszar-function is a member of,
+
+  ```none
+  F = { f:R_+ to R : f convex }.
+  ```
+
+  The Csiszar f-Divergence for Csiszar-function f is given by:
+
+  ```none
+  D_f[p(X), q(X)] := E_{q(X)}[ f( p(X) / q(X) ) ]
+                  ~= m**-1 sum_j^m f( p(x_j) / q(x_j) ),
+                             where x_j ~iid q(X)
+  ```
+
+  Tricks: Reparameterization and Score-Gradient
+
+  When q is "reparameterized", i.e., a diffeomorphic transformation of a
+  parameterless distribution (e.g.,
+  `Normal(Y; m, s) <=> Y = sX + m, X ~ Normal(0,1)`), we can swap gradient and
+  expectation, i.e.,
+  `grad[Avg{ s_i : i=1...n }] = Avg{ grad[s_i] : i=1...n }` where `S_n=Avg{s_i}`
+  and `s_i = f(x_i), x_i ~iid q(X)`.
+
+  However, if q is not reparameterized, TensorFlow's gradient will be incorrect
+  since the chain-rule stops at samples of unreparameterized distributions. In
+  this circumstance using the Score-Gradient trick results in an unbiased
+  gradient, i.e.,
+
+  ```none
+  grad[ E_q[f(X)] ]
+  = grad[ int dx q(x) f(x) ]
+  = int dx grad[ q(x) f(x) ]
+  = int dx [ q'(x) f(x) + q(x) f'(x) ]
+  = int dx q(x) [q'(x) / q(x) f(x) + f'(x) ]
+  = int dx q(x) grad[ f(x) q(x) / stop_grad[q(x)] ]
+  = E_q[ grad[ f(x) q(x) / stop_grad[q(x)] ] ]
+  ```
+
+  Unless `q.reparameterization_type != distribution.FULLY_REPARAMETERIZED` it is
+  usually preferable to set `use_reparametrization = True`.
+
+  Example Application:
+
+  The Csiszar f-Divergence is a useful framework for variational inference.
+  I.e., observe that,
+
+  ```none
+  f(p(x)) =  f( E_{q(Z | x)}[ p(x, Z) / q(Z | x) ] )
+          <= E_{q(Z | x)}[ f( p(x, Z) / q(Z | x) ) ]
+          := D_f[p(x, Z), q(Z | x)]
+  ```
+
+  The inequality follows from the fact that the "perspective" of `f`, i.e.,
+  `(s, t) |-> t f(s / t))`, is convex in `(s, t)` when `s/t in domain(f)` and
+  `t` is a real. Since the above framework includes the popular Evidence Lower
+  BOund (ELBO) as a special case, i.e., `f(u) = -log(u)`, we call this framework
+  "Evidence Divergence Bound Optimization" (EDBO).
+
+  Args:
+    f: Python `callable` representing a Csiszar-function in log-space, i.e.,
+      takes `p_log_prob(q_samples) - q.log_prob(q_samples)`.
+    p_log_prob: Python `callable` taking (a batch of) samples from `q` and
+      returning the the natural-log of the probability under distribution `p`.
+      (In variational inference `p` is the joint distribution.)
+    q: `tf.Distribution`-like instance; must implement:
+      `reparameterization_type`, `sample(n, seed)`, and `log_prob(x)`.
+      (In variational inference `q` is the approximate posterior distribution.)
+    num_draws: Integer scalar number of draws used to approximate the
+      f-Divergence expectation.
+    use_reparametrization: Python `bool`. When `None` (the default),
+      automatically set to:
+      `q.reparameterization_type == distribution.FULLY_REPARAMETERIZED`.
+      When `True` uses the standard Monte-Carlo average. When `False` uses the
+      score-gradient trick. (See above for details.)  When `False`, consider
+      using `csiszar_vimco`.
+    seed: Python `int` seed for `q.sample`.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    monte_carlo_csiszar_f_divergence: `float`-like `Tensor` Monte Carlo
+      approximation of the Csiszar f-Divergence.
+
+  Raises:
+    ValueError: if `q` is not a reparameterized distribution and
+      `use_reparametrization = True`. A distribution `q` is said to be
+      "reparameterized" when its samples are generated by transforming the
+      samples of another distribution which does not depend on the
+      parameterization of `q`. This property ensures the gradient (with respect
+      to parameters) is valid.
+    TypeError: if `p_log_prob` is not a Python `callable`.
+  """
+  with ops.name_scope(name, "monte_carlo_csiszar_f_divergence", [num_draws]):
+    if use_reparametrization is None:
+      use_reparametrization = (q.reparameterization_type
+                               == distribution.FULLY_REPARAMETERIZED)
+    elif (use_reparametrization and
+          q.reparameterization_type != distribution.FULLY_REPARAMETERIZED):
+      # TODO(jvdillon): Consider only raising an exception if the gradient is
+      # requested.
+      raise ValueError(
+          "Distribution `q` must be reparameterized, i.e., a diffeomorphic "
+          "transformation of a parameterless distribution. (Otherwise this "
+          "function has a biased gradient.)")
+    if not callable(p_log_prob):
+      raise TypeError("`p_log_prob` must be a Python `callable` function.")
+    return monte_carlo.expectation(
+        f=lambda q_samples: f(p_log_prob(q_samples) - q.log_prob(q_samples)),
+        samples=q.sample(num_draws, seed=seed),
+        log_prob=q.log_prob,  # Only used if use_reparametrization=False.
+        use_reparametrization=use_reparametrization)
+
+
+def csiszar_vimco(f,
+                  p_log_prob,
+                  q,
+                  num_draws,
+                  num_batch_draws=1,
+                  seed=None,
+                  name=None):
+  """Use VIMCO to lower the variance of gradient[csiszar_function(Avg(logu))].
+
+  This function generalizes "Variational Inference for Monte Carlo Objectives"
+  (VIMCO), i.e., https://arxiv.org/abs/1602.06725, to Csiszar f-Divergences.
+
+  Note: if `q.reparameterization_type = distribution.FULLY_REPARAMETERIZED`,
+  consider using `monte_carlo_csiszar_f_divergence`.
+
+  The VIMCO loss is:
+
+  ```none
+  vimco = f(Avg{logu[i] : i=0,...,m-1})
+  where,
+    logu[i] = log( p(x, h[i]) / q(h[i] | x) )
+    h[i] iid~ q(H | x)
+  ```
+
+  Interestingly, the VIMCO gradient is not the naive gradient of `vimco`.
+  Rather, it is characterized by:
+
+  ```none
+  grad[vimco] - variance_reducing_term
+  where,
+    variance_reducing_term = Sum{ grad[log q(h[i] | x)] *
+                                    (vimco - f(log Avg{h[j;i] : j=0,...,m-1}))
+                                 : i=0, ..., m-1 }
+    h[j;i] = { u[j]                             j!=i
+             { GeometricAverage{ u[k] : k!=i}   j==i
+  ```
+
+  (We omitted `stop_gradient` for brevity. See implementation for more details.)
+
+  The `Avg{h[j;i] : j}` term is a kind of "swap-out average" where the `i`-th
+  element has been replaced by the leave-`i`-out Geometric-average.
+
+  Args:
+    f: Python `callable` representing a Csiszar-function in log-space.
+    p_log_prob: Python `callable` representing the natural-log of the
+      probability under distribution `p`. (In variational inference `p` is the
+      joint distribution.)
+    q: `tf.Distribution`-like instance; must implement: `sample(n, seed)`, and
+      `log_prob(x)`. (In variational inference `q` is the approximate posterior
+      distribution.)
+    num_draws: Integer scalar number of draws used to approximate the
+      f-Divergence expectation.
+    num_batch_draws: Integer scalar number of draws used to approximate the
+      f-Divergence expectation.
+    seed: Python `int` seed for `q.sample`.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    vimco: The Csiszar f-Divergence generalized VIMCO objective.
+
+  Raises:
+    ValueError: if `num_draws < 2`.
+  """
+  with ops.name_scope(name, "csiszar_vimco", [num_draws, num_batch_draws]):
+    if num_draws < 2:
+      raise ValueError("Must specify num_draws > 1.")
+    stop = array_ops.stop_gradient  # For readability.
+    x = stop(q.sample(sample_shape=[num_draws, num_batch_draws],
+                      seed=seed))
+    logqx = q.log_prob(x)
+    logu = p_log_prob(x) - logqx
+    f_log_avg_u, f_log_sooavg_u = [f(r) for r in csiszar_vimco_helper(logu)]
+    dotprod = math_ops.reduce_sum(
+        logqx * stop(f_log_avg_u - f_log_sooavg_u),
+        axis=0)  # Sum over iid samples.
+    # We now rewrite f_log_avg_u so that:
+    #   `grad[f_log_avg_u] := grad[f_log_avg_u + dotprod]`.
+    # To achieve this, we use a trick that
+    #   `f(x) - stop(f(x)) == zeros_like(f(x))`
+    # but its gradient is grad[f(x)].
+    # Note that IEEE754 specifies that `x - x == 0.` and `x + 0. == x`, hence
+    # this trick loses no precision. For more discussion regarding the relevant
+    # portions of the IEEE754 standard, see the StackOverflow question,
+    # "Is there a floating point value of x, for which x-x == 0 is false?"
+    # http://stackoverflow.com/q/2686644
+    f_log_avg_u += dotprod - stop(dotprod)  # Add zeros_like(dot_prod).
+    return math_ops.reduce_mean(f_log_avg_u, axis=0)  # Avg over batches.
+
+
+def csiszar_vimco_helper(logu, name=None):
+  """Helper to `csiszar_vimco`; computes `log_avg_u`, `log_sooavg_u`.
+
+  `axis = 0` of `logu` is presumed to correspond to iid samples from `q`, i.e.,
+
+  ```none
+  logu[j] = log(u[j])
+  u[j] = p(x, h[j]) / q(h[j] | x)
+  h[j] iid~ q(H | x)
+  ```
+
+  Args:
+    logu: Floating-type `Tensor` representing `log(p(x, h) / q(h | x))`.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    log_avg_u: `logu.dtype` `Tensor` corresponding to the natural-log of the
+      average of `u`.
+    log_sooavg_u: `logu.dtype` `Tensor` characterized by the natural-log of the
+      average of `u`` except that the average swaps-out `u[i]` for the
+      leave-`i`-out Geometric-average, i.e.,
+
+      ```none
+      log_sooavg_u[i] = log(Avg{h[j ; i] : j=0, ..., m-1})
+      h[j ; i] = { u[j]                              j!=i
+                 { GeometricAverage{u[k] : k != i}   j==i
+      ```
+
+  """
+  with ops.name_scope(name, "csiszar_vimco_helper", [logu]):
+    logu = ops.convert_to_tensor(logu, name="logu")
+
+    n = logu.shape.with_rank_at_least(1)[0].value
+    if n is None:
+      n = array_ops.shape(logu)[0]
+      log_n = math_ops.log(math_ops.cast(n, dtype=logu.dtype))
+      nm1 = math_ops.cast(n - 1, dtype=logu.dtype)
+    else:
+      log_n = np.log(n).astype(logu.dtype.as_numpy_dtype)
+      nm1 = np.asarray(n - 1, dtype=logu.dtype.as_numpy_dtype)
+
+    # Throughout we reduce across axis=0 since this is presumed to be iid
+    # samples.
+
+    log_sum_u = math_ops.reduce_logsumexp(logu, axis=0)
+
+    # log_loosum_u[i] =
+    # = logsumexp(logu[j] : j != i)
+    # = log( exp(logsumexp(logu)) - exp(logu[i]) )
+    # = log( exp(logsumexp(logu - logu[i])) exp(logu[i])  - exp(logu[i]))
+    # = logu[i] + log(exp(logsumexp(logu - logu[i])) - 1)
+    # = logu[i] + softplus_inverse(logsumexp(logu - logu[i]))
+    log_loosum_u = logu + distribution_util.softplus_inverse(log_sum_u - logu)
+
+    # The swap-one-out-sum ("soosum") is n different sums, each of which
+    # replaces the i-th item with the i-th-left-out average, i.e.,
+    # soo_sum_u[i] = [exp(logu) - exp(logu[i])] + exp(mean(logu[!=i]))
+    #              =  exp(log_loosum_u[i])      + exp(looavg_logu[i])
+    looavg_logu = (math_ops.reduce_sum(logu, axis=0) - logu) / nm1
+    log_soosum_u = math_ops.reduce_logsumexp(
+        array_ops.stack([log_loosum_u, looavg_logu]),
+        axis=0)
+
+    return log_sum_u - log_n, log_soosum_u - log_n
diff --git a/tensorflow/contrib/bayesflow/python/ops/custom_grad.py b/tensorflow/contrib/bayesflow/python/ops/custom_grad.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca1ecb9c40204c3c723fa3423cfe148e823adc28
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/custom_grad.py
@@ -0,0 +1,34 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions for specifying custom gradients.
+
+See ${python/contrib.bayesflow.custom_gradient}.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import
+from tensorflow.contrib.bayesflow.python.ops.custom_grad_impl import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'custom_gradient',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py b/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee3719232d8796c338247320fd8ef832a41df12b
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py
@@ -0,0 +1,110 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions for specifying custom gradients.
+
+@@custom_gradient
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+__all__ = [
+    "custom_gradient",
+]
+
+
+def custom_gradient(fx, gx, x, axis=(),
+                    fx_gx_manually_stopped=False,
+                    name=None):
+  """Enables specifying a custom gradient.
+
+  This function works by clever application of `stop_gradient`. I.e., observe
+  that:
+
+  ```none
+  h(x) = x * stop_gradient(g(x)) + stop_gradient(f(x) - x * g(x))
+  ```
+
+  is such that `h(x) = stop(f(x))` and `grad[h(x), x] = stop_gradient(g(x)).`
+
+  In addition to scalar-domain/scalar-range functions, this function also
+  supports tensor-domain/scalar-range functions. However, in the latter case it
+  is necessary to reduce `x` to a scalar. This can be done by indicating the
+  `axis` over which `f` operates or by appropriately `reduce_sum`-ing `x`, prior
+  to calling this function.
+
+  Partial Custom Gradient:
+
+  Suppose `h(x) = htilde(x, y)`. Note that `dh/dx = stop(g(x))` but `dh/dy =
+  None`. This is because a `Tensor` cannot have only a portion of its gradient
+  stopped. To circumvent this issue, one must manually `stop_gradient` the
+  relevant portions of `f`, `g`. For example see the unit-test,
+  `test_works_correctly_fx_gx_manually_stopped`.
+
+  Args:
+    fx: `Tensor`. Output of function evaluated at `x`.
+    gx: `Tensor`. Gradient of function evaluated at `x`.
+    x: `Tensor`. Point of evaluation for `f, g`.
+    axis: 1D `int` `Tensor` representing dimensions of `x` which are the domain
+      of `f`. If `()` (the default), `f` is assumed scalar-domain/scalar-range.
+      If `None` `f` is assumed to render one scalar given all of `x`. Otherwise
+      `f` is assumed to output one scalar for each of `axis` dimensions of `x`.
+    fx_gx_manually_stopped: Python `bool` indicating that `fx`, `gx` manually
+      have `stop_gradient` applied.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    fx: Floating-type `Tensor` equal to `f(x)` but which has gradient
+      `stop_gradient(g(x))`.
+  """
+  with ops.name_scope(name, "custom_gradient", [fx, gx, x]):
+    fx = ops.convert_to_tensor(fx, name="fx")
+    # We don't want to bother eagerly computing `gx` since we may not even need
+    # it.
+    with ops.control_dependencies([fx]):
+      gx = ops.convert_to_tensor(gx, dtype=fx.dtype, name="gx")
+      gx = array_ops.identity(gx, name="gx")
+    # Proof of correctness:
+    #
+    #  f(x) = x * stop[gx] + stop[fx - x * gx]
+    #       = stop[fx]
+    #
+    #  g(x) = grad[fx]
+    #       = stop[gx] + grad[stop[fx - x * gx]]
+    #       = stop[gx] + 0
+    #
+    # Notice that when x is zero it still works:
+    # grad[x * stop(gx) + stop(fx - x * gx)] = 1 * stop[gx] + 0 = stop[gx]
+    #
+    # The proof is similar for the tensor-domain case, except that `x` is
+    # replaced by `reduce_sum(x)`.
+    sum_x = math_ops.reduce_sum(x, axis=axis, name="sum_x")
+    if not fx_gx_manually_stopped:
+      fx = array_ops.stop_gradient(fx)
+      gx = array_ops.stop_gradient(gx)
+    # IEEE754 ensures `(x-x)==0.` and that `0.*x==0.` so we make sure to write
+    # the code this way, rather than, e.g.,
+    # `sum_x * stop(gx) + stop(fx - sum_x * gx)`.
+    # For more discussion regarding the relevant portions of the IEEE754
+    # standard, see the StackOverflow question,
+    # "Is there a floating point value of x, for which x-x == 0 is false?"
+    # http://stackoverflow.com/q/2686644
+    return (sum_x - array_ops.stop_gradient(sum_x)) * gx + fx
diff --git a/tensorflow/contrib/bayesflow/python/ops/entropy_impl.py b/tensorflow/contrib/bayesflow/python/ops/entropy_impl.py
index f155de5032be8fc4477e0c71ca634a32c0d922d1..4a7679fb436b91c9ae70daf85552099e5b710cbc 100644
--- a/tensorflow/contrib/bayesflow/python/ops/entropy_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/entropy_impl.py
@@ -195,8 +195,9 @@ def entropy_shannon(p,
     # Sample path
     if entropy is None:
       logging.info('Using sampled entropy(p:%s)', p)
-      entropy = -1. * monte_carlo.expectation(
-          p.log_prob, p, z=z, n=n, seed=seed)
+      if z is None:
+        z = p.sample(n, seed=seed)
+      entropy = -monte_carlo.expectation(p.log_prob, z)
 
     return entropy
 
diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
index 3590f940acfb05ee7a13f59837f6a5ca90c41cb5..985177e897f443989e466d1a498c461a30aeb5cb 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
@@ -194,61 +194,161 @@ def _logspace_mean(log_values):
   return log_mean_of_values
 
 
-def expectation(f, p, z=None, n=None, seed=None, name='expectation'):
-  r"""Monte Carlo estimate of an expectation:  `E_p[f(Z)]` with sample mean.
+def expectation(f, samples, log_prob=None, use_reparametrization=True,
+                axis=0, keep_dims=False, name=None):
+  """Computes the Monte-Carlo approximation of `E_p[f(X)]`.
 
-  This `Op` returns
+  This function computes the Monte-Carlo approximation of an expectation, i.e.,
 
+  ```none
+  E_p[f(X)] approx= m**-1 sum_i^m f(x_j),  x_j ~iid p(X)
   ```
-  n^{-1} sum_{i=1}^n f(z_i),  where z_i ~ p
-  \approx E_p[f(Z)]
+
+  where:
+
+  - `x_j = samples[j, ...]`,
+  - `log(p(samples)) = log_prob(samples)` and
+  - `m = prod(shape(samples)[axis])`.
+
+  Tricks: Reparameterization and Score-Gradient
+
+  When p is "reparameterized", i.e., a diffeomorphic transformation of a
+  parameterless distribution (e.g.,
+  `Normal(Y; m, s) <=> Y = sX + m, X ~ Normal(0,1)`), we can swap gradient and
+  expectation, i.e.,
+  `grad[ Avg{ s_i : i=1...n } ] = Avg{ grad[s_i] : i=1...n }` where
+  `S_n = Avg{s_i}` and `s_i = f(x_i), x_i ~ p`.
+
+  However, if p is not reparameterized, TensorFlow's gradient will be incorrect
+  since the chain-rule stops at samples of non-reparameterized distributions.
+  (The non-differentiated result, `approx_expectation`, is the same regardless
+  of `use_reparametrization`.) In this circumstance using the Score-Gradient
+  trick results in an unbiased gradient, i.e.,
+
+  ```none
+  grad[ E_p[f(X)] ]
+  = grad[ int dx p(x) f(x) ]
+  = int dx grad[ p(x) f(x) ]
+  = int dx [ p'(x) f(x) + p(x) f'(x) ]
+  = int dx p(x) [p'(x) / p(x) f(x) + f'(x) ]
+  = int dx p(x) grad[ f(x) p(x) / stop_grad[p(x)] ]
+  = E_p[ grad[ f(x) p(x) / stop_grad[p(x)] ] ]
   ```
 
-  User supplies either `Tensor` of samples `z`, or number of samples to draw `n`
+  Unless p is not reparametrized, it is usually preferable to
+  `use_reparametrization = True`.
 
-  Args:
-    f: Callable mapping samples from `p` to `Tensors`.
-    p:  `tf.contrib.distributions.Distribution`.
-    z:  `Tensor` of samples from `p`, produced by `p.sample` for some `n`.
-    n:  Integer `Tensor`.  Number of samples to generate if `z` is not provided.
-    seed:  Python integer to seed the random number generator.
-    name:  A name to give this `Op`.
-
-  Returns:
-    A `Tensor` with the same `dtype` as `p`.
+  Warning: users are responsible for verifying `p` is a "reparameterized"
+  distribution.
 
-  Example:
+  Example Use:
 
   ```python
-  N_samples = 10000
-
-  distributions = tf.contrib.distributions
+  bf = tf.contrib.bayesflow
+  ds = tf.contrib.distributions
+
+  # Monte-Carlo approximation of a reparameterized distribution, e.g., Normal.
+
+  num_draws = int(1e5)
+  p = ds.Normal(loc=0., scale=1.)
+  q = ds.Normal(loc=1., scale=2.)
+  exact_kl_normal_normal = ds.kl_divergence(p, q)
+  # ==> 0.44314718
+  approx_kl_normal_normal = bf.expectation(
+      f=lambda x: p.log_prob(x) - q.log_prob(x),
+      samples=p.sample(num_draws, seed=42),
+      log_prob=p.log_prob,
+      use_reparametrization=(p.reparameterization_type
+                             == distribution.FULLY_REPARAMETERIZED))
+  # ==> 0.44632751
+  # Relative Error: <1%
+
+  # Monte-Carlo approximation of non-reparameterized distribution, e.g., Gamma.
+
+  num_draws = int(1e5)
+  p = ds.Gamma(concentration=1., rate=1.)
+  q = ds.Gamma(concentration=2., rate=3.)
+  exact_kl_gamma_gamma = ds.kl_divergence(p, q)
+  # ==> 0.37999129
+  approx_kl_gamma_gamma = bf.expectation(
+      f=lambda x: p.log_prob(x) - q.log_prob(x),
+      samples=p.sample(num_draws, seed=42),
+      log_prob=p.log_prob,
+      use_reparametrization=(p.reparameterization_type
+                             == distribution.FULLY_REPARAMETERIZED))
+  # ==> 0.37696719
+  # Relative Error: <1%
+
+  # For comparing the gradients, see `monte_carlo_test.py`.
+  ```
 
-  dist = distributions.Uniform([0.0, 0.0], [1.0, 2.0])
-  elementwise_mean = lambda x: x
-  mean_sum = lambda x: tf.reduce_sum(x, 1)
+  Note: The above example is for illustration only. To compute approximate
+  KL-divergence, the following is preferred:
 
-  estimate_elementwise_mean_tf = monte_carlo.expectation(elementwise_mean,
-                                                         dist,
-                                                         n=N_samples)
-  estimate_mean_sum_tf = monte_carlo.expectation(mean_sum,
-                                                 dist,
-                                                 n=N_samples)
+  ```python
+  approx_kl_p_q = bf.monte_carlo_csiszar_f_divergence(
+      f=bf.kl_reverse,
+      p_log_prob=q.log_prob,
+      q=p,
+      num_draws=num_draws)
+  ```
 
-  with tf.Session() as sess:
-    estimate_elementwise_mean, estimate_mean_sum = (
-        sess.run([estimate_elementwise_mean_tf, estimate_mean_sum_tf]))
-  print estimate_elementwise_mean
-  >>> np.array([ 0.50018013  1.00097895], dtype=np.float32)
-  print estimate_mean_sum
-  >>> 1.49571
+  Args:
+    f: Python callable which can return `f(samples)`.
+    samples: `Tensor` of samples used to form the Monte-Carlo approximation of
+      `E_p[f(X)]`.  A batch of samples should be indexed by `axis` dimensions.
+    log_prob: Python callable which can return `log_prob(samples)`. Must
+      correspond to the natural-logarithm of the pdf/pmf of each sample. Only
+      required/used if `use_reparametrization=False`.
+      Default value: `None`.
+    use_reparametrization: Python `bool` indicating that the approximation
+      should use the fact that the gradient of samples is unbiased. Whether
+      `True` or `False`, this arg only affects the gradient of the resulting
+      `approx_expectation`.
+      Default value: `True`.
+    axis: The dimensions to average. If `None`, averages all
+      dimensions.
+      Default value: `0` (the left-most dimension).
+    keep_dims: If True, retains averaged dimensions using size `1`.
+      Default value: `False`.
+    name: A `name_scope` for operations created by this function.
+      Default value: `None` (which implies "expectation").
 
-  ```
+  Returns:
+    approx_expectation: `Tensor` corresponding to the Monte-Carlo approximation
+      of `E_p[f(X)]`.
 
+  Raises:
+    ValueError: if `f` is not a Python `callable`.
+    ValueError: if `use_reparametrization=False` and `log_prob` is not a Python
+      `callable`.
   """
-  with ops.name_scope(name, values=[n, z]):
-    z = _get_samples(p, z, n, seed)
-    return _sample_mean(f(z))
+
+  with ops.name_scope(name, 'expectation', [samples]):
+    if not callable(f):
+      raise ValueError('`f` must be a callable function.')
+    if use_reparametrization:
+      return math_ops.reduce_mean(f(samples), axis=axis, keep_dims=keep_dims)
+    else:
+      if not callable(log_prob):
+        raise ValueError('`log_prob` must be a callable function.')
+      stop = array_ops.stop_gradient  # For readability.
+      x = stop(samples)
+      logpx = log_prob(x)
+      fx = f(x)  # Call `f` once in case it has side-effects.
+      # We now rewrite f(x) so that:
+      #   `grad[f(x)] := grad[f(x)] + f(x) * grad[logqx]`.
+      # To achieve this, we use a trick that
+      #   `h(x) - stop(h(x)) == zeros_like(h(x))`
+      # but its gradient is grad[h(x)].
+      # Note that IEEE754 specifies that `x - x == 0.` and `x + 0. == x`, hence
+      # this trick loses no precision. For more discussion regarding the
+      # relevant portions of the IEEE754 standard, see the StackOverflow
+      # question,
+      # "Is there a floating point value of x, for which x-x == 0 is false?"
+      # http://stackoverflow.com/q/2686644
+      fx += stop(fx) * (logpx - stop(logpx))  # Add zeros_like(logpx).
+      return math_ops.reduce_mean(fx, axis=axis, keep_dims=keep_dims)
 
 
 def _sample_mean(values):
diff --git a/tensorflow/contrib/boosted_trees/BUILD b/tensorflow/contrib/boosted_trees/BUILD
index c1600bdabd75028939e4a1940eb8b775bacd1672..b6c4db141699c19cb2ffd4820970ed4157ccbecf 100644
--- a/tensorflow/contrib/boosted_trees/BUILD
+++ b/tensorflow/contrib/boosted_trees/BUILD
@@ -7,15 +7,735 @@ package(default_visibility = [
     "//visibility:public",
 ])
 
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+
 filegroup(
     name = "all_files",
     srcs = glob(
         ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
+        exclude = ["**/OWNERS"],
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
 
 package_group(name = "friends")
+
+cc_library(
+    name = "boosted_trees_kernels",
+    deps = [
+        ":ensemble_optimizer_ops_kernels",
+        ":model_ops_kernels",
+        ":prediction_ops_kernels",
+        ":quantile_ops_kernels",
+        ":split_handler_ops_kernels",
+        ":stats_accumulator_ops_kernels",
+        ":training_ops_kernels",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "boosted_trees_ops_op_lib",
+    deps = [
+        ":ensemble_optimizer_ops_op_lib",
+        ":model_ops_op_lib",
+        ":prediction_ops_op_lib",
+        ":quantile_ops_op_lib",
+        ":split_handler_ops_op_lib",
+        ":stats_accumulator_ops_op_lib",
+        ":training_ops_op_lib",
+    ],
+)
+
+py_library(
+    name = "init_py",
+    srcs = [
+        "__init__.py",
+        "python/__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":boosted_trees_ops_py",
+        ":losses",
+    ],
+)
+
+py_library(
+    name = "losses",
+    srcs = ["python/utils/losses.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:nn",
+    ],
+)
+
+py_test(
+    name = "losses_test",
+    size = "small",
+    srcs = ["python/utils/losses_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "nomac",  # b/63258195
+    ],
+    deps = [
+        ":losses",
+        "//tensorflow/python:framework_test_lib",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "gbdt_batch",
+    srcs = [
+        "python/training/functions/gbdt_batch.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/boosted_trees:batch_ops_utils_py",
+        "//tensorflow/contrib/boosted_trees:boosted_trees_ops_py",
+        "//tensorflow/contrib/boosted_trees/lib:categorical_split_handler",
+        "//tensorflow/contrib/boosted_trees/lib:ordinal_split_handler",
+        "//tensorflow/contrib/boosted_trees/proto:learner_proto_py",
+        "//tensorflow/contrib/learn",
+        "//tensorflow/contrib/stateless",
+    ],
+)
+
+py_test(
+    name = "gbdt_batch_test",
+    size = "small",
+    srcs = ["python/training/functions/gbdt_batch_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "nomac",  # b/63258195
+    ],
+    deps = [
+        ":gbdt_batch",
+        ":losses",
+        "//tensorflow/contrib/boosted_trees/proto:learner_proto_py",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_py",
+        "//tensorflow/python:framework_test_lib",
+        "//third_party/py/numpy",
+    ],
+)
+
+# Kernel tests
+
+py_test(
+    name = "ensemble_optimizer_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/ensemble_optimizer_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "nomac",  # b/63258195
+    ],
+    deps = [
+        ":ensemble_optimizer_ops_py",
+        ":model_ops_py",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "model_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/model_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "nomac",  # b/63258195
+    ],
+    deps = [
+        ":ensemble_optimizer_ops_py",
+        ":model_ops_py",
+        ":prediction_ops_py",
+        "//tensorflow/contrib/boosted_trees/proto:learner_proto_py",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "prediction_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/prediction_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "nomac",  # b/63258195
+    ],
+    deps = [
+        ":model_ops_py",
+        ":prediction_ops_py",
+        "//tensorflow/contrib/boosted_trees/proto:learner_proto_py",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_py",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resources",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "quantile_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/quantile_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "nomac",  # b/63258195
+    ],
+    deps = [
+        ":quantile_ops_py",
+        "//tensorflow/contrib/boosted_trees/proto:quantiles_proto_py",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "split_handler_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/split_handler_ops_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":split_handler_ops_py",
+        "//tensorflow/contrib/boosted_trees/proto:learner_proto_py",
+        "//tensorflow/contrib/boosted_trees/proto:split_info_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "stats_accumulator_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/stats_accumulator_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "nomac",  # b/63258195
+    ],
+    deps = [
+        ":stats_accumulator_ops_py",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
+py_test(
+    name = "training_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/training_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "nomac",  # b/63258195
+    ],
+    deps = [
+        ":model_ops_py",
+        ":training_ops_py",
+        "//tensorflow/contrib/boosted_trees/proto:learner_proto_py",
+        "//tensorflow/contrib/boosted_trees/proto:split_info_proto_py",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resources",
+        "//third_party/py/numpy",
+    ],
+)
+
+# Ops
+
+py_library(
+    name = "batch_ops_utils_py",
+    srcs = ["python/ops/batch_ops_utils.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
+py_library(
+    name = "boosted_trees_ops_py",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ensemble_optimizer_ops_py",
+        ":model_ops_py",
+        ":prediction_ops_py",
+        ":quantile_ops_py",
+        ":split_handler_ops_py",
+        ":stats_accumulator_ops_py",
+        ":training_ops_py",
+    ],
+)
+
+# Model Ops.
+tf_gen_op_libs(
+    op_lib_names = ["model_ops"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_model_ops_py",
+    out = "python/ops/gen_model_ops.py",
+    deps = [":model_ops_op_lib"],
+)
+
+tf_custom_op_py_library(
+    name = "model_ops_py",
+    srcs = ["python/ops/model_ops.py"],
+    dso = [":python/ops/_model_ops.so"],
+    kernels = [
+        ":model_ops_kernels",
+        ":model_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gen_model_ops_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:resources",
+    ],
+)
+
+tf_kernel_library(
+    name = "model_ops_kernels",
+    srcs = [
+        "kernels/model_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/boosted_trees/lib:utils",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
+        "//tensorflow/contrib/boosted_trees/resources:decision_tree_ensemble_resource",
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+    ],
+    alwayslink = 1,
+)
+
+tf_custom_op_library(
+    name = "python/ops/_model_ops.so",
+    srcs = [
+        "kernels/model_ops.cc",
+        "ops/model_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/boosted_trees/lib:utils",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
+        "//tensorflow/contrib/boosted_trees/resources:decision_tree_ensemble_resource",
+    ],
+)
+
+# Split handler Ops.
+tf_gen_op_libs(
+    op_lib_names = ["split_handler_ops"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_split_handler_ops_py",
+    out = "python/ops/gen_split_handler_ops.py",
+    deps = [
+        ":split_handler_ops_op_lib",
+    ],
+)
+
+tf_custom_op_py_library(
+    name = "split_handler_ops_py",
+    srcs = ["python/ops/split_handler_ops.py"],
+    dso = [":python/ops/_split_handler_ops.so"],
+    kernels = [
+        ":split_handler_ops_kernels",
+        ":split_handler_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gen_split_handler_ops_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+)
+
+tf_custom_op_library(
+    name = "python/ops/_split_handler_ops.so",
+    srcs = [
+        "kernels/split_handler_ops.cc",
+        "ops/split_handler_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/boosted_trees/lib:feature-column-handlers",
+        "//tensorflow/contrib/boosted_trees/proto:split_info_proto_cc",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "split_handler_ops_kernels",
+    srcs = [
+        "kernels/split_handler_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/boosted_trees/lib:feature-column-handlers",
+        "//tensorflow/contrib/boosted_trees/proto:split_info_proto_cc",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+    ],
+    alwayslink = 1,
+)
+
+# Training Ops.
+tf_gen_op_libs(
+    op_lib_names = [
+        "training_ops",
+    ],
+    deps = ["//tensorflow/contrib/boosted_trees/proto:learner_proto_cc"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_training_ops_py",
+    out = "python/ops/gen_training_ops.py",
+    deps = [
+        ":training_ops_op_lib",
+    ],
+)
+
+tf_custom_op_py_library(
+    name = "training_ops_py",
+    srcs = ["python/ops/training_ops.py"],
+    dso = [":python/ops/_training_ops.so"],
+    kernels = [
+        ":training_ops_kernels",
+        ":training_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gen_training_ops_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+)
+
+tf_custom_op_library(
+    name = "python/ops/_training_ops.so",
+    srcs = [
+        "kernels/training_ops.cc",
+        "ops/training_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/boosted_trees/lib:utils",
+        "//tensorflow/contrib/boosted_trees/proto:learner_proto_cc",
+        "//tensorflow/contrib/boosted_trees/proto:split_info_proto_cc",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
+        "//tensorflow/contrib/boosted_trees/resources:decision_tree_ensemble_resource",
+    ],
+)
+
+tf_kernel_library(
+    name = "training_ops_kernels",
+    srcs = [
+        "kernels/training_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/boosted_trees/lib:utils",
+        "//tensorflow/contrib/boosted_trees/proto:learner_proto_cc",
+        "//tensorflow/contrib/boosted_trees/proto:split_info_proto_cc",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
+        "//tensorflow/contrib/boosted_trees/resources:decision_tree_ensemble_resource",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+    ],
+    alwayslink = 1,
+)
+
+# Prediction Ops.
+tf_gen_op_libs(
+    op_lib_names = ["prediction_ops"],
+    deps = ["//tensorflow/contrib/boosted_trees/proto:learner_proto_cc"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_prediction_ops_py",
+    out = "python/ops/gen_prediction_ops.py",
+    deps = [
+        ":prediction_ops_op_lib",
+    ],
+)
+
+tf_custom_op_py_library(
+    name = "prediction_ops_py",
+    srcs = ["python/ops/prediction_ops.py"],
+    dso = [":python/ops/_prediction_ops.so"],
+    kernels = [
+        ":prediction_ops_kernels",
+        ":prediction_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gen_prediction_ops_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+)
+
+tf_custom_op_library(
+    name = "python/ops/_prediction_ops.so",
+    srcs = [
+        "kernels/prediction_ops.cc",
+        "ops/prediction_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/boosted_trees/lib:example_partitioner",
+        "//tensorflow/contrib/boosted_trees/lib:models",
+        "//tensorflow/contrib/boosted_trees/lib:utils",
+        "//tensorflow/contrib/boosted_trees/proto:learner_proto_cc",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
+        "//tensorflow/contrib/boosted_trees/resources:decision_tree_ensemble_resource",
+    ],
+)
+
+tf_kernel_library(
+    name = "prediction_ops_kernels",
+    srcs = [
+        "kernels/prediction_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/boosted_trees/lib:example_partitioner",
+        "//tensorflow/contrib/boosted_trees/lib:models",
+        "//tensorflow/contrib/boosted_trees/lib:utils",
+        "//tensorflow/contrib/boosted_trees/proto:learner_proto_cc",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
+        "//tensorflow/contrib/boosted_trees/resources:decision_tree_ensemble_resource",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+    ],
+    alwayslink = 1,
+)
+
+# Quantile ops
+tf_gen_op_libs(
+    op_lib_names = ["quantile_ops"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_quantile_ops_py_wrap",
+    out = "python/ops/gen_quantile_ops.py",
+    deps = [
+        ":quantile_ops_op_lib",
+    ],
+)
+
+tf_custom_op_py_library(
+    name = "quantile_ops_py",
+    srcs = ["python/ops/quantile_ops.py"],
+    dso = [":python/ops/_quantile_ops.so"],
+    kernels = [
+        ":quantile_ops_kernels",
+        ":quantile_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":batch_ops_utils_py",
+        ":gen_quantile_ops_py_wrap",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:training",
+    ],
+)
+
+tf_custom_op_library(
+    name = "python/ops/_quantile_ops.so",
+    srcs = [
+        "kernels/quantile_ops.cc",
+        "ops/quantile_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/boosted_trees/lib:utils",
+        "//tensorflow/contrib/boosted_trees/lib:weighted_quantiles",
+        "//tensorflow/contrib/boosted_trees/proto:quantiles_proto_cc",
+        "//tensorflow/contrib/boosted_trees/resources:quantile_stream_resource",
+    ],
+)
+
+tf_kernel_library(
+    name = "quantile_ops_kernels",
+    srcs = [
+        "kernels/quantile_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/boosted_trees/lib:utils",
+        "//tensorflow/contrib/boosted_trees/lib:weighted_quantiles",
+        "//tensorflow/contrib/boosted_trees/proto:quantiles_proto_cc",
+        "//tensorflow/contrib/boosted_trees/resources:quantile_stream_resource",
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+    ],
+    alwayslink = 1,
+)
+
+# Ensemble optimizer ops
+tf_gen_op_libs(
+    op_lib_names = ["ensemble_optimizer_ops"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_ensemble_optimizer_ops_py",
+    out = "python/ops/gen_ensemble_optimizer_ops.py",
+    deps = [
+        ":ensemble_optimizer_ops_op_lib",
+    ],
+)
+
+tf_custom_op_py_library(
+    name = "ensemble_optimizer_ops_py",
+    srcs = ["python/ops/ensemble_optimizer_ops.py"],
+    dso = [":python/ops/_ensemble_optimizer_ops.so"],
+    kernels = [
+        ":ensemble_optimizer_ops_kernels",
+        ":ensemble_optimizer_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gen_ensemble_optimizer_ops_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+)
+
+tf_kernel_library(
+    name = "ensemble_optimizer_ops_kernels",
+    srcs = [
+        "kernels/ensemble_optimizer_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/boosted_trees/lib:utils",
+        "//tensorflow/contrib/boosted_trees/proto:learner_proto_cc",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
+        "//tensorflow/contrib/boosted_trees/resources:decision_tree_ensemble_resource",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+    ],
+    alwayslink = 1,
+)
+
+tf_custom_op_library(
+    name = "python/ops/_ensemble_optimizer_ops.so",
+    srcs = [
+        "kernels/ensemble_optimizer_ops.cc",
+        "ops/ensemble_optimizer_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/boosted_trees/lib:utils",
+        "//tensorflow/contrib/boosted_trees/proto:learner_proto_cc",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
+        "//tensorflow/contrib/boosted_trees/resources:decision_tree_ensemble_resource",
+    ],
+)
+
+# Stats Accumulator ops
+tf_gen_op_libs(
+    op_lib_names = ["stats_accumulator_ops"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_stats_accumulator_ops_py_wrap",
+    out = "python/ops/gen_stats_accumulator_ops.py",
+    deps = [
+        ":stats_accumulator_ops_op_lib",
+    ],
+)
+
+tf_custom_op_py_library(
+    name = "stats_accumulator_ops_py",
+    srcs = ["python/ops/stats_accumulator_ops.py"],
+    dso = [":python/ops/_stats_accumulator_ops.so"],
+    kernels = [
+        ":stats_accumulator_ops_kernels",
+        ":stats_accumulator_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":batch_ops_utils_py",
+        ":gen_stats_accumulator_ops_py_wrap",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:training",
+    ],
+)
+
+tf_custom_op_library(
+    name = "python/ops/_stats_accumulator_ops.so",
+    srcs = [
+        "kernels/stats_accumulator_ops.cc",
+        "ops/stats_accumulator_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/boosted_trees/lib:utils",
+        "//tensorflow/contrib/boosted_trees/resources:stamped_resource",
+    ],
+)
+
+tf_kernel_library(
+    name = "stats_accumulator_ops_kernels",
+    srcs = [
+        "kernels/stats_accumulator_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/boosted_trees/lib:utils",
+        "//tensorflow/contrib/boosted_trees/resources:stamped_resource",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_headers_lib",
+    ],
+    alwayslink = 1,
+)
+
+# Pip
+
+py_library(
+    name = "boosted_trees_pip",
+    deps = [
+        ":init_py",
+        "//tensorflow/contrib/boosted_trees/estimator_batch:init_py",
+        "//tensorflow/contrib/boosted_trees/proto:learner_proto_py",
+        "//tensorflow/contrib/boosted_trees/proto:quantiles_proto_py",
+        "//tensorflow/contrib/boosted_trees/proto:split_info_proto_py",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_py",
+    ],
+)
diff --git a/tensorflow/contrib/boosted_trees/__init__.py b/tensorflow/contrib/boosted_trees/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..afb2d049b7163d5481a4f0621819393ddbfe81ac
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/__init__.py
@@ -0,0 +1,22 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Gradient boosted trees implementation in tensorflow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,wildcard-import
+from tensorflow.contrib.boosted_trees.python import *
+# pylint: enable=unused-import,wildcard-import
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f9e186788f6832b292a690d8d7b04e2f4edd584e
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
@@ -0,0 +1,108 @@
+# This directory contains estimators to train and run inference on
+# gradient boosted trees on top of TensorFlow.
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "init_py",
+    srcs = [
+        "__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "custom_export_strategy",
+        ":custom_loss_head",
+        ":estimator",
+        ":model",
+        ":trainer_hooks",
+    ],
+)
+
+py_library(
+    name = "model",
+    srcs = ["model.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/boosted_trees:gbdt_batch",
+    ],
+)
+
+py_library(
+    name = "trainer_hooks",
+    srcs = ["trainer_hooks.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/learn",
+    ],
+)
+
+py_test(
+    name = "trainer_hooks_test",
+    size = "small",
+    srcs = ["trainer_hooks_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":trainer_hooks",
+    ],
+)
+
+py_library(
+    name = "custom_loss_head",
+    srcs = ["custom_loss_head.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+    ],
+)
+
+py_library(
+    name = "custom_export_strategy",
+    srcs = ["custom_export_strategy.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/boosted_trees:gbdt_batch",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_py",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_extensions_py",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_py",
+        "//tensorflow/contrib/learn",
+    ],
+)
+
+py_test(
+    name = "custom_export_strategy_test",
+    size = "small",
+    srcs = ["custom_export_strategy_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":custom_export_strategy",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_extensions_py",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_py",
+    ],
+)
+
+py_library(
+    name = "estimator",
+    srcs = ["estimator.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":model",
+        ":trainer_hooks",
+    ],
+)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/__init__.py b/tensorflow/contrib/boosted_trees/estimator_batch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..02af4adbd99fdddd26150046ef01d5bf0510e553
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/__init__.py
@@ -0,0 +1,22 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Gradient boosted trees implementation in tensorflow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,wildcard-import
+from tensorflow.contrib.boosted_trees.estimator_batch import *
+# pylint: enable=unused-import,wildcard-import
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..923d27c160d14656271d2bcf4112addbf787ef60
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
@@ -0,0 +1,159 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Strategy to export custom proto formats."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.boosted_trees.proto import tree_config_pb2
+from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
+from tensorflow.contrib.decision_trees.proto import generic_tree_model_extensions_pb2
+from tensorflow.contrib.decision_trees.proto import generic_tree_model_pb2
+from tensorflow.contrib.learn.python.learn import export_strategy
+from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils
+from tensorflow.python.client import session as tf_session
+from tensorflow.python.framework import ops
+from tensorflow.python.saved_model import loader as saved_model_loader
+from tensorflow.python.saved_model import tag_constants
+
+
+def make_custom_export_strategy(name, convert_fn, feature_columns,
+                                export_input_fn):
+  """Makes custom exporter of GTFlow tree format.
+
+  Args:
+    name: A string, for the name of the export strategy.
+    convert_fn: A function that converts the tree proto to desired format and
+      saves it to the desired location.
+    feature_columns: A list of feature columns.
+    export_input_fn: A function that takes no arguments and returns an
+      `InputFnOps`.
+
+  Returns:
+    An `ExportStrategy`.
+  """
+  base_strategy = saved_model_export_utils.make_export_strategy(
+      serving_input_fn=export_input_fn)
+  input_fn = export_input_fn()
+  (sorted_feature_names, dense_floats, sparse_float_indices, _, _,
+   sparse_int_indices, _, _) = gbdt_batch.extract_features(
+       input_fn.features, feature_columns)
+
+  def export_fn(estimator, export_dir, checkpoint_path, eval_result=None):
+    """A wrapper to export to SavedModel, and convert it to other formats."""
+    result_dir = base_strategy.export(estimator, export_dir,
+                                      checkpoint_path,
+                                      eval_result)
+    with ops.Graph().as_default() as graph:
+      with tf_session.Session(graph=graph) as sess:
+        saved_model_loader.load(
+            sess, [tag_constants.SERVING], result_dir)
+        # Note: This is GTFlow internal API and might change.
+        ensemble_model = graph.get_operation_by_name(
+            "ensemble_model/TreeEnsembleSerialize")
+        _, dfec_str = sess.run(ensemble_model.outputs)
+        dtec = tree_config_pb2.DecisionTreeEnsembleConfig()
+        dtec.ParseFromString(dfec_str)
+        # Export the result in the same folder as the saved model.
+        convert_fn(dtec, sorted_feature_names, len(dense_floats),
+                   len(sparse_float_indices), len(sparse_int_indices),
+                   result_dir, eval_result)
+    return result_dir
+  return export_strategy.ExportStrategy(name, export_fn)
+
+
+def convert_to_universal_format(dtec, sorted_feature_names,
+                                num_dense, num_sparse_float,
+                                num_sparse_int):
+  """Convert GTFlow trees to universal format."""
+  del num_sparse_int  # unused.
+  model_and_features = generic_tree_model_pb2.ModelAndFeatures()
+  # TODO(jonasz): Feature descriptions should contain information about how each
+  # feature is processed before it's fed to the model (e.g. bucketing
+  # information). As of now, this serves as a list of features the model uses.
+  for feature_name in sorted_feature_names:
+    model_and_features.features[feature_name].SetInParent()
+  model = model_and_features.model
+  model.ensemble.summation_combination_technique.SetInParent()
+  for tree_idx in range(len(dtec.trees)):
+    gtflow_tree = dtec.trees[tree_idx]
+    tree_weight = dtec.tree_weights[tree_idx]
+    member = model.ensemble.members.add()
+    member.submodel_id.value = tree_idx
+    tree = member.submodel.decision_tree
+    for node_idx in range(len(gtflow_tree.nodes)):
+      gtflow_node = gtflow_tree.nodes[node_idx]
+      node = tree.nodes.add()
+      node_type = gtflow_node.WhichOneof("node")
+      node.node_id.value = node_idx
+      if node_type == "leaf":
+        leaf = gtflow_node.leaf
+        if leaf.HasField("vector"):
+          for weight in leaf.vector.value:
+            new_value = node.leaf.vector.value.add()
+            new_value.float_value = weight * tree_weight
+        else:
+          for index, weight in zip(
+              leaf.sparse_vector.index, leaf.sparse_vector.value):
+            new_value = node.leaf.sparse_vector.sparse_value[index]
+            new_value.float_value = weight * tree_weight
+      else:
+        node = node.binary_node
+        # Binary nodes here.
+        if node_type == "dense_float_binary_split":
+          split = gtflow_node.dense_float_binary_split
+          feature_id = split.feature_column
+          inequality_test = node.inequality_left_child_test
+          inequality_test.feature_id.id.value = sorted_feature_names[feature_id]
+          inequality_test.type = (
+              generic_tree_model_pb2.InequalityTest.LESS_OR_EQUAL)
+          inequality_test.threshold.float_value = split.threshold
+        elif node_type == "sparse_float_binary_split_default_left":
+          split = gtflow_node.sparse_float_binary_split_default_left.split
+          node.default_direction = (
+              generic_tree_model_pb2.BinaryNode.LEFT)
+          feature_id = split.feature_column + num_dense
+          inequality_test = node.inequality_left_child_test
+          inequality_test.feature_id.id.value = sorted_feature_names[feature_id]
+          inequality_test.type = (
+              generic_tree_model_pb2.InequalityTest.LESS_OR_EQUAL)
+          inequality_test.threshold.float_value = split.threshold
+        elif node_type == "sparse_float_binary_split_default_right":
+          split = gtflow_node.sparse_float_binary_split_default_right
+          node.default_direction = (
+              generic_tree_model_pb2.BinaryNode.RIGHT)
+          feature_id = split.feature_column + num_dense
+          inequality_test = node.inequality_left_child_test
+          inequality_test.feature_id.id.value = sorted_feature_names[feature_id]
+          inequality_test.type = (
+              generic_tree_model_pb2.InequalityTest.LESS_OR_EQUAL)
+          inequality_test.threshold.float_value = split.threshold
+        elif node_type == "categorical_id_binary_split":
+          split = gtflow_node.categorical_id_binary_split
+          node.default_direction = generic_tree_model_pb2.BinaryNode.RIGHT
+          feature_id = split.feature_column + num_dense + num_sparse_float
+          categorical_test = (
+              generic_tree_model_extensions_pb2.MatchingValuesTest())
+          categorical_test.feature_id.id.value = sorted_feature_names[
+              feature_id]
+          matching_id = categorical_test.value.add()
+          matching_id.int64_value = split.feature_id
+          node.custom_left_child_test.Pack(categorical_test)
+        else:
+          raise ValueError("Unexpected node type %s", node_type)
+        node.left_child_id.value = split.left_id
+        node.right_child_id.value = split.right_id
+  return model_and_features
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d801fa1f382fb1b3f53ac0a1214269837c7c0cc
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py
@@ -0,0 +1,278 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the conversion code from GTFlow format to Chauffeur."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from google.protobuf import text_format
+from tensorflow.contrib.boosted_trees.estimator_batch import custom_export_strategy
+from tensorflow.contrib.boosted_trees.proto import tree_config_pb2
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import googletest
+
+
+class ConvertModelTest(test_util.TensorFlowTestCase):
+
+  def testConvertModel(self):
+    dtec_str = """
+    trees {
+      nodes {
+        leaf {
+          vector {
+            value: -1
+          }
+        }
+      }
+    }
+    trees {
+      nodes {
+        dense_float_binary_split {
+          feature_column: 0
+          threshold: 1740.0
+          left_id: 1
+          right_id: 2
+        }
+        node_metadata {
+          gain: 500
+        }
+      }
+      nodes {
+        leaf {
+          vector {
+            value: 0.6
+          }
+        }
+      }
+      nodes {
+        sparse_float_binary_split_default_left {
+          split {
+            feature_column: 0
+            threshold: 1500.0
+            left_id: 3
+            right_id: 4
+          }
+        }
+        node_metadata {
+          gain: 500
+        }
+      }
+      nodes {
+        categorical_id_binary_split {
+          feature_column: 0
+          feature_id: 5
+          left_id: 5
+          right_id: 6
+        }
+        node_metadata {
+          gain: 500
+        }
+      }
+      nodes {
+        leaf {
+          vector {
+            value: 0.8
+          }
+        }
+      }
+      nodes {
+        leaf {
+          vector {
+            value: 0.5
+          }
+        }
+      }
+      nodes {
+        leaf {
+          vector {
+            value: 0.3
+          }
+        }
+      }
+    }
+    tree_weights: 1.0
+    tree_weights: 0.1
+    """
+    dtec = tree_config_pb2.DecisionTreeEnsembleConfig()
+    text_format.Merge(dtec_str, dtec)
+    # The feature columns in the order they were added.
+    feature_columns = ["feature_b", "feature_a", "feature_d"]
+    out = custom_export_strategy.convert_to_universal_format(
+        dtec, feature_columns, 1, 1,
+        1)
+    expected_tree = """
+    features { key: "feature_a" }
+    features { key: "feature_b" }
+    features { key: "feature_d" }
+    model {
+      ensemble {
+        summation_combination_technique {
+        }
+        members {
+          submodel {
+            decision_tree {
+              nodes {
+                node_id {
+                }
+                leaf {
+                  vector {
+                    value {
+                      float_value: -1.0
+                    }
+                  }
+                }
+              }
+            }
+          }
+          submodel_id {
+          }
+        }
+        members {
+          submodel {
+            decision_tree {
+              nodes {
+                node_id {
+                }
+                binary_node {
+                  left_child_id {
+                    value: 1
+                  }
+                  right_child_id {
+                    value: 2
+                  }
+                  inequality_left_child_test {
+                    feature_id {
+                      id {
+                        value: "feature_b"
+                      }
+                    }
+                    threshold {
+                      float_value: 1740.0
+                    }
+                  }
+                }
+              }
+
+              nodes {
+                node_id {
+                  value: 1
+                }
+                leaf {
+                  vector {
+                    value {
+                      float_value: 0.06
+                    }
+                  }
+                }
+              }
+              nodes {
+                node_id {
+                  value: 2
+                }
+                binary_node {
+                  left_child_id {
+                    value: 3
+                  }
+                  right_child_id {
+                    value: 4
+                  }
+                  inequality_left_child_test {
+                    feature_id {
+                      id {
+                        value: "feature_a"
+                      }
+                    }
+                    threshold {
+                      float_value: 1500.0
+                    }
+                  }
+                }
+              }
+              nodes {
+                node_id {
+                  value: 3
+                }
+                binary_node {
+                  left_child_id {
+                    value: 5
+                  }
+                  right_child_id {
+                    value: 6
+                  }
+                  default_direction: RIGHT
+                  custom_left_child_test {
+                    [type.googleapis.com/tensorflow.decision_trees.MatchingValuesTest] {
+                      feature_id {
+                        id {
+                          value: "feature_d"
+                        }
+                      }
+                      value {
+                        int64_value: 5
+                      }
+                    }
+                  }
+                }
+              }
+              nodes {
+                node_id {
+                  value: 4
+                }
+                leaf {
+                  vector {
+                    value {
+                      float_value: 0.08
+                    }
+                  }
+                }
+              }
+              nodes {
+                node_id {
+                  value: 5
+                }
+                leaf {
+                  vector {
+                    value {
+                      float_value: 0.05
+                    }
+                  }
+                }
+              }
+              nodes {
+                node_id {
+                  value: 6
+                }
+                leaf {
+                  vector {
+                    value {
+                      float_value: 0.03
+                    }
+                  }
+                }
+              }
+            }
+          }
+          submodel_id {
+            value: 1
+          }
+        }
+      }
+    }"""
+    self.assertProtoEquals(expected_tree, out)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_loss_head.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_loss_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f5c6040e572676b6c3e36a05f721941206fc542
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_loss_head.py
@@ -0,0 +1,73 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of `head.Head` with custom loss and link function."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+class CustomLossHead(head_lib._RegressionHead):  # pylint: disable=protected-access
+  """A Head object with custom loss function and link function."""
+
+  def __init__(self,
+               loss_fn,
+               link_fn,
+               logit_dimension,
+               head_name=None,
+               weight_column_name=None,
+               metrics_fn=None):
+    """`Head` for specifying arbitrary loss function.
+
+    Args:
+      loss_fn: Loss function.
+      link_fn: Function that converts logits to prediction.
+      logit_dimension: Number of dimensions for the logits.
+      head_name: name of the head. Predictions, summary, metrics keys are
+        suffixed by `"/" + head_name` and the default variable scope is
+        `head_name`.
+      weight_column_name: A string defining feature column name representing
+        weights. It is used to down weight or boost examples during training. It
+        will be multiplied by the loss of the example.
+      metrics_fn: a function that takes predictions dict, labels and weights and
+        returns a dictionary of metrics to be calculated.
+    """
+
+    def loss_wrapper(labels, logits, weight_tensor):
+      if weight_tensor is None:
+        weight_tensor = array_ops.ones(
+            shape=[array_ops.shape(labels)[0], 1], dtype=dtypes.float32)
+      weighted_loss, _ = loss_fn(labels, weight_tensor, logits)
+      average_loss = math_ops.reduce_mean(weighted_loss)
+      return average_loss, average_loss / math_ops.reduce_mean(weight_tensor)
+
+    super(CustomLossHead, self).__init__(
+        loss_fn=loss_wrapper,
+        link_fn=link_fn,
+        head_name=head_name,
+        weight_column_name=weight_column_name,
+        enable_centered_bias=False,
+        label_dimension=logit_dimension)
+
+    self._metrics_fn = metrics_fn
+
+  def _metrics(self, eval_loss, predictions, labels, weights):
+    if self._metrics_fn is not None:
+      return self._metrics_fn(predictions, labels, weights)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2e0b071b78b25e966b46e7135bbaeb0c4994371
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
@@ -0,0 +1,193 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""GTFlow Estimator definition."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.boosted_trees.estimator_batch import model
+from tensorflow.contrib.learn.python.learn.estimators import estimator
+from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
+
+
+class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
+  """An estimator using gradient boosted decision trees."""
+
+  def __init__(self,
+               learner_config,
+               examples_per_layer,
+               n_classes=2,
+               num_trees=None,
+               feature_columns=None,
+               weight_column_name=None,
+               model_dir=None,
+               config=None,
+               label_keys=None,
+               feature_engineering_fn=None,
+               center_bias=True):
+    """Initializes a GradientBoostedDecisionTreeClassifier estimator instance.
+
+    Args:
+      learner_config: A config for the learner.
+      examples_per_layer: Number of examples to accumulate before growing a
+        layer. It can also be a function that computes the number of examples
+        based on the depth of the layer that's being built.
+      n_classes: Number of classes in the classification.
+      num_trees: An int, number of trees to build.
+      feature_columns: A list of feature columns.
+      weight_column_name: Name of the column for weights, or None if not
+        weighted.
+      model_dir: Directory for model exports, etc.
+      config: `RunConfig` object to configure the runtime settings.
+      label_keys: Optional list of strings with size `[n_classes]` defining the
+        label vocabulary. Only supported for `n_classes` > 2.
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      center_bias: Whether a separate tree should be created for first fitting
+        the bias.
+    """
+    head = head_lib.multi_class_head(
+        n_classes=n_classes,
+        weight_column_name=weight_column_name,
+        enable_centered_bias=False)
+    super(GradientBoostedDecisionTreeClassifier, self).__init__(
+        model_fn=model.model_builder,
+        params={
+            'head': head,
+            'feature_columns': feature_columns,
+            'learner_config': learner_config,
+            'num_trees': num_trees,
+            'weight_column_name': weight_column_name,
+            'examples_per_layer': examples_per_layer,
+            'center_bias': center_bias
+        },
+        model_dir=model_dir,
+        config=config,
+        feature_engineering_fn=feature_engineering_fn)
+
+
+class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
+  """An estimator using gradient boosted decision trees."""
+
+  def __init__(self,
+               learner_config,
+               examples_per_layer,
+               label_dimension=1,
+               num_trees=None,
+               feature_columns=None,
+               label_name=None,
+               weight_column_name=None,
+               model_dir=None,
+               config=None,
+               feature_engineering_fn=None,
+               center_bias=True):
+    """Initializes a GradientBoostedDecisionTreeRegressor estimator instance.
+
+    Args:
+      learner_config: A config for the learner.
+      examples_per_layer: Number of examples to accumulate before growing a
+        layer. It can also be a function that computes the number of examples
+        based on the depth of the layer that's being built.
+      label_dimension: Number of regression labels per example. This is the size
+        of the last dimension of the labels `Tensor` (typically, this has shape
+        `[batch_size, label_dimension]`).
+      num_trees: An int, number of trees to build.
+      feature_columns: A list of feature columns.
+      label_name: String, name of the key in label dict. Can be null if label
+          is a tensor (single headed models).
+      weight_column_name: Name of the column for weights, or None if not
+        weighted.
+      model_dir: Directory for model exports, etc.
+      config: `RunConfig` object to configure the runtime settings.
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      center_bias: Whether a separate tree should be created for first fitting
+        the bias.
+    """
+    head = head_lib.regression_head(
+        label_name=label_name,
+        label_dimension=label_dimension,
+        weight_column_name=weight_column_name,
+        enable_centered_bias=False)
+    super(GradientBoostedDecisionTreeRegressor, self).__init__(
+        model_fn=model.model_builder,
+        params={
+            'head': head,
+            'feature_columns': feature_columns,
+            'learner_config': learner_config,
+            'num_trees': num_trees,
+            'weight_column_name': weight_column_name,
+            'examples_per_layer': examples_per_layer,
+            'center_bias': center_bias
+        },
+        model_dir=model_dir,
+        config=config,
+        feature_engineering_fn=feature_engineering_fn)
+
+
+class GradientBoostedDecisionTreeEstimator(estimator.Estimator):
+  """An estimator using gradient boosted decision trees.
+
+  Useful for training with user specified `Head`.
+  """
+
+  def __init__(self,
+               learner_config,
+               examples_per_layer,
+               head,
+               num_trees=None,
+               feature_columns=None,
+               weight_column_name=None,
+               model_dir=None,
+               config=None,
+               feature_engineering_fn=None,
+               center_bias=True):
+    """Initializes a GradientBoostedDecisionTreeEstimator estimator instance.
+
+    Args:
+      learner_config: A config for the learner.
+      examples_per_layer: Number of examples to accumulate before growing a
+        layer. It can also be a function that computes the number of examples
+        based on the depth of the layer that's being built.
+      head: `Head` instance.
+      num_trees: An int, number of trees to build.
+      feature_columns: A list of feature columns.
+      weight_column_name: Name of the column for weights, or None if not
+        weighted.
+      model_dir: Directory for model exports, etc.
+      config: `RunConfig` object to configure the runtime settings.
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      center_bias: Whether a separate tree should be created for first fitting
+        the bias.
+    """
+    super(GradientBoostedDecisionTreeEstimator, self).__init__(
+        model_fn=model.model_builder,
+        params={
+            'head': head,
+            'feature_columns': feature_columns,
+            'learner_config': learner_config,
+            'num_trees': num_trees,
+            'weight_column_name': weight_column_name,
+            'examples_per_layer': examples_per_layer,
+            'center_bias': center_bias
+        },
+        model_dir=model_dir,
+        config=config,
+        feature_engineering_fn=feature_engineering_fn)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/model.py b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e9fd64cbf0d507d510d78b9b01830ddd7cbf502
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
@@ -0,0 +1,117 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""GTFlow Model definitions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
+from tensorflow.contrib.boosted_trees.python.ops import model_ops
+from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import training_util
+
+
+def model_builder(features, labels, mode, params, config):
+  """Multi-machine batch gradient descent tree model.
+
+  Args:
+    features: `Tensor` or `dict` of `Tensor` objects.
+    labels: Labels used to train on.
+    mode: Mode we are in. (TRAIN/EVAL/INFER)
+    params: A dict of hyperparameters.
+      The following hyperparameters are expected:
+      * head: A `Head` instance.
+      * learner_config: A config for the learner.
+      * feature_columns: An iterable containing all the feature columns used by
+          the model.
+      * examples_per_layer: Number of examples to accumulate before growing a
+          layer. It can also be a function that computes the number of examples
+          based on the depth of the layer that's being built.
+      * weight_column_name: The name of weight column.
+      * center_bias: Whether a separate tree should be created for first fitting
+          the bias.
+    config: `RunConfig` of the estimator.
+
+  Returns:
+    A `ModelFnOps` object.
+  Raises:
+    ValueError: if inputs are not valid.
+  """
+  head = params["head"]
+  learner_config = params["learner_config"]
+  examples_per_layer = params["examples_per_layer"]
+  feature_columns = params["feature_columns"]
+  weight_column_name = params["weight_column_name"]
+  num_trees = params["num_trees"]
+  if features is None:
+    raise ValueError("At least one feature must be specified.")
+
+  if config is None:
+    raise ValueError("Missing estimator RunConfig.")
+
+  center_bias = params["center_bias"]
+
+  # Make a shallow copy of features to ensure downstream usage
+  # is unaffected by modifications in the model function.
+  training_features = copy.copy(features)
+  training_features.pop(weight_column_name, None)
+  global_step = training_util.get_global_step()
+  with ops.device(global_step.device):
+    ensemble_handle = model_ops.tree_ensemble_variable(
+        stamp_token=0,
+        tree_ensemble_config="",  # Initialize an empty ensemble.
+        name="ensemble_model")
+
+  # Create GBDT model.
+  gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
+      is_chief=config.is_chief,
+      num_ps_replicas=config.num_ps_replicas,
+      ensemble_handle=ensemble_handle,
+      center_bias=center_bias,
+      examples_per_layer=examples_per_layer,
+      learner_config=learner_config,
+      feature_columns=feature_columns,
+      features=features)
+  with ops.name_scope("gbdt", "gbdt_optimizer"):
+    predictions_dict = gbdt_model.predict(mode)
+    logits = predictions_dict["predictions"]
+
+    def _train_op_fn(loss):
+      """Returns the op to optimize the loss."""
+      update_op = gbdt_model.train(loss, predictions_dict, labels)
+      with ops.control_dependencies(
+          [update_op]), (ops.colocate_with(global_step)):
+        update_op = state_ops.assign_add(global_step, 1).op
+        return update_op
+
+  model_fn_ops = head.create_model_fn_ops(
+      features=features,
+      mode=mode,
+      labels=labels,
+      train_op_fn=_train_op_fn,
+      logits=logits)
+  if num_trees:
+    if center_bias:
+      num_trees += 1
+    finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor()
+    model_fn_ops.training_hooks.append(
+        trainer_hooks.StopAfterNTrees(num_trees, attempted_trees,
+                                      finalized_trees))
+  return model_fn_ops
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks.py b/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..222d066f7bf60a315790598e09a0e1038f7c05ac
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks.py
@@ -0,0 +1,177 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Hooks for use with GTFlow Estimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.contrib.learn.python.learn import session_run_hook
+from tensorflow.contrib.learn.python.learn.session_run_hook import SessionRunArgs
+from tensorflow.core.framework.summary_pb2 import Summary
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import training_util
+from tensorflow.python.training.summary_io import SummaryWriterCache
+
+
+class FeatureImportanceSummarySaver(session_run_hook.SessionRunHook):
+  """Hook to save feature importance summaries."""
+
+  def __init__(self, model_dir, every_n_steps=1):
+    """Create a FeatureImportanceSummarySaver Hook.
+
+    This hook creates scalar summaries representing feature importance
+    for each feature column during training.
+
+    Args:
+      model_dir: model base output directory.
+      every_n_steps: frequency, in number of steps, for logging summaries.
+
+    Raises:
+      ValueError: If one of the arguments is invalid.
+    """
+    if model_dir is None:
+      raise ValueError("model dir must be specified.")
+    self._model_dir = model_dir
+    self._every_n_steps = every_n_steps
+    self._last_triggered_step = None
+
+  def begin(self):
+    self._global_step_tensor = training_util.get_global_step()
+    if self._global_step_tensor is None:
+      raise RuntimeError(
+          "Global step should be created to use FeatureImportanceSummarySaver.")
+    graph = ops.get_default_graph()
+    self._feature_names_tensor = graph.get_tensor_by_name(
+        "gbdt/feature_names:0")
+    self._feature_usage_counts_tensor = graph.get_tensor_by_name(
+        "gbdt/feature_usage_counts:0")
+    self._feature_gains_tensor = graph.get_tensor_by_name(
+        "gbdt/feature_gains:0")
+
+  def before_run(self, run_context):
+    del run_context  # Unused by feature importance summary saver hook.
+    requests = {
+        "global_step": self._global_step_tensor,
+        "feature_names": self._feature_names_tensor,
+        "feature_usage_counts": self._feature_usage_counts_tensor,
+        "feature_gains": self._feature_gains_tensor
+    }
+    return SessionRunArgs(requests)
+
+  def after_run(self, run_context, run_values):
+    del run_context  # Unused by feature importance summary saver hook.
+
+    # Read result tensors.
+    global_step = run_values.results["global_step"]
+    feature_names = run_values.results["feature_names"]
+    feature_usage_counts = run_values.results["feature_usage_counts"]
+    feature_gains = run_values.results["feature_gains"]
+
+    # Ensure summaries are logged at desired frequency
+    if (self._last_triggered_step is not None and
+        global_step < self._last_triggered_step + self._every_n_steps):
+      return
+
+    # Validate tensors.
+    if (len(feature_names) != len(feature_usage_counts) or
+        len(feature_names) != len(feature_gains)):
+      raise RuntimeError(
+          "Feature names and importance measures have inconsistent lengths.")
+
+    # Compute total usage.
+    total_usage_count = 0.0
+    for usage_count in feature_usage_counts:
+      total_usage_count += usage_count
+    usage_count_norm = 1.0 / total_usage_count if total_usage_count else 1.0
+
+    # Compute total gain.
+    total_gain = 0.0
+    for gain in feature_gains:
+      total_gain += gain
+    gain_norm = 1.0 / total_gain if total_gain else 1.0
+
+    # Output summary for each feature.
+    self._last_triggered_step = global_step
+    for (name, usage_count, gain) in zip(feature_names, feature_usage_counts,
+                                         feature_gains):
+      output_dir = os.path.join(self._model_dir, name.decode("utf-8"))
+      summary_writer = SummaryWriterCache.get(output_dir)
+      usage_count_summary = Summary(value=[
+          Summary.Value(
+              tag="feature_importance/usage_counts",
+              simple_value=usage_count)
+      ])
+      usage_fraction_summary = Summary(value=[
+          Summary.Value(
+              tag="feature_importance/usage_fraction",
+              simple_value=usage_count * usage_count_norm)
+      ])
+      summary_writer.add_summary(usage_count_summary, global_step)
+      summary_writer.add_summary(usage_fraction_summary, global_step)
+      gains_summary = Summary(
+          value=[Summary.Value(
+              tag="feature_importance/gains",
+              simple_value=gain)])
+      gains_fraction_summary = Summary(
+          value=[Summary.Value(
+              tag="feature_importance/gains_fraction",
+              simple_value=gain * gain_norm)])
+      summary_writer.add_summary(gains_summary, global_step)
+      summary_writer.add_summary(gains_fraction_summary, global_step)
+
+
+class FeedFnHook(session_run_hook.SessionRunHook):
+  """Runs feed_fn and sets the feed_dict accordingly."""
+
+  def __init__(self, feed_fn):
+    self.feed_fn = feed_fn
+
+  def before_run(self, run_context):
+    del run_context  # unused by FeedFnHook.
+    return session_run_hook.SessionRunArgs(
+        fetches=None, feed_dict=self.feed_fn)
+
+
+class StopAfterNTrees(session_run_hook.SessionRunHook):
+  """Stop training after building N full trees."""
+
+  def __init__(self, n, num_attempted_trees_tensor, num_finalized_trees_tensor):
+    self._num_trees = n
+    # num_attempted_trees_tensor and num_finalized_trees_tensor are both
+    # tensors.
+    self._num_attempted_trees_tensor = num_attempted_trees_tensor
+    self._num_finalized_trees_tensor = num_finalized_trees_tensor
+
+  def before_run(self, run_context):
+    del run_context  # unused by StopTrainingAfterNTrees.
+    return session_run_hook.SessionRunArgs({
+        "num_attempted_trees": self._num_attempted_trees_tensor,
+        "num_finalized_trees": self._num_finalized_trees_tensor,
+    })
+
+  def after_run(self, run_context, run_values):
+    num_attempted_trees = run_values.results["num_attempted_trees"]
+    num_finalized_trees = run_values.results["num_finalized_trees"]
+    assert num_attempted_trees is not None
+    assert num_finalized_trees is not None
+    if (num_finalized_trees >= self._num_trees or
+        num_attempted_trees > self._num_trees):
+      logging.info("Requesting stop since we have reached %d trees.",
+                   num_finalized_trees)
+      run_context.request_stop()
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..47e0eb6d97b9b80e2f849cc9084af9cd578742fb
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks_test.py
@@ -0,0 +1,76 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for trainer hooks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tempfile
+
+from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
+from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.client import session as tf_session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables as tf_variables
+from tensorflow.python.platform import googletest
+from tensorflow.python.training import monitored_session
+
+
+class FeatureImportanceSummarySaverTest(test_util.TensorFlowTestCase):
+
+  def test_invalid_input(self):
+    with self.assertRaises(ValueError):
+      trainer_hooks.FeatureImportanceSummarySaver(model_dir=None)
+
+  def test_invalid_graph(self):
+    # Create inputs.
+    model_dir = tempfile.mkdtemp()
+    hook = trainer_hooks.FeatureImportanceSummarySaver(model_dir)
+    with ops.Graph().as_default():
+      # Begin won't be able to find the required tensors in the graph.
+      _ = variables.get_or_create_global_step()
+      with self.assertRaises(KeyError):
+        hook.begin()
+
+  def test_run(self):
+    # Create inputs.
+    model_dir = tempfile.mkdtemp()
+    hook = trainer_hooks.FeatureImportanceSummarySaver(model_dir)
+    with ops.Graph().as_default(), tf_session.Session() as sess:
+      global_step = variables.get_or_create_global_step()
+      with ops.name_scope("gbdt"):
+        constant_op.constant(["featA", "featB"], name="feature_names")
+        constant_op.constant([0, 2], name="feature_usage_counts")
+        constant_op.constant([0, 0.8], name="feature_gains")
+      # Begin finds tensors in the graph.
+      hook.begin()
+      sess.run(tf_variables.global_variables_initializer())
+      # Run hook in a monitored session.
+      train_op = state_ops.assign_add(global_step, 1)
+      mon_sess = monitored_session._HookedSession(sess, [hook])
+      mon_sess.run(train_op)
+      hook.end(sess)
+      # Ensure output summary dirs are created.
+      self.assertTrue(os.path.exists(os.path.join(model_dir, "featA")))
+      self.assertTrue(os.path.exists(os.path.join(model_dir, "featB")))
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/kernels/ensemble_optimizer_ops.cc b/tensorflow/contrib/boosted_trees/kernels/ensemble_optimizer_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5cde22901050eadb346d67d49968af925b596bac
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/kernels/ensemble_optimizer_ops.cc
@@ -0,0 +1,243 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h"
+#include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"
+#include "tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+using boosted_trees::models::DecisionTreeEnsembleResource;
+using boosted_trees::trees::DecisionTreeEnsembleConfig;
+using boosted_trees::utils::DropoutUtils;
+using errors::InvalidArgument;
+
+namespace {
+
+// Learning rate epsilon.
+const float kLearningRateEps = 1e-8;
+
+}  // namespace
+
+class AddTreesToEnsembleOp : public OpKernel {
+ public:
+  explicit AddTreesToEnsembleOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    // Ensure feature importance lhs inputs are references.
+    OP_REQUIRES(
+        context,
+        IsRefType(context->input_type(kFeatureColumnUsageCountsHandleIdx)),
+        errors::InvalidArgument(
+            "Feature usage counts lhs input needs to be a ref type"));
+    OP_REQUIRES(context,
+                IsRefType(context->input_type(kFeatureColumnGainsHandleIdx)),
+                errors::InvalidArgument(
+                    "Feature gains lhs input needs to be a ref type"));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    DecisionTreeEnsembleResource* decision_tree_ensemble_resource;
+    // Create a reference to the underlying resource using the handle.
+    OP_REQUIRES_OK(
+        context, LookupResource(
+                     context, HandleFromInput(context, kTreeEnsembleHandleIdx),
+                     &decision_tree_ensemble_resource));
+    // Lock the resource since we're mutating it.
+    mutex_lock l(*decision_tree_ensemble_resource->get_mutex());
+    // Remove the reference at the end of this scope.
+    core::ScopedUnref unref_me(decision_tree_ensemble_resource);
+
+    // Read feature importance info.
+    mutex_lock fc_usage_counts_mutex_lock(
+        *context->input_ref_mutex(kFeatureColumnUsageCountsHandleIdx));
+    mutex_lock fc_gains_mutex_lock(
+        *context->input_ref_mutex(kFeatureColumnGainsHandleIdx));
+    Tensor fc_usage_counts_lhs_t =
+        context->mutable_input(kFeatureColumnUsageCountsHandleIdx, true);
+    OP_REQUIRES(context,
+                TensorShapeUtils::IsVector(fc_usage_counts_lhs_t.shape()),
+                InvalidArgument("Feature usage counts should be a vector."));
+    OP_REQUIRES(context, fc_usage_counts_lhs_t.IsInitialized(),
+                errors::FailedPrecondition(
+                    "Attempting to use uninitialized variables: ",
+                    requested_input(kFeatureColumnUsageCountsHandleIdx)));
+
+    Tensor fc_gains_lhs_t =
+        context->mutable_input(kFeatureColumnGainsHandleIdx, true);
+    OP_REQUIRES(context, TensorShapeUtils::IsVector(fc_gains_lhs_t.shape()),
+                InvalidArgument("Feature gains should be a vector."));
+    OP_REQUIRES(context, fc_gains_lhs_t.IsInitialized(),
+                errors::FailedPrecondition(
+                    "Attempting to use uninitialized variables: ",
+                    requested_input(kFeatureColumnGainsHandleIdx)));
+
+    const Tensor fc_usage_counts_rhs_t =
+        context->input(kFeatureColumnUsageCountsToAddIdx);
+    OP_REQUIRES(
+        context,
+        fc_usage_counts_lhs_t.shape().IsSameSize(fc_usage_counts_rhs_t.shape()),
+        errors::InvalidArgument(
+            "Shapes of both feature usage counts tensors should match.",
+            " lhs shape= ", fc_usage_counts_lhs_t.shape().DebugString(),
+            " rhs shape= ", fc_usage_counts_rhs_t.shape().DebugString()));
+
+    const Tensor fc_gains_rhs_t = context->input(kFeatureColumnGainsToAddIdx);
+    OP_REQUIRES(context,
+                fc_gains_lhs_t.shape().IsSameSize(fc_gains_rhs_t.shape()),
+                errors::InvalidArgument(
+                    "Shapes of both feature gains tensors should match.",
+                    " lhs shape= ", fc_gains_lhs_t.shape().DebugString(),
+                    " rhs shape= ", fc_gains_rhs_t.shape().DebugString()));
+
+    // Read in info about trees that were dropped.
+    Tensor dropped_trees_info_t = context->input(kDropedTreesInfoTensorIdx);
+    OP_REQUIRES(context,
+                TensorShapeUtils::IsMatrix(dropped_trees_info_t.shape()),
+                InvalidArgument("Dropped trees info should be matrix."));
+
+    const auto& dropout_info = dropped_trees_info_t.matrix<float>();
+
+    // Parse the passed in tree ensemble.
+    Tensor tree_ensemble_config_t = context->input(kEnsembleToAddTensorIdx);
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(tree_ensemble_config_t.shape()),
+        errors::InvalidArgument("Tree ensemble config must be a scalar."));
+    // Arena increase spatial locality which reduces the average latency to
+    // access memory, as working set of pages will be fewer.
+    // arena has type proto2::Arena*.
+    auto* arena =
+        decision_tree_ensemble_resource->mutable_decision_tree_ensemble()
+            ->GetArena();
+    DecisionTreeEnsembleConfig* ensemble_to_add =
+        protobuf::Arena::CreateMessage<DecisionTreeEnsembleConfig>(arena);
+    OP_REQUIRES(
+        context, ParseProtoUnlimited(ensemble_to_add,
+                                     tree_ensemble_config_t.scalar<string>()()),
+        errors::InvalidArgument("Unable to parse tree ensemble config."));
+
+    auto* mutable_ensemble =
+        decision_tree_ensemble_resource->mutable_decision_tree_ensemble();
+
+    // Read the learning_rate
+    Tensor learning_rate_t = context->input(kLearningRateTensorIdx);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(learning_rate_t.shape()),
+                InvalidArgument("Learning rate should be a scalar."));
+
+    const float learning_rate = learning_rate_t.scalar<float>()();
+    if (learning_rate < kLearningRateEps) {
+      return;
+    }
+    // Prepare current weights vec.
+    std::vector<float> current_weights;
+    current_weights.reserve(mutable_ensemble->tree_weights_size());
+    for (const float weight : mutable_ensemble->tree_weights()) {
+      current_weights.push_back(weight);
+    }
+    const int32 num_dropped = dropped_trees_info_t.dim_size(1);
+    std::vector<int> dropped_trees;
+    dropped_trees.reserve(num_dropped);
+    std::vector<float> dropped_trees_original_weights;
+    dropped_trees_original_weights.reserve(num_dropped);
+    for (int i = 0; i < num_dropped; ++i) {
+      dropped_trees.push_back(dropout_info(0, i));
+      dropped_trees_original_weights.push_back(dropout_info(1, i));
+    }
+
+    std::vector<int32> num_updates;
+    num_updates.reserve(mutable_ensemble->tree_metadata_size());
+
+    for (const auto& meta : mutable_ensemble->tree_metadata()) {
+      num_updates.push_back(meta.num_tree_weight_updates());
+    }
+
+    // If there was a dropout, come up with tree weights
+    const bool was_dropout = !dropped_trees.empty();
+    if (was_dropout) {
+      // New tree/s will be added to the end of the ensemble's tree list.
+      const int32 new_tree_index = current_weights.size();
+      DropoutUtils::GetTreesWeightsForAddingTrees(
+          dropped_trees, dropped_trees_original_weights, new_tree_index,
+          ensemble_to_add->trees_size(), &current_weights, &num_updates);
+
+      // Update the weights of trees according to current weights;
+      for (int i = 0; i < mutable_ensemble->trees_size(); ++i) {
+        mutable_ensemble->set_tree_weights(i, current_weights[i]);
+      }
+    }
+
+    // Add the trees from ensemble_to_add to the tree ensemble variable.
+    int i = mutable_ensemble->trees_size();
+    for (auto& tree : *ensemble_to_add->mutable_trees()) {
+      (*mutable_ensemble->add_trees()).Swap(&tree);
+
+      // New trees were updated only once.
+      auto* meta = mutable_ensemble->add_tree_metadata();
+      meta->set_num_tree_weight_updates(1);
+
+      // When we add complete trees to the ensemble in one step, each tree
+      // that's added is final.
+      meta->set_is_finalized(true);
+
+      if (was_dropout) {
+        mutable_ensemble->add_tree_weights(current_weights[i++]);
+      } else {
+        mutable_ensemble->add_tree_weights(learning_rate);
+      }
+    }
+
+    // Update the number of updates.
+    if (was_dropout) {
+      for (int i = 0; i < num_updates.size(); ++i) {
+        mutable_ensemble->mutable_tree_metadata(i)->set_num_tree_weight_updates(
+            num_updates[i]);
+      }
+    }
+
+    // Update feature importance.
+    fc_usage_counts_lhs_t.vec<int64>() += fc_usage_counts_rhs_t.vec<int64>();
+    fc_gains_lhs_t.vec<float>() += learning_rate * fc_gains_rhs_t.vec<float>();
+  }
+
+ private:
+  // Input tensor indices.
+  // Note that Op definition changes might cause input indices to need
+  // changing as well.
+  static const int kTreeEnsembleHandleIdx = 0;
+  static const int kEnsembleToAddTensorIdx = 1;
+  static const int kFeatureColumnUsageCountsHandleIdx = 2;
+  static const int kFeatureColumnUsageCountsToAddIdx = 3;
+  static const int kFeatureColumnGainsHandleIdx = 4;
+  static const int kFeatureColumnGainsToAddIdx = 5;
+  static const int kDropedTreesInfoTensorIdx = 6;
+  static const int kLearningRateTensorIdx = 7;
+};
+
+REGISTER_KERNEL_BUILDER(Name("AddTreesToEnsemble").Device(DEVICE_CPU),
+                        AddTreesToEnsembleOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..42112c586a5f5e940d31e0810ae9589d79239641
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc
@@ -0,0 +1,168 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include <string>
+
+#include "tensorflow/contrib/boosted_trees/lib/utils/tensor_utils.h"
+#include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"
+#include "tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+
+using boosted_trees::models::DecisionTreeEnsembleResource;
+
+// Creates a tree ensemble variable.
+class CreateTreeEnsembleVariableOp : public OpKernel {
+ public:
+  explicit CreateTreeEnsembleVariableOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Get the stamp token.
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input("stamp_token", &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    // Get the tree ensemble config.
+    const Tensor* tree_ensemble_config_t;
+    OP_REQUIRES_OK(context, context->input("tree_ensemble_config",
+                                           &tree_ensemble_config_t));
+    auto* result = new boosted_trees::models::DecisionTreeEnsembleResource();
+    result->set_stamp(stamp_token);
+    if (!ParseProtoUnlimited(result->mutable_decision_tree_ensemble(),
+                             tree_ensemble_config_t->scalar<string>()())) {
+      result->Unref();
+      OP_REQUIRES(context, false, errors::InvalidArgument(
+                                      "Unable to parse tree ensemble config."));
+    }
+
+    // Only create one, if one does not exist already. Report status for all
+    // other exceptions.
+    auto status = CreateResource(context, HandleFromInput(context, 0), result);
+    if (!status.ok() && status.code() != tensorflow::error::ALREADY_EXISTS) {
+      OP_REQUIRES(context, false, status);
+    }
+  }
+};
+
+// Op for retrieving a model stamp token without having to serialize.
+class TreeEnsembleStampTokenOp : public OpKernel {
+ public:
+  explicit TreeEnsembleStampTokenOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    boosted_trees::models::DecisionTreeEnsembleResource*
+        decision_tree_ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_ensemble_resource));
+    mutex_lock l(*decision_tree_ensemble_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_ensemble_resource);
+    Tensor* output_stamp_token_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape(),
+                                                     &output_stamp_token_t));
+    output_stamp_token_t->scalar<int64>()() =
+        decision_tree_ensemble_resource->stamp();
+  }
+};
+
+// Op for serializing a model.
+class TreeEnsembleSerializeOp : public OpKernel {
+ public:
+  explicit TreeEnsembleSerializeOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    boosted_trees::models::DecisionTreeEnsembleResource*
+        decision_tree_ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_ensemble_resource));
+    mutex_lock l(*decision_tree_ensemble_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_ensemble_resource);
+    Tensor* output_stamp_token_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape(),
+                                                     &output_stamp_token_t));
+    output_stamp_token_t->scalar<int64>()() =
+        decision_tree_ensemble_resource->stamp();
+    Tensor* output_config_t = nullptr;
+    OP_REQUIRES_OK(
+        context, context->allocate_output(1, TensorShape(), &output_config_t));
+    output_config_t->scalar<string>()() =
+        decision_tree_ensemble_resource->decision_tree_ensemble()
+            .SerializeAsString();
+  }
+};
+
+// Op for deserializing a tree ensemble variable from a checkpoint.
+class TreeEnsembleDeserializeOp : public OpKernel {
+ public:
+  explicit TreeEnsembleDeserializeOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    boosted_trees::models::DecisionTreeEnsembleResource*
+        decision_tree_ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_ensemble_resource));
+    mutex_lock l(*decision_tree_ensemble_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_ensemble_resource);
+
+    // Get the stamp token.
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input("stamp_token", &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    // Get the tree ensemble config.
+    const Tensor* tree_ensemble_config_t;
+    OP_REQUIRES_OK(context, context->input("tree_ensemble_config",
+                                           &tree_ensemble_config_t));
+    // Deallocate all the previous objects on the resource.
+    decision_tree_ensemble_resource->Reset();
+    decision_tree_ensemble_resource->set_stamp(stamp_token);
+    boosted_trees::trees::DecisionTreeEnsembleConfig* config =
+        decision_tree_ensemble_resource->mutable_decision_tree_ensemble();
+    OP_REQUIRES(
+        context,
+        ParseProtoUnlimited(config, tree_ensemble_config_t->scalar<string>()()),
+        errors::InvalidArgument("Unable to parse tree ensemble config."));
+  }
+};
+
+REGISTER_RESOURCE_HANDLE_KERNEL(DecisionTreeEnsembleResource);
+
+REGISTER_KERNEL_BUILDER(
+    Name("TreeEnsembleIsInitializedOp").Device(DEVICE_CPU),
+    IsResourceInitialized<boosted_trees::models::DecisionTreeEnsembleResource>);
+
+REGISTER_KERNEL_BUILDER(Name("CreateTreeEnsembleVariable").Device(DEVICE_CPU),
+                        CreateTreeEnsembleVariableOp);
+
+REGISTER_KERNEL_BUILDER(Name("TreeEnsembleStampToken").Device(DEVICE_CPU),
+                        TreeEnsembleStampTokenOp);
+
+REGISTER_KERNEL_BUILDER(Name("TreeEnsembleSerialize").Device(DEVICE_CPU),
+                        TreeEnsembleSerializeOp);
+
+REGISTER_KERNEL_BUILDER(Name("TreeEnsembleDeserialize").Device(DEVICE_CPU),
+                        TreeEnsembleDeserializeOp);
+
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..daca0495481fdadebd239933c14e8b6ff08f4558
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc
@@ -0,0 +1,428 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.h"
+#include "tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h"
+#include "tensorflow/contrib/boosted_trees/lib/utils/batch_features.h"
+#include "tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h"
+#include "tensorflow/contrib/boosted_trees/lib/utils/tensor_utils.h"
+#include "tensorflow/contrib/boosted_trees/proto/learner.pb.h"
+#include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"
+#include "tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/types.h"
+
+using tensorflow::boosted_trees::learner::AveragingConfig;
+using tensorflow::boosted_trees::trees::DecisionTreeEnsembleConfig;
+
+namespace tensorflow {
+namespace boosted_trees {
+
+using boosted_trees::learner::LearnerConfig;
+using boosted_trees::learner::LearningRateConfig;
+using boosted_trees::learner::LearningRateDropoutDrivenConfig;
+using boosted_trees::models::MultipleAdditiveTrees;
+using boosted_trees::models::DecisionTreeEnsembleResource;
+using boosted_trees::utils::DropoutUtils;
+using boosted_trees::utils::TensorUtils;
+
+namespace {
+const char* kLearnerConfigAttributeName = "learner_config";
+const char* kSeedTensorName = "seed";
+const char* kApplyDropoutAttributeName = "apply_dropout";
+const char* kApplyAveragingAttributeName = "apply_averaging";
+const char* kDropoutInfoOutputTensorName = "drop_out_tree_indices_weights";
+const char* kPredictionsTensorName = "predictions";
+const char* kNoDropoutPredictionsTensorName = "no_dropout_predictions";
+}
+
+class GradientTreesPredictionOp : public OpKernel {
+ public:
+  explicit GradientTreesPredictionOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("use_locking", &use_locking_));
+
+    OP_REQUIRES_OK(context, context->GetAttr("center_bias", &center_bias_));
+
+    OP_REQUIRES_OK(
+        context, context->GetAttr(kApplyDropoutAttributeName, &apply_dropout_));
+
+    LearnerConfig learner_config;
+    string learner_config_str;
+    OP_REQUIRES_OK(context, context->GetAttr(kLearnerConfigAttributeName,
+                                             &learner_config_str));
+    OP_REQUIRES(
+        context, ParseProtoUnlimited(&learner_config, learner_config_str),
+        errors::InvalidArgument("Unable to parse learner config config."));
+
+    num_classes_ = learner_config.num_classes();
+    OP_REQUIRES(context, num_classes_ >= 2,
+                errors::InvalidArgument("Number of classes must be >=2"));
+    OP_REQUIRES(
+        context, ParseProtoUnlimited(&learner_config, learner_config_str),
+        errors::InvalidArgument("Unable to parse learner config config."));
+
+    bool reduce_dim;
+    OP_REQUIRES_OK(context, context->GetAttr("reduce_dim", &reduce_dim));
+    prediction_vector_size_ = reduce_dim ? num_classes_ - 1 : num_classes_;
+
+    only_finalized_trees_ =
+        learner_config.growing_mode() == learner_config.WHOLE_TREE;
+    if (learner_config.has_learning_rate_tuner() &&
+        learner_config.learning_rate_tuner().tuner_case() ==
+            LearningRateConfig::kDropout) {
+      dropout_config_ = learner_config.learning_rate_tuner().dropout();
+      has_dropout_ = true;
+    } else {
+      has_dropout_ = false;
+    }
+
+    OP_REQUIRES_OK(context, context->GetAttr(kApplyAveragingAttributeName,
+                                             &apply_averaging_));
+    apply_averaging_ =
+        apply_averaging_ && learner_config.averaging_config().config_case() !=
+                                AveragingConfig::CONFIG_NOT_SET;
+    if (apply_averaging_) {
+      averaging_config_ = learner_config.averaging_config();
+
+      // If there is averaging config, check that the values are correct.
+      switch (averaging_config_.config_case()) {
+        case AveragingConfig::kAverageLastNTreesFieldNumber: {
+          OP_REQUIRES(context, averaging_config_.average_last_n_trees() > 0,
+                      errors::InvalidArgument(
+                          "Average last n trees must be a positive number"));
+          break;
+        }
+        case AveragingConfig::kAverageLastPercentTreesFieldNumber: {
+          OP_REQUIRES(context,
+                      averaging_config_.average_last_percent_trees() > 0 &&
+                          averaging_config_.average_last_percent_trees() <= 1.0,
+                      errors::InvalidArgument(
+                          "Average last percent must be in (0,1] interval."));
+          break;
+        }
+        case AveragingConfig::CONFIG_NOT_SET: {
+          QCHECK(false) << "We should never get here.";
+          break;
+        }
+      }
+    }
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    DecisionTreeEnsembleResource* decision_tree_ensemble_resource;
+    // Gets the resource. Grabs the mutex but releases it.
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_ensemble_resource));
+    // Release the reference to the resource once we're done using it.
+    core::ScopedUnref unref_me(decision_tree_ensemble_resource);
+    if (use_locking_) {
+      mutex_lock l(*decision_tree_ensemble_resource->get_mutex());
+      DoCompute(context, decision_tree_ensemble_resource);
+    } else {
+      DoCompute(context, decision_tree_ensemble_resource);
+    }
+  }
+
+ private:
+  void DoCompute(
+      OpKernelContext* context,
+      DecisionTreeEnsembleResource* decision_tree_ensemble_resource) {
+    // Read dense float features list;
+    OpInputList dense_float_features_list;
+    OP_REQUIRES_OK(context, TensorUtils::ReadDenseFloatFeatures(
+                                context, &dense_float_features_list));
+
+    // Read sparse float features list;
+    OpInputList sparse_float_feature_indices_list;
+    OpInputList sparse_float_feature_values_list;
+    OpInputList sparse_float_feature_shapes_list;
+    OP_REQUIRES_OK(context, TensorUtils::ReadSparseFloatFeatures(
+                                context, &sparse_float_feature_indices_list,
+                                &sparse_float_feature_values_list,
+                                &sparse_float_feature_shapes_list));
+
+    // Read sparse int features list;
+    OpInputList sparse_int_feature_indices_list;
+    OpInputList sparse_int_feature_values_list;
+    OpInputList sparse_int_feature_shapes_list;
+    OP_REQUIRES_OK(context, TensorUtils::ReadSparseIntFeatures(
+                                context, &sparse_int_feature_indices_list,
+                                &sparse_int_feature_values_list,
+                                &sparse_int_feature_shapes_list));
+
+    // Infer batch size.
+    const int64 batch_size = TensorUtils::InferBatchSize(
+        dense_float_features_list, sparse_float_feature_shapes_list,
+        sparse_int_feature_shapes_list);
+
+    // Read batch features.
+    boosted_trees::utils::BatchFeatures batch_features(batch_size);
+    OP_REQUIRES_OK(
+        context,
+        batch_features.Initialize(
+            TensorUtils::OpInputListToTensorVec(dense_float_features_list),
+            TensorUtils::OpInputListToTensorVec(
+                sparse_float_feature_indices_list),
+            TensorUtils::OpInputListToTensorVec(
+                sparse_float_feature_values_list),
+            TensorUtils::OpInputListToTensorVec(
+                sparse_float_feature_shapes_list),
+            TensorUtils::OpInputListToTensorVec(
+                sparse_int_feature_indices_list),
+            TensorUtils::OpInputListToTensorVec(sparse_int_feature_values_list),
+            TensorUtils::OpInputListToTensorVec(
+                sparse_int_feature_shapes_list)));
+
+    std::vector<int32> dropped_trees;
+    std::vector<float> original_weights;
+
+    // Do dropout if needed.
+    if (apply_dropout_ && has_dropout_) {
+      // Read in seed
+      const Tensor* seed_t;
+      OP_REQUIRES_OK(context, context->input(kSeedTensorName, &seed_t));
+      OP_REQUIRES(context, TensorShapeUtils::IsScalar(seed_t->shape()),
+                  errors::InvalidArgument("Seed must be a scalar."));
+
+      // Cast seed to uint64.
+      const uint64 seed = seed_t->scalar<int64>()();
+
+      std::vector<float> weights;
+      for (const float weight :
+           decision_tree_ensemble_resource->decision_tree_ensemble()
+               .tree_weights()) {
+        weights.push_back(weight);
+      }
+
+      std::unordered_set<int32> trees_not_to_drop;
+      if (center_bias_) {
+        trees_not_to_drop.insert(0);
+      }
+      if (decision_tree_ensemble_resource->decision_tree_ensemble()
+              .has_growing_metadata()) {
+        // We are in batch mode, the last tree is the tree that is being built,
+        // we can't drop it during dropout.
+        const int32 current_tree =
+            decision_tree_ensemble_resource->decision_tree_ensemble()
+                .trees_size() -
+            1;
+        trees_not_to_drop.insert(current_tree);
+      }
+      OP_REQUIRES_OK(context, DropoutUtils::DropOutTrees(
+                                  seed, dropout_config_, trees_not_to_drop,
+                                  weights, &dropped_trees, &original_weights));
+    }
+
+    // Allocate output predictions matrix.
+    Tensor* output_predictions_t = nullptr;
+    OP_REQUIRES_OK(
+        context, context->allocate_output(kPredictionsTensorName,
+                                          {batch_size, prediction_vector_size_},
+                                          &output_predictions_t));
+    auto output_predictions = output_predictions_t->matrix<float>();
+
+    Tensor* output_no_dropout_predictions_t = nullptr;
+    OP_REQUIRES_OK(
+        context, context->allocate_output(kNoDropoutPredictionsTensorName,
+                                          {batch_size, prediction_vector_size_},
+                                          &output_no_dropout_predictions_t));
+    auto output_no_dropout_predictions =
+        output_no_dropout_predictions_t->matrix<float>();
+
+    // Run predictor.
+    thread::ThreadPool* const worker_threads =
+        context->device()->tensorflow_cpu_worker_threads()->workers;
+
+    if (apply_averaging_) {
+      DecisionTreeEnsembleConfig adjusted =
+          decision_tree_ensemble_resource->decision_tree_ensemble();
+
+      const int start_averaging = std::max(
+          0.0,
+          averaging_config_.config_case() ==
+                  AveragingConfig::kAverageLastNTreesFieldNumber
+              ? adjusted.trees_size() - averaging_config_.average_last_n_trees()
+              : adjusted.trees_size() *
+                    (1.0 - averaging_config_.average_last_percent_trees()));
+
+      const int num_ensembles = adjusted.trees_size() - start_averaging;
+      for (int i = start_averaging; i < adjusted.trees_size(); ++i) {
+        float weight = adjusted.tree_weights(i);
+        adjusted.mutable_tree_weights()->Set(
+            i, weight * (num_ensembles - i + start_averaging) / num_ensembles);
+      }
+      MultipleAdditiveTrees::Predict(
+          adjusted, only_finalized_trees_, dropped_trees, batch_features,
+          worker_threads, output_predictions, output_no_dropout_predictions);
+    } else {
+      MultipleAdditiveTrees::Predict(
+          decision_tree_ensemble_resource->decision_tree_ensemble(),
+          only_finalized_trees_, dropped_trees, batch_features, worker_threads,
+          output_predictions, output_no_dropout_predictions);
+    }
+
+    // Output dropped trees and original weights.
+    Tensor* output_dropout_info_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                kDropoutInfoOutputTensorName,
+                                {2, static_cast<int64>(dropped_trees.size())},
+                                &output_dropout_info_t));
+    auto output_dropout_info = output_dropout_info_t->matrix<float>();
+
+    for (int32 i = 0; i < dropped_trees.size(); ++i) {
+      output_dropout_info(0, i) = dropped_trees[i];
+      output_dropout_info(1, i) = original_weights[i];
+    }
+  }
+
+ private:
+  LearningRateDropoutDrivenConfig dropout_config_;
+  AveragingConfig averaging_config_;
+  bool only_finalized_trees_;
+  int num_classes_;
+  // What is the size of the output vector for predictions?
+  int prediction_vector_size_;
+  bool apply_dropout_;
+  bool center_bias_;
+  bool apply_averaging_;
+  bool use_locking_;
+  bool has_dropout_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("GradientTreesPrediction").Device(DEVICE_CPU),
+                        GradientTreesPredictionOp);
+
+class GradientTreesPartitionExamplesOp : public OpKernel {
+ public:
+  explicit GradientTreesPartitionExamplesOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("use_locking", &use_locking_));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    DecisionTreeEnsembleResource* decision_tree_ensemble_resource;
+    // Gets the resource. Grabs the mutex but releases it.
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_ensemble_resource));
+    // Release the reference to the resource once we're done using it.
+    core::ScopedUnref unref_me(decision_tree_ensemble_resource);
+    if (use_locking_) {
+      mutex_lock l(*decision_tree_ensemble_resource->get_mutex());
+      DoCompute(context, decision_tree_ensemble_resource);
+    } else {
+      DoCompute(context, decision_tree_ensemble_resource);
+    }
+  }
+
+ private:
+  void DoCompute(
+      OpKernelContext* context,
+      DecisionTreeEnsembleResource* decision_tree_ensemble_resource) {
+    // The last non-finalized tree in the ensemble is by convention the
+    // one to partition on. If no such tree exists, a nodeless tree is
+    // created.
+    const auto& tree_ensemble =
+        decision_tree_ensemble_resource->decision_tree_ensemble();
+    boosted_trees::trees::DecisionTreeConfig empy_tree_config;
+    const boosted_trees::trees::DecisionTreeConfig* tree_config =
+        &empy_tree_config;
+    auto num_trees = tree_ensemble.trees_size();
+    if (num_trees > 0 &&
+        !tree_ensemble.tree_metadata(num_trees - 1).is_finalized()) {
+      tree_config = &tree_ensemble.trees(num_trees - 1);
+    }
+
+    // Read dense float features list;
+    OpInputList dense_float_features_list;
+    OP_REQUIRES_OK(context, TensorUtils::ReadDenseFloatFeatures(
+                                context, &dense_float_features_list));
+
+    // Read sparse float features list;
+    OpInputList sparse_float_feature_indices_list;
+    OpInputList sparse_float_feature_values_list;
+    OpInputList sparse_float_feature_shapes_list;
+    OP_REQUIRES_OK(context, TensorUtils::ReadSparseFloatFeatures(
+                                context, &sparse_float_feature_indices_list,
+                                &sparse_float_feature_values_list,
+                                &sparse_float_feature_shapes_list));
+
+    // Read sparse int features list;
+    OpInputList sparse_int_feature_indices_list;
+    OpInputList sparse_int_feature_values_list;
+    OpInputList sparse_int_feature_shapes_list;
+    OP_REQUIRES_OK(context, TensorUtils::ReadSparseIntFeatures(
+                                context, &sparse_int_feature_indices_list,
+                                &sparse_int_feature_values_list,
+                                &sparse_int_feature_shapes_list));
+
+    // Infer batch size.
+    const int64 batch_size = TensorUtils::InferBatchSize(
+        dense_float_features_list, sparse_float_feature_shapes_list,
+        sparse_int_feature_shapes_list);
+
+    // Read batch features.
+    boosted_trees::utils::BatchFeatures batch_features(batch_size);
+    OP_REQUIRES_OK(
+        context,
+        batch_features.Initialize(
+            TensorUtils::OpInputListToTensorVec(dense_float_features_list),
+            TensorUtils::OpInputListToTensorVec(
+                sparse_float_feature_indices_list),
+            TensorUtils::OpInputListToTensorVec(
+                sparse_float_feature_values_list),
+            TensorUtils::OpInputListToTensorVec(
+                sparse_float_feature_shapes_list),
+            TensorUtils::OpInputListToTensorVec(
+                sparse_int_feature_indices_list),
+            TensorUtils::OpInputListToTensorVec(sparse_int_feature_values_list),
+            TensorUtils::OpInputListToTensorVec(
+                sparse_int_feature_shapes_list)));
+
+    // Allocate output partitions vector.
+    Tensor* partition_ids_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, {batch_size}, &partition_ids_t));
+    thread::ThreadPool* const worker_threads =
+        context->device()->tensorflow_cpu_worker_threads()->workers;
+    learner::ExamplePartitioner::PartitionExamples(
+        *tree_config, batch_features, worker_threads->NumThreads(),
+        worker_threads, partition_ids_t->vec<int32>().data());
+  }
+
+ private:
+  bool use_locking_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("GradientTreesPartitionExamples").Device(DEVICE_CPU),
+    GradientTreesPartitionExamplesOp);
+
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..df6bf22571e5b25c6ab77fcce12e0555e326c405
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
@@ -0,0 +1,877 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include <algorithm>
+#include <iterator>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h"
+#include "tensorflow/contrib/boosted_trees/lib/utils/parallel_for.h"
+#include "tensorflow/contrib/boosted_trees/lib/utils/tensor_utils.h"
+#include "tensorflow/contrib/boosted_trees/proto/quantiles.pb.h"
+#include "tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+using ::boosted_trees::QuantileConfig;
+using boosted_trees::utils::TensorUtils;
+using boosted_trees::QuantileStreamResource;
+
+namespace {
+const char* const kExampleWeightsName = "example_weights";
+const char* const kMaxElementsName = "max_elements";
+const char* const kHandleName = "handle";
+const char* const kNextStampTokenName = "next_stamp_token";
+const char* const kStampTokenName = "stamp_token";
+const char* const kAreBucketsReadyName = "are_buckets_ready";
+// Names for sparse arguments.
+const char* const kNumSparseFeaturesName = "num_sparse_features";
+const char* const kSparseBucketsName = "sparse_buckets";
+const char* const kSparseValuesName = "sparse_values";
+const char* const kSparseStreamsStateName = "sparse_streams_state";
+const char* const kSparseSummariesName = "sparse_summaries";
+const char* const kSparseConfigName = "sparse_config";
+const char* const kSparseOutputTensorName = "sparse_quantiles";
+// Names for dense arguments.
+const char* const kDenseBucketsName = "dense_buckets";
+const char* const kDenseConfigName = "dense_config";
+const char* const kDenseOutputTensorName = "dense_quantiles";
+const char* const kDenseStreamsStateName = "dense_streams_state";
+const char* const kDenseSummariesName = "dense_summaries";
+const char* const kDenseValuesName = "dense_values";
+const char* const kNumDenseFeaturesName = "num_dense_features";
+const char* const kResourceHandlesName = "quantile_accumulator_handles";
+const char* const kNumQuantilesName = "num_quantiles";
+const char* const kEpsilonName = "epsilon";
+const char* const kBucketsName = "buckets";
+const char* const kStreamStateName = "stream_state";
+const char* const kSummariesName = "summaries";
+
+using QuantileStream =
+    boosted_trees::quantiles::WeightedQuantilesStream<float, float>;
+using QuantileSummary =
+    boosted_trees::quantiles::WeightedQuantilesSummary<float, float>;
+using QuantileSummaryEntry =
+    boosted_trees::quantiles::WeightedQuantilesSummary<float,
+                                                       float>::SummaryEntry;
+
+std::vector<float> GetBuckets(const int32 feature,
+                              const OpInputList& buckets_list) {
+  const auto& buckets = buckets_list[feature].flat<float>();
+  std::vector<float> buckets_vector(buckets.data(),
+                                    buckets.data() + buckets.size());
+  return buckets_vector;
+}
+
+void QuantizeFeatures(const string& output_name, const OpInputList& values_list,
+                      const OpInputList& buckets_list,
+                      OpKernelContext* const context) {
+  if (values_list.size() == 0) {
+    return;
+  }
+  OpOutputList output_list;
+  OP_REQUIRES_OK(context, context->output_list(output_name, &output_list));
+
+  for (int32 feature_index = 0; feature_index < values_list.size();
+       ++feature_index) {
+    const Tensor& values_tensor = values_list[feature_index];
+    const int64 num_values = values_tensor.dim_size(0);
+
+    Tensor* output_t = nullptr;
+    OP_REQUIRES_OK(
+        context, output_list.allocate(feature_index, TensorShape({num_values}),
+                                      &output_t));
+    TTypes<int32>::Vec output = output_t->vec<int32>();
+    const std::vector<float>& buckets_vector =
+        GetBuckets(feature_index, buckets_list);
+    auto flat_values = values_tensor.flat<float>();
+    for (int64 instance = 0; instance < num_values; ++instance) {
+      const float value = flat_values(instance);
+      auto bucket_iter =
+          std::lower_bound(buckets_vector.begin(), buckets_vector.end(), value);
+      if (bucket_iter == buckets_vector.end()) {
+        --bucket_iter;
+      }
+      const int32 bucket =
+          static_cast<int32>(bucket_iter - buckets_vector.begin());
+      output(instance) = bucket;
+    }
+  }
+}
+
+// Validates attributes for the quantile ops.
+Status ReadAndValidateAttributes(OpKernelConstruction* const context,
+                                 int* num_dense_features,
+                                 int* num_sparse_features) {
+  TF_RETURN_IF_ERROR(
+      context->GetAttr(kNumDenseFeaturesName, num_dense_features));
+  TF_RETURN_IF_ERROR(
+      context->GetAttr(kNumSparseFeaturesName, num_sparse_features));
+  if ((*num_dense_features) + (*num_sparse_features) == 0) {
+    return errors::InvalidArgument(
+        "Please provide at least sparse or dense features.");
+  }
+  return Status::OK();
+}
+
+void ParseConfig(OpKernelConstruction* const context, const string& name,
+                 std::vector<QuantileConfig>* output) {
+  std::vector<string> serialized_config;
+  OP_REQUIRES_OK(context, context->GetAttr(name, &serialized_config));
+  output->reserve(serialized_config.size());
+  QuantileConfig tmp;
+  for (const auto& serialized_string : serialized_config) {
+    OP_REQUIRES(context, tmp.ParseFromString(serialized_string),
+                errors::InvalidArgument("Malformed QuantileConfig passed in."));
+    output->push_back(tmp);
+  }
+}
+
+// Generates quantiles on a finalized QuantileStream.
+std::vector<float> GenerateBoundaries(const QuantileStream& stream,
+                                      int num_boundaries) {
+  std::vector<float> boundaries = stream.GenerateBoundaries(num_boundaries);
+
+  // Uniquify elements as we may get dupes.
+  auto end_it = std::unique(boundaries.begin(), boundaries.end());
+  boundaries.resize(std::distance(boundaries.begin(), end_it));
+  return boundaries;
+}
+
+// Copies quantiles to output list.
+void CopyBoundaries(OpKernelContext* const context,
+                    const std::vector<float>& boundaries, const int64 index,
+                    OpOutputList* output_list) {
+  // Output to tensor.
+  Tensor* output_t = nullptr;
+  OP_REQUIRES_OK(
+      context, output_list->allocate(
+                   index, {static_cast<int64>(boundaries.size())}, &output_t));
+  auto* quantiles_flat = output_t->flat<float>().data();
+  memcpy(quantiles_flat, boundaries.data(), sizeof(float) * boundaries.size());
+}
+
+void CopySummaryToProto(const QuantileSummary& summary,
+                        ::boosted_trees::QuantileSummaryState* summary_proto) {
+  summary_proto->mutable_entries()->Reserve(summary.Size());
+  for (const auto& entry : summary.GetEntryList()) {
+    auto* new_entry = summary_proto->add_entries();
+    new_entry->set_value(entry.value);
+    new_entry->set_weight(entry.weight);
+    new_entry->set_min_rank(entry.min_rank);
+    new_entry->set_max_rank(entry.max_rank);
+  }
+}
+
+}  // namespace
+
+// Accumulator for Quantile Summaries.
+REGISTER_RESOURCE_HANDLE_KERNEL(QuantileStreamResource);
+
+REGISTER_KERNEL_BUILDER(
+    Name("QuantileAccumulatorIsInitialized").Device(DEVICE_CPU),
+    IsResourceInitialized<QuantileStreamResource>);
+
+class CreateQuantileAccumulatorOp : public OpKernel {
+ public:
+  explicit CreateQuantileAccumulatorOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr(kEpsilonName, &epsilon_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr(kNumQuantilesName, &num_quantiles_));
+    OP_REQUIRES_OK(context, context->GetAttr(kMaxElementsName, &max_elements_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // Only create one, if one does not exist already. Report status for all
+    // other exceptions. If one already exists, it unrefs the new one.
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input(kStampTokenName, &stamp_token_t));
+    auto result =
+        new QuantileStreamResource(epsilon_, num_quantiles_, max_elements_,
+                                   stamp_token_t->scalar<int64>()());
+    auto status = CreateResource(context, HandleFromInput(context, 0), result);
+    if (!status.ok() && status.code() != tensorflow::error::ALREADY_EXISTS) {
+      OP_REQUIRES(context, false, status);
+    }
+  }
+
+ private:
+  float epsilon_;
+  int32 num_quantiles_;
+  // An upperbound on the number of enteries that the summaries might have
+  // for a feature.
+  int64 max_elements_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("CreateQuantileAccumulator").Device(DEVICE_CPU),
+                        CreateQuantileAccumulatorOp);
+
+// Adds a summary to the quantile summary stream.
+class QuantileAccumulatorAddSummariesOp : public OpKernel {
+ public:
+  explicit QuantileAccumulatorAddSummariesOp(
+      OpKernelConstruction* const context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    OpInputList resource_handle_list;
+    OP_REQUIRES_OK(context, context->input_list(kResourceHandlesName,
+                                                &resource_handle_list));
+    OpInputList summary_list;
+    OP_REQUIRES_OK(context, context->input_list(kSummariesName, &summary_list));
+
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input(kStampTokenName, &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    thread::ThreadPool* const worker_threads =
+        context->device()->tensorflow_cpu_worker_threads()->workers;
+    boosted_trees::utils::ParallelFor(
+        resource_handle_list.size(), worker_threads->NumThreads(),
+        worker_threads,
+        [&context, &resource_handle_list, &summary_list, stamp_token](
+            int64 start, int64 end) {
+          for (int resource_handle_idx = start; resource_handle_idx < end;
+               ++resource_handle_idx) {
+            ResourceHandle handle = resource_handle_list[resource_handle_idx]
+                                        .flat<ResourceHandle>()(0);
+            QuantileStreamResource* streams_resource;
+            // Create a reference to the underlying resource using the handle.
+            OP_REQUIRES_OK(context,
+                           LookupResource(context, handle, &streams_resource));
+            // Remove the reference at the end of this scope.
+            mutex_lock l(*streams_resource->mutex());
+            core::ScopedUnref unref_me(streams_resource);
+
+            // If the stamp is invalid we drop the update.
+            if (!streams_resource->is_stamp_valid(stamp_token)) {
+              VLOG(1)
+                  << "Invalid stamp token in QuantileAccumulatorAddSummariesOp."
+                  << " Passed stamp token: " << stamp_token << " "
+                  << "Current token: " << streams_resource->stamp();
+              return;
+            }
+
+            protobuf::Arena arena;
+            ::boosted_trees::QuantileSummaryState* summary_proto =
+                protobuf::Arena::CreateMessage<
+                    ::boosted_trees::QuantileSummaryState>(&arena);
+            OP_REQUIRES(
+                context,
+                ParseProtoUnlimited(
+                    summary_proto,
+                    summary_list[resource_handle_idx].scalar<string>()()),
+                errors::InvalidArgument("Unable to parse quantile summary."));
+            std::vector<QuantileSummaryEntry> entries;
+            entries.reserve(summary_proto->entries_size());
+            for (const auto& entry : summary_proto->entries()) {
+              entries.emplace_back(entry.value(), entry.weight(),
+                                   entry.min_rank(), entry.max_rank());
+            }
+
+            // Add the summary to the quantile stream.
+            streams_resource->stream(stamp_token)->PushSummary(entries);
+          }
+        });
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("QuantileAccumulatorAddSummaries").Device(DEVICE_CPU),
+    QuantileAccumulatorAddSummariesOp);
+
+// Generates summaries for given set of float values, and the given config.
+class MakeQuantileSummariesOp : public OpKernel {
+ public:
+  explicit MakeQuantileSummariesOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context,
+                   ReadAndValidateAttributes(context, &num_dense_features_,
+                                             &num_sparse_features_));
+    OP_REQUIRES_OK(context, context->GetAttr(kEpsilonName, &epsilon_));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    // Read dense float features list;
+    OpInputList dense_float_features_list;
+    OP_REQUIRES_OK(context, TensorUtils::ReadDenseFloatFeatures(
+                                context, &dense_float_features_list));
+
+    // Read sparse float features list;
+    OpInputList sparse_float_feature_indices_list;
+    OpInputList sparse_float_feature_values_list;
+    OpInputList sparse_float_feature_shapes_list;
+    OP_REQUIRES_OK(context, TensorUtils::ReadSparseFloatFeatures(
+                                context, &sparse_float_feature_indices_list,
+                                &sparse_float_feature_values_list,
+                                &sparse_float_feature_shapes_list));
+
+    // Parse example weights and get batch size.
+    const Tensor* example_weights_t;
+    OP_REQUIRES_OK(context,
+                   context->input(kExampleWeightsName, &example_weights_t));
+    auto example_weights = example_weights_t->flat<float>();
+    const int64 batch_size = example_weights.size();
+
+    OpOutputList sparse_summaries_output_list;
+    OP_REQUIRES_OK(context,
+                   context->output_list(kSparseSummariesName,
+                                        &sparse_summaries_output_list));
+    OpOutputList dense_summaries_output_list;
+    OP_REQUIRES_OK(context, context->output_list(kDenseSummariesName,
+                                                 &dense_summaries_output_list));
+
+    auto do_quantile_summary_gen = [&](const int64 begin, const int64 end) {
+      auto copy_over_summaries = [&](const QuantileStream& stream,
+                                     const int64 index,
+                                     OpOutputList* output_list) {
+        protobuf::Arena arena;
+        ::boosted_trees::QuantileSummaryState* summary_proto =
+            protobuf::Arena::CreateMessage<
+            ::boosted_trees::QuantileSummaryState>(&arena);
+        const auto& summary = stream.GetFinalSummary();
+        CopySummaryToProto(summary, summary_proto);
+        // Output to tensor.
+        Tensor* output_t = nullptr;
+        OP_REQUIRES_OK(context, output_list->allocate(index, {}, &output_t));
+        summary_proto->SerializeToString(&output_t->scalar<string>()());
+      };
+
+      // These are blocks of ranges. We are iterating over both sparse and
+      // dense features i.e. [0, sparse_features.size() + dense_features.size()]
+      for (int64 i = begin; i < end; ++i) {
+        if (i < num_dense_features_) {
+          const int64 dense_index = i;
+          const auto dense_values =
+              dense_float_features_list[dense_index].flat<float>();
+          QuantileStream stream(epsilon_, batch_size + 1);
+          // Run quantile summary generation.
+          for (int64 j = 0; j < batch_size; ++j) {
+            stream.PushEntry(dense_values(j), example_weights(j));
+          }
+          stream.Finalize();
+          // Copy summaries to output.
+          copy_over_summaries(stream, dense_index,
+                              &dense_summaries_output_list);
+        } else {
+          const int64 sparse_index = i - num_dense_features_;
+          const auto sparse_values =
+              sparse_float_feature_values_list[sparse_index].flat<float>();
+          const auto sparse_indices =
+              sparse_float_feature_indices_list[sparse_index].matrix<int64>();
+          const auto dense_shape =
+              sparse_float_feature_shapes_list[sparse_index].flat<int64>();
+          OP_REQUIRES(context, batch_size == dense_shape(0),
+                      errors::InvalidArgument(
+                          "Sparse column shape doesn't match the batch size."));
+          QuantileStream stream(epsilon_, batch_size + 1);
+          // Run quantile summary generation.
+          const int64 num_sparse_rows =
+              sparse_float_feature_indices_list[sparse_index].dim_size(0);
+          for (int64 j = 0; j < num_sparse_rows; ++j) {
+            const int64 example_id = sparse_indices(j, 0);
+            stream.PushEntry(sparse_values(j), example_weights(example_id));
+          }
+          stream.Finalize();
+          // Copy summaries to output.
+          copy_over_summaries(stream, sparse_index,
+                              &sparse_summaries_output_list);
+        }
+      }
+    };
+    const int64 kCostPerUnit = 500 * batch_size;
+    const int64 num_features = num_sparse_features_ + num_dense_features_;
+    const DeviceBase::CpuWorkerThreads& worker_threads =
+        *context->device()->tensorflow_cpu_worker_threads();
+    Shard(worker_threads.num_threads, worker_threads.workers, num_features,
+          kCostPerUnit, do_quantile_summary_gen);
+  }
+
+ private:
+  int num_dense_features_;
+  int num_sparse_features_;
+  float epsilon_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("MakeQuantileSummaries").Device(DEVICE_CPU),
+                        MakeQuantileSummariesOp);
+
+// Serializes the state of streams.
+class QuantileAccumulatorSerializeOp : public OpKernel {
+ public:
+  explicit QuantileAccumulatorSerializeOp(OpKernelConstruction* const context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    QuantileStreamResource* streams_resource;
+    // Create a reference to the underlying resource using the handle.
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &streams_resource));
+    // Remove the reference at the end of this scope.
+    mutex_lock l(*streams_resource->mutex());
+    core::ScopedUnref unref_me(streams_resource);
+
+    int64 stamp_token = streams_resource->stamp();
+    Tensor* stream_state_t;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(kStreamStateName, TensorShape({}),
+                                            &stream_state_t));
+    bool are_buckets_ready = streams_resource->are_buckets_ready();
+
+    // We are iterating over both dense and sparse features. First we go
+    // through the dense features and then the sparse features.
+    const QuantileStream& stream = *streams_resource->stream(stamp_token);
+    const std::vector<float>& boundaries =
+        are_buckets_ready ? streams_resource->boundaries(stamp_token)
+                          : std::vector<float>();
+    protobuf::Arena arena;
+    ::boosted_trees::QuantileStreamState* stream_proto =
+        protobuf::Arena::CreateMessage<::boosted_trees::QuantileStreamState>(
+            &arena);
+    for (const auto& summary : stream.SerializeInternalSummaries()) {
+      CopySummaryToProto(summary, stream_proto->add_summaries());
+    }
+    stream_proto->SerializeToString(&stream_state_t->scalar<string>()());
+    Tensor* buckets_t = nullptr;
+    OP_REQUIRES_OK(
+        context,
+        context->allocate_output(
+            kBucketsName, {static_cast<int64>(boundaries.size())}, &buckets_t));
+    auto* quantiles_flat = buckets_t->flat<float>().data();
+    memcpy(quantiles_flat, boundaries.data(),
+           sizeof(float) * boundaries.size());
+    Tensor* stamp_token_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(kStampTokenName, TensorShape({}),
+                                            &stamp_token_t));
+    stamp_token_t->scalar<int64>()() = stamp_token;
+    Tensor* are_buckets_ready_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(kAreBucketsReadyName, {},
+                                                     &are_buckets_ready_t));
+    are_buckets_ready_t->scalar<bool>()() = are_buckets_ready;
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("QuantileAccumulatorSerialize").Device(DEVICE_CPU),
+                        QuantileAccumulatorSerializeOp);
+
+// Serializes the state of streams.
+class QuantileAccumulatorDeserializeOp : public OpKernel {
+ public:
+  explicit QuantileAccumulatorDeserializeOp(OpKernelConstruction* const context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    QuantileStreamResource* streams_resource;
+    // Create a reference to the underlying resource using the handle.
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &streams_resource));
+    // Remove the reference at the end of this scope.
+    mutex_lock l(*streams_resource->mutex());
+    core::ScopedUnref unref_me(streams_resource);
+
+    int64 old_stamp_token = streams_resource->stamp();
+
+    const Tensor* stream_state_t;
+    OP_REQUIRES_OK(context, context->input(kStreamStateName, &stream_state_t));
+    const Tensor* buckets_t;
+    OP_REQUIRES_OK(context, context->input(kBucketsName, &buckets_t));
+
+    QuantileStream* stream = streams_resource->stream(old_stamp_token);
+    ::boosted_trees::QuantileStreamState state_proto;
+    OP_REQUIRES(
+        context,
+        ParseProtoUnlimited(&state_proto, stream_state_t->scalar<string>()()),
+        errors::InvalidArgument("Unabnle to parse quantile stream state."));
+    std::vector<QuantileSummary> summaries;
+    summaries.reserve(state_proto.summaries_size());
+    std::vector<QuantileSummaryEntry> entries;
+    for (const auto& summary : state_proto.summaries()) {
+      entries.clear();
+      entries.reserve(summary.entries_size());
+      for (const auto& entry : summary.entries()) {
+        entries.emplace_back(entry.value(), entry.weight(), entry.min_rank(),
+                             entry.max_rank());
+      }
+      summaries.emplace_back();
+      summaries[summaries.size() - 1].BuildFromSummaryEntries(entries);
+    }
+    stream->DeserializeInternalSummaries(summaries);
+
+    const auto& buckets = buckets_t->vec<float>();
+    std::vector<float> result;
+    result.reserve(buckets.size());
+
+    for (size_t i = 0; i < buckets.size(); ++i) {
+      result.push_back(buckets(i));
+    }
+    streams_resource->set_boundaries(old_stamp_token, result);
+
+    // Reset the stamp token.
+    const Tensor* stamp_token_t = nullptr;
+    OP_REQUIRES_OK(context, context->input(kStampTokenName, &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+    streams_resource->set_stamp(stamp_token);
+
+    const Tensor* are_buckets_ready_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->input(kAreBucketsReadyName, &are_buckets_ready_t));
+    streams_resource->set_buckets_ready(are_buckets_ready_t->scalar<bool>()());
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("QuantileAccumulatorDeserialize").Device(DEVICE_CPU),
+    QuantileAccumulatorDeserializeOp);
+
+// Flushes the quantile summary stream resource.
+class QuantileAccumulatorFlushOp : public OpKernel {
+ public:
+  explicit QuantileAccumulatorFlushOp(OpKernelConstruction* const context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    QuantileStreamResource* streams_resource;
+    // Create a reference to the underlying resource using the handle.
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &streams_resource));
+    // Remove the reference at the end of this scope.
+    mutex_lock l(*streams_resource->mutex());
+    core::ScopedUnref unref_me(streams_resource);
+
+    const Tensor* next_stamp_token_t;
+    OP_REQUIRES_OK(context,
+                   context->input(kNextStampTokenName, &next_stamp_token_t));
+    int64 next_stamp_token = next_stamp_token_t->scalar<int64>()();
+
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input(kStampTokenName, &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+    CHECK(streams_resource->is_stamp_valid(stamp_token))
+        << "Invalid stamp token in QuantileAccumulatorFlushOp. "
+        << "Passed stamp token: " << stamp_token << " "
+        << "Current token: " << streams_resource->stamp();
+    QuantileStream* stream = streams_resource->stream(stamp_token);
+    stream->Finalize();
+    streams_resource->set_boundaries(
+        stamp_token,
+        GenerateBoundaries(*stream, streams_resource->num_quantiles()));
+    streams_resource->Reset(next_stamp_token);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("QuantileAccumulatorFlush").Device(DEVICE_CPU),
+                        QuantileAccumulatorFlushOp);
+
+// Flushes the quantile summary stream resource. This version computes the
+// summary.
+class QuantileAccumulatorFlushSummaryOp : public OpKernel {
+ public:
+  explicit QuantileAccumulatorFlushSummaryOp(
+      OpKernelConstruction* const context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    QuantileStreamResource* streams_resource;
+    // Create a reference to the underlying resource using the handle.
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &streams_resource));
+    // Remove the reference at the end of this scope.
+    mutex_lock l(*streams_resource->mutex());
+    core::ScopedUnref unref_me(streams_resource);
+
+    const Tensor* next_stamp_token_t;
+    OP_REQUIRES_OK(context,
+                   context->input(kNextStampTokenName, &next_stamp_token_t));
+    int64 next_stamp_token = next_stamp_token_t->scalar<int64>()();
+
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input(kStampTokenName, &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+    CHECK(streams_resource->is_stamp_valid(stamp_token))
+        << "Invalid stamp token in QuantileAccumulatorFlushSummaryOp. "
+        << "Passed stamp token: " << stamp_token << " "
+        << "Current token: " << streams_resource->stamp();
+    QuantileStream* stream = streams_resource->stream(stamp_token);
+    stream->Finalize();
+    protobuf::Arena arena;
+    ::boosted_trees::QuantileSummaryState* summary_proto =
+        protobuf::Arena::CreateMessage<::boosted_trees::QuantileSummaryState>(
+            &arena);
+    const auto& summary = stream->GetFinalSummary();
+    CopySummaryToProto(summary, summary_proto);
+    // Output to tensor.
+    Tensor* output_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape({}), &output_t));
+    summary_proto->SerializeToString(&output_t->scalar<string>()());
+    streams_resource->Reset(next_stamp_token);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("QuantileAccumulatorFlushSummary").Device(DEVICE_CPU),
+    QuantileAccumulatorFlushSummaryOp);
+
+// Get bucket boundaries from summaries.
+class QuantileAccumulatorGetBucketsOp : public OpKernel {
+ public:
+  explicit QuantileAccumulatorGetBucketsOp(OpKernelConstruction* const context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* const context) override {
+    OpInputList resource_handle_list;
+    OP_REQUIRES_OK(context, context->input_list(kResourceHandlesName,
+                                                &resource_handle_list));
+    OpOutputList are_buckets_ready_list;
+    OP_REQUIRES_OK(context, context->output_list(kAreBucketsReadyName,
+                                                 &are_buckets_ready_list));
+    OpOutputList buckets_list;
+    OP_REQUIRES_OK(context, context->output_list(kBucketsName, &buckets_list));
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input(kStampTokenName, &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    thread::ThreadPool* const worker_threads =
+        context->device()->tensorflow_cpu_worker_threads()->workers;
+    boosted_trees::utils::ParallelFor(
+        resource_handle_list.size(), worker_threads->NumThreads(),
+        worker_threads,
+        [&context, &resource_handle_list, &are_buckets_ready_list,
+         &buckets_list, stamp_token](int64 start, int64 end) {
+          for (int resource_handle_idx = start; resource_handle_idx < end;
+               ++resource_handle_idx) {
+            ResourceHandle handle = resource_handle_list[resource_handle_idx]
+                                        .flat<ResourceHandle>()(0);
+            QuantileStreamResource* streams_resource;
+            OP_REQUIRES_OK(context,
+                           LookupResource(context, handle, &streams_resource));
+            // Remove the reference at the end of this scope.
+            mutex_lock l(*streams_resource->mutex());
+            core::ScopedUnref unref_me(streams_resource);
+
+            bool are_buckets_ready =
+                streams_resource->is_stamp_valid(stamp_token) &&
+                streams_resource->are_buckets_ready();
+
+            Tensor* are_buckets_ready_t = nullptr;
+            OP_REQUIRES_OK(context,
+                           are_buckets_ready_list.allocate(
+                               resource_handle_idx, {}, &are_buckets_ready_t));
+            are_buckets_ready_t->scalar<bool>()() = are_buckets_ready;
+
+            const std::vector<float>& boundaries =
+                are_buckets_ready ? streams_resource->boundaries(stamp_token)
+                                  : std::vector<float>();
+            Tensor* output_t = nullptr;
+            OP_REQUIRES_OK(context, buckets_list.allocate(
+                                        resource_handle_idx,
+                                        {static_cast<int64>(boundaries.size())},
+                                        &output_t));
+            auto* quantiles_flat = output_t->flat<float>().data();
+            memcpy(quantiles_flat, boundaries.data(),
+                   sizeof(float) * boundaries.size());
+          }
+        });
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("QuantileAccumulatorGetBuckets").Device(DEVICE_CPU),
+    QuantileAccumulatorGetBucketsOp);
+
+// Generates buckets for given set of float values, and the given config.
+class QuantileBucketsOp : public OpKernel {
+ public:
+  explicit QuantileBucketsOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context,
+                   ReadAndValidateAttributes(context, &num_dense_features_,
+                                             &num_sparse_features_));
+
+    ParseConfig(context, kDenseConfigName, &dense_configs_);
+    OP_REQUIRES(context, dense_configs_.size() == num_dense_features_,
+                errors::InvalidArgument(
+                    "Mismatch in number of dense quantile configs."));
+    ParseConfig(context, kSparseConfigName, &sparse_configs_);
+    OP_REQUIRES(context, sparse_configs_.size() == num_sparse_features_,
+                errors::InvalidArgument(
+                    "Mismatch in number of sparse quantile configs."));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    // Read dense float features list;
+    OpInputList dense_float_features_list;
+    OP_REQUIRES_OK(context, TensorUtils::ReadDenseFloatFeatures(
+                                context, &dense_float_features_list));
+
+    // Read sparse float features list;
+    OpInputList sparse_float_feature_indices_list;
+    OpInputList sparse_float_feature_values_list;
+    OpInputList sparse_float_feature_shapes_list;
+    OP_REQUIRES_OK(context, TensorUtils::ReadSparseFloatFeatures(
+                                context, &sparse_float_feature_indices_list,
+                                &sparse_float_feature_values_list,
+                                &sparse_float_feature_shapes_list));
+
+    // Parse example weights and get batch size.
+    const Tensor* example_weights_t;
+    OP_REQUIRES_OK(context,
+                   context->input(kExampleWeightsName, &example_weights_t));
+    auto example_weights = example_weights_t->flat<float>();
+    const int64 batch_size = example_weights.size();
+
+    OpOutputList sparse_buckets_output_list;
+    OP_REQUIRES_OK(context, context->output_list(kSparseBucketsName,
+                                                 &sparse_buckets_output_list));
+    OpOutputList dense_buckets_output_list;
+    OP_REQUIRES_OK(context, context->output_list(kDenseBucketsName,
+                                                 &dense_buckets_output_list));
+
+    auto do_quantile_bucket_gen = [&](const int64 begin, const int64 end) {
+      // These are blocks of ranges. We are iterating over both sparse and
+      // dense features i.e. [0, sparse_features.size() + dense_features.size()]
+      for (int64 i = begin; i < end; ++i) {
+        if (i < sparse_configs_.size()) {
+          const int64 sparse_index = i;
+          const auto sparse_values =
+              sparse_float_feature_values_list[sparse_index].flat<float>();
+          const auto sparse_indices =
+              sparse_float_feature_indices_list[sparse_index].matrix<int64>();
+          QuantileStream stream(sparse_configs_[sparse_index].eps(),
+                                batch_size);
+          // Run quantile summary generation.
+          const int64 num_sparse_rows =
+              sparse_float_feature_indices_list[sparse_index].dim_size(0);
+          for (int64 j = 0; j < num_sparse_rows; ++j) {
+            const int64 example_id = sparse_indices(j, 0);
+            stream.PushEntry(sparse_values(j), example_weights(example_id));
+          }
+          stream.Finalize();
+          // Create buckets.
+          const auto boundaries = GenerateBoundaries(
+              stream, sparse_configs_[sparse_index].num_quantiles());
+          CopyBoundaries(context, boundaries, sparse_index,
+                         &sparse_buckets_output_list);
+
+        } else {
+          const int64 dense_index = i - sparse_configs_.size();
+          const auto dense_values =
+              dense_float_features_list[dense_index].flat<float>();
+          QuantileStream stream(dense_configs_[dense_index].eps(), batch_size);
+          // Run quantile summary generation.
+          for (int64 j = 0; j < batch_size; ++j) {
+            stream.PushEntry(dense_values(j), example_weights(j));
+          }
+          stream.Finalize();
+          // Create buckets.
+          const auto boundaries = GenerateBoundaries(
+              stream, dense_configs_[dense_index].num_quantiles());
+          CopyBoundaries(context, boundaries, dense_index,
+                         &dense_buckets_output_list);
+        }
+      }
+    };
+
+    const int64 kCostPerUnit = 500 * batch_size;
+    const int64 num_features = sparse_configs_.size() + dense_configs_.size();
+    const DeviceBase::CpuWorkerThreads& worker_threads =
+        *context->device()->tensorflow_cpu_worker_threads();
+    Shard(worker_threads.num_threads, worker_threads.workers, num_features,
+          kCostPerUnit, do_quantile_bucket_gen);
+  }
+
+ private:
+  int num_dense_features_;
+  int num_sparse_features_;
+  std::vector<QuantileConfig> dense_configs_;
+  std::vector<QuantileConfig> sparse_configs_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("QuantileBuckets").Device(DEVICE_CPU),
+                        QuantileBucketsOp);
+
+// Given the calculated quantiles thresholds and input data, this operation
+// converts the input features into the buckets (categorical values), depending
+// on which quantile they fall into.
+class QuantilesOp : public OpKernel {
+ public:
+  explicit QuantilesOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    int num_dense_features;
+    int num_sparse_features;
+    OP_REQUIRES_OK(context,
+                   ReadAndValidateAttributes(context, &num_dense_features,
+                                             &num_sparse_features));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    // Dense features inputs
+    OpInputList dense_float_features_list;
+    OP_REQUIRES_OK(context, context->input_list(kDenseValuesName,
+                                                &dense_float_features_list));
+    OpInputList dense_buckets_list;
+    OP_REQUIRES_OK(context,
+                   context->input_list(kDenseBucketsName, &dense_buckets_list));
+
+    if (dense_buckets_list.size() > 0) {
+      // Check the first tensor to make sure it is the right shape
+      OP_REQUIRES(
+          context,
+          tensorflow::TensorShapeUtils::IsVector(dense_buckets_list[0].shape()),
+          errors::InvalidArgument(
+              strings::Printf("Dense buckets should be flat vectors")));
+    }
+
+    // Sparse features inputs
+    OpInputList sparse_float_feature_values_list;
+    OP_REQUIRES_OK(context,
+                   context->input_list(kSparseValuesName,
+                                       &sparse_float_feature_values_list));
+    OpInputList sparse_buckets_list;
+    OP_REQUIRES_OK(
+        context, context->input_list(kSparseBucketsName, &sparse_buckets_list));
+
+    if (sparse_buckets_list.size() > 0) {
+      OP_REQUIRES(
+          context,
+          tensorflow::TensorShapeUtils::IsVector(
+              sparse_buckets_list[0].shape()),
+          errors::InvalidArgument("Sparse buckets should be flat vectors"));
+    }
+
+    // Quantize the feature values
+    QuantizeFeatures(kDenseOutputTensorName, dense_float_features_list,
+                     dense_buckets_list, context);
+
+    QuantizeFeatures(kSparseOutputTensorName, sparse_float_feature_values_list,
+                     sparse_buckets_list, context);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("Quantiles").Device(DEVICE_CPU), QuantilesOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..29635bb3c404e54f0561d9b9189270022f063cbe
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
@@ -0,0 +1,542 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h"
+#include "tensorflow/contrib/boosted_trees/proto/split_info.pb.h"
+#include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+using boosted_trees::learner::SplitInfo;
+using boosted_trees::learner::stochastic::GradientStats;
+using boosted_trees::learner::stochastic::NodeStats;
+using boosted_trees::learner::LearnerConfig_MultiClassStrategy;
+
+class BaseBuildSplitOp : public OpKernel {
+ public:
+  explicit BaseBuildSplitOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(
+        context,
+        context->GetAttr("feature_column_group_id", &feature_column_group_id_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("l1_regularization", &l1_regularization_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("l2_regularization", &l2_regularization_));
+    OP_REQUIRES_OK(context, context->GetAttr("tree_complexity_regularization",
+                                             &tree_complexity_regularization_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("min_node_weight", &min_node_weight_));
+
+    int strategy;
+    OP_REQUIRES_OK(context, context->GetAttr("multiclass_strategy", &strategy));
+    OP_REQUIRES(
+        context,
+        boosted_trees::learner::LearnerConfig_MultiClassStrategy_IsValid(
+            strategy),
+        errors::InvalidArgument("Wrong multiclass strategy passed."));
+    multiclass_strategy_ = LearnerConfig_MultiClassStrategy(strategy);
+  }
+
+  NodeStats ComputeNodeStats(const GradientStats& grad_stats) {
+    return NodeStats(l1_regularization_, l2_regularization_, min_node_weight_,
+                     multiclass_strategy_, grad_stats);
+  }
+
+  void ReadClassId(OpKernelContext* const context, int32* class_id) {
+    const Tensor* class_id_t;
+    OP_REQUIRES_OK(context, context->input("class_id", &class_id_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(class_id_t->shape()),
+                errors::InvalidArgument("class_id must be a scalar."));
+    *class_id = class_id_t->scalar<int32>()();
+  }
+
+  void FillLeaf(const int class_id, const NodeStats& best_node_stats,
+                boosted_trees::trees::Leaf* leaf) const {
+    if (class_id == -1) {
+      // This would be the case either for TREE_PER_CLASS with only 2 classes,
+      // or for other multiclass strategies.
+      for (float f : best_node_stats.weight_contribution) {
+        leaf->mutable_vector()->add_value(f);
+      }
+    } else {
+      CHECK(best_node_stats.weight_contribution.size() == 1)
+          << "Weight contribution size = "
+          << best_node_stats.weight_contribution.size();
+      leaf->mutable_sparse_vector()->add_index(class_id);
+      leaf->mutable_sparse_vector()->add_value(
+          best_node_stats.weight_contribution[0]);
+    }
+  }
+
+ protected:
+  LearnerConfig_MultiClassStrategy multiclass_strategy_;
+  int32 feature_column_group_id_;
+  float l1_regularization_;
+  float l2_regularization_;
+  float min_node_weight_;
+  float tree_complexity_regularization_;
+};
+
+class BuildDenseInequalitySplitsOp : public BaseBuildSplitOp {
+ public:
+  explicit BuildDenseInequalitySplitsOp(OpKernelConstruction* const context)
+      : BaseBuildSplitOp(context) {}
+
+  void Compute(OpKernelContext* const context) override {
+    const Tensor* num_minibatches_t;
+    OP_REQUIRES_OK(context,
+                   context->input("num_minibatches", &num_minibatches_t));
+    const int64 num_minibatches = num_minibatches_t->scalar<int64>()();
+    const float normalizer_ratio = (1.0f / num_minibatches);
+
+    const Tensor* bucket_boundaries_t;
+    OP_REQUIRES_OK(context,
+                   context->input("bucket_boundaries", &bucket_boundaries_t));
+    const auto& bucket_boundaries = bucket_boundaries_t->vec<float>();
+
+    const Tensor* partition_ids_t;
+    OP_REQUIRES_OK(context, context->input("partition_ids", &partition_ids_t));
+    const auto& partition_ids = partition_ids_t->vec<int32>();
+
+    const Tensor* bucket_ids_t;
+    OP_REQUIRES_OK(context, context->input("bucket_ids", &bucket_ids_t));
+    const auto& bucket_ids = bucket_ids_t->vec<int64>();
+
+    const Tensor* gradients_t;
+    OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));
+
+    const Tensor* hessians_t;
+    OP_REQUIRES_OK(context, context->input("hessians", &hessians_t));
+
+    int class_id;
+    ReadClassId(context, &class_id);
+
+    // Find the number of unique partitions before we allocate the output.
+    std::vector<int32> partition_boundaries;
+    partition_boundaries.push_back(0);
+    for (int i = 1; i < partition_ids.size(); ++i) {
+      if (partition_ids(i) != partition_ids(i - 1)) {
+        // Make sure the input is sorted by partition_ids;
+        OP_REQUIRES(context, partition_ids(i) >= partition_ids(i - 1),
+                    errors::InvalidArgument("Input should be sorted."));
+        partition_boundaries.push_back(i);
+      }
+    }
+    if (partition_ids.size() > 0) {
+      partition_boundaries.push_back(partition_ids.size());
+    }
+    int32 num_elements = partition_boundaries.size() - 1;
+
+    // When the handler is inactive, no bucket boundaries are built for it.
+    if (bucket_boundaries.size() == 0) {
+      num_elements = 0;
+    }
+
+    Tensor* output_partition_ids_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("output_partition_ids",
+                                            TensorShape({num_elements}),
+                                            &output_partition_ids_t));
+
+    tensorflow::TTypes<int32>::Vec output_partition_ids =
+        output_partition_ids_t->vec<int32>();
+
+    Tensor* gains_t = nullptr;
+    OP_REQUIRES_OK(
+        context, context->allocate_output("gains", TensorShape({num_elements}),
+                                          &gains_t));
+
+    tensorflow::TTypes<float>::Vec gains = gains_t->vec<float>();
+
+    Tensor* output_splits_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                "split_infos", TensorShape({num_elements}),
+                                &output_splits_t));
+    tensorflow::TTypes<string>::Vec output_splits =
+        output_splits_t->vec<string>();
+    for (int root_idx = 0; root_idx < num_elements; ++root_idx) {
+      float best_gain = std::numeric_limits<float>::lowest();
+      int start_index = partition_boundaries[root_idx];
+      int end_index = partition_boundaries[root_idx + 1];
+      GradientStats root_gradient_stats;
+      for (int64 bucket_idx = start_index; bucket_idx < end_index;
+           ++bucket_idx) {
+        root_gradient_stats +=
+            GradientStats(*gradients_t, *hessians_t, bucket_idx);
+      }
+      root_gradient_stats *= normalizer_ratio;
+      NodeStats root_stats = ComputeNodeStats(root_gradient_stats);
+      int32 best_bucket_idx = 0;
+      NodeStats best_right_node_stats(0);
+      NodeStats best_left_node_stats(0);
+      GradientStats left_gradient_stats;
+      for (int64 bucket_idx = start_index; bucket_idx < end_index;
+           ++bucket_idx) {
+        GradientStats g(*gradients_t, *hessians_t, bucket_idx);
+        g *= normalizer_ratio;
+        left_gradient_stats += g;
+        NodeStats left_stats = ComputeNodeStats(left_gradient_stats);
+        GradientStats right_gradient_stats =
+            root_gradient_stats - left_gradient_stats;
+        NodeStats right_stats = ComputeNodeStats(right_gradient_stats);
+        if (left_stats.gain + right_stats.gain > best_gain) {
+          best_gain = left_stats.gain + right_stats.gain;
+          best_left_node_stats = left_stats;
+          best_right_node_stats = right_stats;
+          best_bucket_idx = bucket_idx;
+        }
+      }
+      SplitInfo split_info;
+      auto* dense_split =
+          split_info.mutable_split_node()->mutable_dense_float_binary_split();
+      dense_split->set_feature_column(feature_column_group_id_);
+      dense_split->set_threshold(
+          bucket_boundaries(bucket_ids(best_bucket_idx)));
+
+      auto* left_child = split_info.mutable_left_child();
+      auto* right_child = split_info.mutable_right_child();
+
+      FillLeaf(class_id, best_left_node_stats, left_child);
+      FillLeaf(class_id, best_right_node_stats, right_child);
+      split_info.SerializeToString(&output_splits(root_idx));
+      gains(root_idx) =
+          best_gain - root_stats.gain - tree_complexity_regularization_;
+      output_partition_ids(root_idx) = partition_ids(start_index);
+    }
+  }
+};
+REGISTER_KERNEL_BUILDER(Name("BuildDenseInequalitySplits").Device(DEVICE_CPU),
+                        BuildDenseInequalitySplitsOp);
+
+class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp {
+ public:
+  explicit BuildSparseInequalitySplitsOp(OpKernelConstruction* const context)
+      : BaseBuildSplitOp(context) {
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("bias_feature_id", &bias_feature_id_));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    const Tensor* num_minibatches_t;
+    OP_REQUIRES_OK(context,
+                   context->input("num_minibatches", &num_minibatches_t));
+    const int64 num_minibatches = num_minibatches_t->scalar<int64>()();
+    const float normalizer_ratio = (1.0f / num_minibatches);
+
+    const Tensor* bucket_boundaries_t;
+    OP_REQUIRES_OK(context,
+                   context->input("bucket_boundaries", &bucket_boundaries_t));
+    const auto& bucket_boundaries = bucket_boundaries_t->vec<float>();
+
+    const Tensor* partition_ids_t;
+    OP_REQUIRES_OK(context, context->input("partition_ids", &partition_ids_t));
+    const auto& partition_ids = partition_ids_t->vec<int32>();
+
+    const Tensor* bucket_ids_t;
+    OP_REQUIRES_OK(context, context->input("bucket_ids", &bucket_ids_t));
+    const auto& bucket_ids = bucket_ids_t->vec<int64>();
+
+    const Tensor* gradients_t;
+    OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));
+
+    const Tensor* hessians_t;
+    OP_REQUIRES_OK(context, context->input("hessians", &hessians_t));
+
+    int class_id;
+    ReadClassId(context, &class_id);
+
+    // Find the number of unique partitions before we allocate the output.
+    std::vector<int32> partition_boundaries;
+    std::vector<int32> non_empty_partitions;
+    for (int i = 0; i < partition_ids.size() - 1; ++i) {
+      // Make sure the input is sorted by partition_ids;
+      CHECK_LE(partition_ids(i), partition_ids(i + 1));
+      if (i == 0 || partition_ids(i) != partition_ids(i - 1)) {
+        partition_boundaries.push_back(i);
+        // Some partitions might only have bias feature. We don't want to split
+        // those so check that the partition has at least 2 buckets.
+        if (partition_ids(i) == partition_ids(i + 1)) {
+          non_empty_partitions.push_back(partition_boundaries.size() - 1);
+        }
+      }
+    }
+    if (partition_ids.size() > 0) {
+      partition_boundaries.push_back(partition_ids.size());
+    }
+    int num_elements = non_empty_partitions.size();
+    Tensor* output_partition_ids_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("output_partition_ids",
+                                            TensorShape({num_elements}),
+                                            &output_partition_ids_t));
+
+    tensorflow::TTypes<int32>::Vec output_partition_ids =
+        output_partition_ids_t->vec<int32>();
+
+    Tensor* gains_t = nullptr;
+    OP_REQUIRES_OK(
+        context, context->allocate_output("gains", TensorShape({num_elements}),
+                                          &gains_t));
+
+    tensorflow::TTypes<float>::Vec gains = gains_t->vec<float>();
+
+    Tensor* output_splits_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                "split_infos", TensorShape({num_elements}),
+                                &output_splits_t));
+    tensorflow::TTypes<string>::Vec output_splits =
+        output_splits_t->vec<string>();
+    for (int root_idx = 0; root_idx < num_elements; ++root_idx) {
+      float best_gain = std::numeric_limits<float>::lowest();
+      int start_index = partition_boundaries[non_empty_partitions[root_idx]];
+      int end_index = partition_boundaries[non_empty_partitions[root_idx] + 1];
+      // First bucket ID in each partition should be the bias feature.
+      OP_REQUIRES(context, bucket_ids(start_index) == bias_feature_id_,
+                  errors::InvalidArgument("Bias feature ID missing."));
+      // For each root, we do two passes over the quantized feature buckets
+      // accumulating gradients on one side and using the root aggregate
+      // gradients to get the gradients for the other side.
+      // Split gains are evaluated for each pass at every threshold and the best
+      // split is picked.
+      GradientStats root_gradient_stats(*gradients_t, *hessians_t, start_index);
+      root_gradient_stats *= normalizer_ratio;
+      NodeStats root_stats = ComputeNodeStats(root_gradient_stats);
+      GradientStats present_gradient_stats;
+      for (int64 bucket_idx = start_index + 1; bucket_idx < end_index;
+           ++bucket_idx) {
+        present_gradient_stats +=
+            GradientStats(*gradients_t, *hessians_t, bucket_idx);
+      }
+      present_gradient_stats *= normalizer_ratio;
+      int32 best_bucket_idx = 0;
+      NodeStats best_right_node_stats(0);
+      NodeStats best_left_node_stats(0);
+      GradientStats left_gradient_stats;
+      bool default_right = false;
+      for (int64 bucket_idx = start_index + 1; bucket_idx < end_index;
+           ++bucket_idx) {
+        GradientStats g(*gradients_t, *hessians_t, bucket_idx);
+        g *= normalizer_ratio;
+        left_gradient_stats += g;
+        // We have the sum of all present gradients. Use that to compute the
+        // backward pass gradients.
+        GradientStats right_gradient_stats =
+            present_gradient_stats - left_gradient_stats;
+        {
+          NodeStats left_stats_default_left =
+              ComputeNodeStats(root_gradient_stats - right_gradient_stats);
+          NodeStats right_stats_default_left =
+              ComputeNodeStats(right_gradient_stats);
+          if (left_stats_default_left.gain + right_stats_default_left.gain >
+              best_gain) {
+            best_gain =
+                left_stats_default_left.gain + right_stats_default_left.gain;
+            best_left_node_stats = left_stats_default_left;
+            best_right_node_stats = right_stats_default_left;
+            best_bucket_idx = bucket_idx;
+            default_right = false;
+          }
+        }
+        {
+          NodeStats left_stats_default_right =
+              ComputeNodeStats(left_gradient_stats);
+          NodeStats right_stats_default_right =
+              ComputeNodeStats(root_gradient_stats - left_gradient_stats);
+          if (left_stats_default_right.gain + right_stats_default_right.gain >
+              best_gain) {
+            best_gain =
+                left_stats_default_right.gain + right_stats_default_right.gain;
+            best_left_node_stats = left_stats_default_right;
+            best_right_node_stats = right_stats_default_right;
+            best_bucket_idx = bucket_idx;
+            default_right = true;
+          }
+        }
+      }
+      SplitInfo split_info;
+      boosted_trees::trees::DenseFloatBinarySplit* dense_split = nullptr;
+      if (default_right) {
+        dense_split = split_info.mutable_split_node()
+                          ->mutable_sparse_float_binary_split_default_right()
+                          ->mutable_split();
+      } else {
+        dense_split = split_info.mutable_split_node()
+                          ->mutable_sparse_float_binary_split_default_left()
+                          ->mutable_split();
+      }
+      dense_split->set_feature_column(feature_column_group_id_);
+      dense_split->set_threshold(
+          bucket_boundaries(bucket_ids(best_bucket_idx)));
+
+      auto* left_child = split_info.mutable_left_child();
+      auto* right_child = split_info.mutable_right_child();
+      FillLeaf(class_id, best_left_node_stats, left_child);
+      FillLeaf(class_id, best_right_node_stats, right_child);
+      split_info.SerializeToString(&output_splits(root_idx));
+      gains(root_idx) =
+          best_gain - root_stats.gain - tree_complexity_regularization_;
+      output_partition_ids(root_idx) = partition_ids(start_index);
+    }
+  }
+
+ private:
+  int64 bias_feature_id_;
+};
+REGISTER_KERNEL_BUILDER(Name("BuildSparseInequalitySplits").Device(DEVICE_CPU),
+                        BuildSparseInequalitySplitsOp);
+
+class BuildCategoricalEqualitySplitsOp : public BaseBuildSplitOp {
+ public:
+  explicit BuildCategoricalEqualitySplitsOp(OpKernelConstruction* const context)
+      : BaseBuildSplitOp(context) {
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("bias_feature_id", &bias_feature_id_));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    const Tensor* num_minibatches_t;
+    OP_REQUIRES_OK(context,
+                   context->input("num_minibatches", &num_minibatches_t));
+    const int64 num_minibatches = num_minibatches_t->scalar<int64>()();
+    const float normalizer_ratio = (1.0f / num_minibatches);
+
+    const Tensor* partition_ids_t;
+    OP_REQUIRES_OK(context, context->input("partition_ids", &partition_ids_t));
+    const auto& partition_ids = partition_ids_t->vec<int32>();
+
+    const Tensor* feature_ids_t;
+    OP_REQUIRES_OK(context, context->input("feature_ids", &feature_ids_t));
+    const auto& feature_ids = feature_ids_t->vec<int64>();
+
+    const Tensor* gradients_t;
+    OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));
+
+    const Tensor* hessians_t;
+    OP_REQUIRES_OK(context, context->input("hessians", &hessians_t));
+
+    int class_id;
+    ReadClassId(context, &class_id);
+
+    // Find the number of unique partitions before we allocate the output.
+    std::vector<int32> partition_boundaries;
+    std::vector<int32> non_empty_partitions;
+    for (int i = 0; i < partition_ids.size() - 1; ++i) {
+      // Make sure the input is sorted by partition_ids;
+      CHECK_LE(partition_ids(i), partition_ids(i + 1));
+      if (i == 0 || partition_ids(i) != partition_ids(i - 1)) {
+        partition_boundaries.push_back(i);
+        // Some partitions might only have bias feature. We don't want to split
+        // those so check that the partition has at least 2 features.
+        if (partition_ids(i) == partition_ids(i + 1)) {
+          non_empty_partitions.push_back(partition_boundaries.size() - 1);
+        }
+      }
+    }
+    if (partition_ids.size() > 0) {
+      partition_boundaries.push_back(partition_ids.size());
+    }
+    int num_elements = non_empty_partitions.size();
+    Tensor* output_partition_ids_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("output_partition_ids",
+                                            TensorShape({num_elements}),
+                                            &output_partition_ids_t));
+
+    tensorflow::TTypes<int32>::Vec output_partition_ids =
+        output_partition_ids_t->vec<int32>();
+
+    Tensor* gains_t = nullptr;
+    OP_REQUIRES_OK(
+        context, context->allocate_output("gains", TensorShape({num_elements}),
+                                          &gains_t));
+
+    tensorflow::TTypes<float>::Vec gains = gains_t->vec<float>();
+
+    Tensor* output_splits_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                "split_infos", TensorShape({num_elements}),
+                                &output_splits_t));
+    tensorflow::TTypes<string>::Vec output_splits =
+        output_splits_t->vec<string>();
+    for (int root_idx = 0; root_idx < num_elements; ++root_idx) {
+      float best_gain = std::numeric_limits<float>::lowest();
+      int start_index = partition_boundaries[non_empty_partitions[root_idx]];
+      int end_index = partition_boundaries[non_empty_partitions[root_idx] + 1];
+      // First feature ID in each partition should be the bias feature.
+      OP_REQUIRES(context, feature_ids(start_index) == bias_feature_id_,
+                  errors::InvalidArgument("Bias feature ID missing."));
+      GradientStats root_gradient_stats(*gradients_t, *hessians_t, start_index);
+      root_gradient_stats *= normalizer_ratio;
+      NodeStats root_stats = ComputeNodeStats(root_gradient_stats);
+      int32 best_feature_idx = 0;
+      NodeStats best_right_node_stats(0);
+      NodeStats best_left_node_stats(0);
+      for (int64 feature_idx = start_index + 1; feature_idx < end_index;
+           ++feature_idx) {
+        GradientStats left_gradient_stats(*gradients_t, *hessians_t,
+                                          feature_idx);
+        left_gradient_stats *= normalizer_ratio;
+        GradientStats right_gradient_stats =
+            root_gradient_stats - left_gradient_stats;
+        NodeStats left_stats = ComputeNodeStats(left_gradient_stats);
+        NodeStats right_stats = ComputeNodeStats(right_gradient_stats);
+        if (left_stats.gain + right_stats.gain > best_gain) {
+          best_gain = left_stats.gain + right_stats.gain;
+          best_left_node_stats = left_stats;
+          best_right_node_stats = right_stats;
+          best_feature_idx = feature_idx;
+        }
+      }
+      SplitInfo split_info;
+      auto* equality_split = split_info.mutable_split_node()
+                                 ->mutable_categorical_id_binary_split();
+      equality_split->set_feature_column(feature_column_group_id_);
+      equality_split->set_feature_id(feature_ids(best_feature_idx));
+      auto* left_child = split_info.mutable_left_child();
+      auto* right_child = split_info.mutable_right_child();
+      FillLeaf(class_id, best_left_node_stats, left_child);
+      FillLeaf(class_id, best_right_node_stats, right_child);
+      split_info.SerializeToString(&output_splits(root_idx));
+      gains(root_idx) =
+          best_gain - root_stats.gain - tree_complexity_regularization_;
+      output_partition_ids(root_idx) = partition_ids(start_index);
+    }
+  }
+
+ private:
+  int64 bias_feature_id_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("BuildCategoricalEqualitySplits").Device(DEVICE_CPU),
+    BuildCategoricalEqualitySplitsOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cff75e71d93cb703d87bb09a4b32439e01d70f76
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
@@ -0,0 +1,784 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include <algorithm>
+#include <iterator>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/boosted_trees/lib/utils/parallel_for.h"
+#include "tensorflow/contrib/boosted_trees/lib/utils/tensor_utils.h"
+#include "tensorflow/contrib/boosted_trees/resources/stamped_resource.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+
+namespace {
+const char* const kStampTokenName = "stamp_token";
+const char* const kNextStampTokenName = "next_stamp_token";
+
+struct PartitionKey {
+  PartitionKey() : partition_id(-1), feature_id(-1) {}
+
+  PartitionKey(int32 p, int64 f) : partition_id(p), feature_id(f) {}
+
+  bool operator==(const PartitionKey& other) const {
+    return (feature_id == other.feature_id) &&
+           (partition_id == other.partition_id);
+  }
+
+  // Compare for PartitionKey.
+  struct Less {
+    bool operator()(const PartitionKey& a, const PartitionKey& b) const {
+      if (a.partition_id < b.partition_id) {
+        return true;
+      }
+      if ((a.partition_id == b.partition_id) && (a.feature_id < b.feature_id)) {
+        return true;
+      }
+      return false;
+    }
+  };
+
+  // Tree partition defined by traversing the tree to the leaf.
+  int32 partition_id;
+
+  // Feature Id within the feature column.
+  int64 feature_id;
+};
+
+template <typename GradientType, typename HessianType>
+class StatsAccumulatorResource : public boosted_trees::StampedResource {
+  using StatsByPartition =
+      std::map<PartitionKey, std::pair<GradientType, HessianType>,
+               PartitionKey::Less>;
+
+ public:
+  StatsAccumulatorResource(const TensorShape& gradient_shape,
+                           const TensorShape& hessian_shape)
+      : gradient_shape_(gradient_shape),
+        hessian_shape_(hessian_shape),
+        num_updates_(0) {
+    // If GradientType/HessianType is scalar float then the shapes should be
+    // scalar and vice versa.
+    CHECK_EQ((std::is_same<GradientType, float>::value),
+             TensorShapeUtils::IsScalar(gradient_shape));
+    CHECK_EQ((std::is_same<HessianType, float>::value),
+             TensorShapeUtils::IsScalar(hessian_shape));
+  }
+
+  string DebugString() override {
+    return strings::StrCat("StatsAccumulatorResource[size=", values_.size(),
+                           "]");
+  }
+
+  void Clear() {
+    values_.clear();
+    num_updates_ = 0;
+  }
+
+  tensorflow::mutex* mutex() { return &mu_; }
+  StatsByPartition* mutable_values() { return &values_; }
+  const StatsByPartition& values() const { return values_; }
+  const int64& num_updates() const { return num_updates_; }
+  void set_num_updates(int64 val) { num_updates_ = val; }
+  const TensorShape& gradient_shape() const { return gradient_shape_; }
+  const TensorShape& hessian_shape() const { return hessian_shape_; }
+
+ private:
+  // Key into a specific partition to accumulate stats for the specified feature
+  // id.
+  StatsByPartition values_;
+  const TensorShape gradient_shape_;
+  const TensorShape hessian_shape_;
+  int64 num_updates_;
+  tensorflow::mutex mu_;
+  TF_DISALLOW_COPY_AND_ASSIGN(StatsAccumulatorResource);
+};
+
+using StatsAccumulatorScalarResource = StatsAccumulatorResource<float, float>;
+using StatsAccumulatorTensorResource =
+    StatsAccumulatorResource<std::vector<float>, std::vector<float>>;
+
+void SerializeScalarAccumulatorToOutput(
+    const StatsAccumulatorScalarResource& accumulator_resource,
+    OpKernelContext* context) {
+  int64 num_slots = accumulator_resource.values().size();
+  Tensor* partition_ids_t = nullptr;
+  OP_REQUIRES_OK(
+      context,
+      context->allocate_output("output_partition_ids", TensorShape({num_slots}),
+                               &partition_ids_t));
+  auto partition_ids = partition_ids_t->vec<int32>();
+
+  Tensor* feature_ids_t = nullptr;
+  OP_REQUIRES_OK(
+      context,
+      context->allocate_output("output_feature_ids", TensorShape({num_slots}),
+                               &feature_ids_t));
+  auto feature_ids = feature_ids_t->vec<int64>();
+
+  Tensor* gradients_t = nullptr;
+  OP_REQUIRES_OK(
+      context,
+      context->allocate_output("output_gradients", TensorShape({num_slots}),
+                               &gradients_t));
+  auto gradients = gradients_t->vec<float>();
+
+  Tensor* hessians_t = nullptr;
+  OP_REQUIRES_OK(context,
+                 context->allocate_output(
+                     "output_hessians", TensorShape({num_slots}), &hessians_t));
+  auto hessians = hessians_t->vec<float>();
+
+  int i = 0;
+  for (const auto& iter : accumulator_resource.values()) {
+    partition_ids(i) = iter.first.partition_id;
+    feature_ids(i) = iter.first.feature_id;
+    gradients(i) = iter.second.first;
+    hessians(i) = iter.second.second;
+    ++i;
+  }
+}
+
+void SerializeTensorAccumulatorToOutput(
+    const StatsAccumulatorTensorResource& accumulator_resource,
+    OpKernelContext* context) {
+  int64 num_slots = accumulator_resource.values().size();
+  Tensor* partition_ids_t = nullptr;
+  OP_REQUIRES_OK(
+      context,
+      context->allocate_output("output_partition_ids", TensorShape({num_slots}),
+                               &partition_ids_t));
+  auto partition_ids = partition_ids_t->vec<int32>();
+
+  Tensor* feature_ids_t = nullptr;
+  OP_REQUIRES_OK(
+      context,
+      context->allocate_output("output_feature_ids", TensorShape({num_slots}),
+                               &feature_ids_t));
+  auto feature_ids = feature_ids_t->vec<int64>();
+
+  TensorShape gradient_shape = accumulator_resource.gradient_shape();
+  int64 num_gradient_elements = gradient_shape.num_elements();
+  gradient_shape.InsertDim(0, num_slots);
+  Tensor* gradients_t = nullptr;
+  OP_REQUIRES_OK(context,
+                 context->allocate_output("output_gradients", gradient_shape,
+                                          &gradients_t));
+  auto gradients = gradients_t->flat_outer_dims<float>();
+
+  TensorShape hessian_shape = accumulator_resource.hessian_shape();
+  int64 num_hessian_elements = hessian_shape.num_elements();
+  hessian_shape.InsertDim(0, num_slots);
+  Tensor* hessians_t = nullptr;
+  OP_REQUIRES_OK(
+      context,
+      context->allocate_output("output_hessians", hessian_shape, &hessians_t));
+  auto hessians = hessians_t->flat_outer_dims<float>();
+
+  int i = 0;
+  for (const auto& iter : accumulator_resource.values()) {
+    partition_ids(i) = iter.first.partition_id;
+    feature_ids(i) = iter.first.feature_id;
+    for (int j = 0; j < num_gradient_elements; ++j) {
+      gradients(i, j) = iter.second.first[j];
+    }
+    for (int j = 0; j < num_hessian_elements; ++j) {
+      hessians(i, j) = iter.second.second[j];
+    }
+    ++i;
+  }
+}
+
+void AddToScalarAccumulator(
+    StatsAccumulatorScalarResource* accumulator_resource,
+    const Tensor& partition_ids_t, const Tensor& feature_ids_t,
+    const Tensor& gradients_t, const Tensor& hessians_t) {
+  accumulator_resource->set_num_updates(accumulator_resource->num_updates() +
+                                        1);
+  const TensorShape& partition_ids_shape = partition_ids_t.shape();
+  const auto& partition_ids = partition_ids_t.vec<int32>();
+  const auto& feature_ids = feature_ids_t.vec<int64>();
+  const auto& gradients = gradients_t.vec<float>();
+  const auto& hessians = hessians_t.vec<float>();
+
+  int64 num_updates = partition_ids_shape.dim_size(0);
+  auto stats_map = accumulator_resource->mutable_values();
+  for (int64 i = 0; i < num_updates; ++i) {
+    const auto key = PartitionKey(partition_ids(i), feature_ids(i));
+    auto itr = stats_map->find(key);
+    if (itr != stats_map->end()) {
+      itr->second.first += gradients(i);
+      itr->second.second += hessians(i);
+    } else {
+      (*stats_map)[key] = {gradients(i), hessians(i)};
+    }
+  }
+}
+
+void AddToScalarAccumulator(
+    StatsAccumulatorScalarResource* accumulator_resource,
+    OpKernelContext* context) {
+  const Tensor* partition_ids_t;
+  OP_REQUIRES_OK(context, context->input("partition_ids", &partition_ids_t));
+  const Tensor* feature_ids_t;
+  OP_REQUIRES_OK(context, context->input("feature_ids", &feature_ids_t));
+  const Tensor* gradients_t;
+  OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));
+  const Tensor* hessians_t;
+  OP_REQUIRES_OK(context, context->input("hessians", &hessians_t));
+  AddToScalarAccumulator(accumulator_resource, *partition_ids_t, *feature_ids_t,
+                         *gradients_t, *hessians_t);
+}
+
+void AddToTensorAccumulator(
+    StatsAccumulatorTensorResource* accumulator_resource,
+    const Tensor& partition_ids_t, const Tensor& feature_ids_t,
+    const Tensor& gradients_t, const Tensor& hessians_t,
+    OpKernelContext* context) {
+  accumulator_resource->set_num_updates(accumulator_resource->num_updates() +
+                                        1);
+
+  const TensorShape& partition_ids_shape = partition_ids_t.shape();
+  const auto& partition_ids = partition_ids_t.vec<int32>();
+  const auto& feature_ids = feature_ids_t.vec<int64>();
+  TensorShape gradients_shape = gradients_t.shape();
+  const auto& gradients = gradients_t.flat_outer_dims<float>();
+  TensorShape hessians_shape = hessians_t.shape();
+  const auto& hessians = hessians_t.flat_outer_dims<float>();
+
+  gradients_shape.RemoveDim(0);
+  hessians_shape.RemoveDim(0);
+
+  // TODO(soroush): Move gradient and hessian shape check to ShapeFn.
+  OP_REQUIRES(
+      context, gradients_shape == accumulator_resource->gradient_shape(),
+      errors::InvalidArgument(strings::StrCat(
+          "Gradients dimensions must match: ", gradients_shape.DebugString(),
+          ", ", accumulator_resource->gradient_shape().DebugString())));
+
+  OP_REQUIRES(
+      context, hessians_shape == accumulator_resource->hessian_shape(),
+      errors::InvalidArgument(strings::StrCat(
+          "Hessian dimensions must match: ", hessians_shape.DebugString(), ", ",
+          accumulator_resource->hessian_shape().DebugString())));
+
+  int64 num_updates = partition_ids_shape.dim_size(0);
+  auto stats_map = accumulator_resource->mutable_values();
+  for (int64 i = 0; i < num_updates; ++i) {
+    const auto key = PartitionKey(partition_ids(i), feature_ids(i));
+    auto itr = stats_map->find(key);
+    if (itr == stats_map->end()) {
+      std::vector<float> new_gradients(gradients_shape.num_elements());
+      for (int j = 0; j < gradients_shape.num_elements(); ++j) {
+        new_gradients[j] = gradients(i, j);
+      }
+      std::vector<float> new_hessians(hessians_shape.num_elements());
+      for (int j = 0; j < hessians_shape.num_elements(); ++j) {
+        new_hessians[j] = hessians(i, j);
+      }
+      (*stats_map)[key] = {new_gradients, new_hessians};
+    } else {
+      auto& stored_gradients = itr->second.first;
+      for (int j = 0; j < gradients_shape.num_elements(); ++j) {
+        stored_gradients[j] += gradients(i, j);
+      }
+      auto& stored_hessians = itr->second.second;
+      for (int j = 0; j < hessians_shape.num_elements(); ++j) {
+        stored_hessians[j] += hessians(i, j);
+      }
+    }
+  }
+}
+
+void AddToTensorAccumulator(
+    StatsAccumulatorTensorResource* accumulator_resource,
+    OpKernelContext* context) {
+  const Tensor* partition_ids_t;
+  OP_REQUIRES_OK(context, context->input("partition_ids", &partition_ids_t));
+  const Tensor* feature_ids_t;
+  OP_REQUIRES_OK(context, context->input("feature_ids", &feature_ids_t));
+  const Tensor* gradients_t;
+  OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));
+  const Tensor* hessians_t;
+  OP_REQUIRES_OK(context, context->input("hessians", &hessians_t));
+  AddToTensorAccumulator(accumulator_resource, *partition_ids_t, *feature_ids_t,
+                         *gradients_t, *hessians_t, context);
+}
+
+}  // namespace
+
+REGISTER_RESOURCE_HANDLE_KERNEL(StatsAccumulatorScalarResource);
+REGISTER_RESOURCE_HANDLE_KERNEL(StatsAccumulatorTensorResource);
+
+REGISTER_KERNEL_BUILDER(
+    Name("StatsAccumulatorScalarIsInitialized").Device(DEVICE_CPU),
+    IsResourceInitialized<StatsAccumulatorScalarResource>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("StatsAccumulatorTensorIsInitialized").Device(DEVICE_CPU),
+    IsResourceInitialized<StatsAccumulatorTensorResource>);
+
+class CreateStatsAccumulatorScalarOp : public OpKernel {
+ public:
+  explicit CreateStatsAccumulatorScalarOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input("stamp_token", &stamp_token_t));
+
+    TensorShape gradient_shape = TensorShape({});
+    TensorShape hessian_shape = TensorShape({});
+
+    auto* result =
+        new StatsAccumulatorScalarResource(gradient_shape, hessian_shape);
+    result->set_stamp(stamp_token_t->scalar<int64>()());
+    // Only create one, if one does not exist already. Report status for all
+    // other exceptions. If one already exists, it unrefs the new one.
+    auto status = CreateResource(context, HandleFromInput(context, 0), result);
+    if (!status.ok() && status.code() != tensorflow::error::ALREADY_EXISTS) {
+      OP_REQUIRES(context, false, status);
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("CreateStatsAccumulatorScalar").Device(DEVICE_CPU),
+                        CreateStatsAccumulatorScalarOp);
+
+class CreateStatsAccumulatorTensorOp : public OpKernel {
+ public:
+  explicit CreateStatsAccumulatorTensorOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input("stamp_token", &stamp_token_t));
+
+    const Tensor* gradient_shape_t;
+    OP_REQUIRES_OK(
+        context, context->input("per_slot_gradient_shape", &gradient_shape_t));
+
+    const Tensor* hessian_shape_t;
+    OP_REQUIRES_OK(context,
+                   context->input("per_slot_hessian_shape", &hessian_shape_t));
+    TensorShape gradient_shape = TensorShape(gradient_shape_t->vec<int64>());
+    TensorShape hessian_shape = TensorShape(hessian_shape_t->vec<int64>());
+    auto* result =
+        new StatsAccumulatorTensorResource(gradient_shape, hessian_shape);
+    result->set_stamp(stamp_token_t->scalar<int64>()());
+
+    // Only create one, if one does not exist already. Report status for all
+    // other exceptions. If one already exists, it unrefs the new one.
+    auto status = CreateResource(context, HandleFromInput(context, 0), result);
+    if (!status.ok() && status.code() != tensorflow::error::ALREADY_EXISTS) {
+      OP_REQUIRES(context, false, status);
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("CreateStatsAccumulatorTensor").Device(DEVICE_CPU),
+                        CreateStatsAccumulatorTensorOp);
+
+class StatsAccumulatorScalarAddOp : public OpKernel {
+ public:
+  explicit StatsAccumulatorScalarAddOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    OpInputList resource_handle_list;
+    OP_REQUIRES_OK(context, context->input_list("stats_accumulator_handles",
+                                                &resource_handle_list));
+    OpInputList partition_ids_list;
+    OP_REQUIRES_OK(context,
+                   context->input_list("partition_ids", &partition_ids_list));
+
+    OpInputList feature_ids_list;
+    OP_REQUIRES_OK(context,
+                   context->input_list("feature_ids", &feature_ids_list));
+    OpInputList gradients_list;
+    OP_REQUIRES_OK(context, context->input_list("gradients", &gradients_list));
+    OpInputList hessians_list;
+    OP_REQUIRES_OK(context, context->input_list("hessians", &hessians_list));
+
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input(kStampTokenName, &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    thread::ThreadPool* const worker_threads =
+        context->device()->tensorflow_cpu_worker_threads()->workers;
+    boosted_trees::utils::ParallelFor(
+        resource_handle_list.size(), worker_threads->NumThreads(),
+        worker_threads,
+        [&context, &resource_handle_list, &partition_ids_list,
+         &feature_ids_list, &gradients_list, &hessians_list,
+         stamp_token](int64 start, int64 end) {
+          for (int resource_handle_idx = start; resource_handle_idx < end;
+               ++resource_handle_idx) {
+            ResourceHandle handle = resource_handle_list[resource_handle_idx]
+                                        .flat<ResourceHandle>()(0);
+
+            StatsAccumulatorScalarResource* accumulator_resource;
+            OP_REQUIRES_OK(context, LookupResource(context, handle,
+                                                   &accumulator_resource));
+            mutex_lock l(*accumulator_resource->mutex());
+            core::ScopedUnref unref_me(accumulator_resource);
+
+            // If the stamp is invalid we drop the update.
+            if (!accumulator_resource->is_stamp_valid(stamp_token)) {
+              VLOG(1) << "Invalid stamp token in StatsAccumulatorScalarAddOp. "
+                      << "Passed stamp token: " << stamp_token << " "
+                      << "Current token: " << accumulator_resource->stamp();
+              return;
+            }
+            AddToScalarAccumulator(accumulator_resource,
+                                   partition_ids_list[resource_handle_idx],
+                                   feature_ids_list[resource_handle_idx],
+                                   gradients_list[resource_handle_idx],
+                                   hessians_list[resource_handle_idx]);
+          }
+        });
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("StatsAccumulatorScalarAdd").Device(DEVICE_CPU),
+                        StatsAccumulatorScalarAddOp);
+
+class StatsAccumulatorTensorAddOp : public OpKernel {
+ public:
+  explicit StatsAccumulatorTensorAddOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    OpInputList resource_handle_list;
+    OP_REQUIRES_OK(context, context->input_list("stats_accumulator_handles",
+                                                &resource_handle_list));
+    OpInputList partition_ids_list;
+    OP_REQUIRES_OK(context,
+                   context->input_list("partition_ids", &partition_ids_list));
+
+    OpInputList feature_ids_list;
+    OP_REQUIRES_OK(context,
+                   context->input_list("feature_ids", &feature_ids_list));
+    OpInputList gradients_list;
+    OP_REQUIRES_OK(context, context->input_list("gradients", &gradients_list));
+    OpInputList hessians_list;
+    OP_REQUIRES_OK(context, context->input_list("hessians", &hessians_list));
+
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input(kStampTokenName, &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    thread::ThreadPool* const worker_threads =
+        context->device()->tensorflow_cpu_worker_threads()->workers;
+    boosted_trees::utils::ParallelFor(
+        resource_handle_list.size(), worker_threads->NumThreads(),
+        worker_threads,
+        [&context, &resource_handle_list, &partition_ids_list,
+         &feature_ids_list, &gradients_list, &hessians_list,
+         stamp_token](int64 start, int64 end) {
+          for (int resource_handle_idx = start; resource_handle_idx < end;
+               ++resource_handle_idx) {
+            ResourceHandle handle = resource_handle_list[resource_handle_idx]
+                                        .flat<ResourceHandle>()(0);
+
+            StatsAccumulatorTensorResource* accumulator_resource;
+            OP_REQUIRES_OK(context, LookupResource(context, handle,
+                                                   &accumulator_resource));
+            mutex_lock l(*accumulator_resource->mutex());
+            core::ScopedUnref unref_me(accumulator_resource);
+
+            // If the stamp is invalid we drop the update.
+            if (!accumulator_resource->is_stamp_valid(stamp_token)) {
+              VLOG(1) << "Invalid stamp token in StatsAccumulatorScalarAddOp. "
+                      << "Passed stamp token: " << stamp_token << " "
+                      << "Current token: " << accumulator_resource->stamp();
+              return;
+            }
+            AddToTensorAccumulator(accumulator_resource,
+                                   partition_ids_list[resource_handle_idx],
+                                   feature_ids_list[resource_handle_idx],
+                                   gradients_list[resource_handle_idx],
+                                   hessians_list[resource_handle_idx], context);
+          }
+        });
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("StatsAccumulatorTensorAdd").Device(DEVICE_CPU),
+                        StatsAccumulatorTensorAddOp);
+
+class StatsAccumulatorScalarFlushOp : public OpKernel {
+ public:
+  explicit StatsAccumulatorScalarFlushOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    StatsAccumulatorScalarResource* accumulator_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &accumulator_resource));
+    mutex_lock l(*accumulator_resource->mutex());
+    core::ScopedUnref unref_me(accumulator_resource);
+
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input(kStampTokenName, &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    // If the stamp is invalid we restart the PS. It shouldn't happen since
+    // only Chief should call this function and chief is guaranteed to be in
+    // a consistent state.
+    CHECK(accumulator_resource->is_stamp_valid(stamp_token));
+
+    const Tensor* next_stamp_token_t;
+    OP_REQUIRES_OK(context,
+                   context->input(kNextStampTokenName, &next_stamp_token_t));
+    int64 next_stamp_token = next_stamp_token_t->scalar<int64>()();
+    CHECK(stamp_token != next_stamp_token);
+
+    SerializeScalarAccumulatorToOutput(*accumulator_resource, context);
+    Tensor* num_updates_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("num_updates", TensorShape({}),
+                                            &num_updates_t));
+    num_updates_t->scalar<int64>()() = accumulator_resource->num_updates();
+
+    accumulator_resource->Clear();
+    accumulator_resource->set_stamp(next_stamp_token);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("StatsAccumulatorScalarFlush").Device(DEVICE_CPU),
+                        StatsAccumulatorScalarFlushOp);
+
+class StatsAccumulatorTensorFlushOp : public OpKernel {
+ public:
+  explicit StatsAccumulatorTensorFlushOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    StatsAccumulatorTensorResource* accumulator_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &accumulator_resource));
+    mutex_lock l(*accumulator_resource->mutex());
+    core::ScopedUnref unref_me(accumulator_resource);
+
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input(kStampTokenName, &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    const Tensor* next_stamp_token_t;
+    OP_REQUIRES_OK(context,
+                   context->input(kNextStampTokenName, &next_stamp_token_t));
+    int64 next_stamp_token = next_stamp_token_t->scalar<int64>()();
+
+    // If the stamp is invalid we restart the PS. It shouldn't happen since
+    // only Chief should call this function and chief is guaranteed to be in
+    // a consistent state.
+    CHECK(accumulator_resource->is_stamp_valid(stamp_token));
+    CHECK(stamp_token != next_stamp_token);
+    SerializeTensorAccumulatorToOutput(*accumulator_resource, context);
+    Tensor* num_updates_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("num_updates", TensorShape({}),
+                                            &num_updates_t));
+    num_updates_t->scalar<int64>()() = accumulator_resource->num_updates();
+    accumulator_resource->Clear();
+    accumulator_resource->set_stamp(next_stamp_token);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("StatsAccumulatorTensorFlush").Device(DEVICE_CPU),
+                        StatsAccumulatorTensorFlushOp);
+
+class StatsAccumulatorScalarDeserializeOp : public OpKernel {
+ public:
+  explicit StatsAccumulatorScalarDeserializeOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    StatsAccumulatorScalarResource* accumulator_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &accumulator_resource));
+    mutex_lock l(*accumulator_resource->mutex());
+    core::ScopedUnref unref_me(accumulator_resource);
+
+    // Check the stamp token.
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input(kStampTokenName, &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+    accumulator_resource->Clear();
+    accumulator_resource->set_stamp(stamp_token);
+    AddToScalarAccumulator(accumulator_resource, context);
+    const Tensor* num_updates_t;
+    OP_REQUIRES_OK(context, context->input("num_updates", &num_updates_t));
+    accumulator_resource->set_num_updates(num_updates_t->scalar<int64>()());
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("StatsAccumulatorScalarDeserialize").Device(DEVICE_CPU),
+    StatsAccumulatorScalarDeserializeOp);
+
+class StatsAccumulatorTensorDeserializeOp : public OpKernel {
+ public:
+  explicit StatsAccumulatorTensorDeserializeOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    StatsAccumulatorTensorResource* accumulator_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &accumulator_resource));
+    mutex_lock l(*accumulator_resource->mutex());
+    core::ScopedUnref unref_me(accumulator_resource);
+
+    // Check the stamp token.
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input(kStampTokenName, &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+    accumulator_resource->Clear();
+    accumulator_resource->set_stamp(stamp_token);
+    AddToTensorAccumulator(accumulator_resource, context);
+    const Tensor* num_updates_t;
+    OP_REQUIRES_OK(context, context->input("num_updates", &num_updates_t));
+    accumulator_resource->set_num_updates(num_updates_t->scalar<int64>()());
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("StatsAccumulatorTensorDeserialize").Device(DEVICE_CPU),
+    StatsAccumulatorTensorDeserializeOp);
+
+class StatsAccumulatorScalarSerializeOp : public OpKernel {
+ public:
+  explicit StatsAccumulatorScalarSerializeOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    StatsAccumulatorScalarResource* accumulator_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &accumulator_resource));
+    mutex_lock l(*accumulator_resource->mutex());
+    core::ScopedUnref unref_me(accumulator_resource);
+    SerializeScalarAccumulatorToOutput(*accumulator_resource, context);
+    Tensor* stamp_token_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("stamp_token", TensorShape({}),
+                                            &stamp_token_t));
+    stamp_token_t->scalar<int64>()() = accumulator_resource->stamp();
+
+    Tensor* num_updates_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("num_updates", TensorShape({}),
+                                            &num_updates_t));
+    num_updates_t->scalar<int64>()() = accumulator_resource->num_updates();
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("StatsAccumulatorScalarSerialize").Device(DEVICE_CPU),
+    StatsAccumulatorScalarSerializeOp);
+
+class StatsAccumulatorTensorSerializeOp : public OpKernel {
+ public:
+  explicit StatsAccumulatorTensorSerializeOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    StatsAccumulatorTensorResource* accumulator_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &accumulator_resource));
+    mutex_lock l(*accumulator_resource->mutex());
+    core::ScopedUnref unref_me(accumulator_resource);
+    SerializeTensorAccumulatorToOutput(*accumulator_resource, context);
+    Tensor* stamp_token_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("stamp_token", TensorShape({}),
+                                            &stamp_token_t));
+    stamp_token_t->scalar<int64>()() = accumulator_resource->stamp();
+
+    Tensor* num_updates_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("num_updates", TensorShape({}),
+                                            &num_updates_t));
+    num_updates_t->scalar<int64>()() = accumulator_resource->num_updates();
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("StatsAccumulatorTensorSerialize").Device(DEVICE_CPU),
+    StatsAccumulatorTensorSerializeOp);
+
+class StatsAccumulatorScalarMakeSummaryOp : public OpKernel {
+ public:
+  explicit StatsAccumulatorScalarMakeSummaryOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    TensorShape gradient_shape = TensorShape({});
+    TensorShape hessian_shape = TensorShape({});
+    StatsAccumulatorScalarResource* accumulator_resource =
+        new StatsAccumulatorScalarResource(gradient_shape, hessian_shape);
+    core::ScopedUnref unref_me(accumulator_resource);
+    // Check the stamp token.
+    AddToScalarAccumulator(accumulator_resource, context);
+    SerializeScalarAccumulatorToOutput(*accumulator_resource, context);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("StatsAccumulatorScalarMakeSummary").Device(DEVICE_CPU),
+    StatsAccumulatorScalarMakeSummaryOp);
+
+class StatsAccumulatorTensorMakeSummaryOp : public OpKernel {
+ public:
+  explicit StatsAccumulatorTensorMakeSummaryOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor* gradients_t;
+    OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));
+    TensorShape gradients_shape = gradients_t->shape();
+    gradients_shape.RemoveDim(0);
+
+    const Tensor* hessians_t;
+    OP_REQUIRES_OK(context, context->input("hessians", &hessians_t));
+    TensorShape hessians_shape = hessians_t->shape();
+    hessians_shape.RemoveDim(0);
+
+    StatsAccumulatorTensorResource* accumulator_resource =
+        new StatsAccumulatorTensorResource(gradients_shape, hessians_shape);
+    core::ScopedUnref unref_me(accumulator_resource);
+    // Check the stamp token.
+    AddToTensorAccumulator(accumulator_resource, context);
+    SerializeTensorAccumulatorToOutput(*accumulator_resource, context);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("StatsAccumulatorTensorMakeSummary").Device(DEVICE_CPU),
+    StatsAccumulatorTensorMakeSummaryOp);
+
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..12200e6a25618c32b27b88b17c052ab2df39fd48
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
@@ -0,0 +1,825 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h"
+#include "tensorflow/contrib/boosted_trees/proto/learner.pb.h"
+#include "tensorflow/contrib/boosted_trees/proto/split_info.pb.h"
+#include "tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+using tensorflow::boosted_trees::learner::LearningRateDropoutDrivenConfig;
+
+namespace boosted_trees {
+
+using boosted_trees::trees::DecisionTreeEnsembleConfig;
+using boosted_trees::trees::TreeNode;
+using boosted_trees::trees::TreeNodeMetadata;
+using boosted_trees::utils::DropoutUtils;
+using boosted_trees::learner::LearningRateConfig;
+
+namespace {
+
+// SplitCandidate holds the split candidate node along with the stats.
+struct SplitCandidate {
+  // Id of handler that generated the split candidate.
+  int64 handler_id;
+
+  // Split gain.
+  float gain;
+
+  // Split info.
+  learner::SplitInfo split_info;
+};
+
+// Helper method to update the best split per partition given
+// a current candidate.
+void UpdateBestSplit(
+    const boosted_trees::learner::LearnerConfig& learner_config,
+    int32 partition_id, SplitCandidate* split,
+    std::map<int32, SplitCandidate>* best_splits) {
+  // Don't consider nodeless splits.
+  if (TF_PREDICT_FALSE(split->split_info.split_node().node_case() ==
+                       TreeNode::NODE_NOT_SET)) {
+    return;
+  }
+
+  // Don't consider negative splits if we're pre-pruning the tree.
+  // Note that zero-gain splits are acceptable as they're mostly doing as well
+  // as what bias centering in that partition would do.
+  if (learner_config.pruning_mode() ==
+          boosted_trees::learner::LearnerConfig::PRE_PRUNE &&
+      split->gain < 0) {
+    return;
+  }
+
+  // Take the split if we don't have a candidate yet.
+  auto best_split_it = best_splits->find(partition_id);
+  if (best_split_it == best_splits->end()) {
+    best_splits->insert(std::make_pair(partition_id, std::move(*split)));
+    return;
+  }
+
+  // Determine if best split so far needs to be replaced.
+  SplitCandidate& best_split = best_split_it->second;
+  if (TF_PREDICT_FALSE(split->gain == best_split.gain)) {
+    // Tie break on node case preferring simpler tree node types.
+    VLOG(2) << "Attempting to tie break with smaller node case. "
+            << "(current split: " << split->split_info.split_node().node_case()
+            << ", best split: "
+            << best_split.split_info.split_node().node_case() << ")";
+    if (split->split_info.split_node().node_case() <
+        best_split.split_info.split_node().node_case()) {
+      best_split = std::move(*split);
+    } else if (split->split_info.split_node().node_case() ==
+               best_split.split_info.split_node().node_case()) {
+      // Tie break on handler Id.
+      VLOG(2) << "Tie breaking with higher handler Id. "
+              << "(current split: " << split->handler_id
+              << ", best split: " << best_split.handler_id << ")";
+      if (split->handler_id > best_split.handler_id) {
+        best_split = std::move(*split);
+      }
+    }
+  } else if (split->gain > best_split.gain) {
+    best_split = std::move(*split);
+  }
+}
+
+// Helper method to check whether a node is a terminal node in that it
+// only has leaf nodes as children.
+bool IsTerminalSplitNode(const size_t node_id,
+                         const std::vector<int32>& children,
+                         const std::vector<TreeNode>& nodes) {
+  for (const int32 child_id : children) {
+    const auto& child_node = nodes[child_id];
+    CHECK(child_node.node_case() != TreeNode::NODE_NOT_SET);
+    if (child_node.node_case() != TreeNode::kLeaf) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Helper method to recursively prune the tree in a depth-first fashion.
+void RecursivePruneTree(const size_t node_id, std::vector<TreeNode>* nodes) {
+  // Base case when we reach a leaf.
+  TreeNode& tree_node = (*nodes)[node_id];
+  CHECK(tree_node.node_case() != TreeNode::NODE_NOT_SET);
+  if (tree_node.node_case() == TreeNode::kLeaf) {
+    return;
+  }
+
+  // Traverse node children first and recursively prune their sub-trees.
+  const std::vector<int32> children =
+      boosted_trees::trees::DecisionTree::GetChildren(tree_node);
+  for (const int32 child_id : children) {
+    RecursivePruneTree(child_id, nodes);
+  }
+
+  // Two conditions must be satisfied to prune the node:
+  // 1- The split gain is negative.
+  // 2- After depth-first pruning, the node only has leaf children.
+  TreeNodeMetadata* node_metadata = tree_node.mutable_node_metadata();
+  if (node_metadata->gain() < 0 &&
+      IsTerminalSplitNode(node_id, children, (*nodes))) {
+    // Clear node children.
+    for (const int32 child_id : children) {
+      auto& child_node = (*nodes)[child_id];
+      child_node.Clear();
+    }
+
+    // Change node back into leaf.
+    (*tree_node.mutable_leaf()) = *node_metadata->mutable_original_leaf();
+
+    // Clear gain for leaf node.
+    tree_node.clear_node_metadata();
+  } else {
+    // Clear original leaf as it's no longer needed for back-track pruning.
+    node_metadata->clear_original_leaf();
+  }
+}
+
+}  // namespace
+
+class CenterTreeEnsembleBiasOp : public OpKernel {
+ public:
+  explicit CenterTreeEnsembleBiasOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    // Read learner config.
+    string serialized_learner_config;
+    OP_REQUIRES_OK(context, context->GetAttr("learner_config",
+                                             &serialized_learner_config));
+    OP_REQUIRES(context,
+                learner_config_.ParseFromString(serialized_learner_config),
+                errors::InvalidArgument("Unable to parse learner config."));
+
+    // Read centering epsilon.
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("centering_epsilon", &centering_epsilon_));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    // Get decision tree ensemble.
+    boosted_trees::models::DecisionTreeEnsembleResource*
+        decision_tree_ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_ensemble_resource));
+    core::ScopedUnref unref_me(decision_tree_ensemble_resource);
+    mutex_lock l(*decision_tree_ensemble_resource->get_mutex());
+
+    // Get the stamp token.
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input("stamp_token", &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    // Only the Chief should run this Op and it is guaranteed to be in
+    // a consistent state so the stamps must always match.
+    CHECK(decision_tree_ensemble_resource->is_stamp_valid(stamp_token));
+
+    // Get the next stamp token.
+    const Tensor* next_stamp_token_t;
+    OP_REQUIRES_OK(context,
+                   context->input("next_stamp_token", &next_stamp_token_t));
+    int64 next_stamp_token = next_stamp_token_t->scalar<int64>()();
+    CHECK(stamp_token != next_stamp_token);
+
+    // Get the delta updates.
+    const Tensor* delta_updates_t;
+    OP_REQUIRES_OK(context, context->input("delta_updates", &delta_updates_t));
+    OP_REQUIRES(
+        context,
+        delta_updates_t->dim_size(0) + 1 == learner_config_.num_classes(),
+        errors::InvalidArgument(
+            "Delta updates size must be consistent with label dimensions."));
+    auto delta_updates = delta_updates_t->vec<float>();
+
+    // Update the ensemble stamp.
+    decision_tree_ensemble_resource->set_stamp(next_stamp_token);
+
+    // Get the bias.
+    boosted_trees::trees::Leaf* bias =
+        RetrieveBias(decision_tree_ensemble_resource);
+    CHECK(bias->has_vector());
+    OP_REQUIRES(
+        context,
+        bias->vector().value_size() + 1 == learner_config_.num_classes(),
+        errors::InvalidArgument(
+            "Bias vector size must be consistent with label dimensions."));
+
+    // Update the bias.
+    float total_delta = 0;
+    auto* bias_vec = bias->mutable_vector();
+    for (size_t idx = 0; idx < bias->vector().value_size(); ++idx) {
+      float delta = delta_updates(idx);
+      bias_vec->set_value(idx, bias_vec->value(idx) + delta);
+      total_delta += std::abs(delta);
+    }
+
+    // Make a centering continuation decision based on current update.
+    bool continue_centering = total_delta > centering_epsilon_;
+    if (continue_centering) {
+      VLOG(1) << "Continuing to center bias, delta=" << total_delta;
+    } else {
+      VLOG(1) << "Done centering bias, delta=" << total_delta;
+    }
+    Tensor* continue_centering_t = nullptr;
+    OP_REQUIRES_OK(
+        context, context->allocate_output("continue_centering", TensorShape({}),
+                                          &continue_centering_t));
+    continue_centering_t->scalar<bool>()() = continue_centering;
+  }
+
+ private:
+  // Helper method to retrieve the bias from the tree ensemble.
+  boosted_trees::trees::Leaf* RetrieveBias(
+      boosted_trees::models::DecisionTreeEnsembleResource*
+          decision_tree_ensemble_resource) {
+    boosted_trees::trees::DecisionTreeEnsembleConfig* ensemble_config =
+        decision_tree_ensemble_resource->mutable_decision_tree_ensemble();
+    const auto num_trees = ensemble_config->trees_size();
+    CHECK(num_trees == ensemble_config->tree_metadata_size() &&
+          num_trees == ensemble_config->tree_weights_size());
+    if (num_trees <= 0) {
+      ensemble_config->mutable_growing_metadata()->set_num_trees_attempted(1);
+      ensemble_config->mutable_growing_metadata()->set_num_layers_attempted(1);
+      // Add a new bias leaf.
+      boosted_trees::trees::DecisionTreeConfig* tree_config =
+          ensemble_config->add_trees();
+      auto* leaf = tree_config->add_nodes()->mutable_leaf();
+      for (size_t idx = 0; idx + 1 < learner_config_.num_classes(); ++idx) {
+        leaf->mutable_vector()->add_value(0);
+      }
+      ensemble_config->add_tree_weights(1.0);
+      boosted_trees::trees::DecisionTreeMetadata* tree_metadata =
+          ensemble_config->add_tree_metadata();
+      tree_metadata->set_num_layers_grown(1);
+      tree_metadata->set_is_finalized(true);
+      return leaf;
+    } else if (num_trees == 1) {
+      // Update the existing bias.
+      CHECK_EQ(ensemble_config->trees(0).nodes_size(), 1);
+      auto* node = ensemble_config->mutable_trees(0)->mutable_nodes(0);
+      CHECK(node->node_case() == TreeNode::kLeaf);
+      return node->mutable_leaf();
+    } else {
+      CHECK(false) << "Unable to center bias on an already grown ensemble";
+    }
+  }
+
+  boosted_trees::learner::LearnerConfig learner_config_;
+  float centering_epsilon_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("CenterTreeEnsembleBias").Device(DEVICE_CPU),
+                        CenterTreeEnsembleBiasOp);
+
+class GrowTreeEnsembleOp : public OpKernel {
+ public:
+  explicit GrowTreeEnsembleOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    // Read number of handlers, note that this is the static number of
+    // all handlers but any subset of these handlers may be active at a time.
+    OP_REQUIRES_OK(context, context->GetAttr("num_handlers", &num_handlers_));
+
+    OP_REQUIRES_OK(context, context->GetAttr("center_bias", &center_bias_));
+
+    // Read learner config.
+    string serialized_learner_config;
+    OP_REQUIRES_OK(context, context->GetAttr("learner_config",
+                                             &serialized_learner_config));
+    OP_REQUIRES(context,
+                learner_config_.ParseFromString(serialized_learner_config),
+                errors::InvalidArgument("Unable to parse learner config."));
+
+    // Determine whether dropout was used when building this tree.
+    if (learner_config_.has_learning_rate_tuner() &&
+        learner_config_.learning_rate_tuner().tuner_case() ==
+            LearningRateConfig::kDropout) {
+      dropout_config_ = learner_config_.learning_rate_tuner().dropout();
+      dropout_was_applied_ = true;
+    } else {
+      dropout_was_applied_ = false;
+    }
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    // Get decision tree ensemble.
+    boosted_trees::models::DecisionTreeEnsembleResource*
+        decision_tree_ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_ensemble_resource));
+    core::ScopedUnref unref_me(decision_tree_ensemble_resource);
+    mutex_lock l(*decision_tree_ensemble_resource->get_mutex());
+
+    // Get the stamp token.
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input("stamp_token", &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    // Only the Chief should run this Op and it is guaranteed to be in
+    // a consistent state so the stamps must always match.
+    CHECK(decision_tree_ensemble_resource->is_stamp_valid(stamp_token));
+
+    // Get the next stamp token.
+    const Tensor* next_stamp_token_t;
+    OP_REQUIRES_OK(context,
+                   context->input("next_stamp_token", &next_stamp_token_t));
+    int64 next_stamp_token = next_stamp_token_t->scalar<int64>()();
+    CHECK(stamp_token != next_stamp_token);
+
+    // Update the ensemble stamp regardless of whether a layer
+    // or tree is actually grown.
+    decision_tree_ensemble_resource->set_stamp(next_stamp_token);
+
+    // Read the learning_rate.
+    const Tensor* learning_rate_t;
+    OP_REQUIRES_OK(context, context->input("learning_rate", &learning_rate_t));
+    float learning_rate = learning_rate_t->scalar<float>()();
+
+    // Read seed that was used for dropout.
+    const Tensor* seed_t;
+    OP_REQUIRES_OK(context, context->input("dropout_seed", &seed_t));
+    // Cast seed to uint64.
+    const uint64 dropout_seed = seed_t->scalar<int64>()();
+
+    // Read partition Ids, gains and split candidates.
+    OpInputList partition_ids_list;
+    OpInputList gains_list;
+    OpInputList splits_list;
+    OP_REQUIRES_OK(context,
+                   context->input_list("partition_ids", &partition_ids_list));
+    OP_REQUIRES_OK(context, context->input_list("gains", &gains_list));
+    OP_REQUIRES_OK(context, context->input_list("splits", &splits_list));
+
+    boosted_trees::trees::DecisionTreeEnsembleConfig* ensemble_config =
+        decision_tree_ensemble_resource->mutable_decision_tree_ensemble();
+    ensemble_config->mutable_growing_metadata()->set_num_layers_attempted(
+        ensemble_config->growing_metadata().num_layers_attempted() + 1);
+    const int num_trees = ensemble_config->trees_size();
+    if (num_trees <= 0 ||
+        ensemble_config->tree_metadata(num_trees - 1).is_finalized()) {
+      ensemble_config->mutable_growing_metadata()->set_num_trees_attempted(
+          ensemble_config->growing_metadata().num_trees_attempted() + 1);
+    }
+
+    // Find best splits for each active partition.
+    std::map<int32, SplitCandidate> best_splits;
+    FindBestSplitsPerPartition(context, partition_ids_list, gains_list,
+                               splits_list, &best_splits);
+
+    // No-op if no new splits can be considered.
+    if (best_splits.empty()) {
+      LOG(WARNING) << "Not growing tree ensemble as no good splits were found.";
+      return;
+    }
+
+    // Update and retrieve the growable tree with its metadata.
+    boosted_trees::trees::DecisionTreeConfig* tree_config;
+    boosted_trees::trees::DecisionTreeMetadata* tree_metadata;
+
+    // Updates the tree. If the tree is fully built and dropout was applied, it
+    // also adjusts the weights of dropped and the last tree.
+    std::tie(tree_config, tree_metadata) = UpdateAndRetrieveGrowableTree(
+        decision_tree_ensemble_resource, learning_rate, dropout_seed);
+
+    // Split tree nodes.
+    for (auto& split_entry : best_splits) {
+      SplitTreeNode(split_entry.first, &split_entry.second, tree_config);
+    }
+
+    // Post-prune finalized tree if needed.
+    if (learner_config_.pruning_mode() ==
+            boosted_trees::learner::LearnerConfig::POST_PRUNE &&
+        tree_metadata->is_finalized()) {
+      VLOG(2) << "Post-pruning finalized tree.";
+      PruneTree(tree_config);
+
+      // If after post-pruning the whole tree has no gain, remove the tree
+      // altogether from the ensemble.
+      if (tree_config->nodes_size() <= 0) {
+        ensemble_config->mutable_trees()->RemoveLast();
+        ensemble_config->mutable_tree_weights()->RemoveLast();
+        ensemble_config->mutable_tree_metadata()->RemoveLast();
+      }
+    }
+  }
+
+ private:
+  // Helper method which effectively does a reduce over all split candidates
+  // and finds the best split for each partition.
+  void FindBestSplitsPerPartition(
+      OpKernelContext* const context, const OpInputList& partition_ids_list,
+      const OpInputList& gains_list, const OpInputList& splits_list,
+      std::map<int32, SplitCandidate>* best_splits) {
+    // Find best split per partition going through every feature candidate.
+    // TODO(salehay): Is this worth parallelizing?
+    for (int64 handler_id = 0; handler_id < num_handlers_; ++handler_id) {
+      const auto& partition_ids = partition_ids_list[handler_id].vec<int32>();
+      const auto& gains = gains_list[handler_id].vec<float>();
+      const auto& splits = splits_list[handler_id].vec<string>();
+      OP_REQUIRES(context, partition_ids.size() == gains.size(),
+                  errors::InvalidArgument(
+                      "Inconsistent partition Ids and gains tensors: ",
+                      partition_ids.size(), " != ", gains.size()));
+      OP_REQUIRES(context, partition_ids.size() == splits.size(),
+                  errors::InvalidArgument(
+                      "Inconsistent partition Ids and splits tensors: ",
+                      partition_ids.size(), " != ", splits.size()));
+      for (size_t candidate_idx = 0; candidate_idx < splits.size();
+           ++candidate_idx) {
+        // Get current split candidate.
+        const auto& partition_id = partition_ids(candidate_idx);
+        const auto& gain = gains(candidate_idx);
+        const auto& serialized_split = splits(candidate_idx);
+        SplitCandidate split;
+        split.handler_id = handler_id;
+        split.gain = gain;
+        OP_REQUIRES(context, split.split_info.ParseFromString(serialized_split),
+                    errors::InvalidArgument("Unable to parse split info."));
+
+        // Update best split for partition based on the current candidate.
+        UpdateBestSplit(learner_config_, partition_id, &split, best_splits);
+      }
+    }
+  }
+
+  void UpdateTreeWeightsIfDropout(
+      boosted_trees::trees::DecisionTreeEnsembleConfig* ensemble_config,
+      boosted_trees::trees::DecisionTreeMetadata* tree_metadata,
+      const uint64 dropout_seed) {
+    // It is possible that the tree was built with dropout. If it is the case,
+    // we need to adjust the tree weight.
+    if (dropout_was_applied_ && tree_metadata->is_finalized()) {
+      const int32 num_trees = ensemble_config->trees_size();
+
+      std::vector<int32> dropped_trees;
+      // Since only chief builds the trees, we are sure that the other tree
+      // weights didn't change.
+      std::vector<float> weights;
+      weights.reserve(num_trees);
+      std::vector<int32> num_updates;
+      num_updates.reserve(num_trees);
+      for (int i = 0; i < num_trees; ++i) {
+        weights.push_back(ensemble_config->tree_weights(i));
+        num_updates.push_back(
+            ensemble_config->tree_metadata(i).num_tree_weight_updates());
+      }
+
+      std::vector<float> dropped_trees_weights;
+      // Based on seed, figure out what trees were dropped before.
+      std::unordered_set<int32> trees_not_to_drop;
+      if (center_bias_) {
+        trees_not_to_drop.insert(0);
+      }
+      // Last tree is the current tree that is built.
+      const int32 current_tree = num_trees - 1;
+      trees_not_to_drop.insert(current_tree);
+
+      const auto dropout_status = DropoutUtils::DropOutTrees(
+          dropout_seed, dropout_config_, trees_not_to_drop, weights,
+          &dropped_trees, &dropped_trees_weights);
+      CHECK(dropout_status.ok())
+          << "Can't figure out what trees were dropped out before, error is "
+          << dropout_status.error_message();
+
+      // Now we have dropped trees, update their weights and the current tree
+      // weight.
+      if (!dropped_trees.empty()) {
+        DropoutUtils::GetTreesWeightsForAddingTrees(
+            dropped_trees, dropped_trees_weights, current_tree,
+            1 /* only 1 tree was added */, &weights, &num_updates);
+
+        // Update the weights and num of updates for trees.
+        for (int i = 0; i < num_trees; ++i) {
+          ensemble_config->set_tree_weights(i, weights[i]);
+          ensemble_config->mutable_tree_metadata(i)
+              ->set_num_tree_weight_updates(num_updates[i]);
+        }
+      }
+    }
+  }
+
+  // Helper method to update and retrieve the growable tree which is by
+  // definition the last tree in the ensemble.
+  std::pair<boosted_trees::trees::DecisionTreeConfig*,
+            boosted_trees::trees::DecisionTreeMetadata*>
+  UpdateAndRetrieveGrowableTree(
+      boosted_trees::models::DecisionTreeEnsembleResource*
+          decision_tree_ensemble_resource,
+      float learning_rate, const uint64 dropout_seed) {
+    boosted_trees::trees::DecisionTreeEnsembleConfig* ensemble_config =
+        decision_tree_ensemble_resource->mutable_decision_tree_ensemble();
+    auto num_trees = ensemble_config->trees_size();
+    CHECK(num_trees == ensemble_config->tree_metadata_size() &&
+          num_trees == ensemble_config->tree_weights_size());
+    if (num_trees <= 0 ||
+        ensemble_config->tree_metadata(num_trees - 1).is_finalized()) {
+      // Create a new tree with a no-op leaf.
+      boosted_trees::trees::DecisionTreeConfig* tree_config =
+          ensemble_config->add_trees();
+      ++num_trees;
+      VLOG(1) << "Adding layer 0 to tree " << num_trees - 1
+              << " of ensemble of " << num_trees << " trees.";
+      tree_config->add_nodes()->mutable_leaf();
+      ensemble_config->add_tree_weights(learning_rate);
+      boosted_trees::trees::DecisionTreeMetadata* tree_metadata =
+          ensemble_config->add_tree_metadata();
+      tree_metadata->set_num_layers_grown(1);
+      tree_metadata->set_is_finalized(
+          learner_config_.constraints().max_tree_depth() <= 1);
+      tree_metadata->set_num_tree_weight_updates(1);
+
+      UpdateTreeWeightsIfDropout(ensemble_config, tree_metadata, dropout_seed);
+      return std::make_pair(tree_config, tree_metadata);
+    } else {
+      // The growable tree is by definition the last tree in the ensemble.
+      boosted_trees::trees::DecisionTreeMetadata* tree_metadata =
+          ensemble_config->mutable_tree_metadata(num_trees - 1);
+      auto num_layers_grown = tree_metadata->num_layers_grown();
+      VLOG(1) << "Adding layer " << num_layers_grown << " to tree "
+              << num_trees - 1 << " of ensemble of " << num_trees << " trees.";
+      // Update growable tree metadata.
+      ++num_layers_grown;
+      tree_metadata->set_num_layers_grown(num_layers_grown);
+      tree_metadata->set_is_finalized(
+          num_layers_grown >= learner_config_.constraints().max_tree_depth());
+      auto* tree_config = ensemble_config->mutable_trees(num_trees - 1);
+
+      UpdateTreeWeightsIfDropout(ensemble_config, tree_metadata, dropout_seed);
+
+      return std::make_pair(tree_config, tree_metadata);
+    }
+  }
+
+  // Helper method to merge leaf weights as the tree is being grown.
+  boosted_trees::trees::Leaf* MergeLeafWeights(
+      const boosted_trees::trees::Leaf& source,
+      boosted_trees::trees::Leaf* dest) {
+    // Resolve leaf merging method based on how the trees are being grown.
+    if (learner_config_.growing_mode() ==
+        boosted_trees::learner::LearnerConfig::WHOLE_TREE) {
+      // No merging occurs when building a whole tree at a time.
+      return dest;
+    }
+
+    // Handle leaf merging based on type.
+    switch (source.leaf_case()) {
+      case boosted_trees::trees::Leaf::kVector: {
+        // No-op if source is empty
+        const auto& src_vec = source.vector();
+        if (src_vec.value_size() == 0) {
+          break;
+        }
+        CHECK(source.leaf_case() == dest->leaf_case());
+
+        // Dense add leaf vectors.
+        auto* dst_vec = dest->mutable_vector();
+        CHECK(src_vec.value_size() == dst_vec->value_size());
+        for (size_t idx = 0; idx < source.vector().value_size(); ++idx) {
+          (*dst_vec->mutable_value()->Mutable(idx)) += src_vec.value(idx);
+        }
+        break;
+      }
+      case boosted_trees::trees::Leaf::kSparseVector: {
+        // No-op if source is empty
+        const auto& src_vec = source.sparse_vector();
+        CHECK(src_vec.value_size() == src_vec.index_size());
+        if (src_vec.value_size() == 0) {
+          break;
+        }
+        CHECK(source.leaf_case() == dest->leaf_case());
+
+        // Get mapping of dimension to value for destination.
+        std::unordered_map<int32, float> dst_map;
+        auto* dst_vec = dest->mutable_sparse_vector();
+        CHECK(dst_vec->value_size() == dst_vec->index_size());
+        dst_map.reserve(dst_vec->value_size());
+        for (size_t idx = 0; idx < dst_vec->value_size(); ++idx) {
+          dst_map[dst_vec->index(idx)] = dst_vec->value(idx);
+        }
+        // Sparse add source vector to destination vector.
+        for (size_t idx = 0; idx < src_vec.value_size(); ++idx) {
+          dst_map[src_vec.index(idx)] += src_vec.value(idx);
+        }
+        // Rebuild merged destination leaf.
+        dst_vec->clear_index();
+        dst_vec->clear_value();
+        for (const auto& entry : dst_map) {
+          dst_vec->add_index(entry.first);
+          dst_vec->add_value(entry.second);
+        }
+        break;
+      }
+      case boosted_trees::trees::Leaf::LEAF_NOT_SET: {
+        // No-op as there is nothing to merge.
+        break;
+      }
+    }
+    return dest;
+  }
+
+  // Helper method to split a tree node and append its respective
+  // leaf children given the split candidate.
+  void SplitTreeNode(const int32 node_id, SplitCandidate* split,
+                     boosted_trees::trees::DecisionTreeConfig* tree_config) {
+    // No-op if we have no real node.
+    CHECK(node_id < tree_config->nodes_size())
+        << "Invalid node " << node_id << " to split.";
+    // Ensure new split node is valid.
+    CHECK(split->split_info.split_node().node_case() != TreeNode::NODE_NOT_SET);
+    CHECK(tree_config->nodes(node_id).node_case() == TreeNode::kLeaf)
+        << "Unexpected node type to split "
+        << tree_config->nodes(node_id).node_case();
+
+    // Add left leaf.
+    int32 left_id = tree_config->nodes_size();
+    (*tree_config->add_nodes()->mutable_leaf()) =
+        *MergeLeafWeights(tree_config->nodes(node_id).leaf(),
+                          split->split_info.mutable_left_child());
+
+    // Add right leaf.
+    int32 right_id = tree_config->nodes_size();
+    (*tree_config->add_nodes()->mutable_leaf()) =
+        *MergeLeafWeights(tree_config->nodes(node_id).leaf(),
+                          split->split_info.mutable_right_child());
+
+    // Link children and add them as new roots.
+    boosted_trees::trees::DecisionTree::LinkChildren(
+        {left_id, right_id}, split->split_info.mutable_split_node());
+
+    // Add split gain and, if needed, original leaf to node metadata.
+    TreeNodeMetadata* node_metadata =
+        split->split_info.mutable_split_node()->mutable_node_metadata();
+    node_metadata->set_gain(split->gain);
+    if (learner_config_.pruning_mode() ==
+        boosted_trees::learner::LearnerConfig::POST_PRUNE) {
+      (*node_metadata->mutable_original_leaf()) =
+          *tree_config->mutable_nodes(node_id)->mutable_leaf();
+    }
+
+    // Replace node in tree.
+    (*tree_config->mutable_nodes(node_id)) =
+        *split->split_info.mutable_split_node();
+  }
+
+  void PruneTree(boosted_trees::trees::DecisionTreeConfig* tree_config) {
+    // No-op if tree is empty.
+    if (tree_config->nodes_size() <= 0) {
+      return;
+    }
+
+    // Copy nodes to temp vector and clear original tree.
+    std::vector<TreeNode> tree_nodes;
+    tree_nodes.reserve(tree_config->nodes_size());
+    for (auto& node : (*tree_config->mutable_nodes())) {
+      tree_nodes.push_back(node);
+      node.Clear();
+    }
+    tree_config->clear_nodes();
+
+    // Prune the tree recursively starting from the root.
+    RecursivePruneTree(0, &tree_nodes);
+
+    // Rebuild compacted tree.
+    (*tree_config->add_nodes()) = tree_nodes[0];
+    std::unordered_map<size_t, size_t> nodes_map;
+    nodes_map[0] = 0;
+    for (size_t node_idx = 0; node_idx < tree_nodes.size(); ++node_idx) {
+      // Skip pruned nodes.
+      auto& original_node = tree_nodes[node_idx];
+      if (original_node.node_case() == TreeNode::NODE_NOT_SET) {
+        continue;
+      }
+
+      // Find node mapped in tree ensemble.
+      auto mapped_node_it = nodes_map.find(node_idx);
+      CHECK(mapped_node_it != nodes_map.end());
+      auto& mapped_node = (*tree_config->mutable_nodes(mapped_node_it->second));
+
+      // Get node children
+      auto children =
+          boosted_trees::trees::DecisionTree::GetChildren(original_node);
+      for (int32& child_idx : children) {
+        auto new_idx = tree_config->nodes_size();
+        (*tree_config->add_nodes()) = tree_nodes[child_idx];
+        nodes_map[child_idx] = new_idx;
+        child_idx = new_idx;
+      }
+      boosted_trees::trees::DecisionTree::LinkChildren(children, &mapped_node);
+    }
+
+    // Check if there are any nodes with gain left.
+    if (tree_config->nodes_size() == 1 &&
+        tree_config->nodes(0).node_metadata().gain() <= 0) {
+      // The whole tree should be pruned.
+      VLOG(2) << "No useful nodes left after post-pruning tree.";
+      tree_config->clear_nodes();
+    }
+  }
+
+ private:
+  boosted_trees::learner::LearnerConfig learner_config_;
+  int64 num_handlers_;
+  LearningRateDropoutDrivenConfig dropout_config_;
+  bool dropout_was_applied_;
+  bool center_bias_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("GrowTreeEnsemble").Device(DEVICE_CPU),
+                        GrowTreeEnsembleOp);
+
+class TreeEnsembleStatsOp : public OpKernel {
+ public:
+  explicit TreeEnsembleStatsOp(OpKernelConstruction* const context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* const context) override {
+    // Get decision tree ensemble.
+    boosted_trees::models::DecisionTreeEnsembleResource*
+        decision_tree_ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_ensemble_resource));
+    core::ScopedUnref unref_me(decision_tree_ensemble_resource);
+    mutex_lock l(*decision_tree_ensemble_resource->get_mutex());
+
+    // Get the stamp token.
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input("stamp_token", &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    // Only the Chief should run this Op and it is guaranteed to be in
+    // a consistent state so the stamps must always match.
+    CHECK(decision_tree_ensemble_resource->is_stamp_valid(stamp_token));
+    const boosted_trees::trees::DecisionTreeEnsembleConfig& ensemble_config =
+        decision_tree_ensemble_resource->decision_tree_ensemble();
+
+    // Set tree stats.
+    Tensor* num_trees_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                "num_trees", TensorShape({}), &num_trees_t));
+    Tensor* active_tree_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("active_tree", TensorShape({}),
+                                            &active_tree_t));
+    Tensor* attempted_tree_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("attempted_trees", TensorShape({}),
+                                            &attempted_tree_t));
+
+    int num_trees = ensemble_config.trees_size();
+    active_tree_t->scalar<int64>()() = num_trees;
+    if (num_trees > 0 &&
+        !ensemble_config.tree_metadata(num_trees - 1).is_finalized()) {
+      --num_trees;
+    }
+    num_trees_t->scalar<int64>()() = num_trees;
+    attempted_tree_t->scalar<int64>()() =
+        ensemble_config.growing_metadata().num_trees_attempted();
+
+    // Set layer stats.
+    Tensor* num_layers_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                "num_layers", TensorShape({}), &num_layers_t));
+    Tensor* active_layer_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("active_layer", TensorShape({}),
+                                            &active_layer_t));
+    Tensor* attempted_layers_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("attempted_layers", TensorShape({}),
+                                            &attempted_layers_t));
+
+    int64 num_layers = 0;
+    for (const auto& tree_metadata : ensemble_config.tree_metadata()) {
+      num_layers += tree_metadata.num_layers_grown();
+    }
+    num_layers_t->scalar<int64>()() = num_layers;
+    int tree_metadata_size = ensemble_config.tree_metadata_size();
+    active_layer_t->scalar<int64>()() =
+        tree_metadata_size > 0
+            ? ensemble_config.tree_metadata(tree_metadata_size - 1)
+                  .num_layers_grown()
+            : 0;
+    attempted_layers_t->scalar<int64>()() =
+        ensemble_config.growing_metadata().num_layers_attempted();
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("TreeEnsembleStats").Device(DEVICE_CPU),
+                        TreeEnsembleStatsOp);
+
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/BUILD b/tensorflow/contrib/boosted_trees/lib/BUILD
index 011c02d720f7ab01de72039d4b4603194c7aa9ee..fad56cfeafd5f09062e15630ff1b9d1b7c0ec142 100644
--- a/tensorflow/contrib/boosted_trees/lib/BUILD
+++ b/tensorflow/contrib/boosted_trees/lib/BUILD
@@ -11,6 +11,8 @@ package(
     ],
 )
 
+load("//tensorflow:tensorflow.bzl", "py_test")
+
 filegroup(
     name = "all_files",
     srcs = glob(
@@ -22,60 +24,7 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
-cc_library(
-    name = "weighted_quantiles",
-    srcs = [],
-    hdrs = [
-        "quantiles/weighted_quantiles_buffer.h",
-        "quantiles/weighted_quantiles_stream.h",
-        "quantiles/weighted_quantiles_summary.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/core:framework_headers_lib",
-    ],
-)
-
-cc_test(
-    name = "weighted_quantiles_buffer_test",
-    size = "small",
-    srcs = ["quantiles/weighted_quantiles_buffer_test.cc"],
-    deps = [
-        ":weighted_quantiles",
-        "//tensorflow/core",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ],
-)
-
-cc_test(
-    name = "weighted_quantiles_summary_test",
-    size = "small",
-    srcs = ["quantiles/weighted_quantiles_summary_test.cc"],
-    deps = [
-        ":weighted_quantiles",
-        "//tensorflow/core",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ],
-)
-
-cc_test(
-    name = "weighted_quantiles_stream_test",
-    size = "small",
-    srcs = ["quantiles/weighted_quantiles_stream_test.cc"],
-    deps = [
-        ":weighted_quantiles",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ],
-)
+# Utils
 
 cc_library(
     name = "utils",
@@ -102,7 +51,6 @@ cc_library(
     deps = [
         "//tensorflow/contrib/boosted_trees/proto:learner_proto_cc",
         "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/core:protos_all_cc",
         "//third_party/eigen3",
     ],
 )
@@ -113,7 +61,6 @@ cc_test(
     srcs = ["utils/sparse_column_iterable_test.cc"],
     deps = [
         ":utils",
-        "//tensorflow/core:lib",
         "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -126,7 +73,6 @@ cc_test(
     srcs = ["utils/examples_iterable_test.cc"],
     deps = [
         ":utils",
-        "//tensorflow/core:lib",
         "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -152,7 +98,6 @@ cc_test(
     srcs = ["utils/dropout_utils_test.cc"],
     deps = [
         ":utils",
-        "//tensorflow/contrib/boosted_trees/proto:learner_proto_cc",
         "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
         "//tensorflow/core:lib",
         "//tensorflow/core:tensor_testutil",
@@ -161,6 +106,8 @@ cc_test(
     ],
 )
 
+# Models
+
 cc_library(
     name = "models",
     srcs = ["models/multiple_additive_trees.cc"],
@@ -190,12 +137,104 @@ cc_test(
     ],
 )
 
+# Testutil
+
+cc_library(
+    name = "batch_features_testutil",
+    testonly = 1,
+    srcs = ["testutil/batch_features_testutil.cc"],
+    hdrs = ["testutil/batch_features_testutil.h"],
+    deps = [
+        ":utils",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+cc_library(
+    name = "random_tree_gen",
+    srcs = ["testutil/random_tree_gen.cc"],
+    hdrs = ["testutil/random_tree_gen.h"],
+    deps = [
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_binary(
+    name = "random_tree_gen_main",
+    srcs = ["testutil/random_tree_gen_main.cc"],
+    deps = [
+        ":random_tree_gen",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+    ],
+)
+
+# Quantiles
+
+cc_library(
+    name = "weighted_quantiles",
+    srcs = [],
+    hdrs = [
+        "quantiles/weighted_quantiles_buffer.h",
+        "quantiles/weighted_quantiles_stream.h",
+        "quantiles/weighted_quantiles_summary.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
+cc_test(
+    name = "weighted_quantiles_buffer_test",
+    size = "small",
+    srcs = ["quantiles/weighted_quantiles_buffer_test.cc"],
+    deps = [
+        ":weighted_quantiles",
+        "//tensorflow/core",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_test(
+    name = "weighted_quantiles_summary_test",
+    size = "small",
+    srcs = ["quantiles/weighted_quantiles_summary_test.cc"],
+    deps = [
+        ":weighted_quantiles",
+        "//tensorflow/core",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_test(
+    name = "weighted_quantiles_stream_test",
+    size = "small",
+    srcs = ["quantiles/weighted_quantiles_stream_test.cc"],
+    deps = [
+        ":weighted_quantiles",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+# Trees
+
 cc_library(
     name = "trees",
     srcs = ["trees/decision_tree.cc"],
     hdrs = ["trees/decision_tree.h"],
     deps = [
-        ":utils",
+        "//tensorflow/contrib/boosted_trees/lib:utils",
         "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
         "//tensorflow/core:framework_headers_lib",
     ],
@@ -207,43 +246,214 @@ cc_test(
     srcs = ["trees/decision_tree_test.cc"],
     deps = [
         ":trees",
-        ":utils",
+        "//tensorflow/contrib/boosted_trees/lib:utils",
         "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
 )
 
+# Learner/batch
+
+py_library(
+    name = "base_split_handler",
+    srcs = ["learner/batch/base_split_handler.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+    ],
+)
+
+py_library(
+    name = "categorical_split_handler",
+    srcs = ["learner/batch/categorical_split_handler.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base_split_handler",
+        "//tensorflow/contrib/boosted_trees:quantile_ops_py",
+        "//tensorflow/contrib/boosted_trees:split_handler_ops_py",
+        "//tensorflow/contrib/boosted_trees:stats_accumulator_ops_py",
+    ],
+)
+
+py_test(
+    name = "categorical_split_handler_test",
+    srcs = ["learner/batch/categorical_split_handler_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":categorical_split_handler",
+        "//tensorflow/contrib/boosted_trees/proto:learner_proto_py",
+        "//tensorflow/contrib/boosted_trees/proto:split_info_proto_py",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+py_library(
+    name = "ordinal_split_handler",
+    srcs = ["learner/batch/ordinal_split_handler.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base_split_handler",
+        "//tensorflow/contrib/boosted_trees:quantile_ops_py",
+        "//tensorflow/contrib/boosted_trees:split_handler_ops_py",
+        "//tensorflow/contrib/boosted_trees:stats_accumulator_ops_py",
+        "//tensorflow/contrib/boosted_trees/proto:quantiles_proto_py",
+    ],
+)
+
+py_test(
+    name = "ordinal_split_handler_test",
+    srcs = ["learner/batch/ordinal_split_handler_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ordinal_split_handler",
+        "//tensorflow/contrib/boosted_trees/proto:learner_proto_py",
+        "//tensorflow/contrib/boosted_trees/proto:split_info_proto_py",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+# Learner/Common
+
 cc_library(
-    name = "batch_features_testutil",
-    testonly = 1,
-    srcs = ["testutil/batch_features_testutil.cc"],
-    hdrs = ["testutil/batch_features_testutil.h"],
+    name = "class-partition-key",
+    hdrs = ["learner/common/accumulators/class-partition-key.h"],
     deps = [
-        ":utils",
         "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
+cc_library(
+    name = "feature-stats-accumulator",
+    hdrs = ["learner/common/accumulators/feature-stats-accumulator.h"],
+    deps = [
+        ":class-partition-key",
+    ],
+)
+
+cc_test(
+    name = "feature-stats-accumulator_test",
+    size = "small",
+    srcs = ["learner/common/accumulators/feature-stats-accumulator_test.cc"],
+    deps = [
+        ":feature-stats-accumulator",
         "//tensorflow/core:lib",
+        "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
-        "//tensorflow/core:testlib",
+        "//tensorflow/core:test_main",
     ],
 )
 
 cc_library(
-    name = "random_tree_gen",
-    srcs = ["testutil/random_tree_gen.cc"],
-    hdrs = ["testutil/random_tree_gen.h"],
+    name = "example_partitioner",
+    srcs = ["learner/common/partitioners/example_partitioner.cc"],
+    hdrs = ["learner/common/partitioners/example_partitioner.h"],
     deps = [
+        "//tensorflow/contrib/boosted_trees/lib:trees",
+        "//tensorflow/contrib/boosted_trees/lib:utils",
+        "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
+cc_test(
+    name = "example_partitioner_test",
+    size = "small",
+    srcs = ["learner/common/partitioners/example_partitioner_test.cc"],
+    deps = [
+        ":example_partitioner",
+        "//tensorflow/core:tensor_testutil",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+# Learner/stochastic
+
+cc_library(
+    name = "feature-column-handlers",
+    srcs = [
+        "learner/stochastic/handlers/bias-feature-column-handler.cc",
+        "learner/stochastic/handlers/categorical-feature-column-handler.cc",
+        "learner/stochastic/handlers/dense-quantized-feature-column-handler.cc",
+        "learner/stochastic/handlers/sparse-quantized-feature-column-handler.cc",
+    ],
+    hdrs = [
+        "learner/stochastic/handlers/bias-feature-column-handler.h",
+        "learner/stochastic/handlers/categorical-feature-column-handler.h",
+        "learner/stochastic/handlers/dense-quantized-feature-column-handler.h",
+        "learner/stochastic/handlers/feature-column-handler.h",
+        "learner/stochastic/handlers/sparse-quantized-feature-column-handler.h",
+    ],
+    deps = [
+        ":feature-split-candidate",
+        ":feature-stats-accumulator",
+        "//tensorflow/contrib/boosted_trees/proto:learner_proto_cc",
+        "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
+cc_test(
+    name = "feature-column-handlers_test",
+    size = "small",
+    srcs = [
+        "learner/stochastic/handlers/bias-feature-column-handler_test.cc",
+        "learner/stochastic/handlers/categorical-feature-column-handler_test.cc",
+        "learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc",
+        "learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc",
+    ],
+    deps = [
+        ":feature-column-handlers",
+        "//tensorflow/core:tensor_testutil",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "gradient-stats",
+    hdrs = ["learner/stochastic/stats/gradient-stats.h"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+    ],
+)
+
+cc_library(
+    name = "node-stats",
+    hdrs = ["learner/stochastic/stats/node-stats.h"],
+    deps = [
+        ":gradient-stats",
+        "//tensorflow/contrib/boosted_trees/proto:learner_proto_cc",
         "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
-        "//tensorflow/core:lib",
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
     ],
 )
 
-cc_binary(
-    name = "random_tree_gen_main",
-    srcs = ["testutil/random_tree_gen_main.cc"],
+cc_library(
+    name = "split-stats",
+    hdrs = ["learner/stochastic/stats/split-stats.h"],
     deps = [
-        ":random_tree_gen",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
+        ":node-stats",
+    ],
+)
+
+cc_library(
+    name = "feature-split-candidate",
+    hdrs = ["learner/stochastic/stats/feature-split-candidate.h"],
+    deps = [
+        ":split-stats",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
+    ],
+)
+
+cc_test(
+    name = "node-stats_test",
+    size = "small",
+    srcs = ["learner/stochastic/stats/node-stats_test.cc"],
+    deps = [
+        ":node-stats",
+        "//tensorflow/core:tensor_testutil",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
     ],
 )
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/base_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/base_split_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..56ff00b39062d57c813633c98c765e077dd4c262
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/base_split_handler.py
@@ -0,0 +1,147 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Base class for creating split nodes using one or more features."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+from tensorflow.contrib.boosted_trees.python.ops import batch_ops_utils
+from tensorflow.python.ops import control_flow_ops
+
+
+class BaseSplitHandler(object):
+  """Abstract Base class defining split handlers interface."""
+
+  __metaclass__ = abc.ABCMeta
+
+  def __init__(self,
+               l1_regularization,
+               l2_regularization,
+               tree_complexity_regularization,
+               min_node_weight,
+               feature_column_group_id,
+               gradient_shape,
+               hessian_shape,
+               multiclass_strategy,
+               name=None):
+    """Constructor for BaseSplitHandler.
+
+    Args:
+      l1_regularization: L1 regularization applied for this split handler.
+      l2_regularization: L2 regularization applied for this split handler.
+      tree_complexity_regularization: Tree complexity regularization applied
+          for this split handler.
+      min_node_weight: Minimum sum of weights of examples in each partition to
+          be considered for splitting.
+      feature_column_group_id: Feature column group index.
+      gradient_shape: A TensorShape, containing shape of gradients.
+      hessian_shape: A TensorShape, containing shape of hessians.
+      multiclass_strategy: Strategy describing how to treat multiclass problems.
+      name: An optional handler name.
+    """
+    self._l1_regularization = l1_regularization
+    self._l2_regularization = l2_regularization
+    self._tree_complexity_regularization = tree_complexity_regularization
+    self._min_node_weight = min_node_weight
+    self._feature_column_group_id = feature_column_group_id
+    self._name = name or ""
+    self._multiclass_strategy = multiclass_strategy
+    self._hessian_shape = hessian_shape
+    self._gradient_shape = gradient_shape
+
+  def scheduled_reads(self):
+    """Returns the list of `ScheduledOp`s required for update_stats."""
+    return []
+
+  @abc.abstractmethod
+  def update_stats(self, stamp_token, example_partition_ids, gradients,
+                   hessians, empty_gradients, empty_hessians, weights,
+                   is_active, scheduled_reads):
+    """Updates the state for this split handler.
+
+    Args:
+      stamp_token: An int32 scalar tensor containing the current stamp token.
+      example_partition_ids: A dense tensor, containing an int32 for each
+        example which is the partition id that the example ends up in.
+      gradients: A dense tensor of gradients.
+      hessians: A dense tensor of hessians.
+      empty_gradients: A dense empty tensor of the same shape (for dimensions >
+        0) as gradients.
+      empty_hessians: A dense empty tensor of the same shape (for dimensions >
+        0) as hessians.
+      weights: A dense float32 tensor with a weight for each example.
+      is_active: A boolean tensor that says if this handler is active or not.
+          One value for the current layer and one value for the next layer.
+      scheduled_reads: List of results from the scheduled reads.
+
+    Returns:
+      A tuple of the op that updates the stats for this handler and a list of
+      `ScheduledOp`s.
+    """
+
+  def update_stats_sync(self, stamp_token, example_partition_ids, gradients,
+                        hessians, empty_gradients, empty_hessians, weights,
+                        is_active):
+    """Updates the state for this split handler running the scheduled I/O.
+
+    Args:
+      stamp_token: An int32 scalar tensor containing the current stamp token.
+      example_partition_ids: A dense tensor, containing an int32 for each
+        example which is the partition id that the example ends up in.
+      gradients: A dense tensor of gradients.
+      hessians: A dense tensor of hessians.
+      empty_gradients: A dense empty tensor of the same shape (for dimensions >
+        0) as gradients.
+      empty_hessians: A dense empty tensor of the same shape (for dimensions >
+        0) as hessians.
+      weights: A dense float32 tensor with a weight for each example.
+      is_active: A boolean tensor that says if this handler is active or not.
+          One value for the current layer and one value for the next layer.
+
+    Returns:
+      Op that updates the stats for this handler.
+    """
+    handler_reads = {self: self.scheduled_reads()}
+    handler_results = batch_ops_utils.run_handler_scheduled_ops(
+        handler_reads, stamp_token, None)
+    update_1, scheduled_updates = self.update_stats(
+        stamp_token, example_partition_ids, gradients, hessians,
+        empty_gradients, empty_hessians, weights, is_active,
+        handler_results[self])
+    update_2 = batch_ops_utils.run_handler_scheduled_ops({
+        self: scheduled_updates
+    }, stamp_token, None)
+    return control_flow_ops.group(update_1, *update_2[self])
+
+  @abc.abstractmethod
+  def make_splits(self, stamp_token, next_stamp_token, class_id):
+    """Create the best split using the accumulated stats and flush the state.
+
+    This should only be called by the master.
+
+    Args:
+      stamp_token: An int32 scalar tensor containing the current stamp token.
+      next_stamp_token: An int32 scalar tensor containing the stamp token for
+        the next iteration.
+      class_id: what class id the handler gathers stats for (for tree per class
+        strategy). When class_id=-1, the strategy is not tree per class.
+    Returns:
+      A tuple (are_splits_ready, partition_id, gain, split_info) where
+      are_splits_ready is a scalar boolean tensor, partition_id is a rank 1,
+      int32 tensor, gain is a rank 1 float32 tensor and split_info is a rank 1
+      string tensor containing serialized SplitInfo protos.
+    """
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..83dad7e4b3301327bcbae5203e9d9330c9e0084d
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py
@@ -0,0 +1,189 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of handler for split nodes for categorical columns."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.boosted_trees.lib.learner.batch import base_split_handler
+from tensorflow.contrib.boosted_trees.python.ops import split_handler_ops
+from tensorflow.contrib.boosted_trees.python.ops import stats_accumulator_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+
+_BIAS_FEATURE_ID = -1
+
+
+class EqualitySplitHandler(base_split_handler.BaseSplitHandler):
+  """Creates equality split type for categorical features."""
+
+  def __init__(self,
+               sparse_int_column,
+               l1_regularization,
+               l2_regularization,
+               tree_complexity_regularization,
+               min_node_weight,
+               feature_column_group_id,
+               gradient_shape,
+               hessian_shape,
+               multiclass_strategy,
+               init_stamp_token=0,
+               name=None):
+    """Initialize the internal state for this split handler.
+
+    Args:
+      sparse_int_column: A `SparseTensor` column with int64 values associated
+        with this handler.
+      l1_regularization: L1 regularization applied for this split handler.
+      l2_regularization: L2 regularization applied for this split handler.
+      tree_complexity_regularization: Tree complexity regularization applied
+          for this split handler.
+      min_node_weight: Minimum sum of weights of examples in each partition to
+          be considered for splitting.
+      feature_column_group_id: Feature column group index.
+      gradient_shape: A TensorShape, containing shape of gradients.
+      hessian_shape: A TensorShape, containing shape of hessians.
+      multiclass_strategy: Strategy describing how to treat multiclass problems.
+      init_stamp_token: A tensor containing an scalar for initial stamp of the
+         stamped objects.
+      name: An optional handler name.
+    """
+    super(EqualitySplitHandler, self).__init__(
+        l1_regularization=l1_regularization,
+        l2_regularization=l2_regularization,
+        tree_complexity_regularization=tree_complexity_regularization,
+        min_node_weight=min_node_weight,
+        feature_column_group_id=feature_column_group_id,
+        gradient_shape=gradient_shape,
+        hessian_shape=hessian_shape,
+        multiclass_strategy=multiclass_strategy,
+        name=name)
+    self._stats_accumulator = stats_accumulator_ops.StatsAccumulator(
+        init_stamp_token,
+        gradient_shape,
+        hessian_shape,
+        name="StatsAccumulator/{}".format(self._name))
+    self._sparse_int_column = sparse_int_column
+
+  def update_stats(self, stamp_token, example_partition_ids, gradients,
+                   hessians, empty_gradients, empty_hessians, weights,
+                   is_active, scheduled_reads):
+    """Updates the state for equality split handler.
+
+    Args:
+      stamp_token: An int32 scalar tensor containing the current stamp token.
+      example_partition_ids: A dense tensor, containing an int32 for each
+        example which is the partition id that the example ends up in.
+      gradients: A dense tensor of gradients.
+      hessians: A dense tensor of hessians.
+      empty_gradients: A dense empty tensor of the same shape (for dimensions >
+        0) as gradients.
+      empty_hessians: A dense empty tensor of the same shape (for dimensions >
+        0) as hessians.
+      weights: A dense float32 tensor with a weight for each example.
+      is_active: A boolean tensor that says if this handler is active or not.
+          One value for the current layer and one value for the next layer.
+      scheduled_reads: List of results from the scheduled reads.
+    Returns:
+      The op that updates the stats for this handler.
+    Raises:
+      ValueError: If example_columns is not a single sparse column.
+
+    """
+    del scheduled_reads  # Unused by the categorical split handler.
+
+    def not_active_inputs():
+      return (constant_op.constant([], dtype=dtypes.int32),
+              constant_op.constant([], dtype=dtypes.int64), empty_gradients,
+              empty_hessians)
+
+    def active_inputs():
+      """The normal flow when the handler is active."""
+      # Remove the second column of example indices matrix since it is not
+      # useful.
+      example_indices, _ = array_ops.split(
+          self._sparse_int_column.indices, num_or_size_splits=2, axis=1)
+      example_indices = array_ops.squeeze(example_indices, [1])
+
+      filtered_gradients = array_ops.gather(gradients, example_indices)
+      filtered_hessians = array_ops.gather(hessians, example_indices)
+      filtered_partition_ids = array_ops.gather(example_partition_ids,
+                                                example_indices)
+      unique_partitions, mapped_partitions = array_ops.unique(
+          example_partition_ids)
+
+      # Compute aggregate stats for each partition.
+      # The bias is computed on gradients and hessians (and not
+      # filtered_gradients) which have exactly one value per example, so we
+      # don't double count a gradient in multivalent columns.
+      per_partition_gradients = math_ops.unsorted_segment_sum(
+          gradients, mapped_partitions, array_ops.size(unique_partitions))
+      per_partition_hessians = math_ops.unsorted_segment_sum(
+          hessians, mapped_partitions, array_ops.size(unique_partitions))
+
+      # Prepend a bias feature per partition that accumulates the stats for all
+      # examples in that partition.
+      # Bias is added to the stats even if there are no examples with values in
+      # the current sparse column. The reason is that the other example batches
+      # might have values in these partitions so we have to keep the bias
+      # updated.
+      bias_feature_ids = array_ops.fill(
+          array_ops.shape(unique_partitions), _BIAS_FEATURE_ID)
+      bias_feature_ids = math_ops.cast(bias_feature_ids, dtypes.int64)
+      partition_ids = array_ops.concat(
+          [unique_partitions, filtered_partition_ids], 0)
+      filtered_gradients = array_ops.concat(
+          [per_partition_gradients, filtered_gradients], 0)
+      filtered_hessians = array_ops.concat(
+          [per_partition_hessians, filtered_hessians], 0)
+      feature_ids = array_ops.concat(
+          [bias_feature_ids, self._sparse_int_column.values], 0)
+      return partition_ids, feature_ids, filtered_gradients, filtered_hessians
+
+    partition_ids, feature_ids, gradients_out, hessians_out = (
+        control_flow_ops.cond(is_active[0], active_inputs, not_active_inputs))
+    result = self._stats_accumulator.schedule_add(partition_ids, feature_ids,
+                                                  gradients_out, hessians_out)
+    return (control_flow_ops.no_op(), [result])
+
+  def make_splits(self, stamp_token, next_stamp_token, class_id):
+    """Create the best split using the accumulated stats and flush the state."""
+    # Get the aggregated gradients and hessians per <partition_id, feature_id>
+    # pair.
+    num_minibatches, partition_ids, feature_ids, gradients, hessians = (
+        self._stats_accumulator.flush(stamp_token, next_stamp_token))
+    partition_ids, gains, split_infos = (
+        split_handler_ops.build_categorical_equality_splits(
+            num_minibatches=num_minibatches,
+            partition_ids=partition_ids,
+            feature_ids=feature_ids,
+            gradients=gradients,
+            hessians=hessians,
+            class_id=class_id,
+            feature_column_group_id=self._feature_column_group_id,
+            l1_regularization=self._l1_regularization,
+            l2_regularization=self._l2_regularization,
+            tree_complexity_regularization=self._tree_complexity_regularization,
+            min_node_weight=self._min_node_weight,
+            bias_feature_id=_BIAS_FEATURE_ID,
+            multiclass_strategy=self._multiclass_strategy,))
+    # There are no warm-up rounds needed in the equality column handler. So we
+    # always return ready.
+    are_splits_ready = constant_op.constant(True)
+    return (are_splits_ready, partition_ids, gains, split_infos)
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b65eba2a76273a81f1464ed7639f0c0760e0050
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py
@@ -0,0 +1,338 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test for checking stats accumulator related ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.boosted_trees.lib.learner.batch import categorical_split_handler
+from tensorflow.contrib.boosted_trees.proto import learner_pb2
+from tensorflow.contrib.boosted_trees.proto import split_info_pb2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import googletest
+
+
+def get_empty_tensors(gradient_shape, hessian_shape):
+  empty_hess_shape = [1] + hessian_shape.as_list()
+  empty_grad_shape = [1] + gradient_shape.as_list()
+
+  empty_gradients = constant_op.constant(
+      [], dtype=dtypes.float32, shape=empty_grad_shape)
+  empty_hessians = constant_op.constant(
+      [], dtype=dtypes.float32, shape=empty_hess_shape)
+
+  return empty_gradients, empty_hessians
+
+
+class EqualitySplitHandlerTest(test_util.TensorFlowTestCase):
+
+  def testGenerateFeatureSplitCandidates(self):
+    with self.test_session() as sess:
+      # The data looks like the following:
+      # Example |  Gradients    | Partition | Feature ID     |
+      # i0      |  (0.2, 0.12)  | 0         | 1,2            |
+      # i1      |  (-0.5, 0.07) | 0         |                |
+      # i2      |  (1.2, 0.2)   | 0         | 2              |
+      # i3      |  (4.0, 0.13)  | 1         | 1              |
+      gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
+      hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
+      partition_ids = [0, 0, 0, 1]
+      indices = [[0, 0], [0, 1], [2, 0], [3, 0]]
+      values = array_ops.constant([1, 2, 2, 1], dtype=dtypes.int64)
+
+      gradient_shape = tensor_shape.scalar()
+      hessian_shape = tensor_shape.scalar()
+      class_id = -1
+
+      split_handler = categorical_split_handler.EqualitySplitHandler(
+          l1_regularization=0.1,
+          l2_regularization=1,
+          tree_complexity_regularization=0,
+          min_node_weight=0,
+          sparse_int_column=sparse_tensor.SparseTensor(indices, values, [4, 1]),
+          feature_column_group_id=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS,
+          init_stamp_token=0)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          partition_ids,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready, partitions, gains, splits = (
+            split_handler.make_splits(0, 1, class_id))
+        are_splits_ready, partitions, gains, splits = (sess.run(
+            [are_splits_ready, partitions, gains, splits]))
+    self.assertTrue(are_splits_ready)
+    self.assertAllEqual([0, 1], partitions)
+
+    # Check the split on partition 0.
+    # -(0.2 + 1.2 - 0.1) / (0.12 + 0.2 + 1)
+    expected_left_weight = -0.9848484848484846
+
+    # (0.2 + 1.2 - 0.1) ** 2 / (0.12 + 0.2 + 1)
+    expected_left_gain = 1.2803030303030298
+
+    # -(-0.5 + 0.1) / (0.07 + 1)
+    expected_right_weight = 0.37383177570093457
+
+    # (-0.5 + 0.1) ** 2 / (0.07 + 1)
+    expected_right_gain = 0.14953271028037385
+
+    # (0.2 + -0.5 + 1.2 - 0.1) ** 2 / (0.12 + 0.07 + 0.2 + 1)
+    expected_bias_gain = 0.46043165467625885
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.categorical_id_binary_split
+
+    self.assertEqual(0, split_node.feature_column)
+
+    self.assertEqual(2, split_node.feature_id)
+
+    self.assertAllClose(
+        expected_left_gain + expected_right_gain - expected_bias_gain, gains[0],
+        0.00001)
+
+    self.assertAllClose([expected_left_weight], left_child.value, 0.00001)
+
+    self.assertAllClose([expected_right_weight], right_child.value, 0.00001)
+
+    # Check the split on partition 1.
+    # (-4 + 0.1) / (0.13 + 1)
+    expected_left_weight = -3.4513274336283186
+    # (-4 + 0.1) ** 2 / (0.13 + 1)
+    expected_left_gain = 13.460176991150442
+    expected_right_weight = 0
+    expected_right_gain = 0
+    # (-4 + 0.1) ** 2 / (0.13 + 1)
+    expected_bias_gain = 13.460176991150442
+
+    # Verify candidate for partition 1, there's only one active feature here
+    # so zero gain is expected.
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[1])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.categorical_id_binary_split
+    self.assertAllClose(0.0, gains[1], 0.00001)
+
+    self.assertAllClose([expected_left_weight], left_child.value, 0.00001)
+
+    self.assertAllClose([expected_right_weight], right_child.value, 0.00001)
+
+    self.assertEqual(0, split_node.feature_column)
+
+    self.assertEqual(1, split_node.feature_id)
+
+  def testGenerateFeatureSplitCandidatesMulticlass(self):
+    with self.test_session() as sess:
+      # Batch size is 4, 2 gradients per each instance.
+      gradients = array_ops.constant(
+          [[0.2, 0.1], [-0.5, 0.2], [1.2, 3.4], [4.0, -3.5]], shape=[4, 2])
+      # 2x2 matrix for each instance
+      hessian_0 = [[0.12, 0.02], [0.3, 0.11]]
+      hessian_1 = [[0.07, -0.2], [-0.5, 0.2]]
+      hessian_2 = [[0.2, -0.23], [-0.8, 0.9]]
+      hessian_3 = [[0.13, -0.3], [-1.5, 2.2]]
+      hessians = array_ops.constant(
+          [hessian_0, hessian_1, hessian_2, hessian_3])
+
+      partition_ids = [0, 0, 0, 1]
+      indices = [[0, 0], [0, 1], [2, 0], [3, 0]]
+      values = array_ops.constant([1, 2, 2, 1], dtype=dtypes.int64)
+
+      hessians = array_ops.constant(
+          [hessian_0, hessian_1, hessian_2, hessian_3])
+      partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32)
+
+      gradient_shape = tensor_shape.TensorShape([2])
+      hessian_shape = tensor_shape.TensorShape([2, 2])
+      class_id = -1
+
+      split_handler = categorical_split_handler.EqualitySplitHandler(
+          l1_regularization=0.1,
+          l2_regularization=1,
+          tree_complexity_regularization=0,
+          min_node_weight=0,
+          sparse_int_column=sparse_tensor.SparseTensor(indices, values, [4, 1]),
+          feature_column_group_id=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.FULL_HESSIAN,
+          init_stamp_token=0)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          partition_ids,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready, partitions, gains, splits = (
+            split_handler.make_splits(0, 1, class_id))
+        are_splits_ready, partitions, gains, splits = (sess.run(
+            [are_splits_ready, partitions, gains, splits]))
+    self.assertTrue(are_splits_ready)
+    self.assertAllEqual([0, 1], partitions)
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.categorical_id_binary_split
+    # Each leaf has 2 element vector.
+    self.assertEqual(2, len(left_child.value))
+    self.assertEqual(2, len(right_child.value))
+    self.assertEqual(1, split_node.feature_id)
+
+    split_info.ParseFromString(splits[1])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.categorical_id_binary_split
+    self.assertEqual(2, len(left_child.value))
+    self.assertEqual(0, len(right_child.value))
+    self.assertEqual(1, split_node.feature_id)
+
+  def testEmpty(self):
+    with self.test_session() as sess:
+      gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
+      hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
+      partition_ids = [0, 0, 0, 1]
+      indices = array_ops.constant([], dtype=dtypes.int64, shape=[0, 2])
+      values = array_ops.constant([], dtype=dtypes.int64)
+
+      gradient_shape = tensor_shape.scalar()
+      hessian_shape = tensor_shape.scalar()
+      class_id = -1
+
+      split_handler = categorical_split_handler.EqualitySplitHandler(
+          l1_regularization=0.1,
+          l2_regularization=1,
+          tree_complexity_regularization=0,
+          min_node_weight=0,
+          sparse_int_column=sparse_tensor.SparseTensor(indices, values, [4, 1]),
+          feature_column_group_id=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS,
+          init_stamp_token=0)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          partition_ids,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready, partitions, gains, splits = (
+            split_handler.make_splits(0, 1, class_id))
+        are_splits_ready, partitions, gains, splits = (sess.run(
+            [are_splits_ready, partitions, gains, splits]))
+    self.assertTrue(are_splits_ready)
+    self.assertEqual(len(partitions), 0)
+    self.assertEqual(len(gains), 0)
+    self.assertEqual(len(splits), 0)
+
+  def testInactive(self):
+    with self.test_session() as sess:
+      gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
+      hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
+      partition_ids = [0, 0, 0, 1]
+      indices = [[0, 0], [0, 1], [2, 0], [3, 0]]
+      values = array_ops.constant([1, 2, 2, 1], dtype=dtypes.int64)
+
+      gradient_shape = tensor_shape.scalar()
+      hessian_shape = tensor_shape.scalar()
+      class_id = -1
+
+      split_handler = categorical_split_handler.EqualitySplitHandler(
+          l1_regularization=0.1,
+          l2_regularization=1,
+          tree_complexity_regularization=0,
+          min_node_weight=0,
+          sparse_int_column=sparse_tensor.SparseTensor(indices, values, [4, 1]),
+          feature_column_group_id=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS,
+          init_stamp_token=0)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          partition_ids,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([False, False]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready, partitions, gains, splits = (
+            split_handler.make_splits(0, 1, class_id))
+        are_splits_ready, partitions, gains, splits = (sess.run(
+            [are_splits_ready, partitions, gains, splits]))
+    self.assertTrue(are_splits_ready)
+    self.assertEqual(len(partitions), 0)
+    self.assertEqual(len(gains), 0)
+    self.assertEqual(len(splits), 0)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c0a3f0d91e0fbd6b6ca02352c8b80b8485d029d
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
@@ -0,0 +1,518 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of handler for split nodes for float columns.
+
+The general idea in batch split finding is that each handler will accumulate its
+own statistics on multiple workers. After some steps, the master runs
+make_splits() sub-graph of each handler and each handler returns its best split
+per partition.
+
+The way we ensure consistency of statistics is by using stamp_tokens for read
+and write operations. During each update of the model, a new stamp token is
+created. This stamp token makes sure that updates from the previous iterations
+are not included in the statistics for this iteration.
+
+Inequality splits for float features are created similar to the method described
+in Approximate Algorithm described in https://arxiv.org/pdf/1603.02754v3.pdf.
+Weighted quantiles of the feature columns are computed in a distributed fashion
+using quantile_ops.quantile_accumulator.
+After certain number of steps of parallel accumulation of quantile statistics,
+we decide on bucket boundaries. These bucket boundaries are then used for the
+next N steps to accumulate gradients and hessians per bucket.
+
+In this implementation, we gather quantile statistics and gradient statistics
+concurrently. That means that we don't wait until we have enough quantile
+statistics for bucketization before we start gathering gradient stats. Instead
+during each step we create quantile stats for the next iteration and use the
+previous quantile buckets for gradient stats accumulation.
+In make_splits, we do these steps:
+1) Get the buckets that were used creating for the gradient stats.
+2) Create bucket boundaries for the next N iterations and clear the accumulated
+   quantile stats.
+n3) Get the accumulated gradient stats and clear the accumulator. This step can
+   run in parallel to step 2.
+4) For each leaf node in the current tree (partition):
+   4.1) Get the overall gain computed with gradients and hessians of all
+        examples that end up in this partition.
+   4.2) Compute tensors of left and right cumulative sum of gradients, hessians
+        and gain. The first dimension of these tensors are the bucket
+        boundaries.
+   4.3) Find the gains for all bucket boundaries:
+        split_gains = left_gain + right_gain - overall_gain.
+   4.4) Find the bucket boundary that has the best gain (argmax(split_gains))
+   4.5) For Sparse handler, we also consider the gain for when the examples go
+        the left child and when the examples go to the right child and pick the
+        default direction that yields the most gain.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+
+from tensorflow.contrib.boosted_trees.lib.learner.batch import base_split_handler
+from tensorflow.contrib.boosted_trees.python.ops import quantile_ops
+from tensorflow.contrib.boosted_trees.python.ops import split_handler_ops
+from tensorflow.contrib.boosted_trees.python.ops import stats_accumulator_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+_BIAS_FEATURE_ID = -1
+# Pattern to remove all non alpha numeric from a string.
+_PATTERN = re.compile(r"[\W_]+")
+
+
+class InequalitySplitHandler(base_split_handler.BaseSplitHandler):
+  """Base class for handlers of inequality splits."""
+
+  def __init__(self,
+               l1_regularization,
+               l2_regularization,
+               tree_complexity_regularization,
+               min_node_weight,
+               feature_column_group_id,
+               epsilon,
+               num_quantiles,
+               gradient_shape,
+               hessian_shape,
+               multiclass_strategy,
+               init_stamp_token=0,
+               name=None):
+    """Initialize the internal state for this split handler.
+
+    Args:
+      l1_regularization: L1 regularization applied for this split handler.
+      l2_regularization: L2 regularization applied for this split handler.
+      tree_complexity_regularization: Tree complexity regularization applied
+          for this split handler.
+      min_node_weight: Minimum sum of weights of examples in each partition to
+          be considered for splitting.
+      feature_column_group_id: Feature column group index.
+      epsilon: A float, the error bound for quantile computation.
+      num_quantiles: An int, the number of buckets to create from the histogram.
+      gradient_shape: A TensorShape, containing shape of gradients.
+      hessian_shape: A TensorShape, containing shape of hessians.
+      multiclass_strategy: Strategy describing how to treat multiclass problems.
+      init_stamp_token: A tensor containing an scalar for initial stamp of the
+         stamped objects.
+      name: An optional handler name.
+    """
+    super(InequalitySplitHandler, self).__init__(
+        name=name,
+        l1_regularization=l1_regularization,
+        l2_regularization=l2_regularization,
+        tree_complexity_regularization=tree_complexity_regularization,
+        min_node_weight=min_node_weight,
+        feature_column_group_id=feature_column_group_id,
+        gradient_shape=gradient_shape,
+        hessian_shape=hessian_shape,
+        multiclass_strategy=multiclass_strategy)
+    self._stats_accumulator = stats_accumulator_ops.StatsAccumulator(
+        init_stamp_token,
+        gradient_shape,
+        hessian_shape,
+        name="StatsAccumulator/{}".format(self._name))
+    self._quantile_accumulator = quantile_ops.QuantileAccumulator(
+        init_stamp_token,
+        epsilon=epsilon,
+        num_quantiles=num_quantiles,
+        name="QuantileAccumulator/{}".format(self._name))
+
+
+class DenseSplitHandler(InequalitySplitHandler):
+  """Computes stats and finds the best inequality splits on dense columns."""
+
+  def __init__(self,
+               dense_float_column,
+               l1_regularization,
+               l2_regularization,
+               tree_complexity_regularization,
+               min_node_weight,
+               feature_column_group_id,
+               epsilon,
+               num_quantiles,
+               gradient_shape,
+               hessian_shape,
+               multiclass_strategy,
+               init_stamp_token=0,
+               name=None):
+    """Initialize the internal state for this split handler.
+
+    Args:
+      dense_float_column: A `Tensor` column associated with this handler.
+      l1_regularization: L1 regularization applied for this split handler.
+      l2_regularization: L2 regularization applied for this split handler.
+      tree_complexity_regularization: Tree complexity regularization applied
+          for this split handler.
+      min_node_weight: Minimum sum of weights of examples in each partition to
+          be considered for splitting.
+      feature_column_group_id: Feature column group index.
+      epsilon: A float, the error bound for quantile computation.
+      num_quantiles: An int, the number of buckets to create from the histogram.
+      gradient_shape: A TensorShape, containing shape of gradients.
+      hessian_shape: A TensorShape, containing shape of hessians.
+      multiclass_strategy: Strategy describing how to treat multiclass problems.
+      init_stamp_token: A tensor containing an scalar for initial stamp of the
+         stamped objects.
+      name: An optional handler name.
+    """
+    super(DenseSplitHandler, self).__init__(
+        l1_regularization=l1_regularization,
+        l2_regularization=l2_regularization,
+        tree_complexity_regularization=tree_complexity_regularization,
+        min_node_weight=min_node_weight,
+        feature_column_group_id=feature_column_group_id,
+        epsilon=epsilon,
+        num_quantiles=num_quantiles,
+        init_stamp_token=init_stamp_token,
+        name=name,
+        gradient_shape=gradient_shape,
+        hessian_shape=hessian_shape,
+        multiclass_strategy=multiclass_strategy)
+    self._dense_float_column = dense_float_column
+    # Register dense_make_stats_update function as an Op to the graph.
+    g = ops.get_default_graph()
+    dense_make_stats_update.add_to_graph(g)
+
+  def scheduled_reads(self):
+    return [self._quantile_accumulator.schedule_get_buckets()]
+
+  def update_stats(self, stamp_token, example_partition_ids, gradients,
+                   hessians, empty_gradients, empty_hessians, weights,
+                   is_active, scheduled_reads):
+    """Updates the state for dense split handler.
+
+    Args:
+      stamp_token: An int32 scalar tensor containing the current stamp token.
+      example_partition_ids: A dense tensor, containing an int32 for each
+        example which is the partition id that the example ends up in.
+      gradients: A dense tensor of gradients.
+      hessians: A dense tensor of hessians.
+      empty_gradients: A dense empty tensor of the same shape (for dimensions >
+        0) as gradients.
+      empty_hessians: A dense empty tensor of the same shape (for dimensions >
+        0) as hessians.
+      weights: A dense float32 tensor with a weight for each example.
+      is_active: A boolean tensor that says if this handler is active or not.
+          One value for the current layer and one value for the next layer.
+      scheduled_reads: List of scheduled reads for this handler.
+
+    Returns:
+      The op that updates the stats for this handler.
+    """
+    name = _PATTERN.sub("", self._name)
+    with ops.name_scope(name, "DenseSplitHandler"):
+      are_buckets_ready, buckets = scheduled_reads[0]
+      (quantile_values, quantile_weights, example_partition_ids,
+       feature_ids, gradients, hessians) = dense_make_stats_update(
+           is_active, are_buckets_ready, self._dense_float_column, buckets,
+           example_partition_ids, gradients, hessians, weights, empty_gradients,
+           empty_hessians)
+      update_quantiles = self._quantile_accumulator.schedule_add_summary(
+          stamp_token=stamp_token,
+          column=quantile_values,
+          example_weights=quantile_weights)
+      update_stats = self._stats_accumulator.schedule_add(
+          example_partition_ids, feature_ids, gradients, hessians)
+      return control_flow_ops.no_op(), [update_quantiles, update_stats]
+
+  def make_splits(self, stamp_token, next_stamp_token, class_id):
+    """Create the best split using the accumulated stats and flush the state."""
+    # Get the bucket boundaries
+    are_splits_ready, buckets = (
+        self._quantile_accumulator.get_buckets(stamp_token))
+    # After we receive the boundaries from previous iteration we can flush
+    # the quantile accumulator.
+    with ops.control_dependencies([buckets]):
+      flush_quantiles = self._quantile_accumulator.flush(
+          stamp_token=stamp_token, next_stamp_token=next_stamp_token)
+
+    # Get the aggregated gradients and hessians per <partition_id, feature_id>
+    # pair.
+    # In order to distribute the computation on all the PSs we use the PS that
+    # had the stats accumulator on.
+    with ops.device(None):
+      with ops.device(self._stats_accumulator.resource().device):
+        num_minibatches, partition_ids, bucket_ids, gradients, hessians = (
+            self._stats_accumulator.flush(stamp_token, next_stamp_token))
+
+        # Put quantile and stats accumulator flushing in the dependency path.
+        are_splits_ready = control_flow_ops.with_dependencies(
+            [flush_quantiles, partition_ids], are_splits_ready)
+        partition_ids, gains, split_infos = (
+            split_handler_ops.build_dense_inequality_splits(
+                num_minibatches=num_minibatches,
+                bucket_boundaries=buckets,
+                partition_ids=partition_ids,
+                bucket_ids=bucket_ids,
+                gradients=gradients,
+                hessians=hessians,
+                class_id=class_id,
+                feature_column_group_id=self._feature_column_group_id,
+                l1_regularization=self._l1_regularization,
+                l2_regularization=self._l2_regularization,
+                tree_complexity_regularization=self.
+                _tree_complexity_regularization,
+                min_node_weight=self._min_node_weight,
+                multiclass_strategy=self._multiclass_strategy))
+    return (are_splits_ready, partition_ids, gains, split_infos)
+
+
+class SparseSplitHandler(InequalitySplitHandler):
+  """Computes stats and finds the best inequality splits on sparse columns."""
+
+  def __init__(self,
+               sparse_float_column,
+               l1_regularization,
+               l2_regularization,
+               tree_complexity_regularization,
+               min_node_weight,
+               feature_column_group_id,
+               epsilon,
+               num_quantiles,
+               gradient_shape,
+               hessian_shape,
+               multiclass_strategy,
+               init_stamp_token=0,
+               name=None):
+    """Initialize the internal state for this split handler.
+
+    Args:
+      sparse_float_column: A `SparseTensor` column associated with this handler.
+      l1_regularization: L1 regularization applied for this split handler.
+      l2_regularization: L2 regularization applied for this split handler.
+      tree_complexity_regularization: Tree complexity regularization applied
+          for this split handler.
+      min_node_weight: Minimum sum of weights of examples in each partition to
+          be considered for splitting.
+      feature_column_group_id: Feature column group index.
+      epsilon: A float, the error bound for quantile computation.
+      num_quantiles: An int, the number of buckets to create from the histogram.
+      gradient_shape: A TensorShape, containing shape of gradients.
+      hessian_shape: A TensorShape, containing shape of hessians.
+      multiclass_strategy: Strategy describing how to treat multiclass problems.
+      init_stamp_token: A tensor containing an scalar for initial stamp of the
+         stamped objects.
+      name: An optional handler name.
+    """
+    super(SparseSplitHandler, self).__init__(
+        l1_regularization=l1_regularization,
+        l2_regularization=l2_regularization,
+        tree_complexity_regularization=tree_complexity_regularization,
+        min_node_weight=min_node_weight,
+        feature_column_group_id=feature_column_group_id,
+        epsilon=epsilon,
+        num_quantiles=num_quantiles,
+        gradient_shape=gradient_shape,
+        hessian_shape=hessian_shape,
+        multiclass_strategy=multiclass_strategy,
+        init_stamp_token=init_stamp_token,
+        name=name)
+    # Register sparse_make_stats_update function as an Op to the graph.
+    g = ops.get_default_graph()
+    sparse_make_stats_update.add_to_graph(g)
+    self._sparse_float_column = sparse_float_column
+
+  def scheduled_reads(self):
+    return [self._quantile_accumulator.schedule_get_buckets()]
+
+  def update_stats(self, stamp_token, example_partition_ids, gradients,
+                   hessians, empty_gradients, empty_hessians, weights,
+                   is_active, scheduled_reads):
+    """Updates the state for dense split handler.
+
+    Args:
+      stamp_token: An int32 scalar tensor containing the current stamp token.
+      example_partition_ids: A dense tensor, containing an int32 for each
+        example which is the partition id that the example ends up in.
+      gradients: A dense tensor of gradients.
+      hessians: A dense tensor of hessians.
+      empty_gradients: A dense empty tensor of the same shape (for dimensions >
+        0) as gradients.
+      empty_hessians: A dense empty tensor of the same shape (for dimensions >
+        0) as hessians.
+      weights: A dense float32 tensor with a weight for each example.
+      is_active: A boolean tensor that says if this handler is active or not.
+          One value for the current layer and one value for the next layer.
+      scheduled_reads: List of results from the scheduled reads.
+
+    Returns:
+      The op that updates the stats for this handler.
+    """
+    are_buckets_ready, buckets = scheduled_reads[0]
+    with ops.name_scope(self._name, "SparseSplitHandler"):
+      (quantile_indices, quantile_values, quantile_shapes, quantile_weights,
+       example_partition_ids,
+       feature_ids, gradients, hessians) = sparse_make_stats_update(
+           is_active, are_buckets_ready, self._sparse_float_column.indices,
+           self._sparse_float_column.values,
+           self._sparse_float_column.dense_shape, buckets,
+           example_partition_ids, gradients, hessians, weights, empty_gradients,
+           empty_hessians)
+      update_quantiles = self._quantile_accumulator.schedule_add_summary(
+          stamp_token=stamp_token,
+          column=sparse_tensor.SparseTensor(quantile_indices, quantile_values,
+                                            quantile_shapes),
+          example_weights=quantile_weights)
+      update_stats = self._stats_accumulator.schedule_add(
+          example_partition_ids, feature_ids, gradients, hessians)
+      return (control_flow_ops.no_op(), [update_quantiles, update_stats])
+
+  def make_splits(self, stamp_token, next_stamp_token, class_id):
+    """Create the best split using the accumulated stats and flush the state."""
+    # Get the bucket boundaries
+    are_splits_ready, buckets = (
+        self._quantile_accumulator.get_buckets(stamp_token))
+
+    # After we receive the boundaries from previous iteration we can flush
+    # the quantile accumulator.
+    with ops.control_dependencies([buckets]):
+      flush_quantiles = self._quantile_accumulator.flush(
+          stamp_token=stamp_token, next_stamp_token=next_stamp_token)
+
+    with ops.device(None):
+      with ops.device(self._stats_accumulator.resource().device):
+        num_minibatches, partition_ids, bucket_ids, gradients, hessians = (
+            self._stats_accumulator.flush(stamp_token, next_stamp_token))
+
+        # Put quantile and stats accumulator flushing in the dependency path.
+        are_splits_ready = control_flow_ops.with_dependencies(
+            [flush_quantiles, partition_ids], are_splits_ready)
+        partition_ids, gains, split_infos = (
+            split_handler_ops.build_sparse_inequality_splits(
+                num_minibatches=num_minibatches,
+                bucket_boundaries=buckets,
+                partition_ids=partition_ids,
+                bucket_ids=bucket_ids,
+                gradients=gradients,
+                hessians=hessians,
+                class_id=class_id,
+                feature_column_group_id=self._feature_column_group_id,
+                l1_regularization=self._l1_regularization,
+                l2_regularization=self._l2_regularization,
+                tree_complexity_regularization=self.
+                _tree_complexity_regularization,
+                min_node_weight=self._min_node_weight,
+                bias_feature_id=_BIAS_FEATURE_ID,
+                multiclass_strategy=self._multiclass_strategy))
+    return (are_splits_ready, partition_ids, gains, split_infos)
+
+
+@function.Defun(dtypes.bool, dtypes.bool, dtypes.float32, dtypes.float32,
+                dtypes.int32, dtypes.float32, dtypes.float32, dtypes.float32,
+                dtypes.float32, dtypes.float32)
+def dense_make_stats_update(is_active, are_buckets_ready, float_column,
+                            quantile_buckets, example_partition_ids, gradients,
+                            hessians, weights, empty_gradients, empty_hessians):
+  """Updates the state for dense split handler."""
+  empty_float = constant_op.constant([], dtype=dtypes.float32)
+
+  quantile_values, quantile_weights = control_flow_ops.cond(
+      is_active[1],  # For the next layer, this handler is inactive.
+      lambda: (float_column, weights),
+      lambda: (empty_float, empty_float))
+
+  def ready_inputs_fn():
+    """Branch to execute when quantiles are ready."""
+    quantized_feature = quantile_ops.quantiles([float_column], [],
+                                               [quantile_buckets], [])
+    quantized_feature = math_ops.cast(quantized_feature[0], dtypes.int64)
+    quantized_feature = array_ops.reshape(quantized_feature, [-1])
+    return (example_partition_ids, quantized_feature, gradients, hessians)
+
+  def not_ready_inputs_fn():
+    return (constant_op.constant([], dtype=dtypes.int32), constant_op.constant(
+        [], dtype=dtypes.int64), empty_gradients, empty_hessians)
+
+  example_partition_ids, feature_ids, gradients, hessians = (
+      control_flow_ops.cond(
+          math_ops.logical_and(are_buckets_ready, is_active[0]),
+          ready_inputs_fn, not_ready_inputs_fn))
+  return (quantile_values, quantile_weights, example_partition_ids, feature_ids,
+          gradients, hessians)
+
+
+@function.Defun(dtypes.bool, dtypes.bool, dtypes.int64, dtypes.float32,
+                dtypes.int64, dtypes.float32, dtypes.int32, dtypes.float32,
+                dtypes.float32, dtypes.float32, dtypes.float32, dtypes.float32)
+def sparse_make_stats_update(
+    is_active, are_buckets_ready, sparse_column_indices, sparse_column_values,
+    sparse_column_shape, quantile_buckets, example_partition_ids, gradients,
+    hessians, weights, empty_gradients, empty_hessians):
+  """Updates the state for this split handler."""
+
+  def quantiles_ready():
+    """The subgraph for when the quantiles are ready."""
+    quantized_feature = quantile_ops.quantiles([sparse_column_values], [],
+                                               [quantile_buckets], [])
+    quantized_feature = math_ops.cast(quantized_feature[0], dtypes.int64)
+    quantized_feature = array_ops.reshape(quantized_feature, [-1])
+    example_indices, _ = array_ops.split(
+        sparse_column_indices, num_or_size_splits=2, axis=1)
+    example_indices = array_ops.squeeze(example_indices, [1])
+    filtered_gradients = array_ops.gather(gradients, example_indices)
+    filtered_hessians = array_ops.gather(hessians, example_indices)
+    filtered_partition_ids = array_ops.gather(example_partition_ids,
+                                              example_indices)
+    unique_partitions, mapped_partitions = array_ops.unique(
+        example_partition_ids)
+
+    # Compute aggregate stats for each partition.
+    per_partition_gradients = math_ops.unsorted_segment_sum(
+        gradients, mapped_partitions, array_ops.size(unique_partitions))
+    per_partition_hessians = math_ops.unsorted_segment_sum(
+        hessians, mapped_partitions, array_ops.size(unique_partitions))
+
+    # Prepend a bias feature per partition that accumulates the stats for all
+    # examples in that partition.
+    bias_feature_ids = array_ops.fill(
+        array_ops.shape(unique_partitions), _BIAS_FEATURE_ID)
+    bias_feature_ids = math_ops.cast(bias_feature_ids, dtypes.int64)
+    partition_ids = array_ops.concat(
+        [unique_partitions, filtered_partition_ids], 0)
+    filtered_gradients = array_ops.concat(
+        [per_partition_gradients, filtered_gradients], 0)
+    filtered_hessians = array_ops.concat(
+        [per_partition_hessians, filtered_hessians], 0)
+    bucket_ids = array_ops.concat([bias_feature_ids, quantized_feature], 0)
+    return partition_ids, bucket_ids, filtered_gradients, filtered_hessians
+
+  def quantiles_not_ready():
+    """The subgraph for when the quantiles are not ready."""
+    return (constant_op.constant([], dtype=dtypes.int32), constant_op.constant(
+        [], dtype=dtypes.int64), empty_gradients, empty_hessians)
+
+  empty_float = constant_op.constant([], dtype=dtypes.float32)
+  handler_not_active = (constant_op.constant(
+      [], dtype=dtypes.int64, shape=[0, 2]), empty_float, constant_op.constant(
+          [0, 1], dtype=dtypes.int64), empty_float)
+  handler_active = (sparse_column_indices, sparse_column_values,
+                    sparse_column_shape, weights)
+  quantile_indices, quantile_values, quantile_shape, quantile_weights = (
+      control_flow_ops.cond(is_active[1], lambda: handler_active,
+                            lambda: handler_not_active))
+
+  example_partition_ids, feature_ids, gradients, hessians = (
+      control_flow_ops.cond(are_buckets_ready, quantiles_ready,
+                            quantiles_not_ready))
+
+  return (quantile_indices, quantile_values, quantile_shape, quantile_weights,
+          example_partition_ids, feature_ids, gradients, hessians)
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee16a5f838a65f20db4436eb86527518621b6d8d
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
@@ -0,0 +1,1126 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test for checking stats accumulator related ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.boosted_trees.lib.learner.batch import ordinal_split_handler
+from tensorflow.contrib.boosted_trees.proto import learner_pb2
+from tensorflow.contrib.boosted_trees.proto import split_info_pb2
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import googletest
+
+
+def get_empty_tensors(gradient_shape, hessian_shape):
+  empty_hess_shape = [1] + hessian_shape.as_list()
+  empty_grad_shape = [1] + gradient_shape.as_list()
+
+  empty_gradients = constant_op.constant(
+      [], dtype=dtypes.float32, shape=empty_grad_shape)
+  empty_hessians = constant_op.constant(
+      [], dtype=dtypes.float32, shape=empty_hess_shape)
+
+  return empty_gradients, empty_hessians
+
+
+class DenseSplitHandlerTest(test_util.TensorFlowTestCase):
+
+  def testGenerateFeatureSplitCandidates(self):
+    with self.test_session() as sess:
+      # The data looks like the following:
+      # Example |  Gradients    | Partition | Dense Quantile |
+      # i0      |  (0.2, 0.12)  | 0         | 1              |
+      # i1      |  (-0.5, 0.07) | 0         | 1              |
+      # i2      |  (1.2, 0.2)   | 0         | 0              |
+      # i3      |  (4.0, 0.13)  | 1         | 1              |
+      dense_column = array_ops.constant([0.52, 0.52, 0.3, 0.52])
+      gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
+      hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
+      partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32)
+      class_id = -1
+
+      gradient_shape = tensor_shape.scalar()
+      hessian_shape = tensor_shape.scalar()
+      split_handler = ordinal_split_handler.DenseSplitHandler(
+          l1_regularization=0.1,
+          l2_regularization=1,
+          tree_complexity_regularization=0,
+          min_node_weight=0,
+          epsilon=0.001,
+          num_quantiles=10,
+          feature_column_group_id=0,
+          dense_float_column=dense_column,
+          init_stamp_token=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          partition_ids,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready = split_handler.make_splits(0, 1, class_id)[0]
+      with ops.control_dependencies([are_splits_ready]):
+        update_2 = split_handler.update_stats_sync(
+            1,
+            partition_ids,
+            gradients,
+            hessians,
+            empty_gradients,
+            empty_hessians,
+            example_weights,
+            is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_2]):
+        are_splits_ready2, partitions, gains, splits = (
+            split_handler.make_splits(1, 2, class_id))
+        are_splits_ready, are_splits_ready2, partitions, gains, splits = (
+            sess.run([
+                are_splits_ready, are_splits_ready2, partitions, gains, splits
+            ]))
+
+    # During the first iteration, inequality split handlers are not going to
+    # have any splits. Make sure that we return not_ready in that case.
+    self.assertFalse(are_splits_ready)
+    self.assertTrue(are_splits_ready2)
+
+    self.assertAllEqual([0, 1], partitions)
+
+    # Check the split on partition 0.
+    # -(1.2 - 0.1) / (0.2 + 1)
+    expected_left_weight = -0.91666
+
+    # expected_left_weight * -(1.2 - 0.1)
+    expected_left_gain = 1.0083333333333331
+
+    # (-0.5 + 0.2 + 0.1) / (0.19 + 1)
+    expected_right_weight = 0.1680672
+
+    # expected_right_weight * -(-0.5 + 0.2 + 0.1))
+    expected_right_gain = 0.033613445378151252
+
+    # (0.2 + -0.5 + 1.2 - 0.1) ** 2 / (0.12 + 0.07 + 0.2 + 1)
+    expected_bias_gain = 0.46043165467625885
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.dense_float_binary_split
+    self.assertAllClose(
+        expected_left_gain + expected_right_gain - expected_bias_gain, gains[0],
+        0.00001)
+
+    self.assertAllClose([expected_left_weight], left_child.value, 0.00001)
+
+    self.assertAllClose([expected_right_weight], right_child.value, 0.00001)
+
+    self.assertEqual(0, split_node.feature_column)
+
+    self.assertAllClose(0.3, split_node.threshold, 0.00001)
+
+    # Check the split on partition 1.
+    # (-4 + 0.1) / (0.13 + 1)
+    expected_left_weight = -3.4513274336283186
+    # (-4 + 0.1) ** 2 / (0.13 + 1)
+    expected_left_gain = 13.460176991150442
+    expected_right_weight = 0
+    expected_right_gain = 0
+    # (-4 + 0.1) ** 2 / (0.13 + 1)
+    expected_bias_gain = 13.460176991150442
+
+    # Verify candidate for partition 1, there's only one active bucket here
+    # so zero gain is expected.
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[1])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.dense_float_binary_split
+    self.assertAllClose(0.0, gains[1], 0.00001)
+
+    self.assertAllClose([expected_left_weight], left_child.value, 0.00001)
+
+    self.assertAllClose([expected_right_weight], right_child.value, 0.00001)
+
+    self.assertEqual(0, split_node.feature_column)
+
+    self.assertAllClose(0.52, split_node.threshold, 0.00001)
+
+  def testGenerateFeatureSplitCandidatesMulticlassFullHessian(self):
+    with self.test_session() as sess:
+      dense_column = array_ops.constant([0.52, 0.52, 0.3, 0.52])
+      # Batch size is 4, 2 gradients per each instance.
+      gradients = array_ops.constant(
+          [[0.2, 0.1], [-0.5, 0.2], [1.2, 3.4], [4.0, -3.5]], shape=[4, 2])
+      # 2x2 matrix for each instance
+      hessian_0 = [[0.12, 0.02], [0.3, 0.11]]
+      hessian_1 = [[0.07, -0.2], [-0.5, 0.2]]
+      hessian_2 = [[0.2, -0.23], [-0.8, 0.9]]
+      hessian_3 = [[0.13, -0.3], [-1.5, 2.2]]
+
+      hessians = array_ops.constant(
+          [hessian_0, hessian_1, hessian_2, hessian_3])
+      partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32)
+      class_id = -1
+
+      gradient_shape = tensor_shape.TensorShape([2])
+      hessian_shape = tensor_shape.TensorShape([2, 2])
+
+      split_handler = ordinal_split_handler.DenseSplitHandler(
+          l1_regularization=0,
+          l2_regularization=1,
+          tree_complexity_regularization=0,
+          min_node_weight=0,
+          epsilon=0.001,
+          num_quantiles=3,
+          feature_column_group_id=0,
+          dense_float_column=dense_column,
+          init_stamp_token=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.FULL_HESSIAN)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          partition_ids,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready = split_handler.make_splits(0, 1, class_id)[0]
+      with ops.control_dependencies([are_splits_ready]):
+        update_2 = split_handler.update_stats_sync(
+            1,
+            partition_ids,
+            gradients,
+            hessians,
+            empty_gradients,
+            empty_hessians,
+            example_weights,
+            is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_2]):
+        are_splits_ready2, partitions, gains, splits = (
+            split_handler.make_splits(1, 2, class_id))
+        are_splits_ready, are_splits_ready2, partitions, gains, splits = (
+            sess.run([
+                are_splits_ready, are_splits_ready2, partitions, gains, splits
+            ]))
+
+    # During the first iteration, inequality split handlers are not going to
+    # have any splits. Make sure that we return not_ready in that case.
+    self.assertFalse(are_splits_ready)
+    self.assertTrue(are_splits_ready2)
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.dense_float_binary_split
+
+    # Each leaf has 2 element vector.
+    self.assertEqual(2, len(left_child.value))
+    self.assertEqual(2, len(right_child.value))
+    self.assertEqual(0, split_node.feature_column)
+    self.assertAllClose(0.3, split_node.threshold, 1e-6)
+
+  def testGenerateFeatureSplitCandidatesMulticlassDiagonalHessian(self):
+    with self.test_session() as sess:
+      dense_column = array_ops.constant([0.52, 0.52, 0.3, 0.52])
+      # Batch size is 4, 2 gradients per each instance.
+      gradients = array_ops.constant(
+          [[0.2, 0.1], [-0.5, 0.2], [1.2, 3.4], [4.0, -3.5]], shape=[4, 2])
+      # Each hessian is a diagonal of a full hessian matrix.
+      hessian_0 = [0.12, 0.11]
+      hessian_1 = [0.07, 0.2]
+      hessian_2 = [0.2, 0.9]
+      hessian_3 = [0.13, 2.2]
+
+      hessians = array_ops.constant(
+          [hessian_0, hessian_1, hessian_2, hessian_3])
+      partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32)
+      class_id = -1
+
+      gradient_shape = tensor_shape.TensorShape([2])
+      hessian_shape = tensor_shape.TensorShape([2])
+
+      split_handler = ordinal_split_handler.DenseSplitHandler(
+          l1_regularization=0,
+          l2_regularization=1,
+          tree_complexity_regularization=0,
+          min_node_weight=0,
+          epsilon=0.001,
+          num_quantiles=3,
+          feature_column_group_id=0,
+          dense_float_column=dense_column,
+          init_stamp_token=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.DIAGONAL_HESSIAN)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          partition_ids,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready = split_handler.make_splits(0, 1, class_id)[0]
+      with ops.control_dependencies([are_splits_ready]):
+        update_2 = split_handler.update_stats_sync(
+            1,
+            partition_ids,
+            gradients,
+            hessians,
+            empty_gradients,
+            empty_hessians,
+            example_weights,
+            is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_2]):
+        are_splits_ready2, partitions, gains, splits = (
+            split_handler.make_splits(1, 2, class_id))
+        are_splits_ready, are_splits_ready2, partitions, gains, splits = (
+            sess.run([
+                are_splits_ready, are_splits_ready2, partitions, gains, splits
+            ]))
+
+    # During the first iteration, inequality split handlers are not going to
+    # have any splits. Make sure that we return not_ready in that case.
+    self.assertFalse(are_splits_ready)
+    self.assertTrue(are_splits_ready2)
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.dense_float_binary_split
+
+    # Each leaf has 2 element vector.
+    self.assertEqual(2, len(left_child.value))
+    self.assertEqual(2, len(right_child.value))
+    self.assertEqual(0, split_node.feature_column)
+    self.assertAllClose(0.3, split_node.threshold, 1e-6)
+
+  def testGenerateFeatureSplitCandidatesInactive(self):
+    with self.test_session() as sess:
+      # The data looks like the following:
+      # Example |  Gradients    | Partition | Dense Quantile |
+      # i0      |  (0.2, 0.12)  | 0         | 1              |
+      # i1      |  (-0.5, 0.07) | 0         | 1              |
+      # i2      |  (1.2, 0.2)   | 0         | 0              |
+      # i3      |  (4.0, 0.13)  | 1         | 1              |
+      dense_column = array_ops.constant([0.52, 0.52, 0.3, 0.52])
+      gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
+      hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
+      partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32)
+
+      gradient_shape = tensor_shape.scalar()
+      hessian_shape = tensor_shape.scalar()
+      class_id = -1
+
+      split_handler = ordinal_split_handler.DenseSplitHandler(
+          l1_regularization=0.1,
+          l2_regularization=1,
+          tree_complexity_regularization=0,
+          min_node_weight=0,
+          epsilon=0.001,
+          num_quantiles=10,
+          feature_column_group_id=0,
+          dense_float_column=dense_column,
+          init_stamp_token=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          partition_ids,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, False]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready = split_handler.make_splits(0, 1, class_id)[0]
+      with ops.control_dependencies([are_splits_ready]):
+        update_2 = split_handler.update_stats_sync(
+            1,
+            partition_ids,
+            gradients,
+            hessians,
+            empty_gradients,
+            empty_hessians,
+            example_weights,
+            is_active=array_ops.constant([False, True]))
+      with ops.control_dependencies([update_2]):
+        are_splits_ready2, partitions, gains, splits = (
+            split_handler.make_splits(1, 2, class_id))
+        are_splits_ready, are_splits_ready2, partitions, gains, splits = (
+            sess.run([
+                are_splits_ready, are_splits_ready2, partitions, gains, splits
+            ]))
+
+    # During the first iteration, inequality split handlers are not going to
+    # have any splits. Make sure that we return not_ready in that case.
+    self.assertFalse(are_splits_ready)
+    self.assertTrue(are_splits_ready2)
+    # The handler was inactive, so it shouldn't return any splits.
+    self.assertEqual(len(partitions), 0)
+    self.assertEqual(len(gains), 0)
+    self.assertEqual(len(splits), 0)
+
+  def testGenerateFeatureSplitCandidatesWithTreeComplexity(self):
+    with self.test_session() as sess:
+      # The data looks like the following:
+      # Example |  Gradients    | Partition | Dense Quantile |
+      # i0      |  (0.2, 0.12)  | 0         | 1              |
+      # i1      |  (-0.5, 0.07) | 0         | 1              |
+      # i2      |  (1.2, 0.2)   | 0         | 0              |
+      # i3      |  (4.0, 0.13)  | 1         | 1              |
+      dense_column = array_ops.constant([0.52, 0.52, 0.3, 0.52])
+      gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
+      hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
+      partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32)
+
+      gradient_shape = tensor_shape.scalar()
+      hessian_shape = tensor_shape.scalar()
+      class_id = -1
+
+      split_handler = ordinal_split_handler.DenseSplitHandler(
+          l1_regularization=0.1,
+          l2_regularization=1,
+          tree_complexity_regularization=0.5,
+          min_node_weight=0,
+          epsilon=0.001,
+          num_quantiles=10,
+          feature_column_group_id=0,
+          dense_float_column=dense_column,
+          init_stamp_token=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          partition_ids,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready = split_handler.make_splits(0, 1, class_id)[0]
+      with ops.control_dependencies([are_splits_ready]):
+        update_2 = split_handler.update_stats_sync(
+            1,
+            partition_ids,
+            gradients,
+            hessians,
+            empty_gradients,
+            empty_hessians,
+            example_weights,
+            is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_2]):
+        are_splits_ready2, partitions, gains, splits = (
+            split_handler.make_splits(1, 2, class_id))
+        are_splits_ready, are_splits_ready2, partitions, gains, splits = (
+            sess.run([
+                are_splits_ready, are_splits_ready2, partitions, gains, splits
+            ]))
+
+    # During the first iteration, inequality split handlers are not going to
+    # have any splits. Make sure that we return not_ready in that case.
+    self.assertFalse(are_splits_ready)
+    self.assertTrue(are_splits_ready2)
+
+    self.assertAllEqual([0, 1], partitions)
+
+    # Check the split on partition 0.
+    # -(1.2 - 0.1) / (0.2 + 1)
+    expected_left_weight = -0.91666
+
+    # expected_left_weight * -(1.2 - 0.1)
+    expected_left_gain = 1.0083333333333331
+
+    # (-0.5 + 0.2 + 0.1) / (0.19 + 1)
+    expected_right_weight = 0.1680672
+
+    # expected_right_weight * -(-0.5 + 0.2 + 0.1))
+    expected_right_gain = 0.033613445378151252
+
+    # (0.2 + -0.5 + 1.2 - 0.1) ** 2 / (0.12 + 0.07 + 0.2 + 1)
+    expected_bias_gain = 0.46043165467625885
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.dense_float_binary_split
+    # Make sure the gain is subtracted by the tree complexity regularization.
+    self.assertAllClose(
+        expected_left_gain + expected_right_gain - expected_bias_gain - 0.5,
+        gains[0], 0.00001)
+
+    self.assertAllClose([expected_left_weight], left_child.value, 0.00001)
+
+    self.assertAllClose([expected_right_weight], right_child.value, 0.00001)
+
+    self.assertEqual(0, split_node.feature_column)
+
+    self.assertAllClose(0.3, split_node.threshold, 0.00001)
+
+    # Check the split on partition 1.
+    # (-4 + 0.1) / (0.13 + 1)
+    expected_left_weight = -3.4513274336283186
+    # (-4 + 0.1) ** 2 / (0.13 + 1)
+    expected_left_gain = 13.460176991150442
+    expected_right_weight = 0
+    expected_right_gain = 0
+    # (-4 + 0.1) ** 2 / (0.13 + 1)
+    expected_bias_gain = 13.460176991150442
+
+    # Verify candidate for partition 1, there's only one active bucket here
+    # so -0.5 gain is expected (because of tree complexity.
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[1])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.dense_float_binary_split
+    self.assertAllClose(-0.5, gains[1], 0.00001)
+
+    self.assertAllClose([expected_left_weight], left_child.value, 0.00001)
+
+    self.assertAllClose([expected_right_weight], right_child.value, 0.00001)
+
+    self.assertEqual(0, split_node.feature_column)
+
+    self.assertAllClose(0.52, split_node.threshold, 0.00001)
+
+  def testGenerateFeatureSplitCandidatesWithMinNodeWeight(self):
+    with self.test_session() as sess:
+      # The data looks like the following:
+      # Example |  Gradients    | Partition | Dense Quantile |
+      # i0      |  (0.2, 0.12)  | 0         | 1              |
+      # i1      |  (-0.5, 0.07) | 0         | 1              |
+      # i2      |  (1.2, 0.2)   | 0         | 0              |
+      # i3      |  (4.0, 2.0)   | 1         | 1              |
+      dense_column = array_ops.constant([0.52, 0.52, 0.3, 0.52])
+      gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
+      hessians = array_ops.constant([0.12, 0.07, 0.2, 2])
+      partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32)
+
+      gradient_shape = tensor_shape.scalar()
+      hessian_shape = tensor_shape.scalar()
+      class_id = -1
+
+      split_handler = ordinal_split_handler.DenseSplitHandler(
+          l1_regularization=0.1,
+          l2_regularization=1,
+          tree_complexity_regularization=0.5,
+          min_node_weight=1.5,
+          epsilon=0.001,
+          num_quantiles=10,
+          feature_column_group_id=0,
+          dense_float_column=dense_column,
+          init_stamp_token=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          partition_ids,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready = split_handler.make_splits(0, 1, class_id)[0]
+      with ops.control_dependencies([are_splits_ready]):
+        update_2 = split_handler.update_stats_sync(
+            1,
+            partition_ids,
+            gradients,
+            hessians,
+            empty_gradients,
+            empty_hessians,
+            example_weights,
+            is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_2]):
+        are_splits_ready2, partitions, gains, splits = (
+            split_handler.make_splits(1, 2, class_id))
+        are_splits_ready, are_splits_ready2, partitions, gains, splits = (
+            sess.run([
+                are_splits_ready, are_splits_ready2, partitions, gains, splits
+            ]))
+
+    # During the first iteration, inequality split handlers are not going to
+    # have any splits. Make sure that we return not_ready in that case.
+    self.assertFalse(are_splits_ready)
+    self.assertTrue(are_splits_ready2)
+
+    self.assertAllEqual([0, 1], partitions)
+
+    # Check the gain on partition 0 to be -0.5.
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.dense_float_binary_split
+    # Make sure the gain is subtracted by the tree complexity regularization.
+    self.assertAllClose(-0.5, gains[0], 0.00001)
+
+    self.assertEqual(0, split_node.feature_column)
+
+    # Check the split on partition 1.
+    # (-4 + 0.1) / (2 + 1)
+    expected_left_weight = -1.3
+    expected_right_weight = 0
+
+    # Verify candidate for partition 1, there's only one active bucket here
+    # so -0.5 gain is expected (because of tree complexity.
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[1])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.dense_float_binary_split
+    self.assertAllClose(-0.5, gains[1], 0.00001)
+
+    self.assertAllClose([expected_left_weight], left_child.value, 0.00001)
+
+    self.assertAllClose([expected_right_weight], right_child.value, 0.00001)
+
+    self.assertEqual(0, split_node.feature_column)
+
+    self.assertAllClose(0.52, split_node.threshold, 0.00001)
+
+
+class SparseSplitHandlerTest(test_util.TensorFlowTestCase):
+
+  def testGenerateFeatureSplitCandidates(self):
+    with self.test_session() as sess:
+      # The data looks like the following:
+      # Example |  Gradients    | Partition | Sparse Quantile |
+      # i0      |  (0.2, 0.12)  | 0         | 1               |
+      # i1      |  (-0.5, 0.07) | 0         | N/A             |
+      # i2      |  (1.2, 0.2)   | 0         | 0               |
+      # i3      |  (4.0, 0.13)  | 1         | 1               |
+      gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
+      hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
+      example_partitions = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32)
+      indices = array_ops.constant([[0, 0], [2, 0], [3, 0]], dtype=dtypes.int64)
+      values = array_ops.constant([0.52, 0.3, 0.52])
+      sparse_column = sparse_tensor.SparseTensor(indices, values, [4, 1])
+
+      gradient_shape = tensor_shape.scalar()
+      hessian_shape = tensor_shape.scalar()
+      class_id = -1
+
+      split_handler = ordinal_split_handler.SparseSplitHandler(
+          l1_regularization=0,
+          l2_regularization=2,
+          tree_complexity_regularization=0,
+          min_node_weight=0,
+          epsilon=0.01,
+          num_quantiles=2,
+          feature_column_group_id=0,
+          sparse_float_column=sparse_column,
+          init_stamp_token=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          example_partitions,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready = split_handler.make_splits(0, 1, class_id)[0]
+
+      with ops.control_dependencies([are_splits_ready]):
+        update_2 = split_handler.update_stats_sync(
+            1,
+            example_partitions,
+            gradients,
+            hessians,
+            empty_gradients,
+            empty_hessians,
+            example_weights,
+            is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_2]):
+        are_splits_ready2, partitions, gains, splits = (
+            split_handler.make_splits(1, 2, class_id))
+        are_splits_ready, are_splits_ready2, partitions, gains, splits = (
+            sess.run([
+                are_splits_ready, are_splits_ready2, partitions, gains, splits
+            ]))
+
+    # During the first iteration, inequality split handlers are not going to
+    # have any splits. Make sure that we return not_ready in that case.
+    self.assertFalse(are_splits_ready)
+    self.assertTrue(are_splits_ready2)
+
+    self.assertAllEqual([0, 1], partitions)
+    # Check the split on partition 0.
+    # -(0.2 + 1.2) / (0.12 + 0.2 + 2)
+    expected_left_weight = -0.603448275862069
+    # (0.2 + 1.2) ** 2 / (0.12 + 0.2 + 2)
+    expected_left_gain = 0.8448275862068965
+    # 0.5 / (0.07 + 2)
+    expected_right_weight = 0.24154589371980678
+    # 0.5 ** 2 / (0.07 + 2)
+    expected_right_gain = 0.12077294685990339
+    # (0.2 + 1.2 - 0.5) ** 2 /  (0.12 + 0.2 + 0.07 + 2)
+    expected_bias_gain = 0.3389121338912133
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.sparse_float_binary_split_default_right
+    self.assertAllClose(
+        expected_left_gain + expected_right_gain - expected_bias_gain, gains[0])
+
+    self.assertAllClose([expected_left_weight], left_child.value)
+
+    self.assertAllClose([expected_right_weight], right_child.value)
+
+    self.assertEqual(0, split_node.split.feature_column)
+
+    self.assertAllClose(0.52, split_node.split.threshold)
+
+    # Check the split on partition 1.
+    expected_left_weight = -1.8779342723004695
+    expected_right_weight = 0
+
+    # Verify candidate for partition 1, there's only one active bucket here
+    # so zero gain is expected.
+    split_info.ParseFromString(splits[1])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.sparse_float_binary_split_default_left
+
+    self.assertAllClose(0.0, gains[1])
+
+    self.assertAllClose([expected_left_weight], left_child.value)
+
+    self.assertAllClose([expected_right_weight], right_child.value)
+
+    self.assertEqual(0, split_node.split.feature_column)
+
+    self.assertAllClose(0.52, split_node.split.threshold)
+
+  def testGenerateFeatureSplitCandidatesMulticlassFullHessian(self):
+    with self.test_session() as sess:
+      # Batch is 4, 2 classes
+      gradients = array_ops.constant(
+          [[0.2, 1.4], [-0.5, 0.1], [1.2, 3], [4.0, -3]])
+      # 2x2 matrix for each instance
+      hessian_0 = [[0.12, 0.02], [0.3, 0.11]]
+      hessian_1 = [[0.07, -0.2], [-0.5, 0.2]]
+      hessian_2 = [[0.2, -0.23], [-0.8, 0.9]]
+      hessian_3 = [[0.13, -0.3], [-1.5, 2.2]]
+      hessians = array_ops.constant(
+          [hessian_0, hessian_1, hessian_2, hessian_3])
+
+      example_partitions = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32)
+      indices = array_ops.constant([[0, 0], [2, 0], [3, 0]], dtype=dtypes.int64)
+      values = array_ops.constant([0.52, 0.3, 0.52])
+      sparse_column = sparse_tensor.SparseTensor(indices, values, [4, 1])
+
+      gradient_shape = tensor_shape.TensorShape([2])
+      hessian_shape = tensor_shape.TensorShape([2, 2])
+      class_id = -1
+
+      split_handler = ordinal_split_handler.SparseSplitHandler(
+          l1_regularization=0,
+          l2_regularization=2,
+          tree_complexity_regularization=0,
+          min_node_weight=0,
+          epsilon=0.01,
+          num_quantiles=2,
+          feature_column_group_id=0,
+          sparse_float_column=sparse_column,
+          init_stamp_token=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.FULL_HESSIAN)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          example_partitions,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready = split_handler.make_splits(0, 1, class_id)[0]
+
+      with ops.control_dependencies([are_splits_ready]):
+        update_2 = split_handler.update_stats_sync(
+            1,
+            example_partitions,
+            gradients,
+            hessians,
+            empty_gradients,
+            empty_hessians,
+            example_weights,
+            is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_2]):
+        are_splits_ready2, partitions, gains, splits = (
+            split_handler.make_splits(1, 2, class_id))
+        are_splits_ready, are_splits_ready2, partitions, gains, splits = (
+            sess.run([
+                are_splits_ready, are_splits_ready2, partitions, gains, splits
+            ]))
+
+    self.assertFalse(are_splits_ready)
+    self.assertTrue(are_splits_ready2)
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.sparse_float_binary_split_default_right
+    # Each leaf has 2 element vector.
+    self.assertEqual(2, len(left_child.value))
+    self.assertEqual(2, len(right_child.value))
+    self.assertEqual(0, split_node.split.feature_column)
+    self.assertAllClose(0.52, split_node.split.threshold)
+
+    split_info.ParseFromString(splits[1])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.sparse_float_binary_split_default_left
+    self.assertEqual(2, len(left_child.value))
+    self.assertEqual(0, split_node.split.feature_column)
+    self.assertAllClose(0.52, split_node.split.threshold)
+
+  def testGenerateFeatureSplitCandidatesMulticlassDiagonalHessian(self):
+    with self.test_session() as sess:
+      # Batch is 4, 2 classes
+      gradients = array_ops.constant(
+          [[0.2, 1.4], [-0.5, 0.1], [1.2, 3], [4.0, -3]])
+      # Each hessian is a diagonal from a full hessian matrix.
+      hessian_0 = [0.12, 0.11]
+      hessian_1 = [0.07, 0.2]
+      hessian_2 = [0.2, 0.9]
+      hessian_3 = [0.13, 2.2]
+      hessians = array_ops.constant(
+          [hessian_0, hessian_1, hessian_2, hessian_3])
+
+      example_partitions = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32)
+      indices = array_ops.constant([[0, 0], [2, 0], [3, 0]], dtype=dtypes.int64)
+      values = array_ops.constant([0.52, 0.3, 0.52])
+      sparse_column = sparse_tensor.SparseTensor(indices, values, [4, 1])
+
+      gradient_shape = tensor_shape.TensorShape([2])
+      hessian_shape = tensor_shape.TensorShape([2])
+      class_id = -1
+
+      split_handler = ordinal_split_handler.SparseSplitHandler(
+          l1_regularization=0,
+          l2_regularization=2,
+          tree_complexity_regularization=0,
+          min_node_weight=0,
+          epsilon=0.01,
+          num_quantiles=2,
+          feature_column_group_id=0,
+          sparse_float_column=sparse_column,
+          init_stamp_token=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.DIAGONAL_HESSIAN)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          example_partitions,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready = split_handler.make_splits(0, 1, class_id)[0]
+
+      with ops.control_dependencies([are_splits_ready]):
+        update_2 = split_handler.update_stats_sync(
+            1,
+            example_partitions,
+            gradients,
+            hessians,
+            empty_gradients,
+            empty_hessians,
+            example_weights,
+            is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_2]):
+        are_splits_ready2, partitions, gains, splits = (
+            split_handler.make_splits(1, 2, class_id))
+        are_splits_ready, are_splits_ready2, partitions, gains, splits = (
+            sess.run([
+                are_splits_ready, are_splits_ready2, partitions, gains, splits
+            ]))
+
+    self.assertFalse(are_splits_ready)
+    self.assertTrue(are_splits_ready2)
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.sparse_float_binary_split_default_right
+    # Each leaf has 2 element vector.
+    self.assertEqual(2, len(left_child.value))
+    self.assertEqual(2, len(right_child.value))
+    self.assertEqual(0, split_node.split.feature_column)
+    self.assertAllClose(0.52, split_node.split.threshold)
+
+    split_info.ParseFromString(splits[1])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.sparse_float_binary_split_default_left
+    self.assertEqual(2, len(left_child.value))
+    self.assertEqual(0, split_node.split.feature_column)
+    self.assertAllClose(0.52, split_node.split.threshold)
+
+  def testGenerateFeatureSplitCandidatesInactive(self):
+    with self.test_session() as sess:
+      # The data looks like the following:
+      # Example |  Gradients    | Partition | Sparse Quantile |
+      # i0      |  (0.2, 0.12)  | 0         | 1               |
+      # i1      |  (-0.5, 0.07) | 0         | N/A             |
+      # i2      |  (1.2, 0.2)   | 0         | 0               |
+      # i3      |  (4.0, 0.13)  | 1         | 1               |
+      gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
+      hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
+      example_partitions = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32)
+      indices = array_ops.constant([[0, 0], [2, 0], [3, 0]], dtype=dtypes.int64)
+      values = array_ops.constant([0.52, 0.3, 0.52])
+      sparse_column = sparse_tensor.SparseTensor(indices, values, [4, 1])
+
+      gradient_shape = tensor_shape.scalar()
+      hessian_shape = tensor_shape.scalar()
+      class_id = -1
+
+      split_handler = ordinal_split_handler.SparseSplitHandler(
+          l1_regularization=0,
+          l2_regularization=2,
+          tree_complexity_regularization=0,
+          min_node_weight=0,
+          epsilon=0.01,
+          num_quantiles=2,
+          feature_column_group_id=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          sparse_float_column=sparse_column,
+          init_stamp_token=0,
+          multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          example_partitions,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, False]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready = split_handler.make_splits(0, 1, class_id)[0]
+
+      with ops.control_dependencies([are_splits_ready]):
+        update_2 = split_handler.update_stats_sync(
+            1,
+            example_partitions,
+            gradients,
+            hessians,
+            empty_gradients,
+            empty_hessians,
+            example_weights,
+            is_active=array_ops.constant([False, True]))
+      with ops.control_dependencies([update_2]):
+        are_splits_ready2, partitions, gains, splits = (
+            split_handler.make_splits(1, 2, class_id))
+        are_splits_ready, are_splits_ready2, partitions, gains, splits = (
+            sess.run([
+                are_splits_ready, are_splits_ready2, partitions, gains, splits
+            ]))
+
+    # During the first iteration, inequality split handlers are not going to
+    # have any splits. Make sure that we return not_ready in that case.
+    self.assertFalse(are_splits_ready)
+    self.assertTrue(are_splits_ready2)
+    # The handler was inactive so it shouldn't any splits.
+    self.assertEqual(len(partitions), 0)
+    self.assertEqual(len(gains), 0)
+    self.assertEqual(len(splits), 0)
+
+  def testEmpty(self):
+    with self.test_session() as sess:
+      indices = array_ops.constant([], dtype=dtypes.int64, shape=[0, 2])
+      # No values in this feature column in this mini-batch.
+      values = array_ops.constant([], dtype=dtypes.float32)
+      sparse_column = sparse_tensor.SparseTensor(indices, values, [4, 1])
+
+      gradient_shape = tensor_shape.scalar()
+      hessian_shape = tensor_shape.scalar()
+      class_id = -1
+
+      split_handler = ordinal_split_handler.SparseSplitHandler(
+          l1_regularization=0,
+          l2_regularization=2,
+          tree_complexity_regularization=0,
+          min_node_weight=0,
+          epsilon=0.01,
+          num_quantiles=2,
+          feature_column_group_id=0,
+          sparse_float_column=sparse_column,
+          init_stamp_token=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS)
+      resources.initialize_resources(resources.shared_resources()).run()
+      gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
+      hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
+      partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32)
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          partition_ids,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready = split_handler.make_splits(0, 1, class_id)[0]
+
+      with ops.control_dependencies([are_splits_ready]):
+        update_2 = split_handler.update_stats_sync(
+            1,
+            partition_ids,
+            gradients,
+            hessians,
+            empty_gradients,
+            empty_hessians,
+            example_weights,
+            is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_2]):
+        are_splits_ready2, partitions, gains, splits = (
+            split_handler.make_splits(1, 2, class_id))
+        are_splits_ready, are_splits_ready2, partitions, gains, splits = (
+            sess.run([
+                are_splits_ready, are_splits_ready2, partitions, gains, splits
+            ]))
+    self.assertFalse(are_splits_ready)
+    self.assertTrue(are_splits_ready2)
+    self.assertEqual(len(partitions), 0)
+    self.assertEqual(len(gains), 0)
+    self.assertEqual(len(splits), 0)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/class-partition-key.h b/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/class-partition-key.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1bef0278846e7ff6abc91e8c57f780af45e8b41
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/class-partition-key.h
@@ -0,0 +1,61 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_CLASS_PARTITION_KEY_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_CLASS_PARTITION_KEY_H_
+
+#include "tensorflow/core/lib/hash/hash.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+
+// Key into a specific class and partition to accumulate stats
+// for the specified feature id. A feature id can be the quantile
+// for a float feature or the hash/dictionary entry for a string feature.
+struct ClassPartitionKey {
+  ClassPartitionKey() : class_id(-1), partition_id(-1), feature_id(-1) {}
+
+  ClassPartitionKey(uint32 c, uint32 p, uint64 f)
+      : class_id(c), partition_id(p), feature_id(f) {}
+
+  bool operator==(const ClassPartitionKey& other) const {
+    return (feature_id == other.feature_id) &&
+           (partition_id == other.partition_id) && (class_id == other.class_id);
+  }
+
+  // Hasher for ClassPartitionKey.
+  struct Hash {
+    size_t operator()(const ClassPartitionKey& key) const {
+      uint64 class_partition =
+          (static_cast<uint64>(key.partition_id) << 32) | (key.class_id);
+      return Hash64Combine(class_partition, key.feature_id);
+    }
+  };
+
+  // Class to predict for, this is constant for binary tasks.
+  uint32 class_id;
+
+  // Tree partition defined by traversing the tree to the leaf.
+  uint32 partition_id;
+
+  // Feature Id within the feature column.
+  uint64 feature_id;
+};
+
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_CLASS_PARTITION_KEY_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator.h b/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator.h
new file mode 100644
index 0000000000000000000000000000000000000000..3814edb5675be74794a08e00becb649f8fc53fdb
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator.h
@@ -0,0 +1,82 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_FEATURE_STATS_ACCUMULATOR_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_FEATURE_STATS_ACCUMULATOR_H_
+
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/class-partition-key.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+
+// Feature stats accumulator to aggregate stats across various
+// feature columns. This class is thread compatible not safe, the user
+// must ensure proper synchronization if many threads update overlapping
+// feature columns.
+template <typename StatsType, typename Accumulator>
+class FeatureStatsAccumulator {
+ public:
+  using FeatureStats =
+      std::unordered_map<ClassPartitionKey, StatsType, ClassPartitionKey::Hash>;
+
+  explicit FeatureStatsAccumulator(size_t num_feature_columns,
+                                   Accumulator accumulator = Accumulator())
+      : accumulator_(accumulator), feature_column_stats_(num_feature_columns) {}
+
+  // Delete copy and assign.
+  FeatureStatsAccumulator(const FeatureStatsAccumulator& other) = delete;
+  FeatureStatsAccumulator& operator=(const FeatureStatsAccumulator& other) =
+      delete;
+
+  // Adds stats for the specified class, partition and feature within
+  // the desired slot.
+  void AddStats(uint32 slot_id, uint32 class_id, uint32 partition_id,
+                uint64 feature_id, const StatsType& stats) {
+    accumulator_(stats, &feature_column_stats_[slot_id][ClassPartitionKey(
+                            class_id, partition_id, feature_id)]);
+  }
+
+  // Retrieves stats for the specified class, partition and feature
+  // within the desired feature column. Default stats are returned if no match
+  // can be found. Note that the feature column index must be valid.
+  StatsType GetStats(uint32 slot_id, uint32 class_id, uint32 partition_id,
+                     uint64 feature_id) const {
+    auto it = feature_column_stats_[slot_id].find(
+        ClassPartitionKey(class_id, partition_id, feature_id));
+    return it != feature_column_stats_[slot_id].end() ? it->second
+                                                      : StatsType();
+  }
+
+  // Returns feature stats for a given slot.
+  FeatureStats GetFeatureStats(uint32 slot_id) const {
+    return feature_column_stats_[slot_id];
+  }
+
+ private:
+  // Accumulator method to use.
+  const Accumulator accumulator_;
+
+  // Vector of stats indexed by the feature column.
+  std::vector<FeatureStats> feature_column_stats_;
+};
+
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_FEATURE_STATS_ACCUMULATOR_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fdbd670120fb3eb6589b18c084a4d24c881b11f5
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator_test.cc
@@ -0,0 +1,90 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator.h"
+
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace {
+
+struct TestStats {
+  TestStats& operator+=(const TestStats& other) {
+    s1 += other.s1;
+    s2 += other.s2;
+    return (*this);
+  }
+
+  float s1;
+  float s2;
+};
+
+struct TestStatsAccumulator {
+  void operator()(const TestStats& from, TestStats* to) const { (*to) += from; }
+};
+#define EXPECT_STATS_EQ(val1, val2)  \
+  EXPECT_FLOAT_EQ(val1.s1, val2.s1); \
+  EXPECT_FLOAT_EQ(val1.s2, val2.s2);
+
+using FeatureStatsAccumulator =
+    FeatureStatsAccumulator<TestStats, TestStatsAccumulator>;
+
+class FeatureStatsAccumulatorTest : public ::testing::Test {};
+
+TEST_F(FeatureStatsAccumulatorTest, Empty) {
+  FeatureStatsAccumulator accumulator(1);
+  TestStats stats = {0, 0};
+
+  EXPECT_STATS_EQ(stats, accumulator.GetStats(0, 2, 1, 234));
+}
+
+TEST_F(FeatureStatsAccumulatorTest, OneFeatureOneGrad) {
+  FeatureStatsAccumulator accumulator(1);
+  TestStats stats = {-12.023f, 8.2f};
+  accumulator.AddStats(0, 2, 1, 234, stats);
+
+  EXPECT_STATS_EQ(stats, accumulator.GetStats(0, 2, 1, 234));
+}
+
+TEST_F(FeatureStatsAccumulatorTest, OneFeatureAggregateGrad) {
+  FeatureStatsAccumulator accumulator(1);
+  TestStats stats1 = {-12.023f, 8.2f};
+  accumulator.AddStats(0, 2, 1, 234, stats1);
+  TestStats stats2 = {4.46f, 1.9f};
+  accumulator.AddStats(0, 2, 1, 234, stats2);
+  TestStats expected = {-7.563f, 10.1f};
+  EXPECT_STATS_EQ(expected, accumulator.GetStats(0, 2, 1, 234));
+}
+
+TEST_F(FeatureStatsAccumulatorTest, TwoFeaturesOneGrad) {
+  FeatureStatsAccumulator accumulator(1);
+  TestStats stats1 = {-12.023f, 8.2f};
+  accumulator.AddStats(0, 1, 0, 34, stats1);
+  TestStats stats2 = {4.46f, 1.9f};
+  accumulator.AddStats(0, 1, 0, 91, stats2);
+
+  EXPECT_STATS_EQ(stats1, accumulator.GetStats(0, 1, 0, 34));
+  EXPECT_STATS_EQ(stats2, accumulator.GetStats(0, 1, 0, 91));
+}
+
+}  // namespace
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.cc b/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1f84b1b06a62ced34cc5c4048f0ec7d444087851
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.cc
@@ -0,0 +1,90 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.h"
+#include "tensorflow/contrib/boosted_trees/lib/utils/parallel_for.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+
+void ExamplePartitioner::UpdatePartitions(
+    const boosted_trees::trees::DecisionTreeConfig& tree_config,
+    const boosted_trees::utils::BatchFeatures& features,
+    const int desired_parallelism, thread::ThreadPool* const thread_pool,
+    int32* example_partition_ids) {
+  // Get batch size.
+  const int64 batch_size = features.batch_size();
+  if (batch_size <= 0) {
+    return;
+  }
+
+  // Lambda for doing a block of work.
+  auto partition_examples_subset = [&tree_config, &features,
+                                    &example_partition_ids](const int64 start,
+                                                            const int64 end) {
+    if (TF_PREDICT_TRUE(tree_config.nodes_size() > 0)) {
+      auto examples_iterable = features.examples_iterable(start, end);
+      for (const auto& example : examples_iterable) {
+        int32& example_partition = example_partition_ids[example.example_idx];
+        example_partition = boosted_trees::trees::DecisionTree::Traverse(
+            tree_config, example_partition, example);
+        DCHECK_GE(example_partition, 0);
+      }
+    } else {
+      std::fill(example_partition_ids + start, example_partition_ids + end, 0);
+    }
+  };
+
+  // Parallelize partitioning over the batch.
+  boosted_trees::utils::ParallelFor(batch_size, desired_parallelism,
+                                    thread_pool, partition_examples_subset);
+}
+
+void ExamplePartitioner::PartitionExamples(
+    const boosted_trees::trees::DecisionTreeConfig& tree_config,
+    const boosted_trees::utils::BatchFeatures& features,
+    const int desired_parallelism, thread::ThreadPool* const thread_pool,
+    int32* example_partition_ids) {
+  // Get batch size.
+  const int64 batch_size = features.batch_size();
+  if (batch_size <= 0) {
+    return;
+  }
+
+  // Lambda for doing a block of work.
+  auto partition_examples_subset = [&tree_config, &features,
+                                    &example_partition_ids](const int64 start,
+                                                            const int64 end) {
+    if (TF_PREDICT_TRUE(tree_config.nodes_size() > 0)) {
+      auto examples_iterable = features.examples_iterable(start, end);
+      for (const auto& example : examples_iterable) {
+        uint32 partition = boosted_trees::trees::DecisionTree::Traverse(
+            tree_config, 0, example);
+        example_partition_ids[example.example_idx] = partition;
+        DCHECK_GE(partition, 0);
+      }
+    } else {
+      std::fill(example_partition_ids + start, example_partition_ids + end, 0);
+    }
+  };
+
+  // Parallelize partitioning over the batch.
+  boosted_trees::utils::ParallelFor(batch_size, desired_parallelism,
+                                    thread_pool, partition_examples_subset);
+}
+
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.h b/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.h
new file mode 100644
index 0000000000000000000000000000000000000000..aed0d9fdac108dff4576cc1563dae420340387be
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.h
@@ -0,0 +1,53 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_PARTITIONERS_EXAMPLE_PARTITIONER_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_PARTITIONERS_EXAMPLE_PARTITIONER_H_
+
+#include <vector>
+#include "tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h"
+#include "tensorflow/contrib/boosted_trees/lib/utils/batch_features.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+
+// Partitions examples based on the path through the current tree.
+class ExamplePartitioner {
+ public:
+  // Updates partitions from previous set using the current tree structure by
+  // traversing sub-roots for each example. This method can be optionally
+  // parallelized using the passed thread pool.
+  static void UpdatePartitions(const trees::DecisionTreeConfig& tree_config,
+                               const utils::BatchFeatures& features,
+                               int desired_parallelism,
+                               thread::ThreadPool* const thread_pool,
+                               int32* example_partition_ids);
+
+  // Partitions examples using the current tree structure by traversing from
+  // root for each example. This method can be optionally parallelized using
+  // the passed thread pool.
+  static void PartitionExamples(const trees::DecisionTreeConfig& tree_config,
+                                const utils::BatchFeatures& features,
+                                int desired_parallelism,
+                                thread::ThreadPool* const thread_pool,
+                                int32* example_partition_ids);
+};
+
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_PARTITIONERS_EXAMPLE_PARTITIONER_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0d5326aa456e5cbc44872b77dcdd9dda8c52bde6
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner_test.cc
@@ -0,0 +1,99 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace {
+
+class ExamplePartitionerTest : public ::testing::Test {
+ protected:
+  ExamplePartitionerTest()
+      : thread_pool_(tensorflow::Env::Default(), "test_pool", 2),
+        batch_features_(2) {
+    dense_matrix_ = test::AsTensor<float>({7.0f, -2.0f}, {2, 1});
+    TF_EXPECT_OK(
+        batch_features_.Initialize({dense_matrix_}, {}, {}, {}, {}, {}, {}));
+  }
+
+  thread::ThreadPool thread_pool_;
+  Tensor dense_matrix_;
+  boosted_trees::utils::BatchFeatures batch_features_;
+};
+
+TEST_F(ExamplePartitionerTest, EmptyTree) {
+  boosted_trees::trees::DecisionTreeConfig tree_config;
+  std::vector<int32> example_partition_ids(2);
+  ExamplePartitioner::UpdatePartitions(tree_config, batch_features_, 1,
+                                       &thread_pool_,
+                                       example_partition_ids.data());
+  EXPECT_EQ(0, example_partition_ids[0]);
+  EXPECT_EQ(0, example_partition_ids[1]);
+}
+
+TEST_F(ExamplePartitionerTest, UpdatePartitions) {
+  // Create tree with one split.
+  // TODO(salehay): figure out if we can use PARSE_TEXT_PROTO.
+  boosted_trees::trees::DecisionTreeConfig tree_config;
+  auto* split = tree_config.add_nodes()->mutable_dense_float_binary_split();
+  split->set_feature_column(0);
+  split->set_threshold(3.0f);
+  split->set_left_id(1);
+  split->set_right_id(2);
+  tree_config.add_nodes()->mutable_leaf();
+  tree_config.add_nodes()->mutable_leaf();
+
+  // Partition input:
+  // Instance 1 has !(7 <= 3) => go right => leaf 2.
+  // Instance 2 has (-2 <= 3) => go left => leaf 1.
+  std::vector<int32> example_partition_ids(2);
+  ExamplePartitioner::UpdatePartitions(tree_config, batch_features_, 1,
+                                       &thread_pool_,
+                                       example_partition_ids.data());
+  EXPECT_EQ(2, example_partition_ids[0]);
+  EXPECT_EQ(1, example_partition_ids[1]);
+}
+
+TEST_F(ExamplePartitionerTest, PartitionExamples) {
+  // Create tree with one split.
+  // TODO(salehay): figure out if we can use PARSE_TEXT_PROTO.
+  boosted_trees::trees::DecisionTreeConfig tree_config;
+  auto* split = tree_config.add_nodes()->mutable_dense_float_binary_split();
+  split->set_feature_column(0);
+  split->set_threshold(3.0f);
+  split->set_left_id(1);
+  split->set_right_id(2);
+  tree_config.add_nodes()->mutable_leaf();
+  tree_config.add_nodes()->mutable_leaf();
+
+  // Partition input:
+  // Instance 1 has !(7 <= 3) => go right => leaf 2.
+  // Instance 2 has (-2 <= 3) => go left => leaf 1.
+  std::vector<int32> example_partition_ids(2);
+  ExamplePartitioner::PartitionExamples(tree_config, batch_features_, 1,
+                                        &thread_pool_,
+                                        example_partition_ids.data());
+  EXPECT_EQ(2, example_partition_ids[0]);
+  EXPECT_EQ(1, example_partition_ids[1]);
+}
+
+}  // namespace
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b880cf2c47989b1434f17802befb7dd7c248b36f
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.cc
@@ -0,0 +1,59 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+
+void BiasFeatureColumnHandler::AggregateGradientStats(
+    const std::vector<int32>& example_partition_ids,
+    const Tensor& example_first_order_gradients,
+    const Tensor& example_second_order_gradients,
+    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
+        gradient_stats_accumulator) const {
+  // Pass over all examples and aggregate gradient stats for each sub-root.
+  for (int64 example_idx = 0; example_idx < batch_size_; ++example_idx) {
+    auto partition_id = example_partition_ids[example_idx];
+    gradient_stats_accumulator->AddStats(
+        slot_id_, class_id_, partition_id, kBiasFeatureId,
+        GradientStats(example_first_order_gradients,
+                      example_second_order_gradients, example_idx));
+  }
+}
+
+void BiasFeatureColumnHandler::GenerateFeatureSplitCandidates(
+    const LearnerConfig& learner_config, const std::vector<int32>& roots,
+    const std::vector<NodeStats>& root_stats,
+    const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
+        gradient_stats_accumulator,
+    std::vector<FeatureSplitCandidate>* split_candidates) const {
+  split_candidates->clear();
+  split_candidates->reserve(roots.size());
+  boosted_trees::trees::TreeNode tree_node;
+  for (size_t root_idx = 0; root_idx < roots.size(); ++root_idx) {
+    const NodeStats& root_node_stats = root_stats[root_idx];
+    tree_node.Clear();
+    root_node_stats.FillLeaf(class_id_, tree_node.mutable_leaf());
+    split_candidates->emplace_back(slot_id_, tree_node,
+                                   SplitStats(learner_config, root_node_stats));
+  }
+}
+
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c0f99185a63db33a391a98fa16f37bef99507c9
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.h
@@ -0,0 +1,57 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_H_  // NOLINT
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_H_  // NOLINT
+
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+
+// Handler for a bias feature column in the single class case.
+// This handler is useful even if we don't introduce a bias feature because
+// it allows us to aggregate stats per partition which in turn allows us
+// to compute node stats for each root to split.
+class BiasFeatureColumnHandler : public FeatureColumnHandler {
+ public:
+  BiasFeatureColumnHandler(const uint32 class_id, const uint32 slot_id,
+                           const int64 batch_size)
+      : FeatureColumnHandler(class_id, slot_id, batch_size) {}
+
+  void AggregateGradientStats(
+      const std::vector<int32>& example_partition_ids,
+      const Tensor& example_first_order_gradients,
+      const Tensor& example_second_order_gradients,
+      FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
+          gradient_stats_accumulator) const override;
+
+  void GenerateFeatureSplitCandidates(
+      const LearnerConfig& learner_config, const std::vector<int32>& roots,
+      const std::vector<NodeStats>& root_stats,
+      const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
+          gradient_stats_accumulator,
+      std::vector<FeatureSplitCandidate>* split_candidates) const override;
+
+  static constexpr auto kBiasFeatureId = 0;
+};
+
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_H_  // NOLINT
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..82664aed72d99aa3e84d5f3f38bff8ec5e4ca099
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler_test.cc
@@ -0,0 +1,134 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.h"
+
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+namespace {
+
+using boosted_trees::learner::LearnerConfig;
+
+const auto kClassId = 7;
+const auto kSlotId = 0;
+const auto kBatchSize = 4;
+
+using FeatureStatsAccumulator =
+    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>;
+
+class BiasFeatureColumnHandlerTest : public ::testing::Test {
+ protected:
+  BiasFeatureColumnHandlerTest()
+      : example_first_order_gradients_(
+            test::AsTensor<float>({0.2f, -0.5f, 1.2f, 4.0f}, {4})),
+        example_second_order_gradients_(
+            test::AsTensor<float>({0.12f, 0.07f, 0.2f, 0.13f}, {4})),
+        example_partitions_({0, 0, 1, 3}) {
+    // Set L2 regularization.
+    learner_config_.mutable_regularization()->set_l2(2.0f);
+
+    // Create handler.
+    handler_.reset(new BiasFeatureColumnHandler(kClassId, kSlotId, kBatchSize));
+  }
+
+  LearnerConfig learner_config_;
+  const Tensor example_first_order_gradients_;
+  const Tensor example_second_order_gradients_;
+  const std::vector<int32> example_partitions_;
+  std::unique_ptr<BiasFeatureColumnHandler> handler_;
+};
+
+TEST_F(BiasFeatureColumnHandlerTest, AggregateGradientStats) {
+  // Create handler.
+  FeatureStatsAccumulator accumulator(1);
+  handler_->AggregateGradientStats(
+      example_partitions_, example_first_order_gradients_,
+      example_second_order_gradients_, &accumulator);
+
+  // Check stats for each partition.
+  // Partition 0.
+  EXPECT_GRADIENT_STATS_EQ(
+      GradientStats(-0.3f, 0.19f),
+      accumulator.GetStats(kSlotId, kClassId, 0,
+                           BiasFeatureColumnHandler::kBiasFeatureId));
+  // Partition 1.
+  EXPECT_GRADIENT_STATS_EQ(
+      GradientStats(1.2f, 0.2f),
+      accumulator.GetStats(kSlotId, kClassId, 1,
+                           BiasFeatureColumnHandler::kBiasFeatureId));
+  // Partition 2.
+  EXPECT_GRADIENT_STATS_EQ(
+      GradientStats(0.0f, 0.0f),
+      accumulator.GetStats(kSlotId, kClassId, 2,
+                           BiasFeatureColumnHandler::kBiasFeatureId));
+  // Partition 3.
+  EXPECT_GRADIENT_STATS_EQ(
+      GradientStats(4.0f, 0.13f),
+      accumulator.GetStats(kSlotId, kClassId, 3,
+                           BiasFeatureColumnHandler::kBiasFeatureId));
+}
+
+TEST_F(BiasFeatureColumnHandlerTest, GenerateFeatureSplitCandidates) {
+  // Create handler.
+  FeatureStatsAccumulator accumulator(1);
+  handler_->AggregateGradientStats(
+      example_partitions_, example_first_order_gradients_,
+      example_second_order_gradients_, &accumulator);
+
+  // Get feature split candidates for two roots 0 and 3.
+  // Root 0 has zero gain and root 3 has the same gain as the leaf.
+  const std::vector<int32> roots = {0, 3};
+  const std::vector<NodeStats>& root_stats = {
+      NodeStats(1), NodeStats(learner_config_, GradientStats(4.0f, 0.13f))};
+  std::vector<FeatureSplitCandidate> split_candidates;
+  handler_->GenerateFeatureSplitCandidates(learner_config_, roots, root_stats,
+                                           accumulator, &split_candidates);
+  // Expect two candidate splits (one per root).
+  EXPECT_EQ(2, split_candidates.size());
+
+  // Verify first candidate for root 0, gain is expected to be the same as
+  // the left child since the root node gain is zero.
+  const SplitStats expected_split_stats0(learner_config_, root_stats[0]);
+  EXPECT_SPLIT_STATS_EQ(expected_split_stats0, split_candidates[0].split_stats);
+  const auto& tree_node0 = split_candidates[0].tree_node;
+  EXPECT_EQ(boosted_trees::trees::TreeNode::kLeaf, tree_node0.node_case());
+  EXPECT_EQ(1, tree_node0.leaf().sparse_vector().index_size());
+  EXPECT_EQ(kClassId, tree_node0.leaf().sparse_vector().index(0));
+  EXPECT_EQ(1, tree_node0.leaf().sparse_vector().value_size());
+  EXPECT_EQ(root_stats[0].weight_contribution[0],
+            tree_node0.leaf().sparse_vector().value(0));
+
+  // Verify second candidate for root 3, gain is expected to be zero as
+  // the left child gain is equal to the parent gain.
+  const SplitStats expected_split_stats1(learner_config_, root_stats[1]);
+  EXPECT_SPLIT_STATS_EQ(expected_split_stats1, split_candidates[1].split_stats);
+  const auto& tree_node1 = split_candidates[1].tree_node;
+  EXPECT_EQ(boosted_trees::trees::TreeNode::kLeaf, tree_node1.node_case());
+  EXPECT_EQ(1, tree_node1.leaf().sparse_vector().index_size());
+  EXPECT_EQ(kClassId, tree_node1.leaf().sparse_vector().index(0));
+  EXPECT_EQ(1, tree_node1.leaf().sparse_vector().value_size());
+  EXPECT_EQ(root_stats[1].weight_contribution[0],
+            tree_node1.leaf().sparse_vector().value(0));
+}
+
+}  // namespace
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3a6c409f846c9ca0bd6b5101e96447642b949978
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.cc
@@ -0,0 +1,140 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.h"
+
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+
+namespace {
+
+// Creates a categorical Id split node without assigning children.
+boosted_trees::trees::TreeNode CreateCategoricalIdNode(
+    const int32 feature_column, const int32 id) {
+  boosted_trees::trees::TreeNode split_node;
+  auto* split = split_node.mutable_categorical_id_binary_split();
+  split->set_feature_column(feature_column);
+  split->set_feature_id(id);
+  return split_node;
+}
+
+}  // namespace
+
+void CategoricalFeatureColumnHandler::AggregateGradientStats(
+    const std::vector<int32>& example_partition_ids,
+    const Tensor& example_first_order_gradients,
+    const Tensor& example_second_order_gradients,
+    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
+        gradient_stats_accumulator) const {
+  // Pass over all rows and aggregate gradient stats for each feature id.
+  const int64 num_rows = indices_.dimension(0);
+  for (int64 row_idx = 0; row_idx < num_rows; ++row_idx) {
+    auto example_idx = indices_(row_idx, 0);
+    auto feature_id = values_(row_idx);
+    const GradientStats norm_gradient_stats(example_first_order_gradients,
+                                            example_second_order_gradients,
+                                            example_idx);
+    auto partition_id = example_partition_ids[example_idx];
+    gradient_stats_accumulator->AddStats(slot_id_, class_id_, partition_id,
+                                         feature_id, norm_gradient_stats);
+  }
+}
+
+void CategoricalFeatureColumnHandler::GenerateFeatureSplitCandidates(
+    const LearnerConfig& learner_config, const std::vector<int32>& roots,
+    const std::vector<NodeStats>& root_stats,
+    const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
+        gradient_stats_accumulator,
+    std::vector<FeatureSplitCandidate>* split_candidates) const {
+  // Build a reverse lookup of partition id to root idx.
+  std::unordered_map<int32, size_t> partition_id_to_root_idx;
+  partition_id_to_root_idx.reserve(roots.size());
+  for (size_t root_idx = 0; root_idx < roots.size(); ++root_idx) {
+    partition_id_to_root_idx[roots[root_idx]] = root_idx;
+  }
+
+  // Initialize split candidates.
+  split_candidates->clear();
+  if (!roots.empty()) {
+    FeatureSplitCandidate empty_candidate(
+        root_stats[0].weight_contribution.size());
+    split_candidates->resize(roots.size(), empty_candidate);
+  }
+  for (auto& split_candidate : *split_candidates) {
+    split_candidate.split_stats.gain = std::numeric_limits<float>::lowest();
+  }
+
+  // Evaluate split candidates for every root as each is a separate
+  // logical partition over the examples.
+  // Then for each root, we evaluate every feature id as an equality split
+  // and pick the highest split gain.
+  for (const auto& entry :
+       gradient_stats_accumulator.GetFeatureStats(slot_id_)) {
+    DCHECK_EQ(entry.first.class_id, class_id_);
+
+    // Get partition id and root node stats.
+    const int32 partition_id = entry.first.partition_id;
+    auto root_idx_it = partition_id_to_root_idx.find(partition_id);
+    if (root_idx_it == partition_id_to_root_idx.end()) {
+      // Inactive partition.
+      continue;
+    }
+    size_t root_idx = root_idx_it->second;
+    const NodeStats& root_node_stats = root_stats[root_idx];
+
+    // Get gradient stats.
+    const auto& left_gradient_stats = entry.second;
+    auto right_gradient_stats =
+        root_node_stats.gradient_stats - left_gradient_stats;
+
+    // Get node stats.
+    NodeStats left_node_stats(learner_config, left_gradient_stats);
+    NodeStats right_node_stats(learner_config, right_gradient_stats);
+
+    // Generate split candidate and update best split candidate for the
+    // current root if needed.
+    FeatureSplitCandidate split_candidate(
+        slot_id_,
+        CreateCategoricalIdNode(feature_column_, entry.first.feature_id),
+        SplitStats(learner_config, root_node_stats, left_node_stats,
+                   right_node_stats));
+    FeatureSplitCandidate& best_split_candidate = (*split_candidates)[root_idx];
+    if (TF_PREDICT_FALSE(best_split_candidate.tree_node.node_case() ==
+                         boosted_trees::trees::TreeNode::NODE_NOT_SET)) {
+      // Always replace candidates with no node set.
+      best_split_candidate = std::move(split_candidate);
+    } else if (TF_PREDICT_FALSE(split_candidate.split_stats.gain ==
+                                best_split_candidate.split_stats.gain)) {
+      // Tie break on feature id.
+      auto best_split_feature_id =
+          best_split_candidate.tree_node.categorical_id_binary_split()
+              .feature_id();
+      if (entry.first.feature_id < best_split_feature_id) {
+        best_split_candidate = std::move(split_candidate);
+      }
+    } else if (split_candidate.split_stats.gain >
+               best_split_candidate.split_stats.gain) {
+      best_split_candidate = std::move(split_candidate);
+    }
+  }
+}
+
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.h
new file mode 100644
index 0000000000000000000000000000000000000000..ef964ba716c6adf9cf9c291cca5f52f7a6efe26f
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.h
@@ -0,0 +1,64 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_CATEGORICAL_FEATURE_COLUMN_HANDLER_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_CATEGORICAL_FEATURE_COLUMN_HANDLER_H_
+
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+
+// Handler for a categorical feature column in the single class case.
+class CategoricalFeatureColumnHandler : public FeatureColumnHandler {
+ public:
+  CategoricalFeatureColumnHandler(const int32 class_id, const int32 slot_id,
+                                  const int64 batch_size,
+                                  const int32 feature_column,
+                                  TTypes<int64>::ConstMatrix indices,
+                                  TTypes<int64>::ConstVec values)
+      : FeatureColumnHandler(class_id, slot_id, batch_size),
+        feature_column_(feature_column),
+        indices_(indices),
+        values_(values) {}
+
+  void AggregateGradientStats(
+      const std::vector<int32>& example_partition_ids,
+      const Tensor& example_first_order_gradients,
+      const Tensor& example_second_order_gradients,
+      FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
+          gradient_stats_accumulator) const override;
+
+  void GenerateFeatureSplitCandidates(
+      const LearnerConfig& learner_config, const std::vector<int32>& roots,
+      const std::vector<NodeStats>& root_stats,
+      const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
+          gradient_stats_accumulator,
+      std::vector<FeatureSplitCandidate>* split_candidates) const override;
+
+ protected:
+  const int32 feature_column_;
+  TTypes<int64>::ConstMatrix indices_;
+  TTypes<int64>::ConstVec values_;
+};
+
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_CATEGORICAL_FEATURE_COLUMN_HANDLER_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..abd72384648dc3ac5d7f00e3b6d89fea3eb09afb
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler_test.cc
@@ -0,0 +1,165 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.h"
+
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+namespace {
+
+using boosted_trees::learner::LearnerConfig;
+
+const auto kClassId = 7;
+const auto kSlotId = 0;
+const auto kBatchSize = 4;
+const auto kFeatureColumn = 3;
+
+using FeatureStatsAccumulator =
+    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>;
+
+class CategoricalFeatureColumnHandlerTest : public ::testing::Test {
+ protected:
+  // The data looks like the following:
+  // Example |  Gradients    | Partition | Feature Id |
+  // i0      |  (0.2, 0.12)  |     0     |    1,2     |
+  // i1      |  (-0.5, 0.07) |     0     |            |
+  // i2      |  (1.2, 0.2)   |     0     |     2      |
+  // i3      |  (4.0, 0.13)  |     1     |     0      |
+  CategoricalFeatureColumnHandlerTest()
+      : example_first_order_gradients_(
+            test::AsTensor<float>({0.2f, -0.5f, 1.2f, 4.0f}, {4})),
+        example_second_order_gradients_(
+            test::AsTensor<float>({0.12f, 0.07f, 0.2f, 0.13f}, {4})),
+        example_partitions_({0, 0, 0, 1}),
+        indices_(test::AsTensor<int64>({0, 0, 0, 1, 2, 0, 3, 0}, {4, 2})),
+        values_(test::AsTensor<int64>({1, 2, 2, 0}, {4})) {
+    // Set L2 regularization.
+    learner_config_.mutable_regularization()->set_l2(2.0f);
+
+    // Create handler.
+    handler_.reset(new CategoricalFeatureColumnHandler(
+        kClassId, kSlotId, kBatchSize, kFeatureColumn, indices_.matrix<int64>(),
+        values_.vec<int64>()));
+  }
+
+  LearnerConfig learner_config_;
+  const Tensor example_first_order_gradients_;
+  const Tensor example_second_order_gradients_;
+  const std::vector<int32> example_partitions_;
+  const Tensor indices_;
+  const Tensor values_;
+  std::unique_ptr<FeatureColumnHandler> handler_;
+};
+
+TEST_F(CategoricalFeatureColumnHandlerTest, AggregateGradientStats) {
+  // Create handler.
+  FeatureStatsAccumulator accumulator(1);
+  handler_->AggregateGradientStats(
+      example_partitions_, example_first_order_gradients_,
+      example_second_order_gradients_, &accumulator);
+
+  // Check stats for each partition and feature.
+  // Partition 0, Feature 0.
+  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.0f, 0.0f),
+                           accumulator.GetStats(kSlotId, kClassId, 0, 0));
+  // Partition 0, Feature 1.
+  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.2f, 0.12f),
+                           accumulator.GetStats(kSlotId, kClassId, 0, 1));
+  // Partition 0, Feature 2.
+  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.2f + 1.2f, 0.12f + 0.2f),
+                           accumulator.GetStats(kSlotId, kClassId, 0, 2));
+
+  // Partition 1, Feature 0.
+  EXPECT_GRADIENT_STATS_EQ(GradientStats(4.0f, 0.13f),
+                           accumulator.GetStats(kSlotId, kClassId, 1, 0));
+  // Partition 1, Feature 1.
+  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.0f, 0.0f),
+                           accumulator.GetStats(kSlotId, kClassId, 1, 1));
+  // Partition 1, Feature 2.
+  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.0f, 0.0f),
+                           accumulator.GetStats(kSlotId, kClassId, 1, 2));
+}
+
+TEST_F(CategoricalFeatureColumnHandlerTest, GenerateFeatureSplitCandidates) {
+  // Create handler.
+  FeatureStatsAccumulator accumulator(1);
+  handler_->AggregateGradientStats(
+      example_partitions_, example_first_order_gradients_,
+      example_second_order_gradients_, &accumulator);
+
+  // Get feature split candidates for two roots 0 and 1.
+  // The root stats are derived from the per-partition total gradient stats.
+  const std::vector<int32> roots = {0, 1, 5};
+  const std::vector<NodeStats>& root_stats = {
+      NodeStats(learner_config_, GradientStats(0.9f, 0.39f)),
+      NodeStats(learner_config_, GradientStats(4.0f, 0.13f)), NodeStats(1)};
+  std::vector<FeatureSplitCandidate> split_candidates;
+  handler_->GenerateFeatureSplitCandidates(learner_config_, roots, root_stats,
+                                           accumulator, &split_candidates);
+  // Expect three candidate splits (one per root).
+  EXPECT_EQ(3, split_candidates.size());
+
+  // Verify candidate for root 0, the best split occurs when we route
+  // example i0, i2 left and i1 right.
+  const NodeStats expected_left_node0(learner_config_,
+                                      GradientStats(0.2f + 1.2f, 0.12f + 0.2f));
+  const NodeStats expected_right_node0(
+      learner_config_,
+      root_stats[0].gradient_stats - expected_left_node0.gradient_stats);
+  const SplitStats expected_split_stats0(learner_config_, root_stats[0],
+                                         expected_left_node0,
+                                         expected_right_node0);
+  EXPECT_SPLIT_STATS_EQ(expected_split_stats0, split_candidates[0].split_stats);
+
+  const auto& tree_node0 = split_candidates[0].tree_node;
+  EXPECT_EQ(
+      boosted_trees::trees::TreeNode::kCategoricalIdBinarySplitFieldNumber,
+      tree_node0.node_case());
+  const auto& split0 = tree_node0.categorical_id_binary_split();
+  EXPECT_EQ(2, split0.feature_id());
+  EXPECT_EQ(kFeatureColumn, split0.feature_column());
+
+  // Verify candidate for root 1, there's only one active feature here
+  // so zero gain is expected.
+  const NodeStats expected_left_node1(learner_config_,
+                                      root_stats[1].gradient_stats);
+  const NodeStats expected_right_node1(learner_config_, GradientStats(0, 0));
+  const SplitStats expected_split_stats1(learner_config_, root_stats[1],
+                                         expected_left_node1,
+                                         expected_right_node1);
+  EXPECT_SPLIT_STATS_EQ(expected_split_stats1, split_candidates[1].split_stats);
+  const auto& tree_node1 = split_candidates[1].tree_node;
+  EXPECT_EQ(
+      boosted_trees::trees::TreeNode::kCategoricalIdBinarySplitFieldNumber,
+      tree_node1.node_case());
+  const auto& split1 = tree_node1.categorical_id_binary_split();
+  EXPECT_EQ(0, split1.feature_id());
+  EXPECT_EQ(kFeatureColumn, split1.feature_column());
+
+  // Verify there are no candidate splits for root 5.
+  const auto& tree_node2 = split_candidates[2].tree_node;
+  EXPECT_EQ(boosted_trees::trees::TreeNode::NODE_NOT_SET,
+            tree_node2.node_case());
+}
+
+}  // namespace
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ca7bb71e7d0b0fc945ee29092b1e36022d4c0943
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.cc
@@ -0,0 +1,116 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+
+namespace {
+
+// Creates a dense split node without assigning children.
+boosted_trees::trees::TreeNode CreateDenseSplitNode(const int32 feature_column,
+                                                    const float threshold) {
+  boosted_trees::trees::TreeNode split_node;
+  auto* split = split_node.mutable_dense_float_binary_split();
+  split->set_feature_column(feature_column);
+  split->set_threshold(threshold);
+  return split_node;
+}
+
+}  // namespace
+
+void DenseQuantizedFeatureColumnHandler::AggregateGradientStats(
+    const std::vector<int32>& example_partition_ids,
+    const Tensor& example_first_order_gradients,
+    const Tensor& example_second_order_gradients,
+    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
+        gradient_stats_accumulator) const {
+  // Pass over all examples and aggregate gradient stats for each partition
+  // and quantized feature bucket.
+  for (int64 example_idx = 0; example_idx < batch_size_; ++example_idx) {
+    auto partition_id = example_partition_ids[example_idx];
+    auto feature_id = dense_quantized_values_(example_idx);
+    gradient_stats_accumulator->AddStats(
+        slot_id_, class_id_, partition_id, feature_id,
+        GradientStats(example_first_order_gradients,
+                      example_second_order_gradients, example_idx));
+  }
+}
+
+void DenseQuantizedFeatureColumnHandler::GenerateFeatureSplitCandidates(
+    const LearnerConfig& learner_config, const std::vector<int32>& roots,
+    const std::vector<NodeStats>& root_stats,
+    const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
+        gradient_stats_accumulator,
+    std::vector<FeatureSplitCandidate>* split_candidates) const {
+  // Evaluate split candidates for every root as each is a separate
+  // logical partition over the examples.
+  // Then for each root, we do a forward-only pass over the quantized
+  // feature buckets accumulating gradients from left to right.
+  // Split gains are evaluated at every threshold and the best split is picked.
+  split_candidates->clear();
+  split_candidates->reserve(roots.size());
+  for (size_t root_idx = 0; root_idx < roots.size(); ++root_idx) {
+    // Get partition Id and root node stats.
+    const int32 partition_id = roots[root_idx];
+    const NodeStats& root_node_stats = root_stats[root_idx];
+
+    // Forward left to right pass over quantiles.
+    GradientStats left_gradient_stats;
+    GradientStats right_gradient_stats(root_node_stats.gradient_stats);
+    FeatureSplitCandidate best_split_candidate(
+        root_node_stats.weight_contribution.size());
+    best_split_candidate.split_stats.gain =
+        std::numeric_limits<float>::lowest();
+    for (int bucket_id = 0; bucket_id < dense_quantiles_.size(); ++bucket_id) {
+      // Get gradient stats.
+      auto gradient_stats = gradient_stats_accumulator.GetStats(
+          slot_id_, class_id_, partition_id, bucket_id);
+      if (gradient_stats.IsZero()) {
+        continue;
+      }
+
+      // Update gradient stats.
+      left_gradient_stats += gradient_stats;
+      right_gradient_stats =
+          root_node_stats.gradient_stats - left_gradient_stats;
+
+      // Get node stats
+      NodeStats left_node_stats(learner_config, left_gradient_stats);
+      NodeStats right_node_stats(learner_config, right_gradient_stats);
+
+      // Generate split candidate.
+      const float threshold = dense_quantiles_(bucket_id);
+      FeatureSplitCandidate split_candidate(
+          slot_id_, CreateDenseSplitNode(dense_feature_column_, threshold),
+          SplitStats(learner_config, root_node_stats, left_node_stats,
+                     right_node_stats));
+      if (split_candidate.split_stats.gain >
+          best_split_candidate.split_stats.gain) {
+        best_split_candidate = std::move(split_candidate);
+      }
+    }
+
+    // Add best candidate for partition.
+    split_candidates->push_back(std::move(best_split_candidate));
+  }
+}
+
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f3858e4d8c406e9ec3ae7079b241e94ef4aa35c
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.h
@@ -0,0 +1,62 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_DENSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_DENSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
+
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+
+// Handler for a dense quantized feature column in the single class case.
+class DenseQuantizedFeatureColumnHandler : public FeatureColumnHandler {
+ public:
+  DenseQuantizedFeatureColumnHandler(
+      const int32 class_id, const int32 slot_id, const int64 batch_size,
+      const int32 dense_feature_column, TTypes<float>::ConstVec dense_quantiles,
+      TTypes<int32>::ConstVec dense_quantized_values)
+      : FeatureColumnHandler(class_id, slot_id, batch_size),
+        dense_feature_column_(dense_feature_column),
+        dense_quantiles_(dense_quantiles),
+        dense_quantized_values_(dense_quantized_values) {}
+
+  void AggregateGradientStats(
+      const std::vector<int32>& example_partition_ids,
+      const Tensor& example_first_order_gradients,
+      const Tensor& example_second_order_gradients,
+      FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
+          gradient_stats_accumulator) const override;
+
+  void GenerateFeatureSplitCandidates(
+      const LearnerConfig& learner_config, const std::vector<int32>& roots,
+      const std::vector<NodeStats>& root_stats,
+      const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
+          gradient_stats_accumulator,
+      std::vector<FeatureSplitCandidate>* split_candidates) const override;
+
+ protected:
+  const int32 dense_feature_column_;
+  TTypes<float>::ConstVec dense_quantiles_;
+  TTypes<int32>::ConstVec dense_quantized_values_;
+};
+
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_DENSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..396f48e5321f1012571bcfb2f3f013cf94ffd987
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc
@@ -0,0 +1,155 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.h"
+
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+namespace {
+
+using boosted_trees::learner::LearnerConfig;
+
+const auto kClassId = 1;
+const auto kSlotId = 0;
+const auto kBatchSize = 4;
+const auto kFeatureColumn = 2;
+
+using FeatureStatsAccumulator =
+    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>;
+
+class DenseQuantizedFeatureColumnHandlerTest : public ::testing::Test {
+ protected:
+  // The data looks like the following:
+  // Example |  Gradients    | Partition | Dense Quantile |
+  // i0      |  (0.2, 0.12)  | 0         | 1              |
+  // i1      |  (-0.5, 0.07) | 0         | 1              |
+  // i2      |  (1.2, 0.2)   | 0         | 0              |
+  // i3      |  (4.0, 0.13)  | 1         | 1              |
+  DenseQuantizedFeatureColumnHandlerTest()
+      : example_first_order_gradients_(
+            test::AsTensor<float>({0.2f, -0.5f, 1.2f, 4.0f}, {4})),
+        example_second_order_gradients_(
+            test::AsTensor<float>({0.12f, 0.07f, 0.2f, 0.13f}, {4})),
+        example_partitions_({0, 0, 0, 1}),
+        dense_quantiles_(test::AsTensor<float>({0.3f, 0.52f}, {2})),
+        dense_quantized_values_(test::AsTensor<int32>({1, 1, 0, 1}, {4})) {
+    // Set L2 regularization.
+    learner_config_.mutable_regularization()->set_l2(2.0f);
+
+    // Create handler.
+    handler_.reset(new DenseQuantizedFeatureColumnHandler(
+        kClassId, kSlotId, kBatchSize, kFeatureColumn,
+        dense_quantiles_.vec<float>(), dense_quantized_values_.vec<int32>()));
+  }
+
+  LearnerConfig learner_config_;
+  const Tensor example_first_order_gradients_;
+  const Tensor example_second_order_gradients_;
+  const std::vector<int32> example_partitions_;
+  const Tensor dense_quantiles_;
+  const Tensor dense_quantized_values_;
+  std::unique_ptr<FeatureColumnHandler> handler_;
+};
+
+TEST_F(DenseQuantizedFeatureColumnHandlerTest, AggregateGradientStats) {
+  // Create handler.
+  FeatureStatsAccumulator accumulator(1);
+  handler_->AggregateGradientStats(
+      example_partitions_, example_first_order_gradients_,
+      example_second_order_gradients_, &accumulator);
+
+  // Check stats for each partition and feature.
+  // Partition 0, Feature 0.
+  EXPECT_GRADIENT_STATS_EQ(GradientStats(1.2f, 0.2f),
+                           accumulator.GetStats(kSlotId, kClassId, 0, 0));
+  // Partition 0, Feature 1.
+  EXPECT_GRADIENT_STATS_EQ(GradientStats(-0.3f, 0.19f),
+                           accumulator.GetStats(kSlotId, kClassId, 0, 1));
+  // Partition 1, Feature 0.
+  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.0f, 0.0f),
+                           accumulator.GetStats(kSlotId, kClassId, 1, 0));
+  // Partition 1, Feature 1.
+  EXPECT_GRADIENT_STATS_EQ(GradientStats(4.0f, 0.13f),
+                           accumulator.GetStats(kSlotId, kClassId, 1, 1));
+}
+
+TEST_F(DenseQuantizedFeatureColumnHandlerTest, GenerateFeatureSplitCandidates) {
+  // Create handler.
+  FeatureStatsAccumulator accumulator(1);
+  handler_->AggregateGradientStats(
+      example_partitions_, example_first_order_gradients_,
+      example_second_order_gradients_, &accumulator);
+
+  // Get feature split candidates for two roots 0 and 1.
+  // The root stats are derived from the per-partition total gradient stats.
+  const std::vector<int32> roots = {0, 1, 5};
+  const std::vector<NodeStats>& root_stats = {
+      NodeStats(learner_config_, GradientStats(0.9f, 0.39f)),
+      NodeStats(learner_config_, GradientStats(4.0f, 0.13f)), NodeStats(1)};
+  std::vector<FeatureSplitCandidate> split_candidates;
+  handler_->GenerateFeatureSplitCandidates(learner_config_, roots, root_stats,
+                                           accumulator, &split_candidates);
+  // Expect three candidate splits (one per root).
+  EXPECT_EQ(3, split_candidates.size());
+
+  // Verify candidate for root 0, the best split occurs when we route
+  // example i2 left and i0, i1 right.
+  const NodeStats expected_left_node0(learner_config_,
+                                      GradientStats(1.2f, 0.2f));
+  const NodeStats expected_right_node0(
+      learner_config_,
+      root_stats[0].gradient_stats - expected_left_node0.gradient_stats);
+  const SplitStats expected_split_stats0(learner_config_, root_stats[0],
+                                         expected_left_node0,
+                                         expected_right_node0);
+  EXPECT_SPLIT_STATS_EQ(expected_split_stats0, split_candidates[0].split_stats);
+  const auto& tree_node0 = split_candidates[0].tree_node;
+  EXPECT_EQ(boosted_trees::trees::TreeNode::kDenseFloatBinarySplit,
+            tree_node0.node_case());
+  const auto& split0 = tree_node0.dense_float_binary_split();
+  EXPECT_FLOAT_EQ(dense_quantiles_.vec<float>()(0), split0.threshold());
+  EXPECT_EQ(kFeatureColumn, split0.feature_column());
+
+  // Verify candidate for root 1, there's only one active bucket here
+  // so zero gain is expected.
+  const NodeStats expected_left_node1(learner_config_,
+                                      root_stats[1].gradient_stats);
+  const NodeStats expected_right_node1(learner_config_, GradientStats(0, 0));
+  const SplitStats expected_split_stats1(learner_config_, root_stats[1],
+                                         expected_left_node1,
+                                         expected_right_node1);
+  EXPECT_SPLIT_STATS_EQ(expected_split_stats1, split_candidates[1].split_stats);
+  const auto& tree_node1 = split_candidates[1].tree_node;
+  EXPECT_EQ(boosted_trees::trees::TreeNode::kDenseFloatBinarySplit,
+            tree_node1.node_case());
+  const auto& split1 = tree_node1.dense_float_binary_split();
+  EXPECT_FLOAT_EQ(dense_quantiles_.vec<float>()(1), split1.threshold());
+  EXPECT_EQ(kFeatureColumn, split1.feature_column());
+
+  // Verify there are no candidate splits for root 5.
+  const auto& tree_node2 = split_candidates[2].tree_node;
+  EXPECT_EQ(boosted_trees::trees::TreeNode::NODE_NOT_SET,
+            tree_node2.node_case());
+}
+
+}  // namespace
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h
new file mode 100644
index 0000000000000000000000000000000000000000..a3177b31ebce310b8f10252b47d513f07d1656fc
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h
@@ -0,0 +1,82 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_FEATURE_COLUMN_HANDLER_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_FEATURE_COLUMN_HANDLER_H_
+
+#include <vector>
+#include "tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator.h"
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/feature-split-candidate.h"
+#include "tensorflow/contrib/boosted_trees/proto/learner.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+
+// Handler interface for feature columns. Each feature column type may
+// have its own handler which encapsulates the logic of aggregating gradient
+// stats as well as generating split candidates for each partition.
+// Handlers can be stateful and must be thread compatible.
+class FeatureColumnHandler {
+ public:
+  FeatureColumnHandler(const int32 class_id, const int32 slot_id,
+                       const int64 batch_size)
+      : class_id_(class_id), slot_id_(slot_id), batch_size_(batch_size) {}
+
+  virtual ~FeatureColumnHandler() {}
+  FeatureColumnHandler(const FeatureColumnHandler& other) = delete;
+  FeatureColumnHandler& operator=(const FeatureColumnHandler& other) = delete;
+
+  // Aggregates example gradient stats for the feature column.
+  virtual void AggregateGradientStats(
+      const std::vector<int32>& example_partition_ids,
+      const Tensor& example_first_order_gradients,
+      const Tensor& example_second_order_gradients,
+      FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
+          gradient_stats_accumulator) const = 0;
+
+  // Generates feature column split candidates for the specified roots.
+  virtual void GenerateFeatureSplitCandidates(
+      const LearnerConfig& learner_config, const std::vector<int32>& roots,
+      const std::vector<NodeStats>& root_stats,
+      const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
+          gradient_stats_accumulator,
+      std::vector<FeatureSplitCandidate>* split_candidates) const = 0;
+
+  // Accessors.
+  int32 class_id() const { return class_id_; }
+  int32 slot_id() const { return slot_id_; }
+  int64 batch_size() const { return batch_size_; }
+
+ protected:
+  // The class Id.
+  const int32 class_id_;
+
+  // The slod Id for use as a unique Id across all feature columns.
+  const int32 slot_id_;
+
+  // Size of the batch of examples.
+  const int64 batch_size_;
+};
+
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  //  THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_FEATURE_COLUMN_HANDLER_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a0e9efbbc5030e8c2e25fafab98271337a2e582a
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.cc
@@ -0,0 +1,172 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+
+namespace {
+
+// Creates a sparse default right split node without assigning children.
+boosted_trees::trees::TreeNode CreateSparseSplitNodeDefaultRight(
+    int32 feature_column, float threshold) {
+  boosted_trees::trees::TreeNode split_node;
+  auto* split = split_node.mutable_sparse_float_binary_split_default_right()
+                    ->mutable_split();
+  split->set_feature_column(feature_column);
+  split->set_threshold(threshold);
+  return split_node;
+}
+
+// Creates a sparse default left split node without assigning children.
+boosted_trees::trees::TreeNode CreateSparseSplitNodeDefaultLeft(
+    int32 feature_column, float threshold) {
+  boosted_trees::trees::TreeNode split_node;
+  auto* split = split_node.mutable_sparse_float_binary_split_default_left()
+                    ->mutable_split();
+  split->set_feature_column(feature_column);
+  split->set_threshold(threshold);
+  return split_node;
+}
+
+}  // namespace
+
+void SparseQuantizedFeatureColumnHandler::AggregateGradientStats(
+    const std::vector<int32>& example_partition_ids,
+    const Tensor& example_first_order_gradients,
+    const Tensor& example_second_order_gradients,
+    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
+        gradient_stats_accumulator) const {
+  // Pass over all rows and aggregate gradient stats for each partition
+  // and quantized feature bucket.
+  const int64 num_rows = sparse_indices_.dimension(0);
+  for (int64 row_idx = 0; row_idx < num_rows; ++row_idx) {
+    auto example_idx = sparse_indices_(row_idx, 0);
+    auto partition_id = example_partition_ids[example_idx];
+    auto feature_id = sparse_quantized_values_(row_idx);
+    gradient_stats_accumulator->AddStats(
+        slot_id_, class_id_, partition_id, feature_id,
+        GradientStats(example_first_order_gradients,
+                      example_second_order_gradients, example_idx));
+  }
+}
+
+void SparseQuantizedFeatureColumnHandler::GenerateFeatureSplitCandidates(
+    const LearnerConfig& learner_config, const std::vector<int32>& roots,
+    const std::vector<NodeStats>& root_stats,
+    const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
+        gradient_stats_accumulator,
+    std::vector<FeatureSplitCandidate>* split_candidates) const {
+  // Evaluate split candidates for every root as each is a separate
+  // logical partition over the examples.
+  // Then for each root, we do both a forward left to right pass and a backward
+  // right to left pass over the quantized feature buckets accumulating
+  // gradients on one side and using the root aggregate gradients to get the
+  // gradients for the other side. Split gains are evaluated for each pass at
+  // every threshold and the best split is picked.
+  split_candidates->clear();
+  split_candidates->reserve(roots.size());
+  for (size_t root_idx = 0; root_idx < roots.size(); ++root_idx) {
+    // Get partition Id and root node stats.
+    const int32 partition_id = roots[root_idx];
+    const NodeStats& root_node_stats = root_stats[root_idx];
+
+    // Forward pass with right default direction.
+    GradientStats left_gradient_stats;
+    GradientStats right_gradient_stats(root_node_stats.gradient_stats);
+    FeatureSplitCandidate best_split_candidate(
+        root_node_stats.weight_contribution.size());
+    best_split_candidate.split_stats.gain =
+        std::numeric_limits<float>::lowest();
+    for (int bucket_id = 0; bucket_id < sparse_quantiles_.size(); ++bucket_id) {
+      // Get gradient stats.
+      auto gradient_stats = gradient_stats_accumulator.GetStats(
+          slot_id_, class_id_, partition_id, bucket_id);
+      if (gradient_stats.IsZero()) {
+        continue;
+      }
+
+      // Update gradient stats.
+      left_gradient_stats += gradient_stats;
+      right_gradient_stats =
+          root_node_stats.gradient_stats - left_gradient_stats;
+
+      // Get node stats
+      NodeStats left_node_stats(learner_config, left_gradient_stats);
+      NodeStats right_node_stats(learner_config, right_gradient_stats);
+
+      // Generate split candidate.
+      const float threshold = sparse_quantiles_(bucket_id);
+      FeatureSplitCandidate split_candidate(
+          slot_id_,
+          CreateSparseSplitNodeDefaultRight(sparse_feature_column_, threshold),
+          SplitStats(learner_config, root_node_stats, left_node_stats,
+                     right_node_stats));
+      if (split_candidate.split_stats.gain >
+          best_split_candidate.split_stats.gain) {
+        best_split_candidate = std::move(split_candidate);
+      }
+    }
+
+    // Determine if we need a backward pass by checking if the residual gradient
+    // after forward aggregation is almost the same as the aggregated gradient.
+    // for the current root. This helps avoid unnecessary computation as well
+    // as consistency due to floating point precision.
+    if (!right_gradient_stats.IsAlmostZero()) {
+      // Backward pass with left default direction.
+      right_gradient_stats = GradientStats();
+      left_gradient_stats = root_node_stats.gradient_stats;
+      for (int bucket_id = sparse_quantiles_.size() - 1; bucket_id > 0;
+           --bucket_id) {
+        // Get gradient stats.
+        auto gradient_stats = gradient_stats_accumulator.GetStats(
+            slot_id_, class_id_, partition_id, bucket_id);
+        if (gradient_stats.IsZero()) {
+          continue;
+        }
+
+        // Update gradient stats.
+        right_gradient_stats += gradient_stats;
+        left_gradient_stats = root_node_stats.gradient_stats - gradient_stats;
+
+        // Get node stats
+        NodeStats left_node_stats(learner_config, left_gradient_stats);
+        NodeStats right_node_stats(learner_config, right_gradient_stats);
+
+        // Generate split candidate.
+        const float threshold = sparse_quantiles_(bucket_id - 1);
+        FeatureSplitCandidate split_candidate(
+            slot_id_,
+            CreateSparseSplitNodeDefaultLeft(sparse_feature_column_, threshold),
+            SplitStats(learner_config, root_node_stats, left_node_stats,
+                       right_node_stats));
+        if (split_candidate.split_stats.gain >
+            best_split_candidate.split_stats.gain) {
+          best_split_candidate = std::move(split_candidate);
+        }
+      }
+    }
+
+    // Add best candidate for partition.
+    split_candidates->push_back(std::move(best_split_candidate));
+  }
+}
+
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb63e705471a65e8448bda38b2e31eb971d5c1bb
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.h
@@ -0,0 +1,67 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_SPARSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_SPARSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
+
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+
+// Handler for a sparse quantized feature column in the single class case.
+class SparseQuantizedFeatureColumnHandler : public FeatureColumnHandler {
+ public:
+  SparseQuantizedFeatureColumnHandler(
+      const int32 class_id, const int32 slot_id, const int64 batch_size,
+      const int32 sparse_feature_column,
+      TTypes<float>::ConstVec sparse_quantiles,
+      TTypes<int64>::ConstMatrix sparse_indices,
+      TTypes<int32>::ConstVec sparse_quantized_values)
+      : FeatureColumnHandler(class_id, slot_id, batch_size),
+        sparse_feature_column_(sparse_feature_column),
+        sparse_quantiles_(sparse_quantiles),
+        sparse_indices_(sparse_indices),
+        sparse_quantized_values_(sparse_quantized_values) {}
+
+  void AggregateGradientStats(
+      const std::vector<int32>& example_partition_ids,
+      const Tensor& example_first_order_gradients,
+      const Tensor& example_second_order_gradients,
+      FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
+          gradient_stats_accumulator) const override;
+
+  void GenerateFeatureSplitCandidates(
+      const LearnerConfig& learner_config, const std::vector<int32>& roots,
+      const std::vector<NodeStats>& root_stats,
+      const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
+          gradient_stats_accumulator,
+      std::vector<FeatureSplitCandidate>* split_candidates) const override;
+
+ protected:
+  const int32 sparse_feature_column_;
+  TTypes<float>::ConstVec sparse_quantiles_;
+  TTypes<int64>::ConstMatrix sparse_indices_;
+  TTypes<int32>::ConstVec sparse_quantized_values_;
+};
+
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  //  THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_SPARSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..db8c64a617f88ecd5ce9696317c12b632de6f78d
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc
@@ -0,0 +1,162 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.h"
+
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+namespace {
+
+using boosted_trees::learner::LearnerConfig;
+
+const auto kClassId = 3;
+const auto kSlotId = 0;
+const auto kBatchSize = 4;
+const auto kFeatureColumn = 4;
+
+using FeatureStatsAccumulator =
+    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>;
+
+class SparseQuantizedFeatureColumnHandlerTest : public ::testing::Test {
+ protected:
+  // The data looks like the following:
+  // Example |  Gradients    | Partition | Sparse Quantile |
+  // i0      |  (0.2, 0.12)  | 0         | 1               |
+  // i1      |  (-0.5, 0.07) | 0         | N/A             |
+  // i2      |  (1.2, 0.2)   | 0         | 0               |
+  // i3      |  (4.0, 0.13)  | 1         | 1               |
+  SparseQuantizedFeatureColumnHandlerTest()
+      : example_first_order_gradients_(
+            test::AsTensor<float>({0.2f, -0.5f, 1.2f, 4.0f}, {4})),
+        example_second_order_gradients_(
+            test::AsTensor<float>({0.12f, 0.07f, 0.2f, 0.13f}, {4})),
+        example_partitions_({0, 0, 0, 1}),
+        sparse_quantiles_(test::AsTensor<float>({0.3f, 0.52f}, {2})),
+        sparse_indices_(test::AsTensor<int64>({0, 0, 2, 0, 3, 0}, {3, 2})),
+        sparse_quantized_values_(test::AsTensor<int32>({1, 0, 1}, {3})) {
+    // Set L2 regularization.
+    learner_config_.mutable_regularization()->set_l2(2.0f);
+
+    // Create handler.
+    handler_.reset(new SparseQuantizedFeatureColumnHandler(
+        kClassId, kSlotId, kBatchSize, kFeatureColumn,
+        sparse_quantiles_.vec<float>(), sparse_indices_.matrix<int64>(),
+        sparse_quantized_values_.vec<int32>()));
+  }
+
+  LearnerConfig learner_config_;
+  const Tensor example_first_order_gradients_;
+  const Tensor example_second_order_gradients_;
+  const std::vector<int32> example_partitions_;
+  const Tensor sparse_quantiles_;
+  const Tensor sparse_indices_;
+  const Tensor sparse_quantized_values_;
+  std::unique_ptr<FeatureColumnHandler> handler_;
+};
+
+TEST_F(SparseQuantizedFeatureColumnHandlerTest, AggregateGradientStats) {
+  // Create handler.
+  FeatureStatsAccumulator accumulator(1);
+  handler_->AggregateGradientStats(
+      example_partitions_, example_first_order_gradients_,
+      example_second_order_gradients_, &accumulator);
+
+  // Check stats for each partition and feature.
+  // Partition 0, Feature 0.
+  EXPECT_GRADIENT_STATS_EQ(GradientStats(1.2f, 0.2f),
+                           accumulator.GetStats(kSlotId, kClassId, 0, 0));
+  // Partition 0, Feature 1.
+  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.2f, 0.12f),
+                           accumulator.GetStats(kSlotId, kClassId, 0, 1));
+  // Partition 1, Feature 0.
+  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.0f, 0.0f),
+                           accumulator.GetStats(kSlotId, kClassId, 1, 0));
+  // Partition 1, Feature 1.
+  EXPECT_GRADIENT_STATS_EQ(GradientStats(4.0f, 0.13f),
+                           accumulator.GetStats(kSlotId, kClassId, 1, 1));
+}
+
+TEST_F(SparseQuantizedFeatureColumnHandlerTest,
+       GenerateFeatureSplitCandidates) {
+  // Create handler.
+  FeatureStatsAccumulator accumulator(1);
+  handler_->AggregateGradientStats(
+      example_partitions_, example_first_order_gradients_,
+      example_second_order_gradients_, &accumulator);
+
+  // Get feature split candidates for two roots 0 and 1.
+  // The root stats are derived from the per-partition total gradient stats.
+  const std::vector<int32> roots = {0, 1, 9};
+  const std::vector<NodeStats>& root_stats = {
+      NodeStats(learner_config_, GradientStats(0.9f, 0.39f)),
+      NodeStats(learner_config_, GradientStats(4.0f, 0.13f)), NodeStats(1)};
+  std::vector<FeatureSplitCandidate> split_candidates;
+  handler_->GenerateFeatureSplitCandidates(learner_config_, roots, root_stats,
+                                           accumulator, &split_candidates);
+  // Expect three candidate splits (one per root).
+  EXPECT_EQ(3, split_candidates.size());
+
+  // Verify candidate for root 0, the best split occurs when we route
+  // example i0 and i2 to the left and i1 to the right (by default direction).
+  const NodeStats expected_left_node0(learner_config_,
+                                      GradientStats(0.2f + 1.2f, 0.12f + 0.2f));
+  const NodeStats expected_right_node0(
+      learner_config_,
+      root_stats[0].gradient_stats - expected_left_node0.gradient_stats);
+  const SplitStats expected_split_stats0(learner_config_, root_stats[0],
+                                         expected_left_node0,
+                                         expected_right_node0);
+  EXPECT_SPLIT_STATS_EQ(expected_split_stats0, split_candidates[0].split_stats);
+  const auto& tree_node0 = split_candidates[0].tree_node;
+  EXPECT_EQ(boosted_trees::trees::TreeNode::kSparseFloatBinarySplitDefaultRight,
+            tree_node0.node_case());
+  const auto& split0 =
+      tree_node0.sparse_float_binary_split_default_right().split();
+  EXPECT_FLOAT_EQ(sparse_quantiles_.vec<float>()(1), split0.threshold());
+  EXPECT_EQ(kFeatureColumn, split0.feature_column());
+
+  // Verify candidate for root 1, there's only one active bucket here
+  // so zero gain is expected.
+  const NodeStats expected_left_node1(learner_config_,
+                                      root_stats[1].gradient_stats);
+  const NodeStats expected_right_node1(learner_config_, GradientStats(0, 0));
+  const SplitStats expected_split_stats1(learner_config_, root_stats[1],
+                                         expected_left_node1,
+                                         expected_right_node1);
+  EXPECT_SPLIT_STATS_EQ(expected_split_stats1, split_candidates[1].split_stats);
+  const auto& tree_node1 = split_candidates[1].tree_node;
+  EXPECT_EQ(boosted_trees::trees::TreeNode::kSparseFloatBinarySplitDefaultRight,
+            tree_node1.node_case());
+  const auto& split1 =
+      tree_node1.sparse_float_binary_split_default_right().split();
+  EXPECT_FLOAT_EQ(sparse_quantiles_.vec<float>()(1), split1.threshold());
+  EXPECT_EQ(kFeatureColumn, split1.feature_column());
+
+  // Verify there are no candidate splits for root 9.
+  const auto& tree_node2 = split_candidates[2].tree_node;
+  EXPECT_EQ(boosted_trees::trees::TreeNode::NODE_NOT_SET,
+            tree_node2.node_case());
+}
+
+}  // namespace
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/feature-split-candidate.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/feature-split-candidate.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe22691178213094b9affcdee06af98011f85bd2
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/feature-split-candidate.h
@@ -0,0 +1,61 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_FEATURE_SPLIT_CANDIDATE_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_FEATURE_SPLIT_CANDIDATE_H_
+
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/split-stats.h"
+#include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+
+// FeatureSplitCandidate holds the split candidate node along with the stats.
+struct FeatureSplitCandidate {
+  // Empty split candidate.
+  explicit FeatureSplitCandidate(const int output_length)
+      : feature_column_slot_id(kInvalidFeatureColumnSlot),
+        split_stats(output_length) {}
+
+  // Feature binary split candidate.
+  FeatureSplitCandidate(const int64 fc_slot_id,
+                        const boosted_trees::trees::TreeNode& node,
+                        const SplitStats& stats)
+      : feature_column_slot_id(fc_slot_id),
+        tree_node(node),
+        split_stats(stats) {}
+
+  // Globally unique slot Id identifying the feature column
+  // used in this split candidates.
+  int64 feature_column_slot_id;
+
+  // Tree node for the candidate split.
+  boosted_trees::trees::TreeNode tree_node;
+
+  // Split stats.
+  SplitStats split_stats;
+
+  // Invalid feature column slot reserved value.
+  static constexpr int64 kInvalidFeatureColumnSlot = -1;
+};
+
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_FEATURE_SPLIT_CANDIDATE_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/gradient-stats.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/gradient-stats.h
new file mode 100644
index 0000000000000000000000000000000000000000..dad64bf165a41bc4f32eea6b37e7afb569887a06
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/gradient-stats.h
@@ -0,0 +1,193 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_GRADIENT_STATS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_GRADIENT_STATS_H_
+
+#include <math.h>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_util.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+
+const double kEps = 1e-6;
+
+// A data structure for accumulating a Tensor value.
+struct TensorStat {
+  TensorStat() {}
+
+  explicit TensorStat(const float v) : t(DT_FLOAT, TensorShape({1})) {
+    t.flat<float>()(0) = v;
+  }
+
+  explicit TensorStat(const Tensor& rt) : t(tensor::DeepCopy(rt)) {}
+
+  TensorStat(const TensorStat& ts) : t(tensor::DeepCopy(ts.t)) {}
+
+  TensorStat& operator+=(const TensorStat& other) {
+    if (t.NumElements() == 0) {
+      t = tensor::DeepCopy(other.t);
+      return (*this);
+    }
+    CHECK(t.shape() == other.t.shape())
+        << "My shape = " << t.shape().DebugString()
+        << " Other shape = " << other.t.shape().DebugString();
+    auto me_flat = t.unaligned_flat<float>();
+    auto other_flat = other.t.unaligned_flat<float>();
+    for (int i = 0; i < me_flat.size(); i++) {
+      me_flat(i) += other_flat(i);
+    }
+    return (*this);
+  }
+
+  TensorStat& operator-=(const TensorStat& other) {
+    if (other.t.NumElements() == 0) {
+      return (*this);
+    }
+    CHECK(t.shape() == other.t.shape())
+        << "My shape = " << t.shape().DebugString()
+        << " Other shape = " << other.t.shape().DebugString();
+    auto me_flat = t.unaligned_flat<float>();
+    auto other_flat = other.t.unaligned_flat<float>();
+    for (int i = 0; i < me_flat.size(); i++) {
+      me_flat(i) -= other_flat(i);
+    }
+    return (*this);
+  }
+
+  TensorStat& operator*=(float value) {
+    auto me_flat = t.unaligned_flat<float>();
+    for (size_t i = 0; i < me_flat.size(); i++) {
+      me_flat(i) *= value;
+    }
+    return (*this);
+  }
+
+  bool IsZero() const {
+    auto me_flat = t.unaligned_flat<float>();
+    for (int i = 0; i < me_flat.size(); i++) {
+      if (me_flat(i) != 0.0f) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Checks if the L^2 magnitude of the tensor is less than eps.
+  bool IsAlmostZero(const float eps = kEps) const {
+    auto me_flat = t.unaligned_flat<float>();
+    double s = 0.0;
+    for (int i = 0; i < me_flat.size(); i++) {
+      s += me_flat(i) * me_flat(i);
+      if (s > eps * eps) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  float Magnitude() const {
+    auto me_flat = t.unaligned_flat<float>();
+    double s = 0.0;
+    for (int i = 0; i < me_flat.size(); i++) {
+      s += me_flat(i) * me_flat(i);
+    }
+    return sqrt(s);
+  }
+
+  string DebugString() const { return t.DebugString(); }
+
+  Tensor t;
+};
+
+// GradientStats holds first and second order gradient stats.
+struct GradientStats {
+  GradientStats() {}
+
+  // Legacy constructor for tests
+  GradientStats(float g, float h) : first(g), second(h) {}
+
+  GradientStats(const Tensor& g, const Tensor& h) : first(g), second(h) {}
+
+  GradientStats(const Tensor& g, const Tensor& h, int64 example_index)
+      : first(g.Slice(example_index, example_index + 1)),
+        second(h.Slice(example_index, example_index + 1)) {}
+
+  GradientStats& operator+=(const GradientStats& other) {
+    first += other.first;
+    second += other.second;
+    return (*this);
+  }
+
+  GradientStats& operator*=(float value) {
+    first *= value;
+    second *= value;
+    return (*this);
+  }
+
+  GradientStats& operator-=(const GradientStats& other) {
+    first -= other.first;
+    second -= other.second;
+    return (*this);
+  }
+
+  bool IsZero() const { return first.IsZero() && second.IsZero(); }
+
+  bool IsAlmostZero(const float eps = kEps) const {
+    return first.IsAlmostZero(eps) && second.IsAlmostZero(eps);
+  }
+
+  float Magnitude() const { return second.Magnitude(); }
+
+  string DebugString() const {
+    return "First = " + first.DebugString() +
+           " Second = " + second.DebugString();
+  }
+
+  TensorStat first;
+  TensorStat second;
+};
+
+struct GradientStatsAccumulator {
+  void operator()(const GradientStats& from, GradientStats* to) const {
+    (*to) += from;
+  }
+};
+
+inline GradientStats operator+(const GradientStats& a, const GradientStats& b) {
+  GradientStats ret(a);
+  ret += b;
+  return ret;
+}
+
+inline GradientStats operator-(const GradientStats& a, const GradientStats& b) {
+  GradientStats ret(a);
+  ret -= b;
+  return ret;
+}
+
+// Helper macro to check gradient stats approximate equality.
+#define EXPECT_GRADIENT_STATS_EQ(val1, val2) \
+  EXPECT_TRUE((val1 - val2).IsAlmostZero());
+
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_GRADIENT_STATS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e5f53874df2207ffa6664a33675f84ef055394b
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats.h
@@ -0,0 +1,301 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_NODE_STATS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_NODE_STATS_H_
+
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/Eigen/Eigenvalues"
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/gradient-stats.h"
+#include "tensorflow/contrib/boosted_trees/proto/learner.pb.h"
+#include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+
+using tensorflow::boosted_trees::learner::LearnerConfig;
+using tensorflow::boosted_trees::learner::LearnerConfig_MultiClassStrategy;
+using tensorflow::boosted_trees::learner::
+    LearnerConfig_MultiClassStrategy_DIAGONAL_HESSIAN;
+using tensorflow::boosted_trees::learner::
+    LearnerConfig_MultiClassStrategy_FULL_HESSIAN;
+using tensorflow::boosted_trees::learner::
+    LearnerConfig_MultiClassStrategy_TREE_PER_CLASS;
+
+// NodeStats holds aggregate gradient stats as well as metadata about the node.
+struct NodeStats {
+  // Initialize the NodeStats with 0 stats.  We need the output length
+  // so that we can make weight_contribution the right length.
+  explicit NodeStats(const int output_length)
+      : weight_contribution(output_length, 0.0f), gain(0) {}
+
+  NodeStats(const LearnerConfig& learner_config,
+            const GradientStats& grad_stats)
+      : NodeStats(learner_config.regularization().l1(),
+                  learner_config.regularization().l2(),
+                  learner_config.constraints().min_node_weight(),
+                  learner_config.multi_class_strategy(), grad_stats) {}
+
+  NodeStats(float l1_reg, float l2_reg, float min_node_weight,
+            const LearnerConfig_MultiClassStrategy& strategy,
+            const GradientStats& grad_stats)
+      : gradient_stats(grad_stats), gain(0) {
+    switch (strategy) {
+      case LearnerConfig_MultiClassStrategy_TREE_PER_CLASS: {
+        float g;
+        float h;
+        // Initialize now in case of early return.
+        weight_contribution.push_back(0.0f);
+
+        if (grad_stats.first.t.NumElements() == 0 ||
+            grad_stats.second.t.NumElements() == 0) {
+          return;
+        }
+
+        g = grad_stats.first.t.unaligned_flat<float>()(0);
+        h = grad_stats.second.t.unaligned_flat<float>()(0);
+
+        if (grad_stats.IsAlmostZero() || h <= min_node_weight) {
+          return;
+        }
+
+        // Apply L1 regularization.
+        if (l1_reg > 0) {
+          if (g > l1_reg) {
+            g -= l1_reg;
+          } else if (g < -l1_reg) {
+            g += l1_reg;
+          } else {
+            return;
+          }
+        }
+
+        // The node gain is given by: (l'^2) / (l'' + l2_reg) and the node
+        // weight
+        // contribution is given by: (-l') / (l'' + l2_reg).
+        // Note that l'' can't be zero here because of the min node weight check
+        // since min node weight must be >= 0.
+        weight_contribution[0] = -g / (h + l2_reg);
+        gain = (weight_contribution[0] * -g);
+        break;
+      }
+      case LearnerConfig_MultiClassStrategy_FULL_HESSIAN: {
+        weight_contribution.clear();
+
+        if (grad_stats.first.t.NumElements() == 0 ||
+            grad_stats.second.t.NumElements() == 0) {
+          return;
+        }
+        const int64 grad_dim = grad_stats.first.t.dim_size(1);
+
+        QCHECK(grad_stats.first.t.dims() == 2)
+            << strings::Printf("Gradient should be of rank 2, got rank %d",
+                               grad_stats.first.t.dims());
+        QCHECK(grad_stats.first.t.dim_size(0) == 1) << strings::Printf(
+            "Gradient must be of shape 1 x %lld, got %lld x %lld", grad_dim,
+            grad_stats.first.t.dim_size(0), grad_dim);
+        QCHECK(grad_stats.second.t.dims() == 3)
+            << strings::Printf("Hessian should be of rank 3, got rank %d",
+                               grad_stats.second.t.dims());
+        QCHECK(grad_stats.second.t.shape() ==
+               TensorShape({1, grad_dim, grad_dim}))
+            << strings::Printf(
+                   "Hessian must be of shape 1 x %lld x %lld, got %lld x % lld "
+                   " x % lld ",
+                   grad_dim, grad_dim, grad_stats.second.t.shape().dim_size(0),
+                   grad_stats.second.t.shape().dim_size(1),
+                   grad_stats.second.t.shape().dim_size(2));
+
+        // Check if we're violating min weight constraint.
+
+        if (grad_stats.IsAlmostZero() ||
+            grad_stats.second.Magnitude() <= min_node_weight) {
+          return;
+        }
+        // TODO(nponomareva): figure out l1 in matrix form.
+        // g is a vector of gradients, H is a hessian matrix.
+        Eigen::VectorXf g = TensorToEigenVector(grad_stats.first.t, grad_dim);
+
+        Eigen::MatrixXf hessian =
+            TensorToEigenMatrix(grad_stats.second.t, grad_dim, grad_dim);
+        // I is an identity matrix.
+        // The gain in general form is -g^T (H+l2 I)^-1 g.
+        // The node weights are -(H+l2 I)^-1 g.
+        Eigen::MatrixXf identity;
+        identity.setIdentity(grad_dim, grad_dim);
+
+        Eigen::MatrixXf hessian_and_reg = hessian + l2_reg * identity;
+
+        CalculateWeightAndGain(hessian_and_reg, g);
+        break;
+      }
+      case LearnerConfig_MultiClassStrategy_DIAGONAL_HESSIAN: {
+        weight_contribution.clear();
+        if (grad_stats.first.t.NumElements() == 0 ||
+            grad_stats.second.t.NumElements() == 0) {
+          return;
+        }
+        const int64 grad_dim = grad_stats.first.t.dim_size(1);
+
+        QCHECK(grad_stats.first.t.dims() == 2)
+            << strings::Printf("Gradient should be of rank 2, got rank %d",
+                               grad_stats.first.t.dims());
+        QCHECK(grad_stats.first.t.dim_size(0) == 1) << strings::Printf(
+            "Gradient must be of shape 1 x %lld, got %lld x %lld", grad_dim,
+            grad_stats.first.t.dim_size(0), grad_dim);
+        QCHECK(grad_stats.second.t.dims() == 2)
+            << strings::Printf("Hessian should be of rank 2, got rank %d",
+                               grad_stats.second.t.dims());
+        QCHECK(grad_stats.second.t.shape() == TensorShape({1, grad_dim}))
+            << strings::Printf(
+                   "Hessian must be of shape 1 x %lld, got %lld x %lld",
+                   grad_dim, grad_stats.second.t.shape().dim_size(0),
+                   grad_stats.second.t.shape().dim_size(1));
+
+        // Check if we're violating min weight constraint.
+        if (grad_stats.IsAlmostZero() ||
+            grad_stats.second.Magnitude() <= min_node_weight) {
+          return;
+        }
+        // TODO(nponomareva): figure out l1 in matrix form.
+        // Diagonal of the hessian.
+        Eigen::ArrayXf hessian =
+            TensorToEigenArray(grad_stats.second.t, grad_dim);
+        Eigen::ArrayXf hessian_and_reg = hessian + l2_reg;
+
+        // Check if any of the elements are zeros.
+        bool invertible = true;
+        for (int i = 0; i < hessian_and_reg.size(); ++i) {
+          if (hessian_and_reg[i] == 0.0) {
+            invertible = false;
+            break;
+          }
+        }
+        if (invertible) {
+          Eigen::ArrayXf g = TensorToEigenArray(grad_stats.first.t, grad_dim);
+          // Operations on arrays are element wise. The formulas are as for full
+          // hessian, but for hessian of diagonal form they are simplified.
+          Eigen::ArrayXf ones = Eigen::ArrayXf::Ones(grad_dim);
+          Eigen::ArrayXf temp = ones / hessian_and_reg;
+          Eigen::ArrayXf weight = -temp * g;
+
+          // Copy over weights to weight_contribution.
+          weight_contribution =
+              std::vector<float>(weight.data(), weight.data() + weight.rows());
+          gain = (-g * weight).sum();
+        } else {
+          Eigen::VectorXf g = TensorToEigenVector(grad_stats.first.t, grad_dim);
+          // Hessian is not invertible. We will go the same route as in full
+          // hessian to get an approximate solution.
+          CalculateWeightAndGain(hessian_and_reg.matrix().asDiagonal(), g);
+        }
+        break;
+      }
+      default:
+        LOG(FATAL) << "Unknown multi-class strategy " << strategy;
+        break;
+    }
+  }
+
+  string DebugString() const {
+    return strings::StrCat(
+        gradient_stats.DebugString(), "\n",
+        "Weight_contrib = ", str_util::Join(weight_contribution, ","),
+        "Gain = ", gain);
+  }
+
+  // Use these node stats to populate a Leaf's model.
+  void FillLeaf(const int class_id, boosted_trees::trees::Leaf* leaf) const {
+    if (class_id == -1) {
+      for (int i = 0; i < weight_contribution.size(); i++) {
+        leaf->mutable_vector()->add_value(weight_contribution[i]);
+      }
+    } else {
+      CHECK(weight_contribution.size() == 1)
+          << "Weight contribution size = " << weight_contribution.size();
+      leaf->mutable_sparse_vector()->add_index(class_id);
+      leaf->mutable_sparse_vector()->add_value(weight_contribution[0]);
+    }
+  }
+
+  // Sets the weight_contribution and gain member variables based on the
+  // given regularized Hessian and gradient vector g.
+  void CalculateWeightAndGain(const Eigen::MatrixXf& hessian_and_reg,
+                              const Eigen::VectorXf& g) {
+    // The gain in general form is -g^T (Hessian_and_regularization)^-1 g.
+    // The node weights are -(Hessian_and_regularization)^-1 g.
+    Eigen::VectorXf weight;
+    // If we want to calculate x = K^-1 v, instead of explicitly calculating
+    // K^-1 and multiplying by v, we can solve this matrix equation using
+    // solve method.
+    weight = -hessian_and_reg.colPivHouseholderQr().solve(g);
+    // Copy over weights to weight_contribution.
+    weight_contribution =
+        std::vector<float>(weight.data(), weight.data() + weight.rows());
+
+    gain = -g.transpose() * weight;
+  }
+
+  static Eigen::MatrixXf TensorToEigenMatrix(const Tensor& tensor,
+                                             const int num_rows,
+                                             const int num_cols) {
+    return Eigen::Map<const Eigen::MatrixXf>(tensor.flat<float>().data(),
+                                             num_rows, num_cols);
+  }
+
+  static Eigen::VectorXf TensorToEigenVector(const Tensor& tensor,
+                                             const int num_elements) {
+    return Eigen::Map<const Eigen::VectorXf>(tensor.flat<float>().data(),
+                                             num_elements);
+  }
+
+  static Eigen::ArrayXf TensorToEigenArray(const Tensor& tensor,
+                                           const int num_elements) {
+    return Eigen::Map<const Eigen::ArrayXf>(tensor.flat<float>().data(),
+                                            num_elements);
+  }
+
+  GradientStats gradient_stats;
+  std::vector<float> weight_contribution;
+  float gain;
+};
+
+// Helper macro to check std::vector<float> approximate equality.
+#define EXPECT_VECTOR_FLOAT_EQ(x, y)       \
+  {                                        \
+    EXPECT_EQ((x).size(), (y).size());     \
+    for (int i = 0; i < (x).size(); ++i) { \
+      EXPECT_FLOAT_EQ((x)[i], (y)[i]);     \
+    }                                      \
+  }
+
+// Helper macro to check node stats approximate equality.
+#define EXPECT_NODE_STATS_EQ(val1, val2)                                      \
+  EXPECT_GRADIENT_STATS_EQ(val1.gradient_stats, val2.gradient_stats);         \
+  EXPECT_VECTOR_FLOAT_EQ(val1.weight_contribution, val2.weight_contribution); \
+  EXPECT_FLOAT_EQ(val1.gain, val2.gain);
+
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_NODE_STATS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f99b6826a7819e627d274e23700d0c8c9c53d2af
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats_test.cc
@@ -0,0 +1,213 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats.h"
+
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/platform/test.h"
+
+using tensorflow::test::AsTensor;
+using std::vector;
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+namespace {
+
+const double kDelta = 1e-5;
+
+TEST(NodeStatsTest, AlmostZero) {
+  LearnerConfig learner_config;
+  NodeStats node_stats(learner_config, GradientStats(1e-8f, 1e-8f));
+  EXPECT_EQ(0, node_stats.weight_contribution[0]);
+  EXPECT_EQ(0, node_stats.gain);
+}
+
+TEST(NodeStatsTest, LessThanMinWeightConstraint) {
+  LearnerConfig learner_config;
+  learner_config.mutable_constraints()->set_min_node_weight(3.2f);
+  NodeStats node_stats(learner_config, GradientStats(7.32f, 1.63f));
+  EXPECT_EQ(0, node_stats.weight_contribution[0]);
+  EXPECT_EQ(0, node_stats.gain);
+}
+
+TEST(NodeStatsTest, L1RegSquashed) {
+  LearnerConfig learner_config;
+  learner_config.mutable_regularization()->set_l1(10.0f);
+  NodeStats node_stats(learner_config, GradientStats(7.32f, 1.63f));
+  EXPECT_EQ(0, node_stats.weight_contribution[0]);
+  EXPECT_EQ(0, node_stats.gain);
+}
+
+TEST(NodeStatsTest, L1RegPos) {
+  LearnerConfig learner_config;
+  learner_config.mutable_regularization()->set_l1(5.0f);
+  NodeStats node_stats(learner_config, GradientStats(7.32f, 1.63f));
+  const float expected_clipped_grad = 7.32f - 5.0f;
+  const float expected_weight_contribution = -expected_clipped_grad / 1.63f;
+  const float expected_gain =
+      expected_clipped_grad * expected_clipped_grad / 1.63f;
+  EXPECT_FLOAT_EQ(expected_weight_contribution,
+                  node_stats.weight_contribution[0]);
+  EXPECT_FLOAT_EQ(expected_gain, node_stats.gain);
+}
+
+TEST(NodeStatsTest, L1RegNeg) {
+  LearnerConfig learner_config;
+  learner_config.mutable_regularization()->set_l1(5.0f);
+  NodeStats node_stats(learner_config, GradientStats(-7.32f, 1.63f));
+  const float expected_clipped_grad = -7.32f + 5.0f;
+  const float expected_weight_contribution = -expected_clipped_grad / 1.63f;
+  const float expected_gain =
+      expected_clipped_grad * expected_clipped_grad / 1.63f;
+  EXPECT_FLOAT_EQ(expected_weight_contribution,
+                  node_stats.weight_contribution[0]);
+  EXPECT_FLOAT_EQ(expected_gain, node_stats.gain);
+}
+
+TEST(NodeStatsTest, L2Reg) {
+  LearnerConfig learner_config;
+  learner_config.mutable_regularization()->set_l2(8.0f);
+  NodeStats node_stats(learner_config, GradientStats(7.32f, 1.63f));
+  const float expected_denom = 1.63f + 8.0f;
+  const float expected_weight_contribution = -7.32f / expected_denom;
+  const float expected_gain = 7.32f * 7.32f / expected_denom;
+  EXPECT_FLOAT_EQ(expected_weight_contribution,
+                  node_stats.weight_contribution[0]);
+  EXPECT_FLOAT_EQ(expected_gain, node_stats.gain);
+}
+
+TEST(NodeStatsTest, L1L2Reg) {
+  LearnerConfig learner_config;
+  learner_config.mutable_regularization()->set_l1(5.0f);
+  learner_config.mutable_regularization()->set_l2(8.0f);
+  NodeStats node_stats(learner_config, GradientStats(7.32f, 1.63f));
+  const float expected_clipped_grad = 7.32f - 5.0f;
+  const float expected_denom = 1.63f + 8.0f;
+  const float expected_weight_contribution =
+      -expected_clipped_grad / expected_denom;
+  const float expected_gain =
+      expected_clipped_grad * expected_clipped_grad / expected_denom;
+  EXPECT_FLOAT_EQ(expected_weight_contribution,
+                  node_stats.weight_contribution[0]);
+  EXPECT_FLOAT_EQ(expected_gain, node_stats.gain);
+}
+
+TEST(NodeStatsTest, MulticlassFullHessianTest) {
+  LearnerConfig learner_config;
+  learner_config.set_multi_class_strategy(LearnerConfig::FULL_HESSIAN);
+  learner_config.mutable_regularization()->set_l2(0.3f);
+
+  const int kNumClasses = 4;
+  const auto& g_shape = TensorShape({1, kNumClasses});
+  Tensor g = AsTensor<float>({0.5, 0.33, -9, 1}, g_shape);
+  const auto& hessian_shape = TensorShape({1, kNumClasses, kNumClasses});
+  Tensor h = AsTensor<float>({3, 5, 7, 8, 5, 4, 1, 5, 7, 1, 8, 4, 8, 5, 4, 9},
+                             hessian_shape);
+
+  NodeStats node_stats(learner_config, GradientStats(g, h));
+
+  // Index 1 has 0 value because of l1 regularization,
+  std::vector<float> expected_weight = {0.9607576, 0.4162569, 0.9863192,
+                                        -1.5820024};
+
+  EXPECT_EQ(kNumClasses, node_stats.weight_contribution.size());
+  for (int i = 0; i < kNumClasses; ++i) {
+    EXPECT_NEAR(expected_weight[i], node_stats.weight_contribution[i], kDelta);
+  }
+  EXPECT_NEAR(9.841132, node_stats.gain, kDelta);
+}
+
+TEST(NodeStatsTest, MulticlassDiagonalHessianTest) {
+  // Normal case.
+  {
+    LearnerConfig learner_config;
+    learner_config.set_multi_class_strategy(LearnerConfig::FULL_HESSIAN);
+    learner_config.mutable_regularization()->set_l2(0.3f);
+
+    const int kNumClasses = 4;
+    const auto& g_shape = TensorShape({1, kNumClasses});
+    Tensor g = AsTensor<float>({0.5, 0.33, -9, 1}, g_shape);
+    Tensor h;
+    // Full hessian.
+    {
+      const auto& hessian_shape = TensorShape({1, kNumClasses, kNumClasses});
+      // Construct full hessian.
+      h = AsTensor<float>({3, 0, 0, 0, 0, 4, 0, 0, 0, 0, 8, 0, 0, 0, 0, 9},
+                          hessian_shape);
+    }
+    NodeStats full_node_stats(learner_config, GradientStats(g, h));
+
+    // Diagonal only.
+    {
+      const auto& hessian_shape = TensorShape({1, kNumClasses});
+      // Construct diagonal of hessian.
+      h = AsTensor<float>({3, 4, 8, 9}, hessian_shape);
+    }
+    learner_config.set_multi_class_strategy(LearnerConfig::DIAGONAL_HESSIAN);
+    NodeStats diag_node_stats(learner_config, GradientStats(g, h));
+
+    // Full and diagonal hessian should return the same results.
+    EXPECT_EQ(full_node_stats.weight_contribution.size(),
+              diag_node_stats.weight_contribution.size());
+    for (int i = 0; i < full_node_stats.weight_contribution.size(); ++i) {
+      EXPECT_FLOAT_EQ(full_node_stats.weight_contribution[i],
+                      diag_node_stats.weight_contribution[i]);
+    }
+    EXPECT_EQ(full_node_stats.gain, diag_node_stats.gain);
+  }
+  // Zero entries in diagonal, no regularization
+  {
+    LearnerConfig learner_config;
+    learner_config.set_multi_class_strategy(LearnerConfig::FULL_HESSIAN);
+
+    const int kNumClasses = 4;
+    const auto& g_shape = TensorShape({1, kNumClasses});
+    Tensor g = AsTensor<float>({0.5, 0.33, -9, 1}, g_shape);
+    Tensor h;
+    // Full hessian.
+    {
+      const auto& hessian_shape = TensorShape({1, kNumClasses, kNumClasses});
+      // Construct full hessian.
+      h = AsTensor<float>({3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0},
+                          hessian_shape);
+    }
+    NodeStats full_node_stats(learner_config, GradientStats(g, h));
+
+    // Diagonal only.
+    {
+      const auto& hessian_shape = TensorShape({1, kNumClasses});
+      // Diagonal of hessian, two entries are 0
+      h = AsTensor<float>({3, 0, 8, 0}, hessian_shape);
+    }
+    learner_config.set_multi_class_strategy(LearnerConfig::DIAGONAL_HESSIAN);
+    NodeStats diag_node_stats(learner_config, GradientStats(g, h));
+
+    // Full and diagonal hessian should return the same results.
+    EXPECT_EQ(full_node_stats.weight_contribution.size(),
+              diag_node_stats.weight_contribution.size());
+    for (int i = 0; i < full_node_stats.weight_contribution.size(); ++i) {
+      EXPECT_FLOAT_EQ(full_node_stats.weight_contribution[i],
+                      diag_node_stats.weight_contribution[i]);
+    }
+    EXPECT_EQ(full_node_stats.gain, diag_node_stats.gain);
+  }
+}
+
+}  // namespace
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/split-stats.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/split-stats.h
new file mode 100644
index 0000000000000000000000000000000000000000..f700cbced833543227de39f54c9ecbb03a7ce7c9
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/split-stats.h
@@ -0,0 +1,84 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_SPLIT_STATS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_SPLIT_STATS_H_
+
+#include <string>
+
+#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace learner {
+namespace stochastic {
+
+// FeatureSplitCandidate holds the split candidate node along with the stats.
+struct SplitStats {
+  // Initialize with 0 stats.
+  explicit SplitStats(const int output_length)
+      : root_node_stats(output_length),
+        left_node_stats(output_length),
+        right_node_stats(output_length),
+        gain(0) {}
+
+  // Feature unary split candidate, we don't apply tree complexity
+  // regularization as no new nodes are being added with this candidate.
+  SplitStats(const LearnerConfig& learner_config, const NodeStats& root_stats)
+      : root_node_stats(root_stats),
+        left_node_stats(root_stats.weight_contribution.size()),
+        right_node_stats(root_stats.weight_contribution.size()),
+        gain(0) {}
+
+  // Feature binary split candidate, we apply tree complexity regularization
+  // over the split gain to trade-off adding new nodes with loss reduction.
+  SplitStats(const LearnerConfig& learner_config, const NodeStats& root_stats,
+             const NodeStats& left_stats, const NodeStats& right_stats)
+      : root_node_stats(root_stats),
+        left_node_stats(left_stats),
+        right_node_stats(right_stats),
+        gain(left_stats.gain + right_stats.gain - root_stats.gain -
+             learner_config.regularization().tree_complexity()) {}
+
+  // Root Stats.
+  NodeStats root_node_stats;
+
+  // Children stats.
+  NodeStats left_node_stats;
+  NodeStats right_node_stats;
+
+  // Split gain.
+  float gain;
+
+  string DebugString() const {
+    return "Root = " + root_node_stats.DebugString() +
+           "\nLeft = " + left_node_stats.DebugString() +
+           "\nRight = " + right_node_stats.DebugString() +
+           "\nGain = " + std::to_string(gain);
+  }
+};
+
+// Helper macro to check split stats approximate equality.
+#define EXPECT_SPLIT_STATS_EQ(val1, val2)                             \
+  EXPECT_NODE_STATS_EQ(val1.root_node_stats, val2.root_node_stats);   \
+  EXPECT_NODE_STATS_EQ(val1.left_node_stats, val2.left_node_stats);   \
+  EXPECT_NODE_STATS_EQ(val1.right_node_stats, val2.right_node_stats); \
+  EXPECT_FLOAT_EQ(val1.gain, val2.gain);
+
+}  // namespace stochastic
+}  // namespace learner
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_SPLIT_STATS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc
index f658532acb2b24565f17a193fd0235df6422d516..ce7632e58987f5890beaded5dd305724f950e1e8 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc
@@ -17,7 +17,6 @@
 #include <sys/types.h>
 #include <algorithm>
 #include <cstdlib>
-#include <ctime>
 #include <functional>
 #include <iterator>
 #include <unordered_set>
@@ -25,6 +24,7 @@
 
 #include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"  // NOLINT
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/env.h"
 
 using tensorflow::boosted_trees::learner::LearningRateDropoutDrivenConfig;
 using tensorflow::boosted_trees::trees::DecisionTreeEnsembleConfig;
@@ -95,7 +95,8 @@ TEST_F(DropoutUtilsTest, DropoutProbabilityTest) {
     int32 total_num_trees = 0;
     for (int i = 0; i < kNumRuns; ++i) {
       // draw random seeds
-      uint random_generator_seed = static_cast<uint>(std::clock());
+      uint random_generator_seed =
+          static_cast<uint>(Env::Default()->NowMicros());
       uint32 seed = rand_r(&random_generator_seed) % 100 + i;
       TF_EXPECT_OK(DropoutUtils::DropOutTrees(seed, config, trees_not_to_drop,
                                               weights_, &dropped_trees,
diff --git a/tensorflow/contrib/boosted_trees/ops/ensemble_optimizer_ops.cc b/tensorflow/contrib/boosted_trees/ops/ensemble_optimizer_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b5ea5e7849dbc3aa0fe670878a8040357deda23b
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/ops/ensemble_optimizer_ops.cc
@@ -0,0 +1,44 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("AddTreesToEnsemble")
+    .Input("tree_ensemble_handle: resource")
+    .Input("ensemble_to_add: string")
+    .Input("feature_column_usage_counts_handle: Ref(int64)")
+    .Input("feature_column_usage_counts_to_add: int64")
+    .Input("feature_column_gains_handle: Ref(float)")
+    .Input("feature_column_gains_to_add: float")
+    .Input("drop_out_tree_indices_weights: float")
+    .Input("learning_rate: float")
+    .SetShapeFn(shape_inference::NoOutputs)
+    .Doc(R"doc(
+Synchronously adds a tree ensemble to a an existing tree ensemble variable.
+tree_ensemble_handle: Handle to the ensemble variable.
+ensemble_to_add: Serialized DecisionTreeConfig proto of the tree.
+feature_column_usage_counts_handle: Handle to the feature column usage counts variable.
+feature_column_usage_counts_to_add: Rank 1 Tensor holding feature column usage counts to add.
+feature_column_gains_handle: Handle to the feature column gains variable.
+feature_column_gains_to_add: Rank 1 Tensor holding feature column gains to add.
+drop_out_tree_indices_weights: Rank 2 Tensor containing dropped trees indices
+and original weights of those trees during prediction.
+learning_rate: The learning rate that the tuner found for this iteration.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/ops/model_ops.cc b/tensorflow/contrib/boosted_trees/ops/model_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c490c765cfbd4f3342d6c157904a4b90bbf27217
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/ops/model_ops.cc
@@ -0,0 +1,114 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+namespace gtflow {
+
+REGISTER_RESOURCE_HANDLE_OP(DecisionTreeEnsembleResource);
+
+REGISTER_OP("TreeEnsembleIsInitializedOp")
+    .Input("tree_ensemble_handle: resource")
+    .Output("is_initialized: bool")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Checks whether a tree ensemble has been initialized.
+)doc");
+
+REGISTER_OP("CreateTreeEnsembleVariable")
+    .Input("tree_ensemble_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("tree_ensemble_config: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused_input));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Creates a tree ensemble model and returns a handle to it.
+
+tree_ensemble_handle: Handle to the tree ensemble resource to be created.
+stamp_token: Token to use as the initial value of the resource stamp.
+tree_ensemble_config: Serialized proto of the tree ensemble.
+)doc");
+
+REGISTER_OP("TreeEnsembleStampToken")
+    .Input("tree_ensemble_handle: resource")
+    .Output("stamp_token: int64")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Retrieves the tree ensemble resource stamp token.
+
+tree_ensemble_handle: Handle to the tree ensemble.
+stamp_token: Stamp token of the tree ensemble resource.
+)doc");
+
+REGISTER_OP("TreeEnsembleSerialize")
+    .Input("tree_ensemble_handle: resource")
+    .Output("stamp_token: int64")
+    .Output("tree_ensemble_config: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      c->set_output(1, c->Scalar());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Serializes the tree ensemble to a proto.
+
+tree_ensemble_handle: Handle to the tree ensemble.
+stamp_token: Stamp token of the tree ensemble resource.
+tree_ensemble_config: Serialized proto of the ensemble.
+)doc");
+
+REGISTER_OP("TreeEnsembleDeserialize")
+    .Input("tree_ensemble_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("tree_ensemble_config: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused_input));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Deserializes a serialized tree ensemble config and replaces current tree
+ensemble.
+
+tree_ensemble_handle: Handle to the tree ensemble.
+stamp_token: Token to use as the new value of the resource stamp.
+tree_ensemble_config: Serialized proto of the ensemble.
+)doc");
+
+}  // namespace gtflow
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/ops/prediction_ops.cc b/tensorflow/contrib/boosted_trees/ops/prediction_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..31635906240d582f8ebbb9c8d14f1b2431409bc3
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/ops/prediction_ops.cc
@@ -0,0 +1,135 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/proto/learner.pb.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+using tensorflow::boosted_trees::learner::LearnerConfig;
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+
+static Status ApplyGradientTreesPredictionShapeFn(InferenceContext* c) {
+  string learner_config_str;
+  // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
+  c->GetAttr("learner_config", &learner_config_str).IgnoreError();
+  LearnerConfig learner_config;
+  ParseProtoUnlimited(&learner_config, learner_config_str);
+
+  bool reduce_dim;
+  c->GetAttr("reduce_dim", &reduce_dim).IgnoreError();
+  // Sets the shape of the output as a matrix.
+  c->set_output(0, {c->Matrix(InferenceContext::kUnknownDim,
+                              reduce_dim ? learner_config.num_classes() - 1
+                                         : learner_config.num_classes())});
+  c->set_output(1, {c->Matrix(InferenceContext::kUnknownDim,
+                              reduce_dim ? learner_config.num_classes() - 1
+                                         : learner_config.num_classes())});
+  c->set_output(2, {c->Vector(InferenceContext::kUnknownDim)});
+  return Status::OK();
+}
+
+REGISTER_OP("GradientTreesPrediction")
+    .Attr("learner_config: string")
+    .Attr("num_dense_float_features: int >= 0")
+    .Attr("num_sparse_float_features: int >= 0")
+    .Attr("num_sparse_int_features: int >= 0")
+    .Attr("use_locking: bool = false")
+    .Attr("apply_dropout: bool")
+    .Attr("apply_averaging: bool")
+    .Attr("center_bias: bool")
+    .Attr("reduce_dim: bool")
+    .Input("tree_ensemble_handle: resource")
+    .Input("seed: int64")
+    .Input("dense_float_features: num_dense_float_features * float")
+    .Input("sparse_float_feature_indices: num_sparse_float_features * int64")
+    .Input("sparse_float_feature_values: num_sparse_float_features * float")
+    .Input("sparse_float_feature_shapes: num_sparse_float_features * int64")
+    .Input("sparse_int_feature_indices: num_sparse_int_features * int64")
+    .Input("sparse_int_feature_values: num_sparse_int_features * int64")
+    .Input("sparse_int_feature_shapes: num_sparse_int_features * int64")
+    .Output("predictions: float")
+    .Output("no_dropout_predictions: float")
+    .Output("drop_out_tree_indices_weights: float")
+    .SetShapeFn(ApplyGradientTreesPredictionShapeFn)
+    .Doc(R"doc(
+Runs multiple additive regression forests predictors on input instances
+and computes the final prediction for each class.
+
+learner_config: Config for the learner of type LearnerConfig proto. Prediction
+ops for now uses only LearningRateDropoutDrivenConfig config from the learner.
+num_dense_float_features: Number of dense float features.
+num_sparse_float_features: Number of sparse float features.
+num_sparse_int_features: Number of sparse int features.
+use_locking: Whether to use locking.
+seed: random seed to be used for dropout.
+reduce_dim: whether to reduce the dimension (legacy impl) or not.
+apply_dropout: whether to apply dropout during prediction.
+apply_averaging: whether averaging of tree ensembles should take place. If set
+to true, will be based on AveragingConfig from learner_config.
+tree_ensemble_handle: The handle to the tree ensemble.
+dense_float_features: Rank 2 Tensors containing dense float feature values.
+sparse_float_feature_indices: Rank 2 Tensors containing sparse float indices.
+sparse_float_feature_values: Rank 1 Tensors containing sparse float values.
+sparse_float_feature_shapes: Rank 1 Tensors containing sparse float shapes.
+sparse_int_feature_indices: Rank 2 Tensors containing sparse int indices.
+sparse_int_feature_values: Rank 1 Tensors containing sparse int values.
+sparse_int_feature_shapes: Rank 1 Tensors containing sparse int shapes.
+predictions: Rank 2 Tensor containing predictions per example per class.
+no_dropout_predictions: The same as predictions, but using all trees (even
+those that were dropped due to dropout).
+drop_out_tree_indices_weights: Tensor of Rank 2 containing dropped trees indices
+and original weights of those trees during prediction.
+)doc");
+
+REGISTER_OP("GradientTreesPartitionExamples")
+    .Attr("num_dense_float_features: int >= 0")
+    .Attr("num_sparse_float_features: int >= 0")
+    .Attr("num_sparse_int_features: int >= 0")
+    .Attr("use_locking: bool = false")
+    .Input("tree_ensemble_handle: resource")
+    .Input("dense_float_features: num_dense_float_features * float")
+    .Input("sparse_float_feature_indices: num_sparse_float_features * int64")
+    .Input("sparse_float_feature_values: num_sparse_float_features * float")
+    .Input("sparse_float_feature_shapes: num_sparse_float_features * int64")
+    .Input("sparse_int_feature_indices: num_sparse_int_features * int64")
+    .Input("sparse_int_feature_values: num_sparse_int_features * int64")
+    .Input("sparse_int_feature_shapes: num_sparse_int_features * int64")
+    .Output("partition_ids: int32")
+    .SetShapeFn([](InferenceContext* c) {
+      return c->set_output("partition_ids",
+                           {c->Vector(InferenceContext::kUnknownDim)});
+    })
+    .Doc(R"doc(
+Splits input examples into the leaves of the tree.
+
+num_dense_float_features: Number of dense float features.
+num_sparse_float_features: Number of sparse float features.
+num_sparse_int_features: Number of sparse int features.
+use_locking: Whether to use locking.
+tree_ensemble_handle: The handle to the tree ensemble.
+dense_float_features: Rank 2 Tensors containing dense float feature values.
+sparse_float_feature_indices: Rank 2 Tensors containing sparse float indices.
+sparse_float_feature_values: Rank 1 Tensors containing sparse float values.
+sparse_float_feature_shapes: Rank 1 Tensors containing sparse float shapes.
+sparse_int_feature_indices: Rank 2 Tensors containing sparse int indices.
+sparse_int_feature_values: Rank 1 Tensors containing sparse int values.
+sparse_int_feature_shapes: Rank 1 Tensors containing sparse int shapes.
+partition_ids: Rank 1 Tensor containing partition ids per example.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..58d22a439868c61c5fe0185419c765554176e617
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
@@ -0,0 +1,290 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+namespace gtflow {
+using shape_inference::InferenceContext;
+using shape_inference::DimensionHandle;
+using shape_inference::ShapeHandle;
+
+REGISTER_RESOURCE_HANDLE_OP(QuantileStreamResource);
+
+REGISTER_OP("QuantileAccumulatorIsInitialized")
+    .Input("quantile_accumulator_handle: resource")
+    .Output("is_initialized: bool")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Checks whether a quantile accumulator has been initialized.
+)doc");
+
+REGISTER_OP("CreateQuantileAccumulator")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("max_elements: int = 1099511627776")  // 1 << 40
+    .Attr("epsilon: float")
+    .Attr("num_quantiles: int")
+    .Input("quantile_accumulator_handle: resource")
+    .Input("stamp_token: int64")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Creates a stateful accumulator for quantile summaries.
+
+epsilon: Error bound on the quantile summary.
+num_quantiles: Number of buckets that we create from the data.
+stamp_token: Token to use as the initial value of the resource stamp.
+quantile_accumulator_handle: The handle to the accumulator.
+)doc");
+
+REGISTER_OP("QuantileAccumulatorAddSummaries")
+    .Attr("num_resource_handles: int >= 1")
+    .Input("quantile_accumulator_handles: num_resource_handles * resource")
+    .Input("stamp_token: int64")
+    .Input("summaries: num_resource_handles * string")
+    .SetShapeFn([](InferenceContext* c) {
+      int num_resource_handles;
+      TF_RETURN_IF_ERROR(
+          c->GetAttr("num_resource_handles", &num_resource_handles));
+      // All the inputs are scalars.
+      shape_inference::ShapeHandle unused_input;
+      for (int i = 0; i < 2 * num_resource_handles + 1; ++i) {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 0, &unused_input));
+      }
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Adds each quantile summary to its stream.
+
+quantile_accumulator_handles: The handles to the quantile stream resources.
+stamp_token: Stamp token to validate the Read/Write operation.
+summaries: A list of serialized QuantileSummaryState.
+)doc");
+
+REGISTER_OP("QuantileAccumulatorGetBuckets")
+    .Attr("num_resource_handles: int >= 1")
+    .Input("quantile_accumulator_handles: num_resource_handles * resource")
+    .Input("stamp_token: int64")
+    .Output("are_buckets_ready: num_resource_handles * bool")
+    .Output("buckets: num_resource_handles * float")
+    .SetShapeFn([](InferenceContext* c) {
+      int num_resource_handles;
+      TF_RETURN_IF_ERROR(
+          c->GetAttr("num_resource_handles", &num_resource_handles));
+      for (int i = 0; i < num_resource_handles; ++i) {
+        c->set_output(i, c->Scalar());
+        c->set_output(i + num_resource_handles, c->Vector(c->UnknownDim()));
+      }
+      return Status::OK();
+    })
+
+    .Doc(R"doc(
+Returns quantile buckets created during previous flush of the accumulator.
+
+quantile_accumulator_handles: The handles to the quantile stream resources.
+stamp_token: Stamp token to validate the Read/Write operation.
+are_buckets_ready: Whether the buckets are ready or not.
+buckets: Output quantile summary representing boundaries with "num_quantile"
+    elements.
+)doc");
+
+REGISTER_OP("QuantileAccumulatorFlush")
+    .Input("quantile_accumulator_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("next_stamp_token: int64")
+    .Doc(R"doc(
+Resets quantile summary streams for each column with a new token.
+
+quantile_accumulator_handle: The handle to the accumulator.
+stamp_token: Stamp token for Read/Write operations.
+             Any operation with a mismatching token will be dropped.
+next_stamp_token: Stamp token to be used for the next iteration.
+)doc");
+
+REGISTER_OP("QuantileAccumulatorFlushSummary")
+    .Input("quantile_accumulator_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("next_stamp_token: int64")
+    .Output("output: string")
+    .Doc(R"doc(
+Resets quantile summary stream and returns the summary.
+
+quantile_accumulator_handle: The handle to the accumulator.
+stamp_token: Stamp token for Read/Write operations.
+             Any operation with a mismatching token will be dropped.
+next_stamp_token: Stamp token to be used for the next iteration.
+output: A scalar string that is the a summary of the accumulator.
+)doc");
+
+REGISTER_OP("QuantileAccumulatorSerialize")
+    .Input("quantile_accumulator_handle: resource")
+    .Output("stamp_token: int64")
+    .Output("stream_state: string")
+    .Output("are_buckets_ready: bool")
+    .Output("buckets: float")
+    .Doc(R"doc(
+Serializes the state of the given resource.
+
+quantile_accumulator_handle: The handle to the accumulator.
+stamp_token: Stamp token for Read/Write operations.
+             Any operation with a mismatching token will be dropped.
+stream_state: A serialized QuantileStreamState.
+are_buckets_ready: Whether the buckets are ready or not.
+buckets: Output quantile buckets representing boundaries with "num_quantile"
+    elements.
+)doc");
+
+REGISTER_OP("QuantileAccumulatorDeserialize")
+    .Input("quantile_accumulator_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("stream_state: string")
+    .Input("are_buckets_ready: bool")
+    .Input("buckets: float")
+    .Doc(R"doc(
+Serializes the state of the given resource.
+
+quantile_accumulator_handle: The handle to the accumulator.
+stamp_token: Stamp token for Read/Write operations.
+             Any operation with a mismatching token will be dropped.
+stream_state: A serialized QuantileStreamState.
+are_buckets_ready: Whether the buckets are ready or not.
+buckets: Output quantile summary representing boundaries with "num_quantile"
+    elements.
+)doc");
+
+REGISTER_OP("MakeQuantileSummaries")
+    .Attr("num_dense_features: int >= 0")
+    .Attr("num_sparse_features: int >= 0")
+    .Attr("epsilon: float")
+    .Input("dense_float_features: num_dense_features * float")
+    .Input("sparse_float_feature_indices: num_sparse_features * int64")
+    .Input("sparse_float_feature_values: num_sparse_features * float")
+    .Input("sparse_float_feature_shapes: num_sparse_features * int64")
+    .Input("example_weights: float")
+    .Output("dense_summaries: num_dense_features * string")
+    .Output("sparse_summaries: num_sparse_features * string")
+    .SetShapeFn([](InferenceContext* c) {
+      int num_dense_features;
+      TF_RETURN_IF_ERROR(c->GetAttr("num_dense_features", &num_dense_features));
+      int num_sparse_features;
+      TF_RETURN_IF_ERROR(
+          c->GetAttr("num_sparse_features", &num_sparse_features));
+      ShapeHandle example_weights_shape;
+      int example_weights_index = num_dense_features + num_sparse_features * 3;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(example_weights_index), 2,
+                                     &example_weights_shape));
+      for (int i = 0; i < num_dense_features; ++i) {
+        ShapeHandle dense_feature_shape;
+        DimensionHandle unused_dim;
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 2, &dense_feature_shape));
+        TF_RETURN_IF_ERROR(c->Merge(c->Dim(dense_feature_shape, 0),
+                                    c->Dim(example_weights_shape, 0),
+                                    &unused_dim));
+        c->set_output(i, c->Scalar());
+      }
+      for (int i = 0; i < num_sparse_features; ++i) {
+        c->set_output(i + num_dense_features, c->Scalar());
+      }
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Creates a summary for the given features.
+
+num_dense_features: Number of dense feature groups to compute quantiles on.
+num_sparse_features: Number of sparse feature groups to compute quantiles on.
+epsilon: Error bound on the computed summary.
+dense_float_features: A list of vectors which contains dense values.
+sparse_float_feature_indices: List of rank 2 tensors containing the sparse float
+feature indices.
+sparse_float_feature_values: List of rank 1 tensors containing the sparse float
+feature values.
+sparse_float_feature_shapes: List of rank 1 tensors containing the shape of the
+float feature.
+example_weights: Rank 2 (N, 1) tensor of per-example weights. Should match
+    dense and sparse features shape.
+dense_summaries: A list of serialized QuantileSummaryState for dense columns.
+sparse_summaries: A list of serialized QuantileSummaryState for sparse columns.
+)doc");
+
+REGISTER_OP("QuantileBuckets")
+    .Attr("num_dense_features: int >= 0")
+    .Attr("num_sparse_features: int >= 0")
+    .Attr("dense_config: list(string)")
+    .Attr("sparse_config: list(string)")
+    .Input("dense_float_features: num_dense_features * float")
+    .Input("sparse_float_feature_indices: num_sparse_features * int64")
+    .Input("sparse_float_feature_values: num_sparse_features * float")
+    .Input("sparse_float_feature_shapes: num_sparse_features * int64")
+    .Input("example_weights: float")
+    .Output("dense_buckets: num_dense_features * float")
+    .Output("sparse_buckets: num_sparse_features * float")
+    .Doc(R"doc(
+Computes quantile buckets for a given list of dense and sparse features with
+given example weights.
+
+num_dense_features: Number of dense feature groups to compute quantiles on.
+num_sparse_features: Number of sparse feature groups to compute quantiles on.
+dense_config: Config for computing buckets for dense values.
+Each entry is QuantileConfig proto.
+sparse_config: Config for computing buckets for sparse feature values.
+Each entry is QuantileConfig proto.
+dense_float_features: A list of vectors which contains dense values.
+sparse_float_feature_indices: List of rank 2 tensors containing the sparse float
+feature indices.
+sparse_float_feature_values: List of rank 1 tensors containing the sparse float
+feature values.
+sparse_float_feature_shapes: List of rank 1 tensors containing the shape of the
+float feature.
+example_weights: Rank 1 tensor containing the example weight tensor.
+dense_buckets: Output quantile summary for each dense float tensor
+representing boundaries each with "num_quantile" elements.
+sparse_buckets: Output quantile summary for each sparse float value tensor
+representing boundaries each with "num_quantile" elements.
+)doc");
+
+REGISTER_OP("Quantiles")
+    .Attr("num_dense_features: int >= 0")
+    .Attr("num_sparse_features: int >= 0")
+    .Input("dense_values: num_dense_features * float")
+    .Input("sparse_values: num_sparse_features * float")
+    .Input("dense_buckets: num_dense_features * float")
+    .Input("sparse_buckets: num_sparse_features * float")
+    .Output("dense_quantiles: num_dense_features * int32")
+    .Output("sparse_quantiles: num_sparse_features * int32")
+    .Doc(R"doc(
+Computes quantile for each a given list of dense and sparse feature values using
+the given buckets.
+
+num_dense_features: Number of dense feature groups to generate quantiles for.
+num_sparse_features: Number of sparse feature groups to generate quantiles for.
+dense_values: List of rank 1 tensors containing the dense values.
+sparse_values: List of rank 1 tensors containing the sparse feature values.
+dense_buckets: Quantile summary for each of the dense float tensor.
+sparse_buckets: Quantile summary for each of the sparse feature float tensor.
+dense_quantiles: Rank 1 tensors representing associated quantiles for each of
+dense float tensors.
+sparse_quantiles: Rank 1 tensors representing associated quantiles for each of
+the sparse feature tensors.
+)doc");
+
+}  // namespace gtflow
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..07cfd413bbd389053ff52ca65693445ef28e8ede
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
@@ -0,0 +1,204 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+using shape_inference::DimensionHandle;
+
+REGISTER_OP("BuildDenseInequalitySplits")
+    .Attr("feature_column_group_id: int")
+    .Attr("l1_regularization: float")
+    .Attr("l2_regularization: float")
+    .Attr("tree_complexity_regularization: float")
+    .Attr("min_node_weight: float")
+    .Attr("multiclass_strategy: int")
+    .Input("num_minibatches: int64")
+    .Input("partition_ids: int32")
+    .Input("bucket_ids: int64")
+    .Input("gradients: float32")
+    .Input("hessians: float32")
+    .Input("bucket_boundaries: float32")
+    .Input("class_id: int32")
+    .Output("output_partition_ids: int32")
+    .Output("gains: float32")
+    .Output("split_infos: string")
+    .SetShapeFn([](InferenceContext* c) {
+      DimensionHandle unused_dim;
+      ShapeHandle unused_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_shape));
+
+      ShapeHandle partition_ids_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &partition_ids_shape));
+      ShapeHandle bucket_ids_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &bucket_ids_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                  c->Dim(bucket_ids_shape, 0), &unused_dim));
+      ShapeHandle gradients_shape;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(3), 1, &gradients_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                  c->Dim(gradients_shape, 0), &unused_dim));
+      ShapeHandle hessians_shape;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(4), 1, &hessians_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                  c->Dim(hessians_shape, 0), &unused_dim));
+      ShapeHandle bucket_boundaries_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 1, &bucket_boundaries_shape));
+      c->set_output(0, c->Vector(c->UnknownDim()));
+      c->set_output(1, c->Vector(c->UnknownDim()));
+      c->set_output(2, c->Vector(c->UnknownDim()));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Find the split that has the best gain for the accumulated stats.
+
+num_minibatches: A scalar, the number of times per example gradients & hessians
+    were accumulated. The stats are divided by this to get per example stats.
+partition_ids: A rank 1 tensor of partition IDs.
+bucket_ids: A rank 1 tensor of buckets IDs.
+gradients: A rank 1 tensor of gradients.
+hessians: A rank 1 tensor of hessians.
+bucket_boundaries: A rank 1 tensor, thresholds that were used for bucketization.
+output_partition_ids: A rank 1 tensor, the partition IDs that we created splits
+    for.
+gains: A rank 1 tensor, for the computed gain for the created splits.
+split_infos: A rank 1 tensor of serialized protos which contains the
+    `SplitInfo`s.
+)doc");
+
+REGISTER_OP("BuildSparseInequalitySplits")
+    .Attr("feature_column_group_id: int")
+    .Attr("bias_feature_id: int")
+    .Attr("l1_regularization: float")
+    .Attr("l2_regularization: float")
+    .Attr("tree_complexity_regularization: float")
+    .Attr("min_node_weight: float")
+    .Attr("multiclass_strategy: int")
+    .Input("num_minibatches: int64")
+    .Input("partition_ids: int32")
+    .Input("bucket_ids: int64")
+    .Input("gradients: float32")
+    .Input("hessians: float32")
+    .Input("bucket_boundaries: float32")
+    .Input("class_id: int32")
+    .Output("output_partition_ids: int32")
+    .Output("gains: float32")
+    .Output("split_infos: string")
+    .SetShapeFn([](InferenceContext* c) {
+      DimensionHandle unused_dim;
+      ShapeHandle unused_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_shape));
+
+      ShapeHandle partition_ids_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &partition_ids_shape));
+      ShapeHandle bucket_ids_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &bucket_ids_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                  c->Dim(bucket_ids_shape, 0), &unused_dim));
+      ShapeHandle gradients_shape;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(3), 1, &gradients_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                  c->Dim(gradients_shape, 0), &unused_dim));
+      ShapeHandle hessians_shape;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(4), 1, &hessians_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                  c->Dim(hessians_shape, 0), &unused_dim));
+      ShapeHandle bucket_boundaries_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 1, &bucket_boundaries_shape));
+      c->set_output(0, c->Vector(c->UnknownDim()));
+      c->set_output(1, c->Vector(c->UnknownDim()));
+      c->set_output(2, c->Vector(c->UnknownDim()));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Find the split that has the best gain for the accumulated stats.
+
+num_minibatches: A scalar, the number of times per example gradients & hessians
+    were accumulated. The stats are divided by this to get per example stats.
+partition_ids: A rank 1 tensor of partition IDs.
+bucket_ids: A rank 1 tensor of buckets IDs.
+gradients: A rank 1 tensor of gradients.
+hessians: A rank 1 tensor of hessians.
+bucket_boundaries: A rank 1 tensor, thresholds that were used for bucketization.
+output_partition_ids: A rank 1 tensor, the partition IDs that we created splits
+    for.
+gains: A rank 1 tensor, for the computed gain for the created splits.
+split_infos: A rank 1 tensor of serialized protos which contains the
+    `SplitInfo`s.
+)doc");
+
+REGISTER_OP("BuildCategoricalEqualitySplits")
+    .Attr("feature_column_group_id: int")
+    .Attr("bias_feature_id: int")
+    .Attr("l1_regularization: float")
+    .Attr("l2_regularization: float")
+    .Attr("tree_complexity_regularization: float")
+    .Attr("min_node_weight: float")
+    .Attr("multiclass_strategy: int")
+    .Input("num_minibatches: int64")
+    .Input("partition_ids: int32")
+    .Input("feature_ids: int64")
+    .Input("gradients: float32")
+    .Input("hessians: float32")
+    .Input("class_id: int32")
+    .Output("output_partition_ids: int32")
+    .Output("gains: float32")
+    .Output("split_infos: string")
+    .SetShapeFn([](InferenceContext* c) {
+      DimensionHandle unused_dim;
+      ShapeHandle unused_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_shape));
+
+      ShapeHandle partition_ids_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &partition_ids_shape));
+      ShapeHandle bucket_ids_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &bucket_ids_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                  c->Dim(bucket_ids_shape, 0), &unused_dim));
+      ShapeHandle gradients_shape;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(3), 1, &gradients_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                  c->Dim(gradients_shape, 0), &unused_dim));
+      ShapeHandle hessians_shape;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(4), 1, &hessians_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                  c->Dim(hessians_shape, 0), &unused_dim));
+      c->set_output(0, c->Vector(c->UnknownDim()));
+      c->set_output(1, c->Vector(c->UnknownDim()));
+      c->set_output(2, c->Vector(c->UnknownDim()));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Find the split that has the best gain for the accumulated stats.
+
+num_minibatches: A scalar, the number of times per example gradients & hessians
+    were accumulated. The stats are divided by this to get per example stats.
+partition_ids: A rank 1 tensor of partition IDs.
+feature_ids: A rank 1 tensor of feature IDs.
+gradients: A rank 1 tensor of gradients.
+hessians: A rank 1 tensor of hessians.
+output_partition_ids: A rank 1 tensor, the partition IDs that we created splits
+    for.
+gains: A rank 1 tensor, for the computed gain for the created splits.
+split_infos: A rank 1 tensor of serialized protos which contains the
+    `SplitInfo`s.
+)doc");
+
+}  // namespace tensorflow
+   // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc b/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b986899be2ca4a5fed75f25e8572b9893fb1af0d
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc
@@ -0,0 +1,475 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+namespace gtflow {
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+using shape_inference::DimensionHandle;
+
+REGISTER_RESOURCE_HANDLE_OP(StatsAccumulatorScalarResource);
+
+REGISTER_OP("StatsAccumulatorScalarIsInitialized")
+    .Input("stats_accumulator_handle: resource")
+    .Output("is_initialized: bool")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .Doc(R"doc(
+Checks whether a stats accumulator has been initialized.
+)doc");
+
+REGISTER_OP("CreateStatsAccumulatorScalar")
+    .Input("stats_accumulator_handle: resource")
+    .Input("stamp_token: int64")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      // stamp_token is a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Creates a scalar stats accumulator.
+
+stats_accumulator_handle: handle to the stats accumulator.
+stamp_token: Token to use as the initial value of the resource stamp.
+)doc");
+
+REGISTER_OP("StatsAccumulatorScalarAdd")
+    .Attr("num_resource_handles: int >= 1")
+    .Input("stats_accumulator_handles: num_resource_handles * resource")
+    .Input("stamp_token: int64")
+    .Input("partition_ids: num_resource_handles * int32")
+    .Input("feature_ids: num_resource_handles * int64")
+    .Input("gradients: num_resource_handles * float")
+    .Input("hessians: num_resource_handles * float")
+    .SetShapeFn([](InferenceContext* c) {
+      int num_resource_handles;
+      TF_RETURN_IF_ERROR(
+          c->GetAttr("num_resource_handles", &num_resource_handles));
+      for (int i = 0; i < num_resource_handles; ++i) {
+        ShapeHandle unused_input;
+        DimensionHandle unused_dim;
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 0, &unused_input));
+        TF_RETURN_IF_ERROR(
+            c->WithRank(c->input(num_resource_handles), 0, &unused_input));
+        ShapeHandle partition_ids_shape;
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(num_resource_handles + i + 1),
+                                       1, &partition_ids_shape));
+        ShapeHandle feature_ids_shape;
+        TF_RETURN_IF_ERROR(c->WithRank(
+            c->input(num_resource_handles * 2 + i + 1), 1, &feature_ids_shape));
+        TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                    c->Dim(feature_ids_shape, 0), &unused_dim));
+        ShapeHandle gradients_shape;
+        TF_RETURN_IF_ERROR(c->WithRank(
+            c->input(num_resource_handles * 3 + i + 1), 1, &gradients_shape));
+        TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                    c->Dim(gradients_shape, 0), &unused_dim));
+        ShapeHandle hessians_shape;
+        TF_RETURN_IF_ERROR(c->WithRank(
+            c->input(num_resource_handles * 4 + i + 1), 1, &hessians_shape));
+        TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                    c->Dim(hessians_shape, 0), &unused_dim));
+      }
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Updates the scalar stats accumulator.
+
+stamp_token: Stamp token for Read/Write operations.
+             Any operation with a mismatching token will be dropped.
+stats_accumulator_handles: A list of handles to the stats accumulator.
+partition_ids: A list of vectors of partition_ids.
+feature_ids: A list of vectors of feature_ids.
+gradients: A list of vectors of gradients for each slot in
+    <partition_id, feature_id>.
+hessians: A list of vectors of hessians for each slot in
+    <partition_id, feature_id>.
+)doc");
+
+REGISTER_OP("StatsAccumulatorScalarFlush")
+    .Input("stats_accumulator_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("next_stamp_token: int64")
+    .Output("num_updates: int64")
+    .Output("output_partition_ids: int32")
+    .Output("output_feature_ids: int64")
+    .Output("output_gradients: float")
+    .Output("output_hessians: float")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      c->set_output(1, c->Vector(c->UnknownDim()));
+      c->set_output(2, c->Vector(c->UnknownDim()));
+      c->set_output(3, c->Vector(c->UnknownDim()));
+      c->set_output(4, c->Vector(c->UnknownDim()));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Flushes the scalar stats accumulator to output and resets the internal state.
+
+stats_accumulator_handle: handle to the stats accumulator.
+stamp_token: Stamp token for Read/Write operations.
+             Any operation with a mismatching token will be dropped.
+next_stamp_token: Stamp token for the next iteration.
+num_updates: Number of times stats were added to this accumulator since last
+    flush.
+output_partition_ids A vector of partition_ids for the slots.
+output_feature_ids: A vector of feature_ids for the slots.
+output_gradients: A vector of gradients, with a value for each slot
+                  in <output_partition_id, output_feature_id>.
+output_hessians: A vector of hessians, with a value for each slot
+                 in <output_partition_id, output_feature_id>.
+)doc");
+
+REGISTER_OP("StatsAccumulatorScalarDeserialize")
+    .Input("stats_accumulator_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("num_updates: int64")
+    .Input("partition_ids: int32")
+    .Input("feature_ids: int64")
+    .Input("gradients: float")
+    .Input("hessians: float")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused_input;
+      DimensionHandle unused_dim;
+      // stats_accumulator_handle
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      // stamp_token
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      // num_updates
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused_input));
+      ShapeHandle partition_ids_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &partition_ids_shape));
+      ShapeHandle feature_ids_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 1, &feature_ids_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                  c->Dim(feature_ids_shape, 0), &unused_dim));
+      ShapeHandle gradients_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 1, &gradients_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                  c->Dim(gradients_shape, 0), &unused_dim));
+      ShapeHandle hessians_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 1, &hessians_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                  c->Dim(hessians_shape, 0), &unused_dim));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Resets the scalar stats accumulator with the serialized state.
+
+stats_accumulator_handle: handle to the stats accumulator.
+stamp_token: Stamp token for Read/Write operations.
+             Any operation with a mismatching token will be dropped.
+num_updates: Number of times stats were added to this accumulator since last
+    flush.
+partition_ids: A vector of partition_ids.
+feature_ids: A vector of feature_ids.
+gradients: A vector of gradients for each slot in <partition_id, feature_id>.
+hessians: A vector of hessians for each slot in <partition_id, feature_id>.
+)doc");
+
+REGISTER_OP("StatsAccumulatorScalarSerialize")
+    .Input("stats_accumulator_handle: resource")
+    .Output("stamp_token: int64")
+    .Output("num_updates: int64")
+    .Output("output_partition_ids: int32")
+    .Output("output_feature_ids: int64")
+    .Output("output_gradients: float")
+    .Output("output_hessians: float")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      // stamp_token
+      c->set_output(0, c->Scalar());
+      // num_updates
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Vector(c->UnknownDim()));
+      c->set_output(3, c->Vector(c->UnknownDim()));
+      c->set_output(4, c->Vector(c->UnknownDim()));
+      c->set_output(5, c->Vector(c->UnknownDim()));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Serializes the scalar stats accumulator state.
+
+stats_accumulator_handle: handle to the stats accumulator.
+stamp_token: The current stamp token for the resource.
+num_updates: Number of times stats were added to this accumulator since last
+    flush.
+output_partition_ids A vector of partition_ids for the slots.
+output_feature_ids: A vector of feature_ids for the slots.
+output_gradients: A vector of gradients, with a value for each slot
+                  in <output_partition_id, output_feature_id>.
+output_hessians: A vector of hessians, with a value for each slot
+                 in <output_partition_id, output_feature_id>.
+)doc");
+
+REGISTER_OP("StatsAccumulatorScalarMakeSummary")
+    .Input("partition_ids: int32")
+    .Input("feature_ids: int64")
+    .Input("gradients: float")
+    .Input("hessians: float")
+    .Output("output_partition_ids: int32")
+    .Output("output_feature_ids: int64")
+    .Output("output_gradients: float")
+    .Output("output_hessians: float")
+    .Doc(R"doc(
+)doc");
+
+// Tensor version of the stats accumulator ops.
+REGISTER_RESOURCE_HANDLE_OP(StatsAccumulatorTensorResource);
+
+REGISTER_OP("StatsAccumulatorTensorIsInitialized")
+    .Input("stats_accumulator_handle: resource")
+    .Output("is_initialized: bool")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .Doc(R"doc(
+Checks whether a tensor stats accumulator has been initialized.
+)doc");
+
+REGISTER_OP("CreateStatsAccumulatorTensor")
+    .Input("stats_accumulator_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("per_slot_gradient_shape: int64")
+    .Input("per_slot_hessian_shape: int64")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      // stamp_token is a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &unused_input));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Creates a tensor stats accumulator.
+
+stats_accumulator_handle: handle to the tree ensemble resource to be created.
+stamp_token: Token to use as the initial value of the resource stamp.
+per_slot_gradient_shape: a vector that defines the shape of gradients.
+per_slot_hessian_shape:  a vector that defines the shape of hessians.
+)doc");
+
+REGISTER_OP("StatsAccumulatorTensorAdd")
+    .Attr("num_resource_handles: int >= 1")
+    .Input("stats_accumulator_handles: num_resource_handles * resource")
+    .Input("stamp_token: int64")
+    .Input("partition_ids: num_resource_handles * int32")
+    .Input("feature_ids: num_resource_handles * int64")
+    .Input("gradients: num_resource_handles * float")
+    .Input("hessians: num_resource_handles * float")
+    .SetShapeFn([](InferenceContext* c) {
+      int num_resource_handles;
+      TF_RETURN_IF_ERROR(
+          c->GetAttr("num_resource_handles", &num_resource_handles));
+      for (int i = 0; i < num_resource_handles; ++i) {
+        ShapeHandle unused_input;
+        DimensionHandle unused_dim;
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 0, &unused_input));
+        TF_RETURN_IF_ERROR(
+            c->WithRank(c->input(num_resource_handles), 0, &unused_input));
+        ShapeHandle partition_ids_shape;
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(num_resource_handles + i + 1),
+                                       1, &partition_ids_shape));
+        ShapeHandle feature_ids_shape;
+        TF_RETURN_IF_ERROR(c->WithRank(
+            c->input(num_resource_handles * 2 + i + 1), 1, &feature_ids_shape));
+        TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                    c->Dim(feature_ids_shape, 0), &unused_dim));
+        ShapeHandle gradients_shape;
+        TF_RETURN_IF_ERROR(c->WithRankAtLeast(
+            c->input(num_resource_handles * 3 + i + 1), 2, &gradients_shape));
+        TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                    c->Dim(gradients_shape, 0), &unused_dim));
+        ShapeHandle hessians_shape;
+        TF_RETURN_IF_ERROR(c->WithRankAtLeast(
+            c->input(num_resource_handles * 4 + i + 1), 2, &hessians_shape));
+        TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                    c->Dim(hessians_shape, 0), &unused_dim));
+      }
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Updates the tensor stats accumulator.
+
+stats_accumulator_handles: A list of handles to the stats accumulator.
+stamp_token: Stamp token for Read/Write operations.
+             Any operation with a mismatching token will be dropped.
+partition_ids: A list of vectors of partition_ids.
+feature_ids: A list of vectors of feature_ids.
+gradients: A list of vectors of gradients for each slot in
+    <partition_id, feature_id>.
+hessians: A list of vectors of hessians for each slot in
+    <partition_id, feature_id>.
+)doc");
+
+REGISTER_OP("StatsAccumulatorTensorFlush")
+    .Input("stats_accumulator_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("next_stamp_token: int64")
+    .Output("num_updates: int64")
+    .Output("output_partition_ids: int32")
+    .Output("output_feature_ids: int64")
+    .Output("output_gradients: float")
+    .Output("output_hessians: float")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused_input));
+      // num_updates
+      c->set_output(0, c->Scalar());
+      c->set_output(1, c->Vector(c->UnknownDim()));
+      c->set_output(2, c->Vector(c->UnknownDim()));
+      c->set_output(3, c->UnknownShape());
+      c->set_output(4, c->UnknownShape());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Flushes the stats accumulator to output and resets the internal state.
+
+stats_accumulator_handle: handle to the tree ensemble resource to be created.
+stamp_token: Stamp token for Read/Write operations.
+             Any operation with a mismatching token will be dropped.
+next_stamp_token: Stamp token to be used for the next iteration.
+num_updates: Number of times stats were added to this accumulator since last
+    flush.
+output_partition_ids: A vector of partition_ids for the slots.
+output_feature_ids: A vector of feature_ids for the slots.
+output_gradients: A tensor of gradients, first dimension matches slots
+                  in <partition_id, feature_id>.
+output_hessians: A tensor of hessians, first dimension matches slots
+                 in <partition_id, feature_id>.
+)doc");
+
+REGISTER_OP("StatsAccumulatorTensorDeserialize")
+    .Input("stats_accumulator_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("num_updates: int64")
+    .Input("partition_ids: int32")
+    .Input("feature_ids: int64")
+    .Input("gradients: float")
+    .Input("hessians: float")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused_input;
+      DimensionHandle unused_dim;
+      // stats_accumulator_handle
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      // stamp_token
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      // num_updates
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused_input));
+      ShapeHandle partition_ids_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &partition_ids_shape));
+      ShapeHandle feature_ids_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 1, &feature_ids_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                  c->Dim(feature_ids_shape, 0), &unused_dim));
+      ShapeHandle gradients_shape;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(5), 2, &gradients_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                  c->Dim(gradients_shape, 0), &unused_dim));
+      ShapeHandle hessians_shape;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(6), 2, &hessians_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
+                                  c->Dim(hessians_shape, 0), &unused_dim));
+
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Resets the tensor stats accumulator with the serialized state.
+
+stats_accumulator_handle: handle to the tree ensemble resource to be created.
+stamp_token: Stamp token for Read/Write operations.
+             Any operation with a mismatching token will be dropped.
+num_updates: Number of times stats were added to this accumulator since last
+    flush.
+partition_ids: A vector of partition_ids.
+feature_ids: A vector of feature_ids.
+gradients: A vector of gradients for each slot in <partition_id, feature_id>.
+hessians: A vector of hessians for each slot in <partition_id, feature_id>.
+)doc");
+
+REGISTER_OP("StatsAccumulatorTensorSerialize")
+    .Input("stats_accumulator_handle: resource")
+    .Output("stamp_token: int64")
+    .Output("num_updates: int64")
+    .Output("output_partition_ids: int32")
+    .Output("output_feature_ids: int64")
+    .Output("output_gradients: float")
+    .Output("output_hessians: float")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      // stamp_token
+      c->set_output(0, c->Scalar());
+      // num_updates
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Vector(c->UnknownDim()));
+      c->set_output(3, c->Vector(c->UnknownDim()));
+      c->set_output(4, c->UnknownShape());
+      c->set_output(5, c->UnknownShape());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Serializes the scalar stats accumulator state.
+
+stats_accumulator_handle: handle to the tree ensemble resource to be created.
+stamp_token: Stamp token for Read/Write operations.
+             Any operation with a mismatching token will be dropped.
+num_updates: Number of times stats were added to this accumulator since last
+    flush.
+output_partition_ids: A vector of partition_ids for the slots.
+output_feature_ids: A vector of feature_ids for the slots.
+output_gradients: A tensor of gradients, first dimension matches slots
+                  in <partition_id, feature_id>.
+output_hessians: A tensor of hessians, first dimension matches slots
+                 in <partition_id, feature_id>.
+)doc");
+
+REGISTER_OP("StatsAccumulatorTensorMakeSummary")
+    .Input("partition_ids: int32")
+    .Input("feature_ids: int64")
+    .Input("gradients: float")
+    .Input("hessians: float")
+    .Output("output_partition_ids: int32")
+    .Output("output_feature_ids: int64")
+    .Output("output_gradients: float")
+    .Output("output_hessians: float")
+    .Doc(R"doc(
+Summarizes the stats by summing the <gradients, hessians> that are for the same
+<partition_id, feature_id>.
+
+partition_ids: A vector of partition_ids.
+feature_ids: A vector of feature_ids.
+gradients: A vector of gradients for each slot in <partition_id, feature_id>.
+hessians: A vector of hessians for each slot in <partition_id, feature_id>.
+output_partition_ids: A vector of partition_ids for the slots.
+output_feature_ids: A vector of feature_ids for the slots.
+output_gradients: A tensor of gradients, first dimension matches slots
+                  in <partition_id, feature_id>.
+output_hessians: A tensor of hessians, first dimension matches slots
+                 in <partition_id, feature_id>.
+)doc");
+}  // namespace gtflow
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/ops/training_ops.cc b/tensorflow/contrib/boosted_trees/ops/training_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d2debbe03d73c296f25f3107e7f343becd654ba9
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/ops/training_ops.cc
@@ -0,0 +1,120 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+namespace gtflow {
+
+REGISTER_OP("CenterTreeEnsembleBias")
+    .Attr("learner_config: string")
+    .Attr("centering_epsilon: float = 0.01")
+    .Input("tree_ensemble_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("next_stamp_token: int64")
+    .Input("delta_updates: float")
+    .Output("continue_centering: bool")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &unused_input));
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Centers the tree ensemble bias before adding trees based on feature splits.
+
+learner_config: Config for the learner of type LearnerConfig proto.
+tree_ensemble_handle: Handle to the ensemble variable.
+stamp_token: Stamp token for validating operation consistency.
+next_stamp_token: Stamp token to be used for the next iteration.
+delta_updates: Rank 1 Tensor containing delta updates per bias dimension.
+continue_centering: Scalar indicating whether more centering is needed.
+)doc");
+
+REGISTER_OP("GrowTreeEnsemble")
+    .Attr("learner_config: string")
+    .Attr("num_handlers: int >= 0")
+    .Attr("center_bias: bool")
+    .Input("tree_ensemble_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("next_stamp_token: int64")
+    .Input("learning_rate: float")
+    .Input("dropout_seed: int64")
+    .Input("partition_ids: num_handlers * int32")
+    .Input("gains: num_handlers * float")
+    .Input("splits: num_handlers * string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused_input));
+      // Dropout seed.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused_input));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Grows the tree ensemble by either adding a layer to the last tree being grown
+or by starting a new tree.
+
+learner_config: Config for the learner of type LearnerConfig proto.
+num_handlers: Number of handlers generating candidates.
+tree_ensemble_handle: Handle to the ensemble variable.
+stamp_token: Stamp token for validating operation consistency.
+next_stamp_token: Stamp token to be used for the next iteration.
+learning_rate: Scalar learning rate.
+partition_ids: List of Rank 1 Tensors containing partition Id per candidate.
+gains: List of Rank 1 Tensors containing gains per candidate.
+splits: List of Rank 1 Tensors containing serialized SplitInfo protos per candidate.
+)doc");
+
+REGISTER_OP("TreeEnsembleStats")
+    .Input("tree_ensemble_handle: resource")
+    .Input("stamp_token: int64")
+    .Output("num_trees: int64")
+    .Output("num_layers: int64")
+    .Output("active_tree: int64")
+    .Output("active_layer: int64")
+    .Output("attempted_trees: int64")
+    .Output("attempted_layers: int64")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      c->set_output(3, c->Scalar());
+      c->set_output(4, c->Scalar());
+      c->set_output(5, c->Scalar());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Retrieves stats related to the tree ensemble.
+
+tree_ensemble_handle: Handle to the ensemble variable.
+stamp_token: Stamp token for validating operation consistency.
+num_trees: Scalar indicating the number of finalized trees in the ensemble.
+num_layers: Scalar indicating the number of layers in the ensemble.
+active_tree: Scalar indicating the active tree being trained.
+active_layer: Scalar indicating the active layer being trained.
+)doc");
+
+}  // namespace gtflow
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/proto/BUILD b/tensorflow/contrib/boosted_trees/proto/BUILD
index c99d8849bd59c42870a78e284e51626f081b858f..9a61e163eb5ff51dc75de4e40e0f43b090d03c0c 100644
--- a/tensorflow/contrib/boosted_trees/proto/BUILD
+++ b/tensorflow/contrib/boosted_trees/proto/BUILD
@@ -33,6 +33,16 @@ tf_proto_library(
     visibility = ["//visibility:public"],
 )
 
+tf_proto_library(
+    name = "split_info_proto",
+    srcs = ["split_info.proto"],
+    cc_api_version = 2,
+    protodeps = [
+        ":tree_config_proto",
+    ],
+    visibility = ["//visibility:public"],
+)
+
 tf_proto_library(
     name = "tree_config_proto",
     srcs = ["tree_config.proto"],
diff --git a/tensorflow/contrib/boosted_trees/proto/split_info.proto b/tensorflow/contrib/boosted_trees/proto/split_info.proto
new file mode 100644
index 0000000000000000000000000000000000000000..a300c24c8ec507dea0af662b2361d408a2085237
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/proto/split_info.proto
@@ -0,0 +1,19 @@
+syntax = "proto3";
+
+option cc_enable_arenas = true;
+
+package tensorflow.boosted_trees.learner;
+
+import "tensorflow/contrib/boosted_trees/proto/tree_config.proto";
+
+// Gathered information for a split node.
+message SplitInfo {
+  // The split node without the child nodes attached.
+  tensorflow.boosted_trees.trees.TreeNode split_node = 1;
+
+  // Left Leaf node.
+  tensorflow.boosted_trees.trees.Leaf left_child = 2;
+
+  // Right Leaf node.
+  tensorflow.boosted_trees.trees.Leaf right_child = 3;
+}
diff --git a/tensorflow/contrib/boosted_trees/python/__init__.py b/tensorflow/contrib/boosted_trees/python/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..60b0476a442bc580e53dd6b68858c370590a25b7
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/__init__.py
@@ -0,0 +1,22 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Gradient boosted trees implementation in tensorflow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,wildcard-import
+from tensorflow.contrib.boosted_trees.python.ops import *
+# pylint: enable=unused-import,wildcard-import
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/ensemble_optimizer_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/ensemble_optimizer_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..842e0caeca9734e44333a9d0ccdc3f6c9d64cfc3
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/ensemble_optimizer_ops_test.py
@@ -0,0 +1,351 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the GTFlow ensemble optimization ops.
+
+The tests cover:
+- Adding a newly built tree to an existing ensemble
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.boosted_trees.proto import tree_config_pb2
+from tensorflow.contrib.boosted_trees.python.ops import ensemble_optimizer_ops
+from tensorflow.contrib.boosted_trees.python.ops import model_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+
+def _append_to_leaf(leaf, class_id, weight):
+  """Helper method for building tree leaves.
+
+  Appends weight contributions for the given class index to a leaf node.
+
+  Args:
+    leaf: leaf node to append to, int
+    class_id: class Id for the weight update, int
+    weight: weight contribution value, float
+  """
+  leaf.sparse_vector.index.append(class_id)
+  leaf.sparse_vector.value.append(weight)
+
+
+class EnsembleOptimizerOpsTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    """Create an ensemble of 2 trees."""
+    super(EnsembleOptimizerOpsTest, self).setUp()
+    self._tree_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
+    # First tree.
+    tree_1 = self._tree_ensemble.trees.add()
+    _append_to_leaf(tree_1.nodes.add().leaf, 0, 0.4)
+    _append_to_leaf(tree_1.nodes.add().leaf, 1, 0.6)
+    # Second tree.
+    tree_2 = self._tree_ensemble.trees.add()
+    _append_to_leaf(tree_2.nodes.add().leaf, 0, 1)
+    _append_to_leaf(tree_2.nodes.add().leaf, 1, 0)
+
+    self._tree_ensemble.tree_weights.append(1.0)
+    self._tree_ensemble.tree_weights.append(1.0)
+
+    meta_1 = self._tree_ensemble.tree_metadata.add()
+    meta_1.num_tree_weight_updates = 2
+    meta_2 = self._tree_ensemble.tree_metadata.add()
+    meta_2.num_tree_weight_updates = 3
+
+    # Ensemble to be added.
+    self._ensemble_to_add = tree_config_pb2.DecisionTreeEnsembleConfig()
+
+    self._tree_to_add = self._ensemble_to_add.trees.add()
+    _append_to_leaf(self._tree_to_add.nodes.add().leaf, 0, 0.3)
+    _append_to_leaf(self._tree_to_add.nodes.add().leaf, 1, 0.7)
+
+  def testWithEmptyEnsemble(self):
+    with self.test_session():
+      # Create an empty ensemble.
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0, tree_ensemble_config="", name="empty")
+
+      # Create zero feature importance.
+      feature_usage_counts = variables.Variable(
+          initial_value=array_ops.zeros([1], dtypes.int64),
+          name="feature_usage_counts",
+          trainable=False)
+      feature_gains = variables.Variable(
+          initial_value=array_ops.zeros([1], dtypes.float32),
+          name="feature_gains",
+          trainable=False)
+
+      resources.initialize_resources(resources.shared_resources()).run()
+      variables.initialize_all_variables().run()
+
+      with ops.control_dependencies([
+          ensemble_optimizer_ops.add_trees_to_ensemble(
+              tree_ensemble_handle,
+              self._ensemble_to_add.SerializeToString(),
+              feature_usage_counts, [2],
+              feature_gains, [0.4], [[]],
+              learning_rate=1.0)
+      ]):
+        result = model_ops.tree_ensemble_serialize(tree_ensemble_handle)[1]
+
+      # Output.
+      output_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output_ensemble.ParseFromString(result.eval())
+      self.assertProtoEquals(self._tree_to_add, output_ensemble.trees[0])
+      self.assertEqual(1, len(output_ensemble.trees))
+
+      self.assertAllEqual([1.0], output_ensemble.tree_weights)
+
+      self.assertEqual(1,
+                       output_ensemble.tree_metadata[0].num_tree_weight_updates)
+
+      self.assertAllEqual([2], feature_usage_counts.eval())
+      self.assertArrayNear([0.4], feature_gains.eval(), 1e-6)
+
+  def testWithExistingEnsemble(self):
+    with self.test_session():
+      # Create existing tree ensemble.
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=self._tree_ensemble.SerializeToString(),
+          name="existing")
+      # Create non-zero feature importance.
+      feature_usage_counts = variables.Variable(
+          initial_value=np.array([0, 4, 1], np.int64),
+          name="feature_usage_counts",
+          trainable=False)
+      feature_gains = variables.Variable(
+          initial_value=np.array([0.0, 0.3, 0.05], np.float32),
+          name="feature_gains",
+          trainable=False)
+
+      resources.initialize_resources(resources.shared_resources()).run()
+      variables.initialize_all_variables().run()
+      output_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
+      with ops.control_dependencies([
+          ensemble_optimizer_ops.add_trees_to_ensemble(
+              tree_ensemble_handle,
+              self._ensemble_to_add.SerializeToString(),
+              feature_usage_counts, [1, 2, 0],
+              feature_gains, [0.02, 0.1, 0.0], [[], []],
+              learning_rate=1)
+      ]):
+        output_ensemble.ParseFromString(
+            model_ops.tree_ensemble_serialize(tree_ensemble_handle)[1].eval())
+
+      # Output.
+      self.assertEqual(3, len(output_ensemble.trees))
+      self.assertProtoEquals(self._tree_to_add, output_ensemble.trees[2])
+
+      self.assertAllEqual([1.0, 1.0, 1.0], output_ensemble.tree_weights)
+
+      self.assertEqual(2,
+                       output_ensemble.tree_metadata[0].num_tree_weight_updates)
+      self.assertEqual(3,
+                       output_ensemble.tree_metadata[1].num_tree_weight_updates)
+      self.assertEqual(1,
+                       output_ensemble.tree_metadata[2].num_tree_weight_updates)
+      self.assertAllEqual([1, 6, 1], feature_usage_counts.eval())
+      self.assertArrayNear([0.02, 0.4, 0.05], feature_gains.eval(), 1e-6)
+
+  def testWithExistingEnsembleAndDropout(self):
+    with self.test_session():
+      tree_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Add 10 trees with some weights.
+      for i in range(0, 10):
+        tree = tree_ensemble.trees.add()
+        _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
+        tree_ensemble.tree_weights.append(i + 1)
+        meta = tree_ensemble.tree_metadata.add()
+        meta.num_tree_weight_updates = 1
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble.SerializeToString(),
+          name="existing")
+      # Create non-zero feature importance.
+      feature_usage_counts = variables.Variable(
+          initial_value=np.array([2, 3], np.int64),
+          name="feature_usage_counts",
+          trainable=False)
+      feature_gains = variables.Variable(
+          initial_value=np.array([0.0, 0.3], np.float32),
+          name="feature_gains",
+          trainable=False)
+
+      resources.initialize_resources(resources.shared_resources()).run()
+      variables.initialize_all_variables().run()
+
+      dropped = [1, 6, 8]
+      dropped_original_weights = [2.0, 7.0, 9.0]
+
+      output_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
+      with ops.control_dependencies([
+          ensemble_optimizer_ops.add_trees_to_ensemble(
+              tree_ensemble_handle,
+              self._ensemble_to_add.SerializeToString(),
+              feature_usage_counts, [1, 2],
+              feature_gains, [0.5, 0.3], [dropped, dropped_original_weights],
+              learning_rate=0.1)
+      ]):
+        output_ensemble.ParseFromString(
+            model_ops.tree_ensemble_serialize(tree_ensemble_handle)[1].eval())
+
+      # Output.
+      self.assertEqual(11, len(output_ensemble.trees))
+      self.assertProtoEquals(self._tree_to_add, output_ensemble.trees[10])
+      self.assertAllClose(4.5, output_ensemble.tree_weights[10])
+
+      self.assertAllClose([1., 1.5, 3., 4., 5., 6., 5.25, 8., 6.75, 10., 4.5],
+                          output_ensemble.tree_weights)
+
+      self.assertEqual(1,
+                       output_ensemble.tree_metadata[0].num_tree_weight_updates)
+      self.assertEqual(2,
+                       output_ensemble.tree_metadata[1].num_tree_weight_updates)
+      self.assertEqual(1,
+                       output_ensemble.tree_metadata[2].num_tree_weight_updates)
+
+      self.assertEqual(1,
+                       output_ensemble.tree_metadata[3].num_tree_weight_updates)
+      self.assertEqual(1,
+                       output_ensemble.tree_metadata[4].num_tree_weight_updates)
+      self.assertEqual(1,
+                       output_ensemble.tree_metadata[5].num_tree_weight_updates)
+      self.assertEqual(2,
+                       output_ensemble.tree_metadata[6].num_tree_weight_updates)
+      self.assertEqual(1,
+                       output_ensemble.tree_metadata[7].num_tree_weight_updates)
+      self.assertEqual(2,
+                       output_ensemble.tree_metadata[8].num_tree_weight_updates)
+      self.assertEqual(1,
+                       output_ensemble.tree_metadata[9].num_tree_weight_updates)
+      self.assertEqual(
+          1, output_ensemble.tree_metadata[10].num_tree_weight_updates)
+      self.assertAllEqual([3, 5], feature_usage_counts.eval())
+      self.assertArrayNear([0.05, 0.33], feature_gains.eval(), 1e-6)
+
+  def testWithEmptyEnsembleAndShrinkage(self):
+    with self.test_session():
+      # Add shrinkage config.
+      learning_rate = 0.0001
+      tree_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble.SerializeToString(),
+          name="existing")
+
+      # Create zero feature importance.
+      feature_usage_counts = variables.Variable(
+          initial_value=np.array([0, 0], np.int64),
+          name="feature_usage_counts",
+          trainable=False)
+      feature_gains = variables.Variable(
+          initial_value=np.array([0.0, 0.0], np.float32),
+          name="feature_gains",
+          trainable=False)
+
+      resources.initialize_resources(resources.shared_resources()).run()
+      variables.initialize_all_variables().run()
+
+      output_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
+      with ops.control_dependencies([
+          ensemble_optimizer_ops.add_trees_to_ensemble(
+              tree_ensemble_handle,
+              self._ensemble_to_add.SerializeToString(),
+              feature_usage_counts, [1, 2],
+              feature_gains, [0.5, 0.3], [[], []],
+              learning_rate=learning_rate)
+      ]):
+        output_ensemble.ParseFromString(
+            model_ops.tree_ensemble_serialize(tree_ensemble_handle)[1].eval())
+
+      # New tree is added with shrinkage weight.
+      self.assertAllClose([learning_rate], output_ensemble.tree_weights)
+      self.assertEqual(1,
+                       output_ensemble.tree_metadata[0].num_tree_weight_updates)
+      self.assertAllEqual([1, 2], feature_usage_counts.eval())
+      self.assertArrayNear([0.5 * learning_rate, 0.3 * learning_rate],
+                           feature_gains.eval(), 1e-6)
+
+  def testWithExistingEnsembleAndShrinkage(self):
+    with self.test_session():
+      # Add shrinkage config.
+      learning_rate = 0.0001
+      tree_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Add 10 trees with some weights.
+      for i in range(0, 5):
+        tree = tree_ensemble.trees.add()
+        _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
+        tree_ensemble.tree_weights.append(i + 1)
+        meta = tree_ensemble.tree_metadata.add()
+        meta.num_tree_weight_updates = 1
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble.SerializeToString(),
+          name="existing")
+
+      # Create non-zero feature importance.
+      feature_usage_counts = variables.Variable(
+          initial_value=np.array([4, 7], np.int64),
+          name="feature_usage_counts",
+          trainable=False)
+      feature_gains = variables.Variable(
+          initial_value=np.array([0.2, 0.8], np.float32),
+          name="feature_gains",
+          trainable=False)
+
+      resources.initialize_resources(resources.shared_resources()).run()
+      variables.initialize_all_variables().run()
+
+      output_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
+      with ops.control_dependencies([
+          ensemble_optimizer_ops.add_trees_to_ensemble(
+              tree_ensemble_handle,
+              self._ensemble_to_add.SerializeToString(),
+              feature_usage_counts, [1, 2],
+              feature_gains, [0.5, 0.3], [[], []],
+              learning_rate=learning_rate)
+      ]):
+        output_ensemble.ParseFromString(
+            model_ops.tree_ensemble_serialize(tree_ensemble_handle)[1].eval())
+
+      # The weights of previous trees stayed the same, new tree (LAST) is added
+      # with shrinkage weight.
+      self.assertAllClose([1.0, 2.0, 3.0, 4.0, 5.0, learning_rate],
+                          output_ensemble.tree_weights)
+
+      # Check that all number of updates are equal to 1 (e,g, no old tree weight
+      # got adjusted.
+      for i in range(0, 6):
+        self.assertEqual(
+            1, output_ensemble.tree_metadata[i].num_tree_weight_updates)
+
+      # Ensure feature importance was aggregated correctly.
+      self.assertAllEqual([5, 9], feature_usage_counts.eval())
+      self.assertArrayNear(
+          [0.2 + 0.5 * learning_rate, 0.8 + 0.3 * learning_rate],
+          feature_gains.eval(), 1e-6)
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e628568543ee5319476669b7576124364d3a5c0
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py
@@ -0,0 +1,333 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the GTFlow model ops.
+
+The tests cover:
+- Loading a model from protobufs.
+- Running Predictions using an existing model.
+- Serializing the model.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import numpy as np
+
+from tensorflow.contrib.boosted_trees.proto import learner_pb2
+from tensorflow.contrib.boosted_trees.proto import tree_config_pb2
+from tensorflow.contrib.boosted_trees.python.ops import ensemble_optimizer_ops
+from tensorflow.contrib.boosted_trees.python.ops import model_ops
+from tensorflow.contrib.boosted_trees.python.ops import prediction_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+from tensorflow.python.training import saver
+
+
+def _append_to_leaf(leaf, c_id, w):
+  """Helper method for building tree leaves.
+
+  Appends weight contributions for the given class index to a leaf node.
+
+  Args:
+    leaf: leaf node to append to.
+    c_id: class Id for the weight update.
+    w: weight contribution value.
+  """
+  leaf.sparse_vector.index.append(c_id)
+  leaf.sparse_vector.value.append(w)
+
+
+def _set_float_split(split, feat_col, thresh, l_id, r_id):
+  """Helper method for building tree float splits.
+
+  Sets split feature column, threshold and children.
+
+  Args:
+    split: split node to update.
+    feat_col: feature column for the split.
+    thresh: threshold to split on forming rule x <= thresh.
+    l_id: left child Id.
+    r_id: right child Id.
+  """
+  split.feature_column = feat_col
+  split.threshold = thresh
+  split.left_id = l_id
+  split.right_id = r_id
+
+
+class ModelOpsTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    """Sets up test for model_ops.
+
+    Create a batch of two examples having one dense float, two sparse float and
+    one sparse int features.
+    The data looks like the following:
+    | Instance | Dense0 | SparseF0 | SparseF1 | SparseI0 |
+    | 0        |  7     |    -3    |          |          |
+    | 1        | -2     |          | 4        |   9,1    |
+    """
+    super(ModelOpsTest, self).setUp()
+    self._dense_float_tensor = np.array([[7.0], [-2.0]])
+    self._sparse_float_indices1 = np.array([[0, 0]])
+    self._sparse_float_values1 = np.array([-3.0])
+    self._sparse_float_shape1 = np.array([2, 1])
+    self._sparse_float_indices2 = np.array([[1, 0]])
+    self._sparse_float_values2 = np.array([4.0])
+    self._sparse_float_shape2 = np.array([2, 1])
+    self._sparse_int_indices1 = np.array([[1, 0], [1, 1]])
+    self._sparse_int_values1 = np.array([9, 1])
+    self._sparse_int_shape1 = np.array([2, 2])
+    self._seed = 123
+
+  def testCreate(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      tree = tree_ensemble_config.trees.add()
+      _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
+      tree_ensemble_config.tree_weights.append(1.0)
+
+      # Prepare learner config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 2
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=3,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="create_tree")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      result, _, _ = prediction_ops.gradient_trees_prediction(
+          tree_ensemble_handle,
+          self._seed, [self._dense_float_tensor], [
+              self._sparse_float_indices1, self._sparse_float_indices2
+          ], [self._sparse_float_values1, self._sparse_float_values2],
+          [self._sparse_float_shape1,
+           self._sparse_float_shape2], [self._sparse_int_indices1],
+          [self._sparse_int_values1], [self._sparse_int_shape1],
+          learner_config=learner_config.SerializeToString(),
+          apply_dropout=False,
+          apply_averaging=False,
+          center_bias=False,
+          reduce_dim=True)
+      self.assertAllClose(result.eval(), [[-0.4], [-0.4]])
+      stamp_token = model_ops.tree_ensemble_stamp_token(tree_ensemble_handle)
+      self.assertEqual(stamp_token.eval(), 3)
+
+  def testSerialization(self):
+    with ops.Graph().as_default() as graph:
+      with self.test_session(graph):
+        tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+        # Bias tree only for second class.
+        tree1 = tree_ensemble_config.trees.add()
+        _append_to_leaf(tree1.nodes.add().leaf, 1, -0.2)
+
+        tree_ensemble_config.tree_weights.append(1.0)
+
+        # Depth 2 tree.
+        tree2 = tree_ensemble_config.trees.add()
+        tree_ensemble_config.tree_weights.append(1.0)
+        _set_float_split(tree2.nodes.add()
+                         .sparse_float_binary_split_default_right.split, 1, 4.0,
+                         1, 2)
+        _set_float_split(tree2.nodes.add().dense_float_binary_split, 0, 9.0, 3,
+                         4)
+        _append_to_leaf(tree2.nodes.add().leaf, 0, 0.5)
+        _append_to_leaf(tree2.nodes.add().leaf, 1, 1.2)
+        _append_to_leaf(tree2.nodes.add().leaf, 0, -0.9)
+
+        tree_ensemble_handle = model_ops.tree_ensemble_variable(
+            stamp_token=7,
+            tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+            name="saver_tree")
+        stamp_token, serialized_config = model_ops.tree_ensemble_serialize(
+            tree_ensemble_handle)
+        resources.initialize_resources(resources.shared_resources()).run()
+        self.assertEqual(stamp_token.eval(), 7)
+        serialized_config = serialized_config.eval()
+
+    with ops.Graph().as_default() as graph:
+      with self.test_session(graph):
+        tree_ensemble_handle2 = model_ops.tree_ensemble_variable(
+            stamp_token=9,
+            tree_ensemble_config=serialized_config,
+            name="saver_tree2")
+        resources.initialize_resources(resources.shared_resources()).run()
+
+        # Prepare learner config.
+        learner_config = learner_pb2.LearnerConfig()
+        learner_config.num_classes = 3
+
+        result, _, _ = prediction_ops.gradient_trees_prediction(
+            tree_ensemble_handle2,
+            self._seed, [self._dense_float_tensor], [
+                self._sparse_float_indices1, self._sparse_float_indices2
+            ], [self._sparse_float_values1, self._sparse_float_values2],
+            [self._sparse_float_shape1,
+             self._sparse_float_shape2], [self._sparse_int_indices1],
+            [self._sparse_int_values1], [self._sparse_int_shape1],
+            learner_config=learner_config.SerializeToString(),
+            apply_dropout=False,
+            apply_averaging=False,
+            center_bias=False,
+            reduce_dim=True)
+
+        # Re-serialize tree.
+        stamp_token2, serialized_config2 = model_ops.tree_ensemble_serialize(
+            tree_ensemble_handle2)
+
+        # The first example will get bias class 1 -0.2 from first tree and
+        # leaf 2 payload (sparse feature missing) of 0.5 hence [0.5, -0.2],
+        # the second example will get the same bias class 1 -0.2 and leaf 3
+        # payload of class 1 1.2 hence [0.0, 1.0].
+        self.assertEqual(stamp_token2.eval(), 9)
+
+        # Class 2 does have scores in the leaf => it gets score 0.
+        self.assertEqual(serialized_config2.eval(), serialized_config)
+        self.assertAllClose(result.eval(), [[0.5, -0.2], [0, 1.0]])
+
+  def testRestore(self):
+    # Calling self.test_session() without a graph specified results in
+    # TensorFlowTestCase caching the session and returning the same one
+    # every time. In this test, we need to create two different sessions
+    # which is why we also create a graph and pass it to self.test_session()
+    # to ensure no caching occurs under the hood.
+    save_path = os.path.join(self.get_temp_dir(), "restore-test")
+    with ops.Graph().as_default() as graph:
+      with self.test_session(graph) as sess:
+        tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+
+        tree = tree_ensemble_config.trees.add()
+        tree_ensemble_config.tree_metadata.add().is_finalized = True
+        tree_ensemble_config.tree_weights.append(1.0)
+        _append_to_leaf(tree.nodes.add().leaf, 0, -0.1)
+
+        tree_ensemble_config2 = tree_config_pb2.DecisionTreeEnsembleConfig()
+        tree2 = tree_ensemble_config2.trees.add()
+        tree_ensemble_config.tree_weights.append(1.0)
+        _append_to_leaf(tree2.nodes.add().leaf, 0, -1.0)
+
+        tree_ensemble_config3 = tree_config_pb2.DecisionTreeEnsembleConfig()
+        tree3 = tree_ensemble_config3.trees.add()
+        tree_ensemble_config.tree_weights.append(1.0)
+        _append_to_leaf(tree3.nodes.add().leaf, 0, -10.0)
+
+        # Prepare learner config.
+        learner_config = learner_pb2.LearnerConfig()
+        learner_config.num_classes = 2
+
+        tree_ensemble_handle = model_ops.tree_ensemble_variable(
+            stamp_token=3,
+            tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+            name="restore_tree")
+        feature_usage_counts = variables.Variable(
+            initial_value=array_ops.zeros([1], dtypes.int64),
+            name="feature_usage_counts",
+            trainable=False)
+        feature_gains = variables.Variable(
+            initial_value=array_ops.zeros([1], dtypes.float32),
+            name="feature_gains",
+            trainable=False)
+
+        resources.initialize_resources(resources.shared_resources()).run()
+        variables.initialize_all_variables().run()
+        my_saver = saver.Saver()
+
+        with ops.control_dependencies([
+            ensemble_optimizer_ops.add_trees_to_ensemble(
+                tree_ensemble_handle,
+                tree_ensemble_config2.SerializeToString(),
+                feature_usage_counts, [0],
+                feature_gains, [0], [[]],
+                learning_rate=1)
+        ]):
+          result, _, _ = prediction_ops.gradient_trees_prediction(
+              tree_ensemble_handle,
+              self._seed, [self._dense_float_tensor], [
+                  self._sparse_float_indices1, self._sparse_float_indices2
+              ], [self._sparse_float_values1, self._sparse_float_values2],
+              [self._sparse_float_shape1,
+               self._sparse_float_shape2], [self._sparse_int_indices1],
+              [self._sparse_int_values1], [self._sparse_int_shape1],
+              learner_config=learner_config.SerializeToString(),
+              apply_dropout=False,
+              apply_averaging=False,
+              center_bias=False,
+              reduce_dim=True)
+        self.assertAllClose([[-1.1], [-1.1]], result.eval())
+        # Save before adding other trees.
+        val = my_saver.save(sess, save_path)
+        self.assertEqual(save_path, val)
+
+        # Add more trees after saving.
+        with ops.control_dependencies([
+            ensemble_optimizer_ops.add_trees_to_ensemble(
+                tree_ensemble_handle,
+                tree_ensemble_config3.SerializeToString(),
+                feature_usage_counts, [0],
+                feature_gains, [0], [[]],
+                learning_rate=1)
+        ]):
+          result, _, _ = prediction_ops.gradient_trees_prediction(
+              tree_ensemble_handle,
+              self._seed, [self._dense_float_tensor], [
+                  self._sparse_float_indices1, self._sparse_float_indices2
+              ], [self._sparse_float_values1, self._sparse_float_values2],
+              [self._sparse_float_shape1,
+               self._sparse_float_shape2], [self._sparse_int_indices1],
+              [self._sparse_int_values1], [self._sparse_int_shape1],
+              learner_config=learner_config.SerializeToString(),
+              apply_dropout=False,
+              apply_averaging=False,
+              center_bias=False,
+              reduce_dim=True)
+        self.assertAllClose(result.eval(), [[-11.1], [-11.1]])
+
+    # Start a second session.  In that session the parameter nodes
+    # have not been initialized either.
+    with ops.Graph().as_default() as graph:
+      with self.test_session(graph) as sess:
+        tree_ensemble_handle = model_ops.tree_ensemble_variable(
+            stamp_token=0, tree_ensemble_config="", name="restore_tree")
+        my_saver = saver.Saver()
+        my_saver.restore(sess, save_path)
+        result, _, _ = prediction_ops.gradient_trees_prediction(
+            tree_ensemble_handle,
+            self._seed, [self._dense_float_tensor], [
+                self._sparse_float_indices1, self._sparse_float_indices2
+            ], [self._sparse_float_values1, self._sparse_float_values2],
+            [self._sparse_float_shape1,
+             self._sparse_float_shape2], [self._sparse_int_indices1],
+            [self._sparse_int_values1], [self._sparse_int_shape1],
+            learner_config=learner_config.SerializeToString(),
+            apply_dropout=False,
+            apply_averaging=False,
+            center_bias=False,
+            reduce_dim=True)
+        # Make sure we only have the first and second tree.
+        # The third tree was added after the save.
+        self.assertAllClose(result.eval(), [[-1.1], [-1.1]])
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b64235bb253d222975f2e17bf8c84613c433f3f
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
@@ -0,0 +1,1351 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the GTFlow prediction Ops.
+
+The tests cover tree traversal and additive models for single and
+multi class problems.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.boosted_trees.proto import learner_pb2
+from tensorflow.contrib.boosted_trees.proto import tree_config_pb2
+from tensorflow.contrib.boosted_trees.python.ops import model_ops
+from tensorflow.contrib.boosted_trees.python.ops import prediction_ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import googletest
+
+
+def _append_to_leaf(leaf, c_id, w):
+  """Helper method for building tree leaves.
+
+  Appends weight contributions for the given class index to a leaf node.
+
+  Args:
+    leaf: leaf node to append to.
+    c_id: class Id for the weight update.
+    w: weight contribution value.
+  """
+  leaf.sparse_vector.index.append(c_id)
+  leaf.sparse_vector.value.append(w)
+
+
+def _append_multi_values_to_leaf(leaf, c_ids, w):
+  """Helper method for building tree leaves with sparse vector of values.
+
+  Appends weight contributions for the given class index to a leaf node.
+
+  Args:
+    leaf: leaf node to append to.
+    c_ids: list of class ids
+    w: corresponding weight contributions for the classes in c_ids
+  """
+  for i in range(len(c_ids)):
+    leaf.sparse_vector.index.append(c_ids[i])
+    leaf.sparse_vector.value.append(w[i])
+
+
+def _append_multi_values_to_dense_leaf(leaf, w):
+  """Helper method for building tree leaves with dense vector of values.
+
+  Appends weight contributions to a leaf. w is assumed to be for all classes.
+
+  Args:
+    leaf: leaf node to append to.
+    w: corresponding weight contributions for all classes.
+  """
+  for x in w:
+    leaf.vector.value.append(x)
+
+
+def _set_float_split(split, feat_col, thresh, l_id, r_id):
+  """Helper method for building tree float splits.
+
+  Sets split feature column, threshold and children.
+
+  Args:
+    split: split node to update.
+    feat_col: feature column for the split.
+    thresh: threshold to split on forming rule x <= thresh.
+    l_id: left child Id.
+    r_id: right child Id.
+  """
+  split.feature_column = feat_col
+  split.threshold = thresh
+  split.left_id = l_id
+  split.right_id = r_id
+
+
+def _set_categorical_id_split(split, feat_col, feat_id, l_id, r_id):
+  """Helper method for building tree categorical id splits.
+
+  Sets split feature column, feature id and children.
+
+  Args:
+    split: categorical id split node.
+    feat_col: feature column for the split.
+    feat_id: feature id forming rule x == id.
+    l_id: left child Id.
+    r_id: right child Id.
+  """
+  split.feature_column = feat_col
+  split.feature_id = feat_id
+  split.left_id = l_id
+  split.right_id = r_id
+
+
+class PredictionOpsTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    """Sets up the prediction tests.
+
+    Create a batch of two examples having one dense float, two sparse float and
+    one sparse int features.
+    The data looks like the following:
+    | Instance | Dense0 | SparseF0 | SparseF1 | SparseI0 |
+    | 0        |  7     |    -3    |          |    9,1   |
+    | 1        | -2     |          | 4        |          |
+    """
+    super(PredictionOpsTest, self).setUp()
+    self._dense_float_tensor = np.array([[7.0], [-2.0]])
+    self._sparse_float_indices1 = np.array([[0, 0]])
+    self._sparse_float_values1 = np.array([-3.0])
+    self._sparse_float_shape1 = np.array([2, 1])
+    self._sparse_float_indices2 = np.array([[1, 0]])
+    self._sparse_float_values2 = np.array([4.0])
+    self._sparse_float_shape2 = np.array([2, 1])
+    self._sparse_int_indices1 = np.array([[0, 0], [0, 1]])
+    self._sparse_int_values1 = np.array([9, 1])
+    self._sparse_int_shape1 = np.array([2, 2])
+    self._seed = 123
+
+  def testEmptyEnsemble(self):
+    with self.test_session():
+      # Empty tree ensenble.
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="empty")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 2
+
+      result, result_no_dropout, dropout_info = (
+          prediction_ops.gradient_trees_prediction(
+              tree_ensemble_handle,
+              self._seed, [self._dense_float_tensor], [
+                  self._sparse_float_indices1, self._sparse_float_indices2
+              ], [self._sparse_float_values1, self._sparse_float_values2],
+              [self._sparse_float_shape1,
+               self._sparse_float_shape2], [self._sparse_int_indices1],
+              [self._sparse_int_values1], [self._sparse_int_shape1],
+              learner_config=learner_config.SerializeToString(),
+              apply_dropout=False,
+              apply_averaging=False,
+              center_bias=False,
+              reduce_dim=True))
+      self.assertAllEqual([[0], [0]], result.eval())
+      self.assertAllEqual(result_no_dropout.eval(), result.eval())
+      # Empty dropout.
+      self.assertAllEqual([[], []], dropout_info.eval())
+
+  def testBiasEnsembleSingleClass(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      tree = tree_ensemble_config.trees.add()
+      tree_ensemble_config.tree_metadata.add().is_finalized = True
+      _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
+
+      tree_ensemble_config.tree_weights.append(1.0)
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="bias")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 2
+
+      result, result_no_dropout, dropout_info = (
+          prediction_ops.gradient_trees_prediction(
+              tree_ensemble_handle,
+              self._seed, [self._dense_float_tensor], [
+                  self._sparse_float_indices1, self._sparse_float_indices2
+              ], [self._sparse_float_values1, self._sparse_float_values2],
+              [self._sparse_float_shape1,
+               self._sparse_float_shape2], [self._sparse_int_indices1],
+              [self._sparse_int_values1], [self._sparse_int_shape1],
+              learner_config=learner_config.SerializeToString(),
+              apply_dropout=False,
+              apply_averaging=False,
+              center_bias=False,
+              reduce_dim=True))
+      self.assertAllClose([[-0.4], [-0.4]], result.eval())
+      self.assertAllEqual(result_no_dropout.eval(), result.eval())
+
+      # Empty dropout.
+      self.assertAllEqual([[], []], dropout_info.eval())
+
+  def testBiasEnsembleMultiClass(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      tree = tree_ensemble_config.trees.add()
+      tree_ensemble_config.tree_metadata.add().is_finalized = True
+      leaf = tree.nodes.add().leaf
+      _append_to_leaf(leaf, 0, -0.4)
+      _append_to_leaf(leaf, 1, 0.9)
+
+      tree_ensemble_config.tree_weights.append(1.0)
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="multiclass")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 3
+
+      result, result_no_dropout, dropout_info = (
+          prediction_ops.gradient_trees_prediction(
+              tree_ensemble_handle,
+              self._seed, [self._dense_float_tensor], [
+                  self._sparse_float_indices1, self._sparse_float_indices2
+              ], [self._sparse_float_values1, self._sparse_float_values2],
+              [self._sparse_float_shape1,
+               self._sparse_float_shape2], [self._sparse_int_indices1],
+              [self._sparse_int_values1], [self._sparse_int_shape1],
+              learner_config=learner_config.SerializeToString(),
+              apply_dropout=False,
+              apply_averaging=False,
+              center_bias=False,
+              reduce_dim=True))
+      self.assertAllClose([[-0.4, 0.9], [-0.4, 0.9]], result.eval())
+      self.assertAllEqual(result_no_dropout.eval(), result.eval())
+
+      # Empty dropout.
+      self.assertAllEqual([[], []], dropout_info.eval())
+
+  def testFullEnsembleSingleClass(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Bias tree.
+      tree1 = tree_ensemble_config.trees.add()
+      tree_ensemble_config.tree_metadata.add().is_finalized = True
+      _append_to_leaf(tree1.nodes.add().leaf, 0, -0.4)
+
+      # Depth 3 tree.
+      tree2 = tree_ensemble_config.trees.add()
+      tree_ensemble_config.tree_metadata.add().is_finalized = True
+      _set_float_split(tree2.nodes.add().dense_float_binary_split, 0, 9.0, 1, 2)
+      _set_float_split(tree2.nodes.add()
+                       .sparse_float_binary_split_default_left.split, 0, -20.0,
+                       3, 4)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, 0.5)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, 1.2)
+      _set_categorical_id_split(tree2.nodes.add().categorical_id_binary_split,
+                                0, 9, 5, 6)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, -0.9)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, 0.7)
+
+      tree_ensemble_config.tree_weights.append(1.0)
+      tree_ensemble_config.tree_weights.append(1.0)
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="full_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 2
+
+      result, result_no_dropout, dropout_info = (
+          prediction_ops.gradient_trees_prediction(
+              tree_ensemble_handle,
+              self._seed, [self._dense_float_tensor], [
+                  self._sparse_float_indices1, self._sparse_float_indices2
+              ], [self._sparse_float_values1, self._sparse_float_values2],
+              [self._sparse_float_shape1,
+               self._sparse_float_shape2], [self._sparse_int_indices1],
+              [self._sparse_int_values1], [self._sparse_int_shape1],
+              learner_config=learner_config.SerializeToString(),
+              apply_dropout=False,
+              apply_averaging=False,
+              center_bias=False,
+              reduce_dim=True))
+
+      # The first example will get bias -0.4 from first tree and
+      # leaf 4 payload of -0.9 hence -1.3, the second example will
+      # get the same bias -0.4 and leaf 3 payload (sparse feature missing)
+      # of 1.2 hence 0.8.
+      self.assertAllClose([[-1.3], [0.8]], result.eval())
+      self.assertAllEqual(result_no_dropout.eval(), result.eval())
+
+      # Empty dropout.
+      self.assertAllEqual([[], []], dropout_info.eval())
+
+  def testExcludeNonFinalTree(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Bias tree.
+      tree1 = tree_ensemble_config.trees.add()
+      tree_ensemble_config.tree_metadata.add().is_finalized = True
+      _append_to_leaf(tree1.nodes.add().leaf, 0, -0.4)
+
+      # Depth 3 tree.
+      tree2 = tree_ensemble_config.trees.add()
+      tree_ensemble_config.tree_metadata.add().is_finalized = False
+      _set_float_split(tree2.nodes.add().dense_float_binary_split, 0, 9.0, 1, 2)
+      _set_float_split(tree2.nodes.add()
+                       .sparse_float_binary_split_default_left.split, 0, -20.0,
+                       3, 4)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, 0.5)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, 1.2)
+      _set_categorical_id_split(tree2.nodes.add().categorical_id_binary_split,
+                                0, 9, 5, 6)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, -0.9)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, 0.7)
+
+      tree_ensemble_config.tree_weights.append(1.0)
+      tree_ensemble_config.tree_weights.append(1.0)
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="full_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 2
+
+      result, result_no_dropout, dropout_info = (
+          prediction_ops.gradient_trees_prediction(
+              tree_ensemble_handle,
+              self._seed, [self._dense_float_tensor], [
+                  self._sparse_float_indices1, self._sparse_float_indices2
+              ], [self._sparse_float_values1, self._sparse_float_values2],
+              [self._sparse_float_shape1,
+               self._sparse_float_shape2], [self._sparse_int_indices1],
+              [self._sparse_int_values1], [self._sparse_int_shape1],
+              learner_config=learner_config.SerializeToString(),
+              apply_dropout=False,
+              apply_averaging=False,
+              center_bias=False,
+              reduce_dim=True))
+
+      # All the examples should get only the bias since the second tree is
+      # non-finalized
+      self.assertAllClose([[-0.4], [-0.4]], result.eval())
+      self.assertAllEqual(result_no_dropout.eval(), result.eval())
+
+      # Empty dropout.
+      self.assertAllEqual([[], []], dropout_info.eval())
+
+  def testIncludeNonFinalTree(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Bias tree.
+      tree1 = tree_ensemble_config.trees.add()
+      tree_ensemble_config.tree_metadata.add().is_finalized = True
+      _append_to_leaf(tree1.nodes.add().leaf, 0, -0.4)
+
+      # Depth 3 tree.
+      tree2 = tree_ensemble_config.trees.add()
+      tree_ensemble_config.tree_metadata.add().is_finalized = False
+      _set_float_split(tree2.nodes.add().dense_float_binary_split, 0, 9.0, 1, 2)
+      _set_float_split(tree2.nodes.add()
+                       .sparse_float_binary_split_default_left.split, 0, -20.0,
+                       3, 4)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, 0.5)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, 1.2)
+      _set_categorical_id_split(tree2.nodes.add().categorical_id_binary_split,
+                                0, 9, 5, 6)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, -0.9)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, 0.7)
+
+      tree_ensemble_config.tree_weights.append(1.0)
+      tree_ensemble_config.tree_weights.append(1.0)
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="full_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 2
+      learner_config.growing_mode = learner_pb2.LearnerConfig.LAYER_BY_LAYER
+
+      result, result_no_dropout, dropout_info = (
+          prediction_ops.gradient_trees_prediction(
+              tree_ensemble_handle,
+              self._seed, [self._dense_float_tensor], [
+                  self._sparse_float_indices1, self._sparse_float_indices2
+              ], [self._sparse_float_values1, self._sparse_float_values2],
+              [self._sparse_float_shape1,
+               self._sparse_float_shape2], [self._sparse_int_indices1],
+              [self._sparse_int_values1], [self._sparse_int_shape1],
+              learner_config=learner_config.SerializeToString(),
+              apply_dropout=False,
+              apply_averaging=False,
+              center_bias=False,
+              reduce_dim=True))
+
+      # The first example will get bias -0.4 from first tree and
+      # leaf 4 payload of -0.9 hence -1.3, the second example will
+      # get the same bias -0.4 and leaf 3 payload (sparse feature missing)
+      # of 1.2 hence 0.8. Note that the non-finalized tree is included.
+      self.assertAllClose([[-1.3], [0.8]], result.eval())
+      self.assertAllEqual(result_no_dropout.eval(), result.eval())
+
+      # Empty dropout.
+      self.assertAllEqual([[], []], dropout_info.eval())
+
+  def testMetadataMissing(self):
+    # Sometimes we want to do prediction on trees that are not added to ensemble
+    # (for example in
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Bias tree.
+      tree1 = tree_ensemble_config.trees.add()
+      _append_to_leaf(tree1.nodes.add().leaf, 0, -0.4)
+
+      # Depth 3 tree.
+      tree2 = tree_ensemble_config.trees.add()
+      # We are not setting the tree_ensemble_config.tree_metadata in this test.
+      _set_float_split(tree2.nodes.add().dense_float_binary_split, 0, 9.0, 1, 2)
+      _set_float_split(tree2.nodes.add()
+                       .sparse_float_binary_split_default_left.split, 0, -20.0,
+                       3, 4)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, 0.5)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, 1.2)
+      _set_categorical_id_split(tree2.nodes.add().categorical_id_binary_split,
+                                0, 9, 5, 6)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, -0.9)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, 0.7)
+
+      tree_ensemble_config.tree_weights.append(1.0)
+      tree_ensemble_config.tree_weights.append(1.0)
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="full_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 2
+
+      result, result_no_dropout, dropout_info = (
+          prediction_ops.gradient_trees_prediction(
+              tree_ensemble_handle,
+              self._seed, [self._dense_float_tensor], [
+                  self._sparse_float_indices1, self._sparse_float_indices2
+              ], [self._sparse_float_values1, self._sparse_float_values2],
+              [self._sparse_float_shape1,
+               self._sparse_float_shape2], [self._sparse_int_indices1],
+              [self._sparse_int_values1], [self._sparse_int_shape1],
+              learner_config=learner_config.SerializeToString(),
+              apply_dropout=False,
+              apply_averaging=False,
+              center_bias=False,
+              reduce_dim=True))
+
+      # The first example will get bias -0.4 from first tree and
+      # leaf 4 payload of -0.9 hence -1.3, the second example will
+      # get the same bias -0.4 and leaf 3 payload (sparse feature missing)
+      # of 1.2 hence 0.8.
+      self.assertAllClose([[-1.3], [0.8]], result.eval())
+      self.assertAllEqual(result_no_dropout.eval(), result.eval())
+
+      # Empty dropout.
+      self.assertAllEqual([[], []], dropout_info.eval())
+
+  # For TREE_PER_CLASS strategy, predictions size is num_classes-1
+  def testFullEnsembleMultiClassTreePerClassStrategy(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Bias tree only for second class.
+      tree1 = tree_ensemble_config.trees.add()
+      tree_ensemble_config.tree_metadata.add().is_finalized = True
+      _append_to_leaf(tree1.nodes.add().leaf, 1, -0.2)
+
+      # Depth 2 tree.
+      tree2 = tree_ensemble_config.trees.add()
+      tree_ensemble_config.tree_metadata.add().is_finalized = True
+      _set_float_split(tree2.nodes.add()
+                       .sparse_float_binary_split_default_right.split, 1, 4.0,
+                       1, 2)
+      _set_float_split(tree2.nodes.add().dense_float_binary_split, 0, 9.0, 3, 4)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, 0.5)
+      _append_to_leaf(tree2.nodes.add().leaf, 1, 1.2)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, -0.9)
+
+      tree_ensemble_config.tree_weights.append(1.0)
+      tree_ensemble_config.tree_weights.append(1.0)
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="ensemble_multi_class")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 3
+      learner_config.multi_class_strategy = (
+          learner_pb2.LearnerConfig.TREE_PER_CLASS)
+
+      result, result_no_dropout, dropout_info = (
+          prediction_ops.gradient_trees_prediction(
+              tree_ensemble_handle,
+              self._seed, [self._dense_float_tensor], [
+                  self._sparse_float_indices1, self._sparse_float_indices2
+              ], [self._sparse_float_values1, self._sparse_float_values2],
+              [self._sparse_float_shape1,
+               self._sparse_float_shape2], [self._sparse_int_indices1],
+              [self._sparse_int_values1], [self._sparse_int_shape1],
+              learner_config=learner_config.SerializeToString(),
+              apply_dropout=False,
+              apply_averaging=False,
+              center_bias=False,
+              reduce_dim=True))
+      # The first example will get bias class 1 -0.2 from first tree and
+      # leaf 2 payload (sparse feature missing) of 0.5 hence [0.5, -0.2],
+      # the second example will get the same bias class 1 -0.2 and leaf 3
+      # payload of class 1 1.2 hence [0.0, 1.0].
+      self.assertAllClose([[0.5, -0.2], [0, 1.0]], result.eval())
+      self.assertAllEqual(result_no_dropout.eval(), result.eval())
+
+      # Empty dropout.
+      self.assertAllEqual([[], []], dropout_info.eval())
+
+  # For tree-per-class multiclass handling strategies, predictions vec
+  # will have the size of the number of classes.
+  # This test is when leafs have SPARSE weights stored (class id and
+  # contribution).
+  def testFullEnsembleMultiNotClassTreePerClassStrategySparseVector(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Bias tree only for second class.
+      tree1 = tree_ensemble_config.trees.add()
+      tree_ensemble_config.tree_metadata.add().is_finalized = True
+      _append_to_leaf(tree1.nodes.add().leaf, 1, -0.2)
+
+      # Depth 2 tree.
+      tree2 = tree_ensemble_config.trees.add()
+      tree_ensemble_config.tree_metadata.add().is_finalized = True
+      _set_float_split(tree2.nodes.add()
+                       .sparse_float_binary_split_default_right.split, 1, 4.0,
+                       1, 2)
+      _set_float_split(tree2.nodes.add().dense_float_binary_split, 0, 9.0, 3, 4)
+      _append_to_leaf(tree2.nodes.add().leaf, 0, 0.5)
+      _append_multi_values_to_leaf(tree2.nodes.add().leaf, [1, 2], [1.2, -0.7])
+      _append_to_leaf(tree2.nodes.add().leaf, 0, -0.9)
+
+      tree_ensemble_config.tree_weights.append(1.0)
+      tree_ensemble_config.tree_weights.append(1.0)
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="ensemble_multi_class")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 3
+      learner_config.multi_class_strategy = (
+          learner_pb2.LearnerConfig.FULL_HESSIAN)
+
+      result, result_no_dropout, dropout_info = (
+          prediction_ops.gradient_trees_prediction(
+              tree_ensemble_handle,
+              self._seed, [self._dense_float_tensor], [
+                  self._sparse_float_indices1, self._sparse_float_indices2
+              ], [self._sparse_float_values1, self._sparse_float_values2],
+              [self._sparse_float_shape1,
+               self._sparse_float_shape2], [self._sparse_int_indices1],
+              [self._sparse_int_values1], [self._sparse_int_shape1],
+              learner_config=learner_config.SerializeToString(),
+              apply_dropout=False,
+              apply_averaging=False,
+              center_bias=False,
+              reduce_dim=False))
+      # The first example will get bias class 1 -0.2 from first tree and
+      # leaf 2 payload (sparse feature missing) of 0.5 hence [0.5, -0.2],
+      # the second example will get the same bias class 1 -0.2 and leaf 3
+      # payload of class 1 1.2 and class 2-0.7 hence [0.0, 1.0, -0.7].
+      self.assertAllClose([[0.5, -0.2, 0.0], [0, 1.0, -0.7]], result.eval())
+      self.assertAllEqual(result_no_dropout.eval(), result.eval())
+
+      # Empty dropout.
+      self.assertAllEqual([[], []], dropout_info.eval())
+
+  # For all non-tree-per class multiclass handling strategies, predictions vec
+  # will have the size of the number of classes.
+  # This test is when leafs have DENSE weights stored (weight for each class)
+  def testFullEnsembleMultiNotClassTreePerClassStrategyDenseVector(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Bias tree only for second class.
+      tree1 = tree_ensemble_config.trees.add()
+      tree_ensemble_config.tree_metadata.add().is_finalized = True
+      _append_multi_values_to_dense_leaf(tree1.nodes.add().leaf, [0, -0.2, -2])
+
+      # Depth 2 tree.
+      tree2 = tree_ensemble_config.trees.add()
+      tree_ensemble_config.tree_metadata.add().is_finalized = True
+      _set_float_split(tree2.nodes.add()
+                       .sparse_float_binary_split_default_right.split, 1, 4.0,
+                       1, 2)
+      _set_float_split(tree2.nodes.add().dense_float_binary_split, 0, 9.0, 3, 4)
+      _append_multi_values_to_dense_leaf(tree2.nodes.add().leaf, [0.5, 0, 0])
+      _append_multi_values_to_dense_leaf(tree2.nodes.add().leaf, [0, 1.2, -0.7])
+      _append_multi_values_to_dense_leaf(tree2.nodes.add().leaf, [-0.9, 0, 0])
+
+      tree_ensemble_config.tree_weights.append(1.0)
+      tree_ensemble_config.tree_weights.append(1.0)
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="ensemble_multi_class")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 3
+      learner_config.multi_class_strategy = (
+          learner_pb2.LearnerConfig.FULL_HESSIAN)
+
+      result, result_no_dropout, dropout_info = (
+          prediction_ops.gradient_trees_prediction(
+              tree_ensemble_handle,
+              self._seed, [self._dense_float_tensor], [
+                  self._sparse_float_indices1, self._sparse_float_indices2
+              ], [self._sparse_float_values1, self._sparse_float_values2],
+              [self._sparse_float_shape1,
+               self._sparse_float_shape2], [self._sparse_int_indices1],
+              [self._sparse_int_values1], [self._sparse_int_shape1],
+              learner_config=learner_config.SerializeToString(),
+              apply_dropout=False,
+              apply_averaging=False,
+              center_bias=False,
+              reduce_dim=False))
+      # The first example will get bias class 1 -0.2  and -2 for class 2 from
+      # first tree and leaf 2 payload (sparse feature missing) of 0.5 hence
+      # 0.5, -0.2], the second example will get the same bias and leaf 3 payload
+      # of class 1 1.2 and class 2-0.7 hence [0.0, 1.0, -2.7].
+      self.assertAllClose([[0.5, -0.2, -2.0], [0, 1.0, -2.7]], result.eval())
+      self.assertAllEqual(result_no_dropout.eval(), result.eval())
+
+      # Empty dropout.
+      self.assertAllEqual([[], []], dropout_info.eval())
+
+  def _get_predictions(self,
+                       tree_ensemble_handle,
+                       learner_config,
+                       apply_dropout=False,
+                       apply_averaging=False,
+                       center_bias=False):
+    return prediction_ops.gradient_trees_prediction(
+        tree_ensemble_handle,
+        self._seed, [self._dense_float_tensor], [
+            self._sparse_float_indices1, self._sparse_float_indices2
+        ], [self._sparse_float_values1, self._sparse_float_values2],
+        [self._sparse_float_shape1,
+         self._sparse_float_shape2], [self._sparse_int_indices1],
+        [self._sparse_int_values1], [self._sparse_int_shape1],
+        learner_config=learner_config.SerializeToString(),
+        apply_dropout=apply_dropout,
+        apply_averaging=apply_averaging,
+        center_bias=center_bias,
+        reduce_dim=True)
+
+  def testDropout(self):
+    with self.test_session():
+      # Empty tree ensenble.
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Add 10 trees with some weights.
+      for i in range(0, 999):
+        tree = tree_ensemble_config.trees.add()
+        tree_ensemble_config.tree_metadata.add().is_finalized = True
+        _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
+        tree_ensemble_config.tree_weights.append(i + 1)
+
+      # Prepare learner/dropout config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.learning_rate_tuner.dropout.dropout_probability = 0.5
+      learner_config.learning_rate_tuner.dropout.learning_rate = 1.0
+      learner_config.num_classes = 2
+
+      # Apply dropout.
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="existing")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      result, result_no_dropout, dropout_info = self._get_predictions(
+          tree_ensemble_handle,
+          learner_config=learner_config,
+          apply_dropout=True,
+          apply_averaging=False,
+          center_bias=False)
+
+      # We expect approx 500 trees were dropped.
+      dropout_info = dropout_info.eval()
+      self.assertIn(dropout_info[0].size, range(400, 601))
+      self.assertEqual(dropout_info[0].size, dropout_info[1].size)
+
+      self.assertEqual(result.eval().size, result_no_dropout.eval().size)
+      for i in range(result.eval().size):
+        self.assertNotEqual(result.eval()[i], result_no_dropout.eval()[i])
+
+      for i in range(dropout_info[0].size):
+        dropped_index = dropout_info[0][i]
+        dropped_weight = dropout_info[1][i]
+        # We constructed the trees so tree number + 1 is the tree weight, so
+        # we can check here the weights for dropped trees.
+        self.assertEqual(dropped_index + 1, dropped_weight)
+
+      # Don't apply dropout.
+      result, result_no_dropout, dropout_info = self._get_predictions(
+          tree_ensemble_handle,
+          learner_config=learner_config,
+          apply_dropout=False,
+          apply_averaging=False,
+          center_bias=False)
+
+      # We expect none of the trees were dropped.
+      self.assertAllEqual([[], []], dropout_info.eval())
+
+      self.assertAllEqual(result.eval(), result_no_dropout.eval())
+
+  def testDropoutCenterBiasNoGrowingMeta(self):
+    # This is for normal non-batch mode where ensemble does not contain the tree
+    # that is being built currently.
+    num_trees = 10
+    with self.test_session():
+      # Empty tree ensemble.
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Add 10 trees with some weights.
+      for i in range(0, num_trees):
+        tree = tree_ensemble_config.trees.add()
+        tree_ensemble_config.tree_metadata.add().is_finalized = True
+        _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
+        tree_ensemble_config.tree_weights.append(i + 1)
+
+      # Prepare learner/dropout config.
+      learner_config = learner_pb2.LearnerConfig()
+      # Drop all the trees.
+      learner_config.learning_rate_tuner.dropout.dropout_probability = 1.0
+      learner_config.learning_rate_tuner.dropout.learning_rate = 1.0
+      learner_config.num_classes = 2
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="existing")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      result, result_no_dropout, dropout_info = self._get_predictions(
+          tree_ensemble_handle,
+          learner_config=learner_config,
+          apply_dropout=True,
+          apply_averaging=False,
+          center_bias=False)
+
+      result_center, result_no_dropout_center, dropout_info_center = (
+          self._get_predictions(
+              tree_ensemble_handle,
+              learner_config=learner_config,
+              apply_dropout=True,
+              apply_averaging=False,
+              center_bias=True))
+
+      dropout_info = dropout_info.eval()
+      dropout_info_center = dropout_info_center.eval()
+
+      # With centering, the bias tree is not dropped.
+      num_dropped = dropout_info[0].size
+      self.assertEqual(num_dropped, num_trees)
+      num_dropped_center = dropout_info_center[0].size
+      self.assertEqual(num_dropped_center, num_trees - 1)
+
+      result = result.eval()
+      result_center = result_center.eval()
+      for i in range(result.size):
+        self.assertNotEqual(result[i], result_center[i])
+
+      # First dropped tree is a bias tree 0.
+      self.assertEqual(0, dropout_info[0][0])
+      # Last dropped tree is the last tree.
+      self.assertEqual(num_trees - 1, dropout_info[0][num_dropped - 1])
+
+      # First dropped tree is a tree 1.
+      self.assertEqual(1, dropout_info_center[0][0])
+      # Last dropped tree is the last tree.
+      self.assertEqual(num_trees - 1, dropout_info_center[0][num_dropped_center
+                                                             - 1])
+
+      self.assertAllEqual(result_no_dropout.eval(),
+                          result_no_dropout_center.eval())
+
+  def testDropoutCenterBiasWithGrowingMeta(self):
+    # This is batch mode where ensemble already contains the tree that we are
+    # building. This tree should never be dropped.
+    num_trees = 10
+    with self.test_session():
+      # Empty tree ensenble.
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Add 10 trees with some weights.
+      for i in range(0, num_trees):
+        tree = tree_ensemble_config.trees.add()
+        tree_ensemble_config.tree_metadata.add().is_finalized = True
+        _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
+        tree_ensemble_config.tree_weights.append(i + 1)
+
+      # Add growing metadata to indicate batch mode.
+      tree_ensemble_config.growing_metadata.num_trees_attempted = num_trees
+      tree_ensemble_config.growing_metadata.num_layers_attempted = num_trees
+
+      # Prepare learner/dropout config.
+      learner_config = learner_pb2.LearnerConfig()
+      # Drop all the trees.
+      learner_config.learning_rate_tuner.dropout.dropout_probability = 1.0
+      learner_config.learning_rate_tuner.dropout.learning_rate = 1.0
+      learner_config.num_classes = 2
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="existing")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      result, result_no_dropout, dropout_info = self._get_predictions(
+          tree_ensemble_handle,
+          learner_config=learner_config,
+          apply_dropout=True,
+          apply_averaging=False,
+          center_bias=False)
+
+      result_center, result_no_dropout_center, dropout_info_center = (
+          self._get_predictions(
+              tree_ensemble_handle,
+              learner_config=learner_config,
+              apply_dropout=True,
+              apply_averaging=False,
+              center_bias=True))
+
+      dropout_info = dropout_info.eval()
+      dropout_info_center = dropout_info_center.eval()
+
+      # Last tree is never dropped, the bias tree can be dropped.
+      num_dropped = dropout_info[0].size
+      self.assertEqual(num_dropped, num_trees - 1)
+      num_dropped_center = dropout_info_center[0].size
+      self.assertEqual(num_dropped_center, num_trees - 2)
+
+      result = result.eval()
+      result_center = result_center.eval()
+      for i in range(result.size):
+        self.assertNotEqual(result[i], result_center[i])
+
+      # First dropped tree is a bias tree 0.
+      self.assertEqual(0, dropout_info[0][0])
+      # Last dropped tree is not the last tree (not tree num_trees-1).
+      self.assertNotEqual(num_trees - 1, dropout_info[0][num_dropped - 1])
+      # First dropped tree is a tree 1.
+      self.assertEqual(1, dropout_info_center[0][0])
+      # Last dropped tree is not the last tree in ensemble.
+      self.assertNotEqual(num_trees - 1,
+                          dropout_info_center[0][num_dropped_center - 1])
+
+      self.assertAllEqual(result_no_dropout.eval(),
+                          result_no_dropout_center.eval())
+
+  def testDropoutSeed(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Add 10 trees with some weights.
+      for i in range(0, 999):
+        tree = tree_ensemble_config.trees.add()
+        tree_ensemble_config.tree_metadata.add().is_finalized = True
+        _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
+        tree_ensemble_config.tree_weights.append(i + 1)
+
+      # Prepare learner/dropout config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.learning_rate_tuner.dropout.dropout_probability = 0.5
+      learner_config.learning_rate_tuner.dropout.learning_rate = 1.0
+      learner_config.num_classes = 2
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="empty")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      _, result_no_dropout_1, dropout_info_1 = (
+          prediction_ops.gradient_trees_prediction(
+              tree_ensemble_handle,
+              self._seed, [self._dense_float_tensor], [
+                  self._sparse_float_indices1, self._sparse_float_indices2
+              ], [self._sparse_float_values1, self._sparse_float_values2],
+              [self._sparse_float_shape1,
+               self._sparse_float_shape2], [self._sparse_int_indices1],
+              [self._sparse_int_values1], [self._sparse_int_shape1],
+              learner_config=learner_config.SerializeToString(),
+              apply_dropout=True,
+              apply_averaging=False,
+              center_bias=False,
+              reduce_dim=True))
+
+      _, result_no_dropout_2, dropout_info_2 = (
+          prediction_ops.gradient_trees_prediction(
+              tree_ensemble_handle,
+              self._seed, [self._dense_float_tensor], [
+                  self._sparse_float_indices1, self._sparse_float_indices2
+              ], [self._sparse_float_values1, self._sparse_float_values2],
+              [self._sparse_float_shape1,
+               self._sparse_float_shape2], [self._sparse_int_indices1],
+              [self._sparse_int_values1], [self._sparse_int_shape1],
+              learner_config=learner_config.SerializeToString(),
+              apply_dropout=True,
+              apply_averaging=False,
+              center_bias=False,
+              reduce_dim=True))
+
+      # Different seed.
+      _, result_no_dropout_3, dropout_info_3 = (
+          prediction_ops.gradient_trees_prediction(
+              tree_ensemble_handle,
+              112314, [self._dense_float_tensor], [
+                  self._sparse_float_indices1, self._sparse_float_indices2
+              ], [self._sparse_float_values1, self._sparse_float_values2],
+              [self._sparse_float_shape1,
+               self._sparse_float_shape2], [self._sparse_int_indices1],
+              [self._sparse_int_values1], [self._sparse_int_shape1],
+              learner_config=learner_config.SerializeToString(),
+              apply_dropout=True,
+              apply_averaging=False,
+              center_bias=False,
+              reduce_dim=True))
+
+      # First seed with centering bias.
+      _, result_no_dropout_4, dropout_info_4 = (
+          prediction_ops.gradient_trees_prediction(
+              tree_ensemble_handle,
+              self._seed, [self._dense_float_tensor], [
+                  self._sparse_float_indices1, self._sparse_float_indices2
+              ], [self._sparse_float_values1, self._sparse_float_values2],
+              [self._sparse_float_shape1,
+               self._sparse_float_shape2], [self._sparse_int_indices1],
+              [self._sparse_int_values1], [self._sparse_int_shape1],
+              learner_config=learner_config.SerializeToString(),
+              apply_dropout=True,
+              apply_averaging=False,
+              center_bias=True,
+              reduce_dim=True))
+
+      # The same seed returns the same results.
+      self.assertAllEqual(dropout_info_1.eval(), dropout_info_2.eval())
+      # Different seeds give diff results.
+      self.assertNotEqual(dropout_info_3.eval().shape,
+                          dropout_info_2.eval().shape)
+      # With centering bias and the same seed does not give the same result.
+      self.assertNotEqual(dropout_info_4.eval(), dropout_info_1.eval())
+      # With centering bias has 1 less tree dropped (bias tree is not dropped).
+      self.assertEqual(
+          len(dropout_info_4.eval()[0]) + 1, len(dropout_info_1.eval()[0]))
+
+      # Predictions without dropout are all the same.
+      result, result_no_dropout, _ = prediction_ops.gradient_trees_prediction(
+          tree_ensemble_handle,
+          self._seed, [self._dense_float_tensor], [
+              self._sparse_float_indices1, self._sparse_float_indices2
+          ], [self._sparse_float_values1, self._sparse_float_values2],
+          [self._sparse_float_shape1,
+           self._sparse_float_shape2], [self._sparse_int_indices1],
+          [self._sparse_int_values1], [self._sparse_int_shape1],
+          learner_config=learner_config.SerializeToString(),
+          apply_dropout=False,
+          apply_averaging=False,
+          center_bias=False,
+          reduce_dim=True)
+
+      self.assertAllCloseAccordingToType(result.eval(),
+                                         result_no_dropout.eval())
+      self.assertAllCloseAccordingToType(result.eval(),
+                                         result_no_dropout_1.eval())
+      self.assertAllCloseAccordingToType(result.eval(),
+                                         result_no_dropout_2.eval())
+      self.assertAllCloseAccordingToType(result.eval(),
+                                         result_no_dropout_3.eval())
+      self.assertAllCloseAccordingToType(result.eval(),
+                                         result_no_dropout_4.eval())
+
+  def testAveragingAllTrees(self):
+    with self.test_session():
+      # Empty tree ensenble.
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      adjusted_tree_ensemble_config = (
+          tree_config_pb2.DecisionTreeEnsembleConfig())
+      # Add 100 trees with some weights.
+      # When averaging is applied, the tree weights will essentially change to
+      # 1, 98/99, 97/99 etc, so lets create the ensemble with such weights.
+      # too
+      total_num = 100
+      for i in range(0, total_num):
+        tree = tree_ensemble_config.trees.add()
+        _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
+
+        tree_ensemble_config.tree_metadata.add().is_finalized = True
+        tree_ensemble_config.tree_weights.append(1.0)
+        # This is how the weight will look after averaging
+        copy_tree = adjusted_tree_ensemble_config.trees.add()
+        _append_to_leaf(copy_tree.nodes.add().leaf, 0, -0.4)
+
+        adjusted_tree_ensemble_config.tree_metadata.add().is_finalized = True
+        adjusted_tree_ensemble_config.tree_weights.append(
+            1.0 * (total_num - i) / total_num)
+
+      # Prepare learner config WITH AVERAGING.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 2
+      learner_config.averaging_config.average_last_percent_trees = 1.0
+
+      # No averaging config.
+      learner_config_no_averaging = learner_pb2.LearnerConfig()
+      learner_config_no_averaging.num_classes = 2
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="existing")
+
+      # This is how our ensemble will "look" during averaging
+      adjusted_tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=adjusted_tree_ensemble_config.SerializeToString(
+          ),
+          name="adjusted")
+
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Do averaging.
+      result, result_no_dropout, dropout_info = self._get_predictions(
+          tree_ensemble_handle, learner_config, apply_averaging=True)
+
+      pattern_result, pattern_result_no_dropout, pattern_dropout_info = (
+          self._get_predictions(
+              adjusted_tree_ensemble_handle,
+              learner_config_no_averaging,
+              apply_averaging=False))
+
+      self.assertAllEqual(result_no_dropout.eval(),
+                          pattern_result_no_dropout.eval())
+      self.assertAllEqual(result.eval(), pattern_result.eval())
+      self.assertAllEqual(dropout_info.eval(), pattern_dropout_info.eval())
+
+  def testAveragingSomeTrees(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      adjusted_tree_ensemble_config = (
+          tree_config_pb2.DecisionTreeEnsembleConfig())
+      # Add 1000 trees with some weights.
+      total_num = 100
+      num_averaged = 25
+      j = 0
+      for i in range(0, total_num):
+        tree = tree_ensemble_config.trees.add()
+        _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
+
+        tree_ensemble_config.tree_metadata.add().is_finalized = True
+        tree_ensemble_config.tree_weights.append(1.0)
+
+        # This is how the weight will look after averaging - we are adjusting
+        # the weights of the last 25 trees
+        copy_tree = adjusted_tree_ensemble_config.trees.add()
+        _append_to_leaf(copy_tree.nodes.add().leaf, 0, -0.4)
+
+        adjusted_tree_ensemble_config.tree_metadata.add().is_finalized = True
+        if i >= 75:
+          adjusted_tree_ensemble_config.tree_weights.append(
+              1.0 * (num_averaged - j) / num_averaged)
+          j += 1
+        else:
+          adjusted_tree_ensemble_config.tree_weights.append(1.0)
+
+      # Prepare learner config WITH AVERAGING.
+      learner_config_1 = learner_pb2.LearnerConfig()
+      learner_config_1.num_classes = 2
+      learner_config_1.averaging_config.average_last_percent_trees = 0.25
+
+      # This is equivalent.
+      learner_config_2 = learner_pb2.LearnerConfig()
+      learner_config_2.num_classes = 2
+      learner_config_2.averaging_config.average_last_n_trees = 25
+
+      # No averaging config.
+      learner_config_no_averaging = learner_pb2.LearnerConfig()
+      learner_config_no_averaging.num_classes = 2
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="existing")
+
+      # This is how our ensemble will "look" during averaging
+      adjusted_tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=adjusted_tree_ensemble_config.SerializeToString(
+          ),
+          name="adjusted")
+
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      result_1, result_no_dropout_1, dropout_info_1 = self._get_predictions(
+          tree_ensemble_handle, learner_config_1, apply_averaging=True)
+
+      result_2, result_no_dropout_2, dropout_info_2 = self._get_predictions(
+          tree_ensemble_handle, learner_config_2, apply_averaging=True)
+
+      pattern_result, pattern_result_no_dropout, pattern_dropout_info = (
+          self._get_predictions(
+              adjusted_tree_ensemble_handle,
+              learner_config_no_averaging,
+              apply_averaging=False))
+
+      self.assertAllEqual(result_no_dropout_1.eval(),
+                          pattern_result_no_dropout.eval())
+      self.assertAllEqual(result_no_dropout_2.eval(),
+                          pattern_result_no_dropout.eval())
+
+      self.assertAllEqual(result_1.eval(), pattern_result.eval())
+      self.assertAllEqual(result_2.eval(), pattern_result.eval())
+
+      self.assertAllEqual(dropout_info_1.eval(), pattern_dropout_info.eval())
+      self.assertAllEqual(dropout_info_2.eval(), pattern_dropout_info.eval())
+
+  def testAverageMoreThanNumTreesExist(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      adjusted_tree_ensemble_config = (
+          tree_config_pb2.DecisionTreeEnsembleConfig())
+      # When we say to average over more trees than possible, it is averaging
+      # across all trees.
+      total_num = 100
+      for i in range(0, total_num):
+        tree = tree_ensemble_config.trees.add()
+        _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
+
+        tree_ensemble_config.tree_metadata.add().is_finalized = True
+        tree_ensemble_config.tree_weights.append(1.0)
+        # This is how the weight will look after averaging
+        copy_tree = adjusted_tree_ensemble_config.trees.add()
+        _append_to_leaf(copy_tree.nodes.add().leaf, 0, -0.4)
+
+        adjusted_tree_ensemble_config.tree_metadata.add().is_finalized = True
+        adjusted_tree_ensemble_config.tree_weights.append(
+            1.0 * (total_num - i) / total_num)
+
+      # Prepare learner config WITH AVERAGING.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 2
+      # We have only 100 trees but we ask to average over 250.
+      learner_config.averaging_config.average_last_n_trees = 250
+
+      # No averaging config.
+      learner_config_no_averaging = learner_pb2.LearnerConfig()
+      learner_config_no_averaging.num_classes = 2
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="existing")
+
+      # This is how our ensemble will "look" during averaging
+      adjusted_tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=adjusted_tree_ensemble_config.SerializeToString(
+          ),
+          name="adjusted")
+
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      result, result_no_dropout, dropout_info = self._get_predictions(
+          tree_ensemble_handle, learner_config, apply_averaging=True)
+
+      pattern_result, pattern_result_no_dropout, pattern_dropout_info = (
+          self._get_predictions(
+              adjusted_tree_ensemble_handle,
+              learner_config_no_averaging,
+              apply_averaging=False))
+
+      self.assertAllEqual(result_no_dropout.eval(),
+                          pattern_result_no_dropout.eval())
+      self.assertAllEqual(result.eval(), pattern_result.eval())
+      self.assertAllEqual(dropout_info.eval(), pattern_dropout_info.eval())
+
+
+class PartitionExamplesOpsTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    """Sets up the prediction tests.
+
+    Create a batch of two examples having one dense float, two sparse float and
+    one sparse int features.
+    The data looks like the following:
+    | Instance | Dense0 | SparseF0 | SparseF1 | SparseI0 |
+    | 0        |  7     |    -3    |          |    9,1   |
+    | 1        | -2     |          | 4        |          |
+    """
+    super(PartitionExamplesOpsTest, self).setUp()
+    self._dense_float_tensor = np.array([[7.0], [-2.0]])
+    self._sparse_float_indices1 = np.array([[0, 0]])
+    self._sparse_float_values1 = np.array([-3.0])
+    self._sparse_float_shape1 = np.array([2, 1])
+    self._sparse_float_indices2 = np.array([[1, 0]])
+    self._sparse_float_values2 = np.array([4.0])
+    self._sparse_float_shape2 = np.array([2, 1])
+    self._sparse_int_indices1 = np.array([[0, 0], [0, 1]])
+    self._sparse_int_values1 = np.array([9, 1])
+    self._sparse_int_shape1 = np.array([2, 2])
+
+  def testEnsembleEmpty(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="full_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 2
+
+      result = prediction_ops.gradient_trees_partition_examples(
+          tree_ensemble_handle, [self._dense_float_tensor], [
+              self._sparse_float_indices1, self._sparse_float_indices2
+          ], [self._sparse_float_values1, self._sparse_float_values2],
+          [self._sparse_float_shape1,
+           self._sparse_float_shape2], [self._sparse_int_indices1],
+          [self._sparse_int_values1], [self._sparse_int_shape1])
+
+      self.assertAllEqual([0, 0], result.eval())
+
+  def testTreeNonFinalized(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Depth 3 tree.
+      tree1 = tree_ensemble_config.trees.add()
+      _set_float_split(tree1.nodes.add().dense_float_binary_split, 0, 9.0, 1, 2)
+      _set_float_split(tree1.nodes.add()
+                       .sparse_float_binary_split_default_left.split, 0, -20.0,
+                       3, 4)
+      _append_to_leaf(tree1.nodes.add().leaf, 0, 0.2)
+      _append_to_leaf(tree1.nodes.add().leaf, 0, 0.3)
+      _set_categorical_id_split(tree1.nodes.add().categorical_id_binary_split,
+                                0, 9, 5, 6)
+      _append_to_leaf(tree1.nodes.add().leaf, 0, 0.5)
+      _append_to_leaf(tree1.nodes.add().leaf, 0, 0.6)
+
+      tree_ensemble_config.tree_weights.append(1.0)
+      tree_ensemble_config.tree_metadata.add().is_finalized = False
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="full_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 2
+
+      result = prediction_ops.gradient_trees_partition_examples(
+          tree_ensemble_handle, [self._dense_float_tensor], [
+              self._sparse_float_indices1, self._sparse_float_indices2
+          ], [self._sparse_float_values1, self._sparse_float_values2],
+          [self._sparse_float_shape1,
+           self._sparse_float_shape2], [self._sparse_int_indices1],
+          [self._sparse_int_values1], [self._sparse_int_shape1])
+
+      self.assertAllEqual([5, 3], result.eval())
+
+  def testTreeFinalized(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Depth 3 tree.
+      tree1 = tree_ensemble_config.trees.add()
+      _set_float_split(tree1.nodes.add().dense_float_binary_split, 0, 9.0, 1, 2)
+      _set_float_split(tree1.nodes.add()
+                       .sparse_float_binary_split_default_left.split, 0, -20.0,
+                       3, 4)
+      _append_to_leaf(tree1.nodes.add().leaf, 0, 0.2)
+      _append_to_leaf(tree1.nodes.add().leaf, 0, 0.3)
+      _set_categorical_id_split(tree1.nodes.add().categorical_id_binary_split,
+                                0, 9, 5, 6)
+      _append_to_leaf(tree1.nodes.add().leaf, 0, 0.5)
+      _append_to_leaf(tree1.nodes.add().leaf, 0, 0.6)
+
+      tree_ensemble_config.tree_weights.append(1.0)
+      tree_ensemble_config.tree_metadata.add().is_finalized = True
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="full_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 2
+
+      result = prediction_ops.gradient_trees_partition_examples(
+          tree_ensemble_handle, [self._dense_float_tensor], [
+              self._sparse_float_indices1, self._sparse_float_indices2
+          ], [self._sparse_float_values1, self._sparse_float_values2],
+          [self._sparse_float_shape1,
+           self._sparse_float_shape2], [self._sparse_int_indices1],
+          [self._sparse_int_values1], [self._sparse_int_shape1])
+
+      self.assertAllEqual([0, 0], result.eval())
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3871e8d76dfa09fc9c82b1e8b910472f6ade9a53
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
@@ -0,0 +1,398 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test for checking quantile related ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import os
+import tempfile
+
+import numpy as np
+
+from tensorflow.contrib.boosted_trees.proto.quantiles_pb2 import QuantileConfig
+from tensorflow.contrib.boosted_trees.python.ops import quantile_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import googletest
+from tensorflow.python.training import saver
+
+
+class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
+
+  def _gen_config(self, eps, num_quantiles):
+    config = QuantileConfig()
+    config.eps = eps
+    config.num_quantiles = num_quantiles
+    return config.SerializeToString()
+
+  def testBasicQuantileBuckets(self):
+    """Sets up the quantile summary op test as follows.
+
+    Create a batch of 6 examples having a dense and sparse features.
+    The data looks like this
+    | Instance | instance weights | Dense 0  | Sparse 0
+    | 0        |     10           |   1      |
+    | 1        |     1            |   2      |    2
+    | 2        |     1            |   3      |    3
+    | 3        |     1            |   4      |    4
+    | 4        |     1            |   4      |    5
+    | 5        |     1            |   5      |    6
+    """
+
+    dense_float_tensor_0 = constant_op.constant(
+        [1, 2, 3, 4, 4, 5], dtype=dtypes.float32)
+    sparse_indices_0 = constant_op.constant(
+        [[1, 0], [2, 0], [3, 0], [4, 0], [5, 0]], dtype=dtypes.int64)
+    sparse_values_0 = constant_op.constant(
+        [2, 3, 4, 5, 6], dtype=dtypes.float32)
+    sparse_shape_0 = constant_op.constant([6, 1], dtype=dtypes.int64)
+    example_weights = constant_op.constant(
+        [10, 1, 1, 1, 1, 1], dtype=dtypes.float32)
+
+    with self.test_session():
+      config = self._gen_config(0.33, 3)
+      dense_buckets, sparse_buckets = quantile_ops.quantile_buckets(
+          [dense_float_tensor_0], [sparse_indices_0], [sparse_values_0],
+          [sparse_shape_0],
+          example_weights=example_weights,
+          dense_config=[config],
+          sparse_config=[config])
+
+      self.assertAllEqual([1, 3, 5], dense_buckets[0].eval())
+      self.assertAllEqual([2, 4, 6.], sparse_buckets[0].eval())
+
+  def testStreamingQuantileBuckets(self):
+    """Sets up the quantile summary op test as follows.
+
+    100 batches of data is added to the accumulator. The batches are in form:
+    [0 1 .. 99]
+    [100 101 .. 200]
+    ...
+    [9900 9901 .. 9999]
+    All the batches have 1 for all the example weights.
+    """
+    with self.test_session() as sess:
+      accumulator = quantile_ops.QuantileAccumulator(
+          init_stamp_token=0, num_quantiles=3, epsilon=0.01, name="q1")
+      resources.initialize_resources(resources.shared_resources()).run()
+    weight_placeholder = array_ops.placeholder(dtypes.float32)
+    dense_placeholder = array_ops.placeholder(dtypes.float32)
+    update = accumulator.add_summary(
+        stamp_token=0,
+        column=dense_placeholder,
+        example_weights=weight_placeholder)
+    with self.test_session() as sess:
+      for i in range(100):
+        dense_float = np.linspace(
+            i * 100, (i + 1) * 100 - 1, num=100).reshape(-1, 1)
+        sess.run(update, {
+            dense_placeholder: dense_float,
+            weight_placeholder: np.ones(shape=(100, 1), dtype=np.float32)
+        })
+
+    with self.test_session() as sess:
+      sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
+      are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
+      buckets, are_ready_flush = (sess.run([buckets, are_ready_flush]))
+      self.assertEqual(True, are_ready_flush)
+      self.assertAllEqual([0, 3335., 6671., 9999.], buckets)
+
+  def testStreamingQuantileBucketsTwoLevel(self):
+    """Sets up the quantile summary op test as follows.
+
+    100 batches of data is added to the accumulator. The batches are in form:
+    [0 1 .. 99]
+    [100 101 .. 200]
+    ...
+    [9900 9901 .. 9999]
+    All the batches have 1 for all the example weights.
+    """
+    with self.test_session() as sess:
+      accumulator = quantile_ops.QuantileAccumulator(
+          init_stamp_token=0, num_quantiles=3, epsilon=0.01, name="q1")
+      accumulator_2 = quantile_ops.QuantileAccumulator(
+          init_stamp_token=0, num_quantiles=3, epsilon=0.01, name="q2")
+      resources.initialize_resources(resources.shared_resources()).run()
+    weight_placeholder = array_ops.placeholder(dtypes.float32)
+    dense_placeholder = array_ops.placeholder(dtypes.float32)
+    update = accumulator.add_summary(
+        stamp_token=0,
+        column=dense_placeholder,
+        example_weights=weight_placeholder)
+    with self.test_session() as sess:
+      for i in range(100):
+        dense_float = np.linspace(
+            i * 100, (i + 1) * 100 - 1, num=100).reshape(-1, 1)
+        sess.run(update, {
+            dense_placeholder: dense_float,
+            weight_placeholder: np.ones(shape=(100, 1), dtype=np.float32)
+        })
+
+    with self.test_session() as sess:
+      summary = sess.run(
+          accumulator.flush_summary(stamp_token=0, next_stamp_token=1))
+      sess.run(
+          accumulator_2.add_prebuilt_summary(
+              stamp_token=0, summary=constant_op.constant(summary)))
+      sess.run(accumulator_2.flush(stamp_token=0, next_stamp_token=1))
+      are_ready_flush, buckets = (accumulator_2.get_buckets(stamp_token=1))
+      buckets, are_ready_flush = (sess.run([buckets, are_ready_flush]))
+      self.assertEqual(True, are_ready_flush)
+      self.assertAllEqual([0, 3337., 6677., 9999.], buckets)
+
+  def testSaveRestoreBeforeFlush(self):
+    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
+    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      accumulator = quantile_ops.QuantileAccumulator(
+          init_stamp_token=0, num_quantiles=3, epsilon=0.33, name="q0")
+
+      save = saver.Saver()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      sparse_indices_0 = constant_op.constant(
+          [[1, 0], [2, 0], [3, 0], [4, 0], [5, 0]], dtype=dtypes.int64)
+      sparse_values_0 = constant_op.constant(
+          [2.0, 3.0, 4.0, 5.0, 6.0], dtype=dtypes.float32)
+      sparse_shape_0 = constant_op.constant([6, 1], dtype=dtypes.int64)
+      example_weights = constant_op.constant(
+          [10, 1, 1, 1, 1, 1], dtype=dtypes.float32, shape=[6, 1])
+      update = accumulator.add_summary(
+          stamp_token=0,
+          column=sparse_tensor.SparseTensor(sparse_indices_0, sparse_values_0,
+                                            sparse_shape_0),
+          example_weights=example_weights)
+      update.run()
+      save.save(sess, save_path)
+      reset = accumulator.flush(stamp_token=0, next_stamp_token=1)
+      with ops.control_dependencies([reset]):
+        are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
+      buckets, are_ready_flush = (sess.run([buckets, are_ready_flush]))
+      self.assertEqual(True, are_ready_flush)
+      self.assertAllEqual([2, 4, 6.], buckets)
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      accumulator = quantile_ops.QuantileAccumulator(
+          init_stamp_token=0, num_quantiles=3, epsilon=0.33, name="q0")
+      save = saver.Saver()
+
+      # Restore the saved values in the parameter nodes.
+      save.restore(sess, save_path)
+      are_ready_noflush = accumulator.get_buckets(stamp_token=0)[0]
+      with ops.control_dependencies([are_ready_noflush]):
+        reset = accumulator.flush(stamp_token=0, next_stamp_token=1)
+
+      with ops.control_dependencies([reset]):
+        are_ready_flush, buckets = accumulator.get_buckets(stamp_token=1)
+      buckets, are_ready_flush, are_ready_noflush = (sess.run(
+          [buckets, are_ready_flush, are_ready_noflush]))
+      self.assertFalse(are_ready_noflush)
+      self.assertTrue(are_ready_flush)
+      self.assertAllEqual([2, 4, 6.], buckets)
+
+  def testSaveRestoreAfterFlush(self):
+    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
+    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      accumulator = quantile_ops.QuantileAccumulator(
+          init_stamp_token=0, num_quantiles=3, epsilon=0.33, name="q0")
+
+      save = saver.Saver()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      example_weights = constant_op.constant(
+          [10, 1, 1, 1, 1, 1], dtype=dtypes.float32, shape=[6, 1])
+      dense_float_tensor_0 = constant_op.constant(
+          [1, 2, 3, 4, 4, 5], dtype=dtypes.float32, shape=[6, 1])
+      update = accumulator.add_summary(
+          stamp_token=0,
+          column=dense_float_tensor_0,
+          example_weights=example_weights)
+      update.run()
+      reset = accumulator.flush(stamp_token=0, next_stamp_token=1)
+      with ops.control_dependencies([reset]):
+        are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
+      buckets, are_ready_flush = (sess.run([buckets, are_ready_flush]))
+      self.assertEqual(True, are_ready_flush)
+      self.assertAllEqual([1, 3, 5], buckets)
+      save.save(sess, save_path)
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      accumulator = quantile_ops.QuantileAccumulator(
+          init_stamp_token=0, num_quantiles=3, epsilon=0.33, name="q0")
+      save = saver.Saver()
+
+      # Restore the saved values in the parameter nodes.
+      save.restore(sess, save_path)
+      are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
+      buckets, are_ready_flush = (sess.run([buckets, are_ready_flush]))
+      self.assertEqual(True, are_ready_flush)
+      self.assertAllEqual([1, 3, 5], buckets)
+
+  def testFixedUniform(self):
+    """Sets up the quantile summary op test as follows.
+
+    Creates array dividing range [0, 1] to 1<<16 elements equally spaced
+    with weight of 1.0.
+    """
+    dense_float_tensor_0 = constant_op.constant(
+        [(1.0 * i) / math.pow(2.0, 16)
+         for i in range(0, int(math.pow(2, 16)) + 1)])
+    example_weights = constant_op.constant(
+        [1] * (int(math.pow(2, 16)) + 1), dtype=dtypes.float32)
+    config = self._gen_config(0.1, 10)
+
+    with self.test_session():
+      dense_buckets, _ = quantile_ops.quantile_buckets(
+          [dense_float_tensor_0], [], [], [],
+          example_weights=example_weights,
+          dense_config=[config],
+          sparse_config=[])
+      self.assertAllClose(
+          [0] + [(i + 1.0) / 10 for i in range(0, 10)],
+          dense_buckets[0].eval(),
+          atol=0.1)
+
+  def testFixedNonUniform(self):
+    """Sets up the quantile summary op test as follows.
+
+    Creates array dividing range [0, 1] to 1<<16 elements equally spaced
+    with weight same as the value.
+    """
+    dense_float_tensor_0 = constant_op.constant(
+        [(1.0 * i) / math.pow(2.0, 16)
+         for i in range(0, int(math.pow(2, 16)) + 1)])
+    example_weights = constant_op.constant(
+        [(1.0 * i) / math.pow(2.0, 16)
+         for i in range(0, int(math.pow(2, 16)) + 1)])
+
+    config = self._gen_config(0.1, 10)
+
+    with self.test_session():
+      dense_buckets, _ = quantile_ops.quantile_buckets(
+          [dense_float_tensor_0], [], [], [],
+          example_weights=example_weights,
+          dense_config=[config],
+          sparse_config=[])
+      self.assertAllClose(
+          [0] + [math.sqrt((i + 1.0) / 10) for i in range(0, 10)],
+          dense_buckets[0].eval(),
+          atol=0.1)
+
+
+class QuantilesOpTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    """Sets up the quantile op tests.
+
+    Create a batch of 4 examples having 2 dense and 3 sparse features.
+    The data looks like this
+    | Instance | Dense 0 | Dense 1 | Sparse 0 | Sparse 1 | Sparse 2
+    | 0        |   -0.1  |  -1     |   -2     |   0.1    |
+    | 1        |    0.4  |  -15    |   5.5    |          |   2
+    | 2        |    3.2  |  18     |   16     |   3      |
+    | 3        |    190  |  1000   |   17.5   |  -3      |   4
+    Quantiles are:
+    Dense 0: (-inf,0.4], (0.4,5], (5, 190]
+    Dense 1: (-inf, -9], (-9,15], (15, 1000)
+    Sparse 0: (-inf, 5], (5,16], (16, 100]
+    Sparse 1: (-inf, 2], (2, 5]
+    Sparse 2: (-inf, 100]
+    """
+    super(QuantilesOpTest, self).setUp()
+    self._dense_float_tensor_0 = constant_op.constant(
+        [[-0.1], [0.4], [3.2], [190]], dtype=dtypes.float32)
+    self._dense_float_tensor_1 = constant_op.constant(
+        [[-1], [-15], [18], [1000]], dtype=dtypes.float32)
+    # Sparse feature 0
+    self._sparse_indices_0 = constant_op.constant([[0, 0], [1, 0], [2, 0],
+                                                   [3, 0]])
+    self._sparse_values_0 = constant_op.constant([-2, 5.5, 16, 17.5])
+    self._sparse_shape_0 = constant_op.constant([4, 1])
+    # Sprase feature 1
+    self._sparse_indices_1 = constant_op.constant([[0, 0], [2, 0], [3, 0]])
+    self._sparse_values_1 = constant_op.constant([0.1, 3, -3])
+    self._sparse_shape_1 = constant_op.constant([4, 1])
+    # Sprase feature 2
+    self._sparse_indices_2 = constant_op.constant([[1, 0], [3, 0]])
+    self._sparse_values_2 = constant_op.constant([2, 4], dtype=dtypes.float32)
+    self._sparse_shape_2 = constant_op.constant([4, 1])
+    # Quantiles
+    self._dense_thresholds_0 = [0.4, 5, 190]
+    self._dense_thresholds_1 = [-9, 15, 1000]
+
+    self._sparse_thresholds_0 = [5, 16, 100]
+    self._sparse_thresholds_1 = [2, 5]
+    self._sparse_thresholds_2 = [100]
+
+  def testDenseFeaturesOnly(self):
+    with self.test_session():
+      dense_quantiles, _ = quantile_ops.quantiles(
+          [self._dense_float_tensor_0, self._dense_float_tensor_1], [],
+          [self._dense_thresholds_0, self._dense_thresholds_1], [])
+
+      # Dense feature 0
+      self.assertAllEqual([0, 0, 1, 2], dense_quantiles[0].eval())
+      # Dense feature 1
+      self.assertAllEqual([1, 0, 2, 2], dense_quantiles[1].eval())
+
+  def testSparseFeaturesOnly(self):
+    with self.test_session():
+      _, sparse_quantiles = quantile_ops.quantiles(
+          [],
+          [self._sparse_values_0, self._sparse_values_1, self._sparse_values_2],
+          [], [self._sparse_thresholds_0, self._sparse_thresholds_1,
+               self._sparse_thresholds_2])
+
+      # Sparse feature 0
+      self.assertAllEqual([0, 1, 1, 2], sparse_quantiles[0].eval())
+      # Sparse feature 1
+      self.assertAllEqual([0, 1, 0], sparse_quantiles[1].eval())
+      # Sparse feature 2
+      self.assertAllEqual([0, 0], sparse_quantiles[2].eval())
+
+  def testDenseAndSparseFeatures(self):
+    with self.test_session():
+      dense_quantiles, sparse_quantiles = quantile_ops.quantiles(
+          [self._dense_float_tensor_0, self._dense_float_tensor_1],
+          [self._sparse_values_0, self._sparse_values_1, self._sparse_values_2],
+          [self._dense_thresholds_0, self._dense_thresholds_1],
+          [self._sparse_thresholds_0, self._sparse_thresholds_1,
+           self._sparse_thresholds_2])
+
+      # Dense feature 0
+      self.assertAllEqual([0, 0, 1, 2], dense_quantiles[0].eval())
+      # Dense feature 1
+      self.assertAllEqual([1, 0, 2, 2], dense_quantiles[1].eval())
+      # Sparse feature 0
+      self.assertAllEqual([0, 1, 1, 2], sparse_quantiles[0].eval())
+      # Sparse feature 1
+      self.assertAllEqual([0, 1, 0], sparse_quantiles[1].eval())
+      # Sparse feature 2
+      self.assertAllEqual([0, 0], sparse_quantiles[2].eval())
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..edf088b5fa28d3e465d4e3d8ea7cf6745d48a91f
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py
@@ -0,0 +1,475 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the GTFlow split handler Ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.boosted_trees.proto import learner_pb2
+from tensorflow.contrib.boosted_trees.proto import split_info_pb2
+from tensorflow.contrib.boosted_trees.python.ops import split_handler_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import googletest
+
+
+class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
+
+  def testMakeDenseSplit(self):
+    """Tests split handler op."""
+    with self.test_session() as sess:
+      # The data looks like the following after dividing by number of steps (2).
+      # Gradients    | Partition | Dense Quantile |
+      # (1.2, 0.2)   | 0         | 0              |
+      # (-0.3, 0.19) | 0         | 1              |
+      # (4.0, 0.13)  | 1         | 1              |
+      partition_ids = array_ops.constant([0, 0, 1], dtype=dtypes.int32)
+      bucket_ids = array_ops.constant([0, 1, 1], dtype=dtypes.int64)
+      gradients = array_ops.constant([2.4, -0.6, 8.0])
+      hessians = array_ops.constant([0.4, 0.38, 0.26])
+      bucket_boundaries = [0.3, 0.52]
+      partitions, gains, splits = (
+          split_handler_ops.build_dense_inequality_splits(
+              num_minibatches=2,
+              partition_ids=partition_ids,
+              bucket_ids=bucket_ids,
+              gradients=gradients,
+              hessians=hessians,
+              bucket_boundaries=bucket_boundaries,
+              l1_regularization=0.1,
+              l2_regularization=1,
+              tree_complexity_regularization=0,
+              min_node_weight=0,
+              class_id=-1,
+              feature_column_group_id=0,
+              multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
+      partitions, gains, splits = sess.run([partitions, gains, splits])
+    self.assertAllEqual([0, 1], partitions)
+
+    # Check the split on partition 0.
+    # -(1.2 - 0.1) / (0.2 + 1)
+    expected_left_weight = -0.91666
+
+    # expected_left_weight * -(1.2 - 0.1)
+    expected_left_gain = 1.0083333333333331
+
+    # (-0.3 + 0.1) / (0.19 + 1)
+    expected_right_weight = 0.1680672
+
+    # expected_right_weight * -(-0.3 + 0.1)
+    expected_right_gain = 0.033613445378151252
+
+    # (-0.3 + 1.2 - 0.1) ** 2 / (0.19 + 0.2 + 1)
+    expected_bias_gain = 0.46043165467625885
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.dense_float_binary_split
+    self.assertAllClose(
+        expected_left_gain + expected_right_gain - expected_bias_gain, gains[0],
+        0.00001)
+    self.assertAllClose([expected_left_weight], left_child.value, 0.00001)
+    self.assertAllClose([expected_right_weight], right_child.value, 0.00001)
+    self.assertEqual(0, split_node.feature_column)
+    self.assertAllClose(0.3, split_node.threshold, 0.00001)
+
+    # Check the split on partition 1.
+    # (-4 + 0.1) / (0.13 + 1)
+    expected_left_weight = -3.4513274336283186
+    expected_right_weight = 0
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[1])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.dense_float_binary_split
+    # There's only one active bucket here so zero gain is expected.
+    self.assertAllClose(0.0, gains[1], 0.00001)
+    self.assertAllClose([expected_left_weight], left_child.value, 0.00001)
+    self.assertAllClose([expected_right_weight], right_child.value, 0.00001)
+    self.assertEqual(0, split_node.feature_column)
+    self.assertAllClose(0.52, split_node.threshold, 0.00001)
+
+  def testMakeMulticlassDenseSplit(self):
+    """Tests split handler op."""
+    with self.test_session() as sess:
+      partition_ids = array_ops.constant([0, 0, 1], dtype=dtypes.int32)
+      bucket_ids = array_ops.constant([0, 1, 1], dtype=dtypes.int64)
+      gradients = array_ops.constant([[2.4, 3.0], [-0.6, 0.1], [8.0, 1.0]])
+      hessians = array_ops.constant([[[0.4, 1], [1, 1]], [[0.38, 1], [1, 1]],
+                                     [[0.26, 1], [1, 1]]])
+      bucket_boundaries = [0.3, 0.52]
+      partitions, gains, splits = (
+          split_handler_ops.build_dense_inequality_splits(
+              num_minibatches=2,
+              partition_ids=partition_ids,
+              bucket_ids=bucket_ids,
+              gradients=gradients,
+              hessians=hessians,
+              bucket_boundaries=bucket_boundaries,
+              l1_regularization=0,
+              l2_regularization=1,
+              tree_complexity_regularization=0,
+              min_node_weight=0,
+              class_id=-1,
+              feature_column_group_id=0,
+              multiclass_strategy=learner_pb2.LearnerConfig.FULL_HESSIAN))
+      partitions, gains, splits = sess.run([partitions, gains, splits])
+    self.assertAllEqual([0, 1], partitions)
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.dense_float_binary_split
+
+    # Each leaf has 2 element vector.
+    self.assertEqual(2, len(left_child.value))
+    self.assertEqual(2, len(right_child.value))
+    self.assertEqual(0, split_node.feature_column)
+    self.assertAllClose(0.3, split_node.threshold, 1e-6)
+
+  def testMakeDenseSplitEmptyInputs(self):
+    """Tests empty inputs op."""
+    with self.test_session() as sess:
+      partition_ids = array_ops.constant([], dtype=dtypes.int32)
+      bucket_ids = array_ops.constant([], dtype=dtypes.int64)
+      gradients = array_ops.constant([])
+      hessians = array_ops.constant([])
+      bucket_boundaries = [0.3, 0.52]
+      partitions, gains, splits = (
+          split_handler_ops.build_dense_inequality_splits(
+              num_minibatches=0,
+              partition_ids=partition_ids,
+              bucket_ids=bucket_ids,
+              gradients=gradients,
+              hessians=hessians,
+              bucket_boundaries=bucket_boundaries,
+              l1_regularization=0.1,
+              l2_regularization=1,
+              tree_complexity_regularization=0,
+              min_node_weight=0,
+              class_id=-1,
+              feature_column_group_id=0,
+              multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
+      partitions, gains, splits = sess.run([partitions, gains, splits])
+    # .assertEmpty doesn't exist on ubuntu-contrib
+    self.assertEqual(0, len(partitions))
+    self.assertEqual(0, len(gains))
+    self.assertEqual(0, len(splits))
+
+  def testMakeSparseSplit(self):
+    """Tests split handler op."""
+    with self.test_session() as sess:
+      # The data looks like the following after dividing by number of steps (2).
+      # Gradients    | Partition | bucket ID       |
+      # (0.9, 0.39)  | 0         | -1              |
+      # (1.2, 0.2)   | 0         | 0               |
+      # (0.2, 0.12)  | 0         | 1               |
+      # (4.0, 0.13)  | 1         | -1              |
+      # (4.0, 0.13)  | 1         | 1               |
+      partition_ids = array_ops.constant([0, 0, 0, 1, 1], dtype=dtypes.int32)
+      bucket_ids = array_ops.constant([-1, 0, 1, -1, 1], dtype=dtypes.int64)
+      gradients = array_ops.constant([1.8, 2.4, 0.4, 8.0, 8.0])
+      hessians = array_ops.constant([0.78, 0.4, 0.24, 0.26, 0.26])
+      bucket_boundaries = array_ops.constant([0.3, 0.52])
+      partitions, gains, splits = (
+          split_handler_ops.build_sparse_inequality_splits(
+              num_minibatches=2,
+              partition_ids=partition_ids,
+              bucket_ids=bucket_ids,
+              gradients=gradients,
+              hessians=hessians,
+              bucket_boundaries=bucket_boundaries,
+              l1_regularization=0,
+              l2_regularization=2,
+              tree_complexity_regularization=0,
+              min_node_weight=0,
+              feature_column_group_id=0,
+              bias_feature_id=-1,
+              class_id=-1,
+              multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
+      partitions, gains, splits = (sess.run([partitions, gains, splits]))
+    self.assertAllEqual([0, 1], partitions)
+    # Check the split on partition 0.
+    # -(0.2 + 1.2) / (0.12 + 0.2 + 2)
+    expected_left_weight = -0.603448275862069
+    # (0.2 + 1.2) ** 2 / (0.12 + 0.2 + 2)
+    expected_left_gain = 0.8448275862068965
+    # 0.5 / (0.07 + 2)
+    expected_right_weight = 0.24154589371980678
+    # 0.5 ** 2 / (0.07 + 2)
+    expected_right_gain = 0.12077294685990339
+    # (0.2 + 1.2 - 0.5) ** 2 /  (0.12 + 0.2 + 0.07 + 2)
+    expected_bias_gain = 0.3389121338912133
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.sparse_float_binary_split_default_right
+    self.assertAllClose(
+        expected_left_gain + expected_right_gain - expected_bias_gain, gains[0])
+
+    self.assertAllClose([expected_left_weight], left_child.value)
+
+    self.assertAllClose([expected_right_weight], right_child.value)
+
+    self.assertEqual(0, split_node.split.feature_column)
+
+    self.assertAllClose(0.52, split_node.split.threshold)
+
+    # Check the split on partition 1.
+    expected_left_weight = -1.8779342723004695
+    expected_right_weight = 0
+
+    # Verify candidate for partition 1, there's only one active bucket here
+    # so zero gain is expected.
+    split_info.ParseFromString(splits[1])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.sparse_float_binary_split_default_left
+
+    self.assertAllClose(0.0, gains[1])
+
+    self.assertAllClose([expected_left_weight], left_child.value)
+
+    self.assertAllClose([expected_right_weight], right_child.value)
+
+    self.assertEqual(0, split_node.split.feature_column)
+
+    self.assertAllClose(0.52, split_node.split.threshold)
+
+  def testMakeMulticlassSparseSplit(self):
+    """Tests split handler op."""
+    with self.test_session() as sess:
+      partition_ids = array_ops.constant([0, 0, 0, 1, 1], dtype=dtypes.int32)
+    bucket_ids = array_ops.constant([-1, 0, 1, -1, 1], dtype=dtypes.int64)
+    gradients = array_ops.constant([[1.8, 3.5], [2.4, 1.0], [0.4, 4.0],
+                                    [8.0, 3.1], [8.0, 0.8]])
+
+    hessian_0 = [[0.78, 1], [12, 1]]
+    hessian_1 = [[0.4, 1], [1, 1]]
+    hessian_2 = [[0.24, 1], [1, 1]]
+    hessian_3 = [[0.26, 1], [1, 1]]
+    hessian_4 = [[0.26, 1], [1, 1]]
+
+    hessians = array_ops.constant(
+        [hessian_0, hessian_1, hessian_2, hessian_3, hessian_4])
+    bucket_boundaries = array_ops.constant([0.3, 0.52])
+    partitions, gains, splits = (
+        split_handler_ops.build_sparse_inequality_splits(
+            num_minibatches=2,
+            partition_ids=partition_ids,
+            bucket_ids=bucket_ids,
+            gradients=gradients,
+            hessians=hessians,
+            bucket_boundaries=bucket_boundaries,
+            l1_regularization=0,
+            l2_regularization=2,
+            tree_complexity_regularization=0,
+            min_node_weight=0,
+            feature_column_group_id=0,
+            bias_feature_id=-1,
+            class_id=-1,
+            multiclass_strategy=learner_pb2.LearnerConfig.FULL_HESSIAN))
+    partitions, gains, splits = (sess.run([partitions, gains, splits]))
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.sparse_float_binary_split_default_right
+
+    # Each leaf has 2 element vector.
+    self.assertEqual(2, len(left_child.value))
+    self.assertEqual(2, len(right_child.value))
+
+    self.assertEqual(0, split_node.split.feature_column)
+    self.assertAllClose(0.52, split_node.split.threshold)
+
+  def testMakeCategoricalEqualitySplit(self):
+    """Tests split handler op for categorical equality split."""
+    with self.test_session() as sess:
+      # The data looks like the following after dividing by number of steps (2).
+      # Gradients    | Partition | Feature ID     |
+      # (0.9, 0.39)  | 0         | -1             |
+      # (0.2, 0.12)  | 0         | 1              |
+      # (1.4, 0.32)  | 0         | 2              |
+      # (4.0, 0.13)  | 1         | -1             |
+      # (4.0, 0.13)  | 1         | 1              |
+      gradients = [1.8, 0.4, 2.8, 8.0, 8.0]
+      hessians = [0.78, 0.24, 0.64, 0.26, 0.26]
+      partition_ids = [0, 0, 0, 1, 1]
+      feature_ids = array_ops.constant([-1, 1, 2, -1, 1], dtype=dtypes.int64)
+      partitions, gains, splits = (
+          split_handler_ops.build_categorical_equality_splits(
+              num_minibatches=2,
+              partition_ids=partition_ids,
+              feature_ids=feature_ids,
+              gradients=gradients,
+              hessians=hessians,
+              l1_regularization=0.1,
+              l2_regularization=1,
+              tree_complexity_regularization=0,
+              min_node_weight=0,
+              feature_column_group_id=0,
+              bias_feature_id=-1,
+              class_id=-1,
+              multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
+      partitions, gains, splits = sess.run([partitions, gains, splits])
+    self.assertAllEqual([0, 1], partitions)
+
+    # Check the split on partition 0.
+    # -(0.2 + 1.2 - 0.1) / (0.12 + 0.2 + 1)
+    expected_left_weight = -0.9848484848484846
+
+    # (0.2 + 1.2 - 0.1) ** 2 / (0.12 + 0.2 + 1)
+    expected_left_gain = 1.2803030303030298
+
+    # -(-0.5 + 0.1) / (0.07 + 1)
+    expected_right_weight = 0.37383177570093457
+
+    # (-0.5 + 0.1) ** 2 / (0.07 + 1)
+    expected_right_gain = 0.14953271028037385
+
+    # (0.2 + -0.5 + 1.2 - 0.1) ** 2 / (0.12 + 0.07 + 0.2 + 1)
+    expected_bias_gain = 0.46043165467625885
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.categorical_id_binary_split
+
+    self.assertEqual(0, split_node.feature_column)
+
+    self.assertEqual(2, split_node.feature_id)
+
+    self.assertAllClose(
+        expected_left_gain + expected_right_gain - expected_bias_gain, gains[0],
+        0.00001)
+
+    self.assertAllClose([expected_left_weight], left_child.value, 0.00001)
+
+    self.assertAllClose([expected_right_weight], right_child.value, 0.00001)
+
+    # Check the split on partition 1.
+    # (-4 + 0.1) / (0.13 + 1)
+    expected_left_weight = -3.4513274336283186
+    # (-4 + 0.1) ** 2 / (0.13 + 1)
+    expected_left_gain = 13.460176991150442
+    expected_right_weight = 0
+    expected_right_gain = 0
+    # (-4 + 0.1) ** 2 / (0.13 + 1)
+    expected_bias_gain = 13.460176991150442
+
+    # Verify candidate for partition 1, there's only one active feature here
+    # so zero gain is expected.
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[1])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.categorical_id_binary_split
+    self.assertAllClose(0.0, gains[1], 0.00001)
+
+    self.assertAllClose([expected_left_weight], left_child.value, 0.00001)
+
+    self.assertAllClose([expected_right_weight], right_child.value, 0.00001)
+
+    self.assertEqual(0, split_node.feature_column)
+
+    self.assertEqual(1, split_node.feature_id)
+
+  def testMakeMulticlassCategoricalEqualitySplit(self):
+    """Tests split handler op for categorical equality split in multiclass."""
+    with self.test_session() as sess:
+      gradients = array_ops.constant([[1.8, 3.5], [2.4, 1.0], [0.4, 4.0],
+                                      [9.0, 3.1], [3.0, 0.8]])
+
+      hessian_0 = [[0.78, 1], [12, 1]]
+      hessian_1 = [[0.4, 1], [1, 1]]
+      hessian_2 = [[0.24, 1], [1, 1]]
+      hessian_3 = [[0.16, 2], [-1, 1]]
+      hessian_4 = [[0.6, 1], [2, 1]]
+
+      hessians = array_ops.constant(
+          [hessian_0, hessian_1, hessian_2, hessian_3, hessian_4])
+      partition_ids = [0, 0, 0, 1, 1]
+      feature_ids = array_ops.constant([-1, 1, 2, -1, 1], dtype=dtypes.int64)
+      partitions, gains, splits = (
+          split_handler_ops.build_categorical_equality_splits(
+              num_minibatches=2,
+              partition_ids=partition_ids,
+              feature_ids=feature_ids,
+              gradients=gradients,
+              hessians=hessians,
+              l1_regularization=0.1,
+              l2_regularization=1,
+              tree_complexity_regularization=0,
+              min_node_weight=0,
+              feature_column_group_id=0,
+              bias_feature_id=-1,
+              class_id=-1,
+              multiclass_strategy=learner_pb2.LearnerConfig.FULL_HESSIAN))
+      partitions, gains, splits = sess.run([partitions, gains, splits])
+    self.assertAllEqual([0, 1], partitions)
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[1])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.categorical_id_binary_split
+
+    # Each leaf has 2 element vector.
+    self.assertEqual(2, len(left_child.value))
+    self.assertEqual(2, len(right_child.value))
+
+    self.assertEqual(0, split_node.feature_column)
+    self.assertEqual(1, split_node.feature_id)
+
+  def testMakeCategoricalEqualitySplitEmptyInput(self):
+    with self.test_session() as sess:
+      gradients = []
+      hessians = []
+      partition_ids = []
+      feature_ids = []
+      partitions, gains, splits = (
+          split_handler_ops.build_categorical_equality_splits(
+              num_minibatches=0,
+              partition_ids=partition_ids,
+              feature_ids=feature_ids,
+              gradients=gradients,
+              hessians=hessians,
+              l1_regularization=0.1,
+              l2_regularization=1,
+              tree_complexity_regularization=0,
+              min_node_weight=0,
+              feature_column_group_id=0,
+              bias_feature_id=-1,
+              class_id=-1,
+              multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
+      partitions, gains, splits = (sess.run([partitions, gains, splits]))
+    self.assertEqual(0, len(partitions))
+    self.assertEqual(0, len(gains))
+    self.assertEqual(0, len(splits))
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0022d4ad52b0699e6706ad04435f09d0d1cd57c3
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py
@@ -0,0 +1,389 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test for checking stats accumulator related ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.boosted_trees.python.ops import stats_accumulator_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import googletest
+
+
+class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
+  """Tests for scalar gradients and hessians accumulator."""
+
+  def testSimpleAcculumator(self):
+    with self.test_session() as sess:
+      accumulator = stats_accumulator_ops.StatsAccumulator(
+          stamp_token=0,
+          gradient_shape=tensor_shape.scalar(),
+          hessian_shape=tensor_shape.scalar())
+      with ops.control_dependencies([accumulator._create_op]):
+        op1 = accumulator.add(
+            stamp_token=0,
+            partition_ids=[1, 2],
+            feature_ids=[2, 3],
+            gradients=[0.1, 0.3],
+            hessians=[0.2, 0.4])
+        op2 = accumulator.add(0, [1], [2], [0.1], [0.2])
+
+      with ops.control_dependencies([op1, op2]):
+        num_updates, partition, feature, grads, hessians = accumulator.flush(
+            stamp_token=0, next_stamp_token=1)
+        num_updates, partition, feature, grads, hessians = sess.run(
+            [num_updates, partition, feature, grads, hessians])
+
+      result = _AccumulatorResultToDict(partition, feature, grads, hessians)
+      self.assertEqual(num_updates, 2)
+      self.assertEqual(len(result), 2)
+      self.assertAllClose(result[(1, 2)], [0.2, 0.4])
+      self.assertAllClose(result[(2, 3)], [0.3, 0.4])
+
+  def testDropStaleUpdate(self):
+    with self.test_session() as sess:
+      accumulator = stats_accumulator_ops.StatsAccumulator(
+          stamp_token=0,
+          gradient_shape=tensor_shape.scalar(),
+          hessian_shape=tensor_shape.scalar())
+      with ops.control_dependencies([accumulator._create_op]):
+        op1 = accumulator.add(
+            stamp_token=0,
+            partition_ids=[1, 2],
+            feature_ids=[2, 3],
+            gradients=[0.1, 0.3],
+            hessians=[0.2, 0.4])
+        op2 = accumulator.add(
+            stamp_token=-1,
+            partition_ids=[1],
+            feature_ids=[2],
+            gradients=[0.1],
+            hessians=[0.2])
+
+      with ops.control_dependencies([op1, op2]):
+        num_updates, partition, feature, grads, hessians = accumulator.flush(
+            stamp_token=0, next_stamp_token=1)
+        num_updates, partition, feature, grads, hessians = sess.run(
+            [num_updates, partition, feature, grads, hessians])
+
+      result = _AccumulatorResultToDict(partition, feature, grads, hessians)
+      self.assertEqual(num_updates, 1)
+      self.assertEqual(len(result), 2)
+      self.assertAllClose(result[(1, 2)], [0.1, 0.2])
+      self.assertAllClose(result[(2, 3)], [0.3, 0.4])
+
+  def testSerialize(self):
+    with self.test_session() as sess:
+      accumulator = stats_accumulator_ops.StatsAccumulator(
+          stamp_token=0,
+          gradient_shape=tensor_shape.scalar(),
+          hessian_shape=tensor_shape.scalar())
+      with ops.control_dependencies([accumulator._create_op]):
+        op1 = accumulator.add(
+            stamp_token=0,
+            partition_ids=[1, 2],
+            feature_ids=[2, 3],
+            gradients=[0.1, 0.3],
+            hessians=[0.2, 0.4])
+
+      with ops.control_dependencies([op1]):
+        (stamp_token, num_updates, partition_1, feature_1, grads_1,
+         hessians_1) = accumulator.serialize()
+      # Make sure that the accumulator hasn't changed during serialization.
+      with ops.control_dependencies([stamp_token]):
+        num_updates_2, partition_2, feature_2, grads_2, hessians_2 = (
+            accumulator.flush(stamp_token=0, next_stamp_token=1))
+        (stamp_token, num_updates, partition_1, feature_1, grads_1, hessians_1,
+         num_updates_2, partition_2, feature_2, grads_2, hessians_2) = sess.run(
+             [
+                 stamp_token, num_updates, partition_1, feature_1, grads_1,
+                 hessians_1, num_updates_2, partition_2, feature_2, grads_2,
+                 hessians_2
+             ])
+
+      result_1 = _AccumulatorResultToDict(partition_1, feature_1, grads_1,
+                                          hessians_1)
+      result_2 = _AccumulatorResultToDict(partition_2, feature_2, grads_2,
+                                          hessians_2)
+      self.assertEqual(num_updates, 1)
+      self.assertEqual(num_updates_2, 1)
+      self.assertEqual(len(result_1), 2)
+      self.assertAllClose(result_1[(1, 2)], [0.1, 0.2])
+      self.assertAllClose(result_1[(2, 3)], [0.3, 0.4])
+      self.assertAllEqual(result_1, result_2)
+      self.assertEqual(0, stamp_token)
+
+  def testDeserialize(self):
+    with self.test_session() as sess:
+      accumulator = stats_accumulator_ops.StatsAccumulator(
+          stamp_token=0,
+          gradient_shape=tensor_shape.scalar(),
+          hessian_shape=tensor_shape.scalar())
+      with ops.control_dependencies([accumulator._create_op]):
+        # These will be deleted due to deserialize call.
+        op1 = accumulator.add(
+            stamp_token=0,
+            partition_ids=[1, 2],
+            feature_ids=[2, 3],
+            gradients=[0.1, 0.3],
+            hessians=[0.2, 0.4])
+
+      with ops.control_dependencies([op1]):
+        deserialize = (accumulator.deserialize(
+            stamp_token=2,
+            num_updates=3,
+            partition_ids=[3, 4],
+            feature_ids=[5, 6],
+            gradients=[0.4, 0.5],
+            hessians=[0.6, 0.7]))
+      with ops.control_dependencies([deserialize]):
+        num_updates, partition, feature, grads, hessians = accumulator.flush(
+            stamp_token=2, next_stamp_token=3)
+        num_updates, partition, feature, grads, hessians = sess.run(
+            [num_updates, partition, feature, grads, hessians])
+
+      result = _AccumulatorResultToDict(partition, feature, grads,
+                                        hessians)
+      self.assertEqual(num_updates, 3)
+      self.assertEqual(len(result), 2)
+      self.assertAllClose(result[(3, 5)], [0.4, 0.6])
+      self.assertAllClose(result[(4, 6)], [0.5, 0.7])
+
+  def testMakeSummary(self):
+    with self.test_session() as sess:
+      accumulator = stats_accumulator_ops.StatsAccumulator(
+          stamp_token=0,
+          gradient_shape=tensor_shape.scalar(),
+          hessian_shape=tensor_shape.scalar())
+      partition, feature, grads, hessians = accumulator._make_summary(
+          partition_ids=[1, 2, 1],
+          feature_ids=[2, 3, 2],
+          gradients=[0.1, 0.3, 0.1],
+          hessians=[0.2, 0.4, 0.2])
+      partition, feature, grads, hessians = sess.run(
+          [partition, feature, grads, hessians])
+      result = _AccumulatorResultToDict(partition, feature, grads, hessians)
+      self.assertEqual(len(result), 2)
+      self.assertAllClose(result[(1, 2)], [0.2, 0.4])
+      self.assertAllClose(result[(2, 3)], [0.3, 0.4])
+
+
+class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
+  """Tests for tensor gradients and hessians accumulator."""
+
+  def testSimpleAcculumator(self):
+    with self.test_session() as sess:
+      accumulator = stats_accumulator_ops.StatsAccumulator(
+          stamp_token=0,
+          gradient_shape=tensor_shape.TensorShape([2]),
+          hessian_shape=tensor_shape.TensorShape([2, 2]))
+      with ops.control_dependencies([accumulator._create_op]):
+        op1 = accumulator.add(
+            stamp_token=0,
+            partition_ids=[1, 2],
+            feature_ids=[2, 3],
+            # Two values for gradients,
+            gradients=[[0.1, 0.1], [0.2, 0.2]],
+            # A 2x2 matrix for each hessian.
+            hessians=[[[0.01, 0.02], [0.03, 0.04]],
+                      [[0.05, 0.06], [0.07, 0.08]]])
+        op2 = accumulator.add(
+            stamp_token=0,
+            partition_ids=[1],
+            feature_ids=[2],
+            gradients=[[0.10, 0.11]],
+            hessians=[[[0.011, 0.022], [0.033, 0.044]]])
+
+      with ops.control_dependencies([op1, op2]):
+        num_updates, partition, feature, grads, hessians = accumulator.flush(
+            stamp_token=0, next_stamp_token=1)
+        num_updates, partition, feature, grads, hessians = sess.run(
+            [num_updates, partition, feature, grads, hessians])
+
+      result = _AccumulatorResultToDict(partition, feature, grads, hessians)
+      self.assertEqual(num_updates, 2)
+      self.assertEqual(len(result), 2)
+      self.assertAllClose(result[(1, 2)][0], [0.20, 0.21])
+      self.assertAllClose(result[(1, 2)][1], [[0.021, 0.042], [0.063, 0.084]])
+      self.assertAllClose(result[(2, 3)][0], [0.2, 0.2])
+      self.assertAllClose(result[(2, 3)][1], [[0.05, 0.06], [0.07, 0.08]])
+
+  def testDropStaleUpdate(self):
+    with self.test_session() as sess:
+      accumulator = stats_accumulator_ops.StatsAccumulator(
+          stamp_token=0,
+          gradient_shape=tensor_shape.TensorShape([2]),
+          hessian_shape=tensor_shape.TensorShape([2, 2]))
+      with ops.control_dependencies([accumulator._create_op]):
+        op1 = accumulator.add(
+            stamp_token=0,
+            partition_ids=[1, 2],
+            feature_ids=[2, 3],
+            # Two values for gradients,
+            gradients=[[0.1, 0.1], [0.2, 0.2]],
+            # A 2x2 matrix for each hessian.
+            hessians=[[[0.01, 0.02], [0.03, 0.04]],
+                      [[0.05, 0.06], [0.07, 0.08]]])
+        op2 = accumulator.add(
+            stamp_token=-1,
+            partition_ids=[1],
+            feature_ids=[2],
+            gradients=[[0.10, 0.11]],
+            hessians=[[[0.011, 0.022], [0.033, 0.044]]])
+
+      with ops.control_dependencies([op1, op2]):
+        num_updates, partition, feature, grads, hessians = accumulator.flush(
+            stamp_token=0, next_stamp_token=1)
+        num_updates, partition, feature, grads, hessians = sess.run(
+            [num_updates, partition, feature, grads, hessians])
+
+      result = _AccumulatorResultToDict(partition, feature, grads, hessians)
+      self.assertEqual(num_updates, 1)
+      self.assertEqual(len(result), 2)
+      self.assertAllClose(result[(1, 2)][0], [0.1, 0.1])
+      self.assertAllClose(result[(1, 2)][1], [[0.01, 0.02], [0.03, 0.04]])
+      self.assertAllClose(result[(2, 3)][0], [0.2, 0.2])
+      self.assertAllClose(result[(2, 3)][1], [[0.05, 0.06], [0.07, 0.08]])
+
+  def testSerialize(self):
+    with self.test_session() as sess:
+      accumulator = stats_accumulator_ops.StatsAccumulator(
+          stamp_token=0,
+          gradient_shape=tensor_shape.TensorShape([2]),
+          hessian_shape=tensor_shape.TensorShape([2, 2]))
+      with ops.control_dependencies([accumulator._create_op]):
+        op1 = accumulator.add(
+            stamp_token=0,
+            partition_ids=[1, 2],
+            feature_ids=[2, 3],
+            # Two values for gradients,
+            gradients=[[0.1, 0.1], [0.2, 0.2]],
+            # A 2x2 matrix for each hessian.
+            hessians=[[[0.01, 0.02], [0.03, 0.04]],
+                      [[0.05, 0.06], [0.07, 0.08]]])
+
+      with ops.control_dependencies([op1]):
+        (stamp_token, num_updates_1, partition_1, feature_1, grads_1,
+         hessians_1) = accumulator.serialize()
+      # Make sure that the accumulator hasn't changed during serialization.
+      with ops.control_dependencies([stamp_token]):
+        num_updates_2, partition_2, feature_2, grads_2, hessians_2 = (
+            accumulator.flush(stamp_token=0, next_stamp_token=1))
+        (stamp_token, num_updates_1, partition_1, feature_1, grads_1,
+         hessians_1, num_updates_2, partition_2, feature_2, grads_2,
+         hessians_2) = sess.run([
+             stamp_token, num_updates_1, partition_1, feature_1, grads_1,
+             hessians_1, num_updates_2, partition_2, feature_2, grads_2,
+             hessians_2
+         ])
+
+      result_1 = _AccumulatorResultToDict(partition_1, feature_1, grads_1,
+                                          hessians_1)
+      result_2 = _AccumulatorResultToDict(partition_2, feature_2, grads_2,
+                                          hessians_2)
+
+      self.assertEqual(num_updates_1, 1)
+      self.assertEqual(num_updates_2, 1)
+      self.assertEqual(len(result_1), 2)
+      self.assertAllClose(result_1[(1, 2)][0], [0.1, 0.1])
+      self.assertAllClose(result_1[(1, 2)][1], [[0.01, 0.02], [0.03, 0.04]])
+      self.assertAllClose(result_1[(2, 3)][0], [0.2, 0.2])
+      self.assertAllClose(result_1[(2, 3)][1], [[0.05, 0.06], [0.07, 0.08]])
+
+      self.assertAllEqual(result_1[1, 2][0], result_2[1, 2][0])
+      self.assertAllEqual(result_1[1, 2][1], result_2[1, 2][1])
+      self.assertAllEqual(result_1[2, 3][0], result_2[2, 3][0])
+      self.assertAllEqual(result_1[2, 3][1], result_2[2, 3][1])
+
+  def testDeserialize(self):
+    with self.test_session() as sess:
+      accumulator = stats_accumulator_ops.StatsAccumulator(
+          stamp_token=0,
+          gradient_shape=tensor_shape.TensorShape([2]),
+          hessian_shape=tensor_shape.TensorShape([2, 2]))
+      with ops.control_dependencies([accumulator._create_op]):
+        # These will be deleted due to deserialize call.
+        op1 = accumulator.add(
+            stamp_token=0,
+            partition_ids=[1, 2],
+            feature_ids=[2, 3],
+            # Two values for gradients,
+            gradients=[[0.1, 0.1], [0.2, 0.2]],
+            # A 2x2 matrix for each hessian.
+            hessians=[[[0.01, 0.02], [0.03, 0.04]],
+                      [[0.05, 0.06], [0.07, 0.08]]])
+
+      with ops.control_dependencies([op1]):
+        deserialize = accumulator.deserialize(
+            stamp_token=2,
+            num_updates=3,
+            partition_ids=[3, 4],
+            feature_ids=[4, 5],
+            # Two values for gradients,
+            gradients=[[0.3, 0.3], [0.5, 0.5]],
+            # A 2x2 matrix for each hessian.
+            hessians=[[[0.03, 0.04], [0.05, 0.06]], [[0.07, 0.08], [0.09,
+                                                                    0.10]]])
+      with ops.control_dependencies([deserialize]):
+        num_updates, partition, feature, grads, hessians = accumulator.flush(
+            stamp_token=2, next_stamp_token=3)
+        num_updates, partition, feature, grads, hessians = sess.run(
+            [num_updates, partition, feature, grads, hessians])
+
+      result = _AccumulatorResultToDict(partition, feature, grads,
+                                        hessians)
+      self.assertEqual(num_updates, 3)
+      self.assertEqual(len(result), 2)
+      self.assertAllClose(result[(3, 4)][0], [0.3, 0.3])
+      self.assertAllClose(result[(3, 4)][1], [[0.03, 0.04], [0.05, 0.06]])
+      self.assertAllClose(result[(4, 5)][0], [0.5, 0.5])
+      self.assertAllClose(result[(4, 5)][1], [[0.07, 0.08], [0.09, 0.10]])
+
+  def testMakeSummary(self):
+    with self.test_session() as sess:
+      accumulator = stats_accumulator_ops.StatsAccumulator(
+          stamp_token=0,
+          gradient_shape=tensor_shape.TensorShape([2]),
+          hessian_shape=tensor_shape.TensorShape([2, 2]))
+      partition, feature, grads, hessians = accumulator._make_summary(
+          partition_ids=[1, 2, 1],
+          feature_ids=[2, 3, 2],
+          # Two values for gradients,
+          gradients=[[0.1, 0.1], [0.2, 0.2], [0.10, 0.11]],
+          # A 2x2 matrix for each hessian.
+          hessians=[[[0.01, 0.02], [0.03, 0.04]], [[0.05, 0.06], [0.07, 0.08]],
+                    [[0.011, 0.022], [0.033, 0.044]]])
+      partition, feature, grads, hessians = sess.run(
+          [partition, feature, grads, hessians])
+
+      result = _AccumulatorResultToDict(partition, feature, grads, hessians)
+      self.assertEqual(len(result), 2)
+      self.assertAllClose(result[(1, 2)][0], [0.20, 0.21])
+      self.assertAllClose(result[(1, 2)][1], [[0.021, 0.042], [0.063, 0.084]])
+      self.assertAllClose(result[(2, 3)][0], [0.2, 0.2])
+      self.assertAllClose(result[(2, 3)][1], [[0.05, 0.06], [0.07, 0.08]])
+
+
+def _AccumulatorResultToDict(partition, feature, grads, hessians):
+  """Converts the inputs to a dictionary since the ordering changes."""
+  return {(partition[i], feature[i]): (grads[i], hessians[i])
+          for i in range(len(partition))}
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0413fee5a8249d15f2cdae095dc7fa2c76a22b8
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py
@@ -0,0 +1,1590 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the GTFlow training Ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from google.protobuf import text_format
+
+from tensorflow.contrib.boosted_trees.proto import learner_pb2
+from tensorflow.contrib.boosted_trees.proto import split_info_pb2
+from tensorflow.contrib.boosted_trees.proto import tree_config_pb2
+from tensorflow.contrib.boosted_trees.python.ops import model_ops
+from tensorflow.contrib.boosted_trees.python.ops import training_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import googletest
+
+
+def _gen_learner_config(num_classes,
+                        l1_reg,
+                        l2_reg,
+                        tree_complexity,
+                        max_depth,
+                        min_node_weight,
+                        pruning_mode,
+                        growing_mode,
+                        dropout_probability=None,
+                        dropout_learning_rate=None,
+                        dropout_prob_of_skipping=None):
+  """Create a serialized learner config with the desired settings."""
+  config = learner_pb2.LearnerConfig()
+  config.num_classes = num_classes
+  config.regularization.l1 = l1_reg
+  config.regularization.l2 = l2_reg
+  config.regularization.tree_complexity = tree_complexity
+  config.constraints.max_tree_depth = max_depth
+  config.constraints.min_node_weight = min_node_weight
+  config.pruning_mode = pruning_mode
+  config.growing_mode = growing_mode
+
+  if dropout_probability is not None:
+    config.learning_rate_tuner.dropout.dropout_probability = dropout_probability
+  if dropout_learning_rate is not None:
+    config.learning_rate_tuner.dropout.learning_rate = dropout_learning_rate
+  if dropout_prob_of_skipping is not None:
+    config.learning_rate_tuner.dropout.dropout_prob_of_skipping = (
+        dropout_prob_of_skipping)
+  return config.SerializeToString()
+
+
+def _gen_dense_split_info(fc, threshold, left_weight, right_weight):
+  split_str = """
+    split_node {
+      dense_float_binary_split {
+        feature_column: %d
+        threshold: %f
+      }
+    }
+    left_child {
+      sparse_vector {
+        index: 0
+        value: %f
+      }
+    }
+    right_child {
+      sparse_vector {
+        index: 0
+        value: %f
+      }
+    }""" % (fc, threshold, left_weight, right_weight)
+  split = split_info_pb2.SplitInfo()
+  text_format.Merge(split_str, split)
+  return split.SerializeToString()
+
+
+def _gen_categorical_split_info(fc, feat_id, left_weight, right_weight):
+  split_str = """
+    split_node {
+      categorical_id_binary_split {
+        feature_column: %d
+        feature_id: %d
+      }
+    }
+    left_child {
+      sparse_vector {
+        index: 0
+        value: %f
+      }
+    }
+    right_child {
+      sparse_vector {
+        index: 0
+        value: %f
+      }
+    }""" % (fc, feat_id, left_weight, right_weight)
+  split = split_info_pb2.SplitInfo()
+  text_format.Merge(split_str, split)
+  return split.SerializeToString()
+
+
+def _get_bias_update(grads, hess):
+  return array_ops.where(hess > 0, -grads / hess, array_ops.zeros_like(grads))
+
+
+class CenterTreeEnsembleBiasOpTest(test_util.TensorFlowTestCase):
+  """Tests for centering tree ensemble bias."""
+
+  def testCenterBias(self):
+    """Tests bias centering for multiple iterations."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = _gen_learner_config(
+          num_classes=3,
+          l1_reg=0,
+          l2_reg=0,
+          tree_complexity=0,
+          max_depth=4,
+          min_node_weight=0,
+          pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE,
+          # Dropout does not change anything here.
+          dropout_probability=0.5)
+
+      # Center bias for the initial step.
+      grads = constant_op.constant([0.4, -0.3])
+      hess = constant_op.constant([2.0, 1.0])
+      continue_centering1 = training_ops.center_tree_ensemble_bias(
+          tree_ensemble_handle,
+          stamp_token=0,
+          next_stamp_token=1,
+          delta_updates=_get_bias_update(grads, hess),
+          learner_config=learner_config)
+      continue_centering = session.run(continue_centering1)
+      self.assertEqual(continue_centering, True)
+
+      # Validate ensemble state.
+      # dim 0 update: -0.4/2.0 = -0.2
+      # dim 1 update: +0.3/1.0 = +0.3
+      new_stamp, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      stats = session.run(
+          training_ops.tree_ensemble_stats(tree_ensemble_handle, stamp_token=1))
+      tree_ensemble_config.ParseFromString(serialized)
+      expected_result = """
+        trees {
+          nodes {
+            leaf {
+              vector {
+                value: -0.2
+                value: 0.3
+              }
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertEqual(stats.num_trees, 1)
+      self.assertEqual(stats.num_layers, 1)
+      self.assertEqual(stats.active_tree, 1)
+      self.assertEqual(stats.active_layer, 1)
+      self.assertEqual(stats.attempted_trees, 1)
+      self.assertEqual(stats.attempted_layers, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble_config)
+
+      # Center bias for another step.
+      # dim 0 update: -0.06/0.5 = -0.12
+      # dim 1 update: -0.01/0.5 = -0.02
+      grads = constant_op.constant([0.06, 0.01])
+      hess = constant_op.constant([0.5, 0.5])
+      continue_centering2 = training_ops.center_tree_ensemble_bias(
+          tree_ensemble_handle,
+          stamp_token=1,
+          next_stamp_token=2,
+          delta_updates=_get_bias_update(grads, hess),
+          learner_config=learner_config)
+      continue_centering = session.run(continue_centering2)
+      self.assertEqual(continue_centering, True)
+
+      # Validate ensemble state.
+      new_stamp, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      stats = session.run(
+          training_ops.tree_ensemble_stats(tree_ensemble_handle, stamp_token=2))
+      tree_ensemble_config.ParseFromString(serialized)
+      expected_result = """
+        trees {
+          nodes {
+            leaf {
+              vector {
+                value: -0.32
+                value: 0.28
+              }
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """
+      self.assertEqual(new_stamp, 2)
+      self.assertEqual(stats.num_trees, 1)
+      self.assertEqual(stats.num_layers, 1)
+      self.assertEqual(stats.active_tree, 1)
+      self.assertEqual(stats.active_layer, 1)
+      self.assertEqual(stats.attempted_trees, 1)
+      self.assertEqual(stats.attempted_layers, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble_config)
+
+      # Center bias for another step, but this time updates are negligible.
+      grads = constant_op.constant([0.0000001, -0.00003])
+      hess = constant_op.constant([0.5, 0.0])
+      continue_centering3 = training_ops.center_tree_ensemble_bias(
+          tree_ensemble_handle,
+          stamp_token=2,
+          next_stamp_token=3,
+          delta_updates=_get_bias_update(grads, hess),
+          learner_config=learner_config)
+      continue_centering = session.run(continue_centering3)
+      self.assertEqual(continue_centering, False)
+
+      # Validate ensemble stamp.
+      new_stamp, _ = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      stats = session.run(
+          training_ops.tree_ensemble_stats(tree_ensemble_handle, stamp_token=3))
+      self.assertEqual(new_stamp, 3)
+      self.assertEqual(stats.num_trees, 1)
+      self.assertEqual(stats.num_layers, 1)
+      self.assertEqual(stats.active_tree, 1)
+      self.assertEqual(stats.active_layer, 1)
+      self.assertEqual(stats.attempted_trees, 1)
+      self.assertEqual(stats.attempted_layers, 1)
+
+
+class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
+  """Tests for growing tree ensemble from split candidates."""
+
+  def testGrowEmptyEnsemble(self):
+    """Test growing an empty ensemble."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = _gen_learner_config(
+          num_classes=2,
+          l1_reg=0,
+          l2_reg=0,
+          tree_complexity=0,
+          max_depth=1,
+          min_node_weight=0,
+          pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE,
+          # Dropout does not change anything here, tree is not finalized.
+          dropout_probability=0.5)
+
+      # Prepare handler inputs.
+      # Note that handlers 1 & 3 have the same gain but different splits.
+      handler1_partitions = np.array([0], dtype=np.int32)
+      handler1_gains = np.array([7.62], dtype=np.float32)
+      handler1_split = [_gen_dense_split_info(0, 0.52, -4.375, 7.143)]
+      handler2_partitions = np.array([0], dtype=np.int32)
+      handler2_gains = np.array([0.63], dtype=np.float32)
+      handler2_split = [_gen_dense_split_info(0, 0.23, -0.6, 0.24)]
+      handler3_partitions = np.array([0], dtype=np.int32)
+      handler3_gains = np.array([7.62], dtype=np.float32)
+      handler3_split = [_gen_categorical_split_info(0, 7, -4.375, 7.143)]
+
+      # Grow tree ensemble.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=0,
+          next_stamp_token=1,
+          learning_rate=0.1,
+          partition_ids=[
+              handler1_partitions, handler2_partitions, handler3_partitions
+          ],
+          gains=[handler1_gains, handler2_gains, handler3_gains],
+          splits=[handler1_split, handler2_split, handler3_split],
+          learner_config=learner_config,
+          dropout_seed=123,
+          center_bias=True)
+      session.run(grow_op)
+
+      # Expect the simpler split from handler 1 to be chosen.
+      # The grown tree should be finalized as max tree depth is 1.
+      new_stamp, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      stats = session.run(
+          training_ops.tree_ensemble_stats(tree_ensemble_handle, stamp_token=1))
+      tree_ensemble_config.ParseFromString(serialized)
+      expected_result = """
+        trees {
+          nodes {
+            dense_float_binary_split {
+              threshold: 0.52
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: -4.375
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 7.143
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertEqual(stats.num_trees, 1)
+      self.assertEqual(stats.num_layers, 1)
+      self.assertEqual(stats.active_tree, 1)
+      self.assertEqual(stats.active_layer, 1)
+      self.assertEqual(stats.attempted_trees, 1)
+      self.assertEqual(stats.attempted_layers, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble_config)
+
+  def testGrowExistingEnsembleTreeNotFinalized(self):
+    """Test growing an existing ensemble with the last tree not finalized."""
+    with self.test_session() as session:
+      # Create existing ensemble with one root split
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      text_format.Merge("""
+        trees {
+          nodes {
+            categorical_id_binary_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: 7.61999988556
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 7.14300012589
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: -4.375
+              }
+            }
+          }
+        }
+        tree_weights: 0.10000000149
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 1
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = _gen_learner_config(
+          num_classes=2,
+          l1_reg=0,
+          l2_reg=0,
+          tree_complexity=0,
+          max_depth=3,
+          min_node_weight=0,
+          pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE,
+          # Dropout does not change anything here - tree is not finalized.
+          dropout_probability=0.5)
+
+      # Prepare handler inputs.
+      # Handler 1 only has a candidate for partition 1, handler 2 has candidates
+      # for both partitions and handler 3 only has a candidate for partition 2.
+      handler1_partitions = np.array([1], dtype=np.int32)
+      handler1_gains = np.array([1.4], dtype=np.float32)
+      handler1_split = [_gen_dense_split_info(0, 0.21, -6.0, 1.65)]
+      handler2_partitions = np.array([1, 2], dtype=np.int32)
+      handler2_gains = np.array([0.63, 2.7], dtype=np.float32)
+      handler2_split = [
+          _gen_dense_split_info(0, 0.23, -0.6, 0.24),
+          _gen_categorical_split_info(1, 7, -1.5, 2.3)
+      ]
+      handler3_partitions = np.array([2], dtype=np.int32)
+      handler3_gains = np.array([1.7], dtype=np.float32)
+      handler3_split = [_gen_categorical_split_info(0, 3, -0.75, 1.93)]
+
+      # Grow tree ensemble.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=0,
+          next_stamp_token=1,
+          learning_rate=0.1,
+          partition_ids=[
+              handler1_partitions, handler2_partitions, handler3_partitions
+          ],
+          gains=[handler1_gains, handler2_gains, handler3_gains],
+          splits=[handler1_split, handler2_split, handler3_split],
+          learner_config=learner_config,
+          dropout_seed=123,
+          center_bias=True)
+      session.run(grow_op)
+
+      # Expect the split for partition 1 to be chosen from handler 1 and
+      # the split for partition 2 to be chosen from handler 2.
+      # The grown tree should not be finalized as max tree depth is 3 and
+      # it's only grown 2 layers.
+      new_stamp, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      stats = session.run(
+          training_ops.tree_ensemble_stats(tree_ensemble_handle, stamp_token=1))
+      tree_ensemble_config.ParseFromString(serialized)
+      expected_result = """
+        trees {
+          nodes {
+            categorical_id_binary_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: 7.61999988556
+            }
+          }
+          nodes {
+            dense_float_binary_split {
+              threshold: 0.21
+              left_id: 3
+              right_id: 4
+            }
+            node_metadata {
+              gain: 1.4
+            }
+          }
+          nodes {
+            categorical_id_binary_split {
+              feature_column: 1
+              feature_id: 7
+              left_id: 5
+              right_id: 6
+            }
+            node_metadata {
+              gain: 2.7
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: -6.0
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 1.65
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: -1.5
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 2.3
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 2
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertEqual(stats.num_trees, 0)
+      self.assertEqual(stats.num_layers, 2)
+      self.assertEqual(stats.active_tree, 1)
+      self.assertEqual(stats.active_layer, 2)
+      self.assertEqual(stats.attempted_trees, 1)
+      self.assertEqual(stats.attempted_layers, 2)
+      self.assertProtoEquals(expected_result, tree_ensemble_config)
+
+  def testGrowExistingEnsembleTreeFinalized(self):
+    """Test growing an existing ensemble with the last tree finalized."""
+    with self.test_session() as session:
+      # Create existing ensemble with one root split
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      text_format.Merge("""
+        trees {
+          nodes {
+            categorical_id_binary_split {
+              feature_column: 3
+              feature_id: 7
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: 1.3
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 2.3
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: -0.9
+              }
+            }
+          }
+        }
+        tree_weights: 0.10000000149
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = _gen_learner_config(
+          num_classes=2,
+          l1_reg=0,
+          l2_reg=0,
+          tree_complexity=0,
+          max_depth=1,
+          min_node_weight=0,
+          pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
+
+      # Prepare handler inputs.
+      handler1_partitions = np.array([0], dtype=np.int32)
+      handler1_gains = np.array([7.62], dtype=np.float32)
+      handler1_split = [_gen_dense_split_info(5, 0.52, -4.375, 7.143)]
+      handler2_partitions = np.array([0], dtype=np.int32)
+      handler2_gains = np.array([0.63], dtype=np.float32)
+      handler2_split = [_gen_dense_split_info(2, 0.23, -0.6, 0.24)]
+      handler3_partitions = np.array([0], dtype=np.int32)
+      handler3_gains = np.array([7.62], dtype=np.float32)
+      handler3_split = [_gen_categorical_split_info(8, 7, -4.375, 7.143)]
+
+      # Grow tree ensemble.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=0,
+          next_stamp_token=1,
+          learning_rate=0.2,
+          partition_ids=[
+              handler1_partitions, handler2_partitions, handler3_partitions
+          ],
+          gains=[handler1_gains, handler2_gains, handler3_gains],
+          splits=[handler1_split, handler2_split, handler3_split],
+          learner_config=learner_config,
+          dropout_seed=123,
+          center_bias=True)
+      session.run(grow_op)
+
+      # Expect a new tree to be added with the split from handler 1.
+      new_stamp, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      stats = session.run(
+          training_ops.tree_ensemble_stats(tree_ensemble_handle, stamp_token=1))
+      tree_ensemble_config.ParseFromString(serialized)
+      expected_result = """
+        trees {
+          nodes {
+            categorical_id_binary_split {
+              feature_column: 3
+              feature_id: 7
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: 1.3
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 2.3
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: -0.9
+              }
+            }
+          }
+        }
+        trees {
+          nodes {
+            dense_float_binary_split {
+              feature_column: 5
+              threshold: 0.52
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: -4.375
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 7.143
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_weights: 0.2
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 2
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertEqual(stats.num_trees, 2)
+      self.assertEqual(stats.num_layers, 2)
+      self.assertEqual(stats.active_tree, 2)
+      self.assertEqual(stats.active_layer, 1)
+      self.assertEqual(stats.attempted_trees, 2)
+      self.assertEqual(stats.attempted_layers, 2)
+      self.assertProtoEquals(expected_result, tree_ensemble_config)
+
+  def testGrowEnsemblePrePrune(self):
+    """Test growing an ensemble with pre-pruning."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = _gen_learner_config(
+          num_classes=2,
+          l1_reg=0,
+          l2_reg=0,
+          tree_complexity=0,
+          max_depth=1,
+          min_node_weight=0,
+          pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
+
+      # Prepare handler inputs.
+      # All handlers have negative gain.
+      handler1_partitions = np.array([0], dtype=np.int32)
+      handler1_gains = np.array([-0.62], dtype=np.float32)
+      handler1_split = [_gen_dense_split_info(0, 0.52, 0.01, 0.0143)]
+      handler2_partitions = np.array([0], dtype=np.int32)
+      handler2_gains = np.array([-1.3], dtype=np.float32)
+      handler2_split = [_gen_categorical_split_info(0, 7, 0.013, 0.0143)]
+
+      # Grow tree ensemble.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=0,
+          next_stamp_token=1,
+          learning_rate=0.1,
+          partition_ids=[handler1_partitions, handler2_partitions],
+          gains=[handler1_gains, handler2_gains],
+          splits=[handler1_split, handler2_split],
+          learner_config=learner_config,
+          dropout_seed=123,
+          center_bias=True)
+      session.run(grow_op)
+
+      # Expect the ensemble to be empty.
+      new_stamp, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      stats = session.run(
+          training_ops.tree_ensemble_stats(tree_ensemble_handle, stamp_token=1))
+      tree_ensemble_config.ParseFromString(serialized)
+      self.assertEqual(new_stamp, 1)
+      self.assertEqual(stats.num_trees, 0)
+      self.assertEqual(stats.num_layers, 0)
+      self.assertEqual(stats.active_tree, 0)
+      self.assertEqual(stats.active_layer, 0)
+      self.assertEqual(stats.attempted_trees, 1)
+      self.assertEqual(stats.attempted_layers, 1)
+      self.assertProtoEquals("""
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+
+  def testGrowEnsemblePostPruneNone(self):
+    """Test growing an empty ensemble."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = _gen_learner_config(
+          num_classes=2,
+          l1_reg=0,
+          l2_reg=0,
+          tree_complexity=0,
+          max_depth=1,
+          min_node_weight=0,
+          pruning_mode=learner_pb2.LearnerConfig.POST_PRUNE,
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
+
+      # Prepare handler inputs.
+      # Note that handlers 1 & 3 have the same gain but different splits.
+      handler1_partitions = np.array([0], dtype=np.int32)
+      handler1_gains = np.array([7.62], dtype=np.float32)
+      handler1_split = [_gen_dense_split_info(0, 0.52, -4.375, 7.143)]
+      handler2_partitions = np.array([0], dtype=np.int32)
+      handler2_gains = np.array([0.63], dtype=np.float32)
+      handler2_split = [_gen_dense_split_info(0, 0.23, -0.6, 0.24)]
+      handler3_partitions = np.array([0], dtype=np.int32)
+      handler3_gains = np.array([7.62], dtype=np.float32)
+      handler3_split = [_gen_categorical_split_info(0, 7, -4.375, 7.143)]
+
+      # Grow tree ensemble.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=0,
+          next_stamp_token=1,
+          learning_rate=0.1,
+          partition_ids=[
+              handler1_partitions, handler2_partitions, handler3_partitions
+          ],
+          gains=[handler1_gains, handler2_gains, handler3_gains],
+          splits=[handler1_split, handler2_split, handler3_split],
+          learner_config=learner_config,
+          dropout_seed=123,
+          center_bias=True)
+      session.run(grow_op)
+
+      # Expect the simpler split from handler 1 to be chosen.
+      # The grown tree should be finalized as max tree depth is 1.
+      new_stamp, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      stats = session.run(
+          training_ops.tree_ensemble_stats(tree_ensemble_handle, stamp_token=1))
+      tree_ensemble_config.ParseFromString(serialized)
+      expected_result = """
+        trees {
+          nodes {
+            dense_float_binary_split {
+              threshold: 0.52
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: -4.375
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 7.143
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertEqual(stats.num_trees, 1)
+      self.assertEqual(stats.num_layers, 1)
+      self.assertEqual(stats.active_tree, 1)
+      self.assertEqual(stats.active_layer, 1)
+      self.assertEqual(stats.attempted_trees, 1)
+      self.assertEqual(stats.attempted_layers, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble_config)
+
+  def testGrowEnsemblePostPruneAll(self):
+    """Test growing an ensemble with post-pruning."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = _gen_learner_config(
+          num_classes=2,
+          l1_reg=0,
+          l2_reg=0,
+          tree_complexity=0,
+          max_depth=2,
+          min_node_weight=0,
+          pruning_mode=learner_pb2.LearnerConfig.POST_PRUNE,
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
+
+      # Prepare handler inputs.
+      # All handlers have negative gain.
+      handler1_partitions = np.array([0], dtype=np.int32)
+      handler1_gains = np.array([-1.3], dtype=np.float32)
+      handler1_split = [_gen_categorical_split_info(0, 7, 0.013, 0.0143)]
+      handler2_partitions = np.array([0], dtype=np.int32)
+      handler2_gains = np.array([-0.62], dtype=np.float32)
+      handler2_split = [_gen_dense_split_info(0, 0.33, 0.01, 0.0143)]
+
+      # Grow tree ensemble.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=0,
+          next_stamp_token=1,
+          learning_rate=0.1,
+          partition_ids=[handler1_partitions, handler2_partitions],
+          gains=[handler1_gains, handler2_gains],
+          splits=[handler1_split, handler2_split],
+          learner_config=learner_config,
+          dropout_seed=123,
+          center_bias=True)
+      session.run(grow_op)
+
+      # Expect the split from handler 2 to be chosen despite the negative gain.
+      # The grown tree should not be finalized as max tree depth is 2 so no
+      # pruning occurs.
+      new_stamp, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      tree_ensemble_config.ParseFromString(serialized)
+      stats = session.run(
+          training_ops.tree_ensemble_stats(tree_ensemble_handle, stamp_token=1))
+      expected_result = """
+        trees {
+          nodes {
+            dense_float_binary_split {
+              threshold: 0.33
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: -0.62
+              original_leaf {
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 0.01
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 0.0143
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 1
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertEqual(stats.num_trees, 0)
+      self.assertEqual(stats.num_layers, 1)
+      self.assertEqual(stats.active_tree, 1)
+      self.assertEqual(stats.active_layer, 1)
+      self.assertEqual(stats.attempted_trees, 1)
+      self.assertEqual(stats.attempted_layers, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble_config)
+
+      # Prepare handler inputs.
+      # All handlers have negative gain.
+      handler1_partitions = np.array([1, 2], dtype=np.int32)
+      handler1_gains = np.array([-0.2, -0.5], dtype=np.float32)
+      handler1_split = [
+          _gen_categorical_split_info(3, 7, 0.07, 0.083),
+          _gen_categorical_split_info(3, 5, 0.041, 0.064)
+      ]
+
+      # Grow tree ensemble.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=1,
+          next_stamp_token=2,
+          learning_rate=0.1,
+          partition_ids=[handler1_partitions],
+          gains=[handler1_gains],
+          splits=[handler1_split],
+          learner_config=learner_config,
+          dropout_seed=123,
+          center_bias=True)
+      session.run(grow_op)
+
+      # Expect the ensemble to be empty as post-pruning will prune
+      # the entire finalized tree.
+      new_stamp, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      stats = session.run(
+          training_ops.tree_ensemble_stats(tree_ensemble_handle, stamp_token=2))
+      tree_ensemble_config.ParseFromString(serialized)
+      self.assertEqual(new_stamp, 2)
+      self.assertEqual(stats.num_trees, 0)
+      self.assertEqual(stats.num_layers, 0)
+      self.assertEqual(stats.active_tree, 0)
+      self.assertEqual(stats.active_layer, 0)
+      self.assertEqual(stats.attempted_trees, 1)
+      self.assertEqual(stats.attempted_layers, 2)
+      self.assertProtoEquals("""
+      growing_metadata {
+        num_trees_attempted: 1
+        num_layers_attempted: 2
+      }
+      """, tree_ensemble_config)
+
+  def testGrowEnsemblePostPrunePartial(self):
+    """Test growing an ensemble with post-pruning."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = _gen_learner_config(
+          num_classes=2,
+          l1_reg=0,
+          l2_reg=0,
+          tree_complexity=0,
+          max_depth=2,
+          min_node_weight=0,
+          pruning_mode=learner_pb2.LearnerConfig.POST_PRUNE,
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
+
+      # Prepare handler inputs.
+      # Second handler has positive gain.
+      handler1_partitions = np.array([0], dtype=np.int32)
+      handler1_gains = np.array([-1.3], dtype=np.float32)
+      handler1_split = [_gen_categorical_split_info(0, 7, 0.013, 0.0143)]
+      handler2_partitions = np.array([0], dtype=np.int32)
+      handler2_gains = np.array([-0.2], dtype=np.float32)
+      handler2_split = [_gen_dense_split_info(0, 0.33, 0.01, 0.0143)]
+
+      # Grow tree ensemble.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=0,
+          next_stamp_token=1,
+          learning_rate=0.1,
+          partition_ids=[handler1_partitions, handler2_partitions],
+          gains=[handler1_gains, handler2_gains],
+          splits=[handler1_split, handler2_split],
+          learner_config=learner_config,
+          dropout_seed=123,
+          center_bias=True)
+      session.run(grow_op)
+
+      # Expect the split from handler 2 to be chosen despite the negative gain.
+      new_stamp, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      stats = session.run(
+          training_ops.tree_ensemble_stats(tree_ensemble_handle, stamp_token=1))
+      tree_ensemble_config.ParseFromString(serialized)
+      expected_result = """
+        trees {
+          nodes {
+            dense_float_binary_split {
+              threshold: 0.33
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: -0.2
+              original_leaf {
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 0.01
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 0.0143
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 1
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertEqual(stats.num_trees, 0)
+      self.assertEqual(stats.num_layers, 1)
+      self.assertEqual(stats.active_tree, 1)
+      self.assertEqual(stats.active_layer, 1)
+      self.assertEqual(stats.attempted_trees, 1)
+      self.assertEqual(stats.attempted_layers, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble_config)
+
+      # Prepare handler inputs for second layer.
+      # Note that partition 1 gain is negative and partition 2 gain is positive.
+      handler1_partitions = np.array([1, 2], dtype=np.int32)
+      handler1_gains = np.array([-0.2, 0.5], dtype=np.float32)
+      handler1_split = [
+          _gen_categorical_split_info(3, 7, 0.07, 0.083),
+          _gen_categorical_split_info(3, 5, 0.041, 0.064)
+      ]
+
+      # Grow tree ensemble.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=1,
+          next_stamp_token=2,
+          learning_rate=0.1,
+          partition_ids=[handler1_partitions],
+          gains=[handler1_gains],
+          splits=[handler1_split],
+          learner_config=learner_config,
+          dropout_seed=123,
+          center_bias=True)
+      session.run(grow_op)
+
+      # Expect the negative gain split of partition 1 to be pruned and the
+      # positive gain split of partition 2 to be retained.
+      new_stamp, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      stats = session.run(
+          training_ops.tree_ensemble_stats(tree_ensemble_handle, stamp_token=2))
+      tree_ensemble_config.ParseFromString(serialized)
+      expected_result = """
+        trees {
+          nodes {
+            dense_float_binary_split {
+              threshold: 0.33
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: -0.2
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 0.01
+              }
+            }
+          }
+          nodes {
+            categorical_id_binary_split {
+              feature_column: 3
+              feature_id: 5
+              left_id: 3
+              right_id: 4
+            }
+            node_metadata {
+              gain: 0.5
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 0.041
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 0.064
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 2
+          is_finalized: true
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """
+      self.assertEqual(new_stamp, 2)
+      self.assertEqual(stats.num_trees, 1)
+      self.assertEqual(stats.num_layers, 2)
+      self.assertEqual(stats.active_tree, 1)
+      self.assertEqual(stats.active_layer, 2)
+      self.assertEqual(stats.attempted_trees, 1)
+      self.assertEqual(stats.attempted_layers, 2)
+      self.assertProtoEquals(expected_result, tree_ensemble_config)
+
+  def testGrowEnsembleTreeLayerByLayer(self):
+    """Test growing an existing ensemble with the last tree not finalized."""
+    with self.test_session() as session:
+      # Create existing ensemble with one root split
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      text_format.Merge("""
+        trees {
+          nodes {
+            categorical_id_binary_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 7.143
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: -4.375
+              }
+            }
+          }
+        }
+        tree_weights: 0.10000000149
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 1
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = _gen_learner_config(
+          num_classes=2,
+          l1_reg=0,
+          l2_reg=0,
+          tree_complexity=0,
+          max_depth=3,
+          min_node_weight=0,
+          pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
+          growing_mode=learner_pb2.LearnerConfig.LAYER_BY_LAYER,
+          # Dropout will have no effect, since the tree will not be fully grown.
+          dropout_probability=1.0)
+
+      # Prepare handler inputs.
+      # Handler 1 only has a candidate for partition 1, handler 2 has candidates
+      # for both partitions and handler 3 only has a candidate for partition 2.
+      handler1_partitions = np.array([1], dtype=np.int32)
+      handler1_gains = np.array([1.4], dtype=np.float32)
+      handler1_split = [_gen_dense_split_info(0, 0.21, -6.0, 1.65)]
+      handler2_partitions = np.array([1, 2], dtype=np.int32)
+      handler2_gains = np.array([0.63, 2.7], dtype=np.float32)
+      handler2_split = [
+          _gen_dense_split_info(0, 0.23, -0.6, 0.24),
+          _gen_categorical_split_info(1, 7, -1.5, 2.3)
+      ]
+      handler3_partitions = np.array([2], dtype=np.int32)
+      handler3_gains = np.array([1.7], dtype=np.float32)
+      handler3_split = [_gen_categorical_split_info(0, 3, -0.75, 1.93)]
+
+      # Grow tree ensemble layer by layer.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=0,
+          next_stamp_token=1,
+          learning_rate=0.1,
+          partition_ids=[
+              handler1_partitions, handler2_partitions, handler3_partitions
+          ],
+          gains=[handler1_gains, handler2_gains, handler3_gains],
+          splits=[handler1_split, handler2_split, handler3_split],
+          learner_config=learner_config,
+          dropout_seed=123,
+          center_bias=True)
+      session.run(grow_op)
+
+      # Expect the split for partition 1 to be chosen from handler 1 and
+      # the split for partition 2 to be chosen from handler 2.
+      # The grown tree should not be finalized as max tree depth is 3 and
+      # it's only grown 2 layers.
+      # The partition 1 split weights get added to original leaf weight 7.143.
+      # The partition 2 split weights get added to original leaf weight -4.375.
+      new_stamp, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      stats = session.run(
+          training_ops.tree_ensemble_stats(tree_ensemble_handle, stamp_token=1))
+      tree_ensemble_config.ParseFromString(serialized)
+      expected_result = """
+        trees {
+          nodes {
+            categorical_id_binary_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            dense_float_binary_split {
+              threshold: 0.21
+              left_id: 3
+              right_id: 4
+            }
+            node_metadata {
+              gain: 1.4
+            }
+          }
+          nodes {
+            categorical_id_binary_split {
+              feature_column: 1
+              feature_id: 7
+              left_id: 5
+              right_id: 6
+            }
+            node_metadata {
+              gain: 2.7
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 1.143
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 8.793
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: -5.875
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: -2.075
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 2
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertEqual(stats.num_trees, 0)
+      self.assertEqual(stats.num_layers, 2)
+      self.assertEqual(stats.active_tree, 1)
+      self.assertEqual(stats.active_layer, 2)
+      self.assertEqual(stats.attempted_trees, 1)
+      self.assertEqual(stats.attempted_layers, 2)
+      self.assertProtoEquals(expected_result, tree_ensemble_config)
+
+  def testGrowExistingEnsembleTreeFinalizedWithDropout(self):
+    """Test growing an existing ensemble with the last tree finalized."""
+    with self.test_session() as session:
+      # Create existing ensemble with one root split and one bias tree.
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      text_format.Merge("""
+        trees {
+          nodes {
+            leaf {
+              vector {
+                value: -0.32
+                value: 0.28
+              }
+            }
+          }
+        }
+        trees {
+          nodes {
+            categorical_id_binary_split {
+              feature_column: 3
+              feature_id: 7
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: 1.3
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 2.3
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: -0.9
+              }
+            }
+          }
+        }
+        tree_weights: 0.7
+        tree_weights: 1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        tree_metadata {
+          num_tree_weight_updates: 5
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 2
+        }
+      """, tree_ensemble_config)
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = _gen_learner_config(
+          num_classes=2,
+          l1_reg=0,
+          l2_reg=0,
+          tree_complexity=0,
+          max_depth=1,
+          min_node_weight=0,
+          pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE,
+          dropout_probability=1.0)
+
+      # Prepare handler inputs.
+      handler1_partitions = np.array([0], dtype=np.int32)
+      handler1_gains = np.array([7.62], dtype=np.float32)
+      handler1_split = [_gen_dense_split_info(5, 0.52, -4.375, 7.143)]
+      handler2_partitions = np.array([0], dtype=np.int32)
+      handler2_gains = np.array([0.63], dtype=np.float32)
+      handler2_split = [_gen_dense_split_info(2, 0.23, -0.6, 0.24)]
+      handler3_partitions = np.array([0], dtype=np.int32)
+      handler3_gains = np.array([7.62], dtype=np.float32)
+      handler3_split = [_gen_categorical_split_info(8, 7, -4.375, 7.143)]
+
+      # Grow tree ensemble.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=0,
+          next_stamp_token=1,
+          learning_rate=1,
+          partition_ids=[
+              handler1_partitions, handler2_partitions, handler3_partitions
+          ],
+          gains=[handler1_gains, handler2_gains, handler3_gains],
+          splits=[handler1_split, handler2_split, handler3_split],
+          learner_config=learner_config,
+          dropout_seed=123,
+          center_bias=True)
+      session.run(grow_op)
+
+      # Expect a new tree to be added with the split from handler 1.
+      _, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      tree_ensemble_config.ParseFromString(serialized)
+
+      self.assertEqual(3, len(tree_ensemble_config.trees))
+      # Both trees got 0.5 as weights, bias tree is untouched.
+      self.assertAllClose([0.7, 0.5, 0.5], tree_ensemble_config.tree_weights)
+
+      self.assertEqual(
+          1, tree_ensemble_config.tree_metadata[0].num_tree_weight_updates)
+      self.assertEqual(
+          6, tree_ensemble_config.tree_metadata[1].num_tree_weight_updates)
+      self.assertEqual(
+          2, tree_ensemble_config.tree_metadata[2].num_tree_weight_updates)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..23168bf4935e92bcb5072348361ae04861641b6d
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py
@@ -0,0 +1,135 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility for batching remote OPs together to reduce RPC overhead."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import collections
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+
+
+class ScheduledOp(object):
+  """Represents a scheduled remote operation."""
+
+  __metaclass__ = abc.ABCMeta
+
+  @abc.abstractmethod
+  def batching_key(self):
+    """Returns the key for batching operations."""
+
+  @abc.abstractmethod
+  def batch_runner_fn(self):
+    """Returns the function that executes the operation on the batch."""
+
+
+class ScheduledStampedResourceOp(ScheduledOp):
+  """Wrapper class for batched operations on stamped resources."""
+
+  def __init__(self, resource_handle, op, **kwargs):
+    self.resource_handle = resource_handle
+    self.op = op
+    self.args = kwargs
+
+  def batching_key(self):
+    # We want to group the same operations on the same device and run them in
+    # one batch. So we use (device, operation) as the key.
+    return self.resource_handle.device, self.op
+
+  def batch_runner_fn(self):
+    return _scheduled_stamp_resource_op_runner
+
+
+def _move_tensors(tensors, device):
+  """Moves a list of tensors to a device by concatenating/splitting them."""
+  # Reset the device setting to avoid weird interactions with device merging
+  # logic.
+  with ops.device(None):
+    if all(tensor.shape == tensor_shape.scalar() for tensor in tensors):
+      with ops.device(tensors[0].device):
+        values = array_ops.stack(tensors)
+      with ops.device(device):
+        return array_ops.unstack(values)
+    else:
+      with ops.device(tensors[0].device):
+        sizes = array_ops.stack(
+            [array_ops.shape(tensor)[0] for tensor in tensors])
+        values = array_ops.concat(tensors, axis=0)
+      with ops.device(device):
+        sizes = array_ops.unstack(sizes)
+        return list(array_ops.split(values, sizes, axis=0))
+
+
+def _scheduled_stamp_resource_op_runner(batch, stamp):
+  """Runs a batch operation on a stamped resource."""
+  if not batch:
+    return
+  arg_keys = set(batch[0].args.keys())
+  grouped_args = collections.defaultdict(list)
+  resource_handles = []
+  # Check that the set of arguments is the same across all the scheduled ops.
+  for op in batch:
+    if set(op.args.keys()) != arg_keys:
+      raise ValueError("Mismatching arguments: %s, %s.", op.args, arg_keys)
+    for key in arg_keys:
+      grouped_args[key].append(op.args[key])
+    resource_handles.append(op.resource_handle)
+  # Move all the inputs to the op device in one RPC.
+  grouped_args = {
+      k: _move_tensors(v, resource_handles[0].device)
+      for k, v in grouped_args.items()
+  }
+  with ops.device(resource_handles[0].device):
+    return batch[0].op(resource_handles, stamp, **grouped_args)
+
+
+def run_handler_scheduled_ops(per_handler_ops, stamp, worker_device):
+  """Given a dictionary of ops for each handler, runs them in batch."""
+  batched_ops = collections.defaultdict(list)
+  # Group the ops by their batching_key. Ops that share the same batching key
+  # can be executed together.
+  for handler in per_handler_ops.keys():
+    for op in per_handler_ops[handler]:
+      batched_ops[(op.batching_key(), op.batch_runner_fn())].append(op)
+  op_results = {}
+  for batch in batched_ops.values():
+    # Run each of the batched ops using its runner.
+    results = batch[0].batch_runner_fn()(batch, stamp)
+    # If the result is a tuple, move each entry in the tuple in one RPC.
+    if isinstance(results, tuple):
+      results = tuple(
+          _move_tensors(result, worker_device) for result in results)
+      # Once all the results are on the worker, create individual tuple for
+      # each scheduled op request.
+      for i in range(len(batch)):
+        op_results[batch[i]] = tuple(result[i] for result in results)
+    # If the result is a tuple, it didn't have any outputs, so use the
+    # `ops.Operation` as the result for all the scheduled ops.
+    elif isinstance(results, ops.Operation):
+      for i in range(len(batch)):
+        op_results[batch[i]] = results
+    else:
+      raise ValueError("Unknown type of result %s.", results)
+  handler_results = collections.defaultdict(list)
+  # Dispatch the results of the ScheduledOps to the handlers that requested
+  # them.
+  for handler in per_handler_ops.keys():
+    for op in per_handler_ops[handler]:
+      handler_results[handler].append(op_results[op])
+  return handler_results
diff --git a/tensorflow/contrib/boosted_trees/python/ops/ensemble_optimizer_ops.py b/tensorflow/contrib/boosted_trees/python/ops/ensemble_optimizer_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..06eb6d59aeb889b39ada0d20877a8e81cbc1cce7
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/ops/ensemble_optimizer_ops.py
@@ -0,0 +1,32 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Split handler custom ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import
+from tensorflow.contrib.boosted_trees.python.ops.gen_ensemble_optimizer_ops import *
+# pylint: enable=wildcard-import
+from tensorflow.contrib.util import loader
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import resource_loader
+
+# Conditionally load ops, they might already be statically linked in.
+try:
+  _ensemble_optimizer_ops = loader.load_op_library(
+      resource_loader.get_path_to_datafile('_ensemble_optimizer_ops.so'))
+except (errors.NotFoundError, IOError):
+  print('Error loading _ensemble_optimizer_ops.so')
diff --git a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..813fb181f334b29d83a684091b50a1549b2e32e2
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
@@ -0,0 +1,119 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model ops python wrappers."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.boosted_trees.python.ops import gen_model_ops
+from tensorflow.contrib.boosted_trees.python.ops.gen_model_ops import tree_ensemble_deserialize
+from tensorflow.contrib.boosted_trees.python.ops.gen_model_ops import tree_ensemble_serialize
+# pylint: disable=unused-import
+from tensorflow.contrib.boosted_trees.python.ops.gen_model_ops import tree_ensemble_stamp_token
+# pylint: enable=unused-import
+
+from tensorflow.contrib.util import loader
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.training import saver
+
+ops.NotDifferentiable("TreeEnsembleVariable")
+ops.NotDifferentiable("TreeEnsembleSerialize")
+ops.NotDifferentiable("TreeEnsembleDeserialize")
+
+
+class TreeEnsembleVariableSavable(saver.BaseSaverBuilder.SaveableObject):
+  """SaveableObject implementation for TreeEnsembleVariable."""
+
+  def __init__(self, tree_ensemble_handle, create_op, name):
+    """Creates a TreeEnsembleVariableSavable object.
+
+    Args:
+      tree_ensemble_handle: handle to the tree ensemble variable.
+      create_op: the op to initialize the variable.
+      name: the name to save the tree ensemble variable under.
+    """
+    stamp_token, ensemble_config = tree_ensemble_serialize(tree_ensemble_handle)
+    # slice_spec is useful for saving a slice from a variable.
+    # It's not meaningful the tree ensemble variable. So we just pass an empty
+    # value.
+    slice_spec = ""
+    specs = [
+        saver.BaseSaverBuilder.SaveSpec(stamp_token, slice_spec,
+                                        name + "_stamp"),
+        saver.BaseSaverBuilder.SaveSpec(ensemble_config, slice_spec,
+                                        name + "_config"),
+    ]
+    super(TreeEnsembleVariableSavable,
+          self).__init__(tree_ensemble_handle, specs, name)
+    self._tree_ensemble_handle = tree_ensemble_handle
+    self._create_op = create_op
+
+  def restore(self, restored_tensors, unused_restored_shapes):
+    """Restores the associated tree ensemble from 'restored_tensors'.
+
+    Args:
+      restored_tensors: the tensors that were loaded from a checkpoint.
+      unused_restored_shapes: the shapes this object should conform to after
+        restore. Not meaningful for trees.
+
+    Returns:
+      The operation that restores the state of the tree ensemble variable.
+    """
+    with ops.control_dependencies([self._create_op]):
+      return tree_ensemble_deserialize(
+          self._tree_ensemble_handle,
+          stamp_token=restored_tensors[0],
+          tree_ensemble_config=restored_tensors[1])
+
+
+def tree_ensemble_variable(stamp_token,
+                           tree_ensemble_config,
+                           name,
+                           container=None):
+  r"""Creates a tree ensemble model and returns a handle to it.
+
+  Args:
+    stamp_token: The initial stamp token value for the ensemble resource.
+    tree_ensemble_config: A `Tensor` of type `string`.
+      Serialized proto of the tree ensemble.
+    name: A name for the ensemble variable.
+    container: An optional `string`. Defaults to `""`.
+
+  Returns:
+    A `Tensor` of type mutable `string`. The handle to the tree ensemble.
+  """
+  with ops.name_scope(name, "TreeEnsembleVariable") as name:
+    resource_handle = gen_model_ops.decision_tree_ensemble_resource_handle_op(
+        container, shared_name=name, name=name)
+    create_op = gen_model_ops.create_tree_ensemble_variable(
+        resource_handle, stamp_token, tree_ensemble_config)
+    is_initialized_op = gen_model_ops.tree_ensemble_is_initialized_op(
+        resource_handle)
+    # Adds the variable to the savable list.
+    saveable = TreeEnsembleVariableSavable(resource_handle, create_op,
+                                           resource_handle.name)
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+    resources.register_resource(resource_handle, create_op, is_initialized_op)
+    return resource_handle
+
+# Conditionally load ops, they might already be statically linked in.
+try:
+  _model_ops = loader.load_op_library(
+      resource_loader.get_path_to_datafile("_model_ops.so"))
+except (errors.NotFoundError, IOError):
+  print("Error loading _model_ops.so")
diff --git a/tensorflow/contrib/boosted_trees/python/ops/prediction_ops.py b/tensorflow/contrib/boosted_trees/python/ops/prediction_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf9ea371566fde9c7c0dfb817f7a87565cd6f158
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/ops/prediction_ops.py
@@ -0,0 +1,32 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Split handler custom ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import
+from tensorflow.contrib.boosted_trees.python.ops.gen_prediction_ops import *
+# pylint: enable=wildcard-import
+from tensorflow.contrib.util import loader
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import resource_loader
+
+# Conditionally load ops, they might already be statically linked in.
+try:
+  _prediction_ops = loader.load_op_library(
+      resource_loader.get_path_to_datafile('_prediction_ops.so'))
+except (errors.NotFoundError, IOError):
+  print('Error loading _prediction_ops.so')
diff --git a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..fba6720a4cdd810209ccd8a0b673c99242146d79
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
@@ -0,0 +1,196 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Quantile ops python wrappers."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import re
+
+from tensorflow.contrib.boosted_trees.python.ops import batch_ops_utils
+from tensorflow.contrib.boosted_trees.python.ops import gen_quantile_ops
+
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import,undefined-variable
+from tensorflow.contrib.boosted_trees.python.ops.gen_quantile_ops import *
+# pylint: enable=wildcard-import,undefined-variable
+
+from tensorflow.contrib.util import loader
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.training import saver
+
+# Pattern to remove all non alpha numeric from a string.
+_PATTERN = re.compile(r"[\W_]+")
+
+
+class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
+  """A resource that allows distributed quantile computation."""
+
+  def __init__(self,
+               init_stamp_token,
+               epsilon,
+               num_quantiles,
+               name=None,
+               container=None):
+    """Creates a QuantileAccumulator object.
+
+    Args:
+      init_stamp_token: The initial value for the stamp token.
+      epsilon: Error bound on the quantile computation.
+      num_quantiles: Number of quantiles to produce from the final summary.
+      name: the name to save the accumulator under.
+      container: An optional `string`. Defaults to `""`
+    """
+    self._epsilon = epsilon
+
+    name = _PATTERN.sub("", name)
+    with ops.name_scope(name, "QuantileAccumulator") as name:
+      self._quantile_accumulator_handle = (
+          gen_quantile_ops.quantile_stream_resource_handle_op(
+              container=container, shared_name=name, name=name))
+      self._create_op = gen_quantile_ops.create_quantile_accumulator(
+          self._quantile_accumulator_handle,
+          init_stamp_token,
+          epsilon=epsilon,
+          num_quantiles=num_quantiles)
+      is_initialized_op = gen_quantile_ops.quantile_accumulator_is_initialized(
+          self._quantile_accumulator_handle)
+    resources.register_resource(self._quantile_accumulator_handle,
+                                self._create_op, is_initialized_op)
+    self._make_savable(name)
+
+  def _make_savable(self, name):
+    stamp_token, state, are_buckets_ready, buckets = (
+        gen_quantile_ops.quantile_accumulator_serialize(
+            self._quantile_accumulator_handle))
+    # slice_spec is useful for saving a slice from a variable.
+    # It's not meaningful in quantile accumulator.
+    slice_spec = ""
+    def make_save_spec(tensor, suffix):
+      return saver.BaseSaverBuilder.SaveSpec(tensor, slice_spec, name + suffix)
+
+    specs = [make_save_spec(stamp_token, "_stamp")]
+    specs += [make_save_spec(state, "_state")]
+    specs += [make_save_spec(are_buckets_ready, "_are_buckets_ready")]
+    specs += [make_save_spec(buckets, "buckets")]
+    super(QuantileAccumulator,
+          self).__init__(self._quantile_accumulator_handle, specs, name)
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self)
+
+  def restore(self, restored_tensors, unused_restored_shapes):
+    """Restores the associated quantile accumulator from 'restored_tensors'.
+
+    Args:
+      restored_tensors: the tensors that were loaded from a checkpoint.
+      unused_restored_shapes: the shapes this object should conform to after
+        restore.
+
+    Returns:
+      The operation that restores the state of the quantile accumulator.
+    """
+    # Read the restored tensors with the same order that were added to saving
+    # spec.
+    stamp_token = restored_tensors[:1]
+    state = restored_tensors[1:2]
+    are_buckets_ready = restored_tensors[2:3]
+    buckets = restored_tensors[3]
+    with ops.control_dependencies([self._create_op]):
+      return gen_quantile_ops.quantile_accumulator_deserialize(
+          self._quantile_accumulator_handle,
+          stamp_token=stamp_token,
+          stream_state=state,
+          are_buckets_ready=are_buckets_ready,
+          buckets=buckets)
+
+  def get_buckets(self, stamp_token):
+    """Returns quantile buckets created during previous flush."""
+    are_buckets_ready, buckets = (
+        gen_quantile_ops.quantile_accumulator_get_buckets(
+            quantile_accumulator_handles=[self._quantile_accumulator_handle],
+            stamp_token=stamp_token))
+    return are_buckets_ready[0], buckets[0]
+
+  def schedule_get_buckets(self):
+    """Returns a scheduled read of buckets created during previous flush."""
+    return batch_ops_utils.ScheduledStampedResourceOp(
+        resource_handle=self._quantile_accumulator_handle,
+        op=gen_quantile_ops.quantile_accumulator_get_buckets)
+
+  def _make_summary(self, column, example_weights):
+    if isinstance(column, sparse_tensor.SparseTensor):
+      return gen_quantile_ops.make_quantile_summaries(
+          dense_float_features=[],
+          sparse_float_feature_indices=[column.indices],
+          sparse_float_feature_values=[column.values],
+          sparse_float_feature_shapes=[column.dense_shape],
+          example_weights=example_weights,
+          epsilon=self._epsilon / 2).sparse_summaries[0]
+    else:
+      return gen_quantile_ops.make_quantile_summaries(
+          dense_float_features=[column],
+          sparse_float_feature_indices=[],
+          sparse_float_feature_values=[],
+          sparse_float_feature_shapes=[],
+          example_weights=example_weights,
+          epsilon=self._epsilon / 2).dense_summaries[0]
+
+  def add_summary(self, stamp_token, column, example_weights):
+    """Adds quantile summary to its stream in resource."""
+    summary = self._make_summary(column, example_weights)
+    return gen_quantile_ops.quantile_accumulator_add_summaries(
+        quantile_accumulator_handles=[self._quantile_accumulator_handle],
+        stamp_token=stamp_token,
+        summaries=[summary])
+
+  def add_prebuilt_summary(self, stamp_token, summary):
+    """Adds quantile summary to its stream in resource."""
+    return gen_quantile_ops.quantile_accumulator_add_summaries(
+        quantile_accumulator_handles=[self._quantile_accumulator_handle],
+        stamp_token=stamp_token,
+        summaries=[summary])
+
+  def schedule_add_summary(self, stamp_token, column, example_weights):
+    """Schedules to add a quantile summary to its stream in resource."""
+    summary = self._make_summary(column, example_weights)
+    return batch_ops_utils.ScheduledStampedResourceOp(
+        op=gen_quantile_ops.quantile_accumulator_add_summaries,
+        resource_handle=self._quantile_accumulator_handle,
+        summaries=summary)
+
+  def flush(self, stamp_token, next_stamp_token):
+    """Finalizes quantile summary stream and resets it for next iteration."""
+    return gen_quantile_ops.quantile_accumulator_flush(
+        quantile_accumulator_handle=self._quantile_accumulator_handle,
+        stamp_token=stamp_token,
+        next_stamp_token=next_stamp_token)
+
+  def flush_summary(self, stamp_token, next_stamp_token):
+    """Finalizes quantile summary stream and resets it for next iteration."""
+    result = gen_quantile_ops.quantile_accumulator_flush_summary(
+        quantile_accumulator_handle=self._quantile_accumulator_handle,
+        stamp_token=stamp_token,
+        next_stamp_token=next_stamp_token)
+    return result
+
+
+# Conditionally load ops, they might already be statically linked in.
+try:
+  _quantile_ops = loader.load_op_library(
+      resource_loader.get_path_to_datafile("_quantile_ops.so"))
+except (errors.NotFoundError, IOError):
+  print("Error loading _quantile_ops.so")
diff --git a/tensorflow/contrib/boosted_trees/python/ops/split_handler_ops.py b/tensorflow/contrib/boosted_trees/python/ops/split_handler_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..903f7d99c95ac291128dd2fea50890daf3fb4b7b
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/ops/split_handler_ops.py
@@ -0,0 +1,32 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Split handler custom ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import
+from tensorflow.contrib.boosted_trees.python.ops.gen_split_handler_ops import *
+# pylint: enable=wildcard-import
+from tensorflow.contrib.util import loader
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import resource_loader
+
+# Conditionally load ops, they might already be statically linked in.
+try:
+  _split_handler_ops = loader.load_op_library(
+      resource_loader.get_path_to_datafile('_split_handler_ops.so'))
+except (errors.NotFoundError, IOError):
+  print('Error loading _split_handler_ops.so')
diff --git a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c24f8b51739e5aeaac94757140cb42ee00f96618
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py
@@ -0,0 +1,215 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Stats Accumulator ops python wrappers."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import re
+from tensorflow.contrib.boosted_trees.python.ops import batch_ops_utils
+from tensorflow.contrib.boosted_trees.python.ops import gen_stats_accumulator_ops
+from tensorflow.contrib.util import loader
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.training import saver
+
+# Pattern to remove all non alpha numeric from a string.
+_PATTERN = re.compile(r"[\W_]+")
+
+
+class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject):
+  """A resource that allows to accumulate gradients and hessians.
+
+  For consistency guarantees, we use read and write stamp tokens.
+  The stamp token on the resource is updated with StatsAccumulator.flush.
+  Calls to StatsAccumulator.add that don't provide the current stamp token are
+  ignored.
+  """
+
+  def __init__(self,
+               stamp_token,
+               gradient_shape,
+               hessian_shape,
+               name=None,
+               container=None):
+    """Creates a stats accumulator and returns a handle to it.
+
+    Args:
+      stamp_token: An int64, initial value to use for the stamp token.
+      gradient_shape: A TensorShape, containing shape of gradients.
+      hessian_shape: A TensorShape, containing shape of hessians.
+      name: A name for the stats accumulator variable.
+      container: An optional `string`. Defaults to `""`.
+
+    Returns:
+      A `Tensor` of type mutable `string`. The handle to the stats accumulator.
+    """
+    if name is not None:
+      name = _PATTERN.sub("", name)
+    with ops.name_scope(name, "StatsAccumulator") as name:
+      # Both values are scalars.
+      if (gradient_shape == tensor_shape.scalar() and
+          hessian_shape == tensor_shape.scalar()):
+        self._is_scalar = True
+        self._resource_handle = (gen_stats_accumulator_ops.
+                                 stats_accumulator_scalar_resource_handle_op(
+                                     container, name, name=name))
+
+        create_op = gen_stats_accumulator_ops.create_stats_accumulator_scalar(
+            self._resource_handle, stamp_token)
+        is_initialized_op = (
+            gen_stats_accumulator_ops.stats_accumulator_scalar_is_initialized(
+                self._resource_handle))
+      else:
+        self._is_scalar = False
+        self._resource_handle = (gen_stats_accumulator_ops.
+                                 stats_accumulator_tensor_resource_handle_op(
+                                     container, name, name=name))
+        create_op = gen_stats_accumulator_ops.create_stats_accumulator_tensor(
+            self._resource_handle, stamp_token, gradient_shape.as_list(),
+            hessian_shape.as_list())
+        is_initialized_op = (
+            gen_stats_accumulator_ops.stats_accumulator_tensor_is_initialized(
+                self._resource_handle))
+
+    self._create_op = create_op
+    slice_spec = ""
+    saver_name = self._resource_handle.name
+    (stamp_token, num_updates, partition_ids, feature_ids, gradients,
+     hessians) = self.serialize()
+    specs = [
+        saver.BaseSaverBuilder.SaveSpec(stamp_token, slice_spec,
+                                        saver_name + "_stamp"),
+        saver.BaseSaverBuilder.SaveSpec(num_updates, slice_spec,
+                                        saver_name + "_num_updates"),
+        saver.BaseSaverBuilder.SaveSpec(partition_ids, slice_spec,
+                                        saver_name + "_partition_ids"),
+        saver.BaseSaverBuilder.SaveSpec(feature_ids, slice_spec,
+                                        saver_name + "_feature_ids"),
+        saver.BaseSaverBuilder.SaveSpec(gradients, slice_spec,
+                                        saver_name + "_gradients"),
+        saver.BaseSaverBuilder.SaveSpec(hessians, slice_spec,
+                                        saver_name + "hessians"),
+    ]
+
+    super(StatsAccumulator, self).__init__(self._resource_handle, specs, name)
+    resources.register_resource(self._resource_handle, create_op,
+                                is_initialized_op)
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self)
+
+  def add(self, stamp_token, partition_ids, feature_ids, gradients, hessians):
+    """Updates the stats accumulator."""
+    partition_ids, feature_ids, gradients, hessians = (self._make_summary(
+        partition_ids, feature_ids, gradients, hessians))
+    if self._is_scalar:
+      return gen_stats_accumulator_ops.stats_accumulator_scalar_add(
+          [self._resource_handle], stamp_token, [partition_ids], [feature_ids],
+          [gradients], [hessians])
+    else:
+      return gen_stats_accumulator_ops.stats_accumulator_tensor_add(
+          [self._resource_handle], stamp_token, [partition_ids], [feature_ids],
+          [gradients], [hessians])
+
+  def schedule_add(self, partition_ids, feature_ids, gradients, hessians):
+    """Schedules an update to the stats accumulator."""
+    partition_ids, feature_ids, gradients, hessians = (self._make_summary(
+        partition_ids, feature_ids, gradients, hessians))
+    if self._is_scalar:
+      return batch_ops_utils.ScheduledStampedResourceOp(
+          op=gen_stats_accumulator_ops.stats_accumulator_scalar_add,
+          resource_handle=self._resource_handle,
+          partition_ids=partition_ids,
+          feature_ids=feature_ids,
+          gradients=gradients,
+          hessians=hessians)
+    else:
+      return batch_ops_utils.ScheduledStampedResourceOp(
+          op=gen_stats_accumulator_ops.stats_accumulator_tensor_add,
+          resource_handle=self._resource_handle,
+          partition_ids=partition_ids,
+          feature_ids=feature_ids,
+          gradients=gradients,
+          hessians=hessians)
+
+  def _make_summary(self, partition_ids, feature_ids, gradients, hessians):
+    if self._is_scalar:
+      return gen_stats_accumulator_ops.stats_accumulator_scalar_make_summary(
+          partition_ids, feature_ids, gradients, hessians)
+    else:
+      return gen_stats_accumulator_ops.stats_accumulator_tensor_make_summary(
+          partition_ids, feature_ids, gradients, hessians)
+
+  def deserialize(self, stamp_token, num_updates, partition_ids, feature_ids,
+                  gradients, hessians):
+    """Resets the stats accumulator with the serialized state."""
+    if self._is_scalar:
+      return gen_stats_accumulator_ops.stats_accumulator_scalar_deserialize(
+          self._resource_handle, stamp_token, num_updates, partition_ids,
+          feature_ids, gradients, hessians)
+    else:
+      return gen_stats_accumulator_ops.stats_accumulator_tensor_deserialize(
+          self._resource_handle, stamp_token, num_updates, partition_ids,
+          feature_ids, gradients, hessians)
+
+  def flush(self, stamp_token, next_stamp_token):
+    """Flushes the stats accumulator."""
+    if self._is_scalar:
+      return gen_stats_accumulator_ops.stats_accumulator_scalar_flush(
+          self._resource_handle, stamp_token, next_stamp_token)
+    else:
+      return gen_stats_accumulator_ops.stats_accumulator_tensor_flush(
+          self._resource_handle, stamp_token, next_stamp_token)
+
+  def serialize(self):
+    """Serializes the stats accumulator state."""
+    if self._is_scalar:
+      return gen_stats_accumulator_ops.stats_accumulator_scalar_serialize(
+          self._resource_handle)
+    else:
+      return gen_stats_accumulator_ops.stats_accumulator_tensor_serialize(
+          self._resource_handle)
+
+  def restore(self, restored_tensors, unused_restored_shapes):
+    """Restores the associated tree ensemble from 'restored_tensors'.
+
+    Args:
+      restored_tensors: the tensors that were loaded from a checkpoint.
+      unused_restored_shapes: the shapes this object should conform to after
+        restore. Not meaningful for trees.
+
+    Returns:
+      The operation that restores the state of the tree ensemble variable.
+    """
+    with ops.control_dependencies([self._create_op]):
+      return self.deserialize(
+          stamp_token=restored_tensors[0],
+          num_updates=restored_tensors[1],
+          partition_ids=restored_tensors[2],
+          feature_ids=restored_tensors[3],
+          gradients=restored_tensors[4],
+          hessians=restored_tensors[5])
+
+  def resource(self):
+    return self._resource_handle
+
+
+# Conditionally load ops, they might already be statically linked in.
+try:
+  _stats_accumulator_ops = loader.load_op_library(
+      resource_loader.get_path_to_datafile("_stats_accumulator_ops.so"))
+except (errors.NotFoundError, IOError):
+  print("Error loading _stats_accumulator_ops.so")
diff --git a/tensorflow/contrib/boosted_trees/python/ops/training_ops.py b/tensorflow/contrib/boosted_trees/python/ops/training_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..32925b3e83b7d6030e730f51c7f5465c755a9b16
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/ops/training_ops.py
@@ -0,0 +1,32 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Import and conditionally load custom ops for training boosted trees."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import
+from tensorflow.contrib.boosted_trees.python.ops.gen_training_ops import *
+# pylint: enable=wildcard-import
+from tensorflow.contrib.util import loader
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import resource_loader
+
+# Conditionally load ops, they might already be statically linked in.
+try:
+  _training_ops = loader.load_op_library(
+      resource_loader.get_path_to_datafile('_training_ops.so'))
+except (errors.NotFoundError, IOError):
+  print('Error loading _training_ops.so')
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..077571629198429ddb7d3ed1c43a780a7de228a0
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -0,0 +1,1012 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Training functions for Gradient boosted decision trees."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+from tensorflow.contrib import learn
+from tensorflow.contrib import stateless
+
+from tensorflow.contrib.boosted_trees.lib.learner.batch import categorical_split_handler
+from tensorflow.contrib.boosted_trees.lib.learner.batch import ordinal_split_handler
+from tensorflow.contrib.boosted_trees.proto import learner_pb2
+from tensorflow.contrib.boosted_trees.python.ops import batch_ops_utils
+from tensorflow.contrib.boosted_trees.python.ops import gen_model_ops
+from tensorflow.contrib.boosted_trees.python.ops import model_ops
+from tensorflow.contrib.boosted_trees.python.ops import prediction_ops
+from tensorflow.contrib.boosted_trees.python.ops import stats_accumulator_ops
+from tensorflow.contrib.boosted_trees.python.ops import training_ops
+from tensorflow.contrib.layers.python.layers import feature_column as feature_column_lib
+from tensorflow.contrib.layers.python.layers import feature_column_ops
+from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.summary import summary
+from tensorflow.python.training import device_setter
+
+# Key names for prediction dict.
+ENSEMBLE_STAMP = "ensemble_stamp"
+PREDICTIONS = "predictions"
+PARTITION_IDS = "partition_ids"
+NUM_LAYERS_ATTEMPTED = "num_layers"
+NUM_TREES_ATTEMPTED = "num_trees"
+PREDICTIONS_NO_DROPOUT = "predictions_no_dropout"
+_FEATURE_NAME_TEMPLATE = "%s_%d"
+
+
+def _get_column_by_index(tensor, indices):
+  """Returns columns from a 2-D tensor by index."""
+  shape = array_ops.shape(tensor)
+  p_flat = array_ops.reshape(tensor, [-1])
+  i_flat = array_ops.reshape(
+      array_ops.reshape(math_ops.range(0, shape[0]) * shape[1], [-1, 1]) +
+      indices, [-1])
+  return array_ops.reshape(array_ops.gather(p_flat, i_flat), [shape[0], -1])
+
+
+def _make_predictions_dict(stamp, logits, logits_no_dropout, partition_ids,
+                           ensemble_stats):
+  """Returns predictions for the given logits and n_classes.
+
+  Args:
+    stamp: The ensemble stamp.
+    logits: A rank 2 `Tensor` with shape [batch_size, n_classes - 1].
+    logits_no_dropout: A rank 2 `Tensor` with shape [batch_size, n_classes - 1]
+    that contains predictions when no dropout was applied.
+    partition_ids: A rank 1 `Tensor` with shape [batch_size].
+    ensemble_stats: A TreeEnsembleStatsOp result tuple.
+
+  Returns:
+    A dict of predictions.
+  """
+  result = {}
+  result[ENSEMBLE_STAMP] = stamp
+  result[PREDICTIONS] = logits
+  result[PREDICTIONS_NO_DROPOUT] = logits_no_dropout
+  result[PARTITION_IDS] = partition_ids
+
+  result[NUM_LAYERS_ATTEMPTED] = ensemble_stats.attempted_layers
+  result[NUM_TREES_ATTEMPTED] = ensemble_stats.attempted_trees
+  return result
+
+
+class _OpRoundRobinStrategy(object):
+  """Returns the next ps task index for placement via per-Op round-robin order.
+
+  This strategy works slightly better for the GBDT graph because of using
+  custom resources which vary significantly in compute cost.
+  """
+
+  def __init__(self, ps_ops, num_tasks):
+    """Create a new `_RoundRobinStrategy`.
+
+    Args:
+      ps_ops: List of Op types to place on PS.
+      num_tasks: Number of ps tasks to cycle among.
+    """
+    next_task = 0
+    self._next_task_per_op = {}
+    for op in ps_ops:
+      self._next_task_per_op[op] = next_task
+      next_task = (next_task + 1) % num_tasks if num_tasks else 0
+    self._num_tasks = num_tasks
+
+  def __call__(self, op):
+    """Choose a ps task index for the given `Operation`.
+
+    Args:
+      op: An `Operation` to be placed on ps.
+
+    Returns:
+      The next ps task index to use for the `Operation`. Returns the next
+      index, in the range `[offset, offset + num_tasks)`.
+
+    Raises:
+      ValueError: If attempting to place non-PS Op.
+    """
+    if op.type not in self._next_task_per_op:
+      raise ValueError("Unknown op type '%s' for placement:" % op.type)
+    task = self._next_task_per_op[op.type]
+    self._next_task_per_op[op.type] = ((task + 1) % self._num_tasks
+                                       if self._num_tasks else 0)
+    return task
+
+
+def extract_features(features, feature_columns):
+  """Extracts columns from a dictionary of features.
+
+  Args:
+    features: `Tensor` or `dict` of `Tensor` objects.
+    feature_columns: A list of feature_columns.
+
+  Returns:
+    Seven values:
+      - A list of all feature column names.
+      - A list of dense floats.
+      - A list of sparse float feature indices.
+      - A list of sparse float feature values.
+      - A list of sparse float feature shapes.
+      - A list of sparse int feature indices.
+      - A list of sparse int feature values.
+      - A list of sparse int feature shapes.
+  Raises:
+    ValueError: if features is not valid.
+  """
+  if not features:
+    raise ValueError("Features dictionary must be specified.")
+
+  if isinstance(features, ops.Tensor):
+    features = {features.name, features}
+
+  # Make a shallow copy of features to ensure downstream usage
+  # is unaffected by modifications in the model function.
+  features = copy.copy(features)
+  if feature_columns:
+    scope = "gbdt"
+    with variable_scope.variable_scope(scope):
+      feature_columns = list(feature_columns)
+      transformed_features = {}
+      for fc in feature_columns:
+        # pylint: disable=protected-access
+        if isinstance(fc, feature_column_lib._EmbeddingColumn):
+          # pylint: enable=protected-access
+          transformed_features[fc.name] = fc_core.input_layer(
+              features, [fc],
+              weight_collections=[scope])
+        else:
+          result = feature_column_ops.transform_features(features, [fc])
+          if len(result) > 1:
+            raise ValueError("Unexpected number of output features")
+          transformed_features[fc.name] = result[list(result.keys())[0]]
+    features = transformed_features
+
+  dense_float_names = []
+  dense_floats = []
+  sparse_float_names = []
+  sparse_float_indices = []
+  sparse_float_values = []
+  sparse_float_shapes = []
+  sparse_int_names = []
+  sparse_int_indices = []
+  sparse_int_values = []
+  sparse_int_shapes = []
+  for key in sorted(features.keys()):
+    tensor = features[key]
+    if isinstance(tensor, sparse_tensor.SparseTensor):
+      if tensor.values.dtype == dtypes.float32:
+        sparse_float_names.append(key)
+        sparse_float_indices.append(tensor.indices)
+        sparse_float_values.append(tensor.values)
+        sparse_float_shapes.append(tensor.dense_shape)
+      elif tensor.values.dtype == dtypes.int64:
+        sparse_int_names.append(key)
+        sparse_int_indices.append(tensor.indices)
+        sparse_int_values.append(tensor.values)
+        sparse_int_shapes.append(tensor.dense_shape)
+      else:
+        raise ValueError("Unsupported sparse feature %s with dtype %s." %
+                         (tensor.indices.name, tensor.dtype))
+    else:
+      if tensor.dtype == dtypes.float32:
+        if len(tensor.shape) > 1 and tensor.shape[1] > 1:
+          unstacked = array_ops.unstack(tensor, axis=1)
+          for i in xrange(len(unstacked)):
+            dense_float_names.append(_FEATURE_NAME_TEMPLATE % (key, i))
+            dense_floats.append(array_ops.reshape(unstacked[i], [-1, 1]))
+        else:
+          dense_float_names.append(key)
+          dense_floats.append(tensor)
+      else:
+        raise ValueError("Unsupported dense feature %s with dtype %s." %
+                         (tensor.name, tensor.dtype))
+  # Feature columns are logically organized into incrementing slots starting
+  # from dense floats, then sparse floats then sparse ints.
+  fc_names = (dense_float_names + sparse_float_names + sparse_int_names)
+  return (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
+          sparse_float_shapes, sparse_int_indices, sparse_int_values,
+          sparse_int_shapes)
+
+
+def _dropout_params(mode, ensemble_stats):
+  """Returns parameters relevant for dropout.
+
+  Args:
+    mode: Train/Eval/Infer
+    ensemble_stats: A TreeEnsembleStatsOp result tuple.
+
+  Returns:
+    Whether to apply dropout and a dropout seed.
+  """
+  if mode == learn.ModeKeys.TRAIN:
+    # Do dropout only during training.
+    apply_dropout = True
+    seed = ensemble_stats.attempted_trees
+  else:
+    seed = -1
+    apply_dropout = False
+  return apply_dropout, seed
+
+
+class GradientBoostedDecisionTreeModel(object):
+  """A GBDT model function."""
+
+  def __init__(self,
+               is_chief,
+               num_ps_replicas,
+               ensemble_handle,
+               center_bias,
+               examples_per_layer,
+               learner_config,
+               features,
+               feature_columns=None):
+    """Construct a new GradientBoostedDecisionTreeModel function.
+
+    Args:
+      is_chief: Whether to build the chief graph.
+      num_ps_replicas: Number of parameter server replicas, can be 0.
+      ensemble_handle: A handle to the ensemble variable.
+      center_bias: Whether to center the bias before growing trees.
+      examples_per_layer: Number of examples to accumulate before growing
+        a tree layer. It can also be a function that computes the number of
+        examples based on the depth of the layer that's being built.
+      learner_config: A learner config.
+          print split, sorted_feature_names[split.feature_column]
+      features: `Tensor` or `dict` of `Tensor` objects.
+      feature_columns: A list of feature columns.
+
+    Raises:
+      ValueError: if inputs are not valid.
+    """
+    if ensemble_handle is None:
+      raise ValueError("ensemble_handle must be specified.")
+
+    if learner_config is None:
+      raise ValueError("learner_config must be specified.")
+
+    if learner_config.num_classes < 2:
+      raise ValueError("Number of classes must be >=2")
+
+    self._is_chief = is_chief
+    self._num_ps_replicas = num_ps_replicas
+    self._ensemble_handle = ensemble_handle
+    self._center_bias = center_bias
+    self._examples_per_layer = examples_per_layer
+    self._learner_config = learner_config
+    self._feature_columns = feature_columns
+    self._learner_config_serialized = learner_config.SerializeToString()
+    self._attempted_trees = variables.Variable(
+        initial_value=array_ops.zeros([], dtypes.int64), trainable=False)
+    self._finalized_trees = variables.Variable(
+        initial_value=array_ops.zeros([], dtypes.int64), trainable=False)
+    if not features:
+      raise ValueError("Features dictionary must be specified.")
+    (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
+     sparse_float_shapes, sparse_int_indices, sparse_int_values,
+     sparse_int_shapes) = extract_features(features, self._feature_columns)
+    logging.info("Active Feature Columns: " + str(fc_names))
+    self._fc_names = fc_names
+    self._dense_floats = dense_floats
+    self._sparse_float_indices = sparse_float_indices
+    self._sparse_float_values = sparse_float_values
+    self._sparse_float_shapes = sparse_float_shapes
+    self._sparse_int_indices = sparse_int_indices
+    self._sparse_int_values = sparse_int_values
+    self._sparse_int_shapes = sparse_int_shapes
+    self._reduce_dim = (self._learner_config.multi_class_strategy ==
+                        learner_pb2.LearnerConfig.TREE_PER_CLASS and
+                        learner_config.num_classes == 2)
+
+  def predict(self, mode):
+    """Returns predictions given the features and mode.
+
+    Args:
+      mode: Mode the graph is running in (train|predict|eval).
+
+    Returns:
+      A dict of predictions tensors.
+
+    Raises:
+      ValueError: if features is not valid.
+    """
+    apply_averaging = mode != learn.ModeKeys.TRAIN
+
+    # Use the current ensemble to predict on the current batch of input.
+    # For faster prediction we check if the inputs are on the same device
+    # as the model. If not, we create a copy of the model on the worker.
+    input_deps = (self._dense_floats + self._sparse_float_indices +
+                  self._sparse_int_indices)
+    if not input_deps:
+      raise ValueError("No input tensors for prediction.")
+
+    if any(i.device != input_deps[0].device for i in input_deps):
+      raise ValueError("All input tensors should be on the same device.")
+
+    # Get most current model stamp.
+    ensemble_stamp = model_ops.tree_ensemble_stamp_token(self._ensemble_handle)
+
+    # Determine if ensemble is colocated with the inputs.
+    if self._ensemble_handle.device != input_deps[0].device:
+      # Create a local ensemble and get its local stamp.
+      with ops.name_scope("local_ensemble", "TreeEnsembleVariable") as name:
+        local_ensemble_handle = (
+            gen_model_ops.decision_tree_ensemble_resource_handle_op(name=name))
+        create_op = gen_model_ops.create_tree_ensemble_variable(
+            local_ensemble_handle, stamp_token=-1, tree_ensemble_config="")
+        with ops.control_dependencies([create_op]):
+          local_stamp = model_ops.tree_ensemble_stamp_token(
+              local_ensemble_handle)
+
+      # Determine whether the local ensemble is stale and update it if needed.
+      def _refresh_local_ensemble_fn():
+        # Serialize the model from parameter server after reading all inputs.
+        with ops.control_dependencies(input_deps):
+          (ensemble_stamp, serialized_model) = (
+              model_ops.tree_ensemble_serialize(self._ensemble_handle))
+
+        # Update local ensemble with the serialized model from parameter server.
+        with ops.control_dependencies([create_op]):
+          return model_ops.tree_ensemble_deserialize(
+              local_ensemble_handle,
+              stamp_token=ensemble_stamp,
+              tree_ensemble_config=serialized_model), ensemble_stamp
+
+      refresh_local_ensemble, ensemble_stamp = control_flow_ops.cond(
+          math_ops.not_equal(ensemble_stamp,
+                             local_stamp), _refresh_local_ensemble_fn,
+          lambda: (control_flow_ops.no_op(), ensemble_stamp))
+
+      # Once updated, Use the the local model for prediction.
+      with ops.control_dependencies([refresh_local_ensemble]):
+        ensemble_stats = training_ops.tree_ensemble_stats(
+            local_ensemble_handle, ensemble_stamp)
+        apply_dropout, seed = _dropout_params(mode, ensemble_stats)
+        # We don't need dropout info - we can always restore it based on the
+        # seed.
+        predictions, predictions_no_dropout, _ = (
+            prediction_ops.gradient_trees_prediction(
+                local_ensemble_handle,
+                seed,
+                self._dense_floats,
+                self._sparse_float_indices,
+                self._sparse_float_values,
+                self._sparse_float_shapes,
+                self._sparse_int_indices,
+                self._sparse_int_values,
+                self._sparse_int_shapes,
+                learner_config=self._learner_config_serialized,
+                apply_dropout=apply_dropout,
+                apply_averaging=apply_averaging,
+                use_locking=False,
+                center_bias=self._center_bias,
+                reduce_dim=self._reduce_dim))
+        partition_ids = prediction_ops.gradient_trees_partition_examples(
+            local_ensemble_handle,
+            self._dense_floats,
+            self._sparse_float_indices,
+            self._sparse_float_values,
+            self._sparse_float_shapes,
+            self._sparse_int_indices,
+            self._sparse_int_values,
+            self._sparse_int_shapes,
+            use_locking=False)
+
+    else:
+      with ops.device(self._ensemble_handle.device):
+        ensemble_stats = training_ops.tree_ensemble_stats(
+            self._ensemble_handle, ensemble_stamp)
+        apply_dropout, seed = _dropout_params(mode, ensemble_stats)
+        # We don't need dropout info - we can always restore it based on the
+        # seed.
+        predictions, predictions_no_dropout, _ = (
+            prediction_ops.gradient_trees_prediction(
+                self._ensemble_handle,
+                seed,
+                self._dense_floats,
+                self._sparse_float_indices,
+                self._sparse_float_values,
+                self._sparse_float_shapes,
+                self._sparse_int_indices,
+                self._sparse_int_values,
+                self._sparse_int_shapes,
+                learner_config=self._learner_config_serialized,
+                apply_dropout=apply_dropout,
+                apply_averaging=apply_averaging,
+                use_locking=False,
+                center_bias=self._center_bias,
+                reduce_dim=self._reduce_dim))
+        partition_ids = prediction_ops.gradient_trees_partition_examples(
+            self._ensemble_handle,
+            self._dense_floats,
+            self._sparse_float_indices,
+            self._sparse_float_values,
+            self._sparse_float_shapes,
+            self._sparse_int_indices,
+            self._sparse_int_values,
+            self._sparse_int_shapes,
+            use_locking=False)
+
+    return _make_predictions_dict(ensemble_stamp, predictions,
+                                  predictions_no_dropout, partition_ids,
+                                  ensemble_stats)
+
+  def train(self, loss, predictions_dict, labels):
+    """Grows a new tree and adds it to the ensemble.
+
+    Args:
+      loss: A scalar tensor representing average loss of examples.
+      predictions_dict: Dictionary of Rank 2 `Tensor` representing information
+          about predictions per example.
+      labels: Rank 2 `Tensor` representing labels per example.
+
+    Returns:
+      An op that adds a new tree to the ensemble.
+
+    Raises:
+      ValueError: if inputs are not valid.
+    """
+    # Get tensors relevant for training and form the loss.
+    predictions = predictions_dict[PREDICTIONS]
+    partition_ids = predictions_dict[PARTITION_IDS]
+    ensemble_stamp = predictions_dict[ENSEMBLE_STAMP]
+    gradients = gradients_impl.gradients(
+        loss,
+        predictions,
+        name="Gradients",
+        colocate_gradients_with_ops=False,
+        gate_gradients=0,
+        aggregation_method=None)[0]
+
+    strategy = self._learner_config.multi_class_strategy
+    num_classes = self._learner_config.num_classes
+
+    class_id = -1
+    # Handle different multiclass strategies.
+    if strategy == learner_pb2.LearnerConfig.TREE_PER_CLASS:
+      # We build one vs rest trees.
+      gradient_shape = tensor_shape.scalar()
+      hessian_shape = tensor_shape.scalar()
+
+      if num_classes == 2:
+        # We have only 1 score, gradients is of shape [batch, 1].
+        hessians = gradients_impl.gradients(
+            gradients,
+            predictions,
+            name="Hessian",
+            colocate_gradients_with_ops=False,
+            gate_gradients=0,
+            aggregation_method=None)[0]
+
+        squeezed_gradients = array_ops.squeeze(gradients, axis=[1])
+        squeezed_hessians = array_ops.squeeze(hessians, axis=[1])
+      else:
+        hessian_list = self._diagonal_hessian(gradients, predictions)
+        # Assemble hessian list into a tensor.
+        hessians = array_ops.stack(hessian_list, axis=1)
+
+        # Choose the class for which the tree is built (one vs rest).
+        class_id = predictions_dict[NUM_TREES_ATTEMPTED] % num_classes
+        class_id = math_ops.to_int32(class_id)
+
+        # Use class id tensor to get the column with that index from gradients
+        # and hessians.
+        squeezed_gradients = array_ops.squeeze(
+            _get_column_by_index(gradients, class_id))
+        squeezed_hessians = array_ops.squeeze(
+            _get_column_by_index(hessians, class_id))
+    else:
+      # Other multiclass strategies.
+      gradient_shape = tensor_shape.TensorShape([num_classes])
+
+      if strategy == learner_pb2.LearnerConfig.FULL_HESSIAN:
+        hessian_shape = tensor_shape.TensorShape(([num_classes, num_classes]))
+        hessian_list = self._full_hessian(gradients, predictions)
+      else:
+        # Diagonal hessian strategy.
+        hessian_shape = tensor_shape.TensorShape(([num_classes]))
+        hessian_list = self._diagonal_hessian(gradients, predictions)
+
+      squeezed_gradients = gradients
+      hessians = array_ops.stack(hessian_list, axis=1)
+      squeezed_hessians = hessians
+
+    # Get the weights for each example for quantiles calculation,
+    weights = self._get_weights(hessian_shape, squeezed_hessians)
+
+    regularization_config = self._learner_config.regularization
+    min_node_weight = self._learner_config.constraints.min_node_weight
+    # Create all handlers ensuring resources are evenly allocated across PS.
+    fc_name_idx = 0
+    handlers = []
+    init_stamp_token = constant_op.constant(0, dtype=dtypes.int64)
+    with ops.device(self._get_replica_device_setter()):
+      # Create handlers for dense float columns
+      for dense_float_column_idx in range(len(self._dense_floats)):
+        fc_name = self._fc_names[fc_name_idx]
+        handlers.append(
+            ordinal_split_handler.DenseSplitHandler(
+                l1_regularization=regularization_config.l1,
+                l2_regularization=regularization_config.l2,
+                tree_complexity_regularization=(
+                    regularization_config.tree_complexity),
+                min_node_weight=min_node_weight,
+                feature_column_group_id=dense_float_column_idx,
+                epsilon=0.01,
+                num_quantiles=100,
+                dense_float_column=self._dense_floats[dense_float_column_idx],
+                name=fc_name,
+                gradient_shape=gradient_shape,
+                hessian_shape=hessian_shape,
+                multiclass_strategy=strategy,
+                init_stamp_token=init_stamp_token))
+        fc_name_idx += 1
+
+      # Create handlers for sparse float columns.
+      for sparse_float_column_idx in range(len(self._sparse_float_indices)):
+        fc_name = self._fc_names[fc_name_idx]
+        handlers.append(
+            ordinal_split_handler.SparseSplitHandler(
+                l1_regularization=regularization_config.l1,
+                l2_regularization=regularization_config.l2,
+                tree_complexity_regularization=(
+                    regularization_config.tree_complexity),
+                min_node_weight=min_node_weight,
+                feature_column_group_id=sparse_float_column_idx,
+                epsilon=0.01,
+                num_quantiles=100,
+                sparse_float_column=sparse_tensor.SparseTensor(
+                    self._sparse_float_indices[sparse_float_column_idx],
+                    self._sparse_float_values[sparse_float_column_idx],
+                    self._sparse_float_shapes[sparse_float_column_idx]),
+                name=fc_name,
+                gradient_shape=gradient_shape,
+                hessian_shape=hessian_shape,
+                multiclass_strategy=strategy,
+                init_stamp_token=init_stamp_token))
+        fc_name_idx += 1
+
+      # Create handlers for sparse int columns.
+      for sparse_int_column_idx in range(len(self._sparse_int_indices)):
+        fc_name = self._fc_names[fc_name_idx]
+        handlers.append(
+            categorical_split_handler.EqualitySplitHandler(
+                l1_regularization=regularization_config.l1,
+                l2_regularization=regularization_config.l2,
+                tree_complexity_regularization=(
+                    regularization_config.tree_complexity),
+                min_node_weight=min_node_weight,
+                feature_column_group_id=sparse_int_column_idx,
+                sparse_int_column=sparse_tensor.SparseTensor(
+                    self._sparse_int_indices[sparse_int_column_idx],
+                    self._sparse_int_values[sparse_int_column_idx],
+                    self._sparse_int_shapes[sparse_int_column_idx]),
+                name=fc_name,
+                gradient_shape=gradient_shape,
+                hessian_shape=hessian_shape,
+                multiclass_strategy=strategy,
+                init_stamp_token=init_stamp_token))
+        fc_name_idx += 1
+
+      # Create steps accumulator.
+      steps_accumulator = stats_accumulator_ops.StatsAccumulator(
+          stamp_token=0,
+          gradient_shape=tensor_shape.scalar(),
+          hessian_shape=tensor_shape.scalar(),
+          name="StepsAccumulator")
+
+      # Create bias stats accumulator.
+      bias_stats_accumulator = stats_accumulator_ops.StatsAccumulator(
+          stamp_token=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          name="BiasAccumulator")
+
+      # Create ensemble stats variables.
+      num_layer_examples = variables.Variable(
+          initial_value=array_ops.zeros([], dtypes.int64),
+          name="num_layer_examples",
+          trainable=False)
+      num_layer_steps = variables.Variable(
+          initial_value=array_ops.zeros([], dtypes.int64),
+          name="num_layer_steps",
+          trainable=False)
+      num_layers = variables.Variable(
+          initial_value=array_ops.zeros([], dtypes.int64),
+          name="num_layers",
+          trainable=False)
+      active_tree = variables.Variable(
+          initial_value=array_ops.zeros([], dtypes.int64),
+          name="active_tree",
+          trainable=False)
+      active_layer = variables.Variable(
+          initial_value=array_ops.zeros([], dtypes.int64),
+          name="active_layer",
+          trainable=False)
+
+    # Create ensemble stats summaries.
+    summary.scalar("layer_stats/num_examples", num_layer_examples)
+    summary.scalar("layer_stats/num_steps", num_layer_steps)
+    summary.scalar("ensemble_stats/active_tree", active_tree)
+    summary.scalar("ensemble_stats/active_layer", active_layer)
+
+    # Update bias stats.
+    stats_update_ops = []
+    continue_centering = variables.Variable(
+        initial_value=self._center_bias,
+        name="continue_centering",
+        trainable=False)
+    stats_update_ops.append(
+        control_flow_ops.cond(continue_centering,
+                              self._make_update_bias_stats_fn(
+                                  ensemble_stamp, predictions, gradients,
+                                  bias_stats_accumulator),
+                              control_flow_ops.no_op))
+
+    # Update handler stats.
+    handler_reads = {}
+
+    input_deps = (self._dense_floats + self._sparse_float_indices +
+                  self._sparse_int_indices)
+    worker_device = input_deps[0].device
+    for handler in handlers:
+      handler_reads[handler] = handler.scheduled_reads()
+
+    handler_results = batch_ops_utils.run_handler_scheduled_ops(
+        handler_reads, ensemble_stamp, worker_device)
+    per_handler_updates = {}
+    # Two values per handler. First one is if the the handler is active for the
+    # current layer. The second one is if the handler is going to be active
+    # for the next layer.
+    subsampling_type = self._learner_config.WhichOneof("feature_fraction")
+    if subsampling_type == "feature_fraction_per_level":
+      seed = predictions_dict[NUM_LAYERS_ATTEMPTED]
+      active_handlers_current_layer = stateless.stateless_random_uniform(
+          shape=[len(handlers)], seed=[seed, 1])
+      active_handlers_next_layer = stateless.stateless_random_uniform(
+          shape=[len(handlers)], seed=[seed + 1, 1])
+      active_handlers = array_ops.stack(
+          [active_handlers_current_layer, active_handlers_next_layer], axis=1)
+      active_handlers = (active_handlers <
+                         self._learner_config.feature_fraction_per_level)
+    elif subsampling_type == "feature_fraction_per_tree":
+      seed = predictions_dict[NUM_TREES_ATTEMPTED]
+      active_handlers_current_layer = stateless.stateless_random_uniform(
+          shape=[len(handlers)], seed=[seed, 2])
+      active_handlers_current_layer = (
+          active_handlers_current_layer <
+          self._learner_config.feature_fraction_per_tree)
+      active_handlers = array_ops.stack(active_handlers_current_layer,
+                                        array_ops.ones(
+                                            [len(handlers)], dtype=dtypes.bool))
+    else:
+      active_handlers = array_ops.ones([len(handlers), 2], dtype=dtypes.bool)
+
+    # Prepare empty gradients and hessians when handlers are not ready.
+    empty_hess_shape = [1] + hessian_shape.as_list()
+    empty_grad_shape = [1] + gradient_shape.as_list()
+
+    empty_gradients = constant_op.constant(
+        [], dtype=dtypes.float32, shape=empty_grad_shape)
+    empty_hessians = constant_op.constant(
+        [], dtype=dtypes.float32, shape=empty_hess_shape)
+
+    for handler_idx in range(len(handlers)):
+      handler = handlers[handler_idx]
+      is_active = active_handlers[handler_idx]
+      updates, scheduled_updates = handler.update_stats(
+          ensemble_stamp, partition_ids, squeezed_gradients, squeezed_hessians,
+          empty_gradients, empty_hessians, weights, is_active,
+          handler_results[handler])
+      stats_update_ops.append(updates)
+      per_handler_updates[handler] = scheduled_updates
+
+    update_results = batch_ops_utils.run_handler_scheduled_ops(
+        per_handler_updates, ensemble_stamp, worker_device)
+    for update in update_results.values():
+      stats_update_ops += update
+    # Accumulate a step after updating stats.
+    batch_size = math_ops.cast(array_ops.shape(labels)[0], dtypes.float32)
+    with ops.control_dependencies(stats_update_ops):
+      add_step_op = steps_accumulator.add(ensemble_stamp, [0], [0],
+                                          [batch_size], [1.0])
+
+    # Determine learning rate.
+    learning_rate_tuner = self._learner_config.learning_rate_tuner.WhichOneof(
+        "tuner")
+    if learning_rate_tuner == "fixed" or learning_rate_tuner == "dropout":
+      tuner = getattr(self._learner_config.learning_rate_tuner,
+                      learning_rate_tuner)
+      learning_rate = tuner.learning_rate
+    else:
+      # TODO(nponomareva, soroush) do the line search.
+      raise ValueError("Line search learning rate is not yet supported.")
+
+    # After adding the step, decide if further processing is needed.
+    ensemble_update_ops = [add_step_op]
+    with ops.control_dependencies([add_step_op]):
+      if self._is_chief:
+        dropout_seed = predictions_dict[NUM_TREES_ATTEMPTED]
+
+        # Get accumulated steps and examples for the current layer.
+        _, _, _, _, acc_examples, acc_steps = steps_accumulator.serialize()
+        acc_examples = math_ops.cast(acc_examples[0], dtypes.int64)
+        acc_steps = math_ops.cast(acc_steps[0], dtypes.int64)
+        ensemble_update_ops.append(num_layer_examples.assign(acc_examples))
+        ensemble_update_ops.append(num_layer_steps.assign(acc_steps))
+        # Determine whether we need to update tree ensemble.
+        examples_per_layer = self._examples_per_layer
+        if callable(examples_per_layer):
+          examples_per_layer = examples_per_layer(active_layer)
+        ensemble_update_ops.append(
+            control_flow_ops.cond(
+                acc_examples >= examples_per_layer,
+                self._make_update_ensemble_fn(
+                    ensemble_stamp, steps_accumulator, bias_stats_accumulator,
+                    continue_centering, learning_rate, handlers, num_layers,
+                    active_tree, active_layer, dropout_seed, class_id),
+                control_flow_ops.no_op))
+
+    # Calculate the loss to be reported - use the predictions without dropout.
+    return control_flow_ops.group(*ensemble_update_ops)
+
+  def _get_weights(self, hessian_shape, hessians):
+    """Derives weights to be used based on hessians and multiclass strategy."""
+    if hessian_shape == tensor_shape.scalar():
+      # This is tree per class.
+      weights = hessians
+    elif len(hessian_shape.dims) == 1:
+      # This is diagonal hessian.
+      weights = math_ops.reduce_sum(hessians, axis=1)
+    else:
+      # This is full hessian.
+      weights = math_ops.trace(hessians)
+    return weights
+
+  def _full_hessian(self, grads, predictions):
+    """Prepares hessians for full-hessian multiclass strategy."""
+    # Because of
+    # https://github.com/tensorflow/tensorflow/issues/675, we can't just
+    # compute the full hessian with a single call to gradients, but instead
+    # must compute it row-by-row.
+    gradients_list = array_ops.unstack(
+        grads, num=self._learner_config.num_classes, axis=1)
+    hessian_rows = []
+
+    for row in range(self._learner_config.num_classes):
+      # If current row is i, K is number of classes,each row returns a tensor of
+      # size batch_size x K representing for each example dx_i dx_1, dx_i dx_2
+      # etc dx_i dx_K
+      hessian_row = gradients_impl.gradients(
+          gradients_list[row],
+          predictions,
+          name="Hessian_%d" % row,
+          colocate_gradients_with_ops=False,
+          gate_gradients=0,
+          aggregation_method=None)
+
+      # hessian_row is of dimension 1, batch_size, K, => trim first dimension
+      # to get batch_size x K
+      hessian_row = array_ops.squeeze(array_ops.unstack(hessian_row), [0])
+      hessian_rows.append(hessian_row)
+    return hessian_rows
+
+  def _diagonal_hessian(self, grads, predictions):
+    """Prepares hessians for diagonal-hessian multiclass mode."""
+    diag_hessian_list = []
+
+    gradients_list = array_ops.unstack(
+        grads, num=self._learner_config.num_classes, axis=1)
+
+    for row, row_grads in enumerate(gradients_list):
+      # If current row is i, K is number of classes,each row returns a tensor of
+      # size batch_size x K representing for each example dx_i dx_1, dx_1 dx_2
+      # etc dx_i dx_K
+      hessian_row = gradients_impl.gradients(
+          row_grads,
+          predictions,
+          name="Hessian_%d" % row,
+          colocate_gradients_with_ops=False,
+          gate_gradients=0,
+          aggregation_method=None)
+
+      # hessian_row is of dimension 1, batch_size, K, => trim first dimension
+      # to get batch_size x K
+      hessian_row = array_ops.squeeze(array_ops.unstack(hessian_row), [0])
+
+      # Get dx_i^2 for the whole batch.
+      elem = array_ops.transpose(hessian_row)[row]
+      diag_hessian_list.append(elem)
+
+    return diag_hessian_list
+
+  def _get_replica_device_setter(self):
+    """Creates a replica device setter."""
+    ps_tasks = self._num_ps_replicas
+    ps_ops = [
+        "Variable",
+        "VariableV2",
+        "DecisionTreeEnsembleResourceHandleOp",
+        "StatsAccumulatorScalarResourceHandleOp",
+        "StatsAccumulatorTensorResourceHandleOp",
+        "QuantileStreamResourceHandleOp",
+    ]
+    ps_strategy = _OpRoundRobinStrategy(ps_ops, ps_tasks)
+    return device_setter.replica_device_setter(
+        ps_tasks=ps_tasks,
+        merge_devices=True,
+        ps_ops=ps_ops,
+        ps_strategy=ps_strategy)
+
+  def _make_update_bias_stats_fn(self, ensemble_stamp, predictions, gradients,
+                                 bias_stats_accumulator):
+    """A method to create the function which updates the bias stats."""
+
+    def _update_bias_stats():
+      """A method to update the bias stats."""
+      # Get reduced gradients and hessians.
+      grads_sum = math_ops.reduce_sum(gradients, 0)
+      hess = gradients_impl.gradients(
+          grads_sum,
+          predictions,
+          name="Hessians",
+          colocate_gradients_with_ops=False,
+          gate_gradients=0,
+          aggregation_method=None)[0]
+      hess_sum = math_ops.reduce_sum(hess, 0)
+
+      # Accumulate gradients and hessians.
+      partition_ids = math_ops.range(predictions.get_shape()[1])
+      feature_ids = array_ops.zeros_like(partition_ids, dtype=dtypes.int64)
+      add_stats_op = bias_stats_accumulator.add(
+          ensemble_stamp, partition_ids, feature_ids, grads_sum, hess_sum)
+      return control_flow_ops.group(*[add_stats_op], name="update_bias_stats")
+
+    return _update_bias_stats
+
+  def _make_update_ensemble_fn(self, ensemble_stamp, steps_accumulator,
+                               bias_stats_accumulator, continue_centering,
+                               learning_rate, handlers, num_layers, active_tree,
+                               active_layer, dropout_seed, class_id):
+    """A method to create the function which updates the tree ensemble."""
+
+    def _update_ensemble():
+      """A method to update the tree ensemble."""
+      # Get next stamp token.
+      next_ensemble_stamp = ensemble_stamp + 1
+      # Finalize bias stats.
+      _, _, _, bias_grads, bias_hess = bias_stats_accumulator.flush(
+          ensemble_stamp, next_ensemble_stamp)
+
+      # Finalize handler splits.
+      are_splits_ready_list = []
+      partition_ids_list = []
+      gains_list = []
+      split_info_list = []
+
+      for handler in handlers:
+        (are_splits_ready,
+         partition_ids, gains, split_info) = handler.make_splits(
+             ensemble_stamp, next_ensemble_stamp, class_id)
+        are_splits_ready_list.append(are_splits_ready)
+        partition_ids_list.append(partition_ids)
+        gains_list.append(gains)
+        split_info_list.append(split_info)
+      # Stack all the inputs to one tensor per type.
+      # This is a workaround for the slowness of graph building in tf.cond.
+      # See (b/36554864).
+      split_sizes = array_ops.stack([
+          array_ops.shape(partition_id)[0]
+          for partition_id in partition_ids_list
+      ])
+      partition_ids = array_ops.concat(partition_ids_list, axis=0)
+      gains = array_ops.concat(gains_list, axis=0)
+      split_infos = array_ops.concat(split_info_list, axis=0)
+
+      # Determine if all splits are ready.
+      are_all_splits_ready = math_ops.reduce_all(
+          array_ops.stack(
+              are_splits_ready_list, axis=0, name="stack_handler_readiness"))
+
+      # Define bias centering update operation.
+      def _center_bias_fn():
+        # Center tree ensemble bias.
+        delta_updates = array_ops.where(bias_hess > 0, -bias_grads / bias_hess,
+                                        array_ops.zeros_like(bias_grads))
+        center_bias = training_ops.center_tree_ensemble_bias(
+            tree_ensemble_handle=self._ensemble_handle,
+            stamp_token=ensemble_stamp,
+            next_stamp_token=next_ensemble_stamp,
+            delta_updates=delta_updates,
+            learner_config=self._learner_config_serialized)
+        return continue_centering.assign(center_bias)
+
+      # Define ensemble growing operations.
+      def _grow_ensemble_ready_fn():
+        # Grow the ensemble given the current candidates.
+        sizes = array_ops.unstack(split_sizes)
+        partition_ids_list = list(array_ops.split(partition_ids, sizes, axis=0))
+        gains_list = list(array_ops.split(gains, sizes, axis=0))
+        split_info_list = list(array_ops.split(split_infos, sizes, axis=0))
+        return training_ops.grow_tree_ensemble(
+            tree_ensemble_handle=self._ensemble_handle,
+            stamp_token=ensemble_stamp,
+            next_stamp_token=next_ensemble_stamp,
+            learning_rate=learning_rate,
+            partition_ids=partition_ids_list,
+            gains=gains_list,
+            splits=split_info_list,
+            learner_config=self._learner_config_serialized,
+            dropout_seed=dropout_seed,
+            center_bias=self._center_bias)
+
+      def _grow_ensemble_not_ready_fn():
+        # Don't grow the ensemble, just update the stamp.
+        return training_ops.grow_tree_ensemble(
+            tree_ensemble_handle=self._ensemble_handle,
+            stamp_token=ensemble_stamp,
+            next_stamp_token=next_ensemble_stamp,
+            learning_rate=0,
+            partition_ids=[],
+            gains=[],
+            splits=[],
+            learner_config=self._learner_config_serialized,
+            dropout_seed=dropout_seed,
+            center_bias=self._center_bias)
+
+      def _grow_ensemble_fn():
+        # Conditionally grow an ensemble depending on whether the splits
+        # from all the handlers are ready.
+        return control_flow_ops.cond(are_all_splits_ready,
+                                     _grow_ensemble_ready_fn,
+                                     _grow_ensemble_not_ready_fn)
+
+      # Update ensemble.
+      update_ops = [are_all_splits_ready]
+      update_model = control_flow_ops.cond(continue_centering, _center_bias_fn,
+                                           _grow_ensemble_fn)
+      update_ops.append(update_model)
+
+      # Update ensemble stats.
+      with ops.control_dependencies([update_model]):
+        stats = training_ops.tree_ensemble_stats(
+            self._ensemble_handle, stamp_token=next_ensemble_stamp)
+        update_ops.append(self._finalized_trees.assign(stats.num_trees))
+        update_ops.append(self._attempted_trees.assign(stats.attempted_trees))
+        update_ops.append(num_layers.assign(stats.num_layers))
+        update_ops.append(active_tree.assign(stats.active_tree))
+        update_ops.append(active_layer.assign(stats.active_layer))
+
+      # Flush step stats.
+      update_ops.extend(
+          steps_accumulator.flush(ensemble_stamp, next_ensemble_stamp))
+      return control_flow_ops.group(*update_ops, name="update_ensemble")
+
+    return _update_ensemble
+
+  def get_number_of_trees_tensor(self):
+    return self._finalized_trees, self._attempted_trees
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74f9a909dfa9bd0667b99a5084003079a8aac7dc
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
@@ -0,0 +1,917 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for GBDT train function."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from google.protobuf import text_format
+
+from tensorflow.contrib import layers
+from tensorflow.contrib.boosted_trees.proto import learner_pb2
+from tensorflow.contrib.boosted_trees.proto import tree_config_pb2
+from tensorflow.contrib.boosted_trees.python.ops import model_ops
+from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
+from tensorflow.contrib.boosted_trees.python.utils import losses
+
+from tensorflow.contrib.layers.python.layers import feature_column as feature_column_lib
+from tensorflow.contrib.learn.python.learn.estimators import model_fn
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+
+def _squared_loss(label, unused_weights, predictions):
+  """Unweighted loss implementation."""
+  loss = math_ops.reduce_sum(
+      math_ops.square(predictions - label), 1, keep_dims=True)
+  return loss
+
+
+class GbdtTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    super(GbdtTest, self).setUp()
+
+  def testExtractFeatures(self):
+    """Tests feature extraction."""
+    with self.test_session():
+      features = {}
+      features["dense_float"] = array_ops.zeros([2, 1], dtypes.float32)
+      features["sparse_float"] = sparse_tensor.SparseTensor(
+          array_ops.zeros([2, 2], dtypes.int64),
+          array_ops.zeros([2], dtypes.float32),
+          array_ops.zeros([2], dtypes.int64))
+      features["sparse_int"] = sparse_tensor.SparseTensor(
+          array_ops.zeros([2, 2], dtypes.int64),
+          array_ops.zeros([2], dtypes.int64),
+          array_ops.zeros([2], dtypes.int64))
+      (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
+       sparse_float_shapes, sparse_int_indices, sparse_int_values,
+       sparse_int_shapes) = (gbdt_batch.extract_features(features, None))
+      self.assertEqual(len(fc_names), 3)
+      self.assertAllEqual(fc_names,
+                          ["dense_float", "sparse_float", "sparse_int"])
+      self.assertEqual(len(dense_floats), 1)
+      self.assertEqual(len(sparse_float_indices), 1)
+      self.assertEqual(len(sparse_float_values), 1)
+      self.assertEqual(len(sparse_float_shapes), 1)
+      self.assertEqual(len(sparse_int_indices), 1)
+      self.assertEqual(len(sparse_int_values), 1)
+      self.assertEqual(len(sparse_int_shapes), 1)
+      self.assertAllEqual(dense_floats[0].eval(),
+                          features["dense_float"].eval())
+      self.assertAllEqual(sparse_float_indices[0].eval(),
+                          features["sparse_float"].indices.eval())
+      self.assertAllEqual(sparse_float_values[0].eval(),
+                          features["sparse_float"].values.eval())
+      self.assertAllEqual(sparse_float_shapes[0].eval(),
+                          features["sparse_float"].dense_shape.eval())
+      self.assertAllEqual(sparse_int_indices[0].eval(),
+                          features["sparse_int"].indices.eval())
+      self.assertAllEqual(sparse_int_values[0].eval(),
+                          features["sparse_int"].values.eval())
+      self.assertAllEqual(sparse_int_shapes[0].eval(),
+                          features["sparse_int"].dense_shape.eval())
+
+  def testExtractFeaturesWithTransformation(self):
+    """Tests feature extraction."""
+    with self.test_session():
+      features = {}
+      features["dense_float"] = array_ops.zeros([2, 1], dtypes.float32)
+      features["sparse_float"] = sparse_tensor.SparseTensor(
+          array_ops.zeros([2, 2], dtypes.int64),
+          array_ops.zeros([2], dtypes.float32),
+          array_ops.zeros([2], dtypes.int64))
+      features["sparse_categorical"] = sparse_tensor.SparseTensor(
+          array_ops.zeros([2, 2], dtypes.int64),
+          array_ops.zeros(
+              [2], dtypes.string), array_ops.zeros([2], dtypes.int64))
+      feature_columns = set()
+      feature_columns.add(layers.real_valued_column("dense_float"))
+      feature_columns.add(
+          layers.feature_column._real_valued_var_len_column(
+              "sparse_float", is_sparse=True))
+      feature_columns.add(
+          feature_column_lib.sparse_column_with_hash_bucket(
+              "sparse_categorical", hash_bucket_size=1000000))
+      (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
+       sparse_float_shapes, sparse_int_indices, sparse_int_values,
+       sparse_int_shapes) = (gbdt_batch.extract_features(
+           features, feature_columns))
+      self.assertEqual(len(fc_names), 3)
+      self.assertAllEqual(fc_names,
+                          ["dense_float", "sparse_float", "sparse_categorical"])
+      self.assertEqual(len(dense_floats), 1)
+      self.assertEqual(len(sparse_float_indices), 1)
+      self.assertEqual(len(sparse_float_values), 1)
+      self.assertEqual(len(sparse_float_shapes), 1)
+      self.assertEqual(len(sparse_int_indices), 1)
+      self.assertEqual(len(sparse_int_values), 1)
+      self.assertEqual(len(sparse_int_shapes), 1)
+      self.assertAllEqual(dense_floats[0].eval(),
+                          features["dense_float"].eval())
+      self.assertAllEqual(sparse_float_indices[0].eval(),
+                          features["sparse_float"].indices.eval())
+      self.assertAllEqual(sparse_float_values[0].eval(),
+                          features["sparse_float"].values.eval())
+      self.assertAllEqual(sparse_float_shapes[0].eval(),
+                          features["sparse_float"].dense_shape.eval())
+      self.assertAllEqual(sparse_int_indices[0].eval(),
+                          features["sparse_categorical"].indices.eval())
+      self.assertAllEqual(sparse_int_values[0].eval(), [397263, 397263])
+      self.assertAllEqual(sparse_int_shapes[0].eval(),
+                          features["sparse_categorical"].dense_shape.eval())
+
+  def testTrainFnChiefNoBiasCentering(self):
+    """Tests the train function running on chief without bias centering."""
+    with self.test_session() as sess:
+      ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0, tree_ensemble_config="", name="tree_ensemble")
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
+      learner_config.num_classes = 2
+      learner_config.regularization.l1 = 0
+      learner_config.regularization.l2 = 0
+      learner_config.constraints.max_tree_depth = 1
+      learner_config.constraints.min_node_weight = 0
+      features = {}
+      features["dense_float"] = array_ops.ones([4, 1], dtypes.float32)
+
+      gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
+          is_chief=True,
+          num_ps_replicas=0,
+          center_bias=False,
+          ensemble_handle=ensemble_handle,
+          examples_per_layer=1,
+          learner_config=learner_config,
+          features=features)
+
+      predictions = array_ops.constant(
+          [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
+      partition_ids = array_ops.zeros([4], dtypes.int32)
+      ensemble_stamp = variables.Variable(
+          initial_value=0,
+          name="ensemble_stamp",
+          trainable=False,
+          dtype=dtypes.int64)
+
+      predictions_dict = {
+          "predictions": predictions,
+          "predictions_no_dropout": predictions,
+          "partition_ids": partition_ids,
+          "ensemble_stamp": ensemble_stamp,
+          "num_trees": 12,
+      }
+
+      labels = array_ops.ones([4, 1], dtypes.float32)
+      weights = array_ops.ones([4, 1], dtypes.float32)
+      # Create train op.
+      train_op = gbdt_model.train(
+          loss=math_ops.reduce_mean(
+              _squared_loss(labels, weights, predictions)),
+          predictions_dict=predictions_dict,
+          labels=labels)
+      variables.global_variables_initializer().run()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # On first run, expect no splits to be chosen because the quantile
+      # buckets will not be ready.
+      train_op.run()
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+      self.assertEquals(len(output.trees), 0)
+      self.assertEquals(len(output.tree_weights), 0)
+      self.assertEquals(stamp_token.eval(), 1)
+
+      # Update the stamp to be able to run a second time.
+      sess.run([ensemble_stamp.assign_add(1)])
+
+      # On second run, expect a trivial split to be chosen to basically
+      # predict the average.
+      train_op.run()
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+      self.assertEquals(len(output.trees), 1)
+      self.assertAllClose(output.tree_weights, [0.1])
+      self.assertEquals(stamp_token.eval(), 2)
+      expected_tree = """
+          nodes {
+            dense_float_binary_split {
+              threshold: 1.0
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: 0
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 0.25
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 0.0
+              }
+            }
+          }"""
+      self.assertProtoEquals(expected_tree, output.trees[0])
+
+  def testTrainFnChiefScalingNumberOfExamples(self):
+    """Tests the train function running on chief without bias centering."""
+    with self.test_session() as sess:
+      ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0, tree_ensemble_config="", name="tree_ensemble")
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
+      learner_config.num_classes = 2
+      learner_config.regularization.l1 = 0
+      learner_config.regularization.l2 = 0
+      learner_config.constraints.max_tree_depth = 1
+      learner_config.constraints.min_node_weight = 0
+      num_examples_fn = (
+          lambda layer: math_ops.pow(math_ops.cast(2, dtypes.int64), layer) * 1)
+      features = {}
+      features["dense_float"] = array_ops.ones([4, 1], dtypes.float32)
+      gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
+          is_chief=True,
+          num_ps_replicas=0,
+          center_bias=False,
+          ensemble_handle=ensemble_handle,
+          examples_per_layer=num_examples_fn,
+          learner_config=learner_config,
+          features=features)
+
+      predictions = array_ops.constant(
+          [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
+      partition_ids = array_ops.zeros([4], dtypes.int32)
+      ensemble_stamp = variables.Variable(
+          initial_value=0,
+          name="ensemble_stamp",
+          trainable=False,
+          dtype=dtypes.int64)
+
+      predictions_dict = {
+          "predictions": predictions,
+          "predictions_no_dropout": predictions,
+          "partition_ids": partition_ids,
+          "ensemble_stamp": ensemble_stamp,
+          "num_trees": 12,
+      }
+
+      labels = array_ops.ones([4, 1], dtypes.float32)
+      weights = array_ops.ones([4, 1], dtypes.float32)
+      # Create train op.
+      train_op = gbdt_model.train(
+          loss=math_ops.reduce_mean(
+              _squared_loss(labels, weights, predictions)),
+          predictions_dict=predictions_dict,
+          labels=labels)
+      variables.global_variables_initializer().run()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # On first run, expect no splits to be chosen because the quantile
+      # buckets will not be ready.
+      train_op.run()
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+      self.assertEquals(len(output.trees), 0)
+      self.assertEquals(len(output.tree_weights), 0)
+      self.assertEquals(stamp_token.eval(), 1)
+
+      # Update the stamp to be able to run a second time.
+      sess.run([ensemble_stamp.assign_add(1)])
+
+      # On second run, expect a trivial split to be chosen to basically
+      # predict the average.
+      train_op.run()
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+      self.assertEquals(len(output.trees), 1)
+      self.assertAllClose(output.tree_weights, [0.1])
+      self.assertEquals(stamp_token.eval(), 2)
+      expected_tree = """
+          nodes {
+            dense_float_binary_split {
+              threshold: 1.0
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: 0
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 0.25
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 0.0
+              }
+            }
+          }"""
+      self.assertProtoEquals(expected_tree, output.trees[0])
+
+  def testTrainFnChiefWithBiasCentering(self):
+    """Tests the train function running on chief with bias centering."""
+    with self.test_session():
+      ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0, tree_ensemble_config="", name="tree_ensemble")
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
+      learner_config.num_classes = 2
+      learner_config.regularization.l1 = 0
+      learner_config.regularization.l2 = 0
+      learner_config.constraints.max_tree_depth = 1
+      learner_config.constraints.min_node_weight = 0
+      features = {}
+      features["dense_float"] = array_ops.ones([4, 1], dtypes.float32)
+
+      gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
+          is_chief=True,
+          num_ps_replicas=0,
+          center_bias=True,
+          ensemble_handle=ensemble_handle,
+          examples_per_layer=1,
+          learner_config=learner_config,
+          features=features)
+
+      predictions = array_ops.constant(
+          [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
+      partition_ids = array_ops.zeros([4], dtypes.int32)
+      ensemble_stamp = variables.Variable(
+          initial_value=0,
+          name="ensemble_stamp",
+          trainable=False,
+          dtype=dtypes.int64)
+
+      predictions_dict = {
+          "predictions": predictions,
+          "predictions_no_dropout": predictions,
+          "partition_ids": partition_ids,
+          "ensemble_stamp": ensemble_stamp,
+          "num_trees": 12,
+      }
+
+      labels = array_ops.ones([4, 1], dtypes.float32)
+      weights = array_ops.ones([4, 1], dtypes.float32)
+      # Create train op.
+      train_op = gbdt_model.train(
+          loss=math_ops.reduce_mean(
+              _squared_loss(labels, weights, predictions)),
+          predictions_dict=predictions_dict,
+          labels=labels)
+      variables.global_variables_initializer().run()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # On first run, expect bias to be centered.
+      train_op.run()
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+      expected_tree = """
+          nodes {
+            leaf {
+              vector {
+                value: 0.25
+              }
+            }
+          }"""
+      self.assertEquals(len(output.trees), 1)
+      self.assertAllEqual(output.tree_weights, [1.0])
+      self.assertProtoEquals(expected_tree, output.trees[0])
+      self.assertEquals(stamp_token.eval(), 1)
+
+  def testTrainFnNonChiefNoBiasCentering(self):
+    """Tests the train function running on worker without bias centering."""
+    with self.test_session():
+      ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0, tree_ensemble_config="", name="tree_ensemble")
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
+      learner_config.num_classes = 2
+      learner_config.regularization.l1 = 0
+      learner_config.regularization.l2 = 0
+      learner_config.constraints.max_tree_depth = 1
+      learner_config.constraints.min_node_weight = 0
+      features = {}
+      features["dense_float"] = array_ops.ones([4, 1], dtypes.float32)
+
+      gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
+          is_chief=False,
+          num_ps_replicas=0,
+          center_bias=False,
+          ensemble_handle=ensemble_handle,
+          examples_per_layer=1,
+          learner_config=learner_config,
+          features=features)
+
+      predictions = array_ops.constant(
+          [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
+      partition_ids = array_ops.zeros([4], dtypes.int32)
+      ensemble_stamp = variables.Variable(
+          initial_value=0,
+          name="ensemble_stamp",
+          trainable=False,
+          dtype=dtypes.int64)
+
+      predictions_dict = {
+          "predictions": predictions,
+          "predictions_no_dropout": predictions,
+          "partition_ids": partition_ids,
+          "ensemble_stamp": ensemble_stamp
+      }
+
+      labels = array_ops.ones([4, 1], dtypes.float32)
+      weights = array_ops.ones([4, 1], dtypes.float32)
+      # Create train op.
+      train_op = gbdt_model.train(
+          loss=math_ops.reduce_mean(
+              _squared_loss(labels, weights, predictions)),
+          predictions_dict=predictions_dict,
+          labels=labels)
+      variables.global_variables_initializer().run()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Regardless of how many times the train op is run, a non-chief worker
+      # can only accumulate stats so the tree ensemble never changes.
+      for _ in range(5):
+        train_op.run()
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+      self.assertEquals(len(output.trees), 0)
+      self.assertEquals(len(output.tree_weights), 0)
+      self.assertEquals(stamp_token.eval(), 0)
+
+  def testTrainFnNonChiefWithCentering(self):
+    """Tests the train function running on worker with bias centering."""
+    with self.test_session():
+      ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0, tree_ensemble_config="", name="tree_ensemble")
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
+      learner_config.num_classes = 2
+      learner_config.regularization.l1 = 0
+      learner_config.regularization.l2 = 0
+      learner_config.constraints.max_tree_depth = 1
+      learner_config.constraints.min_node_weight = 0
+      features = {}
+      features["dense_float"] = array_ops.ones([4, 1], dtypes.float32)
+
+      gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
+          is_chief=False,
+          num_ps_replicas=0,
+          center_bias=True,
+          ensemble_handle=ensemble_handle,
+          examples_per_layer=1,
+          learner_config=learner_config,
+          features=features)
+
+      predictions = array_ops.constant(
+          [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
+      partition_ids = array_ops.zeros([4], dtypes.int32)
+      ensemble_stamp = variables.Variable(
+          initial_value=0,
+          name="ensemble_stamp",
+          trainable=False,
+          dtype=dtypes.int64)
+
+      predictions_dict = {
+          "predictions": predictions,
+          "predictions_no_dropout": predictions,
+          "partition_ids": partition_ids,
+          "ensemble_stamp": ensemble_stamp
+      }
+
+      labels = array_ops.ones([4, 1], dtypes.float32)
+      weights = array_ops.ones([4, 1], dtypes.float32)
+      # Create train op.
+      train_op = gbdt_model.train(
+          loss=math_ops.reduce_mean(
+              _squared_loss(labels, weights, predictions)),
+          predictions_dict=predictions_dict,
+          labels=labels)
+      variables.global_variables_initializer().run()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Regardless of how many times the train op is run, a non-chief worker
+      # can only accumulate stats so the tree ensemble never changes.
+      for _ in range(5):
+        train_op.run()
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+      self.assertEquals(len(output.trees), 0)
+      self.assertEquals(len(output.tree_weights), 0)
+      self.assertEquals(stamp_token.eval(), 0)
+
+  def testPredictFn(self):
+    """Tests the predict function."""
+    with self.test_session() as sess:
+      # Create ensemble with one bias node.
+      ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      text_format.Merge("""
+          trees {
+            nodes {
+              leaf {
+                vector {
+                  value: 0.25
+                }
+              }
+            }
+          }
+          tree_weights: 1.0
+          tree_metadata {
+            num_tree_weight_updates: 1
+            num_layers_grown: 1
+            is_finalized: true
+          }""", ensemble_config)
+      ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=3,
+          tree_ensemble_config=ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
+      learner_config.num_classes = 2
+      learner_config.regularization.l1 = 0
+      learner_config.regularization.l2 = 0
+      learner_config.constraints.max_tree_depth = 1
+      learner_config.constraints.min_node_weight = 0
+      features = {}
+      features["dense_float"] = array_ops.ones([4, 1], dtypes.float32)
+      gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
+          is_chief=False,
+          num_ps_replicas=0,
+          center_bias=True,
+          ensemble_handle=ensemble_handle,
+          examples_per_layer=1,
+          learner_config=learner_config,
+          features=features)
+
+      # Create predict op.
+      mode = model_fn.ModeKeys.EVAL
+      predictions_dict = sess.run(gbdt_model.predict(mode))
+      self.assertEquals(predictions_dict["ensemble_stamp"], 3)
+      self.assertAllClose(predictions_dict["predictions"], [[0.25], [0.25],
+                                                            [0.25], [0.25]])
+      self.assertAllClose(predictions_dict["partition_ids"], [0, 0, 0, 0])
+
+  def testTrainFnMulticlassFullHessian(self):
+    """Tests the GBDT train for multiclass full hessian."""
+    with self.test_session() as sess:
+      ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0, tree_ensemble_config="", name="tree_ensemble")
+
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.learning_rate_tuner.fixed.learning_rate = 1
+      # Use full hessian multiclass strategy.
+      learner_config.multi_class_strategy = (
+          learner_pb2.LearnerConfig.FULL_HESSIAN)
+      learner_config.num_classes = 5
+      learner_config.regularization.l1 = 0
+      # To make matrix inversible.
+      learner_config.regularization.l2 = 1e-5
+      learner_config.constraints.max_tree_depth = 1
+      learner_config.constraints.min_node_weight = 0
+      features = {}
+      batch_size = 3
+      features["dense_float"] = array_ops.ones([batch_size, 1], dtypes.float32)
+
+      gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
+          is_chief=True,
+          num_ps_replicas=0,
+          center_bias=False,
+          ensemble_handle=ensemble_handle,
+          examples_per_layer=1,
+          learner_config=learner_config,
+          features=features)
+
+      predictions = array_ops.constant(
+          [[0.0, -1.0, 0.5, 1.2, 3.1], [1.0, 0.0, 0.8, 0.3, 1.0],
+           [0.0, 0.0, 0.0, 0.0, 1.2]],
+          dtype=dtypes.float32)
+
+      labels = array_ops.constant([[2], [2], [3]], dtype=dtypes.float32)
+      weights = array_ops.ones([batch_size, 1], dtypes.float32)
+
+      partition_ids = array_ops.zeros([batch_size], dtypes.int32)
+      ensemble_stamp = variables.Variable(
+          initial_value=0,
+          name="ensemble_stamp",
+          trainable=False,
+          dtype=dtypes.int64)
+
+      predictions_dict = {
+          "predictions": predictions,
+          "predictions_no_dropout": predictions,
+          "partition_ids": partition_ids,
+          "ensemble_stamp": ensemble_stamp,
+          "num_trees": 0,
+      }
+
+      # Create train op.
+      train_op = gbdt_model.train(
+          loss=math_ops.reduce_mean(
+              losses.per_example_maxent_loss(
+                  labels,
+                  weights,
+                  predictions,
+                  num_classes=learner_config.num_classes)[0]),
+          predictions_dict=predictions_dict,
+          labels=labels)
+      variables.global_variables_initializer().run()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # On first run, expect no splits to be chosen because the quantile
+      # buckets will not be ready.
+      train_op.run()
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+      self.assertEquals(len(output.trees), 0)
+      self.assertEquals(len(output.tree_weights), 0)
+      self.assertEquals(stamp_token.eval(), 1)
+
+      # Update the stamp to be able to run a second time.
+      sess.run([ensemble_stamp.assign_add(1)])
+      # On second run, expect a trivial split to be chosen to basically
+      # predict the average.
+      train_op.run()
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output.ParseFromString(serialized.eval())
+      self.assertEquals(len(output.trees), 1)
+      self.assertAllClose(output.tree_weights, [1])
+      self.assertEquals(stamp_token.eval(), 2)
+
+      # Leaf should have a dense vector of size 5.
+      expected = [
+          -1.29972088337, -1.38769364357, 3.42812108994, 1.02690505981,
+          -1.75405228138
+      ]
+      for i in range(learner_config.num_classes):
+        self.assertAlmostEqual(expected[i],
+                               output.trees[0].nodes[1].leaf.vector.value[i],
+                               places=2)
+
+  def testTrainFnMulticlassDiagonalHessian(self):
+    """Tests the GBDT train for multiclass diagonal hessian."""
+    with self.test_session() as sess:
+      ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0, tree_ensemble_config="", name="tree_ensemble")
+
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.learning_rate_tuner.fixed.learning_rate = 1
+      # Use full hessian multiclass strategy.
+      learner_config.multi_class_strategy = (
+          learner_pb2.LearnerConfig.DIAGONAL_HESSIAN)
+      learner_config.num_classes = 5
+      learner_config.regularization.l1 = 0
+      # To make matrix inversible.
+      learner_config.regularization.l2 = 1e-5
+      learner_config.constraints.max_tree_depth = 1
+      learner_config.constraints.min_node_weight = 0
+      batch_size = 3
+      features = {}
+      features["dense_float"] = array_ops.ones([batch_size, 1], dtypes.float32)
+
+      gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
+          is_chief=True,
+          num_ps_replicas=0,
+          center_bias=False,
+          ensemble_handle=ensemble_handle,
+          examples_per_layer=1,
+          learner_config=learner_config,
+          features=features)
+
+      predictions = array_ops.constant(
+          [[0.0, -1.0, 0.5, 1.2, 3.1], [1.0, 0.0, 0.8, 0.3, 1.0],
+           [0.0, 0.0, 0.0, 0.0, 1.2]],
+          dtype=dtypes.float32)
+
+      labels = array_ops.constant([[2], [2], [3]], dtype=dtypes.float32)
+      weights = array_ops.ones([batch_size, 1], dtypes.float32)
+
+      partition_ids = array_ops.zeros([batch_size], dtypes.int32)
+      ensemble_stamp = variables.Variable(
+          initial_value=0,
+          name="ensemble_stamp",
+          trainable=False,
+          dtype=dtypes.int64)
+
+      predictions_dict = {
+          "predictions": predictions,
+          "predictions_no_dropout": predictions,
+          "partition_ids": partition_ids,
+          "ensemble_stamp": ensemble_stamp,
+          "num_trees": 0,
+      }
+
+      # Create train op.
+      train_op = gbdt_model.train(
+          loss=math_ops.reduce_mean(
+              losses.per_example_maxent_loss(
+                  labels,
+                  weights,
+                  predictions,
+                  num_classes=learner_config.num_classes)[0]),
+          predictions_dict=predictions_dict,
+          labels=labels)
+      variables.global_variables_initializer().run()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # On first run, expect no splits to be chosen because the quantile
+      # buckets will not be ready.
+      train_op.run()
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+      self.assertEqual(len(output.trees), 0)
+      self.assertEqual(len(output.tree_weights), 0)
+      self.assertEqual(stamp_token.eval(), 1)
+
+      # Update the stamp to be able to run a second time.
+      sess.run([ensemble_stamp.assign_add(1)])
+      # On second run, expect a trivial split to be chosen to basically
+      # predict the average.
+      train_op.run()
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output.ParseFromString(serialized.eval())
+      self.assertEqual(len(output.trees), 1)
+      self.assertAllClose(output.tree_weights, [1])
+      self.assertEqual(stamp_token.eval(), 2)
+
+      # Leaf should have a dense vector of size 5.
+      expected = [
+          -1.26767396927, -1.13043296337, 4.58542203903, 1.81428349018,
+          -2.43038392067
+      ]
+      for i in range(learner_config.num_classes):
+        self.assertAlmostEqual(expected[i],
+                               output.trees[0].nodes[1].leaf.vector.value[i])
+
+  def testTrainFnMulticlassTreePerClass(self):
+    """Tests the GBDT train for multiclass tree per class strategy."""
+    with self.test_session() as sess:
+      ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0, tree_ensemble_config="", name="tree_ensemble")
+
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.learning_rate_tuner.fixed.learning_rate = 1
+      # Use full hessian multiclass strategy.
+      learner_config.multi_class_strategy = (
+          learner_pb2.LearnerConfig.TREE_PER_CLASS)
+      learner_config.num_classes = 5
+      learner_config.regularization.l1 = 0
+      # To make matrix inversible.
+      learner_config.regularization.l2 = 1e-5
+      learner_config.constraints.max_tree_depth = 1
+      learner_config.constraints.min_node_weight = 0
+      features = {
+          "dense_float": array_ops.constant(
+              [[1.0], [1.5], [2.0]], dtypes.float32),
+      }
+
+      gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
+          is_chief=True,
+          num_ps_replicas=0,
+          center_bias=False,
+          ensemble_handle=ensemble_handle,
+          examples_per_layer=1,
+          learner_config=learner_config,
+          features=features)
+
+      batch_size = 3
+      predictions = array_ops.constant(
+          [[0.0, -1.0, 0.5, 1.2, 3.1], [1.0, 0.0, 0.8, 0.3, 1.0],
+           [0.0, 0.0, 0.0, 2.0, 1.2]],
+          dtype=dtypes.float32)
+
+      labels = array_ops.constant([[2], [2], [3]], dtype=dtypes.float32)
+      weights = array_ops.ones([batch_size, 1], dtypes.float32)
+
+      partition_ids = array_ops.zeros([batch_size], dtypes.int32)
+      ensemble_stamp = variables.Variable(
+          initial_value=0,
+          name="ensemble_stamp",
+          trainable=False,
+          dtype=dtypes.int64)
+
+      predictions_dict = {
+          "predictions": predictions,
+          "predictions_no_dropout": predictions,
+          "partition_ids": partition_ids,
+          "ensemble_stamp": ensemble_stamp,
+          # This should result in a tree built for a class 2.
+          "num_trees": 13,
+      }
+
+      # Create train op.
+      train_op = gbdt_model.train(
+          loss=math_ops.reduce_mean(
+              losses.per_example_maxent_loss(
+                  labels,
+                  weights,
+                  predictions,
+                  num_classes=learner_config.num_classes)[0]),
+          predictions_dict=predictions_dict,
+          labels=labels)
+      variables.global_variables_initializer().run()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # On first run, expect no splits to be chosen because the quantile
+      # buckets will not be ready.
+      train_op.run()
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+      self.assertEqual(len(output.trees), 0)
+      self.assertEqual(len(output.tree_weights), 0)
+      self.assertEqual(stamp_token.eval(), 1)
+
+      # Update the stamp to be able to run a second time.
+      sess.run([ensemble_stamp.assign_add(1)])
+      # On second run, expect a trivial split to be chosen to basically
+      # predict the average.
+      train_op.run()
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output.ParseFromString(serialized.eval())
+      self.assertEqual(len(output.trees), 1)
+      self.assertAllClose(output.tree_weights, [1])
+      self.assertEqual(stamp_token.eval(), 2)
+
+      # One node for a split, two children nodes.
+      self.assertEqual(3, len(output.trees[0].nodes))
+
+      # Leafs will have a sparse vector for class 3.
+      self.assertEqual(1,
+                       len(output.trees[0].nodes[1].leaf.sparse_vector.index))
+      self.assertEqual(3, output.trees[0].nodes[1].leaf.sparse_vector.index[0])
+      self.assertAlmostEqual(
+          -1.13134455681, output.trees[0].nodes[1].leaf.sparse_vector.value[0])
+
+      self.assertEqual(1,
+                       len(output.trees[0].nodes[2].leaf.sparse_vector.index))
+      self.assertEqual(3, output.trees[0].nodes[2].leaf.sparse_vector.index[0])
+      self.assertAlmostEqual(
+          0.893284678459, output.trees[0].nodes[2].leaf.sparse_vector.value[0])
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/python/utils/losses.py b/tensorflow/contrib/boosted_trees/python/utils/losses.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f128b230180d8e8070f63c369bc7fc2f3d24376
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/utils/losses.py
@@ -0,0 +1,210 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Losses for Gtflow Estimator and Batch Estimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+
+
+def per_example_logistic_loss(labels, weights, predictions):
+  """Logistic loss given labels, example weights and predictions.
+
+  Args:
+    labels: Rank 2 (N, 1) tensor of per-example labels.
+    weights: Rank 2 (N, 1) tensor of per-example weights.
+    predictions: Rank 2 (N, 1) tensor of per-example predictions.
+
+  Returns:
+    loss: A Rank 2 (N, 1) tensor of per-example logistic loss.
+    update_op: An update operation to update the loss's internal state.
+  """
+  labels = math_ops.to_float(labels)
+  unweighted_loss = nn.sigmoid_cross_entropy_with_logits(
+      labels=labels, logits=predictions)
+  return unweighted_loss * weights, control_flow_ops.no_op()
+
+
+# This is classical form of Maximum entropy loss, that is twice differentiable
+# (sparse_softmax_cross_entropy which is what we go for is not twice
+# differentiable).
+def per_example_maxent_loss(labels, weights, logits, num_classes, eps=1e-15):
+  """Maximum entropy loss for multiclass problems.
+
+  Maximum entropy is a generalization of logistic loss for the case when more
+  than 2 classes are present.
+
+  Args:
+    labels: Rank 2 (N, 1) or Rank 1 (N) tensor of per-example labels.
+    weights: Rank 2 (N, 1) tensor of per-example weights.
+    logits: Rank 2 (N, K) tensor of per-example predictions, K - num of
+    classes.
+    num_classes: number of classes in classification task. Used to expand label
+    indices into one-hot encodings.
+    eps: tolerance, used as a minimum possible value.
+
+  Returns:
+    loss: A Rank 2 (N, 1) tensor of per-example maxent loss
+    update_op: An update operation to update the loss's internal state.
+  """
+  labels = math_ops.to_int64(labels)
+  # If labels are of rank 1, make them rank 2.
+  labels_shape = labels.get_shape()
+  if len(labels_shape) != 2:
+    labels = array_ops.expand_dims(labels, 1)
+  # Labels are indices of classes, convert them to one hot encodings.
+  target_one_hot = array_ops.one_hot(indices=labels, depth=num_classes)
+  labels = math_ops.reduce_sum(
+      input_tensor=target_one_hot, reduction_indices=[1])
+  labels = math_ops.to_float(labels)
+
+  # Calculate softmax probabilities for each class.
+  unnormalized_probs = math_ops.exp(logits)
+  normalizers = math_ops.reduce_sum(unnormalized_probs, 1, keep_dims=True)
+  softmax_predictions = math_ops.divide(unnormalized_probs,
+                                        math_ops.add(normalizers, eps))
+
+  # Pull out the probabilities for real label.
+  probs_for_real_class = math_ops.reduce_sum(labels * softmax_predictions, 1)
+
+  # Add handling for values near 0 and 1.
+  zeros = array_ops.zeros_like(probs_for_real_class, dtype=logits.dtype) + eps
+  one_minus_eps = array_ops.ones_like(
+      probs_for_real_class, dtype=logits.dtype) - eps
+
+  # Take maximum(eps, pred)
+  cond = (probs_for_real_class >= eps)
+  probs_for_real_class = array_ops.where(cond, probs_for_real_class, zeros)
+
+  # Take minimum(1-eps, pred)
+  cond = (probs_for_real_class <= 1 - eps)
+  probs_for_real_class = array_ops.where(cond, probs_for_real_class,
+                                         one_minus_eps)
+
+  unweighted_loss = array_ops.expand_dims(-math_ops.log(probs_for_real_class),
+                                          1)
+  return unweighted_loss * weights, control_flow_ops.no_op()
+
+
+def per_example_squared_loss(labels, weights, predictions):
+  """Squared loss given labels, example weights and predictions.
+
+  Args:
+    labels: Rank 2 (N, D) tensor of per-example labels.
+    weights: Rank 2 (N, 1) tensor of per-example weights.
+    predictions: Rank 2 (N, D) tensor of per-example predictions.
+
+  Returns:
+    loss: A Rank 2 (N, 1) tensor of per-example squared loss.
+    update_op: An update operation to update the loss's internal state.
+  """
+  unweighted_loss = math_ops.reduce_sum(
+      math_ops.square(predictions - labels), 1, keep_dims=True)
+
+  return unweighted_loss * weights, control_flow_ops.no_op()
+
+
+def per_example_exp_loss(labels, weights, predictions, name=None, eps=0.1):
+  """Exponential loss given labels, example weights and predictions.
+
+  Note that this is only for binary classification.
+  If logistic loss tries to make sure that the classifier is certain of its
+  predictions, exp loss says: "as long as it got it correct, even barely, i
+  don't care". Can be used on noisy data, or when you don't care about getting
+  the actual probabilities from the model, just the correct label.
+
+  The loss returns is exp(-targets*modified_predictions), where
+  modified_predictions are 1 if sigmoid is >= 0.5+eps (eg we predict positive
+  class), -1 if sigmoid < 0.5-eps (e.g. we predict negative class) and ax+b in
+  the interval 0.5-eps, 0.5+eps, where a = 1/eps, b=1/(2eps).
+
+  Args:
+    labels: Rank 2 (N, D) tensor of per-example labels.
+    weights: Rank 2 (N, 1) tensor of per-example weights.
+    predictions: Rank 2 (N, D) tensor of per-example predictions.
+    name: A name for the operation (optional).
+    eps: For the range (0.5-eps, 0.5+eps) we set the predictions to be ax+b.
+
+  Returns:
+    loss: A Rank 2 (N, 1) tensor of per-example exp loss
+    update_op: An update operation to update the loss's internal state.
+  """
+
+  def exp_with_logits(name, eps, labels=None, logits=None):
+    """Computes exponential loss given `logits`.
+
+    The loss returns is exp(-targets*modified_predictions), where
+    modified_predictions are 1 if sigmoid is >= 0.5+eps (eg we predict positive
+    class), -1 if sigmoid < 0.5-eps (e.g. we predict negative class) and ax+b in
+    the interval 0.5-eps, 0.5+eps, where a = 1/eps, b=1/(2eps).
+
+    Args:
+      name: A name for the operation (optional).
+      eps: For the range (0.5-eps, 0.5+eps) we set the predictions to be ax+b.
+      labels: A `Tensor` of the same type and shape as `logits`.
+      logits: A `Tensor` of type `float32` or `float64`.
+
+    Returns:
+      A `Tensor` of the same shape as `logits` with the componentwise
+      exponential losses.
+
+    Raises:
+      ValueError: If `logits` and `labels` do not have the same shape.
+    """
+    with ops.name_scope(name, "exp_loss", [logits, labels]) as name:
+      logits = ops.convert_to_tensor(logits, name="logits")
+      labels = ops.convert_to_tensor(labels, name="labels")
+      try:
+        labels.get_shape().merge_with(logits.get_shape())
+      except ValueError:
+        raise ValueError("logits and labels must have the same shape (%s vs %s)"
+                         % (logits.get_shape(), labels.get_shape()))
+
+    # Default threshold to switch between classes
+    zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
+    ones = array_ops.ones_like(logits, dtype=logits.dtype)
+    neg_ones = -array_ops.ones_like(logits, dtype=logits.dtype)
+
+    # Convert labels to 1 and -1
+    cond_labels = (labels > zeros)
+    labels_converted = array_ops.where(cond_labels, ones, neg_ones)
+
+    # Convert predictions to 1 and -1
+    # The loss we build is min(1, max(-1,ax+b))
+    # where a=1/eps, b=-1/2eps.
+
+    a = 1.0 / eps
+    b = -1.0 / 2 / eps
+    probs = math_ops.sigmoid(logits)
+    y = a * probs + b
+    # Build max(-1, ax+b)
+    cond = (y < -1)
+    max_res = array_ops.where(cond, neg_ones, y)
+    # Build min part
+    cond = (max_res > 1)
+    min_res = array_ops.where(cond, ones, max_res)
+    preds_converted = min_res
+    return math_ops.exp(-preds_converted * labels_converted)
+
+  labels = math_ops.to_float(labels)
+  unweighted_loss = exp_with_logits(
+      name=name, eps=eps, labels=labels, logits=predictions)
+  return unweighted_loss * weights, control_flow_ops.no_op()
diff --git a/tensorflow/contrib/boosted_trees/python/utils/losses_test.py b/tensorflow/contrib/boosted_trees/python/utils/losses_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b8d8ac43eda2f270933a3cbc12565ca425255b4
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/utils/losses_test.py
@@ -0,0 +1,106 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for trainer hooks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import numpy as np
+
+from tensorflow.contrib.boosted_trees.python.utils import losses
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import googletest
+
+
+class LossesTest(test_util.TensorFlowTestCase):
+
+  def test_per_example_exp_loss(self):
+
+    def _logit(p):
+      return np.log(p) - np.log(1 - p)
+
+    labels_positive = array_ops.ones([10, 1], dtypes.float32)
+    weights = array_ops.ones([10, 1], dtypes.float32)
+    labels_negative = array_ops.zeros([10, 1], dtypes.float32)
+    predictions_probs = np.array(
+        [[0.1], [0.2], [0.3], [0.4], [0.5], [0.6], [0.7], [0.8], [0.9], [0.99]],
+        dtype=np.float32)
+    prediction_logits = _logit(predictions_probs)
+
+    eps = 0.2
+
+    with self.test_session():
+      predictions_tensor = constant_op.constant(
+          prediction_logits, dtype=dtypes.float32)
+      loss_for_positives, _ = losses.per_example_exp_loss(
+          labels_positive, weights, predictions_tensor, eps=eps)
+
+      loss_for_negatives, _ = losses.per_example_exp_loss(
+          labels_negative, weights, predictions_tensor, eps=eps)
+
+      pos_loss = loss_for_positives.eval()
+      neg_loss = loss_for_negatives.eval()
+      # For positive labels, points <= 0.3 get max loss of e.
+      # For negative labels, these points have minimum loss  of 1/e.
+      for i in range(2):
+        self.assertEqual(math.exp(1), pos_loss[i])
+        self.assertEqual(math.exp(-1), neg_loss[i])
+
+      # For positive lables, p oints with predictions 0.7 and larger get minimum
+      # loss value of 1/e. For negative labels, these points are wrongly
+      # classified and get loss e.
+      for i in range(6, 10):
+        self.assertEqual(math.exp(-1), pos_loss[i])
+        self.assertEqual(math.exp(1), neg_loss[i])
+
+      # Points in between 0.5-eps, 0..5+eps get loss exp(-label_m*y), where
+      # y = 1/eps *x -1/(2eps), where x is the probability and label_m is either
+      # 1 or -1 (for label of 0).
+      for i in range(2, 6):
+        self.assertAlmostEqual(
+            math.exp(-1.0 * (predictions_probs[i] * 1.0 / eps - 0.5 / eps)),
+            pos_loss[i])
+        self.assertAlmostEqual(
+            math.exp(1.0 * (predictions_probs[i] * 1.0 / eps - 0.5 / eps)),
+            neg_loss[i])
+
+  def test_per_example_squared_loss(self):
+
+    def _squared_loss(p, y):
+      return np.mean(1.0 * (p - y) * (p - y))
+
+    labels = np.array([[0.123], [224.2], [-3], [2], [.3]], dtype=np.float32)
+    weights = array_ops.ones([5, 1], dtypes.float32)
+    predictions = np.array(
+        [[0.123], [23.2], [233], [52], [3]], dtype=np.float32)
+
+    with self.test_session():
+      loss_tensor, _ = losses.per_example_squared_loss(labels, weights,
+                                                       predictions)
+
+      loss = loss_tensor.eval()
+      for i in range(5):
+        self.assertAlmostEqual(
+            _squared_loss(labels[i], predictions[i]), loss[i])
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/resources/BUILD b/tensorflow/contrib/boosted_trees/resources/BUILD
index 5dfdf8f48967efd989f969076508d815969d7a04..9fc101612f1e2a6bf6c5d86ea8c7199936dbb069 100644
--- a/tensorflow/contrib/boosted_trees/resources/BUILD
+++ b/tensorflow/contrib/boosted_trees/resources/BUILD
@@ -37,7 +37,6 @@ cc_library(
         "//tensorflow/contrib/boosted_trees/lib:weighted_quantiles",
         "//tensorflow/contrib/boosted_trees/proto:quantiles_proto_cc",
         "//tensorflow/core:framework_headers_lib",
-        "//third_party/eigen3",
     ],
 )
 
@@ -49,5 +48,4 @@ cc_library(
         "//tensorflow/contrib/boosted_trees/lib:trees",
         "//tensorflow/core:framework_headers_lib",
     ],
-    alwayslink = 1,
 )
diff --git a/tensorflow/contrib/cloud/BUILD b/tensorflow/contrib/cloud/BUILD
index 840997223fb079f686a2c15077708efd3d7b5cd8..eec2beddc487d67171ea43b0e46e7c8f7c11a4f3 100644
--- a/tensorflow/contrib/cloud/BUILD
+++ b/tensorflow/contrib/cloud/BUILD
@@ -49,9 +49,9 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":gen_bigquery_reader_ops",
-        "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:platform",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:util",
     ],
 )
 
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_reader_ops.cc b/tensorflow/contrib/cloud/kernels/bigquery_reader_ops.cc
index 093000559b7f2d8a5ab500f7f54213ca841946fb..b0f9237ea27fb1736eba66aeb4d1fd8db374515b 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_reader_ops.cc
+++ b/tensorflow/contrib/cloud/kernels/bigquery_reader_ops.cc
@@ -60,7 +60,7 @@ class BigQueryReader : public ReaderBase {
     BigQueryTablePartition partition;
     if (!partition.ParseFromString(current_work())) {
       return errors::InvalidArgument(
-          "Could not parse work as as valid partition.");
+          "Could not parse work as valid partition.");
     }
     TF_RETURN_IF_ERROR(bigquery_table_accessor_->SetPartition(partition));
     return Status::OK();
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
index 4851d485cc60920731dc8f46a61027bcfa6aeab0..b31b882fa19a7eaad304d6d423961234f9affef4 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
@@ -28,7 +28,7 @@ constexpr char kTestProject[] = "test-project";
 constexpr char kTestDataset[] = "test-dataset";
 constexpr char kTestTable[] = "test-table";
 
-static bool HasSubstr(const string& base, const string& substr) {
+bool HasSubstr(const string& base, const string& substr) {
   bool ok = StringPiece(base).contains(substr);
   EXPECT_TRUE(ok) << base << ", expected substring " << substr;
   return ok;
@@ -42,8 +42,8 @@ class FakeAuthProvider : public AuthProvider {
   }
 };
 
-static string DeterministicSerialization(const tensorflow::Example& example) {
-  const int size = example.ByteSize();
+string DeterministicSerialization(const tensorflow::Example& example) {
+  const std::size_t size = example.ByteSizeLong();
   string result(size, '\0');
   ::tensorflow::protobuf::io::ArrayOutputStream array_stream(
       gtl::string_as_array(&result), size);
diff --git a/tensorflow/contrib/cloud/python/ops/bigquery_reader_ops.py b/tensorflow/contrib/cloud/python/ops/bigquery_reader_ops.py
index 136707da18afbc8878cc00c32db2c5142b2f06db..cc8644bfd503ef3fb5fa305bf98a70f02ccb0c85 100644
--- a/tensorflow/contrib/cloud/python/ops/bigquery_reader_ops.py
+++ b/tensorflow/contrib/cloud/python/ops/bigquery_reader_ops.py
@@ -92,7 +92,7 @@ class BigQueryReader(io_ops.ReaderBase):
 
     Raises:
       TypeError: - If features is neither None nor a dict or
-                 - If columns is is neither None nor a list or
+                 - If columns is neither None nor a list or
                  - If both features and columns are None or set.
     """
     if (features is None) == (columns is None):
diff --git a/tensorflow/contrib/cluster_resolver/BUILD b/tensorflow/contrib/cluster_resolver/BUILD
index 6792ebd615c2c90a7d93026a43592d2d56841864..ece5b9c04c72261d9d89487742f9ea948914322b 100644
--- a/tensorflow/contrib/cluster_resolver/BUILD
+++ b/tensorflow/contrib/cluster_resolver/BUILD
@@ -21,15 +21,36 @@ filegroup(
     ),
 )
 
+py_library(
+    name = "cluster_resolver_pip",
+    srcs = ["python/training/__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cluster_resolver_py",
+        ":gce_cluster_resolver_py",
+    ],
+)
+
 py_library(
     name = "cluster_resolver_py",
     srcs = [
-        "python/training/__init__.py",
         "python/training/cluster_resolver.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:framework",
+        "//tensorflow/python:training",
+    ],
+)
+
+py_library(
+    name = "gce_cluster_resolver_py",
+    srcs = [
+        "python/training/gce_cluster_resolver.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cluster_resolver_py",
+        "//tensorflow/python:training",
     ],
 )
 
@@ -46,3 +67,17 @@ tf_py_test(
     ],
     main = "python/training/cluster_resolver_test.py",
 )
+
+tf_py_test(
+    name = "gce_cluster_resolver_py_test",
+    size = "small",
+    srcs = ["python/training/gce_cluster_resolver_test.py"],
+    additional_deps = [
+        ":gce_cluster_resolver_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+    main = "python/training/gce_cluster_resolver_test.py",
+)
diff --git a/tensorflow/contrib/cluster_resolver/python/training/__init__.py b/tensorflow/contrib/cluster_resolver/python/training/__init__.py
index 3520467bc6a9874bf9f0fdb077443c64bbe075b7..fbf7ca3a5d6e1b69ca3d831966739b82af08f10d 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/__init__.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/__init__.py
@@ -21,3 +21,4 @@ from __future__ import print_function
 from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver
 from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import SimpleClusterResolver
 from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import UnionClusterResolver
+from tensorflow.contrib.cluster_resolver.python.training.gce_cluster_resolver import GceClusterResolver
diff --git a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py
index 87da24f22d8e972544cd601ea3a9caafc85bb5ad..b04822fa9d66465e34a545d3b00c399bbb196514 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py
@@ -75,8 +75,8 @@ class UnionClusterResolver(ClusterResolver):
 
   This class performs a union given two or more existing ClusterResolvers. It
   merges the underlying ClusterResolvers, and returns one unified ClusterSpec
-  when as_cluster_spec is called. The details of the merge function is
-  documented in the as_cluster_spec function.
+  when cluster_spec is called. The details of the merge function is
+  documented in the cluster_spec function.
   """
 
   def __init__(self, *args):
diff --git a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..2603d59920b0c6e0de9b9191a65e7c6ce6c28fbe
--- /dev/null
+++ b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
@@ -0,0 +1,129 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of Cluster Resolvers for GCE Instance Groups."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver
+from tensorflow.python.training.server_lib import ClusterSpec
+
+_GOOGLE_API_CLIENT_INSTALLED = True
+try:
+  from googleapiclient import discovery  # pylint: disable=g-import-not-at-top
+except ImportError:
+  _GOOGLE_API_CLIENT_INSTALLED = False
+
+
+class GceClusterResolver(ClusterResolver):
+  """Cluster Resolver for Google Compute Engine.
+
+  This is an implementation of cluster resolvers for the Google Compute Engine
+  instance group platform. By specifying a project, zone, and instance group,
+  this will retrieve the IP address of all the instances within the instance
+  group and return a Cluster Resolver object suitable for use for distributed
+  TensorFlow.
+  """
+
+  def __init__(self,
+               project,
+               zone,
+               instance_group,
+               port,
+               job_name='worker',
+               credentials=None,
+               service=None):
+    """Creates a new GceClusterResolver object.
+
+    This takes in a few parameters and creates a GceClusterResolver project. It
+    will then use these parameters to query the GCE API for the IP addresses of
+    each instance in the instance group.
+
+    Args:
+      project: Name of the GCE project
+      zone: Zone of the GCE instance group
+      instance_group: Name of the GCE instance group
+      port: Port of the listening TensorFlow server (default: 8470)
+      job_name: Name of the TensorFlow job this set of instances belongs to
+      credentials: GCE Credentials. This defaults to
+        GoogleCredentials.get_application_default()
+      service: The GCE API object returned by the googleapiclient.discovery
+        function. (Default: discovery.build('compute', 'v1')). If you specify a
+        custom service object, then the credentials parameter will be ignored.
+
+    Raises:
+      ImportError: If the googleapiclient is not installed.
+    """
+    self._project = project
+    self._zone = zone
+    self._instance_group = instance_group
+    self._job_name = job_name
+    self._port = port
+    if service is None:
+      if _GOOGLE_API_CLIENT_INSTALLED is True:
+        self._service = discovery.build('compute', 'v1',
+                                        credentials=credentials)
+      else:
+        raise ImportError('googleapiclient must be installed before using the '
+                          'GCE cluster resolver')
+    else:
+      self._service = service
+
+  def cluster_spec(self):
+    """Returns a ClusterSpec object based on the latest instance group info.
+
+    This returns a ClusterSpec object for use based on information from the
+    specified instance group. We will retrieve the information from the GCE APIs
+    every time this method is called.
+
+    Returns:
+      A ClusterSpec containing host information retrieved from GCE.
+    """
+    request_body = {'instanceState': 'RUNNING'}
+    request = self._service.instanceGroups().listInstances(
+        project=self._project,
+        zone=self._zone,
+        instanceGroups=self._instance_group,
+        body=request_body,
+        orderBy='name')
+
+    worker_list = []
+
+    while request is not None:
+      response = request.execute()
+
+      items = response['items']
+      for instance in items:
+        instance_name = instance['instance'].split('/')[-1]
+
+        instance_request = self._service.instances().get(
+            project=self._project,
+            zone=self._zone,
+            instance=instance_name)
+
+        if instance_request is not None:
+          instance_details = instance_request.execute()
+          ip_address = instance_details['networkInterfaces'][0]['networkIP']
+          instance_url = '%s:%s' % (ip_address, self._port)
+          worker_list.append(instance_url)
+
+      request = self._service.instanceGroups().listInstances_next(
+          previous_request=request,
+          previous_response=response)
+
+    worker_list.sort()
+    return ClusterSpec({self._job_name: worker_list})
diff --git a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2deacbc2609825337f2e8ab29f95d14e2edcc59
--- /dev/null
+++ b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver_test.py
@@ -0,0 +1,234 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for GceClusterResolver."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import UnionClusterResolver
+from tensorflow.contrib.cluster_resolver.python.training.gce_cluster_resolver import GceClusterResolver
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+
+
+mock = test.mock
+
+
+class GceClusterResolverTest(test.TestCase):
+
+  def _verifyClusterSpecEquality(self, cluster_spec, expected_proto):
+    self.assertProtoEquals(expected_proto, cluster_spec.as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto, server_lib.ClusterSpec(cluster_spec).as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto,
+        server_lib.ClusterSpec(cluster_spec.as_cluster_def()).as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto,
+        server_lib.ClusterSpec(cluster_spec.as_dict()).as_cluster_def())
+
+  def standard_mock_instance_groups(self, instance_map=None):
+    if instance_map is None:
+      instance_map = [
+          {'instance': 'https://gce.example.com/res/gce-instance-1'}
+      ]
+
+    mock_instance_group_request = mock.MagicMock()
+    mock_instance_group_request.execute.return_value = {
+        'items': instance_map
+    }
+
+    service_attrs = {
+        'listInstances.return_value': mock_instance_group_request,
+        'listInstances_next.return_value': None,
+    }
+    mock_instance_groups = mock.Mock(**service_attrs)
+    return mock_instance_groups
+
+  def standard_mock_instances(self, instance_to_ip_map=None):
+    if instance_to_ip_map is None:
+      instance_to_ip_map = {
+          'gce-instance-1': '10.123.45.67'
+      }
+
+    mock_get_request = mock.MagicMock()
+    mock_get_request.execute.return_value = {
+        'networkInterfaces': [
+            {'networkIP': '10.123.45.67'}
+        ]
+    }
+
+    def get_side_effect(project, zone, instance):
+      del project, zone  # Unused
+
+      if instance in instance_to_ip_map:
+        mock_get_request = mock.MagicMock()
+        mock_get_request.execute.return_value = {
+            'networkInterfaces': [
+                {'networkIP': instance_to_ip_map[instance]}
+            ]
+        }
+        return mock_get_request
+      else:
+        raise RuntimeError('Instance %s not found!' % instance)
+
+    service_attrs = {
+        'get.side_effect': get_side_effect,
+    }
+    mock_instances = mock.MagicMock(**service_attrs)
+    return mock_instances
+
+  def standard_mock_service_client(
+      self,
+      mock_instance_groups=None,
+      mock_instances=None):
+
+    if mock_instance_groups is None:
+      mock_instance_groups = self.standard_mock_instance_groups()
+    if mock_instances is None:
+      mock_instances = self.standard_mock_instances()
+
+    mock_client = mock.MagicMock()
+    mock_client.instanceGroups.return_value = mock_instance_groups
+    mock_client.instances.return_value = mock_instances
+    return mock_client
+
+  def gen_standard_mock_service_client(self, instances=None):
+    name_to_ip = {}
+    instance_list = []
+    for instance in instances:
+      name_to_ip[instance['name']] = instance['ip']
+      instance_list.append({
+          'instance': 'https://gce.example.com/gce/res/' + instance['name']
+      })
+
+    mock_instance = self.standard_mock_instances(name_to_ip)
+    mock_instance_group = self.standard_mock_instance_groups(instance_list)
+
+    return self.standard_mock_service_client(mock_instance_group, mock_instance)
+
+  def testSimpleSuccessfulRetrieval(self):
+    gce_cluster_resolver = GceClusterResolver(
+        project='test-project',
+        zone='us-east1-d',
+        instance_group='test-instance-group',
+        port=8470,
+        service=self.standard_mock_service_client())
+
+    actual_cluster_spec = gce_cluster_resolver.cluster_spec()
+    expected_proto = """
+    job { name: 'worker' tasks { key: 0 value: '10.123.45.67:8470' } }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+
+  def testCustomJobNameAndPortRetrieval(self):
+    gce_cluster_resolver = GceClusterResolver(
+        project='test-project',
+        zone='us-east1-d',
+        instance_group='test-instance-group',
+        job_name='custom',
+        port=2222,
+        service=self.standard_mock_service_client())
+
+    actual_cluster_spec = gce_cluster_resolver.cluster_spec()
+    expected_proto = """
+    job { name: 'custom' tasks { key: 0 value: '10.123.45.67:2222' } }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+
+  def testMultipleInstancesRetrieval(self):
+    name_to_ip = [
+        {'name': 'instance1', 'ip': '10.1.2.3'},
+        {'name': 'instance2', 'ip': '10.2.3.4'},
+        {'name': 'instance3', 'ip': '10.3.4.5'},
+    ]
+
+    gce_cluster_resolver = GceClusterResolver(
+        project='test-project',
+        zone='us-east1-d',
+        instance_group='test-instance-group',
+        port=8470,
+        service=self.gen_standard_mock_service_client(name_to_ip))
+
+    actual_cluster_spec = gce_cluster_resolver.cluster_spec()
+    expected_proto = """
+    job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' }
+                         tasks { key: 1 value: '10.2.3.4:8470' }
+                         tasks { key: 2 value: '10.3.4.5:8470' } }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+
+  def testUnionMultipleInstanceRetrieval(self):
+    worker1_name_to_ip = [
+        {'name': 'instance1', 'ip': '10.1.2.3'},
+        {'name': 'instance2', 'ip': '10.2.3.4'},
+        {'name': 'instance3', 'ip': '10.3.4.5'},
+    ]
+
+    worker2_name_to_ip = [
+        {'name': 'instance4', 'ip': '10.4.5.6'},
+        {'name': 'instance5', 'ip': '10.5.6.7'},
+        {'name': 'instance6', 'ip': '10.6.7.8'},
+    ]
+
+    ps_name_to_ip = [
+        {'name': 'ps1', 'ip': '10.100.1.2'},
+        {'name': 'ps2', 'ip': '10.100.2.3'},
+    ]
+
+    worker1_gce_cluster_resolver = GceClusterResolver(
+        project='test-project',
+        zone='us-east1-d',
+        instance_group='test-instance-group',
+        job_name='worker',
+        port=8470,
+        service=self.gen_standard_mock_service_client(worker1_name_to_ip))
+
+    worker2_gce_cluster_resolver = GceClusterResolver(
+        project='test-project',
+        zone='us-east1-d',
+        instance_group='test-instance-group',
+        job_name='worker',
+        port=8470,
+        service=self.gen_standard_mock_service_client(worker2_name_to_ip))
+
+    ps_gce_cluster_resolver = GceClusterResolver(
+        project='test-project',
+        zone='us-east1-d',
+        instance_group='test-instance-group',
+        job_name='ps',
+        port=2222,
+        service=self.gen_standard_mock_service_client(ps_name_to_ip))
+
+    union_cluster_resolver = UnionClusterResolver(worker1_gce_cluster_resolver,
+                                                  worker2_gce_cluster_resolver,
+                                                  ps_gce_cluster_resolver)
+
+    actual_cluster_spec = union_cluster_resolver.cluster_spec()
+    expected_proto = """
+    job { name: 'ps' tasks { key: 0 value: '10.100.1.2:2222' }
+                     tasks { key: 1 value: '10.100.2.3:2222' } }
+    job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' }
+                         tasks { key: 1 value: '10.2.3.4:8470' }
+                         tasks { key: 2 value: '10.3.4.5:8470' }
+                         tasks { key: 3 value: '10.4.5.6:8470' }
+                         tasks { key: 4 value: '10.5.6.7:8470' }
+                         tasks { key: 5 value: '10.6.7.8:8470' } }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 9ffe08eded82963052eda56394a650598ea88669..83c82c75beaae1b291d0a83e28751c0da2a81ced 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -74,7 +74,7 @@ if(WIN32)
   set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} /ignore:4049 /ignore:4197 /ignore:4217 /ignore:4221")
   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /ignore:4049 /ignore:4197 /ignore:4217 /ignore:4221")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
-  set(CMAKE_CXX_FLAGS_DEBUG "/D_DEBUG /MDd /Ob0")
+  set(CMAKE_CXX_FLAGS_DEBUG "/D_DEBUG /MDd /Ob2")
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /D_ITERATOR_DEBUG_LEVEL=0")
   set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /D_ITERATOR_DEBUG_LEVEL=0")
   set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /D_ITERATOR_DEBUG_LEVEL=0")
@@ -121,6 +121,8 @@ include(farmhash)
 include(fft2d)
 include(highwayhash)
 include(protobuf)
+include(re2)
+include(cub)
 if (tensorflow_BUILD_CC_TESTS)
   include(googletest)
 endif()
@@ -136,6 +138,7 @@ set(tensorflow_EXTERNAL_LIBRARIES
     ${fft2d_STATIC_LIBRARIES}
     ${highwayhash_STATIC_LIBRARIES}
     ${protobuf_STATIC_LIBRARIES}
+    ${re2_STATIC_LIBRARIES}
 )
 set(tensorflow_EXTERNAL_DEPENDENCIES
     zlib_copy_headers_to_destination
@@ -149,7 +152,9 @@ set(tensorflow_EXTERNAL_DEPENDENCIES
     protobuf
     eigen
     gemmlowp
+    cub
     fft2d
+    re2
 )
 
 include_directories(
@@ -167,7 +172,9 @@ include_directories(
     ${jsoncpp_INCLUDE_DIR}
     ${farmhash_INCLUDE_DIR}
     ${highwayhash_INCLUDE_DIR}
+    ${cub_INCLUDE_DIR}
     ${PROTOBUF_INCLUDE_DIRS}
+    ${re2_INCLUDE_DIR}
 )
 
 if(tensorflow_ENABLE_SSL_SUPPORT)
@@ -271,6 +278,7 @@ endif()
 include(tf_cc_ops.cmake)
 include(tf_c.cmake)
 include(tf_grappler.cmake)
+include(tf_core_profiler.cmake)
 if(tensorflow_BUILD_CC_EXAMPLE)
   include(tf_tutorials.cmake)
   include(tf_label_image_example.cmake)
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index 664d0f4b6b09bcfccd17dccae662f6112d1d7d17..8ad852755914a2fc67e41c8da921127d22260502 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -225,7 +225,7 @@ Step-by-step Windows build
 
    * `-Dtensorflow_ENABLE_GPU=(ON|OFF)`. Defaults to `OFF`. Include
      GPU support. If GPU is enabled you need to install the CUDA 8.0 Toolkit and CUDNN 5.1.
-     CMake will expect the location of CUDNN in -DCUDNN_HOME=path_you_unziped_cudnn.
+     CMake will expect the location of CUDNN in -DCUDNN_HOME=path_you_unzipped_cudnn.
 
    * `-Dtensorflow_BUILD_CC_TESTS=(ON|OFF)`. Defaults to `OFF`. This builds cc unit tests.
      There are many of them and building will take a few hours.
diff --git a/tensorflow/contrib/cmake/external/cub.cmake b/tensorflow/contrib/cmake/external/cub.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..1119f0ccba4102f7d53a34afa524bfad8228c24b
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/cub.cmake
@@ -0,0 +1,30 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+include (ExternalProject)
+
+set(cub_URL https://github.com/NVlabs/cub/archive/1.6.4.zip)
+set(cub_HASH SHA256=966d0c4f41e2bdc81aebf9ccfbf0baffaac5a74f00b826b06f4dee79b2bb8cee)
+set(cub_BUILD ${CMAKE_CURRENT_BINARY_DIR}/cub/src/cub)
+set(cub_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cub/src/cub)
+set(cub_ARCHIVE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/cub_archive)
+
+ExternalProject_Add(cub
+    PREFIX cub
+    URL ${cub_URL}
+    URL_HASH ${cub_HASH}
+    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+    BUILD_IN_SOURCE 1
+    PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/cub/CMakeLists.txt ${cub_BUILD}
+    INSTALL_COMMAND  ${CMAKE_COMMAND} -E copy_directory  ${cub_INCLUDE_DIR}/cub ${cub_ARCHIVE_DIR}/cub)
diff --git a/tensorflow/contrib/cmake/external/fft2d.cmake b/tensorflow/contrib/cmake/external/fft2d.cmake
index 85f77e9879fd3e9b9f270f5946fae66c769b466f..a35c24e9e01101f837ba961c06429c981ddc4648 100644
--- a/tensorflow/contrib/cmake/external/fft2d.cmake
+++ b/tensorflow/contrib/cmake/external/fft2d.cmake
@@ -45,7 +45,7 @@ else()
       DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
       BUILD_IN_SOURCE 1
       PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/fft2d/CMakeLists.txt ${fft2d_BUILD}/src/fft2d/CMakeLists.txt
-      INSTALL_DIR $(fft2d_INSTALL)
+      INSTALL_DIR ${fft2d_INSTALL}
       INSTALL_COMMAND echo
       BUILD_COMMAND $(MAKE))
     
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index d7201680ceb9984598bc45df01d6195e4e1ca897..0740c38dd3e1e8d360b68924760eb03e89e7849a 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -17,7 +17,7 @@ include (ExternalProject)
 set(GRPC_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/include)
 set(GRPC_URL https://github.com/grpc/grpc.git)
 set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
-set(GRPC_TAG 3bc78cd0b5bd784a235c01612d634b1ec5f8fb97)
+set(GRPC_TAG 781fd6f6ea03645a520cd5c675da67ab61f87e4b)
 
 if(WIN32)
   set(grpc_STATIC_LIBRARIES
@@ -38,7 +38,10 @@ ExternalProject_Add(grpc
     GIT_TAG ${GRPC_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
+    # TODO(jhseu): Remove this PATCH_COMMAND once grpc removes the dependency
+    # on "grpc" from the "grpc++_unsecure" rule.
     PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/grpc/CMakeLists.txt ${GRPC_BUILD}
+    BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target grpc++_unsecure
     INSTALL_COMMAND ""
     CMAKE_CACHE_ARGS
         -DCMAKE_BUILD_TYPE:STRING=Release
@@ -46,5 +49,13 @@ ExternalProject_Add(grpc
         -DPROTOBUF_INCLUDE_DIRS:STRING=${PROTOBUF_INCLUDE_DIRS}
         -DPROTOBUF_LIBRARIES:STRING=${protobuf_STATIC_LIBRARIES}
         -DZLIB_ROOT:STRING=${ZLIB_INSTALL}
+	-DgRPC_SSL_PROVIDER:STRING=NONE
 )
 
+# grpc/src/core/ext/census/tracing.c depends on the existence of openssl/rand.h.
+ExternalProject_Add_Step(grpc copy_rand
+    COMMAND ${CMAKE_COMMAND} -E copy
+    ${CMAKE_SOURCE_DIR}/patches/grpc/rand.h ${GRPC_BUILD}/include/openssl/rand.h
+    DEPENDEES patch
+    DEPENDERS build
+)
diff --git a/tensorflow/contrib/cmake/external/re2.cmake b/tensorflow/contrib/cmake/external/re2.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..cb4ec9c2de3388ef918c75d842dab6e1f4ffee9b
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/re2.cmake
@@ -0,0 +1,44 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+include (ExternalProject)
+
+set(re2_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/re2/install/include)
+set(re2_URL https://github.com/google/re2)
+set(re2_BUILD ${CMAKE_CURRENT_BINARY_DIR}/re2/src/re2)
+set(re2_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/re2/install)
+set(re2_TAG e7efc48)
+
+if(WIN32)
+  set(re2_STATIC_LIBRARIES ${re2_BUILD}/$(Configuration)/re2.lib)
+else()
+  set(re2_STATIC_LIBRARIES ${re2_BUILD}/libre2.a)
+endif()
+
+set(re2_HEADERS
+    ${re2_INSTALL}/include/re2/re2.h
+)
+
+ExternalProject_Add(re2
+    PREFIX re2
+    GIT_REPOSITORY ${re2_URL}
+    GIT_TAG ${re2_TAG}
+    INSTALL_DIR ${re2_INSTALL}
+    BUILD_IN_SOURCE 1
+    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+    CMAKE_CACHE_ARGS
+        -DCMAKE_BUILD_TYPE:STRING=Release
+        -DCMAKE_INSTALL_PREFIX:STRING=${re2_INSTALL}
+        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+)
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/patches/cub/CMakeLists.txt b/tensorflow/contrib/cmake/patches/cub/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..36890f0ce6c814ba99b0c534861edd0f9dca6bd1
--- /dev/null
+++ b/tensorflow/contrib/cmake/patches/cub/CMakeLists.txt
@@ -0,0 +1,3 @@
+cmake_minimum_required(VERSION 2.8.3)
+
+project(cub)
diff --git a/tensorflow/contrib/cmake/patches/grpc/CMakeLists.txt b/tensorflow/contrib/cmake/patches/grpc/CMakeLists.txt
index ce8a0cb72c74a3d4904dfc6d625ce5c75e831f51..84722c5ca2a9f9253c7a76dd610dde615a176c07 100644
--- a/tensorflow/contrib/cmake/patches/grpc/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/patches/grpc/CMakeLists.txt
@@ -1,61 +1,780 @@
-# GRPC global cmake file, modified for the TensorFlow build system.
+# GRPC global cmake file
 # This currently builds C and C++ code.
-
-# This file is based on the CMakeLists.txt available from here:
-# https://github.com/grpc/grpc/blob/3bc78cd0b5bd784a235c01612d634b1ec5f8fb97/CMakeLists.txt
-# with modifications to remove dependencies on SSL, and to reuse
-# previously compiled libprotobuf.
+# This file has been automatically generated from a template file.
+# Please look at the templates directory instead.
+# This file can be regenerated from the template by running
+# tools/buildgen/generate_projects.sh
 #
-# Copyright 2016, Google Inc.
-# All rights reserved.
+# Copyright 2015 gRPC authors.
 #
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#     * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following disclaimer
-# in the documentation and/or other materials provided with the
-# distribution.
-#     * Neither the name of Google Inc. nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+
 cmake_minimum_required(VERSION 2.8)
 
 set(PACKAGE_NAME      "grpc")
-set(PACKAGE_VERSION   "1.0.0-pre2-tensorflow")
+set(PACKAGE_VERSION   "1.5.0-dev")
 set(PACKAGE_STRING    "${PACKAGE_NAME} ${PACKAGE_VERSION}")
 set(PACKAGE_TARNAME   "${PACKAGE_NAME}-${PACKAGE_VERSION}")
 set(PACKAGE_BUGREPORT "https://github.com/grpc/grpc/issues/")
 project(${PACKAGE_NAME} C CXX)
 
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -std=c11")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+set(gRPC_INSTALL_BINDIR "${CMAKE_INSTALL_PREFIX}/bin" CACHE PATH "Installation directory for executables")
+set(gRPC_INSTALL_LIBDIR "${CMAKE_INSTALL_PREFIX}/lib" CACHE PATH "Installation directory for libraries")
+set(gRPC_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_PREFIX}/include" CACHE PATH "Installation directory for headers")
+set(gRPC_INSTALL_CMAKEDIR "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PACKAGE_NAME}" CACHE PATH "Installation directory for cmake config files")
+
+# Options
+option(gRPC_BUILD_TESTS "Build tests" OFF)
+
+set(gRPC_INSTALL_default ON)
+if (NOT CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+  # Disable gRPC_INSTALL by default if building as a submodule
+  set(gRPC_INSTALL_default OFF)
+endif()
+set(gRPC_INSTALL ${gRPC_INSTALL_default} CACHE BOOL
+    "Generate installation target: gRPC_ZLIB_PROVIDER, gRPC_CARES_PROVIDER, gRPC_SSL_PROVIDER and gRPC_PROTOBUF_PROVIDER must all be \"package\"")
+
+set(gRPC_ZLIB_PROVIDER "module" CACHE STRING "Provider of zlib library")
+set_property(CACHE gRPC_ZLIB_PROVIDER PROPERTY STRINGS "module" "package")
+
+set(gRPC_CARES_PROVIDER "module" CACHE STRING "Provider of c-ares library")
+set_property(CACHE gRPC_CARES_PROVIDER PROPERTY STRINGS "module" "package")
 
+set(gRPC_SSL_PROVIDER "module" CACHE STRING "Provider of ssl library")
+set_property(CACHE gRPC_SSL_PROVIDER PROPERTY STRINGS "module" "package")
+
+set(gRPC_PROTOBUF_PROVIDER "module" CACHE STRING "Provider of protobuf library")
+set_property(CACHE gRPC_PROTOBUF_PROVIDER PROPERTY STRINGS "module" "package")
+
+set(gRPC_PROTOBUF_PACKAGE_TYPE "" CACHE STRING "Algorithm for searching protobuf package")
+set_property(CACHE gRPC_PROTOBUF_PACKAGE_TYPE PROPERTY STRINGS "CONFIG" "MODULE")
+
+set(gRPC_GFLAGS_PROVIDER "module" CACHE STRING "Provider of gflags library")
+set_property(CACHE gRPC_GFLAGS_PROVIDER PROPERTY STRINGS "module" "package")
+
+set(gRPC_BENCHMARK_PROVIDER "module" CACHE STRING "Provider of benchmark library")
+set_property(CACHE gRPC_BENCHMARK_PROVIDER PROPERTY STRINGS "module" "package")
+
+set(gRPC_USE_PROTO_LITE OFF CACHE BOOL "Use the protobuf-lite library")
+
+if(UNIX)
+  if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+    set(_gRPC_PLATFORM_LINUX ON)
+  elseif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    set(_gRPC_PLATFORM_MAC ON)
+  else()
+    set(_gRPC_PLATFORM_POSIX ON)
+  endif()
+endif()
 if(WIN32)
-  add_definitions(-D_WIN32_WINNT=0x0A00)
-  find_package(ZLIB REQUIRED)
-endif(WIN32)
+  set(_gRPC_PLATFORM_WINDOWS ON)
+endif()
+
+set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
+
+if (MSVC)
+  include(cmake/msvc_static_runtime.cmake)
+  add_definitions(-D_WIN32_WINNT=0x600 -D_SCL_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_WARNINGS -D_WINSOCK_DEPRECATED_NO_WARNINGS)
+  # needed to compile protobuf
+  add_definitions(/wd4065 /wd4506)
+  # TODO(jtattermusch): revisit C4267 occurrences throughout the code
+  add_definitions(/wd4267)
+endif()
+
+if (gRPC_USE_PROTO_LITE)
+  set(_gRPC_PROTOBUF_LIBRARY_NAME "libprotobuf-lite")
+  add_definitions("-DGRPC_USE_PROTO_LITE")
+else()
+  set(_gRPC_PROTOBUF_LIBRARY_NAME "libprotobuf")
+endif()
+
+if("${gRPC_ZLIB_PROVIDER}" STREQUAL "module")
+  if(NOT ZLIB_ROOT_DIR)
+    set(ZLIB_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/zlib)
+  endif()
+  set(ZLIB_INCLUDE_DIR "${ZLIB_ROOT_DIR}")
+  if(EXISTS "${ZLIB_ROOT_DIR}/CMakeLists.txt")
+      # TODO(jtattermusch): workaround for https://github.com/madler/zlib/issues/218
+      include_directories(${ZLIB_INCLUDE_DIR})
+
+      add_subdirectory(${ZLIB_ROOT_DIR} third_party/zlib)
+      if(TARGET zlibstatic)
+          set(_gRPC_ZLIB_LIBRARIES zlibstatic)
+      endif()
+  else()
+      message(WARNING "gRPC_ZLIB_PROVIDER is \"module\" but ZLIB_ROOT_DIR is wrong")
+  endif()
+  if(gRPC_INSTALL)
+    message(WARNING "gRPC_INSTALL will be forced to FALSE because gRPC_ZLIB_PROVIDER is \"module\"")
+    set(gRPC_INSTALL FALSE)
+  endif()
+elseif("${gRPC_ZLIB_PROVIDER}" STREQUAL "package")
+  find_package(ZLIB)
+  if(TARGET ZLIB::ZLIB)
+    set(_gRPC_ZLIB_LIBRARIES ZLIB::ZLIB)
+  endif()
+  set(_gRPC_FIND_ZLIB "if(NOT ZLIB_FOUND)\n  find_package(ZLIB)\nendif()")
+endif()
+
+if("${gRPC_CARES_PROVIDER}" STREQUAL "module")
+  if(NOT CARES_ROOT_DIR)
+    set(CARES_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/c-ares)
+  endif()
+  string(TOLOWER ${CMAKE_SYSTEM_NAME} CARES_SYSTEM_NAME)
+  set(CARES_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third_party/cares/cares")
+  set(CARES_BUILD_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third_party/cares")
+  set(CARES_PLATFORM_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third_party/cares/config_${CARES_SYSTEM_NAME}")
+  if(EXISTS "${CARES_ROOT_DIR}/CMakeLists.txt")
+    if("${CARES_SYSTEM_NAME}" MATCHES "windows")
+      add_definitions(-DCARES_STATICLIB=1)
+      add_definitions(-DWIN32_LEAN_AND_MEAN=1)
+    else()
+      add_definitions(-DHAVE_CONFIG_H=1)
+      add_definitions(-D_GNU_SOURCE=1)
+    endif()
+    add_subdirectory(src/c-ares third_party/cares)
+    if(TARGET cares)
+        set(_gRPC_CARES_LIBRARIES cares)
+    endif()
+  else()
+    message(WARNING "gRPC_CARES_PROVIDER is \"module\" but CARES_ROOT_DIR is wrong")
+  endif()
+  if(gRPC_INSTALL)
+    message(WARNING "gRPC_INSTALL will be forced to FALSE because gRPC_CARES_PROVIDER is \"module\"")
+    set(gRPC_INSTALL FALSE)
+  endif()
+elseif("${gRPC_CARES_PROVIDER}" STREQUAL "package")
+  find_package(c-ares CONFIG)
+  if(TARGET c-ares::cares)
+    set(_gRPC_CARES_LIBRARIES c-ares::cares)
+  endif()
+  set(_gRPC_FIND_CARES "if(NOT c-ares_FOUND)\n  find_package(c-ares CONFIG)\nendif()")
+endif()
+
+if("${gRPC_PROTOBUF_PROVIDER}" STREQUAL "module")
+  # Building the protobuf tests require gmock what is not part of a standard protobuf checkout.
+  # Disable them unless they are explicitly requested from the cmake command line (when we assume
+  # gmock is downloaded to the right location inside protobuf).
+  if(NOT protobuf_BUILD_TESTS)
+    set(protobuf_BUILD_TESTS OFF CACHE BOOL "Build protobuf tests")
+  endif()
+  # Disable building protobuf with zlib. Building protobuf with zlib breaks
+  # the build if zlib is not installed on the system.
+  if(NOT protobuf_WITH_ZLIB)
+    set(protobuf_WITH_ZLIB OFF CACHE BOOL "Build protobuf with zlib.")
+  endif()
+  if(NOT PROTOBUF_ROOT_DIR)
+    set(PROTOBUF_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/protobuf)
+  endif()
+  set(PROTOBUF_WELLKNOWN_IMPORT_DIR ${PROTOBUF_ROOT_DIR}/src)
+  if(EXISTS "${PROTOBUF_ROOT_DIR}/cmake/CMakeLists.txt")
+    set(protobuf_MSVC_STATIC_RUNTIME OFF CACHE BOOL "Link static runtime libraries")
+    add_subdirectory(${PROTOBUF_ROOT_DIR}/cmake third_party/protobuf)
+    if(TARGET ${_gRPC_PROTOBUF_LIBRARY_NAME})
+      set(_gRPC_PROTOBUF_LIBRARIES ${_gRPC_PROTOBUF_LIBRARY_NAME})
+    endif()
+    if(TARGET libprotoc)
+      set(_gRPC_PROTOBUF_PROTOC_LIBRARIES libprotoc)
+    endif()
+    if(TARGET protoc)
+      set(_gRPC_PROTOBUF_PROTOC protoc)
+    endif()
+  else()
+      message(WARNING "gRPC_PROTOBUF_PROVIDER is \"module\" but PROTOBUF_ROOT_DIR is wrong")
+  endif()
+  if(gRPC_INSTALL)
+    message(WARNING "gRPC_INSTALL will be forced to FALSE because gRPC_PROTOBUF_PROVIDER is \"module\"")
+    set(gRPC_INSTALL FALSE)
+  endif()
+elseif("${gRPC_PROTOBUF_PROVIDER}" STREQUAL "package")
+  find_package(Protobuf ${gRPC_PROTOBUF_PACKAGE_TYPE})
+  if(Protobuf_FOUND OR PROTOBUF_FOUND)
+    if(TARGET protobuf::${_gRPC_PROTOBUF_LIBRARY_NAME})
+      set(_gRPC_PROTOBUF_LIBRARIES protobuf::${_gRPC_PROTOBUF_LIBRARY_NAME})
+    else()
+      set(_gRPC_PROTOBUF_LIBRARIES ${PROTOBUF_LIBRARIES})
+    endif()
+    if(TARGET protobuf::libprotoc)
+      set(_gRPC_PROTOBUF_PROTOC_LIBRARIES protobuf::libprotoc)
+    else()
+      set(_gRPC_PROTOBUF_PROTOC_LIBRARIES ${PROTOBUF_PROTOC_LIBRARIES})
+    endif()
+    if(TARGET protobuf::protoc)
+      set(_gRPC_PROTOBUF_PROTOC protobuf::protoc)
+    else()
+      set(_gRPC_PROTOBUF_PROTOC ${PROTOBUF_PROTOC_EXECUTABLE})
+    endif()
+    set(_gRPC_FIND_PROTOBUF "if(NOT Protobuf_FOUND AND NOT PROTOBUF_FOUND)\n  find_package(Protobuf ${gRPC_PROTOBUF_PACKAGE_TYPE})\nendif()")
+  endif()
+  if(PROTOBUF_FOUND)
+    include_directories(${PROTOBUF_INCLUDE_DIRS})
+  endif()
+  set(PROTOBUF_WELLKNOWN_IMPORT_DIR /usr/local/include)
+endif()
+
+if("${gRPC_SSL_PROVIDER}" STREQUAL "module")
+  if(NOT BORINGSSL_ROOT_DIR)
+    set(BORINGSSL_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/boringssl)
+  endif()
+  if(EXISTS "${BORINGSSL_ROOT_DIR}/CMakeLists.txt")
+    set(OPENSSL_NO_ASM ON)  # make boringssl buildable with Visual Studio
+    add_subdirectory(${BORINGSSL_ROOT_DIR} third_party/boringssl)
+    if(TARGET ssl)
+      set(_gRPC_SSL_LIBRARIES ssl)
+    endif()
+  else()
+      message(WARNING "gRPC_SSL_PROVIDER is \"module\" but BORINGSSL_ROOT_DIR is wrong")
+  endif()
+  if(gRPC_INSTALL)
+    message(WARNING "gRPC_INSTALL will be forced to FALSE because gRPC_SSL_PROVIDER is \"module\"")
+    set(gRPC_INSTALL FALSE)
+  endif()
+elseif("${gRPC_SSL_PROVIDER}" STREQUAL "package")
+  find_package(OpenSSL)
+  if(TARGET OpenSSL::SSL)
+    set(_gRPC_SSL_LIBRARIES OpenSSL::SSL)
+  endif()
+  set(_gRPC_FIND_SSL "if(NOT OpenSSL_FOUND)\n  find_package(OpenSSL)\nendif()")
+endif()
+
+if("${gRPC_GFLAGS_PROVIDER}" STREQUAL "module")
+  if(NOT GFLAGS_ROOT_DIR)
+    set(GFLAGS_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/gflags)
+  endif()
+  if(EXISTS "${GFLAGS_ROOT_DIR}/CMakeLists.txt")
+      add_subdirectory(${GFLAGS_ROOT_DIR} third_party/gflags)
+      if(TARGET gflags_static)
+          set(_gRPC_GFLAGS_LIBRARIES gflags_static)
+      endif()
+  else()
+      message(WARNING "gRPC_GFLAGS_PROVIDER is \"module\" but GFLAGS_ROOT_DIR is wrong")
+  endif()
+elseif("${gRPC_GFLAGS_PROVIDER}" STREQUAL "package")
+  find_package(gflags)
+  if(TARGET gflags::gflags)
+    set(_gRPC_GFLAGS_LIBRARIES gflags::gflags)
+  endif()
+  set(_gRPC_FIND_GFLAGS "if(NOT gflags_FOUND)\n  find_package(gflags)\nendif()")
+endif()
+
+if("${gRPC_BENCHMARK_PROVIDER}" STREQUAL "module")
+  if(NOT BENCHMARK_ROOT_DIR)
+    set(BENCHMARK_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/benchmark)
+  endif()
+  if(EXISTS "${BENCHMARK_ROOT_DIR}/CMakeLists.txt")
+      add_subdirectory(${BENCHMARK_ROOT_DIR} third_party/benchmark)
+      if(TARGET benchmark)
+          set(_gRPC_BENCHMARK_LIBRARIES benchmark)
+      endif()
+  else()
+      message(WARNING "gRPC_BENCHMARK_PROVIDER is \"module\" but BENCHMARK_ROOT_DIR is wrong")
+  endif()
+elseif("${gRPC_BENCHMARK_PROVIDER}" STREQUAL "package")
+  find_package(benchmark)
+  if(TARGET benchmark::benchmark)
+    set(_gRPC_BENCHMARK_LIBRARIES benchmark::benchmark)
+  endif()
+  set(_gRPC_FIND_BENCHMARK "if(NOT benchmark_FOUND)\n  find_package(benchmark)\nendif()")
+endif()
+
+if(NOT MSVC)
+  set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -std=c99")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+endif()
+
+if(_gRPC_PLATFORM_MAC)
+  set(_gRPC_ALLTARGETS_LIBRARIES ${CMAKE_DL_LIBS} m pthread)
+elseif(UNIX)
+  set(_gRPC_ALLTARGETS_LIBRARIES ${CMAKE_DL_LIBS} rt m pthread)
+endif()
+
+if(WIN32 AND MSVC)
+  set(_gRPC_BASELIB_LIBRARIES wsock32 ws2_32)
+endif()
+
+# Create directory for generated .proto files
+set(_gRPC_PROTO_GENS_DIR ${CMAKE_BINARY_DIR}/gens)
+file(MAKE_DIRECTORY ${_gRPC_PROTO_GENS_DIR})
+
+#  protobuf_generate_grpc_cpp
+#  --------------------------
+#
+#   Add custom commands to process ``.proto`` files to C++ using protoc and
+#   GRPC plugin::
+#
+#     protobuf_generate_grpc_cpp [<ARGN>...]
+#
+#   ``ARGN``
+#     ``.proto`` files
+#
+function(protobuf_generate_grpc_cpp)
+  if(NOT ARGN)
+    message(SEND_ERROR "Error: PROTOBUF_GENERATE_GRPC_CPP() called without any proto files")
+    return()
+  endif()
+
+  set(_protobuf_include_path -I . -I ${PROTOBUF_WELLKNOWN_IMPORT_DIR})
+  foreach(FIL ${ARGN})
+    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+    get_filename_component(FIL_WE ${FIL} NAME_WE)
+    file(RELATIVE_PATH REL_FIL ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL})
+    get_filename_component(REL_DIR ${REL_FIL} DIRECTORY)
+    set(RELFIL_WE "${REL_DIR}/${FIL_WE}")
+
+    add_custom_command(
+      OUTPUT "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}.grpc.pb.cc"
+             "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}.grpc.pb.h"
+             "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}_mock.grpc.pb.h"
+             "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}.pb.cc"
+             "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}.pb.h"
+      COMMAND $<TARGET_FILE:${_gRPC_PROTOBUF_PROTOC}>
+      ARGS --grpc_out=generate_mock_code=true:${_gRPC_PROTO_GENS_DIR}
+           --cpp_out=${_gRPC_PROTO_GENS_DIR}
+           --plugin=protoc-gen-grpc=$<TARGET_FILE:grpc_cpp_plugin>
+           ${_protobuf_include_path}
+           ${REL_FIL}
+      DEPENDS ${ABS_FIL} ${_gRPC_PROTOBUF_PROTOC} grpc_cpp_plugin
+      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+      COMMENT "Running gRPC C++ protocol buffer compiler on ${FIL}"
+      VERBATIM)
+
+      set_source_files_properties("${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}.grpc.pb.cc" "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}.grpc.pb.h"  "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}_mock.grpc.pb.h" "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}.pb.cc" "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}.pb.h" PROPERTIES GENERATED TRUE)
+  endforeach()
+endfunction()
+
+add_custom_target(plugins
+  DEPENDS
+  grpc_cpp_plugin
+  grpc_csharp_plugin
+  grpc_node_plugin
+  grpc_objective_c_plugin
+  grpc_php_plugin
+  grpc_python_plugin
+  grpc_ruby_plugin
+)
+
+add_custom_target(tools_c
+  DEPENDS
+  check_epollexclusive
+  gen_hpack_tables
+  gen_legal_metadata_characters
+  gen_percent_encoding_tables
+  grpc_create_jwt
+  grpc_print_google_default_creds_token
+  grpc_verify_jwt
+)
+
+add_custom_target(tools_cxx
+  DEPENDS
+)
+
+add_custom_target(tools
+  DEPENDS tools_c tools_cxx)
+
+if (gRPC_BUILD_TESTS)
+add_custom_target(buildtests_c)
+add_dependencies(buildtests_c alarm_test)
+add_dependencies(buildtests_c algorithm_test)
+add_dependencies(buildtests_c alloc_test)
+add_dependencies(buildtests_c alpn_test)
+add_dependencies(buildtests_c arena_test)
+add_dependencies(buildtests_c bad_server_response_test)
+add_dependencies(buildtests_c bdp_estimator_test)
+add_dependencies(buildtests_c bin_decoder_test)
+add_dependencies(buildtests_c bin_encoder_test)
+add_dependencies(buildtests_c census_context_test)
+add_dependencies(buildtests_c census_intrusive_hash_map_test)
+add_dependencies(buildtests_c census_resource_test)
+add_dependencies(buildtests_c census_trace_context_test)
+add_dependencies(buildtests_c channel_create_test)
+add_dependencies(buildtests_c chttp2_hpack_encoder_test)
+add_dependencies(buildtests_c chttp2_stream_map_test)
+add_dependencies(buildtests_c chttp2_varint_test)
+add_dependencies(buildtests_c combiner_test)
+add_dependencies(buildtests_c compression_test)
+add_dependencies(buildtests_c concurrent_connectivity_test)
+add_dependencies(buildtests_c connection_refused_test)
+add_dependencies(buildtests_c dns_resolver_connectivity_test)
+add_dependencies(buildtests_c dns_resolver_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c dualstack_socket_test)
+endif()
+add_dependencies(buildtests_c endpoint_pair_test)
+add_dependencies(buildtests_c error_test)
+if(_gRPC_PLATFORM_LINUX)
+add_dependencies(buildtests_c ev_epollsig_linux_test)
+endif()
+add_dependencies(buildtests_c fake_resolver_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c fd_conservation_posix_test)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c fd_posix_test)
+endif()
+add_dependencies(buildtests_c fling_client)
+add_dependencies(buildtests_c fling_server)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c fling_stream_test)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c fling_test)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c goaway_server_test)
+endif()
+add_dependencies(buildtests_c gpr_avl_test)
+add_dependencies(buildtests_c gpr_backoff_test)
+add_dependencies(buildtests_c gpr_cmdline_test)
+add_dependencies(buildtests_c gpr_cpu_test)
+add_dependencies(buildtests_c gpr_env_test)
+add_dependencies(buildtests_c gpr_histogram_test)
+add_dependencies(buildtests_c gpr_host_port_test)
+add_dependencies(buildtests_c gpr_log_test)
+add_dependencies(buildtests_c gpr_mpscq_test)
+add_dependencies(buildtests_c gpr_spinlock_test)
+add_dependencies(buildtests_c gpr_stack_lockfree_test)
+add_dependencies(buildtests_c gpr_string_test)
+add_dependencies(buildtests_c gpr_sync_test)
+add_dependencies(buildtests_c gpr_thd_test)
+add_dependencies(buildtests_c gpr_time_test)
+add_dependencies(buildtests_c gpr_tls_test)
+add_dependencies(buildtests_c gpr_useful_test)
+add_dependencies(buildtests_c grpc_auth_context_test)
+add_dependencies(buildtests_c grpc_b64_test)
+add_dependencies(buildtests_c grpc_byte_buffer_reader_test)
+add_dependencies(buildtests_c grpc_channel_args_test)
+add_dependencies(buildtests_c grpc_channel_stack_test)
+add_dependencies(buildtests_c grpc_completion_queue_test)
+add_dependencies(buildtests_c grpc_completion_queue_threading_test)
+add_dependencies(buildtests_c grpc_credentials_test)
+add_dependencies(buildtests_c grpc_fetch_oauth2)
+add_dependencies(buildtests_c grpc_invalid_channel_args_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c grpc_json_token_test)
+endif()
+add_dependencies(buildtests_c grpc_jwt_verifier_test)
+add_dependencies(buildtests_c grpc_security_connector_test)
+if(_gRPC_PLATFORM_LINUX)
+add_dependencies(buildtests_c handshake_client)
+endif()
+if(_gRPC_PLATFORM_LINUX)
+add_dependencies(buildtests_c handshake_server)
+endif()
+add_dependencies(buildtests_c hpack_parser_test)
+add_dependencies(buildtests_c hpack_table_test)
+add_dependencies(buildtests_c http_parser_test)
+add_dependencies(buildtests_c httpcli_format_request_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c httpcli_test)
+endif()
+if(_gRPC_PLATFORM_LINUX)
+add_dependencies(buildtests_c httpscli_test)
+endif()
+add_dependencies(buildtests_c init_test)
+add_dependencies(buildtests_c invalid_call_argument_test)
+add_dependencies(buildtests_c json_rewrite)
+add_dependencies(buildtests_c json_rewrite_test)
+add_dependencies(buildtests_c json_stream_error_test)
+add_dependencies(buildtests_c json_test)
+add_dependencies(buildtests_c lame_client_test)
+add_dependencies(buildtests_c lb_policies_test)
+add_dependencies(buildtests_c load_file_test)
+add_dependencies(buildtests_c memory_profile_client)
+add_dependencies(buildtests_c memory_profile_server)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c memory_profile_test)
+endif()
+add_dependencies(buildtests_c message_compress_test)
+add_dependencies(buildtests_c minimal_stack_is_minimal_test)
+add_dependencies(buildtests_c mlog_test)
+add_dependencies(buildtests_c multiple_server_queues_test)
+add_dependencies(buildtests_c murmur_hash_test)
+add_dependencies(buildtests_c no_server_test)
+add_dependencies(buildtests_c num_external_connectivity_watchers_test)
+add_dependencies(buildtests_c parse_address_test)
+add_dependencies(buildtests_c percent_encoding_test)
+if(_gRPC_PLATFORM_LINUX)
+add_dependencies(buildtests_c pollset_set_test)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c resolve_address_posix_test)
+endif()
+add_dependencies(buildtests_c resolve_address_test)
+add_dependencies(buildtests_c resource_quota_test)
+add_dependencies(buildtests_c secure_channel_create_test)
+add_dependencies(buildtests_c secure_endpoint_test)
+add_dependencies(buildtests_c sequential_connectivity_test)
+add_dependencies(buildtests_c server_chttp2_test)
+add_dependencies(buildtests_c server_test)
+add_dependencies(buildtests_c slice_buffer_test)
+add_dependencies(buildtests_c slice_hash_table_test)
+add_dependencies(buildtests_c slice_string_helpers_test)
+add_dependencies(buildtests_c slice_test)
+add_dependencies(buildtests_c sockaddr_resolver_test)
+add_dependencies(buildtests_c sockaddr_utils_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c socket_utils_test)
+endif()
+add_dependencies(buildtests_c status_conversion_test)
+add_dependencies(buildtests_c stream_compression_test)
+add_dependencies(buildtests_c stream_owned_slice_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c tcp_client_posix_test)
+endif()
+add_dependencies(buildtests_c tcp_client_uv_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c tcp_posix_test)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c tcp_server_posix_test)
+endif()
+add_dependencies(buildtests_c tcp_server_uv_test)
+add_dependencies(buildtests_c time_averaged_stats_test)
+add_dependencies(buildtests_c timeout_encoding_test)
+add_dependencies(buildtests_c timer_heap_test)
+add_dependencies(buildtests_c timer_list_test)
+add_dependencies(buildtests_c transport_connectivity_state_test)
+add_dependencies(buildtests_c transport_metadata_test)
+add_dependencies(buildtests_c transport_pid_controller_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c transport_security_test)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c udp_server_test)
+endif()
+add_dependencies(buildtests_c uri_parser_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c wakeup_fd_cv_test)
+endif()
+add_dependencies(buildtests_c public_headers_must_be_c89)
+add_dependencies(buildtests_c badreq_bad_client_test)
+add_dependencies(buildtests_c connection_prefix_bad_client_test)
+add_dependencies(buildtests_c head_of_line_blocking_bad_client_test)
+add_dependencies(buildtests_c headers_bad_client_test)
+add_dependencies(buildtests_c initial_settings_frame_bad_client_test)
+add_dependencies(buildtests_c large_metadata_bad_client_test)
+add_dependencies(buildtests_c server_registered_method_bad_client_test)
+add_dependencies(buildtests_c simple_request_bad_client_test)
+add_dependencies(buildtests_c unknown_frame_bad_client_test)
+add_dependencies(buildtests_c window_overflow_bad_client_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c bad_ssl_cert_server)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c bad_ssl_cert_test)
+endif()
+add_dependencies(buildtests_c h2_census_test)
+add_dependencies(buildtests_c h2_compress_test)
+add_dependencies(buildtests_c h2_fakesec_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c h2_fd_test)
+endif()
+add_dependencies(buildtests_c h2_full_test)
+if(_gRPC_PLATFORM_LINUX)
+add_dependencies(buildtests_c h2_full+pipe_test)
+endif()
+add_dependencies(buildtests_c h2_full+trace_test)
+add_dependencies(buildtests_c h2_full+workarounds_test)
+add_dependencies(buildtests_c h2_http_proxy_test)
+add_dependencies(buildtests_c h2_load_reporting_test)
+add_dependencies(buildtests_c h2_oauth2_test)
+add_dependencies(buildtests_c h2_proxy_test)
+add_dependencies(buildtests_c h2_sockpair_test)
+add_dependencies(buildtests_c h2_sockpair+trace_test)
+add_dependencies(buildtests_c h2_sockpair_1byte_test)
+add_dependencies(buildtests_c h2_ssl_test)
+add_dependencies(buildtests_c h2_ssl_cert_test)
+add_dependencies(buildtests_c h2_ssl_proxy_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c h2_uds_test)
+endif()
+add_dependencies(buildtests_c inproc_test)
+add_dependencies(buildtests_c h2_census_nosec_test)
+add_dependencies(buildtests_c h2_compress_nosec_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c h2_fd_nosec_test)
+endif()
+add_dependencies(buildtests_c h2_full_nosec_test)
+if(_gRPC_PLATFORM_LINUX)
+add_dependencies(buildtests_c h2_full+pipe_nosec_test)
+endif()
+add_dependencies(buildtests_c h2_full+trace_nosec_test)
+add_dependencies(buildtests_c h2_full+workarounds_nosec_test)
+add_dependencies(buildtests_c h2_http_proxy_nosec_test)
+add_dependencies(buildtests_c h2_load_reporting_nosec_test)
+add_dependencies(buildtests_c h2_proxy_nosec_test)
+add_dependencies(buildtests_c h2_sockpair_nosec_test)
+add_dependencies(buildtests_c h2_sockpair+trace_nosec_test)
+add_dependencies(buildtests_c h2_sockpair_1byte_nosec_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_c h2_uds_nosec_test)
+endif()
+add_dependencies(buildtests_c inproc_nosec_test)
+add_dependencies(buildtests_c api_fuzzer_one_entry)
+add_dependencies(buildtests_c client_fuzzer_one_entry)
+add_dependencies(buildtests_c hpack_parser_fuzzer_test_one_entry)
+add_dependencies(buildtests_c http_request_fuzzer_test_one_entry)
+add_dependencies(buildtests_c http_response_fuzzer_test_one_entry)
+add_dependencies(buildtests_c json_fuzzer_test_one_entry)
+add_dependencies(buildtests_c nanopb_fuzzer_response_test_one_entry)
+add_dependencies(buildtests_c nanopb_fuzzer_serverlist_test_one_entry)
+add_dependencies(buildtests_c percent_decode_fuzzer_one_entry)
+add_dependencies(buildtests_c percent_encode_fuzzer_one_entry)
+add_dependencies(buildtests_c server_fuzzer_one_entry)
+add_dependencies(buildtests_c ssl_server_fuzzer_one_entry)
+add_dependencies(buildtests_c uri_fuzzer_test_one_entry)
+
+add_custom_target(buildtests_cxx)
+add_dependencies(buildtests_cxx alarm_cpp_test)
+add_dependencies(buildtests_cxx async_end2end_test)
+add_dependencies(buildtests_cxx auth_property_iterator_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx bm_arena)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx bm_call_create)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx bm_chttp2_hpack)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx bm_chttp2_transport)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx bm_closure)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx bm_cq)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx bm_cq_multiple_threads)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx bm_error)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx bm_fullstack_streaming_ping_pong)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx bm_fullstack_streaming_pump)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx bm_fullstack_trickle)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx bm_fullstack_unary_ping_pong)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx bm_metadata)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx bm_pollset)
+endif()
+add_dependencies(buildtests_cxx channel_arguments_test)
+add_dependencies(buildtests_cxx channel_filter_test)
+add_dependencies(buildtests_cxx cli_call_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx client_crash_test)
+endif()
+add_dependencies(buildtests_cxx client_crash_test_server)
+add_dependencies(buildtests_cxx client_lb_end2end_test)
+add_dependencies(buildtests_cxx codegen_test_full)
+add_dependencies(buildtests_cxx codegen_test_minimal)
+add_dependencies(buildtests_cxx credentials_test)
+add_dependencies(buildtests_cxx cxx_byte_buffer_test)
+add_dependencies(buildtests_cxx cxx_slice_test)
+add_dependencies(buildtests_cxx cxx_string_ref_test)
+add_dependencies(buildtests_cxx cxx_time_test)
+add_dependencies(buildtests_cxx end2end_test)
+add_dependencies(buildtests_cxx error_details_test)
+add_dependencies(buildtests_cxx filter_end2end_test)
+add_dependencies(buildtests_cxx generic_end2end_test)
+add_dependencies(buildtests_cxx golden_file_test)
+add_dependencies(buildtests_cxx grpc_cli)
+add_dependencies(buildtests_cxx grpc_tool_test)
+add_dependencies(buildtests_cxx grpclb_api_test)
+add_dependencies(buildtests_cxx grpclb_end2end_test)
+add_dependencies(buildtests_cxx grpclb_test)
+add_dependencies(buildtests_cxx health_service_end2end_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx http2_client)
+endif()
+add_dependencies(buildtests_cxx hybrid_end2end_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx interop_client)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx interop_server)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx interop_test)
+endif()
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx json_run_localhost)
+endif()
+add_dependencies(buildtests_cxx memory_test)
+add_dependencies(buildtests_cxx metrics_client)
+add_dependencies(buildtests_cxx mock_test)
+add_dependencies(buildtests_cxx noop-benchmark)
+add_dependencies(buildtests_cxx proto_server_reflection_test)
+add_dependencies(buildtests_cxx proto_utils_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx qps_interarrival_test)
+endif()
+add_dependencies(buildtests_cxx qps_json_driver)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx qps_openloop_test)
+endif()
+add_dependencies(buildtests_cxx qps_worker)
+add_dependencies(buildtests_cxx reconnect_interop_client)
+add_dependencies(buildtests_cxx reconnect_interop_server)
+add_dependencies(buildtests_cxx secure_auth_context_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx secure_sync_unary_ping_pong_test)
+endif()
+add_dependencies(buildtests_cxx server_builder_plugin_test)
+add_dependencies(buildtests_cxx server_builder_test)
+add_dependencies(buildtests_cxx server_context_test_spouse_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx server_crash_test)
+endif()
+add_dependencies(buildtests_cxx server_crash_test_client)
+add_dependencies(buildtests_cxx server_request_call_test)
+add_dependencies(buildtests_cxx shutdown_test)
+add_dependencies(buildtests_cxx status_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx streaming_throughput_test)
+endif()
+add_dependencies(buildtests_cxx stress_test)
+add_dependencies(buildtests_cxx thread_manager_test)
+add_dependencies(buildtests_cxx thread_stress_test)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+add_dependencies(buildtests_cxx writes_per_rpc_test)
+endif()
+
+add_custom_target(buildtests
+  DEPENDS buildtests_c buildtests_cxx)
+endif (gRPC_BUILD_TESTS)
+
 
 add_library(gpr
   src/core/lib/profiling/basic_timers.c
   src/core/lib/profiling/stap_timers.c
   src/core/lib/support/alloc.c
+  src/core/lib/support/arena.c
+  src/core/lib/support/atm.c
   src/core/lib/support/avl.c
   src/core/lib/support/backoff.c
   src/core/lib/support/cmdline.c
@@ -73,9 +792,8 @@ add_library(gpr
   src/core/lib/support/log_linux.c
   src/core/lib/support/log_posix.c
   src/core/lib/support/log_windows.c
+  src/core/lib/support/mpscq.c
   src/core/lib/support/murmur_hash.c
-  src/core/lib/support/slice.c
-  src/core/lib/support/slice_buffer.c
   src/core/lib/support/stack_lockfree.c
   src/core/lib/support/string.c
   src/core/lib/support/string_posix.c
@@ -100,78 +818,228 @@ add_library(gpr
   src/core/lib/support/wrap_memcpy.c
 )
 
+if(WIN32 AND MSVC)
+  set_target_properties(gpr PROPERTIES COMPILE_PDB_NAME "gpr"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/gpr.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
 target_include_directories(gpr
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
   PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${PROTOBUF_INCLUDE_DIRS}
-  PRIVATE ${ZLIB_INCLUDE_DIRS}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
 )
 
-add_library(grpc_unsecure
+target_link_libraries(gpr
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+)
+
+foreach(_hdr
+  include/grpc/support/alloc.h
+  include/grpc/support/atm.h
+  include/grpc/support/atm_gcc_atomic.h
+  include/grpc/support/atm_gcc_sync.h
+  include/grpc/support/atm_windows.h
+  include/grpc/support/avl.h
+  include/grpc/support/cmdline.h
+  include/grpc/support/cpu.h
+  include/grpc/support/histogram.h
+  include/grpc/support/host_port.h
+  include/grpc/support/log.h
+  include/grpc/support/log_windows.h
+  include/grpc/support/port_platform.h
+  include/grpc/support/string_util.h
+  include/grpc/support/subprocess.h
+  include/grpc/support/sync.h
+  include/grpc/support/sync_generic.h
+  include/grpc/support/sync_posix.h
+  include/grpc/support/sync_windows.h
+  include/grpc/support/thd.h
+  include/grpc/support/time.h
+  include/grpc/support/tls.h
+  include/grpc/support/tls_gcc.h
+  include/grpc/support/tls_msvc.h
+  include/grpc/support/tls_pthread.h
+  include/grpc/support/useful.h
+  include/grpc/impl/codegen/atm.h
+  include/grpc/impl/codegen/atm_gcc_atomic.h
+  include/grpc/impl/codegen/atm_gcc_sync.h
+  include/grpc/impl/codegen/atm_windows.h
+  include/grpc/impl/codegen/gpr_slice.h
+  include/grpc/impl/codegen/gpr_types.h
+  include/grpc/impl/codegen/port_platform.h
+  include/grpc/impl/codegen/sync.h
+  include/grpc/impl/codegen/sync_generic.h
+  include/grpc/impl/codegen/sync_posix.h
+  include/grpc/impl/codegen/sync_windows.h
+)
+  string(REPLACE "include/" "" _path ${_hdr})
+  get_filename_component(_path ${_path} PATH)
+  install(FILES ${_hdr}
+    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
+  )
+endforeach()
+
+
+if (gRPC_INSTALL)
+  install(TARGETS gpr EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+if (gRPC_BUILD_TESTS)
+
+add_library(gpr_test_util
+  test/core/util/test_config.c
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(gpr_test_util PROPERTIES COMPILE_PDB_NAME "gpr_test_util"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/gpr_test_util.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(gpr_test_util
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_test_util
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr
+)
+
+
+endif (gRPC_BUILD_TESTS)
+
+add_library(grpc
   src/core/lib/surface/init.c
-  src/core/lib/surface/init_unsecure.c
   src/core/lib/channel/channel_args.c
   src/core/lib/channel/channel_stack.c
   src/core/lib/channel/channel_stack_builder.c
-  src/core/lib/channel/compress_filter.c
   src/core/lib/channel/connected_channel.c
-  src/core/lib/channel/http_client_filter.c
-  src/core/lib/channel/http_server_filter.c
+  src/core/lib/channel/handshaker.c
+  src/core/lib/channel/handshaker_factory.c
+  src/core/lib/channel/handshaker_registry.c
   src/core/lib/compression/compression.c
   src/core/lib/compression/message_compress.c
-  src/core/lib/debug/trace.c
+  src/core/lib/compression/stream_compression.c
   src/core/lib/http/format_request.c
   src/core/lib/http/httpcli.c
   src/core/lib/http/parser.c
   src/core/lib/iomgr/closure.c
+  src/core/lib/iomgr/combiner.c
   src/core/lib/iomgr/endpoint.c
   src/core/lib/iomgr/endpoint_pair_posix.c
+  src/core/lib/iomgr/endpoint_pair_uv.c
   src/core/lib/iomgr/endpoint_pair_windows.c
   src/core/lib/iomgr/error.c
-  src/core/lib/iomgr/ev_epoll_linux.c
-  src/core/lib/iomgr/ev_poll_and_epoll_posix.c
+  src/core/lib/iomgr/ev_epoll1_linux.c
+  src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c
+  src/core/lib/iomgr/ev_epoll_thread_pool_linux.c
+  src/core/lib/iomgr/ev_epollex_linux.c
+  src/core/lib/iomgr/ev_epollsig_linux.c
   src/core/lib/iomgr/ev_poll_posix.c
   src/core/lib/iomgr/ev_posix.c
+  src/core/lib/iomgr/ev_windows.c
   src/core/lib/iomgr/exec_ctx.c
   src/core/lib/iomgr/executor.c
   src/core/lib/iomgr/iocp_windows.c
   src/core/lib/iomgr/iomgr.c
   src/core/lib/iomgr/iomgr_posix.c
+  src/core/lib/iomgr/iomgr_uv.c
   src/core/lib/iomgr/iomgr_windows.c
+  src/core/lib/iomgr/is_epollexclusive_available.c
   src/core/lib/iomgr/load_file.c
+  src/core/lib/iomgr/lockfree_event.c
   src/core/lib/iomgr/network_status_tracker.c
   src/core/lib/iomgr/polling_entity.c
+  src/core/lib/iomgr/pollset_set_uv.c
   src/core/lib/iomgr/pollset_set_windows.c
+  src/core/lib/iomgr/pollset_uv.c
   src/core/lib/iomgr/pollset_windows.c
   src/core/lib/iomgr/resolve_address_posix.c
+  src/core/lib/iomgr/resolve_address_uv.c
   src/core/lib/iomgr/resolve_address_windows.c
+  src/core/lib/iomgr/resource_quota.c
   src/core/lib/iomgr/sockaddr_utils.c
+  src/core/lib/iomgr/socket_factory_posix.c
+  src/core/lib/iomgr/socket_mutator.c
   src/core/lib/iomgr/socket_utils_common_posix.c
   src/core/lib/iomgr/socket_utils_linux.c
   src/core/lib/iomgr/socket_utils_posix.c
+  src/core/lib/iomgr/socket_utils_uv.c
+  src/core/lib/iomgr/socket_utils_windows.c
   src/core/lib/iomgr/socket_windows.c
   src/core/lib/iomgr/tcp_client_posix.c
+  src/core/lib/iomgr/tcp_client_uv.c
   src/core/lib/iomgr/tcp_client_windows.c
   src/core/lib/iomgr/tcp_posix.c
   src/core/lib/iomgr/tcp_server_posix.c
+  src/core/lib/iomgr/tcp_server_utils_posix_common.c
+  src/core/lib/iomgr/tcp_server_utils_posix_ifaddrs.c
+  src/core/lib/iomgr/tcp_server_utils_posix_noifaddrs.c
+  src/core/lib/iomgr/tcp_server_uv.c
   src/core/lib/iomgr/tcp_server_windows.c
+  src/core/lib/iomgr/tcp_uv.c
   src/core/lib/iomgr/tcp_windows.c
   src/core/lib/iomgr/time_averaged_stats.c
-  src/core/lib/iomgr/timer.c
+  src/core/lib/iomgr/timer_generic.c
   src/core/lib/iomgr/timer_heap.c
+  src/core/lib/iomgr/timer_manager.c
+  src/core/lib/iomgr/timer_uv.c
   src/core/lib/iomgr/udp_server.c
   src/core/lib/iomgr/unix_sockets_posix.c
   src/core/lib/iomgr/unix_sockets_posix_noop.c
+  src/core/lib/iomgr/wakeup_fd_cv.c
   src/core/lib/iomgr/wakeup_fd_eventfd.c
   src/core/lib/iomgr/wakeup_fd_nospecial.c
   src/core/lib/iomgr/wakeup_fd_pipe.c
   src/core/lib/iomgr/wakeup_fd_posix.c
-  src/core/lib/iomgr/workqueue_posix.c
-  src/core/lib/iomgr/workqueue_windows.c
   src/core/lib/json/json.c
   src/core/lib/json/json_reader.c
   src/core/lib/json/json_string.c
   src/core/lib/json/json_writer.c
+  src/core/lib/slice/b64.c
+  src/core/lib/slice/percent_encoding.c
+  src/core/lib/slice/slice.c
+  src/core/lib/slice/slice_buffer.c
+  src/core/lib/slice/slice_hash_table.c
+  src/core/lib/slice/slice_intern.c
+  src/core/lib/slice/slice_string_helpers.c
   src/core/lib/surface/alarm.c
   src/core/lib/surface/api_trace.c
   src/core/lib/surface/byte_buffer.c
@@ -184,21 +1052,28 @@ add_library(grpc_unsecure
   src/core/lib/surface/channel_ping.c
   src/core/lib/surface/channel_stack_type.c
   src/core/lib/surface/completion_queue.c
+  src/core/lib/surface/completion_queue_factory.c
   src/core/lib/surface/event_string.c
-  src/core/lib/surface/lame_client.c
+  src/core/lib/surface/lame_client.cc
   src/core/lib/surface/metadata_array.c
   src/core/lib/surface/server.c
   src/core/lib/surface/validate_metadata.c
   src/core/lib/surface/version.c
+  src/core/lib/transport/bdp_estimator.c
   src/core/lib/transport/byte_stream.c
   src/core/lib/transport/connectivity_state.c
+  src/core/lib/transport/error_utils.c
   src/core/lib/transport/metadata.c
   src/core/lib/transport/metadata_batch.c
+  src/core/lib/transport/pid_controller.c
+  src/core/lib/transport/service_config.c
   src/core/lib/transport/static_metadata.c
+  src/core/lib/transport/status_conversion.c
+  src/core/lib/transport/timeout_encoding.c
   src/core/lib/transport/transport.c
   src/core/lib/transport/transport_op_string.c
-  src/core/ext/transport/chttp2/server/insecure/server_chttp2.c
-  src/core/ext/transport/chttp2/server/insecure/server_chttp2_posix.c
+  src/core/lib/debug/trace.c
+  src/core/ext/transport/chttp2/server/secure/server_secure_chttp2.c
   src/core/ext/transport/chttp2/transport/bin_decoder.c
   src/core/ext/transport/chttp2/transport/bin_encoder.c
   src/core/ext/transport/chttp2/transport/chttp2_plugin.c
@@ -212,112 +1087,13329 @@ add_library(grpc_unsecure
   src/core/ext/transport/chttp2/transport/hpack_encoder.c
   src/core/ext/transport/chttp2/transport/hpack_parser.c
   src/core/ext/transport/chttp2/transport/hpack_table.c
+  src/core/ext/transport/chttp2/transport/http2_settings.c
   src/core/ext/transport/chttp2/transport/huffsyms.c
   src/core/ext/transport/chttp2/transport/incoming_metadata.c
   src/core/ext/transport/chttp2/transport/parsing.c
-  src/core/ext/transport/chttp2/transport/status_conversion.c
   src/core/ext/transport/chttp2/transport/stream_lists.c
   src/core/ext/transport/chttp2/transport/stream_map.c
-  src/core/ext/transport/chttp2/transport/timeout_encoding.c
   src/core/ext/transport/chttp2/transport/varint.c
   src/core/ext/transport/chttp2/transport/writing.c
   src/core/ext/transport/chttp2/alpn/alpn.c
+  src/core/ext/filters/http/client/http_client_filter.c
+  src/core/ext/filters/http/http_filters_plugin.c
+  src/core/ext/filters/http/message_compress/message_compress_filter.c
+  src/core/ext/filters/http/server/http_server_filter.c
+  src/core/lib/http/httpcli_security_connector.c
+  src/core/lib/security/context/security_context.c
+  src/core/lib/security/credentials/composite/composite_credentials.c
+  src/core/lib/security/credentials/credentials.c
+  src/core/lib/security/credentials/credentials_metadata.c
+  src/core/lib/security/credentials/fake/fake_credentials.c
+  src/core/lib/security/credentials/google_default/credentials_generic.c
+  src/core/lib/security/credentials/google_default/google_default_credentials.c
+  src/core/lib/security/credentials/iam/iam_credentials.c
+  src/core/lib/security/credentials/jwt/json_token.c
+  src/core/lib/security/credentials/jwt/jwt_credentials.c
+  src/core/lib/security/credentials/jwt/jwt_verifier.c
+  src/core/lib/security/credentials/oauth2/oauth2_credentials.c
+  src/core/lib/security/credentials/plugin/plugin_credentials.c
+  src/core/lib/security/credentials/ssl/ssl_credentials.c
+  src/core/lib/security/transport/client_auth_filter.c
+  src/core/lib/security/transport/lb_targets_info.c
+  src/core/lib/security/transport/secure_endpoint.c
+  src/core/lib/security/transport/security_connector.c
+  src/core/lib/security/transport/security_handshaker.c
+  src/core/lib/security/transport/server_auth_filter.c
+  src/core/lib/security/transport/tsi_error.c
+  src/core/lib/security/util/json_util.c
+  src/core/lib/surface/init_secure.c
+  src/core/tsi/fake_transport_security.c
+  src/core/tsi/gts_transport_security.c
+  src/core/tsi/ssl_transport_security.c
+  src/core/tsi/transport_security.c
+  src/core/tsi/transport_security_adapter.c
+  src/core/ext/transport/chttp2/server/chttp2_server.c
+  src/core/ext/transport/chttp2/client/secure/secure_channel_create.c
+  src/core/ext/filters/client_channel/channel_connectivity.c
+  src/core/ext/filters/client_channel/client_channel.c
+  src/core/ext/filters/client_channel/client_channel_factory.c
+  src/core/ext/filters/client_channel/client_channel_plugin.c
+  src/core/ext/filters/client_channel/connector.c
+  src/core/ext/filters/client_channel/http_connect_handshaker.c
+  src/core/ext/filters/client_channel/http_proxy.c
+  src/core/ext/filters/client_channel/lb_policy.c
+  src/core/ext/filters/client_channel/lb_policy_factory.c
+  src/core/ext/filters/client_channel/lb_policy_registry.c
+  src/core/ext/filters/client_channel/parse_address.c
+  src/core/ext/filters/client_channel/proxy_mapper.c
+  src/core/ext/filters/client_channel/proxy_mapper_registry.c
+  src/core/ext/filters/client_channel/resolver.c
+  src/core/ext/filters/client_channel/resolver_factory.c
+  src/core/ext/filters/client_channel/resolver_registry.c
+  src/core/ext/filters/client_channel/retry_throttle.c
+  src/core/ext/filters/client_channel/subchannel.c
+  src/core/ext/filters/client_channel/subchannel_index.c
+  src/core/ext/filters/client_channel/uri_parser.c
+  src/core/ext/filters/deadline/deadline_filter.c
+  src/core/ext/transport/chttp2/client/chttp2_connector.c
+  src/core/ext/transport/chttp2/server/insecure/server_chttp2.c
+  src/core/ext/transport/chttp2/server/insecure/server_chttp2_posix.c
   src/core/ext/transport/chttp2/client/insecure/channel_create.c
   src/core/ext/transport/chttp2/client/insecure/channel_create_posix.c
-  src/core/ext/client_config/channel_connectivity.c
-  src/core/ext/client_config/client_channel.c
-  src/core/ext/client_config/client_channel_factory.c
-  src/core/ext/client_config/client_config.c
-  src/core/ext/client_config/client_config_plugin.c
-  src/core/ext/client_config/connector.c
-  src/core/ext/client_config/default_initial_connect_string.c
-  src/core/ext/client_config/initial_connect_string.c
-  src/core/ext/client_config/lb_policy.c
-  src/core/ext/client_config/lb_policy_factory.c
-  src/core/ext/client_config/lb_policy_registry.c
-  src/core/ext/client_config/parse_address.c
-  src/core/ext/client_config/resolver.c
-  src/core/ext/client_config/resolver_factory.c
-  src/core/ext/client_config/resolver_registry.c
-  src/core/ext/client_config/subchannel.c
-  src/core/ext/client_config/subchannel_call_holder.c
-  src/core/ext/client_config/subchannel_index.c
-  src/core/ext/client_config/uri_parser.c
-  src/core/ext/resolver/dns/native/dns_resolver.c
-  src/core/ext/resolver/sockaddr/sockaddr_resolver.c
-  src/core/ext/load_reporting/load_reporting.c
-  src/core/ext/load_reporting/load_reporting_filter.c
-  src/core/ext/lb_policy/grpclb/load_balancer_api.c
-  src/core/ext/lb_policy/grpclb/proto/grpc/lb/v1/load_balancer.pb.c
+  src/core/ext/transport/inproc/inproc_plugin.c
+  src/core/ext/transport/inproc/inproc_transport.c
+  src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.c
+  src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.c
+  src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_channel_secure.c
+  src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_client_stats.c
+  src/core/ext/filters/client_channel/lb_policy/grpclb/load_balancer_api.c
+  src/core/ext/filters/client_channel/lb_policy/grpclb/proto/grpc/lb/v1/load_balancer.pb.c
   third_party/nanopb/pb_common.c
   third_party/nanopb/pb_decode.c
   third_party/nanopb/pb_encode.c
-  src/core/ext/lb_policy/pick_first/pick_first.c
-  src/core/ext/lb_policy/round_robin/round_robin.c
+  src/core/ext/filters/client_channel/resolver/fake/fake_resolver.c
+  src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.c
+  src/core/ext/filters/client_channel/lb_policy/round_robin/round_robin.c
+  src/core/ext/filters/client_channel/resolver/dns/c_ares/dns_resolver_ares.c
+  src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_ev_driver_posix.c
+  src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_wrapper.c
+  src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_wrapper_fallback.c
+  src/core/ext/filters/client_channel/resolver/dns/native/dns_resolver.c
+  src/core/ext/filters/client_channel/resolver/sockaddr/sockaddr_resolver.c
+  src/core/ext/filters/load_reporting/load_reporting.c
+  src/core/ext/filters/load_reporting/load_reporting_filter.c
+  src/core/ext/census/base_resources.c
   src/core/ext/census/context.c
   src/core/ext/census/gen/census.pb.c
+  src/core/ext/census/gen/trace_context.pb.c
   src/core/ext/census/grpc_context.c
   src/core/ext/census/grpc_filter.c
   src/core/ext/census/grpc_plugin.c
   src/core/ext/census/initialize.c
+  src/core/ext/census/intrusive_hash_map.c
   src/core/ext/census/mlog.c
   src/core/ext/census/operation.c
   src/core/ext/census/placeholders.c
+  src/core/ext/census/resource.c
+  src/core/ext/census/trace_context.c
   src/core/ext/census/tracing.c
-  src/core/plugin_registry/grpc_unsecure_plugin_registry.c
+  src/core/ext/filters/max_age/max_age_filter.c
+  src/core/ext/filters/message_size/message_size_filter.c
+  src/core/ext/filters/workarounds/workaround_cronet_compression_filter.c
+  src/core/ext/filters/workarounds/workaround_utils.c
+  src/core/plugin_registry/grpc_plugin_registry.c
 )
 
-target_include_directories(grpc_unsecure
+if(WIN32 AND MSVC)
+  set_target_properties(grpc PROPERTIES COMPILE_PDB_NAME "grpc"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(grpc
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
   PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
   PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIRS}
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
 )
 
-target_link_libraries(grpc_unsecure
+target_link_libraries(grpc
+  ${_gRPC_BASELIB_LIBRARIES}
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_ZLIB_LIBRARIES}
+  ${_gRPC_CARES_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
   gpr
 )
 
-add_library(grpc++_unsecure
-  src/cpp/common/insecure_create_auth_context.cc
-  src/cpp/client/channel.cc
-  src/cpp/client/client_context.cc
-  src/cpp/client/create_channel.cc
-  src/cpp/client/create_channel_internal.cc
-  src/cpp/client/create_channel_posix.cc
-  src/cpp/client/credentials.cc
-  src/cpp/client/generic_stub.cc
-  src/cpp/client/insecure_credentials.cc
-  src/cpp/common/channel_arguments.cc
-  src/cpp/common/completion_queue.cc
-  src/cpp/common/core_codegen.cc
-  src/cpp/common/rpc_method.cc
-  src/cpp/server/async_generic_service.cc
-  src/cpp/server/create_default_thread_pool.cc
-  src/cpp/server/dynamic_thread_pool.cc
-  src/cpp/server/insecure_server_credentials.cc
-  src/cpp/server/server.cc
-  src/cpp/server/server_builder.cc
-  src/cpp/server/server_context.cc
-  src/cpp/server/server_credentials.cc
-  src/cpp/server/server_posix.cc
-  src/cpp/util/byte_buffer.cc
-  src/cpp/util/slice.cc
-  src/cpp/util/status.cc
-  src/cpp/util/string_ref.cc
-  src/cpp/util/time.cc
-  src/cpp/codegen/codegen_init.cc
+foreach(_hdr
+  include/grpc/byte_buffer.h
+  include/grpc/byte_buffer_reader.h
+  include/grpc/compression.h
+  include/grpc/grpc.h
+  include/grpc/grpc_posix.h
+  include/grpc/grpc_security_constants.h
+  include/grpc/load_reporting.h
+  include/grpc/slice.h
+  include/grpc/slice_buffer.h
+  include/grpc/status.h
+  include/grpc/support/workaround_list.h
+  include/grpc/impl/codegen/byte_buffer_reader.h
+  include/grpc/impl/codegen/compression_types.h
+  include/grpc/impl/codegen/connectivity_state.h
+  include/grpc/impl/codegen/exec_ctx_fwd.h
+  include/grpc/impl/codegen/grpc_types.h
+  include/grpc/impl/codegen/propagation_bits.h
+  include/grpc/impl/codegen/slice.h
+  include/grpc/impl/codegen/status.h
+  include/grpc/impl/codegen/atm.h
+  include/grpc/impl/codegen/atm_gcc_atomic.h
+  include/grpc/impl/codegen/atm_gcc_sync.h
+  include/grpc/impl/codegen/atm_windows.h
+  include/grpc/impl/codegen/gpr_slice.h
+  include/grpc/impl/codegen/gpr_types.h
+  include/grpc/impl/codegen/port_platform.h
+  include/grpc/impl/codegen/sync.h
+  include/grpc/impl/codegen/sync_generic.h
+  include/grpc/impl/codegen/sync_posix.h
+  include/grpc/impl/codegen/sync_windows.h
+  include/grpc/grpc_security.h
+  include/grpc/census.h
 )
+  string(REPLACE "include/" "" _path ${_hdr})
+  get_filename_component(_path ${_path} PATH)
+  install(FILES ${_hdr}
+    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
+  )
+endforeach()
 
-target_include_directories(grpc++_unsecure
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+
+add_library(grpc_cronet
+  src/core/lib/surface/init.c
+  src/core/lib/channel/channel_args.c
+  src/core/lib/channel/channel_stack.c
+  src/core/lib/channel/channel_stack_builder.c
+  src/core/lib/channel/connected_channel.c
+  src/core/lib/channel/handshaker.c
+  src/core/lib/channel/handshaker_factory.c
+  src/core/lib/channel/handshaker_registry.c
+  src/core/lib/compression/compression.c
+  src/core/lib/compression/message_compress.c
+  src/core/lib/compression/stream_compression.c
+  src/core/lib/http/format_request.c
+  src/core/lib/http/httpcli.c
+  src/core/lib/http/parser.c
+  src/core/lib/iomgr/closure.c
+  src/core/lib/iomgr/combiner.c
+  src/core/lib/iomgr/endpoint.c
+  src/core/lib/iomgr/endpoint_pair_posix.c
+  src/core/lib/iomgr/endpoint_pair_uv.c
+  src/core/lib/iomgr/endpoint_pair_windows.c
+  src/core/lib/iomgr/error.c
+  src/core/lib/iomgr/ev_epoll1_linux.c
+  src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c
+  src/core/lib/iomgr/ev_epoll_thread_pool_linux.c
+  src/core/lib/iomgr/ev_epollex_linux.c
+  src/core/lib/iomgr/ev_epollsig_linux.c
+  src/core/lib/iomgr/ev_poll_posix.c
+  src/core/lib/iomgr/ev_posix.c
+  src/core/lib/iomgr/ev_windows.c
+  src/core/lib/iomgr/exec_ctx.c
+  src/core/lib/iomgr/executor.c
+  src/core/lib/iomgr/iocp_windows.c
+  src/core/lib/iomgr/iomgr.c
+  src/core/lib/iomgr/iomgr_posix.c
+  src/core/lib/iomgr/iomgr_uv.c
+  src/core/lib/iomgr/iomgr_windows.c
+  src/core/lib/iomgr/is_epollexclusive_available.c
+  src/core/lib/iomgr/load_file.c
+  src/core/lib/iomgr/lockfree_event.c
+  src/core/lib/iomgr/network_status_tracker.c
+  src/core/lib/iomgr/polling_entity.c
+  src/core/lib/iomgr/pollset_set_uv.c
+  src/core/lib/iomgr/pollset_set_windows.c
+  src/core/lib/iomgr/pollset_uv.c
+  src/core/lib/iomgr/pollset_windows.c
+  src/core/lib/iomgr/resolve_address_posix.c
+  src/core/lib/iomgr/resolve_address_uv.c
+  src/core/lib/iomgr/resolve_address_windows.c
+  src/core/lib/iomgr/resource_quota.c
+  src/core/lib/iomgr/sockaddr_utils.c
+  src/core/lib/iomgr/socket_factory_posix.c
+  src/core/lib/iomgr/socket_mutator.c
+  src/core/lib/iomgr/socket_utils_common_posix.c
+  src/core/lib/iomgr/socket_utils_linux.c
+  src/core/lib/iomgr/socket_utils_posix.c
+  src/core/lib/iomgr/socket_utils_uv.c
+  src/core/lib/iomgr/socket_utils_windows.c
+  src/core/lib/iomgr/socket_windows.c
+  src/core/lib/iomgr/tcp_client_posix.c
+  src/core/lib/iomgr/tcp_client_uv.c
+  src/core/lib/iomgr/tcp_client_windows.c
+  src/core/lib/iomgr/tcp_posix.c
+  src/core/lib/iomgr/tcp_server_posix.c
+  src/core/lib/iomgr/tcp_server_utils_posix_common.c
+  src/core/lib/iomgr/tcp_server_utils_posix_ifaddrs.c
+  src/core/lib/iomgr/tcp_server_utils_posix_noifaddrs.c
+  src/core/lib/iomgr/tcp_server_uv.c
+  src/core/lib/iomgr/tcp_server_windows.c
+  src/core/lib/iomgr/tcp_uv.c
+  src/core/lib/iomgr/tcp_windows.c
+  src/core/lib/iomgr/time_averaged_stats.c
+  src/core/lib/iomgr/timer_generic.c
+  src/core/lib/iomgr/timer_heap.c
+  src/core/lib/iomgr/timer_manager.c
+  src/core/lib/iomgr/timer_uv.c
+  src/core/lib/iomgr/udp_server.c
+  src/core/lib/iomgr/unix_sockets_posix.c
+  src/core/lib/iomgr/unix_sockets_posix_noop.c
+  src/core/lib/iomgr/wakeup_fd_cv.c
+  src/core/lib/iomgr/wakeup_fd_eventfd.c
+  src/core/lib/iomgr/wakeup_fd_nospecial.c
+  src/core/lib/iomgr/wakeup_fd_pipe.c
+  src/core/lib/iomgr/wakeup_fd_posix.c
+  src/core/lib/json/json.c
+  src/core/lib/json/json_reader.c
+  src/core/lib/json/json_string.c
+  src/core/lib/json/json_writer.c
+  src/core/lib/slice/b64.c
+  src/core/lib/slice/percent_encoding.c
+  src/core/lib/slice/slice.c
+  src/core/lib/slice/slice_buffer.c
+  src/core/lib/slice/slice_hash_table.c
+  src/core/lib/slice/slice_intern.c
+  src/core/lib/slice/slice_string_helpers.c
+  src/core/lib/surface/alarm.c
+  src/core/lib/surface/api_trace.c
+  src/core/lib/surface/byte_buffer.c
+  src/core/lib/surface/byte_buffer_reader.c
+  src/core/lib/surface/call.c
+  src/core/lib/surface/call_details.c
+  src/core/lib/surface/call_log_batch.c
+  src/core/lib/surface/channel.c
+  src/core/lib/surface/channel_init.c
+  src/core/lib/surface/channel_ping.c
+  src/core/lib/surface/channel_stack_type.c
+  src/core/lib/surface/completion_queue.c
+  src/core/lib/surface/completion_queue_factory.c
+  src/core/lib/surface/event_string.c
+  src/core/lib/surface/lame_client.cc
+  src/core/lib/surface/metadata_array.c
+  src/core/lib/surface/server.c
+  src/core/lib/surface/validate_metadata.c
+  src/core/lib/surface/version.c
+  src/core/lib/transport/bdp_estimator.c
+  src/core/lib/transport/byte_stream.c
+  src/core/lib/transport/connectivity_state.c
+  src/core/lib/transport/error_utils.c
+  src/core/lib/transport/metadata.c
+  src/core/lib/transport/metadata_batch.c
+  src/core/lib/transport/pid_controller.c
+  src/core/lib/transport/service_config.c
+  src/core/lib/transport/static_metadata.c
+  src/core/lib/transport/status_conversion.c
+  src/core/lib/transport/timeout_encoding.c
+  src/core/lib/transport/transport.c
+  src/core/lib/transport/transport_op_string.c
+  src/core/lib/debug/trace.c
+  src/core/ext/transport/cronet/client/secure/cronet_channel_create.c
+  src/core/ext/transport/cronet/transport/cronet_api_dummy.c
+  src/core/ext/transport/cronet/transport/cronet_transport.c
+  src/core/ext/transport/chttp2/client/secure/secure_channel_create.c
+  src/core/ext/transport/chttp2/transport/bin_decoder.c
+  src/core/ext/transport/chttp2/transport/bin_encoder.c
+  src/core/ext/transport/chttp2/transport/chttp2_plugin.c
+  src/core/ext/transport/chttp2/transport/chttp2_transport.c
+  src/core/ext/transport/chttp2/transport/frame_data.c
+  src/core/ext/transport/chttp2/transport/frame_goaway.c
+  src/core/ext/transport/chttp2/transport/frame_ping.c
+  src/core/ext/transport/chttp2/transport/frame_rst_stream.c
+  src/core/ext/transport/chttp2/transport/frame_settings.c
+  src/core/ext/transport/chttp2/transport/frame_window_update.c
+  src/core/ext/transport/chttp2/transport/hpack_encoder.c
+  src/core/ext/transport/chttp2/transport/hpack_parser.c
+  src/core/ext/transport/chttp2/transport/hpack_table.c
+  src/core/ext/transport/chttp2/transport/http2_settings.c
+  src/core/ext/transport/chttp2/transport/huffsyms.c
+  src/core/ext/transport/chttp2/transport/incoming_metadata.c
+  src/core/ext/transport/chttp2/transport/parsing.c
+  src/core/ext/transport/chttp2/transport/stream_lists.c
+  src/core/ext/transport/chttp2/transport/stream_map.c
+  src/core/ext/transport/chttp2/transport/varint.c
+  src/core/ext/transport/chttp2/transport/writing.c
+  src/core/ext/transport/chttp2/alpn/alpn.c
+  src/core/ext/filters/http/client/http_client_filter.c
+  src/core/ext/filters/http/http_filters_plugin.c
+  src/core/ext/filters/http/message_compress/message_compress_filter.c
+  src/core/ext/filters/http/server/http_server_filter.c
+  src/core/ext/filters/client_channel/channel_connectivity.c
+  src/core/ext/filters/client_channel/client_channel.c
+  src/core/ext/filters/client_channel/client_channel_factory.c
+  src/core/ext/filters/client_channel/client_channel_plugin.c
+  src/core/ext/filters/client_channel/connector.c
+  src/core/ext/filters/client_channel/http_connect_handshaker.c
+  src/core/ext/filters/client_channel/http_proxy.c
+  src/core/ext/filters/client_channel/lb_policy.c
+  src/core/ext/filters/client_channel/lb_policy_factory.c
+  src/core/ext/filters/client_channel/lb_policy_registry.c
+  src/core/ext/filters/client_channel/parse_address.c
+  src/core/ext/filters/client_channel/proxy_mapper.c
+  src/core/ext/filters/client_channel/proxy_mapper_registry.c
+  src/core/ext/filters/client_channel/resolver.c
+  src/core/ext/filters/client_channel/resolver_factory.c
+  src/core/ext/filters/client_channel/resolver_registry.c
+  src/core/ext/filters/client_channel/retry_throttle.c
+  src/core/ext/filters/client_channel/subchannel.c
+  src/core/ext/filters/client_channel/subchannel_index.c
+  src/core/ext/filters/client_channel/uri_parser.c
+  src/core/ext/filters/deadline/deadline_filter.c
+  src/core/lib/http/httpcli_security_connector.c
+  src/core/lib/security/context/security_context.c
+  src/core/lib/security/credentials/composite/composite_credentials.c
+  src/core/lib/security/credentials/credentials.c
+  src/core/lib/security/credentials/credentials_metadata.c
+  src/core/lib/security/credentials/fake/fake_credentials.c
+  src/core/lib/security/credentials/google_default/credentials_generic.c
+  src/core/lib/security/credentials/google_default/google_default_credentials.c
+  src/core/lib/security/credentials/iam/iam_credentials.c
+  src/core/lib/security/credentials/jwt/json_token.c
+  src/core/lib/security/credentials/jwt/jwt_credentials.c
+  src/core/lib/security/credentials/jwt/jwt_verifier.c
+  src/core/lib/security/credentials/oauth2/oauth2_credentials.c
+  src/core/lib/security/credentials/plugin/plugin_credentials.c
+  src/core/lib/security/credentials/ssl/ssl_credentials.c
+  src/core/lib/security/transport/client_auth_filter.c
+  src/core/lib/security/transport/lb_targets_info.c
+  src/core/lib/security/transport/secure_endpoint.c
+  src/core/lib/security/transport/security_connector.c
+  src/core/lib/security/transport/security_handshaker.c
+  src/core/lib/security/transport/server_auth_filter.c
+  src/core/lib/security/transport/tsi_error.c
+  src/core/lib/security/util/json_util.c
+  src/core/lib/surface/init_secure.c
+  src/core/tsi/fake_transport_security.c
+  src/core/tsi/gts_transport_security.c
+  src/core/tsi/ssl_transport_security.c
+  src/core/tsi/transport_security.c
+  src/core/tsi/transport_security_adapter.c
+  src/core/ext/transport/chttp2/client/chttp2_connector.c
+  src/core/ext/filters/load_reporting/load_reporting.c
+  src/core/ext/filters/load_reporting/load_reporting_filter.c
+  src/core/plugin_registry/grpc_cronet_plugin_registry.c
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(grpc_cronet PROPERTIES COMPILE_PDB_NAME "grpc_cronet"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc_cronet.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(grpc_cronet
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
   PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${PROTOBUF_INCLUDE_DIRS}
-  PRIVATE ${ZLIB_INCLUDE_DIRS}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
 )
 
-target_link_libraries(grpc++_unsecure
-  ${PROTOBUF_LIBRARIES}
+target_link_libraries(grpc_cronet
+  ${_gRPC_BASELIB_LIBRARIES}
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_ZLIB_LIBRARIES}
+  ${_gRPC_CARES_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
   gpr
-  grpc_unsecure
 )
+
+foreach(_hdr
+  include/grpc/byte_buffer.h
+  include/grpc/byte_buffer_reader.h
+  include/grpc/compression.h
+  include/grpc/grpc.h
+  include/grpc/grpc_posix.h
+  include/grpc/grpc_security_constants.h
+  include/grpc/load_reporting.h
+  include/grpc/slice.h
+  include/grpc/slice_buffer.h
+  include/grpc/status.h
+  include/grpc/support/workaround_list.h
+  include/grpc/impl/codegen/byte_buffer_reader.h
+  include/grpc/impl/codegen/compression_types.h
+  include/grpc/impl/codegen/connectivity_state.h
+  include/grpc/impl/codegen/exec_ctx_fwd.h
+  include/grpc/impl/codegen/grpc_types.h
+  include/grpc/impl/codegen/propagation_bits.h
+  include/grpc/impl/codegen/slice.h
+  include/grpc/impl/codegen/status.h
+  include/grpc/impl/codegen/atm.h
+  include/grpc/impl/codegen/atm_gcc_atomic.h
+  include/grpc/impl/codegen/atm_gcc_sync.h
+  include/grpc/impl/codegen/atm_windows.h
+  include/grpc/impl/codegen/gpr_slice.h
+  include/grpc/impl/codegen/gpr_types.h
+  include/grpc/impl/codegen/port_platform.h
+  include/grpc/impl/codegen/sync.h
+  include/grpc/impl/codegen/sync_generic.h
+  include/grpc/impl/codegen/sync_posix.h
+  include/grpc/impl/codegen/sync_windows.h
+  include/grpc/grpc_cronet.h
+  include/grpc/grpc_security.h
+)
+  string(REPLACE "include/" "" _path ${_hdr})
+  get_filename_component(_path ${_path} PATH)
+  install(FILES ${_hdr}
+    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
+  )
+endforeach()
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc_cronet EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+if (gRPC_BUILD_TESTS)
+
+add_library(grpc_test_util
+  test/core/end2end/data/client_certs.c
+  test/core/end2end/data/server1_cert.c
+  test/core/end2end/data/server1_key.c
+  test/core/end2end/data/test_root_cert.c
+  test/core/security/oauth2_utils.c
+  src/core/ext/filters/client_channel/resolver/fake/fake_resolver.c
+  test/core/end2end/cq_verifier.c
+  test/core/end2end/fixtures/http_proxy_fixture.c
+  test/core/end2end/fixtures/proxy.c
+  test/core/iomgr/endpoint_tests.c
+  test/core/util/debugger_macros.c
+  test/core/util/grpc_profiler.c
+  test/core/util/memory_counters.c
+  test/core/util/mock_endpoint.c
+  test/core/util/parse_hexstring.c
+  test/core/util/passthru_endpoint.c
+  test/core/util/port.c
+  test/core/util/port_server_client.c
+  test/core/util/slice_splitter.c
+  test/core/util/trickle_endpoint.c
+  src/core/lib/channel/channel_args.c
+  src/core/lib/channel/channel_stack.c
+  src/core/lib/channel/channel_stack_builder.c
+  src/core/lib/channel/connected_channel.c
+  src/core/lib/channel/handshaker.c
+  src/core/lib/channel/handshaker_factory.c
+  src/core/lib/channel/handshaker_registry.c
+  src/core/lib/compression/compression.c
+  src/core/lib/compression/message_compress.c
+  src/core/lib/compression/stream_compression.c
+  src/core/lib/http/format_request.c
+  src/core/lib/http/httpcli.c
+  src/core/lib/http/parser.c
+  src/core/lib/iomgr/closure.c
+  src/core/lib/iomgr/combiner.c
+  src/core/lib/iomgr/endpoint.c
+  src/core/lib/iomgr/endpoint_pair_posix.c
+  src/core/lib/iomgr/endpoint_pair_uv.c
+  src/core/lib/iomgr/endpoint_pair_windows.c
+  src/core/lib/iomgr/error.c
+  src/core/lib/iomgr/ev_epoll1_linux.c
+  src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c
+  src/core/lib/iomgr/ev_epoll_thread_pool_linux.c
+  src/core/lib/iomgr/ev_epollex_linux.c
+  src/core/lib/iomgr/ev_epollsig_linux.c
+  src/core/lib/iomgr/ev_poll_posix.c
+  src/core/lib/iomgr/ev_posix.c
+  src/core/lib/iomgr/ev_windows.c
+  src/core/lib/iomgr/exec_ctx.c
+  src/core/lib/iomgr/executor.c
+  src/core/lib/iomgr/iocp_windows.c
+  src/core/lib/iomgr/iomgr.c
+  src/core/lib/iomgr/iomgr_posix.c
+  src/core/lib/iomgr/iomgr_uv.c
+  src/core/lib/iomgr/iomgr_windows.c
+  src/core/lib/iomgr/is_epollexclusive_available.c
+  src/core/lib/iomgr/load_file.c
+  src/core/lib/iomgr/lockfree_event.c
+  src/core/lib/iomgr/network_status_tracker.c
+  src/core/lib/iomgr/polling_entity.c
+  src/core/lib/iomgr/pollset_set_uv.c
+  src/core/lib/iomgr/pollset_set_windows.c
+  src/core/lib/iomgr/pollset_uv.c
+  src/core/lib/iomgr/pollset_windows.c
+  src/core/lib/iomgr/resolve_address_posix.c
+  src/core/lib/iomgr/resolve_address_uv.c
+  src/core/lib/iomgr/resolve_address_windows.c
+  src/core/lib/iomgr/resource_quota.c
+  src/core/lib/iomgr/sockaddr_utils.c
+  src/core/lib/iomgr/socket_factory_posix.c
+  src/core/lib/iomgr/socket_mutator.c
+  src/core/lib/iomgr/socket_utils_common_posix.c
+  src/core/lib/iomgr/socket_utils_linux.c
+  src/core/lib/iomgr/socket_utils_posix.c
+  src/core/lib/iomgr/socket_utils_uv.c
+  src/core/lib/iomgr/socket_utils_windows.c
+  src/core/lib/iomgr/socket_windows.c
+  src/core/lib/iomgr/tcp_client_posix.c
+  src/core/lib/iomgr/tcp_client_uv.c
+  src/core/lib/iomgr/tcp_client_windows.c
+  src/core/lib/iomgr/tcp_posix.c
+  src/core/lib/iomgr/tcp_server_posix.c
+  src/core/lib/iomgr/tcp_server_utils_posix_common.c
+  src/core/lib/iomgr/tcp_server_utils_posix_ifaddrs.c
+  src/core/lib/iomgr/tcp_server_utils_posix_noifaddrs.c
+  src/core/lib/iomgr/tcp_server_uv.c
+  src/core/lib/iomgr/tcp_server_windows.c
+  src/core/lib/iomgr/tcp_uv.c
+  src/core/lib/iomgr/tcp_windows.c
+  src/core/lib/iomgr/time_averaged_stats.c
+  src/core/lib/iomgr/timer_generic.c
+  src/core/lib/iomgr/timer_heap.c
+  src/core/lib/iomgr/timer_manager.c
+  src/core/lib/iomgr/timer_uv.c
+  src/core/lib/iomgr/udp_server.c
+  src/core/lib/iomgr/unix_sockets_posix.c
+  src/core/lib/iomgr/unix_sockets_posix_noop.c
+  src/core/lib/iomgr/wakeup_fd_cv.c
+  src/core/lib/iomgr/wakeup_fd_eventfd.c
+  src/core/lib/iomgr/wakeup_fd_nospecial.c
+  src/core/lib/iomgr/wakeup_fd_pipe.c
+  src/core/lib/iomgr/wakeup_fd_posix.c
+  src/core/lib/json/json.c
+  src/core/lib/json/json_reader.c
+  src/core/lib/json/json_string.c
+  src/core/lib/json/json_writer.c
+  src/core/lib/slice/b64.c
+  src/core/lib/slice/percent_encoding.c
+  src/core/lib/slice/slice.c
+  src/core/lib/slice/slice_buffer.c
+  src/core/lib/slice/slice_hash_table.c
+  src/core/lib/slice/slice_intern.c
+  src/core/lib/slice/slice_string_helpers.c
+  src/core/lib/surface/alarm.c
+  src/core/lib/surface/api_trace.c
+  src/core/lib/surface/byte_buffer.c
+  src/core/lib/surface/byte_buffer_reader.c
+  src/core/lib/surface/call.c
+  src/core/lib/surface/call_details.c
+  src/core/lib/surface/call_log_batch.c
+  src/core/lib/surface/channel.c
+  src/core/lib/surface/channel_init.c
+  src/core/lib/surface/channel_ping.c
+  src/core/lib/surface/channel_stack_type.c
+  src/core/lib/surface/completion_queue.c
+  src/core/lib/surface/completion_queue_factory.c
+  src/core/lib/surface/event_string.c
+  src/core/lib/surface/lame_client.cc
+  src/core/lib/surface/metadata_array.c
+  src/core/lib/surface/server.c
+  src/core/lib/surface/validate_metadata.c
+  src/core/lib/surface/version.c
+  src/core/lib/transport/bdp_estimator.c
+  src/core/lib/transport/byte_stream.c
+  src/core/lib/transport/connectivity_state.c
+  src/core/lib/transport/error_utils.c
+  src/core/lib/transport/metadata.c
+  src/core/lib/transport/metadata_batch.c
+  src/core/lib/transport/pid_controller.c
+  src/core/lib/transport/service_config.c
+  src/core/lib/transport/static_metadata.c
+  src/core/lib/transport/status_conversion.c
+  src/core/lib/transport/timeout_encoding.c
+  src/core/lib/transport/transport.c
+  src/core/lib/transport/transport_op_string.c
+  src/core/lib/debug/trace.c
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(grpc_test_util PROPERTIES COMPILE_PDB_NAME "grpc_test_util"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc_test_util.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(grpc_test_util
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_test_util
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+  grpc
+)
+
+foreach(_hdr
+  include/grpc/byte_buffer.h
+  include/grpc/byte_buffer_reader.h
+  include/grpc/compression.h
+  include/grpc/grpc.h
+  include/grpc/grpc_posix.h
+  include/grpc/grpc_security_constants.h
+  include/grpc/load_reporting.h
+  include/grpc/slice.h
+  include/grpc/slice_buffer.h
+  include/grpc/status.h
+  include/grpc/support/workaround_list.h
+  include/grpc/impl/codegen/byte_buffer_reader.h
+  include/grpc/impl/codegen/compression_types.h
+  include/grpc/impl/codegen/connectivity_state.h
+  include/grpc/impl/codegen/exec_ctx_fwd.h
+  include/grpc/impl/codegen/grpc_types.h
+  include/grpc/impl/codegen/propagation_bits.h
+  include/grpc/impl/codegen/slice.h
+  include/grpc/impl/codegen/status.h
+  include/grpc/impl/codegen/atm.h
+  include/grpc/impl/codegen/atm_gcc_atomic.h
+  include/grpc/impl/codegen/atm_gcc_sync.h
+  include/grpc/impl/codegen/atm_windows.h
+  include/grpc/impl/codegen/gpr_slice.h
+  include/grpc/impl/codegen/gpr_types.h
+  include/grpc/impl/codegen/port_platform.h
+  include/grpc/impl/codegen/sync.h
+  include/grpc/impl/codegen/sync_generic.h
+  include/grpc/impl/codegen/sync_posix.h
+  include/grpc/impl/codegen/sync_windows.h
+)
+  string(REPLACE "include/" "" _path ${_hdr})
+  get_filename_component(_path ${_path} PATH)
+  install(FILES ${_hdr}
+    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
+  )
+endforeach()
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_library(grpc_test_util_unsecure
+  src/core/ext/filters/client_channel/resolver/fake/fake_resolver.c
+  test/core/end2end/cq_verifier.c
+  test/core/end2end/fixtures/http_proxy_fixture.c
+  test/core/end2end/fixtures/proxy.c
+  test/core/iomgr/endpoint_tests.c
+  test/core/util/debugger_macros.c
+  test/core/util/grpc_profiler.c
+  test/core/util/memory_counters.c
+  test/core/util/mock_endpoint.c
+  test/core/util/parse_hexstring.c
+  test/core/util/passthru_endpoint.c
+  test/core/util/port.c
+  test/core/util/port_server_client.c
+  test/core/util/slice_splitter.c
+  test/core/util/trickle_endpoint.c
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(grpc_test_util_unsecure PROPERTIES COMPILE_PDB_NAME "grpc_test_util_unsecure"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc_test_util_unsecure.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(grpc_test_util_unsecure
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_test_util_unsecure
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr
+  gpr_test_util
+  grpc_unsecure
+  grpc
+)
+
+
+endif (gRPC_BUILD_TESTS)
+
+add_library(grpc_unsecure
+  src/core/lib/surface/init.c
+  src/core/lib/surface/init_unsecure.c
+  src/core/lib/channel/channel_args.c
+  src/core/lib/channel/channel_stack.c
+  src/core/lib/channel/channel_stack_builder.c
+  src/core/lib/channel/connected_channel.c
+  src/core/lib/channel/handshaker.c
+  src/core/lib/channel/handshaker_factory.c
+  src/core/lib/channel/handshaker_registry.c
+  src/core/lib/compression/compression.c
+  src/core/lib/compression/message_compress.c
+  src/core/lib/compression/stream_compression.c
+  src/core/lib/http/format_request.c
+  src/core/lib/http/httpcli.c
+  src/core/lib/http/parser.c
+  src/core/lib/iomgr/closure.c
+  src/core/lib/iomgr/combiner.c
+  src/core/lib/iomgr/endpoint.c
+  src/core/lib/iomgr/endpoint_pair_posix.c
+  src/core/lib/iomgr/endpoint_pair_uv.c
+  src/core/lib/iomgr/endpoint_pair_windows.c
+  src/core/lib/iomgr/error.c
+  src/core/lib/iomgr/ev_epoll1_linux.c
+  src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c
+  src/core/lib/iomgr/ev_epoll_thread_pool_linux.c
+  src/core/lib/iomgr/ev_epollex_linux.c
+  src/core/lib/iomgr/ev_epollsig_linux.c
+  src/core/lib/iomgr/ev_poll_posix.c
+  src/core/lib/iomgr/ev_posix.c
+  src/core/lib/iomgr/ev_windows.c
+  src/core/lib/iomgr/exec_ctx.c
+  src/core/lib/iomgr/executor.c
+  src/core/lib/iomgr/iocp_windows.c
+  src/core/lib/iomgr/iomgr.c
+  src/core/lib/iomgr/iomgr_posix.c
+  src/core/lib/iomgr/iomgr_uv.c
+  src/core/lib/iomgr/iomgr_windows.c
+  src/core/lib/iomgr/is_epollexclusive_available.c
+  src/core/lib/iomgr/load_file.c
+  src/core/lib/iomgr/lockfree_event.c
+  src/core/lib/iomgr/network_status_tracker.c
+  src/core/lib/iomgr/polling_entity.c
+  src/core/lib/iomgr/pollset_set_uv.c
+  src/core/lib/iomgr/pollset_set_windows.c
+  src/core/lib/iomgr/pollset_uv.c
+  src/core/lib/iomgr/pollset_windows.c
+  src/core/lib/iomgr/resolve_address_posix.c
+  src/core/lib/iomgr/resolve_address_uv.c
+  src/core/lib/iomgr/resolve_address_windows.c
+  src/core/lib/iomgr/resource_quota.c
+  src/core/lib/iomgr/sockaddr_utils.c
+  src/core/lib/iomgr/socket_factory_posix.c
+  src/core/lib/iomgr/socket_mutator.c
+  src/core/lib/iomgr/socket_utils_common_posix.c
+  src/core/lib/iomgr/socket_utils_linux.c
+  src/core/lib/iomgr/socket_utils_posix.c
+  src/core/lib/iomgr/socket_utils_uv.c
+  src/core/lib/iomgr/socket_utils_windows.c
+  src/core/lib/iomgr/socket_windows.c
+  src/core/lib/iomgr/tcp_client_posix.c
+  src/core/lib/iomgr/tcp_client_uv.c
+  src/core/lib/iomgr/tcp_client_windows.c
+  src/core/lib/iomgr/tcp_posix.c
+  src/core/lib/iomgr/tcp_server_posix.c
+  src/core/lib/iomgr/tcp_server_utils_posix_common.c
+  src/core/lib/iomgr/tcp_server_utils_posix_ifaddrs.c
+  src/core/lib/iomgr/tcp_server_utils_posix_noifaddrs.c
+  src/core/lib/iomgr/tcp_server_uv.c
+  src/core/lib/iomgr/tcp_server_windows.c
+  src/core/lib/iomgr/tcp_uv.c
+  src/core/lib/iomgr/tcp_windows.c
+  src/core/lib/iomgr/time_averaged_stats.c
+  src/core/lib/iomgr/timer_generic.c
+  src/core/lib/iomgr/timer_heap.c
+  src/core/lib/iomgr/timer_manager.c
+  src/core/lib/iomgr/timer_uv.c
+  src/core/lib/iomgr/udp_server.c
+  src/core/lib/iomgr/unix_sockets_posix.c
+  src/core/lib/iomgr/unix_sockets_posix_noop.c
+  src/core/lib/iomgr/wakeup_fd_cv.c
+  src/core/lib/iomgr/wakeup_fd_eventfd.c
+  src/core/lib/iomgr/wakeup_fd_nospecial.c
+  src/core/lib/iomgr/wakeup_fd_pipe.c
+  src/core/lib/iomgr/wakeup_fd_posix.c
+  src/core/lib/json/json.c
+  src/core/lib/json/json_reader.c
+  src/core/lib/json/json_string.c
+  src/core/lib/json/json_writer.c
+  src/core/lib/slice/b64.c
+  src/core/lib/slice/percent_encoding.c
+  src/core/lib/slice/slice.c
+  src/core/lib/slice/slice_buffer.c
+  src/core/lib/slice/slice_hash_table.c
+  src/core/lib/slice/slice_intern.c
+  src/core/lib/slice/slice_string_helpers.c
+  src/core/lib/surface/alarm.c
+  src/core/lib/surface/api_trace.c
+  src/core/lib/surface/byte_buffer.c
+  src/core/lib/surface/byte_buffer_reader.c
+  src/core/lib/surface/call.c
+  src/core/lib/surface/call_details.c
+  src/core/lib/surface/call_log_batch.c
+  src/core/lib/surface/channel.c
+  src/core/lib/surface/channel_init.c
+  src/core/lib/surface/channel_ping.c
+  src/core/lib/surface/channel_stack_type.c
+  src/core/lib/surface/completion_queue.c
+  src/core/lib/surface/completion_queue_factory.c
+  src/core/lib/surface/event_string.c
+  src/core/lib/surface/lame_client.cc
+  src/core/lib/surface/metadata_array.c
+  src/core/lib/surface/server.c
+  src/core/lib/surface/validate_metadata.c
+  src/core/lib/surface/version.c
+  src/core/lib/transport/bdp_estimator.c
+  src/core/lib/transport/byte_stream.c
+  src/core/lib/transport/connectivity_state.c
+  src/core/lib/transport/error_utils.c
+  src/core/lib/transport/metadata.c
+  src/core/lib/transport/metadata_batch.c
+  src/core/lib/transport/pid_controller.c
+  src/core/lib/transport/service_config.c
+  src/core/lib/transport/static_metadata.c
+  src/core/lib/transport/status_conversion.c
+  src/core/lib/transport/timeout_encoding.c
+  src/core/lib/transport/transport.c
+  src/core/lib/transport/transport_op_string.c
+  src/core/lib/debug/trace.c
+  src/core/ext/transport/chttp2/server/insecure/server_chttp2.c
+  src/core/ext/transport/chttp2/server/insecure/server_chttp2_posix.c
+  src/core/ext/transport/chttp2/transport/bin_decoder.c
+  src/core/ext/transport/chttp2/transport/bin_encoder.c
+  src/core/ext/transport/chttp2/transport/chttp2_plugin.c
+  src/core/ext/transport/chttp2/transport/chttp2_transport.c
+  src/core/ext/transport/chttp2/transport/frame_data.c
+  src/core/ext/transport/chttp2/transport/frame_goaway.c
+  src/core/ext/transport/chttp2/transport/frame_ping.c
+  src/core/ext/transport/chttp2/transport/frame_rst_stream.c
+  src/core/ext/transport/chttp2/transport/frame_settings.c
+  src/core/ext/transport/chttp2/transport/frame_window_update.c
+  src/core/ext/transport/chttp2/transport/hpack_encoder.c
+  src/core/ext/transport/chttp2/transport/hpack_parser.c
+  src/core/ext/transport/chttp2/transport/hpack_table.c
+  src/core/ext/transport/chttp2/transport/http2_settings.c
+  src/core/ext/transport/chttp2/transport/huffsyms.c
+  src/core/ext/transport/chttp2/transport/incoming_metadata.c
+  src/core/ext/transport/chttp2/transport/parsing.c
+  src/core/ext/transport/chttp2/transport/stream_lists.c
+  src/core/ext/transport/chttp2/transport/stream_map.c
+  src/core/ext/transport/chttp2/transport/varint.c
+  src/core/ext/transport/chttp2/transport/writing.c
+  src/core/ext/transport/chttp2/alpn/alpn.c
+  src/core/ext/filters/http/client/http_client_filter.c
+  src/core/ext/filters/http/http_filters_plugin.c
+  src/core/ext/filters/http/message_compress/message_compress_filter.c
+  src/core/ext/filters/http/server/http_server_filter.c
+  src/core/ext/transport/chttp2/server/chttp2_server.c
+  src/core/ext/transport/chttp2/client/insecure/channel_create.c
+  src/core/ext/transport/chttp2/client/insecure/channel_create_posix.c
+  src/core/ext/transport/chttp2/client/chttp2_connector.c
+  src/core/ext/filters/client_channel/channel_connectivity.c
+  src/core/ext/filters/client_channel/client_channel.c
+  src/core/ext/filters/client_channel/client_channel_factory.c
+  src/core/ext/filters/client_channel/client_channel_plugin.c
+  src/core/ext/filters/client_channel/connector.c
+  src/core/ext/filters/client_channel/http_connect_handshaker.c
+  src/core/ext/filters/client_channel/http_proxy.c
+  src/core/ext/filters/client_channel/lb_policy.c
+  src/core/ext/filters/client_channel/lb_policy_factory.c
+  src/core/ext/filters/client_channel/lb_policy_registry.c
+  src/core/ext/filters/client_channel/parse_address.c
+  src/core/ext/filters/client_channel/proxy_mapper.c
+  src/core/ext/filters/client_channel/proxy_mapper_registry.c
+  src/core/ext/filters/client_channel/resolver.c
+  src/core/ext/filters/client_channel/resolver_factory.c
+  src/core/ext/filters/client_channel/resolver_registry.c
+  src/core/ext/filters/client_channel/retry_throttle.c
+  src/core/ext/filters/client_channel/subchannel.c
+  src/core/ext/filters/client_channel/subchannel_index.c
+  src/core/ext/filters/client_channel/uri_parser.c
+  src/core/ext/filters/deadline/deadline_filter.c
+  src/core/ext/transport/inproc/inproc_plugin.c
+  src/core/ext/transport/inproc/inproc_transport.c
+  src/core/ext/filters/client_channel/resolver/dns/c_ares/dns_resolver_ares.c
+  src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_ev_driver_posix.c
+  src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_wrapper.c
+  src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_wrapper_fallback.c
+  src/core/ext/filters/client_channel/resolver/dns/native/dns_resolver.c
+  src/core/ext/filters/client_channel/resolver/sockaddr/sockaddr_resolver.c
+  src/core/ext/filters/client_channel/resolver/fake/fake_resolver.c
+  src/core/ext/filters/load_reporting/load_reporting.c
+  src/core/ext/filters/load_reporting/load_reporting_filter.c
+  src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.c
+  src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.c
+  src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_channel.c
+  src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_client_stats.c
+  src/core/ext/filters/client_channel/lb_policy/grpclb/load_balancer_api.c
+  src/core/ext/filters/client_channel/lb_policy/grpclb/proto/grpc/lb/v1/load_balancer.pb.c
+  third_party/nanopb/pb_common.c
+  third_party/nanopb/pb_decode.c
+  third_party/nanopb/pb_encode.c
+  src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.c
+  src/core/ext/filters/client_channel/lb_policy/round_robin/round_robin.c
+  src/core/ext/census/base_resources.c
+  src/core/ext/census/context.c
+  src/core/ext/census/gen/census.pb.c
+  src/core/ext/census/gen/trace_context.pb.c
+  src/core/ext/census/grpc_context.c
+  src/core/ext/census/grpc_filter.c
+  src/core/ext/census/grpc_plugin.c
+  src/core/ext/census/initialize.c
+  src/core/ext/census/intrusive_hash_map.c
+  src/core/ext/census/mlog.c
+  src/core/ext/census/operation.c
+  src/core/ext/census/placeholders.c
+  src/core/ext/census/resource.c
+  src/core/ext/census/trace_context.c
+  src/core/ext/census/tracing.c
+  src/core/ext/filters/max_age/max_age_filter.c
+  src/core/ext/filters/message_size/message_size_filter.c
+  src/core/ext/filters/workarounds/workaround_cronet_compression_filter.c
+  src/core/ext/filters/workarounds/workaround_utils.c
+  src/core/plugin_registry/grpc_unsecure_plugin_registry.c
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(grpc_unsecure PROPERTIES COMPILE_PDB_NAME "grpc_unsecure"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc_unsecure.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(grpc_unsecure
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_unsecure
+  ${_gRPC_BASELIB_LIBRARIES}
+  ${_gRPC_ZLIB_LIBRARIES}
+  ${_gRPC_CARES_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr
+)
+
+foreach(_hdr
+  include/grpc/byte_buffer.h
+  include/grpc/byte_buffer_reader.h
+  include/grpc/compression.h
+  include/grpc/grpc.h
+  include/grpc/grpc_posix.h
+  include/grpc/grpc_security_constants.h
+  include/grpc/load_reporting.h
+  include/grpc/slice.h
+  include/grpc/slice_buffer.h
+  include/grpc/status.h
+  include/grpc/support/workaround_list.h
+  include/grpc/impl/codegen/byte_buffer_reader.h
+  include/grpc/impl/codegen/compression_types.h
+  include/grpc/impl/codegen/connectivity_state.h
+  include/grpc/impl/codegen/exec_ctx_fwd.h
+  include/grpc/impl/codegen/grpc_types.h
+  include/grpc/impl/codegen/propagation_bits.h
+  include/grpc/impl/codegen/slice.h
+  include/grpc/impl/codegen/status.h
+  include/grpc/impl/codegen/atm.h
+  include/grpc/impl/codegen/atm_gcc_atomic.h
+  include/grpc/impl/codegen/atm_gcc_sync.h
+  include/grpc/impl/codegen/atm_windows.h
+  include/grpc/impl/codegen/gpr_slice.h
+  include/grpc/impl/codegen/gpr_types.h
+  include/grpc/impl/codegen/port_platform.h
+  include/grpc/impl/codegen/sync.h
+  include/grpc/impl/codegen/sync_generic.h
+  include/grpc/impl/codegen/sync_posix.h
+  include/grpc/impl/codegen/sync_windows.h
+  include/grpc/census.h
+)
+  string(REPLACE "include/" "" _path ${_hdr})
+  get_filename_component(_path ${_path} PATH)
+  install(FILES ${_hdr}
+    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
+  )
+endforeach()
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc_unsecure EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+if (gRPC_BUILD_TESTS)
+
+add_library(reconnect_server
+  test/core/util/reconnect_server.c
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(reconnect_server PROPERTIES COMPILE_PDB_NAME "reconnect_server"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/reconnect_server.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(reconnect_server
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(reconnect_server
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  test_tcp_server
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_library(test_tcp_server
+  test/core/util/test_tcp_server.c
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(test_tcp_server PROPERTIES COMPILE_PDB_NAME "test_tcp_server"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/test_tcp_server.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(test_tcp_server
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(test_tcp_server
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+
+endif (gRPC_BUILD_TESTS)
+
+add_library(grpc++
+  src/cpp/client/insecure_credentials.cc
+  src/cpp/client/secure_credentials.cc
+  src/cpp/common/auth_property_iterator.cc
+  src/cpp/common/secure_auth_context.cc
+  src/cpp/common/secure_channel_arguments.cc
+  src/cpp/common/secure_create_auth_context.cc
+  src/cpp/server/insecure_server_credentials.cc
+  src/cpp/server/secure_server_credentials.cc
+  src/cpp/client/channel_cc.cc
+  src/cpp/client/client_context.cc
+  src/cpp/client/create_channel.cc
+  src/cpp/client/create_channel_internal.cc
+  src/cpp/client/create_channel_posix.cc
+  src/cpp/client/credentials_cc.cc
+  src/cpp/client/generic_stub.cc
+  src/cpp/common/channel_arguments.cc
+  src/cpp/common/channel_filter.cc
+  src/cpp/common/completion_queue_cc.cc
+  src/cpp/common/core_codegen.cc
+  src/cpp/common/resource_quota_cc.cc
+  src/cpp/common/rpc_method.cc
+  src/cpp/common/version_cc.cc
+  src/cpp/server/async_generic_service.cc
+  src/cpp/server/channel_argument_option.cc
+  src/cpp/server/create_default_thread_pool.cc
+  src/cpp/server/dynamic_thread_pool.cc
+  src/cpp/server/health/default_health_check_service.cc
+  src/cpp/server/health/health.pb.c
+  src/cpp/server/health/health_check_service.cc
+  src/cpp/server/health/health_check_service_server_builder_option.cc
+  src/cpp/server/server_builder.cc
+  src/cpp/server/server_cc.cc
+  src/cpp/server/server_context.cc
+  src/cpp/server/server_credentials.cc
+  src/cpp/server/server_posix.cc
+  src/cpp/thread_manager/thread_manager.cc
+  src/cpp/util/byte_buffer_cc.cc
+  src/cpp/util/slice_cc.cc
+  src/cpp/util/status.cc
+  src/cpp/util/string_ref.cc
+  src/cpp/util/time_cc.cc
+  third_party/nanopb/pb_common.c
+  third_party/nanopb/pb_decode.c
+  third_party/nanopb/pb_encode.c
+  src/cpp/codegen/codegen_init.cc
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(grpc++ PROPERTIES COMPILE_PDB_NAME "grpc++"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc++.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(grpc++
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc++
+  ${_gRPC_BASELIB_LIBRARIES}
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc
+)
+
+foreach(_hdr
+  include/grpc++/alarm.h
+  include/grpc++/channel.h
+  include/grpc++/client_context.h
+  include/grpc++/completion_queue.h
+  include/grpc++/create_channel.h
+  include/grpc++/create_channel_posix.h
+  include/grpc++/ext/health_check_service_server_builder_option.h
+  include/grpc++/generic/async_generic_service.h
+  include/grpc++/generic/generic_stub.h
+  include/grpc++/grpc++.h
+  include/grpc++/health_check_service_interface.h
+  include/grpc++/impl/call.h
+  include/grpc++/impl/channel_argument_option.h
+  include/grpc++/impl/client_unary_call.h
+  include/grpc++/impl/codegen/core_codegen.h
+  include/grpc++/impl/grpc_library.h
+  include/grpc++/impl/method_handler_impl.h
+  include/grpc++/impl/rpc_method.h
+  include/grpc++/impl/rpc_service_method.h
+  include/grpc++/impl/serialization_traits.h
+  include/grpc++/impl/server_builder_option.h
+  include/grpc++/impl/server_builder_plugin.h
+  include/grpc++/impl/server_initializer.h
+  include/grpc++/impl/service_type.h
+  include/grpc++/resource_quota.h
+  include/grpc++/security/auth_context.h
+  include/grpc++/security/auth_metadata_processor.h
+  include/grpc++/security/credentials.h
+  include/grpc++/security/server_credentials.h
+  include/grpc++/server.h
+  include/grpc++/server_builder.h
+  include/grpc++/server_context.h
+  include/grpc++/server_posix.h
+  include/grpc++/support/async_stream.h
+  include/grpc++/support/async_unary_call.h
+  include/grpc++/support/byte_buffer.h
+  include/grpc++/support/channel_arguments.h
+  include/grpc++/support/config.h
+  include/grpc++/support/slice.h
+  include/grpc++/support/status.h
+  include/grpc++/support/status_code_enum.h
+  include/grpc++/support/string_ref.h
+  include/grpc++/support/stub_options.h
+  include/grpc++/support/sync_stream.h
+  include/grpc++/support/time.h
+  include/grpc++/impl/codegen/async_stream.h
+  include/grpc++/impl/codegen/async_unary_call.h
+  include/grpc++/impl/codegen/call.h
+  include/grpc++/impl/codegen/call_hook.h
+  include/grpc++/impl/codegen/channel_interface.h
+  include/grpc++/impl/codegen/client_context.h
+  include/grpc++/impl/codegen/client_unary_call.h
+  include/grpc++/impl/codegen/completion_queue.h
+  include/grpc++/impl/codegen/completion_queue_tag.h
+  include/grpc++/impl/codegen/config.h
+  include/grpc++/impl/codegen/core_codegen_interface.h
+  include/grpc++/impl/codegen/create_auth_context.h
+  include/grpc++/impl/codegen/grpc_library.h
+  include/grpc++/impl/codegen/metadata_map.h
+  include/grpc++/impl/codegen/method_handler_impl.h
+  include/grpc++/impl/codegen/rpc_method.h
+  include/grpc++/impl/codegen/rpc_service_method.h
+  include/grpc++/impl/codegen/security/auth_context.h
+  include/grpc++/impl/codegen/serialization_traits.h
+  include/grpc++/impl/codegen/server_context.h
+  include/grpc++/impl/codegen/server_interface.h
+  include/grpc++/impl/codegen/service_type.h
+  include/grpc++/impl/codegen/slice.h
+  include/grpc++/impl/codegen/status.h
+  include/grpc++/impl/codegen/status_code_enum.h
+  include/grpc++/impl/codegen/string_ref.h
+  include/grpc++/impl/codegen/stub_options.h
+  include/grpc++/impl/codegen/sync_stream.h
+  include/grpc++/impl/codegen/time.h
+  include/grpc/impl/codegen/byte_buffer_reader.h
+  include/grpc/impl/codegen/compression_types.h
+  include/grpc/impl/codegen/connectivity_state.h
+  include/grpc/impl/codegen/exec_ctx_fwd.h
+  include/grpc/impl/codegen/grpc_types.h
+  include/grpc/impl/codegen/propagation_bits.h
+  include/grpc/impl/codegen/slice.h
+  include/grpc/impl/codegen/status.h
+  include/grpc/impl/codegen/atm.h
+  include/grpc/impl/codegen/atm_gcc_atomic.h
+  include/grpc/impl/codegen/atm_gcc_sync.h
+  include/grpc/impl/codegen/atm_windows.h
+  include/grpc/impl/codegen/gpr_slice.h
+  include/grpc/impl/codegen/gpr_types.h
+  include/grpc/impl/codegen/port_platform.h
+  include/grpc/impl/codegen/sync.h
+  include/grpc/impl/codegen/sync_generic.h
+  include/grpc/impl/codegen/sync_posix.h
+  include/grpc/impl/codegen/sync_windows.h
+  include/grpc++/impl/codegen/proto_utils.h
+  include/grpc++/impl/codegen/config_protobuf.h
+)
+  string(REPLACE "include/" "" _path ${_hdr})
+  get_filename_component(_path ${_path} PATH)
+  install(FILES ${_hdr}
+    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
+  )
+endforeach()
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc++ EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+
+add_library(grpc++_cronet
+  src/cpp/client/cronet_credentials.cc
+  src/cpp/client/insecure_credentials.cc
+  src/cpp/common/insecure_create_auth_context.cc
+  src/cpp/server/insecure_server_credentials.cc
+  src/cpp/client/channel_cc.cc
+  src/cpp/client/client_context.cc
+  src/cpp/client/create_channel.cc
+  src/cpp/client/create_channel_internal.cc
+  src/cpp/client/create_channel_posix.cc
+  src/cpp/client/credentials_cc.cc
+  src/cpp/client/generic_stub.cc
+  src/cpp/common/channel_arguments.cc
+  src/cpp/common/channel_filter.cc
+  src/cpp/common/completion_queue_cc.cc
+  src/cpp/common/core_codegen.cc
+  src/cpp/common/resource_quota_cc.cc
+  src/cpp/common/rpc_method.cc
+  src/cpp/common/version_cc.cc
+  src/cpp/server/async_generic_service.cc
+  src/cpp/server/channel_argument_option.cc
+  src/cpp/server/create_default_thread_pool.cc
+  src/cpp/server/dynamic_thread_pool.cc
+  src/cpp/server/health/default_health_check_service.cc
+  src/cpp/server/health/health.pb.c
+  src/cpp/server/health/health_check_service.cc
+  src/cpp/server/health/health_check_service_server_builder_option.cc
+  src/cpp/server/server_builder.cc
+  src/cpp/server/server_cc.cc
+  src/cpp/server/server_context.cc
+  src/cpp/server/server_credentials.cc
+  src/cpp/server/server_posix.cc
+  src/cpp/thread_manager/thread_manager.cc
+  src/cpp/util/byte_buffer_cc.cc
+  src/cpp/util/slice_cc.cc
+  src/cpp/util/status.cc
+  src/cpp/util/string_ref.cc
+  src/cpp/util/time_cc.cc
+  third_party/nanopb/pb_common.c
+  third_party/nanopb/pb_decode.c
+  third_party/nanopb/pb_encode.c
+  src/cpp/codegen/codegen_init.cc
+  src/core/ext/transport/chttp2/client/insecure/channel_create.c
+  src/core/ext/transport/chttp2/client/insecure/channel_create_posix.c
+  src/core/ext/transport/chttp2/client/chttp2_connector.c
+  src/core/ext/transport/chttp2/transport/bin_decoder.c
+  src/core/ext/transport/chttp2/transport/bin_encoder.c
+  src/core/ext/transport/chttp2/transport/chttp2_plugin.c
+  src/core/ext/transport/chttp2/transport/chttp2_transport.c
+  src/core/ext/transport/chttp2/transport/frame_data.c
+  src/core/ext/transport/chttp2/transport/frame_goaway.c
+  src/core/ext/transport/chttp2/transport/frame_ping.c
+  src/core/ext/transport/chttp2/transport/frame_rst_stream.c
+  src/core/ext/transport/chttp2/transport/frame_settings.c
+  src/core/ext/transport/chttp2/transport/frame_window_update.c
+  src/core/ext/transport/chttp2/transport/hpack_encoder.c
+  src/core/ext/transport/chttp2/transport/hpack_parser.c
+  src/core/ext/transport/chttp2/transport/hpack_table.c
+  src/core/ext/transport/chttp2/transport/http2_settings.c
+  src/core/ext/transport/chttp2/transport/huffsyms.c
+  src/core/ext/transport/chttp2/transport/incoming_metadata.c
+  src/core/ext/transport/chttp2/transport/parsing.c
+  src/core/ext/transport/chttp2/transport/stream_lists.c
+  src/core/ext/transport/chttp2/transport/stream_map.c
+  src/core/ext/transport/chttp2/transport/varint.c
+  src/core/ext/transport/chttp2/transport/writing.c
+  src/core/lib/channel/channel_args.c
+  src/core/lib/channel/channel_stack.c
+  src/core/lib/channel/channel_stack_builder.c
+  src/core/lib/channel/connected_channel.c
+  src/core/lib/channel/handshaker.c
+  src/core/lib/channel/handshaker_factory.c
+  src/core/lib/channel/handshaker_registry.c
+  src/core/lib/compression/compression.c
+  src/core/lib/compression/message_compress.c
+  src/core/lib/compression/stream_compression.c
+  src/core/lib/http/format_request.c
+  src/core/lib/http/httpcli.c
+  src/core/lib/http/parser.c
+  src/core/lib/iomgr/closure.c
+  src/core/lib/iomgr/combiner.c
+  src/core/lib/iomgr/endpoint.c
+  src/core/lib/iomgr/endpoint_pair_posix.c
+  src/core/lib/iomgr/endpoint_pair_uv.c
+  src/core/lib/iomgr/endpoint_pair_windows.c
+  src/core/lib/iomgr/error.c
+  src/core/lib/iomgr/ev_epoll1_linux.c
+  src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c
+  src/core/lib/iomgr/ev_epoll_thread_pool_linux.c
+  src/core/lib/iomgr/ev_epollex_linux.c
+  src/core/lib/iomgr/ev_epollsig_linux.c
+  src/core/lib/iomgr/ev_poll_posix.c
+  src/core/lib/iomgr/ev_posix.c
+  src/core/lib/iomgr/ev_windows.c
+  src/core/lib/iomgr/exec_ctx.c
+  src/core/lib/iomgr/executor.c
+  src/core/lib/iomgr/iocp_windows.c
+  src/core/lib/iomgr/iomgr.c
+  src/core/lib/iomgr/iomgr_posix.c
+  src/core/lib/iomgr/iomgr_uv.c
+  src/core/lib/iomgr/iomgr_windows.c
+  src/core/lib/iomgr/is_epollexclusive_available.c
+  src/core/lib/iomgr/load_file.c
+  src/core/lib/iomgr/lockfree_event.c
+  src/core/lib/iomgr/network_status_tracker.c
+  src/core/lib/iomgr/polling_entity.c
+  src/core/lib/iomgr/pollset_set_uv.c
+  src/core/lib/iomgr/pollset_set_windows.c
+  src/core/lib/iomgr/pollset_uv.c
+  src/core/lib/iomgr/pollset_windows.c
+  src/core/lib/iomgr/resolve_address_posix.c
+  src/core/lib/iomgr/resolve_address_uv.c
+  src/core/lib/iomgr/resolve_address_windows.c
+  src/core/lib/iomgr/resource_quota.c
+  src/core/lib/iomgr/sockaddr_utils.c
+  src/core/lib/iomgr/socket_factory_posix.c
+  src/core/lib/iomgr/socket_mutator.c
+  src/core/lib/iomgr/socket_utils_common_posix.c
+  src/core/lib/iomgr/socket_utils_linux.c
+  src/core/lib/iomgr/socket_utils_posix.c
+  src/core/lib/iomgr/socket_utils_uv.c
+  src/core/lib/iomgr/socket_utils_windows.c
+  src/core/lib/iomgr/socket_windows.c
+  src/core/lib/iomgr/tcp_client_posix.c
+  src/core/lib/iomgr/tcp_client_uv.c
+  src/core/lib/iomgr/tcp_client_windows.c
+  src/core/lib/iomgr/tcp_posix.c
+  src/core/lib/iomgr/tcp_server_posix.c
+  src/core/lib/iomgr/tcp_server_utils_posix_common.c
+  src/core/lib/iomgr/tcp_server_utils_posix_ifaddrs.c
+  src/core/lib/iomgr/tcp_server_utils_posix_noifaddrs.c
+  src/core/lib/iomgr/tcp_server_uv.c
+  src/core/lib/iomgr/tcp_server_windows.c
+  src/core/lib/iomgr/tcp_uv.c
+  src/core/lib/iomgr/tcp_windows.c
+  src/core/lib/iomgr/time_averaged_stats.c
+  src/core/lib/iomgr/timer_generic.c
+  src/core/lib/iomgr/timer_heap.c
+  src/core/lib/iomgr/timer_manager.c
+  src/core/lib/iomgr/timer_uv.c
+  src/core/lib/iomgr/udp_server.c
+  src/core/lib/iomgr/unix_sockets_posix.c
+  src/core/lib/iomgr/unix_sockets_posix_noop.c
+  src/core/lib/iomgr/wakeup_fd_cv.c
+  src/core/lib/iomgr/wakeup_fd_eventfd.c
+  src/core/lib/iomgr/wakeup_fd_nospecial.c
+  src/core/lib/iomgr/wakeup_fd_pipe.c
+  src/core/lib/iomgr/wakeup_fd_posix.c
+  src/core/lib/json/json.c
+  src/core/lib/json/json_reader.c
+  src/core/lib/json/json_string.c
+  src/core/lib/json/json_writer.c
+  src/core/lib/slice/b64.c
+  src/core/lib/slice/percent_encoding.c
+  src/core/lib/slice/slice.c
+  src/core/lib/slice/slice_buffer.c
+  src/core/lib/slice/slice_hash_table.c
+  src/core/lib/slice/slice_intern.c
+  src/core/lib/slice/slice_string_helpers.c
+  src/core/lib/surface/alarm.c
+  src/core/lib/surface/api_trace.c
+  src/core/lib/surface/byte_buffer.c
+  src/core/lib/surface/byte_buffer_reader.c
+  src/core/lib/surface/call.c
+  src/core/lib/surface/call_details.c
+  src/core/lib/surface/call_log_batch.c
+  src/core/lib/surface/channel.c
+  src/core/lib/surface/channel_init.c
+  src/core/lib/surface/channel_ping.c
+  src/core/lib/surface/channel_stack_type.c
+  src/core/lib/surface/completion_queue.c
+  src/core/lib/surface/completion_queue_factory.c
+  src/core/lib/surface/event_string.c
+  src/core/lib/surface/lame_client.cc
+  src/core/lib/surface/metadata_array.c
+  src/core/lib/surface/server.c
+  src/core/lib/surface/validate_metadata.c
+  src/core/lib/surface/version.c
+  src/core/lib/transport/bdp_estimator.c
+  src/core/lib/transport/byte_stream.c
+  src/core/lib/transport/connectivity_state.c
+  src/core/lib/transport/error_utils.c
+  src/core/lib/transport/metadata.c
+  src/core/lib/transport/metadata_batch.c
+  src/core/lib/transport/pid_controller.c
+  src/core/lib/transport/service_config.c
+  src/core/lib/transport/static_metadata.c
+  src/core/lib/transport/status_conversion.c
+  src/core/lib/transport/timeout_encoding.c
+  src/core/lib/transport/transport.c
+  src/core/lib/transport/transport_op_string.c
+  src/core/lib/debug/trace.c
+  src/core/ext/transport/chttp2/alpn/alpn.c
+  src/core/ext/filters/http/client/http_client_filter.c
+  src/core/ext/filters/http/http_filters_plugin.c
+  src/core/ext/filters/http/message_compress/message_compress_filter.c
+  src/core/ext/filters/http/server/http_server_filter.c
+  src/core/ext/filters/client_channel/channel_connectivity.c
+  src/core/ext/filters/client_channel/client_channel.c
+  src/core/ext/filters/client_channel/client_channel_factory.c
+  src/core/ext/filters/client_channel/client_channel_plugin.c
+  src/core/ext/filters/client_channel/connector.c
+  src/core/ext/filters/client_channel/http_connect_handshaker.c
+  src/core/ext/filters/client_channel/http_proxy.c
+  src/core/ext/filters/client_channel/lb_policy.c
+  src/core/ext/filters/client_channel/lb_policy_factory.c
+  src/core/ext/filters/client_channel/lb_policy_registry.c
+  src/core/ext/filters/client_channel/parse_address.c
+  src/core/ext/filters/client_channel/proxy_mapper.c
+  src/core/ext/filters/client_channel/proxy_mapper_registry.c
+  src/core/ext/filters/client_channel/resolver.c
+  src/core/ext/filters/client_channel/resolver_factory.c
+  src/core/ext/filters/client_channel/resolver_registry.c
+  src/core/ext/filters/client_channel/retry_throttle.c
+  src/core/ext/filters/client_channel/subchannel.c
+  src/core/ext/filters/client_channel/subchannel_index.c
+  src/core/ext/filters/client_channel/uri_parser.c
+  src/core/ext/filters/deadline/deadline_filter.c
+  src/core/ext/transport/chttp2/server/insecure/server_chttp2.c
+  src/core/ext/transport/chttp2/server/insecure/server_chttp2_posix.c
+  src/core/ext/transport/chttp2/server/chttp2_server.c
+  src/core/ext/census/base_resources.c
+  src/core/ext/census/context.c
+  src/core/ext/census/gen/census.pb.c
+  src/core/ext/census/gen/trace_context.pb.c
+  src/core/ext/census/grpc_context.c
+  src/core/ext/census/grpc_filter.c
+  src/core/ext/census/grpc_plugin.c
+  src/core/ext/census/initialize.c
+  src/core/ext/census/intrusive_hash_map.c
+  src/core/ext/census/mlog.c
+  src/core/ext/census/operation.c
+  src/core/ext/census/placeholders.c
+  src/core/ext/census/resource.c
+  src/core/ext/census/trace_context.c
+  src/core/ext/census/tracing.c
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(grpc++_cronet PROPERTIES COMPILE_PDB_NAME "grpc++_cronet"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc++_cronet.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(grpc++_cronet
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc++_cronet
+  ${_gRPC_BASELIB_LIBRARIES}
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr
+  grpc_cronet
+  grpc
+)
+
+foreach(_hdr
+  include/grpc++/alarm.h
+  include/grpc++/channel.h
+  include/grpc++/client_context.h
+  include/grpc++/completion_queue.h
+  include/grpc++/create_channel.h
+  include/grpc++/create_channel_posix.h
+  include/grpc++/ext/health_check_service_server_builder_option.h
+  include/grpc++/generic/async_generic_service.h
+  include/grpc++/generic/generic_stub.h
+  include/grpc++/grpc++.h
+  include/grpc++/health_check_service_interface.h
+  include/grpc++/impl/call.h
+  include/grpc++/impl/channel_argument_option.h
+  include/grpc++/impl/client_unary_call.h
+  include/grpc++/impl/codegen/core_codegen.h
+  include/grpc++/impl/grpc_library.h
+  include/grpc++/impl/method_handler_impl.h
+  include/grpc++/impl/rpc_method.h
+  include/grpc++/impl/rpc_service_method.h
+  include/grpc++/impl/serialization_traits.h
+  include/grpc++/impl/server_builder_option.h
+  include/grpc++/impl/server_builder_plugin.h
+  include/grpc++/impl/server_initializer.h
+  include/grpc++/impl/service_type.h
+  include/grpc++/resource_quota.h
+  include/grpc++/security/auth_context.h
+  include/grpc++/security/auth_metadata_processor.h
+  include/grpc++/security/credentials.h
+  include/grpc++/security/server_credentials.h
+  include/grpc++/server.h
+  include/grpc++/server_builder.h
+  include/grpc++/server_context.h
+  include/grpc++/server_posix.h
+  include/grpc++/support/async_stream.h
+  include/grpc++/support/async_unary_call.h
+  include/grpc++/support/byte_buffer.h
+  include/grpc++/support/channel_arguments.h
+  include/grpc++/support/config.h
+  include/grpc++/support/slice.h
+  include/grpc++/support/status.h
+  include/grpc++/support/status_code_enum.h
+  include/grpc++/support/string_ref.h
+  include/grpc++/support/stub_options.h
+  include/grpc++/support/sync_stream.h
+  include/grpc++/support/time.h
+  include/grpc++/impl/codegen/async_stream.h
+  include/grpc++/impl/codegen/async_unary_call.h
+  include/grpc++/impl/codegen/call.h
+  include/grpc++/impl/codegen/call_hook.h
+  include/grpc++/impl/codegen/channel_interface.h
+  include/grpc++/impl/codegen/client_context.h
+  include/grpc++/impl/codegen/client_unary_call.h
+  include/grpc++/impl/codegen/completion_queue.h
+  include/grpc++/impl/codegen/completion_queue_tag.h
+  include/grpc++/impl/codegen/config.h
+  include/grpc++/impl/codegen/core_codegen_interface.h
+  include/grpc++/impl/codegen/create_auth_context.h
+  include/grpc++/impl/codegen/grpc_library.h
+  include/grpc++/impl/codegen/metadata_map.h
+  include/grpc++/impl/codegen/method_handler_impl.h
+  include/grpc++/impl/codegen/rpc_method.h
+  include/grpc++/impl/codegen/rpc_service_method.h
+  include/grpc++/impl/codegen/security/auth_context.h
+  include/grpc++/impl/codegen/serialization_traits.h
+  include/grpc++/impl/codegen/server_context.h
+  include/grpc++/impl/codegen/server_interface.h
+  include/grpc++/impl/codegen/service_type.h
+  include/grpc++/impl/codegen/slice.h
+  include/grpc++/impl/codegen/status.h
+  include/grpc++/impl/codegen/status_code_enum.h
+  include/grpc++/impl/codegen/string_ref.h
+  include/grpc++/impl/codegen/stub_options.h
+  include/grpc++/impl/codegen/sync_stream.h
+  include/grpc++/impl/codegen/time.h
+  include/grpc/impl/codegen/byte_buffer_reader.h
+  include/grpc/impl/codegen/compression_types.h
+  include/grpc/impl/codegen/connectivity_state.h
+  include/grpc/impl/codegen/exec_ctx_fwd.h
+  include/grpc/impl/codegen/grpc_types.h
+  include/grpc/impl/codegen/propagation_bits.h
+  include/grpc/impl/codegen/slice.h
+  include/grpc/impl/codegen/status.h
+  include/grpc/impl/codegen/atm.h
+  include/grpc/impl/codegen/atm_gcc_atomic.h
+  include/grpc/impl/codegen/atm_gcc_sync.h
+  include/grpc/impl/codegen/atm_windows.h
+  include/grpc/impl/codegen/gpr_slice.h
+  include/grpc/impl/codegen/gpr_types.h
+  include/grpc/impl/codegen/port_platform.h
+  include/grpc/impl/codegen/sync.h
+  include/grpc/impl/codegen/sync_generic.h
+  include/grpc/impl/codegen/sync_posix.h
+  include/grpc/impl/codegen/sync_windows.h
+  include/grpc/byte_buffer.h
+  include/grpc/byte_buffer_reader.h
+  include/grpc/compression.h
+  include/grpc/grpc.h
+  include/grpc/grpc_posix.h
+  include/grpc/grpc_security_constants.h
+  include/grpc/load_reporting.h
+  include/grpc/slice.h
+  include/grpc/slice_buffer.h
+  include/grpc/status.h
+  include/grpc/support/workaround_list.h
+  include/grpc/census.h
+)
+  string(REPLACE "include/" "" _path ${_hdr})
+  get_filename_component(_path ${_path} PATH)
+  install(FILES ${_hdr}
+    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
+  )
+endforeach()
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc++_cronet EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+
+add_library(grpc++_error_details
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/status/status.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/status/status.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/status/status.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/status/status.grpc.pb.h
+  src/cpp/util/error_details.cc
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(grpc++_error_details PROPERTIES COMPILE_PDB_NAME "grpc++_error_details"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc++_error_details.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/status/status.proto
+)
+
+target_include_directories(grpc++_error_details
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc++_error_details
+  ${_gRPC_BASELIB_LIBRARIES}
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++
+)
+
+foreach(_hdr
+  include/grpc++/support/error_details.h
+)
+  string(REPLACE "include/" "" _path ${_hdr})
+  get_filename_component(_path ${_path} PATH)
+  install(FILES ${_hdr}
+    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
+  )
+endforeach()
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc++_error_details EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+if (gRPC_BUILD_TESTS)
+
+add_library(grpc++_proto_reflection_desc_db
+  test/cpp/util/proto_reflection_descriptor_database.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.grpc.pb.h
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(grpc++_proto_reflection_desc_db PROPERTIES COMPILE_PDB_NAME "grpc++_proto_reflection_desc_db"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc++_proto_reflection_desc_db.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/reflection/v1alpha/reflection.proto
+)
+
+target_include_directories(grpc++_proto_reflection_desc_db
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc++_proto_reflection_desc_db
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++
+  grpc
+)
+
+foreach(_hdr
+  include/grpc++/impl/codegen/config_protobuf.h
+)
+  string(REPLACE "include/" "" _path ${_hdr})
+  get_filename_component(_path ${_path} PATH)
+  install(FILES ${_hdr}
+    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
+  )
+endforeach()
+
+endif (gRPC_BUILD_TESTS)
+
+add_library(grpc++_reflection
+  src/cpp/ext/proto_server_reflection.cc
+  src/cpp/ext/proto_server_reflection_plugin.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.grpc.pb.h
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(grpc++_reflection PROPERTIES COMPILE_PDB_NAME "grpc++_reflection"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc++_reflection.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/reflection/v1alpha/reflection.proto
+)
+
+target_include_directories(grpc++_reflection
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc++_reflection
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++
+  grpc
+)
+
+foreach(_hdr
+  include/grpc++/ext/proto_server_reflection_plugin.h
+)
+  string(REPLACE "include/" "" _path ${_hdr})
+  get_filename_component(_path ${_path} PATH)
+  install(FILES ${_hdr}
+    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
+  )
+endforeach()
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc++_reflection EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+if (gRPC_BUILD_TESTS)
+
+add_library(grpc++_test_config
+  test/cpp/util/test_config_cc.cc
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(grpc++_test_config PROPERTIES COMPILE_PDB_NAME "grpc++_test_config"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc++_test_config.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(grpc++_test_config
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc++_test_config
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+)
+
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_library(grpc++_test_util
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/health/v1/health.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/health/v1/health.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/health/v1/health.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/health/v1/health.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_mock.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/duplicate/echo_duplicate.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/duplicate/echo_duplicate.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/duplicate/echo_duplicate.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/duplicate/echo_duplicate.grpc.pb.h
+  test/cpp/end2end/test_service_impl.cc
+  test/cpp/util/byte_buffer_proto_helper.cc
+  test/cpp/util/create_test_channel.cc
+  test/cpp/util/string_ref_helper.cc
+  test/cpp/util/subprocess.cc
+  test/cpp/util/test_credentials_provider.cc
+  src/cpp/codegen/codegen_init.cc
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(grpc++_test_util PROPERTIES COMPILE_PDB_NAME "grpc++_test_util"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc++_test_util.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/health/v1/health.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/echo_messages.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/echo.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/duplicate/echo_duplicate.proto
+)
+
+target_include_directories(grpc++_test_util
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc++_test_util
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++
+  grpc_test_util
+  grpc
+)
+
+foreach(_hdr
+  include/grpc++/impl/codegen/async_stream.h
+  include/grpc++/impl/codegen/async_unary_call.h
+  include/grpc++/impl/codegen/call.h
+  include/grpc++/impl/codegen/call_hook.h
+  include/grpc++/impl/codegen/channel_interface.h
+  include/grpc++/impl/codegen/client_context.h
+  include/grpc++/impl/codegen/client_unary_call.h
+  include/grpc++/impl/codegen/completion_queue.h
+  include/grpc++/impl/codegen/completion_queue_tag.h
+  include/grpc++/impl/codegen/config.h
+  include/grpc++/impl/codegen/core_codegen_interface.h
+  include/grpc++/impl/codegen/create_auth_context.h
+  include/grpc++/impl/codegen/grpc_library.h
+  include/grpc++/impl/codegen/metadata_map.h
+  include/grpc++/impl/codegen/method_handler_impl.h
+  include/grpc++/impl/codegen/rpc_method.h
+  include/grpc++/impl/codegen/rpc_service_method.h
+  include/grpc++/impl/codegen/security/auth_context.h
+  include/grpc++/impl/codegen/serialization_traits.h
+  include/grpc++/impl/codegen/server_context.h
+  include/grpc++/impl/codegen/server_interface.h
+  include/grpc++/impl/codegen/service_type.h
+  include/grpc++/impl/codegen/slice.h
+  include/grpc++/impl/codegen/status.h
+  include/grpc++/impl/codegen/status_code_enum.h
+  include/grpc++/impl/codegen/string_ref.h
+  include/grpc++/impl/codegen/stub_options.h
+  include/grpc++/impl/codegen/sync_stream.h
+  include/grpc++/impl/codegen/time.h
+  include/grpc/impl/codegen/byte_buffer_reader.h
+  include/grpc/impl/codegen/compression_types.h
+  include/grpc/impl/codegen/connectivity_state.h
+  include/grpc/impl/codegen/exec_ctx_fwd.h
+  include/grpc/impl/codegen/grpc_types.h
+  include/grpc/impl/codegen/propagation_bits.h
+  include/grpc/impl/codegen/slice.h
+  include/grpc/impl/codegen/status.h
+  include/grpc/impl/codegen/atm.h
+  include/grpc/impl/codegen/atm_gcc_atomic.h
+  include/grpc/impl/codegen/atm_gcc_sync.h
+  include/grpc/impl/codegen/atm_windows.h
+  include/grpc/impl/codegen/gpr_slice.h
+  include/grpc/impl/codegen/gpr_types.h
+  include/grpc/impl/codegen/port_platform.h
+  include/grpc/impl/codegen/sync.h
+  include/grpc/impl/codegen/sync_generic.h
+  include/grpc/impl/codegen/sync_posix.h
+  include/grpc/impl/codegen/sync_windows.h
+  include/grpc++/impl/codegen/proto_utils.h
+  include/grpc++/impl/codegen/config_protobuf.h
+)
+  string(REPLACE "include/" "" _path ${_hdr})
+  get_filename_component(_path ${_path} PATH)
+  install(FILES ${_hdr}
+    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
+  )
+endforeach()
+
+endif (gRPC_BUILD_TESTS)
+
+add_library(grpc++_unsecure
+  src/cpp/client/insecure_credentials.cc
+  src/cpp/common/insecure_create_auth_context.cc
+  src/cpp/server/insecure_server_credentials.cc
+  src/cpp/client/channel_cc.cc
+  src/cpp/client/client_context.cc
+  src/cpp/client/create_channel.cc
+  src/cpp/client/create_channel_internal.cc
+  src/cpp/client/create_channel_posix.cc
+  src/cpp/client/credentials_cc.cc
+  src/cpp/client/generic_stub.cc
+  src/cpp/common/channel_arguments.cc
+  src/cpp/common/channel_filter.cc
+  src/cpp/common/completion_queue_cc.cc
+  src/cpp/common/core_codegen.cc
+  src/cpp/common/resource_quota_cc.cc
+  src/cpp/common/rpc_method.cc
+  src/cpp/common/version_cc.cc
+  src/cpp/server/async_generic_service.cc
+  src/cpp/server/channel_argument_option.cc
+  src/cpp/server/create_default_thread_pool.cc
+  src/cpp/server/dynamic_thread_pool.cc
+  src/cpp/server/health/default_health_check_service.cc
+  src/cpp/server/health/health.pb.c
+  src/cpp/server/health/health_check_service.cc
+  src/cpp/server/health/health_check_service_server_builder_option.cc
+  src/cpp/server/server_builder.cc
+  src/cpp/server/server_cc.cc
+  src/cpp/server/server_context.cc
+  src/cpp/server/server_credentials.cc
+  src/cpp/server/server_posix.cc
+  src/cpp/thread_manager/thread_manager.cc
+  src/cpp/util/byte_buffer_cc.cc
+  src/cpp/util/slice_cc.cc
+  src/cpp/util/status.cc
+  src/cpp/util/string_ref.cc
+  src/cpp/util/time_cc.cc
+  third_party/nanopb/pb_common.c
+  third_party/nanopb/pb_decode.c
+  third_party/nanopb/pb_encode.c
+  src/cpp/codegen/codegen_init.cc
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(grpc++_unsecure PROPERTIES COMPILE_PDB_NAME "grpc++_unsecure"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc++_unsecure.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(grpc++_unsecure
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc++_unsecure
+  ${_gRPC_BASELIB_LIBRARIES}
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr
+  grpc_unsecure
+)
+
+foreach(_hdr
+  include/grpc++/alarm.h
+  include/grpc++/channel.h
+  include/grpc++/client_context.h
+  include/grpc++/completion_queue.h
+  include/grpc++/create_channel.h
+  include/grpc++/create_channel_posix.h
+  include/grpc++/ext/health_check_service_server_builder_option.h
+  include/grpc++/generic/async_generic_service.h
+  include/grpc++/generic/generic_stub.h
+  include/grpc++/grpc++.h
+  include/grpc++/health_check_service_interface.h
+  include/grpc++/impl/call.h
+  include/grpc++/impl/channel_argument_option.h
+  include/grpc++/impl/client_unary_call.h
+  include/grpc++/impl/codegen/core_codegen.h
+  include/grpc++/impl/grpc_library.h
+  include/grpc++/impl/method_handler_impl.h
+  include/grpc++/impl/rpc_method.h
+  include/grpc++/impl/rpc_service_method.h
+  include/grpc++/impl/serialization_traits.h
+  include/grpc++/impl/server_builder_option.h
+  include/grpc++/impl/server_builder_plugin.h
+  include/grpc++/impl/server_initializer.h
+  include/grpc++/impl/service_type.h
+  include/grpc++/resource_quota.h
+  include/grpc++/security/auth_context.h
+  include/grpc++/security/auth_metadata_processor.h
+  include/grpc++/security/credentials.h
+  include/grpc++/security/server_credentials.h
+  include/grpc++/server.h
+  include/grpc++/server_builder.h
+  include/grpc++/server_context.h
+  include/grpc++/server_posix.h
+  include/grpc++/support/async_stream.h
+  include/grpc++/support/async_unary_call.h
+  include/grpc++/support/byte_buffer.h
+  include/grpc++/support/channel_arguments.h
+  include/grpc++/support/config.h
+  include/grpc++/support/slice.h
+  include/grpc++/support/status.h
+  include/grpc++/support/status_code_enum.h
+  include/grpc++/support/string_ref.h
+  include/grpc++/support/stub_options.h
+  include/grpc++/support/sync_stream.h
+  include/grpc++/support/time.h
+  include/grpc++/impl/codegen/async_stream.h
+  include/grpc++/impl/codegen/async_unary_call.h
+  include/grpc++/impl/codegen/call.h
+  include/grpc++/impl/codegen/call_hook.h
+  include/grpc++/impl/codegen/channel_interface.h
+  include/grpc++/impl/codegen/client_context.h
+  include/grpc++/impl/codegen/client_unary_call.h
+  include/grpc++/impl/codegen/completion_queue.h
+  include/grpc++/impl/codegen/completion_queue_tag.h
+  include/grpc++/impl/codegen/config.h
+  include/grpc++/impl/codegen/core_codegen_interface.h
+  include/grpc++/impl/codegen/create_auth_context.h
+  include/grpc++/impl/codegen/grpc_library.h
+  include/grpc++/impl/codegen/metadata_map.h
+  include/grpc++/impl/codegen/method_handler_impl.h
+  include/grpc++/impl/codegen/rpc_method.h
+  include/grpc++/impl/codegen/rpc_service_method.h
+  include/grpc++/impl/codegen/security/auth_context.h
+  include/grpc++/impl/codegen/serialization_traits.h
+  include/grpc++/impl/codegen/server_context.h
+  include/grpc++/impl/codegen/server_interface.h
+  include/grpc++/impl/codegen/service_type.h
+  include/grpc++/impl/codegen/slice.h
+  include/grpc++/impl/codegen/status.h
+  include/grpc++/impl/codegen/status_code_enum.h
+  include/grpc++/impl/codegen/string_ref.h
+  include/grpc++/impl/codegen/stub_options.h
+  include/grpc++/impl/codegen/sync_stream.h
+  include/grpc++/impl/codegen/time.h
+  include/grpc/impl/codegen/byte_buffer_reader.h
+  include/grpc/impl/codegen/compression_types.h
+  include/grpc/impl/codegen/connectivity_state.h
+  include/grpc/impl/codegen/exec_ctx_fwd.h
+  include/grpc/impl/codegen/grpc_types.h
+  include/grpc/impl/codegen/propagation_bits.h
+  include/grpc/impl/codegen/slice.h
+  include/grpc/impl/codegen/status.h
+  include/grpc/impl/codegen/atm.h
+  include/grpc/impl/codegen/atm_gcc_atomic.h
+  include/grpc/impl/codegen/atm_gcc_sync.h
+  include/grpc/impl/codegen/atm_windows.h
+  include/grpc/impl/codegen/gpr_slice.h
+  include/grpc/impl/codegen/gpr_types.h
+  include/grpc/impl/codegen/port_platform.h
+  include/grpc/impl/codegen/sync.h
+  include/grpc/impl/codegen/sync_generic.h
+  include/grpc/impl/codegen/sync_posix.h
+  include/grpc/impl/codegen/sync_windows.h
+)
+  string(REPLACE "include/" "" _path ${_hdr})
+  get_filename_component(_path ${_path} PATH)
+  install(FILES ${_hdr}
+    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
+  )
+endforeach()
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc++_unsecure EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+if (gRPC_BUILD_TESTS)
+
+add_library(grpc_benchmark
+  test/cpp/microbenchmarks/helpers.cc
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(grpc_benchmark PROPERTIES COMPILE_PDB_NAME "grpc_benchmark"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc_benchmark.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(grpc_benchmark
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc_benchmark
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  benchmark
+  grpc++
+  grpc_test_util
+  grpc
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_library(grpc_cli_libs
+  test/cpp/util/cli_call.cc
+  test/cpp/util/cli_credentials.cc
+  test/cpp/util/grpc_tool.cc
+  test/cpp/util/proto_file_parser.cc
+  test/cpp/util/service_describer.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.grpc.pb.h
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(grpc_cli_libs PROPERTIES COMPILE_PDB_NAME "grpc_cli_libs"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc_cli_libs.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/reflection/v1alpha/reflection.proto
+)
+
+target_include_directories(grpc_cli_libs
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc_cli_libs
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_proto_reflection_desc_db
+  grpc++
+  grpc
+)
+
+foreach(_hdr
+  include/grpc++/impl/codegen/config_protobuf.h
+)
+  string(REPLACE "include/" "" _path ${_hdr})
+  get_filename_component(_path ${_path} PATH)
+  install(FILES ${_hdr}
+    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
+  )
+endforeach()
+
+endif (gRPC_BUILD_TESTS)
+
+add_library(grpc_plugin_support
+  src/compiler/cpp_generator.cc
+  src/compiler/csharp_generator.cc
+  src/compiler/node_generator.cc
+  src/compiler/objective_c_generator.cc
+  src/compiler/php_generator.cc
+  src/compiler/python_generator.cc
+  src/compiler/ruby_generator.cc
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(grpc_plugin_support PROPERTIES COMPILE_PDB_NAME "grpc_plugin_support"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc_plugin_support.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(grpc_plugin_support
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc_plugin_support
+  ${_gRPC_PROTOBUF_PROTOC_LIBRARIES}
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+)
+
+foreach(_hdr
+  include/grpc++/impl/codegen/config_protobuf.h
+)
+  string(REPLACE "include/" "" _path ${_hdr})
+  get_filename_component(_path ${_path} PATH)
+  install(FILES ${_hdr}
+    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
+  )
+endforeach()
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc_plugin_support EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+if (gRPC_BUILD_TESTS)
+
+add_library(http2_client_main
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.h
+  test/cpp/interop/http2_client.cc
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(http2_client_main PROPERTIES COMPILE_PDB_NAME "http2_client_main"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/http2_client_main.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/empty.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/messages.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/test.proto
+)
+
+target_include_directories(http2_client_main
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(http2_client_main
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  grpc++_test_config
+)
+
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_library(interop_client_helper
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
+  test/cpp/interop/client_helper.cc
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(interop_client_helper PROPERTIES COMPILE_PDB_NAME "interop_client_helper"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/interop_client_helper.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/messages.proto
+)
+
+target_include_directories(interop_client_helper
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(interop_client_helper
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr
+)
+
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_library(interop_client_main
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.h
+  test/cpp/interop/client.cc
+  test/cpp/interop/interop_client.cc
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(interop_client_main PROPERTIES COMPILE_PDB_NAME "interop_client_main"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/interop_client_main.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/empty.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/messages.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/test.proto
+)
+
+target_include_directories(interop_client_main
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(interop_client_main
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  interop_client_helper
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  grpc++_test_config
+)
+
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_library(interop_server_helper
+  test/cpp/interop/server_helper.cc
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(interop_server_helper PROPERTIES COMPILE_PDB_NAME "interop_server_helper"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/interop_server_helper.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(interop_server_helper
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(interop_server_helper
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr
+)
+
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_library(interop_server_lib
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.h
+  test/cpp/interop/interop_server.cc
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(interop_server_lib PROPERTIES COMPILE_PDB_NAME "interop_server_lib"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/interop_server_lib.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/empty.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/messages.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/test.proto
+)
+
+target_include_directories(interop_server_lib
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(interop_server_lib
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  interop_server_helper
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  grpc++_test_config
+)
+
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_library(interop_server_main
+  test/cpp/interop/interop_server_bootstrap.cc
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(interop_server_main PROPERTIES COMPILE_PDB_NAME "interop_server_main"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/interop_server_main.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(interop_server_main
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(interop_server_main
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  interop_server_lib
+)
+
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_library(qps
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.grpc.pb.h
+  test/cpp/qps/benchmark_config.cc
+  test/cpp/qps/client_async.cc
+  test/cpp/qps/client_sync.cc
+  test/cpp/qps/driver.cc
+  test/cpp/qps/parse_json.cc
+  test/cpp/qps/qps_worker.cc
+  test/cpp/qps/report.cc
+  test/cpp/qps/server_async.cc
+  test/cpp/qps/server_sync.cc
+  test/cpp/qps/usage_timer.cc
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(qps PROPERTIES COMPILE_PDB_NAME "qps"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/qps.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/messages.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/payloads.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/stats.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/control.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/services.proto
+)
+
+target_include_directories(qps
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(qps
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc++_test_util
+  grpc++
+  grpc
+)
+
+
+endif (gRPC_BUILD_TESTS)
+
+add_library(grpc_csharp_ext SHARED
+  src/csharp/ext/grpc_csharp_ext.c
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(grpc_csharp_ext PROPERTIES COMPILE_PDB_NAME "grpc_csharp_ext"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc_csharp_ext.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(grpc_csharp_ext
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_csharp_ext
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc
+  gpr
+)
+
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc_csharp_ext EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+if (gRPC_BUILD_TESTS)
+
+add_library(ares
+  third_party/cares/cares/ares__close_sockets.c
+  third_party/cares/cares/ares__get_hostent.c
+  third_party/cares/cares/ares__read_line.c
+  third_party/cares/cares/ares__timeval.c
+  third_party/cares/cares/ares_cancel.c
+  third_party/cares/cares/ares_create_query.c
+  third_party/cares/cares/ares_data.c
+  third_party/cares/cares/ares_destroy.c
+  third_party/cares/cares/ares_expand_name.c
+  third_party/cares/cares/ares_expand_string.c
+  third_party/cares/cares/ares_fds.c
+  third_party/cares/cares/ares_free_hostent.c
+  third_party/cares/cares/ares_free_string.c
+  third_party/cares/cares/ares_getenv.c
+  third_party/cares/cares/ares_gethostbyaddr.c
+  third_party/cares/cares/ares_gethostbyname.c
+  third_party/cares/cares/ares_getnameinfo.c
+  third_party/cares/cares/ares_getopt.c
+  third_party/cares/cares/ares_getsock.c
+  third_party/cares/cares/ares_init.c
+  third_party/cares/cares/ares_library_init.c
+  third_party/cares/cares/ares_llist.c
+  third_party/cares/cares/ares_mkquery.c
+  third_party/cares/cares/ares_nowarn.c
+  third_party/cares/cares/ares_options.c
+  third_party/cares/cares/ares_parse_a_reply.c
+  third_party/cares/cares/ares_parse_aaaa_reply.c
+  third_party/cares/cares/ares_parse_mx_reply.c
+  third_party/cares/cares/ares_parse_naptr_reply.c
+  third_party/cares/cares/ares_parse_ns_reply.c
+  third_party/cares/cares/ares_parse_ptr_reply.c
+  third_party/cares/cares/ares_parse_soa_reply.c
+  third_party/cares/cares/ares_parse_srv_reply.c
+  third_party/cares/cares/ares_parse_txt_reply.c
+  third_party/cares/cares/ares_platform.c
+  third_party/cares/cares/ares_process.c
+  third_party/cares/cares/ares_query.c
+  third_party/cares/cares/ares_search.c
+  third_party/cares/cares/ares_send.c
+  third_party/cares/cares/ares_strcasecmp.c
+  third_party/cares/cares/ares_strdup.c
+  third_party/cares/cares/ares_strerror.c
+  third_party/cares/cares/ares_timeout.c
+  third_party/cares/cares/ares_version.c
+  third_party/cares/cares/ares_writev.c
+  third_party/cares/cares/bitncmp.c
+  third_party/cares/cares/inet_net_pton.c
+  third_party/cares/cares/inet_ntop.c
+  third_party/cares/cares/windows_port.c
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(ares PROPERTIES COMPILE_PDB_NAME "ares"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ares.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(ares
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(ares
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+)
+
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_library(bad_client_test
+  test/core/bad_client/bad_client.c
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(bad_client_test PROPERTIES COMPILE_PDB_NAME "bad_client_test"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/bad_client_test.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(bad_client_test
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(bad_client_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_library(bad_ssl_test_server
+  test/core/bad_ssl/server_common.c
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(bad_ssl_test_server PROPERTIES COMPILE_PDB_NAME "bad_ssl_test_server"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/bad_ssl_test_server.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(bad_ssl_test_server
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(bad_ssl_test_server
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_library(end2end_tests
+  test/core/end2end/end2end_tests.c
+  test/core/end2end/end2end_test_utils.c
+  test/core/end2end/tests/authority_not_supported.c
+  test/core/end2end/tests/bad_hostname.c
+  test/core/end2end/tests/bad_ping.c
+  test/core/end2end/tests/binary_metadata.c
+  test/core/end2end/tests/call_creds.c
+  test/core/end2end/tests/cancel_after_accept.c
+  test/core/end2end/tests/cancel_after_client_done.c
+  test/core/end2end/tests/cancel_after_invoke.c
+  test/core/end2end/tests/cancel_after_round_trip.c
+  test/core/end2end/tests/cancel_before_invoke.c
+  test/core/end2end/tests/cancel_in_a_vacuum.c
+  test/core/end2end/tests/cancel_with_status.c
+  test/core/end2end/tests/compressed_payload.c
+  test/core/end2end/tests/connectivity.c
+  test/core/end2end/tests/default_host.c
+  test/core/end2end/tests/disappearing_server.c
+  test/core/end2end/tests/empty_batch.c
+  test/core/end2end/tests/filter_call_init_fails.c
+  test/core/end2end/tests/filter_causes_close.c
+  test/core/end2end/tests/filter_latency.c
+  test/core/end2end/tests/graceful_server_shutdown.c
+  test/core/end2end/tests/high_initial_seqno.c
+  test/core/end2end/tests/hpack_size.c
+  test/core/end2end/tests/idempotent_request.c
+  test/core/end2end/tests/invoke_large_request.c
+  test/core/end2end/tests/keepalive_timeout.c
+  test/core/end2end/tests/large_metadata.c
+  test/core/end2end/tests/load_reporting_hook.c
+  test/core/end2end/tests/max_concurrent_streams.c
+  test/core/end2end/tests/max_connection_age.c
+  test/core/end2end/tests/max_connection_idle.c
+  test/core/end2end/tests/max_message_length.c
+  test/core/end2end/tests/negative_deadline.c
+  test/core/end2end/tests/network_status_change.c
+  test/core/end2end/tests/no_logging.c
+  test/core/end2end/tests/no_op.c
+  test/core/end2end/tests/payload.c
+  test/core/end2end/tests/ping.c
+  test/core/end2end/tests/ping_pong_streaming.c
+  test/core/end2end/tests/proxy_auth.c
+  test/core/end2end/tests/registered_call.c
+  test/core/end2end/tests/request_with_flags.c
+  test/core/end2end/tests/request_with_payload.c
+  test/core/end2end/tests/resource_quota_server.c
+  test/core/end2end/tests/server_finishes_request.c
+  test/core/end2end/tests/shutdown_finishes_calls.c
+  test/core/end2end/tests/shutdown_finishes_tags.c
+  test/core/end2end/tests/simple_cacheable_request.c
+  test/core/end2end/tests/simple_delayed_request.c
+  test/core/end2end/tests/simple_metadata.c
+  test/core/end2end/tests/simple_request.c
+  test/core/end2end/tests/streaming_error_response.c
+  test/core/end2end/tests/trailing_metadata.c
+  test/core/end2end/tests/workaround_cronet_compression.c
+  test/core/end2end/tests/write_buffering.c
+  test/core/end2end/tests/write_buffering_at_end.c
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(end2end_tests PROPERTIES COMPILE_PDB_NAME "end2end_tests"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/end2end_tests.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(end2end_tests
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(end2end_tests
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_library(end2end_nosec_tests
+  test/core/end2end/end2end_nosec_tests.c
+  test/core/end2end/end2end_test_utils.c
+  test/core/end2end/tests/authority_not_supported.c
+  test/core/end2end/tests/bad_hostname.c
+  test/core/end2end/tests/bad_ping.c
+  test/core/end2end/tests/binary_metadata.c
+  test/core/end2end/tests/cancel_after_accept.c
+  test/core/end2end/tests/cancel_after_client_done.c
+  test/core/end2end/tests/cancel_after_invoke.c
+  test/core/end2end/tests/cancel_after_round_trip.c
+  test/core/end2end/tests/cancel_before_invoke.c
+  test/core/end2end/tests/cancel_in_a_vacuum.c
+  test/core/end2end/tests/cancel_with_status.c
+  test/core/end2end/tests/compressed_payload.c
+  test/core/end2end/tests/connectivity.c
+  test/core/end2end/tests/default_host.c
+  test/core/end2end/tests/disappearing_server.c
+  test/core/end2end/tests/empty_batch.c
+  test/core/end2end/tests/filter_call_init_fails.c
+  test/core/end2end/tests/filter_causes_close.c
+  test/core/end2end/tests/filter_latency.c
+  test/core/end2end/tests/graceful_server_shutdown.c
+  test/core/end2end/tests/high_initial_seqno.c
+  test/core/end2end/tests/hpack_size.c
+  test/core/end2end/tests/idempotent_request.c
+  test/core/end2end/tests/invoke_large_request.c
+  test/core/end2end/tests/keepalive_timeout.c
+  test/core/end2end/tests/large_metadata.c
+  test/core/end2end/tests/load_reporting_hook.c
+  test/core/end2end/tests/max_concurrent_streams.c
+  test/core/end2end/tests/max_connection_age.c
+  test/core/end2end/tests/max_connection_idle.c
+  test/core/end2end/tests/max_message_length.c
+  test/core/end2end/tests/negative_deadline.c
+  test/core/end2end/tests/network_status_change.c
+  test/core/end2end/tests/no_logging.c
+  test/core/end2end/tests/no_op.c
+  test/core/end2end/tests/payload.c
+  test/core/end2end/tests/ping.c
+  test/core/end2end/tests/ping_pong_streaming.c
+  test/core/end2end/tests/proxy_auth.c
+  test/core/end2end/tests/registered_call.c
+  test/core/end2end/tests/request_with_flags.c
+  test/core/end2end/tests/request_with_payload.c
+  test/core/end2end/tests/resource_quota_server.c
+  test/core/end2end/tests/server_finishes_request.c
+  test/core/end2end/tests/shutdown_finishes_calls.c
+  test/core/end2end/tests/shutdown_finishes_tags.c
+  test/core/end2end/tests/simple_cacheable_request.c
+  test/core/end2end/tests/simple_delayed_request.c
+  test/core/end2end/tests/simple_metadata.c
+  test/core/end2end/tests/simple_request.c
+  test/core/end2end/tests/streaming_error_response.c
+  test/core/end2end/tests/trailing_metadata.c
+  test/core/end2end/tests/workaround_cronet_compression.c
+  test/core/end2end/tests/write_buffering.c
+  test/core/end2end/tests/write_buffering_at_end.c
+)
+
+if(WIN32 AND MSVC)
+  set_target_properties(end2end_nosec_tests PROPERTIES COMPILE_PDB_NAME "end2end_nosec_tests"
+    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  )
+  if (gRPC_INSTALL)
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/end2end_nosec_tests.pdb
+      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
+    )
+  endif()
+endif()
+
+
+target_include_directories(end2end_nosec_tests
+  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${ZLIB_INCLUDE_DIR}
+  PRIVATE ${BENCHMARK}/include
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(end2end_nosec_tests
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+
+endif (gRPC_BUILD_TESTS)
+
+if (gRPC_BUILD_TESTS)
+
+add_executable(alarm_test
+  test/core/surface/alarm_test.c
+)
+
+
+target_include_directories(alarm_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(alarm_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(algorithm_test
+  test/core/compression/algorithm_test.c
+)
+
+
+target_include_directories(algorithm_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(algorithm_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(alloc_test
+  test/core/support/alloc_test.c
+)
+
+
+target_include_directories(alloc_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(alloc_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(alpn_test
+  test/core/transport/chttp2/alpn_test.c
+)
+
+
+target_include_directories(alpn_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(alpn_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(arena_test
+  test/core/support/arena_test.c
+)
+
+
+target_include_directories(arena_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(arena_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(bad_server_response_test
+  test/core/end2end/bad_server_response_test.c
+)
+
+
+target_include_directories(bad_server_response_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(bad_server_response_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  test_tcp_server
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(bdp_estimator_test
+  test/core/transport/bdp_estimator_test.c
+)
+
+
+target_include_directories(bdp_estimator_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(bdp_estimator_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(bin_decoder_test
+  test/core/transport/chttp2/bin_decoder_test.c
+)
+
+
+target_include_directories(bin_decoder_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(bin_decoder_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(bin_encoder_test
+  test/core/transport/chttp2/bin_encoder_test.c
+)
+
+
+target_include_directories(bin_encoder_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(bin_encoder_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(census_context_test
+  test/core/census/context_test.c
+)
+
+
+target_include_directories(census_context_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(census_context_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(census_intrusive_hash_map_test
+  test/core/census/intrusive_hash_map_test.c
+)
+
+
+target_include_directories(census_intrusive_hash_map_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(census_intrusive_hash_map_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(census_resource_test
+  test/core/census/resource_test.c
+)
+
+
+target_include_directories(census_resource_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(census_resource_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(census_trace_context_test
+  test/core/census/trace_context_test.c
+)
+
+
+target_include_directories(census_trace_context_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(census_trace_context_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(channel_create_test
+  test/core/surface/channel_create_test.c
+)
+
+
+target_include_directories(channel_create_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(channel_create_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+
+add_executable(check_epollexclusive
+  test/build/check_epollexclusive.c
+)
+
+
+target_include_directories(check_epollexclusive
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(check_epollexclusive
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc
+  gpr
+)
+
+
+if (gRPC_INSTALL)
+  install(TARGETS check_epollexclusive EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+if (gRPC_BUILD_TESTS)
+
+add_executable(chttp2_hpack_encoder_test
+  test/core/transport/chttp2/hpack_encoder_test.c
+)
+
+
+target_include_directories(chttp2_hpack_encoder_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(chttp2_hpack_encoder_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(chttp2_stream_map_test
+  test/core/transport/chttp2/stream_map_test.c
+)
+
+
+target_include_directories(chttp2_stream_map_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(chttp2_stream_map_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(chttp2_varint_test
+  test/core/transport/chttp2/varint_test.c
+)
+
+
+target_include_directories(chttp2_varint_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(chttp2_varint_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(combiner_test
+  test/core/iomgr/combiner_test.c
+)
+
+
+target_include_directories(combiner_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(combiner_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(compression_test
+  test/core/compression/compression_test.c
+)
+
+
+target_include_directories(compression_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(compression_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(concurrent_connectivity_test
+  test/core/surface/concurrent_connectivity_test.c
+)
+
+
+target_include_directories(concurrent_connectivity_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(concurrent_connectivity_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(connection_refused_test
+  test/core/end2end/connection_refused_test.c
+)
+
+
+target_include_directories(connection_refused_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(connection_refused_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(dns_resolver_connectivity_test
+  test/core/client_channel/resolvers/dns_resolver_connectivity_test.c
+)
+
+
+target_include_directories(dns_resolver_connectivity_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(dns_resolver_connectivity_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(dns_resolver_test
+  test/core/client_channel/resolvers/dns_resolver_test.c
+)
+
+
+target_include_directories(dns_resolver_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(dns_resolver_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(dualstack_socket_test
+  test/core/end2end/dualstack_socket_test.c
+)
+
+
+target_include_directories(dualstack_socket_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(dualstack_socket_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(endpoint_pair_test
+  test/core/iomgr/endpoint_pair_test.c
+)
+
+
+target_include_directories(endpoint_pair_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(endpoint_pair_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(error_test
+  test/core/iomgr/error_test.c
+)
+
+
+target_include_directories(error_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(error_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX)
+
+add_executable(ev_epollsig_linux_test
+  test/core/iomgr/ev_epollsig_linux_test.c
+)
+
+
+target_include_directories(ev_epollsig_linux_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(ev_epollsig_linux_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(fake_resolver_test
+  test/core/client_channel/resolvers/fake_resolver_test.c
+)
+
+
+target_include_directories(fake_resolver_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(fake_resolver_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(fd_conservation_posix_test
+  test/core/iomgr/fd_conservation_posix_test.c
+)
+
+
+target_include_directories(fd_conservation_posix_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(fd_conservation_posix_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(fd_posix_test
+  test/core/iomgr/fd_posix_test.c
+)
+
+
+target_include_directories(fd_posix_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(fd_posix_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(fling_client
+  test/core/fling/client.c
+)
+
+
+target_include_directories(fling_client
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(fling_client
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(fling_server
+  test/core/fling/server.c
+)
+
+
+target_include_directories(fling_server
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(fling_server
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(fling_stream_test
+  test/core/fling/fling_stream_test.c
+)
+
+
+target_include_directories(fling_stream_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(fling_stream_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(fling_test
+  test/core/fling/fling_test.c
+)
+
+
+target_include_directories(fling_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(fling_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+
+add_executable(gen_hpack_tables
+  tools/codegen/core/gen_hpack_tables.c
+)
+
+
+target_include_directories(gen_hpack_tables
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gen_hpack_tables
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr
+  grpc
+)
+
+
+if (gRPC_INSTALL)
+  install(TARGETS gen_hpack_tables EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+
+add_executable(gen_legal_metadata_characters
+  tools/codegen/core/gen_legal_metadata_characters.c
+)
+
+
+target_include_directories(gen_legal_metadata_characters
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gen_legal_metadata_characters
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+)
+
+
+if (gRPC_INSTALL)
+  install(TARGETS gen_legal_metadata_characters EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+
+add_executable(gen_percent_encoding_tables
+  tools/codegen/core/gen_percent_encoding_tables.c
+)
+
+
+target_include_directories(gen_percent_encoding_tables
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gen_percent_encoding_tables
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+)
+
+
+if (gRPC_INSTALL)
+  install(TARGETS gen_percent_encoding_tables EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(goaway_server_test
+  test/core/end2end/goaway_server_test.c
+)
+
+
+target_include_directories(goaway_server_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(goaway_server_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(gpr_avl_test
+  test/core/support/avl_test.c
+)
+
+
+target_include_directories(gpr_avl_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_avl_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(gpr_backoff_test
+  test/core/support/backoff_test.c
+)
+
+
+target_include_directories(gpr_backoff_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_backoff_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(gpr_cmdline_test
+  test/core/support/cmdline_test.c
+)
+
+
+target_include_directories(gpr_cmdline_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_cmdline_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(gpr_cpu_test
+  test/core/support/cpu_test.c
+)
+
+
+target_include_directories(gpr_cpu_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_cpu_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(gpr_env_test
+  test/core/support/env_test.c
+)
+
+
+target_include_directories(gpr_env_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_env_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(gpr_histogram_test
+  test/core/support/histogram_test.c
+)
+
+
+target_include_directories(gpr_histogram_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_histogram_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(gpr_host_port_test
+  test/core/support/host_port_test.c
+)
+
+
+target_include_directories(gpr_host_port_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_host_port_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(gpr_log_test
+  test/core/support/log_test.c
+)
+
+
+target_include_directories(gpr_log_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_log_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(gpr_mpscq_test
+  test/core/support/mpscq_test.c
+)
+
+
+target_include_directories(gpr_mpscq_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_mpscq_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(gpr_spinlock_test
+  test/core/support/spinlock_test.c
+)
+
+
+target_include_directories(gpr_spinlock_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_spinlock_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(gpr_stack_lockfree_test
+  test/core/support/stack_lockfree_test.c
+)
+
+
+target_include_directories(gpr_stack_lockfree_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_stack_lockfree_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(gpr_string_test
+  test/core/support/string_test.c
+)
+
+
+target_include_directories(gpr_string_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_string_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(gpr_sync_test
+  test/core/support/sync_test.c
+)
+
+
+target_include_directories(gpr_sync_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_sync_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(gpr_thd_test
+  test/core/support/thd_test.c
+)
+
+
+target_include_directories(gpr_thd_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_thd_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(gpr_time_test
+  test/core/support/time_test.c
+)
+
+
+target_include_directories(gpr_time_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_time_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(gpr_tls_test
+  test/core/support/tls_test.c
+)
+
+
+target_include_directories(gpr_tls_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_tls_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(gpr_useful_test
+  test/core/support/useful_test.c
+)
+
+
+target_include_directories(gpr_useful_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(gpr_useful_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(grpc_auth_context_test
+  test/core/security/auth_context_test.c
+)
+
+
+target_include_directories(grpc_auth_context_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_auth_context_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(grpc_b64_test
+  test/core/slice/b64_test.c
+)
+
+
+target_include_directories(grpc_b64_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_b64_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(grpc_byte_buffer_reader_test
+  test/core/surface/byte_buffer_reader_test.c
+)
+
+
+target_include_directories(grpc_byte_buffer_reader_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_byte_buffer_reader_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(grpc_channel_args_test
+  test/core/channel/channel_args_test.c
+)
+
+
+target_include_directories(grpc_channel_args_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_channel_args_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(grpc_channel_stack_test
+  test/core/channel/channel_stack_test.c
+)
+
+
+target_include_directories(grpc_channel_stack_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_channel_stack_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(grpc_completion_queue_test
+  test/core/surface/completion_queue_test.c
+)
+
+
+target_include_directories(grpc_completion_queue_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_completion_queue_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(grpc_completion_queue_threading_test
+  test/core/surface/completion_queue_threading_test.c
+)
+
+
+target_include_directories(grpc_completion_queue_threading_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_completion_queue_threading_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+
+add_executable(grpc_create_jwt
+  test/core/security/create_jwt.c
+)
+
+
+target_include_directories(grpc_create_jwt
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_create_jwt
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc
+  gpr
+)
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc_create_jwt EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+if (gRPC_BUILD_TESTS)
+
+add_executable(grpc_credentials_test
+  test/core/security/credentials_test.c
+)
+
+
+target_include_directories(grpc_credentials_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_credentials_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(grpc_fetch_oauth2
+  test/core/security/fetch_oauth2.c
+)
+
+
+target_include_directories(grpc_fetch_oauth2
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_fetch_oauth2
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(grpc_invalid_channel_args_test
+  test/core/surface/invalid_channel_args_test.c
+)
+
+
+target_include_directories(grpc_invalid_channel_args_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_invalid_channel_args_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(grpc_json_token_test
+  test/core/security/json_token_test.c
+)
+
+
+target_include_directories(grpc_json_token_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_json_token_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(grpc_jwt_verifier_test
+  test/core/security/jwt_verifier_test.c
+)
+
+
+target_include_directories(grpc_jwt_verifier_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_jwt_verifier_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+
+add_executable(grpc_print_google_default_creds_token
+  test/core/security/print_google_default_creds_token.c
+)
+
+
+target_include_directories(grpc_print_google_default_creds_token
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_print_google_default_creds_token
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc
+  gpr
+)
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc_print_google_default_creds_token EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+if (gRPC_BUILD_TESTS)
+
+add_executable(grpc_security_connector_test
+  test/core/security/security_connector_test.c
+)
+
+
+target_include_directories(grpc_security_connector_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_security_connector_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+
+add_executable(grpc_verify_jwt
+  test/core/security/verify_jwt.c
+)
+
+
+target_include_directories(grpc_verify_jwt
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(grpc_verify_jwt
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc
+  gpr
+)
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc_verify_jwt EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX)
+
+add_executable(handshake_client
+  test/core/handshake/client_ssl.c
+)
+
+
+target_include_directories(handshake_client
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(handshake_client
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX)
+
+add_executable(handshake_server
+  test/core/handshake/server_ssl.c
+)
+
+
+target_include_directories(handshake_server
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(handshake_server
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(hpack_parser_test
+  test/core/transport/chttp2/hpack_parser_test.c
+)
+
+
+target_include_directories(hpack_parser_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(hpack_parser_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(hpack_table_test
+  test/core/transport/chttp2/hpack_table_test.c
+)
+
+
+target_include_directories(hpack_table_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(hpack_table_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(http_parser_test
+  test/core/http/parser_test.c
+)
+
+
+target_include_directories(http_parser_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(http_parser_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(httpcli_format_request_test
+  test/core/http/format_request_test.c
+)
+
+
+target_include_directories(httpcli_format_request_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(httpcli_format_request_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(httpcli_test
+  test/core/http/httpcli_test.c
+)
+
+
+target_include_directories(httpcli_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(httpcli_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX)
+
+add_executable(httpscli_test
+  test/core/http/httpscli_test.c
+)
+
+
+target_include_directories(httpscli_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(httpscli_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(init_test
+  test/core/surface/init_test.c
+)
+
+
+target_include_directories(init_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(init_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(invalid_call_argument_test
+  test/core/end2end/invalid_call_argument_test.c
+)
+
+
+target_include_directories(invalid_call_argument_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(invalid_call_argument_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(json_rewrite
+  test/core/json/json_rewrite.c
+)
+
+
+target_include_directories(json_rewrite
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(json_rewrite
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(json_rewrite_test
+  test/core/json/json_rewrite_test.c
+)
+
+
+target_include_directories(json_rewrite_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(json_rewrite_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(json_stream_error_test
+  test/core/json/json_stream_error_test.c
+)
+
+
+target_include_directories(json_stream_error_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(json_stream_error_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(json_test
+  test/core/json/json_test.c
+)
+
+
+target_include_directories(json_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(json_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(lame_client_test
+  test/core/surface/lame_client_test.c
+)
+
+
+target_include_directories(lame_client_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(lame_client_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(lb_policies_test
+  test/core/client_channel/lb_policies_test.c
+)
+
+
+target_include_directories(lb_policies_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(lb_policies_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(load_file_test
+  test/core/iomgr/load_file_test.c
+)
+
+
+target_include_directories(load_file_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(load_file_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(memory_profile_client
+  test/core/memory_usage/client.c
+)
+
+
+target_include_directories(memory_profile_client
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(memory_profile_client
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(memory_profile_server
+  test/core/memory_usage/server.c
+)
+
+
+target_include_directories(memory_profile_server
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(memory_profile_server
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(memory_profile_test
+  test/core/memory_usage/memory_usage_test.c
+)
+
+
+target_include_directories(memory_profile_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(memory_profile_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(message_compress_test
+  test/core/compression/message_compress_test.c
+)
+
+
+target_include_directories(message_compress_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(message_compress_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(minimal_stack_is_minimal_test
+  test/core/channel/minimal_stack_is_minimal_test.c
+)
+
+
+target_include_directories(minimal_stack_is_minimal_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(minimal_stack_is_minimal_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(mlog_test
+  test/core/census/mlog_test.c
+)
+
+
+target_include_directories(mlog_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(mlog_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(multiple_server_queues_test
+  test/core/end2end/multiple_server_queues_test.c
+)
+
+
+target_include_directories(multiple_server_queues_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(multiple_server_queues_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(murmur_hash_test
+  test/core/support/murmur_hash_test.c
+)
+
+
+target_include_directories(murmur_hash_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(murmur_hash_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(no_server_test
+  test/core/end2end/no_server_test.c
+)
+
+
+target_include_directories(no_server_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(no_server_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(num_external_connectivity_watchers_test
+  test/core/surface/num_external_connectivity_watchers_test.c
+)
+
+
+target_include_directories(num_external_connectivity_watchers_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(num_external_connectivity_watchers_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(parse_address_test
+  test/core/client_channel/parse_address_test.c
+)
+
+
+target_include_directories(parse_address_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(parse_address_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(percent_encoding_test
+  test/core/slice/percent_encoding_test.c
+)
+
+
+target_include_directories(percent_encoding_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(percent_encoding_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX)
+
+add_executable(pollset_set_test
+  test/core/iomgr/pollset_set_test.c
+)
+
+
+target_include_directories(pollset_set_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(pollset_set_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(resolve_address_posix_test
+  test/core/iomgr/resolve_address_posix_test.c
+)
+
+
+target_include_directories(resolve_address_posix_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(resolve_address_posix_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(resolve_address_test
+  test/core/iomgr/resolve_address_test.c
+)
+
+
+target_include_directories(resolve_address_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(resolve_address_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(resource_quota_test
+  test/core/iomgr/resource_quota_test.c
+)
+
+
+target_include_directories(resource_quota_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(resource_quota_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(secure_channel_create_test
+  test/core/surface/secure_channel_create_test.c
+)
+
+
+target_include_directories(secure_channel_create_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(secure_channel_create_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(secure_endpoint_test
+  test/core/security/secure_endpoint_test.c
+)
+
+
+target_include_directories(secure_endpoint_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(secure_endpoint_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(sequential_connectivity_test
+  test/core/surface/sequential_connectivity_test.c
+)
+
+
+target_include_directories(sequential_connectivity_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(sequential_connectivity_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(server_chttp2_test
+  test/core/surface/server_chttp2_test.c
+)
+
+
+target_include_directories(server_chttp2_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(server_chttp2_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(server_test
+  test/core/surface/server_test.c
+)
+
+
+target_include_directories(server_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(server_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(slice_buffer_test
+  test/core/slice/slice_buffer_test.c
+)
+
+
+target_include_directories(slice_buffer_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(slice_buffer_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(slice_hash_table_test
+  test/core/slice/slice_hash_table_test.c
+)
+
+
+target_include_directories(slice_hash_table_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(slice_hash_table_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(slice_string_helpers_test
+  test/core/slice/slice_string_helpers_test.c
+)
+
+
+target_include_directories(slice_string_helpers_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(slice_string_helpers_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(slice_test
+  test/core/slice/slice_test.c
+)
+
+
+target_include_directories(slice_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(slice_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(sockaddr_resolver_test
+  test/core/client_channel/resolvers/sockaddr_resolver_test.c
+)
+
+
+target_include_directories(sockaddr_resolver_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(sockaddr_resolver_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(sockaddr_utils_test
+  test/core/iomgr/sockaddr_utils_test.c
+)
+
+
+target_include_directories(sockaddr_utils_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(sockaddr_utils_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(socket_utils_test
+  test/core/iomgr/socket_utils_test.c
+)
+
+
+target_include_directories(socket_utils_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(socket_utils_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(status_conversion_test
+  test/core/transport/status_conversion_test.c
+)
+
+
+target_include_directories(status_conversion_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(status_conversion_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(stream_compression_test
+  test/core/compression/stream_compression_test.c
+)
+
+
+target_include_directories(stream_compression_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(stream_compression_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(stream_owned_slice_test
+  test/core/transport/stream_owned_slice_test.c
+)
+
+
+target_include_directories(stream_owned_slice_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(stream_owned_slice_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(tcp_client_posix_test
+  test/core/iomgr/tcp_client_posix_test.c
+)
+
+
+target_include_directories(tcp_client_posix_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(tcp_client_posix_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(tcp_client_uv_test
+  test/core/iomgr/tcp_client_uv_test.c
+)
+
+
+target_include_directories(tcp_client_uv_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(tcp_client_uv_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(tcp_posix_test
+  test/core/iomgr/tcp_posix_test.c
+)
+
+
+target_include_directories(tcp_posix_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(tcp_posix_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(tcp_server_posix_test
+  test/core/iomgr/tcp_server_posix_test.c
+)
+
+
+target_include_directories(tcp_server_posix_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(tcp_server_posix_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(tcp_server_uv_test
+  test/core/iomgr/tcp_server_uv_test.c
+)
+
+
+target_include_directories(tcp_server_uv_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(tcp_server_uv_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(time_averaged_stats_test
+  test/core/iomgr/time_averaged_stats_test.c
+)
+
+
+target_include_directories(time_averaged_stats_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(time_averaged_stats_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(timeout_encoding_test
+  test/core/transport/timeout_encoding_test.c
+)
+
+
+target_include_directories(timeout_encoding_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(timeout_encoding_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(timer_heap_test
+  test/core/iomgr/timer_heap_test.c
+)
+
+
+target_include_directories(timer_heap_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(timer_heap_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(timer_list_test
+  test/core/iomgr/timer_list_test.c
+)
+
+
+target_include_directories(timer_list_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(timer_list_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(transport_connectivity_state_test
+  test/core/transport/connectivity_state_test.c
+)
+
+
+target_include_directories(transport_connectivity_state_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(transport_connectivity_state_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(transport_metadata_test
+  test/core/transport/metadata_test.c
+)
+
+
+target_include_directories(transport_metadata_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(transport_metadata_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(transport_pid_controller_test
+  test/core/transport/pid_controller_test.c
+)
+
+
+target_include_directories(transport_pid_controller_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(transport_pid_controller_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(transport_security_test
+  test/core/tsi/transport_security_test.c
+)
+
+
+target_include_directories(transport_security_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(transport_security_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(udp_server_test
+  test/core/iomgr/udp_server_test.c
+)
+
+
+target_include_directories(udp_server_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(udp_server_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(uri_parser_test
+  test/core/client_channel/uri_parser_test.c
+)
+
+
+target_include_directories(uri_parser_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(uri_parser_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(wakeup_fd_cv_test
+  test/core/iomgr/wakeup_fd_cv_test.c
+)
+
+
+target_include_directories(wakeup_fd_cv_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(wakeup_fd_cv_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(alarm_cpp_test
+  test/cpp/common/alarm_cpp_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(alarm_cpp_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(alarm_cpp_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(async_end2end_test
+  test/cpp/end2end/async_end2end_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(async_end2end_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(async_end2end_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(auth_property_iterator_test
+  test/cpp/common/auth_property_iterator_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(auth_property_iterator_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(auth_property_iterator_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(bm_arena
+  test/cpp/microbenchmarks/bm_arena.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(bm_arena
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(bm_arena
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_benchmark
+  benchmark
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(bm_call_create
+  test/cpp/microbenchmarks/bm_call_create.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(bm_call_create
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(bm_call_create
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_benchmark
+  benchmark
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(bm_chttp2_hpack
+  test/cpp/microbenchmarks/bm_chttp2_hpack.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(bm_chttp2_hpack
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(bm_chttp2_hpack
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_benchmark
+  benchmark
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(bm_chttp2_transport
+  test/cpp/microbenchmarks/bm_chttp2_transport.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(bm_chttp2_transport
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(bm_chttp2_transport
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_benchmark
+  benchmark
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(bm_closure
+  test/cpp/microbenchmarks/bm_closure.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(bm_closure
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(bm_closure
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_benchmark
+  benchmark
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(bm_cq
+  test/cpp/microbenchmarks/bm_cq.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(bm_cq
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(bm_cq
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_benchmark
+  benchmark
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(bm_cq_multiple_threads
+  test/cpp/microbenchmarks/bm_cq_multiple_threads.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(bm_cq_multiple_threads
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(bm_cq_multiple_threads
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_benchmark
+  benchmark
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(bm_error
+  test/cpp/microbenchmarks/bm_error.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(bm_error
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(bm_error
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_benchmark
+  benchmark
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(bm_fullstack_streaming_ping_pong
+  test/cpp/microbenchmarks/bm_fullstack_streaming_ping_pong.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(bm_fullstack_streaming_ping_pong
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(bm_fullstack_streaming_ping_pong
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_benchmark
+  benchmark
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(bm_fullstack_streaming_pump
+  test/cpp/microbenchmarks/bm_fullstack_streaming_pump.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(bm_fullstack_streaming_pump
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(bm_fullstack_streaming_pump
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_benchmark
+  benchmark
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(bm_fullstack_trickle
+  test/cpp/microbenchmarks/bm_fullstack_trickle.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(bm_fullstack_trickle
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(bm_fullstack_trickle
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_benchmark
+  benchmark
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(bm_fullstack_unary_ping_pong
+  test/cpp/microbenchmarks/bm_fullstack_unary_ping_pong.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(bm_fullstack_unary_ping_pong
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(bm_fullstack_unary_ping_pong
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_benchmark
+  benchmark
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(bm_metadata
+  test/cpp/microbenchmarks/bm_metadata.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(bm_metadata
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(bm_metadata
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_benchmark
+  benchmark
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(bm_pollset
+  test/cpp/microbenchmarks/bm_pollset.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(bm_pollset
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(bm_pollset
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_benchmark
+  benchmark
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(channel_arguments_test
+  test/cpp/common/channel_arguments_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(channel_arguments_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(channel_arguments_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++
+  grpc
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(channel_filter_test
+  test/cpp/common/channel_filter_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(channel_filter_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(channel_filter_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++
+  grpc
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(cli_call_test
+  test/cpp/util/cli_call_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(cli_call_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(cli_call_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_cli_libs
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(client_crash_test
+  test/cpp/end2end/client_crash_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(client_crash_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(client_crash_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(client_crash_test_server
+  test/cpp/end2end/client_crash_test_server.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(client_crash_test_server
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(client_crash_test_server
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(client_lb_end2end_test
+  test/cpp/end2end/client_lb_end2end_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(client_lb_end2end_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(client_lb_end2end_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(codegen_test_full
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.grpc.pb.h
+  test/cpp/codegen/codegen_test_full.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/control.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/messages.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/payloads.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/services.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/stats.proto
+)
+
+target_include_directories(codegen_test_full
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(codegen_test_full
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++
+  grpc
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(codegen_test_minimal
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.grpc.pb.h
+  test/cpp/codegen/codegen_test_minimal.cc
+  src/cpp/codegen/codegen_init.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/control.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/messages.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/payloads.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/services.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/stats.proto
+)
+
+target_include_directories(codegen_test_minimal
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(codegen_test_minimal
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(credentials_test
+  test/cpp/client/credentials_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(credentials_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(credentials_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++
+  grpc
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(cxx_byte_buffer_test
+  test/cpp/util/byte_buffer_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(cxx_byte_buffer_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(cxx_byte_buffer_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(cxx_slice_test
+  test/cpp/util/slice_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(cxx_slice_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(cxx_slice_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(cxx_string_ref_test
+  test/cpp/util/string_ref_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(cxx_string_ref_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(cxx_string_ref_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++
+  grpc
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(cxx_time_test
+  test/cpp/util/time_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(cxx_time_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(cxx_time_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(end2end_test
+  test/cpp/end2end/end2end_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(end2end_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(end2end_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(error_details_test
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.h
+  test/cpp/util/error_details_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/echo_messages.proto
+)
+
+target_include_directories(error_details_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(error_details_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_error_details
+  grpc++
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(filter_end2end_test
+  test/cpp/end2end/filter_end2end_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(filter_end2end_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(filter_end2end_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(generic_end2end_test
+  test/cpp/end2end/generic_end2end_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(generic_end2end_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(generic_end2end_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(golden_file_test
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/compiler_test.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/compiler_test.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/compiler_test.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/compiler_test.grpc.pb.h
+  test/cpp/codegen/golden_file_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/compiler_test.proto
+)
+
+target_include_directories(golden_file_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(golden_file_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++
+  grpc
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(grpc_cli
+  test/cpp/util/grpc_cli.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(grpc_cli
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc_cli
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_cli_libs
+  grpc++_proto_reflection_desc_db
+  grpc++
+  grpc
+  gpr
+  grpc++_test_config
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+
+add_executable(grpc_cpp_plugin
+  src/compiler/cpp_plugin.cc
+)
+
+
+target_include_directories(grpc_cpp_plugin
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc_cpp_plugin
+  ${_gRPC_PROTOBUF_PROTOC_LIBRARIES}
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_plugin_support
+)
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc_cpp_plugin EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+
+add_executable(grpc_csharp_plugin
+  src/compiler/csharp_plugin.cc
+)
+
+
+target_include_directories(grpc_csharp_plugin
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc_csharp_plugin
+  ${_gRPC_PROTOBUF_PROTOC_LIBRARIES}
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_plugin_support
+)
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc_csharp_plugin EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+
+add_executable(grpc_node_plugin
+  src/compiler/node_plugin.cc
+)
+
+
+target_include_directories(grpc_node_plugin
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc_node_plugin
+  ${_gRPC_PROTOBUF_PROTOC_LIBRARIES}
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_plugin_support
+)
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc_node_plugin EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+
+add_executable(grpc_objective_c_plugin
+  src/compiler/objective_c_plugin.cc
+)
+
+
+target_include_directories(grpc_objective_c_plugin
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc_objective_c_plugin
+  ${_gRPC_PROTOBUF_PROTOC_LIBRARIES}
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_plugin_support
+)
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc_objective_c_plugin EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+
+add_executable(grpc_php_plugin
+  src/compiler/php_plugin.cc
+)
+
+
+target_include_directories(grpc_php_plugin
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc_php_plugin
+  ${_gRPC_PROTOBUF_PROTOC_LIBRARIES}
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_plugin_support
+)
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc_php_plugin EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+
+add_executable(grpc_python_plugin
+  src/compiler/python_plugin.cc
+)
+
+
+target_include_directories(grpc_python_plugin
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc_python_plugin
+  ${_gRPC_PROTOBUF_PROTOC_LIBRARIES}
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_plugin_support
+)
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc_python_plugin EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+
+add_executable(grpc_ruby_plugin
+  src/compiler/ruby_plugin.cc
+)
+
+
+target_include_directories(grpc_ruby_plugin
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc_ruby_plugin
+  ${_gRPC_PROTOBUF_PROTOC_LIBRARIES}
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_plugin_support
+)
+
+
+if (gRPC_INSTALL)
+  install(TARGETS grpc_ruby_plugin EXPORT gRPCTargets
+    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
+  )
+endif()
+
+if (gRPC_BUILD_TESTS)
+
+add_executable(grpc_tool_test
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.h
+  test/cpp/util/grpc_tool_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/echo.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/echo_messages.proto
+)
+
+target_include_directories(grpc_tool_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpc_tool_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_cli_libs
+  grpc++_proto_reflection_desc_db
+  grpc++_reflection
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(grpclb_api_test
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.grpc.pb.h
+  test/cpp/grpclb/grpclb_api_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/lb/v1/load_balancer.proto
+)
+
+target_include_directories(grpclb_api_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpclb_api_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(grpclb_end2end_test
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.grpc.pb.h
+  test/cpp/end2end/grpclb_end2end_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/lb/v1/load_balancer.proto
+)
+
+target_include_directories(grpclb_end2end_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpclb_end2end_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(grpclb_test
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.grpc.pb.h
+  test/cpp/grpclb/grpclb_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/lb/v1/load_balancer.proto
+)
+
+target_include_directories(grpclb_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(grpclb_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(health_service_end2end_test
+  test/cpp/end2end/health_service_end2end_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(health_service_end2end_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(health_service_end2end_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(http2_client
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(http2_client
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(http2_client
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  http2_client_main
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  grpc++_test_config
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(hybrid_end2end_test
+  test/cpp/end2end/hybrid_end2end_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(hybrid_end2end_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(hybrid_end2end_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(interop_client
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(interop_client
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(interop_client
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  interop_client_main
+  interop_client_helper
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  grpc++_test_config
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(interop_server
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(interop_server
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(interop_server
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  interop_server_main
+  interop_server_helper
+  interop_server_lib
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  grpc++_test_config
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(interop_test
+  test/cpp/interop/interop_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(interop_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(interop_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+  grpc++_test_config
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(json_run_localhost
+  test/cpp/qps/json_run_localhost.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(json_run_localhost
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(json_run_localhost
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  grpc++_test_config
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(memory_test
+  test/core/support/memory_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(memory_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(memory_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(metrics_client
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/metrics.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/metrics.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/metrics.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/metrics.grpc.pb.h
+  test/cpp/interop/metrics_client.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/metrics.proto
+)
+
+target_include_directories(metrics_client
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(metrics_client
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++
+  grpc
+  gpr
+  grpc++_test_config
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(mock_test
+  test/cpp/end2end/mock_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(mock_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(mock_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(noop-benchmark
+  test/cpp/microbenchmarks/noop-benchmark.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(noop-benchmark
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(noop-benchmark
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  benchmark
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(proto_server_reflection_test
+  test/cpp/end2end/proto_server_reflection_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(proto_server_reflection_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(proto_server_reflection_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_proto_reflection_desc_db
+  grpc++_reflection
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(proto_utils_test
+  test/cpp/codegen/proto_utils_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(proto_utils_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(proto_utils_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++
+  grpc
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(qps_interarrival_test
+  test/cpp/qps/qps_interarrival_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(qps_interarrival_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(qps_interarrival_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  qps
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  grpc++_test_config
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(qps_json_driver
+  test/cpp/qps/qps_json_driver.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(qps_json_driver
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(qps_json_driver
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  qps
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  grpc++_test_config
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(qps_openloop_test
+  test/cpp/qps/qps_openloop_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(qps_openloop_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(qps_openloop_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  qps
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  grpc++_test_config
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(qps_worker
+  test/cpp/qps/worker.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(qps_worker
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(qps_worker
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  qps
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  grpc++_test_config
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(reconnect_interop_client
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.h
+  test/cpp/interop/reconnect_interop_client.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/empty.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/messages.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/test.proto
+)
+
+target_include_directories(reconnect_interop_client
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(reconnect_interop_client
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  grpc++_test_config
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(reconnect_interop_server
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.h
+  test/cpp/interop/reconnect_interop_server.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/empty.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/messages.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/test.proto
+)
+
+target_include_directories(reconnect_interop_server
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(reconnect_interop_server
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  reconnect_server
+  test_tcp_server
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  grpc++_test_config
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(secure_auth_context_test
+  test/cpp/common/secure_auth_context_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(secure_auth_context_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(secure_auth_context_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(secure_sync_unary_ping_pong_test
+  test/cpp/qps/secure_sync_unary_ping_pong_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(secure_sync_unary_ping_pong_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(secure_sync_unary_ping_pong_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  qps
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  grpc++_test_config
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(server_builder_plugin_test
+  test/cpp/end2end/server_builder_plugin_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(server_builder_plugin_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(server_builder_plugin_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(server_builder_test
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.grpc.pb.h
+  test/cpp/server/server_builder_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/echo_messages.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/echo.proto
+)
+
+target_include_directories(server_builder_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(server_builder_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  gpr_test_util
+  grpc++
+  grpc
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(server_context_test_spouse_test
+  test/cpp/test/server_context_test_spouse_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(server_context_test_spouse_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(server_context_test_spouse_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(server_crash_test
+  test/cpp/end2end/server_crash_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(server_crash_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(server_crash_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(server_crash_test_client
+  test/cpp/end2end/server_crash_test_client.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(server_crash_test_client
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(server_crash_test_client
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(server_request_call_test
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.grpc.pb.h
+  test/cpp/server/server_request_call_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/echo_messages.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/echo.proto
+)
+
+target_include_directories(server_request_call_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(server_request_call_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  gpr_test_util
+  grpc++
+  grpc
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(shutdown_test
+  test/cpp/end2end/shutdown_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(shutdown_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(shutdown_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(status_test
+  test/cpp/util/status_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(status_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(status_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(streaming_throughput_test
+  test/cpp/end2end/streaming_throughput_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(streaming_throughput_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(streaming_throughput_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(stress_test
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/metrics.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/metrics.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/metrics.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/metrics.grpc.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.cc
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.h
+  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.h
+  test/cpp/interop/interop_client.cc
+  test/cpp/interop/stress_interop_client.cc
+  test/cpp/interop/stress_test.cc
+  test/cpp/util/metrics_server.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/empty.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/messages.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/metrics.proto
+)
+protobuf_generate_grpc_cpp(
+  src/proto/grpc/testing/test.proto
+)
+
+target_include_directories(stress_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(stress_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  grpc++_test_config
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(thread_manager_test
+  test/cpp/thread_manager/thread_manager_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(thread_manager_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(thread_manager_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++
+  grpc
+  gpr
+  grpc++_test_config
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(thread_stress_test
+  test/cpp/end2end/thread_stress_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(thread_stress_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(thread_stress_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(writes_per_rpc_test
+  test/cpp/performance/writes_per_rpc_test.cc
+  third_party/googletest/googletest/src/gtest-all.cc
+  third_party/googletest/googlemock/src/gmock-all.cc
+)
+
+
+target_include_directories(writes_per_rpc_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+  PRIVATE third_party/googletest/googletest/include
+  PRIVATE third_party/googletest/googletest
+  PRIVATE third_party/googletest/googlemock/include
+  PRIVATE third_party/googletest/googlemock
+  PRIVATE ${_gRPC_PROTO_GENS_DIR}
+)
+
+target_link_libraries(writes_per_rpc_test
+  ${_gRPC_PROTOBUF_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc++_test_util
+  grpc_test_util
+  grpc++
+  grpc
+  gpr_test_util
+  gpr
+  ${_gRPC_GFLAGS_LIBRARIES}
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(public_headers_must_be_c89
+  test/core/surface/public_headers_must_be_c89.c
+)
+
+
+target_include_directories(public_headers_must_be_c89
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(public_headers_must_be_c89
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(badreq_bad_client_test
+  test/core/bad_client/tests/badreq.c
+)
+
+
+target_include_directories(badreq_bad_client_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(badreq_bad_client_test
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  bad_client_test
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(connection_prefix_bad_client_test
+  test/core/bad_client/tests/connection_prefix.c
+)
+
+
+target_include_directories(connection_prefix_bad_client_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(connection_prefix_bad_client_test
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  bad_client_test
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(head_of_line_blocking_bad_client_test
+  test/core/bad_client/tests/head_of_line_blocking.c
+)
+
+
+target_include_directories(head_of_line_blocking_bad_client_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(head_of_line_blocking_bad_client_test
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  bad_client_test
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(headers_bad_client_test
+  test/core/bad_client/tests/headers.c
+)
+
+
+target_include_directories(headers_bad_client_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(headers_bad_client_test
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  bad_client_test
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(initial_settings_frame_bad_client_test
+  test/core/bad_client/tests/initial_settings_frame.c
+)
+
+
+target_include_directories(initial_settings_frame_bad_client_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(initial_settings_frame_bad_client_test
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  bad_client_test
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(large_metadata_bad_client_test
+  test/core/bad_client/tests/large_metadata.c
+)
+
+
+target_include_directories(large_metadata_bad_client_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(large_metadata_bad_client_test
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  bad_client_test
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(server_registered_method_bad_client_test
+  test/core/bad_client/tests/server_registered_method.c
+)
+
+
+target_include_directories(server_registered_method_bad_client_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(server_registered_method_bad_client_test
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  bad_client_test
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(simple_request_bad_client_test
+  test/core/bad_client/tests/simple_request.c
+)
+
+
+target_include_directories(simple_request_bad_client_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(simple_request_bad_client_test
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  bad_client_test
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(unknown_frame_bad_client_test
+  test/core/bad_client/tests/unknown_frame.c
+)
+
+
+target_include_directories(unknown_frame_bad_client_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(unknown_frame_bad_client_test
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  bad_client_test
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(window_overflow_bad_client_test
+  test/core/bad_client/tests/window_overflow.c
+)
+
+
+target_include_directories(window_overflow_bad_client_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(window_overflow_bad_client_test
+  ${_gRPC_SSL_LIBRARIES}
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  bad_client_test
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(bad_ssl_cert_server
+  test/core/bad_ssl/servers/cert.c
+)
+
+
+target_include_directories(bad_ssl_cert_server
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(bad_ssl_cert_server
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  bad_ssl_test_server
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(bad_ssl_cert_test
+  test/core/bad_ssl/bad_ssl_test.c
+)
+
+
+target_include_directories(bad_ssl_cert_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(bad_ssl_cert_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_census_test
+  test/core/end2end/fixtures/h2_census.c
+)
+
+
+target_include_directories(h2_census_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_census_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_compress_test
+  test/core/end2end/fixtures/h2_compress.c
+)
+
+
+target_include_directories(h2_compress_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_compress_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_fakesec_test
+  test/core/end2end/fixtures/h2_fakesec.c
+)
+
+
+target_include_directories(h2_fakesec_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_fakesec_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(h2_fd_test
+  test/core/end2end/fixtures/h2_fd.c
+)
+
+
+target_include_directories(h2_fd_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_fd_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_full_test
+  test/core/end2end/fixtures/h2_full.c
+)
+
+
+target_include_directories(h2_full_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_full_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX)
+
+add_executable(h2_full+pipe_test
+  test/core/end2end/fixtures/h2_full+pipe.c
+)
+
+
+target_include_directories(h2_full+pipe_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_full+pipe_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_full+trace_test
+  test/core/end2end/fixtures/h2_full+trace.c
+)
+
+
+target_include_directories(h2_full+trace_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_full+trace_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_full+workarounds_test
+  test/core/end2end/fixtures/h2_full+workarounds.c
+)
+
+
+target_include_directories(h2_full+workarounds_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_full+workarounds_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_http_proxy_test
+  test/core/end2end/fixtures/h2_http_proxy.c
+)
+
+
+target_include_directories(h2_http_proxy_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_http_proxy_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_load_reporting_test
+  test/core/end2end/fixtures/h2_load_reporting.c
+)
+
+
+target_include_directories(h2_load_reporting_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_load_reporting_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_oauth2_test
+  test/core/end2end/fixtures/h2_oauth2.c
+)
+
+
+target_include_directories(h2_oauth2_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_oauth2_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_proxy_test
+  test/core/end2end/fixtures/h2_proxy.c
+)
+
+
+target_include_directories(h2_proxy_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_proxy_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_sockpair_test
+  test/core/end2end/fixtures/h2_sockpair.c
+)
+
+
+target_include_directories(h2_sockpair_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_sockpair_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_sockpair+trace_test
+  test/core/end2end/fixtures/h2_sockpair+trace.c
+)
+
+
+target_include_directories(h2_sockpair+trace_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_sockpair+trace_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_sockpair_1byte_test
+  test/core/end2end/fixtures/h2_sockpair_1byte.c
+)
+
+
+target_include_directories(h2_sockpair_1byte_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_sockpair_1byte_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_ssl_test
+  test/core/end2end/fixtures/h2_ssl.c
+)
+
+
+target_include_directories(h2_ssl_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_ssl_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_ssl_cert_test
+  test/core/end2end/fixtures/h2_ssl_cert.c
+)
+
+
+target_include_directories(h2_ssl_cert_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_ssl_cert_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_ssl_proxy_test
+  test/core/end2end/fixtures/h2_ssl_proxy.c
+)
+
+
+target_include_directories(h2_ssl_proxy_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_ssl_proxy_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(h2_uds_test
+  test/core/end2end/fixtures/h2_uds.c
+)
+
+
+target_include_directories(h2_uds_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_uds_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(inproc_test
+  test/core/end2end/fixtures/inproc.c
+)
+
+
+target_include_directories(inproc_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(inproc_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_tests
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_census_nosec_test
+  test/core/end2end/fixtures/h2_census.c
+)
+
+
+target_include_directories(h2_census_nosec_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_census_nosec_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_nosec_tests
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_compress_nosec_test
+  test/core/end2end/fixtures/h2_compress.c
+)
+
+
+target_include_directories(h2_compress_nosec_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_compress_nosec_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_nosec_tests
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(h2_fd_nosec_test
+  test/core/end2end/fixtures/h2_fd.c
+)
+
+
+target_include_directories(h2_fd_nosec_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_fd_nosec_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_nosec_tests
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_full_nosec_test
+  test/core/end2end/fixtures/h2_full.c
+)
+
+
+target_include_directories(h2_full_nosec_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_full_nosec_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_nosec_tests
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX)
+
+add_executable(h2_full+pipe_nosec_test
+  test/core/end2end/fixtures/h2_full+pipe.c
+)
+
+
+target_include_directories(h2_full+pipe_nosec_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_full+pipe_nosec_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_nosec_tests
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_full+trace_nosec_test
+  test/core/end2end/fixtures/h2_full+trace.c
+)
+
+
+target_include_directories(h2_full+trace_nosec_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_full+trace_nosec_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_nosec_tests
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_full+workarounds_nosec_test
+  test/core/end2end/fixtures/h2_full+workarounds.c
+)
+
+
+target_include_directories(h2_full+workarounds_nosec_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_full+workarounds_nosec_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_nosec_tests
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_http_proxy_nosec_test
+  test/core/end2end/fixtures/h2_http_proxy.c
+)
+
+
+target_include_directories(h2_http_proxy_nosec_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_http_proxy_nosec_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_nosec_tests
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_load_reporting_nosec_test
+  test/core/end2end/fixtures/h2_load_reporting.c
+)
+
+
+target_include_directories(h2_load_reporting_nosec_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_load_reporting_nosec_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_nosec_tests
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_proxy_nosec_test
+  test/core/end2end/fixtures/h2_proxy.c
+)
+
+
+target_include_directories(h2_proxy_nosec_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_proxy_nosec_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_nosec_tests
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_sockpair_nosec_test
+  test/core/end2end/fixtures/h2_sockpair.c
+)
+
+
+target_include_directories(h2_sockpair_nosec_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_sockpair_nosec_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_nosec_tests
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_sockpair+trace_nosec_test
+  test/core/end2end/fixtures/h2_sockpair+trace.c
+)
+
+
+target_include_directories(h2_sockpair+trace_nosec_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_sockpair+trace_nosec_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_nosec_tests
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(h2_sockpair_1byte_nosec_test
+  test/core/end2end/fixtures/h2_sockpair_1byte.c
+)
+
+
+target_include_directories(h2_sockpair_1byte_nosec_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_sockpair_1byte_nosec_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_nosec_tests
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
+
+add_executable(h2_uds_nosec_test
+  test/core/end2end/fixtures/h2_uds.c
+)
+
+
+target_include_directories(h2_uds_nosec_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(h2_uds_nosec_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_nosec_tests
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif()
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(inproc_nosec_test
+  test/core/end2end/fixtures/inproc.c
+)
+
+
+target_include_directories(inproc_nosec_test
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(inproc_nosec_test
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  end2end_nosec_tests
+  grpc_test_util_unsecure
+  grpc_unsecure
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(api_fuzzer_one_entry
+  test/core/end2end/fuzzers/api_fuzzer.c
+  test/core/util/one_corpus_entry_fuzzer.c
+)
+
+
+target_include_directories(api_fuzzer_one_entry
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(api_fuzzer_one_entry
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(client_fuzzer_one_entry
+  test/core/end2end/fuzzers/client_fuzzer.c
+  test/core/util/one_corpus_entry_fuzzer.c
+)
+
+
+target_include_directories(client_fuzzer_one_entry
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(client_fuzzer_one_entry
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(hpack_parser_fuzzer_test_one_entry
+  test/core/transport/chttp2/hpack_parser_fuzzer_test.c
+  test/core/util/one_corpus_entry_fuzzer.c
+)
+
+
+target_include_directories(hpack_parser_fuzzer_test_one_entry
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(hpack_parser_fuzzer_test_one_entry
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(http_request_fuzzer_test_one_entry
+  test/core/http/request_fuzzer.c
+  test/core/util/one_corpus_entry_fuzzer.c
+)
+
+
+target_include_directories(http_request_fuzzer_test_one_entry
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(http_request_fuzzer_test_one_entry
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(http_response_fuzzer_test_one_entry
+  test/core/http/response_fuzzer.c
+  test/core/util/one_corpus_entry_fuzzer.c
+)
+
+
+target_include_directories(http_response_fuzzer_test_one_entry
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(http_response_fuzzer_test_one_entry
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(json_fuzzer_test_one_entry
+  test/core/json/fuzzer.c
+  test/core/util/one_corpus_entry_fuzzer.c
+)
+
+
+target_include_directories(json_fuzzer_test_one_entry
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(json_fuzzer_test_one_entry
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(nanopb_fuzzer_response_test_one_entry
+  test/core/nanopb/fuzzer_response.c
+  test/core/util/one_corpus_entry_fuzzer.c
+)
+
+
+target_include_directories(nanopb_fuzzer_response_test_one_entry
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(nanopb_fuzzer_response_test_one_entry
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(nanopb_fuzzer_serverlist_test_one_entry
+  test/core/nanopb/fuzzer_serverlist.c
+  test/core/util/one_corpus_entry_fuzzer.c
+)
+
+
+target_include_directories(nanopb_fuzzer_serverlist_test_one_entry
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(nanopb_fuzzer_serverlist_test_one_entry
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(percent_decode_fuzzer_one_entry
+  test/core/slice/percent_decode_fuzzer.c
+  test/core/util/one_corpus_entry_fuzzer.c
+)
+
+
+target_include_directories(percent_decode_fuzzer_one_entry
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(percent_decode_fuzzer_one_entry
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(percent_encode_fuzzer_one_entry
+  test/core/slice/percent_encode_fuzzer.c
+  test/core/util/one_corpus_entry_fuzzer.c
+)
+
+
+target_include_directories(percent_encode_fuzzer_one_entry
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(percent_encode_fuzzer_one_entry
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(server_fuzzer_one_entry
+  test/core/end2end/fuzzers/server_fuzzer.c
+  test/core/util/one_corpus_entry_fuzzer.c
+)
+
+
+target_include_directories(server_fuzzer_one_entry
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(server_fuzzer_one_entry
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(ssl_server_fuzzer_one_entry
+  test/core/security/ssl_server_fuzzer.c
+  test/core/util/one_corpus_entry_fuzzer.c
+)
+
+
+target_include_directories(ssl_server_fuzzer_one_entry
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(ssl_server_fuzzer_one_entry
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+if (gRPC_BUILD_TESTS)
+
+add_executable(uri_fuzzer_test_one_entry
+  test/core/client_channel/uri_fuzzer_test.c
+  test/core/util/one_corpus_entry_fuzzer.c
+)
+
+
+target_include_directories(uri_fuzzer_test_one_entry
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
+  PRIVATE ${BORINGSSL_ROOT_DIR}/include
+  PRIVATE ${PROTOBUF_ROOT_DIR}/src
+  PRIVATE ${BENCHMARK_ROOT_DIR}/include
+  PRIVATE ${ZLIB_ROOT_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
+  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
+  PRIVATE ${CARES_INCLUDE_DIR}
+  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
+)
+
+target_link_libraries(uri_fuzzer_test_one_entry
+  ${_gRPC_ALLTARGETS_LIBRARIES}
+  grpc_test_util
+  grpc
+  gpr_test_util
+  gpr
+)
+
+endif (gRPC_BUILD_TESTS)
+
+
+
+
+
+
+
+if (gRPC_INSTALL)
+  install(EXPORT gRPCTargets
+    DESTINATION ${gRPC_INSTALL_CMAKEDIR}
+    NAMESPACE gRPC::
+  )
+endif()
+
+foreach(_config gRPCConfig gRPCConfigVersion)
+  configure_file(tools/cmake/${_config}.cmake.in
+    ${_config}.cmake @ONLY)
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${_config}.cmake
+    DESTINATION ${gRPC_INSTALL_CMAKEDIR}
+  )
+endforeach()
diff --git a/tensorflow/tensorboard/components/vz_projector/test/assert.ts b/tensorflow/contrib/cmake/patches/grpc/rand.h
similarity index 80%
rename from tensorflow/tensorboard/components/vz_projector/test/assert.ts
rename to tensorflow/contrib/cmake/patches/grpc/rand.h
index f489517a7f23f36ecb91875638e464e3c7312926..194cb683fc53df1d13f5206fc9507c6d3f88d4a5 100644
--- a/tensorflow/tensorboard/components/vz_projector/test/assert.ts
+++ b/tensorflow/contrib/cmake/patches/grpc/rand.h
@@ -1,10 +1,10 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
-http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -12,5 +12,3 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
-const assert = chai.assert;
diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake
index 069cdfa35261fdfa909b4a4f5761be9e1d9d185f..9ea7600a06f71d2e6f31895ecdfdad1a7d6ab15d 100644
--- a/tensorflow/contrib/cmake/tf_c.cmake
+++ b/tensorflow/contrib/cmake/tf_c.cmake
@@ -26,3 +26,9 @@ set(tf_c_srcs
 
 add_library(tf_c OBJECT ${tf_c_srcs})
 add_dependencies(tf_c tf_cc_framework tf_core_lib tf_protos_cc)
+
+add_library(tf_c_python_api OBJECT
+  "${tensorflow_source_dir}/tensorflow/c/python_api.cc"
+  "${tensorflow_source_dir}/tensorflow/c/python_api.h"
+)
+add_dependencies(tf_c_python_api tf_c tf_cc_framework tf_core_lib tf_protos_cc)
diff --git a/tensorflow/contrib/cmake/tf_core_cpu.cmake b/tensorflow/contrib/cmake/tf_core_cpu.cmake
index 1c80ffcd7b129f6b9e934c205c1d73273f1eb9b8..c76f124892c2fec4a0757ce453a779aa2616d1f7 100644
--- a/tensorflow/contrib/cmake/tf_core_cpu.cmake
+++ b/tensorflow/contrib/cmake/tf_core_cpu.cmake
@@ -46,6 +46,10 @@ file(GLOB_RECURSE tf_core_cpu_exclude_srcs
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_factory.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_options.cc"
+    "${tensorflow_source_dir}/tensorflow/core/graph/edgeset.h"
+    "${tensorflow_source_dir}/tensorflow/core/graph/edgeset.cc"
+    "${tensorflow_source_dir}/tensorflow/core/graph/graph.h"
+    "${tensorflow_source_dir}/tensorflow/core/graph/graph.cc"
     "${tensorflow_source_dir}/tensorflow/core/grappler/clusters/single_machine.h"
     "${tensorflow_source_dir}/tensorflow/core/grappler/clusters/single_machine.cc"
     "${tensorflow_source_dir}/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
@@ -53,16 +57,11 @@ file(GLOB_RECURSE tf_core_cpu_exclude_srcs
 )
 list(REMOVE_ITEM tf_core_cpu_srcs ${tf_core_cpu_exclude_srcs})
 
-# We need to include stubs for the GPU tracer, which are in the exclude glob.
-list(APPEND tf_core_cpu_srcs
-     "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_tracer.cc"
-     "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_tracer.h"
-)
-
 if (tensorflow_ENABLE_GPU)
   file(GLOB_RECURSE tf_core_gpu_srcs
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/*.cc"
     "${tensorflow_source_dir}/tensorflow/core/platform/default/gpu/cupti_wrapper.cc"
+    "${tensorflow_source_dir}/tensorflow/core/platform/default/gpu_tracer.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu_device_factory.cc"
     "${tensorflow_source_dir}/tensorflow/core/grappler/devices.h"
     "${tensorflow_source_dir}/tensorflow/core/grappler/devices.cc"
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index a048194a1973188dfe3bba88b2dd8b65a7a55b55..d2a817390af9857345b105f10a17fcc368de3f6d 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -147,13 +147,24 @@ file(GLOB tf_core_platform_srcs
     "${tensorflow_source_dir}/tensorflow/core/platform/*.h"
     "${tensorflow_source_dir}/tensorflow/core/platform/*.cc"
     "${tensorflow_source_dir}/tensorflow/core/platform/default/*.h"
-    "${tensorflow_source_dir}/tensorflow/core/platform/default/*.cc")
+    "${tensorflow_source_dir}/tensorflow/core/platform/default/*.cc"
+    "${tensorflow_source_dir}/tensorflow/core/framework/resource_handle.h"
+    "${tensorflow_source_dir}/tensorflow/core/framework/resource_handle.cc")
 if (NOT tensorflow_ENABLE_GPU)
   file(GLOB tf_core_platform_gpu_srcs
       "${tensorflow_source_dir}/tensorflow/core/platform/cuda_libdevice_path.*"
       "${tensorflow_source_dir}/tensorflow/core/platform/default/cuda_libdevice_path.*")
   list(REMOVE_ITEM tf_core_platform_srcs ${tf_core_platform_gpu_srcs})
+else()
+  file(GLOB tf_core_platform_srcs_exclude
+      "${tensorflow_source_dir}/tensorflow/core/platform/default/gpu_tracer.cc")
+  list(REMOVE_ITEM tf_core_platform_srcs ${tf_core_platform_srcs_exclude})
 endif()
+
+file(GLOB tf_core_platform_exclude_srcs
+  "${tensorflow_source_dir}/tensorflow/core/platform/variant_coding.cc")
+list(REMOVE_ITEM tf_core_platform_srcs ${tf_core_platform_exclude_srcs})
+
 list(APPEND tf_core_lib_srcs ${tf_core_platform_srcs})
 
 if(UNIX)
@@ -223,6 +234,12 @@ set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.c
 file(GLOB_RECURSE tf_core_framework_srcs
     "${tensorflow_source_dir}/tensorflow/core/framework/*.h"
     "${tensorflow_source_dir}/tensorflow/core/framework/*.cc"
+    "${tensorflow_source_dir}/tensorflow/core/platform/variant_coding.h"
+    "${tensorflow_source_dir}/tensorflow/core/platform/variant_coding.cc"
+    "${tensorflow_source_dir}/tensorflow/core/graph/edgeset.h"
+    "${tensorflow_source_dir}/tensorflow/core/graph/edgeset.cc"
+    "${tensorflow_source_dir}/tensorflow/core/graph/graph.h"
+    "${tensorflow_source_dir}/tensorflow/core/graph/graph.cc"
     "${tensorflow_source_dir}/tensorflow/core/util/*.h"
     "${tensorflow_source_dir}/tensorflow/core/util/*.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session.cc"
@@ -231,18 +248,19 @@ file(GLOB_RECURSE tf_core_framework_srcs
     "${tensorflow_source_dir}/public/*.h"
 )
 
-file(GLOB_RECURSE tf_core_framework_test_srcs
+file(GLOB_RECURSE tf_core_framework_exclude_srcs
     "${tensorflow_source_dir}/tensorflow/core/framework/*test*.h"
     "${tensorflow_source_dir}/tensorflow/core/framework/*test*.cc"
     "${tensorflow_source_dir}/tensorflow/core/framework/*testutil.h"
     "${tensorflow_source_dir}/tensorflow/core/framework/*testutil.cc"
     "${tensorflow_source_dir}/tensorflow/core/framework/*main.cc"
+    "${tensorflow_source_dir}/tensorflow/core/framework/resource_handle.cc"
     "${tensorflow_source_dir}/tensorflow/core/util/*test*.h"
     "${tensorflow_source_dir}/tensorflow/core/util/*test*.cc"
     "${tensorflow_source_dir}/tensorflow/core/util/*main.cc"
 )
 
-list(REMOVE_ITEM tf_core_framework_srcs ${tf_core_framework_test_srcs})
+list(REMOVE_ITEM tf_core_framework_srcs ${tf_core_framework_exclude_srcs})
 
 add_library(tf_core_framework OBJECT
     ${tf_core_framework_srcs}
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index 500b917ac996e190bcf41e2b5455755ee02423fd..95e884840b818fe0c2642764acca6d2d181cf2e1 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -40,6 +40,33 @@ endif(tensorflow_BUILD_ALL_KERNELS)
 
 if(tensorflow_BUILD_CONTRIB_KERNELS)
   set(tf_contrib_kernels_srcs
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/kernels/ensemble_optimizer_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/kernels/model_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/kernels/training_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/utils/batch_features.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/utils/parallel_for.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/utils/tensor_utils.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/ensemble_optimizer_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/model_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/prediction_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/training_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/clustering_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/wals_solver_ops.cc"
@@ -55,6 +82,8 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_manager.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/resampler/kernels/resampler_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/resampler/ops/resampler_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.cc"
@@ -63,16 +92,9 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/ops/beam_search_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/ops/tensor_forest_ops.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/kernels/best_splits_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/kernels/count_extremely_random_stats_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/kernels/finished_nodes_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/kernels/grow_tree_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/kernels/sample_inputs_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/kernels/tree_predictions_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/kernels/update_fertile_slots_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc"
@@ -84,6 +106,12 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/text/kernels/skip_gram_kernels.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/text/ops/skip_gram_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/tpu/ops/cross_replica_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/tpu/ops/infeed_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/tpu/ops/outfeed_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/tpu/ops/replication_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/tpu/ops/tpu_sendrecv_ops.cc"
     )
   list(APPEND tf_core_kernels_srcs ${tf_contrib_kernels_srcs})
 endif(tensorflow_BUILD_CONTRIB_KERNELS)
@@ -102,10 +130,11 @@ file(GLOB_RECURSE tf_core_kernels_exclude_srcs
    "${tensorflow_source_dir}/tensorflow/core/kernels/*test*.cc"
    "${tensorflow_source_dir}/tensorflow/core/kernels/*testutil.h"
    "${tensorflow_source_dir}/tensorflow/core/kernels/*testutil.cc"
+   "${tensorflow_source_dir}/tensorflow/core/kernels/*test_utils.h"
+   "${tensorflow_source_dir}/tensorflow/core/kernels/*test_utils.cc"
    "${tensorflow_source_dir}/tensorflow/core/kernels/*main.cc"
    "${tensorflow_source_dir}/tensorflow/core/kernels/*.cu.cc"
    "${tensorflow_source_dir}/tensorflow/core/kernels/hexagon/*"
-   "${tensorflow_source_dir}/tensorflow/core/kernels/remote_fused_graph_execute*.cc"
    "${tensorflow_source_dir}/tensorflow/core/kernels/remote_fused_graph_rewriter_transform*.cc"
 )
 list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_exclude_srcs})
@@ -117,12 +146,15 @@ if(WIN32)
       "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h"
       "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc"
       "${tensorflow_source_dir}/tensorflow/core/kernels/neon/*"
-      # no in tensorflow.dll - comes from .so
+      # not in core - those are loaded dynamically as dll
+      "${tensorflow_source_dir}/tensorflow/contrib/resampler/kernels/resampler_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/lstm_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/ops/beam_search_ops.cc"
       # temporarily disable nccl (nccl itself needs to be ported to windows first)
       "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_manager.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_ops.cc"
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index 3c2f89c6c824d2a5098bf6f8ceaadd26fd47d61f..e04972b0245e1e4c203c6c3047298e82c36d2224 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -14,6 +14,7 @@
 # ==============================================================================
 set(tf_op_lib_names
     "array_ops"
+    "bitwise_ops"
     "candidate_sampling_ops"
     "control_flow_ops"
     "ctc_ops"
@@ -66,6 +67,17 @@ file(GLOB_RECURSE tensor_forest_hybrid_srcs
      "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/hybrid/core/ops/*.cc"
 )
 
+file(GLOB_RECURSE tpu_ops_srcs
+     "${tensorflow_source_dir}/tensorflow/contrib/tpu/ops/*.cc"
+)
+
+GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_model "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/model_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_split_handler "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_training "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/training_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_prediction "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/prediction_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_quantiles "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_ensemble_optimzier "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/ensemble_optimizer_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_stats_accumulator "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(cudnn_rnn "${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(factorization_clustering "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/clustering_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(factorization_factorization "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/factorization_ops.cc")
@@ -76,12 +88,16 @@ GENERATE_CONTRIB_OP_LIBRARY(image "${tensorflow_source_dir}/tensorflow/contrib/i
 GENERATE_CONTRIB_OP_LIBRARY(layers_sparse_feature_cross "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc")
 GENERATE_CONTRIB_OP_LIBRARY(memory_stats "${tensorflow_source_dir}/tensorflow/contrib/memory_stats/ops/memory_stats_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(nccl "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(resampler "${tensorflow_source_dir}/tensorflow/contrib/resampler/ops/resampler_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(rnn_gru "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(rnn_lstm "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/lstm_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(seq2seq_beam_search "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/ops/beam_search_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(tensor_forest "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/ops/tensor_forest_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(tensor_forest_hybrid "${tensor_forest_hybrid_srcs}")
+GENERATE_CONTRIB_OP_LIBRARY(tensor_forest_model "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/ops/model_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(tensor_forest_stats "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/ops/stats_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(text_skip_gram "${tensorflow_source_dir}/tensorflow/contrib/text/ops/skip_gram_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(tpu "${tpu_ops_srcs}")
 GENERATE_CONTRIB_OP_LIBRARY(bigquery_reader "${tensorflow_source_dir}/tensorflow/contrib/cloud/ops/bigquery_reader_ops.cc")
 
 ########################################################
@@ -120,3 +136,15 @@ list(REMOVE_ITEM tf_core_ops_srcs ${tf_core_ops_exclude_srcs})
 add_library(tf_core_ops OBJECT ${tf_core_ops_srcs})
 
 add_dependencies(tf_core_ops tf_core_cpu)
+
+########################################################
+# tf_debug_ops library
+########################################################
+
+file(GLOB tf_debug_ops_srcs
+    "${tensorflow_source_dir}/tensorflow/core/ops/debug_ops.cc"
+)
+
+add_library(tf_debug_ops OBJECT ${tf_debug_ops_srcs})
+
+add_dependencies(tf_debug_ops tf_core_framework)
diff --git a/tensorflow/contrib/cmake/tf_core_profiler.cmake b/tensorflow/contrib/cmake/tf_core_profiler.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..61ed6a1e145299125d037b48b8b644cae1ce96e7
--- /dev/null
+++ b/tensorflow/contrib/cmake/tf_core_profiler.cmake
@@ -0,0 +1,36 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+########################################################
+# tf_core_profiler library
+########################################################
+file(GLOB_RECURSE tf_core_profiler_srcs
+    "${tensorflow_source_dir}/tensorflow/core/profiler/*.proto"
+    "${tensorflow_source_dir}/tensorflow/core/profiler/internal/*.h"
+    "${tensorflow_source_dir}/tensorflow/core/profiler/internal/*.cc"
+    "${tensorflow_source_dir}/tensorflow/core/profiler/internal/advisor/*.h"
+    "${tensorflow_source_dir}/tensorflow/core/profiler/internal/advisor/*.cc"
+    "${tensorflow_source_dir}/tensorflow/core/platform/regexp.h"
+)
+
+file(GLOB_RECURSE tf_core_profiler_exclude_srcs
+    "${tensorflow_source_dir}/tensorflow/core/profiler/internal/*test.cc"
+    "${tensorflow_source_dir}/tensorflow/core/profiler/internal/advisor/*test.cc"
+    "${tensorflow_source_dir}/tensorflow/core/profiler/internal/print_model_analysis.cc"
+    "${tensorflow_source_dir}/tensorflow/core/profiler/internal/print_model_analysis.h"
+)
+list(REMOVE_ITEM tf_core_profiler_srcs ${tf_core_profiler_exclude_srcs})
+
+add_library(tf_core_profiler OBJECT ${tf_core_profiler_srcs})
+add_dependencies(tf_core_profiler tf_core_lib)
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/tf_grappler.cmake b/tensorflow/contrib/cmake/tf_grappler.cmake
index 4811c8cce9cecbd47d1b3f883f540e854dad5e35..a7841c98e83ec8c3eb91edfd9d639e169cb5f440 100644
--- a/tensorflow/contrib/cmake/tf_grappler.cmake
+++ b/tensorflow/contrib/cmake/tf_grappler.cmake
@@ -20,6 +20,8 @@ file(GLOB tf_grappler_srcs
    "${tensorflow_source_dir}/tensorflow/core/grappler/clusters/single_machine.h"
    "${tensorflow_source_dir}/tensorflow/python/grappler/cost_analyzer.cc"
    "${tensorflow_source_dir}/tensorflow/python/grappler/cost_analyzer.h"
+   "${tensorflow_source_dir}/tensorflow/python/grappler/model_analyzer.cc"
+   "${tensorflow_source_dir}/tensorflow/python/grappler/model_analyzer.h"
  )
  
 add_library(tf_grappler OBJECT ${tf_grappler_srcs})
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index a969bb03eec1ae3c5d9bfad73b4bc58b0047a5a1..f32e7adaecf92d89722dd86a77681685a9be89e1 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -122,10 +122,14 @@ endfunction()
 
 file(GLOB_RECURSE tf_protos_python_srcs RELATIVE ${tensorflow_source_dir}
     "${tensorflow_source_dir}/tensorflow/core/*.proto"
+    "${tensorflow_source_dir}/tensorflow/core/profiler/*.proto"
     "${tensorflow_source_dir}/tensorflow/python/*.proto"
+    "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/proto/*.proto"
+    "${tensorflow_source_dir}/tensorflow/contrib/decision_trees/proto/*.proto"
     "${tensorflow_source_dir}/tensorflow/contrib/session_bundle/*.proto"
-    "${tensorflow_source_dir}/tensorflow/tensorboard/*.proto"
+    "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/proto/*.proto"
     "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/*.proto"
+    "${tensorflow_source_dir}/tensorflow/contrib/tpu/profiler/*.proto"
     "${tensorflow_source_dir}/tensorflow/contrib/training/*.proto"
 )
 RELATIVE_PROTOBUF_GENERATE_PYTHON(
@@ -136,9 +140,10 @@ RELATIVE_PROTOBUF_GENERATE_PYTHON(
 # can cause benign-but-failing-on-Windows-due-to-file-locking conflicts
 # when two rules attempt to generate the same file.
 file(GLOB_RECURSE tf_python_protos_cc_srcs RELATIVE ${tensorflow_source_dir}
+    "${tensorflow_source_dir}/tensorflow/core/profiler/*.proto"
     "${tensorflow_source_dir}/tensorflow/python/*.proto"
+    "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/proto/*.proto"
     "${tensorflow_source_dir}/tensorflow/contrib/session_bundle/*.proto"
-    "${tensorflow_source_dir}/tensorflow/tensorboard/*.proto"
     "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/*.proto"
     "${tensorflow_source_dir}/tensorflow/contrib/training/*.proto"
 )
@@ -206,6 +211,7 @@ add_python_module("tensorflow/python/debug/examples")
 add_python_module("tensorflow/python/debug/lib")
 add_python_module("tensorflow/python/debug/wrappers")
 add_python_module("tensorflow/python/estimator")
+add_python_module("tensorflow/python/estimator/canned")
 add_python_module("tensorflow/python/estimator/export")
 add_python_module("tensorflow/python/estimator/inputs")
 add_python_module("tensorflow/python/estimator/inputs/queues")
@@ -224,6 +230,8 @@ add_python_module("tensorflow/python/ops/losses")
 add_python_module("tensorflow/python/platform")
 add_python_module("tensorflow/python/platform/default")
 add_python_module("tensorflow/python/platform/summary")
+add_python_module("tensorflow/python/profiler/")
+add_python_module("tensorflow/python/profiler/internal")
 add_python_module("tensorflow/python/saved_model")
 add_python_module("tensorflow/python/summary")
 add_python_module("tensorflow/python/summary/writer")
@@ -232,19 +240,6 @@ add_python_module("tensorflow/python/training")
 add_python_module("tensorflow/python/user_ops")
 add_python_module("tensorflow/python/util")
 add_python_module("tensorflow/python/util/protobuf")
-add_python_module("tensorflow/tensorboard")
-add_python_module("tensorflow/tensorboard/backend")
-add_python_module("tensorflow/tensorboard/backend/event_processing")
-add_python_module("tensorflow/tensorboard/plugins")
-add_python_module("tensorflow/tensorboard/plugins/audio")
-add_python_module("tensorflow/tensorboard/plugins/distributions")
-add_python_module("tensorflow/tensorboard/plugins/graphs")
-add_python_module("tensorflow/tensorboard/plugins/histograms")
-add_python_module("tensorflow/tensorboard/plugins/images")
-add_python_module("tensorflow/tensorboard/plugins/projector")
-add_python_module("tensorflow/tensorboard/plugins/scalars")
-add_python_module("tensorflow/tensorboard/plugins/text")
-add_python_module("tensorflow/tensorboard/scripts")
 add_python_module("tensorflow/contrib")
 add_python_module("tensorflow/contrib/android")
 add_python_module("tensorflow/contrib/android/java")
@@ -259,11 +254,21 @@ add_python_module("tensorflow/contrib/bayesflow/examples/reinforce_simple")
 add_python_module("tensorflow/contrib/bayesflow/python")
 add_python_module("tensorflow/contrib/bayesflow/python/kernel_tests")
 add_python_module("tensorflow/contrib/bayesflow/python/ops")
+add_python_module("tensorflow/contrib/boosted_trees")
+add_python_module("tensorflow/contrib/boosted_trees/estimator_batch")
+add_python_module("tensorflow/contrib/boosted_trees/ops")
+add_python_module("tensorflow/contrib/boosted_trees/proto")
+add_python_module("tensorflow/contrib/boosted_trees/python")
+add_python_module("tensorflow/contrib/boosted_trees/python/kernel_tests")
+add_python_module("tensorflow/contrib/boosted_trees/python/ops")
 add_python_module("tensorflow/contrib/cloud")
 add_python_module("tensorflow/contrib/cloud/kernels")
 add_python_module("tensorflow/contrib/cloud/ops")
 add_python_module("tensorflow/contrib/cloud/python")
 add_python_module("tensorflow/contrib/cloud/python/ops")
+add_python_module("tensorflow/contrib/cluster_resolver")
+add_python_module("tensorflow/contrib/cluster_resolver/python")
+add_python_module("tensorflow/contrib/cluster_resolver/python/training")
 add_python_module("tensorflow/contrib/compiler")
 add_python_module("tensorflow/contrib/copy_graph")
 add_python_module("tensorflow/contrib/copy_graph/python")
@@ -284,6 +289,8 @@ add_python_module("tensorflow/contrib/data/python/framework")
 add_python_module("tensorflow/contrib/data/python/kernel_tests")
 add_python_module("tensorflow/contrib/data/python/ops")
 add_python_module("tensorflow/contrib/data/python/util")
+add_python_module("tensorflow/contrib/decision_trees")
+add_python_module("tensorflow/contrib/decision_trees/proto")
 add_python_module("tensorflow/contrib/deprecated")
 add_python_module("tensorflow/contrib/distributions")
 add_python_module("tensorflow/contrib/distributions/python")
@@ -341,6 +348,7 @@ add_python_module("tensorflow/contrib/keras/api/keras")
 add_python_module("tensorflow/contrib/keras/api/keras/activations")
 add_python_module("tensorflow/contrib/keras/api/keras/applications")
 add_python_module("tensorflow/contrib/keras/api/keras/applications/inception_v3")
+add_python_module("tensorflow/contrib/keras/api/keras/applications/mobilenet")
 add_python_module("tensorflow/contrib/keras/api/keras/applications/resnet50")
 add_python_module("tensorflow/contrib/keras/api/keras/applications/vgg16")
 add_python_module("tensorflow/contrib/keras/api/keras/applications/vgg19")
@@ -456,8 +464,17 @@ add_python_module("tensorflow/contrib/pi_examples")
 add_python_module("tensorflow/contrib/pi_examples/camera")
 add_python_module("tensorflow/contrib/pi_examples/label_image")
 add_python_module("tensorflow/contrib/pi_examples/label_image/data")
+add_python_module("tensorflow/contrib/predictor")
 add_python_module("tensorflow/contrib/quantization")
 add_python_module("tensorflow/contrib/quantization/python")
+add_python_module("tensorflow/contrib/remote_fused_graph/pylib")
+add_python_module("tensorflow/contrib/remote_fused_graph/pylib/python")
+add_python_module("tensorflow/contrib/remote_fused_graph/pylib/python/ops")
+add_python_module("tensorflow/contrib/resampler")
+add_python_module("tensorflow/contrib/resampler/kernels")
+add_python_module("tensorflow/contrib/resampler/ops")
+add_python_module("tensorflow/contrib/resampler/python")
+add_python_module("tensorflow/contrib/resampler/python/ops")
 add_python_module("tensorflow/contrib/rnn")
 add_python_module("tensorflow/contrib/rnn/kernels")
 add_python_module("tensorflow/contrib/rnn/ops")
@@ -523,10 +540,20 @@ add_python_module("tensorflow/contrib/text/kernels")
 add_python_module("tensorflow/contrib/text/ops")
 add_python_module("tensorflow/contrib/text/python")
 add_python_module("tensorflow/contrib/text/python/ops")
-add_python_module("tensorflow/contrib/tfprof" DONTCOPY)  # SWIG wrapper not implemented.
-#add_python_module("tensorflow/contrib/tfprof/python")
-#add_python_module("tensorflow/contrib/tfprof/python/tools")
-#add_python_module("tensorflow/contrib/tfprof/python/tools/tfprof")
+add_python_module("tensorflow/contrib/tfprof")
+add_python_module("tensorflow/contrib/timeseries")
+add_python_module("tensorflow/contrib/timeseries/examples")
+add_python_module("tensorflow/contrib/timeseries/examples/data")
+add_python_module("tensorflow/contrib/timeseries/python")
+add_python_module("tensorflow/contrib/timeseries/python/timeseries")
+add_python_module("tensorflow/contrib/timeseries/python/timeseries/state_space_models")
+add_python_module("tensorflow/contrib/tpu")
+add_python_module("tensorflow/contrib/tpu/ops")
+add_python_module("tensorflow/contrib/tpu/profiler")
+add_python_module("tensorflow/contrib/tpu/python")
+add_python_module("tensorflow/contrib/tpu/python/ops")
+add_python_module("tensorflow/contrib/tpu/python/profiler")
+add_python_module("tensorflow/contrib/tpu/python/tpu")
 add_python_module("tensorflow/contrib/training")
 add_python_module("tensorflow/contrib/training/python")
 add_python_module("tensorflow/contrib/training/python/training")
@@ -603,6 +630,7 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
 endfunction()
 
 GENERATE_PYTHON_OP_LIB("array_ops")
+GENERATE_PYTHON_OP_LIB("bitwise_ops")
 GENERATE_PYTHON_OP_LIB("math_ops")
 GENERATE_PYTHON_OP_LIB("functional_ops")
 GENERATE_PYTHON_OP_LIB("candidate_sampling_ops")
@@ -619,6 +647,8 @@ GENERATE_PYTHON_OP_LIB("lookup_ops")
 GENERATE_PYTHON_OP_LIB("nn_ops")
 GENERATE_PYTHON_OP_LIB("parsing_ops")
 GENERATE_PYTHON_OP_LIB("random_ops")
+GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/remote_fused_graph/pylib/python/ops/gen_remote_fused_graph_ops.py)
 GENERATE_PYTHON_OP_LIB("resource_variable_ops")
 GENERATE_PYTHON_OP_LIB("script_ops")
 GENERATE_PYTHON_OP_LIB("sdca_ops")
@@ -631,6 +661,20 @@ GENERATE_PYTHON_OP_LIB("user_ops")
 GENERATE_PYTHON_OP_LIB("training_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/training/gen_training_ops.py)
 
+GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_model_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_model_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_split_handler_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_split_handler_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_training_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_training_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_prediction_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_prediction_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_quantiles_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_quantile_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_ensemble_optimzier_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_ensemble_optimizer_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_stats_accumulator_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_stats_accumulator_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_cudnn_rnn_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/cudnn_rnn/ops/gen_cudnn_rnn_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_factorization_clustering_ops"
@@ -651,6 +695,8 @@ GENERATE_PYTHON_OP_LIB("contrib_memory_stats_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/memory_stats/ops/gen_memory_stats_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_nccl_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/nccl/ops/gen_nccl_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_resampler_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/resampler/ops/gen_resampler_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_rnn_gru_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rnn/ops/gen_gru_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_rnn_lstm_ops"
@@ -661,12 +707,18 @@ GENERATE_PYTHON_OP_LIB("contrib_tensor_forest_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/tensor_forest/python/ops/gen_tensor_forest_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_tensor_forest_hybrid_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/tensor_forest/hybrid/ops/gen_training_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_tensor_forest_model_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/tensor_forest/python/ops/gen_model_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_tensor_forest_stats_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/tensor_forest/python/ops/gen_stats_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_text_skip_gram_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/text/python/ops/gen_skip_gram_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_bigquery_reader_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/cloud/python/ops/gen_bigquery_reader_ops.py)
 GENERATE_PYTHON_OP_LIB("stateless_random_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/stateless/gen_stateless_random_ops.py)
+GENERATE_PYTHON_OP_LIB("debug_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/debug/ops/gen_debug_ops.py)
 
 add_custom_target(tf_python_ops SOURCES ${tf_python_ops_generated_files} ${PYTHON_PROTO_GENFILES})
 add_dependencies(tf_python_ops tf_python_op_gen_main)
@@ -698,6 +750,8 @@ add_custom_command(
       VERBATIM )
 
 set (pywrap_tensorflow_internal_src
+    "${tensorflow_source_dir}/tensorflow/core/profiler/internal/print_model_analysis.h"
+    "${tensorflow_source_dir}/tensorflow/core/profiler/internal/print_model_analysis.cc"
     "${tensorflow_source_dir}/tensorflow/python/client/tf_session_helper.h"
     "${tensorflow_source_dir}/tensorflow/python/client/tf_session_helper.cc"
     "${tensorflow_source_dir}/tensorflow/python/framework/cpp_shape_inference.h"
@@ -734,9 +788,11 @@ if(WIN32)
     add_library(pywrap_tensorflow_internal_static STATIC
         ${pywrap_tensorflow_internal_src}
         $<TARGET_OBJECTS:tf_c>
+        $<TARGET_OBJECTS:tf_c_python_api>
         $<TARGET_OBJECTS:tf_core_lib>
         $<TARGET_OBJECTS:tf_core_cpu>
         $<TARGET_OBJECTS:tf_core_framework>
+        $<TARGET_OBJECTS:tf_core_profiler>
         $<TARGET_OBJECTS:tf_cc>
         $<TARGET_OBJECTS:tf_cc_ops>
         $<TARGET_OBJECTS:tf_core_ops>
@@ -781,9 +837,11 @@ endif(WIN32)
 add_library(pywrap_tensorflow_internal SHARED
     ${pywrap_tensorflow_internal_src}
     $<TARGET_OBJECTS:tf_c>
+    $<TARGET_OBJECTS:tf_c_python_api>
     $<TARGET_OBJECTS:tf_core_lib>
     $<TARGET_OBJECTS:tf_core_cpu>
     $<TARGET_OBJECTS:tf_core_framework>
+    $<TARGET_OBJECTS:tf_core_profiler>
     $<TARGET_OBJECTS:tf_cc>
     $<TARGET_OBJECTS:tf_cc_ops>
     $<TARGET_OBJECTS:tf_core_ops>
@@ -911,19 +969,6 @@ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/MANIFEST.in
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
 
-# Copy resources for TensorBoard.
-file(DOWNLOAD http://mirror.bazel.build/tensorboard/index.html ${DOWNLOAD_LOCATION}/tensorboard/index.html
-  EXPECTED_HASH SHA256=25554e708552ad8587152f7a444db3f4ca753f9ed72d9f8105203c1d1806d521)
-add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
-  COMMAND ${CMAKE_COMMAND} -E make_directory
-  ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/components/)
-add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-  COMMAND ${CMAKE_COMMAND} -E copy ${DOWNLOAD_LOCATION}/tensorboard/index.html
-  ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/components/)
-add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-  COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tensorboard/TAG
-  ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/)
-
 # Copy datasets for tf.contrib.learn.
 add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/learn/python/learn/datasets/data/boston_house_prices.csv
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index 55e9e311f92124d27df6282d571dc077cc20b909..c7d2ac7b56b56158eaf5f9c2aedec89ee7551120 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -141,12 +141,14 @@ if (tensorflow_BUILD_PYTHON_TESTS)
     "${tensorflow_source_dir}/tensorflow/python/debug/lib/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/debug/wrappers/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/kernel_tests/*.py"
+    "${tensorflow_source_dir}/tensorflow/python/profiler/*_test.py"
+    "${tensorflow_source_dir}/tensorflow/python/profiler/internal/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/saved_model/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/training/*_test.py"
-    "${tensorflow_source_dir}/tensorflow/tensorboard/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/data/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/factorization/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/keras/python/keras/integration_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/python/kernel_tests/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/stateless/python/kernel_tests/*_test.py"
     # NOTE: tensor_forest tests in tensor_forest/hybrid/... still don't pass.
     "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/client/*_test.py"
@@ -160,13 +162,26 @@ if (tensorflow_BUILD_PYTHON_TESTS)
     "${tensorflow_source_dir}/tensorflow/python/debug/cli/profile_analyzer_cli_test.py"
     # Windows does not have the curses library and uses readline.
     "${tensorflow_source_dir}/tensorflow/python/debug/cli/curses_ui_test.py"
+    # TFDBG grpc:// mode is not yet available on Windows.
+    "${tensorflow_source_dir}/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py"
+    "${tensorflow_source_dir}/tensorflow/python/debug/lib/session_debug_grpc_test.py"
     # generally not working
     "${tensorflow_source_dir}/tensorflow/python/kernel_tests/__init__.py"
     "${tensorflow_source_dir}/tensorflow/python/kernel_tests/benchmark_test.py"
     "${tensorflow_source_dir}/tensorflow/python/kernel_tests/resource_variable_ops_test.py"
+    "${tensorflow_source_dir}/tensorflow/python/profiler/pprof_profiler_test.py"
+    # flaky test
+    "${tensorflow_source_dir}/tensorflow/python/profiler/internal/run_metadata_test.py"
     "${tensorflow_source_dir}/tensorflow/python/saved_model/saved_model_test.py"
     # requires scipy
     "${tensorflow_source_dir}/tensorflow/contrib/keras/python/keras/preprocessing/*_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler_test.py"
+    # flaky tests
+    "${tensorflow_source_dir}/tensorflow/python/kernel_tests/cwise_ops_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/tfprof/python/tools/tfprof/internal/run_metadata_test.py"
+    # Loading resources in contrib doesn't seem to work on Windows
+    "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/client/random_forest_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py"
   )
   if (WIN32)
     set(tf_test_src_py_exclude
@@ -191,7 +206,6 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/variable_scope_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/reshape_op_test.py"
       "${tensorflow_source_dir}/tensorflow/python/training/evaluation_test.py"
-      "${tensorflow_source_dir}/tensorflow/tensorboard/backend/server_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py"  # Depends on gemmlowp -> pthread.
       # int32/int64 mixup
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/functional_ops_test.py"
@@ -206,13 +220,7 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/training/supervisor_test.py"  # Flaky I/O error on rename.
       "${tensorflow_source_dir}/tensorflow/python/training/sync_replicas_optimizer_test.py"  # Needs portpicker.
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/array_ops_test.py"  # depends on python/framework/test_ops
-      # Broken TensorBoard tests due to different paths in windows
-      "${tensorflow_source_dir}/tensorflow/tensorboard/backend/application_test.py"
-      "${tensorflow_source_dir}/tensorflow/tensorboard/lib/python/http_util_test.py"
-      "${tensorflow_source_dir}/tensorflow/tensorboard/plugins/audio/audio_plugin_test.py"
-      "${tensorflow_source_dir}/tensorflow/tensorboard/plugins/images/images_plugin_test.py"
       # Broken tensorboard test due to cmake issues.
-      "${tensorflow_source_dir}/tensorflow/tensorboard/plugins/debugger/plugin_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py"
       # tensor_forest tests (also note that we exclude the hybrid tests for now)
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/kernel_tests/count_extremely_random_stats_op_test.py"  # Results in wrong order.
@@ -221,8 +229,6 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/topn_test.py"  # Results inaccurate
       "${tensorflow_source_dir}/tensorflow/python/ops/cloud/bigquery_reader_ops_test.py"  # No libcurl support
       # Newly running on Windows since TensorBoard backend move. Fail on Windows and need debug.
-      "${tensorflow_source_dir}/tensorflow/tensorboard/backend/event_processing/directory_watcher_test.py"
-      "${tensorflow_source_dir}/tensorflow/tensorboard/backend/event_processing/event_multiplexer_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py"  # Segfaults on Windows.
   )
   endif()
@@ -288,7 +294,6 @@ if (tensorflow_BUILD_CC_TESTS)
     "${tensorflow_source_dir}/tensorflow/cc/framework/gradients_test.cc"
     "${tensorflow_source_dir}/tensorflow/core/distributed_runtime/call_options_test.cc"
     "${tensorflow_source_dir}/tensorflow/core/distributed_runtime/tensor_coding_test.cc"
-    "${tensorflow_source_dir}/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc"
     "${tensorflow_source_dir}/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc"
     "${tensorflow_source_dir}/tensorflow/core/kernels/hexagon/graph_transferer_test.cc"
     "${tensorflow_source_dir}/tensorflow/core/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc"
@@ -383,6 +388,11 @@ if (tensorflow_BUILD_CC_TESTS)
     ${tf_cc_saved_model_test_srcs}
   )
 
+  file(GLOB tf_core_profiler_test_srcs
+    "${tensorflow_source_dir}/tensorflow/core/profiler/internal/*_test.cc"
+    "${tensorflow_source_dir}/tensorflow/core/profiler/internal/advisor/*_test.cc"
+  )
+
   set(tf_test_lib tf_test_lib)
   add_library(${tf_test_lib} STATIC ${tf_src_testlib})
 
@@ -428,4 +438,15 @@ if (tensorflow_BUILD_CC_TESTS)
     LIBS ${tf_test_libs}
   )
 
+  file(GLOB_RECURSE tf_core_profiler_test_data
+    "${tensorflow_source_dir}/tensorflow/core/profiler/testdata/*"
+  )
+
+  AddTests(
+    SOURCES ${tf_core_profiler_test_srcs}
+    DATA ${tf_core_profiler_test_data}
+    OBJECTS ${tf_obj_test}
+    LIBS ${tf_test_libs}
+  )
+
 endif(tensorflow_BUILD_CC_TESTS)
diff --git a/tensorflow/contrib/copy_graph/BUILD b/tensorflow/contrib/copy_graph/BUILD
index f47a084db1416a89b8000ee78039dd7a3b04aadf..e683f6229fe9fcd120b0a37b979e380484bc731c 100644
--- a/tensorflow/contrib/copy_graph/BUILD
+++ b/tensorflow/contrib/copy_graph/BUILD
@@ -20,7 +20,7 @@ py_library(
     deps = [
         "//tensorflow/python:client",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:session",
+        "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
 )
@@ -37,9 +37,7 @@ py_test(
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
     ],
diff --git a/tensorflow/contrib/copy_graph/python/util/copy_elements.py b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
index 3f124be771d421e6ea7c91ff9bf23983955cccdb..8c2528f548799f9facef740b0134ac56966b2b04 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
@@ -100,7 +100,9 @@ def copy_variable_to_graph(org_instance, to_graph, scope=""):
 
 def copy_op_to_graph(org_instance, to_graph, variables,
                      scope=""):
-  """Given an `Operation` 'org_instance` from one `Graph`,
+  """Returns a copy of an operation from another Graph under a specified scope.
+
+  Given an `Operation` `org_instance` from one `Graph`,
   initializes and returns a copy of it from another `Graph`,
   under the specified scope (default `""`).
 
diff --git a/tensorflow/contrib/crf/BUILD b/tensorflow/contrib/crf/BUILD
index e82d2cf6f8a3340e56d59cabef863bb4c1559a70..7aad4abdb908d0284b85137bff842bd0f38d09c6 100644
--- a/tensorflow/contrib/crf/BUILD
+++ b/tensorflow/contrib/crf/BUILD
@@ -15,11 +15,12 @@ py_library(
     srcs = ["__init__.py"] + glob(["python/ops/*.py"]),
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/rnn:rnn_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:rnn",
+        "//tensorflow/python:rnn_cell",
+        "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//third_party/py/numpy",
     ],
diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD
index b1caac476a2b355d5697a698d259dae4c45d264b..fc473d3380db69bc15e8b90ffc8f34703b935abb 100644
--- a/tensorflow/contrib/cudnn_rnn/BUILD
+++ b/tensorflow/contrib/cudnn_rnn/BUILD
@@ -87,6 +87,8 @@ cuda_py_test(
     additional_deps = [
         ":cudnn_rnn_py",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/python/ops/losses:losses",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework",
diff --git a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
index 7c3ca46e106d98b8e7da13c42662536953f55820..50a90e753d5dd7fd496cdb570b55da96f18ea836 100644
--- a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -565,15 +566,20 @@ class CudnnRNNParamsSizeOp<GPUDevice, T, Index> : public CudnnRNNKernelCommon {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("CudnnRNNParamsSize")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("num_layers")
-                            .HostMemory("num_units")
-                            .HostMemory("input_size")
-                            .HostMemory("params_size")
-                            .TypeConstraint<float>("T")
-                            .TypeConstraint<int32>("S"),
-                        CudnnRNNParamsSizeOp<GPUDevice, float, int32>);
+#define REGISTER_GPU(T)                                    \
+  REGISTER_KERNEL_BUILDER(Name("CudnnRNNParamsSize")       \
+                              .Device(DEVICE_GPU)          \
+                              .HostMemory("num_layers")    \
+                              .HostMemory("num_units")     \
+                              .HostMemory("input_size")    \
+                              .HostMemory("params_size")   \
+                              .TypeConstraint<T>("T")      \
+                              .TypeConstraint<int32>("S"), \
+                          CudnnRNNParamsSizeOp<GPUDevice, T, int32>);
+
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+#undef REGISTER_GPU
 
 // Convert weight and bias params from a platform-specific layout to the
 // canonical form.
@@ -683,13 +689,17 @@ class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
   int num_params_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("CudnnRNNParamsToCanonical")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("num_layers")
-                            .HostMemory("num_units")
-                            .HostMemory("input_size")
-                            .TypeConstraint<float>("T"),
-                        CudnnRNNParamsToCanonical<GPUDevice, float>);
+#define REGISTER_GPU(T)                                     \
+  REGISTER_KERNEL_BUILDER(Name("CudnnRNNParamsToCanonical") \
+                              .Device(DEVICE_GPU)           \
+                              .HostMemory("num_layers")     \
+                              .HostMemory("num_units")      \
+                              .HostMemory("input_size")     \
+                              .TypeConstraint<T>("T"),      \
+                          CudnnRNNParamsToCanonical<GPUDevice, T>);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+#undef REGISTER_GPU
 
 // Convert weight and bias params from the canonical form to a
 // platform-specific layout.
@@ -725,13 +735,16 @@ class CudnnRNNCanonicalToParams<GPUDevice, T> : public CudnnRNNKernelCommon {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("CudnnRNNCanonicalToParams")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("num_layers")
-                            .HostMemory("num_units")
-                            .HostMemory("input_size")
-                            .TypeConstraint<float>("T"),
-                        CudnnRNNCanonicalToParams<GPUDevice, float>);
+#define REGISTER_GPU(T)                                     \
+  REGISTER_KERNEL_BUILDER(Name("CudnnRNNCanonicalToParams") \
+                              .Device(DEVICE_GPU)           \
+                              .HostMemory("num_layers")     \
+                              .HostMemory("num_units")      \
+                              .HostMemory("input_size")     \
+                              .TypeConstraint<T>("T"),      \
+                          CudnnRNNCanonicalToParams<GPUDevice, T>);
+TF_CALL_float(REGISTER_GPU) TF_CALL_double(REGISTER_GPU);
+#undef REGISTER_GPU
 
 // Run the forward operation of the RNN model.
 template <typename T>
@@ -874,9 +887,14 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
       GUARDED_BY(mu_);
 };
 
-REGISTER_KERNEL_BUILDER(
-    Name("CudnnRNN").Device(DEVICE_GPU).TypeConstraint<float>("T"),
-    CudnnRNNForwardOp<GPUDevice, float>);
+#define REGISTER_GPU(T)                                           \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("CudnnRNN").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      CudnnRNNForwardOp<GPUDevice, T>);
+
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+#undef REGISTER_GPU
 
 // Run the backward operation of the RNN model.
 template <typename T>
@@ -1088,9 +1106,14 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
       GUARDED_BY(mu_);
 };
 
-REGISTER_KERNEL_BUILDER(
-    Name("CudnnRNNBackprop").Device(DEVICE_GPU).TypeConstraint<float>("T"),
-    CudnnRNNBackwardOp<GPUDevice, float>);
+#define REGISTER_GPU(T)                                                   \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("CudnnRNNBackprop").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      CudnnRNNBackwardOp<GPUDevice, T>);
+
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+#undef REGISTER_GPU
 
 // TODO(zhengxq): Add the conversion of Cudnn RNN Params from and to
 // its canonical form.
diff --git a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
index 2c631b064b559e19d767297e8ba5bfda06ab0880..151dc10c26bc19c39caa00db7c0377599a653f77 100644
--- a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
+++ b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
@@ -75,7 +75,7 @@ REGISTER_OP("CudnnRNNParamsSize")
     .Input("num_layers: int32")
     .Input("num_units: int32")
     .Input("input_size: int32")
-    .Attr("T: {float}")
+    .Attr("T: {float32, float64}")
     .Attr("S: {int32, int64}")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
@@ -130,7 +130,7 @@ REGISTER_OP("CudnnRNN")
     .Output("output_h: T")
     .Output("output_c: T")
     .Output("reserve_space: T")
-    .Attr("T: {float}")
+    .Attr("T: {float32, float64}")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
     .Attr(kRNNDirectionAttrs)
@@ -190,7 +190,7 @@ REGISTER_OP("CudnnRNNBackprop")
     .Output("input_h_backprop: T")
     .Output("input_c_backprop: T")
     .Output("params_backprop: T")
-    .Attr("T: {float}")
+    .Attr("T: {float32, float64}")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
     .Attr(kRNNDirectionAttrs)
@@ -236,7 +236,7 @@ REGISTER_OP("CudnnRNNParamsToCanonical")
     .Input("params: T")
     .Output("weights: num_params * T")
     .Output("biases: num_params * T")
-    .Attr("T: {float}")
+    .Attr("T: {float32, float64}")
     .Attr("num_params: int")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
@@ -279,7 +279,7 @@ REGISTER_OP("CudnnRNNCanonicalToParams")
     .Input("weights: num_params * T")
     .Input("biases: num_params * T")
     .Output("params: T")
-    .Attr("T: {float}")
+    .Attr("T: {float32, float64}")
     .Attr("num_params: int")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
index 08ec3076e49696602f729772e8dc3686c281cbaa..28d68ac6109188f7f3440d8ce1dccb74f0ef53d8 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
@@ -18,10 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import itertools
 import os
 import unittest
+import numpy as np
+
 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
 from tensorflow.core.protobuf import saver_pb2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework.test_util import TensorFlowTestCase
@@ -29,13 +34,42 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import rnn as rnn_lib
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import saver as saver_lib
 
 
+def _create_cudnn_compatible_canonical_rnn(cudnn_model,
+                                           inputs,
+                                           use_block_cell,
+                                           scope="rnn"):
+  model = cudnn_model.rnn_mode
+  if model not in (cudnn_rnn_ops.CUDNN_LSTM, cudnn_rnn_ops.CUDNN_GRU):
+    raise ValueError("%s is not supported!" % model)
+  if model == cudnn_rnn_ops.CUDNN_GRU and use_block_cell:
+    raise ValueError("gru is not supported when using block cell!")
+
+  num_units = cudnn_model.num_units
+  num_layers = cudnn_model.num_layers
+  # To reuse cuDNN-trained models, must use cudnn compatible rnn cells.
+  if use_block_cell:
+    single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleLSTMBlockCell(num_units)
+  else:
+    if model == cudnn_rnn_ops.CUDNN_LSTM:
+      single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleLSTMCell(num_units)
+    else:
+      single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units)
+  cell = rnn_cell_impl.MultiRNNCell([single_cell() for _ in range(num_layers)])
+  return rnn_lib.dynamic_rnn(
+      cell, inputs, dtype=dtypes.float32, time_major=True, scope=scope)
+
+
 class CudnnRNNTest(TensorFlowTestCase):
 
   def _CreateModel(self,
@@ -44,40 +78,48 @@ class CudnnRNNTest(TensorFlowTestCase):
                    num_units,
                    input_size,
                    input_mode="linear_input",
+                   dtype=dtypes.float32,
                    dropout=0.):
-    if rnn_mode == "lstm":
+    if rnn_mode == cudnn_rnn_ops.CUDNN_LSTM:
       model = cudnn_rnn_ops.CudnnLSTM(
-          num_layers, num_units, input_size, dropout=dropout)
-    elif rnn_mode == "gru":
+          num_layers, num_units, input_size, dtype=dtype, dropout=dropout)
+    elif rnn_mode == cudnn_rnn_ops.CUDNN_GRU:
       model = cudnn_rnn_ops.CudnnGRU(
-          num_layers, num_units, input_size, dropout=dropout)
-    elif rnn_mode == "rnn_tanh":
+          num_layers, num_units, input_size, dtype=dtype, dropout=dropout)
+    elif rnn_mode == cudnn_rnn_ops.CUDNN_RNN_TANH:
       model = cudnn_rnn_ops.CudnnRNNTanh(
-          num_layers, num_units, input_size, dropout=dropout)
-    elif rnn_mode == "rnn_relu":
+          num_layers, num_units, input_size, dtype=dtype, dropout=dropout)
+    elif rnn_mode == cudnn_rnn_ops.CUDNN_RNN_RELU:
       model = cudnn_rnn_ops.CudnnRNNRelu(
-          num_layers, num_units, input_size, dropout=dropout)
+          num_layers, num_units, input_size, dtype=dtype, dropout=dropout)
     else:
       raise ValueError("Invalid rnn_mode: %s" % rnn_mode)
     return model
 
-  def _create_params_savable(self, params, model):
+  def _create_params_savable(self, params, model, base_variable_scope="rnn",
+                             name="params_canonical"):
     """Create a RNNParamsSaveable for the weight and bias parameters.
 
     Args:
       params: a Variable for weight and bias parameters.
       model: a CudnnRNN model.
+      base_variable_scope: a string, prefix of names of saved variables.
+      name: a string, name of the RNNParamsSaveable object.
     """
     params_saveable = cudnn_rnn_ops.RNNParamsSaveable(
-        model.params_to_canonical, model.canonical_to_params, [params])
+        model, model.params_to_canonical, model.canonical_to_params, [params],
+        base_variable_scope=base_variable_scope, name=name)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, params_saveable)
 
-  def _testSaveRestoreVariable(self, rnn_mode):
-    model = self._CreateModel(rnn_mode, num_layers=2, num_units=7, input_size=3)
+  def _testSaveRestoreVariable(self, rnn_mode, dtype):
+    model = self._CreateModel(
+        rnn_mode, num_layers=2, num_units=7, input_size=3, dtype=dtype)
     random_seed.set_random_seed(1234)
     params_size_t = model.params_size()
     params = variables.Variable(
-        random_ops.random_uniform([params_size_t]), validate_shape=False)
+        random_ops.random_uniform([params_size_t], dtype=dtype),
+        dtype=dtype,
+        validate_shape=False)
     self._create_params_savable(params, model)
     save_path = os.path.join(self.get_temp_dir(), "save-restore-variable-test")
     saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
@@ -87,32 +129,251 @@ class CudnnRNNTest(TensorFlowTestCase):
       val = saver.save(sess, save_path)
       self.assertEqual(save_path, val)
     with self.test_session(use_gpu=True) as sess:
-      reset_params = state_ops.assign(params, array_ops.zeros([params_size_t]))
+      reset_params = state_ops.assign(params,
+                                      array_ops.zeros(
+                                          [params_size_t], dtype=dtype))
       sess.run(reset_params)
       saver.restore(sess, save_path)
       params_v_restored = sess.run(params)
       self.assertAllEqual(params_v, params_v_restored)
 
-  def _testSaveRestoreOutput(self, rnn_mode):
+  def _testSaveRestoreTwoVariables(self, rnn_mode, dtype):
+    model = self._CreateModel(
+        rnn_mode, num_layers=2, num_units=7, input_size=3, dtype=dtype)
+    random_seed.set_random_seed(1234)
+    params_size_t = model.params_size()
+    names = ["rnn_1", "rnn_2"]
+    param_vars = [variables.Variable(
+        random_ops.random_uniform([params_size_t], dtype=dtype),
+        dtype=dtype,
+        validate_shape=False) for name in names]
+    for name, params in zip(names, param_vars):
+      self._create_params_savable(params, model, name, name)
+    save_path = os.path.join(self.get_temp_dir(), "save-restore-variable-test")
+    saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
+    with self.test_session(use_gpu=True) as sess:
+      sess.run(variables.global_variables_initializer())
+      params_v = sess.run(param_vars)
+      val = saver.save(sess, save_path)
+      self.assertEqual(save_path, val)
+    with self.test_session(use_gpu=True) as sess:
+      reset_params = [
+          state_ops.assign(params,
+                           array_ops.zeros(
+                               [params_size_t], dtype=dtype))
+          for params in param_vars]
+      sess.run(reset_params)
+      saver.restore(sess, save_path)
+      params_v_restored = sess.run(param_vars)
+      for v, v_restored in zip(params_v, params_v_restored):
+        self.assertAllEqual(v, v_restored)
+
+  def _build_forward_cudnn_model(self,
+                                 rnn_mode,
+                                 num_layers,
+                                 num_units,
+                                 input_data,
+                                 is_training=False):
+    input_data_shape = input_data.get_shape().with_rank(3)
+    batch_size = input_data_shape[1].value
+    input_size = input_data_shape[2].value
+    model = self._CreateModel(rnn_mode, num_layers, num_units, input_size)
+
+    # Set zero init input states
+    input_h = constant_op.constant(
+        np.zeros([num_layers, batch_size, num_units]), dtype=dtypes.float32)
+    has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM)
+    if has_input_c:
+      input_c = constant_op.constant(
+          np.zeros([num_layers, batch_size, num_units]), dtype=dtypes.float32)
+
+    # Set rnn params
+    params_size_t = model.params_size()
+    params = variables.Variable(
+        random_ops.random_uniform([params_size_t]), validate_shape=False)
+    args = {
+        "input_data": input_data,
+        "input_h": input_h,
+        "params": params,
+        "is_training": is_training
+    }
+    if has_input_c:
+      args["input_c"] = input_c
+    # Build cell
+    output_tuple = model(**args)
+
+    # Create savable objects for params
+    self._create_params_savable(params, model)
+
+    return output_tuple, model, params
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def testCudnnCompatibleRnnCells(self):
+    configs = [
+        {
+            "num_layers": 1,
+            "seq_length": 3,
+            "num_units": 4,
+            "input_size": 5,
+            "batch_size": 6,
+        },
+        {
+            "num_layers": 2,
+            "seq_length": 8,
+            "num_units": 4,
+            "input_size": 8,
+            "batch_size": 16,
+        },
+        {
+            "num_layers": 2,
+            "seq_length": 3,
+            "num_units": 4,
+            "input_size": 5,
+            "batch_size": 6,
+        },
+        {
+            "num_layers": 1,
+            "seq_length": 2,
+            "num_units": 2,
+            "input_size": 4,
+            "batch_size": 1,
+        },
+    ]
+    for rnn, cfg, use_block_cell in itertools.product(
+        (cudnn_rnn_ops.CUDNN_LSTM,), configs, (True, False,)):
+      self._testCudnnCompatibleRnnCells(cfg["num_layers"], cfg["seq_length"],
+                                        cfg["num_units"], cfg["input_size"],
+                                        cfg["batch_size"], rnn, use_block_cell)
+    # TODO(jamesqin): Add CudnnCompatibleGRUBlockCell.
+    for rnn, cfg, use_block_cell in itertools.product(
+        (cudnn_rnn_ops.CUDNN_GRU,), configs, (False,)):
+      self._testCudnnCompatibleRnnCells(cfg["num_layers"], cfg["seq_length"],
+                                        cfg["num_units"], cfg["input_size"],
+                                        cfg["batch_size"], rnn, use_block_cell)
+
+  def _testCudnnCompatibleRnnCells(self, num_layers, seq_length, num_units,
+                                   input_size, batch_size, rnn_mode,
+                                   use_block_cell):
+    has_state_c = rnn_mode == cudnn_rnn_ops.CUDNN_LSTM
+    np.random.seed(0)
+    # Train graph
+    with ops.Graph().as_default():
+      random_seed.set_random_seed(299)
+      input_data = array_ops.placeholder(
+          dtypes.float32, shape=[seq_length, batch_size, input_size])
+      output_tuple, cudnn_model, cudnn_params = self._build_forward_cudnn_model(
+          rnn_mode, num_layers, num_units, input_data, is_training=True)
+      target_output = array_ops.placeholder(dtype=dtypes.float32, shape=None)
+      total_sum = sum(map(math_ops.reduce_sum, output_tuple))
+
+      loss_op = losses.log_loss(labels=target_output, predictions=total_sum)
+      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1e-2)
+      train_op = optimizer.minimize(loss_op)
+
+      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
+
+      # Train Cudnn model
+      with self.test_session(
+          use_gpu=True, graph=ops.get_default_graph()) as sess:
+        sess.run(variables.global_variables_initializer())
+        # Train 128 steps
+        num_steps = 128
+        for _ in range(num_steps):
+          inputs = np.random.rand(seq_length, batch_size,
+                                  input_size).astype(np.float32)
+          targets = np.random.rand()
+          sess.run(
+              train_op, feed_dict={input_data: inputs,
+                                   target_output: targets})
+
+        save_path = os.path.join(self.get_temp_dir(),
+                                 ("cudnn-rnn-%s-test" % rnn_mode))
+        save_v = saver.save(sess, save_path)
+        self.assertEqual(save_path, save_v)
+        cudnn_params_v = sess.run(cudnn_params)
+
+    # cuDNN inference graph
+    with ops.Graph().as_default():
+      random_seed.set_random_seed(299)
+      cudnn_inputs = array_ops.placeholder(
+          dtypes.float32, shape=[seq_length, batch_size, input_size])
+      (cudnn_output_tuple, cudnn_model,
+       cudnn_params) = self._build_forward_cudnn_model(
+           rnn_mode, num_layers, num_units, cudnn_inputs, is_training=False)
+      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
+
+      inference_input = np.random.rand(seq_length, batch_size,
+                                       input_size).astype(np.float32)
+      with self.test_session(
+          use_gpu=True, graph=ops.get_default_graph()) as sess:
+        sess.run(variables.global_variables_initializer())
+        saver.restore(sess, save_path)
+        restored_cudnn_params_v = sess.run(cudnn_params)
+        self.assertAllEqual(cudnn_params_v, restored_cudnn_params_v)
+
+        # Cudnn inference
+        cudnn_output = sess.run(
+            cudnn_output_tuple, feed_dict={cudnn_inputs: inference_input})
+
+    # Canonical RNN inference graph
+    with ops.Graph().as_default():
+      random_seed.set_random_seed(299)
+      cell_inputs = array_ops.placeholder(
+          dtypes.float32, shape=[seq_length, batch_size, input_size])
+      (output, states) = _create_cudnn_compatible_canonical_rnn(
+          cudnn_model, cell_inputs, use_block_cell)
+      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
+
+      with self.test_session(
+          use_gpu=True, graph=ops.get_default_graph()) as sess:
+        saver.restore(sess, save_path)
+
+        # BlockCell inference
+        output_v, states_v = sess.run(
+            [output, states], feed_dict={cell_inputs: inference_input})
+
+        # output across timestamps are packed into one tensor.
+        self.assertAllClose(cudnn_output[0], output_v, atol=1e-6, rtol=1e-6)
+
+        for i in range(num_layers):
+          if has_state_c:
+            # output_h
+            self.assertAllClose(
+                cudnn_output[1][i, :], states_v[i].h, atol=1e-6, rtol=1e-6)
+            # output_c
+            self.assertAllClose(
+                cudnn_output[2][i, :], states_v[i].c, atol=1e-6, rtol=1e-6)
+          else:
+            self.assertAllClose(
+                cudnn_output[1][i, :], states_v[i], atol=1e-6, rtol=1e-6)
+
+  def _testSaveRestoreOutput(self, rnn_mode, dtype):
     num_layers = 2
     num_units = 7
     input_size = 7
     seq_length = 10
     batch_size = 5
     dir_count = 1
-    model = self._CreateModel(rnn_mode, num_layers, num_units, input_size)
+    model = self._CreateModel(
+        rnn_mode, num_layers, num_units, input_size, dtype=dtype)
     params_size_t = model.params_size()
     params = variables.Variable(
-        array_ops.ones([params_size_t]), validate_shape=False)
+        array_ops.ones([params_size_t], dtype=dtype),
+        validate_shape=False,
+        dtype=dtype)
     self._create_params_savable(params, model)
     save_path = os.path.join(self.get_temp_dir(), "save-restore-output-test")
     saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
 
-    has_input_c = (rnn_mode == "lstm")
-    input_data = array_ops.ones([seq_length, batch_size, input_size])
-    input_h = array_ops.ones([num_layers * dir_count, batch_size, num_units])
+    has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM)
+    input_data = array_ops.ones(
+        [seq_length, batch_size, input_size], dtype=dtype)
+    input_h = array_ops.ones(
+        [num_layers * dir_count, batch_size, num_units], dtype=dtype)
     if has_input_c:
-      input_c = array_ops.ones([num_layers * dir_count, batch_size, num_units])
+      input_c = array_ops.ones(
+          [num_layers * dir_count, batch_size, num_units], dtype=dtype)
       outputs = model(
           input_data=input_data,
           input_h=input_h,
@@ -132,7 +393,9 @@ class CudnnRNNTest(TensorFlowTestCase):
       val = saver.save(sess, save_path)
       self.assertEqual(save_path, val)
     with self.test_session(use_gpu=True) as sess:
-      reset_params = state_ops.assign(params, array_ops.zeros([params_size_t]))
+      reset_params = state_ops.assign(params,
+                                      array_ops.zeros(
+                                          [params_size_t], dtype=dtype))
       sess.run(reset_params)
       saver.restore(sess, save_path)
       total_sum_v_restored = sess.run(total_sum)
@@ -141,18 +404,23 @@ class CudnnRNNTest(TensorFlowTestCase):
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def testSaveRestore(self):
-    rnn_modes = ["lstm", "gru", "rnn_tanh", "rnn_relu"]
-    for rnn_mode in rnn_modes:
-      self._testSaveRestoreVariable(rnn_mode)
-      self._testSaveRestoreOutput(rnn_mode)
+    rnn_modes = [
+        cudnn_rnn_ops.CUDNN_LSTM, cudnn_rnn_ops.CUDNN_GRU,
+        cudnn_rnn_ops.CUDNN_RNN_TANH, cudnn_rnn_ops.CUDNN_RNN_RELU
+    ]
+    dtype_list = [dtypes.float32, dtypes.float64]
+    for rnn_mode, dtype in itertools.product(rnn_modes, dtype_list):
+      self._testSaveRestoreVariable(rnn_mode, dtype)
+      self._testSaveRestoreTwoVariables(rnn_mode, dtype)
+      self._testSaveRestoreOutput(rnn_mode, dtype)
 
   def _MinLSTMParamSize(self,
                         num_layers,
                         num_units,
                         input_size,
                         input_mode="auto_select",
-                        direction="unidirection"):
-    if direction != "unidirection":
+                        direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION):
+    if direction != cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION:
       # TODO(zhengxq): support bidirection in parameter size estimate.
       raise ValueError("Only unidirection in parameter size estimate")
     first_layer_weights = 4 * num_units * (num_units + input_size)
@@ -162,7 +430,8 @@ class CudnnRNNTest(TensorFlowTestCase):
 
   def _testOneLSTMParamsSize(self, num_layers, num_units, input_size):
     min_params_size = self._MinLSTMParamSize(num_layers, num_units, input_size)
-    model = self._CreateModel("lstm", num_layers, num_units, input_size)
+    model = self._CreateModel(cudnn_rnn_ops.CUDNN_LSTM, num_layers, num_units,
+                              input_size)
     params_size = model.params_size()
     with self.test_session(use_gpu=True) as sess:
       params_size_v = sess.run(params_size)
@@ -187,10 +456,14 @@ class CudnnRNNTest(TensorFlowTestCase):
                               batch_size, seq_length, dir_count, dropout,
                               expected, tolerance):
     random_seed.set_random_seed(5678)
-    model = self._CreateModel(rnn_mode, num_layers, num_units, input_size,
-                              input_mode="auto_select",
-                              dropout=dropout)
-    has_input_c = (rnn_mode == "lstm")
+    model = self._CreateModel(
+        rnn_mode,
+        num_layers,
+        num_units,
+        input_size,
+        input_mode="auto_select",
+        dropout=dropout)
+    has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM)
     params_size_t = model.params_size()
     input_data = array_ops.ones([seq_length, batch_size, input_size])
     input_h = array_ops.ones([num_layers * dir_count, batch_size, num_units])
@@ -216,7 +489,7 @@ class CudnnRNNTest(TensorFlowTestCase):
     if has_input_c:
       output_c_sum = math_ops.reduce_sum(output_c)
       total_sum += output_c_sum
-    with self.test_session(use_gpu=True) as sess:
+    with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess:
       sess.run(variables.global_variables_initializer())
       total_sum_v = sess.run([total_sum])
 
@@ -232,7 +505,7 @@ class CudnnRNNTest(TensorFlowTestCase):
     # demonstrative of the dropout-invariant nature of CudnnRnn.)
     test_configs = [
         {
-            "rnn_mode": "lstm",
+            "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM,
             "dropout": [0., 0.5, 1.],
             "expected": 231833.22,
             "tolerance": 1e-2,
@@ -246,7 +519,7 @@ class CudnnRNNTest(TensorFlowTestCase):
             },
         },
         {
-            "rnn_mode": "gru",
+            "rnn_mode": cudnn_rnn_ops.CUDNN_GRU,
             "dropout": [0., 0.5, 1.],
             "expected": 56000,
             "tolerance": 1e-2,
@@ -260,7 +533,7 @@ class CudnnRNNTest(TensorFlowTestCase):
             },
         },
         {
-            "rnn_mode": "rnn_tanh",
+            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH,
             "dropout": [0., 0.5, 1.],
             "expected": 56000,
             "tolerance": 1e-2,
@@ -274,7 +547,7 @@ class CudnnRNNTest(TensorFlowTestCase):
             },
         },
         {
-            "rnn_mode": "rnn_relu",
+            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU,
             "dropout": [0., 0.5, 1.],
             "expected": 130688,
             "tolerance": 1e-2,
@@ -302,28 +575,39 @@ class CudnnRNNTest(TensorFlowTestCase):
               shape["dir_count"], dropout, expected, tolerance)
 
   def _testOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
-                             batch_size, seq_length, dir_count, dropout,
-                             tolerance):
+                             batch_size, seq_length, dir_count, dropout, dtype,
+                             delta, tolerance):
     # Gradient checking runs two forward ops with almost the same input. Need to
     # make sure the drop patterns across the two runs are the same.
     old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
     os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)
-    has_input_c = (rnn_mode == "lstm")
+    has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM)
     random_seed.set_random_seed(1234)
-    model = self._CreateModel(rnn_mode, num_layers, num_units, input_size,
-                              dropout=dropout)
+    model = self._CreateModel(
+        rnn_mode,
+        num_layers,
+        num_units,
+        input_size,
+        dtype=dtype,
+        dropout=dropout)
     params_size_t = model.params_size()
     input_data = variables.Variable(
-        random_ops.random_uniform([seq_length, batch_size, input_size]))
+        random_ops.random_uniform(
+            [seq_length, batch_size, input_size], dtype=dtype),
+        dtype=dtype)
     input_h = variables.Variable(
         random_ops.random_uniform(
-            [num_layers * dir_count, batch_size, num_units]))
+            [num_layers * dir_count, batch_size, num_units], dtype=dtype),
+        dtype=dtype)
     params = variables.Variable(
-        random_ops.random_uniform([params_size_t]), validate_shape=False)
+        random_ops.random_uniform([params_size_t], dtype=dtype),
+        validate_shape=False,
+        dtype=dtype)
     if has_input_c:
       input_c = variables.Variable(
           random_ops.random_uniform(
-              [num_layers * dir_count, batch_size, num_units]))
+              [num_layers * dir_count, batch_size, num_units], dtype=dtype),
+          dtype=dtype)
 
       output, output_h, output_c = model(
           input_data=input_data,
@@ -354,8 +638,8 @@ class CudnnRNNTest(TensorFlowTestCase):
       all_inputs = [entry[0] for entry in inputs_and_shapes]
       all_shapes = [entry[1] for entry in inputs_and_shapes]
 
-      err = gradient_checker.compute_gradient_error(all_inputs, all_shapes,
-                                                    total_sum, [1])
+      err = gradient_checker.compute_gradient_error(
+          all_inputs, all_shapes, total_sum, [1], delta=delta)
 
       self.assertLess(err, tolerance)
       os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state
@@ -365,8 +649,69 @@ class CudnnRNNTest(TensorFlowTestCase):
   def testSimpleTraining(self):
     test_configs = [
         {
-            "rnn_mode": "lstm",
+            "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM,
+            "dropout": [0., 0.5, 1.],
+            "dtype": dtypes.float64,
+            "delta": 1e-4,
+            "tolerance": 5e-6,
+            "shape": {
+                "num_layers": 2,
+                "num_units": 3,
+                "input_size": 4,
+                "batch_size": 3,
+                "seq_length": 4,
+                "dir_count": 1,
+            },
+        },
+        {
+            "rnn_mode": cudnn_rnn_ops.CUDNN_GRU,
+            "dropout": [0., 0.5, 1.],
+            "dtype": dtypes.float64,
+            "delta": 1e-4,
+            "tolerance": 5e-6,
+            "shape": {
+                "num_layers": 2,
+                "num_units": 3,
+                "input_size": 4,
+                "batch_size": 3,
+                "seq_length": 4,
+                "dir_count": 1,
+            },
+        },
+        {
+            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH,
+            "dropout": [0., 0.5, 1.],
+            "dtype": dtypes.float64,
+            "delta": 1e-4,
+            "tolerance": 5e-6,
+            "shape": {
+                "num_layers": 2,
+                "num_units": 3,
+                "input_size": 4,
+                "batch_size": 3,
+                "seq_length": 4,
+                "dir_count": 1,
+            },
+        },
+        {
+            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU,
+            "dropout": [0., 0.5, 1.],
+            "dtype": dtypes.float64,
+            "delta": 1e-4,
+            "tolerance": 5e-6,
+            "shape": {
+                "num_layers": 2,
+                "num_units": 3,
+                "input_size": 4,
+                "batch_size": 3,
+                "seq_length": 4,
+                "dir_count": 1,
+            },
+        },
+        {
+            "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM,
             "dropout": [0., 0.5, 1.],
+            "dtype": dtypes.float32,
             "tolerance": 1e-2,
             "shape": {
                 "num_layers": 2,
@@ -378,8 +723,9 @@ class CudnnRNNTest(TensorFlowTestCase):
             },
         },
         {
-            "rnn_mode": "gru",
+            "rnn_mode": cudnn_rnn_ops.CUDNN_GRU,
             "dropout": [0., 0.5, 1.],
+            "dtype": dtypes.float32,
             "tolerance": 4e-3,
             "shape": {
                 "num_layers": 2,
@@ -391,8 +737,9 @@ class CudnnRNNTest(TensorFlowTestCase):
             },
         },
         {
-            "rnn_mode": "rnn_tanh",
+            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH,
             "dropout": [0., 0.5, 1.],
+            "dtype": dtypes.float32,
             "tolerance": 5e-3,
             "shape": {
                 "num_layers": 2,
@@ -404,8 +751,9 @@ class CudnnRNNTest(TensorFlowTestCase):
             },
         },
         {
-            "rnn_mode": "rnn_relu",
+            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU,
             "dropout": [0., 0.5, 1.],
+            "dtype": dtypes.float32,
             "tolerance": 4e-1,
             "shape": {
                 "num_layers": 2,
@@ -417,17 +765,20 @@ class CudnnRNNTest(TensorFlowTestCase):
             },
         },
     ]
+    ops.reset_default_graph()
     with ops.Graph().as_default():
       for config in test_configs:
         rnn_mode = config["rnn_mode"]
         dropout_list = config.get("dropout", [0.])
+        dtype = config.get("dtype", dtypes.float32)
+        delta = config.get("delta", 1e-3)
         tolerance = config["tolerance"]
         shape = config["shape"]
         for dropout in dropout_list:
-          self._testOneSimpleTraining(rnn_mode, shape["num_layers"],
-                                      shape["num_units"], shape["input_size"],
-                                      shape["batch_size"], shape["seq_length"],
-                                      shape["dir_count"], dropout, tolerance)
+          self._testOneSimpleTraining(
+              rnn_mode, shape["num_layers"], shape["num_units"],
+              shape["input_size"], shape["batch_size"], shape["seq_length"],
+              shape["dir_count"], dropout, dtype, delta, tolerance)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index cc0c7b082964546741c17a5cc9345330c3d8d6cc..be5af6900cc1e0495e3522f3f7b97d4a6befdd61 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -16,9 +16,9 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-import itertools
 
 from tensorflow.contrib.cudnn_rnn.ops import gen_cudnn_rnn_ops
+from tensorflow.contrib.rnn.python.ops import lstm_ops
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
@@ -26,13 +26,30 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.training import saver
 
 _cudnn_rnn_ops_so = loader.load_op_library(
     resource_loader.get_path_to_datafile("_cudnn_rnn_ops.so"))
 
+_flatten_transpose = lambda t: array_ops.reshape(array_ops.transpose(t), [-1])
+# pylint: disable=g-long-lambda
+_transpose_reshape = lambda t, shape: array_ops.transpose(
+    array_ops.reshape(t, shape))
+# pylint: enable=g-long-lambda
+
+CUDNN_RNN_UNIDIRECTION = "unidirectional"
+CUDNN_RNN_BIDIRECTION = "bidirectional"
+CUDNN_LSTM = "lstm"
+CUDNN_GRU = "gru"
+CUDNN_RNN_RELU = "rnn_relu"
+CUDNN_RNN_TANH = "rnn_tanh"
+
 
 # TODO(yaozhang): make sure we only save the canonical version of params and
 # don't save the platform-specific version to avoid potential race
@@ -46,9 +63,11 @@ class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
   """SaveableObject implementation that handles the RNN params variable."""
 
   def __init__(self,
+               cudnn_rnn,
                params_to_canonical,
                canonical_to_params,
                param_variables,
+               base_variable_scope=None,
                name="params_canonical"):
     """Creates a RNNParamsSaveable object.
 
@@ -75,6 +94,7 @@ class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
        tensor 1 and 4 the update gate; tensor 2 and 5 the new memory gate.
 
     Args:
+      cudnn_rnn: cudnn RNN class instance.
       params_to_canonical: a function to convert params from a specific format
           for cuDNN or other RNN ops to the canonical format.
           _CudnnRNN.params_to_canonical() should be provided here.
@@ -87,25 +107,52 @@ class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
           For cuDNN RNN ops, this is a single merged variable for both weights
           and biases; for other RNN ops, this might be multiple unmerged or
           partially merged variables respectively for weights and biases.
+      base_variable_scope: a string, name of outer variable scope, used as
+          part of prefix of names of saved variables.
       name: the name of the RNNParamsSaveable object.
     """
     # There is only a single merged parameter variable for cuDNN when saving.
+    self._cudnn_rnn = cudnn_rnn
     weights, biases = params_to_canonical(param_variables[0])
+    weights, biases, = self._transform_canonical(weights, biases)
+    weight_names, biase_names = self._transformed_canonical_names(
+        weights, biases)
     self._canonical_to_params = canonical_to_params
     self._variables = param_variables
     # We currently don't use slice_spec. It might be useful in a distributed
     # setting where each parameter server node stores a slice of variable,
     # instead of having the master pull all slices and then save them.
     slice_spec = ""
+    params = weights + biases
+    param_names = weight_names + biase_names
+    if base_variable_scope:
+      param_names = ["%s/%s" % (base_variable_scope, pn) for pn in param_names]
     specs = [
-        saver.BaseSaverBuilder.SaveSpec(param, slice_spec, param.name)
-        for param in itertools.chain(weights, biases)
+        saver.BaseSaverBuilder.SaveSpec(param, slice_spec, param_name)
+        for param, param_name in zip(params, param_names)
     ]
-    super(RNNParamsSaveable, self).__init__(None, specs, name)
+    super(RNNParamsSaveable, self).__init__(
+        param_variables[0], specs, name)
 
   def restore(self, restored_tensors, restored_shapes):
-    weights = restored_tensors[:len(restored_tensors) // 2]
-    biases = restored_tensors[len(restored_tensors) // 2:]
+    if (self._cudnn_rnn.direction == CUDNN_RNN_UNIDIRECTION and
+        self._cudnn_rnn.rnn_mode == CUDNN_LSTM):
+      if len(restored_tensors) % 4 != 0:
+        raise ValueError(
+            "Invalid count of restored_tensors, expecting a multiple of 4.")
+      weights = restored_tensors[:len(restored_tensors) // 4]
+      biases = restored_tensors[len(restored_tensors) // 4:]
+    elif (self._cudnn_rnn.direction == CUDNN_RNN_UNIDIRECTION and
+          self._cudnn_rnn.rnn_mode == CUDNN_GRU):
+      if len(restored_tensors) % 8 != 0:
+        raise ValueError(
+            "Invalid count of restored_tensors, expecting a multiple of 8.")
+      weights = restored_tensors[:len(restored_tensors) // 8 * 3]
+      biases = restored_tensors[len(restored_tensors) // 8 * 3:]
+    else:
+      weights = restored_tensors[:len(restored_tensors) // 2]
+      biases = restored_tensors[len(restored_tensors) // 2:]
+    weights, biases = self._untransform_canonical(weights, biases)
     params = self._canonical_to_params(weights, biases)
     if not isinstance(params, tuple):
       params = (params,)
@@ -115,6 +162,318 @@ class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     ]
     return control_flow_ops.group(*assign_ops)
 
+  def _switch_inner(self, array, base_idx):
+    array[base_idx + 1], array[base_idx + 2] = (array[base_idx + 2],
+                                                array[base_idx + 1])
+
+  def _transform_canonical(self, weights, biases):
+    if self._cudnn_rnn.direction != CUDNN_RNN_UNIDIRECTION:
+      return weights, biases
+    elif self._cudnn_rnn.rnn_mode == CUDNN_LSTM:
+      return self._transform_lstm_canonical(weights, biases)
+    elif self._cudnn_rnn.rnn_mode == CUDNN_GRU:
+      return self._transform_gru_canonical(weights, biases)
+    return weights, biases
+
+  def _transformed_canonical_names(self, weights, biases):
+    """Returns canonical names for transformed weight and bias tensors."""
+    if self._cudnn_rnn.direction != CUDNN_RNN_UNIDIRECTION:
+      assert len(weights) == len(biases)
+      return ([w.name for w in weights], [b.name for b in biases])
+    elif self._cudnn_rnn.rnn_mode == CUDNN_LSTM:
+      assert len(weights) * 3 == len(biases)
+      return self._transformed_lstm_canonical_names()
+    elif self._cudnn_rnn.rnn_mode == CUDNN_GRU:
+      assert len(weights) * 5 == len(biases) * 3
+      return self._transformed_gru_canonical_names()
+    assert len(weights) == len(biases)
+    return ([w.name for w in weights], [b.name for b in biases])
+
+  def _transformed_lstm_canonical_names(self):
+    w_names, b_names = [], []
+    num_layers = self._cudnn_rnn.num_layers
+    # TODO(jamesqin): get rid of multi_rnn_cell when num_layers is 1
+    for i in range(num_layers):
+      # One transformed weight tensor each layer.
+      prefix = "multi_rnn_cell/cell_%d/cudnn_compatible_lstm_cell" % i
+      w_names.append(prefix + "/kernel")
+      # Three transformed bias tensors each layer:
+      # the 1st is for CudnnCompatibleLSTM(Block)Cell restore; the latter two
+      # sum up to the 1st, and are used for cuDNN restore.
+      b_names.append(prefix + "/bias")
+      b_names.extend([prefix + "/bias_cudnn_%d" % j for j in range(2)])
+    return w_names, b_names
+
+  def _transformed_gru_canonical_names(self):
+    w_names, b_names = [], []
+    num_layers = self._cudnn_rnn.num_layers
+    # TODO(jamesqin): get rid of multi_rnn_cell when num_layers is 1
+    for i in range(num_layers):
+      prefix = "multi_rnn_cell/cell_%d/cudnn_compatible_gru_cell" % i
+      # 2 transformed weight tensor each layer.
+      w_names.append(prefix + "/gates/kernel")
+      w_names.append(prefix + "/candidate/input_projection/kernel")
+      w_names.append(prefix + "/candidate/hidden_projection/kernel")
+      # 5 transformed bias tensors each layer:
+      b_names.append(prefix + "/gates/bias")
+      b_names.append(prefix + "/candidate/input_projection/bias")
+      b_names.append(prefix + "/candidate/hidden_projection/bias")
+      b_names.extend([
+          "multi_rnn_cell/cell_%d/cudnn_compatible_gru_cell/bias_cudnn %d" % (i,
+                                                                              j)
+          for j in range(2)
+      ])
+    return w_names, b_names
+
+  def _transform_lstm_canonical(self, weights, biases):
+    """Create transformed canonical params.
+
+    Produce properly-shaped monolithic weight and bias tensors to share between
+    cuDNN and cudnn_compatible non-platform specific LSTM cells.
+    Args:
+      weights: a list of Tensors recovered from cuDNN params_to_canonical.
+      biases: a list of Tensors recovered from cuDNN params_to_canonical.
+    Returns:
+      Two lists of tensors, one for weight and bias each.
+      The weight list contains num_layers tensors and bias one contains 3 *
+      num_layers tensors. Both original and combined biases since cuDNN biases
+      are not restorable from the transformed version.
+    """
+    transformed_weights, transformed_biases = [], []
+    for i in range(self._cudnn_rnn.num_layers):
+      base_idx = i * 8
+      num_units = self._cudnn_rnn.num_units
+      input_size = self._cudnn_rnn.input_size if i == 0 else num_units
+      # cuDNN tensor shapes per time_step:
+      # input.shape:         [batch_size, input_size],
+      # input_weights.shape: [num_units, input_size] (first layer)
+      #                      [num_units, num_units]  (other layers)
+      # state_weights.shape: [num_units, num_units]
+      # biases.shape:        [num_units]
+      #
+      # General LSTM cells compute gate functions using:
+      #   [x, h_prev] * weights + biases
+      # Therefore for each layer, they expect
+      # weight.shape: [input_size + num_units, 4 * num_units] (first_layer)
+      #               [num_units + num_units, 4 * num_units]  (other layers)
+      # bias.shape:   [4 * num_units]
+
+      # Stitch weights together in this layer.
+      stitched_w = []
+      for j in range(4):
+        stitched_w.append(
+            array_ops.concat(
+                [
+                    array_ops.reshape(weights[base_idx + j],
+                                      [num_units, input_size]),
+                    array_ops.reshape(weights[base_idx + j + 4],
+                                      [num_units, num_units])
+                ],
+                axis=1))
+      # cuDNN weights are in ifco order, convert to icfo order.
+      self._switch_inner(stitched_w, 0)
+      transformed_weights.append(
+          array_ops.transpose(array_ops.concat(stitched_w, axis=0)))
+
+      # Stitch biases together in this layer.
+      # Convert to icfo order.
+      self._switch_inner(biases, base_idx)
+      self._switch_inner(biases, base_idx + 4)
+      # The bias for layer input.
+      b_in = array_ops.concat(biases[base_idx:base_idx + 4], axis=0)
+      # The bias for recurrent input.
+      b_rec = array_ops.concat(biases[base_idx + 4:base_idx + 8], axis=0)
+
+      transformed_biases.extend([b_in + b_rec, b_in, b_rec])
+    return transformed_weights, transformed_biases
+
+  def _transform_gru_canonical(self, weights, biases):
+    """Creates transformed gru canonical params.
+
+    Produce properly-formatted weight and bias tensors to share between
+    cuDNN and cudnn_compatible non-platform specific GRU cells.
+    Args:
+      weights: a list of Tensors recovered from cuDNN params_to_canonical.
+      biases: a list of Tensors recovered from cuDNN params_to_canonical.
+    Returns:
+      Two lists of tensors, one for weight and bias each.
+      weight list: 3 tensors each layer. One for reset and update gates, the
+        other two for candidate gate.
+      bias list: 5 tensors each layer. The 1st for reset_and_update gate,
+        the next 2 in line for candidate gate. The last 2 are original
+        tensors for reset_and_update gates stitched together, retained since
+        cuDNN biases are not restorable from the transformed version.
+    """
+    transformed_weights, transformed_biases = [], []
+    for i in range(self._cudnn_rnn.num_layers):
+      base_idx = i * 6
+      num_units = self._cudnn_rnn.num_units
+      input_size = self._cudnn_rnn.input_size if i == 0 else num_units
+      # cuDNN tensor shapes per time_step:
+      # input.shape:         [batch_size, input_size],
+      # input_weights.shape: [num_units, input_size] (first layer)
+      #                      [num_units, num_units]  (other layers)
+      # state_weights.shape: [num_units, num_units]
+      # biases.shape:        [num_units]
+      #
+      # cuDNN compatible GRU cell:
+      # reset and update gate:
+      #  [x, h_prev] * weights + biases
+      # new memory gate (same as cuDNN):
+      #  x * W_h + B_wh + r \dot (h * R_h + B_rh)
+      #
+      # Therefore for each layer, it expects:
+      # reset and update gate:
+      # weight.shape: [input_size + num_units, 2 * num_units] (first_layer)
+      #               [num_units + num_units, 2 * num_units]  (other layers)
+      # bias.shape:   [4 * num_units]
+      # new memory gate: same weights and biases as cuDNN GRU.
+
+      stitched_w = []
+      # Stitch together weights for reset and update gate.
+      for j in range(2):
+        stitched_w.append(
+            array_ops.concat(
+                [
+                    array_ops.reshape(weights[base_idx + j],
+                                      [num_units, input_size]),
+                    array_ops.reshape(weights[base_idx + j + 3],
+                                      [num_units, num_units])
+                ],
+                axis=1))
+      transformed_weights.append(
+          array_ops.transpose(array_ops.concat(stitched_w[:2], axis=0)))
+      # weights for new memory gate are kept separate.
+      transformed_weights.append(
+          _transpose_reshape(weights[base_idx + 2], [num_units, input_size]))
+      transformed_weights.append(
+          _transpose_reshape(weights[base_idx + 5], [num_units, num_units]))
+
+      # Bias for reset and update gates.
+      b_r = array_ops.concat(biases[base_idx:base_idx + 2], axis=0)
+      b_u = array_ops.concat(biases[base_idx + 3:base_idx + 5], axis=0)
+      # Biases for new memory gate.
+      b_c = biases[base_idx + 2]
+      b_h = biases[base_idx + 5]
+
+      transformed_biases.extend([b_r + b_u, b_c, b_h, b_r, b_u])
+    return transformed_weights, transformed_biases
+
+  def _untransform_canonical(self, weights, biases):
+    if self._cudnn_rnn.direction != CUDNN_RNN_UNIDIRECTION:
+      return weights, biases
+    elif self._cudnn_rnn.rnn_mode == CUDNN_LSTM:
+      return self._untransform_lstm_canonical(weights, biases)
+    elif self._cudnn_rnn.rnn_mode == CUDNN_GRU:
+      return self._untransform_gru_canonical(weights, biases)
+    return weights, biases
+
+  def _untransform_lstm_canonical(self, transformed_weights,
+                                  transformed_biases):
+    """The reverse procedure of _transform_lstm_canonical().
+
+    Args:
+      transformed_weights: a list of tensors, one for each layer.
+      transformed_biases: a list of tensors , 3 for each layer: the 2nd for
+        layer input, the 3rd for recurrent input, the 1st is the sum of the
+        latter two.
+    Returns:
+      Two lists of tensors for weights and biases respectively.
+      There are 8 tensors per weight and per bias for each layer:
+      tensor 0-3 are applied to the input from the previous layer;
+      tensor 4-7 to the recurrent input. Tensor 0 and 4 are for the input gate;
+      tensor 1 and 5 the forget gate; tensor 2 and 6 the new memory gate;
+      tensor 3 and 7 the output gate.
+    """
+    weights, biases = [], []
+    assert 3 * len(transformed_weights) == len(transformed_biases)
+    for i in range(len(transformed_weights)):
+      num_units = self._cudnn_rnn.num_units
+      input_size = self._cudnn_rnn.input_size if i == 0 else num_units
+      # weights applied on layer inputs.
+      wi = array_ops.slice(transformed_weights[i], [0, 0],
+                           [input_size, 4 * num_units])
+      # weights applied on recurrent inputs.
+      wr = array_ops.slice(transformed_weights[i], [input_size, 0],
+                           [num_units, 4 * num_units])
+      wi_list = array_ops.split(wi, 4, axis=1)
+      wr_list = array_ops.split(wr, 4, axis=1)
+
+      for j in range(len(wi_list)):
+        wi_list[j] = array_ops.reshape(array_ops.transpose(wi_list[j]), [-1])
+        wr_list[j] = array_ops.reshape(array_ops.transpose(wr_list[j]), [-1])
+      # canonical weights are in icfo order, convert to ifco order for cuDNN.
+      self._switch_inner(wi_list, 0)
+      self._switch_inner(wr_list, 0)
+      weights.extend(wi_list)
+      weights.extend(wr_list)
+
+      base_idx = 3 * i
+      bi_list = array_ops.split(transformed_biases[base_idx + 1], 4, axis=0)
+      br_list = array_ops.split(transformed_biases[base_idx + 2], 4, axis=0)
+      # canonical weights are in icfo order, convert to ifco order for cuDNN.
+      self._switch_inner(bi_list, 0)
+      self._switch_inner(br_list, 0)
+      biases.extend(bi_list)
+      biases.extend(br_list)
+    return weights, biases
+
+  def _untransform_gru_canonical(self, transformed_weights, transformed_biases):
+    """The reverse procedure of _fuse_gru_canonical().
+
+    Args:
+      transformed_weights: a list of tensors, 3 for each layer. The 1st for
+        reset and update gates; the 2nd and 3rd for the new memory gate.
+      transformed_biases: 5 tensors each layer. The first for reset_and_update
+        gate; the next two in line for candidate gate. The last 2 are original
+        tensors for reset_and_update gates, retained since cuDNN biases are not
+        restorable from the fused version.
+
+    Returns:
+      Two lists of tensors for weights and biases respectively.
+      There are 6 tensors per weight and per bias for each layer:
+      tensor 0-2 are applied to the input from the previous layer and
+      tensor 3-5 to the recurrent input. Tensor 0 and 3 are for the reset gate;
+      tensor 1 and 4 the update gate; tensor 2 and 5 the new memory gate.
+    """
+    weights, biases = [], []
+    assert 5 * len(transformed_weights) == len(transformed_biases) * 3
+    for i in range(len(transformed_weights) // 3):
+      base_idx = 3 * i
+      num_units = self._cudnn_rnn.num_units
+      input_size = self._cudnn_rnn.input_size if i == 0 else num_units
+      # reset and update gate weights applied on layer inputs.
+      w_i = array_ops.slice(transformed_weights[base_idx], [0, 0],
+                            [input_size, 2 * num_units])
+      # reset and update gate weights applied on recurrent inputs.
+      w_r = array_ops.slice(transformed_weights[base_idx], [input_size, 0],
+                            [num_units, 2 * num_units])
+      wi_list = array_ops.split(w_i, 2, axis=1)
+      wr_list = array_ops.split(w_r, 2, axis=1)
+
+      wi_list = [_flatten_transpose(w) for w in wi_list]
+      wr_list = [_flatten_transpose(w) for w in wr_list]
+
+      # candidate gate weights
+      ih, hh = [
+          _flatten_transpose(w)
+          for w in transformed_weights[base_idx + 1:base_idx + 3]
+      ]
+      weights.extend(wi_list)
+      weights.append(ih)
+      weights.extend(wr_list)
+      weights.append(hh)
+
+      base_idx = 5 * i
+      # Recover biases for reset and update gates.
+      bi_list = array_ops.split(transformed_biases[base_idx + 3], 2, axis=0)
+      br_list = array_ops.split(transformed_biases[base_idx + 4], 2, axis=0)
+      biases.extend(bi_list)
+      biases.append(transformed_biases[base_idx + 1])
+      biases.extend(br_list)
+      biases.append(transformed_biases[base_idx + 2])
+    return weights, biases
+
 
 _cudnn_rnn_common_doc_string = """
   Cudnn RNN has an opaque parameter buffer that can be used for inference and
@@ -155,13 +514,15 @@ class _CudnnRNN(object):
   """
   __doc__ += _cudnn_rnn_common_doc_string
 
+  # TODO(jamesqin): support float16 CuDNN RNN
   def __init__(self,
                rnn_mode,
                num_layers,
                num_units,
                input_size,
                input_mode="linear_input",
-               direction="unidirectional",
+               direction=CUDNN_RNN_UNIDIRECTION,
+               dtype=dtypes.float32,
                dropout=0.,
                seed=0):
     """Creates a CudnnRNN model from model spec.
@@ -183,6 +544,7 @@ class _CudnnRNN(object):
           otherwise, it implies 'linear_input'.
       direction: the direction model that the model operates. Could be either
           'unidirectional' or 'bidirectional'
+      dtype: dtype of params, tf.float32 or tf.float64.
       dropout: whether to enable dropout. With it is 0, dropout is disabled.
       seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
           for behavior.
@@ -193,12 +555,33 @@ class _CudnnRNN(object):
     self._rnn_mode = rnn_mode
     self._input_mode = input_mode
     self._direction = direction
+    self._dtype = dtype
     self._dropout = dropout
     # get graph and op seed.
     self._seed, self._seed2 = random_seed.get_seed(seed)
     if self._seed is None and self._seed2 is None:
       self._seed, self._seed2 = 0, 0
 
+  @property
+  def input_size(self):
+    return self._input_size
+
+  @property
+  def num_units(self):
+    return self._num_units
+
+  @property
+  def num_layers(self):
+    return self._num_layers
+
+  @property
+  def rnn_mode(self):
+    return self._rnn_mode
+
+  @property
+  def direction(self):
+    return self._direction
+
   def params_size(self):
     """Calculates the size of the opaque parameter buffer needed for this model.
 
@@ -209,7 +592,7 @@ class _CudnnRNN(object):
         num_layers=self._num_layers,
         num_units=self._num_units,
         input_size=self._input_size,
-        T=dtypes.float32,
+        T=self._dtype,
         S=dtypes.int32,
         dropout=self._dropout,
         seed=self._seed,
@@ -222,20 +605,22 @@ class _CudnnRNN(object):
     """Runs the forward step for the RNN model.
 
     Args:
-      input_data: the input sequence to the RNN model.
-      input_h: the initial hidden state for h.
+      input_data: the input sequence to the RNN model. A Tensor of shape [?,
+        batch_size, input_size].
+      input_h: the initial hidden state for h. A Tensor of shape [num_layers,
+        batch_size, num_units].
       input_c: the initial hidden state for c. This is only relevant for LSTM.
+        A Tensor of the same shape as input_h.
       params: the parameter buffer created for this model.
       is_training: whether this operation will be used in training or inference.
-
     Returns:
       output: the output sequuence.
       output_h: the final state for h.
       output_c: the final state for c. This is only relevant for LSTM.
     """
-    if self._rnn_mode != "lstm":
+    if self._rnn_mode != CUDNN_LSTM:
       # For model that doesn't take input_c, replace with a dummy tensor.
-      input_c = array_ops.constant([], dtype=dtypes.float32)
+      input_c = array_ops.constant([], dtype=self._dtype)
     output, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
         input=input_data,
         input_h=input_h,
@@ -308,8 +693,9 @@ class CudnnLSTM(_CudnnRNN):
                num_layers,
                num_units,
                input_size,
-               input_mode="auto_select",
-               direction="unidirectional",
+               input_mode="linear_input",
+               direction=CUDNN_RNN_UNIDIRECTION,
+               dtype=dtypes.float32,
                dropout=0.,
                seed=0):
     """Creates a Cudnn LSTM model from model spec.
@@ -327,16 +713,18 @@ class CudnnLSTM(_CudnnRNN):
           otherwise, it implies 'linear_input'.
       direction: the direction model that the model operates. Could be either
           'unidirectional' or 'bidirectional'
+      dtype: dtype of params, tf.float32 or tf.float64.
       dropout: whether to enable dropout. With it is 0, dropout is disabled.
       seed: the seed used for initializing dropout.
     """
     super(CudnnLSTM, self).__init__(
-        "lstm",
+        CUDNN_LSTM,
         num_layers,
         num_units,
         input_size,
         input_mode=input_mode,
         direction=direction,
+        dtype=dtype,
         dropout=dropout,
         seed=seed)
 
@@ -344,12 +732,14 @@ class CudnnLSTM(_CudnnRNN):
     """Runs the forward step for the Cudnn LSTM model.
 
     Args:
-      input_data: the input sequence to the LSTM model.
-      input_h: the initial hidden state for h.
-      input_c: the initial hidden state for c.
+      input_data: the input sequence to the LSTM model. A Tensor of shape [?,
+        batch_size, input_size].
+      input_h: the initial hidden state for h. A Tensor of shape [num_layers,
+        batch_size, num_units].
+      input_c: the initial hidden state for c. A Tensor of the same shape as
+        input_h.
       params: the parameter buffer created for this model.
       is_training: whether this operation will be used in training or inference.
-
     Returns:
       output: the output sequuence.
       output_h: the final state for h.
@@ -368,8 +758,9 @@ class _CudnnRNNNoInputC(_CudnnRNN):
                num_layers,
                num_units,
                input_size,
-               input_mode="auto_select",
-               direction="unidirectional",
+               input_mode="linear_input",
+               direction=CUDNN_RNN_UNIDIRECTION,
+               dtype=dtypes.float32,
                dropout=0.,
                seed=0):
     """Creates a Cudnn RNN model from model without hidden-state C.
@@ -387,9 +778,17 @@ class _CudnnRNNNoInputC(_CudnnRNN):
           otherwise, it implies 'linear_input'.
       direction: the direction model that the model operates. Could be either
           'unidirectional' or 'bidirectional'
+      dtype: dtype of params, tf.float32 or tf.float64.
       dropout: whether to enable dropout. With it is 0, dropout is disabled.
       seed: the seed used for initializing dropout.
+
+    Raises:
+      ValueError: if direction is not 'unidirectional' or 'bidirectional'.
     """
+
+    if direction not in (CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION):
+      raise ValueError("Invalid direction: %s", direction)
+
     super(_CudnnRNNNoInputC, self).__init__(
         self._rnn_mode,
         num_layers,
@@ -397,6 +796,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
         input_size,
         input_mode=input_mode,
         direction=direction,
+        dtype=dtype,
         dropout=dropout,
         seed=seed)
 
@@ -404,11 +804,12 @@ class _CudnnRNNNoInputC(_CudnnRNN):
     """Runs the forward step for the Cudnn LSTM model.
 
     Args:
-      input_data: the input sequence to the LSTM model.
-      input_h: the initial hidden state for h.
+      input_data: the input sequence to the RNN model. A Tensor of shape [?,
+        batch_size, input_size].
+      input_h: the initial hidden state for h. A Tensor of shape [num_layers,
+        batch_size, num_units].
       params: the parameter buffer created for this model.
       is_training: whether this operation will be used in training or inference.
-
     Returns:
       output: the output sequuence.
       output_h: the final state for h.
@@ -421,7 +822,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
 class CudnnGRU(_CudnnRNNNoInputC):
   """Cudnn implementation of the GRU model."""
   __doc__ += _cudnn_rnn_common_doc_string
-  _rnn_mode = "gru"
+  _rnn_mode = CUDNN_GRU
   # 3 sets of weight and bias parameters for the recurrent input, and 3 for the
   # previous layer input.
   _NUM_PARAMS_PER_LAYER = 6
@@ -430,7 +831,7 @@ class CudnnGRU(_CudnnRNNNoInputC):
 class CudnnRNNTanh(_CudnnRNNNoInputC):
   """Cudnn implementation of the RNN-tanh model."""
   __doc__ += _cudnn_rnn_common_doc_string
-  _rnn_mode = "rnn_tanh"
+  _rnn_mode = CUDNN_RNN_TANH
   # 1 set of weight and bias parameters for the recurrent input, and 1 for the
   # previous layer input.
   _NUM_PARAMS_PER_LAYER = 2
@@ -439,12 +840,122 @@ class CudnnRNNTanh(_CudnnRNNNoInputC):
 class CudnnRNNRelu(_CudnnRNNNoInputC):
   """Cudnn implementation of the RNN-relu model."""
   __doc__ += _cudnn_rnn_common_doc_string
-  _rnn_mode = "rnn_relu"
+  _rnn_mode = CUDNN_RNN_RELU
   # 1 set of weight and bias parameters for the recurrent input, and 1 for the
   # previous layer input.
   _NUM_PARAMS_PER_LAYER = 2
 
 
+class CudnnCompatibleLSTMBlockCell(lstm_ops.LSTMBlockCell):
+  """Cudnn Compatible LSTMBlockCell.
+
+  A simple wrapper around @{tf.contrib.rnn.LSTMBlockCell} to use along with
+  @{tf.contrib.cudnn_rnn.CudnnLSTM}. The latter's params can be used by the
+  this cell seamlessly. It is the more performant than
+  @{tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell}, the same way
+  @{tf.contrib.rnn.LSTMBlockCell} can be more performant than
+  @{tf.nn.rnn_cell.LSTMCell}.
+  """
+
+  def __init__(self, num_units):
+    super(CudnnCompatibleLSTMBlockCell, self).__init__(
+        num_units, forget_bias=0, clip_cell=False, use_peephole=False)
+    self._names.update({"scope": "cudnn_compatible_lstm_cell"})
+
+
+class CudnnCompatibleLSTMCell(rnn_cell_impl.LSTMCell):
+  """Cudnn Compatible LSTMCell.
+
+  A simple wrapper around @{tf.nn.rnn_cell.LSTMCell} to use along with
+  @{tf.contrib.cudnn_rnn.CudnnLSTM}. The latter's params can be used by the
+  former seamlessly.
+  """
+
+  def __init__(self, num_units, reuse=None):
+    super(CudnnCompatibleLSTMCell, self).__init__(
+        num_units,
+        use_peepholes=False,
+        cell_clip=None,
+        num_proj=None,
+        proj_clip=None,
+        state_is_tuple=True,
+        activation=None,
+        reuse=reuse,
+        forget_bias=0)
+
+
+class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell):
+  """Cudnn Compatible GRUCell.
+
+  A GRU impl akin to @{tf.nn.rnn_cell.GRUCell} to use along with
+  @{tf.contrib.cudnn_rnn.CudnnGRU}. The latter's params can be used by the
+  it seamlessly.
+
+  It differs from non-cudnn-compatible GRUs in how the new memory gate is
+  calculated. Nvidia picks this variant based on GRU author's[1] suggestion and
+  the fact it has no accuracy impact[2].
+  [1] https://arxiv.org/abs/1406.1078
+  [2] http://svail.github.io/diff_graphs/
+
+  cuDNN compatible GRU (from cuDNN library user guide):
+  ```python
+  r_t = sigma(x_t * W_r + h_t-1 * R_h + b_Wr + b_Rr)  # reset gate
+  i_t = sigma(x_t * W_i + h_t-1 * R_i + b_Wi + b_Ru)  # update gate
+  h'_t = tanh(x_t * W_h + r_t .* (h_t-1 * R_h + b_Rh) + b_Wh)  # new memory gate
+  h_t = (1 - i_t) .* h'_t + i_t .* h_t-1
+  ```
+
+  Other GRU (see @{tf.nn.rnn_cell.GRUCell} and @{tf.contrib.rnn.GRUBlockCell}):
+  ```python
+  h'_t = tanh(x_t * W_h + (r_t .* h_t-1) * R_h + b_Wh)  # new memory gate
+  ```
+
+  Note: in addition to the extra bias term b_Rh,
+  ```python
+  r .* (h * R) != (r .* h) * R
+  ```
+
+  TODO(jamesqin): change the impl to mirror the canonical version, since cuDNN
+  will do the same after v7.1.
+  """
+
+  def __init__(self, num_units, reuse=None, kernel_initializer=None):
+    super(CudnnCompatibleGRUCell, self).__init__(
+        num_units,
+        activation=None,
+        reuse=reuse,
+        kernel_initializer=kernel_initializer)
+
+  def call(self, inputs, state):
+    """Gated recurrent unit (GRU) with nunits cells."""
+    with vs.variable_scope("gates"):  # Reset gate and update gate.
+      # We start with bias of 1.0 to not reset and not update.
+      bias_ones = self._bias_initializer
+      if self._bias_initializer is None:
+        dtype = inputs.dtype
+        bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
+      # pylint: disable=protected-access
+      value = math_ops.sigmoid(
+          rnn_cell_impl._linear([inputs, state], 2 * self._num_units, True,
+                                bias_ones, self._kernel_initializer))
+      r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
+      # pylint: enable=protected-access
+    with vs.variable_scope("candidate"):
+      # pylint: disable=protected-access
+      with vs.variable_scope("input_projection"):
+        hi = rnn_cell_impl._linear(inputs, self._num_units, True,
+                                   self._bias_initializer,
+                                   self._kernel_initializer)
+      with vs.variable_scope("hidden_projection"):
+        hh = r * (rnn_cell_impl._linear(state, self._num_units, True,
+                                        self._bias_initializer,
+                                        self._kernel_initializer))
+      # pylint: enable=protected-access
+      c = self._activation(hi + hh)
+    new_h = u * state + (1 - u) * c
+    return new_h, new_h
+
+
 @ops.RegisterGradient("CudnnRNN")
 def _cudnn_rnn_backward(op, *grad):
   if not op.get_attr("is_training"):
diff --git a/tensorflow/contrib/data/README.md b/tensorflow/contrib/data/README.md
index 9505f5c4653e23e89c17935425b15d4ec9522667..7c59a1ffc37085f17f8f4e693c0bc874c77f914a 100644
--- a/tensorflow/contrib/data/README.md
+++ b/tensorflow/contrib/data/README.md
@@ -1,631 +1,8 @@
-# Using the `Dataset` API for TensorFlow Input Pipelines
+`tf.contrib.data` API
+=====================
 
-The `Dataset` API is designed to let you build complex input pipelines from
-simple, reusable pieces. For example, the pipeline for an image model might
-aggregate data from files in a distributed file system, apply random
-perturbations to each image, and merge randomly selected images into a batch
-for training. The pipeline for a text model might involve extracting symbols
-from raw text data, converting them to embedding identifiers with a lookup
-table, and batching together sequences of different lengths. The `Dataset` API
-makes it easy to deal with large amounts of data, different data formats, and
-complicated transformations.
+This directory contains the Python API for the `tf.contrib.data.Dataset` and
+`tf.contrib.data.Iterator` classes, which can be used to build input pipelines.
 
-The `Dataset` API introduces two new abstractions to TensorFlow:
-
-* A `tf.contrib.data.Dataset` represents a sequence of elements, in which
-  each element contains one or more `Tensor` objects. For example, in an image
-  pipeline, an element might be a single training example, with a pair of
-  tensors representing the image data and a label. A `Dataset` can either be a
-  *source* (e.g. `Dataset.from_tensor_slices()` constructs a dataset from one
-  or more `tf.Tensor` objects), or a *transformation* (e.g. `Dataset.batch()`
-  constructs a dataset by stacking consecutive elements of another dataset into
-  a single element).
-
-* A `tf.contrib.data.Iterator` provides the main way to extract elements from a
-  dataset. The `Iterator.get_next()` operation yields the next element of a
-  `Dataset`, and typically acts as the interface between input pipeline code and
-  your model. The simplest iterator is a "one-shot iterator", which is
-  associated with a particular `Dataset` and iterates through it once. For more
-  sophisticated uses, the `Iterator.initializer` operation enables you to
-  reinitialize and parameterize an iterator with different datasets, so that
-  you can, for example, iterate over training and validation data multiple times
-  in the same program.
-
-## Tutorial
-
-This programmers' guide includes step-by-step instructions for a variety of
-input data use cases. Also see the `Dataset` and `Iterator` class references
-for more detailed information about the API.
-
-### Basic mechanics
-
-This section of the guide describes the fundamentals of creating different kinds
-of `Dataset` and `Iterator` objects, and how to extract data from them.
-
-#### Defining a source dataset
-
-You can build a `Dataset` using one of the following *source* dataset
-constructors:
-
-* From in-memory data:
-  * `tf.contrib.data.Dataset.from_tensors()`
-  * `tf.contrib.data.Dataset.from_tensor_slices()`
-
-* From on-disk data:
-  * `tf.contrib.data.FixedLengthRecordDataset()`
-  * `tf.contrib.data.TextLineDataset()`
-  * `tf.contrib.data.TFRecordDataset()`
-
-* From parameters:
-  * `tf.contrib.data.Dataset.range()`
-
-#### Transforming a dataset
-
-The `tf.contrib.data.Dataset` class has many methods that can be chained
-together to *transform* one dataset into another:
-
-* Per-element transformations:
-  * `Dataset.filter()`
-  * `Dataset.flat_map()`
-  * `Dataset.map()`
-  * `Dataset.zip()`
-
-* Multi-element transformations:
-  * `Dataset.batch()`
-  * `Dataset.dense_to_sparse_batch()`
-  * `Dataset.group_by_window()`
-  * `Dataset.padded_batch()`
-  * `Dataset.repeat()`
-  * `Dataset.shuffle()`
-  * `Dataset.skip()`
-  * `Dataset.take()`
-
-The following sections contain examples of how to use these transformations to
-solve common problems.
-
-#### Dataset structure
-
-A dataset comprises elements that each have the same structure. An element
-contains one or more `tf.Tensor` objects, called *components*. Each component
-has a `tf.DType` representing the type of elements in the tensor, and a
-`tf.TensorShape` representing the (possibly partially specified) static shape of
-each element. The `Dataset.output_types` and `Dataset.output_shapes` properties
-allow you to inspect the inferred types and shapes of each component of a
-dataset element. The *nested structure* of these properties map to the structure
-of an element, which may be a single tensor, a tuple of tensors, or a nested
-tuple of tensors. For example:
-
-```python
-dataset1 = tf.contrib.data.Dataset.from_tensor_slices(tf.random_uniform([4, 10]))
-print(dataset1.output_types)  # ==> "tf.float32"
-print(dataset1.output_shapes)  # ==> "(10,)"
-
-dataset2 = tf.contrib.data.Dataset.from_tensor_slices(
-   (tf.random_uniform([4]),
-    tf.random_uniform([4, 100], maxval=100, dtype=tf.int32)))
-print(dataset2.output_types)  # ==> "(tf.float32, tf.int32)"
-print(dataset2.output_shapes)  # ==> "((), (100,))"
-
-dataset3 = tf.contrib.data.Dataset.zip((dataset1, dataset2))
-print(dataset3.output_types)  # ==> (tf.float32, (tf.float32, tf.int32))
-print(dataset3.output_shapes)  # ==> "(10, ((), (100,)))"
-```
-
-The `Dataset` transformations support datasets of any structure. When using the
-`Dataset.map()`, `Dataset.flat_map()` and `Dataset.filter()` transformations,
-which apply a function to each element, the element structure determines the
-arguments of the function:
-
-```python
-dataset1 = dataset1.map(lambda x: ...)
-
-dataset2 = dataset2.flat_map(lambda x, y: ...)
-
-# *N.B.* Lambda argument destructuring is not available in Python 3.
-dataset3 = dataset3.filter(lambda x, (y, z): ...)
-```
-
-#### Creating an iterator
-
-One you have built a `Dataset` to represent your input data, the next step is to
-create an `Iterator` to access elements from that dataset.  The `Dataset` API
-currently supports three kinds of iterator, in increasing level of
-sophistication:
-
-A *one-shot* iterator is the simplest form of iterator, which only supports
-iterating once through a dataset, with no need for explicit initialization.
-One-shot iterators handle almost all of the cases that the existing queue-based
-input pipelines support, but they do not support parameterization. Using the
-example of `Dataset.range()`:
-
-```python
-dataset = tf.contrib.data.Dataset.range(100)
-iterator = dataset.make_one_shot_iterator()
-next_element = iterator.get_next()
-
-for i in range(100):
-  value = sess.run(next_element)
-  assert i == value
-```
-
-An *initializable* iterator requires you to run an explicit
-`iterator.initializer` operation before using it. In exchange for this
-inconvenience, it enables you to *parameterize* the definition of the dataset,
-using one or more `tf.placeholder()` tensors that can be fed when you
-initialize the iterator. Continuing the `Dataset.range()` example:
-
-```python
-max_value = tf.placeholder(tf.int64, shape=[])
-dataset = tf.contrib.data.Dataset.range(max_value)
-iterator = dataset.make_initializable_iterator()
-next_element = iterator.get_next()
-
-# Initialize an iterator over a dataset with 10 elements.
-sess.run(iterator.initializer, feed_dict={max_value: 10})
-for i in range(10):
-  value = sess.run(next_element)
-  assert i == value
-
-# Initialize the same iterator over a dataset with 100 elements.
-sess.run(iterator.initializer, feed_dict={max_value: 100})
-for i in range(100):
-  value = sess.run(next_element)
-  assert i == value
-```
-
-A *reinitializable* iterator can be initialized from multiple different
-`Dataset` objects. For example, you might have a training input pipeline that
-uses random perturbations to the input images to improve generalization, and
-a validation input pipeline that evaluates predictions on unmodified data. These
-pipelines will typically use different `Dataset` objects that have the same
-structure (i.e. the same types and compatible shapes for each component). 
-
-```python
-training_dataset = tf.contrib.data.Dataset.range(100).map(
-    lambda x: x + tf.random_uniform([], -10, 10, tf.int64))
-validation_dataset = tf.contrib.data.Dataset.range(50)
-
-# A reinitializable iterator is defined by its structure. We could use the
-# `output_types` and `output_shapes` properties of either `training_dataset`
-# or `validation_dataset` here, because they are compatible.
-iterator = Iterator.from_structure(training_dataset.output_types,
-                                   training_dataset.output_shapes)
-next_element = iterator.get_next()
-
-training_init_op = iterator.make_initializer(training_dataset)
-validation_init_op = iterator.make_initializer(validation_dataset)
-
-# Run 20 epochs in which the training dataset is traversed, followed by the
-# validation dataset.
-for _ in range(20):
-  # Initialize an iterator over the training dataset.
-  sess.run(training_init_op)
-  for _ in range(100):
-    sess.run(next_element)
-
-  # Initialize an iterator over the validation dataset.
-  sess.run(validation_init_op)
-  for _ in range(50):
-    sess.run(next_element)
-```
-
-#### Consuming values from an iterator
-
-The `Iterator.get_next()` method returns one or more `tf.Tensor` objects that
-correspond to the symbolic next element of an iterator. Each time these tensors
-are evaluated, they take the value of the next element in the underlying
-dataset. (Note that, like other stateful objects in TensorFlow, calling
-`Iterator.get_next()` does not immediately advance the iterator. Instead you
-must use the returned `tf.Tensor` objects in a TensorFlow expression, and pass
-the result of that expression to `tf.Session.run()` to get the next elements and
-advance the iterator.)
-
-If the iterator reaches the end of the dataset, executing
-the `Iterator.get_next()` operation will raise a `tf.errors.OutOfRangeError`.
-After this point the iterator will be in an unusable state, and you must
-initialize it again if you want to use it further.
-
-```python
-dataset = tf.contrib.data.Dataset.range(5)
-iterator = dataset.make_initializable_iterator()
-next_element = iterator.get_next()
-
-# Typically `result` will be the output of a model, or an optimizer's
-# training operation.
-result = tf.add(next_element, next_element)
-
-sess.run(iterator.initializer)
-print(sess.run(result))  # ==> "0"
-print(sess.run(result))  # ==> "2"
-print(sess.run(result))  # ==> "4"
-print(sess.run(result))  # ==> "6"
-print(sess.run(result))  # ==> "8"
-try:
-  sess.run(result)
-except tf.errors.OutOfRangeError:
-  print("End of dataset")  # ==> "End of dataset"
-```
-
-A common pattern is to wrap the "training loop" in a `try`-`except` block:
-
-```python
-sess.run(iterator.initializer)
-while True:
-  try:
-    sess.run(result)
-  except tf.errors.OutOfRangeError:
-    break
-```
-
-If each element of the dataset has a nested structure, the return value of
-`Iterator.get_next()` will be one or more `tf.Tensor` objects in the same
-nested structure:
-
-```python
-dataset1 = tf.contrib.data.Dataset.from_tensor_slices(tf.random_uniform([4, 10]))
-dataset2 = tf.contrib.data.Dataset.from_tensor_slices((tf.random_uniform([4]), tf.random_uniform([4, 100])))
-dataset3 = tf.contrib.data.Dataset.zip((dataset1, dataset2))
-
-iterator = dataset3.make_initializable_iterator()
-
-sess.run(iterator.initializer)
-next1, (next2, next3) = iterator.get_next()
-```
-
-Note that evaluating *any* of `next1`, `next2`, or `next3` will advance the
-iterator for all components. A typical consumer of an iterator will include all
-components in a single expression.
-
-### Reading input data
-
-#### Consuming NumPy arrays
-
-If all of your input data fit in memory, the simplest way to create a `Dataset`
-from them is to convert them to `tf.Tensor` objects and use
-`Dataset.from_tensor_slices()`.
-
-```python
-# Load the training data into two NumPy arrays, for example using `np.load()`.
-with np.load("/var/data/training_data.npy") as data:
-  features = data["features"]
-  labels = data["labels"]
-
-# Assume that each row of `features` corresponds to the same row as `labels`.
-assert features.shape[0] == labels.shape[0]
-
-dataset = tf.contrib.data.Dataset.from_tensor_slices((features, labels))
-```
-
-Note that the above code snippet will embed the `features` and `labels` arrays
-in your TensorFlow graph as constants. This works well for a small dataset, but
-wastes memory, and can run into the 2GB limit for the `tf.GraphDef` protocol
-buffer.
-
-As an alternative, you can define the `Dataset` in terms of `tf.placeholder()`
-tensors, and *feed* the NumPy arrays when you initialize an `Iterator` over the
-dataset.
-
-```python
-# Load the training data into two NumPy arrays, for example using `np.load()`.
-with np.load("/var/data/training_data.npy") as data:
-  features = data["features"]
-  labels = data["labels"]
-
-# Assume that each row of `features` corresponds to the same row as `labels`.
-assert features.shape[0] == labels.shape[0]
-
-features_placeholder = tf.placeholder(features.dtype, features.shape)
-labels_placeholder = tf.placeholder(labels.dtype, labels.shape)
-
-dataset = tf.contrib.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder))
-# [Other transformations on `dataset`...]
-dataset = ...
-iterator = dataset.make_initializable_iterator()
-
-sess.run(iterator.initializer, feed_dict={features_placeholder: features,
-                                          labels_placeholder: labels})
-```
-
-#### Consuming TFRecord data
-
-The `Dataset` API supports a variety of file formats so that you can process
-large datasets that do not fit in memory. The TFRecord file format is a
-simple record-oriented binary format that many TensorFlow applications use for
-training data. The `tf.contrib.data.TFRecordDataset` class enables you to
-stream over the contents of one or more TFRecord files as part of an input
-pipeline.
-
-```python
-# Creates a dataset that reads all of the examples from two files.
-filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
-dataset = tf.contrib.data.TFRecordDataset(filenames)
-```
-
-The `filenames` argument to the `TFRecordDataset` initializer can be a
-`tf.Tensor` of strings. Therefore if you have two sets of files for training
-and validation purposes, you can use a `tf.placeholder(tf.string)` to represent
-the filenames, and initialize an iterator from the appropriate filenames:
-
-```python
-filenames = tf.placeholder(tf.string, shape=[None])
-dataset = tf.contrib.data.TFRecordDataset(filenames)
-# [Other transformations on `dataset`...]
-dataset = ...
-iterator = dataset.make_initializable_iterator()
-
-# You can feed the initializer with the appropriate filenames for the current
-# phase of execution, e.g. training vs. validation.
-
-# Initialize `iterator` with training data.
-training_filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
-sess.run(iterator.initializer, feed_dict={filenames: training_filenames})
-
-# Initialize `iterator` with validation data.
-validation_filenames = ["/var/data/validation1.tfrecord", ...]
-sess.run(iterator.initializer, feed_dict={filenames: validation_filenames})
-```
-
-#### Consuming text data
-
-Many datasets are distributed as one or more text files. The
-`tf.contrib.data.TextLineDataset` provides an easy way to extract lines from
-one or more text files. Given one or more filenames, a `TextLineDataset` will
-produce one string-valued element per line of those files. Like a
-`TFRecordDataset`, `TextLineDataset` accepts `filenames` as a `tf.Tensor`, so
-you can parameterize it by passing a `tf.placeholder(tf.string)`.
-
-```python
-filenames = ["/var/data/file1.txt", "/var/data/file2.txt"]
-dataset = tf.contrib.data.TextLineDataset(filenames)
-```
-
-By default, a `TextLineDataset` yields *every* line of each file, which may
-not be desirable, for example if the file starts with a header line, or contains
-comments. These lines can be removed using the `Dataset.skip()` and
-`Dataset.filter()` transformations. To apply these transformations to each
-file separately, we use `Dataset.flat_map()` to create a nested `Dataset` for
-each file.
-
-```python
-filenames = ["/var/data/file1.txt", "/var/data/file2.txt"]
-
-dataset = tf.contrib.data.Dataset.from_tensor_slices(filenames)
-
-# Use `Dataset.flat_map()` to transform each file separately.
-# * Skip the first line (header row).
-# * Filter out lines beginning with "#" (comments).
-dataset = dataset.flat_map(
-    lambda filename: (
-        tf.contrib.data.Dataset.TextLineDataset(filename)
-        .skip(1)
-        .filter(lambda line: tf.not_equal(tf.substr(line, 0, 1), "#"))))
-```
-
-<!--
-TODO(mrry): Add these sections.
-
-#### Consuming from a Python generator
-#### Consuming from an index file and images
--->
-
-### Preprocessing data with `Dataset.map()`
-
-The `Dataset.map(f)` transformation produces a new dataset by applying a given
-function `f` to each element of the input dataset. It is based on
-the
-[`map()` function](https://en.wikipedia.org/wiki/Map_(higher-order_function))
-that is commonly applied to lists (and other structures) in functional
-programming languages.  The function `f` takes the `tf.Tensor` objects that
-represent a single element in the input, and returns the `tf.Tensor` objects
-that will represent a single element in the new dataset. Its implementation uses
-standard TensorFlow operations to transform one element into another.
-
-This section covers common examples of how to use `Dataset.map()`.
-
-#### Parsing `tf.Example` protocol buffer messages
-
-Many input pipelines extract `tf.train.Example` protocol buffer messages from a
-TFRecord-format file (written, for example, using
-`tf.python_io.TFRecordWriter`). Each `tf.train.Example` record contains one or
-more "features", and the input pipeline typically converts these features into
-tensors.
-
-```python
-# Transforms a scalar string `example_proto` into a pair of a scalar string and
-# a scalar integer, representing an image and its label, respectively.
-def _parse_function(example_proto):
-  features = {"image": tf.FixedLenFeature((), tf.string, default_value=""),
-              "label": tf.FixedLenFeature((), tf.int32, default_value=0)}
-  parsed_features = tf.parse_single_example(example_proto, features)
-  return parsed_features["image"], parsed_features["label"]
-
-# Creates a dataset that reads all of the examples from two files, and extracts
-# the image and label features.
-filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
-dataset = tf.contrib.data.TFRecordDataset(filenames)
-dataset = dataset.map(_parse_function)
-```
-
-#### Decoding image data and resizing it
-
-When training a neural network on real-world image data, it is often necessary
-to convert images of different sizes to a common size, so that they may be
-batched into a fixed size.
-
-```python
-# Reads an image from a file, decodes it into a dense tensor, and resizes it
-# to a fixed shape.
-def _parse_function(filename, label):
-  image_string = tf.read_file(filename)
-  image_decoded = tf.image.decode_image(image_string)
-  image_resized = tf.image.resize_images(image_decoded, [28, 28])
-  return image_resized, label
-
-filenames = tf.constant(["/var/data/image1.jpg", "/var/data/image2.jpg", ...])
-labels = tf.constant([0, 37, 29, 1, ...])
-
-dataset = tf.contrib.data.Dataset.from_tensor_slices((filenames, labels))
-dataset = dataset.map(_parse_function)
-```
-
-#### Applying arbitrary Python logic with `tf.py_func()`
-
-For performance reasons, we encourage you to use TensorFlow operations for
-preprocessing your data whenever possible. However, it is sometimes useful to
-be able to call upon external Python libraries when parsing your input data,
-and you can do this by invoking the `tf.py_func()` operation in a
-`Dataset.map()` transformation.
-
-```python
-import cv2
-
-# Use a custom OpenCV function to read the image, instead of the standard
-# TensorFlow `tf.read_file()` operation.
-def _read_py_function(filename, label):
-  image_decoded = cv2.imread(image_string, cv2.IMREAD_GRAYSCALE)
-  return image_decoded, label
-
-# Use standard TensorFlow operations to resize the image to a fixed shape.
-def _resize_function(image_decoded, label):
-  image_decoded.set_shape([None, None, None])
-  image_resized = tf.image.resize_images(image_decoded, [28, 28])
-  return image_resized, label
-
-filenames = ["/var/data/image1.jpg", "/var/data/image2.jpg", ...]
-labels = [0, 37, 29, 1, ...]
-
-dataset = tf.contrib.data.Dataset.from_tensor_slices((filenames, labels))
-dataset = dataset.map(
-    lambda filename, label: tf.py_func(
-        _read_py_function, [filename, label], [tf.uint8, label.dtype]))
-dataset = dataset.map(_resize_function)
-```
-
-<!--
-TODO(mrry): Add this section.
-
-#### Handling text data with unusual sizes
--->
-
-### Batching dataset elements
-
-#### Simple batching
-
-The simplest form of batching stacks `n` consecutive elements of a dataset into
-a single element. The `Dataset.batch()` transformation does exactly this, with
-the same constraints as the `tf.stack()` operator, applied to each component
-of the elements: i.e. for each component *i*, all elements must have a tensor
-of the exact same shape.
-
-```python
-inc_dataset = tf.contrib.data.Dataset.range(100)
-dec_dataset = tf.contrib.data.Dataset.range(0, -100, -1)
-dataset = tf.contrib.data.Dataset.zip((inc_dataset, dec_dataset))
-batched_dataset = dataset.batch(4)
-
-iterator = batched_dataset.make_one_shot_iterator()
-next_element = iterator.get_next()
-
-print(sess.run(next_element))  # ==> ([0, 1, 2,   3],   [ 0, -1,  -2,  -3])
-print(sess.run(next_element))  # ==> ([4, 5, 6,   7],   [-4, -5,  -6,  -7])
-print(sess.run(next_element))  # ==> ([8, 9, 10, 11],   [-8, -9, -10, -11])
-```
-
-#### Batching tensors with padding
-
-The above recipe works for tensors that all have the same size. However, many
-models (e.g. sequence models) work with input data that can have varying size
-(e.g. sequences of different lengths). To handle this case, the
-`Dataset.padded_batch()` transformation enables you to batch tensors of
-different shape by specifying one or more dimensions in which they may be
-padded.
-
-```python
-dataset = tf.contrib.data.Dataset.range(100)
-dataset = dataset.map(lambda x: tf.fill([tf.cast(x, tf.int32)], x))
-dataset = dataset.padded_batch(4, padded_shapes=[None])
-
-iterator = batched_dataset.make_one_shot_iterator()
-next_element = iterator.get_next()
-
-print(sess.run(next_element))  # ==> [[0, 0, 0], [1, 0, 0], [2, 2, 0], [3, 3, 3]]
-print(sess.run(next_element))  # ==> [[4, 4, 4, 4, 0, 0, 0],
-                               #      [5, 5, 5, 5, 5, 0, 0],
-                               #      [6, 6, 6, 6, 6, 6, 0],
-                               #      [7, 7, 7, 7, 7, 7, 7]]
-```
-
-The `Dataset.padded_batch()` transformation allows you to set different padding
-for each dimension of each component, and it may be variable-length (signified
-by `None` in the example above) or constant-length. It is also possible to
-override the padding value, which defaults to 0.
-
-<!--
-TODO(mrry): Add this section.
-
-#### Dense ragged -> tf.SparseTensor
--->
-
-### Training workflows
-
-#### Processing multiple epochs
-
-The `Dataset` API offers two main ways to process multiple epochs of the same
-data.
-
-The simplest way to iterate over a dataset in multiple epochs is to use the
-`Dataset.repeat()` transformation. For example, to create a dataset that repeats
-its input for 10 epochs:
-
-```python
-filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
-dataset = tf.contrib.data.TFRecordDataset(filenames)
-dataset = dataset.map(...)
-dataset = dataset.repeat(10)
-dataset = dataset.batch(32)
-```
-
-Applying the `Dataset.repeat()` transformation with no arguments will repeat
-the input indefinitely. The `Dataset.repeat()` transformation concatenates its
-arguments without signaling the end of one epoch and the beginning of the next
-epoch.
-
-If you want to receive a signal at the end of each epoch, you can write a
-training loop that catches the `tf.errors.OutOfRangeError` at the end of a
-dataset. At that point you might collect some statistics (e.g. the validation
-error) for the epoch.
-
-```python
-filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
-dataset = tf.contrib.data.TFRecordDataset(filenames)
-dataset = dataset.map(...)
-dataset = dataset.batch(32)
-iterator = dataset.make_initializable_iterator()
-next_element = iterator.get_next()
-
-# Compute for 100 epochs.
-for _ in range(100):
-  sess.run(iterator.initializer)
-  while True:
-    try:
-      sess.run(next_element)
-    except tf.errors.OutOfRangeError:
-      break
-
-  # [Perform end-of-epoch calculations here.]
-```
-
-#### Randomly shuffling input data
-
-The `Dataset.shuffle()` transformation randomly shuffles the input dataset
-using a similar algorithm to `tf.RandomShuffleQueue`: it maintains a fixed-size
-buffer and chooses the next element uniformly at random from that buffer.
-
-```python
-filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
-dataset = tf.contrib.data.TFRecordDataset(filenames)
-dataset = dataset.map(...)
-dataset = dataset.repeat()
-dataset = dataset.shuffle(buffer_size=10000)
-dataset = dataset.batch(32)
-```
+The documentation for this API has moved to the programmers'
+guide, [here](../../docs_src/programmers_guide/datasets.md).
diff --git a/tensorflow/contrib/data/python/framework/BUILD b/tensorflow/contrib/data/python/framework/BUILD
index 7b84825bb4fd8a07b63150122f84b4faab73a893..164aa598f6f7d3cdb0925a35f6ab082713d8a172 100644
--- a/tensorflow/contrib/data/python/framework/BUILD
+++ b/tensorflow/contrib/data/python/framework/BUILD
@@ -11,7 +11,11 @@ py_library(
     srcs = ["function.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:framework",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
     ],
 )
 
@@ -23,8 +27,10 @@ py_test(
     deps = [
         ":function",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
     ],
 )
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index ab4d80c3275d2cd491ab3a9fd936f0bd41d1b7f6..8e9f1527ef8628cb091ecb67665768007d6cae26 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -12,9 +12,19 @@ py_test(
     srcs = ["iterator_ops_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data",
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:training",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -24,14 +34,18 @@ py_test(
     srcs = ["batch_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data",
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -41,9 +55,17 @@ py_test(
     srcs = ["bucketing_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data",
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:string_ops",
+        "//tensorflow/python:tensor_shape",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -57,13 +79,15 @@ py_test(
         "nomac",  # b/62040583
     ],
     deps = [
-        "//tensorflow/contrib/data",
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -73,13 +97,13 @@ py_test(
     srcs = ["filter_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data",
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -89,13 +113,29 @@ py_test(
     srcs = ["flat_map_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data",
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "list_files_dataset_op_test",
+    size = "small",
+    srcs = ["list_files_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -105,16 +145,21 @@ py_test(
     srcs = ["map_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data",
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
+        "//tensorflow/python:io_ops",
         "//tensorflow/python:lookup_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_ops",
         "//tensorflow/python:string_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -124,12 +169,13 @@ py_test(
     srcs = ["range_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data",
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:tensor_shape",
     ],
 )
 
@@ -139,12 +185,17 @@ py_test(
     srcs = ["reader_dataset_ops_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data",
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -154,14 +205,13 @@ py_test(
     srcs = ["resample_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data",
-        "//tensorflow/python:array_ops",
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:string_ops",
+        "//tensorflow/python:util",
         "//tensorflow/python:variables",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -171,12 +221,12 @@ py_test(
     srcs = ["sequence_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data",
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -186,12 +236,13 @@ py_test(
     srcs = ["shuffle_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data",
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -201,12 +252,14 @@ py_test(
     srcs = ["cache_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data",
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -216,12 +269,27 @@ py_test(
     srcs = ["zip_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data",
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "concatenate_dataset_op_test",
+    size = "small",
+    srcs = ["concatenate_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/contrib/data/python/util:nest",
+        "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:tensor_shape",
+        "//third_party/py/numpy",
     ],
 )
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index 1f87f141872817b1b7af6bcec34448c2a9c882f6..496cdab4ba2848d7074c7316223979932eee8fe7 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -271,18 +271,64 @@ class BatchDatasetTest(test.TestCase):
                                    "larger than the row shape"):
         sess.run(get_next)
 
-  def testUnbatchDataset(self):
-    data = [math_ops.range(10) for _ in range(3)]
+  def testUnbatchScalarDataset(self):
+    data = tuple([math_ops.range(10) for _ in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
+    expected_types = (dtypes.int32,) * 3
     data = data.batch(2)
+    self.assertEqual(expected_types, data.output_types)
     data = data.unbatch()
+    self.assertEqual(expected_types, data.output_types)
 
-    iter = data.make_one_shot_iterator()
-    op = iter.get_next()
+    iterator = data.make_one_shot_iterator()
+    op = iterator.get_next()
 
     with self.test_session() as sess:
-      for i in range(3):
-        self.assertAllClose([range(10)], sess.run(op))
+      for i in range(10):
+        self.assertEqual((i,) * 3, sess.run(op))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(op)
+
+  def testUnbatchSingleElementTupleDataset(self):
+    data = tuple([(math_ops.range(10),) for _ in range(3)])
+    data = dataset_ops.Dataset.from_tensor_slices(data)
+    expected_types = ((dtypes.int32,),) * 3
+    data = data.batch(2)
+    self.assertEqual(expected_types, data.output_types)
+    data = data.unbatch()
+    self.assertEqual(expected_types, data.output_types)
+
+    iterator = data.make_one_shot_iterator()
+    op = iterator.get_next()
+
+    with self.test_session() as sess:
+      for i in range(10):
+        self.assertEqual(((i,),) * 3, sess.run(op))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(op)
+
+  def testUnbatchMultiElementTupleDataset(self):
+    data = tuple([(math_ops.range(10 * i, 10 * i + 10),
+                   array_ops.fill([10], "hi"))
+                  for i in range(3)])
+    data = dataset_ops.Dataset.from_tensor_slices(data)
+    expected_types = ((dtypes.int32, dtypes.string),) * 3
+    data = data.batch(2)
+    self.assertAllEqual(expected_types, data.output_types)
+    data = data.unbatch()
+    self.assertAllEqual(expected_types, data.output_types)
+
+    iterator = data.make_one_shot_iterator()
+    op = iterator.get_next()
+
+    with self.test_session() as sess:
+      for i in range(10):
+        self.assertEqual(((i, b"hi"),
+                          (10 + i, b"hi"),
+                          (20 + i, b"hi")),
+                         sess.run(op))
 
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(op)
diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
index 20d66d7f231d753436ec104dd0dbfbc469076bca..71df1ee0a501f16571017dd61e1635a8ae866d07 100644
--- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
@@ -31,7 +31,7 @@ from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
 
-class BucketingTest(test.TestCase):
+class GroupByWindowTest(test.TestCase):
 
   def testSimple(self):
     components = np.random.randint(100, size=(200,)).astype(np.int64)
@@ -257,16 +257,24 @@ class BucketTest(test.TestCase):
 
   def testEvenOddBucketsFilterOutAllOdd(self):
     def _map_fn(v):
-      return (v, array_ops.fill([v], v),
-              array_ops.fill([3], string_ops.as_string(v)))
+      return {"x": v,
+              "y": array_ops.fill([v], v),
+              "z": array_ops.fill([3], string_ops.as_string(v))}
+
+    def _dynamic_pad_fn(bucket, window, _):
+      return dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.from_tensors(bucket), window.padded_batch(
+              32, {"x": tensor_shape.TensorShape([]),
+                   "y": tensor_shape.TensorShape([None]),
+                   "z": tensor_shape.TensorShape([3])})))
 
     input_dataset = (
         dataset_ops.Dataset.from_tensor_slices(math_ops.range(128)).map(_map_fn)
-        .filter(lambda x, y, z: math_ops.equal(x % 2, 0)))
+        .filter(lambda d: math_ops.equal(d["x"] % 2, 0)))
 
     bucketed_dataset = input_dataset.group_by_window(
-        lambda x, y, z: math_ops.cast(x % 2, dtypes.int64),
-        lambda k, bucket: self._dynamicPad(k, bucket, 32), 32)
+        lambda d: math_ops.cast(d["x"] % 2, dtypes.int64),
+        lambda k, bucket: _dynamic_pad_fn(k, bucket, 32), 32)
 
     iterator = dataset_ops.Iterator.from_dataset(bucketed_dataset)
     init_op = iterator.initializer
@@ -283,9 +291,9 @@ class BucketTest(test.TestCase):
       self.assertAllEqual(0, which_bucket0)
       self.assertAllEqual(0, which_bucket1)
       self.assertAllEqual(
-          np.arange(0, 64, 2, dtype=np.int64), bucketed_values_even0[0])
+          np.arange(0, 64, 2, dtype=np.int64), bucketed_values_even0["x"])
       self.assertAllEqual(
-          np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1[0])
+          np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1["x"])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/data/python/kernel_tests/concatenate_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/concatenate_dataset_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3407a06a984db0e09bff032b95547b12a22ed0d1
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/concatenate_dataset_op_test.py
@@ -0,0 +1,134 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.contrib.data.python.util import nest
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.platform import test
+
+
+class ConcatenateDatasetTest(test.TestCase):
+
+  def testConcatenateDataset(self):
+    input_components = (
+        np.tile(np.array([[1], [2], [3], [4]]), 20),
+        np.tile(np.array([[12], [13], [14], [15]]), 15),
+        np.array([37.0, 38.0, 39.0, 40.0]))
+    to_concatenate_components = (
+        np.tile(np.array([[1], [2], [3], [4], [5]]), 20),
+        np.tile(np.array([[12], [13], [14], [15], [16]]), 15),
+        np.array([37.0, 38.0, 39.0, 40.0, 41.0]))
+
+    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
+    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
+        to_concatenate_components)
+    concatenated = input_dataset.concatenate(dataset_to_concatenate)
+    self.assertEqual(concatenated.output_shapes, (tensor_shape.TensorShape(
+        [20]), tensor_shape.TensorShape([15]), tensor_shape.TensorShape([])))
+
+    iterator = concatenated.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(9):
+        result = sess.run(get_next)
+        if i < 4:
+          for component, result_component in zip(input_components, result):
+            self.assertAllEqual(component[i], result_component)
+        else:
+          for component, result_component in zip(to_concatenate_components,
+                                                 result):
+            self.assertAllEqual(component[i - 4], result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testConcatenateDatasetDifferentShape(self):
+    input_components = (
+        np.tile(np.array([[1], [2], [3], [4]]), 20),
+        np.tile(np.array([[12], [13], [14], [15]]), 4))
+    to_concatenate_components = (
+        np.tile(np.array([[1], [2], [3], [4], [5]]), 20),
+        np.tile(np.array([[12], [13], [14], [15], [16]]), 15))
+
+    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
+    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
+        to_concatenate_components)
+    concatenated = input_dataset.concatenate(dataset_to_concatenate)
+    self.assertEqual(
+        [ts.as_list()
+         for ts in nest.flatten(concatenated.output_shapes)], [[20], [None]])
+
+    iterator = concatenated.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(9):
+        result = sess.run(get_next)
+        if i < 4:
+          for component, result_component in zip(input_components, result):
+            self.assertAllEqual(component[i], result_component)
+        else:
+          for component, result_component in zip(to_concatenate_components,
+                                                 result):
+            self.assertAllEqual(component[i - 4], result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testConcatenateDatasetDifferentStructure(self):
+    input_components = (
+        np.tile(np.array([[1], [2], [3], [4]]), 5),
+        np.tile(np.array([[12], [13], [14], [15]]), 4))
+    to_concatenate_components = (
+        np.tile(np.array([[1], [2], [3], [4], [5]]), 20),
+        np.tile(np.array([[12], [13], [14], [15], [16]]), 15),
+        np.array([37.0, 38.0, 39.0, 40.0, 41.0]))
+
+    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
+    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
+        to_concatenate_components)
+
+    with self.assertRaisesRegexp(ValueError,
+                                 "don't have the same number of elements"):
+      input_dataset.concatenate(dataset_to_concatenate)
+
+  def testConcatenateDatasetDifferentType(self):
+    input_components = (
+        np.tile(np.array([[1], [2], [3], [4]]), 5),
+        np.tile(np.array([[12], [13], [14], [15]]), 4))
+    to_concatenate_components = (
+        np.tile(np.array([[1.0], [2.0], [3.0], [4.0]]), 5),
+        np.tile(np.array([[12], [13], [14], [15]]), 15))
+
+    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
+    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
+        to_concatenate_components)
+
+    with self.assertRaisesRegexp(TypeError, "have different types"):
+      input_dataset.concatenate(dataset_to_concatenate)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
index 3ea783ad899cac2451f573edd154aba41f6a8a03..e6d50dc15476dce249375900f5ca1df4d1287389 100644
--- a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
@@ -72,6 +72,34 @@ class FilterDatasetTest(test.TestCase):
       # Test an empty dataset.
       do_test(0, 1)
 
+  def testFilterRange(self):
+    dataset = dataset_ops.Dataset.range(100).filter(
+        lambda x: math_ops.not_equal(math_ops.mod(x, 3), 2))
+    iterator = dataset.make_one_shot_iterator()
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      self.assertEqual(0, sess.run(get_next))
+      self.assertEqual(1, sess.run(get_next))
+      self.assertEqual(3, sess.run(get_next))
+
+  def testFilterDict(self):
+    iterator = (dataset_ops.Dataset.range(10)
+                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
+                .filter(lambda d: math_ops.equal(d["bar"] % 2, 0))
+                .map(lambda d: d["foo"] + d["bar"])
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        if (i ** 2) % 2 == 0:
+          self.assertEqual(i * 2 + i ** 2, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
index 3c9c714bde41a1165dd9c136c1e0a64a976df901..2a582ae6620ac8276d290c7b995588640e36929c 100644
--- a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
@@ -17,13 +17,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import itertools
 import random
 
 import numpy as np
 
 from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
@@ -101,8 +104,174 @@ class FlatMapDatasetTest(test.TestCase):
         with self.assertRaises(errors.OutOfRangeError):
           sess = random.choice([sess1, sess2])
           sess.run(get_next)
+
+  def testMapDict(self):
+    iterator = (dataset_ops.Dataset.range(10)
+                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
+                .flat_map(lambda d: dataset_ops.Dataset.from_tensors(d["foo"])
+                          .repeat(d["bar"]))
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        for _ in range(i ** 2):
+          self.assertEqual(i * 2, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
   # pylint: enable=g-long-lambda
 
 
+class InterleaveDatasetTest(test.TestCase):
+
+  def _interleave(self, lists, cycle_length, block_length):
+    num_open = 0
+
+    # `all_iterators` acts as a queue of iterators over each element of `lists`.
+    all_iterators = [iter(l) for l in lists]
+
+    # `open_iterators` are the iterators whose elements are currently being
+    # interleaved.
+    open_iterators = []
+    for i in range(cycle_length):
+      if all_iterators:
+        open_iterators.append(all_iterators.pop(0))
+        num_open += 1
+      else:
+        open_iterators.append(None)
+
+    while num_open or all_iterators:
+      for i in range(cycle_length):
+        if open_iterators[i] is None:
+          if all_iterators:
+            open_iterators[i] = all_iterators.pop(0)
+            num_open += 1
+          else:
+            continue
+        for _ in range(block_length):
+          try:
+            yield next(open_iterators[i])
+          except StopIteration:
+            open_iterators[i] = None
+            num_open -= 1
+            break
+
+  def testPythonImplementation(self):
+    input_lists = [[4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6],
+                   [4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6]]
+
+    # Cycle length 1 acts like `Dataset.flat_map()`.
+    expected_elements = itertools.chain(*input_lists)
+    for expected, produced in zip(
+        expected_elements, self._interleave(input_lists, 1, 1)):
+      self.assertEqual(expected, produced)
+
+    # Cycle length > 1.
+    expected_elements = [4, 5, 4, 5, 4, 5, 4,
+                         5, 5, 6, 6,  # NOTE(mrry): When we cycle back
+                                      # to a list and are already at
+                                      # the end of that list, we move
+                                      # on to the next element.
+                         4, 6, 4, 6, 4, 6, 4, 6, 5, 6, 5, 6, 5, 6, 5, 6, 5]
+    for expected, produced in zip(
+        expected_elements, self._interleave(input_lists, 2, 1)):
+      self.assertEqual(expected, produced)
+
+    # Cycle length > 1 and block length > 1.
+    expected_elements = [4, 4, 4, 5, 5, 5, 4, 5, 5, 6, 6, 6, 4, 4, 4, 6, 6, 6,
+                         4, 5, 5, 5, 6, 6, 6, 5, 5, 6, 6, 6]
+    for expected, produced in zip(
+        expected_elements, self._interleave(input_lists, 2, 3)):
+      self.assertEqual(expected, produced)
+
+    # Cycle length > len(input_values).
+    expected_elements = [4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6,
+                         4, 4, 5, 5, 6, 6, 5, 6, 6, 5, 6, 6]
+    for expected, produced in zip(
+        expected_elements, self._interleave(input_lists, 7, 2)):
+      self.assertEqual(expected, produced)
+
+  def testInterleaveDataset(self):
+    input_values = array_ops.placeholder(dtypes.int64, shape=[None])
+    cycle_length = array_ops.placeholder(dtypes.int64, shape=[])
+    block_length = array_ops.placeholder(dtypes.int64, shape=[])
+
+    repeat_count = 2
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(input_values)
+        .repeat(repeat_count)
+        .interleave(lambda x: dataset_ops.Dataset.from_tensors(x).repeat(x),
+                    cycle_length, block_length))
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      # Cycle length 1 acts like `Dataset.flat_map()`.
+      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
+                                   cycle_length: 1, block_length: 3})
+
+      for expected_element in self._interleave(
+          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 1, 3):
+        self.assertEqual(expected_element, sess.run(next_element))
+
+      # Cycle length > 1.
+      # expected: [4, 5, 4, 5, 4, 5, 4, 5, 5, 6, 6, 4, 6, 4, 6, 4, 6, 4, 6, 5,
+      #            6, 5, 6, 5, 6, 5, 6, 5]
+      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
+                                   cycle_length: 2, block_length: 1})
+      for expected_element in self._interleave(
+          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 2, 1):
+        self.assertEqual(expected_element, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+      # Cycle length > 1 and block length > 1.
+      # expected: [4, 4, 4, 5, 5, 5, 4, 5, 5, 6, 6, 6, 4, 4, 4, 6, 6, 6, 4, 5,
+      #            5, 5, 6, 6, 6, 5, 5, 6, 6, 6]
+      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
+                                   cycle_length: 2, block_length: 3})
+      for expected_element in self._interleave(
+          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 2, 3):
+        self.assertEqual(expected_element, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+      # Cycle length > len(input_values) * repeat_count.
+      # expected: [4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4,
+      #            5, 5, 6, 6, 5, 6, 6, 5, 6, 6]
+      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
+                                   cycle_length: 7, block_length: 2})
+      for expected_element in self._interleave(
+          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 7, 2):
+        self.assertEqual(expected_element, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+      # Empty input.
+      sess.run(init_op, feed_dict={input_values: [],
+                                   cycle_length: 2, block_length: 3})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+      # Non-empty input leading to empty output.
+      sess.run(init_op, feed_dict={input_values: [0, 0, 0],
+                                   cycle_length: 2, block_length: 3})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+      # Mixture of non-empty and empty interleaved datasets.
+      sess.run(init_op, feed_dict={input_values: [4, 0, 6],
+                                   cycle_length: 2, block_length: 3})
+      for expected_element in self._interleave(
+          [[4] * 4, [], [6] * 6] * repeat_count, 2, 3):
+        self.assertEqual(expected_element, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
index d6dd134a5b9537326bb4d956b2814733812d305b..30f685842b0b34e7f5606ef8e9583e03dd33cb17 100644
--- a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
@@ -20,11 +20,13 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -128,6 +130,71 @@ class IteratorTest(test.TestCase):
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
+  def testOneShotIteratorNonBlocking(self):
+    dataset = dataset_ops.Dataset.from_tensors([1, 2, 3]).map(lambda x: x * x)
+    iterator = dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    # Create a session with a single thread to ensure that the
+    # one-shot iterator initializer does not deadlock.
+    config = config_pb2.ConfigProto(inter_op_parallelism_threads=1,
+                                    use_per_session_threads=True)
+    with session.Session(config=config) as sess:
+      self.assertAllEqual([1, 4, 9], sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+    # Test with multiple threads invoking the one-shot iterator concurrently.
+    with session.Session(config=config) as sess:
+      results = []
+      def consumer_thread():
+        try:
+          results.append(sess.run(next_element))
+        except errors.OutOfRangeError:
+          results.append(None)
+
+      num_threads = 8
+      threads = [
+          self.checkedThread(consumer_thread) for _ in range(num_threads)]
+      for t in threads:
+        t.start()
+      for t in threads:
+        t.join()
+
+      self.assertEqual(num_threads, len(results))
+      self.assertEqual(num_threads - 1,
+                       len([None for r in results if r is None]))
+      self.assertAllEqual([[1, 4, 9]], [r for r in results if r is not None])
+
+  def testOneShotIteratorInitializerFails(self):
+    # Define a dataset whose initialization will always fail.
+    dataset = dataset_ops.Dataset.from_tensors(
+        array_ops.check_numerics(
+            constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
+    iterator = dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
+        sess.run(next_element)
+
+      # Test that subsequent attempts to use the iterator also fail.
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
+        sess.run(next_element)
+
+    with self.test_session() as sess:
+      def consumer_thread():
+        with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
+          sess.run(next_element)
+
+      num_threads = 8
+      threads = [
+          self.checkedThread(consumer_thread) for _ in range(num_threads)]
+      for t in threads:
+        t.start()
+      for t in threads:
+        t.join()
+
   def testSimpleSharedResource(self):
     components = (
         np.array(1, dtype=np.int64),
@@ -261,6 +328,94 @@ class IteratorTest(test.TestCase):
               [1, 2, 3], dtype=dtypes.int64), constant_op.constant(
                   [4., 5., 6., 7.], dtype=dtypes.float64))))
 
+  def testIteratorStringHandle(self):
+    dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
+    dataset_4 = dataset_ops.Dataset.from_tensor_slices([10, 20, 30, 40])
+
+    iterator_3 = dataset_3.make_one_shot_iterator()
+    iterator_4 = dataset_4.make_one_shot_iterator()
+
+    handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
+    feedable_iterator = dataset_ops.Iterator.from_string_handle(
+        handle_placeholder, dataset_3.output_types, dataset_3.output_shapes)
+    next_element = feedable_iterator.get_next()
+
+    self.assertEqual(dataset_3.output_types, feedable_iterator.output_types)
+    self.assertEqual(dataset_4.output_types, feedable_iterator.output_types)
+    self.assertEqual([], feedable_iterator.output_shapes)
+
+    with self.test_session() as sess:
+      iterator_3_handle = sess.run(iterator_3.string_handle())
+      iterator_4_handle = sess.run(iterator_4.string_handle())
+
+      self.assertEqual(
+          10, sess.run(next_element,
+                       feed_dict={handle_placeholder: iterator_4_handle}))
+      self.assertEqual(
+          1, sess.run(next_element,
+                      feed_dict={handle_placeholder: iterator_3_handle}))
+      self.assertEqual(
+          20, sess.run(next_element,
+                       feed_dict={handle_placeholder: iterator_4_handle}))
+      self.assertEqual(
+          2, sess.run(next_element,
+                      feed_dict={handle_placeholder: iterator_3_handle}))
+      self.assertEqual(
+          30, sess.run(next_element,
+                       feed_dict={handle_placeholder: iterator_4_handle}))
+      self.assertEqual(
+          3, sess.run(next_element,
+                      feed_dict={handle_placeholder: iterator_3_handle}))
+      self.assertEqual(
+          40, sess.run(next_element,
+                       feed_dict={handle_placeholder: iterator_4_handle}))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element,
+                 feed_dict={handle_placeholder: iterator_3_handle})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element,
+                 feed_dict={handle_placeholder: iterator_4_handle})
+
+  def testIteratorStringHandleError(self):
+    dataset_int_scalar = (dataset_ops.Dataset.from_tensor_slices([1, 2,
+                                                                  3]).repeat())
+    dataset_float_vector = (dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0]))
+
+    handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
+
+    feedable_int_scalar = dataset_ops.Iterator.from_string_handle(
+        handle_placeholder, dtypes.int32, [])
+    feedable_int_vector = dataset_ops.Iterator.from_string_handle(
+        handle_placeholder, dtypes.int32, [None])
+    feedable_int_any = dataset_ops.Iterator.from_string_handle(
+        handle_placeholder, dtypes.int32)
+
+    with self.test_session() as sess:
+      handle_int_scalar = sess.run(
+          dataset_int_scalar.make_one_shot_iterator().string_handle())
+      handle_float_vector = sess.run(
+          dataset_float_vector.make_one_shot_iterator().string_handle())
+
+      self.assertEqual(1,
+                       sess.run(
+                           feedable_int_scalar.get_next(),
+                           feed_dict={handle_placeholder: handle_int_scalar}))
+
+      self.assertEqual(2,
+                       sess.run(
+                           feedable_int_any.get_next(),
+                           feed_dict={handle_placeholder: handle_int_scalar}))
+
+      with self.assertRaises(errors.InvalidArgumentError):
+        print(sess.run(
+            feedable_int_vector.get_next(),
+            feed_dict={handle_placeholder: handle_int_scalar}))
+
+      with self.assertRaises(errors.InvalidArgumentError):
+        print(sess.run(
+            feedable_int_vector.get_next(),
+            feed_dict={handle_placeholder: handle_float_vector}))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/list_files_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/list_files_dataset_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..27298de65f90c627e5eb638385bfe0478ef74fca
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/list_files_dataset_op_test.py
@@ -0,0 +1,159 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from os import path
+import shutil
+import tempfile
+
+from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class ListFilesDatasetOpTest(test.TestCase):
+
+  def setUp(self):
+    self.tmp_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    shutil.rmtree(self.tmp_dir, ignore_errors=True)
+
+  def _touchTempFiles(self, filenames):
+    for filename in filenames:
+      open(path.join(self.tmp_dir, filename), 'a').close()
+
+  def testEmptyDirectory(self):
+    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
+    with self.test_session() as sess:
+      itr = dataset.make_one_shot_iterator()
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(itr.get_next())
+
+  def testSimpleDirectory(self):
+    filenames = ['a', 'b', 'c']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
+    with self.test_session() as sess:
+      itr = dataset.make_one_shot_iterator()
+
+      full_filenames = []
+      produced_filenames = []
+      for filename in filenames:
+        full_filenames.append(
+            compat.as_bytes(path.join(self.tmp_dir, filename)))
+        produced_filenames.append(compat.as_bytes(sess.run(itr.get_next())))
+      self.assertItemsEqual(full_filenames, produced_filenames)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(itr.get_next())
+
+  def testEmptyDirectoryInitializer(self):
+    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
+    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
+
+    with self.test_session() as sess:
+      itr = dataset.make_initializable_iterator()
+      sess.run(
+          itr.initializer,
+          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*')})
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(itr.get_next())
+
+  def testSimpleDirectoryInitializer(self):
+    filenames = ['a', 'b', 'c']
+    self._touchTempFiles(filenames)
+
+    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
+    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
+
+    with self.test_session() as sess:
+      itr = dataset.make_initializable_iterator()
+      sess.run(
+          itr.initializer,
+          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*')})
+
+      full_filenames = []
+      produced_filenames = []
+      for filename in filenames:
+        full_filenames.append(
+            compat.as_bytes(path.join(self.tmp_dir, filename)))
+        produced_filenames.append(compat.as_bytes(sess.run(itr.get_next())))
+
+      self.assertItemsEqual(full_filenames, produced_filenames)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(itr.get_next())
+
+  def testFileSuffixes(self):
+    filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
+    self._touchTempFiles(filenames)
+
+    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
+    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
+
+    with self.test_session() as sess:
+      itr = dataset.make_initializable_iterator()
+      sess.run(
+          itr.initializer,
+          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*.py')})
+
+      full_filenames = []
+      produced_filenames = []
+      for filename in filenames[1:-1]:
+        full_filenames.append(
+            compat.as_bytes(path.join(self.tmp_dir, filename)))
+        produced_filenames.append(compat.as_bytes(sess.run(itr.get_next())))
+      self.assertItemsEqual(full_filenames, produced_filenames)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(itr.get_next())
+
+  def testFileMiddles(self):
+    filenames = ['a.txt', 'b.py', 'c.pyc']
+    self._touchTempFiles(filenames)
+
+    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
+    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
+
+    with self.test_session() as sess:
+      itr = dataset.make_initializable_iterator()
+      sess.run(
+          itr.initializer,
+          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*.py*')})
+
+      full_filenames = []
+      produced_filenames = []
+      for filename in filenames[1:]:
+        full_filenames.append(
+            compat.as_bytes(path.join(self.tmp_dir, filename)))
+        produced_filenames.append(compat.as_bytes(sess.run(itr.get_next())))
+
+      self.assertItemsEqual(full_filenames, produced_filenames)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(itr.get_next())
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
index b5956ac49c310f7319ef700f94b441e54aa700ed..8a78752584a6bb4a8ec289b93c795d336f793bc3 100644
--- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 import numpy as np
 
 from tensorflow.contrib.data.python.ops import dataset_ops
@@ -25,12 +27,14 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
+from tensorflow.python.util import compat
 
 
 class MapDatasetTest(test.TestCase):
@@ -199,11 +203,27 @@ class MapDatasetTest(test.TestCase):
   def testImplicitDisposeParallelMapDataset(self):
     self._testDisposeParallelMapDataset(False)
 
+  def testParallelMapUnspecifiedOutputSize(self):
+    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
+
+    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
+               .map(lambda x: array_ops.check_numerics(x, "message"),
+                    num_threads=2))
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for _ in range(3):
+        sess.run(get_next)
+
   def testParallelMapError(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
 
     dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-               .map(lambda x: array_ops.check_numerics(x, "message")))
+               .map(lambda x: array_ops.check_numerics(x, "message"),
+                    num_threads=2, output_buffer_size=2))
     iterator = dataset.make_initializable_iterator()
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -219,6 +239,76 @@ class MapDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testMapIgnoreError(self):
+    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
+
+    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
+               .map(lambda x: array_ops.check_numerics(x, "message"))
+               .ignore_errors())
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for x in [1., 2., 3., 5.]:
+        self.assertEqual(x, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testParallelMapIgnoreError(self):
+    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
+
+    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
+               .map(lambda x: array_ops.check_numerics(x, "message"),
+                    num_threads=2, output_buffer_size=2)
+               .ignore_errors())
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for x in [1., 2., 3., 5.]:
+        self.assertEqual(x, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testReadFileIgnoreError(self):
+    def write_string_to_file(value, filename):
+      with open(filename, "w") as f:
+        f.write(value)
+    filenames = [os.path.join(self.get_temp_dir(), "file_%d.txt" % i)
+                 for i in range(5)]
+    for filename in filenames:
+      write_string_to_file(filename, filename)
+
+    dataset = (dataset_ops.Dataset.from_tensor_slices(filenames)
+               .map(io_ops.read_file, num_threads=2, output_buffer_size=2)
+               .ignore_errors())
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      # All of the files are present.
+      sess.run(init_op)
+      for filename in filenames:
+        self.assertEqual(compat.as_bytes(filename), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Delete one of the files.
+      os.remove(filenames[0])
+
+      # Attempting to read filenames[0] will fail, but ignore_errors()
+      # will catch the error.
+      sess.run(init_op)
+      for filename in filenames[1:]:
+        self.assertEqual(compat.as_bytes(filename), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
   def testCaptureHashTable(self):
     # NOTE(mrry): We must use the V2 variants of `HashTable`
     # etc. because these produce a `tf.resource`-typed output that is
@@ -324,5 +414,21 @@ class MapDatasetTest(test.TestCase):
       # Randomness is repeatable given same seed
       self.assertAllClose(random_values, random_values_2)
 
+  def testMapDict(self):
+    iterator = (dataset_ops.Dataset.range(10)
+                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
+                .map(lambda d: d["foo"] + d["bar"])
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        self.assertEqual(i * 2 + i ** 2, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
index 133165a1c25f85ec38f4df0c08ca784301719c2d..fbac4317b10e1bd1968516e02039519554eb7e75 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
@@ -40,27 +40,48 @@ class TextLineDatasetTest(test.TestCase):
   def _lineText(self, f, l):
     return compat.as_bytes("%d: %d" % (f, l))
 
-  def _createFiles(self, num_files, num_lines, crlf=False):
+  def _createFiles(self,
+                   num_files,
+                   num_lines,
+                   crlf=False,
+                   compression_type=None):
     filenames = []
     for i in range(num_files):
       fn = os.path.join(self.get_temp_dir(), "text_line.%d.txt" % i)
       filenames.append(fn)
-      with open(fn, "wb") as f:
-        for j in range(num_lines):
-          f.write(self._lineText(i, j))
-          # Always include a newline after the record unless it is
-          # at the end of the file, in which case we include it sometimes.
-          if j + 1 != num_lines or i == 0:
-            f.write(b"\r\n" if crlf else b"\n")
+      contents = []
+      for j in range(num_lines):
+        contents.append(self._lineText(i, j))
+        # Always include a newline after the record unless it is
+        # at the end of the file, in which case we include it sometimes.
+        if j + 1 != num_lines or i == 0:
+          contents.append(b"\r\n" if crlf else b"\n")
+      contents = b"".join(contents)
+
+      if not compression_type:
+        with open(fn, "wb") as f:
+          f.write(contents)
+      elif compression_type == "GZIP":
+        with gzip.GzipFile(fn, "wb") as f:
+          f.write(contents)
+      elif compression_type == "ZLIB":
+        contents = zlib.compress(contents)
+        with open(fn, "wb") as f:
+          f.write(contents)
+      else:
+        raise ValueError("Unsupported compression_type", compression_type)
+
     return filenames
 
-  def testTextLineDataset(self):
-    test_filenames = self._createFiles(2, 5, crlf=True)
+  def _testTextLineDataset(self, compression_type=None):
+    test_filenames = self._createFiles(
+        2, 5, crlf=True, compression_type=compression_type)
     filenames = array_ops.placeholder(dtypes.string, shape=[None])
     num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
     batch_size = array_ops.placeholder(dtypes.int64, shape=[])
 
-    repeat_dataset = dataset_ops.TextLineDataset(filenames).repeat(num_epochs)
+    repeat_dataset = dataset_ops.TextLineDataset(
+        filenames, compression_type=compression_type).repeat(num_epochs)
     batch_dataset = repeat_dataset.batch(batch_size)
 
     iterator = dataset_ops.Iterator.from_structure(batch_dataset.output_types)
@@ -114,6 +135,15 @@ class TextLineDatasetTest(test.TestCase):
         self.assertAllEqual([self._lineText(1, i) for i in range(5)],
                             sess.run(get_next))
 
+  def testTextLineDatasetNoCompression(self):
+    self._testTextLineDataset()
+
+  def testTextLineDatasetGzipCompression(self):
+    self._testTextLineDataset(compression_type="GZIP")
+
+  def testTextLineDatasetZlibCompression(self):
+    self._testTextLineDataset(compression_type="ZLIB")
+
 
 class FixedLengthRecordReaderTest(test.TestCase):
 
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 08a2774ece20f823407f8a6cdda359207e7b480c..f49350505ae5a2f3b7d3461636ffc2913a039e9a 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -11,10 +11,23 @@ py_library(
     deps = [
         "//tensorflow/contrib/data/python/framework:function",
         "//tensorflow/contrib/data/python/util:nest",
-        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:framework",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:logging_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
+        "//third_party/py/numpy",
     ],
 )
 
diff --git a/tensorflow/contrib/data/python/ops/dataset_ops.py b/tensorflow/contrib/data/python/ops/dataset_ops.py
index 89410bf84472d1b46e941e6a87233303afa36d14..3dfe6463dcabb252fb074ed571cdeb79c74ff5b5 100644
--- a/tensorflow/contrib/data/python/ops/dataset_ops.py
+++ b/tensorflow/contrib/data/python/ops/dataset_ops.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
@@ -181,6 +182,64 @@ class Iterator(object):
         output_shapes=nest.flatten(output_shapes))
     return Iterator(iterator_resource, None, output_types, output_shapes)
 
+  @staticmethod
+  def from_string_handle(string_handle, output_types, output_shapes=None):
+    """Creates a new, uninitialized `Iterator` based on the given handle.
+
+    This method allows you to define a "feedable" iterator where you can choose
+    between concrete iterators by feeding a value in a @{tf.Session.run} call.
+    In that case, `string_handle` would a @{tf.placeholder}, and you would feed
+    it with the value of @{tf.contrib.data.Iterator.string_handle} in each step.
+
+    For example, if you had two iterators that marked the current position in
+    a training dataset and a test dataset, you could choose which to use in
+    each step as follows:
+
+    ```python
+    train_iterator = tf.contrib.data.Dataset(...).make_one_shot_iterator()
+    train_iterator_handle = sess.run(train_iterator.string_handle())
+
+    test_iterator = tf.contrib.data.Dataset(...).make_one_shot_iterator()
+    test_iterator_handle = sess.run(test_iterator.string_handle())
+
+    handle = tf.placeholder(tf.string, shape=[])
+    iterator = tf.contrib.data.Iterator.from_string_handle(
+        handle, train_iterator.output_types)
+
+    next_element = iterator.get_next()
+    loss = f(next_element)
+
+    train_loss = sess.run(loss, feed_dict={handle: train_iterator_handle})
+    test_loss = sess.run(loss, feed_dict={handle: test_iterator_handle})
+    ```
+
+    Args:
+      string_handle: A scalar `tf.Tensor` of type `tf.string` that evaluates
+        to a handle produced by the `Iterator.string_handle()` method.
+      output_types: A nested structure of `tf.DType` objects corresponding to
+        each component of an element of this iterator.
+      output_shapes: (Optional.) A nested structure of `tf.TensorShape` objects
+        corresponding to each component of an element of this dataset. If
+        omitted, each component will have an unconstrainted shape.
+
+    Returns:
+      An `Iterator`.
+    """
+    output_types = nest.map_structure(dtypes.as_dtype, output_types)
+    if output_shapes is None:
+      output_shapes = nest.map_structure(
+          lambda _: tensor_shape.TensorShape(None), output_types)
+    else:
+      output_shapes = nest.map_structure_up_to(
+          output_types, tensor_shape.as_shape, output_shapes)
+    nest.assert_same_structure(output_types, output_shapes)
+    string_handle = ops.convert_to_tensor(string_handle, dtype=dtypes.string)
+    iterator_resource = gen_dataset_ops.iterator_from_string_handle(
+        string_handle,
+        output_types=nest.flatten(output_types),
+        output_shapes=nest.flatten(output_shapes))
+    return Iterator(iterator_resource, None, output_types, output_shapes)
+
   @property
   def initializer(self):
     """A `tf.Operation` that should be run to initialize this iterator.
@@ -260,6 +319,18 @@ class Iterator(object):
     """
     return gen_dataset_ops.iterator_dispose(self._iterator_resource, name=name)
 
+  def string_handle(self, name=None):
+    """Returns a string-valued `tf.Tensor` that represents this iterator.
+
+    Args:
+      name: (Optional.) A name for the created operation.
+
+    Returns:
+      A scalar `tf.Tensor` of type `tf.string`.
+    """
+    return gen_dataset_ops.iterator_to_string_handle(self._iterator_resource,
+                                                     name=name)
+
   @property
   def output_shapes(self):
     """Returns the shape of each component of an element of this iterator.
@@ -349,8 +420,7 @@ def _estimate_data_distribution(c, num_examples_per_class_seen):
   # cross-device round-trip.  Just use the cached value.
   num_examples_per_class_seen = num_examples_per_class_seen.assign_add(
       math_ops.reduce_sum(
-          array_ops.one_hot(c, num_classes, dtype=dtypes.int64),
-          0))
+          array_ops.one_hot(c, num_classes, dtype=dtypes.int64), 0))
   init_prob_estimate = math_ops.truediv(
       num_examples_per_class_seen,
       math_ops.reduce_sum(num_examples_per_class_seen))
@@ -444,8 +514,8 @@ class Dataset(object):
     output_shapes = str(output_shapes).replace("'", "")
     output_types = nest.map_structure(repr, self.output_types)
     output_types = str(output_types).replace("'", "")
-    return ("<%s shapes: %s, types: %s>"
-            % (type(self).__name__, output_shapes, output_types))
+    return ("<%s shapes: %s, types: %s>" % (type(self).__name__, output_shapes,
+                                            output_types))
 
   @staticmethod
   def from_tensors(tensors):
@@ -554,6 +624,32 @@ class Dataset(object):
     """
     return ZipDataset(datasets)
 
+  def concatenate(self, dataset):
+    """Creates a `Dataset` by concatenating given dataset with this dataset.
+
+    ```python
+    # NOTE: The following examples use `{ ... }` to represent the
+    # contents of a dataset.
+    a = { 1, 2, 3 }
+    b = { 4, 5, 6, 7 }
+
+    # Input dataset and dataset to be concatenated should have same
+    # nested structures and output types.
+    # c = { (8, 9), (10, 11), (12, 13) }
+    # d = { 14.0, 15.0, 16.0 }
+    # a.concatenate(c) and a.concatenate(d) would result in error.
+
+    a.concatenate(b) == { 1, 2, 3, 4, 5, 6, 7 }
+    ```
+
+    Args:
+      dataset: `Dataset` to be concatenated.
+
+    Returns:
+      A `Dataset`.
+    """
+    return ConcatenateDataset(self, dataset)
+
   @staticmethod
   def read_batch_features(file_pattern,
                           batch_size,
@@ -592,14 +688,40 @@ class Dataset(object):
     else:
       dataset = reader(filenames)
     dataset = dataset.repeat(num_epochs)
+    if dataset.output_types == (dtypes.string, dtypes.string):
+      dataset = dataset.map(lambda unused_k, v: v)
+    elif dataset.output_types != dtypes.string:
+      raise TypeError("`reader` must be a dataset of `tf.string` values, "
+                      "or `(tf.string, tf.string)` key-value pairs.")
     if randomize_input:
       dataset = dataset.shuffle(capacity)
-    dataset = dataset.map(
-        lambda x: _parse_example(nest.flatten(x), features)
-    )
+    dataset = dataset.map(lambda x: _parse_example(nest.flatten(x), features))
     dataset = dataset.batch(batch_size)
     return dataset
 
+  @staticmethod
+  def list_files(file_pattern):
+    """A dataset of all files matching a pattern.
+
+    Example:
+      If we had the following files on our filesystem:
+        - /path/to/dir/a.txt
+        - /path/to/dir/b.py
+        - /path/to/dir/c.py
+      If we pass "/path/to/dir/*.py" as the directory, the dataset would
+      produce:
+        - /path/to/dir/b.py
+        - /path/to/dir/c.py
+
+    Args:
+      file_pattern: A string or scalar string `tf.Tensor`, representing
+        the filename pattern that will be matched.
+
+    Returns:
+     A `Dataset` of strings corresponding to file names.
+    """
+    return Dataset.from_tensor_slices(gen_io_ops.matching_files(file_pattern))
+
   def repeat(self, count=None):
     """Repeats this dataset `count` times.
 
@@ -699,6 +821,28 @@ class Dataset(object):
     """
     return SkipDataset(self, count)
 
+  def ignore_errors(self):
+    """Creates a `Dataset` from this one and silently ignores any errors.
+
+    Use this transformation to produce a dataset that contains the same elements
+    as the input, but silently drops any elements that caused an error. For
+    example:
+
+    ```python
+    dataset = tf.contrib.data.Dataset.from_tensor_slices([1., 2., 0., 4.])
+
+    # Computing `tf.check_numerics(1. / 0.)` will raise an InvalidArgumentError.
+    dataset = dataset.map(lambda x: tf.check_numerics(1. / x, "error"))
+
+    # Using `ignore_errors()` will drop the element that causes an error.
+    dataset = dataset.ignore_errors()  # ==> { 1., 0.5, 0.2 }
+    ```
+
+    Returns:
+      A `Dataset`.
+    """
+    return IgnoreErrorsDataset(self)
+
   def batch(self, batch_size):
     """Combines consecutive elements of this dataset into batches.
 
@@ -839,6 +983,70 @@ class Dataset(object):
     """
     return FlatMapDataset(self, map_func)
 
+  def interleave(self, map_func, cycle_length, block_length=1):
+    """Maps `map_func` across this dataset, and interleaves the results.
+
+    For example, you can use `Dataset.interleave()` to process many input files
+    concurrently:
+
+    ```python
+    # Preprocess 4 files concurrently, and interleave blocks of 16 records from
+    # each file.
+    filenames = ["/var/data/file1.txt", "/var/data/file2.txt", ..."]
+    dataset = (Dataset.from_tensor_slices(filenames)
+               .interleave(
+                   lambda x: TextLineDataset(x).map(parse_fn, num_threads=1)
+                   cycle_length=4, block_length=16))
+    ```
+
+    The `cycle_length` and `block_length` arguments control the order in which
+    elements are produced. `cycle_length` controls the number of input elements
+    that are processed concurrently. If you set `cycle_length` to 1, this
+    transformation will handle one input element at a time, and will produce
+    identical results = to @{tf.contrib.data.Dataset.flat_map}. In general,
+    this transformation will apply `map_func` to `cycle_length` input elements,
+    open iterators on the returned `Dataset` objects, and cycle through them
+    producing `block_length` consecutive elements from each iterator, and
+    consuming the next input element each time it reaches the end of an
+    iterator.
+
+    For example:
+
+    ```python
+    # NOTE: The following examples use `{ ... }` to represent the
+    # contents of a dataset.
+    a = { 1, 2, 3, 4, 5 }
+
+    # NOTE: New lines indicate "block" boundaries.
+    a.interleave(lambda x: Dataset.from_tensors(x).repeat(6),
+                 cycle_length=2, block_length=4) == {
+        1, 1, 1, 1,
+        2, 2, 2, 2,
+        1, 1,
+        2, 2,
+        3, 3, 3, 3,
+        4, 4, 4, 4,
+        3, 3,
+        4, 4,
+        5, 5, 5, 5,
+        5, 5,
+    }
+    ```
+
+    Args:
+      map_func: A function mapping a nested structure of tensors (having shapes
+        and types defined by `self.output_shapes` and `self.output_types`) to a
+        `Dataset`.
+      cycle_length: The number of elements from this dataset that will be
+        processed concurrently.
+      block_length: The number of consecutive elements to produce from each
+        input element before cycling to another input element.
+
+    Returns:
+      A `Dataset`.
+    """
+    return InterleaveDataset(self, map_func, cycle_length, block_length)
+
   def unbatch(self):
     """Splits elements of this dataset into sequences of consecutive elements.
 
@@ -850,8 +1058,12 @@ class Dataset(object):
     Returns:
       A `Dataset`.
     """
-    return self.flat_map(
-      map_func=lambda *args: Dataset.from_tensor_slices(args))
+    def unbatch_map(arg, *rest):
+      if rest:
+        return Dataset.from_tensor_slices((arg,) + rest)
+      else:
+        return Dataset.from_tensor_slices(arg)
+    return self.flat_map(map_func=unbatch_map)
 
   def filter(self, predicate):
     """Filters this dataset according to `predicate`.
@@ -985,12 +1197,53 @@ class ZipDataset(Dataset):
   @property
   def output_shapes(self):
     return nest.pack_sequence_as(self._datasets, [
-        ds.output_shapes for ds in nest.flatten(self._datasets)])
+        ds.output_shapes for ds in nest.flatten(self._datasets)
+    ])
 
   @property
   def output_types(self):
     return nest.pack_sequence_as(self._datasets, [
-        ds.output_types for ds in nest.flatten(self._datasets)])
+        ds.output_types for ds in nest.flatten(self._datasets)
+    ])
+
+
+class ConcatenateDataset(Dataset):
+  """A `Dataset` that concatenates its input with given dataset."""
+
+  def __init__(self, input_dataset, dataset_to_concatenate):
+    """See `Dataset.concatenate()` for details."""
+    super(ConcatenateDataset, self).__init__()
+    self._input_dataset = input_dataset
+    self._dataset_to_concatenate = dataset_to_concatenate
+    nest.assert_same_structure(input_dataset.output_types,
+                               dataset_to_concatenate.output_types)
+    for a, b in zip(
+        nest.flatten(input_dataset.output_types),
+        nest.flatten(dataset_to_concatenate.output_types)):
+      if a != b:
+        raise TypeError(
+            "Two datasets to concatenate have different types %s and %s" %
+            (input_dataset.output_types, dataset_to_concatenate.output_types))
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.concatenate_dataset(
+        self._input_dataset.make_dataset_resource(),
+        self._dataset_to_concatenate.make_dataset_resource(),
+        output_shapes=nest.flatten(self.output_shapes),
+        output_types=nest.flatten(self.output_types))
+
+  @property
+  def output_shapes(self):
+    return nest.pack_sequence_as(self._input_dataset.output_shapes, [
+        ts1.most_specific_compatible_shape(ts2)
+        for (ts1, ts2) in zip(
+            nest.flatten(self._input_dataset.output_shapes),
+            nest.flatten(self._dataset_to_concatenate.output_shapes))
+    ])
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
 
 
 class RepeatDataset(Dataset):
@@ -1003,8 +1256,8 @@ class RepeatDataset(Dataset):
     if count is None:
       self._count = constant_op.constant(-1, dtype=dtypes.int64, name="count")
     else:
-      self._count = ops.convert_to_tensor(count, dtype=dtypes.int64,
-                                          name="count")
+      self._count = ops.convert_to_tensor(
+          count, dtype=dtypes.int64, name="count")
 
   def make_dataset_resource(self):
     return gen_dataset_ops.repeat_dataset(
@@ -1109,8 +1362,8 @@ class ShuffleDataset(Dataset):
     if seed2 is None:
       self._seed2 = constant_op.constant(0, dtype=dtypes.int64, name="seed2")
     else:
-      self._seed2 = ops.convert_to_tensor(seed2, dtype=dtypes.int64,
-                                          name="seed2")
+      self._seed2 = ops.convert_to_tensor(
+          seed2, dtype=dtypes.int64, name="seed2")
 
   def make_dataset_resource(self):
     return gen_dataset_ops.shuffle_dataset(
@@ -1180,6 +1433,29 @@ class SkipDataset(Dataset):
     return self._input_dataset.output_types
 
 
+class IgnoreErrorsDataset(Dataset):
+  """A `Dataset` that silently ignores errors when computing its input."""
+
+  def __init__(self, input_dataset):
+    """See `Dataset.ignore_errors()` for details."""
+    super(IgnoreErrorsDataset, self).__init__()
+    self._input_dataset = input_dataset
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.ignore_errors_dataset(
+        self._input_dataset.make_dataset_resource(),
+        output_shapes=nest.flatten(self.output_shapes),
+        output_types=nest.flatten(self.output_types))
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
 class BatchDataset(Dataset):
   """A `Dataset` that batches contiguous elements from its input."""
 
@@ -1241,12 +1517,10 @@ def _padding_value_to_tensor(value, output_type):
   """
   value = ops.convert_to_tensor(value, name="padding_value")
   if not value.shape.is_compatible_with(tensor_shape.scalar()):
-    raise ValueError(
-        "Padding value should be a scalar, but is not: %s" % value)
+    raise ValueError("Padding value should be a scalar, but is not: %s" % value)
   if value.dtype != output_type:
-    raise TypeError(
-        "Padding value tensor (%s) does not match output type: %s"
-        % (value, output_type))
+    raise TypeError("Padding value tensor (%s) does not match output type: %s" %
+                    (value, output_type))
   return value
 
 
@@ -1260,20 +1534,20 @@ class PaddedBatchDataset(Dataset):
     self._batch_size = batch_size
     padding_values = (padding_values if padding_values is not None else
                       self._default_padding(input_dataset))
-    self._padded_shapes = nest.map_structure_up_to(input_dataset.output_shapes,
-                                                   _partial_shape_to_tensor,
-                                                   padded_shapes)
-    self._padding_values = nest.map_structure_up_to(input_dataset.output_shapes,
-                                                    _padding_value_to_tensor,
-                                                    padding_values,
-                                                    input_dataset.output_types)
+    self._padded_shapes = nest.map_structure_up_to(
+        input_dataset.output_shapes, _partial_shape_to_tensor, padded_shapes)
+    self._padding_values = nest.map_structure_up_to(
+        input_dataset.output_shapes, _padding_value_to_tensor, padding_values,
+        input_dataset.output_types)
 
   def _default_padding(self, input_dataset):
+
     def make_zero(t):
       if t.base_dtype == dtypes.string:
         return ""
       else:
         return np.zeros_like(t.as_numpy_dtype())
+
     return nest.map_structure(make_zero, input_dataset.output_types)
 
   def make_dataset_resource(self):
@@ -1289,9 +1563,11 @@ class PaddedBatchDataset(Dataset):
 
   @property
   def output_shapes(self):
+
     def _padded_shape_to_batch_shape(s):
       return tensor_shape.vector(None).concatenate(
           tensor_util.constant_value_as_shape(s))
+
     return nest.map_structure(_padded_shape_to_batch_shape, self._padded_shapes)
 
   @property
@@ -1307,8 +1583,8 @@ class DenseToSparseBatchDataset(Dataset):
     super(DenseToSparseBatchDataset, self).__init__()
     if not isinstance(input_dataset.output_types, dtypes.DType):
       raise TypeError("DenseToSparseDataset requires an input whose elements "
-                      "have a single component, whereas the input has %r."
-                      % input_dataset.output_types)
+                      "have a single component, whereas the input has %r." %
+                      input_dataset.output_types)
     self._input_dataset = input_dataset
     self._batch_size = batch_size
     self._row_shape = _partial_shape_to_tensor(row_shape)
@@ -1333,6 +1609,11 @@ class DenseToSparseBatchDataset(Dataset):
     return (dtypes.int64, self._input_dataset.output_types, dtypes.int64)
 
 
+def _should_unpack_args(args):
+  """Returns `True` if `args` should be `*args` when passed to a callable."""
+  return nest.is_sequence(args) and not isinstance(args, dict)
+
+
 class _ResourceDataset(Dataset):
   """A Dataset wrapper for a tf.resource-typed function argument."""
 
@@ -1370,7 +1651,7 @@ class GroupByWindowDataset(Dataset):
       for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
         arg.set_shape(shape)
       nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
-      if nest.is_sequence(nested_args):
+      if _should_unpack_args(nested_args):
         ret = key_func(*nested_args)
       else:
         ret = key_func(nested_args)
@@ -1419,22 +1700,6 @@ class GroupByWindowDataset(Dataset):
     return self._output_types
 
 
-def _most_specific_compatible_shape(s1, s2):
-  """Returns the most specific shape compatible with `s1` and `s2`."""
-  if s1.dims is None:
-    return s1
-  if s2.dims is None:
-    return s2
-  s1.assert_same_rank(s2)
-  dims = []
-  for dim1, dim2 in zip(s1, s2):
-    if dim1.value is None or dim2.value is None or dim1.value != dim2.value:
-      dims.append(tensor_shape.Dimension(None))
-    else:
-      dims.append(dim1.value)
-  return tensor_shape.TensorShape(dims)
-
-
 class MapDataset(Dataset):
   """A `Dataset` that maps a function over elements in its input."""
 
@@ -1459,7 +1724,7 @@ class MapDataset(Dataset):
 
       nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
 
-      if nest.is_sequence(nested_args):
+      if _should_unpack_args(nested_args):
         ret = map_func(*nested_args)
       else:
         ret = map_func(nested_args)
@@ -1483,7 +1748,7 @@ class MapDataset(Dataset):
             output_buffer_size, dtype=dtypes.int64, name="output_buffer_size")
       else:
         self._output_buffer_size = ops.convert_to_tensor(
-            self._num_threads, dtype=dtypes.int64, name="output_buffer_size")
+            num_threads, dtype=dtypes.int64, name="output_buffer_size")
     else:
       self._num_threads = None
       self._output_buffer_size = None
@@ -1519,9 +1784,7 @@ class MapDataset(Dataset):
 class FlatMapDataset(Dataset):
   """A `Dataset` that maps a function over its input and flattens the result."""
 
-  def __init__(self,
-               input_dataset,
-               map_func):
+  def __init__(self, input_dataset, map_func):
     """See `Dataset.flat_map()` for details."""
     super(FlatMapDataset, self).__init__()
     self._input_dataset = input_dataset
@@ -1535,7 +1798,7 @@ class FlatMapDataset(Dataset):
 
       nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
 
-      if nest.is_sequence(nested_args):
+      if _should_unpack_args(nested_args):
         dataset = map_func(*nested_args)
       else:
         dataset = map_func(nested_args)
@@ -1568,6 +1831,65 @@ class FlatMapDataset(Dataset):
     return self._output_types
 
 
+class InterleaveDataset(Dataset):
+  """A `Dataset` that maps a function over its input and flattens the result."""
+
+  def __init__(self,
+               input_dataset,
+               map_func,
+               cycle_length,
+               block_length):
+    """See `Dataset.interleave()` for details."""
+    super(InterleaveDataset, self).__init__()
+    self._input_dataset = input_dataset
+
+    @function.Defun(*nest.flatten(input_dataset.output_types))
+    def tf_map_func(*args):
+      """A wrapper for Defun that facilitates shape inference."""
+      # Pass in shape information from the input_dataset.
+      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+        arg.set_shape(shape)
+
+      nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
+
+      if nest.is_sequence(nested_args):
+        dataset = map_func(*nested_args)
+      else:
+        dataset = map_func(nested_args)
+
+      if not isinstance(dataset, Dataset):
+        raise TypeError("`map_func` must return a `Dataset` object.")
+
+      self._output_types = dataset.output_types
+      self._output_shapes = dataset.output_shapes
+
+      return dataset.make_dataset_resource()
+
+    self._map_func = tf_map_func
+    self._map_func.add_to_graph(ops.get_default_graph())
+
+    self._cycle_length = ops.convert_to_tensor(cycle_length, dtype=dtypes.int64)
+    self._block_length = ops.convert_to_tensor(block_length, dtype=dtypes.int64)
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.interleave_dataset(
+        self._input_dataset.make_dataset_resource(),
+        self._map_func.captured_inputs,
+        self._cycle_length,
+        self._block_length,
+        f=self._map_func,
+        output_types=nest.flatten(self.output_types),
+        output_shapes=nest.flatten(self.output_shapes))
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+
 class FilterDataset(Dataset):
   """A `Dataset` that filters its input according to a predicate function."""
 
@@ -1585,7 +1907,7 @@ class FilterDataset(Dataset):
 
       nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
 
-      if nest.is_sequence(nested_args):
+      if _should_unpack_args(nested_args):
         ret = predicate(*nested_args)
       else:
         ret = predicate(nested_args)
@@ -1620,18 +1942,26 @@ class FilterDataset(Dataset):
 class TextLineDataset(Dataset):
   """A `Dataset` comprising lines from one or more text files."""
 
-  def __init__(self, filenames):
+  def __init__(self, filenames, compression_type=None):
     """Creates a `TextLineDataset`.
 
     Args:
       filenames: A `tf.string` tensor containing one or more filenames.
+      compression_type: A `tf.string` scalar evaluating to one of `""` (no
+        compression), `"ZLIB"`, or `"GZIP"`.
     """
     super(TextLineDataset, self).__init__()
     self._filenames = ops.convert_to_tensor(
         filenames, dtype=dtypes.string, name="filenames")
+    if compression_type is not None:
+      self._compression_type = ops.convert_to_tensor(
+          compression_type, dtype=dtypes.string, name="compression_type")
+    else:
+      self._compression_type = constant_op.constant("", name="compression_type")
 
   def make_dataset_resource(self):
-    return gen_dataset_ops.text_line_dataset(self._filenames)
+    return gen_dataset_ops.text_line_dataset(self._filenames,
+                                             self._compression_type)
 
   @property
   def output_shapes(self):
@@ -1725,8 +2055,11 @@ class FixedLengthRecordDataset(Dataset):
     return dtypes.string
 
 
-def rejection_resample(dataset, class_func, target_dist,
-                       initial_dist=None, seed=None):
+def rejection_resample(dataset,
+                       class_func,
+                       target_dist,
+                       initial_dist=None,
+                       seed=None):
   """Resamples this dataset to achieve a target class distribution.
 
   **NOTE** Resampling is performed via rejection sampling; some fraction
@@ -1751,36 +2084,34 @@ def rejection_resample(dataset, class_func, target_dist,
   target_dist = ops.convert_to_tensor(target_dist, name="initial_dist")
   class_values_ds = dataset.map(class_func)
   if initial_dist is not None:
-    initial_dist = ops.convert_to_tensor(
-        initial_dist, name="initial_dist")
+    initial_dist = ops.convert_to_tensor(initial_dist, name="initial_dist")
     acceptance_dist = _calculate_acceptance_probs(initial_dist, target_dist)
     initial_dist_ds = Dataset.from_tensors(initial_dist).repeat()
     acceptance_dist_ds = Dataset.from_tensors(acceptance_dist).repeat()
   else:
-    num_classes = (target_dist.shape[0].value
-                   or array_ops.shape(target_dist)[0])
+    num_classes = (target_dist.shape[0].value or
+                   array_ops.shape(target_dist)[0])
     smoothing_constant = 10
     num_examples_per_class_seen = resource_variable_ops.ResourceVariable(
-        initial_value=array_ops.fill(
-            [num_classes], np.int64(smoothing_constant)),
+        initial_value=array_ops.fill([num_classes],
+                                     np.int64(smoothing_constant)),
         trainable=False,
         name="class_count",
         dtype=dtypes.int64)
+
     def update_estimate_and_tile(c):
       return array_ops.tile(
           array_ops.expand_dims(
               _estimate_data_distribution(c, num_examples_per_class_seen), 0),
           [dist_estimation_batch_size, 1])
-    initial_dist_ds = (class_values_ds
-                       .batch(dist_estimation_batch_size)
-                       .map(update_estimate_and_tile)
-                       .unbatch())
+
+    initial_dist_ds = (class_values_ds.batch(dist_estimation_batch_size)
+                       .map(update_estimate_and_tile).unbatch())
     acceptance_dist_ds = initial_dist_ds.map(
         lambda initial: _calculate_acceptance_probs(initial, target_dist))
 
   def maybe_warn_on_large_rejection(accept_dist, initial_dist):
-    proportion_rejected = math_ops.reduce_sum(
-        (1 - accept_dist) * initial_dist)
+    proportion_rejected = math_ops.reduce_sum((1 - accept_dist) * initial_dist)
     return control_flow_ops.cond(
         math_ops.less(proportion_rejected, .5),
         lambda: accept_dist,
@@ -1790,12 +2121,10 @@ def rejection_resample(dataset, class_func, target_dist,
             summarize=100,
             first_n=10))
 
-  acceptance_dist_ds = (
-      Dataset.zip((acceptance_dist_ds, initial_dist_ds))
-      .map(maybe_warn_on_large_rejection))
+  acceptance_dist_ds = (Dataset.zip((acceptance_dist_ds, initial_dist_ds))
+                        .map(maybe_warn_on_large_rejection))
 
-  current_probabilities_ds = (Dataset
-                              .zip((acceptance_dist_ds, class_values_ds))
+  current_probabilities_ds = (Dataset.zip((acceptance_dist_ds, class_values_ds))
                               .map(array_ops.gather))
   filtered_ds = (
       Dataset.zip((class_values_ds, current_probabilities_ds, dataset))
diff --git a/tensorflow/contrib/data/python/util/BUILD b/tensorflow/contrib/data/python/util/BUILD
index b9691c8e4912019bb5395fef81e64fdf1e00c379..a2b80590bacb0b159bcfe94cbe203be237279a20 100644
--- a/tensorflow/contrib/data/python/util/BUILD
+++ b/tensorflow/contrib/data/python/util/BUILD
@@ -12,6 +12,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:util",
+        "@six_archive//:six",
     ],
 )
 
@@ -26,7 +27,6 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:util",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/contrib/data/python/util/nest.py b/tensorflow/contrib/data/python/util/nest.py
index 91c8416d5aebea2a51e01e260ea1ff8d7a6aec27..a29c3c562bd5c065fae66ea34ebc318c655753d8 100644
--- a/tensorflow/contrib/data/python/util/nest.py
+++ b/tensorflow/contrib/data/python/util/nest.py
@@ -286,7 +286,8 @@ def map_structure(func, *structure, **check_types_dict):
 def _yield_flat_up_to(shallow_tree, input_tree):
   """Yields elements `input_tree` partially flattened up to `shallow_tree`."""
   if is_sequence(shallow_tree):
-    for shallow_branch, input_branch in zip(shallow_tree, input_tree):
+    for shallow_branch, input_branch in zip(_elements_of(shallow_tree),
+                                            _elements_of(input_tree)):
       for input_leaf in _yield_flat_up_to(shallow_branch, input_branch):
         yield input_leaf
   else:
@@ -495,6 +496,7 @@ def map_structure_up_to(shallow_tree, func, *inputs):
   # then repack based on the structure of the first input.
   all_flattened_up_to = [flatten_up_to(shallow_tree, input_tree)
                          for input_tree in inputs]
+
   results = [func(*tensors) for tensors in zip(*all_flattened_up_to)]
   return pack_sequence_as(structure=shallow_tree, flat_sequence=results)
 
diff --git a/tensorflow/contrib/data/python/util/nest_test.py b/tensorflow/contrib/data/python/util/nest_test.py
index 7852e4f86176c5b3e083847f6a4974b1e224ee12..5132881afb9e73aa080c9f4c548e6a20f7dad2a7 100644
--- a/tensorflow/contrib/data/python/util/nest_test.py
+++ b/tensorflow/contrib/data/python/util/nest_test.py
@@ -287,6 +287,14 @@ class NestTest(test.TestCase):
     flattened_shallow_tree = nest.flatten_up_to(shallow_tree, shallow_tree)
     self.assertEqual(flattened_shallow_tree, list(shallow_tree))
 
+    # Using dict.
+    input_tree = {"a": ((2, 2), (3, 3)), "b": ((4, 9), (5, 5))}
+    shallow_tree = {"a": (True, True), "b": (False, True)}
+    flattened_input_tree = nest.flatten_up_to(shallow_tree, input_tree)
+    flattened_shallow_tree = nest.flatten_up_to(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree, [(2, 2), (3, 3), (4, 9), (5, 5)])
+    self.assertEqual(flattened_shallow_tree, [True, True, False, True])
+
   def testMapStructureUpTo(self):
     ab_tuple = collections.namedtuple("ab_tuple", "a, b")
     op_tuple = collections.namedtuple("op_tuple", "add, mul")
diff --git a/tensorflow/contrib/decision_trees/BUILD b/tensorflow/contrib/decision_trees/BUILD
deleted file mode 100644
index 4045b92f10dbaf817f5edd99dd147c068ce82ebd..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/decision_trees/BUILD
+++ /dev/null
@@ -1,19 +0,0 @@
-# Files common to decision-tree algorithms.
-package(default_visibility = [
-    "//visibility:public",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/decision_trees/__init__.py b/tensorflow/contrib/decision_trees/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c44e4eb59f16b0f80257021749073f94b5a3d7c
--- /dev/null
+++ b/tensorflow/contrib/decision_trees/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Shared representations for tree-based models in tensorflow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,wildcard-import
+from tensorflow.contrib.decision_trees.proto import *
+# pylint: enable=unused-import,wildcard-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/decision_trees/proto/BUILD b/tensorflow/contrib/decision_trees/proto/BUILD
index 86174c5865f90d678c355f6ad895259ddc5b9c0a..87c80740a8f0c0721394b5d832bc96e548e3a313 100644
--- a/tensorflow/contrib/decision_trees/proto/BUILD
+++ b/tensorflow/contrib/decision_trees/proto/BUILD
@@ -9,12 +9,21 @@ exports_files([
 
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
 tf_proto_library(
     name = "generic_tree_model",
     srcs = ["generic_tree_model.proto"],
     cc_api_version = 2,
     go_api_version = 2,
     java_api_version = 2,
+    visibility = ["//visibility:public"],
 )
 
 tf_proto_library(
@@ -23,4 +32,5 @@ tf_proto_library(
     cc_api_version = 2,
     go_api_version = 2,
     protodeps = [":generic_tree_model"],
+    visibility = ["//visibility:public"],
 )
diff --git a/tensorflow/contrib/deprecated/BUILD b/tensorflow/contrib/deprecated/BUILD
index adcef730a87ee5007e4c40349c212ceeab0fa1e0..f48c6341e3dd24b64b17760ef66c4dcb4e8cfeed 100644
--- a/tensorflow/contrib/deprecated/BUILD
+++ b/tensorflow/contrib/deprecated/BUILD
@@ -11,11 +11,12 @@ load("//tensorflow:tensorflow.bzl", "py_test")
 
 py_library(
     name = "deprecated_py",
-    srcs = [
-        "__init__.py",
-    ],
+    srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
-    deps = ["//tensorflow/python:logging_ops"],
+    deps = [
+        "//tensorflow/python:logging_ops",
+        "//tensorflow/python:util",
+    ],
 )
 
 py_test(
@@ -24,7 +25,6 @@ py_test(
     srcs = ["summaries_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":deprecated_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
diff --git a/tensorflow/contrib/deprecated/__init__.py b/tensorflow/contrib/deprecated/__init__.py
index befb8e6198bbb542499028498fbeb8d886f9814f..0bbca8d8edea155a7f251a385704ccc77502e9d8 100644
--- a/tensorflow/contrib/deprecated/__init__.py
+++ b/tensorflow/contrib/deprecated/__init__.py
@@ -4,7 +4,7 @@
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#     https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,15 +14,20 @@
 # ==============================================================================
 """Non-core alias for the deprecated tf.X_summary ops.
 
-For TensorFlow 1.0, we have re-organized the TensorFlow summary ops into a
+For TensorFlow 1.0, we have reorganized the TensorFlow summary ops into a
 submodule, and made some semantic tweaks. The first thing to note is that we
 moved the APIs around as follows:
 
-tf.scalar_summary      -> tf.summary.scalar
-tf.histogram_summary   -> tf.summary.histogram
-tf.audio_summary       -> tf.summary.audio
-tf.image_summary       -> tf.summary.image
-tf.merge_summary       -> tf.summary.merge
+tf.scalar_summary -> tf.summary.scalar
+
+tf.histogram_summary -> tf.summary.histogram
+
+tf.audio_summary -> tf.summary.audio
+
+tf.image_summary -> tf.summary.image
+
+tf.merge_summary -> tf.summary.merge
+
 tf.merge_all_summaries -> tf.summary.merge_all
 
 We think this is a cleaner API and will improve long-term discoverability and
@@ -35,14 +40,14 @@ Previously, the tag was allowed to be any unique string, and had no relation
 to the summary op generating it, and no relation to the TensorFlow name system.
 This made it very difficult to write re-usable code that would add summary
 ops to the graph. If you had a function that would add summary ops, you would
-need to manually pass in a name scope to that function to create de-duplicated
+need to manually pass in a name scope to that function to create deduplicated
 tags, otherwise your program would fail with a runtime error due to tag
 collision.
 
 The new summary APIs under tf.summary throw away the "tag" as an independent
-concept; instead, the first argument is the node name. This means that summary
-tags now automatically inherit the surrounding TF name scope, and automatically
-are deduplicated if there is a conflict. However, now the only allowed
+concept; instead, the first argument is the node name. So summary tags now 
+automatically inherit the surrounding TF name scope, and automatically
+are deduplicated if there is a conflict. Now however, the only allowed
 characters are alphanumerics, underscores, and forward slashes. To make
 migration easier, the new APIs automatically convert illegal characters to
 underscores.
@@ -75,7 +80,7 @@ to the new summary ops:
   tf.summary.scalar requires a single scalar name and scalar value. In most
   cases, you can create tf.summary.scalars in a loop to get the same behavior
 
-As before, TensorBoard will group charts by the top-level name scope. This may
+As before, TensorBoard groups charts by the top-level name scope. This may
 be inconvenient, since in the new summary ops the summary will inherit that
 name scope without user control. We plan to add more grouping mechanisms to
 TensorBoard, so it will be possible to specify the TensorBoard group for
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 8dea2763f2946bea9a4b7ef00353b10560fc700c..94e5c3785b9396feabdb1546e29bcba29ec6e19d 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -18,20 +18,15 @@ py_library(
         "//tensorflow/contrib/linalg:linalg_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
-        "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:data_flow_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:special_math_ops",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
         "//tensorflow/python/ops/distributions",
         "//third_party/py/numpy",
-        "@six_archive//:six",
     ],
 )
 
@@ -41,26 +36,49 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":bijectors_py",
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/learn",
         "//tensorflow/contrib/linalg:linalg_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
-        "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:data_flow_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:init_ops",
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:random_ops",
-        "//tensorflow/python:special_math_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
         "//tensorflow/python/ops/distributions",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
 )
 
+cuda_py_test(
+    name = "estimator_test",
+    size = "small",
+    srcs = ["python/kernel_tests/estimator_test.py"],
+    additional_deps = [
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+        "//tensorflow/contrib/learn",
+        "//tensorflow/contrib/learn:head_test",
+        "//tensorflow/python/ops/distributions",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:session",
+    ],
+    tags = ["no_pip"],  # contrib/learn:head_test is not available in pip.
+)
+
 cuda_py_test(
     name = "distribution_test",
     size = "small",
@@ -97,102 +115,6 @@ cuda_py_test(
     tags = ["no_pip"],
 )
 
-cuda_py_test(
-    name = "operator_pd_test",
-    size = "small",
-    srcs = ["python/kernel_tests/operator_pd_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:linalg_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nn_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "operator_pd_cholesky_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/operator_pd_cholesky_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nn_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "operator_pd_diag_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/operator_pd_diag_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "operator_pd_full_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/operator_pd_full_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "operator_pd_identity_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/operator_pd_identity_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "operator_pd_vdvt_update_test",
-    size = "large",
-    srcs = ["python/kernel_tests/operator_pd_vdvt_update_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-    ],
-    shard_count = 5,
-    tags = ["notap"],  # http://b/30441813
-)
-
 cuda_py_test(
     name = "binomial_test",
     size = "small",
@@ -268,6 +190,24 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "moving_stats_test",
+    size = "small",
+    srcs = ["python/kernel_tests/moving_stats_test.py"],
+    additional_deps = [
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform_test",
+    ],
+    tags = ["nomsan"],  # disable to avoid false positives from scipy.
+)
+
 cuda_py_test(
     name = "mvn_diag_test",
     size = "small",
@@ -405,6 +345,22 @@ cuda_py_test(
     tags = ["nomsan"],  # disable to avoid false positives from scipy.
 )
 
+cuda_py_test(
+    name = "vector_exponential_diag_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/vector_exponential_diag_test.py"],
+    additional_deps = [
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "vector_laplace_diag_test",
     size = "medium",
@@ -560,6 +516,24 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "vector_diffeomixture_test",
+    size = "large",
+    srcs = ["python/kernel_tests/vector_diffeomixture_test.py"],
+    additional_deps = [
+        ":bijectors_py",
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/linalg:linalg_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "conditional_transformed_distribution_test",
     size = "medium",
@@ -649,7 +623,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "affine_test",
-    size = "medium",
+    size = "large",
     srcs = ["python/kernel_tests/bijectors/affine_test.py"],
     additional_deps = [
         ":bijectors_py",
@@ -665,6 +639,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
     shard_count = 5,
+    tags = ["noasan"],  # times out b/63678675
 )
 
 cuda_py_test(
@@ -838,6 +813,25 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "sinh_arcsinh_test",
+    size = "small",
+    srcs = ["python/kernel_tests/bijectors/sinh_arcsinh_test.py"],
+    additional_deps = [
+        ":bijectors_py",
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+        "//tensorflow/contrib/linalg:linalg_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "softmax_centered_test",
     size = "small",
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 1fddad53689a0d74d00c1f210d81b83975fb1d37..7a2aebddd25392bf53c19844946368e176592c01 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -30,10 +30,12 @@ from tensorflow.contrib.distributions.python.ops.conditional_transformed_distrib
 from tensorflow.contrib.distributions.python.ops.deterministic import *
 from tensorflow.contrib.distributions.python.ops.distribution_util import matrix_diag_transform
 from tensorflow.contrib.distributions.python.ops.distribution_util import softplus_inverse
+from tensorflow.contrib.distributions.python.ops.estimator import *
 from tensorflow.contrib.distributions.python.ops.geometric import *
 from tensorflow.contrib.distributions.python.ops.inverse_gamma import *
 from tensorflow.contrib.distributions.python.ops.logistic import *
 from tensorflow.contrib.distributions.python.ops.mixture import *
+from tensorflow.contrib.distributions.python.ops.moving_stats import *
 from tensorflow.contrib.distributions.python.ops.mvn_diag import *
 from tensorflow.contrib.distributions.python.ops.mvn_diag_plus_low_rank import *
 from tensorflow.contrib.distributions.python.ops.mvn_full_covariance import *
@@ -46,6 +48,7 @@ from tensorflow.contrib.distributions.python.ops.quantized_distribution import *
 from tensorflow.contrib.distributions.python.ops.relaxed_bernoulli import *
 from tensorflow.contrib.distributions.python.ops.relaxed_onehot_categorical import *
 from tensorflow.contrib.distributions.python.ops.sample_stats import *
+from tensorflow.contrib.distributions.python.ops.vector_exponential_diag import *
 from tensorflow.contrib.distributions.python.ops.vector_laplace_diag import *
 from tensorflow.contrib.distributions.python.ops.wishart import *
 from tensorflow.python.ops.distributions.bernoulli import *
@@ -101,6 +104,7 @@ _allowed_symbols = [
     'VectorDeterministic',
     'Exponential',
     'ExponentialWithSoftplusRate',
+    'VectorExponentialDiag',
     'Gamma',
     'GammaWithSoftplusConcentrationRate',
     'Geometric',
@@ -124,6 +128,7 @@ _allowed_symbols = [
     'Dirichlet',
     'DirichletMultinomial',
     'Multinomial',
+    'VectorDiffeomixture',
     'VectorLaplaceDiag',
     'WishartCholesky',
     'WishartFull',
@@ -140,7 +145,10 @@ _allowed_symbols = [
     'normal_conjugates_known_scale_posterior',
     'normal_conjugates_known_scale_predictive',
     'softplus_inverse',
-    'percentile'
+    'percentile',
+    'assign_exponential_moving_mean_variance',
+    'exponential_moving_mean_variance',
+    'estimator_head_distribution_regression',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
index e8fd6aa2f73fa3457333483111379f0d987801ff..2c4b8277d01c7a2929fdde7babf809f2c16f730b 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
@@ -111,28 +111,51 @@ class AffineBijectorTest(test.TestCase):
         self.assertAllClose(-np.log(2.),
                             run(bijector.inverse_log_det_jacobian, x))
 
-  def testOneBatchScalarViaIdentity(self):
+  def testOneBatchScalarViaIdentityIn64BitUserProvidesShiftOnly(self):
     with self.test_session() as sess:
 
       def static_run(fun, x):
         return fun(x).eval()
 
       def dynamic_run(fun, x_value):
-        x_value = np.array(x_value)
-        x = array_ops.placeholder(dtypes.float32, name="x")
+        x_value = np.array(x_value).astype(np.float64)
+        x = array_ops.placeholder(dtypes.float64, name="x")
         return sess.run(fun(x), feed_dict={x: x_value})
 
       for run in (static_run, dynamic_run):
-        mu = [1.]
+        mu = np.float64([1.])
         # One batch, scalar.
         # Corresponds to scale = 1.
         bijector = Affine(shift=mu, event_ndims=0)
         self.assertEqual(0, bijector.event_ndims.eval())  # "is scalar"
-        x = [1.]  # One sample from one batches.
+        x = np.float64([1.])  # One sample from one batches.
         self.assertAllClose([2.], run(bijector.forward, x))
         self.assertAllClose([0.], run(bijector.inverse, x))
         self.assertAllClose(0., run(bijector.inverse_log_det_jacobian, x))
 
+  def testOneBatchScalarViaIdentityIn64BitUserProvidesMultiplierOnly(self):
+    with self.test_session() as sess:
+
+      def static_run(fun, x):
+        return fun(x).eval()
+
+      def dynamic_run(fun, x_value):
+        x_value = np.array(x_value).astype(np.float64)
+        x = array_ops.placeholder(dtypes.float64, name="x")
+        return sess.run(fun(x), feed_dict={x: x_value})
+
+      for run in (static_run, dynamic_run):
+        multiplier = np.float64([2.])
+        # One batch, scalar.
+        # Corresponds to scale = 2, shift = 0.
+        bijector = Affine(scale_identity_multiplier=multiplier, event_ndims=0)
+        self.assertEqual(0, bijector.event_ndims.eval())  # "is scalar"
+        x = np.float64([1.])  # One sample from one batches.
+        self.assertAllClose([2.], run(bijector.forward, x))
+        self.assertAllClose([0.5], run(bijector.inverse, x))
+        self.assertAllClose([np.log(0.5)],
+                            run(bijector.inverse_log_det_jacobian, x))
+
   def testOneBatchScalarViaDiag(self):
     with self.test_session() as sess:
 
@@ -621,15 +644,17 @@ class AffineBijectorTest(test.TestCase):
           # Has zero on the diagonal.
           scale_diag=[0., 1],
           validate_args=True)
-      with self.assertRaisesOpError("Condition x > 0"):
+      with self.assertRaisesOpError("diagonal part must be non-zero"):
         bijector.forward([1., 1.]).eval()
 
   def testEventNdimsLargerThanOneRaises(self):
     with self.test_session():
       mu = [1., -1]
-      # Scale corresponds to 2x2 identity matrix.
-      bijector = Affine(shift=mu, event_ndims=2, validate_args=True)
-      bijector.forward([1., 1.]).eval()
+      with self.assertRaisesRegexp(
+          ValueError, (r"event_ndims\(2\) was not 0 or 1")):
+        # Scale corresponds to 2x2 identity matrix.
+        bijector = Affine(shift=mu, event_ndims=2, validate_args=True)
+        bijector.forward([1., 1.]).eval()
 
   def testScaleZeroScalarRaises(self):
     with self.test_session():
@@ -637,24 +662,23 @@ class AffineBijectorTest(test.TestCase):
       # Check Identity matrix with zero scaling.
       bijector = Affine(
           shift=mu,
-          scale_identity_multiplier=0.0,
+          scale_identity_multiplier=0.,
           event_ndims=0,
           validate_args=True)
-      with self.assertRaisesOpError("Condition x > 0"):
+      with self.assertRaisesOpError("identity_multiplier should be non-zero"):
         bijector.forward(1.).eval()
 
       # Check Diag matrix with zero scaling.
       bijector = Affine(
           shift=mu, scale_diag=[0.0], event_ndims=0, validate_args=True)
-      with self.assertRaisesOpError("Condition x > 0"):
+      with self.assertRaisesOpError("diagonal part must be non-zero"):
         bijector.forward(1.).eval()
 
   def testScalarCongruency(self):
     with self.test_session():
       bijector = Affine(
           shift=3.6, scale_identity_multiplier=0.42, event_ndims=0)
-      assert_scalar_congruency(
-          bijector, lower_x=-2., upper_x=2.)
+      assert_scalar_congruency(bijector, lower_x=-2., upper_x=2.)
 
   def _makeScale(self,
                  x,
@@ -677,11 +701,7 @@ class AffineBijectorTest(test.TestCase):
     if c is None and d1 is None and tril is None:
       # Special case when no scale args are passed in. This means use an
       # identity matrix.
-      if v is None and d2 is None:
-        c = 1.
-      # No scale.
-      else:
-        return None
+      c = 1.
 
     matrix = np.float32(0.)
     if c is not None:
@@ -810,13 +830,6 @@ class AffineBijectorTest(test.TestCase):
         x=np.array(
             [1., 2], dtype=np.float32))
 
-  def testScalePropertyAssertsCorrectly(self):
-    with self.test_session():
-      with self.assertRaises(NotImplementedError):
-        scale = Affine(  # pylint:disable=unused-variable
-            scale_tril=[[1., 0], [2, 1]],
-            scale_perturb_factor=[2., 1.]).scale
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1159d13f379aebff92eb3b6144e7ca806b162d0
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_test.py
@@ -0,0 +1,127 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for SinhArcsinh Bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+# pylint: disable=g-importing-member
+from tensorflow.contrib.distributions.python.ops.bijectors.sinh_arcsinh import SinhArcsinh
+from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
+from tensorflow.python.platform import test
+
+# pylint: enable=g-importing-member
+
+
+class SinhArcsinhBijectorTest(test.TestCase):
+  """Tests correctness of the power transformation."""
+
+  def testBijectorVersusNumpyRewriteOfBasicFunctions(self):
+    with self.test_session():
+      skewness = 0.2
+      tailweight = 2.0
+      bijector = SinhArcsinh(
+          skewness=skewness,
+          tailweight=tailweight,
+          event_ndims=1,
+          validate_args=True)
+      self.assertEqual("sinh_arcsinh", bijector.name)
+      x = np.array([[[-2.01], [2.], [1e-4]]]).astype(np.float32)
+      y = np.sinh((np.arcsinh(x) + skewness) * tailweight)
+      self.assertAllClose(y, bijector.forward(x).eval())
+      self.assertAllClose(x, bijector.inverse(y).eval())
+      self.assertAllClose(
+          np.sum(
+              np.log(np.cosh(np.arcsinh(y) / tailweight - skewness)) -
+              np.log(tailweight) - np.log(np.sqrt(y**2 + 1)),
+              axis=-1), bijector.inverse_log_det_jacobian(y).eval())
+      self.assertAllClose(
+          -bijector.inverse_log_det_jacobian(y).eval(),
+          bijector.forward_log_det_jacobian(x).eval(),
+          rtol=1e-4,
+          atol=0.)
+
+  def testLargerTailWeightPutsMoreWeightInTails(self):
+    with self.test_session():
+      # Will broadcast together to shape [3, 2].
+      x = [-1., 1.]
+      tailweight = [[0.5], [1.0], [2.0]]
+      bijector = SinhArcsinh(tailweight=tailweight, validate_args=True)
+      y = bijector.forward(x).eval()
+
+      # x = -1, 1 should be mapped to points symmetric about 0
+      self.assertAllClose(y[:, 0], -1. * y[:, 1])
+
+      # forward(1) should increase as tailweight increases, since higher
+      # tailweight should map 1 to a larger number.
+      forward_1 = y[:, 1]  # The positive values of y.
+      self.assertLess(forward_1[0], forward_1[1])
+      self.assertLess(forward_1[1], forward_1[2])
+
+  def testSkew(self):
+    with self.test_session():
+      # Will broadcast together to shape [3, 2].
+      x = [-1., 1.]
+      skewness = [[-1.], [0.], [1.]]
+      bijector = SinhArcsinh(skewness=skewness, validate_args=True)
+      y = bijector.forward(x).eval()
+
+      # For skew < 0, |forward(-1)| > |forward(1)|
+      self.assertGreater(np.abs(y[0, 0]), np.abs(y[0, 1]))
+
+      # For skew = 0, |forward(-1)| = |forward(1)|
+      self.assertAllClose(np.abs(y[1, 0]), np.abs(y[1, 1]))
+
+      # For skew > 0, |forward(-1)| < |forward(1)|
+      self.assertLess(np.abs(y[2, 0]), np.abs(y[2, 1]))
+
+  def testScalarCongruencySkewness1Tailweight0p5(self):
+    with self.test_session():
+      bijector = SinhArcsinh(skewness=1.0, tailweight=0.5, validate_args=True)
+      assert_scalar_congruency(bijector, lower_x=-2., upper_x=2.0, rtol=0.05)
+
+  def testScalarCongruencySkewnessNeg1Tailweight1p5(self):
+    with self.test_session():
+      bijector = SinhArcsinh(skewness=-1.0, tailweight=1.5, validate_args=True)
+      assert_scalar_congruency(bijector, lower_x=-2., upper_x=2.0, rtol=0.05)
+
+  def testBijectiveAndFiniteSkewnessNeg1Tailweight0p5(self):
+    with self.test_session():
+      bijector = SinhArcsinh(skewness=-1., tailweight=0.5, validate_args=True)
+      # Increasing upper logspace limit to 10 results in Inf due to y**2 being
+      # Inf.
+      x = np.concatenate((-np.logspace(-2, 9, 1000), [0], np.logspace(
+          -2, 9, 1000))).astype(np.float32)
+      assert_bijective_and_finite(bijector, x, x, rtol=1e-3)
+
+  def testBijectiveAndFiniteSkewness2Tailweight3(self):
+    with self.test_session():
+      bijector = SinhArcsinh(skewness=1., tailweight=3., validate_args=True)
+      x = np.concatenate((-np.logspace(-2, 5, 1000), [0], np.logspace(
+          -2, 5, 1000))).astype(np.float32)
+      assert_bijective_and_finite(bijector, x, x, rtol=1e-3)
+
+  def testZeroTailweightRaises(self):
+    with self.test_session():
+      with self.assertRaisesOpError("not positive"):
+        SinhArcsinh(tailweight=0., validate_args=True).forward(1.0).eval()
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
index 1c67a1b8f6e7eba052555e0f930e1db80d8f1b12..6f67f86b9b40e56dcbb2f7abf691625a91cb0b66 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
@@ -18,116 +18,199 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
+import itertools
 
 import numpy as np
-from scipy import special
 
 from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.contrib.linalg.python.ops import linear_operator_diag
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradient_checker
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
-from tensorflow.python.platform import tf_logging as logging
 
 
-class AssertCloseTest(test.TestCase):
-
-  def testAssertCloseIntegerDtype(self):
-    x = array_ops.placeholder(dtypes.int32)
-    y = x
-    z = array_ops.placeholder(dtypes.int32)
-    feed_dict = {x: [1, 5, 10, 15, 20], z: [2, 5, 10, 15, 20]}
+def _powerset(x):
+  s = list(x)
+  return itertools.chain.from_iterable(
+      itertools.combinations(s, r) for r in range(len(s) + 1))
+
+
+def _matrix_diag(d):
+  """Batch version of np.diag."""
+  orig_shape = d.shape
+  d = np.reshape(d, (int(np.prod(d.shape[:-1])), d.shape[-1]))
+  diag_list = []
+  for i in range(d.shape[0]):
+    diag_list.append(np.diag(d[i, ...]))
+  return np.reshape(diag_list, orig_shape + (d.shape[-1],))
+
+
+def _make_tril_scale(
+    loc=None,
+    scale_tril=None,
+    scale_diag=None,
+    scale_identity_multiplier=None,
+    shape_hint=None):
+  if scale_tril is not None:
+    scale_tril = np.tril(scale_tril)
+    if scale_diag is not None:
+      scale_tril += _matrix_diag(np.array(scale_diag, dtype=np.float32))
+    if scale_identity_multiplier is not None:
+      scale_tril += (
+          scale_identity_multiplier * _matrix_diag(np.ones(
+              [scale_tril.shape[-1]], dtype=np.float32)))
+    return scale_tril
+  return _make_diag_scale(
+      loc, scale_diag, scale_identity_multiplier, shape_hint)
+
+
+def _make_diag_scale(
+    loc=None,
+    scale_diag=None,
+    scale_identity_multiplier=None,
+    shape_hint=None):
+  if scale_diag is not None:
+    scale_diag = np.asarray(scale_diag)
+    if scale_identity_multiplier is not None:
+      scale_diag += scale_identity_multiplier
+    return _matrix_diag(scale_diag)
+
+  if loc is None and shape_hint is None:
+    return None
+
+  if shape_hint is None:
+    shape_hint = loc.shape[-1]
+  if scale_identity_multiplier is None:
+    scale_identity_multiplier = 1.
+  return scale_identity_multiplier * np.diag(np.ones(shape_hint))
+
+
+class MakeTrilScaleTest(test.TestCase):
+
+  def _testLegalInputs(
+      self, loc=None, shape_hint=None, scale_params=None):
+    for args in _powerset(scale_params.items()):
+      with self.test_session():
+        args = dict(args)
+
+        scale_args = dict({
+            "loc": loc,
+            "shape_hint": shape_hint}, **args)
+        expected_scale = _make_tril_scale(**scale_args)
+        if expected_scale is None:
+          # Not enough shape information was specified.
+          with self.assertRaisesRegexp(ValueError, ("is specified.")):
+            scale = distribution_util.make_tril_scale(**scale_args)
+            scale.to_dense().eval()
+        else:
+          scale = distribution_util.make_tril_scale(**scale_args)
+          self.assertAllClose(expected_scale, scale.to_dense().eval())
+
+  def testLegalInputs(self):
+    self._testLegalInputs(
+        loc=np.array([-1., -1.], dtype=np.float32),
+        shape_hint=2,
+        scale_params={
+            "scale_identity_multiplier": 2.,
+            "scale_diag": [2., 3.],
+            "scale_tril": [[1., 0.],
+                           [-3., 3.]],
+        })
+
+  def testLegalInputsMultidimensional(self):
+    self._testLegalInputs(
+        loc=np.array([[[-1., -1., 2.], [-2., -3., 4.]]], dtype=np.float32),
+        shape_hint=3,
+        scale_params={
+            "scale_identity_multiplier": 2.,
+            "scale_diag": [[[2., 3., 4.], [3., 4., 5.]]],
+            "scale_tril": [[[[1., 0., 0.],
+                             [-3., 3., 0.],
+                             [1., -2., 1.]],
+                            [[2., 1., 0.],
+                             [-4., 7., 0.],
+                             [1., -1., 1.]]]]
+        })
+
+  def testZeroTriU(self):
     with self.test_session():
-      with ops.control_dependencies([distribution_util.assert_close(x, y)]):
-        array_ops.identity(x).eval(feed_dict=feed_dict)
-
-      with ops.control_dependencies([distribution_util.assert_close(y, x)]):
-        array_ops.identity(x).eval(feed_dict=feed_dict)
-
-      with self.assertRaisesOpError("Condition x ~= y"):
-        with ops.control_dependencies([distribution_util.assert_close(x, z)]):
-          array_ops.identity(x).eval(feed_dict=feed_dict)
-
-      with self.assertRaisesOpError("Condition x ~= y"):
-        with ops.control_dependencies([distribution_util.assert_close(y, z)]):
-          array_ops.identity(y).eval(feed_dict=feed_dict)
+      scale = distribution_util.make_tril_scale(scale_tril=[[1., 1], [1., 1.]])
+      self.assertAllClose([[1., 0], [1., 1.]], scale.to_dense().eval())
 
-  def testAssertCloseNonIntegerDtype(self):
-    x = array_ops.placeholder(dtypes.float32)
-    y = x + 1e-8
-    z = array_ops.placeholder(dtypes.float32)
-    feed_dict = {x: [1., 5, 10, 15, 20], z: [2., 5, 10, 15, 20]}
+  def testValidateArgs(self):
     with self.test_session():
-      with ops.control_dependencies([distribution_util.assert_close(x, y)]):
-        array_ops.identity(x).eval(feed_dict=feed_dict)
-
-      with ops.control_dependencies([distribution_util.assert_close(y, x)]):
-        array_ops.identity(x).eval(feed_dict=feed_dict)
-
-      with self.assertRaisesOpError("Condition x ~= y"):
-        with ops.control_dependencies([distribution_util.assert_close(x, z)]):
-          array_ops.identity(x).eval(feed_dict=feed_dict)
-
-      with self.assertRaisesOpError("Condition x ~= y"):
-        with ops.control_dependencies([distribution_util.assert_close(y, z)]):
-          array_ops.identity(y).eval(feed_dict=feed_dict)
-
-  def testAssertCloseEpsilon(self):
-    x = [0., 5, 10, 15, 20]
-    # x != y
-    y = [0.1, 5, 10, 15, 20]
-    # x = z
-    z = [1e-8, 5, 10, 15, 20]
+      with self.assertRaisesOpError("diagonal part must be non-zero"):
+        scale = distribution_util.make_tril_scale(
+            scale_tril=[[0., 1], [1., 1.]], validate_args=True)
+        scale.to_dense().eval()
+
+  def testAssertPositive(self):
     with self.test_session():
-      with ops.control_dependencies([distribution_util.assert_close(x, z)]):
-        array_ops.identity(x).eval()
-
-      with self.assertRaisesOpError("Condition x ~= y"):
-        with ops.control_dependencies([distribution_util.assert_close(x, y)]):
-          array_ops.identity(x).eval()
-
-      with self.assertRaisesOpError("Condition x ~= y"):
-        with ops.control_dependencies([distribution_util.assert_close(y, z)]):
-          array_ops.identity(y).eval()
-
-  def testAssertIntegerForm(self):
-    # This should only be detected as an integer.
-    x = array_ops.placeholder(dtypes.float32)
-    y = array_ops.placeholder(dtypes.float32)
-    # First component isn't less than float32.eps = 1e-7
-    z = array_ops.placeholder(dtypes.float32)
-    # This shouldn"t be detected as an integer.
-    w = array_ops.placeholder(dtypes.float32)
-    feed_dict = {x: [1., 5, 10, 15, 20], y: [1.1, 5, 10, 15, 20],
-                 z: [1.0001, 5, 10, 15, 20], w: [1e-8, 5, 10, 15, 20]}
+      with self.assertRaisesOpError("diagonal part must be positive"):
+        scale = distribution_util.make_tril_scale(
+            scale_tril=[[-1., 1], [1., 1.]],
+            validate_args=True,
+            assert_positive=True)
+        scale.to_dense().eval()
+
+
+class MakeDiagScaleTest(test.TestCase):
+
+  def _testLegalInputs(
+      self, loc=None, shape_hint=None, scale_params=None):
+    for args in _powerset(scale_params.items()):
+      with self.test_session():
+        args = dict(args)
+
+        scale_args = dict({
+            "loc": loc,
+            "shape_hint": shape_hint}, **args)
+        expected_scale = _make_diag_scale(**scale_args)
+        if expected_scale is None:
+          # Not enough shape information was specified.
+          with self.assertRaisesRegexp(ValueError, ("is specified.")):
+            scale = distribution_util.make_diag_scale(**scale_args)
+            scale.to_dense().eval()
+        else:
+          scale = distribution_util.make_diag_scale(**scale_args)
+          self.assertAllClose(expected_scale, scale.to_dense().eval())
+
+  def testLegalInputs(self):
+    self._testLegalInputs(
+        loc=np.array([-1., -1.], dtype=np.float32),
+        shape_hint=2,
+        scale_params={
+            "scale_identity_multiplier": 2.,
+            "scale_diag": [2., 3.]
+        })
+
+  def testLegalInputsMultidimensional(self):
+    self._testLegalInputs(
+        loc=np.array([[[-1., -1., 2.], [-2., -3., 4.]]], dtype=np.float32),
+        shape_hint=3,
+        scale_params={
+            "scale_identity_multiplier": 2.,
+            "scale_diag": [[[2., 3., 4.], [3., 4., 5.]]]
+        })
+
+  def testValidateArgs(self):
     with self.test_session():
-      with ops.control_dependencies([distribution_util.assert_integer_form(x)]):
-        array_ops.identity(x).eval(feed_dict=feed_dict)
-
-      with self.assertRaisesOpError("x has non-integer components"):
-        with ops.control_dependencies(
-            [distribution_util.assert_integer_form(y)]):
-          array_ops.identity(y).eval(feed_dict=feed_dict)
-
-      with self.assertRaisesOpError("x has non-integer components"):
-        with ops.control_dependencies(
-            [distribution_util.assert_integer_form(z)]):
-          array_ops.identity(z).eval(feed_dict=feed_dict)
+      with self.assertRaisesOpError("diagonal part must be non-zero"):
+        scale = distribution_util.make_diag_scale(
+            scale_diag=[[0., 1], [1., 1.]], validate_args=True)
+        scale.to_dense().eval()
 
-      with self.assertRaisesOpError("x has non-integer components"):
-        with ops.control_dependencies(
-            [distribution_util.assert_integer_form(w)]):
-          array_ops.identity(w).eval(feed_dict=feed_dict)
+  def testAssertPositive(self):
+    with self.test_session():
+      with self.assertRaisesOpError("diagonal part must be positive"):
+        scale = distribution_util.make_diag_scale(
+            scale_diag=[[-1., 1], [1., 1.]],
+            validate_args=True,
+            assert_positive=True)
+        scale.to_dense().eval()
 
 
 class ShapesFromLocAndScaleTest(test.TestCase):
@@ -204,460 +287,56 @@ class ShapesFromLocAndScaleTest(test.TestCase):
       self.assertAllEqual([3], event_shape)
 
 
-class GetLogitsAndProbsTest(test.TestCase):
-
-  def testGetLogitsAndProbsImproperArguments(self):
-    with self.test_session():
-      with self.assertRaises(ValueError):
-        distribution_util.get_logits_and_probs(logits=None, probs=None)
-
-      with self.assertRaises(ValueError):
-        distribution_util.get_logits_and_probs(logits=[0.1], probs=[0.1])
-
-  def testGetLogitsAndProbsLogits(self):
-    p = np.array([0.01, 0.2, 0.5, 0.7, .99], dtype=np.float32)
-    logits = special.logit(p)
-
-    with self.test_session():
-      new_logits, new_p = distribution_util.get_logits_and_probs(
-          logits=logits, validate_args=True)
-
-      self.assertAllClose(p, new_p.eval())
-      self.assertAllClose(logits, new_logits.eval())
-
-  def testGetLogitsAndProbsLogitsMultidimensional(self):
-    p = np.array([0.2, 0.3, 0.5], dtype=np.float32)
-    logits = np.log(p)
-
-    with self.test_session():
-      new_logits, new_p = distribution_util.get_logits_and_probs(
-          logits=logits, multidimensional=True, validate_args=True)
-
-      self.assertAllClose(new_p.eval(), p)
-      self.assertAllClose(new_logits.eval(), logits)
-
-  def testGetLogitsAndProbsProbability(self):
-    p = np.array([0.01, 0.2, 0.5, 0.7, .99], dtype=np.float32)
-
-    with self.test_session():
-      new_logits, new_p = distribution_util.get_logits_and_probs(
-          probs=p, validate_args=True)
-
-      self.assertAllClose(special.logit(p), new_logits.eval())
-      self.assertAllClose(p, new_p.eval())
-
-  def testGetLogitsAndProbsProbabilityMultidimensional(self):
-    p = np.array([[0.3, 0.4, 0.3], [0.1, 0.5, 0.4]], dtype=np.float32)
-
-    with self.test_session():
-      new_logits, new_p = distribution_util.get_logits_and_probs(
-          probs=p, multidimensional=True, validate_args=True)
-
-      self.assertAllClose(np.log(p), new_logits.eval())
-      self.assertAllClose(p, new_p.eval())
-
-  def testGetLogitsAndProbsProbabilityValidateArgs(self):
-    p = [0.01, 0.2, 0.5, 0.7, .99]
-    # Component less than 0.
-    p2 = [-1, 0.2, 0.5, 0.3, .2]
-    # Component greater than 1.
-    p3 = [2, 0.2, 0.5, 0.3, .2]
+class TridiagTest(test.TestCase):
 
+  def testWorksCorrectlyNoBatches(self):
     with self.test_session():
-      _, prob = distribution_util.get_logits_and_probs(
-          probs=p, validate_args=True)
-      prob.eval()
-
-      with self.assertRaisesOpError("Condition x >= 0"):
-        _, prob = distribution_util.get_logits_and_probs(
-            probs=p2, validate_args=True)
-        prob.eval()
-
-      _, prob = distribution_util.get_logits_and_probs(
-          probs=p2, validate_args=False)
-      prob.eval()
-
-      with self.assertRaisesOpError("probs has components greater than 1"):
-        _, prob = distribution_util.get_logits_and_probs(
-            probs=p3, validate_args=True)
-        prob.eval()
-
-      _, prob = distribution_util.get_logits_and_probs(
-          probs=p3, validate_args=False)
-      prob.eval()
-
-  def testGetLogitsAndProbsProbabilityValidateArgsMultidimensional(self):
-    p = np.array([[0.3, 0.4, 0.3], [0.1, 0.5, 0.4]], dtype=np.float32)
-    # Component less than 0. Still sums to 1.
-    p2 = np.array([[-.3, 0.4, 0.9], [0.1, 0.5, 0.4]], dtype=np.float32)
-    # Component greater than 1. Does not sum to 1.
-    p3 = np.array([[1.3, 0.0, 0.0], [0.1, 0.5, 0.4]], dtype=np.float32)
-    # Does not sum to 1.
-    p4 = np.array([[1.1, 0.3, 0.4], [0.1, 0.5, 0.4]], dtype=np.float32)
-
-    with self.test_session():
-      _, prob = distribution_util.get_logits_and_probs(
-          probs=p, multidimensional=True)
-      prob.eval()
-
-      with self.assertRaisesOpError("Condition x >= 0"):
-        _, prob = distribution_util.get_logits_and_probs(
-            probs=p2, multidimensional=True, validate_args=True)
-        prob.eval()
-
-      _, prob = distribution_util.get_logits_and_probs(
-          probs=p2, multidimensional=True, validate_args=False)
-      prob.eval()
-
-      with self.assertRaisesOpError(
-          "(probs has components greater than 1|probs does not sum to 1)"):
-        _, prob = distribution_util.get_logits_and_probs(
-            probs=p3, multidimensional=True, validate_args=True)
-        prob.eval()
-
-      _, prob = distribution_util.get_logits_and_probs(
-          probs=p3, multidimensional=True, validate_args=False)
-      prob.eval()
-
-      with self.assertRaisesOpError("probs does not sum to 1"):
-        _, prob = distribution_util.get_logits_and_probs(
-            probs=p4, multidimensional=True, validate_args=True)
-        prob.eval()
-
-      _, prob = distribution_util.get_logits_and_probs(
-          probs=p4, multidimensional=True, validate_args=False)
-      prob.eval()
-
-
-class LogCombinationsTest(test.TestCase):
-
-  def testLogCombinationsBinomial(self):
-    n = [2, 5, 12, 15]
-    k = [1, 2, 4, 11]
-    log_combs = np.log(special.binom(n, k))
-
-    with self.test_session():
-      n = np.array(n, dtype=np.float32)
-      counts = [[1., 1], [2., 3], [4., 8], [11, 4]]
-      log_binom = distribution_util.log_combinations(n, counts)
-      self.assertEqual([4], log_binom.get_shape())
-      self.assertAllClose(log_combs, log_binom.eval())
-
-  def testLogCombinationsShape(self):
-    # Shape [2, 2]
-    n = [[2, 5], [12, 15]]
-
-    with self.test_session():
-      n = np.array(n, dtype=np.float32)
-      # Shape [2, 2, 4]
-      counts = [[[1., 1, 0, 0], [2., 2, 1, 0]], [[4., 4, 1, 3], [10, 1, 1, 4]]]
-      log_binom = distribution_util.log_combinations(n, counts)
-      self.assertEqual([2, 2], log_binom.get_shape())
-
-
-class DynamicShapeTest(test.TestCase):
-
-  def testSameDynamicShape(self):
-    with self.test_session():
-      scalar = constant_op.constant(2.0)
-      scalar1 = array_ops.placeholder(dtype=dtypes.float32)
-
-      vector = [0.3, 0.4, 0.5]
-      vector1 = array_ops.placeholder(dtype=dtypes.float32, shape=[None])
-      vector2 = array_ops.placeholder(dtype=dtypes.float32, shape=[None])
-
-      multidimensional = [[0.3, 0.4], [0.2, 0.6]]
-      multidimensional1 = array_ops.placeholder(
-          dtype=dtypes.float32, shape=[None, None])
-      multidimensional2 = array_ops.placeholder(
-          dtype=dtypes.float32, shape=[None, None])
-
-      # Scalar
-      self.assertTrue(
-          distribution_util.same_dynamic_shape(scalar, scalar1).eval({
-              scalar1: 2.0
-          }))
-
-      # Vector
-
-      self.assertTrue(
-          distribution_util.same_dynamic_shape(vector, vector1).eval({
-              vector1: [2.0, 3.0, 4.0]
-          }))
-      self.assertTrue(
-          distribution_util.same_dynamic_shape(vector1, vector2).eval({
-              vector1: [2.0, 3.0, 4.0],
-              vector2: [2.0, 3.5, 6.0]
-          }))
-
-      # Multidimensional
-      self.assertTrue(
-          distribution_util.same_dynamic_shape(
-              multidimensional, multidimensional1).eval({
-                  multidimensional1: [[2.0, 3.0], [3.0, 4.0]]
-              }))
-      self.assertTrue(
-          distribution_util.same_dynamic_shape(
-              multidimensional1, multidimensional2).eval({
-                  multidimensional1: [[2.0, 3.0], [3.0, 4.0]],
-                  multidimensional2: [[1.0, 3.5], [6.3, 2.3]]
-              }))
-
-      # Scalar, X
-      self.assertFalse(
-          distribution_util.same_dynamic_shape(scalar, vector1).eval({
-              vector1: [2.0, 3.0, 4.0]
-          }))
-      self.assertFalse(
-          distribution_util.same_dynamic_shape(scalar1, vector1).eval({
-              scalar1: 2.0,
-              vector1: [2.0, 3.0, 4.0]
-          }))
-      self.assertFalse(
-          distribution_util.same_dynamic_shape(scalar, multidimensional1).eval({
-              multidimensional1: [[2.0, 3.0], [3.0, 4.0]]
-          }))
-      self.assertFalse(
-          distribution_util.same_dynamic_shape(scalar1, multidimensional1).eval(
-              {
-                  scalar1: 2.0,
-                  multidimensional1: [[2.0, 3.0], [3.0, 4.0]]
-              }))
-
-      # Vector, X
-      self.assertFalse(
-          distribution_util.same_dynamic_shape(vector, vector1).eval({
-              vector1: [2.0, 3.0]
-          }))
-      self.assertFalse(
-          distribution_util.same_dynamic_shape(vector1, vector2).eval({
-              vector1: [2.0, 3.0, 4.0],
-              vector2: [6.0]
-          }))
-      self.assertFalse(
-          distribution_util.same_dynamic_shape(vector, multidimensional1).eval({
-              multidimensional1: [[2.0, 3.0], [3.0, 4.0]]
-          }))
-      self.assertFalse(
-          distribution_util.same_dynamic_shape(vector1, multidimensional1).eval(
-              {
-                  vector1: [2.0, 3.0, 4.0],
-                  multidimensional1: [[2.0, 3.0], [3.0, 4.0]]
-              }))
-
-      # Multidimensional, X
-      self.assertFalse(
-          distribution_util.same_dynamic_shape(
-              multidimensional, multidimensional1).eval({
-                  multidimensional1: [[1.0, 3.5, 5.0], [6.3, 2.3, 7.1]]
-              }))
-      self.assertFalse(
-          distribution_util.same_dynamic_shape(
-              multidimensional1, multidimensional2).eval({
-                  multidimensional1: [[2.0, 3.0], [3.0, 4.0]],
-                  multidimensional2: [[1.0, 3.5, 5.0], [6.3, 2.3, 7.1]]
-              }))
-
-
-class RotateTransposeTest(test.TestCase):
-
-  def _np_rotate_transpose(self, x, shift):
-    if not isinstance(x, np.ndarray):
-      x = np.array(x)
-    return np.transpose(x, np.roll(np.arange(len(x.shape)), shift))
-
-  def testRollStatic(self):
-    with self.test_session():
-      with self.assertRaisesRegexp(ValueError, "None values not supported."):
-        distribution_util.rotate_transpose(None, 1)
-      for x in (np.ones(1), np.ones((2, 1)), np.ones((3, 2, 1))):
-        for shift in np.arange(-5, 5):
-          y = distribution_util.rotate_transpose(x, shift)
-          self.assertAllEqual(self._np_rotate_transpose(x, shift), y.eval())
-          self.assertAllEqual(np.roll(x.shape, shift), y.get_shape().as_list())
-
-  def testRollDynamic(self):
-    with self.test_session() as sess:
-      x = array_ops.placeholder(dtypes.float32)
-      shift = array_ops.placeholder(dtypes.int32)
-      for x_value in (np.ones(
-          1, dtype=x.dtype.as_numpy_dtype()), np.ones(
-              (2, 1), dtype=x.dtype.as_numpy_dtype()), np.ones(
-                  (3, 2, 1), dtype=x.dtype.as_numpy_dtype())):
-        for shift_value in np.arange(-5, 5):
-          self.assertAllEqual(
-              self._np_rotate_transpose(x_value, shift_value),
-              sess.run(distribution_util.rotate_transpose(x, shift),
-                       feed_dict={x: x_value,
-                                  shift: shift_value}))
-
-
-class PickVectorTest(test.TestCase):
-
-  def testCorrectlyPicksVector(self):
-    with self.test_session():
-      x = np.arange(10, 12)
-      y = np.arange(15, 18)
-      self.assertAllEqual(x,
-                          distribution_util.pick_vector(
-                              math_ops.less(0, 5), x, y).eval())
-      self.assertAllEqual(y,
-                          distribution_util.pick_vector(
-                              math_ops.less(5, 0), x, y).eval())
-      self.assertAllEqual(x,
-                          distribution_util.pick_vector(
-                              constant_op.constant(True), x, y))  # No eval.
-      self.assertAllEqual(y,
-                          distribution_util.pick_vector(
-                              constant_op.constant(False), x, y))  # No eval.
-
-
-class FillLowerTriangularTest(test.TestCase):
-
-  def setUp(self):
-    self._rng = np.random.RandomState(42)
-
-  def _fill_lower_triangular(self, x):
-    """Numpy implementation of `fill_lower_triangular`."""
-    x = np.asarray(x)
-    d = x.shape[-1]
-    # d = n(n+1)/2 implies n is:
-    n = int(0.5 * (math.sqrt(1. + 8. * d) - 1.))
-    ids = np.tril_indices(n)
-    y = np.zeros(list(x.shape[:-1]) + [n, n], dtype=x.dtype)
-    y[..., ids[0], ids[1]] = x
-    return y
-
-  def testCorrectlyMakes1x1LowerTril(self):
-    with self.test_session():
-      x = ops.convert_to_tensor(self._rng.randn(3, 1))
-      expected = self._fill_lower_triangular(tensor_util.constant_value(x))
-      actual = distribution_util.fill_lower_triangular(x, validate_args=True)
-      self.assertAllEqual(expected.shape, actual.get_shape())
-      self.assertAllEqual(expected, actual.eval())
-
-  def testCorrectlyMakesNoBatchLowerTril(self):
-    with self.test_session():
-      x = ops.convert_to_tensor(self._rng.randn(10))
-      expected = self._fill_lower_triangular(tensor_util.constant_value(x))
-      actual = distribution_util.fill_lower_triangular(x, validate_args=True)
-      self.assertAllEqual(expected.shape, actual.get_shape())
-      self.assertAllEqual(expected, actual.eval())
-      g = gradients_impl.gradients(
-          distribution_util.fill_lower_triangular(x), x)
-      self.assertAllEqual(np.tri(4).reshape(-1), g[0].values.eval())
-
-  def testCorrectlyMakesBatchLowerTril(self):
-    with self.test_session():
-      x = ops.convert_to_tensor(self._rng.randn(2, 2, 6))
-      expected = self._fill_lower_triangular(tensor_util.constant_value(x))
-      actual = distribution_util.fill_lower_triangular(x, validate_args=True)
-      self.assertAllEqual(expected.shape, actual.get_shape())
-      self.assertAllEqual(expected, actual.eval())
       self.assertAllEqual(
-          np.ones((2, 2, 6)),
-          gradients_impl.gradients(
-              distribution_util.fill_lower_triangular(x), x)[0].eval())
-
-
-class GenNewSeedTest(test.TestCase):
-
-  def testOnlyNoneReturnsNone(self):
-    self.assertFalse(distribution_util.gen_new_seed(0, "salt") is None)
-    self.assertTrue(distribution_util.gen_new_seed(None, "salt") is None)
-
-
-# TODO(jvdillon): Merge this test back into:
-# tensorflow/python/kernel_tests/softplus_op_test.py
-# once TF core is accepting new ops.
-class SoftplusTest(test.TestCase):
-
-  def _npSoftplus(self, np_features):
-    np_features = np.asarray(np_features)
-    zero = np.asarray(0).astype(np_features.dtype)
-    return np.logaddexp(zero, np_features)
-
-  def _testSoftplus(self, np_features, use_gpu=False):
-    np_features = np.asarray(np_features)
-    np_softplus = self._npSoftplus(np_features)
-    with self.test_session(use_gpu=use_gpu) as sess:
-      softplus = nn_ops.softplus(np_features)
-      softplus_inverse = distribution_util.softplus_inverse(softplus)
-      [tf_softplus, tf_softplus_inverse] = sess.run([
-          softplus, softplus_inverse])
-    self.assertAllCloseAccordingToType(np_softplus, tf_softplus)
-    rtol = {"float16": 0.07, "float32": 0.003, "float64": 0.002}.get(
-        str(np_features.dtype), 1e-6)
-    # This will test that we correctly computed the inverse by verifying we
-    # recovered the original input.
-    self.assertAllCloseAccordingToType(
-        np_features, tf_softplus_inverse,
-        atol=0., rtol=rtol)
-    self.assertAllEqual(np.ones_like(tf_softplus).astype(np.bool),
-                        tf_softplus > 0)
-
-    self.assertShapeEqual(np_softplus, softplus)
-    self.assertShapeEqual(np_softplus, softplus_inverse)
-
-    self.assertAllEqual(np.ones_like(tf_softplus).astype(np.bool),
-                        np.isfinite(tf_softplus))
-    self.assertAllEqual(np.ones_like(tf_softplus_inverse).astype(np.bool),
-                        np.isfinite(tf_softplus_inverse))
-
-  def testNumbers(self):
-    for t in [np.float16, np.float32, np.float64]:
-      lower = {np.float16: -15, np.float32: -50, np.float64: -50}.get(t, -100)
-      upper = {np.float16: 50, np.float32: 50, np.float64: 50}.get(t, 100)
-      self._testSoftplus(
-          np.array(np.linspace(lower, upper, int(1e3)).astype(t)).reshape(
-              [2, -1]),
-          use_gpu=False)
-      self._testSoftplus(
-          np.array(np.linspace(lower, upper, int(1e3)).astype(t)).reshape(
-              [2, -1]),
-          use_gpu=True)
-      log_eps = np.log(np.finfo(t).eps)
-      one = t(1)
-      ten = t(10)
-      self._testSoftplus(
-          [
-              log_eps, log_eps - one, log_eps + one, log_eps - ten,
-              log_eps + ten, -log_eps, -log_eps - one, -log_eps + one,
-              -log_eps - ten, -log_eps + ten
-          ],
-          use_gpu=False)
-      self._testSoftplus(
-          [
-              log_eps, log_eps - one, log_eps + one, log_eps - ten,
-              log_eps + ten - log_eps, -log_eps - one, -log_eps + one,
-              -log_eps - ten, -log_eps + ten
-          ],
-          use_gpu=True)
-
-  def testGradient(self):
+          [[4., 8., 0., 0.],
+           [1., 5., 9., 0.],
+           [0., 2., 6., 10.],
+           [0., 0., 3, 7.]],
+          distribution_util.tridiag(
+              [1., 2., 3.],
+              [4., 5., 6., 7.],
+              [8., 9., 10.]).eval())
+
+  def testWorksCorrectlyBatches(self):
     with self.test_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.softplus(x, name="softplus")
-      x_init = np.asarray(
-          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
-          dtype=np.float32,
-          order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
-    logging.vlog(2, "softplus (float) gradient err = ", err)
-    self.assertLess(err, 1e-4)
-
-  def testInverseSoftplusGradientNeverNan(self):
+      self.assertAllClose(
+          [[[4., 8., 0., 0.],
+            [1., 5., 9., 0.],
+            [0., 2., 6., 10.],
+            [0., 0., 3, 7.]],
+           [[0.7, 0.1, 0.0, 0.0],
+            [0.8, 0.6, 0.2, 0.0],
+            [0.0, 0.9, 0.5, 0.3],
+            [0.0, 0.0, 1.0, 0.4]]],
+          distribution_util.tridiag(
+              [[1., 2., 3.],
+               [0.8, 0.9, 1.]],
+              [[4., 5., 6., 7.],
+               [0.7, 0.6, 0.5, 0.4]],
+              [[8., 9., 10.],
+               [0.1, 0.2, 0.3]]).eval(),
+          rtol=1e-5, atol=0.)
+
+  def testHandlesNone(self):
     with self.test_session():
-      # Note that this range contains both zero and inf.
-      x = constant_op.constant(np.logspace(-8, 6).astype(np.float16))
-      y = distribution_util.softplus_inverse(x)
-      grads = gradients_impl.gradients(y, x)[0].eval()
-      # Equivalent to `assertAllFalse` (if it existed).
-      self.assertAllEqual(np.zeros_like(grads).astype(np.bool), np.isnan(grads))
+      self.assertAllClose(
+          [[[4., 0., 0., 0.],
+            [0., 5., 0., 0.],
+            [0., 0., 6., 0.],
+            [0., 0., 0, 7.]],
+           [[0.7, 0.0, 0.0, 0.0],
+            [0.0, 0.6, 0.0, 0.0],
+            [0.0, 0.0, 0.5, 0.0],
+            [0.0, 0.0, 0.0, 0.4]]],
+          distribution_util.tridiag(
+              diag=[[4., 5., 6., 7.],
+                    [0.7, 0.6, 0.5, 0.4]]).eval(),
+          rtol=1e-5, atol=0.)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/estimator_test.py b/tensorflow/contrib/distributions/python/kernel_tests/estimator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ff0544c977a37d377fbc95f0e980ee55e429df6
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/estimator_test.py
@@ -0,0 +1,114 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for estimator.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import six
+
+from tensorflow.contrib.distributions.python.ops import estimator as estimator_lib
+from tensorflow.contrib.learn.python.learn.estimators import constants
+from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
+from tensorflow.contrib.learn.python.learn.estimators import model_fn
+from tensorflow.contrib.learn.python.learn.estimators.head_test import _assert_metrics
+from tensorflow.contrib.learn.python.learn.estimators.head_test import _assert_no_variables
+from tensorflow.contrib.learn.python.learn.estimators.head_test import _assert_summary_tags
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.platform import test
+
+
+class EstimatorHeadDistributionRegressionTest(test.TestCase):
+
+  def _assert_output_alternatives(self, model_fn_ops):
+    self.assertEquals({
+        None: constants.ProblemType.LINEAR_REGRESSION
+    }, {
+        k: v[0] for k, v in six.iteritems(model_fn_ops.output_alternatives)
+    })
+
+  def testNormalLocScaleLogits(self):
+    # We will bias logits[..., 1] so that: logits[..., 1]=0 implies scale=1.
+    scale_bias = np.log(np.expm1(1.))
+
+    def softplus(x):
+      return np.log1p(np.exp(x))
+
+    def actual_loss(logits, labels):
+      mu = actual_mean(logits)
+      sigma = actual_stddev(logits)
+      labels = np.squeeze(labels, -1)
+      z = (labels - mu) / sigma
+      loss = 0.5 * (z**2. + np.log(2. * np.pi)) + np.log(sigma)
+      return loss.mean()
+
+    def actual_mean(logits):
+      return logits[..., 0]
+
+    def actual_stddev(logits):
+      return softplus(logits[..., 1] + scale_bias)
+
+    def make_distribution_fn(logits):
+      return normal_lib.Normal(
+          loc=logits[..., 0],
+          scale=nn_ops.softplus(logits[..., 1] + scale_bias))
+
+    head = estimator_lib.estimator_head_distribution_regression(
+        make_distribution_fn,
+        logits_dimension=2)
+    labels = np.float32([[-1.],
+                         [0.],
+                         [1.]])
+    logits = np.float32([[0., -1],
+                         [1, 0.5],
+                         [-1, 1]])
+    with ops.Graph().as_default(), session.Session():
+      # Convert to tensor so we can index into head.distributions.
+      tflogits = ops.convert_to_tensor(logits, name="logits")
+      model_fn_ops = head.create_model_fn_ops(
+          {},
+          labels=labels,
+          mode=model_fn.ModeKeys.TRAIN,
+          train_op_fn=head_lib.no_op_train_fn,
+          logits=tflogits)
+      self._assert_output_alternatives(model_fn_ops)
+      _assert_summary_tags(self, ["loss"])
+      _assert_no_variables(self)
+      loss = actual_loss(logits, labels)
+      _assert_metrics(self, loss, {"loss": loss}, model_fn_ops)
+
+      # Now we verify the underlying distribution was correctly constructed.
+      expected_mean = logits[..., 0]
+      self.assertAllClose(
+          expected_mean,
+          head.distribution(tflogits).mean().eval(),
+          rtol=1e-6, atol=0.)
+
+      expected_stddev = softplus(logits[..., 1] + scale_bias)
+      self.assertAllClose(
+          expected_stddev,
+          head.distribution(tflogits).stddev().eval(),
+          rtol=1e-6, atol=0.)
+      # Should have created only one distribution.
+      self.assertEqual(1, len(head.distributions))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/geometric_test.py b/tensorflow/contrib/distributions/python/kernel_tests/geometric_test.py
index 9ef68c4c2cbdbfab48602d2fd98fe30acede06f3..87cdd0485a64b227061b5ee9e9162dc8093ad41d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/geometric_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/geometric_test.py
@@ -47,13 +47,13 @@ class GeometricTest(test.TestCase):
     invalid_ps = [-.01, -0.01, -2.]
     with self.test_session():
       with self.assertRaisesOpError("Condition x >= 0"):
-        geom = geometric.Geometric(probs=invalid_ps)
+        geom = geometric.Geometric(probs=invalid_ps, validate_args=True)
         geom.probs.eval()
 
     invalid_ps = [1.1, 3., 5.]
     with self.test_session():
       with self.assertRaisesOpError("Condition x <= y"):
-        geom = geometric.Geometric(probs=invalid_ps)
+        geom = geometric.Geometric(probs=invalid_ps, validate_args=True)
         geom.probs.eval()
 
   def testGeomLogPmf(self):
@@ -78,7 +78,7 @@ class GeometricTest(test.TestCase):
       probs = constant_op.constant([.9] * batch_size)
       x = array_ops.placeholder(dtypes.float32, shape=[6])
       feed_dict = {x: [2.5, 3.2, 4.3, 5.1, 6., 7.]}
-      geom = geometric.Geometric(probs=probs)
+      geom = geometric.Geometric(probs=probs, validate_args=True)
 
       with self.assertRaisesOpError("Condition x == y"):
         log_prob = geom.log_prob(x)
@@ -88,7 +88,7 @@ class GeometricTest(test.TestCase):
         log_prob = geom.log_prob(np.array([-1.], dtype=np.float32))
         log_prob.eval()
 
-      geom = geometric.Geometric(probs=probs, validate_args=False)
+      geom = geometric.Geometric(probs=probs)
       log_prob = geom.log_prob(x)
       self.assertEqual([6,], log_prob.get_shape())
       pmf = geom.prob(x)
@@ -216,7 +216,7 @@ class GeometricTest(test.TestCase):
 
   def testGeometricAtBoundary(self):
     with self.test_session():
-      geom = geometric.Geometric(probs=1.)
+      geom = geometric.Geometric(probs=1., validate_args=True)
 
       x = np.array([0., 2., 3., 4., 5., 6., 7.], dtype=np.float32)
       expected_log_prob = stats.geom.logpmf(x, [1.], loc=-1)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/moving_stats_test.py b/tensorflow/contrib/distributions/python/kernel_tests/moving_stats_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..329f168f424e73fbd2c8c53fb4ac37df2b96fa62
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/moving_stats_test.py
@@ -0,0 +1,108 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for computing moving-average statistics."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import moving_stats
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+rng = np.random.RandomState(0)
+
+
+class MovingReduceMeanVarianceTest(test.TestCase):
+
+  def test_assign_exponential_moving_mean_variance(self):
+    shape = [1, 2]
+    true_mean = np.array([[0., 3.]])
+    true_stddev = np.array([[1.1, 0.5]])
+    with self.test_session() as sess:
+      # Start "x" out with this mean.
+      mean_var = variables.Variable(array_ops.zeros_like(true_mean))
+      variance_var = variables.Variable(array_ops.ones_like(true_stddev))
+      x = random_ops.random_normal(shape, dtype=np.float64, seed=0)
+      x = true_stddev * x + true_mean
+      ema, emv = moving_stats.assign_exponential_moving_mean_variance(
+          mean_var, variance_var, x, decay=0.99)
+
+      self.assertEqual(ema.dtype.base_dtype, dtypes.float64)
+      self.assertEqual(emv.dtype.base_dtype, dtypes.float64)
+
+      # Run 1000 updates; moving averages should be near the true values.
+      variables.global_variables_initializer().run()
+      for _ in range(2000):
+        sess.run([ema, emv])
+
+      [mean_var_, variance_var_, ema_, emv_] = sess.run([
+          mean_var, variance_var, ema, emv])
+      # Test that variables are passed-through.
+      self.assertAllEqual(mean_var_, ema_)
+      self.assertAllEqual(variance_var_, emv_)
+      # Test that values are as expected.
+      self.assertAllClose(true_mean, ema_, rtol=0.005, atol=0.015)
+      self.assertAllClose(true_stddev**2., emv_, rtol=0.06, atol=0.)
+
+      # Change the mean, var then update some more. Moving averages should
+      # re-converge.
+      sess.run([
+          mean_var.assign(np.array([[-1., 2.]])),
+          variance_var.assign(np.array([[2., 1.]])),
+      ])
+      for _ in range(2000):
+        sess.run([ema, emv])
+
+      [mean_var_, variance_var_, ema_, emv_] = sess.run([
+          mean_var, variance_var, ema, emv])
+      # Test that variables are passed-through.
+      self.assertAllEqual(mean_var_, ema_)
+      self.assertAllEqual(variance_var_, emv_)
+      # Test that values are as expected.
+      self.assertAllClose(true_mean, ema_, rtol=0.005, atol=0.015)
+      self.assertAllClose(true_stddev**2., emv_, rtol=0.1, atol=0.)
+
+  def test_exponential_moving_mean_variance(self):
+    shape = [1, 2]
+    true_mean = np.array([[0., 3.]])
+    true_stddev = np.array([[1.1, 0.5]])
+    with self.test_session() as sess:
+      # Start "x" out with this mean.
+      x = random_ops.random_normal(shape, dtype=np.float64, seed=0)
+      x = true_stddev * x + true_mean
+      ema, emv = moving_stats.exponential_moving_mean_variance(
+          x, decay=0.99)
+
+      self.assertEqual(ema.dtype.base_dtype, dtypes.float64)
+      self.assertEqual(emv.dtype.base_dtype, dtypes.float64)
+
+      # Run 1000 updates; moving averages should be near the true values.
+      variables.global_variables_initializer().run()
+      for _ in range(2000):
+        sess.run([ema, emv])
+
+      [ema_, emv_] = sess.run([ema, emv])
+      self.assertAllClose(true_mean, ema_, rtol=0.005, atol=0.015)
+      self.assertAllClose(true_stddev**2., emv_, rtol=0.06, atol=0.)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
index 3f4582eb7ee1319684a9209465046bb241337f9d..43e302475b49ef5245ba324c35ca294b51a566b6 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
@@ -24,7 +24,12 @@ from tensorflow.contrib import distributions
 from tensorflow.contrib.distributions.python.ops import bijectors
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -233,6 +238,57 @@ class MultivariateNormalDiagTest(test.TestCase):
       self.assertAllClose(mu, samps.mean(axis=0), atol=0.1)
       self.assertAllClose(cov_mat, np.cov(samps.T), atol=0.1)
 
+  def testMultivariateNormalDiagNegLogLikelihood(self):
+    num_draws = 50
+    dims = 3
+    with self.test_session() as sess:
+      x_pl = array_ops.placeholder(dtype=dtypes.float32,
+                                   shape=[None, dims],
+                                   name="x")
+      mu_var = variable_scope.get_variable(
+          name="mu",
+          shape=[dims],
+          dtype=dtypes.float32,
+          initializer=init_ops.constant_initializer(1.))
+      sess.run([variables.global_variables_initializer()])
+
+      mvn = ds.MultivariateNormalDiag(
+          loc=mu_var,
+          scale_diag=array_ops.ones(shape=[dims], dtype=dtypes.float32))
+
+      # Typically you'd use `mvn.log_prob(x_pl)` which is always at least as
+      # numerically stable as `tf.log(mvn.prob(x_pl))`. However in this test
+      # we're testing a bug specific to `prob` and not `log_prob`;
+      # http://stackoverflow.com/q/45109305. (The underlying issue was not
+      # related to `Distributions` but that `reduce_prod` didn't correctly
+      # handle negative indexes.)
+      neg_log_likelihood = -math_ops.reduce_sum(math_ops.log(mvn.prob(x_pl)))
+      grad_neg_log_likelihood = gradients_impl.gradients(
+          neg_log_likelihood, variables.trainable_variables())
+
+      x = np.zeros([num_draws, dims], dtype=np.float32)
+      grad_neg_log_likelihood_ = sess.run(
+          grad_neg_log_likelihood,
+          feed_dict={x_pl: x})
+      self.assertEqual(1, len(grad_neg_log_likelihood_))
+      self.assertAllClose(grad_neg_log_likelihood_[0],
+                          np.tile(num_draws, dims),
+                          rtol=1e-6, atol=0.)
+
+  def testDynamicBatchShape(self):
+    mvn = ds.MultivariateNormalDiag(
+        loc=array_ops.placeholder(dtypes.float32, shape=[None, None, 2]),
+        scale_diag=array_ops.placeholder(dtypes.float32, shape=[None, None, 2]))
+    self.assertListEqual(mvn.batch_shape.as_list(), [None, None])
+    self.assertListEqual(mvn.event_shape.as_list(), [2])
+
+  def testDynamicEventShape(self):
+    mvn = ds.MultivariateNormalDiag(
+        loc=array_ops.placeholder(dtypes.float32, shape=[2, 3, None]),
+        scale_diag=array_ops.placeholder(dtypes.float32, shape=[2, 3, None]))
+    self.assertListEqual(mvn.batch_shape.as_list(), [2, 3])
+    self.assertListEqual(mvn.event_shape.as_list(), [None])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_cholesky_test.py b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_cholesky_test.py
deleted file mode 100644
index 6549992633dcc384f26950f4c80ade60f337b78d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_cholesky_test.py
+++ /dev/null
@@ -1,290 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib import distributions
-from tensorflow.contrib.distributions.python.ops import operator_pd_cholesky
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops.distributions import util as distribution_util
-from tensorflow.python.platform import test
-
-
-def softplus(x):
-  return np.log(1 + np.exp(x))
-
-
-class OperatorPDCholeskyTest(test.TestCase):
-
-  def setUp(self):
-    self._rng = np.random.RandomState(42)
-
-  def _random_cholesky_array(self, shape):
-    mat = self._rng.rand(*shape)
-    chol = distribution_util.matrix_diag_transform(
-        mat, transform=nn_ops.softplus)
-    # Zero the upper triangle because we're using this as a true Cholesky factor
-    # in our tests.
-    return array_ops.matrix_band_part(chol, -1, 0).eval()
-
-  def testLogDet(self):
-    with self.test_session():
-      batch_shape = ()
-      for k in [1, 4]:
-        chol_shape = batch_shape + (k, k)
-        chol = self._random_cholesky_array(chol_shape)
-        operator = operator_pd_cholesky.OperatorPDCholesky(chol)
-        log_det = operator.log_det()
-        expected_log_det = np.log(np.prod(np.diag(chol))**2)
-
-        self.assertEqual(batch_shape, log_det.get_shape())
-        self.assertAllClose(expected_log_det, log_det.eval())
-
-  def testLogDetBatchMatrix(self):
-    with self.test_session():
-      batch_shape = (2, 3)
-      for k in [1, 4]:
-        chol_shape = batch_shape + (k, k)
-        chol = self._random_cholesky_array(chol_shape)
-        operator = operator_pd_cholesky.OperatorPDCholesky(chol)
-        log_det = operator.log_det()
-
-        self.assertEqual(batch_shape, log_det.get_shape())
-
-        # Test the log-determinant of the [1, 1] matrix.
-        chol_11 = chol[1, 1, :, :]
-        expected_log_det = np.log(np.prod(np.diag(chol_11))**2)
-        self.assertAllClose(expected_log_det, log_det.eval()[1, 1])
-
-  def testSqrtMatmulSingleMatrix(self):
-    with self.test_session():
-      batch_shape = ()
-      for k in [1, 4]:
-        x_shape = batch_shape + (k, 3)
-        x = self._rng.rand(*x_shape)
-        chol_shape = batch_shape + (k, k)
-        chol = self._random_cholesky_array(chol_shape)
-
-        operator = operator_pd_cholesky.OperatorPDCholesky(chol)
-
-        sqrt_operator_times_x = operator.sqrt_matmul(x)
-        expected = math_ops.matmul(chol, x)
-
-        self.assertEqual(expected.get_shape(),
-                         sqrt_operator_times_x.get_shape())
-        self.assertAllClose(expected.eval(), sqrt_operator_times_x.eval())
-
-  def testSqrtMatmulBatchMatrix(self):
-    with self.test_session():
-      batch_shape = (2, 3)
-      for k in [1, 4]:
-        x_shape = batch_shape + (k, 5)
-        x = self._rng.rand(*x_shape)
-        chol_shape = batch_shape + (k, k)
-        chol = self._random_cholesky_array(chol_shape)
-
-        operator = operator_pd_cholesky.OperatorPDCholesky(chol)
-
-        sqrt_operator_times_x = operator.sqrt_matmul(x)
-        expected = math_ops.matmul(chol, x)
-
-        self.assertEqual(expected.get_shape(),
-                         sqrt_operator_times_x.get_shape())
-        self.assertAllClose(expected.eval(), sqrt_operator_times_x.eval())
-
-  def testSqrtMatmulBatchMatrixWithTranspose(self):
-    with self.test_session():
-      batch_shape = (2, 3)
-      for k in [1, 4]:
-        x_shape = batch_shape + (5, k)
-        x = self._rng.rand(*x_shape)
-        chol_shape = batch_shape + (k, k)
-        chol = self._random_cholesky_array(chol_shape)
-
-        operator = operator_pd_cholesky.OperatorPDCholesky(chol)
-
-        sqrt_operator_times_x = operator.sqrt_matmul(x, transpose_x=True)
-        # tf.batch_matmul is defined x * y, so "y" is on the right, not "x".
-        expected = math_ops.matmul(chol, x, adjoint_b=True)
-
-        self.assertEqual(expected.get_shape(),
-                         sqrt_operator_times_x.get_shape())
-        self.assertAllClose(expected.eval(), sqrt_operator_times_x.eval())
-
-  def testMatmulSingleMatrix(self):
-    with self.test_session():
-      batch_shape = ()
-      for k in [1, 4]:
-        x_shape = batch_shape + (k, 5)
-        x = self._rng.rand(*x_shape)
-        chol_shape = batch_shape + (k, k)
-        chol = self._random_cholesky_array(chol_shape)
-        matrix = math_ops.matmul(chol, chol, adjoint_b=True)
-
-        operator = operator_pd_cholesky.OperatorPDCholesky(chol)
-
-        expected = math_ops.matmul(matrix, x)
-
-        self.assertEqual(expected.get_shape(), operator.matmul(x).get_shape())
-        self.assertAllClose(expected.eval(), operator.matmul(x).eval())
-
-  def testMatmulBatchMatrix(self):
-    with self.test_session():
-      batch_shape = (2, 3)
-      for k in [1, 4]:
-        x_shape = batch_shape + (k, 5)
-        x = self._rng.rand(*x_shape)
-        chol_shape = batch_shape + (k, k)
-        chol = self._random_cholesky_array(chol_shape)
-        matrix = math_ops.matmul(chol, chol, adjoint_b=True)
-
-        operator = operator_pd_cholesky.OperatorPDCholesky(chol)
-
-        expected = math_ops.matmul(matrix, x)
-
-        self.assertEqual(expected.get_shape(), operator.matmul(x).get_shape())
-        self.assertAllClose(expected.eval(), operator.matmul(x).eval())
-
-  def testMatmulBatchMatrixWithTranspose(self):
-    with self.test_session():
-      batch_shape = (2, 3)
-      for k in [1, 4]:
-        x_shape = batch_shape + (5, k)
-        x = self._rng.rand(*x_shape)
-        chol_shape = batch_shape + (k, k)
-        chol = self._random_cholesky_array(chol_shape)
-        matrix = math_ops.matmul(chol, chol, adjoint_b=True)
-
-        operator = operator_pd_cholesky.OperatorPDCholesky(chol)
-        operator_times_x = operator.matmul(x, transpose_x=True)
-
-        # tf.batch_matmul is defined x * y, so "y" is on the right, not "x".
-        expected = math_ops.matmul(matrix, x, adjoint_b=True)
-
-        self.assertEqual(expected.get_shape(), operator_times_x.get_shape())
-        self.assertAllClose(expected.eval(), operator_times_x.eval())
-
-  def testShape(self):
-    # All other shapes are defined by the abstractmethod shape, so we only need
-    # to test this.
-    with self.test_session():
-      for shape in [(3, 3), (2, 3, 3), (1, 2, 3, 3)]:
-        chol = self._random_cholesky_array(shape)
-        operator = operator_pd_cholesky.OperatorPDCholesky(chol)
-        self.assertAllEqual(shape, operator.shape().eval())
-
-  def testToDense(self):
-    with self.test_session():
-      chol = self._random_cholesky_array((3, 3))
-      chol_2 = chol.copy()
-      chol_2[0, 2] = 1000  # Make sure upper triangular part makes no diff.
-      operator = operator_pd_cholesky.OperatorPDCholesky(chol_2)
-      self.assertAllClose(chol.dot(chol.T), operator.to_dense().eval())
-
-  def testSqrtToDense(self):
-    with self.test_session():
-      chol = self._random_cholesky_array((2, 3, 3))
-      chol_2 = chol.copy()
-      chol_2[0, 0, 2] = 1000  # Make sure upper triangular part makes no diff.
-      operator = operator_pd_cholesky.OperatorPDCholesky(chol_2)
-      self.assertAllClose(chol, operator.sqrt_to_dense().eval())
-
-  def testNonPositiveDefiniteMatrixRaises(self):
-    # Singlular matrix with one positive eigenvalue and one zero eigenvalue.
-    with self.test_session():
-      lower_mat = [[1.0, 0.0], [2.0, 0.0]]
-      operator = operator_pd_cholesky.OperatorPDCholesky(lower_mat)
-      with self.assertRaisesOpError("x > 0 did not hold"):
-        operator.to_dense().eval()
-
-  def testNonPositiveDefiniteMatrixDoesNotRaiseIfNotVerifyPd(self):
-    # Singlular matrix with one positive eigenvalue and one zero eigenvalue.
-    with self.test_session():
-      lower_mat = [[1.0, 0.0], [2.0, 0.0]]
-      operator = operator_pd_cholesky.OperatorPDCholesky(
-          lower_mat, verify_pd=False)
-      operator.to_dense().eval()  # Should not raise.
-
-  def testNotHavingTwoIdenticalLastDimsRaises(self):
-    # Unless the last two dims are equal, this cannot represent a matrix, and it
-    # should raise.
-    with self.test_session():
-      batch_vec = [[1.0], [2.0]]  # shape 2 x 1
-      with self.assertRaisesOpError("x == y did not hold"):
-        operator = operator_pd_cholesky.OperatorPDCholesky(batch_vec)
-        operator.to_dense().eval()
-
-
-class MatrixDiagTransformTest(test.TestCase):
-
-  def setUp(self):
-    self._rng = np.random.RandomState(0)
-
-  def check_off_diagonal_same(self, m1, m2):
-    """Check the lower triangular part, not upper or diag."""
-    self.assertAllClose(np.tril(m1, k=-1), np.tril(m2, k=-1))
-    self.assertAllClose(np.triu(m1, k=1), np.triu(m2, k=1))
-
-  def testNonBatchMatrixWithTransform(self):
-    mat = self._rng.rand(4, 4)
-    with self.test_session():
-      chol = distributions.matrix_diag_transform(mat, transform=nn_ops.softplus)
-      self.assertEqual((4, 4), chol.get_shape())
-
-      self.check_off_diagonal_same(mat, chol.eval())
-      self.assertAllClose(softplus(np.diag(mat)), np.diag(chol.eval()))
-
-  def testNonBatchMatrixNoTransform(self):
-    mat = self._rng.rand(4, 4)
-    with self.test_session():
-      # Default is no transform.
-      chol = distributions.matrix_diag_transform(mat)
-      self.assertEqual((4, 4), chol.get_shape())
-      self.assertAllClose(mat, chol.eval())
-
-  def testBatchMatrixWithTransform(self):
-    mat = self._rng.rand(2, 4, 4)
-    mat_0 = mat[0, :, :]
-    with self.test_session():
-      chol = distributions.matrix_diag_transform(mat, transform=nn_ops.softplus)
-
-      self.assertEqual((2, 4, 4), chol.get_shape())
-
-      chol_0 = chol.eval()[0, :, :]
-
-      self.check_off_diagonal_same(mat_0, chol_0)
-      self.assertAllClose(softplus(np.diag(mat_0)), np.diag(chol_0))
-
-      self.check_off_diagonal_same(mat_0, chol_0)
-      self.assertAllClose(softplus(np.diag(mat_0)), np.diag(chol_0))
-
-  def testBatchMatrixNoTransform(self):
-    mat = self._rng.rand(2, 4, 4)
-    with self.test_session():
-      # Default is no transform.
-      chol = distributions.matrix_diag_transform(mat)
-
-      self.assertEqual((2, 4, 4), chol.get_shape())
-      self.assertAllClose(mat, chol.eval())
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_diag_test.py
deleted file mode 100644
index 0a8f9640c467c94058a6180ba8772cd4b2c9e810..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_diag_test.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-import numpy as np
-import six
-
-from tensorflow.contrib.distributions.python.ops import operator_pd_diag
-from tensorflow.contrib.distributions.python.ops import operator_test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-
-
-@six.add_metaclass(abc.ABCMeta)
-class OperatorPDDiagBaseTest(object):
-
-  def setUp(self):
-    self._rng = np.random.RandomState(42)
-
-  def _random_pd_diag(self, diag_shape):
-    return self._rng.rand(*diag_shape) + 0.1
-
-  @abc.abstractmethod
-  def _diag_to_matrix(self, diag):
-    pass
-
-  @abc.abstractproperty
-  def operator_class(self):
-    # Return the operator class that this tests.
-    pass
-
-  def _build_operator_and_mat(self, batch_shape, k, dtype=np.float64):
-    # Create a diagonal matrix explicitly.
-    # Create an OperatorPDSqrtDiag using the same diagonal.
-    # The operator should have the same behavior.
-    #
-    batch_shape = list(batch_shape)
-    diag_shape = batch_shape + [k]
-
-    # The diag is the square root.
-    diag = self._random_pd_diag(diag_shape).astype(dtype)
-    mat = self._diag_to_matrix(diag).astype(dtype)
-    operator = self.operator_class(diag)
-
-    return operator, mat
-
-  def testNonPositiveDefiniteMatrixRaises(self):
-    # Singlular matrix with one positive eigenvalue and one zero eigenvalue.
-    with self.test_session():
-      diag = [1.0, 0.0]
-      operator = operator_pd_diag.OperatorPDSqrtDiag(diag)
-      with self.assertRaisesOpError("assert_positive"):
-        operator.to_dense().eval()
-
-  def testNonPositiveDefiniteMatrixDoesNotRaiseIfNotVerifyPd(self):
-    # Singlular matrix with one positive eigenvalue and one zero eigenvalue.
-    with self.test_session():
-      diag = [1.0, 0.0]
-      operator = operator_pd_diag.OperatorPDSqrtDiag(diag, verify_pd=False)
-      operator.to_dense().eval()  # Should not raise
-
-
-class OperatorPDDiagTest(OperatorPDDiagBaseTest,
-                         operator_test_util.OperatorPDDerivedClassTest):
-  """Most tests done in the base classes."""
-
-  def _diag_to_matrix(self, diag):
-    return array_ops.matrix_diag(diag).eval()
-
-  @property
-  def operator_class(self):
-    return operator_pd_diag.OperatorPDDiag
-
-
-class OperatorPDSqrtDiagTest(OperatorPDDiagBaseTest,
-                             operator_test_util.OperatorPDDerivedClassTest):
-  """Most tests done in the base classes."""
-
-  def _diag_to_matrix(self, diag):
-    return array_ops.matrix_diag(diag**2).eval()
-
-  @property
-  def operator_class(self):
-    return operator_pd_diag.OperatorPDSqrtDiag
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_full_test.py b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_full_test.py
deleted file mode 100644
index 35a7c7e60392347fa47470ce5c57d1056cab9c76..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_full_test.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.distributions.python.ops import operator_pd_full
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-
-class OperatorPDFullTest(test.TestCase):
-  # The only method needing checked (because it isn't part of the parent class)
-  # is the check for symmetry.
-
-  def setUp(self):
-    self._rng = np.random.RandomState(42)
-
-  def _random_positive_def_array(self, *shape):
-    matrix = self._rng.rand(*shape)
-    return math_ops.matmul(matrix, matrix, adjoint_b=True).eval()
-
-  def testPositiveDefiniteMatrixDoesntRaise(self):
-    with self.test_session():
-      matrix = self._random_positive_def_array(2, 3, 3)
-      operator = operator_pd_full.OperatorPDFull(matrix, verify_pd=True)
-      operator.to_dense().eval()  # Should not raise
-
-  def testNegativeDefiniteMatrixRaises(self):
-    with self.test_session():
-      matrix = -1 * self._random_positive_def_array(3, 2, 2)
-      operator = operator_pd_full.OperatorPDFull(matrix, verify_pd=True)
-      # Could fail inside Cholesky decomposition, or later when we test the
-      # diag.
-      with self.assertRaisesOpError("x > 0|Cholesky"):
-        operator.to_dense().eval()
-
-  def testNonSymmetricMatrixRaises(self):
-    with self.test_session():
-      matrix = self._random_positive_def_array(3, 2, 2)
-      matrix[0, 0, 1] += 0.001
-      operator = operator_pd_full.OperatorPDFull(matrix, verify_pd=True)
-      with self.assertRaisesOpError("x == y"):
-        operator.to_dense().eval()
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_identity_test.py b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_identity_test.py
deleted file mode 100644
index 2b1e4c912bd09c51c5dad15e52e0ffa2c35e54d3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_identity_test.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib import distributions as distributions_lib
-from tensorflow.contrib.distributions.python.ops import operator_pd_identity
-from tensorflow.contrib.distributions.python.ops import operator_test_util
-from tensorflow.python.framework import constant_op
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-
-distributions = distributions_lib
-
-
-class OperatorPDIdentityTest(operator_test_util.OperatorPDDerivedClassTest):
-  """Most tests done in the base class."""
-
-  def _build_operator_and_mat(self, batch_shape, k, dtype=np.float64):
-    # Build an identity matrix with right shape and dtype.
-    # Build an operator that should act the same way.
-    batch_shape = list(batch_shape)
-    diag_shape = batch_shape + [k]
-    matrix_shape = batch_shape + [k, k]
-    diag = array_ops.ones(diag_shape, dtype=dtype)
-    scale = constant_op.constant(2.0, dtype=dtype)
-    scaled_identity_matrix = scale * array_ops.matrix_diag(diag)
-    operator = operator_pd_identity.OperatorPDIdentity(
-        matrix_shape, dtype, scale=scale)
-    return operator, scaled_identity_matrix.eval()
-
-  def testBadDtypeArgsRaise(self):
-    dtype = np.float32
-    batch_shape = [2, 3]
-    k = 4
-    with self.test_session():
-      operator, _ = self._build_operator_and_mat(batch_shape, k, dtype=dtype)
-
-      x_good_shape = batch_shape + [k, 5]
-      x_good = self._rng.randn(*x_good_shape).astype(dtype)
-      x_bad = x_good.astype(np.float64)
-
-      operator.matmul(x_good).eval()  # Should not raise.
-
-      with self.assertRaisesRegexp(TypeError, "dtype"):
-        operator.matmul(x_bad)
-
-      with self.assertRaisesRegexp(TypeError, "dtype"):
-        operator.solve(x_bad)
-
-      with self.assertRaisesRegexp(TypeError, "dtype"):
-        operator.sqrt_solve(x_bad)
-
-  def testBadRankArgsRaise(self):
-    # Prepend a singleton dimension, changing the rank of "x", but not the size.
-    dtype = np.float32
-    batch_shape = [2, 3]
-    k = 4
-    with self.test_session():
-      operator, _ = self._build_operator_and_mat(batch_shape, k, dtype=dtype)
-
-      x_good_shape = batch_shape + [k, 5]
-      x_good = self._rng.randn(*x_good_shape).astype(dtype)
-      x_bad = x_good.reshape(1, 2, 3, 4, 5)
-
-      operator.matmul(x_good).eval()  # Should not raise.
-
-      with self.assertRaisesRegexp(ValueError, "tensor rank"):
-        operator.matmul(x_bad)
-
-      with self.assertRaisesRegexp(ValueError, "tensor rank"):
-        operator.solve(x_bad)
-
-      with self.assertRaisesRegexp(ValueError, "tensor rank"):
-        operator.sqrt_solve(x_bad)
-
-  def testIncompatibleShapeArgsRaise(self):
-    # Test shapes that are the same rank but incompatible for matrix
-    # multiplication.
-    dtype = np.float32
-    batch_shape = [2, 3]
-    k = 4
-    with self.test_session():
-      operator, _ = self._build_operator_and_mat(batch_shape, k, dtype=dtype)
-
-      x_good_shape = batch_shape + [k, 5]
-      x_good = self._rng.randn(*x_good_shape).astype(dtype)
-      x_bad_shape = batch_shape + [5, k]
-      x_bad = x_good.reshape(*x_bad_shape)
-
-      operator.matmul(x_good).eval()  # Should not raise.
-
-      with self.assertRaisesRegexp(ValueError, "Incompatible"):
-        operator.matmul(x_bad)
-
-      with self.assertRaisesRegexp(ValueError, "Incompatible"):
-        operator.solve(x_bad)
-
-      with self.assertRaisesRegexp(ValueError, "Incompatible"):
-        operator.sqrt_solve(x_bad)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_test.py b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_test.py
deleted file mode 100644
index a2a1b54be629d29e6740a0dbb3bb31af0c05d39c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_test.py
+++ /dev/null
@@ -1,375 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-# For private members.
-from tensorflow.contrib import distributions as distributions_lib
-from tensorflow.contrib.distributions.python.ops import operator_pd
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.platform import test
-
-distributions = distributions_lib
-
-
-class OperatorShape(operator_pd.OperatorPDBase):
-  """Operator implements the ABC method ._shape."""
-
-  def __init__(self, shape):
-    self._stored_shape = shape
-
-  @property
-  def verify_pd(self):
-    return True
-
-  def get_shape(self):
-    return tensor_shape.TensorShape(self._stored_shape)
-
-  def _shape(self):
-    return array_ops.shape(np.random.rand(*self._stored_shape))
-
-  @property
-  def name(self):
-    return "OperatorShape"
-
-  def dtype(self):
-    return dtypes.int32
-
-  @property
-  def inputs(self):
-    return []
-
-
-class OperatorSqrtSolve(OperatorShape):
-  """Operator implements .sqrt_solve."""
-
-  def __init__(self, chol_array):
-    self._chol = ops.convert_to_tensor(chol_array)
-    super(OperatorSqrtSolve, self).__init__(chol_array.shape)
-
-  def _sqrt_solve(self, rhs):
-    return linalg_ops.matrix_triangular_solve(self._chol, rhs, lower=True)
-
-  def _batch_sqrt_solve(self, rhs):
-    return linalg_ops.matrix_triangular_solve(self._chol, rhs, lower=True)
-
-  def _inv_quadratic_form_on_vectors(self, x):
-    return self._iqfov_via_sqrt_solve(x)
-
-
-class OperatorSolve(OperatorShape):
-  """Operator implements .solve."""
-
-  def __init__(self, chol):
-    self._pos_def_matrix = math_ops.matmul(chol, chol, adjoint_b=True)
-    super(OperatorSolve, self).__init__(chol.shape)
-
-  def _solve(self, rhs):
-    return linalg_ops.matrix_solve(self._pos_def_matrix, rhs)
-
-  def _batch_solve(self, rhs):
-    return linalg_ops.matrix_solve(self._pos_def_matrix, rhs)
-
-  def _inv_quadratic_form_on_vectors(self, x):
-    return self._iqfov_via_solve(x)
-
-
-class OperatorPDBaseTest(test.TestCase):
-
-  def setUp(self):
-    self._rng = np.random.RandomState(42)
-
-  def _random_cholesky_array(self, shape):
-    mat = self._rng.rand(*shape)
-    chol = distributions.matrix_diag_transform(mat, transform=nn_ops.softplus)
-    # Zero the upper triangle because we're using this as a true Cholesky factor
-    # in our tests.
-    return array_ops.matrix_band_part(chol, -1, 0).eval()
-
-  def _numpy_inv_quadratic_form_on_vectors(self, chol, x):
-    # Numpy works with batches now (calls them "stacks").
-    x_expanded = np.expand_dims(x, -1)
-    whitened = np.linalg.solve(chol, x_expanded)
-    return (whitened**2).sum(axis=-1).sum(axis=-1)
-
-  def testAllShapesMethodsDefinedByTheOneAbstractpropertyShape(self):
-
-    shape = (1, 2, 3, 3)
-    with self.test_session():
-      operator = OperatorShape(shape)
-
-      self.assertAllEqual(shape, operator.shape().eval())
-      self.assertAllEqual(4, operator.rank().eval())
-      self.assertAllEqual((1, 2), operator.batch_shape().eval())
-      self.assertAllEqual((1, 2, 3), operator.vector_shape().eval())
-      self.assertAllEqual(3, operator.vector_space_dimension().eval())
-
-      self.assertEqual(shape, operator.get_shape())
-      self.assertEqual((1, 2), operator.get_batch_shape())
-      self.assertEqual((1, 2, 3), operator.get_vector_shape())
-
-  def testIqfovXRankSameAsBroadcastRankUsingSqrtSolve(self):
-    with self.test_session():
-      for batch_shape in [(), (2,)]:
-        for k in [1, 3]:
-
-          x_shape = batch_shape + (k,)
-          x = self._rng.randn(*x_shape)
-
-          chol_shape = batch_shape + (k, k)
-          chol = self._random_cholesky_array(chol_shape)
-          operator = OperatorSqrtSolve(chol)
-          qf = operator.inv_quadratic_form_on_vectors(x)
-
-          self.assertEqual(batch_shape, qf.get_shape())
-
-          numpy_qf = self._numpy_inv_quadratic_form_on_vectors(chol, x)
-          self.assertAllClose(numpy_qf, qf.eval())
-
-  def testIqfovXRankGreaterThanBroadcastRankUsingSqrtSolve(self):
-    with self.test_session():
-      for batch_shape in [(), (2,), (2, 3)]:
-        for k in [1, 4]:
-
-          x_shape = batch_shape + (k,)
-          x = self._rng.randn(*x_shape)
-
-          # chol will not have the leading dimension.
-          chol_shape = batch_shape[1:] + (k, k)
-          chol = self._random_cholesky_array(chol_shape)
-          operator = OperatorSqrtSolve(chol)
-          qf = operator.inv_quadratic_form_on_vectors(x)
-          numpy_qf = self._numpy_inv_quadratic_form_on_vectors(chol, x)
-
-          self.assertEqual(batch_shape, qf.get_shape())
-          self.assertAllClose(numpy_qf, qf.eval())
-
-  def testIqfovXRankTwoGreaterThanBroadcastRankUsingSqrtSolve(self):
-    with self.test_session():
-      for batch_shape in [(2, 3), (2, 3, 4), (2, 3, 4, 5)]:
-        for k in [1, 4]:
-
-          x_shape = batch_shape + (k,)
-          x = self._rng.randn(*x_shape)
-
-          # chol will not have the leading two dimensions.
-          chol_shape = batch_shape[2:] + (k, k)
-          chol = self._random_cholesky_array(chol_shape)
-          operator = OperatorSqrtSolve(chol)
-          qf = operator.inv_quadratic_form_on_vectors(x)
-          numpy_qf = self._numpy_inv_quadratic_form_on_vectors(chol, x)
-
-          self.assertEqual(batch_shape, qf.get_shape())
-          self.assertAllClose(numpy_qf, qf.eval())
-
-  def testIqfovXRankSameAsBroadcastRankUsingSolve(self):
-    with self.test_session():
-      for batch_shape in [(), (2,)]:
-        for k in [1, 3]:
-
-          x_shape = batch_shape + (k,)
-          x = self._rng.randn(*x_shape)
-
-          chol_shape = batch_shape + (k, k)
-          chol = self._random_cholesky_array(chol_shape)
-          operator = OperatorSolve(chol)
-          qf = operator.inv_quadratic_form_on_vectors(x)
-
-          self.assertEqual(batch_shape, qf.get_shape())
-
-          numpy_qf = self._numpy_inv_quadratic_form_on_vectors(chol, x)
-          self.assertAllClose(numpy_qf, qf.eval())
-
-  def testIqfovXRankGreaterThanBroadcastRankUsingSolve(self):
-    with self.test_session():
-      for batch_shape in [(2,), (2, 3)]:
-        for k in [1, 4]:
-
-          x_shape = batch_shape + (k,)
-          x = self._rng.randn(*x_shape)
-
-          # chol will not have the leading dimension.
-          chol_shape = batch_shape[1:] + (k, k)
-          chol = self._random_cholesky_array(chol_shape)
-          operator = OperatorSolve(chol)
-          qf = operator.inv_quadratic_form_on_vectors(x)
-          numpy_qf = self._numpy_inv_quadratic_form_on_vectors(chol, x)
-
-          self.assertEqual(batch_shape, qf.get_shape())
-          self.assertAllClose(numpy_qf, qf.eval())
-
-  def testIqfovXRankTwoGreaterThanBroadcastRankUsingSolve(self):
-    with self.test_session():
-      for batch_shape in [(2, 3), (2, 3, 4), (2, 3, 4, 5)]:
-        for k in [1, 4]:
-
-          x_shape = batch_shape + (k,)
-          x = self._rng.randn(*x_shape)
-
-          # chol will not have the leading two dimensions.
-          chol_shape = batch_shape[2:] + (k, k)
-          chol = self._random_cholesky_array(chol_shape)
-          operator = OperatorSolve(chol)
-          qf = operator.inv_quadratic_form_on_vectors(x)
-          numpy_qf = self._numpy_inv_quadratic_form_on_vectors(chol, x)
-
-          self.assertEqual(batch_shape, qf.get_shape())
-          self.assertAllClose(numpy_qf, qf.eval())
-
-
-class FlipMatrixToVectorTest(test.TestCase):
-
-  def setUp(self):
-    self._rng = np.random.RandomState()
-
-  def testMatrixAndVectorBatchShapesTheSame(self):
-    batch_shape = [6, 2, 3]
-    for static_batch_shape in [
-        tensor_shape.TensorShape(batch_shape), tensor_shape.TensorShape(None)
-    ]:
-      with self.test_session():
-        mat = self._rng.rand(2, 3, 4, 6)
-        vec = operator_pd.flip_matrix_to_vector(mat, batch_shape,
-                                                static_batch_shape)
-        vec_v = vec.eval()
-        self.assertAllEqual((6, 2, 3, 4), vec_v.shape)
-        self.assertAllEqual(mat[1, 2, 3, 4], vec_v[4, 1, 2, 3])
-
-  def testMatrixAndVectorBatchShapesSameRankButPermuted(self):
-    batch_shape = [6, 3, 2]
-    for static_batch_shape in [
-        tensor_shape.TensorShape(batch_shape), tensor_shape.TensorShape(None)
-    ]:
-      with self.test_session():
-        mat = self._rng.rand(2, 3, 4, 6)
-        vec = operator_pd.flip_matrix_to_vector(mat, batch_shape,
-                                                static_batch_shape)
-        vec_v = vec.eval()
-        self.assertAllEqual((6, 3, 2, 4), vec_v.shape)
-
-  def testVectorBatchShapeLongerThanMatrixBatchShape(self):
-    batch_shape = [2, 3, 2, 3]
-    for static_batch_shape in [
-        tensor_shape.TensorShape(batch_shape), tensor_shape.TensorShape(None)
-    ]:
-      with self.test_session():
-        mat = self._rng.rand(2, 3, 4, 6)
-        vec = operator_pd.flip_matrix_to_vector(mat, batch_shape,
-                                                static_batch_shape)
-        vec_v = vec.eval()
-        self.assertAllEqual((2, 3, 2, 3, 4), vec_v.shape)
-
-  def testMatrixBatchShapeHasASingletonThatVecBatchShapeDoesnt(self):
-    batch_shape = [6, 3]
-    for static_batch_shape in [
-        tensor_shape.TensorShape(batch_shape), tensor_shape.TensorShape(None)
-    ]:
-      with self.test_session():
-        mat = self._rng.rand(1, 3, 4, 6)
-        vec = operator_pd.flip_matrix_to_vector(mat, batch_shape,
-                                                static_batch_shape)
-        vec_v = vec.eval()
-        self.assertAllEqual((6, 3, 4), vec_v.shape)
-        self.assertAllEqual(mat[0, 2, 3, 4], vec_v[4, 2, 3])
-
-
-class FlipVectorToMatrixTest(test.TestCase):
-
-  def setUp(self):
-    self._rng = np.random.RandomState()
-
-  def testWhenXBatchRankIsSameAsBatchRankArg(self):
-    batch_shape = [4, 5]
-    x = self._rng.rand(4, 5, 6)
-    for static_batch_shape in [
-        tensor_shape.TensorShape(batch_shape), tensor_shape.TensorShape(None)
-    ]:
-      with self.test_session():
-        mat = operator_pd.flip_vector_to_matrix(x, batch_shape,
-                                                static_batch_shape)
-        mat_v = mat.eval()
-        expected_mat_v = x.reshape(x.shape + (1,))
-        self.assertAllEqual(expected_mat_v, mat_v)
-
-  def testWhenXHasOneLargerLargerBatchRankThanBatchRankArg(self):
-    batch_shape = [4, 5]
-    x = self._rng.rand(3, 4, 5, 6)
-    for static_batch_shape in [
-        tensor_shape.TensorShape(batch_shape), tensor_shape.TensorShape(None)
-    ]:
-      with self.test_session():
-        mat = operator_pd.flip_vector_to_matrix(x, batch_shape,
-                                                static_batch_shape)
-        mat_v = mat.eval()
-        self.assertAllEqual((4, 5, 6, 3), mat_v.shape)
-        self.assertAllEqual(x[2, 2, 2, 1], mat_v[2, 2, 1, 2])
-
-  def testWhenBatchShapeRequiresReshapeOfVectorBatchShape(self):
-    batch_shape = [5, 4]
-    x = self._rng.rand(3, 4, 5, 6)  # Note x has (4,5) and batch_shape is (5, 4)
-    for static_batch_shape in [
-        tensor_shape.TensorShape(batch_shape), tensor_shape.TensorShape(None)
-    ]:
-      with self.test_session():
-        mat = operator_pd.flip_vector_to_matrix(x, batch_shape,
-                                                static_batch_shape)
-        mat_v = mat.eval()
-        self.assertAllEqual((5, 4, 6, 3), mat_v.shape)
-
-  def testWhenXHasTwoLargerLargerBatchRankThanBatchRankArg(self):
-    batch_shape = [4, 5]
-    x = self._rng.rand(2, 3, 4, 5, 6)
-    for static_batch_shape in [
-        tensor_shape.TensorShape(batch_shape), tensor_shape.TensorShape(None)
-    ]:
-      with self.test_session():
-        mat = operator_pd.flip_vector_to_matrix(x, batch_shape,
-                                                static_batch_shape)
-        mat_v = mat.eval()
-        self.assertAllEqual((4, 5, 6, 2 * 3), mat_v.shape)
-
-
-class ExtractBatchShapeTest(test.TestCase):
-
-  def setUp(self):
-    self._rng = np.random.RandomState()
-
-  def testXHasEmptyBatchShape(self):
-    with self.test_session():
-      x = self._rng.rand(2, 3)
-      num_event_dims = 2
-      batch_shape = operator_pd.extract_batch_shape(x, num_event_dims)
-      self.assertAllEqual([], batch_shape.eval())
-
-  def testXHasNonEmptyBatchShape(self):
-    with self.test_session():
-      x = self._rng.rand(2, 3, 4, 5)
-      num_event_dims = 2
-      batch_shape = operator_pd.extract_batch_shape(x, num_event_dims)
-      self.assertAllEqual([2, 3], batch_shape.eval())
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_vdvt_update_test.py b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_vdvt_update_test.py
deleted file mode 100644
index e2fc081b356f432eb15f1b4de3cb21d0c03da694..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_vdvt_update_test.py
+++ /dev/null
@@ -1,277 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib import distributions as distributions_lib
-from tensorflow.contrib.distributions.python.ops import operator_pd_full
-from tensorflow.contrib.distributions.python.ops import operator_pd_vdvt_update
-from tensorflow.contrib.distributions.python.ops import operator_test_util
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-distributions = distributions_lib
-
-
-class OperatorPDSqrtVDVTUpdateTest(
-    operator_test_util.OperatorPDDerivedClassTest):
-  """Most tests done in the base class."""
-  _diag_is_none = False
-
-  def setUp(self):
-    self._rng = np.random.RandomState(42)
-
-  def _random_pd_matrix(self, shape):
-    # With probability 1 this is positive definite.
-    sqrt = self._rng.randn(*shape)
-    mat = math_ops.matmul(sqrt, sqrt, adjoint_b=True)
-    return mat.eval()
-
-  def _random_v_and_diag(self, mat_shape, v_matrix_rank):
-    # Get the necessary elements to make the sqrt update.
-    mat_shape = list(mat_shape)
-    batch_shape = mat_shape[:-2]
-    diag_shape = mat_shape[:-2] + [v_matrix_rank]
-    k = mat_shape[-1]
-    assert k == mat_shape[-2], "Must be a square matrix"
-    v_shape = batch_shape + [k, v_matrix_rank]
-    v = self._rng.randn(*v_shape)  # anything goes with "v"!
-
-    if self._diag_is_none:
-      diag = None
-    else:
-      diag = self._rng.rand(*diag_shape) + 0.1  # Positive diag!
-    return v, diag
-
-  def _updated_mat(self, mat, v, diag):
-    # Get dense matrix defined by its square root, which is an update of `mat`:
-    # A = (mat + v D v^T) (mat + v D v^T)^T
-    # D is the diagonal matrix with `diag` on the diagonal.
-
-    # If diag is None, then it defaults to the identity matrix, so DV^T = V^T
-    if diag is None:
-      diag_vt = array_ops.matrix_transpose(v)
-    else:
-      diag_mat = array_ops.matrix_diag(diag)
-      diag_vt = math_ops.matmul(diag_mat, v, adjoint_b=True)
-
-    v_diag_vt = math_ops.matmul(v, diag_vt)
-    sqrt = mat + v_diag_vt
-    a = math_ops.matmul(sqrt, sqrt, adjoint_b=True)
-    return a.eval()
-
-  def _build_operator_and_mat(self, batch_shape, k, dtype=np.float64):
-    """This method is called by base class, enabling many standard tests."""
-    # Create a matrix then explicitly update it with v and diag.
-    # Create an OperatorPDSqrtVDVTUpdate from the matrix and v and diag
-    # The operator should have the same behavior.
-    #
-    # The low-rank matrix V will have rank 1/2 of k, unless k is 1, in which
-    # case it will be 1 as well.
-    if k == 1:
-      v_matrix_rank = k
-    else:
-      v_matrix_rank = k // 2
-    mat_shape = list(batch_shape) + [k, k]
-    mat = self._random_pd_matrix(mat_shape)
-    v, diag = self._random_v_and_diag(mat_shape, v_matrix_rank)
-
-    # Set dtypes
-    mat = mat.astype(dtype)
-    v = v.astype(dtype)
-    if diag is not None:
-      diag = diag.astype(dtype)
-
-    # The matrix: (mat + v*diag*v^T) * (mat + v*diag*v^T)^T
-    # Our final updated operator should behave like this.
-    updated_mat = self._updated_mat(mat, v, diag)
-
-    # Represents the matrix: `mat`, before updating.
-    # This is the Operator that we will update.
-    o_made_with_mat = operator_pd_full.OperatorPDFull(mat)
-
-    # Represents the matrix: (mat + v*diag*v^T) * (mat + v*diag*v^T)^T,
-    # achieved by updating the operator "o_made_with_mat".
-    # This is the operator we're testing.
-    operator = operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate(o_made_with_mat,
-                                                                v, diag)
-
-    return operator, updated_mat
-
-  def testToDensePlaceholder(self):
-    # Test simple functionality when the inputs are placeholders.
-    mat_shape = [3, 3]
-    v_matrix_rank = 2
-    with self.test_session():
-      # Make an OperatorPDFull with a matrix placeholder.
-      mat_ph = array_ops.placeholder(dtypes.float64, name="mat_ph")
-      mat = self._random_pd_matrix(mat_shape)
-      o_made_with_mat = operator_pd_full.OperatorPDFull(mat_ph)
-
-      # Make the placeholders and arrays for the updated operator.
-      v_ph = array_ops.placeholder(dtypes.float64, name="v_ph")
-      v, diag = self._random_v_and_diag(mat_shape, v_matrix_rank)
-      if self._diag_is_none:
-        diag_ph = None
-        feed_dict = {v_ph: v, mat_ph: mat}
-      else:
-        diag_ph = array_ops.placeholder(dtypes.float64, name="diag_ph")
-        feed_dict = {v_ph: v, diag_ph: diag, mat_ph: mat}
-
-      # Make the OperatorPDSqrtVDVTUpdate with v and diag placeholders.
-      operator = operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate(
-          o_made_with_mat, v_ph, diag=diag_ph)
-
-      # Should not fail
-      operator.to_dense().eval(feed_dict=feed_dict)
-      operator.log_det().eval(feed_dict=feed_dict)
-
-  def testOperatorNotSubclassOfOperatorPdRaises(self):
-    # We enforce that `operator` is an `OperatorPDBase`.
-    with self.test_session():
-      v, diag = self._random_v_and_diag((3, 3), 2)
-      operator_m = "I am not a subclass of OperatorPDBase"
-
-      with self.assertRaisesRegexp(TypeError, "not instance"):
-        operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate(operator_m, v, diag)
-
-  def testNonPosDefDiagRaises(self):
-    if self._diag_is_none:
-      return
-    # We enforce that the diag is positive definite.
-    with self.test_session():
-      matrix_shape = (3, 3)
-      v_rank = 2
-      v, diag = self._random_v_and_diag(matrix_shape, v_rank)
-      mat = self._random_pd_matrix(matrix_shape)
-      diag[0] = 0.0
-
-      operator_m = operator_pd_full.OperatorPDFull(mat)
-      operator = operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate(operator_m, v,
-                                                                  diag)
-
-      with self.assertRaisesOpError("positive"):
-        operator.to_dense().eval()
-
-  def testNonPosDefDiagDoesntRaiseIfVerifyPd_false(self):
-    # We enforce that the diag is positive definite.
-    if self._diag_is_none:
-      return
-    with self.test_session():
-      matrix_shape = (3, 3)
-      v_rank = 2
-      v, diag = self._random_v_and_diag(matrix_shape, v_rank)
-      mat = self._random_pd_matrix(matrix_shape)
-      diag[0] = 0.0
-
-      operator_m = operator_pd_full.OperatorPDFull(mat)
-      operator = operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate(
-          operator_m, v, diag, verify_pd=False)
-
-      operator.to_dense().eval()  # Should not raise.
-
-  def testEventShapeMismatchVAndDiagRaisesStatic(self):
-    v = self._rng.rand(4, 3, 2)
-    diag = self._rng.rand(4, 1)  # Should be shape (4, 2,) to match v.
-    with self.test_session():
-
-      mat = self._random_pd_matrix((4, 3, 3))  # mat and v match
-      operator_m = operator_pd_full.OperatorPDFull(mat)
-      with self.assertRaisesRegexp(ValueError, "diag.*v.*last dimension"):
-        operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate(operator_m, v, diag)
-
-  def testBatchShapeMismatchVAndDiagRaisesStatic(self):
-    v = self._rng.rand(4, 3, 2)
-    diag = self._rng.rand(5, 1)  # Should be shape (4, 2,) to match v.
-    with self.test_session():
-
-      mat = self._random_pd_matrix((4, 3, 3))  # mat and v match
-      operator_m = operator_pd_full.OperatorPDFull(mat)
-      with self.assertRaisesRegexp(ValueError, "diag.*batch shape"):
-        operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate(operator_m, v, diag)
-
-  def testTensorRankShapeMismatchVAndDiagRaisesStatic(self):
-    v = self._rng.rand(1, 2, 2, 2)
-    diag = self._rng.rand(5, 1)  # Should have rank 1 less than v.
-    with self.test_session():
-
-      mat = self._random_pd_matrix((1, 2, 2, 2))  # mat and v match
-      operator_m = operator_pd_full.OperatorPDFull(mat)
-      with self.assertRaisesRegexp(ValueError, "diag.*rank"):
-        operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate(operator_m, v, diag)
-
-  def testEventShapeMismatchVAndDiagRaisesDynamic(self):
-    with self.test_session():
-
-      v = self._rng.rand(4, 3, 2)
-      diag = self._rng.rand(4, 1)  # Should be shape (4, 2,) to match v.
-      mat = self._random_pd_matrix((4, 3, 3))  # mat and v match
-
-      v_ph = array_ops.placeholder(dtypes.float32, name="v_ph")
-      diag_ph = array_ops.placeholder(dtypes.float32, name="diag_ph")
-      mat_ph = array_ops.placeholder(dtypes.float32, name="mat_ph")
-
-      operator_m = operator_pd_full.OperatorPDFull(mat_ph)
-      updated = operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate(operator_m,
-                                                                 v_ph, diag_ph)
-      with self.assertRaisesOpError("x == y"):
-        updated.to_dense().eval(feed_dict={v_ph: v, diag_ph: diag, mat_ph: mat})
-
-  def testBatchShapeMismatchVAndDiagRaisesDynamic(self):
-    with self.test_session():
-      v = self._rng.rand(4, 3, 2)
-      diag = self._rng.rand(5, 1)  # Should be shape (4, 2,) to match v.
-      mat = self._random_pd_matrix((4, 3, 3))  # mat and v match
-
-      v_ph = array_ops.placeholder(dtypes.float32, name="v_ph")
-      diag_ph = array_ops.placeholder(dtypes.float32, name="diag_ph")
-      mat_ph = array_ops.placeholder(dtypes.float32, name="mat_ph")
-
-      operator_m = operator_pd_full.OperatorPDFull(mat_ph)
-      updated = operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate(operator_m,
-                                                                 v_ph, diag_ph)
-      with self.assertRaisesOpError("x == y"):
-        updated.to_dense().eval(feed_dict={v_ph: v, diag_ph: diag, mat_ph: mat})
-
-  def testTensorRankShapeMismatchVAndDiagRaisesDynamic(self):
-    with self.test_session():
-
-      v = self._rng.rand(2, 2, 2, 2)
-      diag = self._rng.rand(2, 2)  # Should have rank 1 less than v.
-      mat = self._random_pd_matrix((2, 2, 2, 2))  # mat and v match
-
-      v_ph = array_ops.placeholder(dtypes.float32, name="v_ph")
-      diag_ph = array_ops.placeholder(dtypes.float32, name="diag_ph")
-      mat_ph = array_ops.placeholder(dtypes.float32, name="mat_ph")
-
-      operator_m = operator_pd_full.OperatorPDFull(mat_ph)
-      updated = operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate(operator_m,
-                                                                 v_ph, diag_ph)
-      with self.assertRaisesOpError("rank"):
-        updated.to_dense().eval(feed_dict={v_ph: v, diag_ph: diag, mat_ph: mat})
-
-
-class OperatorPDSqrtVDVTUpdateNoneDiagTest(OperatorPDSqrtVDVTUpdateTest):
-  _diag_is_none = True
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py b/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..62ffbea1b5c978730c147d98477e99fdd6f364c2
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py
@@ -0,0 +1,384 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for VectorDiffeomixture."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import vector_diffeomixture as vector_diffeomixture_lib
+from tensorflow.contrib.linalg.python.ops import linear_operator_diag as linop_diag_lib
+from tensorflow.contrib.linalg.python.ops import linear_operator_identity as linop_identity_lib
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.platform import test
+
+
+class VectorDistributionTestHelpers(object):
+  """VectorDistributionTestHelpers helps test vector-event distributions."""
+
+  def linop(self, num_rows=None, multiplier=None, diag=None):
+    """Helper to create non-singular, symmetric, positive definite matrices."""
+    if num_rows is not None and multiplier is not None:
+      if any(p is not None for p in [diag]):
+        raise ValueError("Found extra args for scaled identity.")
+      return linop_identity_lib.LinearOperatorScaledIdentity(
+          num_rows=num_rows,
+          multiplier=multiplier,
+          is_positive_definite=True)
+    elif num_rows is not None:
+      if any(p is not None for p in [multiplier, diag]):
+        raise ValueError("Found extra args for identity.")
+      return linop_identity_lib.LinearOperatorIdentity(
+          num_rows=num_rows,
+          is_positive_definite=True)
+    elif diag is not None:
+      if any(p is not None for p in [num_rows, multiplier]):
+        raise ValueError("Found extra args for diag.")
+      return linop_diag_lib.LinearOperatorDiag(
+          diag=diag,
+          is_positive_definite=True)
+    else:
+      raise ValueError("Must specify at least one arg.")
+
+  def run_test_sample_consistent_log_prob(
+      self,
+      sess,
+      dist,
+      num_samples=int(1e5),
+      radius=1.,
+      center=0.,
+      seed=42,
+      rtol=1e-2,
+      atol=0.):
+    """Tests that sample/log_prob are mutually consistent.
+
+    "Consistency" means that `sample` and `log_prob` correspond to the same
+    distribution.
+
+    The idea of this test is to compute the Monte-Carlo estimate of the volume
+    enclosed by a hypersphere, i.e., the volume of an `n`-ball. While we could
+    choose an arbitrary function to integrate, the hypersphere's volume is nice
+    because it is intuitive, has an easy analytical expression, and works for
+    `dimensions > 1`.
+
+    Technical Details:
+
+    Observe that:
+
+    ```none
+    int_{R**d} dx [x in Ball(radius=r, center=c)]
+    = E_{p(X)}[ [X in Ball(r, c)] / p(X) ]
+    = lim_{m->infty} m**-1 sum_j^m [x[j] in Ball(r, c)] / p(x[j]),
+        where x[j] ~iid p(X)
+    ```
+
+    Thus, for fixed `m`, the above is approximately true when `sample` and
+    `log_prob` are mutually consistent.
+
+    Furthermore, the above calculation has the analytical result:
+    `pi**(d/2) r**d / Gamma(1 + d/2)`.
+
+    Note: this test only verifies a necessary condition for consistency--it does
+    does not verify sufficiency hence does not prove `sample`, `log_prob` truly
+    are consistent. For this reason we recommend testing several different
+    hyperspheres (assuming the hypersphere is supported by the distribution).
+    Furthermore, we gain additional trust in this test when also tested `sample`
+    against the first, second moments
+    (`run_test_sample_consistent_mean_covariance`); it is probably unlikely that
+    a "best-effort" implementation of `log_prob` would incorrectly pass both
+    tests and for different hyperspheres.
+
+    For a discussion on the analytical result (second-line) see:
+      https://en.wikipedia.org/wiki/Volume_of_an_n-ball.
+
+    For a discussion of importance sampling (fourth-line) see:
+      https://en.wikipedia.org/wiki/Importance_sampling.
+
+    Args:
+      sess: Tensorflow session.
+      dist: Distribution instance or object which implements `sample`,
+        `log_prob`, `event_shape_tensor` and `batch_shape_tensor`. The
+        distribution must have non-zero probability of sampling every point
+        enclosed by the hypersphere.
+      num_samples: Python `int` scalar indicating the number of Monte-Carlo
+        samples to draw from `dist`.
+      radius: Python `float`-type indicating the radius of the `n`-ball which
+        we're computing the volume.
+      center: Python floating-type vector (or scalar) indicating the center of
+        the `n`-ball which we're computing the volume. When scalar, the value is
+        broadcast to all event dims.
+      seed: Python `int` indicating the seed to use when sampling from `dist`.
+        In general it is not recommended to use `None` during a test as this
+        increases the likelihood of spurious test failure.
+      rtol: Python `float`-type indicating the admissible relative error between
+        actual- and approximate-volumes.
+      atol: Python `float`-type indicating the admissible absolute error between
+        actual- and approximate-volumes. In general this should be zero since
+        a typical radius implies a non-zero volume.
+    """
+
+    def actual_hypersphere_volume(dims, radius):
+      # https://en.wikipedia.org/wiki/Volume_of_an_n-ball
+      # Using tf.lgamma because we'd have to otherwise use SciPy which is not
+      # a required dependency of core.
+      radius = np.asarray(radius)
+      dims = math_ops.cast(dims, dtype=radius.dtype)
+      return math_ops.exp(
+          (dims / 2.) * np.log(np.pi)
+          - math_ops.lgamma(1. + dims / 2.)
+          + dims * math_ops.log(radius))
+
+    def is_in_ball(x, radius, center):
+      return math_ops.cast(linalg_ops.norm(x - center, axis=-1) <= radius,
+                           dtype=x.dtype)
+
+    def monte_carlo_hypersphere_volume(dist, num_samples, radius, center):
+      # https://en.wikipedia.org/wiki/Importance_sampling
+      x = dist.sample(num_samples, seed=seed)
+      return math_ops.reduce_mean(
+          math_ops.exp(-dist.log_prob(x)) * is_in_ball(x, radius, center),
+          axis=0)
+
+    [
+        batch_shape_,
+        actual_volume_,
+        sample_volume_,
+    ] = sess.run([
+        dist.batch_shape_tensor(),
+        actual_hypersphere_volume(
+            dims=dist.event_shape_tensor()[0],
+            radius=radius),
+        monte_carlo_hypersphere_volume(
+            dist,
+            num_samples=num_samples,
+            radius=radius,
+            center=center),
+    ])
+
+    self.assertAllClose(np.tile(actual_volume_, reps=batch_shape_),
+                        sample_volume_,
+                        rtol=rtol, atol=atol)
+
+  def run_test_sample_consistent_mean_covariance(
+      self,
+      sess,
+      dist,
+      num_samples=int(1e5),
+      seed=24,
+      rtol=1e-2,
+      atol=0.,
+      cov_rtol=None,
+      cov_atol=None):
+    """Tests that sample/mean/covariance are consistent with each other.
+
+    "Consistency" means that `sample`, `mean`, `covariance`, etc all correspond
+    to the same distribution.
+
+    Args:
+      sess: Tensorflow session.
+      dist: Distribution instance or object which implements `sample`,
+        `log_prob`, `event_shape_tensor` and `batch_shape_tensor`.
+      num_samples: Python `int` scalar indicating the number of Monte-Carlo
+        samples to draw from `dist`.
+      seed: Python `int` indicating the seed to use when sampling from `dist`.
+        In general it is not recommended to use `None` during a test as this
+        increases the likelihood of spurious test failure.
+      rtol: Python `float`-type indicating the admissible relative error between
+        analytical and sample statistics.
+      atol: Python `float`-type indicating the admissible absolute error between
+        analytical and sample statistics.
+      cov_rtol: Python `float`-type indicating the admissible relative error
+        between analytical and sample covariance. Default: rtol.
+      cov_atol: Python `float`-type indicating the admissible absolute error
+        between analytical and sample covariance. Default: atol.
+    """
+
+    def vec_osquare(x):
+      """Computes the outer-product of a vector, i.e., x.T x."""
+      return x[..., :, array_ops.newaxis] * x[..., array_ops.newaxis, :]
+
+    x = dist.sample(num_samples, seed=seed)
+    sample_mean = math_ops.reduce_mean(x, axis=0)
+    sample_covariance = math_ops.reduce_mean(
+        vec_osquare(x - sample_mean), axis=0)
+    sample_variance = array_ops.matrix_diag_part(sample_covariance)
+    sample_stddev = math_ops.sqrt(sample_variance)
+
+    [
+        sample_mean_,
+        sample_covariance_,
+        sample_variance_,
+        sample_stddev_,
+        mean_,
+        covariance_,
+        variance_,
+        stddev_
+    ] = sess.run([
+        sample_mean,
+        sample_covariance,
+        sample_variance,
+        sample_stddev,
+        dist.mean(),
+        dist.covariance(),
+        dist.variance(),
+        dist.stddev(),
+    ])
+
+    self.assertAllClose(mean_, sample_mean_, rtol=rtol, atol=atol)
+    self.assertAllClose(covariance_, sample_covariance_,
+                        rtol=cov_rtol or rtol,
+                        atol=cov_atol or atol)
+    self.assertAllClose(variance_, sample_variance_, rtol=rtol, atol=atol)
+    self.assertAllClose(stddev_, sample_stddev_, rtol=rtol, atol=atol)
+
+
+class VectorDiffeomixtureTest(VectorDistributionTestHelpers, test.TestCase):
+  """Tests the VectorDiffeomixture distribution."""
+
+  def testSampleProbConsistentBroadcastMix(self):
+    with self.test_session() as sess:
+      dims = 4
+      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
+          mix_loc=[[0.], [1.]],
+          mix_scale=[1.],
+          distribution=normal_lib.Normal(0., 1.),
+          loc=[
+              None,
+              np.float32([2.]*dims),
+          ],
+          scale=[
+              linop_identity_lib.LinearOperatorScaledIdentity(
+                  num_rows=dims,
+                  multiplier=np.float32(1.1),
+                  is_positive_definite=True),
+              linop_diag_lib.LinearOperatorDiag(
+                  diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
+                  is_positive_definite=True),
+          ],
+          validate_args=True)
+      # Ball centered at component0's mean.
+      self.run_test_sample_consistent_log_prob(
+          sess, vdm, radius=2., center=0., rtol=0.005)
+      # Larger ball centered at component1's mean.
+      self.run_test_sample_consistent_log_prob(
+          sess, vdm, radius=4., center=2., rtol=0.005)
+
+  def testSampleProbConsistentBroadcastMixNonStandardBase(self):
+    with self.test_session() as sess:
+      dims = 4
+      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
+          mix_loc=[[0.], [1.]],
+          mix_scale=[1.],
+          distribution=normal_lib.Normal(1., 1.5),
+          loc=[
+              None,
+              np.float32([2.]*dims),
+          ],
+          scale=[
+              linop_identity_lib.LinearOperatorScaledIdentity(
+                  num_rows=dims,
+                  multiplier=np.float32(1.1),
+                  is_positive_definite=True),
+              linop_diag_lib.LinearOperatorDiag(
+                  diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
+                  is_positive_definite=True),
+          ],
+          validate_args=True)
+      # Ball centered at component0's mean.
+      self.run_test_sample_consistent_log_prob(
+          sess, vdm, radius=2., center=1., rtol=0.006)
+      # Larger ball centered at component1's mean.
+      self.run_test_sample_consistent_log_prob(
+          sess, vdm, radius=4., center=3., rtol=0.009)
+
+  def testMeanCovariance(self):
+    with self.test_session() as sess:
+      dims = 3
+      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
+          mix_loc=[[0.], [4.]],
+          mix_scale=[10.],
+          distribution=normal_lib.Normal(0., 1.),
+          loc=[
+              np.float32([-2.]),
+              None,
+          ],
+          scale=[
+              linop_identity_lib.LinearOperatorScaledIdentity(
+                  num_rows=dims,
+                  multiplier=np.float32(1.5),
+                  is_positive_definite=True),
+              linop_diag_lib.LinearOperatorDiag(
+                  diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
+                  is_positive_definite=True),
+          ],
+          validate_args=True)
+      self.run_test_sample_consistent_mean_covariance(
+          sess, vdm, rtol=0.02, cov_rtol=0.06)
+
+  def testMeanCovarianceUncenteredNonStandardBase(self):
+    with self.test_session() as sess:
+      dims = 3
+      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
+          mix_loc=[[0.], [4.]],
+          mix_scale=[10.],
+          distribution=normal_lib.Normal(-1., 1.5),
+          loc=[
+              np.float32([-2.]),
+              np.float32([0.]),
+          ],
+          scale=[
+              linop_identity_lib.LinearOperatorScaledIdentity(
+                  num_rows=dims,
+                  multiplier=np.float32(1.5),
+                  is_positive_definite=True),
+              linop_diag_lib.LinearOperatorDiag(
+                  diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
+                  is_positive_definite=True),
+          ],
+          validate_args=True)
+      self.run_test_sample_consistent_mean_covariance(
+          sess, vdm, num_samples=int(1e6), rtol=0.01, cov_atol=0.025)
+
+  # TODO(jvdillon): We've tested that (i) .sample and .log_prob are consistent,
+  # (ii) .mean, .stddev etc... and .sample are consistent. However, we haven't
+  # tested that the quadrature approach well-approximates the integral.
+  #
+  # To that end, consider adding these tests:
+  #
+  # Test1: In the limit of high mix_scale, this approximates a discrete mixture,
+  # and there are many discrete mixtures where we can explicitly compute
+  # mean/var, etc... So test1 would choose one of those discrete mixtures and
+  # show our mean/var/etc... is close to that.
+  #
+  # Test2:  In the limit of low mix_scale, the a diffeomixture of Normal(-5, 1),
+  # Normal(5, 1) should (I believe...must check) should look almost like
+  # Uniform(-5, 5), and thus (i) .prob(x) should be about 1/10 for x in (-5, 5),
+  # and (ii) the first few moments should approximately match that of
+  # Uniform(-5, 5)
+  #
+  # Test3:  If mix_loc is symmetric, then for any mix_scale, our
+  # quadrature-based diffeomixture of Normal(-1, 1), Normal(1, 1) should have
+  # mean zero, exactly.
+
+  # TODO(jvdillon): Add more tests which verify broadcasting.
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/vector_exponential_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/vector_exponential_diag_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd05bd207f87c6d241ff619fbe3113fe8257cb07
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/vector_exponential_diag_test.py
@@ -0,0 +1,206 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for VectorExponentialLinearOperator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from tensorflow.contrib import distributions
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+ds = distributions
+
+
+class VectorExponentialDiagTest(test.TestCase):
+  """Well tested because this is a simple override of the base class."""
+
+  def setUp(self):
+    self._rng = np.random.RandomState(42)
+
+  def testScalarParams(self):
+    mu = -1.
+    diag = -5.
+    with self.test_session():
+      with self.assertRaisesRegexp(ValueError, "at least 1 dimension"):
+        ds.VectorExponentialDiag(mu, diag)
+
+  def testVectorParams(self):
+    mu = [-1.]
+    diag = [-5.]
+    with self.test_session():
+      dist = ds.VectorExponentialDiag(mu, diag, validate_args=True)
+      self.assertAllEqual([3, 1], dist.sample(3).get_shape())
+
+  def testMean(self):
+    mu = [-1., 1]
+    diag = [1., -5]
+    with self.test_session():
+      dist = ds.VectorExponentialDiag(mu, diag, validate_args=True)
+      self.assertAllEqual([-1. + 1., 1. - 5.], dist.mean().eval())
+
+  def testMode(self):
+    mu = [-1.]
+    diag = [1., -5]
+    with self.test_session():
+      dist = ds.VectorExponentialDiag(mu, diag, validate_args=True)
+      self.assertAllEqual([-1., -1.], dist.mode().eval())
+
+  def testMeanWithBroadcastLoc(self):
+    mu = [-1.]
+    diag = [1., -5]
+    with self.test_session():
+      dist = ds.VectorExponentialDiag(mu, diag, validate_args=True)
+      self.assertAllEqual([-1. + 1, -1. - 5], dist.mean().eval())
+
+  def testSample(self):
+    mu = [-2., 1]
+    diag = [1., -2]
+    with self.test_session():
+      dist = ds.VectorExponentialDiag(mu, diag, validate_args=True)
+      samps = dist.sample(int(1e4), seed=0).eval()
+      cov_mat = array_ops.matrix_diag(diag).eval()**2
+
+      self.assertAllClose([-2 + 1, 1. - 2], samps.mean(axis=0),
+                          atol=0., rtol=0.05)
+      self.assertAllClose(cov_mat, np.cov(samps.T),
+                          atol=0.05, rtol=0.05)
+
+  def testSingularScaleRaises(self):
+    mu = [-1., 1]
+    diag = [1., 0]
+    with self.test_session():
+      dist = ds.VectorExponentialDiag(mu, diag, validate_args=True)
+      with self.assertRaisesOpError("Singular"):
+        dist.sample().eval()
+
+  def testSampleWithBroadcastScale(self):
+    # mu corresponds to a 2-batch of 3-variate normals
+    mu = np.zeros([2, 3])
+
+    # diag corresponds to no batches of 3-variate normals
+    diag = np.ones([3])
+
+    with self.test_session():
+      dist = ds.VectorExponentialDiag(mu, diag, validate_args=True)
+
+      mean = dist.mean()
+      self.assertAllEqual([2, 3], mean.get_shape())
+      self.assertAllClose(mu + diag, mean.eval())
+
+      n = int(1e4)
+      samps = dist.sample(n, seed=0).eval()
+      samps_centered = samps - samps.mean(axis=0)
+      cov_mat = array_ops.matrix_diag(diag).eval()**2
+      sample_cov = np.matmul(samps_centered.transpose([1, 2, 0]),
+                             samps_centered.transpose([1, 0, 2])) / n
+
+      self.assertAllClose(mu + diag, samps.mean(axis=0),
+                          atol=0.10, rtol=0.05)
+      self.assertAllClose([cov_mat, cov_mat], sample_cov,
+                          atol=0.10, rtol=0.05)
+
+  def testCovariance(self):
+    with self.test_session():
+      vex = ds.VectorExponentialDiag(
+          loc=array_ops.ones([2, 3], dtype=dtypes.float32))
+      self.assertAllClose(
+          np.diag(np.ones([3], dtype=np.float32)),
+          vex.covariance().eval())
+
+      vex = ds.VectorExponentialDiag(
+          loc=array_ops.ones([3], dtype=dtypes.float32),
+          scale_identity_multiplier=[3., 2.])
+      self.assertAllEqual([2], vex.batch_shape)
+      self.assertAllEqual([3], vex.event_shape)
+      self.assertAllClose(
+          np.array([[[3., 0, 0],
+                     [0, 3, 0],
+                     [0, 0, 3]],
+                    [[2, 0, 0],
+                     [0, 2, 0],
+                     [0, 0, 2]]])**2.,
+          vex.covariance().eval())
+
+      vex = ds.VectorExponentialDiag(
+          loc=array_ops.ones([3], dtype=dtypes.float32),
+          scale_diag=[[3., 2, 1], [4, 5, 6]])
+      self.assertAllEqual([2], vex.batch_shape)
+      self.assertAllEqual([3], vex.event_shape)
+      self.assertAllClose(
+          np.array([[[3., 0, 0],
+                     [0, 2, 0],
+                     [0, 0, 1]],
+                    [[4, 0, 0],
+                     [0, 5, 0],
+                     [0, 0, 6]]])**2.,
+          vex.covariance().eval())
+
+  def testVariance(self):
+    with self.test_session():
+      vex = ds.VectorExponentialDiag(
+          loc=array_ops.zeros([2, 3], dtype=dtypes.float32))
+      self.assertAllClose(
+          np.ones([3], dtype=np.float32),
+          vex.variance().eval())
+
+      vex = ds.VectorExponentialDiag(
+          loc=array_ops.ones([3], dtype=dtypes.float32),
+          scale_identity_multiplier=[3., 2.])
+      self.assertAllClose(
+          np.array([[3., 3, 3],
+                    [2., 2, 2]])**2.,
+          vex.variance().eval())
+
+      vex = ds.VectorExponentialDiag(
+          loc=array_ops.ones([3], dtype=dtypes.float32),
+          scale_diag=[[3., 2, 1],
+                      [4., 5, 6]])
+      self.assertAllClose(
+          np.array([[3., 2, 1],
+                    [4., 5, 6]])**2.,
+          vex.variance().eval())
+
+  def testStddev(self):
+    with self.test_session():
+      vex = ds.VectorExponentialDiag(
+          loc=array_ops.zeros([2, 3], dtype=dtypes.float32))
+      self.assertAllClose(
+          np.ones([3], dtype=np.float32),
+          vex.stddev().eval())
+
+      vex = ds.VectorExponentialDiag(
+          loc=array_ops.zeros([3], dtype=dtypes.float32),
+          scale_identity_multiplier=[3., 2.])
+      self.assertAllClose(
+          np.array([[3., 3, 3],
+                    [2., 2, 2]]),
+          vex.stddev().eval())
+
+      vex = ds.VectorExponentialDiag(
+          loc=array_ops.zeros([3], dtype=dtypes.float32),
+          scale_diag=[[3., 2, 1], [4, 5, 6]])
+      self.assertAllClose(
+          np.array([[3., 2, 1],
+                    [4., 5, 6]]),
+          vex.stddev().eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
index d9dc978f23d4dc35bcfc0b910853f7cee083cde4..9044aa2850ae35f29cd48b0c5f54aa948bea0408 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
@@ -351,6 +351,13 @@ class WishartCholeskyTest(test.TestCase):
                          (3, 3), dtype=np.float32)
                  })
 
+      with self.assertRaisesOpError("scale must be square"):
+        chol_w = distributions.WishartCholesky(
+            df=4.,
+            scale=np.array([[2., 3., 4.], [1., 2., 3.]], dtype=np.float32),
+            validate_args=True)
+        sess.run(chol_w.scale().eval())
+
       # Ensure no assertions.
       chol_w = distributions.WishartCholesky(
           df=df_deferred,
@@ -366,13 +373,18 @@ class WishartCholeskyTest(test.TestCase):
                feed_dict={df_deferred: 4,
                           chol_scale_deferred: np.ones((3, 3))})
 
+  def testStaticAsserts(self):
+    with self.test_session():
+      x = make_pd(1., 3)
+      chol_scale = chol(x)
+
       # Still has these assertions because they're resolveable at graph
       # construction
       with self.assertRaisesRegexp(ValueError, "cannot be less than"):
-        chol_w = distributions.WishartCholesky(
+        distributions.WishartCholesky(
             df=2, scale=chol_scale, validate_args=False)
-      with self.assertRaisesRegexp(TypeError, "not a floating-point type"):
-        chol_w = distributions.WishartCholesky(
+      with self.assertRaisesRegexp(TypeError, "Argument tril must have dtype"):
+        distributions.WishartCholesky(
             df=4.,
             scale=np.asarray(
                 chol_scale, dtype=np.int32),
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
index 1684a5fffe13fa8a074ae7ede0182a9d145300c7..f1879a828a326e85fc5eb3ff1f34e575e64ff5cf 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
@@ -27,6 +27,7 @@
 @@PowerTransform
 @@Sigmoid
 @@SigmoidCentered
+@@SinhArcsinh
 @@SoftmaxCentered
 @@Softplus
 """
@@ -48,6 +49,7 @@ from tensorflow.contrib.distributions.python.ops.bijectors.invert import *
 from tensorflow.contrib.distributions.python.ops.bijectors.power_transform import *
 from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid import *
 from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid_centered import *
+from tensorflow.contrib.distributions.python.ops.bijectors.sinh_arcsinh import *
 from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import *
 from tensorflow.contrib.distributions.python.ops.bijectors.softplus import *
 from tensorflow.python.ops.distributions.bijector import *
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
index 42865ed404d6ca317e79df8bd63ddeaf8283bbf2..6a5c37da161f0ed28c29b5d5043c1d07a63ee360 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
@@ -18,10 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import operator_pd_cholesky
-from tensorflow.contrib.distributions.python.ops import operator_pd_diag
-from tensorflow.contrib.distributions.python.ops import operator_pd_identity
-from tensorflow.contrib.distributions.python.ops import operator_pd_vdvt_update
+from tensorflow.contrib import linalg
+from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -29,7 +27,6 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import bijector
 
@@ -44,143 +41,6 @@ def _as_tensor(x, name):
   return None if x is None else ops.convert_to_tensor(x, name=name)
 
 
-# TODO(srvasude): Deprecate this class with a dedicated Linear Operator
-# corresponding to TriL + V D V.T.
-class _TriLPlusVDVTLightweightOperatorPD(object):
-  """Helper/hidden class fake an OperatorPD for TriL+VDV.T."""
-
-  def __init__(self, tril, v, diag=None, validate_args=False):
-    """Creates an instance of _TriLPlusVDVTLightweightOperatorPD.
-
-    WARNING: This object is not to be used outside of `Affine` where it is
-    currently being temporarily used for refactoring purposes.
-
-    Args:
-      tril: `Tensor` of shape `[B1,..,Bb, d, d]`.
-      v: `Tensor` of shape `[B1,...,Bb, d, k]`.
-      diag: `Tensor` of shape `[B1,...,Bb, k, k]` or None
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-    """
-    self._m = tril
-    self._v = v
-    self._validate_args = validate_args
-    self._inputs = [tril, v]
-    if diag is not None:
-      self._inputs += [diag]
-      self._d = operator_pd_diag.OperatorPDDiag(diag, verify_pd=validate_args)
-      self._d_inv = operator_pd_diag.OperatorPDDiag(1. / diag,
-                                                    verify_pd=validate_args)
-      return
-    if v.get_shape().is_fully_defined():
-      v_shape = v.get_shape().as_list()
-      id_shape = v_shape[:-2] + [v_shape[-1], v_shape[-1]]
-    else:
-      v_shape = array_ops.shape(v)
-      id_shape = array_ops.concat([v_shape[:-2], [v_shape[-1], v_shape[-1]]], 0)
-    self._d = operator_pd_identity.OperatorPDIdentity(
-        id_shape, v.dtype, verify_pd=self.validate_args)
-    self._d_inv = self._d
-
-  @property
-  def inputs(self):
-    return self._inputs
-
-  @property
-  def dtype(self):
-    return self._m.dtype.base_dtype
-
-  @property
-  def validate_args(self):
-    return self._validate_args
-
-  def rank(self):
-    """Returns `rank(self)`."""
-    return array_ops.rank(self._m)
-
-  def sqrt_matmul(self, x):
-    """Computes `matmul(self, x)`.
-
-    Doesn't actually do the sqrt! Named as such to agree with API.
-
-    Args:
-      x: `Tensor`
-
-    Returns:
-      self_times_x: `Tensor`
-    """
-    m_x = math_ops.matmul(self._m, x)
-    vt_x = math_ops.matmul(self._v, x, adjoint_a=True)
-    d_vt_x = self._d.matmul(vt_x)
-    v_d_vt_x = math_ops.matmul(self._v, d_vt_x)
-    return m_x + v_d_vt_x
-
-  def sqrt_solve(self, x):
-    """Computes `solve(self, x)`.
-
-    Doesn't actually do the sqrt! Named as such to agree with API.
-
-    To compute (M + V D V.T), we use the Woodbury matrix identity:
-      inv(M + V D V.T) = inv(M) - inv(M) V inv(C) V.T inv(M)
-    where,
-      C = inv(D) + V.T inv(M) V.
-    See: https://en.wikipedia.org/wiki/Woodbury_matrix_identity
-
-    Args:
-      x: `Tensor`
-
-    Returns:
-      inv_of_self_times_x: `Tensor`
-    """
-    minv_x = linalg_ops.matrix_triangular_solve(self._m, x)
-    vt_minv_x = math_ops.matmul(self._v, minv_x, transpose_a=True)
-    cinv_vt_minv_x = linalg_ops.matrix_solve(
-        self._woodbury_sandwiched_term(), vt_minv_x)
-    v_cinv_vt_minv_x = math_ops.matmul(self._v, cinv_vt_minv_x)
-    minv_v_cinv_vt_minv_x = linalg_ops.matrix_triangular_solve(
-        self._m, v_cinv_vt_minv_x)
-    return minv_x - minv_v_cinv_vt_minv_x
-
-  def sqrt_log_abs_det(self):
-    """Computes (log o abs o det)(X) for matrix X.
-
-    Doesn't actually do the sqrt! Named as such to agree with API.
-
-    To compute det(M + V D V.T), we use the matrix determinant lemma:
-      det(Tril + V D V.T) = det(C) det(D) det(M)
-    where C is defined as in `_inverse`, ie,
-      C = inv(D) + V.T inv(M) V.
-
-    See: https://en.wikipedia.org/wiki/Matrix_determinant_lemma
-
-    Returns:
-      log_abs_det: `Tensor`.
-    """
-    log_det_c = math_ops.log(math_ops.abs(
-        linalg_ops.matrix_determinant(self._woodbury_sandwiched_term())))
-    # Reduction is ok because we always prepad inputs to this class.
-    log_det_m = math_ops.reduce_sum(math_ops.log(math_ops.abs(
-        array_ops.matrix_diag_part(self._m))), axis=[-1])
-    return log_det_c + 2. * self._d.sqrt_log_abs_det() + log_det_m
-
-  def _woodbury_sandwiched_term(self):
-    """Computes the sandwiched term in the Woodbury identity.
-
-    Computes the "`C`" in the identity:
-       inv(M + V D V.T) = inv(M) - inv(M) V inv(C) V.T inv(M)
-    where,
-       C = inv(D) + V.T inv(M) V.
-
-    See: https://en.wikipedia.org/wiki/Woodbury_matrix_identity
-
-    Returns:
-      woodbury_sandwich_term: A `Tensor` to be used like `C`, above.
-    """
-    minv_v = linalg_ops.matrix_triangular_solve(self._m, self._v)
-    vt_minv_v = math_ops.matmul(self._v, minv_v, adjoint_a=True)
-    return self._d_inv.add_to_tensor(vt_minv_v)
-
-
 class Affine(bijector.Bijector):
   """Compute `Y = g(X; shift, scale) = scale @ X + shift`.
 
@@ -297,7 +157,7 @@ class Affine(bijector.Bijector):
         matrix. `scale_perturb_diag` has shape [N1, N2, ...  r], which
         represents an `r x r` diagonal matrix. When `None` low rank updates will
         take the form `scale_perturb_factor * scale_perturb_factor.T`.
-      event_ndims: Scalar `int32` `Tensor` indicating the number of dimensions
+      event_ndims: Scalar `int` `Tensor` indicating the number of dimensions
         associated with a particular draw from the distribution. Must be 0 or 1.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
@@ -310,10 +170,12 @@ class Affine(bijector.Bijector):
     self._graph_parents = []
     self._name = name
     self._validate_args = validate_args
+
     # Ambiguous definition of low rank update.
     if scale_perturb_diag is not None and scale_perturb_factor is None:
       raise ValueError("When scale_perturb_diag is specified, "
                        "scale_perturb_factor must be specified.")
+
     # Special case, only handling a scaled identity matrix. We don't know its
     # dimensions, so this is special cased.
     # We don't check identity_multiplier, since below we set it to 1. if all
@@ -321,38 +183,69 @@ class Affine(bijector.Bijector):
     self._is_only_identity_multiplier = (scale_tril is None and
                                          scale_diag is None and
                                          scale_perturb_factor is None)
-    # When no args are specified, pretend the scale matrix is the identity
-    # matrix.
-    if self._is_only_identity_multiplier and scale_identity_multiplier is None:
-      scale_identity_multiplier = 1.
+
     with self._name_scope("init", values=[
         shift, scale_identity_multiplier, scale_diag, scale_tril,
-        scale_perturb_diag, scale_perturb_factor, event_ndims]):
+        scale_perturb_diag, scale_perturb_factor]):
       event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      if validate_args:
-        is_less_than_two = check_ops.assert_less(
-            event_ndims, 2,
-            message="event_ndims must be 0 or 1")
-        event_ndims = control_flow_ops.with_dependencies(
-            [is_less_than_two], event_ndims)
-      self._shift = _as_tensor(shift, "shift")
-      # self._create_scale_operator returns an OperatorPD in all cases except if
-      # self._is_only_identity_multiplier; in which case it returns a scalar
-      # Tensor.
-      self._scale = self._create_scale_operator(
+      event_ndims_const = tensor_util.constant_value(event_ndims)
+      if event_ndims_const is not None and event_ndims_const not in (0, 1):
+        raise ValueError("event_ndims(%s) was not 0 or 1" % event_ndims_const)
+      else:
+        if validate_args:
+          # Shape tool will catch if event_ndims is negative.
+          event_ndims = control_flow_ops.with_dependencies(
+              [check_ops.assert_less(
+                  event_ndims, 2, message="event_ndims must be 0 or 1")],
+              event_ndims)
+
+      # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
+      dtype = dtypes.float32
+
+      if shift is not None:
+        shift = ops.convert_to_tensor(shift, name="shift")
+        dtype = shift.dtype.base_dtype
+      self._shift = shift
+
+      # When no args are specified, pretend the scale matrix is the identity
+      # matrix.
+      if (self._is_only_identity_multiplier and
+          scale_identity_multiplier is None):
+        scale_identity_multiplier = ops.convert_to_tensor(1., dtype=dtype)
+
+      # self._create_scale_operator returns a LinearOperator in all cases
+      # except if self._is_only_identity_multiplier; in which case it
+      # returns a scalar Tensor.
+      scale = self._create_scale_operator(
           identity_multiplier=scale_identity_multiplier,
           diag=scale_diag,
           tril=scale_tril,
           perturb_diag=scale_perturb_diag,
           perturb_factor=scale_perturb_factor,
-          event_ndims=event_ndims,
+          shift=shift,
           validate_args=validate_args)
-      if (self._shift is not None and
-          self._shift.dtype.base_dtype != self._scale.dtype.base_dtype):
-        raise TypeError("shift.dtype({}) does not match scale.dtype({})".format(
-            self._shift.dtype, self._scale.dtype))
+
+      if scale.dtype is not None:
+        dtype = scale.dtype.base_dtype
+
+      if scale is not None and not self._is_only_identity_multiplier:
+        if (shift is not None and
+            shift.dtype.base_dtype != scale.dtype.base_dtype):
+          raise TypeError(
+              "shift.dtype({}) is incompatible with scale.dtype({}).".format(
+                  shift.dtype, scale.dtype))
+
+        if scale.tensor_rank is not None:
+          batch_ndims = scale.tensor_rank - 2
+        else:
+          batch_ndims = scale.tensor_rank_tensor() - 2
+      else:
+        # We won't need shape inference when scale is None or when scale is a
+        # scalar.
+        batch_ndims = 0
+      self._scale = scale
       self._shaper = _DistributionShape(
-          batch_ndims=self._infer_batch_ndims(),
+          batch_ndims=batch_ndims,
           event_ndims=event_ndims,
           validate_args=validate_args)
       super(Affine, self).__init__(
@@ -360,15 +253,15 @@ class Affine(bijector.Bijector):
           graph_parents=(
               [event_ndims] +
               [self._scale] if tensor_util.is_tensor(self._scale)
-              else self._scale.inputs +
+              else self._scale.graph_parents +
               [self._shift] if self._shift is not None else []),
           is_constant_jacobian=True,
-          dtype=self._scale.dtype,
+          dtype=dtype,
           validate_args=validate_args,
           name=name)
 
   def _create_scale_operator(self, identity_multiplier, diag, tril,
-                             perturb_diag, perturb_factor, event_ndims,
+                             perturb_diag, perturb_factor, shift,
                              validate_args):
     """Construct `scale` from various components.
 
@@ -384,14 +277,13 @@ class Affine(bijector.Bijector):
       perturb_diag: Floating-point `Tensor` representing the diagonal matrix of
         the low rank update.
       perturb_factor: Floating-point `Tensor` representing factor matrix.
-      event_ndims: Scalar `int32` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution. Must be 0 or 1
+      shift: Floating-point `Tensor` representing `shift in `scale @ X + shift`.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
 
     Returns:
       scale. In the case of scaling by a constant, scale is a
-      floating point `Tensor`. Otherwise, scale is an `OperatorPD`.
+      floating point `Tensor`. Otherwise, scale is a `LinearOperator`.
 
     Raises:
       ValueError: if all of `tril`, `diag` and `identity_multiplier` are `None`.
@@ -402,119 +294,44 @@ class Affine(bijector.Bijector):
     perturb_diag = _as_tensor(perturb_diag, "perturb_diag")
     perturb_factor = _as_tensor(perturb_factor, "perturb_factor")
 
-    identity_multiplier = self._maybe_validate_identity_multiplier(
-        identity_multiplier, validate_args)
-
+    # If possible, use the low rank update to infer the shape of
+    # the identity matrix, when scale represents a scaled identity matrix
+    # with a low rank update.
+    shape_hint = None
     if perturb_factor is not None:
-      perturb_factor = self._process_matrix(
-          perturb_factor, min_rank=2, event_ndims=event_ndims)
-
-    if perturb_diag is not None:
-      perturb_diag = self._process_matrix(
-          perturb_diag, min_rank=1, event_ndims=event_ndims)
-
-    # The following if-statments are ordered by increasingly stronger
-    # assumptions in the base matrix, i.e., we process in the order:
-    # TriL, Diag, Identity.
-
-    if tril is not None:
-      tril = self._preprocess_tril(
-          identity_multiplier, diag, tril, event_ndims)
-      if perturb_factor is None:
-        return operator_pd_cholesky.OperatorPDCholesky(
-            tril, verify_pd=validate_args)
-      return _TriLPlusVDVTLightweightOperatorPD(
-          tril=tril, v=perturb_factor, diag=perturb_diag,
-          validate_args=validate_args)
+      shape_hint = distribution_util.dimension_size(perturb_factor, axis=-2)
 
-    if diag is not None:
-      diag = self._preprocess_diag(identity_multiplier, diag, event_ndims)
-      if perturb_factor is None:
-        return operator_pd_diag.OperatorPDSqrtDiag(
-            diag, verify_pd=validate_args)
-      return operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate(
-          operator=operator_pd_diag.OperatorPDDiag(
-              diag, verify_pd=validate_args),
-          v=perturb_factor,
-          diag=perturb_diag,
-          verify_pd=validate_args)
-
-    if identity_multiplier is not None:
-      if perturb_factor is None:
-        return identity_multiplier
-      # Infer the shape from the V and D.
-      v_shape = array_ops.shape(perturb_factor)
-      identity_shape = array_ops.concat([v_shape[:-1], [v_shape[-2]]], 0)
-      scaled_identity = operator_pd_identity.OperatorPDIdentity(
-          identity_shape,
-          perturb_factor.dtype.base_dtype,
-          scale=identity_multiplier,
-          verify_pd=validate_args)
-      return operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate(
-          operator=scaled_identity,
-          v=perturb_factor,
-          diag=perturb_diag,
-          verify_pd=validate_args)
-
-    raise ValueError("One of tril, diag and/or identity_multiplier must be "
-                     "specified.")
-
-  def _maybe_validate_identity_multiplier(self, identity_multiplier,
-                                          validate_args):
-    """Check that the init arg `identity_multiplier` is valid."""
-    if identity_multiplier is None or not validate_args:
-      return identity_multiplier
-    if validate_args:
-      identity_multiplier = control_flow_ops.with_dependencies(
-          [check_ops.assert_positive(identity_multiplier)],
-          identity_multiplier)
-    return identity_multiplier
-
-  def _preprocess_tril(self, identity_multiplier, diag, tril, event_ndims):
-    """Helper to preprocess a lower triangular matrix."""
-    tril = array_ops.matrix_band_part(tril, -1, 0)  # Zero out TriU.
-    if identity_multiplier is None and diag is None:
-      return self._process_matrix(tril, min_rank=2, event_ndims=event_ndims)
-    new_diag = array_ops.matrix_diag_part(tril)
-    if identity_multiplier is not None:
-      new_diag += identity_multiplier
-    if diag is not None:
-      new_diag += diag
-    tril = array_ops.matrix_set_diag(tril, new_diag)
-    return self._process_matrix(tril, min_rank=2, event_ndims=event_ndims)
-
-  def _preprocess_diag(self, identity_multiplier, diag, event_ndims):
-    """Helper to preprocess a diagonal matrix."""
-    if identity_multiplier is not None:
-      diag += identity_multiplier
-    return self._process_matrix(diag, min_rank=1, event_ndims=event_ndims)
-
-  def _process_matrix(self, matrix, min_rank, event_ndims):
-    """Helper to __init__ which gets matrix in batch-ready form."""
-    # Pad the matrix so that matmul works in the case of a matrix and vector
-    # input. Keep track if the matrix was padded, to distinguish between a
-    # rank 3 tensor and a padded rank 2 tensor.
-    # TODO(srvasude): Remove side-effects from functions. Its currently unbroken
-    # but error-prone since the function call order may change in the future.
-    self._rank_two_event_ndims_one = math_ops.logical_and(
-        math_ops.equal(array_ops.rank(matrix), min_rank),
-        math_ops.equal(event_ndims, 1))
-    left = array_ops.where(self._rank_two_event_ndims_one, 1, 0)
-    pad = array_ops.concat(
-        [array_ops.ones(
-            [left], dtype=dtypes.int32), array_ops.shape(matrix)],
-        0)
-    return array_ops.reshape(matrix, pad)
-
-  def _infer_batch_ndims(self):
-    """Return batch_ndims."""
     if self._is_only_identity_multiplier:
-      return 0
-    # The real batch dims is one less when we pad in the case of event_ndims =
-    # 1, and the rank of the underlying scale being 2. This allows us to have
-    # non-negative sample dims.
-    return (self._scale.rank() - 2 -
-            array_ops.where(self._rank_two_event_ndims_one, 1, 0))
+      if validate_args:
+        return control_flow_ops.with_dependencies(
+            [check_ops.assert_none_equal(
+                identity_multiplier,
+                array_ops.zeros([], identity_multiplier.dtype),
+                ["identity_multiplier should be non-zero."])],
+            identity_multiplier)
+      return identity_multiplier
+
+    scale = distribution_util.make_tril_scale(
+        loc=shift,
+        scale_tril=tril,
+        scale_diag=diag,
+        scale_identity_multiplier=identity_multiplier,
+        validate_args=validate_args,
+        assert_positive=False,
+        shape_hint=shape_hint)
+
+    if perturb_factor is not None:
+      return linalg.LinearOperatorUDVHUpdate(
+          scale,
+          u=perturb_factor,
+          diag_update=perturb_diag,
+          is_diag_update_positive=perturb_diag is None,
+          is_non_singular=True,  # Implied by is_positive_definite=True.
+          is_self_adjoint=True,
+          is_positive_definite=True,
+          is_square=True)
+
+    return scale
 
   @property
   def shift(self):
@@ -524,10 +341,6 @@ class Affine(bijector.Bijector):
   @property
   def scale(self):
     """The `scale` `LinearOperator` in `Y = scale @ X + shift`."""
-    # TODO(srvasude): Remove this exception once TriLPlusVDVT is properly
-    # implemented.
-    if isinstance(self._scale, _TriLPlusVDVTLightweightOperatorPD):
-      raise NotImplementedError("Cannot access scale when Tril+VDV.T.")
     return self._scale
 
   def _forward(self, x):
@@ -537,11 +350,15 @@ class Affine(bijector.Bijector):
       if self.shift is not None:
         return y + self.shift
       return  y
-    y, sample_shape = self._shaper.make_batch_of_event_sample_matrices(y)
-    y = self._scale.sqrt_matmul(y)
-    y = self._shaper.undo_make_batch_of_event_sample_matrices(y, sample_shape)
+    y, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
+        y, expand_batch_dim=False)
+    with ops.control_dependencies(self._maybe_check_scale() if
+                                  self.validate_args else []):
+      y = self.scale.matmul(y)
+    y = self._shaper.undo_make_batch_of_event_sample_matrices(
+        y, sample_shape, expand_batch_dim=False)
     if self.shift is not None:
-      return y + self.shift
+      y += self.shift
     return y
 
   def _inverse(self, y):
@@ -550,9 +367,13 @@ class Affine(bijector.Bijector):
       x -= self.shift
     if self._is_only_identity_multiplier:
       return x / self._scale
-    x, sample_shape = self._shaper.make_batch_of_event_sample_matrices(x)
-    x = self._scale.sqrt_solve(x)
-    x = self._shaper.undo_make_batch_of_event_sample_matrices(x, sample_shape)
+
+    x, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
+        x, expand_batch_dim=False)
+    # Solve fails if the op is singular so we may safely skip this assertion.
+    x = self.scale.solve(x)
+    x = self._shaper.undo_make_batch_of_event_sample_matrices(
+        x, sample_shape, expand_batch_dim=False)
     return x
 
   def _inverse_log_det_jacobian(self, y):
@@ -560,12 +381,17 @@ class Affine(bijector.Bijector):
 
   def _forward_log_det_jacobian(self, x):
     if self._is_only_identity_multiplier:
-      # TODO(jvdillon): We don't pad in this case and instead let the fldj be
-      # applied via broadcast.
+      # We don't pad in this case and instead let the fldj be applied
+      # via broadcast.
       d = math_ops.cast(array_ops.shape(x)[-1], dtype=self._scale.dtype)
+      one = ops.convert_to_tensor(1., self._scale.dtype)
       return math_ops.log(math_ops.abs(self._scale)) * array_ops.where(
-          math_ops.equal(self._shaper.event_ndims, 0), 1., d)
-    fldj = self._scale.sqrt_log_abs_det()
-    # We need to squeeze off the padded dimension.
-    start = array_ops.where(self._rank_two_event_ndims_one, 1, 0)
-    return array_ops.reshape(fldj, array_ops.shape(fldj)[start:])
+          math_ops.equal(self._shaper.event_ndims, 0), one, d)
+    return self.scale.log_abs_determinant()
+
+  def _maybe_check_scale(self):
+    try:
+      return [self.scale.assert_non_singular()]
+    except NotImplementedError:
+      pass
+    return []
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3cf03c24612f5c618c71c0a8615f272acdf2d10
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
@@ -0,0 +1,29 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""SinhArcsinh bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import
+from tensorflow.contrib.distributions.python.ops.bijectors.sinh_arcsinh_impl import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = ["SinhArcsinh"]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..4007e605b657ec8ff6f50b2597ff6c9896cdd0b6
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh_impl.py
@@ -0,0 +1,153 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""SinhArcsinh bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
+
+__all__ = [
+    "SinhArcsinh",
+]
+
+
+class SinhArcsinh(bijector.Bijector):
+  """Compute `Y = g(X) = Sinh( (Arcsinh(X) + skewness) * tailweight )`.
+
+  For `skewness in (-inf, inf)` and `tailweight in (0, inf)`, this
+  transformation is a
+  diffeomorphism of the real line `(-inf, inf)`.  The inverse transform is
+  `X = g^{-1}(Y) = Sinh( ArcSinh(Y) / tailweight - skewness )`.
+
+  The `SinhArcsinh` transformation of the Normal is described in
+  [Sinh-arcsinh distributions](https://www.jstor.org/stable/27798865)
+  This Bijector allows a similar transformation of any distribution supported on
+  `(-inf, inf)`.
+
+  #### Meaning of the parameters
+
+  * If `skewness = 0` and `tailweight = 1`, this transform is the identity.
+  * Positive (negative) `skewness` leads to positive (negative) skew.
+    * positive skew means, for unimodal `X` centered at zero, the mode of `Y` is
+      "tilted" to the right.
+    * positive skew means positive values of `Y` become more likely, and
+      negative values become less likely.
+  * Larger (smaller) `tailweight` leads to fatter (thinner) tails.
+    * Fatter tails mean larger values of `|Y|` become more likely.
+    * If `X` is a unit Normal, `tailweight < 1` leads to a distribution that is
+      "flat" around `Y = 0`, and a very steep drop-off in the tails.
+    * If `X` is a unit Normal, `tailweight > 1` leads to a distribution more
+      peaked at the mode with heavier tails.
+
+  To see the argument about the tails, note that for `|X| >> 1` and
+  `|X| >> (|skewness| * tailweight)**tailweight`, we have
+  `Y approx 0.5 X**tailweight e**(sign(X) skewness * tailweight)`.
+  """
+
+  def __init__(self,
+               skewness=0.,
+               tailweight=1.,
+               event_ndims=0,
+               validate_args=False,
+               name="sinh_arcsinh"):
+    """Instantiates the `SinhArcsinh` bijector.
+
+    Args:
+      skewness:  Skewness parameter.  Float-type `Tensor`.
+      tailweight:  Tailweight parameter.  Positive `Tensor` of same `dtype` as
+        `skewness`
+        and broadcastable `shape`.
+      event_ndims: Python scalar indicating the number of dimensions associated
+        with a particular draw from the distribution.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+    with self._name_scope("init", values=[skewness, tailweight]):
+      self._skewness = ops.convert_to_tensor(skewness, name="skewness")
+      self._tailweight = ops.convert_to_tensor(tailweight, name="tailweight")
+      check_ops.assert_same_float_dtype([self._skewness, self._tailweight])
+      if validate_args:
+        self._tailweight = control_flow_ops.with_dependencies([
+            check_ops.assert_positive(
+                self._tailweight,
+                message="Argument tailweight was not positive")
+        ], self._tailweight)
+    super(SinhArcsinh, self).__init__(
+        event_ndims=event_ndims, validate_args=validate_args, name=name)
+
+  @property
+  def skewness(self):
+    """The `skewness` in: `Y  = Sinh((Arcsinh(X) + skewness) * tailweight)`."""
+    return self._skewness
+
+  @property
+  def tailweight(self):
+    """The `tailweight` in: `Y = Sinh((Arcsinh(X) + skewness) * tailweight)`."""
+    return self._tailweight
+
+  def _forward(self, x):
+    return math_ops.sinh((math_ops.asinh(x) + self.skewness) * self.tailweight)
+
+  def _inverse(self, y):
+    return math_ops.sinh(math_ops.asinh(y) / self.tailweight - self.skewness)
+
+  def _inverse_log_det_jacobian(self, y):
+    # x = sinh(arcsinh(y) / tailweight - skewness)
+    # Using sinh' = cosh, arcsinh'(y) = 1 / sqrt(y**2 + 1),
+    # dx/dy
+    # = cosh(arcsinh(y) / tailweight - skewness)
+    #     / (tailweight * sqrt(y**2 + 1))
+    # Note that this could potentially return a NaN due to the log1p(x**2)
+    # term since, for instance, this will only be valid for float32 til ~1.7e19.
+    # This is in contrast with the forward/inverse passes since an arcsinh
+    # transformation is done first, which is valid until the maximum float
+    # value.
+    # TODO(srvasude): It might be possible to extend the range of validity to
+    # match that of forward/inverse by approximating log1p(y**2) by 2 * log(y).
+    event_dims = self._event_dims_tensor(y)
+    return math_ops.reduce_sum(
+        math_ops.log(math_ops.cosh(
+            math_ops.asinh(y) / self.tailweight - self.skewness)) -
+        math_ops.log(self.tailweight) - 0.5 * math_ops.log1p(y**2),
+        axis=event_dims)
+
+  def _forward_log_det_jacobian(self, x):
+    # y = sinh((arcsinh(x) + skewness) * tailweight)
+    # Using sinh' = cosh, arcsinh'(x) = 1 / sqrt(x**2 + 1),
+    # dy/dx
+    # = cosh((arcsinh(x) + skewness) * tailweight) * tailweight / sqrt(x**2 + 1)
+    # Note that this could potentially return a NaN due to the log1p(x**2)
+    # term since, for instance, this will only be valid for float32 til ~1.7e19.
+    # This is in contrast with the forward/inverse passes since an arcsinh
+    # transformation is done first, which is valid until the maximum float
+    # value.
+    # TODO(srvasude): It might be possible to extend the range of validity to
+    # match that of forward/inverse by approximating log1p(y**2) by 2 * log(y).
+    event_dims = self._event_dims_tensor(x)
+    return math_ops.reduce_sum(
+        math_ops.log(math_ops.cosh(
+            (math_ops.asinh(x) + self.skewness) * self.tailweight)) +
+        math_ops.log(self.tailweight) - 0.5 * math_ops.log1p(x**2),
+        axis=event_dims)
diff --git a/tensorflow/contrib/distributions/python/ops/binomial.py b/tensorflow/contrib/distributions/python/ops/binomial.py
index 9304a56491ece71fe9d8151a28a3f087882222a2..6a1bb39ab28218a411bdf4329965186bcf32bf30 100644
--- a/tensorflow/contrib/distributions/python/ops/binomial.py
+++ b/tensorflow/contrib/distributions/python/ops/binomial.py
@@ -196,7 +196,7 @@ class Binomial(distribution.Distribution):
 
   @property
   def probs(self):
-    """Probability of of drawing a `1`."""
+    """Probability of drawing a `1`."""
     return self._probs
 
   def _batch_shape_tensor(self):
@@ -272,13 +272,11 @@ class Binomial(distribution.Distribution):
             message="total_count cannot contain fractional components."),
     ], total_count)
 
-  def _maybe_assert_valid_sample(self, counts, check_integer=True):
+  def _maybe_assert_valid_sample(self, counts):
     """Check counts for proper shape, values, then return tensor version."""
     if not self.validate_args:
       return counts
-
-    counts = distribution_util.embed_check_nonnegative_discrete(
-        counts, check_integer=check_integer)
+    counts = distribution_util.embed_check_nonnegative_integer_form(counts)
     return control_flow_ops.with_dependencies([
         check_ops.assert_less_equal(
             counts, self.total_count,
diff --git a/tensorflow/contrib/distributions/python/ops/distribution_util.py b/tensorflow/contrib/distributions/python/ops/distribution_util.py
index 5e3b42dd2aa5e85fab23820fc63a69be77c3ac27..a0ff2778aa69a259587356a4869fa0d5ce72bc66 100644
--- a/tensorflow/contrib/distributions/python/ops/distribution_util.py
+++ b/tensorflow/contrib/distributions/python/ops/distribution_util.py
@@ -19,22 +19,153 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib import linalg
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops.distributions import util
 from tensorflow.python.ops.distributions.util import *  # pylint: disable=wildcard-import
 
 
-# TODO(b/35290280): Add unit-tests.
-def make_diag_scale(loc, scale_diag, scale_identity_multiplier,
-                    validate_args, assert_positive, name=None):
-  """Creates a LinOp from `scale_diag`, `scale_identity_multiplier` kwargs."""
-  def _convert_to_tensor(x, name):
-    return None if x is None else ops.convert_to_tensor(x, name=name)
+def _convert_to_tensor(x, name):
+  return None if x is None else ops.convert_to_tensor(x, name=name)
+
+
+def make_tril_scale(
+    loc=None,
+    scale_tril=None,
+    scale_diag=None,
+    scale_identity_multiplier=None,
+    shape_hint=None,
+    validate_args=False,
+    assert_positive=False,
+    name=None):
+  """Creates a LinOp representing a lower triangular matrix.
+
+  Args:
+    loc: Floating-point `Tensor`. This is used for inferring shape in the case
+      where only `scale_identity_multiplier` is set.
+    scale_tril: Floating-point `Tensor` representing the diagonal matrix.
+      `scale_diag` has shape [N1, N2, ...  k, k], which represents a k x k
+      lower triangular matrix.
+      When `None` no `scale_tril` term is added to the LinOp.
+      The upper triangular elements above the diagonal are ignored.
+    scale_diag: Floating-point `Tensor` representing the diagonal matrix.
+      `scale_diag` has shape [N1, N2, ...  k], which represents a k x k
+      diagonal matrix.
+      When `None` no diagonal term is added to the LinOp.
+    scale_identity_multiplier: floating point rank 0 `Tensor` representing a
+      scaling done to the identity matrix.
+      When `scale_identity_multiplier = scale_diag = scale_tril = None` then
+      `scale += IdentityMatrix`. Otherwise no scaled-identity-matrix is added
+      to `scale`.
+    shape_hint: scalar integer `Tensor` representing a hint at the dimension of
+      the identity matrix when only `scale_identity_multiplier` is set.
+    validate_args: Python `bool` indicating whether arguments should be
+      checked for correctness.
+    assert_positive: Python `bool` indicating whether LinOp should be checked
+      for being positive definite.
+    name: Python `str` name given to ops managed by this object.
+
+  Returns:
+    `LinearOperator` representing a lower triangular matrix.
+
+  Raises:
+    ValueError:  If only `scale_identity_multiplier` is set and `loc` and
+      `shape_hint` are both None.
+  """
+
+  def _maybe_attach_assertion(x):
+    if not validate_args:
+      return x
+    if assert_positive:
+      return control_flow_ops.with_dependencies([
+          check_ops.assert_positive(
+              array_ops.matrix_diag_part(x),
+              message="diagonal part must be positive"),
+      ], x)
+    return control_flow_ops.with_dependencies([
+        check_ops.assert_none_equal(
+            array_ops.matrix_diag_part(x),
+            array_ops.zeros([], x.dtype),
+            message="diagonal part must be non-zero"),
+    ], x)
+
+  with ops.name_scope(name, "make_tril_scale",
+                      values=[loc, scale_diag, scale_identity_multiplier]):
+
+    loc = _convert_to_tensor(loc, name="loc")
+    scale_tril = _convert_to_tensor(scale_tril, name="scale_tril")
+    scale_diag = _convert_to_tensor(scale_diag, name="scale_diag")
+    scale_identity_multiplier = _convert_to_tensor(
+        scale_identity_multiplier,
+        name="scale_identity_multiplier")
+
+  if scale_tril is not None:
+    scale_tril = array_ops.matrix_band_part(scale_tril, -1, 0)  # Zero out TriU.
+    tril_diag = array_ops.matrix_diag_part(scale_tril)
+    if scale_diag is not None:
+      tril_diag += scale_diag
+    if scale_identity_multiplier is not None:
+      tril_diag += scale_identity_multiplier[..., array_ops.newaxis]
+
+    scale_tril = array_ops.matrix_set_diag(scale_tril, tril_diag)
+
+    return linalg.LinearOperatorTriL(
+        tril=_maybe_attach_assertion(scale_tril),
+        is_non_singular=True,
+        is_self_adjoint=False,
+        is_positive_definite=assert_positive)
+
+  return make_diag_scale(
+      loc=loc,
+      scale_diag=scale_diag,
+      scale_identity_multiplier=scale_identity_multiplier,
+      shape_hint=shape_hint,
+      validate_args=validate_args,
+      assert_positive=assert_positive,
+      name=name)
+
+
+def make_diag_scale(
+    loc=None,
+    scale_diag=None,
+    scale_identity_multiplier=None,
+    shape_hint=None,
+    validate_args=False,
+    assert_positive=False,
+    name=None):
+  """Creates a LinOp representing a diagonal matrix.
+
+  Args:
+    loc: Floating-point `Tensor`. This is used for inferring shape in the case
+      where only `scale_identity_multiplier` is set.
+    scale_diag: Floating-point `Tensor` representing the diagonal matrix.
+      `scale_diag` has shape [N1, N2, ...  k], which represents a k x k
+      diagonal matrix.
+      When `None` no diagonal term is added to the LinOp.
+    scale_identity_multiplier: floating point rank 0 `Tensor` representing a
+      scaling done to the identity matrix.
+      When `scale_identity_multiplier = scale_diag = scale_tril = None` then
+      `scale += IdentityMatrix`. Otherwise no scaled-identity-matrix is added
+      to `scale`.
+    shape_hint: scalar integer `Tensor` representing a hint at the dimension of
+      the identity matrix when only `scale_identity_multiplier` is set.
+    validate_args: Python `bool` indicating whether arguments should be
+      checked for correctness.
+    assert_positive: Python `bool` indicating whether LinOp should be checked
+      for being positive definite.
+    name: Python `str` name given to ops managed by this object.
+
+  Returns:
+    `LinearOperator` representing a lower triangular matrix.
+
+  Raises:
+    ValueError:  If only `scale_identity_multiplier` is set and `loc` and
+      `shape_hint` are both None.
+  """
 
   def _maybe_attach_assertion(x):
     if not validate_args:
@@ -67,23 +198,24 @@ def make_diag_scale(loc, scale_diag, scale_identity_multiplier,
           is_self_adjoint=True,
           is_positive_definite=assert_positive)
 
-    # TODO(b/35290280): Consider inferring shape from scale_perturb_factor.
-    if loc is None:
+    if loc is None and shape_hint is None:
       raise ValueError(
-          "Cannot infer `event_shape` unless `loc` is specified.")
+          "Cannot infer `event_shape` unless `loc` or "
+          "`shape_hint` is specified.")
 
-    num_rows = util.dimension_size(loc, -1)
+    if shape_hint is None:
+      shape_hint = loc.shape[-1]
 
     if scale_identity_multiplier is None:
       return linalg.LinearOperatorIdentity(
-          num_rows=num_rows,
+          num_rows=shape_hint,
           dtype=loc.dtype.base_dtype,
           is_self_adjoint=True,
           is_positive_definite=True,
           assert_proper_shapes=validate_args)
 
     return linalg.LinearOperatorScaledIdentity(
-        num_rows=num_rows,
+        num_rows=shape_hint,
         multiplier=_maybe_attach_assertion(scale_identity_multiplier),
         is_non_singular=True,
         is_self_adjoint=True,
@@ -170,12 +302,33 @@ def prefer_static_broadcast_shape(
       statically), or as a `Tensor`.
   """
   with ops.name_scope(name, values=[shape1, shape2]):
-    if (tensor_util.constant_value(shape1) is not None and
-        tensor_util.constant_value(shape2) is not None):
-      return array_ops.broadcast_static_shape(
-          tensor_shape.TensorShape(tensor_util.constant_value(shape1)),
-          tensor_shape.TensorShape(tensor_util.constant_value(shape2)))
-    return array_ops.broadcast_dynamic_shape(shape1, shape2)
+    def make_shape_tensor(x):
+      return ops.convert_to_tensor(x, name="shape", dtype=dtypes.int32)
+
+    def get_tensor_shape(s):
+      if isinstance(s, tensor_shape.TensorShape):
+        return s
+      s_ = tensor_util.constant_value(make_shape_tensor(s))
+      if s_ is not None:
+        return tensor_shape.TensorShape(s_)
+      return None
+
+    def get_shape_tensor(s):
+      if not isinstance(s, tensor_shape.TensorShape):
+        return make_shape_tensor(s)
+      if s.is_fully_defined():
+        return make_shape_tensor(s.as_list())
+      raise ValueError("Cannot broadcast from partially "
+                       "defined `TensorShape`.")
+
+    shape1_ = get_tensor_shape(shape1)
+    shape2_ = get_tensor_shape(shape2)
+    if shape1_ is not None and shape2_ is not None:
+      return array_ops.broadcast_static_shape(shape1_, shape2_)
+
+    shape1_ = get_shape_tensor(shape1)
+    shape2_ = get_shape_tensor(shape2)
+    return array_ops.broadcast_dynamic_shape(shape1_, shape2_)
 
 
 def is_diagonal_scale(scale):
diff --git a/tensorflow/contrib/distributions/python/ops/estimator.py b/tensorflow/contrib/distributions/python/ops/estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b53338c4542c75d3977c075b7750c780080ac48
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/estimator.py
@@ -0,0 +1,185 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions to bridge `Distribution`s and `tf.contrib.learn.estimator` APIs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.learn.python.learn.estimators.head import _compute_weighted_loss
+from tensorflow.contrib.learn.python.learn.estimators.head import _RegressionHead
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+
+
+__all__ = [
+    "estimator_head_distribution_regression",
+]
+
+
+def estimator_head_distribution_regression(make_distribution_fn,
+                                           label_dimension=1,
+                                           logits_dimension=None,
+                                           label_name=None,
+                                           weight_column_name=None,
+                                           enable_centered_bias=False,
+                                           head_name=None):
+  """Creates a `Head` for regression under a generic distribution.
+
+  Args:
+    make_distribution_fn: Python `callable` which returns a `tf.Distribution`
+      instance created using only logits.
+    label_dimension: Number of regression labels per example. This is the size
+      of the last dimension of the labels `Tensor` (typically, this has shape
+      `[batch_size, label_dimension]`).
+    logits_dimension: Number of logits per example. This is the size of the last
+      dimension of the logits `Tensor` (typically, this has shape
+      `[batch_size, logits_dimension]`).
+      Default value: `label_dimension`.
+    label_name: Python `str`, name of the key in label `dict`. Can be `None` if
+      label is a `Tensor` (single headed models).
+    weight_column_name: Python `str` defining feature column name representing
+      weights. It is used to down weight or boost examples during training. It
+      will be multiplied by the loss of the example.
+    enable_centered_bias: Python `bool`. If `True`, estimator will learn a
+      centered bias variable for each class. Rest of the model structure learns
+      the residual after centered bias.
+    head_name: Python `str`, name of the head. Predictions, summary and metrics
+      keys are suffixed by `"/" + head_name` and the default variable scope is
+      `head_name`.
+
+  Returns:
+    An instance of `Head` for generic regression.
+  """
+  return _DistributionRegressionHead(
+      make_distribution_fn=make_distribution_fn,
+      label_dimension=label_dimension,
+      logits_dimension=logits_dimension,
+      label_name=label_name,
+      weight_column_name=weight_column_name,
+      enable_centered_bias=enable_centered_bias,
+      head_name=head_name)
+
+
+class _DistributionRegressionHead(_RegressionHead):
+  """Creates a _RegressionHead instance from an arbitray `Distribution`."""
+
+  def __init__(self,
+               make_distribution_fn,
+               label_dimension,
+               logits_dimension=None,
+               label_name=None,
+               weight_column_name=None,
+               enable_centered_bias=False,
+               head_name=None):
+    """`Head` for regression.
+
+    Args:
+      make_distribution_fn: Python `callable` which returns a `tf.Distribution`
+        instance created using only logits.
+      label_dimension: Number of regression labels per example. This is the
+        size of the last dimension of the labels `Tensor` (typically, this has
+        shape `[batch_size, label_dimension]`).
+      logits_dimension: Number of logits per example. This is the size of the
+        last dimension of the logits `Tensor` (typically, this has shape
+        `[batch_size, logits_dimension]`).
+        Default value: `label_dimension`.
+      label_name: Python `str`, name of the key in label `dict`. Can be `None`
+        if label is a tensor (single headed models).
+      weight_column_name: Python `str` defining feature column name representing
+        weights. It is used to down weight or boost examples during training. It
+        will be multiplied by the loss of the example.
+      enable_centered_bias: Python `bool`. If `True`, estimator will learn a
+        centered bias variable for each class. Rest of the model structure
+        learns the residual after centered bias.
+      head_name: Python `str`, name of the head. Predictions, summary and
+        metrics keys are suffixed by `"/" + head_name` and the default variable
+        scope is `head_name`.
+
+    Raises:
+      TypeError: if `make_distribution_fn` is not `callable`.
+    """
+    if not callable(make_distribution_fn):
+      raise TypeError("`make_distribution_fn` must be a callable function.")
+
+    self._distributions = {}
+    self._make_distribution_fn = make_distribution_fn
+
+    def static_value(x):
+      """Returns the static value of a `Tensor` or `None`."""
+      return tensor_util.constant_value(ops.convert_to_tensor(x))
+
+    def concat_vectors(*args):
+      """Concatenates input vectors, statically if possible."""
+      args_ = [static_value(x) for x in args]
+      if any(vec is None for vec in args_):
+        return array_ops.concat(args, axis=0)
+      return [val for vec in args_ for val in vec]
+
+    def loss_fn(labels, logits, weights=None):
+      """Returns the loss of using `logits` to predict `labels`."""
+      d = self.distribution(logits)
+      labels_batch_shape = labels.shape.with_rank_at_least(1)[:-1]
+      labels_batch_shape = (
+          labels_batch_shape.as_list() if labels_batch_shape.is_fully_defined()
+          else array_ops.shape(labels)[:-1])
+      labels = array_ops.reshape(
+          labels,
+          shape=concat_vectors(labels_batch_shape, d.event_shape_tensor()))
+      return _compute_weighted_loss(
+          loss_unweighted=-d.log_prob(labels),
+          weight=weights)
+
+    def link_fn(logits):
+      """Returns the inverse link function at `logits`."""
+      # Note: What the API calls a "link function" is really the inverse-link
+      # function, i.e., the "mean".
+      d = self.distribution(logits)
+      return d.mean()
+
+    super(_DistributionRegressionHead, self).__init__(
+        label_dimension=label_dimension,
+        loss_fn=loss_fn,
+        link_fn=link_fn,
+        logits_dimension=logits_dimension,
+        label_name=label_name,
+        weight_column_name=weight_column_name,
+        enable_centered_bias=enable_centered_bias,
+        head_name=head_name)
+
+  @property
+  def distributions(self):
+    """Returns all distributions created by `DistributionRegressionHead`."""
+    return self._distributions
+
+  def distribution(self, logits, name=None):
+    """Retrieves a distribution instance, parameterized by `logits`.
+
+    Args:
+      logits: `float`-like `Tensor` representing the parameters of the
+        underlying distribution.
+      name: The Python `str` name to given to this op.
+        Default value: "distribution".
+
+    Returns:
+      distribution: `tf.Distribution` instance parameterized by `logits`.
+    """
+    with ops.name_scope(name, "distribution", [logits]):
+      d = self._distributions.get(logits, None)
+      if d is None:
+        d = self._make_distribution_fn(logits)
+        self._distributions[logits] = d
+      return d
diff --git a/tensorflow/contrib/distributions/python/ops/geometric.py b/tensorflow/contrib/distributions/python/ops/geometric.py
index 918200830c35536e110b9a2ce4fdf35e55caac18..8f190e48a7148d84082d73771cba4660a1a0d221 100644
--- a/tensorflow/contrib/distributions/python/ops/geometric.py
+++ b/tensorflow/contrib/distributions/python/ops/geometric.py
@@ -58,8 +58,8 @@ class Geometric(distribution.Distribution):
   def __init__(self,
                logits=None,
                probs=None,
-               validate_args=True,
-               allow_nan_stats=False,
+               validate_args=False,
+               allow_nan_stats=True,
                name="Geometric"):
     """Construct Geometric distributions.
 
@@ -143,32 +143,32 @@ class Geometric(distribution.Distribution):
     return math_ops.floor(
         math_ops.log(sampled) / math_ops.log1p(-self.probs))
 
-  def _cdf(self, counts):
+  def _cdf(self, x):
     if self.validate_args:
-      # We set `check_integer=False` since the CDF is defined on whole real
-      # line.
-      counts = math_ops.floor(
-          distribution_util.embed_check_nonnegative_discrete(
-              counts, check_integer=False))
-    counts *= array_ops.ones_like(self.probs)
+      x = distribution_util.embed_check_nonnegative_integer_form(x)
+    else:
+      # Whether or not x is integer-form, the following is well-defined.
+      # However, scipy takes the floor, so we do too.
+      x = math_ops.floor(x)
+    x *= array_ops.ones_like(self.probs)
     return array_ops.where(
-        counts < 0.,
-        array_ops.zeros_like(counts),
-        -math_ops.expm1(
-            (counts + 1) * math_ops.log1p(-self.probs)))
+        x < 0.,
+        array_ops.zeros_like(x),
+        -math_ops.expm1((1. + x) * math_ops.log1p(-self.probs)))
 
-  def _log_prob(self, counts):
+  def _log_prob(self, x):
     if self.validate_args:
-      counts = distribution_util.embed_check_nonnegative_discrete(
-          counts, check_integer=True)
-    counts *= array_ops.ones_like(self.probs)
-    probs = self.probs * array_ops.ones_like(counts)
-
+      x = distribution_util.embed_check_nonnegative_integer_form(x)
+    else:
+      # For consistency with cdf, we take the floor.
+      x = math_ops.floor(x)
+    x *= array_ops.ones_like(self.probs)
+    probs = self.probs * array_ops.ones_like(x)
     safe_domain = array_ops.where(
-        math_ops.equal(counts, 0.),
+        math_ops.equal(x, 0.),
         array_ops.zeros_like(probs),
         probs)
-    return counts * math_ops.log1p(-safe_domain) + math_ops.log(probs)
+    return x * math_ops.log1p(-safe_domain) + math_ops.log(probs)
 
   def _entropy(self):
     probs = self._probs
diff --git a/tensorflow/contrib/distributions/python/ops/moving_stats.py b/tensorflow/contrib/distributions/python/ops/moving_stats.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba816d55ef70f35e737a48af3f35836d7851bdb1
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/moving_stats.py
@@ -0,0 +1,166 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions for computing moving-average statistics."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+
+
+__all__ = [
+    "assign_exponential_moving_mean_variance",
+    "exponential_moving_mean_variance",
+]
+
+
+def assign_exponential_moving_mean_variance(
+    mean_var, variance_var, value, decay, name=None):
+  """Compute the exponential-moving mean, variance of a streaming value.
+
+  The exponential moving `mean_var`, `variance_var` updated by `value` is
+  given by the following recurrence relations,
+
+  ```python
+  variance_var = decay * (variance_var + (1-decay) * (value - mean_var)**2)
+  mean_var     = decay * mean_var + (1 - decay) * value
+  ```
+
+  Note: `mean_var` is updated *after* `variance_var`, i.e., `variance_var` uses
+  the lag-1 mean.
+
+  For derivation justification, see equation 143 of:
+    T. Finch, Feb 2009. "Incremental calculation of weighted mean and variance".
+    http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf
+
+  Args:
+    mean_var: `float`-like `Variable` representing the exponential-moving
+      mean. Same shape as `variance_var` and `value`.
+    variance_var: `float`-like `Variable` representing the exponential-moving
+      variance. Same shape as `mean_var` and `value`.
+    value: `float`-like `Tensor`. Same shape as `mean_var` and `variance_var`.
+    decay: A `float`-like `Tensor`. The moving average decay. Typically close to
+      `1.`, e.g., `0.999`.
+    name: Optional name of the returned operation.
+
+  Returns:
+    mean_var: `Variable` representing the `value`-updated exponential-moving
+      mean.
+    variance_var: `Variable` representing the `value`-updated exponential-moving
+      variance.
+
+  Raises:
+    TypeError: if `mean_var` is not a floating type.
+    TypeError: if `mean_var`, `variance_var`, `value`, `decay` have different
+      `base_dtype`.
+  """
+  with ops.name_scope(name, "assign_exponential_moving_mean_variance",
+                      [variance_var, mean_var, value, decay]):
+    with ops.colocate_with(variance_var):
+      with ops.colocate_with(mean_var):
+        base_dtype = mean_var.dtype.base_dtype
+        if not base_dtype.is_floating:
+          raise TypeError(
+              "mean_var.base_dtype({}) is not a floating-type.".format(
+                  base_dtype.name))
+        if base_dtype != variance_var.dtype.base_dtype:
+          raise TypeError(
+              "mean_var.base_dtype({}) != variance_var.base_dtype({})".format(
+                  base_dtype.name,
+                  variance_var.dtype.base_dtype.name))
+        value = ops.convert_to_tensor(value, dtype=base_dtype, name="value")
+        decay = ops.convert_to_tensor(decay, dtype=base_dtype, name="decay")
+        delta = value - mean_var
+        with ops.control_dependencies([delta]):
+          mean_var = state_ops.assign_add(
+              mean_var,
+              (1. - decay) * delta)
+          variance_var = state_ops.assign_sub(
+              variance_var,
+              (1. - decay) * (variance_var - decay * math_ops.square(delta)))
+        return mean_var, variance_var
+
+
+def exponential_moving_mean_variance(value, decay, collections=None, name=None):
+  """Compute the exponential-moving mean, variance of a streaming value.
+
+  The exponential moving `mean_var`, `variance_var` updated by `value` is
+  given by the following recurrence relations,
+
+  ```python
+  variance_var = decay * (variance_var + (1-decay) * (value - mean_var)**2)
+  mean_var     = decay * mean_var + (1 - decay) * value
+  ```
+
+  Note: `mean_var` is updated *after* `variance_var`, i.e., `variance_var` uses
+  the lag-`1` mean.
+
+  For derivation justification, see equation 143 of:
+    T. Finch, Feb 2009. "Incremental calculation of weighted mean and variance".
+    http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf
+
+  Unlike `assign_exponential_moving_mean_variance`, this function handles
+  variable creation.
+
+  Args:
+    value: `float`-like `Tensor`. Same shape as `mean_var` and `variance_var`.
+    decay: A `float`-like `Tensor`. The moving average decay. Typically close to
+      `1.`, e.g., `0.999`.
+    collections: Python list of graph-collections keys to which the the internal
+      variables `mean_var` and `variance_var` are added.
+      Default value is `[GraphKeys.GLOBAL_VARIABLES]`.
+    name: Optional name of the returned operation.
+
+  Returns:
+    mean_var: `Variable` representing the `value`-updated exponential-moving
+      mean.
+    variance_var: `Variable` representing the `value`-updated exponential-moving
+      variance.
+
+  Raises:
+    TypeError: if `value_var` is not a floating type.
+    TypeError: if `value`, `decay` have different `base_dtype`.
+  """
+  if collections is None:
+    collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+  with variable_scope.variable_scope(
+      name, "exponential_moving_mean_variance", [value, decay]):
+    value = ops.convert_to_tensor(value, name="value")
+    base_dtype = value.dtype.base_dtype
+    if not base_dtype.is_floating:
+      raise TypeError(
+          "value.base_dtype({}) is not a floating-type.".format(
+              base_dtype.name))
+    decay = ops.convert_to_tensor(decay, dtype=base_dtype, name="decay")
+    variance_var = variable_scope.get_variable(
+        "exponential_moving_variance",
+        shape=value.shape,
+        dtype=value.dtype,
+        initializer=init_ops.zeros_initializer(),
+        trainable=False,
+        collections=collections)
+    mean_var = variable_scope.get_variable(
+        "exponential_moving_mean",
+        shape=value.shape,
+        dtype=value.dtype,
+        initializer=init_ops.zeros_initializer(),
+        trainable=False,
+        collections=collections)
+    return assign_exponential_moving_mean_variance(
+        mean_var, variance_var, value, decay)
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
index b25250d3671ff68a8362c7f2eaa8f586900f27e2..50c7ba418be5b66127a3fde9f02a39b8f52ff841 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
@@ -19,8 +19,8 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib import linalg
-from tensorflow.contrib.distributions.python.ops import bijectors
 from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.distributions.python.ops.bijectors import AffineLinearOperator
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
@@ -189,7 +189,7 @@ class MultivariateNormalLinearOperator(
         distribution=normal.Normal(
             loc=array_ops.zeros([], dtype=scale.dtype),
             scale=array_ops.ones([], dtype=scale.dtype)),
-        bijector=bijectors.AffineLinearOperator(
+        bijector=AffineLinearOperator(
             shift=loc, scale=scale, validate_args=validate_args),
         batch_shape=batch_shape,
         event_shape=event_shape,
diff --git a/tensorflow/contrib/distributions/python/ops/negative_binomial.py b/tensorflow/contrib/distributions/python/ops/negative_binomial.py
index 8895fd8b465bf1f1e6f6b818cfbfc1aaa86a522e..c8c396f6f80cf7f3228a75d279fff91ae15813ad 100644
--- a/tensorflow/contrib/distributions/python/ops/negative_binomial.py
+++ b/tensorflow/contrib/distributions/python/ops/negative_binomial.py
@@ -148,39 +148,33 @@ class NegativeBinomial(distribution.Distribution):
         beta=math_ops.exp(-self.logits),
         dtype=self.dtype,
         seed=seed)
-
     return random_ops.random_poisson(
         rate,
         shape=[],
         dtype=self.dtype,
         seed=distribution_util.gen_new_seed(seed, "negative_binom"))
 
-  def _cdf(self, positive_counts):
+  def _cdf(self, x):
     if self.validate_args:
-      positive_counts = math_ops.floor(
-          distribution_util.embed_check_nonnegative_discrete(
-              positive_counts, check_integer=False))
-    return math_ops.betainc(
-        self.total_count, positive_counts + 1.,
-        math_ops.sigmoid(-self.logits))
-
-  def _log_prob(self, positive_counts):
-    return (self._log_unnormalized_prob(positive_counts)
-            - self._log_normalization(positive_counts))
-
-  def _log_unnormalized_prob(self, positive_counts):
+      x = distribution_util.embed_check_nonnegative_integer_form(x)
+    return math_ops.betainc(self.total_count, 1. + x,
+                            math_ops.sigmoid(-self.logits))
+
+  def _log_prob(self, x):
+    return (self._log_unnormalized_prob(x)
+            - self._log_normalization(x))
+
+  def _log_unnormalized_prob(self, x):
     if self.validate_args:
-      positive_counts = distribution_util.embed_check_nonnegative_discrete(
-          positive_counts, check_integer=True)
-    return self.total_count * math_ops.log1p(
-        -self.probs) + positive_counts * math_ops.log(self.probs)
+      x = distribution_util.embed_check_nonnegative_integer_form(x)
+    return (self.total_count * math_ops.log1p(-self.probs)
+            + x * math_ops.log(self.probs))
 
-  def _log_normalization(self, positive_counts):
+  def _log_normalization(self, x):
     if self.validate_args:
-      positive_counts = distribution_util.embed_check_nonnegative_discrete(
-          positive_counts, check_integer=True)
-    return (-math_ops.lgamma(self.total_count + positive_counts)
-            + math_ops.lgamma(positive_counts + 1.)
+      x = distribution_util.embed_check_nonnegative_integer_form(x)
+    return (-math_ops.lgamma(self.total_count + x)
+            + math_ops.lgamma(1. + x)
             + math_ops.lgamma(self.total_count))
 
   def _mean(self):
diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd.py b/tensorflow/contrib/distributions/python/ops/operator_pd.py
deleted file mode 100644
index 5471db21ed2ed3dbe350a51f7a960aa927c61124..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/operator_pd.py
+++ /dev/null
@@ -1,850 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Base class for symmetric positive definite operator."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-import numpy as np
-import six
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-
-
-@six.add_metaclass(abc.ABCMeta)
-class OperatorPDBase(object):
-  """Class representing a (batch) of positive definite matrices `A`.
-
-  This class provides access to functions of a (batch) symmetric positive
-  definite (PD) matrix, without the need to materialize them.  In other words,
-  this provides means to do "matrix free" computations.
-
-  ### Basics
-
-  For example, `my_operator.matmul(x)` computes the result of matrix
-  multiplication, and this class is free to do this computation with or without
-  ever materializing a matrix.
-
-  In practice, this operator represents a (batch) matrix `A` with shape
-  `[N1,...,Nn, k, k]` for some `n >= 0`.  The first `n` indices index a
-  batch member.  For every batch index `(i1,...,ib)`, `A[i1,...,in, : :]` is
-  a `k x k` matrix.  Again, this matrix `A` may not be materialized, but for
-  purposes of broadcasting this shape will be relevant.
-
-  Since `A` is (batch) positive definite, it has a (or several) square roots `S`
-  such that `A = SS^T`.
-
-  For example, if `MyOperator` inherits from `OperatorPDBase`, the user can do
-
-  ```python
-  operator = MyOperator(...)  # Initialize with some tensors.
-  operator.log_det()
-
-  # Compute the quadratic form x^T A^{-1} x for vector x.
-  x = ... # some shape [M1,...,Mm, N1,...,Nn, k] tensor
-  operator.inv_quadratic_form_on_vectors(x)
-
-  # Matrix multiplication by the square root, S w.
-  # If w is iid normal, S w has covariance A.
-  w = ... # some shape [N1,...,Nn, k, r] tensor, r >= 1
-  operator.sqrt_matmul(w)
-  ```
-
-  The above three methods, `log_det`, `inv_quadratic_form_on_vectors`, and
-  `sqrt_matmul` provide "all" that is necessary to use a covariance matrix
-  in a multi-variate normal distribution.  See the class `MVNOperatorPD`.
-
-  ### Details about shape requirements
-
-  The `Operator` classes operate on batch vectors and batch matrices with
-  compatible shapes.  `matrix` is a batch matrix with compatible shape if
-
-  ```
-  operator.shape = [N1,...,Nn] + [j, k]
-  matrix.shape =   [N1,...,Nn] + [k, r]
-  ```
-
-  This is the same requirement as `tf.matmul`.  `vec` is a batch vector with
-  compatible shape if
-
-  ```
-  operator.shape = [N1,...,Nn] + [j, k]
-  vec.shape =   [N1,...,Nn] + [k]
-  OR
-  vec.shape = [M1,...,Mm] + [N1,...,Nn] + [k]
-  ```
-
-  We are strict with the matrix shape requirements since we do not want to
-  require `Operator` broadcasting.  The `Operator` may be defined by large
-  tensors (thus broadcasting is expensive), or the `Operator` may be matrix
-  free, in which case there is no guarantee that the underlying implementation
-  will broadcast.
-
-  We are more flexible with vector shapes since extra leading dimensions can
-  be "flipped" to the end to change the vector to a compatible matrix.
-
-  """
-
-  @abc.abstractproperty
-  def name(self):
-    """String name identifying this `Operator`."""
-    return self._name
-
-  @abc.abstractproperty
-  def verify_pd(self):
-    """Whether to verify that this `Operator` is positive definite."""
-    # return self._verify_pd
-    pass
-
-  @abc.abstractproperty
-  def dtype(self):
-    """Data type of matrix elements of `A`."""
-    pass
-
-  def add_to_tensor(self, mat, name="add_to_tensor"):
-    """Add matrix represented by this operator to `mat`.  Equiv to `A + mat`.
-
-    Args:
-      mat:  `Tensor` with same `dtype` and shape broadcastable to `self`.
-      name:  A name to give this `Op`.
-
-    Returns:
-      A `Tensor` with broadcast shape and same `dtype` as `self`.
-    """
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=self.inputs + [mat]):
-        mat = ops.convert_to_tensor(mat, name="mat")
-        return self._add_to_tensor(mat)
-
-  def _add_to_tensor(self, mat):
-    # Re-implement in derived class if a more efficient method is available.
-    return self.to_dense() + mat
-
-  def _dispatch_based_on_batch(self, batch_method, singleton_method, **args):
-    """Helper to automatically call batch or singleton operation."""
-    if self.get_shape().ndims is not None:
-      is_batch = self.get_shape().ndims > 2
-      if is_batch:
-        return batch_method(**args)
-      else:
-        return singleton_method(**args)
-    else:
-      is_batch = self.rank() > 2
-      return control_flow_ops.cond(
-          is_batch,
-          lambda: batch_method(**args),
-          lambda: singleton_method(**args)
-      )
-
-  def inv_quadratic_form_on_vectors(
-      self, x, name="inv_quadratic_form_on_vectors"):
-    """Compute the quadratic form: `x^T A^{-1} x` where `x` is a batch vector.
-
-    `x` is a batch vector with compatible shape if
-
-    ```
-    self.shape = [N1,...,Nn] + [k, k]
-    x.shape = [M1,...,Mm] + [N1,...,Nn] + [k]
-    ```
-
-    Args:
-      x: `Tensor` with compatible batch vector shape and same `dtype` as self.
-      name:  A name scope to use for ops added by this method.
-
-    Returns:
-      `Tensor` with shape `[M1,...,Mm] + [N1,...,Nn]` and same `dtype`
-        as `self`.
-    """
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=[x] + self.inputs):
-        x = ops.convert_to_tensor(x, name="x")
-        return self._inv_quadratic_form_on_vectors(x)
-
-  def _inv_quadratic_form_on_vectors(self, x):
-    # Implement in derived class to enable self.inv_quadratic_form_on_vectors().
-    #
-    # To implement,
-    # Depending on which is more efficient, derived class should be a one-liner
-    # calling either
-    # return self._iqfov_via_sqrt_solve(x)
-    # OR
-    # return self._iqfov_via_solve(x)
-    # both of which are written in this base class.
-    raise NotImplementedError(
-        "inv_quadratic_form_on_vectors not implemented")
-
-  def _iqfov_via_sqrt_solve(self, x):
-    """Get the inverse quadratic form on vectors via a sqrt_solve."""
-    # x^{-1} A^{-1} x = || S^{-1}x ||^2,
-    # where S is a square root of A (A = SS^T).
-    # Steps:
-    # 1. Convert x to a matrix, flipping all extra dimensions in `x` to the
-    #    final dimension of x_matrix.
-    x_matrix = flip_vector_to_matrix(
-        x, self.batch_shape(), self.get_batch_shape())
-    # 2. Get soln_matrix = S^{-1} x_matrix
-    soln_matrix = self.sqrt_solve(x_matrix)
-    # 3. Reshape back to a vector.
-    soln = flip_matrix_to_vector(
-        soln_matrix, extract_batch_shape(x, 1), x.get_shape()[:-1])
-    # 4. L2 (batch) vector norm squared.
-    result = math_ops.reduce_sum(
-        math_ops.square(soln), reduction_indices=[-1])
-    result.set_shape(x.get_shape()[:-1])
-    return result
-
-  def _iqfov_via_solve(self, x):
-    """Get the inverse quadratic form on vectors via a solve."""
-    # x^{-1} A^{-1} x
-    # 1. Convert x to a matrix, flipping all extra dimensions in `x` to the
-    #    final dimension of x_matrix.
-    x_matrix = flip_vector_to_matrix(
-        x, self.batch_shape(), self.get_batch_shape())
-    # 2. Get x_whitened_matrix = A^{-1} x_matrix
-    soln_matrix = self.solve(x_matrix)
-    # 3. Reshape back to a vector.
-    soln = flip_matrix_to_vector(
-        soln_matrix, extract_batch_shape(x, 1), x.get_shape()[:-1])
-    # 4. Compute the dot product: x^T soln
-    result = math_ops.reduce_sum(x * soln, reduction_indices=[-1])
-    result.set_shape(x.get_shape()[:-1])
-    return result
-
-  def det(self, name="det"):
-    """Determinant for every batch member.
-
-    Args:
-      name:  A name scope to use for ops added by this method.
-
-    Returns:
-      Determinant for every batch member.
-    """
-    # Derived classes are encouraged to implement log_det() (since it is
-    # usually more stable), and then det() comes for free.
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=self.inputs):
-        return self._det()
-
-  def _det(self):
-    return math_ops.exp(self.log_det())
-
-  def log_det(self, name="log_det"):
-    """Log of the determinant for every batch member.
-
-    Args:
-      name:  A name scope to use for ops added by this method.
-
-    Returns:
-      Logarithm of determinant for every batch member.
-    """
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=self.inputs):
-        return self._dispatch_based_on_batch(self._batch_log_det, self._log_det)
-
-  def _batch_log_det(self):
-    # Implement in derived class to enable self.log_det(x).
-    raise NotImplementedError("Log determinant (log_det) not implemented.")
-
-  def _log_det(self):
-    # As implemented here, this just calls the batch version.  If a more
-    # efficient non-batch version is available, override in the derived class.
-    return self._batch_log_det()
-
-  def sqrt_log_abs_det(self, name="sqrt_log_det"):
-    """Log absolute value determinant of the sqrt `S` for every batch member.
-
-    In most cases, this will be the same as `sqrt_log_det`, but for certain
-    operators defined by a square root, this might be implemented slightly
-    differently.
-
-    Args:
-      name:  A name scope to use for ops added by this method.
-
-    Returns:
-      Logarithm of absolute value determinant of the square root `S` for
-      every batch member.
-    """
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=self.inputs):
-        return self._dispatch_based_on_batch(
-            self._batch_sqrt_log_abs_det, self._sqrt_log_abs_det)
-
-  def sqrt_log_det(self, name="sqrt_log_det"):
-    """Log of the determinant of the sqrt `S` for every batch member.
-
-    Args:
-      name:  A name scope to use for ops added by this method.
-
-    Returns:
-      Logarithm of determinant of the square root `S` for every batch member.
-    """
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=self.inputs):
-        return self._dispatch_based_on_batch(
-            self._batch_sqrt_log_det, self._sqrt_log_det)
-
-  def _batch_sqrt_log_det(self):
-    # Over-ride in derived class if it can be done more efficiently.
-    return 0.5 * self._log_det()
-
-  def _sqrt_log_det(self):
-    # As implemented here, this just calls the batch version.  If a more
-    # efficient non-batch version is available, override in the derived class.
-    return self._batch_sqrt_log_det()
-
-  def _batch_sqrt_log_abs_det(self):
-    # Over-ride in derived class if it can be done more efficiently.
-    return self._sqrt_log_det()
-
-  def _sqrt_log_abs_det(self):
-    # As implemented here, this just calls the batch version.  If a more
-    # efficient non-batch version is available, override in the derived class.
-    return self._batch_sqrt_log_abs_det()
-
-  @abc.abstractproperty
-  def inputs(self):
-    """List of tensors that were provided as initialization inputs."""
-    pass
-
-  @abc.abstractmethod
-  def get_shape(self):
-    """Static `TensorShape` of entire operator.
-
-    If this operator represents the batch matrix `A` with
-    `A.shape = [N1,...,Nn, k, k]`, then this returns
-    `TensorShape([N1,...,Nn, k, k])`
-
-    Returns:
-      `TensorShape`, statically determined, may be undefined.
-    """
-    pass
-
-  def get_batch_shape(self):
-    """`TensorShape` with batch shape.  Statically determined if possible.
-
-    If this operator represents the batch matrix `A` with
-    `A.shape = [N1,...,Nn, k, k]`, then this returns `TensorShape([N1,...,Nn])`
-
-    Returns:
-      `TensorShape`, statically determined, may be undefined.
-    """
-    # Derived classes get this "for free" once .get_shape() is implemented.
-    return self.get_shape()[:-2]
-
-  def get_vector_shape(self):
-    """`TensorShape` of vectors this operator will work with.
-
-    If this operator represents the batch matrix `A` with
-    `A.shape = [N1,...,Nn, k, k]`, then this returns
-    `TensorShape([N1,...,Nn, k])`
-
-    Returns:
-      `TensorShape`, statically determined, may be undefined.
-    """
-    # Derived classes get this "for free" once .get_shape() is implemented.
-    return self.get_shape()[:-1]
-
-  def shape(self, name="shape"):
-    """Equivalent to `tf.shape(A).`  Equal to `[N1,...,Nn, k, k]`, `n >= 0`.
-
-    Args:
-      name:  A name scope to use for ops added by this method.
-
-    Returns:
-      `int32` `Tensor`
-    """
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=self.inputs):
-        return self._shape()
-
-  @abc.abstractmethod
-  def _shape(self):
-    # Implement in derived class to enable .shape().
-    pass
-
-  def rank(self, name="rank"):
-    """Tensor rank.  Equivalent to `tf.rank(A)`.  Will equal `n + 2`.
-
-    If this operator represents the batch matrix `A` with
-    `A.shape = [N1,...,Nn, k, k]`, the `rank` is `n + 2`.
-
-    Args:
-      name:  A name scope to use for ops added by this method.
-
-    Returns:
-      `int32` `Tensor`
-    """
-    # Derived classes get this "for free" once .shape() is implemented.
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=self.inputs):
-        return array_ops.size(self.shape())
-
-  def batch_shape(self, name="batch_shape"):
-    """Shape of batches associated with this operator.
-
-    If this operator represents the batch matrix `A` with
-    `A.shape = [N1,...,Nn, k, k]`, the `batch_shape` is `[N1,...,Nn]`.
-
-    Args:
-      name:  A name scope to use for ops added by this method.
-
-    Returns:
-      `int32` `Tensor`
-    """
-    # Derived classes get this "for free" once .shape() is implemented.
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=self.inputs):
-        return array_ops.strided_slice(self.shape(), [0], [self.rank() - 2])
-
-  def vector_shape(self, name="vector_shape"):
-    """Shape of (batch) vectors that this (batch) matrix will multiply.
-
-    If this operator represents the batch matrix `A` with
-    `A.shape = [N1,...,Nn, k, k]`, the `vector_shape` is `[N1,...,Nn, k]`.
-
-    Args:
-      name:  A name scope to use for ops added by this method.
-
-    Returns:
-      `int32` `Tensor`
-    """
-    # Derived classes get this "for free" once .shape() is implemented.
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=self.inputs):
-        return array_ops.concat(
-            (self.batch_shape(), [self.vector_space_dimension()]), 0)
-
-  def vector_space_dimension(self, name="vector_space_dimension"):
-    """Dimension of vector space on which this acts.  The `k` in `R^k`.
-
-    If this operator represents the batch matrix `A` with
-    `A.shape = [N1,...,Nn, k, k]`, the `vector_space_dimension` is `k`.
-
-    Args:
-      name:  A name scope to use for ops added by this method.
-
-    Returns:
-      `int32` `Tensor`
-    """
-    # Derived classes get this "for free" once .shape() is implemented.
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=self.inputs):
-        return array_ops.gather(self.shape(), self.rank() - 1)
-
-  def matmul(self, x, transpose_x=False, name="matmul"):
-    """Left (batch) matmul `x` by this matrix:  `Ax`.
-
-    `x` is a batch matrix with compatible shape if
-
-    ```
-    self.shape = [N1,...,Nn] + [k, k]
-    x.shape = [N1,...,Nn] + [k, r]
-    ```
-
-    Args:
-      x: `Tensor` with shape `self.batch_shape + [k, r]` and same `dtype` as
-        this `Operator`.
-      transpose_x: If `True`, `x` is transposed before multiplication.
-      name:  A name to give this `Op`.
-
-    Returns:
-      A result equivalent to `tf.matmul(self.to_dense(), x)`.
-    """
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=[x] + self.inputs):
-        x = ops.convert_to_tensor(x, name="x")
-        return self._dispatch_based_on_batch(
-            self._batch_matmul, self._matmul, x=x, transpose_x=transpose_x)
-
-  def _batch_matmul(self, x, transpose_x=False):
-    # Implement in derived class to enable self.matmul(x).
-    raise NotImplementedError("This operator has no batch matmul Op.")
-
-  def _matmul(self, x, transpose_x=False):
-    # As implemented here, this just calls the batch version.  If a more
-    # efficient non-batch version is available, override in the derived class.
-    return self._batch_matmul(x, transpose_x=transpose_x)
-
-  def sqrt_matmul(self, x, transpose_x=False, name="sqrt_matmul"):
-    """Left (batch) matmul `x` by a sqrt of this matrix: `Sx` where `A = S S^T`.
-
-    `x` is a batch matrix with compatible shape if
-
-    ```
-    self.shape = [N1,...,Nn] + [k, k]
-    x.shape = [N1,...,Nn] + [k, r]
-    ```
-
-    Args:
-      x: `Tensor` with shape `self.batch_shape + [k, r]` and same `dtype` as
-        this `Operator`.
-      transpose_x: If `True`, `x` is transposed before multiplication.
-      name:  A name scope to use for ops added by this method.
-
-    Returns:
-      A result equivalent to `tf.matmul(self.sqrt_to_dense(), x)`.
-    """
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=[x] + self.inputs):
-        x = ops.convert_to_tensor(x, name="x")
-        return self._dispatch_based_on_batch(
-            self._batch_sqrt_matmul, self._sqrt_matmul, x=x,
-            transpose_x=transpose_x)
-
-  def _batch_sqrt_matmul(self, x, transpose_x=False):
-    # Implement in derived class to enable self.sqrt_matmul(x).
-    raise NotImplementedError("This operator has no batch_sqrt_matmul Op.")
-
-  def _sqrt_matmul(self, x, transpose_x=False):
-    # As implemented here, this just calls the batch version.  If a more
-    # efficient non-batch version is available, override in the derived class.
-    return self._batch_sqrt_matmul(x, transpose_x=transpose_x)
-
-  def solve(self, rhs, name="solve"):
-    """Solve `r` batch systems: `A X = rhs`.
-
-    `rhs` is a batch matrix with compatible shape if
-
-    ```python
-    self.shape = [N1,...,Nn] + [k, k]
-    rhs.shape = [N1,...,Nn] + [k, r]
-    ```
-
-    For every batch member, this is done in `O(r*k^2)` complexity using back
-    substitution.
-
-    ```python
-    # Solve one linear system (r = 1) for every member of the length 10 batch.
-    A = ... # shape 10 x 2 x 2
-    RHS = ... # shape 10 x 2 x 1
-    operator.shape # = 10 x 2 x 2
-    X = operator.squrt_solve(RHS)  # shape 10 x 2 x 1
-    # operator.squrt_matmul(X) ~ RHS
-    X[3, :, 0]  # Solution to the linear system A[3, :, :] x = RHS[3, :, 0]
-
-    # Solve five linear systems (r = 5) for every member of the length 10 batch.
-    operator.shape # = 10 x 2 x 2
-    RHS = ... # shape 10 x 2 x 5
-    ...
-    X[3, :, 2]  # Solution to the linear system A[3, :, :] x = RHS[3, :, 2]
-    ```
-
-    Args:
-      rhs: `Tensor` with same `dtype` as this operator and compatible shape,
-        `rhs.shape = self.shape[:-1] + [r]` for `r >= 1`.
-      name:  A name scope to use for ops added by this method.
-
-    Returns:
-      `Tensor` with same `dtype` and shape as `x`.
-    """
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=[rhs] + self.inputs):
-        rhs = ops.convert_to_tensor(rhs, name="rhs")
-        return self._dispatch_based_on_batch(
-            self._batch_solve, self._solve, rhs=rhs)
-
-  def _solve(self, rhs):
-    # As implemented here, this just calls the batch version.  If a more
-    # efficient non-batch version is available, override in the derived class.
-    return self._batch_solve(rhs)
-
-  def _batch_solve(self, rhs):
-    # Implement in derived class to enable self.solve().
-    raise NotImplementedError("batch_solve not implemented for this Operator.")
-
-  def sqrt_solve(self, rhs, name="sqrt_solve"):
-    """Solve `r` batch systems involving sqrt: `S X = rhs` where `A = SS^T`.
-
-    `rhs` is a batch matrix with compatible shape if
-
-    ```python
-    self.shape = [N1,...,Nn] + [k, k]
-    rhs.shape = [N1,...,Nn] + [k, r]
-    ```
-
-    For every batch member, this is done in `O(r*k^2)` complexity using back
-    substitution.
-
-    ```python
-    # Solve one linear system (r = 1) for every member of the length 10 batch.
-    A = ... # shape 10 x 2 x 2
-    RHS = ... # shape 10 x 2 x 1
-    operator.shape # = 10 x 2 x 2
-    X = operator.squrt_solve(RHS)  # shape 10 x 2 x 1
-    # operator.squrt_matmul(X) ~ RHS
-    X[3, :, 0]  # Solution to the linear system S[3, :, :] x = RHS[3, :, 0]
-
-    # Solve five linear systems (r = 5) for every member of the length 10 batch.
-    operator.shape # = 10 x 2 x 2
-    RHS = ... # shape 10 x 2 x 5
-    ...
-    X[3, :, 2]  # Solution to the linear system S[3, :, :] x = RHS[3, :, 2]
-    ```
-
-    Args:
-      rhs: `Tensor` with same `dtype` as this operator and compatible shape,
-        `rhs.shape = self.shape[:-1] + [r]` for `r >= 1`.
-      name:  A name scope to use for ops added by this method.
-
-    Returns:
-      `Tensor` with same `dtype` and shape as `x`.
-    """
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=[rhs] + self.inputs):
-        rhs = ops.convert_to_tensor(rhs, name="rhs")
-        return self._dispatch_based_on_batch(
-            self._batch_sqrt_solve, self._sqrt_solve, rhs=rhs)
-
-  def _sqrt_solve(self, rhs):
-    # As implemented here, this just calls the batch version.  If a more
-    # efficient non-batch version is available, override in the derived class.
-    return self._batch_sqrt_solve(rhs)
-
-  def _batch_sqrt_solve(self, rhs):
-    # Implement in derived class to enable self.sqrt_solve()
-    raise NotImplementedError(
-        "batch sqrt_solve not implemented for this Operator.")
-
-  def to_dense(self, name="to_dense"):
-    """Return a dense (batch) matrix representing this operator."""
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=self.inputs):
-        return self._to_dense()
-
-  def _to_dense(self):
-    # Implement in derived class to enable self.to_dense().
-    raise NotImplementedError("This operator has no dense representation.")
-
-  def sqrt_to_dense(self, name="sqrt_to_dense"):
-    """Return a dense (batch) matrix representing sqrt of this operator."""
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=self.inputs):
-        return self._sqrt_to_dense()
-
-  def _sqrt_to_dense(self):
-    # Implement in derived class to enable self.sqrt_to_dense().
-    raise NotImplementedError("This operator has no dense sqrt representation.")
-
-
-def flip_matrix_to_vector(mat, batch_shape, static_batch_shape):
-  """Flip dims to reshape batch matrix `mat` to a vector with given batch shape.
-
-  ```python
-  mat = tf.random_normal(2, 3, 4, 6)
-
-  # Flip the trailing dimension around to the front.
-  flip_matrix_to_vector(mat, [6, 2, 3], [6, 3, 2])  # Shape [6, 2, 3, 4]
-
-  # Flip the trailing dimension around then reshape batch indices to batch_shape
-  flip_matrix_to_vector(mat, [6, 3, 2], [6, 3, 2])  # Shape [6, 3, 2, 4]
-  flip_matrix_to_vector(mat, [2, 3, 2, 3], [2,3,2,3])  # Shape [2, 3, 2, 3, 4]
-  ```
-
-  Assume `mat.shape = matrix_batch_shape + [k, M]`.  The returned vector is
-  generated in two steps:
-
-  1. Flip the final dimension to the front, giving a shape
-    `[M] + matrix_batch_shape + [k]`.
-  2. Reshape the leading dimensions, giving final shape = `batch_shape + [k]`.
-
-  The reshape in step 2 will fail if the number of elements is not equal, i.e.
-  `M*prod(matrix_batch_shape) != prod(batch_shape)`.
-
-  See also:  flip_vector_to_matrix.
-
-  Args:
-    mat:  `Tensor` with rank `>= 2`.
-    batch_shape:  `int32` `Tensor` giving leading "batch" shape of result.
-    static_batch_shape:  `TensorShape` object giving batch shape of result.
-
-  Returns:
-    `Tensor` with same elements as `mat` but with shape `batch_shape + [k]`.
-  """
-  mat = ops.convert_to_tensor(mat, name="mat")
-  if (static_batch_shape.is_fully_defined()
-      and mat.get_shape().is_fully_defined()):
-    return _flip_matrix_to_vector_static(mat, static_batch_shape)
-  else:
-    return _flip_matrix_to_vector_dynamic(mat, batch_shape)
-
-
-def _flip_matrix_to_vector_static(mat, static_batch_shape):
-  """Flip matrix to vector with static shapes."""
-  mat_rank = mat.get_shape().ndims
-  k = mat.get_shape()[-2]
-  final_shape = static_batch_shape.concatenate(k)
-
-  # mat.shape = matrix_batch_shape + [k, M]
-  # Permutation corresponding to [M] + matrix_batch_shape + [k]
-  perm = [mat_rank - 1] + list(range(0, mat_rank - 1))
-  mat_with_end_at_beginning = array_ops.transpose(mat, perm=perm)
-  vector = array_ops.reshape(mat_with_end_at_beginning, final_shape)
-  return vector
-
-
-def _flip_matrix_to_vector_dynamic(mat, batch_shape):
-  """Flip matrix to vector with dynamic shapes."""
-  mat_rank = array_ops.rank(mat)
-  k = array_ops.gather(array_ops.shape(mat), mat_rank - 2)
-  final_shape = array_ops.concat((batch_shape, [k]), 0)
-
-  # mat.shape = matrix_batch_shape + [k, M]
-  # Permutation corresponding to [M] + matrix_batch_shape + [k]
-  perm = array_ops.concat(([mat_rank - 1], math_ops.range(0, mat_rank - 1)), 0)
-  mat_with_end_at_beginning = array_ops.transpose(mat, perm=perm)
-  vector = array_ops.reshape(mat_with_end_at_beginning, final_shape)
-  return vector
-
-
-def flip_vector_to_matrix(vec, batch_shape, static_batch_shape):
-  """Flip dims to reshape batch vector `x` to a matrix with given batch shape.
-
-  ```python
-  vec = tf.random_normal(2, 3, 4, 5)
-
-  # Flip the leading dimension to the end.
-  flip_vector_to_matrix(vec, [3, 4], [3, 4])  # Shape [3, 4, 5, 2]
-
-  # Flip nothing, just extend with a singleton dimension.
-  flip_vector_to_matrix(vec, [2, 3, 4], [2, 3, 4])  # Shape [2, 3, 4, 5, 1]
-
-  # Flip leading dimension to the end and reshape the batch indices to
-  # batch_shape.
-  flip_vector_to_matrix(vec, [4, 3], [4, 3])  # Shape [4, 3, 5, 2]
-  ```
-
-  Suppose `batch_shape` is length `n`.  Then...
-
-  Given `vec.shape = [M1,...,Mm] + [N1,...,Nn] + [k]`, for some
-  `m > 0` we reshape to a batch matrix with shape `batch_shape + [k, M]`
-  where `M = M1*...*Mm`.  This is done by "flipping" the leading dimensions to
-  the end and possibly reshaping `[N1,...,Nn]` to `batch_shape`.
-
-  In the case `vec.shape = [N1,...,Nn] + [k]`, we reshape to
-  `batch_shape + [k, 1]` by extending the tensor with a singleton dimension and
-  possibly reshaping `[N1,...,Nn]` to `batch_shape`.
-
-  See also: flip_matrix_to_vector.
-
-  Args:
-    vec:  `Tensor` with shape `[M1,...,Mm] + [N1,...,Nn] + [k]`
-    batch_shape:  `int32` `Tensor`.
-    static_batch_shape:  `TensorShape` with statically determined batch shape.
-
-  Returns:
-    `Tensor` with same `dtype` as `vec` and new shape.
-  """
-  vec = ops.convert_to_tensor(vec, name="vec")
-  if (
-      vec.get_shape().is_fully_defined()
-      and static_batch_shape.is_fully_defined()):
-    return _flip_vector_to_matrix_static(vec, static_batch_shape)
-  else:
-    return _flip_vector_to_matrix_dynamic(vec, batch_shape)
-
-
-def _flip_vector_to_matrix_dynamic(vec, batch_shape):
-  """flip_vector_to_matrix with dynamic shapes."""
-  # Shapes associated with batch_shape
-  batch_rank = array_ops.size(batch_shape)
-
-  # Shapes associated with vec.
-  vec = ops.convert_to_tensor(vec, name="vec")
-  vec_shape = array_ops.shape(vec)
-  vec_rank = array_ops.rank(vec)
-  vec_batch_rank = vec_rank - 1
-
-  m = vec_batch_rank - batch_rank
-  # vec_shape_left = [M1,...,Mm] or [].
-  vec_shape_left = array_ops.strided_slice(vec_shape, [0], [m])
-  # If vec_shape_left = [], then condensed_shape = [1] since reduce_prod([]) = 1
-  # If vec_shape_left = [M1,...,Mm], condensed_shape = [M1*...*Mm]
-  condensed_shape = [math_ops.reduce_prod(vec_shape_left)]
-  k = array_ops.gather(vec_shape, vec_rank - 1)
-  new_shape = array_ops.concat((batch_shape, [k], condensed_shape), 0)
-
-  def _flip_front_dims_to_back():
-    # Permutation corresponding to [N1,...,Nn] + [k, M1,...,Mm]
-    perm = array_ops.concat((math_ops.range(m, vec_rank), math_ops.range(0, m)),
-                            0)
-    return array_ops.transpose(vec, perm=perm)
-
-  x_flipped = control_flow_ops.cond(
-      math_ops.less(0, m),
-      _flip_front_dims_to_back,
-      lambda: array_ops.expand_dims(vec, -1))
-
-  return array_ops.reshape(x_flipped, new_shape)
-
-
-def _flip_vector_to_matrix_static(vec, batch_shape):
-  """flip_vector_to_matrix with static shapes."""
-  # Shapes associated with batch_shape
-  batch_rank = batch_shape.ndims
-
-  # Shapes associated with vec.
-  vec = ops.convert_to_tensor(vec, name="vec")
-  vec_shape = vec.get_shape()
-  vec_rank = len(vec_shape)
-  vec_batch_rank = vec_rank - 1
-
-  m = vec_batch_rank - batch_rank
-  # vec_shape_left = [M1,...,Mm] or [].
-  vec_shape_left = vec_shape[:m]
-  # If vec_shape_left = [], then condensed_shape = [1] since reduce_prod([]) = 1
-  # If vec_shape_left = [M1,...,Mm], condensed_shape = [M1*...*Mm]
-  condensed_shape = [np.prod(vec_shape_left)]
-  k = vec_shape[-1]
-  new_shape = batch_shape.concatenate(k).concatenate(condensed_shape)
-
-  def _flip_front_dims_to_back():
-    # Permutation corresponding to [N1,...,Nn] + [k, M1,...,Mm]
-    perm = array_ops.concat((math_ops.range(m, vec_rank), math_ops.range(0, m)),
-                            0)
-    return array_ops.transpose(vec, perm=perm)
-
-  if 0 < m:
-    x_flipped = _flip_front_dims_to_back()
-  else:
-    x_flipped = array_ops.expand_dims(vec, -1)
-
-  return array_ops.reshape(x_flipped, new_shape)
-
-
-def extract_batch_shape(x, num_event_dims, name="extract_batch_shape"):
-  """Extract the batch shape from `x`.
-
-  Assuming `x.shape = batch_shape + event_shape`, when `event_shape` has
-  `num_event_dims` dimensions.  This `Op` returns the batch shape `Tensor`.
-
-  Args:
-    x: `Tensor` with rank at least `num_event_dims`.  If rank is not high enough
-      this `Op` will fail.
-    num_event_dims:  `int32` scalar `Tensor`.  The number of trailing dimensions
-      in `x` to be considered as part of `event_shape`.
-    name:  A name to prepend to created `Ops`.
-
-  Returns:
-    batch_shape:  `1-D` `int32` `Tensor`
-  """
-  with ops.name_scope(name, values=[x]):
-    x = ops.convert_to_tensor(x, name="x")
-    return array_ops.strided_slice(
-        array_ops.shape(x), [0], [array_ops.rank(x) - num_event_dims])
diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd_cholesky.py b/tensorflow/contrib/distributions/python/ops/operator_pd_cholesky.py
deleted file mode 100644
index 09712f686003fdae796858aa46e532bb6937a6d9..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/operator_pd_cholesky.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Symmetric positive definite (PD) Operator defined by a Cholesky factor."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distributions.python.ops import operator_pd
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import math_ops
-
-
-class OperatorPDCholesky(operator_pd.OperatorPDBase):
-  """Class representing a (batch) of positive definite matrices `A`.
-
-  This class provides access to functions of a batch of symmetric positive
-  definite (PD) matrices `A` in `R^{k x k}` defined by Cholesky factor(s).
-  Determinants and solves are `O(k^2)`.
-
-  In practice, this operator represents a (batch) matrix `A` with shape
-  `[N1,...,Nn, k, k]` for some `n >= 0`.  The first `n` indices designate a
-  batch member.  For every batch member `(i1,...,in)`, `A[i1,...,ib, : :]` is
-  a `k x k` matrix.
-
-  Since `A` is (batch) positive definite, it has a (or several) square roots `S`
-  such that `A = SS^T`.
-
-  For example,
-
-  ```python
-  distributions = tf.contrib.distributions
-  chol = [[1.0, 0.0], [1.0, 2.0]]
-  operator = OperatorPDCholesky(chol)
-  operator.log_det()
-
-  # Compute the quadratic form x^T A^{-1} x for vector x.
-  x = [1.0, 2.0]
-  operator.inv_quadratic_form_on_vectors(x)
-
-  # Matrix multiplication by the square root, S w.
-  # If w is iid normal, S w has covariance A.
-  w = [[1.0], [2.0]]
-  operator.sqrt_matmul(w)
-  ```
-
-  The above three methods, `log_det`, `inv_quadratic_form_on_vectors`, and
-  `sqrt_matmul` provide "all" that is necessary to use a covariance matrix
-  in a multi-variate normal distribution.  See the class
-  `MultivariateNormalCholesky`.
-  """
-
-  def __init__(self, chol, verify_pd=True, name="OperatorPDCholesky"):
-    """Initialize an OperatorPDCholesky.
-
-    Args:
-      chol:  Shape `[N1,...,Nn, k, k]` tensor with `n >= 0`, `k >= 1`, and
-        positive diagonal elements.  The strict upper triangle of `chol` is
-        never used, and the user may set these elements to zero, or ignore them.
-      verify_pd: Whether to check that `chol` has positive diagonal (this is
-        equivalent to it being a Cholesky factor of a symmetric positive
-        definite matrix.  If `verify_pd` is `False`, correct behavior is not
-        guaranteed.
-      name:  A name to prepend to all ops created by this class.
-    """
-    self._verify_pd = verify_pd
-    self._name = name
-    with ops.name_scope(name):
-      with ops.name_scope("init", values=[chol]):
-        self._chol = self._check_chol(chol)
-
-  @property
-  def verify_pd(self):
-    """Whether to verify that this `Operator` is positive definite."""
-    return self._verify_pd
-
-  @property
-  def name(self):
-    return self._name
-
-  @property
-  def dtype(self):
-    return self._chol.dtype
-
-  def _batch_log_det(self):
-    """Log determinant of every batch member."""
-    # Note that array_ops.diag_part does not seem more efficient for non-batch,
-    # and would give a bad result for a batch matrix, so aways use
-    # matrix_diag_part.
-    diag = array_ops.matrix_diag_part(self._chol)
-    det = 2.0 * math_ops.reduce_sum(math_ops.log(math_ops.abs(diag)),
-                                    reduction_indices=[-1])
-    det.set_shape(self.get_shape()[:-2])
-    return det
-
-  @property
-  def inputs(self):
-    """List of tensors that were provided as initialization inputs."""
-    return [self._chol]
-
-  def _inv_quadratic_form_on_vectors(self, x):
-    # This Operator is defined in terms of the square root, which is easy to
-    # solve with (backsubstitution), so this is the preferred way to do
-    # inv_quadratic_form_on_vectors().
-    return self._iqfov_via_sqrt_solve(x)
-
-  def _matmul(self, x, transpose_x=False):
-    # tf.matmul is defined a * b.
-    chol = array_ops.matrix_band_part(self._chol, -1, 0)
-    chol_times_x = math_ops.matmul(
-        chol, x, transpose_a=True, transpose_b=transpose_x)
-    return math_ops.matmul(chol, chol_times_x)
-
-  def _batch_matmul(self, x, transpose_x=False):
-    # tf.matmul is defined x * y, so "y" is on the right, not "x".
-    chol = array_ops.matrix_band_part(self._chol, -1, 0)
-    chol_times_x = math_ops.matmul(
-        chol, x, adjoint_a=True, adjoint_b=transpose_x)
-    return math_ops.matmul(chol, chol_times_x)
-
-  def _sqrt_matmul(self, x, transpose_x=False):
-    chol = array_ops.matrix_band_part(self._chol, -1, 0)
-    # tf.matmul is defined a * b
-    return math_ops.matmul(chol, x, adjoint_b=transpose_x)
-
-  def _batch_sqrt_matmul(self, x, transpose_x=False):
-    chol = array_ops.matrix_band_part(self._chol, -1, 0)
-    # tf.batch_matmul is defined x * y, so "y" is on the right, not "x".
-    return math_ops.matmul(chol, x, adjoint_b=transpose_x)
-
-  def _batch_solve(self, rhs):
-    return linalg_ops.cholesky_solve(self._chol, rhs)
-
-  def _batch_sqrt_solve(self, rhs):
-    return linalg_ops.matrix_triangular_solve(self._chol, rhs, lower=True)
-
-  def get_shape(self):
-    """`TensorShape` giving static shape."""
-    return self._chol.get_shape()
-
-  def _shape(self):
-    return array_ops.shape(self._chol)
-
-  def _check_chol(self, chol):
-    """Verify that `chol` is proper."""
-    chol = ops.convert_to_tensor(chol, name="chol")
-    if not self.verify_pd:
-      return chol
-
-    shape = array_ops.shape(chol)
-    rank = array_ops.rank(chol)
-
-    is_matrix = check_ops.assert_rank_at_least(chol, 2)
-    is_square = check_ops.assert_equal(
-        array_ops.gather(shape, rank - 2), array_ops.gather(shape, rank - 1))
-
-    deps = [is_matrix, is_square]
-    diag = array_ops.matrix_diag_part(chol)
-    deps.append(check_ops.assert_positive(diag))
-
-    return control_flow_ops.with_dependencies(deps, chol)
-
-  def _sqrt_to_dense(self):
-    chol = array_ops.matrix_band_part(self._chol, -1, 0)
-    return array_ops.identity(chol)
-
-  def _to_dense(self):
-    chol = array_ops.matrix_band_part(self._chol, -1, 0)
-    return math_ops.matmul(chol, chol, adjoint_b=True)
diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd_diag.py b/tensorflow/contrib/distributions/python/ops/operator_pd_diag.py
deleted file mode 100644
index 9d7d2a362154396251209c6e9cf4048c808823a3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/operator_pd_diag.py
+++ /dev/null
@@ -1,306 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Positive definite Operator defined with diagonal covariance."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-import six
-
-from tensorflow.contrib.distributions.python.ops import operator_pd
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-
-__all__ = [
-    "OperatorPDDiag",
-    "OperatorPDSqrtDiag",
-]
-
-
-@six.add_metaclass(abc.ABCMeta)
-class OperatorPDDiagBase(operator_pd.OperatorPDBase):
-  """Base class for diagonal operators."""
-
-  def __init__(self, diag, verify_pd=True, name="OperatorPDDiagBase"):
-    self._verify_pd = verify_pd
-    self._name = name
-    with ops.name_scope(name):
-      with ops.name_scope("init", values=[diag]):
-        self._diag = self._check_diag(diag)
-
-  def _check_diag(self, diag):
-    """Verify that `diag` is positive."""
-    diag = ops.convert_to_tensor(diag, name="diag")
-    if not self.verify_pd:
-      return diag
-    deps = [check_ops.assert_positive(diag)]
-    return control_flow_ops.with_dependencies(deps, diag)
-
-  @property
-  def name(self):
-    """String name identifying this `Operator`."""
-    return self._name
-
-  @property
-  def verify_pd(self):
-    """Whether to verify that this `Operator` is positive definite."""
-    return self._verify_pd
-
-  @property
-  def dtype(self):
-    """Data type of matrix elements of `A`."""
-    return self._diag.dtype
-
-  @property
-  def inputs(self):
-    """Initialization arguments."""
-    return [self._diag]
-
-  def get_shape(self):
-    """`TensorShape` giving static shape."""
-    # If d_shape = [5, 3], we return [5, 3, 3].
-    d_shape = self._diag.get_shape()
-    return d_shape.concatenate(d_shape[-1:])
-
-  def _shape(self):
-    d_shape = array_ops.shape(self._diag)
-    k = array_ops.gather(d_shape, array_ops.size(d_shape) - 1)
-    return array_ops.concat((d_shape, [k]), 0)
-
-  @abc.abstractmethod
-  def _batch_log_det(self):
-    pass
-
-  @abc.abstractmethod
-  def _inv_quadratic_form_on_vectors(self, x):
-    pass
-
-  @abc.abstractmethod
-  def _batch_matmul(self, x, transpose_x=False):
-    pass
-
-  @abc.abstractmethod
-  def _batch_sqrt_matmul(self, x, transpose_x=False):
-    pass
-
-  @abc.abstractmethod
-  def _batch_solve(self, rhs):
-    pass
-
-  @abc.abstractmethod
-  def _batch_sqrt_solve(self, rhs):
-    pass
-
-  @abc.abstractmethod
-  def _to_dense(self):
-    pass
-
-  @abc.abstractmethod
-  def _sqrt_to_dense(self):
-    pass
-
-  @abc.abstractmethod
-  def _add_to_tensor(self, mat):
-    pass
-
-
-class OperatorPDDiag(OperatorPDDiagBase):
-  """Class representing a (batch) of positive definite matrices `A`.
-
-  This class provides access to functions of a batch of symmetric positive
-  definite (PD) matrices `A` in `R^{k x k}`.
-
-  In this case, `A` is diagonal and is defined by a provided tensor `diag`,
-  `A_{ii} = diag[i]`.
-
-  Determinants, solves, and storage are `O(k)`.
-
-  In practice, this operator represents a (batch) matrix `A` with shape
-  `[N1,...,Nn, k, k]` for some `n >= 0`.  The first `n` indices designate a
-  batch member.  For every batch member `(i1,...,ib)`, `A[i1,...,ib, : :]` is
-  a `k x k` matrix.
-
-  For example,
-
-  ```python
-  distributions = tf.contrib.distributions
-  diag = [1.0, 2.0]
-  operator = OperatorPDDiag(diag)
-  operator.det()  # ==> (1 * 2)
-
-  # Compute the quadratic form x^T A^{-1} x for vector x.
-  x = [1.0, 2.0]
-  operator.inv_quadratic_form_on_vectors(x)
-
-  # Matrix multiplication by the square root, S w, with A = S S^T.
-  # Recall A is diagonal, and so then is S, with  S_{ij} = sqrt(A_{ij}).
-  # If w is iid normal, S w has covariance A.
-  w = [[1.0],
-       [2.0]]
-  operator.sqrt_matmul(w)
-  ```
-
-  The above three methods, `log_det`, `inv_quadratic_form_on_vectors`, and
-  `sqrt_matmul` provide "all" that is necessary to use a covariance matrix
-  in a multi-variate normal distribution.  See the class
-  `MultivariateNormalDiag`.
-  """
-
-  def __init__(self, diag, verify_pd=True, name="OperatorPDDiag"):
-    """Initialize an OperatorPDDiag.
-
-    Args:
-      diag:  Shape `[N1,...,Nn, k]` positive tensor with `n >= 0`, `k >= 1`.
-      verify_pd: Whether to check `diag` is positive.
-      name:  A name to prepend to all ops created by this class.
-    """
-    super(OperatorPDDiag, self).__init__(
-        diag, verify_pd=verify_pd, name=name)
-
-  def _batch_log_det(self):
-    return math_ops.reduce_sum(
-        math_ops.log(math_ops.abs(self._diag)), reduction_indices=[-1])
-
-  def _inv_quadratic_form_on_vectors(self, x):
-    return self._iqfov_via_solve(x)
-
-  def _batch_matmul(self, x, transpose_x=False):
-    if transpose_x:
-      x = array_ops.matrix_transpose(x)
-    diag_mat = array_ops.expand_dims(self._diag, -1)
-    return diag_mat * x
-
-  def _batch_sqrt_matmul(self, x, transpose_x=False):
-    if transpose_x:
-      x = array_ops.matrix_transpose(x)
-    diag_mat = array_ops.expand_dims(self._diag, -1)
-    return math_ops.sqrt(diag_mat) * x
-
-  def _batch_solve(self, rhs):
-    diag_mat = array_ops.expand_dims(self._diag, -1)
-    return rhs / diag_mat
-
-  def _batch_sqrt_solve(self, rhs):
-    diag_mat = array_ops.expand_dims(self._diag, -1)
-    return rhs / math_ops.sqrt(diag_mat)
-
-  def _to_dense(self):
-    return array_ops.matrix_diag(self._diag)
-
-  def _sqrt_to_dense(self):
-    return array_ops.matrix_diag(math_ops.sqrt(self._diag))
-
-  def _add_to_tensor(self, mat):
-    mat_diag = array_ops.matrix_diag_part(mat)
-    new_diag = self._diag + mat_diag
-    return array_ops.matrix_set_diag(mat, new_diag)
-
-
-class OperatorPDSqrtDiag(OperatorPDDiagBase):
-  """Class representing a (batch) of positive definite matrices `A`.
-
-  This class provides access to functions of a batch of symmetric positive
-  definite (PD) matrices `A` in `R^{k x k}` defined by their square root,
-  `S`, such that `A = SS^T`.
-
-  In this case, `S` is diagonal and is defined by a provided tensor `diag`,
-  `S_{ii} = diag[i]`.  As a result, `A` is diagonal with `A_{ii} = diag[i]**2`.
-
-  Determinants, solves, and storage are `O(k)`.
-
-  In practice, this operator represents a (batch) matrix `A` with shape
-  `[N1,...,Nn, k, k]` for some `n >= 0`.  The first `n` indices designate a
-  batch member.  For every batch member `(i1,...,ib)`, `A[i1,...,ib, : :]` is
-  a `k x k` matrix.
-
-  For example,
-
-  ```python
-  distributions = tf.contrib.distributions
-  diag = [1.0, 2.0]
-  operator = OperatorPDSqrtDiag(diag)
-  operator.det()  # ==> (1 * 2)**2
-
-  # Compute the quadratic form x^T A^{-1} x for vector x.
-  x = [1.0, 2.0]
-  operator.inv_quadratic_form_on_vectors(x)
-
-  # Matrix multiplication by the square root, S w.
-  # If w is iid normal, S w has covariance A.
-  w = [[1.0], [2.0]]
-  operator.sqrt_matmul(w)
-  ```
-
-  The above three methods, `log_det`, `inv_quadratic_form_on_vectors`, and
-  `sqrt_matmul` provide "all" that is necessary to use a covariance matrix
-  in a multi-variate normal distribution.  See the class
-  `MultivariateNormalDiag`.
-  """
-
-  def __init__(self, diag, verify_pd=True, name="OperatorPDSqrtDiag"):
-    """Initialize an OperatorPDSqrtDiag.
-
-    Args:
-      diag:  Shape `[N1,...,Nn, k]` positive tensor with `n >= 0`, `k >= 1`.
-      verify_pd: Whether to check `diag` is positive.
-      name:  A name to prepend to all ops created by this class.
-    """
-    super(OperatorPDSqrtDiag, self).__init__(
-        diag, verify_pd=verify_pd, name=name)
-
-  def _batch_log_det(self):
-    return 2 * math_ops.reduce_sum(
-        math_ops.log(math_ops.abs(self._diag)),
-        reduction_indices=[-1])
-
-  def _inv_quadratic_form_on_vectors(self, x):
-    # This Operator is defined in terms of diagonal entries of the sqrt.
-    return self._iqfov_via_sqrt_solve(x)
-
-  def _batch_matmul(self, x, transpose_x=False):
-    if transpose_x:
-      x = array_ops.matrix_transpose(x)
-    diag_mat = array_ops.expand_dims(self._diag, -1)
-    return math_ops.square(diag_mat) * x
-
-  def _batch_sqrt_matmul(self, x, transpose_x=False):
-    if transpose_x:
-      x = array_ops.matrix_transpose(x)
-    diag_mat = array_ops.expand_dims(self._diag, -1)
-    return diag_mat * x
-
-  def _batch_solve(self, rhs):
-    diag_mat = array_ops.expand_dims(self._diag, -1)
-    return rhs / math_ops.square(diag_mat)
-
-  def _batch_sqrt_solve(self, rhs):
-    diag_mat = array_ops.expand_dims(self._diag, -1)
-    return rhs / diag_mat
-
-  def _to_dense(self):
-    return array_ops.matrix_diag(math_ops.square(self._diag))
-
-  def _sqrt_to_dense(self):
-    return array_ops.matrix_diag(self._diag)
-
-  def _add_to_tensor(self, mat):
-    mat_diag = array_ops.matrix_diag_part(mat)
-    new_diag = math_ops.square(self._diag) + mat_diag
-    return array_ops.matrix_set_diag(mat, new_diag)
diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd_full.py b/tensorflow/contrib/distributions/python/ops/operator_pd_full.py
deleted file mode 100644
index 3ca341bb830b0baafa75765abe7f695021bfed1e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/operator_pd_full.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Symmetric positive definite (PD) Operator defined by a full matrix."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distributions.python.ops import operator_pd_cholesky
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops.distributions import util as distribution_util
-
-
-__all__ = [
-    "OperatorPDFull",
-]
-
-
-class OperatorPDFull(operator_pd_cholesky.OperatorPDCholesky):
-  """Class representing a (batch) of positive definite matrices `A`.
-
-  This class provides access to functions of a batch of symmetric positive
-  definite (PD) matrices `A` in `R^{k x k}` defined by dense matrices.
-  Determinants and solves are `O(k^3)`.
-
-  In practice, this operator represents a (batch) matrix `A` with shape
-  `[N1,...,Nb, k, k]` for some `b >= 0`.  The first `b` indices designate a
-  batch member.  For every batch member `(n1,...,nb)`, `A[n1,...,nb, : :]` is
-  a `k x k` matrix.
-
-  Since `A` is (batch) positive definite, it has a (or several) square roots `S`
-  such that `A = SS^T`.
-
-  For example,
-
-  ```python
-  distributions = tf.contrib.distributions
-  matrix = [[1.0, 0.5], [1.0, 2.0]]
-  operator = OperatorPDFull(matrix)
-  operator.log_det()
-
-  # Compute the quadratic form x^T A^{-1} x for vector x.
-  x = [1.0, 2.0]
-  operator.inv_quadratic_form(x)
-
-  # Matrix multiplication by the square root, S w.
-  # If w is iid normal, S w has covariance A.
-  w = [[1.0], [2.0]]
-  operator.sqrt_matmul(w)
-  ```
-
-  The above three methods, `log_det`, `inv_quadratic_form`, and
-  `sqrt_matmul` provide "all" that is necessary to use a covariance matrix
-  in a multi-variate normal distribution.  See the class `MVNOperatorPD`.
-  """
-
-  def __init__(self, matrix, verify_pd=True, name="OperatorPDFull"):
-    """Initialize an OperatorPDFull.
-
-    Args:
-      matrix:  Shape `[N1,...,Nb, k, k]` tensor with `b >= 0`, `k >= 1`.  The
-        last two dimensions should be `k x k` symmetric positive definite
-        matrices.
-      verify_pd: Whether to check that `matrix` is symmetric positive definite.
-        If `verify_pd` is `False`, correct behavior is not guaranteed.
-      name:  A name to prepend to all ops created by this class.
-    """
-    with ops.name_scope(name):
-      with ops.name_scope("init", values=[matrix]):
-        matrix = ops.convert_to_tensor(matrix)
-        # Check symmetric here.  Positivity will be verified by checking the
-        # diagonal of the Cholesky factor inside the parent class.  The Cholesky
-        # factorization linalg_ops.cholesky() does not always fail for non PSD
-        # matrices, so don't rely on that.
-        if verify_pd:
-          matrix = distribution_util.assert_symmetric(matrix)
-        chol = linalg_ops.cholesky(matrix)
-        super(OperatorPDFull, self).__init__(chol, verify_pd=verify_pd)
diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd_identity.py b/tensorflow/contrib/distributions/python/ops/operator_pd_identity.py
deleted file mode 100644
index 4cee2997909dbd105fd045be9ea1238a343a2c27..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/operator_pd_identity.py
+++ /dev/null
@@ -1,244 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Identity operator in `R^k`."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-
-from tensorflow.contrib.distributions.python.ops import operator_pd
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-
-
-class OperatorPDIdentity(operator_pd.OperatorPDBase):
-  """Identity operator in `R^k`:  `Ax = x`.
-
-  This provides an efficient implementation of the identity as an `OperatorPD`.
-  Storage, solves, and matmul are all `O(1)`, independent of batch size.
-
-  In order to be a drop-in replacement for other operators, shape and dtype
-  of arguments (e.g. to `matmul`) are checked statically as though this operator
-  was an instantiated matrix.
-
-  Dynamic shape checks of arguments are not done since that could impede
-  performance.
-  """
-
-  def __init__(self, shape, dtype, scale=None,
-               verify_pd=True, name="OperatorPDIdentity"):
-    """Initialize an `OperatorPDIdentity`.
-
-    Args:
-      shape:  `int32` rank 1 `Tensor` of length at least 2, and with the last
-        two entries equal (since this is a square matrix).
-      dtype:  Data type of the matrix that this operator represents.
-      scale: floating point rank 0 `Tensor` representing a scalar to
-        multiply the identity matrix by. This will default to a scale of 1.
-        This will be converted to the dtype `dtype`.
-      verify_pd:  `Boolean`, if `True`, asserts are added to the initialization
-        args to ensure they define this operator as a square (batch) matrix.
-      name:  Name to prepend to `Ops`.
-    """
-
-    # Grab static shape if available now.
-    with ops.name_scope(name):
-      with ops.name_scope("init", values=[shape, scale]):
-        self._dtype = dtypes.as_dtype(dtype)
-        self._verify_pd = verify_pd
-        self._name = name
-
-        # Store the static shape (if possible) right now before adding the
-        # asserts, since the asserts prevent .constant_value from working.
-        shape = ops.convert_to_tensor(shape, name="shape")
-        self._get_shape = tensor_shape.TensorShape(
-            tensor_util.constant_value(shape))
-        self._shape_arg = self._check_shape(shape)
-        self._scale = self._check_scale(scale, self._dtype)
-
-  def _check_shape(self, shape):
-    """Check that the init arg `shape` defines a valid operator."""
-    shape = ops.convert_to_tensor(shape, name="shape")
-    if not self._verify_pd:
-      return shape
-
-    # Further checks are equivalent to verification that this is positive
-    # definite.  Why?  Because the further checks simply check that this is a
-    # square matrix, and combining the fact that this is square (and thus maps
-    # a vector space R^k onto itself), with the behavior of .matmul(), this must
-    # be the identity operator.
-    rank = array_ops.size(shape)
-    assert_matrix = check_ops.assert_less_equal(2, rank)
-    with ops.control_dependencies([assert_matrix]):
-      last_dim = array_ops.gather(shape, rank - 1)
-      second_to_last_dim = array_ops.gather(shape, rank - 2)
-      assert_square = check_ops.assert_equal(last_dim, second_to_last_dim)
-      return control_flow_ops.with_dependencies([assert_matrix, assert_square],
-                                                shape)
-
-  def _check_scale(self, scale, dtype):
-    """Check that the init arg `scale` defines a valid operator."""
-    if scale is None:
-      return constant_op.constant(1.0, dtype=dtype)
-
-    scale = ops.convert_to_tensor(scale, dtype=dtype, name="scale")
-
-    if not self._verify_pd:
-      return scale
-
-    # Further check that this is a rank 0, positive tensor.
-    scale = check_ops.assert_scalar(scale)
-    return control_flow_ops.with_dependencies(
-        [check_ops.assert_positive(scale)], scale)
-
-  def _check_x(self, x):
-    """Static check that the argument `x` is proper `shape`, `dtype`."""
-    # x is a typical argument e.g. to matmul or solve.  In both cases, x should
-    # have the same type/shape since this is a square matrix.  These checks are
-    # usually not needed since we usually have some tensor backing this
-    # distribution, and the calls to tf.matmul do a shape/type check.
-    #
-    # Static checks only for efficiency, the identity should be fast.
-    #
-    # Why check at all?  Because we want this operator to be swappable for a
-    # real Operator.
-    if self.dtype != x.dtype:
-      raise TypeError(
-          "Expected argument \"x\" to have same dtype as this operator (%s).  "
-          "Found: %s" % (self.dtype, x.dtype))
-
-    x_shape = x.get_shape()
-    self_shape = self.get_shape()
-    found_msg = (
-        "Found: operator.shape = %s,  x.shape = %s" % (self_shape, x_shape))
-    if x_shape.ndims is not None and self_shape.ndims is not None:
-      if x_shape.ndims != self_shape.ndims:
-        raise ValueError(
-            "Expected argument \"x\" to have same tensor rank as this "
-            "operator. " + found_msg)
-      if x_shape.is_fully_defined() and self_shape.is_fully_defined():
-        if x_shape[-2] != self_shape[-1]:
-          raise ValueError(
-              "Incompatible shapes for matrix-matrix operation.  " + found_msg)
-
-  @property
-  def name(self):
-    """String name identifying this `Operator`."""
-    return self._name
-
-  @property
-  def verify_pd(self):
-    """Whether to verify that this `Operator` is positive definite."""
-    return self._verify_pd
-
-  @property
-  def dtype(self):
-    """Data type of matrix elements of `A`."""
-    return self._dtype
-
-  def _add_to_tensor(self, mat):
-    # Add to a tensor in O(k) time!
-    mat_diag = array_ops.matrix_diag_part(mat)
-    new_diag = self._scale + mat_diag
-    return array_ops.matrix_set_diag(mat, new_diag)
-
-  def _inv_quadratic_form_on_vectors(self, x):
-    self._check_x(x)
-    return self._iqfov_via_sqrt_solve(x)
-
-  @property
-  def inputs(self):
-    """List of tensors that were provided as initialization inputs."""
-    return [self._shape_arg, self._scale]
-
-  def get_shape(self):
-    """Static `TensorShape` of entire operator.
-
-    If this operator represents the batch matrix `A` with
-    `A.shape = [N1,...,Nn, k, k]`, then this returns
-    `TensorShape([N1,...,Nn, k, k])`
-
-    Returns:
-      `TensorShape`, statically determined, may be undefined.
-    """
-    return self._get_shape
-
-  def _shape(self):
-    return self._shape_arg
-
-  def _det(self):
-    return math_ops.exp(self._batch_log_det())
-
-  def _batch_log_det(self):
-    rank = array_ops.size(self._shape_arg)
-    last_dim = math_ops.cast(
-        array_ops.gather(self._shape_arg, rank - 1), dtype=self.dtype)
-    log_det = (last_dim * math_ops.log(math_ops.abs(self._scale)) *
-               array_ops.ones(self.batch_shape(), dtype=self.dtype))
-    log_det.set_shape(self.get_batch_shape())
-    return log_det
-
-  def _batch_sqrt_log_det(self):
-    return 0.5 * self._batch_log_det()
-
-  def _batch_sqrt_log_abs_det(self):
-    rank = array_ops.size(self._shape_arg)
-    last_dim = math_ops.cast(
-        array_ops.gather(self._shape_arg, rank - 1), dtype=self.dtype)
-    sqrt_log_abs_det = 0.5 * last_dim * math_ops.log(
-        math_ops.abs(self._scale)) * array_ops.ones(
-            self.batch_shape(), dtype=self.dtype)
-    sqrt_log_abs_det.set_shape(self.get_batch_shape())
-    return sqrt_log_abs_det
-
-  def _batch_matmul(self, x, transpose_x=False):
-    if transpose_x:
-      x = array_ops.matrix_transpose(x)
-    self._check_x(x)
-    return self._scale * x
-
-  def _batch_sqrt_matmul(self, x, transpose_x=False):
-    if transpose_x:
-      x = array_ops.matrix_transpose(x)
-    self._check_x(x)
-    return math_ops.sqrt(self._scale) * x
-
-  def _batch_solve(self, rhs):
-    self._check_x(rhs)
-    return rhs / self._scale
-
-  def _batch_sqrt_solve(self, rhs):
-    self._check_x(rhs)
-    return rhs / math_ops.sqrt(self._scale)
-
-  def _to_dense(self):
-    diag = array_ops.ones(self.vector_shape(), dtype=self.dtype)
-    dense = array_ops.matrix_diag(diag)
-    dense.set_shape(self.get_shape())
-    return self._scale * dense
-
-  def _sqrt_to_dense(self):
-    diag = array_ops.ones(self.vector_shape(), dtype=self.dtype)
-    dense = array_ops.matrix_diag(diag)
-    dense.set_shape(self.get_shape())
-    return math_ops.sqrt(self._scale) * dense
diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd_vdvt_update.py b/tensorflow/contrib/distributions/python/ops/operator_pd_vdvt_update.py
deleted file mode 100644
index 9f494e9e3d58c11fbbf8c96eb5c085fda53ce4d9..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/operator_pd_vdvt_update.py
+++ /dev/null
@@ -1,480 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Operator defined: `A = SS^T` where `S = M + VDV^T`, for `OperatorPD` `M`."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distributions.python.ops import operator_pd
-from tensorflow.contrib.distributions.python.ops import operator_pd_diag
-from tensorflow.contrib.distributions.python.ops import operator_pd_identity
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import math_ops
-
-
-class OperatorPDSqrtVDVTUpdate(operator_pd.OperatorPDBase):
-  r"""Operator defined by `A=SS^T`, where `S = M + VDV^T` for `OperatorPD` `M`.
-
-  This provides efficient low-rank updates of arbitrary `OperatorPD`.
-
-  Some math:
-
-  Given positive definite operator representing positive definite (batch) matrix
-  `M` in `R^{k x k}`, diagonal matrix `D` in `R^{r x r}`, and low rank `V` in
-  `R^{k x r}` this class represents the batch matrix `A`, defined by its square
-  root `S` as follows:
-
-  ```
-  A = SS^T, where
-  S := M + VDV^T
-  ```
-
-  Defining an operator in terms of its square root means that
-  `A_{ij} = S_i S_j^T`, where `S_i` is the ith row of `S`.  The update
-  `VDV^T` has `ij` coordinate equal to `sum_k V_{ik} D_{kk} V_{jk}`.
-
-  Computational efficiency:
-
-  Defining `A` via its square root eliminates the need to compute the square
-  root.
-
-  Performance depends on the operator representing `M`, the batch size `B`, and
-  the width of the matrix being multiplied, or systems being solved `L`.
-
-  Since `V` is rank `r`, the update adds
-
-  * `O(B L k r)` to matmul, which requires a call to `M.matmul`.
-  * `O(B L r^3)` to solves, which require a call to `M.solve` as well as the
-    solution to a batch of rank `r` systems.
-  * `O(B r^3)` to determinants, which require a call to `M.solve` as well as the
-    solution to a batch of rank `r` systems.
-
-  The rank `r` solve and determinant are both done through a Cholesky
-  factorization, thus some computation is shared.
-
-  See
-    https://en.wikipedia.org/wiki/Woodbury_matrix_identity
-    https://en.wikipedia.org/wiki/Matrix_determinant_lemma
-  """
-
-  # Note that diag must be nonsingular to use Woodbury lemma, and must be
-  # positive def to use a Cholesky factorization, so we enforce that here.
-  def __init__(self,
-               operator,
-               v,
-               diag=None,
-               verify_pd=True,
-               verify_shapes=True,
-               name="OperatorPDSqrtVDVTUpdate"):
-    """Initialize an `OperatorPDSqrtVDVTUpdate`.
-
-    Args:
-      operator:  Subclass of `OperatorPDBase`.  Represents the (batch) positive
-        definite matrix `M` in `R^{k x k}`.
-      v: `Tensor` defining batch matrix of same `dtype` and `batch_shape` as
-        `operator`, and last two dimensions of shape `(k, r)`.
-      diag:  Optional `Tensor` defining batch vector of same `dtype` and
-        `batch_shape` as `operator`, and last dimension of size `r`.  If `None`,
-        the update becomes `VV^T` rather than `VDV^T`.
-      verify_pd:  `Boolean`.  If `True`, add asserts that `diag > 0`, which,
-        along with the positive definiteness of `operator`, is sufficient to
-        make the resulting operator positive definite.
-      verify_shapes:  `Boolean`.  If `True`, check that `operator`, `v`, and
-        `diag` have compatible shapes.
-      name:  A name to prepend to `Op` names.
-    """
-
-    if not isinstance(operator, operator_pd.OperatorPDBase):
-      raise TypeError("operator was not instance of OperatorPDBase.")
-
-    with ops.name_scope(name):
-      with ops.name_scope("init", values=operator.inputs + [v, diag]):
-        self._operator = operator
-        self._v = ops.convert_to_tensor(v, name="v")
-        self._verify_pd = verify_pd
-        self._verify_shapes = verify_shapes
-        self._name = name
-
-        # This operator will be PD so long as the diag is PSD, but Woodbury
-        # and determinant lemmas require diag to be PD.  So require diag PD
-        # whenever we ask to "verify_pd".
-        if diag is not None:
-          self._diag = ops.convert_to_tensor(diag, name="diag")
-          self._diag_operator = operator_pd_diag.OperatorPDDiag(
-              diag, verify_pd=self.verify_pd)
-          # No need to verify that the inverse of a PD is PD.
-          self._diag_inv_operator = operator_pd_diag.OperatorPDDiag(
-              1 / self._diag, verify_pd=False)
-        else:
-          self._diag = None
-          self._diag_operator = self._get_identity_operator(self._v)
-          self._diag_inv_operator = self._diag_operator
-
-        self._check_types(operator, self._v, self._diag)
-        # Always check static.
-        checked = self._check_shapes_static(operator, self._v, self._diag)
-        if not checked and self._verify_shapes:
-          self._v, self._diag = self._check_shapes_dynamic(
-              operator, self._v, self._diag)
-
-  def _get_identity_operator(self, v):
-    """Get an `OperatorPDIdentity` to play the role of `D` in `VDV^T`."""
-    with ops.name_scope("get_identity_operator", values=[v]):
-      if v.get_shape().is_fully_defined():
-        v_shape = v.get_shape().as_list()
-        v_batch_shape = v_shape[:-2]
-        r = v_shape[-1]
-        id_shape = v_batch_shape + [r, r]
-      else:
-        v_shape = array_ops.shape(v)
-        v_rank = array_ops.rank(v)
-        v_batch_shape = array_ops.strided_slice(v_shape, [0], [v_rank - 2])
-        r = array_ops.gather(v_shape, v_rank - 1)  # Last dim of v
-        id_shape = array_ops.concat((v_batch_shape, [r, r]), 0)
-      return operator_pd_identity.OperatorPDIdentity(
-          id_shape, v.dtype, verify_pd=self._verify_pd)
-
-  def _check_types(self, operator, v, diag):
-    def msg():
-      string = (
-          "dtypes must match:  Found operator.dtype = %s, v.dtype = %s"
-          % (operator.dtype, v.dtype))
-      return string
-
-    if operator.dtype != v.dtype:
-      raise TypeError(msg())
-    if diag is not None:
-      if diag.dtype != v.dtype:
-        raise TypeError("%s, diag.dtype = %s" % (msg(), diag.dtype))
-
-  def _check_shapes_static(self, operator, v, diag):
-    """True if they are compatible. Raise if not. False if could not check."""
-    def msg():
-      # Error message when shapes don't match.
-      string = "  Found: operator.shape = %s, v.shape = %s" % (s_op, s_v)
-      if diag is not None:
-        string += ", diag.shape = " % s_d
-      return string
-
-    s_op = operator.get_shape()
-    s_v = v.get_shape()
-
-    # If everything is not fully defined, return False because we couldn"t check
-    if not (s_op.is_fully_defined() and s_v.is_fully_defined()):
-      return False
-    if diag is not None:
-      s_d = diag.get_shape()
-      if not s_d.is_fully_defined():
-        return False
-
-    # Now perform the checks, raising ValueError if they fail.
-
-    # Check tensor rank.
-    if s_v.ndims != s_op.ndims:
-      raise ValueError("v should have same rank as operator" + msg())
-    if diag is not None:
-      if s_d.ndims != s_op.ndims - 1:
-        raise ValueError("diag should have rank 1 less than operator" + msg())
-
-    # Check batch shape
-    if s_v[:-2] != s_op[:-2]:
-      raise ValueError("v and operator should have same batch shape" + msg())
-    if diag is not None:
-      if s_d[:-1] != s_op[:-2]:
-        raise ValueError(
-            "diag and operator should have same batch shape" + msg())
-
-    # Check event shape
-    if s_v[-2] != s_op[-1]:
-      raise ValueError(
-          "v and operator should be compatible for matmul" + msg())
-    if diag is not None:
-      if s_d[-1] != s_v[-1]:
-        raise ValueError("diag and v should have same last dimension" + msg())
-
-    return True
-
-  def _check_shapes_dynamic(self, operator, v, diag):
-    """Return (v, diag) with Assert dependencies, which check shape."""
-    checks = []
-    with ops.name_scope("check_shapes", values=[operator, v, diag]):
-      s_v = array_ops.shape(v)
-      r_op = operator.rank()
-      r_v = array_ops.rank(v)
-      if diag is not None:
-        s_d = array_ops.shape(diag)
-        r_d = array_ops.rank(diag)
-
-      # Check tensor rank.
-      checks.append(check_ops.assert_rank(
-          v, r_op, message="v is not the same rank as operator."))
-      if diag is not None:
-        checks.append(check_ops.assert_rank(
-            diag, r_op - 1, message="diag is not the same rank as operator."))
-
-      # Check batch shape
-      checks.append(check_ops.assert_equal(
-          operator.batch_shape(), array_ops.strided_slice(s_v, [0], [r_v - 2]),
-          message="v does not have same batch shape as operator."))
-      if diag is not None:
-        checks.append(check_ops.assert_equal(
-            operator.batch_shape(), array_ops.strided_slice(
-                s_d, [0], [r_d - 1]),
-            message="diag does not have same batch shape as operator."))
-
-      # Check event shape
-      checks.append(check_ops.assert_equal(
-          operator.vector_space_dimension(), array_ops.gather(s_v, r_v - 2),
-          message="v does not have same event shape as operator."))
-      if diag is not None:
-        checks.append(check_ops.assert_equal(
-            array_ops.gather(s_v, r_v - 1), array_ops.gather(s_d, r_d - 1),
-            message="diag does not have same event shape as v."))
-
-      v = control_flow_ops.with_dependencies(checks, v)
-      if diag is not None:
-        diag = control_flow_ops.with_dependencies(checks, diag)
-      return v, diag
-
-  @property
-  def name(self):
-    """String name identifying this `Operator`."""
-    return self._name
-
-  @property
-  def verify_pd(self):
-    """Whether to verify that this `Operator` is positive definite."""
-    return self._verify_pd
-
-  @property
-  def dtype(self):
-    """Data type of matrix elements of `A`."""
-    return self._v.dtype
-
-  def _inv_quadratic_form_on_vectors(self, x):
-    return self._iqfov_via_sqrt_solve(x)
-
-  @property
-  def inputs(self):
-    """List of tensors that were provided as initialization inputs."""
-    return self._operator.inputs + self._diag_operator.inputs + [self._v]
-
-  def get_shape(self):
-    """Static `TensorShape` of entire operator.
-
-    If this operator represents the batch matrix `A` with
-    `A.shape = [N1,...,Nn, k, k]`, then this returns
-    `TensorShape([N1,...,Nn, k, k])`
-
-    Returns:
-      `TensorShape`, statically determined, may be undefined.
-    """
-    return self._operator.get_shape()
-
-  def _shape(self):
-    return self._operator.shape()
-
-  def _det(self):
-    return math_ops.exp(self.log_det())
-
-  def _batch_log_det(self):
-    return 2 * self._batch_sqrt_log_det()
-
-  def _log_det(self):
-    return 2 * self._sqrt_log_det()
-
-  def _sqrt_log_det(self):
-    # The matrix determinant lemma states:
-    # det(M + VDV^T) = det(D^{-1} + V^T M^{-1} V) * det(D) * det(M)
-    #                = det(C) * det(D) * det(M)
-    #
-    # Here we compute the Cholesky factor of "C", then pass the result on.
-    abs_diag_chol_c = math_ops.abs(array_ops.matrix_diag_part(
-        self._chol_capacitance(batch_mode=False)))
-    return self._sqrt_log_det_core(abs_diag_chol_c)
-
-  def _batch_sqrt_log_det(self):
-    # Here we compute the Cholesky factor of "C", then pass the result on.
-    abs_diag_chol_c = math_ops.abs(array_ops.matrix_diag_part(
-        self._chol_capacitance(batch_mode=True)))
-    return self._sqrt_log_det_core(abs_diag_chol_c)
-
-  def _chol_capacitance(self, batch_mode):
-    """Cholesky factorization of the capacitance term."""
-    # Cholesky factor for (D^{-1} + V^T M^{-1} V), which is sometimes
-    # known as the "capacitance" matrix.
-    # We can do a Cholesky decomposition, since a priori M is a
-    # positive-definite Hermitian matrix, which causes the "capacitance" to
-    # also be positive-definite Hermitian, and thus have a Cholesky
-    # decomposition.
-
-    # self._operator will use batch if need be. Automatically.  We cannot force
-    # that here.
-    # M^{-1} V
-    minv_v = self._operator.solve(self._v)
-    # V^T M^{-1} V
-    vt_minv_v = math_ops.matmul(self._v, minv_v, adjoint_a=True)
-
-    # D^{-1} + V^T M^{-1} V
-    capacitance = self._diag_inv_operator.add_to_tensor(vt_minv_v)
-    # Cholesky[D^{-1} + V^T M^{-1} V]
-    return linalg_ops.cholesky(capacitance)
-
-  def _sqrt_log_det_core(self, diag_chol_c):
-    """Finish computation of Sqrt[Log[Det]]."""
-    # Complete computation of ._log_det and ._batch_log_det, after the initial
-    # Cholesky factor has been taken with the appropriate batch/non-batch method
-
-    # det(M + VDV^T) = det(D^{-1} + V^T M^{-1} V) * det(D) * det(M)
-    #                = det(C) * det(D) * det(M)
-    # Multiply by 2 here because this is the log-det of the Cholesky factor of C
-    log_det_c = 2 * math_ops.reduce_sum(
-        math_ops.log(math_ops.abs(diag_chol_c)),
-        reduction_indices=[-1])
-    # Add together to get Log[det(M + VDV^T)], the Log-det of the updated square
-    # root.
-    log_det_updated_sqrt = (
-        log_det_c + self._diag_operator.log_det() + self._operator.log_det())
-    return log_det_updated_sqrt
-
-  def _batch_matmul(self, x, transpose_x=False):
-    # Since the square root is PD, it is symmetric, and so A = SS^T = SS.
-    s_x = self._batch_sqrt_matmul(x, transpose_x=transpose_x)
-    return self._batch_sqrt_matmul(s_x)
-
-  def _matmul(self, x, transpose_x=False):
-    # Since the square root is PD, it is symmetric, and so A = SS^T = SS.
-    s_x = self._sqrt_matmul(x, transpose_x=transpose_x)
-    return self._sqrt_matmul(s_x)
-
-  def _batch_sqrt_matmul(self, x, transpose_x=False):
-    v = self._v
-    m = self._operator
-    d = self._diag_operator
-    # The operators call the appropriate matmul/batch_matmul automatically.
-    # We cannot override.
-    # batch_matmul is defined as:  x * y, so adjoint_a and adjoint_b are the
-    # ways to transpose the left and right.
-    mx = m.matmul(x, transpose_x=transpose_x)
-    vt_x = math_ops.matmul(v, x, adjoint_a=True, adjoint_b=transpose_x)
-    d_vt_x = d.matmul(vt_x)
-    v_d_vt_x = math_ops.matmul(v, d_vt_x)
-
-    return mx + v_d_vt_x
-
-  def _sqrt_matmul(self, x, transpose_x=False):
-    v = self._v
-    m = self._operator
-    d = self._diag_operator
-    # The operators call the appropriate matmul/batch_matmul automatically.  We
-    # cannot override.
-    # matmul is defined as:  a * b, so transpose_a, transpose_b are used.
-    # transpose the left and right.
-    mx = m.matmul(x, transpose_x=transpose_x)
-    vt_x = math_ops.matmul(v, x, transpose_a=True, transpose_b=transpose_x)
-    d_vt_x = d.matmul(vt_x)
-    v_d_vt_x = math_ops.matmul(v, d_vt_x)
-
-    return mx + v_d_vt_x
-
-  def _solve(self, rhs):
-    # This operator represents A = SS^T, but S is symmetric, so A = SS,
-    # which means A^{-1} = S^{-1}S^{-2}
-    # S^{-1} rhs
-    sqrtinv_rhs = self._sqrt_solve(rhs)
-    return self._sqrt_solve(sqrtinv_rhs)
-
-  def _batch_solve(self, rhs):
-    sqrtinv_rhs = self._batch_sqrt_solve(rhs)
-    return self._batch_sqrt_solve(sqrtinv_rhs)
-
-  def _sqrt_solve(self, rhs):
-    # Recall the square root of this operator is M + VDV^T.
-    # The Woodbury formula gives:
-    # (M + VDV^T)^{-1}
-    # = M^{-1} - M^{-1} V (D^{-1} + V^T M^{-1} V)^{-1} V^T M^{-1}
-    # = M^{-1} - M^{-1} V C^{-1} V^T M^{-1}
-    # where C is the capacitance matrix.
-    # TODO(jvdillon) Determine if recursively applying rank-1 updates is more
-    # efficient.  May not be possible because a general n x n matrix can be
-    # represeneted as n rank-1 updates, and solving with this matrix is always
-    # done in O(n^3) time.
-    m = self._operator
-    v = self._v
-    cchol = self._chol_capacitance(batch_mode=False)
-
-    # The operators will use batch/singleton mode automatically.  We don't
-    # override.
-    # M^{-1} rhs
-    minv_rhs = m.solve(rhs)
-    # V^T M^{-1} rhs
-    vt_minv_rhs = math_ops.matmul(v, minv_rhs, transpose_a=True)
-    # C^{-1} V^T M^{-1} rhs
-    cinv_vt_minv_rhs = linalg_ops.cholesky_solve(cchol, vt_minv_rhs)
-    # V C^{-1} V^T M^{-1} rhs
-    v_cinv_vt_minv_rhs = math_ops.matmul(v, cinv_vt_minv_rhs)
-    # M^{-1} V C^{-1} V^T M^{-1} rhs
-    minv_v_cinv_vt_minv_rhs = m.solve(v_cinv_vt_minv_rhs)
-
-    # M^{-1} - M^{-1} V C^{-1} V^T M^{-1}
-    return minv_rhs - minv_v_cinv_vt_minv_rhs
-
-  def _batch_sqrt_solve(self, rhs):
-    # Recall the square root of this operator is M + VDV^T.
-    # The Woodbury formula gives:
-    # (M + VDV^T)^{-1}
-    # = M^{-1} - M^{-1} V (D^{-1} + V^T M^{-1} V)^{-1} V^T M^{-1}
-    # = M^{-1} - M^{-1} V C^{-1} V^T M^{-1}
-    # where C is the capacitance matrix.
-    m = self._operator
-    v = self._v
-    cchol = self._chol_capacitance(batch_mode=True)
-
-    # The operators will use batch/singleton mode automatically.  We don't
-    # override.
-    # M^{-1} rhs
-    minv_rhs = m.solve(rhs)
-    # V^T M^{-1} rhs
-    vt_minv_rhs = math_ops.matmul(v, minv_rhs, adjoint_a=True)
-    # C^{-1} V^T M^{-1} rhs
-    cinv_vt_minv_rhs = linalg_ops.cholesky_solve(cchol, vt_minv_rhs)
-    # V C^{-1} V^T M^{-1} rhs
-    v_cinv_vt_minv_rhs = math_ops.matmul(v, cinv_vt_minv_rhs)
-    # M^{-1} V C^{-1} V^T M^{-1} rhs
-    minv_v_cinv_vt_minv_rhs = m.solve(v_cinv_vt_minv_rhs)
-
-    # M^{-1} - M^{-1} V C^{-1} V^T M^{-1}
-    return minv_rhs - minv_v_cinv_vt_minv_rhs
-
-  def _to_dense(self):
-    sqrt = self.sqrt_to_dense()
-    return math_ops.matmul(sqrt, sqrt, adjoint_b=True)
-
-  def _sqrt_to_dense(self):
-    v = self._v
-    d = self._diag_operator
-    m = self._operator
-
-    d_vt = d.matmul(v, transpose_x=True)
-    # Batch op won't be efficient for singletons.  Currently we don't break
-    # to_dense into batch/singleton methods.
-    v_d_vt = math_ops.matmul(v, d_vt)
-    m_plus_v_d_vt = m.to_dense() + v_d_vt
-    return m_plus_v_d_vt
diff --git a/tensorflow/contrib/distributions/python/ops/operator_test_util.py b/tensorflow/contrib/distributions/python/ops/operator_test_util.py
deleted file mode 100644
index bc78340e5a4e24577d42fccd306c64f2fce19d72..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/operator_test_util.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utilities for testing `OperatorPDBase` and related classes."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-import numpy as np
-import six
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-
-@six.add_metaclass(abc.ABCMeta)  # pylint: disable=no-init
-class OperatorPDDerivedClassTest(test.TestCase):
-  """Tests for derived classes.
-
-  Subclasses should implement every abstractmethod, and this will enable all
-  test methods to work.
-  """
-
-  def setUp(self):
-    self._rng = np.random.RandomState(42)
-
-  def _compare_results(self, expected, actual, static_shapes=True, atol=1e-5):
-    """Compare expected value (array) to the actual value (Tensor)."""
-    if static_shapes:
-      self.assertEqual(expected.shape, actual.get_shape())
-    self.assertAllClose(expected, actual.eval(), atol=atol)
-
-  @abc.abstractmethod
-  def _build_operator_and_mat(self, batch_shape, k, dtype=np.float64):
-    """Build a batch matrix and an Operator that should have similar behavior.
-
-    Every operator represents a (batch) matrix.  This method returns both
-    together, and is used e.g. by tests.
-
-    Args:
-      batch_shape:  List-like of Python integers giving batch shape of operator.
-      k: Python integer, the event size.
-      dtype:  Numpy dtype.  Data type of returned array/operator.
-
-    Returns:
-      operator:  `OperatorPDBase` subclass.
-      mat:  numpy array representing a (batch) matrix.
-    """
-    # Create a matrix as a numpy array.  Shape = batch_shape + [k, k].
-    # Create an OperatorPDDiag that should have the same behavior as the matrix.
-    # All arguments are convertable to numpy arrays.
-    #
-    batch_shape = list(batch_shape)
-    mat_shape = batch_shape + [k, k]
-    # return operator, mat
-    raise NotImplementedError("Not implemented yet.")
-
-  def testToDense(self):
-    with self.test_session():
-      for batch_shape in [(), (
-          2,
-          3,)]:
-        for k in [1, 4]:
-          for dtype in [np.float32, np.float64]:
-            operator, mat = self._build_operator_and_mat(
-                batch_shape, k, dtype=dtype)
-            self._compare_results(expected=mat, actual=operator.to_dense())
-
-  def testSqrtToDense(self):
-    with self.test_session():
-      for batch_shape in [(), (
-          2,
-          3,)]:
-        for k in [1, 4]:
-          operator, mat = self._build_operator_and_mat(batch_shape, k)
-          sqrt = operator.sqrt_to_dense()
-          self.assertEqual(mat.shape, sqrt.get_shape())
-          # Square roots are not unique, but SS^T should equal mat.  In this
-          # case however, we should have S = S^T.
-          self._compare_results(
-              expected=mat, actual=math_ops.matmul(sqrt, sqrt))
-
-  def testDeterminants(self):
-    with self.test_session():
-      for batch_shape in [(), (
-          2,
-          3,)]:
-        for k in [1, 4]:
-          operator, mat = self._build_operator_and_mat(batch_shape, k)
-          expected_det = linalg_ops.matrix_determinant(mat).eval()
-
-          self._compare_results(expected_det, operator.det())
-          self._compare_results(np.log(expected_det), operator.log_det())
-
-  def testMatmul(self):
-    with self.test_session():
-      for batch_shape in [(), (
-          2,
-          3,)]:
-        for k in [1, 4]:
-          operator, mat = self._build_operator_and_mat(batch_shape, k)
-
-          # Work with 5 simultaneous systems.  5 is arbitrary.
-          x = self._rng.randn(*(batch_shape + (k, 5)))
-
-          self._compare_results(
-              expected=math_ops.matmul(mat, x).eval(),
-              actual=operator.matmul(x))
-
-  def testSqrtMatmul(self):
-    # Square roots are not unique, but we should have SS^T x = Ax, and in our
-    # case, we should have S = S^T, so SSx = Ax.
-    with self.test_session():
-      for batch_shape in [(), (
-          2,
-          3,)]:
-        for k in [1, 4]:
-          operator, mat = self._build_operator_and_mat(batch_shape, k)
-
-          # Work with 5 simultaneous systems.  5 is arbitrary.
-          x = self._rng.randn(*(batch_shape + (k, 5)))
-
-          self._compare_results(
-              expected=math_ops.matmul(mat, x).eval(),
-              actual=operator.sqrt_matmul(operator.sqrt_matmul(x)))
-
-  def testSolve(self):
-    with self.test_session():
-      for batch_shape in [(), (
-          2,
-          3,)]:
-        for k in [1, 4]:
-          operator, mat = self._build_operator_and_mat(batch_shape, k)
-
-          # Work with 5 simultaneous systems.  5 is arbitrary.
-          x = self._rng.randn(*(batch_shape + (k, 5)))
-
-          self._compare_results(
-              expected=linalg_ops.matrix_solve(mat, x).eval(),
-              actual=operator.solve(x))
-
-  def testSqrtSolve(self):
-    # Square roots are not unique, but we should still have
-    # S^{-T} S^{-1} x = A^{-1} x.
-    # In our case, we should have S = S^T, so then S^{-1} S^{-1} x = A^{-1} x.
-    with self.test_session():
-      for batch_shape in [(), (
-          2,
-          3,)]:
-        for k in [1, 4]:
-          operator, mat = self._build_operator_and_mat(batch_shape, k)
-
-          # Work with 5 simultaneous systems.  5 is arbitrary.
-          x = self._rng.randn(*(batch_shape + (k, 5)))
-
-          self._compare_results(
-              expected=linalg_ops.matrix_solve(mat, x).eval(),
-              actual=operator.sqrt_solve(operator.sqrt_solve(x)))
-
-  def testAddToTensor(self):
-    with self.test_session():
-      for batch_shape in [(), (
-          2,
-          3,)]:
-        for k in [1, 4]:
-          operator, mat = self._build_operator_and_mat(batch_shape, k)
-          tensor = array_ops.ones_like(mat)
-
-          self._compare_results(
-              expected=(mat + tensor).eval(),
-              actual=operator.add_to_tensor(tensor))
diff --git a/tensorflow/contrib/distributions/python/ops/poisson.py b/tensorflow/contrib/distributions/python/ops/poisson.py
index d9929183c1a85f2ed16f289c795c4c7bf46caec0..59a98e5682d5b3c053a18a19a1da0d2f320f21a6 100644
--- a/tensorflow/contrib/distributions/python/ops/poisson.py
+++ b/tensorflow/contrib/distributions/python/ops/poisson.py
@@ -114,10 +114,6 @@ class Poisson(distribution.Distribution):
   def _log_prob(self, x):
     return self._log_unnormalized_prob(x) - self._log_normalization()
 
-  @distribution_util.AppendDocstring(_poisson_sample_note)
-  def _prob(self, x):
-    return math_ops.exp(self._log_prob(x))
-
   @distribution_util.AppendDocstring(_poisson_sample_note)
   def _log_cdf(self, x):
     return math_ops.log(self.cdf(x))
@@ -125,20 +121,23 @@ class Poisson(distribution.Distribution):
   @distribution_util.AppendDocstring(_poisson_sample_note)
   def _cdf(self, x):
     if self.validate_args:
-      # We set `check_integer=False` since the CDF is defined on whole real
-      # line.
-      x = distribution_util.embed_check_nonnegative_discrete(
-          x, check_integer=False)
-    return math_ops.igammac(math_ops.floor(x + 1), self.rate)
+      x = distribution_util.embed_check_nonnegative_integer_form(x)
+    else:
+      # Whether or not x is integer-form, the following is well-defined.
+      # However, scipy takes the floor, so we do too.
+      x = math_ops.floor(x)
+    return math_ops.igammac(1. + x, self.rate)
 
   def _log_normalization(self):
     return self.rate
 
   def _log_unnormalized_prob(self, x):
     if self.validate_args:
-      x = distribution_util.embed_check_nonnegative_discrete(
-          x, check_integer=True)
-    return x * math_ops.log(self.rate) - math_ops.lgamma(x + 1)
+      x = distribution_util.embed_check_nonnegative_integer_form(x)
+    else:
+      # For consistency with cdf, we take the floor.
+      x = math_ops.floor(x)
+    return x * math_ops.log(self.rate) - math_ops.lgamma(1. + x)
 
   def _mean(self):
     return array_ops.identity(self.rate)
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
index 5b57a95c55eca7f3d6301c1e87a6cf52f040ab26..b525809015537ac8c7ee701c100fba6541fe2e92 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
@@ -52,7 +52,7 @@ class RelaxedBernoulli(transformed_distribution.TransformedDistribution):
   the RelaxedBernoulli can suffer from underflow issues. In many case loss
   functions such as these are invariant under invertible transformations of
   the random variables. The KL divergence, found in the variational autoencoder
-  loss, is an example. Because RelaxedBernoullis are sampled by by a Logistic
+  loss, is an example. Because RelaxedBernoullis are sampled by a Logistic
   random variable followed by a `tf.sigmoid` op, one solution is to treat
   the Logistic as the random variable and `tf.sigmoid` as downstream. The
   KL divergences of two Logistics, which are always followed by a `tf.sigmoid`
diff --git a/tensorflow/contrib/distributions/python/ops/sample_stats.py b/tensorflow/contrib/distributions/python/ops/sample_stats.py
index 26cf922d0afe0c8c07da1e3e8da43e1d5cea25c4..2a4b92c72900f79785e7e34b77179d3decbace5b 100644
--- a/tensorflow/contrib/distributions/python/ops/sample_stats.py
+++ b/tensorflow/contrib/distributions/python/ops/sample_stats.py
@@ -47,7 +47,7 @@ def percentile(x,
   """Compute the `q`-th percentile of `x`.
 
   Given a vector `x`, the `q`-th percentile of `x` is the value `q / 100` of the
-  way from the minimum to the maximum in in a sorted copy of `x`.
+  way from the minimum to the maximum in a sorted copy of `x`.
 
   The values and distances of the two nearest neighbors as well as the
   `interpolation` parameter will determine the percentile if the normalized
diff --git a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
new file mode 100644
index 0000000000000000000000000000000000000000..282683ef39d96793ebf875805d8de0c8c8e18843
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
@@ -0,0 +1,836 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The VectorDiffeomixture distribution class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.distributions.python.ops.bijectors.affine_linear_operator import AffineLinearOperator
+from tensorflow.contrib.linalg.python.ops import linear_operator_addition as linop_add_lib
+from tensorflow.contrib.linalg.python.ops import linear_operator_composition as linop_composition_lib
+from tensorflow.contrib.linalg.python.ops import linear_operator_diag as linop_diag_lib
+from tensorflow.contrib.linalg.python.ops import linear_operator_full_matrix as linop_full_lib
+from tensorflow.contrib.linalg.python.ops import linear_operator_identity as linop_identity_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import categorical as categorical_lib
+from tensorflow.python.ops.distributions import distribution as distribution_lib
+
+
+__all__ = [
+    "VectorDiffeomixture",
+]
+
+
+class VectorDiffeomixture(distribution_lib.Distribution):
+  """VectorDiffeomixture distribution.
+
+  The VectorDiffeomixture is an approximation to a [compound distribution](
+  https://en.wikipedia.org/wiki/Compound_probability_distribution), i.e.,
+
+  ```none
+  p(x) = int_{X} q(x | v) p(v) dv
+       = lim_{Q->infty} sum{ prob[i] q(x | loc=sum_k^K lambda[k;i] loc[k],
+                                            scale=sum_k^K lambda[k;i] scale[k])
+                            : i=0, ..., Q-1 }
+  ```
+
+  where `q(x | v)` is a vector version of the `distribution` argument and `p(v)`
+  is a SoftmaxNormal parameterized by `mix_loc` and `mix_scale`. The
+  vector-ization of `distribution` entails an affine transformation of iid
+  samples from `distribution`.  The `prob` term is from quadrature and
+  `lambda[k] = sigmoid(mix_loc[k] + sqrt(2) mix_scale[k] grid[k])` where the
+  `grid` points correspond to the `prob`s.
+
+  In the non-approximation case, a draw from the mixture distribution (the
+  "prior") represents the convex weights for different affine transformations.
+  I.e., draw a mixing vector `v` (from the `K-1`-simplex) and let the final
+  sample be: `y = (sum_k^K v[k] scale[k]) @ x + (sum_k^K v[k] loc[k])` where `@`
+  denotes matrix multiplication.  However, the non-approximate distribution does
+  not have an analytical probability density function (pdf). Therefore the
+  `VectorDiffeomixture` class implements an approximation based on
+  [Gauss-Hermite quadrature](
+  https://en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature). I.e., in
+  Note: although the `VectorDiffeomixture` is approximately the
+  `SoftmaxNormal-Distribution` compound distribution, it is itself a valid
+  distribution. It possesses a `sample`, `log_prob`, `mean`, `covariance` which
+  are all mutually consistent.
+
+  #### Intended Use
+
+  This distribution is noteworthy because it implements a mixture of
+  `Vector`-ized distributions yet has samples differentiable in the
+  distribution's parameters (aka "reparameterized"). It has an analytical
+  density function with `O(dKQ)` complexity. `d` is the vector dimensionality,
+  `K` is the number of components, and `Q` is the number of quadrature points.
+  These properties make it well-suited for Bayesian Variational Inference, i.e.,
+  as a surrogate family for the posterior.
+
+  For large values of `mix_scale`, the `VectorDistribution` behaves increasingly
+  like a discrete mixture. (In most cases this limit is only achievable by also
+  increasing the quadrature polynomial degree, `Q`.)
+
+  The term `Vector` is consistent with similar named Tensorflow `Distribution`s.
+  For more details, see the "About `Vector` distributions in Tensorflow."
+  section.
+
+  The term `Diffeomixture` is a portmanteau of
+  [diffeomorphism](https://en.wikipedia.org/wiki/Diffeomorphism) and [compound
+  mixture](https://en.wikipedia.org/wiki/Compound_probability_distribution). For
+  more details, see the "About `Diffeomixture`s and reparametrization.`"
+  section.
+
+  #### Mathematical Details
+
+  The `VectorDiffeomixture` approximates a SoftmaxNormal-mixed ("prior")
+  [compound distribution](
+  https://en.wikipedia.org/wiki/Compound_probability_distribution).
+  Using variable-substitution and [Gauss-Hermite quadrature](
+  https://en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature) we can
+  redefine the distribution to be a parameter-less convex combination of `K`
+  different affine combinations of a `d` iid samples from `distribution`.
+
+  That is, defined over `R**d` this distribution is parameterized by a
+  (batch of) length-`K` `mix_loc` and `mix_scale` vectors, a length-`K` list of
+  (a batch of) length-`d` `loc` vectors, and a length-`K` list of `scale`
+  `LinearOperator`s each operating on a (batch of) length-`d` vector space.
+  Finally, a `distribution` parameter specifies the underlying base distribution
+  which is "lifted" to become multivariate ("lifting" is the same concept as in
+  `TransformedDistribution`).
+
+  The probability density function (pdf) is,
+
+  ```none
+  pdf(y; mix_loc, mix_scale, loc, scale, phi)
+    = sum{ prob[i] phi(f_inverse(x; i)) / abs(det(interp_scale[i]))
+          : i=0, ..., Q-1 }
+  ```
+
+  where, `phi` is the base distribution pdf, and,
+
+  ```none
+  f_inverse(x; i) = inv(interp_scale[i]) @ (x - interp_loc[i])
+  interp_loc[i]   = sum{ lambda[k; i] loc[k]   : k=0, ..., K-1 }
+  interp_scale[i] = sum{ lambda[k; i] scale[k] : k=0, ..., K-1 }
+  ```
+
+  and,
+
+  ```none
+  grid, weight = np.polynomial.hermite.hermgauss(quadrature_polynomial_degree)
+  prob[k]   = weight[k] / sqrt(pi)
+  lambda[k; i] = sigmoid(mix_loc[k] + sqrt(2) mix_scale[k] grid[i])
+  ```
+
+  The distribution corresponding to `phi` must be a scalar-batch, scalar-event
+  distribution. Typically it is reparameterized. If not, it must be a function
+  of non-trainable parameters.
+
+  WARNING: If you backprop through a VectorDiffeomixture sample and the "base"
+  distribution is both: not `FULLY_REPARAMETERIZED` and a function of trainable
+  variables, then the gradient is not guaranteed correct!
+
+  #### About `Vector` distributions in TensorFlow.
+
+  The `VectorDiffeomixture` is a non-standard distribution that has properties
+  particularly useful in [variational Bayesian
+  methods](https://en.wikipedia.org/wiki/Variational_Bayesian_methods).
+
+  Conditioned on a draw from the SoftmaxNormal, `Y|v` is a vector whose
+  components are linear combinations of affine transformations, thus is itself
+  an affine transformation. Therefore `Y|v` lives in the vector space generated
+  by vectors of affine-transformed distributions.
+
+  Note: The marginals `Y_1|v, ..., Y_d|v` are *not* generally identical to some
+  parameterization of `distribution`.  This is due to the fact that the sum of
+  draws from `distribution` are not generally itself the same `distribution`.
+
+  #### About `Diffeomixture`s and reparameterization.
+
+  The `VectorDiffeomixture` is designed to be reparameterized, i.e., its
+  parameters are only used to transform samples from a distribution which has no
+  trainable parameters. This property is important because backprop stops at
+  sources of stochasticity. That is, as long as the parameters are used *after*
+  the underlying source of stochasticity, the computed gradient is accurate.
+
+  Reparametrization means that we can use gradient-descent (via backprop) to
+  optimize Monte-Carlo objectives. Such objectives are a finite-sample
+  approximation of an expectation and arise throughout scientific computing.
+
+  #### Examples
+
+  ```python
+  ds = tf.contrib.distributions
+  la = tf.contrib.linalg
+
+  # Create two batches of VectorDiffeomixtures, one with mix_loc=[0.] and
+  # another with mix_loc=[1]. In both cases, `K=2` and the affine
+  # transformations involve:
+  # k=0: loc=zeros(dims)  scale=LinearOperatorScaledIdentity
+  # k=1: loc=[2.]*dims    scale=LinOpDiag
+  dims = 5
+  vdm = ds.VectorDiffeomixture(
+      mix_loc=[[0.], [1]],
+      mix_scale=[1.],
+      distribution=ds.Normal(loc=0., scale=1.),
+      loc=[
+          None,  # Equivalent to `np.zeros(dims, dtype=np.float32)`.
+          np.float32([2.]*dims),
+      ],
+      scale=[
+          la.LinearOperatorScaledIdentity(
+            num_rows=dims,
+            multiplier=np.float32(1.1),
+            is_positive_definite=True),
+          la.LinearOperatorDiag(
+            diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
+            is_positive_definite=True),
+      ],
+      validate_args=True)
+  """
+
+  def __init__(self,
+               mix_loc,
+               mix_scale,
+               distribution,
+               loc=None,
+               scale=None,
+               quadrature_polynomial_degree=8,
+               validate_args=False,
+               allow_nan_stats=True,
+               name="VectorLocationScaleDiffeomixture"):
+    """Constructs the VectorDiffeomixture on `R**k`.
+
+    Args:
+      mix_loc: `float`-like `Tensor`. Represents the `location` parameter of the
+        SoftmaxNormal used for selecting one of the `K` affine transformations.
+      mix_scale: `float`-like `Tensor`. Represents the `scale` parameter of the
+        SoftmaxNormal used for selecting one of the `K` affine transformations.
+      distribution: `tf.Distribution`-like instance. Distribution from which `d`
+        iid samples are used as input to the selected affine transformation.
+        Must be a scalar batch, scalar event distribution.  Typically
+        `distribution.reparameterization_type = FULLY_REPARAMETERIZED` or it is
+        a function of non-trainable parameters. WARNING: If you backprop through
+        a VectorDiffeomixture sample and the `distribution` is not
+        `FULLY_REPARAMETERIZED` yet is a function of trainable variables, then
+        the gradient will be incorrect!
+      loc: Length-`K` list of `float`-type `Tensor`s. The `k`-th element
+        represents the `shift` used for the `k`-th affine transformation.  If
+        the `k`-th item is `None`, `loc` is implicitly `0`.  When specified,
+        must have shape `[B1, ..., Bb, d]` where `b >= 0` and `d` is the event
+        size.
+      scale: Length-`K` list of `LinearOperator`s. Each should be
+        positive-definite and operate on a `d`-dimensional vector space. The
+        `k`-th element represents the `scale` used for the `k`-th affine
+        transformation. `LinearOperator`s must have shape `[B1, ..., Bb, d, d]`,
+        `b >= 0`, i.e., characterizes `b`-batches of `d x d` matrices
+      quadrature_polynomial_degree: Python `int`-like scalar.
+      validate_args: Python `bool`, default `False`. When `True` distribution
+        parameters are checked for validity despite possibly degrading runtime
+        performance. When `False` invalid inputs may silently render incorrect
+        outputs.
+      allow_nan_stats: Python `bool`, default `True`. When `True`,
+        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
+        indicate the result is undefined. When `False`, an exception is raised
+        if one or more of the statistic's batch members are undefined.
+      name: Python `str` name prefixed to Ops created by this class.
+
+    Raises:
+      ValueError: if `not scale or len(scale) < 2`.
+      ValueError: if `len(loc) != len(scale)`
+      ValueError: if `quadrature_polynomial_degree < 1`.
+      ValueError: if `validate_args` and any not scale.is_positive_definite.
+      TypeError: if any scale.dtype != scale[0].dtype.
+      TypeError: if any loc.dtype != scale[0].dtype.
+      NotImplementedError: if `len(scale) != 2`.
+    """
+    parameters = locals()
+    with ops.name_scope(name, values=[mix_loc, mix_scale]):
+      if not scale or len(scale) < 2:
+        raise ValueError("Must specify list (or list-like object) of scale "
+                         "LinearOperators, one for each component with "
+                         "num_component >= 2.")
+
+      if loc is None:
+        loc = [None]*len(scale)
+
+      if len(loc) != len(scale):
+        raise ValueError("loc/scale must be same-length lists "
+                         "(or same-length list-like objects).")
+
+      dtype = scale[0].dtype.base_dtype
+
+      loc = [ops.convert_to_tensor(loc_, dtype=dtype, name="loc{}".format(k))
+             if loc_ is not None else None
+             for k, loc_ in enumerate(loc)]
+
+      for k, scale_ in enumerate(scale):
+        if validate_args and not scale_.is_positive_definite:
+          raise ValueError("scale[{}].is_positive_definite = {} != True".format(
+              k, scale_.is_positive_definite))
+        if scale_.dtype.base_dtype != dtype:
+          raise TypeError(
+              "dtype mismatch; scale[{}].base_dtype=\"{}\" != \"{}\"".format(
+                  k, scale_.dtype.base_dtype.name, dtype.name))
+
+      self._endpoint_affine = [
+          AffineLinearOperator(shift=loc_,
+                               scale=scale_,
+                               event_ndims=1,
+                               validate_args=validate_args,
+                               name="endpoint_affine_{}".format(k))
+          for k, (loc_, scale_) in enumerate(zip(loc, scale))]
+
+      if quadrature_polynomial_degree < 1:
+        raise ValueError("quadrature_polynomial_degree={} "
+                         "is not at least 1".format(
+                             quadrature_polynomial_degree))
+      self._degree = quadrature_polynomial_degree
+
+      # TODO(jvdillon): Remove once we support k-mixtures.
+      # We make this assertion here because otherwise `grid` would need to be a
+      # vector not a scalar.
+      if len(scale) != 2:
+        raise NotImplementedError("Currently only bimixtures are supported; "
+                                  "len(scale)={} is not 2.".format(len(scale)))
+
+      grid, prob = np.polynomial.hermite.hermgauss(
+          deg=quadrature_polynomial_degree)
+      grid = grid.astype(dtype.as_numpy_dtype)
+      prob = prob.astype(dtype.as_numpy_dtype)
+      prob /= np.linalg.norm(prob, ord=1)
+
+      self._mixture_distribution = categorical_lib.Categorical(
+          logits=np.log(prob),
+          validate_args=validate_args,
+          allow_nan_stats=allow_nan_stats)
+
+      mix_loc = maybe_check_mix_param(
+          mix_loc, "mix_loc", dtype, validate_args)
+      mix_scale = maybe_check_mix_param(
+          mix_scale, "mix_scale", dtype, validate_args)
+
+      distribution_assertions = maybe_check_distribution(
+          distribution, dtype, validate_args)
+      if distribution_assertions:
+        mix_loc = control_flow_ops.with_dependencies(
+            distribution_assertions, mix_loc)
+      self._distribution = distribution
+
+      # shape: [B, deg]
+      self._interpolate_weight = math_ops.sigmoid(
+          mix_loc
+          + np.sqrt(2.) * mix_scale * grid)
+
+      self._interpolated_affine = [
+          AffineLinearOperator(shift=loc_,
+                               scale=scale_,
+                               event_ndims=1,
+                               validate_args=validate_args,
+                               name="interpolated_affine_{}".format(k))
+          for k, (loc_, scale_) in enumerate(zip(
+              interpolate_loc(quadrature_polynomial_degree,
+                              self._interpolate_weight,
+                              loc),
+              interpolate_scale(quadrature_polynomial_degree,
+                                self._interpolate_weight,
+                                scale)))]
+
+      self._batch_shape_, self._event_shape_ = determine_batch_event_shapes(
+          mix_loc, mix_scale, self._endpoint_affine)
+
+      super(VectorDiffeomixture, self).__init__(
+          dtype=dtype,
+          # We hard-code `FULLY_REPARAMETERIZED` because when
+          # `validate_args=True` we verify that indeed
+          # `distribution.reparameterization_type == FULLY_REPARAMETERIZED`. A
+          # distribution which is a function of only non-trainable parameters
+          # also implies we can use `FULLY_REPARAMETERIZED`. However, we cannot
+          # easily test for that possibility thus we use `validate_args=False`
+          # as a "back-door" to allow users a way to use non
+          # `FULLY_REPARAMETERIZED` distribution. In such cases IT IS THE USERS
+          # RESPONSIBILITY to verify that the base distribution is a function of
+          # non-trainable parameters.
+          reparameterization_type=distribution_lib.FULLY_REPARAMETERIZED,
+          validate_args=validate_args,
+          allow_nan_stats=allow_nan_stats,
+          parameters=parameters,
+          graph_parents=(
+              [mix_loc, mix_scale]
+              + distribution._graph_parents  # pylint: disable=protected-access
+              + [loc_ for loc_ in loc if loc_ is not None]
+              + [p for scale_ in scale for p in scale_.graph_parents]),
+          name=name)
+
+  @property
+  def mixture_distribution(self):
+    return self._mixture_distribution
+
+  @property
+  def distribution(self):
+    return self._distribution
+
+  @property
+  def interpolate_weight(self):
+    return self._interpolate_weight
+
+  @property
+  def endpoint_affine(self):
+    return self._endpoint_affine
+
+  @property
+  def interpolated_affine(self):
+    return self._interpolated_affine
+
+  def _batch_shape_tensor(self):
+    return self._batch_shape_
+
+  def _batch_shape(self):
+    return tensor_shape.TensorShape(static_value(self._batch_shape_))
+
+  def _event_shape_tensor(self):
+    return self._event_shape_
+
+  def _event_shape(self):
+    return tensor_shape.TensorShape(static_value(self._event_shape_))
+
+  def _sample_n(self, n, seed=None):
+    batch_size = reduce_prod(self.batch_shape_tensor())
+    x = self.distribution.sample(
+        sample_shape=concat_vectors(
+            [n * batch_size],
+            self.event_shape_tensor()),
+        seed=seed)
+    x = [array_ops.reshape(
+        aff.forward(x),
+        shape=concat_vectors(
+            [-1],
+            self.batch_shape_tensor(),
+            self.event_shape_tensor()))
+         for aff in self.endpoint_affine]
+
+    # Get ids as a [n, batch_size]-shaped matrix, unless batch_shape=[] then get
+    # ids as a [n]-shaped vector.
+    ids = self._mixture_distribution.sample(
+        sample_shape=concat_vectors(
+            [n],
+            distribution_util.pick_vector(
+                self.is_scalar_batch(),
+                np.int32([]),
+                [batch_size])),
+        seed=distribution_util.gen_new_seed(
+            seed, "vector_diffeomixture"))
+
+    # Stride `self._degree` for `batch_size` number of times.
+    offset = math_ops.range(start=0,
+                            limit=batch_size * self._degree,
+                            delta=self._degree,
+                            dtype=ids.dtype)
+
+    weight = array_ops.gather(
+        array_ops.reshape(self.interpolate_weight, shape=[-1]),
+        ids + offset)
+    weight = weight[..., array_ops.newaxis]
+
+    # Alternatively:
+    # x = weight * x[0] + (1. - weight) * x[1]
+    x = weight * (x[0] - x[1]) + array_ops.ones_like(x[0]) * x[1]
+
+    return x
+
+  def _log_prob(self, x):
+    # By convention, we always put the the grid points right-most.
+    y = array_ops.stack(
+        [aff.inverse(x) for aff in self.interpolated_affine],
+        axis=-1)
+    log_prob = math_ops.reduce_sum(self.distribution.log_prob(y), axis=-2)
+    # Because the affine transformation has a constant Jacobian, it is the case
+    # that `affine.fldj(x) = -affine.ildj(x)`. This is not true in general.
+    fldj = array_ops.stack(
+        [aff.forward_log_det_jacobian(x) for aff in self.interpolated_affine],
+        axis=-1)
+    return math_ops.reduce_logsumexp(
+        self.mixture_distribution.logits - fldj + log_prob, axis=-1)
+
+  def _mean(self):
+    # Since we created logits to already be scaled, we can use exp which is
+    # slightly cheaper than `self.mixture_distribution.probs`.
+    p = math_ops.exp(self.mixture_distribution.logits)
+
+    m = array_ops.tile(self.distribution.mean()[..., array_ops.newaxis],
+                       multiples=self.event_shape_tensor())
+    m = m[..., array_ops.newaxis, :]
+    mean = None
+    for k, aff in enumerate(self.interpolated_affine):
+      # aff.forward is going to do this:
+      # y = array_ops.squeeze(aff.scale.matmul(m), axis=[-1])
+      # if aff.shift is not None:
+      #   y += aff.shift
+      mean = add(mean, p[..., k] * aff.forward(m))
+    return mean
+
+  def _covariance(self):
+    # Law of total variance:
+    #
+    # Cov[Z] = E[Cov[Z | V]] + Cov[E[Z | V]]
+    #
+    # where,
+    #
+    # E[Cov[Z | V]] = sum_i mix_prob[i] Scale[i]
+    # Cov[E[Z | V]] = sum_i mix_prob[i] osquare(loc[i])
+    #                  - osquare(sum_i mix_prob[i] loc[i])
+    #
+    # osquare(x) = x.transpose @ x
+    return add(
+        self._mean_of_covariance_given_quadrature_component(diag_only=False),
+        self._covariance_of_mean_given_quadrature_component(diag_only=False))
+
+  def _variance(self):
+    # Equivalent to: tf.diag_part(self._covariance()),
+    return add(
+        self._mean_of_covariance_given_quadrature_component(diag_only=True),
+        self._covariance_of_mean_given_quadrature_component(diag_only=True))
+
+  def _mean_of_covariance_given_quadrature_component(self, diag_only):
+    # Since we created logits to already be scaled, we can use exp which is
+    # slightly cheaper than `self.mixture_distribution.probs`.
+    p = math_ops.exp(self.mixture_distribution.logits)
+
+    # To compute E[Cov(Z|V)], we'll add matrices within three categories:
+    # scaled-identity, diagonal, and full. Then we'll combine these at the end.
+    scaled_identity = None
+    diag = None
+    full = None
+
+    for k, aff in enumerate(self.interpolated_affine):
+      s = aff.scale  # Just in case aff.scale has side-effects, we'll call once.
+      if (s is None
+          or isinstance(s, linop_identity_lib.LinearOperatorIdentity)):
+        scaled_identity = add(scaled_identity, p[..., k, array_ops.newaxis])
+      elif isinstance(s, linop_identity_lib.LinearOperatorScaledIdentity):
+        scaled_identity = add(scaled_identity, (p[..., k, array_ops.newaxis] *
+                                                math_ops.square(s.multiplier)))
+      elif isinstance(s, linop_diag_lib.LinearOperatorDiag):
+        diag = add(diag, (p[..., k, array_ops.newaxis] *
+                          math_ops.square(s.diag_part())))
+      else:
+        x = (p[..., k, array_ops.newaxis, array_ops.newaxis] *
+             s.matmul(s.to_dense(), adjoint_arg=True))
+        if diag_only:
+          x = array_ops.matrix_diag_part(x)
+        full = add(full, x)
+
+    # We must now account for the fact that the base distribution might have a
+    # non-unity variance. Recall that `Cov(SX+m) = S.T Cov(X) S = S.T S Var(X)`.
+    # We can scale by `Var(X)` (vs `Cov(X)`) since X corresponds to `d` iid
+    # samples from a scalar-event distribution.
+    v = self.distribution.variance()
+    if scaled_identity is not None:
+      scaled_identity *= v
+    if diag is not None:
+      diag *= v[..., array_ops.newaxis]
+    if full is not None:
+      full *= v[..., array_ops.newaxis]
+
+    if diag_only:
+      # Apparently we don't need the full matrix, just the diagonal.
+      r = add(diag, full)
+      if r is None and scaled_identity is not None:
+        ones = array_ops.ones(self.event_shape_tensor(), dtype=self.dtype)
+        return scaled_identity * ones
+      return add(r, scaled_identity)
+
+    # `None` indicates we don't know if the result is positive-definite.
+    is_positive_definite = (True if all(aff.scale.is_positive_definite
+                                        for aff in self.endpoint_affine)
+                            else None)
+
+    to_add = []
+    if diag is not None:
+      to_add.append(linop_diag_lib.LinearOperatorDiag(
+          diag=diag,
+          is_positive_definite=is_positive_definite))
+    if full is not None:
+      to_add.append(linop_full_lib.LinearOperatorFullMatrix(
+          matrix=full,
+          is_positive_definite=is_positive_definite))
+    if scaled_identity is not None:
+      to_add.append(linop_identity_lib.LinearOperatorScaledIdentity(
+          num_rows=self.event_shape_tensor()[0],
+          multiplier=scaled_identity,
+          is_positive_definite=is_positive_definite))
+
+    return (linop_add_lib.add_operators(to_add)[0].to_dense()
+            if to_add else None)
+
+  def _covariance_of_mean_given_quadrature_component(self, diag_only):
+    square = math_ops.square if diag_only else vec_osquare
+
+    # Since we created logits to already be scaled, we can use exp which is
+    # slightly cheaper than `self.mixture_distribution.probs`.
+    p = math_ops.exp(self.mixture_distribution.logits)
+
+    m = array_ops.tile(self.distribution.mean()[..., array_ops.newaxis],
+                       multiples=self.event_shape_tensor())
+    m = m[..., array_ops.newaxis, :]
+
+    cov_e_z_given_v = None
+    e_z_given_v = self._mean()
+    for k, aff in enumerate(self.interpolated_affine):
+      y = aff.forward(m)
+      cov_e_z_given_v = add(cov_e_z_given_v,
+                            p[..., k] * square(y - e_z_given_v))
+
+    return cov_e_z_given_v
+
+
+def maybe_check_mix_param(param, name, expected_base_dtype, validate_args):
+  """Helper which checks validity of `mix_loc` and `mix_scale` init args."""
+  with ops.name_scope(name="check_" + name, values=[param]):
+    param = ops.convert_to_tensor(param, dtype=expected_base_dtype, name=name)
+
+    if param.dtype.base_dtype != expected_base_dtype:
+      raise TypeError(
+          "dtype mismatch; {}.base_dtype=\"{}\" is not \"{}\".".format(
+              name, param.dtype.base_dtype.name, expected_base_dtype.name))
+
+    assertions = []
+    if param.shape.ndims is not None:
+      if param.shape.ndims == 0:
+        raise ValueError("Mixing params must be a (batch of) vector; "
+                         "{}.rank={} is not at least one.".format(
+                             name, param.shape.ndims))
+    elif validate_args:
+      assertions.append(check_ops.assert_rank_at_least(
+          param, 1,
+          message=("Mixing params must be a (batch of) vector; "
+                   "{}.rank is not at least one.".format(
+                       name))))
+
+    # TODO(jvdillon): Remove once we support k-mixtures.
+    if param.shape.with_rank_at_least(1)[-1] is not None:
+      if param.shape[-1].value != 1:
+        raise NotImplementedError("Currently only bimixtures are supported; "
+                                  "{}.shape[-1]={} is not 1.".format(
+                                      name, param.shape[-1].value))
+    elif validate_args:
+      assertions.append(check_ops.assert_equal(
+          array_ops.shape(param)[-1], 1,
+          message=("Currently only bimixtures are supported; "
+                   "{}.shape[-1] is not 1.".format(name))))
+
+    if assertions:
+      return control_flow_ops.with_dependencies(assertions, param)
+    return param
+
+
+def maybe_check_distribution(distribution, expected_base_dtype, validate_args):
+  """Helper which checks validity of `distribution` init arg."""
+  if distribution.dtype != expected_base_dtype:
+    raise TypeError("dtype mismatch; "
+                    "distribution.dtype=\"{}\" is not \"{}\"".format(
+                        distribution.dtype.name, expected_base_dtype.name))
+
+  # Although `reparameterization_type` is a static property, we guard it by
+  # `validate_args`. This allows users to use a `distribution` which is not
+  # reparameterized itself. However, we tacitly assume that although the
+  # distribution is not reparameterized, it only depends on non-trainable
+  # variables.
+  if validate_args and (distribution.reparameterization_type
+                        != distribution_lib.FULLY_REPARAMETERIZED):
+    raise ValueError("Base distribution should be reparameterized or be "
+                     "a function of non-trainable variables; "
+                     "distribution.reparameterization_type = \"{}\" "
+                     "!= \"FULLY_REPARAMETERIZED\".".format(
+                         distribution.reparameterization_type))
+  with ops.name_scope(name="check_distribution"):
+    assertions = []
+    def check_is_scalar(is_scalar, name):
+      is_scalar_ = static_value(is_scalar)
+      if is_scalar_ is not None:
+        if not is_scalar_:
+          raise ValueError("distribution must be scalar; "
+                           "distribution.{}=False is not True".format(name))
+      elif validate_args:
+        assertions.append(check_ops.assert_equal(
+            is_scalar, True,
+            message=("distribution must be scalar; "
+                     "distribution.{}=False is not True".format(name))))
+    check_is_scalar(distribution.is_scalar_event(), "is_scalar_event")
+    check_is_scalar(distribution.is_scalar_batch(), "is_scalar_batch")
+    return assertions
+
+
+def determine_batch_event_shapes(mix_loc, mix_scale, endpoint_affine):
+  """Helper to infer batch_shape and event_shape."""
+  with ops.name_scope(name="determine_batch_event_shapes"):
+    mix_batch_shape = distribution_util.prefer_static_broadcast_shape(
+        array_ops.shape(mix_loc, name="mix_loc_shape"),
+        array_ops.shape(mix_scale, name="mix_scale_shape"))
+    if isinstance(mix_batch_shape, tensor_shape.TensorShape):
+      mix_batch_shape = mix_batch_shape.with_rank_at_least(1)[:-1]
+    else:
+      s = static_value(mix_batch_shape)
+      if s is not None:
+        mix_batch_shape = ops.convert_to_tensor(
+            s[:-1], dtype=dtypes.int32, name="mix_batch_shape")
+      else:
+        mix_batch_shape = mix_batch_shape[:-1]
+
+    # We broadcast with a 1D constant to automatically make the result a
+    # TensorShape if possible.
+    batch_shape = distribution_util.prefer_static_broadcast_shape(
+        mix_batch_shape,
+        constant_op.constant([], dtype=dtypes.int32, name="batch_shape"))
+    event_shape = constant_op.constant(
+        [], dtype=dtypes.int32, name="event_shape")
+    for aff in endpoint_affine:
+      b, e = distribution_util.shapes_from_loc_and_scale(aff.shift, aff.scale)
+      if batch_shape is None:
+        batch_shape = distribution_util.prefer_static_broadcast_shape(
+            mix_batch_shape, b)
+      else:
+        batch_shape = distribution_util.prefer_static_broadcast_shape(
+            batch_shape, b)
+      event_shape = distribution_util.prefer_static_broadcast_shape(
+          event_shape, e)
+    if isinstance(batch_shape, tensor_shape.TensorShape):
+      batch_shape = ops.convert_to_tensor(
+          batch_shape.as_list(), dtype=dtypes.int32, name="batch_shape")
+    if isinstance(event_shape, tensor_shape.TensorShape):
+      event_shape = ops.convert_to_tensor(
+          event_shape.as_list(), dtype=dtypes.int32, name="event_shape")
+    return batch_shape, event_shape
+
+
+def interpolate_loc(deg, interpolate_weight, loc):
+  """Helper which interpolates between two locs."""
+  if len(loc) != 2:
+    raise NotImplementedError("Currently only bimixtures are supported; "
+                              "len(scale)={} is not 2.".format(len(loc)))
+  with ops.name_scope("interpolate_loc", values=[interpolate_weight, loc]):
+    if loc is None or loc[0] is None and loc[1] is None:
+      return [None]*deg
+    if loc[0] is None:
+      x = (1. - interpolate_weight[..., array_ops.newaxis]) * loc[1]
+    elif loc[1] is None:
+      x = interpolate_weight[..., array_ops.newaxis] * loc[0]
+    else:
+      delta = loc[0] - loc[1]
+      offset = array_ops.ones_like(loc[0]) * loc[1]
+      x = interpolate_weight[..., array_ops.newaxis] * delta + offset
+    return [x[..., k, :] for k in range(deg)]
+
+
+def interpolate_scale(deg, interpolate_weight, scale):
+  """Helper which interpolates between two scales."""
+  if len(scale) != 2:
+    raise NotImplementedError("Currently only bimixtures are supported; "
+                              "len(scale)={} is not 2.".format(len(scale)))
+  with ops.name_scope("interpolate_scale", values=[interpolate_weight]):
+    return [linop_add_lib.add_operators([
+        linop_scale(interpolate_weight[..., k], scale[0]),
+        linop_scale(1. - interpolate_weight[..., k], scale[1]),
+    ])[0] for k in range(deg)]
+
+
+def linop_scale(w, op):
+  # We assume w > 0. (This assumption only relates to the is_* attributes.)
+  with ops.name_scope("linop_scale", values=[w]):
+    def scaled_identity(w):
+      return linop_identity_lib.LinearOperatorScaledIdentity(
+          num_rows=op.range_dimension_tensor(),
+          multiplier=w,
+          is_non_singular=op.is_non_singular,
+          is_self_adjoint=op.is_self_adjoint,
+          is_positive_definite=op.is_positive_definite)
+    if isinstance(op, linop_identity_lib.LinearOperatorIdentity):
+      return scaled_identity(w)
+    elif isinstance(op, linop_identity_lib.LinearOperatorScaledIdentity):
+      return scaled_identity(w * op.multiplier)
+    elif isinstance(op, linop_diag_lib.LinearOperatorDiag):
+      return linop_diag_lib.LinearOperatorDiag(
+          diag=w[..., array_ops.newaxis] * op.diag_part(),
+          is_non_singular=op.is_non_singular,
+          is_self_adjoint=op.is_self_adjoint,
+          is_positive_definite=op.is_positive_definite)
+    else:
+      return linop_composition_lib.LinearOperatorComposition([
+          scaled_identity(w), op])
+
+
+def static_value(x):
+  """Returns the static value of a `Tensor` or `None`."""
+  return tensor_util.constant_value(ops.convert_to_tensor(x))
+
+
+def concat_vectors(*args):
+  """Concatenates input vectors, statically if possible."""
+  args_ = [static_value(x) for x in args]
+  if any(vec is None for vec in args_):
+    return array_ops.concat(args, axis=0)
+  return [val for vec in args_ for val in vec]
+
+
+def reduce_prod(x):
+  """Same as `math_ops.reduce_prod` but statically if possible."""
+  x_ = static_value(x)
+  if x_ is not None:
+    return np.prod(x_, dtype=x.dtype.as_numpy_dtype)
+  return array_ops.reduce_prod(x)
+
+
+def ndims_from_shape(shape):
+  """Returns `Tensor`'s `rank` implied by a `Tensor` shape."""
+  if shape.shape.ndims not in (None, 1):
+    raise ValueError("input is not a valid shape: not 1D")
+  if not shape.dtype.is_integer:
+    raise TypeError("input is not a valid shape: wrong dtype")
+  if shape.shape.is_fully_defined():
+    return shape.shape.as_list()[0]
+  return array_ops.shape(shape)[0]
+
+
+def ndims(x):
+  """Returns rank, statically if possible."""
+  x = ops.convert_to_tensor(x)
+  if x.shape.ndims is not None:
+    return x.shape.ndims
+  return array_ops.rank(x)
+
+
+def add(x, y):
+  """Adds inputs; interprets `None` as zero."""
+  if x is None:
+    return y
+  if y is None:
+    return x
+  return x + y
+
+
+def vec_osquare(x):
+  """Computes the outer-product of a (batch of) vector, i.e., x.T x."""
+  return x[..., :, array_ops.newaxis] * x[..., array_ops.newaxis, :]
diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
new file mode 100644
index 0000000000000000000000000000000000000000..c88572e17fa43ac11778bdddc02484d284b6eb36
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
@@ -0,0 +1,198 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Distribution of a vectorized Exponential, with uncorrelated components."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.distributions.python.ops import vector_exponential_linear_operator as vector_exponential_linop
+from tensorflow.python.framework import ops
+
+
+__all__ = [
+    "VectorExponentialDiag",
+]
+
+
+class VectorExponentialDiag(
+    vector_exponential_linop.VectorExponentialLinearOperator):
+  """The vectorization of the Exponential distribution on `R^k`.
+
+  The vector exponential distribution is defined over a subset of `R^k`, and
+  parameterized by a (batch of) length-`k` `loc` vector and a (batch of) `k x k`
+  `scale` matrix:  `covariance = scale @ scale.T`, where `@` denotes
+  matrix-multiplication.
+
+  #### Mathematical Details
+
+  The probability density function (pdf) is defined over the image of the
+  `scale` matrix + `loc`, applied to the positive half-space:
+  `Supp = {loc + scale @ x : x in R^k, x_1 > 0, ..., x_k > 0}`.  On this set,
+
+  ```none
+  pdf(y; loc, scale) = exp(-||x||_1) / Z,  for y in Supp
+  x = inv(scale) @ (y - loc),
+  Z = |det(scale)|,
+  ```
+
+  where:
+
+  * `loc` is a vector in `R^k`,
+  * `scale` is a linear operator in `R^{k x k}`, `cov = scale @ scale.T`,
+  * `Z` denotes the normalization constant, and,
+  * `||x||_1` denotes the `l1` norm of `x`, `sum_i |x_i|`.
+
+  The VectorExponential distribution is a member of the [location-scale
+  family](https://en.wikipedia.org/wiki/Location-scale_family), i.e., it can be
+  constructed as,
+
+  ```none
+  X = (X_1, ..., X_k), each X_i ~ Exponential(rate=1)
+  Y = (Y_1, ...,Y_k) = scale @ X + loc
+  ```
+
+  #### About `VectorExponential` and `Vector` distributions in TensorFlow.
+
+  The `VectorExponential` is a non-standard distribution that has useful
+  properties.
+
+  The marginals `Y_1, ..., Y_k` are *not* Exponential random variables, due to
+  the fact that the sum of Exponential random variables is not Exponential.
+
+  Instead, `Y` is a vector whose components are linear combinations of
+  Exponential random variables.  Thus, `Y` lives in the vector space generated
+  by `vectors` of Exponential distributions.  This allows the user to decide the
+  mean and covariance (by setting `loc` and `scale`), while preserving some
+  properties of the Exponential distribution.  In particular, the tails of `Y_i`
+  will be (up to polynomial factors) exponentially decaying.
+
+  To see this last statement, note that the pdf of `Y_i` is the convolution of
+  the pdf of `k` independent Exponential random variables.  One can then show by
+  induction that distributions with exponential (up to polynomial factors) tails
+  are closed under convolution.
+
+
+  #### Examples
+
+  ```python
+  ds = tf.contrib.distributions
+  la = tf.contrib.linalg
+
+  # Initialize a single 2-variate VectorExponential, supported on
+  # {(x, y) in R^2 : x > 0, y > 0}.
+
+  # The first component has pdf exp{-x}, the second 0.5 exp{-x / 2}
+  vex = ds.VectorExponentialDiag(scale_diag=[1., 2.])
+
+  # Compute the pdf of an`R^2` observation; return a scalar.
+  vex.prob([3., 4.]).eval()  # shape: []
+
+  # Initialize a 2-batch of 3-variate Vector Exponential's.
+  loc = [[1., 2, 3],
+         [1., 0, 0]]              # shape: [2, 3]
+  scale_diag = [[1., 2, 3],
+                [0.5, 1, 1.5]]     # shape: [2, 3]
+
+  vex = ds.VectorExponentialDiag(loc, scale_diag)
+
+  # Compute the pdf of two `R^3` observations; return a length-2 vector.
+  x = [[1.9, 2.2, 3.1],
+       [10., 1.0, 9.0]]     # shape: [2, 3]
+  vex.prob(x).eval()    # shape: [2]
+  ```
+
+  """
+
+  def __init__(self,
+               loc=None,
+               scale_diag=None,
+               scale_identity_multiplier=None,
+               validate_args=False,
+               allow_nan_stats=True,
+               name="VectorExponentialDiag"):
+    """Construct Vector Exponential distribution supported on a subset of `R^k`.
+
+    The `batch_shape` is the broadcast shape between `loc` and `scale`
+    arguments.
+
+    The `event_shape` is given by last dimension of the matrix implied by
+    `scale`. The last dimension of `loc` (if provided) must broadcast with this.
+
+    Recall that `covariance = scale @ scale.T`.
+
+    ```none
+    scale = diag(scale_diag + scale_identity_multiplier * ones(k))
+    ```
+
+    where:
+
+    * `scale_diag.shape = [k]`, and,
+    * `scale_identity_multiplier.shape = []`.
+
+    Additional leading dimensions (if any) will index batches.
+
+    If both `scale_diag` and `scale_identity_multiplier` are `None`, then
+    `scale` is the Identity matrix.
+
+    Args:
+      loc: Floating-point `Tensor`. If this is set to `None`, `loc` is
+        implicitly `0`. When specified, may have shape `[B1, ..., Bb, k]` where
+        `b >= 0` and `k` is the event size.
+      scale_diag: Non-zero, floating-point `Tensor` representing a diagonal
+        matrix added to `scale`. May have shape `[B1, ..., Bb, k]`, `b >= 0`,
+        and characterizes `b`-batches of `k x k` diagonal matrices added to
+        `scale`. When both `scale_identity_multiplier` and `scale_diag` are
+        `None` then `scale` is the `Identity`.
+      scale_identity_multiplier: Non-zero, floating-point `Tensor` representing
+        a scaled-identity-matrix added to `scale`. May have shape
+        `[B1, ..., Bb]`, `b >= 0`, and characterizes `b`-batches of scaled
+        `k x k` identity matrices added to `scale`. When both
+        `scale_identity_multiplier` and `scale_diag` are `None` then `scale` is
+        the `Identity`.
+      validate_args: Python `bool`, default `False`. When `True` distribution
+        parameters are checked for validity despite possibly degrading runtime
+        performance. When `False` invalid inputs may silently render incorrect
+        outputs.
+      allow_nan_stats: Python `bool`, default `True`. When `True`,
+        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
+        indicate the result is undefined. When `False`, an exception is raised
+        if one or more of the statistic's batch members are undefined.
+      name: Python `str` name prefixed to Ops created by this class.
+
+    Raises:
+      ValueError: if at most `scale_identity_multiplier` is specified.
+    """
+    parameters = locals()
+    with ops.name_scope(name):
+      with ops.name_scope("init", values=[
+          loc, scale_diag, scale_identity_multiplier]):
+        # No need to validate_args while making diag_scale.  The returned
+        # LinearOperatorDiag has an assert_non_singular method that is called by
+        # the Bijector.
+        scale = distribution_util.make_diag_scale(
+            loc=loc,
+            scale_diag=scale_diag,
+            scale_identity_multiplier=scale_identity_multiplier,
+            validate_args=False,
+            assert_positive=False)
+    super(VectorExponentialDiag, self).__init__(
+        loc=loc,
+        scale=scale,
+        validate_args=validate_args,
+        allow_nan_stats=allow_nan_stats,
+        name=name)
+    self._parameters = parameters
diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
new file mode 100644
index 0000000000000000000000000000000000000000..7123165417ea010fa9da5263e429734d34df3dbd
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
@@ -0,0 +1,288 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Vectorized Exponential distribution class, directly using LinearOperator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib import linalg
+from tensorflow.contrib.distributions.python.ops import bijectors
+from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import exponential
+from tensorflow.python.ops.distributions import transformed_distribution
+
+__all__ = ["VectorExponentialLinearOperator"]
+
+_mvn_sample_note = """
+`value` is a batch vector with compatible shape if `value` is a `Tensor` whose
+shape can be broadcast up to either:
+
+```python
+self.batch_shape + self.event_shape
+```
+
+or
+
+```python
+[M1, ..., Mm] + self.batch_shape + self.event_shape
+```
+
+"""
+
+
+class VectorExponentialLinearOperator(
+    transformed_distribution.TransformedDistribution):
+  """The vectorization of the Exponential distribution on `R^k`.
+
+  The vector exponential distribution is defined over a subset of `R^k`, and
+  parameterized by a (batch of) length-`k` `loc` vector and a (batch of) `k x k`
+  `scale` matrix:  `covariance = scale @ scale.T`, where `@` denotes
+  matrix-multiplication.
+
+  #### Mathematical Details
+
+  The probability density function (pdf) is
+
+  ```none
+  pdf(y; loc, scale) = exp(-||x||_1) / Z,  for y in S(loc, scale),
+  x = inv(scale) @ (y - loc),
+  Z = |det(scale)|,
+  ```
+
+  where:
+
+  * `loc` is a vector in `R^k`,
+  * `scale` is a linear operator in `R^{k x k}`, `cov = scale @ scale.T`,
+  * `S = {loc + scale @ x : x in R^k, x_1 > 0, ..., x_k > 0}`, is an image of
+     the positive half-space,
+  * `||x||_1` denotes the `l1` norm of `x`, `sum_i |x_i|`,
+  * `Z` denotes the normalization constant.
+
+  The VectorExponential distribution is a member of the [location-scale
+  family](https://en.wikipedia.org/wiki/Location-scale_family), i.e., it can be
+  constructed as,
+
+  ```none
+  X = (X_1, ..., X_k), each X_i ~ Exponential(rate=1)
+  Y = (Y_1, ...,Y_k) = scale @ X + loc
+  ```
+
+  #### About `VectorExponential` and `Vector` distributions in TensorFlow.
+
+  The `VectorExponential` is a non-standard distribution that has useful
+  properties.
+
+  The marginals `Y_1, ..., Y_k` are *not* Exponential random variables, due to
+  the fact that the sum of Exponential random variables is not Exponential.
+
+  Instead, `Y` is a vector whose components are linear combinations of
+  Exponential random variables.  Thus, `Y` lives in the vector space generated
+  by `vectors` of Exponential distributions.  This allows the user to decide the
+  mean and covariance (by setting `loc` and `scale`), while preserving some
+  properties of the Exponential distribution.  In particular, the tails of `Y_i`
+  will be (up to polynomial factors) exponentially decaying.
+
+  To see this last statement, note that the pdf of `Y_i` is the convolution of
+  the pdf of `k` independent Exponential random variables.  One can then show by
+  induction that distributions with exponential (up to polynomial factors) tails
+  are closed under convolution.
+
+
+  #### Examples
+
+  ```python
+  ds = tf.contrib.distributions
+  la = tf.contrib.linalg
+
+  # Initialize a single 2-variate VectorExponential, supported on
+  # {(x, y) in R^2 : x > 0, y > 0}.
+  mat = [[1.0, 0.1],
+         [0.1, 1.0]]
+
+  vex = ds.VectorExponentialLinearOperator(
+      scale=la.LinearOperatorFullMatrix(mat))
+
+  # Compute the pdf of an`R^2` observation; return a scalar.
+  vex.prob([1., 2.]).eval()  # shape: []
+
+  # Initialize a 2-batch of 3-variate Vector Exponential's.
+  mu = [[1., 2, 3],
+        [1., 0, 0]]              # shape: [2, 3]
+  scale_diag = [[1., 2, 3],
+                [0.5, 1, 1.5]]     # shape: [2, 3]
+
+  vex = ds.VectorExponentialLinearOperator(
+      loc=mu,
+      scale=la.LinearOperatorDiag(scale_diag))
+
+  # Compute the pdf of two `R^3` observations; return a length-2 vector.
+  x = [[1.9, 2.2, 3.1],
+       [10., 1.0, 9.0]]     # shape: [2, 3]
+  vex.prob(x).eval()    # shape: [2]
+  ```
+
+  """
+
+  def __init__(self,
+               loc=None,
+               scale=None,
+               validate_args=False,
+               allow_nan_stats=True,
+               name="VectorExponentialLinearOperator"):
+    """Construct Vector Exponential distribution supported on a subset of `R^k`.
+
+    The `batch_shape` is the broadcast shape between `loc` and `scale`
+    arguments.
+
+    The `event_shape` is given by last dimension of the matrix implied by
+    `scale`. The last dimension of `loc` (if provided) must broadcast with this.
+
+    Recall that `covariance = scale @ scale.T`.
+
+    Additional leading dimensions (if any) will index batches.
+
+    Args:
+      loc: Floating-point `Tensor`. If this is set to `None`, `loc` is
+        implicitly `0`. When specified, may have shape `[B1, ..., Bb, k]` where
+        `b >= 0` and `k` is the event size.
+      scale: Instance of `LinearOperator` with same `dtype` as `loc` and shape
+        `[B1, ..., Bb, k, k]`.
+      validate_args: Python `bool`, default `False`. Whether to validate input
+        with asserts. If `validate_args` is `False`, and the inputs are
+        invalid, correct behavior is not guaranteed.
+      allow_nan_stats: Python `bool`, default `True`. If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
+      name: The name to give Ops created by the initializer.
+
+    Raises:
+      ValueError: if `scale` is unspecified.
+      TypeError: if not `scale.dtype.is_floating`
+    """
+    parameters = locals()
+    if scale is None:
+      raise ValueError("Missing required `scale` parameter.")
+    if not scale.dtype.is_floating:
+      raise TypeError("`scale` parameter must have floating-point dtype.")
+
+    with ops.name_scope(name, values=[loc] + scale.graph_parents):
+      # Since expand_dims doesn't preserve constant-ness, we obtain the
+      # non-dynamic value if possible.
+      loc = ops.convert_to_tensor(loc, name="loc") if loc is not None else loc
+      batch_shape, event_shape = distribution_util.shapes_from_loc_and_scale(
+          loc, scale)
+
+      super(VectorExponentialLinearOperator, self).__init__(
+          distribution=exponential.Exponential(rate=array_ops.ones(
+              [], dtype=scale.dtype), allow_nan_stats=allow_nan_stats),
+          bijector=bijectors.AffineLinearOperator(
+              shift=loc, scale=scale, validate_args=validate_args),
+          batch_shape=batch_shape,
+          event_shape=event_shape,
+          validate_args=validate_args,
+          name=name)
+      self._parameters = parameters
+
+  @property
+  def loc(self):
+    """The `loc` `Tensor` in `Y = scale @ X + loc`."""
+    return self.bijector.shift
+
+  @property
+  def scale(self):
+    """The `scale` `LinearOperator` in `Y = scale @ X + loc`."""
+    return self.bijector.scale
+
+  @distribution_util.AppendDocstring(_mvn_sample_note)
+  def _log_prob(self, x):
+    return super(VectorExponentialLinearOperator, self)._log_prob(x)
+
+  @distribution_util.AppendDocstring(_mvn_sample_note)
+  def _prob(self, x):
+    return super(VectorExponentialLinearOperator, self)._prob(x)
+
+  def _mean(self):
+    # Let
+    #   W = (w1,...,wk), with wj ~ iid Exponential(0, 1).
+    # Then this distribution is
+    #   X = loc + LW,
+    # and then E[X] = loc + L1, where 1 is the vector of ones.
+    scale_x_ones = self.bijector.scale.matvec(
+        array_ops.ones(self._mode_mean_shape(), self.dtype))
+
+    if self.loc is None:
+      return scale_x_ones
+
+    return array_ops.identity(self.loc) + scale_x_ones
+
+  def _covariance(self):
+    # Let
+    #   W = (w1,...,wk), with wj ~ iid Exponential(0, 1).
+    # Then this distribution is
+    #   X = loc + LW,
+    # and then since Cov(wi, wj) = 1 if i=j, and 0 otherwise,
+    #   Cov(X) = L Cov(W W^T) L^T = L L^T.
+    if distribution_util.is_diagonal_scale(self.scale):
+      return array_ops.matrix_diag(math_ops.square(self.scale.diag_part()))
+    else:
+      return self.scale.matmul(self.scale.to_dense(), adjoint_arg=True)
+
+  def _variance(self):
+    if distribution_util.is_diagonal_scale(self.scale):
+      return math_ops.square(self.scale.diag_part())
+    elif (isinstance(self.scale, linalg.LinearOperatorUDVHUpdate) and
+          self.scale.is_self_adjoint):
+      return array_ops.matrix_diag_part(
+          self.scale.matmul(self.scale.to_dense()))
+    else:
+      return array_ops.matrix_diag_part(
+          self.scale.matmul(self.scale.to_dense(), adjoint_arg=True))
+
+  def _stddev(self):
+    if distribution_util.is_diagonal_scale(self.scale):
+      return math_ops.abs(self.scale.diag_part())
+    elif (isinstance(self.scale, linalg.LinearOperatorUDVHUpdate) and
+          self.scale.is_self_adjoint):
+      return math_ops.sqrt(
+          array_ops.matrix_diag_part(self.scale.matmul(self.scale.to_dense())))
+    else:
+      return math_ops.sqrt(
+          array_ops.matrix_diag_part(
+              self.scale.matmul(self.scale.to_dense(), adjoint_arg=True)))
+
+  def _mode(self):
+    scale_x_zeros = self.bijector.scale.matvec(
+        array_ops.zeros(self._mode_mean_shape(), self.dtype))
+
+    if self.loc is None:
+      return scale_x_zeros
+
+    return array_ops.identity(self.loc) + scale_x_zeros
+
+  def _mode_mean_shape(self):
+    """Shape for the mode/mean Tensors."""
+    shape = self.batch_shape.concatenate(self.event_shape)
+    has_static_shape = shape.is_fully_defined()
+    if not has_static_shape:
+      shape = array_ops.concat([
+          self.batch_shape_tensor(),
+          self.event_shape_tensor(),
+      ], 0)
+    return shape
diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
index fd2c46d94de9c031768be1410990b180b30497d2..fdee57695e4e598929396ee4c9fe9f8014ea0f8b 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Vectorized Laplace distribution class, directly using LinearOpeartor."""
+"""Vectorized Laplace distribution class, directly using LinearOperator."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/distributions/python/ops/vector_student_t.py b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
index ae804b61727b820b2af3c32f05818324bfbccf93..29d41ab81c62d621c3c3533e1449341e9a085645 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_student_t.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
@@ -19,87 +19,13 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops import bijectors
+from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.distributions import student_t
 from tensorflow.python.ops.distributions import transformed_distribution
-from tensorflow.python.ops.distributions import util as distribution_util
-
-
-# TODO(jvdillon): Add unittests for this once we know where will put this code
-# and how it will generally be used. In the interim this code is tested via the
-# _VectorStudentT tests.
-def _infer_shapes(scale_oppd, shift):
-  """Helper which returns batch_shape, event_shape from `Affine` properties.
-
-  The `Affine` `Bijector` (roughly) computes `Y = scale @ X.T + shift`. This
-  function infers the `batch_shape` and `event_shape` from the `scale` and
-  `shift` terms.
-
-  Args:
-    scale_oppd: Instance of OperatorPDBase subclass representing the `Affine`
-      `Bijector` scale matrix.
-    shift: `Tensor` representing the `shift` vector.
-
-  Returns:
-    batch_shape: 1D, integer `Tensor` representing the shape of batch
-      dimensions.
-    event_shape: 1D, integer `Tensor` representing the shape of event
-      dimensions.
-
-  Raises:
-    ValueError: if we are not able to infer batch/event shapes from the args.
-  """
-  # Collect known static shape.
-  def _has_static_ndims(x):
-    return x is not None and x.get_shape().ndims is not None
-  if _has_static_ndims(scale_oppd) and _has_static_ndims(shift):
-    batch_shape = scale_oppd.get_batch_shape().merge_with(
-        shift.get_shape()[:-1])
-    event_shape = scale_oppd.get_shape()[-1:].merge_with(
-        shift.get_shape()[-1:])
-  elif _has_static_ndims(scale_oppd):
-    batch_shape = scale_oppd.get_batch_shape()
-    event_shape = scale_oppd.get_shape()[-1:]
-  elif _has_static_ndims(shift):
-    batch_shape = shift.get_shape()[:-1]
-    event_shape = shift.get_shape()[-1:]
-  else:
-    batch_shape = tensor_shape.TensorShape(None)
-    event_shape = tensor_shape.TensorShape(None)
-
-  # Convert TensorShape to Tensors and see if we're done.
-  if batch_shape.is_fully_defined():
-    batch_shape = constant_op.constant(batch_shape.as_list(),
-                                       dtype=dtypes.int32)
-  else:
-    batch_shape = None
-  if event_shape.is_fully_defined():
-    event_shape = constant_op.constant(event_shape.as_list(),
-                                       dtype=dtypes.int32)
-  else:
-    event_shape = None
-  if batch_shape is not None and event_shape is not None:
-    return batch_shape, event_shape
-
-  # Collect known dynamic shape.
-  if scale_oppd is not None:
-    shape = scale_oppd.shape()
-  elif shift is not None:
-    shape = array_ops.shape(shift)
-  else:
-    raise ValueError("unable to infer batch_shape, event_shape")
-
-  # Fill in what we don't know.
-  if batch_shape is None:
-    batch_shape = array_ops.identity(shape[:-1], name="batch_shape")
-  if event_shape is None:
-    event_shape = array_ops.identity(shape[-1:], name="event_shape")
-
-  return batch_shape, event_shape
 
 
 class _VectorStudentT(transformed_distribution.TransformedDistribution):
@@ -160,7 +86,7 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution):
   #### Examples
 
   A single instance of a "Vector Student's t-distribution" is defined by a mean
-  vector of of length `k` and a scale matrix of shape `k x k`.
+  vector of length `k` and a scale matrix of shape `k x k`.
 
   Extra leading dimensions, if provided, allow for batches.
 
@@ -282,8 +208,9 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution):
             df=df,
             loc=array_ops.zeros([], dtype=affine.dtype),
             scale=array_ops.ones([], dtype=affine.dtype))
-        batch_shape, override_event_shape = _infer_shapes(
-            affine.scale, affine.shift)
+        batch_shape, override_event_shape = (
+            distribution_util.shapes_from_loc_and_scale(
+                affine.shift, affine.scale))
         override_batch_shape = distribution_util.pick_vector(
             distribution.is_scalar_batch(),
             batch_shape,
diff --git a/tensorflow/contrib/distributions/python/ops/wishart.py b/tensorflow/contrib/distributions/python/ops/wishart.py
index e162a796100ae877c92932c0a805787526eb7ce0..9d30ce67197ebdeefc69d9b9979fdad4797bb183 100644
--- a/tensorflow/contrib/distributions/python/ops/wishart.py
+++ b/tensorflow/contrib/distributions/python/ops/wishart.py
@@ -21,11 +21,13 @@ from __future__ import print_function
 import math
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import operator_pd_cholesky
-from tensorflow.contrib.distributions.python.ops import operator_pd_full
+from tensorflow.contrib import linalg
+from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.framework.python.framework import tensor_util as contrib_tensor_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -34,8 +36,6 @@ from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
-from tensorflow.python.ops.distributions import util as distribution_util
-
 
 __all__ = [
     "WishartCholesky",
@@ -43,11 +43,11 @@ __all__ = [
 ]
 
 
-class _WishartOperatorPD(distribution.Distribution):
+class _WishartLinearOperator(distribution.Distribution):
   """The matrix Wishart distribution on positive definite matrices.
 
   This distribution is defined by a scalar number of degrees of freedom `df` and
-  an instance of `OperatorPDBase`, which provides matrix-free access to a
+  an instance of `LinearOperator`, which provides matrix-free access to a
   symmetric positive definite operator, which defines the scale matrix.
 
   #### Mathematical Details
@@ -75,7 +75,7 @@ class _WishartOperatorPD(distribution.Distribution):
 
   def __init__(self,
                df,
-               scale_operator_pd,
+               scale_operator,
                cholesky_input_output_matrices=False,
                validate_args=False,
                allow_nan_stats=True,
@@ -85,7 +85,7 @@ class _WishartOperatorPD(distribution.Distribution):
     Args:
       df: `float` or `double` tensor, the degrees of freedom of the
         distribution(s). `df` must be greater than or equal to `k`.
-      scale_operator_pd: `float` or `double` instance of `OperatorPDBase`.
+      scale_operator: `float` or `double` instance of `LinearOperator`.
       cholesky_input_output_matrices: Python `bool`. Any function which whose
         input or output is a matrix assumes the input is Cholesky and returns a
         Cholesky factored matrix. Example `log_prob` input takes a Cholesky and
@@ -109,28 +109,32 @@ class _WishartOperatorPD(distribution.Distribution):
     """
     parameters = locals()
     self._cholesky_input_output_matrices = cholesky_input_output_matrices
-    with ops.name_scope(name):
-      with ops.name_scope("init", values=[df, scale_operator_pd]):
-        if not scale_operator_pd.dtype.is_floating:
+    with ops.name_scope(name) as ns:
+      with ops.name_scope("init", values=[df, scale_operator]):
+        if not scale_operator.dtype.is_floating:
           raise TypeError(
-              "scale_operator_pd.dtype=%s is not a floating-point type" %
-              scale_operator_pd.dtype)
-        self._scale_operator_pd = scale_operator_pd
+              "scale_operator.dtype=%s is not a floating-point type" %
+              scale_operator.dtype)
+        if not scale_operator.is_square:
+          print(scale_operator.to_dense().eval())
+          raise ValueError("scale_operator must be square.")
+
+        self._scale_operator = scale_operator
         self._df = ops.convert_to_tensor(
             df,
-            dtype=scale_operator_pd.dtype,
+            dtype=scale_operator.dtype,
             name="df")
-        check_ops.assert_same_float_dtype(
-            (self._df, self._scale_operator_pd))
-        if (self._scale_operator_pd.get_shape().ndims is None or
-            self._scale_operator_pd.get_shape()[-1].value is None):
+        contrib_tensor_util.assert_same_float_dtype(
+            (self._df, self._scale_operator))
+        if (self._scale_operator.shape.ndims is None or
+            self._scale_operator.shape[-1].value is None):
           self._dimension = math_ops.cast(
-              self._scale_operator_pd.vector_space_dimension(),
-              dtype=self._scale_operator_pd.dtype, name="dimension")
+              self._scale_operator.domain_dimension_tensor(),
+              dtype=self._scale_operator.dtype, name="dimension")
         else:
           self._dimension = ops.convert_to_tensor(
-              self._scale_operator_pd.get_shape()[-1].value,
-              dtype=self._scale_operator_pd.dtype, name="dimension")
+              self._scale_operator.shape[-1].value,
+              dtype=self._scale_operator.dtype, name="dimension")
         df_val = tensor_util.constant_value(self._df)
         dim_val = tensor_util.constant_value(self._dimension)
         if df_val is not None and dim_val is not None:
@@ -151,32 +155,36 @@ class _WishartOperatorPD(distribution.Distribution):
                        (self._dimension, self._df)))
           self._df = control_flow_ops.with_dependencies(
               [assertions], self._df)
-    super(_WishartOperatorPD, self).__init__(
-        dtype=self._scale_operator_pd.dtype,
+    super(_WishartLinearOperator, self).__init__(
+        dtype=self._scale_operator.dtype,
         validate_args=validate_args,
         allow_nan_stats=allow_nan_stats,
         reparameterization_type=distribution.FULLY_REPARAMETERIZED,
         parameters=parameters,
         graph_parents=([self._df, self._dimension] +
-                       self._scale_operator_pd.inputs),
-        name=name)
+                       self._scale_operator.graph_parents),
+        name=ns)
 
   @property
   def df(self):
     """Wishart distribution degree(s) of freedom."""
     return self._df
 
+  def _square_scale_operator(self):
+    return self.scale_operator.matmul(
+        self.scale_operator.to_dense(), adjoint_arg=True)
+
   def scale(self):
     """Wishart distribution scale matrix."""
     if self._cholesky_input_output_matrices:
-      return self.scale_operator_pd.sqrt_to_dense()
+      return self.scale_operator.to_dense()
     else:
-      return self.scale_operator_pd.to_dense()
+      return self._square_scale_operator()
 
   @property
-  def scale_operator_pd(self):
-    """Wishart distribution scale matrix as an OperatorPD."""
-    return self._scale_operator_pd
+  def scale_operator(self):
+    """Wishart distribution scale matrix as an Linear Operator."""
+    return self._scale_operator
 
   @property
   def cholesky_input_output_matrices(self):
@@ -189,18 +197,18 @@ class _WishartOperatorPD(distribution.Distribution):
     return self._dimension
 
   def _event_shape_tensor(self):
-    s = self.scale_operator_pd.shape()
-    return array_ops.strided_slice(s, array_ops.shape(s) - 2,
-                                   array_ops.shape(s))
+    dimension = self.scale_operator.domain_dimension_tensor()
+    return array_ops.stack([dimension, dimension])
 
   def _event_shape(self):
-    return self.scale_operator_pd.get_shape()[-2:]
+    dimension = self.scale_operator.domain_dimension
+    return tensor_shape.TensorShape([dimension, dimension])
 
   def _batch_shape_tensor(self):
-    return self.scale_operator_pd.batch_shape()
+    return self.scale_operator.batch_shape_tensor()
 
   def _batch_shape(self):
-    return self.scale_operator_pd.get_batch_shape()
+    return self.scale_operator.batch_shape
 
   def _sample_n(self, n, seed):
     batch_shape = self.batch_shape_tensor()
@@ -242,10 +250,10 @@ class _WishartOperatorPD(distribution.Distribution):
     x = array_ops.reshape(x, shape)
 
     # Complexity: O(nbM) where M is the complexity of the operator solving a
-    # vector system. E.g., for OperatorPDDiag, each matmul is O(k**2), so
-    # this complexity is O(nbk**2). For OperatorPDCholesky, each matmul is
+    # vector system. E.g., for LinearOperatorDiag, each matmul is O(k**2), so
+    # this complexity is O(nbk**2). For LinearOperatorTriL, each matmul is
     # O(k^3) so this step has complexity O(nbk^3).
-    x = self.scale_operator_pd.sqrt_matmul(x)
+    x = self.scale_operator.matmul(x)
 
     # Undo make batch-op ready.
     # Complexity: O(nbk**2)
@@ -298,10 +306,10 @@ class _WishartOperatorPD(distribution.Distribution):
     scale_sqrt_inv_x_sqrt = array_ops.reshape(scale_sqrt_inv_x_sqrt, shape)
 
     # Complexity: O(nbM*k) where M is the complexity of the operator solving
-    # a vector system. E.g., for OperatorPDDiag, each solve is O(k), so
-    # this complexity is O(nbk**2). For OperatorPDCholesky, each solve is
+    # a vector system. E.g., for LinearOperatorDiag, each solve is O(k), so
+    # this complexity is O(nbk**2). For LinearOperatorTriL, each solve is
     # O(k**2) so this step has complexity O(nbk^3).
-    scale_sqrt_inv_x_sqrt = self.scale_operator_pd.sqrt_solve(
+    scale_sqrt_inv_x_sqrt = self.scale_operator.solve(
         scale_sqrt_inv_x_sqrt)
 
     # Undo make batch-op ready.
@@ -353,18 +361,18 @@ class _WishartOperatorPD(distribution.Distribution):
     half_dp1 = 0.5 * self.dimension + 0.5
     half_df = 0.5 * self.df
     return (self.dimension * (half_df + half_dp1 * math.log(2.)) +
-            half_dp1 * self.scale_operator_pd.log_det() +
+            2 * half_dp1 * self.scale_operator.log_abs_determinant() +
             self._multi_lgamma(half_df, self.dimension) +
             (half_dp1 - half_df) * self._multi_digamma(half_df, self.dimension))
 
   def _mean(self):
     if self.cholesky_input_output_matrices:
       return (math_ops.sqrt(self.df)
-              * self.scale_operator_pd.sqrt_to_dense())
-    return self.df * self.scale_operator_pd.to_dense()
+              * self.scale_operator.to_dense())
+    return self.df * self._square_scale_operator()
 
   def _variance(self):
-    x = math_ops.sqrt(self.df) * self.scale_operator_pd.to_dense()
+    x = math_ops.sqrt(self.df) * self._square_scale_operator()
     d = array_ops.expand_dims(array_ops.matrix_diag_part(x), -1)
     v = math_ops.square(x) + math_ops.matmul(d, d, adjoint_b=True)
     if self.cholesky_input_output_matrices:
@@ -385,20 +393,20 @@ class _WishartOperatorPD(distribution.Distribution):
         constant_op.constant(float("NaN"), dtype=self.dtype, name="nan"),
         s)
     if self.cholesky_input_output_matrices:
-      return math_ops.sqrt(s) * self.scale_operator_pd.sqrt_to_dense()
-    return s * self.scale_operator_pd.to_dense()
+      return math_ops.sqrt(s) * self.scale_operator.to_dense()
+    return s * self._square_scale_operator()
 
   def mean_log_det(self, name="mean_log_det"):
     """Computes E[log(det(X))] under this Wishart distribution."""
     with self._name_scope(name):
       return (self._multi_digamma(0.5 * self.df, self.dimension) +
               self.dimension * math.log(2.) +
-              self.scale_operator_pd.log_det())
+              2 * self.scale_operator.log_abs_determinant())
 
   def log_normalization(self, name="log_normalization"):
     """Computes the log normalizing constant, log(Z)."""
     with self._name_scope(name):
-      return (self.df * self.scale_operator_pd.sqrt_log_det() +
+      return (self.df * self.scale_operator.log_abs_determinant() +
               0.5 * self.df * self.dimension * math.log(2.) +
               self._multi_lgamma(0.5 * self.df, self.dimension))
 
@@ -428,7 +436,7 @@ class _WishartOperatorPD(distribution.Distribution):
                                  axis=[-1])
 
 
-class WishartCholesky(_WishartOperatorPD):
+class WishartCholesky(_WishartLinearOperator):
   """The matrix Wishart distribution on positive definite matrices.
 
   This distribution is defined by a scalar degrees of freedom `df` and a
@@ -521,10 +529,26 @@ class WishartCholesky(_WishartOperatorPD):
     """
     parameters = locals()
     with ops.name_scope(name, values=[scale]):
+      with ops.name_scope("init", values=[scale]):
+        scale = ops.convert_to_tensor(scale)
+        if validate_args:
+          scale = control_flow_ops.with_dependencies([
+              check_ops.assert_positive(
+                  array_ops.matrix_diag_part(scale),
+                  message="scale must be positive definite"),
+              check_ops.assert_equal(
+                  array_ops.shape(scale)[-1],
+                  array_ops.shape(scale)[-2],
+                  message="scale must be square")
+          ] if validate_args else [], scale)
+
       super(WishartCholesky, self).__init__(
           df=df,
-          scale_operator_pd=operator_pd_cholesky.OperatorPDCholesky(
-              scale, verify_pd=validate_args),
+          scale_operator=linalg.LinearOperatorTriL(
+              tril=scale,
+              is_non_singular=True,
+              is_positive_definite=True,
+              is_square=True),
           cholesky_input_output_matrices=cholesky_input_output_matrices,
           validate_args=validate_args,
           allow_nan_stats=allow_nan_stats,
@@ -532,7 +556,7 @@ class WishartCholesky(_WishartOperatorPD):
     self._parameters = parameters
 
 
-class WishartFull(_WishartOperatorPD):
+class WishartFull(_WishartLinearOperator):
   """The matrix Wishart distribution on positive definite matrices.
 
   This distribution is defined by a scalar degrees of freedom `df` and a
@@ -620,13 +644,24 @@ class WishartFull(_WishartOperatorPD):
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[scale]) as ns:
-      super(WishartFull, self).__init__(
-          df=df,
-          scale_operator_pd=operator_pd_full.OperatorPDFull(
-              scale, verify_pd=validate_args),
-          cholesky_input_output_matrices=cholesky_input_output_matrices,
-          validate_args=validate_args,
-          allow_nan_stats=allow_nan_stats,
-          name=ns)
+    with ops.name_scope(name) as ns:
+      with ops.name_scope("init", values=[scale]):
+        scale = ops.convert_to_tensor(scale)
+        if validate_args:
+          scale = distribution_util.assert_symmetric(scale)
+        chol = linalg_ops.cholesky(scale)
+        chol = control_flow_ops.with_dependencies([
+            check_ops.assert_positive(array_ops.matrix_diag_part(chol))
+        ] if validate_args else [], chol)
+    super(WishartFull, self).__init__(
+        df=df,
+        scale_operator=linalg.LinearOperatorTriL(
+            tril=chol,
+            is_non_singular=True,
+            is_positive_definite=True,
+            is_square=True),
+        cholesky_input_output_matrices=cholesky_input_output_matrices,
+        validate_args=validate_args,
+        allow_nan_stats=allow_nan_stats,
+        name=ns)
     self._parameters = parameters
diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index 0b4dc5667f368536c95f32fcdafb0d5d722df461..8b0f63f29ebeace74704d0c0ac8054cc95d024e1 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -55,6 +55,7 @@ tf_custom_op_py_library(
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
+        "//tensorflow/python:util",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
     ],
@@ -171,10 +172,11 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:embedding_ops",
-        "//tensorflow/python:framework",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -214,7 +216,11 @@ tf_py_test(
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variables",
     ],
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "noasan",  # times out b/63678675
+        "nomsan",
+    ],
 )
 
 # Kernel tests
diff --git a/tensorflow/contrib/factorization/__init__.py b/tensorflow/contrib/factorization/__init__.py
index f0ca879259013fa19420dd9e7adf21e1bf5376f0..486c2ea9336d19fb7273d02502f9865adc6aefed 100644
--- a/tensorflow/contrib/factorization/__init__.py
+++ b/tensorflow/contrib/factorization/__init__.py
@@ -25,3 +25,20 @@ from tensorflow.contrib.factorization.python.ops.gmm import *
 from tensorflow.contrib.factorization.python.ops.gmm_ops import *
 from tensorflow.contrib.factorization.python.ops.wals import *
 # pylint: enable=wildcard-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'KMeans',
+    'COSINE_DISTANCE',
+    'KMEANS_PLUS_PLUS_INIT',
+    'RANDOM_INIT',
+    'SQUARED_EUCLIDEAN_DISTANCE',
+    'WALSModel',
+    'GMM',
+    'gmm',
+    'GmmAlgorithm',
+    'WALSMatrixFactorization',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/factorization/kernels/BUILD b/tensorflow/contrib/factorization/kernels/BUILD
index 1884ef8c93a9c19a90381acb25837ad1790169bc..50deb08616a4ac8b8b3d46f1ccbea83a53b7f899 100644
--- a/tensorflow/contrib/factorization/kernels/BUILD
+++ b/tensorflow/contrib/factorization/kernels/BUILD
@@ -12,7 +12,7 @@ cc_library(
         ":clustering_ops",
         ":masked_matmul_ops",
         ":wals_solver_ops",
-        "@protobuf//:protobuf_headers",
+        "@protobuf_archive//:protobuf_headers",
     ],
 )
 
@@ -22,7 +22,7 @@ cc_library(
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//third_party/eigen3",
-        "@protobuf//:protobuf_headers",
+        "@protobuf_archive//:protobuf_headers",
     ],
     alwayslink = 1,
 )
@@ -33,7 +33,7 @@ cc_library(
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//third_party/eigen3",
-        "@protobuf//:protobuf_headers",
+        "@protobuf_archive//:protobuf_headers",
     ],
     alwayslink = 1,
 )
@@ -43,8 +43,9 @@ cc_library(
     srcs = ["masked_matmul_ops.cc"],
     deps = [
         "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core/kernels:bounds_check",
         "//third_party/eigen3",
-        "@protobuf//:protobuf_headers",
+        "@protobuf_archive//:protobuf_headers",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/contrib/factorization/kernels/wals_solver_ops.cc b/tensorflow/contrib/factorization/kernels/wals_solver_ops.cc
index cd1f23bba2725e5cf39227efb3604adb3043e4b2..bb9b835889b1b5e36d6f470b51834d4c6bb3d493 100644
--- a/tensorflow/contrib/factorization/kernels/wals_solver_ops.cc
+++ b/tensorflow/contrib/factorization/kernels/wals_solver_ops.cc
@@ -213,7 +213,7 @@ class WALSComputePartialLhsAndRhsOp : public OpKernel {
       CHECK_LE(shard.second, perm.size());
       CHECK_LE(shard.first, shard.second);
       const int64 input_index = get_input_index(perm[shard.first]);
-      // Acccumulate the rhs and lhs terms in the normal equations
+      // Accumulate the rhs and lhs terms in the normal equations
       // for the non-zero elements in the row or column of the sparse matrix
       // corresponding to input_index.
       int num_batched = 0;
diff --git a/tensorflow/contrib/factorization/python/ops/clustering_ops.py b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
index 2e9b5e22c73e02dce01fe6f62ad1de5fced88dd9..c9b3314d6b582cef7275b4833d5b7ccf86e96f26 100644
--- a/tensorflow/contrib/factorization/python/ops/clustering_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
@@ -93,15 +93,24 @@ class KMeans(object):
 
     Args:
       inputs: An input tensor or list of input tensors
-      num_clusters: number of clusters.
-      initial_clusters: Specifies the clusters used during initialization.  Can
-        be a tensor or numpy array, or a function that generates the clusters.
-        Can also be "random" to specify that clusters should be chosen randomly
-        from input data.
-      distance_metric: distance metric used for clustering.
+      num_clusters: An integer tensor specifying the number of clusters. This
+        argument is ignored if initial_clusters is a tensor or numpy array.
+      initial_clusters: Specifies the clusters used during initialization. One
+        of the following:
+        - a tensor or numpy array with the initial cluster centers.
+        - a function f(inputs, k) that returns up to k centers from `inputs`.
+        - "random": Choose centers randomly from `inputs`.
+        - "kmeans_plus_plus": Use kmeans++ to choose centers from `inputs`.
+        In the last three cases, one batch of `inputs` may not yield
+        `num_clusters` centers, in which case initialization will require
+        multiple batches until enough centers are chosen. In the case of
+        "random" or "kmeans_plus_plus", if the input size is <= `num_clusters`
+        then the entire batch is chosen to be cluster centers.
+      distance_metric: Distance metric used for clustering. Supported options:
+        "squared_euclidean", "cosine".
       use_mini_batch: If true, use the mini-batch k-means algorithm. Else assume
         full batch.
-      mini_batch_steps_per_iteration: number of steps after which the updated
+      mini_batch_steps_per_iteration: Number of steps after which the updated
         cluster centers are synced back to a master copy.
       random_seed: Seed for PRNG used to initialize seeds.
       kmeans_plus_plus_num_retries: For each point that is sampled during
@@ -109,14 +118,21 @@ class KMeans(object):
         additional points to draw from the current distribution before selecting
         the best. If a negative value is specified, a heuristic is used to
         sample O(log(num_to_sample)) additional points.
+
+    Raises:
+      ValueError: An invalid argument was passed to initial_clusters or
+        distance_metric.
     """
+    if isinstance(initial_clusters, str) and initial_clusters not in [
+        RANDOM_INIT, KMEANS_PLUS_PLUS_INIT
+    ]:
+      raise ValueError(
+          "Unsupported initialization algorithm '%s'" % initial_clusters)
+    if distance_metric not in [SQUARED_EUCLIDEAN_DISTANCE, COSINE_DISTANCE]:
+      raise ValueError("Unsupported distance metric '%s'" % distance_metric)
     self._inputs = inputs if isinstance(inputs, list) else [inputs]
-    assert num_clusters > 0, num_clusters
     self._num_clusters = num_clusters
-    if initial_clusters is None:
-      initial_clusters = RANDOM_INIT
     self._initial_clusters = initial_clusters
-    assert distance_metric in [SQUARED_EUCLIDEAN_DISTANCE, COSINE_DISTANCE]
     self._distance_metric = distance_metric
     self._use_mini_batch = use_mini_batch
     self._mini_batch_steps_per_iteration = int(mini_batch_steps_per_iteration)
@@ -144,8 +160,7 @@ class KMeans(object):
       return cls._compute_cosine_distance(
           inputs, clusters, inputs_normalized=True)
     else:
-      assert False, ('Unsupported distance metric passed to Kmeans %s' %
-                     str(distance_metric))
+      assert False, str(distance_metric)
 
   @classmethod
   def _compute_euclidean_distance(cls, inputs, clusters):
@@ -238,90 +253,29 @@ class KMeans(object):
                        array_ops.squeeze(indices)))
     return zip(*output)
 
-  def _init_clusters_random(self):
-    """Does random initialization of clusters.
-
-    Returns:
-      Tensor of randomly initialized clusters.
-    """
-    num_data = math_ops.add_n([array_ops.shape(inp)[0] for inp in self._inputs])
-    # Note that for mini-batch k-means, we should ensure that the batch size of
-    # data used during initialization is sufficiently large to avoid duplicated
-    # clusters.
-    with ops.control_dependencies(
-        [check_ops.assert_less_equal(self._num_clusters, num_data)]):
-      indices = random_ops.random_uniform(
-          array_ops.reshape(self._num_clusters, [-1]),
-          minval=0,
-          maxval=math_ops.cast(num_data, dtypes.int64),
-          seed=self._random_seed,
-          dtype=dtypes.int64)
-      clusters_init = embedding_lookup(
-          self._inputs, indices, partition_strategy='div')
-      return clusters_init
-
   def _clusters_l2_normalized(self):
     """Returns True if clusters centers are kept normalized."""
     return (self._distance_metric == COSINE_DISTANCE and
             (not self._use_mini_batch or
              self._mini_batch_steps_per_iteration > 1))
 
-  def _initialize_clusters(self, cluster_centers, cluster_centers_initialized,
-                           cluster_centers_updated):
-    """Returns an op to initialize the cluster centers."""
-
-    init = self._initial_clusters
-    if init == RANDOM_INIT:
-      clusters_init = self._init_clusters_random()
-    elif init == KMEANS_PLUS_PLUS_INIT:
-      # Points from only the first shard are used for initializing centers.
-      # TODO(ands): Use all points.
-      inp = self._inputs[0]
-      if self._distance_metric == COSINE_DISTANCE:
-        inp = nn_impl.l2_normalize(inp, dim=1)
-      clusters_init = gen_clustering_ops.kmeans_plus_plus_initialization(
-          inp, self._num_clusters, self._random_seed,
-          self._kmeans_plus_plus_num_retries)
-    elif callable(init):
-      clusters_init = init(self._inputs, self._num_clusters)
-    elif not isinstance(init, str):
-      clusters_init = init
-    else:
-      assert False, 'Unsupported init passed to Kmeans %s' % str(init)
-    if self._distance_metric == COSINE_DISTANCE and clusters_init is not None:
-      clusters_init = nn_impl.l2_normalize(clusters_init, dim=1)
-
-    with ops.colocate_with(cluster_centers_initialized):
-      initialized = control_flow_ops.with_dependencies(
-          [clusters_init], array_ops.identity(cluster_centers_initialized))
-    with ops.colocate_with(cluster_centers):
-      assign_centers = state_ops.assign(
-          cluster_centers, clusters_init, validate_shape=False)
-      if cluster_centers_updated != cluster_centers:
-        assign_centers = control_flow_ops.group(assign_centers,
-                                                state_ops.assign(
-                                                    cluster_centers_updated,
-                                                    clusters_init,
-                                                    validate_shape=False))
-    assign_centers = control_flow_ops.with_dependencies(
-        [assign_centers], state_ops.assign(cluster_centers_initialized, True))
-    return control_flow_ops.cond(initialized, control_flow_ops.no_op,
-                                 lambda: assign_centers).op
-
-  def _create_variables(self):
+  def _create_variables(self, num_clusters):
     """Creates variables.
 
+    Args:
+      num_clusters: an integer Tensor providing the number of clusters.
+
     Returns:
-    Tuple with following elements:
-      cluster_centers: a Tensor for storing cluster centers
-      cluster_centers_initialized: bool Variable indicating whether clusters
-        are initialized.
-      cluster_counts: a Tensor for storing counts of points assigned to this
-        cluster. This is used by mini-batch training.
-      cluster_centers_updated: Tensor representing copy of cluster centers that
-        are updated every step.
-      update_in_steps: numbers of steps left before we sync
-        cluster_centers_updated back to cluster_centers.
+      Tuple with following elements:
+      - cluster_centers: a Tensor for storing cluster centers
+      - cluster_centers_initialized: bool Variable indicating whether clusters
+            are initialized.
+      - cluster_counts: a Tensor for storing counts of points assigned to this
+            cluster. This is used by mini-batch training.
+      - cluster_centers_updated: Tensor representing copy of cluster centers
+            that are updated every step.
+      - update_in_steps: numbers of steps left before we sync
+            cluster_centers_updated back to cluster_centers.
     """
     init_value = array_ops.constant([], dtype=dtypes.float32)
     cluster_centers = variable_scope.variable(
@@ -341,12 +295,12 @@ class KMeans(object):
           name='update_in_steps')
       # Count of points assigned to cluster_centers_updated.
       cluster_counts = variable_scope.variable(
-          array_ops.zeros([self._num_clusters], dtype=dtypes.int64))
+          array_ops.zeros([num_clusters], dtype=dtypes.int64))
     else:
       cluster_centers_updated = cluster_centers
       update_in_steps = None
       cluster_counts = (variable_scope.variable(
-          array_ops.ones([self._num_clusters], dtype=dtypes.int64))
+          array_ops.ones([num_clusters], dtype=dtypes.int64))
                         if self._use_mini_batch else None)
     return (cluster_centers, cluster_centers_initialized, cluster_counts,
             cluster_centers_updated, update_in_steps)
@@ -363,6 +317,14 @@ class KMeans(object):
   def training_graph(self):
     """Generate a training graph for kmeans algorithm.
 
+    This returns, among other things, an op that chooses initial centers
+    (init_op), a boolean variable that is set to True when the initial centers
+    are chosen (cluster_centers_initialized), and an op to perform either an
+    entire Lloyd iteration or a mini-batch of a Lloyd iteration (training_op).
+    The caller should use these components as follows. A single worker should
+    execute init_op multiple times until cluster_centers_initialized becomes
+    True. Then multiple workers may execute training_op any number of times.
+
     Returns:
       A tuple consisting of:
       all_scores: A matrix (or list of matrices) of dimensions (num_input,
@@ -379,12 +341,23 @@ class KMeans(object):
       training_op: an op that runs an iteration of training.
     """
     # Implementation of kmeans.
+    if (isinstance(self._initial_clusters, str) or
+        callable(self._initial_clusters)):
+      initial_clusters = self._initial_clusters
+      num_clusters = ops.convert_to_tensor(self._num_clusters)
+    else:
+      initial_clusters = ops.convert_to_tensor(self._initial_clusters)
+      num_clusters = array_ops.shape(initial_clusters)[0]
+
     inputs = self._inputs
     (cluster_centers_var, cluster_centers_initialized, total_counts,
-     cluster_centers_updated, update_in_steps) = self._create_variables()
-    init_op = self._initialize_clusters(cluster_centers_var,
-                                        cluster_centers_initialized,
-                                        cluster_centers_updated)
+     cluster_centers_updated,
+     update_in_steps) = self._create_variables(num_clusters)
+    init_op = _InitializeClustersOpFactory(
+        self._inputs, num_clusters, initial_clusters, self._distance_metric,
+        self._random_seed, self._kmeans_plus_plus_num_retries,
+        cluster_centers_var, cluster_centers_updated,
+        cluster_centers_initialized).op()
     cluster_centers = cluster_centers_var
 
     if self._distance_metric == COSINE_DISTANCE:
@@ -403,8 +376,8 @@ class KMeans(object):
             inputs, cluster_idx, cluster_centers_updated, total_counts)
     else:
       assert cluster_centers == cluster_centers_var
-      training_op = self._full_batch_training_op(inputs, cluster_idx,
-                                                 cluster_centers_var)
+      training_op = self._full_batch_training_op(
+          inputs, num_clusters, cluster_idx, cluster_centers_var)
 
     return (all_scores, cluster_idx, scores, cluster_centers_initialized,
             init_op, training_op)
@@ -516,11 +489,13 @@ class KMeans(object):
       update_ops.extend([update_counts, update_cluster_centers])
     return control_flow_ops.group(*update_ops)
 
-  def _full_batch_training_op(self, inputs, cluster_idx_list, cluster_centers):
+  def _full_batch_training_op(self, inputs, num_clusters, cluster_idx_list,
+                              cluster_centers):
     """Creates an op for training for full batch case.
 
     Args:
       inputs: list of input Tensors.
+      num_clusters: an integer Tensor providing the number of clusters.
       cluster_idx_list: A vector (or list of vectors). Each element in the
         vector corresponds to an input row in 'inp' and specifies the cluster id
         corresponding to the input.
@@ -535,16 +510,161 @@ class KMeans(object):
     for inp, cluster_idx in zip(inputs, cluster_idx_list):
       with ops.colocate_with(inp, ignore_existing=True):
         cluster_sums.append(
-            math_ops.unsorted_segment_sum(inp, cluster_idx, self._num_clusters))
+            math_ops.unsorted_segment_sum(inp, cluster_idx, num_clusters))
         cluster_counts.append(
             math_ops.unsorted_segment_sum(
                 array_ops.reshape(
                     array_ops.ones(
                         array_ops.reshape(array_ops.shape(inp)[0], [-1])),
-                    [-1, 1]), cluster_idx, self._num_clusters))
+                    [-1, 1]), cluster_idx, num_clusters))
     with ops.colocate_with(cluster_centers, ignore_existing=True):
       new_clusters_centers = math_ops.add_n(cluster_sums) / (math_ops.cast(
           math_ops.add_n(cluster_counts), cluster_sums[0].dtype) + epsilon)
       if self._clusters_l2_normalized():
         new_clusters_centers = nn_impl.l2_normalize(new_clusters_centers, dim=1)
     return state_ops.assign(cluster_centers, new_clusters_centers)
+
+
+class _InitializeClustersOpFactory(object):
+  """Internal class to create the op to initialize the clusters.
+
+    The op performs this algorithm (see constructor args):
+
+    num_remaining = num_clusters - length(cluster_centers)
+    if num_remaining == 0:
+      assert that cluster_centers_initialized is true
+    else:
+      assert that num_remaining > 0
+      new_centers = choose up to num_remaining initial centers
+      l2-normalize new_centers if using cosine distance
+      all_centers = concat(cluster_centers, new_centers)
+      cluster_centers := all_centers
+      if there is a cluster_centers_updated variable:
+        cluster_centers_updated := cluster_centers
+      num_now_remaining = num_clusters - length(cluster_centers)
+      if num_now_remaining == 0:
+        cluster_centers_initialized := true
+  """
+
+  def __init__(self, inputs, num_clusters, initial_clusters, distance_metric,
+               random_seed, kmeans_plus_plus_num_retries, cluster_centers,
+               cluster_centers_updated, cluster_centers_initialized):
+    """Creates an op factory.
+
+    Args:
+      inputs: See KMeans constructor.
+      num_clusters: An integer Tensor providing the number of clusters.
+      initial_clusters: See KMeans constructor.
+      distance_metric: See KMeans constructor.
+      random_seed: See KMeans constructor.
+      kmeans_plus_plus_num_retries: See KMeans constructor.
+      cluster_centers: The TF variable holding the initial centers. It may
+          already contain some centers when the op is executed.
+      cluster_centers_updated: A second TF variable to hold a copy of the
+          initial centers, used for full-batch mode. In mini-batch mode,
+          cluster_centers_updated is the same variable as cluster_centers.
+      cluster_centers_initialized: A boolean TF variable that will be set
+          to true when all the initial centers have been chosen.
+    """
+    # All of these instance variables are constants.
+    self._inputs = inputs
+    self._num_clusters = num_clusters
+    self._initial_clusters = initial_clusters
+    self._distance_metric = distance_metric
+    self._random_seed = random_seed
+    self._kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries
+    self._cluster_centers = cluster_centers
+    self._cluster_centers_updated = cluster_centers_updated
+    self._cluster_centers_initialized = cluster_centers_initialized
+
+    self._num_selected = array_ops.shape(self._cluster_centers)[0]
+    self._num_remaining = self._num_clusters - self._num_selected
+    self._num_data = math_ops.add_n(
+        [array_ops.shape(i)[0] for i in self._inputs])
+
+  def _random(self):
+    indices = random_ops.random_uniform(
+        array_ops.reshape(self._num_remaining, [-1]),
+        minval=0,
+        maxval=math_ops.cast(self._num_data, dtypes.int64),
+        seed=self._random_seed,
+        dtype=dtypes.int64)
+    return embedding_lookup(self._inputs, indices, partition_strategy='div')
+
+  def _kmeans_plus_plus(self):
+    # Points from only the first shard are used for initializing centers.
+    # TODO(ands): Use all points.
+    inp = self._inputs[0]
+    if self._distance_metric == COSINE_DISTANCE:
+      inp = nn_impl.l2_normalize(inp, dim=1)
+    return gen_clustering_ops.kmeans_plus_plus_initialization(
+        inp,
+        math_ops.to_int64(self._num_remaining), self._random_seed,
+        self._kmeans_plus_plus_num_retries)
+
+  def _greedy_batch_sampler(self, sampler):
+    # If the input dataset size is smaller than the number of centers
+    # remaining, choose the entire input dataset as centers. This can happen
+    # with mini-batch. Otherwise, sample the batch according to the provided
+    # sampler.
+    return control_flow_ops.cond(self._num_data <= self._num_remaining,
+                                 lambda: array_ops.concat(self._inputs, 0),
+                                 sampler)
+
+  def _single_batch_sampler(self, sampler):
+    # Enforce that there are at least as many data points as centers
+    # remaining. This gives the provided sampler the chance to select all
+    # remaining centers from a single batch.
+    with ops.control_dependencies(
+        [check_ops.assert_greater_equal(self._num_data, self._num_remaining)]):
+      return sampler()
+
+  def _choose_initial_centers(self):
+    if isinstance(self._initial_clusters, str):
+      if self._initial_clusters == RANDOM_INIT:
+        return self._greedy_batch_sampler(self._random)
+      else:  # self._initial_clusters == KMEANS_PLUS_PLUS_INIT
+        return self._single_batch_sampler(self._kmeans_plus_plus)
+    elif callable(self._initial_clusters):
+      return self._initial_clusters(self._inputs, self._num_remaining)
+    else:
+      with ops.control_dependencies([
+          check_ops.assert_equal(self._num_remaining,
+                                 array_ops.shape(self._initial_clusters)[0])
+      ]):
+        return self._initial_clusters
+
+  def _add_new_centers(self):
+    """Adds some centers and returns the number of centers remaining."""
+    new_centers = self._choose_initial_centers()
+    if self._distance_metric == COSINE_DISTANCE:
+      new_centers = nn_impl.l2_normalize(new_centers, dim=1)
+    # If cluster_centers is empty, it doesn't have the right shape for concat.
+    all_centers = control_flow_ops.cond(
+        math_ops.equal(self._num_selected, 0), lambda: new_centers,
+        lambda: array_ops.concat([self._cluster_centers, new_centers], 0))
+    # TODO(ccolby): De-dupe all_centers?
+    with ops.colocate_with(self._cluster_centers):
+      a = state_ops.assign(
+          self._cluster_centers, all_centers, validate_shape=False)
+      if self._cluster_centers_updated is not self._cluster_centers:
+        a = state_ops.assign(
+            self._cluster_centers_updated, a, validate_shape=False)
+      return self._num_clusters - array_ops.shape(a)[0]
+
+  def _initialize(self):
+    with ops.control_dependencies([
+        check_ops.assert_positive(self._num_remaining),
+    ]):
+      num_now_remaining = self._add_new_centers()
+      return control_flow_ops.cond(
+          math_ops.equal(num_now_remaining, 0),
+          lambda: state_ops.assign(self._cluster_centers_initialized, True),
+          control_flow_ops.no_op)
+
+  def op(self):
+    """Returns the cluster initializer op."""
+    return control_flow_ops.cond(
+        math_ops.equal(self._num_remaining, 0),
+        lambda: check_ops.assert_equal(self._cluster_centers_initialized, True),
+        self._initialize)
diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops.py b/tensorflow/contrib/factorization/python/ops/factorization_ops.py
index 000e2403a7009c06284c8e1079eb4c02bb7a7add..054888e734086c153f7af59f4548d4d20abab813 100644
--- a/tensorflow/contrib/factorization/python/ops/factorization_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/factorization_ops.py
@@ -23,7 +23,6 @@ import numbers
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
-
 from tensorflow.contrib.factorization.python.ops import gen_factorization_ops
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import constant_op
@@ -81,13 +80,16 @@ class WALSModel(object):
   a sparse term and a Gramian term, see wals.md.
   The loss is returned by the update_{col, row}_factors(sp_input), and is
   normalized as follows:
-  _, _, minibatch_loss = update_row_factors(sp_input)
+    _, _, unregularized_loss, regularization, sum_weights =
+        update_row_factors(sp_input)
   if sp_input contains the rows {A_i, i \in I}, and the input matrix A has n
-  total rows, then minibatch_loss is
+  total rows, then the minibatch loss = unregularized_loss + regularization is
    \\(
-   (\|\sqrt W \odot (A_I - U_I V^T)\|_F^2 + \lambda \|U_I\|_F^2) * n / |I| +
+   (\|\sqrt W_I \odot (A_I - U_I V^T)\|_F^2 + \lambda \|U_I\|_F^2) * n / |I| +
    \lambda \|V\|_F^2
    )\\
+  The sum_weights tensor contains the normalized sum of weights
+  sum(W_I) * n / |I|.
 
   A typical usage example (pseudocode):
 
@@ -119,11 +121,13 @@ class WALSModel(object):
       # Ops to upate row(column). This can either take the entire sparse tensor
       # or slices of sparse tensor. For distributed trainer, each trainer
       # handles just part of the matrix.
-      _, row_update_op, row_loss = model.update_row_factors(
+      _, row_update_op, unreg_row_loss, row_reg, _ = model.update_row_factors(
            sp_input=matrix_slices_from_queue_for_worker_shard)
-      _, col_update_op, col_loss = model.update_col_factors(
+      row_loss = unreg_row_loss + row_reg
+      _, col_update_op, unreg_col_loss, col_reg, _ = model.update_col_factors(
            sp_input=transposed_matrix_slices_from_queue_for_worker_shard,
            transpose_input=True)
+      col_loss = unreg_col_loss + col_reg
 
       ...
 
@@ -225,7 +229,13 @@ class WALSModel(object):
         a single number or vice versa.
       col_weights: See row_weights.
       use_factors_weights_cache: When True, the factors and weights will be
-        cached on the workers before the updates start. Defaults to True.
+        cached on the workers before the updates start. Defaults to True. Note
+        that the weights cache is initialized through `worker_init`, and the
+        row/col factors cache is initialized through
+        `initialize_{col/row}_update_op`. In the case where the weights are
+        computed outside and set before the training iterations start, it is
+        important to ensure the `worker_init` op is run afterwards for the
+        weights cache to take effect.
       use_gramian_cache: When True, the Gramians will be cached on the workers
         before the updates start. Defaults to True.
     """
@@ -240,28 +250,24 @@ class WALSModel(object):
         regularization * linalg_ops.eye(self._n_components)
         if regularization is not None else None)
     assert (row_weights is None) == (col_weights is None)
-    self._row_weights = WALSModel._create_weights(row_weights, self._input_rows,
-                                                  self._num_row_shards,
-                                                  "row_weights")
-    self._col_weights = WALSModel._create_weights(col_weights, self._input_cols,
-                                                  self._num_col_shards,
-                                                  "col_weights")
+    self._row_weights = WALSModel._create_weights(
+        row_weights, self._input_rows, self._num_row_shards, "row_weights")
+    self._col_weights = WALSModel._create_weights(
+        col_weights, self._input_cols, self._num_col_shards, "col_weights")
     self._use_factors_weights_cache = use_factors_weights_cache
     self._use_gramian_cache = use_gramian_cache
-    self._row_factors = self._create_factors(self._input_rows,
-                                             self._n_components,
-                                             self._num_row_shards, row_init,
-                                             "row_factors")
-    self._col_factors = self._create_factors(self._input_cols,
-                                             self._n_components,
-                                             self._num_col_shards, col_init,
-                                             "col_factors")
+    self._row_factors = self._create_factors(
+        self._input_rows, self._n_components, self._num_row_shards, row_init,
+        "row_factors")
+    self._col_factors = self._create_factors(
+        self._input_cols, self._n_components, self._num_col_shards, col_init,
+        "col_factors")
     self._row_gramian = self._create_gramian(self._n_components, "row_gramian")
     self._col_gramian = self._create_gramian(self._n_components, "col_gramian")
-    self._row_update_prep_gramian = self._prepare_gramian(self._col_factors,
-                                                          self._col_gramian)
-    self._col_update_prep_gramian = self._prepare_gramian(self._row_factors,
-                                                          self._row_gramian)
+    self._row_update_prep_gramian = self._prepare_gramian(
+        self._col_factors, self._col_gramian)
+    self._col_update_prep_gramian = self._prepare_gramian(
+        self._row_factors, self._row_gramian)
     self._create_transient_vars()
 
   @property
@@ -538,14 +544,12 @@ class WALSModel(object):
          "col_gramian_cache",
          pass_through=not self._use_gramian_cache)
 
-    self._row_updates_init = control_flow_ops.group(col_factors_cache_init,
-                                                    row_factors_cache_reset,
-                                                    col_gramian_cache_init,
-                                                    row_gramian_cache_reset)
-    self._col_updates_init = control_flow_ops.group(row_factors_cache_init,
-                                                    col_factors_cache_reset,
-                                                    row_gramian_cache_init,
-                                                    col_gramian_cache_reset)
+    self._row_updates_init = control_flow_ops.group(
+        col_factors_cache_init, row_factors_cache_reset, col_gramian_cache_init,
+        row_gramian_cache_reset)
+    self._col_updates_init = control_flow_ops.group(
+        row_factors_cache_init, col_factors_cache_reset, row_gramian_cache_init,
+        col_gramian_cache_reset)
 
     if self._row_wt_cache is not None:
       assert self._col_wt_cache is not None
@@ -556,7 +560,14 @@ class WALSModel(object):
 
   @property
   def worker_init(self):
-    """Op to initialize worker state once before starting any updates."""
+    """Op to initialize worker state once before starting any updates.
+
+    Note that specifically this initializes the cache of the row and column
+    weights on workers when `use_factors_weights_cache` is True. In this case,
+    if these weights are being calcualted and reset after the object is created,
+    it is important to ensure this ops is run afterwards so the cache reflects
+    the correct values.
+    """
     return self._worker_init
 
   @property
@@ -565,6 +576,9 @@ class WALSModel(object):
 
     Must be run before initialize_row_update_op and should only be run by one
     trainer (usually the chief) when doing distributed training.
+
+    Returns:
+      Op to form the gramian.
     """
     return self._row_update_prep_gramian
 
@@ -574,6 +588,9 @@ class WALSModel(object):
 
     Must be run before initialize_col_update_op and should only be run by one
     trainer (usually the chief) when doing distributed training.
+
+    Returns:
+      Op to form the gramian.
     """
     return self._col_update_prep_gramian
 
@@ -613,8 +630,8 @@ class WALSModel(object):
     if len(factor) == 1:
       with ops.colocate_with(factor[0]):
         # TODO(agarwal): assign instead of scatter update for full batch update.
-        return state_ops.scatter_update(factor[0], indices, values,
-                                        name=name).op
+        return state_ops.scatter_update(
+            factor[0], indices, values, name=name).op
     else:
       num_shards = len(factor)
       assignments, new_ids = sharding_func(indices)
@@ -626,8 +643,9 @@ class WALSModel(object):
                                                        num_shards)
       updates = []
       for i in xrange(num_shards):
-        updates.append(state_ops.scatter_update(factor[i], sharded_ids[i],
-                                                sharded_values[i]))
+        updates.append(
+            state_ops.scatter_update(factor[i], sharded_ids[i], sharded_values[
+                i]))
       return control_flow_ops.group(*updates, name=name)
 
   def update_row_factors(self, sp_input=None, transpose_input=False):
@@ -645,15 +663,23 @@ class WALSModel(object):
       new_values: New values for the row factors.
       update_op: An op that assigns the newly computed values to the row
         factors.
-      loss: A tensor (scalar) that contains the normalized minibatch loss,
-        corresponding to sp_input.
-        if sp_input contains the rows {A_{i, :}, i \in I}, and the input matrix
-        A has n total rows, then loss is:
-        (\|\sqrt W_I \odot (A_I - U_I V^T)\|_F^2 + \lambda \|U_I\|_F^2) *
-        n / |I| + \lambda \|V\|_F^2.
+      unregularized_loss: A tensor (scalar) that contains the normalized
+        minibatch loss corresponding to sp_input, without the regularization
+        term. If sp_input contains the rows {A_{i, :}, i \in I}, and the input
+        matrix A has n total rows, then the unregularized loss is:
+        (\|\sqrt W_I \odot (A_I - U_I V^T)\|_F^2 * n / |I|
+        The total loss is unregularized_loss + regularization.
+      regularization: A tensor (scalar) that contains the normalized
+        regularization term for the minibatch loss corresponding to sp_input.
+        If sp_input contains the rows {A_{i, :}, i \in I}, and the input matrix
+        A has n total rows, then the regularization term is:
+        \lambda \|U_I\|_F^2) * n / |I| + \lambda \|V\|_F^2.
+      sum_weights: The sum of the weights W_I corresponding to sp_input,
+        normalized by a factor of n / |I|. The root weighted squared error is:
+        \sqrt(unregularized_loss / sum_weights).
     """
-    return self._process_input_helper(True, sp_input=sp_input,
-                                      transpose_input=transpose_input)
+    return self._process_input_helper(
+        True, sp_input=sp_input, transpose_input=transpose_input)
 
   def update_col_factors(self, sp_input=None, transpose_input=False):
     r"""Updates the column factors.
@@ -666,21 +692,31 @@ class WALSModel(object):
         columns corresponding to the transposed input are updated.
 
     Returns:
-      A tuple consisting of the following two elements:
+      A tuple consisting of the following elements:
       new_values: New values for the column factors.
       update_op: An op that assigns the newly computed values to the column
         factors.
-      loss: A tensor (scalar) that contains the normalized minibatch loss,
-        corresponding to sp_input.
+      unregularized_loss: A tensor (scalar) that contains the normalized
+        minibatch loss corresponding to sp_input, without the regularization
+        term. If sp_input contains the columns {A_{:, j}, j \in J}, and the
+        input matrix A has m total columns, then the unregularized loss is:
+        (\|\sqrt W_J \odot (A_J - U V_J^T)\|_F^2 * m / |I|
+        The total loss is unregularized_loss + regularization.
+      regularization: A tensor (scalar) that contains the normalized
+        regularization term for the minibatch loss corresponding to sp_input.
         If sp_input contains the columns {A_{:, j}, j \in J}, and the input
-        matrix A has m total columns, then loss is:
-        (\|\sqrt W_J \odot (A_J - U V_J^T)\|_F^2 + \lambda \|V_J\|_F^2) *
-        m / |J| + \lambda \|U\|_F^2.
+        matrix A has m total columns, then the regularization term is:
+        \lambda \|V_J\|_F^2) * m / |J| + \lambda \|U\|_F^2.
+      sum_weights: The sum of the weights W_J corresponding to sp_input,
+        normalized by a factor of m / |J|. The root weighted squared error is:
+        \sqrt(unregularized_loss / sum_weights).
     """
-    return self._process_input_helper(False, sp_input=sp_input,
-                                      transpose_input=transpose_input)
+    return self._process_input_helper(
+        False, sp_input=sp_input, transpose_input=transpose_input)
 
-  def project_row_factors(self, sp_input=None, transpose_input=False,
+  def project_row_factors(self,
+                          sp_input=None,
+                          transpose_input=False,
                           projection_weights=None):
     """Projects the row factors.
 
@@ -705,9 +741,11 @@ class WALSModel(object):
     """
     if projection_weights is None:
       projection_weights = 1
-    return self._process_input_helper(True, sp_input=sp_input,
-                                      transpose_input=transpose_input,
-                                      row_weights=projection_weights)[0]
+    return self._process_input_helper(
+        True,
+        sp_input=sp_input,
+        transpose_input=transpose_input,
+        row_weights=projection_weights)[0]
 
   def project_col_factors(self,
                           sp_input=None,
@@ -736,12 +774,16 @@ class WALSModel(object):
     """
     if projection_weights is None:
       projection_weights = 1
-    return self._process_input_helper(False, sp_input=sp_input,
-                                      transpose_input=transpose_input,
-                                      row_weights=projection_weights)[0]
-
-  def _process_input_helper(self, update_row_factors,
-                            sp_input=None, transpose_input=False,
+    return self._process_input_helper(
+        False,
+        sp_input=sp_input,
+        transpose_input=transpose_input,
+        row_weights=projection_weights)[0]
+
+  def _process_input_helper(self,
+                            update_row_factors,
+                            sp_input=None,
+                            transpose_input=False,
                             row_weights=None):
     """Creates the graph for processing a sparse slice of input.
 
@@ -761,12 +803,18 @@ class WALSModel(object):
         of columns to be updated/projected.
 
     Returns:
-      A tuple consisting of the following three elements:
+      A tuple consisting of the following elements:
       new_values: New values for the row/column factors.
       update_op: An op that assigns the newly computed values to the row/column
         factors.
-      loss: A tensor (scalar) that contains the normalized minibatch loss,
-        corresponding to sp_input.
+      unregularized_loss: A tensor (scalar) that contains the normalized
+        minibatch loss corresponding to sp_input, without the regularization
+        term. Add the regularization term below to yield the loss.
+      regularization: A tensor (scalar) that contains the normalized
+        regularization term for the minibatch loss corresponding to sp_input.
+      sum_weights: The sum of the weights corresponding to sp_input. This
+        can be used with unregularized loss to calculate the root weighted
+        squared error.
     """
     assert isinstance(sp_input, sparse_tensor.SparseTensor)
 
@@ -776,6 +824,7 @@ class WALSModel(object):
       row_wt = self._row_wt_cache
       col_wt = self._col_wt_cache
       total_rows = self._input_rows
+      total_cols = self._input_cols
       sharding_func = WALSModel._get_sharding_func(self._input_rows,
                                                    self._num_row_shards)
       gramian = self._col_gramian_cache
@@ -785,6 +834,7 @@ class WALSModel(object):
       row_wt = self._col_wt_cache
       col_wt = self._row_wt_cache
       total_rows = self._input_cols
+      total_cols = self._input_rows
       sharding_func = WALSModel._get_sharding_func(self._input_cols,
                                                    self._num_col_shards)
       gramian = self._row_gramian_cache
@@ -820,8 +870,8 @@ class WALSModel(object):
     right = embedding_ops.embedding_lookup(
         right_factors, gather_indices, partition_strategy="div")
     new_sp_indices = array_ops.concat([row_ids, col_ids], 1)
-    new_sp_shape = (array_ops.concat([row_shape, col_shape], 0) if
-                    transpose_input else
+    new_sp_shape = (array_ops.concat([row_shape, col_shape], 0)
+                    if transpose_input else
                     array_ops.concat([col_shape, row_shape], 0))
     new_sp_input = sparse_tensor.SparseTensor(
         indices=new_sp_indices,
@@ -834,9 +884,9 @@ class WALSModel(object):
       total_lhs += self._regularization_matrix
     if self._row_weights is None:
       # Special case of ALS. Use a much simpler update rule.
-      total_rhs = (self._unobserved_weight *
-                   sparse_ops.sparse_tensor_dense_matmul(
-                       new_sp_input, right, adjoint_a=transpose_input))
+      total_rhs = (
+          self._unobserved_weight * sparse_ops.sparse_tensor_dense_matmul(
+              new_sp_input, right, adjoint_a=transpose_input))
       # TODO(rmlarsen): handle transposing in tf.matrix_solve instead of
       # transposing explicitly.
       # TODO(rmlarsen): multi-thread tf.matrix_solve.
@@ -877,8 +927,12 @@ class WALSModel(object):
           linalg_ops.matrix_solve(total_lhs, total_rhs), [2])
 
     update_op_name = "row_update" if update_row_factors else "col_update"
-    update_op = self.scatter_update(left, update_indices, new_left_values,
-                                    sharding_func, name=update_op_name)
+    update_op = self.scatter_update(
+        left,
+        update_indices,
+        new_left_values,
+        sharding_func,
+        name=update_op_name)
 
     # Create the loss subgraph
     loss_sp_input = (sparse_ops.sparse_transpose(new_sp_input)
@@ -886,32 +940,52 @@ class WALSModel(object):
     # sp_approx is the low rank estimate of the input matrix, formed by
     # computing the product <u_i, v_j> for (i, j) in loss_sp_input.indices.
     sp_approx_vals = gen_factorization_ops.masked_matmul(
-        new_left_values, right, loss_sp_input.indices, transpose_a=False,
+        new_left_values,
+        right,
+        loss_sp_input.indices,
+        transpose_a=False,
         transpose_b=True)
     sp_approx = sparse_tensor.SparseTensor(
         loss_sp_input.indices, sp_approx_vals, loss_sp_input.dense_shape)
     sp_approx_sq = math_ops.square(sp_approx)
     sp_residual = sparse_ops.sparse_add(loss_sp_input, sp_approx * (-1))
     sp_residual_sq = math_ops.square(sp_residual)
-    row_wt_mat = (constant_op.constant(0.) if self._row_weights is None else
-                  array_ops.expand_dims(row_weights_slice, 1))
-    col_wt_mat = (constant_op.constant(0.) if self._col_weights is None else
-                  array_ops.expand_dims(col_weights, 0))
+    row_wt_mat = (constant_op.constant(0.)
+                  if self._row_weights is None else array_ops.expand_dims(
+                      row_weights_slice, 1))
+    col_wt_mat = (constant_op.constant(0.)
+                  if self._col_weights is None else array_ops.expand_dims(
+                      col_weights, 0))
+
     # We return the normalized loss
     partial_row_gramian = math_ops.matmul(
         new_left_values, new_left_values, transpose_a=True)
     normalization_factor = total_rows / math_ops.cast(num_rows, dtypes.float32)
-    loss = (
-        self._unobserved_weight * (
-            sparse_ops.sparse_reduce_sum(sp_residual_sq) -
-            sparse_ops.sparse_reduce_sum(sp_approx_sq) +
-            math_ops.trace(math_ops.matmul(partial_row_gramian, gramian))
-        ) +
+
+    unregularized_loss = (
+        self._unobserved_weight * (  # pyformat line break
+            sparse_ops.sparse_reduce_sum(sp_residual_sq) -  # pyformat break
+            sparse_ops.sparse_reduce_sum(sp_approx_sq) +  # pyformat break
+            math_ops.trace(math_ops.matmul(partial_row_gramian, gramian))) +
         sparse_ops.sparse_reduce_sum(row_wt_mat * (sp_residual_sq * col_wt_mat))
     ) * normalization_factor
+
     if self._regularization is not None:
-      loss += self._regularization * (
+      regularization = self._regularization * (
           math_ops.trace(partial_row_gramian) * normalization_factor +
-          math_ops.trace(gramian)
-      )
-    return (new_left_values, update_op, loss)
+          math_ops.trace(gramian))
+    else:
+      regularization = constant_op.constant(0.)
+
+    sum_weights = self._unobserved_weight * math_ops.cast(
+        total_rows * total_cols, dtypes.float32)
+    if self._row_weights is not None and self._col_weights is not None:
+      ones = sparse_tensor.SparseTensor(
+          indices=loss_sp_input.indices,
+          values=array_ops.ones(array_ops.shape(loss_sp_input.values)),
+          dense_shape=loss_sp_input.dense_shape)
+      sum_weights += sparse_ops.sparse_reduce_sum(row_wt_mat * (
+          ones * col_wt_mat)) * normalization_factor
+
+    return (new_left_values, update_op, unregularized_loss, regularization,
+            sum_weights)
diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py b/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
index bcee881854586571061264155b2b346c88d4860c..c8137339155ef1da8ee53967eea84a550f12ecbc 100644
--- a/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
+++ b/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
@@ -31,7 +31,6 @@ from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
-
 INPUT_MATRIX = factorization_ops_test_utils.INPUT_MATRIX
 np_matrix_to_tf_sparse = factorization_ops_test_utils.np_matrix_to_tf_sparse
 
@@ -53,16 +52,20 @@ class WalsModelTest(test.TestCase):
 
   def calculate_loss_from_wals_model(self, wals_model, sp_inputs):
     current_rows = embedding_ops.embedding_lookup(
-        wals_model.row_factors, math_ops.range(wals_model._input_rows),
+        wals_model.row_factors,
+        math_ops.range(wals_model._input_rows),
         partition_strategy="div")
     current_cols = embedding_ops.embedding_lookup(
-        wals_model.col_factors, math_ops.range(wals_model._input_cols),
+        wals_model.col_factors,
+        math_ops.range(wals_model._input_cols),
         partition_strategy="div")
     row_wts = embedding_ops.embedding_lookup(
-        wals_model._row_weights, math_ops.range(wals_model._input_rows),
+        wals_model._row_weights,
+        math_ops.range(wals_model._input_rows),
         partition_strategy="div")
     col_wts = embedding_ops.embedding_lookup(
-        wals_model._col_weights, math_ops.range(wals_model._input_cols),
+        wals_model._col_weights,
+        math_ops.range(wals_model._input_cols),
         partition_strategy="div")
     return factorization_ops_test_utils.calculate_loss(
         sp_inputs, current_rows, current_cols, wals_model._regularization,
@@ -71,9 +74,11 @@ class WalsModelTest(test.TestCase):
   def setUp(self):
     self.col_init = [
         # shard 0
-        [[-0.36444709, -0.39077035, -0.32528427],
-         [1.19056475, 0.07231052, 2.11834812],
-         [0.93468881, -0.71099287, 1.91826844]],
+        [
+            [-0.36444709, -0.39077035, -0.32528427],  # pyformat line break
+            [1.19056475, 0.07231052, 2.11834812],
+            [0.93468881, -0.71099287, 1.91826844]
+        ],
         # shard 1
         [[1.18160152, 1.52490723, -0.50015002],
          [1.82574749, -0.57515913, -1.32810032]],
@@ -87,19 +92,72 @@ class WalsModelTest(test.TestCase):
 
     # Values of factor shards after running one iteration of row and column
     # updates.
-    self._row_factors_0 = [[0.097689, -0.219293, -0.020780],
-                           [0.50842, 0.64626, 0.22364],
-                           [0.401159, -0.046558, -0.192854]]
+    self._row_factors_0 = [
+        [0.097689, -0.219293, -0.020780],  # pyformat line break
+        [0.50842, 0.64626, 0.22364],
+        [0.401159, -0.046558, -0.192854]
+    ]
     self._row_factors_1 = [[1.20597, -0.48025, 0.35582],
                            [1.5564, 1.2528, 1.0528]]
-    self._col_factors_0 = [[2.4725, -1.2950, -1.9980],
-                           [0.44625, 1.50771, 1.27118],
-                           [1.39801, -2.10134, 0.73572]]
+    self._col_factors_0 = [
+        [2.4725, -1.2950, -1.9980],  # pyformat line break
+        [0.44625, 1.50771, 1.27118],
+        [1.39801, -2.10134, 0.73572]
+    ]
     self._col_factors_1 = [[3.36509, -0.66595, -3.51208],
                            [0.57191, 1.59407, 1.33020]]
     self._col_factors_2 = [[3.3459, -1.3341, -3.3008],
                            [0.57366, 1.83729, 1.26798]]
 
+  def _run_test_sum_weights(self, test_rows):
+    # test_rows: True to test row weights, False to test column weights.
+
+    num_rows = 5
+    num_cols = 5
+    unobserved_weight = 0.1
+    row_weights = [[8., 18., 28., 38., 48.]]
+    col_weights = [[90., 91., 92., 93., 94.]]
+    sparse_indices = [[0, 1], [2, 3], [4, 1]]
+    sparse_values = [666., 777., 888.]
+
+    unobserved = unobserved_weight * num_rows * num_cols
+    observed = 8. * 91. + 28. * 93. + 48. * 91.
+    # sparse_indices has three unique rows and two unique columns
+    observed *= num_rows / 3. if test_rows else num_cols / 2.
+    want_weight_sum = unobserved + observed
+
+    with ops.Graph().as_default(), self.test_session() as sess:
+      wals_model = factorization_ops.WALSModel(
+          input_rows=num_rows,
+          input_cols=num_cols,
+          n_components=5,
+          unobserved_weight=unobserved_weight,
+          row_weights=row_weights,
+          col_weights=col_weights,
+          use_factors_weights_cache=False)
+
+      wals_model.initialize_op.run()
+      wals_model.worker_init.run()
+
+      update_factors = (wals_model.update_row_factors
+                        if test_rows else wals_model.update_col_factors)
+
+      (_, _, _, _, sum_weights) = update_factors(
+          sp_input=sparse_tensor.SparseTensor(
+              indices=sparse_indices,
+              values=sparse_values,
+              dense_shape=[num_rows, num_cols]),
+          transpose_input=False)
+
+      got_weight_sum = sess.run(sum_weights)
+
+      self.assertNear(
+          got_weight_sum,
+          want_weight_sum,
+          err=.001,
+          msg="got weight sum [{}], want weight sum [{}]".format(
+              got_weight_sum, want_weight_sum))
+
   def _run_test_process_input(self,
                               use_factors_weights_cache,
                               compute_loss=False):
@@ -138,8 +196,10 @@ class WalsModelTest(test.TestCase):
       # Here we feed in scattered rows of the input.
       wals_model.row_update_prep_gramian_op.run()
       wals_model.initialize_row_update_op.run()
-      _, process_input_op, factor_loss = wals_model.update_row_factors(
-          sp_input=sp_feeder, transpose_input=False)
+      (_, process_input_op, unregularized_loss, regularization,
+       _) = wals_model.update_row_factors(
+           sp_input=sp_feeder, transpose_input=False)
+      factor_loss = unregularized_loss + regularization
       for inp in input_scattered_rows:
         feed_dict = {sp_feeder: inp}
         process_input_op.run(feed_dict=feed_dict)
@@ -163,8 +223,8 @@ class WalsModelTest(test.TestCase):
           sp_input=sp_feeder, transpose_input=False)
       feed_dict = {
           sp_feeder:
-              np_matrix_to_tf_sparse(
-                  INPUT_MATRIX, [1, 4], shuffle=False).eval()
+              np_matrix_to_tf_sparse(INPUT_MATRIX, [1, 4], shuffle=False)
+              .eval()
       }
       self.assertAllClose(
           projected_rows.eval(feed_dict=feed_dict),
@@ -178,15 +238,17 @@ class WalsModelTest(test.TestCase):
       if compute_loss:
         # Test loss computation after the row update
         loss = sum(
-            sess.run(factor_loss * self.count_rows(inp) / num_rows,
-                     feed_dict={sp_feeder: inp})
-            for inp in input_scattered_rows)
-        true_loss = self.calculate_loss_from_wals_model(
-            wals_model, self._wals_inputs)
+            sess.run(
+                factor_loss * self.count_rows(inp) / num_rows,
+                feed_dict={sp_feeder: inp}) for inp in input_scattered_rows)
+        true_loss = self.calculate_loss_from_wals_model(wals_model,
+                                                        self._wals_inputs)
         self.assertNear(
-            loss, true_loss, err=.001,
-            msg="""After row update, computed loss = {}, does not match
-            the true loss = {}.""".format(loss, true_loss))
+            loss,
+            true_loss,
+            err=.001,
+            msg="After row update, computed loss [{}] does not match"
+            " true loss [{}]".format(loss, true_loss))
 
       # Split input into multiple sparse tensors with scattered columns. Note
       # that here the elements in the sparse tensors are not ordered and also
@@ -206,8 +268,10 @@ class WalsModelTest(test.TestCase):
       # Here we feed in scattered columns of the input.
       wals_model.col_update_prep_gramian_op.run()
       wals_model.initialize_col_update_op.run()
-      _, process_input_op, factor_loss = wals_model.update_col_factors(
-          sp_input=sp_feeder, transpose_input=False)
+      (_, process_input_op, unregularized_loss, regularization,
+       _) = wals_model.update_col_factors(
+           sp_input=sp_feeder, transpose_input=False)
+      factor_loss = unregularized_loss + regularization
       for inp in input_scattered_cols:
         feed_dict = {sp_feeder: inp}
         process_input_op.run(feed_dict=feed_dict)
@@ -243,25 +307,28 @@ class WalsModelTest(test.TestCase):
           atol=1e-3)
       self.assertAllClose(
           projected_cols_no_weights.eval(feed_dict=feed_dict),
-          [[3.471045, -1.250835, -3.598917],
-           [3.585139, -0.487476, -3.852232],
+          [[3.471045, -1.250835, -3.598917], [3.585139, -0.487476, -3.852232],
            [0.346433, 1.360644, 1.677121]],
           atol=1e-3)
 
       if compute_loss:
         # Test loss computation after the column update.
         loss = sum(
-            sess.run(factor_loss * self.count_cols(inp) / num_cols,
-                     feed_dict={sp_feeder: inp})
+            sess.run(
+                factor_loss * self.count_cols(inp) / num_cols,
+                feed_dict={sp_feeder: inp})
             for inp in input_scattered_cols_non_duplicate)
-        true_loss = self.calculate_loss_from_wals_model(
-            wals_model, self._wals_inputs)
+        true_loss = self.calculate_loss_from_wals_model(wals_model,
+                                                        self._wals_inputs)
         self.assertNear(
-            loss, true_loss, err=.001,
-            msg="""After col update, computed loss = {}, does not match the true
-            loss = {}.""".format(loss, true_loss))
-
-  def _run_test_process_input_transposed(self, use_factors_weights_cache,
+            loss,
+            true_loss,
+            err=.001,
+            msg="After col update, computed loss [{}] does not match"
+            " true loss [{}]".format(loss, true_loss))
+
+  def _run_test_process_input_transposed(self,
+                                         use_factors_weights_cache,
                                          compute_loss=False):
     with ops.Graph().as_default(), self.test_session() as sess:
       self._wals_inputs = self.sparse_input()
@@ -304,8 +371,10 @@ class WalsModelTest(test.TestCase):
       # they appear.
       wals_model.row_update_prep_gramian_op.run()
       wals_model.initialize_row_update_op.run()
-      _, process_input_op, factor_loss = wals_model.update_row_factors(
-          sp_input=sp_feeder, transpose_input=True)
+      (_, process_input_op, unregularized_loss, regularization,
+       _) = wals_model.update_row_factors(
+           sp_input=sp_feeder, transpose_input=True)
+      factor_loss = unregularized_loss + regularization
       for inp in input_scattered_rows:
         feed_dict = {sp_feeder: inp}
         process_input_op.run(feed_dict=feed_dict)
@@ -344,15 +413,18 @@ class WalsModelTest(test.TestCase):
       if compute_loss:
         # Test loss computation after the row update
         loss = sum(
-            sess.run(factor_loss * self.count_cols(inp) / num_rows,
-                     feed_dict={sp_feeder: inp})
+            sess.run(
+                factor_loss * self.count_cols(inp) / num_rows,
+                feed_dict={sp_feeder: inp})
             for inp in input_scattered_rows_non_duplicate)
-        true_loss = self.calculate_loss_from_wals_model(
-            wals_model, self._wals_inputs)
+        true_loss = self.calculate_loss_from_wals_model(wals_model,
+                                                        self._wals_inputs)
         self.assertNear(
-            loss, true_loss, err=.001,
-            msg="""After row update, computed loss = {}, does not match the true
-            loss = {}.""".format(loss, true_loss))
+            loss,
+            true_loss,
+            err=.001,
+            msg="After row update, computed loss [{}] does not match"
+            " true loss [{}]".format(loss, true_loss))
 
       # Split input into multiple SparseTensors with scattered columns.
       # Here the inputs are transposed. But the same constraints as described in
@@ -375,8 +447,10 @@ class WalsModelTest(test.TestCase):
       # Here we feed in scattered columns of the input.
       wals_model.col_update_prep_gramian_op.run()
       wals_model.initialize_col_update_op.run()
-      _, process_input_op, factor_loss = wals_model.update_col_factors(
-          sp_input=sp_feeder, transpose_input=True)
+      (_, process_input_op, unregularized_loss, regularization,
+       _) = wals_model.update_col_factors(
+           sp_input=sp_feeder, transpose_input=True)
+      factor_loss = unregularized_loss + regularization
       for inp in input_scattered_cols:
         feed_dict = {sp_feeder: inp}
         process_input_op.run(feed_dict=feed_dict)
@@ -406,21 +480,23 @@ class WalsModelTest(test.TestCase):
           atol=1e-3)
       self.assertAllClose(
           projected_cols_no_weights.eval(feed_dict=feed_dict),
-          [[3.585139, -0.487476, -3.852232],
-           [0.557937, 1.813907, 1.331171]],
+          [[3.585139, -0.487476, -3.852232], [0.557937, 1.813907, 1.331171]],
           atol=1e-3)
       if compute_loss:
         # Test loss computation after the col update
         loss = sum(
-            sess.run(factor_loss * self.count_rows(inp) / num_cols,
-                     feed_dict={sp_feeder: inp})
+            sess.run(
+                factor_loss * self.count_rows(inp) / num_cols,
+                feed_dict={sp_feeder: inp})
             for inp in input_scattered_cols_non_duplicate)
-        true_loss = self.calculate_loss_from_wals_model(
-            wals_model, self._wals_inputs)
+        true_loss = self.calculate_loss_from_wals_model(wals_model,
+                                                        self._wals_inputs)
         self.assertNear(
-            loss, true_loss, err=.001,
-            msg="""After col update, computed loss = {}, does not match the true
-            loss = {}.""".format(loss, true_loss))
+            loss,
+            true_loss,
+            err=.001,
+            msg="After col update, computed loss [{}] does not match"
+            " true loss [{}]".format(loss, true_loss))
 
   # Note that when row_weights and col_weights are 0, WALS gives identical
   # results as ALS (Alternating Least Squares). However our implementation does
@@ -503,8 +579,7 @@ class WalsModelTest(test.TestCase):
       for c1, c2 in zip(col_factors1, col_factors2):
         self.assertAllClose(c1, c2, rtol=5e-3, atol=1e-2)
       self.assertAllClose(
-          als_projected_col_factors1,
-          [col_factors2[0][2], col_factors2[0][0]],
+          als_projected_col_factors1, [col_factors2[0][2], col_factors2[0][0]],
           atol=1e-2)
 
   def _run_test_als_transposed(self, use_factors_weights_cache):
@@ -599,8 +674,8 @@ class WalsModelTest(test.TestCase):
     cols = 11
     dims = 3
     with ops.Graph().as_default(), self.test_session():
-      data = np.dot(np.random.rand(rows, 3),
-                    np.random.rand(3, cols)).astype(np.float32) / 3.0
+      data = np.dot(np.random.rand(rows, 3), np.random.rand(
+          3, cols)).astype(np.float32) / 3.0
       indices = [[i, j] for i in xrange(rows) for j in xrange(cols)]
       values = data.reshape(-1)
       inp = sparse_tensor.SparseTensor(indices, values, [rows, cols])
@@ -629,8 +704,8 @@ class WalsModelTest(test.TestCase):
     dims = 3
 
     with ops.Graph().as_default(), self.test_session():
-      data = np.dot(np.random.rand(rows, 3),
-                    np.random.rand(3, cols)).astype(np.float32) / 3.0
+      data = np.dot(np.random.rand(rows, 3), np.random.rand(
+          3, cols)).astype(np.float32) / 3.0
       indices = [[i, j] for i in xrange(rows) for j in xrange(cols)]
       values = data.reshape(-1)
       inp = sparse_tensor.SparseTensor(indices, values, [rows, cols])
@@ -664,8 +739,8 @@ class WalsModelTest(test.TestCase):
     with ops.Graph().as_default(), self.test_session():
       row_wts = 0.1 + np.random.rand(rows)
       col_wts = 0.1 + np.random.rand(cols)
-      data = np.dot(np.random.rand(rows, 3),
-                    np.random.rand(3, cols)).astype(np.float32) / 3.0
+      data = np.dot(np.random.rand(rows, 3), np.random.rand(
+          3, cols)).astype(np.float32) / 3.0
       indices = np.array(
           list(
               filter(keep_index,
@@ -741,5 +816,12 @@ class WalsModelTest(test.TestCase):
   def test_loss_without_cache(self):
     self._run_test_process_input(False, compute_loss=True)
 
+  def test_sum_row_weights(self):
+    self._run_test_sum_weights(True)
+
+  def test_sum_col_weights(self):
+    self._run_test_sum_weights(False)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/factorization/python/ops/gmm.py b/tensorflow/contrib/factorization/python/ops/gmm.py
index 396dd286b6cb9e1d2a03229a378e62aff5f799e3..0d67e09f8151b48c97094b6b48f26e63443707ef 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import time
 import numpy as np
 
 from tensorflow.contrib import framework
@@ -27,10 +28,13 @@ from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import logging_ops as logging
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops.control_flow_ops import with_dependencies
+from tensorflow.python.training import session_run_hook
 
 
 def _streaming_sum(scalar_tensor):
@@ -40,6 +44,29 @@ def _streaming_sum(scalar_tensor):
   return sum_metric, sum_update
 
 
+class _InitializeClustersHook(session_run_hook.SessionRunHook):
+  """Initializes clusters or waits for cluster initialization."""
+
+  def __init__(self, init_op, is_initialized_op, is_chief):
+    self._init_op = init_op
+    self._is_chief = is_chief
+    self._is_initialized_op = is_initialized_op
+
+  def after_create_session(self, session, _):
+    assert self._init_op.graph == ops.get_default_graph()
+    assert self._is_initialized_op.graph == self._init_op.graph
+    while True:
+      try:
+        if session.run(self._is_initialized_op):
+          break
+        elif self._is_chief:
+          session.run(self._init_op)
+        else:
+          time.sleep(1)
+      except RuntimeError as e:
+        logging.info(e)
+
+
 class GMM(estimator.Estimator):
   """An estimator for GMM clustering."""
   SCORES = 'scores'
@@ -128,16 +155,23 @@ class GMM(estimator.Estimator):
   def _model_builder(self):
     """Creates a model function."""
 
-    def _model_fn(features, labels, mode):
+    def _model_fn(features, labels, mode, config):
       """Model function."""
       assert labels is None, labels
-      (all_scores, model_predictions, losses, training_op) = gmm_ops.gmm(
-          self._parse_tensor_or_dict(features), self._training_initial_clusters,
-          self._num_clusters, self._random_seed, self._covariance_type,
-          self._params)
+      (all_scores,
+       model_predictions,
+       losses, training_op,
+       init_op,
+       is_initialized) = gmm_ops.gmm(self._parse_tensor_or_dict(features),
+                                     self._training_initial_clusters,
+                                     self._num_clusters, self._random_seed,
+                                     self._covariance_type,
+                                     self._params)
       incr_step = state_ops.assign_add(variables.get_global_step(), 1)
       loss = math_ops.reduce_sum(losses)
       training_op = with_dependencies([training_op, incr_step], loss)
+      training_hooks = [_InitializeClustersHook(
+          init_op, is_initialized, config.is_chief)]
       predictions = {
           GMM.ALL_SCORES: all_scores[0],
           GMM.ASSIGNMENTS: model_predictions[0][0],
@@ -147,6 +181,7 @@ class GMM(estimator.Estimator):
       }
       return model_fn_lib.ModelFnOps(mode=mode, predictions=predictions,
                                      eval_metric_ops=eval_metric_ops,
-                                     loss=loss, train_op=training_op)
+                                     loss=loss, train_op=training_op,
+                                     training_hooks=training_hooks)
 
     return _model_fn
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index b092eab316664705a455b88a524a77917f141b37..a61681c7f5a69a0fff1089404fc80b95c1c3106e 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -34,6 +34,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
 from tensorflow.python.ops.embedding_ops import embedding_lookup
 from tensorflow.python.summary import summary
 
@@ -144,15 +145,34 @@ class GmmAlgorithm(object):
     # Small value to guarantee that covariances are invertible.
     self._min_var = array_ops.diag(
         array_ops.ones(array_ops.stack([self._dimensions]))) * 1e-3
-    self._create_variables(data, initial_means)
+    self._create_variables()
+    self._initialize_variables(data, initial_means)
     # Operations of partial statistics for the computation of the means.
     self._w_mul_x = []
     # Operations of partial statistics for the computation of the covariances.
     self._w_mul_x2 = []
     self._define_graph(data)
 
-  def _create_variables(self, data, initial_means=None):
-    """Initializes GMM algorithm.
+  def _create_variables(self):
+    """Initializes GMM algorithm."""
+    init_value = array_ops.constant([], dtype=dtypes.float32)
+    self._means = variables.Variable(init_value,
+                                     name=self.CLUSTERS_VARIABLE,
+                                     validate_shape=False)
+    self._covs = variables.Variable(
+        init_value, name=self.CLUSTERS_COVS_VARIABLE, validate_shape=False)
+    # Mixture weights, representing the probability that a randomly
+    # selected unobservable data (in EM terms) was generated by component k.
+    self._alpha = variable_scope.variable(
+        array_ops.tile([1.0 / self._num_classes], [self._num_classes]),
+        name=self.CLUSTERS_WEIGHT,
+        validate_shape=False)
+    self._cluster_centers_initialized = variables.Variable(False,
+                                                           dtype=dtypes.bool,
+                                                           name='initialized')
+
+  def _initialize_variables(self, data, initial_means=None):
+    """Initializes variables.
 
     Args:
       data: a list of Tensors with data, each row is a new example.
@@ -161,19 +181,11 @@ class GmmAlgorithm(object):
     first_shard = data[0]
     # Initialize means: num_classes X 1 X dimensions.
     if initial_means is not None:
-      self._means = variable_scope.variable(
-          array_ops.expand_dims(initial_means, 1),
-          name=self.CLUSTERS_VARIABLE,
-          validate_shape=False,
-          dtype=dtypes.float32)
+      means = array_ops.expand_dims(initial_means, 1)
     else:
       # Sample data randomly
-      self._means = variable_scope.variable(
-          array_ops.expand_dims(
-              _init_clusters_random(data, self._num_classes, self._random_seed),
-              1),
-          name=self.CLUSTERS_VARIABLE,
-          validate_shape=False)
+      means = array_ops.expand_dims(
+          _init_clusters_random(data, self._num_classes, self._random_seed), 1)
 
     # Initialize covariances.
     if self._covariance_type == FULL_COVARIANCE:
@@ -187,18 +199,40 @@ class GmmAlgorithm(object):
       covs = array_ops.tile(
           array_ops.expand_dims(array_ops.diag_part(cov), 0),
           [self._num_classes, 1])
-    self._covs = variable_scope.variable(
-        covs, name=self.CLUSTERS_COVS_VARIABLE, validate_shape=False)
-    # Mixture weights, representing the probability that a randomly
-    # selected unobservable data (in EM terms) was generated by component k.
-    self._alpha = variable_scope.variable(
-        array_ops.tile([1.0 / self._num_classes], [self._num_classes]),
-        name=self.CLUSTERS_WEIGHT,
-        validate_shape=False)
+
+    with ops.colocate_with(self._cluster_centers_initialized):
+      initialized = control_flow_ops.with_dependencies(
+          [means, covs],
+          array_ops.identity(self._cluster_centers_initialized))
+    self._init_ops = []
+    with ops.colocate_with(self._means):
+      init_means = state_ops.assign(self._means, means, validate_shape=False)
+      init_means = control_flow_ops.with_dependencies(
+          [init_means],
+          state_ops.assign(self._cluster_centers_initialized, True))
+      self._init_ops.append(control_flow_ops.cond(initialized,
+                                                  control_flow_ops.no_op,
+                                                  lambda: init_means).op)
+    with ops.colocate_with(self._covs):
+      init_covs = state_ops.assign(self._covs, covs, validate_shape=False)
+      init_covs = control_flow_ops.with_dependencies(
+          [init_covs],
+          state_ops.assign(self._cluster_centers_initialized, True))
+      self._init_ops.append(control_flow_ops.cond(initialized,
+                                                  control_flow_ops.no_op,
+                                                  lambda: init_covs).op)
+
+  def init_ops(self):
+    """Returns the initialization operation."""
+    return control_flow_ops.group(*self._init_ops)
 
   def training_ops(self):
     """Returns the training operation."""
-    return self._train_ops
+    return control_flow_ops.group(*self._train_ops)
+
+  def is_initialized(self):
+    """Returns a boolean operation for initialized variables."""
+    return self._cluster_centers_initialized
 
   def alphas(self):
     return self._alpha
@@ -486,6 +520,7 @@ def gmm(inp,
     scores: Similar to assignments but specifies the distance to the
       assigned cluster instead.
     training_op: an op that runs an iteration of training.
+    init_op: an op that runs the initialization.
   """
   initial_means = None
   if initial_clusters != 'random' and not isinstance(initial_clusters,
@@ -496,8 +531,7 @@ def gmm(inp,
   inp = inp if isinstance(inp, list) else [inp]
   gmm_tool = GmmAlgorithm(inp, num_clusters, initial_means, params,
                           covariance_type, random_seed)
-  training_ops = gmm_tool.training_ops()
   assignments = gmm_tool.assignments()
   all_scores, scores = gmm_tool.scores()
-  return [all_scores], [assignments], [scores], control_flow_ops.group(
-      *training_ops)
+  return ([all_scores], [assignments], [scores], gmm_tool.training_ops(),
+          gmm_tool.init_ops(), gmm_tool.is_initialized())
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py b/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py
index df8fc40ffaede2b24e832a74edbec496e877d010..c50e82db8a230012ba13c1d7ad7e28c23bd27355 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py
@@ -122,12 +122,11 @@ class GmmOpsTest(test.TestCase):
       g.seed = 5
       with self.test_session() as sess:
         data = constant_op.constant(self.data, dtype=dtypes.float32)
-        _, assignments, _, training_op = gmm_ops.gmm(data,
-                                                     'random',
-                                                     num_classes,
-                                                     random_seed=self.seed)
+        _, assignments, _, training_op, init_op, _ = gmm_ops.gmm(
+            data, 'random', num_classes, random_seed=self.seed)
 
         variables.global_variables_initializer().run()
+        sess.run(init_op)
         for _ in xrange(self.iterations):
           sess.run(training_op)
         assignments = sess.run(assignments)
@@ -146,6 +145,7 @@ class GmmOpsTest(test.TestCase):
                                       [[3.0, 3.0], [0.0, 0.0]], 'w')
       training_ops = gmm_tool.training_ops()
       variables.global_variables_initializer().run()
+      sess.run(gmm_tool.init_ops())
       for _ in xrange(self.iterations):
         sess.run(training_ops)
 
@@ -163,6 +163,7 @@ class GmmOpsTest(test.TestCase):
                                       [[3.0, 3.0], [0.0, 0.0]], 'mc')
       training_ops = gmm_tool.training_ops()
       variables.global_variables_initializer().run()
+      sess.run(gmm_tool.init_ops())
       for _ in xrange(self.iterations):
         sess.run(training_ops)
       alphas = sess.run(gmm_tool.alphas())
@@ -181,6 +182,7 @@ class GmmOpsTest(test.TestCase):
                                       [[-1.0, -1.0], [1.0, 1.0]], 'c')
       training_ops = gmm_tool.training_ops()
       variables.global_variables_initializer().run()
+      sess.run(gmm_tool.init_ops())
       for _ in xrange(self.iterations):
         sess.run(training_ops)
       alphas = sess.run(gmm_tool.alphas())
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_test.py b/tensorflow/contrib/factorization/python/ops/gmm_test.py
index 758c54fbf492b2a3c8aa6805b8b8c404e0aa4bda..7717b47daefce9ff65b1f1e84f671a463cf2e826 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_test.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_test.py
@@ -28,9 +28,11 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed as random_seed_lib
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import flags
 from tensorflow.python.platform import test
+from tensorflow.python.training import queue_runner
 
 FLAGS = flags.FLAGS
 
@@ -102,7 +104,7 @@ class GMMTest(test.TestCase):
                          np.linalg.inv(covs[assignments[r]])), points[r, :] -
                   means[assignments[r]])))
     return (points, assignments, scores)
-  
+
   def test_weights(self):
     """Tests the shape of the weights."""
     gmm = gmm_lib.GMM(self.num_centers,
@@ -223,5 +225,27 @@ class GMMTest(test.TestCase):
     self.assertFalse(np.isnan(gmm.clusters()).any())
 
 
+class GMMTestQueues(test.TestCase):
+
+  def input_fn(self):
+    def _fn():
+      queue = data_flow_ops.FIFOQueue(capacity=10,
+                                      dtypes=dtypes.float32,
+                                      shapes=[10, 3])
+      enqueue_op = queue.enqueue(array_ops.zeros([10, 3], dtype=dtypes.float32))
+      queue_runner.add_queue_runner(queue_runner.QueueRunner(queue,
+                                                             [enqueue_op]))
+      return queue.dequeue(), None
+    return _fn
+
+  # This test makes sure that there are no deadlocks when using a QueueRunner.
+  # Note that since cluster initialization is dependendent on inputs, if input
+  # is generated using a QueueRunner, one has to make sure that these runners
+  # are started before the initialization.
+  def test_queues(self):
+    gmm = gmm_lib.GMM(2, covariance_type='diag')
+    gmm.fit(input_fn=self.input_fn(), steps=1)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/factorization/python/ops/wals.py b/tensorflow/contrib/factorization/python/ops/wals.py
index 0bc0ef39ec9ec58652dc44f5b03564a9e3003e8e..3e3ee5fa57f1356db98a17f9e17e60f01d85d3b9 100644
--- a/tensorflow/contrib/factorization/python/ops/wals.py
+++ b/tensorflow/contrib/factorization/python/ops/wals.py
@@ -38,17 +38,9 @@ from tensorflow.python.training import session_run_hook
 class _SweepHook(session_run_hook.SessionRunHook):
   """Keeps track of row/col sweeps, and runs prep ops before each sweep."""
 
-  def __init__(self,
-               is_row_sweep_var,
-               train_op,
-               num_rows,
-               num_cols,
-               processed_row_indices,
-               processed_col_indices,
-               row_prep_ops,
-               col_prep_ops,
-               cache_init_ops,
-               completed_sweeps_var):
+  def __init__(self, is_row_sweep_var, train_op, num_rows, num_cols,
+               processed_row_indices, processed_col_indices, row_prep_ops,
+               col_prep_ops, cache_init_ops, completed_sweeps_var):
     """Initializes SweepHook.
 
     Args:
@@ -90,12 +82,9 @@ class _SweepHook(session_run_hook.SessionRunHook):
     # _is_row_sweep_var and incrementing the global_step and completed_sweeps
     # counters. They have control_dependencies on train_op.
     self._fetches = self._create_switch_ops(processed_row_indices,
-                                            processed_col_indices,
-                                            train_op)
+                                            processed_col_indices, train_op)
 
-  def _create_switch_ops(self,
-                         processed_row_indices,
-                         processed_col_indices,
+  def _create_switch_ops(self, processed_row_indices, processed_col_indices,
                          train_op):
     """Creates ops to update is_row_sweep_var, global_step and completed_sweeps.
 
@@ -147,17 +136,20 @@ class _SweepHook(session_run_hook.SessionRunHook):
     # After running the train_op, update processed_rows or processed_cols
     # tensors, depending on whether we are currently doing a row or a col sweep
     with ops.control_dependencies([train_op]):
+
       def get_row_update_op():
         with ops.colocate_with(processed_rows):
-          return state_ops.scatter_update(
-              processed_rows, processed_row_indices,
-              array_ops.ones_like(processed_row_indices, dtype=dtypes.bool))
+          return state_ops.scatter_update(processed_rows, processed_row_indices,
+                                          array_ops.ones_like(
+                                              processed_row_indices,
+                                              dtype=dtypes.bool))
 
       def get_col_update_op():
         with ops.colocate_with(processed_cols):
-          return state_ops.scatter_update(
-              processed_cols, processed_col_indices,
-              array_ops.ones_like(processed_col_indices, dtype=dtypes.bool))
+          return state_ops.scatter_update(processed_cols, processed_col_indices,
+                                          array_ops.ones_like(
+                                              processed_col_indices,
+                                              dtype=dtypes.bool))
 
       update_processed_op = control_flow_ops.cond(
           self._is_row_sweep_var, get_row_update_op, get_col_update_op)
@@ -166,6 +158,7 @@ class _SweepHook(session_run_hook.SessionRunHook):
       # If this is the case, flip the is_row_sweep_var and reset processed_rows
       # and processed_cols tensors.
       with ops.control_dependencies([update_processed_op]):
+
         def get_switch_op():
           return state_ops.assign(
               self._is_row_sweep_var,
@@ -182,13 +175,17 @@ class _SweepHook(session_run_hook.SessionRunHook):
             lambda: math_ops.reduce_all(processed_cols),
             name="sweep_hook_is_sweep_done")
         switch_op = control_flow_ops.cond(
-            is_sweep_done, get_switch_op, control_flow_ops.no_op,
+            is_sweep_done,
+            get_switch_op,
+            control_flow_ops.no_op,
             name="sweep_hook_switch_op")
         reset_op = control_flow_ops.cond(
-            is_sweep_done, get_reset_op, control_flow_ops.no_op,
+            is_sweep_done,
+            get_reset_op,
+            control_flow_ops.no_op,
             name="sweep_hook_reset_op")
-        switch_ops = control_flow_ops.group(switch_op, reset_op,
-                                            name="sweep_hook_switch_ops")
+        switch_ops = control_flow_ops.group(
+            switch_op, reset_op, name="sweep_hook_switch_ops")
 
         with ops.control_dependencies([switch_ops]):
           # Op to increment the completed_sweeps counter.
@@ -208,7 +205,8 @@ class _SweepHook(session_run_hook.SessionRunHook):
                 name="global_step_incr")
 
           incr_ops = control_flow_ops.group(
-              completed_sweeps_incr_op, global_step_incr_op,
+              completed_sweeps_incr_op,
+              global_step_incr_op,
               name="counter_incr_ops")
 
     return [is_sweep_done, switch_ops, incr_ops]
@@ -260,7 +258,7 @@ class _StopAtSweepHook(session_run_hook.SessionRunHook):
   def begin(self):
     try:
       self._completed_sweeps_var = ops.get_default_graph().get_tensor_by_name(
-          WALSMatrixFactorization.COMPLETED_SWEEPS+":0")
+          WALSMatrixFactorization.COMPLETED_SWEEPS + ":0")
     except KeyError:
       raise RuntimeError(WALSMatrixFactorization.COMPLETED_SWEEPS +
                          " counter should be created to use StopAtSweepHook.")
@@ -288,12 +286,10 @@ def _wals_factorization_model_function(features, labels, mode, params):
     A ModelFnOps object.
   """
   assert labels is None
-  use_factors_weights_cache = (
-      params["use_factors_weights_cache_for_training"]
-      and mode == model_fn.ModeKeys.TRAIN)
-  use_gramian_cache = (
-      params["use_gramian_cache_for_training"]
-      and mode == model_fn.ModeKeys.TRAIN)
+  use_factors_weights_cache = (params["use_factors_weights_cache_for_training"]
+                               and mode == model_fn.ModeKeys.TRAIN)
+  use_gramian_cache = (params["use_gramian_cache_for_training"] and
+                       mode == model_fn.ModeKeys.TRAIN)
   max_sweeps = params["max_sweeps"]
   model = factorization_ops.WALSModel(
       params["num_rows"],
@@ -348,15 +344,22 @@ def _wals_factorization_model_function(features, labels, mode, params):
 
   def update_row_factors():
     return model.update_row_factors(sp_input=input_rows, transpose_input=False)
+
   def update_col_factors():
     return model.update_col_factors(sp_input=input_cols, transpose_input=True)
-  _, train_op, loss = control_flow_ops.cond(
-      is_row_sweep, update_row_factors, update_col_factors)
 
-  row_prep_ops = [model.row_update_prep_gramian_op,
-                  model.initialize_row_update_op]
-  col_prep_ops = [model.col_update_prep_gramian_op,
-                  model.initialize_col_update_op]
+  (_, train_op,
+   unregularized_loss, regularization, sum_weights) = control_flow_ops.cond(
+       is_row_sweep, update_row_factors, update_col_factors)
+  loss = unregularized_loss + regularization
+  root_weighted_squared_error = math_ops.sqrt(unregularized_loss / sum_weights)
+
+  row_prep_ops = [
+      model.row_update_prep_gramian_op, model.initialize_row_update_op
+  ]
+  col_prep_ops = [
+      model.col_update_prep_gramian_op, model.initialize_col_update_op
+  ]
   cache_init_ops = [model.worker_init]
 
   sweep_hook = _SweepHook(
@@ -369,13 +372,15 @@ def _wals_factorization_model_function(features, labels, mode, params):
       row_prep_ops,
       col_prep_ops,
       cache_init_ops,
-      completed_sweeps_var,
-  )
+      completed_sweeps_var,)
   training_hooks = [sweep_hook]
   if max_sweeps is not None:
     training_hooks.append(_StopAtSweepHook(max_sweeps))
 
-  summary.scalar("loss", loss)
+  # The root weighted squared error =
+  #   \sqrt( \sum_{i,j} w_ij * (a_ij - r_ij)^2 / \sum_{i,j} w_ij )
+  summary.scalar("loss", loss)  # the estimated total training loss
+  summary.scalar("root_weighted_squared_error", root_weighted_squared_error)
   summary.scalar("completed_sweeps", completed_sweeps_var)
 
   # Prediction ops (only return predictions in INFER mode)
@@ -384,11 +389,13 @@ def _wals_factorization_model_function(features, labels, mode, params):
     project_row = features[WALSMatrixFactorization.PROJECT_ROW]
     projection_weights = features.get(
         WALSMatrixFactorization.PROJECTION_WEIGHTS)
+
     def get_row_projection():
       return model.project_row_factors(
           sp_input=input_rows,
           projection_weights=projection_weights,
           transpose_input=False)
+
     def get_col_projection():
       return model.project_col_factors(
           sp_input=input_cols,
@@ -396,8 +403,8 @@ def _wals_factorization_model_function(features, labels, mode, params):
           transpose_input=True)
 
     predictions[WALSMatrixFactorization.PROJECTION_RESULT] = (
-        control_flow_ops.cond(
-            project_row, get_row_projection, get_col_projection))
+        control_flow_ops.cond(project_row, get_row_projection,
+                              get_col_projection))
 
   return model_fn.ModelFnOps(
       mode=mode,
@@ -556,26 +563,41 @@ class WALSMatrixFactorization(estimator.Estimator):
     # TODO(walidk): Provide input pipelines that handle missing rows.
 
     params = {
-        "num_rows": num_rows,
-        "num_cols": num_cols,
-        "embedding_dimension": embedding_dimension,
-        "unobserved_weight": unobserved_weight,
-        "regularization_coeff": regularization_coeff,
-        "row_init": row_init,
-        "col_init": col_init,
-        "num_row_shards": num_row_shards,
-        "num_col_shards": num_col_shards,
-        "row_weights": row_weights,
-        "col_weights": col_weights,
-        "max_sweeps": max_sweeps,
+        "num_rows":
+            num_rows,
+        "num_cols":
+            num_cols,
+        "embedding_dimension":
+            embedding_dimension,
+        "unobserved_weight":
+            unobserved_weight,
+        "regularization_coeff":
+            regularization_coeff,
+        "row_init":
+            row_init,
+        "col_init":
+            col_init,
+        "num_row_shards":
+            num_row_shards,
+        "num_col_shards":
+            num_col_shards,
+        "row_weights":
+            row_weights,
+        "col_weights":
+            col_weights,
+        "max_sweeps":
+            max_sweeps,
         "use_factors_weights_cache_for_training":
             use_factors_weights_cache_for_training,
-        "use_gramian_cache_for_training": use_gramian_cache_for_training
+        "use_gramian_cache_for_training":
+            use_gramian_cache_for_training
     }
-    self._row_factors_names = ["row_factors_shard_%d" % i
-                               for i in range(num_row_shards)]
-    self._col_factors_names = ["col_factors_shard_%d" % i
-                               for i in range(num_col_shards)]
+    self._row_factors_names = [
+        "row_factors_shard_%d" % i for i in range(num_row_shards)
+    ]
+    self._col_factors_names = [
+        "col_factors_shard_%d" % i for i in range(num_col_shards)
+    ]
 
     super(WALSMatrixFactorization, self).__init__(
         model_fn=_wals_factorization_model_function,
diff --git a/tensorflow/contrib/ffmpeg/BUILD b/tensorflow/contrib/ffmpeg/BUILD
index 31a286939b66673d648a03b480e4fc8dfb57d80a..e495ab4880356181ff38c5ea1846b54fe5273ff4 100644
--- a/tensorflow/contrib/ffmpeg/BUILD
+++ b/tensorflow/contrib/ffmpeg/BUILD
@@ -124,10 +124,9 @@ py_library(
         ":decode_audio_op_py",
         ":encode_audio_op_py",
         "//tensorflow/contrib/util:util_py",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:platform",
+        "//tensorflow/python:util",
     ],
 )
 
diff --git a/tensorflow/contrib/ffmpeg/default/BUILD b/tensorflow/contrib/ffmpeg/default/BUILD
index bf8419c04a692bf468d62f0891af476ffb50c608..917488080e7081ac5c5f5ffc9dd354982ceee860 100644
--- a/tensorflow/contrib/ffmpeg/default/BUILD
+++ b/tensorflow/contrib/ffmpeg/default/BUILD
@@ -17,7 +17,7 @@ cc_library(
     ],
     deps = [
         "//tensorflow/core:framework_headers_lib",
-        "@protobuf//:protobuf_headers",
+        "@protobuf_archive//:protobuf_headers",
     ],
 )
 
diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index 61fe729fd754b6b51a8e4165abe7cfbac989b1f7..a953c04c1a9cd1fafcaa3bf06172e97c819657c8 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -178,7 +178,6 @@ py_test(
     deps = [
         ":framework_py",
         "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -198,7 +197,6 @@ py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -211,7 +209,6 @@ py_test(
         ":framework_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -223,9 +220,9 @@ py_test(
     deps = [
         ":framework_py",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
@@ -241,7 +238,6 @@ py_test(
         ":framework_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:platform",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -255,8 +251,8 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
     ],
@@ -308,6 +304,7 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":framework_py",
+        ":gen_checkpoint_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -316,6 +313,7 @@ py_test(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:partitioned_variables",
+        "//tensorflow/python:platform",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
diff --git a/tensorflow/contrib/framework/python/ops/ops.py b/tensorflow/contrib/framework/python/ops/ops.py
index 4fccc2ceac7cdcdb3ce6de86e9591d1f7eb53212..ac451a974d50e540524293dcf6a6c14e0016974e 100644
--- a/tensorflow/contrib/framework/python/ops/ops.py
+++ b/tensorflow/contrib/framework/python/ops/ops.py
@@ -68,6 +68,6 @@ def get_name_scope():
     would print the string `scope1/scope2`.
 
   Returns:
-    A string represnting the current name scope.
+    A string representing the current name scope.
   """
   return ops.get_default_graph().get_name_scope()
diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py
index f02a7c636068b651c60f58009ad056036473d655..411b4facdb1d9dcbac5c81753cdf868719e77119 100644
--- a/tensorflow/contrib/framework/python/ops/variables.py
+++ b/tensorflow/contrib/framework/python/ops/variables.py
@@ -416,7 +416,7 @@ def get_unique_variable(var_op_name):
   for candidate in candidates:
     if candidate.op.name == var_op_name:
       return candidate
-  raise ValueError('Variable %s does not uniquely identify a variable',
+  raise ValueError('Variable %s does not uniquely identify a variable' %
                    var_op_name)
 
 
@@ -444,7 +444,7 @@ def assign_from_values(var_names_to_values):
     var_value = var_names_to_values[var_name]
     var = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES, var_name)
     if not var:
-      raise ValueError('Variable %s wasnt found', var_name)
+      raise ValueError('Variable %s wasn\'t found' % var_name)
     elif len(var) > 1:
       # tf.get_collection is just a filter on the prefix: find the exact match:
       found = False
@@ -455,7 +455,7 @@ def assign_from_values(var_names_to_values):
           break
 
       if not found:
-        raise ValueError('Variable %s doesnt uniquely identify a variable',
+        raise ValueError('Variable %s doesn\'t uniquely identify a variable' %
                          var_name)
     else:
       var = var[0]
@@ -525,7 +525,7 @@ def get_variable_full_name(var):
 # TODO(sguada): Update docs in slim/g3doc/index.md to describe
 # the new feature where the var_list dictionary can have values that
 # are each a list of Variables.
-def assign_from_checkpoint(model_path, var_list):
+def assign_from_checkpoint(model_path, var_list, ignore_missing_vars=False):
   """Creates an operation to assign specific variables from a checkpoint.
 
   Args:
@@ -538,13 +538,15 @@ def assign_from_checkpoint(model_path, var_list):
         name in the checkpoint must be the full variable, not the
         name of the partitioned variable, eg. "my_var" rather than
         "my_var/part_4". If empty, returns no_op(), {}.
+    ignore_missing_vars: Boolean, if True ignore variables missing in the
+        checkpoint with a warning instead of failing.
 
   Returns:
     the restore_op and the feed_dict that need to be run to restore var_list.
 
   Raises:
-    ValueError: If the checkpoint specified at `model_path` is missing one of
-      the variables in `var_list`.
+    ValueError: If `ignore_missing_vars` is False and the checkpoint specified
+        at `model_path` is missing one of the variables in `var_list`.
   """
   # Normalize var_list into a dictionary mapping names in the
   # checkpoint to the list of variables to initialize from that
@@ -572,8 +574,12 @@ def assign_from_checkpoint(model_path, var_list):
   assign_ops = []
   for ckpt_name in grouped_vars:
     if not reader.has_tensor(ckpt_name):
-      raise ValueError(
-          'Checkpoint is missing variable [%s]' % ckpt_name)
+      log_str = 'Checkpoint is missing variable [%s]' % ckpt_name
+      if ignore_missing_vars:
+        logging.warning(log_str)
+        continue
+      else:
+        raise ValueError(log_str)
     ckpt_value = reader.get_tensor(ckpt_name)
 
     for var in grouped_vars[ckpt_name]:
diff --git a/tensorflow/contrib/fused_conv/BUILD b/tensorflow/contrib/fused_conv/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..5a9eeea70e08a80d9419203e0c3117aced850dee
--- /dev/null
+++ b/tensorflow/contrib/fused_conv/BUILD
@@ -0,0 +1,170 @@
+# Description:
+#   A Fused Conv Bias Activation operator wrapper.
+#   APIs are meant to change over time.
+package(
+    default_visibility = ["//visibility:private"],
+    features = ["-parse_headers"],
+)
+
+package_group(
+    name = "friends",
+    packages = [
+        "//tensorflow/...",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+
+tf_custom_op_py_library(
+    name = "fused_conv_py",
+    srcs = ["__init__.py"] + glob(["python/ops/*.py"]),
+    dso = [":python/ops/_fused_conv2d_bias_activation_op.so"],
+    kernels = [
+        ":fused_conv2d_bias_activation_op_kernels",
+        ":fused_conv2d_bias_activation_op_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":fused_conv2d_bias_activation_op",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
+tf_kernel_library(
+    name = "fused_conv2d_bias_activation_op_kernels",
+    srcs = [
+        "kernels/fused_conv2d_bias_activation_op.cc",
+        "kernels/fused_conv2d_bias_activation_op.h",
+    ],
+    prefix = "fused_conv2d_bias_activation_op",
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core/kernels:bounds_check_lib",
+        "//tensorflow/core/kernels:conv_2d_hdrs",
+        "//tensorflow/core/kernels:conv_ops_gpu_hdrs",
+        "//tensorflow/core/kernels:gpu_util_hdrs",
+        "//tensorflow/core/kernels:ops_util_hdrs",
+        "//third_party/eigen3",
+    ],
+    alwayslink = 1,
+)
+
+tf_custom_op_library(
+    name = "python/ops/_fused_conv2d_bias_activation_op.so",
+    srcs = [
+        "kernels/fused_conv2d_bias_activation_op.cc",
+        "kernels/fused_conv2d_bias_activation_op.h",
+        "ops/fused_conv2d_bias_activation_op.cc",
+    ],
+    deps = [
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core/kernels:bounds_check_lib",
+        "//tensorflow/core/kernels:conv_2d_hdrs",
+        "//tensorflow/core/kernels:conv_ops_gpu_hdrs",
+        "//tensorflow/core/kernels:gpu_util_hdrs",
+        "//tensorflow/core/kernels:ops_util_hdrs",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = [
+        "fused_conv2d_bias_activation_op",
+    ],
+    deps = [
+        "//tensorflow/core:lib_proto_parsing",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "fused_conv2d_bias_activation_op",
+    deps = [":fused_conv2d_bias_activation_op_op_lib"],
+)
+
+cuda_py_test(
+    name = "fused_conv2d_bias_activation_op_test",
+    size = "small",
+    srcs = ["python/ops/fused_conv2d_bias_activation_op_test.py"],
+    additional_deps = [
+        ":fused_conv_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+    ],
+    tags = [
+        "manual",
+        "requires_cudnn6",
+    ],
+)
+
+cuda_py_test(
+    name = "fused_conv2d_bias_activation_benchmark",
+    size = "large",
+    srcs = ["python/ops/fused_conv2d_bias_activation_benchmark.py"],
+    additional_deps = [
+        ":fused_conv_py",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_benchmark",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    main = "python/ops/fused_conv2d_bias_activation_benchmark.py",
+    tags = [
+        "manual",
+        "requires_cudnn6",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/fused_conv/__init__.py b/tensorflow/contrib/fused_conv/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd4d3fc707d002e55a5531f524fd4c9f771d3024
--- /dev/null
+++ b/tensorflow/contrib/fused_conv/__init__.py
@@ -0,0 +1,25 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops and modules related to fused_conv2d_bias_activation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import
+from tensorflow.contrib.fused_conv.python.ops.fused_conv2d_bias_activation_op import *
+from tensorflow.python.util.all_util import remove_undocumented
+
+remove_undocumented(__name__, ['fused_conv2d_bias_activation'])
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d553d5a0a657b1f5b50a7fc9bdf6b34ca24a325e
--- /dev/null
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -0,0 +1,497 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
+
+#include "tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h"
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+#include "tensorflow/core/util/use_cudnn.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/kernels/conv_ops_gpu.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/util/activation_mode.h"
+#endif  // GOOGLE_CUDA
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+struct LaunchConvOp;
+
+template <typename Device, typename T>
+class FusedConv2DBiasActivationOp : public OpKernel {
+ public:
+  explicit FusedConv2DBiasActivationOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string data_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES(context,
+                (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW),
+                errors::InvalidArgument("Current implementation only supports "
+                                        "NHWC and NCHW data formats."));
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+    OP_REQUIRES(context, strides_.size() == 4,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(strides_, data_format_, 'N') == 1 &&
+         GetTensorDim(strides_, data_format_, 'C') == 1),
+        errors::InvalidArgument("Current implementation does not yet support "
+                                "strides in the batch and depth dimensions."));
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    string activation_mode_str;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("activation_mode", &activation_mode_str));
+    OP_REQUIRES_OK(context, GetActivationModeFromString(activation_mode_str,
+                                                        &activation_mode_));
+    OP_REQUIRES(context, activation_mode_ == ActivationMode::RELU,
+                errors::InvalidArgument("Current implementation only supports "
+                                        "relu as the activation mode."));
+    cudnn_use_autotune_ = CudnnUseAutotune();
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // Input tensor is one of the following shapes:
+    // [ batch, in_rows, in_cols, in_depth ] (for NHWC data format)
+    // [ batch, in_depth, in_rows, in_cols ] (for NCHW data format)
+    const Tensor& input = context->input(0);
+
+    // Input filter is of the following dimensions:
+    // [ filter_rows, filter_cols, in_depth, out_depth ]
+    const Tensor& filter = context->input(1);
+
+    // Input bias is a 1-D tensor the size of the last
+    // dimension of Output tensor
+    const Tensor& bias = context->input(2);
+
+    // For 2D convolution, there should be 4 dimensions.
+    OP_REQUIRES(context, input.dims() == 4,
+                errors::InvalidArgument("input must be 4-dimensional",
+                                        input.shape().DebugString()));
+    OP_REQUIRES(context, filter.dims() == 4,
+                errors::InvalidArgument("filter must be 4-dimensional: ",
+                                        filter.shape().DebugString()));
+
+    // Bias should be a 1-D tensor.
+    OP_REQUIRES(context, bias.dims() == 1,
+                errors::InvalidArgument("bias must be 1-dimensional: ",
+                                        bias.shape().DebugString()));
+
+    for (int i = 0; i < 4; i++) {
+      OP_REQUIRES(context,
+                  FastBoundsCheck(filter.dim_size(i),
+                                  std::numeric_limits<int32>::max()),
+                  errors::InvalidArgument("filter dimension too large"));
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(input.dim_size(i), std::numeric_limits<int32>::max()),
+          errors::InvalidArgument("input dimension too large"));
+    }
+
+    // The last dimension for input is in_depth. It must be the same as the
+    // filter's in_depth.
+    const int64 in_depth = GetTensorDim(input, data_format_, 'C');
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
+
+    // The last dimension for filter is out_depth.
+    const int32 out_depth = static_cast<int32>(filter.dim_size(3));
+
+    // The second dimension for input is rows/height.
+    // The first dimension for filter is rows/height.
+    const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H');
+    const int32 input_rows = static_cast<int32>(input_rows_raw);
+    const int32 filter_rows = static_cast<int32>(filter.dim_size(0));
+
+    // The third dimension for input is columns/width.
+    // The second dimension for filter is columns/width.
+    const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W');
+    const int32 input_cols = static_cast<int32>(input_cols_raw);
+    const int32 filter_cols = static_cast<int32>(filter.dim_size(1));
+
+    // The first dimension for input is batch.
+    const int64 batch_raw = GetTensorDim(input, data_format_, 'N');
+    const int32 batch = static_cast<int32>(batch_raw);
+
+    // For now we take the stride from the second and third dimensions only (we
+    // do not support striding on the batch or depth dimension).
+    const int32 stride_rows =
+        static_cast<int32>(GetTensorDim(strides_, data_format_, 'H'));
+    const int32 stride_cols =
+        static_cast<int32>(GetTensorDim(strides_, data_format_, 'W'));
+    const int32 bias_size = static_cast<int32>(bias.dim_size(0));
+
+    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(input_rows, filter_rows, stride_rows,
+                                         padding_, &out_rows, &pad_rows));
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(input_cols, filter_cols, stride_cols,
+                                         padding_, &out_cols, &pad_cols));
+    // Output tensor is of the following dimensions:
+    // [ in_batch, out_rows, out_cols, out_depth ]
+    TensorShape out_shape =
+        ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+
+    // Bias size should be the same as the size of the channel dimension of
+    // output.
+    OP_REQUIRES(context, bias_size == out_depth,
+                errors::InvalidArgument(
+                    "bias size should equal the channel "
+                    "dimension size of output. bias shape: ",
+                    bias.shape().DebugString() +
+                        ", output shape: " + output->shape().DebugString()));
+
+    VLOG(2) << "FusedConv2DBiasActivation: in_depth = " << in_depth
+            << ", input_cols = " << input_cols
+            << ", filter_cols = " << filter_cols
+            << ", input_rows = " << input_rows
+            << ", filter_rows = " << filter_rows
+            << ", stride_rows = " << stride_rows
+            << ", stride_cols = " << stride_cols
+            << ", bias_size = " << bias_size << ", out_depth = " << out_depth;
+
+    // If there is nothing to compute, return.
+    if (out_shape.num_elements() == 0) {
+      return;
+    }
+    launcher_.launch(context, cudnn_use_autotune_, input, filter, stride_rows,
+                     stride_cols, bias, activation_mode_,
+                     BrainPadding2EigenPadding(padding_), data_format_, output);
+  }
+
+ private:
+  std::vector<int32> strides_;
+  Padding padding_;
+  ActivationMode activation_mode_;
+  TensorFormat data_format_;
+  LaunchFusedConv2DBiasActivationOp<Device, T> launcher_;
+  bool cudnn_use_autotune_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(FusedConv2DBiasActivationOp);
+};
+
+#if GOOGLE_CUDA
+namespace dnn = ::perftools::gputools::dnn;
+
+dnn::ActivationMode BrainActivationMode2CudnnActivationMode(
+    ActivationMode activation_mode) {
+  switch (activation_mode) {
+    case ActivationMode::SIGMOID:
+      return dnn::ActivationMode::kSigmoid;
+    case ActivationMode::RELU:
+      return dnn::ActivationMode::kRelu;
+    case ActivationMode::RELUX:
+      return dnn::ActivationMode::kReluX;
+    case ActivationMode::RELU6:
+      return dnn::ActivationMode::kRelu6;
+    case ActivationMode::TANH:
+      return dnn::ActivationMode::kTanh;
+    case ActivationMode::BANDPASS:
+      return dnn::ActivationMode::kBandPass;
+  }
+  // Prevent compiler warning about missing return
+  return dnn::ActivationMode::kRelu;
+}
+
+// A dummy type to group forward convolution autotune results together.
+struct ConvBiasActivationAutoTuneGroup {
+  static string name() { return "ConvBiasActivation"; }
+};
+typedef AutoTuneSingleton<ConvBiasActivationAutoTuneGroup, ConvParameters,
+                          perftools::gputools::dnn::AlgorithmConfig>
+    AutoTuneConvBiasActivation;
+
+template <typename T>
+void LaunchFusedConv2DBiasActivationOp<GPUDevice, T>::launch(
+    OpKernelContext* ctx, bool cudnn_use_autotune, const Tensor& input_param,
+    const Tensor& filter, int32 row_stride, int32 col_stride,
+    const Tensor& bias, const ActivationMode& activation_mode,
+    const Eigen::PaddingType& padding, TensorFormat data_format,
+    Tensor* output) {
+  using perftools::gputools::dnn::AlgorithmConfig;
+  using perftools::gputools::dnn::AlgorithmType;
+  using perftools::gputools::dnn::ProfileResult;
+  using perftools::gputools::dnn::kDefaultAlgorithm;
+  auto* stream = ctx->op_device_context()->stream();
+  OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
+
+  Tensor input = input_param;
+
+  perftools::gputools::dnn::ActivationMode cudnn_activation_mode =
+      BrainActivationMode2CudnnActivationMode(activation_mode);
+
+  // TODO(yangzihao): refactor all the complicated/duplicated code in regular
+  // conv ops to a shared conv utility.
+  int32 padding_rows = 0;
+  int32 padding_cols = 0;
+  const int64 in_batch = GetTensorDim(input, data_format, 'N');
+  int64 in_rows = GetTensorDim(input, data_format, 'H');
+  int64 in_cols = GetTensorDim(input, data_format, 'W');
+  const int64 in_depths = GetTensorDim(input, data_format, 'C');
+  const int64 out_batch = GetTensorDim(*output, data_format, 'N');
+  const int64 out_rows = GetTensorDim(*output, data_format, 'H');
+  const int64 out_cols = GetTensorDim(*output, data_format, 'W');
+  const int64 out_depths = GetTensorDim(*output, data_format, 'C');
+  const int64 patch_rows = filter.dim_size(0);
+  const int64 patch_cols = filter.dim_size(1);
+  if (padding == Eigen::PADDING_SAME) {
+    // Total padding on rows and cols is
+    // Pr = (R' - 1) * S + Kr - R
+    // Pc = (C' - 1) * S + Kc - C
+    // where (R', C') are output dimensions, (R, C) are input dimensions, S
+    // is stride, (Kr, Kc) are filter dimensions.
+    // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
+    // and Pc - Pc/2 on the bottom.  When Pr or Pc is odd, this means
+    // we pad more on the right and bottom than on the top and left.
+    padding_rows =
+        std::max<int32>(0, (out_rows - 1) * row_stride + patch_rows - in_rows);
+    padding_cols =
+        std::max<int32>(0, (out_cols - 1) * col_stride + patch_cols - in_cols);
+    const int rows_parity = padding_rows & 1;
+    const int cols_parity = padding_cols & 1;
+    if ((rows_parity | cols_parity) != 0) {
+      Tensor transformed_input;
+      int64 new_in_rows = in_rows + rows_parity;
+      int64 new_in_cols = in_cols + cols_parity;
+      OP_REQUIRES_OK(
+          ctx,
+          ctx->allocate_temp(DataTypeToEnum<T>::value,
+                             ShapeFromFormat(data_format, in_batch, new_in_rows,
+                                             new_in_cols, in_depths),
+                             &transformed_input));
+
+      functor::PadInput<GPUDevice, T, int, 4>()(
+          ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
+          {{0, 0}}, {{rows_parity, cols_parity}},
+          To32Bit(transformed_input.tensor<T, 4>()), data_format);
+
+      input = transformed_input;
+      in_rows = new_in_rows;
+      in_cols = new_in_cols;
+    }
+  }
+
+  if (data_format == FORMAT_NHWC) {
+    // Convert the input tensor from NHWC to NCHW.
+    TensorShape nchw_shape =
+        ShapeFromFormat(FORMAT_NCHW, in_batch, in_rows, in_cols, in_depths);
+    if (in_depths > 1) {
+      Tensor transformed_input;
+      OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
+                                             nchw_shape, &transformed_input));
+      functor::NHWCToNCHW<GPUDevice, T, 4>()(
+          ctx->eigen_device<GPUDevice>(),
+          const_cast<const Tensor&>(input).tensor<T, 4>(),
+          transformed_input.tensor<T, 4>());
+      input = transformed_input;
+    } else {
+      // If depth <= 1, then just reshape.
+      CHECK(input.CopyFrom(input, nchw_shape));
+    }
+  }
+
+  CHECK(padding_rows >= 0 && padding_cols >= 0)
+      << "Negative row or col paddings: (" << padding_rows << ", "
+      << padding_cols << ")";
+  perftools::gputools::dnn::BatchDescriptor input_desc;
+  input_desc.set_count(in_batch)
+      .set_feature_map_count(in_depths)
+      .set_height(in_rows)
+      .set_width(in_cols)
+      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+  perftools::gputools::dnn::BatchDescriptor output_desc;
+  output_desc.set_count(out_batch)
+      .set_height(out_rows)
+      .set_width(out_cols)
+      .set_feature_map_count(out_depths)
+      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+  perftools::gputools::dnn::FilterDescriptor filter_desc;
+  filter_desc.set_input_filter_height(filter.dim_size(0))
+      .set_input_filter_width(filter.dim_size(1))
+      .set_input_feature_map_count(filter.dim_size(2))
+      .set_output_feature_map_count(filter.dim_size(3));
+  perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+  conv_desc.set_vertical_filter_stride(row_stride)
+      .set_horizontal_filter_stride(col_stride)
+      .set_zero_padding_height(padding_rows / 2)
+      .set_zero_padding_width(padding_cols / 2);
+
+  // Shuffles a filter tensor from:
+  //   [<spatial_dims>, in, out]
+  // to:
+  //   [out, in, <spatial_dims>]
+  // TODO(yangzihao): Support a data layout tag for the filter weights, and only
+  // do the transform if the weights are not already in the correct layout.
+  Tensor transformed_filter;
+  OP_REQUIRES_OK(ctx, ctx->allocate_temp(
+                          DataTypeToEnum<T>::value,
+                          TensorShape({filter.dim_size(3), filter.dim_size(2),
+                                       filter.dim_size(0), filter.dim_size(1)}),
+                          &transformed_filter));
+
+  functor::TransformFilter<GPUDevice, T, int, 4>()(
+      ctx->eigen_device<GPUDevice>(), To32Bit(filter.tensor<T, 4>()),
+      To32Bit(transformed_filter.tensor<T, 4>()));
+
+  Tensor transformed_output;
+  OP_REQUIRES_OK(
+      ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
+                              ShapeFromFormat(FORMAT_NCHW, out_batch, out_rows,
+                                              out_cols, out_depths),
+                              &transformed_output));
+
+  auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
+                                  input.template flat<T>().size());
+  auto filter_ptr =
+      AsDeviceMemory(transformed_filter.template flat<T>().data(),
+                     transformed_filter.template flat<T>().size());
+  auto output_ptr =
+      AsDeviceMemory(transformed_output.template flat<T>().data(),
+                     transformed_output.template flat<T>().size());
+
+  auto bias_ptr = AsDeviceMemory(bias.template flat<T>().data(),
+                                 bias.template flat<T>().size());
+
+  static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
+      // default value is in bytes despite the name of the environment variable
+      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
+  );
+
+  int device_id = stream->parent()->device_ordinal();
+  DataType dtype = input.dtype();
+  ConvParameters conv_parameters = {
+      in_batch,
+      in_depths,
+      {{in_rows, in_cols}},
+      out_depths,
+      {{patch_rows, patch_cols}},
+      {{row_stride, col_stride}},
+      {{padding_rows, padding_cols}},
+      dtype,
+      device_id,
+  };
+
+  AlgorithmConfig algorithm_config;
+  if (cudnn_use_autotune && !AutoTuneConvBiasActivation::GetInstance()->Find(
+                                conv_parameters, &algorithm_config)) {
+    std::vector<AlgorithmType> algorithms;
+    CHECK(stream->parent()->GetConvolveAlgorithms(
+        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+    ProfileResult best_result;
+    ProfileResult best_result_no_scratch;
+    for (auto profile_algorithm : algorithms) {
+      // TODO(zhengxq): profile each algorithm multiple times to better
+      // accuracy.
+      CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+      ProfileResult profile_result;
+      bool cudnn_launch_status =
+          stream
+              ->ThenConvolveWithAlgorithm(
+                  input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
+                  bias_ptr, cudnn_activation_mode, output_desc, &output_ptr,
+                  &scratch_allocator, AlgorithmConfig(profile_algorithm),
+                  &profile_result)
+              .ok();
+      if (cudnn_launch_status) {
+        if (profile_result.is_valid()) {
+          if (profile_result.elapsed_time_in_ms() <
+              best_result.elapsed_time_in_ms()) {
+            best_result = profile_result;
+          }
+          if (scratch_allocator.TotalByteSize() == 0 &&
+              profile_result.elapsed_time_in_ms() <
+                  best_result_no_scratch.elapsed_time_in_ms()) {
+            best_result_no_scratch = profile_result;
+          }
+        }
+      }
+    }
+    OP_REQUIRES(
+        ctx,
+        best_result.is_valid() && best_result.algorithm() != kDefaultAlgorithm,
+        errors::NotFound("No algorithm worked!"));
+    OP_REQUIRES(ctx,
+                best_result_no_scratch.is_valid() &&
+                    best_result_no_scratch.algorithm() != kDefaultAlgorithm,
+                errors::NotFound("No algorithm without scratch worked!"));
+    algorithm_config.set_algorithm(best_result.algorithm());
+    algorithm_config.set_algorithm_no_scratch(
+        best_result_no_scratch.algorithm());
+    AutoTuneConvBiasActivation::GetInstance()->Insert(conv_parameters,
+                                                      algorithm_config);
+  }
+
+  CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+  bool cudnn_launch_status =
+      stream
+          ->ThenConvolveWithAlgorithm(
+              input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
+              bias_ptr, cudnn_activation_mode, output_desc, &output_ptr,
+              &scratch_allocator, algorithm_config,
+              /*output_profile_result=*/nullptr)
+          .ok();
+
+  if (!cudnn_launch_status) {
+    ctx->SetStatus(errors::Internal(
+        "cuDNN launch failure : input shape(", input.shape().DebugString(),
+        ") filter shape(", filter.shape().DebugString(), ")"));
+  }
+
+  // Convert the output tensor back from NCHW to NHWC.
+  if (data_format == FORMAT_NHWC) {
+    functor::NCHWToNHWC<GPUDevice, T, 4>()(
+        ctx->eigen_device<GPUDevice>(),
+        const_cast<const Tensor&>(transformed_output).tensor<T, 4>(),
+        output->tensor<T, 4>());
+  } else {
+    *output = transformed_output;
+  }
+}
+
+// Registration of the GPU implementations.
+REGISTER_KERNEL_BUILDER(Name("FusedConv2DBiasActivation")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<float>("T"),
+                        FusedConv2DBiasActivationOp<GPUDevice, float>);
+
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..d71b26cf1db4bd79f238d66417c437288bf50ad8
--- /dev/null
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
@@ -0,0 +1,62 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRDPARTY_TENSORFLOW_CONTRIB_KERNELS_FUSED_CONV2D_BIAS_ACTIVATION_OP_H_
+#define THIRDPARTY_TENSORFLOW_CONTRIB_KERNELS_FUSED_CONV2D_BIAS_ACTIVATION_OP_H_
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/util/activation_mode.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+#if GOOGLE_CUDA
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/conv_ops_gpu.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#endif  // GOOGLE_CUDA
+
+namespace tensorflow {
+
+// Forward declaration.
+class OpKernelContext;
+
+template <typename Device, typename T>
+class LaunchFusedConv2DBiasActivationOp {
+ public:
+  void launch(OpKernelContext* ctx, bool cudnn_use_autotune,
+              const Tensor& input, const Tensor& filter, int row_stride,
+              int col_stride, const Tensor& bias,
+              const ActivationMode& activation_mode,
+              const Eigen::PaddingType& padding, TensorFormat data_format,
+              Tensor* output);
+};
+
+#ifdef GOOGLE_CUDA
+template <typename T>
+class LaunchFusedConv2DBiasActivationOp<Eigen::GpuDevice, T> {
+ public:
+  void launch(OpKernelContext* ctx, bool cudnn_use_autotune,
+              const Tensor& input, const Tensor& filter, int32 row_stride,
+              int32 col_stride, const Tensor& bias,
+              const ActivationMode& activation_mode,
+              const Eigen::PaddingType& padding, TensorFormat data_format,
+              Tensor* output);
+};
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6134c5c699dad7e0464495feb49d6519a333e576
--- /dev/null
+++ b/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
@@ -0,0 +1,72 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/activation_mode.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+
+namespace {
+// Return the string containing the list of valid activation modes, that can be
+// used as an Attr() in REGISTER_OP.
+string GetAllActivationModeAttrString() { return "activation_mode: {'Relu'}"; }
+
+}  // namespace
+
+// --------------------------------------------------------------------------
+REGISTER_OP("FusedConv2DBiasActivation")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("bias: T")
+    .Output("output: T")
+    .Attr("T: {float}")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr(GetAllActivationModeAttrString())
+    .SetShapeFn(shape_inference::FusedConvBiasActivationShape)
+    .Doc(R"doc(
+    Computes a fused 2-D convolution, adds bias, and applies an activation function
+    on the output given 4-D `input`, 4-D `filter`, 1-D `bias` tensors and an activation mode.
+
+    input: A 4-D tensor. The dimension order is interpreted according to the value
+        of `data_format`, see below for details.
+    filter: A 4-D tensor of shape
+        `[filter_height, filter_width, in_channels, out_channels]`
+    bias: 1-D with size of the `out_channels` dimension in filter.
+    output: A 4-D tensor. The dimension order is determined by the value of
+        `data_format`, see below for details.
+    T: The data type for the elements of input, filter, bias, and output Tensors.
+    strides: 1-D tensor of length 4.  The stride of the sliding window for each
+        dimension of `input`. The dimension order is determined by the value of
+        `data_format`, see below for details.
+    padding: The type of padding algorithm to use.
+    data_format: Specify the data format of the input and output data. With the
+        default format "NHWC", the data is stored in the order of:
+        [batch, height, width, channels].
+        Alternatively, the format could be "NCHW", the data storage order of:
+        [batch, channels, height, width].
+    activation_mode: Specify the activation function to apply to the output tensor
+        of bias add. Currently only supports "Relu".
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/python/constants.py b/tensorflow/contrib/fused_conv/python/__init__.py
similarity index 77%
rename from tensorflow/contrib/tensor_forest/python/constants.py
rename to tensorflow/contrib/fused_conv/python/__init__.py
index d222cbb6adc9074990ecee863a2f1ace301f6bf9..23d817cefbddb3cf8bac8453305b3e03f86a2e28 100644
--- a/tensorflow/contrib/tensor_forest/python/constants.py
+++ b/tensorflow/contrib/fused_conv/python/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,10 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Constants used by tensorforest.  Some of these map to values in C++ ops."""
+"""ops module."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
-# If tree[i][0] equals this value, then i is a leaf node.
-LEAF_NODE = -1
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a65d4bc50ff796977e8ea7f652b7cbe3fe37f673
--- /dev/null
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py
@@ -0,0 +1,243 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmark for fused conv2d bias and activation op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import time
+
+from tensorflow.contrib.fused_conv.python.ops import fused_conv2d_bias_activation_op
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def build_conv_bias_relu_graph(device, input_shape, filter_shape, strides,
+                               padding, num_iters, data_format):
+  """builds a graph containing a sequence of conv2d operations.
+
+  Args:
+    device: String, the device to run on.
+    input_shape: Shape of the input tensor.
+    filter_shape: Shape of the filter tensor.
+    strides: A list of ints. 1-D of length 4. The stride of sliding
+             window for each dimension of input.
+    padding: A string from: "SAME", "VALID". The type of padding
+             algorithm to use.
+    num_iters: number of iterations to run conv2d.
+    data_format: data format string of input, 'NHWC' and 'NCHW' are
+    supported.
+
+  Returns:
+    An array of tensors to run()
+  """
+  if data_format == "NCHW":
+    input_shape = [
+        input_shape[0], input_shape[3], input_shape[1], input_shape[2]
+    ]
+  with ops.device("/%s:0" % device):
+    inp = variables.Variable(random_ops.truncated_normal(input_shape))
+    filt = variables.Variable(random_ops.truncated_normal(filter_shape))
+    bias_shape = [filter_shape[-1]]
+    bias = variables.Variable(random_ops.truncated_normal(bias_shape))
+
+    outputs = []
+    conv2d_out = nn_ops.conv2d(
+        inp, filt, strides, padding, data_format=data_format)
+    bias_out = nn_ops.bias_add(conv2d_out, bias, data_format=data_format)
+    relu_out = nn_ops.relu(bias_out)
+    outputs.append(relu_out)
+    for _ in range(1, num_iters):
+      with ops.control_dependencies([relu_out]):
+        conv2d_out = nn_ops.conv2d(
+            inp, filt, strides, padding, data_format=data_format)
+        bias_out = nn_ops.bias_add(conv2d_out, bias, data_format=data_format)
+        relu_out = nn_ops.relu(bias_out)
+        outputs.append(relu_out)
+    return control_flow_ops.group(*outputs)
+
+
+def build_fused_conv_bias_relu_graph(device, input_shape, filter_shape, strides,
+                                     padding, num_iters, data_format):
+  """builds a graph containing a sequence of conv2d operations.
+
+  Args:
+    device: String, the device to run on.
+    input_shape: Shape of the input tensor.
+    filter_shape: Shape of the filter tensor.
+    strides: A list of ints. 1-D of length 4. The stride of sliding
+             window for each dimension of input.
+    padding: A string from: "SAME", "VALID". The type of padding
+             algorithm to use.
+    num_iters: number of iterations to run conv2d.
+    data_format: data format string of input, 'NHWC' and 'NCHW' are
+    supported.
+
+  Returns:
+    An array of tensors to run()
+  """
+  if data_format == "NCHW":
+    input_shape = [
+        input_shape[0], input_shape[3], input_shape[1], input_shape[2]
+    ]
+  with ops.device("/%s:0" % device):
+    inp = variables.Variable(random_ops.truncated_normal(input_shape))
+    filt = variables.Variable(random_ops.truncated_normal(filter_shape))
+    bias_shape = [filter_shape[-1]]
+    bias = variables.Variable(random_ops.truncated_normal(bias_shape))
+
+    outputs = []
+    fused_out = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+        inp,
+        filt,
+        bias,
+        strides,
+        padding,
+        data_format=data_format,
+        activation_mode="Relu")
+    outputs.append(fused_out)
+    for _ in range(1, num_iters):
+      with ops.control_dependencies([fused_out]):
+        # pylint: disable=g-line-too-long
+        fused_out = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+            inp,
+            filt,
+            bias,
+            strides,
+            padding,
+            data_format=data_format,
+            activation_mode="Relu")
+        outputs.append(fused_out)
+    return control_flow_ops.group(*outputs)
+
+
+class FusedConv2DBiasActivationBenchmark(test.Benchmark):
+  """Benchmark conv2d!"""
+
+  def _run_graph(self, device, input_shape, filter_shape, strides, padding,
+                 num_iters, data_format):
+    """runs the graph and print its execution time.
+
+    Args:
+      device: String, the device to run on.
+      input_shape: Shape of the input tensor.
+      filter_shape: Shape of the filter tensor.
+      strides: A list of ints. 1-D of length 4. The stride of sliding
+               window for each dimension of input.
+      padding: A string from: "SAME", "VALID". The type of padding
+               algorithm to use.  num_iters: Number of iterations to run the
+                 benchmark.
+      num_iters: number of iterations to run conv2d.
+      data_format: data format string of input, 'NHWC' and 'NCHW' are
+      supported.
+
+    Returns:
+      The duration of the run in seconds.
+    """
+    graph = ops.Graph()
+    with graph.as_default():
+      outputs = build_fused_conv_bias_relu_graph(device, input_shape,
+                                                 filter_shape, strides, padding,
+                                                 num_iters, data_format)
+      with session_lib.Session(graph=graph) as session:
+        variables.global_variables_initializer().run()
+        # warmup runs
+        session.run(outputs)
+
+        start_time = time.time()
+        session.run(outputs)
+        duration = (time.time() - start_time) / num_iters
+
+        print("%s inputshape:%s filtershape:%s strides:%s padding:%s "
+              "%d iters: %.8f sec" %
+              (device, str(input_shape).replace(" ", ""),
+               str(filter_shape).replace(" ", ""),
+               str(strides).replace(" ", ""), padding, num_iters, duration))
+    name_template = (
+        "conv2d_{device}_input_shape_{inputshape}_filter_shape_{filtershape}_"
+        "strides_{strides}_padding_{padding}")
+
+    self.report_benchmark(
+        name=name_template.format(
+            device=device,
+            inputshape=str(input_shape).replace(" ", ""),
+            filtershape=str(filter_shape).replace(" ", ""),
+            strides=str(strides).replace(" ", ""),
+            padding=padding).replace(" ", ""),
+        iters=num_iters,
+        wall_time=duration)
+
+    return duration
+
+  def benchmark_fused_conv2d_bias_activation(self):
+
+    stride = [1, 1, 1, 1]
+    paddings = ["VALID", "SAME"]
+    data_formats = ["NHWC", "NCHW"]
+
+    resnet50_input_shapes = [[64, 14, 14, 256], [64, 14, 14, 256], [
+        64, 14, 14, 1024
+    ], [64, 55, 55, 64], [64, 28, 28, 128], [64, 28, 28, 128], [64, 55, 55, 64],
+                             [64, 7, 7, 512], [64, 7, 7, 512],
+                             [64, 28, 28, 512], [64, 55, 55,
+                                                 256], [64, 7, 7, 2048]]
+
+    resnet50_filter_shapes = [[1, 1, 256, 1024], [3, 3, 256, 256], [
+        1, 1, 1024, 256
+    ], [1, 1, 64, 256], [1, 1, 128, 512], [3, 3, 128, 128], [3, 3, 64, 64], [
+        3, 3, 512, 512
+    ], [1, 1, 512, 2048], [1, 1, 512, 128], [1, 1, 256, 64], [1, 1, 2048, 512]]
+
+    inception3_input_shapes = [[64, 17, 17, 768], [64, 35, 35, 96], [
+        64, 35, 35, 288
+    ], [64, 8, 8, 384], [64, 8, 8, 384], [64, 17, 17, 192], [64, 35, 35, 64], [
+        64, 17, 17, 192
+    ], [64, 17, 17, 160], [64, 17, 17, 160], [64, 17, 17, 768], [
+        64, 35, 35, 256
+    ], [64, 35, 35, 48], [64, 35, 35, 192], [64, 17, 17, 128], [
+        64, 17, 17, 160
+    ], [64, 8, 8, 448], [64, 17, 17, 128], [64, 17, 17, 768], [64, 17, 17, 160]]
+    inception3_filter_shapes = [[1, 1, 768, 192], [3, 3, 96, 96], [
+        1, 1, 288, 64
+    ], [1, 3, 384, 384], [3, 1, 384, 384], [7, 1, 192, 192], [3, 3, 64, 96], [
+        1, 7, 192, 192
+    ], [7, 1, 160, 160], [1, 7, 160, 160], [1, 1, 768, 160], [1, 1, 256, 64], [
+        5, 5, 48, 64
+    ], [1, 1, 192, 64], [1, 7, 128, 128], [1, 7, 160, 192], [3, 3, 448, 384],
+                                [7, 1, 128, 128], [1, 1, 768,
+                                                   128], [7, 1, 160, 192]]
+
+    print("fused conv2d bias activation benchmark using resnet50's shapes:")
+    for ishape, fshape in zip(resnet50_input_shapes, resnet50_filter_shapes):
+      for padding in paddings:
+        for data_format in data_formats:
+          self._run_graph("gpu", ishape, fshape, stride, padding, 80,
+                          data_format)
+    print("fused conv2d bias activation benchmark using inception3's shapes:")
+    for ishape, fshape in zip(inception3_input_shapes,
+                              inception3_filter_shapes):
+      for padding in paddings:
+        for data_format in data_formats:
+          self._run_graph("gpu", ishape, fshape, stride, padding, 80,
+                          data_format)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..41fd114f0fe65868a8496c018178be2cdaa36453
--- /dev/null
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
@@ -0,0 +1,87 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or  implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Tensorflow op performing fused conv2d bias_add and relu."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.fused_conv.ops import gen_fused_conv2d_bias_activation_op
+from tensorflow.contrib.util import loader
+from tensorflow.python.platform import resource_loader
+
+_fused_conv2d_bias_activation_op_so = loader.load_op_library(
+    resource_loader.get_path_to_datafile("_fused_conv2d_bias_activation_op.so"))
+
+
+def fused_conv2d_bias_activation(input_tensor,
+                                 filter_tensor,
+                                 bias,
+                                 strides,
+                                 padding,
+                                 activation_mode,
+                                 data_format=None,
+                                 name=None):
+  """Computes a fused 2-D convolution, adds bias, and applies relu.
+
+      input_tensor: A 4-D tensor. The dimension order is interpreted
+      according to the value of `data_format`, see below for details.
+      filter_tensor: A 4-D tensor of shape
+          `[filter_height, filter_width, in_channels, out_channels]`
+      bias: 1-D with size of the `out_channels` dimension in filter.
+      output: A 4-D tensor. The dimension order is determined by the value of
+          `data_format`, see below for details.
+      T: The data type for the elements of input, filter, bias, and output
+      Tensors.
+      strides: 1-D tensor of length 4.  The stride of the sliding window for
+      each
+          dimension of `input`. The dimension order is determined by the value
+          of
+          `data_format`, see below for details.
+      padding: The type of padding algorithm to use.
+      data_format: Specify the data format of the input and output data. With
+      the
+          default format "NHWC", the data is stored in the order of:
+          [batch, height, width, channels].
+          Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, channels, height, width].
+      activation_mode: Specify the activation function to apply to the output
+      tensor
+          of bias add. Currently only supports "Relu".
+
+  Args:
+    input_tensor: A `Tensor`. Must be one of the following types: `float32`.
+    filter_tensor: A `Tensor`. Must have the same type as `input`.
+    bias: A `Tensor`. Must have the same type as `input`.
+    strides: A list of `ints`.
+    padding: A `string` from: `"SAME", "VALID"`.
+    activation_mode: A `string` from: `"Sigmoid", "Relu", "Relu6", "ReluX",
+      "Tanh", "BandPass"`.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`. Defaults to
+      `"NHWC"`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  return gen_fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+      input=input_tensor,
+      filter=filter_tensor,
+      bias=bias,
+      strides=strides,
+      padding=padding,
+      activation_mode=activation_mode,
+      data_format=data_format,
+      name=name)
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d6a2fa3b83cc36b507947586c24fd2770ffb96a
--- /dev/null
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
@@ -0,0 +1,573 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for fused conv2d bias and activation operation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from tensorflow.contrib.fused_conv.python.ops import fused_conv2d_bias_activation_op
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def GetShrunkInceptionShapes(shrink=10):
+  """Iterator for smaller versions of convolution shapes in 2015 Inception.
+
+  Relative to inception, each depth value is `depth // shrink`.
+
+  Args:
+    shrink: Factor to shrink each depth value by relative to Inception.
+
+  Yields:
+    Tuple (input_size, filter_size, out_size, stride, padding), the convolution
+    parameters of Inception layers.
+  """
+  input_sizes = [[4, 5, 5, 1248], [4, 8, 8, 384], [4, 8, 8, 384], [
+      4, 8, 8, 2048
+  ], [4, 8, 8, 448], [4, 8, 8, 2048], [4, 8, 8, 2048], [4, 8, 8, 2048], [
+      4, 8, 8, 1760
+  ], [4, 8, 8, 1760], [4, 8, 8, 1760], [4, 8, 8, 1760], [4, 17, 17, 192], [
+      4, 17, 17, 192
+  ], [4, 17, 17, 1248], [4, 17, 17, 128], [4, 17, 17, 1248], [4, 17, 17, 224], [
+      4, 17, 17, 192
+  ], [4, 17, 17, 192], [4, 17, 17, 1216], [4, 17, 17, 1216], [4, 17, 17, 224], [
+      4, 17, 17, 192
+  ], [4, 17, 17, 192], [4, 17, 17, 1152], [4, 17, 17, 1152], [4, 17, 17, 192], [
+      4, 17, 17, 160
+  ], [4, 17, 17, 1152], [4, 17, 17, 1024], [4, 17, 17, 128], [4, 17, 17, 1024],
+                 [4, 17, 17, 128], [4, 17, 17, 1024], [4, 17, 17, 128], [
+                     4, 17, 17, 768
+                 ], [4, 17, 17, 128], [4, 17, 17, 128], [4, 17, 17, 768],
+                 [4, 17, 17, 768], [4, 35, 35, 96], [4, 35, 35, 288], [
+                     4, 35, 35, 64
+                 ], [4, 35, 35, 288], [4, 35, 35, 256], [4, 35, 35, 48], [
+                     4, 35, 35, 256
+                 ], [4, 35, 35, 96], [4, 35, 35, 192], [4, 35, 35, 192], [
+                     4, 35, 35, 192
+                 ], [4, 73, 73, 64], [4, 73, 73, 64], [4, 147, 147, 24]]
+  filter_sizes = [[1, 1, 1248, 128], [1, 3, 384, 384], [3, 1, 384, 384], [
+      1, 1, 2048, 192
+  ], [3, 3, 448, 384], [1, 1, 2048, 320], [1, 1, 2048, 448], [1, 1, 2048, 384],
+                  [1, 1, 1760, 384], [1, 1, 1760, 192], [1, 1, 1760, 448], [
+                      1, 1, 1760, 320
+                  ], [3, 3, 192, 192], [3, 3, 192, 192], [1, 1, 1248, 192], [
+                      3, 3, 128, 320
+                  ], [1, 1, 1248, 128], [1, 3, 224, 224], [3, 1, 192, 256], [
+                      1, 3, 192, 256
+                  ], [1, 1, 1216, 192], [1, 1, 1216, 96], [3, 1, 224, 224], [
+                      3, 3, 192, 224
+                  ], [1, 3, 192, 192], [1, 1, 1152, 192], [1, 1, 1152, 128], [
+                      3, 1, 192, 192
+                  ], [3, 3, 160, 192], [1, 1, 1152, 160], [1, 1, 1024, 128], [
+                      1, 3, 128, 192
+                  ], [1, 1, 1024, 160], [3, 1, 128, 192], [1, 1, 1024, 256], [
+                      3, 1, 128, 128
+                  ], [1, 1, 768, 192], [1, 3, 128, 128], [3, 3, 128, 128], [
+                      1, 1, 768, 128
+                  ], [1, 1, 768, 320], [3, 3, 96, 96], [3, 3, 288, 384], [
+                      3, 3, 64, 96
+                  ], [1, 1, 288, 64], [1, 1, 256, 64], [5, 5, 48, 64],
+                  [1, 1, 256, 48], [3, 3, 96, 96], [1, 1, 192, 32], [
+                      1, 1, 192, 64
+                  ], [1, 1, 192, 48], [3, 3, 64, 192], [1, 1, 64,
+                                                        64], [1, 1, 24, 64]]
+  out_sizes = [[4, 5, 5, 128], [4, 8, 8, 384], [4, 8, 8, 384], [4, 8, 8, 192], [
+      4, 8, 8, 384
+  ], [4, 8, 8, 320], [4, 8, 8, 448], [4, 8, 8, 384], [4, 8, 8, 384], [
+      4, 8, 8, 192
+  ], [4, 8, 8, 448], [4, 8, 8, 320], [4, 8, 8, 192], [4, 17, 17, 192], [
+      4, 17, 17, 192
+  ], [4, 8, 8, 320], [4, 17, 17, 128], [4, 17, 17, 224], [4, 17, 17, 256], [
+      4, 17, 17, 256
+  ], [4, 17, 17, 192], [4, 17, 17, 96], [4, 17, 17, 224], [4, 17, 17, 224], [
+      4, 17, 17, 192
+  ], [4, 17, 17, 192], [4, 17, 17, 128], [4, 17, 17, 192], [4, 17, 17, 192], [
+      4, 17, 17, 160
+  ], [4, 17, 17, 128], [4, 17, 17, 192], [4, 17, 17, 160], [4, 17, 17, 192], [
+      4, 17, 17, 256
+  ], [4, 17, 17, 128], [4, 17, 17, 192], [4, 17, 17, 128], [4, 17, 17, 128], [
+      4, 17, 17, 128
+  ], [4, 17, 17, 320], [4, 17, 17, 96], [4, 17, 17, 384], [4, 35, 35, 96], [
+      4, 35, 35, 64
+  ], [4, 35, 35, 64], [4, 35, 35, 64], [4, 35, 35, 48], [4, 35, 35, 96],
+               [4, 35, 35, 32], [4, 35, 35, 64], [4, 35, 35, 48],
+               [4, 71, 71, 192], [4, 73, 73, 64], [4, 147, 147, 64]]
+  strides = [
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1
+  ]
+  # Shrink sizes to make the test faster
+  for i in input_sizes:
+    i[3] //= shrink
+  for f in filter_sizes:
+    f[2] //= shrink
+    f[3] //= shrink
+  for o in out_sizes:
+    o[3] //= shrink
+  # pylint: disable=invalid-name
+  VALID = "VALID"
+  SAME = "SAME"
+  # pylint: enable=invalid-name
+  paddings = [
+      SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME,
+      VALID, SAME, SAME, VALID, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME,
+      SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME, SAME,
+      SAME, SAME, SAME, SAME, SAME, VALID, VALID, SAME, SAME, SAME, SAME, SAME,
+      SAME, SAME, SAME, SAME, VALID, VALID, VALID
+  ]
+  for i, f, o, s, p in zip(input_sizes, filter_sizes, out_sizes, strides,
+                           paddings):
+    yield i, f, o, s, p
+
+
+def GetTestConfigs():
+  """Get all the valid tests configs to run.
+
+  Returns:
+    all the valid test configs as tuples of data_format and use_gpu.
+  """
+  test_configs = [("NCHW", True), ("NHWC", True)]
+  return test_configs
+
+
+class FusedConv2DBiasActivationTest(test.TestCase):
+
+  def _DtypesToTest(self, use_gpu):
+    return [dtypes.float32]
+
+  def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, bias,
+                            strides, padding, activation_mode, data_format,
+                            dtype):
+    """Verifies the output values of the convolution function.
+
+    Args:
+      tensor_in_sizes: Input tensor dimensions in
+        [batch, input_rows, input_cols, input_depth].
+      filter_in_sizes: Filter tensor dimensions in
+        [kernel_rows, kernel_cols, input_depth, output_depth].
+      bias: 1-D bias tensor of length output_depth.
+      strides: Stride: [col_stride, row_stride]
+      padding: Padding type.
+      activation_mode: Activation mode.
+      data_format: Format of the data tensors.
+      dtype: Data type for inputs and outputs.
+    Returns:
+      Symbolic tensor value and reference value that can be used to
+      execute the computation and verify the results.
+    """
+    input_size = np.prod(tensor_in_sizes)
+    filter_size = np.prod(filter_in_sizes)
+    bias_size = filter_in_sizes[-1]  # equals to output depth
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x1 = [f * 1.0 for f in range(1, input_size + 1)]
+    x2 = [f * 1.0 for f in range(1, filter_size + 1)]
+    # This is to guarantee that there is always negative values after
+    # bias add so that we can test whether relu works correctly.
+    x3 = bias
+    with self.test_session(use_gpu=True):
+      t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype)
+      t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype)
+      t3 = constant_op.constant(x3, shape=[bias_size], dtype=dtype)
+      strides = [1] + strides + [1]
+      if data_format == "NCHW":
+        t1 = test_util.NHWCToNCHW(t1)
+        strides = test_util.NHWCToNCHW(strides)
+      output = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+          t1,
+          t2,
+          t3,
+          strides=strides,
+          padding=padding,
+          data_format=data_format,
+          activation_mode=activation_mode)
+      ref_conv_output = nn_ops.conv2d(
+          t1, t2, strides=strides, padding=padding, data_format=data_format)
+      ref_bias_output = nn_ops.bias_add(
+          ref_conv_output, t3, data_format=data_format)
+      ref_output = nn_ops.relu(ref_bias_output)
+      if data_format == "NCHW":
+        output = test_util.NCHWToNHWC(output)
+        ref_output = test_util.NCHWToNHWC(ref_output)
+
+      return output, ref_output
+
+  def _CompareFwdValues(self, tensor_in_sizes, filter_in_sizes, conv_strides,
+                        padding):
+    """Verifies that CPU and GPU produce the same values.
+
+    Args:
+      tensor_in_sizes: Input tensor dimensions in
+        [batch, input_rows, input_cols, input_depth].
+      filter_in_sizes: Filter tensor dimensions in
+        [kernel_rows, kernel_cols, input_depth, output_depth].
+      conv_strides: [row_stride, col_stride] for the convolution;
+      padding: Padding type.
+    """
+    x1 = np.random.rand(*tensor_in_sizes).astype(np.float32)
+    x2 = np.random.rand(*filter_in_sizes).astype(np.float32)
+    x3 = np.random.rand(*[filter_in_sizes[-1]]).astype(np.float32)
+
+    def _SetupVal(data_format, use_gpu):
+      with self.test_session(use_gpu=use_gpu):
+        t1 = constant_op.constant(x1, shape=tensor_in_sizes)
+        t2 = constant_op.constant(x2, shape=filter_in_sizes)
+        t3 = constant_op.constant(x3, shape=[filter_in_sizes[-1]])
+        strides = [1] + conv_strides + [1]
+        if data_format == "NCHW":
+          t1 = test_util.NHWCToNCHW(t1)
+          strides = test_util.NHWCToNCHW(strides)
+        output = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+            t1,
+            t2,
+            t3,
+            strides=strides,
+            padding=padding,
+            data_format=data_format,
+            activation_mode="Relu")
+
+        if data_format == "NCHW":
+          output = test_util.NCHWToNHWC(output)
+        return output
+
+    tensors = []
+    for (data_format, use_gpu) in GetTestConfigs():
+      tensors.append(_SetupVal(data_format, use_gpu))
+    with self.test_session() as sess:
+      values = sess.run(tensors)
+      for i in range(1, len(values)):
+        self.assertAllClose(values[0], values[i], rtol=1e-5, atol=1e-5)
+
+  def _VerifyValues(self, tensor_in_sizes, filter_in_sizes, bias, strides,
+                    padding):
+    tensors = []
+    ref_tensors = []
+    for (data_format, use_gpu) in GetTestConfigs():
+      for dtype in self._DtypesToTest(use_gpu):
+        result, expected = self._SetupValuesForDevice(
+            tensor_in_sizes, filter_in_sizes, bias, strides, padding, "Relu",
+            data_format, dtype)
+        tensors.append(result)
+        ref_tensors.append(expected)
+      with self.test_session() as sess:
+        values = sess.run(tensors)
+        ref_values = sess.run(ref_tensors)
+        for i in range(len(tensors)):
+          conv = tensors[i]
+          value = values[i]
+          ref_value = ref_values[i]
+          print("expected = ", ref_value)
+          print("actual = ", value)
+          tol = 1e-5
+          if value.dtype == np.float16:
+            tol = 1e-3
+          self.assertAllClose(
+              np.ravel(ref_value), np.ravel(value), atol=tol, rtol=tol)
+          self.assertShapeEqual(value, conv)
+
+  def testConv2D1x1Filter(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2D1x1Filter test.")
+      return
+    # expected_output = [
+    #    0.0, 0.0, 0.0, 21.0, 0.0, 0.0, 57.0, 0.0, 0.0, 93.0, 41.0, 0.0, 129.0,
+    #    86.0, 43.0, 165.0, 131.0, 97.0
+    # ]
+    medians = [-45.0, -130.0, -215.0]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[1, 1, 3, 3],
+        bias=medians,
+        strides=[1, 1],
+        padding="VALID")
+
+  def testConv2DEmpty(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2DEmpty test.")
+      return
+    # expected_output = []
+    self._VerifyValues(
+        tensor_in_sizes=[0, 2, 3, 3],
+        filter_in_sizes=[1, 1, 3, 3],
+        bias=[0.0, 0.0, 0.0],
+        strides=[1, 1],
+        padding="VALID")
+
+  def testConv2D2x2Filter(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2D2x2Filter test.")
+      return
+    # expected_output = [0.0, 0.0, 0.0, 401.0, 533.0, 665.0]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[2, 2, 3, 3],
+        bias=[-2500.0, -2500.0, -2500.0],
+        strides=[1, 1],
+        padding="VALID")
+
+  def testConv2D1x2Filter(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2D1x2Filter test.")
+      return
+    # expected_output = [
+    #    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 190.0, 265.0, 340.0, 343.0, 436.0, 529.0
+    # ]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[1, 2, 3, 3],
+        bias=[-500.0, -500.0, -500.0],
+        strides=[1, 1],
+        padding="VALID")
+
+  def testConv2D2x2FilterStride2(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2D2x2FilterStride2 test.")
+      return
+    # expected_output = [0.0, 67.0, 163.0]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[2, 2, 3, 3],
+        bias=[-2300.0, -2300.0, -2300.0],
+        strides=[2, 2],
+        padding="VALID")
+
+  def testConv2D2x2FilterStride2Same(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2D2x2FilterStride2Same test.")
+      return
+    # expected_output = [0.0, 2367.0, 2463.0, 1230.0, 1305.0, 1380.0]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[2, 2, 3, 3],
+        bias=[-2300.0, -1000.0, -1000.0],
+        strides=[2, 2],
+        padding="SAME")
+
+  def testConv2D2x2FilterStride1x2(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2D2x2FilterStride1x2 test.")
+      return
+    # expected_output = [0.0, 0.0, 8.0, 28.0, 48.0, 68.0]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 3, 6, 1],
+        filter_in_sizes=[2, 2, 1, 1],
+        bias=[-90.0],
+        strides=[1, 2],
+        padding="VALID")
+
+  def testConv2DKernelSmallerThanStrideValid(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2DKernelSmallerThanStrideValid test.")
+      return
+    # expected_output = [0, 0, 175, 205]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 7, 7, 1],
+        filter_in_sizes=[2, 2, 1, 1],
+        bias=[-100.0],
+        strides=[3, 3],
+        padding="VALID")
+
+  def testConv2DKernelSmallerThanStrideSame(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2DKernelSmallerThanStrideSame test.")
+      return
+    # expected = [0, 0, 2, 4]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 3, 3, 1],
+        filter_in_sizes=[1, 1, 1, 1],
+        bias=[-5.0],
+        strides=[2, 2],
+        padding="SAME")
+
+    # expected = [0, 0, 4, 6]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 4, 4, 1],
+        filter_in_sizes=[1, 1, 1, 1],
+        bias=[-5.0],
+        strides=[2, 2],
+        padding="SAME")
+
+    # expected = [4, 0, 1, 0]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 4, 4, 1],
+        filter_in_sizes=[2, 2, 1, 1],
+        bias=[-40.0],
+        strides=[3, 3],
+        padding="SAME")
+
+  def testConv2DKernelSizeMatchesInputSize(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping Conv2DKernelSizeMatchesInputSize test.")
+      return
+    # expected = [0, 5]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 2, 2, 1],
+        filter_in_sizes=[2, 2, 1, 2],
+        bias=[-50.0, -55.0],
+        strides=[1, 1],
+        padding="VALID")
+
+    # expected = [0, 2, 282, 322]
+    self._VerifyValues(
+        tensor_in_sizes=[1, 8, 8, 1],
+        filter_in_sizes=[2, 2, 1, 1],
+        bias=[-200.0],
+        strides=[4, 4],
+        padding="SAME")
+
+  def testShapeFunctionEdgeCases(self):
+    # All shapes unknown.
+    c1 = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+        array_ops.placeholder(dtypes.float32),
+        array_ops.placeholder(dtypes.float32),
+        array_ops.placeholder(dtypes.float32),
+        strides=[1, 1, 1, 1],
+        padding="SAME",
+        activation_mode="Relu")
+    self.assertEqual([None, None, None, None], c1.get_shape().as_list())
+
+    # Incorrect input shape.
+    with self.assertRaises(ValueError):
+      fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+          array_ops.placeholder(dtypes.float32, shape=[1, 3]),
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding="SAME",
+          activation_mode="Relu")
+
+    # Incorrect filter shape.
+    with self.assertRaises(ValueError):
+      fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32, shape=[1, 3]),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding="SAME",
+          activation_mode="Relu")
+
+    # Depth mismatch.
+    with self.assertRaises(ValueError):
+      fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+          array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
+          array_ops.placeholder(dtypes.float32, shape=[4, 4, 2, 2]),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding="SAME",
+          activation_mode="Relu")
+
+  def testOpEdgeCases(self, gpu_only=True):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping OpEdgeCases tests.")
+      return
+    with self.test_session() as sess:
+      # Illegal strides.
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "strides in the batch and depth"):
+        sess.run(
+            fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+                array_ops.placeholder(dtypes.float32),
+                array_ops.placeholder(dtypes.float32),
+                array_ops.placeholder(dtypes.float32),
+                strides=[2, 1, 1, 1],
+                padding="SAME",
+                activation_mode="Relu"))
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "strides in the batch and depth"):
+        sess.run(
+            fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+                array_ops.placeholder(dtypes.float32),
+                array_ops.placeholder(dtypes.float32),
+                array_ops.placeholder(dtypes.float32),
+                strides=[1, 1, 1, 2],
+                padding="SAME",
+                activation_mode="Relu"))
+
+      # Illegal activation mode.
+      with self.assertRaisesRegexp(ValueError,
+                                   "Op passed string 'Tanh' not in:"):
+        sess.run(
+            fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+                array_ops.placeholder(dtypes.float32),
+                array_ops.placeholder(dtypes.float32),
+                array_ops.placeholder(dtypes.float32),
+                strides=[1, 1, 1, 1],
+                padding="SAME",
+                activation_mode="Tanh"))
+
+      # Filter larger than input.
+      with self.assertRaisesRegexp(ValueError, "Negative dimension size"):
+        sess.run(
+            fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+                array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
+                array_ops.placeholder(dtypes.float32, shape=[20, 21, 3, 2]),
+                array_ops.placeholder(dtypes.float32, shape=[2]),
+                strides=[1, 1, 1, 1],
+                padding="VALID",
+                activation_mode="Relu"))
+      with self.assertRaisesRegexp(ValueError, "Negative dimension size"):
+        sess.run(
+            fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+                array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
+                array_ops.placeholder(dtypes.float32, shape=[21, 20, 3, 2]),
+                array_ops.placeholder(dtypes.float32, shape=[2]),
+                strides=[1, 1, 1, 1],
+                padding="VALID",
+                activation_mode="Relu"))
+
+
+def GetInceptionFwdTest(input_size, filter_size, stride, padding,
+                        gpu_only=True):
+
+  def Test(self):
+    if gpu_only and not test.is_gpu_available():
+      tf_logging.info("Skipping InceptionFwd %s", (input_size, filter_size,
+                                                   stride, padding))
+      return
+    tf_logging.info("Testing InceptionFwd %s", (input_size, filter_size, stride,
+                                                padding))
+    self._CompareFwdValues(input_size, filter_size, [stride, stride], padding)
+
+  return Test
+
+
+if __name__ == "__main__":
+  for index, (input_size_, filter_size_, output_size_, stride_,
+              padding_) in enumerate(GetShrunkInceptionShapes()):
+    setattr(FusedConv2DBiasActivationTest, "testInceptionFwd_" + str(index),
+            GetInceptionFwdTest(input_size_, filter_size_, stride_, padding_))
+
+  # TODO(b/35359731)
+  # Fwd, BckInput, and BackFilter to test that for certain input parameter
+  # set, winograd nonfused algorithm will be excluded from conv autotune. If
+  # in such case, winograd nonfused algorithm is added as one option of the
+  # conv autotune, and cuDNN version is smaller than 7, the following tests
+  # will fail.
+  ishape = [1, 400, 400, 1]
+  fshape = [1, 1, 1, 256]
+  oshape = [1, 400, 400, 256]
+  setattr(FusedConv2DBiasActivationTest,
+          "testInceptionFwd_No_Winograd_Nonfused",
+          GetInceptionFwdTest(ishape, fshape, 1, "SAME", gpu_only=True))
+  test.main()
diff --git a/tensorflow/contrib/graph_editor/BUILD b/tensorflow/contrib/graph_editor/BUILD
index d570a6d702d6cb1634906cb40fb64780c22139e3..c6023c25b1fb8ca3ba5953c32e352bee5be0c530 100644
--- a/tensorflow/contrib/graph_editor/BUILD
+++ b/tensorflow/contrib/graph_editor/BUILD
@@ -25,6 +25,7 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:platform",
+        "//tensorflow/python:util",
         "@six_archive//:six",
     ],
 )
@@ -54,7 +55,11 @@ py_library(
     name = "match",
     srcs = ["tests/match.py"],
     srcs_version = "PY2AND3",
-    deps = [":graph_editor_py"],
+    deps = [
+        ":graph_editor_py",
+        "//tensorflow/python:framework_ops",
+        "@six_archive//:six",
+    ],
 )
 
 py_test(
@@ -66,9 +71,7 @@ py_test(
         ":graph_editor_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
     ],
 )
 
@@ -81,9 +84,7 @@ py_test(
         ":graph_editor_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
     ],
 )
 
@@ -93,13 +94,10 @@ py_test(
     srcs = ["tests/match_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":graph_editor_py",
         ":match",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
     ],
 )
 
@@ -112,9 +110,7 @@ py_test(
         ":graph_editor_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
     ],
 )
 
@@ -128,9 +124,7 @@ py_test(
         ":match",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
     ],
 )
 
@@ -144,9 +138,7 @@ py_test(
         ":match",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
     ],
 )
 
@@ -163,9 +155,8 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
     ],
diff --git a/tensorflow/contrib/graph_editor/tests/transform_test.py b/tensorflow/contrib/graph_editor/tests/transform_test.py
index a4105645c68bd78ecc4474d14d05a3f3f5c1cbaa..ab5776b9dd66bb082e9ca3922e8902bfebe6b0b8 100644
--- a/tensorflow/contrib/graph_editor/tests/transform_test.py
+++ b/tensorflow/contrib/graph_editor/tests/transform_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -181,6 +182,34 @@ class TransformTest(test.TestCase):
     self.assertEqual(res[0].name, "b:0")
     self.assertEqual(res[1].name, "add_1:0")
 
+  def test_graph_replace_gradients(self):
+    ops.reset_default_graph()
+    w = variables.Variable(0.0, name="w")
+    y = math_ops.multiply(math_ops.multiply(w, w, name="mul1"), w, name="mul2")
+    g = gradients_impl.gradients(y, w, name="grad")[0]
+
+    # Extract the operations.
+    replacement_ts = {w.value(): g}
+    original_mul1_grad = (ops.get_default_graph().
+                          get_operation_by_name("grad/mul1_grad/mul_1"))
+
+    # Should not raise exception.
+    res = ge.graph_replace(g, replacement_ts, dst_scope="res")
+
+    # Extract the operations after graph_replace.
+    result_mul1_grad = (ops.get_default_graph().
+                        get_operation_by_name("res/grad/mul1_grad/mul_1"))
+
+    # Make sure _original_ops are as expected.
+    self.assertEquals(original_mul1_grad._original_op.name, u"mul1")
+    self.assertEquals(result_mul1_grad._original_op.name, u"res/mul1")
+    self.assertNotEquals(res.name, g.name)
+    with session.Session() as sess:
+      sess.run(variables.global_variables_initializer())
+      g_val, res_val = sess.run([g, res])
+    self.assertNear(g_val, 0.0, ERROR_TOLERANCE)
+    self.assertNear(res_val, 0.0, ERROR_TOLERANCE)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py
index 762bc44814174da8cf8b7ee35a6b23d9d0554ebb..14ac5296657d48c7f9e94d220c9e7e28af4d4353 100644
--- a/tensorflow/contrib/graph_editor/transform.py
+++ b/tensorflow/contrib/graph_editor/transform.py
@@ -166,13 +166,12 @@ def copy_op_handler(info, op, copy_shape=True):
     for t, t_ in zip(op.outputs, op_.outputs):
       t_.set_shape(t.get_shape())
 
-  # Finalize original op.
+  # Original op cannot be finalised here yet. Because some ops require this
+  # attribute to exist, we will create a dummy original_op first and then
+  # later finalise it with the actual original_op when all the ops have
+  # been copied.
   if op._original_op:
-    original_op = info.transform_original_op_handler(info, op._original_op)
-    if original_op is None:
-      logging.debug("Could not find original op of: %s", op_.name)
-    else:
-      op_._original_op = original_op
+    op_._original_op = op._original_op
 
   # Add op to the graph
   info.graph_._add_op(op_)
@@ -446,7 +445,7 @@ class Transformer(object):
       # TODO(fkp): return a subgraph?
       op_, op_outputs_ = self.transform_op_handler(info, op)
       if op is op_:
-        raise ValueError("In-place tranformation not allowed.")
+        raise ValueError("In-place transformation not allowed.")
 
       # Process op.
       info.transformed_ops[op] = op_
@@ -471,6 +470,14 @@ class Transformer(object):
       for t in inputs_:
         op_._add_input(t)
 
+      # Finalize original op.
+      if op._original_op:
+        original_op = info.transform_original_op_handler(info, op._original_op)
+        if original_op is None:
+          logging.debug("Could not find original op for: %s", op_.name)
+        else:
+          op_._original_op = original_op
+
       # Finalize control inputs:
       control_inputs_ = [self.transform_control_input_handler(info, ci)
                          for ci in op.control_inputs]
diff --git a/tensorflow/contrib/grid_rnn/BUILD b/tensorflow/contrib/grid_rnn/BUILD
index 73473becf9bcad3f781dcbf274b28388dae67269..7fbb9f024c589895aa2dff7b6f5d8ba8c399af48 100644
--- a/tensorflow/contrib/grid_rnn/BUILD
+++ b/tensorflow/contrib/grid_rnn/BUILD
@@ -20,6 +20,7 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
+        "//tensorflow/python:platform",
         "//tensorflow/python:variable_scope",
     ],
 )
diff --git a/tensorflow/contrib/hooks/BUILD b/tensorflow/contrib/hooks/BUILD
index b6920ffd3d9e019a7522268e7963109ab554adcc..d81e868d4a922698e4755733b999112088fa2a0b 100644
--- a/tensorflow/contrib/hooks/BUILD
+++ b/tensorflow/contrib/hooks/BUILD
@@ -19,12 +19,11 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/framework:framework_py",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:state_ops",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client",
+        "//tensorflow/python:platform",
         "//tensorflow/python:training",
-        "//tensorflow/python:variables",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -35,10 +34,12 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":hooks",
+        "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
     ],
 )
 
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_impl/graph_functions_wrapper.c b/tensorflow/contrib/hvx/hexagon_controller/src_impl/graph_functions_wrapper.c
index 7c82158522422a7a00b91a429341499947effa96..f6a38fe8a9471d0903c9b37c90bd7d2c0d9b64cd 100644
--- a/tensorflow/contrib/hvx/hexagon_controller/src_impl/graph_functions_wrapper.c
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_impl/graph_functions_wrapper.c
@@ -52,21 +52,21 @@ static enum InceptionVersion s_inception_version = INCEPTION_V3;
 /////////////////////////////////////////////////
 // file local functions
 
-static const char *ConvertGraphInfoIdToName(unsigned int id) {
+static const char* ConvertGraphInfoIdToName(unsigned int id) {
   // TODO(satok): implement
   return "?";
 }
 
-static const char *ConvertGraphInfoIdToOpName(unsigned int id) {
+static const char* ConvertGraphInfoIdToOpName(unsigned int id) {
   // TODO(satok): implement
   return "?";
 }
 
 /////////////////////////////////////////////////
 // file local utilities
-static uint32_t FindMaxIdxWithExcludeList(
-    const float *data, uint32_t entries, const int exclude_size,
-    const int* exclude_idx) {
+static uint32_t FindMaxIdxWithExcludeList(const float* data, uint32_t entries,
+                                          const int exclude_size,
+                                          const int* exclude_idx) {
   int i;
   float maxval = data[0];
   int maxidx = 0;
@@ -93,13 +93,16 @@ static uint32_t FindMaxIdx(const float* data, uint32_t entries) {
   return FindMaxIdxWithExcludeList(data, entries, 0, NULL);
 }
 
-void hexagon_controller_PrintMaxNIdx(const float *data, const uint32_t entries,
-                         const int n, int* out_ranking) {
+void hexagon_controller_PrintMaxNIdx(const float* data, const uint32_t entries,
+                                     const int n, int* out_ranking) {
   if (DUMP_OUTPUT) {
     for (int i = 0; i < entries; ++i) {
       TFMLOGD("%d: val = %f", i, data[i]);
     }
   }
+  if (n >= entries) {
+    TFMLOGD("Too many N %d >= %d", n, entries);
+  }
   for (int i = 0; i < n; ++i) {
     out_ranking[i] = INT_MAX;
   }
@@ -120,9 +123,9 @@ static inline unsigned long long int GetCounter(hexagon_nn_perfinfo s) {
   return ret;
 }
 
-static int CompareCycle(const void *va, const void *vb) {
-  const hexagon_nn_perfinfo *a = va;
-  const hexagon_nn_perfinfo *b = vb;
+static int CompareCycle(const void* va, const void* vb) {
+  const hexagon_nn_perfinfo* a = va;
+  const hexagon_nn_perfinfo* b = vb;
   unsigned long long int acount = GetCounter(*a);
   unsigned long long int bcount = GetCounter(*b);
   if (acount < bcount) {
@@ -139,8 +142,6 @@ static int CompareCycle(const void *va, const void *vb) {
 
 uint32_t hexagon_controller_InstantiateGraph() {
   const uint32_t nn_id = hexagon_nn_init();
-  // set debug level to 99 for now
-  //hexagon_nn_set_debug_level(nn_id, 99);
   // TODO(satok): make this as argument
   hexagon_nn_set_debug_level(nn_id, 0);
   return nn_id;
@@ -167,7 +168,7 @@ bool hexagon_controller_ConstructGraph(uint32_t nn_id) {
   int err;
   if ((err = hexagon_nn_prepare(nn_id)) != 0) {
     TFMLOGE("Prepare failed! returned 0x%x\n", err);
-    hexagon_controller_PrintLog(nn_id);
+    DumpNNId(nn_id);
     return false;
   } else {
     TFMLOGD("Prepare success!\n");
@@ -175,65 +176,80 @@ bool hexagon_controller_ConstructGraph(uint32_t nn_id) {
   }
 }
 
-uint32_t hexagon_controller_SetupGraph(int version)  {
+uint32_t hexagon_controller_SetupGraph(int version) {
   const uint32_t nn_id = hexagon_controller_InstantiateGraph();
   hexagon_controller_InitGraph(version, nn_id);
   hexagon_controller_ConstructGraph(nn_id);
   return nn_id;
 }
 
-bool hexagon_controller_ExecuteGraph(
-    const uint32_t nn_id,
-    const uint32_t batches,
-    const uint32_t height,
-    const uint32_t width,
-    const uint32_t depth,
-    uint8_t* int_data,
-    const uint32_t int_data_size,
-    uint32_t* out_batches,
-    uint32_t* out_height,
-    uint32_t* out_width,
-    uint32_t* out_depth,
-    uint8_t* out_vals,
-    const uint32_t output_val_byte_size,
-    uint32_t* out_data_byte_size) {
-  int err;
+bool hexagon_controller_ExecuteGraphWithMultipleInOut(
+    const uint32_t nn_id, const int input_count, hexagon_nn_tensordef* inputs,
+    const int output_count, hexagon_nn_tensordef* outputs) {
   if (DBG_EXECUTION) {
-    TFMLOGD("Preparing to execute...");
-    TFMLOGD("Input: %d, %d, %d, %d, %d, %d",
-            batches, height, width, depth, int_data[0], int_data_size);
-    TFMLOGD("Output: %d, %p", output_val_byte_size, out_vals);
+    TFMLOGD("Preparing to execute... in = %d, out = %d", input_count,
+            output_count);
     LogDHexagon("Execute graph!");
   }
-  
-  if ((err = hexagon_nn_execute(nn_id,
-                                batches,
-                                height,
-                                width,
-                                depth,
-                                int_data,
-                                int_data_size,
-                                out_batches,
-                                out_height,
-                                out_width,
-                                out_depth,
-                                out_vals,
-                                output_val_byte_size,
-                                out_data_byte_size)) != 0) {
+
+  const int err =
+      hexagon_nn_execute_new(nn_id, inputs, input_count, outputs, output_count);
+  if (err != 0) {
     if (DBG_EXECUTION) {
       LogDHexagon("Execution failed!");
-      TFMLOGE("execute got err: %d\n",err);
+      TFMLOGE("execute got err: %d\n", err);
+      DumpNNId(nn_id);
+    }
+    return false;
+  } else {
+    if (DBG_EXECUTION) {
+      LogDHexagon("Execution succeeded!");
     }
+    return true;
+  }
+}
+
+bool hexagon_controller_ExecuteGraph(
+    const uint32_t nn_id, const uint32_t batches, const uint32_t height,
+    const uint32_t width, const uint32_t depth, uint8_t* int_data,
+    const uint32_t int_data_size, uint32_t* out_batches, uint32_t* out_height,
+    uint32_t* out_width, uint32_t* out_depth, uint8_t* out_vals,
+    const uint32_t output_val_byte_size, uint32_t* out_data_byte_size) {
+  if (DBG_EXECUTION) {
+    TFMLOGD("Preparing to execute...");
+    TFMLOGD("Input: %d, %d, %d, %d, %d, %d", batches, height, width, depth,
+            int_data[0], int_data_size);
+    TFMLOGD("Output: %d, %p", output_val_byte_size, out_vals);
+    LogDHexagon("Execute graph!");
+  }
+
+  hexagon_nn_tensordef input;
+  hexagon_nn_tensordef output;
+
+  input.batches = batches;
+  input.height = height;
+  input.width = width;
+  input.depth = depth;
+  input.data = int_data;
+  input.dataLen = int_data_size;
+
+  output.data = out_vals;
+  output.dataLen = output_val_byte_size;
+
+  if (!hexagon_controller_ExecuteGraphWithMultipleInOut(nn_id, 1, &input, 1,
+                                                        &output)) {
     return false;
   } else {
+    *out_batches = output.batches;
+    *out_height = output.height;
+    *out_width = output.width;
+    *out_depth = output.depth;
+    *out_data_byte_size = output.dataLen;
+
     if (DBG_EXECUTION) {
       LogDHexagon("Execution succeeded!");
-      TFMLOGD("%d x %d x %d x %d, byte size = %d\n",
-              *out_batches,
-              *out_height,
-              *out_width,
-              *out_depth,
-              *out_data_byte_size);
+      TFMLOGD("%d x %d x %d x %d, byte size = %d\n", *out_batches, *out_height,
+              *out_width, *out_depth, *out_data_byte_size);
     }
     return true;
   }
@@ -246,27 +262,21 @@ bool hexagon_controller_ExecuteInceptionDummyData(uint32_t nn_id) {
   const bool success = hexagon_controller_ExecuteGraph(
       nn_id, INCEPTION_PARAM_BATCHES, INCEPTION_PARAM_HEIGHT_V3,
       INCEPTION_PARAM_WIDTH_V3, INCEPTION_PARAM_DEPTH,
-      (uint8_t *)inception_dummy_int_data_299x299,
+      (uint8_t*)inception_dummy_int_data_299x299,
       INCEPTION_PARAM_HEIGHT_V3 * INCEPTION_PARAM_WIDTH_V3 *
-      INCEPTION_PARAM_DEPTH,
+          INCEPTION_PARAM_DEPTH,
       &out_batches, &out_height, &out_width, &out_depth,
-      (uint8_t *)s_output_values, sizeof(s_output_values),
-      &out_data_size);
+      (uint8_t*)s_output_values, sizeof(s_output_values), &out_data_size);
   if (success) {
     int out_ranking[OUT_RANKING_SIZE];
     hexagon_controller_PrintMaxNIdx(
-        s_output_values,
-        out_batches * out_height * out_width * out_depth,
+        s_output_values, out_batches * out_height * out_width * out_depth,
         OUT_RANKING_SIZE, out_ranking);
-    TFMLOGD("%d x %d x %d x %d, size = %d\n",
-            out_batches,
-            out_height,
-            out_width,
-            out_depth,
-            out_data_size);
-    TFMLOGD("max idx: %d\n", FindMaxIdx(
-        s_output_values,
-        out_batches * out_height * out_width * out_depth));
+    TFMLOGD("%d x %d x %d x %d, size = %d\n", out_batches, out_height,
+            out_width, out_depth, out_data_size);
+    TFMLOGD("max idx: %d\n",
+            FindMaxIdx(s_output_values,
+                       out_batches * out_height * out_width * out_depth));
     if (out_ranking[0] == 169 && out_ranking[1] == 7) {
       return true;
     } else {
@@ -290,25 +300,22 @@ void hexagon_controller_DumpPerf(uint32_t nn_id) {
     TFMLOGE("perf info failure");
     return;
   }
-  TFMLOGD("Total %d nodes.",n_nodes);
-  qsort(info,n_nodes,sizeof(info[0]), CompareCycle);
+  TFMLOGD("Total %d nodes.", n_nodes);
+  qsort(info, n_nodes, sizeof(info[0]), CompareCycle);
   for (i = 0; i < n_nodes; i++) {
     total_cycles += GetCounter(info[i]);
   }
-  TFMLOGD("Total %lld cycles.",total_cycles);
+  TFMLOGD("Total %lld cycles.", total_cycles);
   for (i = 0; i < n_nodes; i++) {
     counter = GetCounter(info[i]);
     cum_cycles += counter;
-    TFMLOGD("node,0x%x,%s,%s,executions,%d,cycles,%lld,%f %%,"
-            "cum_cycles,%lld,%f %%\n",
-           info[i].node_id,
-           ConvertGraphInfoIdToName(info[i].node_id),
-           ConvertGraphInfoIdToOpName(info[i].node_id),
-           info[i].executions,
-           counter,
-           100*((double)counter)/total_cycles,
-           cum_cycles,
-           100*((double)cum_cycles)/total_cycles);
+    TFMLOGD(
+        "node,0x%x,%s,%s,executions,%d,cycles,%lld,%f %%,"
+        "cum_cycles,%lld,%f %%\n",
+        info[i].node_id, ConvertGraphInfoIdToName(info[i].node_id),
+        ConvertGraphInfoIdToOpName(info[i].node_id), info[i].executions,
+        counter, 100 * ((double)counter) / total_cycles, cum_cycles,
+        100 * ((double)cum_cycles) / total_cycles);
   }
 #ifdef ENABLE_HVX_FULL_DEBUG
   DumpAllPerf(nn_id);
@@ -329,7 +336,7 @@ void hexagon_controller_DumpNodeName(uint32_t nn_id) {
     TFMLOGD("perf info failure");
     return;
   }
-  TFMLOGD("Total %d nodes.",node_count);
+  TFMLOGD("Total %d nodes.", node_count);
   qsort(info, node_count, sizeof(info[0]), CompareCycle);
   for (i = 0; i < node_count; i++) {
     total_cycles += GetCounter(info[i]);
@@ -338,19 +345,14 @@ void hexagon_controller_DumpNodeName(uint32_t nn_id) {
   for (i = 0; i < node_count; i++) {
     counter = GetCounter(info[i]);
     cum_cycles += counter;
-    TFMLOGD("node,0x%x,%s,%s,executions,%d,cycles,%lld,%f %%,"
-            "cum_cycles,%lld,%f %%",
-            info[i].node_id,
-            ConvertGraphInfoIdToName(info[i].node_id),
-            ConvertGraphInfoIdToOpName(info[i].node_id),
-            info[i].executions,
-            counter,
-            100*((double)counter)/total_cycles,
-            cum_cycles,
-            100*((double)cum_cycles)/total_cycles);
+    TFMLOGD(
+        "node,0x%x,%s,%s,executions,%d,cycles,%lld,%f %%,"
+        "cum_cycles,%lld,%f %%",
+        info[i].node_id, ConvertGraphInfoIdToName(info[i].node_id),
+        ConvertGraphInfoIdToOpName(info[i].node_id), info[i].executions,
+        counter, 100 * ((double)counter) / total_cycles, cum_cycles,
+        100 * ((double)cum_cycles) / total_cycles);
   }
 }
 
-void hexagon_controller_Teardown(uint32_t nn_id) {
-  hexagon_nn_teardown(nn_id);
-}
+void hexagon_controller_Teardown(uint32_t nn_id) { hexagon_nn_teardown(nn_id); }
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c
index 31caebf8728cf7c98207370f5bb4db83407ec763..6a5d982dc8514d69277b8f042ac1256e28715d9e 100644
--- a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c
@@ -24,11 +24,13 @@ limitations under the License.
 
 #include "adspmsgd.h"
 #include "dspCV.h"
-#include "rpcmem.h"    // helper API's for shared buffer allocation
+#include "node_data_float.h"
+#include "rpcmem.h"  // helper API's for shared buffer allocation
 #include "soc_interface.h"
 #include "tfm_log.h"
 
-// if false, use int data as input.  This is only for acceleration purpose
+// if false, use int data as input.  This is only for acceleration purpose.
+// Also you may need to change android.min.
 static const bool USE_FLOAT_DATA = true;
 
 // if true, show id for each node
@@ -43,27 +45,96 @@ extern uint8_t inception_dummy_int_data_224x224[];
 extern uint8_t inception_dummy_int_data_299x299[];
 extern float inception_dummy_float_data_299x299[];
 
-#define HEXAGON_CONTROLLER_VERSION 92
+#define HEXAGON_CONTROLLER_VERSION 101
 
 // allocate print bufsize in advance @MB
 #define PRINT_BUFSIZE (2 * 1024 * 1024)
 
 static unsigned char s_print_buf[PRINT_BUFSIZE];
 
-// input node data buffer size
-// x2 1024 * 1024 * 2 > 299 * 299 * 3 * 4 > 1024 * 1024
-static const int INPUT_NODE_DATA_BUFFER_SIZE = 1024 * 1024 * 2;
-// output node data buffer size
-// (1008 is enough for inception)
-static const int OUTPUT_NODE_DATA_BUFFER_SIZE = 300 * 300 * 3 * 4;
-
-static struct NodeDataFloat s_input_node_data_float_buffer;
-static float* s_output_node_data_float_buffer;
-static int s_output_node_data_float_buffer_byte_size;
-static int s_output_node_data_float_array_size;
+#define MAX_INPUTS 10
+#define MAX_OUTPUTS 10
+
+static struct NodeDataFloat s_input_node_data_buffer[MAX_INPUTS];
+static uint8_t* s_output_node_data_buffer[MAX_OUTPUTS];
+static int s_output_node_data_buffer_max_byte_size[MAX_OUTPUTS];
+static int s_output_node_data_array_byte_size[MAX_OUTPUTS];
 static uint32_t s_target_graph_id;
 
 static bool s_dbg_use_inception_dummy_data = false;
+static int s_dbg_inception_version = 3;
+
+static int GetInputNodeCount() {
+  for (int i = 0; i < MAX_INPUTS; ++i) {
+    if (s_input_node_data_buffer[i].max_buf_byte_size == 0) {
+      return i;
+    }
+  }
+  return 0;
+}
+
+static int GetOutputNodeCount() {
+  for (int i = 0; i < MAX_OUTPUTS; ++i) {
+    if (s_output_node_data_buffer_max_byte_size[i] == 0) {
+      return i;
+    }
+  }
+  return 0;
+}
+
+static bool SetInputTensorDef(int port, hexagon_nn_tensordef* tensordef) {
+  if (port >= GetInputNodeCount()) {
+    TFMLOGE("Error exceeds input count.");
+    return false;
+  }
+  struct NodeDataFloat* input_node_data_buffer =
+      &s_input_node_data_buffer[port];
+  tensordef->batches = input_node_data_buffer->x;
+  tensordef->height = input_node_data_buffer->y;
+  tensordef->width = input_node_data_buffer->z;
+  tensordef->depth = input_node_data_buffer->d;
+  tensordef->data = input_node_data_buffer->byte_array_data;
+  tensordef->dataLen = input_node_data_buffer->array_byte_size;
+
+  return true;
+}
+
+bool hexagon_controller_SetAllInputTensorDef(int node_count,
+                                             hexagon_nn_tensordef* tensordef) {
+  bool success = true;
+  if (node_count != GetInputNodeCount()) {
+    TFMLOGE("Error invalid input node count.");
+    return false;
+  }
+  for (int i = 0; i < node_count; ++i) {
+    SetInputTensorDef(i, &tensordef[i]);
+  }
+  return success;
+}
+
+static bool SetOutputTensorDef(int port, hexagon_nn_tensordef* tensordef) {
+  if (port >= GetOutputNodeCount()) {
+    TFMLOGE("Error exceeds output count.");
+    return false;
+  }
+  tensordef->data = s_output_node_data_buffer[port];
+  tensordef->dataLen = s_output_node_data_buffer_max_byte_size[port];
+  return true;
+}
+
+bool hexagon_controller_SetAllOutputTensorDef(int node_count,
+                                              hexagon_nn_tensordef* tensordef) {
+  bool success = true;
+  if (node_count != GetOutputNodeCount()) {
+    TFMLOGE("Error invalid output node count. %d != %d", node_count,
+            GetOutputNodeCount());
+    return false;
+  }
+  for (int i = 0; i < node_count; ++i) {
+    SetOutputTensorDef(i, &tensordef[i]);
+  }
+  return success;
+}
 
 void hexagon_controller_InitInputNodeDataToInceptionDummyData(int version) {
   if (version == 1) {
@@ -72,44 +143,54 @@ void hexagon_controller_InitInputNodeDataToInceptionDummyData(int version) {
       return;
     }
     hexagon_controller_CopyByteNodeData(
-        INCEPTION_PARAM_BATCHES, INCEPTION_PARAM_HEIGHT_V1,
-        INCEPTION_PARAM_WIDTH_V1, INCEPTION_PARAM_DEPTH,
-        1, inception_dummy_int_data_224x224);
+        0, INCEPTION_PARAM_BATCHES, INCEPTION_PARAM_HEIGHT_V1,
+        INCEPTION_PARAM_WIDTH_V1, INCEPTION_PARAM_DEPTH, 1,
+        inception_dummy_int_data_224x224);
   } else if (version == 3) {
     if (USE_FLOAT_DATA) {
       hexagon_controller_CopyByteNodeData(
-          INCEPTION_PARAM_BATCHES, INCEPTION_PARAM_HEIGHT_V3,
-          INCEPTION_PARAM_WIDTH_V3, INCEPTION_PARAM_DEPTH,
-          sizeof(float), (uint8_t*)inception_dummy_float_data_299x299);
+          0, INCEPTION_PARAM_BATCHES, INCEPTION_PARAM_HEIGHT_V3,
+          INCEPTION_PARAM_WIDTH_V3, INCEPTION_PARAM_DEPTH, sizeof(float),
+          (uint8_t*)inception_dummy_float_data_299x299);
     } else {
       hexagon_controller_CopyByteNodeData(
-          INCEPTION_PARAM_BATCHES, INCEPTION_PARAM_HEIGHT_V3,
-          INCEPTION_PARAM_WIDTH_V3, INCEPTION_PARAM_DEPTH,
-          1, inception_dummy_int_data_299x299);
+          0, INCEPTION_PARAM_BATCHES, INCEPTION_PARAM_HEIGHT_V3,
+          INCEPTION_PARAM_WIDTH_V3, INCEPTION_PARAM_DEPTH, 1,
+          inception_dummy_int_data_299x299);
     }
   }
 }
 
-bool hexagon_controller_ExecuteGraphWithBuffer(
-    uint32_t nn_id, bool show_ranking) {
-  uint32_t out_batches, out_height, out_width, out_depth;
-  uint32_t out_data_size;
-  int x = s_input_node_data_float_buffer.x;
-  int y = s_input_node_data_float_buffer.y;
-  int z = s_input_node_data_float_buffer.z;
-  int d = s_input_node_data_float_buffer.d;
-  uint8_t *byte_data = s_input_node_data_float_buffer.byte_array_data;
-  int array_size = s_input_node_data_float_buffer.array_size;
-  const bool success = hexagon_controller_ExecuteGraph(
-      nn_id, x, y, z, d, byte_data, array_size,
-      &out_batches, &out_height, &out_width, &out_depth,
-      (uint8_t *)s_output_node_data_float_buffer,
-      s_output_node_data_float_buffer_byte_size,
-      &out_data_size);
-  s_output_node_data_float_array_size =
-      out_batches * out_height * out_width * out_depth;
+bool hexagon_controller_ExecuteGraphWithBuffer(uint32_t nn_id,
+                                               bool show_ranking) {
+  const int input_node_count = GetInputNodeCount();
+  hexagon_nn_tensordef inputs[input_node_count];
+  const int output_node_count = GetOutputNodeCount();
+  if (output_node_count <= 0) {
+    TFMLOGI("Error output node count is 0.");
+    return false;
+  }
+  hexagon_nn_tensordef outputs[output_node_count];
+  hexagon_controller_SetAllInputTensorDef(input_node_count, inputs);
+  hexagon_controller_SetAllOutputTensorDef(output_node_count, outputs);
+  const bool success = hexagon_controller_ExecuteGraphWithMultipleInOut(
+      nn_id, input_node_count, inputs, output_node_count, outputs);
+  for (int i = 0; i < output_node_count; ++i) {
+    s_output_node_data_array_byte_size[i] = outputs[i].data_valid_len;
+  }
+
+  const hexagon_nn_tensordef* output0 = &outputs[0];
+
+  const uint32_t out_batches = output0->batches;
+  const uint32_t out_height = output0->height;
+  const uint32_t out_width = output0->width;
+  const uint32_t out_depth = output0->depth;
+  const uint32_t out_data_size = output0->data_valid_len;
+  const uint32_t out_buf_byte_size = output0->dataLen;
+
   if (!success) {
     TFMLOGE("Execution failed");
+    DumpNNId(nn_id);
     return false;
   } else if (!show_ranking) {
     return true;
@@ -118,15 +199,11 @@ bool hexagon_controller_ExecuteGraphWithBuffer(
   static const int OUT_RANKING_SIZE = 5;
   int out_ranking[OUT_RANKING_SIZE];
   hexagon_controller_PrintMaxNIdx(
-      s_output_node_data_float_buffer,
-      out_batches * out_height * out_width * out_depth,
-      OUT_RANKING_SIZE, out_ranking);
-  TFMLOGD("%d x %d x %d x %d, byte size = %d\n",
-          out_batches,
-          out_height,
-          out_width,
-          out_depth,
-          out_data_size);
+      (float*)s_output_node_data_buffer[0],
+      out_batches * out_height * out_width * out_depth, OUT_RANKING_SIZE,
+      out_ranking);
+  TFMLOGD("%d x %d x %d x %d, byte size = %d, buf size = %d\n", out_batches,
+          out_height, out_width, out_depth, out_data_size, out_buf_byte_size);
   if (s_dbg_use_inception_dummy_data) {
     // Check the result of inception with a dummy data. This step shouldn't
     // be passed when show_ranking != true to avoid adding unnecessary
@@ -142,9 +219,7 @@ bool hexagon_controller_ExecuteGraphWithBuffer(
   return true;
 }
 
-uint32_t hexagon_controller_GetTargetGraphId() {
-  return s_target_graph_id;
-}
+uint32_t hexagon_controller_GetTargetGraphId() { return s_target_graph_id; }
 
 void hexagon_controller_SetTargetGraphId(uint32_t graph_id) {
   s_target_graph_id = graph_id;
@@ -168,69 +243,129 @@ int hexagon_controller_GetHexagonBinaryVersion() {
   return retval;
 }
 
-bool hexagon_controller_AllocateNodeDataBuffers(
-    int input_size, int output_size) {
-  TFMLOGD("Allocate memory for input / output node data float");
-  if (s_input_node_data_float_buffer.buf_size != 0) {
+bool hexagon_controller_AllocateInputNodeDataBuffers(int port,
+                                                     int input_buf_byte_size) {
+  TFMLOGD("Allocate memory for input node data. port = %d, size = %d", port,
+          input_buf_byte_size);
+  if (s_input_node_data_buffer[port].max_buf_byte_size != 0) {
     TFMLOGE("ERROR! input buffer is already allocated!!");
     return false;
   } else {
-    int byte_array_data_size = USE_FLOAT_DATA ?
-        input_size * sizeof(float) : input_size; /* sizeof(uint8_t) ? */
-    s_input_node_data_float_buffer.buf_size = input_size;
-    // unused? remove?
-    s_input_node_data_float_buffer.array_data =
-        malloc(input_size * sizeof(float));
-    s_input_node_data_float_buffer.byte_array_data =
-        malloc(byte_array_data_size);
-
-    s_output_node_data_float_buffer = malloc(output_size * sizeof(float));
-    s_output_node_data_float_buffer_byte_size = output_size * sizeof(float);
-    s_output_node_data_float_array_size = 0;
-    TFMLOGD("allocate node data buffers");
+    s_input_node_data_buffer[port].max_buf_byte_size = input_buf_byte_size;
+    posix_memalign((void**)&s_input_node_data_buffer[port].byte_array_data, 128,
+                   input_buf_byte_size);
+    TFMLOGD("allocate input node data buffers done");
   }
   return true;
 }
 
-bool hexagon_controller_ReleaseNodeDataBuffers() {
-  if (s_input_node_data_float_buffer.buf_size == 0) {
+bool hexagon_controller_AllocateOutputNodeDataBuffers(
+    int port, int output_buf_byte_size) {
+  TFMLOGD("Allocate memory for output node data. port = %d, size = %d", port,
+          output_buf_byte_size);
+  if (s_output_node_data_buffer_max_byte_size[port] != 0) {
+    TFMLOGE("ERROR! input buffer is already allocated!!");
+    return false;
+  } else {
+    // s_output_node_data_buffer = malloc(output_size * sizeof(float));
+    posix_memalign((void**)&s_output_node_data_buffer[port], 128,
+                   output_buf_byte_size);
+    s_output_node_data_buffer_max_byte_size[port] = output_buf_byte_size;
+    s_output_node_data_array_byte_size[port] = 0;
+    TFMLOGD("allocate output node data buffers");
+  }
+  return true;
+}
+
+bool hexagon_controller_AllocateMultipleNodeDataBuffers(int input_count,
+                                                        int* input_sizes,
+                                                        int output_count,
+                                                        int* output_sizes) {
+  bool success = true;
+  for (int i = 0; i < input_count; ++i) {
+    success &=
+        hexagon_controller_AllocateInputNodeDataBuffers(i, input_sizes[i]);
+  }
+  for (int i = 0; i < output_count; ++i) {
+    success &=
+        hexagon_controller_AllocateOutputNodeDataBuffers(i, output_sizes[i]);
+  }
+
+  if (s_dbg_use_inception_dummy_data) {
+    hexagon_controller_InitInputNodeDataToInceptionDummyData(
+        s_dbg_inception_version);
+  }
+  return success;
+}
+
+bool hexagon_controller_AllocateNodeDataBuffers(int input_size,
+                                                int output_size) {
+  return hexagon_controller_AllocateMultipleNodeDataBuffers(1, &input_size, 1,
+                                                            &output_size);
+}
+
+bool hexagon_controller_ReleaseInputNodeDataBuffersWithPort(int port) {
+  struct NodeDataFloat* input_node_data_buffer =
+      &s_input_node_data_buffer[port];
+  if (input_node_data_buffer->max_buf_byte_size == 0) {
     TFMLOGE("ERROR! input buffer has not been allocated yet!!");
     return false;
   } else {
-    s_input_node_data_float_buffer.buf_size = 0;
-    free(s_input_node_data_float_buffer.array_data);
+    input_node_data_buffer->max_buf_byte_size = 0;
+    input_node_data_buffer->array_byte_size = 0;
+    free(input_node_data_buffer->byte_array_data);
   }
-  if (s_output_node_data_float_buffer_byte_size == 0) {
+  return true;
+}
+
+bool hexagon_controller_ReleaseOutputNodeDataBuffersWithPort(int port) {
+  if (s_output_node_data_buffer_max_byte_size[port] == 0) {
     TFMLOGE("ERROR! output buffer has not been allocated yet!!");
     return false;
   } else {
-    s_output_node_data_float_buffer_byte_size = 0;
-    free(s_input_node_data_float_buffer.byte_array_data);
+    s_output_node_data_buffer_max_byte_size[port] = 0;
+    s_output_node_data_array_byte_size[port] = 0;
+    free(s_output_node_data_buffer[port]);
   }
   return true;
 }
 
-bool hexagon_controller_CopyByteNodeData(
-    int x, int y, int z, int d, int type_byte_size, uint8_t* array_data) {
+bool hexagon_controller_ReleaseNodeDataBuffers() {
+  bool success = true;
+  for (int i = 0; i < GetInputNodeCount(); ++i) {
+    success &= hexagon_controller_ReleaseInputNodeDataBuffersWithPort(i);
+  }
+  for (int i = 0; i < GetOutputNodeCount(); ++i) {
+    success &= hexagon_controller_ReleaseOutputNodeDataBuffersWithPort(i);
+  }
+  return success;
+}
+
+bool hexagon_controller_CopyByteNodeData(int port, int x, int y, int z, int d,
+                                         int type_byte_size,
+                                         uint8_t* array_data) {
   int array_byte_size = x * y * z * d * type_byte_size;
-  TFMLOGD("--- %d, %d, %d, %d, %d, %d",x,y,z,d,type_byte_size,array_byte_size);
-  if (s_input_node_data_float_buffer.buf_size < array_byte_size) {
+  TFMLOGD("--- %d, %d, %d, %d, %d, %d", x, y, z, d, type_byte_size,
+          array_byte_size);
+  struct NodeDataFloat* input_node_data_buffer = &s_input_node_data_buffer[0];
+
+  if (input_node_data_buffer->max_buf_byte_size < array_byte_size) {
     TFMLOGE("ERROR! input buffer size is too small! %d < %d",
-            s_input_node_data_float_buffer.buf_size, array_byte_size);
+            input_node_data_buffer->max_buf_byte_size, array_byte_size);
     return false;
   }
-  memcpy(s_input_node_data_float_buffer.byte_array_data,
-         array_data, array_byte_size);
-  s_input_node_data_float_buffer.array_size = array_byte_size;
-  s_input_node_data_float_buffer.x = x;
-  s_input_node_data_float_buffer.y = y;
-  s_input_node_data_float_buffer.z = z;
-  s_input_node_data_float_buffer.d = d;
+  memcpy(input_node_data_buffer->byte_array_data, array_data, array_byte_size);
+  input_node_data_buffer->array_byte_size = array_byte_size;
+  input_node_data_buffer->x = x;
+  input_node_data_buffer->y = y;
+  input_node_data_buffer->z = z;
+  input_node_data_buffer->d = d;
   return true;
 }
 
-int hexagon_controller_InitHexagonWithMaxAttributes(
-    int enable_dcvs, int bus_usage, int version) {
+int hexagon_controller_InitHexagonWithMaxAttributes(int enable_dcvs,
+                                                    int bus_usage,
+                                                    int version) {
   TFMLOGI("Init hexagon with max attributes (Controller version = %d)",
           HEXAGON_CONTROLLER_VERSION);
   const int MCPS = 1000;
@@ -239,17 +374,17 @@ int hexagon_controller_InitHexagonWithMaxAttributes(
   adspmsgd_start(0, RPCMEM_HEAP_DEFAULT, 4096);
 
   dspCV_Attribute attrib[] = {
-    // The below values will result in the maximum aDSP performance,
-    // at Turbo voltage.
-    // Slightly more MCPS than are available on current targets
-    {DSP_TOTAL_MCPS, MCPS},
-    // drive the clock to MAX on known targets
-    {DSP_MCPS_PER_THREAD, MCPS / 2},
-    // 12 GB/sec is slightly higher than the max realistic
-    // max BW on existing targets.
-    {PEAK_BUS_BANDWIDTH_MBPS, MBPS},
-    // This app is non-real time, and constantly reading/writing memory
-    {BUS_USAGE_PERCENT, bus_usage},
+      // The below values will result in the maximum aDSP performance,
+      // at Turbo voltage.
+      // Slightly more MCPS than are available on current targets
+      {DSP_TOTAL_MCPS, MCPS},
+      // drive the clock to MAX on known targets
+      {DSP_MCPS_PER_THREAD, MCPS / 2},
+      // 12 GB/sec is slightly higher than the max realistic
+      // max BW on existing targets.
+      {PEAK_BUS_BANDWIDTH_MBPS, MBPS},
+      // This app is non-real time, and constantly reading/writing memory
+      {BUS_USAGE_PERCENT, bus_usage},
   };
   int retval = 0;
   if (!enable_dcvs) {
@@ -263,13 +398,8 @@ int hexagon_controller_InitHexagonWithMaxAttributes(
       dspCV_initQ6_with_attributes(attrib, sizeof(attrib) / sizeof(attrib[0]));
   TFMLOGD("Return value from dspCV_initQ6() : %d\n", retval);
 
-  hexagon_controller_AllocateNodeDataBuffers(
-      INPUT_NODE_DATA_BUFFER_SIZE, OUTPUT_NODE_DATA_BUFFER_SIZE);
-
-  if (s_dbg_use_inception_dummy_data) {
-    hexagon_controller_InitInputNodeDataToInceptionDummyData(version);
-  }
   s_target_graph_id = 0;
+  s_dbg_inception_version = version;
 
   return retval;
 }
@@ -285,31 +415,36 @@ int hexagon_controller_DeInitHexagon() {
   return retval;
 }
 
-void hexagon_controller_GrowMemorySize() {
-  hexagon_nn_config();
-}
+void hexagon_controller_GrowMemorySize() { hexagon_nn_config(); }
 
-struct NodeDataFloat* hexagon_controller_GetInputNodeDataFloatBuffer() {
-  return &s_input_node_data_float_buffer;
+struct NodeDataFloat* hexagon_controller_GetInputNodeDataBuffer(int port) {
+  if (port >= GetInputNodeCount()) {
+    TFMLOGE("port should be less than 1");
+  }
+  return &s_input_node_data_buffer[port];
 }
 
-float* hexagon_controller_GetOutputNodeDataFloatBuffer(
-    const char *const node_name, int* out_array_size) {
-  *out_array_size = s_output_node_data_float_array_size;
-  return s_output_node_data_float_buffer;
+uint8_t* hexagon_controller_GetOutputNodeDataBuffer(int port,
+                                                    int* out_array_byte_size) {
+  if (port >= GetOutputNodeCount()) {
+    TFMLOGE("port should be less than 1");
+  }
+  *out_array_byte_size = s_output_node_data_array_byte_size[port];
+  return s_output_node_data_buffer[port];
 }
 
 // Append const node to the graph
-int hexagon_controller_AppendConstNode(
-    const char* const name, int graph_id, int node_id,
-    int batch, int height, int width, int depth,
-    const uint8_t* const data, int data_length) {
+int hexagon_controller_AppendConstNode(const char* const name, int graph_id,
+                                       int node_id, int batch, int height,
+                                       int width, int depth,
+                                       const uint8_t* const data,
+                                       int data_length) {
   if (DBG_SHOW_ID) {
-    TFMLOGV("---(CONST) %s, %d, %d, %d, %d, %d, %d",
-            name, node_id, batch, height, width, depth, data_length);
+    TFMLOGV("---(CONST) %s, %d, %d, %d, %d, %d, %d", name, node_id, batch,
+            height, width, depth, data_length);
   } else {
-    TFMLOGV("---(CONST) %s, %d, %d, %d, %d, %d",
-            name, batch, height, width, depth, data_length);
+    TFMLOGV("---(CONST) %s, %d, %d, %d, %d, %d", name, batch, height, width,
+            depth, data_length);
   }
   const int retval = hexagon_nn_append_const_node(
       graph_id, node_id, batch, height, width, depth, data, data_length);
@@ -321,11 +456,12 @@ int hexagon_controller_AppendConstNode(
 }
 
 // Append node to the graph
-int hexagon_controller_AppendNode(
-    const char* const name, int graph_id, int node_id, int ops_id,
-    int padding_id, const hexagon_nn_input* const inputs,
-    int inputs_count, const hexagon_nn_output* const outputs,
-    int outputs_count) {
+int hexagon_controller_AppendNode(const char* const name, int graph_id,
+                                  int node_id, int ops_id, int padding_id,
+                                  const hexagon_nn_input* const inputs,
+                                  int inputs_count,
+                                  const hexagon_nn_output* const outputs,
+                                  int outputs_count) {
   char input_param_buf[OUTPUT_PARAM_MAX_LINE_SIZE];
   memset(input_param_buf, 0, OUTPUT_PARAM_MAX_LINE_SIZE);
   int pos = 0;
@@ -335,8 +471,8 @@ int hexagon_controller_AppendNode(
       pos += snprintf(&input_param_buf[pos], 500, "(%d, %d), ",
                       inputs[i].src_id, inputs[i].output_idx);
     } else {
-      pos += snprintf(&input_param_buf[pos], 500, "(%d), ",
-                      inputs[i].output_idx);
+      pos +=
+          snprintf(&input_param_buf[pos], 500, "(%d), ", inputs[i].output_idx);
     }
   }
 
@@ -349,18 +485,16 @@ int hexagon_controller_AppendNode(
   }
 
   if (DBG_SHOW_ID) {
-    TFMLOGV("---(OP) %s, %d, %d, %d, %d, %d, %s, %s", name, node_id,
-            ops_id, padding_id, inputs_count, outputs_count, input_param_buf,
+    TFMLOGV("---(OP) %s, %d, %d, %d, %d, %d, %s, %s", name, node_id, ops_id,
+            padding_id, inputs_count, outputs_count, input_param_buf,
             output_param_buf);
   } else {
-    TFMLOGV("---(OP) %s, %d, %d, %d, %d, %s, %s", name,
-            ops_id, padding_id, inputs_count, outputs_count, input_param_buf,
-            output_param_buf);
+    TFMLOGV("---(OP) %s, %d, %d, %d, %d, %s, %s", name, ops_id, padding_id,
+            inputs_count, outputs_count, input_param_buf, output_param_buf);
   }
-  const int retval = hexagon_nn_append_node(
-      graph_id, node_id, ops_id, padding_id,
-      inputs, inputs_count,
-      outputs, outputs_count);
+  const int retval =
+      hexagon_nn_append_node(graph_id, node_id, ops_id, padding_id, inputs,
+                             inputs_count, outputs, outputs_count);
   if (retval != 0) {
     TFMLOGE("Failed to append const node %d", node_id);
     return retval;
@@ -375,13 +509,3 @@ void hexagon_controller_EnableDbgUseInceptionDummyData(bool enable) {
 bool hexagon_controller_IsDbgUseInceptionDummyDataEnabled() {
   return s_dbg_use_inception_dummy_data;
 }
-
-void hexagon_controller_PrintLog(uint32_t nn_id) {
-  unsigned char *buf;
-  if ((buf = malloc(PRINT_BUFSIZE)) == NULL) {
-    return;
-  }
-  hexagon_nn_getlog(nn_id, buf, PRINT_BUFSIZE);
-  TFMLOGE("DUMP HEXAGON LOG: %s", buf);
-  free(buf);
-}
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_impl/include/hexagon_controller.h b/tensorflow/contrib/hvx/hexagon_controller/src_impl/include/hexagon_controller.h
index ab8c80c0f328085073770c87da71f608dd4704c4..fc921ff8b980eb3501bd06bc2882c13c5eee98f5 100644
--- a/tensorflow/contrib/hvx/hexagon_controller/src_impl/include/hexagon_controller.h
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_impl/include/hexagon_controller.h
@@ -40,16 +40,37 @@ int hexagon_controller_GetWrapperVersion();
 
 int hexagon_controller_GetHexagonBinaryVersion();
 
+// Buffer operations
+bool hexagon_controller_SetAllInputTensorDef(int node_count,
+                                             hexagon_nn_tensordef* tensordef);
+
+bool hexagon_controller_SetAllInputTensorDef(int node_count,
+                                             hexagon_nn_tensordef* tensordef);
+
 // Hexagon perf functions
 int hexagon_controller_InitHexagonWithMaxAttributes(int enable_dcvs,
                                                     int bus_usage, int version);
 
+bool hexagon_controller_AllocateInputNodeDataBuffersWithPort(int port,
+                                                             int input_size);
+
+bool hexagon_controller_AllocateOutNodeDataBuffersWithPort(int port,
+                                                           int output_size);
+
 bool hexagon_controller_AllocateNodeDataBuffers(int input_size,
                                                 int output_size);
 
+bool hexagon_controller_AllocateMultipleNodeDataBuffers(int input_count,
+                                                        int* input_sizes,
+                                                        int output_count,
+                                                        int* output_sizes);
+
+bool hexagon_controller_ReleaseInputNodeDataBuffersWithPort(int port);
+bool hexagon_controller_ReleaseOutputNodeDataBuffersWithPort(int port);
+
 bool hexagon_controller_ReleaseNodeDataBuffers();
 
-bool hexagon_controller_CopyByteNodeData(int x, int y, int z, int d,
+bool hexagon_controller_CopyByteNodeData(int port, int x, int y, int z, int d,
                                          int type_byte_size,
                                          uint8_t* array_data);
 
@@ -63,10 +84,10 @@ void hexagon_controller_SetTargetGraphId(uint32_t graph_id);
 void hexagon_controller_GrowMemorySize();
 
 // Graph data transfer functions
-struct NodeDataFloat* hexagon_controller_GetInputNodeDataFloatBuffer();
+struct NodeDataFloat* hexagon_controller_GetInputNodeDataBuffer(int port);
 
-float* hexagon_controller_GetOutputNodeDataFloatBuffer(
-    const char* const node_name, int* out_array_size);
+uint8_t* hexagon_controller_GetOutputNodeDataBuffer(int port,
+                                                    int* out_array_byte_size);
 
 // Graph functions
 uint32_t hexagon_controller_InstantiateGraph();
@@ -79,6 +100,10 @@ uint32_t hexagon_controller_SetupGraph(int version);
 
 bool hexagon_controller_ExecuteInceptionDummyData(uint32_t nn_id);
 
+bool hexagon_controller_ExecuteGraphWithMultipleInOut(
+    const uint32_t nn_id, const int input_count, hexagon_nn_tensordef* inputs,
+    const int output_count, hexagon_nn_tensordef* outputs);
+
 bool hexagon_controller_ExecuteGraph(
     const uint32_t nn_id, const uint32_t batches, const uint32_t height,
     const uint32_t width, const uint32_t depth, uint8_t* int_data,
@@ -117,8 +142,6 @@ void hexagon_controller_EnableDbgUseInceptionDummyData(bool enable);
 
 bool hexagon_controller_IsDbgUseInceptionDummyDataEnabled();
 
-void hexagon_controller_PrintLog(uint32_t nn_id);
-
 #ifdef __cplusplus
 }
 #endif  // __cplusplus
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_log/include/tfm_log.h b/tensorflow/contrib/hvx/hexagon_controller/src_log/include/tfm_log.h
index e8615fd4ec030abb1c25b955dd6f793c0af5e651..8d11ee4a3400a247e4951d5494c30222eb25109b 100644
--- a/tensorflow/contrib/hvx/hexagon_controller/src_log/include/tfm_log.h
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_log/include/tfm_log.h
@@ -33,6 +33,9 @@ static inline bool IsLogOn(int log_level) { return log_level >= s_log_level; }
 
 static inline void SetLogLevel(int log_level) { s_log_level = log_level; }
 
+// Do nothing
+static inline void SetExperimentalDebug() {}
+
 #define TFMLOGV(fmt, ...)                       \
   do {                                          \
     if (!IsLogOn(TFM_LOG_LEVEL_VERBOSE)) break; \
@@ -71,4 +74,9 @@ static inline void LogDHexagon(const char* fmt, ...) {
   va_end(ap);
 }
 
+static inline void DumpNNId(uint32_t nn_id) {
+  // TODO(satok): Dump more information
+  TFMLOGI("NN Id = %d", nn_id);
+}
+
 #endif
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/include/node_data_float.h b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/include/node_data_float.h
index a9c3296e9f49474b193bcc8c686f3c21b16cd23a..c7034cc3a0d19f912b0b29b8ca0313d7528fc079 100644
--- a/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/include/node_data_float.h
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/include/node_data_float.h
@@ -28,9 +28,8 @@ struct NodeDataFloat {
   int y;
   int z;
   int d;
-  int buf_size;
-  int array_size;
-  float* array_data;
+  int max_buf_byte_size;
+  int array_byte_size;
   uint8_t* byte_array_data;
   char node_name[NODE_DATA_FLOAT_NODE_NAME_BUF_SIZE];
 };
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/include/soc_interface.h b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/include/soc_interface.h
index 6d85e6ce487da5d13342df622fed4ca8c07176d2..30fad13fb5f6ed1064784bc45448c786e401e49c 100644
--- a/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/include/soc_interface.h
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/include/soc_interface.h
@@ -43,13 +43,30 @@ bool soc_interface_Finalize();
 bool soc_interface_ExecuteGraph();
 // Teardown graph setup
 bool soc_interface_TeardownGraph();
+
+// Allocate buffers for input node and output node
+bool soc_interface_AllocateInOutNodeBuffers(int input_count, int* input_sizes,
+                                            int output_count,
+                                            int* output_sizes);
+
+// Send input data to SOC with port
+bool soc_interface_FillInputNodeWithPort(int port, int x, int y, int z, int d,
+                                         const uint8_t* const buf,
+                                         uint64_t buf_byte_size);
+
 // Send input data to SOC
 bool soc_interface_FillInputNodeFloat(int x, int y, int z, int d,
                                       const uint8_t* const buf,
-                                      uint64_t buf_size);
+                                      uint64_t buf_byte_size);
+
+// Load output data from SOC with port
+bool soc_interface_ReadOutputNodeWithPort(int port, uint8_t** buf,
+                                          uint64_t* buf_byte_size);
+
 // Load output data from SOC
 bool soc_interface_ReadOutputNodeFloat(const char* const node_name,
-                                       uint8_t** buf, uint64_t* buf_size);
+                                       uint8_t** buf, uint64_t* buf_byte_size);
+
 // Setup graph
 // TODO(satok): Remove and use runtime version
 bool soc_interface_setupDummyGraph(int version);
diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/soc_interface.c b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/soc_interface.c
index 7db8d4870c75b66d757f0640b7cd9b6a3b7b3845..a1387ee5736e1b118c45a55f32b164681a8af521 100755
--- a/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/soc_interface.c
+++ b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/soc_interface.c
@@ -22,7 +22,12 @@ limitations under the License.
 #include "node_data_float.h"
 #include "tfm_log.h"
 
+// to demonstrate the performance difference between ION and HLOS memory
+// for sharing with ADSP.
+#define USE_ION_MEMORY
+
 const int64_t FLAG_ENABLE_INCEPTION_DUMMY_BINARY_INPUT = 0x01;
+const int64_t FLAG_ENABLE_EXPERIMENTAL_DEBUG = 0x02;
 
 static const int INCEPTION_VERSION = 3;
 
@@ -84,48 +89,62 @@ bool soc_interface_TeardownGraph() {
   return true;
 }
 
-bool soc_interface_FillInputNodeFloat(
-    int x, int y, int z, int d, const uint8_t* const buf,
-    uint64_t buf_size) {
-  TFMLOGD("FillInputNodeFloat");
-  struct NodeDataFloat* node_data_float =
-      hexagon_controller_GetInputNodeDataFloatBuffer();
-  const int array_size = x * y * z * d;
-  if (array_size > node_data_float->buf_size) {
-    TFMLOGE("Array size exceeds buf size %d > %d",
-            array_size, node_data_float->buf_size);
-    return false;
-  }
-  if (buf_size != array_size * sizeof(float)) {
-    TFMLOGE("Invalid buf size!");
+bool soc_interface_AllocateInOutNodeBuffers(int input_count, int* input_sizes,
+                                            int output_count,
+                                            int* output_sizes) {
+  TFMLOGD("AllocateInOutNodeBuffers");
+  return hexagon_controller_AllocateMultipleNodeDataBuffers(
+      input_count, input_sizes, output_count, output_sizes);
+}
+
+bool soc_interface_FillInputNodeWithPort(int port, int x, int y, int z, int d,
+                                         const uint8_t* const buf,
+                                         uint64_t buf_byte_size) {
+  TFMLOGD("FillInputNodeWithPort %d", port);
+  struct NodeDataFloat* node_data =
+      hexagon_controller_GetInputNodeDataBuffer(port);
+  if (buf_byte_size > node_data->max_buf_byte_size) {
+    TFMLOGE("buf size exceeds max buf size");
     return false;
   }
-  memcpy(node_data_float->byte_array_data, buf, buf_size);
-  node_data_float->x = x;
-  node_data_float->y = y;
-  node_data_float->z = z;
-  node_data_float->d = d;
-  node_data_float->array_size = buf_size;
+  memcpy(node_data->byte_array_data, buf, buf_byte_size);
+  node_data->x = x;
+  node_data->y = y;
+  node_data->z = z;
+  node_data->d = d;
+  node_data->array_byte_size = buf_byte_size;
   return true;
 }
 
+bool soc_interface_FillInputNodeFloat(int x, int y, int z, int d,
+                                      const uint8_t* const buf,
+                                      uint64_t buf_byte_size) {
+  return soc_interface_FillInputNodeWithPort(
+      /*port=*/0, x, y, z, d, buf, buf_byte_size);
+}
+
 // TODO(satok): Remove and use runtime version
-bool soc_interface_ReadOutputNodeFloat(
-    const char* const node_name, uint8_t** buf, uint64_t *buf_size) {
-  TFMLOGD("ReadOutputNodeFloat");
-  int array_size = -1;
-  float* output_node_data_float =
-      hexagon_controller_GetOutputNodeDataFloatBuffer(node_name, &array_size);
-  if (array_size < 0) {
+bool soc_interface_ReadOutputNodeWithPort(int port, uint8_t** buf,
+                                          uint64_t* buf_byte_size) {
+  TFMLOGD("ReadOutputNodeWithPort");
+  int array_byte_size = -1;
+  uint8_t* output_node_data_buffer =
+      hexagon_controller_GetOutputNodeDataBuffer(port, &array_byte_size);
+  if (array_byte_size < 0) {
     TFMLOGE("Failed to read data.");
     return false;
   }
-  *buf = (uint8_t*)output_node_data_float;
-  *buf_size = array_size * sizeof(float);
+  *buf = output_node_data_buffer;
+  *buf_byte_size = array_byte_size;
   return true;
 }
 
-bool soc_interface_SetupGraphDummy(int version) {
+bool soc_interface_ReadOutputNodeFloat(const char* const node_name,
+                                       uint8_t** buf, uint64_t* buf_byte_size) {
+  return soc_interface_ReadOutputNodeWithPort(/*port=*/0, buf, buf_byte_size);
+}
+
+bool soc_interface_setupDummyGraph(int version) {
   TFMLOGD("SetupGraphDummy");
   const uint32_t graph_id = hexagon_controller_SetupGraph(version);
   if (graph_id == 0) {
@@ -136,12 +155,14 @@ bool soc_interface_SetupGraphDummy(int version) {
   return true;
 }
 
-bool soc_interface_AllocateNodeInputAndNodeOutputArray(
-    int total_input_count, int total_output_count) {
+bool soc_interface_AllocateNodeInputAndNodeOutputArray(int total_input_count,
+                                                       int total_output_count) {
   TFMLOGD("Allocate node inputs and node outputs array %d, %d",
           total_input_count, total_output_count);
-  s_node_inputs_array = malloc(total_input_count * sizeof(hexagon_nn_input));
-  s_node_outputs_array = malloc(total_output_count * sizeof(hexagon_nn_output));
+  posix_memalign((void**)&s_node_inputs_array, 128,
+                 total_input_count * sizeof(hexagon_nn_input));
+  posix_memalign((void**)&s_node_outputs_array, 128,
+                 total_output_count * sizeof(hexagon_nn_output));
   s_node_inputs_array_index = 0;
   s_node_outputs_array_index = 0;
   s_node_inputs_array_max_count = total_input_count;
@@ -188,9 +209,9 @@ void* soc_interface_SetOneNodeOutputs(int output_count, int* max_size) {
 }
 
 // Append const node to the graph
-bool soc_interface_AppendConstNode(
-    const char* const name, int node_id, int batch, int height, int width, int depth,
-    const uint8_t* const data, int data_length) {
+bool soc_interface_AppendConstNode(const char* const name, int node_id,
+                                   int batch, int height, int width, int depth,
+                                   const uint8_t* const data, int data_length) {
   const uint32_t graph_id = hexagon_controller_GetTargetGraphId();
   const int retval = hexagon_controller_AppendConstNode(
       name, graph_id, node_id, batch, height, width, depth, data, data_length);
@@ -202,14 +223,14 @@ bool soc_interface_AppendConstNode(
 }
 
 // Append node to the graph
-bool soc_interface_AppendNode(
-    const char* const name, int node_id, int ops_id, int padding_id, const void* const inputs,
-    int inputs_count, const void* const outputs, int outputs_count) {
+bool soc_interface_AppendNode(const char* const name, int node_id, int ops_id,
+                              int padding_id, const void* const inputs,
+                              int inputs_count, const void* const outputs,
+                              int outputs_count) {
   const uint32_t graph_id = hexagon_controller_GetTargetGraphId();
   const int retval = hexagon_controller_AppendNode(
-      name, graph_id, node_id, ops_id, padding_id,
-      (hexagon_nn_input*) inputs, inputs_count,
-      (hexagon_nn_output*) outputs, outputs_count);
+      name, graph_id, node_id, ops_id, padding_id, (hexagon_nn_input*)inputs,
+      inputs_count, (hexagon_nn_output*)outputs, outputs_count);
   if (retval != 0) {
     TFMLOGE("Failed to append const node %d", node_id);
     return false;
@@ -217,7 +238,6 @@ bool soc_interface_AppendNode(
   return true;
 }
 
-
 // Instantiate graph
 bool soc_interface_InstantiateGraph() {
   const uint32_t nn_id = hexagon_controller_InstantiateGraph();
@@ -240,5 +260,7 @@ void soc_interface_SetDebugFlag(uint64_t flag) {
   if ((flag & FLAG_ENABLE_INCEPTION_DUMMY_BINARY_INPUT) != 0) {
     TFMLOGI("Enable always use panda data");
     hexagon_controller_EnableDbgUseInceptionDummyData(true);
+  } else if ((flag & FLAG_ENABLE_EXPERIMENTAL_DEBUG) != 0) {
+    SetExperimentalDebug();
   }
 }
diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
index 5309724580cb73fe28ac27e16c82340569e00a64..3497c84d582abbb461273a40458e8d572a13838f 100644
--- a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
+++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
@@ -30,6 +30,6 @@ cc_binary(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:remote_fused_graph_execute_utils",
         "//tensorflow/core/kernels/hexagon:graph_transferer",
-        "//tensorflow/tools/graph_transforms:transform_utils",
+        "//tensorflow/tools/graph_transforms:file_utils",
     ],
 )
diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc b/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
index 6ae7c4a7420e8d7a58bc0a83e14e792b442f6d5d..60281951dda94008cad3a164be67d6fe8b59a916 100644
--- a/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
+++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
@@ -24,19 +24,25 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h"
+#include "tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h"
 #include "tensorflow/core/kernels/remote_fused_graph_execute_utils.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/command_line_flags.h"
-#include "tensorflow/tools/graph_transforms/transform_utils.h"
+#include "tensorflow/tools/graph_transforms/file_utils.h"
 
 namespace tensorflow {
+
 namespace {
-static int ParseFlags(int argc, char* argv[], string* in_graph) {
+static int ParseFlags(int argc, char* argv[], string* in_graph,
+                      bool* dump_all_nodes, bool* dump_shape_and_type) {
   std::vector<Flag> flag_list = {
-      Flag("in_graph", in_graph, "input graph file name"),
+      Flag("in_graph", in_graph, "Input graph file name to check hvx support."),
+      Flag("dump_all_nodes", dump_all_nodes, "Dump all nodes in the model."),
+      Flag("dump_shape_and_type", dump_shape_and_type,
+           "Dump shape and type of nodes"),
   };
   CHECK(Flags::Parse(&argc, argv, flag_list));
   // We need to call this to set up global state for TensorFlow.
@@ -48,12 +54,25 @@ static int ParseFlags(int argc, char* argv[], string* in_graph) {
   return 0;
 }
 
-static void SummarizeNode(const NodeDef& node_def) {
+static void SummarizeNode(const NodeDef& node_def,
+                          const bool dump_shape_and_type) {
   LOG(INFO) << "Node(" << node_def.name() << ")";
   LOG(INFO) << "  op: " << node_def.op();
   for (const string& input : node_def.input()) {
     LOG(INFO) << " Input: " << input;
   }
+  std::vector<DataType> data_types;
+  std::vector<TensorShape> shapes;
+  const Status status = RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
+      node_def, &data_types, &shapes);
+  if (data_types.empty() || shapes.empty()) {
+    return;
+  }
+  CHECK_EQ(data_types.size(), shapes.size());
+  for (int i = 0; i < data_types.size(); ++i) {
+    LOG(INFO) << " Output(" << i << "): " << DataType_Name(data_types.at(i))
+              << ", " << shapes.at(i).DebugString();
+  }
 }
 
 static void DumpRemoteFusedGraph(const NodeDef& node_def) {
@@ -89,10 +108,14 @@ static void DumpRemoteFusedGraph(const NodeDef& node_def) {
   }
 }
 
-static void CheckOpsSupport(const GraphDef& graph_def) {
-  const IGraphTransferOpsDefinitions& ops_definition =
+static void CheckOpsSupport(const GraphDef& graph_def,
+                            const bool dump_all_nodes,
+                            const bool dump_shape_and_type) {
+  const IRemoteFusedGraphOpsDefinitions& ops_definition =
       HexagonOpsDefinitions::getInstance();
   LOG(INFO) << "Checking " << graph_def.node_size() << " nodes";
+  LOG(INFO) << "dump_all_nodes = " << dump_all_nodes
+            << ", dump_shape_and_tpye = " << dump_shape_and_type;
 
   std::unordered_set<string> unsupported_ops;
   bool all_supported = true;
@@ -105,7 +128,7 @@ static void CheckOpsSupport(const GraphDef& graph_def) {
     }
     // TODO(satok): Set correct data type if it's given.
     const int op_id = ops_definition.GetOpIdFor(node.op(), {});
-    if (op_id == IGraphTransferOpsDefinitions::INVALID_OP_ID) {
+    if (op_id == IRemoteFusedGraphOpsDefinitions::INVALID_OP_ID) {
       all_supported = false;
       LOG(ERROR) << "OP type: " << node.op() << " is not supported on hvx. "
                  << "Name = " << node.name();
@@ -125,9 +148,9 @@ static void CheckOpsSupport(const GraphDef& graph_def) {
     LOG(INFO) << count << " ops are not supported.";
   }
 
-  if (contains_remote_graph) {
+  if (contains_remote_graph || dump_all_nodes) {
     for (const NodeDef& node : graph_def.node()) {
-      SummarizeNode(node);
+      SummarizeNode(node, dump_shape_and_type);
     }
   }
 }
@@ -137,7 +160,10 @@ static void CheckOpsSupport(const GraphDef& graph_def) {
 
 int main(int argc, char** argv) {
   tensorflow::string in_graph;
-  const int ret = tensorflow::ParseFlags(argc, argv, &in_graph);
+  bool dump_all_nodes;
+  bool dump_shape_and_type;
+  const int ret = tensorflow::ParseFlags(argc, argv, &in_graph, &dump_all_nodes,
+                                         &dump_shape_and_type);
   if (ret != 0) {
     return ret;
   }
@@ -146,6 +172,6 @@ int main(int argc, char** argv) {
   TF_CHECK_OK(tensorflow::graph_transforms::LoadTextOrBinaryGraphFile(
       in_graph, &graph_def));
 
-  tensorflow::CheckOpsSupport(graph_def);
+  tensorflow::CheckOpsSupport(graph_def, dump_all_nodes, dump_shape_and_type);
   return 0;
 }
diff --git a/tensorflow/contrib/image/BUILD b/tensorflow/contrib/image/BUILD
index a095f0e048a9b5831222d153a6c84cfa8ccd0dce..b2dfd7a0df330a95f0cab9debcbfd06339e1079a 100755
--- a/tensorflow/contrib/image/BUILD
+++ b/tensorflow/contrib/image/BUILD
@@ -105,6 +105,9 @@ tf_custom_op_library(
         "kernels/single_image_random_dot_stereograms_ops.cc",
         "ops/single_image_random_dot_stereograms_ops.cc",
     ],
+    deps = [
+        "@protobuf_archive//:protobuf",
+    ],
 )
 
 tf_gen_op_libs(
@@ -122,7 +125,11 @@ py_library(
     data = [":python/ops/_single_image_random_dot_stereograms.so"],
     srcs_version = "PY2AND3",
     deps = [
-        ":single_image_random_dot_stereograms_ops",
+        ":image_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:util",
     ],
 )
 
diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc
index 8a97f07732c4be43192f6ea8f6934118b49875f8..6adf837ca0ab506bd18f5e2e1fc1847e31d782bf 100644
--- a/tensorflow/contrib/image/kernels/image_ops.cc
+++ b/tensorflow/contrib/image/kernels/image_ops.cc
@@ -32,11 +32,11 @@ namespace functor {
 // Explicit instantiation of the CPU functor.
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-template class FillProjectiveTransform<CPUDevice, uint8>;
-template class FillProjectiveTransform<CPUDevice, int32>;
-template class FillProjectiveTransform<CPUDevice, int64>;
-template class FillProjectiveTransform<CPUDevice, float>;
-template class FillProjectiveTransform<CPUDevice, double>;
+template struct FillProjectiveTransform<CPUDevice, uint8>;
+template struct FillProjectiveTransform<CPUDevice, int32>;
+template struct FillProjectiveTransform<CPUDevice, int64>;
+template struct FillProjectiveTransform<CPUDevice, float>;
+template struct FillProjectiveTransform<CPUDevice, double>;
 
 }  // end namespace functor
 
@@ -116,7 +116,7 @@ namespace functor {
   void FillProjectiveTransform<GPUDevice, TYPE>::operator()(                \
       const GPUDevice& device, OutputType* output, const InputType& images, \
       const TransformsType& transform) const;                               \
-  extern template class FillProjectiveTransform<GPUDevice, TYPE>
+  extern template struct FillProjectiveTransform<GPUDevice, TYPE>
 
 TF_CALL_uint8(DECLARE_FUNCTOR);
 TF_CALL_int32(DECLARE_FUNCTOR);
diff --git a/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc
index 6efcc29654fe9417abdb325de400db07d04e8416..9f0bf37aed3fc9aeefb7602ef3fda4cfd76f1917 100755
--- a/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc
+++ b/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc
@@ -54,8 +54,8 @@ class SingleImageRandomDotStereogramsOp : public OpKernel {
   float normalize_min;
   float border_level;
   int number_colors;
-  ::tensorflow::TensorShapeProto output_image_shape;
-  ::tensorflow::TensorShapeProto output_data_window;
+  ::tensorflow::PartialTensorShape output_image_shape;
+  ::tensorflow::PartialTensorShape output_data_window;
 
   uint8 Cblack = 0;
   uint8 Cwhite = 255;
@@ -109,15 +109,15 @@ class SingleImageRandomDotStereogramsOp : public OpKernel {
     input_Yvalue =
         input_tensor.shape().dim_size(0);  // Y value is the number of rows
 
-    output_Ximage = output_image_shape.dim(0).size();
-    output_Yimage = output_image_shape.dim(1).size();
-    output_Cimage = output_image_shape.dim(2).size();
+    output_Ximage = output_image_shape.dim_size(0);
+    output_Yimage = output_image_shape.dim_size(1);
+    output_Cimage = output_image_shape.dim_size(2);
 
     if (number_colors > 256)  // Go to full color image
       output_Cimage = 3;
 
-    int data_Xwindow = output_data_window.dim(0).size();
-    int data_Ywindow = output_data_window.dim(1).size();
+    int data_Xwindow = output_data_window.dim_size(0);
+    int data_Ywindow = output_data_window.dim_size(1);
 
     int deltaX_border_image = output_Ximage - data_Xwindow;
     int deltaY_border_image = output_Yimage - data_Ywindow;
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index b396dcea2118a2aa602d71e0316ba7f272ff9599..aef3e385b57486d5cb3cb13d9e8b9519768abd7c 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -136,7 +136,7 @@ def transform(images, transforms, interpolation="NEAREST"):
        `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`,
        where `k = c0 x + c1 y + 1`. The transforms are *inverted* compared to
        the transform mapping input points to output points.
-     interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR".
+    interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR".
 
   Returns:
     Image(s) with the same type and shape as `images`, with the given
diff --git a/tensorflow/contrib/imperative/BUILD b/tensorflow/contrib/imperative/BUILD
index c01d3f2d05e0fa2b6d1561bcbe27dbd4b591de2e..df6e5c4e0af0664cc663614934cb5f695c52c05b 100644
--- a/tensorflow/contrib/imperative/BUILD
+++ b/tensorflow/contrib/imperative/BUILD
@@ -20,6 +20,18 @@ py_library(
     deps = [
         ":imperative_graph",
         ":imperative_mode",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variables",
     ],
 )
 
@@ -28,9 +40,16 @@ py_library(
     srcs = ["imperative_graph.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:framework",
-        "//tensorflow/python:ops",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python:util",
+        "//tensorflow/python:variables",
     ],
 )
 
@@ -41,7 +60,9 @@ py_library(
     deps = [
         ":imperative_graph",
         "//tensorflow/python:client",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework",
+        "//tensorflow/python:framework_ops",
     ],
 )
 
diff --git a/tensorflow/contrib/input_pipeline/BUILD b/tensorflow/contrib/input_pipeline/BUILD
index 9aa5763efcc099ddfd4a3180917cb6b66b29efe5..bb7857eb998beb89517985a401d5b7afe483d843 100644
--- a/tensorflow/contrib/input_pipeline/BUILD
+++ b/tensorflow/contrib/input_pipeline/BUILD
@@ -82,9 +82,10 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":input_pipeline_py",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variables",
     ],
diff --git a/tensorflow/contrib/input_pipeline/kernels/BUILD b/tensorflow/contrib/input_pipeline/kernels/BUILD
index 99bf1c87430c74f205e1b700357aec3abb52488a..f20a6e38d4e80f869e9274d6fc49338a95fc6788 100644
--- a/tensorflow/contrib/input_pipeline/kernels/BUILD
+++ b/tensorflow/contrib/input_pipeline/kernels/BUILD
@@ -13,7 +13,7 @@ cc_library(
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//third_party/eigen3",
-        "@protobuf//:protobuf_headers",
+        "@protobuf_archive//:protobuf_headers",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/contrib/integrate/BUILD b/tensorflow/contrib/integrate/BUILD
index d70be594b21ac828d715e3a24684cc9106c59e2d..66948c1ea1f3f239d3f43a57626f8c229fe24ad9 100644
--- a/tensorflow/contrib/integrate/BUILD
+++ b/tensorflow/contrib/integrate/BUILD
@@ -20,9 +20,11 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:functional_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:util",
+        "@six_archive//:six",
     ],
 )
 
diff --git a/tensorflow/contrib/integrate/python/ops/odes.py b/tensorflow/contrib/integrate/python/ops/odes.py
index d13844d6132bf504c503ee72cf4f72d9250febb3..b4a99867ed46897f60be3f230838c3f576d5455e 100644
--- a/tensorflow/contrib/integrate/python/ops/odes.py
+++ b/tensorflow/contrib/integrate/python/ops/odes.py
@@ -12,47 +12,55 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """ODE solvers for TensorFlow."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
 import collections
 
+import six
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import tensor_array_ops
 
-
-_ButcherTableau = collections.namedtuple(
-    '_ButcherTableau', 'alpha beta c_sol c_mid c_error')
+_ButcherTableau = collections.namedtuple('_ButcherTableau',
+                                         'alpha beta c_sol c_mid c_error')
 
 # Parameters from Shampine (1986), section 4.
 _DORMAND_PRINCE_TABLEAU = _ButcherTableau(
-    alpha=[1/5, 3/10, 4/5, 8/9, 1., 1.],
-    beta=[[1/5],
-          [3/40, 9/40],
-          [44/45, -56/15, 32/9],
-          [19372/6561, -25360/2187, 64448/6561, -212/729],
-          [9017/3168, -355/33, 46732/5247, 49/176, -5103/18656],
-          [35/384, 0, 500/1113, 125/192, -2187/6784, 11/84]],
-    c_sol=[35/384, 0, 500/1113, 125/192, -2187/6784, 11/84, 0],
-    c_mid=[6025192743/30085553152 / 2, 0, 51252292925/65400821598 / 2,
-           -2691868925/45128329728 / 2, 187940372067/1594534317056 / 2,
-           -1776094331/19743644256 / 2, 11237099/235043384 / 2],
-    c_error=[1951/21600 - 35/384,
-             0,
-             22642/50085 - 500/1113,
-             451/720 - 125/192,
-             -12231/42400 - -2187/6784,
-             649/6300 - 11/84,
-             1/60],
-)
+    alpha=[1 / 5, 3 / 10, 4 / 5, 8 / 9, 1., 1.],
+    beta=[
+        [1 / 5],
+        [3 / 40, 9 / 40],
+        [44 / 45, -56 / 15, 32 / 9],
+        [19372 / 6561, -25360 / 2187, 64448 / 6561, -212 / 729],
+        [9017 / 3168, -355 / 33, 46732 / 5247, 49 / 176, -5103 / 18656],
+        [35 / 384, 0, 500 / 1113, 125 / 192, -2187 / 6784, 11 / 84],
+    ],
+    c_sol=[35 / 384, 0, 500 / 1113, 125 / 192, -2187 / 6784, 11 / 84, 0],
+    c_mid=[
+        6025192743 / 30085553152 / 2, 0, 51252292925 / 65400821598 / 2,
+        -2691868925 / 45128329728 / 2, 187940372067 / 1594534317056 / 2,
+        -1776094331 / 19743644256 / 2, 11237099 / 235043384 / 2
+    ],
+    c_error=[
+        1951 / 21600 - 35 / 384,
+        0,
+        22642 / 50085 - 500 / 1113,
+        451 / 720 - 125 / 192,
+        -12231 / 42400 - -2187 / 6784,
+        649 / 6300 - 11 / 84,
+        1 / 60,
+    ],)
 
 
 def _possibly_nonzero(x):
@@ -64,9 +72,10 @@ def _scaled_dot_product(scale, xs, ys, name=None):
   with ops.name_scope(name, 'scaled_dot_product', [scale, xs, ys]) as scope:
     # Some of the parameters in our Butcher tableau include zeros. Using
     # _possibly_nonzero lets us avoid wasted computation.
-    return math_ops.add_n([(scale * x) * y for x, y in zip(xs, ys)
-                           if _possibly_nonzero(x) or _possibly_nonzero(y)],
-                          name=scope)
+    return math_ops.add_n(
+        [(scale * x) * y for x, y in zip(xs, ys)
+         if _possibly_nonzero(x) or _possibly_nonzero(y)],
+        name=scope)
 
 
 def _dot_product(xs, ys, name=None):
@@ -75,7 +84,12 @@ def _dot_product(xs, ys, name=None):
     return math_ops.add_n([x * y for x, y in zip(xs, ys)], name=scope)
 
 
-def _runge_kutta_step(func, y0, f0, t0, dt, tableau=_DORMAND_PRINCE_TABLEAU,
+def _runge_kutta_step(func,
+                      y0,
+                      f0,
+                      t0,
+                      dt,
+                      tableau=_DORMAND_PRINCE_TABLEAU,
                       name=None):
   """Take an arbitrary Runge-Kutta step and estimate error.
 
@@ -115,8 +129,8 @@ def _runge_kutta_step(func, y0, f0, t0, dt, tableau=_DORMAND_PRINCE_TABLEAU,
 
     y1 = array_ops.identity(yi, name='%s/y1' % scope)
     f1 = array_ops.identity(k[-1], name='%s/f1' % scope)
-    y1_error = _scaled_dot_product(dt_cast, tableau.c_error, k,
-                                   name='%s/y1_error' % scope)
+    y1_error = _scaled_dot_product(
+        dt_cast, tableau.c_error, k, name='%s/y1_error' % scope)
     return (y1, f1, y1_error, k)
 
 
@@ -208,15 +222,15 @@ def _optimal_step_size(last_step,
                        order=5,
                        name=None):
   """Calculate the optimal size for the next Runge-Kutta step."""
-  with ops.name_scope(
-      name, 'optimal_step_size', [last_step, error_ratio]) as scope:
+  with ops.name_scope(name, 'optimal_step_size', [last_step,
+                                                  error_ratio]) as scope:
     error_ratio = math_ops.cast(error_ratio, last_step.dtype)
     exponent = math_ops.cast(1 / order, last_step.dtype)
     # this looks more complex than necessary, but importantly it keeps
     # error_ratio in the numerator so we can't divide by zero:
-    factor = math_ops.maximum(
-        1 / ifactor,
-        math_ops.minimum(error_ratio ** exponent / safety, 1 / dfactor))
+    factor = math_ops.maximum(1 / ifactor,
+                              math_ops.minimum(error_ratio**exponent / safety,
+                                               1 / dfactor))
     return math_ops.div(last_step, factor, name=scope)
 
 
@@ -232,8 +246,9 @@ def _ta_append(tensor_array, value):
   return tensor_array.write(tensor_array.size(), value)
 
 
-class _RungeKuttaState(collections.namedtuple(
-    '_RungeKuttaState', 'y1, f1, t0, t1, dt, interp_coeff')):
+class _RungeKuttaState(
+    collections.namedtuple('_RungeKuttaState',
+                           'y1, f1, t0, t1, dt, interp_coeff')):
   """Saved state of the Runge Kutta solver.
 
   Attributes:
@@ -247,8 +262,8 @@ class _RungeKuttaState(collections.namedtuple(
   """
 
 
-class _History(collections.namedtuple(
-    '_History', 'integrate_points, error_ratio')):
+class _History(
+    collections.namedtuple('_History', 'integrate_points, error_ratio')):
   """Saved integration history for use in `info_dict`.
 
   Attributes:
@@ -258,6 +273,20 @@ class _History(collections.namedtuple(
   """
 
 
+def _assert_increasing(t):
+  assert_increasing = control_flow_ops.Assert(
+      math_ops.reduce_all(t[1:] > t[:-1]), ['`t` must be monotonic increasing'])
+  return ops.control_dependencies([assert_increasing])
+
+
+def _check_input_types(t, y0):
+  if not (y0.dtype.is_floating or y0.dtype.is_complex):
+    raise TypeError('`y0` must have a floating point or complex floating '
+                    'point dtype')
+  if not t.dtype.is_floating:
+    raise TypeError('`t` must have a floating point dtype')
+
+
 def _dopri5(func,
             y0,
             t,
@@ -277,24 +306,24 @@ def _dopri5(func,
     # automatically
     first_step = 1.0
 
-  with ops.name_scope(
-      name, 'dopri5',
-      [y0, t, rtol, atol, safety, ifactor, dfactor, max_num_steps]) as scope:
+  with ops.name_scope(name, 'dopri5', [
+      y0, t, rtol, atol, safety, ifactor, dfactor, max_num_steps
+  ]) as scope:
 
-    first_step = ops.convert_to_tensor(first_step, dtype=t.dtype,
-                                       name='first_step')
+    first_step = ops.convert_to_tensor(
+        first_step, dtype=t.dtype, name='first_step')
     safety = ops.convert_to_tensor(safety, dtype=t.dtype, name='safety')
     ifactor = ops.convert_to_tensor(ifactor, dtype=t.dtype, name='ifactor')
     dfactor = ops.convert_to_tensor(dfactor, dtype=t.dtype, name='dfactor')
-    max_num_steps = ops.convert_to_tensor(max_num_steps, dtype=dtypes.int32,
-                                          name='max_num_steps')
+    max_num_steps = ops.convert_to_tensor(
+        max_num_steps, dtype=dtypes.int32, name='max_num_steps')
 
     def adaptive_runge_kutta_step(rk_state, history, n_steps):
       """Take an adaptive Runge-Kutta step to integrate the ODE."""
       y0, f0, _, t0, dt, interp_coeff = rk_state
       with ops.name_scope('assertions'):
-        check_underflow = control_flow_ops.Assert(
-            t0 + dt > t0, ['underflow in dt', dt])
+        check_underflow = control_flow_ops.Assert(t0 + dt > t0,
+                                                  ['underflow in dt', dt])
         check_max_num_steps = control_flow_ops.Assert(
             n_steps < max_num_steps, ['max_num_steps exceeded'])
         check_numerics = control_flow_ops.Assert(
@@ -320,16 +349,16 @@ def _dopri5(func,
         f_next = control_flow_ops.cond(accept_step, lambda: f1, lambda: f0)
         t_next = control_flow_ops.cond(accept_step, lambda: t0 + dt, lambda: t0)
         interp_coeff = control_flow_ops.cond(
-            accept_step,
-            lambda: _interp_fit_rk(y0, y1, k, dt),
+            accept_step, lambda: _interp_fit_rk(y0, y1, k, dt),
             lambda: interp_coeff)
         dt_next = _optimal_step_size(dt, error_ratio, safety, ifactor, dfactor)
-        rk_state = _RungeKuttaState(
-            y_next, f_next, t0, t_next, dt_next, interp_coeff)
+        rk_state = _RungeKuttaState(y_next, f_next, t0, t_next, dt_next,
+                                    interp_coeff)
 
       with ops.name_scope('update/history'):
-        history = _History(_ta_append(history.integrate_points, t0 + dt),
-                           _ta_append(history.error_ratio, error_ratio))
+        history = _History(
+            _ta_append(history.integrate_points, t0 + dt),
+            _ta_append(history.error_ratio, error_ratio))
       return rk_state, history, n_steps + 1
 
     def interpolate(solution, history, rk_state, i):
@@ -337,18 +366,14 @@ def _dopri5(func,
       with ops.name_scope('interpolate'):
         rk_state, history, _ = control_flow_ops.while_loop(
             lambda rk_state, *_: t[i] > rk_state.t1,
-            adaptive_runge_kutta_step,
-            (rk_state, history, 0),
+            adaptive_runge_kutta_step, (rk_state, history, 0),
             name='integrate_loop')
-        y = _interp_evaluate(
-            rk_state.interp_coeff, rk_state.t0, rk_state.t1, t[i])
+        y = _interp_evaluate(rk_state.interp_coeff, rk_state.t0, rk_state.t1,
+                             t[i])
         solution = solution.write(i, y)
         return solution, history, rk_state, i + 1
 
-    assert_increasing = control_flow_ops.Assert(
-        math_ops.reduce_all(t[1:] > t[:-1]),
-        ['`t` must be monotonic increasing'])
-    with ops.control_dependencies([assert_increasing]):
+    with _assert_increasing(t):
       num_times = array_ops.size(t)
 
     solution = tensor_array_ops.TensorArray(
@@ -363,8 +388,7 @@ def _dopri5(func,
 
     solution, history, _, _ = control_flow_ops.while_loop(
         lambda _, __, ___, i: i < num_times,
-        interpolate,
-        (solution, history, rk_state, 1),
+        interpolate, (solution, history, rk_state, 1),
         name='interpolate_loop')
 
     y = solution.stack(name=scope)
@@ -373,9 +397,11 @@ def _dopri5(func,
       return y
     else:
       integrate_points = history.integrate_points.stack()
-      info_dict = {'num_func_evals': 6 * array_ops.size(integrate_points) + 1,
-                   'integrate_points': integrate_points,
-                   'error_ratio': history.error_ratio.stack()}
+      info_dict = {
+          'num_func_evals': 6 * array_ops.size(integrate_points) + 1,
+          'integrate_points': integrate_points,
+          'error_ratio': history.error_ratio.stack()
+      }
       return (y, info_dict)
 
 
@@ -390,7 +416,7 @@ def odeint(func,
            name=None):
   """Integrate a system of ordinary differential equations.
 
-  Solves the initial value problem for a non-stiff system of first order ode-s:
+  Solves the initial value problem for a non-stiff system of first order ODEs:
 
     ```
     dy/dt = func(y, t), y(t[0]) = y0
@@ -483,21 +509,109 @@ def odeint(func,
     # arbitrarily nested tuple. This will help performance and usability by
     # avoiding the need to pack/unpack in user functions.
     y0 = ops.convert_to_tensor(y0, name='y0')
-    if not (y0.dtype.is_floating or y0.dtype.is_complex):
-      raise TypeError('`y0` must have a floating point or complex floating '
-                      'point dtype')
-
     t = ops.convert_to_tensor(t, preferred_dtype=dtypes.float64, name='t')
-    if not t.dtype.is_floating:
-      raise TypeError('`t` must have a floating point dtype')
+    _check_input_types(t, y0)
 
     error_dtype = abs(y0).dtype
     rtol = ops.convert_to_tensor(rtol, dtype=error_dtype, name='rtol')
     atol = ops.convert_to_tensor(atol, dtype=error_dtype, name='atol')
 
-    return _dopri5(func, y0, t,
-                   rtol=rtol,
-                   atol=atol,
-                   full_output=full_output,
-                   name=scope,
-                   **options)
+    return _dopri5(
+        func,
+        y0,
+        t,
+        rtol=rtol,
+        atol=atol,
+        full_output=full_output,
+        name=scope,
+        **options)
+
+
+class _FixedGridIntegrator(six.with_metaclass(abc.ABCMeta)):
+  """Base class for fixed-grid ODE integrators."""
+
+  def integrate(self, evol_func, y0, time_grid):
+    time_delta_grid = time_grid[1:] - time_grid[:-1]
+
+    scan_func = self._make_scan_func(evol_func)
+
+    y_grid = functional_ops.scan(scan_func, (time_grid[:-1], time_delta_grid),
+                                 y0)
+    return array_ops.concat([[y0], y_grid], axis=0)
+
+  def _make_scan_func(self, evol_func):
+
+    def scan_func(y, t_and_dt):
+      t, dt = t_and_dt
+      dy = self._step_func(evol_func, t, dt, y)
+      dy = math_ops.cast(dy, dtype=y.dtype)
+      return y + dy
+
+    return scan_func
+
+  @abc.abstractmethod
+  def _step_func(self, evol_func, t, dt, y):
+    pass
+
+
+class _MidpointFixedGridIntegrator(_FixedGridIntegrator):
+
+  def _step_func(self, evol_func, t, dt, y):
+    dt_cast = math_ops.cast(dt, y.dtype)
+    # yn1 = yn + h * f(tn + h/2, yn + f(tn, yn) * h/2)
+    return dt_cast * evol_func(y + evol_func(y, t) * dt_cast / 2, t + dt / 2)
+
+
+class _RK4FixedGridIntegrator(_FixedGridIntegrator):
+
+  def _step_func(self, evol_func, t, dt, y):
+    k1 = evol_func(y, t)
+    half_step = t + dt / 2
+    dt_cast = math_ops.cast(dt, y.dtype)
+
+    k2 = evol_func(y + dt_cast * k1 / 2, half_step)
+    k3 = evol_func(y + dt_cast * k2 / 2, half_step)
+    k4 = evol_func(y + dt_cast * k3, t + dt)
+    return math_ops.add_n([k1, 2 * k2, 2 * k3, k4]) * (dt_cast / 6)
+
+
+def odeint_fixed(func, y0, t, method='rk4', name=None):
+  """ODE integration on a fixed grid (with no step size control).
+
+  Useful in certain scenarios to avoid the overhead of adaptive step size
+  control, e.g. when differentiation of the integration result is desired and/or
+  the time grid is known a priori to be sufficient.
+
+  Args:
+    func: Function that maps a Tensor holding the state `y` and a scalar Tensor
+      `t` into a Tensor of state derivatives with respect to time.
+    y0: N-D Tensor giving starting value of `y` at time point `t[0]`.
+    t: 1-D Tensor holding a sequence of time points for which to solve for
+      `y`. The initial time point should be the first element of this sequence,
+      and each time must be larger than the previous time. May have any floating
+      point dtype.
+    method: One of 'midpoint' or 'rk4'.
+    name: Optional name for the resulting operation.
+
+  Returns:
+    y: (N+1)-D tensor, where the first dimension corresponds to different
+      time points. Contains the solved value of y for each desired time point in
+      `t`, with the initial value `y0` being the first element along the first
+      dimension.
+
+  Raises:
+    ValueError: Upon caller errors.
+  """
+  with ops.name_scope(name, 'odeint_fixed', [y0, t]):
+    t = ops.convert_to_tensor(t, preferred_dtype=dtypes.float64, name='t')
+    y0 = ops.convert_to_tensor(y0, name='y0')
+    _check_input_types(t, y0)
+
+    with _assert_increasing(t):
+      with ops.name_scope(method):
+        if method == 'midpoint':
+          return _MidpointFixedGridIntegrator().integrate(func, y0, t)
+        elif method == 'rk4':
+          return _RK4FixedGridIntegrator().integrate(func, y0, t)
+        else:
+          raise ValueError('method not supported: {!s}'.format(method))
diff --git a/tensorflow/contrib/integrate/python/ops/odes_test.py b/tensorflow/contrib/integrate/python/ops/odes_test.py
index 009e1d1f77ca7efc976e86b13a838e399f1dd45b..3ec01212d25ca8dc6e13f340177a5e85138868d5 100644
--- a/tensorflow/contrib/integrate/python/ops/odes_test.py
+++ b/tensorflow/contrib/integrate/python/ops/odes_test.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 """Tests for ODE solvers."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -165,11 +166,9 @@ class OdeIntTest(test.TestCase):
 
     with self.test_session() as sess:
       y_solved_0, info_0 = sess.run(
-          odes.odeint(
-              self.func, self.y0, times0, full_output=True))
+          odes.odeint(self.func, self.y0, times0, full_output=True))
       y_solved_1, info_1 = sess.run(
-          odes.odeint(
-              self.func, self.y0, times1, full_output=True))
+          odes.odeint(self.func, self.y0, times1, full_output=True))
 
     self.assertAllClose(y_solved_0, y_solved_1[::10])
     self.assertEqual(info_0['num_func_evals'], info_1['num_func_evals'])
@@ -182,11 +181,9 @@ class OdeIntTest(test.TestCase):
         full_output=True, method='dopri5', options=dict(max_num_steps=2000))
     with self.test_session() as sess:
       _, info_0 = sess.run(
-          odes.odeint(
-              self.func, self.y0, t, rtol=0, atol=1e-6, **kwargs))
+          odes.odeint(self.func, self.y0, t, rtol=0, atol=1e-6, **kwargs))
       _, info_1 = sess.run(
-          odes.odeint(
-              self.func, self.y0, t, rtol=0, atol=1e-9, **kwargs))
+          odes.odeint(self.func, self.y0, t, rtol=0, atol=1e-9, **kwargs))
     self.assertAllClose(
         info_0['integrate_points'].size * 1000**0.2,
         float(info_1['integrate_points'].size),
@@ -243,5 +240,49 @@ class InterpolationTest(test.TestCase):
         sess.run(y_invalid)
 
 
+class OdeIntFixedTest(test.TestCase):
+
+  def _test_integrate_sine(self, method):
+
+    def evol_func(y, t):
+      del t
+      return array_ops.stack([y[1], -y[0]])
+
+    y0 = [0., 1.]
+    time_grid = np.linspace(0., 10., 200)
+    y_grid = odes.odeint_fixed(evol_func, y0, time_grid, method=method)
+
+    with self.test_session() as sess:
+      y_grid_array = sess.run(y_grid)
+
+    np.testing.assert_allclose(
+        y_grid_array[:, 0], np.sin(time_grid), rtol=1e-2, atol=1e-2)
+
+  def _test_integrate_gaussian(self, method):
+
+    def evol_func(y, t):
+      return -math_ops.cast(t, dtype=y.dtype) * y[0]
+
+    y0 = [1.]
+    time_grid = np.linspace(0., 2., 100)
+    y_grid = odes.odeint_fixed(evol_func, y0, time_grid, method=method)
+
+    with self.test_session() as sess:
+      y_grid_array = sess.run(y_grid)
+
+    np.testing.assert_allclose(
+        y_grid_array[:, 0], np.exp(-time_grid**2 / 2), rtol=1e-2, atol=1e-2)
+
+  def _test_everything(self, method):
+    self._test_integrate_sine(method)
+    self._test_integrate_gaussian(method)
+
+  def test_midpoint(self):
+    self._test_everything('midpoint')
+
+  def test_rk4(self):
+    self._test_everything('rk4')
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/keras/BUILD b/tensorflow/contrib/keras/BUILD
index 71ce6540d62530e1e03e99cd10b39f9f20d05cf3..b923fc7e9a1db6ed9a3cc4dad203af26b255d5e2 100644
--- a/tensorflow/contrib/keras/BUILD
+++ b/tensorflow/contrib/keras/BUILD
@@ -7,6 +7,7 @@ exports_files(["LICENSE"])
 
 package(default_visibility = ["//tensorflow:__subpackages__"])
 
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "py_test")
 
 py_library(
@@ -18,6 +19,7 @@ py_library(
         "api/keras/activations/__init__.py",
         "api/keras/applications/__init__.py",
         "api/keras/applications/inception_v3/__init__.py",
+        "api/keras/applications/mobilenet/__init__.py",
         "api/keras/applications/resnet50/__init__.py",
         "api/keras/applications/vgg16/__init__.py",
         "api/keras/applications/vgg19/__init__.py",
@@ -51,6 +53,7 @@ py_library(
         "python/keras/applications/__init__.py",
         "python/keras/applications/imagenet_utils.py",
         "python/keras/applications/inception_v3.py",
+        "python/keras/applications/mobilenet.py",
         "python/keras/applications/resnet50.py",
         "python/keras/applications/vgg16.py",
         "python/keras/applications/vgg19.py",
@@ -113,24 +116,35 @@ py_library(
         "//tensorflow/python:check_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:clip_ops",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:ctc_ops",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:functional_ops",
         "//tensorflow/python:gradients",
         "//tensorflow/python:image_ops",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:layers",
+        "//tensorflow/python:layers_base",
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
         "//tensorflow/python:platform",
+        "//tensorflow/python:random_ops",
         "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:tensor_array_grad",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python:training",
+        "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "@six_archive//:six",
     ],
 )
 
@@ -142,8 +156,8 @@ py_test(
     tags = ["notsan"],
     deps = [
         ":keras",
-        ":testing_utils",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -155,6 +169,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -166,6 +181,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -177,6 +193,8 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:init_ops",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -187,7 +205,6 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
-        ":testing_utils",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -200,8 +217,9 @@ py_test(
     tags = ["notsan"],
     deps = [
         ":keras",
-        ":testing_utils",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -213,6 +231,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -224,6 +243,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -238,6 +258,17 @@ py_test(
     ],
 )
 
+py_test(
+    name = "mobilenet_test",
+    size = "small",
+    srcs = ["python/keras/applications/mobilenet_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_test(
     name = "resnet50_test",
     size = "small",
@@ -289,7 +320,6 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
-        ":testing_utils",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -298,11 +328,13 @@ py_test(
     name = "convolutional_recurrent_test",
     size = "medium",
     srcs = ["python/keras/layers/convolutional_recurrent_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
+    tags = ["noasan"],  # times out b/63678675
     deps = [
         ":keras",
-        ":testing_utils",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -313,12 +345,13 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "manual",
+        "noasan",  # times out b/63678675
         "notsan",
     ],
     deps = [
         ":keras",
-        ":testing_utils",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -329,7 +362,6 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
-        ":testing_utils",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -341,8 +373,8 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
-        ":testing_utils",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -353,7 +385,6 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
-        ":testing_utils",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -365,8 +396,8 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
-        ":testing_utils",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -378,6 +409,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -388,19 +420,17 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
-        ":testing_utils",
         "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+cuda_py_test(
     name = "normalization_test",
     size = "small",
     srcs = ["python/keras/layers/normalization_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        ":testing_utils",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -413,8 +443,8 @@ py_test(
     tags = ["notsan"],
     deps = [
         ":keras",
-        ":testing_utils",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -426,8 +456,8 @@ py_test(
     tags = ["notsan"],  # http://b/62136390
     deps = [
         ":keras",
-        ":testing_utils",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -436,11 +466,14 @@ py_test(
     size = "medium",
     srcs = ["python/keras/layers/lstm_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["notsan"],  # http://b/62189182
+    tags = [
+        "noasan",  # times out b/63678675
+        "notsan",  # http://b/62189182
+    ],
     deps = [
         ":keras",
-        ":testing_utils",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -463,6 +496,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -474,8 +508,47 @@ py_test(
     tags = ["notsan"],
     deps = [
         ":keras",
-        ":testing_utils",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "data_utils_test",
+    size = "small",
+    srcs = ["python/keras/utils/data_utils_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "noasan",  # times out
+        "notsan",
+    ],
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "generic_utils_test",
+    size = "small",
+    srcs = ["python/keras/utils/generic_utils_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "imagenet_utils_test",
+    size = "small",
+    srcs = ["python/keras/applications/imagenet_utils_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -487,6 +560,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -498,6 +572,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -509,6 +584,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -520,8 +596,8 @@ py_test(
     tags = ["notsan"],
     deps = [
         ":keras",
-        ":testing_utils",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -533,8 +609,8 @@ py_test(
     tags = ["notsan"],
     deps = [
         ":keras",
-        ":testing_utils",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -545,7 +621,10 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -557,6 +636,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -568,6 +648,8 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -579,6 +661,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
+        "//tensorflow/python:util",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/contrib/keras/api/keras/activations/__init__.py b/tensorflow/contrib/keras/api/keras/activations/__init__.py
index e4d4b1e42cb43da6a1f555e0bf8741e9dcac6509..af6f249e71c9b6c5c23d0f3c9aef91e52b37e8a5 100644
--- a/tensorflow/contrib/keras/api/keras/activations/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/activations/__init__.py
@@ -23,6 +23,7 @@ from tensorflow.contrib.keras.python.keras.activations import elu
 from tensorflow.contrib.keras.python.keras.activations import hard_sigmoid
 from tensorflow.contrib.keras.python.keras.activations import linear
 from tensorflow.contrib.keras.python.keras.activations import relu
+from tensorflow.contrib.keras.python.keras.activations import selu
 from tensorflow.contrib.keras.python.keras.activations import sigmoid
 from tensorflow.contrib.keras.python.keras.activations import softmax
 from tensorflow.contrib.keras.python.keras.activations import softplus
diff --git a/tensorflow/contrib/keras/api/keras/applications/__init__.py b/tensorflow/contrib/keras/api/keras/applications/__init__.py
index fee5b7103ac4b6419e01662c30e84563c1b65acb..f943e84606be0e8276ea8750416a2845644c8077 100644
--- a/tensorflow/contrib/keras/api/keras/applications/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/applications/__init__.py
@@ -19,11 +19,13 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.keras.api.keras.applications import inception_v3
+from tensorflow.contrib.keras.api.keras.applications import mobilenet
 from tensorflow.contrib.keras.api.keras.applications import resnet50
 from tensorflow.contrib.keras.api.keras.applications import vgg16
 from tensorflow.contrib.keras.api.keras.applications import vgg19
 from tensorflow.contrib.keras.api.keras.applications import xception
 from tensorflow.contrib.keras.api.keras.applications.inception_v3 import InceptionV3
+from tensorflow.contrib.keras.api.keras.applications.mobilenet import MobileNet
 from tensorflow.contrib.keras.api.keras.applications.resnet50 import ResNet50
 from tensorflow.contrib.keras.api.keras.applications.vgg16 import VGG16
 from tensorflow.contrib.keras.api.keras.applications.vgg19 import VGG19
diff --git a/tensorflow/contrib/keras/api/keras/applications/mobilenet/__init__.py b/tensorflow/contrib/keras/api/keras/applications/mobilenet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..594861fb51c7138dec7f4a8d9badf34cc3870594
--- /dev/null
+++ b/tensorflow/contrib/keras/api/keras/applications/mobilenet/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""MobileNet Keras application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python.keras.applications.mobilenet import decode_predictions
+from tensorflow.contrib.keras.python.keras.applications.mobilenet import MobileNet
+from tensorflow.contrib.keras.python.keras.applications.mobilenet import preprocess_input
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/keras/api/keras/callbacks/__init__.py b/tensorflow/contrib/keras/api/keras/callbacks/__init__.py
index 36db34f592d839619112a1945c31fbcdbd2cfaf4..3a970748573004cc43ae0d15d07576d678fee3e3 100644
--- a/tensorflow/contrib/keras/api/keras/callbacks/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/callbacks/__init__.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.keras.python.keras.callbacks import ProgbarLogger
 from tensorflow.contrib.keras.python.keras.callbacks import ReduceLROnPlateau
 from tensorflow.contrib.keras.python.keras.callbacks import RemoteMonitor
 from tensorflow.contrib.keras.python.keras.callbacks import TensorBoard
+from tensorflow.contrib.keras.python.keras.callbacks import TerminateOnNaN
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/api/keras/initializers/__init__.py b/tensorflow/contrib/keras/api/keras/initializers/__init__.py
index f0c1540d9ad32df4ef7e960b8ff914c0517bc89c..9b58723ed5c93d441b2ae8976d5acaba2db3ad40 100644
--- a/tensorflow/contrib/keras/api/keras/initializers/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/initializers/__init__.py
@@ -36,6 +36,7 @@ from tensorflow.contrib.keras.python.keras.initializers import glorot_normal
 from tensorflow.contrib.keras.python.keras.initializers import glorot_uniform
 from tensorflow.contrib.keras.python.keras.initializers import he_normal
 from tensorflow.contrib.keras.python.keras.initializers import he_uniform
+from tensorflow.contrib.keras.python.keras.initializers import lecun_normal
 from tensorflow.contrib.keras.python.keras.initializers import lecun_uniform
 
 # Auxiliary utils.
diff --git a/tensorflow/contrib/keras/api/keras/layers/__init__.py b/tensorflow/contrib/keras/api/keras/layers/__init__.py
index 3c6dce5ee8ff263ffadc967bf2895e2ff887de34..aafd18921754657be4eb06de98dd52c6ca579564 100644
--- a/tensorflow/contrib/keras/api/keras/layers/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/layers/__init__.py
@@ -36,6 +36,7 @@ from tensorflow.contrib.keras.python.keras.layers.convolutional import Conv1D
 from tensorflow.contrib.keras.python.keras.layers.convolutional import Conv2D
 from tensorflow.contrib.keras.python.keras.layers.convolutional import Conv3D
 from tensorflow.contrib.keras.python.keras.layers.convolutional import Conv2DTranspose
+from tensorflow.contrib.keras.python.keras.layers.convolutional import Conv3DTranspose
 from tensorflow.contrib.keras.python.keras.layers.convolutional import SeparableConv2D
 
 # Convolution layer aliases.
@@ -43,6 +44,7 @@ from tensorflow.contrib.keras.python.keras.layers.convolutional import Convoluti
 from tensorflow.contrib.keras.python.keras.layers.convolutional import Convolution2D
 from tensorflow.contrib.keras.python.keras.layers.convolutional import Convolution3D
 from tensorflow.contrib.keras.python.keras.layers.convolutional import Convolution2DTranspose
+from tensorflow.contrib.keras.python.keras.layers.convolutional import Convolution3DTranspose
 from tensorflow.contrib.keras.python.keras.layers.convolutional import SeparableConvolution2D
 
 # Image processing layers.
@@ -96,6 +98,7 @@ from tensorflow.contrib.keras.python.keras.layers.merge import concatenate
 from tensorflow.contrib.keras.python.keras.layers.merge import dot
 
 # Noise layers.
+from tensorflow.contrib.keras.python.keras.layers.noise import AlphaDropout
 from tensorflow.contrib.keras.python.keras.layers.noise import GaussianNoise
 from tensorflow.contrib.keras.python.keras.layers.noise import GaussianDropout
 
diff --git a/tensorflow/contrib/keras/api/keras/losses/__init__.py b/tensorflow/contrib/keras/api/keras/losses/__init__.py
index 2d2fee2698d3676ac465411056c9403f38b7e58a..06dd679f9cadedb93c87ee7b8210fb91d9a867c4 100644
--- a/tensorflow/contrib/keras/api/keras/losses/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/losses/__init__.py
@@ -21,9 +21,11 @@ from __future__ import print_function
 # Loss functions.
 from tensorflow.contrib.keras.python.keras.losses import binary_crossentropy
 from tensorflow.contrib.keras.python.keras.losses import categorical_crossentropy
+from tensorflow.contrib.keras.python.keras.losses import categorical_hinge
 from tensorflow.contrib.keras.python.keras.losses import cosine_proximity
 from tensorflow.contrib.keras.python.keras.losses import hinge
 from tensorflow.contrib.keras.python.keras.losses import kullback_leibler_divergence
+from tensorflow.contrib.keras.python.keras.losses import logcosh
 from tensorflow.contrib.keras.python.keras.losses import mean_absolute_error
 from tensorflow.contrib.keras.python.keras.losses import mean_absolute_percentage_error
 from tensorflow.contrib.keras.python.keras.losses import mean_squared_error
diff --git a/tensorflow/contrib/keras/api/keras/metrics/__init__.py b/tensorflow/contrib/keras/api/keras/metrics/__init__.py
index ba43ffece81dbdd8ee9ee3ac11f61e4f08fc0559..99496edde2daf68e7bc28fa272a34bd295855d86 100644
--- a/tensorflow/contrib/keras/api/keras/metrics/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/metrics/__init__.py
@@ -32,6 +32,7 @@ from tensorflow.contrib.keras.python.keras.metrics import mean_squared_error
 from tensorflow.contrib.keras.python.keras.metrics import mean_squared_logarithmic_error
 from tensorflow.contrib.keras.python.keras.metrics import poisson
 from tensorflow.contrib.keras.python.keras.metrics import sparse_categorical_crossentropy
+from tensorflow.contrib.keras.python.keras.metrics import sparse_top_k_categorical_accuracy
 from tensorflow.contrib.keras.python.keras.metrics import squared_hinge
 from tensorflow.contrib.keras.python.keras.metrics import top_k_categorical_accuracy
 
diff --git a/tensorflow/contrib/keras/api/keras/utils/__init__.py b/tensorflow/contrib/keras/api/keras/utils/__init__.py
index 7f14fa206579f1b4f938c6da5abc67fc83b500c2..d6d70f79d5fae12f624cca17d8496af3340f572f 100644
--- a/tensorflow/contrib/keras/api/keras/utils/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/utils/__init__.py
@@ -18,7 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.keras.python.keras.utils.data_utils import GeneratorEnqueuer
 from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+from tensorflow.contrib.keras.python.keras.utils.data_utils import Sequence
+from tensorflow.contrib.keras.python.keras.utils.data_utils import SequenceEnqueuer
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import custom_object_scope
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import CustomObjectScope
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
diff --git a/tensorflow/contrib/keras/python/keras/__init__.py b/tensorflow/contrib/keras/python/keras/__init__.py
index 1c1485c0cda352991fdfb61780e00ee1c7e1309a..19380bc8c5aaec057b0280822263bde33ed92e15 100644
--- a/tensorflow/contrib/keras/python/keras/__init__.py
+++ b/tensorflow/contrib/keras/python/keras/__init__.py
@@ -37,4 +37,4 @@ from tensorflow.contrib.keras.python.keras import utils
 from tensorflow.contrib.keras.python.keras import wrappers
 from tensorflow.contrib.keras.python.keras.layers import Input
 
-__version__ = '2.0.4-tf'
+__version__ = '2.0.6-tf'
diff --git a/tensorflow/contrib/keras/python/keras/activations.py b/tensorflow/contrib/keras/python/keras/activations.py
index 35d15e74c266a19509674d2e726d39e45e3807ef..7f04234e018676ac036d5f56bd712bc5a21ef6d5 100644
--- a/tensorflow/contrib/keras/python/keras/activations.py
+++ b/tensorflow/contrib/keras/python/keras/activations.py
@@ -54,6 +54,23 @@ def elu(x, alpha=1.0):
   return K.elu(x, alpha)
 
 
+def selu(x):
+  """Scaled Exponential Linear Unit. (Klambauer et al., 2017).
+
+  Arguments:
+      x: A tensor or variable to compute the activation function for.
+
+  Returns:
+    Tensor with the same shape and dtype as `x`.
+
+  References:
+      - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+  """
+  alpha = 1.6732632423543772848170429916717
+  scale = 1.0507009873554804934193349852946
+  return scale * K.elu(x, alpha)
+
+
 def softplus(x):
   return K.softplus(x)
 
diff --git a/tensorflow/contrib/keras/python/keras/activations_test.py b/tensorflow/contrib/keras/python/keras/activations_test.py
index eec4d257f2c84706ccccf7a97b1372db8b193e4b..3d21610e497b7fb525962548e2a86f7c123a5a38 100644
--- a/tensorflow/contrib/keras/python/keras/activations_test.py
+++ b/tensorflow/contrib/keras/python/keras/activations_test.py
@@ -35,7 +35,7 @@ class KerasActivationsTest(test.TestCase):
   def test_serialization(self):
     all_activations = ['softmax', 'relu', 'elu', 'tanh',
                        'sigmoid', 'hard_sigmoid', 'linear',
-                       'softplus', 'softsign']
+                       'softplus', 'softsign', 'selu']
     for name in all_activations:
       fn = keras.activations.get(name)
       ref_fn = getattr(keras.activations, name)
@@ -63,6 +63,22 @@ class KerasActivationsTest(test.TestCase):
     expected = _ref_softmax(test_values[0, 0])
     self.assertAllClose(result[0, 0], expected, rtol=1e-05)
 
+  def test_selu(self):
+    x = keras.backend.placeholder(ndim=2)
+    f = keras.backend.function([x], [keras.activations.selu(x)])
+    alpha = 1.6732632423543772848170429916717
+    scale = 1.0507009873554804934193349852946
+
+    with self.test_session():
+      positive_values = np.array([[1, 2]], dtype=keras.backend.floatx())
+      result = f([positive_values])[0]
+      self.assertAllClose(result, positive_values * scale, rtol=1e-05)
+
+      negative_values = np.array([[-1, -2]], dtype=keras.backend.floatx())
+      result = f([negative_values])[0]
+      true_result = (np.exp(negative_values) - 1) * scale * alpha
+      self.assertAllClose(result, true_result)
+
   def test_softplus(self):
     def softplus(x):
       return np.log(np.ones_like(x) + np.exp(x))
diff --git a/tensorflow/contrib/keras/python/keras/applications/__init__.py b/tensorflow/contrib/keras/python/keras/applications/__init__.py
index c6af9ea9f16f8552fbbed6b8d6db46c9921914f1..9139df30a6e8db86cef752f7739f8bd047dc16a7 100644
--- a/tensorflow/contrib/keras/python/keras/applications/__init__.py
+++ b/tensorflow/contrib/keras/python/keras/applications/__init__.py
@@ -19,8 +19,8 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.keras.python.keras.applications.inception_v3 import InceptionV3
+from tensorflow.contrib.keras.python.keras.applications.mobilenet import MobileNet
 from tensorflow.contrib.keras.python.keras.applications.resnet50 import ResNet50
 from tensorflow.contrib.keras.python.keras.applications.vgg16 import VGG16
 from tensorflow.contrib.keras.python.keras.applications.vgg19 import VGG19
 from tensorflow.contrib.keras.python.keras.applications.xception import Xception
-
diff --git a/tensorflow/contrib/keras/python/keras/applications/imagenet_utils_test.py b/tensorflow/contrib/keras/python/keras/applications/imagenet_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3bcf93a95052a9596bc4b912ee313bbda2a09d4
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/imagenet_utils_test.py
@@ -0,0 +1,139 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Inception V3 application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class ImageNetUtilsTest(test.TestCase):
+
+  def test_preprocess_input(self):
+    x = np.random.uniform(0, 255, (2, 3, 2, 3))
+    self.assertEqual(
+        keras.applications.imagenet_utils.preprocess_input(x).shape, x.shape)
+
+    out1 = keras.applications.imagenet_utils.preprocess_input(
+        x, 'channels_last')
+    out2 = keras.applications.imagenet_utils.preprocess_input(
+        np.transpose(x, (0, 3, 1, 2)), 'channels_first')
+    self.assertAllClose(out1, out2.transpose(0, 2, 3, 1))
+
+  def test_decode_predictions(self):
+    x = np.zeros((2, 1000))
+    x[0, 372] = 1.0
+    x[1, 549] = 1.0
+    outs = keras.applications.imagenet_utils.decode_predictions(x, top=1)
+    scores = [out[0][2] for out in outs]
+    self.assertEqual(scores[0], scores[1])
+
+    # the numbers of columns and ImageNet classes are not identical.
+    with self.assertRaises(ValueError):
+      keras.applications.imagenet_utils.decode_predictions(np.ones((2, 100)))
+
+  def test_obtain_input_shape(self):
+    # input_shape and default_size are not identical.
+    with self.assertRaises(ValueError):
+      keras.applications.imagenet_utils._obtain_input_shape(
+          input_shape=(224, 224, 3),
+          default_size=299,
+          min_size=139,
+          data_format='channels_last',
+          include_top=True)
+
+    # Test invalid use cases
+    for data_format in ['channels_last', 'channels_first']:
+      # input_shape is smaller than min_size.
+      shape = (100, 100)
+      if data_format == 'channels_last':
+        input_shape = shape + (3,)
+      else:
+        input_shape = (3,) + shape
+      with self.assertRaises(ValueError):
+        keras.applications.imagenet_utils._obtain_input_shape(
+            input_shape=input_shape,
+            default_size=None,
+            min_size=139,
+            data_format=data_format,
+            include_top=False)
+
+      # shape is 1D.
+      shape = (100,)
+      if data_format == 'channels_last':
+        input_shape = shape + (3,)
+      else:
+        input_shape = (3,) + shape
+      with self.assertRaises(ValueError):
+        keras.applications.imagenet_utils._obtain_input_shape(
+            input_shape=input_shape,
+            default_size=None,
+            min_size=139,
+            data_format=data_format,
+            include_top=False)
+
+      # the number of channels is 5 not 3.
+      shape = (100, 100)
+      if data_format == 'channels_last':
+        input_shape = shape + (5,)
+      else:
+        input_shape = (5,) + shape
+      with self.assertRaises(ValueError):
+        keras.applications.imagenet_utils._obtain_input_shape(
+            input_shape=input_shape,
+            default_size=None,
+            min_size=139,
+            data_format=data_format,
+            include_top=False)
+
+    assert keras.applications.imagenet_utils._obtain_input_shape(
+        input_shape=None,
+        default_size=None,
+        min_size=139,
+        data_format='channels_last',
+        include_top=False) == (None, None, 3)
+
+    assert keras.applications.imagenet_utils._obtain_input_shape(
+        input_shape=None,
+        default_size=None,
+        min_size=139,
+        data_format='channels_first',
+        include_top=False) == (3, None, None)
+
+    assert keras.applications.imagenet_utils._obtain_input_shape(
+        input_shape=None,
+        default_size=None,
+        min_size=139,
+        data_format='channels_last',
+        include_top=False) == (None, None, 3)
+
+    assert keras.applications.imagenet_utils._obtain_input_shape(
+        input_shape=(150, 150, 3),
+        default_size=None,
+        min_size=139,
+        data_format='channels_last',
+        include_top=False) == (150, 150, 3)
+
+    assert keras.applications.imagenet_utils._obtain_input_shape(
+        input_shape=(3, None, None),
+        default_size=None,
+        min_size=139,
+        data_format='channels_first',
+        include_top=False) == (3, None, None)
diff --git a/tensorflow/contrib/keras/python/keras/applications/mobilenet.py b/tensorflow/contrib/keras/python/keras/applications/mobilenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..37240234d37677e9ad676be192b606c4d4fa3d8b
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/mobilenet.py
@@ -0,0 +1,655 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""MobileNet v1 models for Keras.
+
+MobileNet is a general architecture and can be used for multiple use cases.
+Depending on the use case, it can use different input layer size and
+different width factors. This allows different width models to reduce
+the number of multiply-adds and thereby
+reduce inference cost on mobile devices.
+
+MobileNets support any input size greater than 32 x 32, with larger image sizes
+offering better performance.
+The number of parameters and number of multiply-adds
+can be modified by using the `alpha` parameter,
+which increases/decreases the number of filters in each layer.
+By altering the image size and `alpha` parameter,
+all 16 models from the paper can be built, with ImageNet weights provided.
+
+The paper demonstrates the performance of MobileNets using `alpha` values of
+1.0 (also called 100 % MobileNet), 0.75, 0.5 and 0.25.
+For each of these `alpha` values, weights for 4 different input image sizes
+are provided (224, 192, 160, 128).
+
+The following table describes the size and accuracy of the 100% MobileNet
+on size 224 x 224:
+----------------------------------------------------------------------------
+Width Multiplier (alpha) | ImageNet Acc |  Multiply-Adds (M) |  Params (M)
+----------------------------------------------------------------------------
+|   1.0 MobileNet-224    |    70.6 %     |        529        |     4.2     |
+|   0.75 MobileNet-224   |    68.4 %     |        325        |     2.6     |
+|   0.50 MobileNet-224   |    63.7 %     |        149        |     1.3     |
+|   0.25 MobileNet-224   |    50.6 %     |        41         |     0.5     |
+----------------------------------------------------------------------------
+
+The following table describes the performance of
+the 100 % MobileNet on various input sizes:
+------------------------------------------------------------------------
+      Resolution      | ImageNet Acc | Multiply-Adds (M) | Params (M)
+------------------------------------------------------------------------
+|  1.0 MobileNet-224  |    70.6 %    |        529        |     4.2     |
+|  1.0 MobileNet-192  |    69.1 %    |        529        |     4.2     |
+|  1.0 MobileNet-160  |    67.2 %    |        529        |     4.2     |
+|  1.0 MobileNet-128  |    64.4 %    |        529        |     4.2     |
+------------------------------------------------------------------------
+
+The weights for all 16 models are obtained and translated
+from Tensorflow checkpoints found at
+https://github.com/tensorflow/models/blob/master/slim/nets/mobilenet_v1.md
+
+# Reference
+- [MobileNets: Efficient Convolutional Neural Networks for
+   Mobile Vision Applications](https://arxiv.org/pdf/1704.04861.pdf))
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+
+from tensorflow.contrib.keras.python.keras import backend as K
+from tensorflow.contrib.keras.python.keras import constraints
+from tensorflow.contrib.keras.python.keras import initializers
+from tensorflow.contrib.keras.python.keras import regularizers
+from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.contrib.keras.python.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.contrib.keras.python.keras.engine import InputSpec
+from tensorflow.contrib.keras.python.keras.engine.topology import get_source_inputs
+from tensorflow.contrib.keras.python.keras.layers import Activation
+from tensorflow.contrib.keras.python.keras.layers import BatchNormalization
+from tensorflow.contrib.keras.python.keras.layers import Conv2D
+from tensorflow.contrib.keras.python.keras.layers import Dropout
+from tensorflow.contrib.keras.python.keras.layers import GlobalAveragePooling2D
+from tensorflow.contrib.keras.python.keras.layers import GlobalMaxPooling2D
+from tensorflow.contrib.keras.python.keras.layers import Input
+from tensorflow.contrib.keras.python.keras.layers import Reshape
+from tensorflow.contrib.keras.python.keras.models import Model
+from tensorflow.contrib.keras.python.keras.utils import conv_utils
+from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+
+BASE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.6/'
+
+
+def relu6(x):
+  return K.relu(x, max_value=6)
+
+
+def preprocess_input(x):
+  x /= 255.
+  x -= 0.5
+  x *= 2.
+  return x
+
+
+class DepthwiseConv2D(Conv2D):
+  """Depthwise separable 2D convolution.
+
+  Depthwise Separable convolutions consists in performing
+  just the first step in a depthwise spatial convolution
+  (which acts on each input channel separately).
+  The `depth_multiplier` argument controls how many
+  output channels are generated per input channel in the depthwise step.
+
+  Arguments:
+      kernel_size: An integer or tuple/list of 2 integers, specifying the
+          width and height of the 2D convolution window.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+      strides: An integer or tuple/list of 2 integers,
+          specifying the strides of the convolution along the width and height.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+          Specifying any stride value != 1 is incompatible with specifying
+          any `dilation_rate` value != 1.
+      padding: one of `"valid"` or `"same"` (case-insensitive).
+      depth_multiplier: The number of depthwise convolution output channels
+          for each input channel.
+          The total number of depthwise convolution output
+          channels will be equal to `filters_in * depth_multiplier`.
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, height, width, channels)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, channels, height, width)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+      activation: Activation function to use
+          (see [activations](../activations.md)).
+          If you don't specify anything, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      depthwise_initializer: Initializer for the depthwise kernel matrix
+          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector
+          (see [initializers](../initializers.md)).
+      depthwise_regularizer: Regularizer function applied to
+          the depthwise kernel matrix
+          (see [regularizer](../regularizers.md)).
+      bias_regularizer: Regularizer function applied to the bias vector
+          (see [regularizer](../regularizers.md)).
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation").
+          (see [regularizer](../regularizers.md)).
+      depthwise_constraint: Constraint function applied to
+          the depthwise kernel matrix
+          (see [constraints](../constraints.md)).
+      bias_constraint: Constraint function applied to the bias vector
+          (see [constraints](../constraints.md)).
+
+  Input shape:
+      4D tensor with shape:
+      `[batch, channels, rows, cols]` if data_format='channels_first'
+      or 4D tensor with shape:
+      `[batch, rows, cols, channels]` if data_format='channels_last'.
+
+  Output shape:
+      4D tensor with shape:
+      `[batch, filters, new_rows, new_cols]` if data_format='channels_first'
+      or 4D tensor with shape:
+      `[batch, new_rows, new_cols, filters]` if data_format='channels_last'.
+      `rows` and `cols` values might have changed due to padding.
+  """
+
+  def __init__(self,
+               kernel_size,
+               strides=(1, 1),
+               padding='valid',
+               depth_multiplier=1,
+               data_format=None,
+               activation=None,
+               use_bias=True,
+               depthwise_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               depthwise_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               depthwise_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    super(DepthwiseConv2D, self).__init__(
+        filters=None,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        activation=activation,
+        use_bias=use_bias,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        bias_constraint=bias_constraint,
+        **kwargs)
+    self.depth_multiplier = depth_multiplier
+    self.depthwise_initializer = initializers.get(depthwise_initializer)
+    self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
+    self.depthwise_constraint = constraints.get(depthwise_constraint)
+    self.bias_initializer = initializers.get(bias_initializer)
+
+  def build(self, input_shape):
+    if len(input_shape) < 4:
+      raise ValueError('Inputs to `DepthwiseConv2D` should have rank 4. '
+                       'Received input shape:', str(input_shape))
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = 3
+    if input_shape[channel_axis] is None:
+      raise ValueError('The channel dimension of the inputs to '
+                       '`DepthwiseConv2D` '
+                       'should be defined. Found `None`.')
+    input_dim = int(input_shape[channel_axis])
+    depthwise_kernel_shape = (self.kernel_size[0], self.kernel_size[1],
+                              input_dim, self.depth_multiplier)
+
+    self.depthwise_kernel = self.add_weight(
+        shape=depthwise_kernel_shape,
+        initializer=self.depthwise_initializer,
+        name='depthwise_kernel',
+        regularizer=self.depthwise_regularizer,
+        constraint=self.depthwise_constraint)
+
+    if self.use_bias:
+      self.bias = self.add_weight(
+          shape=(input_dim * self.depth_multiplier,),
+          initializer=self.bias_initializer,
+          name='bias',
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint)
+    else:
+      self.bias = None
+    # Set input spec.
+    self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
+    self.built = True
+
+  def call(self, inputs, training=None):
+    outputs = K.depthwise_conv2d(
+        inputs,
+        self.depthwise_kernel,
+        strides=self.strides,
+        padding=self.padding,
+        dilation_rate=self.dilation_rate,
+        data_format=self.data_format)
+
+    if self.bias:
+      outputs = K.bias_add(outputs, self.bias, data_format=self.data_format)
+
+    if self.activation is not None:
+      return self.activation(outputs)
+
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    if self.data_format == 'channels_first':
+      rows = input_shape[2]
+      cols = input_shape[3]
+      out_filters = input_shape[1] * self.depth_multiplier
+    elif self.data_format == 'channels_last':
+      rows = input_shape[1]
+      cols = input_shape[2]
+      out_filters = input_shape[3] * self.depth_multiplier
+
+    rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
+                                         self.padding, self.strides[0])
+    cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
+                                         self.padding, self.strides[1])
+
+    if self.data_format == 'channels_first':
+      return (input_shape[0], out_filters, rows, cols)
+    elif self.data_format == 'channels_last':
+      return (input_shape[0], rows, cols, out_filters)
+
+  def get_config(self):
+    config = super(DepthwiseConv2D, self).get_config()
+    config.pop('filters')
+    config.pop('kernel_initializer')
+    config.pop('kernel_regularizer')
+    config.pop('kernel_constraint')
+    config['depth_multiplier'] = self.depth_multiplier
+    config['depthwise_initializer'] = initializers.serialize(
+        self.depthwise_initializer)
+    config['depthwise_regularizer'] = regularizers.serialize(
+        self.depthwise_regularizer)
+    config['depthwise_constraint'] = constraints.serialize(
+        self.depthwise_constraint)
+    return config
+
+
+def MobileNet(input_shape=None,  # pylint: disable=invalid-name
+              alpha=1.0,
+              depth_multiplier=1,
+              dropout=1e-3,
+              include_top=True,
+              weights='imagenet',
+              input_tensor=None,
+              pooling=None,
+              classes=1000):
+  """Instantiates the MobileNet architecture.
+
+  Note that only TensorFlow is supported for now,
+  therefore it only works with the data format
+  `image_data_format='channels_last'` in your Keras config
+  at `~/.keras/keras.json`.
+
+  To load a MobileNet model via `load_model`, import the custom
+  objects `relu6` and `DepthwiseConv2D` and pass them to the
+  `custom_objects` parameter.
+  E.g.
+  model = load_model('mobilenet.h5', custom_objects={
+                     'relu6': mobilenet.relu6,
+                     'DepthwiseConv2D': mobilenet.DepthwiseConv2D})
+
+  Arguments:
+      input_shape: optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(224, 224, 3)` (with `channels_last` data format)
+          or (3, 224, 224) (with `channels_first` data format).
+          It should have exactly 3 inputs channels,
+          and width and height should be no smaller than 32.
+          E.g. `(200, 200, 3)` would be one valid value.
+      alpha: controls the width of the network.
+          - If `alpha` < 1.0, proportionally decreases the number
+              of filters in each layer.
+          - If `alpha` > 1.0, proportionally increases the number
+              of filters in each layer.
+          - If `alpha` = 1, default number of filters from the paper
+               are used at each layer.
+      depth_multiplier: depth multiplier for depthwise convolution
+          (also called the resolution multiplier)
+      dropout: dropout rate
+      include_top: whether to include the fully-connected
+          layer at the top of the network.
+      weights: `None` (random initialization) or
+          `imagenet` (ImageNet weights)
+      input_tensor: optional Keras tensor (i.e. output of
+          `layers.Input()`)
+          to use as image input for the model.
+      pooling: Optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model
+              will be the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a
+              2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: in case of invalid argument for `weights`,
+          or invalid input shape.
+      RuntimeError: If attempting to run this model with a
+          backend that does not support separable convolutions.
+  """
+
+  if K.backend() != 'tensorflow':
+    raise RuntimeError('Only TensorFlow backend is currently supported, '
+                       'as other backends do not support '
+                       'depthwise convolution.')
+
+  if weights not in {'imagenet', None}:
+    raise ValueError('The `weights` argument should be either '
+                     '`None` (random initialization) or `imagenet` '
+                     '(pre-training on ImageNet).')
+
+  if weights == 'imagenet' and include_top and classes != 1000:
+    raise ValueError('If using `weights` as ImageNet with `include_top` '
+                     'as true, `classes` should be 1000')
+
+  # Determine proper input shape.
+  input_shape = _obtain_input_shape(
+      input_shape,
+      default_size=224,
+      min_size=32,
+      data_format=K.image_data_format(),
+      include_top=include_top or weights)
+  if K.image_data_format() == 'channels_last':
+    row_axis, col_axis = (0, 1)
+  else:
+    row_axis, col_axis = (1, 2)
+  rows = input_shape[row_axis]
+  cols = input_shape[col_axis]
+
+  if weights == 'imagenet':
+    if depth_multiplier != 1:
+      raise ValueError('If imagenet weights are being loaded, '
+                       'depth multiplier must be 1')
+
+    if alpha not in [0.25, 0.50, 0.75, 1.0]:
+      raise ValueError('If imagenet weights are being loaded, '
+                       'alpha can be one of'
+                       '`0.25`, `0.50`, `0.75` or `1.0` only.')
+
+    if rows != cols or rows not in [128, 160, 192, 224]:
+      raise ValueError('If imagenet weights are being loaded, '
+                       'input must have a static square shape (one of '
+                       '(128,128), (160,160), (192,192), or (224, 224)).'
+                       ' Input shape provided = %s' % (input_shape,))
+
+  if K.image_data_format() != 'channels_last':
+    warnings.warn('The MobileNet family of models is only available '
+                  'for the input data format "channels_last" '
+                  '(width, height, channels). '
+                  'However your settings specify the default '
+                  'data format "channels_first" (channels, width, height).'
+                  ' You should set `image_data_format="channels_last"` '
+                  'in your Keras config located at ~/.keras/keras.json. '
+                  'The model being returned right now will expect inputs '
+                  'to follow the "channels_last" data format.')
+    K.set_image_data_format('channels_last')
+    old_data_format = 'channels_first'
+  else:
+    old_data_format = None
+
+  if input_tensor is None:
+    img_input = Input(shape=input_shape)
+  else:
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
+
+  x = _conv_block(img_input, 32, alpha, strides=(2, 2))
+  x = _depthwise_conv_block(x, 64, alpha, depth_multiplier, block_id=1)
+
+  x = _depthwise_conv_block(
+      x, 128, alpha, depth_multiplier, strides=(2, 2), block_id=2)
+  x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, block_id=3)
+
+  x = _depthwise_conv_block(
+      x, 256, alpha, depth_multiplier, strides=(2, 2), block_id=4)
+  x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, block_id=5)
+
+  x = _depthwise_conv_block(
+      x, 512, alpha, depth_multiplier, strides=(2, 2), block_id=6)
+  x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=7)
+  x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=8)
+  x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=9)
+  x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=10)
+  x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=11)
+
+  x = _depthwise_conv_block(
+      x, 1024, alpha, depth_multiplier, strides=(2, 2), block_id=12)
+  x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, block_id=13)
+
+  if include_top:
+    if K.image_data_format() == 'channels_first':
+      shape = (int(1024 * alpha), 1, 1)
+    else:
+      shape = (1, 1, int(1024 * alpha))
+
+    x = GlobalAveragePooling2D()(x)
+    x = Reshape(shape, name='reshape_1')(x)
+    x = Dropout(dropout, name='dropout')(x)
+    x = Conv2D(classes, (1, 1), padding='same', name='conv_preds')(x)
+    x = Activation('softmax', name='act_softmax')(x)
+    x = Reshape((classes,), name='reshape_2')(x)
+  else:
+    if pooling == 'avg':
+      x = GlobalAveragePooling2D()(x)
+    elif pooling == 'max':
+      x = GlobalMaxPooling2D()(x)
+
+  # Ensure that the model takes into account
+  # any potential predecessors of `input_tensor`.
+  if input_tensor is not None:
+    inputs = get_source_inputs(input_tensor)
+  else:
+    inputs = img_input
+
+  # Create model.
+  model = Model(inputs, x, name='mobilenet_%0.2f_%s' % (alpha, rows))
+
+  # load weights
+  if weights == 'imagenet':
+    if K.image_data_format() == 'channels_first':
+      raise ValueError('Weights for "channels_last" format '
+                       'are not available.')
+    if alpha == 1.0:
+      alpha_text = '1_0'
+    elif alpha == 0.75:
+      alpha_text = '7_5'
+    elif alpha == 0.50:
+      alpha_text = '5_0'
+    else:
+      alpha_text = '2_5'
+
+    if include_top:
+      model_name = 'mobilenet_%s_%d_tf.h5' % (alpha_text, rows)
+      weigh_path = BASE_WEIGHT_PATH + model_name
+      weights_path = get_file(model_name, weigh_path, cache_subdir='models')
+    else:
+      model_name = 'mobilenet_%s_%d_tf_no_top.h5' % (alpha_text, rows)
+      weigh_path = BASE_WEIGHT_PATH + model_name
+      weights_path = get_file(model_name, weigh_path, cache_subdir='models')
+    model.load_weights(weights_path)
+
+  if old_data_format:
+    K.set_image_data_format(old_data_format)
+  return model
+
+
+def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1)):
+  """Adds an initial convolution layer (with batch normalization and relu6).
+
+  Arguments:
+      inputs: Input tensor of shape `(rows, cols, 3)`
+          (with `channels_last` data format) or
+          (3, rows, cols) (with `channels_first` data format).
+          It should have exactly 3 inputs channels,
+          and width and height should be no smaller than 32.
+          E.g. `(224, 224, 3)` would be one valid value.
+      filters: Integer, the dimensionality of the output space
+          (i.e. the number output of filters in the convolution).
+      alpha: controls the width of the network.
+          - If `alpha` < 1.0, proportionally decreases the number
+              of filters in each layer.
+          - If `alpha` > 1.0, proportionally increases the number
+              of filters in each layer.
+          - If `alpha` = 1, default number of filters from the paper
+               are used at each layer.
+      kernel: An integer or tuple/list of 2 integers, specifying the
+          width and height of the 2D convolution window.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+      strides: An integer or tuple/list of 2 integers,
+          specifying the strides of the convolution along the width and height.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+          Specifying any stride value != 1 is incompatible with specifying
+          any `dilation_rate` value != 1.
+
+  Input shape:
+      4D tensor with shape:
+      `(samples, channels, rows, cols)` if data_format='channels_first'
+      or 4D tensor with shape:
+      `(samples, rows, cols, channels)` if data_format='channels_last'.
+
+  Output shape:
+      4D tensor with shape:
+      `(samples, filters, new_rows, new_cols)` if data_format='channels_first'
+      or 4D tensor with shape:
+      `(samples, new_rows, new_cols, filters)` if data_format='channels_last'.
+      `rows` and `cols` values might have changed due to stride.
+
+  Returns:
+      Output tensor of block.
+  """
+  channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
+  filters = int(filters * alpha)
+  x = Conv2D(
+      filters,
+      kernel,
+      padding='same',
+      use_bias=False,
+      strides=strides,
+      name='conv1')(inputs)
+  x = BatchNormalization(axis=channel_axis, name='conv1_bn')(x)
+  return Activation(relu6, name='conv1_relu')(x)
+
+
+def _depthwise_conv_block(inputs,
+                          pointwise_conv_filters,
+                          alpha,
+                          depth_multiplier=1,
+                          strides=(1, 1),
+                          block_id=1):
+  """Adds a depthwise convolution block.
+
+  A depthwise convolution block consists of a depthwise conv,
+  batch normalization, relu6, pointwise convolution,
+  batch normalization and relu6 activation.
+
+  Arguments:
+      inputs: Input tensor of shape `(rows, cols, channels)`
+          (with `channels_last` data format) or
+          (channels, rows, cols) (with `channels_first` data format).
+      pointwise_conv_filters: Integer, the dimensionality of the output space
+          (i.e. the number output of filters in the pointwise convolution).
+      alpha: controls the width of the network.
+          - If `alpha` < 1.0, proportionally decreases the number
+              of filters in each layer.
+          - If `alpha` > 1.0, proportionally increases the number
+              of filters in each layer.
+          - If `alpha` = 1, default number of filters from the paper
+               are used at each layer.
+      depth_multiplier: The number of depthwise convolution output channels
+          for each input channel.
+          The total number of depthwise convolution output
+          channels will be equal to `filters_in * depth_multiplier`.
+      strides: An integer or tuple/list of 2 integers,
+          specifying the strides of the convolution along the width and height.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+          Specifying any stride value != 1 is incompatible with specifying
+          any `dilation_rate` value != 1.
+      block_id: Integer, a unique identification designating the block number.
+
+  Input shape:
+      4D tensor with shape:
+      `(batch, channels, rows, cols)` if data_format='channels_first'
+      or 4D tensor with shape:
+      `(batch, rows, cols, channels)` if data_format='channels_last'.
+
+  Output shape:
+      4D tensor with shape:
+      `(batch, filters, new_rows, new_cols)` if data_format='channels_first'
+      or 4D tensor with shape:
+      `(batch, new_rows, new_cols, filters)` if data_format='channels_last'.
+      `rows` and `cols` values might have changed due to stride.
+
+  Returns:
+      Output tensor of block.
+  """
+  channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
+  pointwise_conv_filters = int(pointwise_conv_filters * alpha)
+
+  x = DepthwiseConv2D(  # pylint: disable=not-callable
+      (3, 3),
+      padding='same',
+      depth_multiplier=depth_multiplier,
+      strides=strides,
+      use_bias=False,
+      name='conv_dw_%d' % block_id)(inputs)
+  x = BatchNormalization(axis=channel_axis, name='conv_dw_%d_bn' % block_id)(x)
+  x = Activation(relu6, name='conv_dw_%d_relu' % block_id)(x)
+
+  x = Conv2D(
+      pointwise_conv_filters, (1, 1),
+      padding='same',
+      use_bias=False,
+      strides=(1, 1),
+      name='conv_pw_%d' % block_id)(x)
+  x = BatchNormalization(axis=channel_axis, name='conv_pw_%d_bn' % block_id)(x)
+  return Activation(relu6, name='conv_pw_%d_relu' % block_id)(x)
diff --git a/tensorflow/contrib/keras/python/keras/applications/mobilenet_test.py b/tensorflow/contrib/keras/python/keras/applications/mobilenet_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6aa786f9b1d6f90e8fc11bb1b56532a3162fe0ef
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/applications/mobilenet_test.py
@@ -0,0 +1,42 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for MobileNet application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class MobileNetTest(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.MobileNet(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.MobileNet(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 1024))
+
+  def test_with_pooling(self):
+    model = keras.applications.MobileNet(weights=None,
+                                         include_top=False,
+                                         pooling='avg')
+    self.assertEqual(model.output_shape, (None, 1024))
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/backend.py b/tensorflow/contrib/keras/python/keras/backend.py
index 7a0056030841b4dfade1279fa9f8a3c0f1e0df9d..4fa4ec0dd49d79c2f73636b706157eab802738e3 100644
--- a/tensorflow/contrib/keras/python/keras/backend.py
+++ b/tensorflow/contrib/keras/python/keras/backend.py
@@ -21,6 +21,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import json
 import os
 
@@ -263,15 +264,19 @@ def get_uid(prefix=''):
   ```
   """
   graph = ops.get_default_graph()
+  if graph not in tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS:
+    tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS[graph] = collections.defaultdict(
+      int)
   layer_name_uids = tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS[graph]
   layer_name_uids[prefix] += 1
   return layer_name_uids[prefix]
 
 
 def reset_uids():
-  layer_name_uids_collection = ops.get_collection_ref('LAYER_NAME_UIDS')
-  if layer_name_uids_collection:
-    layer_name_uids_collection.pop()
+  per_graph_layer_name_uids = tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS
+  keys = list(per_graph_layer_name_uids.keys())
+  for key in keys:
+    del per_graph_layer_name_uids[key]
 
 
 def clear_session():
@@ -707,7 +712,7 @@ def dtype(x):
       'float32_ref'
   ```
   """
-  return x.dtype.name
+  return x.dtype.base_dtype.name
 
 
 def eval(x):
@@ -1308,7 +1313,7 @@ def max(x, axis=None, keepdims=False):
       A tensor with maximum values of `x`.
   """
   axis = _normalize_axis(axis, ndim(x))
-  return math_ops.reduce_max(x, reduction_indices=axis, keep_dims=keepdims)
+  return math_ops.reduce_max(x, axis=axis, keep_dims=keepdims)
 
 
 def min(x, axis=None, keepdims=False):
@@ -1326,7 +1331,7 @@ def min(x, axis=None, keepdims=False):
       A tensor with miminum values of `x`.
   """
   axis = _normalize_axis(axis, ndim(x))
-  return math_ops.reduce_min(x, reduction_indices=axis, keep_dims=keepdims)
+  return math_ops.reduce_min(x, axis=axis, keep_dims=keepdims)
 
 
 def sum(x, axis=None, keepdims=False):
@@ -1344,7 +1349,7 @@ def sum(x, axis=None, keepdims=False):
       A tensor with sum of `x`.
   """
   axis = _normalize_axis(axis, ndim(x))
-  return math_ops.reduce_sum(x, reduction_indices=axis, keep_dims=keepdims)
+  return math_ops.reduce_sum(x, axis=axis, keep_dims=keepdims)
 
 
 def prod(x, axis=None, keepdims=False):
@@ -1362,7 +1367,7 @@ def prod(x, axis=None, keepdims=False):
       A tensor with the product of elements of `x`.
   """
   axis = _normalize_axis(axis, ndim(x))
-  return math_ops.reduce_prod(x, reduction_indices=axis, keep_dims=keepdims)
+  return math_ops.reduce_prod(x, axis=axis, keep_dims=keepdims)
 
 
 def cumsum(x, axis=0):
@@ -1410,10 +1415,10 @@ def var(x, axis=None, keepdims=False):
   axis = _normalize_axis(axis, ndim(x))
   if x.dtype.base_dtype == dtypes_module.bool:
     x = math_ops.cast(x, floatx())
-  m = math_ops.reduce_mean(x, reduction_indices=axis, keep_dims=True)
+  m = math_ops.reduce_mean(x, axis=axis, keep_dims=True)
   devs_squared = math_ops.square(x - m)
   return math_ops.reduce_mean(
-      devs_squared, reduction_indices=axis, keep_dims=keepdims)
+      devs_squared, axis=axis, keep_dims=keepdims)
 
 
 def std(x, axis=None, keepdims=False):
@@ -1450,7 +1455,7 @@ def mean(x, axis=None, keepdims=False):
   axis = _normalize_axis(axis, ndim(x))
   if x.dtype.base_dtype == dtypes_module.bool:
     x = math_ops.cast(x, floatx())
-  return math_ops.reduce_mean(x, reduction_indices=axis, keep_dims=keepdims)
+  return math_ops.reduce_mean(x, axis=axis, keep_dims=keepdims)
 
 
 def any(x, axis=None, keepdims=False):
@@ -1466,7 +1471,7 @@ def any(x, axis=None, keepdims=False):
   """
   axis = _normalize_axis(axis, ndim(x))
   x = math_ops.cast(x, dtypes_module.bool)
-  return math_ops.reduce_any(x, reduction_indices=axis, keep_dims=keepdims)
+  return math_ops.reduce_any(x, axis=axis, keep_dims=keepdims)
 
 
 def all(x, axis=None, keepdims=False):
@@ -1482,7 +1487,7 @@ def all(x, axis=None, keepdims=False):
   """
   axis = _normalize_axis(axis, ndim(x))
   x = math_ops.cast(x, dtypes_module.bool)
-  return math_ops.reduce_all(x, reduction_indices=axis, keep_dims=keepdims)
+  return math_ops.reduce_all(x, axis=axis, keep_dims=keepdims)
 
 
 def argmax(x, axis=-1):
@@ -2888,13 +2893,13 @@ def categorical_crossentropy(output, target, from_logits=False):
   if not from_logits:
     # scale preds so that the class probas of each sample sum to 1
     output /= math_ops.reduce_sum(
-        output, reduction_indices=len(output.get_shape()) - 1, keep_dims=True)
+        output, axis=len(output.get_shape()) - 1, keep_dims=True)
     # manual computation of crossentropy
     epsilon = _to_tensor(_EPSILON, output.dtype.base_dtype)
     output = clip_ops.clip_by_value(output, epsilon, 1. - epsilon)
     return -math_ops.reduce_sum(
         target * math_ops.log(output),
-        reduction_indices=len(output.get_shape()) - 1)
+        axis=len(output.get_shape()) - 1)
   else:
     return nn.softmax_cross_entropy_with_logits(labels=target, logits=output)
 
@@ -2913,7 +2918,7 @@ def sparse_categorical_crossentropy(output, target, from_logits=False):
   Returns:
       Output tensor.
   """
-  # Note: nn.softmax_cross_entropy_with_logits
+  # Note: nn.sparse_softmax_cross_entropy_with_logits
   # expects logits, Keras expects probabilities.
   if not from_logits:
     epsilon = _to_tensor(_EPSILON, output.dtype.base_dtype)
@@ -3017,7 +3022,7 @@ def dropout(x, level, noise_shape=None, seed=None):
   if seed is None:
     seed = np.random.randint(10e6)
   # the dummy 1. works around a TF bug
-  # (float32_ref vs. float32 incomptability)
+  # (float32_ref vs. float32 incompatibility)
   return nn.dropout(x * 1., retain_prob, noise_shape, seed=seed)
 
 
@@ -3379,6 +3384,42 @@ def separable_conv2d(x,
   return _postprocess_conv2d_output(x, data_format)
 
 
+def depthwise_conv2d(x, depthwise_kernel, strides=(1, 1), padding='valid',
+                     data_format=None, dilation_rate=(1, 1)):
+  """2D convolution with separable filters.
+
+  Arguments:
+    x: input tensor
+    depthwise_kernel: convolution kernel for the depthwise convolution.
+    strides: strides tuple (length 2).
+    padding: string, `"same"` or `"valid"`.
+    data_format: string, `"channels_last"` or `"channels_first"`.
+    dilation_rate: tuple of integers,
+        dilation rates for the separable convolution.
+
+  Returns:
+    Output tensor.
+
+  Raises:
+    ValueError: if `data_format` is neither `channels_last`
+      or `channels_first`.
+  """
+  if data_format is None:
+    data_format = image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+
+  x = _preprocess_conv2d_input(x, data_format)
+  padding = _preprocess_padding(padding)
+  strides = (1,) + strides + (1,)
+
+  x = nn.depthwise_conv2d(x, depthwise_kernel,
+                          strides=strides,
+                          padding=padding,
+                          rate=dilation_rate)
+  return _postprocess_conv2d_output(x, data_format)
+
+
 def conv3d(x,
            kernel,
            strides=(1, 1, 1),
@@ -3514,41 +3555,177 @@ def pool3d(x,
   return _postprocess_conv3d_output(x, data_format)
 
 
+def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None):
+  """Apply 1D conv with un-shared weights.
+
+  Arguments:
+      inputs: 3D tensor with shape: (batch_size, steps, input_dim)
+      kernel: the unshared weight for convolution,
+              with shape (output_length, feature_dim, filters)
+      kernel_size: a tuple of a single integer,
+                   specifying the length of the 1D convolution window
+      strides: a tuple of a single integer,
+               specifying the stride length of the convolution
+      data_format: the data format, channels_first or channels_last
+
+  Returns:
+      the tensor after 1d conv with un-shared weights, with shape (batch_size,
+      output_lenght, filters)
+
+  Raises:
+      ValueError: if `data_format` is neither `channels_last` or
+      `channels_first`.
+  """
+  if data_format is None:
+    data_format = image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+
+  stride = strides[0]
+  kernel_shape = int_shape(kernel)
+  output_length = kernel_shape[0]
+  feature_dim = kernel_shape[1]
+
+  xs = []
+  for i in range(output_length):
+    slice_length = slice(i * stride, i * stride + kernel_size[0])
+    xs.append(reshape(inputs[:, slice_length, :], (1, -1, feature_dim)))
+  x_aggregate = concatenate(xs, axis=0)
+  # Shape: `(output_length, batch_size, filters)`.
+  output = batch_dot(x_aggregate, kernel)
+  return permute_dimensions(output, (1, 0, 2))
+
+
+def local_conv2d(inputs,
+                 kernel,
+                 kernel_size,
+                 strides,
+                 output_shape,
+                 data_format=None):
+  """Apply 2D conv with un-shared weights.
+
+  Arguments:
+      inputs: 4D tensor with shape:
+              (batch_size, filters, new_rows, new_cols)
+              if data_format='channels_first'
+              or 4D tensor with shape:
+              (batch_size, new_rows, new_cols, filters)
+              if data_format='channels_last'.
+      kernel: the unshared weight for convolution,
+              with shape (output_items, feature_dim, filters)
+      kernel_size: a tuple of 2 integers, specifying the
+                   width and height of the 2D convolution window.
+      strides: a tuple of 2 integers, specifying the strides
+               of the convolution along the width and height.
+      output_shape: a tuple with (output_row, output_col)
+      data_format: the data format, channels_first or channels_last
+
+  Returns:
+      A 4d tensor with shape:
+      (batch_size, filters, new_rows, new_cols)
+      if data_format='channels_first'
+      or 4D tensor with shape:
+      (batch_size, new_rows, new_cols, filters)
+      if data_format='channels_last'.
+
+  Raises:
+      ValueError: if `data_format` is neither
+                  `channels_last` or `channels_first`.
+  """
+  if data_format is None:
+    data_format = image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+
+  stride_row, stride_col = strides
+  output_row, output_col = output_shape
+  kernel_shape = int_shape(kernel)
+  feature_dim = kernel_shape[1]
+  filters = kernel_shape[2]
+
+  xs = []
+  for i in range(output_row):
+    for j in range(output_col):
+      slice_row = slice(i * stride_row, i * stride_row + kernel_size[0])
+      slice_col = slice(j * stride_col, j * stride_col + kernel_size[1])
+      if data_format == 'channels_first':
+        xs.append(
+            reshape(inputs[:, :, slice_row, slice_col], (1, -1, feature_dim)))
+      else:
+        xs.append(
+            reshape(inputs[:, slice_row, slice_col, :], (1, -1, feature_dim)))
+
+  x_aggregate = concatenate(xs, axis=0)
+  output = batch_dot(x_aggregate, kernel)
+  output = reshape(output, (output_row, output_col, -1, filters))
+
+  if data_format == 'channels_first':
+    output = permute_dimensions(output, (2, 3, 0, 1))
+  else:
+    output = permute_dimensions(output, (2, 0, 1, 3))
+  return output
+
+
 def bias_add(x, bias, data_format=None):
   """Adds a bias vector to a tensor.
 
   Arguments:
       x: Tensor or variable.
       bias: Bias tensor to add.
-      data_format: Data format for 3D, 4D or 5D tensors:
-          one of "channels_first", "channels_last".
+      data_format: string, `"channels_last"` or `"channels_first"`.
 
   Returns:
       Output tensor.
 
   Raises:
-      ValueError: In case of invalid `data_format` argument.
+      ValueError: In one of the two cases below:
+                  1. invalid `data_format` argument.
+                  2. invalid bias shape.
+                     the bias should be either a vector or
+                     a tensor with ndim(x) - 1 dimension
   """
   if data_format is None:
     data_format = image_data_format()
   if data_format not in {'channels_first', 'channels_last'}:
     raise ValueError('Unknown data_format ' + str(data_format))
+  bias_shape = int_shape(bias)
+  if len(bias_shape) != 1 and len(bias_shape) != ndim(x) - 1:
+    raise ValueError(
+        'Unexpected bias dimensions %d, expect to be 1 or %d dimensions' %
+        (len(bias_shape), ndim(x)))
   if ndim(x) == 5:
     if data_format == 'channels_first':
-      x += reshape(bias, (1, int_shape(bias)[0], 1, 1, 1))
+      if len(bias_shape) == 1:
+        x += reshape(bias, (1, bias_shape[0], 1, 1, 1))
+      else:
+        x += reshape(bias, (1, bias_shape[3]) + bias_shape[:3])
     elif data_format == 'channels_last':
-      x += reshape(bias, (1, 1, 1, 1, int_shape(bias)[0]))
+      if len(bias_shape) == 1:
+        x += reshape(bias, (1, 1, 1, bias_shape[0]))
+      else:
+        x += reshape(bias, (1,) + bias_shape)
   elif ndim(x) == 4:
     if data_format == 'channels_first':
-      # No support yet for NCHW in bias_add.
-      x += reshape(bias, (1, int_shape(bias)[0], 1, 1))
+      if len(bias_shape) == 1:
+        x += reshape(bias, (1, bias_shape[0], 1, 1))
+      else:
+        x += reshape(bias, (1, bias_shape[2]) + bias_shape[:2])
     elif data_format == 'channels_last':
-      x = nn.bias_add(x, bias, data_format='NHWC')
+      if len(bias_shape) == 1:
+        x = nn.bias_add(x, bias, data_format='NHWC')
+      else:
+        x += reshape(bias, (1,) + bias_shape)
   elif ndim(x) == 3:
     if data_format == 'channels_first':
-      x += reshape(bias, (1, int_shape(bias)[0], 1))
+      if len(bias_shape) == 1:
+        x += reshape(bias, (1, bias_shape[0], 1))
+      else:
+        x += reshape(bias, (1, bias_shape[1], bias_shape[0]))
     elif data_format == 'channels_last':
-      x += reshape(bias, (1, 1, int_shape(bias)[0]))
+      if len(bias_shape) == 1:
+        x += reshape(bias, (1, 1, bias_shape[0]))
+      else:
+        x += reshape(bias, (1,) + bias_shape)
   else:
     x = nn.bias_add(x, bias)
   return x
diff --git a/tensorflow/contrib/keras/python/keras/backend_test.py b/tensorflow/contrib/keras/python/keras/backend_test.py
index 2da5aee58e5633fa0461a08d352d696f710d9620..a2bc95e4a109c050728435b61fdf8012b183fae1 100644
--- a/tensorflow/contrib/keras/python/keras/backend_test.py
+++ b/tensorflow/contrib/keras/python/keras/backend_test.py
@@ -105,10 +105,13 @@ class BackendUtilsTest(test.TestCase):
     self.assertEqual(keras.backend.image_data_format(), image_data_format)
     keras.backend.set_image_data_format('channels_last')
 
-  def test_get_uid(self):
+  def test_get_reset_uids(self):
     self.assertEqual(keras.backend.get_uid('foo'), 1)
     self.assertEqual(keras.backend.get_uid('foo'), 2)
 
+    keras.backend.reset_uids()
+    self.assertEqual(keras.backend.get_uid('foo'), 1)
+
 
 class BackendVariableTest(test.TestCase):
 
diff --git a/tensorflow/contrib/keras/python/keras/callbacks.py b/tensorflow/contrib/keras/python/keras/callbacks.py
index d0587a549b2104726d65e74dfdf01e5c526d0fc7..6df6662081ccb6628b0d9684f4463e08944a38a5 100644
--- a/tensorflow/contrib/keras/python/keras/callbacks.py
+++ b/tensorflow/contrib/keras/python/keras/callbacks.py
@@ -513,7 +513,9 @@ class EarlyStopping(Callback):
   def on_epoch_end(self, epoch, logs=None):
     current = logs.get(self.monitor)
     if current is None:
-      logging.warning('Early stopping requires %s available!' % (self.monitor))
+      logging.warning('Early stopping conditioned on metric `%s` '
+                      'which is not available. Available metrics are: %s' %
+                      (self.monitor, ','.join(list(logs.keys()))))
 
     if self.monitor_op(current - self.min_delta, self.best):
       self.best = current
@@ -680,10 +682,11 @@ class TensorBoard(Callback):
     if self.histogram_freq and self.merged is None:
       for layer in self.model.layers:
         for weight in layer.weights:
-          tf_summary.histogram(weight.name, weight)
+          mapped_weight_name = weight.name.replace(':', '_')
+          tf_summary.histogram(mapped_weight_name, weight)
           if self.write_grads:
             grads = model.optimizer.get_gradients(model.total_loss, weight)
-            tf_summary.histogram('{}_grad'.format(weight.name), grads)
+            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
           if self.write_images:
             w_img = array_ops.squeeze(weight)
             shape = K.int_shape(w_img)
@@ -708,7 +711,7 @@ class TensorBoard(Callback):
 
             shape = K.int_shape(w_img)
             assert len(shape) == 4 and shape[-1] in [1, 3, 4]
-            tf_summary.image(weight.name, w_img)
+            tf_summary.image(mapped_weight_name, w_img)
 
         if hasattr(layer, 'output'):
           tf_summary.histogram('{}_out'.format(layer.name), layer.output)
@@ -896,8 +899,9 @@ class ReduceLROnPlateau(Callback):
     logs['lr'] = K.get_value(self.model.optimizer.lr)
     current = logs.get(self.monitor)
     if current is None:
-      logging.warning('Learning Rate Plateau Reducing requires %s available!' %
-                      self.monitor)
+      logging.warning('Reduce LR on plateau conditioned on metric `%s` '
+                      'which is not available. Available metrics are: %s' %
+                      (self.monitor, ','.join(list(logs.keys()))))
     else:
       if self.in_cooldown():
         self.cooldown_counter -= 1
@@ -998,7 +1002,7 @@ class CSVLogger(Callback):
 
 
 class LambdaCallback(Callback):
-  """Callback for creating simple, custom callbacks on-the-fly.
+  r"""Callback for creating simple, custom callbacks on-the-fly.
 
   This callback is constructed with anonymous functions that will be called
   at the appropriate time. Note that the callbacks expects positional
@@ -1020,17 +1024,21 @@ class LambdaCallback(Callback):
       on_train_end: called at the end of model training.
 
   Example:
+
       ```python
       # Print the batch number at the beginning of every batch.
       batch_print_callback = LambdaCallback(
           on_batch_begin=lambda batch,logs: print(batch))
 
-      # Plot the loss after every epoch.
-      import numpy as np
-      import matplotlib.pyplot as plt
-      plot_loss_callback = LambdaCallback(
-          on_epoch_end=lambda epoch, logs: plt.plot(np.arange(epoch),
-                                                    logs['loss']))
+      # Stream the epoch loss to a file in JSON format. The file content
+      # is not well-formed JSON but rather has a JSON object per line.
+      import json
+      json_log = open('loss_log.json', mode='wt', buffering=1)
+      json_logging_callback = LambdaCallback(
+          on_epoch_end=lambda epoch, logs: json_log.write(
+              json.dumps({'epoch': epoch, 'loss': logs['loss']}) + '\n'),
+          on_train_end=lambda logs: json_log.close()
+      )
 
       # Terminate some processes after having finished model training.
       processes = ...
@@ -1040,7 +1048,7 @@ class LambdaCallback(Callback):
 
       model.fit(...,
                 callbacks=[batch_print_callback,
-                           plot_loss_callback,
+                           json_logging_callback,
                            cleanup_callback])
       ```
   """
diff --git a/tensorflow/contrib/keras/python/keras/constraints.py b/tensorflow/contrib/keras/python/keras/constraints.py
index 91d6153862388b45ff5bd370c8759567dc705430..0a59dd92c114f1dc431c0c644e5788e064fa9131 100644
--- a/tensorflow/contrib/keras/python/keras/constraints.py
+++ b/tensorflow/contrib/keras/python/keras/constraints.py
@@ -47,7 +47,7 @@ class MaxNorm(Constraint):
           has shape `(input_dim, output_dim)`,
           set `axis` to `0` to constrain each weight vector
           of length `(input_dim,)`.
-          In a `Convolution2D` layer with `data_format="channels_last"`,
+          In a `Conv2D` layer with `data_format="channels_last"`,
           the weight tensor has shape
           `(rows, cols, input_depth, output_depth)`,
           set `axis` to `[0, 1, 2]`
@@ -92,7 +92,7 @@ class UnitNorm(Constraint):
           has shape `(input_dim, output_dim)`,
           set `axis` to `0` to constrain each weight vector
           of length `(input_dim,)`.
-          In a `Convolution2D` layer with `data_format="channels_last"`,
+          In a `Conv2D` layer with `data_format="channels_last"`,
           the weight tensor has shape
           `(rows, cols, input_depth, output_depth)`,
           set `axis` to `[0, 1, 2]`
@@ -132,7 +132,7 @@ class MinMaxNorm(Constraint):
           has shape `(input_dim, output_dim)`,
           set `axis` to `0` to constrain each weight vector
           of length `(input_dim,)`.
-          In a `Convolution2D` layer with `dim_ordering="tf"`,
+          In a `Conv2D` layer with `dim_ordering="channels_last"`,
           the weight tensor has shape
           `(rows, cols, input_depth, output_depth)`,
           set `axis` to `[0, 1, 2]`
diff --git a/tensorflow/contrib/keras/python/keras/engine/topology.py b/tensorflow/contrib/keras/python/keras/engine/topology.py
index 07d708ada3ca3c18971007c372dd510ecbf96c3f..c8c746e8affd306ee1d38c4862d5c42fb6a61e5c 100644
--- a/tensorflow/contrib/keras/python/keras/engine/topology.py
+++ b/tensorflow/contrib/keras/python/keras/engine/topology.py
@@ -29,13 +29,13 @@ from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.keras.python.keras import backend as K
 from tensorflow.contrib.keras.python.keras.utils import conv_utils
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import has_arg
 from tensorflow.contrib.keras.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
 from tensorflow.contrib.keras.python.keras.utils.layer_utils import print_summary as print_layer_summary
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import base as tf_base_layers
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import tf_inspect
 
 
 # pylint: disable=g-import-not-at-top
@@ -386,7 +386,7 @@ class Layer(tf_base_layers.Layer):
     user_kwargs = copy.copy(kwargs)
     if not _is_all_none(previous_mask):
       # The previous layer generated a mask.
-      if 'mask' in tf_inspect.getargspec(self.call).args:
+      if has_arg(self.call, 'mask'):
         if 'mask' not in kwargs:
           # If mask is explicitly passed to __call__,
           # we should override the default mask.
@@ -1176,6 +1176,7 @@ class Container(Layer):
     # The following properties are not actually used by Keras;
     # they exist for compatibility with TF.
     self._updates = []
+    self._losses = []
     self._scope = None
     self._reuse = None
     self._base_name = name
@@ -1915,7 +1916,7 @@ class Container(Layer):
               kwargs = {}
             if len(computed_data) == 1:
               computed_tensor, computed_mask = computed_data[0]
-              if 'mask' in tf_inspect.getargspec(layer.call).args:
+              if has_arg(layer.call, 'mask'):
                 if 'mask' not in kwargs:
                   kwargs['mask'] = computed_mask
               output_tensors = _to_list(layer.call(computed_tensor, **kwargs))
@@ -1926,7 +1927,7 @@ class Container(Layer):
             else:
               computed_tensors = [x[0] for x in computed_data]
               computed_masks = [x[1] for x in computed_data]
-              if 'mask' in tf_inspect.getargspec(layer.call).args:
+              if has_arg(layer.call, 'mask'):
                 if 'mask' not in kwargs:
                   kwargs['mask'] = computed_masks
               output_tensors = _to_list(layer.call(computed_tensors, **kwargs))
@@ -2351,8 +2352,25 @@ class Container(Layer):
       raise ImportError('Requires yaml module installed.')
     return yaml.dump(self._updated_config(), **kwargs)
 
-  def summary(self, line_length=None, positions=None):
-    print_layer_summary(self, line_length=line_length, positions=positions)
+  def summary(self, line_length=None, positions=None, print_fn=None):
+    """Prints a string summary of the network.
+
+    Arguments:
+        line_length: Total length of printed lines
+            (e.g. set this to adapt the display to different
+            terminal window sizes).
+        positions: Relative or absolute positions of log elements
+            in each line. If not provided,
+            defaults to `[.33, .55, .67, 1.]`.
+        print_fn: Print function to use. Defaults to `print`.
+            It will be called on each line of the summary.
+            You can set it to a custom function
+            in order to capture the string summary.
+    """
+    print_layer_summary(self,
+                        line_length=line_length,
+                        positions=positions,
+                        print_fn=print_fn)
 
 
 def get_source_inputs(tensor, layer=None, node_index=None):
@@ -2609,6 +2627,35 @@ def preprocess_weights_for_loading(layer,
           recurrent_kernel = np.transpose(recurrent_kernel, (2, 3, 1, 0))
         weights = [kernel, recurrent_kernel, bias]
 
+    if layer.__class__.__name__ in ['Model', 'Sequential']:
+      new_weights = []
+      # trainable weights
+      for sublayer in layer.layers:
+        num_weights = len(sublayer.trainable_weights)
+        if num_weights > 0:
+          new_weights.extend(
+              preprocess_weights_for_loading(
+                  layer=sublayer,
+                  weights=weights[:num_weights],
+                  original_keras_version=original_keras_version,
+                  original_backend=original_backend))
+          weights = weights[num_weights:]
+
+      # non-trainable weights
+      for sublayer in layer.layers:
+        num_weights = len([
+            l for l in sublayer.weights if l not in sublayer.trainable_weights
+        ])
+        if num_weights > 0:
+          new_weights.extend(
+              preprocess_weights_for_loading(
+                  layer=sublayer,
+                  weights=weights[:num_weights],
+                  original_keras_version=original_keras_version,
+                  original_backend=original_backend))
+          weights = weights[num_weights:]
+      weights = new_weights
+
   conv_layers = ['Conv1D', 'Conv2D', 'Conv3D', 'Conv2DTranspose', 'ConvLSTM2D']
   if layer.__class__.__name__ in conv_layers:
     if original_backend and K.backend() != original_backend:
diff --git a/tensorflow/contrib/keras/python/keras/engine/topology_test.py b/tensorflow/contrib/keras/python/keras/engine/topology_test.py
index 531ed4be3e3672eb45f982ff6d9bb471bf47d7cc..ec4fa2eed8189bb6fc3065194b8c7d2a8e146d85 100644
--- a/tensorflow/contrib/keras/python/keras/engine/topology_test.py
+++ b/tensorflow/contrib/keras/python/keras/engine/topology_test.py
@@ -195,13 +195,13 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(test_layer.output_shape, (None, 16))
 
     # pylint: disable=pointless-statement
-    with self.assertRaises(Exception):
+    with self.assertRaises(AttributeError):
       dense.input
-    with self.assertRaises(Exception):
+    with self.assertRaises(AttributeError):
       dense.output
-    with self.assertRaises(Exception):
+    with self.assertRaises(AttributeError):
       dense.input_mask
-    with self.assertRaises(Exception):
+    with self.assertRaises(AttributeError):
       dense.output_mask
     # pylint: enable=pointless-statement
 
diff --git a/tensorflow/contrib/keras/python/keras/engine/training.py b/tensorflow/contrib/keras/python/keras/engine/training.py
index 09459fd713c59c74eb8a5d9db9c55ee17f1a376e..1563cf8c41a58d4885cb6a796384106d79b220e8 100644
--- a/tensorflow/contrib/keras/python/keras/engine/training.py
+++ b/tensorflow/contrib/keras/python/keras/engine/training.py
@@ -20,9 +20,6 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
-import multiprocessing
-import threading
-import time
 
 import numpy as np
 import six
@@ -33,18 +30,13 @@ from tensorflow.contrib.keras.python.keras import losses
 from tensorflow.contrib.keras.python.keras import metrics as metrics_module
 from tensorflow.contrib.keras.python.keras import optimizers
 from tensorflow.contrib.keras.python.keras.engine.topology import Container
+from tensorflow.contrib.keras.python.keras.utils.data_utils import GeneratorEnqueuer
+from tensorflow.contrib.keras.python.keras.utils.data_utils import OrderedEnqueuer
+from tensorflow.contrib.keras.python.keras.utils.data_utils import Sequence
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.platform import tf_logging as logging
 
 
-# pylint: disable=g-import-not-at-top
-try:
-  import queue
-except ImportError:
-  import Queue as queue
-# pylint: enable=g-import-not-at-top
-
-
 def _standardize_input_data(data,
                             names,
                             shapes=None,
@@ -115,8 +107,9 @@ def _standardize_input_data(data,
     if len(names) > 1:
       # Case: model expects multiple inputs but only received
       # a single Numpy array.
-      raise ValueError('The model expects ' + str(len(names)) + exception_prefix
-                       + ' arrays, but only received one array. '
+      raise ValueError('The model expects ' + str(len(names)) + ' ' +
+                       exception_prefix +
+                       ' arrays, but only received one array. '
                        'Found: array with shape ' + str(data.shape))
     arrays = [data]
 
@@ -205,7 +198,7 @@ def _standardize_sample_weights(sample_weight, output_names):
                                               'sample_weight')
 
 
-def _check_array_lengths(inputs, targets, weights):
+def _check_array_lengths(inputs, targets, weights=None):
   """Does user input validation for numpy arrays.
 
   Arguments:
@@ -216,29 +209,35 @@ def _check_array_lengths(inputs, targets, weights):
   Raises:
       ValueError: in case of incorrectly formatted data.
   """
-  x_lengths = [x.shape[0] for x in inputs]
-  y_lengths = [y.shape[0] for y in targets]
-  w_lengths = [w.shape[0] for w in weights]
-  set_x = set(x_lengths)
+
+  def set_of_lengths(x):
+    # return a set with the variation between
+    # different shapes, with None => 0
+    if x is None:
+      return {0}
+    else:
+      return set([0 if y is None else y.shape[0] for y in x])
+
+  set_x = set_of_lengths(inputs)
+  set_y = set_of_lengths(targets)
+  set_w = set_of_lengths(weights)
   if len(set_x) > 1:
     raise ValueError('All input arrays (x) should have '
                      'the same number of samples. Got array shapes: ' + str(
                          [x.shape for x in inputs]))
-  set_y = set(y_lengths)
   if len(set_y) > 1:
     raise ValueError('All target arrays (y) should have '
                      'the same number of samples. Got array shapes: ' + str(
                          [y.shape for y in targets]))
-  set_w = set(w_lengths)
-  if len(set_w) > 1:
-    raise ValueError('All sample_weight arrays should have '
-                     'the same number of samples. Got array shapes: ' + str(
-                         [w.shape for w in weights]))
   if set_x and set_y and list(set_x)[0] != list(set_y)[0]:
     raise ValueError('Input arrays should have '
                      'the same number of samples as target arrays. '
                      'Found ' + str(list(set_x)[0]) + ' input samples '
                      'and ' + str(list(set_y)[0]) + ' target samples.')
+  if len(set_w) > 1:
+    raise ValueError('All sample_weight arrays should have '
+                     'the same number of samples. Got array shapes: ' + str(
+                         [w.shape for w in weights]))
   if set_y and set_w and list(set_y)[0] != list(set_w)[0]:
     raise ValueError('Sample_weight arrays should have '
                      'the same number of samples as target arrays. Got ' +
@@ -261,7 +260,7 @@ def _check_loss_and_target_compatibility(targets, loss_fns, output_shapes):
           is incompatible with an output.
   """
   key_losses = {
-      'mean_square_error', 'binary_crossentropy', 'categorical_crossentropy'
+      'mean_squared_error', 'binary_crossentropy', 'categorical_crossentropy'
   }
   for y, loss, shape in zip(targets, loss_fns, output_shapes):
     if loss is None:
@@ -389,21 +388,25 @@ def _slice_arrays(arrays, start=None, stop=None):
   Returns:
       A slice of the array(s).
   """
-  if isinstance(arrays, list):
+  if arrays is None:
+    return [None]
+  elif isinstance(arrays, list):
     if hasattr(start, '__len__'):
       # hdf5 datasets only support list objects as indices
       if hasattr(start, 'shape'):
         start = start.tolist()
-      return [x[start] for x in arrays]
+      return [None if x is None else x[start] for x in arrays]
     else:
-      return [x[start:stop] for x in arrays]
+      return [None if x is None else x[start:stop] for x in arrays]
   else:
     if hasattr(start, '__len__'):
       if hasattr(start, 'shape'):
         start = start.tolist()
       return arrays[start]
-    else:
+    elif hasattr(start, '__getitem__'):
       return arrays[start:stop]
+    else:
+      return [None]
 
 
 def _weighted_masked_objective(fn):
@@ -445,13 +448,12 @@ def _weighted_masked_objective(fn):
       #  to the number of unmasked samples.
       score_array /= K.mean(mask)
 
-    # reduce score_array to same ndim as weight array
-    ndim = K.ndim(score_array)
-    weight_ndim = K.ndim(weights)
-    score_array = K.mean(score_array, axis=list(range(weight_ndim, ndim)))
-
     # apply sample weighting
     if weights is not None:
+      # reduce score_array to same ndim as weight array
+      ndim = K.ndim(score_array)
+      weight_ndim = K.ndim(weights)
+      score_array = K.mean(score_array, axis=list(range(weight_ndim, ndim)))
       score_array *= weights
       score_array /= K.mean(K.cast(K.not_equal(weights, 0), K.floatx()))
     return K.mean(score_array)
@@ -567,7 +569,7 @@ def _standardize_weights(y,
     return sample_weight
   elif isinstance(class_weight, dict):
     if len(y.shape) > 2:
-      raise ValueError('class_weight not supported for '
+      raise ValueError('`class_weight` not supported for '
                        '3+ dimensional targets.')
     if y.shape[1] > 1:
       y_classes = y.argmax(axis=1)
@@ -575,7 +577,18 @@ def _standardize_weights(y,
       y_classes = np.reshape(y, y.shape[0])
     else:
       y_classes = y
-    weights = np.asarray([class_weight[cls] for cls in y_classes])
+
+    weights = np.asarray(
+        [class_weight[cls] for cls in y_classes if cls in class_weight])
+
+    if len(weights) != len(y_classes):
+      # subtract the sets to pick all missing classes
+      existing_classes = set(y_classes)
+      existing_class_weight = set(class_weight.keys())
+      raise ValueError('`class_weight` must contain all classes in the data.'
+                       ' The classes %s exist in the data but not in '
+                       '`class_weight`.' %
+                       (existing_classes - existing_class_weight))
     return weights
   else:
     if sample_weight_mode is None:
@@ -584,97 +597,6 @@ def _standardize_weights(y,
       return np.ones((y.shape[0], y.shape[1]), dtype=K.floatx())
 
 
-class GeneratorEnqueuer(object):
-  """Builds a queue out of a data generator.
-
-  Used in `fit_generator`, `evaluate_generator`, `predict_generator`.
-
-  Arguments:
-      generator: a generator function which endlessly yields data
-      pickle_safe: use multiprocessing if True, otherwise threading
-  """
-
-  def __init__(self, generator, pickle_safe=False):
-    self._generator = generator
-    self._pickle_safe = pickle_safe
-    self._threads = []
-    self._stop_event = None
-    self.queue = None
-
-  def start(self, workers=1, max_q_size=10, wait_time=0.05):
-    """Kicks off threads which add data from the generator into the queue.
-
-    Arguments:
-        workers: number of worker threads
-        max_q_size: queue size (when full, threads could block on put())
-        wait_time: time to sleep in-between calls to put()
-    """
-
-    def data_generator_task():
-      while not self._stop_event.is_set():
-        try:
-          if self._pickle_safe or self.queue.qsize() < max_q_size:
-            generator_output = next(self._generator)
-            self.queue.put(generator_output)
-          else:
-            time.sleep(wait_time)
-        except Exception:
-          self._stop_event.set()
-          raise
-
-    try:
-      if self._pickle_safe:
-        self.queue = multiprocessing.Queue(maxsize=max_q_size)
-        self._stop_event = multiprocessing.Event()
-      else:
-        self.queue = queue.Queue()
-        self._stop_event = threading.Event()
-
-      for _ in range(workers):
-        if self._pickle_safe:
-          # Reset random seed else all children processes
-          # share the same seed
-          np.random.seed()
-          thread = multiprocessing.Process(target=data_generator_task)
-          thread.daemon = True
-        else:
-          thread = threading.Thread(target=data_generator_task)
-        self._threads.append(thread)
-        thread.start()
-    except:
-      self.stop()
-      raise
-
-  def is_running(self):
-    return self._stop_event is not None and not self._stop_event.is_set()
-
-  def stop(self, timeout=None):
-    """Stop running threads and wait for them to exit, if necessary.
-
-    Should be called by the same thread which called start().
-
-    Arguments:
-        timeout: maximum time to wait on thread.join()
-    """
-    if self.is_running():
-      self._stop_event.set()
-
-    for thread in self._threads:
-      if thread.is_alive():
-        if self._pickle_safe:
-          thread.terminate()
-        else:
-          thread.join(timeout)
-
-    if self._pickle_safe:
-      if self.queue is not None:
-        self.queue.close()
-
-    self._threads = []
-    self._stop_event = None
-    self.queue = None
-
-
 class Model(Container):
   """The `Model` class adds training & evaluation routines to a `Container`.
   """
@@ -723,7 +645,7 @@ class Model(Container):
     Raises:
         ValueError: In case of invalid arguments for
             `optimizer`, `loss`, `metrics` or `sample_weight_mode`.
-        RuntimeError: If the model has no loss to optimize.
+        RuntimeError: In case of ill-formulated optimization problem.
     """
     loss = loss or {}
     self.optimizer = optimizers.get(optimizer)
@@ -981,24 +903,22 @@ class Model(Container):
     # Functions for train, test and predict will
     # be compiled lazily when required.
     # This saves time when the user is not using all functions.
+    self._function_kwargs = kwargs
+
     self.train_function = None
     self.test_function = None
     self.predict_function = None
-    self._function_kwargs = kwargs
 
-    # Collected trainable weights and sort them deterministically.
+    # Collected trainable weights, sorted in topological order.
     trainable_weights = self.trainable_weights
-    # Sort weights by name.
-    if trainable_weights:
-      trainable_weights.sort(key=lambda x: x.name)
     self._collected_trainable_weights = trainable_weights
 
   def _make_train_function(self):
     if not hasattr(self, 'train_function'):
       raise RuntimeError('You must compile your model before using it.')
     if self.train_function is None:
-      inputs = (
-          self._feed_inputs + self._feed_targets + self._feed_sample_weights)
+      inputs = (self._feed_inputs +
+                self._feed_targets + self._feed_sample_weights)
       if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
         inputs += [K.learning_phase()]
 
@@ -1016,8 +936,8 @@ class Model(Container):
     if not hasattr(self, 'test_function'):
       raise RuntimeError('You must compile your model before using it.')
     if self.test_function is None:
-      inputs = (
-          self._feed_inputs + self._feed_targets + self._feed_sample_weights)
+      inputs = (self._feed_inputs +
+                self._feed_targets + self._feed_sample_weights)
       if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
         inputs += [K.learning_phase()]
       # Return loss and metrics, no gradient updates.
@@ -1031,7 +951,6 @@ class Model(Container):
   def _make_predict_function(self):
     if not hasattr(self, 'predict_function'):
       self.predict_function = None
-      self._function_kwargs = {}
     if self.predict_function is None:
       if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
         inputs = self._feed_inputs + [K.learning_phase()]
@@ -1039,12 +958,13 @@ class Model(Container):
         inputs = self._feed_inputs
       # Gets network outputs. Does not update weights.
       # Does update the network states.
+      kwargs = getattr(self, '_function_kwargs', {})
       self.predict_function = K.function(
           inputs,
           self.outputs,
           updates=self.state_updates,
           name='predict_function',
-          **self._function_kwargs)
+          **kwargs)
 
   def _fit_loop(self,
                 f,
@@ -1430,6 +1350,7 @@ class Model(Container):
         ValueError: In case of mismatch between the provided input data
             and what the model expects.
     """
+
     # Validate user data.
     x, y, sample_weights = self._standardize_user_data(
         x,
@@ -1468,7 +1389,10 @@ class Model(Container):
 
     elif validation_split and 0. < validation_split < 1.:
       do_validation = True
-      split_at = int(len(x[0]) * (1. - validation_split))
+      if hasattr(x[0], 'shape'):
+        split_at = int(x[0].shape[0] * (1. - validation_split))
+      else:
+        split_at = int(len(x[0]) * (1. - validation_split))
       x, val_x = (_slice_arrays(x, 0, split_at), _slice_arrays(x, split_at))
       y, val_y = (_slice_arrays(y, 0, split_at), _slice_arrays(y, split_at))
       sample_weights, val_sample_weights = (_slice_arrays(
@@ -1725,18 +1649,25 @@ class Model(Container):
                     validation_data=None,
                     validation_steps=None,
                     class_weight=None,
-                    max_q_size=10,
+                    max_queue_size=10,
                     workers=1,
-                    pickle_safe=False,
-                    initial_epoch=0):
+                    use_multiprocessing=False,
+                    initial_epoch=0,
+                    **kwargs):
     """Fits the model on data yielded batch-by-batch by a Python generator.
 
     The generator is run in parallel to the model, for efficiency.
     For instance, this allows you to do real-time data augmentation
     on images on CPU in parallel to training your model on GPU.
 
+    The use of `keras.utils.Sequence` guarantees the ordering
+    and guarantees the single use of every input per epoch when
+    using `use_multiprocessing=True`.
+
     Arguments:
-        generator: a generator.
+        generator: a generator or an instance of Sequence (keras.utils.Sequence)
+                object in order to avoid duplicate data
+                when using multiprocessing.
             The output of the generator must be either
             - a tuple (inputs, targets)
             - a tuple (inputs, targets, sample_weights).
@@ -1761,10 +1692,10 @@ class Model(Container):
             to yield from `generator` before stopping.
         class_weight: dictionary mapping class indices to a weight
             for the class.
-        max_q_size: maximum size for the generator queue
+        max_queue_size: maximum size for the generator queue
         workers: maximum number of processes to spin up
             when using process based threading
-        pickle_safe: if True, use process based threading.
+        use_multiprocessing: if True, use process based threading.
             Note that because
             this implementation relies on multiprocessing,
             you should not pass
@@ -1773,6 +1704,7 @@ class Model(Container):
             easily to children processes.
         initial_epoch: epoch at which to start training
             (useful for resuming a previous training run)
+        **kwargs: support for legacy arguments.
 
     Returns:
         A `History` object.
@@ -1798,6 +1730,19 @@ class Model(Container):
         ValueError: In case the generator yields
             data in an invalid format.
     """
+    # Legacy support
+    if 'max_q_size' in kwargs:
+      max_queue_size = kwargs.pop('max_q_size')
+      logging.warning('The argument `max_q_size` has been renamed '
+                      '`max_queue_size`. Update your method calls accordingly.')
+    if 'pickle_safe' in kwargs:
+      use_multiprocessing = kwargs.pop('pickle_safe')
+      logging.warning('The argument `pickle_safe` has been renamed '
+                      '`use_multiprocessing`. '
+                      'Update your method calls accordingly.')
+    if kwargs:
+      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
+
     wait_time = 0.01  # in seconds
     epoch = initial_epoch
 
@@ -1809,7 +1754,8 @@ class Model(Container):
     # python 2 has 'next', 3 has '__next__'
     # avoid any explicit version checks
     val_gen = (hasattr(validation_data, 'next') or
-               hasattr(validation_data, '__next__'))
+               hasattr(validation_data, '__next__') or
+               isinstance(validation_data, Sequence))
     if val_gen and not validation_steps:
       raise ValueError('When using a generator for validation data, '
                        'you must specify a value for '
@@ -1848,7 +1794,7 @@ class Model(Container):
       elif len(validation_data) == 3:
         val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
       else:
-        raise ValueError('validation_data should be a tuple '
+        raise ValueError('`validation_data` should be a tuple '
                          '`(val_x, val_y, val_sample_weight)` '
                          'or `(val_x, val_y)`. Found: ' + str(validation_data))
       val_x, val_y, val_sample_weights = self._standardize_user_data(
@@ -1858,11 +1804,25 @@ class Model(Container):
         val_data += [0.]
       for cbk in callbacks:
         cbk.validation_data = val_data
+    is_sequence = isinstance(generator, Sequence)
+    if not is_sequence and use_multiprocessing and workers > 1:
+      logging.warning(
+          'Using a generator with `use_multiprocessing=True`'
+          ' may duplicate your data.Please consider using '
+          'the `keras.utils.Sequence` class.')
     enqueuer = None
 
     try:
-      enqueuer = GeneratorEnqueuer(generator, pickle_safe=pickle_safe)
-      enqueuer.start(max_q_size=max_q_size, workers=workers)
+      if is_sequence:
+        enqueuer = OrderedEnqueuer(
+            generator, use_multiprocessing=use_multiprocessing)
+      else:
+        enqueuer = GeneratorEnqueuer(
+            generator,
+            use_multiprocessing=use_multiprocessing,
+            wait_time=wait_time)
+      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+      output_generator = enqueuer.get()
 
       callback_model.stop_training = False
       while epoch < epochs:
@@ -1870,25 +1830,19 @@ class Model(Container):
         steps_done = 0
         batch_index = 0
         while steps_done < steps_per_epoch:
-          generator_output = None
-          while enqueuer.is_running():
-            if not enqueuer.queue.empty():
-              generator_output = enqueuer.queue.get()
-              break
-            else:
-              time.sleep(wait_time)
+          generator_output = next(output_generator)
 
           if not hasattr(generator_output, '__len__'):
-            raise ValueError('output of generator should be '
+            raise ValueError('Output of generator should be '
                              'a tuple `(x, y, sample_weight)` '
                              'or `(x, y)`. Found: ' + str(generator_output))
           if len(generator_output) == 2:
-            x, y = generator_output  # pylint: disable=unpacking-non-sequence
+            x, y = generator_output
             sample_weight = None
           elif len(generator_output) == 3:
-            x, y, sample_weight = generator_output  # pylint: disable=unpacking-non-sequence
+            x, y, sample_weight = generator_output
           else:
-            raise ValueError('output of generator should be '
+            raise ValueError('Output of generator should be '
                              'a tuple `(x, y, sample_weight)` '
                              'or `(x, y)`. Found: ' + str(generator_output))
           # build batch logs
@@ -1924,9 +1878,9 @@ class Model(Container):
               val_outs = self.evaluate_generator(
                   validation_data,
                   validation_steps,
-                  max_q_size=max_q_size,
+                  max_queue_size=max_queue_size,
                   workers=workers,
-                  pickle_safe=pickle_safe)
+                  use_multiprocessing=use_multiprocessing)
             else:
               # No need for try/except because
               # data has already been validated.
@@ -1957,9 +1911,10 @@ class Model(Container):
   def evaluate_generator(self,
                          generator,
                          steps,
-                         max_q_size=10,
+                         max_queue_size=10,
                          workers=1,
-                         pickle_safe=False):
+                         use_multiprocessing=False,
+                         **kwargs):
     """Evaluates the model on a data generator.
 
     The generator should return the same kind of data
@@ -1968,18 +1923,22 @@ class Model(Container):
     Arguments:
         generator: Generator yielding tuples (inputs, targets)
             or (inputs, targets, sample_weights)
+            or an instance of Sequence (keras.utils.Sequence)
+                object in order to avoid duplicate data
+                when using multiprocessing.
         steps: Total number of steps (batches of samples)
             to yield from `generator` before stopping.
-        max_q_size: maximum size for the generator queue
+        max_queue_size: maximum size for the generator queue
         workers: maximum number of processes to spin up
             when using process based threading
-        pickle_safe: if True, use process based threading.
+        use_multiprocessing: if True, use process based threading.
             Note that because
             this implementation relies on multiprocessing,
             you should not pass
             non picklable arguments to the generator
             as they can't be passed
             easily to children processes.
+        **kwargs: support for legacy arguments.
 
     Returns:
         Scalar test loss (if the model has a single output and no metrics)
@@ -1991,38 +1950,58 @@ class Model(Container):
         ValueError: In case the generator yields
             data in an invalid format.
     """
+    # Legacy support
+    if 'max_q_size' in kwargs:
+      max_queue_size = kwargs.pop('max_q_size')
+      logging.warning('The argument `max_q_size` has been renamed '
+                      '`max_queue_size`. Update your method calls accordingly.')
+    if 'pickle_safe' in kwargs:
+      use_multiprocessing = kwargs.pop('pickle_safe')
+      logging.warning('The argument `pickle_safe` has been renamed '
+                      '`use_multiprocessing`. '
+                      'Update your method calls accordingly.')
+    if kwargs:
+      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
+
     self._make_test_function()
 
     steps_done = 0
     wait_time = 0.01
     all_outs = []
     batch_sizes = []
+    is_sequence = isinstance(generator, Sequence)
+    if not is_sequence and use_multiprocessing and workers > 1:
+      logging.warning(
+          'Using a generator with `use_multiprocessing=True`'
+          ' may duplicate your data.Please consider using '
+          'the `keras.utils.Sequence` class.')
     enqueuer = None
 
     try:
-      enqueuer = GeneratorEnqueuer(generator, pickle_safe=pickle_safe)
-      enqueuer.start(workers=workers, max_q_size=max_q_size)
+      if is_sequence:
+        enqueuer = OrderedEnqueuer(
+            generator, use_multiprocessing=use_multiprocessing)
+      else:
+        enqueuer = GeneratorEnqueuer(
+            generator,
+            use_multiprocessing=use_multiprocessing,
+            wait_time=wait_time)
+      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+      output_generator = enqueuer.get()
 
       while steps_done < steps:
-        generator_output = None
-        while enqueuer.is_running():
-          if not enqueuer.queue.empty():
-            generator_output = enqueuer.queue.get()
-            break
-          else:
-            time.sleep(wait_time)
-
+        generator_output = next(output_generator)
         if not hasattr(generator_output, '__len__'):
-          raise ValueError('output of generator should be a tuple '
+          raise ValueError('Output of generator should be a tuple '
                            '(x, y, sample_weight) '
                            'or (x, y). Found: ' + str(generator_output))
         if len(generator_output) == 2:
-          x, y = generator_output  # pylint: disable=unpacking-non-sequence
+          x, y = generator_output
           sample_weight = None
         elif len(generator_output) == 3:
-          x, y, sample_weight = generator_output  # pylint: disable=unpacking-non-sequence
+          x, y, sample_weight = generator_output
         else:
-          raise ValueError('output of generator should be a tuple '
+          raise ValueError('Output of generator should be a tuple '
                            '(x, y, sample_weight) '
                            'or (x, y). Found: ' + str(generator_output))
         outs = self.test_on_batch(x, y, sample_weight=sample_weight)
@@ -2033,6 +2012,9 @@ class Model(Container):
           batch_size = len(list(x.values())[0])
         else:
           batch_size = len(x)
+        if batch_size == 0:
+          raise ValueError('Received an empty batch. '
+                           'Batches should at least contain one item.')
         all_outs.append(outs)
 
         steps_done += 1
@@ -2054,23 +2036,27 @@ class Model(Container):
   def predict_generator(self,
                         generator,
                         steps,
-                        max_q_size=10,
+                        max_queue_size=10,
                         workers=1,
-                        pickle_safe=False,
-                        verbose=0):
+                        use_multiprocessing=False,
+                        verbose=0,
+                        **kwargs):
     """Generates predictions for the input samples from a data generator.
 
     The generator should return the same kind of data as accepted by
     `predict_on_batch`.
 
     Arguments:
-        generator: Generator yielding batches of input samples.
+        generator: Generator yielding batches of input samples
+                or an instance of Sequence (keras.utils.Sequence)
+                object in order to avoid duplicate data
+                when using multiprocessing.
         steps: Total number of steps (batches of samples)
             to yield from `generator` before stopping.
-        max_q_size: Maximum size for the generator queue.
+        max_queue_size: Maximum size for the generator queue.
         workers: Maximum number of processes to spin up
             when using process based threading
-        pickle_safe: If `True`, use process based threading.
+        use_multiprocessing: If `True`, use process based threading.
             Note that because
             this implementation relies on multiprocessing,
             you should not pass
@@ -2078,6 +2064,7 @@ class Model(Container):
             as they can't be passed
             easily to children processes.
         verbose: verbosity mode, 0 or 1.
+        **kwargs: support for legacy arguments.
 
     Returns:
         Numpy array(s) of predictions.
@@ -2086,38 +2073,58 @@ class Model(Container):
         ValueError: In case the generator yields
             data in an invalid format.
     """
+    # Legacy support
+    if 'max_q_size' in kwargs:
+      max_queue_size = kwargs.pop('max_q_size')
+      logging.warning('The argument `max_q_size` has been renamed '
+                      '`max_queue_size`. Update your method calls accordingly.')
+    if 'pickle_safe' in kwargs:
+      use_multiprocessing = kwargs.pop('pickle_safe')
+      logging.warning('The argument `pickle_safe` has been renamed '
+                      '`use_multiprocessing`. '
+                      'Update your method calls accordingly.')
+    if kwargs:
+      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
+
     self._make_predict_function()
 
     steps_done = 0
     wait_time = 0.01
     all_outs = []
+    is_sequence = isinstance(generator, Sequence)
+    if not is_sequence and use_multiprocessing and workers > 1:
+      logging.warning(
+          'Using a generator with `use_multiprocessing=True`'
+          ' may duplicate your data.Please consider using '
+          'the `keras.utils.Sequence` class.')
     enqueuer = None
 
     try:
-      enqueuer = GeneratorEnqueuer(generator, pickle_safe=pickle_safe)
-      enqueuer.start(workers=workers, max_q_size=max_q_size)
+      if is_sequence:
+        enqueuer = OrderedEnqueuer(
+            generator, use_multiprocessing=use_multiprocessing)
+      else:
+        enqueuer = GeneratorEnqueuer(
+            generator,
+            use_multiprocessing=use_multiprocessing,
+            wait_time=wait_time)
+      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+      output_generator = enqueuer.get()
 
       if verbose == 1:
         progbar = Progbar(target=steps)
 
       while steps_done < steps:
-        generator_output = None
-        while enqueuer.is_running():
-          if not enqueuer.queue.empty():
-            generator_output = enqueuer.queue.get()
-            break
-          else:
-            time.sleep(wait_time)
-
+        generator_output = next(output_generator)
         if isinstance(generator_output, tuple):
           # Compatibility with the generators
           # used for training.
           if len(generator_output) == 2:
-            x, _ = generator_output  # pylint: disable=unpacking-non-sequence
+            x, _ = generator_output
           elif len(generator_output) == 3:
-            x, _, _ = generator_output  # pylint: disable=unpacking-non-sequence
+            x, _, _ = generator_output
           else:
-            raise ValueError('output of generator should be '
+            raise ValueError('Output of generator should be '
                              'a tuple `(x, y, sample_weight)` '
                              'or `(x, y)`. Found: ' + str(generator_output))
         else:
diff --git a/tensorflow/contrib/keras/python/keras/engine/training_test.py b/tensorflow/contrib/keras/python/keras/engine/training_test.py
index a23838f7b4f3208842a4acc83dd882f24ae74305..d2aac54c94766723a3807de405eba391420e82d4 100644
--- a/tensorflow/contrib/keras/python/keras/engine/training_test.py
+++ b/tensorflow/contrib/keras/python/keras/engine/training_test.py
@@ -463,6 +463,38 @@ class LossWeightingTest(test.TestCase):
           temporal_x_test[test_ids], temporal_y_test[test_ids], verbose=0)
       self.assertLess(score, ref_score)
 
+  def test_class_weight_wrong_classes(self):
+    num_classes = 5
+    train_samples = 1000
+    test_samples = 1000
+    input_dim = 5
+    timesteps = 3
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.TimeDistributed(
+              keras.layers.Dense(num_classes),
+              input_shape=(timesteps, input_dim)))
+      model.add(keras.layers.Activation('softmax'))
+      model.compile(
+          loss='binary_crossentropy',
+          optimizer='rmsprop')
+
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=train_samples,
+          test_samples=test_samples,
+          input_shape=(input_dim,),
+          num_classes=num_classes)
+      # convert class vectors to binary class matrices
+      y_train = keras.utils.to_categorical(y_train, num_classes)
+      class_weight = dict([(i, 1.) for i in range(num_classes)])
+
+      del class_weight[1]
+      with self.assertRaises(ValueError):
+        model.fit(x_train, y_train,
+                  epochs=0, verbose=0, class_weight=class_weight)
+
 
 class LossMaskingTest(test.TestCase):
 
@@ -654,41 +686,41 @@ class TestGeneratorMethods(test.TestCase):
                         steps_per_epoch=5,
                         epochs=1,
                         verbose=1,
-                        max_q_size=10,
+                        max_queue_size=10,
                         workers=4,
-                        pickle_safe=True)
+                        use_multiprocessing=True)
     model.fit_generator(custom_generator(),
                         steps_per_epoch=5,
                         epochs=1,
                         verbose=1,
-                        max_q_size=10,
-                        pickle_safe=False)
+                        max_queue_size=10,
+                        use_multiprocessing=False)
     model.fit_generator(custom_generator(),
                         steps_per_epoch=5,
                         epochs=1,
                         verbose=1,
-                        max_q_size=10,
-                        pickle_safe=False,
+                        max_queue_size=10,
+                        use_multiprocessing=False,
                         validation_data=custom_generator(),
                         validation_steps=10)
     model.predict_generator(custom_generator(),
                             steps=5,
-                            max_q_size=10,
+                            max_queue_size=10,
                             workers=2,
-                            pickle_safe=True)
+                            use_multiprocessing=True)
     model.predict_generator(custom_generator(),
                             steps=5,
-                            max_q_size=10,
-                            pickle_safe=False)
+                            max_queue_size=10,
+                            use_multiprocessing=False)
     model.evaluate_generator(custom_generator(),
                              steps=5,
-                             max_q_size=10,
+                             max_queue_size=10,
                              workers=2,
-                             pickle_safe=True)
+                             use_multiprocessing=True)
     model.evaluate_generator(custom_generator(),
                              steps=5,
-                             max_q_size=10,
-                             pickle_safe=False)
+                             max_queue_size=10,
+                             use_multiprocessing=False)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/keras/python/keras/initializers.py b/tensorflow/contrib/keras/python/keras/initializers.py
index b0b71e7cb4b1dac6f7edbec45d0cb8760dd40f86..af1dc914bb97266f1e153ad5cb04d459560007f1 100644
--- a/tensorflow/contrib/keras/python/keras/initializers.py
+++ b/tensorflow/contrib/keras/python/keras/initializers.py
@@ -57,6 +57,28 @@ class Identity(Initializer):
     return {'gain': self.gain}
 
 
+def lecun_normal(seed=None):
+  """LeCun normal initializer.
+
+  It draws samples from a truncated normal distribution centered on 0
+  with `stddev = sqrt(1 / fan_in)`
+  where `fan_in` is the number of input units in the weight tensor.
+
+  Arguments:
+      seed: A Python integer. Used to seed the random generator.
+
+  Returns:
+      An initializer.
+
+  References:
+      - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+      - [Efficient
+      Backprop](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
+  """
+  return VarianceScaling(
+      scale=1., mode='fan_in', distribution='normal', seed=seed)
+
+
 def lecun_uniform(seed=None):
   """LeCun uniform initializer.
 
diff --git a/tensorflow/contrib/keras/python/keras/initializers_test.py b/tensorflow/contrib/keras/python/keras/initializers_test.py
index 0a07eddd89ac68f2f408e419f92e189ad525ff1f..f39d2bfd525fbd37055a85ffdf583629a79dcae1 100644
--- a/tensorflow/contrib/keras/python/keras/initializers_test.py
+++ b/tensorflow/contrib/keras/python/keras/initializers_test.py
@@ -103,6 +103,14 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.he_uniform(seed=123), tensor_shape,
                    target_mean=0., target_max=scale, target_min=-scale)
 
+  def test_lecun_normal(self):
+    tensor_shape = (5, 6, 4, 2)
+    with self.test_session():
+      fan_in, _ = init_ops._compute_fans(tensor_shape)
+      scale = np.sqrt(1. / fan_in)
+      self._runner(keras.initializers.lecun_normal(seed=123), tensor_shape,
+                   target_mean=0., target_std=None, target_max=2 * scale)
+
   def test_glorot_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
diff --git a/tensorflow/contrib/keras/python/keras/integration_test.py b/tensorflow/contrib/keras/python/keras/integration_test.py
index bcd844201c172a0602782e1cf75e0b934440029b..32b0a95fe32869c84f18e044d7866e0ae1a4392f 100644
--- a/tensorflow/contrib/keras/python/keras/integration_test.py
+++ b/tensorflow/contrib/keras/python/keras/integration_test.py
@@ -161,6 +161,80 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.70)
 
+  def test_vector_classification_shared_sequential(self):
+    # Test that Sequential models that feature internal updates
+    # and internal losses can be shared.
+    with self.test_session():
+      np.random.seed(1337)
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=200,
+          test_samples=100,
+          input_shape=(10,),
+          num_classes=2)
+      y_train = keras.utils.to_categorical(y_train)
+      y_test = keras.utils.to_categorical(y_test)
+
+      base_model = keras.models.Sequential([
+          keras.layers.Dense(16,
+                             activation='relu',
+                             kernel_regularizer=keras.regularizers.l2(1e-5),
+                             bias_regularizer=keras.regularizers.l2(1e-5),
+                             input_shape=x_train.shape[1:]),
+          keras.layers.BatchNormalization(),
+      ])
+      x = keras.layers.Input(x_train.shape[1:])
+      y = base_model(x)
+      y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
+      model = keras.models.Model(x, y)
+      model.compile(loss='categorical_crossentropy',
+                    optimizer='rmsprop',
+                    metrics=['accuracy'])
+      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
+                          validation_data=(x_test, y_test),
+                          verbose=2)
+      self.assertGreater(history.history['val_acc'][-1], 0.85)
+
+  def test_vector_classification_shared_model(self):
+    # Test that functional models that feature internal updates
+    # and internal losses can be shared.
+    with self.test_session():
+      np.random.seed(1337)
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=200,
+          test_samples=100,
+          input_shape=(10,),
+          num_classes=2)
+      y_train = keras.utils.to_categorical(y_train)
+      y_test = keras.utils.to_categorical(y_test)
+
+      inputs = keras.layers.Input(x_train.shape[1:])
+      x = keras.layers.Dense(16,
+                             activation='relu',
+                             kernel_regularizer=keras.regularizers.l2(1e-5),
+                             bias_regularizer=keras.regularizers.l2(1e-5),
+                             input_shape=x_train.shape[1:])(inputs)
+      x = keras.layers.BatchNormalization()(x)
+      base_model = keras.models.Model(inputs, x)
+
+      x = keras.layers.Input(x_train.shape[1:])
+      y = base_model(x)
+      y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
+      model = keras.models.Model(x, y)
+      model.compile(loss='categorical_crossentropy',
+                    optimizer='rmsprop',
+                    metrics=['accuracy'])
+      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
+                          validation_data=(x_test, y_test),
+                          verbose=2)
+      self.assertGreater(history.history['val_acc'][-1], 0.85)
+
+  def test_embedding_with_clipnorm(self):
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Embedding(input_dim=1, output_dim=1))
+      model.compile(optimizer=keras.optimizers.SGD(clipnorm=0.1), loss='mse')
+      model.fit(np.array([[0]]), np.array([[[0.5]]]), epochs=1)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py b/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py
index 2c957ece4466660cd5e62aa1fcc9dc9f9052091d..55f17ac4e29eee361e29be0c4cd6ee6d33bc5d22 100644
--- a/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py
+++ b/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py
@@ -57,7 +57,7 @@ class LeakyReLU(Layer):
     return K.relu(inputs, alpha=self.alpha)
 
   def get_config(self):
-    config = {'alpha': self.alpha}
+    config = {'alpha': float(self.alpha)}
     base_config = super(LeakyReLU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional.py b/tensorflow/contrib/keras/python/keras/layers/convolutional.py
index 9ee5aa21217d9944cb09885935f1290e74c26eb2..73e87f68dd38a8a7bc1334518a0616e7f8bae6b4 100644
--- a/tensorflow/contrib/keras/python/keras/layers/convolutional.py
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional.py
@@ -316,7 +316,7 @@ class Conv3D(tf_convolutional_layers.Conv3D, Layer):
   When using this layer as the first layer in a model,
   provide the keyword argument `input_shape`
   (tuple of integers, does not include the sample axis),
-  e.g. `input_shape=(128, 128, 128, 3)` for 128x128x128 volumes
+  e.g. `input_shape=(128, 128, 128, 1)` for 128x128x128 volumes
   with a single channel,
   in `data_format="channels_last"`.
 
@@ -324,7 +324,7 @@ class Conv3D(tf_convolutional_layers.Conv3D, Layer):
       filters: Integer, the dimensionality of the output space
           (i.e. the number output of filters in the convolution).
       kernel_size: An integer or tuple/list of 3 integers, specifying the
-          width and height of the 3D convolution window.
+          depth, height and width of the 3D convolution window.
           Can be a single integer to specify the same value for
           all spatial dimensions.
       strides: An integer or tuple/list of 3 integers,
@@ -599,6 +599,163 @@ class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+class Conv3DTranspose(tf_convolutional_layers.Conv3D, Layer):
+  """Transposed convolution layer (sometimes called Deconvolution).
+
+  The need for transposed convolutions generally arises
+  from the desire to use a transformation going in the opposite direction
+  of a normal convolution, i.e., from something that has the shape of the
+  output of some convolution to something that has the shape of its input
+  while maintaining a connectivity pattern that is compatible with
+  said convolution.
+
+  When using this layer as the first layer in a model,
+  provide the keyword argument `input_shape`
+  (tuple of integers, does not include the sample axis),
+  e.g. `input_shape=(128, 128, 128, 3)` for a 128x128x128 volume with 3 channels
+  if `data_format="channels_last"`.
+
+  Arguments:
+      filters: Integer, the dimensionality of the output space
+          (i.e. the number of output filters in the convolution).
+      kernel_size: An integer or tuple/list of 3 integers, specifying the
+          depth, height and width of the 3D convolution window.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+      strides: An integer or tuple/list of 3 integers,
+          specifying the strides of the convolution along the depth, height
+            and width.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+          Specifying any stride value != 1 is incompatible with specifying
+          any `dilation_rate` value != 1.
+      padding: one of `"valid"` or `"same"` (case-insensitive).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, depth, height, width, channels)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, channels, depth, height, width)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+      dilation_rate: an integer or tuple/list of 3 integers, specifying
+          the dilation rate to use for dilated convolution.
+          Can be a single integer to specify the same value for
+          all spatial dimensions.
+          Currently, specifying any `dilation_rate` value != 1 is
+          incompatible with specifying any stride value != 1.
+      activation: Activation function to use
+          (see [activations](../activations.md)).
+          If you don't specify anything, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix
+          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector
+          (see [initializers](../initializers.md)).
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix
+          (see [regularizer](../regularizers.md)).
+      bias_regularizer: Regularizer function applied to the bias vector
+          (see [regularizer](../regularizers.md)).
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation").
+          (see [regularizer](../regularizers.md)).
+      kernel_constraint: Constraint function applied to the kernel matrix
+          (see [constraints](../constraints.md)).
+      bias_constraint: Constraint function applied to the bias vector
+          (see [constraints](../constraints.md)).
+
+  Input shape:
+      5D tensor with shape:
+      `(batch, channels, depth, rows, cols)` if data_format='channels_first'
+      or 5D tensor with shape:
+      `(batch, depth, rows, cols, channels)` if data_format='channels_last'.
+
+  Output shape:
+      5D tensor with shape:
+      `(batch, filters, new_depth, new_rows, new_cols)` if
+        data_format='channels_first'
+      or 5D tensor with shape:
+      `(batch, new_depth, new_rows, new_cols, filters)` if
+        data_format='channels_last'.
+      `depth` and `rows` and `cols` values might have changed due to padding.
+
+  References:
+      - [A guide to convolution arithmetic for deep
+        learning](https://arxiv.org/abs/1603.07285v1)
+      - [Deconvolutional
+        Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf)
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=(1, 1, 1),
+               padding='valid',
+               data_format=None,
+               activation=None,
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    if data_format is None:
+      data_format = K.image_data_format()
+    super(Conv3DTranspose, self).__init__(
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        activation=activations.get(activation),
+        use_bias=use_bias,
+        kernel_initializer=initializers.get(kernel_initializer),
+        bias_initializer=initializers.get(bias_initializer),
+        kernel_regularizer=regularizers.get(kernel_regularizer),
+        bias_regularizer=regularizers.get(bias_regularizer),
+        activity_regularizer=regularizers.get(activity_regularizer),
+        **kwargs)
+    # TODO(fchollet): move weight constraint support to core layers.
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+
+  def build(self, input_shape):
+    super(Conv3DTranspose, self).build(input_shape)
+    # TODO(fchollet): move weight constraint support to core layers.
+    if self.kernel_constraint:
+      self.constraints[self.kernel] = self.kernel_constraint
+    if self.use_bias and self.bias_constraint:
+      self.constraints[self.bias] = self.bias_constraint
+
+  def get_config(self):
+    config = {
+        'filters': self.filters,
+        'kernel_size': self.kernel_size,
+        'strides': self.strides,
+        'padding': self.padding,
+        'data_format': self.data_format,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'bias_constraint': constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(Conv3DTranspose, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
 class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
   """Depthwise separable 2D convolution.
 
@@ -976,7 +1133,7 @@ class ZeroPadding1D(Layer):
 class ZeroPadding2D(Layer):
   """Zero-padding layer for 2D input (e.g. picture).
 
-  This layer can add rows and columns or zeros
+  This layer can add rows and columns of zeros
   at the top, bottom, left and right side of an image tensor.
 
   Arguments:
@@ -1550,4 +1707,6 @@ Convolution2D = Conv2D
 Convolution3D = Conv3D
 SeparableConvolution2D = SeparableConv2D
 Convolution2DTranspose = Conv2DTranspose
+Convolution3DTranspose = Conv3DTranspose
 Deconvolution2D = Deconv2D = Conv2DTranspose
+Deconvolution3D = Deconv3D = Conv3DTranspose
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
index 7e567d3fb02b8ae47f3855748fb4229b57b9a1bf..2a05ac55db2755f27012eef9687662dcaf3cc87e 100644
--- a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
@@ -239,6 +239,66 @@ class Conv2DTransposeTest(test.TestCase):
       self.assertEqual(len(layer.constraints), 2)
 
 
+class Conv3DTransposeTest(test.TestCase):
+
+  def test_conv3d_transpose(self):
+    num_samples = 2
+    filters = 2
+    stack_size = 3
+    num_row = 5
+    num_col = 6
+    depth = 4
+
+    for padding in ['valid', 'same']:
+      for strides in [(1, 1, 1), (2, 2, 2)]:
+        if padding == 'same' and strides != (1, 1, 1):
+          continue
+
+        with self.test_session(use_gpu=True):
+          testing_utils.layer_test(
+              keras.layers.Conv3DTranspose,
+              kwargs={
+                  'filters': filters,
+                  'kernel_size': 3,
+                  'padding': padding,
+                  'strides': strides,
+                  'data_format': 'channels_last'
+              },
+              input_shape=(num_samples, depth, num_row, num_col, stack_size))
+
+  def test_conv3dtranspose_regularization(self):
+    # regularizers
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'strides': 1
+    }
+    with self.test_session(use_gpu=True):
+      layer = keras.layers.Conv3DTranspose(**kwargs)
+      layer.build((None, 5, 5, 5, 2))
+      self.assertEqual(len(layer.losses), 2)
+      layer(keras.backend.variable(np.ones((1, 5, 5, 5, 2))))
+      self.assertEqual(len(layer.losses), 3)
+
+    # constraints
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_constraint': 'max_norm',
+        'bias_constraint': 'max_norm',
+        'strides': 1
+    }
+    with self.test_session(use_gpu=True):
+      layer = keras.layers.Conv3DTranspose(**kwargs)
+      layer.build((None, 5, 5, 5, 2))
+      self.assertEqual(len(layer.constraints), 2)
+
+
 class SeparableConv2DTest(test.TestCase):
 
   def test_separable_conv_2d(self):
diff --git a/tensorflow/contrib/keras/python/keras/layers/core.py b/tensorflow/contrib/keras/python/keras/layers/core.py
index d287fa56d91840e710c93afc0a84b875b9ec3aef..1f9ee384c28fdb0fe339cc638836b7a00798c426 100644
--- a/tensorflow/contrib/keras/python/keras/layers/core.py
+++ b/tensorflow/contrib/keras/python/keras/layers/core.py
@@ -33,9 +33,9 @@ from tensorflow.contrib.keras.python.keras.engine import Layer
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import func_dump
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import func_load
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import has_arg
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import core as tf_core_layers
-from tensorflow.python.util import tf_inspect
 
 
 class Masking(Layer):
@@ -107,7 +107,7 @@ class Dropout(tf_core_layers.Dropout, Layer):
     self.supports_masking = True
     # Inheritance call order:
     # 1) tf.layers.Dropout, 2) keras.layers.Layer, 3) tf.layers.Layer
-    super(Dropout, self).__init__(**kwargs)
+    super(Dropout, self).__init__(rate=rate, noise_shape=noise_shape, seed=seed, **kwargs)
 
   def call(self, inputs, training=None):
     if training is None:
@@ -590,8 +590,7 @@ class Lambda(Layer):
 
   def call(self, inputs, mask=None):
     arguments = self.arguments
-    arg_spec = tf_inspect.getargspec(self.function)
-    if 'mask' in arg_spec.args:
+    if has_arg(self.function, 'mask'):
       arguments['mask'] = mask
     return self.function(inputs, **arguments)
 
@@ -634,6 +633,16 @@ class Lambda(Layer):
     else:
       raise TypeError('Unknown function type:', function_type)
 
+    # If arguments were numpy array, they have been saved as
+    # list. We need to recover the ndarray
+    if 'arguments' in config:
+      for key in config['arguments']:
+        if isinstance(config['arguments'][key], dict):
+          arg_dict = config['arguments'][key]
+          if 'type' in arg_dict and arg_dict['type'] == 'ndarray':
+            # Overwrite the argument with its numpy translation
+            config['arguments'][key] = np.array(arg_dict['value'])
+
     config['function'] = function
     return cls(**config)
 
diff --git a/tensorflow/contrib/keras/python/keras/layers/embeddings.py b/tensorflow/contrib/keras/python/keras/layers/embeddings.py
index bc0bae67d05275346e40791e2a6d58a6b89bdf30..9f617fd3e425ae7eb03a5f92f0ac850a5f3e3cb0 100644
--- a/tensorflow/contrib/keras/python/keras/layers/embeddings.py
+++ b/tensorflow/contrib/keras/python/keras/layers/embeddings.py
@@ -96,7 +96,6 @@ class Embedding(Layer):
                mask_zero=False,
                input_length=None,
                **kwargs):
-    kwargs['dtype'] = 'int32'
     if 'input_shape' not in kwargs:
       if input_length:
         kwargs['input_shape'] = (input_length,)
@@ -120,7 +119,8 @@ class Embedding(Layer):
         initializer=self.embeddings_initializer,
         name='embeddings',
         regularizer=self.embeddings_regularizer,
-        constraint=self.embeddings_constraint)
+        constraint=self.embeddings_constraint,
+        dtype=self.dtype)
     self.built = True
 
   def compute_mask(self, inputs, mask=None):
@@ -131,12 +131,26 @@ class Embedding(Layer):
 
   def _compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if not self.input_length:
-      input_length = input_shape[1]
+    if self.input_length is None:
+      return tensor_shape.TensorShape(input_shape + [self.output_dim])
     else:
-      input_length = self.input_length
-    return tensor_shape.TensorShape(
-        [input_shape[0], input_length, self.output_dim])
+      # input_length can be tuple if input is 3D or higher
+      if isinstance(self.input_length, (list, tuple)):
+        in_lens = list(self.input_length)
+      else:
+        in_lens = [self.input_length]
+      if len(in_lens) != len(input_shape) - 1:
+        ValueError('"input_length" is %s, but received input has shape %s' %
+                   (str(self.input_length), str(input_shape)))
+      else:
+        for i, (s1, s2) in enumerate(zip(in_lens, input_shape[1:])):
+          if s1 is not None and s2 is not None and s1 != s2:
+            ValueError('"input_length" is %s, but received input has shape %s' %
+                       (str(self.input_length), str(input_shape)))
+          elif s1 is None:
+            in_lens[i] = s2
+      return tensor_shape.TensorShape(
+          (input_shape[0],) + tuple(in_lens) + (self.output_dim,))
 
   def call(self, inputs):
     if K.dtype(inputs) != 'int32':
diff --git a/tensorflow/contrib/keras/python/keras/layers/embeddings_test.py b/tensorflow/contrib/keras/python/keras/layers/embeddings_test.py
index ca7ca3efd81f4c38bb5c8be74e06557a86b55806..5d6d386862bf9921a6f0f8b58e494ee04e1643ed 100644
--- a/tensorflow/contrib/keras/python/keras/layers/embeddings_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/embeddings_test.py
@@ -46,6 +46,27 @@ class EmbeddingTest(test.TestCase):
           input_dtype='int32',
           expected_output_dtype='float32')
 
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Embedding,
+          kwargs={'output_dim': 4,
+                  'input_dim': 10,
+                  'mask_zero': True},
+          input_shape=(3, 4, 2),
+          input_dtype='int32',
+          expected_output_dtype='float32')
+
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Embedding,
+          kwargs={'output_dim': 4,
+                  'input_dim': 10,
+                  'mask_zero': True,
+                  'input_length': (None, 2)},
+          input_shape=(3, 4, 2),
+          input_dtype='int32',
+          expected_output_dtype='float32')
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/local.py b/tensorflow/contrib/keras/python/keras/layers/local.py
index 863674c1cbd95f2a93e24297278fc9e60800bc14..31a29cdaf467bde6ad6076f398c98987d67435b6 100644
--- a/tensorflow/contrib/keras/python/keras/layers/local.py
+++ b/tensorflow/contrib/keras/python/keras/layers/local.py
@@ -154,52 +154,30 @@ class LocallyConnected1D(Layer):
     return tensor_shape.TensorShape([input_shape[0], length, self.filters])
 
   def call(self, inputs):
-    stride = self.strides[0]
-    output_length, feature_dim, filters = self.kernel_shape
-
-    xs = []
-    for i in range(output_length):
-      slice_length = slice(i * stride, i * stride + self.kernel_size[0])
-      xs.append(K.reshape(inputs[:, slice_length, :], (1, -1, feature_dim)))
-    x_aggregate = K.concatenate(xs, axis=0)
-    # Shape: `(output_length, batch_size, filters)`.
-    output = K.batch_dot(x_aggregate, self.kernel)
-    output = K.permute_dimensions(output, (1, 0, 2))
+    output = K.local_conv1d(inputs, self.kernel, self.kernel_size, self.strides)
 
     if self.use_bias:
-      output += K.reshape(self.bias, (1, output_length, filters))
+      output = K.bias_add(output, self.bias)
     if self.activation is not None:
       output = self.activation(output)
     return output
 
   def get_config(self):
     config = {
-        'filters':
-            self.filters,
-        'kernel_size':
-            self.kernel_size,
-        'strides':
-            self.strides,
-        'padding':
-            self.padding,
-        'activation':
-            activations.serialize(self.activation),
-        'use_bias':
-            self.use_bias,
-        'kernel_initializer':
-            initializers.serialize(self.kernel_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'kernel_regularizer':
-            regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
+        'filters': self.filters,
+        'kernel_size': self.kernel_size,
+        'strides': self.strides,
+        'padding': self.padding,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint':
-            constraints.serialize(self.kernel_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint)
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'bias_constraint': constraints.serialize(self.bias_constraint)
     }
     base_config = super(LocallyConnected1D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -381,97 +359,35 @@ class LocallyConnected2D(Layer):
           [input_shape[0], rows, cols, self.filters])
 
   def call(self, inputs):
-    stride_row, stride_col = self.strides
-    _, feature_dim, filters = self.kernel_shape
-
-    if self.data_format == 'channels_first':
-      if K.backend() == 'theano':
-        output = []
-        for i in range(self.output_row):
-          for j in range(self.output_col):
-            slice_row = slice(i * stride_row,
-                              i * stride_row + self.kernel_size[0])
-            slice_col = slice(j * stride_col,
-                              j * stride_col + self.kernel_size[1])
-            x_flatten = K.reshape(inputs[:, :, slice_row, slice_col],
-                                  (1, -1, feature_dim))
-            output.append(
-                K.dot(x_flatten, self.kernel[i * self.output_col + j, :, :]))
-        output = K.concatenate(output, axis=0)
-      else:
-        xs = []
-        for i in range(self.output_row):
-          for j in range(self.output_col):
-            slice_row = slice(i * stride_row,
-                              i * stride_row + self.kernel_size[0])
-            slice_col = slice(j * stride_col,
-                              j * stride_col + self.kernel_size[1])
-            xs.append(
-                K.reshape(inputs[:, :, slice_row, slice_col], (1, -1,
-                                                               feature_dim)))
-        x_aggregate = K.concatenate(xs, axis=0)
-        output = K.batch_dot(x_aggregate, self.kernel)
-      output = K.reshape(output, (self.output_row, self.output_col, -1,
-                                  filters))
-      output = K.permute_dimensions(output, (2, 3, 0, 1))
-
-    elif self.data_format == 'channels_last':
-      xs = []
-      for i in range(self.output_row):
-        for j in range(self.output_col):
-          slice_row = slice(i * stride_row,
-                            i * stride_row + self.kernel_size[0])
-          slice_col = slice(j * stride_col,
-                            j * stride_col + self.kernel_size[1])
-          xs.append(
-              K.reshape(inputs[:, slice_row, slice_col, :], (1, -1, feature_dim
-                                                            )))
-      x_aggregate = K.concatenate(xs, axis=0)
-      output = K.batch_dot(x_aggregate, self.kernel)
-      output = K.reshape(output, (self.output_row, self.output_col, -1,
-                                  filters))
-      output = K.permute_dimensions(output, (2, 0, 1, 3))
-
+    output = K.local_conv2d(inputs,
+                            self.kernel,
+                            self.kernel_size,
+                            self.strides,
+                            (self.output_row, self.output_col),
+                            self.data_format)
     if self.use_bias:
-      if self.data_format == 'channels_first':
-        output += K.reshape(self.bias, (1, filters, self.output_row,
-                                        self.output_col))
-      elif self.data_format == 'channels_last':
-        output += K.reshape(self.bias, (1, self.output_row, self.output_col,
-                                        filters))
+      output = K.bias_add(output, self.bias, data_format=self.data_format)
+
     output = self.activation(output)
     return output
 
   def get_config(self):
     config = {
-        'filters':
-            self.filters,
-        'kernel_size':
-            self.kernel_size,
-        'strides':
-            self.strides,
-        'padding':
-            self.padding,
-        'data_format':
-            self.data_format,
-        'activation':
-            activations.serialize(self.activation),
-        'use_bias':
-            self.use_bias,
-        'kernel_initializer':
-            initializers.serialize(self.kernel_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'kernel_regularizer':
-            regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
+        'filters': self.filters,
+        'kernel_size': self.kernel_size,
+        'strides': self.strides,
+        'padding': self.padding,
+        'data_format': self.data_format,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint':
-            constraints.serialize(self.kernel_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint)
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'bias_constraint': constraints.serialize(self.bias_constraint)
     }
     base_config = super(LocallyConnected2D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/contrib/keras/python/keras/layers/lstm_test.py b/tensorflow/contrib/keras/python/keras/layers/lstm_test.py
index 90bf95a781a90d1998237e339a641a690d67933f..04a04838e775bb6e284bbd5c150a7177ccffd0bc 100644
--- a/tensorflow/contrib/keras/python/keras/layers/lstm_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/lstm_test.py
@@ -118,7 +118,7 @@ class LSTMLayerTest(test.TestCase):
       # check that container-level reset_states() works
       model.reset_states()
       out4 = model.predict(np.ones((num_samples, timesteps)))
-      np.testing.assert_allclose(out3, out4, atol=1e-5)
+      self.assertAllClose(out3, out4, atol=1e-5)
 
       # check that the call to `predict` updated the states
       out5 = model.predict(np.ones((num_samples, timesteps)))
@@ -139,7 +139,7 @@ class LSTMLayerTest(test.TestCase):
       right_padded_input[1, -2:] = 0
       out7 = model.predict(right_padded_input)
 
-      np.testing.assert_allclose(out7, out6, atol=1e-5)
+      self.assertAllClose(out7, out6, atol=1e-5)
 
   def test_regularization_LSTM(self):
     embedding_dim = 4
@@ -252,7 +252,7 @@ class LSTMLayerTest(test.TestCase):
       layer.reset_states()
       assert len(layer.states) == num_states
       assert layer.states[0] is not None
-      np.testing.assert_allclose(
+      self.assertAllClose(
           keras.backend.eval(layer.states[0]),
           np.zeros(keras.backend.int_shape(layer.states[0])),
           atol=1e-4)
@@ -261,7 +261,7 @@ class LSTMLayerTest(test.TestCase):
       if len(values) == 1:
         values = values[0]
       layer.reset_states(values)
-      np.testing.assert_allclose(
+      self.assertAllClose(
           keras.backend.eval(layer.states[0]),
           np.ones(keras.backend.int_shape(layer.states[0])),
           atol=1e-4)
@@ -292,6 +292,42 @@ class LSTMLayerTest(test.TestCase):
       targets = np.random.random((num_samples, units))
       model.train_on_batch([inputs] + initial_state, targets)
 
+  def test_return_state(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    with self.test_session():
+      inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+      layer = keras.layers.LSTM(units, return_state=True, stateful=True)
+      outputs = layer(inputs)
+      state = outputs[1:]
+      assert len(state) == num_states
+      model = keras.models.Model(inputs, state[0])
+
+      inputs = np.random.random((num_samples, timesteps, embedding_dim))
+      state = model.predict(inputs)
+      self.assertAllClose(keras.backend.eval(layer.states[0]), state, atol=1e-4)
+
+  def test_state_reuse(self):
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    with self.test_session():
+      inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+      layer = keras.layers.LSTM(units, return_state=True, return_sequences=True)
+      outputs = layer(inputs)
+      output, state = outputs[0], outputs[1:]
+      output = keras.layers.LSTM(units)(output, initial_state=state)
+      model = keras.models.Model(inputs, output)
+
+      inputs = np.random.random((num_samples, timesteps, embedding_dim))
+      outputs = model.predict(inputs)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/noise.py b/tensorflow/contrib/keras/python/keras/layers/noise.py
index adc88a4fce45a37a4c50ec8685b8a95a732bc4af..e3cfa1f711cd6f4f62e230db5984da2c1e833440 100644
--- a/tensorflow/contrib/keras/python/keras/layers/noise.py
+++ b/tensorflow/contrib/keras/python/keras/layers/noise.py
@@ -109,3 +109,65 @@ class GaussianDropout(Layer):
     config = {'rate': self.rate}
     base_config = super(GaussianDropout, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
+
+
+class AlphaDropout(Layer):
+  """Applies Alpha Dropout to the input.
+
+  Alpha Dropout is a `Dropout` that keeps mean and variance of inputs
+  to their original values, in order to ensure the self-normalizing property
+  even after this dropout.
+  Alpha Dropout fits well to Scaled Exponential Linear Units
+  by randomly setting activations to the negative saturation value.
+
+  Arguments:
+      rate: float, drop probability (as with `Dropout`).
+          The multiplicative noise will have
+          standard deviation `sqrt(rate / (1 - rate))`.
+      seed: A Python integer to use as random seed.
+
+  Input shape:
+      Arbitrary. Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      Same shape as input.
+
+  References:
+      - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+  """
+
+  def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
+    super(AlphaDropout, self).__init__(**kwargs)
+    self.rate = rate
+    self.noise_shape = noise_shape
+    self.seed = seed
+    self.supports_masking = True
+
+  def _get_noise_shape(self, inputs):
+    return self.noise_shape if self.noise_shape else K.shape(inputs)
+
+  def call(self, inputs, training=None):
+    if 0. < self.rate < 1.:
+      noise_shape = self._get_noise_shape(inputs)
+      alpha = 1.6732632423543772848170429916717
+      scale = 1.0507009873554804934193349852946
+
+      def dropped_inputs(inputs=inputs, rate=self.rate, seed=self.seed):
+        alpha_p = -alpha * scale
+        kept_idx = K.greater_equal(K.random_uniform(noise_shape, seed=seed),
+                                   rate)
+        kept_idx = K.cast(kept_idx, K.floatx())
+        a = ((1 - rate) * (1 + rate * alpha_p ** 2)) ** -0.5
+        b = -a * alpha_p * rate
+        x = inputs * kept_idx + alpha_p * (1 - kept_idx)
+        return a * x + b
+
+      return K.in_train_phase(dropped_inputs, inputs, training=training)
+    return inputs
+
+  def get_config(self):
+    config = {'rate': self.rate}
+    base_config = super(AlphaDropout, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/contrib/keras/python/keras/layers/noise_test.py b/tensorflow/contrib/keras/python/keras/layers/noise_test.py
index b0257b167a7c622cf55bf1f7c1998c47946dbc57..8fb1339c2ef5134ce936338c2e6baaadae43142a 100644
--- a/tensorflow/contrib/keras/python/keras/layers/noise_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/noise_test.py
@@ -39,6 +39,13 @@ class NoiseLayersTest(test.TestCase):
           kwargs={'rate': 0.5},
           input_shape=(3, 2, 3))
 
+  def test_AlphaDropout(self):
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.AlphaDropout,
+          kwargs={'rate': 0.2},
+          input_shape=(3, 2, 3))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/keras/python/keras/layers/normalization_test.py b/tensorflow/contrib/keras/python/keras/layers/normalization_test.py
index dc410f84d8563aa9e112b270a08c91cf73771930..1a0686800eb2807bb5dc9788c2eac34cdd1a95e1 100644
--- a/tensorflow/contrib/keras/python/keras/layers/normalization_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/normalization_test.py
@@ -94,22 +94,23 @@ class NoiseLayersTest(test.TestCase):
       np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
 
   def test_batchnorm_convnet(self):
-    with self.test_session():
-      model = keras.models.Sequential()
-      norm = keras.layers.BatchNormalization(
-          axis=1, input_shape=(3, 4, 4), momentum=0.8)
-      model.add(norm)
-      model.compile(loss='mse', optimizer='sgd')
-
-      # centered on 5.0, variance 10.0
-      x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 3, 4, 4))
-      model.fit(x, x, epochs=4, verbose=0)
-      out = model.predict(x)
-      out -= np.reshape(keras.backend.eval(norm.beta), (1, 3, 1, 1))
-      out /= np.reshape(keras.backend.eval(norm.gamma), (1, 3, 1, 1))
-
-      np.testing.assert_allclose(np.mean(out, axis=(0, 2, 3)), 0.0, atol=1e-1)
-      np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1)
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        model = keras.models.Sequential()
+        norm = keras.layers.BatchNormalization(
+            axis=1, input_shape=(3, 4, 4), momentum=0.8)
+        model.add(norm)
+        model.compile(loss='mse', optimizer='sgd')
+
+        # centered on 5.0, variance 10.0
+        x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 3, 4, 4))
+        model.fit(x, x, epochs=4, verbose=0)
+        out = model.predict(x)
+        out -= np.reshape(keras.backend.eval(norm.beta), (1, 3, 1, 1))
+        out /= np.reshape(keras.backend.eval(norm.gamma), (1, 3, 1, 1))
+
+        np.testing.assert_allclose(np.mean(out, axis=(0, 2, 3)), 0.0, atol=1e-1)
+        np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1)
 
   def test_shared_batchnorm(self):
     """Test that a BN layer can be shared across different data streams.
diff --git a/tensorflow/contrib/keras/python/keras/layers/recurrent.py b/tensorflow/contrib/keras/python/keras/layers/recurrent.py
index 5e8c23ed3e2c31b1506d09c31d1608a4c503fe37..592e5f5e3aae74d6baf5d0c985f5a8a81c7de4c8 100644
--- a/tensorflow/contrib/keras/python/keras/layers/recurrent.py
+++ b/tensorflow/contrib/keras/python/keras/layers/recurrent.py
@@ -123,6 +123,8 @@ class Recurrent(Layer):
           `[(input_dim, output_dim), (output_dim, output_dim), (output_dim,)]`.
       return_sequences: Boolean. Whether to return the last output
           in the output sequence, or the full sequence.
+      return_state: Boolean. Whether to return the last state
+          in addition to the output.
       go_backwards: Boolean (default False).
           If True, process the input sequence backwards and return the
           reversed sequence.
@@ -166,6 +168,9 @@ class Recurrent(Layer):
       (Optional) 2D tensors with shape `(batch_size, output_dim)`.
 
   Output shape:
+      - if `return_state`: a list of tensors. The first tensor is
+          the output. The remaining tensors are the last states,
+          each with shape `(batch_size, units)`.
       - if `return_sequences`: 3D tensor with shape
           `(batch_size, timesteps, units)`.
       - else, 2D tensor with shape `(batch_size, units)`.
@@ -211,6 +216,7 @@ class Recurrent(Layer):
 
   def __init__(self,
                return_sequences=False,
+               return_state=False,
                go_backwards=False,
                stateful=False,
                unroll=False,
@@ -218,6 +224,7 @@ class Recurrent(Layer):
                **kwargs):
     super(Recurrent, self).__init__(**kwargs)
     self.return_sequences = return_sequences
+    self.return_state = return_state
     self.go_backwards = go_backwards
     self.stateful = stateful
     self.unroll = unroll
@@ -233,18 +240,24 @@ class Recurrent(Layer):
       input_shape = input_shape[0]
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.return_sequences:
-      return tensor_shape.TensorShape(
-          [input_shape[0], input_shape[1], self.units])
+      output_shape = (input_shape[0], input_shape[1], self.units)
     else:
-      return tensor_shape.TensorShape([input_shape[0], self.units])
+      output_shape = (input_shape[0], self.units)
+
+    if self.return_state:
+      state_shape = [tensor_shape.TensorShape(
+          (input_shape[0], self.units)) for _ in self.states]
+      return [tensor_shape.TensorShape(output_shape)] + state_shape
+    return tensor_shape.TensorShape(output_shape)
 
   def compute_mask(self, inputs, mask):
-    if self.return_sequences:
-      if isinstance(mask, list):
-        return mask[0]
-      return mask
-    else:
-      return None
+    if isinstance(mask, list):
+      mask = mask[0]
+    output_mask = mask if self.return_sequences else None
+    if self.return_state:
+      state_mask = [None for _ in self.states]
+      return [output_mask] + state_mask
+    return output_mask
 
   def step(self, inputs, states):
     raise NotImplementedError
@@ -361,10 +374,16 @@ class Recurrent(Layer):
       last_output._uses_learning_phase = True
       outputs._uses_learning_phase = True
 
-    if self.return_sequences:
-      return outputs
-    else:
-      return last_output
+    if not self.return_sequences:
+      outputs = last_output
+
+    if self.return_state:
+      if not isinstance(states, (list, tuple)):
+        states = [states]
+      else:
+        states = list(states)
+      return [outputs] + states
+    return outputs
 
   def reset_states(self, states=None):
     if not self.stateful:
@@ -406,6 +425,7 @@ class Recurrent(Layer):
   def get_config(self):
     config = {
         'return_sequences': self.return_sequences,
+        'return_state': self.return_state,
         'go_backwards': self.go_backwards,
         'stateful': self.stateful,
         'unroll': self.unroll,
@@ -601,36 +621,25 @@ class SimpleRNN(Recurrent):
 
   def get_config(self):
     config = {
-        'units':
-            self.units,
-        'activation':
-            activations.serialize(self.activation),
-        'use_bias':
-            self.use_bias,
-        'kernel_initializer':
-            initializers.serialize(self.kernel_initializer),
+        'units': self.units,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
         'recurrent_initializer':
             initializers.serialize(self.recurrent_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'kernel_regularizer':
-            regularizers.serialize(self.kernel_regularizer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
         'recurrent_regularizer':
             regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint':
-            constraints.serialize(self.kernel_constraint),
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
         'recurrent_constraint':
             constraints.serialize(self.recurrent_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint),
-        'dropout':
-            self.dropout,
-        'recurrent_dropout':
-            self.recurrent_dropout
+        'bias_constraint': constraints.serialize(self.bias_constraint),
+        'dropout': self.dropout,
+        'recurrent_dropout': self.recurrent_dropout
     }
     base_config = super(SimpleRNN, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -902,38 +911,27 @@ class GRU(Recurrent):
 
   def get_config(self):
     config = {
-        'units':
-            self.units,
-        'activation':
-            activations.serialize(self.activation),
+        'units': self.units,
+        'activation': activations.serialize(self.activation),
         'recurrent_activation':
             activations.serialize(self.recurrent_activation),
-        'use_bias':
-            self.use_bias,
-        'kernel_initializer':
-            initializers.serialize(self.kernel_initializer),
+        'use_bias': self.use_bias,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
         'recurrent_initializer':
             initializers.serialize(self.recurrent_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'kernel_regularizer':
-            regularizers.serialize(self.kernel_regularizer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
         'recurrent_regularizer':
             regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint':
-            constraints.serialize(self.kernel_constraint),
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
         'recurrent_constraint':
             constraints.serialize(self.recurrent_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint),
-        'dropout':
-            self.dropout,
-        'recurrent_dropout':
-            self.recurrent_dropout
+        'bias_constraint': constraints.serialize(self.bias_constraint),
+        'dropout': self.dropout,
+        'recurrent_dropout': self.recurrent_dropout
     }
     base_config = super(GRU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -985,7 +983,7 @@ class LSTM(Recurrent):
 
   References:
       - [Long short-term
-        memory](http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf)
+        memory]((http://www.bioinf.jku.at/publications/older/2604.pdf)
         (original 1997 paper)
       - [Supervised sequence labeling with recurrent neural
         networks](http://www.cs.toronto.edu/~graves/preprint.pdf)
@@ -1239,40 +1237,28 @@ class LSTM(Recurrent):
 
   def get_config(self):
     config = {
-        'units':
-            self.units,
-        'activation':
-            activations.serialize(self.activation),
+        'units': self.units,
+        'activation': activations.serialize(self.activation),
         'recurrent_activation':
             activations.serialize(self.recurrent_activation),
-        'use_bias':
-            self.use_bias,
-        'kernel_initializer':
-            initializers.serialize(self.kernel_initializer),
+        'use_bias': self.use_bias,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
         'recurrent_initializer':
             initializers.serialize(self.recurrent_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'unit_forget_bias':
-            self.unit_forget_bias,
-        'kernel_regularizer':
-            regularizers.serialize(self.kernel_regularizer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'unit_forget_bias': self.unit_forget_bias,
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
         'recurrent_regularizer':
             regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint':
-            constraints.serialize(self.kernel_constraint),
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
         'recurrent_constraint':
             constraints.serialize(self.recurrent_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint),
-        'dropout':
-            self.dropout,
-        'recurrent_dropout':
-            self.recurrent_dropout
+        'bias_constraint': constraints.serialize(self.bias_constraint),
+        'dropout': self.dropout,
+        'recurrent_dropout': self.recurrent_dropout
     }
     base_config = super(LSTM, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/contrib/keras/python/keras/layers/wrappers.py b/tensorflow/contrib/keras/python/keras/layers/wrappers.py
index dbc79fb1933d2b083b65f18b4a0547bfe6b5df69..91614c288d4047c421aab917ca6af5a36fe33846 100644
--- a/tensorflow/contrib/keras/python/keras/layers/wrappers.py
+++ b/tensorflow/contrib/keras/python/keras/layers/wrappers.py
@@ -24,8 +24,8 @@ import copy
 from tensorflow.contrib.keras.python.keras import backend as K
 from tensorflow.contrib.keras.python.keras.engine import InputSpec
 from tensorflow.contrib.keras.python.keras.engine import Layer
+from tensorflow.contrib.keras.python.keras.utils.generic_utils import has_arg
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.util import tf_inspect
 
 
 class Wrapper(Layer):
@@ -183,15 +183,29 @@ class TimeDistributed(Wrapper):
     return tensor_shape.TensorShape([child_output_shape[0], timesteps] +
                                     child_output_shape[1:])
 
-  def call(self, inputs, mask=None):
+  def call(self, inputs, training=None, mask=None):
+    kwargs = {}
+    if has_arg(self.layer.call, 'training'):
+      kwargs['training'] = training
+    uses_learning_phase = False  # pylint: disable=redefined-outer-name
+
     input_shape = K.int_shape(inputs)
     if input_shape[0]:
       # batch size matters, use rnn-based implementation
       def step(x, _):
-        output = self.layer.call(x)
+        global uses_learning_phase  # pylint: disable=global-variable-undefined
+        output = self.layer.call(x, **kwargs)
+        if hasattr(output, '_uses_learning_phase'):
+          uses_learning_phase = (output._uses_learning_phase or
+                                 uses_learning_phase)
         return output, []
 
-      _, outputs, _ = K.rnn(step, inputs, initial_states=[], unroll=False)
+      _, outputs, _ = K.rnn(
+          step,
+          inputs,
+          initial_states=[],
+          input_length=input_shape[1],
+          unroll=False)
       y = outputs
     else:
       # No batch size specified, therefore the layer will be able
@@ -202,16 +216,22 @@ class TimeDistributed(Wrapper):
         input_length = K.shape(inputs)[1]
       # Shape: (num_samples * timesteps, ...)
       inputs = K.reshape(inputs, (-1,) + input_shape[2:])
-      y = self.layer.call(inputs)  # (num_samples * timesteps, ...)
+      # (num_samples * timesteps, ...)
+      y = self.layer.call(inputs, **kwargs)
+      if hasattr(y, '_uses_learning_phase'):
+        uses_learning_phase = y._uses_learning_phase
       # Shape: (num_samples, timesteps, ...)
-      output_shape = self._compute_output_shape(input_shape).as_list()  # pylint: disable=protected-access
-      y = K.reshape(y, [-1, input_length] + output_shape[2:])
+      output_shape = self._compute_output_shape(input_shape).as_list()
+      y = K.reshape(y, (-1, input_length) + tuple(output_shape[2:]))
 
     # Apply activity regularizer if any:
     if (hasattr(self.layer, 'activity_regularizer') and
         self.layer.activity_regularizer is not None):
       regularization_loss = self.layer.activity_regularizer(y)
       self.add_loss(regularization_loss, inputs)
+
+    if uses_learning_phase:
+      y._uses_learning_phase = True
     return y
 
 
@@ -285,10 +305,9 @@ class Bidirectional(Wrapper):
 
   def call(self, inputs, training=None, mask=None):
     kwargs = {}
-    func_args = tf_inspect.getargspec(self.layer.call).args
-    if 'training' in func_args:
+    if has_arg(self.layer.call, 'training'):
       kwargs['training'] = training
-    if 'mask' in func_args:
+    if has_arg(self.layer.call, 'mask'):
       kwargs['mask'] = mask
 
     y = self.forward_layer.call(inputs, **kwargs)
diff --git a/tensorflow/contrib/keras/python/keras/layers/wrappers_test.py b/tensorflow/contrib/keras/python/keras/layers/wrappers_test.py
index b892681adabde6b8159561c676d8c15d4b95774b..d4cd1ccbb478fb57aaf4c27b2aef8202aa5a54dc 100644
--- a/tensorflow/contrib/keras/python/keras/layers/wrappers_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/wrappers_test.py
@@ -113,6 +113,16 @@ class TimeDistributedTest(test.TestCase):
       model.compile(optimizer='rmsprop', loss='mse')
       self.assertEqual(len(model.losses), 1)
 
+  def test_TimeDistributed_learning_phase(self):
+    # test layers that need learning_phase to be set
+    np.random.seed(1234)
+    x = keras.layers.Input(shape=(3, 2))
+    y = keras.layers.TimeDistributed(
+        keras.layers.Dropout(.999))(x, training=True)
+    model = keras.models.Model(x, y)
+    y = model.predict(np.random.random((10, 3, 2)))
+    self.assertAllClose(np.mean(y), 0., atol=1e-1, rtol=1e-1)
+
 
 class BidirectionalTest(test.TestCase):
 
diff --git a/tensorflow/contrib/keras/python/keras/metrics.py b/tensorflow/contrib/keras/python/keras/metrics.py
index 93c8684f91072dcb70bd767dfa7ccc2f9bcb578a..999e9cb9d4d03c74da7f4775398afb2135980421 100644
--- a/tensorflow/contrib/keras/python/keras/metrics.py
+++ b/tensorflow/contrib/keras/python/keras/metrics.py
@@ -59,6 +59,11 @@ def top_k_categorical_accuracy(y_true, y_pred, k=5):
   return K.mean(K.in_top_k(y_pred, K.argmax(y_true, axis=-1), k), axis=-1)
 
 
+def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
+  return K.mean(K.in_top_k(y_pred,
+                           K.cast(K.max(y_true, axis=-1), 'int32'), k), axis=-1)
+
+
 # Aliases
 
 mse = MSE = mean_squared_error
diff --git a/tensorflow/contrib/keras/python/keras/metrics_test.py b/tensorflow/contrib/keras/python/keras/metrics_test.py
index ac0a1372c649d5f8f4c6a45274501285639bd322..84c6528174ddf27e44cb3dbc8d1f1d2fcab41890 100644
--- a/tensorflow/contrib/keras/python/keras/metrics_test.py
+++ b/tensorflow/contrib/keras/python/keras/metrics_test.py
@@ -42,6 +42,21 @@ class KerasMetricsTest(test.TestCase):
       y_b = keras.backend.variable(np.random.random((6, 7)))
       self.assertEqual(keras.backend.eval(metric(y_a, y_b)).shape, (6,))
 
+  def test_sparse_top_k_categorical_accuracy(self):
+    with self.test_session():
+      y_pred = keras.backend.variable(np.array([[0.3, 0.2, 0.1],
+                                                [0.1, 0.2, 0.7]]))
+      y_true = keras.backend.variable(np.array([[1], [0]]))
+      result = keras.backend.eval(
+          keras.metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3))
+      self.assertEqual(result, 1)
+      result = keras.backend.eval(
+          keras.metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2))
+      self.assertEqual(result, 0.5)
+      result = keras.backend.eval(
+          keras.metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1))
+      self.assertEqual(result, 0.)
+
   def test_top_k_categorical_accuracy(self):
     with self.test_session():
       y_pred = keras.backend.variable(np.array([[0.3, 0.2, 0.1],
diff --git a/tensorflow/contrib/keras/python/keras/models.py b/tensorflow/contrib/keras/python/keras/models.py
index 0ae373da3cda1541cb999c1c7a62b5de5cb1419e..8864f5e69dc7c69899dcd4a503bb1c9de1cbdcaa 100644
--- a/tensorflow/contrib/keras/python/keras/models.py
+++ b/tensorflow/contrib/keras/python/keras/models.py
@@ -97,7 +97,10 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
 
     # if obj is any numpy type
     if type(obj).__module__ == np.__name__:
-      return obj.item()
+      if isinstance(obj, np.ndarray):
+        return {'type': type(obj), 'value': obj.tolist()}
+      else:
+        return obj.item()
 
     # misc functions (e.g. loss function)
     if callable(obj):
@@ -232,89 +235,79 @@ def load_model(filepath, custom_objects=None, compile=True):  # pylint: disable=
     if isinstance(obj, list):
       deserialized = []
       for value in obj:
-        if value in custom_objects:
-          deserialized.append(custom_objects[value])
-        else:
-          deserialized.append(value)
+        deserialized.append(convert_custom_objects(value))
       return deserialized
     if isinstance(obj, dict):
       deserialized = {}
       for key, value in obj.items():
-        deserialized[key] = []
-        if isinstance(value, list):
-          for element in value:
-            if element in custom_objects:
-              deserialized[key].append(custom_objects[element])
-            else:
-              deserialized[key].append(element)
-        elif value in custom_objects:
-          deserialized[key] = custom_objects[value]
-        else:
-          deserialized[key] = value
+        deserialized[key] = convert_custom_objects(value)
       return deserialized
     if obj in custom_objects:
       return custom_objects[obj]
     return obj
 
-  f = h5py.File(filepath, mode='r')
-
-  # instantiate model
-  model_config = f.attrs.get('model_config')
-  if model_config is None:
-    raise ValueError('No model found in config file.')
-  model_config = json.loads(model_config.decode('utf-8'))
-  model = model_from_config(model_config, custom_objects=custom_objects)
-
-  # set weights
-  topology.load_weights_from_hdf5_group(f['model_weights'], model.layers)
-
-  # Early return if compilation is not required.
-  if not compile:
-    f.close()
-    return model
-
-  # instantiate optimizer
-  training_config = f.attrs.get('training_config')
-  if training_config is None:
-    logging.warning('No training configuration found in save file: '
-                    'the model was *not* compiled. Compile it manually.')
-    f.close()
-    return model
-  training_config = json.loads(training_config.decode('utf-8'))
-  optimizer_config = training_config['optimizer_config']
-  optimizer = optimizers.deserialize(
-      optimizer_config, custom_objects=custom_objects)
-
-  # Recover loss functions and metrics.
-  loss = convert_custom_objects(training_config['loss'])
-  metrics = convert_custom_objects(training_config['metrics'])
-  sample_weight_mode = training_config['sample_weight_mode']
-  loss_weights = training_config['loss_weights']
-
-  # Compile model.
-  model.compile(
-      optimizer=optimizer,
-      loss=loss,
-      metrics=metrics,
-      loss_weights=loss_weights,
-      sample_weight_mode=sample_weight_mode)
-
-  # Set optimizer weights.
-  if 'optimizer_weights' in f:
-    # Build train function (to get weight updates).
-    if isinstance(model, Sequential):
-      model.model._make_train_function()
-    else:
-      model._make_train_function()
-    optimizer_weights_group = f['optimizer_weights']
-    optimizer_weight_names = [
-        n.decode('utf8') for n in optimizer_weights_group.attrs['weight_names']
-    ]
-    optimizer_weight_values = [
-        optimizer_weights_group[n] for n in optimizer_weight_names
-    ]
-    model.optimizer.set_weights(optimizer_weight_values)
-  f.close()
+  with h5py.File(filepath, mode='r') as f:
+    # instantiate model
+    model_config = f.attrs.get('model_config')
+    if model_config is None:
+      raise ValueError('No model found in config file.')
+    model_config = json.loads(model_config.decode('utf-8'))
+    model = model_from_config(model_config, custom_objects=custom_objects)
+
+    # set weights
+    topology.load_weights_from_hdf5_group(f['model_weights'], model.layers)
+
+    # Early return if compilation is not required.
+    if not compile:
+      return model
+
+    # instantiate optimizer
+    training_config = f.attrs.get('training_config')
+    if training_config is None:
+      logging.warning('No training configuration found in save file: '
+                      'the model was *not* compiled. Compile it manually.')
+      return model
+    training_config = json.loads(training_config.decode('utf-8'))
+    optimizer_config = training_config['optimizer_config']
+    optimizer = optimizers.deserialize(
+        optimizer_config, custom_objects=custom_objects)
+
+    # Recover loss functions and metrics.
+    loss = convert_custom_objects(training_config['loss'])
+    metrics = convert_custom_objects(training_config['metrics'])
+    sample_weight_mode = training_config['sample_weight_mode']
+    loss_weights = training_config['loss_weights']
+
+    # Compile model.
+    model.compile(
+        optimizer=optimizer,
+        loss=loss,
+        metrics=metrics,
+        loss_weights=loss_weights,
+        sample_weight_mode=sample_weight_mode)
+
+    # Set optimizer weights.
+    if 'optimizer_weights' in f:
+      # Build train function (to get weight updates).
+      if isinstance(model, Sequential):
+        model.model._make_train_function()
+      else:
+        model._make_train_function()
+      optimizer_weights_group = f['optimizer_weights']
+      optimizer_weight_names = [
+          n.decode('utf8')
+          for n in optimizer_weights_group.attrs['weight_names']
+      ]
+      optimizer_weight_values = [
+          optimizer_weights_group[n] for n in optimizer_weight_names
+      ]
+      try:
+        model.optimizer.set_weights(optimizer_weight_values)
+      except ValueError:
+        logging.warning('Error in loading the saved optimizer '
+                        'state. As a result, your model is '
+                        'starting with a freshly initialized '
+                        'optimizer.')
   return model
 
 
@@ -331,7 +324,7 @@ def model_from_config(config, custom_objects=None):
       A Keras model instance (uncompiled).
 
   Raises:
-      TypeError if `config` is not a dictionary
+      TypeError: if `config` is not a dictionary.
   """
   if isinstance(config, list):
     raise TypeError('`model_from_config` expects a dictionary, not a list. '
@@ -436,6 +429,7 @@ class Sequential(Model):
     # The following properties are not actually used by Keras;
     # they exist for compatibility with TF's variable scoping mechanism.
     self._updates = []
+    self._losses = []
     self._scope = None
     self._reuse = None
     self._base_name = name
@@ -1017,10 +1011,11 @@ class Sequential(Model):
                     validation_data=None,
                     validation_steps=None,
                     class_weight=None,
-                    max_q_size=10,
+                    max_queue_size=10,
                     workers=1,
-                    pickle_safe=False,
-                    initial_epoch=0):
+                    use_multiprocessing=False,
+                    initial_epoch=0,
+                    **kwargs):
     """Fits the model on data generated batch-by-batch by a Python generator.
 
     The generator is run in parallel to the model, for efficiency.
@@ -1056,9 +1051,9 @@ class Sequential(Model):
             validation dataset divided by the batch size.
         class_weight: Dictionary mapping class indices to a weight
             for the class.
-        max_q_size: Maximum size for the generator queue
+        max_queue_size: Maximum size for the generator queue
         workers: Maximum number of processes to spin up
-        pickle_safe: Ff True, use process based threading.
+        use_multiprocessing: If True, use process based threading.
             Note that because
             this implementation relies on multiprocessing,
             you should not pass
@@ -1067,12 +1062,15 @@ class Sequential(Model):
             easily to children processes.
         initial_epoch: Epoch at which to start training
             (useful for resuming a previous training run)
+        **kwargs: support for legacy arguments.
 
     Returns:
         A `History` object.
 
     Raises:
         RuntimeError: if the model was never compiled.
+        ValueError: In case the generator yields
+            data in an invalid format.
 
     Example:
 
@@ -1091,6 +1089,19 @@ class Sequential(Model):
                             steps_per_epoch=1000, epochs=10)
     ```
     """
+    # Legacy support
+    if 'max_q_size' in kwargs:
+      max_queue_size = kwargs.pop('max_q_size')
+      logging.warning('The argument `max_q_size` has been renamed '
+                      '`max_queue_size`. Update your method calls accordingly.')
+    if 'pickle_safe' in kwargs:
+      use_multiprocessing = kwargs.pop('pickle_safe')
+      logging.warning('The argument `pickle_safe` has been renamed '
+                      '`use_multiprocessing`. '
+                      'Update your method calls accordingly.')
+    if kwargs:
+      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
+
     if self.model is None:
       raise RuntimeError('The model needs to be compiled ' 'before being used.')
     return self.model.fit_generator(
@@ -1102,17 +1113,18 @@ class Sequential(Model):
         validation_data=validation_data,
         validation_steps=validation_steps,
         class_weight=class_weight,
-        max_q_size=max_q_size,
+        max_queue_size=max_queue_size,
         workers=workers,
-        pickle_safe=pickle_safe,
+        use_multiprocessing=use_multiprocessing,
         initial_epoch=initial_epoch)
 
   def evaluate_generator(self,
                          generator,
                          steps,
-                         max_q_size=10,
+                         max_queue_size=10,
                          workers=1,
-                         pickle_safe=False):
+                         use_multiprocessing=False,
+                         **kwargs):
     """Evaluates the model on a data generator.
 
     The generator should return the same kind of data
@@ -1123,13 +1135,14 @@ class Sequential(Model):
             or (inputs, targets, sample_weights)
         steps: Total number of steps (batches of samples)
             to yield from `generator` before stopping.
-        max_q_size: maximum size for the generator queue
+        max_queue_size: maximum size for the generator queue
         workers: maximum number of processes to spin up
-        pickle_safe: if True, use process based threading.
+        use_multiprocessing: if True, use process based threading.
             Note that because this implementation
             relies on multiprocessing, you should not pass
             non picklable arguments to the generator
             as they can't be passed easily to children processes.
+        **kwargs: support for legacy arguments.
 
     Returns:
         Scalar test loss (if the model has no metrics)
@@ -1139,23 +1152,39 @@ class Sequential(Model):
 
     Raises:
         RuntimeError: if the model was never compiled.
+        ValueError: In case the generator yields
+            data in an invalid format.
     """
+    # Legacy support
+    if 'max_q_size' in kwargs:
+      max_queue_size = kwargs.pop('max_q_size')
+      logging.warning('The argument `max_q_size` has been renamed '
+                      '`max_queue_size`. Update your method calls accordingly.')
+    if 'pickle_safe' in kwargs:
+      use_multiprocessing = kwargs.pop('pickle_safe')
+      logging.warning('The argument `pickle_safe` has been renamed '
+                      '`use_multiprocessing`. '
+                      'Update your method calls accordingly.')
+    if kwargs:
+      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
+
     if self.model is None:
       raise RuntimeError('The model needs to be compiled ' 'before being used.')
     return self.model.evaluate_generator(
         generator,
         steps,
-        max_q_size=max_q_size,
+        max_queue_size=max_queue_size,
         workers=workers,
-        pickle_safe=pickle_safe)
+        use_multiprocessing=use_multiprocessing)
 
   def predict_generator(self,
                         generator,
                         steps,
-                        max_q_size=10,
+                        max_queue_size=10,
                         workers=1,
-                        pickle_safe=False,
-                        verbose=0):
+                        use_multiprocessing=False,
+                        verbose=0,
+                        **kwargs):
     """Generates predictions for the input samples from a data generator.
 
     The generator should return the same kind of data as accepted by
@@ -1165,26 +1194,44 @@ class Sequential(Model):
         generator: generator yielding batches of input samples.
         steps: Total number of steps (batches of samples)
             to yield from `generator` before stopping.
-        max_q_size: maximum size for the generator queue
+        max_queue_size: maximum size for the generator queue
         workers: maximum number of processes to spin up
-        pickle_safe: if True, use process based threading.
+        use_multiprocessing: if True, use process based threading.
             Note that because this implementation
             relies on multiprocessing, you should not pass
             non picklable arguments to the generator
             as they can't be passed easily to children processes.
         verbose: verbosity mode, 0 or 1.
+        **kwargs: support for legacy arguments.
 
     Returns:
         A Numpy array of predictions.
+
+    Raises:
+        ValueError: In case the generator yields
+            data in an invalid format.
     """
+    # Legacy support
+    if 'max_q_size' in kwargs:
+      max_queue_size = kwargs.pop('max_q_size')
+      logging.warning('The argument `max_q_size` has been renamed '
+                      '`max_queue_size`. Update your method calls accordingly.')
+    if 'pickle_safe' in kwargs:
+      use_multiprocessing = kwargs.pop('pickle_safe')
+      logging.warning('The argument `pickle_safe` has been renamed '
+                      '`use_multiprocessing`. '
+                      'Update your method calls accordingly.')
+    if kwargs:
+      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
+
     if self.model is None:
       self.build()
     return self.model.predict_generator(
         generator,
         steps,
-        max_q_size=max_q_size,
+        max_queue_size=max_queue_size,
         workers=workers,
-        pickle_safe=pickle_safe,
+        use_multiprocessing=use_multiprocessing,
         verbose=verbose)
 
   def get_config(self):
diff --git a/tensorflow/contrib/keras/python/keras/models_test.py b/tensorflow/contrib/keras/python/keras/models_test.py
index 99fd6e1cbe1bbcb6494a06bf26d06edd03f4507b..f7246097ee00e1071047af06705d6ed6469d67cb 100644
--- a/tensorflow/contrib/keras/python/keras/models_test.py
+++ b/tensorflow/contrib/keras/python/keras/models_test.py
@@ -163,6 +163,27 @@ class TestModelSaving(test.TestCase):
       model = keras.models.load_model(fname)
       os.remove(fname)
 
+  def test_saving_lambda_numpy_array_arguments(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    mean = np.random.random((4, 2, 3))
+    std = np.abs(np.random.random((4, 2, 3))) + 1e-5
+    inputs = keras.layers.Input(shape=(4, 2, 3))
+    output = keras.layers.Lambda(lambda image, mu, std: (image - mu) / std,
+                                 arguments={'mu': mean, 'std': std})(inputs)
+    model = keras.models.Model(inputs, output)
+    model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
+
+    _, fname = tempfile.mkstemp('.h5')
+    keras.models.save_model(model, fname)
+
+    model = keras.models.load_model(fname)
+    os.remove(fname)
+
+    self.assertAllClose(mean, model.layers[1].arguments['mu'])
+    self.assertAllClose(std, model.layers[1].arguments['std'])
+
 
 class TestSequential(test.TestCase):
   """Most Sequential model API tests are covered in `training_test.py`.
diff --git a/tensorflow/contrib/keras/python/keras/optimizers.py b/tensorflow/contrib/keras/python/keras/optimizers.py
index 75fce5c96f6bdb766239ccc59de18a24e7ebdd3f..a1bd3be026c5d8244af1320aa0ce7acd7f4be26f 100644
--- a/tensorflow/contrib/keras/python/keras/optimizers.py
+++ b/tensorflow/contrib/keras/python/keras/optimizers.py
@@ -18,18 +18,49 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import copy
+
 import six
 from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.keras.python.keras import backend as K
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.framework import dtypes as dtypes_module
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer as tf_optimizer_module
 
 
 def clip_norm(g, c, n):
+  """Clip a tensor by norm.
+
+  Arguments:
+    g: gradient tensor to clip.
+    c: clipping threshold.
+    n: norm of gradient tensor.
+
+  Returns:
+    Clipped gradient tensor.
+  """
   if c > 0:
-    g = K.switch(n >= c, g * c / n, g)
+    condition = n >= c
+    then_expression = lambda: math_ops.scalar_mul(c / n, g)
+    else_expression = lambda: g
+
+    # saving the shape to avoid converting sparse tensor to dense
+    if isinstance(g, ops.Tensor):
+      g_shape = copy.copy(g.get_shape())
+    elif isinstance(g, ops.IndexedSlices):
+      g_shape = copy.copy(g.dense_shape)
+    if condition.dtype != dtypes_module.bool:
+      condition = math_ops.cast(condition, 'bool')
+    g = control_flow_ops.cond(condition, then_expression, else_expression)
+    if isinstance(g, ops.Tensor):
+      g.set_shape(g_shape)
+    elif isinstance(g, ops.IndexedSlices):
+      g._dense_shape = g_shape  # pylint: disable=protected-access
   return g
 
 
diff --git a/tensorflow/contrib/keras/python/keras/optimizers_test.py b/tensorflow/contrib/keras/python/keras/optimizers_test.py
index af5e3c99b96344db7d410d7ff5e31d5f60fa64e9..bb598f30373e797a7850d232d11ac2ace3150b05 100644
--- a/tensorflow/contrib/keras/python/keras/optimizers_test.py
+++ b/tensorflow/contrib/keras/python/keras/optimizers_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.contrib.keras.python import keras
 from tensorflow.contrib.keras.python.keras import testing_utils
 from tensorflow.python.platform import test
+from tensorflow.python.training.adam import AdamOptimizer
 
 
 def _get_model(input_dim, num_hidden, output_dim):
@@ -103,5 +104,28 @@ class KerasOptimizersTest(test.TestCase):
                                            momentum=0.9,
                                            clipvalue=0.5))
 
+  def test_tfoptimizer(self):
+    optimizer = keras.optimizers.TFOptimizer(AdamOptimizer)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(
+        2, input_shape=(3,), kernel_constraint=keras.constraints.MaxNorm(1)))
+    # This is possible
+    model.compile(loss='mean_squared_error', optimizer=optimizer)
+    # TF optimizers do not support weights constraints
+    with self.assertRaises(ValueError):
+      model.fit(np.random.random((5, 3)),
+                np.random.random((5, 2)),
+                epochs=1,
+                batch_size=5,
+                verbose=0)
+    # not supported
+    with self.assertRaises(NotImplementedError):
+      _ = optimizer.weights
+    with self.assertRaises(NotImplementedError):
+      optimizer.get_config()
+    with self.assertRaises(NotImplementedError):
+      optimizer.from_config(None)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/image.py b/tensorflow/contrib/keras/python/keras/preprocessing/image.py
index 0d69396e8b596f84a5f66ac8526120a14117b32b..4f2cff804e56aa7716e3368a2fcd4b0ecb45a49d 100644
--- a/tensorflow/contrib/keras/python/keras/preprocessing/image.py
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/image.py
@@ -21,6 +21,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from functools import partial
+import multiprocessing.pool
 import os
 import re
 import threading
@@ -178,7 +180,7 @@ def random_zoom(x,
       ValueError: if `zoom_range` isn't a tuple.
   """
   if len(zoom_range) != 2:
-    raise ValueError('zoom_range should be a tuple or list of two floats. '
+    raise ValueError('`zoom_range` should be a tuple or list of two floats. '
                      'Received arg: ', zoom_range)
 
   if zoom_range[0] == 1 and zoom_range[1] == 1:
@@ -408,8 +410,9 @@ class ImageDataGenerator(object):
       horizontal_flip: whether to randomly flip images horizontally.
       vertical_flip: whether to randomly flip images vertically.
       rescale: rescaling factor. If None or 0, no rescaling is applied,
-          otherwise we multiply the data by the value provided
-          (before applying any other transformation).
+          otherwise we multiply the data by the value provided. This is
+          applied after the `preprocessing_function` (if any provided)
+          but before any other transformation.
       preprocessing_function: function that will be implied on each input.
           The function will run before any other modification on it.
           The function should take one argument:
@@ -466,8 +469,8 @@ class ImageDataGenerator(object):
 
     if data_format not in {'channels_last', 'channels_first'}:
       raise ValueError(
-          'data_format should be "channels_last" (channel after row and '
-          'column) or "channels_first" (channel before row and column). '
+          '`data_format` should be `"channels_last"` (channel after row and '
+          'column) or `"channels_first"` (channel before row and column). '
           'Received arg: ', data_format)
     self.data_format = data_format
     if data_format == 'channels_first':
@@ -488,7 +491,7 @@ class ImageDataGenerator(object):
     elif len(zoom_range) == 2:
       self.zoom_range = [zoom_range[0], zoom_range[1]]
     else:
-      raise ValueError('zoom_range should be a float or '
+      raise ValueError('`zoom_range` should be a float or '
                        'a tuple or list of two floats. '
                        'Received arg: ', zoom_range)
 
@@ -590,11 +593,12 @@ class ImageDataGenerator(object):
                         'first by calling `.fit(numpy_data)`.')
     return x
 
-  def random_transform(self, x):
+  def random_transform(self, x, seed=None):
     """Randomly augment a single image tensor.
 
     Arguments:
         x: 3D tensor, single image.
+        seed: random seed.
 
     Returns:
         A randomly transformed version of the input (same shape).
@@ -610,6 +614,9 @@ class ImageDataGenerator(object):
     img_col_axis = self.col_axis - 1
     img_channel_axis = self.channel_axis - 1
 
+    if seed is not None:
+      np.random.seed(seed)
+
     # use composition of homographies
     # to generate final transform that needs to be applied
     if self.rotation_range:
@@ -709,8 +716,8 @@ class ImageDataGenerator(object):
     if x.ndim != 4:
       raise ValueError('Input to `.fit()` should have rank 4. '
                        'Got array with shape: ' + str(x.shape))
-    if x.shape[self.channel_axis] not in {1, 3, 4}:
-      raise ValueError(
+    if x.shape[self.channel_axis] not in {3, 4}:
+      logging.warning(
           'Expected input to be images (as Numpy array) '
           'following the data format convention "' + self.data_format + '" '
           '(channels on axis ' + str(self.channel_axis) + '), i.e. expected '
@@ -911,6 +918,81 @@ class NumpyArrayIterator(Iterator):
     return batch_x, batch_y
 
 
+def _count_valid_files_in_directory(directory, white_list_formats,
+                                    follow_links):
+  """Count files with extension in `white_list_formats` in a directory.
+
+  Arguments:
+      directory: absolute path to the directory containing files to be counted
+      white_list_formats: set of strings containing allowed extensions for
+          the files to be counted.
+      follow_links: boolean.
+
+  Returns:
+      the count of files with extension in `white_list_formats` contained in
+      the directory.
+  """
+
+  def _recursive_list(subpath):
+    return sorted(
+        os.walk(subpath, followlinks=follow_links), key=lambda tpl: tpl[0])
+
+  samples = 0
+  for _, _, files in _recursive_list(directory):
+    for fname in files:
+      is_valid = False
+      for extension in white_list_formats:
+        if fname.lower().endswith('.' + extension):
+          is_valid = True
+          break
+      if is_valid:
+        samples += 1
+  return samples
+
+
+def _list_valid_filenames_in_directory(directory, white_list_formats,
+                                       class_indices, follow_links):
+  """List paths of files in `subdir` with extensions in `white_list_formats`.
+
+  Arguments:
+      directory: absolute path to a directory containing the files to list.
+          The directory name is used as class label and must be a key of
+            `class_indices`.
+      white_list_formats: set of strings containing allowed extensions for
+          the files to be counted.
+      class_indices: dictionary mapping a class name to its index.
+      follow_links: boolean.
+
+  Returns:
+      classes: a list of class indices
+      filenames: the path of valid files in `directory`, relative from
+          `directory`'s parent (e.g., if `directory` is "dataset/class1",
+          the filenames will be ["class1/file1.jpg", "class1/file2.jpg", ...]).
+  """
+
+  def _recursive_list(subpath):
+    return sorted(
+        os.walk(subpath, followlinks=follow_links), key=lambda tpl: tpl[0])
+
+  classes = []
+  filenames = []
+  subdir = os.path.basename(directory)
+  basedir = os.path.dirname(directory)
+  for root, _, files in _recursive_list(directory):
+    for fname in files:
+      is_valid = False
+      for extension in white_list_formats:
+        if fname.lower().endswith('.' + extension):
+          is_valid = True
+          break
+      if is_valid:
+        classes.append(class_indices[subdir])
+        # add filename relative to directory
+        absolute_path = os.path.join(root, fname)
+        filenames.append(os.path.relpath(absolute_path, basedir))
+  return classes, filenames
+
+
 class DirectoryIterator(Iterator):
   """Iterator capable of reading images from a directory on disk.
 
@@ -1007,43 +1089,35 @@ class DirectoryIterator(Iterator):
     self.num_class = len(classes)
     self.class_indices = dict(zip(classes, range(len(classes))))
 
-    def _recursive_list(subpath):
-      return sorted(
-          os.walk(subpath, followlinks=follow_links), key=lambda tpl: tpl[0])
-
-    for subdir in classes:
-      subpath = os.path.join(directory, subdir)
-      for root, _, files in _recursive_list(subpath):
-        for fname in files:
-          is_valid = False
-          for extension in white_list_formats:
-            if fname.lower().endswith('.' + extension):
-              is_valid = True
-              break
-          if is_valid:
-            self.samples += 1
+    pool = multiprocessing.pool.ThreadPool()
+    function_partial = partial(
+        _count_valid_files_in_directory,
+        white_list_formats=white_list_formats,
+        follow_links=follow_links)
+    self.samples = sum(
+        pool.map(function_partial, (os.path.join(directory, subdir)
+                                    for subdir in classes)))
+
     print('Found %d images belonging to %d classes.' % (self.samples,
                                                         self.num_class))
 
     # second, build an index of the images in the different class subfolders
+    results = []
+
     self.filenames = []
     self.classes = np.zeros((self.samples,), dtype='int32')
     i = 0
-    for subdir in classes:
-      subpath = os.path.join(directory, subdir)
-      for root, _, files in _recursive_list(subpath):
-        for fname in files:
-          is_valid = False
-          for extension in white_list_formats:
-            if fname.lower().endswith('.' + extension):
-              is_valid = True
-              break
-          if is_valid:
-            self.classes[i] = self.class_indices[subdir]
-            i += 1
-            # add filename relative to directory
-            absolute_path = os.path.join(root, fname)
-            self.filenames.append(os.path.relpath(absolute_path, directory))
+    for dirpath in (os.path.join(directory, subdir) for subdir in classes):
+      results.append(
+          pool.apply_async(_list_valid_filenames_in_directory, (
+              dirpath, white_list_formats, self.class_indices, follow_links)))
+    for res in results:
+      classes, filenames = res.get()
+      self.classes[i:i + len(classes)] = classes
+      self.filenames += filenames
+      i += len(classes)
+    pool.close()
+    pool.join()
     super(DirectoryIterator, self).__init__(self.samples, batch_size, shuffle,
                                             seed)
 
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/image_test.py b/tensorflow/contrib/keras/python/keras/preprocessing/image_test.py
index 0dedf8f8500f710530870690f4b08c510f3eaad9..94768f525883b3ebb5f8f806409992c662f6ceab 100644
--- a/tensorflow/contrib/keras/python/keras/preprocessing/image_test.py
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/image_test.py
@@ -99,22 +99,10 @@ class TestImage(test.TestCase):
     with self.assertRaises(ValueError):
       x = np.random.random((3, 10, 10))
       generator.fit(x)
-    with self.assertRaises(ValueError):
-      x = np.random.random((32, 3, 10, 10))
-      generator.fit(x)
-    with self.assertRaises(ValueError):
-      x = np.random.random((32, 10, 10, 5))
-      generator.fit(x)
     # Test flow with invalid data
-    with self.assertRaises(ValueError):
-      x = np.random.random((32, 10, 10, 5))
-      generator.flow(np.arange(x.shape[0]))
     with self.assertRaises(ValueError):
       x = np.random.random((32, 10, 10))
       generator.flow(np.arange(x.shape[0]))
-    with self.assertRaises(ValueError):
-      x = np.random.random((32, 3, 10, 10))
-      generator.flow(np.arange(x.shape[0]))
 
   def test_image_data_generator_fit(self):
     generator = keras.preprocessing.image.ImageDataGenerator(
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/text.py b/tensorflow/contrib/keras/python/keras/preprocessing/text.py
index 93e629af17baf9986c49d0403e3712c630686d1c..ed00eef6ad8b4d36ef4aeac49dbe4d80f9e90cbc 100644
--- a/tensorflow/contrib/keras/python/keras/preprocessing/text.py
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/text.py
@@ -21,6 +21,7 @@ from __future__ import division
 from __future__ import print_function
 
 from collections import OrderedDict
+from hashlib import md5
 import string
 import sys
 
@@ -61,8 +62,45 @@ def one_hot(text,
             filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
             lower=True,
             split=' '):
+  return hashing_trick(
+      text, n, hash_function=hash, filters=filters, lower=lower, split=split)
+
+
+def hashing_trick(text,
+                  n,
+                  hash_function=None,
+                  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
+                  lower=True,
+                  split=' '):
+  """Converts a text to a sequence of indexes in a fixed-size hashing space.
+
+  Arguments:
+      text: Input text (string).
+      n: Dimension of the hashing space.
+      hash_function: if `None` uses python `hash` function, can be 'md5' or
+          any function that takes in input a string and returns a int.
+          Note that `hash` is not a stable hashing function, so
+          it is not consistent across different runs, while 'md5'
+          is a stable hashing function.
+      filters: Sequence of characters to filter out.
+      lower: Whether to convert the input to lowercase.
+      split: Sentence split marker (string).
+
+  Returns:
+      A list of integer word indices (unicity non-guaranteed).
+
+  `0` is a reserved index that won't be assigned to any word.
+
+  Two or more words may be assigned to the same index, due to possible
+  collisions by the hashing function.
+  """
+  if hash_function is None:
+    hash_function = hash
+  elif hash_function == 'md5':
+    hash_function = lambda w: int(md5(w.encode()).hexdigest(), 16)
+
   seq = text_to_word_sequence(text, filters=filters, lower=lower, split=split)
-  return [(abs(hash(w)) % (n - 1) + 1) for w in seq]
+  return [(hash_function(w) % (n - 1) + 1) for w in seq]
 
 
 class Tokenizer(object):
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/text_test.py b/tensorflow/contrib/keras/python/keras/preprocessing/text_test.py
index e94b9019b281912fc63b73514d8fb212d54e48bc..7b26219e61bba0c0503e8a886e78810d8fad23fa 100644
--- a/tensorflow/contrib/keras/python/keras/preprocessing/text_test.py
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/text_test.py
@@ -30,8 +30,8 @@ class TestText(test.TestCase):
     text = 'The cat sat on the mat.'
     encoded = keras.preprocessing.text.one_hot(text, 5)
     self.assertEqual(len(encoded), 6)
-    assert np.max(encoded) <= 4
-    assert np.min(encoded) >= 0
+    self.assertLessEqual(np.max(encoded), 4)
+    self.assertGreaterEqual(np.min(encoded), 0)
 
   def test_tokenizer(self):
     texts = [
@@ -45,7 +45,7 @@ class TestText(test.TestCase):
     sequences = []
     for seq in tokenizer.texts_to_sequences_generator(texts):
       sequences.append(seq)
-    assert np.max(np.max(sequences)) < 10
+    self.assertLess(np.max(np.max(sequences)), 10)
     self.assertEqual(np.min(np.min(sequences)), 1)
 
     tokenizer.fit_on_sequences(sequences)
@@ -54,6 +54,21 @@ class TestText(test.TestCase):
       matrix = tokenizer.texts_to_matrix(texts, mode)
       self.assertEqual(matrix.shape, (3, 10))
 
+  def test_hashing_trick_hash(self):
+    text = 'The cat sat on the mat.'
+    encoded = keras.preprocessing.text.hashing_trick(text, 5)
+    self.assertEqual(len(encoded), 6)
+    self.assertLessEqual(np.max(encoded), 4)
+    self.assertGreaterEqual(np.min(encoded), 1)
+
+  def test_hashing_trick_md5(self):
+    text = 'The cat sat on the mat.'
+    encoded = keras.preprocessing.text.hashing_trick(
+        text, 5, hash_function='md5')
+    self.assertEqual(len(encoded), 6)
+    self.assertLessEqual(np.max(encoded), 4)
+    self.assertGreaterEqual(np.min(encoded), 1)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/keras/python/keras/testing_utils.py b/tensorflow/contrib/keras/python/keras/testing_utils.py
index bf6f661adff4a22626763f207ef91839766da774..2f51ace945fbce636f947659162af186da453ae0 100644
--- a/tensorflow/contrib/keras/python/keras/testing_utils.py
+++ b/tensorflow/contrib/keras/python/keras/testing_utils.py
@@ -78,7 +78,7 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
       if e is None:
         input_data_shape[i] = np.random.randint(1, 4)
     input_data = 10 * np.random.random(input_data_shape)
-    if input_dtype[:4] == 'float':
+    if input_dtype[:5] == 'float':
       input_data -= 0.5
     input_data = input_data.astype(input_dtype)
   elif input_shape is None:
diff --git a/tensorflow/contrib/keras/python/keras/utils/__init__.py b/tensorflow/contrib/keras/python/keras/utils/__init__.py
index 68c28ab585b10048feb1c6961c01b6439fe6eaf8..3b197653f382278afffe2a4f26d73be0fc8ab495 100644
--- a/tensorflow/contrib/keras/python/keras/utils/__init__.py
+++ b/tensorflow/contrib/keras/python/keras/utils/__init__.py
@@ -23,7 +23,10 @@ from tensorflow.contrib.keras.python.keras.utils import data_utils
 from tensorflow.contrib.keras.python.keras.utils import generic_utils
 from tensorflow.contrib.keras.python.keras.utils import io_utils
 from tensorflow.contrib.keras.python.keras.utils import np_utils
+from tensorflow.contrib.keras.python.keras.utils.data_utils import GeneratorEnqueuer
 from tensorflow.contrib.keras.python.keras.utils.data_utils import get_file
+from tensorflow.contrib.keras.python.keras.utils.data_utils import OrderedEnqueuer
+from tensorflow.contrib.keras.python.keras.utils.data_utils import Sequence
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import custom_object_scope
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import CustomObjectScope
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
diff --git a/tensorflow/contrib/keras/python/keras/utils/data_utils.py b/tensorflow/contrib/keras/python/keras/utils/data_utils.py
index 61a11b95e8db53ae9c4ed21edf00ca0f80b4f7ec..853625e7c475af7f5927bf60c9900c29708c4b48 100644
--- a/tensorflow/contrib/keras/python/keras/utils/data_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/data_utils.py
@@ -17,13 +17,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from abc import abstractmethod
 import hashlib
+import multiprocessing
+from multiprocessing.pool import ThreadPool
 import os
+import random
 import shutil
 import sys
 import tarfile
+import threading
+import time
 import zipfile
 
+import numpy as np
 import six
 from six.moves.urllib.error import HTTPError
 from six.moves.urllib.error import URLError
@@ -31,6 +38,11 @@ from six.moves.urllib.request import urlopen
 
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import Progbar
 
+try:
+  import queue  # pylint:disable=g-import-not-at-top
+except ImportError:
+  import Queue as queue  # pylint:disable=g-import-not-at-top
+
 
 if sys.version_info[0] == 2:
 
@@ -300,3 +312,345 @@ def validate_file(fpath, file_hash, algorithm='auto', chunk_size=65535):
     return True
   else:
     return False
+
+
+class Sequence(object):
+  """Base object for fitting to a sequence of data, such as a dataset.
+
+  Every `Sequence` must implements the `__getitem__` and the `__len__` methods.
+
+  Examples:
+
+  ```python
+  from skimage.io import imread
+  from skimage.transform import resize
+  import numpy as np
+
+  # Here, `x_set` is list of path to the images
+  # and `y_set` are the associated classes.
+
+  class CIFAR10Sequence(Sequence):
+      def __init__(self, x_set, y_set, batch_size):
+          self.X,self.y = x_set,y_set
+          self.batch_size = batch_size
+
+      def __len__(self):
+          return len(self.X) // self.batch_size
+
+      def __getitem__(self,idx):
+          batch_x = self.X[idx*self.batch_size:(idx+1)*self.batch_size]
+          batch_y = self.y[idx*self.batch_size:(idx+1)*self.batch_size]
+
+          return np.array([
+              resize(imread(file_name), (200,200))
+                 for file_name in batch_x]), np.array(batch_y)
+  ```
+  """
+
+  @abstractmethod
+  def __getitem__(self, index):
+    """Gets batch at position `index`.
+
+    Arguments:
+        index: position of the batch in the Sequence.
+
+    Returns:
+        A batch
+    """
+    raise NotImplementedError
+
+  @abstractmethod
+  def __len__(self):
+    """Number of batch in the Sequence.
+
+    Returns:
+        The number of batches in the Sequence.
+    """
+    raise NotImplementedError
+
+
+def get_index(ds, i):
+  """Quick fix for Python2, otherwise, it cannot be pickled.
+
+  Arguments:
+      ds: a Holder or Sequence object.
+      i: index
+
+  Returns:
+      The value at index `i`.
+  """
+  return ds[i]
+
+
+class SequenceEnqueuer(object):
+  """Base class to enqueue inputs.
+
+  The task of an Enqueuer is to use parallelism to speed up preprocessing.
+  This is done with processes or threads.
+
+  Examples:
+
+  ```python
+  enqueuer = SequenceEnqueuer(...)
+  enqueuer.start()
+  datas = enqueuer.get()
+  for data in datas:
+      # Use the inputs; training, evaluating, predicting.
+      # ... stop sometime.
+  enqueuer.close()
+  ```
+
+  The `enqueuer.get()` should be an infinite stream of datas.
+
+  """
+
+  @abstractmethod
+  def is_running(self):
+    raise NotImplementedError
+
+  @abstractmethod
+  def start(self, workers=1, max_queue_size=10):
+    """Starts the handler's workers.
+
+    Arguments:
+        workers: number of worker threads
+        max_queue_size: queue size
+            (when full, threads could block on `put()`).
+    """
+    raise NotImplementedError
+
+  @abstractmethod
+  def stop(self, timeout=None):
+    """Stop running threads and wait for them to exit, if necessary.
+
+    Should be called by the same thread which called start().
+
+    Arguments:
+        timeout: maximum time to wait on thread.join()
+    """
+    raise NotImplementedError
+
+  @abstractmethod
+  def get(self):
+    """Creates a generator to extract data from the queue.
+
+    Skip the data if it is `None`.
+
+    Returns:
+        Generator yielding tuples `(inputs, targets)`
+            or `(inputs, targets, sample_weights)`.
+    """
+    raise NotImplementedError
+
+
+class OrderedEnqueuer(SequenceEnqueuer):
+  """Builds a Enqueuer from a Sequence.
+
+  Used in `fit_generator`, `evaluate_generator`, `predict_generator`.
+
+  Arguments:
+      sequence: A `keras.utils.data_utils.Sequence` object.
+      use_multiprocessing: use multiprocessing if True, otherwise threading
+      scheduling: Sequential querying of datas if 'sequential', random
+        otherwise.
+  """
+
+  def __init__(self,
+               sequence,
+               use_multiprocessing=False,
+               scheduling='sequential'):
+    self.sequence = sequence
+    self.use_multiprocessing = use_multiprocessing
+    self.scheduling = scheduling
+    self.workers = 0
+    self.executor = None
+    self.queue = None
+    self.run_thread = None
+    self.stop_signal = None
+
+  def is_running(self):
+    return self.stop_signal is not None and not self.stop_signal.is_set()
+
+  def start(self, workers=1, max_queue_size=10):
+    """Start the handler's workers.
+
+    Arguments:
+        workers: number of worker threads
+        max_queue_size: queue size
+            (when full, workers could block on `put()`)
+    """
+    if self.use_multiprocessing:
+      self.executor = multiprocessing.Pool(workers)
+    else:
+      self.executor = ThreadPool(workers)
+    self.queue = queue.Queue(max_queue_size)
+    self.stop_signal = threading.Event()
+    self.run_thread = threading.Thread(target=self._run)
+    self.run_thread.daemon = True
+    self.run_thread.start()
+
+  def _run(self):
+    """Submits requests to the executor and queues the `Future` objects."""
+    sequence = list(range(len(self.sequence)))
+    while True:
+      if self.scheduling is not 'sequential':
+        random.shuffle(sequence)
+      for i in sequence:
+        if self.stop_signal.is_set():
+          return
+        self.queue.put(
+            self.executor.apply_async(get_index, (self.sequence, i)),
+            block=True)
+
+  def get(self):
+    """Creates a generator to extract data from the queue.
+
+    Skip the data if it is `None`.
+
+    Yields:
+        Tuples (inputs, targets)
+            or (inputs, targets, sample_weights)
+    """
+    try:
+      while self.is_running():
+        inputs = self.queue.get(block=True).get()
+        if inputs is not None:
+          yield inputs
+    except Exception as e:
+      self.stop()
+      raise StopIteration(e)
+
+  def stop(self, timeout=None):
+    """Stops running threads and wait for them to exit, if necessary.
+
+    Should be called by the same thread which called `start()`.
+
+    Arguments:
+        timeout: maximum time to wait on `thread.join()`
+    """
+    self.stop_signal.set()
+    with self.queue.mutex:
+      self.queue.queue.clear()
+      self.queue.unfinished_tasks = 0
+      self.queue.not_full.notify()
+    self.executor.close()
+    self.executor.join()
+    self.run_thread.join(timeout)
+
+
+class GeneratorEnqueuer(SequenceEnqueuer):
+  """Builds a queue out of a data generator.
+
+  Used in `fit_generator`, `evaluate_generator`, `predict_generator`.
+
+  Arguments:
+      generator: a generator function which endlessly yields data
+      use_multiprocessing: use multiprocessing if True, otherwise threading
+      wait_time: time to sleep in-between calls to `put()`
+      random_seed: Initial seed for workers,
+          will be incremented by one for each workers.
+  """
+
+  def __init__(self,
+               generator,
+               use_multiprocessing=False,
+               wait_time=0.05,
+               random_seed=None):
+    self.wait_time = wait_time
+    self._generator = generator
+    self._use_multiprocessing = use_multiprocessing
+    self._threads = []
+    self._stop_event = None
+    self.queue = None
+    self.random_seed = random_seed
+
+  def start(self, workers=1, max_queue_size=10):
+    """Kicks off threads which add data from the generator into the queue.
+
+    Arguments:
+        workers: number of worker threads
+        max_queue_size: queue size
+            (when full, threads could block on `put()`)
+    """
+
+    def data_generator_task():
+      while not self._stop_event.is_set():
+        try:
+          if self._use_multiprocessing or self.queue.qsize() < max_queue_size:
+            generator_output = next(self._generator)
+            self.queue.put(generator_output)
+          else:
+            time.sleep(self.wait_time)
+        except Exception:
+          self._stop_event.set()
+          raise
+
+    try:
+      if self._use_multiprocessing:
+        self.queue = multiprocessing.Queue(maxsize=max_queue_size)
+        self._stop_event = multiprocessing.Event()
+      else:
+        self.queue = queue.Queue()
+        self._stop_event = threading.Event()
+
+      for _ in range(workers):
+        if self._use_multiprocessing:
+          # Reset random seed else all children processes
+          # share the same seed
+          np.random.seed(self.random_seed)
+          thread = multiprocessing.Process(target=data_generator_task)
+          thread.daemon = True
+          if self.random_seed is not None:
+            self.random_seed += 1
+        else:
+          thread = threading.Thread(target=data_generator_task)
+        self._threads.append(thread)
+        thread.start()
+    except:
+      self.stop()
+      raise
+
+  def is_running(self):
+    return self._stop_event is not None and not self._stop_event.is_set()
+
+  def stop(self, timeout=None):
+    """Stops running threads and wait for them to exit, if necessary.
+
+    Should be called by the same thread which called `start()`.
+
+    Arguments:
+        timeout: maximum time to wait on `thread.join()`.
+    """
+    if self.is_running():
+      self._stop_event.set()
+
+    for thread in self._threads:
+      if thread.is_alive():
+        if self._use_multiprocessing:
+          thread.terminate()
+        else:
+          thread.join(timeout)
+
+    if self._use_multiprocessing:
+      if self.queue is not None:
+        self.queue.close()
+
+    self._threads = []
+    self._stop_event = None
+    self.queue = None
+
+  def get(self):
+    """Creates a generator to extract data from the queue.
+
+    Skip the data if it is `None`.
+
+    Yields:
+        Data arrays.
+    """
+    while self.is_running():
+      if not self.queue.empty():
+        inputs = self.queue.get()
+        if inputs is not None:
+          yield inputs
+      else:
+        time.sleep(self.wait_time)
diff --git a/tensorflow/contrib/keras/python/keras/utils/data_utils_test.py b/tensorflow/contrib/keras/python/keras/utils/data_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b73775f461c59a070e95c0269a28eaf8da5f6a7
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/utils/data_utils_test.py
@@ -0,0 +1,172 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for data_utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from itertools import cycle
+import threading
+
+import numpy as np
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class ThreadsafeIter(object):
+
+  def __init__(self, it):
+    self.it = it
+    self.lock = threading.Lock()
+
+  def __iter__(self):
+    return self
+
+  def __next__(self):
+    return self.next()
+
+  def next(self):
+    with self.lock:
+      return next(self.it)
+
+
+def threadsafe_generator(f):
+
+  def g(*a, **kw):
+    return ThreadsafeIter(f(*a, **kw))
+
+  return g
+
+
+class TestSequence(keras.utils.data_utils.Sequence):
+
+  def __init__(self, shape):
+    self.shape = shape
+
+  def __getitem__(self, item):
+    return np.ones(self.shape, dtype=np.uint8) * item
+
+  def __len__(self):
+    return 100
+
+
+class FaultSequence(keras.utils.data_utils.Sequence):
+
+  def __getitem__(self, item):
+    raise IndexError(item, 'item is not present')
+
+  def __len__(self):
+    return 100
+
+
+@threadsafe_generator
+def create_generator_from_sequence_threads(ds):
+  for i in cycle(range(len(ds))):
+    yield ds[i]
+
+
+def create_generator_from_sequence_pcs(ds):
+  for i in cycle(range(len(ds))):
+    yield ds[i]
+
+
+class TestEnqueuers(test.TestCase):
+
+  def test_generator_enqueuer_threads(self):
+    enqueuer = keras.utils.data_utils.GeneratorEnqueuer(
+        create_generator_from_sequence_threads(TestSequence([3, 200, 200, 3])),
+        use_multiprocessing=False)
+    enqueuer.start(3, 10)
+    gen_output = enqueuer.get()
+    acc = []
+    for _ in range(100):
+      acc.append(int(next(gen_output)[0, 0, 0, 0]))
+
+    self.assertEqual(len(set(acc) - set(range(100))), 0)
+    enqueuer.stop()
+
+  def test_generator_enqueuer_processes(self):
+    enqueuer = keras.utils.data_utils.GeneratorEnqueuer(
+        create_generator_from_sequence_pcs(TestSequence([3, 200, 200, 3])),
+        use_multiprocessing=True)
+    enqueuer.start(3, 10)
+    gen_output = enqueuer.get()
+    acc = []
+    for _ in range(100):
+      acc.append(int(next(gen_output)[0, 0, 0, 0]))
+    self.assertNotEqual(acc, list(range(100)))
+    enqueuer.stop()
+
+  def test_generator_enqueuer_fail_threads(self):
+    enqueuer = keras.utils.data_utils.GeneratorEnqueuer(
+        create_generator_from_sequence_threads(FaultSequence()),
+        use_multiprocessing=False)
+    enqueuer.start(3, 10)
+    gen_output = enqueuer.get()
+    with self.assertRaises(StopIteration):
+      next(gen_output)
+
+  def test_generator_enqueuer_fail_processes(self):
+    enqueuer = keras.utils.data_utils.GeneratorEnqueuer(
+        create_generator_from_sequence_pcs(FaultSequence()),
+        use_multiprocessing=True)
+    enqueuer.start(3, 10)
+    gen_output = enqueuer.get()
+    with self.assertRaises(StopIteration):
+      next(gen_output)
+
+  def test_ordered_enqueuer_threads(self):
+    enqueuer = keras.utils.data_utils.OrderedEnqueuer(
+        TestSequence([3, 200, 200, 3]), use_multiprocessing=False)
+    enqueuer.start(3, 10)
+    gen_output = enqueuer.get()
+    acc = []
+    for _ in range(100):
+      acc.append(next(gen_output)[0, 0, 0, 0])
+    self.assertEqual(acc, list(range(100)))
+    enqueuer.stop()
+
+  def test_ordered_enqueuer_processes(self):
+    enqueuer = keras.utils.data_utils.OrderedEnqueuer(
+        TestSequence([3, 200, 200, 3]), use_multiprocessing=True)
+    enqueuer.start(3, 10)
+    gen_output = enqueuer.get()
+    acc = []
+    for _ in range(100):
+      acc.append(next(gen_output)[0, 0, 0, 0])
+    self.assertEqual(acc, list(range(100)))
+    enqueuer.stop()
+
+  def test_ordered_enqueuer_fail_threads(self):
+    enqueuer = keras.utils.data_utils.OrderedEnqueuer(
+        FaultSequence(), use_multiprocessing=False)
+    enqueuer.start(3, 10)
+    gen_output = enqueuer.get()
+    with self.assertRaises(StopIteration):
+      next(gen_output)
+
+  def test_ordered_enqueuer_fail_processes(self):
+    enqueuer = keras.utils.data_utils.OrderedEnqueuer(
+        FaultSequence(), use_multiprocessing=True)
+    enqueuer.start(3, 10)
+    gen_output = enqueuer.get()
+    with self.assertRaises(StopIteration):
+      next(gen_output)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/utils/generic_utils.py b/tensorflow/contrib/keras/python/keras/utils/generic_utils.py
index 5cae694d54857ac5538ed89cf5a2474f82c5fd04..ed57144f9c88f3f21849fad701aef96fd8e6833d 100644
--- a/tensorflow/contrib/keras/python/keras/utils/generic_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/generic_utils.py
@@ -227,6 +227,24 @@ def func_load(code, defaults=None, closure=None, globs=None):
       code, globs, name=code.co_name, argdefs=defaults, closure=closure)
 
 
+def has_arg(fn, name, accept_all=False):
+  """Checks if a callable accepts a given keyword argument.
+
+  Arguments:
+      fn: Callable to inspect.
+      name: Check if `fn` can be called with `name` as a keyword argument.
+      accept_all: What to return if there is no parameter called `name`
+                  but the function accepts a `**kwargs` argument.
+
+  Returns:
+      bool, whether `fn` accepts a `name` keyword argument.
+  """
+  arg_spec = tf_inspect.getargspec(fn)
+  if accept_all and arg_spec.keywords is not None:
+    return True
+  return name in arg_spec.args
+
+
 class Progbar(object):
   """Displays a progress bar.
 
diff --git a/tensorflow/contrib/keras/python/keras/utils/generic_utils_test.py b/tensorflow/contrib/keras/python/keras/utils/generic_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a6519f4cc7c0313dbd95331b912d0a7d4c84bf2
--- /dev/null
+++ b/tensorflow/contrib/keras/python/keras/utils/generic_utils_test.py
@@ -0,0 +1,75 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras generic Python utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.keras.python import keras
+from tensorflow.python.platform import test
+
+
+class HasArgTest(test.TestCase):
+
+  def test_has_arg(self):
+
+    def f_x(x):
+      return x
+
+    def f_x_args(x, *args):
+      _ = args
+      return x
+
+    def f_x_kwargs(x, **kwargs):
+      _ = kwargs
+      return x
+
+    self.assertTrue(keras.utils.generic_utils.has_arg(
+        f_x, 'x', accept_all=False))
+    self.assertFalse(keras.utils.generic_utils.has_arg(
+        f_x, 'y', accept_all=False))
+    self.assertTrue(keras.utils.generic_utils.has_arg(
+        f_x_args, 'x', accept_all=False))
+    self.assertFalse(keras.utils.generic_utils.has_arg(
+        f_x_args, 'y', accept_all=False))
+    self.assertTrue(keras.utils.generic_utils.has_arg(
+        f_x_kwargs, 'x', accept_all=False))
+    self.assertFalse(keras.utils.generic_utils.has_arg(
+        f_x_kwargs, 'y', accept_all=False))
+    self.assertTrue(keras.utils.generic_utils.has_arg(
+        f_x_kwargs, 'y', accept_all=True))
+
+
+class TestCustomObjectScope(test.TestCase):
+
+  def test_custom_object_scope(self):
+
+    def custom_fn():
+      pass
+
+    class CustomClass(object):
+      pass
+
+    with keras.utils.generic_utils.custom_object_scope(
+        {'CustomClass': CustomClass, 'custom_fn': custom_fn}):
+      act = keras.activations.get('custom_fn')
+      self.assertEqual(act, custom_fn)
+      cl = keras.regularizers.get('CustomClass')
+      self.assertEqual(cl.__class__, CustomClass)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/keras/python/keras/utils/io_utils.py b/tensorflow/contrib/keras/python/keras/utils/io_utils.py
index 55c135b5ebff904742cdae10d9f1d7b913410eb1..70b2d96907d883a028cbf83ac94404d3846d4d19 100644
--- a/tensorflow/contrib/keras/python/keras/utils/io_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/io_utils.py
@@ -113,8 +113,40 @@ class HDF5Matrix(object):
 
   @property
   def shape(self):
+    """Gets a numpy-style shape tuple giving the dataset dimensions.
+
+    Returns:
+        A numpy-style shape tuple.
+    """
     return (self.end - self.start,) + self.data.shape[1:]
 
+  @property
+  def dtype(self):
+    """Gets the datatype of the dataset.
+
+    Returns:
+        A numpy dtype string.
+    """
+    return self.data.dtype
+
+  @property
+  def ndim(self):
+    """Gets the number of dimensions (rank) of the dataset.
+
+    Returns:
+        An integer denoting the number of dimensions (rank) of the dataset.
+    """
+    return self.data.ndim
+
+  @property
+  def size(self):
+    """Gets the total dataset size (number of elements).
+
+    Returns:
+        An integer denoting the number of elements in the dataset.
+    """
+    return np.prod(self.shape)
+
 
 def ask_to_proceed_with_overwrite(filepath):
   """Produces a prompt asking about overwriting a file.
diff --git a/tensorflow/contrib/keras/python/keras/utils/layer_utils.py b/tensorflow/contrib/keras/python/keras/utils/layer_utils.py
index 154070fb932079eb839e3812f9dd42e333475c8a..1c3481fdb8ef31e875f8f06ce2d01a73abf4bb77 100644
--- a/tensorflow/contrib/keras/python/keras/utils/layer_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/layer_utils.py
@@ -24,15 +24,24 @@ from tensorflow.contrib.keras.python.keras import backend as K
 from tensorflow.contrib.keras.python.keras.utils.conv_utils import convert_kernel
 
 
-def print_summary(model, line_length=None, positions=None):
+def print_summary(model, line_length=None, positions=None, print_fn=None):
   """Prints a summary of a model.
 
   Arguments:
       model: Keras model instance.
-      line_length: total length of printed lines
-      positions: relative or absolute positions of log elements in each line.
+      line_length: Total length of printed lines
+          (e.g. set this to adapt the display to different
+          terminal window sizes).
+      positions: Relative or absolute positions of log elements in each line.
           If not provided, defaults to `[.33, .55, .67, 1.]`.
+      print_fn: Print function to use (defaults to `print`).
+          It will be called on each line of the summary.
+          You can set it to a custom function
+          in order to capture the string summary.
   """
+  if print_fn is None:
+    print_fn = print
+
   if model.__class__.__name__ == 'Sequential':
     sequential_like = True
   else:
@@ -70,11 +79,11 @@ def print_summary(model, line_length=None, positions=None):
       line += str(fields[i])
       line = line[:positions[i]]
       line += ' ' * (positions[i] - len(line))
-    print(line)
+    print_fn(line)
 
-  print('_' * line_length)
+  print_fn('_' * line_length)
   print_row(to_display, positions)
-  print('=' * line_length)
+  print_fn('=' * line_length)
 
   def print_layer_summary(layer):
     try:
@@ -131,19 +140,19 @@ def print_summary(model, line_length=None, positions=None):
     else:
       print_layer_summary_with_connections(layers[i])
     if i == len(layers) - 1:
-      print('=' * line_length)
+      print_fn('=' * line_length)
     else:
-      print('_' * line_length)
+      print_fn('_' * line_length)
 
   trainable_count = int(
       np.sum([K.count_params(p) for p in set(model.trainable_weights)]))
   non_trainable_count = int(
       np.sum([K.count_params(p) for p in set(model.non_trainable_weights)]))
 
-  print('Total params: {:,}'.format(trainable_count + non_trainable_count))
-  print('Trainable params: {:,}'.format(trainable_count))
-  print('Non-trainable params: {:,}'.format(non_trainable_count))
-  print('_' * line_length)
+  print_fn('Total params: {:,}'.format(trainable_count + non_trainable_count))
+  print_fn('Trainable params: {:,}'.format(trainable_count))
+  print_fn('Non-trainable params: {:,}'.format(non_trainable_count))
+  print_fn('_' * line_length)
 
 
 def convert_all_kernels_in_model(model):
diff --git a/tensorflow/contrib/kernel_methods/BUILD b/tensorflow/contrib/kernel_methods/BUILD
index fccaa3abd4d70519741b8e375a077d95b51f1b2c..ae1402b0e6688a0f43278999d1d93282ea2a11a5 100644
--- a/tensorflow/contrib/kernel_methods/BUILD
+++ b/tensorflow/contrib/kernel_methods/BUILD
@@ -14,6 +14,7 @@ py_library(
     srcs = [
         "__init__.py",
         "python/kernel_estimators.py",
+        "python/losses.py",
         "python/mappers/random_fourier_features.py",
     ],
     srcs_version = "PY2AND3",
@@ -22,11 +23,15 @@ py_library(
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:util",
+        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -71,6 +76,18 @@ py_test(
     ],
 )
 
+py_test(
+    name = "losses_test",
+    srcs = ["python/losses_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":kernel_methods",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/kernel_methods/__init__.py b/tensorflow/contrib/kernel_methods/__init__.py
index 7272e5951605617ffcde46bab2743bde70123f35..0f3827d187098962231bb6435fcf938ee5d7ad5c 100644
--- a/tensorflow/contrib/kernel_methods/__init__.py
+++ b/tensorflow/contrib/kernel_methods/__init__.py
@@ -16,12 +16,14 @@
 
 @@KernelLinearClassifier
 @@RandomFourierFeatureMapper
+@@sparse_multiclass_hinge_loss
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.kernel_methods.python.kernel_estimators import KernelLinearClassifier
+from tensorflow.contrib.kernel_methods.python.losses import sparse_multiclass_hinge_loss
 from tensorflow.contrib.kernel_methods.python.mappers.random_fourier_features import RandomFourierFeatureMapper
 
 from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/contrib/kernel_methods/g3doc/tutorial.md b/tensorflow/contrib/kernel_methods/g3doc/tutorial.md
index 9877375c2c1a22c76c205174fb9a58da6a187f61..f39a8d80d22043bc3b4d2da60922aaea6599e315 100644
--- a/tensorflow/contrib/kernel_methods/g3doc/tutorial.md
+++ b/tensorflow/contrib/kernel_methods/g3doc/tutorial.md
@@ -13,7 +13,7 @@ for sparse features is in the works.
 We will use [tf.contrib.learn](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn) (TensorFlow's high-level Machine Learning API) Estimators for our ML models. The
 tf.contrib.learn API reduces the boilerplate code one needs to write for
 configuring, training and evaluating models and will let us focus on the core
-ideas. If you are not familiar with this API, [tf.contrib.learn Quickstart](https://www.tensorflow.org/get_started/tflearn) is a good place to start. We
+ideas. If you are not familiar with this API, [tf.estimator Quickstart](https://www.tensorflow.org/get_started/estimator) is a good place to start. We
 will use MNIST, a widely-used dataset containing images of handwritten digits
 (between 0 and 9). The tutorial consists of the following steps:
 
diff --git a/tensorflow/contrib/kernel_methods/python/losses.py b/tensorflow/contrib/kernel_methods/python/losses.py
new file mode 100644
index 0000000000000000000000000000000000000000..208b0e1c9dbe93fb99e17e7be5ed5b6e30f4e201
--- /dev/null
+++ b/tensorflow/contrib/kernel_methods/python/losses.py
@@ -0,0 +1,135 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of kernel-methods-related loss operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.losses import losses
+
+
+def sparse_multiclass_hinge_loss(
+    labels,
+    logits,
+    weights=1.0,
+    scope=None,
+    loss_collection=ops.GraphKeys.LOSSES,
+    reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS):
+  """Adds Ops for computing the multiclass hinge loss.
+
+  The implementation is based on the following paper:
+  On the Algorithmic Implementation of Multiclass Kernel-based Vector Machines
+  by Crammer and Singer.
+  link: http://jmlr.csail.mit.edu/papers/volume2/crammer01a/crammer01a.pdf
+
+  This is a generalization of standard (binary) hinge loss. For a given instance
+  with correct label c*, the loss is given by:
+    loss = max_{c != c*} logits_c - logits_{c*} + 1.
+  or equivalently
+    loss = max_c { logits_c - logits_{c*} + I_{c != c*} }
+  where I_{c != c*} = 1 if c != c* and 0 otherwise.
+
+  Args:
+    labels: `Tensor` of shape [batch_size] or [batch_size, 1]. Corresponds to
+      the ground truth. Each entry must be an index in `[0, num_classes)`.
+    logits: `Tensor` of shape [batch_size, num_classes] corresponding to the
+      unscaled logits. Its dtype should be either `float32` or `float64`.
+    weights: Optional (python) scalar or `Tensor`. If a non-scalar `Tensor`, its
+      rank should be either 1 ([batch_size]) or 2 ([batch_size, 1]).
+    scope: The scope for the operations performed in computing the loss.
+    loss_collection: collection to which the loss will be added.
+    reduction: Type of reduction to apply to loss.
+
+  Returns:
+    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
+    shape as `labels`; otherwise, it is a scalar.
+
+  Raises:
+    ValueError: If `logits`, `labels` or `weights` have invalid or inconsistent
+      shapes.
+    ValueError: If `labels` tensor has invalid dtype.
+  """
+
+  with ops.name_scope(scope, 'sparse_multiclass_hinge_loss', (logits,
+                                                              labels)) as scope:
+
+    # Check logits Tensor has valid rank.
+    logits_shape = logits.get_shape()
+    logits_rank = logits_shape.ndims
+    if logits_rank != 2:
+      raise ValueError(
+          'logits should have rank 2 ([batch_size, num_classes]). Given rank is'
+          ' {}'.format(logits_rank))
+    batch_size, num_classes = logits_shape[0].value, logits_shape[1].value
+    logits = math_ops.to_float(logits)
+
+    # Check labels have valid type.
+    if labels.dtype != dtypes.int32 and labels.dtype != dtypes.int64:
+      raise ValueError(
+          'Invalid dtype for labels: {}. Acceptable dtypes: int32 and int64'.
+          format(labels.dtype))
+
+    # Check labels and weights have valid ranks and are consistent.
+    labels_rank = labels.get_shape().ndims
+    if labels_rank not in [1, 2]:
+      raise ValueError(
+          'labels should have rank 1 ([batch_size]) or 2 ([batch_size, 1]). '
+          'Given rank is {}'.format(labels_rank))
+    with ops.control_dependencies([
+        check_ops.assert_less(labels, math_ops.cast(num_classes, labels.dtype))
+    ]):
+      labels = array_ops.reshape(labels, shape=[-1])
+
+    weights = ops.convert_to_tensor(weights)
+    weights_rank = weights.get_shape().ndims
+    if weights_rank not in [0, 1, 2]:
+      raise ValueError(
+          'non-scalar weights should have rank 1 ([batch_size]) or 2 '
+          '([batch_size, 1]). Given rank is {}'.format(labels_rank))
+
+    if weights_rank > 0:
+      weights = array_ops.reshape(weights, shape=[-1])
+      # Check weights and labels have the same number of elements.
+      weights.get_shape().assert_is_compatible_with(labels.get_shape())
+
+    # Compute the logits tensor corresponding to the correct class per instance.
+    example_indices = array_ops.reshape(
+        math_ops.range(batch_size), shape=[batch_size, 1])
+    indices = array_ops.concat(
+        [
+            example_indices,
+            array_ops.reshape(
+                math_ops.cast(labels, example_indices.dtype),
+                shape=[batch_size, 1])
+        ],
+        axis=1)
+    label_logits = array_ops.reshape(
+        array_ops.gather_nd(params=logits, indices=indices),
+        shape=[batch_size, 1])
+
+    one_cold_labels = array_ops.one_hot(
+        indices=labels, depth=num_classes, on_value=0.0, off_value=1.0)
+    margin = logits - label_logits + one_cold_labels
+    margin = nn_ops.relu(margin)
+    loss = math_ops.reduce_max(margin, axis=1)
+    return losses.compute_weighted_loss(
+        loss, weights, scope, loss_collection, reduction=reduction)
diff --git a/tensorflow/contrib/kernel_methods/python/losses_test.py b/tensorflow/contrib/kernel_methods/python/losses_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a1a5ffe56ba283bfae514738fa87e4055f8934e
--- /dev/null
+++ b/tensorflow/contrib/kernel_methods/python/losses_test.py
@@ -0,0 +1,206 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for third_party.tensorflow.contrib.kernel_methods.python.losses."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.kernel_methods.python import losses
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+
+
+class SparseMulticlassHingeLossTest(test.TestCase):
+
+  def testInvalidLogitsShape(self):
+    """An error is raised when logits have invalid shape."""
+    with self.test_session():
+      logits = constant_op.constant([-1.0, 2.1], shape=(2,))
+      labels = constant_op.constant([0, 1])
+      with self.assertRaises(ValueError):
+        _ = losses.sparse_multiclass_hinge_loss(labels, logits)
+
+  def testInvalidLabelsShape(self):
+    """An error is raised when labels have invalid shape."""
+    with self.test_session():
+      logits = constant_op.constant([-1.0, 2.1], shape=(2, 1))
+      labels = constant_op.constant([1, 0], shape=(1, 1, 2))
+      with self.assertRaises(ValueError):
+        _ = losses.sparse_multiclass_hinge_loss(labels, logits)
+
+  def testInvalidWeightsShape(self):
+    """An error is raised when weights have invalid shape."""
+    with self.test_session():
+      logits = constant_op.constant([-1.0, 2.1], shape=(2, 1))
+      labels = constant_op.constant([1, 0], shape=(2,))
+      weights = constant_op.constant([1.5, 0.2], shape=(2, 1, 1))
+      with self.assertRaises(ValueError):
+        _ = losses.sparse_multiclass_hinge_loss(labels, logits, weights)
+
+  def testInvalidLabelsDtype(self):
+    """An error is raised when labels have invalid shape."""
+    with self.test_session():
+      logits = constant_op.constant([-1.0, 2.1], shape=(2, 1))
+      labels = constant_op.constant([1, 0], dtype=dtypes.float32)
+      with self.assertRaises(ValueError):
+        _ = losses.sparse_multiclass_hinge_loss(labels, logits)
+
+  def testNoneWeightRaisesValueError(self):
+    """An error is raised when weights are None."""
+    with self.test_session():
+      logits = constant_op.constant([-1.0, 2.1], shape=(2, 1))
+      labels = constant_op.constant([1, 0])
+      with self.assertRaises(ValueError):
+        _ = losses.sparse_multiclass_hinge_loss(labels, logits, weights=None)
+
+  def testInconsistentLabelsAndWeightsShapesSameRank(self):
+    """Error raised when weights and labels have same ranks, different sizes."""
+    with self.test_session():
+      logits = constant_op.constant([-1.0, 2.1, 4.1], shape=(3, 1))
+      labels = constant_op.constant([1, 0, 2], shape=(3, 1))
+      weights = constant_op.constant([1.1, 2.0], shape=(2, 1))
+      with self.assertRaises(ValueError):
+        _ = losses.sparse_multiclass_hinge_loss(labels, logits, weights)
+
+  def testInconsistentLabelsAndWeightsShapesDifferentRank(self):
+    """Error raised when weights and labels have different ranks and sizes."""
+    with self.test_session():
+      logits = constant_op.constant([-1.0, 2.1], shape=(2, 1))
+      labels = constant_op.constant([1, 0], shape=(2, 1))
+      weights = constant_op.constant([1.1, 2.0, 2.8], shape=(3,))
+      with self.assertRaises(ValueError):
+        _ = losses.sparse_multiclass_hinge_loss(labels, logits, weights)
+
+  def testOutOfRangeLabels(self):
+    """An error is raised when labels are not in [0, num_classes)."""
+    with self.test_session():
+      logits = constant_op.constant([[1.2, -1.4, -1.0], [1.4, 1.8, 4.0],
+                                     [0.5, 1.8, -1.0]])
+      labels = constant_op.constant([1, 0, 4])
+      loss = losses.sparse_multiclass_hinge_loss(labels, logits)
+      with self.assertRaises(errors.InvalidArgumentError):
+        loss.eval()
+
+  def testZeroLossInt32Labels(self):
+    """Loss is 0 if true class logits sufficiently higher than other classes."""
+    with self.test_session():
+      logits = constant_op.constant([[1.2, -1.4, -1.0], [1.4, 1.8, 4.0],
+                                     [0.5, 1.8, -1.0]])
+      labels = constant_op.constant([0, 2, 1], dtype=dtypes.int32)
+      loss = losses.sparse_multiclass_hinge_loss(labels, logits)
+      self.assertAlmostEqual(loss.eval(), 0.0, 3)
+
+  def testZeroLossInt64Labels(self):
+    """Loss is 0 if true class logits sufficiently higher than other classes."""
+    with self.test_session():
+      logits = constant_op.constant([[2.1, -0.4, -1.0], [1.4, 2.8, 4.0],
+                                     [-0.5, 0.8, -1.0]])
+      labels = constant_op.constant([0, 2, 1], dtype=dtypes.int64)
+      loss = losses.sparse_multiclass_hinge_loss(labels, logits)
+      self.assertAlmostEqual(loss.eval(), 0.0, 3)
+
+  def testCorrectPredictionsSomeClassesInsideMargin(self):
+    """Loss is > 0 even if true class logits are higher than other classes."""
+    with self.test_session():
+      logits = constant_op.constant([[1.2, -1.4, 0.8], [1.4, 1.8, 4.0],
+                                     [1.5, 1.8, -1.0]])
+      labels = constant_op.constant([0, 2, 1])
+      loss = losses.sparse_multiclass_hinge_loss(labels, logits)
+      # The first and third samples incur some loss (0.6 and 0.7 respectively).
+      self.assertAlmostEqual(loss.eval(), 0.4333, 3)
+
+  def testIncorrectPredictions(self):
+    """Loss is >0 when an incorrect class has higher logits than true class."""
+    with self.test_session():
+      logits = constant_op.constant([[2.6, 0.4, 0.8], [1.4, 0.8, -1.0],
+                                     [0.5, -1.8, 2.0]])
+      labels = constant_op.constant([1, 0, 2])
+      loss = losses.sparse_multiclass_hinge_loss(labels, logits)
+      # The first examples incurs a high loss (3.2) since the logits of an
+      # incorrect class (0) are higher than the logits of the ground truth. The
+      # second example also incures a (smaller) loss (0.4).
+      self.assertAlmostEqual(loss.eval(), 1.2, 3)
+
+  def testIncorrectPredictionsColumnLabels(self):
+    """Same as above but labels is a rank-2 tensor."""
+    with self.test_session():
+      logits = constant_op.constant([[1.6, -0.4, 0.8], [1.5, 0.8, -1.0],
+                                     [0.2, -1.8, 4.0]])
+      labels = constant_op.constant([1, 0, 2], shape=(3, 1))
+      loss = losses.sparse_multiclass_hinge_loss(labels, logits)
+      # The first examples incurs a high loss (3.0) since the logits of an
+      # incorrect class (0) are higher than the logits of the ground truth. The
+      # second example also incures a (smaller) loss (0.3).
+      self.assertAlmostEqual(loss.eval(), 1.1, 3)
+
+  def testIncorrectPredictionsZeroWeights(self):
+    """Loss is 0 when all weights are missing even if predictions are wrong."""
+    with self.test_session():
+      logits = constant_op.constant([[1.6, -0.4, 0.8], [1.5, 0.8, -1.0],
+                                     [0.2, -1.8, 4.0]])
+      labels = constant_op.constant([1, 0, 2], shape=(3, 1))
+      weights = constant_op.constant([0.0, 0.0, 0.0], shape=(3, 1))
+      loss = losses.sparse_multiclass_hinge_loss(labels, logits, weights)
+      # No overall loss since all weights are 0.
+      self.assertAlmostEqual(loss.eval(), 0.0, 3)
+
+  def testNonZeroLossWithPythonScalarWeights(self):
+    """Weighted loss is correctly computed when weights is a python scalar."""
+    with self.test_session():
+      logits = constant_op.constant([[1.6, -0.4, 0.8], [1.5, 0.8, -1.0],
+                                     [0.2, -1.8, 4.0]])
+      labels = constant_op.constant([1, 0, 2], shape=(3, 1))
+      weights = 10.0
+      loss = losses.sparse_multiclass_hinge_loss(labels, logits, weights)
+      self.assertAlmostEqual(loss.eval(), 11.0, 3)
+
+  def testNonZeroLossWithScalarTensorWeights(self):
+    """Weighted loss is correctly computed when weights is a rank-0 tensor."""
+    with self.test_session():
+      logits = constant_op.constant([[1.6, -0.4, 0.8], [1.5, 0.8, -1.0],
+                                     [0.2, -1.8, 4.0]])
+      labels = constant_op.constant([1, 0, 2], shape=(3, 1))
+      weights = constant_op.constant(5.0)
+      loss = losses.sparse_multiclass_hinge_loss(labels, logits, weights)
+      self.assertAlmostEqual(loss.eval(), 5.5, 3)
+
+  def testNonZeroLossWith1DTensorWeightsColumnLabels(self):
+    """Weighted loss is correctly computed when weights is a rank-0 tensor."""
+    with self.test_session():
+      logits = constant_op.constant([[1.6, -0.4, 0.8], [1.5, 0.8, -1.0],
+                                     [0.2, -1.8, 4.0]])
+      labels = constant_op.constant([1, 0, 2], shape=(3, 1))
+      weights = constant_op.constant([1.0, 0.5, 2.0], shape=(3,))
+      loss = losses.sparse_multiclass_hinge_loss(labels, logits, weights)
+      # The overall loss is 1/3 *(3.0*1.0 + 0.5*0.3+ 2.0*0.0) = 1.05
+      self.assertAlmostEqual(loss.eval(), 1.05, 3)
+
+  def testNonZeroLossWith2DTensorWeights1DLabelsSomeWeightsMissing(self):
+    """Weighted loss is correctly computed when weights is a rank-0 tensor."""
+    with self.test_session():
+      logits = constant_op.constant([[1.6, -0.4, 0.8], [1.5, 0.8, -1.0],
+                                     [0.2, -1.8, 4.0], [1.6, 1.8, -4.0]])
+      labels = constant_op.constant([1, 0, 2, 1])
+      weights = constant_op.constant([[1.0], [0.0], [2.0], [4.0]])
+      loss = losses.sparse_multiclass_hinge_loss(labels, logits, weights)
+      # The overall loss is 1/3 *(3.0*1.0 + 0.0*0.3+ 2.0*0.0 + 4.0*0.8) = 6.2/3.
+      self.assertAlmostEqual(loss.eval(), 2.06666, 3)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/labeled_tensor/BUILD b/tensorflow/contrib/labeled_tensor/BUILD
index 55258f264aeac86edd4336b6c6261c7e9cd59c19..4eba29caecbddc408d168158daf8377aedab7bcc 100644
--- a/tensorflow/contrib/labeled_tensor/BUILD
+++ b/tensorflow/contrib/labeled_tensor/BUILD
@@ -36,6 +36,7 @@ py_library(
     srcs = ["python/ops/_typecheck.py"],
     srcs_version = "PY2AND3",
     visibility = [":__subpackages__"],
+    deps = ["//tensorflow/python:util"],
 )
 
 py_library(
@@ -48,6 +49,7 @@ py_library(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
         "//third_party/py/numpy",
+        "@six_archive//:six",
     ],
 )
 
@@ -56,8 +58,6 @@ py_library(
     srcs = ["python/ops/test_util.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":_typecheck",
-        ":core",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
     ],
@@ -92,6 +92,7 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:parsing_ops",
+        "@six_archive//:six",
     ],
 )
 
@@ -105,7 +106,6 @@ py_test(
     deps = [
         ":core",
         ":io_ops",
-        ":ops",
         ":test_util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -151,11 +151,13 @@ py_library(
         ":core",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:functional_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:numerics",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
+        "@six_archive//:six",
     ],
 )
 
@@ -175,6 +177,7 @@ py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:string_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -188,6 +191,7 @@ py_library(
         ":core",
         ":ops",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "@six_archive//:six",
     ],
 )
 
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/core.py b/tensorflow/contrib/labeled_tensor/python/ops/core.py
index 04bf26a5dd2d67b40d201fe9cfb4769b695da2f6..fc1ea834492ccd5dbf8c9eed016ab6352ca13c8a 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/core.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/core.py
@@ -199,7 +199,7 @@ class Axes(collections.Mapping):
   """Axis names and indices for a tensor.
 
   It is an ordered mapping, with keys given by axis name and values given
-  by Axis objets. Duplicate axis names are not allowed.
+  by Axis objects. Duplicate axis names are not allowed.
   """
 
   @tc.accepts(object, tc.List(AxisLike))
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index 5cd338f7918e0eb2c2f25f6c8d54dd3217f255c5..c96e42de457fa0563184f593785d00d30017b02e 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -121,7 +121,6 @@ cuda_py_test(
         ":layers_py",
         "//third_party/py/numpy",
         "//tensorflow/contrib/framework:framework_py",
-        "//tensorflow/contrib/losses:losses_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
@@ -141,6 +140,7 @@ cuda_py_test(
         "//tensorflow/python:template",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/ops/losses:losses",
     ],
 )
 
@@ -155,9 +155,7 @@ py_test(
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
         "//third_party/py/numpy",
     ],
 )
@@ -172,8 +170,6 @@ py_test(
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
@@ -189,12 +185,10 @@ py_test(
         ":layers_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_seed",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
@@ -213,11 +207,8 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:nn_ops",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:variables",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -229,11 +220,9 @@ py_test(
     deps = [
         ":layers_py",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
@@ -253,17 +242,16 @@ py_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:data_flow_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:gradients",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:lookup_ops",
         "//tensorflow/python:partitioned_variables",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/feature_column",
         "//third_party/py/numpy",
     ],
 )
@@ -278,10 +266,7 @@ py_test(
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:variables",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -294,11 +279,9 @@ py_test(
         ":layers_py",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
         "//third_party/py/numpy",
     ],
 )
@@ -311,15 +294,17 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":layers_py",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradient_checker",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:partitioned_variables",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:util",
         "//third_party/py/numpy",
     ],
 )
@@ -334,8 +319,6 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
     ],
@@ -351,8 +334,7 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:sparse_tensor",
         "//third_party/py/numpy",
     ],
 )
@@ -365,12 +347,9 @@ py_test(
     deps = [
         ":layers_py",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:init_ops",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
-        "//third_party/py/numpy",
     ],
 )
 
diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index ef05dbaa651ab4cfabda19e7a0239ae002ccf154..528de313468c56546424f058e2a30d2189aaaa23 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -17,12 +17,16 @@
 See the @{$python/contrib.layers} guide.
 
 @@avg_pool2d
+@@avg_pool3d
 @@batch_norm
 @@convolution2d
+@@convolution3d
 @@conv2d_in_plane
 @@convolution2d_in_plane
 @@conv2d_transpose
 @@convolution2d_transpose
+@@conv3d_transpose
+@@convolution3d_transpose
 @@dropout
 @@elu
 @@embedding_lookup_unique
@@ -31,6 +35,7 @@ See the @{$python/contrib.layers} guide.
 @@layer_norm
 @@linear
 @@max_pool2d
+@@max_pool3d
 @@one_hot_encoding
 @@relu
 @@relu6
@@ -101,6 +106,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = ['bias_add',
                     'conv2d',
+                    'conv3d',
                     'elu',
                     'feature_column',
                     'legacy_fully_connected',
diff --git a/tensorflow/contrib/layers/kernels/BUILD b/tensorflow/contrib/layers/kernels/BUILD
index 15b984f93893b9da3a202129b7532c37338fb4d4..e407a9ce015603094c7bbab72856403e2f0eb1a1 100644
--- a/tensorflow/contrib/layers/kernels/BUILD
+++ b/tensorflow/contrib/layers/kernels/BUILD
@@ -14,7 +14,7 @@ cc_library(
         "//tensorflow/core:framework_headers_lib",
         "//third_party/eigen3",
         "@farmhash_archive//:farmhash",
-        "@protobuf//:protobuf_headers",
+        "@protobuf_archive//:protobuf_headers",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
index 72df272af89543ecee6f165de2e7751021702933..932c5ab99249feda1e3a7f2d707ce4237fe7177f 100644
--- a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
+++ b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/fingerprint.h"
 #include "tensorflow/core/util/work_sharder.h"
 
@@ -176,7 +177,7 @@ class StringCrosser {
     static const auto k_feature_separator = "_X_";
 
     gtl::InlinedVector<InternalType, 6> cross_vec(columns_.size());
-    for (int i = 0; i < permutation.size(); i++) {
+    for (size_t i = 0; i < permutation.size(); i++) {
       cross_vec[i] = columns_[i]->Feature(batch_index, permutation[i]);
     }
     // TODO(zakaria): this will copy the string twice, might effect
@@ -266,7 +267,7 @@ class ProductIterator {
     next_permutation_.resize(columns_.size(), 0);
     // Sets has_next_ to false if any feature column has 0 features.
     has_next_ = true;
-    for (int i = 0; i < columns_.size(); i++) {
+    for (size_t i = 0; i < columns_.size(); i++) {
       if (columns_[i]->FeatureCount(batch_index_) == 0) {
         has_next_ = false;
         break;
@@ -580,7 +581,7 @@ class SparseFeatureCrossOp : public OpKernel {
           columns,
       int batch_index) {
     int64 cross_count = 1;
-    for (int i = 0; i < columns.size(); i++) {
+    for (size_t i = 0; i < columns.size(); i++) {
       const auto feature_count = columns[i]->FeatureCount(batch_index);
       // If one column is missing any feature, there won't be any cross.
       if (feature_count == 0) {
diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops.py b/tensorflow/contrib/layers/python/layers/embedding_ops.py
index f8f4122d1db4470701cd1d9599add842349943f4..b62e3050cd7003f1ba72061b133ff9b5d6b616da 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops.py
@@ -871,7 +871,7 @@ def _embedding_lookup_with_distributed_aggregation(params,
           p_segment_ids = array_ops.gather(segment_ids, pindices[p])
           # Number the p_segment_ids to meet segment_sum's requirements. Note
           # that unique_p_segment_ids contains unique segment ids of this
-          # partiton and these ids' order is unchanged.
+          # partition and these ids' order is unchanged.
           unique_p_segment_ids, unique_p_segment_idx = array_ops.unique(
               p_segment_ids)
           partitioned_segment_ids.append(unique_p_segment_ids)
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 68159fe9b9976e3f3da7dc65cfa894bb60e01fe8..0ba9873f3ab978c58cff50b98142c5a77aa82225 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -165,7 +165,7 @@ class _LinearEmbeddingLookupArguments(
                             "combiner"])):
   """Represents the information needed from a column for embedding lookup.
 
-  Used to to compute DNN inputs and weighted sum.
+  Used to compute DNN inputs and weighted sum.
   """
   pass
 
@@ -184,7 +184,7 @@ class _DeepEmbeddingLookupArguments(
                             "trainable"])):
   """Represents the information needed from a column for embedding lookup.
 
-  Used to to compute DNN inputs and weighted sum.
+  Used to compute DNN inputs and weighted sum.
   """
   pass
 
@@ -1352,8 +1352,8 @@ def shared_embedding_columns(sparse_id_columns,
                      "element.")
   for sparse_id_column in sparse_id_columns:
     if not isinstance(sparse_id_column, _SparseColumn):
-      raise TypeError("Elements of sparse_id_columns must be _SparseColumn, but"
-                      "{} is not.".format(sparse_id_column))
+      raise TypeError("Elements of sparse_id_columns must be _SparseColumn, "
+                      "but {} is not.".format(sparse_id_column))
 
   if len(sparse_id_columns) == 1:
     return [
diff --git a/tensorflow/contrib/layers/python/layers/initializers.py b/tensorflow/contrib/layers/python/layers/initializers.py
index 271b3c01ffc86aeb031ec2737c96b926e6d16697..b12a882d9ae88f7cf4f920cfa5872e5de1c67290 100644
--- a/tensorflow/contrib/layers/python/layers/initializers.py
+++ b/tensorflow/contrib/layers/python/layers/initializers.py
@@ -42,7 +42,7 @@ def xavier_initializer(uniform=True, seed=None, dtype=dtypes.float32):
   This initializer is designed to keep the scale of the gradients roughly the
   same in all layers. In uniform distribution this ends up being the range:
   `x = sqrt(6. / (in + out)); [-x, x]` and for normal distribution a standard
-  deviation of `sqrt(3. / (in + out))` is used.
+  deviation of `sqrt(2. / (in + out))` is used.
 
   Args:
     uniform: Whether to use uniform or normal distributed random initialization.
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index d3b10949630c5c1e32bf1c128c922c3650427c18..ff7545bb000fad09c2ed06692f8efa2f5d3a2773 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -31,13 +32,16 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import base
 from tensorflow.python.layers import convolutional as convolutional_layers
 from tensorflow.python.layers import core as core_layers
-from tensorflow.python.layers import  normalization as normalization_layers
+from tensorflow.python.layers import normalization as normalization_layers
 from tensorflow.python.layers import pooling as pooling_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import sparse_ops
@@ -49,15 +53,20 @@ from tensorflow.python.training import moving_averages
 # TODO(b/28426988): Replace legacy_* fns migrated from slim.
 # TODO(b/28426988): Remove legacy_* when all uses have migrated to new API.
 __all__ = ['avg_pool2d',
+           'avg_pool3d',
            'batch_norm',
            'bias_add',
            'conv2d',
+           'conv3d',
            'conv2d_in_plane',
            'conv2d_transpose',
+           'conv3d_transpose',
            'convolution',
            'convolution2d',
            'convolution2d_in_plane',
            'convolution2d_transpose',
+           'convolution3d',
+           'convolution3d_transpose',
            'dropout',
            'elu',
            'flatten',
@@ -66,6 +75,7 @@ __all__ = ['avg_pool2d',
            'linear',
            'pool',
            'max_pool2d',
+           'max_pool3d',
            'one_hot_encoding',
            'relu',
            'relu6',
@@ -82,6 +92,8 @@ __all__ = ['avg_pool2d',
 
 DATA_FORMAT_NCHW = 'NCHW'
 DATA_FORMAT_NHWC = 'NHWC'
+DATA_FORMAT_NCDHW = 'NCDHW'
+DATA_FORMAT_NDHWC = 'NDHWC'
 
 
 @add_arg_scope
@@ -132,6 +144,54 @@ def avg_pool2d(inputs,
     return utils.collect_named_outputs(outputs_collections, sc, outputs)
 
 
+@add_arg_scope
+def avg_pool3d(inputs,
+               kernel_size,
+               stride=2,
+               padding='VALID',
+               data_format=DATA_FORMAT_NDHWC,
+               outputs_collections=None,
+               scope=None):
+  """Adds a 3D average pooling op.
+
+  It is assumed that the pooling is done per image but not in batch or channels.
+
+  Args:
+    inputs: A 5-D tensor of shape `[batch_size, depth, height, width, channels]` if
+      `data_format` is `NDHWC`, and `[batch_size, channels, depth, height, width]` if
+      `data_format` is `NCDHW`.
+    kernel_size: A list of length 3: [kernel_depth, kernel_height, kernel_width] of the
+      pooling kernel over which the op is computed. Can be an int if both
+      values are the same.
+    stride: A list of length 3: [stride_depth, stride_height, stride_width].
+      Can be an int if both strides are the same. Note that presently
+      both strides must have the same value.
+    padding: The padding method, either 'VALID' or 'SAME'.
+    data_format: A string. `NDHWC` (default) and `NCDHW` are supported.
+    outputs_collections: The collections to which the outputs are added.
+    scope: Optional scope for name_scope.
+
+  Returns:
+    A `Tensor` representing the results of the pooling operation.
+
+  Raises:
+    ValueError: If `data_format` is neither `NDHWC` nor `NCDHW`.
+  """
+  if data_format not in (DATA_FORMAT_NCDHW, DATA_FORMAT_NDHWC):
+    raise ValueError('data_format has to be either NCDHW or NDHWC.')
+  with ops.name_scope(scope, 'AvgPool3D', [inputs]) as sc:
+    inputs = ops.convert_to_tensor(inputs)
+    df = ('channels_first' if data_format and data_format.startswith('NC')
+          else 'channels_last')
+    layer = pooling_layers.AveragePooling3D(pool_size=kernel_size,
+                                            strides=stride,
+                                            padding=padding,
+                                            data_format=df,
+                                            _scope=sc)
+    outputs = layer.apply(inputs)
+    return utils.collect_named_outputs(outputs_collections, sc, outputs)
+
+
 def _fused_batch_norm(
     inputs,
     decay=0.999,
@@ -257,27 +317,33 @@ def _fused_batch_norm(
                                                       'beta')
     if not param_initializers:
       param_initializers = {}
-    beta_initializer = param_initializers.get('beta',
-                                              init_ops.zeros_initializer())
-    beta = variables.model_variable(
-        'beta',
-        shape=params_shape,
-        dtype=dtype,
-        initializer=beta_initializer,
-        collections=beta_collections,
-        trainable=trainable_beta)
-    trainable_gamma = trainable and scale
-    gamma_collections = utils.get_variable_collections(variables_collections,
-                                                       'gamma')
-    gamma_initializer = param_initializers.get('gamma',
-                                               init_ops.ones_initializer())
-    gamma = variables.model_variable(
-        'gamma',
-        shape=params_shape,
-        dtype=dtype,
-        initializer=gamma_initializer,
-        collections=gamma_collections,
-        trainable=trainable_gamma)
+    if center:
+      beta_initializer = param_initializers.get('beta',
+                                                init_ops.zeros_initializer())
+      beta = variables.model_variable(
+          'beta',
+          shape=params_shape,
+          dtype=dtype,
+          initializer=beta_initializer,
+          collections=beta_collections,
+          trainable=trainable_beta)
+    else:
+      beta = array_ops.constant(0.0, shape=params_shape)
+
+    if scale:
+      gamma_collections = utils.get_variable_collections(
+          variables_collections, 'gamma')
+      gamma_initializer = param_initializers.get('gamma',
+                                                 init_ops.ones_initializer())
+      gamma = variables.model_variable(
+          'gamma',
+          shape=params_shape,
+          dtype=dtype,
+          initializer=gamma_initializer,
+          collections=gamma_collections,
+          trainable=trainable)
+    else:
+      gamma = array_ops.constant(1.0, shape=params_shape)
 
     # Create moving_mean and moving_variance variables and add them to the
     # appropriate collections.
@@ -449,7 +515,8 @@ def batch_norm(inputs,
       then the batch normalization uses weighted mean and
       variance. (This can be used to correct for bias in training
       example selection.)
-    fused:  Use nn.fused_batch_norm if True, nn.batch_normalization otherwise.
+    fused: if `True`, use a faster, fused implementation based on
+      nn.fused_batch_norm. If `None`, use the fused implementation if possible.
     data_format: A string. `NHWC` (default) and `NCHW` are supported.
     zero_debias_moving_mean: Use zero_debias for moving_mean. It creates a new
       pair of variables 'moving_mean/biased' and 'moving_mean/local_step'.
@@ -473,7 +540,6 @@ def batch_norm(inputs,
 
   Raises:
     ValueError: If `batch_weights` is not None and `fused` is True.
-    ValueError: If `param_regularizers` is not None and `fused` is True.
     ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
     ValueError: If the rank of `inputs` is undefined.
     ValueError: If rank or channels dimension of `inputs` is undefined.
@@ -487,6 +553,21 @@ def batch_norm(inputs,
                        'supported for fused batch norm.')
     if renorm:
       raise ValueError('Renorm is not supported for fused batch norm.')
+
+  # Only use _fused_batch_norm (1) if fused is set True or if it is
+  # possible to use (currently it doesn't support batch weights,
+  # renorm, and the case when rank is neither 2 nor 4),
+  # and (2) if used with zero_debias_moving_mean, or an input shape of rank 2,
+  # or non-default updates_collections (not implemented in
+  # normalization_layers.BatchNormalization yet); otherwise use the fused
+  # implementation in normalization_layers.BatchNormalization.
+  inputs = ops.convert_to_tensor(inputs)
+  rank = inputs.get_shape().ndims
+  feature_supported = batch_weights is None and not renorm and rank in [2, 4]
+  possible_to_fuse = fused is None and feature_supported
+  if (fused or possible_to_fuse) and (
+      zero_debias_moving_mean or rank == 2 or
+      updates_collections is not ops.GraphKeys.UPDATE_OPS):
     return _fused_batch_norm(
         inputs,
         decay=decay,
@@ -552,7 +633,8 @@ def batch_norm(inputs,
           renorm_momentum=renorm_decay,
           name=sc.name,
           _scope=sc,
-          _reuse=reuse)
+          _reuse=reuse,
+          fused=fused)
       outputs = layer.apply(inputs, training=is_training)
 
       # Add variables to collections.
@@ -560,9 +642,9 @@ def batch_norm(inputs,
           layer.moving_mean, variables_collections, 'moving_mean')
       _add_variable_to_collections(
           layer.moving_variance, variables_collections, 'moving_variance')
-      if layer.beta:
+      if layer.beta is not None:
         _add_variable_to_collections(layer.beta, variables_collections, 'beta')
-      if layer.gamma:
+      if layer.gamma is not None:
         _add_variable_to_collections(
             layer.gamma, variables_collections, 'gamma')
 
@@ -860,7 +942,7 @@ def convolution(inputs,
       with "NC".
     num_outputs: Integer, the number of output filters.
     kernel_size: A sequence of N positive integers specifying the spatial
-      dimensions of of the filters.  Can be a single integer to specify the same
+      dimensions of the filters.  Can be a single integer to specify the same
       value for all spatial dimensions.
     stride: A sequence of N positive integers specifying the stride at which to
       compute output.  Can be a single integer to specify the same value for all
@@ -963,6 +1045,7 @@ def convolution(inputs,
                                        sc.original_name_scope, outputs)
 
 convolution2d = convolution
+convolution3d = convolution
 
 
 @add_arg_scope
@@ -1181,6 +1264,116 @@ def convolution2d_transpose(
                                        sc.original_name_scope, outputs)
 
 
+@add_arg_scope
+def convolution3d_transpose(
+    inputs,
+    num_outputs,
+    kernel_size,
+    stride=1,
+    padding='SAME',
+    data_format=DATA_FORMAT_NDHWC,
+    activation_fn=nn.relu,
+    normalizer_fn=None,
+    normalizer_params=None,
+    weights_initializer=initializers.xavier_initializer(),
+    weights_regularizer=None,
+    biases_initializer=init_ops.zeros_initializer(),
+    biases_regularizer=None,
+    reuse=None,
+    variables_collections=None,
+    outputs_collections=None,
+    trainable=True,
+    scope=None):
+  """Adds a convolution3d_transpose with an optional batch normalization layer.
+
+  The function creates a variable called `weights`, representing the
+  kernel, that is convolved with the input. If `batch_norm_params` is `None`, a
+  second variable called 'biases' is added to the result of the operation.
+  Args:
+    inputs: A 5-D `Tensor` of type `float` and shape
+      `[batch, depth, height, width, in_channels]` for `NDHWC` data format or
+      `[batch, in_channels, depth, height, width]` for `NCDHW` data format.
+    num_outputs: Integer, the number of output filters.
+    kernel_size: A list of length 3 holding the [kernel_depth, kernel_height, kernel_width] of
+      of the filters. Can be an int if both values are the same.
+    stride: A list of length 3: [stride_depth, stride_height, stride_width].
+      Can be an int if both strides are the same.  Note that presently
+      both strides must have the same value.
+    padding: One of 'VALID' or 'SAME'.
+    data_format: A string. `NDHWC` (default) and `NCDHW` are supported.
+    activation_fn: Activation function. The default value is a ReLU function.
+      Explicitly set it to None to skip it and maintain a linear activation.
+    normalizer_fn: Normalization function to use instead of `biases`. If
+      `normalizer_fn` is provided then `biases_initializer` and
+      `biases_regularizer` are ignored and `biases` are not created nor added.
+      default set to None for no normalizer function
+    normalizer_params: Normalization function parameters.
+    weights_initializer: An initializer for the weights.
+    weights_regularizer: Optional regularizer for the weights.
+    biases_initializer: An initializer for the biases. If None skip biases.
+    biases_regularizer: Optional regularizer for the biases.
+    reuse: Whether or not the layer and its variables should be reused. To be
+      able to reuse the layer scope must be given.
+    variables_collections: Optional list of collections for all the variables or
+      a dictionary containing a different list of collection per variable.
+    outputs_collections: Collection to add the outputs.
+    trainable: Whether or not the variables should be trainable or not.
+    scope: Optional scope for variable_scope.
+  Returns:
+    A tensor representing the output of the operation.
+  Raises:
+    ValueError: If 'kernel_size' is not a list of length 3.
+    ValueError: If `data_format` is neither `NDHWC` nor `NCDHW`.
+    ValueError: If `C` dimension of `inputs` is None.
+  """
+  layer_variable_getter = _build_variable_getter(
+      {'bias': 'biases', 'kernel': 'weights'})
+
+  with variable_scope.variable_scope(
+      scope, 'Conv3d_transpose', [inputs], reuse=reuse,
+      custom_getter=layer_variable_getter) as sc:
+    if data_format not in (DATA_FORMAT_NCDHW, DATA_FORMAT_NDHWC):
+      raise ValueError('data_format has to be either NCDHW or NDHWC.')
+
+    inputs = ops.convert_to_tensor(inputs)
+
+    df = ('channels_first' if data_format and data_format.startswith('NC')
+          else 'channels_last')
+    layer = convolutional_layers.Convolution3DTranspose(
+        filters=num_outputs,
+        kernel_size=kernel_size,
+        strides=stride,
+        padding=padding,
+        data_format=df,
+        activation=None,
+        use_bias=not normalizer_fn and biases_initializer,
+        kernel_initializer=weights_initializer,
+        bias_initializer=biases_initializer,
+        kernel_regularizer=weights_regularizer,
+        bias_regularizer=biases_regularizer,
+        activity_regularizer=None,
+        trainable=trainable,
+        name=sc.name,
+        dtype=inputs.dtype.base_dtype,
+        _scope=sc,
+        _reuse=reuse)
+    outputs = layer.apply(inputs)
+
+    # Add variables to collections.
+    _add_variable_to_collections(layer.kernel, variables_collections, 'weights')
+    if layer.bias:
+      _add_variable_to_collections(layer.bias, variables_collections, 'biases')
+
+    if normalizer_fn is not None:
+      normalizer_params = normalizer_params or {}
+      outputs = normalizer_fn(outputs, **normalizer_params)
+
+    if activation_fn is not None:
+      outputs = activation_fn(outputs)
+    return utils.collect_named_outputs(outputs_collections,
+                                       sc.original_name_scope, outputs)
+
+
 @add_arg_scope
 def dropout(inputs,
             keep_prob=0.5,
@@ -1445,7 +1638,8 @@ def fully_connected(inputs,
     ValueError: If x has rank less than 2 or if its last dimension is not set.
   """
   if not isinstance(num_outputs, six.integer_types):
-    raise ValueError('num_outputs should be int or long, got %s.', num_outputs)
+    raise ValueError(
+        'num_outputs should be int or long, got %s.' % (num_outputs,))
 
   layer_variable_getter = _build_variable_getter({'bias': 'biases',
                                                   'kernel': 'weights'})
@@ -1618,6 +1812,300 @@ def layer_norm(inputs,
                                        outputs)
 
 
+class GDN(base.Layer):
+  """Generalized divisive normalization layer.
+
+  Based on the papers:
+
+    "Density Modeling of Images using a Generalized Normalization
+    Transformation"
+    Johannes Ballé, Valero Laparra, Eero P. Simoncelli
+    https://arxiv.org/abs/1511.06281
+
+    "End-to-end Optimized Image Compression"
+    Johannes Ballé, Valero Laparra, Eero P. Simoncelli
+    https://arxiv.org/abs/1611.01704
+
+  Implements an activation function that is essentially a multivariate
+  generalization of a particular sigmoid-type function:
+
+  y[i] = x[i] / sqrt(beta[i] + sum_j(gamma[j, i] * x[j]))
+
+  where i and j run over channels. This implementation never sums across spatial
+  dimensions. It is similar to local response normalization, but more powerful,
+  as beta and gamma are trainable parameters.
+
+  Arguments:
+    inverse: If False (default), compute GDN response. If True, compute IGDN
+      response (one step of fixed point iteration to invert GDN; the division
+      is replaced by multiplication).
+    beta_min: Lower bound for beta, to prevent numerical error from causing
+      square root of zero or negative values.
+    gamma_init: The gamma matrix will be initialized as the identity matrix
+      multiplied with this value. If set to zero, the layer is effectively
+      initialized to the identity operation, since beta is initialized as one.
+      A good default setting is somewhere between 0 and 0.5.
+    reparam_offset: Offset added to the reparameterization of beta and gamma.
+      The reparameterization of beta and gamma as their square roots lets the
+      training slow down when their values are close to zero, which is desirable
+      as small values in the denominator can lead to a situation where gradient
+      noise on beta/gamma leads to extreme amounts of noise in the GDN
+      activations. However, without the offset, we would get zero gradients if
+      any elements of beta or gamma were exactly zero, and thus the training
+      could get stuck. To prevent this, we add this small constant. The default
+      value was empirically determined as a good starting point. Making it
+      bigger potentially leads to more gradient noise on the activations, making
+      it too small may lead to numerical precision issues.
+    data_format: Format of input tensor. Currently supports 'channels_first' and
+      'channels_last'.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: String, the name of the layer. Layers with the same name will
+      share weights, but to avoid mistakes we require reuse=True in such cases.
+    reuse: Boolean, whether to reuse the weights of a previous layer
+      by the same name.
+
+  Properties:
+    inverse: Boolean, whether GDN is computed (True) or IGDN (False).
+    data_format: Format of input tensor. Currently supports 'channels_first' and
+      'channels_last'.
+    beta: The beta parameter as defined above (1D TensorFlow tensor).
+    gamma: The gamma parameter as defined above (2D TensorFlow tensor).
+  """
+
+  def __init__(self,
+               inverse=False,
+               beta_min=1e-6,
+               gamma_init=.1,
+               reparam_offset=2 ** -18,
+               data_format='channels_last',
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(GDN, self).__init__(trainable=trainable, name=name, **kwargs)
+    self.inverse = inverse
+    self._beta_min = beta_min
+    self._gamma_init = gamma_init
+    self._reparam_offset = reparam_offset
+    self.data_format = data_format
+    self._channel_axis()  # trigger ValueError early
+    self.input_spec = base.InputSpec(min_ndim=3, max_ndim=5)
+
+  def _channel_axis(self):
+    try:
+      return {'channels_first': 1, 'channels_last': -1}[self.data_format]
+    except KeyError:
+      raise ValueError('Unsupported `data_format` for GDN layer: {}.'.format(
+          self.data_format))
+
+  @staticmethod
+  def _lower_bound(inputs, bound, name=None):
+    """Same as tf.maximum, but with helpful gradient for inputs < bound.
+
+    The gradient is overwritten so that it is passed through if the input is not
+    hitting the bound. If it is, only gradients that push `inputs` higher than
+    the bound are passed through. No gradients are passed through to the bound.
+
+    Args:
+      inputs: input tensor
+      bound: lower bound for the input tensor
+      name: name for this op
+
+    Returns:
+      tf.maximum(inputs, bound)
+    """
+    with ops.name_scope(name, 'GDNLowerBound', [inputs, bound]) as scope:
+      inputs = ops.convert_to_tensor(inputs, name='inputs')
+      bound = ops.convert_to_tensor(bound, name='bound')
+      with ops.get_default_graph().gradient_override_map(
+          {'Maximum': 'GDNLowerBound'}):
+        return math_ops.maximum(inputs, bound, name=scope)
+
+  @ops.RegisterGradient('GDNLowerBound')
+  @staticmethod
+  def _lower_bound_grad(op, grad):
+    """Gradient for `_lower_bound`.
+
+    Args:
+      op: the tensorflow op for which to calculate a gradient
+      grad: gradient with respect to the output of the op
+
+    Returns:
+      gradients with respect to the inputs of the op
+    """
+    inputs = op.inputs[0]
+    bound = op.inputs[1]
+    pass_through_if = math_ops.logical_or(inputs >= bound, grad < 0)
+    return [math_ops.cast(pass_through_if, grad.dtype) * grad, None]
+
+  def build(self, input_shape):
+    channel_axis = self._channel_axis()
+    input_shape = tensor_shape.TensorShape(input_shape)
+    num_channels = input_shape[channel_axis].value
+    if num_channels is None:
+      raise ValueError('The channel dimension of the inputs to `GDN` '
+                       'must be defined.')
+    self._input_rank = input_shape.ndims
+    self.input_spec = base.InputSpec(ndim=input_shape.ndims,
+                                     axes={channel_axis: num_channels})
+
+    pedestal = array_ops.constant(self._reparam_offset ** 2, dtype=self.dtype)
+    beta_bound = array_ops.constant(
+        (self._beta_min + self._reparam_offset ** 2) ** .5, dtype=self.dtype)
+    gamma_bound = array_ops.constant(self._reparam_offset, dtype=self.dtype)
+
+    def beta_initializer(shape, dtype=None, partition_info=None):
+      del partition_info  # unused
+      return math_ops.sqrt(array_ops.ones(shape, dtype=dtype) + pedestal)
+
+    def gamma_initializer(shape, dtype=None, partition_info=None):
+      del partition_info  # unused
+      assert len(shape) == 2
+      assert shape[0] == shape[1]
+      eye = linalg_ops.eye(shape[0], dtype=dtype)
+      return math_ops.sqrt(self._gamma_init * eye + pedestal)
+
+    beta = self.add_variable('reparam_beta',
+                             shape=[num_channels],
+                             initializer=beta_initializer,
+                             dtype=self.dtype,
+                             trainable=True)
+    beta = self._lower_bound(beta, beta_bound)
+    self.beta = math_ops.square(beta) - pedestal
+
+    gamma = self.add_variable('reparam_gamma',
+                              shape=[num_channels, num_channels],
+                              initializer=gamma_initializer,
+                              dtype=self.dtype,
+                              trainable=True)
+    gamma = self._lower_bound(gamma, gamma_bound)
+    self.gamma = math_ops.square(gamma) - pedestal
+
+    self.built = True
+
+  def call(self, inputs):
+    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
+    ndim = self._input_rank
+
+    shape = self.gamma.get_shape().as_list()
+    gamma = array_ops.reshape(self.gamma, (ndim - 2) * [1] + shape)
+
+    # Compute normalization pool.
+    if self.data_format == 'channels_first':
+      norm_pool = nn.convolution(math_ops.square(inputs), gamma, 'VALID',
+                                 data_format='NC' + 'DHW'[-(ndim - 2):])
+      if ndim == 3:
+        norm_pool = array_ops.expand_dims(norm_pool, 2)
+        norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NCHW')
+        norm_pool = array_ops.squeeze(norm_pool, [2])
+      elif ndim == 5:
+        shape = array_ops.shape(norm_pool)
+        norm_pool = array_ops.reshape(norm_pool, shape[:3] + [-1])
+        norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NCHW')
+        norm_pool = array_ops.reshape(norm_pool, shape)
+      else:  # ndim == 4
+        norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NCHW')
+    else:  # channels_last
+      norm_pool = nn.convolution(math_ops.square(inputs), gamma, 'VALID')
+      norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NHWC')
+    norm_pool = math_ops.sqrt(norm_pool)
+
+    if self.inverse:
+      outputs = inputs * norm_pool
+    else:
+      outputs = inputs / norm_pool
+    outputs.set_shape(inputs.get_shape())
+    return outputs
+
+  def _compute_output_shape(self, input_shape):
+    channel_axis = self._channel_axis()
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if not 3 <= input_shape.ndim <= 5:
+      raise ValueError('`input_shape` must be of rank 3 to 5, inclusive.')
+    if input_shape[channel_axis].value is None:
+      raise ValueError(
+          'The channel dimension of `input_shape` must be defined.')
+    return input_shape
+
+
+def gdn(inputs,
+        inverse=False,
+        beta_min=1e-6,
+        gamma_init=.1,
+        reparam_offset=2 ** -18,
+        data_format='channels_last',
+        trainable=True,
+        name=None,
+        reuse=None):
+  """Functional interface for GDN layer.
+
+  Based on the papers:
+
+    "Density Modeling of Images using a Generalized Normalization
+    Transformation"
+    Johannes Ballé, Valero Laparra, Eero P. Simoncelli
+    https://arxiv.org/abs/1511.06281
+
+    "End-to-end Optimized Image Compression"
+    Johannes Ballé, Valero Laparra, Eero P. Simoncelli
+    https://arxiv.org/abs/1611.01704
+
+  Implements an activation function that is essentially a multivariate
+  generalization of a particular sigmoid-type function:
+
+  y[i] = x[i] / sqrt(beta[i] + sum_j(gamma[j, i] * x[j]))
+
+  where i and j run over channels. This implementation never sums across spatial
+  dimensions. It is similar to local response normalization, but more powerful,
+  as beta and gamma are trainable parameters.
+
+  Arguments:
+    inputs: Tensor input.
+    inverse: If False (default), compute GDN response. If True, compute IGDN
+      response (one step of fixed point iteration to invert GDN; the division
+      is replaced by multiplication).
+    beta_min: Lower bound for beta, to prevent numerical error from causing
+      square root of zero or negative values.
+    gamma_init: The gamma matrix will be initialized as the identity matrix
+      multiplied with this value. If set to zero, the layer is effectively
+      initialized to the identity operation, since beta is initialized as one.
+      A good default setting is somewhere between 0 and 0.5.
+    reparam_offset: Offset added to the reparameterization of beta and gamma.
+      The reparameterization of beta and gamma as their square roots lets the
+      training slow down when their values are close to zero, which is desirable
+      as small values in the denominator can lead to a situation where gradient
+      noise on beta/gamma leads to extreme amounts of noise in the GDN
+      activations. However, without the offset, we would get zero gradients if
+      any elements of beta or gamma were exactly zero, and thus the training
+      could get stuck. To prevent this, we add this small constant. The default
+      value was empirically determined as a good starting point. Making it
+      bigger potentially leads to more gradient noise on the activations, making
+      it too small may lead to numerical precision issues.
+    data_format: Format of input tensor. Currently supports 'channels_first' and
+      'channels_last'.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: String, the name of the layer. Layers with the same name will
+      share weights, but to avoid mistakes we require reuse=True in such cases.
+    reuse: Boolean, whether to reuse the weights of a previous layer
+      by the same name.
+
+  Returns:
+    Output tensor.
+  """
+  layer = GDN(inverse=inverse,
+              beta_min=beta_min,
+              gamma_init=gamma_init,
+              reparam_offset=reparam_offset,
+              data_format=data_format,
+              trainable=trainable,
+              name=name,
+              dtype=inputs.dtype.base_dtype,
+              _scope=name,
+              _reuse=reuse)
+  return layer.apply(inputs)
+
+
 @add_arg_scope
 def max_pool2d(inputs,
                kernel_size,
@@ -1667,6 +2155,55 @@ def max_pool2d(inputs,
     return utils.collect_named_outputs(outputs_collections, sc, outputs)
 
 
+@add_arg_scope
+def max_pool3d(inputs,
+               kernel_size,
+               stride=2,
+               padding='VALID',
+               data_format=DATA_FORMAT_NDHWC,
+               outputs_collections=None,
+               scope=None):
+  """Adds a 3D Max Pooling op.
+
+  It is assumed that the pooling is done per image but not in batch or channels.
+
+  Args:
+    inputs: A 5-D tensor of shape `[batch_size, depth, height, width, channels]` if
+      `data_format` is `NDHWC`, and `[batch_size, channels, depth, height, width]` if
+      `data_format` is `NCDHW`.
+    kernel_size: A list of length 3: [kernel_depth, kernel_height, kernel_width] of the
+      pooling kernel over which the op is computed. Can be an int if both
+      values are the same.
+    stride: A list of length 3: [stride_depth, stride_height, stride_width].
+      Can be an int if both strides are the same. Note that presently
+      both strides must have the same value.
+    padding: The padding method, either 'VALID' or 'SAME'.
+    data_format: A string. `NDHWC` (default) and `NCDHW` are supported.
+    outputs_collections: The collections to which the outputs are added.
+    scope: Optional scope for name_scope.
+
+  Returns:
+    A `Tensor` representing the results of the pooling operation.
+
+  Raises:
+    ValueError: If `data_format` is neither `NDHWC` nor `NCDHW`.
+    ValueError: If 'kernel_size' is not a 3-D list
+  """
+  if data_format not in (DATA_FORMAT_NCDHW, DATA_FORMAT_NDHWC):
+    raise ValueError('data_format has to be either NCDHW or NDHWC.')
+  with ops.name_scope(scope, 'MaxPool3D', [inputs]) as sc:
+    inputs = ops.convert_to_tensor(inputs)
+    df = ('channels_first' if data_format and data_format.startswith('NC')
+          else 'channels_last')
+    layer = pooling_layers.MaxPooling3D(pool_size=kernel_size,
+                                        strides=stride,
+                                        padding=padding,
+                                        data_format=df,
+                                        _scope=sc)
+    outputs = layer.apply(inputs)
+    return utils.collect_named_outputs(outputs_collections, sc, outputs)
+
+
 @add_arg_scope
 def pool(inputs,
          kernel_size,
@@ -2145,6 +2682,44 @@ def unit_norm(inputs, dim, epsilon=1e-7, scope=None):
     return math_ops.div(inputs, array_ops.tile(lengths, multiples))
 
 
+def poincare_normalize(x, axis=1, epsilon=1e-5, name=None):
+  """Project into the Poincare ball with norm <= 1.0 - epsilon.
+
+  https://en.wikipedia.org/wiki/Poincare_ball_model
+
+  Used in
+  Poincare Embeddings for Learning Hierarchical Representations
+  Maximilian Nickel, Douwe Kiela
+  https://arxiv.org/pdf/1705.08039.pdf
+
+  For a 1-D tensor with `axis = 0`, computes
+
+                (x * (1 - epsilon)) / ||x||     if ||x|| > 1 - epsilon
+      output =
+                 x                              otherwise
+
+  For `x` with more dimensions, independently normalizes each 1-D slice along
+  dimension `axis`.
+
+  Args:
+    x: A `Tensor`.
+    axis: Axis along which to normalize.  A scalar or a vector of
+      integers.
+    epsilon: A small deviation from the edge of the unit sphere for numerical
+      stability.
+    name: A name for this operation (optional).
+
+  Returns:
+    A `Tensor` with the same shape as `x`.
+  """
+  with ops.name_scope(name, 'poincare_normalize', [x]) as name:
+    x = ops.convert_to_tensor(x, name='x')
+    square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keep_dims=True)
+    x_inv_norm = math_ops.rsqrt(square_sum)
+    x_inv_norm = math_ops.minimum((1. - epsilon) * x_inv_norm, 1.)
+    return math_ops.multiply(x, x_inv_norm, name=name)
+
+
 def legacy_fully_connected(x,
                            num_output_units,
                            activation_fn=None,
@@ -2286,6 +2861,8 @@ linear = functools.partial(fully_connected, activation_fn=None)
 
 # Simple alias.
 conv2d = convolution2d
+conv3d = convolution3d
 conv2d_transpose = convolution2d_transpose
+conv3d_transpose = convolution3d_transpose
 conv2d_in_plane = convolution2d_in_plane
 separable_conv2d = separable_convolution2d
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index b49c33e99699fdb135cfdf6c500426b1f4da5204..8867e069d1f428cad918864fbf48a1add8c2eb81 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -27,7 +27,6 @@ from tensorflow.contrib.framework.python.ops import arg_scope
 from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.contrib.layers.python.layers import layers as _layers
 from tensorflow.contrib.layers.python.layers import regularizers
-from tensorflow.contrib.losses.python.losses import loss_ops
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -38,6 +37,7 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -48,6 +48,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import template
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import test
 
 
@@ -120,6 +121,76 @@ class AvgPool2DTest(test.TestCase):
     self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3])
 
 
+class AvgPool3DTest(test.TestCase):
+
+  def testInvalidDataFormat(self):
+    depth, height, width = 3, 6, 9
+    images = np.random.uniform(size=(5, depth, height, width, 3))
+    with self.assertRaisesRegexp(ValueError,
+                                 'data_format has to be either NCDHW or NDHWC.'):
+      _layers.avg_pool3d(images, [3, 3, 3], data_format='CDHWN')
+
+  def testCreateAvgPool(self):
+    depth, height, width = 3, 6, 9
+    images = np.random.uniform(size=(5, depth, height, width, 3))
+    output = _layers.avg_pool3d(images, [3, 3, 3])
+    self.assertEqual(output.op.name, 'AvgPool3D/AvgPool3D')
+    self.assertListEqual(output.get_shape().as_list(), [5, 1, 2, 4, 3])
+
+  def testCreateAvgPoolNCDHW(self):
+    depth, height, width = 3, 6, 9
+    images = np.random.uniform(size=(5, 2, depth, height, width))
+    output = _layers.avg_pool3d(images, [3, 3, 3], data_format='NCDHW')
+    self.assertEquals(output.op.name, 'AvgPool3D/transpose_1')
+    self.assertListEqual(output.get_shape().as_list(), [5, 2, 1, 2, 4])
+
+  def testCollectOutputs(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.avg_pool3d(images, [3, 3, 3], outputs_collections='outputs')
+    output_collected = ops.get_collection('outputs')[0]
+    self.assertEqual(output_collected.aliases, ['AvgPool3D'])
+    self.assertEqual(output_collected, output)
+
+  def testCreateSquareAvgPool(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.avg_pool3d(images, 3)
+    self.assertEqual(output.op.name, 'AvgPool3D/AvgPool3D')
+    self.assertListEqual(output.get_shape().as_list(), [5, 1, 2, 4, 3])
+
+  def testCreateAvgPoolWithScope(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.avg_pool3d(images, [3, 3, 3], scope='pool1')
+    self.assertEqual(output.op.name, 'pool1/AvgPool3D')
+
+  def testCreateAvgPoolWithSamePadding(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.avg_pool3d(images, [3, 3, 3], padding='SAME')
+    self.assertListEqual(output.get_shape().as_list(), [5, 2, 3, 5, 3])
+
+  def testCreateAvgPoolWithSamePaddingNCDHW(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, 3, depth, height, width), seed=1)
+    output = _layers.avg_pool3d(
+        images, [3, 3, 3], padding='SAME', data_format='NCDHW')
+    self.assertListEqual(output.get_shape().as_list(), [5, 3, 2, 3, 5])
+
+  def testCreateAvgPoolStrideWithSamePadding(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.avg_pool3d(images, [3, 3, 3], stride=1, padding='SAME')
+    self.assertListEqual(output.get_shape().as_list(), [5, depth, height, width, 3])
+
+  def testGlobalAvgPool(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.avg_pool3d(images, images.get_shape()[1:4], stride=1)
+    self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 1, 3])
+
+
 class PoolTest(test.TestCase):
 
   def testCreatePool(self):
@@ -1422,12 +1493,12 @@ class PartialFlattenTest(test.TestCase):
 
   def testSparsePartialFlatten(self):
     """Test `_inner_flatten` on `SparseTensor`s."""
-    shape = [4, 3, 11, 6, 1, 3]
+    shape = [4, 3, 11, 6]
     np.random.seed(10301)
     random_ = np.random.rand(*shape)
     indices, values, _ = _sparsify(random_)
 
-    for new_rank in [1, 2, 3, 4, 5]:
+    for new_rank in [1, 2, 3]:
       expected_shape = (shape[:new_rank - 1] + [np.prod(shape[new_rank - 1:])])
       reshaped_random_ = np.reshape(random_, expected_shape)
       expected_indices, expected_values, _ = _sparsify(reshaped_random_)
@@ -1558,23 +1629,23 @@ class FCTest(test.TestCase):
         inputs, 32, scope='fc1', weights_regularizer=regularizer)
     self.assertEqual(
         len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 1)
-    self.assertEqual(len(loss_ops.get_regularization_losses()), 1)
+    self.assertEqual(len(losses.get_regularization_losses()), 1)
     _layers.fully_connected(
         inputs, 32, scope='fc1', weights_regularizer=regularizer, reuse=True)
     self.assertEqual(
         len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 1)
-    self.assertEqual(len(loss_ops.get_regularization_losses()), 1)
+    self.assertEqual(len(losses.get_regularization_losses()), 1)
 
     with variable_scope.variable_scope('outer', reuse=False):
       _layers.fully_connected(inputs, 32, weights_regularizer=regularizer)
       self.assertEqual(
           len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 2)
-      self.assertEqual(len(loss_ops.get_regularization_losses()), 2)
+      self.assertEqual(len(losses.get_regularization_losses()), 2)
     with variable_scope.variable_scope('outer', reuse=True):
       _layers.fully_connected(inputs, 32, weights_regularizer=regularizer)
       self.assertEqual(
           len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 2)
-      self.assertEqual(len(loss_ops.get_regularization_losses()), 2)
+      self.assertEqual(len(losses.get_regularization_losses()), 2)
 
   def testCreateFCWithoutActivation(self):
     height, width = 3, 3
@@ -1702,13 +1773,6 @@ class BatchNormTest(test.TestCase):
       with self.assertRaisesRegexp(ValueError, 'Weighted mean and variance'):
         _layers.batch_norm(inputs, batch_weights=batch_weights, fused=True)
 
-  def testParamRegularizersFused(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
-      inputs = array_ops.placeholder(dtype=dtypes.float32, shape=(5, 3, 3, 7))
-      with self.assertRaisesRegexp(ValueError,
-                                   'Regularizers are not currently'):
-        _layers.batch_norm(inputs, param_regularizers={}, fused=True)
-
   def _testCreateOp(self, fused):
     height, width = 3, 3
     with self.test_session():
@@ -1779,7 +1843,8 @@ class BatchNormTest(test.TestCase):
     height, width = 3, 3
     with self.test_session():
       images = random_ops.random_uniform((5, height, width, 3), seed=1)
-      _layers.batch_norm(images, scale=True, zero_debias_moving_mean=True)
+      _layers.batch_norm(
+          images, scale=True, zero_debias_moving_mean=True, fused=False)
       self.assertEqual(len(variables.get_model_variables()), 6)
       moving_mean = variables.get_variables_by_name('moving_mean')[0]
       moving_variance = variables.get_variables_by_name('moving_variance')[0]
@@ -1873,7 +1938,8 @@ class BatchNormTest(test.TestCase):
         images,
         decay=0.1,
         updates_collections=None,
-        zero_debias_moving_mean=True)
+        zero_debias_moving_mean=True,
+        fused=False)
     moving_mean = variables.get_variables_by_name('BatchNorm/moving_mean')[0]
     moving_variance = variables.get_variables_by_name('moving_variance')[0]
     biased = variables.get_variables_by_name('biased')[0]
@@ -2522,7 +2588,7 @@ class BatchNormTest(test.TestCase):
 
   def _runBatchNormalizationWithFormat(self, shape, data_format, is_training):
     channels = shape[-1]
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       images = np.arange(np.product(shape), dtype=np.float32).reshape(shape)
       beta = init_ops.constant_initializer(
           np.arange(
@@ -2560,20 +2626,22 @@ class BatchNormTest(test.TestCase):
       return sess.run(output)
 
   def testNHWCAndNCHWInferenceProduceSameOutput(self):
-    for shape in [[7, 3, 5], [5, 2, 3, 4], [11, 3, 2, 4, 5]]:
-      nhwc = self._runBatchNormalizationWithFormat(
-          data_format='NHWC', shape=shape, is_training=False)
-      nchw = self._runBatchNormalizationWithFormat(
-          data_format='NCHW', shape=shape, is_training=False)
-      self.assertAllClose(nhwc, nchw, atol=1e-4, rtol=1e-4)
+    if test.is_gpu_available(cuda_only=True):
+      for shape in [[7, 3, 5], [5, 2, 3, 4], [11, 3, 2, 4, 5]]:
+        nhwc = self._runBatchNormalizationWithFormat(
+            data_format='NHWC', shape=shape, is_training=False)
+        nchw = self._runBatchNormalizationWithFormat(
+            data_format='NCHW', shape=shape, is_training=False)
+        self.assertAllClose(nhwc, nchw, atol=1e-4, rtol=1e-4)
 
   def testNHWCAndNCHWTrainingProduceSameOutput(self):
-    for shape in [[7, 3, 5], [5, 2, 3, 4], [11, 3, 2, 4, 5]]:
-      nhwc = self._runBatchNormalizationWithFormat(
-          data_format='NHWC', shape=shape, is_training=True)
-      nchw = self._runBatchNormalizationWithFormat(
-          data_format='NCHW', shape=shape, is_training=True)
-      self.assertAllClose(nhwc, nchw, atol=1e-4, rtol=1e-4)
+    if test.is_gpu_available(cuda_only=True):
+      for shape in [[7, 3, 5], [5, 2, 3, 4], [11, 3, 2, 4, 5]]:
+        nhwc = self._runBatchNormalizationWithFormat(
+            data_format='NHWC', shape=shape, is_training=True)
+        nchw = self._runBatchNormalizationWithFormat(
+            data_format='NCHW', shape=shape, is_training=True)
+        self.assertAllClose(nhwc, nchw, atol=1e-4, rtol=1e-4)
 
 
 class LayerNormTest(test.TestCase):
@@ -2704,6 +2772,56 @@ class LayerNormTest(test.TestCase):
     self.doOutputTest((1, 100, 100, 1))
 
 
+class GDNTest(test.TestCase):
+
+  def _runGDN(self, x, shape, inverse, data_format):
+    inputs = array_ops.placeholder(dtypes.float32, shape)
+    outputs = _layers.gdn(inputs, inverse=inverse, data_format=data_format)
+    with self.test_session() as sess:
+      variables_lib.global_variables_initializer().run()
+      y, = sess.run([outputs], {inputs: x})
+    return y
+
+  def testInvalidDataFormat(self):
+    x = np.random.uniform(size=(1, 2, 3, 4))
+    with self.assertRaises(ValueError):
+      self._runGDN(x, x.shape, False, 'NHWC')
+
+  def testUnknownDim(self):
+    x = np.random.uniform(size=(1, 2, 3, 4))
+    with self.assertRaises(ValueError):
+      self._runGDN(x, 4 * [None], False, 'channels_last')
+
+  def testChannelsLast(self):
+    for ndim in [3, 4, 5]:
+      x = np.random.uniform(size=(1, 2, 3, 4)[:ndim])
+      y = self._runGDN(x, x.shape, False, 'channels_last')
+      self.assertEqual(x.shape, y.shape)
+      self.assertAllClose(y, x / np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
+
+  def testChannelsFirst(self):
+    # `bias_add` doesn't support NCHW on CPU.
+    if test.is_gpu_available(cuda_only=True):
+      for ndim in [3, 4, 5]:
+        x = np.random.uniform(size=(4, 3, 2, 1)[:ndim])
+        y = self._runGDN(x, x.shape, False, 'channels_first')
+        self.assertEqual(x.shape, y.shape)
+        self.assertAllClose(
+            y, x / np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
+
+  def testWrongDims(self):
+    for ndim in [1, 2, 6]:
+      x = np.random.uniform(size=(1, 2, 3, 4, 3, 2)[:ndim])
+      with self.assertRaises(ValueError):
+        self._runGDN(x, x.shape, False, 'channels_last')
+
+  def testIGDN(self):
+    x = np.random.uniform(size=(1, 2, 3, 4))
+    y = self._runGDN(x, x.shape, True, 'channels_last')
+    self.assertEqual(x.shape, y.shape)
+    self.assertAllClose(y, x * np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
+
+
 class MaxPool2DTest(test.TestCase):
 
   def testInvalidDataFormat(self):
@@ -2773,6 +2891,76 @@ class MaxPool2DTest(test.TestCase):
     self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3])
 
 
+class MaxPool3DTest(test.TestCase):
+
+  def testInvalidDataFormat(self):
+    depth, height, width = 3, 6, 9
+    images = np.random.uniform(size=(5, depth, height, width, 3))
+    with self.assertRaisesRegexp(ValueError,
+                                 'data_format has to be either NCDHW or NDHWC.'):
+      _layers.max_pool3d(images, [3, 3, 3], data_format='CDHWN')
+
+  def testCreateMaxPool(self):
+    depth, height, width = 3, 6, 9
+    images = np.random.uniform(size=(5, depth, height, width, 3)).astype(np.float32)
+    output = _layers.max_pool3d(images, [3, 3, 3])
+    self.assertEqual(output.op.name, 'MaxPool3D/MaxPool3D')
+    self.assertListEqual(output.get_shape().as_list(), [5, 1, 2, 4, 3])
+
+  def testCreateMaxPoolNCDHW(self):
+    depth, height, width = 3, 6, 9
+    images = np.random.uniform(size=(5, 3, depth, height, width)).astype(np.float32)
+    output = _layers.max_pool3d(images, [3, 3, 3], data_format='NCDHW')
+    self.assertEquals(output.op.name, 'MaxPool3D/transpose_1')
+    self.assertListEqual(output.get_shape().as_list(), [5, 3, 1, 2, 4])
+
+  def testCollectOutputs(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.max_pool3d(images, [3, 3, 3], outputs_collections='outputs')
+    output_collected = ops.get_collection('outputs')[0]
+    self.assertEqual(output_collected.aliases, ['MaxPool3D'])
+    self.assertEqual(output_collected, output)
+
+  def testCreateSquareMaxPool(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.max_pool3d(images, 3)
+    self.assertEqual(output.op.name, 'MaxPool3D/MaxPool3D')
+    self.assertListEqual(output.get_shape().as_list(), [5, 1, 2, 4, 3])
+
+  def testCreateMaxPoolWithScope(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.max_pool3d(images, [3, 3, 3], scope='pool1')
+    self.assertEqual(output.op.name, 'pool1/MaxPool3D')
+
+  def testCreateMaxPoolWithSamePadding(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.max_pool3d(images, [3, 3, 3], padding='SAME')
+    self.assertListEqual(output.get_shape().as_list(), [5, 2, 3, 5, 3])
+
+  def testCreateMaxPoolWithSamePaddingNCDHW(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, 3, depth, height, width), seed=1)
+    output = _layers.max_pool3d(
+        images, [3, 3, 3], padding='SAME', data_format='NCDHW')
+    self.assertListEqual(output.get_shape().as_list(), [5, 3, 2, 3, 5])
+
+  def testCreateMaxPoolStrideWithSamePadding(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.max_pool3d(images, [3, 3, 3], stride=1, padding='SAME')
+    self.assertListEqual(output.get_shape().as_list(), [5, depth, height, width, 3])
+
+  def testGlobalMaxPool(self):
+    depth, height, width = 3, 6, 9
+    images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
+    output = _layers.max_pool3d(images, images.get_shape()[1:4], stride=1)
+    self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 1, 3])
+
+
 class OneHotEncodingTest(test.TestCase):
 
   def testOneHotEncodingCreate(self):
@@ -3231,6 +3419,69 @@ class UnitNormTests(test.TestCase):
         self.assertAllClose(expected, actual, 1e-4, 1e-4)
 
 
+class PoincareNormalizeTest(test.TestCase):
+
+  def _PoincareNormalize(self, x, dim, epsilon=1e-5):
+    if isinstance(dim, list):
+      norm = np.linalg.norm(x, axis=tuple(dim))
+      for d in dim:
+        norm = np.expand_dims(norm, d)
+      norm_x = ((1. - epsilon) * x) / norm
+    else:
+      norm = np.expand_dims(np.apply_along_axis(np.linalg.norm, dim, x), dim)
+      norm_x = ((1. - epsilon) * x) / norm
+    return np.where(norm > 1.0 - epsilon, norm_x, x)
+
+  def testPoincareNormalize(self):
+    x_shape = [20, 7, 3]
+    epsilon = 1e-5
+    tol = 1e-6
+    np.random.seed(1)
+    x_np = np.random.random_sample(x_shape).astype(np.float32)
+    for dim in range(len(x_shape)):
+      y_np = self._PoincareNormalize(x_np, dim, epsilon)
+      with self.test_session():
+        x_tf = constant_op.constant(x_np, name='x')
+        y_tf = _layers.poincare_normalize(x_tf, dim, epsilon)
+        y_tf_eval = y_tf.eval()
+        norm = np.linalg.norm(y_np, axis=dim)
+        self.assertLessEqual(norm.max(), 1. - epsilon + tol)
+        norm = np.linalg.norm(y_tf_eval, axis=dim)
+        self.assertLessEqual(norm.max(), 1. - epsilon + tol)
+        self.assertAllClose(y_np, y_tf_eval)
+
+  def testPoincareNormalizeDimArray(self):
+    x_shape = [20, 7, 3]
+    epsilon = 1e-5
+    tol = 1e-6
+    np.random.seed(1)
+    x_np = np.random.random_sample(x_shape).astype(np.float32)
+    dim = [1, 2]
+    y_np = self._PoincareNormalize(x_np, dim, epsilon)
+    with self.test_session():
+      x_tf = constant_op.constant(x_np, name='x')
+      y_tf = _layers.poincare_normalize(x_tf, dim, epsilon)
+      y_tf_eval = y_tf.eval()
+      norm = np.linalg.norm(y_np, axis=tuple(dim))
+      self.assertLess(norm.max(), 1. - epsilon + tol)
+      norm = np.linalg.norm(y_tf_eval, axis=tuple(dim))
+      self.assertLess(norm.max(), 1. - epsilon + tol)
+      self.assertAllClose(y_np, y_tf_eval, rtol=1e-6, atol=1e-6)
+
+  def testPoincareNormalizeGradient(self):
+    x_shape = [20, 7, 3]
+    np.random.seed(1)
+    x_np = np.random.random_sample(x_shape).astype(np.float64)
+    for dim in range(len(x_shape)):
+      with self.test_session():
+        x_tf = constant_op.constant(x_np, name='x')
+        y_tf = _layers.poincare_normalize(x_tf, dim)
+        err = gradient_checker.compute_gradient_error(x_tf, x_shape,
+                                                      y_tf, x_shape)
+      print('PoinCareNormalize gradient err = %g ' % err)
+      self.assertLess(err, 1e-4)
+
+
 # TODO(b/28426988): Add separate tests for non-legacy versions.
 class LegacyFullyConnectedTest(test.TestCase):
 
diff --git a/tensorflow/contrib/layers/python/layers/optimizers.py b/tensorflow/contrib/layers/python/layers/optimizers.py
index 50c11c696a9f7dfb1ba904b84cca737e8345073a..ac217f043f1d8c3053f4e1638e90011b88eab800 100644
--- a/tensorflow/contrib/layers/python/layers/optimizers.py
+++ b/tensorflow/contrib/layers/python/layers/optimizers.py
@@ -71,28 +71,29 @@ def optimize_loss(loss,
                   increment_global_step=True):
   """Given loss and parameters for optimizer, returns a training op.
 
-  Various ways of passing optimizers, include:
+  Various ways of passing optimizers include:
 
-  - string, name of the optimizer like 'SGD', 'Adam', see OPTIMIZER_CLS_NAMES
+  - by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES
       for full list. E.g. `optimize_loss(..., optimizer='Adam')`.
-  - function, takes learning rate `Tensor` as argument and must return
+  - by function taking learning rate `Tensor` as argument and returning an
       `Optimizer` instance. E.g. `optimize_loss(...,
       optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`.
     Alternatively, if `learning_rate` is `None`, the function takes no
     arguments. E.g. `optimize_loss(..., learning_rate=None,
       optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`.
-  - class, subclass of `Optimizer` that takes only one required argument -
-      learning rate, such as AdamOptimizer, AdagradOptimizer.
-      E.g. `optimize_loss(..., optimizer=tf.train.AdagradOptimizer)`.
-  - object, instance of subclass of `Optimizer`.
-      E.g., `optimizer_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`.
+  - by a subclass of `Optimizer` having a single-argument constructor
+      (the argument is the learning rate), such as AdamOptimizer or
+      AdagradOptimizer. E.g. `optimize_loss(...,
+      optimizer=tf.train.AdagradOptimizer)`.
+  - by an instance of a subclass of `Optimizer`.
+      E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`.
 
   Args:
     loss: Scalar `Tensor`.
     global_step: Scalar int `Tensor`, step counter to update on each step
                  unless `increment_global_step` is `False`. If not supplied,
                  it will be fetched from the default graph (see
-                 `tf.train.get_global_step` for details). If it's
+                 `tf.train.get_global_step` for details). If it has
                  not been created, no step will be incremented with each weight
                  update. `learning_rate_decay_fn` requires `global_step`.
     learning_rate: float or `Tensor`, magnitude of update per each training
@@ -145,11 +146,11 @@ def optimize_loss(loss,
         * `loss` is an invalid type or shape.
         * `global_step` is an invalid type or shape.
         * `learning_rate` is an invalid type or value.
-        * `optimizer` is wrong type.
-        * `clip_gradients` is not float or callable.
+        * `optimizer` has the wrong type.
+        * `clip_gradients` is neither float nor callable.
         * `learning_rate` and `learning_rate_decay_fn` are supplied, but no
           `global_step` is available.
-        * `gradients` is empty
+        * `gradients` is empty.
   """
   loss = ops.convert_to_tensor(loss)
   contrib_framework.assert_scalar(loss)
@@ -350,8 +351,8 @@ def adaptive_clipping_fn(std_factor=2.,
   https://arxiv.org/abs/1412.1602.
 
   Keeps a moving average of the mean and std of the log(norm) of the gradient.
-  if the norm exceeds `exp(mean + std_factor*std)`, all gradients are rescaled
-  such that the global norm becomes `exp(mean)`.
+  If the norm exceeds `exp(mean + std_factor*std)` then all gradients will be
+  rescaled such that the global norm becomes `exp(mean)`.
 
   Args:
     std_factor: Python scaler (or tensor).
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index 0ab39e35d0b4539c0d76094d61a09f16d221eced..01d769c80dfd28d89b5f8ddc03bf9c9dcc17c545 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -29,6 +29,7 @@ py_library(
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn/python/learn/datasets",
         "//tensorflow/contrib/linear_optimizer:sdca_ops_py",
+        "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/contrib/losses:losses_py",
         "//tensorflow/contrib/metrics:metrics_py",
         "//tensorflow/contrib/rnn:rnn_py",
@@ -50,24 +51,37 @@ py_library(
         "//tensorflow/python:init_ops",
         "//tensorflow/python:io_ops",
         "//tensorflow/python:logging_ops",
+        "//tensorflow/python:lookup_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:partitioned_variables",
         "//tensorflow/python:platform",
+        "//tensorflow/python:random_seed",
         "//tensorflow/python:resources",
         "//tensorflow/python:rnn",
         "//tensorflow/python:session",
         "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:summary",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python:weights_broadcast_ops",
+        "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/estimator:inputs",
+        "//tensorflow/python/estimator:inputs_queues",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/estimator:pandas_io",
+        "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/feature_column",
         "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/saved_model:builder",
         "//tensorflow/python/saved_model:loader",
@@ -107,14 +121,11 @@ py_test(
 py_test(
     name = "feeding_functions_test",
     size = "small",
-    srcs = [
-        "python/learn/tests/dataframe/feeding_functions_test.py",
-    ],
+    srcs = ["python/learn/tests/dataframe/feeding_functions_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//third_party/py/numpy",
     ],
 )
@@ -122,16 +133,13 @@ py_test(
 py_test(
     name = "feeding_queue_runner_test",
     size = "small",
-    srcs = [
-        "python/learn/tests/dataframe/feeding_queue_runner_test.py",
-    ],
+    srcs = ["python/learn/tests/dataframe/feeding_queue_runner_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":learn",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
     ],
@@ -146,7 +154,6 @@ py_test(
         ":learn",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
     ],
 )
 
@@ -160,7 +167,6 @@ py_test(
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:parsing_ops",
     ],
 )
@@ -174,7 +180,6 @@ py_test(
         ":learn",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
     ],
 )
 
@@ -186,7 +191,6 @@ py_test(
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
     ],
 )
 
@@ -201,7 +205,6 @@ py_test(
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:io_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
@@ -217,7 +220,6 @@ py_test(
         ":learn",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//third_party/py/numpy",
     ],
 )
@@ -230,7 +232,7 @@ py_test(
     deps = [
         ":learn",
         "//tensorflow:tensorflow_py",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
     ],
 )
@@ -245,7 +247,6 @@ py_test(
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
     ],
@@ -259,7 +260,6 @@ py_test(
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
     ],
@@ -273,7 +273,6 @@ py_test(
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
     ],
@@ -287,9 +286,9 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":learn",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:lib",
         "//tensorflow/python:parsing_ops",
         "//third_party/py/numpy",
@@ -304,7 +303,6 @@ py_test(
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//third_party/py/numpy",
     ],
 )
@@ -318,7 +316,6 @@ py_test(
         ":learn",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
     ],
@@ -333,7 +330,6 @@ py_test(
         ":learn",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
     ],
@@ -347,9 +343,8 @@ py_test(
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:sparse_tensor",
         "//third_party/py/numpy",
     ],
 )
@@ -361,11 +356,10 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":learn",
+        "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/learn/python/learn/datasets",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
@@ -380,7 +374,6 @@ py_test(
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
     ],
 )
 
@@ -394,11 +387,11 @@ py_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
+        "//tensorflow/python/estimator",
     ],
 )
 
@@ -414,9 +407,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:extra_py_tests_deps",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:resources",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
@@ -433,8 +424,8 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":learn",
+        "//tensorflow/contrib/training:training_py",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform",
     ],
 )
@@ -458,21 +449,21 @@ py_test(
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
+        "//tensorflow/python/estimator",
     ],
 )
 
 py_test(
     name = "run_config_test",
     size = "small",
-    srcs = [
-        "python/learn/estimators/run_config_test.py",
-    ],
+    srcs = ["python/learn/estimators/run_config_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":learn",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:training",
+        "//tensorflow/python/estimator:run_config",
     ],
 )
 
@@ -485,9 +476,8 @@ py_test(
         ":learn",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:sparse_tensor",
     ],
 )
 
@@ -506,14 +496,18 @@ py_test(
         "//tensorflow/contrib/metrics:metrics_py",
         "//tensorflow/contrib/testing:testing_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:protos_all_py",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
@@ -559,7 +553,6 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
         "//third_party/py/numpy",
@@ -574,18 +567,17 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":learn",
-        "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn/python/learn/datasets",
         "//tensorflow/contrib/metrics:metrics_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
+        "//tensorflow/python/feature_column",
         "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
     ],
@@ -595,15 +587,17 @@ py_test(
     name = "head_test",
     size = "medium",
     srcs = ["python/learn/estimators/head_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
+    tags = ["noasan"],  # times out b/63678675
     deps = [
         ":learn",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:variables",
         "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
@@ -624,12 +618,12 @@ py_test(
         "//tensorflow/contrib/metrics:metrics_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
+        "//tensorflow/python/feature_column",
         "//third_party/py/numpy",
     ],
 )
@@ -655,9 +649,8 @@ py_test(
         "//tensorflow/contrib/learn/python/learn/datasets",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
     ],
 )
@@ -669,14 +662,16 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":learn",
-        "//tensorflow/contrib/factorization:factorization_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:platform_benchmark",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:training",
         "//third_party/py/numpy",
     ],
 )
@@ -700,9 +695,8 @@ py_test(
         ":learn",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
     ],
@@ -720,13 +714,14 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:data_flow_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:functional_ops",
+        "//tensorflow/python:lookup_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:rnn_cell",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
     ],
@@ -740,17 +735,15 @@ py_test(
     deps = [
         ":learn",
         "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:data_flow_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:lookup_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
     ],
@@ -771,10 +764,32 @@ py_test(
         "//tensorflow/contrib/metrics:metrics_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
+        "//tensorflow/python/feature_column",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "debug_test",
+    size = "medium",
+    srcs = ["python/learn/estimators/debug_test.py"],
+    shard_count = 4,
+    srcs_version = "PY2AND3",
+    deps = [
+        ":learn",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/learn/python/learn/datasets",
+        "//tensorflow/contrib/metrics:metrics_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
     ],
@@ -791,9 +806,8 @@ py_test(
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn/python/learn/datasets",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:state_ops",
     ],
 )
@@ -807,9 +821,8 @@ py_test(
         ":learn",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:sparse_tensor",
     ],
 )
 
@@ -821,7 +834,6 @@ py_test(
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
     ],
 )
 
@@ -845,9 +857,14 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":learn",
+        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
-        "//third_party/py/numpy",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/estimator:export_output",
+        "//tensorflow/python/saved_model:signature_constants",
+        "@six_archive//:six",
     ],
 )
 
@@ -859,7 +876,6 @@ py_test(
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//third_party/py/numpy",
     ],
 )
@@ -874,8 +890,7 @@ py_test(
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn/python/learn/datasets",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:random_seed",
     ],
 )
 
@@ -887,7 +902,6 @@ py_test(
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//third_party/py/numpy",
     ],
 )
@@ -899,12 +913,10 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":learn",
-        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -918,9 +930,8 @@ py_test(
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:random_seed",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
     ],
@@ -933,11 +944,10 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":learn",
-        "//tensorflow/contrib/rnn:rnn_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:rnn_cell",
         "//third_party/py/numpy",
     ],
 )
@@ -950,7 +960,6 @@ py_test(
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//third_party/py/numpy",
     ],
 )
@@ -963,7 +972,6 @@ py_test(
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
     ],
 )
 
@@ -975,7 +983,6 @@ py_test(
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
     ],
 )
 
@@ -993,6 +1000,7 @@ py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:io_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:training",
@@ -1009,7 +1017,6 @@ py_test(
         ":learn",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
     ],
@@ -1024,7 +1031,6 @@ py_test(
         ":learn",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
     ],
@@ -1044,12 +1050,11 @@ py_test(
         ":learn",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/session_bundle:exporter",
+        "//tensorflow/contrib/session_bundle:manifest_proto_py_pb2",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:training",
@@ -1069,6 +1074,7 @@ py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -1079,12 +1085,13 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":learn",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform",
+        "//tensorflow/python:util",
         "//tensorflow/python/saved_model:signature_constants",
         "//tensorflow/python/saved_model:signature_def_utils",
     ],
@@ -1102,9 +1109,7 @@ py_test(
         "//tensorflow/contrib/learn/python/learn/datasets",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:training",
     ],
diff --git a/tensorflow/contrib/learn/__init__.py b/tensorflow/contrib/learn/__init__.py
index c06343b731d3de785ff51650cdeb478b7375cc56..3698af027e38f1063ad829c26eb179734968f813 100644
--- a/tensorflow/contrib/learn/__init__.py
+++ b/tensorflow/contrib/learn/__init__.py
@@ -73,6 +73,10 @@ See the @{$python/contrib.learn} guide.
 @@read_batch_examples
 @@read_batch_features
 @@read_batch_record_features
+@@read_keyed_batch_examples
+@@read_keyed_batch_examples_shared_queue
+@@read_keyed_batch_features
+@@read_keyed_batch_features_shared_queue
 
 @@InputFnOps
 @@ProblemType
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/boolean_mask.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/boolean_mask.py
index 130ac0c90f528af174b356615fe17264a7edb236..874b02c6c9899ac6174d16af96ea7a432bc321b7 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/boolean_mask.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/boolean_mask.py
@@ -35,7 +35,7 @@ def sparse_boolean_mask(sparse_tensor, mask, name="sparse_boolean_mask"):
 
   Args:
     sparse_tensor: a `SparseTensor`.
-    mask: a 1D boolean dense`Tensor` whose length is equal to the 0th dimension
+    mask: a 1D boolean dense `Tensor` whose length is equal to the 0th dimension
       of `sparse_tensor`.
     name: optional name for this operation.
   Returns:
diff --git a/tensorflow/contrib/learn/python/learn/datasets/BUILD b/tensorflow/contrib/learn/python/learn/datasets/BUILD
index de9b786a2c85cca62755e5dcc5501f1e7ae7b9c2..8bf372841d04dc9e1339925474801d5aa3af4ccd 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/BUILD
+++ b/tensorflow/contrib/learn/python/learn/datasets/BUILD
@@ -29,6 +29,7 @@ py_library(
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:platform",
+        "//tensorflow/python:random_seed",
         "//third_party/py/numpy",
     ],
 )
@@ -39,7 +40,6 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         ":datasets",
-        "//tensorflow/python",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python:platform",
     ],
 )
@@ -63,9 +63,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":datasets",
-        "//tensorflow/contrib/learn",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
     ],
 )
 
@@ -76,9 +74,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":datasets",
-        "//tensorflow/contrib/learn",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
     ],
 )
 
@@ -89,10 +85,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":datasets",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/learn",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
diff --git a/tensorflow/contrib/learn/python/learn/estimators/debug.py b/tensorflow/contrib/learn/python/learn/estimators/debug.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d5f6c2bf969d7c85d251bf1b06a0307a41b2297
--- /dev/null
+++ b/tensorflow/contrib/learn/python/learn/estimators/debug.py
@@ -0,0 +1,325 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Debug estimators.
+
+Debug estimators are bias-only estimators that can be used for debugging
+and as simple baselines.
+
+Example:
+
+```
+# Build DebugClassifier
+classifier = DebugClassifier()
+
+# Input builders
+def input_fn_train: # returns x, y (where y represents label's class index).
+  pass
+
+def input_fn_eval: # returns x, y (where y represents label's class index).
+  pass
+
+# Fit model.
+classifier.fit(input_fn=input_fn_train)
+
+# Evaluate cross entropy between the test and train labels.
+loss = classifier.evaluate(input_fn=input_fn_eval)["loss"]
+
+# predict_classes outputs the most commonly seen class in training.
+predicted_label = classifier.predict_classes(new_samples)
+
+# predict_proba outputs the class distribution from training.
+label_distribution = classifier.predict_proba(new_samples)
+```
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.layers.python.layers import optimizers
+from tensorflow.contrib.learn.python.learn.estimators import estimator
+from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
+from tensorflow.contrib.learn.python.learn.estimators import prediction_key
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+
+
+def _get_feature_dict(features):
+  if isinstance(features, dict):
+    return features
+  return {"": features}
+
+
+def debug_model_fn(features, labels, mode, params, config=None):
+  """Model_fn for debug models.
+
+  Args:
+    features: `Tensor` or dict of `Tensor` (depends on data passed to `fit`).
+    labels: Labels that are compatible with the `_Head` instance in `params`.
+    mode: Defines whether this is training, evaluation or prediction.
+      See `ModeKeys`.
+    params: A dict of hyperparameters containing:
+      * head: A `_Head` instance.
+    config: `RunConfig` object to configure the runtime settings.
+
+  Raises:
+    KeyError: If weight column is specified but not present.
+    ValueError: If features is an empty dictionary.
+
+  Returns:
+    A `ModelFnOps` instance.
+  """
+  del config  # Unused.
+
+  features = _get_feature_dict(features)
+  if not features:
+    raise ValueError("Features cannot be empty.")
+
+  head = params["head"]
+  size_checks = []
+  batch_size = None
+
+  # The first dimension is assumed to be a batch size and must be consistent
+  # among all of the features.
+  for feature in features.values():
+    first_dim = array_ops.shape(feature)[0]
+    if batch_size is None:
+      batch_size = first_dim
+    else:
+      size_checks.append(check_ops.assert_equal(batch_size, first_dim))
+
+  with ops.control_dependencies(size_checks):
+    logits = array_ops.zeros([batch_size, head.logits_dimension])
+
+  def train_op_fn(loss):
+    return optimizers.optimize_loss(
+        loss, global_step=None, learning_rate=0.3, optimizer="Adagrad")
+
+  return head.create_model_fn_ops(
+      features=features,
+      labels=labels,
+      mode=mode,
+      train_op_fn=train_op_fn,
+      logits=logits)
+
+
+class DebugClassifier(estimator.Estimator):
+  """A classifier for TensorFlow Debug models.
+
+  Example:
+
+  ```python
+
+  # Build DebugClassifier
+  classifier = DebugClassifier()
+
+  # Input builders
+  def input_fn_train: # returns x, y (where y represents label's class index).
+    pass
+
+  def input_fn_eval: # returns x, y (where y represents label's class index).
+    pass
+
+  # Fit model.
+  classifier.fit(input_fn=input_fn_train)
+
+  # Evaluate cross entropy between the test and train labels.
+  loss = classifier.evaluate(input_fn=input_fn_eval)["loss"]
+
+  # predict_class outputs the most commonly seen class in training.
+  predicted_label = classifier.predict_class(new_samples)
+
+  # predict_proba outputs the class distribution from training.
+  label_distribution = classifier.predict_proba(new_samples)
+  ```
+
+  Input of `fit` and `evaluate` should have following features,
+    otherwise there will be a `KeyError`:
+
+  * if `weight_column_name` is not `None`, a feature with
+     `key=weight_column_name` whose value is a `Tensor`.
+  """
+
+  def __init__(self,
+               model_dir=None,
+               n_classes=2,
+               weight_column_name=None,
+               config=None,
+               feature_engineering_fn=None,
+               label_keys=None):
+    """Initializes a DebugClassifier instance.
+
+    Args:
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator to
+        continue training a previously saved model.
+      n_classes: number of label classes. Default is binary classification.
+        It must be greater than 1. Note: Class labels are integers representing
+        the class index (i.e. values from 0 to n_classes-1). For arbitrary
+        label values (e.g. string labels), convert to class indices first.
+      weight_column_name: A string defining feature column name representing
+        weights. It is used to down weight or boost examples during training. It
+        will be multiplied by the loss of the example.
+      config: `RunConfig` object to configure the runtime settings.
+      feature_engineering_fn: Feature engineering function. Takes features and
+                        labels which are the output of `input_fn` and returns
+                        features and labels which will be fed into the model.
+      label_keys: Optional list of strings with size `[n_classes]` defining the
+        label vocabulary. Only supported for `n_classes` > 2.
+    Returns:
+      A `DebugClassifier` estimator.
+
+    Raises:
+      ValueError: If `n_classes` < 2.
+    """
+    params = {"head":
+              head_lib.multi_class_head(
+                  n_classes=n_classes,
+                  weight_column_name=weight_column_name,
+                  enable_centered_bias=True,
+                  label_keys=label_keys)}
+
+    super(DebugClassifier, self).__init__(
+        model_fn=debug_model_fn,
+        model_dir=model_dir,
+        config=config,
+        params=params,
+        feature_engineering_fn=feature_engineering_fn)
+
+  def predict_classes(self, input_fn=None, batch_size=None):
+    """Returns predicted classes for given features.
+
+    Args:
+      input_fn: Input function.
+      batch_size: Override default batch size.
+
+    Returns:
+      An iterable of predicted classes. Each predicted class is represented by
+      its class index (i.e. integer from 0 to n_classes-1).
+    """
+    key = prediction_key.PredictionKey.CLASSES
+    preds = self.predict(
+        input_fn=input_fn, batch_size=batch_size, outputs=[key])
+    return (pred[key] for pred in preds)
+
+  def predict_proba(self,
+                    input_fn=None,
+                    batch_size=None):
+    """Returns prediction probabilities for given features.
+
+    Args:
+      input_fn: Input function.
+      batch_size: Override default batch size.
+
+    Returns:
+      An iterable of predicted probabilities with shape [batch_size, n_classes].
+    """
+    key = prediction_key.PredictionKey.PROBABILITIES
+    preds = self.predict(
+        input_fn=input_fn,
+        batch_size=batch_size,
+        outputs=[key])
+    return (pred[key] for pred in preds)
+
+
+class DebugRegressor(estimator.Estimator):
+  """A regressor for TensorFlow Debug models.
+
+  Example:
+
+  ```python
+
+  # Build DebugRegressor
+  regressor = DebugRegressor()
+
+  # Input builders
+  def input_fn_train: # returns x, y (where y represents label's class index).
+    pass
+
+  def input_fn_eval: # returns x, y (where y represents label's class index).
+    pass
+
+  # Fit model.
+  regressor.fit(input_fn=input_fn_train)
+
+  # Evaluate squared-loss between the test and train targets.
+  loss = regressor.evaluate(input_fn=input_fn_eval)["loss"]
+
+  # predict_scores outputs mean value seen during training.
+  predicted_targets = regressor.predict_scores(new_samples)
+  ```
+
+  Input of `fit` and `evaluate` should have following features,
+    otherwise there will be a `KeyError`:
+
+  * if `weight_column_name` is not `None`, a feature with
+     `key=weight_column_name` whose value is a `Tensor`.
+  """
+
+  def __init__(self,
+               model_dir=None,
+               label_dimension=1,
+               weight_column_name=None,
+               config=None,
+               feature_engineering_fn=None):
+    """Initializes a DebugRegressor instance.
+
+    Args:
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator to
+        continue training a previously saved model.
+      label_dimension: Number of regression targets per example. This is the
+        size of the last dimension of the labels and logits `Tensor` objects
+        (typically, these have shape `[batch_size, label_dimension]`).
+      weight_column_name: A string defining feature column name representing
+        weights. It is used to down weight or boost examples during training. It
+        will be multiplied by the loss of the example.
+      config: `RunConfig` object to configure the runtime settings.
+      feature_engineering_fn: Feature engineering function. Takes features and
+                        labels which are the output of `input_fn` and returns
+                        features and labels which will be fed into the model.
+    Returns:
+      A `DebugRegressor` estimator.
+    """
+
+    params = {
+        "head":
+            head_lib.regression_head(
+                weight_column_name=weight_column_name,
+                label_dimension=label_dimension,
+                enable_centered_bias=True)
+    }
+
+    super(DebugRegressor, self).__init__(
+        model_fn=debug_model_fn,
+        model_dir=model_dir,
+        config=config,
+        params=params,
+        feature_engineering_fn=feature_engineering_fn)
+
+  def predict_scores(self, input_fn=None, batch_size=None):
+    """Returns predicted scores for given features.
+
+    Args:
+      input_fn: Input function.
+      batch_size: Override default batch size.
+
+    Returns:
+      An iterable of predicted scores.
+    """
+    key = prediction_key.PredictionKey.SCORES
+    preds = self.predict(
+        input_fn=input_fn, batch_size=batch_size, outputs=[key])
+    return (pred[key] for pred in preds)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/debug_test.py b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b125534a42c5cdde69773d99cefd6e7b2d60c9c
--- /dev/null
+++ b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
@@ -0,0 +1,857 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Debug estimators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import functools
+import operator
+import tempfile
+
+import numpy as np
+
+from tensorflow.contrib.layers.python.layers import feature_column
+from tensorflow.contrib.layers.python.layers import feature_column_ops
+from tensorflow.contrib.learn.python.learn import experiment
+from tensorflow.contrib.learn.python.learn.datasets import base
+from tensorflow.contrib.learn.python.learn.estimators import _sklearn
+from tensorflow.contrib.learn.python.learn.estimators import debug
+from tensorflow.contrib.learn.python.learn.estimators import estimator_test_utils
+from tensorflow.contrib.learn.python.learn.estimators import run_config
+from tensorflow.contrib.learn.python.learn.estimators import test_data
+from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
+from tensorflow.contrib.metrics.python.ops import metric_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import input as input_lib
+
+
+NUM_EXAMPLES = 100
+N_CLASSES = 5  #  Cardinality of multiclass labels.
+LABEL_DIMENSION = 3  #  Dimensionality of regression labels.
+
+
+def _train_test_split(features_and_labels):
+  features, labels = features_and_labels
+  train_set = (features[:int(len(features) / 2)], labels[:int(len(features) / 2)])
+  test_set = (features[int(len(features) / 2):], labels[int(len(features) / 2):])
+  return train_set, test_set
+
+
+def _input_fn_builder(features, labels):
+
+  def input_fn():
+    feature_dict = {'features': constant_op.constant(features)}
+    my_labels = labels
+    if my_labels is not None:
+      my_labels = constant_op.constant(my_labels)
+    return feature_dict, my_labels
+
+  return input_fn
+
+
+class DebugClassifierTest(test.TestCase):
+
+  def setUp(self):
+    np.random.seed(100)
+    self.features = np.random.rand(NUM_EXAMPLES, 5)
+    self.labels = np.random.choice(
+        range(N_CLASSES), p=[0.1, 0.3, 0.4, 0.1, 0.1], size=NUM_EXAMPLES)
+    self.binary_labels = np.random.choice(
+        range(2), p=[0.2, 0.8], size=NUM_EXAMPLES)
+    self.binary_float_labels = np.random.choice(
+        range(2), p=[0.2, 0.8], size=NUM_EXAMPLES)
+
+  def testPredict(self):
+    """Tests that DebugClassifier outputs the majority class."""
+    (train_features, train_labels), (test_features,
+                                     test_labels) = _train_test_split(
+                                         [self.features, self.labels])
+    majority_class, _ = max(collections.Counter(train_labels).items(),
+                            key=operator.itemgetter(1))
+    expected_prediction = np.vstack(
+        [[majority_class] for _ in range(test_labels.shape[0])])
+
+    classifier = debug.DebugClassifier(n_classes=N_CLASSES)
+    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
+                   steps=50)
+
+    pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features,
+                                                                 None))
+    self.assertAllEqual(expected_prediction, np.vstack(pred))
+
+  def testPredictBinary(self):
+    """Same as above for binary predictions."""
+    (train_features, train_labels), (test_features,
+                                     test_labels) = _train_test_split(
+                                         [self.features, self.binary_labels])
+
+    majority_class, _ = max(collections.Counter(train_labels).items(),
+                            key=operator.itemgetter(1))
+    expected_prediction = np.vstack(
+        [[majority_class] for _ in range(test_labels.shape[0])])
+
+    classifier = debug.DebugClassifier(n_classes=2)
+    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
+                   steps=50)
+
+    pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features,
+                                                                 None))
+    self.assertAllEqual(expected_prediction, np.vstack(pred))
+
+    (train_features, train_labels), (
+        test_features, test_labels) = _train_test_split(
+            [self.features, self.binary_float_labels])
+
+    majority_class, _ = max(collections.Counter(train_labels).items(),
+                            key=operator.itemgetter(1))
+    expected_prediction = np.vstack(
+        [[majority_class] for _ in range(test_labels.shape[0])])
+
+    classifier = debug.DebugClassifier(n_classes=2)
+    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
+                   steps=50)
+
+    pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features,
+                                                                 None))
+    self.assertAllEqual(expected_prediction, np.vstack(pred))
+
+  def testPredictProba(self):
+    """Tests that DebugClassifier outputs observed class distribution."""
+    (train_features, train_labels), (test_features,
+                                     test_labels) = _train_test_split(
+                                         [self.features, self.labels])
+
+    class_distribution = np.zeros((1, N_CLASSES))
+    for label in train_labels:
+      class_distribution[0, label] += 1
+    class_distribution /= len(train_labels)
+
+    expected_prediction = np.vstack(
+        [class_distribution for _ in range(test_labels.shape[0])])
+
+    classifier = debug.DebugClassifier(n_classes=N_CLASSES)
+    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
+                   steps=50)
+
+    pred = classifier.predict_proba(
+        input_fn=_input_fn_builder(test_features, None))
+
+    self.assertAllClose(expected_prediction, np.vstack(pred), atol=0.1)
+
+  def testPredictProbaBinary(self):
+    """Same as above but for binary classification."""
+    (train_features, train_labels), (test_features,
+                                     test_labels) = _train_test_split(
+                                         [self.features, self.binary_labels])
+
+    class_distribution = np.zeros((1, 2))
+    for label in train_labels:
+      class_distribution[0, label] += 1
+    class_distribution /= len(train_labels)
+
+    expected_prediction = np.vstack(
+        [class_distribution for _ in range(test_labels.shape[0])])
+
+    classifier = debug.DebugClassifier(n_classes=2)
+    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
+                   steps=50)
+
+    pred = classifier.predict_proba(
+        input_fn=_input_fn_builder(test_features, None))
+
+    self.assertAllClose(expected_prediction, np.vstack(pred), atol=0.1)
+
+    (train_features, train_labels), (
+        test_features, test_labels) = _train_test_split(
+            [self.features, self.binary_float_labels])
+
+    class_distribution = np.zeros((1, 2))
+    for label in train_labels:
+      class_distribution[0, int(label)] += 1
+    class_distribution /= len(train_labels)
+
+    expected_prediction = np.vstack(
+        [class_distribution for _ in range(test_labels.shape[0])])
+
+    classifier = debug.DebugClassifier(n_classes=2)
+    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
+                   steps=50)
+
+    pred = classifier.predict_proba(
+        input_fn=_input_fn_builder(test_features, None))
+
+    self.assertAllClose(expected_prediction, np.vstack(pred), atol=0.1)
+
+  def testExperimentIntegration(self):
+    exp = experiment.Experiment(
+        estimator=debug.DebugClassifier(n_classes=3),
+        train_input_fn=test_data.iris_input_multiclass_fn,
+        eval_input_fn=test_data.iris_input_multiclass_fn)
+    exp.test()
+
+  def _assertInRange(self, expected_min, expected_max, actual):
+    self.assertLessEqual(expected_min, actual)
+    self.assertGreaterEqual(expected_max, actual)
+
+  def testEstimatorContract(self):
+    estimator_test_utils.assert_estimator_contract(self, debug.DebugClassifier)
+
+  def testLogisticRegression_MatrixData(self):
+    """Tests binary classification using matrix data as input."""
+    classifier = debug.DebugClassifier(
+        config=run_config.RunConfig(tf_random_seed=1))
+    input_fn = test_data.iris_input_logistic_fn
+    classifier.fit(input_fn=input_fn, steps=5)
+    scores = classifier.evaluate(input_fn=input_fn, steps=1)
+    self._assertInRange(0.0, 1.0, scores['accuracy'])
+    self.assertIn('loss', scores)
+
+  def testLogisticRegression_MatrixData_Labels1D(self):
+    """Same as the last test, but label shape is [100] instead of [100, 1]."""
+
+    def _input_fn():
+      iris = test_data.prepare_iris_data_for_logistic_regression()
+      return {
+          'feature': constant_op.constant(
+              iris.data, dtype=dtypes.float32)
+      }, constant_op.constant(
+          iris.target, shape=[100], dtype=dtypes.int32)
+
+    classifier = debug.DebugClassifier(config=run_config.RunConfig(
+        tf_random_seed=1))
+    classifier.fit(input_fn=_input_fn, steps=5)
+    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
+    self.assertIn('loss', scores)
+
+  def testLogisticRegression_NpMatrixData(self):
+    """Tests binary classification using numpy matrix data as input."""
+    iris = test_data.prepare_iris_data_for_logistic_regression()
+    train_x = iris.data
+    train_y = iris.target
+    classifier = debug.DebugClassifier(
+        config=run_config.RunConfig(tf_random_seed=1))
+    classifier.fit(x=train_x, y=train_y, steps=5)
+    scores = classifier.evaluate(x=train_x, y=train_y, steps=1)
+    self._assertInRange(0.0, 1.0, scores['accuracy'])
+
+  def _assertBinaryPredictions(self, expected_len, predictions):
+    self.assertEqual(expected_len, len(predictions))
+    for prediction in predictions:
+      self.assertIn(prediction, (0, 1))
+
+  def _assertProbabilities(self, expected_batch_size, expected_n_classes,
+                           probabilities):
+    self.assertEqual(expected_batch_size, len(probabilities))
+    for b in range(expected_batch_size):
+      self.assertEqual(expected_n_classes, len(probabilities[b]))
+      for i in range(expected_n_classes):
+        self._assertInRange(0.0, 1.0, probabilities[b][i])
+
+  def testLogisticRegression_TensorData(self):
+    """Tests binary classification using tensor data as input."""
+
+    def _input_fn(num_epochs=None):
+      features = {
+          'age':
+              input_lib.limit_epochs(
+                  constant_op.constant([[.8], [0.2], [.1]]),
+                  num_epochs=num_epochs),
+          'language':
+              sparse_tensor.SparseTensor(
+                  values=input_lib.limit_epochs(
+                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
+                  indices=[[0, 0], [0, 1], [2, 0]],
+                  dense_shape=[3, 2])
+      }
+      return features, constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)
+
+    classifier = debug.DebugClassifier(n_classes=2)
+
+    classifier.fit(input_fn=_input_fn, steps=50)
+
+    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
+    self._assertInRange(0.0, 1.0, scores['accuracy'])
+    self.assertIn('loss', scores)
+    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
+    predictions = list(classifier.predict_classes(input_fn=predict_input_fn))
+    self._assertBinaryPredictions(3, predictions)
+
+  def testLogisticRegression_FloatLabel(self):
+    """Tests binary classification with float labels."""
+
+    def _input_fn_float_label(num_epochs=None):
+      features = {
+          'age':
+              input_lib.limit_epochs(
+                  constant_op.constant([[50], [20], [10]]),
+                  num_epochs=num_epochs),
+          'language':
+              sparse_tensor.SparseTensor(
+                  values=input_lib.limit_epochs(
+                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
+                  indices=[[0, 0], [0, 1], [2, 0]],
+                  dense_shape=[3, 2])
+      }
+      labels = constant_op.constant([[0.8], [0.], [0.2]], dtype=dtypes.float32)
+      return features, labels
+
+    classifier = debug.DebugClassifier(n_classes=2)
+
+    classifier.fit(input_fn=_input_fn_float_label, steps=50)
+
+    predict_input_fn = functools.partial(_input_fn_float_label, num_epochs=1)
+    predictions = list(classifier.predict_classes(input_fn=predict_input_fn))
+    self._assertBinaryPredictions(3, predictions)
+    predictions_proba = list(
+        classifier.predict_proba(input_fn=predict_input_fn))
+    self._assertProbabilities(3, 2, predictions_proba)
+
+  def testMultiClass_MatrixData(self):
+    """Tests multi-class classification using matrix data as input."""
+    classifier = debug.DebugClassifier(n_classes=3)
+
+    input_fn = test_data.iris_input_multiclass_fn
+    classifier.fit(input_fn=input_fn, steps=200)
+    scores = classifier.evaluate(input_fn=input_fn, steps=1)
+    self._assertInRange(0.0, 1.0, scores['accuracy'])
+    self.assertIn('loss', scores)
+
+  def testMultiClass_MatrixData_Labels1D(self):
+    """Same as the last test, but label shape is [150] instead of [150, 1]."""
+
+    def _input_fn():
+      iris = base.load_iris()
+      return {
+          'feature': constant_op.constant(
+              iris.data, dtype=dtypes.float32)
+      }, constant_op.constant(
+          iris.target, shape=[150], dtype=dtypes.int32)
+
+    classifier = debug.DebugClassifier(n_classes=3)
+
+    classifier.fit(input_fn=_input_fn, steps=200)
+    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
+    self._assertInRange(0.0, 1.0, scores['accuracy'])
+
+  def testMultiClass_NpMatrixData(self):
+    """Tests multi-class classification using numpy matrix data as input."""
+    iris = base.load_iris()
+    train_x = iris.data
+    train_y = iris.target
+    classifier = debug.DebugClassifier(n_classes=3)
+    classifier.fit(x=train_x, y=train_y, steps=200)
+    scores = classifier.evaluate(x=train_x, y=train_y, steps=1)
+    self._assertInRange(0.0, 1.0, scores['accuracy'])
+
+  def testMultiClass_StringLabel(self):
+    """Tests multi-class classification with string labels."""
+
+    def _input_fn_train():
+      labels = constant_op.constant([['foo'], ['bar'], ['baz'], ['bar']])
+      features = {
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
+      }
+      return features, labels
+
+    classifier = debug.DebugClassifier(
+        n_classes=3, label_keys=['foo', 'bar', 'baz'])
+
+    classifier.fit(input_fn=_input_fn_train, steps=5)
+    scores = classifier.evaluate(input_fn=_input_fn_train, steps=1)
+    self.assertIn('loss', scores)
+
+  def testLoss(self):
+    """Tests loss calculation."""
+
+    def _input_fn_train():
+      # Create 4 rows, one of them (y = x), three of them (y=Not(x))
+      # The logistic prediction should be (y = 0.25).
+      labels = constant_op.constant([[1], [0], [0], [0]])
+      features = {'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),}
+      return features, labels
+
+    classifier = debug.DebugClassifier(n_classes=2)
+
+    classifier.fit(input_fn=_input_fn_train, steps=5)
+    scores = classifier.evaluate(input_fn=_input_fn_train, steps=1)
+    self.assertIn('loss', scores)
+
+  def testLossWithWeights(self):
+    """Tests loss calculation with weights."""
+
+    def _input_fn_train():
+      # 4 rows with equal weight, one of them (y = x), three of them (y=Not(x))
+      # The logistic prediction should be (y = 0.25).
+      labels = constant_op.constant([[1.], [0.], [0.], [0.]])
+      features = {
+          'x': array_ops.ones(
+              shape=[4, 1], dtype=dtypes.float32),
+          'w': constant_op.constant([[1.], [1.], [1.], [1.]])
+      }
+      return features, labels
+
+    def _input_fn_eval():
+      # 4 rows, with different weights.
+      labels = constant_op.constant([[1.], [0.], [0.], [0.]])
+      features = {
+          'x': array_ops.ones(
+              shape=[4, 1], dtype=dtypes.float32),
+          'w': constant_op.constant([[7.], [1.], [1.], [1.]])
+      }
+      return features, labels
+
+    classifier = debug.DebugClassifier(
+        weight_column_name='w',
+        n_classes=2,
+        config=run_config.RunConfig(tf_random_seed=1))
+
+    classifier.fit(input_fn=_input_fn_train, steps=5)
+    scores = classifier.evaluate(input_fn=_input_fn_eval, steps=1)
+    self.assertIn('loss', scores)
+
+  def testTrainWithWeights(self):
+    """Tests training with given weight column."""
+
+    def _input_fn_train():
+      # Create 4 rows, one of them (y = x), three of them (y=Not(x))
+      # First row has more weight than others. Model should fit (y=x) better
+      # than (y=Not(x)) due to the relative higher weight of the first row.
+      labels = constant_op.constant([[1], [0], [0], [0]])
+      features = {
+          'x': array_ops.ones(
+              shape=[4, 1], dtype=dtypes.float32),
+          'w': constant_op.constant([[100.], [3.], [2.], [2.]])
+      }
+      return features, labels
+
+    def _input_fn_eval():
+      # Create 4 rows (y = x)
+      labels = constant_op.constant([[1], [1], [1], [1]])
+      features = {
+          'x': array_ops.ones(
+              shape=[4, 1], dtype=dtypes.float32),
+          'w': constant_op.constant([[1.], [1.], [1.], [1.]])
+      }
+      return features, labels
+
+    classifier = debug.DebugClassifier(weight_column_name='w')
+
+    classifier.fit(input_fn=_input_fn_train, steps=5)
+    scores = classifier.evaluate(input_fn=_input_fn_eval, steps=1)
+    self._assertInRange(0.0, 1.0, scores['accuracy'])
+
+  def testCustomMetrics(self):
+    """Tests custom evaluation metrics."""
+
+    def _input_fn(num_epochs=None):
+      # Create 4 rows, one of them (y = x), three of them (y=Not(x))
+      labels = constant_op.constant([[1], [0], [0], [0]])
+      features = {
+          'x':
+              input_lib.limit_epochs(
+                  array_ops.ones(
+                      shape=[4, 1], dtype=dtypes.float32),
+                  num_epochs=num_epochs),
+      }
+      return features, labels
+
+    def _my_metric_op(predictions, labels):
+      # For the case of binary classification, the 2nd column of "predictions"
+      # denotes the model predictions.
+      labels = math_ops.to_float(labels)
+      predictions = array_ops.strided_slice(
+          predictions, [0, 1], [-1, 2], end_mask=1)
+      labels = math_ops.cast(labels, predictions.dtype)
+      return math_ops.reduce_sum(math_ops.multiply(predictions, labels))
+
+    classifier = debug.DebugClassifier(
+        config=run_config.RunConfig(tf_random_seed=1))
+
+    classifier.fit(input_fn=_input_fn, steps=5)
+    scores = classifier.evaluate(
+        input_fn=_input_fn,
+        steps=5,
+        metrics={
+            'my_accuracy':
+                MetricSpec(
+                    metric_fn=metric_ops.streaming_accuracy,
+                    prediction_key='classes'),
+            'my_precision':
+                MetricSpec(
+                    metric_fn=metric_ops.streaming_precision,
+                    prediction_key='classes'),
+            'my_metric':
+                MetricSpec(
+                    metric_fn=_my_metric_op, prediction_key='probabilities')
+        })
+    self.assertTrue(
+        set(['loss', 'my_accuracy', 'my_precision', 'my_metric']).issubset(
+            set(scores.keys())))
+    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
+    predictions = np.array(
+        list(classifier.predict_classes(input_fn=predict_input_fn)))
+    self.assertEqual(
+        _sklearn.accuracy_score([1, 0, 0, 0], predictions),
+        scores['my_accuracy'])
+
+    # Test the case where the 2nd element of the key is neither "classes" nor
+    # "probabilities".
+    with self.assertRaisesRegexp(KeyError, 'bad_type'):
+      classifier.evaluate(
+          input_fn=_input_fn,
+          steps=5,
+          metrics={
+              'bad_name':
+                  MetricSpec(
+                      metric_fn=metric_ops.streaming_auc,
+                      prediction_key='bad_type')
+          })
+
+  def testTrainSaveLoad(self):
+    """Tests that insures you can save and reload a trained model."""
+
+    def _input_fn(num_epochs=None):
+      features = {
+          'age':
+              input_lib.limit_epochs(
+                  constant_op.constant([[.8], [.2], [.1]]),
+                  num_epochs=num_epochs),
+          'language':
+              sparse_tensor.SparseTensor(
+                  values=input_lib.limit_epochs(
+                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
+                  indices=[[0, 0], [0, 1], [2, 0]],
+                  dense_shape=[3, 2])
+      }
+      return features, constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)
+
+    model_dir = tempfile.mkdtemp()
+    classifier = debug.DebugClassifier(
+        model_dir=model_dir,
+        n_classes=3,
+        config=run_config.RunConfig(tf_random_seed=1))
+
+    classifier.fit(input_fn=_input_fn, steps=5)
+    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
+    predictions1 = classifier.predict_classes(input_fn=predict_input_fn)
+    del classifier
+
+    classifier2 = debug.DebugClassifier(
+        model_dir=model_dir,
+        n_classes=3,
+        config=run_config.RunConfig(tf_random_seed=1))
+    predictions2 = classifier2.predict_classes(input_fn=predict_input_fn)
+    self.assertEqual(list(predictions1), list(predictions2))
+
+  def testExport(self):
+    """Tests export model for servo."""
+
+    def input_fn():
+      return {
+          'age':
+              constant_op.constant([1]),
+          'language':
+              sparse_tensor.SparseTensor(
+                  values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
+      }, constant_op.constant([[1]])
+
+    language = feature_column.sparse_column_with_hash_bucket('language', 100)
+    feature_columns = [
+        feature_column.real_valued_column('age'),
+        feature_column.embedding_column(
+            language, dimension=1)
+    ]
+
+    classifier = debug.DebugClassifier(config=run_config.RunConfig(
+        tf_random_seed=1))
+    classifier.fit(input_fn=input_fn, steps=5)
+
+    def default_input_fn(unused_estimator, examples):
+      return feature_column_ops.parse_feature_columns_from_examples(
+          examples, feature_columns)
+
+    export_dir = tempfile.mkdtemp()
+    classifier.export(export_dir, input_fn=default_input_fn)
+
+
+class DebugRegressorTest(test.TestCase):
+
+  def setUp(self):
+    np.random.seed(100)
+    self.features = np.random.rand(NUM_EXAMPLES, 5)
+    self.targets = np.random.rand(NUM_EXAMPLES, LABEL_DIMENSION)
+
+  def testPredictScores(self):
+    """Tests that DebugRegressor outputs the mean target."""
+    (train_features, train_labels), (test_features,
+                                     test_labels) = _train_test_split(
+                                         [self.features, self.targets])
+    mean_target = np.mean(train_labels, 0)
+    expected_prediction = np.vstack(
+        [mean_target for _ in range(test_labels.shape[0])])
+
+    classifier = debug.DebugRegressor(label_dimension=LABEL_DIMENSION)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
+
+    pred = classifier.predict_scores(input_fn=_input_fn_builder(test_features,
+                                                                None))
+    self.assertAllClose(expected_prediction, np.vstack(pred), atol=0.1)
+
+  def testExperimentIntegration(self):
+    exp = experiment.Experiment(
+        estimator=debug.DebugRegressor(),
+        train_input_fn=test_data.iris_input_logistic_fn,
+        eval_input_fn=test_data.iris_input_logistic_fn)
+    exp.test()
+
+  def testEstimatorContract(self):
+    estimator_test_utils.assert_estimator_contract(self, debug.DebugRegressor)
+
+  def testRegression_MatrixData(self):
+    """Tests regression using matrix data as input."""
+    regressor = debug.DebugRegressor(
+        config=run_config.RunConfig(tf_random_seed=1))
+    input_fn = test_data.iris_input_logistic_fn
+    regressor.fit(input_fn=input_fn, steps=200)
+    scores = regressor.evaluate(input_fn=input_fn, steps=1)
+    self.assertIn('loss', scores)
+
+  def testRegression_MatrixData_Labels1D(self):
+    """Same as the last test, but label shape is [100] instead of [100, 1]."""
+
+    def _input_fn():
+      iris = test_data.prepare_iris_data_for_logistic_regression()
+      return {
+          'feature': constant_op.constant(iris.data, dtype=dtypes.float32)
+      }, constant_op.constant(
+          iris.target, shape=[100], dtype=dtypes.int32)
+
+    regressor = debug.DebugRegressor(
+        config=run_config.RunConfig(tf_random_seed=1))
+
+    regressor.fit(input_fn=_input_fn, steps=200)
+    scores = regressor.evaluate(input_fn=_input_fn, steps=1)
+    self.assertIn('loss', scores)
+
+  def testRegression_NpMatrixData(self):
+    """Tests binary classification using numpy matrix data as input."""
+    iris = test_data.prepare_iris_data_for_logistic_regression()
+    train_x = iris.data
+    train_y = iris.target
+    regressor = debug.DebugRegressor(
+        config=run_config.RunConfig(tf_random_seed=1))
+    regressor.fit(x=train_x, y=train_y, steps=200)
+    scores = regressor.evaluate(x=train_x, y=train_y, steps=1)
+    self.assertIn('loss', scores)
+
+  def testRegression_TensorData(self):
+    """Tests regression using tensor data as input."""
+
+    def _input_fn(num_epochs=None):
+      features = {
+          'age':
+              input_lib.limit_epochs(
+                  constant_op.constant([[.8], [.15], [0.]]),
+                  num_epochs=num_epochs),
+          'language':
+              sparse_tensor.SparseTensor(
+                  values=input_lib.limit_epochs(
+                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
+                  indices=[[0, 0], [0, 1], [2, 0]],
+                  dense_shape=[3, 2])
+      }
+      return features, constant_op.constant([1., 0., 0.2], dtype=dtypes.float32)
+
+    regressor = debug.DebugRegressor(
+        config=run_config.RunConfig(tf_random_seed=1))
+
+    regressor.fit(input_fn=_input_fn, steps=200)
+
+    scores = regressor.evaluate(input_fn=_input_fn, steps=1)
+    self.assertIn('loss', scores)
+
+  def testLoss(self):
+    """Tests loss calculation."""
+
+    def _input_fn_train():
+      # Create 4 rows, one of them (y = x), three of them (y=Not(x))
+      # The algorithm should learn (y = 0.25).
+      labels = constant_op.constant([[1.], [0.], [0.], [0.]])
+      features = {'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),}
+      return features, labels
+
+    regressor = debug.DebugRegressor(
+        config=run_config.RunConfig(tf_random_seed=1))
+
+    regressor.fit(input_fn=_input_fn_train, steps=5)
+    scores = regressor.evaluate(input_fn=_input_fn_train, steps=1)
+    self.assertIn('loss', scores)
+
+  def testLossWithWeights(self):
+    """Tests loss calculation with weights."""
+
+    def _input_fn_train():
+      # 4 rows with equal weight, one of them (y = x), three of them (y=Not(x))
+      # The algorithm should learn (y = 0.25).
+      labels = constant_op.constant([[1.], [0.], [0.], [0.]])
+      features = {
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
+          'w': constant_op.constant([[1.], [1.], [1.], [1.]])
+      }
+      return features, labels
+
+    def _input_fn_eval():
+      # 4 rows, with different weights.
+      labels = constant_op.constant([[1.], [0.], [0.], [0.]])
+      features = {
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
+          'w': constant_op.constant([[7.], [1.], [1.], [1.]])
+      }
+      return features, labels
+
+    regressor = debug.DebugRegressor(
+        weight_column_name='w', config=run_config.RunConfig(tf_random_seed=1))
+
+    regressor.fit(input_fn=_input_fn_train, steps=5)
+    scores = regressor.evaluate(input_fn=_input_fn_eval, steps=1)
+    self.assertIn('loss', scores)
+
+  def testTrainWithWeights(self):
+    """Tests training with given weight column."""
+
+    def _input_fn_train():
+      # Create 4 rows, one of them (y = x), three of them (y=Not(x))
+      # First row has more weight than others. Model should fit (y=x) better
+      # than (y=Not(x)) due to the relative higher weight of the first row.
+      labels = constant_op.constant([[1.], [0.], [0.], [0.]])
+      features = {
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
+          'w': constant_op.constant([[100.], [3.], [2.], [2.]])
+      }
+      return features, labels
+
+    def _input_fn_eval():
+      # Create 4 rows (y = x)
+      labels = constant_op.constant([[1.], [1.], [1.], [1.]])
+      features = {
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
+          'w': constant_op.constant([[1.], [1.], [1.], [1.]])
+      }
+      return features, labels
+
+    regressor = debug.DebugRegressor(
+        weight_column_name='w', config=run_config.RunConfig(tf_random_seed=1))
+
+    regressor.fit(input_fn=_input_fn_train, steps=5)
+    scores = regressor.evaluate(input_fn=_input_fn_eval, steps=1)
+    self.assertIn('loss', scores)
+
+  def testCustomMetrics(self):
+    """Tests custom evaluation metrics."""
+
+    def _input_fn(num_epochs=None):
+      # Create 4 rows, one of them (y = x), three of them (y=Not(x))
+      labels = constant_op.constant([[1.], [0.], [0.], [0.]])
+      features = {
+          'x':
+              input_lib.limit_epochs(
+                  array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
+                  num_epochs=num_epochs),
+      }
+      return features, labels
+
+    def _my_metric_op(predictions, labels):
+      return math_ops.reduce_sum(math_ops.multiply(predictions, labels))
+
+    regressor = debug.DebugRegressor(
+        config=run_config.RunConfig(tf_random_seed=1))
+
+    regressor.fit(input_fn=_input_fn, steps=5)
+    scores = regressor.evaluate(
+        input_fn=_input_fn,
+        steps=1,
+        metrics={
+            'my_error':
+                MetricSpec(
+                    metric_fn=metric_ops.streaming_mean_squared_error,
+                    prediction_key='scores'),
+            'my_metric':
+                MetricSpec(metric_fn=_my_metric_op, prediction_key='scores')
+        })
+    self.assertIn('loss', set(scores.keys()))
+    self.assertIn('my_error', set(scores.keys()))
+    self.assertIn('my_metric', set(scores.keys()))
+    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
+    predictions = np.array(
+        list(regressor.predict_scores(input_fn=predict_input_fn)))
+    self.assertAlmostEqual(
+        _sklearn.mean_squared_error(np.array([1, 0, 0, 0]), predictions),
+        scores['my_error'])
+
+    # Tests the case where the prediction_key is not "scores".
+    with self.assertRaisesRegexp(KeyError, 'bad_type'):
+      regressor.evaluate(
+          input_fn=_input_fn,
+          steps=1,
+          metrics={
+              'bad_name':
+                  MetricSpec(
+                      metric_fn=metric_ops.streaming_auc,
+                      prediction_key='bad_type')
+          })
+
+  def testTrainSaveLoad(self):
+    """Tests that insures you can save and reload a trained model."""
+
+    def _input_fn(num_epochs=None):
+      features = {
+          'age':
+              input_lib.limit_epochs(
+                  constant_op.constant([[0.8], [0.15], [0.]]),
+                  num_epochs=num_epochs),
+          'language':
+              sparse_tensor.SparseTensor(
+                  values=input_lib.limit_epochs(
+                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
+                  indices=[[0, 0], [0, 1], [2, 0]],
+                  dense_shape=[3, 2])
+      }
+      return features, constant_op.constant([1., 0., 0.2], dtype=dtypes.float32)
+
+    model_dir = tempfile.mkdtemp()
+    regressor = debug.DebugRegressor(
+        model_dir=model_dir, config=run_config.RunConfig(tf_random_seed=1))
+
+    regressor.fit(input_fn=_input_fn, steps=5)
+    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
+    predictions = list(regressor.predict_scores(input_fn=predict_input_fn))
+    del regressor
+
+    regressor2 = debug.DebugRegressor(
+        model_dir=model_dir, config=run_config.RunConfig(tf_random_seed=1))
+    predictions2 = list(regressor2.predict_scores(input_fn=predict_input_fn))
+    self.assertAllClose(predictions, predictions2)
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
index 04607dcdb1390add2ec12a60e8fa57adb57d228e..cb15ef23e95d27c737d8ae08065b804bafd39a07 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@@ -245,7 +245,9 @@ class DNNClassifier(estimator.Estimator):
   def input_fn_eval: # returns x, y (where y represents label's class index).
     pass
   estimator.evaluate(input_fn=input_fn_eval)
+
   def input_fn_predict: # returns x, None
+    pass
   # predict_classes returns class indices.
   estimator.predict_classes(input_fn=input_fn_predict)
   ```
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index b87b75d5c4cd321f66d699e7861fdc473aed1124..b9201cc805ba9c5516bcf3b808e6052f36859833 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -530,7 +530,7 @@ class BaseEstimator(
     """
     _verify_input_args(x, y, input_fn, feed_fn, batch_size)
     if x is not None:
-      return SKCompat(self).score(x, y, batch_size, steps, metrics)
+      return SKCompat(self).score(x, y, batch_size, steps, metrics, name)
 
     if metrics is not None and not isinstance(metrics, dict):
       raise ValueError('Metrics argument should be None or dict. '
@@ -838,6 +838,10 @@ class BaseEstimator(
       hooks = hooks[:] if hooks else []
       if feed_fn:
         hooks.append(basic_session_run_hooks.FeedFnHook(feed_fn))
+      if steps == 0:
+        logging.warning('evaluation steps are 0. If `input_fn` does not raise'
+                        'OutOfRangeError`, the evaluation will never stop.'
+                        'Use steps=None if intended.')
       if steps:
         hooks.append(
             evaluation.StopAfterNEvalsHook(
@@ -1353,7 +1357,7 @@ class SKCompat(sklearn.BaseEstimator):
                         monitors=all_monitors)
     return self
 
-  def score(self, x, y, batch_size=128, steps=None, metrics=None):
+  def score(self, x, y, batch_size=128, steps=None, metrics=None, name=None):
     input_fn, feed_fn = _get_input_fn(x, y, input_fn=None,
                                       feed_fn=None, batch_size=batch_size,
                                       shuffle=False, epochs=1)
@@ -1365,7 +1369,7 @@ class SKCompat(sklearn.BaseEstimator):
         feed_fn=feed_fn,
         steps=steps,
         metrics=metrics,
-        name='score')
+        name=name)
     if eval_results is not None:
       eval_results.update({'global_step': global_step})
     return eval_results
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index 6e15e7891e9ebfdb713de86246f16df59e86e59b..7b49cd475d074da2d9702bc653925cfa1b522fbf 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -25,8 +25,7 @@ import six
 from tensorflow.contrib import framework as framework_lib
 from tensorflow.contrib import layers as layers_lib
 from tensorflow.contrib import lookup as lookup_lib
-# TODO(ptucker): Use tf.losses and tf.metrics.
-from tensorflow.contrib import losses as losses_lib
+# TODO(ptucker): Use tf.metrics.
 from tensorflow.contrib import metrics as metrics_lib
 from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
@@ -44,6 +43,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import weights_broadcast_ops
+from tensorflow.python.ops.losses import losses as losses_lib
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary
 from tensorflow.python.training import training
@@ -438,7 +438,7 @@ def loss_only_head(loss_fn, head_name=None):
   Args:
     loss_fn: a function that takes no argument and returns a list of
         scalar tensors.
-    head_name: a name for for the head.
+    head_name: a name for the head.
 
   Returns:
     An instance of `Head` to hold the additional losses.
@@ -665,6 +665,7 @@ class _RegressionHead(_SingleHead):
                label_dimension,
                loss_fn,
                link_fn,
+               logits_dimension=None,
                label_name=None,
                weight_column_name=None,
                enable_centered_bias=False,
@@ -677,6 +678,10 @@ class _RegressionHead(_SingleHead):
         shape `[batch_size, label_dimension]`).
       loss_fn: Loss function, takes logits and labels and returns loss.
       link_fn: Link function, takes a logits tensor and returns the output.
+      logits_dimension: Number of logits per example. This is the
+        size of the last dimension of the logits `Tensor` (typically, this has
+        shape `[batch_size, label_dimension]`).
+        Default value: `label_dimension`.
       label_name: String, name of the key in label dict. Can be null if label
           is a tensor (single headed models).
       weight_column_name: A string defining feature column name representing
@@ -691,7 +696,8 @@ class _RegressionHead(_SingleHead):
     """
     super(_RegressionHead, self).__init__(
         problem_type=constants.ProblemType.LINEAR_REGRESSION,
-        logits_dimension=label_dimension,
+        logits_dimension=(logits_dimension if logits_dimension is not None
+                          else label_dimension),
         label_name=label_name,
         weight_column_name=weight_column_name,
         head_name=head_name)
@@ -1212,7 +1218,8 @@ class _BinarySvmHead(_SingleHead):
       with ops.name_scope(None, "hinge_loss", (logits, labels)) as name:
         with ops.control_dependencies((_assert_labels_rank(labels),)):
           labels = array_ops.reshape(labels, shape=(-1, 1))
-        loss = losses_lib.hinge_loss(logits=logits, labels=labels, scope=name)
+        loss = losses_lib.hinge_loss(labels=labels, logits=logits, scope=name,
+                                     reduction=losses_lib.Reduction.NONE)
         return _compute_weighted_loss(loss, weights)
 
     super(_BinarySvmHead, self).__init__(
diff --git a/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py b/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
index e5c01336cf7247f0aa186c38f446b3c8996b5be8..ce87b4723d436495e5fb149f0ab8f2eea44d82b8 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
@@ -64,13 +64,16 @@ def make_random_points(centers, num_points, max_offset=20):
   assignments = np.random.choice(num_centers, num_points)
   offsets = np.round(
       np.random.randn(num_points, num_dims).astype(np.float32) * max_offset)
-  return (centers[assignments] + offsets, assignments,
-          np.add.reduce(offsets * offsets, 1))
+  return (centers[assignments] + offsets, assignments, np.add.reduce(
+      offsets * offsets, 1))
 
 
 class KMeansTestBase(test.TestCase):
 
-  def input_fn(self, batch_size=None, points=None, randomize=None,
+  def input_fn(self,
+               batch_size=None,
+               points=None,
+               randomize=None,
                num_epochs=None):
     """Returns an input_fn that randomly selects batches from given points."""
     batch_size = batch_size or self.batch_size
@@ -79,6 +82,7 @@ class KMeansTestBase(test.TestCase):
     if randomize is None:
       randomize = (self.use_mini_batch and
                    self.mini_batch_steps_per_iteration <= 1)
+
     def _fn():
       x = constant_op.constant(points)
       if batch_size == num_points:
@@ -86,33 +90,40 @@ class KMeansTestBase(test.TestCase):
       if randomize:
         indices = random_ops.random_uniform(
             constant_op.constant([batch_size]),
-            minval=0, maxval=num_points-1,
+            minval=0,
+            maxval=num_points - 1,
             dtype=dtypes.int32,
             seed=10)
       else:
         # We need to cycle through the indices sequentially. We create a queue
         # to maintain the list of indices.
-        q = data_flow_ops.FIFOQueue(self.num_points, dtypes.int32, ())
+        q = data_flow_ops.FIFOQueue(num_points, dtypes.int32, ())
+
         # Conditionally initialize the Queue.
         def _init_q():
-          with ops.control_dependencies([q.enqueue_many(
-              math_ops.range(self.num_points))]):
+          with ops.control_dependencies(
+              [q.enqueue_many(math_ops.range(num_points))]):
             return control_flow_ops.no_op()
-        init_q = control_flow_ops.cond(q.size() <= 0,
-                                       _init_q,
+
+        init_q = control_flow_ops.cond(q.size() <= 0, _init_q,
                                        control_flow_ops.no_op)
         with ops.control_dependencies([init_q]):
-          offsets = q.dequeue_many(self.batch_size)
+          offsets = q.dequeue_many(batch_size)
           with ops.control_dependencies([q.enqueue_many(offsets)]):
             indices = array_ops.identity(offsets)
       batch = array_ops.gather(x, indices)
       return (input_lib.limit_epochs(batch, num_epochs=num_epochs), None)
+
     return _fn
 
   @staticmethod
   def config(tf_random_seed):
     return run_config.RunConfig(tf_random_seed=tf_random_seed)
 
+  @property
+  def initial_clusters(self):
+    return kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT
+
   @property
   def batch_size(self):
     return self.num_points
@@ -141,7 +152,7 @@ class KMeansTest(KMeansTestBase):
   def _kmeans(self, relative_tolerance=None):
     return kmeans_lib.KMeansClustering(
         self.num_centers,
-        initial_clusters=kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT,
+        initial_clusters=self.initial_clusters,
         distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE,
         use_mini_batch=self.use_mini_batch,
         mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration,
@@ -172,7 +183,7 @@ class KMeansTest(KMeansTestBase):
       return
     kmeans = kmeans_lib.KMeansClustering(
         self.num_centers,
-        initial_clusters=kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT,
+        initial_clusters=self.initial_clusters,
         distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE,
         use_mini_batch=self.use_mini_batch,
         mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration,
@@ -197,11 +208,12 @@ class KMeansTest(KMeansTestBase):
 
     # Make a small test set
     num_points = 10
-    points, true_assignments, true_offsets = make_random_points(clusters,
-                                                                num_points)
+    points, true_assignments, true_offsets = make_random_points(
+        clusters, num_points)
     # Test predict
-    assignments = list(kmeans.predict_cluster_idx(input_fn=self.input_fn(
-        batch_size=num_points, points=points, num_epochs=1)))
+    assignments = list(
+        kmeans.predict_cluster_idx(input_fn=self.input_fn(
+            batch_size=num_points, points=points, num_epochs=1)))
     self.assertAllEqual(assignments, true_assignments)
 
     # Test score
@@ -214,35 +226,62 @@ class KMeansTest(KMeansTestBase):
         input_fn=lambda: (constant_op.constant(points), None))
     true_transform = np.maximum(
         0,
-        np.sum(np.square(points), axis=1, keepdims=True) - 2 * np.dot(
-            points, np.transpose(clusters)) +
+        np.sum(np.square(points), axis=1,
+               keepdims=True) - 2 * np.dot(points, np.transpose(clusters)) +
         np.transpose(np.sum(np.square(clusters), axis=1, keepdims=True)))
     self.assertAllClose(transform, true_transform, rtol=0.05, atol=10)
 
-  def test_fit_raise_if_num_clusters_larger_than_num_points_random_init(self):
-    points = np.array([[2.0, 3.0], [1.6, 8.2]], dtype=np.float32)
 
-    with self.assertRaisesOpError('less'):
-      kmeans = learn.KMeansClustering(
-          num_clusters=3,
-          use_mini_batch=self.use_mini_batch,
-          mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration,
-          initial_clusters=kmeans_lib.KMeansClustering.RANDOM_INIT)
-      kmeans.fit(input_fn=lambda: (constant_op.constant(points), None),
-                 steps=10)
+class KMeansTestMultiStageInit(KMeansTestBase):
 
-  def test_fit_raise_if_num_clusters_larger_than_num_points_kmeans_plus_plus(
-      self):
-    points = np.array([[2.0, 3.0], [1.6, 8.2]], dtype=np.float32)
+  def test_random(self):
+    points = np.array(
+        [[1, 2], [3, 4], [5, 6], [7, 8], [9, 0]], dtype=np.float32)
+    kmeans = kmeans_lib.KMeansClustering(
+        num_clusters=points.shape[0],
+        initial_clusters=kmeans_lib.KMeansClustering.RANDOM_INIT,
+        distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE,
+        use_mini_batch=True,
+        mini_batch_steps_per_iteration=100,
+        random_seed=24,
+        relative_tolerance=None)
+    kmeans.fit(
+        input_fn=self.input_fn(batch_size=1, points=points, randomize=False),
+        steps=1)
+    clusters = kmeans.clusters()
+    self.assertAllEqual(points, clusters)
 
+  def test_kmeans_plus_plus_batch_just_right(self):
+    points = np.array([[1, 2]], dtype=np.float32)
+    kmeans = kmeans_lib.KMeansClustering(
+        num_clusters=points.shape[0],
+        initial_clusters=kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT,
+        distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE,
+        use_mini_batch=True,
+        mini_batch_steps_per_iteration=100,
+        random_seed=24,
+        relative_tolerance=None)
+    kmeans.fit(
+        input_fn=self.input_fn(batch_size=1, points=points, randomize=False),
+        steps=1)
+    clusters = kmeans.clusters()
+    self.assertAllEqual(points, clusters)
+
+  def test_kmeans_plus_plus_batch_too_small(self):
+    points = np.array(
+        [[1, 2], [3, 4], [5, 6], [7, 8], [9, 0]], dtype=np.float32)
+    kmeans = kmeans_lib.KMeansClustering(
+        num_clusters=points.shape[0],
+        initial_clusters=kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT,
+        distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE,
+        use_mini_batch=True,
+        mini_batch_steps_per_iteration=100,
+        random_seed=24,
+        relative_tolerance=None)
     with self.assertRaisesOpError(AssertionError):
-      kmeans = learn.KMeansClustering(
-          num_clusters=3,
-          use_mini_batch=self.use_mini_batch,
-          mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration,
-          initial_clusters=kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT)
-      kmeans.fit(input_fn=lambda: (constant_op.constant(points), None),
-                 steps=10)
+      kmeans.fit(
+          input_fn=self.input_fn(batch_size=4, points=points, randomize=False),
+          steps=1)
 
 
 class MiniBatchKMeansTest(KMeansTest):
@@ -282,11 +321,11 @@ class KMeansCosineDistanceTest(KMeansTestBase):
     self.true_centers = np.array(
         [
             normalize(
-                np.mean(
-                    normalize(self.points)[0:4, :], axis=0, keepdims=True))[0],
+                np.mean(normalize(self.points)[0:4, :], axis=0, keepdims=True))[
+                    0],
             normalize(
-                np.mean(
-                    normalize(self.points)[4:, :], axis=0, keepdims=True))[0]
+                np.mean(normalize(self.points)[4:, :], axis=0, keepdims=True))[
+                    0]
         ],
         dtype=np.float32)
     self.true_assignments = np.array([0] * 4 + [1] * 4)
@@ -323,8 +362,9 @@ class KMeansCosineDistanceTest(KMeansTestBase):
     self.kmeans.fit(input_fn=self.input_fn(), max_steps=max_steps)
     centers = normalize(self.kmeans.clusters())
 
-    assignments = list(self.kmeans.predict_cluster_idx(
-        input_fn=self.input_fn(num_epochs=1, batch_size=self.num_points)))
+    assignments = list(
+        self.kmeans.predict_cluster_idx(input_fn=self.input_fn(
+            num_epochs=1, batch_size=self.num_points)))
     self.assertAllClose(
         centers[assignments],
         self.true_centers[self.true_assignments],
@@ -333,8 +373,8 @@ class KMeansCosineDistanceTest(KMeansTestBase):
     centers = centers[centers[:, 0].argsort()]
     true_centers = self.true_centers[self.true_centers[:, 0].argsort()]
     self.assertAllClose(centers, true_centers, atol=0.04)
-    score = self.kmeans.score(input_fn=self.input_fn(
-        batch_size=self.num_points), steps=1)
+    score = self.kmeans.score(
+        input_fn=self.input_fn(batch_size=self.num_points), steps=1)
     self.assertAllClose(score, self.true_score, atol=1e-2)
 
   def test_predict_kmeans_plus_plus(self):
@@ -348,13 +388,11 @@ class KMeansCosineDistanceTest(KMeansTestBase):
     true_centers = np.array(
         [
             normalize(
-                np.mean(
-                    normalize(points)[0:2, :], axis=0, keepdims=True))[0],
+                np.mean(normalize(points)[0:2, :], axis=0, keepdims=True))[0],
+            normalize(
+                np.mean(normalize(points)[2:4, :], axis=0, keepdims=True))[0],
             normalize(
-                np.mean(
-                    normalize(points)[2:4, :], axis=0, keepdims=True))[0],
-            normalize(np.mean(
-                normalize(points)[4:, :], axis=0, keepdims=True))[0]
+                np.mean(normalize(points)[4:, :], axis=0, keepdims=True))[0]
         ],
         dtype=np.float32)
     true_assignments = [0] * 2 + [1] * 2 + [2] * 8
@@ -363,7 +401,7 @@ class KMeansCosineDistanceTest(KMeansTestBase):
 
     kmeans = kmeans_lib.KMeansClustering(
         3,
-        initial_clusters=kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT,
+        initial_clusters=self.initial_clusters,
         distance_metric=kmeans_lib.KMeansClustering.COSINE_DISTANCE,
         use_mini_batch=self.use_mini_batch,
         mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration,
@@ -375,9 +413,9 @@ class KMeansCosineDistanceTest(KMeansTestBase):
         sorted(centers.tolist()), sorted(true_centers.tolist()), atol=1e-2)
 
     def _input_fn():
-      return (
-          input_lib.limit_epochs(constant_op.constant(points), num_epochs=1),
-          None)
+      return (input_lib.limit_epochs(
+          constant_op.constant(points), num_epochs=1), None)
+
     assignments = list(kmeans.predict_cluster_idx(input_fn=_input_fn))
     self.assertAllClose(
         centers[assignments], true_centers[true_assignments], atol=1e-2)
@@ -489,8 +527,8 @@ class TensorflowKMeansBenchmark(KMeansBenchmark):
           random_seed=i * 42,
           relative_tolerance=1e-6,
           config=run_config.RunConfig(tf_random_seed=3))
-      tf_kmeans.fit(input_fn=lambda: (constant_op.constant(self.points), None),
-                    steps=50)
+      tf_kmeans.fit(
+          input_fn=lambda: (constant_op.constant(self.points), None), steps=50)
       _ = tf_kmeans.clusters()
       scores.append(
           tf_kmeans.score(
@@ -521,14 +559,15 @@ class SklearnKMeansBenchmark(KMeansBenchmark):
 class KMeansTestQueues(test.TestCase):
 
   def input_fn(self):
+
     def _fn():
-      queue = data_flow_ops.FIFOQueue(capacity=10,
-                                      dtypes=dtypes.float32,
-                                      shapes=[10, 3])
+      queue = data_flow_ops.FIFOQueue(
+          capacity=10, dtypes=dtypes.float32, shapes=[10, 3])
       enqueue_op = queue.enqueue(array_ops.zeros([10, 3], dtype=dtypes.float32))
-      queue_runner.add_queue_runner(queue_runner.QueueRunner(queue,
-                                                             [enqueue_op]))
+      queue_runner.add_queue_runner(
+          queue_runner.QueueRunner(queue, [enqueue_op]))
       return queue.dequeue(), None
+
     return _fn
 
   # This test makes sure that there are no deadlocks when using a QueueRunner.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
index 021918f0efc87871cd626619a35b9814fd3684d8..93c62f87e8495f299a8c456574c7b40534186304 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
@@ -26,12 +26,12 @@ from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn.datasets import base
 from tensorflow.contrib.learn.python.learn.estimators import logistic_regressor
 from tensorflow.contrib.learn.python.learn.estimators import metric_key
-from tensorflow.contrib.losses.python.losses import loss_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import test
 
 
@@ -55,7 +55,7 @@ def _logistic_regression_model_fn(features, labels, mode):
       # AUC/precision/recall/etc will change meaningfully even on a toy dataset.
       biases_initializer=init_ops.constant_initializer(-10.0))
   predictions = math_ops.sigmoid(logits)
-  loss = loss_ops.sigmoid_cross_entropy(logits, labels)
+  loss = losses.sigmoid_cross_entropy(labels, logits)
   train_op = optimizers.optimize_loss(
       loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
   return predictions, loss, train_op
diff --git a/tensorflow/contrib/learn/python/learn/estimators/model_fn.py b/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
index 8a327ab01f2b272f687a9507deb89225c1e5d38c..8be9c72adf1602826fabc650f350b57f72c886be 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
@@ -270,8 +270,17 @@ class ModelFnOps(
           result[key] = value
       return result
 
+    # Convert the contrib mode enum to the core mode enum.
+    # Note: mode already validated in __new__().
+    if self.mode == ModeKeys.TRAIN:
+      core_mode = core_model_fn_lib.ModeKeys.TRAIN
+    elif self.mode == ModeKeys.EVAL:
+      core_mode = core_model_fn_lib.ModeKeys.EVAL
+    elif self.mode == ModeKeys.INFER:
+      core_mode = core_model_fn_lib.ModeKeys.PREDICT
+
     return core_model_fn_lib.EstimatorSpec(
-        mode=self.mode,
+        mode=core_mode,
         predictions=self.predictions,
         loss=self.loss,
         train_op=self.train_op,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index 3aaee5862df4fdc5af8ead37e6ebff7944e29dbd..0642c5900a83e6c08c6a94995ec41295ac5d721f 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -42,6 +42,7 @@ _DEFAULT_UID_WHITE_LIST = [
     'session_config',
     'keep_checkpoint_max',
     'keep_checkpoint_every_n_hours',
+    'log_step_count_steps',
 ]
 
 
@@ -210,12 +211,7 @@ class ClusterConfig(object):
 class RunConfig(ClusterConfig, core_run_config.RunConfig):
   """This class specifies the configurations for an `Estimator` run.
 
-  This class is the implementation of ${tf.estimator.RunConfig} interface.
-
-  If you're a Google-internal user using command line flags with
-  `learn_runner.py` (for instance, to do distributed training or to use
-  parameter servers), you probably want to use `learn_runner.EstimatorConfig`
-  instead.
+  This class is the implementation of @{tf.estimator.RunConfig} interface.
   """
   _USE_DEFAULT = 0
 
@@ -230,6 +226,7 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig):
                save_checkpoints_steps=None,
                keep_checkpoint_max=5,
                keep_checkpoint_every_n_hours=10000,
+               log_step_count_steps=100,
                evaluation_master='',
                model_dir=None,
                session_config=None):
@@ -261,6 +258,8 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig):
       keep_checkpoint_every_n_hours: Number of hours between each checkpoint
         to be saved. The default value of 10,000 hours effectively disables
         the feature.
+      log_step_count_steps: The frequency, in number of global steps, that the
+        global step/sec will be logged during training.
       evaluation_master: the master on which to perform evaluation.
       model_dir: directory where model parameters, graph etc are saved. If
         `None`, will use `model_dir` property in `TF_CONFIG` environment
@@ -284,6 +283,7 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig):
     self._tf_random_seed = tf_random_seed
     self._save_summary_steps = save_summary_steps
     self._save_checkpoints_secs = save_checkpoints_secs
+    self._log_step_count_steps = log_step_count_steps
     self._session_config = session_config
     if save_checkpoints_secs == RunConfig._USE_DEFAULT:
       if save_checkpoints_steps is None:
@@ -367,6 +367,10 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig):
   def keep_checkpoint_every_n_hours(self):
     return self._keep_checkpoint_every_n_hours
 
+  @property
+  def log_step_count_steps(self):
+    return self._log_step_count_steps
+
 
 def _count_ps(cluster_spec):
   """Counts the number of parameter servers in cluster_spec."""
diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
index 1c6ac08e4647579b281da957b30648ff86ff8b84..3075d9355c97672d82d03bf2f0e5479721a82ef5 100644
--- a/tensorflow/contrib/learn/python/learn/experiment.py
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -245,6 +245,11 @@ class Experiment(object):
     # Otherwise, the servers will wait to connect to each other before starting
     # to train. We might as well start as soon as we can.
     config = self._estimator.config
+    if (config.cluster_spec and config.master and
+        config.environment == run_config.Environment.LOCAL):
+      logging.warn("ClusterSpec and master are provided, but environment is "
+                   "set to 'local'. Set environment to 'cloud' if you intend "
+                   "to use the distributed runtime.")
     if (config.environment != run_config.Environment.LOCAL and
         config.environment != run_config.Environment.GOOGLE and
         config.cluster_spec and config.master):
@@ -274,7 +279,7 @@ class Experiment(object):
                             max_steps=self._train_steps,
                             hooks=self._train_monitors + extra_hooks)
 
-  def evaluate(self, delay_secs=None):
+  def evaluate(self, delay_secs=None, name=None):
     """Evaluate on the evaluation data.
 
     Runs evaluation on the evaluation data and returns the result. Runs for
@@ -286,6 +291,8 @@ class Experiment(object):
     Args:
       delay_secs: Start evaluating after this many seconds. If `None`, defaults
         to using `self._eval_delays_secs`.
+      name: Gives the name to the evauation for the case multiple evaluation is
+        run for the same experiment.
 
     Returns:
       The result of the `evaluate` call to the `Estimator`.
@@ -300,7 +307,7 @@ class Experiment(object):
     return self._call_evaluate(input_fn=self._eval_input_fn,
                                steps=self._eval_steps,
                                metrics=self._eval_metrics,
-                               name="one_pass",
+                               name=(name or "one_pass"),
                                hooks=self._eval_hooks)
 
   @deprecated(
@@ -523,7 +530,12 @@ class Experiment(object):
       differences in resource control. First, the resources (e.g., memory) used
       by training will be released before evaluation (`train_and_evaluate` takes
       double resources). Second, more checkpoints will be saved as a checkpoint
-      is generated at the end of each small training iteration.
+      is generated at the end of each training iteration.
+
+      3. As the estimator.train starts from scratch (new graph, new states for
+      input, etc) at each iteration, it is recommended to have the
+      `train_steps_per_iteration` larger. It is also recommended to shuffle your
+      input.
 
     Args:
       continuous_eval_predicate_fn: A predicate function determining whether to
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
index 61208ba24e107141e60d22dfe2ab6460dcc77ed6..bdb88b89bb3dba95a229724994874b0a26b1fc3f 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
@@ -75,10 +75,10 @@ def read_batch_examples(file_pattern,
       `tf.local_variables_initializer()` and run the op in a session.
     queue_capacity: Capacity for input queue.
     num_threads: The number of threads enqueuing examples. In order to have
-      predicted and repeatable order of reading and enqueueing, such as in
+      predictable and repeatable order of reading and enqueueing, such as in
       prediction and evaluation mode, `num_threads` should be 1.
     read_batch_size: An int or scalar `Tensor` specifying the number of
-      records to read at once
+      records to read at once.
     parse_fn: Parsing function, takes `Example` Tensor returns parsed
       representation. If `None`, no parsing is done.
     name: Name of resulting op.
@@ -142,10 +142,10 @@ def read_keyed_batch_examples(file_pattern,
       `tf.local_variables_initializer()` and run the op in a session.
     queue_capacity: Capacity for input queue.
     num_threads: The number of threads enqueuing examples. In order to have
-      predicted and repeatable order of reading and enqueueing, such as in
+      predictable and repeatable order of reading and enqueueing, such as in
       prediction and evaluation mode, `num_threads` should be 1.
     read_batch_size: An int or scalar `Tensor` specifying the number of
-      records to read at once
+      records to read at once.
     parse_fn: Parsing function, takes `Example` Tensor returns parsed
       representation. If `None`, no parsing is done.
     name: Name of resulting op.
@@ -217,7 +217,7 @@ def read_keyed_batch_examples_shared_queue(file_pattern,
     queue_capacity: Capacity for input queue.
     num_threads: The number of threads enqueuing examples.
     read_batch_size: An int or scalar `Tensor` specifying the number of
-      records to read at once
+      records to read at once.
     parse_fn: Parsing function, takes `Example` Tensor returns parsed
       representation. If `None`, no parsing is done.
     name: Name of resulting op.
@@ -335,7 +335,7 @@ def _read_keyed_batch_examples_helper(file_pattern,
     queue_capacity: Capacity for input queue.
     num_threads: The number of threads enqueuing examples.
     read_batch_size: An int or scalar `Tensor` specifying the number of
-      records to read at once
+      records to read at once.
     filter_fn: Filtering function, takes both keys as well `Example` Tensors
       and returns a boolean mask of the same shape as the input Tensors to
       be applied for filtering. If `None`, no filtering is done.
@@ -470,13 +470,15 @@ def read_keyed_batch_features(file_pattern,
       tf.local_variables_initializer() and run the op in a session.
     queue_capacity: Capacity for input queue.
     reader_num_threads: The number of threads to read examples. In order to have
-      predicted and repeatable order of reading and enqueueing, such as in
+      predictable and repeatable order of reading and enqueueing, such as in
       prediction and evaluation mode, `reader_num_threads` should be 1.
     feature_queue_capacity: Capacity of the parsed features queue.
     num_enqueue_threads: Number of threads to enqueue the parsed example queue.
       Using multiple threads to enqueue the parsed example queue helps maintain
       a full queue when the subsequent computations overall are cheaper than
-      parsing.
+      parsing. In order to have predictable and repeatable order of reading and
+      enqueueing, such as in prediction and evaluation mode,
+      `num_enqueue_threads` should be 1.
     parse_fn: Parsing function, takes `Example` Tensor returns parsed
       representation. If `None`, no parsing is done.
     name: Name of resulting op.
@@ -617,7 +619,9 @@ def queue_parsed_features(parsed_features,
     num_enqueue_threads: Number of threads to enqueue the parsed example queue.
       Using multiple threads to enqueue the parsed example queue helps maintain
       a full queue when the subsequent computations overall are cheaper than
-      parsing.
+      parsing. In order to have predictable and repeatable order of reading and
+      enqueueing, such as in prediction and evaluation mode,
+      `num_enqueue_threads` should be 1.
     name: Name of resulting op.
 
   Returns:
@@ -721,6 +725,7 @@ def read_batch_features(file_pattern,
                         queue_capacity=10000,
                         feature_queue_capacity=100,
                         reader_num_threads=1,
+                        num_enqueue_threads=2,
                         parse_fn=None,
                         name=None):
   """Adds operations to read, queue, batch and parse `Example` protos.
@@ -752,8 +757,14 @@ def read_batch_features(file_pattern,
     feature_queue_capacity: Capacity of the parsed features queue. Set this
       value to a small number, for example 5 if the parsed features are large.
     reader_num_threads: The number of threads to read examples. In order to have
-      predicted and repeatable order of reading and enqueueing, such as in
+      predictable and repeatable order of reading and enqueueing, such as in
       prediction and evaluation mode, `reader_num_threads` should be 1.
+    num_enqueue_threads: Number of threads to enqueue the parsed example queue.
+      Using multiple threads to enqueue the parsed example queue helps maintain
+      a full queue when the subsequent computations overall are cheaper than
+      parsing. In order to have predictable and repeatable order of reading and
+      enqueueing, such as in prediction and evaluation mode,
+      `num_enqueue_threads` should be 1.
     parse_fn: Parsing function, takes `Example` Tensor returns parsed
       representation. If `None`, no parsing is done.
     name: Name of resulting op.
@@ -772,8 +783,9 @@ def read_batch_features(file_pattern,
       randomize_input=randomize_input,
       num_epochs=num_epochs,
       queue_capacity=queue_capacity,
-      feature_queue_capacity=feature_queue_capacity,
       reader_num_threads=reader_num_threads,
+      feature_queue_capacity=feature_queue_capacity,
+      num_enqueue_threads=num_enqueue_threads,
       parse_fn=parse_fn,
       name=name)
   return features
@@ -804,7 +816,7 @@ def read_batch_record_features(file_pattern,
       tf.local_variables_initializer() and run the op in a session.
     queue_capacity: Capacity for input queue.
     reader_num_threads: The number of threads to read examples. In order to have
-      predicted and repeatable order of reading and enqueueing, such as in
+      predictable and repeatable order of reading and enqueueing, such as in
       prediction and evaluation mode, `reader_num_threads` should be 1.
     name: Name of resulting op.
 
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
index f25f7caf61574f4d6cbd4d64b99a5d4f18b6fb44..6f0fd9a2976d37d1c701a96f50c2b987562cb191 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
@@ -350,6 +350,16 @@ class GraphIOTest(test.TestCase):
       coord.request_stop()
       coord.join(threads)
 
+  def _create_file_from_list_of_features(self, lines):
+    json_lines = [
+        "".join([
+            '{"features": { "feature": { "sequence": {',
+            '"bytes_list": { "value": ["', base64.b64encode(l).decode("ascii"),
+            '"]}}}}}\n'
+        ]) for l in lines
+    ]
+    return self._create_temp_file("".join(json_lines))
+
   def test_read_text_lines_large(self):
     gfile.Glob = self._orig_glob
     sequence_prefix = "abcdefghijklmnopqrstuvwxyz123456789"
@@ -358,14 +368,7 @@ class GraphIOTest(test.TestCase):
         "".join([sequence_prefix, str(l)]).encode("ascii")
         for l in xrange(num_records)
     ]
-    json_lines = [
-        "".join([
-            '{"features": { "feature": { "sequence": {',
-            '"bytes_list": { "value": ["', base64.b64encode(l).decode("ascii"),
-            '"]}}}}}\n'
-        ]) for l in lines
-    ]
-    filename = self._create_temp_file("".join(json_lines))
+    filename = self._create_file_from_list_of_features(lines)
     batch_size = 10000
     queue_capacity = 100000
     name = "my_large_batch"
@@ -410,6 +413,61 @@ class GraphIOTest(test.TestCase):
     self.assertEqual(len(parsed_records), num_records)
     self.assertEqual(set(parsed_records), set(lines))
 
+  def test_read_batch_features_maintains_order(self):
+    """Make sure that examples are read in the right order.
+
+    When randomize_input=False, num_enqueue_threads=1 and reader_num_threads=1
+    read_keyed_batch_features() should read the examples in the same order as
+    they appear in the file.
+    """
+    gfile.Glob = self._orig_glob
+    num_records = 1000
+    lines = ["".join(str(l)).encode("ascii") for l in xrange(num_records)]
+    filename = self._create_file_from_list_of_features(lines)
+    batch_size = 10
+    queue_capacity = 1000
+    name = "my_large_batch"
+
+    features = {"sequence": parsing_ops.FixedLenFeature([], dtypes_lib.string)}
+
+    with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+      result = graph_io.read_batch_features(
+          filename,
+          batch_size,
+          features,
+          io_ops.TextLineReader,
+          randomize_input=False,
+          num_epochs=1,
+          queue_capacity=queue_capacity,
+          reader_num_threads=1,
+          num_enqueue_threads=1,
+          parse_fn=parsing_ops.decode_json_example,
+          name=name)
+      self.assertEqual(1, len(result))
+      self.assertAllEqual((None,), result["sequence"].get_shape().as_list())
+      session.run(variables.local_variables_initializer())
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+      data = []
+      try:
+        while not coord.should_stop():
+          data.append(session.run(result))
+      except errors.OutOfRangeError:
+        pass
+      finally:
+        coord.request_stop()
+
+      coord.join(threads)
+
+    parsed_records = [
+        item for sublist in [d["sequence"] for d in data] for item in sublist
+    ]
+    # Check that the number of records matches expected and all records
+    # are present in the right order.
+    self.assertEqual(len(parsed_records), num_records)
+    self.assertEqual(parsed_records, lines)
+
   def test_read_text_lines_multifile(self):
     gfile.Glob = self._orig_glob
     filenames = self._create_sorted_temp_files(["ABC\n", "DEF\nGHK\n"])
diff --git a/tensorflow/contrib/learn/python/learn/ops/losses_ops.py b/tensorflow/contrib/learn/python/learn/ops/losses_ops.py
index 086e5d78bb2a3d715588bf085e06eda88c65861f..b040ab3bb6c516158589a8e30d56fff1f7728951 100644
--- a/tensorflow/contrib/learn/python/learn/ops/losses_ops.py
+++ b/tensorflow/contrib/learn/python/learn/ops/losses_ops.py
@@ -20,14 +20,14 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.framework import deprecated
-from tensorflow.contrib.losses.python.losses import loss_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops as array_ops_
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
+from tensorflow.python.ops.losses import losses
 
 
-@deprecated('2016-12-01', 'Use `tf.contrib.losses.mean_squared_error` '
+@deprecated('2016-12-01', 'Use `tf.losses.mean_squared_error` '
             'and explicit logits computation.')
 def mean_squared_error_regressor(tensor_in, labels, weights, biases, name=None):
   """Returns prediction and loss for mean squared error regression."""
@@ -36,10 +36,10 @@ def mean_squared_error_regressor(tensor_in, labels, weights, biases, name=None):
     predictions = nn.xw_plus_b(tensor_in, weights, biases)
     if len(labels.get_shape()) == 1 and len(predictions.get_shape()) == 2:
       predictions = array_ops_.squeeze(predictions, squeeze_dims=[1])
-    return predictions, loss_ops.mean_squared_error(predictions, labels)
+    return predictions, losses.mean_squared_error(labels, predictions)
 
 
-@deprecated('2016-12-01', 'Use `tf.contrib.losses.softmax_cross_entropy` '
+@deprecated('2016-12-01', 'Use `tf.losses.softmax_cross_entropy` '
             'and explicit logits computation.')
 def softmax_classifier(tensor_in,
                        labels,
@@ -72,4 +72,4 @@ def softmax_classifier(tensor_in,
     logits = nn.xw_plus_b(tensor_in, weights, biases)
     if class_weight is not None:
       logits = math_ops.multiply(logits, class_weight)
-    return nn.softmax(logits), loss_ops.softmax_cross_entropy(logits, labels)
+    return nn.softmax(logits), losses.softmax_cross_entropy(labels, logits)
diff --git a/tensorflow/contrib/learn/python/learn/utils/gc.py b/tensorflow/contrib/learn/python/learn/utils/gc.py
index 5af9e8b9e2b7cf23505a53063d1538a35f022d55..226915987a4934626066b12810f579ae675107b2 100644
--- a/tensorflow/contrib/learn/python/learn/utils/gc.py
+++ b/tensorflow/contrib/learn/python/learn/utils/gc.py
@@ -32,13 +32,14 @@ Note that functions should always return a sorted list.
 
 For example,
   base_dir = "/tmp"
-  # create the directories
+  # Create the directories.
   for e in xrange(10):
     os.mkdir("%s/%d" % (base_dir, e), 0o755)
 
-  # create a simple parser that pulls the export_version from the directory
+  # Create a simple parser that pulls the export_version from the directory.
+  path_regex = "^" + re.escape(base_dir) + "/(\\d+)$"
   def parser(path):
-    match = re.match("^" + base_dir + "/(\\d+)$", path.path)
+    match = re.match(path_regex, path.path)
     if not match:
       return None
     return path._replace(export_version=int(match.group(1)))
@@ -46,15 +47,15 @@ For example,
   path_list = gc.get_paths("/tmp", parser)  # contains all ten Paths
 
   every_fifth = gc.mod_export_version(5)
-  print every_fifth(path_list) # shows ["/tmp/0", "/tmp/5"]
+  print(every_fifth(path_list))  # shows ["/tmp/0", "/tmp/5"]
 
   largest_three = gc.largest_export_versions(3)
-  print largest_three(all_paths)  # shows ["/tmp/7", "/tmp/8", "/tmp/9"]
+  print(largest_three(all_paths))  # shows ["/tmp/7", "/tmp/8", "/tmp/9"]
 
   both = gc.union(every_fifth, largest_three)
-  print both(all_paths)  # shows ["/tmp/0", "/tmp/5",
-                         #        "/tmp/7", "/tmp/8", "/tmp/9"]
-  # delete everything not in 'both'
+  print(both(all_paths))  # shows ["/tmp/0", "/tmp/5",
+                          #        "/tmp/7", "/tmp/8", "/tmp/9"]
+  # Delete everything not in 'both'.
   to_delete = gc.negation(both)
   for p in to_delete(all_paths):
     gfile.DeleteRecursively(p.path)  # deletes:  "/tmp/1", "/tmp/2",
diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
index 3f0f3092534e6c886bb24d368e0e60322213e1d2..1b9a25ae414e156995fd1f3d63c53795f6ba4572 100644
--- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
+++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
@@ -417,7 +417,11 @@ def make_export_strategy(serving_input_fn,
   return export_strategy.ExportStrategy('Servo', export_fn)
 
 
-def make_parsing_export_strategy(feature_columns, exports_to_keep=5):
+def make_parsing_export_strategy(feature_columns,
+                                 default_output_alternative_key=None,
+                                 assets_extra=None,
+                                 as_text=False,
+                                 exports_to_keep=5):
   """Create an ExportStrategy for use with Experiment, using `FeatureColumn`s.
 
   Creates a SavedModel export that expects to be fed with a single string
@@ -427,6 +431,18 @@ def make_parsing_export_strategy(feature_columns, exports_to_keep=5):
   Args:
     feature_columns: An iterable of `FeatureColumn`s representing the features
       that must be provided at serving time (excluding labels!).
+    default_output_alternative_key: the name of the head to serve when an
+      incoming serving request does not explicitly request a specific head.
+      Must be `None` if the estimator inherits from ${tf.estimator.Estimator}
+      or for single-headed models.
+    assets_extra: A dict specifying how to populate the assets.extra directory
+      within the exported SavedModel.  Each key should give the destination
+      path (including the filename) relative to the assets.extra directory.
+      The corresponding value gives the full path of the source file to be
+      copied.  For example, the simple case of copying a single file without
+      renaming it is specified as
+      `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
+    as_text: whether to write the SavedModel proto in text format.
     exports_to_keep: Number of exports to keep.  Older exports will be
       garbage-collected.  Defaults to 5.  Set to None to disable garbage
       collection.
@@ -436,5 +452,9 @@ def make_parsing_export_strategy(feature_columns, exports_to_keep=5):
   """
   feature_spec = feature_column.create_feature_spec_for_parsing(feature_columns)
   serving_input_fn = input_fn_utils.build_parsing_serving_input_fn(feature_spec)
-  return make_export_strategy(serving_input_fn, exports_to_keep=exports_to_keep)
-
+  return make_export_strategy(
+      serving_input_fn,
+      default_output_alternative_key=default_output_alternative_key,
+      assets_extra=assets_extra,
+      as_text=as_text,
+      exports_to_keep=exports_to_keep)
diff --git a/tensorflow/contrib/legacy_seq2seq/BUILD b/tensorflow/contrib/legacy_seq2seq/BUILD
index 119f0e67bef80d1d0e907f2097cb40f7677544e5..1fa55132b1fc0cd3367ca2eb331b6870edc30c3b 100644
--- a/tensorflow/contrib/legacy_seq2seq/BUILD
+++ b/tensorflow/contrib/legacy_seq2seq/BUILD
@@ -58,6 +58,7 @@ cuda_py_tests(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
+    tags = ["noasan"],  # times out b/63678675
 )
 
 filegroup(
diff --git a/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py b/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py
index 23b4a73b23d28ecc4ad3a2e0181a5c059de4a030..d4de638338689d2775efe6988af3a058bb128c07 100644
--- a/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py
+++ b/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py
@@ -1189,10 +1189,10 @@ def model_with_buckets(encoder_inputs,
     raise ValueError("Length of encoder_inputs (%d) must be at least that of la"
                      "st bucket (%d)." % (len(encoder_inputs), buckets[-1][0]))
   if len(targets) < buckets[-1][1]:
-    raise ValueError("Length of targets (%d) must be at least that of last"
+    raise ValueError("Length of targets (%d) must be at least that of last "
                      "bucket (%d)." % (len(targets), buckets[-1][1]))
   if len(weights) < buckets[-1][1]:
-    raise ValueError("Length of weights (%d) must be at least that of last"
+    raise ValueError("Length of weights (%d) must be at least that of last "
                      "bucket (%d)." % (len(weights), buckets[-1][1]))
 
   all_inputs = encoder_inputs + decoder_inputs + targets + weights
diff --git a/tensorflow/contrib/linalg/BUILD b/tensorflow/contrib/linalg/BUILD
index 9b4f36da15d6089566f06530deca9df0858a33cc..810a3d34eee0a886fcf49ca3209547c9307a6e67 100644
--- a/tensorflow/contrib/linalg/BUILD
+++ b/tensorflow/contrib/linalg/BUILD
@@ -60,6 +60,7 @@ cuda_py_tests(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    tags = ["noasan"],  # times out b/63678675
 )
 
 cuda_py_tests(
@@ -165,12 +166,16 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:common_shapes",
         "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator.py b/tensorflow/contrib/linalg/python/ops/linear_operator.py
index 6cdfa8618932d0e9ae1198d68e78f36583022390..91c0938e395b61cb8a42a7954c47c7d6b18e091c 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator.py
@@ -126,7 +126,8 @@ class LinearOperator(object):
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
   for `X = non_singular, self_adjoint, positive_definite, square`.
-  These have the following meaning
+  These have the following meaning:
+
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
     runtime assert.  For example, finite floating point precision may result
@@ -893,6 +894,23 @@ class LinearOperator(object):
     with self._name_scope(name):
       return self._diag_part()
 
+  def _trace(self):
+    return math_ops.reduce_sum(self.diag_part(), axis=-1)
+
+  def trace(self, name="trace"):
+    """Trace of the linear operator, equal to sum of `self.diag_part()`.
+
+    If the operator is square, this is also the sum of the eigenvalues.
+
+    Args:
+      name:  A name for this `Op`.
+
+    Returns:
+      Shape `[B1,...,Bb]` `Tensor` of same `dtype` as `self`.
+    """
+    with self._name_scope(name):
+      return self._trace()
+
   def _add_to_tensor(self, x):
     # Override if a more efficient implementation is available.
     return self._get_cached_dense_matrix() + x
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py b/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py
index 0853ea03af0aa7270865e27805ba93693f43e6d8..0a71a73a9c5c7926c91cc544bdbce04c7c606cbd 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py
@@ -97,7 +97,8 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
   for `X = non_singular, self_adjoint, positive_definite, square`.
-  These have the following meaning
+  These have the following meaning:
+
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
     runtime assert.  For example, finite floating point precision may result
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py b/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py
index 56bc967706a9f2b15aabead4d6864d02e3e5ed08..29184483bf870ad4bf53874d23077ccdf90c72cb 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py
@@ -98,7 +98,8 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
   for `X = non_singular, self_adjoint, positive_definite, square`.
-  These have the following meaning
+  These have the following meaning:
+
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
     runtime assert.  For example, finite floating point precision may result
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_full_matrix.py b/tensorflow/contrib/linalg/python/ops/linear_operator_full_matrix.py
index 67889511cbffcbec934855d67914e40b157bdc91..52b40eaf8d07a18ab614c48a3654f00c460c285d 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_full_matrix.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_full_matrix.py
@@ -92,7 +92,8 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
   for `X = non_singular, self_adjoint, positive_definite, square`.
-  These have the following meaning
+  These have the following meaning:
+
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
     runtime assert.  For example, finite floating point precision may result
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py b/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py
index acba1c7035d738d878d801463b857104b98cfc83..b9ac90ff33706b19e4f639d3617f115bcf31294e 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py
@@ -44,17 +44,15 @@ class BaseLinearOperatorIdentity(linear_operator.LinearOperator):
     """Static check of init arg `num_rows`, possibly add asserts."""
     # Possibly add asserts.
     if self._assert_proper_shapes:
-      self._num_rows = control_flow_ops.with_dependencies(
-          [
-              check_ops.assert_rank(
-                  self._num_rows,
-                  0,
-                  message="Argument num_rows must be a 0-D Tensor."),
-              check_ops.assert_non_negative(
-                  self._num_rows,
-                  message="Argument num_rows must be non-negative."),
-          ],
-          self._num_rows)
+      self._num_rows = control_flow_ops.with_dependencies([
+          check_ops.assert_rank(
+              self._num_rows,
+              0,
+              message="Argument num_rows must be a 0-D Tensor."),
+          check_ops.assert_non_negative(
+              self._num_rows,
+              message="Argument num_rows must be non-negative."),
+      ], self._num_rows)
 
     # Static checks.
     if not self._num_rows.dtype.is_integer:
@@ -74,15 +72,26 @@ class BaseLinearOperatorIdentity(linear_operator.LinearOperator):
       raise ValueError("Argument num_rows must be non-negative.  Found:"
                        " %s" % num_rows_static)
 
+  def _min_matrix_dim(self):
+    """Minimum of domain/range dimension, if statically available, else None."""
+    domain_dim = self.domain_dimension.value
+    range_dim = self.range_dimension.value
+    if domain_dim is None or range_dim is None:
+      return None
+    return min(domain_dim, range_dim)
+
+  def _min_matrix_dim_tensor(self):
+    """Minimum of domain/range dimension, as a tensor."""
+    return math_ops.reduce_min(self.shape_tensor()[-2:])
+
   def _ones_diag(self):
     """Returns the diagonal of this operator as all ones."""
     if self.shape.is_fully_defined():
-      d_shape = self.batch_shape.concatenate(
-          [min(self.domain_dimension.value, self.range_dimension.value)])
+      d_shape = self.batch_shape.concatenate([self._min_matrix_dim()])
     else:
       d_shape = array_ops.concat(
           [self.batch_shape_tensor(),
-           [math_ops.reduce_min(self.shape_tensor()[-2:])]], axis=0)
+           [self._min_matrix_dim_tensor()]], axis=0)
 
     return array_ops.ones(shape=d_shape, dtype=self.dtype)
 
@@ -181,7 +190,8 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
   for `X = non_singular, self_adjoint, positive_definite, square`.
-  These have the following meaning
+  These have the following meaning:
+
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
     runtime assert.  For example, finite floating point precision may result
@@ -276,8 +286,8 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
         self._check_batch_shape_possibly_add_asserts()
 
   def _shape(self):
-    matrix_shape = tensor_shape.TensorShape(
-        (self._num_rows_static, self._num_rows_static))
+    matrix_shape = tensor_shape.TensorShape((self._num_rows_static,
+                                             self._num_rows_static))
     if self._batch_shape_arg is None:
       return matrix_shape
 
@@ -285,8 +295,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
     return batch_shape.concatenate(matrix_shape)
 
   def _shape_tensor(self):
-    matrix_shape = array_ops.stack(
-        (self._num_rows, self._num_rows), axis=0)
+    matrix_shape = array_ops.stack((self._num_rows, self._num_rows), axis=0)
     if self._batch_shape_arg is None:
       return matrix_shape
 
@@ -338,8 +347,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
     # Note that adjoint has no effect since this matrix is self-adjoint.
     x = linear_operator_util.matrix_adjoint(x) if adjoint_arg else x
     if self._assert_proper_shapes:
-      aps = linear_operator_util.assert_compatible_matrix_dimensions(
-          self, x)
+      aps = linear_operator_util.assert_compatible_matrix_dimensions(self, x)
       x = control_flow_ops.with_dependencies([aps], x)
     return self._possibly_broadcast_batch_shape(x)
 
@@ -352,6 +360,20 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
   def _solve(self, rhs, adjoint=False, adjoint_arg=False):
     return self._matmul(rhs, adjoint_arg=adjoint_arg)
 
+  def _trace(self):
+    # Get Tensor of all ones of same shape as self.batch_shape.
+    if self.batch_shape.is_fully_defined():
+      batch_of_ones = array_ops.ones(shape=self.batch_shape, dtype=self.dtype)
+    else:
+      batch_of_ones = array_ops.ones(
+          shape=self.batch_shape_tensor(), dtype=self.dtype)
+
+    if self._min_matrix_dim() is not None:
+      return self._min_matrix_dim() * batch_of_ones
+    else:
+      return (math_ops.cast(self._min_matrix_dim_tensor(), self.dtype) *
+              batch_of_ones)
+
   def _diag_part(self):
     return self._ones_diag()
 
@@ -375,17 +397,15 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
     """Static check of init arg `num_rows`, possibly add asserts."""
     # Possibly add asserts.
     if self._assert_proper_shapes:
-      self._num_rows = control_flow_ops.with_dependencies(
-          [
-              check_ops.assert_rank(
-                  self._num_rows,
-                  0,
-                  message="Argument num_rows must be a 0-D Tensor."),
-              check_ops.assert_non_negative(
-                  self._num_rows,
-                  message="Argument num_rows must be non-negative."),
-          ],
-          self._num_rows)
+      self._num_rows = control_flow_ops.with_dependencies([
+          check_ops.assert_rank(
+              self._num_rows,
+              0,
+              message="Argument num_rows must be a 0-D Tensor."),
+          check_ops.assert_non_negative(
+              self._num_rows,
+              message="Argument num_rows must be non-negative."),
+      ], self._num_rows)
 
     # Static checks.
     if not self._num_rows.dtype.is_integer:
@@ -412,17 +432,15 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
 
     # Possibly add asserts
     if self._assert_proper_shapes:
-      self._batch_shape_arg = control_flow_ops.with_dependencies(
-          [
-              check_ops.assert_rank(
-                  self._batch_shape_arg,
-                  1,
-                  message="Argument batch_shape must be a 1-D Tensor."),
-              check_ops.assert_non_negative(
-                  self._batch_shape_arg,
-                  message="Argument batch_shape must be non-negative."),
-          ],
-          self._batch_shape_arg)
+      self._batch_shape_arg = control_flow_ops.with_dependencies([
+          check_ops.assert_rank(
+              self._batch_shape_arg,
+              1,
+              message="Argument batch_shape must be a 1-D Tensor."),
+          check_ops.assert_non_negative(
+              self._batch_shape_arg,
+              message="Argument batch_shape must be non-negative."),
+      ], self._batch_shape_arg)
 
     # Static checks
     if not self._batch_shape_arg.dtype.is_integer:
@@ -585,8 +603,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
       # Shape [B1,...Bb, 1, 1]
       self._multiplier_matrix = array_ops.expand_dims(
           array_ops.expand_dims(self.multiplier, -1), -1)
-      self._multiplier_matrix_conj = math_ops.conj(
-          self._multiplier_matrix)
+      self._multiplier_matrix_conj = math_ops.conj(self._multiplier_matrix)
       self._abs_multiplier = math_ops.abs(self.multiplier)
 
       self._num_rows = linear_operator_util.shape_tensor(
@@ -594,27 +611,25 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
       self._num_rows_static = tensor_util.constant_value(self._num_rows)
       self._check_num_rows_possibly_add_asserts()
       self._num_rows_cast_to_dtype = math_ops.cast(self._num_rows, self.dtype)
-      self._num_rows_cast_to_real_dtype = math_ops.cast(
-          self._num_rows, self.dtype.real_dtype)
+      self._num_rows_cast_to_real_dtype = math_ops.cast(self._num_rows,
+                                                        self.dtype.real_dtype)
 
   def _shape(self):
-    matrix_shape = tensor_shape.TensorShape(
-        (self._num_rows_static, self._num_rows_static))
+    matrix_shape = tensor_shape.TensorShape((self._num_rows_static,
+                                             self._num_rows_static))
 
     batch_shape = self.multiplier.get_shape()
     return batch_shape.concatenate(matrix_shape)
 
   def _shape_tensor(self):
-    matrix_shape = array_ops.stack(
-        (self._num_rows, self._num_rows), axis=0)
+    matrix_shape = array_ops.stack((self._num_rows, self._num_rows), axis=0)
 
     batch_shape = array_ops.shape(self.multiplier)
     return array_ops.concat((batch_shape, matrix_shape), 0)
 
   def _assert_non_singular(self):
     return check_ops.assert_positive(
-        math_ops.abs(self.multiplier),
-        message="LinearOperator was singular")
+        math_ops.abs(self.multiplier), message="LinearOperator was singular")
 
   def _assert_positive_definite(self):
     return check_ops.assert_positive(
@@ -635,13 +650,12 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
     else:
       matrix = self._multiplier_matrix
     if self._assert_proper_shapes:
-      aps = linear_operator_util.assert_compatible_matrix_dimensions(
-          self, x)
+      aps = linear_operator_util.assert_compatible_matrix_dimensions(self, x)
       x = control_flow_ops.with_dependencies([aps], x)
     return x * matrix
 
   def _determinant(self):
-    return self.multiplier ** self._num_rows_cast_to_dtype
+    return self.multiplier**self._num_rows_cast_to_dtype
 
   def _log_abs_determinant(self):
     return self._num_rows_cast_to_real_dtype * math_ops.log(
@@ -654,11 +668,24 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
     else:
       matrix = self._multiplier_matrix
     if self._assert_proper_shapes:
-      aps = linear_operator_util.assert_compatible_matrix_dimensions(
-          self, rhs)
+      aps = linear_operator_util.assert_compatible_matrix_dimensions(self, rhs)
       rhs = control_flow_ops.with_dependencies([aps], rhs)
     return rhs / matrix
 
+  def _trace(self):
+    # Get Tensor of all ones of same shape as self.batch_shape.
+    if self.batch_shape.is_fully_defined():
+      batch_of_ones = array_ops.ones(shape=self.batch_shape, dtype=self.dtype)
+    else:
+      batch_of_ones = array_ops.ones(
+          shape=self.batch_shape_tensor(), dtype=self.dtype)
+
+    if self._min_matrix_dim() is not None:
+      return self.multiplier * self._min_matrix_dim() * batch_of_ones
+    else:
+      return (self.multiplier * math_ops.cast(self._min_matrix_dim_tensor(),
+                                              self.dtype) * batch_of_ones)
+
   def _diag_part(self):
     return self._ones_diag() * self.multiplier[..., array_ops.newaxis]
 
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py b/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py
index b2d7b10157b02ff2814de12459b1e417c22128b5..af14f34600e3fa38566a8146d8e0e0359ef0802b 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py
@@ -169,10 +169,6 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     for use_placeholder in False, True:
       for shape in self._shapes_to_test:
         for dtype in self._dtypes_to_test:
-          if dtype.is_complex:
-            self.skipTest(
-                "tf.matrix_determinant does not work with complex, so this "
-                "test is being skipped.")
           with self.test_session(graph=ops.Graph()) as sess:
             sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
             operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
@@ -190,10 +186,6 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     for use_placeholder in False, True:
       for shape in self._shapes_to_test:
         for dtype in self._dtypes_to_test:
-          if dtype.is_complex:
-            self.skipTest(
-                "tf.matrix_determinant does not work with complex, so this "
-                "test is being skipped.")
           with self.test_session(graph=ops.Graph()) as sess:
             sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
             operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
@@ -263,6 +255,23 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                                                    feed_dict=feed_dict)
                 self.assertAC(op_solve_v, mat_solve_v)
 
+  def test_trace(self):
+    self._skip_if_tests_to_skip_contains("trace")
+    for use_placeholder in False, True:
+      for shape in self._shapes_to_test:
+        for dtype in self._dtypes_to_test:
+          with self.test_session(graph=ops.Graph()) as sess:
+            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+            operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
+                shape, dtype, use_placeholder=use_placeholder)
+            op_trace = operator.trace()
+            mat_trace = math_ops.trace(mat)
+            if not use_placeholder:
+              self.assertAllEqual(op_trace.get_shape(), mat_trace.get_shape())
+            op_trace_v, mat_trace_v = sess.run([op_trace, mat_trace],
+                                               feed_dict=feed_dict)
+            self.assertAC(op_trace_v, mat_trace_v)
+
   def test_add_to_tensor(self):
     self._skip_if_tests_to_skip_contains("add_to_tensor")
     for use_placeholder in False, True:
@@ -445,7 +454,7 @@ def random_tril_matrix(shape,
     remove_upper:  Python `bool`.
       If `True`, zero out the strictly upper triangle.
       If `False`, the lower triangle of returned matrix will have desired
-      properties, but will not not have the strictly upper triangle zero'd out.
+      properties, but will not have the strictly upper triangle zero'd out.
 
   Returns:
     `Tensor` with desired shape and dtype.
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py b/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py
index 8a152a9b475f4e3fdfd8e3045ab1028eb467997b..22ccf6f1310ce0f689b3bad8e9100099a53e1919 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py
@@ -91,7 +91,8 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
   for `X = non_singular, self_adjoint, positive_definite, square`.
-  These have the following meaning
+  These have the following meaning:
+
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
     runtime assert.  For example, finite floating point precision may result
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_udvh_update.py b/tensorflow/contrib/linalg/python/ops/linear_operator_udvh_update.py
index 546d899e74e53d529dd58fc75a4e06f2fb920d1b..9c9c359574667570dbfaaebe9f918b49ea637e51 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_udvh_update.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_udvh_update.py
@@ -112,9 +112,9 @@ class LinearOperatorUDVHUpdate(linear_operator.LinearOperator):
   #### Matrix property hints
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
-  for `X = non_singular, self_adjoint, positive_definite, diag_update_positive`
-  and `square`
-  These have the following meaning
+  for `X = non_singular`, `self_adjoint`, `positive_definite`,
+  `diag_update_positive` and `square`. These have the following meaning:
+
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
     runtime assert.  For example, finite floating point precision may result
diff --git a/tensorflow/contrib/linear_optimizer/BUILD b/tensorflow/contrib/linear_optimizer/BUILD
index 1fde6e5c6cb0e2d6097c63dcd707c35a491acaaa..22398d225561039921daa6384c335fdd5bc89a45 100644
--- a/tensorflow/contrib/linear_optimizer/BUILD
+++ b/tensorflow/contrib/linear_optimizer/BUILD
@@ -131,6 +131,7 @@ py_test(
     deps = [
         ":sdca_estimator_py",
         "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:sparse_tensor",
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
index 32b7f956e476ca79cc77338cde496cd0c517c401..79a5928a21cb9a2633b2aac178f185ba333790d6 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.contrib.layers.python.layers import feature_column as feature_column_lib
 from tensorflow.contrib.linear_optimizer.python import sdca_estimator
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.platform import test
@@ -40,15 +41,17 @@ class SDCALogisticClassifierTest(test.TestCase):
           'weights': constant_op.constant([[1.0], [1.0]])
       }, constant_op.constant([[0], [1]])
 
-    maintenance_cost = feature_column_lib.real_valued_column('maintenance_cost')
-    sq_footage = feature_column_lib.real_valued_column('sq_footage')
-    classifier = sdca_estimator.SDCALogisticClassifier(
-        example_id_column='example_id',
-        feature_columns=[maintenance_cost, sq_footage],
-        weight_column_name='weights')
-    classifier.fit(input_fn=input_fn, steps=100)
-    loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
-    self.assertLess(loss, 0.05)
+    with self.test_session():
+      maintenance_cost = feature_column_lib.real_valued_column(
+          'maintenance_cost')
+      sq_footage = feature_column_lib.real_valued_column('sq_footage')
+      classifier = sdca_estimator.SDCALogisticClassifier(
+          example_id_column='example_id',
+          feature_columns=[maintenance_cost, sq_footage],
+          weight_column_name='weights')
+      classifier.fit(input_fn=input_fn, steps=100)
+      loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
+      self.assertLess(loss, 0.05)
 
   def testRealValuedFeatureWithHigherDimension(self):
     """Tests SDCALogisticClassifier with high-dimension real valued features."""
@@ -63,13 +66,14 @@ class SDCALogisticClassifierTest(test.TestCase):
               constant_op.constant([[500.0, 800.0], [200.0, 600.0]])
       }, constant_op.constant([[0], [1]])
 
-    dense_feature = feature_column_lib.real_valued_column(
-        'dense_feature', dimension=2)
-    classifier = sdca_estimator.SDCALogisticClassifier(
-        example_id_column='example_id', feature_columns=[dense_feature])
-    classifier.fit(input_fn=input_fn, steps=100)
-    loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
-    self.assertLess(loss, 0.05)
+    with self.test_session():
+      dense_feature = feature_column_lib.real_valued_column(
+          'dense_feature', dimension=2)
+      classifier = sdca_estimator.SDCALogisticClassifier(
+          example_id_column='example_id', feature_columns=[dense_feature])
+      classifier.fit(input_fn=input_fn, steps=100)
+      loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
+      self.assertLess(loss, 0.05)
 
   def testBucketizedFeatures(self):
     """Tests SDCALogisticClassifier with bucketized features."""
@@ -82,19 +86,21 @@ class SDCALogisticClassifierTest(test.TestCase):
           'weights': constant_op.constant([[1.0], [1.0], [1.0]])
       }, constant_op.constant([[1], [0], [1]])
 
-    price_bucket = feature_column_lib.bucketized_column(
-        feature_column_lib.real_valued_column('price'),
-        boundaries=[500.0, 700.0])
-    sq_footage_bucket = feature_column_lib.bucketized_column(
-        feature_column_lib.real_valued_column('sq_footage'), boundaries=[650.0])
-    classifier = sdca_estimator.SDCALogisticClassifier(
-        example_id_column='example_id',
-        feature_columns=[price_bucket, sq_footage_bucket],
-        weight_column_name='weights',
-        l2_regularization=1.0)
-    classifier.fit(input_fn=input_fn, steps=50)
-    metrics = classifier.evaluate(input_fn=input_fn, steps=1)
-    self.assertGreater(metrics['accuracy'], 0.9)
+    with self.test_session():
+      price_bucket = feature_column_lib.bucketized_column(
+          feature_column_lib.real_valued_column('price'),
+          boundaries=[500.0, 700.0])
+      sq_footage_bucket = feature_column_lib.bucketized_column(
+          feature_column_lib.real_valued_column('sq_footage'),
+          boundaries=[650.0])
+      classifier = sdca_estimator.SDCALogisticClassifier(
+          example_id_column='example_id',
+          feature_columns=[price_bucket, sq_footage_bucket],
+          weight_column_name='weights',
+          l2_regularization=1.0)
+      classifier.fit(input_fn=input_fn, steps=50)
+      metrics = classifier.evaluate(input_fn=input_fn, steps=1)
+      self.assertGreater(metrics['accuracy'], 0.9)
 
   def testSparseFeatures(self):
     """Tests SDCALogisticClassifier with sparse features."""
@@ -114,16 +120,17 @@ class SDCALogisticClassifierTest(test.TestCase):
               constant_op.constant([[1.0], [1.0], [1.0]])
       }, constant_op.constant([[1], [0], [1]])
 
-    price = feature_column_lib.real_valued_column('price')
-    country = feature_column_lib.sparse_column_with_hash_bucket(
-        'country', hash_bucket_size=5)
-    classifier = sdca_estimator.SDCALogisticClassifier(
-        example_id_column='example_id',
-        feature_columns=[price, country],
-        weight_column_name='weights')
-    classifier.fit(input_fn=input_fn, steps=50)
-    metrics = classifier.evaluate(input_fn=input_fn, steps=1)
-    self.assertGreater(metrics['accuracy'], 0.9)
+    with self.test_session():
+      price = feature_column_lib.real_valued_column('price')
+      country = feature_column_lib.sparse_column_with_hash_bucket(
+          'country', hash_bucket_size=5)
+      classifier = sdca_estimator.SDCALogisticClassifier(
+          example_id_column='example_id',
+          feature_columns=[price, country],
+          weight_column_name='weights')
+      classifier.fit(input_fn=input_fn, steps=50)
+      metrics = classifier.evaluate(input_fn=input_fn, steps=1)
+      self.assertGreater(metrics['accuracy'], 0.9)
 
   def testWeightedSparseFeatures(self):
     """Tests SDCALogisticClassifier with weighted sparse features."""
@@ -144,16 +151,17 @@ class SDCALogisticClassifierTest(test.TestCase):
                   dense_shape=[3, 5])
       }, constant_op.constant([[1], [0], [1]])
 
-    country = feature_column_lib.sparse_column_with_hash_bucket(
-        'country', hash_bucket_size=5)
-    country_weighted_by_price = feature_column_lib.weighted_sparse_column(
-        country, 'price')
-    classifier = sdca_estimator.SDCALogisticClassifier(
-        example_id_column='example_id',
-        feature_columns=[country_weighted_by_price])
-    classifier.fit(input_fn=input_fn, steps=50)
-    metrics = classifier.evaluate(input_fn=input_fn, steps=1)
-    self.assertGreater(metrics['accuracy'], 0.9)
+    with self.test_session():
+      country = feature_column_lib.sparse_column_with_hash_bucket(
+          'country', hash_bucket_size=5)
+      country_weighted_by_price = feature_column_lib.weighted_sparse_column(
+          country, 'price')
+      classifier = sdca_estimator.SDCALogisticClassifier(
+          example_id_column='example_id',
+          feature_columns=[country_weighted_by_price])
+      classifier.fit(input_fn=input_fn, steps=50)
+      metrics = classifier.evaluate(input_fn=input_fn, steps=1)
+      self.assertGreater(metrics['accuracy'], 0.9)
 
   def testCrossedFeatures(self):
     """Tests SDCALogisticClassifier with crossed features."""
@@ -174,17 +182,18 @@ class SDCALogisticClassifierTest(test.TestCase):
                   dense_shape=[3, 1])
       }, constant_op.constant([[0], [0], [1]])
 
-    language = feature_column_lib.sparse_column_with_hash_bucket(
-        'language', hash_bucket_size=5)
-    country = feature_column_lib.sparse_column_with_hash_bucket(
-        'country', hash_bucket_size=5)
-    country_language = feature_column_lib.crossed_column(
-        [language, country], hash_bucket_size=10)
-    classifier = sdca_estimator.SDCALogisticClassifier(
-        example_id_column='example_id', feature_columns=[country_language])
-    classifier.fit(input_fn=input_fn, steps=10)
-    metrics = classifier.evaluate(input_fn=input_fn, steps=1)
-    self.assertGreater(metrics['accuracy'], 0.9)
+    with self.test_session():
+      language = feature_column_lib.sparse_column_with_hash_bucket(
+          'language', hash_bucket_size=5)
+      country = feature_column_lib.sparse_column_with_hash_bucket(
+          'country', hash_bucket_size=5)
+      country_language = feature_column_lib.crossed_column(
+          [language, country], hash_bucket_size=10)
+      classifier = sdca_estimator.SDCALogisticClassifier(
+          example_id_column='example_id', feature_columns=[country_language])
+      classifier.fit(input_fn=input_fn, steps=10)
+      metrics = classifier.evaluate(input_fn=input_fn, steps=1)
+      self.assertGreater(metrics['accuracy'], 0.9)
 
   def testMixedFeatures(self):
     """Tests SDCALogisticClassifier with a mix of features."""
@@ -206,25 +215,35 @@ class SDCALogisticClassifierTest(test.TestCase):
               constant_op.constant([[3.0], [1.0], [1.0]])
       }, constant_op.constant([[1], [0], [1]])
 
-    price = feature_column_lib.real_valued_column('price')
-    sq_footage_bucket = feature_column_lib.bucketized_column(
-        feature_column_lib.real_valued_column('sq_footage'),
-        boundaries=[650.0, 800.0])
-    country = feature_column_lib.sparse_column_with_hash_bucket(
-        'country', hash_bucket_size=5)
-    sq_footage_country = feature_column_lib.crossed_column(
-        [sq_footage_bucket, country], hash_bucket_size=10)
-    classifier = sdca_estimator.SDCALogisticClassifier(
-        example_id_column='example_id',
-        feature_columns=[price, sq_footage_bucket, country, sq_footage_country],
-        weight_column_name='weights')
-    classifier.fit(input_fn=input_fn, steps=50)
-    metrics = classifier.evaluate(input_fn=input_fn, steps=1)
-    self.assertGreater(metrics['accuracy'], 0.9)
+    with self.test_session():
+      price = feature_column_lib.real_valued_column('price')
+      sq_footage_bucket = feature_column_lib.bucketized_column(
+          feature_column_lib.real_valued_column('sq_footage'),
+          boundaries=[650.0, 800.0])
+      country = feature_column_lib.sparse_column_with_hash_bucket(
+          'country', hash_bucket_size=5)
+      sq_footage_country = feature_column_lib.crossed_column(
+          [sq_footage_bucket, country], hash_bucket_size=10)
+      classifier = sdca_estimator.SDCALogisticClassifier(
+          example_id_column='example_id',
+          feature_columns=[
+              price, sq_footage_bucket, country, sq_footage_country
+          ],
+          weight_column_name='weights')
+      classifier.fit(input_fn=input_fn, steps=50)
+      metrics = classifier.evaluate(input_fn=input_fn, steps=1)
+      self.assertGreater(metrics['accuracy'], 0.9)
 
 
 class SDCALinearRegressorTest(test.TestCase):
 
+  def _single_threaded_test_session(self):
+    # TODO(andreasst): figure out why SDCALinearRegressor needs a single
+    # threaded session to pass in tsan mode but SDCALogisticClassifier does not.
+    config = config_pb2.ConfigProto(
+        inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
+    return self.test_session(config=config)
+
   def testRealValuedLinearFeatures(self):
     """Tests SDCALinearRegressor works with real valued features."""
     x = [[1.2, 2.0, -1.5], [-2.0, 3.0, -0.5], [1.0, -0.5, 4.0]]
@@ -238,18 +257,19 @@ class SDCALinearRegressorTest(test.TestCase):
           'weights': constant_op.constant([[10.0], [10.0], [10.0]])
       }, constant_op.constant(y)
 
-    x_column = feature_column_lib.real_valued_column('x', dimension=3)
-    regressor = sdca_estimator.SDCALinearRegressor(
-        example_id_column='example_id',
-        feature_columns=[x_column],
-        weight_column_name='weights')
-    regressor.fit(input_fn=input_fn, steps=20)
-    loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
-    self.assertLess(loss, 0.01)
-    self.assertIn('linear/x/weight', regressor.get_variable_names())
-    regressor_weights = regressor.get_variable_value('linear/x/weight')
-    self.assertAllClose(
-        [w[0] for w in weights], regressor_weights.flatten(), rtol=0.1)
+    with self._single_threaded_test_session():
+      x_column = feature_column_lib.real_valued_column('x', dimension=3)
+      regressor = sdca_estimator.SDCALinearRegressor(
+          example_id_column='example_id',
+          feature_columns=[x_column],
+          weight_column_name='weights')
+      regressor.fit(input_fn=input_fn, steps=20)
+      loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
+      self.assertLess(loss, 0.01)
+      self.assertIn('linear/x/weight', regressor.get_variable_names())
+      regressor_weights = regressor.get_variable_value('linear/x/weight')
+      self.assertAllClose(
+          [w[0] for w in weights], regressor_weights.flatten(), rtol=0.1)
 
   def testMixedFeaturesArbitraryWeights(self):
     """Tests SDCALinearRegressor works with a mix of features."""
@@ -271,22 +291,25 @@ class SDCALinearRegressorTest(test.TestCase):
               constant_op.constant([[3.0], [5.0], [7.0]])
       }, constant_op.constant([[1.55], [-1.25], [-3.0]])
 
-    price = feature_column_lib.real_valued_column('price')
-    sq_footage_bucket = feature_column_lib.bucketized_column(
-        feature_column_lib.real_valued_column('sq_footage'),
-        boundaries=[650.0, 800.0])
-    country = feature_column_lib.sparse_column_with_hash_bucket(
-        'country', hash_bucket_size=5)
-    sq_footage_country = feature_column_lib.crossed_column(
-        [sq_footage_bucket, country], hash_bucket_size=10)
-    regressor = sdca_estimator.SDCALinearRegressor(
-        example_id_column='example_id',
-        feature_columns=[price, sq_footage_bucket, country, sq_footage_country],
-        l2_regularization=1.0,
-        weight_column_name='weights')
-    regressor.fit(input_fn=input_fn, steps=20)
-    loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
-    self.assertLess(loss, 0.05)
+    with self._single_threaded_test_session():
+      price = feature_column_lib.real_valued_column('price')
+      sq_footage_bucket = feature_column_lib.bucketized_column(
+          feature_column_lib.real_valued_column('sq_footage'),
+          boundaries=[650.0, 800.0])
+      country = feature_column_lib.sparse_column_with_hash_bucket(
+          'country', hash_bucket_size=5)
+      sq_footage_country = feature_column_lib.crossed_column(
+          [sq_footage_bucket, country], hash_bucket_size=10)
+      regressor = sdca_estimator.SDCALinearRegressor(
+          example_id_column='example_id',
+          feature_columns=[
+              price, sq_footage_bucket, country, sq_footage_country
+          ],
+          l2_regularization=1.0,
+          weight_column_name='weights')
+      regressor.fit(input_fn=input_fn, steps=20)
+      loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
+      self.assertLess(loss, 0.05)
 
   def testSdcaOptimizerSparseFeaturesWithL1Reg(self):
     """SDCALinearRegressor works with sparse features and L1 regularization."""
@@ -306,56 +329,57 @@ class SDCALinearRegressorTest(test.TestCase):
               constant_op.constant([[10.0], [10.0], [10.0]])
       }, constant_op.constant([[1.4], [-0.8], [2.6]])
 
-    price = feature_column_lib.real_valued_column('price')
-    country = feature_column_lib.sparse_column_with_hash_bucket(
-        'country', hash_bucket_size=5)
-    # Regressor with no L1 regularization.
-    regressor = sdca_estimator.SDCALinearRegressor(
-        example_id_column='example_id',
-        feature_columns=[price, country],
-        weight_column_name='weights')
-    regressor.fit(input_fn=input_fn, steps=20)
-    no_l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
-    variable_names = regressor.get_variable_names()
-    self.assertIn('linear/price/weight', variable_names)
-    self.assertIn('linear/country/weights', variable_names)
-    no_l1_reg_weights = {
-        'linear/price/weight':
-            regressor.get_variable_value('linear/price/weight'),
-        'linear/country/weights':
-            regressor.get_variable_value('linear/country/weights'),
-    }
-
-    # Regressor with L1 regularization.
-    regressor = sdca_estimator.SDCALinearRegressor(
-        example_id_column='example_id',
-        feature_columns=[price, country],
-        l1_regularization=1.0,
-        weight_column_name='weights')
-    regressor.fit(input_fn=input_fn, steps=20)
-    l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
-    l1_reg_weights = {
-        'linear/price/weight':
-            regressor.get_variable_value('linear/price/weight'),
-        'linear/country/weights':
-            regressor.get_variable_value('linear/country/weights'),
-    }
-
-    # Unregularized loss is lower when there is no L1 regularization.
-    self.assertLess(no_l1_reg_loss, l1_reg_loss)
-    self.assertLess(no_l1_reg_loss, 0.05)
-
-    # But weights returned by the regressor with L1 regularization have smaller
-    # L1 norm.
-    l1_reg_weights_norm, no_l1_reg_weights_norm = 0.0, 0.0
-    for var_name in sorted(l1_reg_weights):
-      l1_reg_weights_norm += sum(
-          np.absolute(l1_reg_weights[var_name].flatten()))
-      no_l1_reg_weights_norm += sum(
-          np.absolute(no_l1_reg_weights[var_name].flatten()))
-      print('Var name: %s, value: %s' % (var_name,
-                                         no_l1_reg_weights[var_name].flatten()))
-    self.assertLess(l1_reg_weights_norm, no_l1_reg_weights_norm)
+    with self._single_threaded_test_session():
+      price = feature_column_lib.real_valued_column('price')
+      country = feature_column_lib.sparse_column_with_hash_bucket(
+          'country', hash_bucket_size=5)
+      # Regressor with no L1 regularization.
+      regressor = sdca_estimator.SDCALinearRegressor(
+          example_id_column='example_id',
+          feature_columns=[price, country],
+          weight_column_name='weights')
+      regressor.fit(input_fn=input_fn, steps=20)
+      no_l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
+      variable_names = regressor.get_variable_names()
+      self.assertIn('linear/price/weight', variable_names)
+      self.assertIn('linear/country/weights', variable_names)
+      no_l1_reg_weights = {
+          'linear/price/weight':
+              regressor.get_variable_value('linear/price/weight'),
+          'linear/country/weights':
+              regressor.get_variable_value('linear/country/weights'),
+      }
+
+      # Regressor with L1 regularization.
+      regressor = sdca_estimator.SDCALinearRegressor(
+          example_id_column='example_id',
+          feature_columns=[price, country],
+          l1_regularization=1.0,
+          weight_column_name='weights')
+      regressor.fit(input_fn=input_fn, steps=20)
+      l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
+      l1_reg_weights = {
+          'linear/price/weight':
+              regressor.get_variable_value('linear/price/weight'),
+          'linear/country/weights':
+              regressor.get_variable_value('linear/country/weights'),
+      }
+
+      # Unregularized loss is lower when there is no L1 regularization.
+      self.assertLess(no_l1_reg_loss, l1_reg_loss)
+      self.assertLess(no_l1_reg_loss, 0.05)
+
+      # But weights returned by the regressor with L1 regularization have
+      # smaller L1 norm.
+      l1_reg_weights_norm, no_l1_reg_weights_norm = 0.0, 0.0
+      for var_name in sorted(l1_reg_weights):
+        l1_reg_weights_norm += sum(
+            np.absolute(l1_reg_weights[var_name].flatten()))
+        no_l1_reg_weights_norm += sum(
+            np.absolute(no_l1_reg_weights[var_name].flatten()))
+        print('Var name: %s, value: %s' %
+              (var_name, no_l1_reg_weights[var_name].flatten()))
+      self.assertLess(l1_reg_weights_norm, no_l1_reg_weights_norm)
 
   def testBiasOnly(self):
     """Tests SDCALinearRegressor has a valid bias weight."""
@@ -380,12 +404,13 @@ class SDCALinearRegressorTest(test.TestCase):
       }, constant_op.constant([[1 if i % 4 is 0 else 0]
                                for i in range(num_examples)])
 
-    place_holder = feature_column_lib.real_valued_column('place_holder')
-    regressor = sdca_estimator.SDCALinearRegressor(
-        example_id_column='example_id', feature_columns=[place_holder])
-    regressor.fit(input_fn=input_fn, steps=100)
-    self.assertNear(
-        regressor.get_variable_value('linear/bias_weight')[0], 0.25, err=0.1)
+    with self._single_threaded_test_session():
+      place_holder = feature_column_lib.real_valued_column('place_holder')
+      regressor = sdca_estimator.SDCALinearRegressor(
+          example_id_column='example_id', feature_columns=[place_holder])
+      regressor.fit(input_fn=input_fn, steps=100)
+      self.assertNear(
+          regressor.get_variable_value('linear/bias_weight')[0], 0.25, err=0.1)
 
   def testBiasAndOtherColumns(self):
     """SDCALinearRegressor has valid bias weight with other columns present."""
@@ -426,26 +451,27 @@ class SDCALinearRegressorTest(test.TestCase):
            for x in [1, 0, 0, 1, 1, 0, 0, 0, 1, 0] * int(half / 10) +
            [0, 1, 0, 0, 0, 0, 0, 0, 1, 0] * int(half / 10)])
 
-    regressor = sdca_estimator.SDCALinearRegressor(
-        example_id_column='example_id',
-        feature_columns=[
-            feature_column_lib.real_valued_column('a'),
-            feature_column_lib.real_valued_column('b')
-        ])
-
-    regressor.fit(input_fn=input_fn, steps=200)
-
-    variable_names = regressor.get_variable_names()
-    self.assertIn('linear/bias_weight', variable_names)
-    self.assertIn('linear/a/weight', variable_names)
-    self.assertIn('linear/b/weight', variable_names)
-    # TODO(b/29339026): Change the expected results to expect a centered bias.
-    self.assertNear(
-        regressor.get_variable_value('linear/bias_weight')[0], 0.2, err=0.05)
-    self.assertNear(
-        regressor.get_variable_value('linear/a/weight')[0], 0.2, err=0.05)
-    self.assertNear(
-        regressor.get_variable_value('linear/b/weight')[0], 0.0, err=0.05)
+    with self._single_threaded_test_session():
+      regressor = sdca_estimator.SDCALinearRegressor(
+          example_id_column='example_id',
+          feature_columns=[
+              feature_column_lib.real_valued_column('a'),
+              feature_column_lib.real_valued_column('b')
+          ])
+
+      regressor.fit(input_fn=input_fn, steps=200)
+
+      variable_names = regressor.get_variable_names()
+      self.assertIn('linear/bias_weight', variable_names)
+      self.assertIn('linear/a/weight', variable_names)
+      self.assertIn('linear/b/weight', variable_names)
+      # TODO(b/29339026): Change the expected results to expect a centered bias.
+      self.assertNear(
+          regressor.get_variable_value('linear/bias_weight')[0], 0.2, err=0.05)
+      self.assertNear(
+          regressor.get_variable_value('linear/a/weight')[0], 0.2, err=0.05)
+      self.assertNear(
+          regressor.get_variable_value('linear/b/weight')[0], 0.0, err=0.05)
 
   def testBiasAndOtherColumnsFabricatedCentered(self):
     """SDCALinearRegressor has valid bias weight when instances are centered."""
@@ -476,25 +502,26 @@ class SDCALinearRegressorTest(test.TestCase):
       }, constant_op.constant([[1 if x % 10 == 0 else 0] for x in range(half)] +
                               [[-1 if x % 10 == 0 else 0] for x in range(half)])
 
-    regressor = sdca_estimator.SDCALinearRegressor(
-        example_id_column='example_id',
-        feature_columns=[
-            feature_column_lib.real_valued_column('a'),
-            feature_column_lib.real_valued_column('b')
-        ])
-
-    regressor.fit(input_fn=input_fn, steps=100)
-
-    variable_names = regressor.get_variable_names()
-    self.assertIn('linear/bias_weight', variable_names)
-    self.assertIn('linear/a/weight', variable_names)
-    self.assertIn('linear/b/weight', variable_names)
-    self.assertNear(
-        regressor.get_variable_value('linear/bias_weight')[0], 0.0, err=0.05)
-    self.assertNear(
-        regressor.get_variable_value('linear/a/weight')[0], 0.1, err=0.05)
-    self.assertNear(
-        regressor.get_variable_value('linear/b/weight')[0], -0.1, err=0.05)
+    with self._single_threaded_test_session():
+      regressor = sdca_estimator.SDCALinearRegressor(
+          example_id_column='example_id',
+          feature_columns=[
+              feature_column_lib.real_valued_column('a'),
+              feature_column_lib.real_valued_column('b')
+          ])
+
+      regressor.fit(input_fn=input_fn, steps=100)
+
+      variable_names = regressor.get_variable_names()
+      self.assertIn('linear/bias_weight', variable_names)
+      self.assertIn('linear/a/weight', variable_names)
+      self.assertIn('linear/b/weight', variable_names)
+      self.assertNear(
+          regressor.get_variable_value('linear/bias_weight')[0], 0.0, err=0.05)
+      self.assertNear(
+          regressor.get_variable_value('linear/a/weight')[0], 0.1, err=0.05)
+      self.assertNear(
+          regressor.get_variable_value('linear/b/weight')[0], -0.1, err=0.05)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD
index 1090cecab51352f1283bb4f85f30b41c2939360f..b8455477b0e39b54b6a5419ebd6ad41b2fc07912 100644
--- a/tensorflow/contrib/lookup/BUILD
+++ b/tensorflow/contrib/lookup/BUILD
@@ -18,14 +18,9 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:lookup_ops",
         "//tensorflow/python:lookup_ops_gen",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:string_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
     ],
@@ -42,11 +37,10 @@ py_test(
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:lookup_ops",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py
index f0f1c14fcaa945a5434a0daf838d4ccd374f013f..66caa6a2e5d17f74706965b7ca3f7928d63ae130 100644
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@@ -86,7 +86,7 @@ def index_table_from_tensor(mapping,
   Any lookup of an out-of-vocabulary token will return a bucket ID based on its
   hash if `num_oov_buckets` is greater than zero. Otherwise it is assigned the
   `default_value`.
-  The bucket ID range is `[mapping size, mapping size + num_oov_buckets]`.
+  The bucket ID range is `[mapping size, mapping size + num_oov_buckets - 1]`.
 
   The underlying table must be initialized by calling
   `tf.tables_initializer.run()` or `table.init.run()` once.
@@ -300,7 +300,7 @@ class MutableHashTable(LookupInterface):
                                              default_value=-1)
   table.insert(keys, values)
   out = table.lookup(query_keys)
-  print out.eval()
+  print(out.eval())
   ```
   """
 
@@ -502,7 +502,7 @@ class MutableDenseHashTable(LookupInterface):
                                                   empty_key=0)
   table.insert(keys, values)
   out = table.lookup(query_keys)
-  print out.eval()
+  print(out.eval())
   ```
   """
 
diff --git a/tensorflow/contrib/losses/BUILD b/tensorflow/contrib/losses/BUILD
index d9074e385a3851b2ff0f61326d44ea77dd3cf785..f75b0aa1b3e6606b0c92ae94b15b12781fe8b777 100644
--- a/tensorflow/contrib/losses/BUILD
+++ b/tensorflow/contrib/losses/BUILD
@@ -38,12 +38,11 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:random_seed",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
diff --git a/tensorflow/contrib/losses/README.md b/tensorflow/contrib/losses/README.md
index 7b73c4483a934c0f3aada9810ee6ec74288392c4..dcda898ed850b2b14c18b3ce1b28192d2324d344 100644
--- a/tensorflow/contrib/losses/README.md
+++ b/tensorflow/contrib/losses/README.md
@@ -1,7 +1,13 @@
 # TensorFlow contrib losses.
 
+## Deprecated
+
+This module is deprecated. Instructions for updating: Use tf.losses instead.
+
 ## losses
 
+Note: By default all the losses are collected into the GraphKeys.LOSSES collection.
+
 Loss operations for use in training models, typically with signature like the
 following:
 
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index f6d3601c7dc6002673a7d056313939bf99cbaa44..1d2477b8b794240bd348cec7f626be794181ffb4 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -301,7 +301,7 @@ def absolute_difference(predictions, labels=None, weights=1.0, scope=None):
 
 @deprecated("2016-12-30",
             "Use tf.losses.sigmoid_cross_entropy instead. Note that the order "
-            "of the predictions and labels arguments was changed.")
+            "of the predictions and labels arguments has been changed.")
 def sigmoid_cross_entropy(
     logits, multi_class_labels, weights=1.0, label_smoothing=0, scope=None):
   """Creates a cross-entropy loss using tf.nn.sigmoid_cross_entropy_with_logits.
@@ -436,7 +436,7 @@ def sparse_softmax_cross_entropy(logits, labels, weights=1.0, scope=None):
 
 @deprecated("2016-12-30",
             "Use tf.losses.log_loss instead. Note that the order of the "
-            "predictions and labels arguments was changed.")
+            "predictions and labels arguments has been changed.")
 def log_loss(predictions, labels=None, weights=1.0, epsilon=1e-7, scope=None):
   """Adds a Log Loss term to the training procedure.
 
@@ -477,7 +477,8 @@ def log_loss(predictions, labels=None, weights=1.0, epsilon=1e-7, scope=None):
 
 @deprecated("2016-12-30",
             "Use tf.losses.hinge_loss instead. Note that the order of the "
-            "predictions and labels arguments were changed.")
+            "logits and labels arguments has been changed, and to stay "
+            "unweighted, reduction=Reduction.NONE")
 def hinge_loss(logits, labels=None, scope=None):
   """Method that returns the loss tensor for hinge loss.
 
@@ -488,8 +489,8 @@ def hinge_loss(logits, labels=None, scope=None):
     scope: The scope for the operations performed in computing the loss.
 
   Returns:
-    A `Tensor` of same shape as `logits` and `labels` representing the loss
-      values across the batch.
+    An unweighted `Tensor` of same shape as `logits` and `labels` representing the
+      loss values across the batch.
 
   Raises:
     ValueError: If the shapes of `logits` and `labels` don't match.
@@ -541,7 +542,7 @@ def mean_squared_error(predictions, labels=None, weights=1.0, scope=None):
 
 @deprecated("2016-12-30",
             "Use tf.losses.mean_pairwise_squared_error instead. Note that the "
-            "order of the predictions and labels arguments was changed.")
+            "order of the predictions and labels arguments has been changed.")
 def mean_pairwise_squared_error(
     predictions, labels=None, weights=1.0, scope=None):
   """Adds a pairwise-errors-squared loss to the training procedure.
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 305ed0d11ec11ef24971a47f6b4d7f3bb25f82b2..2e45ddad54b14ac2b3c7807149ee6c7248524977 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -202,7 +202,7 @@ ifeq ($(TARGET),LINUX)
 endif
 # If we're cross-compiling for the Raspberry Pi, use the right gcc.
 ifeq ($(TARGET),PI)
-	CXXFLAGS += -D__ANDROID_TYPES_SLIM__
+	CXXFLAGS += -D__ANDROID_TYPES_SLIM__ -DRASPBERRY_PI
 	LDFLAGS := -Wl,--no-whole-archive
 	LIBS += -ldl -lpthread
 	LIBFLAGS += -Wl,--allow-multiple-definition -Wl,--whole-archive
@@ -279,6 +279,16 @@ ifeq ($(TARGET),ANDROID)
 		LIBS += -lhexagon_controller
 		LDFLAGS += -L$(HEXAGON_LIBS)
 		CXXFLAGS += -DUSE_HEXAGON_LIBS
+
+# CAVEAT: We should disable TENSORFLOW_DISABLE_META while running
+# quantized_matmul on Android because it crashes in
+# MultiThreadGemm in tensorflow/core/kernels/meta_support.cc
+# See http://b/33270149
+# TODO(satok): Remove once it's fixed
+		CXXFLAGS += -DTENSORFLOW_DISABLE_META
+
+# Declare __ANDROID_TYPES_FULL__ to enable required types for hvx
+		CXXFLAGS += -D__ANDROID_TYPES_FULL__
 	endif
 
 	ifdef ENABLE_EXPERIMENTAL_HEXNN_OPS
@@ -490,7 +500,7 @@ $(wildcard tensorflow/core/grappler/clusters/single_machine.*)
 # Filter out all the excluded files.
 TF_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
 # Add in any extra files that don't fit the patterns easily
-TF_CC_SRCS += tensorflow/core/common_runtime/gpu/gpu_tracer.cc
+TF_CC_SRCS += tensorflow/core/platform/default/gpu_tracer.cc
 # Also include the op and kernel definitions.
 TF_CC_SRCS += $(shell cat $(MAKEFILE_DIR)/tf_op_files.txt)
 PBT_CC_SRCS := $(shell cat $(MAKEFILE_DIR)/tf_pb_text_files.txt)
@@ -500,6 +510,18 @@ tensorflow/core/util/reporter.cc \
 tensorflow/tools/benchmark/benchmark_model.cc \
 tensorflow/tools/benchmark/benchmark_model_main.cc
 
+ifdef HEXAGON_LIBS
+	TF_CC_SRCS += \
+tensorflow/cc/framework/scope.cc \
+tensorflow/cc/framework/ops.cc \
+tensorflow/cc/ops/const_op.cc \
+tensorflow/core/kernels/hexagon/graph_transfer_utils.cc \
+tensorflow/core/kernels/hexagon/graph_transferer.cc \
+tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc \
+tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc \
+tensorflow/core/kernels/hexagon/hexagon_remote_fused_graph_executor_build.cc
+endif
+
 # File names of the intermediate files target compilation generates.
 TF_CC_OBJS := $(addprefix $(OBJDIR), $(TF_CC_SRCS:.cc=.o))
 PBT_GEN_FILES := $(addprefix $(PBTGENDIR), $(PBT_CC_SRCS))
diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index 9ba5c035a269e4a76a7f6214394c6577ed6a6471..0306ecb21420ffd742276986b53ec09a482b4ba4 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -130,7 +130,7 @@ For more details, see the [benchmark documentation](../../tools/benchmark).
 ## iOS
 
 _Note: To use this library in an iOS application, see related instructions in
-the [iOS examples](../ios_examples/) directory._
+the [iOS examples](../../examples/ios/) directory._
 
 Install XCode 7.3 or more recent. If you have not already, you will need to
 install the command-line tools using `xcode-select`:
@@ -176,7 +176,7 @@ benchmark program. Although successfully compiling the benchmark program is a
 sign of success, the program is not a complete iOS app.
 
 To see TensorFlow running on iOS, the example Xcode project in
-[tensorflow/contrib/ios_examples](../ios_examples) shows how to use the static
+[tensorflow/examples/ios](../../examples/ios/) shows how to use the static
 library in a simple app.
 
 ### Building by hand
@@ -214,7 +214,7 @@ benchmark program. Although successfully compiling the benchmark program is a
 sign of success, the program is not a complete iOS app. 
 
 To see TensorFlow running on iOS, the example Xcode project in
-[tensorflow/contrib/ios_examples](../ios_examples) shows how to use the static
+[tensorflow/examples/ios](../../examples/ios/) shows how to use the static
 library in a simple app.
 
 #### Universal binaries
diff --git a/tensorflow/contrib/makefile/build_all_android.sh b/tensorflow/contrib/makefile/build_all_android.sh
index 161f2df5b27044971c6fd7e13c321c95e0ab4d02..7f0c3f38c2da4c53dd7be6ce58102edbccce2876 100755
--- a/tensorflow/contrib/makefile/build_all_android.sh
+++ b/tensorflow/contrib/makefile/build_all_android.sh
@@ -82,7 +82,7 @@ fi
 if [[ "${USE_HEXAGON}" == "true" ]]; then
     HEXAGON_PARENT_DIR=$(cd "${HEXAGON_DOWNLOAD_PATH}" >/dev/null && pwd)
     HEXAGON_LIBS="${HEXAGON_PARENT_DIR}/libs"
-    HEXAGON_INCLUDE=$(cd "tensorflow/core/platform/hexagon" >/dev/null && pwd)
+    HEXAGON_INCLUDE=$(cd "tensorflow/core/kernels/hexagon" >/dev/null && pwd)
 fi
 
 if [[ "${ENABLE_EXPERIMENTAL_HEXNN_OPS}" == "true" ]]; then
diff --git a/tensorflow/contrib/makefile/build_all_ios.sh b/tensorflow/contrib/makefile/build_all_ios.sh
index 344bf49dcf0423f79c0cfc35ca7bb65e7d8565f8..a0f565285484153706a9683bcd42040f704aa0d8 100755
--- a/tensorflow/contrib/makefile/build_all_ios.sh
+++ b/tensorflow/contrib/makefile/build_all_ios.sh
@@ -18,7 +18,7 @@ set -e
 
 # Make sure we're on OS X.
 if [[ $(uname) != "Darwin" ]]; then
-    echo "ERROR: This makefile build requires OS X, which the current system "\
+    echo "ERROR: This makefile build requires macOS, which the current system "\
     "is not."
     exit 1
 fi
@@ -37,7 +37,9 @@ rm -rf tensorflow/contrib/makefile/downloads
 #
 #    ld: -bind_at_load and -bitcode_bundle (Xcode setting ENABLE_BITCODE=YES) cannot be used together
 #
-export MACOSX_DEPLOYMENT_TARGET="10.10"
+if [[ -n MACOSX_DEPLOYMENT_TARGET ]]; then
+    export MACOSX_DEPLOYMENT_TARGET=$(sw_vers -productVersion)
+fi
 
 # Pull down the required versions of the frameworks we need.
 tensorflow/contrib/makefile/download_dependencies.sh
@@ -48,6 +50,5 @@ tensorflow/contrib/makefile/compile_ios_protobuf.sh
 # Build the iOS TensorFlow libraries.
 tensorflow/contrib/makefile/compile_ios_tensorflow.sh "-O3"
 
-# Creates a static universal library in 
+# Creates a static universal library in
 # tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a
-
diff --git a/tensorflow/contrib/makefile/compile_ios_protobuf.sh b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
index e8b9454e7e6e808623f496ffb56eb9f0190b5677..4056db18a76fc8a58240d9116b19cd8b68c1ee45 100755
--- a/tensorflow/contrib/makefile/compile_ios_protobuf.sh
+++ b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
@@ -15,9 +15,12 @@
 # ==============================================================================
 # Builds protobuf 3 for iOS.
 
-set -x
 set -e
 
+if [[ -n MACOSX_DEPLOYMENT_TARGET ]]; then
+    export MACOSX_DEPLOYMENT_TARGET=$(sw_vers -productVersion)
+fi
+
 SCRIPT_DIR=$(dirname $0)
 source "${SCRIPT_DIR}/build_helper.subr"
 
diff --git a/tensorflow/contrib/makefile/compile_ios_tensorflow.sh b/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
index bcf097b3031fc1ab1a20b77948ae662781fe8043..5d1cc8b375b99d97603c5d7dff78a5ac4eef751b 100755
--- a/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
+++ b/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
@@ -31,6 +31,10 @@ function less_than_required_version() {
   )
 }
 
+if [[ -n MACOSX_DEPLOYMENT_TARGET ]]; then
+    export MACOSX_DEPLOYMENT_TARGET=$(sw_vers -productVersion)
+fi
+
 ACTUAL_XCODE_VERSION=$(xcodebuild -version | head -n 1 | sed 's/Xcode //')
 REQUIRED_XCODE_VERSION=7.3.0
 if less_than_required_version $ACTUAL_XCODE_VERSION 7 3 0
@@ -44,7 +48,7 @@ LIBDIR=${GENDIR}lib
 LIB_PREFIX=libtensorflow-core
 
 make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
-TARGET=IOS IOS_ARCH=ARMV7 LIB_NAME=${LIB_PREFIX}-armv7.a OPTFLAGS="$1" 
+TARGET=IOS IOS_ARCH=ARMV7 LIB_NAME=${LIB_PREFIX}-armv7.a OPTFLAGS="$1"
 if [ $? -ne 0 ]
 then
   echo "armv7 compilation failed."
diff --git a/tensorflow/contrib/makefile/create_ios_frameworks.sh b/tensorflow/contrib/makefile/create_ios_frameworks.sh
index 2ad095b3971f59b72250ffb06a35f4708337d328..2bbde6aa8850eacaac0d5ec77b7a01e4bd6bd47a 100644
--- a/tensorflow/contrib/makefile/create_ios_frameworks.sh
+++ b/tensorflow/contrib/makefile/create_ios_frameworks.sh
@@ -79,7 +79,7 @@ cd $SCRIPT_DIR/gen/proto
 tar cf $FW_DIR_TFCORE_HDRS/tmp.tar tensorflow
 cd $FW_DIR_TFCORE_HDRS
 tar xf tmp.tar
-# Dont include the auto downloaded/generated to build this library
+# Don't include the auto downloaded/generated to build this library
 rm -rf tensorflow/contrib/makefile
 rm -f tmp.tar
 
diff --git a/tensorflow/contrib/makefile/proto_text_cc_files.txt b/tensorflow/contrib/makefile/proto_text_cc_files.txt
index 67bc8f8ada3d92b6458fea73951f7043c14c973c..d56e388477db6239cfb577f7e2754321ff33bd82 100644
--- a/tensorflow/contrib/makefile/proto_text_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_cc_files.txt
@@ -1,5 +1,6 @@
 tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc
 tensorflow/tools/proto_text/gen_proto_text_functions.cc
+tensorflow/core/framework/resource_handle.cc
 tensorflow/core/platform/default/protobuf.cc
 tensorflow/core/platform/tracing.cc
 tensorflow/core/platform/tensor_coding.cc
diff --git a/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in b/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in
index 2a6f66edcb72f10fe44ff1b8351bedca6a72d52e..9aa81144fd23cf5b1fb0d70fbedfd5a96afcedae 100644
--- a/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in
+++ b/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in
@@ -34,27 +34,7 @@ $(wildcard $(GTEST_DIR)/src/*.cc) \
 $(wildcard $(GTEST_DIR)/src/*.h) \
 $(GTEST_HEADERS)
 
-# CAVEAT: We should disable TENSORFLOW_DISABLE_META while running
-# quantized_matmul on Android because it crashes in
-# MultiThreadGemm in tensorflow/core/kernels/meta_support.cc
-# TODO(satok): Remove once it's fixed
-CXXFLAGS += -DTENSORFLOW_DISABLE_META
-
-# Declare __ANDROID_TYPES_FULL__ to enable required types for hvx
-CXXFLAGS += -D__ANDROID_TYPES_FULL__
-
 GRAPH_TRANSFER_SRCS := \
-tensorflow/cc/framework/scope.cc \
-tensorflow/cc/framework/ops.cc \
-tensorflow/cc/ops/const_op.cc \
-tensorflow/core/kernels/hexagon/graph_transfer_utils.cc \
-tensorflow/core/kernels/hexagon/graph_transferer.cc \
-tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc \
-tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc \
-tensorflow/core/kernels/hexagon/hexagon_remote_fused_graph_executor_build.cc \
-tensorflow/core/kernels/remote_fused_graph_execute_op.cc \
-tensorflow/core/kernels/remote_fused_graph_execute_utils.cc \
-tensorflow/core/ops/remote_fused_graph_ops.cc \
 tensorflow/core/platform/posix/test.cc
 
 GRAPH_EXECUTION_SRCS := \
diff --git a/tensorflow/contrib/makefile/sub_makefiles/quantization/Makefile.in b/tensorflow/contrib/makefile/sub_makefiles/quantization/Makefile.in
index 6ba41d5d12a3f5243b797b253fed46aceae9ba9c..362ccedfc2f2c227ba26591d566e39f27df40e23 100644
--- a/tensorflow/contrib/makefile/sub_makefiles/quantization/Makefile.in
+++ b/tensorflow/contrib/makefile/sub_makefiles/quantization/Makefile.in
@@ -56,7 +56,6 @@ tensorflow/core/platform/posix/test.cc
 
 QUANTIZATION_TEST_SRCS := \
 $(GRAPH_TRANSFER_SRCS) \
-tensorflow/core/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc \
 tensorflow/core/kernels/hexagon/graph_transferer_test.cc \
 tensorflow/contrib/makefile/test/test_main.cc
 
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 857d6fa21bc3ec19837fb458e9d52b6490e72f05..f1f6144acd4f09492baf2d1e4825b24f4f63f566 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -1,3 +1,10 @@
+tensorflow/contrib/boosted_trees/ops/ensemble_optimizer_ops.cc
+tensorflow/contrib/boosted_trees/ops/model_ops.cc
+tensorflow/contrib/boosted_trees/ops/prediction_ops.cc
+tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
+tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
+tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc
+tensorflow/contrib/boosted_trees/ops/training_ops.cc
 tensorflow/core/kernels/xent_op.cc
 tensorflow/core/kernels/where_op.cc
 tensorflow/core/kernels/variable_ops.cc
@@ -7,6 +14,7 @@ tensorflow/core/kernels/transpose_functor_cpu.cc
 tensorflow/core/kernels/training_op_helpers.cc
 tensorflow/core/kernels/training_ops.cc
 tensorflow/core/kernels/topk_op.cc
+tensorflow/core/kernels/tile_functor_cpu.cc
 tensorflow/core/kernels/tile_ops.cc
 tensorflow/core/kernels/tile_ops_cpu_impl_1.cc
 tensorflow/core/kernels/tile_ops_cpu_impl_2.cc
@@ -135,9 +143,12 @@ tensorflow/core/kernels/cwise_op_minimum.cc
 tensorflow/core/kernels/cwise_op_maximum.cc
 tensorflow/core/kernels/cwise_op_logical_not.cc
 tensorflow/core/kernels/cwise_op_logical_and.cc
+tensorflow/core/kernels/cwise_op_logical_or.cc
 tensorflow/core/kernels/cwise_op_log.cc
 tensorflow/core/kernels/cwise_op_less.cc
+tensorflow/core/kernels/cwise_op_less_equal.cc
 tensorflow/core/kernels/cwise_op_isfinite.cc
+tensorflow/core/kernels/cwise_op_invert.cc
 tensorflow/core/kernels/cwise_op_greater_equal.cc
 tensorflow/core/kernels/cwise_op_greater.cc
 tensorflow/core/kernels/cwise_op_floor_div.cc
@@ -146,6 +157,9 @@ tensorflow/core/kernels/cwise_op_exp.cc
 tensorflow/core/kernels/cwise_op_equal_to_2.cc
 tensorflow/core/kernels/cwise_op_equal_to_1.cc
 tensorflow/core/kernels/cwise_op_div.cc
+tensorflow/core/kernels/cwise_op_bitwise_xor.cc
+tensorflow/core/kernels/cwise_op_bitwise_or.cc
+tensorflow/core/kernels/cwise_op_bitwise_and.cc
 tensorflow/core/kernels/cwise_op_add_2.cc
 tensorflow/core/kernels/cwise_op_add_1.cc
 tensorflow/core/kernels/cwise_op_abs.cc
@@ -185,6 +199,7 @@ tensorflow/core/kernels/aggregate_ops.cc
 tensorflow/core/kernels/depthwise_conv_op.cc
 tensorflow/core/kernels/dequantize_op.cc
 tensorflow/core/kernels/meta_support.cc
+tensorflow/core/kernels/population_count_op.cc
 tensorflow/core/kernels/quantization_utils.cc
 tensorflow/core/kernels/quantize_down_and_shrink_range.cc
 tensorflow/core/kernels/quantize_op.cc
@@ -202,12 +217,16 @@ tensorflow/core/kernels/quantized_reshape_op.cc
 tensorflow/core/kernels/quantized_resize_bilinear_op.cc
 tensorflow/core/kernels/requantization_range_op.cc
 tensorflow/core/kernels/requantize.cc
+tensorflow/core/kernels/remote_fused_graph_execute_op.cc
+tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
+tensorflow/core/kernels/batch_matmul_op_real.cc
 tensorflow/core/ops/training_ops.cc
 tensorflow/core/ops/string_ops.cc
 tensorflow/core/ops/state_ops.cc
 tensorflow/core/ops/sparse_ops.cc
 tensorflow/core/ops/sendrecv_ops.cc
 tensorflow/core/ops/script_ops.cc
+tensorflow/core/ops/remote_fused_graph_ops.cc
 tensorflow/core/ops/random_ops.cc
 tensorflow/core/ops/random_grad.cc
 tensorflow/core/ops/parsing_ops.cc
diff --git a/tensorflow/contrib/makefile/tf_proto_files.txt b/tensorflow/contrib/makefile/tf_proto_files.txt
index 36d9cb74a704172a44e77952d021cab671806b03..a1a9aa7190205d9f3c34ef01b65db85f89f2ac85 100644
--- a/tensorflow/contrib/makefile/tf_proto_files.txt
+++ b/tensorflow/contrib/makefile/tf_proto_files.txt
@@ -1,3 +1,7 @@
+tensorflow/contrib/boosted_trees/proto/learner.proto
+tensorflow/contrib/boosted_trees/proto/quantiles.proto
+tensorflow/contrib/boosted_trees/proto/split_info.proto
+tensorflow/contrib/boosted_trees/proto/tree_config.proto
 tensorflow/core/util/test_log.proto
 tensorflow/core/util/saved_tensor_slice.proto
 tensorflow/core/util/memmapped_file_system.proto
diff --git a/tensorflow/contrib/metrics/BUILD b/tensorflow/contrib/metrics/BUILD
index 8b792a0f6851173ef281b0d2910a31c103186c65..c17bf60606af12e6139215ada3423833433cb35c 100644
--- a/tensorflow/contrib/metrics/BUILD
+++ b/tensorflow/contrib/metrics/BUILD
@@ -31,7 +31,6 @@ py_library(
         "//tensorflow/python:check_ops",
         "//tensorflow/python:confusion_matrix",
         "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:histogram_ops",
         "//tensorflow/python:init_ops",
@@ -40,11 +39,9 @@ py_library(
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:sets",
-        "//tensorflow/python:sparse_ops",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
     ],
 )
 
@@ -58,9 +55,6 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -74,8 +68,6 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
     ],
@@ -86,6 +78,7 @@ py_test(
     srcs = ["python/ops/metric_ops_test.py"],
     shard_count = 3,
     srcs_version = "PY2AND3",
+    tags = ["noasan"],  # times out b/63678675
     deps = [
         ":metrics_py",
         "//tensorflow/python:array_ops",
@@ -98,6 +91,7 @@ py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
     ],
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 6c773d9a7f26bebd918c7ffa9ef945fbb5dc2ae8..feb2b36d5e7c1bf4f0867f890ab47c205f478d96 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -1972,9 +1972,17 @@ def streaming_covariance(predictions,
                       (prev_count * batch_count / update_count))
     update_comoment = state_ops.assign_add(comoment, delta_comoment)
 
-    covariance = _safe_div(comoment, count - 1, 'covariance')
+    covariance = array_ops.where(
+        math_ops.less_equal(count, 1.),
+        float('nan'),
+        math_ops.truediv(comoment, count - 1),
+        name='covariance')
     with ops.control_dependencies([update_comoment]):
-      update_op = _safe_div(comoment, count - 1, 'update_op')
+      update_op = array_ops.where(
+          math_ops.less_equal(count, 1.),
+          float('nan'),
+          math_ops.truediv(comoment, count - 1),
+          name='update_op')
 
   if metrics_collections:
     ops.add_to_collections(metrics_collections, covariance)
@@ -2051,16 +2059,16 @@ def streaming_pearson_correlation(predictions,
     var_labels, update_var_labels = streaming_covariance(
         labels, labels, weights=weights, name='variance_labels')
 
-    pearson_r = _safe_div(
+    pearson_r = math_ops.truediv(
         cov,
         math_ops.multiply(math_ops.sqrt(var_predictions),
                           math_ops.sqrt(var_labels)),
-        'pearson_r')
-    update_op = _safe_div(
+        name='pearson_r')
+    update_op = math_ops.truediv(
         update_cov,
         math_ops.multiply(math_ops.sqrt(update_var_predictions),
                           math_ops.sqrt(update_var_labels)),
-        'update_op')
+        name='update_op')
 
   if metrics_collections:
     ops.add_to_collections(metrics_collections, pearson_r)
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index 6496cecfbd0f505b163c1b16dfdcff07502da12f..00cde08bff1157dd44adad3e1bdeff674fb0a444 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -4183,11 +4183,6 @@ class StreamingRootMeanSquaredErrorTest(test.TestCase):
       self.assertAlmostEqual(math.sqrt(13), rmse.eval(), 5)
 
 
-def _reweight(predictions, labels, weights):
-  return (np.concatenate([[p] * int(w) for p, w in zip(predictions, weights)]),
-          np.concatenate([[l] * int(w) for l, w in zip(labels, weights)]))
-
-
 class StreamingCovarianceTest(test.TestCase):
 
   def setUp(self):
@@ -4277,8 +4272,9 @@ class StreamingCovarianceTest(test.TestCase):
       cov, update_op = metrics.streaming_covariance(
           predictions, labels, weights=weights)
 
-      p, l = _reweight([2, 4, 6, 8], [1, 3, 2, 7], [0, 1, 3, 1])
-      expected_cov = np.cov(p, l)[0, 1]
+      expected_cov = np.cov([2, 4, 6, 8],
+                            [1, 3, 2, 7],
+                            fweights=[0, 1, 3, 1])[0, 1]
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(expected_cov, sess.run(update_op))
       self.assertAlmostEqual(expected_cov, cov.eval())
@@ -4297,14 +4293,17 @@ class StreamingCovarianceTest(test.TestCase):
       cov, update_op = metrics.streaming_covariance(predictions_t, labels_t)
 
       sess.run(variables.local_variables_initializer())
-      prev_expected_cov = 0.
+      prev_expected_cov = NAN
       for i in range(n // stride):
         feed_dict = {
             predictions_t: predictions[stride * i:stride * (i + 1)],
             labels_t: labels[stride * i:stride * (i + 1)]
         }
-        self.assertAlmostEqual(
-            prev_expected_cov, sess.run(cov, feed_dict=feed_dict), 5)
+        self.assertEqual(np.isnan(prev_expected_cov),
+                         np.isnan(sess.run(cov, feed_dict=feed_dict)))
+        if not np.isnan(prev_expected_cov):
+          self.assertAlmostEqual(
+              prev_expected_cov, sess.run(cov, feed_dict=feed_dict), 5)
         expected_cov = np.cov(predictions[:stride * (i + 1)],
                               labels[:stride * (i + 1)])[0, 1]
         self.assertAlmostEqual(
@@ -4331,18 +4330,21 @@ class StreamingCovarianceTest(test.TestCase):
           predictions_t, labels_t, weights=weights_t)
 
       sess.run(variables.local_variables_initializer())
-      prev_expected_cov = 0.
+      prev_expected_cov = NAN
       for i in range(n // stride):
         feed_dict = {
             predictions_t: predictions[stride * i:stride * (i + 1)],
             labels_t: labels[stride * i:stride * (i + 1)],
             weights_t: weights[stride * i:stride * (i + 1)]
         }
-        self.assertAlmostEqual(
-            prev_expected_cov, sess.run(cov, feed_dict=feed_dict), 5)
-        p, l = _reweight(predictions[:stride * (i + 1)],
-                         labels[:stride * (i + 1)], weights[:stride * (i + 1)])
-        expected_cov = np.cov(p, l)[0, 1]
+        self.assertEqual(np.isnan(prev_expected_cov),
+                         np.isnan(sess.run(cov, feed_dict=feed_dict)))
+        if not np.isnan(prev_expected_cov):
+          self.assertAlmostEqual(
+              prev_expected_cov, sess.run(cov, feed_dict=feed_dict), 5)
+        expected_cov = np.cov(predictions[:stride * (i + 1)],
+                              labels[:stride * (i + 1)],
+                              fweights=weights[:stride * (i + 1)])[0, 1]
         self.assertAlmostEqual(
             expected_cov, sess.run(update_op, feed_dict=feed_dict), 5)
         self.assertAlmostEqual(
@@ -4453,8 +4455,7 @@ class StreamingPearsonRTest(test.TestCase):
       pearson_r, update_op = metrics.streaming_pearson_correlation(
           predictions_t, labels_t, weights=weights_t)
 
-      p, l = _reweight(predictions, labels, weights)
-      cmat = np.cov(p, l)
+      cmat = np.cov(predictions, labels, fweights=weights)
       expected_r = cmat[0, 1] / np.sqrt(cmat[0, 0] * cmat[1, 1])
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(expected_r, sess.run(update_op))
@@ -4475,14 +4476,17 @@ class StreamingPearsonRTest(test.TestCase):
           predictions_t, labels_t)
 
       sess.run(variables.local_variables_initializer())
-      prev_expected_r = 0.
+      prev_expected_r = NAN
       for i in range(n // stride):
         feed_dict = {
             predictions_t: predictions[stride * i:stride * (i + 1)],
             labels_t: labels[stride * i:stride * (i + 1)]
         }
-        self.assertAlmostEqual(
-            prev_expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
+        self.assertEqual(np.isnan(prev_expected_r),
+                         np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
+        if not np.isnan(prev_expected_r):
+          self.assertAlmostEqual(
+              prev_expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
         expected_r = np.corrcoef(predictions[:stride * (i + 1)],
                                  labels[:stride * (i + 1)])[0, 1]
         self.assertAlmostEqual(
@@ -4509,18 +4513,21 @@ class StreamingPearsonRTest(test.TestCase):
           predictions_t, labels_t, weights=weights_t)
 
       sess.run(variables.local_variables_initializer())
-      prev_expected_r = 0.
+      prev_expected_r = NAN
       for i in range(n // stride):
         feed_dict = {
             predictions_t: predictions[stride * i:stride * (i + 1)],
             labels_t: labels[stride * i:stride * (i + 1)],
             weights_t: weights[stride * i:stride * (i + 1)]
         }
-        self.assertAlmostEqual(
-            prev_expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
-        p, l = _reweight(predictions[:stride * (i + 1)],
-                         labels[:stride * (i + 1)], weights[:stride * (i + 1)])
-        cmat = np.cov(p, l)
+        self.assertEqual(np.isnan(prev_expected_r),
+                         np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
+        if not np.isnan(prev_expected_r):
+          self.assertAlmostEqual(
+              prev_expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
+        cmat = np.cov(predictions[:stride * (i + 1)],
+                      labels[:stride * (i + 1)],
+                      fweights=weights[:stride * (i + 1)])
         expected_r = cmat[0, 1] / np.sqrt(cmat[0, 0] * cmat[1, 1])
         self.assertAlmostEqual(
             expected_r, sess.run(update_op, feed_dict=feed_dict), 5)
@@ -4528,6 +4535,47 @@ class StreamingPearsonRTest(test.TestCase):
             expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
         prev_expected_r = expected_r
 
+  def testMultiUpdateWithErrorAndSingletonBatches(self):
+    with self.test_session() as sess:
+      np.random.seed(123)
+      n = 100
+      predictions = np.random.randn(n)
+      labels = 0.5 * predictions + np.random.randn(n)
+      stride = 10
+      weights = (np.arange(n).reshape(n//stride, stride) % stride == 0)
+      for row in weights:
+        np.random.shuffle(row)
+      # Now, weights is one-hot by row - one item per batch has non-zero weight.
+      weights = weights.reshape((n,))
+
+      predictions_t = array_ops.placeholder(dtypes_lib.float32, [stride])
+      labels_t = array_ops.placeholder(dtypes_lib.float32, [stride])
+      weights_t = array_ops.placeholder(dtypes_lib.float32, [stride])
+
+      pearson_r, update_op = metrics.streaming_pearson_correlation(
+          predictions_t, labels_t, weights=weights_t)
+
+      sess.run(variables.local_variables_initializer())
+      for i in range(n // stride):
+        feed_dict = {
+            predictions_t: predictions[stride * i:stride * (i + 1)],
+            labels_t: labels[stride * i:stride * (i + 1)],
+            weights_t: weights[stride * i:stride * (i + 1)]
+        }
+        cmat = np.cov(predictions[:stride * (i + 1)],
+                      labels[:stride * (i + 1)],
+                      fweights=weights[:stride * (i + 1)])
+        expected_r = cmat[0, 1] / np.sqrt(cmat[0, 0] * cmat[1, 1])
+        actual_r = sess.run(update_op, feed_dict=feed_dict)
+        self.assertEqual(np.isnan(expected_r), np.isnan(actual_r))
+        self.assertEqual(np.isnan(expected_r),
+                         np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
+        if not np.isnan(expected_r):
+          self.assertAlmostEqual(
+              expected_r, actual_r, 5)
+          self.assertAlmostEqual(
+              expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
+
 
 class StreamingMeanCosineDistanceTest(test.TestCase):
 
diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
index e97e8d01638abd0e21ee697d2f54d6dfa7da6dcd..1a2563d20fdc33d3c5e4a85561b61d04d3eeabff 100644
--- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
+++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
@@ -44,7 +44,8 @@ MPIRendezvousMgr::MPIRendezvousMgr(const WorkerEnv* env)
 
   // extract worker-name
   auto parsed = env->local_devices[0]->parsed_name();
-  const std::string task_id = strings::StrCat(parsed.job, ":", parsed.replica);
+  const std::string task_id =
+      strings::StrCat(parsed.job, ":", parsed.replica, ":", parsed.task);
 
   mpiutils_ = new MPIUtils(task_id);
   background_thread_ =
@@ -66,8 +67,8 @@ void MPIRemoteRendezvous::RecvFromRemoteAsync(
   VLOG(2) << "MPI User requested " << parsed.FullKey()
           << " @ step: " << step_id_;
 
-  std::string src_task =
-      strings::StrCat(parsed.src.job, ":", parsed.src.replica);
+  std::string src_task = strings::StrCat(
+      parsed.src.job, ":", parsed.src.replica, ":", parsed.src.task);
   const int dst = mpiutils_->GetSourceID(src_task);
 
   Device* dst_device;
@@ -138,11 +139,7 @@ void MPIRemoteRendezvous::RecvFromRemoteAsync(
                     std::move(request_call), rendezvous_call);
 }
 
-MPIRemoteRendezvous::~MPIRemoteRendezvous() {
-  MPIRendezvousMgr* mgr =
-      reinterpret_cast<MPIRendezvousMgr*>(this->rendezvous_mgr_);
-  mgr->RemoveStepID(step_id_);
-}
+MPIRemoteRendezvous::~MPIRemoteRendezvous() {}
 
 /*
  * Add the request for one of our Tensors by a remote process
diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
index 50fc3804967c5889fcc211aff7ed96d03a517fba..b15748d63c9fdbc5134069b63fd998e46c499e16 100644
--- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
+++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
@@ -114,7 +114,7 @@ class MPIRemoteRendezvous : public BaseRemoteRendezvous {
  public:
   MPIRemoteRendezvous(const WorkerEnv* env, int64 step_id, const MPIUtils* util,
                       BaseRendezvousMgr* mgr_)
-      : BaseRemoteRendezvous(env, step_id, false),
+      : BaseRemoteRendezvous(env, step_id),
         mpiutils_(util),
         rendezvous_mgr_(mgr_) {}
 
@@ -147,15 +147,8 @@ class MPIRendezvousMgr : public BaseRendezvousMgr {
                     MPIRequestTensorCall* rCall) {
     mutex_lock l(mrq_);
     request_queue_.push(RequestQueueEntry(key, std::move(request_call)));
-    recv_tensor_map_[step_id][key] =
-        std::shared_ptr<MPIRequestTensorCall>(rCall);
-  }
-
-  void RemoveStepID(const int64 step_id) {
-    mutex_lock l(mrq_);
-    CHECK(recv_tensor_map_[step_id].size() == 0) << "Removing unfinished step";
-    recv_tensor_map_.erase(step_id);
-    // TODO(jbedorf) Should we verify that the step_id is clear before remove?
+    const std::string key_id = strings::StrCat(key, "_", step_id);
+    recv_tensor_map_[key_id] = std::shared_ptr<MPIRequestTensorCall>(rCall);
   }
 
  protected:
@@ -181,9 +174,8 @@ class MPIRendezvousMgr : public BaseRendezvousMgr {
 
   std::queue<SendQueueEntry> send_queue_ GUARDED_BY(msq_);
   std::queue<RequestQueueEntry> request_queue_ GUARDED_BY(mrq_);
-  std::map<int64, std::unordered_map<std::string,
-                                     std::shared_ptr<MPIRequestTensorCall>>>
-      recv_tensor_map_ GUARDED_BY(mrq_);
+  std::map<std::string, std::shared_ptr<MPIRequestTensorCall>> recv_tensor_map_
+      GUARDED_BY(mrq_);
 
   void AddRequest(RecvTensorRequest, const int);
   void MPIBackgroundThread();
@@ -196,22 +188,19 @@ class MPIRendezvousMgr : public BaseRendezvousMgr {
   void GetRecvCall(const int64 step_id, const std::string& key,
                    std::shared_ptr<MPIRequestTensorCall>* call) {
     mutex_lock l(mrq_);
-    if (recv_tensor_map_.find(step_id) == recv_tensor_map_.end()) {
-      LOG(FATAL) << "Step not found in recv_tensor_map_, step: " << step_id
-                 << " key:  " << key << std::endl;
-    }
-    if (recv_tensor_map_[step_id].find(key) !=
-        recv_tensor_map_[step_id].end()) {
-      *call = recv_tensor_map_[step_id][key];
-    } else {
-      LOG(FATAL) << "Key not found in recv_tensor_map_, step: " << step_id
+
+    const std::string key_id = strings::StrCat(key, "_", step_id);
+    if (recv_tensor_map_.find(key_id) == recv_tensor_map_.end()) {
+      LOG(FATAL) << "Key/step not found in recv_tensor_map_, step: " << step_id
                  << " key:  " << key << std::endl;
     }
+    *call = recv_tensor_map_[key_id];
   }
 
   void RemoveRecvCall(const int64 step_id, const std::string& key) {
     mutex_lock l(mrq_);
-    recv_tensor_map_[step_id].erase(key);
+    const std::string key_id = strings::StrCat(key, "_", step_id);
+    recv_tensor_map_.erase(key_id);
   }
 
   bool GetRequest(RequestQueueEntry* req) {
diff --git a/tensorflow/contrib/ndlstm/BUILD b/tensorflow/contrib/ndlstm/BUILD
index 73a4ad2e70fabbace1cdc9640419ca37f62d6792..ba23c70dc0a3729e89ae76ae65dfdc9c6524ee22 100644
--- a/tensorflow/contrib/ndlstm/BUILD
+++ b/tensorflow/contrib/ndlstm/BUILD
@@ -33,6 +33,7 @@ py_library(
         "//tensorflow/python:platform",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:rnn",
+        "//tensorflow/python:rnn_cell",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
diff --git a/tensorflow/contrib/ndlstm/python/lstm2d.py b/tensorflow/contrib/ndlstm/python/lstm2d.py
index da9698ec5b3e43f7f2a11eba0cf4721a21d49fcc..ebbb4ccf11b219e86578d05e99a7a02ebe08271e 100644
--- a/tensorflow/contrib/ndlstm/python/lstm2d.py
+++ b/tensorflow/contrib/ndlstm/python/lstm2d.py
@@ -91,21 +91,71 @@ def horizontal_lstm(images, num_filters_out, scope=None):
     return output
 
 
-def separable_lstm(images, num_filters_out, nhidden=None, scope=None):
+def get_blocks(images, kernel_size):
+  """Split images in blocks
+
+  Args:
+    images: (num_images, height, width, depth) tensor
+    kernel_size: A list of length 2 holding the [kernel_height, kernel_width] of
+      of the pooling. Can be an int if both values are the same.
+
+  Returns:
+    (num_images, height/kernel_height, width/kernel_width,
+    depth*kernel_height*kernel_width) tensor
+  """
+  with variable_scope.variable_scope("image_blocks"):
+    batch_size, height, width, chanels = _shape(images)
+
+    if height % kernel_size[0] != 0:
+      offset = array_ops.zeros([batch_size,
+                                kernel_size[0] - (height % kernel_size[0]),
+                                width,
+                                chanels])
+      images = array_ops.concat([images, offset], 1)
+      batch_size, height, width, chanels = _shape(images)
+    if width % kernel_size[1] != 0:
+      offset = array_ops.zeros([batch_size,
+                                height,
+                                kernel_size[1] - (width % kernel_size[1]),
+                                chanels])
+      images = array_ops.concat([images, offset], 2)
+      batch_size, height, width, chanels = _shape(images)
+
+    h, w = int(height / kernel_size[0]), int(width / kernel_size[1])
+    features = kernel_size[1] * kernel_size[0] * chanels
+
+    lines = array_ops.split(images, h, axis=1)
+    line_blocks = []
+    for line in lines:
+      line = array_ops.transpose(line, [0, 2, 3, 1])
+      line = array_ops.reshape(line, [batch_size, w, features])
+      line_blocks.append(line)
+
+    return array_ops.stack(line_blocks, axis=1)
+
+
+def separable_lstm(images, num_filters_out,
+                   kernel_size=None, nhidden=None, scope=None):
   """Run bidirectional LSTMs first horizontally then vertically.
 
   Args:
     images: (num_images, height, width, depth) tensor
     num_filters_out: output layer depth
+    kernel_size: A list of length 2 holding the [kernel_height, kernel_width] of
+      of the pooling. Can be an int if both values are the same. Set to None for
+      not using blocks
     nhidden: hidden layer depth
     scope: optional scope name
 
   Returns:
-    (num_images, height, width, num_filters_out) tensor
+    (num_images, height/kernel_height, width/kernel_width,
+    num_filters_out) tensor
   """
   with variable_scope.variable_scope(scope, "SeparableLstm", [images]):
     if nhidden is None:
       nhidden = num_filters_out
+    if kernel_size is not None:
+      images = get_blocks(images, kernel_size)
     hidden = horizontal_lstm(images, nhidden)
     with variable_scope.variable_scope("vertical"):
       transposed = array_ops.transpose(hidden, [0, 2, 1, 3])
diff --git a/tensorflow/contrib/ndlstm/python/lstm2d_test.py b/tensorflow/contrib/ndlstm/python/lstm2d_test.py
index 3dbbb817968d368e6f71237a71064f28d07a410c..f1b37d701b868438dcbac4e713ccc2136dacd983 100644
--- a/tensorflow/contrib/ndlstm/python/lstm2d_test.py
+++ b/tensorflow/contrib/ndlstm/python/lstm2d_test.py
@@ -69,6 +69,14 @@ class Lstm2DTest(test_util.TensorFlowTestCase):
       result = outputs.eval()
       self.assertEqual(tuple(result.shape), (2, 7, 11, 8))
 
+  def testSeparableLstmDimsBlocks(self):
+    with self.test_session():
+      inputs = constant_op.constant(_rand(2, 7, 11, 5))
+      outputs = lstm2d.separable_lstm(inputs, 8, kernel_size=[2, 2])
+      variables.global_variables_initializer().run()
+      result = outputs.eval()
+      self.assertEqual(tuple(result.shape), (2, 4, 6, 8))
+
   def testReduceToSequenceDims(self):
     with self.test_session():
       inputs = constant_op.constant(_rand(2, 7, 11, 5))
diff --git a/tensorflow/contrib/nn/BUILD b/tensorflow/contrib/nn/BUILD
index 13a05bf378762625b887a65d4cc20afa69548968..af33496e5d7933fc0adec2925271203a43e3b830 100644
--- a/tensorflow/contrib/nn/BUILD
+++ b/tensorflow/contrib/nn/BUILD
@@ -7,6 +7,8 @@ exports_files(["LICENSE"])
 
 package(default_visibility = ["//visibility:public"])
 
+load("//tensorflow:tensorflow.bzl", "py_test")
+
 py_library(
     name = "nn_py",
     srcs = [
@@ -14,10 +16,32 @@ py_library(
         "python/__init__.py",
         "python/ops/__init__.py",
         "python/ops/cross_entropy.py",
+        "python/ops/sampling_ops.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
-    deps = ["//tensorflow/python:nn"],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_test(
+    name = "sampling_ops_test",
+    size = "small",
+    srcs = ["python/ops/sampling_ops_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":nn_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:nn",
+    ],
 )
 
 filegroup(
diff --git a/tensorflow/contrib/nn/__init__.py b/tensorflow/contrib/nn/__init__.py
index 73757a6696e04dc29f1570c329897a4e1b9e63fd..ec832cbd4905d70f154cebb5a3c387ca7e148b79 100644
--- a/tensorflow/contrib/nn/__init__.py
+++ b/tensorflow/contrib/nn/__init__.py
@@ -12,11 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Module for deprecated ops in tf.nn.
+"""Module for variants of ops in tf.nn.
 
 @@deprecated_flipped_softmax_cross_entropy_with_logits
 @@deprecated_flipped_sparse_softmax_cross_entropy_with_logits
 @@deprecated_flipped_sigmoid_cross_entropy_with_logits
+@@rank_sampled_softmax_loss
 """
 
 from __future__ import absolute_import
@@ -25,6 +26,7 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,wildcard-import
 from tensorflow.contrib.nn.python.ops.cross_entropy import *
+from tensorflow.contrib.nn.python.ops.sampling_ops import *
 # pylint: enable=unused-import,wildcard-import
 
 from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/contrib/nn/python/__init__.py b/tensorflow/contrib/nn/python/__init__.py
index 285c094ca7da965235ecb2db8bb00c1e85736a89..73dd4d4015168096d7a28af5840ed20440f42cd1 100644
--- a/tensorflow/contrib/nn/python/__init__.py
+++ b/tensorflow/contrib/nn/python/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Module for deprecated ops in tf.nn."""
+"""Module for variants of ops in tf.nn."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/nn/python/ops/__init__.py b/tensorflow/contrib/nn/python/ops/__init__.py
index a945bcdccf8c43220d8d6bc2e90f0dc6b95e9622..73dd4d4015168096d7a28af5840ed20440f42cd1 100644
--- a/tensorflow/contrib/nn/python/ops/__init__.py
+++ b/tensorflow/contrib/nn/python/ops/__init__.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Module for deprecated ops in tf.nn."""
+"""Module for variants of ops in tf.nn."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
diff --git a/tensorflow/contrib/nn/python/ops/sampling_ops.py b/tensorflow/contrib/nn/python/ops/sampling_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ae529e0155f5ad9b40391c2f728c5c594e72dc9
--- /dev/null
+++ b/tensorflow/contrib/nn/python/ops/sampling_ops.py
@@ -0,0 +1,242 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops related to candidate sampling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+
+
+def _rank_resample(weights, biases, inputs, sampled_values, num_resampled,
+                   resampling_temperature, partition_strategy):
+  """A helper function for rank_sampled_softmax_loss.
+
+  This computes, for each i in `sampled_values`,
+
+      log(sum_j exp((w_i * x_j + b_i) / resampling_temperature))
+
+  where w_i, b_i are the weight and bias of the i-th class, repsectively,
+  and j ranges over the rows of `inputs`. For efficiency, we rearrange the
+  computation to
+
+      log(sum_j exp(w_i * (x_j / resampling_temperature))) +
+          b_i / resampling_temperature.
+
+  This translates to the following batched computation using tensorflow ops:
+
+      reduce_logsumexp(matmul(embeddings,
+                       transpose(inputs / resampling_temperature))) +
+          biases / resampling_temperature
+
+  The computation of the first term is colocated with the embeddings using
+  `transform_fn` in `embedding_ops._embedding_lookup_and_transform`. The second
+  term, not the bottleneck, is computed at the worker.
+
+  Args:
+    weights: From `rank_sampled_softmax_loss`.
+    biases: From `rank_sampled_softmax_loss`.
+    inputs: From `rank_sampled_softmax_loss`.
+    sampled_values: A tuple of (`sampled_candidates`, `true_expected_count`,
+        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
+    num_resampled: An `int`. This many values are selected from
+        `sampled_values` using the adaptive resampling algorithm. The caller
+        must ensure that `num_resampled` is less than the size of
+        `sampled_values`.
+    resampling_temperature: A scalar `Tensor` with the temperature parameter
+        for the adaptive resampling algorithm.
+    partition_strategy: From `rank_sampled_softmax_loss`.
+
+  Returns:
+    A tuple of (`resampled_candidates`, `true_expected_count`,
+        `resampled_expected_count`), similar to `sampled_values` but sampled
+        down to `num_resampled` values.
+  """
+  # This code supports passing a Tensor for num_resampled, but since it is only
+  # called with an int, that's what we specify in the arg list. If this
+  # function is ever externalized, we should change the doc to support Tensor.
+
+  sampled, true_expected_count, sampled_expected_count = sampled_values
+
+  sampled = math_ops.cast(array_ops.stop_gradient(sampled), dtypes.int64)
+  true_expected_count = array_ops.stop_gradient(true_expected_count)
+  sampled_expected_count = array_ops.stop_gradient(sampled_expected_count)
+
+  reweighted_inputs = inputs / resampling_temperature
+
+  def logsumexp_logit(embeddings):
+    return math_ops.reduce_logsumexp(
+        math_ops.matmul(embeddings, reweighted_inputs, transpose_b=True),
+        axis=1,
+        keep_dims=False)
+
+  # Calling this protected form of embedding_lookup allows co-locating
+  # the logsumexp computation with the partitioned weights, which yields
+  # a large speedup in practice.
+  sampled_logits = embedding_ops._embedding_lookup_and_transform(  # pylint: disable=protected-access
+      weights, sampled, partition_strategy, transform_fn=logsumexp_logit)
+  sampled_b = array_ops.reshape(
+      embedding_ops.embedding_lookup(biases, sampled, partition_strategy), [-1])
+  sampled_logits += sampled_b / resampling_temperature
+
+  _, resampled_indices = nn.top_k(sampled_logits, k=num_resampled, sorted=False)
+  resampled = array_ops.gather(sampled, indices=resampled_indices)
+  resampled_expected_count = array_ops.gather(
+      sampled_expected_count, indices=resampled_indices)
+
+  return resampled, true_expected_count, resampled_expected_count
+
+
+def rank_sampled_softmax_loss(weights,
+                              biases,
+                              labels,
+                              inputs,
+                              num_sampled,
+                              num_resampled,
+                              num_classes,
+                              num_true,
+                              sampled_values,
+                              resampling_temperature,
+                              remove_accidental_hits,
+                              partition_strategy,
+                              name=None):
+  """Computes softmax loss using rank-based adaptive resampling.
+
+  This has been shown to improve rank loss after training compared to
+  @{tf.nn.sampled_softmax_loss}. For a description of the algorithm and some
+  experimental results, please see: [TAPAS: Two-pass Approximate Adaptive
+  Sampling for Softmax](https://arxiv.org/abs/1707.03073).
+
+  Sampling follows two phases:
+  * In the first phase, `num_sampled` classes are selected using
+    @{tf.nn.learned_unigram_candidate_sampler} or supplied `sampled_values`.
+    The logits are calculated on those sampled classes. This phases is
+    similar to @{tf.nn.sampled_softmax_loss}.
+  * In the second phase, the `num_resampled` classes with highest predicted
+    probability are kept. Probabilities are
+    `LogSumExp(logits / resampling_temperature)`, where the sum is over
+    `inputs`.
+
+  The `resampling_temperature` parameter controls the "adaptiveness" of the
+  resampling. At lower temperatures, resampling is more adaptive because it
+  picks more candidates close to the predicted classes. A common strategy is
+  to decrease the temperature as training proceeds.
+
+  See @{tf.nn.sampled_softmax_loss} for more documentation on sampling and
+  for typical default values for some of the parameters.
+
+  This operation is for training only. It is generally an underestimate of
+  the full softmax loss.
+
+  A common use case is to use this method for training, and calculate the full
+  softmax loss for evaluation or inference. In this case, you must set
+  `partition_strategy="div"` for the two losses to be consistent, as in the
+  following example:
+
+  ```python
+  if mode == "train":
+    loss = rank_sampled_softmax_loss(
+        weights=weights,
+        biases=biases,
+        labels=labels,
+        inputs=inputs,
+        ...,
+        partition_strategy="div")
+  elif mode == "eval":
+    logits = tf.matmul(inputs, tf.transpose(weights))
+    logits = tf.nn.bias_add(logits, biases)
+    labels_one_hot = tf.one_hot(labels, n_classes)
+    loss = tf.nn.softmax_cross_entropy_with_logits(
+        labels=labels_one_hot,
+        logits=logits)
+  ```
+
+  Args:
+    weights: A `Tensor` or `PartitionedVariable` of shape `[num_classes, dim]`,
+        or a list of `Tensor` objects whose concatenation along dimension 0
+        has shape [num_classes, dim]. The (possibly-sharded) class embeddings.
+    biases: A `Tensor` or `PartitionedVariable` of shape `[num_classes]`.
+        The (possibly-sharded) class biases.
+    labels: A `Tensor` of type `int64` and shape `[batch_size,
+        num_true]`. The target classes. Note that this format differs from
+        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
+    inputs: A `Tensor` of shape `[batch_size, dim]`. The forward
+        activations of the input network.
+    num_sampled: An `int`. The number of classes to randomly sample per batch.
+    num_resampled: An `int`. The number of classes to select from the
+        `num_sampled` classes using the adaptive resampling algorithm. Must be
+        less than `num_sampled`.
+    num_classes: An `int`. The number of possible classes.
+    num_true: An `int`.  The number of target classes per training example.
+    sampled_values: A tuple of (`sampled_candidates`, `true_expected_count`,
+        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
+        If None, default to `nn.learned_unigram_candidate_sampler`.
+    resampling_temperature: A scalar `Tensor` with the temperature parameter
+        for the adaptive resampling algorithm.
+    remove_accidental_hits: A `bool`. Whether to remove "accidental hits"
+        where a sampled class equals one of the target classes.
+    partition_strategy: A string specifying the partitioning strategy, relevant
+        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
+        See @{tf.nn.embedding_lookup} for more details.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `batch_size` 1-D tensor of per-example sampled softmax losses.
+
+  Raises:
+    ValueError: If `num_sampled <= num_resampled`.
+  """
+  if num_sampled > num_classes:
+    raise ValueError("num_sampled ({}) cannot be greater than num_classes ({})".
+                     format(num_sampled, num_classes))
+  if num_sampled <= num_resampled:
+    raise ValueError("num_resampled ({}) must be less than num_sampled ({})".
+                     format(num_resampled, num_sampled))
+  if partition_strategy not in ("div", "mod"):
+    raise ValueError(
+        "unsupported partition_strategy ({})".format(partition_strategy))
+  with ops.name_scope(name, "rank_sampled_softmax_loss", [
+      weights, biases, labels, inputs, sampled_values, resampling_temperature
+  ]) as name:
+    if not sampled_values:
+      sampled_values = nn.learned_unigram_candidate_sampler(
+          true_classes=labels,
+          num_true=num_true,
+          num_sampled=num_sampled,
+          unique=True,
+          range_max=num_classes)
+    # From sampled_values, select the top num_resampled values using the
+    # adaptive rank resampling strategy.
+    resampled_values = _rank_resample(weights, biases, inputs, sampled_values,
+                                      num_resampled, resampling_temperature,
+                                      partition_strategy)
+    return nn.sampled_softmax_loss(
+        weights=weights,
+        biases=biases,
+        labels=labels,
+        inputs=inputs,
+        num_sampled=num_resampled,
+        num_classes=num_classes,
+        num_true=num_true,
+        sampled_values=resampled_values,
+        remove_accidental_hits=remove_accidental_hits,
+        partition_strategy=partition_strategy,
+        name=name)
diff --git a/tensorflow/contrib/nn/python/ops/sampling_ops_test.py b/tensorflow/contrib/nn/python/ops/sampling_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d4fe1321b82b1c561c514eded30ceb7f9675c37
--- /dev/null
+++ b/tensorflow/contrib/nn/python/ops/sampling_ops_test.py
@@ -0,0 +1,322 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for sampling_ops.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.nn.python.ops import sampling_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+class RankSampledSoftmaxLossTest(test.TestCase):
+
+  def setUp(self):
+    self._sampled = [3, 4, 5, 6, 7]
+    self._num_sampled = len(self._sampled)
+    # Because values of all matrices increase with indices, logits increase with
+    # class id. So, for the above sampled classes, adaptive sampling will select
+    # these resampled classes.
+    self._resampled = [5, 6, 7]
+    self._num_resampled = len(self._resampled)
+    self._num_classes = 10
+    self._num_true = 2
+    self._sampled_values = (self._sampled, [[0.5], [0.5]],
+                            [0.5, 0.5, 0.5, 0.5, 0.5])
+    self._resampled_values = (self._resampled, [[0.5], [0.5]], [0.5, 0.5, 0.5])
+    self._remove_accidental_hits = False
+    self._embed_dim = 5
+    self._batch_size = 2
+
+  def _weights(self):
+    return constant_op.constant([
+        [0.0, 0.1, 0.2, 0.3, 0.4],
+        [1.0, 1.1, 1.2, 1.3, 1.4],
+        [2.0, 2.1, 2.2, 2.3, 2.4],
+        [3.0, 3.1, 3.2, 3.3, 3.4],
+        [4.0, 4.1, 4.2, 4.3, 4.4],
+        [5.0, 5.1, 5.2, 5.3, 5.4],
+        [6.0, 6.1, 6.2, 6.3, 6.4],
+        [7.0, 7.1, 7.2, 7.3, 7.4],
+        [8.0, 8.1, 8.2, 8.3, 8.4],
+        [9.0, 9.1, 9.2, 9.3, 9.4],
+    ])
+
+  def _div_sharded_weights(self):
+    return [
+        constant_op.constant([
+            [0.0, 0.1, 0.2, 0.3, 0.4],
+            [1.0, 1.1, 1.2, 1.3, 1.4],
+        ]),
+        constant_op.constant([
+            [2.0, 2.1, 2.2, 2.3, 2.4],
+            [3.0, 3.1, 3.2, 3.3, 3.4],
+        ]),
+        constant_op.constant([
+            [4.0, 4.1, 4.2, 4.3, 4.4],
+            [5.0, 5.1, 5.2, 5.3, 5.4],
+        ]),
+        constant_op.constant([
+            [6.0, 6.1, 6.2, 6.3, 6.4],
+            [7.0, 7.1, 7.2, 7.3, 7.4],
+        ]),
+        constant_op.constant([
+            [8.0, 8.1, 8.2, 8.3, 8.4],
+            [9.0, 9.1, 9.2, 9.3, 9.4],
+        ]),
+    ]
+
+  def _mod_sharded_weights(self):
+    return [
+        constant_op.constant([
+            [0.0, 0.1, 0.2, 0.3, 0.4],
+            [5.0, 5.1, 5.2, 5.3, 5.4],
+        ]),
+        constant_op.constant([
+            [1.0, 1.1, 1.2, 1.3, 1.4],
+            [6.0, 6.1, 6.2, 6.3, 6.4],
+        ]),
+        constant_op.constant([
+            [2.0, 2.1, 2.2, 2.3, 2.4],
+            [7.0, 7.1, 7.2, 7.3, 7.4],
+        ]),
+        constant_op.constant([
+            [3.0, 3.1, 3.2, 3.3, 3.4],
+            [8.0, 8.1, 8.2, 8.3, 8.4],
+        ]),
+        constant_op.constant([
+            [4.0, 4.1, 4.2, 4.3, 4.4],
+            [9.0, 9.1, 9.2, 9.3, 9.4],
+        ]),
+    ]
+
+  def _biases(self):
+    return constant_op.constant(
+        [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
+
+  def _div_sharded_biases(self):
+    return [
+        constant_op.constant([0.0, 0.1]),
+        constant_op.constant([0.2, 0.3]),
+        constant_op.constant([0.4, 0.5]),
+        constant_op.constant([0.6, 0.7]),
+        constant_op.constant([0.8, 0.9]),
+    ]
+
+  def _mod_sharded_biases(self):
+    return [
+        constant_op.constant([0.0, 0.5]),
+        constant_op.constant([0.1, 0.6]),
+        constant_op.constant([0.2, 0.7]),
+        constant_op.constant([0.3, 0.8]),
+        constant_op.constant([0.4, 0.9]),
+    ]
+
+  def _labels(self):
+    return constant_op.constant(
+        [[0, 1], [1, 2]],
+        shape=(self._batch_size, self._num_true),
+        name='labels',
+        dtype=dtypes.int64)
+
+  def _inputs(self):
+    return constant_op.constant(
+        [
+            [0., 1., 2., 3., 4.],
+            [10., 11., 12., 13., 14.],
+        ],
+        shape=(self._batch_size, self._embed_dim),
+        name='inputs')
+
+  def testInvalidNumSampled0(self):
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          ValueError,
+          r'num_resampled \(3\) must be less than num_sampled \(3\)'):
+        sampling_ops.rank_sampled_softmax_loss(
+            weights=self._weights(),
+            biases=self._biases(),
+            labels=self._labels(),
+            inputs=self._inputs(),
+            num_sampled=3,
+            num_resampled=3,
+            num_classes=self._num_classes,
+            num_true=self._num_true,
+            sampled_values=None,
+            resampling_temperature=1.,
+            remove_accidental_hits=True,
+            partition_strategy='div')
+
+  def testInvalidNumSampled1(self):
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          ValueError,
+          r'num_resampled \(3\) must be less than num_sampled \(2\)'):
+        sampling_ops.rank_sampled_softmax_loss(
+            weights=self._weights(),
+            biases=self._biases(),
+            labels=self._labels(),
+            inputs=self._inputs(),
+            num_sampled=2,
+            num_resampled=3,
+            num_classes=self._num_classes,
+            num_true=self._num_true,
+            sampled_values=None,
+            resampling_temperature=1.,
+            remove_accidental_hits=True,
+            partition_strategy='div')
+
+  def testMissingPartitionStrategy(self):
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(ValueError,
+                                   r'unsupported partition_strategy \(None\)'):
+        sampling_ops.rank_sampled_softmax_loss(
+            weights=self._weights(),
+            biases=self._biases(),
+            labels=self._labels(),
+            inputs=self._inputs(),
+            num_sampled=2,
+            num_resampled=1,
+            num_classes=self._num_classes,
+            num_true=self._num_true,
+            sampled_values=None,
+            resampling_temperature=1.,
+            remove_accidental_hits=True,
+            partition_strategy=None)
+
+  def _testCompareWithNN(self, weights, biases, partition_strategy):
+    with ops.Graph().as_default():
+      loss = sampling_ops.rank_sampled_softmax_loss(
+          weights=weights(),
+          biases=biases(),
+          labels=self._labels(),
+          inputs=self._inputs(),
+          num_sampled=self._num_sampled,
+          num_resampled=self._num_resampled,
+          num_classes=self._num_classes,
+          num_true=self._num_true,
+          sampled_values=self._sampled_values,
+          resampling_temperature=1.,
+          remove_accidental_hits=self._remove_accidental_hits,
+          partition_strategy=partition_strategy)
+      loss_nn = nn.sampled_softmax_loss(
+          weights=weights(),
+          biases=biases(),
+          labels=self._labels(),
+          inputs=self._inputs(),
+          num_sampled=self._num_resampled,
+          num_classes=self._num_classes,
+          num_true=self._num_true,
+          sampled_values=self._resampled_values,
+          remove_accidental_hits=self._remove_accidental_hits,
+          partition_strategy=partition_strategy)
+      with self.test_session() as sess:
+        loss_val = sess.run(loss)
+        loss_nn_val = sess.run(loss_nn)
+
+    self.assertAllClose(loss_val, loss_nn_val)
+
+  def testCompareWithNNUnsharded(self):
+    self._testCompareWithNN(self._weights, self._biases, 'div')
+
+  def testCompareWithNNShardWeightsDiv(self):
+    self._testCompareWithNN(self._div_sharded_weights, self._biases, 'div')
+
+  def testCompareWithNNShardWeightsAndBiasesDiv(self):
+    self._testCompareWithNN(self._div_sharded_weights, self._div_sharded_biases,
+                            'div')
+
+  def testCompareWithNNShardWeightsMod(self):
+    self._testCompareWithNN(self._mod_sharded_weights, self._biases, 'mod')
+
+  def testCompareWithNNShardWeightsAndBiasesMod(self):
+    self._testCompareWithNN(self._mod_sharded_weights, self._mod_sharded_biases,
+                            'mod')
+
+  def _testCompareWithNNTemperature(self, temperature, resampled):
+    weights = [[1., 2.], [3., 4.]]  # two sampled classes
+    inputs = [[6., -5. / 2.], [-11., 21. / 2.]]
+    # Let w0, w1 = weights of sampled classes (biases set to 0 for simplicity)
+    # Let x0, x1 = inputs
+    # logits:
+    #   w0.x0 = 1
+    #   w0.x1 = 10
+    #   w1.x0 = 8
+    #   w1.x1 = 9
+    # Resampling 1 class with temperature = t will pick the larger of:
+    #   exp(1/t) + exp(10/t)  ==> w0, for values of t < 2.12
+    #   exp(8/t) + exp(9/t)   ==> w1, for values of t > 2.13
+    num_sampled = 2
+    num_resampled = 1
+    num_classes = 2
+    num_true = 1
+    sampled_values = [0, 1], [[1.], [1.]], [1., 1.]
+    resampled_values = [resampled], [[1.], [1.]], [1.]
+    remove_accidental_hits = False
+    with ops.Graph().as_default():
+      weights = constant_op.constant(weights)
+      biases = constant_op.constant([0., 0.])
+      labels = constant_op.constant([[0], [1]], dtype=dtypes.int64)
+      inputs = constant_op.constant(inputs)
+      loss = sampling_ops.rank_sampled_softmax_loss(
+          weights=weights,
+          biases=biases,
+          labels=labels,
+          inputs=inputs,
+          num_sampled=num_sampled,
+          num_resampled=num_resampled,
+          num_classes=num_classes,
+          num_true=num_true,
+          sampled_values=sampled_values,
+          resampling_temperature=constant_op.constant(temperature),
+          remove_accidental_hits=remove_accidental_hits,
+          partition_strategy='div')
+      loss_nn = nn.sampled_softmax_loss(
+          weights=weights,
+          biases=biases,
+          labels=labels,
+          inputs=inputs,
+          num_sampled=num_resampled,
+          num_classes=num_classes,
+          num_true=num_true,
+          sampled_values=resampled_values,
+          remove_accidental_hits=remove_accidental_hits,
+          partition_strategy='div')
+      with self.test_session() as sess:
+        loss_val = sess.run(loss)
+        loss_nn_val = sess.run(loss_nn)
+
+    self.assertAllClose(loss_val, loss_nn_val)
+
+  def testCompareWithNNTemperatureLo1(self):
+    self._testCompareWithNNTemperature(1., 0)
+
+  def testCompareWithNNTemperatureLo2(self):
+    self._testCompareWithNNTemperature(2.12, 0)
+
+  def testCompareWithNNTemperatureHi1(self):
+    self._testCompareWithNNTemperature(2.13, 1)
+
+  def testCompareWithNNTemperatureHi2(self):
+    self._testCompareWithNNTemperature(3., 1)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 0953a6d0011c891aa4f25f0e24aa762471c5783d..01a5573a1f9a50aaa746b380c82df6aa4f209942 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -29,11 +29,16 @@ py_library(
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
+        "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
         "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//third_party/py/numpy",
         "@six_archive//:six",
     ],
 )
@@ -45,15 +50,12 @@ py_test(
     tags = ["manual"],
     deps = [
         ":opt_py",
-        "//tensorflow/python:array_ops",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:extra_py_tests_deps",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:variables",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -89,7 +91,6 @@ py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
-        "//third_party/py/numpy",
         "@six_archive//:six",
     ],
 )
@@ -100,6 +101,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "manual",  # Flaky: b/29892493
+        "notap",  # data race due to b/62910646
     ],
     deps = [
         ":opt_py",
@@ -138,11 +140,13 @@ py_test(
     deps = [
         ":opt_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
     ],
diff --git a/tensorflow/contrib/opt/python/training/delay_compensated_gradient_descent_test.py b/tensorflow/contrib/opt/python/training/delay_compensated_gradient_descent_test.py
index 1dbd8416a087fb3ccbd7c90e4146d72fc24d6dcb..1fa76a360c0e88322694b2950d4c45915510d05a 100644
--- a/tensorflow/contrib/opt/python/training/delay_compensated_gradient_descent_test.py
+++ b/tensorflow/contrib/opt/python/training/delay_compensated_gradient_descent_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import resource_variable_ops
@@ -26,11 +28,19 @@ from tensorflow.python.platform import test
 from tensorflow.contrib.opt.python.training import delay_compensated_gradient_descent
 
 
+def build_session_config():
+  rewriter_config = rewriter_config_pb2.RewriterConfig(
+      disable_model_pruning=True)
+  graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+  config = config_pb2.ConfigProto(graph_options=graph_options)
+  return config
+
+
 class DelayCompensatedGradientDescentOptimizerTest(test.TestCase):
 
   def testBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.test_session(config=build_session_config()):
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -56,7 +66,7 @@ class DelayCompensatedGradientDescentOptimizerTest(test.TestCase):
 
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.test_session(config=build_session_config()):
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -83,7 +93,7 @@ class DelayCompensatedGradientDescentOptimizerTest(test.TestCase):
 
     def testGradWrtRef(self):
       for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-        with self.test_session():
+        with self.test_session(cnofig=build_session_config()):
           optimizer = (delay_compensated_gradient_descent.
                        DelayCompensatedGradientDescentOptimizer)(
                            learning_rate=3.0,
@@ -99,7 +109,7 @@ class DelayCompensatedGradientDescentOptimizerTest(test.TestCase):
 
   def testWithGlobalStep(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.test_session(config=build_session_config()):
         global_step = variables.Variable(0, trainable=False)
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
diff --git a/tensorflow/contrib/opt/python/training/external_optimizer.py b/tensorflow/contrib/opt/python/training/external_optimizer.py
index 0909760b383d3b810f4f208763b3c10d3e902ee6..ff87a95f72f59c5ac6f6bb674732f34f79dd2db6 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer.py
@@ -12,20 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """TensorFlow interface for third-party optimizers."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 
-
 __all__ = ['ExternalOptimizerInterface', 'ScipyOptimizerInterface']
 
 
@@ -43,19 +43,41 @@ class ExternalOptimizerInterface(object):
   @@minimize
   """
 
-  def __init__(self, loss, var_list=None, equalities=None, inequalities=None,
+  def __init__(self,
+               loss,
+               var_list=None,
+               equalities=None,
+               inequalities=None,
+               var_to_bounds=None,
                **optimizer_kwargs):
     """Initialize a new interface instance.
 
     Args:
       loss: A scalar `Tensor` to be minimized.
-      var_list: Optional list of `Variable` objects to update to minimize
+      var_list: Optional `list` of `Variable` objects to update to minimize
         `loss`.  Defaults to the list of variables collected in the graph
         under the key `GraphKeys.TRAINABLE_VARIABLES`.
-      equalities: Optional list of equality constraint scalar `Tensor`s to be
+      equalities: Optional `list` of equality constraint scalar `Tensor`s to be
         held equal to zero.
-      inequalities: Optional list of inequality constraint scalar `Tensor`s
-        to be kept nonnegative.
+      inequalities: Optional `list` of inequality constraint scalar `Tensor`s
+        to be held nonnegative.
+      var_to_bounds: Optional `dict` where each key is an optimization
+        `Variable` and each corresponding value is a length-2 tuple of
+        `(low, high)` bounds. Although enforcing this kind of simple constraint
+        could be accomplished with the `inequalities` arg, not all optimization
+        algorithms support general inequality constraints, e.g. L-BFGS-B. Both
+        `low` and `high` can either be numbers or anything convertible to a
+        NumPy array that can be broadcast to the shape of `var` (using
+        `np.broadcast_to`). To indicate that there is no bound, use `None` (or
+        `+/- np.infty`). For example, if `var` is a 2x3 matrix, then any of
+        the following corresponding `bounds` could be supplied:
+        * `(0, np.infty)`: Each element of `var` held positive.
+        * `(-np.infty, [1, 2])`: First column less than 1, second column less
+          than 2.
+        * `(-np.infty, [[1], [2], [3]])`: First row less than 1, second row less
+          than 2, etc.
+        * `(-np.infty, [[1, 2, 3], [4, 5, 6]])`: Entry `var[0, 0]` less than 1,
+          `var[0, 1]` less than 2, etc.
       **optimizer_kwargs: Other subclass-specific keyword arguments.
     """
     self._loss = loss
@@ -67,37 +89,55 @@ class ExternalOptimizerInterface(object):
     else:
       self._vars = list(var_list)
 
-    self._update_placeholders = [array_ops.placeholder(var.dtype)
-                                 for var in self._vars]
-    self._var_updates = [var.assign(array_ops.reshape(placeholder,
-                                                      _get_shape_tuple(var)))
-                         for var, placeholder in
-                         zip(self._vars, self._update_placeholders)]
+    packed_bounds = None
+    if var_to_bounds is not None:
+      left_packed_bounds = []
+      right_packed_bounds = []
+      for var in self._vars:
+        shape = var.get_shape().as_list()
+        bounds = (-np.infty, np.infty)
+        if var in var_to_bounds:
+          bounds = var_to_bounds[var]
+        left_packed_bounds.extend(list(np.broadcast_to(bounds[0], shape).flat))
+        right_packed_bounds.extend(list(np.broadcast_to(bounds[1], shape).flat))
+      packed_bounds = list(zip(left_packed_bounds, right_packed_bounds))
+    self._packed_bounds = packed_bounds
+
+    self._update_placeholders = [
+        array_ops.placeholder(var.dtype) for var in self._vars
+    ]
+    self._var_updates = [
+        var.assign(array_ops.reshape(placeholder, _get_shape_tuple(var)))
+        for var, placeholder in zip(self._vars, self._update_placeholders)
+    ]
 
     loss_grads = _compute_gradients(loss, self._vars)
-    equalities_grads = [_compute_gradients(equality, self._vars)
-                        for equality in self._equalities]
-    inequalities_grads = [_compute_gradients(inequality, self._vars)
-                          for inequality in self._inequalities]
+    equalities_grads = [
+        _compute_gradients(equality, self._vars)
+        for equality in self._equalities
+    ]
+    inequalities_grads = [
+        _compute_gradients(inequality, self._vars)
+        for inequality in self._inequalities
+    ]
 
     self.optimizer_kwargs = optimizer_kwargs
 
     self._packed_var = self._pack(self._vars)
     self._packed_loss_grad = self._pack(loss_grads)
     self._packed_equality_grads = [
-        self._pack(equality_grads)
-        for equality_grads in equalities_grads
+        self._pack(equality_grads) for equality_grads in equalities_grads
     ]
     self._packed_inequality_grads = [
-        self._pack(inequality_grads)
-        for inequality_grads in inequalities_grads
+        self._pack(inequality_grads) for inequality_grads in inequalities_grads
     ]
 
     dims = [_prod(_get_shape_tuple(var)) for var in self._vars]
     accumulated_dims = list(_accumulate(dims))
     self._packing_slices = [
-        slice(start, end) for start, end in zip(accumulated_dims[:-1],
-                                                accumulated_dims[1:])]
+        slice(start, end)
+        for start, end in zip(accumulated_dims[:-1], accumulated_dims[1:])
+    ]
 
   def minimize(self,
                session=None,
@@ -135,35 +175,39 @@ class ExternalOptimizerInterface(object):
     step_callback = step_callback or (lambda xk: None)
 
     # Construct loss function and associated gradient.
-    loss_grad_func = self._make_eval_func(
-        [self._loss, self._packed_loss_grad],
-        session, feed_dict, fetches, loss_callback)
+    loss_grad_func = self._make_eval_func([self._loss,
+                                           self._packed_loss_grad], session,
+                                          feed_dict, fetches, loss_callback)
 
     # Construct equality constraint functions and associated gradients.
-    equality_funcs = self._make_eval_funcs(
-        self._equalities, session, feed_dict, fetches)
-    equality_grad_funcs = self._make_eval_funcs(
-        self._packed_equality_grads, session, feed_dict, fetches)
+    equality_funcs = self._make_eval_funcs(self._equalities, session, feed_dict,
+                                           fetches)
+    equality_grad_funcs = self._make_eval_funcs(self._packed_equality_grads,
+                                                session, feed_dict, fetches)
 
     # Construct inequality constraint functions and associated gradients.
-    inequality_funcs = self._make_eval_funcs(
-        self._inequalities, session, feed_dict, fetches)
-    inequality_grad_funcs = self._make_eval_funcs(
-        self._packed_inequality_grads, session, feed_dict, fetches)
+    inequality_funcs = self._make_eval_funcs(self._inequalities, session,
+                                             feed_dict, fetches)
+    inequality_grad_funcs = self._make_eval_funcs(self._packed_inequality_grads,
+                                                  session, feed_dict, fetches)
 
     # Get initial value from TF session.
     initial_packed_var_val = session.run(self._packed_var)
 
     # Perform minimization.
     packed_var_val = self._minimize(
-        initial_val=initial_packed_var_val, loss_grad_func=loss_grad_func,
+        initial_val=initial_packed_var_val,
+        loss_grad_func=loss_grad_func,
         equality_funcs=equality_funcs,
         equality_grad_funcs=equality_grad_funcs,
         inequality_funcs=inequality_funcs,
         inequality_grad_funcs=inequality_grad_funcs,
-        step_callback=step_callback, optimizer_kwargs=self.optimizer_kwargs)
-    var_vals = [packed_var_val[packing_slice]
-                for packing_slice in self._packing_slices]
+        packed_bounds=self._packed_bounds,
+        step_callback=step_callback,
+        optimizer_kwargs=self.optimizer_kwargs)
+    var_vals = [
+        packed_var_val[packing_slice] for packing_slice in self._packing_slices
+    ]
 
     # Set optimization variables to their new values.
     session.run(
@@ -173,7 +217,7 @@ class ExternalOptimizerInterface(object):
 
   def _minimize(self, initial_val, loss_grad_func, equality_funcs,
                 equality_grad_funcs, inequality_funcs, inequality_grad_funcs,
-                step_callback, optimizer_kwargs):
+                packed_bounds, step_callback, optimizer_kwargs):
     """Wrapper for a particular optimization algorithm implementation.
 
     It would be appropriate for a subclass implementation of this method to
@@ -191,6 +235,7 @@ class ExternalOptimizerInterface(object):
       inequality_funcs: A list of functions each of which specifies a scalar
         quantity that an optimizer should hold >= 0.
       inequality_grad_funcs: A list of gradients of inequality_funcs.
+      packed_bounds: A list of bounds for each index, or `None`.
       step_callback: A callback function to execute at each optimization step,
         supplied with the current value of the packed variable vector.
       optimizer_kwargs: Other key-value arguments available to the optimizer.
@@ -239,7 +284,11 @@ class ExternalOptimizerInterface(object):
 
     return eval_func
 
-  def _make_eval_funcs(self, tensors, session, feed_dict, fetches,
+  def _make_eval_funcs(self,
+                       tensors,
+                       session,
+                       feed_dict,
+                       fetches,
                        callback=None):
     return [
         self._make_eval_func(tensor, session, feed_dict, fetches, callback)
@@ -266,7 +315,24 @@ class ScipyOptimizerInterface(ExternalOptimizerInterface):
   # The value of vector should now be [0., 0.].
   ```
 
-  Example with constraints:
+  Example with simple bound constraints:
+
+  ```python
+  vector = tf.Variable([7., 7.], 'vector')
+
+  # Make vector norm as small as possible.
+  loss = tf.reduce_sum(tf.square(vector))
+
+  optimizer = ScipyOptimizerInterface(
+      loss, var_to_bounds={vector: ([1, 2], np.infty)})
+
+  with tf.Session() as session:
+    optimizer.minimize(session)
+
+  # The value of vector should now be [1., 2.].
+  ```
+
+  Example with more complicated constraints:
 
   ```python
   vector = tf.Variable([7., 7.], 'vector')
@@ -294,7 +360,8 @@ class ScipyOptimizerInterface(ExternalOptimizerInterface):
 
   def _minimize(self, initial_val, loss_grad_func, equality_funcs,
                 equality_grad_funcs, inequality_funcs, inequality_grad_funcs,
-                step_callback, optimizer_kwargs):
+                packed_bounds, step_callback, optimizer_kwargs):
+
     def loss_grad_func_wrapper(x):
       # SciPy's L-BFGS-B Fortran implementation requires gradients as doubles.
       loss, gradient = loss_grad_func(x)
@@ -314,7 +381,20 @@ class ScipyOptimizerInterface(ExternalOptimizerInterface):
         'callback': step_callback,
         'method': method,
         'constraints': constraints,
+        'bounds': packed_bounds,
     }
+
+    for kwarg in minimize_kwargs:
+      if kwarg in optimizer_kwargs:
+        if kwarg == 'bounds':
+          # Special handling for 'bounds' kwarg since ability to specify bounds
+          # was added after this module was already publicly released.
+          raise ValueError(
+              'Bounds must be set using the var_to_bounds argument')
+        raise ValueError(
+            'Optimizer keyword arg \'{}\' is set '
+            'automatically and cannot be injected manually'.format(kwarg))
+
     minimize_kwargs.update(optimizer_kwargs)
     if method == 'SLSQP':
       # SLSQP doesn't support step callbacks. Obviate associated warning
@@ -327,8 +407,8 @@ class ScipyOptimizerInterface(ExternalOptimizerInterface):
                  '  Message: %s\n'
                  '  Objective function value: %f\n'
                  '  Number of iterations: %d\n'
-                 '  Number of functions evaluations: %d',
-                 result.message, result.fun, result.nit, result.nfev)
+                 '  Number of functions evaluations: %d', result.message,
+                 result.fun, result.nit, result.nfev)
 
     return result['x']
 
@@ -355,5 +435,7 @@ def _prod(array):
 def _compute_gradients(tensor, var_list):
   grads = gradients.gradients(tensor, var_list)
   # tf.gradients sometimes returns `None` when it should return 0.
-  return [grad if grad is not None else array_ops.zeros_like(var)
-          for var, grad in zip(var_list, grads)]
+  return [
+      grad if grad is not None else array_ops.zeros_like(var)
+      for var, grad in zip(var_list, grads)
+  ]
diff --git a/tensorflow/contrib/opt/python/training/external_optimizer_test.py b/tensorflow/contrib/opt/python/training/external_optimizer_test.py
index c9f5a2ca3f1226614c486f35da954ff7e84267ff..f39134936f9d4b650e4faa368653d0efb6d99d12 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+
 from tensorflow.contrib.opt.python.training import external_optimizer
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -74,13 +75,13 @@ class ExternalOptimizerInterfaceTest(TestCase):
 
     minimum_location = constant_op.constant(np.arange(9), dtype=dtypes.float32)
 
-    loss = math_ops.reduce_sum(math_ops.square(vector -
-                                               minimum_location[:2])) / 2.
-    loss += math_ops.reduce_sum(math_ops.square(scalar - minimum_location[
-        2])) / 2.
+    loss = math_ops.reduce_sum(
+        math_ops.square(vector - minimum_location[:2])) / 2.
+    loss += math_ops.reduce_sum(
+        math_ops.square(scalar - minimum_location[2])) / 2.
     loss += math_ops.reduce_sum(
-        math_ops.square(matrix - array_ops.reshape(minimum_location[3:],
-                                                   [2, 3]))) / 2.
+        math_ops.square(
+            matrix - array_ops.reshape(minimum_location[3:], [2, 3]))) / 2.
 
     optimizer = MockOptimizerInterface(loss)
 
@@ -184,6 +185,41 @@ class ScipyOptimizerInterfaceTest(TestCase):
       optimizer.minimize(sess)
       self.assertAllClose(np.ones(2), sess.run(vector))
 
+  def test_scalar_bounds(self):
+    vector_initial_value = [7., 7.]
+    vector = variables.Variable(vector_initial_value, 'vector')
+
+    # Make norm as small as possible.
+    loss = math_ops.reduce_sum(math_ops.square(vector))
+
+    # Make the minimum value of each component be 1.
+    var_to_bounds = {vector: (1., np.infty)}
+
+    optimizer = external_optimizer.ScipyOptimizerInterface(
+        loss, var_to_bounds=var_to_bounds)
+
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      optimizer.minimize(sess)
+      self.assertAllClose(np.ones(2), sess.run(vector))
+
+  def test_vector_bounds(self):
+    vector_initial_value = [7., 7.]
+    vector = variables.Variable(vector_initial_value, 'vector')
+
+    # Make norm as small as possible.
+    loss = math_ops.reduce_sum(math_ops.square(vector))
+
+    var_to_bounds = {vector: ([None, 2.], None)}
+
+    optimizer = external_optimizer.ScipyOptimizerInterface(
+        loss, var_to_bounds=var_to_bounds)
+
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      optimizer.minimize(sess)
+      self.assertAllClose([0., 2.], sess.run(vector))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/predictor/BUILD b/tensorflow/contrib/predictor/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..745dc2f8366a319dc94246228a6cc3efc12a53b8
--- /dev/null
+++ b/tensorflow/contrib/predictor/BUILD
@@ -0,0 +1,167 @@
+# `Predictor` classes provide an interface for efficient, repeated inference.
+
+package(default_visibility = ["//tensorflow/contrib/predictor:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "predictor",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [":predictor_factories"],
+)
+
+py_library(
+    name = "predictor_factories",
+    srcs = ["predictor_factories.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":contrib_estimator_predictor",
+        ":core_estimator_predictor",
+        ":saved_model_predictor",
+        "//tensorflow/python/estimator",
+    ],
+)
+
+py_library(
+    name = "base_predictor",
+    srcs = ["predictor.py"],
+    srcs_version = "PY2AND3",
+    deps = ["@six_archive//:six"],
+)
+
+py_library(
+    name = "saved_model_predictor",
+    srcs = ["saved_model_predictor.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base_predictor",
+        "//tensorflow/contrib/saved_model:saved_model_py",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/saved_model:loader",
+        "//tensorflow/python/saved_model:signature_constants",
+        "//tensorflow/python/saved_model:signature_def_utils",
+    ],
+)
+
+py_library(
+    name = "core_estimator_predictor",
+    srcs = ["core_estimator_predictor.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base_predictor",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/saved_model:signature_constants",
+    ],
+)
+
+py_library(
+    name = "contrib_estimator_predictor",
+    srcs = ["contrib_estimator_predictor.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base_predictor",
+        "//tensorflow/contrib/learn",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+    ],
+)
+
+py_library(
+    name = "testing_common",
+    srcs = ["testing_common.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/contrib/learn",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:export",
+        "//tensorflow/python/estimator:export_output",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/saved_model:signature_constants",
+    ],
+)
+
+# Transitive dependencies of this target will be included in the pip package.
+py_library(
+    name = "predictor_pip",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":contrib_estimator_predictor",
+        ":core_estimator_predictor",
+        ":saved_model_predictor",
+    ],
+)
+
+py_test(
+    name = "saved_model_predictor_test",
+    srcs = ["saved_model_predictor_test.py"],
+    data = [":test_export_dir"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":saved_model_predictor",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/saved_model:signature_def_utils",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "core_estimator_predictor_test",
+    srcs = ["core_estimator_predictor_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":core_estimator_predictor",
+        ":testing_common",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "contrib_estimator_predictor_test",
+    srcs = ["contrib_estimator_predictor_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":contrib_estimator_predictor",
+        ":testing_common",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+filegroup(
+    name = "test_export_dir",
+    srcs = glob(["test_export_dir/**/*"]),
+    tags = ["nopip"],
+)
diff --git a/tensorflow/contrib/predictor/README.md b/tensorflow/contrib/predictor/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..16cdcf3e70634cdefe481bb19e8bb10682bcd2ce
--- /dev/null
+++ b/tensorflow/contrib/predictor/README.md
@@ -0,0 +1,96 @@
+# Predictors
+
+The `Predictor` classes provide a simple interface for performing repeated,
+efficient inference. A `Predictor` can be constructed from a `SavedModel` on
+disk, a `tf.Estimator` or a `tf.contrib.Estimator`.
+
+To facilitate the examples below, let's define a trivial `Estimator` that just
+calculates a sum:
+
+```python
+def model_fn(features, labels, mode):
+  z = tf.add(features['x'], features['y'], name='z')
+  return tf.contrib.learn.ModelFnOps(
+      mode, {'z': z}, loss=tf.constant(0.0), train_op=tf.no_op())
+
+estimator = tf.contrib.learn.Estimator(model_fn=model_fn)
+```
+
+We can then construct a `Predictor` in two different ways.
+
+## `Predictor` from a `SavedModel`
+
+Given a trained `Estimator`, we first export a `SavedModel`:
+
+```python
+def serving_input_fn():
+  x = tf.placeholder(dtype=tf.float32, shape=[None], name='x')
+  y = tf.placeholder(dtype=tf.float32, shape=[None], name='y')
+
+  features = {'x': x, 'y': y}
+  return tf.contrib.learn.utils.input_fn_utils.InputFnOps(
+           features, None, default_inputs=features)
+
+saved_model_dir = estimator.export_savedmodel(my_export_dir, serving_input_fn)
+```
+
+We can then construct a `Predictor` as follows:
+
+```python
+saved_model_predictor = predictor.from_saved_model(export_dir='test_export_dir')
+output_dict = saved_model_predictor({'x': [1.0], 'y': [5.2]})
+# output_dict == {'sum': [6.2]}
+```
+
+By specifying a signature definition, we can feed and fetch any `Tensor`s in
+the `Graph`. In this example, we feed and fetch the same `Tensor`, `z`:
+
+```python
+inputs = outputs = {'z': tf.TensorInfo(
+                        name='z:0',
+                        dtype=types_pb2.DT_FLOAT,
+                        tensor_shape=tensor_shape_pb2.TensorShapeProto())}
+
+signature_def = tf.saved_model.signature_def_utils.build_signature_def(
+          inputs=inputs,
+          outputs=outputs,
+          method_name='tensorflow/serving/regress')
+
+trivial_predictor = predictor.from_saved_model(
+          export_dir=saved_model_dir,
+          signature_def=signature_def)
+
+output_dict = trivial_predictor({'z': [32.]})
+# output_dict == {'z': [32.]}
+```
+
+You can also specify input and output `Tensor`s by name using the `input_names`
+and `output_names` keywords:
+
+```python
+saved_model_predictor = predictor.from_saved_model(
+          export_dir=saved_model_dir,
+          input_names={'x': 'x:0', 'y': 'y:0'},
+          outputs={'z': 'z:0'})
+
+output_dict = saved_model_predictor({'x': [6.], 'y': [11.]})
+# output_dict == {'z': [17.]}
+```
+
+This functionality is particularly useful for performing encoding once, but
+doing multiple decoding iterations with e.g. seq2seq models.
+
+## `Predictor` from an `Estimator`
+
+We can also construct a `Predictor` directly from an `Estimator`. Defining
+`serving_input_fn` as above,
+
+```python
+estimator_predictor = predictor.from_contrib_estimator(
+    estimator, serving_input_fn)
+output_dict = sum_predictor({'x': [1., 2.], 'y': [3., 4.]})
+# output_dict == {'z': [4., 6.]}
+```
+
+Construction from a `tf.Estimator` is almost identical.
+
diff --git a/tensorflow/contrib/predictor/__init__.py b/tensorflow/contrib/predictor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..68146aea174dc65b321da2e8801cfd6f5c33c10b
--- /dev/null
+++ b/tensorflow/contrib/predictor/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Modules for `Predictor`s.
+
+@@from_contrib_estimator
+@@from_estimator
+@@from_saved_model
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.predictor.predictor_factories import from_contrib_estimator
+from tensorflow.contrib.predictor.predictor_factories import from_estimator
+from tensorflow.contrib.predictor.predictor_factories import from_saved_model
+
+from tensorflow.python.util.all_util import remove_undocumented
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/predictor/contrib_estimator_predictor.py b/tensorflow/contrib/predictor/contrib_estimator_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7a98c68e2343e9c8bb4b41556dc96bfe4ef444c
--- /dev/null
+++ b/tensorflow/contrib/predictor/contrib_estimator_predictor.py
@@ -0,0 +1,74 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""A `Predictor constructed from a `tf.contrib.learn.Estimator`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils
+from tensorflow.contrib.predictor import predictor
+from tensorflow.python.framework import ops
+from tensorflow.python.training import monitored_session
+from tensorflow.python.training import saver
+
+
+class ContribEstimatorPredictor(predictor.Predictor):
+  """A `Predictor constructed from a `tf.contrib.learn.Estimator`."""
+
+  def __init__(self,
+               estimator,
+               prediction_input_fn,
+               input_alternative_key=None,
+               output_alternative_key=None,
+               graph=None):
+    """Initialize a `ContribEstimatorPredictor`.
+
+    Args:
+      estimator: an instance of `tf.contrib.learn.Estimator`.
+      prediction_input_fn: a function that takes no arguments and returns an
+        instance of `InputFnOps`.
+      input_alternative_key: Optional. Specify the input alternative used for
+        prediction.
+      output_alternative_key: Specify the output alternative used for
+        prediction. Not needed for single-headed models but required for
+        multi-headed models.
+      graph: Optional. The Tensorflow `graph` in which prediction should be
+        done.
+    """
+    self._graph = graph or ops.Graph()
+    with self._graph.as_default():
+      input_fn_ops = prediction_input_fn()
+      # pylint: disable=protected-access
+      model_fn_ops = estimator._get_predict_ops(input_fn_ops.features)
+      # pylint: enable=protected-access
+      checkpoint_path = saver.latest_checkpoint(estimator.model_dir)
+      self._session = monitored_session.MonitoredSession(
+          session_creator=monitored_session.ChiefSessionCreator(
+              checkpoint_filename_with_path=checkpoint_path))
+
+    input_alternative_key = (
+        input_alternative_key or
+        saved_model_export_utils.DEFAULT_INPUT_ALTERNATIVE_KEY)
+    input_alternatives, _ = saved_model_export_utils.get_input_alternatives(
+        input_fn_ops)
+    self._feed_tensors = input_alternatives[input_alternative_key]
+
+    (output_alternatives,
+     output_alternative_key) = saved_model_export_utils.get_output_alternatives(
+         model_fn_ops, output_alternative_key)
+    _, fetch_tensors = output_alternatives[output_alternative_key]
+    self._fetch_tensors = fetch_tensors
diff --git a/tensorflow/contrib/predictor/contrib_estimator_predictor_test.py b/tensorflow/contrib/predictor/contrib_estimator_predictor_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b97a52b1a3d53c056a003c5b4ae3cae25a1159f
--- /dev/null
+++ b/tensorflow/contrib/predictor/contrib_estimator_predictor_test.py
@@ -0,0 +1,70 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for predictor.contrib_estimator_predictor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tempfile
+import numpy as np
+
+from tensorflow.contrib.predictor import contrib_estimator_predictor
+from tensorflow.contrib.predictor import testing_common
+from tensorflow.python.platform import test
+
+
+KEYS_AND_OPS = (('sum', lambda x, y: x + y),
+                ('product', lambda x, y: x * y,),
+                ('difference', lambda x, y: x - y))
+
+
+class ContribEstimatorPredictorTest(test.TestCase):
+  """Test fixture for `ContribEstimatorPredictor`."""
+
+  def setUp(self):
+    model_dir = tempfile.mkdtemp()
+    self._estimator = testing_common.get_arithmetic_estimator(
+        core=False, model_dir=model_dir)
+    self._prediction_input_fn = testing_common.get_arithmetic_input_fn(
+        core=False, train=False)
+
+  def testSpecifiedSignatureKey(self):
+    """Test prediction with spedicified signatures."""
+    np.random.seed(1234)
+    for key, op in KEYS_AND_OPS:
+      x = np.random.rand()
+      y = np.random.rand()
+      expected_output = op(x, y)
+
+      predictor = contrib_estimator_predictor.ContribEstimatorPredictor(
+          estimator=self._estimator,
+          prediction_input_fn=self._prediction_input_fn,
+          output_alternative_key=key)
+      output_tensor_name = predictor.fetch_tensors[key].name
+      self.assertRegexpMatches(
+          output_tensor_name,
+          key,
+          msg='Unexpected fetch tensor.')
+      output = predictor({'x': x, 'y': y})[key]
+      self.assertAlmostEqual(
+          expected_output, output, places=3,
+          msg='Failed for output key "{}." '
+          'Got output {} for x = {} and y = {}'.format(
+              key, output, x, y))
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/predictor/core_estimator_predictor.py b/tensorflow/contrib/predictor/core_estimator_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..5557ef510171b97ffa1b4345e22217dd5f17603e
--- /dev/null
+++ b/tensorflow/contrib/predictor/core_estimator_predictor.py
@@ -0,0 +1,80 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""A `Predictor` constructed from an `learn.python.estimator.Estimator`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.predictor import predictor
+from tensorflow.python.estimator import model_fn
+from tensorflow.python.framework import ops
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.training import monitored_session
+
+
+def _get_signature_def(
+    serving_input_receiver, estimator, output_key=None):
+  """Construct a `SignatureDef` proto."""
+  if output_key is None:
+    output_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+  # pylint: disable=protected-access
+  estimator_spec = estimator._call_model_fn(
+      serving_input_receiver.features, None, model_fn.ModeKeys.PREDICT)
+  # pylint: enable=protected-access
+  export_outputs = estimator_spec.export_outputs
+  export_output = export_outputs.get(output_key)
+  if export_output is None:
+    raise KeyError('output_key must be one of {}; got {}'.format(
+        export_outputs.keys(), output_key))
+  return export_output.as_signature_def(serving_input_receiver.receiver_tensors)
+
+
+class CoreEstimatorPredictor(predictor.Predictor):
+  """A `Predictor` constructed from an `learn.python.estimator.Estimator`."""
+
+  def __init__(self,
+               estimator,
+               serving_input_receiver_fn,
+               output_key=None,
+               graph=None):
+    """Initialize a `CoreEstimatorPredictor`.
+
+    Args:
+      estimator: an instance of `learn.python.estimator.Estimator`.
+      serving_input_receiver_fn: a function that takes no arguments and returns
+        an instance of `ServingInputReceiver` compatible with `estimator`.
+      output_key: Optional string specifying the export output to use. If
+        `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used.
+      graph: Optional. The Tensorflow `graph` in which prediction should be
+        done.
+    """
+    self._graph = graph or ops.Graph()
+    with self._graph.as_default():
+      serving_input_receiver = serving_input_receiver_fn()
+      signature_def = _get_signature_def(
+          serving_input_receiver, estimator, output_key)
+      checkpoint_path = estimator.model_dir
+      self._session = monitored_session.MonitoredSession(
+          session_creator=monitored_session.ChiefSessionCreator(
+              checkpoint_filename_with_path=checkpoint_path))
+
+    feed_tensor_info = signature_def.inputs
+    self._feed_tensors = {k: self._graph.get_tensor_by_name(v.name)
+                          for k, v in feed_tensor_info.items()}
+    fetch_tensor_info = signature_def.outputs
+    self._fetch_tensors = {k: self._graph.get_tensor_by_name(v.name)
+                           for k, v in fetch_tensor_info.items()}
diff --git a/tensorflow/contrib/predictor/core_estimator_predictor_test.py b/tensorflow/contrib/predictor/core_estimator_predictor_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4221086794426bd0f58353650d5c4e3e14367b1e
--- /dev/null
+++ b/tensorflow/contrib/predictor/core_estimator_predictor_test.py
@@ -0,0 +1,81 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for predictor.core_estimator_predictor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tempfile
+import numpy as np
+
+from tensorflow.contrib.predictor import core_estimator_predictor
+from tensorflow.contrib.predictor import testing_common
+from tensorflow.python.platform import test
+
+
+KEYS_AND_OPS = (('sum', lambda x, y: x + y),
+                ('product', lambda x, y: x * y,),
+                ('difference', lambda x, y: x - y))
+
+
+class CoreEstimatorPredictorTest(test.TestCase):
+  """Test fixture for `CoreEstimatorPredictor`."""
+
+  def setUp(self):
+    model_dir = tempfile.mkdtemp()
+    self._estimator = testing_common.get_arithmetic_estimator(
+        core=True, model_dir=model_dir)
+    self._serving_input_receiver_fn = testing_common.get_arithmetic_input_fn(
+        core=True, train=False)
+
+  def testDefault(self):
+    """Test prediction with default signature."""
+    np.random.seed(1111)
+    x = np.random.rand()
+    y = np.random.rand()
+    predictor = core_estimator_predictor.CoreEstimatorPredictor(
+        estimator=self._estimator,
+        serving_input_receiver_fn=self._serving_input_receiver_fn)
+    output = predictor({'x': x, 'y': y})['sum']
+    self.assertAlmostEqual(output, x + y, places=3)
+
+  def testSpecifiedSignatureKey(self):
+    """Test prediction with spedicified signatures."""
+    np.random.seed(1234)
+    for output_key, op in KEYS_AND_OPS:
+      x = np.random.rand()
+      y = np.random.rand()
+      expected_output = op(x, y)
+
+      predictor = core_estimator_predictor.CoreEstimatorPredictor(
+          estimator=self._estimator,
+          serving_input_receiver_fn=self._serving_input_receiver_fn,
+          output_key=output_key)
+      output_tensor_name = predictor.fetch_tensors[output_key].name
+      self.assertRegexpMatches(
+          output_tensor_name,
+          output_key,
+          msg='Unexpected fetch tensor.')
+      output = predictor({'x': x, 'y': y})[output_key]
+      self.assertAlmostEqual(
+          expected_output, output, places=3,
+          msg='Failed for output key "{}." '
+          'Got output {} for x = {} and y = {}'.format(
+              output_key, output, x, y))
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/predictor/predictor.py b/tensorflow/contrib/predictor/predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbc0028259ebe50bdbe8dee9ef3ccff1aff5507c
--- /dev/null
+++ b/tensorflow/contrib/predictor/predictor.py
@@ -0,0 +1,77 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Abstract base class for all predictors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import six
+
+
+@six.add_metaclass(abc.ABCMeta)
+class Predictor(object):
+  """Abstract base class for all predictors."""
+
+  @property
+  def graph(self):
+    return self._graph
+
+  @property
+  def session(self):
+    return self._session
+
+  @property
+  def feed_tensors(self):
+    return self._feed_tensors
+
+  @property
+  def fetch_tensors(self):
+    return self._fetch_tensors
+
+  def __repr__(self):
+    return '{} with feed tensors {} and fetch_tensors {}'.format(
+        type(self).__name__, self._feed_tensors, self._fetch_tensors)
+
+  def __call__(self, input_dict):
+    """Returns predictions based on `input_dict`.
+
+    Args:
+      input_dict: a `dict` mapping strings to numpy arrays. These keys
+        must match `self._feed_tensors.keys()`.
+
+    Returns:
+      A `dict` mapping strings to numpy arrays. The keys match
+      `self.fetch_tensors.keys()`.
+
+    Raises:
+      ValueError: `input_dict` does not match `feed_tensors`.
+    """
+    # TODO(jamieas): make validation optional?
+    input_keys = set(input_dict.keys())
+    expected_keys = set(self.feed_tensors.keys())
+    unexpected_keys = input_keys - expected_keys
+    if unexpected_keys:
+      raise ValueError('Got unexpected keys in input_dict: {}'.format(
+          unexpected_keys))
+
+    feed_dict = {}
+    for key in self.feed_tensors.keys():
+      value = input_dict.get(key)
+      if value is not None:
+        feed_dict[self.feed_tensors[key]] = value
+    return self._session.run(fetches=self.fetch_tensors, feed_dict=feed_dict)
diff --git a/tensorflow/contrib/predictor/predictor_factories.py b/tensorflow/contrib/predictor/predictor_factories.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3f30d917d637d2e2d821a727e12b8d0b54942df
--- /dev/null
+++ b/tensorflow/contrib/predictor/predictor_factories.py
@@ -0,0 +1,132 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Factory functions for `Predictor`s."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.predictor import contrib_estimator_predictor
+from tensorflow.contrib.predictor import core_estimator_predictor
+from tensorflow.contrib.predictor import saved_model_predictor
+from tensorflow.python.estimator import estimator as core_estimator
+
+
+def from_contrib_estimator(estimator,
+                           prediction_input_fn,
+                           input_alternative_key=None,
+                           output_alternative_key=None,
+                           graph=None):
+  """Constructs a `Predictor` from a `tf.contrib.learn.Estimator`.
+
+  Args:
+    estimator: an instance of `tf.contrib.learn.Estimator`.
+    prediction_input_fn: a function that takes no arguments and returns an
+      instance of `InputFnOps`.
+    input_alternative_key: Optional. Specify the input alternative used for
+      prediction.
+    output_alternative_key: Specify the output alternative used for
+      prediction. Not needed for single-headed models but required for
+      multi-headed models.
+    graph: Optional. The Tensorflow `graph` in which prediction should be
+      done.
+
+  Returns:
+    An initialized `Predictor`.
+
+  Raises:
+    TypeError: if `estimator` is a core `Estimator` instead of a contrib
+      `Estimator`.
+  """
+  if isinstance(estimator, core_estimator.Estimator):
+    raise TypeError('Espected estimator to be of type '
+                    'tf.contrib.learn.Estimator, but got type '
+                    'tf.python.estimator.Estimator. You likely want to call '
+                    'from_estimator.')
+  return contrib_estimator_predictor.ContribEstimatorPredictor(
+      estimator,
+      prediction_input_fn,
+      input_alternative_key,
+      output_alternative_key,
+      graph)
+
+
+def from_estimator(estimator,
+                   serving_input_receiver_fn,
+                   output_key=None,
+                   graph=None):
+  """Constructs a `Predictor` from a `tf.python.estimator.Estimator`.
+
+  Args:
+    estimator: an instance of `learn.python.estimator.Estimator`.
+    serving_input_receiver_fn: a function that takes no arguments and returns
+      an instance of `ServingInputReceiver` compatible with `estimator`.
+    output_key: Optional string specifying the export output to use. If
+      `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used.
+    graph: Optional. The Tensorflow `graph` in which prediction should be
+      done.
+
+  Returns:
+    An initialized `Predictor`.
+
+  Raises:
+    TypeError: if `estimator` is a contrib `Estimator` instead of a core
+      `Estimator`.
+  """
+  if isinstance(estimator, estimator.Estimator):
+    raise TypeError('Espected estimator to be of type '
+                    'tf.python.estimator.Estimator, but got type '
+                    'tf.contrib.learn.Estimator. You likely want to call '
+                    'from_contrib_estimator.')
+  return core_estimator_predictor.CoreEstimatorPredictor(
+      estimator,
+      serving_input_receiver_fn,
+      output_key,
+      graph)
+
+
+def from_saved_model(export_dir,
+                     signature_def_key=None,
+                     signature_def=None,
+                     tags=None,
+                     graph=None):
+  """Constructs a `Predictor` from a `SavedModel` on disk.
+
+  Args:
+    export_dir: a path to a directory containing a `SavedModel`.
+    signature_def_key: Optional string specifying the signature to use. If
+      `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used. Only one of
+    `signature_def_key` and `signature_def`
+    signature_def: A `SignatureDef` proto specifying the inputs and outputs
+      for prediction. Only one of `signature_def_key` and `signature_def`
+      should be specified.
+    tags: Optional. Tags that will be used to retrieve the correct
+      `SignatureDef`. Defaults to `DEFAULT_TAGS`.
+    graph: Optional. The Tensorflow `graph` in which prediction should be
+      done.
+
+  Returns:
+    An initialized `Predictor`.
+
+  Raises:
+    ValueError: More than one of `signature_def_key` and `signature_def` is
+      specified.
+  """
+  return saved_model_predictor.SavedModelPredictor(export_dir,
+                                                   signature_def_key,
+                                                   signature_def,
+                                                   tags,
+                                                   graph)
diff --git a/tensorflow/contrib/predictor/saved_model_predictor.py b/tensorflow/contrib/predictor/saved_model_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dbca0f8136e4e618234101ee41c80bc085511c0
--- /dev/null
+++ b/tensorflow/contrib/predictor/saved_model_predictor.py
@@ -0,0 +1,167 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""A `Predictor` constructed from a `SavedModel`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import logging
+
+from tensorflow.contrib.predictor import predictor
+from tensorflow.contrib.saved_model.python.saved_model import reader
+from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import signature_constants
+
+
+DEFAULT_TAGS = 'serve'
+
+_DEFAULT_INPUT_ALTERNATIVE_FORMAT = 'default_input_alternative:{}'
+
+
+def get_meta_graph_def(saved_model_dir, tags):
+  """Gets `MetaGraphDef` from a directory containing a `SavedModel`.
+
+  Returns the `MetaGraphDef` for the given tag-set and SavedModel directory.
+
+  Args:
+    saved_model_dir: Directory containing the SavedModel.
+    tags: Comma separated list of tags used to identify the correct
+      `MetaGraphDef`.
+
+  Raises:
+    ValueError: An error when the given tags cannot be found.
+
+  Returns:
+    A `MetaGraphDef` corresponding to the given tags.
+  """
+  saved_model = reader.read_saved_model(saved_model_dir)
+  set_of_tags = set([tag.strip() for tag in tags.split(',')])
+  for meta_graph_def in saved_model.meta_graphs:
+    if set(meta_graph_def.meta_info_def.tags) == set_of_tags:
+      return meta_graph_def
+  raise ValueError('Could not find MetaGraphDef with tags {}'.format(tags))
+
+
+def _get_signature_def(signature_def_key, export_dir, tags):
+  """Construct a `SignatureDef` proto."""
+  signature_def_key = (
+      signature_def_key or
+      signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY)
+
+  metagraph_def = get_meta_graph_def(export_dir, tags)
+
+  try:
+    signature_def = signature_def_utils.get_signature_def_by_key(
+        metagraph_def,
+        signature_def_key)
+  except ValueError as e:
+    try:
+      formatted_key = _DEFAULT_INPUT_ALTERNATIVE_FORMAT.format(
+          signature_def_key)
+      signature_def = signature_def_utils.get_signature_def_by_key(
+          metagraph_def, formatted_key)
+
+      logging.warning('Could not find signature def "%s". '
+                      'Using "%s" instead', signature_def_key, formatted_key)
+    except ValueError:
+      raise ValueError(
+          'Got signature_def_key "{}". Available signatures are {}. '
+          'Original error:\n{}'.format(
+              signature_def_key, list(metagraph_def.signature_def), e))
+  return signature_def
+
+
+def _check_signature_arguments(signature_def_key,
+                               signature_def,
+                               input_names,
+                               output_names):
+  """Validates signature arguments for `SavedModelPredictor`."""
+  signature_def_key_specified = signature_def_key is not None
+  signature_def_specified = signature_def is not None
+  input_names_specified = input_names is not None
+  output_names_specified = output_names is not None
+  if input_names_specified != output_names_specified:
+    raise ValueError(
+        'input_names and output_names must both be specified or both be '
+        'unspecified.'
+    )
+
+  if (signature_def_key_specified + signature_def_specified +
+      input_names_specified > 1):
+    raise ValueError(
+        'You must specify at most one of signature_def_key OR signature_def OR'
+        '(input_names AND output_names).'
+    )
+
+
+class SavedModelPredictor(predictor.Predictor):
+  """A `Predictor` constructed from a `SavedModel`."""
+
+  def __init__(self,
+               export_dir,
+               signature_def_key=None,
+               signature_def=None,
+               input_names=None,
+               output_names=None,
+               tags=None,
+               graph=None):
+    """Initialize a `CoreEstimatorPredictor`.
+
+    Args:
+      export_dir: a path to a directory containing a `SavedModel`.
+      signature_def_key: Optional string specifying the signature to use. If
+        `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used. Only one of
+        `signature_def_key` and `signature_def` should be specified.
+      signature_def: A `SignatureDef` proto specifying the inputs and outputs
+        for prediction. Only one of `signature_def_key` and `signature_def`
+        should be specified.
+      input_names: A dictionary mapping strings to `Tensor`s in the `SavedModel`
+        that represent the input. The keys can be any string of the user's
+        choosing.
+      output_names: A dictionary mapping strings to `Tensor`s in the
+        `SavedModel` that represent the output. The keys can be any string of
+        the user's choosing.
+      tags: Optional. Comma separated list of tags that will be used to retrieve
+        the correct `SignatureDef`. Defaults to `DEFAULT_TAGS`.
+      graph: Optional. The Tensorflow `graph` in which prediction should be
+        done.
+    Raises:
+      ValueError: If more than one of signature_def_key OR signature_def OR
+        (input_names AND output_names) is specified.
+    """
+    _check_signature_arguments(
+        signature_def_key, signature_def, input_names, output_names)
+    tags = tags or DEFAULT_TAGS
+    self._graph = graph or ops.Graph()
+
+    with self._graph.as_default():
+      self._session = session.Session()
+      loader.load(self._session, tags.split(','), export_dir)
+
+    if input_names is None:
+      if signature_def is None:
+        signature_def = _get_signature_def(signature_def_key, export_dir, tags)
+      input_names = {k: v.name for k, v in signature_def.inputs.items()}
+      output_names = {k: v.name for k, v in signature_def.outputs.items()}
+
+    self._feed_tensors = {k: self._graph.get_tensor_by_name(v)
+                          for k, v in input_names.items()}
+    self._fetch_tensors = {k: self._graph.get_tensor_by_name(v)
+                           for k, v in output_names.items()}
diff --git a/tensorflow/contrib/predictor/saved_model_predictor_test.py b/tensorflow/contrib/predictor/saved_model_predictor_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f40e2e73d99dde67f2104194f758113925ebe10d
--- /dev/null
+++ b/tensorflow/contrib/predictor/saved_model_predictor_test.py
@@ -0,0 +1,170 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for predictor.saved_model_predictor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.predictor import saved_model_predictor
+from tensorflow.core.framework import tensor_shape_pb2
+from tensorflow.core.framework import types_pb2
+from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import signature_def_utils
+
+
+KEYS_AND_OPS = (('sum', lambda x, y: x + y),
+                ('product', lambda x, y: x * y,),
+                ('difference', lambda x, y: x - y))
+
+MODEL_DIR_NAME = 'contrib/predictor/test_export_dir'
+
+
+class SavedModelPredictorTest(test.TestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    # Load a saved model exported from the arithmetic `Estimator`.
+    # See `testing_common.py`.
+    cls._export_dir = test.test_src_dir_path(MODEL_DIR_NAME)
+
+  def testDefault(self):
+    """Test prediction with default signature."""
+    np.random.seed(1111)
+    x = np.random.rand()
+    y = np.random.rand()
+    predictor = saved_model_predictor.SavedModelPredictor(
+        export_dir=self._export_dir)
+    output = predictor({'x': x, 'y': y})['outputs']
+    self.assertAlmostEqual(output, x + y, places=3)
+
+  def testSpecifiedSignatureKey(self):
+    """Test prediction with spedicified signature key."""
+    np.random.seed(1234)
+    for signature_def_key, op in KEYS_AND_OPS:
+      x = np.random.rand()
+      y = np.random.rand()
+      expected_output = op(x, y)
+
+      predictor = saved_model_predictor.SavedModelPredictor(
+          export_dir=self._export_dir,
+          signature_def_key=signature_def_key)
+
+      output_tensor_name = predictor.fetch_tensors['outputs'].name
+      self.assertRegexpMatches(
+          output_tensor_name,
+          signature_def_key,
+          msg='Unexpected fetch tensor.')
+
+      output = predictor({'x': x, 'y': y})['outputs']
+      self.assertAlmostEqual(
+          expected_output, output, places=3,
+          msg='Failed for signature "{}." '
+          'Got output {} for x = {} and y = {}'.format(
+              signature_def_key, output, x, y))
+
+  def testSpecifiedSignature(self):
+    """Test prediction with spedicified signature definition."""
+    np.random.seed(4444)
+    for key, op in KEYS_AND_OPS:
+      x = np.random.rand()
+      y = np.random.rand()
+      expected_output = op(x, y)
+
+      inputs = {
+          'x': meta_graph_pb2.TensorInfo(
+              name='inputs/x:0',
+              dtype=types_pb2.DT_FLOAT,
+              tensor_shape=tensor_shape_pb2.TensorShapeProto()),
+          'y': meta_graph_pb2.TensorInfo(
+              name='inputs/y:0',
+              dtype=types_pb2.DT_FLOAT,
+              tensor_shape=tensor_shape_pb2.TensorShapeProto())}
+      outputs = {
+          key: meta_graph_pb2.TensorInfo(
+              name='outputs/{}:0'.format(key),
+              dtype=types_pb2.DT_FLOAT,
+              tensor_shape=tensor_shape_pb2.TensorShapeProto())}
+      signature_def = signature_def_utils.build_signature_def(
+          inputs=inputs,
+          outputs=outputs,
+          method_name='tensorflow/serving/regress')
+      predictor = saved_model_predictor.SavedModelPredictor(
+          export_dir=self._export_dir,
+          signature_def=signature_def)
+
+      output_tensor_name = predictor.fetch_tensors[key].name
+      self.assertRegexpMatches(
+          output_tensor_name,
+          key,
+          msg='Unexpected fetch tensor.')
+
+      output = predictor({'x': x, 'y': y})[key]
+      self.assertAlmostEqual(
+          expected_output, output, places=3,
+          msg='Failed for signature "{}". '
+          'Got output {} for x = {} and y = {}'.format(key, output, x, y))
+
+  def testSpecifiedTensors(self):
+    """Test prediction with spedicified `Tensor`s."""
+    np.random.seed(987)
+    for key, op in KEYS_AND_OPS:
+      x = np.random.rand()
+      y = np.random.rand()
+      expected_output = op(x, y)
+      input_names = {'x': 'inputs/x:0',
+                     'y': 'inputs/y:0'}
+      output_names = {key: 'outputs/{}:0'.format(key)}
+      predictor = saved_model_predictor.SavedModelPredictor(
+          export_dir=self._export_dir,
+          input_names=input_names,
+          output_names=output_names)
+
+      output_tensor_name = predictor.fetch_tensors[key].name
+      self.assertRegexpMatches(
+          output_tensor_name,
+          key,
+          msg='Unexpected fetch tensor.')
+
+      output = predictor({'x': x, 'y': y})[key]
+      self.assertAlmostEqual(
+          expected_output, output, places=3,
+          msg='Failed for signature "{}". '
+          'Got output {} for x = {} and y = {}'.format(key, output, x, y))
+
+  def testBadTagsFail(self):
+    """Test that predictor construction fails for bad tags."""
+    bad_tags_regex = ('.* could not be found in SavedModel')
+    with self.assertRaisesRegexp(RuntimeError, bad_tags_regex):
+      _ = saved_model_predictor.SavedModelPredictor(
+          export_dir=self._export_dir,
+          tags=('zomg, bad, tags'))
+
+  def testSpecifiedGraph(self):
+    """Test that the predictor remembers a specified `Graph`."""
+    g = ops.Graph()
+    predictor = saved_model_predictor.SavedModelPredictor(
+        export_dir=self._export_dir,
+        graph=g)
+    self.assertEqual(predictor.graph, g)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/predictor/test_export_dir/saved_model.pb b/tensorflow/contrib/predictor/test_export_dir/saved_model.pb
new file mode 100644
index 0000000000000000000000000000000000000000..9100fefb720673429a40dd0d6a31ed6fa1bc5bfa
Binary files /dev/null and b/tensorflow/contrib/predictor/test_export_dir/saved_model.pb differ
diff --git a/tensorflow/contrib/predictor/test_export_dir/variables/variables.data-00000-of-00001 b/tensorflow/contrib/predictor/test_export_dir/variables/variables.data-00000-of-00001
new file mode 100644
index 0000000000000000000000000000000000000000..1b1cb4d44c57c2d7a5122870fa6ac3e62ff7e94e
Binary files /dev/null and b/tensorflow/contrib/predictor/test_export_dir/variables/variables.data-00000-of-00001 differ
diff --git a/tensorflow/contrib/predictor/test_export_dir/variables/variables.index b/tensorflow/contrib/predictor/test_export_dir/variables/variables.index
new file mode 100644
index 0000000000000000000000000000000000000000..dd32e9b71b3fb752f8a1965427ae4c682089a28a
Binary files /dev/null and b/tensorflow/contrib/predictor/test_export_dir/variables/variables.index differ
diff --git a/tensorflow/contrib/predictor/testing_common.py b/tensorflow/contrib/predictor/testing_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..1767704b9931e12b687ef07010f7f2ee4ed8c54c
--- /dev/null
+++ b/tensorflow/contrib/predictor/testing_common.py
@@ -0,0 +1,102 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Common code used for testing `Predictor`s."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.learn.python.learn.estimators import constants
+from tensorflow.contrib.learn.python.learn.estimators import estimator as contrib_estimator
+from tensorflow.contrib.learn.python.learn.estimators import model_fn as contrib_model_fn
+from tensorflow.contrib.learn.python.learn.utils import input_fn_utils
+from tensorflow.python.estimator import estimator as core_estimator
+from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator.export import export_lib
+from tensorflow.python.estimator.export import export_output
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.saved_model import signature_constants
+
+
+def get_arithmetic_estimator(core=True, model_dir=None):
+  """Returns an `Estimator` that performs basic arithmetic.
+
+  Args:
+    core: if `True`, returns a `tensorflow.python.estimator.Estimator`.
+      Otherwise, returns a `tensorflow.contrib.learn.Estimator`.
+    model_dir: directory in which to export checkpoints and saved models.
+  Returns:
+    An `Estimator` that performs arithmetic operations on its inputs.
+  """
+  def _model_fn(features, labels, mode):
+    _ = labels
+    x = features['x']
+    y = features['y']
+    with ops.name_scope('outputs'):
+      predictions = {'sum': math_ops.add(x, y, name='sum'),
+                     'product': math_ops.multiply(x, y, name='product'),
+                     'difference': math_ops.subtract(x, y, name='difference')}
+    if core:
+      export_outputs = {k: export_output.PredictOutput({k: v})
+                        for k, v in predictions.items()}
+      export_outputs[signature_constants.
+                     DEFAULT_SERVING_SIGNATURE_DEF_KEY] = export_outputs['sum']
+      return model_fn.EstimatorSpec(mode=mode,
+                                    predictions=predictions,
+                                    export_outputs=export_outputs,
+                                    loss=constant_op.constant(0),
+                                    train_op=control_flow_ops.no_op())
+    else:
+      output_alternatives = {k: (constants.ProblemType.UNSPECIFIED, {k: v})
+                             for k, v in predictions.items()}
+      return contrib_model_fn.ModelFnOps(
+          mode=mode,
+          predictions=predictions,
+          output_alternatives=output_alternatives,
+          loss=constant_op.constant(0),
+          train_op=control_flow_ops.no_op())
+  if core:
+    return core_estimator.Estimator(_model_fn)
+  else:
+    return contrib_estimator.Estimator(_model_fn, model_dir=model_dir)
+
+
+def get_arithmetic_input_fn(core=True, train=False):
+  """Returns a input functions or serving input receiver function."""
+  def _input_fn():
+    with ops.name_scope('inputs'):
+      x = array_ops.placeholder_with_default(0.0, shape=[], name='x')
+      y = array_ops.placeholder_with_default(0.0, shape=[], name='y')
+    label = constant_op.constant(0.0)
+    features = {'x': x, 'y': y}
+    if core:
+      if train:
+        return features, label
+      return export_lib.ServingInputReceiver(
+          features=features,
+          receiver_tensors=features)
+    else:
+      if train:
+        return features, label
+      return input_fn_utils.InputFnOps(
+          features=features,
+          labels={},
+          default_inputs=features)
+  return _input_fn
diff --git a/tensorflow/contrib/quantization/BUILD b/tensorflow/contrib/quantization/BUILD
index b1d12cc510a57c6777ec4e2b8611f6bb1e81e929..c19a31afb2a1a86159eae5c94bbd83daa28caaeb 100644
--- a/tensorflow/contrib/quantization/BUILD
+++ b/tensorflow/contrib/quantization/BUILD
@@ -35,13 +35,10 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:array_ops",
         "//tensorflow/python:array_ops_gen",
-        "//tensorflow/python:framework",
+        "//tensorflow/python:common_shapes",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:math_ops",
         "//tensorflow/python:math_ops_gen",
-        "//tensorflow/python:nn_ops",
         "//tensorflow/python:nn_ops_gen",
     ],
 )
diff --git a/tensorflow/contrib/remote_fused_graph/README.md b/tensorflow/contrib/remote_fused_graph/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..267cfa101924b3c989546ef8e92cef5339c3fee0
--- /dev/null
+++ b/tensorflow/contrib/remote_fused_graph/README.md
@@ -0,0 +1,8 @@
+# Remote Fused Graph
+
+## Description
+
+This module contains libraries for remote fused graph utilities
+
+Maintainers:
+- Satoshi Kataoka (satok@google.com, github.com/satok16)
diff --git a/tensorflow/contrib/remote_fused_graph/pylib/BUILD b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..27f0a7f58f78135f1d73ae04bd1e76ef496fa549
--- /dev/null
+++ b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
@@ -0,0 +1,62 @@
+# Description:
+# Contains ops for remote fused graph
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+
+tf_gen_op_wrapper_py(
+    name = "gen_remote_fused_graph_ops",
+    out = "python/ops/gen_remote_fused_graph_ops.py",
+    deps = [
+        "//tensorflow/core:remote_fused_graph_ops_op_lib",
+    ],
+)
+
+py_library(
+    name = "remote_fused_graph_ops_py",
+    srcs = ["__init__.py"] + glob(["python/ops/*.py"]),
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gen_remote_fused_graph_ops",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "remote_fused_graph_ops_test",
+    size = "small",
+    srcs = ["python/ops/remote_fused_graph_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_windows"],
+    deps = [
+        ":remote_fused_graph_ops_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/remote_fused_graph/pylib/__init__.py b/tensorflow/contrib/remote_fused_graph/pylib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d23c38932ec1f48f5ff55f275ca99b9f695a3c7
--- /dev/null
+++ b/tensorflow/contrib/remote_fused_graph/pylib/__init__.py
@@ -0,0 +1,33 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Remote fused graph ops python library.
+
+## This package provides classes for remote fused graph ops.
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,wildcard-import, line-too-long
+from tensorflow.contrib.remote_fused_graph.pylib.python.ops.remote_fused_graph_ops import *
+# pylint: enable=unused-import,wildcard-import,line-too-long
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = ['remote_fused_graph_execute']
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/tools/ci_build/install/install_tensorboard_packages.sh b/tensorflow/contrib/remote_fused_graph/pylib/python/__init__.py
old mode 100755
new mode 100644
similarity index 65%
rename from tensorflow/tools/ci_build/install/install_tensorboard_packages.sh
rename to tensorflow/contrib/remote_fused_graph/pylib/python/__init__.py
index ca5092cd4757a8affa892dd7f9b8a3ea774d42fd..b66091f875903dccaf463f9b0d7c7cd7ac2bf870
--- a/tensorflow/tools/ci_build/install/install_tensorboard_packages.sh
+++ b/tensorflow/contrib/remote_fused_graph/pylib/python/__init__.py
@@ -1,5 +1,4 @@
-#!/usr/bin/env bash
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,17 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""Remote fused graph ops python library."""
 
-set -e
-
-# Install dependencies from ubuntu deb repository.
-apt-get update
-apt-get install -y --no-install-recommends \
-    chromium-browser \
-    nodejs \
-    nodejs-legacy \
-    npm \
-    python-numpy \
-    xvfb
-apt-get clean
-rm -rf /var/lib/apt/lists/*
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/remote_fused_graph/pylib/python/ops/__init__.py b/tensorflow/contrib/remote_fused_graph/pylib/python/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b66091f875903dccaf463f9b0d7c7cd7ac2bf870
--- /dev/null
+++ b/tensorflow/contrib/remote_fused_graph/pylib/python/ops/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Remote fused graph ops python library."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/remote_fused_graph/pylib/python/ops/remote_fused_graph_ops.py b/tensorflow/contrib/remote_fused_graph/pylib/python/ops/remote_fused_graph_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..2054367f0d1461c8868e3332d82322a8a3dd38af
--- /dev/null
+++ b/tensorflow/contrib/remote_fused_graph/pylib/python/ops/remote_fused_graph_ops.py
@@ -0,0 +1,66 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operations to execute a subgraph on a remote processor."""
+
+# pylint: disable=g-bad-name
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,wildcard-import, line-too-long
+from tensorflow.contrib.remote_fused_graph.pylib.python.ops import gen_remote_fused_graph_ops
+from tensorflow.core.framework import remote_fused_graph_execute_info_pb2 as info_pb2
+# pylint: enable=unused-import,wildcard-import,line-too-long
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+
+# RemoteFusedGraphExecute is not differenciable op.
+ops.NotDifferentiable("RemoteFusedGraphExecute")
+
+
+def remote_fused_graph_execute(inputs,
+                               output_types,
+                               graph_def,
+                               graph_input_node_names,
+                               graph_output_node_names,
+                               executor_name,
+                               serialized_executor_parameters,
+                               default_graph_input_tensor_type_shapes=None,
+                               default_graph_output_tensor_type_shapes=None):
+  """A wrapper for remote_fused_graph_execute."""
+  info_proto = info_pb2.RemoteFusedGraphExecuteInfo()
+  info_proto.remote_graph.CopyFrom(graph_def)
+  info_proto.graph_input_node_name.extend(graph_input_node_names)
+  info_proto.graph_output_node_name.extend(graph_output_node_names)
+  info_proto.executor_name = executor_name
+  info_proto.serialized_executor_parameters = serialized_executor_parameters
+  if default_graph_input_tensor_type_shapes:
+    for type_shape in default_graph_input_tensor_type_shapes:
+      type_shape_proto = info_proto.default_graph_input_tensor_shape.add()
+      type_shape_proto.dtype = int(dtypes.as_dtype(type_shape[0]))
+      for dim in type_shape[1]:
+        type_shape_proto.shape.dim.add().size = dim
+  if default_graph_output_tensor_type_shapes:
+    for type_shape in default_graph_output_tensor_type_shapes:
+      type_shape_proto = info_proto.default_graph_output_tensor_shape.add()
+      type_shape_proto.dtype = int(dtypes.as_dtype(type_shape[0]))
+      for dim in type_shape[1]:
+        type_shape_proto.shape.dim.add().size = dim
+
+  serialized_info = info_proto.SerializeToString()
+
+  return gen_remote_fused_graph_ops.remote_fused_graph_execute(
+      inputs, output_types, serialized_info)
diff --git a/tensorflow/contrib/remote_fused_graph/pylib/python/ops/remote_fused_graph_ops_test.py b/tensorflow/contrib/remote_fused_graph/pylib/python/ops/remote_fused_graph_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..45df9091482b610868a2769e3ebd7c925342a883
--- /dev/null
+++ b/tensorflow/contrib/remote_fused_graph/pylib/python/ops/remote_fused_graph_ops_test.py
@@ -0,0 +1,66 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.remote_fused_graph_ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+# pylint: disable=unused-import,wildcard-import, line-too-long
+from tensorflow.contrib.remote_fused_graph.pylib.python.ops import remote_fused_graph_ops
+# pylint: enable=unused-import,wildcard-import,line-too-long
+
+from tensorflow.core.framework import graph_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import googletest
+
+
+class RemoteFusedGraphExecuteTest(test_util.TensorFlowTestCase):
+  """Tests for RemoteFusedGraphExecute op."""
+
+  def testBuild(self):
+    graph = graph_pb2.GraphDef()
+    node = graph.node.add()
+    node.name = "a"
+    node.op = "op0"
+    node = graph.node.add()
+    node.name = "b"
+    node.op = "op1"
+    inputs = [ops.convert_n_to_tensor([1], dtypes.int64)]
+    output_types = [np.int64, np.int64]
+    graph_input_node_names = ["a"]
+    graph_output_node_names = ["a", "b"]
+    executor_name = ""
+    serialized_executor_parameters = b""
+    default_graph_input_tensor_type_shapes = [[dtypes.int64, [1]]]
+    default_graph_output_tensor_type_shapes = [[dtypes.int64, [1]],
+                                               [dtypes.int64, [1]]]
+
+    output_nodes = remote_fused_graph_ops.remote_fused_graph_execute(
+        inputs, output_types, graph, graph_input_node_names,
+        graph_output_node_names, executor_name, serialized_executor_parameters,
+        default_graph_input_tensor_type_shapes,
+        default_graph_output_tensor_type_shapes)
+    self.assertEqual(2, len(output_nodes))
+    for output_node in output_nodes:
+      with self.test_session(use_gpu=False):
+        output_node.eval()
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/resampler/BUILD b/tensorflow/contrib/resampler/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..1b9efd1ecd7d4807fe04b52f2f4148e95fce9a8c
--- /dev/null
+++ b/tensorflow/contrib/resampler/BUILD
@@ -0,0 +1,92 @@
+licenses(["notice"])  # Apache 2.0 License
+
+exports_files(["LICENSE"])
+
+package(default_visibility = ["//visibility:public"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_custom_op_library",
+    "tf_custom_op_py_library",
+    "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+    "tf_kernel_library",
+)
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+tf_custom_op_py_library(
+    name = "resampler_py",
+    srcs = ["__init__.py"] + glob(["python/ops/*.py"]),
+    dso = [":python/ops/_resampler_ops.so"],
+    kernels = [
+        ":resampler_ops_kernels",
+        ":resampler_ops_op_lib",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":resampler_ops",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:util",
+    ],
+)
+
+tf_kernel_library(
+    name = "resampler_ops_kernels",
+    prefix = "resampler_ops",
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+    alwayslink = 1,
+)
+
+tf_custom_op_library(
+    name = "python/ops/_resampler_ops.so",
+    srcs = [
+        "kernels/resampler_ops.cc",
+        "kernels/resampler_ops.h",
+        "ops/resampler_ops.cc",
+    ],
+    gpu_srcs = [
+        "kernels/resampler_ops_gpu.cu.cc",
+        "kernels/resampler_ops.h",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = [
+        "resampler_ops",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "resampler_ops",
+    deps = [":resampler_ops_op_lib"],
+)
+
+cuda_py_test(
+    name = "resampler_ops_test",
+    size = "small",
+    srcs = ["python/ops/resampler_ops_test.py"],
+    additional_deps = [
+        ":resampler_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:array_ops",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/resampler/__init__.py b/tensorflow/contrib/resampler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e04e5762da56cac58830095e866f00126da0d08
--- /dev/null
+++ b/tensorflow/contrib/resampler/__init__.py
@@ -0,0 +1,26 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops and modules related to resampler."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+# pylint: disable=wildcard-import
+from tensorflow.contrib.resampler.python.ops.resampler_ops import *
+from tensorflow.python.util.all_util import remove_undocumented
+
+remove_undocumented(__name__, ["resampler"])
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.cc b/tensorflow/contrib/resampler/kernels/resampler_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..afc8bcd4462bbfd7c7f87480a795088ada35365f
--- /dev/null
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops.cc
@@ -0,0 +1,465 @@
+// Copyright 2017 The Sonnet Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or  implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/contrib/resampler/kernels/resampler_ops.h"
+
+#include <algorithm>
+#include <cmath>
+#include <memory>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
+
+namespace functor {
+
+template <typename T>
+struct Resampler2DFunctor<CPUDevice, T>{
+  void operator ()(::tensorflow::OpKernelContext* ctx,
+                   const CPUDevice& d,
+                   const T* __restrict__ data,
+                   const T* __restrict__ warp,
+                   T* __restrict__ output,
+                   const int batch_size,
+                   const int data_height,
+                   const int data_width,
+                   const int data_channels,
+                   const int num_sampling_points){
+    const int warp_batch_stride = num_sampling_points * 2;
+    const int data_batch_stride = data_height * data_width * data_channels;
+    const int output_batch_stride = num_sampling_points * data_channels;
+    const T zero = static_cast<T>(0.0);
+    const T one = static_cast<T>(1.0);
+
+    auto resample_batches = [&](const int start, const int limit) {
+      for (int batch_id = start; batch_id < limit; ++batch_id) {
+        // Utility lambda to access data point and set output values.
+        // The functions take care of performing the relevant pointer
+        // arithmetics abstracting away the low level details in the
+        // main loop over samples. Note that data is stored in NHWC format.
+        auto set_output = [&](const int sample_id,
+                              const int channel,
+                              const T value) {
+          output[batch_id * output_batch_stride +
+                 sample_id * data_channels +
+                 channel] = value;
+        };
+
+        auto get_data_point = [&](const int x,
+                                  const int y,
+                                  const int chan) {
+          const bool point_is_in_range =
+              (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
+          return point_is_in_range
+                 ? data[batch_id * data_batch_stride +
+                        data_channels * (y * data_width + x) +
+                        chan]
+                 : zero;
+        };
+
+        for (int sample_id = 0; sample_id < num_sampling_points; ++sample_id) {
+          const T x = warp[batch_id * warp_batch_stride + sample_id * 2];
+          const T y = warp[batch_id * warp_batch_stride + sample_id * 2 + 1];
+          // The interpolation function:
+          // a) implicitly pads the input data with 0s (hence the unusual checks
+          // with {x,y} > -1)
+          // b) returns 0 when sampling outside the (padded) image.
+          // The effect is that the sampled signal smoothly goes to 0 outside
+          // the original input domain, rather than presenting a jump
+          // discontinuity at the image boundaries.
+          if (x > static_cast<T>(-1.0) &&
+              y > static_cast<T>(-1.0) &&
+              x < static_cast<T>(data_width) &&
+              y < static_cast<T>(data_height)) {
+            // Precompute floor (f) and ceil (c) values for x and y.
+            const int fx = std::floor(static_cast<float>(x));
+            const int fy = std::floor(static_cast<float>(y));
+            const int cx = fx + 1;
+            const int cy = fy + 1;
+            const T dx = static_cast<T>(cx) - x;
+            const T dy = static_cast<T>(cy) - y;
+
+            for (int chan = 0; chan < data_channels; ++chan) {
+              const T img_fxfy = dx * dy * get_data_point(fx, fy, chan);
+              const T img_cxcy = (one - dx) * (one - dy) *
+                                   get_data_point(cx, cy, chan);
+              const T img_fxcy = dx * (one - dy) *
+                                   get_data_point(fx, cy, chan);
+              const T img_cxfy = (one - dx) * dy *
+                                   get_data_point(cx, fy, chan);
+              set_output(sample_id, chan,
+                         img_fxfy + img_cxcy + img_fxcy + img_cxfy);
+            }
+          } else {
+            for (int chan = 0; chan < data_channels; ++chan) {
+              set_output(sample_id, chan, zero);
+            }
+          }
+        }
+      }
+    };
+    // Rough estimate of work for each batch entry.
+    // From third_party/tensorflow/core/util/work_sharder.cc we gather that an
+    // estimate of the cost of each work unit is needed to correclty shard the
+    // workload. Shard assumes each cost unit is 1ns, minimum cost per shard
+    // being 10us.
+    const int64 cost =  static_cast<int64>(num_sampling_points) *
+        data_channels * 1000;
+    auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
+    ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers,
+                        batch_size, cost, resample_batches);
+  }
+};
+
+}  // namespace functor
+
+template <typename Device, typename T>
+class ResamplerOp : public ::tensorflow::OpKernel {
+ public:
+  explicit ResamplerOp(::tensorflow::OpKernelConstruction* context) :
+      ::tensorflow::OpKernel(context) {}
+
+  void Compute(::tensorflow::OpKernelContext* ctx) override {
+    const ::tensorflow::Tensor& data = ctx->input(0);
+    const ::tensorflow::Tensor& warp = ctx->input(1);
+
+    const ::tensorflow::TensorShape& data_shape = data.shape();
+    OP_REQUIRES(ctx, data_shape.dims() == 4,
+                ::tensorflow::errors::Unimplemented(
+                    "Only bilinear interpolation is currently supported. The "
+                    "input data shape must be [batch_size, data_height, "
+                    "data_width, data_channels], but is: ",
+                    data_shape.DebugString()));
+    const ::tensorflow::TensorShape& warp_shape = warp.shape();
+    OP_REQUIRES(ctx,
+                ::tensorflow::TensorShapeUtils::IsMatrixOrHigher(warp_shape),
+                ::tensorflow::errors::InvalidArgument(
+                    "warp should be at least a matrix, got shape ",
+                    warp_shape.DebugString()));
+    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims()-1) == 2,
+                ::tensorflow::errors::Unimplemented(
+                    "Only bilinear interpolation is supported, warping "
+                    "coordinates must be 2D; warp shape last entry should be "
+                    "2, but shape vector is: ", warp_shape.DebugString()));
+    OP_REQUIRES(ctx, data_shape.dim_size(0) == warp_shape.dim_size(0),
+                ::tensorflow::errors::InvalidArgument(
+                    "Batch size of data and warp tensor must be the same, but "
+                    "input shapes are: ", data_shape.DebugString(), ", ",
+                    warp_shape.DebugString()));
+    const int batch_size = data_shape.dim_size(0);
+    const int data_height = data_shape.dim_size(1);
+    const int data_width = data_shape.dim_size(2);
+    const int data_channels = data_shape.dim_size(3);
+    ::tensorflow::TensorShape output_shape = warp.shape();
+    output_shape.set_dim(output_shape.dims() - 1, data_channels);
+    const int num_sampling_points = warp.NumElements() / batch_size / 2;
+    ::tensorflow::Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output));
+
+    // Execute kernel only for nonempty output; otherwise Eigen crashes on GPU.
+    if (num_sampling_points > 0) {
+      functor::Resampler2DFunctor<Device, T>()(ctx,
+                                               ctx->eigen_device<Device>(),
+                                               data.flat<T>().data(),
+                                               warp.flat<T>().data(),
+                                               output->flat<T>().data(),
+                                               batch_size,
+                                               data_height,
+                                               data_width,
+                                               data_channels,
+                                               num_sampling_points);
+    }
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(ResamplerOp);
+};
+
+
+#define REGISTER(TYPE)                       \
+  REGISTER_KERNEL_BUILDER(                   \
+      Name("Resampler")                      \
+          .Device(DEVICE_CPU)  \
+          .TypeConstraint<TYPE>("T"),        \
+      ResamplerOp<CPUDevice, TYPE>);
+
+TF_CALL_half(REGISTER);
+TF_CALL_float(REGISTER);
+TF_CALL_double(REGISTER);
+#undef REGISTER
+
+#if GOOGLE_CUDA
+#define REGISTER(TYPE)                                           \
+  REGISTER_KERNEL_BUILDER(Name("Resampler")                      \
+                              .Device(DEVICE_GPU)  \
+                              .TypeConstraint<TYPE>("T"),        \
+                          ResamplerOp<GPUDevice, TYPE>)
+TF_CALL_float(REGISTER);
+TF_CALL_double(REGISTER);
+#undef REGISTER
+#endif  // GOOGLE_CUDA
+
+
+namespace functor {
+
+template <typename T>
+struct ResamplerGrad2DFunctor<CPUDevice, T>{
+  void operator ()(::tensorflow::OpKernelContext* ctx,
+                   const CPUDevice& d,
+                   const T* __restrict__ data,
+                   const T* __restrict__ warp,
+                   const T* __restrict__ grad_output,
+                   T* __restrict__ grad_data,
+                   T* __restrict__ grad_warp,
+                   const int batch_size,
+                   const int data_height,
+                   const int data_width,
+                   const int data_channels,
+                   const int num_sampling_points){
+    // Set gradients to 0, because the kernel incrementally updates the
+    // tensor entries by adding partial contributions.
+    const int resampler_output_size = batch_size * num_sampling_points *
+        data_channels;
+    const int grad_warp_size = resampler_output_size / data_channels * 2;
+    const int grad_data_size = data_height * data_width * data_channels *
+        batch_size;
+    memset(grad_data, 0, sizeof(T) * grad_data_size);
+    memset(grad_warp, 0, sizeof(T) * grad_warp_size);
+
+    const auto&& data_batch_stride = data_height * data_width * data_channels;
+    const auto&& warp_batch_stride = num_sampling_points * 2;
+    const int output_batch_stride = num_sampling_points * data_channels;
+    const T zero = static_cast<T>(0.0);
+    const T one = static_cast<T>(1.0);
+
+    auto update_grads_for_batches = [&](const int start, const int limit) {
+      for (int batch_id = start; batch_id < limit; ++batch_id) {
+        // Utility lambdas to access data and update gradient tensors.
+        // The functions take care of performing the relevant pointer
+        // arithmetics abstracting away the low level details in the
+        // main loop over samples. Note that data is stored in NHWC format.
+        auto get_data_point = [&](const int x,
+                                  const int y,
+                                  const int chan) {
+          const bool point_is_in_range =
+            (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
+          return point_is_in_range
+                 ? data[batch_id * data_batch_stride +
+                        data_channels * (y * data_width + x) +
+                        chan]
+                 : zero;
+        };
+
+        auto update_grad_data = [&](const int x, const int y, const int chan,
+                                    const T value) {
+          const bool point_is_in_range =
+              (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
+          if (point_is_in_range){
+            grad_data[batch_id * data_batch_stride +
+                      data_channels * (y * data_width + x) +
+                      chan] += value;
+          }
+        };
+
+        auto update_grad_warp = [&](const int sample_id,
+                                    const int channel,
+                                    const T value) {
+          grad_warp[batch_id * warp_batch_stride +
+                    sample_id * 2 +
+                    channel] += value;
+        };
+
+        for (int sample_id = 0; sample_id < num_sampling_points; ++sample_id) {
+          const T x = warp[batch_id * warp_batch_stride + sample_id * 2];
+          const T y = warp[batch_id * warp_batch_stride + sample_id * 2 + 1];
+          // The interpolation function whose gradient this function implements:
+          // a) implicitly pads the input data with 0s (hence the unusual checks
+          // with {x,y} > -1)
+          // b) returns 0 when sampling outside the (padded) image.
+          // The effect is that the sampled signal smoothly goes to 0 outside
+          // the original input domain, rather than presenting a jump
+          // discontinuity at the image boundaries.
+          if (x > static_cast<T>(-1.0) &&
+              y > static_cast<T>(-1.0) &&
+              x < static_cast<T>(data_width) &&
+              y < static_cast<T>(data_height)) {
+            // Precompute floor (f) and ceil (c) values for x and y.
+            const int fx = std::floor(static_cast<float>(x));
+            const int fy = std::floor(static_cast<float>(y));
+            const int cx = fx + 1;
+            const int cy = fy + 1;
+            const T dx = static_cast<T>(cx) - x;
+            const T dy = static_cast<T>(cy) - y;
+
+            for (int chan = 0; chan < data_channels; ++chan) {
+              const T grad_output_value =
+                  grad_output[batch_id * output_batch_stride +
+                              sample_id * data_channels +
+                              chan];
+              const T img_fxfy = get_data_point(fx, fy, chan);
+              const T img_cxcy = get_data_point(cx, cy, chan);
+              const T img_fxcy = get_data_point(fx, cy, chan);
+              const T img_cxfy = get_data_point(cx, fy, chan);
+
+              // Update partial gradients wrt relevant warp field entries
+              update_grad_warp(sample_id, 0,
+                               grad_output_value *
+                                   ((one - dy) * (img_cxcy - img_fxcy) +
+                                    dy * (img_cxfy - img_fxfy)));
+
+              update_grad_warp(sample_id, 1,
+                               grad_output_value *
+                                   ((one - dx) * (img_cxcy - img_cxfy) +
+                                    dx * (img_fxcy - img_fxfy)));
+
+              // Update partial gradients wrt sampled data
+              update_grad_data(fx, fy, chan,
+                               grad_output_value * dx * dy);
+              update_grad_data(cx, cy, chan,
+                               grad_output_value * (one - dx) * (one - dy));
+              update_grad_data(fx, cy, chan,
+                               grad_output_value * dx * (one - dy));
+              update_grad_data(cx, fy, chan,
+                               grad_output_value * (one - dx) * dy);
+            }
+          }
+        }
+      }
+    };
+    // Rough estimate of work for each batch entry.
+    // From third_party/tensorflow/core/util/work_sharder.cc we gather that an
+    // estimate of the cost of each work unit is needed to correctly shard the
+    // workload. Shard assumes each cost unit is 1ns, minimum cost per shard
+    // being 10us.
+    // TODO(fviola): Check out if there is a better way of doing this.
+    auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
+    const int64 cost =  static_cast<int64>(num_sampling_points) *
+        data_channels * 1000;
+    ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers,
+                        batch_size, cost, update_grads_for_batches);
+  }
+};
+
+}  // namespace functor
+
+
+template <typename Device, typename T>
+class ResamplerGradOp : public ::tensorflow::OpKernel {
+ public:
+  explicit ResamplerGradOp(::tensorflow::OpKernelConstruction* context) :
+      ::tensorflow::OpKernel(context) {}
+
+  void Compute(::tensorflow::OpKernelContext* ctx) override {
+    const ::tensorflow::Tensor& data = ctx->input(0);
+    const ::tensorflow::Tensor& warp = ctx->input(1);
+    const ::tensorflow::Tensor& grad_output = ctx->input(2);
+
+    const ::tensorflow::TensorShape& data_shape = data.shape();
+    OP_REQUIRES(ctx, data_shape.dims() == 4,
+                ::tensorflow::errors::Unimplemented(
+                    "Only bilinear interpolation is supported, the input data "
+                    "tensor must be a batch of 2d data; data shape should have "
+                    "4 entries corresponding to [batch_size, data_height, "
+                    "data_width, data_channels], but is: ",
+                data_shape.DebugString()));
+    const int batch_size = data_shape.dim_size(0);
+    const int data_height = data_shape.dim_size(1);
+    const int data_width = data_shape.dim_size(2);
+    const int data_channels = data_shape.dim_size(3);
+    const ::tensorflow::TensorShape& warp_shape = warp.shape();
+    OP_REQUIRES(ctx,
+                ::tensorflow::TensorShapeUtils::IsMatrixOrHigher(warp_shape),
+                ::tensorflow::errors::InvalidArgument(
+                    "warp should be at least a matrix, got shape ",
+                    warp_shape.DebugString()));
+    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims()-1) == 2,
+                ::tensorflow::errors::Unimplemented(
+                    "Only bilinear interpolation is supported, warping "
+                    "coordinates must be 2D; warp shape last entry should be "
+                    "2, but shape vector is: ",
+                    warp_shape.DebugString()));
+    const ::tensorflow::TensorShape& grad_output_shape = grad_output.shape();
+    ::tensorflow::TensorShape resampler_output_shape = warp.shape();
+    resampler_output_shape.set_dim(resampler_output_shape.dims() - 1,
+                                   data_channels);
+    OP_REQUIRES(ctx, grad_output_shape == resampler_output_shape,
+                ::tensorflow::errors::InvalidArgument(
+                   "grad_output shape is not consistent with data and warp "
+                   "shapes; it should be ",
+                   resampler_output_shape.DebugString(), " but is ",
+                   grad_output_shape.DebugString()))
+    const int num_sampling_points = warp.NumElements() / batch_size / 2;
+    ::tensorflow::Tensor* grad_data = nullptr;
+    ::tensorflow::Tensor* grad_warp = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, data.shape(), &grad_data));
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(1, warp.shape(), &grad_warp));
+    // Execute kernel only for nonempty output; otherwise Eigen crashes on GPU.
+    if (num_sampling_points > 0) {
+      functor::ResamplerGrad2DFunctor<Device, T>()(ctx,
+                                                   ctx->eigen_device<Device>(),
+                                                   data.flat<T>().data(),
+                                                   warp.flat<T>().data(),
+                                                   grad_output.flat<T>().data(),
+                                                   grad_data->flat<T>().data(),
+                                                   grad_warp->flat<T>().data(),
+                                                   batch_size,
+                                                   data_height,
+                                                   data_width,
+                                                   data_channels,
+                                                   num_sampling_points);
+    }
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(ResamplerGradOp);
+};
+
+#define REGISTER(TYPE)                       \
+  REGISTER_KERNEL_BUILDER(                   \
+      Name("ResamplerGrad")                  \
+          .Device(DEVICE_CPU)  \
+          .TypeConstraint<TYPE>("T"),        \
+      ResamplerGradOp<CPUDevice, TYPE>);
+
+TF_CALL_half(REGISTER);
+TF_CALL_float(REGISTER);
+TF_CALL_double(REGISTER);
+#undef REGISTER
+
+#if GOOGLE_CUDA
+#define REGISTER(TYPE)                                           \
+  REGISTER_KERNEL_BUILDER(Name("ResamplerGrad")                  \
+                              .Device(DEVICE_GPU)  \
+                              .TypeConstraint<TYPE>("T"),        \
+                          ResamplerGradOp<GPUDevice, TYPE>)
+// Disable half and double precision since atomicAdds are not supported
+// TF_CALL_half(REGISTER);
+// TF_CALL_double(REGISTER);
+TF_CALL_float(REGISTER);
+
+#undef REGISTER
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.h b/tensorflow/contrib/resampler/kernels/resampler_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..8258ecaf5d3ba67094194c5cb12ca6d4d6efc85f
--- /dev/null
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops.h
@@ -0,0 +1,68 @@
+// Copyright 2017 The Sonnet Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or  implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_RESAMPLER_KERNELS_RESAMPLER_OPS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_RESAMPLER_KERNELS_RESAMPLER_OPS_H_
+
+#if PLATFORM_WINDOWS
+#define __restrict__ __restrict
+#endif
+
+namespace tensorflow {
+class OpKernelContext;
+}
+
+namespace tensorflow {
+namespace functor {
+
+// Helper functor for the Resampler Op in 2D
+template <typename Device, typename T>
+struct Resampler2DFunctor{
+  void operator ()(::tensorflow::OpKernelContext* ctx,
+                   const Device& d,
+                   const T* __restrict__ data,
+                   const T* __restrict__ warp,
+                   T* __restrict__ output,
+                   const int batch_size,
+                   const int data_height,
+                   const int data_width,
+                   const int data_channels,
+                   const int num_sampling_points);
+};
+
+
+// Helper functor for the Resampler Gradient Op in 2D
+template <typename Device, typename T>
+struct ResamplerGrad2DFunctor{
+  void operator ()(::tensorflow::OpKernelContext* ctx,
+                   const Device& d,
+                   const T* __restrict__ data,
+                   const T* __restrict__ warp,
+                   const T* __restrict__ grad_output,
+                   T* __restrict__ grad_data,
+                   T* __restrict__ grad_warp,
+                   const int batch_size,
+                   const int data_height,
+                   const int data_width,
+                   const int data_channels,
+                   const int num_sampling_points);
+};
+
+
+}  // namespace functor
+}  // namespace tensorflow
+
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_RESAMPLER_KERNELS_RESAMPLER_OPS_H_
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..636847a212f27c738032128e3f3f653ec32f851b
--- /dev/null
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
@@ -0,0 +1,310 @@
+// Copyright 2016 The Sonnet Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or  implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/contrib/resampler/kernels/resampler_ops.h"
+
+#include <stdio.h>
+#include <cmath>
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+using GPUDevice = Eigen::GpuDevice;
+
+namespace {
+
+#define GET_DATA_POINT(x, y)                   \
+  data[batch_id * data_batch_stride +          \
+       data_channels * (y * data_width + x) +  \
+       chan]
+
+template <typename T>
+__global__ void Resampler2DKernel(const T* __restrict__ data,
+                                  const T* __restrict__ warp,
+                                  T* __restrict__ output,
+                                  const int batch_size,
+                                  const int data_height,
+                                  const int data_width,
+                                  const int data_channels,
+                                  const int num_sampling_points) {
+  const int output_data_size = batch_size * num_sampling_points * data_channels;
+  CUDA_1D_KERNEL_LOOP(index, output_data_size) {
+    const int out_index = index;
+
+    // Get (idxSample, channel, point) from the index.
+    // Use this formula
+    //   index = batch_id * num_sampling_points * num_chans +
+    //           sample_id * num_chans + chan_id,
+    // with sample_id = [0, ... ,num_sampling_points)
+    const int data_batch_stride = data_height * data_width * data_channels;
+    const int warp_batch_stride = num_sampling_points * 2;
+    const int output_batch_stride = num_sampling_points * data_channels;
+
+    const int batch_id = index / output_batch_stride;
+    const int index_in_batch = index % output_batch_stride;
+    const int chan = index_in_batch % data_channels;
+    const int sample_id = index_in_batch / data_channels;
+
+    // Get coords of 2D point where data will be resampled
+    const T x = warp[batch_id * warp_batch_stride + sample_id * 2];
+    const T y = warp[batch_id * warp_batch_stride + sample_id * 2 + 1];
+    const T zero = static_cast<T>(0.0);
+    const T one = static_cast<T>(1.0);
+    // The interpolation function:
+    // a) implicitly pads the input data with 0s (hence the unusual checks
+    // with {x,y} > -1)
+    // b) returns 0 when sampling outside the (padded) image.
+    // The effect is that the sampled signal smoothly goes to 0 outside
+    // the original input domain, rather than presenting a jump
+    // discontinuity at the image boundaries.
+    if (x > static_cast<T>(-1.0) &&
+        y > static_cast<T>(-1.0) &&
+        x < static_cast<T>(data_width) &&
+        y < static_cast<T>(data_height)) {
+      // Precompute floor (f) and ceil (c) values for x and y.
+      const int fx = std::floor(static_cast<float>(x));
+      const int fy = std::floor(static_cast<float>(y));
+      const int cx = fx + 1;
+      const int cy = fy + 1;
+      const T dx = static_cast<T>(cx) - x;
+      const T dy = static_cast<T>(cy) - y;
+
+      const T img_fxfy = (fx >= 0 && fy >= 0)
+                         ? dx * dy * GET_DATA_POINT(fx, fy)
+                         : zero;
+
+      const T img_cxcy = (cx <= data_width - 1 && cy <= data_height - 1)
+                         ? (one - dx) * (one - dy) * GET_DATA_POINT(cx, cy)
+                         : zero;
+
+      const T img_fxcy = (fx >= 0 && cy <= data_height - 1)
+                         ? dx * (one - dy) * GET_DATA_POINT(fx, cy)
+                         : zero;
+
+      const T img_cxfy = (cx <= data_width - 1 && fy >= 0)
+                         ? (one - dx) * dy * GET_DATA_POINT(cx, fy)
+                         : zero;
+
+      output[out_index] = img_fxfy + img_cxcy + img_fxcy + img_cxfy;
+    } else {
+      output[out_index] = zero;
+    }
+  }
+}
+
+}  // namespace
+
+namespace functor {
+
+template <typename T>
+struct Resampler2DFunctor<GPUDevice, T>{
+  void operator ()(::tensorflow::OpKernelContext* ctx,
+                   const GPUDevice& d,
+                   const T* __restrict__ data,
+                   const T* __restrict__ warp,
+                   T* __restrict__ output,
+                   const int batch_size,
+                   const int data_height,
+                   const int data_width,
+                   const int data_channels,
+                   const int num_sampling_points) {
+  const int output_data_size = batch_size * num_sampling_points * data_channels;
+  ::tensorflow::CudaLaunchConfig config =
+      ::tensorflow::GetCudaLaunchConfig(output_data_size, d);
+  Resampler2DKernel<T>
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          data, warp, output, batch_size, data_height, data_width,
+          data_channels, num_sampling_points);
+  }
+};
+
+// TODO(fviola): gcudacc fails at compile time with Eigen::half.
+// template struct Resampler2DFunctor<GPUDevice, Eigen::half>;
+template struct Resampler2DFunctor<GPUDevice, float>;
+template struct Resampler2DFunctor<GPUDevice, double>;
+
+}  // namespace functor
+
+namespace {
+
+#define UPDATE_GRAD_DATA_POINT(x, y, v)                  \
+  atomicAdd(grad_data + (batch_id * data_batch_stride +  \
+            data_channels * (y * data_width + x) +       \
+            chan),                                       \
+            v)
+
+
+template <typename T>
+__global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
+                                      const T* __restrict__ warp,
+                                      const T* __restrict__ grad_output,
+                                      T* __restrict__ grad_data,
+                                      T* __restrict__ grad_warp,
+                                      const int batch_size,
+                                      const int data_height,
+                                      const int data_width,
+                                      const int data_channels,
+                                      const int num_sampling_points) {
+  const int resampler_output_size = batch_size * num_sampling_points *
+      data_channels;
+  CUDA_1D_KERNEL_LOOP(index, resampler_output_size) {
+    const int out_index = index;
+
+    // Get (idxSample, channel, point) from the index.
+    // Use this formula
+    //   index = batch_id * num_sampling_points * num_chans +
+    //           sample_id * num_chans + chan_id,
+    // with sample_id = [0, ... ,num_sampling_points)
+    const int data_batch_stride = data_height * data_width * data_channels;
+    const int warp_batch_stride = num_sampling_points * 2;
+    const int output_batch_stride = num_sampling_points * data_channels;
+
+    const int batch_id = index / output_batch_stride;
+    const int index_in_batch = index % output_batch_stride;
+    const int chan = index_in_batch % data_channels;
+    const int sample_id = index_in_batch / data_channels;
+
+    // Get coords of 2D point where data will be resampled
+    const int warp_id_x = batch_id * warp_batch_stride + sample_id * 2;
+    const int warp_id_y = warp_id_x + 1;
+    const T x = warp[warp_id_x];
+    const T y = warp[warp_id_y];
+    const T zero = static_cast<T>(0.0);
+    const T one = static_cast<T>(1.0);
+
+    // Get grad output
+    const T grad_output_value = grad_output[out_index];
+    // The interpolation function whose gradient this kernel implements:
+    // a) implicitly pads the input data with 0s (hence the unusual checks
+    // with {x,y} > -1)
+    // b) returns 0 when sampling outside the (padded) image.
+    // The effect is that the sampled signal smoothly goes to 0 outside
+    // the original input domain, rather than presenting a jump
+    // discontinuity at the image boundaries.
+    if (x > static_cast<T>(-1.0) &&
+        y > static_cast<T>(-1.0) &&
+        x < static_cast<T>(data_width) &&
+        y < static_cast<T>(data_height)) {
+      // Precompute floor (f) and ceil (c) values for x and y.
+      const int fx = std::floor(static_cast<float>(x));
+      const int fy = std::floor(static_cast<float>(y));
+      const int cx = fx + 1;
+      const int cy = fy + 1;
+      const T dx = static_cast<T>(cx) - x;
+      const T dy = static_cast<T>(cy) - y;
+
+      const T img_fxfy = (fx >= 0 && fy >= 0)
+                         ? GET_DATA_POINT(fx, fy)
+                         : zero;
+
+      const T img_cxcy = (cx <= data_width - 1 && cy <= data_height - 1)
+                         ? GET_DATA_POINT(cx, cy)
+                         : zero;
+
+      const T img_fxcy = (fx >= 0 && cy <= data_height - 1)
+                         ? GET_DATA_POINT(fx, cy)
+                         : zero;
+
+      const T img_cxfy = (cx <= data_width - 1 && fy >= 0)
+                         ? GET_DATA_POINT(cx, fy)
+                         : zero;
+
+      // Update partial gradients wrt relevant warp field entries
+      atomicAdd(grad_warp + warp_id_x,
+                grad_output_value * ((one - dy) * (img_cxcy - img_fxcy) +
+                                     dy * (img_cxfy - img_fxfy)));
+      atomicAdd(grad_warp + warp_id_y,
+                grad_output_value * ((one - dx) * (img_cxcy - img_cxfy) +
+                                     dx * (img_fxcy - img_fxfy)));
+
+      // Update partial gradients wrt sampled data
+      if (fx >= 0 && fy >= 0) {
+        UPDATE_GRAD_DATA_POINT(fx, fy, grad_output_value * dx * dy);
+      }
+      if (cx <= data_width - 1 && cy <= data_height - 1) {
+        UPDATE_GRAD_DATA_POINT(cx, cy,
+                               grad_output_value  * (one - dx) * (one - dy));
+      }
+      if (fx >= 0 && cy <= data_height - 1) {
+        UPDATE_GRAD_DATA_POINT(fx, cy, grad_output_value * dx * (one - dy));
+      }
+      if (cx <= data_width - 1 && fy >= 0) {
+        UPDATE_GRAD_DATA_POINT(cx, fy, grad_output_value * (one - dx) * dy);
+      }
+    }
+  }
+}
+
+#undef GET_DATA_POINT
+#undef UPDATE_GRAD_DATA_POINT
+
+}  // namespace
+
+namespace functor {
+
+template <typename T>
+struct ResamplerGrad2DFunctor<GPUDevice, T>{
+  void operator ()(::tensorflow::OpKernelContext* ctx,
+                   const GPUDevice& d,
+                   const T* __restrict__ data,
+                   const T* __restrict__ warp,
+                   const T* __restrict__ grad_output,
+                   T* __restrict__ grad_data,
+                   T* __restrict__ grad_warp,
+                   const int batch_size,
+                   const int data_height,
+                   const int data_width,
+                   const int data_channels,
+                   const int num_sampling_points) {
+  // Set gradients to 0, because the kernel incrementally updates the
+  // tensor entries by adding partial contributions.
+  const int grad_warp_size = batch_size * num_sampling_points * 2;
+  const int grad_data_size = batch_size * data_height * data_width *
+      data_channels;
+
+  ::tensorflow::CudaLaunchConfig config =
+     ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d);
+  ::tensorflow::SetZero
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          grad_warp_size, grad_warp);
+
+  config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d);
+  ::tensorflow::SetZero
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          grad_data_size, grad_data);
+
+  const int resampler_output_size = batch_size * num_sampling_points *
+      data_channels;
+  config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d);
+  ResamplerGrad2DKernel<T>
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          data, warp, grad_output, grad_data, grad_warp, batch_size,
+          data_height, data_width, data_channels, num_sampling_points);
+  }
+};
+
+template struct ResamplerGrad2DFunctor<GPUDevice, float>;
+
+}  // namespace functor
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/resampler/ops/resampler_ops.cc b/tensorflow/contrib/resampler/ops/resampler_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5ab212032e50ace9545762bebda5679f68fbf77c
--- /dev/null
+++ b/tensorflow/contrib/resampler/ops/resampler_ops.cc
@@ -0,0 +1,59 @@
+// Copyright 2017 The Sonnet Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or  implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using ::tensorflow::shape_inference::InferenceContext;
+using ::tensorflow::shape_inference::ShapeHandle;
+
+REGISTER_OP("Resampler")
+    .Input("data: T")
+    .Input("warp: T")
+    .Output("output: T")
+    .Attr("T: {half, float, double}")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle data;
+      ShapeHandle warp;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &data));
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &warp));
+
+      ShapeHandle output;  // will be warp[:-1] + [data[-1]]
+      TF_RETURN_IF_ERROR(c->Subshape(warp, 0, -1, &output));
+      TF_RETURN_IF_ERROR(
+          c->Concatenate(output, c->Vector(c->Dim(data, -1)), &output));
+
+      c->set_output(0, output);
+      return ::tensorflow::Status::OK();
+    })
+    .Doc(R"doc(Resampler op.)doc");
+
+REGISTER_OP("ResamplerGrad")
+    .Input("data: T")
+    .Input("warp: T")
+    .Input("grad_output: T")
+    .Output("grad_data: T")
+    .Output("grad_warp: T")
+    .Attr("T: {half, float, double}")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      c->set_output(1, c->input(1));
+      return ::tensorflow::Status::OK();
+    })
+    .Doc(R"doc(Resampler Grad op.)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/tensorboard/__main__.py b/tensorflow/contrib/resampler/python/__init__.py
similarity index 87%
rename from tensorflow/tensorboard/__main__.py
rename to tensorflow/contrib/resampler/python/__init__.py
index f172583d7c564ab8bfca3a89c373fa0f6b4c24f4..c5ca3a623fb15c44d04f2222708353d2934490e4 100644
--- a/tensorflow/tensorboard/__main__.py
+++ b/tensorflow/contrib/resampler/python/__init__.py
@@ -12,14 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""ops module."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
-import sys
-
-from tensorflow.tensorboard.tensorboard import main
-
-if __name__ == '__main__':
-  sys.exit(main())
diff --git a/tensorflow/contrib/resampler/python/ops/resampler_ops.py b/tensorflow/contrib/resampler/python/ops/resampler_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..355d15f0c7348aad5908cc72b0dad3308927d161
--- /dev/null
+++ b/tensorflow/contrib/resampler/python/ops/resampler_ops.py
@@ -0,0 +1,69 @@
+# pylint: disable=g-bad-file-header
+# Copyright 2017 The Sonnet Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or  implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Tensorflow op performing differentiable resampling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.resampler.ops import gen_resampler_ops
+from tensorflow.contrib.util import loader
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import resource_loader
+
+_resampler_so = loader.load_op_library(
+    resource_loader.get_path_to_datafile("_resampler_ops.so"))
+
+
+def resampler(data, warp, name="resampler"):
+  """Resamples input data at user defined coordinates.
+
+  The resampler currently only supports bilinear interpolation of 2D data.
+
+  Args:
+    data: Tensor of shape `[batch_size, data_height, data_width,
+      data_num_channels]` containing 2D data that will be resampled.
+    warp: Tensor of minimum rank 2 containing the coordinates at which
+      resampling will be performed. Since only bilinear interpolation is
+      currently supported, the last dimension of the `warp` tensor must be 2.
+    name: Optional name of the op.
+
+  Returns:
+    Tensor of resampled values from `data`. The output tensor shape is
+    determined by the shape of the warp tensor. For example, if `data` is of
+    shape `[batch_size, data_height, data_width, data_num_channels]` and warp of
+    shape `[batch_size, dim_0, ... , dim_n, 2]` the output will be of shape
+    `[batch_size, dim_0, ... , dim_n, data_num_channels]`.
+
+  Raises:
+    ImportError: if the wrapper generated during compilation is not present when
+    the function is called.
+  """
+  with ops.name_scope(name, "resampler", [data, warp]):
+    data_tensor = ops.convert_to_tensor(data, name="data")
+    warp_tensor = ops.convert_to_tensor(warp, name="warp")
+    return gen_resampler_ops.resampler(data_tensor, warp_tensor)
+
+
+@ops.RegisterGradient("Resampler")
+def _resampler_grad(op, grad_output):
+  data, warp = op.inputs
+  grad_output_tensor = ops.convert_to_tensor(grad_output, name="grad_output")
+  return gen_resampler_ops.resampler_grad(data, warp, grad_output_tensor)
+
+
+ops.NotDifferentiable("ResamplerGrad")
diff --git a/tensorflow/contrib/resampler/python/ops/resampler_ops_test.py b/tensorflow/contrib/resampler/python/ops/resampler_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a4360150ca5aafad516ee1b9d77f3c4601a1689
--- /dev/null
+++ b/tensorflow/contrib/resampler/python/ops/resampler_ops_test.py
@@ -0,0 +1,270 @@
+# pylint: disable=g-bad-file-header
+# Copyright 2017 The Sonnet Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or  implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Tests for contrib.resampler.python.ops.resampler_ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib import resampler
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+def _bilinearly_interpolate(data, x, y):
+  """Performs bilinenar interpolation of grid data at user defined coordinates.
+
+  This interpolation function:
+    a) implicitly pads the input data with 0s.
+    b) returns 0 when sampling outside the (padded) image.
+  The effect is that the sampled signal smoothly goes to 0 outside the original
+  input domain, rather than producing a jump discontinuity at the image
+  boundaries.
+
+  Args:
+    data: numpy array of shape `[data_height, data_width]` containing data
+      samples assumed to be defined at the corresponding pixel coordinates.
+    x: numpy array of shape `[warp_height, warp_width]` containing x coordinates
+      at which interpolation will be performed.
+    y: numpy array of shape `[warp_height, warp_width]` containing y coordinates
+      at which interpolation will be performed.
+
+  Returns:
+    Numpy array of shape `[warp_height, warp_width]` containing interpolated
+      values.
+  """
+  shape = x.shape
+  x = np.asarray(x) + 1
+  y = np.asarray(y) + 1
+  data = np.lib.pad(data, 1, "constant", constant_values=0)
+
+  x_0 = np.floor(x).astype(int)
+  x_1 = x_0 + 1
+  y_0 = np.floor(y).astype(int)
+  y_1 = y_0 + 1
+
+  x_0 = np.clip(x_0, 0, data.shape[1] - 1)
+  x_1 = np.clip(x_1, 0, data.shape[1] - 1)
+  y_0 = np.clip(y_0, 0, data.shape[0] - 1)
+  y_1 = np.clip(y_1, 0, data.shape[0] - 1)
+
+  i_a = data[y_0, x_0]
+  i_b = data[y_1, x_0]
+  i_c = data[y_0, x_1]
+  i_d = data[y_1, x_1]
+
+  w_a = (x_1 - x) * (y_1 - y)
+  w_b = (x_1 - x) * (y - y_0)
+  w_c = (x - x_0) * (y_1 - y)
+  w_d = (x - x_0) * (y - y_0)
+
+  samples = (w_a * i_a + w_b * i_b + w_c * i_c + w_d * i_d)
+  samples.reshape(shape)
+
+  return samples
+
+
+def _make_warp(batch_size, warp_height, warp_width, dtype):
+  """Creates batch of warping coordinates."""
+  x, y = np.meshgrid(np.linspace(0, warp_width - 1, warp_width),
+                     np.linspace(0, warp_height - 1, warp_height))
+  warp = np.concatenate((x.reshape([warp_height, warp_width, 1]),
+                         y.reshape([warp_height, warp_width, 1])), 2)
+  warp = np.tile(warp.reshape([1, warp_height, warp_width, 2]),
+                 [batch_size, 1, 1, 1])
+  warp += np.random.randn(*warp.shape)
+  return warp.astype(dtype)
+
+
+class ResamplerTest(test.TestCase):
+
+  def test_op_forward_pass_gpu_float32(self):
+    self._test_op_forward_pass(True, dtypes.float32, 1e-4)
+
+  def test_op_forward_pass_gpu_float64(self):
+    self._test_op_forward_pass(True, dtypes.float64, 1e-5)
+
+  def test_op_forward_pass_cpu_float16(self):
+    self._test_op_forward_pass(False, dtypes.float16, 1e-2)
+
+  def test_op_forward_pass_cpu_float32(self):
+    self._test_op_forward_pass(False, dtypes.float32, 1e-4)
+
+  def test_op_forward_pass_cpu_float64(self):
+    self._test_op_forward_pass(False, dtypes.float64, 1e-5)
+
+  def test_op_backward_pass_gpu_float32(self):
+    self._test_op_backward_pass(True, dtypes.float32, 1e-3)
+
+  def test_op_backward_pass_cpu_float16(self):
+    self._test_op_backward_pass(False, dtypes.float16, 1e-3)
+
+  def test_op_backward_pass_cpu_float32(self):
+    self._test_op_backward_pass(False, dtypes.float32, 1e-4)
+
+  def test_op_backward_pass_cpu_float64(self):
+    self._test_op_backward_pass(False, dtypes.float64, 1e-6)
+
+  def _test_op_forward_pass(self, on_gpu, dtype, tol):
+    np.random.seed(0)
+    data_width = 7
+    data_height = 9
+    data_channels = 5
+    warp_width = 4
+    warp_height = 8
+    batch_size = 10
+
+    warp = _make_warp(batch_size, warp_height, warp_width, dtype.as_numpy_dtype)
+    data_shape = (batch_size, data_height, data_width, data_channels)
+    data = np.random.rand(*data_shape).astype(dtype.as_numpy_dtype)
+
+    with self.test_session(use_gpu=on_gpu, force_gpu=False) as sess:
+      data_ph = array_ops.placeholder(dtype, shape=(None,) + data.shape[1:])
+      warp_ph = array_ops.placeholder(dtype, shape=(None,) + warp.shape[1:])
+      outputs = resampler.resampler(data=data_ph, warp=warp_ph)
+      self.assertEqual(outputs.get_shape().as_list(),
+                       [None, warp_height, warp_width, data_channels])
+      out = sess.run(outputs, feed_dict={data_ph: data, warp_ph: warp})
+
+    # Generate reference output via bilinear interpolation in numpy
+    reference_output = np.zeros_like(out)
+    for batch in xrange(batch_size):
+      for c in xrange(data_channels):
+        reference_output[batch, :, :, c] = _bilinearly_interpolate(
+            data[batch, :, :, c],
+            warp[batch, :, :, 0],
+            warp[batch, :, :, 1])
+
+    self.assertAllClose(out, reference_output, rtol=tol, atol=tol)
+
+  def _test_op_backward_pass(self, on_gpu, dtype, tol):
+    np.random.seed(13)
+    data_width = 5
+    data_height = 4
+    data_channels = 3
+    warp_width = 2
+    warp_height = 6
+    batch_size = 10
+
+    warp = _make_warp(batch_size, warp_height, warp_width, dtype.as_numpy_dtype)
+    data_shape = (batch_size, data_height, data_width, data_channels)
+    data = np.random.rand(*data_shape).astype(dtype.as_numpy_dtype)
+
+    with self.test_session(use_gpu=on_gpu, force_gpu=False):
+      data_tensor = constant_op.constant(data)
+      warp_tensor = constant_op.constant(warp)
+      output_tensor = resampler.resampler(data=data_tensor, warp=warp_tensor)
+
+      grads = test.compute_gradient([data_tensor, warp_tensor], [
+          data_tensor.get_shape().as_list(),
+          warp_tensor.get_shape().as_list()
+      ], output_tensor, output_tensor.get_shape().as_list(), [data, warp])
+
+      if not on_gpu:
+        # On CPU we perform numerical differentiation at the best available
+        # precision, and compare against that. This is necessary for test to
+        # pass for float16.
+        data_tensor_64 = constant_op.constant(data, dtype=dtypes.float64)
+        warp_tensor_64 = constant_op.constant(warp, dtype=dtypes.float64)
+        output_tensor_64 = resampler.resampler(data=data_tensor_64,
+                                               warp=warp_tensor_64)
+        grads_64 = test.compute_gradient([data_tensor_64, warp_tensor_64], [
+            data_tensor.get_shape().as_list(),
+            warp_tensor.get_shape().as_list()
+        ], output_tensor_64, output_tensor.get_shape().as_list(), [data, warp])
+
+        for g, g_64 in zip(grads, grads_64):
+          self.assertLess(np.fabs(g[0] - g_64[1]).max(), tol)
+
+      else:
+        for g in grads:
+          self.assertLess(np.fabs(g[0] - g[1]).max(), tol)
+
+  def test_op_errors(self):
+    data_width = 7
+    data_height = 9
+    data_depth = 3
+    data_channels = 5
+    warp_width = 4
+    warp_height = 8
+    batch_size = 10
+
+    # Input data shape is not defined over a 2D grid, i.e. its shape is not like
+    # (batch_size, data_height, data_width, data_channels).
+    with self.test_session() as sess:
+      data_shape = (batch_size, data_height, data_width, data_depth,
+                    data_channels)
+      data = np.zeros(data_shape)
+      warp_shape = (batch_size, warp_height, warp_width, 2)
+      warp = np.zeros(warp_shape)
+      outputs = resampler.resampler(constant_op.constant(data),
+                                    constant_op.constant(warp))
+
+      with self.assertRaisesRegexp(errors_impl.UnimplementedError,
+                                   "Only bilinear interpolation is currently "
+                                   "supported."):
+        sess.run(outputs)
+
+    # Warp tensor must be at least a matrix, with shape [batch_size, 2].
+    with self.test_session() as sess:
+      data_shape = (batch_size, data_height, data_width, data_channels)
+      data = np.zeros(data_shape)
+      warp_shape = (batch_size,)
+      warp = np.zeros(warp_shape)
+      outputs = resampler.resampler(constant_op.constant(data),
+                                    constant_op.constant(warp))
+
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "warp should be at least a matrix"):
+        sess.run(outputs)
+
+    # The batch size of the data and warp tensors must be the same.
+    with self.test_session() as sess:
+      data_shape = (batch_size, data_height, data_width, data_channels)
+      data = np.zeros(data_shape)
+      warp_shape = (batch_size+1, warp_height, warp_width, 2)
+      warp = np.zeros(warp_shape)
+      outputs = resampler.resampler(constant_op.constant(data),
+                                    constant_op.constant(warp))
+
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Batch size of data and warp tensor"):
+        sess.run(outputs)
+
+    # The warp tensor must contain 2D coordinates, i.e. its shape last dimension
+    # must be 2.
+    with self.test_session() as sess:
+      data_shape = (batch_size, data_height, data_width, data_channels)
+      data = np.zeros(data_shape)
+      warp_shape = (batch_size, warp_height, warp_width, 3)
+      warp = np.zeros(warp_shape)
+      outputs = resampler.resampler(constant_op.constant(data),
+                                    constant_op.constant(warp))
+
+      with self.assertRaisesRegexp(errors_impl.UnimplementedError,
+                                   "Only bilinear interpolation is supported, "
+                                   "warping"):
+        sess.run(outputs)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index 8c71977d5ac6269af5691f54e819d75aafe4dd19..784ac96149cd5af1ed200bee65d142a39fed842b 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -65,7 +65,7 @@ tf_custom_op_py_library(
 
 cuda_py_tests(
     name = "rnn_cell_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/rnn_cell_test.py"],
     additional_deps = [
         ":rnn_py",
@@ -83,7 +83,10 @@ cuda_py_tests(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
-    tags = ["notsan"],  # http://b/62135981
+    tags = [
+        "noasan",  # times out b/63678675
+        "notsan",  # http://b/62135981
+    ],
     xla_enabled = True,
 )
 
@@ -360,7 +363,10 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
@@ -374,6 +380,10 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":checkpoint_convert",
+        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
     ],
 )
diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.h b/tensorflow/contrib/rnn/kernels/lstm_ops.h
index d9ed9e3ab71ae0f172edd187177d332bdf410514..6317f32ac3b72d9fadf3c410de0f1df6539bc501 100644
--- a/tensorflow/contrib/rnn/kernels/lstm_ops.h
+++ b/tensorflow/contrib/rnn/kernels/lstm_ops.h
@@ -279,9 +279,6 @@ struct LSTMBlockCellBprop : public LSTMBlockCell {
           cs_prev_grad +
           di * wci.reshape(p_shape).broadcast(p_broadcast_shape) +
           df * wcf.reshape(p_shape).broadcast(p_broadcast_shape);
-    }
-
-    if (use_peephole) {
       wci_grad.device(d) = (di * cs_prev).sum(Eigen::array<int, 1>({0}));
       wcf_grad.device(d) = (df * cs_prev).sum(Eigen::array<int, 1>({0}));
       wco_grad.device(d) = (do_ * cs).sum(Eigen::array<int, 1>({0}));
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index 06954f51d8eadb23b2cd50ac9420914b9c5ff980..c14463bdad29af693f1d6ba9f0fdb8a766b6d6f4 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -210,7 +210,7 @@ class RNNCellTest(test.TestCase):
           sess.run([variables_lib.global_variables_initializer()])
           sess.run([g, out_m],
                    {x.name: 1 * np.ones([batch_size, input_size]),
-               m.name: 0.1 * np.ones([batch_size - 1, state_size])})
+                    m.name: 0.1 * np.ones([batch_size - 1, state_size])})
 
   def testBasicLSTMCellStateSizeError(self):
     """Tests that state_size must be num_units * 2."""
@@ -218,7 +218,7 @@ class RNNCellTest(test.TestCase):
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
         num_units = 2
-        state_size = num_units * 3 # state_size must be num_units * 2
+        state_size = num_units * 3  # state_size must be num_units * 2
         batch_size = 3
         input_size = 4
         x = array_ops.zeros([batch_size, input_size])
@@ -406,6 +406,31 @@ class RNNCellTest(test.TestCase):
         # States are left untouched
         self.assertAllClose(res[2], res[3])
 
+  def testResidualWrapperWithSlice(self):
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 5])
+        m = array_ops.zeros([1, 3])
+        base_cell = rnn_cell_impl.GRUCell(3)
+        g, m_new = base_cell(x, m)
+        variable_scope.get_variable_scope().reuse_variables()
+        def residual_with_slice_fn(inp, out):
+          inp_sliced = array_ops.slice(inp, [0, 0], [-1, 3])
+          return inp_sliced + out
+        g_res, m_new_res = rnn_cell_impl.ResidualWrapper(
+            base_cell, residual_with_slice_fn)(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
+        res_g, res_g_res, res_m_new, res_m_new_res = sess.run(
+            [g, g_res, m_new, m_new_res], {
+                x: np.array([[1., 1., 1., 1., 1.]]),
+                m: np.array([[0.1, 0.1, 0.1]])
+            })
+        # Residual connections
+        self.assertAllClose(res_g_res, res_g + [1., 1., 1.])
+        # States are left untouched
+        self.assertAllClose(res_m_new, res_m_new_res)
+
   def testDeviceWrapper(self):
     with variable_scope.variable_scope(
         "root", initializer=init_ops.constant_initializer(0.5)):
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
index 09aa30a20b32f454fd88353961537abffb544d34..701590a8febef871019044783075bb1dea7bfe77 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
@@ -74,7 +74,7 @@ class DummyMultiDimensionalLSTM(rnn_lib.RNNCell):
       without including 'Time' or 'Batch' dimensions.
     """
     if not isinstance(dims, tuple):
-      raise TypeError("The dimensions passed to DummyMultiDimensionalLSTM"
+      raise TypeError("The dimensions passed to DummyMultiDimensionalLSTM "
                       "should be a tuple of ints.")
     self._dims = dims
     self._output_size = tensor_shape.TensorShape(self._dims)
diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
index c41b5793fc96b706ecb08f778aff3349a0fca69d..48c2c5a724b2c0f8a0ad6d8f38f672258d06dc48 100644
--- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
@@ -58,7 +58,7 @@ def _lstm_block_cell(x,
 
   ```python
   xh = [x, h_prev]
-  [i, f, ci, o] = xh * w + b
+  [i, ci, f, o] = xh * w + b
   f = f + forget_bias
 
   if not use_peephole:
@@ -93,7 +93,7 @@ def _lstm_block_cell(x,
       The weight matrix for output gate peephole connection.
     forget_bias: An optional `float`. Defaults to `1`. The forget gate bias.
     cell_clip: An optional `float`. Defaults to `3`.
-      Value to clip the 'cs' value to.
+      Value to clip the 'cs' value to. Disable by setting to negative value.
     use_peephole: An optional `bool`. Defaults to `False`.
       Whether to use peephole weights.
     name: A name for the operation (optional).
@@ -341,17 +341,24 @@ class LSTMBlockCell(rnn_cell_impl.RNNCell):
   def __init__(self,
                num_units,
                forget_bias=1.0,
+               clip_cell=True,
                use_peephole=False):
     """Initialize the basic LSTM cell.
 
     Args:
       num_units: int, The number of units in the LSTM cell.
       forget_bias: float, The bias added to forget gates (see above).
+      clip_cell: boolean, whether to apply cell clipping. See
+        `_lstm_block_cell()` for details.
       use_peephole: Whether to use peephole connections or not.
+
+      When restoring from CudnnLSTM-trained checkpoints, must use
+      CudnnCompatibleLSTMBlockCell instead.
     """
     self._num_units = num_units
     self._forget_bias = forget_bias
     self._use_peephole = use_peephole
+    self._clip_cell = clip_cell
     self._names = {
         "W": "kernel",
         "b": "bias",
@@ -400,6 +407,7 @@ class LSTMBlockCell(rnn_cell_impl.RNNCell):
           wco=wco,
           wcf=wcf,
           forget_bias=self._forget_bias,
+          cell_clip=None if self._clip_cell else -1,
           use_peephole=self._use_peephole)
 
       new_state = rnn_cell_impl.LSTMStateTuple(cs, h)
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 9c5e9fec9df4505ad457e88b41a7a0e0a2d541ab..ecce1d22f0df2a1667ac20ce205996bafa2a9beb 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -79,7 +79,7 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
 
   The default non-peephole implementation is based on:
 
-    http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
+    http://www.bioinf.jku.at/publications/older/2604.pdf
 
   S. Hochreiter and J. Schmidhuber.
   "Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997.
@@ -1110,14 +1110,14 @@ class AttentionCellWrapper(rnn_cell_impl.RNNCell):
     if input_size is None:
       input_size = inputs.get_shape().as_list()[1]
     inputs = _linear([inputs, attns], input_size, True)
-    lstm_output, new_state = self._cell(inputs, state)
+    cell_output, new_state = self._cell(inputs, state)
     if self._state_is_tuple:
       new_state_cat = array_ops.concat(nest.flatten(new_state), 1)
     else:
       new_state_cat = new_state
     new_attns, new_attn_states = self._attention(new_state_cat, attn_states)
     with vs.variable_scope("attn_output_projection"):
-      output = _linear([lstm_output, new_attns], self._attn_size, True)
+      output = _linear([cell_output, new_attns], self._attn_size, True)
     new_attn_states = array_ops.concat(
         [new_attn_states, array_ops.expand_dims(output, 1)], 1)
     new_attn_states = array_ops.reshape(
diff --git a/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py b/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py
index 1cbd27a2e53f379b45634c7d3e10c8fcd8132c25..da129b68a6ef10a64088cae2bda7a839b3cc5514 100644
--- a/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py
+++ b/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py
@@ -124,6 +124,20 @@ _RNN_NAME_REPLACEMENTS = collections.OrderedDict([
      'attention_cell_wrapper/attention/kernel'),
     ('attention_cell_wrapper/attention/biases',
      'attention_cell_wrapper/attention/bias'),
+    ############################################################################
+    # contrib/legacy_seq2seq/python/ops/seq2seq.py
+    ('attention_decoder/weights',
+     'attention_decoder/kernel'),
+    ('attention_decoder/biases',
+     'attention_decoder/bias'),
+    ('attention_decoder/Attention_0/weights',
+     'attention_decoder/Attention_0/kernel'),
+    ('attention_decoder/Attention_0/biases',
+     'attention_decoder/Attention_0/bias'),
+    ('attention_decoder/AttnOutputProjection/weights',
+     'attention_decoder/AttnOutputProjection/kernel'),
+    ('attention_decoder/AttnOutputProjection/biases',
+     'attention_decoder/AttnOutputProjection/bias'),
 ])
 
 _RNN_SHARDED_NAME_REPLACEMENTS = collections.OrderedDict([
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index 6ab9631d29f6684adb568ce7361aa5fef80f502e..a82ee6ac41ed3f81bd96c61dafb2144c41b07065 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -37,6 +37,9 @@ py_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:util",
+        "//tensorflow/python/saved_model:constants",
     ],
 )
 
@@ -64,6 +67,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":saved_model_py",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
@@ -73,6 +77,21 @@ py_test(
     ],
 )
 
+py_test(
+    name = "utils_test",
+    size = "small",
+    srcs = ["python/saved_model/utils_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":saved_model_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/saved_model",
+        "//tensorflow/python/saved_model:signature_constants",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/saved_model/python/saved_model/reader_test.py b/tensorflow/contrib/saved_model/python/saved_model/reader_test.py
index 76d5a3e96d24daf5bff3e82d6af2c6a1bbe03320..a8331cbc8f04f74294675d7ceb57412e1f0b6170 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/reader_test.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/reader_test.py
@@ -81,16 +81,23 @@ class ReaderTest(test.TestCase):
 
     # Graph that updates the single variable. SavedModel is invoked:
     # - to add the model (weights are not updated).
-    # - multiple custom tags.
+    # - multiple predefined tags.
     with self.test_session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 44)
+      builder.add_meta_graph([tag_constants.SERVING, tag_constants.GPU])
+
+    # Graph that updates the single variable. SavedModel is invoked:
+    # - to add the model (weights are not updated).
+    # - multiple custom tags.
+    with self.test_session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 45)
       builder.add_meta_graph(["foo", "bar"])
 
     # Save the SavedModel to disk.
     builder.save()
 
     actual_tags = reader.get_saved_model_tag_sets(saved_model_dir)
-    expected_tags = [["train"], ["serve"], ["foo", "bar"]]
+    expected_tags = [["train"], ["serve"], ["serve", "gpu"], ["foo", "bar"]]
     self.assertEqual(expected_tags, actual_tags)
 
 
diff --git a/tensorflow/contrib/saved_model/python/saved_model/utils.py b/tensorflow/contrib/saved_model/python/saved_model/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f34af64a6253eecf45351d4e844265b922d9313
--- /dev/null
+++ b/tensorflow/contrib/saved_model/python/saved_model/utils.py
@@ -0,0 +1,81 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""SavedModel utility functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.saved_model import builder
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.saved_model import tag_constants
+
+
+def simple_save(session, export_dir, inputs, outputs, legacy_init_op=None):
+  """Convenience function to build a SavedModel suitable for serving.
+
+  In many common cases, saving models for serving will be as simple as:
+
+      simple_save(session,
+                  export_dir,
+                  inputs={"x": x, "y": y},
+                  outputs={"z": z})
+
+  Although in many cases it's not necessary to understand all of the many ways
+      to configure a SavedModel, this method has a few practical implications:
+    - It will be treated as a graph for inference / serving (i.e. uses the tag
+      `tag_constants.SERVING`)
+    - The saved model will load in TensorFlow Serving and supports the
+      [Predict API](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/predict.proto).
+      To use the Classify, Regress, or MultiInference APIs, please
+      use either
+      [tf.Estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator)
+      or the lower level
+      [SavedModel APIs](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md).
+    - Some TensorFlow ops depend on information on disk or other information
+      called "assets". These are generally handled automatically by adding the
+      assets to the `GraphKeys.ASSET_FILEPATHS` collection. Only assets in that
+      collection are exported; if you need more custom behavior, you'll need to
+      use the [SavedModelBuilder](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/builder.py).
+
+  More information about SavedModel and signatures can be found here:
+  https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md.
+
+  Args:
+    session: The TensorFlow session from which to save the meta graph and
+        variables.
+    export_dir: The path to which the SavedModel will be stored.
+    inputs: dict mapping string input names to tensors. These are added
+        to the SignatureDef as the inputs.
+    outputs:  dict mapping string output names to tensors. These are added
+        to the SignatureDef as the outputs.
+    legacy_init_op: Legacy support for op or group of ops to execute after the
+        restore op upon a load.
+  """
+  signature_def_map = {
+      signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+          signature_def_utils.predict_signature_def(inputs, outputs)
+  }
+  b = builder.SavedModelBuilder(export_dir)
+  b.add_meta_graph_and_variables(
+      session,
+      tags=[tag_constants.SERVING],
+      signature_def_map=signature_def_map,
+      assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS),
+      legacy_init_op=legacy_init_op,
+      clear_devices=True)
+  b.save()
diff --git a/tensorflow/contrib/saved_model/python/saved_model/utils_test.py b/tensorflow/contrib/saved_model/python/saved_model/utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..36dfb88871f39218ea19c2e6f40675914510e4c4
--- /dev/null
+++ b/tensorflow/contrib/saved_model/python/saved_model/utils_test.py
@@ -0,0 +1,102 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for saved_model utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.contrib.saved_model.python.saved_model import utils
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import tag_constants
+
+
+class UtilsTest(test.TestCase):
+
+  def _init_and_validate_variable(self, sess, variable_name, variable_value):
+    v = variables.Variable(variable_value, name=variable_name)
+    sess.run(variables.global_variables_initializer())
+    self.assertEqual(variable_value, v.eval())
+    return v
+
+  def _check_variable_info(self, actual_variable, expected_variable):
+    self.assertEqual(actual_variable.name, expected_variable.name)
+    self.assertEqual(actual_variable.dtype, expected_variable.dtype)
+    self.assertEqual(len(actual_variable.shape), len(expected_variable.shape))
+    for i in range(len(actual_variable.shape)):
+      self.assertEqual(actual_variable.shape[i], expected_variable.shape[i])
+
+  def _check_tensor_info(self, actual_tensor_info, expected_tensor):
+    self.assertEqual(actual_tensor_info.name, expected_tensor.name)
+    self.assertEqual(actual_tensor_info.dtype, expected_tensor.dtype)
+    self.assertEqual(
+        len(actual_tensor_info.tensor_shape.dim), len(expected_tensor.shape))
+    for i in range(len(actual_tensor_info.tensor_shape.dim)):
+      self.assertEqual(actual_tensor_info.tensor_shape.dim[i].size,
+                       expected_tensor.shape[i])
+
+  def testSimpleSave(self):
+    """Test simple_save that uses the default parameters."""
+    export_dir = os.path.join(test.get_temp_dir(),
+                              "test_simple_save")
+
+    # Initialize input and output variables and save a prediction graph using
+    # the default parameters.
+    with self.test_session(graph=ops.Graph()) as sess:
+      var_x = self._init_and_validate_variable(sess, "var_x", 1)
+      var_y = self._init_and_validate_variable(sess, "var_y", 2)
+      inputs = {"x": var_x}
+      outputs = {"y": var_y}
+      utils.simple_save(sess, export_dir, inputs, outputs)
+
+    # Restore the graph with a valid tag and check the global variables and
+    # signature def map.
+    with self.test_session(graph=ops.Graph()) as sess:
+      graph = loader.load(sess, [tag_constants.SERVING], export_dir)
+      collection_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+
+      # Check value and metadata of the saved variables.
+      self.assertEqual(len(collection_vars), 2)
+      self.assertEqual(1, collection_vars[0].eval())
+      self.assertEqual(2, collection_vars[1].eval())
+      self._check_variable_info(collection_vars[0], var_x)
+      self._check_variable_info(collection_vars[1], var_y)
+
+      # Check that the appropriate signature_def_map is created with the
+      # default key and method name, and the specified inputs and outputs.
+      signature_def_map = graph.signature_def
+      self.assertEqual(1, len(signature_def_map))
+      self.assertEqual(signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
+                       list(signature_def_map.keys())[0])
+
+      signature_def = signature_def_map[
+          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+      self.assertEqual(signature_constants.PREDICT_METHOD_NAME,
+                       signature_def.method_name)
+
+      self.assertEqual(1, len(signature_def.inputs))
+      self._check_tensor_info(signature_def.inputs["x"], var_x)
+      self.assertEqual(1, len(signature_def.outputs))
+      self._check_tensor_info(signature_def.outputs["y"], var_y)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/seq2seq/__init__.py b/tensorflow/contrib/seq2seq/__init__.py
index d36d7e16dec1d00ec6fc028e36c338f23da78f03..c4abef268b0bf56c19672e1b4e5cc37162b3c86f 100644
--- a/tensorflow/contrib/seq2seq/__init__.py
+++ b/tensorflow/contrib/seq2seq/__init__.py
@@ -57,7 +57,13 @@ _allowed_symbols = [
     "AttentionWrapperState",
     "AttentionWrapper",
     "AttentionMechanism",
-    "tile_batch"]
+    "tile_batch",
+    "safe_cumprod",
+    "monotonic_attention",
+    "monotonic_probability_fn",
+    "BahdanauMonotonicAttention",
+    "LuongMonotonicAttention",
+]
 
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index 99e51589c9a1adb9172a209e9d68fab216217b86..91493302b1abb3dd0fbfe824a798e68f83cc9fc7 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -28,9 +28,11 @@ from tensorflow.contrib.seq2seq.python.ops import decoder
 from tensorflow.contrib.seq2seq.python.ops import attention_wrapper as wrapper
 from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
 from tensorflow.contrib.seq2seq.python.ops import basic_decoder
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn_cell
 from tensorflow.python.ops import variables
 from tensorflow.python.ops import variable_scope as vs
@@ -86,6 +88,30 @@ class AttentionWrapperTest(test.TestCase):
                          expected_final_alignment_history=None,
                          attention_layer_size=6,
                          name=''):
+    self._testWithMaybeMultiAttention(
+        is_multi=False,
+        create_attention_mechanisms=[create_attention_mechanism],
+        expected_final_output=expected_final_output,
+        expected_final_state=expected_final_state,
+        attention_mechanism_depths=[attention_mechanism_depth],
+        alignment_history=alignment_history,
+        expected_final_alignment_history=expected_final_alignment_history,
+        attention_layer_sizes=[attention_layer_size],
+        name=name)
+
+  def _testWithMaybeMultiAttention(self,
+                                   is_multi,
+                                   create_attention_mechanisms,
+                                   expected_final_output,
+                                   expected_final_state,
+                                   attention_mechanism_depths,
+                                   alignment_history=False,
+                                   expected_final_alignment_history=None,
+                                   attention_layer_sizes=None,
+                                   name=''):
+    # Allow is_multi to be True with a single mechanism to enable test for
+    # passing in a single mechanism in a list.
+    assert len(create_attention_mechanisms) == 1 or is_multi
     encoder_sequence_length = [3, 2, 3, 1, 1]
     decoder_sequence_length = [2, 0, 1, 2, 3]
     batch_size = 5
@@ -95,10 +121,12 @@ class AttentionWrapperTest(test.TestCase):
     encoder_output_depth = 10
     cell_depth = 9
 
-    if attention_layer_size is not None:
-      attention_depth = attention_layer_size
+    if attention_layer_sizes is None:
+      attention_depth = encoder_output_depth * len(create_attention_mechanisms)
     else:
-      attention_depth = encoder_output_depth
+      # Compute sum of attention_layer_sizes. Use encoder_output_depth if None.
+      attention_depth = sum([attention_layer_size or encoder_output_depth
+                             for attention_layer_size in attention_layer_sizes])
 
     decoder_inputs = array_ops.placeholder_with_default(
         np.random.randn(batch_size, decoder_max_time,
@@ -109,10 +137,12 @@ class AttentionWrapperTest(test.TestCase):
                         encoder_output_depth).astype(np.float32),
         shape=(None, None, encoder_output_depth))
 
-    attention_mechanism = create_attention_mechanism(
-        num_units=attention_mechanism_depth,
-        memory=encoder_outputs,
-        memory_sequence_length=encoder_sequence_length)
+    attention_mechanisms = [
+        creator(num_units=depth,
+                memory=encoder_outputs,
+                memory_sequence_length=encoder_sequence_length)
+        for creator, depth in zip(create_attention_mechanisms,
+                                  attention_mechanism_depths)]
 
     with self.test_session(use_gpu=True) as sess:
       with vs.variable_scope(
@@ -121,8 +151,9 @@ class AttentionWrapperTest(test.TestCase):
         cell = rnn_cell.LSTMCell(cell_depth)
         cell = wrapper.AttentionWrapper(
             cell,
-            attention_mechanism,
-            attention_layer_size=attention_layer_size,
+            attention_mechanisms if is_multi else attention_mechanisms[0],
+            attention_layer_size=(attention_layer_sizes if is_multi
+                                  else attention_layer_sizes[0]),
             alignment_history=alignment_history)
         helper = helper_py.TrainingHelper(decoder_inputs,
                                           decoder_sequence_length)
@@ -154,12 +185,23 @@ class AttentionWrapperTest(test.TestCase):
                        tuple(final_state.cell_state.h.get_shape().as_list()))
 
       if alignment_history:
-        state_alignment_history = final_state.alignment_history.stack()
+        if is_multi:
+          state_alignment_history = []
+          for history_array in final_state.alignment_history:
+            history = history_array.stack()
+            self.assertEqual(
+                (None, batch_size, None),
+                tuple(history.get_shape().as_list()))
+            state_alignment_history.append(history)
+          state_alignment_history = tuple(state_alignment_history)
+        else:
+          state_alignment_history = final_state.alignment_history.stack()
+          self.assertEqual(
+              (None, batch_size, None),
+              tuple(state_alignment_history.get_shape().as_list()))
         # Remove the history from final_state for purposes of the
         # remainder of the tests.
         final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
-        self.assertEqual((None, batch_size, None),
-                         tuple(state_alignment_history.get_shape().as_list()))
       else:
         state_alignment_history = ()
 
@@ -174,6 +216,7 @@ class AttentionWrapperTest(test.TestCase):
                                              sess_results['final_outputs'])
       final_state_info = nest.map_structure(get_result_summary,
                                             sess_results['final_state'])
+      print(name)
       print('Copy/paste:\nexpected_final_output = %s' % str(final_output_info))
       print('expected_final_state = %s' % str(final_state_info))
       nest.map_structure(self.assertAllCloseOrEqual, expected_final_output,
@@ -335,6 +378,393 @@ class AttentionWrapperTest(test.TestCase):
         attention_layer_size=None,
         name='testNotUseAttentionLayer')
 
+  def test_safe_cumprod(self):
+    # Create some random test input
+    test_input = np.random.uniform(size=(10, 20))
+
+    for axis in [0, 1]:
+      for exclusive in [True, False]:
+        with self.test_session():
+          # Compute cumprod with regular tf.cumprod
+          cumprod_output = math_ops.cumprod(
+              test_input, axis=axis, exclusive=exclusive).eval()
+          # Compute cumprod with safe_cumprod
+          safe_cumprod_output = wrapper.safe_cumprod(
+              test_input, axis=axis, exclusive=exclusive).eval()
+        for x, y in zip(cumprod_output.shape, safe_cumprod_output.shape):
+          self.assertEqual(x, y)
+        for x, y in zip(cumprod_output.flatten(),
+                        safe_cumprod_output.flatten()):
+          # Use assertAlmostEqual for the actual values due to floating point
+          self.assertAlmostEqual(x, y, places=5)
+
+  def test_monotonic_attention(self):
+    def monotonic_attention_explicit(p_choose_i, previous_attention):
+      """Explicitly compute monotonic attention distribution using numpy."""
+      # Base case for recurrence relation
+      out = [previous_attention[0]]
+      # Explicitly follow the recurrence relation
+      for j in range(1, p_choose_i.shape[0]):
+        out.append((1 - p_choose_i[j - 1])*out[j - 1] + previous_attention[j])
+      return p_choose_i*np.array(out)
+
+    # Generate a random batch of choosing probabilities for seq. len. 20
+    p_choose_i = np.random.uniform(size=(10, 20)).astype(np.float32)
+    # Generate random previous attention distributions
+    previous_attention = np.random.uniform(size=(10, 20)).astype(np.float32)
+    previous_attention /= previous_attention.sum(axis=1).reshape((-1, 1))
+
+    # Create the output to test against
+    explicit_output = np.array([
+        monotonic_attention_explicit(p, a)
+        for p, a in zip(p_choose_i, previous_attention)])
+
+    # Compute output with TensorFlow function, for both calculation types
+    with self.test_session():
+      recursive_output = wrapper.monotonic_attention(
+          p_choose_i, previous_attention, 'recursive').eval()
+
+    self.assertEqual(recursive_output.ndim, explicit_output.ndim)
+    for x, y in zip(recursive_output.shape, explicit_output.shape):
+      self.assertEqual(x, y)
+    for x, y in zip(recursive_output.flatten(), explicit_output.flatten()):
+      # Use assertAlmostEqual for the actual values due to floating point
+      self.assertAlmostEqual(x, y, places=5)
+
+    # Generate new p_choose_i for parallel, which is unstable when p_choose_i[n]
+    # is close to 1
+    p_choose_i = np.random.uniform(0, 0.9, size=(10, 20)).astype(np.float32)
+
+    # Create new output to test against
+    explicit_output = np.array([
+        monotonic_attention_explicit(p, a)
+        for p, a in zip(p_choose_i, previous_attention)])
+
+    # Compute output with TensorFlow function, for both calculation types
+    with self.test_session():
+      parallel_output = wrapper.monotonic_attention(
+          p_choose_i, previous_attention, 'parallel').eval()
+
+    self.assertEqual(parallel_output.ndim, explicit_output.ndim)
+    for x, y in zip(parallel_output.shape, explicit_output.shape):
+      self.assertEqual(x, y)
+    for x, y in zip(parallel_output.flatten(), explicit_output.flatten()):
+      # Use assertAlmostEqual for the actual values due to floating point
+      self.assertAlmostEqual(x, y, places=5)
+
+    # Now, test hard mode, where probabilities must be 0 or 1
+    p_choose_i = np.random.choice(np.array([0, 1], np.float32), (10, 20))
+    previous_attention = np.zeros((10, 20), np.float32)
+    # Randomly choose input sequence indices at each timestep
+    random_idx = np.random.randint(0, previous_attention.shape[1],
+                                   previous_attention.shape[0])
+    previous_attention[np.arange(previous_attention.shape[0]), random_idx] = 1
+
+    # Create the output to test against
+    explicit_output = np.array([
+        monotonic_attention_explicit(p, a)
+        for p, a in zip(p_choose_i, previous_attention)])
+
+    # Compute output with TensorFlow function, for both calculation types
+    with self.test_session():
+      hard_output = wrapper.monotonic_attention(
+          # TensorFlow is unhappy when these are not wrapped as tf.constant
+          constant_op.constant(p_choose_i),
+          constant_op.constant(previous_attention),
+          'hard').eval()
+
+    self.assertEqual(hard_output.ndim, explicit_output.ndim)
+    for x, y in zip(hard_output.shape, explicit_output.shape):
+      self.assertEqual(x, y)
+    for x, y in zip(hard_output.flatten(), explicit_output.flatten()):
+      # Use assertAlmostEqual for the actual values due to floating point
+      self.assertAlmostEqual(x, y, places=5)
+
+    # Now, test recursively computing attention distributions vs. sampling
+    def sample(p_choose_i):
+      """Generate a sequence of emit-ingest decisions from p_choose_i."""
+      output = np.zeros(p_choose_i.shape)
+      t_im1 = 0
+      for i in range(p_choose_i.shape[0]):
+        for j in range(t_im1, p_choose_i.shape[1]):
+          if np.random.uniform() <= p_choose_i[i, j]:
+            output[i, j] = 1
+            t_im1 = j
+            break
+        else:
+          t_im1 = p_choose_i.shape[1]
+      return output
+
+    # Now, the first axis is output timestep and second is input timestep
+    p_choose_i = np.random.uniform(size=(4, 5)).astype(np.float32)
+    # Generate the average of a bunch of samples
+    n_samples = 100000
+    sampled_output = np.mean(
+        [sample(p_choose_i) for _ in range(n_samples)], axis=0)
+
+    # Create initial previous_attention base case
+    recursive_output = [np.array([1] + [0]*(p_choose_i.shape[1] - 1),
+                                 np.float32)]
+    # Compute output with TensorFlow function, for both calculation types
+    with self.test_session():
+      for j in range(p_choose_i.shape[0]):
+        # Compute attention distribution for this output time step
+        recursive_output.append(wrapper.monotonic_attention(
+            # newaxis is for adding the expected batch dimension
+            p_choose_i[j][np.newaxis],
+            recursive_output[-1][np.newaxis], 'recursive').eval()[0])
+      # Stack together distributions; remove basecase
+      recursive_output = np.array(recursive_output[1:])
+
+    self.assertEqual(recursive_output.ndim, sampled_output.ndim)
+    for x, y in zip(recursive_output.shape, sampled_output.shape):
+      self.assertEqual(x, y)
+    for x, y in zip(recursive_output.flatten(), sampled_output.flatten()):
+      # Use a very forgiving threshold since we are sampling
+      self.assertAlmostEqual(x, y, places=2)
+
+  def testBahdanauMonotonicNotNormalized(self):
+    create_attention_mechanism = functools.partial(
+        wrapper.BahdanauMonotonicAttention, sigmoid_noise=1.0,
+        sigmoid_noise_seed=3)
+
+    expected_final_output = BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.002122893),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=1.7333333333333334))
+    expected_final_state = AttentionWrapperState(
+        cell_state=LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0040002423),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019968653)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=dtype('float32'), mean=-5.9313523e-05),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.032228071),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=dtype('float32'), mean=0.050430927)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        name='testBahdanauMonotonicNotNormalized')
+
+  def testBahdanauMonotonicNormalized(self):
+    create_attention_mechanism = functools.partial(
+        wrapper.BahdanauMonotonicAttention, normalize=True,
+        sigmoid_noise=1.0, sigmoid_noise_seed=3)
+
+    expected_final_output = BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0025896581),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=1.8666666666666667))
+    expected_final_state = AttentionWrapperState(
+        cell_state=LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0040013152),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019973689)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=dtype('float32'), mean=-0.00069823361),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.028698336),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=dtype('float32'), mean=0.046009291)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        name='testBahdanauMonotonicNormalized')
+
+  def testLuongMonotonicNotNormalized(self):
+    create_attention_mechanism = functools.partial(
+        wrapper.LuongMonotonicAttention, sigmoid_noise=1.0,
+        sigmoid_noise_seed=3)
+
+    expected_final_output = BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0021257224),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=1.7333333333333334))
+    expected_final_state = AttentionWrapperState(
+        cell_state=LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0040003359),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.001996913)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=dtype('float32'), mean=-5.2024145e-05),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.032198936),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=dtype('float32'), mean=0.050387777)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depth=9,
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        name='testLuongMonotonicNotNormalized')
+
+  def testLuongMonotonicScaled(self):
+    create_attention_mechanism = functools.partial(
+        wrapper.LuongMonotonicAttention, scale=True, sigmoid_noise=1.0,
+        sigmoid_noise_seed=3)
+
+    expected_final_output = BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0021257224),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=1.7333333333333334))
+    expected_final_state = AttentionWrapperState(
+        cell_state=LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0040003359),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.001996913)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=dtype('float32'), mean=-5.2024145e-05),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.032198936),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=dtype('float32'), mean=0.050387777)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depth=9,
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        name='testLuongMonotonicScaled')
+
+  def testMultiAttention(self):
+    create_attention_mechanisms = (
+        wrapper.BahdanauAttention, wrapper.LuongAttention)
+
+    expected_final_output = BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 7), dtype=dtype('float32'), mean=0.0011709079),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=3.2000000000000002))
+    expected_final_state = AttentionWrapperState(
+        cell_state=LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0038725811),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019329828)),
+        attention=ResultSummary(
+            shape=(5, 7), dtype=dtype('float32'), mean=0.001174294),
+        time=3,
+        alignments=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        alignment_history=())
+
+    expected_final_alignment_history = (
+        ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125),
+        ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125))
+
+    self._testWithMaybeMultiAttention(
+        True,
+        create_attention_mechanisms,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depths=[9, 9],
+        attention_layer_sizes=[3, 4],
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        name='testMultiAttention')
+
+  def testMultiAttentionNoAttentionLayer(self):
+    create_attention_mechanisms = (
+        wrapper.BahdanauAttention, wrapper.LuongAttention)
+
+    expected_final_output = BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 20), dtype=dtype('float32'), mean=0.11691988),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=7.2666666666666666))
+    expected_final_state = AttentionWrapperState(
+        cell_state=LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0036486709),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0018835809)),
+        attention=ResultSummary(
+            shape=(5, 20), dtype=dtype('float32'), mean=0.11680689),
+        time=3,
+        alignments=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        alignment_history=())
+    expected_final_alignment_history = (
+        ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125),
+        ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125))
+
+    self._testWithMaybeMultiAttention(
+        is_multi=True,
+        create_attention_mechanisms=create_attention_mechanisms,
+        expected_final_output=expected_final_output,
+        expected_final_state=expected_final_state,
+        attention_mechanism_depths=[9, 9],
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        name='testMultiAttention')
+
+  def testSingleAttentionAsList(self):
+    create_attention_mechanisms = [wrapper.BahdanauAttention]
+
+    expected_final_output = BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 3), dtype=dtype('float32'), mean=-0.0098485695),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=1.8))
+    expected_final_state = AttentionWrapperState(
+        cell_state=LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0040023471),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019979973)),
+        attention=ResultSummary(
+            shape=(5, 3), dtype=dtype('float32'), mean=-0.0098808752),
+        time=3,
+        alignments=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),),
+        alignment_history=())
+
+    expected_final_alignment_history = (
+        ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125),)
+
+    self._testWithMaybeMultiAttention(
+        is_multi=True,  # pass the AttentionMechanism wrapped in a list
+        create_attention_mechanisms=create_attention_mechanisms,
+        expected_final_output=expected_final_output,
+        expected_final_state=expected_final_state,
+        attention_mechanism_depths=[9],
+        attention_layer_sizes=[3],
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        name='testMultiAttention')
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
index cb12bc9450c626d861a726e8e79bf48d9f5a6b4f..c99562555a1a51c1d0a8441e23123ccba4cf4e3f 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
@@ -111,6 +111,8 @@ class BasicDecoderTest(test.TestCase):
                           sess_results["first_finished"])
       self.assertAllEqual([False, False, False, True, True],
                           sess_results["step_finished"])
+      self.assertEqual(output_dtype.sample_id,
+                       sess_results["step_outputs"].sample_id.dtype)
       self.assertAllEqual(
           np.argmax(sess_results["step_outputs"].rnn_output, -1),
           sess_results["step_outputs"].sample_id)
@@ -186,6 +188,8 @@ class BasicDecoderTest(test.TestCase):
       self.assertAllEqual([False, False, False, False, False],
                           sess_results["first_finished"])
       self.assertAllEqual(expected_step_finished, sess_results["step_finished"])
+      self.assertEqual(output_dtype.sample_id,
+                       sess_results["step_outputs"].sample_id.dtype)
       self.assertAllEqual(expected_sample_ids,
                           sess_results["step_outputs"].sample_id)
       self.assertAllEqual(expected_step_next_inputs,
@@ -254,6 +258,7 @@ class BasicDecoderTest(test.TestCase):
         })
 
         sample_ids = sess_results["step_outputs"].sample_id
+        self.assertEqual(output_dtype.sample_id, sample_ids.dtype)
         expected_step_finished = (sample_ids == end_token)
         expected_step_next_inputs = embeddings[sample_ids]
         self.assertAllEqual(expected_step_finished,
@@ -337,6 +342,7 @@ class BasicDecoderTest(test.TestCase):
       self.assertAllEqual([False, False, False, True, True],
                           sess_results["step_finished"])
       sample_ids = sess_results["step_outputs"].sample_id
+      self.assertEqual(output_dtype.sample_id, sample_ids.dtype)
       batch_where_not_sampling = np.where(sample_ids == -1)
       batch_where_sampling = np.where(sample_ids > -1)
       self.assertAllClose(
@@ -441,6 +447,7 @@ class BasicDecoderTest(test.TestCase):
                           sess_results["step_finished"])
 
       sample_ids = sess_results["step_outputs"].sample_id
+      self.assertEqual(output_dtype.sample_id, sample_ids.dtype)
       batch_where_not_sampling = np.where(np.logical_not(sample_ids))
       batch_where_sampling = np.where(sample_ids)
 
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
index 491d87f62d807c5ef5306b2e55eec7ebae889be6..3496b355b4b264e5839ddbfae7ef8e468150abba 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
@@ -65,7 +65,9 @@ class GatherTreeTest(test.TestCase):
         _ = beams.eval()
 
   def testBadParentValuesOnGPU(self):
-    if not test.is_gpu_available():
+    # Only want to run this test on CUDA devices, as gather_tree is not
+    # registered for SYCL devices.
+    if not test.is_gpu_available(cuda_only=True):
       return
     # (max_time = 4, batch_size = 1, beams = 3)
     # bad parent in beam 1 time 1; appears as a negative index at time 0
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index b659988a5601f12336ee9adb912bc2595d32a7a0..a162a919cf868dbf2761002534833fdd211f6728 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -19,8 +19,11 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import functools
 import math
 
+import numpy as np
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -28,9 +31,12 @@ from tensorflow.python.layers import base as layers_base
 from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope
@@ -44,6 +50,10 @@ __all__ = [
     "LuongAttention",
     "BahdanauAttention",
     "hardmax",
+    "safe_cumprod",
+    "monotonic_attention",
+    "BahdanauMonotonicAttention",
+    "LuongMonotonicAttention",
 ]
 
 
@@ -241,6 +251,66 @@ class _BaseAttentionMechanism(AttentionMechanism):
     return _zero_state_tensors(max_time, batch_size, dtype)
 
 
+def _luong_score(query, keys, scale):
+  """Implements Luong-style (multiplicative) scoring function.
+
+  This attention has two forms.  The first is standard Luong attention,
+  as described in:
+
+  Minh-Thang Luong, Hieu Pham, Christopher D. Manning.
+  "Effective Approaches to Attention-based Neural Machine Translation."
+  EMNLP 2015.  https://arxiv.org/abs/1508.04025
+
+  The second is the scaled form inspired partly by the normalized form of
+  Bahdanau attention.
+
+  To enable the second form, call this function with `scale=True`.
+
+  Args:
+    query: Tensor, shape `[batch_size, num_units]` to compare to keys.
+    keys: Processed memory, shape `[batch_size, max_time, num_units]`.
+    scale: Whether to apply a scale to the score function.
+
+  Returns:
+    A `[batch_size, max_time]` tensor of unnormalized score values.
+
+  Raises:
+    ValueError: If `key` and `query` depths do not match.
+  """
+  depth = query.get_shape()[-1]
+  key_units = keys.get_shape()[-1]
+  if depth != key_units:
+    raise ValueError(
+        "Incompatible or unknown inner dimensions between query and keys.  "
+        "Query (%s) has units: %s.  Keys (%s) have units: %s.  "
+        "Perhaps you need to set num_units to the keys' dimension (%s)?"
+        % (query, depth, keys, key_units, key_units))
+  dtype = query.dtype
+
+  # Reshape from [batch_size, depth] to [batch_size, 1, depth]
+  # for matmul.
+  query = array_ops.expand_dims(query, 1)
+
+  # Inner product along the query units dimension.
+  # matmul shapes: query is [batch_size, 1, depth] and
+  #                keys is [batch_size, max_time, depth].
+  # the inner product is asked to **transpose keys' inner shape** to get a
+  # batched matmul on:
+  #   [batch_size, 1, depth] . [batch_size, depth, max_time]
+  # resulting in an output shape of:
+  #   [batch_time, 1, max_time].
+  # we then squeee out the center singleton dimension.
+  score = math_ops.matmul(query, keys, transpose_b=True)
+  score = array_ops.squeeze(score, [1])
+
+  if scale:
+    # Scalar used in weight scaling
+    g = variable_scope.get_variable(
+        "attention_g", dtype=dtype, initializer=1.)
+    score = g * score
+  return score
+
+
 class LuongAttention(_BaseAttentionMechanism):
   """Implements Luong-style (multiplicative) attention scoring.
 
@@ -317,47 +387,66 @@ class LuongAttention(_BaseAttentionMechanism):
       alignments: Tensor of dtype matching `self.values` and shape
         `[batch_size, alignments_size]` (`alignments_size` is memory's
         `max_time`).
-
-    Raises:
-      ValueError: If `key` and `query` depths do not match.
     """
-    depth = query.get_shape()[-1]
-    key_units = self.keys.get_shape()[-1]
-    if depth != key_units:
-      raise ValueError(
-          "Incompatible or unknown inner dimensions between query and keys.  "
-          "Query (%s) has units: %s.  Keys (%s) have units: %s.  "
-          "Perhaps you need to set num_units to the keys' dimension (%s)?"
-          % (query, depth, self.keys, key_units, key_units))
-    dtype = query.dtype
-
     with variable_scope.variable_scope(None, "luong_attention", [query]):
-      # Reshape from [batch_size, depth] to [batch_size, 1, depth]
-      # for matmul.
-      query = array_ops.expand_dims(query, 1)
-
-      # Inner product along the query units dimension.
-      # matmul shapes: query is [batch_size, 1, depth] and
-      #                keys is [batch_size, max_time, depth].
-      # the inner product is asked to **transpose keys' inner shape** to get a
-      # batched matmul on:
-      #   [batch_size, 1, depth] . [batch_size, depth, max_time]
-      # resulting in an output shape of:
-      #   [batch_time, 1, max_time].
-      # we then squeee out the center singleton dimension.
-      score = math_ops.matmul(query, self.keys, transpose_b=True)
-      score = array_ops.squeeze(score, [1])
-
-      if self._scale:
-        # Scalar used in weight scaling
-        g = variable_scope.get_variable(
-            "attention_g", dtype=dtype, initializer=1.)
-        score = g * score
-
+      score = _luong_score(query, self._keys, self._scale)
     alignments = self._probability_fn(score, previous_alignments)
     return alignments
 
 
+def _bahdanau_score(processed_query, keys, normalize):
+  """Implements Bahdanau-style (additive) scoring function.
+
+  This attention has two forms.  The first is Bhandanau attention,
+  as described in:
+
+  Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio.
+  "Neural Machine Translation by Jointly Learning to Align and Translate."
+  ICLR 2015. https://arxiv.org/abs/1409.0473
+
+  The second is the normalized form.  This form is inspired by the
+  weight normalization article:
+
+  Tim Salimans, Diederik P. Kingma.
+  "Weight Normalization: A Simple Reparameterization to Accelerate
+   Training of Deep Neural Networks."
+  https://arxiv.org/abs/1602.07868
+
+  To enable the second form, set `normalize=True`.
+
+  Args:
+    processed_query: Tensor, shape `[batch_size, num_units]` to compare to keys.
+    keys: Processed memory, shape `[batch_size, max_time, num_units]`.
+    normalize: Whether to normalize the score function.
+
+  Returns:
+    A `[batch_size, max_time]` tensor of unnormalized score values.
+  """
+  dtype = processed_query.dtype
+  # Get the number of hidden units from the trailing dimension of keys
+  num_units = keys.shape[2].value or array_ops.shape(keys)[2]
+  # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
+  processed_query = array_ops.expand_dims(processed_query, 1)
+  v = variable_scope.get_variable(
+      "attention_v", [num_units], dtype=dtype)
+  if normalize:
+    # Scalar used in weight normalization
+    g = variable_scope.get_variable(
+        "attention_g", dtype=dtype,
+        initializer=math.sqrt((1. / num_units)))
+    # Bias added prior to the nonlinearity
+    b = variable_scope.get_variable(
+        "attention_b", [num_units], dtype=dtype,
+        initializer=init_ops.zeros_initializer())
+    # normed_v = g * v / ||v||
+    normed_v = g * v * math_ops.rsqrt(
+        math_ops.reduce_sum(math_ops.square(v)))
+    return math_ops.reduce_sum(
+        normed_v * math_ops.tanh(keys + processed_query + b), [2])
+  else:
+    return math_ops.reduce_sum(v * math_ops.tanh(keys + processed_query), [2])
+
+
 class BahdanauAttention(_BaseAttentionMechanism):
   """Implements Bahdanau-style (additive) attention.
 
@@ -441,30 +530,377 @@ class BahdanauAttention(_BaseAttentionMechanism):
     """
     with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
       processed_query = self.query_layer(query) if self.query_layer else query
-      dtype = processed_query.dtype
-      # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
-      processed_query = array_ops.expand_dims(processed_query, 1)
-      keys = self._keys
-      v = variable_scope.get_variable(
-          "attention_v", [self._num_units], dtype=dtype)
-      if self._normalize:
-        # Scalar used in weight normalization
-        g = variable_scope.get_variable(
-            "attention_g", dtype=dtype,
-            initializer=math.sqrt((1. / self._num_units)))
-        # Bias added prior to the nonlinearity
-        b = variable_scope.get_variable(
-            "attention_b", [self._num_units], dtype=dtype,
-            initializer=init_ops.zeros_initializer())
-        # normed_v = g * v / ||v||
-        normed_v = g * v * math_ops.rsqrt(
-            math_ops.reduce_sum(math_ops.square(v)))
-        score = math_ops.reduce_sum(
-            normed_v * math_ops.tanh(keys + processed_query + b), [2])
-      else:
-        score = math_ops.reduce_sum(v * math_ops.tanh(keys + processed_query),
-                                    [2])
+      score = _bahdanau_score(processed_query, self._keys, self._normalize)
+    alignments = self._probability_fn(score, previous_alignments)
+    return alignments
+
+
+def safe_cumprod(x, *args, **kwargs):
+  """Computes cumprod of x in logspace using cumsum to avoid underflow.
+
+  The cumprod function and its gradient can result in numerical instabilities
+  when its argument has very small and/or zero values.  As long as the argument
+  is all positive, we can instead compute the cumulative product as
+  exp(cumsum(log(x))).  This function can be called identically to tf.cumprod.
+
+  Args:
+    x: Tensor to take the cumulative product of.
+    *args: Passed on to cumsum; these are identical to those in cumprod.
+    **kwargs: Passed on to cumsum; these are identical to those in cumprod.
+  Returns:
+    Cumulative product of x.
+  """
+  with ops.name_scope(None, "SafeCumprod", [x]):
+    x = ops.convert_to_tensor(x, name="x")
+    tiny = np.finfo(x.dtype.as_numpy_dtype).tiny
+    return math_ops.exp(math_ops.cumsum(
+        math_ops.log(clip_ops.clip_by_value(x, tiny, 1)), *args, **kwargs))
 
+
+def monotonic_attention(p_choose_i, previous_attention, mode):
+  """Compute monotonic attention distribution from choosing probabilities.
+
+  Monotonic attention implies that the input sequence is processed in an
+  explicitly left-to-right manner when generating the output sequence.  In
+  addition, once an input sequence element is attended to at a given output
+  timestep, elements occurring before it cannot be attended to at subsequent
+  output timesteps.  This function generates attention distributions according
+  to these assumptions.  For more information, see ``Online and Linear-Time
+  Attention by Enforcing Monotonic Alignments''.
+
+  Args:
+    p_choose_i: Probability of choosing input sequence/memory element i.  Should
+      be of shape (batch_size, input_sequence_length), and should all be in the
+      range [0, 1].
+    previous_attention: The attention distribution from the previous output
+      timestep.  Should be of shape (batch_size, input_sequence_length).  For
+      the first output timestep, preevious_attention[n] should be [1, 0, 0, ...,
+      0] for all n in [0, ... batch_size - 1].
+    mode: How to compute the attention distribution.  Must be one of
+      'recursive', 'parallel', or 'hard'.
+        * 'recursive' uses tf.scan to recursively compute the distribution.
+          This is slowest but is exact, general, and does not suffer from
+          numerical instabilities.
+        * 'parallel' uses parallelized cumulative-sum and cumulative-product
+          operations to compute a closed-form solution to the recurrence
+          relation defining the attention distribution.  This makes it more
+          efficient than 'recursive', but it requires numerical checks which
+          make the distribution non-exact.  This can be a problem in particular
+          when input_sequence_length is long and/or p_choose_i has entries very
+          close to 0 or 1.
+        * 'hard' requires that the probabilities in p_choose_i are all either 0
+          or 1, and subsequently uses a more efficient and exact solution.
+
+  Returns:
+    A tensor of shape (batch_size, input_sequence_length) representing the
+    attention distributions for each sequence in the batch.
+
+  Raises:
+    ValueError: mode is not one of 'recursive', 'parallel', 'hard'.
+  """
+  # Force things to be tensors
+  p_choose_i = ops.convert_to_tensor(p_choose_i, name="p_choose_i")
+  previous_attention = ops.convert_to_tensor(
+      previous_attention, name="previous_attention")
+  if mode == "recursive":
+    # Use .shape[0].value when it's not None, or fall back on symbolic shape
+    batch_size = p_choose_i.shape[0].value or array_ops.shape(p_choose_i)[0]
+    # Compute [1, 1 - p_choose_i[0], 1 - p_choose_i[1], ..., 1 - p_choose_i[-2]]
+    shifted_1mp_choose_i = array_ops.concat(
+        [array_ops.ones((batch_size, 1)), 1 - p_choose_i[:, :-1]], 1)
+    # Compute attention distribution recursively as
+    # q[i] = (1 - p_choose_i[i])*q[i - 1] + previous_attention[i]
+    # attention[i] = p_choose_i[i]*q[i]
+    attention = p_choose_i*array_ops.transpose(functional_ops.scan(
+        # Need to use reshape to remind TF of the shape between loop iterations
+        lambda x, yz: array_ops.reshape(yz[0]*x + yz[1], (batch_size,)),
+        # Loop variables yz[0] and yz[1]
+        [array_ops.transpose(shifted_1mp_choose_i),
+         array_ops.transpose(previous_attention)],
+        # Initial value of x is just zeros
+        array_ops.zeros((batch_size,))))
+  elif mode == "parallel":
+    # safe_cumprod computes cumprod in logspace with numeric checks
+    cumprod_1mp_choose_i = safe_cumprod(1 - p_choose_i, axis=1, exclusive=True)
+    # Compute recurrence relation solution
+    attention = p_choose_i*cumprod_1mp_choose_i*math_ops.cumsum(
+        previous_attention /
+        # Clip cumprod_1mp to avoid divide-by-zero
+        clip_ops.clip_by_value(cumprod_1mp_choose_i, 1e-10, 1.), axis=1)
+  elif mode == "hard":
+    # Remove any probabilities before the index chosen last time step
+    p_choose_i *= math_ops.cumsum(previous_attention, axis=1)
+    # Now, use exclusive cumprod to remove probabilities after the first
+    # chosen index, like so:
+    # p_choose_i = [0, 0, 0, 1, 1, 0, 1, 1]
+    # cumprod(1 - p_choose_i, exclusive=True) = [1, 1, 1, 1, 0, 0, 0, 0]
+    # Product of above: [0, 0, 0, 1, 0, 0, 0, 0]
+    attention = p_choose_i*math_ops.cumprod(
+        1 - p_choose_i, axis=1, exclusive=True)
+  else:
+    raise ValueError("mode must be 'recursive', 'parallel', or 'hard'.")
+  return attention
+
+
+def _monotonic_probability_fn(score, previous_alignments, sigmoid_noise, mode,
+                              seed=None):
+  """Attention probability function for monotonic attention.
+
+  Takes in unnormalized attention scores, adds pre-sigmoid noise to encourage
+  the model to make discrete attention decisions, passes them through a sigmoid
+  to obtain "choosing" probabilities, and then calls monotonic_attention to
+  obtain the attention distribution.  For more information, see
+
+  Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas Eck,
+  "Online and Linear-Time Attention by Enforcing Monotonic Alignments."
+  ICML 2017.  https://arxiv.org/abs/1704.00784
+
+  Args:
+    score: Unnormalized attention scores, shape `[batch_size, alignments_size]`
+    previous_alignments: Previous attention distribution, shape
+      `[batch_size, alignments_size]`
+    sigmoid_noise: Standard deviation of pre-sigmoid noise.  Setting this larger
+      than 0 will encourage the model to produce large attention scores,
+      effectively making the choosing probabilities discrete and the resulting
+      attention distribution one-hot.  It should be set to 0 at test-time, and
+      when hard attention is not desired.
+    mode: How to compute the attention distribution.  Must be one of
+      'recursive', 'parallel', or 'hard'.  See the docstring for
+      `tf.contrib.seq2seq.monotonic_attention` for more information.
+    seed: (optional) Random seed for pre-sigmoid noise.
+
+  Returns:
+    A `[batch_size, alignments_size]`-shape tensor corresponding to the
+    resulting attention distribution.
+  """
+  # Optionally add pre-sigmoid noise to the scores
+  if sigmoid_noise > 0:
+    noise = random_ops.random_normal(array_ops.shape(score), dtype=score.dtype,
+                                     seed=seed)
+    score += sigmoid_noise*noise
+  # Compute "choosing" probabilities from the attention scores
+  p_choose_i = math_ops.sigmoid(score)
+  # Convert from choosing probabilities to attention distribution
+  return monotonic_attention(p_choose_i, previous_alignments, mode)
+
+
+class _BaseMonotonicAttentionMechanism(_BaseAttentionMechanism):
+  """Base attention mechanism for monotonic attention.
+
+  Simply overrides the initial_alignments function to provide a dirac
+  distribution,which is needed in order for the monotonic attention
+  distributions to have the correct behavior.
+  """
+
+  def initial_alignments(self, batch_size, dtype):
+    """Creates the initial alignment values for the monotonic attentions.
+
+    Initializes to dirac distributions, i.e. [1, 0, 0, ...memory length..., 0]
+    for all entries in the batch.
+
+    Args:
+      batch_size: `int32` scalar, the batch_size.
+      dtype: The `dtype`.
+
+    Returns:
+      A `dtype` tensor shaped `[batch_size, alignments_size]`
+      (`alignments_size` is the values' `max_time`).
+    """
+    max_time = self._alignments_size
+    return array_ops.one_hot(
+        array_ops.zeros((batch_size,), dtype=dtypes.int32), max_time,
+        dtype=dtype)
+
+
+class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
+  """Monotonic attention mechanism with Bahadanau-style energy function.
+
+  This type of attention encorces a monotonic constraint on the attention
+  distributions; that is once the model attends to a given point in the memory
+  it can't attend to any prior points at subsequence output timesteps.  It
+  achieves this by using the _monotonic_probability_fn instead of softmax to
+  construct its attention distributions.  Since the attention scores are passed
+  through a sigmoid, a learnable scalar bias parameter is applied after the
+  score function and before the sigmoid.  Otherwise, it is equivalent to
+  BahdanauAttention.  This approach is proposed in
+
+  Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas Eck,
+  "Online and Linear-Time Attention by Enforcing Monotonic Alignments."
+  ICML 2017.  https://arxiv.org/abs/1704.00784
+  """
+
+  def __init__(self,
+               num_units,
+               memory,
+               memory_sequence_length=None,
+               normalize=False,
+               score_mask_value=float("-inf"),
+               sigmoid_noise=0.,
+               sigmoid_noise_seed=None,
+               score_bias_init=0.,
+               mode="parallel",
+               name="BahdanauMonotonicAttention"):
+    """Construct the Attention mechanism.
+
+    Args:
+      num_units: The depth of the query mechanism.
+      memory: The memory to query; usually the output of an RNN encoder.  This
+        tensor should be shaped `[batch_size, max_time, ...]`.
+      memory_sequence_length (optional): Sequence lengths for the batch entries
+        in memory.  If provided, the memory tensor rows are masked with zeros
+        for values past the respective sequence lengths.
+      normalize: Python boolean.  Whether to normalize the energy term.
+      score_mask_value: (optional): The mask value for score before passing into
+        `probability_fn`. The default is -inf. Only used if
+        `memory_sequence_length` is not None.
+      sigmoid_noise: Standard deviation of pre-sigmoid noise.  See the docstring
+        for `_monotonic_probability_fn` for more information.
+      sigmoid_noise_seed: (optional) Random seed for pre-sigmoid noise.
+      score_bias_init: Initial value for score bias scalar.  It's recommended to
+        initialize this to a negative value when the length of the memory is
+        large.
+      mode: How to compute the attention distribution.  Must be one of
+        'recursive', 'parallel', or 'hard'.  See the docstring for
+        `tf.contrib.seq2seq.monotonic_attention` for more information.
+      name: Name to use when creating ops.
+    """
+    # Set up the monotonic probability fn with supplied parameters
+    wrapped_probability_fn = functools.partial(
+        _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
+        seed=sigmoid_noise_seed)
+    super(BahdanauMonotonicAttention, self).__init__(
+        query_layer=layers_core.Dense(
+            num_units, name="query_layer", use_bias=False),
+        memory_layer=layers_core.Dense(
+            num_units, name="memory_layer", use_bias=False),
+        memory=memory,
+        probability_fn=wrapped_probability_fn,
+        memory_sequence_length=memory_sequence_length,
+        score_mask_value=score_mask_value,
+        name=name)
+    self._num_units = num_units
+    self._normalize = normalize
+    self._name = name
+    self._score_bias_init = score_bias_init
+
+  def __call__(self, query, previous_alignments):
+    """Score the query based on the keys and values.
+
+    Args:
+      query: Tensor of dtype matching `self.values` and shape
+        `[batch_size, query_depth]`.
+      previous_alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]`
+        (`alignments_size` is memory's `max_time`).
+
+    Returns:
+      alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]` (`alignments_size` is memory's
+        `max_time`).
+    """
+    with variable_scope.variable_scope(
+        None, "bahdanau_monotonic_attention", [query]):
+      processed_query = self.query_layer(query) if self.query_layer else query
+      score = _bahdanau_score(processed_query, self._keys, self._normalize)
+      score_bias = variable_scope.get_variable(
+          "attention_score_bias", dtype=processed_query.dtype,
+          initializer=self._score_bias_init)
+      score += score_bias
+    alignments = self._probability_fn(score, previous_alignments)
+    return alignments
+
+
+class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
+  """Monotonic attention mechanism with Luong-style energy function.
+
+  This type of attention encorces a monotonic constraint on the attention
+  distributions; that is once the model attends to a given point in the memory
+  it can't attend to any prior points at subsequence output timesteps.  It
+  achieves this by using the _monotonic_probability_fn instead of softmax to
+  construct its attention distributions.  Otherwise, it is equivalent to
+  LuongAttention.  This approach is proposed in
+
+  Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas Eck,
+  "Online and Linear-Time Attention by Enforcing Monotonic Alignments."
+  ICML 2017.  https://arxiv.org/abs/1704.00784
+  """
+
+  def __init__(self,
+               num_units,
+               memory,
+               memory_sequence_length=None,
+               scale=False,
+               score_mask_value=float("-inf"),
+               sigmoid_noise=0.,
+               sigmoid_noise_seed=None,
+               score_bias_init=0.,
+               mode="parallel",
+               name="LuongMonotonicAttention"):
+    """Construct the Attention mechanism.
+
+    Args:
+      num_units: The depth of the query mechanism.
+      memory: The memory to query; usually the output of an RNN encoder.  This
+        tensor should be shaped `[batch_size, max_time, ...]`.
+      memory_sequence_length (optional): Sequence lengths for the batch entries
+        in memory.  If provided, the memory tensor rows are masked with zeros
+        for values past the respective sequence lengths.
+      scale: Python boolean.  Whether to scale the energy term.
+      score_mask_value: (optional): The mask value for score before passing into
+        `probability_fn`. The default is -inf. Only used if
+        `memory_sequence_length` is not None.
+      sigmoid_noise: Standard deviation of pre-sigmoid noise.  See the docstring
+        for `_monotonic_probability_fn` for more information.
+      sigmoid_noise_seed: (optional) Random seed for pre-sigmoid noise.
+      score_bias_init: Initial value for score bias scalar.  It's recommended to
+        initialize this to a negative value when the length of the memory is
+        large.
+      mode: How to compute the attention distribution.  Must be one of
+        'recursive', 'parallel', or 'hard'.  See the docstring for
+        `tf.contrib.seq2seq.monotonic_attention` for more information.
+      name: Name to use when creating ops.
+    """
+    # Set up the monotonic probability fn with supplied parameters
+    wrapped_probability_fn = functools.partial(
+        _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
+        seed=sigmoid_noise_seed)
+    super(LuongMonotonicAttention, self).__init__(
+        query_layer=layers_core.Dense(
+            num_units, name="query_layer", use_bias=False),
+        memory_layer=layers_core.Dense(
+            num_units, name="memory_layer", use_bias=False),
+        memory=memory,
+        probability_fn=wrapped_probability_fn,
+        memory_sequence_length=memory_sequence_length,
+        score_mask_value=score_mask_value,
+        name=name)
+    self._num_units = num_units
+    self._scale = scale
+    self._score_bias_init = score_bias_init
+    self._name = name
+
+  def __call__(self, query, previous_alignments):
+    """Score the query based on the keys and values.
+
+    Args:
+      query: Tensor of dtype matching `self.values` and shape
+        `[batch_size, query_depth]`.
+      previous_alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]`
+        (`alignments_size` is memory's `max_time`).
+
+    Returns:
+      alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]` (`alignments_size` is memory's
+        `max_time`).
+    """
+    with variable_scope.variable_scope(None, "luong_monotonic_attention",
+                                       [query]):
+      score = _luong_score(query, self._keys, self._scale)
+      score_bias = variable_scope.get_variable(
+          "attention_score_bias", dtype=query.dtype,
+          initializer=self._score_bias_init)
+      score += score_bias
     alignments = self._probability_fn(score, previous_alignments)
     return alignments
 
@@ -481,9 +917,11 @@ class AttentionWrapperState(
       step.
     - `attention`: The attention emitted at the previous time step.
     - `time`: int32 scalar containing the current time step.
-    - `alignments`: The alignment emitted at the previous time step.
-    - `alignment_history`: (if enabled) a `TensorArray` containing alignment
-       matrices from all time steps.  Call `stack()` to convert to a `Tensor`.
+    - `alignments`: A single or tuple of `Tensor`(s) containing the alignments
+       emitted at the previous time step for each attention mechanism.
+    - `alignment_history`: (if enabled) a single or tuple of `TensorArray`(s)
+       containing alignment matrices from all time steps for each attention
+       mechanism. Call `stack()` on each to convert to a `Tensor`.
   """
 
   def clone(self, **kwargs):
@@ -528,6 +966,34 @@ def hardmax(logits, name=None):
         math_ops.argmax(logits, -1), depth, dtype=logits.dtype)
 
 
+def _compute_attention(attention_mechanism, cell_output, previous_alignments,
+                       attention_layer):
+  """Computes the attention and alignments for a given attention_mechanism."""
+  alignments = attention_mechanism(
+      cell_output, previous_alignments=previous_alignments)
+
+  # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time]
+  expanded_alignments = array_ops.expand_dims(alignments, 1)
+  # Context is the inner product of alignments and values along the
+  # memory time dimension.
+  # alignments shape is
+  #   [batch_size, 1, memory_time]
+  # attention_mechanism.values shape is
+  #   [batch_size, memory_time, attention_mechanism.num_units]
+  # the batched matmul is over memory_time, so the output shape is
+  #   [batch_size, 1, attention_mechanism.num_units].
+  # we then squeeze out the singleton dim.
+  context = math_ops.matmul(expanded_alignments, attention_mechanism.values)
+  context = array_ops.squeeze(context, [1])
+
+  if attention_layer is not None:
+    attention = attention_layer(array_ops.concat([cell_output, context], 1))
+  else:
+    attention = context
+
+  return attention, alignments
+
+
 class AttentionWrapper(rnn_cell_impl.RNNCell):
   """Wraps another `RNNCell` with attention.
   """
@@ -545,11 +1011,14 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
 
     Args:
       cell: An instance of `RNNCell`.
-      attention_mechanism: An instance of `AttentionMechanism`.
-      attention_layer_size: Python integer, the depth of the attention (output)
-        layer. If None (default), use the context as attention at each time
-        step. Otherwise, feed the context and cell output into the attention
-        layer to generate attention at each time step.
+      attention_mechanism: A list of `AttentionMechanism` instances or a single
+        instance.
+      attention_layer_size: A list of Python integers or a single Python
+        integer, the depth of the attention (output) layer(s). If None
+        (default), use the context as attention at each time step. Otherwise,
+        feed the context and cell output into the attention layer to generate
+        attention at each time step. If attention_mechanism is a list,
+        attention_layer_size must be a list of the same length.
       alignment_history: Python boolean, whether to store alignment history
         from all time steps in the final output state (currently stored as a
         time major `TensorArray` on which you must call `stack()`).
@@ -569,15 +1038,35 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         does not match the batch size of `initial_cell_state`, proper
         behavior is not guaranteed.
       name: Name to use when creating ops.
+
+    Raises:
+      TypeError: `attention_layer_size` is not None and (`attention_mechanism`
+        is a list but `attention_layer_size` is not; or vice versa).
+      ValueError: if `attention_layer_size` is not None, `attention_mechanism`
+        is a list, and its length does not match that of `attention_layer_size`.
     """
     super(AttentionWrapper, self).__init__(name=name)
     if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
       raise TypeError(
           "cell must be an RNNCell, saw type: %s" % type(cell).__name__)
-    if not isinstance(attention_mechanism, AttentionMechanism):
-      raise TypeError(
-          "attention_mechanism must be a AttentionMechanism, saw type: %s"
-          % type(attention_mechanism).__name__)
+    if isinstance(attention_mechanism, (list, tuple)):
+      self._is_multi = True
+      attention_mechanisms = attention_mechanism
+      for attention_mechanism in attention_mechanisms:
+        if not isinstance(attention_mechanism, AttentionMechanism):
+          raise TypeError(
+              "attention_mechanism must contain only instances of "
+              "AttentionMechanism, saw type: %s"
+              % type(attention_mechanism).__name__)
+    else:
+      self._is_multi = False
+      if not isinstance(attention_mechanism, AttentionMechanism):
+        raise TypeError(
+            "attention_mechanism must be an AttentionMechanism or list of "
+            "multiple AttentionMechanism instances, saw type: %s"
+            % type(attention_mechanism).__name__)
+      attention_mechanisms = (attention_mechanism,)
+
     if cell_input_fn is None:
       cell_input_fn = (
           lambda inputs, attention: array_ops.concat([inputs, attention], -1))
@@ -588,16 +1077,28 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
             % type(cell_input_fn).__name__)
 
     if attention_layer_size is not None:
-      self._attention_layer = layers_core.Dense(
-          attention_layer_size, name="attention_layer", use_bias=False)
-      self._attention_layer_size = attention_layer_size
+      attention_layer_sizes = tuple(
+          attention_layer_size
+          if isinstance(attention_layer_size, (list, tuple))
+          else (attention_layer_size,))
+      if len(attention_layer_sizes) != len(attention_mechanisms):
+        raise ValueError(
+            "If provided, attention_layer_size must contain exactly one "
+            "integer per attention_mechanism, saw: %d vs %d"
+            % (len(attention_layer_sizes), len(attention_mechanisms)))
+      self._attention_layers = tuple(
+          layers_core.Dense(
+              attention_layer_size, name="attention_layer", use_bias=False)
+          for attention_layer_size in attention_layer_sizes)
+      self._attention_layer_size = sum(attention_layer_sizes)
     else:
-      self._attention_layer = None
-      self._attention_layer_size = attention_mechanism.values.get_shape()[
-          -1].value
+      self._attention_layers = None
+      self._attention_layer_size = sum(
+          attention_mechanism.values.get_shape()[-1].value
+          for attention_mechanism in attention_mechanisms)
 
     self._cell = cell
-    self._attention_mechanism = attention_mechanism
+    self._attention_mechanisms = attention_mechanisms
     self._cell_input_fn = cell_input_fn
     self._output_attention = output_attention
     self._alignment_history = alignment_history
@@ -617,13 +1118,36 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
             "via the tf.contrib.seq2seq.tile_batch function with argument "
             "multiple=beam_width.")
         with ops.control_dependencies(
-            [check_ops.assert_equal(state_batch_size,
-                                    self._attention_mechanism.batch_size,
-                                    message=error_message)]):
+            self._batch_size_checks(state_batch_size, error_message)):
           self._initial_cell_state = nest.map_structure(
               lambda s: array_ops.identity(s, name="check_initial_cell_state"),
               initial_cell_state)
 
+  def _batch_size_checks(self, batch_size, error_message):
+    return [check_ops.assert_equal(batch_size,
+                                   attention_mechanism.batch_size,
+                                   message=error_message)
+            for attention_mechanism in self._attention_mechanisms]
+
+  def _item_or_tuple(self, seq):
+    """Returns `seq` as tuple or the singular element.
+
+    Which is returned is determined by how the AttentionMechanism(s) were passed
+    to the constructor.
+
+    Args:
+      seq: A non-empty sequence of items or generator.
+
+    Returns:
+       Either the values in the sequence as a tuple if AttentionMechanism(s)
+       were passed to the constructor as a sequence or the singular element.
+    """
+    t = tuple(seq)
+    if self._is_multi:
+      return t
+    else:
+      return t[0]
+
   @property
   def output_size(self):
     if self._output_attention:
@@ -637,8 +1161,10 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         cell_state=self._cell.state_size,
         time=tensor_shape.TensorShape([]),
         attention=self._attention_layer_size,
-        alignments=self._attention_mechanism.alignments_size,
-        alignment_history=())  # alignment_history is sometimes a TensorArray
+        alignments=self._item_or_tuple(
+            a.alignments_size for a in self._attention_mechanisms),
+        alignment_history=self._item_or_tuple(
+            () for _ in self._attention_mechanisms))  # sometimes a TensorArray
 
   def zero_state(self, batch_size, dtype):
     with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
@@ -655,25 +1181,23 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
           "the batch_size= argument passed to zero_state is "
           "batch_size * beam_width.")
       with ops.control_dependencies(
-          [check_ops.assert_equal(batch_size,
-                                  self._attention_mechanism.batch_size,
-                                  message=error_message)]):
+          self._batch_size_checks(batch_size, error_message)):
         cell_state = nest.map_structure(
             lambda s: array_ops.identity(s, name="checked_cell_state"),
             cell_state)
-      if self._alignment_history:
-        alignment_history = tensor_array_ops.TensorArray(
-            dtype=dtype, size=0, dynamic_size=True)
-      else:
-        alignment_history = ()
       return AttentionWrapperState(
           cell_state=cell_state,
           time=array_ops.zeros([], dtype=dtypes.int32),
           attention=_zero_state_tensors(self._attention_layer_size, batch_size,
                                         dtype),
-          alignments=self._attention_mechanism.initial_alignments(
-              batch_size, dtype),
-          alignment_history=alignment_history)
+          alignments=self._item_or_tuple(
+              attention_mechanism.initial_alignments(batch_size, dtype)
+              for attention_mechanism in self._attention_mechanisms),
+          alignment_history=self._item_or_tuple(
+              tensor_array_ops.TensorArray(dtype=dtype, size=0,
+                                           dynamic_size=True)
+              if self._alignment_history else ()
+              for _ in self._attention_mechanisms))
 
   def call(self, inputs, state):
     """Perform a step of attention-wrapped RNN.
@@ -701,7 +1225,14 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
       - `attention_or_cell_output` depending on `output_attention`.
       - `next_state` is an instance of `AttentionWrapperState`
          containing the state calculated at this time step.
+
+    Raises:
+      TypeError: If `state` is not an instance of `AttentionWrapperState`.
     """
+    if not isinstance(state, AttentionWrapperState):
+      raise TypeError("Expected state to be instance of AttentionWrapperState. "
+                      "Received type %s instead."  % type(state))
+
     # Step 1: Calculate the true inputs to the cell based on the
     # previous attention value.
     cell_inputs = self._cell_input_fn(inputs, state.attention)
@@ -718,48 +1249,38 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         "the tf.contrib.seq2seq.tile_batch function with argument "
         "multiple=beam_width.")
     with ops.control_dependencies(
-        [check_ops.assert_equal(cell_batch_size,
-                                self._attention_mechanism.batch_size,
-                                message=error_message)]):
+        self._batch_size_checks(cell_batch_size, error_message)):
       cell_output = array_ops.identity(
           cell_output, name="checked_cell_output")
 
-    alignments = self._attention_mechanism(
-        cell_output, previous_alignments=state.alignments)
-
-    # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time]
-    expanded_alignments = array_ops.expand_dims(alignments, 1)
-    # Context is the inner product of alignments and values along the
-    # memory time dimension.
-    # alignments shape is
-    #   [batch_size, 1, memory_time]
-    # attention_mechanism.values shape is
-    #   [batch_size, memory_time, attention_mechanism.num_units]
-    # the batched matmul is over memory_time, so the output shape is
-    #   [batch_size, 1, attention_mechanism.num_units].
-    # we then squeeze out the singleton dim.
-    attention_mechanism_values = self._attention_mechanism.values
-    context = math_ops.matmul(expanded_alignments, attention_mechanism_values)
-    context = array_ops.squeeze(context, [1])
-
-    if self._attention_layer is not None:
-      attention = self._attention_layer(
-          array_ops.concat([cell_output, context], 1))
+    if self._is_multi:
+      previous_alignments = state.alignments
+      previous_alignment_history = state.alignment_history
     else:
-      attention = context
-
-    if self._alignment_history:
-      alignment_history = state.alignment_history.write(
-          state.time, alignments)
-    else:
-      alignment_history = ()
-
+      previous_alignments = [state.alignments]
+      previous_alignment_history = [state.alignment_history]
+
+    all_alignments = []
+    all_attentions = []
+    all_histories = []
+    for i, attention_mechanism in enumerate(self._attention_mechanisms):
+      attention, alignments = _compute_attention(
+          attention_mechanism, cell_output, previous_alignments[i],
+          self._attention_layers[i] if self._attention_layers else None)
+      alignment_history = previous_alignment_history[i].write(
+          state.time, alignments) if self._alignment_history else ()
+
+      all_alignments.append(alignments)
+      all_histories.append(alignment_history)
+      all_attentions.append(attention)
+
+    attention = array_ops.concat(all_attentions, 1)
     next_state = AttentionWrapperState(
         time=state.time + 1,
         cell_state=next_cell_state,
         attention=attention,
-        alignments=alignments,
-        alignment_history=alignment_history)
+        alignments=self._item_or_tuple(all_alignments),
+        alignment_history=self._item_or_tuple(all_histories))
 
     if self._output_attention:
       return attention, next_state
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index 1d1babda16334c80e2182693fe6e8a7ac14d8953..ef1735a6c98bc3735a5acf1d85b80dd46bc52cf7 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -486,7 +486,7 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
   total_probs = array_ops.expand_dims(beam_state.log_probs, 2) + step_log_probs
 
   # Calculate the continuation lengths by adding to all continuing beams.
-  vocab_size = logits.shape[-1].value
+  vocab_size = logits.shape[-1].value or array_ops.shape(logits)[-1]
   lengths_to_add = array_ops.one_hot(
       indices=array_ops.tile(
           array_ops.reshape(end_token, [1, 1]), [batch_size, beam_width]),
diff --git a/tensorflow/contrib/seq2seq/python/ops/decoder.py b/tensorflow/contrib/seq2seq/python/ops/decoder.py
index 4795dfb8c91bf83dc8642a9cb760043e75143a5d..fbe53fc60ada85c40970870c6d0bdb93d17ea6d4 100644
--- a/tensorflow/contrib/seq2seq/python/ops/decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/decoder.py
@@ -99,9 +99,9 @@ class Decoder(object):
       name: Name scope for any created operations.
 
     Returns:
-      `(outputs, next_state, next_inputs, finished)`: `outputs` is an instance
-      of BasicDecoderOutput, `next_state` is a (structure of) state tensors and
-      TensorArrays, `next_inputs` is the tensor that should be used as input for
+      `(outputs, next_state, next_inputs, finished)`: `outputs` is an object
+      containing the decoder output, `next_state` is a (structure of) state tensors
+      and TensorArrays, `next_inputs` is the tensor that should be used as input for
       the next step, `finished` is a boolean tensor telling whether the sequence
       is complete, for each sequence in the batch.
     """
diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py
index bee75479357c8f56b3b3a11738139744f6a70d52..9d3f8ad44118c04918fd1f36a1236cdfe6669f58 100644
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@@ -30,8 +30,8 @@ from tensorflow.python.layers import base as layers_base
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops.distributions import bernoulli
 from tensorflow.python.ops.distributions import categorical
@@ -258,14 +258,15 @@ class ScheduledEmbeddingTrainingHelper(TrainingHelper):
     with ops.name_scope(name, "ScheduledEmbeddingTrainingHelperSample",
                         [time, outputs, state]):
       # Return -1s where we did not sample, and sample_ids elsewhere
-      select_sample_noise = random_ops.random_uniform(
-          [self.batch_size], seed=self._scheduling_seed)
-      select_sample = (self._sampling_probability > select_sample_noise)
+      select_sampler = bernoulli.Bernoulli(
+          probs=self._sampling_probability, dtype=dtypes.bool)
+      select_sample = select_sampler.sample(
+          sample_shape=self.batch_size, seed=self._scheduling_seed)
       sample_id_sampler = categorical.Categorical(logits=outputs)
       return array_ops.where(
           select_sample,
           sample_id_sampler.sample(seed=self._seed),
-          array_ops.tile([-1], [self.batch_size]))
+          gen_array_ops.fill([self.batch_size], -1))
 
   def next_inputs(self, time, outputs, state, sample_ids, name=None):
     with ops.name_scope(name, "ScheduledEmbeddingTrainingHelperSample",
@@ -284,11 +285,9 @@ class ScheduledEmbeddingTrainingHelper(TrainingHelper):
             array_ops.where(sample_ids > -1), dtypes.int32)
         where_not_sampling = math_ops.cast(
             array_ops.where(sample_ids <= -1), dtypes.int32)
-        where_sampling_flat = array_ops.reshape(where_sampling, [-1])
-        where_not_sampling_flat = array_ops.reshape(where_not_sampling, [-1])
-        sample_ids_sampling = array_ops.gather(sample_ids, where_sampling_flat)
-        inputs_not_sampling = array_ops.gather(
-            base_next_inputs, where_not_sampling_flat)
+        sample_ids_sampling = array_ops.gather_nd(sample_ids, where_sampling)
+        inputs_not_sampling = array_ops.gather_nd(
+            base_next_inputs, where_not_sampling)
         sampled_next_inputs = self._embedding_fn(sample_ids_sampling)
         base_shape = array_ops.shape(base_next_inputs)
         return (array_ops.scatter_nd(indices=where_sampling,
@@ -382,9 +381,7 @@ class ScheduledOutputTrainingHelper(TrainingHelper):
     with ops.name_scope(name, "ScheduledOutputTrainingHelperSample",
                         [time, outputs, state]):
       sampler = bernoulli.Bernoulli(probs=self._sampling_probability)
-      return math_ops.cast(
-          sampler.sample(sample_shape=self.batch_size, seed=self._seed),
-          dtypes.bool)
+      return sampler.sample(sample_shape=self.batch_size, seed=self._seed)
 
   def next_inputs(self, time, outputs, state, sample_ids, name=None):
     with ops.name_scope(name, "ScheduledOutputTrainingHelperNextInputs",
@@ -396,6 +393,7 @@ class ScheduledOutputTrainingHelper(TrainingHelper):
               state=state,
               sample_ids=sample_ids,
               name=name))
+      sample_ids = math_ops.cast(sample_ids, dtypes.bool)
 
       def maybe_sample():
         """Perform scheduled sampling."""
diff --git a/tensorflow/contrib/session_bundle/BUILD b/tensorflow/contrib/session_bundle/BUILD
index fc34d82f25663d30b7617c3b010e0b5573af854b..6cd9aee04950eec3fbce7dea208869ac677e3778 100644
--- a/tensorflow/contrib/session_bundle/BUILD
+++ b/tensorflow/contrib/session_bundle/BUILD
@@ -63,6 +63,7 @@ py_library(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
         "//tensorflow/python:framework",
+        "//tensorflow/python:platform",
         "//tensorflow/python/saved_model:constants",
         "//tensorflow/python/saved_model:loader",
         "//tensorflow/python/saved_model:signature_constants",
@@ -83,36 +84,37 @@ py_test(
     deps = [
         ":bundle_shim_py",
         ":constants",
+        ":manifest_proto_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:util",
-        "//tensorflow/python/saved_model:constants",
         "//tensorflow/python/saved_model:signature_constants",
         "//tensorflow/python/saved_model:tag_constants",
     ],
 )
 
-# DEPRECATED: Use SavedModel instead.
+# DEPRECATED: No longer supported. Switch to SavedModel immediately.
 py_library(
     name = "constants",
     srcs = ["constants.py"],
-    deprecation = "Use SavedModel constants instead.",
+    deprecation = "No longer supported. Switch to SavedModel immediately.",
     srcs_version = "PY2AND3",
 )
 
-# DEPRECATED: Use SavedModel instead.
+# DEPRECATED: No longer supported. Switch to SavedModel immediately.
 py_library(
     name = "exporter",
     srcs = ["exporter.py"],
-    deprecation = "Use SavedModel Builder instead.",
+    deprecation = "No longer supported. Switch to SavedModel immediately.",
     srcs_version = "PY2AND3",
     deps = [
         ":constants",
         ":gc",
         ":manifest_proto_py",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:platform",
@@ -125,9 +127,7 @@ py_library(
 py_test(
     name = "exporter_test",
     size = "small",
-    srcs = [
-        "exporter_test.py",
-    ],
+    srcs = ["exporter_test.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:private"],
     deps = [
@@ -148,24 +148,23 @@ py_test(
     ],
 )
 
-# DEPRECATED: Use SavedModel instead.
+# DEPRECATED: No longer supported. Switch to SavedModel immediately.
 py_library(
     name = "gc",
     srcs = ["gc.py"],
-    deprecation = "Use SavedModel instead.",
+    deprecation = "No longer supported. Switch to SavedModel immediately.",
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:framework",
         "//tensorflow/python:platform",
+        "//tensorflow/python:util",
     ],
 )
 
 py_test(
     name = "gc_test",
     size = "small",
-    srcs = [
-        "gc_test.py",
-    ],
+    srcs = ["gc_test.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:private"],
     deps = [
@@ -192,19 +191,23 @@ filegroup(
     ],
 )
 
-# DEPRECATED: Use SavedModel instead.
+# DEPRECATED: No longer supported. Switch to SavedModel immediately.
 cc_library(
     name = "session_bundle",
     hdrs = ["session_bundle.h"],
-    deprecation = "Use SavedModel Loader instead.",
+    deprecation = "No longer supported. Switch to SavedModel immediately.",
     visibility = ["//visibility:public"],
     deps = [
+        ":manifest_proto_cc",
         ":session_bundle_lite",
         ":signature",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
 
-# DEPRECATED: Use SavedModel instead.
+# DEPRECATED: No longer supported. Switch to SavedModel immediately.
 # This is a lite version of the session_bundle target that does not link in any
 # Tensorflow ops in order to minimize its size. Clients using this should link
 # any required ops manually.
@@ -213,10 +216,11 @@ cc_library(
     srcs = ["session_bundle.cc"],
     hdrs = ["session_bundle.h"],
     copts = if_ios(["-DGOOGLE_LOGGING"]),
-    deprecation = "Use SavedModel Loader instead.",
+    deprecation = "No longer supported. Switch to SavedModel immediately.",
     visibility = ["//visibility:public"],
     deps = [
         ":signature_lite",
+        "//tensorflow/core:lib_internal",
     ] + if_not_mobile([
         ":manifest_proto_cc",
         "//tensorflow/core:core_cpu",
@@ -238,21 +242,23 @@ cc_test(
     visibility = ["//visibility:private"],
     deps = [
         ":session_bundle",
+        ":signature",
         ":test_util",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
     ],
 )
 
-# DEPRECATED: Use SavedModel instead.
+# DEPRECATED: No longer supported. Switch to SavedModel immediately.
 py_library(
     name = "session_bundle_py",
     srcs = ["session_bundle.py"],
-    deprecation = "Use SavedModel Loader instead.",
+    deprecation = "No longer supported. Switch to SavedModel immediately.",
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
@@ -263,6 +269,7 @@ py_library(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:lib",
         "//tensorflow/python:training",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -281,8 +288,8 @@ py_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:graph_util",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:training",
@@ -292,7 +299,7 @@ py_test(
     ],
 )
 
-# DEPRECATED: Use SavedModel instead.
+# DEPRECATED: No longer supported. Switch to SavedModel immediately.
 # This is a lite version of the signature target that does not link in any
 # Tensorflow ops in order to minimize its size. Clients using this should
 # link any required ops manually.
@@ -300,7 +307,7 @@ cc_library(
     name = "signature_lite",
     srcs = ["signature.cc"],
     hdrs = ["signature.h"],
-    deprecation = "Use SavedModel instead.",
+    deprecation = "No longer supported. Switch to SavedModel immediately.",
     visibility = ["//visibility:public"],
     deps = if_not_mobile([
         ":manifest_proto_cc",
@@ -311,14 +318,19 @@ cc_library(
     ]),
 )
 
-# DEPRECATED: Use SavedModel instead.
+# DEPRECATED: No longer supported. Switch to SavedModel immediately.
 cc_library(
     name = "signature",
     hdrs = ["signature.h"],
-    deprecation = "Use SavedModel instead.",
+    deprecation = "No longer supported. Switch to SavedModel immediately.",
     visibility = ["//visibility:public"],
     deps = [
+        ":manifest_proto_cc",
         ":signature_lite",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
     ] + if_not_mobile([
         "//tensorflow/core:tensorflow_opensource",
     ]),
@@ -343,13 +355,13 @@ cc_test(
     ],
 )
 
-# DEPRECATED: Use SavedModel instead.
+# DEPRECATED: No longer supported. Switch to SavedModel immediately.
 cc_library(
     name = "test_util",
     testonly = 1,
     srcs = ["test_util.cc"],
     hdrs = ["test_util.h"],
-    deprecation = "Use SavedModel instead.",
+    deprecation = "No longer supported. Switch to SavedModel immediately.",
     visibility = ["//visibility:private"],
     deps = [
         "//tensorflow/core:lib",
@@ -398,13 +410,14 @@ cc_test(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
     ],
 )
 
-# DEPRECATED: Use SavedModel instead.
+# DEPRECATED: No longer supported. Switch to SavedModel immediately.
 tf_proto_library(
     name = "manifest_proto",
     srcs = ["manifest.proto"],
diff --git a/tensorflow/contrib/session_bundle/README.md b/tensorflow/contrib/session_bundle/README.md
index 5bcc8fab70f8f492f687fa37b022ee324429f530..e3a87a0eb90d80bd5ed58c59914bac2aea4e3f42 100644
--- a/tensorflow/contrib/session_bundle/README.md
+++ b/tensorflow/contrib/session_bundle/README.md
@@ -1,7 +1,8 @@
 # TensorFlow Inference Model Format
 
-WARNING: SessionBundle has been deprecated. Please use
-[SavedModel](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md) instead.
+WARNING: SessionBundle has been deprecated and is no longer supported. Switch to
+[SavedModel](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md)
+immediately.
 
 [TOC]
 
diff --git a/tensorflow/contrib/session_bundle/exporter.py b/tensorflow/contrib/session_bundle/exporter.py
index efeb808ee7ca11d943b1c3bde5cbf7c0880c98c7..dcc7fbaa2d61a48eddb5d8495360cb5559c99f4c 100644
--- a/tensorflow/contrib/session_bundle/exporter.py
+++ b/tensorflow/contrib/session_bundle/exporter.py
@@ -41,7 +41,8 @@ from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated
 
 
-@deprecated("2017-06-30", "Please use SavedModel instead.")
+@deprecated("2017-06-30",
+            "No longer supported. Switch to SavedModel immediately.")
 def gfile_copy_callback(files_to_copy, export_dir_path):
   """Callback to copy files using `gfile.Copy` to an export directory.
 
@@ -71,7 +72,8 @@ def gfile_copy_callback(files_to_copy, export_dir_path):
     gfile.Copy(source_filepath, new_path)
 
 
-@deprecated("2017-06-30", "Please use SavedModel instead.")
+@deprecated("2017-06-30",
+            "No longer supported. Switch to SavedModel immediately.")
 def regression_signature(input_tensor, output_tensor):
   """Creates a regression signature.
 
@@ -88,7 +90,8 @@ def regression_signature(input_tensor, output_tensor):
   return signature
 
 
-@deprecated("2017-06-30", "Please use SavedModel instead.")
+@deprecated("2017-06-30",
+            "No longer supported. Switch to SavedModel immediately.")
 def classification_signature(input_tensor,
                              classes_tensor=None,
                              scores_tensor=None):
@@ -111,7 +114,8 @@ def classification_signature(input_tensor,
   return signature
 
 
-@deprecated("2017-06-30", "Please use SavedModel instead.")
+@deprecated("2017-06-30",
+            "No longer supported. Switch to SavedModel immediately.")
 def generic_signature(name_tensor_map):
   """Creates a generic signature of name to Tensor name.
 
@@ -145,7 +149,8 @@ class Exporter(object):
     self._has_init = False
     self._assets_to_copy = {}
 
-  @deprecated("2017-06-30", "Please use SavedModel instead.")
+  @deprecated("2017-06-30",
+              "No longer supported. Switch to SavedModel immediately.")
   def init(self,
            graph_def=None,
            init_op=None,
@@ -227,7 +232,8 @@ class Exporter(object):
 
     self._assets_callback = assets_callback
 
-  @deprecated("2017-06-30", "Please use SavedModel instead.")
+  @deprecated("2017-06-30",
+              "No longer supported. Switch to SavedModel immediately.")
   def export(self,
              export_dir_base,
              global_step_tensor,
diff --git a/tensorflow/contrib/session_bundle/gc.py b/tensorflow/contrib/session_bundle/gc.py
index 885b888e27ce01d04c3ee8edef3657f0acd37de8..249c23c88f3043403e322b73b6c9df97e932a92a 100644
--- a/tensorflow/contrib/session_bundle/gc.py
+++ b/tensorflow/contrib/session_bundle/gc.py
@@ -46,14 +46,14 @@ For example,
   path_list = gc.get_paths("/tmp", parser)  # contains all ten Paths
 
   every_fifth = gc.mod_export_version(5)
-  print every_fifth(path_list) # shows ["/tmp/0", "/tmp/5"]
+  print(every_fifth(path_list))  # shows ["/tmp/0", "/tmp/5"]
 
   largest_three = gc.largest_export_versions(3)
-  print largest_three(all_paths)  # shows ["/tmp/7", "/tmp/8", "/tmp/9"]
+  print(largest_three(all_paths))  # shows ["/tmp/7", "/tmp/8", "/tmp/9"]
 
   both = gc.union(every_fifth, largest_three)
-  print both(all_paths)  # shows ["/tmp/0", "/tmp/5",
-                         #        "/tmp/7", "/tmp/8", "/tmp/9"]
+  print(both(all_paths))  # shows ["/tmp/0", "/tmp/5",
+                          #        "/tmp/7", "/tmp/8", "/tmp/9"]
   # delete everything not in 'both'
   to_delete = gc.negation(both)
   for p in to_delete(all_paths):
@@ -77,7 +77,8 @@ from tensorflow.python.util.deprecation import deprecated
 Path = collections.namedtuple('Path', 'path export_version')
 
 
-@deprecated('2017-06-30', 'Please use SavedModel instead.')
+@deprecated('2017-06-30',
+            'No longer supported. Switch to SavedModel immediately.')
 def largest_export_versions(n):
   """Creates a filter that keeps the largest n export versions.
 
@@ -98,7 +99,8 @@ def largest_export_versions(n):
   return keep
 
 
-@deprecated('2017-06-30', 'Please use SavedModel instead.')
+@deprecated('2017-06-30',
+            'No longer supported. Switch to SavedModel immediately.')
 def one_of_every_n_export_versions(n):
   r"""Creates a filter that keeps one of every n export versions.
 
@@ -128,7 +130,8 @@ def one_of_every_n_export_versions(n):
   return keep
 
 
-@deprecated('2017-06-30', 'Please use SavedModel instead.')
+@deprecated('2017-06-30',
+            'No longer supported. Switch to SavedModel immediately.')
 def mod_export_version(n):
   """Creates a filter that keeps every export that is a multiple of n.
 
@@ -147,7 +150,8 @@ def mod_export_version(n):
   return keep
 
 
-@deprecated('2017-06-30', 'Please use SavedModel instead.')
+@deprecated('2017-06-30',
+            'No longer supported. Switch to SavedModel immediately.')
 def union(lf, rf):
   """Creates a filter that keeps the union of two filters.
 
@@ -165,7 +169,8 @@ def union(lf, rf):
   return keep
 
 
-@deprecated('2017-06-30', 'Please use SavedModel instead.')
+@deprecated('2017-06-30',
+            'No longer supported. Switch to SavedModel immediately.')
 def negation(f):
   """Negate a filter.
 
@@ -182,7 +187,8 @@ def negation(f):
   return keep
 
 
-@deprecated('2017-06-30', 'Please use SavedModel instead.')
+@deprecated('2017-06-30',
+            'No longer supported. Switch to SavedModel immediately.')
 def get_paths(base_dir, parser):
   """Gets a list of Paths in a given directory.
 
diff --git a/tensorflow/contrib/session_bundle/session_bundle.cc b/tensorflow/contrib/session_bundle/session_bundle.cc
index bc6fdcd4de9c1f4e989a56a9fb64083a4af474d1..cf26e3cae7e9247e387ee8294c4c0d5de8781d39 100644
--- a/tensorflow/contrib/session_bundle/session_bundle.cc
+++ b/tensorflow/contrib/session_bundle/session_bundle.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "google/protobuf/any.pb.h"
 #include "tensorflow/contrib/session_bundle/manifest.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -163,6 +164,13 @@ Status LoadSessionBundleFromPathUsingRunOptionsInternal(
   TF_RETURN_IF_ERROR(
       GetMetaGraphDefFromExport(export_dir, &(bundle->meta_graph_def)));
 
+  // Deprecated SessionBundle models may fail to load because newly added
+  // attributes are not added to the Graph in the default Session initialization
+  // flow. Add an explicit call here when first loading the graph from disk.
+  TF_RETURN_IF_ERROR(
+      AddDefaultAttrsToGraphDef(bundle->meta_graph_def.mutable_graph_def(),
+                                *OpRegistry::Global(), 0 /* node_offset */));
+
   const auto& collection_def_map = bundle->meta_graph_def.collection_def();
   const auto graph_it = bundle->meta_graph_def.collection_def().find(kGraphKey);
   if (graph_it != collection_def_map.end()) {
diff --git a/tensorflow/contrib/session_bundle/session_bundle.py b/tensorflow/contrib/session_bundle/session_bundle.py
index 37407f90420776ef12693a974d3ce7a24fa78c0c..66f2e32f58ea5c17a1225e0c77a6d7db6d22edd4 100644
--- a/tensorflow/contrib/session_bundle/session_bundle.py
+++ b/tensorflow/contrib/session_bundle/session_bundle.py
@@ -34,7 +34,8 @@ from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.util.deprecation import deprecated
 
 
-@deprecated("2017-06-30", "Please use SavedModel instead.")
+@deprecated("2017-06-30",
+            "No longer supported. Switch to SavedModel immediately.")
 def maybe_session_bundle_dir(export_dir):
   """Checks if the model path contains session bundle model.
 
@@ -50,7 +51,8 @@ def maybe_session_bundle_dir(export_dir):
   return file_io.file_exists(meta_graph_filename)
 
 
-@deprecated("2017-06-30", "Please use SavedModel instead.")
+@deprecated("2017-06-30",
+            "No longer supported. Switch to SavedModel immediately.")
 def load_session_bundle_from_path(export_dir,
                                   target="",
                                   config=None,
diff --git a/tensorflow/contrib/signal/BUILD b/tensorflow/contrib/signal/BUILD
index 5b65a6ae05ed98eb0ac5218c804eca37ea4743e6..52813b76fb2c5b6006d11ac07aa5d5237a02dd8e 100644
--- a/tensorflow/contrib/signal/BUILD
+++ b/tensorflow/contrib/signal/BUILD
@@ -12,19 +12,76 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:spectral_ops",
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
+    ],
+)
+
+cuda_py_tests(
+    name = "reconstruction_ops_test",
+    srcs = ["python/kernel_tests/reconstruction_ops_test.py"],
+    additional_deps = [
+        ":signal_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
     ],
 )
 
 cuda_py_tests(
     name = "shape_ops_test",
-    size = "small",
     srcs = ["python/kernel_tests/shape_ops_test.py"],
     additional_deps = [
         ":signal_py",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_tests(
+    name = "spectral_ops_test",
+    size = "large",
+    srcs = ["python/kernel_tests/spectral_ops_test.py"],
+    additional_deps = [
+        ":signal_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:spectral_ops_test_util",
+    ],
+)
+
+cuda_py_tests(
+    name = "window_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/window_ops_test.py"],
+    additional_deps = [
+        ":signal_py",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
diff --git a/tensorflow/contrib/signal/__init__.py b/tensorflow/contrib/signal/__init__.py
index 9f906dd28e8dc9130d87f4cd4a126e033fa66293..6cc51d6fb0d84dd6894a64a6d3b84104105696d9 100644
--- a/tensorflow/contrib/signal/__init__.py
+++ b/tensorflow/contrib/signal/__init__.py
@@ -12,16 +12,33 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""##Signal ops.
+"""Signal processing operations.
 
-@@frames
+@@frame
+@@hamming_window
+@@hann_window
+@@inverse_stft
+@@overlap_and_add
+@@stft
+
+[hamming]: https://en.wikipedia.org/wiki/Window_function#Hamming_window
+[hann]: https://en.wikipedia.org/wiki/Window_function#Hann_window
+[stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.signal.python.ops.shape_ops import frames
+from tensorflow.contrib.signal.python.ops.reconstruction_ops import overlap_and_add
+from tensorflow.contrib.signal.python.ops.shape_ops import frame
+# `frame` used to be named `frames`, which is a noun and not a verb.
+# Keep an alias to `frames` for backwards compatibility.
+from tensorflow.contrib.signal.python.ops.shape_ops import frame as frames
+from tensorflow.contrib.signal.python.ops.spectral_ops import inverse_stft
+from tensorflow.contrib.signal.python.ops.spectral_ops import stft
+from tensorflow.contrib.signal.python.ops.window_ops import hamming_window
+from tensorflow.contrib.signal.python.ops.window_ops import hann_window
 
 from tensorflow.python.util.all_util import remove_undocumented
 remove_undocumented(__name__)
diff --git a/tensorflow/contrib/signal/python/kernel_tests/reconstruction_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/reconstruction_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c9b2ac51811b02d7519f796d5bff340b35863ec
--- /dev/null
+++ b/tensorflow/contrib/signal/python/kernel_tests/reconstruction_ops_test.py
@@ -0,0 +1,192 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for reconstruction_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.signal.python.ops import reconstruction_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class ReconstructionOpsTest(test.TestCase):
+
+  def __init__(self, *args, **kwargs):
+    super(ReconstructionOpsTest, self).__init__(*args, **kwargs)
+    self.batch_size = 3
+    self.frames = 3
+    self.samples = 5
+
+    self.bases = np.array(range(2, 5))
+    exponents = np.array(range(self.frames * self.samples))
+    powers = np.power(self.bases[:, np.newaxis], exponents[np.newaxis, :])
+
+    self.powers = np.reshape(powers, [self.batch_size, self.frames,
+                                      self.samples])
+    self.frame_hop = 2
+
+    # Hand computed example using powers of unique numbers: this is easily
+    # verified.
+    self.expected_string = ["1", "10", "100100", "1001000", "10010010000",
+                            "100100000000", "1001000000000", "10000000000000",
+                            "100000000000000"]
+
+  def test_all_ones(self):
+    signal = constant_op.constant(np.ones((3, 5)), dtype=dtypes.int64)
+    reconstruction = reconstruction_ops.overlap_and_add(signal, 2)
+
+    with self.test_session(use_gpu=True) as sess:
+      output = sess.run(reconstruction)
+
+      expected_output = np.array([1, 1, 2, 2, 3, 2, 2, 1, 1])
+
+      self.assertAllClose(output, expected_output)
+
+  def test_simple(self):
+    def make_input(frame_length, num_frames=3):
+      """Generate a tensor of num_frames frames of frame_length."""
+      return np.reshape(np.arange(1, num_frames * frame_length + 1),
+                        (-1, frame_length))
+
+    # List of (signal, expected_result, frame_hop).
+    configurations = [
+        # All hop lengths on a frame length of 2.
+        (make_input(2), [1, 5, 9, 6], 1),
+        (make_input(2), [1, 2, 3, 4, 5, 6], 2),
+
+        # All hop lengths on a frame length of 3.
+        (make_input(3), [1, 6, 15, 14, 9], 1),
+        (make_input(3), [1, 2, 7, 5, 13, 8, 9], 2),
+        (make_input(3), [1, 2, 3, 4, 5, 6, 7, 8, 9], 3),
+
+        # All hop lengths on a frame length of 4.
+        (make_input(4), [1, 7, 18, 21, 19, 12], 1),
+        (make_input(4), [1, 2, 8, 10, 16, 18, 11, 12], 2),
+        (make_input(4), [1, 2, 3, 9, 6, 7, 17, 10, 11, 12], 3),
+        (make_input(4), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 4),
+    ]
+
+    with self.test_session(use_gpu=True):
+      for signal, expected, frame_hop in configurations:
+        reconstruction = reconstruction_ops.overlap_and_add(
+            np.array(signal), frame_hop).eval()
+        expected_output = np.array(expected)
+        self.assertAllClose(reconstruction, expected_output)
+
+  def test_powers(self):
+    signal = constant_op.constant(np.squeeze(self.powers[0, :, :]),
+                                  dtype=dtypes.int64)
+    reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
+
+    with self.test_session(use_gpu=True) as sess:
+      output = sess.run(reconstruction)
+      string_output = [np.base_repr(x, self.bases[0]) for x in output]
+
+      self.assertEqual(string_output, self.expected_string)
+
+  def test_batch(self):
+    signal = constant_op.constant(self.powers, dtype=dtypes.int64)
+    reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
+
+    with self.test_session(use_gpu=True) as sess:
+      output = sess.run(reconstruction)
+
+      accumulator = True
+      for i in range(self.batch_size):
+        string_output = [np.base_repr(x, self.bases[i]) for x in output[i, :]]
+        accumulator = accumulator and (string_output == self.expected_string)
+
+      self.assertTrue(accumulator)
+
+  def test_one_element_batch(self):
+    input_matrix = np.squeeze(self.powers[0, :, :])
+    input_matrix = input_matrix[np.newaxis, :, :].astype(float)
+    signal = constant_op.constant(input_matrix, dtype=dtypes.float32)
+    reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
+
+    with self.test_session(use_gpu=True) as sess:
+      output = sess.run(reconstruction)
+
+      string_output = [np.base_repr(int(x), self.bases[0]) for x in
+                       np.squeeze(output)]
+
+      self.assertEqual(output.shape, (1, 9))
+      self.assertEqual(string_output, self.expected_string)
+
+  def test_gradient(self):
+    configurations = [
+        ((1, 128), 1),
+        ((5, 35), 17),
+        ((10, 128), 128),
+        ((2, 10, 128), 127),
+        ((2, 2, 10, 128), 126),
+        ((2, 2, 2, 10, 128), 125),
+    ]
+
+    for shape, frame_hop in configurations:
+      with self.test_session(use_gpu=True) as sess:
+        signal = array_ops.zeros(shape)
+        reconstruction = reconstruction_ops.overlap_and_add(signal, frame_hop)
+        loss = math_ops.reduce_sum(reconstruction)
+        # Increasing any sample in the input frames by one will increase the sum
+        # of all the samples in the reconstruction by 1, so the gradient should
+        # be all ones, no matter the shape or hop.
+        gradient = sess.run(gradients_impl.gradients([loss], [signal])[0])
+        self.assertTrue((gradient == 1.0).all())
+
+  def test_gradient_batch(self):
+    with self.test_session(use_gpu=True) as sess:
+      signal = array_ops.zeros((2, 10, 10))
+      frame_hop = 10
+      reconstruction = reconstruction_ops.overlap_and_add(signal, frame_hop)
+
+      # Multiply the first batch-item's reconstruction by zeros. This will block
+      # gradient from flowing into the first batch item from the loss. Multiply
+      # the second batch item by the integers from 0 to 99. Since there is zero
+      # overlap, the gradient for this batch item will be 0-99 shaped as (10,
+      # 10).
+      reconstruction *= array_ops.stack(
+          [array_ops.zeros((100,)), math_ops.to_float(math_ops.range(100))])
+      loss = math_ops.reduce_sum(reconstruction)
+
+      # Verify that only the second batch item receives gradient.
+      gradient = sess.run(gradients_impl.gradients([loss], [signal])[0])
+      expected_gradient = np.stack([
+          np.zeros((10, 10)),
+          np.reshape(np.arange(100).astype(np.float32), (10, 10))])
+      self.assertAllEqual(expected_gradient, gradient)
+
+  def test_gradient_numerical(self):
+    with self.test_session(use_gpu=True):
+      shape = (2, 10, 10)
+      framed_signal = array_ops.zeros(shape)
+      frame_hop = 10
+      reconstruction = reconstruction_ops.overlap_and_add(
+          framed_signal, frame_hop)
+      error = test.compute_gradient_error(
+          framed_signal, shape, reconstruction, [2, 100])
+      self.assertLess(error, 2e-5)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
index e07942875fdf3d0266824cf546a2a9dda94b1877..8633ced599f137da08a4181ec9cbf4b48517199d 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
+++ b/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
@@ -24,18 +24,18 @@ from tensorflow.contrib.signal.python.ops import shape_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-class FramesTest(test.TestCase):
+class FrameTest(test.TestCase):
 
   def test_mapping_of_indices_without_padding(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       tensor = constant_op.constant(np.arange(9152), dtypes.int32)
       tensor = array_ops.expand_dims(tensor, 0)
 
-      result = shape_ops.frames(tensor, 512, 180)
-      result = result.eval()
+      result = shape_ops.frame(tensor, 512, 180, pad_end=False).eval()
 
       expected = np.tile(np.arange(512), (49, 1))
       expected += np.tile(np.arange(49) * 180, (512, 1)).T
@@ -46,15 +46,14 @@ class FramesTest(test.TestCase):
       self.assertAllEqual(expected, result)
 
   def test_mapping_of_indices_with_padding(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       tensor = constant_op.constant(np.arange(10000), dtypes.int32)
       tensor = array_ops.expand_dims(tensor, 0)
 
-      result = shape_ops.frames(tensor, 512, 192)
-      result = result.eval()
+      result = shape_ops.frame(tensor, 512, 192, pad_end=True).eval()
 
-      expected = np.tile(np.arange(512), (51, 1))
-      expected += np.tile(np.arange(51) * 192, (512, 1)).T
+      expected = np.tile(np.arange(512), (53, 1))
+      expected += np.tile(np.arange(53) * 192, (512, 1)).T
 
       expected[expected >= 10000] = 0
 
@@ -63,6 +62,277 @@ class FramesTest(test.TestCase):
 
       self.assertAllEqual(expected, result)
 
+  def test_invalid_inputs(self):
+    # Rank 0 input signal.
+    with self.assertRaises(ValueError):
+      shape_ops.frame(1, 1, 1)
+
+    # If the rank is unknown, do not raise an exception.
+    shape_ops.frame(array_ops.placeholder(dtypes.float32), 1, 1)
+
+    # Non-scalar frame_length.
+    with self.assertRaises(ValueError):
+      shape_ops.frame([1], [1], 1)
+
+    # Non-scalar frame_step.
+    with self.assertRaises(ValueError):
+      shape_ops.frame([1], 1, [1])
+
+    # Non-scalar pad_value.
+    with self.assertRaises(ValueError):
+      shape_ops.frame([1], 1, 1, pad_end=True, pad_value=[1])
+
+  def test_length_zero(self):
+    signal = constant_op.constant([], dtype=dtypes.float32)
+    frame_length = 2
+    frame_step = 1
+
+    with self.test_session(use_gpu=True):
+      result = shape_ops.frame(signal, frame_length, frame_step,
+                               pad_end=True, pad_value=99).eval()
+      self.assertEqual((0, 2), result.shape)
+
+      result = shape_ops.frame(signal, frame_length, frame_step,
+                               pad_end=False).eval()
+      self.assertEqual((0, 2), result.shape)
+
+  def test_shape_inference(self):
+    signal = array_ops.placeholder(dtypes.int32, shape=[1, 1])
+    frame_length = 2
+    frame_step = 1
+    # Shape inference is able to detect the rank and inner-most dimension
+    # if frame_length is known at graph definition time.
+    result = shape_ops.frame(signal, frame_length, frame_step,
+                             pad_end=True, pad_value=99)
+    self.assertEqual([1, 1, 2], result.shape.as_list())
+
+    result = shape_ops.frame(signal, frame_length, frame_step,
+                             pad_end=False)
+    self.assertEqual([1, 0, 2], result.shape.as_list())
+
+    # If frame_length is not known, rank and (known) outer and inner dimensions
+    # are inferred.
+    signal = array_ops.placeholder(dtypes.int32, shape=[1, 2, 3, 4])
+    frame_length = array_ops.placeholder(dtypes.int32, shape=[])
+    frame_step = 1
+    result = shape_ops.frame(signal, frame_length, frame_step,
+                             pad_end=True, pad_value=99, axis=1)
+    self.assertEqual([1, None, None, 3, 4], result.shape.as_list())
+
+    result = shape_ops.frame(signal, frame_length, frame_step,
+                             pad_end=False, axis=1)
+    self.assertEqual([1, None, None, 3, 4], result.shape.as_list())
+
+    # If frame_length and inner-most dimension is known, rank, inner dimensions,
+    # and known outer dimensions are inferred.
+    signal = array_ops.placeholder(dtypes.int32,
+                                   shape=[None, 5, None, 20, 5, 3])
+    frame_length = 4
+    frame_step = 3
+    result = shape_ops.frame(signal, frame_length, frame_step,
+                             pad_end=True, pad_value=99, axis=3)
+    self.assertEqual([None, 5, None, 7, 4, 5, 3], result.shape.as_list())
+
+    result = shape_ops.frame(signal, frame_length, frame_step,
+                             pad_end=False, axis=3)
+    self.assertEqual([None, 5, None, 6, 4, 5, 3], result.shape.as_list())
+
+    # Test that shape inference is consistent with actual returned shapes for
+    # small values of signal_length, frame_length, frame_step, and pad_end in
+    # [True, False].
+    frame_step = 1
+    for signal_length in range(2):
+      signal = [0] * signal_length
+      for frame_length in range(2):
+        for pad_end in [False, True]:
+          op = shape_ops.frame(signal, frame_length, frame_step,
+                               pad_end=pad_end, pad_value=99)
+          with self.test_session(use_gpu=True):
+            result = op.eval()
+          self.assertEqual(op.shape.as_list(), list(result.shape))
+
+  def test_basic_mono(self):
+    signal = np.arange(6)
+    frame_length = 3
+    frame_step = 2
+
+    with self.test_session(use_gpu=True):
+      for rank in range(5):
+        nd_signal = np.reshape(signal, (1,) * rank + signal.shape)
+
+        # With padding, we pad the last frame with pad_value.
+        result = shape_ops.frame(nd_signal, frame_length, frame_step,
+                                 pad_end=True, pad_value=99).eval()
+        expected_inner_frames = np.array([[0, 1, 2], [2, 3, 4], [4, 5, 99]])
+        expected = np.reshape(
+            expected_inner_frames, (1,) * rank + expected_inner_frames.shape)
+        self.assertAllEqual(expected, result)
+
+        # Without padding, we drop the last frame.
+        expected_inner_frames = np.array([[0, 1, 2], [2, 3, 4]])
+        expected = np.reshape(
+            expected_inner_frames, (1,) * rank + expected_inner_frames.shape)
+        result = shape_ops.frame(nd_signal, frame_length, frame_step,
+                                 pad_end=False).eval()
+        self.assertAllEqual(expected, result)
+
+  def test_basic_stereo(self):
+    signal = np.vstack([np.arange(6),
+                        np.arange(6) + 10])
+    frame_length = 3
+    frame_step = 2
+
+    with self.test_session(use_gpu=True):
+      for rank in range(5):
+        nd_signal = np.reshape(signal, (1,) * rank + signal.shape)
+
+        # With padding, we pad the last frame with pad_value.
+        result = shape_ops.frame(nd_signal, frame_length, frame_step,
+                                 pad_end=True, pad_value=99).eval()
+        expected_inner_frames = np.array([
+            [[0, 1, 2], [2, 3, 4], [4, 5, 99]],
+            [[10, 11, 12], [12, 13, 14], [14, 15, 99]]])
+        expected = np.reshape(
+            expected_inner_frames, (1,) * rank + expected_inner_frames.shape)
+        self.assertAllEqual(expected, result)
+
+        # Without padding, we drop the last frame.
+        expected_inner_frames = np.array([[[0, 1, 2], [2, 3, 4]],
+                                          [[10, 11, 12], [12, 13, 14]]])
+        expected = np.reshape(
+            expected_inner_frames, (1,) * rank + expected_inner_frames.shape)
+        result = shape_ops.frame(nd_signal, frame_length, frame_step,
+                                 pad_end=False).eval()
+        self.assertAllEqual(expected, result)
+
+  def test_complex_shape(self):
+    signal = np.vstack([np.arange(6),
+                        np.arange(6) + 10,
+                        np.arange(6) + 20,
+                        np.arange(6) + 30,
+                        np.arange(6) + 40,
+                        np.arange(6) + 50])
+    signal = np.reshape(signal, (2, 1, 3, 1, 6))
+    frame_length = 3
+    frame_step = 2
+
+    with self.test_session(use_gpu=True):
+      # With padding, we pad the last frame with pad_value.
+      result = shape_ops.frame(signal, frame_length, frame_step,
+                               pad_end=True, pad_value=99).eval()
+      # Resulting shape is (2, 1, 3, 1, 3, 3).
+      expected = [[[[[[0, 1, 2], [2, 3, 4], [4, 5, 99]]],
+                    [[[10, 11, 12], [12, 13, 14], [14, 15, 99]]],
+                    [[[20, 21, 22], [22, 23, 24], [24, 25, 99]]]]],
+                  [[[[[30, 31, 32], [32, 33, 34], [34, 35, 99]]],
+                    [[[40, 41, 42], [42, 43, 44], [44, 45, 99]]],
+                    [[[50, 51, 52], [52, 53, 54], [54, 55, 99]]]]]]
+      self.assertAllEqual(expected, result)
+
+      result = shape_ops.frame(signal, frame_length, frame_step,
+                               pad_end=False).eval()
+      # Resulting shape is (2, 1, 3, 1, 3, 2).
+      expected = [[[[[[0, 1, 2], [2, 3, 4]]],
+                    [[[10, 11, 12], [12, 13, 14]]],
+                    [[[20, 21, 22], [22, 23, 24]]]]],
+                  [[[[[30, 31, 32], [32, 33, 34]]],
+                    [[[40, 41, 42], [42, 43, 44]]],
+                    [[[50, 51, 52], [52, 53, 54]]]]]]
+      self.assertAllEqual(expected, result)
+
+  def test_axis(self):
+    signal = np.reshape(np.arange(16), (2, 4, 2))
+    with self.test_session(use_gpu=True):
+      result = shape_ops.frame(signal, frame_length=2, frame_step=2,
+                               pad_end=True, axis=1)
+      expected = np.reshape(np.arange(16), (2, 2, 2, 2))
+      self.assertAllEqual(expected, result.eval())
+
+      result = shape_ops.frame(signal, frame_length=2, frame_step=1,
+                               pad_end=True, axis=1)
+      expected = [[[[0, 1], [2, 3]],
+                   [[2, 3], [4, 5]],
+                   [[4, 5], [6, 7]],
+                   [[6, 7], [0, 0]]],
+                  [[[8, 9], [10, 11]],
+                   [[10, 11], [12, 13]],
+                   [[12, 13], [14, 15]],
+                   [[14, 15], [0, 0]]]]
+      self.assertAllEqual(expected, result.eval())
+
+      result = shape_ops.frame(signal, frame_length=3, frame_step=1,
+                               pad_end=True, axis=1)
+      expected = [[[[0, 1], [2, 3], [4, 5]],
+                   [[2, 3], [4, 5], [6, 7]],
+                   [[4, 5], [6, 7], [0, 0]],
+                   [[6, 7], [0, 0], [0, 0]]],
+                  [[[8, 9], [10, 11], [12, 13]],
+                   [[10, 11], [12, 13], [14, 15]],
+                   [[12, 13], [14, 15], [0, 0]],
+                   [[14, 15], [0, 0], [0, 0]]]]
+      self.assertAllEqual(expected, result.eval())
+
+  def test_window_larger_than_signal(self):
+    signal = constant_op.constant([[1, 2], [11, 12]], dtype=dtypes.float32)
+    frame_length = 4
+    frame_step = 1
+
+    with self.test_session(use_gpu=True):
+      result = shape_ops.frame(signal, frame_length, frame_step,
+                               pad_end=True, pad_value=99).eval()
+      self.assertAllClose([[[1, 2, 99, 99], [2, 99, 99, 99]],
+                           [[11, 12, 99, 99], [12, 99, 99, 99]]], result)
+
+      result = shape_ops.frame(signal, frame_length, frame_step,
+                               pad_end=False).eval()
+      self.assertEqual((2, 0, 4), result.shape)
+
+      frame_step = 2
+      result = shape_ops.frame(signal, frame_length, frame_step,
+                               pad_end=True, pad_value=99).eval()
+      self.assertAllClose([[[1, 2, 99, 99]], [[11, 12, 99, 99]]], result)
+
+      result = shape_ops.frame(signal, frame_length, frame_step,
+                               pad_end=False).eval()
+      self.assertEqual((2, 0, 4), result.shape)
+
+  def test_preserves_type(self):
+    signal = math_ops.range(10, dtype=dtypes.float64)
+    frame_length = 2
+    frame_step = 3
+
+    with self.test_session(use_gpu=True):
+      result = shape_ops.frame(signal, frame_length, frame_step)
+      self.assertEqual(result.dtype, signal.dtype)
+
+  def test_dynamic_tensor(self):
+    # Show that frame works even when the dimensions of its input are
+    # not known at graph creation time.
+    input_signal = np.vstack([np.arange(4), np.arange(4) + 10,
+                              np.arange(4) + 20])
+    frame_length = 2
+    frame_step = 2
+
+    with self.test_session(use_gpu=True) as sess:
+      signal_placeholder = array_ops.placeholder(shape=(None, None),
+                                                 dtype=dtypes.float32)
+      result = sess.run(shape_ops.frame(
+          signal_placeholder, frame_length, frame_step),
+                        feed_dict={signal_placeholder: input_signal})
+      self.assertAllEqual([[[0, 1], [2, 3]],
+                           [[10, 11], [12, 13]],
+                           [[20, 21], [22, 23]]], result)
+
+  def test_gradient_numerical(self):
+    with self.test_session(use_gpu=True):
+      signal_shape = (2, 128)
+      signal = array_ops.ones(signal_shape)
+      frame_length = 33
+      frame_step = 9
+      frames = shape_ops.frame(signal, frame_length, frame_step)
+      error = test.compute_gradient_error(
+          signal, signal_shape, frames, frames.shape.as_list())
+      self.assertLess(error, 2e-5)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..61b7107a172ee791cfbd6bfc84568722cf1ac0a2
--- /dev/null
+++ b/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py
@@ -0,0 +1,249 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for spectral_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.signal.python.ops import spectral_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import spectral_ops_test_util
+from tensorflow.python.platform import test
+
+
+class SpectralOpsTest(test.TestCase):
+
+  @staticmethod
+  def _np_hann_periodic_window(length):
+    if length == 1:
+      return np.ones(1)
+    odd = length % 2
+    if not odd:
+      length += 1
+    window = 0.5 - 0.5 * np.cos(2.0 * np.pi * np.arange(length) / (length - 1))
+    if not odd:
+      window = window[:-1]
+    return window
+
+  @staticmethod
+  def _np_frame(data, window_length, hop_length):
+    num_frames = 1 + int(np.floor((len(data) - window_length) // hop_length))
+    shape = (num_frames, window_length)
+    strides = (data.strides[0] * hop_length, data.strides[0])
+    return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)
+
+  @staticmethod
+  def _np_stft(data, fft_length, hop_length, window_length):
+    frames = SpectralOpsTest._np_frame(data, window_length, hop_length)
+    window = SpectralOpsTest._np_hann_periodic_window(window_length)
+    return np.fft.rfft(frames * window, fft_length)
+
+  @staticmethod
+  def _np_inverse_stft(stft, fft_length, hop_length, window_length):
+    frames = np.fft.irfft(stft, fft_length)[..., :window_length]
+    window = SpectralOpsTest._np_hann_periodic_window(window_length)
+    return SpectralOpsTest._np_overlap_add(frames * window, hop_length)
+
+  @staticmethod
+  def _np_overlap_add(stft, hop_length):
+    num_frames, window_length = np.shape(stft)
+    # Output length will be one complete window, plus another hop_length's
+    # worth of points for each additional window.
+    output_length = window_length + (num_frames - 1) * hop_length
+    output = np.zeros(output_length)
+    for i in range(num_frames):
+      output[i * hop_length:i * hop_length + window_length] += stft[i,]
+    return output
+
+  def _compare(self, signal, frame_length, frame_step, fft_length):
+    with spectral_ops_test_util.fft_kernel_label_map(), (
+        self.test_session(use_gpu=True)) as sess:
+      actual_stft = spectral_ops.stft(
+          signal, frame_length, frame_step, fft_length, pad_end=False)
+
+      actual_inverse_stft = spectral_ops.inverse_stft(
+          actual_stft, frame_length, frame_step, fft_length)
+
+      actual_stft, actual_inverse_stft = sess.run(
+          [actual_stft, actual_inverse_stft])
+
+      expected_stft = SpectralOpsTest._np_stft(
+          signal, fft_length, frame_step, frame_length)
+      self.assertAllClose(expected_stft, actual_stft, 1e-4, 1e-4)
+
+      expected_inverse_stft = SpectralOpsTest._np_inverse_stft(
+          expected_stft, fft_length, frame_step, frame_length)
+      self.assertAllClose(
+          expected_inverse_stft, actual_inverse_stft, 1e-4, 1e-4)
+
+  def _compare_round_trip(self, signal, frame_length, frame_step, fft_length):
+    with spectral_ops_test_util.fft_kernel_label_map(), (
+        self.test_session(use_gpu=True)) as sess:
+      stft = spectral_ops.stft(signal, frame_length, frame_step, fft_length,
+                               pad_end=False)
+      inverse_stft = spectral_ops.inverse_stft(stft, frame_length, frame_step,
+                                               fft_length)
+      signal, inverse_stft = sess.run([signal, inverse_stft])
+
+      # Since the shapes can differ due to padding, pad both signals to the max
+      # of their lengths.
+      max_length = max(signal.shape[0], inverse_stft.shape[0])
+      signal = np.pad(signal, (0, max_length - signal.shape[0]), "constant")
+      inverse_stft = np.pad(inverse_stft,
+                            (0, max_length - inverse_stft.shape[0]), "constant")
+
+      # Ignore the frame_length samples at either edge.
+      start = frame_length
+      end = signal.shape[0] - frame_length
+      ratio = signal[start:end] / inverse_stft[start:end]
+
+      # Check that the inverse and original signal are equal up to a constant
+      # factor.
+      self.assertLess(np.var(ratio), 2e-5)
+
+  def test_shapes(self):
+    with spectral_ops_test_util.fft_kernel_label_map(), (
+        self.test_session(use_gpu=True)):
+      signal = np.zeros((512,)).astype(np.float32)
+
+      # If fft_length is not provided, the smallest enclosing power of 2 of
+      # frame_length (8) is used.
+      stft = spectral_ops.stft(signal, frame_length=7, frame_step=8,
+                               pad_end=True)
+      self.assertAllEqual([64, 5], stft.shape.as_list())
+      self.assertAllEqual([64, 5], stft.eval().shape)
+
+      stft = spectral_ops.stft(signal, frame_length=8, frame_step=8,
+                               pad_end=True)
+      self.assertAllEqual([64, 5], stft.shape.as_list())
+      self.assertAllEqual([64, 5], stft.eval().shape)
+
+      stft = spectral_ops.stft(signal, frame_length=8, frame_step=8,
+                               fft_length=16, pad_end=True)
+      self.assertAllEqual([64, 9], stft.shape.as_list())
+      self.assertAllEqual([64, 9], stft.eval().shape)
+
+      stft = np.zeros((32, 9)).astype(np.complex64)
+
+      inverse_stft = spectral_ops.inverse_stft(stft, frame_length=8,
+                                               fft_length=16, frame_step=8)
+      expected_length = (stft.shape[0] - 1) * 8 + 8
+      self.assertAllEqual([None], inverse_stft.shape.as_list())
+      self.assertAllEqual([expected_length], inverse_stft.eval().shape)
+
+  def test_stft_and_inverse_stft(self):
+    """Test that spectral_ops.stft/inverse_stft match a NumPy implementation."""
+    # Tuples of (signal_length, frame_length, frame_step, fft_length).
+    test_configs = [
+        (512, 64, 32, 64),
+        (512, 64, 64, 64),
+        (512, 64, 25, 64),
+        (512, 25, 15, 36),
+        (123, 23, 5, 42),
+    ]
+
+    for signal_length, frame_length, frame_step, fft_length in test_configs:
+      signal = np.random.random(signal_length).astype(np.float32)
+      self._compare(signal, frame_length, frame_step, fft_length)
+
+  def test_stft_round_trip(self):
+    # Tuples of (signal_length, frame_length, frame_step, fft_length).
+    test_configs = [
+        # 87.5% overlap.
+        (4096, 256, 32, 256),
+        # 75% overlap.
+        (4096, 256, 64, 256),
+        # Odd frame hop.
+        (4096, 128, 25, 128),
+        # Odd frame length.
+        (4096, 127, 32, 128),
+    ]
+
+    for signal_length, frame_length, frame_step, fft_length in test_configs:
+      # Generate a 440Hz signal at 8kHz sample rate.
+      signal = math_ops.sin(2 * np.pi * 440 / 8000 *
+                            math_ops.to_float(math_ops.range(signal_length)))
+      self._compare_round_trip(signal, frame_length, frame_step, fft_length)
+
+  @staticmethod
+  def _compute_stft_gradient(signal, frame_length=32, frame_step=16,
+                             fft_length=32):
+    """Computes the gradient of the STFT with respect to `signal`."""
+    stft = spectral_ops.stft(signal, frame_length, frame_step, fft_length)
+    magnitude_stft = math_ops.abs(stft)
+    loss = math_ops.reduce_sum(magnitude_stft)
+    return gradients_impl.gradients([loss], [signal])[0]
+
+  def test_gradients(self):
+    """Test that spectral_ops.stft has a working gradient."""
+    with spectral_ops_test_util.fft_kernel_label_map(), (
+        self.test_session(use_gpu=True)) as sess:
+      signal_length = 512
+
+      # An all-zero signal has all zero gradients with respect to the sum of the
+      # magnitude STFT.
+      empty_signal = array_ops.zeros([signal_length], dtype=dtypes.float32)
+      empty_signal_gradient = sess.run(
+          self._compute_stft_gradient(empty_signal))
+      self.assertTrue((empty_signal_gradient == 0.0).all())
+
+      # A sinusoid will have non-zero components of its gradient with respect to
+      # the sum of the magnitude STFT.
+      sinusoid = math_ops.sin(
+          2 * np.pi * math_ops.linspace(0.0, 1.0, signal_length))
+      sinusoid_gradient = sess.run(self._compute_stft_gradient(sinusoid))
+      self.assertFalse((sinusoid_gradient == 0.0).all())
+
+  def test_gradients_numerical(self):
+    with spectral_ops_test_util.fft_kernel_label_map(), (
+        self.test_session(use_gpu=True)):
+      # Tuples of (signal_length, frame_length, frame_step, fft_length,
+      # stft_bound, inverse_stft_bound).
+      # TODO(rjryan): Investigate why STFT gradient error is so high.
+      test_configs = [
+          (64, 16, 8, 16),
+          (64, 16, 16, 16),
+          (64, 16, 7, 16),
+          (64, 7, 4, 9),
+          (29, 5, 1, 10),
+      ]
+
+      for (signal_length, frame_length, frame_step, fft_length) in test_configs:
+        signal_shape = [signal_length]
+        signal = random_ops.random_uniform(signal_shape)
+        stft_shape = [max(0, 1 + (signal_length - frame_length) // frame_step),
+                      fft_length // 2 + 1]
+        stft = spectral_ops.stft(signal, frame_length, frame_step, fft_length,
+                                 pad_end=False)
+        inverse_stft_shape = [(stft_shape[0] - 1) * frame_step + frame_length]
+        inverse_stft = spectral_ops.inverse_stft(stft, frame_length, frame_step,
+                                                 fft_length)
+        stft_error = test.compute_gradient_error(signal, [signal_length],
+                                                 stft, stft_shape)
+        inverse_stft_error = test.compute_gradient_error(
+            stft, stft_shape, inverse_stft, inverse_stft_shape)
+        self.assertLess(stft_error, 2e-3)
+        self.assertLess(inverse_stft_error, 4e-5)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/signal/python/kernel_tests/window_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/window_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3e0464596244b331906dab47cee349c1ea737b5
--- /dev/null
+++ b/tensorflow/contrib/signal/python/kernel_tests/window_ops_test.py
@@ -0,0 +1,96 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for window_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+import numpy as np
+
+from tensorflow.contrib.signal.python.ops import window_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import test
+
+
+def _scipy_raised_cosine(length, symmetric=True, a=0.5, b=0.5):
+  """A simple implementation of a raised cosine window that matches SciPy.
+
+  https://en.wikipedia.org/wiki/Window_function#Hann_window
+  https://github.com/scipy/scipy/blob/v0.14.0/scipy/signal/windows.py#L615
+
+  Args:
+    length: The window length.
+    symmetric: Whether to create a symmetric window.
+    a: The alpha parameter of the raised cosine window.
+    b: The beta parameter of the raised cosine window.
+
+  Returns:
+    A raised cosine window of length `length`.
+  """
+  if length == 1:
+    return np.ones(1)
+  odd = length % 2
+  if not symmetric and not odd:
+    length += 1
+  window = a - b * np.cos(2.0 * np.pi * np.arange(length) / (length - 1))
+  if not symmetric and not odd:
+    window = window[:-1]
+  return window
+
+
+class WindowOpsTest(test.TestCase):
+
+  def setUp(self):
+    self._window_lengths = [1, 2, 3, 4, 5, 31, 64, 128]
+    self._dtypes = [(dtypes.float16, 1e-2),
+                    (dtypes.float32, 1e-6),
+                    (dtypes.float64, 1e-9)]
+
+  def _compare_window_fns(self, np_window_fn, tf_window_fn):
+    with self.test_session(use_gpu=True):
+      for window_length in self._window_lengths:
+        for periodic in [False, True]:
+          for tf_dtype, tol in self._dtypes:
+            np_dtype = tf_dtype.as_numpy_dtype
+            expected = np_window_fn(window_length,
+                                    symmetric=not periodic).astype(np_dtype)
+            actual = tf_window_fn(window_length, periodic=periodic,
+                                  dtype=tf_dtype).eval()
+            self.assertAllClose(expected, actual, tol, tol)
+
+  def test_hann_window(self):
+    """Check that hann_window matches scipy.signal.hann behavior."""
+    # The Hann window is a raised cosine window with parameters alpha=0.5 and
+    # beta=0.5.
+    # https://en.wikipedia.org/wiki/Window_function#Hann_window
+    self._compare_window_fns(
+        functools.partial(_scipy_raised_cosine, a=0.5, b=0.5),
+        window_ops.hann_window)
+
+  def test_hamming_window(self):
+    """Check that hamming_window matches scipy.signal.hamming's behavior."""
+    # The Hamming window is a raised cosine window with parameters alpha=0.54
+    # and beta=0.46.
+    # https://en.wikipedia.org/wiki/Window_function#Hamming_window
+    self._compare_window_fns(
+        functools.partial(_scipy_raised_cosine, a=0.54, b=0.46),
+        window_ops.hamming_window)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/signal/python/ops/reconstruction_ops.py b/tensorflow/contrib/signal/python/ops/reconstruction_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..653c030a04c2bbc7e3ee49b9c85a781fb49de8d0
--- /dev/null
+++ b/tensorflow/contrib/signal/python/ops/reconstruction_ops.py
@@ -0,0 +1,144 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Signal reconstruction via overlapped addition of frames."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.signal.python.ops import shape_ops
+from tensorflow.contrib.signal.python.ops import util_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def _shuffle_to_front(input_tensor, k):
+  """Shuffles the last `k` indices of `input_tensor` to the front.
+
+  Transposes `input_tensor` to have the last `k` indices at the front. The input
+  may have arbitrary rank and unknown shape.
+
+  Args:
+    input_tensor: A `Tensor` of arbitrary rank and unknown shape.
+    k: A scalar `Tensor` specifying how many indices to shuffle.
+
+  Returns:
+    A transposed version of `input_tensor` with `k` indices shuffled to the
+    front.
+
+  Raises:
+    ValueError: If `input_tensor` is not at least rank `k` or `k` is not scalar.
+  """
+  k = ops.convert_to_tensor(k, name="k")
+  k.shape.with_rank(0)
+  k_static = tensor_util.constant_value(k)
+  if k_static is not None:
+    input_tensor.shape.with_rank_at_least(k_static)
+
+  rank = array_ops.rank(input_tensor)
+  outer_indices, inner_indices = array_ops.split(math_ops.range(rank),
+                                                 [rank - k, k])
+  permutation = array_ops.concat([inner_indices, outer_indices], 0)
+
+  return array_ops.transpose(input_tensor, perm=permutation)
+
+
+def overlap_and_add(signal, frame_step, name=None):
+  """Reconstructs a signal from a framed representation.
+
+  Adds potentially overlapping frames of a signal with shape
+  `[..., frames, frame_length]`, offsetting subsequent frames by `frame_step`.
+  The resulting tensor has shape `[..., output_size]` where
+
+      output_size = (frames - 1) * frame_step + frame_length
+
+  Args:
+    signal: A [..., frames, frame_length] `Tensor`. All dimensions may be
+      unknown, and rank must be at least 2.
+    frame_step: An integer or scalar `Tensor` denoting overlap offsets. Must be
+      less than or equal to `frame_length`.
+    name: An optional name for the operation.
+
+  Returns:
+    A `Tensor` with shape `[..., output_size]` containing the overlap-added
+    frames of `signal`'s inner-most two dimensions.
+
+  Raises:
+    ValueError: If `signal`'s rank is less than 2, `frame_step` is not a scalar
+      integer or `frame_step` is greater than `frame_length`.
+  """
+  with ops.name_scope(name, "overlap_and_add", [signal, frame_step]):
+    signal = ops.convert_to_tensor(signal, name="signal")
+    signal.shape.with_rank_at_least(2)
+    frame_step = ops.convert_to_tensor(frame_step, name="frame_step")
+    frame_step.shape.assert_has_rank(0)
+    if not frame_step.dtype.is_integer:
+      raise ValueError("frame_step must be an integer. Got %s" %
+                       frame_step.dtype)
+
+    # If frame_length and frame_step are known at graph construction time, check
+    # frame_step is less than or equal to frame_length.
+    frame_step_static = tensor_util.constant_value(frame_step)
+    if (frame_step_static is not None and signal.shape.ndims is not None and
+        signal.shape[-1].value is not None and
+        frame_step_static > signal.shape[-1].value):
+      raise ValueError(
+          "frame_step (%d) must be less than or equal to frame_length (%d)" % (
+              frame_step_static, signal.shape[-1].value))
+
+    signal_shape = array_ops.shape(signal)
+
+    # All dimensions that are not part of the overlap-and-add. Can be empty for
+    # rank 2 inputs.
+    outer_dimensions = signal_shape[:-2]
+
+    signal_rank = array_ops.rank(signal)
+    frames = signal_shape[-2]
+    frame_length = signal_shape[-1]
+
+    subframe_length = util_ops.gcd(frame_length, frame_step)
+    subframe_step = frame_step // subframe_length
+    subframes_per_frame = frame_length // subframe_length
+    output_size = frame_step * (frames - 1) + frame_length
+    output_subframes = output_size // subframe_length
+
+    # To avoid overlap-adding sample-by-sample, we overlap-add at the "subframe"
+    # level, where a subframe is gcd(frame_length, frame_step). Reshape signal
+    # from [..., frames, frame_length] into [..., subframes, subframe_length].
+    subframe_shape = array_ops.concat(
+        [outer_dimensions, [-1, subframe_length]], 0)
+    subframe_signal = array_ops.reshape(signal, subframe_shape)
+
+    # Now we shuffle the last [subframes, subframe_length] dimensions to the
+    # front.
+    # TODO(rjryan): Add an axis argument to unsorted_segment_sum so we can
+    # avoid this pair of transposes.
+    subframe_signal = _shuffle_to_front(subframe_signal, 2)
+
+    # Use unsorted_segment_sum to add overlapping subframes together.
+    segment_ids = array_ops.reshape(shape_ops.frame(
+        math_ops.range(output_subframes), subframes_per_frame, subframe_step,
+        pad_end=False), [-1])
+    result = math_ops.unsorted_segment_sum(subframe_signal, segment_ids,
+                                           num_segments=output_subframes)
+
+    # result is a [subframes, subframe_length, ...outer_dimensions] tensor. We
+    # return a [...outer_dimensions, output_size] tensor with a transpose and
+    # reshape.
+    result_shape = array_ops.concat([outer_dimensions, [output_size]], 0)
+    return array_ops.reshape(_shuffle_to_front(result, signal_rank - 2),
+                             result_shape)
diff --git a/tensorflow/contrib/signal/python/ops/shape_ops.py b/tensorflow/contrib/signal/python/ops/shape_ops.py
index 4914f19be75398d50dc47fad0e8d7ab42e7d44aa..1ddc2941ec402992c16cd16717a966c96100738d 100644
--- a/tensorflow/contrib/signal/python/ops/shape_ops.py
+++ b/tensorflow/contrib/signal/python/ops/shape_ops.py
@@ -18,70 +18,174 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import dtypes
+
+from tensorflow.contrib.signal.python.ops import util_ops
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 
 
-def frames(signal, frame_length, frame_step, name=None):
-  """Frame a signal into overlapping frames.
-
-  May be used in front of spectral functions.
+def _infer_frame_shape(signal, frame_length, frame_step, pad_end, axis):
+  """Infers the shape of the return value of `frame`."""
+  frame_length = tensor_util.constant_value(frame_length)
+  frame_step = tensor_util.constant_value(frame_step)
+  axis = tensor_util.constant_value(axis)
+  if signal.shape.ndims is None:
+    return None
+  if axis is None:
+    return [None] * (signal.shape.ndims + 1)
+
+  signal_shape = signal.shape.as_list()
+  num_frames = None
+  frame_axis = signal_shape[axis]
+  outer_dimensions = signal_shape[:axis]
+  inner_dimensions = signal_shape[axis:][1:]
+  if signal_shape and frame_axis is not None:
+    if frame_step and frame_length is not None:
+      if pad_end:
+        # Double negative is so that we round up.
+        num_frames = -(-frame_axis // frame_step)
+      else:
+        num_frames = (frame_axis - frame_length + frame_step) // frame_step
+      num_frames = max(0, num_frames)
+  return outer_dimensions + [num_frames, frame_length] + inner_dimensions
+
+
+def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1,
+          name=None):
+  """Expands `signal`'s `axis` dimension into frames of `frame_length`.
+
+  Slides a window of size `frame_length` over `signal`'s `axis` dimension
+  with a stride of `frame_step`, replacing the `axis` dimension with
+  `[frames, frame_length]` frames.
+
+  If `pad_end` is True, window positions that are past the end of the `axis`
+  dimension are padded with `pad_value` until the window moves fully past the
+  end of the dimension. Otherwise, only window positions that fully overlap the
+  `axis` dimension are produced.
 
   For example:
 
   ```python
   pcm = tf.placeholder(tf.float32, [None, 9152])
-  frames = tf.contrib.signal.frames(pcm, 512, 180)
+  frames = tf.contrib.signal.frame(pcm, 512, 180)
   magspec = tf.abs(tf.spectral.rfft(frames, [512]))
   image = tf.expand_dims(magspec, 3)
   ```
 
   Args:
-    signal: A `Tensor` of shape `[batch_size, signal_length]`.
-    frame_length: An `int32` or `int64` `Tensor`. The length of each frame.
-    frame_step: An `int32` or `int64` `Tensor`. The step between frames.
-    name: A name for the operation (optional).
+    signal: A `[..., samples, ...]` `Tensor`. The rank and dimensions
+      may be unknown. Rank must be at least 1.
+    frame_length: The frame length in samples. An integer or scalar `Tensor`.
+    frame_step: The frame hop size in samples. An integer or scalar `Tensor`.
+    pad_end: Whether to pad the end of `signal` with `pad_value`.
+    pad_value: An optional scalar `Tensor` to use where the input signal
+      does not exist when `pad_end` is True.
+    axis: A scalar integer `Tensor` indicating the axis to frame. Defaults to
+      the last axis. Supports negative values for indexing from the end.
+    name: An optional name for the operation.
 
   Returns:
-    A `Tensor` of frames with shape `[batch_size, num_frames, frame_length]`.
+    A `Tensor` of frames with shape `[..., frames, frame_length, ...]`.
 
   Raises:
-    ValueError: if signal does not have rank 2.
+    ValueError: If `frame_length`, `frame_step`, `pad_value`, or `axis` are not
+      scalar.
   """
-  with ops.name_scope(name, "frames", [signal, frame_length, frame_step]):
+  with ops.name_scope(name, "frame", [signal, frame_length, frame_step,
+                                      pad_value]):
     signal = ops.convert_to_tensor(signal, name="signal")
     frame_length = ops.convert_to_tensor(frame_length, name="frame_length")
     frame_step = ops.convert_to_tensor(frame_step, name="frame_step")
-
-    signal_rank = signal.shape.ndims
-
-    if signal_rank != 2:
-      raise ValueError("expected signal to have rank 2 but was " + signal_rank)
-
-    signal_length = array_ops.shape(signal)[1]
-
-    num_frames = math_ops.ceil((signal_length - frame_length) / frame_step)
-    num_frames = 1 + math_ops.cast(num_frames, dtypes.int32)
-
-    pad_length = (num_frames - 1) * frame_step + frame_length
-    pad_signal = array_ops.pad(signal, [[0, 0], [0,
-                                                 pad_length - signal_length]])
-
-    indices_frame = array_ops.expand_dims(math_ops.range(frame_length), 0)
-    indices_frames = array_ops.tile(indices_frame, [num_frames, 1])
-
-    indices_step = array_ops.expand_dims(
-        math_ops.range(num_frames) * frame_step, 1)
-    indices_steps = array_ops.tile(indices_step, [1, frame_length])
-
-    indices = indices_frames + indices_steps
-
-    # TODO(androbin): remove `transpose` when `gather` gets `axis` support
-    pad_signal = array_ops.transpose(pad_signal)
-    signal_frames = array_ops.gather(pad_signal, indices)
-    signal_frames = array_ops.transpose(signal_frames, perm=[2, 0, 1])
-
-    return signal_frames
+    axis = ops.convert_to_tensor(axis, name="axis")
+
+    signal.shape.with_rank_at_least(1)
+    frame_length.shape.assert_has_rank(0)
+    frame_step.shape.assert_has_rank(0)
+    axis.shape.assert_has_rank(0)
+
+    result_shape = _infer_frame_shape(signal, frame_length, frame_step, pad_end,
+                                      axis)
+
+    # Axis can be negative. Convert it to positive.
+    signal_rank = array_ops.rank(signal)
+    axis = math_ops.range(signal_rank)[axis]
+
+    signal_shape = array_ops.shape(signal)
+    outer_dimensions, length_samples, inner_dimensions = array_ops.split(
+        signal_shape, [axis, 1, signal_rank - 1 - axis])
+    length_samples = array_ops.reshape(length_samples, [])
+    num_outer_dimensions = array_ops.size(outer_dimensions)
+    num_inner_dimensions = array_ops.size(inner_dimensions)
+
+    # If padding is requested, pad the input signal tensor with pad_value.
+    if pad_end:
+      pad_value = ops.convert_to_tensor(pad_value, signal.dtype)
+      pad_value.shape.assert_has_rank(0)
+
+      # Calculate number of frames, using double negatives to round up.
+      num_frames = -(-length_samples // frame_step)
+
+      # Pad the signal by up to frame_length samples based on how many samples
+      # are remaining starting from last_frame_position.
+      pad_samples = math_ops.maximum(
+          0, frame_length + frame_step * (num_frames - 1) - length_samples)
+
+      # Pad the inner dimension of signal by pad_samples.
+      paddings = array_ops.concat(
+          [array_ops.zeros([num_outer_dimensions, 2], dtype=pad_samples.dtype),
+           [[0, pad_samples]],
+           array_ops.zeros([num_inner_dimensions, 2], dtype=pad_samples.dtype)],
+          0)
+      signal = array_ops.pad(signal, paddings, constant_values=pad_value)
+
+      signal_shape = array_ops.shape(signal)
+      length_samples = signal_shape[axis]
+    else:
+      num_frames = math_ops.maximum(
+          0, 1 + (length_samples - frame_length) // frame_step)
+
+    subframe_length = util_ops.gcd(frame_length, frame_step)
+    subframes_per_frame = frame_length // subframe_length
+    subframes_per_hop = frame_step // subframe_length
+    num_subframes = length_samples // subframe_length
+
+    slice_shape = array_ops.concat([outer_dimensions,
+                                    [num_subframes * subframe_length],
+                                    inner_dimensions], 0)
+    subframe_shape = array_ops.concat([outer_dimensions,
+                                       [num_subframes, subframe_length],
+                                       inner_dimensions], 0)
+    subframes = array_ops.reshape(array_ops.strided_slice(
+        signal, array_ops.zeros_like(signal_shape),
+        slice_shape), subframe_shape)
+
+    # frame_selector is a [num_frames, subframes_per_frame] tensor
+    # that indexes into the appropriate frame in subframes. For example:
+    # [[0, 0, 0, 0], [2, 2, 2, 2], [4, 4, 4, 4]]
+    frame_selector = array_ops.reshape(
+        math_ops.range(num_frames) * subframes_per_hop, [num_frames, 1])
+
+    # subframe_selector is a [num_frames, subframes_per_frame] tensor
+    # that indexes into the appropriate subframe within a frame. For example:
+    # [[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]]
+    subframe_selector = array_ops.reshape(
+        math_ops.range(subframes_per_frame), [1, subframes_per_frame])
+
+    # Adding the 2 selector tensors together produces a [num_frames,
+    # subframes_per_frame] tensor of indices to use with tf.gather to select
+    # subframes from subframes. We then reshape the inner-most
+    # subframes_per_frame dimension to stitch the subframes together into
+    # frames. For example: [[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7]].
+    selector = frame_selector + subframe_selector
+
+    frames = array_ops.reshape(
+        array_ops.gather(subframes, selector, axis=axis),
+        array_ops.concat([outer_dimensions, [num_frames, frame_length],
+                          inner_dimensions], 0))
+
+    if result_shape:
+      frames.set_shape(result_shape)
+    return frames
diff --git a/tensorflow/contrib/signal/python/ops/spectral_ops.py b/tensorflow/contrib/signal/python/ops/spectral_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..950d8f471c6b34ecd7488b4434776a333d2fa782
--- /dev/null
+++ b/tensorflow/contrib/signal/python/ops/spectral_ops.py
@@ -0,0 +1,180 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Spectral operations (e.g. Short-time Fourier Transform)."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+import numpy as np
+
+from tensorflow.contrib.signal.python.ops import reconstruction_ops
+from tensorflow.contrib.signal.python.ops import shape_ops
+from tensorflow.contrib.signal.python.ops import window_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import spectral_ops
+
+
+def stft(signals, frame_length, frame_step, fft_length=None,
+         window_fn=functools.partial(window_ops.hann_window, periodic=True),
+         pad_end=False, name=None):
+  """Computes the [Short-time Fourier Transform][stft] of `signals`.
+
+  Implemented with GPU-compatible ops and supports gradients.
+
+  Args:
+    signals: A `[..., samples]` `float32` `Tensor` of real-valued signals.
+    frame_length: An integer scalar `Tensor`. The window length in samples.
+    frame_step: An integer scalar `Tensor`. The number of samples to step.
+    fft_length: An integer scalar `Tensor`. The size of the FFT to apply.
+      If not provided, uses the smallest power of 2 enclosing `frame_length`.
+    window_fn: A callable that takes a window length and a `dtype` keyword
+      argument and returns a `[window_length]` `Tensor` of samples in the
+      provided datatype. If set to `None`, no windowing is used.
+    pad_end: Whether to pad the end of `signals` with zeros when the provided
+      frame length and step produces a frame that lies partially past its end.
+    name: An optional name for the operation.
+
+  Returns:
+    A `[..., frames, fft_unique_bins]` `Tensor` of `complex64` STFT values where
+    `fft_unique_bins` is `fft_length // 2 + 1` (the unique components of the
+    FFT).
+
+  Raises:
+    ValueError: If `signals` is not at least rank 1, `frame_length` is
+      not scalar, `frame_step` is not scalar, or `frame_length`
+      is greater than `fft_length`.
+
+  [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
+  """
+  with ops.name_scope(name, 'stft', [signals, frame_length,
+                                     frame_step]):
+    signals = ops.convert_to_tensor(signals, name='signals')
+    signals.shape.with_rank_at_least(1)
+    frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
+    frame_length.shape.assert_has_rank(0)
+    frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
+    frame_step.shape.assert_has_rank(0)
+
+    if fft_length is None:
+      fft_length = _enclosing_power_of_two(frame_length)
+    else:
+      fft_length = ops.convert_to_tensor(fft_length, name='fft_length')
+
+    frame_length_static = tensor_util.constant_value(
+        frame_length)
+    fft_length_static = tensor_util.constant_value(fft_length)
+    if (frame_length_static is not None and fft_length_static is not None and
+        frame_length_static > fft_length_static):
+      raise ValueError('frame_length (%d) may not be larger than '
+                       'fft_length (%d)' % (frame_length_static,
+                                            fft_length_static))
+
+    framed_signals = shape_ops.frame(
+        signals, frame_length, frame_step, pad_end=pad_end)
+
+    # Optionally window the framed signals.
+    if window_fn is not None:
+      window = window_fn(frame_length, dtype=framed_signals.dtype)
+      framed_signals *= window
+
+    # spectral_ops.rfft produces the (fft_length/2 + 1) unique components of the
+    # FFT of the real windowed signals in framed_signals.
+    return spectral_ops.rfft(framed_signals, [fft_length])
+
+
+def inverse_stft(stfts,
+                 frame_length,
+                 frame_step,
+                 fft_length=None,
+                 window_fn=functools.partial(window_ops.hann_window,
+                                             periodic=True),
+                 name=None):
+  """Computes the inverse [Short-time Fourier Transform][stft] of `stfts`.
+
+  Implemented with GPU-compatible ops and supports gradients.
+
+  Args:
+    stfts: A `complex64` `[..., frames, fft_unique_bins]` `Tensor` of STFT bins
+      representing a batch of `fft_length`-point STFTs where `fft_unique_bins`
+      is `fft_length // 2 + 1`
+    frame_length: An integer scalar `Tensor`. The window length in samples.
+    frame_step: An integer scalar `Tensor`. The number of samples to step.
+    fft_length: An integer scalar `Tensor`. The size of the FFT that produced
+      `stfts`. If not provided, uses the smallest power of 2 enclosing
+      `frame_length`.
+    window_fn: A callable that takes a window length and a `dtype` keyword
+      argument and returns a `[window_length]` `Tensor` of samples in the
+      provided datatype. If set to `None`, no windowing is used.
+    name: An optional name for the operation.
+
+  Returns:
+    A `[..., samples]` `Tensor` of `float32` signals representing the inverse
+    STFT for each input STFT in `stfts`.
+
+  Raises:
+    ValueError: If `stfts` is not at least rank 2, `frame_length` is not scalar,
+      `frame_step` is not scalar, or `fft_length` is not scalar, or
+      `frame_length` is greater than `fft_length`.
+
+  [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
+  """
+  with ops.name_scope(name, 'inverse_stft', [stfts]):
+    stfts = ops.convert_to_tensor(stfts, name='stfts')
+    stfts.shape.with_rank_at_least(2)
+    frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
+    frame_length.shape.assert_has_rank(0)
+    frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
+    frame_step.shape.assert_has_rank(0)
+    if fft_length is None:
+      fft_length = _enclosing_power_of_two(frame_length)
+    else:
+      fft_length = ops.convert_to_tensor(fft_length, name='fft_length')
+      fft_length.shape.assert_has_rank(0)
+
+    frame_length_static = tensor_util.constant_value(
+        frame_length)
+    fft_length_static = tensor_util.constant_value(fft_length)
+    if (frame_length_static is not None and fft_length_static is not None and
+        frame_length_static > fft_length_static):
+      raise ValueError('frame_length (%d) may not be larger than '
+                       'fft_length (%d)' % (frame_length_static,
+                                            fft_length_static))
+
+    real_frames = spectral_ops.irfft(stfts, [fft_length])[..., :frame_length]
+
+    # Optionally window and overlap-add the inner 2 dimensions of real_frames
+    # into a single [samples] dimension.
+    if window_fn is not None:
+      window = window_fn(frame_length, dtype=stfts.dtype.real_dtype)
+      real_frames *= window
+    return reconstruction_ops.overlap_and_add(real_frames, frame_step)
+
+
+def _enclosing_power_of_two(value):
+  """Return 2**N for integer N such that 2**N >= value."""
+  value_static = tensor_util.constant_value(value)
+  if value_static is not None:
+    return constant_op.constant(
+        int(2**np.ceil(np.log(value_static) / np.log(2.0))), value.dtype)
+  return math_ops.cast(
+      math_ops.pow(2.0, math_ops.ceil(
+          math_ops.log(math_ops.to_float(value)) / math_ops.log(2.0))),
+      value.dtype)
diff --git a/tensorflow/contrib/signal/python/ops/util_ops.py b/tensorflow/contrib/signal/python/ops/util_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..eee829d799eb149bfb2af0dfe92c9fc1b55c452c
--- /dev/null
+++ b/tensorflow/contrib/signal/python/ops/util_ops.py
@@ -0,0 +1,57 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility ops shared across tf.contrib.signal."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+
+
+def gcd(a, b, name=None):
+  """Returns the greatest common divisor via Euclid's algorithm.
+
+  Args:
+    a: The dividend. A scalar integer `Tensor`.
+    b: The divisor. A scalar integer `Tensor`.
+    name: An optional name for the operation.
+
+  Returns:
+    A scalar `Tensor` representing the greatest common divisor between `a` and
+    `b`.
+
+  Raises:
+    ValueError: If `a` or `b` are not scalar integers.
+  """
+  with ops.name_scope(name, 'gcd', [a, b]):
+    a = ops.convert_to_tensor(a)
+    b = ops.convert_to_tensor(b)
+
+    a.shape.assert_has_rank(0)
+    b.shape.assert_has_rank(0)
+
+    if not a.dtype.is_integer:
+      raise ValueError('a must be an integer type. Got: %s' % a.dtype)
+    if not b.dtype.is_integer:
+      raise ValueError('b must be an integer type. Got: %s' % b.dtype)
+
+    cond = lambda _, b: math_ops.greater(b, array_ops.zeros_like(b))
+    body = lambda a, b: [b, math_ops.mod(a, b)]
+    a, b = control_flow_ops.while_loop(cond, body, [a, b], back_prop=False)
+    return a
diff --git a/tensorflow/contrib/signal/python/ops/window_ops.py b/tensorflow/contrib/signal/python/ops/window_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..07a847dd2a440254d50759308006c7121eee13f2
--- /dev/null
+++ b/tensorflow/contrib/signal/python/ops/window_ops.py
@@ -0,0 +1,121 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops for computing common window functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+
+
+def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None):
+  """Generate a [Hann window][hann].
+
+  Args:
+    window_length: A scalar `Tensor` indicating the window length to generate.
+    periodic: A bool `Tensor` indicating whether to generate a periodic or
+      symmetric window. Periodic windows are typically used for spectral
+      analysis while symmetric windows are typically used for digital
+      filter design.
+    dtype: The data type to produce. Must be a floating point type.
+    name: An optional name for the operation.
+
+  Returns:
+    A `Tensor` of shape `[window_length]` of type `dtype`.
+
+  Raises:
+    ValueError: If `dtype` is not a floating point type.
+
+  [hann]: https://en.wikipedia.org/wiki/Window_function#Hann_window
+  """
+  return _raised_cosine_window(name, 'hann_window', window_length, periodic,
+                               dtype, 0.5, 0.5)
+
+
+def hamming_window(window_length, periodic=True, dtype=dtypes.float32,
+                   name=None):
+  """Generate a [Hamming][hamming] window.
+
+  Args:
+    window_length: A scalar `Tensor` indicating the window length to generate.
+    periodic: A bool `Tensor` indicating whether to generate a periodic or
+      symmetric window. Periodic windows are typically used for spectral
+      analysis while symmetric windows are typically used for digital
+      filter design.
+    dtype: The data type to produce. Must be a floating point type.
+    name: An optional name for the operation.
+
+  Returns:
+    A `Tensor` of shape `[window_length]` of type `dtype`.
+
+  Raises:
+    ValueError: If `dtype` is not a floating point type.
+
+  [hamming]: https://en.wikipedia.org/wiki/Window_function#Hamming_window
+  """
+  return _raised_cosine_window(name, 'hamming_window', window_length, periodic,
+                               dtype, 0.54, 0.46)
+
+
+def _raised_cosine_window(name, default_name, window_length, periodic,
+                          dtype, a, b):
+  """Helper function for computing a raised cosine window.
+
+  Args:
+    name: Name to use for the scope.
+    default_name: Default name to use for the scope.
+    window_length: A scalar `Tensor` or integer indicating the window length.
+    periodic: A bool `Tensor` indicating whether to generate a periodic or
+      symmetric window.
+    dtype: A floating point `DType`.
+    a: The alpha parameter to the raised cosine window.
+    b: The beta parameter to the raised cosine window.
+
+  Returns:
+    A `Tensor` of shape `[window_length]` of type `dtype`.
+
+  Raises:
+    ValueError: If `dtype` is not a floating point type or `window_length` is
+      not scalar or `periodic` is not scalar.
+  """
+  if not dtype.is_floating:
+    raise ValueError('dtype must be a floating point type. Found %s' % dtype)
+
+  with ops.name_scope(name, default_name, [window_length, periodic]):
+    window_length = ops.convert_to_tensor(window_length, dtype=dtypes.int32,
+                                          name='window_length')
+    window_length.shape.assert_has_rank(0)
+    periodic = math_ops.cast(
+        ops.convert_to_tensor(periodic, dtype=dtypes.bool, name='periodic'),
+        dtypes.int32)
+    periodic.shape.assert_has_rank(0)
+    even = 1 - math_ops.mod(window_length, 2)
+
+    n = math_ops.cast(window_length + periodic * even - 1, dtype=dtype)
+    count = math_ops.cast(math_ops.range(window_length), dtype)
+    cos_arg = constant_op.constant(2 * np.pi, dtype=dtype) * count / n
+
+    return control_flow_ops.cond(
+        math_ops.equal(window_length, 1),
+        lambda: array_ops.ones([1], dtype=dtype),
+        lambda: math_ops.cast(a - b * math_ops.cos(cos_arg), dtype=dtype))
diff --git a/tensorflow/contrib/slim/BUILD b/tensorflow/contrib/slim/BUILD
index 427c25e07c7a5162bf9a86bcf0f8b4bcd1e65fa4..2d48a91f9f657e6ee63cee5e2bf6faf5a2d82a35 100644
--- a/tensorflow/contrib/slim/BUILD
+++ b/tensorflow/contrib/slim/BUILD
@@ -15,9 +15,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/contrib/training:training_py",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:ops",
-        "//tensorflow/python:platform",
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
     ],
@@ -38,10 +35,8 @@ py_test(
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
@@ -56,17 +51,15 @@ py_library(
     deps = [
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/training:training_py",
-        "//tensorflow/python:array_ops",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
         "//tensorflow/python:clip_ops",
         "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:lib",
+        "//tensorflow/python:lookup_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
@@ -88,11 +81,9 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_seed",
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
@@ -128,7 +119,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:ops",
         "//tensorflow/python:training",
     ],
 )
@@ -150,7 +140,6 @@ py_library(
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/losses:losses_py",
         "//tensorflow/contrib/metrics:metrics_py",
-        "//tensorflow/contrib/slim/python/slim/data",
         "//tensorflow/contrib/slim/python/slim/data:data_decoder",
         "//tensorflow/contrib/slim/python/slim/data:data_provider",
         "//tensorflow/contrib/slim/python/slim/data:dataset",
@@ -167,7 +156,7 @@ py_library(
     srcs = ["python/slim/summaries.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:nn",
         "//tensorflow/python:summary",
@@ -183,7 +172,7 @@ py_test(
         ":summaries",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:summary",
     ],
diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index d37c632be7f1f911e62e35df3b6af3820201ee51..c0aa6d445acfc99ef9da9a54fc269babee754951 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -775,7 +775,7 @@ images, labels = LoadTestData(...)
 predictions = MyModel(images)
 
 mae_value_op, mae_update_op = slim.metrics.streaming_mean_absolute_error(predictions, labels)
-mre_value_op, mre_update_op = slim.metrics.streaming_mean_relative_error(predictions, labels, labels)
+mre_value_op, mre_update_op = slim.metrics.streaming_mean_relative_error(predictions, labels)
 pl_value_op, pl_update_op = slim.metrics.percentage_less(mean_relative_errors, 0.3)
 ```
 
@@ -836,7 +836,7 @@ with tf.Session() as sess:
   for batch_id in range(num_batches):
     sess.run(names_to_updates.values())
 
-  metric_values = sess.run(name_to_values.values())
+  metric_values = sess.run(names_to_values.values())
   for metric, value in zip(names_to_values.keys(), metric_values):
     print('Metric %s has value: %f' % (metric, value))
 ```
diff --git a/tensorflow/contrib/slim/python/slim/data/BUILD b/tensorflow/contrib/slim/python/slim/data/BUILD
index 1326f4093c06b4d2554bc6af4853f82a7ca69973..4d42a11d166b58352229ab006132c41dfab951b8 100644
--- a/tensorflow/contrib/slim/python/slim/data/BUILD
+++ b/tensorflow/contrib/slim/python/slim/data/BUILD
@@ -72,12 +72,10 @@ py_test(
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:image_ops",
         "//tensorflow/python:io_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
-        "//tensorflow/python:platform_test",
     ],
 )
 
@@ -87,11 +85,9 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:data_flow_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:io_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
@@ -111,9 +107,7 @@ py_test(
         "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:io_ops",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
@@ -125,10 +119,8 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:data_flow_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
     ],
@@ -141,11 +133,11 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":prefetch_queue",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
@@ -174,13 +166,13 @@ py_library(
         ":data_decoder",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:functional_ops",
         "//tensorflow/python:image_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
     ],
 )
 
@@ -195,11 +187,9 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:image_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:platform_test",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/contrib/slim/python/slim/evaluation.py b/tensorflow/contrib/slim/python/slim/evaluation.py
index 15c9f3d3f4488818cf2690c240af676bfd1f5128..2d4b08df61a22b270ab5ed31a5a2b33b108de29b 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation.py
@@ -81,7 +81,7 @@ more summaries and call the evaluation_loop method:
 
   # Evaluate every 10 minutes:
   slim.evaluation_loop(
-      master='',
+      '',
       checkpoint_dir,
       logdir,
       num_evals=num_evals,
diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py
index f7dddc46c365262ed30cec8e4ece694c47705cf4..8f690fb5490d333417442bc1f2626db2c6a68fa1 100644
--- a/tensorflow/contrib/slim/python/slim/learning.py
+++ b/tensorflow/contrib/slim/python/slim/learning.py
@@ -603,9 +603,9 @@ def train(train_op,
     saver: Saver to save checkpoints. If None, a default one will be created
       and used.
     save_interval_secs: How often, in seconds, to save the model to `logdir`.
-    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
-      argument is supplied, gradient updates will be synchronous. If left as
-      `None`, gradient updates will be asynchronous.
+    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer, or a list of
+      them. If the argument is supplied, gradient updates will be synchronous.
+      If left as `None`, gradient updates will be asynchronous.
     session_config: An instance of `tf.ConfigProto` that will be used to
       configure the `Session`. If left as `None`, the default will be used.
     trace_every_n_steps: produce and save a `Timeline` in Chrome trace format
@@ -633,6 +633,8 @@ def train(train_op,
       raise ValueError('Cannot provide trace_every_n_steps because '
                        'logdir=None')
 
+  if isinstance(sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer):
+    sync_optimizer = [sync_optimizer]
   if sync_optimizer is not None and startup_delay_steps > 0:
     raise ValueError(
         'startup_delay_steps must be zero when sync_optimizer is supplied.')
@@ -647,6 +649,12 @@ def train(train_op,
       global_step = variables.get_or_create_global_step()
     saver = saver or tf_saver.Saver()
 
+    if sync_optimizer is not None:
+      for opt in sync_optimizer:
+        if not isinstance(opt, sync_replicas_optimizer.SyncReplicasOptimizer):
+          raise ValueError(
+              '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.')
+
     with ops.name_scope('init_ops'):
       if init_op == _USE_DEFAULT:
         init_op = tf_variables.global_variables_initializer()
@@ -659,15 +667,17 @@ def train(train_op,
             tf_variables.local_variables_initializer(),
             lookup_ops.tables_initializer())
 
-      if sync_optimizer is not None and isinstance(
-          sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer):
+      if sync_optimizer is not None and isinstance(sync_optimizer, list):
         with ops.control_dependencies([local_init_op] if local_init_op is
                                       not None else []):
           if is_chief:
-            local_init_op = sync_optimizer.chief_init_op
+            local_init_op = control_flow_ops.group(
+                *[opt.chief_init_op for opt in sync_optimizer])
           else:
-            local_init_op = sync_optimizer.local_step_init_op
-        ready_for_local_init_op = sync_optimizer.ready_for_local_init_op
+            local_init_op = control_flow_ops.group(
+                *[opt.local_step_init_op for opt in sync_optimizer])
+        ready_for_local_init_op = control_flow_ops.group(
+            *[opt.ready_for_local_init_op for opt in sync_optimizer])
       else:
         ready_for_local_init_op = None
 
@@ -678,14 +688,10 @@ def train(train_op,
       summary_writer = supervisor.Supervisor.USE_DEFAULT
 
     if is_chief and sync_optimizer is not None:
-      if not isinstance(sync_optimizer,
-                        (sync_replicas_optimizer.SyncReplicasOptimizer)):
-        raise ValueError(
-            '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.')
-
       # Need to create these BEFORE the supervisor finalizes the graph:
-      init_tokens_op = sync_optimizer.get_init_tokens_op()
-      chief_queue_runner = sync_optimizer.get_chief_queue_runner()
+      init_tokens_op = [opt.get_init_tokens_op() for opt in sync_optimizer]
+      chief_queue_runner = [
+          opt.get_chief_queue_runner() for opt in sync_optimizer]
 
     if train_step_kwargs == _USE_DEFAULT:
       with ops.name_scope('train_step'):
@@ -741,7 +747,7 @@ def train(train_op,
         threads = sv.start_queue_runners(sess)
         logging.info('Starting Queues.')
         if is_chief and sync_optimizer is not None:
-          sv.start_queue_runners(sess, [chief_queue_runner])
+          sv.start_queue_runners(sess, chief_queue_runner)
           sess.run(init_tokens_op)
         try:
           while not sv.should_stop():
diff --git a/tensorflow/contrib/slim/python/slim/learning_test.py b/tensorflow/contrib/slim/python/slim/learning_test.py
index 83d45f6f5adaccfca0a04629172ee803bab10ba7..69061460eb6b343edb8637f596c0586e809d62eb 100644
--- a/tensorflow/contrib/slim/python/slim/learning_test.py
+++ b/tensorflow/contrib/slim/python/slim/learning_test.py
@@ -220,7 +220,7 @@ def LogisticClassifier(inputs):
 
 
 def BatchNormClassifier(inputs):
-  inputs = layers.batch_norm(inputs, decay=0.1)
+  inputs = layers.batch_norm(inputs, decay=0.1, fused=None)
   return layers.fully_connected(inputs, 1, activation_fn=math_ops.sigmoid)
 
 
@@ -267,6 +267,11 @@ class CreateTrainOpTest(test.TestCase):
     self._inputs = np.random.rand(16, 4).astype(np.float32)
     self._labels = np.random.randint(0, 2, size=(16, 1)).astype(np.float32)
 
+  def _addBesselsCorrection(self, sample_size, expected_var):
+    correction_factor = sample_size / (sample_size - 1)
+    expected_var *= correction_factor
+    return expected_var
+
   def testUseUpdateOps(self):
     with ops.Graph().as_default():
       random_seed.set_random_seed(0)
@@ -275,6 +280,7 @@ class CreateTrainOpTest(test.TestCase):
 
       expected_mean = np.mean(self._inputs, axis=(0))
       expected_var = np.var(self._inputs, axis=(0))
+      expected_var = self._addBesselsCorrection(16, expected_var)
 
       tf_predictions = BatchNormClassifier(tf_inputs)
       loss_ops.log_loss(tf_predictions, tf_labels)
diff --git a/tensorflow/contrib/slim/python/slim/nets/BUILD b/tensorflow/contrib/slim/python/slim/nets/BUILD
index 737bbbe57b2ecb1fb56052a7a10ca92fa19415f9..e2035ab014cfd09682257fbbbf3a2868681aa850 100644
--- a/tensorflow/contrib/slim/python/slim/nets/BUILD
+++ b/tensorflow/contrib/slim/python/slim/nets/BUILD
@@ -243,9 +243,9 @@ py_test(
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_seed",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
@@ -272,14 +272,15 @@ py_test(
     srcs = ["resnet_v2_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":resnet_utils",
         ":resnet_v2",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_seed",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
@@ -299,7 +300,6 @@ py_test(
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
         "//third_party/py/numpy",
diff --git a/tensorflow/contrib/slim/python/slim/nets/inception_v2.py b/tensorflow/contrib/slim/python/slim/nets/inception_v2.py
index 46062600e6b4006661edfc833fc2409d9ad6c831..e44cb770ba9e4479401f435f465ddd0d98b597de 100644
--- a/tensorflow/contrib/slim/python/slim/nets/inception_v2.py
+++ b/tensorflow/contrib/slim/python/slim/nets/inception_v2.py
@@ -495,7 +495,9 @@ def inception_v2(inputs,
   Constructs an Inception v2 network for classification as described in
   http://arxiv.org/abs/1502.03167.
 
-  The default image size used to train this network is 224x224.
+  The recommended image size used to train this network is 224x224. For image
+  sizes that differ substantially, it is recommended to use inception_v2_base()
+  and connect custom final layers to the output.
 
   Args:
     inputs: a tensor of shape [batch_size, height, width, channels].
@@ -510,8 +512,12 @@ def inception_v2(inputs,
       usage will be to set this value in (0, 1) to reduce the number of
       parameters or computation cost of the model.
     prediction_fn: a function to get predictions out of logits.
-    spatial_squeeze: if True, logits is of shape is [B, C], if false logits is
+    spatial_squeeze: if True, logits is of shape [B, C], if false logits is
         of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
+        Note that input image sizes other than 224x224 might lead to different
+        spatial dimensions, and hence cannot be squeezed. In this event,
+        it is best to set spatial_squeeze as False, and perform a reduce_mean
+        over the resulting spatial dimensions with sizes exceeding 1.
     reuse: whether or not the network and its variables should be reused. To be
       able to reuse 'scope' must be given.
     scope: Optional variable_scope.
@@ -523,8 +529,7 @@ def inception_v2(inputs,
       activation.
 
   Raises:
-    ValueError: if final_endpoint is not set to one of the predefined values,
-                or depth_multiplier <= 0
+    ValueError: if depth_multiplier <= 0.
   """
   if depth_multiplier <= 0:
     raise ValueError('depth_multiplier is not greater than zero.')
diff --git a/tensorflow/contrib/sparsemax/BUILD b/tensorflow/contrib/sparsemax/BUILD
index 7441f1429fb31da40d088a7ae1ce9efb0bc34798..fcfaa2aba4e8ab086a9eac053188f8fbd4f6f39a 100644
--- a/tensorflow/contrib/sparsemax/BUILD
+++ b/tensorflow/contrib/sparsemax/BUILD
@@ -31,6 +31,8 @@ py_library(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:util",
     ],
 )
 
diff --git a/tensorflow/contrib/specs/BUILD b/tensorflow/contrib/specs/BUILD
index dfdbb61dccf0b5cec08fb2225deb9f16438e511d..808709f83f07a25c023d77d76227e82b9b33face 100644
--- a/tensorflow/contrib/specs/BUILD
+++ b/tensorflow/contrib/specs/BUILD
@@ -25,16 +25,14 @@ py_library(
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/ndlstm",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops",
-        "//tensorflow/python:ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:training",
+        "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
+        "@six_archive//:six",
     ],
 )
 
diff --git a/tensorflow/contrib/staging/BUILD b/tensorflow/contrib/staging/BUILD
index 8ffc96c34696abd5a0f0c28a5071e282700511da..bc4a289468c257e7e5e2bd437b8d6d1235980495 100644
--- a/tensorflow/contrib/staging/BUILD
+++ b/tensorflow/contrib/staging/BUILD
@@ -22,4 +22,5 @@ py_library(
     name = "staging",
     srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
+    deps = ["//tensorflow/python:data_flow_ops"],
 )
diff --git a/tensorflow/contrib/stat_summarizer/BUILD b/tensorflow/contrib/stat_summarizer/BUILD
index 11f65b474862f47ad2384792689c78d7905525ee..5fd02efbf6327b20eade6785007930eed3fd4e03 100644
--- a/tensorflow/contrib/stat_summarizer/BUILD
+++ b/tensorflow/contrib/stat_summarizer/BUILD
@@ -13,6 +13,10 @@ py_library(
     name = "stat_summarizer_py",
     srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:util",
+    ],
 )
 
 tf_py_test(
diff --git a/tensorflow/contrib/stateless/BUILD b/tensorflow/contrib/stateless/BUILD
index 1d9c1ffa50d767fa5bd1235fe1cc453681634f96..598e6513aebe54224409fbdf0a6077c03ee3d2d1 100644
--- a/tensorflow/contrib/stateless/BUILD
+++ b/tensorflow/contrib/stateless/BUILD
@@ -21,6 +21,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":stateless_random_ops",
+        "//tensorflow/python:util",
     ],
 )
 
diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD
index 40d231831125f3048231f7cfdcf763e9c8279d2a..5f5be7c07321e58b86c801703f816cfc08e0cecc 100644
--- a/tensorflow/contrib/tensor_forest/BUILD
+++ b/tensorflow/contrib/tensor_forest/BUILD
@@ -1,11 +1,6 @@
 # TensorFlow code for training random forests.
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
 
-package(default_visibility = [
-    "//visibility:public",
-])
+licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
@@ -14,6 +9,10 @@ load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
+package(default_visibility = ["//visibility:public"])
+
+exports_files(["LICENSE"])
+
 filegroup(
     name = "all_files",
     srcs = glob(
@@ -21,72 +20,56 @@ filegroup(
         exclude = [
             "**/METADATA",
             "**/OWNERS",
+            "kernels/v4/*",
+            "proto/*",
         ],
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
 
+# ---------------------------------- V2 ops ------------------------------------------#
 filegroup(
-    name = "custom_op_sources",
-    srcs = glob(
-        [
-            "kernels/*.cc",
-            "ops/*.cc",
-        ],
-        exclude = [
-            "kernels/*_test.cc",
-            "kernels/tree_utils.cc",
-        ],
-    ),
+    name = "v2_op_sources",
+    srcs = [
+        "kernels/reinterpret_string_to_float_op.cc",
+        "kernels/scatter_add_ndim_op.cc",
+    ],
 )
 
 filegroup(
-    name = "custom_op_headers",
-    srcs = glob(
-        [
-            "kernels/*.h",
-        ],
-        exclude = [
-            "kernels/data_spec.h",
-            "kernels/tree_utils.h",
-        ],
-    ),
+    name = "v2_op_defs",
+    srcs = [
+        "ops/tensor_forest_ops.cc",
+    ],
 )
 
 cc_library(
-    name = "all_ops",
-    srcs = [":custom_op_sources"],
-    hdrs = [":custom_op_headers"],
+    name = "v2_ops",
+    srcs = [
+        ":v2_op_defs",
+        ":v2_op_sources",
+    ],
     deps = [
         ":tree_utils",
         "//tensorflow/core:framework_headers_lib",
         "//third_party/eigen3",
-        "@protobuf//:protobuf_headers",
+        "@protobuf_archive//:protobuf_headers",
     ],
     alwayslink = 1,
 )
 
-py_library(
-    name = "constants",
-    srcs = [
-        "python/constants.py",
-    ],
-    srcs_version = "PY2AND3",
-)
-
 py_library(
     name = "data_ops_py",
-    srcs = [
-        "python/ops/data_ops.py",
-    ],
+    srcs = ["python/ops/data_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":tensor_forest_ops_py",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
         "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
     ],
 )
 
@@ -103,16 +86,8 @@ tf_gen_op_wrapper_py(
 tf_custom_op_library(
     name = "python/ops/_tensor_forest_ops.so",
     srcs = [
-        "kernels/best_splits_op.cc",
-        "kernels/count_extremely_random_stats_op.cc",
-        "kernels/finished_nodes_op.cc",
-        "kernels/grow_tree_op.cc",
-        "kernels/reinterpret_string_to_float_op.cc",
-        "kernels/sample_inputs_op.cc",
-        "kernels/scatter_add_ndim_op.cc",
-        "kernels/tree_predictions_op.cc",
-        "kernels/update_fertile_slots_op.cc",
-        "ops/tensor_forest_ops.cc",
+        ":v2_op_defs",
+        ":v2_op_sources",
     ],
     deps = [":tree_utils"],
 )
@@ -126,10 +101,11 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":constants",
         ":data_ops_py",
         ":eval_metrics",
+        ":model_ops_py",
         ":random_forest",
+        ":stats_ops_py",
         ":tensor_forest_ops_py",
         ":tensor_forest_py",
     ],
@@ -137,22 +113,10 @@ py_library(
 
 tf_kernel_library(
     name = "tensor_forest_kernels",
-    srcs = [
-        "kernels/best_splits_op.cc",
-        "kernels/count_extremely_random_stats_op.cc",
-        "kernels/finished_nodes_op.cc",
-        "kernels/grow_tree_op.cc",
-        "kernels/reinterpret_string_to_float_op.cc",
-        "kernels/sample_inputs_op.cc",
-        "kernels/scatter_add_ndim_op.cc",
-        "kernels/tree_predictions_op.cc",
-        "kernels/update_fertile_slots_op.cc",
-    ],
+    srcs = [":v2_op_sources"],
     deps = [
         ":tree_utils",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core/kernels:bounds_check",
     ],
 )
@@ -167,7 +131,6 @@ tf_custom_op_py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":constants",
         ":gen_tensor_forest_ops",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:array_ops",
@@ -179,165 +142,277 @@ tf_custom_op_py_library(
     ],
 )
 
-py_library(
-    name = "eval_metrics",
-    srcs = ["client/eval_metrics.py"],
-    srcs_version = "PY2AND3",
+cc_test(
+    name = "tensor_forest_ops_test",
+    size = "small",
+    srcs = [
+        "kernels/tensor_forest_ops_test.cc",
+        ":v2_op_defs",
+        ":v2_op_sources",
+    ],
     deps = [
-        "//tensorflow/contrib/learn:estimator_constants_py",
-        "//tensorflow/contrib/losses:losses_py",
-        "//tensorflow/contrib/metrics:metrics_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nn",
-        "//third_party/py/numpy",
+        ":tree_utils",
+        "//tensorflow/core",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//third_party/eigen3",
     ],
 )
 
-py_test(
-    name = "eval_metrics_test",
-    size = "small",
-    srcs = ["client/eval_metrics_test.py"],
-    srcs_version = "PY2AND3",
+# -------------------------------------- V4 ops ------------------------------- #
+cc_library(
+    name = "tensor_forest_v4_kernels",
     deps = [
-        ":eval_metrics",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:variables",
+        ":model_ops_kernels",
+        ":stats_ops_kernels",
+    ],
+)
+
+cc_library(
+    name = "tensor_forest_v4_ops_op_lib",
+    deps = [
+        ":model_ops_op_lib",
+        ":stats_ops_op_lib",
     ],
 )
 
 py_library(
-    name = "client_lib",
+    name = "tensor_forest_v4_ops_py",
     srcs_version = "PY2AND3",
     deps = [
-        ":eval_metrics",
-        ":tensor_forest_ops_py",
-        ":tensor_forest_py",
+        ":model_ops_py",
+        ":stats_ops_py",
     ],
 )
 
+# Model Ops.
 cc_library(
-    name = "tree_utils",
-    srcs = ["kernels/tree_utils.cc"],
-    hdrs = [
-        "kernels/data_spec.h",
-        "kernels/tree_utils.h",
-    ],
+    name = "model_ops_lib",
+    srcs = ["kernels/model_ops.cc"],
     deps = [
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_cc",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_extensions_cc",
+        "//tensorflow/contrib/tensor_forest:tree_utils",
+        "//tensorflow/contrib/tensor_forest/kernels/v4:decision-tree-resource",
+        "//tensorflow/contrib/tensor_forest/kernels/v4:input_data",
+        "//tensorflow/contrib/tensor_forest/proto:tensor_forest_params_proto_cc",
         "//tensorflow/core:framework_headers_lib",
         "//third_party/eigen3",
-        "@protobuf//:protobuf_headers",
     ],
+    alwayslink = 1,
 )
 
-py_test(
-    name = "best_splits_op_test",
-    size = "small",
-    srcs = ["python/kernel_tests/best_splits_op_test.py"],
-    srcs_version = "PY2AND3",
+tf_gen_op_libs(
+    op_lib_names = ["model_ops"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_model_ops_py",
+    out = "python/ops/gen_model_ops.py",
+    deps = [":model_ops_op_lib"],
+)
+
+tf_kernel_library(
+    name = "model_ops_kernels",
     deps = [
-        ":tensor_forest_ops_py",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
+        ":model_ops_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
     ],
+    alwayslink = 1,
 )
 
-py_test(
-    name = "count_extremely_random_stats_op_test",
-    size = "small",
-    srcs = ["python/kernel_tests/count_extremely_random_stats_op_test.py"],
+tf_custom_op_library(
+    name = "python/ops/_model_ops.so",
+    srcs = ["ops/model_ops.cc"],
+    deps = [":model_ops_lib"],
+)
+
+tf_custom_op_py_library(
+    name = "model_ops_py",
+    srcs = ["python/ops/model_ops.py"],
+    dso = ["python/ops/_model_ops.so"],
+    kernels = [
+        ":model_ops_kernels",
+        ":model_ops_op_lib",
+    ],
     srcs_version = "PY2AND3",
     deps = [
-        ":data_ops_py",
-        ":tensor_forest_ops_py",
+        ":gen_model_ops_py",
+        ":stats_ops_py",
         "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework_for_generated_wrappers",
     ],
 )
 
-py_test(
-    name = "grow_tree_op_test",
+cc_test(
+    name = "model_ops_test",
     size = "small",
-    srcs = ["python/kernel_tests/grow_tree_op_test.py"],
-    srcs_version = "PY2AND3",
+    srcs = [
+        "kernels/model_ops_test.cc",
+        "ops/model_ops.cc",
+    ],
     deps = [
-        ":tensor_forest_ops_py",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:variables",
+        ":model_ops_lib",
+        "//tensorflow/core",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//third_party/eigen3",
     ],
 )
 
-py_test(
-    name = "finished_nodes_op_test",
-    size = "small",
-    srcs = ["python/kernel_tests/finished_nodes_op_test.py"],
+# Stats Ops.
+cc_library(
+    name = "stats_ops_lib",
+    srcs = ["kernels/stats_ops.cc"],
+    deps = [
+        "//tensorflow/contrib/tensor_forest:tree_utils",
+        "//tensorflow/contrib/tensor_forest/kernels/v4:decision-tree-resource",
+        "//tensorflow/contrib/tensor_forest/kernels/v4:fertile-stats-resource",
+        "//tensorflow/contrib/tensor_forest/kernels/v4:input_data",
+        "//tensorflow/contrib/tensor_forest/kernels/v4:input_target",
+        "//tensorflow/contrib/tensor_forest/kernels/v4:params",
+        "//tensorflow/contrib/tensor_forest/proto:fertile_stats_proto_cc",
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+    ],
+    alwayslink = 1,
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["stats_ops"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_stats_ops_py",
+    out = "python/ops/gen_stats_ops.py",
+    deps = [":stats_ops_op_lib"],
+)
+
+tf_kernel_library(
+    name = "stats_ops_kernels",
+    deps = [
+        ":stats_ops_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+    alwayslink = 1,
+)
+
+tf_custom_op_library(
+    name = "python/ops/_stats_ops.so",
+    srcs = ["ops/stats_ops.cc"],
+    deps = [":stats_ops_lib"],
+)
+
+tf_custom_op_py_library(
+    name = "stats_ops_py",
+    srcs = ["python/ops/stats_ops.py"],
+    dso = ["python/ops/_stats_ops.so"],
+    kernels = [
+        ":stats_ops_kernels",
+        ":stats_ops_op_lib",
+    ],
     srcs_version = "PY2AND3",
     deps = [
-        ":tensor_forest_ops_py",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
+        ":gen_stats_ops_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework_for_generated_wrappers",
     ],
 )
 
-py_test(
-    name = "sample_inputs_op_test",
+cc_test(
+    name = "stats_ops_test",
     size = "small",
-    srcs = ["python/kernel_tests/sample_inputs_op_test.py"],
+    srcs = [
+        "kernels/stats_ops_test.cc",
+        "ops/stats_ops.cc",
+    ],
+    deps = [
+        ":stats_ops_lib",
+        "//tensorflow/core",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//third_party/eigen3",
+    ],
+)
+
+# ---------------------------------- Common libs ------------------------ #
+cc_library(
+    name = "tree_utils",
+    srcs = ["kernels/tree_utils.cc"],
+    hdrs = [
+        "kernels/data_spec.h",
+        "kernels/tree_utils.h",
+    ],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+)
+
+# --------------------------------- Python -------------------------------- #
+
+py_library(
+    name = "eval_metrics",
+    srcs = ["client/eval_metrics.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":data_ops_py",
-        ":tensor_forest_ops_py",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:variables",
+        "//tensorflow/contrib/learn:estimator_constants_py",
+        "//tensorflow/contrib/losses:losses_py",
+        "//tensorflow/contrib/metrics:metrics_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//third_party/py/numpy",
     ],
 )
 
 py_test(
-    name = "scatter_add_ndim_op_test",
+    name = "eval_metrics_test",
     size = "small",
-    srcs = ["python/kernel_tests/scatter_add_ndim_op_test.py"],
+    srcs = ["client/eval_metrics_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip_gpu"],
     deps = [
-        ":tensor_forest_ops_py",
+        ":eval_metrics",
+        "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:variables",
     ],
 )
 
-py_test(
-    name = "tree_predictions_op_test",
-    size = "small",
-    srcs = ["python/kernel_tests/tree_predictions_op_test.py"],
+py_library(
+    name = "client_lib",
     srcs_version = "PY2AND3",
     deps = [
-        ":constants",
+        ":eval_metrics",
         ":tensor_forest_ops_py",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
+        ":tensor_forest_py",
+        ":tensor_forest_v4_ops_py",
     ],
 )
 
 py_test(
-    name = "update_fertile_slots_op_test",
+    name = "scatter_add_ndim_op_test",
     size = "small",
-    srcs = ["python/kernel_tests/update_fertile_slots_op_test.py"],
+    srcs = ["python/kernel_tests/scatter_add_ndim_op_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip_gpu"],
     deps = [
         ":tensor_forest_ops_py",
-        "//tensorflow:tensorflow_py",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python:variables",
     ],
 )
 
@@ -346,21 +421,17 @@ py_library(
     srcs = ["python/tensor_forest.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":constants",
         ":data_ops_py",
         ":tensor_forest_ops_py",
-        "//tensorflow/contrib/losses:losses_py",
-        "//tensorflow/python:array_ops",
+        ":tensor_forest_v4_ops_py",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_py",
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/tensor_forest/proto:tensor_forest_params_proto_py",
         "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "@six_archive//:six",
     ],
 )
 
@@ -371,29 +442,10 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":tensor_forest_py",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
-    ],
-)
-
-cc_test(
-    name = "tensor_forest_ops_test",
-    size = "small",
-    srcs = [
-        "kernels/tensor_forest_ops_test.cc",
-        ":custom_op_sources",
-    ],
-    deps = [
-        ":tree_utils",
-        "//tensorflow/core",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-        "//third_party/eigen3",
+        "//tensorflow/python:sparse_tensor",
     ],
 )
 
@@ -403,13 +455,16 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":client_lib",
-        ":data_ops_py",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/learn",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
         "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
     ],
 )
 
@@ -418,12 +473,15 @@ py_test(
     size = "medium",
     srcs = ["client/random_forest_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+        "nomac",  # b/63258195
+    ],
     deps = [
         ":random_forest",
         ":tensor_forest_py",
         "//tensorflow/contrib/learn/python/learn/datasets",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
index 17269863542a38724d6fc9d7f9958aa563370ea9..90033015ebc5e44ea70fbf2bc9735d0aeb4ec27d 100644
--- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py
+++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
@@ -47,8 +47,6 @@ def _accuracy(predictions, targets, weights=None):
 
 
 def _r2(probabilities, targets, weights=None):
-  if targets.get_shape().ndims == 1:
-    targets = array_ops.expand_dims(targets, -1)
   targets = math_ops.to_float(targets)
   y_mean = math_ops.reduce_mean(targets, 0)
   squares_total = math_ops.reduce_sum(math_ops.square(targets - y_mean), 0)
@@ -117,7 +115,13 @@ def _recall_at_thresholds(predictions, targets, weights=None):
       weights=weights)
 
 
+def _auc(probs, targets, weights=None):
+  return metric_ops.streaming_auc(array_ops.slice(probs, [0, 1], [-1, 1]),
+                                  targets, weights=weights)
+
+
 _EVAL_METRICS = {
+    'auc': _auc,
     'sigmoid_entropy': _sigmoid_entropy,
     'softmax_entropy': _softmax_entropy,
     'accuracy': _accuracy,
@@ -132,10 +136,11 @@ _EVAL_METRICS = {
 }
 
 _PREDICTION_KEYS = {
+    'auc': INFERENCE_PROB_NAME,
     'sigmoid_entropy': INFERENCE_PROB_NAME,
     'softmax_entropy': INFERENCE_PROB_NAME,
     'accuracy': INFERENCE_PRED_NAME,
-    'r2': INFERENCE_PROB_NAME,
+    'r2': prediction_key.PredictionKey.SCORES,
     'predictions': INFERENCE_PRED_NAME,
     'classification_log_loss': INFERENCE_PROB_NAME,
     'precision': INFERENCE_PRED_NAME,
diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics_test.py b/tensorflow/contrib/tensor_forest/client/eval_metrics_test.py
index 7c559cdd8540ec10129ac2c637eab2bd30e08e2d..aa30919167d8240ad13432d4e589cbb358e5cb44 100644
--- a/tensorflow/contrib/tensor_forest/client/eval_metrics_test.py
+++ b/tensorflow/contrib/tensor_forest/client/eval_metrics_test.py
@@ -69,18 +69,18 @@ class EvalMetricsTest(test_util.TensorFlowTestCase):
       self.assertNear(0.6, accuracy_op.eval(), 0.0001)
 
   def testR2(self):
-    probabilities = constant_op.constant(
+    scores = constant_op.constant(
         [1.2, 3.9, 2.1, 0.9, 2.2, 0.1, 6.0, 4.0, 0.9])
     targets = constant_op.constant(
         [1.0, 4.3, 2.6, 0.5, 1.1, 0.7, 5.1, 3.4, 1.8])
-    r2_op, update_op = eval_metrics._r2(probabilities, targets)
+    r2_op, update_op = eval_metrics._r2(scores, targets)
     with self.test_session():
       # initializes internal accuracy vars
       variables.local_variables_initializer().run()
       # need to call in order to run the r2_op internal operations because
       # it is a streaming function
       update_op.eval()
-      self.assertNear(-19.7729, r2_op.eval(), 0.0001)
+      self.assertNear(0.813583, r2_op.eval(), 0.0001)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/tensor_forest/client/random_forest.py b/tensorflow/contrib/tensor_forest/client/random_forest.py
index ef2f0337ac903f1aeea7bfd0d6d588f35b53bd91..21fdff23b0ead3dc94bdd7194be901fe24868010 100644
--- a/tensorflow/contrib/tensor_forest/client/random_forest.py
+++ b/tensorflow/contrib/tensor_forest/client/random_forest.py
@@ -19,28 +19,33 @@ from __future__ import print_function
 
 from tensorflow.contrib import framework as contrib_framework
 
-from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import estimator
+from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
-from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 
 from tensorflow.contrib.tensor_forest.client import eval_metrics
 from tensorflow.contrib.tensor_forest.python import tensor_forest
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.summary import summary
 from tensorflow.python.training import basic_session_run_hooks
-from tensorflow.python.training import monitored_session
 from tensorflow.python.training import session_run_hook
 
 
 KEYS_NAME = 'keys'
 LOSS_NAME = 'rf_training_loss'
 
+EPSILON = 0.000001
+
 
 def _assert_float32(tensors):
   """Assert all tensors are float32.
@@ -67,15 +72,20 @@ class TensorForestRunOpAtEndHook(session_run_hook.SessionRunHook):
     self._ops = op_dict
 
   def end(self, session):
-    for name, op in self._ops.iteritems():
-      logging.info('{0}: {1}'.format(name, session.run(op)))
+    for name in sorted(self._ops.keys()):
+      logging.info('{0}: {1}'.format(name, session.run(self._ops[name])))
 
 
 class TensorForestLossHook(session_run_hook.SessionRunHook):
   """Monitor to request stop when loss stops decreasing."""
 
-  def __init__(self, early_stopping_rounds):
+  def __init__(self,
+               early_stopping_rounds,
+               early_stopping_loss_threshold=None,
+               loss_op=None):
     self.early_stopping_rounds = early_stopping_rounds
+    self.early_stopping_loss_threshold = early_stopping_loss_threshold
+    self.loss_op = loss_op
     self.min_loss = None
     self.last_step = -1
     # self.steps records the number of steps for which the loss has been
@@ -83,10 +93,12 @@ class TensorForestLossHook(session_run_hook.SessionRunHook):
     self.steps = 0
 
   def before_run(self, run_context):
+    loss = (self.loss_op if self.loss_op is not None else
+            run_context.session.graph.get_operation_by_name(
+                LOSS_NAME).outputs[0])
     return session_run_hook.SessionRunArgs(
         {'global_step': contrib_framework.get_global_step(),
-         'current_loss': run_context.session.graph.get_operation_by_name(
-             LOSS_NAME).outputs[0]})
+         'current_loss': loss})
 
   def after_run(self, run_context, run_values):
     current_loss = run_values.results['current_loss']
@@ -102,7 +114,8 @@ class TensorForestLossHook(session_run_hook.SessionRunHook):
       return
 
     self.last_step = current_step
-    if self.min_loss is None or current_loss < self.min_loss:
+    if (self.min_loss is None or current_loss <
+        (self.min_loss - self.min_loss * self.early_stopping_loss_threshold)):
       self.min_loss = current_loss
       self.steps = 0
     if self.steps > self.early_stopping_rounds:
@@ -126,20 +139,43 @@ class EveryCheckpointPreSaveListener(
     session.run(self._op)
 
 
+def get_default_head(params, weights_name, name=None):
+  if params.regression:
+    return head_lib.regression_head(
+        weight_column_name=weights_name,
+        label_dimension=params.num_outputs,
+        enable_centered_bias=False,
+        head_name=name)
+  else:
+    return head_lib.multi_class_head(
+        params.num_classes,
+        weight_column_name=weights_name,
+        enable_centered_bias=False,
+        head_name=name)
+
+
 def get_model_fn(params,
                  graph_builder_class,
                  device_assigner,
                  weights_name=None,
+                 model_head=None,
                  keys_name=None,
                  early_stopping_rounds=100,
+                 early_stopping_loss_threshold=0.001,
                  num_trainers=1,
                  trainer_id=0,
                  report_feature_importances=False,
-                 model_dir=None,
-                 local_eval=False):
+                 local_eval=False,
+                 head_scope=None):
   """Return a model function given a way to construct a graph builder."""
+  if model_head is None:
+    model_head = get_default_head(params, weights_name)
+
   def _model_fn(features, labels, mode):
     """Function that returns predictions, training loss, and training op."""
+    if (isinstance(features, ops.Tensor) or
+        isinstance(features, sparse_tensor.SparseTensor)):
+      features = {'features': features}
     weights = None
     if weights_name and weights_name in features:
       weights = features.pop(weights_name)
@@ -157,83 +193,75 @@ def get_model_fn(params,
 
     graph_builder = graph_builder_class(params,
                                         device_assigner=dev_assn)
-    inference = {}
-    output_alternatives = None
-    if (mode == model_fn_lib.ModeKeys.EVAL or
-        mode == model_fn_lib.ModeKeys.INFER):
-      inference[eval_metrics.INFERENCE_PROB_NAME] = (
-          graph_builder.inference_graph(features))
-
-      if params.regression:
-        predictions = {
-            None: inference[eval_metrics.INFERENCE_PROB_NAME]}
-        output_alternatives = {
-            None: (constants.ProblemType.LINEAR_REGRESSION, predictions)}
-      else:
-        inference[eval_metrics.INFERENCE_PRED_NAME] = math_ops.argmax(
-            inference[eval_metrics.INFERENCE_PROB_NAME], 1)
-
-        predictions = {
-            prediction_key.PredictionKey.PROBABILITIES:
-                inference[eval_metrics.INFERENCE_PROB_NAME],
-            prediction_key.PredictionKey.CLASSES:
-                inference[eval_metrics.INFERENCE_PRED_NAME]}
-        output_alternatives = {
-            None: (constants.ProblemType.CLASSIFICATION, predictions)}
-
-      if keys is not None:
-        inference[keys_name] = keys
+
+    logits = graph_builder.inference_graph(features)
+
+    summary.scalar('average_tree_size', graph_builder.average_size())
+    # For binary classification problems, convert probabilities to logits.
+    # Includes hack to get around the fact that a probability might be 0 or 1.
+    if not params.regression and params.num_classes == 2:
+      class_1_probs = array_ops.slice(logits, [0, 1], [-1, 1])
+      logits = math_ops.log(
+          math_ops.maximum(class_1_probs / math_ops.maximum(
+              1.0 - class_1_probs, EPSILON), EPSILON))
 
     # labels might be None if we're doing prediction (which brings up the
     # question of why we force everything to adhere to a single model_fn).
-    loss_deps = []
     training_graph = None
     training_hooks = []
-    scaffold = None
     if labels is not None and mode == model_fn_lib.ModeKeys.TRAIN:
-      training_graph = control_flow_ops.group(
-          graph_builder.training_graph(
-              features, labels, input_weights=weights,
-              num_trainers=num_trainers,
-              trainer_id=trainer_id),
-          state_ops.assign_add(contrib_framework.get_global_step(), 1))
-      loss_deps.append(training_graph)
-      if hasattr(graph_builder, 'finalize_training'):
-        finalize_listener = EveryCheckpointPreSaveListener(
-            graph_builder.finalize_training())
-        scaffold = monitored_session.Scaffold()
-        training_hooks.append(
-            basic_session_run_hooks.CheckpointSaverHook(
-                model_dir, save_secs=600, save_steps=None,
-                scaffold=scaffold,
-                listeners=[finalize_listener]))
-
-    training_loss = None
-    if (mode == model_fn_lib.ModeKeys.EVAL or
-        mode == model_fn_lib.ModeKeys.TRAIN):
-      with ops.control_dependencies(loss_deps):
-        training_loss = graph_builder.training_loss(
-            features, labels, name=LOSS_NAME)
+      with ops.control_dependencies([logits.op]):
+        training_graph = control_flow_ops.group(
+            graph_builder.training_graph(
+                features, labels, input_weights=weights,
+                num_trainers=num_trainers,
+                trainer_id=trainer_id),
+            state_ops.assign_add(contrib_framework.get_global_step(), 1))
 
     # Put weights back in
     if weights is not None:
       features[weights_name] = weights
 
-    if early_stopping_rounds:
-      training_hooks.append(TensorForestLossHook(early_stopping_rounds))
+    # TensorForest's training graph isn't calculated directly from the loss
+    # like many other models.
+    def _train_fn(unused_loss):
+      return training_graph
+
+    model_ops = model_head.create_model_fn_ops(
+        features=features,
+        labels=labels,
+        mode=mode,
+        train_op_fn=_train_fn,
+        logits=logits,
+        scope=head_scope)
+
+    # Ops are run in lexigraphical order of their keys. Run the resource
+    # clean-up op last.
+    all_handles = graph_builder.get_all_resource_handles()
+    ops_at_end = {
+        '9: clean up resources': control_flow_ops.group(
+            *[resource_variable_ops.destroy_resource_op(handle)
+              for handle in all_handles])}
 
     if report_feature_importances:
-      training_hooks.append(TensorForestRunOpAtEndHook(
-          {'feature_importances': graph_builder.feature_importances()}))
+      ops_at_end['1: feature_importances'] = (
+          graph_builder.feature_importances())
 
-    return model_fn_lib.ModelFnOps(
-        mode=mode,
-        predictions=inference,
-        loss=training_loss,
-        train_op=training_graph,
-        training_hooks=training_hooks,
-        scaffold=scaffold,
-        output_alternatives=output_alternatives)
+    training_hooks.append(TensorForestRunOpAtEndHook(ops_at_end))
+
+    if early_stopping_rounds:
+      training_hooks.append(
+          TensorForestLossHook(
+              early_stopping_rounds,
+              early_stopping_loss_threshold=early_stopping_loss_threshold,
+              loss_op=model_ops.loss))
+
+    model_ops.training_hooks.extend(training_hooks)
+
+    if keys is not None:
+      model_ops.predictions[keys_name] = keys
+
+    return model_ops
 
   return _model_fn
 
@@ -270,14 +298,23 @@ class TensorForestEstimator(estimator.Estimator):
   ```
   """
 
-  def __init__(self, params, device_assigner=None, model_dir=None,
+  def __init__(self,
+               params,
+               device_assigner=None,
+               model_dir=None,
                graph_builder_class=tensor_forest.RandomForestGraphs,
-               config=None, weights_name=None, keys_name=None,
+               config=None,
+               weights_name=None,
+               keys_name=None,
                feature_engineering_fn=None,
                early_stopping_rounds=100,
-               num_trainers=1, trainer_id=0,
+               early_stopping_loss_threshold=0.001,
+               num_trainers=1,
+               trainer_id=0,
                report_feature_importances=False,
-               local_eval=False):
+               local_eval=False,
+               version=None,
+               head=None):
     """Initializes a TensorForestEstimator instance.
 
     Args:
@@ -291,7 +328,8 @@ class TensorForestEstimator(estimator.Estimator):
         directory into an estimator.
       graph_builder_class: An `object` instance that defines how TF graphs for
         random forest training and inference are built. By default will use
-        `tensor_forest.RandomForestGraphs`.
+        `tensor_forest.RandomForestGraphs`. Can be overridden by version
+        kwarg.
       config: `RunConfig` object to configure the runtime settings.
       weights_name: A string defining feature column name representing
         weights. Will be multiplied by the loss of the example. Used to
@@ -305,6 +343,9 @@ class TensorForestEstimator(estimator.Estimator):
       early_stopping_rounds: Allows training to terminate early if the forest is
         no longer growing. 100 by default.  Set to a Falsy value to disable
         the default training hook.
+      early_stopping_loss_threshold: Percentage (as fraction) that loss must
+        improve by within early_stopping_rounds steps, otherwise training will
+        terminate.
       num_trainers: Number of training jobs, which will partition trees
         among them.
       trainer_id: Which trainer this instance is.
@@ -313,6 +354,9 @@ class TensorForestEstimator(estimator.Estimator):
       local_eval: If True, don't use a device assigner for eval. This is to
         support some common setups where eval is done on a single machine, even
         though training might be distributed.
+      version: Unused.
+      head: A heads_lib.Head object that calculates losses and such. If None,
+        one will be automatically created based on params.
 
     Returns:
       A `TensorForestEstimator` instance.
@@ -322,15 +366,167 @@ class TensorForestEstimator(estimator.Estimator):
             params.fill(),
             graph_builder_class,
             device_assigner,
+            model_head=head,
             weights_name=weights_name,
             keys_name=keys_name,
             early_stopping_rounds=early_stopping_rounds,
+            early_stopping_loss_threshold=early_stopping_loss_threshold,
             num_trainers=num_trainers,
             trainer_id=trainer_id,
             report_feature_importances=report_feature_importances,
-            model_dir=model_dir,
             local_eval=local_eval),
         model_dir=model_dir,
         config=config,
         feature_engineering_fn=feature_engineering_fn)
 
+
+def get_combined_model_fn(model_fns):
+  """Get a combined model function given a list of other model fns.
+
+  The model function returned will call the individual model functions and
+  combine them appropriately.  For:
+
+  training ops: tf.group them.
+  loss: average them.
+  predictions: concat probabilities such that predictions[*][0-C1] are the
+    probablities for output 1 (where C1 is the number of classes in output 1),
+    predictions[*][C1-(C1+C2)] are the probabilities for output 2 (where C2
+    is the number of classes in output 2), etc.  Also stack predictions such
+    that predictions[i][j] is the class prediction for example i and output j.
+
+  This assumes that labels are 2-dimensional, with labels[i][j] being the
+  label for example i and output j, where forest j is trained using only
+  output j.
+
+  Args:
+    model_fns: A list of model functions obtained from get_model_fn.
+
+  Returns:
+    A ModelFnOps instance.
+  """
+  def _model_fn(features, labels, mode):
+    """Function that returns predictions, training loss, and training op."""
+    model_fn_ops = []
+    for i in range(len(model_fns)):
+      with variable_scope.variable_scope('label_{0}'.format(i)):
+        sliced_labels = array_ops.slice(labels, [0, i], [-1, 1])
+        model_fn_ops.append(
+            model_fns[i](features, sliced_labels, mode))
+    training_hooks = []
+    for mops in model_fn_ops:
+      training_hooks += mops.training_hooks
+    predictions = {}
+    if (mode == model_fn_lib.ModeKeys.EVAL or
+        mode == model_fn_lib.ModeKeys.INFER):
+      # Flatten the probabilities into one dimension.
+      predictions[eval_metrics.INFERENCE_PROB_NAME] = array_ops.concat(
+          [mops.predictions[eval_metrics.INFERENCE_PROB_NAME]
+           for mops in model_fn_ops], axis=1)
+      predictions[eval_metrics.INFERENCE_PRED_NAME] = array_ops.stack(
+          [mops.predictions[eval_metrics.INFERENCE_PRED_NAME]
+           for mops in model_fn_ops], axis=1)
+    loss = None
+    if (mode == model_fn_lib.ModeKeys.EVAL or
+        mode == model_fn_lib.ModeKeys.TRAIN):
+      loss = math_ops.reduce_sum(
+          array_ops.stack(
+              [mops.loss for mops in model_fn_ops])) / len(model_fn_ops)
+
+    train_op = None
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      train_op = control_flow_ops.group(
+          *[mops.train_op for mops in model_fn_ops])
+    return model_fn_lib.ModelFnOps(
+        mode=mode,
+        predictions=predictions,
+        loss=loss,
+        train_op=train_op,
+        training_hooks=training_hooks,
+        scaffold=None,
+        output_alternatives=None)
+
+  return _model_fn
+
+
+class MultiForestMultiHeadEstimator(estimator.Estimator):
+  """An estimator that can train a forest for a multi-headed problems.
+
+  This class essentially trains separate forests (each with their own
+  ForestHParams) for each output.
+
+  For multi-headed regression, a single-headed TensorForestEstimator can
+  be used to train a single model that predicts all outputs.  This class can
+  be used to train separate forests for each output.
+  """
+
+  def __init__(self, params_list, device_assigner=None, model_dir=None,
+               graph_builder_class=tensor_forest.RandomForestGraphs,
+               config=None, weights_name=None, keys_name=None,
+               feature_engineering_fn=None,
+               early_stopping_rounds=100,
+               num_trainers=1, trainer_id=0,
+               report_feature_importances=False,
+               local_eval=False):
+    """Initializes a TensorForestEstimator instance.
+
+    Args:
+      params_list: A list of ForestHParams objects for each head, given in order
+        of outputs in the label tensor to be trained on.
+      device_assigner: An `object` instance that controls how trees get
+        assigned to devices. If `None`, will use
+        `tensor_forest.RandomForestDeviceAssigner`.
+      model_dir: Directory to save model parameters, graph, etc. To continue
+        training a previously saved model, load checkpoints saved to this
+        directory into an estimator.
+      graph_builder_class: An `object` instance that defines how TF graphs for
+        random forest training and inference are built. By default will use
+        `tensor_forest.RandomForestGraphs`.
+      config: `RunConfig` object to configure the runtime settings.
+      weights_name: A string defining feature column name representing
+        weights. Will be multiplied by the loss of the example. Used to
+        downweight or boost examples during training.
+      keys_name: A string naming one of the features to strip out and
+        pass through into the inference/eval results dict.  Useful for
+        associating specific examples with their prediction.
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      early_stopping_rounds: Allows training to terminate early if the forest is
+        no longer growing. 100 by default.  Set to a Falsy value to disable
+        the default training hook.
+      num_trainers: Number of training jobs, which will partition trees
+        among them.
+      trainer_id: Which trainer this instance is.
+      report_feature_importances: If True, print out feature importances
+        during evaluation.
+      local_eval: If True, don't use a device assigner for eval. This is to
+        support some common setups where eval is done on a single machine, even
+        though training might be distributed.
+
+    Returns:
+      A `TensorForestEstimator` instance.
+    """
+    model_fns = []
+    for i in range(len(params_list)):
+      params = params_list[i].fill()
+      model_fns.append(
+          get_model_fn(
+              params,
+              graph_builder_class,
+              device_assigner,
+              model_head=get_default_head(
+                  params, weights_name, name='head{0}'.format(i)),
+              weights_name=weights_name,
+              keys_name=keys_name,
+              early_stopping_rounds=early_stopping_rounds,
+              num_trainers=num_trainers,
+              trainer_id=trainer_id,
+              report_feature_importances=report_feature_importances,
+              local_eval=local_eval,
+              head_scope='output{0}'.format(i)))
+
+    super(MultiForestMultiHeadEstimator, self).__init__(
+        model_fn=get_combined_model_fn(model_fns),
+        model_dir=model_dir,
+        config=config,
+        feature_engineering_fn=feature_engineering_fn)
diff --git a/tensorflow/contrib/tensor_forest/client/random_forest_test.py b/tensorflow/contrib/tensor_forest/client/random_forest_test.py
index e78c772af3e2805b1228e63a3ea481adc2c25a88..52e41a6fe8f8050845447fdeba057de571888478 100644
--- a/tensorflow/contrib/tensor_forest/client/random_forest_test.py
+++ b/tensorflow/contrib/tensor_forest/client/random_forest_test.py
@@ -40,26 +40,11 @@ class TensorForestTrainerTests(test.TestCase):
 
     iris = base.load_iris()
     data = iris.data.astype(np.float32)
-    labels = iris.target.astype(np.float32)
+    labels = iris.target.astype(np.int32)
 
     classifier.fit(x=data, y=labels, steps=100, batch_size=50)
     classifier.evaluate(x=data, y=labels, steps=10)
 
-  def testClassificationTrainingLoss(self):
-    """Tests multi-class classification using matrix data as input."""
-    hparams = tensor_forest.ForestHParams(
-        num_trees=3, max_nodes=1000, num_classes=3, num_features=4)
-    classifier = random_forest.TensorForestEstimator(
-        hparams, graph_builder_class=(tensor_forest.TrainingLossForest))
-
-    iris = base.load_iris()
-    data = iris.data.astype(np.float32)
-    labels = iris.target.astype(np.float32)
-
-    monitors = [random_forest.TensorForestLossHook(10)]
-    classifier.fit(x=data, y=labels, steps=100, monitors=monitors)
-    classifier.evaluate(x=data, y=labels, steps=10)
-
   def testRegression(self):
     """Tests multi-class classification using matrix data as input."""
 
@@ -75,7 +60,7 @@ class TensorForestTrainerTests(test.TestCase):
 
     boston = base.load_boston()
     data = boston.data.astype(np.float32)
-    labels = boston.target.astype(np.float32)
+    labels = boston.target.astype(np.int32)
 
     regressor.fit(x=data, y=labels, steps=100, batch_size=50)
     regressor.evaluate(x=data, y=labels, steps=10)
diff --git a/tensorflow/contrib/tensor_forest/hybrid/BUILD b/tensorflow/contrib/tensor_forest/hybrid/BUILD
index 41a815bdec3e4a106276eaee5856db97488b5e18..13b9749756d60e2a8ecc5e4cbfd3d3a60c496552 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/BUILD
+++ b/tensorflow/contrib/tensor_forest/hybrid/BUILD
@@ -2,17 +2,15 @@
 
 licenses(["notice"])  # Apache 2.0
 
-exports_files(["LICENSE"])
-
-package(default_visibility = [
-    "//visibility:public",
-])
-
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
+package(default_visibility = ["//visibility:public"])
+
+exports_files(["LICENSE"])
+
 filegroup(
     name = "all_files",
     srcs = glob(
@@ -28,20 +26,14 @@ filegroup(
 filegroup(
     name = "custom_op_sources",
     srcs = glob(
-        [
-            "core/ops/*.cc",
-        ],
+        ["core/ops/*.cc"],
         exclude = ["core/ops/*_test.cc"],
     ),
 )
 
 filegroup(
     name = "custom_op_headers",
-    srcs = glob(
-        [
-            "core/ops/*.h",
-        ],
-    ),
+    srcs = glob(["core/ops/*.h"]),
 )
 
 cc_library(
@@ -92,13 +84,11 @@ tf_custom_op_library(
 cc_library(
     name = "utils",
     srcs = ["core/ops/utils.cc"],
-    hdrs = [
-        "core/ops/utils.h",
-    ],
+    hdrs = ["core/ops/utils.h"],
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//third_party/eigen3",
-        "@protobuf//:protobuf_headers",
+        "@protobuf_archive//:protobuf_headers",
     ],
 )
 
@@ -108,9 +98,7 @@ tf_custom_op_py_library(
         "__init__.py",
         "python/ops/training_ops.py",
     ],
-    dso = [
-        "python/ops/_training_ops.so",
-    ],
+    dso = ["python/ops/_training_ops.so"],
     kernels = [
         ":all_kernels",
     ],
@@ -139,22 +127,16 @@ py_library(
         "python/hybrid_layer.py",
     ],
     srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/tensor_forest:tensor_forest_py",
-    ],
+    deps = ["//tensorflow/contrib/framework:framework_py"],
 )
 
 py_test(
     name = "hybrid_layer_test",
     size = "small",
-    srcs = [
-        "python/hybrid_layer_test.py",
-    ],
+    srcs = ["python/hybrid_layer_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":fully_connected_layer",
-        ":hybrid_layer",
         ":hybrid_model",
         "//tensorflow/contrib/tensor_forest:tensor_forest_py",
         "//tensorflow/python:framework_test_lib",
@@ -164,14 +146,11 @@ py_test(
 
 py_library(
     name = "hybrid_model",
-    srcs = [
-        "python/hybrid_model.py",
-    ],
+    srcs = ["python/hybrid_model.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow:tensorflow_py",
+        "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/contrib/tensor_forest:tensor_forest_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
@@ -182,13 +161,10 @@ py_library(
 
 py_library(
     name = "fully_connected_layer",
-    srcs = [
-        "python/layers/fully_connected.py",
-    ],
+    srcs = ["python/layers/fully_connected.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":hybrid_layer",
-        "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
@@ -219,7 +195,6 @@ py_test(
     deps = [
         ":ops_lib",
         ":training_ops",
-        "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/tensor_forest:tensor_forest_py",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -228,15 +203,12 @@ py_test(
 
 py_library(
     name = "decisions_to_data_layer",
-    srcs = [
-        "python/layers/decisions_to_data.py",
-    ],
+    srcs = ["python/layers/decisions_to_data.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":hybrid_layer",
         ":ops_lib",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/tensor_forest:tensor_forest_py",
+        ":training_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:init_ops",
@@ -246,13 +218,10 @@ py_library(
 
 py_test(
     name = "decisions_to_data_test",
-    srcs = [
-        "python/layers/decisions_to_data_test.py",
-    ],
+    srcs = ["python/layers/decisions_to_data_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":decisions_to_data_layer",
-        "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/tensor_forest:tensor_forest_py",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
@@ -263,35 +232,25 @@ py_test(
 
 py_library(
     name = "decisions_to_data_then_nn",
-    srcs = [
-        "python/models/decisions_to_data_then_nn.py",
-    ],
+    srcs = ["python/models/decisions_to_data_then_nn.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":decisions_to_data_layer",
         ":fully_connected_layer",
         ":hybrid_model",
-        ":ops_lib",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/tensor_forest:tensor_forest_py",
         "//tensorflow/python:training",
     ],
 )
 
 py_library(
     name = "hard_decisions_to_data_then_nn",
-    srcs = [
-        "python/models/hard_decisions_to_data_then_nn.py",
-    ],
+    srcs = ["python/models/hard_decisions_to_data_then_nn.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":decisions_to_data_layer",
         ":fully_connected_layer",
         ":hybrid_model",
-        ":ops_lib",
-        "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/contrib/tensor_forest:tensor_forest_py",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:training",
     ],
@@ -304,7 +263,6 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":decisions_to_data_then_nn",
-        "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/tensor_forest:tensor_forest_py",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
@@ -315,17 +273,12 @@ py_test(
 
 py_library(
     name = "k_feature_decisions_to_data_then_nn",
-    srcs = [
-        "python/models/k_feature_decisions_to_data_then_nn.py",
-    ],
+    srcs = ["python/models/k_feature_decisions_to_data_then_nn.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":decisions_to_data_layer",
         ":fully_connected_layer",
         ":hybrid_model",
-        ":ops_lib",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/tensor_forest:tensor_forest_py",
         "//tensorflow/python:training",
     ],
 )
@@ -337,7 +290,6 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":k_feature_decisions_to_data_then_nn",
-        "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/tensor_forest:tensor_forest_py",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
@@ -348,17 +300,12 @@ py_test(
 
 py_library(
     name = "forest_to_data_then_nn",
-    srcs = [
-        "python/models/forest_to_data_then_nn.py",
-    ],
+    srcs = ["python/models/forest_to_data_then_nn.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":decisions_to_data_layer",
         ":fully_connected_layer",
         ":hybrid_model",
-        ":ops_lib",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/tensor_forest:tensor_forest_py",
         "//tensorflow/python:training",
     ],
 )
@@ -370,7 +317,6 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":forest_to_data_then_nn",
-        "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/tensor_forest:tensor_forest_py",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
@@ -381,52 +327,36 @@ py_test(
 
 py_library(
     name = "nn",
-    srcs = [
-        "python/models/nn.py",
-    ],
+    srcs = ["python/models/nn.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":fully_connected_layer",
         ":hybrid_model",
-        ":ops_lib",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/tensor_forest:tensor_forest_py",
         "//tensorflow/python:training",
     ],
 )
 
 py_library(
     name = "stochastic_hard_decisions_to_data_then_nn",
-    srcs = [
-        "python/models/stochastic_hard_decisions_to_data_then_nn.py",
-    ],
+    srcs = ["python/models/stochastic_hard_decisions_to_data_then_nn.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":decisions_to_data_layer",
         ":fully_connected_layer",
         ":hard_decisions_to_data_then_nn",
-        ":hybrid_model",
-        ":ops_lib",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/tensor_forest:tensor_forest_py",
         "//tensorflow/python:training",
     ],
 )
 
 py_library(
     name = "stochastic_soft_decisions_to_data_then_nn",
-    srcs = [
-        "python/models/stochastic_soft_decisions_to_data_then_nn.py",
-    ],
+    srcs = ["python/models/stochastic_soft_decisions_to_data_then_nn.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":decisions_to_data_layer",
         ":fully_connected_layer",
         ":hard_decisions_to_data_then_nn",
-        ":hybrid_model",
-        ":ops_lib",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/tensor_forest:tensor_forest_py",
+        "//tensorflow/python:training",
     ],
 )
 
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc
index 555674ca69e63628ab7c037387df33f48559b405..9d5e1400a58cce75c03dfe3e0b5c973c11b89199 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc
@@ -52,7 +52,7 @@ REGISTER_OP("UnpackPath")
       auto tree_depth = c->Dim(params, 1);
       int64 num_nodes = InferenceContext::kUnknownDim;
       if (c->ValueKnown(tree_depth)) {
-        num_nodes = (1 << c->Value(tree_depth)) - 1;
+        num_nodes = (static_cast<int64>(1) << c->Value(tree_depth)) - 1;
       }
 
       c->set_output(0, c->Matrix(num_points, num_nodes));
diff --git a/tensorflow/contrib/tensor_forest/kernels/best_splits_op.cc b/tensorflow/contrib/tensor_forest/kernels/best_splits_op.cc
deleted file mode 100644
index b56185e99eb22e41b9037545c02e7f3d8b0902b0..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensor_forest/kernels/best_splits_op.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-// BestSplits returns the index of the best candidate for each finished node.
-// This decision is based on the Gini score of the pcw_candidate_split counts,
-// and the right-branch-taken counts inferred from pcw_total_splits.
-#include <functional>
-
-#include "tensorflow/contrib/tensor_forest/kernels/tree_utils.h"
-
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/shape_inference.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-
-namespace tensorflow {
-
-using std::placeholders::_1;
-using tensorforest::BestFeatureClassification;
-using tensorforest::BestFeatureRegression;
-using tensorforest::CheckTensorBounds;
-
-
-class BestSplits : public OpKernel {
- public:
-  explicit BestSplits(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr(
-        "regression", &regression_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& finished = context->input(0);
-    const Tensor& node_to_accumulator = context->input(1);
-    const Tensor& split_sums = context->input(2);
-    const Tensor& split_squares = context->input(3);
-    const Tensor& accumulator_sums = context->input(4);
-    const Tensor& accumulator_squares = context->input(5);
-
-    OP_REQUIRES(context, finished.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "finished should be one-dimensional"));
-    OP_REQUIRES(context, node_to_accumulator.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "node_to_accumulator should be one-dimensional"));
-
-    OP_REQUIRES(context, split_sums.shape().dims() == 3,
-                errors::InvalidArgument(
-                    "split_sums should be three-dimensional"));
-    OP_REQUIRES(context, accumulator_sums.shape().dims() == 2,
-                errors::InvalidArgument(
-                    "accumulator_sums should be two-dimensional"));
-
-    if (regression_) {
-      OP_REQUIRES(context,
-                  split_sums.shape() == split_squares.shape(),
-                  errors::InvalidArgument(
-                      "split_sums and split_squares should "
-                      "be the same shape."));
-      OP_REQUIRES(context,
-                  accumulator_sums.shape() == accumulator_squares.shape(),
-                  errors::InvalidArgument(
-                      "accumulator_sums and accumulator_squares should "
-                      "be the same shape."));
-    }
-
-    OP_REQUIRES(
-        context,
-        accumulator_sums.shape().dim_size(0) ==
-        split_sums.shape().dim_size(0),
-        errors::InvalidArgument(
-            "Number of accumulators should be the same in split_sums "
-            "and accumulator_sums."));
-
-    // Check tensor bounds.
-    if (!CheckTensorBounds(context, finished)) return;
-    if (!CheckTensorBounds(context, node_to_accumulator)) return;
-    if (!CheckTensorBounds(context, split_sums)) return;
-    if (!CheckTensorBounds(context, split_squares)) return;
-    if (!CheckTensorBounds(context, accumulator_sums)) return;
-    if (!CheckTensorBounds(context, accumulator_squares)) return;
-
-    Tensor* output_splits = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, finished.shape(),
-                                            &output_splits));
-    auto best_splits = output_splits->unaligned_flat<int32>();
-
-    const auto finished_vec = finished.unaligned_flat<int32>();
-    const auto node_map = node_to_accumulator.unaligned_flat<int32>();
-
-    const int32 num_finished = static_cast<int32>(finished.shape().dim_size(0));
-
-    std::function<int32(int32)> best_feature_func =
-        std::bind(BestFeatureClassification, accumulator_sums, split_sums, _1);
-    if (regression_) {
-       best_feature_func = std::bind(
-           BestFeatureRegression, accumulator_sums, accumulator_squares,
-           split_sums, split_squares, _1);
-    }
-
-    for (int32 i = 0; i < num_finished; i++) {
-      const int32 node = internal::SubtleMustCopy(finished_vec(i));
-      OP_REQUIRES(
-          context, FastBoundsCheck(node, node_map.size()),
-          errors::InvalidArgument("finished node is outside the valid range"));
-
-      const int32 accumulator = internal::SubtleMustCopy(node_map(node));
-      if (accumulator < 0) {
-        LOG(ERROR) << "Something has gone wrong, we got a finished node that "
-                   << "doesn't have an accumulator allocated to it.";
-        continue;
-      }
-
-      best_splits(i) = best_feature_func(accumulator);
-    }
-  }
-
- private:
-  bool regression_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("BestSplits").Device(DEVICE_CPU), BestSplits);
-
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/count_extremely_random_stats_op.cc b/tensorflow/contrib/tensor_forest/kernels/count_extremely_random_stats_op.cc
deleted file mode 100644
index bf8d28c6e05e2cc4f02c38925e80e99d7dc0aaac..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensor_forest/kernels/count_extremely_random_stats_op.cc
+++ /dev/null
@@ -1,700 +0,0 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-// CountExtremelyRandomStats outputs count-deltas that should be added to
-// the node pcws, candidate split pcws, and total split pcws.  It also outputs
-// the leaves that each input arrived to for use in SampleInputs.  This is the
-// only op that involves tree traversal, and is constructed so that it can
-// be run in parallel on separate batches of data.
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "tensorflow/contrib/tensor_forest/kernels/data_spec.h"
-#include "tensorflow/contrib/tensor_forest/kernels/tree_utils.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/shape_inference.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/lib/gtl/map_util.h"
-#include "tensorflow/core/util/work_sharder.h"
-
-namespace tensorflow {
-
-using std::get;
-using std::make_pair;
-using std::make_tuple;
-using std::tuple;
-
-using tensorforest::CHILDREN_INDEX;
-using tensorforest::FEATURE_INDEX;
-using tensorforest::LEAF_NODE;
-using tensorforest::FREE_NODE;
-
-using tensorforest::CheckTensorBounds;
-using tensorforest::DecideNode;
-using tensorforest::TensorForestDataSpec;
-using tensorforest::Initialize;
-using tensorforest::IsAllInitialized;
-
-// A data structure to store the results of parallel tree traversal.
-struct InputDataResult {
-  // A list of each node that was visited.
-  std::vector<int32> node_indices;
-  // The accumulator of the leaf that a data point ended up at, or -1 if none.
-  int32 leaf_accumulator;
-  // The left-branch taken candidate splits.
-  std::vector<int32> split_adds;
-  // If the candidate splits for the leaf that a data point arrived at
-  // were initialized or not, which determines if we add this to total
-  // pcw counts or not.
-  bool splits_initialized;
-};
-
-
-struct EvaluateParams {
-  TensorForestDataSpec input_spec;
-  Tensor dense_input;
-  Tensor sparse_indices;
-  Tensor sparse_values;
-  Tensor input_labels;
-  Tensor tree_tensor;
-  Tensor tree_thresholds;
-  Tensor node_to_accumulator;
-  Tensor candidate_split_features;
-  Tensor candidate_split_thresholds;
-  InputDataResult* results;
-};
-
-void Evaluate(const EvaluateParams& params, int32 start, int32 end) {
-  const auto tree = params.tree_tensor.tensor<int32, 2>();
-  const auto thresholds = params.tree_thresholds.unaligned_flat<float>();
-  const auto node_map = params.node_to_accumulator.unaligned_flat<int32>();
-  const auto split_features =
-      params.candidate_split_features.tensor<int32, 2>();
-  const auto split_thresholds =
-      params.candidate_split_thresholds.tensor<float, 2>();
-
-  const int32 num_splits = static_cast<int32>(
-      params.candidate_split_features.shape().dim_size(1));
-  const int32 num_nodes = static_cast<int32>(
-      params.tree_tensor.shape().dim_size(0));
-  const int32 num_accumulators = static_cast<int32>(
-      params.candidate_split_features.shape().dim_size(0));
-
-  // Lambdas to capture the eigen-tensors so we don't the conversion overhead
-  // on each call to DecideNode.
-  const auto get_dense = tensorforest::GetDenseFunctor(params.dense_input);
-  const auto get_sparse = tensorforest::GetSparseFunctor(params.sparse_indices,
-                                                         params.sparse_values);
-
-  for (int32 i = start; i < end; ++i) {
-    int node_index = 0;
-    params.results[i].splits_initialized = false;
-    while (true) {
-      params.results[i].node_indices.push_back(node_index);
-      CHECK_LT(node_index, num_nodes);
-      int32 left_child =
-          internal::SubtleMustCopy(tree(node_index, CHILDREN_INDEX));
-      if (left_child == LEAF_NODE) {
-        const int32 accumulator =
-            internal::SubtleMustCopy(node_map(node_index));
-        params.results[i].leaf_accumulator = accumulator;
-        // If the leaf is not fertile or is not yet initialized, we don't
-        // count it in the candidate/total split per-class-weights because
-        // it won't have any candidate splits yet.
-        if (accumulator >= 0 &&
-            IsAllInitialized(split_features, accumulator, num_splits)) {
-          CHECK_LT(accumulator, num_accumulators);
-          params.results[i].splits_initialized = true;
-          for (int split = 0; split < num_splits; split++) {
-            const int32 feature = split_features(accumulator, split);
-            if (!DecideNode(get_dense, get_sparse, i, feature,
-                            split_thresholds(accumulator, split),
-                            params.input_spec)) {
-              params.results[i].split_adds.push_back(split);
-            }
-          }
-        }
-        break;
-      } else if (left_child == FREE_NODE) {
-        LOG(ERROR) << "Reached a free node, not good.";
-        params.results[i].node_indices.push_back(FREE_NODE);
-        break;
-      }
-      const int32 feature = tree(node_index, FEATURE_INDEX);
-      node_index =
-          left_child + DecideNode(get_dense, get_sparse, i, feature,
-                                  thresholds(node_index), params.input_spec);
-    }
-  }
-}
-
-class CountExtremelyRandomStats : public OpKernel {
- public:
-  explicit CountExtremelyRandomStats(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr(
-        "num_classes", &num_classes_));
-    OP_REQUIRES_OK(context, context->GetAttr(
-        "regression", &regression_));
-    string serialized_proto;
-    OP_REQUIRES_OK(context, context->GetAttr("input_spec", &serialized_proto));
-    input_spec_.ParseFromString(serialized_proto);
-  }
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& input_data = context->input(0);
-    const Tensor& sparse_input_indices = context->input(1);
-    const Tensor& sparse_input_values = context->input(2);
-    const Tensor& sparse_input_shape = context->input(3);
-    const Tensor& input_labels = context->input(4);
-    const Tensor& input_weights = context->input(5);
-    const Tensor& tree_tensor = context->input(6);
-    const Tensor& tree_thresholds = context->input(7);
-    const Tensor& node_to_accumulator = context->input(8);
-    const Tensor& candidate_split_features = context->input(9);
-    const Tensor& candidate_split_thresholds = context->input(10);
-    const Tensor& birth_epochs = context->input(11);
-    const Tensor& current_epoch = context->input(12);
-
-    bool sparse_input = (sparse_input_indices.shape().dims() == 2);
-    bool have_weights = (input_weights.shape().dim_size(0) > 0);
-    int32 num_data = -1;
-
-    // Check inputs.
-    if (sparse_input) {
-      const auto sparse_shape = sparse_input_shape.unaligned_flat<int64>();
-      // TODO(gilberth): This is because we can't figure out the shape
-      // of a sparse tensor at graph-build time, even if the dimension is
-      // actually known.
-      input_spec_.mutable_sparse(0)->set_size(sparse_shape(1));
-      num_data = sparse_shape(0);
-
-      OP_REQUIRES(context, sparse_input_shape.shape().dims() == 1,
-                  errors::InvalidArgument(
-                      "sparse_input_shape should be one-dimensional"));
-      OP_REQUIRES(context,
-                  sparse_input_shape.shape().dim_size(0) == 2,
-                  errors::InvalidArgument(
-                      "The sparse input data should be two-dimensional"));
-      OP_REQUIRES(context, sparse_input_values.shape().dims() == 1,
-                  errors::InvalidArgument(
-                      "sparse_input_values should be one-dimensional"));
-      OP_REQUIRES(context, sparse_input_indices.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "The sparse input data should be two-dimensional"));
-      OP_REQUIRES(context,
-                  sparse_input_indices.shape().dim_size(0) ==
-                  sparse_input_values.shape().dim_size(0),
-                  errors::InvalidArgument(
-                      "sparse_input_indices and sparse_input_values should "
-                      "agree on the number of non-zero values"));
-    }
-
-    if (input_data.shape().dim_size(0) > 0) {
-      const int32 dense_num_data =
-          static_cast<int32>(input_data.shape().dim_size(0));
-      if (num_data > 0) {
-        CHECK_EQ(num_data, dense_num_data)
-            << "number of examples must match for sparse + dense input.";
-      }
-      num_data = dense_num_data;
-
-      OP_REQUIRES(context, input_data.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "input_data should be two-dimensional"));
-      OP_REQUIRES(
-          context,
-          input_data.shape().dim_size(0) == input_labels.shape().dim_size(0),
-          errors::InvalidArgument(
-              "Number of inputs should be the same in "
-              "input_data and input_labels."));
-    }
-
-    if (have_weights) {
-      OP_REQUIRES(
-          context,
-          input_weights.shape().dim_size(0) == input_labels.shape().dim_size(0),
-          errors::InvalidArgument(
-              "Number of inputs should be the same in input_weights and "
-              "input_labels."));
-    }
-
-    OP_REQUIRES(context, input_labels.shape().dims() >= 1,
-                errors::InvalidArgument(
-                    "input_labels should be at least one-dimensional"));
-    OP_REQUIRES(context, tree_tensor.shape().dims() == 2,
-            errors::InvalidArgument(
-                "tree should be two-dimensional"));
-    OP_REQUIRES(context, tree_thresholds.shape().dims() == 1,
-            errors::InvalidArgument(
-                "tree_thresholds should be one-dimensional"));
-    OP_REQUIRES(context, node_to_accumulator.shape().dims() == 1,
-            errors::InvalidArgument(
-                "node_to_accumulator should be one-dimensional"));
-    OP_REQUIRES(context, candidate_split_features.shape().dims() == 2,
-            errors::InvalidArgument(
-                "candidate_split_features should be two-dimensional"));
-    OP_REQUIRES(context, candidate_split_thresholds.shape().dims() == 2,
-            errors::InvalidArgument(
-                "candidate_split_thresholds should be two-dimensional"));
-    OP_REQUIRES(context, birth_epochs.shape().dims() == 1,
-            errors::InvalidArgument(
-                "birth_epochs should be one-dimensional"));
-    OP_REQUIRES(context, current_epoch.shape().dims() == 1,
-            errors::InvalidArgument(
-                "current_epoch should be one-dimensional"));
-
-    OP_REQUIRES(
-        context,
-        tree_tensor.shape().dim_size(0) ==
-        tree_thresholds.shape().dim_size(0) &&
-        tree_tensor.shape().dim_size(0) ==
-        node_to_accumulator.shape().dim_size(0) &&
-        tree_tensor.shape().dim_size(0) ==
-        birth_epochs.shape().dim_size(0),
-        errors::InvalidArgument(
-            "Number of nodes should be the same in "
-            "tree, tree_thresholds, node_to_accumulator, and birth_epoch."));
-    OP_REQUIRES(
-        context,
-        candidate_split_features.shape() == candidate_split_thresholds.shape(),
-        errors::InvalidArgument(
-            "candidate_split_features and candidate_split_thresholds should be "
-            "the same shape."));
-    OP_REQUIRES(
-        context,
-        current_epoch.shape().dim_size(0) == 1,
-        errors::InvalidArgument(
-            "The current_epoch should be a tensor of shape (1)."));
-
-    // Check tensor bounds.
-    if (!CheckTensorBounds(context, input_data)) return;
-    if (!CheckTensorBounds(context, sparse_input_indices)) return;
-    if (!CheckTensorBounds(context, sparse_input_values)) return;
-    if (!CheckTensorBounds(context, sparse_input_shape)) return;
-    if (!CheckTensorBounds(context, input_labels)) return;
-    if (!CheckTensorBounds(context, input_weights)) return;
-    if (!CheckTensorBounds(context, tree_tensor)) return;
-    if (!CheckTensorBounds(context, tree_thresholds)) return;
-    if (!CheckTensorBounds(context, node_to_accumulator)) return;
-    if (!CheckTensorBounds(context, candidate_split_features)) return;
-    if (!CheckTensorBounds(context, candidate_split_thresholds)) return;
-    if (!CheckTensorBounds(context, birth_epochs)) return;
-    if (!CheckTensorBounds(context, current_epoch)) return;
-
-    // Evaluate input data in parallel.
-    const int32 epoch = current_epoch.unaligned_flat<int32>()(0);
-
-    std::unique_ptr<InputDataResult[]> results(new InputDataResult[num_data]);
-    auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
-    int num_threads = worker_threads->num_threads;
-    EvaluateParams params;
-    params.dense_input = input_data;
-    params.sparse_indices = sparse_input_indices;
-    params.sparse_values = sparse_input_values;
-    params.input_spec = input_spec_;
-    params.input_labels = input_labels;
-    params.tree_tensor = tree_tensor;
-    params.tree_thresholds = tree_thresholds;
-    params.node_to_accumulator = node_to_accumulator;
-    params.candidate_split_features = candidate_split_features;
-    params.candidate_split_thresholds = candidate_split_thresholds;
-    params.results = results.get();
-    // Require at least 100 inputs per thread.  I guess that's about 800 cost
-    // per unit.  This isn't well defined.
-    const int64 costPerUnit = 800;
-    auto work = [&params, num_data](int64 start, int64 end) {
-      CHECK(start <= end);
-      CHECK(end <= num_data);
-      Evaluate(params, static_cast<int32>(start), static_cast<int32>(end));
-    };
-    Shard(num_threads, worker_threads->workers, num_data, costPerUnit, work);
-
-    const int32 num_nodes = static_cast<int32>(tree_tensor.shape().dim_size(0));
-    if (regression_) {
-      ProcessResultsRegression(context, input_labels, input_weights,
-                               birth_epochs, epoch, std::move(results),
-                               num_nodes);
-    } else {
-      ProcessResultsClassification(context, input_labels, input_weights,
-                                   birth_epochs, epoch, std::move(results),
-                                   num_nodes);
-    }
-  }
-
- protected:
-  void ProcessResultsClassification(OpKernelContext* context,
-                                    const Tensor& input_labels,
-                                    const Tensor& input_weights,
-                                    const Tensor& birth_epochs, int32 epoch,
-                                    std::unique_ptr<InputDataResult[]> results,
-                                    int32 num_nodes) {
-    const int32 num_data = static_cast<int32>(input_labels.shape().dim_size(0));
-    const auto labels = input_labels.unaligned_flat<float>();
-    const auto start_epochs = birth_epochs.unaligned_flat<int32>();
-    const auto weights = input_weights.unaligned_flat<float>();
-
-    // Unused outputs for classification.  Still have to specify them or
-    // tensorflow complains.
-    Tensor* dummy = nullptr;
-    TensorShape dummy_shape;
-    dummy_shape.AddDim(0);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, dummy_shape, &dummy));
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(4, dummy_shape, &dummy));
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(7, dummy_shape, &dummy));
-
-    // node pcw delta
-    Tensor* output_node_pcw_sums_delta = nullptr;
-    TensorShape node_pcw_sums_shape;
-    node_pcw_sums_shape.AddDim(num_nodes);
-    node_pcw_sums_shape.AddDim(num_classes_);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, node_pcw_sums_shape,
-                                            &output_node_pcw_sums_delta));
-    Initialize<float>(*output_node_pcw_sums_delta, 0);
-    auto out_node_sums = output_node_pcw_sums_delta->tensor<float, 2>();
-
-    // leaves
-    Tensor* output_leaves = nullptr;
-    TensorShape leaves_shape;
-    leaves_shape.AddDim(num_data);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(8, leaves_shape, &output_leaves));
-    auto out_leaves = output_leaves->unaligned_flat<int32>();
-
-    // <accumulator, class> -> count delta
-    PairMapType<float> total_delta;
-    // <accumulator, split, class> -> count delta
-    TupleMapType<float> split_delta;
-
-    for (int32 i = 0; i < num_data; ++i) {
-      out_leaves(i) = results[i].node_indices.back();
-      float w = 1.0;
-      if (weights.size() > 0) {
-        w = weights(i);
-      }
-
-      const int32 label = internal::SubtleMustCopy(
-          static_cast<int32>(labels(i)));
-      // Labels that come from sparse tensors can have missing values.
-      if (label < 0) {
-        continue;
-      }
-      const int32 column = label + 1;
-      CHECK_LT(column, num_classes_);
-      const int32 accumulator = results[i].leaf_accumulator;
-      for (const int32 node : results[i].node_indices) {
-        if (epoch > start_epochs(node) + 1) {
-          continue;
-        }
-        out_node_sums(node, column) += w;
-        out_node_sums(node, 0) += w;
-      }
-
-      if (epoch > start_epochs(out_leaves(i)) + 1) {
-        continue;
-      }
-      if (accumulator >= 0 && results[i].splits_initialized) {
-        total_delta[make_pair(accumulator, column)] += w;
-        total_delta[make_pair(accumulator, 0)] += w;
-        for (const int32 split : results[i].split_adds) {
-          split_delta[make_tuple(accumulator, split, column)] += w;
-          split_delta[make_tuple(accumulator, split, 0)] += w;
-        }
-      }
-    }
-
-    // candidate splits pcw indices
-    Tensor* output_candidate_pcw_indices = nullptr;
-    TensorShape candidate_pcw_shape;
-    candidate_pcw_shape.AddDim(split_delta.size());
-    candidate_pcw_shape.AddDim(3);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(2, candidate_pcw_shape,
-                                            &output_candidate_pcw_indices));
-    auto out_candidate_indices =
-        output_candidate_pcw_indices->tensor<int32, 2>();
-
-    // candidate splits pcw delta
-    Tensor* output_candidate_pcw_delta = nullptr;
-    TensorShape candidate_pcw_delta_shape;
-    candidate_pcw_delta_shape.AddDim(split_delta.size());
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(3, candidate_pcw_delta_shape,
-                                            &output_candidate_pcw_delta));
-    auto out_candidate = output_candidate_pcw_delta->unaligned_flat<float>();
-
-    // total splits indices
-    Tensor* output_total_pcw_indices = nullptr;
-    TensorShape total_pcw_shape;
-    total_pcw_shape.AddDim(total_delta.size());
-    total_pcw_shape.AddDim(2);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(5, total_pcw_shape,
-                                            &output_total_pcw_indices));
-    auto out_total_indices = output_total_pcw_indices->tensor<int32, 2>();
-
-    // total splits delta
-    Tensor* output_total_pcw_delta = nullptr;
-    TensorShape total_pcw_delta_shape;
-    total_pcw_delta_shape.AddDim(total_delta.size());
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(6, total_pcw_delta_shape,
-                                            &output_total_pcw_delta));
-    auto out_total = output_total_pcw_delta->unaligned_flat<float>();
-
-    // Copy total deltas to output.
-    int32 output_slot = 0;
-    for (const auto& updates : total_delta) {
-      out_total_indices(output_slot, 0) = updates.first.first;
-      out_total_indices(output_slot, 1) = updates.first.second;
-      out_total(output_slot) = updates.second;
-      ++output_slot;
-    }
-
-    // Copy split deltas to output.
-    output_slot = 0;
-    for (const auto& updates : split_delta) {
-      out_candidate_indices(output_slot, 0) = get<0>(updates.first);
-      out_candidate_indices(output_slot, 1) = get<1>(updates.first);
-      out_candidate_indices(output_slot, 2) = get<2>(updates.first);
-      out_candidate(output_slot) = updates.second;
-      ++output_slot;
-    }
-  }
-
-  void ProcessResultsRegression(OpKernelContext* context,
-                                const Tensor& input_labels,
-                                const Tensor& input_weights,
-                                const Tensor& birth_epochs, const int32 epoch,
-                                std::unique_ptr<InputDataResult[]> results,
-                                int32 num_nodes) {
-    const int32 num_data = static_cast<int32>(input_labels.shape().dim_size(0));
-    int32 num_outputs = 1;
-    if (input_labels.shape().dims() > 1) {
-        num_outputs = static_cast<int32>(input_labels.shape().dim_size(1));
-    }
-    const auto labels = input_labels.unaligned_flat<float>();
-    const auto start_epochs = birth_epochs.unaligned_flat<int32>();
-    const auto weights = input_weights.unaligned_flat<float>();
-
-    // node pcw delta
-    Tensor* output_node_pcw_sums_delta = nullptr;
-    TensorShape node_pcw_sums_shape;
-    node_pcw_sums_shape.AddDim(num_nodes);
-    node_pcw_sums_shape.AddDim(num_classes_);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, node_pcw_sums_shape,
-                                            &output_node_pcw_sums_delta));
-    Initialize<float>(*output_node_pcw_sums_delta, 0);
-    auto out_node_sums = output_node_pcw_sums_delta->tensor<float, 2>();
-
-    Tensor* output_node_pcw_squares_delta = nullptr;
-    TensorShape node_pcw_squares_shape;
-    node_pcw_squares_shape.AddDim(num_nodes);
-    node_pcw_squares_shape.AddDim(num_classes_);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, node_pcw_squares_shape,
-                                            &output_node_pcw_squares_delta));
-    Initialize<float>(*output_node_pcw_squares_delta, 0);
-    auto out_node_squares = output_node_pcw_squares_delta->tensor<float, 2>();
-
-    // leaves
-    Tensor* output_leaves = nullptr;
-    TensorShape leaves_shape;
-    leaves_shape.AddDim(num_data);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(8, leaves_shape, &output_leaves));
-    auto out_leaves = output_leaves->unaligned_flat<int32>();
-
-    // <accumulator> -> label index
-    std::unordered_map<int32, std::unordered_set<int32>> total_delta;
-    // <accumulator, split> -> label index
-    PairMapType<std::unordered_set<int32>> split_delta;
-
-    for (int32 i = 0; i < num_data; ++i) {
-      const int32 accumulator = results[i].leaf_accumulator;
-      float w = 1.0;
-      if (weights.size() > 0) {
-        w = weights(i);
-      }
-
-      for (const int32 node : results[i].node_indices) {
-        if (epoch > start_epochs(node) + 1) {
-          continue;
-        }
-        for (int32 j = 0; j < num_outputs; ++j) {
-          const float output = labels(i * num_outputs + j);
-          out_node_sums(node, j + 1) += w * output;
-          out_node_squares(node, j + 1) += w * output * output;
-        }
-        out_node_sums(node, 0) += w;
-        out_node_squares(node, 0) += w;
-      }
-      out_leaves(i) = results[i].node_indices.back();
-      if (epoch > start_epochs(out_leaves(i)) + 1) {
-        continue;
-      }
-      if (accumulator >= 0 && results[i].splits_initialized) {
-        total_delta[accumulator].insert(i);
-        for (const int32 split : results[i].split_adds) {
-          split_delta[make_pair(accumulator, split)].insert(i);
-        }
-      }
-    }
-
-    // candidate splits pcw indices
-    Tensor* output_candidate_pcw_indices = nullptr;
-    TensorShape candidate_pcw_shape;
-    candidate_pcw_shape.AddDim(split_delta.size());
-    candidate_pcw_shape.AddDim(2);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(2, candidate_pcw_shape,
-                                            &output_candidate_pcw_indices));
-    auto out_candidate_indices =
-        output_candidate_pcw_indices->tensor<int32, 2>();
-
-    // candidate splits pcw delta
-    // sums
-    Tensor* output_candidate_pcw_sums = nullptr;
-    TensorShape candidate_pcw_sums_shape;
-    candidate_pcw_sums_shape.AddDim(split_delta.size());
-    candidate_pcw_sums_shape.AddDim(num_classes_);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(3, candidate_pcw_sums_shape,
-                                            &output_candidate_pcw_sums));
-    Initialize<float>(*output_candidate_pcw_sums, 0);
-    auto out_split_sums = output_candidate_pcw_sums->tensor<float, 2>();
-
-    // squares
-    Tensor* output_candidate_pcw_squares = nullptr;
-    TensorShape candidate_pcw_squares_shape;
-    candidate_pcw_squares_shape.AddDim(split_delta.size());
-    candidate_pcw_squares_shape.AddDim(num_classes_);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(4, candidate_pcw_squares_shape,
-                                            &output_candidate_pcw_squares));
-    Initialize<float>(*output_candidate_pcw_squares, 0);
-    auto out_split_squares = output_candidate_pcw_squares->tensor<float, 2>();
-
-    // total splits indices
-    Tensor* output_total_pcw_indices = nullptr;
-    TensorShape total_pcw_shape;
-    total_pcw_shape.AddDim(total_delta.size());
-    total_pcw_shape.AddDim(1);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(5, total_pcw_shape,
-                                            &output_total_pcw_indices));
-    auto out_total_indices = output_total_pcw_indices->unaligned_flat<int32>();
-
-    // total splits delta
-    // sums
-    Tensor* output_total_pcw_sums = nullptr;
-    TensorShape total_pcw_sums_shape;
-    total_pcw_sums_shape.AddDim(total_delta.size());
-    total_pcw_sums_shape.AddDim(num_classes_);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(6, total_pcw_sums_shape,
-                                            &output_total_pcw_sums));
-    Initialize<float>(*output_total_pcw_sums, 0);
-    auto out_total_sums = output_total_pcw_sums->tensor<float, 2>();
-
-    // squares
-    Tensor* output_total_pcw_squares = nullptr;
-    TensorShape total_pcw_squares_shape;
-    total_pcw_squares_shape.AddDim(total_delta.size());
-    total_pcw_squares_shape.AddDim(num_classes_);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(7, total_pcw_squares_shape,
-                                            &output_total_pcw_squares));
-    Initialize<float>(*output_total_pcw_squares, 0);
-    auto out_total_squares = output_total_pcw_squares->tensor<float, 2>();
-
-    // Copy total deltas to output.
-    int32 output_slot = 0;
-    for (const auto& updates : total_delta) {
-      out_total_indices(output_slot) = updates.first;
-      for (const int32 i : updates.second) {
-        for (int32 j = 0; j < num_outputs; ++j) {
-          const float output = labels(i * num_outputs + j);
-          out_total_sums(output_slot, j + 1) += output;
-          out_total_squares(output_slot, j + 1) += output * output;
-        }
-      }
-      out_total_sums(output_slot, 0) += updates.second.size();
-      out_total_squares(output_slot, 0) += updates.second.size();
-      ++output_slot;
-    }
-
-    // Copy split deltas to output.
-    output_slot = 0;
-    for (const auto& updates : split_delta) {
-      out_candidate_indices(output_slot, 0) = updates.first.first;
-      out_candidate_indices(output_slot, 1) = updates.first.second;
-      for (const int32 i : updates.second) {
-        for (int32 j = 0; j < num_outputs; ++j) {
-          const float output = labels(i * num_outputs + j);
-          out_split_sums(output_slot, j + 1) += output;
-          out_split_squares(output_slot, j + 1) += output * output;
-        }
-      }
-      out_split_sums(output_slot, 0) += updates.second.size();
-      out_split_squares(output_slot, 0) += updates.second.size();
-      ++output_slot;
-    }
-  }
-
-  struct PairIntHash {
-   public:
-    std::size_t operator()(const std::pair<int32, int32>& x) const {
-      // Bit-rotate x.first by 16 bits before xor-ing to minimize hash
-      // collisions in the frequent case when both elements of the pair are
-      // small.
-      return (x.first << 16 | x.first >> 16) ^ x.second;
-    }
-  };
-  template <typename V>
-  using PairMapType =
-      std::unordered_map<std::pair<int32, int32>, V, PairIntHash>;
-
-  struct TupleIntHash {
-   public:
-    std::size_t operator()(const std::tuple<int32, int32, int32>& x) const {
-      const int32 first = get<0>(x);
-      const int32 second = get<1>(x);
-      // Again, we bit-rotate (once by 16 bits, and once by 8 bits) to minimize
-      // hash collisions among small values.
-      return (first << 16 | first >> 16) ^ (second << 8 | second >> 24) ^
-          get<2>(x);
-    }
-  };
-  template <typename V>
-  using TupleMapType = std::unordered_map<tuple<int32, int32, int32>, V,
-      TupleIntHash>;
-
-  int32 num_classes_;
-  bool regression_;
-  tensorforest::TensorForestDataSpec input_spec_;
-};
-
-
-REGISTER_KERNEL_BUILDER(Name("CountExtremelyRandomStats").Device(DEVICE_CPU),
-                        CountExtremelyRandomStats);
-
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/finished_nodes_op.cc b/tensorflow/contrib/tensor_forest/kernels/finished_nodes_op.cc
deleted file mode 100644
index f71ba17fc335a665041b65c7d9f977de004069e4..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensor_forest/kernels/finished_nodes_op.cc
+++ /dev/null
@@ -1,296 +0,0 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-// FinishedNodes returns a 1-D tensor listing the nodes that are finished
-// accumulating.
-#include "tensorflow/contrib/tensor_forest/kernels/tree_utils.h"
-
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/shape_inference.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/lib/random/simple_philox.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/util/work_sharder.h"
-
-namespace tensorflow {
-
-using std::placeholders::_1;
-using std::placeholders::_2;
-
-using tensorforest::CheckTensorBounds;
-using tensorforest::BestSplitDominatesClassificationBootstrap;
-using tensorforest::BestSplitDominatesClassificationChebyshev;
-using tensorforest::BestSplitDominatesClassificationHoeffding;
-using tensorforest::BestSplitDominatesRegression;
-
-namespace {
-
-struct EvaluateParams {
-  Tensor leaves;
-  Tensor node_to_accumulator;
-  Tensor accumulator_sums;
-  Tensor birth_epochs;
-  int current_epoch;
-  int32 num_split_after_samples;
-  int32 min_split_samples;
-  int32 check_dominates_every_samples;
-  bool need_random;
-  int64 random_seed;
-  std::function<bool(int, random::SimplePhilox*)> dominate_method;
-};
-
-void Evaluate(const EvaluateParams& params, mutex* mutex, int32 start,
-              int32 end, std::unordered_set<int32>* final_finished_leaves,
-              std::unordered_set<int32>* final_stale) {
-  const auto leaves = params.leaves.unaligned_flat<int32>();
-  const auto node_map = params.node_to_accumulator.unaligned_flat<int32>();
-  const auto sums = params.accumulator_sums.tensor<float, 2>();
-  const auto start_epochs = params.birth_epochs.unaligned_flat<int32>();
-
-  const int32 num_accumulators =
-      static_cast<int32>(params.accumulator_sums.shape().dim_size(0));
-
-  std::vector<int32> finished_leaves;
-  std::vector<int32> stale;
-
-  std::unique_ptr<random::SimplePhilox> simple_philox;
-  random::PhiloxRandom rnd_gen(params.random_seed);
-
-  if (params.need_random) {
-    simple_philox.reset(new random::SimplePhilox(&rnd_gen));
-  }
-
-  std::unordered_set<int32> visited;
-  for (int32 i = start; i < end; i++) {
-    const int32 leaf = internal::SubtleMustCopy(leaves(i));
-    if (leaf == -1 || visited.find(leaf) != visited.end()) {
-      continue;
-    }
-    if (!FastBoundsCheck(leaf, node_map.size())) {
-      LOG(ERROR) << "leaf " << leaf << " not in valid range.";
-    }
-    const int32 accumulator = internal::SubtleMustCopy(node_map(leaf));
-    if (accumulator < 0) {
-      continue;
-    }
-
-    if (!FastBoundsCheck(accumulator, num_accumulators)) {
-      LOG(ERROR) << "accumulator " << accumulator << " not in valid range.";
-    }
-    // The first column holds the number of samples seen.
-    // For classification, this should be the sum of the other columns.
-    int32 count = sums(accumulator, 0);
-
-    if (params.current_epoch > start_epochs(leaf) + 1) {
-      if (count >= params.min_split_samples) {
-        finished_leaves.push_back(leaf);
-      } else {
-        stale.push_back(leaf);
-      }
-      continue;
-    }
-
-    if (count >= params.num_split_after_samples) {
-      finished_leaves.push_back(leaf);
-      continue;
-    }
-
-    if (count < params.min_split_samples) {
-      continue;
-    }
-
-    if (count % params.check_dominates_every_samples != 0) {
-      continue;
-    }
-
-    bool finished = params.dominate_method(accumulator, simple_philox.get());
-    if (finished) {
-      finished_leaves.push_back(leaf);
-    }
-
-    visited.insert(leaf);
-  }
-  mutex_lock m(*mutex);
-  final_finished_leaves->insert(finished_leaves.begin(), finished_leaves.end());
-  final_stale->insert(stale.begin(), stale.end());
-}
-}  // namespace
-
-
-class FinishedNodes : public OpKernel {
- public:
-  explicit FinishedNodes(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr(
-        "regression", &regression_));
-    OP_REQUIRES_OK(context, context->GetAttr(
-        "num_split_after_samples", &num_split_after_samples_));
-    OP_REQUIRES_OK(context, context->GetAttr(
-        "min_split_samples", &min_split_samples_));
-    OP_REQUIRES_OK(context, context->GetAttr(
-        "dominate_fraction", &dominate_fraction_));
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("dominate_method", &dominate_method_));
-    OP_REQUIRES_OK(context, context->GetAttr("random_seed", &random_seed_));
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("check_dominates_every_samples",
-                                    &check_dominates_every_samples_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& leaf_tensor = context->input(0);
-    const Tensor& node_to_accumulator = context->input(1);
-    const Tensor& split_sums = context->input(2);
-    const Tensor& split_squares = context->input(3);
-    const Tensor& accumulator_sums = context->input(4);
-    const Tensor& accumulator_squares = context->input(5);
-    const Tensor& birth_epochs = context->input(6);
-    const Tensor& current_epoch = context->input(7);
-
-    OP_REQUIRES(context, leaf_tensor.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "leaf_tensor should be one-dimensional"));
-    OP_REQUIRES(context, node_to_accumulator.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "node_to_accumulator should be one-dimensional"));
-    OP_REQUIRES(context, split_sums.shape().dims() == 3,
-                errors::InvalidArgument(
-                    "split_sums should be three-dimensional"));
-    OP_REQUIRES(context, accumulator_sums.shape().dims() == 2,
-                errors::InvalidArgument(
-                    "accumulator_sums should be two-dimensional"));
-    OP_REQUIRES(context, birth_epochs.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "birth_epochs should be one-dimensional"));
-    OP_REQUIRES(
-        context,
-        birth_epochs.shape().dim_size(0) ==
-        node_to_accumulator.shape().dim_size(0),
-        errors::InvalidArgument(
-            "birth_epochs and node_to_accumulator should be the same size."));
-
-    // Check tensor bounds.
-    if (!CheckTensorBounds(context, leaf_tensor)) return;
-    if (!CheckTensorBounds(context, node_to_accumulator)) return;
-    if (!CheckTensorBounds(context, split_sums)) return;
-    if (!CheckTensorBounds(context, split_squares)) return;
-    if (!CheckTensorBounds(context, accumulator_sums)) return;
-    if (!CheckTensorBounds(context, accumulator_squares)) return;
-    if (!CheckTensorBounds(context, birth_epochs)) return;
-    if (!CheckTensorBounds(context, current_epoch)) return;
-
-    const int32 epoch = current_epoch.unaligned_flat<int32>()(0);
-
-    const int32 num_leaves = static_cast<int32>(
-        leaf_tensor.shape().dim_size(0));
-
-    auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
-    int num_threads = worker_threads->num_threads;
-
-    EvaluateParams params;
-    params.leaves = leaf_tensor;
-    params.node_to_accumulator = node_to_accumulator;
-    params.accumulator_sums = accumulator_sums;
-    params.birth_epochs = birth_epochs;
-    params.current_epoch = epoch;
-    params.min_split_samples = min_split_samples_;
-    params.num_split_after_samples = num_split_after_samples_;
-    params.need_random = false;
-    params.check_dominates_every_samples = check_dominates_every_samples_;
-
-    if (regression_) {
-      params.dominate_method =
-          std::bind(&BestSplitDominatesRegression, accumulator_sums,
-                    accumulator_squares, split_sums, split_squares, _1);
-    } else {
-      if (dominate_method_ == "none") {
-        params.dominate_method = [](int, random::SimplePhilox*) {
-          return false;
-        };
-      } else if (dominate_method_ == "hoeffding") {
-        params.dominate_method =
-            std::bind(&BestSplitDominatesClassificationHoeffding,
-                      accumulator_sums, split_sums, _1, dominate_fraction_);
-      } else if (dominate_method_ == "chebyshev") {
-        params.dominate_method =
-            std::bind(&BestSplitDominatesClassificationChebyshev,
-                      accumulator_sums, split_sums, _1, dominate_fraction_);
-      } else if (dominate_method_ == "bootstrap") {
-        params.need_random = true;
-
-        params.random_seed = random_seed_;
-        if (params.random_seed == 0) {
-          params.random_seed = static_cast<uint64>(Env::Default()->NowMicros());
-        }
-
-        params.dominate_method =
-            std::bind(&BestSplitDominatesClassificationBootstrap,
-                      accumulator_sums, split_sums, _1, dominate_fraction_, _2);
-      } else {
-        LOG(FATAL) << "Unknown dominate method " << dominate_method_;
-      }
-    }
-
-    std::unordered_set<int32> finished_leaves;
-    std::unordered_set<int32> stale;
-    mutex m;
-    // Require at least 100 leaves per thread.  I guess that's about 800 cost
-    // per unit.  This isn't well defined.
-    const int64 costPerUnit = 800;
-    auto work = [&params, &finished_leaves, &stale, &m, num_leaves](int64 start,
-                                                                    int64 end) {
-      CHECK(start <= end);
-      CHECK(end <= num_leaves);
-      Evaluate(params, &m, static_cast<int32>(start), static_cast<int32>(end),
-               &finished_leaves, &stale);
-    };
-    Shard(num_threads, worker_threads->workers, num_leaves, costPerUnit, work);
-
-    // Copy to output.
-    Tensor* output_finished = nullptr;
-    TensorShape finished_shape;
-    finished_shape.AddDim(finished_leaves.size());
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, finished_shape,
-                                            &output_finished));
-    auto out_finished = output_finished->unaligned_flat<int32>();
-    std::copy(finished_leaves.begin(), finished_leaves.end(),
-              out_finished.data());
-
-    Tensor* output_stale = nullptr;
-    TensorShape stale_shape;
-    stale_shape.AddDim(stale.size());
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, stale_shape,
-                                            &output_stale));
-    auto out_stale = output_stale->unaligned_flat<int32>();
-    std::copy(stale.begin(), stale.end(), out_stale.data());
-  }
-
- private:
-  bool regression_;
-  int32 num_split_after_samples_;
-  int32 min_split_samples_;
-  float dominate_fraction_;
-  string dominate_method_;
-  int32 random_seed_;
-  int32 check_dominates_every_samples_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("FinishedNodes").Device(DEVICE_CPU),
-                        FinishedNodes);
-
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/grow_tree_op.cc b/tensorflow/contrib/tensor_forest/kernels/grow_tree_op.cc
deleted file mode 100644
index ae5580582504f36e57a52b1320dfd90d8bd5c745..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensor_forest/kernels/grow_tree_op.cc
+++ /dev/null
@@ -1,208 +0,0 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-// GrowTree adds children to the tree for finished nodes by using the
-// end_of_tree tensor as an indicator for where free nodes are in the
-// pre-allocated tree tensor.
-// For example if the tree is:
-//    1, -1, -1, -2, -2, -2, ...
-// Then end_of_tree should be 3 (the first -2, or "free" slot in the tensor).
-// If node 1 is now finished, the tree tensor after this op would be:
-//    1, 3, -1, -1, -1, -2, ...
-// and end_of_tree would be 5.
-
-#include "tensorflow/contrib/tensor_forest/kernels/tree_utils.h"
-
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/shape_inference.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace tensorflow {
-
-using tensorforest::CHILDREN_INDEX;
-using tensorforest::FEATURE_INDEX;
-using tensorforest::LEAF_NODE;
-
-using tensorforest::CheckTensorBounds;
-
-
-class GrowTree : public OpKernel {
- public:
-  explicit GrowTree(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& end_of_tree = context->input(0);
-    const Tensor& node_to_accumulator = context->input(1);
-    const Tensor& finished = context->input(2);
-    const Tensor& best_splits = context->input(3);
-    const Tensor& candidate_split_features = context->input(4);
-    const Tensor& candidate_split_thresholds = context->input(5);
-
-    OP_REQUIRES(context, end_of_tree.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "end_of_tree should be one-dimensional"));
-    OP_REQUIRES(context, node_to_accumulator.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "node_to_accumulator should be one-dimensional"));
-    OP_REQUIRES(context, finished.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "finished should be one-dimensional"));
-    OP_REQUIRES(context, best_splits.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "best_splits should be one-dimensional"));
-    OP_REQUIRES(context, candidate_split_features.shape().dims() == 2,
-                errors::InvalidArgument(
-                    "candidate_split_features should be two-dimensional"));
-    OP_REQUIRES(context, candidate_split_thresholds.shape().dims() == 2,
-                errors::InvalidArgument(
-                    "candidate_split_thresholds should be two-dimensional"));
-
-    OP_REQUIRES(
-        context,
-        finished.shape().dim_size(0) ==
-        best_splits.shape().dim_size(0),
-        errors::InvalidArgument(
-            "Number of finished nodes should be the same in finished and "
-            "best_splits."));
-    OP_REQUIRES(
-        context,
-        candidate_split_features.shape().dim_size(0) ==
-        candidate_split_thresholds.shape().dim_size(0),
-        errors::InvalidArgument(
-            "Number of accumulators should be the same in "
-            "candidate_split_features and candidate_split_thresholds."));
-    OP_REQUIRES(
-        context,
-        candidate_split_features.shape().dim_size(1) ==
-        candidate_split_thresholds.shape().dim_size(1),
-        errors::InvalidArgument(
-            "Number of splits should be the same in "
-            "candidate_split_features and candidate_split_thresholds."));
-
-    // Check tensor bounds.
-    if (!CheckTensorBounds(context, end_of_tree)) return;
-    if (!CheckTensorBounds(context, node_to_accumulator)) return;
-    if (!CheckTensorBounds(context, finished)) return;
-    if (!CheckTensorBounds(context, best_splits)) return;
-    if (!CheckTensorBounds(context, candidate_split_features)) return;
-    if (!CheckTensorBounds(context, candidate_split_thresholds)) return;
-
-    int32 current_end_of_tree = end_of_tree.unaligned_flat<int32>()(0);
-    const auto node_map = node_to_accumulator.unaligned_flat<int32>();
-    const auto finished_vec = finished.unaligned_flat<int32>();
-    const auto best_vec = best_splits.unaligned_flat<int32>();
-    const auto split_features = candidate_split_features.tensor<int32, 2>();
-    const auto split_thresholds = candidate_split_thresholds.tensor<float, 2>();
-
-    const int32 num_finished = static_cast<int32>(finished.shape().dim_size(0));
-    const int32 num_nodes = static_cast<int32>(
-        node_to_accumulator.shape().dim_size(0));
-    const int32 num_accumulators = static_cast<int32>(
-        candidate_split_features.shape().dim_size(0));
-    const int32 num_splits = static_cast<int32>(
-        candidate_split_features.shape().dim_size(1));
-
-    // Converting a leaf node into an internal node requires space for its
-    // two children.
-    int32 remaining_node_space = (num_nodes - current_end_of_tree) / 2;
-    int32 nodes_we_can_allocate = std::min(num_finished, remaining_node_space);
-    // Each conversion touches three nodes: the transitioning node and its
-    // two new children.
-    int32 num_updates = 3 * nodes_we_can_allocate;
-
-    Tensor* nodes_to_update_tensor = nullptr;
-    TensorShape nodes_to_update_shape;
-    nodes_to_update_shape.AddDim(num_updates);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, nodes_to_update_shape,
-                                            &nodes_to_update_tensor));
-    auto nodes_to_update_flat = nodes_to_update_tensor->tensor<int32, 1>();
-
-    Tensor* tree_updates_tensor = nullptr;
-    TensorShape tree_updates_shape;
-    tree_updates_shape.AddDim(num_updates);
-    tree_updates_shape.AddDim(2);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, tree_updates_shape,
-                                            &tree_updates_tensor));
-    auto tree_updates_flat = tree_updates_tensor->tensor<int32, 2>();
-
-    Tensor* threshold_updates_tensor = nullptr;
-    TensorShape threshold_updates_shape;
-    threshold_updates_shape.AddDim(num_updates);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(2, threshold_updates_shape,
-                                            &threshold_updates_tensor));
-    auto threshold_updates_flat = threshold_updates_tensor->tensor<float, 1>();
-
-    int output_slot = 0;
-    for (int32 i = 0; i < nodes_we_can_allocate; i++) {
-      const int32 node = internal::SubtleMustCopy(finished_vec(i));
-      OP_REQUIRES(context, FastBoundsCheck(node, node_map.size()),
-                  errors::InvalidArgument("finished node not in valid range."))
-      const int32 best = internal::SubtleMustCopy(best_vec(i));
-      const int32 accumulator = internal::SubtleMustCopy(node_map(node));
-      if (accumulator < 0) {
-        LOG(ERROR) << "Finished node doesn't have an accumulator.";
-        continue;
-      }
-
-      OP_REQUIRES(context, FastBoundsCheck(accumulator, num_accumulators),
-                  errors::InvalidArgument("accumulator not in valid range."))
-      OP_REQUIRES(context, FastBoundsCheck(best, num_splits),
-                  errors::InvalidArgument("best split not in valid range."))
-
-      if (current_end_of_tree >= num_nodes - 1) {
-        LOG(ERROR) << "Could not grow tree any further.";
-        return;
-      }
-      const int32 left = current_end_of_tree;
-      nodes_to_update_flat(output_slot) = node;
-
-      tree_updates_flat(output_slot, CHILDREN_INDEX) = left;
-      tree_updates_flat(output_slot, FEATURE_INDEX) =
-          split_features(accumulator, best);
-      threshold_updates_flat(output_slot) = split_thresholds(accumulator, best);
-      output_slot++;
-
-      nodes_to_update_flat(output_slot) = left;
-      tree_updates_flat(output_slot, CHILDREN_INDEX) = LEAF_NODE;
-      tree_updates_flat(output_slot, FEATURE_INDEX) = -1;
-      threshold_updates_flat(output_slot) = 0.0;
-      output_slot++;
-
-      nodes_to_update_flat(output_slot) = left + 1;
-      tree_updates_flat(output_slot, CHILDREN_INDEX) = LEAF_NODE;
-      tree_updates_flat(output_slot, FEATURE_INDEX) = -1;
-      threshold_updates_flat(output_slot) = 0.0;
-      output_slot++;
-
-      current_end_of_tree += 2;
-    }
-
-    Tensor* new_end_of_tree_tensor = nullptr;
-    TensorShape new_end_of_tree_shape;
-    new_end_of_tree_shape.AddDim(1);
-    OP_REQUIRES_OK(context, context->allocate_output(3, new_end_of_tree_shape,
-                                                     &new_end_of_tree_tensor));
-    auto new_end_of_tree_flat = new_end_of_tree_tensor->tensor<int32, 1>();
-    new_end_of_tree_flat(0) = current_end_of_tree;
-  }
-};
-
-REGISTER_KERNEL_BUILDER(Name("GrowTree").Device(DEVICE_CPU), GrowTree);
-
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/model_ops.cc b/tensorflow/contrib/tensor_forest/kernels/model_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..221f8d969bc1d788107be10fe4e2017409e8f785
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/model_ops.cc
@@ -0,0 +1,454 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include <functional>
+#include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
+#include "tensorflow/contrib/decision_trees/proto/generic_tree_model_extensions.pb.h"
+#include "tensorflow/contrib/tensor_forest/kernels/data_spec.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_data.h"
+#include "tensorflow/contrib/tensor_forest/proto/tensor_forest_params.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+// Creates a tree  variable.
+class CreateTreeVariableOp : public OpKernel {
+ public:
+  explicit CreateTreeVariableOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string serialized_params;
+    OP_REQUIRES_OK(context, context->GetAttr("params", &serialized_params));
+    ParseProtoUnlimited(&param_proto_, serialized_params);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor* tree_config_t;
+    OP_REQUIRES_OK(context, context->input("tree_config", &tree_config_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(tree_config_t->shape()),
+                errors::InvalidArgument("Tree config must be a scalar."));
+
+    auto* result = new DecisionTreeResource(param_proto_);
+    if (!ParseProtoUnlimited(result->mutable_decision_tree(),
+                             tree_config_t->scalar<string>()())) {
+      result->Unref();
+      OP_REQUIRES(context, false,
+                  errors::InvalidArgument("Unable to parse tree  config."));
+    }
+
+    result->MaybeInitialize();
+
+    // Only create one, if one does not exist already. Report status for all
+    // other exceptions.
+    auto status = CreateResource(context, HandleFromInput(context, 0), result);
+    if (!status.ok() && status.code() != tensorflow::error::ALREADY_EXISTS) {
+      OP_REQUIRES(context, false, status);
+    }
+  }
+
+ private:
+  TensorForestParams param_proto_;
+};
+
+// Op for serializing a model.
+class TreeSerializeOp : public OpKernel {
+ public:
+  explicit TreeSerializeOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    DecisionTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+    Tensor* output_config_t = nullptr;
+    OP_REQUIRES_OK(
+        context, context->allocate_output(0, TensorShape(), &output_config_t));
+    output_config_t->scalar<string>()() =
+        decision_tree_resource->decision_tree().SerializeAsString();
+  }
+};
+
+// Op for deserializing a tree variable from a checkpoint.
+class TreeDeserializeOp : public OpKernel {
+ public:
+  explicit TreeDeserializeOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string serialized_params;
+    OP_REQUIRES_OK(context, context->GetAttr("params", &serialized_params));
+    ParseProtoUnlimited(&param_proto_, serialized_params);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    DecisionTreeResource* decision_tree_resource;
+    auto handle = HandleFromInput(context, 0);
+    OP_REQUIRES_OK(context,
+                   LookupResource(context, handle, &decision_tree_resource));
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+
+    const Tensor* tree_config_t;
+    OP_REQUIRES_OK(context, context->input("tree_config", &tree_config_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(tree_config_t->shape()),
+                errors::InvalidArgument("Tree config must be a scalar."));
+    // Deallocate all the previous objects on the resource.
+    decision_tree_resource->Reset();
+    decision_trees::Model* config =
+        decision_tree_resource->mutable_decision_tree();
+    OP_REQUIRES(context,
+                ParseProtoUnlimited(config, tree_config_t->scalar<string>()()),
+                errors::InvalidArgument("Unable to parse tree  config."));
+    decision_tree_resource->MaybeInitialize();
+  }
+
+ private:
+  TensorForestParams param_proto_;
+};
+
+// Op for getting tree size.
+class TreeSizeOp : public OpKernel {
+ public:
+  explicit TreeSizeOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    DecisionTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+    Tensor* output_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape(), &output_t));
+    output_t->scalar<int32>()() =
+        decision_tree_resource->decision_tree().decision_tree().nodes_size();
+  }
+};
+
+void TraverseTree(const DecisionTreeResource* tree_resource,
+                  const std::unique_ptr<TensorDataSet>& data, int32 start,
+                  int32 end,
+                  const std::function<void(int32, int32)>& set_leaf_id) {
+  for (int i = start; i < end; ++i) {
+    const int32 id = tree_resource->TraverseTree(data, i, nullptr);
+    set_leaf_id(i, id);
+  }
+}
+
+// Op for tree inference.
+class TreePredictionsV4Op : public OpKernel {
+ public:
+  explicit TreePredictionsV4Op(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string serialized_params;
+    OP_REQUIRES_OK(context, context->GetAttr("params", &serialized_params));
+    ParseProtoUnlimited(&param_proto_, serialized_params);
+
+    string serialized_proto;
+    OP_REQUIRES_OK(context, context->GetAttr("input_spec", &serialized_proto));
+    input_spec_.ParseFromString(serialized_proto);
+
+    data_set_ =
+        std::unique_ptr<TensorDataSet>(new TensorDataSet(input_spec_, 0));
+
+    model_op_ = LeafModelOperatorFactory::CreateLeafModelOperator(param_proto_);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input_data = context->input(1);
+    const Tensor& sparse_input_indices = context->input(2);
+    const Tensor& sparse_input_values = context->input(3);
+    const Tensor& sparse_input_shape = context->input(4);
+
+    data_set_->set_input_tensors(input_data, sparse_input_indices,
+                                 sparse_input_values, sparse_input_shape);
+
+    DecisionTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+
+    const int num_data = data_set_->NumItems();
+    const int32 num_outputs = param_proto_.num_outputs();
+
+    Tensor* output_predictions = nullptr;
+    TensorShape output_shape;
+    output_shape.AddDim(num_data);
+    output_shape.AddDim(num_outputs);
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape,
+                                                     &output_predictions));
+    TTypes<float, 2>::Tensor out = output_predictions->tensor<float, 2>();
+
+    auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
+    int num_threads = worker_threads->num_threads;
+    const int64 costPerTraverse = 500;
+    auto traverse = [this, &out, decision_tree_resource, num_data](int64 start,
+                                                                   int64 end) {
+      CHECK(start <= end);
+      CHECK(end <= num_data);
+      TraverseTree(decision_tree_resource, data_set_, static_cast<int32>(start),
+                   static_cast<int32>(end),
+                   std::bind(&TreePredictionsV4Op::set_output_value, this,
+                             std::placeholders::_1, std::placeholders::_2,
+                             decision_tree_resource, &out));
+    };
+    Shard(num_threads, worker_threads->workers, num_data, costPerTraverse,
+          traverse);
+  }
+
+  void set_output_value(int32 i, int32 id,
+                        DecisionTreeResource* decision_tree_resource,
+                        TTypes<float, 2>::Tensor* out) {
+    const decision_trees::Leaf& leaf = decision_tree_resource->get_leaf(id);
+
+    float sum = 0;
+    for (int j = 0; j < param_proto_.num_outputs(); ++j) {
+      const float count = model_op_->GetOutputValue(leaf, j);
+      (*out)(i, j) = count;
+      sum += count;
+    }
+
+    if (!param_proto_.is_regression() && sum > 0 && sum != 1) {
+      for (int j = 0; j < param_proto_.num_outputs(); ++j) {
+        (*out)(i, j) /= sum;
+      }
+    }
+  }
+
+ private:
+  tensorforest::TensorForestDataSpec input_spec_;
+  std::unique_ptr<TensorDataSet> data_set_;
+  std::unique_ptr<LeafModelOperator> model_op_;
+  TensorForestParams param_proto_;
+};
+
+// Outputs leaf ids for the given examples.
+class TraverseTreeV4Op : public OpKernel {
+ public:
+  explicit TraverseTreeV4Op(OpKernelConstruction* context) : OpKernel(context) {
+    string serialized_params;
+    OP_REQUIRES_OK(context, context->GetAttr("params", &serialized_params));
+    ParseProtoUnlimited(&param_proto_, serialized_params);
+
+    string serialized_proto;
+    OP_REQUIRES_OK(context, context->GetAttr("input_spec", &serialized_proto));
+    input_spec_.ParseFromString(serialized_proto);
+
+    data_set_ =
+        std::unique_ptr<TensorDataSet>(new TensorDataSet(input_spec_, 0));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input_data = context->input(1);
+    const Tensor& sparse_input_indices = context->input(2);
+    const Tensor& sparse_input_values = context->input(3);
+    const Tensor& sparse_input_shape = context->input(4);
+
+    data_set_->set_input_tensors(input_data, sparse_input_indices,
+                                 sparse_input_values, sparse_input_shape);
+
+    DecisionTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+
+    const int num_data = data_set_->NumItems();
+
+    Tensor* output_predictions = nullptr;
+    TensorShape output_shape;
+    output_shape.AddDim(num_data);
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape,
+                                                     &output_predictions));
+
+    auto leaf_ids = output_predictions->tensor<int32, 1>();
+
+    auto set_leaf_ids = [&leaf_ids](int32 i, int32 id) { leaf_ids(i) = id; };
+
+    auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
+    int num_threads = worker_threads->num_threads;
+    const int64 costPerTraverse = 500;
+    auto traverse = [this, &set_leaf_ids, decision_tree_resource, num_data](
+                        int64 start, int64 end) {
+      CHECK(start <= end);
+      CHECK(end <= num_data);
+      TraverseTree(decision_tree_resource, data_set_, static_cast<int32>(start),
+                   static_cast<int32>(end), set_leaf_ids);
+    };
+    Shard(num_threads, worker_threads->workers, num_data, costPerTraverse,
+          traverse);
+  }
+
+ private:
+  tensorforest::TensorForestDataSpec input_spec_;
+  std::unique_ptr<TensorDataSet> data_set_;
+  TensorForestParams param_proto_;
+};
+
+// Update the given leaf models using the batch of labels.
+class UpdateModelV4Op : public OpKernel {
+ public:
+  explicit UpdateModelV4Op(OpKernelConstruction* context) : OpKernel(context) {
+    string serialized_params;
+    OP_REQUIRES_OK(context, context->GetAttr("params", &serialized_params));
+    ParseProtoUnlimited(&param_proto_, serialized_params);
+
+    model_op_ = LeafModelOperatorFactory::CreateLeafModelOperator(param_proto_);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& leaf_ids = context->input(1);
+    const Tensor& input_labels = context->input(2);
+    const Tensor& input_weights = context->input(3);
+
+    DecisionTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+
+    const int num_data = input_labels.shape().dim_size(0);
+    const int32 label_dim =
+        input_labels.shape().dims() <= 1
+            ? 0
+            : static_cast<int>(input_labels.shape().dim_size(1));
+    const int32 num_targets =
+        param_proto_.is_regression() ? (std::max(1, label_dim)) : 1;
+
+    TensorInputTarget target(input_labels, input_weights, num_targets);
+
+    // TODO(gilberth): Make this thread safe and multi-thread.
+    UpdateModel(leaf_ids, target, 0, num_data, decision_tree_resource);
+  }
+
+  void UpdateModel(const Tensor& leaf_ids, const TensorInputTarget& target,
+                   int32 start, int32 end,
+                   DecisionTreeResource* decision_tree_resource) {
+    const auto leaves = leaf_ids.unaligned_flat<int32>();
+    for (int i = start; i < end; ++i) {
+      model_op_->UpdateModel(
+          decision_tree_resource->get_mutable_tree_node(leaves(i))
+              ->mutable_leaf(),
+          &target, i);
+    }
+  }
+
+ private:
+  std::unique_ptr<LeafModelOperator> model_op_;
+  TensorForestParams param_proto_;
+};
+
+// Op for getting feature usage counts.
+class FeatureUsageCountsOp : public OpKernel {
+ public:
+  explicit FeatureUsageCountsOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string serialized_params;
+    OP_REQUIRES_OK(context, context->GetAttr("params", &serialized_params));
+    ParseProtoUnlimited(&param_proto_, serialized_params);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    DecisionTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+
+    const auto& tree = decision_tree_resource->decision_tree();
+
+    Tensor* output_counts = nullptr;
+    TensorShape output_shape;
+    output_shape.AddDim(param_proto_.num_features());
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, output_shape, &output_counts));
+
+    auto counts = output_counts->unaligned_flat<int32>();
+    counts.setZero();
+
+    for (const auto& node : tree.decision_tree().nodes()) {
+      if (node.has_custom_node_type()) {
+        LOG(WARNING) << "Can't count feature usage for custom nodes.";
+      } else if (node.has_binary_node()) {
+        const auto& bnode = node.binary_node();
+        if (bnode.has_custom_left_child_test()) {
+          decision_trees::MatchingValuesTest test;
+          if (!bnode.custom_left_child_test().UnpackTo(&test)) {
+            LOG(WARNING) << "Unknown custom child test";
+            continue;
+          }
+          int32 feat;
+          safe_strto32(test.feature_id().id().value(), &feat);
+          ++counts(feat);
+        } else {
+          const auto& test = bnode.inequality_left_child_test();
+          if (test.has_feature_id()) {
+            int32 feat;
+            safe_strto32(test.feature_id().id().value(), &feat);
+            ++counts(feat);
+          } else if (test.has_oblique()) {
+            for (const auto& featid : test.oblique().features()) {
+              int32 feat;
+              safe_strto32(featid.id().value(), &feat);
+              ++counts(feat);
+            }
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  TensorForestParams param_proto_;
+};
+
+REGISTER_RESOURCE_HANDLE_KERNEL(DecisionTreeResource);
+
+REGISTER_KERNEL_BUILDER(Name("TreeIsInitializedOp").Device(DEVICE_CPU),
+                        IsResourceInitialized<DecisionTreeResource>);
+
+REGISTER_KERNEL_BUILDER(Name("CreateTreeVariable").Device(DEVICE_CPU),
+                        CreateTreeVariableOp);
+
+REGISTER_KERNEL_BUILDER(Name("TreeSerialize").Device(DEVICE_CPU),
+                        TreeSerializeOp);
+
+REGISTER_KERNEL_BUILDER(Name("TreeDeserialize").Device(DEVICE_CPU),
+                        TreeDeserializeOp);
+
+REGISTER_KERNEL_BUILDER(Name("TreeSize").Device(DEVICE_CPU), TreeSizeOp);
+
+REGISTER_KERNEL_BUILDER(Name("TreePredictionsV4").Device(DEVICE_CPU),
+                        TreePredictionsV4Op);
+
+REGISTER_KERNEL_BUILDER(Name("TraverseTreeV4").Device(DEVICE_CPU),
+                        TraverseTreeV4Op);
+
+REGISTER_KERNEL_BUILDER(Name("FeatureUsageCounts").Device(DEVICE_CPU),
+                        FeatureUsageCountsOp);
+
+REGISTER_KERNEL_BUILDER(Name("UpdateModelV4").Device(DEVICE_CPU),
+                        UpdateModelV4Op);
+
+}  // namespace tensorforest
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/model_ops_test.cc b/tensorflow/contrib/tensor_forest/kernels/model_ops_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0fdab8e6e0bb7d5faee3e7345e68d180346eb768
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/model_ops_test.cc
@@ -0,0 +1,99 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference_testutil.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+TEST(ModelOpsTest, CreateTreeVariable_ShapeFn) {
+  ShapeInferenceTestOp op("CreateTreeVariable");
+  INFER_OK(op, "[1];[1]", "");
+}
+
+TEST(ModelOpsTest, TreeSerialize_ShapeFn) {
+  ShapeInferenceTestOp op("TreeSerialize");
+  INFER_OK(op, "[1]", "[]");
+}
+
+TEST(ModelOpsTest, TreeDeserialize_ShapeFn) {
+  ShapeInferenceTestOp op("TreeDeserialize");
+  INFER_OK(op, "[1];[1]", "");
+}
+
+TEST(ModelOpsTest, TreeSize_ShapeFn) {
+  ShapeInferenceTestOp op("TreeSize");
+  INFER_OK(op, "[1]", "[]");
+}
+
+TEST(ModelOpsTest, TreePredictionsV4_ShapeFn) {
+  ShapeInferenceTestOp op("TreePredictionsV4");
+  TF_ASSERT_OK(NodeDefBuilder("test", "TreePredictionsV4")
+                   .Input("a", 0, DT_RESOURCE)
+                   .Input("b", 1, DT_FLOAT)
+                   .Input("c", 2, DT_INT64)
+                   .Input("d", 3, DT_FLOAT)
+                   .Input("e", 5, DT_INT64)
+                   .Attr("input_spec", "")
+                   .Attr("params", "")
+                   .Finalize(&op.node_def));
+
+  // num_points = 2, sparse shape not known
+  INFER_OK(op, "?;[2,3];?;?;?", "[d1_0,?]");
+
+  // num_points = 2, sparse and dense shape rank known and > 1
+  INFER_OK(op, "?;[2,3];?;?;[10,11]", "[d1_0,?]");
+
+  // num_points = 2, sparse shape rank known and > 1
+  INFER_OK(op, "?;?;?;?;[10,11]", "[?,?]");
+}
+
+TEST(ModelOpsTest, TraverseTreeV4_ShapeFn) {
+  ShapeInferenceTestOp op("TraverseTreeV4");
+  TF_ASSERT_OK(NodeDefBuilder("test", "TraverseTreeV4")
+                   .Input("a", 0, DT_RESOURCE)
+                   .Input("b", 1, DT_FLOAT)
+                   .Input("c", 2, DT_INT64)
+                   .Input("d", 3, DT_FLOAT)
+                   .Input("e", 5, DT_INT64)
+                   .Attr("input_spec", "")
+                   .Attr("params", "")
+                   .Finalize(&op.node_def));
+
+  // num_points = 2, sparse shape not known
+  INFER_OK(op, "?;[2,3];?;?;?", "[d1_0]");
+
+  // num_points = 2, sparse and dense shape rank known and > 1
+  INFER_OK(op, "?;[2,3];?;?;[10,11]", "[d1_0]");
+
+  // num_points = 2, sparse shape rank known and > 1
+  INFER_OK(op, "?;?;?;?;[10,11]", "[?]");
+}
+
+TEST(ModelOpsTest, UpdateModelV4_ShapeFn) {
+  ShapeInferenceTestOp op("UpdateModelV4");
+  INFER_OK(op, "[1];?;?;?", "");
+}
+
+TEST(ModelOpsTest, FeatureUsageCounts_ShapeFn) {
+  ShapeInferenceTestOp op("FeatureUsageCounts");
+  INFER_OK(op, "[1]", "[?]");
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/sample_inputs_op.cc b/tensorflow/contrib/tensor_forest/kernels/sample_inputs_op.cc
deleted file mode 100644
index 6bfc29d96fea8dc5338879c150a6e92d19aae873..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensor_forest/kernels/sample_inputs_op.cc
+++ /dev/null
@@ -1,319 +0,0 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-// SampleInputs initializes candidate splits/threshold values randomly
-// from incoming data for not-yet-initialized fertile nodes.
-#include <ctime>
-#include <unordered_map>
-#include <set>
-
-#include "tensorflow/contrib/tensor_forest/kernels/data_spec.h"
-#include "tensorflow/contrib/tensor_forest/kernels/tree_utils.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/shape_inference.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/lib/random/distribution_sampler.h"
-#include "tensorflow/core/lib/random/philox_random.h"
-#include "tensorflow/core/lib/random/simple_philox.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace tensorflow {
-
-using tensorforest::CheckTensorBounds;
-using tensorforest::IsAllInitialized;
-
-class SampleInputs : public OpKernel {
- public:
-  explicit SampleInputs(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr(
-        "split_initializations_per_input", &split_initializations_per_input_));
-    OP_REQUIRES_OK(context, context->GetAttr(
-        "split_sampling_random_seed", &split_sampling_random_seed_));
-    // Set up the random number generator.
-    if (split_sampling_random_seed_ == 0) {
-      uint64 time_seed = static_cast<uint64>(std::clock());
-      single_rand_ = std::unique_ptr<random::PhiloxRandom>(
-          new random::PhiloxRandom(time_seed));
-    } else {
-      single_rand_ = std::unique_ptr<random::PhiloxRandom>(
-          new random::PhiloxRandom(split_sampling_random_seed_));
-    }
-
-    rng_ = std::unique_ptr<random::SimplePhilox>(
-        new random::SimplePhilox(single_rand_.get()));
-
-    string serialized_proto;
-    OP_REQUIRES_OK(context, context->GetAttr("input_spec", &serialized_proto));
-    input_spec_.ParseFromString(serialized_proto);
-  }
-
-  // increment_input implements a "++" operation for the situation when
-  // you want to do something n times on an underlying iterator.
-  // In an ideal world, this would be a built-in iterator adaptor.
-  template <typename T>
-  static void increment_input(const int n, T* it, int* count) {
-    *count += 1;
-    if (*count == n) {
-      *count = 0;
-      (*it)++;
-    }
-  }
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& input_data = context->input(0);
-    const Tensor& sparse_input_indices = context->input(1);
-    const Tensor& sparse_input_values = context->input(2);
-    const Tensor& sparse_input_shape = context->input(3);
-    const Tensor& input_weights = context->input(4);
-    const Tensor& node_to_accumulator = context->input(5);
-    const Tensor& leaves = context->input(6);
-    const Tensor& split_features = context->input(7);
-    const Tensor& split_thresholds = context->input(8);
-
-    bool sparse_input = (sparse_input_indices.shape().dims() == 2);
-
-    bool have_weights = (input_weights.shape().dim_size(0) > 0);
-
-    if (sparse_input) {
-      // TODO(gilberth): This is because we can't figure out the shape
-      // of a sparse tensor at graph-build time, even if the dimension is
-      // actually known.
-      input_spec_.mutable_sparse(0)->set_size(
-          sparse_input_shape.unaligned_flat<int64>()(1));
-      OP_REQUIRES(context, sparse_input_shape.shape().dims() == 1,
-                  errors::InvalidArgument(
-                      "sparse_input_shape should be one-dimensional"));
-      OP_REQUIRES(context,
-                  sparse_input_shape.shape().dim_size(0) == 2,
-                  errors::InvalidArgument(
-                      "The sparse input data should be two-dimensional"));
-      OP_REQUIRES(context, sparse_input_values.shape().dims() == 1,
-                  errors::InvalidArgument(
-                      "sparse_input_values should be one-dimensional"));
-      OP_REQUIRES(context, sparse_input_indices.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "The sparse input data should be two-dimensional"));
-      OP_REQUIRES(context,
-                  sparse_input_indices.shape().dim_size(0) ==
-                  sparse_input_values.shape().dim_size(0),
-                  errors::InvalidArgument(
-                      "sparse_input_indices and sparse_input_values should "
-                      "agree on the number of non-zero values"));
-      if (have_weights) {
-        OP_REQUIRES(context, sparse_input_shape.unaligned_flat<int64>()(0) ==
-                                 input_weights.shape().dim_size(0),
-                    errors::InvalidArgument(
-                        "sparse_input_values and input_weights should agree "
-                        "on the number of inputs"));
-      }
-    }
-    if (input_data.shape().dim_size(0) > 0) {
-      OP_REQUIRES(context, input_data.shape().dims() == 2,
-                  errors::InvalidArgument(
-                  "input_data should be two-dimensional"));
-      if (have_weights) {
-        OP_REQUIRES(context, input_data.shape().dim_size(0) ==
-                                 input_weights.shape().dim_size(0),
-                    errors::InvalidArgument(
-                        "input_data and input_weights should agree on the "
-                        "number of inputs"));
-      }
-    }
-
-    OP_REQUIRES(context, node_to_accumulator.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "node_to_accumulator should be one-dimensional"));
-    OP_REQUIRES(context, leaves.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "leaves should be one-dimensional"));
-    OP_REQUIRES(context, split_features.shape().dims() == 2,
-                errors::InvalidArgument(
-                    "split_features should be two-dimensional"));
-    OP_REQUIRES(context, split_thresholds.shape().dims() == 2,
-                errors::InvalidArgument(
-                    "split_thresholds should be two-dimensional"));
-
-    OP_REQUIRES(
-        context,
-        split_features.shape() == split_thresholds.shape(),
-        errors::InvalidArgument(
-            "split_features and split_thresholds should be the same shape."));
-
-    // Check tensor bounds.
-    if (!CheckTensorBounds(context, input_data)) return;
-    if (!CheckTensorBounds(context, sparse_input_indices)) return;
-    if (!CheckTensorBounds(context, sparse_input_values)) return;
-    if (!CheckTensorBounds(context, sparse_input_shape)) return;
-    if (!CheckTensorBounds(context, input_weights)) return;
-    if (!CheckTensorBounds(context, node_to_accumulator)) return;
-    if (!CheckTensorBounds(context, leaves)) return;
-    if (!CheckTensorBounds(context, split_features)) return;
-    if (!CheckTensorBounds(context, split_thresholds)) return;
-
-    const auto leaves_vec = leaves.unaligned_flat<int32>();
-    const auto node_map = node_to_accumulator.unaligned_flat<int32>();
-    const auto features = split_features.tensor<int32, 2>();
-    const auto thresholds = split_thresholds.tensor<float, 2>();
-    const auto weights = input_weights.unaligned_flat<float>();
-
-    const int32 num_data = static_cast<int32>(leaves.shape().dim_size(0));
-    const int32 num_splits = static_cast<int32>(
-        split_features.shape().dim_size(1));
-    const int32 num_accumulators = static_cast<int32>(
-        split_features.shape().dim_size(0));
-
-    std::unordered_map<int32, std::set<int32>> accumulator_to_leaves;
-
-    // The first pass just calculates num_output_accumulators.
-    for (int32 i = 0; i < num_data; i++) {
-      const int32 leaf = internal::SubtleMustCopy(leaves_vec(i));
-      OP_REQUIRES(context, FastBoundsCheck(leaf, node_map.size()),
-                  errors::InvalidArgument("leaf not in valid range."))
-      const int32 accumulator = internal::SubtleMustCopy(node_map(leaf));
-
-      // Check for non-fertile node or fertile node that is already
-      // initialized.
-      if (accumulator >= 0 &&
-          !IsAllInitialized(features, accumulator, num_splits)) {
-        accumulator_to_leaves[accumulator].insert(i);
-      }
-    }
-
-    // Now we can allocate the outputs.
-    int32 num_output_accumulators = static_cast<int32>(
-        accumulator_to_leaves.size());
-    VLOG(1) << "num output accumulators = " << num_output_accumulators;
-    Tensor* accumulators_tensor = nullptr;
-    TensorShape accumulators_shape;
-    accumulators_shape.AddDim(num_output_accumulators);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, accumulators_shape,
-                                            &accumulators_tensor));
-    auto accumulators_flat = accumulators_tensor->tensor<int32, 1>();
-
-    Tensor* new_split_feature_rows_tensor = nullptr;
-    TensorShape new_split_feature_rows_shape;
-    new_split_feature_rows_shape.AddDim(num_output_accumulators);
-    new_split_feature_rows_shape.AddDim(num_splits);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, new_split_feature_rows_shape,
-                                            &new_split_feature_rows_tensor));
-    auto new_split_feature_rows_flat =
-        new_split_feature_rows_tensor->tensor<int32, 2>();
-
-    Tensor* new_split_threshold_rows_tensor = nullptr;
-    TensorShape new_split_threshold_rows_shape;
-    new_split_threshold_rows_shape.AddDim(num_output_accumulators);
-    new_split_threshold_rows_shape.AddDim(num_splits);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(2, new_split_threshold_rows_shape,
-                                            &new_split_threshold_rows_tensor));
-    auto new_split_threshold_rows_flat =
-        new_split_threshold_rows_tensor->tensor<float, 2>();
-
-    // The second pass fills out the outputs.
-    int output_slot = 0;
-    for (const auto& active : accumulator_to_leaves) {
-      const int32 accumulator = active.first;
-      OP_REQUIRES(context, FastBoundsCheck(accumulator, num_accumulators),
-                  errors::InvalidArgument("accumulator not in valid range."))
-      const std::set<int32> inputs_for_accumulator = active.second;
-      VLOG(1) << "Accumulator " << accumulator
-                  << " gets new output slot " << output_slot;
-      accumulators_flat(output_slot) = accumulator;
-
-      // scatter_update updates entire rows, so we first copy the existing
-      // rows into the output tensors, and then write over the values we
-      // want to change.
-      for (int split = 0; split < num_splits; split++) {
-        new_split_feature_rows_flat(output_slot, split) =
-            features(accumulator, split);
-        new_split_threshold_rows_flat(output_slot, split) =
-            thresholds(accumulator, split);
-      }
-
-      auto it = inputs_for_accumulator.begin();
-      int input_used_count = 0;
-      for (int split = 0;
-           split < num_splits && it != inputs_for_accumulator.end(); split++) {
-        if (new_split_feature_rows_flat(output_slot, split) < 0) {
-          if (have_weights) {
-            // If we have weights, we probabilistically reject inputs with
-            // low weight.  Which means we might have to look at a bunch of
-            // inputs -- maybe even all of them -- to fill this slot.
-            while (it != inputs_for_accumulator.end()) {
-              float w = weights(*it);
-              if (rng_->RandFloat() <= w) {
-                break;
-              }
-              increment_input(split_initializations_per_input_, &it,
-                              &input_used_count);
-            }
-            if (it == inputs_for_accumulator.end()) {
-              break;
-            }
-          }
-          int32 index;
-          float val;
-          int64 sparse_input_start;
-          int32 num_total_features = input_spec_.dense_features_size();
-          if (sparse_input) {
-            num_total_features += tensorforest::GetNumSparseFeatures(
-                sparse_input_indices.matrix<int64>(), *it, &sparse_input_start);
-          }
-          if (num_total_features == 0) {
-            LOG(WARNING) << "num total features is zero.";
-            break;
-          }
-          const int32 rand_feature = rng_->Uniform(num_total_features);
-          if (rand_feature < input_spec_.dense_features_size()) {
-            const auto inputs = input_data.tensor<float, 2>();
-            index = rand_feature;
-            val = inputs(*it, rand_feature);
-          } else {
-            const auto indices = sparse_input_indices.matrix<int64>();
-            const auto values = sparse_input_values.vec<float>();
-            const int32 sparse_index = sparse_input_start + rand_feature -
-                                       input_spec_.dense_features_size();
-            index =
-                indices(sparse_index, 1) + input_spec_.dense_features_size();
-            val = values(sparse_index);
-          }
-          CHECK(index >= 0)
-              << "sample inputs chose negative feature: " << index;
-          increment_input(split_initializations_per_input_, &it,
-                          &input_used_count);
-
-          VLOG(1) << "Over-writing @ " << output_slot << "," << split;
-          new_split_feature_rows_flat(output_slot, split) = index;
-          new_split_threshold_rows_flat(output_slot, split) = val;
-        }
-      }
-      ++output_slot;
-    }
-  }
-
- private:
-  int32 split_initializations_per_input_;
-  int32 split_sampling_random_seed_;
-  std::unique_ptr<random::PhiloxRandom> single_rand_;
-  std::unique_ptr<random::SimplePhilox> rng_;
-  tensorforest::TensorForestDataSpec input_spec_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("SampleInputs").Device(DEVICE_CPU), SampleInputs);
-
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc b/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b6d57ef952777bc204f9534e60f2ce7de3687615
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc
@@ -0,0 +1,523 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include <queue>
+
+#include "tensorflow/contrib/tensor_forest/kernels/data_spec.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_data.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_target.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/params.h"
+#include "tensorflow/contrib/tensor_forest/proto/fertile_stats.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+using gtl::FindOrNull;
+
+// Creates a stats variable.
+class CreateFertileStatsVariableOp : public OpKernel {
+ public:
+  explicit CreateFertileStatsVariableOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string serialized_params;
+    OP_REQUIRES_OK(context, context->GetAttr("params", &serialized_params));
+    ParseProtoUnlimited(&param_proto_, serialized_params);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor* stats_config_t;
+    OP_REQUIRES_OK(context, context->input("stats_config", &stats_config_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(stats_config_t->shape()),
+                errors::InvalidArgument("Stats config must be a scalar."));
+    auto* result = new FertileStatsResource(param_proto_);
+    FertileStats stats;
+    if (!ParseProtoUnlimited(&stats, stats_config_t->scalar<string>()())) {
+      result->Unref();
+      OP_REQUIRES(context, false,
+                  errors::InvalidArgument("Unable to parse stats config."));
+    }
+
+    result->ExtractFromProto(stats);
+    result->MaybeInitialize();
+
+    // Only create one, if one does not exist already. Report status for all
+    // other exceptions.
+    auto status = CreateResource(context, HandleFromInput(context, 0), result);
+    if (!status.ok() && status.code() != tensorflow::error::ALREADY_EXISTS) {
+      OP_REQUIRES(context, false, status);
+    }
+  }
+
+ private:
+  TensorForestParams param_proto_;
+};
+
+// Op for serializing a model.
+class FertileStatsSerializeOp : public OpKernel {
+ public:
+  explicit FertileStatsSerializeOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string serialized_params;
+    OP_REQUIRES_OK(context, context->GetAttr("params", &serialized_params));
+    ParseProtoUnlimited(&param_proto_, serialized_params);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    FertileStatsResource* fertile_stats_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &fertile_stats_resource));
+    mutex_lock l(*fertile_stats_resource->get_mutex());
+    core::ScopedUnref unref_me(fertile_stats_resource);
+    Tensor* output_config_t = nullptr;
+    OP_REQUIRES_OK(
+        context, context->allocate_output(0, TensorShape(), &output_config_t));
+
+    FertileStats stats;
+    fertile_stats_resource->PackToProto(&stats);
+    output_config_t->scalar<string>()() = stats.SerializeAsString();
+  }
+
+ private:
+  TensorForestParams param_proto_;
+};
+
+// Op for deserializing a stats variable from a checkpoint.
+class FertileStatsDeserializeOp : public OpKernel {
+ public:
+  explicit FertileStatsDeserializeOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string serialized_params;
+    OP_REQUIRES_OK(context, context->GetAttr("params", &serialized_params));
+    ParseProtoUnlimited(&param_proto_, serialized_params);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    FertileStatsResource* fertile_stats_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &fertile_stats_resource));
+    mutex_lock l(*fertile_stats_resource->get_mutex());
+    core::ScopedUnref unref_me(fertile_stats_resource);
+
+    const Tensor* stats_config_t;
+    OP_REQUIRES_OK(context, context->input("stats_config", &stats_config_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(stats_config_t->shape()),
+                errors::InvalidArgument("Stats config must be a scalar."));
+    // Deallocate all the previous objects on the resource.
+    fertile_stats_resource->Reset();
+    FertileStats stats;
+    OP_REQUIRES(context,
+                ParseProtoUnlimited(&stats, stats_config_t->scalar<string>()()),
+                errors::InvalidArgument("Unable to parse stats config."));
+
+    fertile_stats_resource->ExtractFromProto(stats);
+    fertile_stats_resource->MaybeInitialize();
+  }
+
+ private:
+  TensorForestParams param_proto_;
+};
+
+// Try to update a leaf's stats by acquiring its lock.  If it can't be
+// acquired, put it in a waiting queue to come back to later and try the next
+// one.  Once all leaf_ids have been visited, cycle through the waiting ids
+// until they're gone.
+void UpdateStats(FertileStatsResource* fertile_stats_resource,
+                 const std::unique_ptr<TensorDataSet>& data,
+                 const TensorInputTarget& target, int num_targets,
+                 const Tensor& leaf_ids_tensor,
+                 std::unordered_map<int32, std::unique_ptr<mutex>>* locks,
+                 mutex* set_lock, int32 start, int32 end,
+                 std::unordered_set<int32>* ready_to_split) {
+  const auto leaf_ids = leaf_ids_tensor.unaligned_flat<int32>();
+
+  // Stores leaf_id, leaf_depth, example_id for examples that are waiting
+  // on another to finish.
+  std::queue<std::tuple<int32, int32>> waiting;
+
+  int32 i = start;
+  while (i < end || !waiting.empty()) {
+    int32 leaf_id;
+    int32 example_id;
+    bool was_waiting = false;
+    if (i >= end) {
+      std::tie(leaf_id, example_id) = waiting.front();
+      waiting.pop();
+      was_waiting = true;
+    } else {
+      leaf_id = leaf_ids(i);
+      example_id = i;
+      ++i;
+    }
+    const std::unique_ptr<mutex>& leaf_lock = (*locks)[leaf_id];
+    if (was_waiting) {
+      leaf_lock->lock();
+    } else {
+      if (!leaf_lock->try_lock()) {
+        waiting.emplace(leaf_id, example_id);
+        continue;
+      }
+    }
+
+    bool is_finished;
+    fertile_stats_resource->AddExampleToStatsAndInitialize(
+        data, &target, {example_id}, leaf_id, &is_finished);
+    leaf_lock->unlock();
+    if (is_finished) {
+      set_lock->lock();
+      ready_to_split->insert(leaf_id);
+      set_lock->unlock();
+    }
+  }
+}
+
+// Update leaves from start through end in the leaf_examples iterator.
+void UpdateStatsCollated(
+    FertileStatsResource* fertile_stats_resource,
+    DecisionTreeResource* tree_resource,
+    const std::unique_ptr<TensorDataSet>& data, const TensorInputTarget& target,
+    int num_targets,
+    const std::unordered_map<int32, std::vector<int>>& leaf_examples,
+    mutex* set_lock, int32 start, int32 end,
+    std::unordered_set<int32>* ready_to_split) {
+  auto it = leaf_examples.begin();
+  std::advance(it, start);
+  auto end_it = leaf_examples.begin();
+  std::advance(end_it, end);
+  while (it != end_it) {
+    int32 leaf_id = it->first;
+    bool is_finished;
+    fertile_stats_resource->AddExampleToStatsAndInitialize(
+        data, &target, it->second, leaf_id, &is_finished);
+    if (is_finished) {
+      set_lock->lock();
+      ready_to_split->insert(leaf_id);
+      set_lock->unlock();
+    }
+    ++it;
+  }
+}
+
+// Op for traversing the tree with each example, accumulating statistics, and
+// outputting node ids that are ready to split.
+class ProcessInputOp : public OpKernel {
+ public:
+  explicit ProcessInputOp(OpKernelConstruction* context) : OpKernel(context) {
+    string serialized_params;
+    OP_REQUIRES_OK(context, context->GetAttr("params", &serialized_params));
+    ParseProtoUnlimited(&param_proto_, serialized_params);
+
+    OP_REQUIRES_OK(context, context->GetAttr("random_seed", &random_seed_));
+
+    string serialized_proto;
+    OP_REQUIRES_OK(context, context->GetAttr("input_spec", &serialized_proto));
+    input_spec_.ParseFromString(serialized_proto);
+
+    data_set_ = std::unique_ptr<TensorDataSet>(
+        new TensorDataSet(input_spec_, random_seed_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input_data = context->input(2);
+    const Tensor& sparse_input_indices = context->input(3);
+    const Tensor& sparse_input_values = context->input(4);
+    const Tensor& sparse_input_shape = context->input(5);
+    const Tensor& input_labels = context->input(6);
+    const Tensor& input_weights = context->input(7);
+    const Tensor& leaf_ids_tensor = context->input(8);
+
+    data_set_->set_input_tensors(input_data, sparse_input_indices,
+                                 sparse_input_values, sparse_input_shape);
+
+    FertileStatsResource* fertile_stats_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 1),
+                                           &fertile_stats_resource));
+    DecisionTreeResource* tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &tree_resource));
+    mutex_lock l1(*fertile_stats_resource->get_mutex());
+    mutex_lock l2(*tree_resource->get_mutex());
+
+    core::ScopedUnref unref_stats(fertile_stats_resource);
+    core::ScopedUnref unref_tree(tree_resource);
+
+    const int32 num_data = data_set_->NumItems();
+    auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
+    int num_threads = worker_threads->num_threads;
+
+    const auto leaf_ids = leaf_ids_tensor.unaligned_flat<int32>();
+
+    // Create one mutex per leaf. We need to protect access to leaf pointers,
+    // so instead of grouping examples by leaf, we spread examples out among
+    // threads to provide uniform work for each of them and protect access
+    // with mutexes.
+    std::unordered_map<int, std::unique_ptr<mutex>> locks;
+    std::unordered_map<int32, std::vector<int>> leaf_examples;
+    if (param_proto_.collate_examples()) {
+      for (int i = 0; i < num_data; ++i) {
+        leaf_examples[leaf_ids(i)].push_back(i);
+      }
+    } else {
+      for (int i = 0; i < num_data; ++i) {
+        const int32 id = leaf_ids(i);
+        if (FindOrNull(locks, id) == nullptr) {
+          // TODO(gilberth): Consider using a memory pool for these.
+          locks[id] = std::unique_ptr<mutex>(new mutex);
+        }
+      }
+    }
+
+    const int32 num_leaves = leaf_examples.size();
+    const int32 label_dim =
+        input_labels.shape().dims() <= 1
+            ? 0
+            : static_cast<int>(input_labels.shape().dim_size(1));
+    const int32 num_targets =
+        param_proto_.is_regression() ? (std::max(1, label_dim)) : 1;
+
+    // Ids of leaves that can split.
+    std::unordered_set<int32> ready_to_split;
+    mutex set_lock;
+
+    TensorInputTarget target(input_labels, input_weights, num_targets);
+
+    // TODO(gilberth): This is a rough approximation based on measurements
+    // from a digits run on local desktop.  Heuristics might be necessary
+    // if it really matters that much.
+    const int64 costPerUpdate = 1000;
+    auto update = [this, &target, &leaf_ids_tensor, &num_targets,
+                   fertile_stats_resource, &locks, &set_lock, &ready_to_split,
+                   num_data](int64 start, int64 end) {
+      CHECK(start <= end);
+      CHECK(end <= num_data);
+      UpdateStats(fertile_stats_resource, data_set_, target, num_targets,
+                  leaf_ids_tensor, &locks, &set_lock, static_cast<int32>(start),
+                  static_cast<int32>(end), &ready_to_split);
+    };
+
+    auto update_collated = [this, &target, &num_targets, fertile_stats_resource,
+                            tree_resource, &leaf_examples, &set_lock,
+                            &ready_to_split,
+                            num_leaves](int64 start, int64 end) {
+      CHECK(start <= end);
+      CHECK(end <= num_leaves);
+      UpdateStatsCollated(fertile_stats_resource, tree_resource, data_set_,
+                          target, num_targets, leaf_examples, &set_lock,
+                          static_cast<int32>(start), static_cast<int32>(end),
+                          &ready_to_split);
+    };
+
+    if (param_proto_.collate_examples()) {
+      Shard(num_threads, worker_threads->workers, num_leaves, costPerUpdate,
+            update_collated);
+    } else {
+      Shard(num_threads, worker_threads->workers, num_data, costPerUpdate,
+            update);
+    }
+
+    Tensor* output_finished_t = nullptr;
+    TensorShape output_shape;
+    output_shape.AddDim(ready_to_split.size());
+    OP_REQUIRES_OK(
+        context, context->allocate_output(0, output_shape, &output_finished_t));
+    auto output = output_finished_t->unaligned_flat<int32>();
+    std::copy(ready_to_split.begin(), ready_to_split.end(), output.data());
+  }
+
+ private:
+  int32 random_seed_;
+  tensorforest::TensorForestDataSpec input_spec_;
+  std::unique_ptr<TensorDataSet> data_set_;
+  TensorForestParams param_proto_;
+};
+
+// Op for growing finished nodes.
+class GrowTreeOp : public OpKernel {
+ public:
+  explicit GrowTreeOp(OpKernelConstruction* context) : OpKernel(context) {
+    string serialized_params;
+    OP_REQUIRES_OK(context, context->GetAttr("params", &serialized_params));
+    ParseProtoUnlimited(&param_proto_, serialized_params);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    FertileStatsResource* fertile_stats_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 1),
+                                           &fertile_stats_resource));
+    DecisionTreeResource* tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &tree_resource));
+    mutex_lock l1(*fertile_stats_resource->get_mutex());
+    mutex_lock l2(*tree_resource->get_mutex());
+
+    core::ScopedUnref unref_stats(fertile_stats_resource);
+    core::ScopedUnref unref_tree(tree_resource);
+
+    const Tensor& finished_nodes = context->input(2);
+
+    const auto finished = finished_nodes.unaligned_flat<int32>();
+
+    const int32 num_nodes =
+        static_cast<int32>(finished_nodes.shape().dim_size(0));
+
+    // This op takes so little of the time for one batch that it isn't worth
+    // threading this.
+    for (int i = 0;
+         i < num_nodes &&
+         tree_resource->decision_tree().decision_tree().nodes_size() <
+             param_proto_.max_nodes();
+         ++i) {
+      const int32 node = finished(i);
+      std::unique_ptr<SplitCandidate> best(new SplitCandidate);
+      int32 parent_depth;
+      // TODO(gilberth): Pushing these to an output would allow the complete
+      // decoupling of tree from resource.
+      bool found =
+          fertile_stats_resource->BestSplit(node, best.get(), &parent_depth);
+      if (found) {
+        std::vector<int32> new_children;
+        tree_resource->SplitNode(node, best.get(), &new_children);
+        fertile_stats_resource->Allocate(parent_depth, new_children);
+        // We are done with best, so it is now safe to clear node.
+        fertile_stats_resource->Clear(node);
+        CHECK(tree_resource->get_mutable_tree_node(node)->has_leaf() == false);
+      } else {  // reset
+        fertile_stats_resource->ResetSplitStats(node, parent_depth);
+      }
+    }
+  }
+
+ private:
+  tensorforest::TensorForestDataSpec input_spec_;
+  TensorForestParams param_proto_;
+};
+
+void FinalizeLeaf(bool is_regression, bool drop_final_class,
+                  const std::unique_ptr<LeafModelOperator>& leaf_op,
+                  decision_trees::Leaf* leaf) {
+  // regression models are already stored in leaf in normalized form.
+  if (is_regression) {
+    return;
+  }
+
+  // TODO(gilberth): Calculate the leaf's sum.
+  float sum = 0;
+  LOG(FATAL) << "FinalizeTreeOp is disabled for now.";
+  if (sum <= 0.0) {
+    LOG(WARNING) << "Leaf with sum " << sum << " has stats "
+                 << leaf->ShortDebugString();
+    return;
+  }
+
+  if (leaf->has_vector()) {
+    for (int i = 0; i < leaf->vector().value_size(); i++) {
+      auto* v = leaf->mutable_vector()->mutable_value(i);
+      v->set_float_value(v->float_value() / sum);
+    }
+    if (drop_final_class) {
+      leaf->mutable_vector()->mutable_value()->RemoveLast();
+    }
+    return;
+  }
+
+  if (leaf->has_sparse_vector()) {
+    for (auto& it : *leaf->mutable_sparse_vector()->mutable_sparse_value()) {
+      it.second.set_float_value(it.second.float_value() / sum);
+    }
+    return;
+  }
+
+  LOG(FATAL) << "Unknown leaf type in " << leaf->DebugString();
+}
+
+// Op for finalizing a tree at the end of training.
+class FinalizeTreeOp : public OpKernel {
+ public:
+  explicit FinalizeTreeOp(OpKernelConstruction* context) : OpKernel(context) {
+    string serialized_params;
+    OP_REQUIRES_OK(context, context->GetAttr("params", &serialized_params));
+    ParseProtoUnlimited(&param_proto_, serialized_params);
+
+    model_op_ = LeafModelOperatorFactory::CreateLeafModelOperator(param_proto_);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    DecisionTreeResource* tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &tree_resource));
+    FertileStatsResource* fertile_stats_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 1),
+                                           &fertile_stats_resource));
+
+    mutex_lock l1(*fertile_stats_resource->get_mutex());
+    mutex_lock l2(*tree_resource->get_mutex());
+
+    core::ScopedUnref unref_me(tree_resource);
+    core::ScopedUnref unref_stats(fertile_stats_resource);
+
+    // TODO(thomaswc): Add threads
+    int num_nodes = tree_resource->decision_tree().decision_tree().nodes_size();
+    for (int i = 0; i < num_nodes; i++) {
+      auto* node = tree_resource->mutable_decision_tree()
+                       ->mutable_decision_tree()
+                       ->mutable_nodes(i);
+      if (node->has_leaf()) {
+        FinalizeLeaf(param_proto_.is_regression(),
+                     param_proto_.drop_final_class(), model_op_,
+                     node->mutable_leaf());
+      }
+    }
+  }
+
+ private:
+  std::unique_ptr<LeafModelOperator> model_op_;
+  TensorForestParams param_proto_;
+};
+
+REGISTER_RESOURCE_HANDLE_KERNEL(FertileStatsResource);
+
+REGISTER_KERNEL_BUILDER(Name("FertileStatsIsInitializedOp").Device(DEVICE_CPU),
+                        IsResourceInitialized<FertileStatsResource>);
+
+REGISTER_KERNEL_BUILDER(Name("CreateFertileStatsVariable").Device(DEVICE_CPU),
+                        CreateFertileStatsVariableOp);
+
+REGISTER_KERNEL_BUILDER(Name("FertileStatsSerialize").Device(DEVICE_CPU),
+                        FertileStatsSerializeOp);
+
+REGISTER_KERNEL_BUILDER(Name("FertileStatsDeserialize").Device(DEVICE_CPU),
+                        FertileStatsDeserializeOp);
+
+REGISTER_KERNEL_BUILDER(Name("ProcessInputV4").Device(DEVICE_CPU),
+                        ProcessInputOp);
+
+REGISTER_KERNEL_BUILDER(Name("GrowTreeV4").Device(DEVICE_CPU), GrowTreeOp);
+
+REGISTER_KERNEL_BUILDER(Name("FinalizeTree").Device(DEVICE_CPU),
+                        FinalizeTreeOp);
+
+}  // namespace tensorforest
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/stats_ops_test.cc b/tensorflow/contrib/tensor_forest/kernels/stats_ops_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b3aa3a96f4308fd6d2c8baeddf845371a483cffe
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/stats_ops_test.cc
@@ -0,0 +1,56 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference_testutil.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+TEST(StatsOpsTest, CreateFertileStatsVariable_ShapeFn) {
+  ShapeInferenceTestOp op("CreateFertileStatsVariable");
+  INFER_OK(op, "[1];[1]", "");
+}
+
+TEST(StatsOpsTest, FertileStatsSerialize_ShapeFn) {
+  ShapeInferenceTestOp op("FertileStatsSerialize");
+  INFER_OK(op, "[1]", "[]");
+}
+
+TEST(StatsOpsTest, FertileStatsDeserialize_ShapeFn) {
+  ShapeInferenceTestOp op("FertileStatsDeserialize");
+  INFER_OK(op, "[1];[1]", "");
+}
+
+TEST(StatsOpsTest, GrowTreeV4_ShapeFn) {
+  ShapeInferenceTestOp op("GrowTreeV4");
+  INFER_OK(op, "[1];[1];?", "");
+}
+
+TEST(StatsOpsTest, ProcessInputV4_ShapeFn) {
+  ShapeInferenceTestOp op("ProcessInputV4");
+  INFER_OK(op, "[1];[1];?;?;?;?;?;?;?", "[?]");
+}
+
+TEST(StatsOpsTest, FinalizeTree_ShapeFn) {
+  ShapeInferenceTestOp op("FinalizeTree");
+  INFER_OK(op, "[1];[1]", "");
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/tensor_forest_ops_test.cc b/tensorflow/contrib/tensor_forest/kernels/tensor_forest_ops_test.cc
index d238fc1c68c831743a99533dcf75babba9687eaf..8b33e0d81910b1109b0236c8a35afde33e93e36e 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tensor_forest_ops_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/tensor_forest_ops_test.cc
@@ -24,123 +24,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-TEST(TrainingOpsTest, UpdateFertileSlots_ShapeFn) {
-  ShapeInferenceTestOp op("UpdateFertileSlots");
-  INFER_OK(op, "?;?;?;?;?;?;?;?", "[2,?];[2,?];[?];[?]");
-}
-
 TEST(TrainingOpsTest, ScatterAddNdim_ShapeFn) {
   ShapeInferenceTestOp op("ScatterAddNdim");
   INFER_OK(op, "?;?;?", "");
 }
 
-TEST(TrainingOpsTest, GrowTree_ShapeFn) {
-  ShapeInferenceTestOp op("GrowTree");
-  INFER_OK(op, "?;?;?;?;?;?", "[?];[?,2];[?];[1]");
-}
-
-TEST(TrainingOpsTest, FinishedNodes_ShapeFn) {
-  ShapeInferenceTestOp op("FinishedNodes");
-  INFER_OK(op, "?;?;?;?;?;?;?;?", "[?];[?]");
-}
-
-TEST(TrainingOpsTest, BestSplits_ShapeFn) {
-  ShapeInferenceTestOp op("BestSplits");
-  INFER_OK(op, "?;?;?;?;?;?", "[?]");
-  INFER_OK(op, "[?];?;?;?;?;?", "[d0_0]");
-  INFER_OK(op, "[1];?;?;?;?;?", "[d0_0]");
-  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "[1,2];?;?;?;?;?");
-}
-
-TEST(TrainingOpsTest, SampleInputs_ShapeFn) {
-  ShapeInferenceTestOp op("SampleInputs");
-
-  // input[7].dim(1) determines dims in the output.
-  INFER_OK(op, "?;?;?;?;?;?;?;?;?", "[?];[?,?];[?,?]");
-  INFER_OK(op, "?;?;?;?;?;?;?;[?,?];?", "[?];[?,d7_1];[?,d7_1]");
-  INFER_OK(op, "?;?;?;?;?;?;?;[1,2];?", "[?];[?,d7_1];[?,d7_1]");
-  INFER_ERROR("Shape must be rank 2 but is rank 3", op,
-              "?;?;?;?;?;?;?;[1,2,3];?");
-}
-
-TEST(TrainingOpsTest, CountExtremelyRandomStats_ShapeFn) {
-  ShapeInferenceTestOp op("CountExtremelyRandomStats");
-  TF_ASSERT_OK(NodeDefBuilder("test", "CountExtremelyRandomStats")
-                   .Input("input_data", 0, DT_FLOAT)
-                   .Input("sparse_input_indices", 1, DT_INT64)
-                   .Input("sparse_input_values", 2, DT_FLOAT)
-                   .Input("sparse_input_shape", 3, DT_INT64)
-                   .Input("input_labels", 4, DT_FLOAT)
-                   .Input("input_weights", 5, DT_FLOAT)
-                   .Input("tree", 6, DT_INT32)
-                   .Input("tree_thresholds", 7, DT_FLOAT)
-                   .Input("node_to_accumulator", 8, DT_INT32)
-                   .Input("candidate_split_features", 9, DT_INT32)
-                   .Input("candidate_split_thresholds", 10, DT_FLOAT)
-                   .Input("birth_epochs", 11, DT_INT32)
-                   .Input("current_epoch", 12, DT_INT32)
-                   .Attr("input_spec", "")
-                   .Attr("num_classes", 10)
-                   .Attr("regression", false)
-                   .Finalize(&op.node_def));
-
-  // num_points = 2, num_nodes = 4, regression = false, num_classes = 10
-  // num_nodes = 4
-  INFER_OK(op, "[2,3];?;?;?;?;?;[4];?;?;?;?;?;?",
-           "[d6_0,10];[d6_0,10];[?,3];[?];[0];[?,2];[?];[0];[d0_0]");
-
-  TF_ASSERT_OK(NodeDefBuilder("test", "CountExtremelyRandomStats")
-                   .Input("input_data", 0, DT_FLOAT)
-                   .Input("sparse_input_indices", 1, DT_INT64)
-                   .Input("sparse_input_values", 2, DT_FLOAT)
-                   .Input("sparse_input_shape", 3, DT_INT64)
-                   .Input("input_labels", 4, DT_FLOAT)
-                   .Input("input_weights", 5, DT_FLOAT)
-                   .Input("tree", 6, DT_INT32)
-                   .Input("tree_thresholds", 7, DT_FLOAT)
-                   .Input("node_to_accumulator", 8, DT_INT32)
-                   .Input("candidate_split_features", 9, DT_INT32)
-                   .Input("candidate_split_thresholds", 10, DT_FLOAT)
-                   .Input("birth_epochs", 11, DT_INT32)
-                   .Input("current_epoch", 12, DT_INT32)
-                   .Attr("input_spec", "")
-                   .Attr("num_classes", 10)
-                   .Attr("regression", true)
-                   .Finalize(&op.node_def));
-
-  // num_points = 2, num_nodes = 4, regression = false, num_classes = 10
-  // num_nodes = 4
-  INFER_OK(
-      op, "[2,3];?;?;?;?;?;[4];?;?;?;?;?;?",
-      "[d6_0,10];[d6_0,10];[?,2];[?,10];[?,10];[?,1];[?,10];[?,10];[d0_0]");
-
-  // Sparse shape known and > 1, so num_points is unknown
-  INFER_OK(op, "[2,3];?;?;[10,11];?;?;[4];?;?;?;?;?;?",
-           "[d6_0,10];[d6_0,10];[?,2];[?,10];[?,10];[?,1];[?,10];[?,10];[?]");
-}
-
-TEST(TrainingOpsTest, TreePredictions_ShapeFn) {
-  ShapeInferenceTestOp op("TreePredictions");
-  TF_ASSERT_OK(NodeDefBuilder("test", "TreePredictions")
-                   .Input("a", 0, DT_FLOAT)
-                   .Input("b", 1, DT_INT64)
-                   .Input("c", 2, DT_FLOAT)
-                   .Input("d", 3, DT_INT64)
-                   .Input("f", 5, DT_INT32)
-                   .Input("g", 6, DT_FLOAT)
-                   .Input("h", 7, DT_FLOAT)
-                   .Attr("input_spec", "")
-                   .Attr("valid_leaf_threshold", 0.5)
-                   .Finalize(&op.node_def));
-
-  // num_points = 2, num_classes = 10, sparse shape not known
-  INFER_OK(op, "[2,3];?;?;?;?;?;[1,10]", "[d0_0,9]");
-
-  // num_points = 2, num_classes = 10, sparse and dense shape rank known and > 1
-  INFER_OK(op, "[2,3];?;?;[10,11];?;?;[1,10]", "[d0_0,9]");
-
-  // num_points = 2, num_classes = 10, sparse shape rank known and > 1
-  INFER_OK(op, "?;?;?;[10,11];?;?;[1,10]", "[?,9]");
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_predictions_op.cc b/tensorflow/contrib/tensor_forest/kernels/tree_predictions_op.cc
deleted file mode 100644
index 3fe37d56a8fb21b2077b187256568021422f0cf2..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensor_forest/kernels/tree_predictions_op.cc
+++ /dev/null
@@ -1,222 +0,0 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-// TreePredictions returns the per-class probabilities for each input by
-// evaluating the given tree.
-#include <algorithm>
-
-#include "tensorflow/contrib/tensor_forest/kernels/data_spec.h"
-#include "tensorflow/contrib/tensor_forest/kernels/tree_utils.h"
-
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/shape_inference.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/util/work_sharder.h"
-
-namespace tensorflow {
-
-using tensorforest::CHILDREN_INDEX;
-using tensorforest::FEATURE_INDEX;
-using tensorforest::LEAF_NODE;
-using tensorforest::FREE_NODE;
-
-using tensorforest::CheckTensorBounds;
-
-namespace {
-// Traverse the tree for every example from start to end. Put the resulting
-// prediction probability into output_predictions[i].
-void Evaluate(OpKernelContext* context,
-              const std::function<bool(int, int32, float)>& decide,
-              const Tensor& weights, const Tensor& tree_tensor,
-              const Tensor& tree_thresholds, int valid_leaf_threshold,
-              Tensor* output_predictions, int32 start, int32 end) {
-  auto out = output_predictions->tensor<float, 2>();
-
-  const auto node_pcw = weights.tensor<float, 2>();
-  const auto tree = tree_tensor.tensor<int32, 2>();
-  const auto thresholds = tree_thresholds.unaligned_flat<float>();
-
-  const int32 num_classes = static_cast<int32>(weights.shape().dim_size(1));
-  const int32 num_nodes = static_cast<int32>(tree_tensor.shape().dim_size(0));
-
-  for (int i = start; i < end; i++) {
-    int node_index = 0;
-    int parent = -1;
-    while (true) {
-      OP_REQUIRES(context, FastBoundsCheck(node_index, num_nodes),
-                  errors::InvalidArgument("node_index not in valid range."))
-      const int32 left_child = tree(node_index, CHILDREN_INDEX);
-      if (left_child == LEAF_NODE) {
-        const int32 flat_leaf_index = node_index * num_classes + 1;
-        const int32 flat_parent_index = parent * num_classes + 1;
-        std::vector<float> means(num_classes - 1);
-        tensorforest::GetParentWeightedMean(
-            node_pcw(node_index, 0), node_pcw.data() + flat_leaf_index,
-            node_pcw(parent, 0), node_pcw.data() + flat_parent_index,
-            valid_leaf_threshold, num_classes - 1, &means);
-        const int32 start_index = i * (num_classes - 1);
-        std::copy(means.begin(), means.end(), out.data() + start_index);
-        break;
-      } else if (left_child == FREE_NODE) {
-        LOG(ERROR) << "Reached a free node, not good.";
-        return;
-      }
-      parent = node_index;
-      const int32 feature = tree(node_index, FEATURE_INDEX);
-      node_index = left_child + decide(i, feature, thresholds(node_index));
-    }
-  }
-}
-}  // namespace
-
-class TreePredictions : public OpKernel {
- public:
-  explicit TreePredictions(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr(
-      "valid_leaf_threshold", &valid_leaf_threshold_));
-
-    string serialized_proto;
-    OP_REQUIRES_OK(context, context->GetAttr("input_spec", &serialized_proto));
-    input_spec_.ParseFromString(serialized_proto);
-  }
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& input_data = context->input(0);
-    const Tensor& sparse_input_indices = context->input(1);
-    const Tensor& sparse_input_values = context->input(2);
-    const Tensor& sparse_input_shape = context->input(3);
-    const Tensor& tree_tensor = context->input(4);
-    const Tensor& tree_thresholds = context->input(5);
-    const Tensor& node_per_class_weights = context->input(6);
-
-    int32 num_data = 0;
-    if (sparse_input_indices.shape().dims() == 2) {
-      const auto sparse_shape = sparse_input_shape.unaligned_flat<int64>();
-      // TODO(gilberth): This is because we can't figure out the shape
-      // of a sparse tensor at graph-build time, even if the dimension is
-      // actually known.
-      input_spec_.mutable_sparse(0)->set_size(sparse_shape(1));
-      num_data = sparse_shape(0);
-      OP_REQUIRES(context, sparse_input_values.shape().dims() == 1,
-                  errors::InvalidArgument(
-                      "sparse_input_values should be one-dimensional"));
-      OP_REQUIRES(context, sparse_input_shape.shape().dims() == 1,
-                  errors::InvalidArgument(
-                      "sparse_input_shape should be one-dimensional"));
-      OP_REQUIRES(context,
-                  sparse_input_indices.shape().dim_size(0) ==
-                  sparse_input_values.shape().dim_size(0),
-                  errors::InvalidArgument(
-                      "sparse_input_indices and sparse_input_values should "
-                      "agree on the number of non-zero values"));
-      OP_REQUIRES(context,
-                  sparse_input_indices.shape().dim_size(1) ==
-                  sparse_input_shape.shape().dim_size(0),
-                  errors::InvalidArgument(
-                      "sparse_input_indices and sparse_input_shape should "
-                      "agree on the dimensionality of data points"));
-    }
-
-    if (input_data.shape().dim_size(0) > 0) {
-      const int32 dense_num_data =
-          static_cast<int32>(input_data.shape().dim_size(0));
-      if (num_data > 0) {
-        CHECK_EQ(num_data, dense_num_data)
-            << "number of examples must match for sparse + dense input.";
-      }
-      num_data = dense_num_data;
-      OP_REQUIRES(
-          context, input_data.shape().dims() == 2,
-          errors::InvalidArgument("input_data should be two-dimensional"));
-    }
-
-    OP_REQUIRES(context, tree_tensor.shape().dims() == 2,
-                errors::InvalidArgument(
-                    "tree should be two-dimensional"));
-    OP_REQUIRES(context, tree_thresholds.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "tree_threhsolds should be one-dimensional"));
-    OP_REQUIRES(context, node_per_class_weights.shape().dims() == 2,
-                errors::InvalidArgument(
-                    "node_pcw should be two-dimensional"));
-
-    OP_REQUIRES(
-        context,
-        tree_tensor.shape().dim_size(0) ==
-        tree_thresholds.shape().dim_size(0) &&
-        tree_tensor.shape().dim_size(0) ==
-        node_per_class_weights.shape().dim_size(0),
-        errors::InvalidArgument(
-            "Number of nodes should be the same in "
-            "tree, tree_thresholds and node_pcw."));
-
-    // Check tensor bounds.
-    if (!CheckTensorBounds(context, input_data)) return;
-    if (!CheckTensorBounds(context, sparse_input_indices)) return;
-    if (!CheckTensorBounds(context, sparse_input_values)) return;
-    if (!CheckTensorBounds(context, sparse_input_shape)) return;
-    if (!CheckTensorBounds(context, tree_tensor)) return;
-    if (!CheckTensorBounds(context, tree_thresholds)) return;
-    if (!CheckTensorBounds(context, node_per_class_weights)) return;
-
-    const int32 num_classes = static_cast<int32>(
-        node_per_class_weights.shape().dim_size(1));
-
-    Tensor* output_predictions = nullptr;
-    TensorShape output_shape;
-    output_shape.AddDim(num_data);
-    output_shape.AddDim(num_classes - 1);
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, output_shape,
-                                            &output_predictions));
-
-    // Lambdas to capture the eigen-tensors so we don't the conversion overhead
-    // on each call to DecideNode.
-    const auto get_dense = tensorforest::GetDenseFunctor(input_data);
-    const auto get_sparse = tensorforest::GetSparseFunctor(sparse_input_indices,
-                                                           sparse_input_values);
-
-    auto decide = [&get_dense, &get_sparse, this](int example, int32 feature,
-                                                  float threshold) {
-      return tensorforest::DecideNode(get_dense, get_sparse, example, feature,
-                                      threshold, input_spec_);
-    };
-
-    auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
-    int num_threads = worker_threads->num_threads;
-
-    const int64 costPerUnit = 800;
-    auto work = [context, &decide, &node_per_class_weights, &tree_tensor,
-                 &tree_thresholds, this, &output_predictions,
-                 num_data](int64 start, int64 end) {
-      CHECK(start <= end);
-      CHECK(end <= num_data);
-      Evaluate(context, decide, node_per_class_weights, tree_tensor,
-               tree_thresholds, valid_leaf_threshold_, output_predictions,
-               static_cast<int32>(start), static_cast<int32>(end));
-    };
-    Shard(num_threads, worker_threads->workers, num_data, costPerUnit, work);
-  }
-
- private:
-  float valid_leaf_threshold_;
-  tensorforest::TensorForestDataSpec input_spec_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("TreePredictions").Device(DEVICE_CPU),
-                        TreePredictions);
-
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/update_fertile_slots_op.cc b/tensorflow/contrib/tensor_forest/kernels/update_fertile_slots_op.cc
deleted file mode 100644
index 798b003a86558aac1057b2ac1f650010256cc56b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensor_forest/kernels/update_fertile_slots_op.cc
+++ /dev/null
@@ -1,370 +0,0 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-// UpdateFertileSlots manages accumulator slots.  It assigns free or newly
-// finished accumulator slots to waiting non-fertile nodes and new leaves
-// according to their existing split scores (based on node pcws).
-#include <unordered_map>
-#include <set>
-
-#include "tensorflow/contrib/tensor_forest/kernels/tree_utils.h"
-
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/shape_inference.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/lib/gtl/map_util.h"
-#include "tensorflow/core/lib/gtl/top_n.h"
-
-
-namespace tensorflow {
-
-using gtl::TopN;
-using tensorforest::CheckTensorBounds;
-using tensorforest::WeightedGiniImpurity;
-
-class UpdateFertileSlots : public OpKernel {
- public:
-  explicit UpdateFertileSlots(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr(
-      "regression", &regression_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& finished = context->input(0);
-
-    const Tensor& non_fertile_leaves =  context->input(1);
-    const Tensor& non_fertile_leaf_scores =  context->input(2);
-    const Tensor& end_of_tree = context->input(3);
-
-    const Tensor& accumulator_sums = context->input(4);
-    const Tensor& node_to_accumulator = context->input(5);
-    const Tensor& stale_leaves = context->input(6);
-    const Tensor& node_sums = context->input(7);
-
-    OP_REQUIRES(context, finished.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "finished should be one-dimensional"));
-    OP_REQUIRES(context, non_fertile_leaves.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "non_fertile_leaves should be one-dimensional"));
-    OP_REQUIRES(context, non_fertile_leaf_scores.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "non_fertile_leaves_scores should be one-dimensional"));
-    OP_REQUIRES(context, end_of_tree.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "end_of_tree should be one-dimensional"));
-    OP_REQUIRES(context, accumulator_sums.shape().dims() == 2,
-                errors::InvalidArgument(
-                    "accumulator_sums should be two-dimensional"));
-     OP_REQUIRES(context, node_to_accumulator.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "node_to_accumulator should be one-dimensional"));
-     OP_REQUIRES(context, stale_leaves.shape().dims() == 1,
-                errors::InvalidArgument(
-                    "stale_leaves should be one-dimensional"));
-
-    OP_REQUIRES(
-        context,
-        non_fertile_leaves.shape().dim_size(0) ==
-        non_fertile_leaf_scores.shape().dim_size(0),
-        errors::InvalidArgument(
-            "Number of non fertile leaves should be the same in "
-            "non_fertile_leaves and non_fertile_leaf_scores."));
-
-    // Check tensor bounds.
-    if (!CheckTensorBounds(context, finished)) return;
-    if (!CheckTensorBounds(context, non_fertile_leaves)) return;
-    if (!CheckTensorBounds(context, non_fertile_leaf_scores)) return;
-    if (!CheckTensorBounds(context, end_of_tree)) return;
-    if (!CheckTensorBounds(context, accumulator_sums)) return;
-    if (!CheckTensorBounds(context, node_to_accumulator)) return;
-    if (!CheckTensorBounds(context, stale_leaves)) return;
-
-    // Read finished accumulators into a set for quick lookup.
-    const auto node_map = node_to_accumulator.unaligned_flat<int32>();
-    const auto finished_vec = finished.unaligned_flat<int32>();
-    const int32 num_finished = static_cast<int32>(finished.shape().dim_size(0));
-    std::set<int32> finished_accumulators;
-    for (int32 i = 0; i < num_finished; ++i) {
-      const int32 node = internal::SubtleMustCopy(finished_vec(i));
-      OP_REQUIRES(
-          context, FastBoundsCheck(node, node_map.size()),
-          errors::InvalidArgument("finished node is outside the valid range"));
-      finished_accumulators.insert(node_map(node));
-    }
-    // Stale accumulators are also finished for the purposes of clearing
-    // and re-allocating.
-    const auto stale_vec = stale_leaves.unaligned_flat<int32>();
-    for (int32 i = 0; i < stale_vec.size(); ++i) {
-      const int32 node = internal::SubtleMustCopy(stale_vec(i));
-      OP_REQUIRES(
-          context, FastBoundsCheck(node, node_map.size()),
-          errors::InvalidArgument("stale node is outside the valid range"));
-      finished_accumulators.insert(node_map(node));
-    }
-
-    // Construct leaf heap to sort leaves to allocate accumulators to.
-    const int32 num_nodes =
-        static_cast<int32>(node_to_accumulator.shape().dim_size(0));
-    const int32 eot = internal::SubtleMustCopy(
-        end_of_tree.unaligned_flat<int32>()(0));
-    // end-of-tree points to one beyond the last node, so it's allowed to go
-    // up to num_nodes inclusive.
-    OP_REQUIRES(
-        context, FastBoundsCheck(eot, num_nodes + 1),
-        errors::InvalidArgument("end-of-tree is outside the valid range"));
-
-    const int32 num_new_leaves = std::min(num_finished * 2, num_nodes - eot);
-
-    LeafHeapType leaf_heap(
-        static_cast<int32>(non_fertile_leaves.shape().dim_size(0)) +
-        num_new_leaves, OrderBySecondGreater());
-    ConstructLeafHeap(
-        non_fertile_leaves, non_fertile_leaf_scores, eot, num_new_leaves,
-        static_cast<int32>(accumulator_sums.shape().dim_size(1)), &leaf_heap);
-
-    const auto sums = node_sums.unaligned_flat<float>();
-    const int32 num_columns = node_sums.shape().dim_size(1);
-    // Allocate leaves.
-    std::unique_ptr<HeapValuesType> values(
-        leaf_heap.Extract());
-    int32 accumulator = -1;  // This will first get incremented to 0.
-    int32 num_accumulators_allocated = 0;
-    std::unordered_map<int32, int32> accumulators_to_node;
-    FindNextAccumulator(accumulator_sums, finished_accumulators, &accumulator);
-    int32 i = 0;
-    for (; i < values->size(); ++i) {
-      const std::pair<int32, float>& node = (*values)[i];
-      if (accumulator < 0) {
-        VLOG(1) << "No allocators left.";
-        break;
-      }
-      // For classification, don't make a node fertile until it is unpure.
-      if (!regression_) {
-        // Add 1 here because index 0 contains the sum of the weights across
-        // classes.
-        Eigen::array<int, 1> offsets = {node.first * num_columns + 1};
-        Eigen::array<int, 1> extents = {num_columns - 1};
-        const auto node_counts = sums.slice(offsets, extents);
-        // TODO(thomaswc): Implement a faster check for pure nodes.
-        if (tensorforest::RawWeightedGiniImpurity(node_counts) == 0) {
-          continue;
-        }
-      }
-      VLOG(1) << "setting node " << node.first << " to accumulator "
-              << accumulator;
-      ++num_accumulators_allocated;
-      accumulators_to_node[accumulator] = node.first;
-
-      FindNextAccumulator(accumulator_sums, finished_accumulators,
-                          &accumulator);
-    }
-
-    // Construct and fill outputs.
-    SetNodeMapUpdates(finished_accumulators, accumulators_to_node, finished,
-                      stale_leaves, context);
-    SetAccumulatorsCleared(finished_accumulators,
-                           accumulators_to_node, context);
-    SetAccumulatorsAllocated(accumulators_to_node, context);
-  }
-
- private:
-  struct OrderBySecondGreater {
-    bool operator()(const std::pair<int32, float> &left,
-                    const std::pair<int32, float> &right) {
-        return left.second > right.second;
-    }
-  };
-
-  typedef TopN<std::pair<int32, float>, OrderBySecondGreater> LeafHeapType;
-  typedef std::vector<std::pair<int32, float>> HeapValuesType;
-
-  // Creates an update tensor for the node to accumulator and accumulator to
-  // node maps.  Sets finished and stale nodes to -1 (no accumulator assigned)
-  // and newly allocated nodes to their accumulator.  De-allocated accumulators
-  // are also set to -1.
-  void SetNodeMapUpdates(
-      const std::set<int32>& finished_accumulators,
-      const std::unordered_map<int32, int32>& accumulators_to_node,
-      const Tensor& finished, const Tensor& stale, OpKernelContext* context) {
-    // Node-to-accumulator map updates.
-    Tensor* output_n2a_map = nullptr;
-    TensorShape n2a_map_shape;
-    n2a_map_shape.AddDim(2);
-    n2a_map_shape.AddDim(accumulators_to_node.size() +
-                         static_cast<int32>(stale.shape().dim_size(0) +
-                                            finished.shape().dim_size(0)));
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, n2a_map_shape, &output_n2a_map));
-
-    // Calculate how many finished accumulators were not re-used, so that
-    // we can properly size the a2n output.
-    std::vector<int32> totally_finished_accumulators;
-    for (const int32 finished_accumulator : finished_accumulators) {
-      if (!gtl::FindOrNull(accumulators_to_node, finished_accumulator)) {
-        totally_finished_accumulators.push_back(finished_accumulator);
-      }
-    }
-
-    // Accumulator-to-node map updates.
-    Tensor* output_a2n_map = nullptr;
-    TensorShape a2n_map_shape;
-    a2n_map_shape.AddDim(2);
-    a2n_map_shape.AddDim(accumulators_to_node.size() +
-                         totally_finished_accumulators.size());
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, a2n_map_shape, &output_a2n_map));
-
-    auto out_n2a = output_n2a_map->tensor<int32, 2>();
-    auto out_a2n = output_a2n_map->tensor<int32, 2>();
-    int32 n2a_slot = 0;
-    int32 a2n_slot = 0;
-
-    // Set finished nodes to -1.
-    const auto finished_vec = finished.unaligned_flat<int32>();
-    for (int32 i = 0; i < finished_vec.size(); ++i) {
-      out_n2a(0, n2a_slot) = finished_vec(i);
-      out_n2a(1, n2a_slot) = -1;
-      ++n2a_slot;
-    }
-    // Set stale nodes to -1.
-    const auto stale_vec = stale.unaligned_flat<int32>();
-    for (int32 i = 0; i < stale_vec.size(); ++i) {
-      out_n2a(0, n2a_slot) = stale_vec(i);
-      out_n2a(1, n2a_slot) = -1;
-      ++n2a_slot;
-    }
-
-    for (const int32 finished_accumulator : totally_finished_accumulators) {
-      out_a2n(0, a2n_slot) = finished_accumulator;
-      out_a2n(1, a2n_slot) = -1;
-      ++a2n_slot;
-    }
-
-    // Set newly allocated nodes to their allocator.
-    for (const auto& node_alloc_pair : accumulators_to_node) {
-      VLOG(1) << "a2n[" << node_alloc_pair.first
-              << "] = " << node_alloc_pair.second;
-      out_n2a(0, n2a_slot) = node_alloc_pair.second;
-      out_n2a(1, n2a_slot) = node_alloc_pair.first;
-      ++n2a_slot;
-
-      out_a2n(0, a2n_slot) = node_alloc_pair.first;
-      out_a2n(1, a2n_slot) = node_alloc_pair.second;
-      ++a2n_slot;
-    }
-  }
-
-  // Creates output tensor for cleared accumulators. Cleared accumulators are
-  // those that were finished but not re-allocated.
-  void SetAccumulatorsCleared(
-      const std::set<int32>& finished_accumulators,
-      const std::unordered_map<int32, int32>& accumulators_to_node,
-      OpKernelContext* context) {
-    std::set<int32> cleared;
-    for (const int32 node : finished_accumulators) {
-      if (accumulators_to_node.find(node) == accumulators_to_node.end()) {
-        cleared.insert(node);
-      }
-    }
-
-    Tensor* output_cleared = nullptr;
-    TensorShape cleared_shape;
-    cleared_shape.AddDim(cleared.size());
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(2, cleared_shape, &output_cleared));
-
-    auto out = output_cleared->unaligned_flat<int32>();
-
-    int32 i = 0;
-    for (const int32 accumulator : cleared) {
-      out(i) = accumulator;
-      ++i;
-    }
-  }
-
-  // Creates output tensor for accumulators that were allocated to now-fertile
-  // nodes.
-  void SetAccumulatorsAllocated(
-      const std::unordered_map<int32, int32>& accumulators_to_node,
-      OpKernelContext* context) {
-    // Node map updates.
-    Tensor* output_allocated = nullptr;
-    TensorShape allocated_shape;
-    allocated_shape.AddDim(accumulators_to_node.size());
-    OP_REQUIRES_OK(context, context->allocate_output(3, allocated_shape,
-                                                     &output_allocated));
-
-    auto out = output_allocated->unaligned_flat<int32>();
-    int32 output_slot = 0;
-
-    // Set newly allocated nodes to their allocator.
-    for (const auto& node_alloc_pair : accumulators_to_node) {
-      out(output_slot) = node_alloc_pair.first;
-      ++output_slot;
-    }
-  }
-
-  void ConstructLeafHeap(const Tensor& non_fertile_leaves,
-                         const Tensor& non_fertile_leaf_scores,
-                         int32 end_of_tree, int32 num_new_leaves,
-                         int32 num_classes, LeafHeapType* leaf_heap) {
-    const auto leaf_vec = non_fertile_leaves.unaligned_flat<int32>();
-    const auto leaf_score_vec = non_fertile_leaf_scores.unaligned_flat<float>();
-
-    for (int32 i = 0; i < leaf_vec.size(); i++) {
-      const int32 leaf = internal::SubtleMustCopy(leaf_vec(i));
-      // Filter out leaves < 0, non_fertile_nodes can contain garbage at
-      // startup.
-      if (leaf >= 0) {
-        leaf_heap->push(std::make_pair(leaf, leaf_score_vec(i)));
-      }
-    }
-
-    // Add new leaves.
-    Eigen::Tensor<float, 1, 1> zeros(num_classes - 1);
-    zeros.setZero();
-    // No data is 0 variance (for regression), not necessarily so for
-    // gini (classification).
-    const float zero_score = regression_ ? 0.0 : WeightedGiniImpurity(zeros);
-    for (int32 leaf = end_of_tree; leaf < end_of_tree + num_new_leaves;
-         leaf++) {
-      leaf_heap->push(std::make_pair(leaf, zero_score));
-    }
-  }
-
-  // Finds the next available or newly-finished accumulator.
-  void FindNextAccumulator(Tensor totals_tensor,
-                           const std::set<int32>& finished_accumulators,
-                           int* current) {
-    ++(*current);
-    const auto totals = totals_tensor.tensor<float, 2>();
-    for (; *current < totals_tensor.shape().dim_size(0); ++(*current)) {
-      if (totals(*current, 0) < 0 ||
-          finished_accumulators.find(*current) != finished_accumulators.end()) {
-        return;
-      }
-    }
-    *current = -1;
-  }
-
-  bool regression_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("UpdateFertileSlots").Device(DEVICE_CPU),
-                        UpdateFertileSlots);
-
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/BUILD b/tensorflow/contrib/tensor_forest/kernels/v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..d14d4d2d6b1a0e87688bcb14317ba6aecf858d11
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/BUILD
@@ -0,0 +1,243 @@
+# TensorFlow code for training random forests.
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**/*"]),
+)
+
+cc_library(
+    name = "decision-tree-resource",
+    srcs = ["decision-tree-resource.cc"],
+    hdrs = ["decision-tree-resource.h"],
+    deps = [
+        ":decision_node_evaluator",
+        ":input_data",
+        ":leaf_model_operators",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_cc",
+        "//tensorflow/contrib/tensor_forest/proto:fertile_stats_proto_cc",
+        "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
+cc_library(
+    name = "fertile-stats-resource",
+    srcs = ["fertile-stats-resource.cc"],
+    hdrs = ["fertile-stats-resource.h"],
+    deps = [
+        ":decision_node_evaluator",
+        ":input_data",
+        ":input_target",
+        ":leaf_model_operators",
+        ":split_collection_operators",
+        "//tensorflow/contrib/tensor_forest/proto:fertile_stats_proto_cc",
+        "//tensorflow/contrib/tensor_forest/proto:tensor_forest_params_proto_cc",
+        "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
+cc_library(
+    name = "input_data",
+    srcs = ["input_data.cc"],
+    hdrs = ["input_data.h"],
+    deps = [
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_cc",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_extensions_cc",
+        "//tensorflow/contrib/tensor_forest:tree_utils",
+        "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
+cc_library(
+    name = "input_target",
+    hdrs = ["input_target.h"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+    ],
+)
+
+cc_library(
+    name = "leaf_model_operators",
+    srcs = ["leaf_model_operators.cc"],
+    hdrs = ["leaf_model_operators.h"],
+    deps = [
+        ":input_target",
+        ":params",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_cc",
+        "//tensorflow/contrib/tensor_forest/proto:fertile_stats_proto_cc",
+        "//tensorflow/contrib/tensor_forest/proto:tensor_forest_params_proto_cc",
+    ],
+)
+
+cc_test(
+    name = "leaf_model_operators_test",
+    srcs = ["leaf_model_operators_test.cc"],
+    deps = [
+        ":leaf_model_operators",
+        ":test_utils",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_cc",
+        "//tensorflow/contrib/tensor_forest/proto:tensor_forest_params_proto_cc",
+        "//tensorflow/core",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "grow_stats",
+    srcs = ["grow_stats.cc"],
+    hdrs = ["grow_stats.h"],
+    deps = [
+        ":decision_node_evaluator",
+        ":input_data",
+        ":input_target",
+        ":params",
+        ":stat_utils",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_cc",
+        "//tensorflow/contrib/tensor_forest:tree_utils",
+        "//tensorflow/contrib/tensor_forest/proto:fertile_stats_proto_cc",
+        "//tensorflow/contrib/tensor_forest/proto:tensor_forest_params_proto_cc",
+        "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
+cc_test(
+    name = "grow_stats_test",
+    srcs = ["grow_stats_test.cc"],
+    deps = [
+        ":grow_stats",
+        ":test_utils",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_cc",
+        "//tensorflow/contrib/tensor_forest/proto:tensor_forest_params_proto_cc",
+        "//tensorflow/core",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "candidate_graph_runner",
+    srcs = ["candidate_graph_runner.cc"],
+    hdrs = ["candidate_graph_runner.h"],
+    deps = [
+        ":input_data",
+        ":input_target",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_cc",
+        "//tensorflow/contrib/tensor_forest/proto:fertile_stats_proto_cc",
+        "//tensorflow/core:core_cpu",
+    ],
+)
+
+cc_library(
+    name = "decision_node_evaluator",
+    srcs = ["decision_node_evaluator.cc"],
+    hdrs = ["decision_node_evaluator.h"],
+    deps = [
+        ":input_data",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_cc",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_extensions_cc",
+        "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
+cc_test(
+    name = "decision_node_evaluator_test",
+    srcs = ["decision_node_evaluator_test.cc"],
+    deps = [
+        ":decision_node_evaluator",
+        ":test_utils",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_cc",
+        "//tensorflow/core",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "split_collection_operators",
+    srcs = ["split_collection_operators.cc"],
+    hdrs = ["split_collection_operators.h"],
+    deps = [
+        ":grow_stats",
+        ":input_data",
+        ":input_target",
+        ":leaf_model_operators",
+        ":params",
+        ":stat_utils",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_cc",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_extensions_cc",
+        "//tensorflow/contrib/tensor_forest:tree_utils",
+        "//tensorflow/contrib/tensor_forest/proto:fertile_stats_proto_cc",
+        "//tensorflow/contrib/tensor_forest/proto:tensor_forest_params_proto_cc",
+    ],
+)
+
+cc_library(
+    name = "graph_collection_operator",
+    srcs = ["graph_collection_operator.cc"],
+    hdrs = ["graph_collection_operator.h"],
+    deps = [
+        ":candidate_graph_runner",
+        ":grow_stats",
+        ":input_data",
+        ":input_target",
+        ":leaf_model_operators",
+        ":params",
+        ":split_collection_operators",
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_cc",
+        "//tensorflow/contrib/tensor_forest:tree_utils",
+        "//tensorflow/contrib/tensor_forest/proto:fertile_stats_proto_cc",
+        "//tensorflow/contrib/tensor_forest/proto:tensor_forest_params_proto_cc",
+    ],
+)
+
+cc_library(
+    name = "stat_utils",
+    srcs = ["stat_utils.cc"],
+    hdrs = ["stat_utils.h"],
+    deps = [
+        "//tensorflow/contrib/decision_trees/proto:generic_tree_model_cc",
+        "//tensorflow/contrib/tensor_forest/proto:fertile_stats_proto_cc",
+        "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
+cc_library(
+    name = "test_utils",
+    hdrs = ["test_utils.h"],
+    deps = [
+        ":input_data",
+        ":input_target",
+    ],
+)
+
+cc_library(
+    name = "params",
+    srcs = ["params.cc"],
+    hdrs = ["params.h"],
+    deps = [
+        "//tensorflow/contrib/tensor_forest/proto:tensor_forest_params_proto_cc",
+        "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
+cc_test(
+    name = "params_test",
+    srcs = ["params_test.cc"],
+    deps = [
+        ":params",
+        "//tensorflow/contrib/tensor_forest/proto:tensor_forest_params_proto_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..81e2a1b2a1b720574210e376fa786923367794a6
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc
@@ -0,0 +1,137 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h"
+
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+// Names of ops in the graph to run.
+constexpr char kInitializeOp[] = "init";
+constexpr char kAddExampleOp[] = "add_example";
+constexpr char kSplitScoreName[] = "split_score";
+constexpr char kGetSplitName[] = "get_split";
+constexpr char kGetLeftStatsName[] = "get_left_stats";
+constexpr char kGetRightStatsName[] = "get_right_stats";
+
+// Names of files written by python graph builder.
+constexpr char kGraphFilename[] = "graph";
+constexpr char kSaverDefFilename[] = "saver";
+constexpr char kMetaDefFilename[] = "meta";
+
+// Names of Tensor inputs.
+constexpr char kFeaturesName[] = "features";
+constexpr char kInputDataName[] = "input_data";
+constexpr char kTargetsName[] = "targets";
+constexpr char kExamplesName[] = "examples";
+
+constexpr char kNoOp[] = "none";
+
+CandidateGraphRunner::CandidateGraphRunner(
+    const string& graph_dir, const decision_trees::BinaryNode& split)
+    : split_(split) {
+  // read graph from file.
+  GraphDef graph_def;
+  TF_CHECK_OK(ReadBinaryProto(
+      Env::Default(), io::JoinPath(graph_dir, kGraphFilename), &graph_def))
+      << "Could not read graph def.";
+
+  // create session.
+  session_.reset(::tensorflow::NewSession(SessionOptions()));
+  TF_CHECK_OK(session_->Create(graph_def)) << "Failed to create session";
+
+  // Features don't change, store them in a tensor.
+  const auto& oblique = split.inequality_left_child_test().oblique();
+  const int32 feat_size = oblique.features_size();
+  features_.reset(
+      new Tensor(tensorflow::DT_INT32, TensorShape({feat_size})));
+  auto feat = features_->flat<int32>();
+  int i = 0;
+  for (const auto& id : oblique.features()) {
+    safe_strto32(id.id().value(), &feat(i++));
+  }
+}
+
+void CandidateGraphRunner::RunOp(
+    const string& name, const TensorNameValueList& inputs,
+    const std::vector<string>& output_tensor_names,
+    std::vector<Tensor>* outputs) {
+  std::vector<string> op_name;
+  if (name != kNoOp) {
+    op_name.push_back(name);
+  }
+  TF_CHECK_OK(session_->Run(inputs, output_tensor_names, op_name, outputs))
+      << "Failed to run: " << name;
+}
+
+void CandidateGraphRunner::Init() {
+  RunOp(kInitializeOp, TensorNameValueList(), std::vector<string>(), nullptr);
+}
+
+void CandidateGraphRunner::AddExample(const Tensor& input_data,
+                                      const Tensor& target,
+                                      const Tensor& examples) {
+  TensorNameValueList inputs;
+  inputs.emplace_back(kFeaturesName, *features_);
+  inputs.emplace_back(kExamplesName, examples);
+  inputs.emplace_back(kInputDataName, input_data);
+  inputs.emplace_back(kTargetsName, target);
+
+  RunOp(kAddExampleOp, inputs, std::vector<string>(), nullptr);
+}
+
+float CandidateGraphRunner::SplitScore() {
+  std::vector<Tensor> outputs;
+  RunOp(kNoOp, TensorNameValueList(), {kSplitScoreName}, &outputs);
+  return outputs[0].unaligned_flat<float>()(0);
+}
+
+void CandidateGraphRunner::GetSplit(decision_trees::BinaryNode* node) {
+  std::vector<Tensor> outputs;
+  RunOp(kNoOp, TensorNameValueList(), {kGetSplitName}, &outputs);
+  ParseProtoUnlimited(node, outputs[0].unaligned_flat<string>()(0));
+  const auto& oblique = split_.inequality_left_child_test().oblique();
+  auto* new_split =
+      node->mutable_inequality_left_child_test()->mutable_oblique();
+  for (const auto& id : oblique.features()) {
+    *new_split->add_features() = id;
+  }
+}
+
+void CandidateGraphRunner::GetLeftStats(LeafStat* stats) {
+  std::vector<Tensor> outputs;
+  RunOp(kNoOp, TensorNameValueList(), {kGetLeftStatsName}, &outputs);
+  const auto& counts = outputs[0].unaligned_flat<float>();
+  auto* dense = stats->mutable_classification()->mutable_dense_counts();
+  for (int i = 0; i < counts.size(); ++i) {
+    dense->add_value()->set_float_value(counts(i));
+  }
+}
+
+void CandidateGraphRunner::GetRightStats(LeafStat* stats) {
+  std::vector<Tensor> outputs;
+  RunOp(kNoOp, TensorNameValueList(), {kGetRightStatsName}, &outputs);
+  const auto& counts = outputs[0].unaligned_flat<float>();
+  auto* dense = stats->mutable_classification()->mutable_dense_counts();
+  for (int i = 0; i < counts.size(); ++i) {
+    dense->add_value()->set_float_value(counts(i));
+  }
+}
+
+}  // namespace tensorforest
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h
new file mode 100644
index 0000000000000000000000000000000000000000..4bd1f06c72945f73e50301c337692e0b510d3693
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h
@@ -0,0 +1,73 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_CANDIDATE_GRAPH_RUNNER_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_CANDIDATE_GRAPH_RUNNER_H_
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_data.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_target.h"
+#include "tensorflow/contrib/tensor_forest/proto/fertile_stats.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+typedef std::vector<std::pair<string, ::tensorflow::Tensor>>
+    TensorNameValueList;
+
+// Class that represents one split candidate, and can perform operations
+// on a session created from a graph.
+class CandidateGraphRunner {
+ public:
+  // split should contain the features that are being used.
+  CandidateGraphRunner(const string& graph_dir,
+                       const decision_trees::BinaryNode& split);
+
+  // Input the given data and target Tensors to the add_example op.
+  void AddExample(const Tensor& input_data, const Tensor& target,
+                  const Tensor& examples);
+
+  // Get the candidates' split score with the split_score op.
+  float SplitScore();
+
+  // Fills in the split in node with weights and threshold.
+  void GetSplit(decision_trees::BinaryNode* node);
+
+  // Fills in the stats for the left-branch taken.
+  void GetLeftStats(LeafStat* stats);
+
+  // Fills in the stats for the right-branch taken.
+  void GetRightStats(LeafStat* stats);
+
+  // Initializes variables, must be run before other ops.
+  void Init();
+
+ protected:
+  void RunOp(const string& name, const TensorNameValueList& inputs,
+             const std::vector<string>& output_tensor_names,
+             std::vector<Tensor>* outputs);
+
+  std::unique_ptr<Session> session_;
+  decision_trees::BinaryNode split_;
+  std::unique_ptr<Tensor> features_;
+};
+
+}  // namespace tensorforest
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_CANDIDATE_GRAPH_RUNNER_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.cc b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.cc
new file mode 100644
index 0000000000000000000000000000000000000000..881e4339a751e440716018c786733d934b2a8e26
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.cc
@@ -0,0 +1,91 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+using decision_trees::DecisionTree;
+using decision_trees::Leaf;
+using decision_trees::TreeNode;
+
+int32 DecisionTreeResource::TraverseTree(
+    const std::unique_ptr<TensorDataSet>& input_data, int example,
+    int32* leaf_depth) const {
+  const DecisionTree& tree = decision_tree_->decision_tree();
+  int32 current_id = 0;
+  int32 depth = 0;
+  while (true) {
+    const TreeNode& current = tree.nodes(current_id);
+    if (current.has_leaf()) {
+      if (leaf_depth != nullptr) {
+        *leaf_depth = depth;
+      }
+      return current_id;
+    }
+    ++depth;
+    const int32 next_id =
+        node_evaluators_[current_id]->Decide(input_data, example);
+    current_id = tree.nodes(next_id).node_id().value();
+  }
+}
+
+void DecisionTreeResource::SplitNode(int32 node_id, SplitCandidate* best,
+                                     std::vector<int32>* new_children) {
+  DecisionTree* tree = decision_tree_->mutable_decision_tree();
+  TreeNode* node = tree->mutable_nodes(node_id);
+  int32 newid = tree->nodes_size();
+
+  // left
+  new_children->push_back(newid);
+  TreeNode* new_left = tree->add_nodes();
+  new_left->mutable_node_id()->set_value(newid++);
+  Leaf* left_leaf = new_left->mutable_leaf();
+  model_op_->ExportModel(best->left_stats(), left_leaf);
+
+  // right
+  new_children->push_back(newid);
+  TreeNode* new_right = tree->add_nodes();
+  new_right->mutable_node_id()->set_value(newid);
+  Leaf* right_leaf = new_right->mutable_leaf();
+  model_op_->ExportModel(best->right_stats(), right_leaf);
+
+  node->clear_leaf();
+  node->mutable_binary_node()->Swap(best->mutable_split());
+  node->mutable_binary_node()->mutable_left_child_id()->set_value(newid - 1);
+  node->mutable_binary_node()->mutable_right_child_id()->set_value(newid);
+  while (node_evaluators_.size() <= node_id) {
+    node_evaluators_.emplace_back(nullptr);
+  }
+  node_evaluators_[node_id] = CreateDecisionNodeEvaluator(*node);
+}
+
+void DecisionTreeResource::MaybeInitialize() {
+  DecisionTree* tree = decision_tree_->mutable_decision_tree();
+  if (tree->nodes_size() == 0) {
+    model_op_->InitModel(tree->add_nodes()->mutable_leaf());
+  } else if (node_evaluators_.empty()) {  // reconstruct evaluators
+    for (const auto& node : tree->nodes()) {
+      if (node.has_leaf()) {
+        node_evaluators_.emplace_back(nullptr);
+      } else {
+        node_evaluators_.push_back(CreateDecisionNodeEvaluator(node));
+      }
+    }
+  }
+}
+
+}  // namespace tensorforest
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..438d3d817c49f6c228dfab04659026afc2fb78e9
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
@@ -0,0 +1,94 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_TREE_RESOURCE_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_TREE_RESOURCE_H_
+
+#include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_data.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.h"
+#include "tensorflow/contrib/tensor_forest/proto/fertile_stats.pb.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+
+// Keep a tree ensemble in memory for efficient evaluation and mutation.
+class DecisionTreeResource : public ResourceBase {
+ public:
+  // Constructor.
+  explicit DecisionTreeResource(const TensorForestParams& params)
+      : params_(params), decision_tree_(new decision_trees::Model()) {
+    model_op_ = LeafModelOperatorFactory::CreateLeafModelOperator(params_);
+  }
+
+  string DebugString() override {
+    return strings::StrCat("DecisionTree[size=",
+                           decision_tree_->decision_tree().nodes_size(),
+                           "]");
+  }
+
+  void MaybeInitialize();
+
+  const decision_trees::Model& decision_tree() const {
+    return *decision_tree_;
+  }
+
+  decision_trees::Model* mutable_decision_tree() {
+    return decision_tree_.get();
+  }
+
+  const decision_trees::Leaf& get_leaf(int32 id) const {
+    return decision_tree_->decision_tree().nodes(id).leaf();
+  }
+
+  decision_trees::TreeNode* get_mutable_tree_node(int32 id) {
+    return decision_tree_->mutable_decision_tree()->mutable_nodes(id);
+  }
+
+  // Resets the resource and frees the proto.
+  // Caller needs to hold the mutex lock while calling this.
+  void Reset() {
+    decision_tree_.reset(new decision_trees::Model());
+  }
+
+  mutex* get_mutex() { return &mu_; }
+
+  // Return the TreeNode for the leaf that the example ends up at according
+  // to decsion_tree_. Also fill in that leaf's depth if it isn't nullptr.
+  int32 TraverseTree(const std::unique_ptr<TensorDataSet>& input_data,
+                     int example, int32* depth) const;
+
+  // Split the given node_id, turning it from a Leaf to a BinaryNode and
+  // setting it's split to the given best.  Add new children ids to
+  // new_children.
+  void SplitNode(int32 node_id, SplitCandidate* best,
+                 std::vector<int32>* new_children);
+
+ private:
+  mutex mu_;
+  const TensorForestParams params_;
+  std::unique_ptr<decision_trees::Model> decision_tree_;
+  std::shared_ptr<LeafModelOperator> model_op_;
+  std::vector<std::unique_ptr<DecisionNodeEvaluator>> node_evaluators_;
+};
+
+
+}  // namespace tensorforest
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_TREE_RESOURCE_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.cc b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7e25579070eef13682dedfcd3c9e435333f65687
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.cc
@@ -0,0 +1,120 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+std::unique_ptr<DecisionNodeEvaluator> CreateDecisionNodeEvaluator(
+    const decision_trees::TreeNode& node) {
+  const decision_trees::BinaryNode& bnode = node.binary_node();
+  return CreateBinaryDecisionNodeEvaluator(bnode, bnode.left_child_id().value(),
+                                           bnode.right_child_id().value());
+}
+
+std::unique_ptr<DecisionNodeEvaluator> CreateBinaryDecisionNodeEvaluator(
+    const decision_trees::BinaryNode& bnode, int32 left, int32 right) {
+  if (bnode.has_inequality_left_child_test()) {
+    const auto& test = bnode.inequality_left_child_test();
+    if (test.has_oblique()) {
+      return std::unique_ptr<ObliqueInequalityDecisionNodeEvaluator>(
+          new ObliqueInequalityDecisionNodeEvaluator(test, left, right));
+    } else {
+      return std::unique_ptr<InequalityDecisionNodeEvaluator>(
+          new InequalityDecisionNodeEvaluator(test, left, right));
+    }
+  } else {
+    decision_trees::MatchingValuesTest test;
+    if (bnode.custom_left_child_test().UnpackTo(&test)) {
+      return std::unique_ptr<MatchingValuesDecisionNodeEvaluator>(
+          new MatchingValuesDecisionNodeEvaluator(test, left, right));
+    } else {
+      LOG(ERROR) << "Unknown split test: " << bnode.DebugString();
+      return nullptr;
+    }
+  }
+}
+
+InequalityDecisionNodeEvaluator::InequalityDecisionNodeEvaluator(
+    const decision_trees::InequalityTest& test, int32 left, int32 right)
+    : BinaryDecisionNodeEvaluator(left, right) {
+  safe_strto32(test.feature_id().id().value(), &feature_num_);
+  threshold_ = test.threshold().float_value();
+  include_equals_ =
+      test.type() == decision_trees::InequalityTest::LESS_OR_EQUAL;
+}
+
+int32 InequalityDecisionNodeEvaluator::Decide(
+    const std::unique_ptr<TensorDataSet>& dataset, int example) const {
+  const float val = dataset->GetExampleValue(example, feature_num_);
+  if (val < threshold_ || (include_equals_ && val == threshold_)) {
+    return left_child_id_;
+  } else {
+    return right_child_id_;
+  }
+}
+
+ObliqueInequalityDecisionNodeEvaluator::ObliqueInequalityDecisionNodeEvaluator(
+    const decision_trees::InequalityTest& test, int32 left, int32 right)
+    : BinaryDecisionNodeEvaluator(left, right) {
+  for (int i = 0; i < test.oblique().features_size(); ++i) {
+    int32 val;
+    safe_strto32(test.oblique().features(i).id().value(), &val);
+    feature_num_.push_back(val);
+    feature_weights_.push_back(test.oblique().weights(i));
+  }
+  threshold_ = test.threshold().float_value();
+}
+
+int32 ObliqueInequalityDecisionNodeEvaluator::Decide(
+    const std::unique_ptr<TensorDataSet>& dataset, int example) const {
+  float val = 0;
+  for (int i = 0; i < feature_num_.size(); ++i) {
+    val += feature_weights_[i] *
+           dataset->GetExampleValue(example, feature_num_[i]);
+  }
+
+  if (val <= threshold_) {
+    return left_child_id_;
+  } else {
+    return right_child_id_;
+  }
+}
+
+MatchingValuesDecisionNodeEvaluator::MatchingValuesDecisionNodeEvaluator(
+    const decision_trees::MatchingValuesTest& test, int32 left, int32 right)
+    : BinaryDecisionNodeEvaluator(left, right) {
+  safe_strto32(test.feature_id().id().value(), &feature_num_);
+  for (const auto& val : test.value()) {
+    values_.push_back(val.float_value());
+  }
+  inverse_ = test.inverse();
+}
+
+int32 MatchingValuesDecisionNodeEvaluator::Decide(
+    const std::unique_ptr<TensorDataSet>& dataset, int example) const {
+  const float val = dataset->GetExampleValue(example, feature_num_);
+  for (float testval : values_) {
+    if (val == testval) {
+      return inverse_ ? right_child_id_ : left_child_id_;
+    }
+  }
+
+  return inverse_ ? left_child_id_ : right_child_id_;
+}
+
+}  // namespace tensorforest
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f03c2d05bb1090fa75f4b6e7ad4f00caaea61a4
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
@@ -0,0 +1,107 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_NODE_EVALUATOR_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_NODE_EVALUATOR_H_
+
+#include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
+#include "tensorflow/contrib/decision_trees/proto/generic_tree_model_extensions.pb.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_data.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+
+// Base class for evaluators of decision nodes that effectively copy proto
+// contents into C++ structures for faster execution.
+class DecisionNodeEvaluator {
+ public:
+  virtual ~DecisionNodeEvaluator() {}
+
+  // Returns the index of the child node.
+  virtual int32 Decide(const std::unique_ptr<TensorDataSet>& dataset,
+                       int example) const = 0;
+};
+
+// An evaluator for binary decisions with left and right children.
+class BinaryDecisionNodeEvaluator : public DecisionNodeEvaluator {
+ protected:
+  BinaryDecisionNodeEvaluator(int32 left, int32 right)
+      : left_child_id_(left), right_child_id_(right) {}
+
+  int32 left_child_id_;
+  int32 right_child_id_;
+};
+
+// Evaluator for basic inequality decisions (f[x] <= T).
+class InequalityDecisionNodeEvaluator : public BinaryDecisionNodeEvaluator {
+ public:
+  InequalityDecisionNodeEvaluator(const decision_trees::InequalityTest& test,
+                                  int32 left, int32 right);
+
+  int32 Decide(const std::unique_ptr<TensorDataSet>& dataset,
+               int example) const override;
+
+ protected:
+  int32 feature_num_;
+  float threshold_;
+
+  // If decision is '<=' as opposed to '<'.
+  bool include_equals_;
+};
+
+// Evalutor for splits with multiple weighted features.
+class ObliqueInequalityDecisionNodeEvaluator
+    : public BinaryDecisionNodeEvaluator {
+ public:
+  ObliqueInequalityDecisionNodeEvaluator(
+      const decision_trees::InequalityTest& test, int32 left, int32 right);
+
+  int32 Decide(const std::unique_ptr<TensorDataSet>& dataset,
+               int example) const override;
+
+ protected:
+  std::vector<int32> feature_num_;
+  std::vector<float> feature_weights_;
+  float threshold_;
+};
+
+// Evaluator for contains-in-set decisions.  Also supports inverse (not-in-set).
+class MatchingValuesDecisionNodeEvaluator : public BinaryDecisionNodeEvaluator {
+ public:
+  MatchingValuesDecisionNodeEvaluator(
+      const decision_trees::MatchingValuesTest& test, int32 left, int32 right);
+
+  int32 Decide(const std::unique_ptr<TensorDataSet>& dataset,
+               int example) const override;
+
+ protected:
+  int32 feature_num_;
+  std::vector<float> values_;
+  bool inverse_;
+};
+
+std::unique_ptr<DecisionNodeEvaluator> CreateDecisionNodeEvaluator(
+    const decision_trees::TreeNode& node);
+std::unique_ptr<DecisionNodeEvaluator> CreateBinaryDecisionNodeEvaluator(
+    const decision_trees::BinaryNode& node, int32 left, int32 right);
+
+struct CandidateEvalatorCollection {
+  std::vector<std::unique_ptr<DecisionNodeEvaluator>> splits;
+};
+
+}  // namespace tensorforest
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_NODE_EVALUATOR_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5c49b87443e7b1f4ef532256ae2efdc9fa985d8a
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc
@@ -0,0 +1,127 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h"
+#include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+using tensorflow::tensorforest::InequalityDecisionNodeEvaluator;
+using tensorflow::tensorforest::MatchingValuesDecisionNodeEvaluator;
+using tensorflow::tensorforest::ObliqueInequalityDecisionNodeEvaluator;
+using tensorflow::decision_trees::InequalityTest;
+using tensorflow::decision_trees::MatchingValuesTest;
+
+TEST(InequalityDecisionNodeEvaluatorTest, TestLessOrEqual) {
+  InequalityTest test;
+  test.mutable_feature_id()->mutable_id()->set_value("0");
+  test.mutable_threshold()->set_float_value(3.0);
+  test.set_type(InequalityTest::LESS_OR_EQUAL);
+  std::unique_ptr<InequalityDecisionNodeEvaluator> eval(
+      new InequalityDecisionNodeEvaluator(test, 0, 1));
+
+  std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
+      new tensorflow::tensorforest::TestableDataSet(
+          {0.0, 1.0, 2.0, 3.0, 4.0, 5.0}, 1));
+
+  ASSERT_EQ(eval->Decide(dataset, 2), 0);
+  ASSERT_EQ(eval->Decide(dataset, 3), 0);
+  ASSERT_EQ(eval->Decide(dataset, 4), 1);
+}
+
+TEST(InequalityDecisionNodeEvaluatorTest, TestStrictlyLess) {
+  InequalityTest test;
+  test.mutable_feature_id()->mutable_id()->set_value("0");
+  test.mutable_threshold()->set_float_value(3.0);
+  test.set_type(InequalityTest::LESS_THAN);
+  std::unique_ptr<InequalityDecisionNodeEvaluator> eval(
+      new InequalityDecisionNodeEvaluator(test, 0, 1));
+
+  std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
+      new tensorflow::tensorforest::TestableDataSet(
+          {0.0, 1.0, 2.0, 3.0, 4.0, 5.0}, 1));
+
+  ASSERT_EQ(eval->Decide(dataset, 2), 0);
+  ASSERT_EQ(eval->Decide(dataset, 3), 1);
+  ASSERT_EQ(eval->Decide(dataset, 4), 1);
+}
+
+TEST(MatchingDecisionNodeEvaluatorTest, Basic) {
+  MatchingValuesTest test;
+  test.mutable_feature_id()->mutable_id()->set_value("0");
+  test.add_value()->set_float_value(3.0);
+  test.add_value()->set_float_value(5.0);
+
+  std::unique_ptr<MatchingValuesDecisionNodeEvaluator> eval(
+      new MatchingValuesDecisionNodeEvaluator(test, 0, 1));
+
+  std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
+      new tensorflow::tensorforest::TestableDataSet(
+          {0.0, 1.0, 2.0, 3.0, 4.0, 5.0}, 1));
+
+  ASSERT_EQ(eval->Decide(dataset, 2), 1);
+  ASSERT_EQ(eval->Decide(dataset, 3), 0);
+  ASSERT_EQ(eval->Decide(dataset, 4), 1);
+  ASSERT_EQ(eval->Decide(dataset, 5), 0);
+}
+
+TEST(MatchingDecisionNodeEvaluatorTest, Inverse) {
+  MatchingValuesTest test;
+  test.mutable_feature_id()->mutable_id()->set_value("0");
+  test.add_value()->set_float_value(3.0);
+  test.add_value()->set_float_value(5.0);
+  test.set_inverse(true);
+
+  std::unique_ptr<MatchingValuesDecisionNodeEvaluator> eval(
+      new MatchingValuesDecisionNodeEvaluator(test, 0, 1));
+
+  std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
+      new tensorflow::tensorforest::TestableDataSet(
+          {0.0, 1.0, 2.0, 3.0, 4.0, 5.0}, 1));
+
+  ASSERT_EQ(eval->Decide(dataset, 2), 0);
+  ASSERT_EQ(eval->Decide(dataset, 3), 1);
+  ASSERT_EQ(eval->Decide(dataset, 4), 0);
+  ASSERT_EQ(eval->Decide(dataset, 5), 1);
+}
+
+TEST(ObliqueDecisionNodeEvaluatorTest, Basic) {
+  InequalityTest test;
+  auto* feat1 = test.mutable_oblique()->add_features();
+  feat1->mutable_id()->set_value("0");
+  test.mutable_oblique()->add_weights(1.0);
+  auto* feat2 = test.mutable_oblique()->add_features();
+  feat2->mutable_id()->set_value("1");
+  test.mutable_oblique()->add_weights(1.0);
+
+  test.mutable_threshold()->set_float_value(3.0);
+  test.set_type(InequalityTest::LESS_OR_EQUAL);
+
+  std::unique_ptr<ObliqueInequalityDecisionNodeEvaluator> eval(
+      new ObliqueInequalityDecisionNodeEvaluator(test, 0, 1));
+
+  std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
+      new tensorflow::tensorforest::TestableDataSet(
+          {0.0, 1.0, 2.0, 3.0, 4.0, 5.0}, 2));
+
+  ASSERT_EQ(eval->Decide(dataset, 0), 0);
+  ASSERT_EQ(eval->Decide(dataset, 1), 1);
+}
+
+}  // namespace
+}  // namespace tensorflow
+
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.cc b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7f914aac319c2f50d7c81fd2bcc01a2b056ea331
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.cc
@@ -0,0 +1,78 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h"
+
+#include <cfloat>
+
+namespace tensorflow {
+namespace tensorforest {
+
+void FertileStatsResource::AddExampleToStatsAndInitialize(
+    const std::unique_ptr<TensorDataSet>& input_data, const InputTarget* target,
+    const std::vector<int>& examples, int32 node_id, bool* is_finished) {
+  // Update stats or initialize if needed.
+  if (collection_op_->IsInitialized(node_id)) {
+    collection_op_->AddExample(input_data, target, examples, node_id);
+  } else {
+    // This throws away any extra examples, which is more inefficient towards
+    // the top but gradually becomes less of an issue as the tree grows.
+    for (int example : examples) {
+      collection_op_->CreateAndInitializeCandidateWithExample(
+          input_data, target, example, node_id);
+      if (collection_op_->IsInitialized(node_id)) {
+        break;
+      }
+    }
+  }
+
+  *is_finished = collection_op_->IsFinished(node_id);
+}
+
+void FertileStatsResource::AllocateNode(int32 node_id, int32 depth) {
+  collection_op_->InitializeSlot(node_id, depth);
+}
+
+void FertileStatsResource::Allocate(int32 parent_depth,
+                                    const std::vector<int32>& new_children) {
+  const int32 children_depth = parent_depth + 1;
+  for (const int32 child : new_children) {
+    AllocateNode(child, children_depth);
+  }
+}
+
+void FertileStatsResource::Clear(int32 node) {
+  collection_op_->ClearSlot(node);
+}
+
+bool FertileStatsResource::BestSplit(int32 node_id, SplitCandidate* best,
+                                     int32* depth) {
+  return collection_op_->BestSplit(node_id, best, depth);
+}
+
+void FertileStatsResource::MaybeInitialize() {
+  collection_op_->MaybeInitialize();
+}
+
+void FertileStatsResource::ExtractFromProto(const FertileStats& stats) {
+  collection_op_ =
+      SplitCollectionOperatorFactory::CreateSplitCollectionOperator(params_);
+  collection_op_->ExtractFromProto(stats);
+}
+
+void FertileStatsResource::PackToProto(FertileStats* stats) const {
+  collection_op_->PackToProto(stats);
+}
+}  // namespace tensorforest
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..dacf033d99018d47787b644b12d3181780df7113
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
@@ -0,0 +1,101 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_FERTILE_STATS_RESOURCE_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_FERTILE_STATS_RESOURCE_H_
+
+#include <vector>
+
+#include "tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_data.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_target.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h"
+#include "tensorflow/contrib/tensor_forest/proto/fertile_stats.pb.h"
+#include "tensorflow/contrib/tensor_forest/proto/tensor_forest_params.pb.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+// Stores a FertileStats proto and implements operations on it.
+class FertileStatsResource : public ResourceBase {
+ public:
+  // Constructor.
+  explicit FertileStatsResource(const TensorForestParams& params)
+      : params_(params) {
+    model_op_ = LeafModelOperatorFactory::CreateLeafModelOperator(params_);
+  }
+
+  string DebugString() override {
+    return "FertileStats";
+  }
+
+  void ExtractFromProto(const FertileStats& stats);
+
+  void PackToProto(FertileStats* stats) const;
+
+  // Resets the resource and frees the proto.
+  // Caller needs to hold the mutex lock while calling this.
+  void Reset() {
+  }
+
+  // Reset the stats for a node, but leave the leaf_stats intact.
+  void ResetSplitStats(int32 node_id, int32 depth) {
+    collection_op_->ClearSlot(node_id);
+    collection_op_->InitializeSlot(node_id, depth);
+  }
+
+  mutex* get_mutex() { return &mu_; }
+
+  void MaybeInitialize();
+
+  // Applies the example to the given leaf's statistics. Also applies it to the
+  // node's fertile slot's statistics if or initializes a split candidate,
+  // where applicable.  Returns if the node is finished or if it's ready to
+  // allocate to a fertile slot.
+  void AddExampleToStatsAndInitialize(
+      const std::unique_ptr<TensorDataSet>& input_data,
+      const InputTarget* target, const std::vector<int>& examples,
+      int32 node_id, bool* is_finished);
+
+  // Allocate a fertile slot for each ready node, then new children up to
+  // max_fertile_nodes_.
+  void Allocate(int32 parent_depth, const std::vector<int32>& new_children);
+
+  // Remove a node's fertile slot.  Should only be called when the node is
+  // no longer a leaf.
+  void Clear(int32 node);
+
+  // Return the best SplitCandidate for a node, or NULL if no suitable split
+  // was found.
+  bool BestSplit(int32 node_id, SplitCandidate* best, int32* depth);
+
+
+ private:
+  mutex mu_;
+  std::shared_ptr<LeafModelOperator> model_op_;
+  std::unique_ptr<SplitCollectionOperator> collection_op_;
+  const TensorForestParams params_;
+
+  void AllocateNode(int32 node_id, int32 depth);
+};
+
+
+}  // namespace tensorforest
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_FERTILE_STATS_RESOURCE_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/graph_collection_operator.cc b/tensorflow/contrib/tensor_forest/kernels/v4/graph_collection_operator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c7faea0aef1441ac453a986ca6e674d08d8a1b33
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/graph_collection_operator.cc
@@ -0,0 +1,142 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/tensor_forest/kernels/v4/graph_collection_operator.h"
+
+#include <cfloat>
+
+#include "tensorflow/contrib/tensor_forest/kernels/tree_utils.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+REGISTER_SPLIT_COLLECTION(GRAPH_RUNNER_COLLECTION,
+                          GraphRunnerSplitCollectionOperator);
+
+std::unique_ptr<GrowStats> GraphRunnerSplitCollectionOperator::CreateGrowStats(
+    int32 node_id, int32 depth) const {
+  return std::unique_ptr<GrowStats>(new SimpleStats(params_, depth));
+}
+
+int64 GraphRunnerSplitCollectionOperator::UniqueId(int32 node_id,
+                                                   int32 split_id) const {
+  return node_id * num_splits_to_consider_ + split_id;
+}
+
+bool GraphRunnerSplitCollectionOperator::BestSplit(int32 node_id,
+                                                   SplitCandidate* best,
+                                                   int32* depth) const {
+  float min_score = FLT_MAX;
+  int best_index = -1;
+  auto* slot = stats_.at(node_id).get();
+  *depth = slot->depth();
+  for (int i = 0; i < slot->num_splits(); ++i) {
+    // TODO(gilberth): Support uselessness.
+    auto& runner = runners_[UniqueId(node_id, i)];
+    const float split_score = runner->SplitScore();
+    if (split_score < min_score) {
+      min_score = split_score;
+      best_index = i;
+    }
+  }
+
+  // This could happen if all the splits are useless.
+  if (best_index < 0) {
+    return false;
+  }
+
+  // Fill in split info and left/right stats to initialize models with.
+  *best = SplitCandidate();
+  auto& runner = runners_[UniqueId(node_id, best_index)];
+  runner->GetLeftStats(best->mutable_left_stats());
+  runner->GetRightStats(best->mutable_right_stats());
+  runner->GetSplit(best->mutable_split());
+  return true;
+}
+
+void GraphRunnerSplitCollectionOperator::AddExample(
+    const std::unique_ptr<TensorDataSet>& input_data, const InputTarget* target,
+    const std::vector<int>& examples, int32 node_id) const {
+  // Build input Tensors.
+  int size = examples.size();
+  Tensor examples_t(tensorflow::DT_INT32, TensorShape({size}));
+  auto ex_data = examples_t.flat<int32>();
+  std::copy(examples.begin(), examples.end(), ex_data.data());
+
+  const TensorInputTarget* tensor_target =
+      dynamic_cast<const TensorInputTarget*>(target);
+  CHECK_NOTNULL(tensor_target);
+
+  const Tensor& data_t = input_data->original_tensor();
+  const Tensor& target_t = tensor_target->original_tensor();
+
+  // Add to candidates.
+  auto* slot = stats_.at(node_id).get();
+  for (int i = 0; i < slot->num_splits(); ++i) {
+    auto& runner = runners_[UniqueId(node_id, i)];
+    runner->AddExample(data_t, target_t, examples_t);
+  }
+
+  // Update simple weight sums so we know when we're done.
+  for (int example : examples) {
+    slot->AddExample(input_data, target, example);
+  }
+}
+
+void GraphRunnerSplitCollectionOperator::
+    CreateAndInitializeCandidateWithExample(
+        const std::unique_ptr<TensorDataSet>& input_data,
+        const InputTarget* target, int example, int32 node_id) const {
+  auto* slot = stats_.at(node_id).get();
+  int cand_num = slot->num_splits();
+  const int64 unique_id = UniqueId(node_id, cand_num);
+
+  decision_trees::BinaryNode split;
+
+  decision_trees::InequalityTest* test =
+      split.mutable_inequality_left_child_test();
+  auto* oblique = test->mutable_oblique();
+  for (int i = 0; i < features_per_node_; ++i) {
+    float bias;
+    int type;
+    // This is really just a way to select a list of random features.
+    // Also a way to warn the user that categoricals don't make sense here.
+    input_data->RandomSample(example, oblique->add_features(), &bias, &type);
+
+    if (type == kDataFloat) {
+      test->set_type(decision_trees::InequalityTest::LESS_OR_EQUAL);
+
+      // The comparison bias is assumed to be zero.
+      test->mutable_threshold()->set_float_value(0);
+    } else {
+      LOG(ERROR) << "Categorical features not supported with this system.";
+      return;
+    }
+  }
+
+  slot->AddSplit(split, input_data, target, example);
+
+  runners_[unique_id].reset(new CandidateGraphRunner(graph_dir_, split));
+  runners_[unique_id]->Init();
+}
+
+void GraphRunnerSplitCollectionOperator::ClearSlot(int32 node_id) {
+  SplitCollectionOperator::ClearSlot(node_id);
+  for (int i = 0; i < num_splits_to_consider_; ++i) {
+    runners_.erase(UniqueId(node_id, i));
+  }
+}
+
+}  // namespace tensorforest
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/graph_collection_operator.h b/tensorflow/contrib/tensor_forest/kernels/v4/graph_collection_operator.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ae3a79b3dd69b3fd3d31a055589b2edc63afa3c
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/graph_collection_operator.h
@@ -0,0 +1,81 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GRAPH_COLLECTION_OPERATOR_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GRAPH_COLLECTION_OPERATOR_H_
+
+#include <vector>
+#include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_data.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_target.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/params.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h"
+#include "tensorflow/contrib/tensor_forest/proto/fertile_stats.pb.h"
+#include "tensorflow/contrib/tensor_forest/proto/tensor_forest_params.pb.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+// Holds split candidates that are trained by running any TF graph.
+class GraphRunnerSplitCollectionOperator : public SplitCollectionOperator {
+ public:
+  explicit GraphRunnerSplitCollectionOperator(const TensorForestParams& params)
+      : SplitCollectionOperator(params) {
+    if (params.num_splits_to_consider().ParamType_case() ==
+        DepthDependentParam::PARAMTYPE_NOT_SET) {
+      LOG(FATAL) << "GRAPH_RUNNER_COLLECTION must specify a constant value for "
+                 << " num_splits_to_consider";
+    } else {
+      num_splits_to_consider_ =
+          params.num_splits_to_consider().constant_value();
+    }
+  }
+
+  std::unique_ptr<GrowStats> CreateGrowStats(int32 node_id,
+                                             int32 depth) const override;
+
+  // Updates the slot's candidates with the new example.
+  // Assumes slot has been initialized.
+  void AddExample(const std::unique_ptr<TensorDataSet>& input_data,
+                  const InputTarget* target, const std::vector<int>& examples,
+                  int32 node_id) const override;
+
+  // Create a new candidate and initialize it with the given example.
+  void CreateAndInitializeCandidateWithExample(
+      const std::unique_ptr<TensorDataSet>& input_data,
+      const InputTarget* target, int example, int32 node_id) const override;
+
+  bool BestSplit(int32 node_id, SplitCandidate* best,
+                 int32* depth) const override;
+
+  void ClearSlot(int32 node_id) override;
+
+ protected:
+  int64 UniqueId(int32 node_id, int32 split_id) const;
+
+  mutable std::unordered_map<int64, std::unique_ptr<CandidateGraphRunner>>
+      runners_;
+  int features_per_node_;
+  string graph_dir_;
+  // Must have a constant value because of how we make unique ids right now.
+  int32 num_splits_to_consider_;
+};
+
+}  // namespace tensorforest
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GRAPH_COLLECTION_OPERATOR_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc
new file mode 100644
index 0000000000000000000000000000000000000000..63bfc1aef18beffc0f5819c1e610331e5c576af3
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc
@@ -0,0 +1,809 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h"
+
+#include <cfloat>
+#include <queue>
+#include "tensorflow/contrib/tensor_forest/kernels/tree_utils.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h"
+#include "tensorflow/core/lib/random/distribution_sampler.h"
+
+
+namespace tensorflow {
+namespace tensorforest {
+
+// When creating evaluators for the split candidates, use these
+// for the left and right return values.
+static const int32 LEFT_INDEX = 0;
+static const int32 RIGHT_INDEX = 1;
+
+GrowStats::GrowStats(const TensorForestParams& params, int32 depth)
+    : weight_sum_(0),
+      depth_(depth),
+      params_(params),
+      split_after_samples_(ResolveParam(params.split_after_samples(), depth)),
+      num_splits_to_consider_(
+          ResolveParam(params.num_splits_to_consider(), depth)),
+      num_outputs_(params.num_outputs()) {}
+
+void GrowStats::AddSplit(const decision_trees::BinaryNode& split,
+                         const std::unique_ptr<TensorDataSet>& input_data,
+                         const InputTarget* target, int example) {
+  // It's possible that the split collection calls AddSplit, but we actually
+  // have all the splits we need and are just waiting for them to be fully
+  // initialized.
+  if (splits_.size() < num_splits_to_consider_) {
+    splits_.push_back(split);
+    evaluators_.emplace_back(
+        CreateBinaryDecisionNodeEvaluator(split, LEFT_INDEX, RIGHT_INDEX));
+    AddSplitStats(target, example);
+  }
+
+  if (input_data != nullptr && target != nullptr &&
+      params_.initialize_average_splits()) {
+    AdditionalInitializationExample(input_data, target, example);
+  }
+}
+
+void GrowStats::RemoveSplit(int split_num) {
+  splits_.erase(splits_.begin() + split_num);
+  evaluators_.erase(evaluators_.begin() + split_num);
+  RemoveSplitStats(split_num);
+}
+
+// ------------------------ Classification --------------------------- //
+
+ClassificationStats::ClassificationStats(const TensorForestParams& params,
+                                         int32 depth)
+    : GrowStats(params, depth), finish_early_(false) {
+  // Early splitting params.
+  if (params.finish_type().type() == SPLIT_FINISH_BASIC) {
+    min_split_samples_ = split_after_samples_;
+    finish_sample_epoch_ = 1;
+    finish_check_every_ = split_after_samples_ * 2;
+  } else {
+    if (!params.has_dominate_fraction() || !params.has_min_split_samples()) {
+      LOG(FATAL) << "dominate_fraction and min_split_samples "
+                 << "required for early-finish strategy.";
+    } else {
+      min_split_samples_ = ResolveParam(params.min_split_samples(), depth);
+      finish_check_every_ =
+          ResolveParam(params.finish_type().check_every_steps(), depth);
+      finish_sample_epoch_ = min_split_samples_ / finish_check_every_;
+
+      dominate_fraction_ = ResolveParam(params.dominate_fraction(), depth_);
+      if (dominate_fraction_ <= 0 || dominate_fraction_ > 1.0) {
+        LOG(FATAL) << "Invalid dominate fraction " << dominate_fraction_;
+      }
+    }
+  }
+
+  // Pruning params.
+  if (params.pruning_type().type() != SPLIT_PRUNE_NONE) {
+    prune_check_every_ =
+        ResolveParam(params.pruning_type().prune_every_samples(), depth);
+    prune_sample_epoch_ = 1;
+    prune_fraction_ = 0.0;
+    switch (params_.pruning_type().type()) {
+      case SPLIT_PRUNE_HALF:
+        prune_fraction_ = 0.5;
+        break;
+      case SPLIT_PRUNE_QUARTER:
+        prune_fraction_ = 0.25;
+        break;
+      case SPLIT_PRUNE_10_PERCENT:
+        prune_fraction_ = 0.10;
+        break;
+      case SPLIT_PRUNE_HOEFFDING:
+        dominate_fraction_ = ResolveParam(params.dominate_fraction(), depth_);
+        half_ln_dominate_frac_ = 0.5 * log(1.0 / (1.0 - dominate_fraction_));
+        break;
+      default:
+        LOG(WARNING) << "Unknown pruning type";
+    }
+  } else {
+    prune_check_every_ = split_after_samples_ * 2;
+    prune_sample_epoch_ = 1;
+  }
+
+  if (params.use_running_stats_method()) {
+    left_gini_.reset(new RunningGiniScores());
+    right_gini_.reset(new RunningGiniScores());
+  }
+
+  uint64 time_seed = static_cast<uint64>(std::clock());
+  single_rand_ = std::unique_ptr<random::PhiloxRandom>(
+      new random::PhiloxRandom(time_seed));
+  rng_ = std::unique_ptr<random::SimplePhilox>(
+      new random::SimplePhilox(single_rand_.get()));
+}
+
+void ClassificationStats::AdditionalInitializationExample(
+    const std::unique_ptr<TensorDataSet>& input_data, const InputTarget* target,
+    int example) {
+  const int32 new_target = target->GetTargetAsClassIndex(example, 0);
+  std::unordered_set<int> to_erase;
+  for (auto it = half_initialized_splits_.begin();
+       it != half_initialized_splits_.end(); ++it) {
+    if (it->second != new_target) {
+      auto& split = splits_[it->first];
+      if (split.has_inequality_left_child_test()) {
+        auto& test = split.inequality_left_child_test();
+        auto* thresh =
+            split.mutable_inequality_left_child_test()->mutable_threshold();
+        if (test.has_feature_id()) {
+          const float val =
+              input_data->GetExampleValue(example, test.feature_id());
+          thresh->set_float_value((thresh->float_value() + val) / 2);
+        }
+      }
+      to_erase.insert(it->first);
+    }
+  }
+
+  for (const int split_id : to_erase) {
+    half_initialized_splits_.erase(split_id);
+  }
+}
+
+bool ClassificationStats::IsFinished() const {
+  bool basic = weight_sum_ >= split_after_samples_ && num_outputs_seen() > 1;
+  return basic || finish_early_;
+}
+
+float ClassificationStats::MaybeCachedGiniScore(int split, float* left_sum,
+                                                float* right_sum) const {
+  if (left_gini_ == nullptr) {
+    return GiniScore(split, left_sum, right_sum);
+  } else {
+    *left_sum = left_gini_->sum(split);
+    const float left = WeightedSmoothedGini(
+        *left_sum, left_gini_->square(split), num_outputs_);
+
+    *right_sum = right_gini_->sum(split);
+    const float right = WeightedSmoothedGini(
+        *right_sum, right_gini_->square(split), num_outputs_);
+
+    return left + right;
+  }
+}
+
+void ClassificationStats::AddExample(
+    const std::unique_ptr<TensorDataSet>& input_data, const InputTarget* target,
+    int example) {
+  const int64 int_label = target->GetTargetAsClassIndex(example, 0);
+  const float weight = target->GetTargetWeight(example);
+
+  for (int i = 0; i < num_splits(); ++i) {
+    auto& eval = evaluators_[i];
+    if (eval->Decide(input_data, example) == LEFT_INDEX) {
+      if (left_gini_ != nullptr) {
+        left_gini_->update(i, left_count(i, int_label), weight);
+      }
+      ClassificationAddLeftExample(i, int_label, weight);
+    } else if (right_gini_ != nullptr) {
+      right_gini_->update(i, right_count(i, int_label), weight);
+    }
+  }
+
+  ClassificationAddTotalExample(int_label, weight);
+
+  weight_sum_ += weight;
+
+  CheckFinishEarly();
+  CheckPrune();
+}
+
+void ClassificationStats::CheckPrune() {
+  if (params_.pruning_type().type() == SPLIT_PRUNE_NONE || IsFinished() ||
+      weight_sum_ < prune_sample_epoch_ * prune_check_every_) {
+    return;
+  }
+  ++prune_sample_epoch_;
+
+  if (params_.pruning_type().type() == SPLIT_PRUNE_HOEFFDING) {
+    CheckPruneHoeffding();
+    return;
+  }
+
+  const int to_remove = num_splits() * prune_fraction_;
+  if (to_remove <= 0) {
+    return;
+  }
+
+  // pair ordering is first-then-second by default, no need for custom
+  // comparison.  Use std::greater to make it a min-heap.
+  std::priority_queue<std::pair<float, int>, std::vector<std::pair<float, int>>,
+                      std::greater<std::pair<float, int>>>
+      worst;
+
+  // Track indices that are in the heap so we can iterate over them
+  // by largest-first later.
+  std::set<int> indices;
+
+  for (int i = 0; i < num_splits(); ++i) {
+    float left, right;
+    const float split_score = MaybeCachedGiniScore(i, &left, &right);
+    if (worst.size() < to_remove) {
+      worst.push(std::pair<float, int>(split_score, i));
+      indices.insert(i);
+    } else if (worst.top().first < split_score) {
+      indices.erase(worst.top().second);
+      worst.pop();
+      worst.push(std::pair<float, int>(split_score, i));
+      indices.insert(i);
+    }
+  }
+
+  // traverse indices from the back so that they are removed correctly.
+  for (auto it = indices.rbegin(); it != indices.rend(); ++it) {
+    RemoveSplit(*it);
+  }
+}
+
+void ClassificationStats::CheckPruneHoeffding() {
+  std::vector<float> split_scores(num_splits());
+  // Find best split score
+  float best_split_score = FLT_MAX;
+  for (int i = 0; i < num_splits(); ++i) {
+    float left, right;
+    split_scores[i] = MaybeCachedGiniScore(i, &left, &right);
+    if (split_scores[i] < best_split_score) {
+      best_split_score = split_scores[i];
+    }
+  }
+
+  // We apply the Hoeffding bound to the difference between the best split
+  // score and the i-th split score.
+  // Raw Gini ranges from 0 to 1 - (1/n), but our gini score is weighted.
+  const float num_classes = params_.num_outputs();
+  const float gini_diff_range = weight_sum_ * (1.0 - 1.0 / num_classes);
+  float epsilon = gini_diff_range * sqrt(half_ln_dominate_frac_ / weight_sum_);
+  for (int i = num_splits() - 1; i >= 0; i--) {
+    if (split_scores[i] - best_split_score > epsilon) {
+      RemoveSplit(i);
+    }
+  }
+}
+
+void ClassificationStats::CheckFinishEarly() {
+  if (weight_sum_ < min_split_samples_ ||
+      weight_sum_ < finish_sample_epoch_ * finish_check_every_) {
+    return;
+  }
+  ++finish_sample_epoch_;
+
+  if (params_.finish_type().type() == SPLIT_FINISH_DOMINATE_HOEFFDING) {
+    CheckFinishEarlyHoeffding();
+  } else if (params_.finish_type().type() == SPLIT_FINISH_DOMINATE_BOOTSTRAP) {
+    CheckFinishEarlyBootstrap();
+  }
+}
+
+void ClassificationStats::CheckFinishEarlyHoeffding() {
+  // Each term in the Gini impurity can range from 0 to 0.5 * 0.5.
+  float range = 0.25 * static_cast<float>(params_.num_outputs()) * weight_sum_;
+
+  float hoeffding_bound =
+      range * sqrt(log(1.0 / (1.0 - dominate_fraction_)) / (2.0 * weight_sum_));
+
+  float unused_left_sum, unused_right_sum;
+  std::function<float(int)> score_fn =
+      std::bind(&ClassificationStats::MaybeCachedGiniScore, this,
+                std::placeholders::_1, &unused_left_sum, &unused_right_sum);
+
+  float best_score;
+  int32 best_index;
+  float second_best_score;
+  int32 second_best_index;
+  GetTwoBest(num_splits(), score_fn, &best_score, &best_index,
+             &second_best_score, &second_best_index);
+
+  finish_early_ = (second_best_score - best_score) > hoeffding_bound;
+}
+
+void ClassificationStats::MakeBootstrapWeights(int index,
+                                               std::vector<float>* weights) {
+  int n = weight_sum_;
+  float denom = static_cast<float>(n) + static_cast<float>(num_outputs_);
+  for (int i = 0; i < num_outputs_; ++i) {
+    // Use the Laplace smoothed per-class probabilities when generating the
+    // bootstrap samples.
+    (*weights)[i] = (left_count(index, i) + 1.0) / denom;
+    (*weights)[num_outputs_ + i] = (right_count(index, i) + 1.0) / denom;
+  }
+}
+
+int ClassificationStats::NumBootstrapSamples() const {
+  float p = 1.0 - dominate_fraction_;
+  int bootstrap_samples = 1;
+  while (p < 1.0) {
+    ++bootstrap_samples;
+    p = p * 2;
+  }
+  return bootstrap_samples;
+}
+
+void ClassificationStats::CheckFinishEarlyBootstrap() {
+  float unused_left_sum, unused_right_sum;
+  std::function<float(int)> score_fn =
+      std::bind(&ClassificationStats::MaybeCachedGiniScore, this,
+                std::placeholders::_1, &unused_left_sum, &unused_right_sum);
+
+  float best_score;
+  int32 best_index;
+  float second_best_score;
+  int32 second_best_index;
+  GetTwoBest(num_splits(), score_fn, &best_score, &best_index,
+             &second_best_score, &second_best_index);
+
+  std::vector<float> weights1(num_outputs_ * 2);
+  MakeBootstrapWeights(best_index, &weights1);
+  random::DistributionSampler ds1(weights1);
+
+  std::vector<float> weights2(num_outputs_ * 2);
+  MakeBootstrapWeights(second_best_index, &weights2);
+  random::DistributionSampler ds2(weights2);
+
+  const int bootstrap_samples = NumBootstrapSamples();
+
+  int worst_g1 = 0;
+  for (int i = 0; i < bootstrap_samples; i++) {
+    int g1 = BootstrapGini(weight_sum_, 2 * num_outputs_, ds1, rng_.get());
+    worst_g1 = std::max(worst_g1, g1);
+  }
+
+  int best_g2 = 99;
+  for (int i = 0; i < bootstrap_samples; i++) {
+    int g2 = BootstrapGini(weight_sum_, 2 * num_outputs_, ds2, rng_.get());
+    best_g2 = std::min(best_g2, g2);
+  }
+
+  finish_early_ = worst_g1 < best_g2;
+}
+
+// ------------------------ Dense Classification --------------------------- //
+void DenseClassificationGrowStats::ExtractFromProto(const FertileSlot& slot) {
+  Initialize();
+  if (!slot.has_post_init_leaf_stats()) {
+    return;
+  }
+  const int32 num_classes = params_.num_outputs();
+  weight_sum_ = slot.post_init_leaf_stats().weight_sum();
+  const auto& class_stats =
+      slot.post_init_leaf_stats().classification().dense_counts();
+
+  // Total counts.
+  for (int i = 0; i < num_classes; ++i) {
+    total_counts_[i] = class_stats.value(i).float_value();
+    num_outputs_seen_ += total_counts_[i] != 0;
+  }
+
+  // Candidate counts and splits.
+  int split_num = 0;
+  for (const auto& cand : slot.candidates()) {
+    AddSplit(cand.split(), nullptr, nullptr, -1);
+    const auto& left_stats = cand.left_stats().classification().dense_counts();
+    for (int i = 0; i < num_classes; ++i) {
+      const float val = left_stats.value(i).float_value();
+      mutable_left_count(split_num, i) = val;
+      MaybeInitializeRunningCount(split_num, val);
+    }
+    ++split_num;
+  }
+}
+
+void DenseClassificationGrowStats::PackToProto(FertileSlot* slot) const {
+  auto* slot_stats = slot->mutable_post_init_leaf_stats();
+  slot_stats->set_weight_sum(weight_sum_);
+
+  auto* class_stats = slot->mutable_post_init_leaf_stats()
+                          ->mutable_classification()
+                          ->mutable_dense_counts();
+  for (int i = 0; i < num_outputs_; ++i) {
+    class_stats->add_value()->set_float_value(total_counts_[i]);
+  }
+
+  for (int split_num = 0;  split_num < num_splits(); ++split_num) {
+    auto* cand = slot->add_candidates();
+    *cand->mutable_split() = splits_[split_num];
+    auto* left_stats = cand->mutable_left_stats()
+                           ->mutable_classification()
+                           ->mutable_dense_counts();
+    for (int i = 0; i < num_outputs_; ++i) {
+       left_stats->add_value()->set_float_value(left_count(split_num, i));
+    }
+  }
+}
+
+float DenseClassificationGrowStats::GiniScore(int split, float* left_sum,
+                                              float* right_sum) const {
+  float left_square = 0, right_square = 0;
+  *left_sum = 0;
+  *right_sum = 0;
+  for (int j = 0; j < num_outputs_; ++j) {
+    const float left = left_count(split, j);
+    *left_sum += left;
+    left_square += left * left;
+    const float right = right_count(split, j);
+    *right_sum += right;
+    right_square += right * right;
+  }
+
+  const float left_score =
+      WeightedSmoothedGini(*left_sum, left_square, num_outputs_);
+  const float right_score =
+      WeightedSmoothedGini(*right_sum, right_square, num_outputs_);
+  return left_score + right_score;
+}
+
+bool DenseClassificationGrowStats::BestSplit(SplitCandidate* best) const {
+  float min_score = FLT_MAX;
+  int best_index = -1;
+  float best_left_sum, best_right_sum;
+
+  // Calculate sums.
+  for (int i = 0; i < num_splits(); ++i) {
+    float left_sum, right_sum;
+    const float split_score = MaybeCachedGiniScore(i, &left_sum, &right_sum);
+    // Find the lowest gini.
+    if (left_sum > 0 && right_sum > 0 &&
+        split_score < min_score) {  // useless check
+      min_score = split_score;
+      best_index = i;
+      best_left_sum = left_sum;
+      best_right_sum = right_sum;
+    }
+  }
+
+  // This could happen if all the splits are useless.
+  if (best_index < 0) {
+    return false;
+  }
+
+  // Fill in stats to be used for leaf model.
+  *best->mutable_split() = splits_[best_index];
+  // Left
+  auto* left = best->mutable_left_stats();
+  auto* left_class_stats = left->mutable_classification();
+  left->set_weight_sum(best_left_sum);
+  auto* left_counts = left_class_stats->mutable_dense_counts();
+  for (int i = 0; i < params_.num_outputs(); ++i) {
+    left_counts->add_value()->set_float_value(
+        left_count(best_index, i));
+  }
+
+  // Right
+  auto* right = best->mutable_right_stats();
+  auto* right_class_stats = right->mutable_classification();
+  right->set_weight_sum(best_right_sum);
+  auto* right_counts = right_class_stats->mutable_dense_counts();
+  for (int i = 0; i < params_.num_outputs(); ++i) {
+    right_counts->add_value()->set_float_value(
+        total_counts_[i] - left_count(best_index, i));
+  }
+  return true;
+}
+
+// ------------------------ Sparse Classification --------------------------- //
+void SparseClassificationGrowStats::ExtractFromProto(const FertileSlot& slot) {
+  Initialize();
+  if (!slot.has_post_init_leaf_stats()) {
+    return;
+  }
+  weight_sum_ = slot.post_init_leaf_stats().weight_sum();
+  const auto& class_stats =
+      slot.post_init_leaf_stats().classification().sparse_counts();
+
+  // Total counts.
+  for (auto const& entry : class_stats.sparse_value()) {
+    total_counts_[entry.first] = entry.second.float_value();
+  }
+
+  // Candidate counts and splits.
+  int split_num = 0;
+  for (const auto& cand : slot.candidates()) {
+    AddSplit(cand.split(), nullptr, nullptr, -1);
+    const auto& left_stats = cand.left_stats().classification().sparse_counts();
+    for (auto const& entry : left_stats.sparse_value()) {
+      const float val = entry.second.float_value();
+      left_counts_[split_num][entry.first] = val;
+      MaybeInitializeRunningCount(split_num, val);
+    }
+    ++split_num;
+  }
+}
+
+void SparseClassificationGrowStats::PackToProto(FertileSlot* slot) const {
+  auto* slot_stats = slot->mutable_post_init_leaf_stats();
+  slot_stats->set_weight_sum(weight_sum_);
+
+  auto* class_stats = slot->mutable_post_init_leaf_stats()
+                          ->mutable_classification()
+                          ->mutable_sparse_counts()
+                          ->mutable_sparse_value();
+  for (const auto& entry : total_counts_) {
+    decision_trees::Value val;
+    val.set_float_value(entry.second);
+    (*class_stats)[entry.first] = val;
+  }
+
+  for (int split_num = 0;  split_num < num_splits(); ++split_num) {
+    auto* cand = slot->add_candidates();
+    *cand->mutable_split() = splits_[split_num];
+    auto* left_stats = cand->mutable_left_stats()
+                           ->mutable_classification()
+                           ->mutable_sparse_counts()
+                           ->mutable_sparse_value();
+    for (const auto& entry : left_counts_[split_num]) {
+      decision_trees::Value val;
+      val.set_float_value(entry.second);
+      (*left_stats)[entry.first] = val;
+    }
+  }
+}
+
+float SparseClassificationGrowStats::GiniScore(
+    int split, float* left_sum, float* right_sum) const {
+  float left_square = 0, right_square = 0;
+  *left_sum = 0;
+  *right_sum = 0;
+  for (const auto& entry : total_counts_) {
+    const int label = entry.first;
+    float left = 0;
+    float right = 0;
+    auto it = left_counts_[split].find(label);
+    if (it == left_counts_[split].end()) {
+      right = entry.second;
+    } else {
+      left = it->second;
+      right = entry.second - it->second;
+    }
+    *left_sum += left;
+    left_square += left * left;
+    *right_sum += right;
+    right_square += right * right;
+  }
+  const int32 num_classes = params_.num_outputs();
+  const float left_score =
+      WeightedSmoothedGini(*left_sum, left_square, num_classes);
+  const float right_score =
+      WeightedSmoothedGini(*right_sum, right_square, num_classes);
+  return left_score + right_score;
+}
+
+bool SparseClassificationGrowStats::BestSplit(SplitCandidate* best) const {
+  float min_score = FLT_MAX;
+  int best_index = -1;
+  float best_left_sum = -1;
+  float best_right_sum = -1;
+
+  // Find the lowest gini.
+  for (int i = 0; i < num_splits(); ++i) {
+    float left_sum, right_sum;
+    const float split_score = MaybeCachedGiniScore(i, &left_sum, &right_sum);
+    if (left_sum > 0 && right_sum > 0 &&
+        split_score < min_score) {  // useless check
+      min_score = split_score;
+      best_index = i;
+      best_left_sum = left_sum;
+      best_right_sum = right_sum;
+    }
+  }
+
+  // This could happen if all the splits are useless.
+  if (best_index < 0) {
+    return false;
+  }
+
+  // Fill in stats to be used for leaf model.
+  *best->mutable_split() = splits_[best_index];
+  // Left
+  auto* left = best->mutable_left_stats();
+  auto* left_class_stats = left->mutable_classification();
+  left->set_weight_sum(best_left_sum);
+  auto* left_counts =
+      left_class_stats->mutable_sparse_counts()->mutable_sparse_value();
+
+  // Right
+  auto* right = best->mutable_right_stats();
+  auto* right_class_stats = right->mutable_classification();
+  right->set_weight_sum(best_right_sum);
+  auto* right_counts =
+      right_class_stats->mutable_sparse_counts()->mutable_sparse_value();
+
+  for (const auto& entry : total_counts_) {
+    auto it = left_counts_[best_index].find(entry.first);
+    if (it == left_counts_[best_index].end()) {
+      (*right_counts)[entry.first].set_float_value(entry.second);
+    } else {
+      const float left = it->second;
+      const float right = entry.second - it->second;
+      (*left_counts)[entry.first].set_float_value(left);
+      if (right > 0) {
+        (*right_counts)[entry.first].set_float_value(right);
+      }
+    }
+  }
+  return true;
+}
+
+// --------------------- Least Squares Regression --------------------------- //
+void LeastSquaresRegressionGrowStats::ExtractFromProto(
+    const FertileSlot& slot) {
+  const int32 num_outputs = params_.num_outputs();
+  Initialize();
+  if (!slot.has_post_init_leaf_stats()) {
+    return;
+  }
+  weight_sum_ = slot.post_init_leaf_stats().weight_sum();
+  const auto& total_sums =
+      slot.post_init_leaf_stats().regression().mean_output();
+  const auto& total_squares =
+      slot.post_init_leaf_stats().regression().mean_output_squares();
+
+  // Total counts.
+  for (int i = 0; i < num_outputs; ++i) {
+    total_sum_[i] = total_sums.value(i).float_value();
+    total_sum_squares_[i] = total_squares.value(i).float_value();
+  }
+
+  // Candidate counts and splits.
+  int split_num = 0;
+  for (const auto& cand : slot.candidates()) {
+    AddSplit(cand.split(), nullptr, nullptr, -1);
+    const auto& sums = cand.left_stats().regression().mean_output();
+    const auto& squares = cand.left_stats().regression().mean_output_squares();
+    for (int i = 0; i < num_outputs; ++i) {
+      left_sum(split_num, i) = sums.value(i).float_value();
+      left_square(split_num, i) = squares.value(i).float_value();
+    }
+    left_counts_[split_num] = cand.left_stats().weight_sum();
+    ++split_num;
+  }
+}
+
+void LeastSquaresRegressionGrowStats::PackToProto(FertileSlot* slot) const {
+  const int32 num_outputs = params_.num_outputs();
+  auto* slot_stats = slot->mutable_post_init_leaf_stats();
+  slot_stats->set_weight_sum(weight_sum_);
+
+  auto* total_sums = slot->mutable_post_init_leaf_stats()
+                         ->mutable_regression()
+                         ->mutable_mean_output();
+  auto* total_squares = slot->mutable_post_init_leaf_stats()
+                            ->mutable_regression()
+                            ->mutable_mean_output_squares();
+
+  for (int i = 0; i < total_sum_.size(); ++i) {
+    total_sums->add_value()->set_float_value(total_sum_[i]);
+    total_squares->add_value()->set_float_value(total_sum_squares_[i]);
+  }
+
+  for (int split_num = 0;  split_num < num_splits(); ++split_num) {
+    auto* cand = slot->add_candidates();
+    *cand->mutable_split() = splits_[split_num];
+    auto* sums = cand->mutable_left_stats()
+                           ->mutable_regression()
+                           ->mutable_mean_output();
+    auto* squares = cand->mutable_left_stats()
+                        ->mutable_regression()
+                        ->mutable_mean_output_squares();
+    for (int i = 0; i < num_outputs; ++i) {
+      sums->add_value()->set_float_value(left_sum(split_num, i));
+      squares->add_value()->set_float_value(left_square(split_num, i));
+    }
+    cand->mutable_left_stats()->set_weight_sum(left_counts_[split_num]);
+  }
+}
+
+void LeastSquaresRegressionGrowStats::AddExample(
+    const std::unique_ptr<TensorDataSet>& input_data, const InputTarget* target,
+    int example) {
+  const int32 num_outputs = params_.num_outputs();
+  // Update splits.
+  for (int i = 0; i < num_splits(); ++i) {
+    auto& eval = evaluators_[i];
+    if (eval->Decide(input_data, example) == LEFT_INDEX) {
+      for (int j = 0; j < num_outputs; ++j) {
+        const float output = target->GetTargetAsContinuous(example, j);
+        left_sum(i, j) += output;
+        left_square(i, j) += output * output;
+      }
+      ++left_counts_[i];
+    }
+  }
+
+  // Update totals.
+  for (int i = 0; i < num_outputs; ++i) {
+    const float output = target->GetTargetAsContinuous(example, i);
+    total_sum_[i] += output;
+    total_sum_squares_[i] += output * output;
+  }
+  weight_sum_ += 1.0;
+}
+
+float LeastSquaresRegressionGrowStats::SplitVariance(int split) const {
+  float total_variance = 0;
+  for (int i = 0; i < params_.num_outputs(); ++i) {
+    // Left side
+    const float le_x =
+        left_sum(split, i) / left_counts_[split];
+
+    const float le_x2 =
+        left_square(split, i) / left_counts_[split];
+    total_variance += le_x2 - le_x * le_x;
+
+    // Right side
+    const float re_x = (total_sum_[i] - left_sum(split, i)) /
+                       (weight_sum_ - left_counts_[split]);
+
+    const float re_x2 =
+        (total_sum_squares_[i] - left_square(split, i)) /
+        (weight_sum_ - left_counts_[split]);
+    total_variance += re_x2 - re_x * re_x;
+  }
+  return total_variance;
+}
+
+bool LeastSquaresRegressionGrowStats::BestSplit(SplitCandidate* best) const {
+  float min_score = FLT_MAX;
+  int best_index = -1;
+  const int32 num_outputs = params_.num_outputs();
+  for (int i = 0; i < num_splits(); ++i) {
+    if (left_counts_[i] > 0 && weight_sum_ - left_counts_[i] > 0) {
+      const float split_score = SplitVariance(i);
+      if (split_score < min_score) {
+        min_score = split_score;
+        best_index = i;
+      }
+    }
+  }
+
+  // This could happen if all the splits are useless.
+  if (best_index < 0) {
+    return false;
+  }
+
+  // Fill in right stats to be used for leaf model.
+  *best->mutable_split() = splits_[best_index];
+  // Left
+  auto* left = best->mutable_left_stats();
+  auto* left_reg_stats = left->mutable_regression();
+  left->set_weight_sum(left_counts_[best_index]);
+  auto* left_output_sum = left_reg_stats->mutable_mean_output();
+  for (int i = 0; i < num_outputs; ++i) {
+    left_output_sum->add_value()->set_float_value(
+        left_sum(best_index, i));
+  }
+
+  // Right
+  auto* right = best->mutable_right_stats();
+  auto* right_reg_stats = right->mutable_regression();
+  right->set_weight_sum(weight_sum_ - left_counts_[best_index]);
+  auto* right_output_sum = right_reg_stats->mutable_mean_output();
+  for (int i = 0; i < num_outputs; ++i) {
+    right_output_sum->add_value()->set_float_value(
+        total_sum_[i] - left_sum(best_index, i));
+  }
+  return true;
+}
+
+bool LeastSquaresRegressionGrowStats::IsFinished() const {
+  return weight_sum_ >= split_after_samples_;
+}
+
+}  // namespace tensorforest
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba73d1d246d46aaaeeab907c65460b9ac63f379b
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
@@ -0,0 +1,493 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GROW_STATS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GROW_STATS_H_
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_data.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_target.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/params.h"
+#include "tensorflow/contrib/tensor_forest/proto/fertile_stats.pb.h"
+#include "tensorflow/contrib/tensor_forest/proto/tensor_forest_params.pb.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+// Base class for tracking stats necessary to split a leaf.
+// Holds and tracks stats for every candidate split.
+class GrowStats {
+ public:
+  virtual ~GrowStats() {}
+  // Perform any initialization.
+  virtual void Initialize() = 0;
+
+  // Add an example to any stats being collected.
+  virtual void AddExample(const std::unique_ptr<TensorDataSet>& input_data,
+                          const InputTarget* target, int example) = 0;
+
+  // Fill in the best split, return false if none were valid.
+  virtual bool BestSplit(SplitCandidate* best) const = 0;
+
+  // Return true if this leaf is finished splitting.
+  virtual bool IsFinished() const = 0;
+
+  // Get the split_num BinaryNode.
+  const decision_trees::BinaryNode& Split(int split_num) const {
+    return splits_[split_num];
+  }
+
+  // Clear all state.
+  virtual void Clear() {
+    weight_sum_ = 0;
+    splits_.clear();
+    evaluators_.clear();
+    ClearInternal();
+  }
+
+  virtual void ExtractFromProto(const FertileSlot& slot) = 0;
+  virtual void PackToProto(FertileSlot* slot) const = 0;
+
+  // Add split to the list of candidate splits.
+  void AddSplit(const decision_trees::BinaryNode& split,
+                const std::unique_ptr<TensorDataSet>& input_data,
+                const InputTarget* target, int example);
+  virtual void AdditionalInitializationExample(
+      const std::unique_ptr<TensorDataSet>& input_data,
+      const InputTarget* target, int example) {}
+  void RemoveSplit(int split_num);
+
+  int num_splits() const {
+    return splits_.size();
+  }
+
+  float weight_sum() const {
+    return weight_sum_;
+  }
+
+  virtual bool IsInitialized() const {
+    return weight_sum_ > 0 || splits_.size() == num_splits_to_consider_;
+  }
+
+  int32 depth() const {
+    return depth_;
+  }
+
+ protected:
+  GrowStats(const TensorForestParams& params, int32 depth);
+
+  // Function called by AddSplit for subclasses to initialize stats for a split.
+  virtual void AddSplitStats(const InputTarget* target, int example) = 0;
+
+  virtual void RemoveSplitStats(int split_num) = 0;
+
+  // Function called by Clear for subclasses to clear their state.
+  virtual void ClearInternal() = 0;
+
+  std::vector<decision_trees::BinaryNode> splits_;
+  std::vector<std::unique_ptr<DecisionNodeEvaluator>> evaluators_;
+
+  float weight_sum_;
+
+  const int32 depth_;
+
+  const TensorForestParams& params_;
+
+  // We cache these because they're used often.
+  const int split_after_samples_;
+  const int num_splits_to_consider_;
+
+  const int32 num_outputs_;
+};
+
+// Don't track anything, useful for systems that want to track split
+// candidates but train the model in some other way.
+class SimpleStats : public GrowStats {
+ public:
+  SimpleStats(const TensorForestParams& params, int32 depth)
+      : GrowStats(params, depth) {}
+  void Initialize() override {}
+
+  void ExtractFromProto(const FertileSlot& slot) override {}
+  void PackToProto(FertileSlot* slot) const override {}
+
+  void AddExample(const std::unique_ptr<TensorDataSet>& input_data,
+                  const InputTarget* target, int example) override {
+    weight_sum_ += target->GetTargetWeight(example);
+  }
+
+  bool BestSplit(SplitCandidate* best) const override { return false; }
+
+  bool IsFinished() const override {
+    return weight_sum_ >= split_after_samples_;
+  }
+
+ protected:
+  void AddSplitStats(const InputTarget* target, int example) override {}
+  void RemoveSplitStats(int split_num) override {}
+  void ClearInternal() override {}
+};
+
+// Tracks the sum and square of one side of a split for each Gini calculation.
+class RunningGiniScores {
+ public:
+  float sum(int split) const { return sum_[split]; }
+  float square(int split) const { return square_[split]; }
+
+  void update(int split, float old_val, float weight) {
+    sum_[split] += weight;
+    const float new_val = old_val + weight;
+    square_[split] = square_[split] - old_val * old_val + new_val * new_val;
+  }
+
+  void add_split() {
+    sum_.push_back(0);
+    square_.push_back(0);
+  }
+
+  void remove_split(int i) {
+    sum_.erase(sum_.begin() + i);
+    square_.erase(square_.begin() + i);
+  }
+
+ private:
+  std::vector<float> sum_;
+  std::vector<float> square_;
+};
+
+class ClassificationStats : public GrowStats {
+ public:
+  ClassificationStats(const TensorForestParams& params, int32 depth);
+
+  bool IsFinished() const override;
+
+  void AddExample(const std::unique_ptr<TensorDataSet>& input_data,
+                  const InputTarget* target, int example) override;
+
+  void AdditionalInitializationExample(
+      const std::unique_ptr<TensorDataSet>& input_data,
+      const InputTarget* target, int example) override;
+
+  bool IsInitialized() const override {
+    return weight_sum_ > 0 || (splits_.size() == num_splits_to_consider_ &&
+                               half_initialized_splits_.empty());
+  }
+
+ protected:
+  virtual float GiniScore(int split, float* left_sum,
+                          float* right_sum) const = 0;
+  virtual int num_outputs_seen() const = 0;
+  virtual float left_count(int split, int class_num) const = 0;
+  virtual float right_count(int split, int class_num) const = 0;
+
+  virtual void ClassificationAddLeftExample(
+      int split, int64 int_label, float weight) = 0;
+  virtual void ClassificationAddTotalExample(int64 int_label, float weight) = 0;
+
+  virtual void ClassificationAddSplitStats() = 0;
+  virtual void ClassificationRemoveSplitStats(int split) = 0;
+
+  void AddSplitStats(const InputTarget* target, int example) override {
+    if (left_gini_ != nullptr) {
+      left_gini_->add_split();
+      right_gini_->add_split();
+    }
+    if (params_.initialize_average_splits()) {
+      if (splits_[splits_.size() - 1].has_inequality_left_child_test()) {
+        half_initialized_splits_[splits_.size() - 1] =
+            target->GetTargetAsClassIndex(example, 0);
+      }
+    }
+    ClassificationAddSplitStats();
+  }
+  void RemoveSplitStats(int split) override {
+    if (left_gini_ != nullptr) {
+      left_gini_->remove_split(split);
+      right_gini_->remove_split(split);
+    }
+    ClassificationRemoveSplitStats(split);
+  }
+
+  // Virtual so we can override these to test.
+  virtual void CheckFinishEarly();
+  virtual void CheckFinishEarlyHoeffding();
+  virtual void CheckFinishEarlyBootstrap();
+
+  virtual void CheckPrune();
+
+  // Implement SplitPruningStrategyType::SPLIT_PRUNE_HOEFFDING.
+  void CheckPruneHoeffding();
+
+  // Return the gini score, possibly being calculated from sums and squares
+  // saved in left_gini_ and right_gini_, otherwise calculated from raw counts.
+  float MaybeCachedGiniScore(int split, float* left_sum,
+                             float* right_sum) const;
+
+  // Initialize the sum and squares of left_gini_ and right_gini_ for given
+  // split and value (being extracted from a proto), if left_gini_ isn't null.
+  void MaybeInitializeRunningCount(int split, float val) {
+    if (left_gini_ != nullptr) {
+      left_gini_->update(split, 0, val);
+      right_gini_->update(split, 0, val);
+    }
+  }
+
+  int NumBootstrapSamples() const;
+
+  // Populate *weights with the smoothed per-class frequencies needed to
+  // initialize a DistributionSampler.
+  void MakeBootstrapWeights(int index, std::vector<float>* weights);
+
+  // Accessors for RunningGiniScores objects, for testing.
+  virtual const std::unique_ptr<RunningGiniScores>& get_left_gini() const {
+    return left_gini_;
+  }
+  virtual const std::unique_ptr<RunningGiniScores>& get_right_gini() const {
+    return right_gini_;
+  }
+
+ private:
+  // Tracks how many check_every_samples epochs we've seen go by in weight_sum.
+  int32 finish_sample_epoch_;
+  int32 finish_check_every_;
+  int32 prune_sample_epoch_;
+  int32 prune_check_every_;
+  bool finish_early_;
+  int32 min_split_samples_;
+  float dominate_fraction_;
+  float prune_fraction_;
+
+  // When using SPLIT_PRUNE_HOEFFDING, we precompute and store
+  // 0.5 * ln(1 / (1.0 - dominate_fraction_)).
+  float half_ln_dominate_frac_;
+
+  std::unique_ptr<random::PhiloxRandom> single_rand_;
+  std::unique_ptr<random::SimplePhilox> rng_;
+
+  std::unique_ptr<RunningGiniScores> left_gini_;
+  std::unique_ptr<RunningGiniScores> right_gini_;
+
+  // Stores split number -> class that was first seen.
+  std::unordered_map<int, int32> half_initialized_splits_;
+};
+
+// Tracks classification stats by storing class counts densely.
+class DenseClassificationGrowStats : public ClassificationStats {
+ public:
+  DenseClassificationGrowStats(const TensorForestParams& params, int32 depth)
+      : ClassificationStats(params, depth) {}
+
+  void Initialize() override {
+    Clear();
+    total_counts_.resize(num_outputs_);
+  }
+
+  void ExtractFromProto(const FertileSlot& slot) override;
+  void PackToProto(FertileSlot* slot) const override;
+
+  bool BestSplit(SplitCandidate* best) const override;
+
+ protected:
+  void ClassificationAddSplitStats() override {
+    left_counts_.resize(num_outputs_ * num_splits());
+  }
+  void ClassificationRemoveSplitStats(int split_num) override {
+    left_counts_.erase(left_counts_.begin() + num_outputs_ * split_num,
+                       left_counts_.begin() + num_outputs_ * (split_num + 1));
+  }
+  void ClearInternal() override {
+    total_counts_.clear();
+    left_counts_.clear();
+    num_outputs_seen_ = 0;
+  }
+
+  int num_outputs_seen() const override {
+    return num_outputs_seen_;
+  }
+
+  void ClassificationAddLeftExample(int split, int64 int_label,
+                                    float weight) override {
+    mutable_left_count(split, int_label) += weight;
+  }
+  void ClassificationAddTotalExample(int64 int_label, float weight) override {
+    num_outputs_seen_ += total_counts_[int_label] == 0 && weight > 0;
+    total_counts_[int_label] += weight;
+  }
+
+  float GiniScore(int split, float* left_sum, float* right_sum) const override;
+
+  float left_count(int split, int class_num) const override {
+    return left_counts_[split * num_outputs_ + class_num];
+  }
+  float right_count(int split, int class_num) const override {
+    return total_counts_[class_num] -
+           left_counts_[split * num_outputs_ + class_num];
+  }
+
+ private:
+  inline float& mutable_left_count(int split, int class_num) {
+    return left_counts_[split * num_outputs_ + class_num];
+  }
+  // Total class counts seen at this leaf
+  std::vector<float> total_counts_;
+
+  // Also track the number of classes seen for not splitting pure leaves.
+  int num_outputs_seen_;
+
+  // Left-branch taken class counts at this leaf for each split.
+  // This is a flat vector for memory-performance reasons.
+  // left_counts_[i * num_outputs_ + j] has the j-th class count for split i.
+  std::vector<float> left_counts_;
+};
+
+// Tracks classification stats by storing class counts sparsely.
+class SparseClassificationGrowStats : public ClassificationStats {
+ public:
+  SparseClassificationGrowStats(const TensorForestParams& params, int32 depth)
+      : ClassificationStats(params, depth) {}
+
+  void Initialize() override {
+    Clear();
+  }
+
+  void ExtractFromProto(const FertileSlot& slot) override;
+  void PackToProto(FertileSlot* slot) const override;
+
+  bool BestSplit(SplitCandidate* best) const override;
+
+ protected:
+  void ClassificationAddSplitStats() override {
+    left_counts_.resize(num_splits());
+  }
+  void ClassificationRemoveSplitStats(int split_num) override {
+    left_counts_.erase(left_counts_.begin() + split_num,
+                       left_counts_.begin() + (split_num + 1));
+  }
+  void ClearInternal() override {
+    total_counts_.clear();
+    left_counts_.clear();
+  }
+
+  int num_outputs_seen() const override { return total_counts_.size(); }
+
+  void ClassificationAddLeftExample(int split, int64 int_label,
+                                    float weight) override {
+    left_counts_[split][int_label] += weight;
+  }
+  void ClassificationAddTotalExample(int64 int_label, float weight) override {
+    total_counts_[int_label] += weight;
+  }
+
+  float GiniScore(int split, float* left_sum, float* right_sum) const override;
+
+  float left_count(int split, int class_num) const override {
+    return left_counts_[split].at(class_num);
+  }
+  float right_count(int split, int class_num) const override {
+    return total_counts_.at(class_num) - left_counts_[split].at(class_num);
+  }
+
+ private:
+  // Total class counts seen at this leaf
+  std::unordered_map<int, float> total_counts_;
+
+  // Left-branch taken class counts at this leaf for each split.
+  // left_counts_[i][j] has the j-th class count for split i.
+  std::vector<std::unordered_map<int, float>> left_counts_;
+};
+
+// Tracks regression stats using least-squares minimization.
+class LeastSquaresRegressionGrowStats : public GrowStats {
+ public:
+  LeastSquaresRegressionGrowStats(const TensorForestParams& params, int32 depth)
+      : GrowStats(params, depth) {}
+
+  void Initialize() override {
+    Clear();
+    total_sum_.resize(num_outputs_);
+    total_sum_squares_.resize(num_outputs_);
+  }
+
+  void ExtractFromProto(const FertileSlot& slot) override;
+  void PackToProto(FertileSlot* slot) const override;
+
+  void AddExample(const std::unique_ptr<TensorDataSet>& input_data,
+                  const InputTarget* target, int example) override;
+  bool BestSplit(SplitCandidate* best) const override;
+  bool IsFinished() const override;
+
+ protected:
+  // Returns the variance of split.
+  float SplitVariance(int split) const;
+
+  void AddSplitStats(const InputTarget* target, int example) override {
+    left_sums_.resize(num_outputs_ * num_splits());
+    left_squares_.resize(num_outputs_ * num_splits());
+    left_counts_.push_back(0);
+  }
+  void RemoveSplitStats(int split_num) override {
+    left_sums_.erase(left_sums_.begin() + num_outputs_ * split_num,
+                       left_sums_.begin() + num_outputs_ * (split_num + 1));
+    left_squares_.erase(left_squares_.begin() + num_outputs_ * split_num,
+                       left_squares_.begin() + num_outputs_ * (split_num + 1));
+    left_counts_.erase(left_counts_.begin() + split_num,
+                       left_counts_.begin() + (split_num + 1));
+  }
+
+  void ClearInternal() override {
+    total_sum_.clear();
+    total_sum_squares_.clear();
+    left_sums_.clear();
+    left_squares_.clear();
+  }
+
+ private:
+  // Convenience methods for accessing the flat count vectors.
+  inline const float& left_sum(int split, int output_num) const {
+    return left_sums_[split * num_outputs_ + output_num];
+  }
+  inline float& left_sum(int split, int output_num) {
+    return left_sums_[split * num_outputs_ + output_num];
+  }
+  inline const float& left_square(int split, int output_num) const {
+    return left_squares_[split * num_outputs_ + output_num];
+  }
+  inline float& left_square(int split, int output_num) {
+    return left_squares_[split * num_outputs_ + output_num];
+  }
+
+  // Total sums and squares seen at this leaf.
+  // sum[i] is the sum of the i-th output.
+  std::vector<float> total_sum_;
+  std::vector<float> total_sum_squares_;
+
+  // Per-split sums and squares, stored flat for performance.
+  // left_sums_[i * num_outputs_ + j] has the j-th sum for split i.
+  std::vector<float> left_sums_;
+  std::vector<float> left_squares_;
+
+  // The number of example seen at each split.
+  std::vector<int64> left_counts_;
+};
+
+
+}  // namespace tensorforest
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GROW_STATS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fa959e8373a059bf28b63dcdc974f1155a101429
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc
@@ -0,0 +1,364 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h"
+
+#include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h"
+#include "tensorflow/contrib/tensor_forest/proto/tensor_forest_params.pb.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace {
+
+using tensorflow::tensorforest::GrowStats;
+using tensorflow::tensorforest::TestableInputTarget;
+using tensorflow::tensorforest::FertileSlot;
+using tensorflow::tensorforest::DenseClassificationGrowStats;
+using tensorflow::tensorforest::SparseClassificationGrowStats;
+using tensorflow::tensorforest::LeastSquaresRegressionGrowStats;
+using tensorflow::tensorforest::TensorForestParams;
+using tensorflow::tensorforest::SPLIT_FINISH_BASIC;
+using tensorflow::tensorforest::SPLIT_FINISH_DOMINATE_HOEFFDING;
+using tensorflow::tensorforest::SPLIT_PRUNE_HOEFFDING;
+using tensorflow::decision_trees::BinaryNode;
+using tensorflow::decision_trees::InequalityTest;
+using tensorflow::decision_trees::FeatureId;
+
+BinaryNode MakeSplit(const string& feat, float val) {
+  BinaryNode split;
+  InequalityTest* test = split.mutable_inequality_left_child_test();
+  FeatureId feature_id;
+  feature_id.mutable_id()->set_value(feat);
+  *test->mutable_feature_id() = feature_id;
+  test->mutable_threshold()->set_float_value(val);
+  test->set_type(InequalityTest::LESS_OR_EQUAL);
+
+  return split;
+}
+
+void RunBatch(GrowStats* stats,
+              const TestableInputTarget* target) {
+  std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
+      new tensorflow::tensorforest::TestableDataSet(
+          {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, 2));
+
+  stats->AddSplit(MakeSplit("0", 10.0), dataset, target, 0);
+  stats->AddSplit(MakeSplit("1", 4.0), dataset, target, 0);
+
+  for (int i = 0; i < target->NumItems(); ++i) {
+    stats->AddExample(dataset, target, i);
+  }
+}
+
+TEST(GrowStatsDenseClassificationTest, Basic) {
+  TensorForestParams params;
+  params.set_num_outputs(2);
+  params.mutable_split_after_samples()->set_constant_value(2);
+  params.mutable_num_splits_to_consider()->set_constant_value(2);
+  std::unique_ptr<DenseClassificationGrowStats> stat(
+      new DenseClassificationGrowStats(params, 1));
+  stat->Initialize();
+
+  std::vector<float> labels = {1, 0, 1};
+  std::vector<float> weights = {2.3, 20.3, 1.1};
+  std::unique_ptr<TestableInputTarget> target(
+      new TestableInputTarget(labels, weights, 1));
+
+  RunBatch(stat.get(), target.get());
+  CHECK(stat->IsFinished());
+
+  FertileSlot slot;
+  stat->PackToProto(&slot);
+
+  string serialized = slot.DebugString();
+
+  std::unique_ptr<DenseClassificationGrowStats> new_stat(
+      new DenseClassificationGrowStats(params, 1));
+  new_stat->ExtractFromProto(slot);
+  FertileSlot second_one;
+  new_stat->PackToProto(&second_one);
+  string serialized_again = second_one.DebugString();
+  ASSERT_EQ(serialized_again, serialized);
+}
+
+class TestableRunningStats : public DenseClassificationGrowStats {
+ public:
+  TestableRunningStats(const TensorForestParams& params, int32 depth)
+      : DenseClassificationGrowStats(params, depth) {}
+
+  float test_left_sum(int split) {
+    return get_left_gini()->sum(split);
+  }
+  float test_left_square(int split) {
+    return get_left_gini()->square(split);
+  }
+  float test_right_sum(int split) {
+    return get_right_gini()->sum(split);
+  }
+  float test_right_square(int split) {
+    return get_right_gini()->square(split);
+  }
+};
+
+TEST(GrowStatsDenseClassificationTest, BasicRunningStats) {
+  TensorForestParams params;
+  params.set_num_outputs(2);
+  params.mutable_split_after_samples()->set_constant_value(2);
+  params.mutable_num_splits_to_consider()->set_constant_value(2);
+  params.set_use_running_stats_method(true);
+  std::unique_ptr<TestableRunningStats> stat(
+      new TestableRunningStats(params, 1));
+  stat->Initialize();
+
+  std::vector<float> labels = {1, 0, 1};
+  std::vector<float> weights = {2.3, 20.3, 1.1};
+  std::unique_ptr<TestableInputTarget> target(
+      new TestableInputTarget(labels, weights, 1));
+
+  RunBatch(stat.get(), target.get());
+  CHECK(stat->IsFinished());
+
+  ASSERT_FLOAT_EQ(stat->test_left_sum(0), 2.3 + 20.3 + 1.1);
+  ASSERT_FLOAT_EQ(stat->test_left_square(0), 3.4 * 3.4 + 20.3 * 20.3);
+  ASSERT_FLOAT_EQ(stat->test_right_sum(0), 0.0);
+  ASSERT_FLOAT_EQ(stat->test_right_square(0), 0.0);
+
+  ASSERT_FLOAT_EQ(stat->test_left_sum(1), 2.3 + 20.3);
+  ASSERT_FLOAT_EQ(stat->test_left_square(1), 2.3 * 2.3 + 20.3 * 20.3);
+  ASSERT_FLOAT_EQ(stat->test_right_sum(1), 1.1);
+  ASSERT_FLOAT_EQ(stat->test_right_square(1), 1.1 * 1.1);
+
+  FertileSlot slot;
+  stat->PackToProto(&slot);
+
+  string serialized = slot.DebugString();
+
+  std::unique_ptr<DenseClassificationGrowStats> new_stat(
+      new DenseClassificationGrowStats(params, 1));
+  new_stat->ExtractFromProto(slot);
+  FertileSlot second_one;
+  new_stat->PackToProto(&second_one);
+  string serialized_again = second_one.DebugString();
+  ASSERT_EQ(serialized_again, serialized);
+}
+
+class TestableFinishEarly : public DenseClassificationGrowStats {
+ public:
+  TestableFinishEarly(const TensorForestParams& params, int32 depth)
+      : DenseClassificationGrowStats(params, depth), num_times_called_(0) {}
+
+  int num_times_called_;
+
+ protected:
+  void CheckFinishEarlyHoeffding() override {
+    ++num_times_called_;
+  }
+};
+
+TEST(GrowStatsDenseClassificationTest, TestFinishEarly) {
+  TensorForestParams params;
+  params.set_num_outputs(2);
+  params.mutable_split_after_samples()->set_constant_value(2);
+  params.mutable_num_splits_to_consider()->set_constant_value(2);
+  params.mutable_min_split_samples()->set_constant_value(15);
+  params.mutable_dominate_fraction()->set_constant_value(0.99);
+  auto* finish = params.mutable_finish_type();
+  finish->set_type(SPLIT_FINISH_DOMINATE_HOEFFDING);
+  finish->mutable_check_every_steps()->set_constant_value(5);
+  std::unique_ptr<TestableFinishEarly> stat(new TestableFinishEarly(params, 1));
+  stat->Initialize();
+
+  std::vector<float> labels = {1, 0, 1};
+  std::vector<float> weights = {1, 1, 1};
+  std::unique_ptr<TestableInputTarget> target(
+      new TestableInputTarget(labels, weights, 1));
+  std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
+      new tensorflow::tensorforest::TestableDataSet(
+          {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, 2));
+
+  // Run through the 3 examples
+  RunBatch(stat.get(), target.get());
+
+  ASSERT_EQ(stat->num_times_called_, 0);
+
+  // Go over min_split_samples.
+  for (int i = 0; i < 13; ++i) {
+    stat->AddExample(dataset, target.get(), 0);
+  }
+
+  ASSERT_EQ(stat->num_times_called_, 1);
+
+  // More examples up to 55.
+  for (int i = 0; i < 39; ++i) {
+    stat->AddExample(dataset, target.get(), 0);
+  }
+
+  ASSERT_EQ(stat->num_times_called_, 9);
+}
+
+
+TEST(GrowStatsDenseClassificationTest, TestCheckPruneHoeffding) {
+  TensorForestParams params;
+  params.set_num_outputs(2);
+  params.mutable_split_after_samples()->set_constant_value(2000);
+  params.mutable_num_splits_to_consider()->set_constant_value(2);
+  params.mutable_min_split_samples()->set_constant_value(15);
+  params.mutable_dominate_fraction()->set_constant_value(0.99);
+  auto* finish = params.mutable_finish_type();
+  finish->set_type(SPLIT_FINISH_BASIC);
+  finish->mutable_check_every_steps()->set_constant_value(100);
+  params.mutable_pruning_type()->set_type(SPLIT_PRUNE_HOEFFDING);
+  params.mutable_pruning_type()->mutable_prune_every_samples()
+      ->set_constant_value(1);
+
+  // On each iteration, we add two examples, one of class 0 and one
+  // of class 1.  Split #0 classifies them perfectly, while split #1
+  // sends them both to the left.
+  std::vector<float> labels = {0, 1};
+  std::vector<float> weights = {1, 1};
+  TestableInputTarget target(labels, weights, 1);
+  std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
+      new tensorflow::tensorforest::TestableDataSet(
+          {-1.0, -1.0, 1.0, -1.0}, 2));
+
+  DenseClassificationGrowStats stats(params, 1);
+  stats.Initialize();
+  stats.AddSplit(MakeSplit("0", 0.0), dataset, &target, 0);
+  stats.AddSplit(MakeSplit("1", 0.0), dataset, &target, 0);
+
+  // Math time!
+  // After 2n samples,
+  // split 0 has smoothed counts (n+1,1);(1,n+1) and
+  // split 1 has smoothed counts (n+1,n+1);(1,1)
+  // split 0 smoothed ginis are both 1 - (n+1)^2/(n+2)^2 - 1/(n+2)^2 and
+  // split 1 smoothed ginis are 1 - 2 (n+1)^2 / (2n+2)^2 and 1 - 2 (1/4) = 1/2
+  // split 0 weighted smoothed ginis are both n (1 - (n^2 + 2n + 2) / (n+2)^2)
+  // split 1 weighted smoothed ginis are 0 and 2n (1 - 2(n+1)^2 / (2n+2)^2)
+  // split 0 split score = 2n (1 - (n^2 + 2n + 2) / (n+2)^2)
+  // split 1 spilt score = 2n (1 - 2(n+1)^2 / (2n+2)^2)
+  // split 1 score - split 0 score =
+  //    2n ( (n^2 + 2n + 2) / (n+2)^2 - 2(n+1)^2 / (2n+2)^2 )
+  //  = 2n ( (n^2 + 2n + 2) (2n+2)^2 - 2(n+1)^2 (n+2)^2 ) / ((n+2)^2 (2n+2)^2 )
+  //  = 2n ((n^2+2n+2)(4n^2+8n+4) - 2(n^2+2n+1)(n^2+4n+4)) / ((n+2)^2 (2n+2)^2)
+  //  = 2n (4n^4+8n^3+4n^2+8n^3+16n^2+8n+8n^2+16n+8
+  //         - (2n^4+8n^3+8n^2+4n^3+16n^2+16n+2n^2+8n+8)) / ((n+2)^2 (2n+2)^2)
+  //  = 2n (2n^4 + 4n^3 + 2n^2) / ((n+2)^2 (2n+2)^2)
+  //  = 4n^3 (n^2 + 2n + 1) / ((n+2)^2 (2n+2)^2)
+  //  = n^3  / (n+2)^2
+  //  Meanwhile, after 2n samples,
+  //  epsilon = 2n (1 - 1/2) sqrt(0.5 ln(1/0.01) / 2n)
+  //          = n sqrt( ln(10) / 2n)
+  //  Graphical comparison says that epsilon is greater between 0 and 4.5,
+  //  and then the split score difference is greater for n >= 5.
+  // n = 1
+  stats.AddExample(dataset, &target, 0);
+  stats.AddExample(dataset, &target, 1);
+  ASSERT_EQ(stats.num_splits(), 2);
+
+  // n = 2
+  stats.AddExample(dataset, &target, 0);
+  stats.AddExample(dataset, &target, 1);
+  ASSERT_EQ(stats.num_splits(), 2);
+
+  // n = 3
+  stats.AddExample(dataset, &target, 0);
+  stats.AddExample(dataset, &target, 1);
+  ASSERT_EQ(stats.num_splits(), 2);
+
+  // n = 4
+  stats.AddExample(dataset, &target, 0);
+  stats.AddExample(dataset, &target, 1);
+  ASSERT_EQ(stats.num_splits(), 2);
+
+  // n = 5
+  stats.AddExample(dataset, &target, 0);
+  stats.AddExample(dataset, &target, 1);
+  ASSERT_EQ(stats.num_splits(), 1);
+
+  // n = 6
+  stats.AddExample(dataset, &target, 0);
+  stats.AddExample(dataset, &target, 1);
+  ASSERT_EQ(stats.num_splits(), 1);
+}
+
+TEST(GrowStatsLeastSquaresRegressionTest, Basic) {
+  TensorForestParams params;
+  params.set_num_outputs(1);
+  params.mutable_split_after_samples()->set_constant_value(2);
+  params.mutable_num_splits_to_consider()->set_constant_value(2);
+  std::unique_ptr<LeastSquaresRegressionGrowStats> stat(
+      new LeastSquaresRegressionGrowStats(params, 1));
+  stat->Initialize();
+
+  std::vector<float> labels = {2.3, 5.6, 1.1};
+  std::unique_ptr<TestableInputTarget> target(
+      new TestableInputTarget(labels, {}, 1));
+  std::vector<int> branches = {1, 0, 1, 1, 0, 0};
+
+  RunBatch(stat.get(), target.get());
+  CHECK(stat->IsFinished());
+
+  FertileSlot slot;
+  stat->PackToProto(&slot);
+
+  string serialized = slot.DebugString();
+
+  std::unique_ptr<LeastSquaresRegressionGrowStats> new_stat(
+      new LeastSquaresRegressionGrowStats(params, 1));
+  new_stat->ExtractFromProto(slot);
+  FertileSlot second_one;
+  new_stat->PackToProto(&second_one);
+  string serialized_again = second_one.DebugString();
+
+  ASSERT_EQ(serialized_again, serialized);
+}
+
+
+TEST(GrowStatsSparseClassificationTest, Basic) {
+  TensorForestParams params;
+  params.set_num_outputs(2);
+  params.mutable_split_after_samples()->set_constant_value(2);
+  params.mutable_num_splits_to_consider()->set_constant_value(2);
+  std::unique_ptr<SparseClassificationGrowStats> stat(
+      new SparseClassificationGrowStats(params, 1));
+  stat->Initialize();
+
+  std::vector<float> labels = {100, 1000, 1};
+  std::vector<float> weights = {2.3, 20.3, 1.1};
+  std::unique_ptr<TestableInputTarget> target(
+      new TestableInputTarget(labels, weights, 1));
+  std::vector<int> branches = {1, 0, 1, 1, 0, 0};
+
+  RunBatch(stat.get(), target.get());
+  CHECK(stat->IsFinished());
+
+  FertileSlot slot;
+  stat->PackToProto(&slot);
+
+  string serialized = slot.DebugString();
+
+  std::unique_ptr<SparseClassificationGrowStats> new_stat(
+      new SparseClassificationGrowStats(params, 1));
+  new_stat->ExtractFromProto(slot);
+  FertileSlot second_one;
+  new_stat->PackToProto(&second_one);
+  string serialized_again = second_one.DebugString();
+  ASSERT_EQ(serialized_again, serialized);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..14cb19d36f33e478728aba3e28b7bea11b691d34
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc
@@ -0,0 +1,156 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_data.h"
+#include "tensorflow/contrib/decision_trees/proto/generic_tree_model_extensions.pb.h"
+#include "tensorflow/contrib/tensor_forest/kernels/tree_utils.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+
+namespace tensorflow {
+namespace tensorforest {
+namespace {
+
+const int32 SPARSE_DEFAULT = 0;
+
+bool DecideInequalityTest(const decision_trees::InequalityTest& test,
+                          float value) {
+  float bias = test.threshold().float_value();
+  switch (test.type()) {
+    case decision_trees::InequalityTest::LESS_OR_EQUAL:
+      return value <= bias;
+
+    case decision_trees::InequalityTest::LESS_THAN:
+      return value < bias;
+
+    case decision_trees::InequalityTest::GREATER_OR_EQUAL:
+      return value >= bias;
+
+    case decision_trees::InequalityTest::GREATER_THAN:
+      return value > bias;
+
+    default:
+      return false;
+  }
+}
+
+bool DecideMatchingValuesTest(const decision_trees::MatchingValuesTest& test,
+                              float value) {
+  for (const decision_trees::Value& test_value : test.value()) {
+    if (test_value.float_value() == value) {
+      return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace
+
+bool TensorDataSet::Decide(const decision_trees::BinaryNode& node,
+                           int example) const {
+  // TODO(gilberth): Support missing values.
+  float val = 0;
+  const auto& test = node.inequality_left_child_test();
+
+  if (test.has_oblique()) {
+    for (int i = 0; i < test.oblique().features_size(); ++i) {
+      val += test.oblique().weights(i) *
+             GetExampleValue(example, test.oblique().features(i));
+    }
+  } else {
+    val = GetExampleValue(example, test.feature_id());
+  }
+
+  if (node.has_inequality_left_child_test()) {
+    return DecideInequalityTest(node.inequality_left_child_test(), val);
+  } else {
+    decision_trees::MatchingValuesTest test;
+    if (node.custom_left_child_test().UnpackTo(&test)) {
+      return DecideMatchingValuesTest(test, val);
+    } else {
+      return false;
+    }
+  }
+}
+
+float TensorDataSet::GetExampleValue(
+    int example, const decision_trees::FeatureId& feature_id) const {
+  int32 feature;
+  safe_strto32(feature_id.id().value(), &feature);
+  if (feature >= input_spec_.dense_features_size()) {
+    return FindSparseValue(*sparse_indices_, *sparse_values_, example, feature);
+  } else {
+    return (*dense_data_)(example, feature);
+  }
+}
+
+float TensorDataSet::GetExampleValue(int example, int32 feature_id) const {
+  if (feature_id >= input_spec_.dense_features_size()) {
+    return FindSparseValue(*sparse_indices_, *sparse_values_, example,
+                           feature_id);
+  } else {
+    return (*dense_data_)(example, feature_id);
+  }
+}
+
+void TensorDataSet::set_input_tensors(const Tensor& dense,
+                                      const Tensor& sparse_indices,
+                                      const Tensor& sparse_values,
+                                      const Tensor& sparse_shape) {
+  if (dense.shape().dims() == 2) {
+    dense_data_.reset(new DenseStorageType(dense.tensor<float, 2>()));
+  }
+  if (sparse_indices.shape().dims() == 2) {
+    sparse_indices_.reset(new SparseIndicesStorageType(
+        sparse_indices.tensor<int64, 2>()));
+    sparse_values_.reset(new SparseValuesStorageType(
+        sparse_values.tensor<float, 1>()));
+    sparse_batch_size_ = sparse_shape.tensor<int64, 1>()(0);
+  }
+  original_dense_tensor_ = dense;
+}
+
+void TensorDataSet::RandomSample(int example,
+                                 decision_trees::FeatureId* feature_id,
+                                 float* bias, int* type) const {
+  int32 num_total_features = input_spec_.dense_features_size();
+  int64 sparse_input_start;
+  if (sparse_indices_ != nullptr) {
+    const int32 num_sparse = tensorforest::GetNumSparseFeatures(
+        *sparse_indices_, example, &sparse_input_start);
+    if (sparse_input_start >= 0) {
+      num_total_features += num_sparse;
+    }
+  }
+  int rand_feature = rng_->Uniform(num_total_features);
+  if (rand_feature < available_features_.size()) {  // it's dense.
+    *feature_id = available_features_[rand_feature];
+    *type = input_spec_.GetDenseFeatureType(rand_feature);
+  } else {
+    const int32 sparse_index =
+        sparse_input_start + rand_feature - input_spec_.dense_features_size();
+    const int32 saved_index =
+        (*sparse_indices_)(sparse_index, 1) + input_spec_.dense_features_size();
+    *feature_id = decision_trees::FeatureId();
+    feature_id->mutable_id()->set_value(strings::StrCat(saved_index));
+
+    // TODO(gilberth): Remove this shortcut when different sparse types are
+    // allowed.
+    *type = input_spec_.sparse(0).original_type();
+  }
+
+  *bias = GetExampleValue(example, *feature_id);
+}
+
+}  // namespace tensorforest
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..e3d4edbf8a512a027e4b67916d1f2ad3f347a18b
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
@@ -0,0 +1,126 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_DATA_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_DATA_H_
+#include <ctime>
+#include <unordered_map>
+#include "google/protobuf/any.pb.h"
+#include "google/protobuf/wrappers.pb.h"
+#include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
+#include "tensorflow/contrib/tensor_forest/kernels/data_spec.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+typedef TTypes<const float, 2>::ConstTensor DenseStorageType;
+typedef TTypes<const int64, 2>::ConstTensor SparseIndicesStorageType;
+typedef TTypes<const float, 1>::ConstTensor SparseValuesStorageType;
+
+class TensorDataSet {
+ public:
+  TensorDataSet(const tensorforest::TensorForestDataSpec& input_spec,
+                int32 seed)
+      : dense_data_(nullptr),
+        sparse_indices_(nullptr),
+        sparse_values_(nullptr),
+        input_spec_(input_spec),
+        split_sampling_random_seed_(seed) {
+    int column_count = 0;
+    for (int i = 0; i < input_spec_.dense_size(); ++i) {
+      for (int j = 0; j < input_spec_.dense(i).size(); ++j) {
+        decision_trees::FeatureId id;
+        id.mutable_id()->set_value(strings::StrCat(column_count));
+        available_features_.push_back(id);
+        ++column_count;
+      }
+    }
+
+    // Set up the random number generator.
+    if (split_sampling_random_seed_ == 0) {
+      uint64 time_seed = static_cast<uint64>(std::clock());
+      single_rand_ = std::unique_ptr<random::PhiloxRandom>(
+          new random::PhiloxRandom(time_seed));
+    } else {
+      single_rand_ = std::unique_ptr<random::PhiloxRandom>(
+          new random::PhiloxRandom(split_sampling_random_seed_));
+    }
+
+    rng_ = std::unique_ptr<random::SimplePhilox>(
+        new random::SimplePhilox(single_rand_.get()));
+  }
+  virtual ~TensorDataSet() {}
+
+  void set_input_tensors(const Tensor& dense, const Tensor& sparse_indices,
+                         const Tensor& sparse_values,
+                         const Tensor& sparse_shape);
+
+  float get_input_value(int offset, int col) {
+    return (*dense_data_)(offset, col);
+  }
+
+  int NumItems() const {
+    if (dense_data_ != nullptr) {
+      return dense_data_->dimensions()[0];
+    } else if (sparse_indices_ != nullptr) {
+      return sparse_batch_size_;
+    } else {
+      return 0;
+    }
+  }
+
+  // This looks up a value by example and int32_id, which is much faster than
+  // GetFeature.
+  float GetExampleValue(int example,
+                        const decision_trees::FeatureId& feature_id) const;
+
+  // Same as overload with FeatureId, but if you already have the feature as
+  // an int32 you can avoid the atoi32.
+  virtual float GetExampleValue(int example, int32 feature_id) const;
+
+  int num_features() {
+    return available_features_.size();
+  }
+
+  const Tensor& original_tensor() const { return original_dense_tensor_; }
+
+  bool Decide(const decision_trees::BinaryNode& node, int example) const;
+
+  // Randomly samples a feature from example, returns its id in feature_name,
+  // the value in bias, and it's type from input_spec in type.
+  void RandomSample(int example, decision_trees::FeatureId* feature_name,
+                    float* bias, int* type) const;
+
+ private:
+  std::unique_ptr<DenseStorageType> dense_data_;
+  std::unique_ptr<SparseIndicesStorageType> sparse_indices_;
+  std::unique_ptr<SparseValuesStorageType> sparse_values_;
+  int sparse_batch_size_;
+
+  Tensor original_dense_tensor_;
+  const tensorforest::TensorForestDataSpec input_spec_;
+  std::vector<decision_trees::FeatureId> available_features_;
+
+  int32 split_sampling_random_seed_;
+  std::unique_ptr<random::PhiloxRandom> single_rand_;
+  std::unique_ptr<random::SimplePhilox> rng_;
+};
+}  // namespace tensorforest
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_DATA_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h b/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h
new file mode 100644
index 0000000000000000000000000000000000000000..0309ec1de9aec1044eb87e01cafc40c26ba3de14
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h
@@ -0,0 +1,92 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_TARGET_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_TARGET_H_
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+typedef TTypes<float, 1>::UnalignedConstTensor SingleDimStorageType;
+
+// Base class for classes that hold labels and weights. Mostly for testing
+// purposes, because it's inconvenient to construct nasty Eigen::things.
+class InputTarget {
+ public:
+  virtual ~InputTarget() {}
+  virtual int32 GetTargetAsClassIndex(int example_index,
+                                      int target_index) const = 0;
+
+  virtual float GetTargetWeight(int example_index) const = 0;
+
+  virtual float GetTargetAsContinuous(int example_index,
+                                      int target_index) const = 0;
+};
+
+template <typename T>
+class StoredInputTarget : public InputTarget {
+ protected:
+  // Takes ownership of t and w with a std::unique_ptr.
+  StoredInputTarget(const T* t, const T* w, int num_targets)
+      : target_(t), weight_(w), num_targets_(num_targets) {}
+
+  const std::unique_ptr<const T> target_;
+  const std::unique_ptr<const T> weight_;
+  int num_targets_;
+};
+
+// Holds labels/targets and weights. Assumes that tensors are passed as
+// t.unaligned_flat<float>(). For multi-output, specifying the number of
+// outputs will correctly index the flattened data.
+class TensorInputTarget : public StoredInputTarget<SingleDimStorageType> {
+ public:
+  TensorInputTarget(const Tensor& target, const Tensor& weight, int num_targets)
+      : StoredInputTarget(
+            new SingleDimStorageType(target.unaligned_flat<float>()),
+            new SingleDimStorageType(weight.unaligned_flat<float>()),
+            num_targets),
+        original_tensor_(target) {}
+
+  int32 GetTargetAsClassIndex(int example_index,
+                              int target_index) const override {
+    return static_cast<int32>(
+        GetTargetAsContinuous(example_index, target_index));
+  }
+
+  float GetTargetWeight(int example_index) const override {
+    const size_t num_weights = weight_->size();
+    return num_weights > 0 && example_index < num_weights
+               ? (*weight_)(example_index)
+               : 1.0;
+  }
+
+  float GetTargetAsContinuous(int example_index,
+                              int target_index) const override {
+    QCHECK_LT(target_index, num_targets_);
+    return (*target_)(example_index * num_targets_ + target_index);
+  }
+
+  const Tensor& original_tensor() const {
+    return original_tensor_;
+  }
+
+ protected:
+  Tensor original_tensor_;
+};
+}  // namespace tensorforest
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_TARGET_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d43c068e462ff78b114fb29bd8cf0ee0c6080fcd
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc
@@ -0,0 +1,165 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+using decision_trees::Leaf;
+
+std::unique_ptr<LeafModelOperator>
+LeafModelOperatorFactory::CreateLeafModelOperator(
+    const TensorForestParams& params) {
+  switch (params.leaf_type()) {
+    case MODEL_DENSE_CLASSIFICATION:
+      return std::unique_ptr<LeafModelOperator>(
+          new DenseClassificationLeafModelOperator(params));
+
+    case MODEL_SPARSE_CLASSIFICATION:
+      return std::unique_ptr<LeafModelOperator>(
+          new SparseClassificationLeafModelOperator(params));
+
+    case MODEL_SPARSE_OR_DENSE_CLASSIFICATION:
+      return std::unique_ptr<LeafModelOperator>(
+          new SparseOrDenseClassificationLeafModelOperator(params));
+
+    case MODEL_REGRESSION:
+      return std::unique_ptr<LeafModelOperator>(
+          new RegressionLeafModelOperator(params));
+
+    default:
+      LOG(ERROR) << "Unknown model operator: " << params.leaf_type();
+      return nullptr;
+  }
+}
+
+// ------------------------ Dense ----------------------------- //
+float DenseClassificationLeafModelOperator::GetOutputValue(
+    const decision_trees::Leaf& leaf, int32 o) const {
+  return leaf.vector().value(o).float_value();
+}
+
+void DenseClassificationLeafModelOperator::UpdateModel(
+    Leaf* leaf, const InputTarget* target, int example) const {
+  const int32 int_label = target->GetTargetAsClassIndex(example, 0);
+  QCHECK_LT(int_label, params_.num_outputs())
+      << "Got label greater than indicated number of classes. Is "
+         "params.num_classes set correctly?";
+  QCHECK_GE(int_label, 0);
+  auto* val = leaf->mutable_vector()->mutable_value(int_label);
+
+  float weight = target->GetTargetWeight(example);
+  val->set_float_value(val->float_value() + weight);
+}
+
+void DenseClassificationLeafModelOperator::InitModel(Leaf* leaf) const {
+  for (int i = 0; i < params_.num_outputs(); ++i) {
+    leaf->mutable_vector()->add_value();
+  }
+}
+
+void DenseClassificationLeafModelOperator::ExportModel(
+    const LeafStat& stat, decision_trees::Leaf* leaf) const {
+  *leaf->mutable_vector() = stat.classification().dense_counts();
+}
+
+// ------------------------- Sparse -------------------------- //
+float SparseClassificationLeafModelOperator::GetOutputValue(
+    const decision_trees::Leaf& leaf, int32 o) const {
+  const auto it = leaf.sparse_vector().sparse_value().find(o);
+  if (it == leaf.sparse_vector().sparse_value().end()) {
+    return 0;  // default value
+  } else {
+    return it->second.float_value();
+  }
+}
+
+void SparseClassificationLeafModelOperator::UpdateModel(
+    Leaf* leaf, const InputTarget* target, int example) const {
+  const int32 int_label = target->GetTargetAsClassIndex(example, 0);
+  QCHECK_LT(int_label, params_.num_outputs())
+      << "Got label greater than indicated number of classes. Is "
+         "params.num_classes set correctly?";
+  QCHECK_GE(int_label, 0);
+  const float weight = target->GetTargetWeight(example);
+
+  auto value_map = leaf->mutable_sparse_vector()->mutable_sparse_value();
+  auto it = value_map->find(int_label);
+  if (it == value_map->end()) {
+    (*value_map)[int_label].set_float_value(weight);
+  } else {
+    it->second.set_float_value(it->second.float_value() + weight);
+  }
+}
+
+void SparseClassificationLeafModelOperator::ExportModel(
+    const LeafStat& stat, decision_trees::Leaf* leaf) const {
+  *leaf->mutable_sparse_vector() = stat.classification().sparse_counts();
+}
+
+// ------------------------- SparseOrDense -------------------------- //
+float SparseOrDenseClassificationLeafModelOperator::GetOutputValue(
+    const decision_trees::Leaf& leaf, int32 o) const {
+  if (leaf.has_vector()) {
+    return dense_->GetOutputValue(leaf, o);
+  } else {
+    return sparse_->GetOutputValue(leaf, o);
+  }
+}
+
+void SparseOrDenseClassificationLeafModelOperator::UpdateModel(
+    Leaf* leaf, const InputTarget* target, int example) const {
+  if (leaf->has_vector()) {
+    return dense_->UpdateModel(leaf, target, example);
+  } else {
+    return sparse_->UpdateModel(leaf, target, example);
+  }
+}
+
+void SparseOrDenseClassificationLeafModelOperator::ExportModel(
+    const LeafStat& stat, decision_trees::Leaf* leaf) const {
+  if (stat.classification().has_dense_counts()) {
+    return dense_->ExportModel(stat, leaf);
+  } else {
+    return sparse_->ExportModel(stat, leaf);
+  }
+}
+
+// ------------------------ Regression ----------------------------- //
+float RegressionLeafModelOperator::GetOutputValue(
+    const decision_trees::Leaf& leaf, int32 o) const {
+  return leaf.vector().value(o).float_value();
+}
+
+void RegressionLeafModelOperator::InitModel(Leaf* leaf) const {
+  for (int i = 0; i < params_.num_outputs(); ++i) {
+    leaf->mutable_vector()->add_value();
+  }
+}
+
+void RegressionLeafModelOperator::ExportModel(
+    const LeafStat& stat, decision_trees::Leaf* leaf) const {
+  leaf->clear_vector();
+  for (int i = 0; i < params_.num_outputs(); ++i) {
+    const float new_val =
+        stat.regression().mean_output().value(i).float_value() /
+        stat.weight_sum();
+    leaf->mutable_vector()->add_value()->set_float_value(new_val);
+  }
+}
+
+
+}  // namespace tensorforest
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.h b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.h
new file mode 100644
index 0000000000000000000000000000000000000000..946a648f22ff4175782c42cc70c59440e6ac0e17
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.h
@@ -0,0 +1,149 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_LEAF_MODEL_OPERATORS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_LEAF_MODEL_OPERATORS_H_
+
+#include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_target.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/params.h"
+#include "tensorflow/contrib/tensor_forest/proto/fertile_stats.pb.h"
+#include "tensorflow/contrib/tensor_forest/proto/tensor_forest_params.pb.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+// Abstract base class for classes that can initialize, get, and update leaf
+// models.
+class LeafModelOperator {
+ public:
+  // Number of outputs is interpreted differently for classification and
+  // regression.  For classification, it's the number of possible classes.
+  // For regression, it's the target dimensions.
+  explicit LeafModelOperator(const TensorForestParams& params)
+      : params_(params) {}
+  virtual ~LeafModelOperator() {}
+
+  // Returns the value of the requested output, which should be
+  // in [0, num_outputs_).  For classification, it's the class count (weighted
+  // number of instances seen).  For regression, it's e.g. the average value.
+  virtual float GetOutputValue(const decision_trees::Leaf& leaf,
+                               int32 o) const = 0;
+
+  // Update the given Leaf's model with the given example.
+  virtual void UpdateModel(decision_trees::Leaf* leaf,
+                           const InputTarget* target, int example) const = 0;
+
+  // Initialize an empty Leaf model.
+  virtual void InitModel(decision_trees::Leaf* leaf) const = 0;
+
+  virtual void ExportModel(const LeafStat& stat,
+                           decision_trees::Leaf* leaf) const = 0;
+
+ protected:
+  const TensorForestParams& params_;
+};
+
+// LeafModelOperator that stores class counts in a dense vector.
+class DenseClassificationLeafModelOperator : public LeafModelOperator {
+ public:
+  explicit DenseClassificationLeafModelOperator(
+      const TensorForestParams& params)
+      : LeafModelOperator(params) {}
+  float GetOutputValue(const decision_trees::Leaf& leaf,
+                       int32 o) const override;
+
+  void UpdateModel(decision_trees::Leaf* leaf, const InputTarget* target,
+                   int example) const override;
+
+  void InitModel(decision_trees::Leaf* leaf) const override;
+
+  void ExportModel(const LeafStat& stat,
+                   decision_trees::Leaf* leaf) const override;
+};
+
+// LeafModelOperator that stores class counts sparsely in a map. Assumes default
+// value for yet-unseen classes is 0.
+class SparseClassificationLeafModelOperator : public LeafModelOperator {
+ public:
+  explicit SparseClassificationLeafModelOperator(
+      const TensorForestParams& params)
+      : LeafModelOperator(params) {}
+  float GetOutputValue(const decision_trees::Leaf& leaf,
+                       int32 o) const override;
+
+  void UpdateModel(decision_trees::Leaf* leaf, const InputTarget* target,
+                   int example) const override;
+
+  void InitModel(decision_trees::Leaf* leaf) const override {}
+
+  void ExportModel(const LeafStat& stat,
+                   decision_trees::Leaf* leaf) const override;
+};
+
+class SparseOrDenseClassificationLeafModelOperator : public LeafModelOperator {
+ public:
+  explicit SparseOrDenseClassificationLeafModelOperator(
+      const TensorForestParams& params)
+      : LeafModelOperator(params),
+        dense_(new DenseClassificationLeafModelOperator(params)),
+        sparse_(new SparseClassificationLeafModelOperator(params)) {}
+  float GetOutputValue(const decision_trees::Leaf& leaf,
+                       int32 o) const override;
+
+  void UpdateModel(decision_trees::Leaf* leaf, const InputTarget* target,
+                   int example) const override;
+
+  void InitModel(decision_trees::Leaf* leaf) const override {}
+
+  void ExportModel(const LeafStat& stat,
+                   decision_trees::Leaf* leaf) const override;
+
+ protected:
+  std::unique_ptr<DenseClassificationLeafModelOperator> dense_;
+  std::unique_ptr<SparseClassificationLeafModelOperator> sparse_;
+};
+
+// LeafModelOperator that stores regression leaf models with constant-value
+// prediction.
+class RegressionLeafModelOperator : public LeafModelOperator {
+ public:
+  explicit RegressionLeafModelOperator(const TensorForestParams& params)
+      : LeafModelOperator(params) {}
+  float GetOutputValue(const decision_trees::Leaf& leaf,
+                       int32 o) const override;
+
+  // TODO(gilberth): Quick experimentation suggests it's not even worth
+  // updating model and just using the seeded values.  Can add this in
+  // with additional_data, though protobuf::Any is slow.  Maybe make it
+  // optional.  Maybe make any update optional.
+  void UpdateModel(decision_trees::Leaf* leaf, const InputTarget* target,
+                   int example) const override {}
+
+  void InitModel(decision_trees::Leaf* leaf) const override;
+
+  void ExportModel(const LeafStat& stat,
+                   decision_trees::Leaf* leaf) const override;
+};
+
+class LeafModelOperatorFactory {
+ public:
+  static std::unique_ptr<LeafModelOperator> CreateLeafModelOperator(
+      const TensorForestParams& params);
+};
+
+}  // namespace tensorforest
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_LEAF_MODEL_OPERATORS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ffd92c01f9a59719e6bb2458c2f28253c364a2e8
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc
@@ -0,0 +1,218 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.h"
+#include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h"
+#include "tensorflow/contrib/tensor_forest/proto/tensor_forest_params.pb.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace {
+
+using tensorflow::decision_trees::Leaf;
+using tensorflow::tensorforest::DenseClassificationLeafModelOperator;
+using tensorflow::tensorforest::LeafModelOperator;
+using tensorflow::tensorforest::SparseClassificationLeafModelOperator;
+using tensorflow::tensorforest::SparseOrDenseClassificationLeafModelOperator;
+using tensorflow::tensorforest::LeafStat;
+using tensorflow::tensorforest::RegressionLeafModelOperator;
+using tensorflow::tensorforest::TestableInputTarget;
+using tensorflow::tensorforest::TensorForestParams;
+
+const int32 kNumClasses = 3;
+
+constexpr char kRegressionStatProto[] =
+  "weight_sum: 3 "
+  "regression { "
+  "mean_output { "
+    "value { "
+    "  float_value: 27 "
+    "} "
+    "value { "
+    "  float_value: 282 "
+    "} "
+    "value { "
+    "  float_value: 10 "
+    "} "
+  "} "
+  "mean_output_squares { "
+    "value {"
+    "  float_value: 245"
+    "}"
+    "value {"
+    "  float_value: 26564"
+    "}"
+    "value {"
+    "  float_value: 46"
+    "}"
+  "}"
+"}";
+
+void TestClassificationNormalUse(const std::unique_ptr<LeafModelOperator>& op) {
+  Leaf l;
+  op->InitModel(&l);
+  // Make sure it was initialized correctly.
+  for (int i = 0; i < kNumClasses; ++i) {
+    EXPECT_EQ(op->GetOutputValue(l, i), 0);
+  }
+
+  std::vector<float> labels = {1, 0, 1};
+  std::vector<float> weights = {2.3, 20.3, 1.1};
+  std::unique_ptr<TestableInputTarget> target(
+      new TestableInputTarget(labels, weights, 1));
+
+  // Update and check value.
+  op->UpdateModel(&l, target.get(), 0);
+  op->UpdateModel(&l, target.get(), 1);
+  op->UpdateModel(&l, target.get(), 2);
+
+  EXPECT_FLOAT_EQ(op->GetOutputValue(l, 1), 3.4);
+}
+
+
+TEST(DenseLeafModelOperatorsTest, NormalUse) {
+  TensorForestParams params;
+  params.set_num_outputs(kNumClasses);
+  std::unique_ptr<LeafModelOperator> op(
+      new DenseClassificationLeafModelOperator(params));
+  TestClassificationNormalUse(op);
+}
+
+TEST(SparseLeafModelOperatorsTest, NormalUse) {
+  TensorForestParams params;
+  params.set_num_outputs(kNumClasses);
+  std::unique_ptr<LeafModelOperator> op(
+      new SparseClassificationLeafModelOperator(params));
+  TestClassificationNormalUse(op);
+}
+
+TEST(DenseLeafModelOperatorsTest, InitWithExisting) {
+  TensorForestParams params;
+  params.set_num_outputs(kNumClasses);
+  std::unique_ptr<LeafModelOperator> op(
+      new DenseClassificationLeafModelOperator(params));
+
+  std::unique_ptr<LeafStat> stat(new LeafStat);
+  stat->mutable_classification()
+      ->mutable_dense_counts()
+      ->add_value()
+      ->set_float_value(1.1);
+  stat->mutable_classification()
+      ->mutable_dense_counts()
+      ->add_value()
+      ->set_float_value(2.2);
+  stat->mutable_classification()
+      ->mutable_dense_counts()
+      ->add_value()
+      ->set_float_value(3.3);
+
+  std::unique_ptr<Leaf> leaf(new Leaf);
+
+  op->ExportModel(*stat, leaf.get());
+
+  // Make sure it was initialized correctly.
+  EXPECT_EQ(leaf->vector().value_size(), kNumClasses);
+  EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 0), 1.1);
+  EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 1), 2.2);
+  EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 2), 3.3);
+}
+
+TEST(SparseOrDenseClassificationLeafModelOperator, InitWithExisting) {
+  TensorForestParams params;
+  params.set_num_outputs(kNumClasses);
+  std::unique_ptr<LeafModelOperator> op(
+      new SparseOrDenseClassificationLeafModelOperator(params));
+
+  std::unique_ptr<LeafStat> stat(new LeafStat);
+  (*stat->mutable_classification()
+        ->mutable_sparse_counts()
+        ->mutable_sparse_value())[0]
+      .set_float_value(1.1);
+  (*stat->mutable_classification()
+        ->mutable_sparse_counts()
+        ->mutable_sparse_value())[1]
+      .set_float_value(2.2);
+  (*stat->mutable_classification()
+        ->mutable_sparse_counts()
+        ->mutable_sparse_value())[2]
+      .set_float_value(3.3);
+
+  std::unique_ptr<Leaf> leaf(new Leaf);
+
+  op->ExportModel(*stat, leaf.get());
+
+  // Make sure it was initialized correctly.
+  EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 0), 1.1);
+  EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 1), 2.2);
+  EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 2), 3.3);
+}
+
+TEST(SparseLeafModelOperatorsTest, InitWithExisting) {
+  TensorForestParams params;
+  params.set_num_outputs(kNumClasses);
+  std::unique_ptr<LeafModelOperator> op(
+      new SparseClassificationLeafModelOperator(params));
+  std::unique_ptr<LeafStat> stat(new LeafStat);
+  (*stat->mutable_classification()
+        ->mutable_sparse_counts()
+        ->mutable_sparse_value())[0]
+      .set_float_value(1.1);
+  (*stat->mutable_classification()
+        ->mutable_sparse_counts()
+        ->mutable_sparse_value())[1]
+      .set_float_value(2.2);
+  (*stat->mutable_classification()
+        ->mutable_sparse_counts()
+        ->mutable_sparse_value())[2]
+      .set_float_value(3.3);
+
+  std::unique_ptr<Leaf> leaf(new Leaf);
+
+  op->ExportModel( *stat, leaf.get());
+
+  // Make sure it was initialized correctly.
+  EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 0), 1.1);
+  EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 1), 2.2);
+  EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 2), 3.3);
+
+  // check default value.
+  EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 100), 0);
+  EXPECT_EQ(leaf->sparse_vector().sparse_value().size(), kNumClasses);
+}
+
+
+TEST(RegressionLeafModelOperatorsTest, NormalUse) {
+  TensorForestParams params;
+  params.set_num_outputs(kNumClasses);
+  std::unique_ptr<LeafModelOperator> op(
+      new RegressionLeafModelOperator(params));
+
+  std::unique_ptr<LeafStat> stat(new LeafStat());
+  const string contents(kRegressionStatProto);
+  ::tensorflow::protobuf::TextFormat::ParseFromString(contents, stat.get());
+
+  std::unique_ptr<Leaf> leaf(new Leaf);
+  op->ExportModel(*stat, leaf.get());
+
+  // Make sure it was initialized correctly.
+  EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 0), 9);
+  EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 1), 94);
+  EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 2), 3.3333333);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/params.cc b/tensorflow/contrib/tensor_forest/kernels/v4/params.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a3b09c17d5173a57b1d8edd2b5afb656dd971755
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/params.cc
@@ -0,0 +1,54 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/tensor_forest/kernels/v4/params.h"
+#include <math.h>
+#include <stdlib.h>
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+float ResolveParam(const DepthDependentParam& param, int32 depth) {
+  float val;
+  switch (param.ParamType_case()) {
+    case DepthDependentParam::kConstantValue:
+      return param.constant_value();
+
+    case DepthDependentParam::kLinear:
+      val = depth * param.linear().slope() + param.linear().y_intercept();
+      return std::min(std::max(val, param.linear().min_val()),
+                      param.linear().max_val());
+
+    case DepthDependentParam::kExponential:
+      return param.exponential().bias() +
+             param.exponential().multiplier() *
+                 static_cast<float>(
+                     pow(param.exponential().base(),
+                         param.exponential().depth_multiplier() * depth));
+
+    case DepthDependentParam::kThreshold:
+      if (depth >= param.threshold().threshold()) {
+        return param.threshold().on_value();
+      } else {
+        return param.threshold().off_value();
+      }
+
+    default:
+      LOG(FATAL) << "unknown parameter type";
+  }
+}
+
+}  // namespace tensorforest
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/params.h b/tensorflow/contrib/tensor_forest/kernels/v4/params.h
new file mode 100644
index 0000000000000000000000000000000000000000..97a9d8d096311faaae774e9e4b2e45f28ed7fa29
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/params.h
@@ -0,0 +1,32 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_PARAMS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_PARAMS_H_
+
+#include "tensorflow/contrib/tensor_forest/proto/tensor_forest_params.pb.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+// Return the value of the given depth-dependent parameter given a leaf's depth.
+float ResolveParam(const DepthDependentParam& param, int32 depth);
+
+
+}  // namespace tensorforest
+}  // namespace tensorflow
+
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_PARAMS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..801881af1368dc33f00b356d12bea07ae3161ef6
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc
@@ -0,0 +1,75 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/tensor_forest/kernels/v4/params.h"
+#include "tensorflow/contrib/tensor_forest/proto/tensor_forest_params.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace {
+
+using tensorflow::tensorforest::DepthDependentParam;
+using tensorflow::tensorforest::ResolveParam;
+
+TEST(ParamsTest, TestConstant) {
+  DepthDependentParam param;
+  param.set_constant_value(10.0);
+
+  ASSERT_EQ(ResolveParam(param, 0), 10.0);
+  ASSERT_EQ(ResolveParam(param, 100), 10.0);
+}
+
+TEST(ParamsTest, TestLinear) {
+  DepthDependentParam param;
+  auto* linear = param.mutable_linear();
+  linear->set_y_intercept(100.0);
+  linear->set_slope(-10.0);
+  linear->set_min_val(23.0);
+  linear->set_max_val(90.0);
+
+  ASSERT_EQ(ResolveParam(param, 0), 90);
+  ASSERT_EQ(ResolveParam(param, 1), 90);
+  ASSERT_EQ(ResolveParam(param, 2), 80);
+
+  ASSERT_EQ(ResolveParam(param, 30), 23);
+}
+
+TEST(ParamsTest, TestExponential) {
+  DepthDependentParam param;
+  auto* expo = param.mutable_exponential();
+  expo->set_bias(100.0);
+  expo->set_base(10.0);
+  expo->set_multiplier(-1.0);
+  expo->set_depth_multiplier(1.0);
+
+  ASSERT_EQ(ResolveParam(param, 0), 99);
+  ASSERT_EQ(ResolveParam(param, 1), 90);
+  ASSERT_EQ(ResolveParam(param, 2), 0);
+}
+
+TEST(ParamsTest, TestThreshold) {
+  DepthDependentParam param;
+  auto* threshold = param.mutable_threshold();
+  threshold->set_on_value(100.0);
+  threshold->set_off_value(10.0);
+  threshold->set_threshold(5.0);
+
+  ASSERT_EQ(ResolveParam(param, 0), 10);
+  ASSERT_EQ(ResolveParam(param, 4), 10);
+  ASSERT_EQ(ResolveParam(param, 5), 100);
+  ASSERT_EQ(ResolveParam(param, 6), 100);
+}
+
+}  // namespace
+
+
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ccc412600c760e6d453d896121e99916129c196c
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc
@@ -0,0 +1,138 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h"
+
+#include <cfloat>
+
+#include "tensorflow/contrib/decision_trees/proto/generic_tree_model_extensions.pb.h"
+#include "tensorflow/contrib/tensor_forest/kernels/tree_utils.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+std::unordered_map<int, CollectionCreator*>
+    SplitCollectionOperatorFactory::factories_;  // NOLINT
+REGISTER_SPLIT_COLLECTION(COLLECTION_BASIC, SplitCollectionOperator);
+
+std::unique_ptr<SplitCollectionOperator>
+SplitCollectionOperatorFactory::CreateSplitCollectionOperator(
+    const TensorForestParams& params) {
+  auto it = factories_.find(params.collection_type());
+  if (it == factories_.end()) {
+    LOG(ERROR) << "Unknown split collection operator: "
+               << params.collection_type();
+    return nullptr;
+  } else {
+    return it->second->Create(params);
+  }
+}
+
+std::unique_ptr<GrowStats> SplitCollectionOperator::CreateGrowStats(
+    int32 node_id, int32 depth) const {
+  switch (params_.stats_type()) {
+    case STATS_DENSE_GINI:
+      return std::unique_ptr<GrowStats>(
+          new DenseClassificationGrowStats(params_, depth));
+
+    case STATS_SPARSE_GINI:
+      return std::unique_ptr<GrowStats>(
+          new SparseClassificationGrowStats(params_, depth));
+
+    case STATS_LEAST_SQUARES_REGRESSION:
+      return std::unique_ptr<GrowStats>(new LeastSquaresRegressionGrowStats(
+          params_, depth));
+
+    default:
+      LOG(ERROR) << "Unknown grow stats type: " << params_.stats_type();
+      return nullptr;
+  }
+}
+
+void SplitCollectionOperator::ExtractFromProto(
+    const FertileStats& stats_proto) {
+  for (int i = 0; i < stats_proto.node_to_slot_size(); ++i) {
+    const auto& slot = stats_proto.node_to_slot(i);
+    stats_[slot.node_id()] = CreateGrowStats(slot.node_id(), slot.depth());
+    stats_[slot.node_id()]->ExtractFromProto(slot);
+  }
+}
+
+void SplitCollectionOperator::PackToProto(FertileStats* stats_proto) const {
+  for (const auto& pair : stats_) {
+    auto* new_slot = stats_proto->add_node_to_slot();
+    new_slot->set_node_id(pair.first);
+    if (params_.checkpoint_stats()) {
+      pair.second->PackToProto(new_slot);
+    }
+    new_slot->set_depth(pair.second->depth());
+  }
+}
+
+void SplitCollectionOperator::InitializeSlot(int32 node_id, int32 depth) {
+  stats_[node_id] = std::unique_ptr<GrowStats>(CreateGrowStats(node_id, depth));
+  stats_[node_id]->Initialize();
+}
+
+void SplitCollectionOperator::AddExample(
+    const std::unique_ptr<TensorDataSet>& input_data, const InputTarget* target,
+    const std::vector<int>& examples, int32 node_id) const {
+  auto* slot = stats_.at(node_id).get();
+  for (int example : examples) {
+    slot->AddExample(input_data, target, example);
+  }
+}
+
+bool SplitCollectionOperator::IsInitialized(int32 node_id) const {
+  return stats_.at(node_id)->IsInitialized();
+}
+
+void SplitCollectionOperator::CreateAndInitializeCandidateWithExample(
+    const std::unique_ptr<TensorDataSet>& input_data, const InputTarget* target,
+    int example, int32 node_id) const {
+  // Assumes split_initializations_per_input == 1.
+  decision_trees::BinaryNode split;
+  float bias;
+  int type;
+  decision_trees::FeatureId feature_id;
+  input_data->RandomSample(example, &feature_id, &bias, &type);
+
+  if (type == kDataFloat) {
+    decision_trees::InequalityTest* test =
+        split.mutable_inequality_left_child_test();
+    *test->mutable_feature_id() = feature_id;
+    test->mutable_threshold()->set_float_value(bias);
+    test->set_type(params_.inequality_test_type());
+  } else if (type == kDataCategorical) {
+    decision_trees::MatchingValuesTest test;
+    *test.mutable_feature_id() = feature_id;
+    test.add_value()->set_float_value(bias);
+    split.mutable_custom_left_child_test()->PackFrom(test);
+  } else {
+    LOG(ERROR) << "Unknown feature type " << type << ", not sure which "
+               << "node type to use.";
+  }
+  stats_.at(node_id)->AddSplit(split, input_data, target, example);
+}
+
+bool SplitCollectionOperator::BestSplit(int32 node_id,
+                                        SplitCandidate* best,
+                                        int32* depth) const {
+  auto* slot = stats_.at(node_id).get();
+  *depth = slot->depth();
+  return slot->BestSplit(best);
+}
+}  // namespace tensorforest
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c21c0bd3443347bdb0102727b15b26754a0ed53
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h
@@ -0,0 +1,133 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_SPLIT_COLLECTION_OPERATORS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_SPLIT_COLLECTION_OPERATORS_H_
+
+#include <vector>
+#include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_data.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_target.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/params.h"
+#include "tensorflow/contrib/tensor_forest/proto/fertile_stats.pb.h"
+#include "tensorflow/contrib/tensor_forest/proto/tensor_forest_params.pb.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+// Class that can initialize and update split collections, and
+// report if one is finished and ready to split.  Designed to be inherited
+// from to implement techniques such as pruning and early/delayed finishing.
+class SplitCollectionOperator {
+ public:
+  explicit SplitCollectionOperator(const TensorForestParams& params)
+      : params_(params) {}
+  virtual ~SplitCollectionOperator() {}
+
+  // Return a new GrowStats object according to stats_type_;
+  virtual std::unique_ptr<GrowStats> CreateGrowStats(int32 node_id,
+                                                     int32 depth) const;
+
+  // Initialize from a previously serialized proto.
+  virtual void ExtractFromProto(const FertileStats& stats);
+
+  // Serialize contents to the given proto.
+  virtual void PackToProto(FertileStats* stats) const;
+
+  // Updates the slot's candidates with the new example.
+  // Assumes slot has been initialized.
+  virtual void AddExample(const std::unique_ptr<TensorDataSet>& input_data,
+                          const InputTarget* target,
+                          const std::vector<int>& examples,
+                          int32 node_id) const;
+
+  // Create a new candidate and initialize it with the given example.
+  virtual void CreateAndInitializeCandidateWithExample(
+      const std::unique_ptr<TensorDataSet>& input_data,
+      const InputTarget* target, int example, int32 node_id) const;
+
+  // Create a new GrowStats for the given node id and initialize it.
+  virtual void InitializeSlot(int32 node_id, int32 depth);
+
+  // Called when the resource is deserialized, possibly needing an
+  // initialization.
+  virtual void MaybeInitialize() {
+    if (stats_.empty()) {
+      InitializeSlot(0, 0);
+    }
+  }
+
+  // Perform any necessary cleanup for any tracked state for the slot.
+  virtual void ClearSlot(int32 node_id) {
+    stats_.erase(node_id);
+  }
+
+  // Return true if slot is fully initialized.
+  virtual bool IsInitialized(int32 node_id) const;
+
+  // Return true if slot is finished.
+  virtual bool IsFinished(int32 node_id) const {
+    return stats_.at(node_id)->IsFinished();
+  }
+
+  // Fill in best with the best split that node_id has, return true if this
+  // was successful, false if no good split was found.
+  virtual bool BestSplit(int32 node_id, SplitCandidate* best,
+                         int32* depth) const;
+
+ protected:
+  const TensorForestParams& params_;
+  std::unordered_map<int32, std::unique_ptr<GrowStats>> stats_;
+};
+
+class CollectionCreator {
+ public:
+  virtual std::unique_ptr<SplitCollectionOperator> Create(
+      const TensorForestParams& params) = 0;
+  virtual ~CollectionCreator() {}
+};
+
+class SplitCollectionOperatorFactory {
+ public:
+  static std::unique_ptr<SplitCollectionOperator> CreateSplitCollectionOperator(
+      const TensorForestParams& params);
+
+  static std::unordered_map<int, CollectionCreator*> factories_;
+};
+
+template <typename T>
+class AnyCollectionCreator : public CollectionCreator {
+ public:
+  AnyCollectionCreator(SplitCollectionType type) {
+    SplitCollectionOperatorFactory::factories_[type] = this;
+  }
+  virtual std::unique_ptr<SplitCollectionOperator> Create(
+      const TensorForestParams& params) {
+    return std::unique_ptr<SplitCollectionOperator>(new T(params));
+  }
+};
+
+#define REGISTER_SPLIT_COLLECTION(name, cls) \
+  namespace {                                \
+  AnyCollectionCreator<cls> creator(name);   \
+  }
+
+}  // namespace tensorforest
+}  // namespace tensorflow
+
+
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_SPLIT_COLLECTION_OPERATORS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0bec198e97e8215d2cfdb9ada5355dd5b0d2d97b
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc
@@ -0,0 +1,87 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h"
+#include <cfloat>
+
+#include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+// When using smoothing but only tracking sum and squares, and we're adding
+// num_classes for smoothing each class, then Gini looks more like this:
+//   Gini = 1 - \sum_i (c_i + 1)^2 / C^2
+//   = 1 - (1 / C^2) ( (\sum_i c_i)^2 + 2 (\sum_i c_i) + (\sum_i 1))
+//   = 1 - (1 / C^2) ( stats.square() + 2 stats.sum() + #_classes)
+//   = 1 - ( stats.square() + 2 stats.sum() + #_classes) / (smoothed_sum *
+//                                                          smoothed_sum)
+//
+//   where
+//   smoothed_sum = stats.sum() + #_classes
+float GiniImpurity(const LeafStat& stats, int32 num_classes) {
+  const float smoothed_sum = num_classes + stats.weight_sum();
+  return 1.0 - (
+      (stats.classification().gini().square()
+       + 2 * stats.weight_sum() + num_classes) / (smoothed_sum * smoothed_sum));
+}
+
+float WeightedGiniImpurity(const LeafStat& stats, int32 num_classes) {
+  return stats.weight_sum() * GiniImpurity(stats, num_classes);
+}
+
+void UpdateGini(LeafStat* stats, float old_val, float weight) {
+  stats->set_weight_sum(stats->weight_sum() + weight);
+  // Equivalent to stats->square() - old_val * old_val + new_val * new_val,
+  // (for new_val = old_val + weight), but more numerically stable.
+  stats->mutable_classification()->mutable_gini()->set_square(
+      stats->classification().gini().square()
+      + weight * weight + 2 * old_val * weight);
+}
+
+
+float Variance(const LeafStat& stats, int output) {
+  if (stats.weight_sum() == 0) {
+    return 0;
+  }
+  const float e_x =
+      stats.regression().mean_output().value(output).float_value()
+      / stats.weight_sum();
+  const auto e_x2 =
+      stats.regression().mean_output_squares().value(output).float_value()
+      / stats.weight_sum();
+  return e_x2 - e_x * e_x;
+}
+
+float TotalVariance(const LeafStat& stats) {
+  float sum = 0;
+  for (int i = 0; i < stats.regression().mean_output().value_size(); ++i) {
+    sum += Variance(stats, i);
+  }
+  return sum;
+}
+
+float SmoothedGini(float sum, float square, int num_classes) {
+  // See comments for GiniImpurity above.
+  const float smoothed_sum = num_classes + sum;
+  return 1.0 -
+         (square + 2 * sum + num_classes) / (smoothed_sum * smoothed_sum);
+}
+
+float WeightedSmoothedGini(float sum, float square, int num_classes) {
+  return sum * SmoothedGini(sum, square, num_classes);
+}
+
+}  // namespace tensorforest
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e002d0414f48a1f409952f56c57b4e37815bca0
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h
@@ -0,0 +1,50 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_STAT_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_STAT_UTILS_H_
+#include "tensorflow/contrib/tensor_forest/proto/fertile_stats.pb.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+// Returns the smoothed, unweighted Gini impurity.
+float GiniImpurity(const LeafStat& stats, int32 num_classes);
+
+// Returns the smoothed, weighted Gini impurity
+float WeightedGiniImpurity(const LeafStat& stats, int32 num_classes);
+
+// Updates the GiniStats given the old and new values of a class count that
+// was updated.
+void UpdateGini(LeafStat* stats, float old_val, float weight);
+
+// Returns the variance in stats for the given output.
+float Variance(const LeafStat& stats, int output);
+
+// Returns the variance sum for all outputs.
+float TotalVariance(const LeafStat& stats);
+
+// ------- functions used by C++ stats classes  -------- //
+// Returns the smoothed gini score given the sum and sum of the squares of the
+// class counts.
+float SmoothedGini(float sum, float square, int num_classes);
+
+// Returns the smoothed gini score weighted by the sum.
+float WeightedSmoothedGini(float sum, float square, int num_classes);
+
+}  // namespace tensorforest
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_STAT_UTILS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h b/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..b6e543b96fd5a00f78555eaf8558f0a95d0a6713
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h
@@ -0,0 +1,74 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_TEST_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_TEST_UTILS_H_
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_data.h"
+#include "tensorflow/contrib/tensor_forest/kernels/v4/input_target.h"
+
+namespace tensorflow {
+namespace tensorforest {
+
+class TestableInputTarget : public StoredInputTarget<std::vector<float>> {
+ public:
+  TestableInputTarget(const std::vector<float>& t, const std::vector<float>& w,
+                      int num_t)
+      : StoredInputTarget(new std::vector<float>(t), new std::vector<float>(w),
+                          num_t) {}
+
+  int NumItems() const {
+    return target_->size();
+  }
+
+  int32 GetTargetAsClassIndex(int example_index,
+                              int target_index) const override {
+    return static_cast<int32>(
+        GetTargetAsContinuous(example_index, target_index));
+  }
+
+  float GetTargetWeight(int example_index) const override {
+    const size_t num_weights = weight_->size();
+    return num_weights > 0 && example_index < num_weights
+               ? (*weight_)[example_index]
+               : 1.0;
+  }
+
+  float GetTargetAsContinuous(int example_index,
+                              int target_index) const override {
+    QCHECK_LT(target_index, num_targets_);
+    return (*target_)[example_index * num_targets_ + target_index];
+  }
+};
+
+
+class TestableDataSet : public TensorDataSet {
+ public:
+  TestableDataSet(const std::vector<float>& data, int num_features)
+      : TensorDataSet(TensorForestDataSpec(), 11),
+        num_features_(num_features),
+        data_(data) {}
+
+  float GetExampleValue(int example, int32 feature_id) const override {
+    return data_[example * num_features_ + feature_id];
+  }
+
+ protected:
+  int num_features_;
+  std::vector<float> data_;
+};
+
+}  // namespace tensorforest
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_TEST_UTILS_H_
diff --git a/tensorflow/contrib/tensor_forest/ops/model_ops.cc b/tensorflow/contrib/tensor_forest/ops/model_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3dca6913f65ae7af0c916306848d4f24d4458170
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/ops/model_ops.cc
@@ -0,0 +1,187 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
+
+namespace tensorforest {
+
+REGISTER_RESOURCE_HANDLE_OP(DecisionTreeResource);
+
+REGISTER_OP("TreeIsInitializedOp")
+    .Input("tree_handle: resource")
+    .Output("is_initialized: bool")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .Doc(R"doc(
+Checks whether a tree has been initialized.
+)doc");
+
+REGISTER_OP("CreateTreeVariable")
+    .Attr("params: string")
+    .Input("tree_handle: resource")
+    .Input("tree_config: string")
+    .SetShapeFn(tensorflow::shape_inference::NoOutputs)
+    .Doc(R"doc(
+Creates a tree  model and returns a handle to it.
+
+params: A serialized TensorForestParams proto.
+tree_handle: handle to the tree resource to be created.
+tree_config: Serialized proto of the tree.
+)doc");
+
+REGISTER_OP("TreeSerialize")
+    .Input("tree_handle: resource")
+    .Output("tree_config: string")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .Doc(R"doc(
+Serializes the tree  to a proto.
+
+tree_handle: The handle to the tree.
+tree_config: Serialized proto of the tree.
+)doc");
+
+REGISTER_OP("TreeDeserialize")
+    .Attr("params: string")
+    .Input("tree_handle: resource")
+    .Input("tree_config: string")
+    .SetShapeFn(tensorflow::shape_inference::NoOutputs)
+    .Doc(R"doc(
+Deserializes a serialized tree config and replaces current tree.
+
+params: A serialized TensorForestParams proto.
+tree_handle: The handle to the tree .
+tree_config: Serialized proto of the .
+)doc");
+
+REGISTER_OP("TreeSize")
+    .Input("tree_handle: resource")
+    .Output("tree_size: int32")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .Doc(R"doc(
+Outputs the size of the tree, including leaves.
+
+tree_handle: The handle to the tree.
+tree_size: Size scalar.
+)doc");
+
+REGISTER_OP("TreePredictionsV4")
+    .Attr("input_spec: string")
+    .Attr("params: string")
+    .Input("tree_handle: resource")
+    .Input("input_data: float")
+    .Input("sparse_input_indices: int64")
+    .Input("sparse_input_values: float")
+    .Input("sparse_input_shape: int64")
+    .Output("predictions: float")
+    .SetShapeFn([](InferenceContext* c) {
+      DimensionHandle num_points = c->UnknownDim();
+
+      if (c->RankKnown(c->input(1)) && c->Rank(c->input(1)) > 0 &&
+          c->Value(c->Dim(c->input(1), 0)) > 0) {
+        num_points = c->Dim(c->input(1), 0);
+      }
+
+      c->set_output(0, c->Matrix(num_points, c->UnknownDim()));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Outputs the predictions for the given input data.
+
+params: A serialized TensorForestParams proto.
+tree_handle: The handle to the tree.
+input_data: The training batch's features as a 2-d tensor; `input_data[i][j]`
+   gives the j-th feature of the i-th input.
+sparse_input_indices: The indices tensor from the SparseTensor input.
+sparse_input_values: The values tensor from the SparseTensor input.
+sparse_input_shape: The shape tensor from the SparseTensor input.
+predictions: `predictions[i][j]` is the probability that input i is class j.
+)doc");
+
+REGISTER_OP("TraverseTreeV4")
+    .Attr("input_spec: string")
+    .Attr("params: string")
+    .Input("tree_handle: resource")
+    .Input("input_data: float")
+    .Input("sparse_input_indices: int64")
+    .Input("sparse_input_values: float")
+    .Input("sparse_input_shape: int64")
+    .Output("leaf_ids: int32")
+    .SetShapeFn([](InferenceContext* c) {
+      DimensionHandle num_points = c->UnknownDim();
+
+      if (c->RankKnown(c->input(1)) && c->Rank(c->input(1)) > 0 &&
+          c->Value(c->Dim(c->input(1), 0)) > 0) {
+        num_points = c->Dim(c->input(1), 0);
+      }
+
+      c->set_output(0, c->Vector(num_points));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Outputs the leaf ids for the given input data.
+
+params: A serialized TensorForestParams proto.
+tree_handle: The handle to the tree.
+input_data: The training batch's features as a 2-d tensor; `input_data[i][j]`
+   gives the j-th feature of the i-th input.
+sparse_input_indices: The indices tensor from the SparseTensor input.
+sparse_input_values: The values tensor from the SparseTensor input.
+sparse_input_shape: The shape tensor from the SparseTensor input.
+leaf_ids: `leaf_ids[i]` is the leaf id for input i.
+)doc");
+
+REGISTER_OP("UpdateModelV4")
+    .Attr("params: string")
+    .Input("tree_handle: resource")
+    .Input("leaf_ids: int32")
+    .Input("input_labels: float")
+    .Input("input_weights: float")
+    .SetShapeFn(tensorflow::shape_inference::NoOutputs)
+    .Doc(R"doc(
+Updates the given leaves for each example with the new labels.
+
+params: A serialized TensorForestParams proto.
+tree_handle: The handle to the tree.
+leaf_ids: `leaf_ids[i]` is the leaf id for input i.
+input_labels: The training batch's labels as a 1 or 2-d tensor.
+  'input_labels[i][j]' gives the j-th label/target for the i-th input.
+input_weights: The training batch's eample weights as a 1-d tensor.
+  'input_weights[i]' gives the weight for the i-th input.
+)doc");
+
+REGISTER_OP("FeatureUsageCounts")
+    .Attr("params: string")
+    .Input("tree_handle: resource")
+    .Output("feature_counts: int32")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Vector(c->UnknownDim()));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Outputs the number of times each feature was used in a split.
+
+params: A serialized TensorForestParams proto.
+tree_handle: The handle to the tree.
+feature_counts: `feature_counts[i]` is the number of times feature i was used
+    in a split.
+)doc");
+
+}  // namespace tensorforest
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/ops/stats_ops.cc b/tensorflow/contrib/tensor_forest/ops/stats_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e8b5c5d8a6efae83b9e408ed3d05ac72d848ef7f
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/ops/stats_ops.cc
@@ -0,0 +1,145 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+using shape_inference::InferenceContext;
+
+namespace tensorforest {
+
+REGISTER_RESOURCE_HANDLE_OP(FertileStatsResource);
+
+REGISTER_OP("FertileStatsIsInitializedOp")
+    .Input("stats_handle: resource")
+    .Output("is_initialized: bool")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .Doc(R"doc(
+Checks whether a stats has been initialized.
+)doc");
+
+REGISTER_OP("CreateFertileStatsVariable")
+    .Attr("params: string")
+    .Input("stats_handle: resource")
+    .Input("stats_config: string")
+    .SetShapeFn(tensorflow::shape_inference::NoOutputs)
+    .Doc(R"doc(
+Creates a stats model and returns a handle to it.
+
+params: A serialized TensorForestParams proto.
+stats_handle: handle to the stats resource to be created.
+stats_config: Serialized proto of the stats.
+)doc");
+
+REGISTER_OP("FertileStatsSerialize")
+    .Attr("params: string")
+    .Input("stats_handle: resource")
+    .Output("stats_config: string")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .Doc(R"doc(
+Serializes the stats to a proto.
+
+params: A serialized TensorForestParams proto.
+stats_handle: The handle to the stats.
+stats_config: Serialized proto of the stats.
+)doc");
+
+REGISTER_OP("FertileStatsDeserialize")
+    .Attr("params: string")
+    .Input("stats_handle: resource")
+    .Input("stats_config: string")
+    .SetShapeFn(tensorflow::shape_inference::NoOutputs)
+    .Doc(R"doc(
+Deserializes a serialized stats config and replaces current stats.
+
+params: A serialized TensorForestParams proto.
+stats_handle: The handle to the stats.
+stats_config: Serialized proto of the stats.
+)doc");
+
+REGISTER_OP("GrowTreeV4")
+    .Attr("params: string")
+    .Input("tree_handle: resource")
+    .Input("stats_handle: resource")
+    .Input("finshed_nodes: int32")
+    .SetShapeFn(tensorflow::shape_inference::NoOutputs)
+    .Doc(R"doc(
+Grows the tree for finished nodes and allocates waiting nodes.
+
+params: A serialized TensorForestParams proto.
+tree_handle: The handle to the tree.
+stats_handle: The handle to the stats.
+finshed_nodes: A 1-d Tensor of finished node ids from ProcessInput.
+)doc");
+
+REGISTER_OP("ProcessInputV4")
+    .Attr("random_seed: int")
+    .Attr("input_spec: string")
+    .Attr("params: string")
+    .Input("tree_handle: resource")
+    .Input("stats_handle: resource")
+    .Input("input_data: float")
+    .Input("sparse_input_indices: int64")
+    .Input("sparse_input_values: float")
+    .Input("sparse_input_shape: int64")
+    .Input("input_labels: float")
+    .Input("input_weights: float")
+    .Input("leaf_ids: int32")
+    .Output("finished_nodes: int32")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Vector(c->UnknownDim()));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Add labels to stats after traversing the tree for each example.
+
+Outputs node ids that are finished.
+
+params: A serialized TensorForestParams proto.
+tree_handle: The handle to the tree.
+stats_handle: The handle to the stats.
+input_data: The training batch's features as a 2-d tensor; `input_data[i][j]`
+   gives the j-th feature of the i-th input.
+sparse_input_indices: The indices tensor from the SparseTensor input.
+sparse_input_values: The values tensor from the SparseTensor input.
+sparse_input_shape: The shape tensor from the SparseTensor input.
+input_labels: The training batch's labels as a 1 or 2-d tensor.
+  'input_labels[i][j]' gives the j-th label/target for the i-th input.
+input_weights: The training batch's eample weights as a 1-d tensor.
+  'input_weights[i]' gives the weight for the i-th input.
+finished_nodes: A 1-d tensor of node ids that have finished and are ready to
+  grow.
+leaf_ids: `leaf_ids[i]` is the leaf id for input i.
+)doc");
+
+REGISTER_OP("FinalizeTree")
+    .Attr("params: string")
+    .Input("tree_handle: resource")
+    .Input("stats_handle: resource")
+    .SetShapeFn([](InferenceContext* c) { return Status::OK(); })
+    .Doc(R"doc(
+Puts the Leaf models inside the tree into their final form.
+
+If drop_final_class is true, the per-class probability prediction of the
+last class is not stored in the leaf models.
+
+params: A serialized TensorForestParams proto.
+tree_handle: The handle to the tree.
+stats_handle: The handle to the stats.
+)doc");
+}  // namespace tensorforest
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/ops/tensor_forest_ops.cc b/tensorflow/contrib/tensor_forest/ops/tensor_forest_ops.cc
index 79976a8eae06957fdd4a7800886688a80047658a..6d4a6c40b1da5f8aabc6ee6e2d9600b3e5622d41 100644
--- a/tensorflow/contrib/tensor_forest/ops/tensor_forest_ops.cc
+++ b/tensorflow/contrib/tensor_forest/ops/tensor_forest_ops.cc
@@ -22,408 +22,6 @@ using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
-REGISTER_OP("BestSplits")
-    .Attr("regression: bool = false")
-    .Input("finished_nodes: int32")
-    .Input("node_to_accumulator: int32")
-    .Input("split_sums: float")
-    .Input("split_squares: float")
-    .Input("accumulator_sums: float")
-    .Input("accumulator_sqaures: float")
-    .Output("split_indices: int32")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle finished_nodes;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &finished_nodes));
-      c->set_output(0, c->Vector(c->Dim(finished_nodes, 0)));
-      return Status::OK();
-    })
-    .Doc(R"doc(
-  Returns the index of the best split for each finished node.
-
-  For classification, the best split is the split with the lowest weighted
-  Gini impurity, as calculated from the statistics in `split_sums` and
-  `accumulator_sums`. For regression we use the lowest variance, incoporating
-  the *_squares as well.
-
-  finished_nodes:= A 1-d int32 tensor containing the indices of finished nodes.
-  node_to_accumulator: `node_to_accumulator[i]` is the accumulator slot used by
-    fertile node i, or -1 if node i isn't fertile.
-  split_sums:= a 3-d tensor where `split_sums[a][s]` summarizes the
-    training labels for examples that fall into the fertile node associated with
-    accumulator slot s and have then taken the *left* branch of candidate split
-    s.  For a classification problem, `split_sums[a][s][c]` is the count of such
-    examples with class c and for regression problems, `split_sums[a][s]` is the
-    sum of the regression labels for such examples.
-  split_squares: Same as split_sums, but it contains the sum of the
-    squares of the regression labels.  Only used for regression.  For
-    classification problems, pass a dummy tensor into this.
-  accumulator_sums:= a 2-d tensor where `accumulator_sums[a]` summarizes the
-    training labels for examples that fall into the fertile node associated with
-    accumulator slot s.  For a classification problem, `accumulator_sums[a][c]`
-    is the count of such examples with class c and for regression problems,
-    `accumulator_sums[a]` is the sum of the regression labels for such examples.
-  accumulator_squares: Same as accumulator_sums, but it contains the sum of the
-    squares of the regression labels.  Only used for regression.  For
-    classification problems, pass a dummy tensor into this.
-  split_indices: `split_indices[i]` contains the index of the split to use for
-    `finished_nodes[i]`.
-)doc");
-
-REGISTER_OP("CountExtremelyRandomStats")
-    .Attr("input_spec: string")
-    .Attr("num_classes: int")
-    .Attr("regression: bool = false")
-    .Input("input_data: float")
-    .Input("sparse_input_indices: int64")
-    .Input("sparse_input_values: float")
-    .Input("sparse_input_shape: int64")
-    .Input("input_labels: float")
-    .Input("input_weights: float")
-    .Input("tree: int32")
-    .Input("tree_thresholds: float")
-    .Input("node_to_accumulator: int32")
-    .Input("candidate_split_features: int32")
-    .Input("candidate_split_thresholds: float")
-    .Input("birth_epochs: int32")
-    .Input("current_epoch: int32")
-    .Output("pcw_node_sums_delta: float")
-    .Output("pcw_node_squares_delta: float")
-    .Output("pcw_splits_indices: int32")
-    .Output("pcw_candidate_splits_sums_delta: float")
-    .Output("pcw_candidate_splits_squares_delta: float")
-    .Output("pcw_totals_indices: int32")
-    .Output("pcw_totals_sums_delta: float")
-    .Output("pcw_totals_squares_delta: float")
-    .Output("leaves: int32")
-    .SetShapeFn([](InferenceContext* c) {
-      int64 num_classes;
-      TF_RETURN_IF_ERROR(c->GetAttr("num_classes", &num_classes));
-      bool regression;
-      TF_RETURN_IF_ERROR(c->GetAttr("regression", &regression));
-
-      DimensionHandle num_points = c->Dim(c->input(0), 0);
-      if (c->RankKnown(c->input(3)) && c->Rank(c->input(3)) > 0) {
-        num_points = c->UnknownDim();
-      }
-      DimensionHandle num_nodes = c->Dim(c->input(6), 0);
-
-      // Node sums
-      c->set_output(0, c->Matrix(num_nodes, num_classes));
-      // Node squares
-      c->set_output(1, c->Matrix(num_nodes, num_classes));
-
-      c->set_output(2, c->Matrix(c->UnknownDim(), regression ? 2 : 3));
-
-      c->set_output(3,
-                    regression ? c->Matrix(c->UnknownDim(), num_classes)
-                               : c->Vector(c->UnknownDim()));
-      c->set_output(4,
-                    regression ? c->Matrix(c->UnknownDim(), num_classes)
-                               : c->Vector(0LL));
-      c->set_output(5, c->Matrix(c->UnknownDim(), regression ? 1 : 2));
-      c->set_output(6,
-                    regression ? c->Matrix(c->UnknownDim(), num_classes)
-                               : c->Vector(c->UnknownDim()));
-      c->set_output(7,
-                    regression ? c->Matrix(c->UnknownDim(), num_classes)
-                               : c->Vector(0LL));
-      c->set_output(8, c->Vector(num_points));
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Calculates incremental statistics for a batch of training data.
-
-Each training example in `input_data` is sent through the decision tree
-represented by `tree` and `tree_thresholds`.
-The shape and contents of the outputs differ depending on whether
-`regression` is true or not.
-
-For `regression` = false (classification), `pcw_node_sums_delta[i]` is
-incremented for every node i that it passes through, and the leaf it ends up
-in is recorded in `leaves[i]`.  Then, if the leaf is fertile and
-initialized, the statistics for its corresponding accumulator slot
-are updated in `pcw_candidate_sums_delta` and `pcw_totals_sums_delta`.
-
-For `regression` = true, outputs contain the sum of the input_labels
-for the appropriate nodes.  In adddition, the *_squares outputs are filled
-in with the sums of the squares of the input_labels. Since outputs are
-all updated at once, the *_indices outputs don't specify the output
-dimension to update, rather the *_delta output contains updates for all the
-outputs.  For example, `pcw_totals_indices` specifies the accumulators to
-update, and `pcw_total_splits_sums_delta` contains the complete output
-updates for each of those accumulators.
-
-The attr `num_classes` is needed to appropriately size the outputs.
-
-input_spec: A serialized TensorForestDataSpec proto.
-input_data: The training batch's features as a 2-d tensor; `input_data[i][j]`
-  gives the j-th feature of the i-th input.
-sparse_input_indices: The indices tensor from the SparseTensor input.
-sparse_input_values: The values tensor from the SparseTensor input.
-sparse_input_shape: The shape tensor from the SparseTensor input.
-input_spec: A 1-D tensor containing the type of each column in input_data,
-  (e.g. continuous float, categorical).  Index 0 should contain the default
-  type, individual feature types start at index 1.
-input_labels: The training batch's labels; `input_labels[i]` is the class
-  of the i-th input.
-input_weights:= A 1-D float tensor.  If non-empty, `input_weights[i]` gives
-  the weight of the i-th input.
-tree:= A 2-d int32 tensor.  `tree[i][0]` gives the index of the left child
-  of the i-th node, `tree[i][0] + 1` gives the index of the right child of
-  the i-th node, and `tree[i][1]` gives the index of the feature used to
-  split the i-th node.
-tree_thresholds: `tree_thresholds[i]` is the value used to split the i-th
-  node.
-node_to_accumulator: If the i-th node is fertile, `node_to_accumulator[i]`
-  is it's accumulator slot.  Otherwise, `node_to_accumulator[i]` is -1.
-candidate_split_features: `candidate_split_features[a][s]` is the
-  index of the feature being considered by split s of accumulator slot a.
-candidate_split_thresholds: `candidate_split_thresholds[a][s]` is the
-  threshold value being considered by split s of accumulator slot a.
-birth_epochs: `birth_epoch[i]` is the epoch node i was born in.  Only
-  nodes satisfying `current_epoch - birth_epoch <= 1` accumulate statistics.
-current_epoch:= A 1-d int32 tensor with shape (1).  current_epoch[0] contains
-  the current epoch.
-pcw_node_sums_delta: `pcw_node_sums_delta[i][c]` is the number of training
-  examples in this training batch with class c that passed through node i for
-  classification.  For regression, it is the sum of the input_labels that
-  have passed through node i.
-pcw_node_squares_delta: `pcw_node_squares_delta[i][c]` is the sum of the
-  squares of the input labels that have passed through node i for
-  regression.  Not set for classification.
-pcw_splits_indices:= A 2-d tensor of shape (?, 3) for classification and
-  (?, 2) for regression.
-  `pcw_splits_indices[i]` gives the coordinates of an entry in
-  candidate_split_pcw_sums and candidate_split_pcw_squares that need to be
-  updated.  This is meant to be passed with `pcw_candidate_splits_*_delta` to
-  a scatter_add for candidate_split_pcw_*:
-    training_ops.scatter_add_ndim(candidate_split_pcw_sums
-        pcw_splits_indices, pcw_candidate_splits_sums_delta)
-pcw_candidate_splits_sums_delta: For classification,
-  `pcw_candidate_splits_sums_delta[i]` is the
-  number of training examples in this training batch that correspond to
-  the i-th entry in `pcw_splits_indices` which took the *left* branch of
-  candidate split. For regression, it is the same but a 2-D tensor that has
-  the sum of the input_labels for each i-th entry in the indices.
-pcw_candidate_splits_squares_delta: For regression, same as
-  `pcw_candidate_splits_sums_delta` but the sum of the squares. Not set
-  for classification.
-pcw_totals_indices: For classification, 'pcw_totals_indices` contains the
-  indices (accumulator, class) into total_pcw_sums to update with
-  pcw_totals_sums_delta.  For regression, it only contains the accumulator
-  (not the class), because pcw_totals_*_delta will contain all the outputs.
-pcw_totals_sums_delta: For classification, `pcw_totals_sums_delta[i]` is the
-  number of training examples in this batch that ended up in the fertile
-  node with accumulator and class indicated by `pcw_totals_indices[i]`.
-  For regression, it is the sum of the input_labels corresponding to the
-  entries in `pcw_totals_indices[i]`.
-pcw_totals_squares_delta: For regression, same as
-  `pcw_totals_sums_delta` but the sum of the squares. Not set
-  for classification.
-leaves: `leaves[i]` is the leaf that input i ended up in.
-)doc");
-
-REGISTER_OP("FinishedNodes")
-    .Attr("regression: bool = false")
-    .Attr("num_split_after_samples: int")
-    .Attr("min_split_samples: int")
-    .Attr("dominate_fraction: float = 0.99")
-    .Attr(
-        "dominate_method:"
-        " {'none', 'hoeffding', 'bootstrap', 'chebyshev'} = 'bootstrap'")
-    .Attr("random_seed: int = 0")
-    .Attr("check_dominates_every_samples: int = 75")
-    .Input("leaves: int32")
-    .Input("node_to_accumulator: int32")
-    .Input("split_sums: float")
-    .Input("split_squares: float")
-    .Input("accumulator_sums: float")
-    .Input("accumulator_squares: float")
-    .Input("birth_epochs: int32")
-    .Input("current_epoch: int32")
-    .Output("finished: int32")
-    .Output("stale: int32")
-    .SetShapeFn([](InferenceContext* c) {
-      c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
-      c->set_output(1, c->Vector(InferenceContext::kUnknownDim));
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Determines which of the given leaf nodes are done accumulating.
-
-The `regression` attribute should be set to true for regression problems, and
-false for classification problems.
-
-If dominate_method is not set to none, then every
-`check_dominates_every_samples` steps the specified method will be used to
-see if the current best split has probability `dominate_fraction` of being
-asymptotically better than the second best split.  If so, the best split
-is picked now, rather than waiting until `num_split_after_samples` samples
-have been seen.  WARNING:  for weighted input data, only `dominate_method` =
-none is safe.
-
-leaves:= A 1-d int32 tensor.  Lists the nodes that are currently leaves.
-node_to_accumulator: If the i-th node is fertile, `node_to_accumulator[i]`
-  is it's accumulator slot.  Otherwise, `node_to_accumulator[i]` is -1.
-split_sums:= a 3-d tensor where `split_sums[a][s]` summarizes the
-  training labels for examples that fall into the fertile node associated with
-  accumulator slot s and have then taken the *left* branch of candidate split
-  s.  For a classification problem, `split_sums[a][s][c]` is the count of such
-  examples with class c and for regression problems, `split_sums[a][s]` is the
-  sum of the regression labels for such examples.
-split_squares: Same as split_sums, but it contains the sum of the
-  squares of the regression labels.  Only used for regression.  For
-  classification problems, pass a dummy tensor into this.
-accumulator_sums: For classification, `accumulator_sums[a][c]` records how
-  many training examples have class c and have ended up in the fertile node
-  associated with accumulator slot a.  It has the total sum in entry 0 for
-  convenience. For regression, it is the same except it contains the sum
-  of the input labels that have been seen, and entry 0 contains the number
-  of training examples that have been seen.
-accumulator_squares: Same as accumulator_sums, but it contains the sum of the
-  squares of the regression labels.  Only used for regression.  For
-  classification problems, pass a dummy tensor into this.
-birth_epochs:= A 1-d int32 tensor.  `birth_epochs[i]` contains the epoch
-  the i-th node was created in.
-current_epoch:= A 1-d int32 tensor with shape (1).  `current_epoch[0]`
-  stores the current epoch number.
-finished:= A 1-d int32 tensor containing the indices of the finished nodes.
-  Nodes are finished if they have received at least num_split_after_samples
-  samples, or if they have received min_split_samples and the best scoring
-  split is sufficiently greater than the next best split.
-stale:= A 1-d int32 tensor containing the fertile nodes that were created two
-  or more epochs ago.
-
-)doc");
-
-REGISTER_OP("GrowTree")
-    .Input("end_of_tree: int32")
-    .Input("node_to_accumulator: int32")
-    .Input("finished_nodes: int32")
-    .Input("best_splits: int32")
-    .Input("candidate_split_features: int32")
-    .Input("candidate_split_thresholds: float")
-    .Output("nodes_to_update: int32")
-    .Output("tree_updates: int32")
-    .Output("threshold_updates: float")
-    .Output("new_end_of_tree: int32")
-    .SetShapeFn([](InferenceContext* c) {
-      c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
-      c->set_output(1, c->Matrix(InferenceContext::kUnknownDim, 2));
-      c->set_output(2, c->Vector(InferenceContext::kUnknownDim));
-      c->set_output(3, c->Vector(1));
-      return Status::OK();
-    })
-    .Doc(R"doc(
-  Output the tree changes needed to resolve fertile nodes.
-
-  Previous Ops have already decided which fertile nodes want to stop being
-  fertile and what their best candidate split should be and have passed that
-  information to this Op in `finished_nodes` and `best_splits`.  This Op
-  merely checks that there is still space in tree to add new nodes, and if
-  so, writes out the sparse updates needed for the fertile nodes to be
-  resolved to the tree and threshold tensors.
-
-  end_of_tree: `end_of_tree[0]` is the number of allocated nodes, or
-    equivalently the index of the first free node in the tree tensor.
-  node_to_accumulator: `node_to_accumulator[i]` is the accumulator slot used by
-    fertile node i, or -1 if node i isn't fertile.
-  finished_nodes:= A 1-d int32 tensor containing the indices of finished nodes.
-  best_splits: `best_splits[i]` is the index of the best split for
-    `finished_nodes[i]`.
-  candidate_split_features: `candidate_split_features[a][s]` is the feature
-    being considered for split s of the fertile node associated with
-    accumulator slot a.
-  candidate_split_thresholds: `candidate_split_thresholds[a][s]` is the
-    threshold value being considered for split s of the fertile node associated
-    with accumulator slot a.
-  nodes_to_update:= A 1-d int32 tensor containing the node indices that need
-    updating.
-  tree_updates: The updates to apply to the 2-d tree tensor.  Intended to be
-    used with `tf.scatter_update(tree, nodes_to_update, tree_updates)`.
-  threshold_updates: The updates to apply to the 1-d thresholds tensor.
-    Intended to be used with
-    `tf.scatter_update(thresholds, nodes_to_update, threshold_updates)`.
-  new_end_of_tree: `new_end_of_tree[0]` is the new size of the tree.
-)doc");
-
-REGISTER_OP("SampleInputs")
-    .Attr("input_spec: string")
-    .Attr("split_initializations_per_input: int")
-    .Attr("split_sampling_random_seed: int")
-    .Input("input_data: float")
-    .Input("sparse_input_indices: int64")
-    .Input("sparse_input_values: float")
-    .Input("sparse_input_shape: int64")
-    .Input("input_weights: float")
-    .Input("node_to_accumulator: int32")
-    .Input("leaves: int32")
-    .Input("candidate_split_features: int32")
-    .Input("candidate_split_thresholds: float")
-    .Output("accumulators_to_update: int32")
-    .Output("new_split_feature_rows: int32")
-    .Output("new_split_threshold_rows: float")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle candidate_split_features;
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(7), 2, &candidate_split_features));
-      DimensionHandle split_dim = c->Dim(candidate_split_features, 1);
-      c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
-      c->set_output(1, c->Matrix(InferenceContext::kUnknownDim, split_dim));
-      c->set_output(2, c->Matrix(InferenceContext::kUnknownDim, split_dim));
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Initializes candidate splits for newly fertile nodes.
-
-In an extremely random forest, we don't consider all possible threshold
-values for a candidate split feature, but rather only a sampling of them.
-This Op takes those samples from the training data in `input_data`.  The
-feature and threshold samples are stored in tensors that are indexed by
-accumulator slot, so for each input, we must first look up which leaf
-it ended up in (using `leaves`) and then which accumulator slot if any
-that leaf maps to (using `node_to_accumulator`).
-
-The attribute `split_initializations_per_input` controls how many splits
-a single training example can initialize, and the attribute
-`split_sampling_random_seed` sets the random number generator's seed
-(a value of 0 means use the current time as the seed).
-
-input_data: The features for the current batch of training data.
-  `input_data[i][j]` is the j-th feature of the i-th input.
-sparse_input_indices: The indices tensor from the SparseTensor input.
-sparse_input_values: The values tensor from the SparseTensor input.
-sparse_input_shape: The shape tensor from the SparseTensor input.
-input_weights: For a dense input, input_weights[i] is the weight associated
-  with input_data[i].  For sparse input, input_weights[i] is the weight
-  associated with sparse_input_values[i].  Or in either case, if all the
-  weights are 1, input_weights can be empty.  SampleInputs will reject inputs
-  with weight less than Uniform([0,1)), so weights outside of that range may
-  not be what you want.
-node_to_accumulator: For a fertile node i, node_to_accumulator[i] is the
-  associated accumulator slot.  For non-fertile nodes, it is -1.
-leaves: `leaves[i]` is the leaf that the i-th input landed in, as
-  calculated by CountExtremelyRandomStats.
-candidate_split_features: The current features for the candidate splits;
-  `candidate_split_features[a][s]` is the index of the feature being
-  considered by split s in accumulator slot a.
-candidate_split_thresholds: The current thresholds for the candidate splits;
-  `candidate_split_thresholds[a][s]` is the threshold value being
-  considered by split s in accumulator slot a.
-accumulators_to_update: A list of the accumulators to change in the
-  candidate_split_features and candidate_split_thresholds tensors.
-new_split_feature_rows: The new values for the candidate_split_features
-  tensor.  Intended to be used with
-  `tf.scatter_update(candidate_split_features,
-                     accumulators_to_update,
-                     new_split_feature_rows)`
-new_split_threshold_rows:  The new values for the candidate_split_thresholds
-  tensor.  Intended to be used with
-  `tf.scatter_update(candidate_split_thresholds,
-                     accumulators_to_update,
-                     new_split_feature_thresholds)`
-
-)doc");
 
 REGISTER_OP("ScatterAddNdim")
     .Input("input: Ref(float)")
@@ -460,170 +58,4 @@ REGISTER_OP("ReinterpretStringToFloat")
 
 )doc");
 
-REGISTER_OP("TopNInsert")
-    .Input("ids: int64")
-    .Input("scores: float32")
-    .Input("new_ids: int64")
-    .Input("new_scores: float32")
-    .Output("shortlist_ids: int64")
-    .Output("update_ids: int64")
-    .Output("update_scores: float32")
-    .SetShapeFn([](InferenceContext* c) {
-      c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
-      c->set_output(1, c->Vector(InferenceContext::kUnknownDim));
-      c->set_output(2, c->Vector(InferenceContext::kUnknownDim));
-      return Status::OK();
-    })
-    .Doc(R"doc(
-  Outputs update Tensors for adding new_ids and new_scores to the shortlist.
-
-  ids:= A 1-D int64 tensor containing the ids on the shortlist (except for
-    ids[0], which is the current size of the shortlist.
-  scores:= A 1-D float32 tensor containing the scores on the shortlist.
-  new_ids:= A 1-D int64 tensor containing the new ids to add to the shortlist.
-  shortlist_ids:= A 1-D int64 tensor containing the ids of the shortlist entries
-    to update.  Intended to be used with
-    tf.scatter_update(shortlist_scores, shortlist_ids, new_scores).
-  update_ids:= A 1-D int64 tensor containing ...
-  update_scores:= A 1-D float32 tensor containing ...
-)doc");
-
-REGISTER_OP("TopNRemove")
-    .Input("ids: int64")
-    .Input("remove_ids: int64")
-    .Output("shortlist_ids: int64")
-    .Output("new_length: int64")
-    .SetShapeFn([](InferenceContext* c) {
-      c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
-      c->set_output(1, c->Vector(InferenceContext::kUnknownDim));
-      return Status::OK();
-    })
-    .Doc(R"doc(
-  Remove ids from a shortlist.
-
-  ids:= A 1-D int64 tensor containing the ids on the shortlist (except for
-    ids[0], which is the current size of the shortlist.
-  remove_ids:= A 1-D int64 tensor containing the ids to remove.
-  shortlist_ids:= A 1-D int64 tensor containing the shortlist entries that
-    need to be removed.
-  new_length:= A length 1 1-D int64 tensor containing the new length of the
-    shortlist.
-)doc");
-
-REGISTER_OP("TreePredictions")
-    .Attr("input_spec: string")
-    .Attr("valid_leaf_threshold: float")
-    .Input("input_data: float")
-    .Input("sparse_input_indices: int64")
-    .Input("sparse_input_values: float")
-    .Input("sparse_input_shape: int64")
-    .Input("tree: int32")
-    .Input("tree_thresholds: float")
-    .Input("node_per_class_weights: float")
-
-    .Output("predictions: float")
-    .SetShapeFn([](InferenceContext* c) {
-      // The output of TreePredictions is
-      // [node_pcw(evaluate_tree(x), c) for c in classes for x in input_data].
-      DimensionHandle num_classes = c->Dim(c->input(6), 1);
-      DimensionHandle num_points = c->UnknownDim();
-
-      if (c->RankKnown(c->input(0)) && c->Rank(c->input(0)) > 0) {
-        num_points = c->Dim(c->input(0), 0);
-      }
-
-      TF_RETURN_IF_ERROR(c->Subtract(num_classes, 1, &num_classes));
-
-      c->set_output(0, c->Matrix(num_points, num_classes));
-      return Status::OK();
-    })
-    .Doc(R"doc(
-  Returns the per-class probabilities for each input.
-
-  input_spec: A serialized TensorForestDataSpec proto.
-  input_data: The training batch's features as a 2-d tensor; `input_data[i][j]`
-   gives the j-th feature of the i-th input.
-  sparse_input_indices: The indices tensor from the SparseTensor input.
-  sparse_input_values: The values tensor from the SparseTensor input.
-  sparse_input_shape: The shape tensor from the SparseTensor input.
-  tree:= A 2-d int32 tensor.  `tree[i][0]` gives the index of the left child
-   of the i-th node, `tree[i][0] + 1` gives the index of the right child of
-   the i-th node, and `tree[i][1]` gives the index of the feature used to
-   split the i-th node.
-  tree_thresholds: `tree_thresholds[i]` is the value used to split the i-th
-   node.
-  node_per_class_weights: `node_per_class_weights[n][c]` records how many
-   training examples have class c and have ended up in node n.
-  predictions: `predictions[i][j]` is the probability that input i is class j.
-  valid_leaf_threshold: Minimum number of samples that have arrived to a leaf
-    to be considered a valid leaf, otherwise use the parent.
-)doc");
-
-REGISTER_OP("UpdateFertileSlots")
-    .Attr("regression: bool = False")
-    .Input("finished: int32")
-    .Input("non_fertile_leaves: int32")
-    .Input("non_fertile_leaf_scores: float")
-    .Input("end_of_tree: int32")
-    .Input("accumulator_sums: float")
-    .Input("node_to_accumulator: int32")
-    .Input("stale_leaves: int32")
-    .Input("node_sums: float")
-    .Output("node_to_accumulator_map_updates: int32")
-    .Output("accumulator_to_node_map_updates: int32")
-    .Output("accumulators_cleared: int32")
-    .Output("accumulators_allocated: int32")
-    .SetShapeFn([](InferenceContext* c) {
-      c->set_output(0, c->Matrix(c->MakeDim(2), InferenceContext::kUnknownDim));
-      c->set_output(1, c->Matrix(c->MakeDim(2), InferenceContext::kUnknownDim));
-      c->set_output(2, c->Vector(InferenceContext::kUnknownDim));
-      c->set_output(3, c->Vector(InferenceContext::kUnknownDim));
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Updates accumulator slots to reflect finished or newly fertile nodes.
-
-finished:= A 1-d int32 tensor containing the indices of fertile nodes that
-  are ready to decide on a split.
-non_fertile_leaves:= A 1-d int32 tensor containing the indices of all the
-  currently non-fertile leaves.  If there are free accumulator slots after
-  deallocation, UpdateFertileSlots will consider these nodes (plus the ones
-  in new_leaves) and potentially turn some of them fertile.
-non_fertile_leaf_scores: `non_fertile_leaf_scores[i]` is the splitting score
-  of the non-fertile leaf `non_fertile_leaves[i]`.
-end_of_tree: The end of tree tensor from the previous training iteration, used
-  with the finished input to calculate a list of new leaf indices created by
-  GrowTree, which will be considered to become fertile if there are free
-  slots.
-accumulator_sums: For classification, `accumulator_sums[a][c]` records how
-  many training examples have class c and have ended up in the fertile node
-  associated with accumulator slot a.  It has the total sum in entry 0 for
-  convenience. For regression, it is the same except it contains the sum
-  of the input labels that have been seen, and entry 0 contains the number
-  of training examples that have been seen.
-node_to_accumulator: `node_to_accumulator[i]` is the accumulator slot used by
-  fertile node i, or -1 if node i isn't fertile.
-stale_leaves:= A 1-d int32 tensor containing the indices of all leaves that
-  have stopped accumulating statistics because they are too old.
-node_sums: `node_sums[n][c]` records how many
-   training examples have class c and have ended up in node n.
-node_to_accumulator_map_updates:= A 2-d int32 tensor describing the changes
-  that need to be applied to the node_to_accumulator map.  Intended to be used
-  with
-  `tf.scatter_update(node_to_accumulator,
-                     node_to_accumulator_map_updates[0],
-                     node_to_accumulator_map_updates[1])`.
-accumulator_to_node_map_updates:= A 2-d int32 tensor describing the changes
-  that need to be applied to the node_to_accumulator map.  Intended to be used
-  with
-  `tf.scatter_update(accumulator_to_node_map,
-                     accumulator_to_node_map_updates[0],
-                     accumulator_to_node_map_updates[1])`.
-accumulators_cleared:= A 1-d int32 tensor containing the indices of all
-  the accumulator slots that need to be cleared.
-accumulators_allocated:= A 1-d int32 tensor containing the indices of all
-  the accumulator slots that need to be allocated.
-
-)doc");
-
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/proto/BUILD b/tensorflow/contrib/tensor_forest/proto/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..1cfef44af1aaee3c105664398200524f2770f7d7
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/proto/BUILD
@@ -0,0 +1,31 @@
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+
+package(default_visibility = ["//visibility:public"])
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+tf_proto_library(
+    name = "fertile_stats_proto",
+    srcs = ["fertile_stats.proto"],
+    cc_api_version = 2,
+    protodeps = ["//tensorflow/contrib/decision_trees/proto:generic_tree_model"],
+    visibility = ["//visibility:public"],
+)
+
+tf_proto_library(
+    name = "tensor_forest_params_proto",
+    srcs = ["tensor_forest_params.proto"],
+    cc_api_version = 2,
+    protodeps = ["//tensorflow/contrib/decision_trees/proto:generic_tree_model"],
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/contrib/tensor_forest/proto/fertile_stats.proto b/tensorflow/contrib/tensor_forest/proto/fertile_stats.proto
new file mode 100644
index 0000000000000000000000000000000000000000..0ded04ad75af1a2cdc2820a4f16a11d3a76a17af
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/proto/fertile_stats.proto
@@ -0,0 +1,92 @@
+syntax = "proto3";
+option cc_enable_arenas = true;
+
+package tensorflow.tensorforest;
+
+import "tensorflow/contrib/decision_trees/proto/generic_tree_model.proto";
+
+
+message FertileStats {
+  // Tracks stats for each node.  node_to_slot[i] is the FertileSlot for node i.
+  // This may be sized to max_nodes initially, or grow dynamically as needed.
+  repeated FertileSlot node_to_slot = 1;
+}
+
+
+message GiniStats {
+  // This allows us to quickly track and calculate impurity (classification)
+  //  by storing the sum of input weights and the sum of the squares of the
+  // input weights.  Weighted gini is then: 1 - (square / sum * sum).
+  // Updates to these numbers are:
+  //   old_i = leaf->value(label)
+  //   new_i = old_i + incoming_weight
+  //   sum -> sum + incoming_weight
+  //   square -> square - (old_i ^ 2) + (new_i ^ 2)
+  //   total_left_sum -> total_left_sum - old_left_i * old_total_i +
+  //                                      new_left_i * new_total_i
+  float square = 2;
+}
+
+message LeafStat {
+  // The sum of the weights of the training examples that we have seen.
+  // This is here, outside of the leaf_stat oneof, because almost all
+  // types will want it.
+  float weight_sum = 3;
+
+  // TODO(thomaswc): Move the GiniStats out of LeafStats and into something
+  // that only tracks them for splits.
+  message GiniImpurityClassificationStats {
+    oneof counts {
+      decision_trees.Vector dense_counts = 1;
+      decision_trees.SparseVector sparse_counts = 2;
+    }
+    GiniStats gini = 3;
+  }
+
+  // This is the info needed for calculating variance for regression.
+  // Variance will still have to be summed over every output, but the
+  // number of outputs in regression problems is almost always 1.
+  message LeastSquaresRegressionStats {
+    decision_trees.Vector mean_output = 1;
+    decision_trees.Vector mean_output_squares = 2;
+  }
+
+  oneof leaf_stat {
+    GiniImpurityClassificationStats classification = 1;
+    LeastSquaresRegressionStats regression = 2;
+    // TODO(thomaswc): Add in v5's SparseClassStats.
+  }
+}
+
+message FertileSlot {
+  // The statistics for *all* the examples seen at this leaf.
+  LeafStat leaf_stats = 4;
+
+  repeated SplitCandidate candidates = 1;
+
+  // The statistics for the examples seen at this leaf after all the
+  // splits have been initialized.  If post_init_leaf_stats.weight_sum
+  // is > 0, then all candidates have been initialized.  We need to track
+  // both leaf_stats and post_init_leaf_stats because the first is used
+  // to create the decision_tree::Leaf and the second is used to infer
+  // the statistics for the right side of a split (given the leaf side
+  // stats).
+  LeafStat post_init_leaf_stats = 6;
+
+  int32 node_id = 5;
+  int32 depth = 7;
+}
+
+message SplitCandidate {
+  // proto representing the potential node.
+  decision_trees.BinaryNode split = 1;
+
+  // Right counts are inferred from FertileSlot.leaf_stats and left.
+  LeafStat left_stats = 4;
+
+  // Right stats (not full counts) are kept here.
+  LeafStat right_stats = 5;
+
+  // Fields used when training with a graph runner.
+  string unique_id = 6;
+}
diff --git a/tensorflow/contrib/tensor_forest/proto/tensor_forest_params.proto b/tensorflow/contrib/tensor_forest/proto/tensor_forest_params.proto
new file mode 100644
index 0000000000000000000000000000000000000000..58c5b9bbe739e5f461ef4171b072076790ac1948
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/proto/tensor_forest_params.proto
@@ -0,0 +1,147 @@
+syntax = "proto3";
+
+package tensorflow.tensorforest;
+
+import "tensorflow/contrib/decision_trees/proto/generic_tree_model.proto";
+
+// Leaf models specify what is returned at inference time, and how it is
+// stored in the decision_trees.Leaf protos.
+enum LeafModelType {
+  MODEL_DENSE_CLASSIFICATION = 0;
+  MODEL_SPARSE_CLASSIFICATION = 1;
+  MODEL_REGRESSION = 2;
+  MODEL_SPARSE_OR_DENSE_CLASSIFICATION = 3;
+}
+
+// Stats models generally specify information that is collected which is
+// necessary to choose a split at a node. Specifically, they operate on
+// a SplitCandidate::LeafStat proto.
+enum StatsModelType {
+  STATS_DENSE_GINI = 0;
+  STATS_SPARSE_GINI = 1;
+  STATS_LEAST_SQUARES_REGRESSION = 2;
+  STATS_SPARSE_THEN_DENSE_GINI = 3;
+}
+
+// Allows selection of operations on the collection of split candidates.
+// Basic infers right split stats from the leaf stats and each candidate's
+// left stats.
+enum SplitCollectionType {
+  COLLECTION_BASIC = 0;
+  GRAPH_RUNNER_COLLECTION = 1;
+}
+
+// Pruning strategies define how candidates are pruned over time.
+// SPLIT_PRUNE_HALF prunes the worst half of splits every prune_ever_samples,
+// etc.  Note that prune_every_samples plays against the depth-dependent
+// split_after_samples, so they should be set together.
+enum SplitPruningStrategyType {
+  SPLIT_PRUNE_NONE = 0;
+  SPLIT_PRUNE_HALF = 1;
+  SPLIT_PRUNE_QUARTER = 2;
+  SPLIT_PRUNE_10_PERCENT = 3;
+  // SPLIT_PRUNE_HOEFFDING prunes splits whose Gini impurity is worst than
+  // the best split's by more than the Hoeffding bound.
+  SPLIT_PRUNE_HOEFFDING = 4;
+}
+
+message SplitPruningConfig {
+  DepthDependentParam prune_every_samples = 1;
+  SplitPruningStrategyType type = 2;
+}
+
+// Finish strategies define when slots are considered finished.
+// Basic requires at least split_after_samples, and doesn't allow slots to
+// finish until the leaf has received more than one class. Hoeffding splits
+// early after min_split_samples if one split is dominating the rest according
+// to hoeffding bounds. Bootstrap does the same but compares gini's calculated
+// with sampled smoothed counts.
+enum SplitFinishStrategyType {
+  SPLIT_FINISH_BASIC = 0;
+  SPLIT_FINISH_DOMINATE_HOEFFDING = 2;
+  SPLIT_FINISH_DOMINATE_BOOTSTRAP = 3;
+}
+
+message SplitFinishConfig {
+  // Configure how often we check for finish, because some finish methods
+  // are expensive to perform.
+  DepthDependentParam check_every_steps = 1;
+  SplitFinishStrategyType type = 2;
+}
+
+// A parameter that changes linearly with depth, with upper and lower bounds.
+message LinearParam {
+  float slope = 1;
+  float y_intercept = 2;
+  float min_val = 3;
+  float max_val = 4;
+}
+
+// A parameter that changes expoentially with the form
+//     f = c + mb^(k*d)
+// where:
+//  c: constant bias
+//  b: base
+//  m: multiplier
+//  k: depth multiplier
+//  d: depth
+message ExponentialParam {
+  float bias = 1;
+  float base = 2;
+  float multiplier = 3;
+  float depth_multiplier = 4;
+}
+
+// A parameter that is 'off' until depth >= a threshold, then is 'on'.
+message ThresholdParam {
+  float on_value = 1;
+  float off_value = 2;
+  float threshold = 3;
+}
+
+// A parameter that may change with node depth.
+message DepthDependentParam {
+  oneof ParamType {
+    float constant_value = 1;
+    LinearParam linear = 2;
+    ExponentialParam exponential = 3;
+    ThresholdParam threshold = 4;
+ }
+}
+
+message TensorForestParams {
+  // ------------ Types that control training subsystems ------ //
+  LeafModelType leaf_type = 1;
+  StatsModelType stats_type = 2;
+  SplitCollectionType collection_type = 3;
+  SplitPruningConfig pruning_type = 4;
+  SplitFinishConfig finish_type = 5;
+
+  // --------- Parameters that can't change by definition --------------- //
+  int32 num_trees = 6;
+  int32 max_nodes = 7;
+  int32 num_features = 21;
+
+  decision_trees.InequalityTest.Type inequality_test_type = 19;
+
+  // Some booleans controlling execution
+  bool is_regression = 8;
+  bool drop_final_class = 9;
+  bool collate_examples = 10;
+  bool checkpoint_stats = 11;
+  bool use_running_stats_method = 20;
+  bool initialize_average_splits = 22;
+
+  // Number of classes (classification) or targets (regression)
+  int32 num_outputs = 12;
+
+  // --------- Parameters that could be depth-dependent --------------- //
+  DepthDependentParam num_splits_to_consider = 13;
+  DepthDependentParam split_after_samples = 14;
+  DepthDependentParam dominate_fraction = 15;
+  DepthDependentParam min_split_samples = 18;
+
+  // --------- Parameters for experimental features ---------------------- //
+  string graph_dir = 16;
+  int32 num_select_features = 17;
+}
diff --git a/tensorflow/contrib/tensor_forest/python/__init__.py b/tensorflow/contrib/tensor_forest/python/__init__.py
index 0d41d3500d1956fe815068dfab6a60d443d4a4fc..0688f7c81687e4574a4001feab4d785f05007492 100644
--- a/tensorflow/contrib/tensor_forest/python/__init__.py
+++ b/tensorflow/contrib/tensor_forest/python/__init__.py
@@ -18,7 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tensor_forest.python import constants
 from tensorflow.contrib.tensor_forest.python import tensor_forest
 from tensorflow.contrib.tensor_forest.python.ops import data_ops
+from tensorflow.contrib.tensor_forest.python.ops import model_ops
+from tensorflow.contrib.tensor_forest.python.ops import stats_ops
 from tensorflow.contrib.tensor_forest.python.ops import tensor_forest_ops
diff --git a/tensorflow/contrib/tensor_forest/python/kernel_tests/best_splits_op_test.py b/tensorflow/contrib/tensor_forest/python/kernel_tests/best_splits_op_test.py
deleted file mode 100644
index 628b7f3dadc102622f1f35f6f8e7ca44a4052f9d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensor_forest/python/kernel_tests/best_splits_op_test.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tf.contrib.tensor_forest.ops.best_splits_op."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow  # pylint: disable=unused-import
-
-from tensorflow.contrib.tensor_forest.python.ops import tensor_forest_ops
-
-from tensorflow.python.framework import test_util
-from tensorflow.python.platform import googletest
-
-
-class BestSplitsClassificationTests(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self.finished = [3, 5]
-    self.node_map = [-1, -1, -1, 0, -1, 3, -1, -1, -1]
-    self.candidate_counts = [[[153., 50., 60., 40., 3.],
-                              [200., 70., 30., 70., 30.]],
-                             [[0., 0., 0., 0., 0.], [0., 0., 0., 0., 0.]],
-                             [[0., 0., 0., 0., 0.], [0., 0., 0., 0., 0.]],
-                             [[40., 10., 10., 10., 10.],
-                              [30., 10., 5., 5., 10.]]]
-    self.total_counts = [[400., 100., 100., 100., 100.],
-                         [0., 0., 0., 0., 0.],
-                         [0., 0., 0., 0., 0.],
-                         [400., 100., 100., 100., 100.]]
-    self.squares = []
-
-  def testSimple(self):
-    with self.test_session():
-      split_indices = tensor_forest_ops.best_splits(
-          self.finished,
-          self.node_map,
-          self.candidate_counts,
-          self.squares,
-          self.total_counts,
-          self.squares,
-          regression=False)
-
-      self.assertAllEqual([0, 1], split_indices.eval())
-
-  def testNoFinished(self):
-    with self.test_session():
-      split_indices = tensor_forest_ops.best_splits(
-          [],
-          self.node_map,
-          self.candidate_counts,
-          self.squares,
-          self.total_counts,
-          self.squares,
-          regression=False)
-
-      self.assertAllEqual([], split_indices.eval())
-
-  def testBadInput(self):
-    del self.total_counts[1]
-
-    with self.test_session():
-      with self.assertRaisesOpError(
-          'Number of accumulators should be the same in split_sums '
-          'and accumulator_sums.'):
-        tensor_forest_ops.best_splits(
-            self.finished,
-            self.node_map,
-            self.candidate_counts,
-            self.squares,
-            self.total_counts,
-            self.squares,
-            regression=False).eval()
-
-
-class BestSplitsRegressionTests(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self.finished = [3, 5]
-    self.node_map = [-1, -1, -1, 0, -1, 3, -1, -1, -1]
-    self.candidate_sums = [[[5., 8., 8., 8.], [5., 10., 10., 10.]],
-                           [[0., 0., 0., 0.], [0., 0., 0., 0.]],
-                           [[0., 0., 0., 0.], [0., 0., 0., 0.]],
-                           [[10., 10., 20., 10.], [10., 5., 5., 5.]]]
-
-    self.candidate_squares = [[[5., 50., 50., 50.], [5., 50., 50., 50.]],
-                              [[0., 0., 0., 0.], [0., 0., 0., 0.]],
-                              [[0., 0., 0., 0.], [0., 0., 0., 0.]],
-                              [[10., 40., 50., 60.], [10., 40., 40., 40.]]]
-
-    self.total_sums = [[15., 10., 10., 10.],
-                       [0., 0., 0., 0.],
-                       [0., 0., 0., 0.],
-                       [20., 20., 20., 20.]]
-
-    self.total_squares = [[15., 50., 50., 50.],
-                          [0., 0., 0., 0.],
-                          [0., 0., 0., 0.],
-                          [20., 60., 60., 60.]]
-
-  def testSimple(self):
-    with self.test_session():
-      split_indices = tensor_forest_ops.best_splits(
-          self.finished,
-          self.node_map,
-          self.candidate_sums,
-          self.candidate_squares,
-          self.total_sums,
-          self.total_squares,
-          regression=True)
-
-      self.assertAllEqual([1, 0], split_indices.eval())
-
-
-if __name__ == '__main__':
-  googletest.main()
diff --git a/tensorflow/contrib/tensor_forest/python/kernel_tests/count_extremely_random_stats_op_test.py b/tensorflow/contrib/tensor_forest/python/kernel_tests/count_extremely_random_stats_op_test.py
deleted file mode 100644
index 351245fbdd2c506fb56a7dcf04fd105903dcf9a3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensor_forest/python/kernel_tests/count_extremely_random_stats_op_test.py
+++ /dev/null
@@ -1,380 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tf.contrib.tensor_forest.ops.count_extremely_random_stats."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.tensor_forest.python.ops import data_ops
-
-from tensorflow.contrib.tensor_forest.python.ops import tensor_forest_ops
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.framework import test_util
-from tensorflow.python.platform import googletest
-
-
-class CountExtremelyRandomStatsClassificationTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self.input_data = [[-1., 0.], [-1., 2.],  # node 1
-                       [1., 0.], [1., -2.]]  # node 2
-    self.input_labels = [0, 1, 2, 3]
-    self.tree = [[1, 0], [-1, 0], [-1, 0]]
-    self.tree_thresholds = [0., 0., 0.]
-    self.node_map = [-1, 0, -1]
-    self.split_features = [[1], [-1]]
-    self.split_thresholds = [[1.], [0.]]
-    self.epochs = [0, 1, 1]
-    self.current_epoch = [1]
-
-    spec_proto = data_ops.TensorForestDataSpec()
-    f1 = spec_proto.dense.add()
-    f1.name = 'f1'
-    f1.original_type = data_ops.DATA_FLOAT
-    f1.size = 1
-
-    f2 = spec_proto.dense.add()
-    f2.name = 'f2'
-    f2.original_type = data_ops.DATA_FLOAT
-    f2.size = 1
-    spec_proto.dense_features_size = 2
-    self.data_spec = spec_proto.SerializeToString()
-
-  def testSimple(self):
-    with self.test_session():
-      (pcw_node_sums, _, pcw_splits_indices, pcw_splits_sums, _,
-       pcw_totals_indices, pcw_totals_sums, _,
-       leaves) = (tensor_forest_ops.count_extremely_random_stats(
-           self.input_data, [], [], [],
-           self.input_labels, [],
-           self.tree,
-           self.tree_thresholds,
-           self.node_map,
-           self.split_features,
-           self.split_thresholds,
-           self.epochs,
-           self.current_epoch,
-           input_spec=self.data_spec,
-           num_classes=5,
-           regression=False))
-
-      self.assertAllEqual(
-          [[4., 1., 1., 1., 1.], [2., 1., 1., 0., 0.], [2., 0., 0., 1., 1.]],
-          pcw_node_sums.eval())
-      self.assertAllEqual([[0, 0, 0], [0, 0, 1]], pcw_splits_indices.eval())
-      self.assertAllEqual([1., 1.], pcw_splits_sums.eval())
-      self.assertAllEqual([[0, 2], [0, 0], [0, 1]], pcw_totals_indices.eval())
-      self.assertAllEqual([1., 2., 1.], pcw_totals_sums.eval())
-      self.assertAllEqual([1, 1, 2, 2], leaves.eval())
-
-  def testSimpleWeighted(self):
-    with self.test_session():
-      input_weights = [1.5, 2.0, 3.0, 4.0]
-      (pcw_node_sums, _, pcw_splits_indices, pcw_splits_sums, _,
-       pcw_totals_indices, pcw_totals_sums, _,
-       leaves) = (tensor_forest_ops.count_extremely_random_stats(
-           self.input_data, [], [], [],
-           self.input_labels,
-           input_weights,
-           self.tree,
-           self.tree_thresholds,
-           self.node_map,
-           self.split_features,
-           self.split_thresholds,
-           self.epochs,
-           self.current_epoch,
-           input_spec=self.data_spec,
-           num_classes=5,
-           regression=False))
-
-      self.assertAllEqual([[10.5, 1.5, 2., 3., 4.], [3.5, 1.5, 2., 0., 0.],
-                           [7., 0., 0., 3., 4.]], pcw_node_sums.eval())
-      self.assertAllEqual([[0, 0, 0], [0, 0, 1]], pcw_splits_indices.eval())
-      self.assertAllEqual([1.5, 1.5], pcw_splits_sums.eval())
-      self.assertAllEqual([[0, 2], [0, 0], [0, 1]], pcw_totals_indices.eval())
-      self.assertAllEqual([2., 3.5, 1.5], pcw_totals_sums.eval())
-      self.assertAllEqual([1, 1, 2, 2], leaves.eval())
-
-  def testMissingLabel(self):
-    labels = [0, 1, -1, 3]
-    with self.test_session():
-      (pcw_node_sums, _, pcw_splits_indices, pcw_splits_sums, _,
-       pcw_totals_indices, pcw_totals_sums, _,
-       leaves) = (tensor_forest_ops.count_extremely_random_stats(
-           self.input_data, [], [], [],
-           labels, [],
-           self.tree,
-           self.tree_thresholds,
-           self.node_map,
-           self.split_features,
-           self.split_thresholds,
-           self.epochs,
-           self.current_epoch,
-           input_spec=self.data_spec,
-           num_classes=5,
-           regression=False))
-
-      self.assertAllEqual(
-          [[3., 1., 1., 0., 1.], [2., 1., 1., 0., 0.], [1., 0., 0., 0., 1.]],
-          pcw_node_sums.eval())
-      self.assertAllEqual([[0, 0, 0], [0, 0, 1]], pcw_splits_indices.eval())
-      self.assertAllEqual([1., 1.], pcw_splits_sums.eval())
-      self.assertAllEqual([[0, 2], [0, 0], [0, 1]], pcw_totals_indices.eval())
-      self.assertAllEqual([1., 2., 1.], pcw_totals_sums.eval())
-      self.assertAllEqual([1, 1, 2, 2], leaves.eval())
-
-  def testSparseInput(self):
-    sparse_shape = [4, 10]
-    sparse_indices = [[0, 0], [0, 4], [0, 9], [1, 1], [1, 7], [2, 0], [3, 0],
-                      [3, 4]]
-    sparse_values = [3.0, -1.0, 0.5, -1.5, 6.0, -2.0, -0.5, 2.0]
-    spec_proto = data_ops.TensorForestDataSpec()
-    f1 = spec_proto.sparse.add()
-    f1.name = 'f1'
-    f1.original_type = data_ops.DATA_FLOAT
-    f1.size = -1
-
-    spec_proto.dense_features_size = 0
-    data_spec = spec_proto.SerializeToString()
-
-    with self.test_session():
-      (pcw_node_sums, _, pcw_splits_indices, pcw_splits_sums, _,
-       pcw_totals_indices, pcw_totals_sums, _,
-       leaves) = (tensor_forest_ops.count_extremely_random_stats(
-           [],
-           sparse_indices,
-           sparse_values,
-           sparse_shape,
-           self.input_labels, [],
-           self.tree,
-           self.tree_thresholds,
-           self.node_map,
-           self.split_features,
-           self.split_thresholds,
-           self.epochs,
-           self.current_epoch,
-           input_spec=data_spec,
-           num_classes=5,
-           regression=False))
-
-      self.assertAllEqual([[4., 1., 1., 1., 1.],
-                           [2., 0., 0., 1., 1.],
-                           [2., 1., 1., 0., 0.]],
-                          pcw_node_sums.eval())
-      self.assertAllEqual([[0, 0, 4],
-                           [0, 0, 0],
-                           [0, 0, 3]],
-                          pcw_splits_indices.eval())
-      self.assertAllEqual([1., 2., 1.], pcw_splits_sums.eval())
-      self.assertAllEqual([[0, 4], [0, 0], [0, 3]], pcw_totals_indices.eval())
-      self.assertAllEqual([1., 2., 1.], pcw_totals_sums.eval())
-      self.assertAllEqual([2, 2, 1, 1], leaves.eval())
-
-  def testFutureEpoch(self):
-    current_epoch = [3]
-    with self.test_session():
-      (pcw_node_sums, _, _, pcw_splits_sums, _, _, pcw_totals_sums, _,
-       leaves) = (tensor_forest_ops.count_extremely_random_stats(
-           self.input_data, [], [], [],
-           self.input_labels, [],
-           self.tree,
-           self.tree_thresholds,
-           self.node_map,
-           self.split_features,
-           self.split_thresholds,
-           self.epochs,
-           current_epoch,
-           input_spec=self.data_spec,
-           num_classes=5,
-           regression=False))
-
-      self.assertAllEqual(
-          [[0., 0., 0., 0., 0.], [0., 0., 0., 0., 0.], [0., 0., 0., 0., 0.]],
-          pcw_node_sums.eval())
-      self.assertAllEqual([], pcw_splits_sums.eval())
-      self.assertAllEqual([], pcw_totals_sums.eval())
-      self.assertAllEqual([1, 1, 2, 2], leaves.eval())
-
-  def testThreaded(self):
-    with self.test_session(
-        config=config_pb2.ConfigProto(intra_op_parallelism_threads=2)):
-      (pcw_node_sums, _, pcw_splits_indices, pcw_splits_sums, _,
-       pcw_totals_indices, pcw_totals_sums, _,
-       leaves) = (tensor_forest_ops.count_extremely_random_stats(
-           self.input_data, [], [], [],
-           self.input_labels, [],
-           self.tree,
-           self.tree_thresholds,
-           self.node_map,
-           self.split_features,
-           self.split_thresholds,
-           self.epochs,
-           self.current_epoch,
-           input_spec=self.data_spec,
-           num_classes=5,
-           regression=False))
-
-      self.assertAllEqual([[4., 1., 1., 1., 1.], [2., 1., 1., 0., 0.],
-                           [2., 0., 0., 1., 1.]], pcw_node_sums.eval())
-      self.assertAllEqual([[0, 0, 0], [0, 0, 1]], pcw_splits_indices.eval())
-      self.assertAllEqual([1., 1.], pcw_splits_sums.eval())
-      self.assertAllEqual([[0, 2], [0, 0], [0, 1]], pcw_totals_indices.eval())
-      self.assertAllEqual([1., 2., 1.], pcw_totals_sums.eval())
-      self.assertAllEqual([1, 1, 2, 2], leaves.eval())
-
-  def testNoAccumulators(self):
-    with self.test_session():
-      (pcw_node_sums, _, pcw_splits_indices, pcw_splits_sums, _,
-       pcw_totals_indices, pcw_totals_sums, _,
-       leaves) = (tensor_forest_ops.count_extremely_random_stats(
-           self.input_data, [], [], [],
-           self.input_labels, [],
-           self.tree,
-           self.tree_thresholds, [-1] * 3,
-           self.split_features,
-           self.split_thresholds,
-           self.epochs,
-           self.current_epoch,
-           input_spec=self.data_spec,
-           num_classes=5,
-           regression=False))
-
-      self.assertAllEqual([[4., 1., 1., 1., 1.], [2., 1., 1., 0., 0.],
-                           [2., 0., 0., 1., 1.]], pcw_node_sums.eval())
-      self.assertEquals((0, 3), pcw_splits_indices.eval().shape)
-      self.assertAllEqual([], pcw_splits_sums.eval())
-      self.assertEquals((0, 2), pcw_totals_indices.eval().shape)
-      self.assertAllEqual([], pcw_totals_sums.eval())
-      self.assertAllEqual([1, 1, 2, 2], leaves.eval())
-
-  def testBadInput(self):
-    del self.node_map[-1]
-
-    with self.test_session():
-      with self.assertRaisesOpError(
-          'Number of nodes should be the same in '
-          'tree, tree_thresholds, node_to_accumulator, and birth_epoch.'):
-        pcw_node, _, _, _, _, _, _, _, _ = (
-            tensor_forest_ops.count_extremely_random_stats(
-                self.input_data, [], [], [],
-                self.input_labels, [],
-                self.tree,
-                self.tree_thresholds,
-                self.node_map,
-                self.split_features,
-                self.split_thresholds,
-                self.epochs,
-                self.current_epoch,
-                input_spec=self.data_spec,
-                num_classes=5,
-                regression=False))
-
-        self.assertAllEqual([], pcw_node.eval())
-
-
-class CountExtremelyRandomStatsRegressionTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self.input_data = [[-1., 0.], [-1., 2.],  # node 1
-                       [1., 0.], [1., -2.]]  # node 2
-    self.input_labels = [[3.], [6.], [2.], [3.]]
-    self.tree = [[1, 0], [-1, 0], [-1, 0]]
-    self.tree_thresholds = [0., 0., 0.]
-    self.node_map = [-1, 0, -1]
-    self.split_features = [[1], [-1]]
-    self.split_thresholds = [[1.], [0.]]
-    self.epochs = [0, 1, 1]
-    self.current_epoch = [1]
-
-    spec_proto = data_ops.TensorForestDataSpec()
-    f1 = spec_proto.dense.add()
-    f1.name = 'f1'
-    f1.original_type = data_ops.DATA_FLOAT
-    f1.size = 1
-
-    f2 = spec_proto.dense.add()
-    f2.name = 'f2'
-    f2.original_type = data_ops.DATA_FLOAT
-    f2.size = 1
-    spec_proto.dense_features_size = 2
-    self.data_spec = spec_proto.SerializeToString()
-
-  def testSimple(self):
-    with self.test_session():
-      (pcw_node_sums, pcw_node_squares, pcw_splits_indices, pcw_splits_sums,
-       pcw_splits_squares, pcw_totals_indices, pcw_totals_sums,
-       pcw_totals_squares,
-       leaves) = (tensor_forest_ops.count_extremely_random_stats(
-           self.input_data, [], [], [],
-           self.input_labels, [],
-           self.tree,
-           self.tree_thresholds,
-           self.node_map,
-           self.split_features,
-           self.split_thresholds,
-           self.epochs,
-           self.current_epoch,
-           input_spec=self.data_spec,
-           num_classes=2,
-           regression=True))
-
-      self.assertAllEqual([[4., 14.], [2., 9.], [2., 5.]], pcw_node_sums.eval())
-      self.assertAllEqual([[4., 58.], [2., 45.], [2., 13.]],
-                          pcw_node_squares.eval())
-      self.assertAllEqual([[0, 0]], pcw_splits_indices.eval())
-      self.assertAllEqual([[1., 3.]], pcw_splits_sums.eval())
-      self.assertAllEqual([[1., 9.]], pcw_splits_squares.eval())
-      self.assertAllEqual([[0]], pcw_totals_indices.eval())
-      self.assertAllEqual([[2., 9.]], pcw_totals_sums.eval())
-      self.assertAllEqual([[2., 45.]], pcw_totals_squares.eval())
-      self.assertAllEqual([1, 1, 2, 2], leaves.eval())
-
-  def testSimpleWeighted(self):
-    with self.test_session():
-      input_weights = [1.0, 2.0, 3.0, 4.0]
-      (pcw_node_sums, pcw_node_squares, pcw_splits_indices, pcw_splits_sums,
-       pcw_splits_squares, pcw_totals_indices, pcw_totals_sums,
-       pcw_totals_squares,
-       leaves) = (tensor_forest_ops.count_extremely_random_stats(
-           self.input_data, [], [], [],
-           self.input_labels,
-           input_weights,
-           self.tree,
-           self.tree_thresholds,
-           self.node_map,
-           self.split_features,
-           self.split_thresholds,
-           self.epochs,
-           self.current_epoch,
-           input_spec=self.data_spec,
-           num_classes=2,
-           regression=True))
-
-      self.assertAllEqual([[10., 33.], [3., 15.], [7., 18.]],
-                          pcw_node_sums.eval())
-      self.assertAllEqual([[10., 129.], [3., 81.], [7., 48.]],
-                          pcw_node_squares.eval())
-      self.assertAllEqual([[0, 0]], pcw_splits_indices.eval())
-      self.assertAllEqual([[1., 3.]], pcw_splits_sums.eval())
-      self.assertAllEqual([[1., 9.]], pcw_splits_squares.eval())
-      self.assertAllEqual([[0]], pcw_totals_indices.eval())
-      self.assertAllEqual([[2., 9.]], pcw_totals_sums.eval())
-      self.assertAllEqual([[2., 45.]], pcw_totals_squares.eval())
-      self.assertAllEqual([1, 1, 2, 2], leaves.eval())
-
-
-if __name__ == '__main__':
-  googletest.main()
diff --git a/tensorflow/contrib/tensor_forest/python/kernel_tests/finished_nodes_op_test.py b/tensorflow/contrib/tensor_forest/python/kernel_tests/finished_nodes_op_test.py
deleted file mode 100644
index 9cd05a507b79423666b6e1ac1b65396c0ca15c30..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensor_forest/python/kernel_tests/finished_nodes_op_test.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tf.contrib.tensor_forest.ops.finished_nodes_op."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow  # pylint: disable=unused-import
-
-from tensorflow.contrib.tensor_forest.python.ops import tensor_forest_ops
-
-from tensorflow.python.framework import test_util
-from tensorflow.python.platform import googletest
-
-
-class FinishedNodesTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self.leaves = [1, 3, 4]
-    self.node_map = [-1, -1, -1, 0, 1, -1]
-    self.split_sums = [
-        # Accumulator 0
-        [[3, 0, 3], [2, 1, 1], [3, 1, 2]],
-        # Accumulator 1
-        [[6, 3, 3], [6, 2, 4], [5, 0, 5]],
-        # Accumulator 2
-        [[0, 0, 0], [0, 0, 0], [0, 0, 0]],
-        # Accumulator 3
-        [[0, 0, 0], [0, 0, 0], [0, 0, 0]],
-        # Accumulator 4
-        [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
-    ]
-    self.split_squares = []
-    self.accumulator_sums = [[6, 3, 3], [11, 4, 7], [0, 0, 0], [0, 0, 0],
-                             [0, 0, 0]]
-    self.accumulator_squares = []
-    self.birth_epochs = [0, 0, 0, 1, 1, 1]
-    self.current_epoch = [1]
-
-  def testSimple(self):
-    with self.test_session():
-      finished, stale = tensor_forest_ops.finished_nodes(
-          self.leaves,
-          self.node_map,
-          self.split_sums,
-          self.split_squares,
-          self.accumulator_sums,
-          self.accumulator_squares,
-          self.birth_epochs,
-          self.current_epoch,
-          regression=False,
-          num_split_after_samples=10,
-          min_split_samples=10)
-
-      self.assertAllEqual([4], finished.eval())
-      self.assertAllEqual([], stale.eval())
-
-  def testLeavesCanBeNegativeOne(self):
-    with self.test_session():
-      finished, stale = tensor_forest_ops.finished_nodes(
-          [-1, -1, 1, -1, 3, -1, -1, 4, -1, -1, -1],
-          self.node_map,
-          self.split_sums,
-          self.split_squares,
-          self.accumulator_sums,
-          self.accumulator_squares,
-          self.birth_epochs,
-          self.current_epoch,
-          regression=False,
-          num_split_after_samples=10,
-          min_split_samples=10)
-
-      self.assertAllEqual([4], finished.eval())
-      self.assertAllEqual([], stale.eval())
-
-  def testNoAccumulators(self):
-    with self.test_session():
-      finished, stale = tensor_forest_ops.finished_nodes(
-          self.leaves, [-1] * 6,
-          self.split_sums,
-          self.split_squares,
-          self.accumulator_sums,
-          self.accumulator_squares,
-          self.birth_epochs,
-          self.current_epoch,
-          regression=False,
-          num_split_after_samples=10,
-          min_split_samples=10)
-
-      self.assertAllEqual([], finished.eval())
-      self.assertAllEqual([], stale.eval())
-
-  def testBadInput(self):
-    with self.test_session():
-      with self.assertRaisesOpError(
-          'leaf_tensor should be one-dimensional'):
-        finished, stale = tensor_forest_ops.finished_nodes(
-            [self.leaves],
-            self.node_map,
-            self.split_sums,
-            self.split_squares,
-            self.accumulator_sums,
-            self.accumulator_squares,
-            self.birth_epochs,
-            self.current_epoch,
-            regression=False,
-            num_split_after_samples=10,
-            min_split_samples=10)
-
-        self.assertAllEqual([], finished.eval())
-        self.assertAllEqual([], stale.eval())
-
-  def testEarlyDominatesHoeffding(self):
-    with self.test_session():
-      finished, stale = tensor_forest_ops.finished_nodes(
-          self.leaves,
-          self.node_map,
-          self.split_sums,
-          self.split_squares,
-          self.accumulator_sums,
-          self.accumulator_squares,
-          self.birth_epochs,
-          self.current_epoch,
-          dominate_method='hoeffding',
-          regression=False,
-          num_split_after_samples=10,
-          min_split_samples=5)
-
-      self.assertAllEqual([4], finished.eval())
-      self.assertAllEqual([], stale.eval())
-
-  def testEarlyDominatesBootstrap(self):
-    with self.test_session():
-      finished, stale = tensor_forest_ops.finished_nodes(
-          self.leaves,
-          self.node_map,
-          self.split_sums,
-          self.split_squares,
-          self.accumulator_sums,
-          self.accumulator_squares,
-          self.birth_epochs,
-          self.current_epoch,
-          dominate_method='bootstrap',
-          regression=False,
-          num_split_after_samples=10,
-          min_split_samples=5,
-          random_seed=1)
-
-      self.assertAllEqual([4], finished.eval())
-      self.assertAllEqual([], stale.eval())
-
-  def testEarlyDominatesChebyshev(self):
-    with self.test_session():
-      finished, stale = tensor_forest_ops.finished_nodes(
-          self.leaves,
-          self.node_map,
-          self.split_sums,
-          self.split_squares,
-          self.accumulator_sums,
-          self.accumulator_squares,
-          self.birth_epochs,
-          self.current_epoch,
-          dominate_method='chebyshev',
-          regression=False,
-          num_split_after_samples=10,
-          min_split_samples=5)
-
-      self.assertAllEqual([4], finished.eval())
-      self.assertAllEqual([], stale.eval())
-
-
-if __name__ == '__main__':
-  googletest.main()
diff --git a/tensorflow/contrib/tensor_forest/python/kernel_tests/grow_tree_op_test.py b/tensorflow/contrib/tensor_forest/python/kernel_tests/grow_tree_op_test.py
deleted file mode 100644
index 150632c3984d499870c6d63f7531f203febf13ae..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensor_forest/python/kernel_tests/grow_tree_op_test.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tf.contrib.tensor_forest.ops.grow_tree_op."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.tensor_forest.python.ops import tensor_forest_ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import googletest
-
-
-class GrowTreeTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self.tree = variables.Variable([[1, 0], [-1, 0], [-1, 0], [-2, 0], [-2, 0],
-                                    [-2, 0], [-2, 0]])
-    self.tree_thresholds = variables.Variable([0., 0., 0., 0., 0., 0., 0.])
-    self.eot = variables.Variable([3])
-    self.node_map = [-1, 0, 1, -1, -1, -1, -1]
-    self.finished = [1, 2]
-    self.best_splits = [2, 3]
-    self.split_features = [[1, 2, 3, 4], [5, 6, 7, 8]]
-    self.split_thresholds = [[10., 20., 30., 40.], [50., 60., 70., 80.]]
-
-  def testSimple(self):
-    with self.test_session():
-      variables.global_variables_initializer().run()
-      update_list, tree_updates, threshold_updates, new_eot = (
-          tensor_forest_ops.grow_tree(self.eot, self.node_map, self.finished,
-                                      self.best_splits, self.split_features,
-                                      self.split_thresholds))
-
-      self.assertAllEqual([1, 3, 4, 2, 5, 6], update_list.eval())
-      self.assertAllEqual(
-          [[3, 3], [-1, -1], [-1, -1], [5, 8], [-1, -1], [-1, -1]],
-          tree_updates.eval())
-      self.assertAllEqual([30.0, 0.0, 0.0, 80.0, 0.0, 0.0],
-                          threshold_updates.eval())
-      self.assertAllEqual([7], new_eot.eval())
-
-  def testNoRoomToGrow(self):
-    with self.test_session():
-      variables.global_variables_initializer().run()
-      # Even though there's one free node, there needs to be 2 to grow.
-      state_ops.assign(self.eot, [6]).eval()
-
-      update_list, tree_updates, threshold_updates, new_eot = (
-          tensor_forest_ops.grow_tree(self.eot, self.node_map, self.finished,
-                                      self.best_splits, self.split_features,
-                                      self.split_thresholds))
-
-      self.assertAllEqual([], update_list.eval())
-      self.assertEquals((0, 2), tree_updates.eval().shape)
-      self.assertAllEqual([], threshold_updates.eval())
-      self.assertAllEqual([6], new_eot.eval())
-
-  def testNoFinished(self):
-    with self.test_session():
-      variables.global_variables_initializer().run()
-
-      update_list, tree_updates, threshold_updates, new_eot = (
-          tensor_forest_ops.grow_tree(self.eot, self.node_map, [], [],
-                                      self.split_features,
-                                      self.split_thresholds))
-
-      self.assertAllEqual([], update_list.eval())
-      self.assertAllEqual((0, 2), tree_updates.eval().shape)
-      self.assertAllEqual([], threshold_updates.eval())
-      self.assertAllEqual([3], new_eot.eval())
-
-  def testBadInput(self):
-    with self.test_session():
-      variables.global_variables_initializer().run()
-      with self.assertRaisesOpError(
-          'Number of finished nodes should be the same in finished and '
-          'best_splits.'):
-        update_list, _, _, _ = (tensor_forest_ops.grow_tree(
-            self.eot, self.node_map, [], self.best_splits, self.split_features,
-            self.split_thresholds))
-        self.assertAllEqual([], update_list.eval())
-
-
-if __name__ == '__main__':
-  googletest.main()
diff --git a/tensorflow/contrib/tensor_forest/python/kernel_tests/sample_inputs_op_test.py b/tensorflow/contrib/tensor_forest/python/kernel_tests/sample_inputs_op_test.py
deleted file mode 100644
index 705949a4540fbb58c686b8d11a86e5e7424e2638..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensor_forest/python/kernel_tests/sample_inputs_op_test.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tf.contrib.tensor_forest.ops.sample_inputs_op."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.tensor_forest.python.ops import data_ops
-
-from tensorflow.contrib.tensor_forest.python.ops import tensor_forest_ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import googletest
-
-
-class SampleInputsTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self.input_data = [[-1., 10.], [-10., 2.],  # node 1
-                       [20., 50.], [1., -2.]]  # node 2
-    self.node_map = [-1, 0, 1]
-    self.leaves = [1, 1, 2, 2]
-    self.split_features = [[-1, -1, -1], [1, 0, -1], [-1, -1, -1]]
-    self.split_thresholds = [[0., 0., 0.], [5., -2., 0.], [0., 0., 0.]]
-
-    spec_proto = data_ops.TensorForestDataSpec()
-    f1 = spec_proto.dense.add()
-    f1.name = 'f1'
-    f1.original_type = data_ops.DATA_FLOAT
-    f1.size = 1
-
-    f2 = spec_proto.dense.add()
-    f2.name = 'f2'
-    f2.original_type = data_ops.DATA_FLOAT
-    f2.size = 1
-    spec_proto.dense_features_size = 2
-    self.data_spec = spec_proto.SerializeToString()
-
-  def testSimple(self):
-    with self.test_session():
-      variables.global_variables_initializer().run()
-      (indices, feature_updates,
-       threshold_updates) = (tensor_forest_ops.sample_inputs(
-           self.input_data, [], [], [], [],
-           self.node_map,
-           self.leaves,
-           self.split_features,
-           self.split_thresholds,
-           split_initializations_per_input=1,
-           input_spec=self.data_spec,
-           split_sampling_random_seed=2))
-      self.assertAllEqual([1, 0], indices.eval())
-      self.assertAllEqual([[1, 0, 1], [1, 1, -1]], feature_updates.eval())
-      self.assertAllEqual([[5., -2., 50.], [10., 2., 0.]],
-                          threshold_updates.eval())
-
-  def testSparse(self):
-    sparse_shape = [4, 10]
-    sparse_indices = [[0, 0], [0, 4], [0, 9],
-                      [1, 0], [1, 7],
-                      [2, 0],
-                      [3, 1], [3, 4]]
-    sparse_values = [3.0, -1.0, 0.5,
-                     1.5, 6.0,
-                     -2.0,
-                     -0.5, 2.0]
-
-    spec_proto = data_ops.TensorForestDataSpec()
-    f1 = spec_proto.sparse.add()
-    f1.name = 'f1'
-    f1.original_type = data_ops.DATA_FLOAT
-    f1.size = -1
-
-    spec_proto.dense_features_size = 0
-    data_spec = spec_proto.SerializeToString()
-
-    with self.test_session():
-      variables.global_variables_initializer().run()
-      (indices, feature_updates,
-       threshold_updates) = (tensor_forest_ops.sample_inputs(
-           [],
-           sparse_indices,
-           sparse_values,
-           sparse_shape, [],
-           self.node_map,
-           self.leaves,
-           self.split_features,
-           self.split_thresholds,
-           input_spec=data_spec,
-           split_initializations_per_input=1,
-           split_sampling_random_seed=3))
-      self.assertAllEqual([1, 0], indices.eval())
-      self.assertAllEqual([[1, 0, 0], [4, 0, -1]], feature_updates.eval())
-
-      self.assertAllEqual([[5., -2., -2.], [-1., 1.5, 0.]],
-                          threshold_updates.eval())
-
-  def testWeights(self):
-    with self.test_session():
-      variables.global_variables_initializer().run()
-      (indices, feature_updates,
-       threshold_updates) = (tensor_forest_ops.sample_inputs(
-           self.input_data, [], [], [], [0.5, 0.1, 0.8, 0.7],
-           self.node_map,
-           self.leaves,
-           self.split_features,
-           self.split_thresholds,
-           input_spec=self.data_spec,
-           split_initializations_per_input=1,
-           split_sampling_random_seed=3))
-      self.assertAllEqual([1, 0], indices.eval())
-      self.assertAllEqual([[1, 0, 0], [-1, -1, -1]], feature_updates.eval())
-      self.assertAllEqual([[5., -2., 20.], [0., 0., 0.]],
-                          threshold_updates.eval())
-
-  def testNoAccumulators(self):
-    with self.test_session():
-      variables.global_variables_initializer().run()
-      (indices, feature_updates,
-       threshold_updates) = (tensor_forest_ops.sample_inputs(
-           self.input_data, [], [], [], [], [-1] * 3,
-           self.leaves,
-           self.split_features,
-           self.split_thresholds,
-           input_spec=self.data_spec,
-           split_initializations_per_input=1,
-           split_sampling_random_seed=3))
-      self.assertAllEqual([], indices.eval())
-      self.assertAllEqual((0, 3), feature_updates.eval().shape)
-      self.assertAllEqual((0, 3), threshold_updates.eval().shape)
-
-  def testBadInput(self):
-    del self.split_features[1]
-    with self.test_session():
-      variables.global_variables_initializer().run()
-      with self.assertRaisesOpError(
-          'split_features and split_thresholds should be the same shape.'):
-        indices, _, _ = tensor_forest_ops.sample_inputs(
-            self.input_data, [], [], [], [],
-            self.node_map,
-            self.leaves,
-            self.split_features,
-            self.split_thresholds,
-            input_spec=self.data_spec,
-            split_initializations_per_input=1,
-            split_sampling_random_seed=3)
-        self.assertAllEqual([], indices.eval())
-
-
-if __name__ == '__main__':
-  googletest.main()
diff --git a/tensorflow/contrib/tensor_forest/python/kernel_tests/tree_predictions_op_test.py b/tensorflow/contrib/tensor_forest/python/kernel_tests/tree_predictions_op_test.py
deleted file mode 100644
index d7d50070305384c86bdec496abf27bbf21ddba00..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensor_forest/python/kernel_tests/tree_predictions_op_test.py
+++ /dev/null
@@ -1,299 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tf.contrib.tensor_forest.ops.tree_predictions_op."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.tensor_forest.python.ops import data_ops
-from tensorflow.contrib.tensor_forest.python.ops import tensor_forest_ops
-
-from tensorflow.python.framework import test_util
-from tensorflow.python.platform import googletest
-
-
-class TreePredictionsDenseTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self.nothing = []
-    spec_proto = data_ops.TensorForestDataSpec()
-    f1 = spec_proto.dense.add()
-    f1.name = 'f1'
-    f1.original_type = data_ops.DATA_FLOAT
-    f1.size = 1
-
-    f2 = spec_proto.dense.add()
-    f2.name = 'f2'
-    f2.original_type = data_ops.DATA_FLOAT
-    f2.size = 1
-    spec_proto.dense_features_size = 2
-    self.data_spec = spec_proto.SerializeToString()
-
-  def testSimple(self):
-    input_data = [[-1., 0.], [-1., 2.],  # node 1
-                  [1., 0.], [1., -2.]]  # node 2
-
-    tree = [[1, 0], [-1, 0], [-1, 0]]
-    tree_thresholds = [0., 0., 0.]
-    node_pcw = [[1.0, 0.3, 0.4, 0.3], [1.0, 0.1, 0.1, 0.8],
-                [1.0, 0.5, 0.25, 0.25]]
-
-    with self.test_session():
-      predictions = tensor_forest_ops.tree_predictions(
-          input_data,
-          self.nothing,
-          self.nothing,
-          self.nothing,
-          tree,
-          tree_thresholds,
-          node_pcw,
-          input_spec=self.data_spec,
-          valid_leaf_threshold=1)
-
-      self.assertAllClose([[0.1, 0.1, 0.8], [0.1, 0.1, 0.8],
-                           [0.5, 0.25, 0.25], [0.5, 0.25, 0.25]],
-                          predictions.eval())
-
-  def testBackoffToParent(self):
-    input_data = [
-        [-1., 0.],
-        [-1., 2.],  # node 1
-        [1., 0.],
-        [1., -2.]
-    ]  # node 2
-
-    tree = [[1, 0], [-1, 0], [-1, 0]]
-    tree_thresholds = [0., 0., 0.]
-    node_pcw = [[15.0, 3.0, 9.0, 3.0], [5.0, 1.0, 1.0, 3.0],
-                [25.0, 5.0, 20.0, 0.0]]
-
-    with self.test_session():
-      predictions = tensor_forest_ops.tree_predictions(
-          input_data,
-          self.nothing,
-          self.nothing,
-          self.nothing,
-          tree,
-          tree_thresholds,
-          node_pcw,
-          valid_leaf_threshold=10,
-          input_spec=self.data_spec)
-
-      # Node 2 has enough data, but Node 1 needs to combine with the parent
-      # counts.
-      self.assertAllClose([[0.2, 0.4, 0.4], [0.2, 0.4, 0.4], [0.2, 0.8, 0.0],
-                           [0.2, 0.8, 0.0]], predictions.eval())
-
-  def testNoInput(self):
-    input_data = []
-
-    tree = [[1, 0], [-1, 0], [-1, 0]]
-    tree_thresholds = [0., 0., 0.]
-    node_pcw = [[1.0, 0.3, 0.4, 0.3], [1.0, 0.1, 0.1, 0.8],
-                [1.0, 0.5, 0.25, 0.25]]
-
-    with self.test_session():
-      predictions = tensor_forest_ops.tree_predictions(
-          input_data,
-          self.nothing,
-          self.nothing,
-          self.nothing,
-          tree,
-          tree_thresholds,
-          node_pcw,
-          valid_leaf_threshold=10,
-          input_spec=self.data_spec)
-
-      self.assertEquals((0, 3), predictions.eval().shape)
-
-  def testBadInput(self):
-    input_data = [
-        [-1., 0.],
-        [-1., 2.],  # node 1
-        [1., 0.],
-        [1., -2.]
-    ]  # node 2
-
-    tree = [[1, 0], [-1, 0], [-1, 0]]
-    tree_thresholds = [0., 0.]  # not enough nodes.
-    node_pcw = [[1.0, 0.3, 0.4, 0.3], [1.0, 0.1, 0.1, 0.8],
-                [1.0, 0.5, 0.25, 0.25]]
-
-    with self.test_session():
-      with self.assertRaisesOpError(
-          'Number of nodes should be the same in tree, tree_thresholds '
-          'and node_pcw.'):
-        predictions = tensor_forest_ops.tree_predictions(
-            input_data,
-            self.nothing,
-            self.nothing,
-            self.nothing,
-            tree,
-            tree_thresholds,
-            node_pcw,
-            valid_leaf_threshold=10,
-            input_spec=self.data_spec)
-
-        self.assertEquals((0, 3), predictions.eval().shape)
-
-
-class TreePredictionsSparseTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self.nothing = []
-    spec_proto = data_ops.TensorForestDataSpec()
-    f1 = spec_proto.sparse.add()
-    f1.name = 'f1'
-    f1.original_type = data_ops.DATA_FLOAT
-    f1.size = 1
-
-    f2 = spec_proto.sparse.add()
-    f2.name = 'f2'
-    f2.original_type = data_ops.DATA_FLOAT
-    f2.size = 9
-    spec_proto.dense_features_size = 0
-    self.data_spec = spec_proto.SerializeToString()
-
-  def testSparseInput(self):
-    sparse_shape = [3, 10]
-    sparse_indices = [[0, 0], [0, 4], [0, 9],
-                      [1, 0], [1, 7],
-                      [2, 0]]
-    sparse_values = [3.0, -1.0, 0.5,
-                     1.5, 6.0,
-                     -2.0]
-
-    tree = [[1, 0], [-1, 0], [-1, 0]]
-    tree_thresholds = [0., 0., 0.]
-    node_pcw = [[1.0, 0.3, 0.4, 0.3], [1.0, 0.1, 0.1, 0.8],
-                [1.0, 0.5, 0.25, 0.25]]
-
-    with self.test_session():
-      predictions = tensor_forest_ops.tree_predictions(
-          self.nothing,
-          sparse_indices,
-          sparse_values,
-          sparse_shape,
-          tree,
-          tree_thresholds,
-          node_pcw,
-          valid_leaf_threshold=1,
-          input_spec=self.data_spec)
-
-      self.assertAllClose([[0.5, 0.25, 0.25],
-                           [0.5, 0.25, 0.25],
-                           [0.1, 0.1, 0.8]],
-                          predictions.eval())
-
-  def testSparseInputDefaultIsZero(self):
-    sparse_shape = [3, 10]
-    sparse_indices = [[0, 0], [0, 4], [0, 9],
-                      [1, 0], [1, 7],
-                      [2, 0]]
-    sparse_values = [3.0, -1.0, 0.5,
-                     1.5, 6.0,
-                     -2.0]
-
-    tree = [[1, 7], [-1, 0], [-1, 0]]
-    tree_thresholds = [3.0, 0., 0.]
-    node_pcw = [[1.0, 0.3, 0.4, 0.3], [1.0, 0.1, 0.1, 0.8],
-                [1.0, 0.5, 0.25, 0.25]]
-
-    with self.test_session():
-      predictions = tensor_forest_ops.tree_predictions(
-          self.nothing,
-          sparse_indices,
-          sparse_values,
-          sparse_shape,
-          tree,
-          tree_thresholds,
-          node_pcw,
-          valid_leaf_threshold=1,
-          input_spec=self.data_spec)
-
-      self.assertAllClose([[0.1, 0.1, 0.8],
-                           [0.5, 0.25, 0.25],
-                           [0.1, 0.1, 0.8]],
-                          predictions.eval())
-
-
-class TreePredictionsMixedTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self.nothing = []
-    spec_proto = data_ops.TensorForestDataSpec()
-    f1 = spec_proto.dense.add()
-    f1.name = 'f1'
-    f1.original_type = data_ops.DATA_FLOAT
-    f1.size = 2
-
-    f2 = spec_proto.dense.add()
-    f2.name = 'f2'
-    f2.original_type = data_ops.DATA_CATEGORICAL
-    f2.size = 1
-
-    f3 = spec_proto.sparse.add()
-    f3.name = 'f3'
-    f3.original_type = data_ops.DATA_FLOAT
-    f3.size = -1
-    spec_proto.dense_features_size = 3
-    self.data_spec = spec_proto.SerializeToString()
-
-  def testSimpleMixed(self):
-    #        0       1       2       3        4        5        6
-    tree = [[1, 0], [3, 2], [5, 5], [-1, 0], [-1, 0], [-1, 0], [-1, 0]]
-    tree_thresholds = [0., 15., 1., 0., 0., 0., 0.]
-    node_pcw = [[1.0, 0., 1.0, 0.4, 0.3], [1.0, 0., 0.1, 0.1, 0.8],
-                [1.0, 0., 0.5, 0.25, 0.25], [1.0, 1., 0., 0., 0.],
-                [1.0, 0., 1., 0., 0.], [1.0, 0., 0., 1., 0.],
-                [1.0, 0., 0., 0., 1.]]
-
-    input_data = [
-        [-1., 0., 15.],  # node 3
-        [-1., 2., 11.],  # node 4
-        [1., 0., 11.],
-        [1., -2., 30.]
-    ]
-
-    sparse_shape = [4, 5]
-    sparse_indices = [
-        [0, 0],
-        [0, 1],
-        [0, 4],
-        [1, 0],
-        [1, 2],
-        [2, 1],  # node 5
-        [3, 2]
-    ]  # node 6
-    sparse_values = [3.0, -1.0, 0.5, 1.5, 6.0, -2.0, 2.0]
-
-    with self.test_session():
-      predictions = tensor_forest_ops.tree_predictions(
-          input_data,
-          sparse_indices,
-          sparse_values,
-          sparse_shape,
-          tree,
-          tree_thresholds,
-          node_pcw,
-          valid_leaf_threshold=1,
-          input_spec=self.data_spec)
-
-      self.assertAllClose([[1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.],
-                           [0., 0., 0., 1.]], predictions.eval())
-
-
-if __name__ == '__main__':
-  googletest.main()
diff --git a/tensorflow/contrib/tensor_forest/python/kernel_tests/update_fertile_slots_op_test.py b/tensorflow/contrib/tensor_forest/python/kernel_tests/update_fertile_slots_op_test.py
deleted file mode 100644
index 3f71a0dba4e5713eab6ea6c772847b5ef16bbbd3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensor_forest/python/kernel_tests/update_fertile_slots_op_test.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tf.contrib.tensor_forest.ops.allocate_deallocate_op."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow  # pylint: disable=unused-import
-
-from tensorflow.contrib.tensor_forest.python.ops import tensor_forest_ops
-
-from tensorflow.python.framework import test_util
-from tensorflow.python.platform import googletest
-
-
-class UpdateFertileSlotsTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    # tree is:
-    #         0
-    #     1       2
-    #   3   4   5   6
-    self.finished = [2]
-    self.non_fertile_leaves = [3, 4]
-    self.non_fertile_leaf_scores = [10., 15.]
-    self.end_of_tree = [5]
-    self.node_map = [-1, -1, 0, -1, -1, -1, -1]
-    self.total_counts = [[80., 40., 40.]]
-    self.stale_leaves = []
-    self.node_sums = [[3, 1, 2], [4, 2, 2], [5, 2, 3], [6, 1, 5], [7, 5, 2],
-                      [8, 4, 4], [9, 7, 2]]
-
-  def testSimple(self):
-    with self.test_session():
-      (n2a_map_updates, a2n_map_updates, accumulators_cleared,
-       accumulators_allocated) = tensor_forest_ops.update_fertile_slots(
-           self.finished, self.non_fertile_leaves, self.non_fertile_leaf_scores,
-           self.end_of_tree, self.total_counts, self.node_map,
-           self.stale_leaves, self.node_sums)
-
-      self.assertAllEqual([[2, 4], [-1, 0]], n2a_map_updates.eval())
-      self.assertAllEqual([[0], [4]], a2n_map_updates.eval())
-      self.assertAllEqual([], accumulators_cleared.eval())
-      self.assertAllEqual([0], accumulators_allocated.eval())
-
-  def testNoFinished(self):
-    with self.test_session():
-      (n2a_map_updates, a2n_map_updates, accumulators_cleared,
-       accumulators_allocated) = tensor_forest_ops.update_fertile_slots(
-           [], self.non_fertile_leaves, self.non_fertile_leaf_scores,
-           self.end_of_tree, self.total_counts, self.node_map,
-           self.stale_leaves, self.node_sums)
-
-      self.assertAllEqual((2, 0), n2a_map_updates.eval().shape)
-      self.assertAllEqual((2, 0), a2n_map_updates.eval().shape)
-      self.assertAllEqual([], accumulators_cleared.eval())
-      self.assertAllEqual([], accumulators_allocated.eval())
-
-  def testPureCounts(self):
-    with self.test_session():
-      self.node_sums[4] = [10, 0, 10]
-      (n2a_map_updates, a2n_map_updates, accumulators_cleared,
-       accumulators_allocated) = tensor_forest_ops.update_fertile_slots(
-           self.finished, self.non_fertile_leaves, self.non_fertile_leaf_scores,
-           self.end_of_tree, self.total_counts, self.node_map,
-           self.stale_leaves, self.node_sums)
-
-      self.assertAllEqual([[2, 3], [-1, 0]], n2a_map_updates.eval())
-      self.assertAllEqual([[0], [3]], a2n_map_updates.eval())
-      self.assertAllEqual([], accumulators_cleared.eval())
-      self.assertAllEqual([0], accumulators_allocated.eval())
-
-  def testBadInput(self):
-    del self.non_fertile_leaf_scores[-1]
-    with self.test_session():
-      with self.assertRaisesOpError(
-          'Number of non fertile leaves should be the same in '
-          'non_fertile_leaves and non_fertile_leaf_scores.'):
-        (n2a_map_updates, _, _, _) = tensor_forest_ops.update_fertile_slots(
-            self.finished, self.non_fertile_leaves,
-            self.non_fertile_leaf_scores, self.end_of_tree, self.total_counts,
-            self.node_map, self.stale_leaves, self.node_sums)
-        self.assertAllEqual((2, 0), n2a_map_updates.eval().shape)
-
-
-if __name__ == '__main__':
-  googletest.main()
diff --git a/tensorflow/contrib/tensor_forest/python/ops/data_ops.py b/tensorflow/contrib/tensor_forest/python/ops/data_ops.py
index 2e54f620d5b306096a146a8cb9b6c0e89f317e3e..f878e5989cf2b43be960d34a45c4014d412f1c67 100644
--- a/tensorflow/contrib/tensor_forest/python/ops/data_ops.py
+++ b/tensorflow/contrib/tensor_forest/python/ops/data_ops.py
@@ -97,6 +97,13 @@ class DataColumn(object):
                                                            self.size)
 
 
+def GetColumnName(column_key, col_num):
+  if isinstance(column_key, str):
+    return column_key
+  else:
+    return getattr(column_key, 'column_name', str(col_num))
+
+
 def ParseDataTensorOrDict(data):
   """Return a tensor to use for input data.
 
@@ -119,14 +126,13 @@ def ParseDataTensorOrDict(data):
     for k in sorted(data.keys()):
       is_sparse = isinstance(data[k], sparse_tensor.SparseTensor)
       if is_sparse:
-        # TODO(gilberth): support sparse categorical.
-        if data[k].dtype == dtypes.string:
-          logging.info('TensorForest does not support sparse categorical. '
-                       'Transform it into a number with hash buckets.')
+        # TODO(gilberth): support sparse continuous.
+        if data[k].dtype == dtypes.float32:
+          logging.info('TensorForest does not support sparse continuous.')
           continue
         elif data_spec.sparse.size() == 0:
           col_spec = data_spec.sparse.add()
-          col_spec.original_type = DATA_FLOAT
+          col_spec.original_type = DATA_CATEGORICAL
           col_spec.name = 'all_sparse'
           col_spec.size = -1
         sparse_features.append(
@@ -136,7 +142,7 @@ def ParseDataTensorOrDict(data):
         col_spec = data_spec.dense.add()
 
         col_spec.original_type = DTYPE_TO_FTYPE[data[k].dtype]
-        col_spec.name = k
+        col_spec.name = GetColumnName(k, len(dense_features))
         # the second dimension of get_shape should always be known.
         shape = data[k].get_shape()
         if len(shape) == 1:
diff --git a/tensorflow/contrib/tensor_forest/python/ops/model_ops.py b/tensorflow/contrib/tensor_forest/python/ops/model_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d240e2f6dec6ae31b08328739de7c868d568ecbe
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/python/ops/model_ops.py
@@ -0,0 +1,119 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model ops python wrappers."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tensor_forest.python.ops import gen_model_ops
+
+# pylint: disable=unused-import
+from tensorflow.contrib.tensor_forest.python.ops.gen_model_ops import feature_usage_counts
+from tensorflow.contrib.tensor_forest.python.ops.gen_model_ops import traverse_tree_v4
+from tensorflow.contrib.tensor_forest.python.ops.gen_model_ops import tree_predictions_v4
+from tensorflow.contrib.tensor_forest.python.ops.gen_model_ops import tree_size
+from tensorflow.contrib.tensor_forest.python.ops.gen_model_ops import update_model_v4
+# pylint: enable=unused-import
+
+from tensorflow.contrib.util import loader
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.training import saver
+
+
+_model_ops = loader.load_op_library(
+    resource_loader.get_path_to_datafile("_model_ops.so"))
+
+
+ops.NotDifferentiable("TreeVariable")
+ops.NotDifferentiable("TreeSerialize")
+ops.NotDifferentiable("TreeDeserialize")
+ops.NotDifferentiable("TreeSize")
+ops.NotDifferentiable("TreePredictionsV4")
+ops.NotDifferentiable("FeatureUsageCounts")
+
+
+class TreeVariableSavable(saver.BaseSaverBuilder.SaveableObject):
+  """SaveableObject implementation for TreeVariable."""
+
+  def __init__(self, params, tree_handle, stats_handle, create_op, name):
+    """Creates a TreeVariableSavable object.
+
+    Args:
+      params: A TensorForestParams object.
+      tree_handle: handle to the tree variable.
+      stats_handle: handle to the stats variable.
+      create_op: the op to initialize the variable.
+      name: the name to save the tree variable under.
+    """
+    self.params = params
+    tensor = gen_model_ops.tree_serialize(tree_handle)
+    # slice_spec is useful for saving a slice from a variable.
+    # It's not meaningful the tree variable. So we just pass an empty value.
+    slice_spec = ""
+    specs = [saver.BaseSaverBuilder.SaveSpec(tensor, slice_spec, name),]
+    super(TreeVariableSavable,
+          self).__init__(tree_handle, specs, name)
+    self._tree_handle = tree_handle
+    self._create_op = create_op
+
+  def restore(self, restored_tensors, unused_restored_shapes):
+    """Restores the associated tree from 'restored_tensors'.
+
+    Args:
+      restored_tensors: the tensors that were loaded from a checkpoint.
+      unused_restored_shapes: the shapes this object should conform to after
+        restore. Not meaningful for trees.
+
+    Returns:
+      The operation that restores the state of the tree variable.
+    """
+    with ops.control_dependencies([self._create_op]):
+      return gen_model_ops.tree_deserialize(
+          self._tree_handle,
+          restored_tensors[0],
+          params=self.params.serialized_params_proto)
+
+
+def tree_variable(params, tree_config, stats_handle, name, container=None):
+  r"""Creates a tree model and returns a handle to it.
+
+  Args:
+    params: A TensorForestParams object.
+    tree_config: A `Tensor` of type `string`. Serialized proto of the tree.
+    stats_handle: Resource handle to the stats object.
+    name: A name for the variable.
+    container: An optional `string`. Defaults to `""`.
+
+  Returns:
+    A `Tensor` of type mutable `string`. The handle to the tree.
+  """
+  with ops.name_scope(name, "TreeVariable") as name:
+    resource_handle = gen_model_ops.decision_tree_resource_handle_op(
+        container, name, name=name)
+
+    create_op = gen_model_ops.create_tree_variable(
+        resource_handle,
+        tree_config,
+        params=params.serialized_params_proto)
+    is_initialized_op = gen_model_ops.tree_is_initialized_op(resource_handle)
+    # Adds the variable to the savable list.
+    saveable = TreeVariableSavable(params, resource_handle, stats_handle,
+                                   create_op,
+                                   "tree_checkpoint_{0}".format(name))
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+    resources.register_resource(resource_handle, create_op, is_initialized_op)
+    return resource_handle
diff --git a/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py b/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..be9f2e12b77bf9483ec62ae30a0573296387a31f
--- /dev/null
+++ b/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py
@@ -0,0 +1,114 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Stats ops python wrappers."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tensor_forest.python.ops import gen_stats_ops
+# pylint: disable=unused-import
+from tensorflow.contrib.tensor_forest.python.ops.gen_stats_ops import finalize_tree
+from tensorflow.contrib.tensor_forest.python.ops.gen_stats_ops import grow_tree_v4
+from tensorflow.contrib.tensor_forest.python.ops.gen_stats_ops import process_input_v4
+# pylint: enable=unused-import
+
+from tensorflow.contrib.util import loader
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.training import saver
+
+
+_stats_ops = loader.load_op_library(
+    resource_loader.get_path_to_datafile("_stats_ops.so"))
+
+
+ops.NotDifferentiable("FertileStatsVariable")
+ops.NotDifferentiable("FertileStatsSerialize")
+ops.NotDifferentiable("FertileStatsDeserialize")
+ops.NotDifferentiable("GrowTreeV4")
+ops.NotDifferentiable("ProcessInputV4")
+ops.NotDifferentiable("FinalizeTree")
+
+
+class FertileStatsVariableSavable(saver.BaseSaverBuilder.SaveableObject):
+  """SaveableObject implementation for FertileStatsVariable."""
+
+  def __init__(self, params, stats_handle, create_op, name):
+    """Creates a FertileStatsVariableSavable object.
+
+    Args:
+      params: A TensorForestParams object.
+      stats_handle: handle to the tree variable.
+      create_op: the op to initialize the variable.
+      name: the name to save the tree variable under.
+    """
+    self.params = params
+    tensor = gen_stats_ops.fertile_stats_serialize(
+        stats_handle, params=params.serialized_params_proto)
+    # slice_spec is useful for saving a slice from a variable.
+    # It's not meaningful the tree variable. So we just pass an empty value.
+    slice_spec = ""
+    specs = [saver.BaseSaverBuilder.SaveSpec(tensor, slice_spec, name),]
+    super(FertileStatsVariableSavable,
+          self).__init__(stats_handle, specs, name)
+    self._stats_handle = stats_handle
+    self._create_op = create_op
+
+  def restore(self, restored_tensors, unused_restored_shapes):
+    """Restores the associated tree from 'restored_tensors'.
+
+    Args:
+      restored_tensors: the tensors that were loaded from a checkpoint.
+      unused_restored_shapes: the shapes this object should conform to after
+        restore. Not meaningful for trees.
+
+    Returns:
+      The operation that restores the state of the tree variable.
+    """
+    with ops.control_dependencies([self._create_op]):
+      return gen_stats_ops.fertile_stats_deserialize(
+          self._stats_handle, restored_tensors[0],
+          params=self.params.serialized_params_proto)
+
+
+def fertile_stats_variable(params, stats_config, name,
+                           container=None):
+  r"""Creates a stats object and returns a handle to it.
+
+  Args:
+    params: A TensorForestParams object.
+    stats_config: A `Tensor` of type `string`. Serialized proto of the stats.
+    name: A name for the variable.
+    container: An optional `string`. Defaults to `""`.
+
+  Returns:
+    A `Tensor` of type mutable `string`. The handle to the stats.
+  """
+  with ops.name_scope(name, "FertileStatsVariable") as name:
+    resource_handle = gen_stats_ops.fertile_stats_resource_handle_op(
+        container, name, name=name)
+
+    create_op = gen_stats_ops.create_fertile_stats_variable(
+        resource_handle, stats_config,
+        params=params.serialized_params_proto)
+    is_initialized_op = gen_stats_ops.fertile_stats_is_initialized_op(
+        resource_handle)
+    # Adds the variable to the savable list.
+    saveable = FertileStatsVariableSavable(params, resource_handle, create_op,
+                                           "stats_checkpoint_{0}".format(name))
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+    resources.register_resource(resource_handle, create_op, is_initialized_op)
+    return resource_handle
diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
index 177783c207ef44b94a80ece5265a4b161de63d71..b1a83570482362b1c2e293adc4740080746cb732 100644
--- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py
+++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
@@ -18,29 +18,112 @@ from __future__ import division
 from __future__ import print_function
 
 import math
+import numbers
 import random
-import sys
 
+from google.protobuf import text_format
+
+from tensorflow.contrib.decision_trees.proto import generic_tree_model_pb2 as _tree_proto
 from tensorflow.contrib.framework.python.ops import variables as framework_variables
-from tensorflow.contrib.losses.python.losses import loss_ops
-from tensorflow.contrib.tensor_forest.python import constants
+from tensorflow.contrib.tensor_forest.proto import tensor_forest_params_pb2 as _params_proto
 from tensorflow.contrib.tensor_forest.python.ops import data_ops
-from tensorflow.contrib.tensor_forest.python.ops import tensor_forest_ops
+from tensorflow.contrib.tensor_forest.python.ops import model_ops
+from tensorflow.contrib.tensor_forest.python.ops import stats_ops
 
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
 
 
+# Stores tuples of (leaf model type, stats model type)
+CLASSIFICATION_LEAF_MODEL_TYPES = {
+    'all_dense': (_params_proto.MODEL_DENSE_CLASSIFICATION,
+                  _params_proto.STATS_DENSE_GINI),
+    'all_sparse': (_params_proto.MODEL_SPARSE_CLASSIFICATION,
+                   _params_proto.STATS_SPARSE_GINI),
+    'sparse_then_dense':
+        (_params_proto.MODEL_SPARSE_OR_DENSE_CLASSIFICATION,
+         _params_proto.STATS_SPARSE_THEN_DENSE_GINI),
+}
+REGRESSION_MODEL_TYPE = (
+    _params_proto.MODEL_REGRESSION,
+    _params_proto.STATS_LEAST_SQUARES_REGRESSION,
+    _params_proto.COLLECTION_BASIC)
+
+FINISH_TYPES = {
+    'basic': _params_proto.SPLIT_FINISH_BASIC,
+    'hoeffding': _params_proto.SPLIT_FINISH_DOMINATE_HOEFFDING,
+    'bootstrap': _params_proto.SPLIT_FINISH_DOMINATE_BOOTSTRAP
+}
+PRUNING_TYPES = {
+    'none': _params_proto.SPLIT_PRUNE_NONE,
+    'half': _params_proto.SPLIT_PRUNE_HALF,
+    'quarter': _params_proto.SPLIT_PRUNE_QUARTER,
+    '10_percent': _params_proto.SPLIT_PRUNE_10_PERCENT,
+    'hoeffding': _params_proto.SPLIT_PRUNE_HOEFFDING,
+}
+SPLIT_TYPES = {
+    'less_or_equal': _tree_proto.InequalityTest.LESS_OR_EQUAL,
+    'less': _tree_proto.InequalityTest.LESS_THAN
+}
+
+
+def parse_number_or_string_to_proto(proto, param):
+  if isinstance(param, numbers.Number):
+    proto.constant_value = param
+  else:  # assume it's a string
+    if param.isdigit():
+      proto.constant_value = int(param)
+    else:
+      text_format.Merge(param, proto)
+
+
+def build_params_proto(params):
+  """Build a TensorForestParams proto out of the V4ForestHParams object."""
+  proto = _params_proto.TensorForestParams()
+  proto.num_trees = params.num_trees
+  proto.max_nodes = params.max_nodes
+  proto.is_regression = params.regression
+  proto.num_outputs = params.num_classes
+  proto.num_features = params.num_features
+
+  proto.leaf_type = params.leaf_model_type
+  proto.stats_type = params.stats_model_type
+  proto.collection_type = _params_proto.COLLECTION_BASIC
+  proto.pruning_type.type = params.pruning_type
+  proto.finish_type.type = params.finish_type
+
+  proto.inequality_test_type = params.split_type
+
+  proto.drop_final_class = False
+  proto.collate_examples = params.collate_examples
+  proto.checkpoint_stats = params.checkpoint_stats
+  proto.use_running_stats_method = params.use_running_stats_method
+  proto.initialize_average_splits = params.initialize_average_splits
+
+  parse_number_or_string_to_proto(proto.pruning_type.prune_every_samples,
+                                  params.prune_every_samples)
+  parse_number_or_string_to_proto(proto.finish_type.check_every_steps,
+                                  params.early_finish_check_every_samples)
+  parse_number_or_string_to_proto(proto.split_after_samples,
+                                  params.split_after_samples)
+  parse_number_or_string_to_proto(proto.num_splits_to_consider,
+                                  params.num_splits_to_consider)
+
+  proto.dominate_fraction.constant_value = params.dominate_fraction
+
+  if params.param_file:
+    with open(params.param_file) as f:
+      text_format.Merge(f.read(), proto)
+
+  return proto
+
+
 # A convenience class for holding random forest hyperparameters.
 #
 # To just get some good default parameters, use:
@@ -62,12 +145,22 @@ class ForestHParams(object):
                bagging_fraction=1.0,
                num_splits_to_consider=0,
                feature_bagging_fraction=1.0,
-               max_fertile_nodes=0,
+               max_fertile_nodes=0,  # deprecated, unused.
                split_after_samples=250,
-               min_split_samples=5,
                valid_leaf_threshold=1,
                dominate_method='bootstrap',
                dominate_fraction=0.99,
+               model_name='all_dense',
+               split_finish_name='basic',
+               split_pruning_name='none',
+               prune_every_samples=0,
+               early_finish_check_every_samples=0,
+               collate_examples=False,
+               checkpoint_stats=False,
+               use_running_stats_method=False,
+               initialize_average_splits=False,
+               param_file=None,
+               split_name='less_or_equal',
                **kwargs):
     self.num_trees = num_trees
     self.max_nodes = max_nodes
@@ -76,10 +169,20 @@ class ForestHParams(object):
     self.num_splits_to_consider = num_splits_to_consider
     self.max_fertile_nodes = max_fertile_nodes
     self.split_after_samples = split_after_samples
-    self.min_split_samples = min_split_samples
     self.valid_leaf_threshold = valid_leaf_threshold
     self.dominate_method = dominate_method
     self.dominate_fraction = dominate_fraction
+    self.model_name = model_name
+    self.split_finish_name = split_finish_name
+    self.split_pruning_name = split_pruning_name
+    self.collate_examples = collate_examples
+    self.checkpoint_stats = checkpoint_stats
+    self.use_running_stats_method = use_running_stats_method
+    self.initialize_average_splits = initialize_average_splits
+    self.param_file = param_file
+    self.split_name = split_name
+    self.early_finish_check_every_samples = early_finish_check_every_samples
+    self.prune_every_samples = prune_every_samples
 
     for name, value in kwargs.items():
       setattr(self, name, value)
@@ -115,24 +218,64 @@ class ForestHParams(object):
     # Our experiments have found that num_splits_to_consider = num_features
     # gives good accuracy.
     self.num_splits_to_consider = self.num_splits_to_consider or min(
-        self.num_features, 1000)
-
-    self.max_fertile_nodes = (self.max_fertile_nodes or
-                              int(math.ceil(self.max_nodes / 2.0)))
-
-    # We have num_splits_to_consider slots to fill, and we want to spend
-    # approximately split_after_samples samples initializing them.
-    num_split_initializiations_per_input = max(1, int(math.floor(
-        self.num_splits_to_consider / self.split_after_samples)))
-    self.split_initializations_per_input = getattr(
-        self, 'split_initializations_per_input',
-        num_split_initializiations_per_input)
+        max(10, math.floor(math.sqrt(self.num_features))), 1000)
 
     # If base_random_seed is 0, the current time will be used to seed the
     # random number generators for each tree.  If non-zero, the i-th tree
     # will be seeded with base_random_seed + i.
     self.base_random_seed = getattr(self, 'base_random_seed', 0)
 
+    # How to store leaf models.
+    self.leaf_model_type = (
+        REGRESSION_MODEL_TYPE[0] if self.regression else
+        CLASSIFICATION_LEAF_MODEL_TYPES[self.model_name][0])
+
+    # How to store stats objects.
+    self.stats_model_type = (
+        REGRESSION_MODEL_TYPE[1] if self.regression else
+        CLASSIFICATION_LEAF_MODEL_TYPES[self.model_name][1])
+
+    self.finish_type = (
+        _params_proto.SPLIT_FINISH_BASIC if self.regression else
+        FINISH_TYPES[self.split_finish_name])
+
+    self.pruning_type = PRUNING_TYPES[self.split_pruning_name]
+
+    if self.pruning_type == _params_proto.SPLIT_PRUNE_NONE:
+      self.prune_every_samples = 0
+    else:
+      if (not self.prune_every_samples and
+          not (isinstance(numbers.Number) or
+               self.split_after_samples.isdigit())):
+        logging.error(
+            'Must specify prune_every_samples if using a depth-dependent '
+            'split_after_samples')
+      # Pruning half-way through split_after_samples seems like a decent
+      # default, making it easy to select the number being pruned with
+      # pruning_type while not paying the cost of pruning too often.  Note that
+      # this only holds if not using a depth-dependent split_after_samples.
+      self.prune_every_samples = (self.prune_every_samples or
+                                  int(self.split_after_samples) / 2)
+
+    if self.finish_type == _params_proto.SPLIT_FINISH_BASIC:
+      self.early_finish_check_every_samples = 0
+    else:
+      if (not self.early_finish_check_every_samples and
+          not (isinstance(numbers.Number) or
+               self.split_after_samples.isdigit())):
+        logging.error(
+            'Must specify prune_every_samples if using a depth-dependent '
+            'split_after_samples')
+      # Checking for early finish every quarter through split_after_samples
+      # seems like a decent default. We don't want to incur the checking cost
+      # too often, but (at least for hoeffding) it's lower than the cost of
+      # pruning so we can do it a little more frequently.
+      self.early_finish_check_every_samples = (
+          self.early_finish_check_every_samples or
+          int(self.split_after_samples) / 4)
+
+    self.split_type = SPLIT_TYPES[self.split_name]
+
     return self
 
 
@@ -157,116 +300,25 @@ class TreeTrainingVariables(object):
   """
 
   def __init__(self, params, tree_num, training):
-    self.tree = variable_scope.get_variable(
-        name=self.get_tree_name('tree', tree_num), dtype=dtypes.int32,
-        shape=[params.max_nodes, 2],
-        initializer=init_ops.constant_initializer(-2))
-    self.tree_thresholds = variable_scope.get_variable(
-        name=self.get_tree_name('tree_thresholds', tree_num),
-        shape=[params.max_nodes],
-        initializer=init_ops.constant_initializer(-1.0))
-    self.end_of_tree = variable_scope.get_variable(
-        name=self.get_tree_name('end_of_tree', tree_num),
-        dtype=dtypes.int32,
-        initializer=constant_op.constant([1]))
-    self.start_epoch = variable_scope.get_variable(
-        name=self.get_tree_name('start_epoch', tree_num),
-        dtype=dtypes.int32, shape=[params.max_nodes],
-        initializer=init_ops.constant_initializer(0))
-
-    if training:
-      self.node_to_accumulator_map = variable_scope.get_variable(
-          name=self.get_tree_name('node_to_accumulator_map', tree_num),
-          shape=[params.max_nodes],
-          dtype=dtypes.int32,
-          initializer=init_ops.constant_initializer(-1))
-      self.accumulator_to_node_map = variable_scope.get_variable(
-          name=self.get_tree_name('accumulator_to_node_map', tree_num),
-          shape=[params.max_fertile_nodes],
-          dtype=dtypes.int32,
-          initializer=init_ops.constant_initializer(-1))
-
-      self.candidate_split_features = variable_scope.get_variable(
-          name=self.get_tree_name('candidate_split_features', tree_num),
-          shape=[params.max_fertile_nodes, params.num_splits_to_consider],
-          dtype=dtypes.int32,
-          initializer=init_ops.constant_initializer(-1))
-      self.candidate_split_thresholds = variable_scope.get_variable(
-          name=self.get_tree_name('candidate_split_thresholds', tree_num),
-          shape=[params.max_fertile_nodes, params.num_splits_to_consider],
-          initializer=init_ops.constant_initializer(0.0))
-
-    # Statistics shared by classification and regression.
-    self.node_sums = variable_scope.get_variable(
-        name=self.get_tree_name('node_sums', tree_num),
-        shape=[params.max_nodes, params.num_output_columns],
-        initializer=init_ops.constant_initializer(0.0))
+    if (not hasattr(params, 'params_proto') or
+        not isinstance(params.params_proto,
+                       _params_proto.TensorForestParams)):
+      params.params_proto = build_params_proto(params)
 
+    params.serialized_params_proto = params.params_proto.SerializeToString()
+    self.stats = None
     if training:
-      self.candidate_split_sums = variable_scope.get_variable(
-          name=self.get_tree_name('candidate_split_sums', tree_num),
-          shape=[params.max_fertile_nodes, params.num_splits_to_consider,
-                 params.num_output_columns],
-          initializer=init_ops.constant_initializer(0.0))
-      self.accumulator_sums = variable_scope.get_variable(
-          name=self.get_tree_name('accumulator_sums', tree_num),
-          shape=[params.max_fertile_nodes, params.num_output_columns],
-          initializer=init_ops.constant_initializer(-1.0))
-
-      # Regression also tracks second order stats.
-      if params.regression:
-        self.node_squares = variable_scope.get_variable(
-            name=self.get_tree_name('node_squares', tree_num),
-            shape=[params.max_nodes, params.num_output_columns],
-            initializer=init_ops.constant_initializer(0.0))
-
-        self.candidate_split_squares = variable_scope.get_variable(
-            name=self.get_tree_name('candidate_split_squares', tree_num),
-            shape=[params.max_fertile_nodes, params.num_splits_to_consider,
-                   params.num_output_columns],
-            initializer=init_ops.constant_initializer(0.0))
-
-        self.accumulator_squares = variable_scope.get_variable(
-            name=self.get_tree_name('accumulator_squares', tree_num),
-            shape=[params.max_fertile_nodes, params.num_output_columns],
-            initializer=init_ops.constant_initializer(-1.0))
-
-      else:
-        self.node_squares = constant_op.constant(
-            0.0, name=self.get_tree_name('node_squares', tree_num))
-
-        self.candidate_split_squares = constant_op.constant(
-            0.0, name=self.get_tree_name('candidate_split_squares', tree_num))
-
-        self.accumulator_squares = constant_op.constant(
-            0.0, name=self.get_tree_name('accumulator_squares', tree_num))
+      # TODO(gilberth): Manually shard this to be able to fit it on
+      # multiple machines.
+      self.stats = stats_ops.fertile_stats_variable(
+          params, '', self.get_tree_name('stats', tree_num))
+    self.tree = model_ops.tree_variable(
+        params, '', self.stats, self.get_tree_name('tree', tree_num))
 
   def get_tree_name(self, name, num):
     return '{0}-{1}'.format(name, num)
 
 
-class ForestStats(object):
-
-  def __init__(self, tree_stats, params):
-    """A simple container for stats about a forest."""
-    self.tree_stats = tree_stats
-    self.params = params
-
-  def get_average(self, thing):
-    val = 0.0
-    for i in range(self.params.num_trees):
-      val += getattr(self.tree_stats[i], thing)
-
-    return val / self.params.num_trees
-
-
-class TreeStats(object):
-
-  def __init__(self, num_nodes, num_leaves):
-    self.num_nodes = num_nodes
-    self.num_leaves = num_leaves
-
-
 class ForestTrainingVariables(object):
   """A container for a forests training data, consisting of multiple trees.
 
@@ -330,6 +382,10 @@ class RandomForestGraphs(object):
     return array_ops.concat(
         [split_data[ind] for ind in self.params.bagged_features[tree_num]], 1)
 
+  def get_all_resource_handles(self):
+    return ([self.variables[i].tree for i in range(len(self.trees))] +
+            [self.variables[i].stats for i in range(len(self.trees))])
+
   def training_graph(self,
                      input_data,
                      input_labels,
@@ -365,7 +421,6 @@ class RandomForestGraphs(object):
     tree_start = int(trainer_id * trees_per_trainer)
     tree_end = int((trainer_id + 1) * trees_per_trainer)
     for i in range(tree_start, tree_end):
-      logging.info('training graph for tree: %d' % i)
       with ops.device(self.variables.device_dummies[i].device):
         seed = self.params.base_random_seed
         if seed != 0:
@@ -397,16 +452,13 @@ class RandomForestGraphs(object):
                 'Feature bagging not supported with sparse features.')
           tree_data = self._bag_features(i, tree_data)
 
-        initialization = self.trees[i].tree_initialization()
-
-        with ops.control_dependencies([initialization]):
-          tree_graphs.append(self.trees[i].training_graph(
-              tree_data,
-              tree_labels,
-              seed,
-              data_spec=data_spec,
-              sparse_features=processed_sparse_features,
-              **tree_kwargs))
+        tree_graphs.append(self.trees[i].training_graph(
+            tree_data,
+            tree_labels,
+            seed,
+            data_spec=data_spec,
+            sparse_features=processed_sparse_features,
+            **tree_kwargs))
 
     return control_flow_ops.group(*tree_graphs, name='train')
 
@@ -435,7 +487,7 @@ class RandomForestGraphs(object):
           if processed_sparse_features is not None:
             raise NotImplementedError(
                 'Feature bagging not supported with sparse features.')
-          tree_data = self._bag_features(i, input_data)
+          tree_data = self._bag_features(i, tree_data)
         probabilities.append(self.trees[i].inference_graph(
             tree_data,
             data_spec,
@@ -479,13 +531,6 @@ class RandomForestGraphs(object):
         impurities.append(self.trees[i].average_impurity())
     return math_ops.reduce_mean(array_ops.stack(impurities))
 
-  def get_stats(self, session):
-    tree_stats = []
-    for i in range(self.params.num_trees):
-      with ops.device(self.variables.device_dummies[i].device):
-        tree_stats.append(self.trees[i].get_stats(session))
-    return ForestStats(tree_stats, self.params)
-
   def feature_importances(self):
     tree_counts = [self.trees[i].feature_usage_counts()
                    for i in range(self.params.num_trees)]
@@ -493,64 +538,6 @@ class RandomForestGraphs(object):
     return total_counts / math_ops.reduce_sum(total_counts)
 
 
-def one_hot_wrapper(num_classes, loss_fn):
-  """Some loss functions take one-hot labels."""
-  def _loss(probs, targets):
-    if targets.get_shape().ndims > 1:
-      targets = array_ops.squeeze(targets, squeeze_dims=[1])
-    one_hot_labels = array_ops.one_hot(
-        math_ops.to_int32(targets),
-        num_classes,
-        on_value=1.,
-        off_value=0.,
-        dtype=dtypes.float32)
-    return loss_fn(probs, one_hot_labels)
-  return _loss
-
-
-class TrainingLossForest(RandomForestGraphs):
-  """Random Forest that uses training loss as the termination criteria."""
-
-  def __init__(self, params, loss_fn=None, **kwargs):
-    """Initialize.
-
-    Args:
-      params: Like RandomForestGraphs, a ForestHParams object.
-      loss_fn: A function that takes probabilities and targets and returns
-        a loss for each example.
-      **kwargs: Keyword args to pass to superclass (RandomForestGraphs).
-    """
-    self.loss_fn = loss_fn or one_hot_wrapper(params.num_classes,
-                                              loss_ops.log_loss)
-    self._loss = None
-    super(TrainingLossForest, self).__init__(params, **kwargs)
-
-  def _get_loss(self, features, labels):
-    """Constructs, caches, and returns the inference-based loss."""
-    if self._loss is not None:
-      return self._loss
-
-    def _average_loss():
-      probs = self.inference_graph(features)
-      return math_ops.reduce_sum(self.loss_fn(
-          probs, labels)) / math_ops.to_float(array_ops.shape(labels)[0])
-
-    self._loss = control_flow_ops.cond(
-        self.average_size() > 0, _average_loss,
-        lambda: constant_op.constant(sys.maxsize, dtype=dtypes.float32))
-
-    return self._loss
-
-  def training_graph(self, input_data, input_labels, **kwargs):
-    loss = self._get_loss(input_data, input_labels)
-    with ops.control_dependencies([loss.op]):
-      return super(TrainingLossForest, self).training_graph(
-          input_data, input_labels, **kwargs)
-
-  def training_loss(self, features, labels, name='training_loss'):
-    return array_ops.identity(self._get_loss(features, labels), name=name)
-
-
 class RandomTreeGraphs(object):
   """Builds TF graphs for random tree training and inference."""
 
@@ -559,77 +546,6 @@ class RandomTreeGraphs(object):
     self.params = params
     self.tree_num = tree_num
 
-  def tree_initialization(self):
-    def _init_tree():
-      return state_ops.scatter_update(self.variables.tree, [0], [[-1, -1]]).op
-
-    def _nothing():
-      return control_flow_ops.no_op()
-
-    return control_flow_ops.cond(
-        math_ops.equal(
-            array_ops.squeeze(
-                array_ops.strided_slice(self.variables.tree, [0, 0], [1, 1])),
-            -2), _init_tree, _nothing)
-
-  def _gini(self, class_counts):
-    """Calculate the Gini impurity.
-
-    If c(i) denotes the i-th class count and c = sum_i c(i) then
-      score = 1 - sum_i ( c(i) / c )^2
-
-    Args:
-      class_counts: A 2-D tensor of per-class counts, usually a slice or
-        gather from variables.node_sums.
-
-    Returns:
-      A 1-D tensor of the Gini impurities for each row in the input.
-    """
-    smoothed = 1.0 + array_ops.slice(class_counts, [0, 1], [-1, -1])
-    sums = math_ops.reduce_sum(smoothed, 1)
-    sum_squares = math_ops.reduce_sum(math_ops.square(smoothed), 1)
-
-    return 1.0 - sum_squares / (sums * sums)
-
-  def _weighted_gini(self, class_counts):
-    """Our split score is the Gini impurity times the number of examples.
-
-    If c(i) denotes the i-th class count and c = sum_i c(i) then
-      score = c * (1 - sum_i ( c(i) / c )^2 )
-            = c - sum_i c(i)^2 / c
-    Args:
-      class_counts: A 2-D tensor of per-class counts, usually a slice or
-        gather from variables.node_sums.
-
-    Returns:
-      A 1-D tensor of the Gini impurities for each row in the input.
-    """
-    smoothed = 1.0 + array_ops.slice(class_counts, [0, 1], [-1, -1])
-    sums = math_ops.reduce_sum(smoothed, 1)
-    sum_squares = math_ops.reduce_sum(math_ops.square(smoothed), 1)
-
-    return sums - sum_squares / sums
-
-  def _variance(self, sums, squares):
-    """Calculate the variance for each row of the input tensors.
-
-    Variance is V = E[x^2] - (E[x])^2.
-
-    Args:
-      sums: A tensor containing output sums, usually a slice from
-        variables.node_sums.  Should contain the number of examples seen
-        in index 0 so we can calculate expected value.
-      squares: Same as sums, but sums of squares.
-
-    Returns:
-      A 1-D tensor of the variances for each row in the input.
-    """
-    total_count = array_ops.slice(sums, [0, 0], [-1, 1])
-    e_x = sums / total_count
-    e_x2 = squares / total_count
-
-    return math_ops.reduce_sum(e_x2 - math_ops.square(e_x), 1)
-
   def training_graph(self,
                      input_data,
                      input_labels,
@@ -655,16 +571,12 @@ class RandomTreeGraphs(object):
     Returns:
       The last op in the random tree training graph.
     """
-    epoch = math_ops.to_int32(get_epoch_variable())
-
-    serialized_input_spec = data_spec.SerializeToString()
+    # TODO(gilberth): Use this.
+    unused_epoch = math_ops.to_int32(get_epoch_variable())
 
     if input_weights is None:
       input_weights = []
 
-    if input_data is None:
-      input_data = []
-
     sparse_indices = []
     sparse_values = []
     sparse_shape = []
@@ -673,245 +585,45 @@ class RandomTreeGraphs(object):
       sparse_values = sparse_features.values
       sparse_shape = sparse_features.dense_shape
 
-    # Count extremely random stats.
-    (node_sums, node_squares, splits_indices, splits_sums, splits_squares,
-     totals_indices, totals_sums, totals_squares,
-     input_leaves) = (tensor_forest_ops.count_extremely_random_stats(
-         input_data,
-         sparse_indices,
-         sparse_values,
-         sparse_shape,
-         input_labels,
-         input_weights,
-         self.variables.tree,
-         self.variables.tree_thresholds,
-         self.variables.node_to_accumulator_map,
-         self.variables.candidate_split_features,
-         self.variables.candidate_split_thresholds,
-         self.variables.start_epoch,
-         epoch,
-         input_spec=serialized_input_spec,
-         num_classes=self.params.num_output_columns,
-         regression=self.params.regression))
-    node_update_ops = []
-    node_update_ops.append(
-        state_ops.assign_add(self.variables.node_sums, node_sums))
-
-    splits_update_ops = []
-    splits_update_ops.append(
-        tensor_forest_ops.scatter_add_ndim(self.variables.candidate_split_sums,
-                                           splits_indices, splits_sums))
-    splits_update_ops.append(
-        tensor_forest_ops.scatter_add_ndim(self.variables.accumulator_sums,
-                                           totals_indices, totals_sums))
-
-    if self.params.regression:
-      node_update_ops.append(state_ops.assign_add(self.variables.node_squares,
-                                                  node_squares))
-      splits_update_ops.append(
-          tensor_forest_ops.scatter_add_ndim(
-              self.variables.candidate_split_squares, splits_indices,
-              splits_squares))
-      splits_update_ops.append(
-          tensor_forest_ops.scatter_add_ndim(self.variables.accumulator_squares,
-                                             totals_indices, totals_squares))
-
-    # Sample inputs.
-    update_indices, feature_updates, threshold_updates = (
-        tensor_forest_ops.sample_inputs(
-            input_data,
-            sparse_indices,
-            sparse_values,
-            sparse_shape,
-            input_weights,
-            self.variables.node_to_accumulator_map,
-            input_leaves,
-            self.variables.candidate_split_features,
-            self.variables.candidate_split_thresholds,
-            input_spec=serialized_input_spec,
-            split_initializations_per_input=(
-                self.params.split_initializations_per_input),
-            split_sampling_random_seed=random_seed))
-    update_features_op = state_ops.scatter_update(
-        self.variables.candidate_split_features, update_indices,
-        feature_updates)
-    update_thresholds_op = state_ops.scatter_update(
-        self.variables.candidate_split_thresholds, update_indices,
-        threshold_updates)
-
-    # Calculate finished nodes.
-    with ops.control_dependencies(splits_update_ops):
-      # Passing input_leaves to finished nodes here means that nodes that
-      # have become stale won't be deallocated until an input reaches them,
-      # because we're trying to avoid considering every fertile node for
-      # performance reasons.
-      finished, stale = tensor_forest_ops.finished_nodes(
-          input_leaves,
-          self.variables.node_to_accumulator_map,
-          self.variables.candidate_split_sums,
-          self.variables.candidate_split_squares,
-          self.variables.accumulator_sums,
-          self.variables.accumulator_squares,
-          self.variables.start_epoch,
-          epoch,
-          num_split_after_samples=self.params.split_after_samples,
-          min_split_samples=self.params.min_split_samples,
-          dominate_method=self.params.dominate_method,
-          dominate_fraction=self.params.dominate_fraction)
-
-    # Update leaf scores.
-    # TODO(thomaswc): Store the leaf scores in a TopN and only update the
-    # scores of the leaves that were touched by this batch of input.
-    children = array_ops.squeeze(
-        array_ops.slice(self.variables.tree, [0, 0], [-1, 1]), squeeze_dims=[1])
-    is_leaf = math_ops.equal(constants.LEAF_NODE, children)
-    leaves = math_ops.to_int32(
-        array_ops.squeeze(
-            array_ops.where(is_leaf), squeeze_dims=[1]))
-    non_fertile_leaves = array_ops.boolean_mask(
-        leaves, math_ops.less(array_ops.gather(
-            self.variables.node_to_accumulator_map, leaves), 0))
-
-    # TODO(gilberth): It should be possible to limit the number of non
-    # fertile leaves we calculate scores for, especially since we can only take
-    # at most array_ops.shape(finished)[0] of them.
-    with ops.control_dependencies(node_update_ops):
-      sums = array_ops.gather(self.variables.node_sums, non_fertile_leaves)
-      if self.params.regression:
-        squares = array_ops.gather(self.variables.node_squares,
-                                   non_fertile_leaves)
-        non_fertile_leaf_scores = self._variance(sums, squares)
-      else:
-        non_fertile_leaf_scores = self._weighted_gini(sums)
-
-    # Calculate best splits.
-    with ops.control_dependencies(splits_update_ops):
-      split_indices = tensor_forest_ops.best_splits(
-          finished,
-          self.variables.node_to_accumulator_map,
-          self.variables.candidate_split_sums,
-          self.variables.candidate_split_squares,
-          self.variables.accumulator_sums,
-          self.variables.accumulator_squares,
-          regression=self.params.regression)
-
-    # Grow tree.
-    with ops.control_dependencies([update_features_op, update_thresholds_op,
-                                   non_fertile_leaves.op]):
-      (tree_update_indices, tree_children_updates, tree_threshold_updates,
-       new_eot) = (tensor_forest_ops.grow_tree(
-           self.variables.end_of_tree, self.variables.node_to_accumulator_map,
-           finished, split_indices, self.variables.candidate_split_features,
-           self.variables.candidate_split_thresholds))
-      tree_update_op = state_ops.scatter_update(
-          self.variables.tree, tree_update_indices, tree_children_updates)
-      thresholds_update_op = state_ops.scatter_update(
-          self.variables.tree_thresholds, tree_update_indices,
-          tree_threshold_updates)
-      # TODO(thomaswc): Only update the epoch on the new leaves.
-      new_epoch_updates = epoch * array_ops.ones_like(tree_threshold_updates,
-                                                      dtype=dtypes.int32)
-      epoch_update_op = state_ops.scatter_update(
-          self.variables.start_epoch, tree_update_indices,
-          new_epoch_updates)
-
-    # Update fertile slots.
-    with ops.control_dependencies([tree_update_op]):
-      (n2a_map_updates, a2n_map_updates, accumulators_cleared,
-       accumulators_allocated) = (tensor_forest_ops.update_fertile_slots(
-           finished,
-           non_fertile_leaves,
-           non_fertile_leaf_scores,
-           self.variables.end_of_tree,
-           self.variables.accumulator_sums,
-           self.variables.node_to_accumulator_map,
-           stale,
-           self.variables.node_sums,
-           regression=self.params.regression))
-
-    # Ensure end_of_tree doesn't get updated until UpdateFertileSlots has
-    # used it to calculate new leaves.
-    with ops.control_dependencies([n2a_map_updates.op]):
-      eot_update_op = state_ops.assign(self.variables.end_of_tree, new_eot)
-
-    updates = []
-    updates.append(eot_update_op)
-    updates.append(tree_update_op)
-    updates.append(thresholds_update_op)
-    updates.append(epoch_update_op)
-
-    updates.append(
-        state_ops.scatter_update(self.variables.node_to_accumulator_map,
-                                 n2a_map_updates[0], n2a_map_updates[1]))
-
-    updates.append(
-        state_ops.scatter_update(self.variables.accumulator_to_node_map,
-                                 a2n_map_updates[0], a2n_map_updates[1]))
-
-    cleared_and_allocated_accumulators = array_ops.concat(
-        [accumulators_cleared, accumulators_allocated], 0)
-
-    # Calculate values to put into scatter update for candidate counts.
-    # Candidate split counts are always reset back to 0 for both cleared
-    # and allocated accumulators. This means some accumulators might be doubly
-    # reset to 0 if the were released and not allocated, then later allocated.
-    split_values = array_ops.tile(
-        array_ops.expand_dims(array_ops.expand_dims(
-            array_ops.zeros_like(cleared_and_allocated_accumulators,
-                                 dtype=dtypes.float32), 1), 2),
-        [1, self.params.num_splits_to_consider, self.params.num_output_columns])
-    updates.append(state_ops.scatter_update(
-        self.variables.candidate_split_sums,
-        cleared_and_allocated_accumulators, split_values))
-    if self.params.regression:
-      updates.append(state_ops.scatter_update(
-          self.variables.candidate_split_squares,
-          cleared_and_allocated_accumulators, split_values))
-
-    # Calculate values to put into scatter update for total counts.
-    total_cleared = array_ops.tile(
-        array_ops.expand_dims(
-            math_ops.negative(array_ops.ones_like(accumulators_cleared,
-                                                  dtype=dtypes.float32)), 1),
-        [1, self.params.num_output_columns])
-    total_reset = array_ops.tile(
-        array_ops.expand_dims(
-            array_ops.zeros_like(accumulators_allocated,
-                                 dtype=dtypes.float32), 1),
-        [1, self.params.num_output_columns])
-    accumulator_updates = array_ops.concat([total_cleared, total_reset], 0)
-    updates.append(state_ops.scatter_update(
-        self.variables.accumulator_sums,
-        cleared_and_allocated_accumulators, accumulator_updates))
-    if self.params.regression:
-      updates.append(state_ops.scatter_update(
-          self.variables.accumulator_squares,
-          cleared_and_allocated_accumulators, accumulator_updates))
-
-    # Calculate values to put into scatter update for candidate splits.
-    split_features_updates = array_ops.tile(
-        array_ops.expand_dims(
-            math_ops.negative(array_ops.ones_like(
-                cleared_and_allocated_accumulators)), 1),
-        [1, self.params.num_splits_to_consider])
-    updates.append(state_ops.scatter_update(
-        self.variables.candidate_split_features,
-        cleared_and_allocated_accumulators, split_features_updates))
-
-    updates += self.finish_iteration()
-
-    return control_flow_ops.group(*updates)
-
-  def finish_iteration(self):
-    """Perform any operations that should be done at the end of an iteration.
-
-    This is mostly useful for subclasses that need to reset variables after
-    an iteration, such as ones that are used to finish nodes.
+    if input_data is None:
+      input_data = []
 
-    Returns:
-      A list of operations.
-    """
-    return []
+    leaf_ids = model_ops.traverse_tree_v4(
+        self.variables.tree,
+        input_data,
+        sparse_indices,
+        sparse_values,
+        sparse_shape,
+        input_spec=data_spec.SerializeToString(),
+        params=self.params.serialized_params_proto)
+
+    update_model = model_ops.update_model_v4(
+        self.variables.tree,
+        leaf_ids,
+        input_labels,
+        input_weights,
+        params=self.params.serialized_params_proto)
+
+    finished_nodes = stats_ops.process_input_v4(
+        self.variables.tree,
+        self.variables.stats,
+        input_data,
+        sparse_indices,
+        sparse_values,
+        sparse_shape,
+        input_labels,
+        input_weights,
+        leaf_ids,
+        input_spec=data_spec.SerializeToString(),
+        random_seed=random_seed,
+        params=self.params.serialized_params_proto)
+
+    with ops.control_dependencies([update_model]):
+      return stats_ops.grow_tree_v4(
+          self.variables.tree,
+          self.variables.stats,
+          finished_nodes,
+          params=self.params.serialized_params_proto)
 
   def inference_graph(self, input_data, data_spec, sparse_features=None):
     """Constructs a TF graph for evaluating a random tree.
@@ -925,9 +637,6 @@ class RandomTreeGraphs(object):
     Returns:
       The last op in the random tree inference graph.
     """
-    if input_data is None:
-      input_data = []
-
     sparse_indices = []
     sparse_values = []
     sparse_shape = []
@@ -935,43 +644,17 @@ class RandomTreeGraphs(object):
       sparse_indices = sparse_features.indices
       sparse_values = sparse_features.values
       sparse_shape = sparse_features.dense_shape
+    if input_data is None:
+      input_data = []
 
-    return tensor_forest_ops.tree_predictions(
+    return model_ops.tree_predictions_v4(
+        self.variables.tree,
         input_data,
         sparse_indices,
         sparse_values,
         sparse_shape,
-        self.variables.tree,
-        self.variables.tree_thresholds,
-        self.variables.node_sums,
         input_spec=data_spec.SerializeToString(),
-        valid_leaf_threshold=self.params.valid_leaf_threshold)
-
-  def average_impurity(self):
-    """Constructs a TF graph for evaluating the average leaf impurity of a tree.
-
-    If in regression mode, this is the leaf variance. If in classification mode,
-    this is the gini impurity.
-
-    Returns:
-      The last op in the graph.
-    """
-    children = array_ops.squeeze(array_ops.slice(
-        self.variables.tree, [0, 0], [-1, 1]), squeeze_dims=[1])
-    is_leaf = math_ops.equal(constants.LEAF_NODE, children)
-    leaves = math_ops.to_int32(array_ops.squeeze(array_ops.where(is_leaf),
-                                                 squeeze_dims=[1]))
-    counts = array_ops.gather(self.variables.node_sums, leaves)
-    gini = self._weighted_gini(counts)
-    # Guard against step 1, when there often are no leaves yet.
-    def impurity():
-      return gini
-    # Since average impurity can be used for loss, when there's no data just
-    # return a big number so that loss always decreases.
-    def big():
-      return array_ops.ones_like(gini, dtype=dtypes.float32) * 10000000.
-    return control_flow_ops.cond(math_ops.greater(
-        array_ops.shape(leaves)[0], 0), impurity, big)
+        params=self.params.serialized_params_proto)
 
   def size(self):
     """Constructs a TF graph for evaluating the current number of nodes.
@@ -979,19 +662,8 @@ class RandomTreeGraphs(object):
     Returns:
       The current number of nodes in the tree.
     """
-    return self.variables.end_of_tree - 1
-
-  def get_stats(self, session):
-    num_nodes = self.variables.end_of_tree.eval(session=session) - 1
-    num_leaves = array_ops.where(
-        math_ops.equal(array_ops.squeeze(array_ops.slice(
-            self.variables.tree, [0, 0], [-1, 1])), constants.LEAF_NODE)
-        ).eval(session=session).shape[0]
-    return TreeStats(num_nodes, num_leaves)
+    return model_ops.tree_size(self.variables.tree)
 
   def feature_usage_counts(self):
-    features = array_ops.slice(self.variables.tree, [0, 1], [-1, 1])
-    # One hot ignores negative values, which is the default for unused nodes.
-    one_hots = array_ops.one_hot(
-        array_ops.squeeze(features), self.params.num_features)
-    return math_ops.reduce_sum(one_hots, 0)
+    return model_ops.feature_usage_counts(
+        self.variables.tree, params=self.params.serialized_params_proto)
diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py
index a9a3f66bbfed9403daad44df5795d437226d0354..ddbe30426d46f3fdde9cb73b4caec87f156a4610 100644
--- a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py
+++ b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py
@@ -36,13 +36,9 @@ class TensorForestTest(test_util.TensorFlowTestCase):
         num_features=60).fill()
     self.assertEquals(2, hparams.num_classes)
     self.assertEquals(3, hparams.num_output_columns)
-    self.assertEquals(60, hparams.num_splits_to_consider)
-    # Don't have more fertile nodes than max # leaves, which is 500.
-    self.assertEquals(500, hparams.max_fertile_nodes)
+    self.assertEquals(10, hparams.num_splits_to_consider)
     # Default value of valid_leaf_threshold
     self.assertEquals(1, hparams.valid_leaf_threshold)
-    # floor(60 / 25) = 2
-    self.assertEquals(2, hparams.split_initializations_per_input)
     self.assertEquals(0, hparams.base_random_seed)
 
   def testForestHParamsBigTree(self):
@@ -52,11 +48,17 @@ class TensorForestTest(test_util.TensorFlowTestCase):
         max_nodes=1000000,
         split_after_samples=25,
         num_features=1000).fill()
-    self.assertEquals(1000, hparams.num_splits_to_consider)
-    # 1000000 / 2 = 500000
-    self.assertEquals(500000, hparams.max_fertile_nodes)
-    # floor(1000 / 25) = 40
-    self.assertEquals(40, hparams.split_initializations_per_input)
+    self.assertEquals(31, hparams.num_splits_to_consider)
+
+  def testForestHParamsStringParams(self):
+    hparams = tensor_forest.ForestHParams(
+        num_classes=2,
+        num_trees=100,
+        max_nodes=1000000,
+        split_after_samples="25",
+        num_splits_to_consider="1000000",
+        num_features=1000).fill()
+    self.assertEquals("1000000", hparams.num_splits_to_consider)
 
   def testTrainingConstructionClassification(self):
     input_data = [[-1., 0.], [-1., 2.],  # node 1
@@ -106,18 +108,6 @@ class TensorForestTest(test_util.TensorFlowTestCase):
     graph = graph_builder.inference_graph(input_data)
     self.assertTrue(isinstance(graph, ops.Tensor))
 
-  def testImpurityConstruction(self):
-    params = tensor_forest.ForestHParams(
-        num_classes=4,
-        num_features=2,
-        num_trees=10,
-        max_nodes=1000,
-        split_after_samples=25).fill()
-
-    graph_builder = tensor_forest.RandomForestGraphs(params)
-    graph = graph_builder.average_impurity()
-    self.assertTrue(isinstance(graph, ops.Tensor))
-
   def testTrainingConstructionClassificationSparse(self):
     input_data = sparse_tensor.SparseTensor(
         indices=[[0, 0], [0, 3], [1, 0], [1, 7], [2, 1], [3, 9]],
diff --git a/tensorflow/contrib/tensorboard/BUILD b/tensorflow/contrib/tensorboard/BUILD
index 13de7fb39d97269604d924641b32e1eb3b535e60..2e0a46ffe432341a423ac159deb7745d9ef15374 100644
--- a/tensorflow/contrib/tensorboard/BUILD
+++ b/tensorflow/contrib/tensorboard/BUILD
@@ -42,8 +42,8 @@ py_library(
     srcs = ["plugins/projector/__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":protos_all_py",
         "//tensorflow/python:lib",
-        "//tensorflow/tensorboard/plugins/projector:protos_all_py",
     ],
 )
 
@@ -54,10 +54,10 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":projector",
+        ":protos_all_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:platform",
         "//tensorflow/python:summary",
-        "//tensorflow/tensorboard/plugins/projector:protos_all_py",
     ],
 )
 
diff --git a/tensorflow/contrib/tensorboard/plugins/projector/__init__.py b/tensorflow/contrib/tensorboard/plugins/projector/__init__.py
index be2398cdc0c97055d7ce29d1150f57bc85f77398..7b9be76757c680afe0e802a924f479bb182a54ed 100644
--- a/tensorflow/contrib/tensorboard/plugins/projector/__init__.py
+++ b/tensorflow/contrib/tensorboard/plugins/projector/__init__.py
@@ -28,11 +28,11 @@ from __future__ import print_function
 import os
 
 from google.protobuf import text_format
-from tensorflow.python.lib.io import file_io
-from tensorflow.tensorboard.plugins.projector import projector_config_pb2
+from tensorflow.contrib.tensorboard.plugins.projector import projector_config_pb2
 # pylint: disable=wildcard-import
-from tensorflow.tensorboard.plugins.projector.projector_config_pb2 import *
+from tensorflow.contrib.tensorboard.plugins.projector.projector_config_pb2 import *
 # pylint: enable=wildcard-import
+from tensorflow.python.lib.io import file_io
 
 
 def visualize_embeddings(summary_writer, config):
diff --git a/tensorflow/contrib/tensorboard/plugins/projector/projector_api_test.py b/tensorflow/contrib/tensorboard/plugins/projector/projector_api_test.py
index 5f86f57a1c6213f4fb1e15bb2a37d33a7b21b564..9ad42bff47ffa3a235c6d0f3cd14a25f370cc74a 100644
--- a/tensorflow/contrib/tensorboard/plugins/projector/projector_api_test.py
+++ b/tensorflow/contrib/tensorboard/plugins/projector/projector_api_test.py
@@ -24,10 +24,10 @@ import shutil
 from google.protobuf import text_format
 
 from tensorflow.contrib.tensorboard.plugins import projector
+from tensorflow.contrib.tensorboard.plugins.projector import projector_config_pb2
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer as writer_lib
-from tensorflow.tensorboard.plugins.projector import projector_config_pb2
 
 
 class ProjectorApiTest(test.TestCase):
diff --git a/tensorflow/tensorboard/plugins/projector/projector_config.proto b/tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto
similarity index 100%
rename from tensorflow/tensorboard/plugins/projector/projector_config.proto
rename to tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto
diff --git a/tensorflow/contrib/testing/BUILD b/tensorflow/contrib/testing/BUILD
index 225a1ccd126bf02e030862857925c543cf9f350b..0be6aa755bee50451f6717139fd8e1315789b389 100644
--- a/tensorflow/contrib/testing/BUILD
+++ b/tensorflow/contrib/testing/BUILD
@@ -16,6 +16,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
diff --git a/tensorflow/contrib/testing/testdata/mobilenet_224_gender_basic_fixed.mb b/tensorflow/contrib/testing/testdata/mobilenet_224_gender_basic_fixed.mb
new file mode 100644
index 0000000000000000000000000000000000000000..fefe5f4d5f5a26419cc165ac895865b95b66ebf7
Binary files /dev/null and b/tensorflow/contrib/testing/testdata/mobilenet_224_gender_basic_fixed.mb differ
diff --git a/tensorflow/contrib/text/BUILD b/tensorflow/contrib/text/BUILD
index 6bcb03238cc81cd8ccc0d423bc9b65cb594166db..8a2cb28684fe5151176b00fbcfaa64626ec18c38 100644
--- a/tensorflow/contrib/text/BUILD
+++ b/tensorflow/contrib/text/BUILD
@@ -101,8 +101,6 @@ py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:lookup_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:random_seed",
         "//tensorflow/python:training",
     ],
diff --git a/tensorflow/contrib/tfprof/BUILD b/tensorflow/contrib/tfprof/BUILD
index 944d767e21a116436307ac5c9e12988b49f1de70..4ff97e5d761016a353695d01ab56863d303f379a 100644
--- a/tensorflow/contrib/tfprof/BUILD
+++ b/tensorflow/contrib/tfprof/BUILD
@@ -8,12 +8,26 @@ py_library(
     name = "tfprof",
     srcs = [
         "__init__.py",
+        "model_analyzer.py",
+        "tfprof_logger.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
-        "//tensorflow/contrib/tfprof/python/tools/tfprof:model_analyzer",
-        "//tensorflow/contrib/tfprof/python/tools/tfprof:tfprof_logger",
         "//tensorflow/python:util",
+        "//tensorflow/python/profiler:model_analyzer",
+        "//tensorflow/python/profiler:tfprof_logger",
     ],
 )
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/tfprof/README.md b/tensorflow/contrib/tfprof/README.md
index 4fa1ccea699ee405d0bf7a14256cdc216257c8a4..7faf2b9b24acfd71f0ffa6d4a8477a34ff3ed321 100644
--- a/tensorflow/contrib/tfprof/README.md
+++ b/tensorflow/contrib/tfprof/README.md
@@ -1,26 +1,24 @@
 # tfprof: TensorFlow Profiler and Beyond
 
-# Full Document in tensorflow/tools/tfprof/README.md
-
-Author: Xin Pan (xpan@google.com, github: panyx0718), Jon Shlens, Yao Zhang
-
-Consultants: Jon Shlens, Pete Warden
-
-###Major Features
-
-1.  Measure model parameters, float operations, tensor shapes.
-2.  Profile op execution times, requested memory size and device placement.
-3.  Inspect checkpoint tensors' shapes and their values.
-4.  Selectively group, filter, account and order ops.
-
-####tfprof supports 3 views to organize TensorFlow model profiles
-
-    *  code view: Stats are associated your Python codes and organized as call stacks.
-    *  scope view: Stats are organized as name scope hierarchies.
-    *  graph view: Stats are organized as Tensorflow Op graph.
-
-####For each view, there are 3 ways to display outputs:
-
-    *  stdout: Results are written to stdout.
-    *  timeline: Visualized in chrome browser as time series.
-    *  file: Results are dumped to file.
+<h1>Please use `tf.profiler.xxx` instead of `tf.contrib.tfprof.xxx`</h1>
+<h1>Full Document in <a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/profiler/README.md">tensorflow/core/profiler/README.md</a><h1>
+
+### Features
+
+* Profile model architectures
+  * parameters, tensor shapes, float operations, device placement, etc.
+* Profile model performance
+  * execution time, memory consumption
+  * Profile multiple steps.
+* Auto profile and advise.
+  * accelerator utilization check
+  * expensive operation check
+  * operation configuration check
+  * distributed runtime check (Not OSS)
+
+### Interfaces
+
+* Python API
+* Command Line
+* Visualization
+* C++ API (Not public, contact us if needed.)
diff --git a/tensorflow/contrib/tfprof/__init__.py b/tensorflow/contrib/tfprof/__init__.py
index f3952f6cb5cf01039dab6dae810fefe3f5b4ef20..7a023e5d67dea6071e91a07887cd5c51a554be48 100644
--- a/tensorflow/contrib/tfprof/__init__.py
+++ b/tensorflow/contrib/tfprof/__init__.py
@@ -17,5 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tfprof.python.tools.tfprof import model_analyzer
-from tensorflow.contrib.tfprof.python.tools.tfprof import tfprof_logger
+# pylint: disable=unused-import
+from tensorflow.contrib.tfprof import model_analyzer
+from tensorflow.contrib.tfprof import tfprof_logger
diff --git a/tensorflow/contrib/tfprof/model_analyzer.py b/tensorflow/contrib/tfprof/model_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d45c8f54c48c4ae44bdf8a33d5454459812fb745
--- /dev/null
+++ b/tensorflow/contrib/tfprof/model_analyzer.py
@@ -0,0 +1,117 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model Analyzer.
+
+Analyze model, including shape, params, time, memory, structure, etc.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+# Import the names here for existing users.
+# pylint: disable=unused-import
+from tensorflow.python.profiler import tfprof_logger
+from tensorflow.python.profiler.model_analyzer import advise as _advise
+from tensorflow.python.profiler.model_analyzer import ALL_ADVICE
+from tensorflow.python.profiler.model_analyzer import profile as _profile
+from tensorflow.python.profiler.model_analyzer import Profiler
+from tensorflow.python.util.deprecation import deprecated
+
+_DEFAULT_PROFILE_OPTIONS = 0
+_DEFAULT_ADVISE_OPTIONS = 0
+
+# pylint: disable=bad-whitespace
+# pylint: disable=bad-continuation
+# options examples for profiling API.
+#
+# Show the parameter statistics of trainable variables.
+TRAINABLE_VARS_PARAMS_STAT_OPTIONS = {
+    'max_depth': 10000,
+    'min_bytes': 0,
+    'min_micros': 0,
+    'min_params': 0,
+    'min_float_ops': 0,
+    'order_by': 'name',
+    'account_type_regexes': [tfprof_logger.TRAINABLE_VARIABLES],
+    'start_name_regexes': ['.*'],
+    'trim_name_regexes': [],
+    'show_name_regexes': ['.*'],
+    'hide_name_regexes': [],
+    'account_displayed_op_only': True,
+    'select': ['params'],
+    'output': 'stdout',
+    'dump_to_file': ''  # Deprecated, use 'output': 'file:outfile=<name>'
+}
+
+# Show the number float operations.
+FLOAT_OPS_OPTIONS = {
+    'max_depth': 10000,
+    'min_bytes': 0,
+    'min_micros': 0,
+    'min_params': 0,
+    'min_float_ops': 1,
+    'order_by': 'float_ops',
+    'account_type_regexes': ['.*'],
+    'start_name_regexes': ['.*'],
+    'trim_name_regexes': [],
+    'show_name_regexes': ['.*'],
+    'hide_name_regexes': [],
+    'account_displayed_op_only': True,
+    'select': ['float_ops'],
+    'output': 'stdout',
+    'dump_to_file': ''  # Deprecated, use 'output': 'file:outfile=<name>'
+}
+
+
+# Show the timing stats and memory demands.
+PRINT_ALL_TIMING_MEMORY = {
+    'max_depth': 10000,
+    'min_bytes': 1,  # Only >=1
+    'min_micros': 1,  # Only >=1
+    'min_params': 0,
+    'min_float_ops': 0,
+    'order_by': 'name',
+    'account_type_regexes': ['.*'],
+    'start_name_regexes': ['.*'],
+    'trim_name_regexes': [],
+    'show_name_regexes': ['.*'],
+    'hide_name_regexes': [],
+    'account_displayed_op_only': True,
+    'select': ['micros', 'bytes'],
+    'output': 'stdout',
+    'dump_to_file': ''  # Deprecated, use 'output': 'file:outfile=<name>'
+}
+
+# pylint: enable=bad-whitespace
+# pylint: enable=bad-continuation
+
+
+@deprecated('2018-01-01',
+            'Use `tf.profiler.advise(graph, run_meta, options)`. See README.md')
+def advise(graph, run_meta=None, tfprof_options=_DEFAULT_ADVISE_OPTIONS):
+  return _advise(graph, run_meta, tfprof_options)
+
+
+@deprecated('2018-01-01',
+            'Use `tf.profiler.profile(graph, run_meta, op_log, cmd, options)`. '
+            'Build `options` with `tf.profiler.ProfileOptionBuilder`. '
+            'See README.md for details')
+def print_model_analysis(graph,
+                         run_meta=None,
+                         op_log=None,
+                         tfprof_cmd='scope',
+                         tfprof_options=_DEFAULT_PROFILE_OPTIONS):
+  return _profile(graph, run_meta, op_log, tfprof_cmd, tfprof_options)
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py
deleted file mode 100644
index 419beac0b9b066ed1d15287559236b4e8d0d2ef8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py
+++ /dev/null
@@ -1,356 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Model Analyzer.
-
-Analyze model, including shape, params, time, memory, structure, etc.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.tfprof.python.tools.tfprof import tfprof_logger
-from tensorflow.contrib.tfprof.python.tools.tfprof.internal import pywrap_tensorflow_print_model_analysis_lib as print_mdl
-from tensorflow.python.framework import errors
-from tensorflow.tools.tfprof import tfprof_options_pb2
-from tensorflow.tools.tfprof import tfprof_output_pb2
-
-# pylint: disable=bad-whitespace
-# pylint: disable=bad-continuation
-# 2 example tfprof_options for print_model_analysis API.
-#
-# Show the parameter statistics of trainable variables.
-TRAINABLE_VARS_PARAMS_STAT_OPTIONS = {
-    'max_depth': 10000,
-    'min_bytes': 0,
-    'min_micros': 0,
-    'min_params': 0,
-    'min_float_ops': 0,
-    'order_by': 'name',
-    'account_type_regexes': [tfprof_logger.TRAINABLE_VARIABLES],
-    'start_name_regexes': ['.*'],
-    'trim_name_regexes': [],
-    'show_name_regexes': ['.*'],
-    'hide_name_regexes': [],
-    'account_displayed_op_only': True,
-    'select': ['params'],
-    'output': 'stdout',
-    'dump_to_file': ''
-}
-
-# Show the number float operations.
-FLOAT_OPS_OPTIONS = {
-    'max_depth': 10000,
-    'min_bytes': 0,
-    'min_micros': 0,
-    'min_params': 0,
-    'min_float_ops': 1,
-    'order_by': 'float_ops',
-    'account_type_regexes': ['.*'],
-    'start_name_regexes': ['.*'],
-    'trim_name_regexes': [],
-    'show_name_regexes': ['.*'],
-    'hide_name_regexes': [],
-    'account_displayed_op_only': True,
-    'select': ['float_ops'],
-    'output': 'stdout',
-    'dump_to_file': ''
-}
-
-# Show number of parameters on parameter server 0.
-# It is recommended to provide`run_meta` argument
-# to have complete device placement info.
-PRINT_PARAMS_ON_DEVICE = {
-    'max_depth': 1,
-    'min_bytes': 0,
-    'min_micros': 0,
-    'min_params': 0,
-    'min_float_ops': 0,
-    'order_by': 'name',
-    'account_type_regexes': ['.*ps.*task:0.*'],
-    'start_name_regexes': ['.*'],
-    'trim_name_regexes': [],
-    'show_name_regexes': ['.*'],
-    'hide_name_regexes': [],
-    'account_displayed_op_only': False,
-    'select': ['device', 'params'],
-    'output': 'stdout',
-    'dump_to_file': ''
-}
-
-# Show the timing stats and memory demands.
-PRINT_ALL_TIMING_MEMORY = {
-    'max_depth': 10000,
-    'min_bytes': 1,  # Only >=1
-    'min_micros': 1,  # Only >=1
-    'min_params': 0,
-    'min_float_ops': 0,
-    'order_by': 'name',
-    'account_type_regexes': ['.*'],
-    'start_name_regexes': ['.*'],
-    'trim_name_regexes': [],
-    'show_name_regexes': ['.*'],
-    'hide_name_regexes': [],
-    'account_displayed_op_only': True,
-    'select': ['micros', 'bytes'],
-    'output': 'stdout',
-    'dump_to_file': ''
-}
-
-# pylint: enable=bad-whitespace
-# pylint: enable=bad-continuation
-
-
-def _build_options(tfprof_options):
-  """Build tfprof.OptionsProto.
-
-  Args:
-    tfprof_options: A dictionary of options.
-  Returns:
-    tfprof.OptionsProto.
-  """
-  opts = tfprof_options_pb2.OptionsProto()
-  opts.max_depth = tfprof_options.get('max_depth', 10)
-  opts.min_bytes = tfprof_options.get('min_bytes', 0)
-  opts.min_micros = tfprof_options.get('min_micros', 0)
-  opts.min_params = tfprof_options.get('min_params', 0)
-  opts.min_float_ops = tfprof_options.get('min_float_ops', 0)
-  opts.min_occurrence = tfprof_options.get('min_occurrence', 0)
-
-  opts.step = tfprof_options.get('step', -1)
-
-  opts.order_by = tfprof_options.get('order_by', 'name')
-
-  for p in tfprof_options.get('account_type_regexes', []):
-    opts.account_type_regexes.append(p)
-  for p in tfprof_options.get('start_name_regexes', []):
-    opts.start_name_regexes.append(p)
-  for p in tfprof_options.get('trim_name_regexes', []):
-    opts.trim_name_regexes.append(p)
-  for p in tfprof_options.get('show_name_regexes', []):
-    opts.show_name_regexes.append(p)
-  for p in tfprof_options.get('hide_name_regexes', []):
-    opts.hide_name_regexes.append(p)
-  opts.account_displayed_op_only = tfprof_options.get(
-      'account_displayed_op_only', False)
-
-  for p in tfprof_options.get('select', []):
-    opts.select.append(p)
-
-  opts.output = tfprof_options.get('output', 'stdout')
-  opts.dump_to_file = tfprof_options.get('dump_to_file', '')
-
-  return opts
-
-
-class Profiler(object):
-  """TensorFlow multi-step profiler.
-
-  See go/tfprof or README for details.
-
-  Typical use case:
-    # Currently we are only allowed to create 1 profiler per process.
-    profiler = Profile(sess.graph)
-
-    for i in xrange(total_steps):
-      if i % 10000 == 0:
-        run_meta = tf.RunMetadata()
-        _ = sess.run(...,
-                     options=tf.RunOptions(
-                         trace_level=tf.RunOptions.FULL_TRACE),
-                     run_metadata=run_meta)
-        profiler.add_step(i, run_meta)
-
-        # Profile the parameters of your model.
-        profiler.profile_name_scope(options=TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
-
-        # Or profile the timing of your model operations.
-        opts = PRINT_ALL_TIMING_MEMORY.copy()
-        opts['order_by'] = 'micros'
-        opts['select'] = ['micros', 'occurrence']
-        opts['max_depth'] = 20
-        profiler.profile_operations(options=opts)
-
-        # Or you can generate a timeline:
-        opts = PRINT_ALL_TIMING_MEMORY.copy()
-        opts['output'] = 'timeline:outfile=' + filename
-        opts['step'] = i
-        profiler.profile_graph(options=opts)
-      else:
-        _ = sess.run(...)
-    # Auto detect problems and generate advice.
-    profiler.advise()
-  """
-
-  def __init__(self, graph, op_log=None):
-    """Constructor.
-
-    Args:
-      graph: tf.Graph.
-      op_log: optional. tensorflow::tfprof::OpLog proto. Used to define
-          extra op types.
-    """
-    self._graph = graph
-    # pylint: disable=protected-access
-    op_log = tfprof_logger._merge_default_with_oplog(
-        self._graph, op_log=op_log)
-    # pylint: enable=protected-access
-
-    print_mdl.NewProfiler(
-        self._graph.as_graph_def(add_shapes=True).SerializeToString(),
-        op_log.SerializeToString())
-
-  def __del__(self):
-    print_mdl.DeleteProfiler()
-
-  def add_step(self, step, run_meta):
-    """Add statistics of a step.
-
-    Args:
-      step: A step uint64 used to identify the RunMetadata. Must be different
-         across different AddStep() calls.
-      run_meta: RunMetadata proto that contains statistics of a session run.
-    """
-    # pylint: disable=protected-access
-    op_log = tfprof_logger._merge_default_with_oplog(
-        self._graph, run_meta=run_meta, add_trace=False,
-        add_trainable_var=False)
-    # pylint: enable=protected-access
-    print_mdl.AddStep(
-        step, run_meta.SerializeToString(), op_log.SerializeToString())
-
-  def profile_python_codes(self, options):
-    """Profile the statistics of the Python codes.
-
-      Hint: set options['show_name_regexes'] = ['.*my_code.py.*']
-
-    Args:
-      options: A dict of profiler options.
-    Returns:
-      a TFMultiGraphNodeProto that records the results.
-    """
-    opts = _build_options(options)
-    tfprof_node = tfprof_output_pb2.TFMultiGraphNodeProto()
-    tfprof_node.ParseFromString(
-        print_mdl.Profile('code'.encode('utf-8'), opts.SerializeToString()))
-    return tfprof_node
-
-  def profile_operations(self, options):
-    """Profile the statistics of the Operation types (e.g. MatMul, Conv2D).
-
-    Args:
-      options: A dict of profiler options.
-    Returns:
-      a TFMultiGraphNodeProto that records the results.
-    """
-    opts = _build_options(options)
-    tfprof_node = tfprof_output_pb2.TFMultiGraphNodeProto()
-    tfprof_node.ParseFromString(
-        print_mdl.Profile('op'.encode('utf-8'), opts.SerializeToString()))
-    return tfprof_node
-
-  def profile_name_scope(self, options):
-    """Profile the statistics of graph nodes, organized by name scope.
-
-    Args:
-      options: A dict of profiler options.
-    Returns:
-      a TFGraphNodeProto that records the results.
-    """
-    opts = _build_options(options)
-    tfprof_node = tfprof_output_pb2.TFGraphNodeProto()
-    tfprof_node.ParseFromString(
-        print_mdl.Profile('scope'.encode('utf-8'), opts.SerializeToString()))
-    return tfprof_node
-
-  def profile_graph(self, options):
-    """Profile the statistics of graph nodes, organized by dataflow graph.
-
-    Args:
-      options: A dict of profiler options.
-    Returns:
-      a TFGraphNodeProto that records the results.
-    """
-    opts = _build_options(options)
-    tfprof_node = tfprof_output_pb2.TFGraphNodeProto()
-    tfprof_node.ParseFromString(
-        print_mdl.Profile('graph'.encode('utf-8'), opts.SerializeToString()))
-    return tfprof_node
-
-  def advise(self):
-    """Automatically detect problems and generate reports."""
-    print_mdl.Advise()
-
-
-def print_model_analysis(graph,
-                         run_meta=None,
-                         op_log=None,
-                         tfprof_cmd='scope',
-                         tfprof_options=TRAINABLE_VARS_PARAMS_STAT_OPTIONS):
-  """Print model statistics.
-
-    See go/tfprof or README for examples and tutorials.
-    Run tfprof tool for help:
-    'bazel run third_party/tensorflow/tools/tfprof help'
-
-  Args:
-    graph: tf.Graph.
-    run_meta: tensorflow::RunMetadata proto. When provided, also shows valid
-              timing and memory information when 'select' option contains
-              'micros' and 'bytes'.
-    op_log: tensorflow::tfprof::OpLog proto. users can use this proto to
-            group together ops and use a op_type to select the group.
-    tfprof_cmd: string. Either 'op', 'scope', 'graph', 'code'.
-                'op' view organize outputs using operation type. (e.g. MatMul)
-                'scope' view organize outputs using graph node name scope.
-                'graph' view organize outputs using graph node inputs/outputs.
-                'code' view organize outputs using Python call stack.
-    tfprof_options: See 'tfprof help' for details.
-  Returns:
-    If tfprof_cmd is 'scope' or 'graph', returns TFGraphNodeProto proto.
-    If tfprof_cmd is 'op' or 'code', returns TFMultiGraphNodeProto proto.
-    Side effect: stdout/file/timeline.json depending on tfprof_options['output']
-  """
-  # pylint: disable=protected-access
-  op_log = tfprof_logger._merge_default_with_oplog(
-      graph, op_log, run_meta, add_trace=tfprof_cmd == 'code')
-  # pylint: enable=protected-access
-
-  opts = _build_options(tfprof_options)
-
-  run_meta_str = run_meta.SerializeToString() if run_meta else b''
-
-  if tfprof_cmd == 'code' or tfprof_cmd == 'op':
-    tfprof_node = tfprof_output_pb2.TFMultiGraphNodeProto()
-    tfprof_node.ParseFromString(
-        print_mdl.PrintModelAnalysis(
-            graph.as_graph_def(add_shapes=True).SerializeToString(),
-            run_meta_str,
-            op_log.SerializeToString(),
-            tfprof_cmd.encode('utf-8'),
-            opts.SerializeToString()))
-  elif tfprof_cmd == 'graph' or tfprof_cmd == 'scope':
-    tfprof_node = tfprof_output_pb2.TFGraphNodeProto()
-    tfprof_node.ParseFromString(
-        print_mdl.PrintModelAnalysis(
-            graph.as_graph_def(add_shapes=True).SerializeToString(),
-            run_meta_str,
-            op_log.SerializeToString(),
-            tfprof_cmd.encode('utf-8'),
-            opts.SerializeToString()))
-  else:
-    raise errors.InvalidArgumentError(
-        None, None, 'unknown tfprof_cmd: %s\n' % tfprof_cmd)
-
-  return tfprof_node
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py
deleted file mode 100644
index 913971afaf1433c06c83a21cfc07b4cd5c0da134..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py
+++ /dev/null
@@ -1,279 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-
-# XXX: this depends on pywrap_tensorflow and must come later
-from tensorflow.contrib.tfprof.python.tools.tfprof import model_analyzer
-from tensorflow.contrib.tfprof.python.tools.tfprof.internal import model_analyzer_testlib as lib
-
-
-class PrintModelAnalysisTest(test.TestCase):
-
-  def testDumpToFile(self):
-    ops.reset_default_graph()
-    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
-    outfile = os.path.join(test.get_temp_dir(), 'dump')
-    opts['output'] = 'file:outfile=' + outfile
-
-    with session.Session() as sess, ops.device('/cpu:0'):
-      _ = lib.BuildSmallModel()
-      model_analyzer.print_model_analysis(sess.graph, tfprof_options=opts)
-
-      with gfile.Open(outfile, 'r') as f:
-        self.assertEqual(u'node name | # parameters\n'
-                         '_TFProfRoot (--/451 params)\n'
-                         '  DW (3x3x3x6, 162/162 params)\n'
-                         '  DW2 (2x2x6x12, 288/288 params)\n'
-                         '  ScalarW (1, 1/1 params)\n',
-                         f.read())
-
-  def testSelectEverything(self):
-    ops.reset_default_graph()
-    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
-    outfile = os.path.join(test.get_temp_dir(), 'dump')
-    opts['output'] = 'file:outfile=' + outfile
-    opts['account_type_regexes'] = ['.*']
-    opts['select'] = [
-        'bytes', 'params', 'float_ops', 'occurrence', 'device', 'op_types',
-        'input_shapes'
-    ]
-
-    with session.Session() as sess, ops.device('/cpu:0'):
-      x = lib.BuildSmallModel()
-
-      sess.run(variables.global_variables_initializer())
-      run_meta = config_pb2.RunMetadata()
-      _ = sess.run(x,
-                   options=config_pb2.RunOptions(
-                       trace_level=config_pb2.RunOptions.FULL_TRACE),
-                   run_metadata=run_meta)
-
-      model_analyzer.print_model_analysis(
-          sess.graph, run_meta, tfprof_options=opts)
-
-      with gfile.Open(outfile, 'r') as f:
-        # pylint: disable=line-too-long
-        self.assertEqual(
-            'node name | # parameters | # float_ops | output bytes | assigned devices | op types | input shapes\n_TFProfRoot (--/451 params, --/10.44k flops, --/5.28KB, _kTFScopeParent, )\n  Conv2D (0/0 params, 5.83k/5.83k flops, 432B/432B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D, 0:2x6x6x3|1:3x3x3x6)\n  Conv2D_1 (0/0 params, 4.61k/4.61k flops, 384B/384B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D, 0:2x3x3x6|1:2x2x6x12)\n  DW (3x3x3x6, 162/162 params, 0/0 flops, 648B/1.30KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|VariableV2|_trainable_variables, )\n    DW/Assign (0/0 params, 0/0 flops, 0B/0B, Assign, 0:3x3x3x6|1:3x3x3x6)\n    DW/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent, )\n      DW/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add, 0:3x3x3x6|1:1)\n        DW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal, 0:4)\n        DW/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const, )\n        DW/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul, 0:3x3x3x6|1:1)\n        DW/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const, )\n        DW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const, )\n    DW/read (0/0 params, 0/0 flops, 648B/648B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity, 0:3x3x3x6)\n  DW2 (2x2x6x12, 288/288 params, 0/0 flops, 1.15KB/2.30KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|VariableV2|_trainable_variables, )\n    DW2/Assign (0/0 params, 0/0 flops, 0B/0B, Assign, 0:2x2x6x12|1:2x2x6x12)\n    DW2/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent, )\n      DW2/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add, 0:2x2x6x12|1:1)\n        DW2/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal, 0:4)\n        DW2/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const, )\n        DW2/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul, 0:2x2x6x12|1:1)\n        DW2/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const, )\n        DW2/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const, )\n    DW2/read (0/0 params, 0/0 flops, 1.15KB/1.15KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity, 0:2x2x6x12)\n  ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, VariableV2|_trainable_variables, )\n    ScalarW/Assign (0/0 params, 0/0 flops, 0B/0B, Assign, 0:1|1:1)\n    ScalarW/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent, )\n      ScalarW/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add, 0:1|1:1)\n        ScalarW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal, 0:0)\n        ScalarW/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const, )\n        ScalarW/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul, 0:1|1:1)\n        ScalarW/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const, )\n        ScalarW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const, )\n    ScalarW/read (0/0 params, 0/0 flops, 0B/0B, Identity, 0:1)\n  init (0/0 params, 0/0 flops, 0B/0B, NoOp, 0:1|1:3x3x3x6|2:2x2x6x12)\n  zeros (0/0 params, 0/0 flops, 864B/864B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Const, )\n',
-            f.read())
-        # pylint: enable=line-too-long
-
-  def testSimpleCodeView(self):
-    ops.reset_default_graph()
-    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
-    outfile = os.path.join(test.get_temp_dir(), 'dump')
-    opts['output'] = 'file:outfile=' + outfile
-    opts['account_type_regexes'] = ['.*']
-    opts['show_name_regexes'] = ['.*model_analyzer_testlib.*']
-    opts['account_displayed_op_only'] = False
-    # TODO(xpan): Test 'micros'. Since the execution time changes each run,
-    # it's a bit difficult to test it now.
-    opts['select'] = [
-        'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device',
-        'input_shapes'
-    ]
-
-    with session.Session() as sess, ops.device('/cpu:0'):
-      x = lib.BuildSmallModel()
-
-      sess.run(variables.global_variables_initializer())
-      run_meta = config_pb2.RunMetadata()
-      _ = sess.run(x,
-                   options=config_pb2.RunOptions(
-                       trace_level=config_pb2.RunOptions.FULL_TRACE),
-                   run_metadata=run_meta)
-
-      model_analyzer.print_model_analysis(
-          sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
-
-      with gfile.Open(outfile, 'r') as f:
-        # pylint: disable=line-too-long
-        self.assertEqual(
-            'node name | output bytes | # parameters | # float_ops | assigned devices | input',
-            f.read()[0:80])
-        # pylint: enable=line-too-long
-
-  def testComplexCodeView(self):
-    ops.reset_default_graph()
-    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
-    outfile = os.path.join(test.get_temp_dir(), 'dump')
-    opts['output'] = 'file:outfile=' + outfile
-    opts['account_type_regexes'] = ['.*']
-    opts['show_name_regexes'] = ['.*model_analyzer_testlib.py.*']
-    opts['account_displayed_op_only'] = False
-    opts['select'] = ['params', 'float_ops']
-
-    with session.Session() as sess, ops.device('/cpu:0'):
-      x = lib.BuildFullModel()
-
-      sess.run(variables.global_variables_initializer())
-      run_meta = config_pb2.RunMetadata()
-      _ = sess.run(x,
-                   options=config_pb2.RunOptions(
-                       trace_level=config_pb2.RunOptions.FULL_TRACE),
-                   run_metadata=run_meta)
-
-      tfprof_node = model_analyzer.print_model_analysis(
-          sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
-
-      # pylint: disable=line-too-long
-      with gfile.Open(outfile, 'r') as f:
-        lines = f.read().split('\n')
-        result = '\n'.join([l[:min(len(l), 80)] for l in lines])
-        self.assertEqual('node name | # parameters | # float_ops\n_TFProfRoot (--/2.84k params, --/54.08k flops)\n  model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_... (0/1.80k para\n    model_analyzer_testlib.py:35:BuildSmallModel:image = array_ops... (0/0 param\n    model_analyzer_testlib.py:39:BuildSmallModel:initializer=init_... (0/4 param\n    model_analyzer_testlib.py:43:BuildSmallModel:initializer=init_... (0/648 par\n    model_analyzer_testlib.py:44:BuildSmallModel:x = nn_ops.conv2d... (0/0 param\n    model_analyzer_testlib.py:48:BuildSmallModel:initializer=init_... (0/1.15k p\n    model_analyzer_testlib.py:49:BuildSmallModel:x = nn_ops.conv2d... (0/0 param\n  model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c... (0/1.04k para\n  model_analyzer_testlib.py:64:BuildFullModel:target = array_op... (0/0 params, \n  model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_... (0/0 params, \n  model_analyzer_testlib.py:67:BuildFullModel:return sgd_op.min... (0/0 params, \n',
-                         result)
-
-      self.assertLess(0, tfprof_node.total_exec_micros)
-      self.assertEqual(2844, tfprof_node.total_parameters)
-      self.assertEqual(54080, tfprof_node.total_float_ops)
-      self.assertEqual(5, len(tfprof_node.children))
-      self.assertEqual('_TFProfRoot', tfprof_node.name)
-      self.assertEqual(
-          'model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_...',
-          tfprof_node.children[0].name)
-      self.assertEqual(
-          'model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c...',
-          tfprof_node.children[1].name)
-      self.assertEqual(
-          'model_analyzer_testlib.py:64:BuildFullModel:target = array_op...',
-          tfprof_node.children[2].name)
-      self.assertEqual(
-          'model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_...',
-          tfprof_node.children[3].name)
-      self.assertEqual(
-          'model_analyzer_testlib.py:67:BuildFullModel:return sgd_op.min...',
-          tfprof_node.children[4].name)
-      # pylint: enable=line-too-long
-
-  def testCodeViewLeafGraphNode(self):
-    ops.reset_default_graph()
-    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
-    opts['account_type_regexes'] = ['.*']
-    opts['account_displayed_op_only'] = False
-    opts['select'] = [
-        'bytes', 'params', 'float_ops', 'device'
-    ]
-
-    with session.Session() as sess, ops.device('/cpu:0'):
-      x = lib.BuildSmallModel()
-
-      sess.run(variables.global_variables_initializer())
-      run_meta = config_pb2.RunMetadata()
-      _ = sess.run(x,
-                   options=config_pb2.RunOptions(
-                       trace_level=config_pb2.RunOptions.FULL_TRACE),
-                   run_metadata=run_meta)
-
-      tfprof_node = model_analyzer.print_model_analysis(
-          sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
-
-      leaf = tfprof_node
-      while leaf.children:
-        self.assertEqual(0, len(leaf.graph_nodes))
-        leaf = leaf.children[0]
-      self.assertEqual(1, len(leaf.graph_nodes))
-
-  def testTimeline(self):
-    ops.reset_default_graph()
-    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
-    outfile = os.path.join(test.get_temp_dir(), 'timeline')
-    opts['output'] = 'timeline:outfile=' + outfile
-    opts['account_type_regexes'] = ['.*']
-    opts['max_depth'] = 100000
-    opts['step'] = 0
-
-    with session.Session() as sess, ops.device('/cpu:0'):
-      x = lib.BuildFullModel()
-
-      sess.run(variables.global_variables_initializer())
-      run_meta = config_pb2.RunMetadata()
-      _ = sess.run(
-          x,
-          options=config_pb2.RunOptions(
-              trace_level=config_pb2.RunOptions.FULL_TRACE),
-          run_metadata=run_meta)
-
-      _ = model_analyzer.print_model_analysis(
-          sess.graph, run_meta, tfprof_cmd='graph', tfprof_options=opts)
-
-      with gfile.Open(outfile, 'r') as f:
-        # Test that a json file is created.
-        self.assertLess(1000, len(f.read()))
-
-  def testOpView(self):
-    ops.reset_default_graph()
-    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
-    outfile = os.path.join(test.get_temp_dir(), 'dump')
-    opts['output'] = 'file:outfile=' + outfile
-    opts['account_type_regexes'] = ['.*']
-    opts['min_occurrence'] = 10
-    opts['select'] = ['params', 'micros', 'occurrence', 'input_shapes']
-    opts['order_by'] = 'occurrence'
-
-    with session.Session() as sess, ops.device('/cpu:0'):
-      x = lib.BuildFullModel()
-
-      sess.run(variables.global_variables_initializer())
-      run_meta = config_pb2.RunMetadata()
-      _ = sess.run(x,
-                   options=config_pb2.RunOptions(
-                       trace_level=config_pb2.RunOptions.FULL_TRACE),
-                   run_metadata=run_meta)
-
-      tfprof_node = model_analyzer.print_model_analysis(
-          sess.graph, run_meta, tfprof_cmd='op', tfprof_options=opts)
-
-      with gfile.Open(outfile, 'r') as f:
-        self.assertEqual(
-            'nodename|executiontime|#parameters|opoccurrence|inputshapes\n',
-            f.read().replace('\t', '').replace(' ', '')[0:60])
-
-      total_children = 0
-      last_occurrence = 1e32
-      input_shapes = 0
-      last_total_micros = tfprof_node.total_exec_micros
-      last_micros = tfprof_node.exec_micros
-      while tfprof_node.children:
-        for gnode in tfprof_node.graph_nodes:
-          input_shapes += len(gnode.input_shapes)
-        self.assertEqual(len(tfprof_node.children), 1)
-        tfprof_node = tfprof_node.children[0]
-
-        self.assertEqual(
-            last_total_micros, tfprof_node.total_exec_micros + last_micros)
-        last_total_micros = tfprof_node.total_exec_micros
-        last_micros = tfprof_node.exec_micros
-
-        total_children += 1
-        self.assertLessEqual(len(tfprof_node.graph_nodes), last_occurrence)
-        last_occurrence = len(tfprof_node.graph_nodes)
-
-      self.assertEqual(total_children, 15)
-      self.assertGreater(input_shapes, 0)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/tfprof/tfprof_logger.py b/tensorflow/contrib/tfprof/tfprof_logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..78eaca37a1f6f05f582eb6f011410e5fdea830e1
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tfprof_logger.py
@@ -0,0 +1,29 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Logging tensorflow::tfprof::OpLogProto.
+
+OpLogProto is used to add extra model information for offline analysis.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.profiler.tfprof_logger import write_op_log as _write_op_log
+from tensorflow.python.util.deprecation import deprecated
+
+
+@deprecated("2018-01-01", "Use `tf.profiler.write_op_log. go/tfprof`")
+def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True):
+  _write_op_log(graph, log_dir, op_log, run_meta, add_trace)
diff --git a/tensorflow/contrib/timeseries/BUILD b/tensorflow/contrib/timeseries/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b4ecb61a42d71e1901f78095830db63bbc2e0e98
--- /dev/null
+++ b/tensorflow/contrib/timeseries/BUILD
@@ -0,0 +1,48 @@
+package(default_visibility = [
+    "//tensorflow:internal",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "timeseries",
+    srcs = [
+        "__init__.py",
+        "python/__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/timeseries/python/timeseries:estimators",
+        "//tensorflow/contrib/timeseries/python/timeseries:feature_keys",
+        "//tensorflow/contrib/timeseries/python/timeseries:input_pipeline",
+        "//tensorflow/contrib/timeseries/python/timeseries:py_init",
+        "//tensorflow/contrib/timeseries/python/timeseries:saved_model_utils",
+    ],
+)
+
+py_library(
+    name = "timeseries_pip",
+    deps = [
+        ":timeseries",
+        "//tensorflow/contrib/timeseries/examples:known_anomaly",
+        "//tensorflow/contrib/timeseries/examples:lstm",
+        "//tensorflow/contrib/timeseries/examples:multivariate",
+        "//tensorflow/contrib/timeseries/examples:predict",
+        "//tensorflow/contrib/timeseries/python/timeseries:test_utils",
+        "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:test_utils",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/timeseries/README.md b/tensorflow/contrib/timeseries/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2b36ade986e845b4606c9bcb4bd3a4632840a4f4
--- /dev/null
+++ b/tensorflow/contrib/timeseries/README.md
@@ -0,0 +1,26 @@
+# TensorFlow Time Series
+
+TensorFlow Time Series (TFTS) is a collection of ready-to-use classic models
+(state space, autoregressive), and flexible infrastructure for building
+high-performance time series models whatever the architecture. It includes tools
+for chunking and batching a series, and for saving model state across chunks,
+making use of parallel computation even when training sequential models on long
+series (using truncated backpropagation).
+
+To get started, take a look at the `examples/` directory, which includes:
+
+ - Making probabilistic forecasts (`examples/predict.py`)
+ - Using exogenous features to train on data with known anomalies/changepoints (`examples/known_anomaly.py`)
+ - Learning correlations between series (multivariate forecasting/anomaly
+   detection; `examples/multivariate.py`)
+ - More advanced custom model building (`examples/lstm.py`)
+
+TFTS includes many other modeling tools, including non-linear autoregression
+(see the `hidden_layer_sizes` argument to `ARRegressor` in `estimators.py`) and
+a collection of components for linear state space modeling (level, trend,
+period, vector autoregression, moving averages; see the
+`StructuralEnsembleRegressor` in `estimators.py`). Both model classes support
+heuristics for ignoring un-labeled anomalies in training data. Trained models
+can be exported for inference/serving in
+[SavedModel format](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md)
+(see `examples/multivariate.py`).
diff --git a/tensorflow/contrib/timeseries/__init__.py b/tensorflow/contrib/timeseries/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cd9366f1f461dc6e60b2ade48056bb3cac1eeeb
--- /dev/null
+++ b/tensorflow/contrib/timeseries/__init__.py
@@ -0,0 +1,42 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A time series library in TensorFlow (TFTS).
+
+@@StructuralEnsembleRegressor
+@@ARRegressor
+
+@@ARModel
+
+@@CSVReader
+@@RandomWindowInputFn
+@@WholeDatasetInputFn
+@@predict_continuation_input_fn
+
+@@TrainEvalFeatures
+@@FilteringResults
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import
+from tensorflow.contrib.timeseries.python.timeseries import *
+# pylint: enable=wildcard-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+remove_undocumented(module_name=__name__,
+                    allowed_exception_list=['saved_model_utils'])
diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..015d0eba29f281d78ed6717271987cf3f2e121e9
--- /dev/null
+++ b/tensorflow/contrib/timeseries/examples/BUILD
@@ -0,0 +1,112 @@
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_binary(
+    name = "predict",
+    srcs = ["predict.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "predict_test",
+    timeout = "long",  # Moderate but for asan
+    srcs = ["predict_test.py"],
+    data = ["data/period_trend.csv"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":predict",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_binary(
+    name = "known_anomaly",
+    srcs = ["known_anomaly.py"],
+    data = ["data/changepoints.csv"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "known_anomaly_test",
+    timeout = "long",  # Moderate but for asan
+    srcs = ["known_anomaly_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":known_anomaly",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_binary(
+    name = "multivariate",
+    srcs = ["multivariate.py"],
+    data = ["data/multivariate_level.csv"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "multivariate_test",
+    timeout = "long",  # Moderate but for asan
+    srcs = [
+        "multivariate_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":multivariate",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_binary(
+    name = "lstm",
+    srcs = ["lstm.py"],
+    data = ["data/multivariate_periods.csv"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "lstm_test",
+    timeout = "long",  # Moderate but for asan
+    srcs = ["lstm_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [":lstm"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/__init__.py b/tensorflow/contrib/timeseries/examples/__init__.py
similarity index 100%
rename from tensorflow/contrib/tfprof/python/tools/tfprof/__init__.py
rename to tensorflow/contrib/timeseries/examples/__init__.py
diff --git a/tensorflow/contrib/timeseries/examples/data/changepoints.csv b/tensorflow/contrib/timeseries/examples/data/changepoints.csv
new file mode 100644
index 0000000000000000000000000000000000000000..377091675f39497c5e2e75a2d5c6cdd337fd7094
--- /dev/null
+++ b/tensorflow/contrib/timeseries/examples/data/changepoints.csv
@@ -0,0 +1,201 @@
+time,value,is_changepoint
+0,1.1214962292980595,no
+1,0.9426651261919153,no
+2,0.4031960302687991,no
+3,-0.2934288551216625,no
+4,-0.5743074870294929,no
+5,-1.0849473811550254,no
+6,-0.41937946806801163,no
+7,0.018935886380438346,no
+8,0.623316417711846,no
+9,1.536149296834255,no
+10,1.7409091603912816,no
+11,2.051703703599627,no
+12,0.6384001894023976,no
+13,0.5486930106811457,no
+14,-0.2125368668501388,no
+15,0.013749126561190012,no
+16,0.14880983597607,no
+17,0.1635284853313118,no
+18,1.897997315187439,no
+19,1.6579340030214134,no
+20,2.8447844385162195,no
+21,1.6218988328670014,no
+22,1.2696459559280746,no
+23,0.9479057398771089,no
+24,0.24823674890404254,no
+25,0.27109416160211997,yes
+26,0.6012714690807257,no
+27,1.9638777275054928,no
+28,2.4058943017150844,no
+29,2.1603787636120795,no
+30,2.653021551450762,no
+31,2.6302422291515883,no
+32,2.1672420831416983,no
+33,1.900648370044954,no
+34,0.6392456915214698,no
+35,0.9261522810148497,no
+36,1.7007734373862495,no
+37,2.114997001994879,no
+38,2.5608972952926123,no
+39,3.544458039317542,no
+40,3.012875548231654,no
+41,3.73017069321913,no
+42,2.6946843638167333,no
+43,2.8311515823580367,no
+44,1.5925217307525887,no
+45,2.0309433269232073,no
+46,1.9180019103182229,no
+47,2.502600804744764,no
+48,3.2807166427258965,no
+49,3.2786059081935983,no
+50,-0.03890687766970344,yes
+51,0.11514469108191117,no
+52,-0.26083055108544917,no
+53,-1.6028085522978275,no
+54,-1.4337505749859742,no
+55,-1.5254043377879583,no
+56,-1.8019508271507605,no
+57,-0.938164718968729,no
+58,-0.04584450628604822,no
+59,0.21179188280568237,no
+60,0.6070671683315334,no
+61,0.6276767611311048,no
+62,-0.03193958766467522,no
+63,-0.5093408620803364,no
+64,-0.8078327975830406,no
+65,-0.7926626666688175,no
+66,-1.263702632888315,no
+67,-0.756316161320801,no
+68,0.6539838846224881,no
+69,1.039181766829142,no
+70,0.9530783360822233,no
+71,0.5846593087566678,no
+72,0.3789183086882727,no
+73,-0.005047297176443883,no
+74,-0.3257930437614064,no
+75,-12.073284032544285,yes
+76,-10.985299021088675,no
+77,-11.105443872355911,no
+78,-10.159082851902342,no
+79,-9.744065848145468,no
+80,-9.73365351235421,no
+81,-9.337054697472244,no
+82,-9.562841115527014,no
+83,-10.415757451905112,no
+84,-11.342362490064435,no
+85,-11.616152164963092,no
+86,-11.151346186288702,no
+87,-10.429303505950227,no
+88,-9.387541028841701,no
+89,-8.835597195885683,no
+90,-9.042472454146061,no
+91,-8.826062322444617,no
+92,-9.58027111131134,no
+93,-10.105297327026976,no
+94,-10.87201188348659,no
+95,-10.734063432244673,no
+96,-10.564284857652627,no
+97,-10.222152572358237,no
+98,-10.034222253530091,no
+99,-8.569926275463303,no
+100,-20.94641864280508,yes
+101,-20.328581083983565,no
+102,-20.309010035007223,no
+103,-21.903487576802988,no
+104,-21.452703450083913,no
+105,-22.456525013367447,no
+106,-21.400574790162256,no
+107,-21.31159417717666,no
+108,-20.761111930463677,no
+109,-20.03122903525928,no
+110,-19.775288115644337,no
+111,-20.204309173889285,no
+112,-20.289200792476837,no
+113,-20.996369123999006,no
+114,-21.526187402690034,no
+115,-21.954579978399675,no
+116,-21.35551171030287,no
+117,-20.81049230404345,no
+118,-19.913346111752116,no
+119,-19.69212526824288,no
+120,-19.917131232233228,no
+121,-20.06819925540892,no
+122,-19.935797523800392,no
+123,-20.450306566286283,no
+124,-20.633236594032798,no
+125,-13.032619653152857,yes
+126,-13.78752822118153,no
+127,-12.768298388725846,no
+128,-12.592028243555017,no
+129,-11.341855179200538,no
+130,-11.409551272753419,no
+131,-11.61577579285243,no
+132,-12.100218074308154,no
+133,-12.019249802111034,no
+134,-13.11581463614673,no
+135,-13.088783512286215,no
+136,-13.070654237103765,no
+137,-12.175249761266228,no
+138,-11.093992412775233,no
+139,-10.912276441852045,no
+140,-11.084638857082574,no
+141,-11.573963550454287,no
+142,-11.831767648915175,no
+143,-12.198248248363289,no
+144,-12.647459048291175,no
+145,-12.00526050990987,no
+146,-12.86075732253657,no
+147,-12.097659271522993,no
+148,-11.304397011757692,no
+149,-10.907765581608706,no
+150,4.152294032629092,yes
+151,3.846976311678118,no
+152,3.7700912822157164,no
+153,2.848094096503526,no
+154,2.7852277792642517,no
+155,2.083452358392344,no
+156,3.10521897109034,no
+157,2.6883597273679953,no
+158,3.6853340905320797,no
+159,4.657935435352187,no
+160,4.205192591470813,no
+161,4.181620027424676,no
+162,3.7946083469988627,no
+163,4.006143660715422,no
+164,3.302226947074533,no
+165,2.3998839690439953,no
+166,2.359677958463434,no
+167,3.9511382664496764,no
+168,4.2544962146733365,no
+169,4.776487678103945,no
+170,5.524025961899998,no
+171,4.609095497010403,no
+172,4.420771370427846,no
+173,4.5501449534143354,no
+174,3.8649120165249387,no
+175,7.974015818409917,yes
+176,8.231372133173299,no
+177,9.03995638047549,no
+178,10.047862771863846,no
+179,9.955465317225139,no
+180,10.353973597173228,no
+181,10.50998966501437,no
+182,9.439675801327347,no
+183,9.597509477726458,no
+184,8.575204551834439,no
+185,8.63550641368468,no
+186,8.646333808129821,no
+187,8.837543235768495,no
+188,9.661532154505531,no
+189,9.900003537805857,no
+190,10.918387185224491,no
+191,10.027537345249021,no
+192,10.344965386383317,no
+193,9.05542200420013,no
+194,9.41967229823334,no
+195,8.745145521533027,no
+196,9.15826892409396,no
+197,9.716409087979185,no
+198,9.873840185125154,no
+199,10.664518096235417,no
diff --git a/tensorflow/contrib/timeseries/examples/data/multivariate_level.csv b/tensorflow/contrib/timeseries/examples/data/multivariate_level.csv
new file mode 100644
index 0000000000000000000000000000000000000000..17a9d4f879bf87a2e70b5083e65707c34d21e2d8
--- /dev/null
+++ b/tensorflow/contrib/timeseries/examples/data/multivariate_level.csv
@@ -0,0 +1,1000 @@
+0,-0.0414646733535,-0.268848856679,-0.734895900031,-0.446004825141,-0.694399529702
+1,-0.152094899259,-0.322007819938,-0.332268116964,-0.105095114653,-0.178601838378
+2,0.250556262671,-0.751761834516,-0.770932451952,-0.455887554092,-0.831949277137
+3,-0.373766175967,0.738512142445,-0.199851965386,-0.113920326737,-0.746597969326
+4,-0.075847586994,2.49850954084,-0.195929673324,-0.072250117751,-0.849909928747
+5,-0.205779412656,2.36687030724,-0.0626565533539,-0.144351330587,-0.645795602996
+6,-0.846980328423,1.88954123472,0.268955172856,0.0955759146678,-0.491047551885
+7,-1.71690266703,1.59009549063,-0.819726283855,0.0704148937343,-1.36443168813
+8,-1.62116547912,1.12644651303,-0.548591331565,0.159809406514,-0.982531332323
+9,-2.30511521562,0.233327108262,-0.720377250338,0.185365674993,-0.581283365228
+10,-1.39509634301,0.727639387607,-0.9472391049,-0.24675750758,-1.24362211749
+11,-1.66199313961,-0.429411794076,-1.14309941456,-0.10410046279,-1.50770930313
+12,-1.94640731282,-0.273995766959,-0.716939919329,0.242478963841,-1.01869993398
+13,-1.73697738539,-0.297774741968,-1.80542901165,-0.595544065613,-1.88517320016
+14,-1.98436000953,0.835646566838,-1.48895743706,-0.64954533885,-2.07546616553
+15,-2.19661690257,0.135737360133,-2.04801419858,-0.83727343112,-2.24381736404
+16,-2.05515485655,0.524110912353,-1.07121349384,-0.235165807213,-1.28329038964
+17,-2.05194303098,1.12739698675,-1.19731273863,-0.718272345602,-1.8681304519
+18,-1.46342408153,1.60848661464,-0.982386371124,-0.241330484258,-1.53944424301
+19,-2.2011585973,0.862807915046,-0.995794140283,-0.277024972786,-1.47741597501
+20,-2.23578343098,0.211562996187,-1.80353693381,-0.750494902773,-2.28181147838
+21,-2.4248161381,1.12076760123,-1.59718553352,-0.425584020125,-2.04861425617
+22,-2.31880631104,1.12602215115,-1.91151504658,-0.75502569518,-2.63242617399
+23,-2.11892926843,-0.0776836392241,-1.56290301815,-0.415718204976,-2.30107410479
+24,-2.69745336988,1.1149476191,-1.43630550668,0.280056938851,-2.22883491018
+25,-2.96630018615,-0.806783462871,-0.841372722536,0.583627043401,-1.03872536926
+26,-3.05451403268,-0.41632426626,-1.58925935691,0.0555428856781,-2.1353650451
+27,-3.13467782667,-0.205721140141,-1.36817316767,0.415306030296,-1.8505395134
+28,-3.37583309658,-0.818083884025,-1.55652668932,0.145422266739,-1.66599545695
+29,-2.95706967161,-0.126708568148,-2.49609054237,-0.359693478269,-3.00630852278
+30,-2.38734689742,-0.350923993879,-3.11222966842,-0.984597775882,-3.62711470966
+31,-2.93362724797,-3.34981184909,-2.62299010845,0.0519325700903,-2.33361674027
+32,-2.45038317895,-2.14261343917,-2.11366789279,0.0743102043259,-2.11874446647
+33,-2.64495216711,-2.93675730003,-2.01659804851,0.220799850535,-1.87237671926
+34,-2.12812770812,-2.92126192546,-2.56372930688,0.132880158227,-2.29573778931
+35,-1.35216831892,-2.50483041175,-3.13901618308,-0.37336037936,-3.33252674805
+36,-1.04567351754,-1.83302495038,-3.08212228316,-0.545002988717,-3.6701174207
+37,-1.16496488915,-1.40938171638,-2.94209012758,-0.152560391217,-3.50296269877
+38,-0.84385282013,-0.648804927911,-2.75862460218,-0.279881663166,-3.63690027991
+39,-1.1037352916,-0.851582153896,-2.77523199927,-0.307039823577,-3.38853728212
+40,-0.957048835866,-0.716389490538,-2.5075186655,-0.368254150835,-3.12977069615
+41,-0.391472235219,0.226506834239,-2.52187570516,-0.61894949021,-3.47667477699
+42,-0.405242757907,0.418957308382,-2.62051759148,-0.9796272777,-3.42806374772
+43,-0.748240793265,0.0115709569987,-2.6684888437,-0.667482764946,-3.43428687409
+44,-0.216387397721,0.448920090064,-2.70192416248,-0.808861227389,-3.75630714954
+45,-1.12093785791,0.813988652808,-2.34432442826,-0.474816708663,-3.456833936
+46,-2.04823857595,-0.0815669196333,-1.86066257604,0.263513591785,-2.62013231073
+47,-2.23426540572,1.14990130874,-2.12369491495,-0.0668207664299,-3.32831571065
+48,-1.57555379584,1.82043610627,-1.9765782176,-0.102298636364,-3.43261409083
+49,-2.05662519541,0.591204633026,-2.25118904188,-0.104063054092,-3.20456376636
+50,-1.8285006832,0.104590609107,-2.44968651963,-0.484283165496,-3.4943191607
+51,-1.1158243274,1.51817970836,-2.92579532611,-0.95832702811,-4.14551045434
+52,-1.3300517478,1.4640581709,-3.08363690864,-0.908736611138,-4.69257252131
+53,-1.12868164589,1.57313046956,-3.16435479065,-0.957447102204,-4.56171750677
+54,-0.669145314902,1.75434515643,-2.81083675355,-0.774573791032,-4.59395663147
+55,-0.892340515591,0.173537296383,-2.49815052235,-0.375439267459,-3.52324421038
+56,-1.26929850789,0.649913804094,-2.57885164397,-0.46483791564,-3.59996099296
+57,-1.28158565705,-0.570153054645,-2.78819462608,-0.306397870867,-3.4258361014
+58,-1.28263745221,0.301364082853,-3.28815747643,-0.76434916506,-4.3591321818
+59,-1.64608482042,0.079268277506,-3.13957219344,-0.545170637087,-3.94856837712
+60,-1.77018149232,1.34239749475,-3.52189227312,-0.677099703264,-4.62920748509
+61,-1.75655632146,1.24086450834,-3.65478985863,-0.790231809835,-4.83729210022
+62,-1.4538974886,0.583118143939,-2.94897344935,-0.528868313288,-4.45102769399
+63,-1.79084029065,-0.182981048507,-2.30504593196,0.0348033949121,-3.11721052434
+64,-1.8073178267,0.277552929826,-2.50207638545,-0.0968222603598,-3.6080745155
+65,-2.46622391377,-0.669940539918,-2.51643284284,-0.163468699096,-3.19159436336
+66,-1.69971961367,-1.41118297672,-2.71054034025,-0.34638825907,-3.04528220656
+67,-1.96457404093,-1.67429576673,-2.54850971749,-0.067237849466,-2.90503073443
+68,-2.6973521408,-1.66907013816,-1.9623006077,0.479537600832,-2.30009440384
+69,-2.40620671435,-1.53765150204,-2.18583073806,0.446553641646,-2.63840332383
+70,-2.01757147472,-1.22761328,-2.55621892108,0.400640935149,-3.16530630377
+71,-1.94664421789,-0.129913185847,-2.71957281003,-0.147173342055,-3.58233183488
+72,-1.94706064604,1.15127195098,-2.61814243108,-0.304736837281,-3.9346910679
+73,-1.9350122257,1.07051301058,-2.20968576985,-0.149892580485,-3.74024140039
+74,-2.38619066594,1.68724247518,-2.41160829661,-0.00913356005116,-3.99782659316
+75,-2.24809078587,1.21157820875,-2.07388139686,0.407473663742,-3.21636196087
+76,-3.05601728047,-0.159092449639,-1.70662323792,0.549119297817,-2.37237670193
+77,-3.41423377011,0.383317925933,-2.1179809253,0.440933949104,-3.43621876374
+78,-3.06397628245,0.127724697663,-1.50730821646,0.792369512789,-2.51400429958
+79,-2.46206559193,1.39197941174,-2.07070430872,0.0723755941552,-3.42215049587
+80,-2.95861031286,0.492643378852,-2.21356929084,0.219154069898,-3.50736459718
+81,-3.6878555302,-1.16408933615,-1.90916720564,0.805245613936,-2.52416993459
+82,-3.62030177826,-1.61633663455,-1.91163502791,0.769301736417,-2.32917764535
+83,-4.01196983047,-2.48802319211,-1.64295948963,1.24479580458,-1.62606743949
+84,-4.47183588705,-1.86895788707,-1.71275388306,1.32487605749,-2.04746598078
+85,-5.4955157643,-2.66146586745,-1.3125133071,1.83584422007,-1.92358421676
+86,-4.76151142281,-4.0158514346,-1.20747992628,1.77597998686,-0.701607451242
+87,-5.31155771019,-3.15539819753,-0.921016402738,2.05873025447,-0.960023072679
+88,-4.16798254912,-3.00934457399,-0.588731448912,1.90691848431,-0.651736302671
+89,-3.72133637608,-2.63534770736,-0.751807733258,1.73266437472,-0.747133247155
+90,-4.89252968501,-2.11988988053,-0.909281004084,1.89134204562,-1.12181355673
+91,-4.76565397768,-2.45166833079,-0.345448694534,2.12063951416,-0.63677850405
+92,-4.63858695121,-3.55046524931,-0.56519223243,2.27714507148,-0.758053667144
+93,-4.48414155394,-3.7219215509,-0.619311750125,2.23433435538,-0.438222718006
+94,-4.30438313321,-3.95244620189,-0.359853685703,2.27004922971,0.170025525215
+95,-4.7937387984,-3.3691885737,0.178884395796,2.86298118154,0.232096604308
+96,-4.77291737506,-2.52854276077,-0.345707006275,2.50163637851,-0.456607039838
+97,-4.62298139493,-2.29649895418,-0.490691361517,2.46124256727,-0.82515464821
+98,-3.96843790784,-2.7210597373,-0.847691030288,1.82818727491,-0.965708260824
+99,-3.52732312968,-2.13122777966,-1.48257599212,1.58444779334,-1.88600127741
+100,-3.93577934279,-2.96124673527,-0.853709706956,1.9254416322,-0.904048307626
+101,-4.3401932336,-3.10020192226,-0.20986281874,2.57273546,-0.632369678503
+102,-3.87139651112,-2.49995721503,-0.302048022933,2.45265811426,-0.582067583909
+103,-3.68490099559,-1.86926403845,-1.07896542406,1.98128996318,-1.52227363904
+104,-3.73795132118,-1.2813093908,-1.02231707609,1.9052598382,-1.69077029177
+105,-4.11227647638,-0.97321018636,-1.24214456086,1.66677538807,-2.01422017997
+106,-4.31128617249,-0.859790642205,-1.20138722257,1.46077710527,-2.03451018811
+107,-3.09504395205,1.00438274912,-1.97264240173,0.732150386166,-3.32632736368
+108,-2.681407939,1.44318461325,-2.39764334203,0.411009469585,-4.08770189209
+109,-3.06128776345,0.164570601827,-2.83505639278,0.0309828158983,-3.93363349737
+110,-2.96002558525,0.568530678109,-3.06826048982,-0.120518420294,-4.27282226734
+111,-3.19185112296,-0.595185852518,-2.71221607519,0.166579412962,-3.44187869033
+112,-3.81846994181,-1.83388142802,-2.90816681647,0.235773907539,-3.20090724714
+113,-4.18583974049,-1.97129111676,-2.74503137596,0.699189049686,-3.18638906104
+114,-4.15164488936,-1.58883698504,-2.38619384026,0.896062753518,-2.97261262165
+115,-3.94254676388,-1.70839450301,-2.81258836056,0.491894248181,-3.43646144606
+116,-3.96761881212,-1.55620520351,-2.33133041981,0.626269020808,-3.01677540027
+117,-3.50646136496,-0.950732274753,-2.35560895864,0.256356938967,-2.79127289896
+118,-3.25934857603,-0.732566929111,-2.41419969521,0.264090641824,-3.03291722901
+119,-3.42002240184,-0.415338127265,-3.07048729009,-0.0231731719376,-4.02562825774
+120,-3.53921857173,0.0158398656482,-2.70532905559,0.0565707967172,-3.84427790925
+121,-3.15592574046,0.784247799282,-3.58692679922,-0.242085258667,-5.06601908662
+122,-3.76809400203,0.463131255016,-3.8035033346,-0.429273348415,-4.98980268082
+123,-2.96288215704,1.77966617942,-3.32459939883,-0.551051151437,-4.93441116438
+124,-2.4962953866,3.39328322058,-3.03997171564,-0.216002386353,-5.18521353952
+125,-2.82557238612,2.76978657257,-3.23390042871,-0.0004479395837,-5.2176856873
+126,-2.58945498797,2.52368015772,-3.15380668865,-0.0639940636338,-5.26689084107
+127,-2.18245541691,2.46233728874,-3.33317082971,-0.281707789522,-5.35318328452
+128,-2.62315865429,1.51428800961,-3.62498863249,-0.270500998569,-5.2760566531
+129,-1.67705168949,2.67883021797,-3.17315867722,-0.341422601729,-5.19145005014
+130,-1.13323473613,3.3659551068,-2.9678582214,-0.429528041988,-5.25094097547
+131,-1.04105880247,3.50972732877,-3.11167783436,-0.456888273574,-5.24428811476
+132,-1.04773214914,3.82875614139,-3.04149583424,-0.371839445182,-5.61930173874
+133,-1.82094528277,2.83823742395,-3.33646912963,-0.322370252244,-5.61791889275
+134,-2.91804499764,1.8780201922,-4.06997503816,-0.498867961513,-6.17519153519
+135,-2.46795673425,2.31056167769,-4.50715795956,-0.766293318265,-6.953363658
+136,-3.0183856337,1.30296095196,-3.7714443347,-0.182133829714,-5.68593518503
+137,-4.1459367179,1.35223688484,-3.76595632938,0.0559619112277,-5.76499263486
+138,-4.26014113282,0.840287216995,-4.24154634876,0.0170854664832,-5.97212592449
+139,-3.71553554501,1.76361822382,-4.04901573444,-0.184615894262,-6.13019444273
+140,-4.98134444903,0.643221203145,-3.63502050732,0.732739839835,-5.31102037188
+141,-4.93231227313,0.451617863662,-4.18338313485,0.358996447266,-5.87642542398
+142,-3.93259210355,0.149657165734,-3.83132770968,0.269488626765,-5.34545674445
+143,-4.11814307699,0.721393968739,-3.59007389208,0.370660172423,-5.42955757642
+144,-4.23367298838,1.05376963285,-3.53969923896,0.614112257164,-5.48630859728
+145,-3.95869636495,0.472136852797,-3.18310972263,0.689673233548,-4.80901429311
+146,-4.04538674379,0.0738523975768,-3.99667527523,0.330477257534,-5.41593638963
+147,-4.1468026697,0.415399188108,-3.91404753507,0.0872810690256,-5.67680456497
+148,-4.15460923758,0.0717509451784,-3.9588831606,0.147162867399,-5.76814370589
+149,-3.57523193245,0.195417056414,-4.80410000105,-0.473719267884,-6.38176324697
+150,-3.51025861874,0.856118375832,-5.40027963164,-0.624382376552,-7.1359928798
+151,-4.38237447519,1.60011445758,-4.7521606876,0.0518728621325,-7.00004915101
+152,-4.27623318984,1.47324864364,-4.30781497365,0.124827291806,-6.54377411992
+153,-4.32194428155,2.06587484172,-4.20631276842,0.607399497692,-6.33547724603
+154,-5.00493869459,2.22966866366,-3.9061149897,0.696656417204,-6.17196154778
+155,-5.17550776754,2.21188658685,-3.99954696629,0.727786103955,-6.21657810996
+156,-5.24496674952,1.83542511918,-3.88063039409,0.75800286432,-6.40628509389
+157,-5.54708551632,2.82198533139,-3.84132303481,0.85671393851,-6.64762596715
+158,-5.16618921679,4.2567793887,-4.16745677914,0.477109844238,-7.43645189108
+159,-4.81521278827,4.45683428742,-4.34746873006,0.519627524599,-7.68745975295
+160,-4.8275925528,3.88330106165,-5.18631184892,0.100119923085,-8.75285457556
+161,-5.2511588662,2.8143123273,-5.12844694126,0.182664198339,-7.88009814396
+162,-5.61992128109,2.26792622321,-5.18009673261,0.414975247919,-8.16165288241
+163,-5.8000776885,2.80532060518,-5.0908514781,0.427068246658,-8.20088294419
+164,-6.19575329831,3.46018564451,-4.95019927132,0.464146926353,-8.03609530647
+165,-5.98656441427,4.12718110229,-4.69880353388,0.66144547648,-8.01893703451
+166,-5.90243834383,3.40320096162,-4.73707576726,0.831951793066,-7.9500544624
+167,-5.52097545345,5.42275420718,-4.70631335864,0.625767110232,-8.55065884765
+168,-6.3208950302,4.69282424685,-3.95049725277,1.158893991,-7.72947279651
+169,-5.84293846832,5.68653008392,-3.95380995318,1.03375698147,-8.176175529
+170,-6.22912432085,6.26160115626,-3.48554924886,1.83394247508,-7.61795902223
+171,-6.79635647476,4.81763687682,-2.88341433446,2.40218881576,-6.7290747344
+172,-6.35740862152,4.86326806909,-3.90426205429,1.82254002966,-7.9740801346
+173,-6.41833669784,5.48670890573,-4.25844101784,1.56886914101,-8.73388254897
+174,-6.59970408703,6.96889783847,-5.12329365495,1.27291511887,-10.0637459177
+175,-6.89356714328,6.82125797655,-5.35183724204,1.28221224054,-10.299517114
+176,-6.53648411794,7.59971661375,-5.67104067257,0.937822866785,-10.6853124564
+177,-6.34795240544,7.43562146373,-5.59471665917,0.687679154495,-10.7924921375
+178,-6.74375937622,7.84481794642,-5.77388691142,0.671113346933,-10.8776476236
+179,-6.27766821353,7.77861696919,-5.76077074367,0.546799950703,-10.9823512841
+180,-6.49156694365,8.4168114601,-5.10671295801,0.767265383419,-10.2992665517
+181,-6.93745307283,7.99884405249,-4.49778243184,1.5053842749,-10.0219833424
+182,-7.62614402526,8.28362768788,-4.8817171443,1.58756869903,-10.0113326358
+183,-7.45408528272,9.21062050653,-4.50836413806,1.55357765269,-10.36611078
+184,-7.57686652686,8.34558837325,-5.11443691137,0.963096126747,-10.5108606964
+185,-6.92122064974,7.51524254836,-5.57250467259,1.01375152529,-10.7968229459
+186,-6.72655857299,8.04758597168,-6.21552896393,0.446764941624,-11.2492272194
+187,-5.85311297855,9.03487849651,-6.69493774758,-0.195398222154,-12.3710872296
+188,-6.00899190946,9.38922023344,-6.97730266135,-0.380856322772,-12.2637154276
+189,-6.49145245774,10.1647973339,-6.28129203131,0.115144503479,-12.3743463557
+190,-6.27834310837,9.99838303926,-5.88823594787,0.137306166694,-11.9359447297
+191,-6.18887468225,10.0129537645,-5.36641590697,0.556318460862,-11.2236130627
+192,-5.38824249705,8.39926750316,-5.62662536761,0.112733314113,-10.8342041582
+193,-5.94700531263,8.45621383638,-5.72646583607,0.104835375975,-11.0102721795
+194,-5.88731938235,7.87130356003,-6.04562962765,0.0582654951822,-11.0785293513
+195,-6.12296915533,7.67676320126,-6.08425425732,0.128442314493,-10.870182626
+196,-6.76745956648,6.90599173991,-5.86846152418,0.366938513283,-10.7343594584
+197,-6.61306016783,7.05872603181,-5.47259551971,0.977253158729,-10.3434841626
+198,-7.09239521343,7.14941614266,-4.80874910844,1.01243852786,-9.66069391146
+199,-7.38673297862,7.43814214737,-4.44623639066,1.5645798163,-9.15539407176
+200,-7.48246373967,8.55940983389,-4.33638555438,1.2240147956,-9.53626954869
+201,-7.21850098183,9.1950295545,-4.730392637,0.874271961768,-10.091394825
+202,-6.9392406751,10.2887634467,-5.41488091389,0.18265008598,-11.3076106182
+203,-7.26584899566,9.12976316461,-5.74460604835,0.187796976379,-11.0112649197
+204,-7.51134356753,9.05338864894,-6.56081603752,-0.0561586757562,-11.7831914994
+205,-7.97118026399,9.76330823758,-6.26730117464,0.376533822272,-12.2031607289
+206,-7.45868593516,10.2530837275,-6.07391131954,0.364262738555,-12.0736691866
+207,-8.12691253342,9.24459713855,-5.97428342628,0.635535612617,-11.2507334533
+208,-7.20091748623,11.1013098384,-6.83778235396,-0.138780408438,-12.9427713046
+209,-7.61964438298,10.327417776,-6.98359476483,-0.0664752350588,-13.1302270255
+210,-7.79613275942,10.837752242,-7.28147308701,-0.474615333034,-13.3508962504
+211,-7.78521335775,10.3730194336,-7.35618333602,-0.289191229756,-13.6843532603
+212,-8.04127663087,10.6233080294,-7.48542165156,-0.644399871806,-13.6645997419
+213,-8.01756661016,10.391940659,-7.98425489911,-0.962726403901,-13.7924918661
+214,-8.59317772223,11.1119446083,-7.40280873564,-0.497346500246,-13.7378251017
+215,-8.38305522644,12.1987008902,-7.45623668664,-0.582090762387,-14.3403441054
+216,-8.67060276519,11.5901567318,-7.25709578849,-0.515554082904,-13.7864233282
+217,-8.349348253,11.8027493028,-7.54164271484,-0.813689735705,-14.348414961
+218,-8.78380465277,11.3381534207,-7.5020327912,-0.355159369441,-14.0982531397
+219,-9.3214734511,11.1679500639,-7.67333636,-0.4250652702,-14.3222456079
+220,-9.15694333638,12.3560660117,-7.92809167696,-0.588298226567,-14.5592229741
+221,-8.73166048156,12.9604680058,-7.43962374061,-0.292518475383,-14.3984790658
+222,-8.97336044919,13.74188509,-7.35793426906,-0.159817176776,-14.756374594
+223,-8.75144955694,14.0589520743,-7.54994899643,-0.462314767605,-14.7541319882
+224,-8.63086320415,14.0005000344,-7.51702697384,-0.397726224712,-14.6442062488
+225,-8.70762188928,16.0671192568,-7.00784449217,-0.396231113707,-14.8201329203
+226,-8.61581099463,15.7018046731,-6.94624241624,-0.350128048379,-14.5712260419
+227,-7.85516546328,16.5305738539,-6.36847585564,-0.0971574185121,-14.2732253652
+228,-7.44856034418,16.6129622221,-6.95911528586,-0.687249360667,-14.7132252284
+229,-8.29375420796,16.16225917,-7.06728376394,-0.718870475558,-14.686513317
+230,-8.46427112341,16.0560304351,-6.8922471382,-0.677120915865,-14.7286081861
+231,-8.88992293868,14.8914382303,-6.90414048177,-0.666284849292,-14.1391128579
+232,-9.31120795733,14.0459663856,-6.75198833777,-0.154635341816,-13.7273488654
+233,-9.37635859426,13.7418797754,-6.6410488689,0.0613764927805,-13.4681990315
+234,-8.80448017418,13.6388580502,-6.99969731685,-0.0900762552979,-13.8724315274
+235,-9.01006306752,14.2852668918,-7.25491354287,-0.409362384944,-14.382897651
+236,-8.57792765048,15.7096818747,-7.00877065001,-0.595308753776,-15.0260436319
+237,-8.60184033492,14.3796851876,-7.19521914664,-0.550311620755,-14.265822895
+238,-8.94163840381,14.3183598613,-7.0359158869,-0.400976116498,-14.2532990052
+239,-9.3070203095,15.5652000641,-7.95128252992,-0.639667873342,-15.5896992352
+240,-8.82016405663,15.7330800842,-7.86869074318,-0.933230358533,-15.7089395104
+241,-9.4856501791,15.7912223759,-7.80735482929,-0.668568530685,-15.7856988097
+242,-8.9900527358,17.3718918969,-8.14598138032,-0.891178246628,-16.4867759227
+243,-8.88752095529,17.0948456317,-8.65523789694,-1.19384456759,-17.2502909815
+244,-9.57142544421,16.3720586957,-8.25086719699,-0.707862475354,-16.3197585858
+245,-9.48896482888,15.6989106851,-7.66201034479,-0.619600466391,-15.917874378
+246,-8.79593217567,15.5835619365,-8.24452537259,-1.12780083711,-16.3627709762
+247,-8.57322176601,15.8140158209,-8.52836593325,-1.08463712668,-16.5510991233
+248,-8.71874430208,15.3577624571,-8.62679442048,-1.37193794843,-16.5463190818
+249,-9.33830295829,13.0154673362,-8.49285848946,-1.00041668691,-15.6359758613
+250,-8.83607864155,13.574529239,-8.73003933687,-1.0069409092,-15.8121118717
+251,-8.73419046103,14.1413919698,-8.45361543575,-1.05530996662,-15.7008095844
+252,-8.51250301902,12.8454604082,-8.27395071761,-1.13899488147,-15.3212622241
+253,-8.42164535712,11.8453765781,-8.50516745095,-1.42570410787,-15.117382849
+254,-7.93476425082,12.2082524385,-8.46929344259,-1.46632372125,-15.0349582594
+255,-7.53426932033,11.3727331533,-8.77796978976,-1.76328714062,-14.9524826889
+256,-7.66194424498,10.6074570685,-8.02922988632,-1.42416786209,-13.9694181048
+257,-7.87020349787,9.28265577492,-7.82107325808,-1.31988163397,-13.7307340377
+258,-8.18580557787,9.65222218113,-8.53500645205,-1.30714240789,-14.2405722581
+259,-7.88996563906,10.2870025349,-8.4651922959,-1.4478220229,-14.5529501083
+260,-7.10369597376,10.1606032203,-9.39594661371,-2.33084317893,-15.1718326936
+261,-7.09484890298,10.809308086,-9.00822404135,-2.35340975254,-15.1491036286
+262,-6.97923701083,10.894171574,-9.34356851911,-2.3552075855,-15.3790138989
+263,-6.54815714427,10.5558119327,-9.60943596808,-2.83709923394,-15.6984261838
+264,-7.52466864813,10.6074764226,-8.81546833145,-2.12697832338,-14.8605232114
+265,-6.68068650388,11.4325142332,-8.6458933814,-2.0858974939,-14.7757513859
+266,-6.72542130087,10.7652185601,-8.92488578669,-2.55857898751,-14.8408148286
+267,-6.36853891919,10.0330817319,-8.06704438026,-1.97650693552,-13.7565890029
+268,-6.75972177878,9.46254218998,-8.17519398045,-2.0356655338,-13.7296226176
+269,-6.72830368775,9.95996574271,-8.09966107306,-1.88096455638,-13.4537590725
+270,-7.25281069894,9.00039802252,-8.43087794948,-1.59518807804,-13.8324318848
+271,-7.34676646312,9.2332963445,-8.87758902215,-1.91851145778,-14.2402770285
+272,-7.60555427514,10.3683663569,-8.62960432947,-1.8715362348,-14.4440048398
+273,-6.34609526555,9.56457806807,-8.82058757395,-2.32053449485,-14.2312523277
+274,-6.27500527223,9.7769411337,-8.50044891775,-2.04382988555,-13.8562530462
+275,-6.27120764234,10.1217570647,-8.18775047076,-2.10760833342,-13.8255438186
+276,-6.49987895418,9.1911629968,-8.45083088562,-2.08796801275,-13.9944685673
+277,-6.35046807084,10.2752194384,-8.69121785385,-2.11255263809,-14.3500695934
+278,-6.66043619212,10.0482845632,-8.07443586272,-1.68107376781,-13.7766137644
+279,-7.30234454298,9.44138356831,-8.48265665138,-1.99144464045,-13.9109921893
+280,-6.66700071077,9.75725682908,-8.50659322657,-2.55844250149,-14.0404929251
+281,-7.17284956409,9.3309388284,-9.02423273637,-2.65154399393,-14.4188418891
+282,-7.75673997054,8.86248796819,-8.93104147921,-2.18942615812,-14.2328347205
+283,-7.9307794564,8.30911121619,-8.82805191069,-2.18434065675,-13.9540073488
+284,-7.94515487849,7.16633108103,-8.93261674774,-2.05258990224,-13.921181043
+285,-6.89073732868,9.0669674291,-9.27322930982,-2.69984976871,-14.2888970314
+286,-7.68149430413,9.526019294,-9.37317120005,-2.50239215959,-14.8762473099
+287,-7.87506432698,9.55682993661,-8.73916120297,-1.98556331574,-14.4020632243
+288,-7.9308171581,9.27936808462,-8.96875334983,-1.99200813219,-14.4681893019
+289,-7.71933923761,9.52466498356,-8.57369359544,-1.97148092967,-13.9167103945
+290,-8.16797108102,9.0930328534,-8.36464379169,-1.63034152109,-13.6595520431
+291,-8.1816768367,8.32626244106,-8.61943531951,-1.78181836368,-13.9995756587
+292,-7.9645109582,7.06701926668,-8.44493703906,-1.74330898922,-13.4519493468
+293,-8.70877689778,6.488141635,-8.5036551661,-1.33518463487,-13.1214932344
+294,-8.21130712747,5.89761163474,-8.67487122629,-1.22358402498,-13.1215056619
+295,-8.11184176326,5.69659470364,-8.35843148429,-1.25345246377,-12.7367609041
+296,-7.59679352613,5.29345996205,-9.14700078738,-1.89139702176,-13.4180099999
+297,-8.14593559557,6.0823772954,-9.17188932674,-1.79373143414,-13.5899514482
+298,-8.25434689916,5.67908362977,-8.67798309869,-1.50536157367,-12.9812208909
+299,-8.31992334944,6.01770976425,-8.36447676215,-1.62051875941,-12.7740048704
+300,-8.55973573267,6.24266175088,-8.38454488839,-1.48289072379,-12.9518594323
+301,-8.07098868066,7.13097695654,-8.70841035451,-1.81933335558,-13.4692357278
+302,-8.00738027817,7.48517128583,-9.1175107329,-2.08638442837,-13.6289945429
+303,-7.67555797247,5.85535456848,-8.57835902147,-1.99672216847,-13.4519936916
+304,-7.17113560979,7.897123317,-8.58077837481,-1.9849486514,-13.5609516831
+305,-7.74501395333,8.23761870152,-8.02337810445,-1.80279122296,-13.1100399173
+306,-7.83553877289,8.08692810618,-7.79517677,-1.54128934086,-12.5675110288
+307,-7.59936150426,8.13782680576,-8.1637459926,-1.81708440891,-13.1358857094
+308,-6.8797520041,8.40385878651,-7.8875084341,-1.92904732502,-12.8213860036
+309,-7.33433976102,8.08088891939,-7.77382848857,-1.76305237547,-12.717040465
+310,-6.68713926075,7.72749856298,-8.21678039046,-1.90270285054,-13.2127751796
+311,-6.70434428608,6.61533749738,-8.9877239181,-2.35288063661,-13.7545458859
+312,-6.62325074124,5.36985664044,-8.53357076753,-2.07858885262,-12.6706567245
+313,-6.36793176198,4.67411807807,-8.93003824436,-2.48360228543,-12.9012562645
+314,-7.4251367394,3.76718041663,-8.75924948942,-1.98051655899,-12.3486642241
+315,-7.74323298007,3.9002170746,-8.70997761306,-2.02075466455,-12.4530152562
+316,-8.16195993167,3.05809953596,-8.37083266122,-1.46263058118,-11.8299483584
+317,-7.76294402749,3.33339938209,-9.0169500054,-2.01322423303,-12.7246180699
+318,-6.77300251583,4.1381755879,-9.96437118554,-2.64977253693,-13.9334170269
+319,-5.68919346979,4.15024496509,-10.1822536661,-3.02376284478,-13.8262785847
+320,-5.6660630642,3.32349919292,-9.56251399298,-2.66613135588,-13.2866909086
+321,-5.77263052353,3.54761446413,-9.59000948626,-2.77468082443,-13.340576542
+322,-5.59178974833,3.79160216461,-9.32578298592,-2.64936489619,-13.0391774098
+323,-5.9441302159,2.93665441759,-9.24012497249,-2.44269179206,-12.8489809931
+324,-6.22776210297,3.01137237874,-8.99107354733,-2.06959185216,-12.6158805468
+325,-6.04632024125,2.74512311451,-9.31973261204,-2.2294248457,-12.8887092527
+326,-6.39429216352,2.58110958534,-9.31349087742,-2.13489797997,-12.5581005801
+327,-6.35202191405,1.94057814029,-8.81292014758,-1.99170382564,-12.3683404001
+328,-7.30664497735,0.646753225813,-8.44332778639,-1.43134623832,-11.0274594572
+329,-7.03922913692,-0.172030452415,-8.08523119806,-1.33238525219,-10.4035148775
+330,-7.638767446,-0.215797324305,-7.58528456832,-0.535244386178,-9.94757304346
+331,-7.69345658182,1.0095577025,-7.42572493956,-0.764915839276,-10.0734793193
+332,-7.9740779231,1.24699877584,-7.43457837986,-0.695000777285,-10.3275779811
+333,-8.13450555948,1.0851730114,-8.26439951721,-1.18285132409,-11.1064610788
+334,-7.64604201635,1.17190534668,-8.32846475808,-1.26428583568,-11.1978145063
+335,-7.86227590763,1.23644329355,-7.98241891083,-1.01881893793,-11.1019154916
+336,-8.07392084404,1.29285054701,-7.75112983568,-0.887180853053,-10.3990134854
+337,-7.77074799392,1.33571271474,-8.17261149451,-1.2961960474,-10.9697625056
+338,-7.54518040047,1.35571580111,-8.36968178671,-1.63184603769,-11.3848167619
+339,-7.26396147425,1.25194427727,-8.2594101378,-1.84017201169,-11.0964468548
+340,-7.21692660812,1.17463895033,-8.24530098759,-1.72745172455,-10.9309613177
+341,-6.96431233538,1.39566590865,-8.21893296452,-1.99333090699,-11.0208185194
+342,-7.22518846005,1.78587044509,-7.89452043194,-1.54125303747,-10.5967924562
+343,-7.41977517323,2.09881464935,-8.15550645686,-1.4683919704,-11.0518180815
+344,-8.65233938538,1.83137629284,-8.95072994464,-1.91710308696,-11.8900397288
+345,-7.96531007734,2.84497555763,-8.41060200443,-1.74403434745,-11.5965730615
+346,-8.17512800379,2.99324435503,-7.6687530415,-1.33699922955,-10.8538569021
+347,-7.66730341149,3.81846474142,-7.27749787852,-1.13206673388,-10.581276293
+348,-7.99246315234,4.44538302363,-7.3661376177,-0.847999173145,-11.1595182428
+349,-8.09542213161,3.62834552547,-7.03258734543,-0.634758408362,-10.356529982
+350,-8.27277001696,2.83535037977,-6.86005974941,-0.403647888916,-10.2592523154
+351,-8.26342778767,1.59305662629,-6.74207121258,-0.0105493464392,-9.70449602712
+352,-8.96298112824,0.937331350565,-6.87160234577,-0.0947280487254,-9.39585530657
+353,-8.37289297775,2.23714548944,-6.50881832377,-0.313517794165,-9.52361761042
+354,-8.72558987342,2.15129451527,-6.63801943091,-0.30500141967,-9.7663517167
+355,-9.69325911088,2.26938504001,-6.58079800818,0.23540993923,-9.63629749511
+356,-9.32607150829,3.26416240131,-6.44532904681,0.0140546277237,-9.68575588712
+357,-9.45213458555,3.39206029738,-5.64506189906,0.580562413607,-9.06915234442
+358,-8.79778827057,4.43827280973,-5.62252020993,0.374243769749,-9.27801372635
+359,-8.30978473075,4.33244702297,-5.82748073003,0.00249565593622,-9.30666107791
+360,-8.40294793053,4.37059764263,-5.61862134091,0.425251368895,-8.95415809954
+361,-8.30590324107,5.03925469249,-5.13844936139,0.0120098731111,-8.98056487952
+362,-8.76442947313,2.88260536591,-5.47206586821,0.0107759697454,-8.49820042931
+363,-8.10484566069,3.4190123307,-5.61698767617,-0.115206338007,-8.85192466698
+364,-7.82034518727,3.31449030434,-6.16597727057,-0.77011725995,-9.27594400942
+365,-7.5371576461,2.53681231277,-5.54995506174,-0.486611898012,-8.25319686113
+366,-7.77279623646,2.16250984029,-5.88440952329,-0.481927501937,-8.51532364379
+367,-8.15782617037,1.33003334758,-5.37147430649,0.0658360700383,-7.53393410332
+368,-7.83253589206,0.437254273227,-5.60155383897,-0.208864608532,-7.57863904568
+369,-8.26990979582,0.824040921398,-5.76367892903,-0.33466404889,-8.14395492128
+370,-8.54100293687,0.934081863545,-6.06588862301,-0.318402102047,-8.06765646531
+371,-8.10309392539,2.04844276819,-5.88457436783,-0.203370433555,-8.24963880732
+372,-8.55713704274,2.66130315091,-5.60092150186,-0.247773585611,-8.51034862129
+373,-8.97491690133,2.63089653151,-6.19344763285,-0.372412848934,-9.01424694103
+374,-9.46440345525,3.27883707552,-6.14904204358,-0.130038700067,-9.09291725832
+375,-9.76852837167,2.22005774597,-6.64842922659,-0.638330195338,-9.40888660449
+376,-9.81343356342,2.54796033493,-6.54654622474,-0.466846945321,-9.37728410355
+377,-9.90595390825,3.11472798511,-6.08319228033,-0.276385566216,-8.89707105564
+378,-10.0606978494,2.92644277261,-5.90574021754,0.0367629974542,-9.14893104395
+379,-9.50652925057,3.28266883242,-4.9613208755,0.342209248718,-8.10537624928
+380,-9.58418665872,3.8766496612,-5.24830100967,0.359084238186,-8.19590693663
+381,-9.89401168579,2.06957552028,-5.63523655415,0.145310208416,-8.19634691258
+382,-9.65468123662,2.02909735743,-5.32637534899,0.423842420979,-8.06404805054
+383,-9.79396204752,1.54333684439,-5.19759372657,0.602031889142,-7.71507075773
+384,-9.88286541318,1.78013548446,-5.53743142504,0.282331702022,-7.92410392346
+385,-9.48530868847,1.92025383026,-5.46791292084,0.225389019431,-7.94717342937
+386,-8.55400180014,2.84556540415,-5.22055031165,0.00423736484871,-8.25373689445
+387,-8.40696358222,2.42245234412,-4.72874143522,0.315896235064,-7.36916602825
+388,-8.14322786372,1.30858876418,-4.82083008604,0.424401643227,-6.98000265843
+389,-8.16291600017,1.69186634645,-4.51155283419,0.291003204101,-6.70316047617
+390,-8.53091874205,1.27144221699,-4.50718964342,0.840421450226,-6.68204395158
+391,-8.66087628811,1.48490925703,-3.97132998311,0.875778086732,-6.30483045504
+392,-8.40002328914,1.38502525426,-3.75445926445,1.18526369322,-5.73899118538
+393,-8.27669129654,1.13692236299,-3.86856389371,0.850751134598,-6.09108617681
+394,-7.31317837859,1.76308330778,-5.1213071419,0.114118785201,-7.32826075836
+395,-7.25022422516,2.96161225375,-5.62420024952,-0.732659758151,-8.33834224117
+396,-7.86073669368,2.84663589523,-5.85914635384,-0.387925476411,-8.77189288818
+397,-8.14796647926,3.38900309609,-5.7991352622,-0.527063650768,-8.83085023799
+398,-8.56664316014,4.60772362136,-6.12900909676,-0.398177419983,-9.53720460939
+399,-8.61231338753,4.92462337169,-5.78979304588,-0.453294363648,-9.30874415042
+400,-7.70873173115,6.16721620762,-5.85656227757,-0.239444088884,-9.77125080503
+401,-7.26972385405,5.61093419903,-6.30580446895,-0.949776276452,-10.1532343556
+402,-8.18118410514,5.08660551332,-5.79117039661,-0.422191796007,-9.4646787436
+403,-8.39302835758,5.0186308153,-5.98055682464,-0.286398046377,-9.77303263751
+404,-8.45120564479,5.43622599565,-6.01100741435,-0.0924211653169,-9.66587896574
+405,-8.77717487724,4.57784356666,-5.42815805348,0.274359490904,-8.69207026548
+406,-7.99762789661,6.97906775455,-5.75459747037,-0.325703879872,-9.92401567847
+407,-7.47412209775,8.12110138125,-5.43940822872,-0.334143670966,-9.88979621842
+408,-7.5076625296,8.16014031735,-5.39546303451,-0.66865947211,-9.80992172332
+409,-7.56219875714,9.34156848616,-5.54616791243,-0.681005635408,-10.1546704081
+410,-7.63574817061,9.31637788592,-6.00730915613,-1.06318316396,-10.6843343988
+411,-7.65555736237,8.59215891383,-5.71515975211,-0.546468229129,-10.2026101588
+412,-7.66567019955,8.2000572421,-5.28553211424,-0.346009703564,-9.74519606652
+413,-8.59988941769,8.81778864354,-5.60867089774,-0.173846094319,-10.5147296075
+414,-8.28473746502,9.73446559574,-5.52025007461,-0.51324797308,-10.6856067909
+415,-8.0514735486,9.87640655675,-5.28097052559,-0.483061911563,-10.2841109492
+416,-7.50275965972,10.66586562,-5.40068269393,-0.721122223517,-10.5494663524
+417,-7.38409558973,9.84989909125,-5.51700728169,-0.551493987826,-10.6729821817
+418,-7.89270233241,9.4678135448,-4.80993870943,-0.0410824462506,-9.81241582448
+419,-7.73693337744,9.74385841501,-5.06706350035,-0.227931465969,-10.2169061939
+420,-7.25289280104,9.75097055911,-4.68360735941,0.0725458155731,-9.63680971649
+421,-7.85447518349,9.41684052024,-4.78934582494,0.0277483434469,-9.916599754
+422,-7.21944245027,8.15535815451,-4.33769370049,0.224186164904,-8.65592658869
+423,-7.40298953781,7.81105918663,-4.73301890265,0.106922482561,-8.94538459399
+424,-7.73029367255,7.60646949382,-4.68875474343,0.166924934716,-9.05172100717
+425,-8.05324202277,7.27356876639,-4.41979641869,0.130739269101,-8.59688325423
+426,-7.81084501956,7.7930148114,-4.29311237575,0.405559230994,-8.54447769864
+427,-7.53417549554,6.67778131612,-5.00513140275,-0.326244682892,-8.99173502529
+428,-7.7357111047,7.62045657247,-4.82306327693,-0.132391671396,-9.11775513748
+429,-7.7911062034,8.88230287703,-4.42487324004,-0.113841966333,-8.88818802495
+430,-7.83530916906,8.96587714813,-4.69677204484,0.0187803889705,-8.97989947253
+431,-7.42014546711,9.52102154038,-5.213329279,-0.666141912715,-10.2570045187
+432,-7.90113403405,8.94855601903,-5.43017559315,-0.416684186876,-10.2555224557
+433,-7.77056066986,9.67406943057,-5.27412712919,-0.330577152846,-10.225851485
+434,-7.53729176639,9.09677751245,-5.44013208696,-0.378973572394,-10.1139227671
+435,-7.90999682194,10.1914292853,-5.73555243988,-0.669245398375,-10.7891552325
+436,-7.07468896027,10.2961189,-5.83629197868,-1.20419418996,-11.087497576
+437,-7.76007059592,10.3100931082,-6.22272952033,-1.07571510279,-11.1875318833
+438,-7.96264221574,10.0917413626,-6.61961853455,-1.31866473913,-11.5703014082
+439,-7.70634352868,9.48925131874,-6.02747636691,-1.20374307985,-11.0233131669
+440,-7.92952012287,8.176289281,-6.25078541743,-1.213336004,-10.9163228899
+441,-7.61000038351,7.60002390721,-6.58005538523,-1.60599428606,-11.0160160331
+442,-7.71997695775,6.95657228664,-6.64504615318,-1.18628364509,-10.7089656534
+443,-8.45792022606,5.68756171623,-6.11406348849,-0.455851618868,-9.96455616534
+444,-8.63704101162,4.52356954888,-5.76194770581,-0.0735564578827,-9.32882534384
+445,-8.83852254797,4.59575340633,-6.26772767449,-0.304832549365,-9.89891221823
+446,-9.28388744959,6.76240785433,-5.62508526868,0.0406823127046,-10.1534140373
+447,-8.99402212503,6.05994562464,-5.1722868443,0.183630881805,-9.22305412258
+448,-9.46860166801,6.33427372021,-4.56946478971,0.81492218602,-8.3865172097
+449,-9.44904192017,7.10371649114,-3.84718258557,1.14522132528,-7.83567030285
+450,-9.30068905891,6.26780398737,-3.76264645987,1.05703889768,-7.60883361447
+451,-9.05363785907,7.08639425464,-3.5723518982,1.19752274819,-7.77917735641
+452,-9.45011447489,7.69958976553,-3.65613567541,1.26177311127,-7.86654932962
+453,-9.37828458035,7.90189506739,-3.18362808079,1.26932312211,-7.57080295941
+454,-9.00342903927,8.59705538568,-2.74452614018,1.41558921167,-7.40040203318
+455,-9.30866334623,8.92724332929,-2.82863765568,1.8799201478,-7.44004070415
+456,-8.80460535166,9.35788488726,-2.88301106748,1.46270161512,-7.76989572533
+457,-7.83491155484,9.05252174014,-2.61219108646,1.243057125,-7.23600233632
+458,-8.00626585156,8.99523619937,-3.24749002239,0.984196902256,-7.88996734183
+459,-8.28543558439,8.89861964252,-3.22518774513,1.45142473469,-7.76297147151
+460,-8.6252064253,8.86482028319,-4.1729426727,0.564955757709,-8.66786554543
+461,-8.86659274988,9.04428287641,-3.93088339641,0.760658009952,-8.65573753271
+462,-7.83673197153,9.67167516954,-4.45977710346,0.202469477781,-9.27234575235
+463,-7.61283804644,8.77427878023,-4.32300177412,0.211056749915,-9.39567839173
+464,-7.97006867205,9.2437852131,-3.91591221507,0.551959380889,-8.70345380127
+465,-8.12954182258,10.5250892462,-4.0265848175,0.620954610909,-9.2153873358
+466,-8.39415189099,8.79267278439,-4.24443524754,0.617886880166,-9.01116625927
+467,-7.75939747441,10.2206349381,-4.32542056965,0.0382391624687,-9.55060037024
+468,-7.3878455675,10.5931317384,-4.67966200294,-0.32270383705,-10.018580555
+469,-6.62747411302,11.3852625973,-4.11973958456,-0.138222869787,-9.61881878072
+470,-7.32967370223,11.436494309,-4.43570985115,-0.0384019039489,-9.76173887141
+471,-7.60119371513,11.7167191169,-4.29435287095,-0.138608189796,-9.94846811681
+472,-7.56984590221,10.6323592942,-4.83278406931,-0.554011667964,-10.0689564692
+473,-7.98024487248,10.3345816287,-5.21855708438,-0.345941788809,-10.2299248207
+474,-7.39348086618,11.0032546009,-5.59831945775,-0.989658264728,-10.9366152086
+475,-7.29383837744,10.8331985202,-5.69579470023,-0.914770698374,-10.9477323007
+476,-7.06461535502,11.1040234457,-5.77558791439,-1.25593234237,-11.1211327558
+477,-6.88896938016,11.5735224439,-6.39115832553,-1.7624374916,-11.9493852064
+478,-7.55671754218,13.4943718782,-6.46196393323,-1.65053404919,-12.4846864751
+479,-8.38609650853,13.1134194359,-6.76328291227,-1.73090050038,-12.8709323161
+480,-9.04284843042,14.5239363857,-6.74779448588,-1.73952140978,-13.0705720039
+481,-8.60716065284,14.8954501626,-7.1352385002,-1.98104858032,-13.5983782215
+482,-8.72574238064,13.7802052507,-6.98589932587,-1.97446305837,-13.338191356
+483,-8.73482675513,13.8210408929,-7.75921259593,-2.41182448906,-14.0620638562
+484,-9.06670465571,12.5822383107,-7.16145451299,-1.76309768158,-13.0663818785
+485,-8.33823360512,12.7726425667,-7.79413096054,-2.49821005812,-13.7482397933
+486,-7.80803101857,13.6969917396,-7.6772874059,-2.86938488621,-13.9728420688
+487,-7.90706317854,13.6001196399,-7.55718412173,-2.55031853651,-13.6750813602
+488,-7.70543957218,13.7961012533,-8.13150113839,-3.12691380322,-14.3613180308
+489,-7.46745148663,13.8870335928,-7.98469677293,-3.15082657539,-14.031984076
+490,-7.56894929039,13.529133025,-7.86581980201,-3.4904976915,-13.9537179422
+491,-7.90817321347,12.6224920923,-7.76928527731,-3.07432375361,-13.3906077063
+492,-7.74395775214,11.9421206595,-8.2383068654,-3.54607771194,-13.7826114429
+493,-6.89149021797,11.1459154364,-8.84651776388,-4.02705388465,-14.2798531336
+494,-6.83004452131,11.3949568352,-9.02745233278,-4.09550125277,-14.2779559998
+495,-8.20499265384,9.92810262799,-8.23624802267,-3.09030964092,-12.9438913323
+496,-7.90265306486,10.3879136248,-8.47991644205,-3.42574193544,-13.5207488054
+497,-6.99430831807,10.2745360103,-8.63073093963,-3.48171940609,-13.6908123216
+498,-6.5341483316,10.8149075985,-9.23225598149,-3.82382844706,-14.4066704898
+499,-7.22665817291,11.1019560664,-8.95168258846,-3.53128619196,-14.1223733425
+500,-7.86277256547,10.60752055,-8.45029118876,-3.0795014755,-13.8857299247
+501,-7.62848920276,11.4835661568,-8.6249884299,-3.37504848417,-14.4473529862
+502,-7.60851695553,11.4903822799,-8.42751680283,-3.17082651098,-14.151223807
+503,-7.41243008797,11.9379955165,-8.55682214019,-3.15903468408,-14.4977835601
+504,-7.64888210927,12.0776474255,-8.30432405903,-2.92603812562,-14.1611867869
+505,-8.6741774247,12.5280743316,-8.17741652592,-2.39848547493,-14.1288630903
+506,-8.41238889407,13.451442609,-8.03745344589,-2.83762630479,-14.5714832036
+507,-8.77066432459,13.8056270205,-8.52337183467,-2.92202821614,-14.8586924241
+508,-8.56343769158,12.9475210923,-9.14684844463,-3.26053368467,-15.6518225187
+509,-8.464455479,13.7823831968,-8.58680769101,-2.65905012676,-14.9408435091
+510,-8.85276625908,12.3419527514,-8.25968756831,-2.65926448761,-14.148178069
+511,-8.74103528242,12.492267677,-8.16660354159,-2.79576840443,-14.0153826665
+512,-8.6953577304,12.2037388265,-7.48737064047,-2.34766806188,-13.5043309345
+513,-8.31867578197,12.8511848476,-7.63850544137,-2.31687292089,-13.4887027518
+514,-8.72089212441,11.3608422123,-7.15361856699,-2.26748317198,-12.6046672194
+515,-8.97049831731,11.8138559199,-6.88358866448,-2.22921272844,-12.8922632333
+516,-9.03794706422,13.1022633982,-6.90344107633,-2.04962429964,-12.7236624824
+517,-8.76051670559,13.4409848492,-6.58883983792,-2.05127921863,-12.6074998148
+518,-8.91338039346,12.6519258915,-6.64030013965,-1.82264899838,-11.980924941
+519,-9.53039972035,11.9926068536,-5.73664271767,-1.19906734984,-11.3024098119
+520,-9.56933539459,12.4738179657,-5.36522119534,-1.27302916004,-11.3220620935
+521,-9.27093923233,12.1809713568,-6.05137153051,-1.56542168147,-11.6892131865
+522,-9.10470381783,12.8443664375,-6.17461429576,-1.70650623113,-12.2023759588
+523,-8.70481209955,12.8757796631,-5.62283784861,-1.02909920774,-11.3078745886
+524,-8.36832904481,14.0887254045,-4.54259677738,-0.756947847904,-10.7469425555
+525,-8.27192838772,13.5284975888,-4.98814929013,-0.982554080124,-10.9537711845
+526,-7.81605932555,12.3349289299,-5.04685757359,-1.063642532,-10.4764999707
+527,-7.62519968994,11.762425275,-4.95656489865,-0.900379743579,-10.276332642
+528,-8.1882585754,11.3375643458,-4.91191857095,-0.807628368749,-10.0569120581
+529,-8.45941601411,10.454784064,-4.74170003843,-0.542878703538,-9.73767739866
+530,-8.73117345399,11.4131380545,-4.28437756707,-0.327483941886,-9.51709313378
+531,-8.68143454688,12.1767437654,-4.44634362562,-0.355483985965,-9.99097182509
+532,-9.07184340599,11.2145931578,-4.61698978583,-0.283813726328,-9.90119512012
+533,-8.36517669787,10.90191099,-4.73905011775,-0.0679401761346,-10.0480795818
+534,-8.12854443861,12.7793891363,-4.60968203496,-0.684053053687,-10.4959523652
+535,-7.82630271147,13.2889108151,-4.03395603121,-0.0526246773507,-9.65707971333
+536,-7.13754030246,12.33305472,-4.62251351717,-0.513591809941,-10.0506041372
+537,-6.79153765699,11.5254685522,-5.28642436084,-0.796837909631,-10.7505435107
+538,-6.61249938734,10.5939083448,-5.29295891382,-0.916465477084,-10.3119753157
+539,-6.83068917195,10.4725693866,-4.36085444131,-0.368642493868,-9.382408942
+540,-5.6249721521,11.927704318,-4.24339737787,-0.619606917631,-9.81949407439
+541,-5.82758001032,12.6964738173,-4.03365702004,-0.416245692851,-9.38431980968
+542,-5.46903104807,13.2779186097,-4.12343285594,-0.682641438986,-9.87646142634
+543,-4.97778064306,13.4885100764,-4.2759284571,-0.793641438115,-9.72648765469
+544,-5.0560312196,14.051502874,-3.71367375886,-0.489199656744,-9.81689350133
+545,-4.55187916276,14.2140313756,-3.70650309796,-0.655470642907,-9.73140394667
+546,-5.04254752385,13.6516363661,-4.30195859918,-0.728160625021,-10.180321458
+547,-4.26246022655,13.9133421333,-3.84133734094,-0.583289916747,-9.97918976505
+548,-4.30214989462,13.1366364502,-3.81401849127,-0.608200897574,-9.74794696739
+549,-4.35823825962,14.4217154288,-4.46489684024,-1.00864119121,-10.7181584852
+550,-4.32602595593,14.5305788233,-3.98466464503,-1.03214650642,-10.1469047302
+551,-4.02879976477,14.7398679005,-4.06644601558,-0.721154120771,-10.2467158925
+552,-4.27308512946,14.6201722033,-4.55144033265,-0.952365352585,-10.747980141
+553,-4.28874816482,13.7473321846,-4.79460329121,-0.955454563345,-11.1410125472
+554,-3.74881459131,14.20530683,-5.36614021922,-1.34080791792,-11.8195624562
+555,-4.20199718521,13.5226440531,-5.81821952939,-1.38950062498,-11.9642409495
+556,-4.40355016096,14.3638955592,-5.87343422489,-1.50608877181,-12.4185984184
+557,-4.45454402232,15.042505325,-6.1696896542,-1.68202900454,-13.0148345617
+558,-4.31821523634,15.3293822395,-6.2316188467,-1.90132290723,-13.2911360142
+559,-4.04312675663,15.6577447607,-6.72475537859,-1.91372353818,-13.6463671282
+560,-3.42184078061,15.3309173613,-7.40708003453,-2.55459473853,-14.3931190219
+561,-3.46542523055,15.5265808332,-7.15205396266,-2.41805844221,-14.4278386867
+562,-3.78797354277,15.2357698666,-7.24297919198,-1.98435074217,-14.2522705939
+563,-4.20737944965,14.4814608454,-7.09199259694,-1.95163465121,-14.1030466685
+564,-4.54269522253,12.2715493338,-7.654516447,-2.1202464768,-14.0360563518
+565,-4.47303696956,12.2247565117,-8.17007775873,-2.19147781262,-14.8423105568
+566,-4.84128186341,11.8935839328,-8.38936181625,-2.18331007359,-14.448125846
+567,-5.12214988866,11.5443487689,-8.47707379112,-2.32353455782,-14.68071878
+568,-5.25781192718,11.1259589377,-8.27449247298,-1.89947257662,-14.1643465599
+569,-5.28658831291,10.7458012983,-7.88058968271,-1.50991744653,-13.7810715039
+570,-4.95171525155,10.2711007527,-7.59925773373,-1.55919448714,-13.5703836116
+571,-4.79287660952,9.96925879248,-7.58528757465,-1.68185852437,-13.4275989095
+572,-4.91077459307,9.35293272808,-7.34826410505,-0.994021443761,-12.632467089
+573,-5.02873222517,9.7990983064,-6.91338823615,-1.01585391571,-12.5090859382
+574,-4.62672655999,10.4287748512,-7.19565884292,-1.06561909499,-12.8337233374
+575,-3.9909839471,10.5360769285,-6.3867871409,-1.06884544163,-12.4760850685
+576,-4.12511021435,10.865201491,-6.63547505921,-0.909982534914,-12.5158359497
+577,-4.27063822465,10.7290321497,-6.91243856629,-1.30452901855,-12.9756509074
+578,-4.56000092545,12.508475048,-5.96571115089,-0.625251071604,-12.3235128486
+579,-5.20045551281,11.7188498044,-5.88311399809,-0.454483225893,-11.9946006971
+580,-5.94018271083,9.90268294036,-5.60932152216,-0.0347275761663,-11.3316152655
+581,-6.35766764757,9.20913413382,-5.2576362716,0.162587884754,-10.849867214
+582,-6.72627744267,8.18433300416,-5.29058658023,0.524518988078,-10.3179059859
+583,-6.51015900161,7.91043597658,-5.02336734175,0.475116906637,-9.8179849834
+584,-7.25652757603,7.80180711227,-5.14917539965,0.684433012121,-9.93995920855
+585,-7.44035353094,8.20685317741,-4.9611278416,0.810327631401,-9.84229115563
+586,-7.62766364006,7.92427710556,-5.1492800484,0.748908880878,-10.141460514
+587,-7.70178958959,7.63113921284,-5.43877867225,0.752647825505,-10.4246339377
+588,-7.49106609626,9.14308677063,-5.48085850864,0.635582264394,-10.7402738296
+589,-7.25616895029,8.8736707429,-5.76380250306,0.132945563895,-11.0433448485
+590,-7.33431616966,8.14642933723,-6.09803772091,-0.0205251631087,-10.8770554984
+591,-7.43448468892,9.45371300809,-6.27090816414,0.014245651137,-11.7834224741
+592,-7.19098227612,10.2584564544,-6.38153071625,-0.402953286303,-12.1013384295
+593,-7.54879122218,10.8772018605,-6.44407288881,-0.196619130316,-12.4926897215
+594,-7.43726130917,10.7988362824,-6.52533498918,-0.424125937965,-12.2598369654
+595,-7.51923947359,10.576279005,-6.23134715896,-0.447945723202,-12.0205301554
+596,-6.57639833166,11.5521989554,-5.8161016976,-0.660638769623,-12.1212685319
+597,-6.45425915778,11.1904992511,-5.40884639078,-0.57603862369,-11.0086974409
+598,-6.65475002972,10.507567486,-5.8045033429,-0.546150253913,-11.0720770569
+599,-6.83628199523,9.98916308058,-5.72309371328,-0.47068711995,-11.3171561967
+600,-6.40270013575,9.50088787203,-5.59674002321,-0.522702858737,-10.9974602158
+601,-6.31702264897,8.71390106346,-5.84838538666,-0.722974436048,-10.8802371565
+602,-5.85616493326,9.66760249247,-5.86858496783,-0.979889372383,-10.8289542507
+603,-4.99700649684,9.47359543822,-5.9264016263,-0.957137982385,-10.90037621
+604,-5.96165331384,9.29344332534,-5.21791642348,-0.329339871814,-10.0527965597
+605,-5.59958623603,9.55233799498,-5.25838326099,-0.666078081926,-10.2800391203
+606,-6.28423323392,10.5618365531,-5.33635198344,-0.338854421759,-10.7994108691
+607,-6.20609011005,10.6712030539,-5.6477416912,-0.567773066416,-10.8319140177
+608,-5.89383316309,10.6117497366,-5.06653971734,-0.595664771056,-10.2450349708
+609,-5.39098713941,10.2826363488,-5.42351347656,-0.93382506455,-10.6656641642
+610,-5.37558699323,10.5928691156,-5.69331708589,-0.913659120363,-11.2238003136
+611,-4.79321498006,10.9584722697,-5.79209207482,-1.12055325463,-11.1181507679
+612,-4.20012794958,11.3358361906,-6.29262536462,-1.7997411365,-12.0616244934
+613,-4.06827298003,11.2260338308,-6.5078844824,-1.73864594698,-11.9520265037
+614,-4.68824364941,9.53765424762,-6.39107707531,-1.64430306595,-11.4008937318
+615,-4.333568712,10.2322219592,-6.16100882455,-1.61071064434,-11.3360552224
+616,-5.28896089746,8.81440811911,-6.11083146356,-1.09024225699,-10.6386340873
+617,-5.61895478343,8.97582579379,-6.26562522823,-1.25483168214,-11.4151122526
+618,-5.39151720994,9.10308049435,-6.29574940775,-1.29122403525,-11.3080905271
+619,-5.67004035877,8.13514388254,-5.95060921903,-1.07624631024,-10.7249156376
+620,-5.9409193667,7.82636509969,-6.01479703818,-0.747904744361,-10.7382074755
+621,-6.08666094717,6.45166087746,-5.6912383329,-0.384432155041,-9.92811621033
+622,-5.96197903317,7.46389005128,-5.59272861253,-0.292797751004,-10.1161188471
+623,-6.35515903365,7.93597796374,-5.91736775562,-0.48114585324,-10.9649848953
+624,-7.32092249928,8.06970979388,-5.5646187706,0.148659138064,-10.5784669781
+625,-7.67352757707,8.58622024166,-5.90106242674,-0.107241677064,-10.9309742618
+626,-7.82316942682,8.06324909482,-6.57837839745,-0.337796398254,-11.3582637987
+627,-7.79055569042,8.35663991703,-6.03415399688,-0.136005286013,-11.0348750904
+628,-7.66418439941,8.15778365353,-6.77675085366,-0.782683321613,-11.8135527176
+629,-8.47793636094,6.43751021678,-6.22516237698,-0.410418798614,-10.3963509426
+630,-8.37018996314,7.12293519844,-6.41155372408,-0.170493319344,-11.2222577492
+631,-7.76247910431,8.04775925117,-6.15816802589,-0.375175553598,-11.2454400195
+632,-8.64277045125,6.63034588782,-5.99218096017,0.285219294969,-10.585924295
+633,-9.19485691557,6.63463311394,-6.01348867194,0.446238666716,-10.6987401232
+634,-8.78628767952,8.67650487412,-5.95220465628,0.257241434434,-11.2890938009
+635,-8.86689196065,9.07477663572,-6.46028028093,-0.0860344247153,-11.8903971948
+636,-8.42919690551,9.35005160334,-5.26054753395,0.266549698447,-10.4563193072
+637,-7.96179979842,9.36795539252,-6.15509700803,-0.319844350195,-11.4919166459
+638,-8.21500353198,10.1720461638,-5.80720898706,-0.0432955320264,-11.3745673095
+639,-8.34997922566,11.4328723533,-5.62650314648,0.0635914309315,-11.2690101799
+640,-8.72450254851,10.6212277058,-5.14718048962,0.392934800088,-10.8287851994
+641,-8.86936806254,10.3823506314,-5.25234903323,0.406129888894,-10.8226662622
+642,-7.55340396531,11.4108988611,-5.66510528278,-0.00188711481323,-11.4863768062
+643,-7.96629936617,9.91534871051,-5.88423383214,-0.0196619591553,-10.9315552741
+644,-7.38327037015,10.0729998009,-5.77174333374,-0.412355343943,-11.1572623847
+645,-7.39457357296,9.16923250992,-5.83076222376,-0.407360151486,-11.0243956425
+646,-8.00404327268,8.75338244604,-5.88121439097,-0.128696293641,-11.0727732758
+647,-7.52676260797,7.84656868254,-5.18995203862,0.410626342705,-10.0560473406
+648,-7.56769890592,7.13922319886,-4.90610149255,0.616152320526,-9.28084331636
+649,-7.36422240599,6.95918083598,-5.50779580176,0.0734546001841,-9.98589746207
+650,-7.31465254398,6.2066469425,-5.74855248183,-0.147086471988,-9.70986816079
+651,-7.36877709902,6.40480609309,-5.52745982897,-0.262173192346,-9.96687184193
+652,-7.22248648288,6.12935925218,-5.50562603459,-0.0288147824838,-9.71613313098
+653,-7.06059909056,6.96185396183,-5.12170245121,0.25200954223,-9.33429804032
+654,-7.33100628333,6.53773446505,-5.47195358789,0.07057154315,-9.475894278
+655,-6.9820374743,6.64369997084,-5.59125586929,-0.21023749067,-10.0974227438
+656,-7.42528052115,6.00550992939,-5.87736367459,-0.276582527915,-10.058745816
+657,-7.8951722821,5.89341262122,-5.0277623395,0.513264104677,-8.9419088894
+658,-7.82426425136,5.86610467231,-5.42973604633,0.579845870883,-9.23760800538
+659,-7.6612805635,5.54851677406,-5.35767785247,0.274720415231,-9.21986637073
+660,-7.31096488925,5.30055162713,-4.68068538588,0.456142560505,-8.43103327044
+661,-6.99940783035,5.53626344284,-4.8967008757,0.214130846864,-8.69255980792
+662,-6.74412698867,5.67978658306,-4.35881995758,0.426349839973,-8.18350599855
+663,-7.20716008137,5.71759636047,-4.76794301802,0.376503569826,-8.58264487246
+664,-7.4422827163,5.09575033732,-4.65277280088,0.540370731978,-8.44354929701
+665,-7.72407252154,3.8368952519,-5.10912362071,0.832151509446,-8.31656354701
+666,-7.6756765792,4.02904996763,-4.67220063132,0.923193983332,-7.87159862485
+667,-7.44766395806,3.86614359714,-4.7944621399,0.543414838244,-7.87279478449
+668,-7.25937962602,2.93869048913,-4.74002146987,0.710791482998,-7.92797864948
+669,-6.52761452884,2.78202577741,-5.51790285557,0.0561235542347,-8.66875779618
+670,-6.68665186158,1.94821419299,-4.84544453174,0.510960451429,-7.63541307249
+671,-6.92875602961,1.83058897211,-5.23800466058,0.505447681518,-7.85641627241
+672,-6.63536433283,1.20444406247,-5.71702347349,0.300327255742,-8.35310333729
+673,-6.3384086199,0.73017334055,-5.48238397369,0.375710125434,-7.75390922244
+674,-6.34357000752,1.55663058861,-5.03470808266,0.359971047376,-7.76053956747
+675,-5.94028553326,2.43199585766,-5.09621433527,0.229659270944,-8.07821189072
+676,-6.25897679782,2.69406242016,-5.17138954514,0.42042113816,-8.16018619034
+677,-6.69209433793,2.0324795398,-4.66977280949,0.68411650407,-7.44078895275
+678,-6.40468704147,1.61071567704,-4.23146923035,0.832000027117,-6.65512888455
+679,-6.66132500822,1.99604864852,-4.53336489546,0.639292369325,-7.22463873813
+680,-6.91208712765,2.40989957967,-4.61332209194,0.576387344841,-7.69925913415
+681,-6.73765670651,1.78720953306,-4.92693953402,0.422258171775,-7.89048876601
+682,-7.64061157539,2.37987586954,-5.20850149612,0.630708319893,-7.79528337721
+683,-7.54679349991,2.17942194278,-5.61934679035,0.218386160231,-8.39483144812
+684,-7.54125834141,0.835304170411,-4.86935961238,0.553335337336,-7.525139522
+685,-7.5086273713,1.36116757282,-4.90558279198,0.794107455743,-7.23086456321
+686,-7.7461923385,2.4193649484,-5.51033221406,0.00654594237631,-8.0356851685
+687,-7.76241899392,2.21803593906,-4.95928237875,0.212900845441,-8.04726441212
+688,-7.0433409924,3.24860727507,-5.57623238173,-0.461690807678,-8.6688356437
+689,-7.28969708048,2.80967718149,-6.10414906222,-0.427454179703,-9.1015527772
+690,-7.39644892549,2.87510414223,-6.1875061726,-0.449133740241,-9.21862992049
+691,-7.37646265439,2.61267170972,-6.20944729039,-0.421285115907,-8.96867571003
+692,-7.09646306357,3.49152325194,-5.39579635573,-0.365591886467,-8.30953814114
+693,-6.50513394267,5.15855802041,-5.66665072601,-0.640210396427,-9.28055135762
+694,-6.13051463364,6.41389810361,-5.34176976591,-0.823623519687,-9.20537449094
+695,-6.36723598408,5.54328985421,-5.74475914008,-0.975366290819,-9.43252755039
+696,-6.91214369815,5.44605892685,-6.23017574508,-0.961233988667,-9.94932866501
+697,-6.65001944482,4.71339054555,-6.79598385537,-1.18895185732,-10.3258154228
+698,-6.6469536433,4.77953859565,-6.73005042362,-1.19496905249,-10.2752635942
+699,-7.4023108563,4.48256127238,-6.4695661708,-1.2425981984,-9.77229943843
+700,-7.67969706944,4.90067978358,-6.51013757105,-0.699847921714,-10.0406919
+701,-7.62267589332,4.29231248937,-7.04878184319,-1.12180371462,-10.2244480923
+702,-8.03913354476,2.23863428466,-6.74472847184,-0.646262716956,-9.73640472641
+703,-8.50723674577,1.89745906234,-6.49109370708,-0.444589897447,-9.54682908844
+704,-7.75029099976,2.98203193151,-6.94346904671,-0.801790845952,-10.2251154176
+705,-7.28785811927,3.13113336047,-7.3116859698,-1.4377314492,-10.6418381098
+706,-6.98934980143,4.02979535667,-6.89126357266,-1.31623495347,-10.6754242063
+707,-7.48665821261,3.61452387991,-6.73606064589,-1.19715140442,-9.92213030459
+708,-6.83419122043,4.67947228607,-7.84574578667,-2.039294219,-11.575591842
+709,-7.47840517635,3.92011152907,-7.61895070354,-1.85481713606,-10.9481770032
+710,-7.55326784057,3.28603090769,-7.44202529127,-1.21637628248,-10.8060757203
+711,-7.79525152167,1.79222368955,-7.17323621434,-0.980525616591,-9.99120396889
+712,-7.61516673261,1.60884638757,-7.13883506973,-1.17278481839,-9.89665740678
+713,-7.47273256367,1.64096134297,-7.18350600842,-1.09082692103,-9.83737076811
+714,-8.08073073502,0.34036709366,-7.85247598452,-1.45719294076,-10.1103382971
+715,-7.28504311012,-0.131516143604,-8.2850712132,-2.04627026327,-10.1841090123
+716,-7.35496138612,-0.698449616329,-8.41319192417,-2.19707895325,-10.2932025533
+717,-7.55975613801,-1.38771383611,-8.49930382214,-2.06348952289,-10.1121907007
+718,-8.18194259285,-2.24582797484,-8.60276926588,-1.96278368365,-10.0567472033
+719,-7.87723984599,-2.40513159387,-8.90856781967,-2.16986749468,-10.4375983978
+720,-8.54871094765,-3.20447198015,-8.52984265654,-1.75093818422,-9.6195506035
+721,-8.30887049766,-3.11039927891,-8.1370700895,-1.88962388648,-9.37946117952
+722,-8.52478123779,-3.07006155417,-8.44093129431,-1.97028186819,-9.40056065167
+723,-8.52959757054,-2.93103223485,-8.35386134974,-1.8306290944,-9.5524735806
+724,-8.40016469044,-3.240631443,-7.7859386619,-1.50196984774,-8.68319548012
+725,-8.59832053918,-3.51219301753,-7.52539429167,-1.13003941015,-8.33059335705
+726,-8.53674267008,-2.76618311607,-7.7530733192,-1.69087066846,-8.96196019468
+727,-9.19387946828,-2.09173011762,-8.11641103356,-1.68864155233,-9.70160036684
+728,-9.71516949212,-2.62482117394,-7.90969632308,-1.26813949299,-8.96905926297
+729,-9.52794567184,-1.68622279228,-8.28791102338,-1.60448067804,-9.91633883276
+730,-9.57183342665,-1.81496469212,-8.17395895818,-1.43372380124,-10.1839728234
+731,-8.99515542033,-0.250160261099,-7.75076894272,-1.36923957925,-10.0688109461
+732,-8.83295039135,1.25519263025,-8.41144950163,-1.8664787253,-10.5848038246
+733,-8.74828754976,0.699157910387,-8.04868435757,-1.54160207748,-10.5024249452
+734,-9.30583193211,0.983113690689,-7.5205459678,-1.11311517965,-10.3627201353
+735,-8.20982783506,1.2065013237,-8.14327438861,-1.78182751349,-10.9668725757
+736,-7.87906463554,1.5188258917,-8.06185450647,-1.75657399227,-10.8879705059
+737,-7.87666466486,0.419814819599,-8.37849950898,-2.10739374139,-10.5729436234
+738,-8.13388013683,-0.316121636573,-8.54804736991,-2.09712190262,-10.8076758633
+739,-8.21410994533,-1.47890094587,-9.01606970706,-2.52170386093,-10.603555136
+740,-8.42632949015,-0.798181541948,-9.54681371394,-2.5773365312,-11.6090378118
+741,-7.49930210003,-0.84917442156,-9.45594393931,-2.73866916404,-11.2306569351
+742,-8.02482733789,-1.94745762639,-9.12481196805,-2.36548222365,-10.6961213331
+743,-8.17658714341,-1.67764505438,-9.72271321311,-2.5264638587,-11.4782089629
+744,-7.52625751472,-0.134031885533,-9.48827442084,-2.88193133131,-11.9667258197
+745,-6.76522036535,-1.2403510848,-9.4082814575,-2.84879748694,-11.0511462345
+746,-6.70032283363,-0.70871581707,-9.10485958634,-2.79375242472,-11.0742431844
+747,-6.39694751946,-2.67577505274,-8.92290004959,-2.4587626515,-10.0951297874
+748,-6.50803611445,-2.9409132331,-9.05649851838,-2.44089453865,-10.3345494614
+749,-6.71843096837,-4.09330699746,-9.13889571697,-2.14359951033,-10.0992804477
+750,-6.19027004409,-3.93193723844,-8.62311199117,-2.05344115029,-9.64988643498
+751,-5.52532828617,-3.25965524913,-8.21724737534,-2.02353190822,-9.35485880265
+752,-5.45782149369,-4.15164648266,-8.0418293307,-1.9214129807,-8.85922683562
+753,-5.43313572956,-5.11948686973,-8.1432013039,-2.00951314781,-8.57352596717
+754,-5.98022831689,-5.76742824891,-8.18231319605,-1.85589805443,-8.2537198511
+755,-7.37186032969,-7.60592929099,-8.13120689363,-1.43792835215,-7.61687051359
+756,-7.07220582922,-8.30202341168,-8.08044789156,-1.47175646366,-7.81358150205
+757,-6.63860809394,-7.12903781228,-9.23866261402,-2.06703243146,-8.96067559424
+758,-7.00161369044,-7.12676923747,-9.02934867312,-1.75960600043,-9.11285889003
+759,-7.16503633732,-7.27530599809,-8.03638123039,-1.08849865462,-8.07625355978
+760,-6.55683779099,-8.47383725447,-9.2929150117,-2.20782556809,-8.95891383008
+761,-5.85343931709,-7.09408525175,-9.17341410885,-2.33159568227,-9.21128318731
+762,-6.14974268131,-7.47507434432,-8.86395361181,-2.14024541583,-8.75968855905
+763,-5.85472174815,-7.41876132672,-8.92606746044,-2.39473914238,-8.86980147794
+764,-5.77682503944,-7.80458488217,-9.16613807779,-2.38145245119,-8.6081870417
+765,-5.52753885566,-8.2640123029,-9.40737630902,-2.29512943068,-9.08677762619
+766,-5.71482111437,-8.61702201056,-10.1491815553,-2.69672975536,-9.605407411
+767,-5.17286601838,-8.20744376087,-9.86396914758,-2.74099555848,-9.46591540033
+768,-6.24066735189,-9.57145025614,-10.4714986062,-2.78574171544,-9.63238368379
+769,-5.83243430426,-8.86447926464,-11.2251700642,-3.22312477123,-10.6399883773
+770,-6.50533240766,-10.0822555194,-10.8842257729,-3.01310870154,-10.1120901828
+771,-7.04331851901,-11.4224349418,-10.2445352237,-2.06945966606,-9.0720350199
+772,-7.20960196966,-11.7135406423,-9.69445401905,-1.48892315697,-8.50014413517
+773,-6.77879904408,-10.9098497644,-9.08724727013,-1.68474476946,-8.06826729631
+774,-6.67370506951,-11.1019363285,-8.8421376116,-1.48202359185,-7.84135210128
+775,-6.28924573554,-12.0271508956,-9.58191702864,-2.19805930201,-8.31285662507
+776,-5.24159091994,-11.9347932332,-9.77023874889,-2.63025686989,-8.14144923574
+777,-5.30571198974,-11.8734921481,-9.2612909609,-2.35795191309,-7.6826631425
+778,-4.82719999144,-9.13275377878,-9.43892590655,-2.74635106794,-8.81147257705
+779,-4.94339599756,-9.70879041703,-9.69485303762,-2.58426051003,-8.89948625305
+780,-4.69911816094,-8.18469113133,-9.85780441535,-2.99039908692,-9.53840201769
+781,-4.16937923924,-7.71569812857,-9.19258733366,-2.67089950399,-9.05103555919
+782,-3.74658846191,-7.80741210756,-8.97408173259,-2.53138216921,-8.60245317629
+783,-4.16636400254,-8.50191051501,-8.83798163077,-2.24051342002,-8.04678801391
+784,-4.22987739708,-8.18896719056,-8.73994530266,-2.20671322325,-8.18398369333
+785,-4.46237499869,-8.60546266644,-8.75484072625,-2.07291813141,-8.31526095617
+786,-4.32313638155,-8.17602589622,-8.57326357401,-2.0485540226,-8.03778459064
+787,-4.35119832634,-7.95918846227,-7.60268998452,-1.29836713334,-7.32421130626
+788,-3.4774808039,-6.88524451673,-8.31442790417,-2.23634977969,-8.2817847793
+789,-3.5367436959,-7.15856998427,-7.66291253354,-1.7689341912,-7.42919997291
+790,-3.73047905477,-7.02108521278,-7.64772686299,-1.78559302188,-7.3322780351
+791,-3.57092483255,-7.53859198786,-7.62325426896,-1.80310702834,-7.2811030197
+792,-4.07004258112,-7.89769765952,-7.93185777821,-2.09070689838,-7.24131372215
+793,-3.90598869552,-6.68611426923,-8.44676683037,-2.45596857582,-8.10824803235
+794,-3.62925950616,-7.05580766647,-8.75520760554,-2.54761341898,-8.66398761353
+795,-3.83539280671,-7.26472621244,-8.3838727897,-2.3083993919,-8.11786978314
+796,-3.39078237227,-6.77779215923,-7.54119347001,-2.05839956015,-7.20000181348
+797,-2.86095921973,-6.6460132865,-7.9846207344,-2.43156390162,-7.833959745
+798,-3.58339317213,-7.40658020612,-8.30747043936,-2.34127833832,-7.99400311332
+799,-4.05735150766,-7.6894007875,-8.0514269085,-1.71383242971,-7.60715122237
+800,-4.11334149223,-7.82041147573,-8.13460602521,-1.91266321281,-7.95127646808
+801,-4.19430614563,-7.15548397079,-8.81920405808,-2.00488109671,-8.85604026612
+802,-3.90560035133,-7.41564863107,-8.86559774402,-2.50477800344,-8.67311797548
+803,-4.04969939337,-5.75768881934,-8.20979519656,-2.20091628217,-8.81047399259
+804,-5.32580790109,-6.57969517922,-8.13859892574,-1.87340025205,-8.30150763772
+805,-5.51547171713,-7.3653493055,-7.94122203565,-1.62971818724,-7.66352114904
+806,-5.89615838955,-6.95628142084,-7.89982941063,-1.59282917714,-7.97140787565
+807,-5.44545940767,-7.41246619793,-8.45591381088,-1.75499785845,-8.32287480625
+808,-5.12696405191,-7.64332394915,-8.46022406057,-1.75660032298,-8.31078001037
+809,-5.10003621285,-7.69223206021,-9.45922248915,-2.65349194012,-9.15881194448
+810,-4.37879437467,-7.39837885934,-9.79537992918,-2.95324865162,-9.8285259825
+811,-5.28507183956,-8.53731789306,-9.6046984541,-2.68456242414,-9.22732569872
+812,-5.45159713421,-8.39419715392,-10.4423257643,-2.89190983186,-10.1082008821
+813,-5.91451554501,-9.10258415112,-10.5772502437,-2.58302722675,-10.3123414871
+814,-6.03426014442,-9.77741088846,-10.3617681731,-2.31586721078,-9.73571315453
+815,-6.9361788229,-10.6410389072,-10.4247564072,-2.1472551503,-9.68193300656
+816,-6.59124282241,-10.6686859648,-10.587027567,-2.09823590688,-9.73197770458
+817,-6.85724569273,-10.5963067931,-10.8504250501,-2.39356301649,-9.96640351421
+818,-6.67601557814,-10.2900261531,-10.5856453886,-2.04655961641,-10.2256689444
+819,-6.36975101338,-9.88548536245,-10.9941452176,-2.50293102934,-10.6884801676
+820,-5.7201019671,-9.62904701482,-11.0757644818,-2.81787443227,-10.811227566
+821,-5.88136828861,-10.0485305235,-11.0368333373,-2.74346407665,-10.4987071322
+822,-5.92244155638,-10.563683653,-11.3891633506,-3.27282179395,-10.5384888
+823,-5.56553525864,-10.1943597155,-11.1550286895,-3.091142523,-10.5200057918
+824,-5.1750861124,-9.58135637232,-10.9544165963,-3.10012581049,-10.4534999255
+825,-4.81068605986,-10.2512835849,-11.2523988611,-3.45149709098,-10.5455545113
+826,-4.77065553871,-9.27599529856,-10.5473517298,-3.5124067208,-10.0936271797
+827,-4.62187265825,-9.58385856423,-10.7835596315,-3.55065421529,-10.1495713732
+828,-4.99394186571,-9.73504199801,-10.4080283661,-3.19850469597,-9.86540060464
+829,-6.04301416832,-9.4905445057,-10.217630934,-2.7993198868,-9.39950645806
+830,-6.22607112393,-8.90197696891,-9.71195117055,-2.55266945055,-9.29596717094
+831,-6.47003509227,-9.84928292417,-9.64940994951,-2.30807130278,-8.86878170644
+832,-6.24414514767,-9.64658044598,-9.62867285951,-2.33264261079,-8.92542297282
+833,-5.70405163059,-9.87088180021,-9.66205052893,-2.48584625192,-8.77636805566
+834,-5.59710911671,-10.8667563765,-9.34316825892,-1.87013065519,-7.89508088269
+835,-5.86841467119,-9.31702725562,-9.22282437969,-2.38195919702,-8.77905185727
+836,-5.75751604756,-7.37561768028,-9.19797178729,-2.06448853087,-9.17194475488
+837,-5.66597217308,-7.702837823,-9.08959006793,-2.19657340287,-8.96951635093
+838,-5.5560004368,-8.15580988401,-9.12023651732,-1.95986394265,-8.95680066104
+839,-5.56789940234,-8.96612365277,-9.88672703435,-2.28632194079,-9.34330293209
+840,-5.90816704445,-8.65039384567,-9.28073482777,-1.80111495621,-9.09961445248
+841,-6.03571587002,-8.59828957237,-9.09556631715,-1.68073087689,-9.0948717191
+842,-6.23674476005,-8.5458085374,-8.99629920596,-1.72275668131,-8.89617133153
+843,-6.40904521546,-7.82437374598,-8.58441964198,-1.27414650706,-8.55570549555
+844,-6.31196036404,-8.04545898534,-8.10542652833,-1.26573880715,-8.02706433545
+845,-7.04057620737,-9.12438651353,-8.21153381541,-1.06291118812,-7.52584269483
+846,-7.43447207972,-9.91236672692,-7.79209857331,-0.665960786301,-7.00978620266
+847,-7.1077277343,-8.85393820368,-7.74386589419,-0.640450442391,-7.13684883516
+848,-6.73246185993,-8.46021072259,-8.63508648274,-1.37411002809,-8.12756512196
+849,-6.54166313699,-8.26414102661,-8.95422879427,-1.73344426103,-8.8797972288
+850,-6.49740218939,-7.8981763155,-9.34082421201,-1.98335386589,-9.46988943454
+851,-6.25836969256,-6.95659322533,-9.92701628524,-2.46135660166,-10.0835354762
+852,-5.63028082237,-5.12362477697,-9.94849221414,-2.72065732112,-10.8253574997
+853,-5.50004076646,-5.20771030873,-10.1054768228,-2.99905086854,-10.8757400348
+854,-5.7397058484,-5.44793639357,-10.8865135242,-3.19044704641,-11.4482661928
+855,-4.83515296569,-5.94793368511,-10.984490205,-3.56258441682,-11.5248475043
+856,-3.49352741632,-4.00243235228,-11.1489893181,-4.09441088131,-11.9491315213
+857,-3.5770969747,-5.06572530654,-11.1315834052,-4.01940012726,-11.7615704496
+858,-3.94842351769,-4.58122100883,-11.0071331814,-3.8025735928,-11.7854938583
+859,-3.4211792454,-4.97503654793,-10.8449773536,-3.93046499843,-11.8412663002
+860,-3.62690298857,-4.28207188284,-11.0050744151,-3.89103158697,-12.1670012829
+861,-3.71868534874,-2.90302446585,-10.8536688411,-4.04261559749,-12.279042204
+862,-3.73117059503,-2.85309613076,-11.5731838798,-4.39469820582,-13.0659652366
+863,-3.60836529353,-1.67020548391,-11.5401913026,-4.60672264662,-13.3690800682
+864,-3.37301631807,-0.873358903384,-11.4115331248,-4.32507657404,-13.5286194771
+865,-3.72844188778,-1.72656423838,-11.763358309,-4.53308391543,-13.6382324948
+866,-3.20795129173,-0.578835060185,-11.791821504,-4.64628962224,-14.0529916139
+867,-3.36840710688,-1.15767544124,-11.7286009163,-4.85919791973,-13.7629274302
+868,-3.99696038152,-0.844739867102,-11.4956065626,-4.59579700043,-13.7139545177
+869,-4.21651630045,-1.20666757418,-11.4409365306,-4.40324050056,-13.3790084561
+870,-4.30023850626,-0.105574700022,-11.0958188265,-4.24749649785,-13.8174479173
+871,-4.11564108077,1.58973067252,-11.1369488411,-4.13703426385,-14.1831915489
+872,-4.56786374208,1.36695328606,-11.3904547477,-4.28431290639,-14.4269468632
+873,-4.50056636709,2.25128049162,-10.8564830063,-4.06104934048,-14.2166133475
+874,-4.94600010137,2.32635390812,-10.1386450721,-3.42878455743,-13.3495463995
+875,-4.41764311496,3.13302363303,-10.9020615228,-3.93809970023,-14.5958420203
+876,-4.61049405127,2.36550122929,-10.2714575893,-3.44884237467,-13.7738492901
+877,-4.86266655361,2.90211961629,-10.2016289116,-3.32349381891,-13.8765687916
+878,-4.10224512836,4.28900758429,-10.3582535797,-3.66123383351,-14.2537660588
+879,-3.99572428667,4.92312524363,-9.92240178393,-3.15192212119,-14.0912440489
+880,-4.17482690469,2.82141903136,-10.3817372341,-3.38879952845,-14.0848619935
+881,-4.15388559346,2.83519451643,-10.2283454015,-3.11153755345,-13.7824999673
+882,-4.66708473879,2.27055533414,-10.068101875,-2.73121983697,-13.6321369875
+883,-4.72185523255,4.32901439084,-9.89926131515,-2.73739550829,-14.3959783089
+884,-4.62333271414,4.49791615508,-10.192144157,-2.7682136781,-14.765040152
+885,-4.42065022717,5.04638223181,-10.11667254,-2.76876495112,-14.8306344489
+886,-4.25287698714,4.46321940764,-10.6987002939,-3.08173196085,-15.1398990315
+887,-4.44527678316,4.80432837449,-10.9031982909,-3.03978127773,-15.5532613382
+888,-4.4642282456,4.63540402499,-10.0626863105,-2.70703732986,-14.7688870887
+889,-4.98732399964,4.87396862775,-10.0933839368,-2.3725316622,-14.925094003
+890,-5.50258285626,4.70974864013,-10.2136983639,-2.33421912597,-14.7739657493
+891,-6.0556292827,4.08759536053,-10.1966612527,-2.13033581665,-14.7915496701
+892,-6.20357855741,4.41719312508,-9.8421416554,-2.07446968619,-14.1713058575
+893,-6.84406671834,5.86978276688,-9.33795319327,-1.73984810104,-14.3196899621
+894,-6.80649620344,5.94516877123,-9.08785071528,-1.43538867005,-14.2552864661
+895,-6.7485085783,4.11530120378,-8.82862693553,-1.04215688796,-13.0594162959
+896,-6.51110684629,3.59145076114,-8.77190944029,-1.0336741571,-13.1947952633
+897,-6.68974487202,2.17969311498,-9.22802137982,-1.37409264821,-13.4683842394
+898,-6.55440942186,3.06243533357,-9.1698473504,-1.22021527104,-13.3344614816
+899,-6.63447988618,3.55724318482,-9.44164231674,-1.46933867665,-13.5789206573
+900,-6.10621651832,3.86726041575,-8.77940026076,-1.27191360892,-12.969591172
+901,-6.37728622241,4.69536630774,-8.83064573683,-1.48510895647,-13.2622278618
+902,-6.73732689789,4.59670094866,-9.30892122513,-1.85270026166,-13.7780337461
+903,-6.71798901303,4.45723210353,-9.1424686697,-1.52731047172,-13.6308255327
+904,-6.56073004994,4.50198204099,-9.24472131532,-2.11736724125,-13.732689125
+905,-6.76157327,3.93798583921,-9.29070539054,-1.7374687717,-13.6927290301
+906,-7.43721721675,3.5707456852,-9.26665110758,-1.51542861756,-13.7058162401
+907,-6.85342227753,3.82989187772,-8.98816923297,-1.33835204828,-13.1117000732
+908,-7.0993953667,3.17751065309,-9.06413387904,-1.27911918294,-13.3040051871
+909,-7.08694160784,3.03820583095,-9.10819228366,-1.36034995,-13.1626547474
+910,-8.18691005387,3.50249637009,-8.86726834627,-0.714119788469,-12.9565949834
+911,-9.07831367579,2.11456462943,-8.61472892476,-0.128253572059,-12.3898258754
+912,-8.3820305592,2.22008103714,-8.51299251791,-0.0986489157316,-12.6233941336
+913,-8.72000218177,3.08589880464,-8.66351295867,-0.411793736793,-12.7851524847
+914,-8.74651500043,2.85876888988,-8.4120536043,-0.420198109207,-12.5291559218
+915,-8.9968175645,2.9685564391,-8.02363684798,-0.0764392246616,-12.0284422995
+916,-8.74260823661,2.92566915507,-7.45471871959,0.257374447637,-11.518919906
+917,-8.93260604901,1.65576683465,-7.35321770881,0.393954098311,-11.0521551646
+918,-8.96243902825,2.45208538958,-7.69838778123,0.254203381415,-11.1617644952
+919,-9.17403416046,2.07207623048,-7.48620011154,0.278258872769,-11.0194718332
+920,-8.91691652509,2.7063931525,-7.33596000124,0.298617715114,-11.2794005695
+921,-8.3185258608,4.04538421473,-7.50710825059,0.187238944453,-11.4962398955
+922,-8.0880214816,4.86962512793,-6.79900357691,0.113003235773,-11.4543285503
+923,-7.7808560494,5.36315159783,-7.2641077495,-0.390436374957,-11.9166186106
+924,-8.30164135841,5.67266091001,-6.90854040016,0.091019078768,-11.6092027642
+925,-8.18716428083,5.90817103838,-6.53794712547,0.350449931756,-11.1997110809
+926,-7.97590318546,6.18876860301,-7.32446190805,-0.364186767392,-12.1064087473
+927,-7.36813973229,6.30080622833,-8.56444673032,-1.06607789153,-13.5327707721
+928,-7.47438341107,6.54564731749,-7.77778828611,-0.599102284886,-12.8316239236
+929,-8.43358868231,6.62729734146,-7.39925811474,0.0345529056643,-12.4093872413
+930,-8.52094198827,7.36948836644,-7.70961662968,-0.134881418035,-13.1902579702
+931,-8.60348660531,6.62975932886,-7.2778856103,0.0086373289791,-12.3167137895
+932,-7.75539739982,6.03802159915,-6.73220924154,0.0964482432497,-11.7150796308
+933,-8.07017826456,6.59761960451,-6.49024505985,0.570725237191,-11.4462016059
+934,-8.17032530401,7.15432012027,-6.13655820403,0.684695465667,-11.3025357581
+935,-8.82708352261,6.44731630625,-5.84459822382,1.14847078031,-10.4702312417
+936,-7.79227668835,6.85551389158,-5.64796318875,0.874638310579,-10.4993529134
+937,-8.42438939918,7.3398323113,-6.22577551108,0.648630454082,-11.173472379
+938,-7.79328671347,7.87646751961,-6.00995450033,0.65769025014,-11.0606936451
+939,-7.66422652233,8.42776912822,-5.04040669214,0.777026160172,-10.398857728
+940,-8.22094066572,8.88718803888,-4.54407938894,1.34296022466,-10.058501687
+941,-8.02010947554,8.58461439056,-4.81739676524,1.35158321272,-10.1755989433
+942,-8.29753691759,8.19549965909,-4.70111602242,1.6313734469,-9.91662167777
+943,-8.15323197518,7.33486798493,-4.77948295011,1.66233857286,-9.62911605955
+944,-7.34749352711,7.08587215854,-4.61738615971,1.40339212113,-9.73673844855
+945,-7.47898261999,6.14325567551,-5.18201817358,1.13230907003,-9.98463073065
+946,-8.13116166773,5.98978856631,-4.75412438167,1.7102099887,-9.35795393448
+947,-7.34994930371,6.19940931393,-5.73197537437,1.20326930426,-10.5180109564
+948,-7.17074949635,6.65816101079,-5.23026868798,1.28982433737,-10.0824753725
+949,-7.76597962386,6.95928381995,-5.02161019202,1.12113521372,-10.2769781877
+950,-7.63573676719,7.72866085033,-5.32728577955,1.04328250806,-10.8923822067
+951,-7.47901625134,7.41536748099,-5.59494941785,0.879389596137,-10.7638746251
+952,-7.42321529206,6.12958993416,-5.52542223264,0.891617493788,-10.5311623823
+953,-6.99206951689,7.91537991306,-5.55681707193,0.915533809983,-10.9806672293
+954,-6.37124585044,8.11731460297,-5.4829566968,1.06052146019,-10.7126439958
+955,-6.72935163183,9.0958054212,-5.13030598258,0.860570008679,-11.0866052948
+956,-6.65236463089,9.01413915524,-5.2841246082,0.811132797352,-10.8690269559
+957,-6.93006576998,8.22454116575,-4.68365081154,1.21933228307,-9.91900365174
+958,-6.67184011636,9.32827916986,-4.71485735297,1.01509710922,-10.1931641062
+959,-6.65769653531,9.28227772929,-5.02472824035,0.385566058008,-10.7554568399
+960,-7.30423354111,9.4812387472,-4.84852369843,0.526143113405,-10.5471712667
+961,-7.28934962714,9.86148338916,-5.13492549346,0.353130809676,-10.8341949572
+962,-7.00841309294,10.2045356708,-4.80972618532,0.699135917281,-10.7131394662
+963,-6.90972946885,10.4143403503,-4.7422674606,0.602850182543,-10.5041859753
+964,-7.65023596676,10.9693239141,-4.50852173423,0.971221611192,-10.3388038592
+965,-7.50126627756,11.4661764956,-3.236734438,1.79203786252,-9.31863397597
+966,-8.02045465062,11.8762395074,-2.77545963217,2.35293005294,-8.75664770575
+967,-8.43432012911,10.6208368746,-2.28478217692,2.74765803446,-7.8923953312
+968,-7.88589333975,12.5033881964,-2.53503668254,2.47466210079,-8.71515334392
+969,-7.53673170714,12.2116718174,-2.72984035849,2.16282647918,-8.93573710518
+970,-7.69872073681,11.4857215525,-2.39346209479,2.40525790079,-8.45390383993
+971,-7.00033300249,11.3998425081,-2.81561362755,1.81237426604,-8.63262360538
+972,-7.85319668953,10.7518573099,-1.78529137855,2.77836489073,-7.4217912996
+973,-7.99711165522,11.151184441,-2.08049690055,2.64514021668,-7.84814819036
+974,-7.29048555635,10.8819906589,-2.56843658326,2.17496851804,-8.2953431053
+975,-6.91382111079,10.8374233194,-2.72790803968,1.90892201825,-8.45117729851
+976,-7.25741583678,10.2184680232,-2.66528269525,2.16944805502,-8.11548455923
+977,-6.98910109847,10.2176840386,-2.8262837783,2.05835661081,-8.36467932703
+978,-6.3499524905,10.7241406883,-3.08748484407,1.67002593428,-8.75145354429
+979,-5.91619792118,9.95064553608,-3.78041771895,1.44452296998,-9.59888803693
+980,-6.13826994863,11.9834095094,-4.17279129411,1.10104563929,-10.3327667198
+981,-6.29500398514,12.3622287724,-4.76534180424,0.609719559297,-10.9902322729
+982,-6.85195847662,13.1848787975,-4.13520529452,0.885753334465,-10.7496293669
+983,-6.97057922544,12.8829187447,-4.04460919449,1.21189441308,-10.7778325643
+984,-7.06691359257,11.9004026628,-3.91885184965,1.58245193255,-10.5746611573
+985,-7.39212000577,11.9770072448,-3.85627942514,1.62518404806,-10.3266702518
+986,-6.52124402017,12.8881192594,-4.39810745782,0.889702194369,-11.2263151345
+987,-7.04968730844,13.3120566941,-4.32321839293,1.12292879779,-10.9405588007
+988,-6.56894496787,12.9905890346,-4.26867533523,1.05943977388,-11.0803884375
+989,-6.932860947,11.8288652796,-3.86875096472,1.37451711082,-10.2782928433
+990,-7.33048161823,10.4399185384,-3.31214022362,2.02390868227,-9.20001462799
+991,-6.78949234367,9.64456970707,-3.1401505871,2.24106970739,-8.53386008818
+992,-6.41293915988,10.6781564573,-2.93266980603,1.9617334364,-9.05850240844
+993,-6.50288168327,10.6140520766,-2.81098857002,1.8630736286,-8.95897104858
+994,-5.51236345328,9.61060953025,-3.12491960177,1.86000813216,-8.39402979191
+995,-6.30712424741,9.24251826438,-2.32572392628,2.2191243537,-7.82591351761
+996,-6.8317122362,9.27584638454,-2.01212302451,2.68442390382,-7.24995816723
+997,-7.01590099344,8.53905454516,-1.80943487815,2.88510983538,-6.82307711465
+998,-7.40812496934,7.88216643988,-1.37516747832,3.24706573563,-5.77880298028
+999,-7.02386098432,8.59456088374,-1.59027141406,2.79354013606,-6.32114172303
diff --git a/tensorflow/contrib/timeseries/examples/data/multivariate_periods.csv b/tensorflow/contrib/timeseries/examples/data/multivariate_periods.csv
new file mode 100644
index 0000000000000000000000000000000000000000..02a60d1cf61765c7c916803fe918d8b7b186405e
--- /dev/null
+++ b/tensorflow/contrib/timeseries/examples/data/multivariate_periods.csv
@@ -0,0 +1,100 @@
+0,0.926906299771,1.99107237682,2.56546245685,3.07914768197,4.04839057867
+1,0.108010001864,1.41645361423,2.1686839775,2.94963962176,4.1263503303
+2,-0.800567600028,1.0172132907,1.96434754116,2.99885333086,4.04300485864
+3,0.0607042871898,0.719540073421,1.9765012584,2.89265588817,4.0951014426
+4,0.933712200629,0.28052120776,1.41018552514,2.69232603996,4.06481164223
+5,-0.171730652974,0.260054421028,1.48770816369,2.62199129293,4.44572807842
+6,-1.00180162933,0.333045158863,1.50006392277,2.88888309683,4.24755865606
+7,0.0580061875336,0.688929398826,1.56543458772,2.99840358953,4.52726873347
+8,0.764139447412,1.24704875327,1.77649279698,3.13578593851,4.63238922951
+9,-0.230331874785,1.47903998963,2.03547545751,3.20624030377,4.77980005228
+10,-1.03846045211,2.01133000781,2.31977503972,3.67951536251,5.09716775897
+11,0.188643592253,2.23285349038,2.68338482249,3.49817168611,5.24928239634
+12,0.91207302309,2.24244446841,2.71362604985,3.96332587625,5.37802271594
+13,-0.296588665881,2.02594634141,3.07733910479,3.99698324956,5.56365901394
+14,-0.959961476551,1.45078629833,3.18996420137,4.3763059609,5.65356015609
+15,0.46313530679,1.01141441548,3.4980215948,4.20224896882,5.88842247449
+16,0.929354125798,0.626635305936,3.70508262244,4.51791573544,5.73945973251
+17,-0.519110731957,0.269249223148,3.39866823332,4.46802003061,5.82768174382
+18,-0.924330981367,0.349602834684,3.21762413294,4.72803587499,5.94918925767
+19,0.253239387885,0.345158023497,3.11071425333,4.79311566935,5.9489259713
+20,0.637408390225,0.698996675371,3.25232492145,4.73814732384,5.9612010251
+21,-0.407396859412,1.17456342803,2.49526823723,4.59323415742,5.82501686811
+22,-0.967485452118,1.66655933642,2.47284606244,4.58316034754,5.88721406681
+23,0.474480867904,1.95018556323,2.0228950072,4.48651142819,5.8255943735
+24,1.04309652155,2.23519892356,1.91924131572,4.19094661783,5.87457348436
+25,-0.517861513772,2.12501967336,1.70266619979,4.05280882887,5.72160912899
+26,-0.945301585146,1.65464653549,1.81567174251,3.92309850635,5.58270493814
+27,0.501153868974,1.40600764889,1.53991387719,3.72853247942,5.60169001727
+28,0.972859524418,1.00344321868,1.5175642828,3.64092376655,5.10567722582
+29,-0.70553406135,0.465306263885,1.7038540803,3.33236870312,5.09182481555
+30,-0.946093634916,0.294539309453,1.88052827037,2.93011492669,4.97354922696
+31,0.47922123231,0.308465865031,2.03445883031,2.90772899045,4.86241793548
+32,0.754030014252,0.549752241167,2.46115815089,2.95063349534,4.71834614627
+33,-0.64875949826,0.894615488148,2.5922463381,2.81269864022,4.43480095104
+34,-0.757829951086,1.39123914261,2.69258079904,2.61834837315,4.36580046156
+35,0.565653301088,1.72360022693,2.97794913834,2.80403840334,4.27327248459
+36,0.867440092372,2.21100730052,3.38648090792,2.84057515729,4.12210169576
+37,-0.894567758095,2.17549105818,3.45532493329,2.90446025717,4.00251740584
+38,-0.715442356893,2.15105389965,3.52041791902,3.03650393392,4.12809249577
+39,0.80671703672,1.81504564517,3.60463324866,3.00747789871,3.98440762467
+40,0.527014790142,1.31803513865,3.43842186337,3.3332594663,4.03232406566
+41,-0.795936862129,0.847809114454,3.09875133548,3.52863155938,3.94883924909
+42,-0.610245806946,0.425530441018,2.92581949152,3.77238736123,4.27287245021
+43,0.611662279431,0.178432049837,2.48128214822,3.73212087883,4.17319013831
+44,0.650866553108,0.220341648392,2.41694642022,4.2609098519,4.27271645905
+45,-0.774156982023,0.632667602331,2.05474356052,4.32889204886,4.18029723271
+46,-0.714058448409,0.924562377599,1.75706135146,4.52492718422,4.3972678094
+47,0.889627293379,1.46207968841,1.78299357672,4.64466731095,4.56317887554
+48,0.520140662861,1.8996333843,1.41377633823,4.48899091177,4.78805049769
+49,-1.03816935616,2.08997002059,1.51218375351,4.84167764204,4.93026048606
+50,-0.40772951362,2.30878972136,1.44144415128,4.76854460997,5.01538444629
+51,0.792730684781,1.91367048509,1.58887384677,4.71739397335,5.25690012199
+52,0.371311881576,1.67565079528,1.81688563053,4.60353107555,5.44265822961
+53,-0.814398070371,1.13374634126,1.80328814859,4.72264252878,5.52674761122
+54,-0.469017949323,0.601244136627,2.29690896736,4.49859178859,5.54126153454
+55,0.871044371426,0.407597593794,2.7499112487,4.19060637761,5.57693767301
+56,0.523764933017,0.247705192709,3.09002071379,4.02095509006,5.80510362182
+57,-0.881326403531,0.31513103164,3.11358205718,3.96079100808,5.81000652365
+58,-0.357928025339,0.486163915865,3.17884556771,3.72634990659,5.85693642011
+59,0.853038779822,1.04218094475,3.45835384454,3.36703969978,5.9585988449
+60,0.435311516013,1.59715085283,3.63313338588,3.11276729421,5.93643818229
+61,-1.02703719138,1.92205832542,3.47606111735,3.06247155999,6.02106646259
+62,-0.246661325557,2.14653802542,3.29446326567,2.89936259181,5.67531541272
+63,1.02554736569,2.25943737733,3.07031591528,2.78176218013,5.78206328989
+64,0.337814475969,2.07589147224,2.80356226089,2.55888206331,5.7094075496
+65,-1.12023369929,1.25333011618,2.56497288445,2.77361359194,5.50799418376
+66,-0.178980246554,1.11937139901,2.51598681313,2.91438309151,5.47469577206
+67,0.97550951531,0.60553823137,2.11657741073,2.88081098981,5.37034999502
+68,0.136653357206,0.365828836075,1.97386033165,3.13217903204,5.07254490219
+69,-1.05607596951,0.153152115069,1.52110743825,3.01308794192,5.08902539125
+70,-0.13095280331,0.337113974483,1.52703079853,3.16687131599,4.86649398514
+71,1.07081057754,0.714247566736,1.53761382634,3.45151989484,4.75892309166
+72,0.0153410376082,1.24631231847,1.61690939161,3.85481994498,4.35683752832
+73,-0.912801257303,1.60791309476,1.8729264524,4.03037260012,4.36072588913
+74,-0.0894895640338,2.02535207407,1.93484909619,4.09557485132,4.35327025188
+75,0.978646999652,2.20085086625,2.09003440427,4.27542353033,4.1805058388
+76,-0.113312642876,2.2444100761,2.50789248839,4.4151861502,4.03267168136
+77,-1.00215099149,1.84305628445,2.61691237246,4.45425147595,3.81203553766
+78,-0.0183234614205,1.49573923116,2.99308471214,4.71134960112,4.0273804959
+79,1.0823738177,1.12211589848,3.27079386925,4.94288270502,4.01851068083
+80,0.124370187893,0.616474412808,3.4284236674,4.76942168327,3.9749536483
+81,-0.929423379352,0.290977090976,3.34131726136,4.78590392707,4.10190661656
+82,0.23766302648,0.155302052254,3.49779513794,4.64605656795,4.15571321107
+83,1.03531486192,0.359702776204,3.4880725919,4.48167586667,4.21134561991
+84,-0.261234571382,0.713877760378,3.42756426614,4.426443869,4.25208300527
+85,-1.03572442277,1.25001113691,2.96908341113,4.25500915322,4.25723010649
+86,0.380034261243,1.70543355622,2.73605932518,4.16703432307,4.63700400788
+87,1.03734873488,1.97544410562,2.55586572141,3.84976673263,4.55282864289
+88,-0.177344253372,2.22614526325,2.09565864891,3.77378097953,4.82577400298
+89,-0.976821526892,2.18385079177,1.78522284118,3.67768223554,5.06302440873
+90,0.264820472091,1.86981946157,1.50048403865,3.43619796921,5.05651761669
+91,1.05642344868,1.47568646076,1.51347671977,3.20898518885,5.50149047462
+92,-0.311607433358,1.04226467636,1.52089650905,3.02291865417,5.4889046232
+93,-0.724285777937,0.553052311957,1.48573560173,2.7365973598,5.72549174225
+94,0.519859192905,0.226520626591,1.61543723167,2.84102086852,5.69330622288
+95,1.0323195039,0.260873217055,1.81913034804,2.83951143848,5.90325028086
+96,-0.53285682538,0.387695521405,1.70935609313,2.57977050631,5.79579213161
+97,-0.975127997215,0.920948771589,2.51292643636,2.71004616612,5.87016469227
+98,0.540246804099,1.36445470181,2.61949412896,2.98482553485,6.02447664937
+99,0.987764008058,1.85581989607,2.84685706149,2.94760204892,6.0212151724
diff --git a/tensorflow/contrib/timeseries/examples/data/period_trend.csv b/tensorflow/contrib/timeseries/examples/data/period_trend.csv
new file mode 100644
index 0000000000000000000000000000000000000000..39c1952339a8a683aa6bc394526172dafa8c5176
--- /dev/null
+++ b/tensorflow/contrib/timeseries/examples/data/period_trend.csv
@@ -0,0 +1,500 @@
+1,-0.6656603714
+2,-0.1164380359
+3,0.7398626488
+4,0.7368633029
+5,0.2289480898
+6,2.257073255
+7,3.023457405
+8,2.481161007
+9,3.773638612
+10,5.059257738
+11,3.553186083
+12,4.554486452
+13,3.655475698
+14,3.419647598
+15,4.303376245
+16,4.830153934
+17,7.253057441
+18,5.064802335
+19,5.448082106
+20,6.251301517
+21,6.214335675
+22,3.07021164
+23,6.995487627
+24,7.180942656
+25,6.084876071
+26,6.95580607
+27,6.692312738
+28,6.339959049
+29,7.659013269
+30,6.157071564
+31,4.023661782
+32,7.380555018
+33,6.972155839
+34,6.655956847
+35,6.532594924
+36,6.780524726
+37,6.723407547
+38,7.616777776
+39,6.394157367
+40,5.046574011
+41,5.715326568
+42,6.536737479
+43,6.527307846
+44,5.671954159
+45,6.508512087
+46,4.740656344
+47,5.449062618
+48,5.796110609
+49,4.802213058
+50,4.627081034
+51,5.748934924
+52,4.05776044
+53,2.743057715
+54,3.590052501
+55,2.937786376
+56,5.333221794
+57,5.102383904
+58,5.097946146
+59,2.771776766
+60,3.75493571
+61,3.268329562
+62,3.127887555
+63,5.723894838
+64,2.365351066
+65,2.030890988
+66,5.74385257
+67,2.637874242
+68,2.851492945
+69,1.907194917
+70,2.568816256
+71,3.869259698
+72,3.989917724
+73,3.641515351
+74,2.812911768
+75,4.964828171
+76,3.050937945
+77,4.203046785
+78,4.269162745
+79,2.818643243
+80,3.334928424
+81,5.239741508
+82,4.972880771
+83,5.212782208
+84,6.056729012
+85,5.404247421
+86,4.733521027
+87,5.241044888
+88,6.844720502
+89,8.242617764
+90,6.686818708
+91,6.429035591
+92,7.45926043
+93,8.225717423
+94,7.661722793
+95,8.348721917
+96,8.029228135
+97,9.780942864
+98,9.755623978
+99,9.149489124
+100,8.947965351
+101,9.176768019
+102,8.768408716
+103,10.39624874
+104,10.39477408
+105,11.63126076
+106,11.8222078
+107,13.60107691
+108,14.54919169
+109,12.63475358
+110,13.77411599
+111,14.45808191
+112,13.27674112
+113,16.00004992
+114,13.04977221
+115,14.65730048
+116,14.76178039
+117,14.62716229
+118,16.20697047
+119,14.79470608
+120,16.70541749
+121,15.8638474
+122,15.63192699
+123,17.20433954
+124,16.29180965
+125,16.93688521
+126,16.07521662
+127,18.33942893
+128,15.62502668
+129,16.81519558
+130,16.86177911
+131,19.18323671
+132,16.68993279
+133,16.52735528
+134,15.22702085
+135,16.13574242
+136,16.08079964
+137,17.16828833
+138,16.09004409
+139,16.92712829
+140,15.54298161
+141,16.03893798
+142,15.38310389
+143,16.18064645
+144,16.22326501
+145,17.1657127
+146,14.87850136
+147,12.80968507
+148,16.25354113
+149,15.14082073
+150,15.79111348
+151,14.02005588
+152,14.32583767
+153,13.87437546
+154,14.47127314
+155,14.29661188
+156,14.68406313
+157,15.84514503
+158,13.89667867
+159,13.58135083
+160,14.26005818
+161,13.3826131
+162,12.85293827
+163,11.06745237
+164,14.08812275
+165,13.05949205
+166,12.18454971
+167,13.01005879
+168,12.45032762
+169,12.20445297
+170,14.39420173
+171,13.49261191
+172,14.91460871
+173,15.97672915
+174,13.96235436
+175,13.77840615
+176,14.39425289
+177,14.31499272
+178,14.37080989
+179,15.34130707
+180,13.42441434
+181,14.54726137
+182,12.51644144
+183,15.36040785
+184,14.52577002
+185,15.90562887
+186,15.12482026
+187,15.55534424
+188,12.22427756
+189,15.11554898
+190,14.23464612
+191,16.52156964
+192,18.14558077
+193,16.51932129
+194,16.88159194
+195,18.08337828
+196,18.70889734
+197,20.97040748
+198,18.98358689
+199,20.76308391
+200,19.81117586
+201,20.24139919
+202,20.78884634
+203,19.92458806
+204,21.60401889
+205,23.30040897
+206,22.2621713
+207,21.24305034
+208,22.07690632
+209,21.78022193
+210,22.94853418
+211,23.72076264
+212,24.12217213
+213,23.04498673
+214,23.8767225
+215,26.52157498
+216,26.24329682
+217,24.83932457
+218,25.66570111
+219,25.61834475
+220,24.41079934
+221,25.31871793
+222,26.7612452
+223,27.00663389
+224,27.86719501
+225,24.87319457
+226,27.85768696
+227,25.70405436
+228,26.11077958
+229,28.11250875
+230,27.6743468
+231,27.19705336
+232,28.08086799
+233,26.19946123
+234,27.32830376
+235,25.98334256
+236,26.71791978
+237,26.67921906
+238,26.25811051
+239,26.64228363
+240,26.20667398
+241,26.39816025
+242,24.83672957
+243,24.27745854
+244,26.10007483
+245,25.67761738
+246,25.91667268
+247,27.57057095
+248,25.68913621
+249,24.92375989
+250,25.5593706
+251,25.14638402
+252,26.46738639
+253,24.55740644
+254,23.5691458
+255,24.07138538
+256,24.94177528
+257,22.33546227
+258,22.32323763
+259,24.38075647
+260,22.40754744
+261,22.61183469
+262,23.28658677
+263,22.98637689
+264,25.46468191
+265,24.14497597
+266,22.97023633
+267,24.37831161
+268,24.86418705
+269,22.61185053
+270,21.70979546
+271,22.09389192
+272,23.25882086
+273,23.56494308
+274,24.13181731
+275,24.28160263
+276,24.43623736
+277,23.24956419
+278,21.76696726
+279,25.14997786
+280,24.67520728
+281,23.40400797
+282,26.24489282
+283,25.05952039
+284,24.53922399
+285,24.89917455
+286,25.13438134
+287,26.05220822
+288,26.94133112
+289,26.02788294
+290,26.65909349
+291,26.0832158
+292,27.39946496
+293,26.57973099
+294,27.49867838
+295,29.89834253
+296,27.78403709
+297,28.92405258
+298,26.58518509
+299,30.91291741
+300,31.73949474
+301,29.25173685
+302,30.3747463
+303,30.59695095
+304,31.50757627
+305,30.97036633
+306,31.27177079
+307,33.43369051
+308,33.9848363
+309,33.31775176
+310,31.69164009
+311,33.07897081
+312,33.10849644
+313,33.29428375
+314,35.60397723
+315,35.33614012
+316,33.95701506
+317,35.16914759
+318,35.92430987
+319,35.81820171
+320,37.36378976
+321,36.74459793
+322,35.27569759
+323,35.9767425
+324,36.17811539
+325,35.68567729
+326,35.54212562
+327,38.78114238
+328,36.46819618
+329,38.07352601
+330,36.56662256
+331,38.1938068
+332,37.42919226
+333,37.44666875
+334,37.16795054
+335,34.97440399
+336,35.6174255
+337,37.37634133
+338,37.26137677
+339,38.09726659
+340,36.04071363
+341,37.07494746
+342,34.4281316
+343,35.1959716
+344,35.26041345
+345,36.9398346
+346,33.58933988
+347,35.00075536
+348,35.97807689
+349,35.66631707
+350,35.44925794
+351,33.69565848
+352,35.38969147
+353,35.96432261
+354,33.6956667
+355,34.05230212
+356,32.70536873
+357,33.91009672
+358,34.45606416
+359,34.97972516
+360,32.36260234
+361,31.69621537
+362,33.02307596
+363,33.94445036
+364,32.2763097
+365,32.06228645
+366,34.25956906
+367,33.61620818
+368,35.00141908
+369,34.47493965
+370,34.31576327
+371,33.24772844
+372,32.95185358
+373,32.55224164
+374,33.06560689
+375,35.2082848
+376,34.50372086
+377,33.54922461
+378,35.46287805
+379,34.68829823
+380,35.04640557
+381,33.48711975
+382,34.03264662
+383,34.43296169
+384,35.7571391
+385,32.58466542
+386,34.44295272
+387,35.43369124
+388,37.7196386
+389,37.55863215
+390,35.11245844
+391,37.36667774
+392,36.41904568
+393,38.11951592
+394,39.351325
+395,38.87795167
+396,38.8144378
+397,38.96059714
+398,39.95536453
+399,39.78580611
+400,40.70319964
+401,41.32804151
+402,42.79937243
+403,38.43432481
+404,42.12051726
+405,42.50068551
+406,43.89812523
+407,42.18632495
+408,43.99716859
+409,43.67726129
+410,42.98072384
+411,43.59181621
+412,44.98283057
+413,42.17674627
+414,46.49541908
+415,45.58212027
+416,42.7202171
+417,45.66108535
+418,45.03844556
+419,44.96618253
+420,45.0371585
+421,46.12237848
+422,46.18891162
+423,46.82075672
+424,47.25058257
+425,45.91853936
+426,46.83241571
+427,47.77383153
+428,48.12984438
+429,46.74042025
+430,46.66834779
+431,47.41473153
+432,46.93101415
+433,48.24438209
+434,47.41007874
+435,46.92607209
+436,46.77346554
+437,47.80447575
+438,45.7000972
+439,46.60252512
+440,45.59290618
+441,47.37025588
+442,46.46333171
+443,46.19762396
+444,47.57763766
+445,46.92624737
+446,46.1536802
+447,45.94947611
+448,46.37457004
+449,44.22344538
+450,43.18937717
+451,44.3387774
+452,45.63204816
+453,43.87816917
+454,43.67301546
+455,42.11959709
+456,43.89387883
+457,44.40734798
+458,42.67367897
+459,43.76501429
+460,44.74698445
+461,43.14500236
+462,42.41214263
+463,44.1631715
+464,41.81378406
+465,43.00929934
+466,42.80360515
+467,44.30252713
+468,42.88123048
+469,43.47049118
+470,44.42168141
+471,42.43276664
+472,44.57582419
+473,43.56138481
+474,43.4549005
+475,43.06396235
+476,43.8737132
+477,42.1428636
+478,43.60856585
+479,44.16778079
+480,42.90474298
+481,44.99882414
+482,43.304605
+483,44.4468626
+484,45.49241923
+485,44.46713555
+486,46.27348465
+487,45.76034556
+488,45.37440079
+489,46.19246701
+490,48.28190231
+491,47.81719203
+492,47.23213374
+493,48.03313818
+494,46.73599653
+495,47.12327054
+496,48.58597108
+497,48.6738899
+498,48.52018743
+499,48.50385022
+500,50.17026668
diff --git a/tensorflow/contrib/timeseries/examples/known_anomaly.py b/tensorflow/contrib/timeseries/examples/known_anomaly.py
new file mode 100644
index 0000000000000000000000000000000000000000..7659dd308a7ee1b70d6688b85e4f6157ddee0540
--- /dev/null
+++ b/tensorflow/contrib/timeseries/examples/known_anomaly.py
@@ -0,0 +1,150 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Example of using an exogenous feature to ignore a known anomaly."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import csv
+from os import path
+
+import numpy as np
+import tensorflow as tf
+
+
+try:
+  import matplotlib  # pylint: disable=g-import-not-at-top
+  matplotlib.use("TkAgg")  # Need Tk for interactive plots.
+  from matplotlib import pyplot  # pylint: disable=g-import-not-at-top
+  HAS_MATPLOTLIB = True
+except ImportError:
+  # Plotting requires matplotlib, but the unit test running this code may
+  # execute in an environment without it (i.e. matplotlib is not a build
+  # dependency). We'd still like to test the TensorFlow-dependent parts of this
+  # example, namely train_and_predict.
+  HAS_MATPLOTLIB = False
+
+_MODULE_PATH = path.dirname(__file__)
+_DATA_FILE = path.join(_MODULE_PATH, "data/changepoints.csv")
+
+
+def train_and_evaluate_exogenous(csv_file_name=_DATA_FILE, train_steps=300):
+  """Training, evaluating, and predicting on a series with changepoints."""
+
+  # Indicate the format of our exogenous feature, in this case a string
+  # representing a boolean value.
+  string_feature = tf.contrib.layers.sparse_column_with_keys(
+      column_name="is_changepoint", keys=["no", "yes"])
+  # Specify the way this feature is presented to the model, here using a one-hot
+  # encoding.
+  one_hot_feature = tf.contrib.layers.one_hot_column(
+      sparse_id_column=string_feature)
+
+  estimator = tf.contrib.timeseries.StructuralEnsembleRegressor(
+      periodicities=12,
+      # Extract a smooth period by constraining the number of latent values
+      # being cycled between.
+      cycle_num_latent_values=3,
+      num_features=1,
+      exogenous_feature_columns=[one_hot_feature],
+      # Make exogenous updates sparse by setting an update condition. This in
+      # effect allows missing exogenous features: if the condition evaluates to
+      # False, no update is performed. Otherwise we sometimes end up with
+      # "leaky" updates which add unnecessary uncertainty to the model even when
+      # there is no changepoint.
+      exogenous_update_condition=
+      lambda times, features: tf.equal(features["is_changepoint"], "yes"))
+  reader = tf.contrib.timeseries.CSVReader(
+      csv_file_name,
+      # Indicate the format of our CSV file. First we have two standard columns,
+      # one for times and one for values. The third column is a custom exogenous
+      # feature indicating whether each timestep is a changepoint. The
+      # changepoint feature name must match the string_feature column name
+      # above.
+      column_names=(tf.contrib.timeseries.TrainEvalFeatures.TIMES,
+                    tf.contrib.timeseries.TrainEvalFeatures.VALUES,
+                    "is_changepoint"),
+      # Indicate dtypes for our features.
+      column_dtypes=(tf.int64, tf.float32, tf.string),
+      # This CSV has a header line; here we just ignore it.
+      skip_header_lines=1)
+  train_input_fn = tf.contrib.timeseries.RandomWindowInputFn(
+      # Use truncated backpropagation with a window size of 64, batching
+      # together 4 of these windows (random offsets) per training step. Training
+      # with exogenous features often requires somewhat larger windows.
+      reader, batch_size=4, window_size=64)
+  estimator.train(input_fn=train_input_fn, steps=train_steps)
+  evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader)
+  evaluation = estimator.evaluate(input_fn=evaluation_input_fn, steps=1)
+  # Create an input_fn for prediction, with a simulated changepoint. Since all
+  # of the anomalies in the training data are explained by the exogenous
+  # feature, we should get relatively confident predictions before the indicated
+  # changepoint (since we are telling the model that no changepoint exists at
+  # those times) and relatively uncertain predictions after.
+  (predictions,) = tuple(estimator.predict(
+      input_fn=tf.contrib.timeseries.predict_continuation_input_fn(
+          evaluation, steps=100,
+          exogenous_features={
+              "is_changepoint": [["no"] * 49 + ["yes"] + ["no"] * 50]})))
+  times = evaluation["times"][0]
+  observed = evaluation["observed"][0, :, 0]
+  mean = np.squeeze(np.concatenate(
+      [evaluation["mean"][0], predictions["mean"]], axis=0))
+  variance = np.squeeze(np.concatenate(
+      [evaluation["covariance"][0], predictions["covariance"]], axis=0))
+  all_times = np.concatenate([times, predictions["times"]], axis=0)
+  upper_limit = mean + np.sqrt(variance)
+  lower_limit = mean - np.sqrt(variance)
+  # Indicate the locations of the changepoints for plotting vertical lines.
+  anomaly_locations = []
+  with open(csv_file_name, "r") as csv_file:
+    csv_reader = csv.DictReader(csv_file)
+    for row in csv_reader:
+      if row["is_changepoint"] == "yes":
+        anomaly_locations.append(int(row["time"]))
+  anomaly_locations.append(predictions["times"][49])
+  return (times, observed, all_times, mean, upper_limit, lower_limit,
+          anomaly_locations)
+
+
+def make_plot(name, training_times, observed, all_times, mean,
+              upper_limit, lower_limit, anomaly_locations):
+  """Plot the time series and anomalies in a new figure."""
+  pyplot.figure()
+  pyplot.plot(training_times, observed, "b", label="training series")
+  pyplot.plot(all_times, mean, "r", label="forecast")
+  pyplot.axvline(anomaly_locations[0], linestyle="dotted", label="changepoints")
+  for anomaly_location in anomaly_locations[1:]:
+    pyplot.axvline(anomaly_location, linestyle="dotted")
+  pyplot.fill_between(all_times, lower_limit, upper_limit, color="grey",
+                      alpha="0.2")
+  pyplot.axvline(training_times[-1], color="k", linestyle="--")
+  pyplot.xlabel("time")
+  pyplot.ylabel("observations")
+  pyplot.legend(loc=0)
+  pyplot.title(name)
+
+
+def main(unused_argv):
+  if not HAS_MATPLOTLIB:
+    raise ImportError(
+        "Please install matplotlib to generate a plot from this example.")
+  make_plot("Ignoring a known anomaly", *train_and_evaluate_exogenous())
+  pyplot.show()
+
+
+if __name__ == "__main__":
+  tf.app.run(main=main)
diff --git a/tensorflow/contrib/timeseries/examples/known_anomaly_test.py b/tensorflow/contrib/timeseries/examples/known_anomaly_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3e307cad815d3c9c8556d0349d366d6f938101a
--- /dev/null
+++ b/tensorflow/contrib/timeseries/examples/known_anomaly_test.py
@@ -0,0 +1,49 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests that the TensorFlow parts of the known anomaly example run."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.timeseries.examples import known_anomaly
+
+from tensorflow.python.platform import test
+
+
+class KnownAnaomalyExampleTest(test.TestCase):
+
+  def test_shapes_and_variance_structural(self):
+    (times, observed, all_times, mean, upper_limit, lower_limit,
+     anomaly_locations) = known_anomaly.train_and_evaluate_exogenous(
+         train_steps=50)
+    self.assertAllEqual(
+        anomaly_locations,
+        [25, 50, 75, 100, 125, 150, 175, 249])
+    self.assertAllEqual([200], times.shape)
+    self.assertAllEqual([200], observed.shape)
+    self.assertAllEqual([300], all_times.shape)
+    self.assertAllEqual([300], mean.shape)
+    self.assertAllEqual([300], upper_limit.shape)
+    self.assertAllEqual([300], lower_limit.shape)
+    # Check that initial predictions are relatively confident.
+    self.assertLess(upper_limit[210] - lower_limit[210],
+                    3.0 * (upper_limit[200] - lower_limit[200]))
+    # Check that post-changepoint predictions are less confident
+    self.assertGreater(upper_limit[290] - lower_limit[290],
+                       3.0 * (upper_limit[240] - lower_limit[240]))
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/timeseries/examples/lstm.py b/tensorflow/contrib/timeseries/examples/lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bab06f56c859705597027369147643a43ce01c0
--- /dev/null
+++ b/tensorflow/contrib/timeseries/examples/lstm.py
@@ -0,0 +1,219 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A more advanced example, of building an RNN-based time series model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from os import path
+
+import numpy
+import tensorflow as tf
+
+from tensorflow.contrib.timeseries.python.timeseries import estimators as ts_estimators
+from tensorflow.contrib.timeseries.python.timeseries import model as ts_model
+
+try:
+  import matplotlib  # pylint: disable=g-import-not-at-top
+  matplotlib.use("TkAgg")  # Need Tk for interactive plots.
+  from matplotlib import pyplot  # pylint: disable=g-import-not-at-top
+  HAS_MATPLOTLIB = True
+except ImportError:
+  # Plotting requires matplotlib, but the unit test running this code may
+  # execute in an environment without it (i.e. matplotlib is not a build
+  # dependency). We'd still like to test the TensorFlow-dependent parts of this
+  # example.
+  HAS_MATPLOTLIB = False
+
+_MODULE_PATH = path.dirname(__file__)
+_DATA_FILE = path.join(_MODULE_PATH, "data/multivariate_periods.csv")
+
+
+class _LSTMModel(ts_model.SequentialTimeSeriesModel):
+  """A time series model-building example using an RNNCell."""
+
+  def __init__(self, num_units, num_features, dtype=tf.float32):
+    """Initialize/configure the model object.
+
+    Note that we do not start graph building here. Rather, this object is a
+    configurable factory for TensorFlow graphs which are run by an Estimator.
+
+    Args:
+      num_units: The number of units in the model's LSTMCell.
+      num_features: The dimensionality of the time series (features per
+        timestep).
+      dtype: The floating point data type to use.
+    """
+    super(_LSTMModel, self).__init__(
+        # Pre-register the metrics we'll be outputting (just a mean here).
+        train_output_names=["mean"],
+        predict_output_names=["mean"],
+        num_features=num_features,
+        dtype=dtype)
+    self._num_units = num_units
+    # Filled in by initialize_graph()
+    self._lstm_cell = None
+    self._lstm_cell_run = None
+    self._predict_from_lstm_output = None
+
+  def initialize_graph(self, input_statistics):
+    """Save templates for components, which can then be used repeatedly.
+
+    This method is called every time a new graph is created. It's safe to start
+    adding ops to the current default graph here, but the graph should be
+    constructed from scratch.
+
+    Args:
+      input_statistics: A math_utils.InputStatistics object.
+    """
+    super(_LSTMModel, self).initialize_graph(input_statistics=input_statistics)
+    self._lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=self._num_units)
+    # Create templates so we don't have to worry about variable reuse.
+    self._lstm_cell_run = tf.make_template(
+        name_="lstm_cell",
+        func_=self._lstm_cell,
+        create_scope_now_=True)
+    # Transforms LSTM output into mean predictions.
+    self._predict_from_lstm_output = tf.make_template(
+        name_="predict_from_lstm_output",
+        func_=
+        lambda inputs: tf.layers.dense(inputs=inputs, units=self.num_features),
+        create_scope_now_=True)
+
+  def get_start_state(self):
+    """Return initial state for the time series model."""
+    return (
+        # Keeps track of the time associated with this state for error checking.
+        tf.zeros([], dtype=tf.int64),
+        # The previous observation or prediction.
+        tf.zeros([self.num_features], dtype=self.dtype),
+        # The state of the RNNCell (batch dimension removed since this parent
+        # class will broadcast).
+        [tf.squeeze(state_element, axis=0)
+         for state_element
+         in self._lstm_cell.zero_state(batch_size=1, dtype=self.dtype)])
+
+  def _transform(self, data):
+    """Normalize data based on input statistics to encourage stable training."""
+    mean, variance = self._input_statistics.overall_feature_moments
+    return (data - mean) / variance
+
+  def _de_transform(self, data):
+    """Transform data back to the input scale."""
+    mean, variance = self._input_statistics.overall_feature_moments
+    return data * variance + mean
+
+  def _filtering_step(self, current_times, current_values, state, predictions):
+    """Update model state based on observations.
+
+    Note that we don't do much here aside from computing a loss. In this case
+    it's easier to update the RNN state in _prediction_step, since that covers
+    running the RNN both on observations (from this method) and our own
+    predictions. This distinction can be important for probabilistic models,
+    where repeatedly predicting without filtering should lead to low-confidence
+    predictions.
+
+    Args:
+      current_times: A [batch size] integer Tensor.
+      current_values: A [batch size, self.num_features] floating point Tensor
+        with new observations.
+      state: The model's state tuple.
+      predictions: The output of the previous `_prediction_step`.
+    Returns:
+      A tuple of new state and a predictions dictionary updated to include a
+      loss (note that we could also return other measures of goodness of fit,
+      although only "loss" will be optimized).
+    """
+    state_from_time, prediction, lstm_state = state
+    with tf.control_dependencies(
+        [tf.assert_equal(current_times, state_from_time)]):
+      transformed_values = self._transform(current_values)
+      # Use mean squared error across features for the loss.
+      predictions["loss"] = tf.reduce_mean(
+          (prediction - transformed_values) ** 2, axis=-1)
+      # Keep track of the new observation in model state. It won't be run
+      # through the LSTM until the next _imputation_step.
+      new_state_tuple = (current_times, transformed_values, lstm_state)
+    return (new_state_tuple, predictions)
+
+  def _prediction_step(self, current_times, state):
+    """Advance the RNN state using a previous observation or prediction."""
+    _, previous_observation_or_prediction, lstm_state = state
+    lstm_output, new_lstm_state = self._lstm_cell_run(
+        inputs=previous_observation_or_prediction, state=lstm_state)
+    next_prediction = self._predict_from_lstm_output(lstm_output)
+    new_state_tuple = (current_times, next_prediction, new_lstm_state)
+    return new_state_tuple, {"mean": self._de_transform(next_prediction)}
+
+  def _imputation_step(self, current_times, state):
+    """Advance model state across a gap."""
+    # Does not do anything special if we're jumping across a gap. More advanced
+    # models, especially probabilistic ones, would want a special case that
+    # depends on the gap size.
+    return state
+
+  def _exogenous_input_step(
+      self, current_times, current_exogenous_regressors, state):
+    """Update model state based on exogenous regressors."""
+    raise NotImplementedError(
+        "Exogenous inputs are not implemented for this example.")
+
+
+def train_and_predict(csv_file_name=_DATA_FILE, training_steps=200):
+  """Train and predict using a custom time series model."""
+  # Construct an Estimator from our LSTM model.
+  estimator = ts_estimators.TimeSeriesRegressor(
+      model=_LSTMModel(num_features=5, num_units=128),
+      optimizer=tf.train.AdamOptimizer(0.001))
+  reader = tf.contrib.timeseries.CSVReader(
+      csv_file_name,
+      column_names=((tf.contrib.timeseries.TrainEvalFeatures.TIMES,)
+                    + (tf.contrib.timeseries.TrainEvalFeatures.VALUES,) * 5))
+  train_input_fn = tf.contrib.timeseries.RandomWindowInputFn(
+      reader, batch_size=4, window_size=32)
+  estimator.train(input_fn=train_input_fn, steps=training_steps)
+  evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader)
+  evaluation = estimator.evaluate(input_fn=evaluation_input_fn, steps=1)
+  # Predict starting after the evaluation
+  (predictions,) = tuple(estimator.predict(
+      input_fn=tf.contrib.timeseries.predict_continuation_input_fn(
+          evaluation, steps=100)))
+  times = evaluation["times"][0]
+  observed = evaluation["observed"][0, :, :]
+  predicted_mean = numpy.squeeze(numpy.concatenate(
+      [evaluation["mean"][0], predictions["mean"]], axis=0))
+  all_times = numpy.concatenate([times, predictions["times"]], axis=0)
+  return times, observed, all_times, predicted_mean
+
+
+def main(unused_argv):
+  if not HAS_MATPLOTLIB:
+    raise ImportError(
+        "Please install matplotlib to generate a plot from this example.")
+  (observed_times, observations,
+   all_times, predictions) = train_and_predict()
+  pyplot.axvline(99, linestyle="dotted")
+  observed_lines = pyplot.plot(
+      observed_times, observations, label="Observed", color="k")
+  predicted_lines = pyplot.plot(
+      all_times, predictions, label="Predicted", color="b")
+  pyplot.legend(handles=[observed_lines[0], predicted_lines[0]],
+                loc="upper left")
+  pyplot.show()
+
+
+if __name__ == "__main__":
+  tf.app.run(main=main)
diff --git a/tensorflow/contrib/timeseries/examples/lstm_test.py b/tensorflow/contrib/timeseries/examples/lstm_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..138e64dbd82316436d713b9253532835326d328e
--- /dev/null
+++ b/tensorflow/contrib/timeseries/examples/lstm_test.py
@@ -0,0 +1,42 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests that the TensorFlow parts of the LSTM example run."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.timeseries.examples import lstm
+
+from tensorflow.python.platform import test
+
+
+class LSTMExampleTest(test.TestCase):
+
+  def test_periodicity_learned(self):
+    (observed_times, observed_values,
+     all_times, predicted_values) = lstm.train_and_predict(training_steps=100)
+    self.assertAllEqual([100], observed_times.shape)
+    self.assertAllEqual([100, 5], observed_values.shape)
+    self.assertAllEqual([200], all_times.shape)
+    self.assertAllEqual([200, 5], predicted_values.shape)
+    self.assertGreater(
+        predicted_values[100, 4]
+        - predicted_values[115, 4],  # Amplitude of fifth component
+        0.4)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/timeseries/examples/multivariate.py b/tensorflow/contrib/timeseries/examples/multivariate.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed799542fd50cd150f13533c5f33bd67ed09fff6
--- /dev/null
+++ b/tensorflow/contrib/timeseries/examples/multivariate.py
@@ -0,0 +1,117 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A multivariate TFTS example.
+
+Fits a multivariate model, exports it, and visualizes the learned correlations
+by iteratively predicting and sampling from the predictions.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from os import path
+import tempfile
+
+import numpy
+import tensorflow as tf
+
+try:
+  import matplotlib  # pylint: disable=g-import-not-at-top
+  matplotlib.use("TkAgg")  # Need Tk for interactive plots.
+  from matplotlib import pyplot  # pylint: disable=g-import-not-at-top
+  HAS_MATPLOTLIB = True
+except ImportError:
+  # Plotting requires matplotlib, but the unit test running this code may
+  # execute in an environment without it (i.e. matplotlib is not a build
+  # dependency). We'd still like to test the TensorFlow-dependent parts of this
+  # example, namely train_and_predict.
+  HAS_MATPLOTLIB = False
+
+_MODULE_PATH = path.dirname(__file__)
+_DATA_FILE = path.join(_MODULE_PATH, "data/multivariate_level.csv")
+
+
+def multivariate_train_and_sample(
+    csv_file_name=_DATA_FILE, export_directory=None, training_steps=500):
+  """Trains, evaluates, and exports a multivariate model."""
+  estimator = tf.contrib.timeseries.StructuralEnsembleRegressor(
+      periodicities=[], num_features=5)
+  reader = tf.contrib.timeseries.CSVReader(
+      csv_file_name,
+      column_names=((tf.contrib.timeseries.TrainEvalFeatures.TIMES,)
+                    + (tf.contrib.timeseries.TrainEvalFeatures.VALUES,) * 5))
+  train_input_fn = tf.contrib.timeseries.RandomWindowInputFn(
+      # Larger window sizes generally produce a better covariance matrix.
+      reader, batch_size=4, window_size=64)
+  estimator.train(input_fn=train_input_fn, steps=training_steps)
+  evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader)
+  current_state = estimator.evaluate(input_fn=evaluation_input_fn, steps=1)
+  values = [current_state["observed"]]
+  times = [current_state[tf.contrib.timeseries.FilteringResults.TIMES]]
+  # Export the model so we can do iterative prediction and filtering without
+  # reloading model checkpoints.
+  if export_directory is None:
+    export_directory = tempfile.mkdtemp()
+  input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
+  export_location = estimator.export_savedmodel(
+      export_directory, input_receiver_fn)
+  with tf.Graph().as_default():
+    numpy.random.seed(1)  # Make the example a bit more deterministic
+    with tf.Session() as session:
+      signatures = tf.saved_model.loader.load(
+          session, [tf.saved_model.tag_constants.SERVING], export_location)
+      for _ in range(100):
+        current_prediction = (
+            tf.contrib.timeseries.saved_model_utils.predict_continuation(
+                continue_from=current_state, signatures=signatures,
+                session=session, steps=1))
+        next_sample = numpy.random.multivariate_normal(
+            # Squeeze out the batch and series length dimensions (both 1).
+            mean=numpy.squeeze(current_prediction["mean"], axis=[0, 1]),
+            cov=numpy.squeeze(current_prediction["covariance"], axis=[0, 1]))
+        # Update model state so that future predictions are conditional on the
+        # value we just sampled.
+        filtering_features = {
+            tf.contrib.timeseries.TrainEvalFeatures.TIMES: current_prediction[
+                tf.contrib.timeseries.FilteringResults.TIMES],
+            tf.contrib.timeseries.TrainEvalFeatures.VALUES: next_sample[
+                None, None, :]}
+        current_state = (
+            tf.contrib.timeseries.saved_model_utils.filter_continuation(
+                continue_from=current_state,
+                session=session,
+                signatures=signatures,
+                features=filtering_features))
+        values.append(next_sample[None, None, :])
+        times.append(current_state["times"])
+  all_observations = numpy.squeeze(numpy.concatenate(values, axis=1), axis=0)
+  all_times = numpy.squeeze(numpy.concatenate(times, axis=1), axis=0)
+  return all_times, all_observations
+
+
+def main(unused_argv):
+  if not HAS_MATPLOTLIB:
+    raise ImportError(
+        "Please install matplotlib to generate a plot from this example.")
+  all_times, all_observations = multivariate_train_and_sample()
+  # Show where sampling starts on the plot
+  pyplot.axvline(1000, linestyle="dotted")
+  pyplot.plot(all_times, all_observations)
+  pyplot.show()
+
+
+if __name__ == "__main__":
+  tf.app.run(main=main)
diff --git a/tensorflow/contrib/timeseries/examples/multivariate_test.py b/tensorflow/contrib/timeseries/examples/multivariate_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b8ee0606e4733c91c111c9a2bbe20a6bb1080cf
--- /dev/null
+++ b/tensorflow/contrib/timeseries/examples/multivariate_test.py
@@ -0,0 +1,36 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests that the TensorFlow parts of the multivariate example run."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.timeseries.examples import multivariate
+
+from tensorflow.python.platform import test
+
+
+class MultivariateExampleTest(test.TestCase):
+
+  def test_shapes_structural(self):
+    times, values = multivariate.multivariate_train_and_sample(
+        export_directory=self.get_temp_dir(), training_steps=5)
+    self.assertAllEqual([1100], times.shape)
+    self.assertAllEqual([1100, 5], values.shape)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/timeseries/examples/predict.py b/tensorflow/contrib/timeseries/examples/predict.py
new file mode 100644
index 0000000000000000000000000000000000000000..8147d40caa521533e8eb68f2175fdc3ec2125436
--- /dev/null
+++ b/tensorflow/contrib/timeseries/examples/predict.py
@@ -0,0 +1,132 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""An example of training and predicting with a TFTS estimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import sys
+
+import numpy as np
+import tensorflow as tf
+
+
+try:
+  import matplotlib  # pylint: disable=g-import-not-at-top
+  matplotlib.use("TkAgg")  # Need Tk for interactive plots.
+  from matplotlib import pyplot  # pylint: disable=g-import-not-at-top
+  HAS_MATPLOTLIB = True
+except ImportError:
+  # Plotting requires matplotlib, but the unit test running this code may
+  # execute in an environment without it (i.e. matplotlib is not a build
+  # dependency). We'd still like to test the TensorFlow-dependent parts of this
+  # example, namely train_and_predict.
+  HAS_MATPLOTLIB = False
+
+FLAGS = None
+
+
+def structural_ensemble_train_and_predict(csv_file_name):
+  # Cycle between 5 latent values over a period of 100. This leads to a very
+  # smooth periodic component (and a small model), which is a good fit for our
+  # example data. Modeling high-frequency periodic variations will require a
+  # higher cycle_num_latent_values.
+  structural = tf.contrib.timeseries.StructuralEnsembleRegressor(
+      periodicities=100, num_features=1, cycle_num_latent_values=5)
+  return train_and_predict(structural, csv_file_name, training_steps=150)
+
+
+def ar_train_and_predict(csv_file_name):
+  # An autoregressive model, with periodicity handled as a time-based
+  # regression. Note that this requires windows of size 16 (input_window_size +
+  # output_window_size) for training.
+  ar = tf.contrib.timeseries.ARRegressor(
+      periodicities=100, input_window_size=10, output_window_size=6,
+      num_features=1,
+      # Use the (default) normal likelihood loss to adaptively fit the
+      # variance. SQUARED_LOSS overestimates variance when there are trends in
+      # the series.
+      loss=tf.contrib.timeseries.ARModel.NORMAL_LIKELIHOOD_LOSS)
+  return train_and_predict(ar, csv_file_name, training_steps=600)
+
+
+def train_and_predict(estimator, csv_file_name, training_steps):
+  """A simple example of training and predicting."""
+  # Read data in the default "time,value" CSV format with no header
+  reader = tf.contrib.timeseries.CSVReader(csv_file_name)
+  # Set up windowing and batching for training
+  train_input_fn = tf.contrib.timeseries.RandomWindowInputFn(
+      reader, batch_size=16, window_size=16)
+  # Fit model parameters to data
+  estimator.train(input_fn=train_input_fn, steps=training_steps)
+  # Evaluate on the full dataset sequentially, collecting in-sample predictions
+  # for a qualitative evaluation. Note that this loads the whole dataset into
+  # memory. For quantitative evaluation, use RandomWindowChunker.
+  evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader)
+  evaluation = estimator.evaluate(input_fn=evaluation_input_fn, steps=1)
+  # Predict starting after the evaluation
+  (predictions,) = tuple(estimator.predict(
+      input_fn=tf.contrib.timeseries.predict_continuation_input_fn(
+          evaluation, steps=200)))
+  times = evaluation["times"][0]
+  observed = evaluation["observed"][0, :, 0]
+  mean = np.squeeze(np.concatenate(
+      [evaluation["mean"][0], predictions["mean"]], axis=0))
+  variance = np.squeeze(np.concatenate(
+      [evaluation["covariance"][0], predictions["covariance"]], axis=0))
+  all_times = np.concatenate([times, predictions["times"]], axis=0)
+  upper_limit = mean + np.sqrt(variance)
+  lower_limit = mean - np.sqrt(variance)
+  return times, observed, all_times, mean, upper_limit, lower_limit
+
+
+def make_plot(name, training_times, observed, all_times, mean,
+              upper_limit, lower_limit):
+  """Plot a time series in a new figure."""
+  pyplot.figure()
+  pyplot.plot(training_times, observed, "b", label="training series")
+  pyplot.plot(all_times, mean, "r", label="forecast")
+  pyplot.plot(all_times, upper_limit, "g", label="forecast upper bound")
+  pyplot.plot(all_times, lower_limit, "g", label="forecast lower bound")
+  pyplot.fill_between(all_times, lower_limit, upper_limit, color="grey",
+                      alpha="0.2")
+  pyplot.axvline(training_times[-1], color="k", linestyle="--")
+  pyplot.xlabel("time")
+  pyplot.ylabel("observations")
+  pyplot.legend(loc=0)
+  pyplot.title(name)
+
+
+def main(unused_argv):
+  if not HAS_MATPLOTLIB:
+    raise ImportError(
+        "Please install matplotlib to generate a plot from this example.")
+  make_plot("Structural ensemble",
+            *structural_ensemble_train_and_predict(FLAGS.input_filename))
+  make_plot("AR", *ar_train_and_predict(FLAGS.input_filename))
+  pyplot.show()
+
+
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      "--input_filename",
+      type=str,
+      required=True,
+      help="Input csv file.")
+  FLAGS, unparsed = parser.parse_known_args()
+  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/timeseries/examples/predict_test.py b/tensorflow/contrib/timeseries/examples/predict_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..678fd71cd8b94ee0be46e10a9a673de55bd44215
--- /dev/null
+++ b/tensorflow/contrib/timeseries/examples/predict_test.py
@@ -0,0 +1,62 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests that the TensorFlow parts of the prediction example run."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from os import path
+
+from tensorflow.contrib.timeseries.examples import predict
+
+from tensorflow.python.platform import test
+
+
+_MODULE_PATH = path.dirname(__file__)
+_DATA_FILE = path.join(_MODULE_PATH, "data/period_trend.csv")
+
+
+class PeriodTrendExampleTest(test.TestCase):
+
+  def test_shapes_and_variance_structural(self):
+    (times, observed, all_times, mean, upper_limit, lower_limit
+    ) = predict.structural_ensemble_train_and_predict(_DATA_FILE)
+    # Just check that plotting will probably be OK. We can't actually run the
+    # plotting code since we don't want to pull in matplotlib as a dependency
+    # for this test.
+    self.assertAllEqual([500], times.shape)
+    self.assertAllEqual([500], observed.shape)
+    self.assertAllEqual([700], all_times.shape)
+    self.assertAllEqual([700], mean.shape)
+    self.assertAllEqual([700], upper_limit.shape)
+    self.assertAllEqual([700], lower_limit.shape)
+    # Check that variance hasn't blown up too much. This is a relatively good
+    # indication that training was successful.
+    self.assertLess(upper_limit[-1] - lower_limit[-1],
+                    1.5 * (upper_limit[0] - lower_limit[0]))
+
+  def test_ar(self):
+    (times, observed, all_times, mean,
+     upper_limit, lower_limit) = predict.ar_train_and_predict(_DATA_FILE)
+    self.assertAllEqual(times.shape, observed.shape)
+    self.assertAllEqual(all_times.shape, mean.shape)
+    self.assertAllEqual(all_times.shape, upper_limit.shape)
+    self.assertAllEqual(all_times.shape, lower_limit.shape)
+    self.assertLess((upper_limit - lower_limit).mean(), 4.)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/timeseries/python/__init__.py b/tensorflow/contrib/timeseries/python/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5072feab59d54d1ac22ce9f517f962fb92f92f62
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/__init__.py
@@ -0,0 +1,23 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A time series library in TensorFlow (TFTS)."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import
+from tensorflow.contrib.timeseries.python.timeseries import *
+# pylint: enable=wildcard-import
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3726e4b8890bb96f788cc529958d5150a90f315d
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -0,0 +1,416 @@
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "py_init",
+    srcs = [
+        "__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ar_model",
+        ":estimators",
+        ":feature_keys",
+        ":input_pipeline",
+        ":saved_model_utils",
+    ],
+)
+
+py_library(
+    name = "feature_keys",
+    srcs = [
+        "feature_keys.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = ["//tensorflow/python/saved_model:signature_constants"],
+)
+
+py_library(
+    name = "saved_model_utils",
+    srcs = [
+        "saved_model_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":feature_keys",
+        ":input_pipeline",
+        ":model_utils",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "model",
+    srcs = [
+        "model.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":feature_keys",
+        ":math_utils",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+    ],
+)
+
+py_library(
+    name = "estimators",
+    srcs = [
+        "estimators.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ar_model",
+        ":feature_keys",
+        ":math_utils",
+        ":model_utils",
+        ":state_management",
+        "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:filtering_postprocessor",
+        "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:state_space_model",
+        "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:structural_ensemble",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:training",
+        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/estimator:export",
+    ],
+)
+
+py_test(
+    name = "estimators_test",
+    timeout = "long",
+    srcs = [
+        "estimators_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip_gpu",  # b/63391119
+    ],
+    deps = [
+        ":ar_model",
+        ":estimators",
+        ":feature_keys",
+        ":input_pipeline",
+        ":saved_model_utils",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/saved_model:loader",
+        "//tensorflow/python/saved_model:tag_constants",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "model_utils",
+    srcs = [
+        "model_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":feature_keys",
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/estimator:export",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "model_utils_test",
+    srcs = [
+        "model_utils_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip_gpu",  # b/63391119
+    ],
+    deps = [
+        ":feature_keys",
+        ":model",
+        ":model_utils",
+        ":state_management",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:metrics",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:estimator_py",
+    ],
+)
+
+py_library(
+    name = "state_management",
+    srcs = [
+        "state_management.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":feature_keys",
+        ":math_utils",
+        ":model",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python/estimator:estimator_py",
+    ],
+)
+
+py_test(
+    name = "state_management_test",
+    srcs = [
+        "state_management_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip_gpu",  # b/63391119
+    ],
+    deps = [
+        ":feature_keys",
+        ":input_pipeline",
+        ":math_utils",
+        ":model",
+        ":state_management",
+        ":test_utils",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:estimator_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "input_pipeline",
+    srcs = [
+        "input_pipeline.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":feature_keys",
+        ":model_utils",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/estimator:estimator_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "input_pipeline_test",
+    srcs = [
+        "input_pipeline_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip_gpu",  # b/63391119
+    ],
+    deps = [
+        ":feature_keys",
+        ":input_pipeline",
+        ":test_utils",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "test_utils",
+    srcs = [
+        "test_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":estimators",
+        ":feature_keys",
+        ":input_pipeline",
+        ":state_management",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:estimator_py",
+    ],
+)
+
+py_library(
+    name = "ar_model",
+    srcs = [
+        "ar_model.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":feature_keys",
+        ":model",
+        ":model_utils",
+        "//tensorflow/contrib/distributions:distributions_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/estimator:estimator_py",
+    ],
+)
+
+py_test(
+    name = "ar_model_test",
+    timeout = "long",  # Moderate but for asan
+    srcs = [
+        "ar_model_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ar_model",
+        ":estimators",
+        ":feature_keys",
+        ":input_pipeline",
+        ":test_utils",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:estimator_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "math_utils",
+    srcs = [
+        "math_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":feature_keys",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/lookup:lookup_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+    ],
+)
+
+py_test(
+    name = "math_utils_test",
+    srcs = [
+        "math_utils_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip_gpu",  # b/63391119
+    ],
+    deps = [
+        ":feature_keys",
+        ":input_pipeline",
+        ":math_utils",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/__init__.py b/tensorflow/contrib/timeseries/python/timeseries/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c683dad71de8f8502f08a4e823faa79d60d5604d
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A time series library in TensorFlow (TFTS)."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.timeseries.python.timeseries import saved_model_utils
+
+# pylint: disable=wildcard-import
+from tensorflow.contrib.timeseries.python.timeseries.ar_model import *
+from tensorflow.contrib.timeseries.python.timeseries.estimators import *
+from tensorflow.contrib.timeseries.python.timeseries.feature_keys import *
+from tensorflow.contrib.timeseries.python.timeseries.input_pipeline import *
+# pylint: enable=wildcard-import
diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..7452dc7dc362b304ca3b3717bad039df17012e5c
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
@@ -0,0 +1,706 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Auto-Regressive models for time series data."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib import distributions
+
+from tensorflow.contrib.timeseries.python.timeseries import model
+from tensorflow.contrib.timeseries.python.timeseries import model_utils
+from tensorflow.contrib.timeseries.python.timeseries.feature_keys import PredictionFeatures
+from tensorflow.contrib.timeseries.python.timeseries.feature_keys import TrainEvalFeatures
+
+from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import variable_scope
+
+
+class ARModel(model.TimeSeriesModel):
+  """Auto-regressive model, both linear and non-linear.
+
+  Features to the model include time and values of input_window_size timesteps,
+  and times for output_window_size timesteps. These are passed through zero or
+  more hidden layers, and then fed to a loss function (e.g. squared loss).
+
+  Note that this class can also be used to regress against time only by setting
+  the input_window_size to zero.
+  """
+  SQUARED_LOSS = "squared_loss"
+  NORMAL_LIKELIHOOD_LOSS = "normal_likelihood_loss"
+
+  def __init__(self,
+               periodicities,
+               input_window_size,
+               output_window_size,
+               num_features,
+               num_time_buckets=10,
+               loss=NORMAL_LIKELIHOOD_LOSS,
+               hidden_layer_sizes=None):
+    """Constructs an auto-regressive model.
+
+    Args:
+      periodicities: periodicities of the input data, in the same units as the
+        time feature. Note this can be a single value or a list of values for
+        multiple periodicities.
+      input_window_size: Number of past time steps of data to look at when doing
+        the regression.
+      output_window_size: Number of future time steps to predict. Note that
+        setting it to > 1 empiricaly seems to give a better fit.
+      num_features: number of input features per time step.
+      num_time_buckets: Number of buckets into which to divide (time %
+        periodicity) for generating time based features.
+      loss: Loss function to use for training. Currently supported values are
+        SQUARED_LOSS and NORMAL_LIKELIHOOD_LOSS. Note that for
+        NORMAL_LIKELIHOOD_LOSS, we train the covariance term as well. For
+        SQUARED_LOSS, the evaluation loss is reported based on un-scaled
+        observations and predictions, while the training loss is computed on
+        normalized data (if input statistics are available).
+      hidden_layer_sizes: list of sizes of hidden layers.
+    """
+    self.input_window_size = input_window_size
+    self.output_window_size = output_window_size
+    if hidden_layer_sizes is None:
+      hidden_layer_sizes = []
+    self.hidden_layer_sizes = hidden_layer_sizes
+    self.window_size = self.input_window_size + self.output_window_size
+    self.loss = loss
+    self.stats_means = None
+    self.stats_sigmas = None
+    super(ARModel, self).__init__(
+        num_features=num_features)
+    assert num_time_buckets > 0
+    self._buckets = int(num_time_buckets)
+    if periodicities is None or not periodicities:
+      periodicities = []
+    elif (not isinstance(periodicities, list) and
+          not isinstance(periodicities, tuple)):
+      periodicities = [periodicities]
+    self._periods = [int(p) for p in periodicities]
+    for p in self._periods:
+      assert p > 0
+    assert len(self._periods) or self.input_window_size
+    assert output_window_size > 0
+
+  def scale_data(self, data):
+    """Scale data according to stats."""
+    if self._input_statistics is not None:
+      return (data - self.stats_means) / self.stats_sigmas
+    else:
+      return data
+
+  def scale_back_data(self, data):
+    if self._input_statistics is not None:
+      return (data * self.stats_sigmas) + self.stats_means
+    else:
+      return data
+
+  def scale_back_variance(self, var):
+    if self._input_statistics is not None:
+      return var * self.stats_sigmas * self.stats_sigmas
+    else:
+      return var
+
+  def initialize_graph(self, input_statistics=None):
+    super(ARModel, self).initialize_graph(input_statistics=input_statistics)
+    if self._input_statistics:
+      self.stats_means, variances = (
+          self._input_statistics.overall_feature_moments)
+      self.stats_sigmas = math_ops.sqrt(variances)
+
+  def get_start_state(self):
+    # State which matches the format we'll return later. Typically this will not
+    # be used by the model directly, but the shapes and dtypes should match so
+    # that the serving input_receiver_fn gets placeholder shapes correct.
+    return (array_ops.zeros([self.input_window_size], dtype=dtypes.int64),
+            array_ops.zeros(
+                [self.input_window_size, self.num_features], dtype=self.dtype))
+
+  # TODO(allenl,agarwal): Support sampling for AR.
+  def random_model_parameters(self, seed=None):
+    pass
+
+  def generate(self, number_of_series, series_length,
+               model_parameters=None, seed=None):
+    pass
+
+  def _predicted_covariance_op(self, activations, num_values):
+    activation, activation_size = activations[-1]
+    if self.loss == ARModel.NORMAL_LIKELIHOOD_LOSS:
+      log_sigma_square = model_utils.fully_connected(
+          activation,
+          activation_size,
+          self.output_window_size * num_values,
+          name="log_sigma_square",
+          activation=None)
+      predicted_covariance = gen_math_ops.exp(log_sigma_square)
+      predicted_covariance = array_ops.reshape(
+          predicted_covariance, [-1, self.output_window_size, num_values])
+    else:
+      shape = array_ops.stack([
+          array_ops.shape(activation)[0],
+          constant_op.constant(self.output_window_size),
+          constant_op.constant(num_values)
+      ])
+      predicted_covariance = array_ops.ones(shape=shape, dtype=activation.dtype)
+    return predicted_covariance
+
+  def _predicted_mean_op(self, activations):
+    activation, activation_size = activations[-1]
+    predicted_mean = model_utils.fully_connected(
+        activation,
+        activation_size,
+        self.output_window_size * self.num_features,
+        name="predicted_mean",
+        activation=None)
+    return array_ops.reshape(predicted_mean,
+                             [-1, self.output_window_size, self.num_features])
+
+  def _create_hidden_stack(self, activation, activation_size):
+    activations = []
+    for layer_number, layer_size in enumerate(self.hidden_layer_sizes):
+      # TODO(agarwal): Migrate to fully_connected in tf slim
+      activation = model_utils.fully_connected(
+          activation, activation_size, layer_size,
+          name="layer_{}".format(layer_number))
+      activation_size = layer_size
+      activations.append((activation, activation_size))
+    return activations
+
+  def prediction_ops(self, times, values):
+    """Compute model predictions given input data.
+
+    Args:
+      times: A [batch size, self.window_size] integer Tensor, the first
+          self.input_window_size times in each part of the batch indicating
+          input features, and the last self.output_window_size times indicating
+          prediction times.
+      values: A [batch size, self.input_window_size, self.num_features] Tensor
+          with input features.
+    Returns:
+      Tuple (predicted_mean, predicted_covariance), where each element is a
+      Tensor with shape [batch size, self.output_window_size,
+      self.num_features].
+    """
+    times.get_shape().assert_is_compatible_with([None, self.window_size])
+    activations = []
+    if self.input_window_size:
+      values.get_shape().assert_is_compatible_with(
+          [None, self.input_window_size, self.num_features])
+    # Create input features.
+    if self._periods:
+      _, time_features = self._compute_time_features(times)
+      activation_size = self.window_size * self._buckets * len(self._periods)
+      activation = array_ops.reshape(time_features, [-1, activation_size])
+    else:
+      activation_size = 0
+      activation = None
+
+    if self.input_window_size:
+      inp = array_ops.slice(values, [0, 0, 0], [-1, self.input_window_size, -1])
+      inp_size = self.input_window_size * self.num_features
+      inp = array_ops.reshape(inp, [-1, inp_size])
+      if activation is not None:
+        activation = array_ops.concat([inp, activation], 1)
+      else:
+        activation = inp
+      activation_size += inp_size
+    assert activation_size
+    activations.append((activation, activation_size))
+    # Create hidden layers.
+    activations += self._create_hidden_stack(activation, activation_size)
+    # Create mean and convariance ops.
+    predicted_mean = self._predicted_mean_op(activations)
+    predicted_covariance = self._predicted_covariance_op(activations,
+                                                         self.num_features)
+    return {"activations": activations,
+            "mean": predicted_mean,
+            "covariance": predicted_covariance}
+
+  def loss_op(self, targets, prediction_ops):
+    """Create loss_op."""
+    prediction = prediction_ops["mean"]
+    if self.loss == ARModel.NORMAL_LIKELIHOOD_LOSS:
+      covariance = prediction_ops["covariance"]
+      sigma = math_ops.sqrt(gen_math_ops.maximum(covariance, 1e-5))
+      normal = distributions.Normal(loc=targets, scale=sigma)
+      loss_op = -math_ops.reduce_sum(normal.log_prob(prediction))
+    else:
+      assert self.loss == ARModel.SQUARED_LOSS, self.loss
+      loss_op = math_ops.reduce_sum(math_ops.square(prediction - targets))
+    loss_op /= math_ops.cast(
+        math_ops.reduce_prod(array_ops.shape(targets)), loss_op.dtype)
+    return loss_op
+
+  # TODO(allenl, agarwal): Consider better ways of warm-starting predictions.
+  def predict(self, features):
+    """Computes predictions multiple steps into the future.
+
+    Args:
+      features: A dictionary with the following key/value pairs:
+        PredictionFeatures.TIMES: A [batch size, predict window size]
+          integer Tensor of times, after the window of data indicated by
+          `STATE_TUPLE`, to make predictions for.
+        PredictionFeatures.STATE_TUPLE: A tuple of (times, values), times with
+          shape [batch size, self.input_window_size], values with shape [batch
+          size, self.input_window_size, self.num_features] representing a
+          segment of the time series before `TIMES`. This data is used
+          to start of the autoregressive computation. This should have data for
+          at least self.input_window_size timesteps.
+    Returns:
+      A dictionary with keys, "mean", "covariance". The
+      values are Tensors of shape [batch_size, predict window size,
+      num_features] and correspond to the values passed in `TIMES`.
+    """
+    predict_times = math_ops.cast(
+        ops.convert_to_tensor(features[PredictionFeatures.TIMES]), dtypes.int32)
+    batch_size = array_ops.shape(predict_times)[0]
+    num_predict_values = array_ops.shape(predict_times)[1]
+    prediction_iterations = ((num_predict_values + self.output_window_size - 1)
+                             // self.output_window_size)
+    # Pad predict_times so as to have exact multiple of self.output_window_size
+    # values per example.
+    padding_size = (prediction_iterations * self.output_window_size -
+                    num_predict_values)
+    padding = array_ops.zeros([batch_size, padding_size], predict_times.dtype)
+    predict_times = control_flow_ops.cond(
+        padding_size > 0, lambda: array_ops.concat([predict_times, padding], 1),
+        lambda: predict_times)
+    state = features[PredictionFeatures.STATE_TUPLE]
+    (state_times, state_values) = state
+    state_times = math_ops.cast(
+        ops.convert_to_tensor(state_times), dtypes.int32)
+    state_values = ops.convert_to_tensor(state_values, dtype=self.dtype)
+
+    initial_input_times = predict_times[:, :self.output_window_size]
+    if self.input_window_size > 0:
+      initial_input_times = array_ops.concat(
+          [state_times[:, -self.input_window_size:], initial_input_times], 1)
+      values_size = array_ops.shape(state_values)[1]
+      times_size = array_ops.shape(state_times)[1]
+      with ops.control_dependencies([
+          check_ops.assert_greater_equal(values_size, self.input_window_size),
+          check_ops.assert_equal(values_size, times_size)
+      ]):
+        initial_input_values = state_values[:, -self.input_window_size:, :]
+    else:
+      initial_input_values = 0
+
+    # Iterate over the predict_times, predicting self.output_window_size values
+    # in each iteration.
+    def _while_condition(iteration_number, *unused_args):
+      return math_ops.less(iteration_number, prediction_iterations)
+
+    def _while_body(iteration_number, input_times, input_values,
+                    mean_ta, covariance_ta):
+      """Predict self.output_window_size values."""
+      prediction_ops = self.prediction_ops(input_times, input_values)
+      predicted_mean = prediction_ops["mean"]
+      predicted_covariance = prediction_ops["covariance"]
+      offset = self.output_window_size * gen_math_ops.minimum(
+          iteration_number + 1, prediction_iterations - 1)
+      if self.input_window_size > 0:
+        if self.output_window_size < self.input_window_size:
+          new_input_values = array_ops.concat(
+              [input_values[:, self.output_window_size:, :], predicted_mean], 1)
+          new_input_times = array_ops.concat([
+              input_times[:, self.output_window_size:],
+              predict_times[:, offset:offset + self.output_window_size]
+          ], 1)
+        else:
+          new_input_values = predicted_mean[:, -self.input_window_size:, :]
+          new_input_times = predict_times[
+              :,
+              offset - self.input_window_size:offset + self.output_window_size]
+      else:
+        new_input_values = input_values
+        new_input_times = predict_times[:,
+                                        offset:offset + self.output_window_size]
+      new_input_times.set_shape(initial_input_times.get_shape())
+      new_mean_ta = mean_ta.write(iteration_number, predicted_mean)
+      if isinstance(covariance_ta, tensor_array_ops.TensorArray):
+        new_covariance_ta = covariance_ta.write(iteration_number,
+                                                predicted_covariance)
+      else:
+        new_covariance_ta = covariance_ta
+      return (iteration_number + 1,
+              new_input_times,
+              new_input_values,
+              new_mean_ta,
+              new_covariance_ta)
+
+    # Note that control_flow_ops.while_loop doesn't seem happy with None. Hence
+    # using 0 for cases where we don't want to predict covariance.
+    covariance_ta_init = (tensor_array_ops.TensorArray(
+        dtype=self.dtype, size=prediction_iterations)
+                          if self.loss != ARModel.SQUARED_LOSS else 0.)
+    mean_ta_init = tensor_array_ops.TensorArray(
+        dtype=self.dtype, size=prediction_iterations)
+    _, _, _, mean_ta, covariance_ta = control_flow_ops.while_loop(
+        _while_condition, _while_body, [
+            0, initial_input_times, initial_input_values, mean_ta_init,
+            covariance_ta_init
+        ])
+
+    def _parse_ta(values_ta):
+      """Helper function to parse the returned TensorArrays."""
+
+      if not isinstance(values_ta, tensor_array_ops.TensorArray):
+        return None
+      predictions_length = prediction_iterations * self.output_window_size
+      # Shape [prediction_iterations, batch_size, self.output_window_size,
+      #        self.num_features]
+      values_packed = values_ta.stack()
+      # Transpose to move batch dimension outside.
+      output_values = array_ops.reshape(
+          array_ops.transpose(values_packed, [1, 0, 2, 3]),
+          array_ops.stack([batch_size, predictions_length, -1]))
+      # Clip to desired size
+      return output_values[:, :num_predict_values, :]
+
+    predicted_mean = _parse_ta(mean_ta)
+    predicted_covariance = _parse_ta(covariance_ta)
+    if predicted_covariance is None:
+      predicted_covariance = array_ops.ones_like(predicted_mean)
+
+    # Transform and scale the mean and covariance appropriately.
+    predicted_mean = self.scale_back_data(predicted_mean)
+    predicted_covariance = self.scale_back_variance(predicted_covariance)
+
+    return {"mean": predicted_mean,
+            "covariance": predicted_covariance}
+
+  def _process_window(self, features, mode):
+    """Compute model outputs on a single window of data."""
+    # TODO(agarwal): Use exogenous features
+    times = math_ops.cast(features[TrainEvalFeatures.TIMES], dtypes.int64)
+    values = math_ops.cast(features[TrainEvalFeatures.VALUES], dtype=self.dtype)
+    original_values = values
+
+    # Extra shape checking for the window size (above that in
+    # model_utils.make_model_fn).
+    expected_times_shape = [None, self.window_size]
+    if not times.get_shape().is_compatible_with(expected_times_shape):
+      raise ValueError(
+          ("ARModel with input_window_size={input_window_size} "
+           "and output_window_size={output_window_size} expects "
+           "feature '{times_feature}' to have shape (batch_size, "
+           "{window_size}) (for any batch_size), but got shape {times_shape}. "
+           "If you are using RandomWindowInputFn, set "
+           "window_size={window_size} or adjust the input_window_size and "
+           "output_window_size arguments to ARModel.").format(
+               input_window_size=self.input_window_size,
+               output_window_size=self.output_window_size,
+               times_feature=TrainEvalFeatures.TIMES,
+               window_size=self.window_size,
+               times_shape=times.get_shape()))
+    values = self.scale_data(values)
+    if self.input_window_size > 0:
+      input_values = values[:, :self.input_window_size, :]
+    else:
+      input_values = None
+    prediction_ops = self.prediction_ops(times, input_values)
+    prediction = prediction_ops["mean"]
+    covariance = prediction_ops["covariance"]
+    targets = array_ops.slice(values, [0, self.input_window_size, 0],
+                              [-1, -1, -1])
+    targets.get_shape().assert_is_compatible_with(prediction.get_shape())
+    if (mode == estimator_lib.ModeKeys.EVAL
+        and self.loss == ARModel.SQUARED_LOSS):
+      # Report an evaluation loss which matches the expected
+      #  (observed - predicted) ** 2.
+      # Note that this affects only evaluation; the training loss is unaffected.
+      loss = self.loss_op(
+          self.scale_back_data(targets),
+          {"mean": self.scale_back_data(prediction_ops["mean"])})
+    else:
+      loss = self.loss_op(targets, prediction_ops)
+
+    # Scale back the prediction.
+    prediction = self.scale_back_data(prediction)
+    covariance = self.scale_back_variance(covariance)
+
+    return model.ModelOutputs(
+        loss=loss,
+        end_state=(times[:, -self.input_window_size:],
+                   values[:, -self.input_window_size:, :]),
+        predictions={"mean": prediction, "covariance": covariance,
+                     "observed": original_values[:, -self.output_window_size:]},
+        prediction_times=times[:, -self.output_window_size:])
+
+  def get_batch_loss(self, features, mode, state):
+    """Computes predictions and a loss.
+
+    Args:
+      features: A dictionary (such as is produced by a chunker) with the
+        following key/value pairs (shapes are given as required for training):
+          TrainEvalFeatures.TIMES: A [batch size, self.window_size] integer
+            Tensor with times for each observation. To train on longer
+            sequences, the data should first be chunked.
+          TrainEvalFeatures.VALUES: A [batch size, self.window_size,
+            self.num_features] Tensor with values for each observation.
+        When evaluating, `TIMES` and `VALUES` must have a window size of at
+        least self.window_size, but it may be longer, in which case the last
+        window_size - self.input_window_size times (or fewer if this is not
+        divisible by self.output_window_size) will be evaluated on with
+        non-overlapping output windows (and will have associated
+        predictions). This is primarily to support qualitative
+        evaluation/plotting, and is not a recommended way to compute evaluation
+        losses (since there is no overlap in the output windows, which for
+        window-based models is an undesirable bias).
+      mode: The tf.estimator.ModeKeys mode to use (TRAIN or EVAL).
+      state: Unused
+    Returns:
+      A model.ModelOutputs object.
+    Raises:
+      ValueError: If `mode` is not TRAIN or EVAL, or if static shape information
+      is incorrect.
+    """
+    features = {feature_name: ops.convert_to_tensor(feature_value)
+                for feature_name, feature_value in features.items()}
+    if mode == estimator_lib.ModeKeys.TRAIN:
+      # For training, we require the window size to be self.window_size as
+      # iterating sequentially on larger windows could introduce a bias.
+      return self._process_window(features, mode=mode)
+    elif mode == estimator_lib.ModeKeys.EVAL:
+      # For evaluation, we allow the user to pass in a larger window, in which
+      # case we try to cover as much of the window as possible without
+      # overlap. Quantitative evaluation is more efficient/correct with fixed
+      # windows matching self.window_size (as with training), but this looping
+      # allows easy plotting of "in-sample" predictions.
+      times = features[TrainEvalFeatures.TIMES]
+      times.get_shape().assert_has_rank(2)
+      static_window_size = times.get_shape()[1].value
+      if (static_window_size is not None
+          and static_window_size < self.window_size):
+        raise ValueError(
+            ("ARModel requires a window of at least input_window_size + "
+             "output_window_size to evaluate on (input_window_size={}, "
+             "output_window_size={}, and got shape {} for feature '{}' (batch "
+             "size, window size)).").format(
+                 self.input_window_size, self.output_window_size,
+                 times.get_shape(), TrainEvalFeatures.TIMES))
+      num_iterations = ((array_ops.shape(times)[1] -  self.input_window_size)
+                        // self.output_window_size)
+      output_size = num_iterations * self.output_window_size
+      # Rather than dealing with overlapping windows of output, discard a bit at
+      # the beginning if output windows don't cover evenly.
+      crop_length = output_size + self.input_window_size
+      features = {feature_name: feature_value[:, -crop_length:]
+                  for feature_name, feature_value in features.items()}
+      # Note that, unlike the ARModel's predict() while_loop and the
+      # SequentialTimeSeriesModel while_loop, each iteration here can run in
+      # parallel, since we are not feeding predictions or state from previous
+      # iterations.
+      def _while_condition(iteration_number, loss_ta, mean_ta, covariance_ta):
+        del loss_ta, mean_ta, covariance_ta  # unused
+        return iteration_number < num_iterations
+
+      def _while_body(iteration_number, loss_ta, mean_ta, covariance_ta):
+        """Perform a processing step on a single window of data."""
+        base_offset = iteration_number * self.output_window_size
+        model_outputs = self._process_window(
+            features={
+                feature_name:
+                feature_value[:, base_offset:base_offset + self.window_size]
+                for feature_name, feature_value in features.items()},
+            mode=mode)
+        # This code needs to be updated if new predictions are added in
+        # self._process_window
+        assert len(model_outputs.predictions) == 3
+        assert "mean" in model_outputs.predictions
+        assert "covariance" in model_outputs.predictions
+        assert "observed" in model_outputs.predictions
+        return (iteration_number + 1,
+                loss_ta.write(
+                    iteration_number, model_outputs.loss),
+                mean_ta.write(
+                    iteration_number, model_outputs.predictions["mean"]),
+                covariance_ta.write(
+                    iteration_number, model_outputs.predictions["covariance"]))
+      _, loss_ta, mean_ta, covariance_ta = control_flow_ops.while_loop(
+          _while_condition, _while_body,
+          [0,
+           tensor_array_ops.TensorArray(dtype=self.dtype, size=num_iterations),
+           tensor_array_ops.TensorArray(dtype=self.dtype, size=num_iterations),
+           tensor_array_ops.TensorArray(dtype=self.dtype, size=num_iterations)])
+      values = math_ops.cast(features[TrainEvalFeatures.VALUES],
+                             dtype=self.dtype)
+      batch_size = array_ops.shape(times)[0]
+      prediction_shape = [batch_size, self.output_window_size * num_iterations,
+                          self.num_features]
+      previous_state_times, previous_state_values = state
+      # Make sure returned state always has windows of self.input_window_size,
+      # even if we were passed fewer than self.input_window_size points this
+      # time.
+      if self.input_window_size > 0:
+        new_state_times = array_ops.concat(
+            [previous_state_times,
+             math_ops.cast(times, dtype=dtypes.int64)],
+            axis=1)[:, -self.input_window_size:]
+        new_state_times.set_shape((None, self.input_window_size))
+        new_state_values = array_ops.concat(
+            [previous_state_values,
+             self.scale_data(values)], axis=1)[:, -self.input_window_size:, :]
+        new_state_values.set_shape((None, self.input_window_size,
+                                    self.num_features))
+      else:
+        # There is no state to keep, and the strided slices above do not handle
+        # input_window_size=0.
+        new_state_times = previous_state_times
+        new_state_values = previous_state_values
+      return model.ModelOutputs(
+          loss=math_ops.reduce_mean(loss_ta.stack(), axis=0),
+          end_state=(new_state_times, new_state_values),
+          predictions={
+              "mean": array_ops.reshape(
+                  array_ops.transpose(mean_ta.stack(), [1, 0, 2, 3]),
+                  prediction_shape),
+              "covariance": array_ops.reshape(
+                  array_ops.transpose(covariance_ta.stack(), [1, 0, 2, 3]),
+                  prediction_shape),
+              "observed": values[:, -output_size:]},
+          prediction_times=times[:, -output_size:])
+    else:
+      raise ValueError(
+          "Unknown mode '{}' passed to get_batch_loss.".format(mode))
+
+  def _compute_time_features(self, time):
+    """Compute some features on the time value."""
+    batch_size = array_ops.shape(time)[0]
+    num_periods = len(self._periods)
+    # Reshape to 3D.
+    periods = constant_op.constant(
+        self._periods, shape=[1, 1, num_periods, 1], dtype=time.dtype)
+    time = array_ops.reshape(time, [batch_size, -1, 1, 1])
+    window_offset = time / self._periods
+    # Cast to appropriate type and scale to [0, 1) range
+    mod = (math_ops.cast(time % periods, self.dtype) * self._buckets /
+           math_ops.cast(periods, self.dtype))
+    # Bucketize based on some fixed width intervals. For a value t and interval
+    # [a, b), we return (t - a) if a <= t < b, else 0.
+    intervals = array_ops.reshape(
+        math_ops.range(self._buckets, dtype=self.dtype),
+        [1, 1, 1, self._buckets])
+    mod = nn_ops.relu(mod - intervals)
+    mod = array_ops.where(mod < 1.0, mod, array_ops.zeros_like(mod))
+    return window_offset, mod
+
+
+class AnomalyMixtureARModel(ARModel):
+  """Model data as a mixture of normal and anomaly distributions.
+
+  Note that this model works by changing the loss function to reduce the penalty
+  when predicting an anomalous target. However the predictions are still based
+  on anomalous input features, and this may affect the quality of fit. One
+  possible solution is to downweight/filter anomalous inputs, but that requires
+  more sequential processing instead of completely random windows.
+  """
+
+  GAUSSIAN_ANOMALY = "gaussian"
+  CAUCHY_ANOMALY = "cauchy"
+
+  def __init__(self,
+               periodicities,
+               anomaly_prior_probability,
+               input_window_size,
+               output_window_size,
+               num_features,
+               anomaly_distribution=GAUSSIAN_ANOMALY,
+               num_time_buckets=10,
+               hidden_layer_sizes=None):
+    assert (anomaly_prior_probability < 1.0 and
+            anomaly_prior_probability > 0.0)
+    self._anomaly_prior_probability = anomaly_prior_probability
+    assert anomaly_distribution in [
+        AnomalyMixtureARModel.GAUSSIAN_ANOMALY,
+        AnomalyMixtureARModel.CAUCHY_ANOMALY]
+    self._anomaly_distribution = anomaly_distribution
+    super(AnomalyMixtureARModel, self).__init__(
+        periodicities=periodicities,
+        num_features=num_features,
+        num_time_buckets=num_time_buckets,
+        input_window_size=input_window_size,
+        output_window_size=output_window_size,
+        loss=ARModel.NORMAL_LIKELIHOOD_LOSS,
+        hidden_layer_sizes=hidden_layer_sizes)
+
+  def _create_anomaly_ops(self, times, values, prediction_ops_dict):
+    anomaly_log_param = variable_scope.get_variable(
+        "anomaly_log_param",
+        shape=[],
+        dtype=self.dtype,
+        initializer=init_ops.zeros_initializer())
+    # Anomaly param is the variance for Gaussian and scale for Cauchy
+    # distribution.
+    prediction_ops_dict["anomaly_params"] = gen_math_ops.exp(anomaly_log_param)
+
+  def prediction_ops(self, times, values):
+    prediction_ops_dict = super(AnomalyMixtureARModel, self).prediction_ops(
+        times, values)
+    self._create_anomaly_ops(times, values, prediction_ops_dict)
+    return prediction_ops_dict
+
+  def _anomaly_log_prob(self, targets, prediction_ops):
+    prediction = prediction_ops["mean"]
+    if self._anomaly_distribution == AnomalyMixtureARModel.GAUSSIAN_ANOMALY:
+      anomaly_variance = prediction_ops["anomaly_params"]
+      anomaly_sigma = math_ops.sqrt(
+          gen_math_ops.maximum(anomaly_variance, 1e-5))
+      normal = distributions.Normal(loc=targets, scale=anomaly_sigma)
+      log_prob = normal.log_prob(prediction)
+    else:
+      assert self._anomaly_distribution == AnomalyMixtureARModel.CAUCHY_ANOMALY
+      anomaly_scale = prediction_ops["anomaly_params"]
+      cauchy = distributions.StudentT(
+          df=array_ops.ones([], dtype=anomaly_scale.dtype),
+          loc=targets,
+          scale=anomaly_scale)
+      log_prob = cauchy.log_prob(prediction)
+    return log_prob
+
+  def loss_op(self, targets, prediction_ops):
+    """Create loss_op."""
+    prediction = prediction_ops["mean"]
+    covariance = prediction_ops["covariance"]
+    # Normal data log probability.
+    sigma = math_ops.sqrt(gen_math_ops.maximum(covariance, 1e-5))
+    normal1 = distributions.Normal(loc=targets, scale=sigma)
+    log_prob1 = normal1.log_prob(prediction)
+    log_prob1 += math_ops.log(1 - self._anomaly_prior_probability)
+    # Anomaly log probability.
+    log_prob2 = self._anomaly_log_prob(targets, prediction_ops)
+    log_prob2 += math_ops.log(self._anomaly_prior_probability)
+    # We need to compute log(exp(log_prob1) + exp(log_prob2). For numerical
+    # stability, we rewrite the expression as below.
+    p1 = gen_math_ops.minimum(log_prob1, log_prob2)
+    p2 = gen_math_ops.maximum(log_prob1, log_prob2)
+    mixed_log_prob = p2 + math_ops.log(1 + gen_math_ops.exp(p1 - p2))
+    loss_op = -math_ops.reduce_sum(mixed_log_prob)
+    loss_op /= math_ops.cast(
+        math_ops.reduce_prod(array_ops.shape(targets)), self.dtype)
+    return loss_op
diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e1ca4e77fc41bb418cf2521c2c7fbed9f27c6a8
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py
@@ -0,0 +1,337 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ar_model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.timeseries.python.timeseries import input_pipeline
+from tensorflow.contrib.timeseries.python.timeseries import test_utils
+from tensorflow.contrib.timeseries.python.timeseries.ar_model import AnomalyMixtureARModel
+from tensorflow.contrib.timeseries.python.timeseries.ar_model import ARModel
+from tensorflow.contrib.timeseries.python.timeseries.estimators import ARRegressor
+from tensorflow.contrib.timeseries.python.timeseries.feature_keys import PredictionFeatures
+from tensorflow.contrib.timeseries.python.timeseries.feature_keys import TrainEvalFeatures
+
+from tensorflow.python.client import session
+from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import coordinator as coordinator_lib
+from tensorflow.python.training import queue_runner_impl
+from tensorflow.python.training import training
+
+
+class ARModelTest(test.TestCase):
+
+  def create_data(self,
+                  noise_stddev,
+                  anomaly_prob,
+                  multiple_periods=False,
+                  anomaly_stddev_scale=20):
+    self.period = 25
+    num_samples = 200
+    time = 1 + 3 * np.arange(num_samples).astype(np.int64)
+    time_offset = (2 * np.pi * (time % self.period).astype(np.float) /
+                   self.period).reshape([-1, 1])
+    if multiple_periods:
+      period2 = 55
+      self.period = [self.period, period2]
+      time_offset2 = ((time % period2).astype(np.float) / period2).reshape(
+          [-1, 1])
+      data1 = np.sin(time_offset / 2.0) ** 2 * (1 + time_offset2)
+    else:
+      data1 = np.sin(2 * time_offset) + np.cos(3 * time_offset)
+    data1 += noise_stddev / 4. * np.random.randn(num_samples, 1)
+    data2 = (np.sin(3 * time_offset) + np.cos(5 * time_offset) +
+             noise_stddev / 3. * np.random.randn(num_samples, 1))
+    # Add some anomalies to data1
+    if anomaly_prob > 0.:
+      num_anomalies = int(anomaly_prob * num_samples)
+      anomaly_values = (anomaly_stddev_scale * noise_stddev / 4 *
+                        np.random.randn(num_anomalies))
+      indices = np.random.randint(0, num_samples, num_anomalies)
+      for index, val in zip(indices, anomaly_values):
+        data1[index] += val
+
+    data = np.concatenate((4 * data1, 3 * data2), axis=1)
+    split = int(num_samples * 0.8)
+    train_data = {TrainEvalFeatures.TIMES: time[0:split],
+                  TrainEvalFeatures.VALUES: data[0:split]}
+    test_data = {TrainEvalFeatures.TIMES: time[split:],
+                 TrainEvalFeatures.VALUES: data[split:]}
+    return (train_data, test_data)
+
+  # Note that most models will require many more steps to fully converge. We
+  # have used a small number of steps here to keep the running time small.
+  def train_helper(self, input_window_size, loss,
+                   max_loss=None, train_steps=200,
+                   anomaly_prob=0.01,
+                   anomaly_distribution=None,
+                   multiple_periods=False):
+    np.random.seed(3)
+    data_noise_stddev = 0.2
+    if max_loss is None:
+      if loss == ARModel.NORMAL_LIKELIHOOD_LOSS:
+        max_loss = 1.0
+      else:
+        max_loss = 0.05 / (data_noise_stddev ** 2)
+    train_data, test_data = self.create_data(
+        noise_stddev=data_noise_stddev,
+        anomaly_prob=anomaly_prob,
+        multiple_periods=multiple_periods)
+    output_window_size = 10
+    window_size = input_window_size + output_window_size
+
+    class _RunConfig(estimator_lib.RunConfig):
+
+      @property
+      def tf_random_seed(self):
+        return 3
+
+    estimator = ARRegressor(
+        periodicities=self.period,
+        anomaly_prior_probability=0.01 if anomaly_distribution else None,
+        anomaly_distribution=anomaly_distribution,
+        num_features=2,
+        output_window_size=output_window_size,
+        num_time_buckets=20,
+        input_window_size=input_window_size,
+        hidden_layer_sizes=[16],
+        loss=loss,
+        config=_RunConfig())
+    train_input_fn = input_pipeline.RandomWindowInputFn(
+        time_series_reader=input_pipeline.NumpyReader(train_data),
+        window_size=window_size,
+        batch_size=64,
+        num_threads=1,
+        shuffle_seed=2)
+    test_input_fn = test_utils.AllWindowInputFn(
+        time_series_reader=input_pipeline.NumpyReader(test_data),
+        window_size=window_size)
+
+    # Test training
+    estimator.train(
+        input_fn=train_input_fn,
+        steps=train_steps)
+    test_evaluation = estimator.evaluate(input_fn=test_input_fn, steps=1)
+    test_loss = test_evaluation["loss"]
+    logging.info("Final test loss: %f", test_loss)
+    self.assertLess(test_loss, max_loss)
+    if loss == ARModel.SQUARED_LOSS:
+      # Test that the evaluation loss is reported without input scaling.
+      self.assertAllClose(
+          test_loss,
+          np.mean((test_evaluation["mean"] - test_evaluation["observed"]) ** 2))
+
+    # Test predict
+    train_data_times = train_data[TrainEvalFeatures.TIMES]
+    train_data_values = train_data[TrainEvalFeatures.VALUES]
+    test_data_times = test_data[TrainEvalFeatures.TIMES]
+    test_data_values = test_data[TrainEvalFeatures.VALUES]
+    predict_times = np.expand_dims(np.concatenate(
+        [train_data_times[input_window_size:], test_data_times]), 0)
+    predict_true_values = np.expand_dims(np.concatenate(
+        [train_data_values[input_window_size:], test_data_values]), 0)
+    state_times = np.expand_dims(train_data_times[:input_window_size], 0)
+    state_values = np.expand_dims(
+        train_data_values[:input_window_size, :], 0)
+
+    def prediction_input_fn():
+      return ({
+          PredictionFeatures.TIMES: training.limit_epochs(
+              predict_times, num_epochs=1),
+          PredictionFeatures.STATE_TUPLE: (state_times, state_values)
+      }, {})
+    (predictions,) = tuple(estimator.predict(input_fn=prediction_input_fn))
+    predicted_mean = predictions["mean"][:, 0]
+    true_values = predict_true_values[0, :, 0]
+
+    if loss == ARModel.NORMAL_LIKELIHOOD_LOSS:
+      variances = predictions["covariance"][:, 0]
+      standard_deviations = np.sqrt(variances)
+      # Note that we may get tighter bounds with more training steps.
+      errors = np.abs(predicted_mean - true_values) > 4 * standard_deviations
+      fraction_errors = np.mean(errors)
+      logging.info("Fraction errors: %f", fraction_errors)
+
+  def test_time_regression_squared(self):
+    self.train_helper(input_window_size=0,
+                      train_steps=350,
+                      loss=ARModel.SQUARED_LOSS)
+
+  def test_autoregression_squared(self):
+    self.train_helper(input_window_size=15,
+                      loss=ARModel.SQUARED_LOSS)
+
+  def test_autoregression_short_input_window(self):
+    self.train_helper(input_window_size=8,
+                      loss=ARModel.SQUARED_LOSS)
+
+  def test_autoregression_normal(self):
+    self.train_helper(input_window_size=10,
+                      loss=ARModel.NORMAL_LIKELIHOOD_LOSS,
+                      train_steps=300,
+                      max_loss=1.5,
+                      anomaly_distribution=None)
+
+  def test_autoregression_normal_multiple_periods(self):
+    self.train_helper(input_window_size=10,
+                      loss=ARModel.NORMAL_LIKELIHOOD_LOSS,
+                      max_loss=2.0,
+                      multiple_periods=True,
+                      anomaly_distribution=None)
+
+  def test_autoregression_normal_anomalies_normal(self):
+    self.train_helper(
+        input_window_size=10,
+        loss=ARModel.NORMAL_LIKELIHOOD_LOSS,
+        anomaly_distribution=AnomalyMixtureARModel.GAUSSIAN_ANOMALY)
+
+  def test_autoregression_normal_anomalies_cauchy(self):
+    self.train_helper(
+        input_window_size=10,
+        max_loss=1.5,
+        loss=ARModel.NORMAL_LIKELIHOOD_LOSS,
+        anomaly_distribution=AnomalyMixtureARModel.CAUCHY_ANOMALY)
+
+  def test_wrong_window_size(self):
+    estimator = ARRegressor(
+        periodicities=10, num_features=1,
+        input_window_size=10, output_window_size=6)
+    def _bad_window_size_input_fn():
+      return ({TrainEvalFeatures.TIMES: [[1]],
+               TrainEvalFeatures.VALUES: [[[1.]]]},
+              None)
+    def _good_data():
+      return ({TrainEvalFeatures.TIMES: np.arange(16)[None, :],
+               TrainEvalFeatures.VALUES: array_ops.reshape(
+                   np.arange(16), [1, 16, 1])},
+              None)
+    with self.assertRaisesRegexp(ValueError, "set window_size=16"):
+      estimator.train(input_fn=_bad_window_size_input_fn, steps=1)
+    # Get a checkpoint for evaluation
+    estimator.train(input_fn=_good_data, steps=1)
+    with self.assertRaisesRegexp(ValueError, "requires a window of at least"):
+      estimator.evaluate(input_fn=_bad_window_size_input_fn, steps=1)
+
+  def test_predictions_direct(self):
+    g = ops.Graph()
+    with g.as_default():
+      model = ARModel(periodicities=2,
+                      num_features=1,
+                      num_time_buckets=10,
+                      input_window_size=2,
+                      output_window_size=2,
+                      hidden_layer_sizes=[40, 10])
+      with session.Session():
+        predicted_values = model.predict({
+            PredictionFeatures.TIMES: [[4, 6, 10]],
+            PredictionFeatures.STATE_TUPLE: ([[1, 2]], [[[1.], [2.]]])
+        })
+        variables.global_variables_initializer().run()
+        self.assertAllEqual(predicted_values["mean"].eval().shape,
+                            [1, 3, 1])
+
+  def test_long_eval(self):
+    g = ops.Graph()
+    with g.as_default():
+      model = ARModel(periodicities=2,
+                      num_features=1,
+                      num_time_buckets=10,
+                      input_window_size=2,
+                      output_window_size=1)
+      raw_features = {
+          TrainEvalFeatures.TIMES: [[1, 3, 5, 7, 11]],
+          TrainEvalFeatures.VALUES: [[[1.], [2.], [3.], [4.], [5.]]]}
+      chunked_features, _ = test_utils.AllWindowInputFn(
+          time_series_reader=input_pipeline.NumpyReader(raw_features),
+          window_size=3)()
+      model.initialize_graph()
+      with variable_scope.variable_scope("armodel") as scope:
+        raw_evaluation = model.define_loss(
+            raw_features, mode=estimator_lib.ModeKeys.EVAL)
+      with variable_scope.variable_scope(scope, reuse=True):
+        chunked_evaluation = model.define_loss(
+            chunked_features, mode=estimator_lib.ModeKeys.EVAL)
+      with session.Session() as sess:
+        coordinator = coordinator_lib.Coordinator()
+        queue_runner_impl.start_queue_runners(sess, coord=coordinator)
+        variables.global_variables_initializer().run()
+        raw_evaluation_evaled, chunked_evaluation_evaled = sess.run(
+            [raw_evaluation, chunked_evaluation])
+        self.assertAllClose(chunked_evaluation_evaled.loss,
+                            raw_evaluation_evaled.loss)
+        last_chunk_evaluation_state = [
+            state[-1, None] for state in
+            chunked_evaluation_evaled.end_state]
+        for last_chunk_state_member, raw_state_member in zip(
+            last_chunk_evaluation_state, raw_evaluation_evaled.end_state):
+          self.assertAllClose(last_chunk_state_member, raw_state_member)
+        self.assertAllEqual([[5, 7, 11]],
+                            raw_evaluation_evaled.prediction_times)
+        for feature_name in raw_evaluation.predictions:
+          self.assertAllEqual(
+              [1, 3, 1],  # batch, window, num_features. The window size has 2
+                          # cut off for the first input_window.
+              raw_evaluation_evaled.predictions[feature_name].shape)
+          self.assertAllClose(
+              np.reshape(chunked_evaluation_evaled.predictions[feature_name],
+                         [-1]),
+              np.reshape(raw_evaluation_evaled.predictions[feature_name],
+                         [-1]))
+        coordinator.request_stop()
+        coordinator.join()
+
+  def test_long_eval_discard_indivisible(self):
+    g = ops.Graph()
+    with g.as_default():
+      model = ARModel(periodicities=2,
+                      num_features=1,
+                      num_time_buckets=10,
+                      input_window_size=2,
+                      output_window_size=2)
+      raw_features = {
+          TrainEvalFeatures.TIMES: [[1, 3, 5, 7, 11]],
+          TrainEvalFeatures.VALUES: [[[1.], [2.], [3.], [4.], [5.]]]}
+      model.initialize_graph()
+      raw_evaluation = model.define_loss(
+          raw_features, mode=estimator_lib.ModeKeys.EVAL)
+      with session.Session() as sess:
+        coordinator = coordinator_lib.Coordinator()
+        queue_runner_impl.start_queue_runners(sess, coord=coordinator)
+        variables.global_variables_initializer().run()
+        raw_evaluation_evaled = sess.run(raw_evaluation)
+        self.assertAllEqual([[7, 11]],
+                            raw_evaluation_evaled.prediction_times)
+        for feature_name in raw_evaluation.predictions:
+          self.assertAllEqual(
+              [1, 2, 1],  # batch, window, num_features. The window has two cut
+                          # off for the first input window and one discarded so
+                          # that the remainder is divisible into output windows.
+              raw_evaluation_evaled.predictions[feature_name].shape)
+        coordinator.request_stop()
+        coordinator.join()
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators.py b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
new file mode 100644
index 0000000000000000000000000000000000000000..4025a8f0142b68c275122dac7ee384341d07163a
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
@@ -0,0 +1,379 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Estimators for time series models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.timeseries.python.timeseries import ar_model
+from tensorflow.contrib.timeseries.python.timeseries import feature_keys
+from tensorflow.contrib.timeseries.python.timeseries import math_utils
+from tensorflow.contrib.timeseries.python.timeseries import model_utils
+from tensorflow.contrib.timeseries.python.timeseries import state_management
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import state_space_model
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import structural_ensemble
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models.filtering_postprocessor import StateInterpolatingAnomalyDetector
+
+from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.estimator.export import export_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.training import training as train
+
+
+class TimeSeriesRegressor(estimator_lib.Estimator):
+  """An Estimator to fit and evaluate a time series model."""
+
+  def __init__(self, model, state_manager=None, optimizer=None, model_dir=None,
+               config=None):
+    """Initialize the Estimator.
+
+    Args:
+      model: The time series model to wrap (inheriting from TimeSeriesModel).
+      state_manager: The state manager to use, or (by default)
+          PassthroughStateManager if none is needed.
+      optimizer: The optimization algorithm to use when training, inheriting
+          from tf.train.Optimizer. Defaults to Adam with step size 0.02.
+      model_dir: See `Estimator`.
+      config: See `Estimator`.
+    """
+    input_statistics_generator = math_utils.InputStatisticsFromMiniBatch(
+        dtype=model.dtype, num_features=model.num_features)
+    if state_manager is None:
+      state_manager = state_management.PassthroughStateManager()
+    if optimizer is None:
+      optimizer = train.AdamOptimizer(0.02)
+    self._model = model
+    model_fn = model_utils.make_model_fn(
+        model, state_manager, optimizer,
+        input_statistics_generator=input_statistics_generator)
+    super(TimeSeriesRegressor, self).__init__(
+        model_fn=model_fn,
+        model_dir=model_dir,
+        config=config)
+
+  # TODO(allenl): A parsing input receiver function, which takes a serialized
+  # tf.Example containing all features (times, values, any exogenous features)
+  # and serialized model state (possibly also as a tf.Example).
+  def build_raw_serving_input_receiver_fn(self,
+                                          exogenous_features=None,
+                                          default_batch_size=None,
+                                          default_series_length=None):
+    """Build an input_receiver_fn for export_savedmodel which accepts arrays.
+
+    Args:
+      exogenous_features: A dictionary mapping feature keys to exogenous
+        features (either Numpy arrays or Tensors). Used to determine the shapes
+        of placeholders for these features.
+      default_batch_size: If specified, must be a scalar integer. Sets the batch
+        size in the static shape information of all feature Tensors, which means
+        only this batch size will be accepted by the exported model. If None
+        (default), static shape information for batch sizes is omitted.
+      default_series_length: If specified, must be a scalar integer. Sets the
+        series length in the static shape information of all feature Tensors,
+        which means only this series length will be accepted by the exported
+        model. If None (default), static shape information for series length is
+        omitted.
+    Returns:
+      An input_receiver_fn which may be passed to the Estimator's
+      export_savedmodel.
+    """
+    if exogenous_features is None:
+      exogenous_features = {}
+
+    def _serving_input_receiver_fn():
+      """A receiver function to be passed to export_savedmodel."""
+      placeholders = {}
+      placeholders[feature_keys.TrainEvalFeatures.TIMES] = (
+          array_ops.placeholder(
+              name=feature_keys.TrainEvalFeatures.TIMES,
+              dtype=dtypes.int64,
+              shape=[default_batch_size, default_series_length]))
+      # Values are only necessary when filtering. For prediction the default
+      # value will be ignored.
+      placeholders[feature_keys.TrainEvalFeatures.VALUES] = (
+          array_ops.placeholder_with_default(
+              name=feature_keys.TrainEvalFeatures.VALUES,
+              input=array_ops.zeros(
+                  shape=[
+                      default_batch_size
+                      if default_batch_size else 0, default_series_length
+                      if default_series_length else 0, self._model.num_features
+                  ],
+                  dtype=self._model.dtype),
+              shape=(default_batch_size, default_series_length,
+                     self._model.num_features)))
+      for feature_key, feature_value in exogenous_features.items():
+        value_tensor = ops.convert_to_tensor(feature_value)
+        value_tensor.get_shape().with_rank_at_least(2)
+        feature_shape = value_tensor.get_shape().as_list()
+        feature_shape[0] = default_batch_size
+        feature_shape[1] = default_series_length
+        placeholders[feature_key] = array_ops.placeholder(
+            dtype=value_tensor.dtype, name=feature_key, shape=feature_shape)
+      # Models may not know the shape of their state without creating some
+      # variables/ops. Avoid polluting the default graph by making a new one. We
+      # use only static metadata from the returned Tensors.
+      with ops.Graph().as_default():
+        self._model.initialize_graph()
+        model_start_state = self._model.get_start_state()
+      for prefixed_state_name, state_tensor in model_utils.state_to_dictionary(
+          model_start_state).items():
+        state_shape_with_batch = tensor_shape.TensorShape(
+            (default_batch_size,)).concatenate(state_tensor.get_shape())
+        placeholders[prefixed_state_name] = array_ops.placeholder(
+            name=prefixed_state_name,
+            shape=state_shape_with_batch,
+            dtype=state_tensor.dtype)
+      return export_lib.ServingInputReceiver(placeholders, placeholders)
+
+    return _serving_input_receiver_fn
+
+
+class ARRegressor(TimeSeriesRegressor):
+  """An Estimator for an (optionally non-linear) autoregressive model.
+
+  ARRegressor is a window-based model, inputting fixed windows of length
+  `input_window_size` and outputting fixed windows of length
+  `output_window_size`. These two parameters must add up to the window_size
+  passed to the `Chunker` used to create an `input_fn` for training or
+  evaluation. `RandomWindowInputFn` is suggested for both training and
+  evaluation, although it may be seeded for deterministic evaluation.
+  """
+
+  def __init__(
+      self, periodicities, input_window_size, output_window_size,
+      num_features, num_time_buckets=10,
+      loss=ar_model.ARModel.NORMAL_LIKELIHOOD_LOSS, hidden_layer_sizes=None,
+      anomaly_prior_probability=None, anomaly_distribution=None,
+      optimizer=None, model_dir=None, config=None):
+    """Initialize the Estimator.
+
+    Args:
+      periodicities: periodicities of the input data, in the same units as the
+        time feature. Note this can be a single value or a list of values for
+        multiple periodicities.
+      input_window_size: Number of past time steps of data to look at when doing
+        the regression.
+      output_window_size: Number of future time steps to predict. Note that
+        setting it to > 1 empirically seems to give a better fit.
+      num_features: The dimensionality of the time series (one for univariate,
+          more than one for multivariate).
+      num_time_buckets: Number of buckets into which to divide (time %
+        periodicity) for generating time based features.
+      loss: Loss function to use for training. Currently supported values are
+        SQUARED_LOSS and NORMAL_LIKELIHOOD_LOSS. Note that for
+        NORMAL_LIKELIHOOD_LOSS, we train the covariance term as well. For
+        SQUARED_LOSS, the evaluation loss is reported based on un-scaled
+        observations and predictions, while the training loss is computed on
+        normalized data.
+      hidden_layer_sizes: list of sizes of hidden layers.
+      anomaly_prior_probability: If specified, constructs a mixture model under
+        which anomalies (modeled with `anomaly_distribution`) have this prior
+        probability. See `AnomalyMixtureARModel`.
+      anomaly_distribution: May not be specified unless
+        anomaly_prior_probability is specified and is not None. Controls the
+        distribution of anomalies under the mixture model. Currently either
+        `ar_model.AnomalyMixtureARModel.GAUSSIAN_ANOMALY` or
+        `ar_model.AnomalyMixtureARModel.CAUCHY_ANOMALY`. See
+        `AnomalyMixtureARModel`. Defaults to `GAUSSIAN_ANOMALY`.
+      optimizer: The optimization algorithm to use when training, inheriting
+          from tf.train.Optimizer. Defaults to Adagrad with step size 0.1.
+      model_dir: See `Estimator`.
+      config: See `Estimator`.
+    Raises:
+      ValueError: For invalid combinations of arguments.
+    """
+    if optimizer is None:
+      optimizer = train.AdagradOptimizer(0.1)
+    if anomaly_prior_probability is None and anomaly_distribution is not None:
+      raise ValueError("anomaly_prior_probability is required if "
+                       "anomaly_distribution is specified.")
+    if anomaly_prior_probability is None:
+      if anomaly_distribution is None:
+        anomaly_distribution = ar_model.AnomalyMixtureARModel.GAUSSIAN_ANOMALY
+      model = ar_model.ARModel(
+          periodicities=periodicities, num_features=num_features,
+          num_time_buckets=num_time_buckets,
+          input_window_size=input_window_size,
+          output_window_size=output_window_size, loss=loss,
+          hidden_layer_sizes=hidden_layer_sizes)
+    else:
+      if loss != ar_model.ARModel.NORMAL_LIKELIHOOD_LOSS:
+        raise ValueError(
+            "AnomalyMixtureARModel only supports "
+            "ar_model.ARModel.NORMAL_LIKELIHOOD_LOSS for its loss argument.")
+      model = ar_model.AnomalyMixtureARModel(
+          periodicities=periodicities,
+          input_window_size=input_window_size,
+          output_window_size=output_window_size,
+          num_features=num_features,
+          num_time_buckets=num_time_buckets,
+          hidden_layer_sizes=hidden_layer_sizes,
+          anomaly_prior_probability=anomaly_prior_probability,
+          anomaly_distribution=anomaly_distribution)
+    state_manager = state_management.FilteringOnlyStateManager()
+    super(ARRegressor, self).__init__(
+        model=model,
+        state_manager=state_manager,
+        optimizer=optimizer,
+        model_dir=model_dir,
+        config=config)
+
+
+class StateSpaceRegressor(TimeSeriesRegressor):
+  """An Estimator for general state space models."""
+
+  def __init__(self, model, state_manager=None, optimizer=None, model_dir=None,
+               config=None):
+    """See TimeSeriesRegressor. Uses the ChainingStateManager by default."""
+    if not isinstance(model, state_space_model.StateSpaceModel):
+      raise ValueError(
+          "StateSpaceRegressor only supports state space models (children of "
+          "StateSpaceModel) in its `model` argument, got {}.".format(model))
+    if state_manager is None:
+      state_manager = state_management.ChainingStateManager()
+    super(StateSpaceRegressor, self).__init__(
+        model=model,
+        state_manager=state_manager,
+        optimizer=optimizer,
+        model_dir=model_dir,
+        config=config)
+
+
+class StructuralEnsembleRegressor(StateSpaceRegressor):
+  """An Estimator for structural time series models.
+
+  "Structural" refers to the fact that this model explicitly accounts for
+  structure in the data, such as periodicity and trends.
+
+  `StructuralEnsembleRegressor` is a state space model. It contains components
+  for modeling level, local linear trends, periodicity, and mean-reverting
+  transients via a moving average component. Multivariate series are fit with
+  full covariance matrices for observation and latent state transition noise,
+  each feature of the multivariate series having its own latent components.
+
+  Note that unlike `ARRegressor`, `StructuralEnsembleRegressor` is sequential,
+  and so accepts variable window sizes with the same model.
+
+  For training, `RandomWindowInputFn` is recommended as an `input_fn`. Model
+  state is managed through `ChainingStateManager`: since state space models are
+  inherently sequential, we save state from previous iterations to get
+  approximate/eventual consistency while achieving good performance through
+  batched computation.
+
+  For evaluation, either pass a significant chunk of the series in a single
+  window (e.g. set `window_size` to the whole series with
+  `WholeDatasetInputFn`), or use enough random evaluation iterations to cover
+  several passes through the whole dataset. Either method will ensure that stale
+  saved state has been flushed.
+  """
+
+  def __init__(self,
+               periodicities,
+               num_features,
+               cycle_num_latent_values=11,
+               moving_average_order=4,
+               autoregressive_order=0,
+               exogenous_feature_columns=None,
+               exogenous_update_condition=None,
+               dtype=dtypes.float64,
+               anomaly_prior_probability=None,
+               optimizer=None,
+               model_dir=None,
+               config=None):
+    """Initialize the Estimator.
+
+    Args:
+      periodicities: The expected periodicity of the data (for example 24 if
+          feeding hourly data with a daily periodicity, or 60 * 24 if feeding
+          minute-level data with daily periodicity). Either a scalar or a
+          list. This parameter can be any real value, and does not control the
+          size of the model. However, increasing this without increasing
+          `num_values_per_cycle` will lead to smoother periodic behavior, as the
+          same number of distinct values will be cycled through over a longer
+          period of time.
+      num_features: The dimensionality of the time series (one for univariate,
+          more than one for multivariate).
+      cycle_num_latent_values: Along with `moving_average_order` and
+          `num_features`, controls the latent state size of the model. Square
+          matrices of size `num_features * (moving_average_order +
+          cycle_num_latent_values + 3)` are created and multiplied, so larger
+          values may be slow. The trade-off is with resolution: cycling between
+          a smaller number of latent values means that only smoother functions
+          can be modeled.
+      moving_average_order: Controls model size (along with
+          `cycle_num_latent_values` and `autoregressive_order`) and the number
+          of steps before transient deviations revert to the mean defined by the
+          period and level/trend components.
+      autoregressive_order: Each contribution from this component is a linear
+          combination of this many previous contributions. Also helps to
+          determine the model size. Learning autoregressive coefficients
+          typically requires more steps and a smaller step size than other
+          components.
+      exogenous_feature_columns: A list of tf.contrib.layers.FeatureColumn
+          objects (for example tf.contrib.layers.embedding_column) corresponding
+          to exogenous features which provide extra information to the model but
+          are not part of the series to be predicted. Passed to
+          tf.contrib.layers.input_from_feature_columns.
+      exogenous_update_condition: A function taking two Tensor arguments,
+          `times` (shape [batch size]) and `features` (a dictionary mapping
+          exogenous feature keys to Tensors with shapes [batch size, ...]), and
+          returning a boolean Tensor with shape [batch size] indicating whether
+          state should be updated using exogenous features for each part of the
+          batch. Where it is False, no exogenous update is performed. If None
+          (default), exogenous updates are always performed. Useful for avoiding
+          "leaky" frequent exogenous updates when sparse updates are
+          desired. Called only during graph construction. See the "known
+          anomaly" example for example usage.
+      dtype: The floating point data type to compute with. float32 may be
+        faster, but can be problematic for larger models and longer time series.
+      anomaly_prior_probability: If not None, the model attempts to
+          automatically detect and ignore anomalies during training. This
+          parameter then controls the prior probability of an anomaly. Values
+          closer to 0 mean that points will be discarded less frequently. The
+          default value (None) means that anomalies are not discarded, which may
+          be slightly faster.
+      optimizer: The optimization algorithm to use when training, inheriting
+          from tf.train.Optimizer. Defaults to Adam with step size 0.02.
+      model_dir: See `Estimator`.
+      config: See `Estimator`.
+    """
+    if anomaly_prior_probability is not None:
+      filtering_postprocessor = StateInterpolatingAnomalyDetector(
+          anomaly_prior_probability=anomaly_prior_probability)
+    else:
+      filtering_postprocessor = None
+    state_space_model_configuration = (
+        state_space_model.StateSpaceModelConfiguration(
+            num_features=num_features,
+            dtype=dtype,
+            filtering_postprocessor=filtering_postprocessor,
+            exogenous_feature_columns=exogenous_feature_columns,
+            exogenous_update_condition=exogenous_update_condition))
+    model = structural_ensemble.MultiResolutionStructuralEnsemble(
+        cycle_num_latent_values=cycle_num_latent_values,
+        moving_average_order=moving_average_order,
+        autoregressive_order=autoregressive_order,
+        periodicities=periodicities,
+        configuration=state_space_model_configuration)
+    super(StructuralEnsembleRegressor, self).__init__(
+        model=model,
+        optimizer=optimizer,
+        model_dir=model_dir,
+        config=config)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4304f2560a82b666f87f302a821a39b0e9e140e
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
@@ -0,0 +1,149 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tempfile
+
+import numpy
+
+from tensorflow.contrib.timeseries.python.timeseries import ar_model
+from tensorflow.contrib.timeseries.python.timeseries import estimators
+from tensorflow.contrib.timeseries.python.timeseries import feature_keys
+from tensorflow.contrib.timeseries.python.timeseries import input_pipeline
+from tensorflow.contrib.timeseries.python.timeseries import saved_model_utils
+
+from tensorflow.python.client import session
+from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import tag_constants
+
+
+class _SeedRunConfig(estimator_lib.RunConfig):
+
+  @property
+  def tf_random_seed(self):
+    return 3
+
+
+class TimeSeriesRegressorTest(test.TestCase):
+
+  def _fit_restore_fit_test_template(self, estimator_fn, dtype):
+    """Tests restoring previously fit models."""
+    model_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+    first_estimator = estimator_fn(model_dir)
+    times = numpy.arange(20, dtype=numpy.int64)
+    values = numpy.arange(20, dtype=dtype.as_numpy_dtype)
+    features = {
+        feature_keys.TrainEvalFeatures.TIMES: times,
+        feature_keys.TrainEvalFeatures.VALUES: values
+    }
+    train_input_fn = input_pipeline.RandomWindowInputFn(
+        input_pipeline.NumpyReader(features), shuffle_seed=2, num_threads=1,
+        batch_size=16, window_size=16)
+    eval_input_fn = input_pipeline.RandomWindowInputFn(
+        input_pipeline.NumpyReader(features), shuffle_seed=3, num_threads=1,
+        batch_size=16, window_size=16)
+    first_estimator.train(input_fn=train_input_fn, steps=5)
+    first_loss_before_fit = first_estimator.evaluate(
+        input_fn=eval_input_fn, steps=1)["loss"]
+    first_estimator.train(input_fn=train_input_fn, steps=50)
+    first_loss_after_fit = first_estimator.evaluate(
+        input_fn=eval_input_fn, steps=1)["loss"]
+    self.assertLess(first_loss_after_fit, first_loss_before_fit)
+    second_estimator = estimator_fn(model_dir)
+    second_estimator.train(input_fn=train_input_fn, steps=2)
+    whole_dataset_input_fn = input_pipeline.WholeDatasetInputFn(
+        input_pipeline.NumpyReader(features))
+    whole_dataset_evaluation = second_estimator.evaluate(
+        input_fn=whole_dataset_input_fn, steps=1)
+    predict_input_fn = input_pipeline.predict_continuation_input_fn(
+        evaluation=whole_dataset_evaluation,
+        steps=10)
+    # Also tests that limit_epochs in predict_continuation_input_fn prevents
+    # infinite iteration
+    (estimator_predictions,
+    ) = list(second_estimator.predict(input_fn=predict_input_fn))
+    self.assertAllEqual([10, 1], estimator_predictions["mean"].shape)
+    input_receiver_fn = first_estimator.build_raw_serving_input_receiver_fn()
+    export_location = first_estimator.export_savedmodel(self.get_temp_dir(),
+                                                        input_receiver_fn)
+    with ops.Graph().as_default():
+      with session.Session() as sess:
+        signatures = loader.load(sess, [tag_constants.SERVING], export_location)
+        # Test that prediction and filtering can continue from evaluation output
+        saved_prediction = saved_model_utils.predict_continuation(
+            continue_from=whole_dataset_evaluation,
+            steps=10,
+            signatures=signatures,
+            session=sess)
+        # Saved model predictions should be the same as Estimator predictions
+        # starting from the same evaluation.
+        for prediction_key, prediction_value in estimator_predictions.items():
+          self.assertAllClose(prediction_value,
+                              numpy.squeeze(
+                                  saved_prediction[prediction_key], axis=0))
+        first_filtering = saved_model_utils.filter_continuation(
+            continue_from=whole_dataset_evaluation,
+            features={
+                feature_keys.FilteringFeatures.TIMES: times[None, -1] + 2,
+                feature_keys.FilteringFeatures.VALUES: values[None, -1] + 2.
+            },
+            signatures=signatures,
+            session=sess)
+        # Test that prediction and filtering can continue from filtering output
+        second_saved_prediction = saved_model_utils.predict_continuation(
+            continue_from=first_filtering,
+            steps=1,
+            signatures=signatures,
+            session=sess)
+        self.assertEqual(
+            times[-1] + 3,
+            numpy.squeeze(
+                second_saved_prediction[feature_keys.PredictionResults.TIMES]))
+        saved_model_utils.filter_continuation(
+            continue_from=first_filtering,
+            features={
+                feature_keys.FilteringFeatures.TIMES: times[-1] + 3,
+                feature_keys.FilteringFeatures.VALUES: values[-1] + 3.
+            },
+            signatures=signatures,
+            session=sess)
+
+  def test_fit_restore_fit_ar_regressor(self):
+    def _estimator_fn(model_dir):
+      return estimators.ARRegressor(
+          periodicities=10, input_window_size=10, output_window_size=6,
+          num_features=1, model_dir=model_dir, config=_SeedRunConfig(),
+          # This test is flaky with normal likelihood loss (could add more
+          # training iterations instead).
+          loss=ar_model.ARModel.SQUARED_LOSS)
+    self._fit_restore_fit_test_template(_estimator_fn, dtype=dtypes.float32)
+
+  def test_fit_restore_fit_structural_ensemble_regressor(self):
+    dtype = dtypes.float32
+    def _estimator_fn(model_dir):
+      return estimators.StructuralEnsembleRegressor(
+          num_features=1, periodicities=10, model_dir=model_dir, dtype=dtype,
+          config=_SeedRunConfig())
+    self._fit_restore_fit_test_template(_estimator_fn, dtype=dtype)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/feature_keys.py b/tensorflow/contrib/timeseries/python/timeseries/feature_keys.py
new file mode 100644
index 0000000000000000000000000000000000000000..970b9aa8acd6f55db843a4e023052b122992baf4
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/feature_keys.py
@@ -0,0 +1,74 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Commonly used special feature names for time series models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.saved_model import signature_constants
+
+
+class State(object):
+  """Key formats for accepting/returning state."""
+  # The model-dependent state to start from, as a single tuple.
+  STATE_TUPLE = "start_tuple"
+  # Same meaning as STATE_TUPLE, but prefixes keys representing flattened model
+  # state rather than mapping to a nested tuple containing model state,
+  # primarily for use with export_savedmodel.
+  STATE_PREFIX = "model_state"
+
+
+class Times(object):
+  """Key formats for accepting/returning times."""
+  # An increasing vector of integers.
+  TIMES = "times"
+
+
+class Values(object):
+  """Key formats for accepting/returning values."""
+  # Floating point, with one or more values corresponding to each time in TIMES.
+  VALUES = "values"
+
+
+class TrainEvalFeatures(Times, Values):
+  """Feature names used during training and evaluation."""
+  pass
+
+
+class PredictionFeatures(Times, State):
+  """Feature names used during prediction."""
+  pass
+
+
+class FilteringFeatures(Times, Values, State):
+  """Special feature names for filtering."""
+  pass
+
+
+class PredictionResults(Times):
+  """Keys returned when predicting (not comprehensive)."""
+  pass
+
+
+class FilteringResults(Times, State):
+  """Keys returned from evaluation/filtering."""
+  pass
+
+
+class SavedModelLabels(object):
+  """Names of signatures exported with export_savedmodel."""
+  PREDICT = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+  FILTER = "filter"
diff --git a/tensorflow/contrib/timeseries/python/timeseries/input_pipeline.py b/tensorflow/contrib/timeseries/python/timeseries/input_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4ee59036624cffb216709e096981d362670e416
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/input_pipeline.py
@@ -0,0 +1,832 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines ways of splicing and re-arranging time series.
+
+This file provides methods for reading, parsing, and re-arranging a time
+series. The main departure from standard TensorFlow input pipelines is a focus
+on "chunking" a time series, i.e. slicing it into small contiguous windows which
+are then batched together for training, a form of truncated
+backpropagation. This typically provides a significant speedup compared to
+looping over the whole series sequentially, by exploiting data parallelism and
+by reducing redundant contributions to gradients (due to redundant information
+in the series itself).
+
+A series, consisting of times (an increasing vector of integers) and values (one
+or more floating point values for each time) along with any exogenous features,
+is stored either in memory or on disk in various formats (e.g. "one record per
+timestep" on disk, or as a dictionary of Numpy arrays in memory). The location
+and format is specified by configuring a `TimeSeriesReader` object
+(e.g. `NumpyReader`, `CSVReader`), which reads the data into the TensorFlow
+graph. A `TimeSeriesInputFn` object (typically `RandomWindowInputFn`) then
+performs windowing and batching.
+
+Time series are passed through this pipeline as dictionaries mapping feature
+names to their values. For training and evaluation, these require at minimum
+`TrainEvalFeatures.TIMES` (scalar integers, one per timestep) and
+`TrainEvalFeatures.VALUES` (may be either univariate or multivariate). Exogenous
+features may have any shape, but are likewise associated with a timestep. Times
+themselves need not be contiguous or regular (although smaller/fewer gaps are
+generally better), but each timestep must have all `VALUES` and any exogenous
+features (i.e. times may be missing, but given that a time is specified, every
+other feature must also be specified for that step; some models may support
+making exogenous updates conditional).
+
+The expected use case of a `TimeSeriesInputFn` is that it is first configured
+(for example setting a batch or window size) and passed a reader (a
+`TimeSeriesReader` object). The `TimeSeriesInputFn` can then be passed as the
+input_fn of an Estimator.
+
+For example, `RandomWindowInputFn` is useful for creating batches of random
+chunks of a series for training:
+
+```
+  # Read data in the default "time,value" CSV format with no header
+  reader = input_pipeline.CSVReader(csv_file_name)
+  # Set up windowing and batching for training
+  train_input_fn = input_pipeline.RandomWindowInputFn(
+      reader, batch_size=16, window_size=16)
+  # Fit model parameters to data
+  estimator.train(input_fn=train_input_fn, steps=150)
+```
+
+`RandomWindowInputFn` is the primary tool for training and quantitative
+evaluation of time series. `WholeDatasetInputFn`, which reads a whole series
+into memory, is useful for qualitative evaluation and preparing to make
+predictions with `predict_continuation_input_fn`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import numpy
+
+from tensorflow.contrib.timeseries.python.timeseries import feature_keys
+from tensorflow.contrib.timeseries.python.timeseries import model_utils
+
+from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import io_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import input as input_lib
+from tensorflow.python.training import training
+from tensorflow.python.util import nest
+
+
+def predict_continuation_input_fn(
+    evaluation, steps=None, times=None, exogenous_features=None):
+  """An Estimator input_fn for running predict() after evaluate().
+
+  If the call to evaluate() we are making predictions based on had a batch_size
+  greater than one, predictions will start after each of these windows
+  (i.e. will have the same batch dimension).
+
+  Args:
+    evaluation: The dictionary returned by `Estimator.evaluate`, with keys
+      FilteringResults.STATE_TUPLE and FilteringResults.TIMES.
+    steps: The number of steps to predict (scalar), starting after the
+      evaluation. If `times` is specified, `steps` must not be; one is required.
+    times: A [batch_size x window_size] array of integers (not a Tensor)
+      indicating times to make predictions for. These times must be after the
+      corresponding evaluation. If `steps` is specified, `times` must not be;
+      one is required. If the batch dimension is omitted, it is assumed to be 1.
+    exogenous_features: Optional dictionary. If specified, indicates exogenous
+      features for the model to use while making the predictions. Values must
+      have shape [batch_size x window_size x ...], where `batch_size` matches
+      the batch dimension used when creating `evaluation`, and `window_size` is
+      either the `steps` argument or the `window_size` of the `times` argument
+      (depending on which was specified).
+  Returns:
+    An `input_fn` suitable for passing to the `predict` function of a time
+    series `Estimator`.
+  Raises:
+    ValueError: If `times` or `steps` are misspecified.
+  """
+  if exogenous_features is None:
+    exogenous_features = {}
+  predict_times = model_utils.canonicalize_times_or_steps_from_output(
+      times=times, steps=steps, previous_model_output=evaluation)
+  features = {
+      feature_keys.PredictionFeatures.STATE_TUPLE:
+          evaluation[feature_keys.FilteringResults.STATE_TUPLE],
+      feature_keys.PredictionFeatures.TIMES:
+          predict_times
+  }
+  features.update(exogenous_features)
+  def _predict_input_fn():
+    """An input_fn for predict()."""
+    # Prevents infinite iteration with a constant output in an Estimator's
+    # predict().
+    limited_features = {}
+    for key, values in features.items():
+      limited_values = nest.map_structure(
+          lambda value: training.limit_epochs(value, num_epochs=1), values)
+      limited_features[key] = limited_values
+    return (limited_features, None)
+  return _predict_input_fn
+
+
+class TimeSeriesReader(object):
+  """Reads from and parses a data source for a `TimeSeriesInputFn`.
+
+  This class provides methods that read a few records (`read`) or the full data
+  set at once (`read_full`), and returns them as dictionaries mapping feature
+  names to feature Tensors. Please see note at the top of the file for the
+  structure of these dictionaries. The output is generally chunked by a
+  `TimeSeriesInputFn` before being passed to the model.
+  """
+
+  def check_dataset_size(self, minimum_dataset_size):
+    """When possible, raises an error if the dataset is too small.
+
+    This method allows TimeSeriesReaders to raise informative error messages if
+    the user has selected a window size in their TimeSeriesInputFn which is
+    larger than the dataset size. However, many TimeSeriesReaders will not have
+    access to a dataset size, in which case they do not need to override this
+    method.
+
+    Args:
+      minimum_dataset_size: The minimum number of records which should be
+        contained in the dataset. Readers should attempt to raise an error when
+        possible if an epoch of data contains fewer records.
+    """
+    pass
+
+  @abc.abstractmethod
+  def read(self):
+    """Parses one or more records into a feature dictionary.
+
+    This method is expected to be called by a `TimeSeriesInputFn` object, and is
+    not for use with models directly.
+
+    A `TimeSeriesReader` object reads multiple records at a single time for
+    efficiency; the size of these batches is an implementation detail internal
+    to the input pipeline. These records should generally be sequential,
+    although some out-of-order records due to file wraparounds are expected and
+    must be handled by callers.
+
+    Returns:
+      A dictionary mapping feature names to `Tensor` values, each with an
+      arbitrary batch dimension (for efficiency) as their first dimension.
+    """
+    pass
+
+  @abc.abstractmethod
+  def read_full(self):
+    """Return the full dataset.
+
+    Largely for interactive use/plotting (or evaluation on small
+    datasets). Generally not very efficient. Not recommended for training.
+
+    Returns:
+      Same return type as `read`, but with the full dataset rather than an
+      arbitrary chunk of it. A dictionary mapping feature names to `Tensor`
+      values, where the size of the first dimension of each `Tensor` is the
+      number of samples in the entire dataset. These `Tensor`s should be
+      constant across graph invocations, assuming that the underlying data
+      remains constant. Current implementations re-read data on each graph
+      invocation, although this may change in the future.
+    """
+    pass
+
+
+class NumpyReader(TimeSeriesReader):
+  """A time series parser for feeding Numpy arrays to a `TimeSeriesInputFn`.
+
+  Avoids embedding data in the graph as constants.
+  """
+
+  def __init__(self, data, read_num_records_hint=4096):
+    """Numpy array input for a `TimeSeriesInputFn`.
+
+    Args:
+      data: A dictionary mapping feature names to Numpy arrays, with two
+        possible shapes (requires keys `TrainEvalFeatures.TIMES` and
+        `TrainEvalFeatures.VALUES`):
+          Univariate; `TIMES` and `VALUES` are both vectors of shape [series
+            length]
+          Multivariate; `TIMES` is a vector of shape [series length], `VALUES`
+            has shape [series length x number of features].
+        In any case, `VALUES` and any exogenous features must have their shapes
+        prefixed by the shape of the value corresponding to the `TIMES` key.
+      read_num_records_hint: The maximum number of samples to read at one time,
+        for efficiency.
+    """
+    self._features = _canonicalize_numpy_data(
+        data, require_single_batch=True)
+    self._read_num_records_hint = read_num_records_hint
+
+  def check_dataset_size(self, minimum_dataset_size):
+    """Raise an error if the dataset is too small."""
+    dataset_size = self._features[feature_keys.TrainEvalFeatures.TIMES].shape[1]
+    if dataset_size < minimum_dataset_size:
+      raise ValueError(
+          ("A TimeSeriesInputFn is configured to create windows of size {}, "
+           "but only {} records were available in the dataset. Either decrease "
+           "the window size or provide more records.").format(
+               minimum_dataset_size, dataset_size))
+
+  def read(self):
+    """Returns a large chunk of the Numpy arrays for later re-chunking."""
+    # Remove the batch dimension from all features
+    features = {key: numpy.squeeze(value, axis=0)
+                for key, value in self._features.items()}
+    return estimator_lib.inputs.numpy_input_fn(
+        x=features,
+        # The first dimensions of features are the series length, since we have
+        # removed the batch dimension above. We now pull out
+        # self._read_num_records_hint steps of this single time series to pass
+        # to the TimeSeriesInputFn.
+        batch_size=self._read_num_records_hint,
+        num_epochs=None,
+        shuffle=False)()
+
+  def read_full(self):
+    """Returns `Tensor` versions of the full Numpy arrays."""
+    features = estimator_lib.inputs.numpy_input_fn(
+        x=self._features,
+        batch_size=1,
+        num_epochs=None,
+        queue_capacity=2,  # Each queue element is a full copy of the dataset
+        shuffle=False)()
+    # TimeSeriesInputFn expect just a batch dimension
+    return {feature_name: array_ops.squeeze(feature_value, axis=0)
+            for feature_name, feature_value in features.items()}
+
+
+class ReaderBaseTimeSeriesParser(TimeSeriesReader):
+  """Base for time series readers which wrap a `tf.ReaderBase`."""
+
+  def __init__(self, filenames, read_num_records_hint=4096):
+    """Configure the time series reader.
+
+    Args:
+      filenames: A string or list of strings indicating files to read records
+        from.
+      read_num_records_hint: When not reading a full dataset, indicates the
+        number of records to transfer in a single chunk (for efficiency). The
+        actual number transferred at one time may vary.
+    """
+    self._filenames = filenames
+    self._read_num_records_hint = read_num_records_hint
+
+  @abc.abstractmethod
+  def _get_reader(self):
+    """Get an instance of the tf.ReaderBase associated with this class."""
+    pass
+
+  @abc.abstractmethod
+  def _process_records(self, lines):
+    """Given string items, return a processed dictionary of Tensors.
+
+    Args:
+      lines: A 1-dimensional string Tensor, each representing a record to parse
+        (source dependent, e.g. a line of a file, or a serialized protocol
+        buffer).
+
+    Returns:
+      A dictionary mapping feature names to their values. The batch dimensions
+      should match the length of `lines`.
+    """
+    pass
+
+  def _get_filename_queue(self, epoch_limit):
+    """Constructs a filename queue with an epoch limit.
+
+    `epoch_limit` is intended as an error checking fallback to prevent a reader
+    from infinitely looping in its requests for more work items if none are
+    available in any file. It should be set high enough that it is never reached
+    assuming at least one record exists in some file.
+
+    Args:
+      epoch_limit: The maximum number of times to read through the complete list
+        of files before throwing an OutOfRangeError.
+    Returns:
+      A tuple of (filename_queue, epoch_limiter):
+        filename_queue: A FIFOQueue with filename work items.
+        epoch_limiter: The local variable used for epoch limitation. This should
+          be set to zero before a reader is passed `filename_queue` in order to
+          reset the epoch limiter's state.
+    """
+    epoch_limiter = variable_scope.variable(
+        initial_value=constant_op.constant(0, dtype=dtypes.int64),
+        name="epoch_limiter",
+        trainable=False,
+        collections=[ops.GraphKeys.LOCAL_VARIABLES])
+    filenames_tensor = array_ops.reshape(
+        ops.convert_to_tensor(self._filenames), [-1])
+    # We can't rely on epoch_limiter being initialized, since queue runners are
+    # started before local variables are initialized. Instead, we ignore epoch
+    # limits before variable initialization. This means that prior to variable
+    # initialization, a QueueRunner may cause a reader to enter an un-checked
+    # infinite loop. However, as soon as local variables are initialized, we
+    # will start incrementing and checking epoch_limiter, which will interrupt
+    # any in-progress loops.
+    conditional_count_up_to = control_flow_ops.cond(
+        state_ops.is_variable_initialized(epoch_limiter),
+        lambda: epoch_limiter.count_up_to(epoch_limit),
+        lambda: constant_op.constant(0, dtype=dtypes.int64))
+    with ops.control_dependencies([conditional_count_up_to]):
+      filenames_tensor = array_ops.identity(filenames_tensor)
+    filename_queue = input_lib.string_input_producer(
+        filenames_tensor, shuffle=False, capacity=1)
+    return filename_queue, epoch_limiter
+
+  def read(self):
+    """Reads a chunk of data from the `tf.ReaderBase` for later re-chunking."""
+    # Assuming there is at least one item to be read among all of the files in
+    # self._filenames, we will not need to go through more than
+    # self._read_num_records_hint epochs to get a batch of
+    # self._read_num_records_hint records. Setting this limit and resetting it
+    # before each reader.read_up_to call prevents infinite looping when there
+    # are no records available in any of the files.
+    filename_queue, epoch_limiter = self._get_filename_queue(
+        epoch_limit=self._read_num_records_hint)
+    reader = self._get_reader()
+    epoch_reset_op = state_ops.assign(epoch_limiter, 0)
+    with ops.control_dependencies([epoch_reset_op]):
+      _, records = reader.read_up_to(
+          filename_queue, self._read_num_records_hint)
+    return self._process_records(records)
+
+  def read_full(self):
+    """Reads a full epoch of data into memory."""
+    reader = self._get_reader()
+    # Set a hard limit of 2 epochs through self._filenames. If there are any
+    # records available, we should only end up reading the first record in the
+    # second epoch before exiting the while loop and subsequently resetting the
+    # epoch limit. If there are no records available in any of the files, this
+    # hard limit prevents the reader.read_up_to call from looping infinitely.
+    filename_queue, epoch_limiter = self._get_filename_queue(epoch_limit=2)
+    epoch_reset_op = state_ops.assign(epoch_limiter, 0)
+    with ops.control_dependencies([epoch_reset_op]):
+      first_key, first_value = reader.read_up_to(filename_queue, 1)
+    # Read until we get a duplicate key (one epoch)
+    def _while_condition(
+        current_key, current_value, current_index, collected_records):
+      del current_value, current_index, collected_records  # unused
+      return math_ops.not_equal(array_ops.squeeze(current_key, axis=0),
+                                array_ops.squeeze(first_key, axis=0))
+
+    def _while_body(
+        current_key, current_value, current_index, collected_records):
+      del current_key  # unused
+      new_key, new_value = reader.read_up_to(filename_queue, 1)
+      new_key.set_shape([1])
+      new_value.set_shape([1])
+      return (new_key,
+              new_value,
+              current_index + 1,
+              collected_records.write(current_index, current_value))
+    _, _, _, records_ta = control_flow_ops.while_loop(
+        _while_condition,
+        _while_body,
+        [constant_op.constant([""]), first_value,
+         0,  # current_index starting value
+         tensor_array_ops.TensorArray(  # collected_records
+             dtype=dtypes.string, size=0, dynamic_size=True)])
+    records = records_ta.concat()
+    # Reset the reader when we're done so that subsequent requests for data get
+    # the dataset in the proper order.
+    with ops.control_dependencies([records]):
+      reader_reset_op = reader.reset()
+    with ops.control_dependencies([reader_reset_op]):
+      records = array_ops.identity(records)
+    return self._process_records(records)
+
+
+class CSVReader(ReaderBaseTimeSeriesParser):
+  """Reads from a collection of CSV-formatted files."""
+
+  def __init__(self,
+               filenames,
+               column_names=(feature_keys.TrainEvalFeatures.TIMES,
+                             feature_keys.TrainEvalFeatures.VALUES),
+               column_dtypes=None,
+               skip_header_lines=None,
+               read_num_records_hint=4096):
+    """CSV-parsing reader for a `TimeSeriesInputFn`.
+
+    Args:
+      filenames: A filename or list of filenames to read the time series
+          from. Each line must have columns corresponding to `column_names`.
+      column_names: A list indicating names for each
+          feature. `TrainEvalFeatures.TIMES` and `TrainEvalFeatures.VALUES` are
+          required; `VALUES` may be repeated to indicate a multivariate series.
+      column_dtypes: If provided, must be a list with the same length as
+          `column_names`, indicating dtypes for each column. Defaults to
+          `tf.int64` for `TrainEvalFeatures.TIMES` and `tf.float32` for
+          everything else.
+      skip_header_lines: Passed on to `tf.TextLineReader`; skips this number of
+          lines at the beginning of each file.
+      read_num_records_hint: When not reading a full dataset, indicates the
+          number of records to parse/transfer in a single chunk (for
+          efficiency). The actual number transferred at one time may be more or
+          less.
+    Raises:
+      ValueError: If required column names are not specified, or if lengths do
+        not match.
+    """
+    if feature_keys.TrainEvalFeatures.TIMES not in column_names:
+      raise ValueError("'{}' is a required column.".format(
+          feature_keys.TrainEvalFeatures.TIMES))
+    if feature_keys.TrainEvalFeatures.VALUES not in column_names:
+      raise ValueError("'{}' is a required column.".format(
+          feature_keys.TrainEvalFeatures.VALUES))
+    if column_dtypes is not None and len(column_dtypes) != len(column_names):
+      raise ValueError(
+          ("If specified, the length of column_dtypes must match the length of "
+           "column_names (got column_dtypes={} and column_names={}).").format(
+               column_dtypes, column_names))
+    if sum(1 for column_name in column_names
+           if column_name == feature_keys.TrainEvalFeatures.TIMES) != 1:
+      raise ValueError(
+          "Got more than one times column ('{}'), but exactly "
+          "one is required.".format(feature_keys.TrainEvalFeatures.TIMES))
+    self._column_names = column_names
+    self._column_dtypes = column_dtypes
+    self._skip_header_lines = skip_header_lines
+    super(CSVReader, self).__init__(
+        filenames=filenames, read_num_records_hint=read_num_records_hint)
+
+  def _get_reader(self):
+    return io_ops.TextLineReader(skip_header_lines=self._skip_header_lines)
+
+  def _process_records(self, lines):
+    """Parse `lines` as CSV records."""
+    if self._column_dtypes is None:
+      default_values = [(array_ops.zeros([], dtypes.int64),)
+                        if column_name == feature_keys.TrainEvalFeatures.TIMES
+                        else () for column_name in self._column_names]
+    else:
+      default_values = [(array_ops.zeros([], dtype),)
+                        for dtype in self._column_dtypes]
+    columns = parsing_ops.decode_csv(lines, default_values)
+    features_lists = {}
+    for column_name, value in zip(self._column_names, columns):
+      features_lists.setdefault(column_name, []).append(value)
+    features = {}
+    for column_name, values in features_lists.items():
+      if (len(values) == 1 and
+          column_name != feature_keys.TrainEvalFeatures.VALUES):
+        features[column_name] = values[0]
+      else:
+        features[column_name] = array_ops.stack(values, axis=1)
+    return features
+
+
+class TimeSeriesInputFn(object):
+  """Base for classes which create batches of windows from a time series."""
+
+  @abc.abstractmethod
+  def create_batch(self):
+    """Creates chunked Tensors from times, values, and other features.
+
+    Suitable for use as the input_fn argument of a tf.estimator.Estimator's
+    fit() or evaluate() method.
+
+    Returns:
+      A tuple of (features, targets):
+        features: A dictionary with `TrainEvalFeatures.TIMES` and
+          `TrainEvalFeatures.VALUES` as keys, `TIMES` having an associated value
+          with shape [batch size x window length], `VALUES` with shape [batch
+          size x window length x number of features]. Any other features will
+          also have shapes prefixed with [batch size x window length].
+        targets: Not used, but must have a value for compatibility with the
+          Estimator API. That value should be None.
+    """
+    pass
+
+  def __call__(self):
+    # Allow a TimeSeriesInputFn to be used as an input function directly
+    return self.create_batch()
+
+
+class WholeDatasetInputFn(TimeSeriesInputFn):
+  """Supports passing a full time series to a model for evaluation/inference.
+
+  Note that this `TimeSeriesInputFn` is not designed for high throughput, and
+  should not be used for training. It allows for sequential evaluation on a full
+  dataset (with sequential in-sample predictions), which then feeds naturally
+  into `predict_continuation_input_fn` for making out-of-sample
+  predictions. While this is useful for plotting and interactive use,
+  `RandomWindowInputFn` is better suited to training and quantitative
+  evaluation.
+  """
+  # TODO(allenl): A SequentialWindowInputFn for getting model end state without
+  # loading the whole dataset into memory (or for quantitative evaluation of
+  # sequential models). Note that an Estimator using such a TimeSeriesInputFn
+  # won't return in-sample predictions for the whole dataset, which means it
+  # won't be terribly useful for interactive use/plotting (unless the user
+  # passes in concat metrics). Also need to be careful about state saving for
+  # sequential models, particularly the gaps between chunks.
+
+  def __init__(self, time_series_reader):
+    """Initialize the `TimeSeriesInputFn`.
+
+    Args:
+      time_series_reader: A TimeSeriesReader object.
+    """
+    self._reader = time_series_reader
+    super(WholeDatasetInputFn, self).__init__()
+
+  def create_batch(self):
+    """A suitable `input_fn` for an `Estimator`'s `evaluate()`.
+
+    Returns:
+      A dictionary mapping feature names to `Tensors`, each shape
+      prefixed by [1, data set size] (i.e. a batch size of 1).
+    """
+    features = self._reader.read_full()
+    # Add a batch dimension of one to each feature.
+    return ({feature_name: feature_value[None, ...]
+             for feature_name, feature_value in features.items()},
+            None)
+
+
+class RandomWindowInputFn(TimeSeriesInputFn):
+  """Wraps a `TimeSeriesReader` to create random batches of windows.
+
+  Tensors are first collected into sequential windows (in a windowing queue
+  created by `tf.train.batch`, based on the order returned from
+  `time_series_reader`), then these windows are randomly batched (in a
+  `RandomShuffleQueue`), the Tensors returned by `create_batch` having shapes
+  prefixed by [`batch_size`, `window_size`].
+
+  This `TimeSeriesInputFn` is useful for both training and quantitative
+  evaluation (but be sure to run several epochs for sequential models such as
+  `StructuralEnsembleRegressor` to completely flush stale state left over from
+  training). For qualitative evaluation or when preparing for predictions, use
+  `WholeDatasetInputFn`.
+  """
+
+  def __init__(
+      self, time_series_reader, window_size, batch_size,
+      queue_capacity_multiplier=1000, shuffle_min_after_dequeue_multiplier=2,
+      discard_out_of_order=True, discard_consecutive_batches_limit=1000,
+      jitter=True, num_threads=2, shuffle_seed=None):
+    """Configure the RandomWindowInputFn.
+
+    Args:
+      time_series_reader: A TimeSeriesReader object.
+      window_size: The number of examples to keep together sequentially. This
+        controls the length of truncated backpropagation: smaller values mean
+        less sequential computation, which can lead to faster training, but
+        create a coarser approximation to the gradient (which would ideally be
+        computed by a forward pass over the entire sequence in order).
+      batch_size: The number of windows to place together in a batch. Larger
+        values will lead to more stable gradients during training.
+      queue_capacity_multiplier: The capacity for the queues used to create
+        batches, specified as a multiple of `batch_size` (for
+        RandomShuffleQueue) and `batch_size * window_size` (for the
+        FIFOQueue). Controls the maximum number of windows stored. Should be
+        greater than `shuffle_min_after_dequeue_multiplier`.
+      shuffle_min_after_dequeue_multiplier: The minimum number of windows in the
+        RandomShuffleQueue after a dequeue, which controls the amount of entropy
+        introduced during batching. Specified as a multiple of `batch_size`.
+      discard_out_of_order: If True, windows of data which have times which
+        decrease (a higher time followed by a lower time) are discarded. If
+        False, the window and associated features are instead sorted so that
+        times are non-decreasing. Discarding is typically faster, as models do
+        not have to deal with artificial gaps in the data. However, discarding
+        does create a bias where the beginnings and endings of files are
+        under-sampled.
+      discard_consecutive_batches_limit: Raise an OutOfRangeError if more than
+        this number of batches are discarded without a single non-discarded
+        window (prevents infinite looping when the dataset is too small).
+      jitter: If True, randomly discards examples between some windows in order
+        to avoid deterministic chunking patterns. This is important for models
+        like AR which may otherwise overfit a fixed chunking.
+      num_threads: Use this number of threads for queues. Setting a value of 1
+        removes one source of non-determinism (and in combination with
+        shuffle_seed should provide deterministic windowing).
+      shuffle_seed: A seed for window shuffling. The default value of None
+        provides random behavior. With `shuffle_seed` set and
+        `num_threads=1`, provides deterministic behavior.
+    """
+    self._reader = time_series_reader
+    self._window_size = window_size
+    self._reader.check_dataset_size(minimum_dataset_size=self._window_size)
+    self._batch_size = batch_size
+    self._queue_capacity_multiplier = queue_capacity_multiplier
+    self._shuffle_min_after_dequeue_multiplier = (
+        shuffle_min_after_dequeue_multiplier)
+    self._discard_out_of_order = discard_out_of_order
+    self._discard_limit = discard_consecutive_batches_limit
+    self._jitter = jitter
+    if num_threads is None:
+      self._num_threads = self._batch_size
+    else:
+      self._num_threads = num_threads
+    self._shuffle_seed = shuffle_seed
+    super(RandomWindowInputFn, self).__init__()
+
+  def create_batch(self):
+    """Create queues to window and batch time series data.
+
+    Returns:
+      A dictionary of Tensors corresponding to the output of `self._reader`
+      (from the `time_series_reader` constructor argument), each with shapes
+      prefixed by [`batch_size`, `window_size`].
+    """
+    features = self._reader.read()
+    if self._jitter:
+      # TODO(agarwal, allenl): Figure out if more jitter is needed here.
+      jitter = random_ops.random_uniform(shape=[], maxval=2, dtype=dtypes.int32)
+    else:
+      jitter = 0
+    # To keep things efficient, we pass from the windowing batcher to the
+    # batch-of-windows batcher in batches. This avoids the need for huge numbers
+    # of threads, but does mean that jitter is only applied occasionally.
+    # TODO(allenl): Experiment with different internal passing sizes.
+    internal_passing_size = self._batch_size
+    features_windowed = input_lib.batch(
+        features,
+        batch_size=self._window_size * internal_passing_size + jitter,
+        enqueue_many=True,
+        capacity=(self._queue_capacity_multiplier
+                  * internal_passing_size * self._window_size),
+        num_threads=self._num_threads)
+    raw_features_windowed = features_windowed
+    if self._jitter:
+      features_windowed = {
+          key: value[jitter:]
+          for key, value in features_windowed.items()}
+    features_windowed = {
+        key: array_ops.reshape(
+            value,
+            array_ops.concat(
+                [[internal_passing_size, self._window_size],
+                 array_ops.shape(value)[1:]],
+                axis=0))
+        for key, value in features_windowed.items()}
+    batch_and_window_shape = tensor_shape.TensorShape(
+        [internal_passing_size, self._window_size])
+    for key in features_windowed.keys():
+      features_windowed[key].set_shape(
+          batch_and_window_shape.concatenate(
+              raw_features_windowed[key].get_shape()[1:]))
+    # When switching files, we may end up with windows where the time is not
+    # decreasing, even if times within each file are sorted (and even if those
+    # files are visited in order, when looping back around to the beginning of
+    # the first file). This is hard for models to deal with, so we either
+    # discard such examples, creating a bias where the beginning and end of the
+    # series is under-sampled, or we sort the window, creating large gaps.
+    times = features_windowed[feature_keys.TrainEvalFeatures.TIMES]
+    if self._discard_out_of_order:
+      non_decreasing = math_ops.reduce_all(
+          times[:, 1:] >= times[:, :-1], axis=1)
+      # Ensure that no more than self._discard_limit complete batches are
+      # discarded contiguously (resetting the count when we find a single clean
+      # window). This prevents infinite looping when the dataset is smaller than
+      # the window size.
+      # TODO(allenl): Figure out a way to return informative errors from
+      # count_up_to.
+      discarded_windows_limiter = variable_scope.variable(
+          initial_value=constant_op.constant(0, dtype=dtypes.int64),
+          name="discarded_windows_limiter",
+          trainable=False,
+          collections=[ops.GraphKeys.LOCAL_VARIABLES])
+      def _initialized_limit_check():
+        return control_flow_ops.cond(
+            math_ops.reduce_any(non_decreasing),
+            lambda: state_ops.assign(discarded_windows_limiter, 0),
+            lambda: discarded_windows_limiter.count_up_to(self._discard_limit))
+      discard_limit_op = control_flow_ops.cond(
+          state_ops.is_variable_initialized(discarded_windows_limiter),
+          _initialized_limit_check,
+          lambda: constant_op.constant(0, dtype=dtypes.int64))
+      with ops.control_dependencies([discard_limit_op]):
+        non_decreasing = array_ops.identity(non_decreasing)
+    else:
+      _, indices_descending = nn.top_k(
+          times, k=array_ops.shape(times)[-1], sorted=True)
+      indices = array_ops.reverse(indices_descending, axis=[0])
+      features_windowed = {
+          key: array_ops.gather(params=value, indices=indices)
+          for key, value in features_windowed.items()
+      }
+      non_decreasing = True
+    features_batched = input_lib.maybe_shuffle_batch(
+        features_windowed,
+        num_threads=self._num_threads,
+        seed=self._shuffle_seed,
+        batch_size=self._batch_size,
+        capacity=self._queue_capacity_multiplier * self._batch_size,
+        min_after_dequeue=(self._shuffle_min_after_dequeue_multiplier *
+                           self._batch_size),
+        keep_input=non_decreasing,
+        enqueue_many=True)
+    return (features_batched, None)
+
+
+def _canonicalize_numpy_data(data, require_single_batch):
+  """Do basic checking and reshaping for Numpy data.
+
+  Args:
+    data: A dictionary mapping keys to Numpy arrays, with several possible
+      shapes (requires keys `TrainEvalFeatures.TIMES` and
+      `TrainEvalFeatures.VALUES`):
+        Single example; `TIMES` is a scalar and `VALUES` is either a scalar or a
+          vector of length [number of features].
+        Sequence; `TIMES` is a vector of shape [series length], `VALUES` either
+          has shape [series length] (univariate) or [series length x number of
+          features] (multivariate).
+        Batch of sequences; `TIMES` is a vector of shape [batch size x series
+          length], `VALUES` has shape [batch size x series length] or [batch
+          size x series length x number of features].
+      In any case, `VALUES` and any exogenous features must have their shapes
+      prefixed by the shape of the value corresponding to the `TIMES` key.
+    require_single_batch: If True, raises an error if the provided data has a
+      batch dimension > 1.
+  Returns:
+    A dictionary with features normalized to have shapes prefixed with [batch
+    size x series length]. The sizes of dimensions which were omitted in the
+    inputs are 1.
+  Raises:
+    ValueError: If dimensions are incorrect or do not match, or required
+      features are missing.
+  """
+  features = {key: numpy.array(value) for key, value in data.items()}
+  if (feature_keys.TrainEvalFeatures.TIMES not in features or
+      feature_keys.TrainEvalFeatures.VALUES not in features):
+    raise ValueError("{} and {} are required features.".format(
+        feature_keys.TrainEvalFeatures.TIMES,
+        feature_keys.TrainEvalFeatures.VALUES))
+  times = features[feature_keys.TrainEvalFeatures.TIMES]
+  for key, value in features.items():
+    if value.shape[:len(times.shape)] != times.shape:
+      raise ValueError(
+          ("All features must have their shapes prefixed by the shape of the"
+           " times feature. Got shape {} for feature '{}', but shape {} for"
+           " '{}'").format(value.shape, key, times.shape,
+                           feature_keys.TrainEvalFeatures.TIMES))
+  if not times.shape:  # a single example
+    if not features[feature_keys.TrainEvalFeatures.VALUES].shape:  # univariate
+      # Add a feature dimension (with one feature)
+      features[feature_keys.TrainEvalFeatures.VALUES] = features[
+          feature_keys.TrainEvalFeatures.VALUES][..., None]
+    elif len(features[feature_keys.TrainEvalFeatures.VALUES].shape) > 1:
+      raise ValueError(
+          ("Got an unexpected number of dimensions for the '{}' feature."
+           " Was expecting at most 1 dimension"
+           " ([number of features]) since '{}' does not "
+           "have a batch or time dimension, but got shape {}").format(
+               feature_keys.TrainEvalFeatures.VALUES,
+               feature_keys.TrainEvalFeatures.TIMES,
+               features[feature_keys.TrainEvalFeatures.VALUES].shape))
+    # Add trivial batch and time dimensions for every feature
+    features = {key: value[None, None, ...] for key, value in features.items()}
+  if len(times.shape) == 1:  # shape [series length]
+    if len(features[feature_keys.TrainEvalFeatures.VALUES]
+           .shape) == 1:  # shape [series length]
+      # Add a feature dimension (with one feature)
+      features[feature_keys.TrainEvalFeatures.VALUES] = features[
+          feature_keys.TrainEvalFeatures.VALUES][..., None]
+    elif len(features[feature_keys.TrainEvalFeatures.VALUES].shape) > 2:
+      raise ValueError(
+          ("Got an unexpected number of dimensions for the '{}' feature."
+           " Was expecting at most 2 dimensions"
+           " ([series length, number of features]) since '{}' does not "
+           "have a batch dimension, but got shape {}").format(
+               feature_keys.TrainEvalFeatures.VALUES,
+               feature_keys.TrainEvalFeatures.TIMES,
+               features[feature_keys.TrainEvalFeatures.VALUES].shape))
+    # Add trivial batch dimensions for every feature
+    features = {key: value[None, ...] for key, value in features.items()}
+  elif len(features[feature_keys.TrainEvalFeatures.TIMES]
+           .shape) != 2:  # shape [batch size, series length]
+    raise ValueError(
+        ("Got an unexpected number of dimensions for times. Was expecting at "
+         "most two ([batch size, series length]), but got shape {}.").format(
+             times.shape))
+  if require_single_batch:
+    # We don't expect input to be already batched; batching is done later
+    if features[feature_keys.TrainEvalFeatures.TIMES].shape[0] != 1:
+      raise ValueError("Got batch input, was expecting unbatched input.")
+  return features
diff --git a/tensorflow/contrib/timeseries/python/timeseries/input_pipeline_test.py b/tensorflow/contrib/timeseries/python/timeseries/input_pipeline_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed78a835a4d451e9e7d18bb833d8ebed6c05a195
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/input_pipeline_test.py
@@ -0,0 +1,319 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the time series input pipeline."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import csv
+import tempfile
+
+import numpy
+
+from tensorflow.contrib.timeseries.python.timeseries import input_pipeline
+from tensorflow.contrib.timeseries.python.timeseries import test_utils
+from tensorflow.contrib.timeseries.python.timeseries.feature_keys import TrainEvalFeatures
+
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import coordinator as coordinator_lib
+from tensorflow.python.training import queue_runner_impl
+
+
+def _make_csv_temp_file(to_write, test_tmpdir):
+  _, data_file = tempfile.mkstemp(dir=test_tmpdir)
+  with open(data_file, "w") as f:
+    csvwriter = csv.writer(f)
+    for record in to_write:
+      csvwriter.writerow(record)
+  return data_file
+
+
+def _make_csv_time_series(num_features, num_samples, test_tmpdir):
+  filename = _make_csv_temp_file(
+      [[i] + [float(i) * 2. + feature_number
+              for feature_number in range(num_features)]
+       for i in range(num_samples)],
+      test_tmpdir=test_tmpdir)
+  return filename
+
+
+def _make_numpy_time_series(num_features, num_samples):
+  times = numpy.arange(num_samples)
+  values = times[:, None] * 2. + numpy.arange(num_features)[None, :]
+  return {TrainEvalFeatures.TIMES: times,
+          TrainEvalFeatures.VALUES: values}
+
+
+class RandomWindowInputFnTests(test.TestCase):
+
+  def _random_window_input_fn_test_template(
+      self, time_series_reader, window_size, batch_size, num_features,
+      discard_out_of_order=False):
+    input_fn = input_pipeline.RandomWindowInputFn(
+        time_series_reader=time_series_reader,
+        window_size=window_size, batch_size=batch_size)
+    result, _ = input_fn()
+    init_op = variables.local_variables_initializer()
+    with self.test_session() as session:
+      coordinator = coordinator_lib.Coordinator()
+      queue_runner_impl.start_queue_runners(session, coord=coordinator)
+      session.run(init_op)
+      features = session.run(result)
+      coordinator.request_stop()
+      coordinator.join()
+    self.assertAllEqual([batch_size, window_size],
+                        features[TrainEvalFeatures.TIMES].shape)
+    for window_position in range(window_size - 1):
+      for batch_position in range(batch_size):
+        # Checks that all times are contiguous
+        self.assertEqual(
+            features[TrainEvalFeatures.TIMES][batch_position,
+                                              window_position + 1],
+            features[TrainEvalFeatures.TIMES][batch_position,
+                                              window_position] + 1)
+    self.assertAllEqual([batch_size, window_size, num_features],
+                        features[TrainEvalFeatures.VALUES].shape)
+    self.assertEqual("int64", features[TrainEvalFeatures.TIMES].dtype)
+    for feature_number in range(num_features):
+      self.assertAllEqual(
+          features[TrainEvalFeatures.TIMES] * 2. + feature_number,
+          features[TrainEvalFeatures.VALUES][:, :, feature_number])
+    return features
+
+  def _test_out_of_order(self, time_series_reader, discard_out_of_order):
+    self._random_window_input_fn_test_template(
+        time_series_reader=time_series_reader,
+        num_features=1, window_size=2, batch_size=5,
+        discard_out_of_order=discard_out_of_order)
+
+  def test_csv_sort_out_of_order(self):
+    filename = _make_csv_time_series(num_features=1, num_samples=50,
+                                     test_tmpdir=self.get_temp_dir())
+    time_series_reader = input_pipeline.CSVReader([filename])
+    self._test_out_of_order(time_series_reader, discard_out_of_order=False)
+
+  def test_numpy_sort_out_of_order(self):
+    data = _make_numpy_time_series(num_features=1, num_samples=50)
+    time_series_reader = input_pipeline.NumpyReader(data)
+    self._test_out_of_order(time_series_reader, discard_out_of_order=False)
+
+  def test_csv_discard_out_of_order(self):
+    filename = _make_csv_time_series(num_features=1, num_samples=50,
+                                     test_tmpdir=self.get_temp_dir())
+    time_series_reader = input_pipeline.CSVReader([filename])
+    self._test_out_of_order(time_series_reader, discard_out_of_order=True)
+
+  def test_csv_discard_out_of_order_window_equal(self):
+    filename = _make_csv_time_series(num_features=1, num_samples=3,
+                                     test_tmpdir=self.get_temp_dir())
+    time_series_reader = input_pipeline.CSVReader([filename])
+    self._random_window_input_fn_test_template(
+        time_series_reader=time_series_reader,
+        num_features=1, window_size=3, batch_size=5,
+        discard_out_of_order=True)
+
+  def test_csv_discard_out_of_order_window_too_large(self):
+    filename = _make_csv_time_series(num_features=1, num_samples=2,
+                                     test_tmpdir=self.get_temp_dir())
+    time_series_reader = input_pipeline.CSVReader([filename])
+    with self.assertRaises(errors.OutOfRangeError):
+      self._random_window_input_fn_test_template(
+          time_series_reader=time_series_reader,
+          num_features=1, window_size=3, batch_size=5,
+          discard_out_of_order=True)
+
+  def test_csv_no_data(self):
+    filename = _make_csv_time_series(num_features=1, num_samples=0,
+                                     test_tmpdir=self.get_temp_dir())
+    time_series_reader = input_pipeline.CSVReader([filename])
+    with self.assertRaises(errors.OutOfRangeError):
+      self._test_out_of_order(time_series_reader, discard_out_of_order=True)
+
+  def test_numpy_discard_out_of_order(self):
+    data = _make_numpy_time_series(num_features=1, num_samples=50)
+    time_series_reader = input_pipeline.NumpyReader(data)
+    self._test_out_of_order(time_series_reader, discard_out_of_order=True)
+
+  def test_numpy_discard_out_of_order_window_equal(self):
+    data = _make_numpy_time_series(num_features=1, num_samples=3)
+    time_series_reader = input_pipeline.NumpyReader(data)
+    self._random_window_input_fn_test_template(
+        time_series_reader=time_series_reader,
+        num_features=1, window_size=3, batch_size=5,
+        discard_out_of_order=True)
+
+  def test_numpy_discard_out_of_order_window_too_large(self):
+    data = _make_numpy_time_series(num_features=1, num_samples=2)
+    time_series_reader = input_pipeline.NumpyReader(data)
+    with self.assertRaisesRegexp(ValueError, "only 2 records were available"):
+      self._random_window_input_fn_test_template(
+          time_series_reader=time_series_reader,
+          num_features=1, window_size=3, batch_size=5,
+          discard_out_of_order=True)
+
+  def _test_multivariate(self, time_series_reader, num_features):
+    self._random_window_input_fn_test_template(
+        time_series_reader=time_series_reader,
+        num_features=num_features,
+        window_size=2,
+        batch_size=5)
+
+  def test_csv_multivariate(self):
+    filename = _make_csv_time_series(num_features=2, num_samples=50,
+                                     test_tmpdir=self.get_temp_dir())
+    time_series_reader = input_pipeline.CSVReader(
+        [filename],
+        column_names=(TrainEvalFeatures.TIMES, TrainEvalFeatures.VALUES,
+                      TrainEvalFeatures.VALUES))
+    self._test_multivariate(time_series_reader=time_series_reader,
+                            num_features=2)
+
+  def test_numpy_multivariate(self):
+    data = _make_numpy_time_series(num_features=3, num_samples=50)
+    time_series_reader = input_pipeline.NumpyReader(data)
+    self._test_multivariate(time_series_reader, num_features=3)
+
+  def test_numpy_withbatch(self):
+    data_nobatch = _make_numpy_time_series(num_features=4, num_samples=100)
+    data = {feature_name: feature_value[None]
+            for feature_name, feature_value in data_nobatch.items()}
+    time_series_reader = input_pipeline.NumpyReader(data)
+    self._random_window_input_fn_test_template(
+        time_series_reader=time_series_reader,
+        num_features=4,
+        window_size=3,
+        batch_size=5)
+
+  def test_numpy_nobatch_nofeatures(self):
+    data = _make_numpy_time_series(num_features=1, num_samples=100)
+    data[TrainEvalFeatures.VALUES] = data[TrainEvalFeatures.VALUES][:, 0]
+    time_series_reader = input_pipeline.NumpyReader(data)
+    self._random_window_input_fn_test_template(
+        time_series_reader=time_series_reader,
+        num_features=1,
+        window_size=16,
+        batch_size=16)
+
+
+class WholeDatasetInputFnTests(test.TestCase):
+
+  def _whole_dataset_input_fn_test_template(
+      self, time_series_reader, num_features, num_samples):
+    result, _ = input_pipeline.WholeDatasetInputFn(time_series_reader)()
+    with self.test_session() as session:
+      session.run(variables.local_variables_initializer())
+      coordinator = coordinator_lib.Coordinator()
+      queue_runner_impl.start_queue_runners(session, coord=coordinator)
+      features = session.run(result)
+      coordinator.request_stop()
+      coordinator.join()
+    self.assertEqual("int64", features[TrainEvalFeatures.TIMES].dtype)
+    self.assertAllEqual(numpy.arange(num_samples, dtype=numpy.int64)[None, :],
+                        features[TrainEvalFeatures.TIMES])
+    for feature_number in range(num_features):
+      self.assertAllEqual(
+          features[TrainEvalFeatures.TIMES] * 2. + feature_number,
+          features[TrainEvalFeatures.VALUES][:, :, feature_number])
+
+  def test_csv(self):
+    filename = _make_csv_time_series(num_features=3, num_samples=50,
+                                     test_tmpdir=self.get_temp_dir())
+    time_series_reader = input_pipeline.CSVReader(
+        [filename],
+        column_names=(TrainEvalFeatures.TIMES, TrainEvalFeatures.VALUES,
+                      TrainEvalFeatures.VALUES, TrainEvalFeatures.VALUES))
+    self._whole_dataset_input_fn_test_template(
+        time_series_reader=time_series_reader, num_features=3, num_samples=50)
+
+  def test_csv_no_data(self):
+    filename = _make_csv_time_series(num_features=1, num_samples=0,
+                                     test_tmpdir=self.get_temp_dir())
+    time_series_reader = input_pipeline.CSVReader([filename])
+    with self.assertRaises(errors.OutOfRangeError):
+      self._whole_dataset_input_fn_test_template(
+          time_series_reader=time_series_reader, num_features=1, num_samples=50)
+
+  def test_numpy(self):
+    data = _make_numpy_time_series(num_features=4, num_samples=100)
+    time_series_reader = input_pipeline.NumpyReader(data)
+    self._whole_dataset_input_fn_test_template(
+        time_series_reader=time_series_reader, num_features=4, num_samples=100)
+
+  def test_numpy_withbatch(self):
+    data_nobatch = _make_numpy_time_series(num_features=4, num_samples=100)
+    data = {feature_name: feature_value[None]
+            for feature_name, feature_value in data_nobatch.items()}
+    time_series_reader = input_pipeline.NumpyReader(data)
+    self._whole_dataset_input_fn_test_template(
+        time_series_reader=time_series_reader, num_features=4, num_samples=100)
+
+  def test_numpy_nobatch_nofeatures(self):
+    data = _make_numpy_time_series(num_features=1, num_samples=100)
+    data[TrainEvalFeatures.VALUES] = data[TrainEvalFeatures.VALUES][:, 0]
+    time_series_reader = input_pipeline.NumpyReader(data)
+    self._whole_dataset_input_fn_test_template(
+        time_series_reader=time_series_reader, num_features=1, num_samples=100)
+
+
+class AllWindowInputFnTests(test.TestCase):
+
+  def _all_window_input_fn_test_template(
+      self, time_series_reader, num_samples, window_size,
+      original_numpy_features=None):
+    input_fn = test_utils.AllWindowInputFn(
+        time_series_reader=time_series_reader,
+        window_size=window_size)
+    features, _ = input_fn()
+    init_op = variables.local_variables_initializer()
+    with self.test_session() as session:
+      coordinator = coordinator_lib.Coordinator()
+      queue_runner_impl.start_queue_runners(session, coord=coordinator)
+      session.run(init_op)
+      chunked_times, chunked_values = session.run(
+          [features[TrainEvalFeatures.TIMES],
+           features[TrainEvalFeatures.VALUES]])
+      coordinator.request_stop()
+      coordinator.join()
+    self.assertAllEqual([num_samples - window_size + 1, window_size],
+                        chunked_times.shape)
+    if original_numpy_features is not None:
+      original_times = original_numpy_features[TrainEvalFeatures.TIMES]
+      original_values = original_numpy_features[TrainEvalFeatures.VALUES]
+      self.assertAllEqual(original_times, numpy.unique(chunked_times))
+      self.assertAllEqual(original_values[chunked_times],
+                          chunked_values)
+
+  def test_csv(self):
+    filename = _make_csv_time_series(num_features=1, num_samples=50,
+                                     test_tmpdir=self.get_temp_dir())
+    time_series_reader = input_pipeline.CSVReader(
+        [filename],
+        column_names=(TrainEvalFeatures.TIMES, TrainEvalFeatures.VALUES))
+    self._all_window_input_fn_test_template(
+        time_series_reader=time_series_reader, num_samples=50, window_size=10)
+
+  def test_numpy(self):
+    data = _make_numpy_time_series(num_features=2, num_samples=31)
+    time_series_reader = input_pipeline.NumpyReader(data)
+    self._all_window_input_fn_test_template(
+        time_series_reader=time_series_reader, original_numpy_features=data,
+        num_samples=31, window_size=5)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c70da3e082245e76ab3225676c2d37c4ea95292d
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
@@ -0,0 +1,952 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Miscellaneous utilities used by time series models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import math
+
+from tensorflow.contrib import lookup
+from tensorflow.contrib.layers.python.layers import layers
+
+from tensorflow.contrib.timeseries.python.timeseries.feature_keys import TrainEvalFeatures
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import nest
+
+
+def clip_covariance(
+    covariance_matrix, maximum_variance_ratio, minimum_variance):
+  """Enforce constraints on a covariance matrix to improve numerical stability.
+
+  Args:
+    covariance_matrix: A [..., N, N] batch of covariance matrices.
+    maximum_variance_ratio: The maximum allowed ratio of two diagonal
+      entries. Any entries lower than the maximum entry divided by this ratio
+      will be set to that value.
+    minimum_variance: A floor for diagonal entries in the returned matrix.
+  Returns:
+    A new covariance matrix with the requested constraints enforced. If the
+    input was positive definite, the output will be too.
+  """
+  # TODO(allenl): Smarter scaling here so that correlations are preserved when
+  # fiddling with diagonal elements.
+  diagonal = array_ops.matrix_diag_part(covariance_matrix)
+  maximum = math_ops.reduce_max(diagonal, axis=-1, keep_dims=True)
+  new_diagonal = gen_math_ops.maximum(
+      diagonal, maximum / maximum_variance_ratio)
+  return array_ops.matrix_set_diag(
+      covariance_matrix, math_ops.maximum(new_diagonal, minimum_variance))
+
+
+def block_diagonal(matrices, dtype=dtypes.float32, name="block_diagonal"):
+  r"""Constructs block-diagonal matrices from a list of batched 2D tensors.
+
+  Args:
+    matrices: A list of Tensors with shape [..., N_i, M_i] (i.e. a list of
+      matrices with the same batch dimension).
+    dtype: Data type to use. The Tensors in `matrices` must match this dtype.
+    name: A name for the returned op.
+  Returns:
+    A matrix with the input matrices stacked along its main diagonal, having
+    shape [..., \sum_i N_i, \sum_i M_i].
+  """
+  matrices = [ops.convert_to_tensor(matrix, dtype=dtype) for matrix in matrices]
+  blocked_rows = tensor_shape.Dimension(0)
+  blocked_cols = tensor_shape.Dimension(0)
+  batch_shape = tensor_shape.TensorShape(None)
+  for matrix in matrices:
+    full_matrix_shape = matrix.get_shape().with_rank_at_least(2)
+    batch_shape = batch_shape.merge_with(full_matrix_shape[:-2])
+    blocked_rows += full_matrix_shape[-2]
+    blocked_cols += full_matrix_shape[-1]
+  ret_columns_list = []
+  for matrix in matrices:
+    matrix_shape = array_ops.shape(matrix)
+    ret_columns_list.append(matrix_shape[-1])
+  ret_columns = math_ops.add_n(ret_columns_list)
+  row_blocks = []
+  current_column = 0
+  for matrix in matrices:
+    matrix_shape = array_ops.shape(matrix)
+    row_before_length = current_column
+    current_column += matrix_shape[-1]
+    row_after_length = ret_columns - current_column
+    row_blocks.append(
+        array_ops.pad(
+            tensor=matrix,
+            paddings=array_ops.concat(
+                [
+                    array_ops.zeros(
+                        [array_ops.rank(matrix) - 1, 2], dtype=dtypes.int32), [(
+                            row_before_length, row_after_length)]
+                ],
+                axis=0)))
+  blocked = array_ops.concat(row_blocks, -2, name=name)
+  blocked.set_shape(batch_shape.concatenate((blocked_rows, blocked_cols)))
+  return blocked
+
+
+def power_sums_tensor(array_size, power_matrix, multiplier):
+  r"""Computes \sum_{i=0}^{N-1} A^i B (A^i)^T for N=0..(array_size + 1).
+
+  Args:
+    array_size: The number of non-trivial sums to pre-compute.
+    power_matrix: The "A" matrix above.
+    multiplier: The "B" matrix above
+  Returns:
+    A Tensor with S[N] = \sum_{i=0}^{N-1} A^i B (A^i)^T
+      S[0] is the zero matrix
+      S[1] is B
+      S[2] is A B A^T + B
+      ...and so on
+  """
+  array_size = math_ops.cast(array_size, dtypes.int32)
+  power_matrix = ops.convert_to_tensor(power_matrix)
+  identity_like_power_matrix = linalg_ops.eye(
+      array_ops.shape(power_matrix)[0], dtype=power_matrix.dtype)
+  identity_like_power_matrix.set_shape(
+      ops.convert_to_tensor(power_matrix).get_shape())
+  transition_powers = functional_ops.scan(
+      lambda previous_power, _: math_ops.matmul(previous_power, power_matrix),
+      math_ops.range(array_size - 1),
+      initializer=identity_like_power_matrix)
+  summed = math_ops.cumsum(
+      array_ops.concat([
+          array_ops.expand_dims(multiplier, 0), math_ops.matmul(
+              batch_times_matrix(transition_powers, multiplier),
+              transition_powers,
+              adjoint_b=True)
+      ], 0))
+  return array_ops.concat(
+      [array_ops.expand_dims(array_ops.zeros_like(multiplier), 0), summed], 0)
+
+
+def matrix_to_powers(matrix, powers):
+  """Raise a single matrix to multiple powers."""
+  matrix_tiled = array_ops.tile(
+      array_ops.expand_dims(matrix, 0), [array_ops.size(powers), 1, 1])
+  return batch_matrix_pow(matrix_tiled, powers)
+
+
+def batch_matrix_pow(matrices, powers):
+  """Compute powers of matrices, e.g. A^3 = matmul(matmul(A, A), A).
+
+  Uses exponentiation by squaring, with O(log(p)) matrix multiplications to
+  compute A^p.
+
+  Args:
+    matrices: [batch size x N x N]
+    powers: Which integer power to raise each matrix to [batch size]
+  Returns:
+    The matrices raised to their respective powers, same dimensions as the
+    "matrices" argument.
+  """
+
+  def terminate_when_all_zero(current_argument, residual_powers, accumulator):
+    del current_argument, accumulator  # not used for condition
+    do_exit = math_ops.reduce_any(
+        math_ops.greater(residual_powers, array_ops.ones_like(residual_powers)))
+    return do_exit
+
+  def do_iteration(current_argument, residual_powers, accumulator):
+    """Compute one step of iterative exponentiation by squaring.
+
+    The recursive form is:
+      power(A, p) = { power(matmul(A, A), p / 2) for even p
+                    { matmul(A, power(matmul(A, A), (p - 1) / 2)) for odd p
+      power(A, 0) = I
+
+    The power(A, 0) = I case is handeled by starting with accumulator set to the
+    identity matrix; matrices with zero residual powers are passed through
+    unchanged.
+
+    Args:
+      current_argument: On this step, what is the first argument (A^2..^2) to
+          the (unrolled) recursive function? [batch size x N x N]
+      residual_powers: On this step, what is the second argument (residual p)?
+          [batch_size]
+      accumulator: Accumulates the exterior multiplications from the odd
+          powers (initially the identity matrix). [batch_size x N x N]
+    Returns:
+      Updated versions of each argument for one step of the unrolled
+      computation. Does not change parts of the batch which have a residual
+      power of zero.
+    """
+    is_even = math_ops.equal(residual_powers % 2,
+                             array_ops.zeros(
+                                 array_ops.shape(residual_powers),
+                                 dtype=dtypes.int32))
+    new_accumulator = array_ops.where(is_even, accumulator,
+                                      math_ops.matmul(accumulator,
+                                                      current_argument))
+    new_argument = math_ops.matmul(current_argument, current_argument)
+    do_update = math_ops.greater(residual_powers, 1)
+    new_residual_powers = residual_powers - residual_powers % 2
+    new_residual_powers //= 2
+    # Stop updating if we've reached our base case; some batch elements may
+    # finish sooner than others
+    accumulator = array_ops.where(do_update, new_accumulator, accumulator)
+    current_argument = array_ops.where(do_update, new_argument,
+                                       current_argument)
+    residual_powers = array_ops.where(do_update, new_residual_powers,
+                                      residual_powers)
+    return (current_argument, residual_powers, accumulator)
+
+  matrices = ops.convert_to_tensor(matrices)
+  powers = math_ops.cast(powers, dtype=dtypes.int32)
+  ident = array_ops.expand_dims(
+      array_ops.diag(
+          array_ops.ones([array_ops.shape(matrices)[1]], dtype=matrices.dtype)),
+      0)
+  ident_tiled = array_ops.tile(ident, [array_ops.shape(matrices)[0], 1, 1])
+  (final_argument,
+   final_residual_power, final_accumulator) = control_flow_ops.while_loop(
+       terminate_when_all_zero, do_iteration, [matrices, powers, ident_tiled])
+  return array_ops.where(
+      math_ops.equal(final_residual_power,
+                     array_ops.zeros_like(
+                         final_residual_power, dtype=dtypes.int32)),
+      ident_tiled, math_ops.matmul(final_argument, final_accumulator))
+
+
+# TODO(allenl): would be useful if this was built into batch_matmul
+def batch_times_matrix(batch, matrix, adj_x=False, adj_y=False):
+  """Multiply a batch of matrices by a single matrix.
+
+  Functionally equivalent to:
+  tf.matmul(batch, array_ops.tile(gen_math_ops.expand_dims(matrix, 0),
+                                 [array_ops.shape(batch)[0], 1, 1]),
+                  adjoint_a=adj_x, adjoint_b=adj_y)
+
+  Args:
+    batch: [batch_size x N x M] after optional transpose
+    matrix: [M x P] after optional transpose
+    adj_x: If true, transpose the second two dimensions of "batch" before
+        multiplying.
+    adj_y: If true, transpose "matrix" before multiplying.
+  Returns:
+    [batch_size x N x P]
+  """
+  batch = ops.convert_to_tensor(batch)
+  matrix = ops.convert_to_tensor(matrix)
+  assert batch.get_shape().ndims == 3
+  assert matrix.get_shape().ndims == 2
+  if adj_x:
+    batch = array_ops.transpose(batch, [0, 2, 1])
+  batch_dimension = batch.get_shape()[0].value
+  first_dimension = batch.get_shape()[1].value
+  tensor_batch_shape = array_ops.shape(batch)
+  if batch_dimension is None:
+    batch_dimension = tensor_batch_shape[0]
+  if first_dimension is None:
+    first_dimension = tensor_batch_shape[1]
+  matrix_first_dimension, matrix_second_dimension = matrix.get_shape().as_list()
+  batch_reshaped = array_ops.reshape(batch, [-1, tensor_batch_shape[2]])
+  if adj_y:
+    if matrix_first_dimension is None:
+      matrix_first_dimension = array_ops.shape(matrix)[0]
+    result_shape = [batch_dimension, first_dimension, matrix_first_dimension]
+  else:
+    if matrix_second_dimension is None:
+      matrix_second_dimension = array_ops.shape(matrix)[1]
+    result_shape = [batch_dimension, first_dimension, matrix_second_dimension]
+  return array_ops.reshape(
+      math_ops.matmul(batch_reshaped, matrix, adjoint_b=adj_y), result_shape)
+
+
+def matrix_times_batch(matrix, batch, adj_x=False, adj_y=False):
+  """Like batch_times_matrix, but with the multiplication order swapped."""
+  return array_ops.transpose(
+      batch_times_matrix(
+          batch=batch, matrix=matrix, adj_x=not adj_y, adj_y=not adj_x),
+      [0, 2, 1])
+
+
+def make_toeplitz_matrix(inputs, name=None):
+  """Make a symmetric Toeplitz matrix from input array of values.
+
+  Args:
+    inputs: a 3-D tensor of shape [num_blocks, block_size, block_size].
+    name: the name of the operation.
+
+  Returns:
+    a symmetric Toeplitz matrix of shape
+      [num_blocks*block_size, num_blocks*block_size].
+  """
+  num_blocks = array_ops.shape(inputs)[0]
+  block_size = array_ops.shape(inputs)[1]
+  output_size = block_size * num_blocks
+  lags = array_ops.reshape(math_ops.range(num_blocks), shape=[1, -1])
+  indices = math_ops.abs(lags - array_ops.transpose(lags))
+  output = array_ops.gather(inputs, indices)
+  output = array_ops.reshape(
+      array_ops.transpose(output, [0, 2, 1, 3]), [output_size, output_size])
+  return array_ops.identity(output, name=name)
+
+
+# TODO(allenl): Investigate alternative parameterizations.
+def sign_magnitude_positive_definite(
+    raw, off_diagonal_scale=0., overall_scale=0.):
+  """Constructs a positive definite matrix from an unconstrained input matrix.
+
+  We want to keep the whole matrix on a log scale, but also allow off-diagonal
+  elements to be negative, so the sign of off-diagonal elements is modeled
+  separately from their magnitude (using the lower and upper triangles
+  respectively). Specifically:
+
+  for i < j, we have:
+    output_cholesky[i, j] = raw[j, i] / (abs(raw[j, i]) + 1) *
+        exp((off_diagonal_scale + overall_scale + raw[i, j]) / 2)
+
+  output_cholesky[i, i] = exp((raw[i, i] + overall_scale) / 2)
+
+  output = output_cholesky^T * output_cholesky
+
+  where raw, off_diagonal_scale, and overall_scale are
+  un-constrained real-valued variables. The resulting values are stable
+  around zero due to the exponential (and the softsign keeps the function
+  smooth).
+
+  Args:
+    raw: A [..., M, M] Tensor.
+    off_diagonal_scale: A scalar or [...] shaped Tensor controlling the relative
+        scale of off-diagonal values in the output matrix.
+    overall_scale: A scalar or [...] shaped Tensor controlling the overall scale
+        of the output matrix.
+  Returns:
+    The `output` matrix described above, a [..., M, M] positive definite matrix.
+
+  """
+  raw = ops.convert_to_tensor(raw)
+  diagonal = array_ops.matrix_diag_part(raw)
+  def _right_pad_with_ones(tensor, target_rank):
+    # Allow broadcasting even if overall_scale and off_diagonal_scale have batch
+    # dimensions
+    tensor = ops.convert_to_tensor(tensor, dtype=raw.dtype.base_dtype)
+    return array_ops.reshape(tensor,
+                             array_ops.concat(
+                                 [
+                                     array_ops.shape(tensor), array_ops.ones(
+                                         [target_rank - array_ops.rank(tensor)],
+                                         dtype=target_rank.dtype)
+                                 ],
+                                 axis=0))
+  # We divide the log values by 2 to compensate for the squaring that happens
+  # when transforming Cholesky factors into positive definite matrices.
+  sign_magnitude = (gen_math_ops.exp(
+      (raw + _right_pad_with_ones(off_diagonal_scale, array_ops.rank(raw)) +
+       _right_pad_with_ones(overall_scale, array_ops.rank(raw))) / 2.) *
+                    nn.softsign(array_ops.matrix_transpose(raw)))
+  sign_magnitude.set_shape(raw.get_shape())
+  cholesky_factor = array_ops.matrix_set_diag(
+      input=array_ops.matrix_band_part(sign_magnitude, 0, -1),
+      diagonal=gen_math_ops.exp((diagonal + _right_pad_with_ones(
+          overall_scale, array_ops.rank(diagonal))) / 2.))
+  return math_ops.matmul(cholesky_factor, cholesky_factor, transpose_a=True)
+
+
+def transform_to_covariance_matrices(input_vectors, matrix_size):
+  """Construct covariance matrices via transformations from input_vectors.
+
+  Args:
+    input_vectors: A [batch size x input size] batch of vectors to transform.
+    matrix_size: An integer indicating one dimension of the (square) output
+        matrix.
+  Returns:
+    A [batch size x matrix_size x matrix_size] batch of covariance matrices.
+  """
+  combined_values = layers.fully_connected(
+      input_vectors, matrix_size**2 + 2, activation_fn=None)
+  return sign_magnitude_positive_definite(
+      raw=array_ops.reshape(combined_values[..., :-2],
+                            array_ops.concat([
+                                array_ops.shape(combined_values)[:-1],
+                                [matrix_size, matrix_size]
+                            ], 0)),
+      off_diagonal_scale=combined_values[..., -2],
+      overall_scale=combined_values[..., -1])
+
+
+def variable_covariance_matrix(
+    size, name, dtype, initial_diagonal_values=None,
+    initial_overall_scale_log=0.):
+  """Construct a Variable-parameterized positive definite matrix.
+
+  Useful for parameterizing covariance matrices.
+
+  Args:
+    size: The size of the main diagonal, the returned matrix having shape [size
+        x size].
+    name: The name to use when defining variables and ops.
+    dtype: The floating point data type to use.
+    initial_diagonal_values: A Tensor with shape [size] with initial values for
+        the diagonal values of the returned matrix. Must be positive.
+    initial_overall_scale_log: Initial value of the bias term for every element
+        of the matrix in log space.
+  Returns:
+    A Variable-parameterized covariance matrix with shape [size x size].
+  """
+  raw_values = variable_scope.get_variable(
+      name + "_pre_transform",
+      dtype=dtype,
+      shape=[size, size],
+      initializer=init_ops.zeros_initializer())
+  if initial_diagonal_values is not None:
+    raw_values += array_ops.matrix_diag(math_ops.log(initial_diagonal_values))
+  return array_ops.identity(
+      sign_magnitude_positive_definite(
+          raw=raw_values,
+          off_diagonal_scale=variable_scope.get_variable(
+              name + "_off_diagonal_scale",
+              dtype=dtype,
+              initializer=constant_op.constant(-5., dtype=dtype)),
+          overall_scale=ops.convert_to_tensor(
+              initial_overall_scale_log, dtype=dtype) +
+          variable_scope.get_variable(
+              name + "_overall_scale",
+              dtype=dtype,
+              shape=[],
+              initializer=init_ops.zeros_initializer())),
+      name=name)
+
+
+def batch_start_time(times):
+  return times[:, 0]
+
+
+def batch_end_time(times):
+  return times[:, -1]
+
+
+def log_noninformative_covariance_prior(covariance):
+  """Compute a relatively uninformative prior for noise parameters.
+
+  Helpful for avoiding noise over-estimation, where noise otherwise decreases
+  very slowly during optimization.
+
+  See:
+    Villegas, C. On the A Priori Distribution of the Covariance Matrix.
+    Ann. Math. Statist. 40 (1969), no. 3, 1098--1099.
+
+  Args:
+    covariance: A covariance matrix.
+  Returns:
+    For a [p x p] matrix:
+      log(det(covariance)^(-(p + 1) / 2))
+  """
+  # Avoid zero/negative determinants due to numerical errors
+  covariance += array_ops.diag(1e-8 * array_ops.ones(
+      shape=[array_ops.shape(covariance)[0]], dtype=covariance.dtype))
+  power = -(math_ops.cast(array_ops.shape(covariance)[0] + 1,
+                          covariance.dtype) / 2.)
+  return power * math_ops.log(linalg_ops.matrix_determinant(covariance))
+
+
+def entropy_matched_cauchy_scale(covariance):
+  """Approximates a similar Cauchy distribution given a covariance matrix.
+
+  Since Cauchy distributions do not have moments, entropy matching provides one
+  way to set a Cauchy's scale parameter in a way that provides a similar
+  distribution. The effect is dividing the standard deviation of an independent
+  Gaussian by a constant very near 3.
+
+  To set the scale of the Cauchy distribution, we first select the diagonals of
+  `covariance`. Since this ignores cross terms, it overestimates the entropy of
+  the Gaussian. For each of these variances, we solve for the Cauchy scale
+  parameter which gives the same entropy as the Gaussian with that
+  variance. This means setting the (univariate) Gaussian entropy
+      0.5 * ln(2 * variance * pi * e)
+  equal to the Cauchy entropy
+      ln(4 * pi * scale)
+  Solving, we get scale = sqrt(variance * (e / (8 pi))).
+
+  Args:
+    covariance: A [batch size x N x N] batch of covariance matrices to produce
+        Cauchy scales for.
+  Returns:
+    A [batch size x N] set of Cauchy scale parameters for each part of the batch
+    and each dimension of the input Gaussians.
+  """
+  return math_ops.sqrt(math.e / (8. * math.pi) *
+                       array_ops.matrix_diag_part(covariance))
+
+
+class TensorValuedMutableDenseHashTable(lookup.MutableDenseHashTable):
+  """A version of MutableDenseHashTable which stores arbitrary Tensor shapes.
+
+  Since MutableDenseHashTable only allows vectors right now, simply adds reshape
+  ops on both ends.
+  """
+
+  def __init__(self, key_dtype, value_dtype, default_value, *args, **kwargs):
+    self._non_vector_value_shape = array_ops.shape(default_value)
+    super(TensorValuedMutableDenseHashTable, self).__init__(
+        key_dtype=key_dtype,
+        value_dtype=value_dtype,
+        default_value=array_ops.reshape(default_value, [-1]),
+        *args,
+        **kwargs)
+
+  def insert(self, keys, values, name=None):
+    keys = ops.convert_to_tensor(keys, dtype=self._key_dtype)
+    keys_flat = array_ops.reshape(keys, [-1])
+    return super(TensorValuedMutableDenseHashTable, self).insert(
+        keys=keys_flat,
+        # Each key has one corresponding value, so the shape of the tensor of
+        # values for every key is key_shape + value_shape
+        values=array_ops.reshape(values, [array_ops.shape(keys_flat)[0], -1]),
+        name=name)
+
+  def lookup(self, keys, name=None):
+    keys_flat = array_ops.reshape(
+        ops.convert_to_tensor(keys, dtype=self._key_dtype), [-1])
+    return array_ops.reshape(
+        super(TensorValuedMutableDenseHashTable, self).lookup(
+            keys=keys_flat, name=name),
+        array_ops.concat([array_ops.shape(keys), self._non_vector_value_shape],
+                         0))
+
+
+class TupleOfTensorsLookup(lookup.LookupInterface):
+  """A LookupInterface with nested tuples of Tensors as values.
+
+  Creates one MutableDenseHashTable per value Tensor, which has some unnecessary
+  overhead.
+  """
+
+  def __init__(
+      self, key_dtype, default_values, empty_key, name, checkpoint=True):
+    default_values_flat = nest.flatten(default_values)
+    self._hash_tables = nest.pack_sequence_as(
+        default_values,
+        [TensorValuedMutableDenseHashTable(
+            key_dtype=key_dtype,
+            value_dtype=default_value.dtype.base_dtype,
+            default_value=default_value,
+            empty_key=empty_key,
+            name=name + "_{}".format(table_number),
+            checkpoint=checkpoint)
+         for table_number, default_value
+         in enumerate(default_values_flat)])
+    self._name = name
+
+  def lookup(self, keys):
+    return nest.pack_sequence_as(
+        self._hash_tables,
+        [hash_table.lookup(keys)
+         for hash_table in nest.flatten(self._hash_tables)])
+
+  def insert(self, keys, values):
+    nest.assert_same_structure(self._hash_tables, values)
+    # Avoid race conditions by requiring that all inputs are computed before any
+    # inserts happen (an issue if one key's update relies on another's value).
+    values_flat = [array_ops.identity(value) for value in nest.flatten(values)]
+    with ops.control_dependencies(values_flat):
+      insert_ops = [hash_table.insert(keys, value)
+                    for hash_table, value
+                    in zip(nest.flatten(self._hash_tables),
+                           values_flat)]
+    return control_flow_ops.group(*insert_ops)
+
+  def check_table_dtypes(self, key_dtype, value_dtype):
+    # dtype checking is done in the objects in self._hash_tables
+    pass
+
+
+def replicate_state(start_state, batch_size):
+  """Create batch versions of state.
+
+  Takes a list of Tensors, adds a batch dimension, and replicates
+  batch_size times across that batch dimension. Used to replicate the
+  non-batch state returned by get_start_state in define_loss.
+
+  Args:
+    start_state: Model-defined state to replicate.
+    batch_size: Batch dimension for data.
+  Returns:
+    Replicated versions of the state.
+  """
+  flattened_state = nest.flatten(start_state)
+  replicated_state = [
+      array_ops.tile(
+          array_ops.expand_dims(state_nonbatch, 0),
+          array_ops.concat([[batch_size], array_ops.ones(
+              [array_ops.rank(state_nonbatch)], dtype=dtypes.int32)], 0))
+      for state_nonbatch in flattened_state
+  ]
+  return nest.pack_sequence_as(start_state, replicated_state)
+
+
+Moments = collections.namedtuple("Moments", ["mean", "variance"])
+
+
+# Currently all of these statistics are computed incrementally (i.e. are updated
+# every time a new mini-batch of training data is presented) when this object is
+# created in InputStatisticsFromMiniBatch.
+InputStatistics = collections.namedtuple(
+    "InputStatistics",
+    ["series_start_moments",  # The mean and variance of each feature in a chunk
+                              # (with a size configured in the statistics
+                              # object) at the start of the series. A tuple of
+                              # (mean, variance), each with shape [number of
+                              # features], floating point. One use is in state
+                              # space models, to keep priors calibrated even as
+                              # earlier parts of the series are presented. If
+                              # this object was created by
+                              # InputStatisticsFromMiniBatch, these moments are
+                              # computed based on the earliest chunk of data
+                              # presented so far. However, there is a race
+                              # condition in the update, so these may reflect
+                              # statistics later in the series, but should
+                              # eventually reflect statistics in a chunk at the
+                              # series start.
+     "overall_feature_moments",  # The mean and variance of each feature over
+                                 # the entire series. A tuple of (mean,
+                                 # variance), each with shape [number of
+                                 # features]. If this object was created by
+                                 # InputStatisticsFromMiniBatch, these moments
+                                 # are estimates based on the data seen so far.
+     "start_time",  # The first (lowest) time in the series, a scalar
+                    # integer. If this object was created by
+                    # InputStatisticsFromMiniBatch, this is the lowest time seen
+                    # so far rather than the lowest time that will ever be seen
+                    # (guaranteed to be at least as low as the lowest time
+                    # presented in the current minibatch).
+     "total_observation_count",  # Count of data points, a scalar integer. If
+                                 # this object was created by
+                                 # InputStatisticsFromMiniBatch, this is an
+                                 # estimate of the total number of observations
+                                 # in the whole dataset computed based on the
+                                 # density of the series and the minimum and
+                                 # maximum times seen.
+    ])
+
+
+# TODO(allenl): It would be nice to do something with full series statistics
+# when the user provides that.
+class InputStatisticsFromMiniBatch(object):
+  """Generate statistics from mini-batch input."""
+
+  def __init__(self, num_features, dtype, starting_variance_window_size=16):
+    """Configure the input statistics object.
+
+    Args:
+      num_features: Number of features for the time series
+      dtype: The floating point data type to use.
+      starting_variance_window_size: The number of datapoints to use when
+          computing the mean and variance at the start of the series.
+    """
+    self._starting_variance_window_size = starting_variance_window_size
+    self._num_features = num_features
+    self._dtype = dtype
+
+  def initialize_graph(self, features, update_statistics=True):
+    """Create any ops needed to provide input statistics.
+
+    Should be called before statistics are requested.
+
+    Args:
+      features: A dictionary, the output of a `TimeSeriesInputFn` (with keys
+          TrainEvalFeatures.TIMES and TrainEvalFeatures.VALUES).
+      update_statistics: Whether `features` should be used to update adaptive
+          statistics. Typically True for training and false for evaluation.
+    Returns:
+      An InputStatistics object composed of Variables, which will be updated
+      based on mini-batches of data if requested.
+    """
+    if (TrainEvalFeatures.TIMES in features
+        and TrainEvalFeatures.VALUES in features):
+      times = features[TrainEvalFeatures.TIMES]
+      values = features[TrainEvalFeatures.VALUES]
+    else:
+      # times and values may not be available, for example during prediction. We
+      # still need to retrieve our variables so that they can be read from, even
+      # if we're not going to update them.
+      times = None
+      values = None
+    # Create/retrieve variables representing input statistics, initialized
+    # without data to avoid deadlocking if variables are initialized before
+    # queue runners are started.
+    with variable_scope.variable_scope("input_statistics", use_resource=True):
+      statistics = self._create_variable_statistics_object()
+    with variable_scope.variable_scope(
+        "input_statistics_auxiliary", use_resource=True):
+      # Secondary statistics, necessary for the incremental computation of the
+      # primary statistics (e.g. counts and sums for computing a mean
+      # incrementally).
+      auxiliary_variables = self._AdaptiveInputAuxiliaryStatistics(
+          num_features=self._num_features, dtype=self._dtype)
+    if update_statistics and times is not None and values is not None:
+      # If we have times and values from mini-batch input, create update ops to
+      # take the new data into account.
+      assign_op = self._update_statistics_from_mini_batch(
+          statistics, auxiliary_variables, times, values)
+      with ops.control_dependencies([assign_op]):
+        stat_variables = nest.pack_sequence_as(statistics, [
+            array_ops.identity(tensor) for tensor in nest.flatten(statistics)
+        ])
+        # Since start time updates have a race condition, ensure that the
+        # reported start time is at least as low as the lowest time in this
+        # mini-batch. The start time should converge on the correct value
+        # eventually even with the race condition, but for example state space
+        # models have an assertion which could fail without this
+        # post-processing.
+        return stat_variables._replace(start_time=gen_math_ops.minimum(
+            stat_variables.start_time, math_ops.reduce_min(times)))
+    else:
+      return statistics
+
+  class _AdaptiveInputAuxiliaryStatistics(collections.namedtuple(
+      "_AdaptiveInputAuxiliaryStatistics",
+      ["max_time_seen",  # The maximum time seen (best effort if updated from
+                         # multiple workers; see notes about race condition
+                         # below).
+       "chunk_count",  # The number of chunks seen.
+       "inter_observation_duration_sum",  # The sum across chunks of their "time
+                                          # density" (number of times per
+                                          # example).
+       "example_count",  # The number of examples seen (each example has a
+                         # single time associated with it and one or more
+                         # real-valued features).
+       "overall_feature_sum",  # The sum of values for each feature. Shape
+                               # [number of features].
+       "overall_feature_sum_of_squares",  # The sum of squared values for each
+                                          # feature. Shape [number of features]
+      ])):
+    """Extra statistics used to incrementally update InputStatistics."""
+
+    def __new__(cls, num_features, dtype):
+      return super(
+          InputStatisticsFromMiniBatch  # pylint: disable=protected-access
+          ._AdaptiveInputAuxiliaryStatistics,
+          cls).__new__(
+              cls,
+              max_time_seen=variable_scope.get_variable(
+                  name="max_time_seen",
+                  initializer=dtypes.int64.min,
+                  dtype=dtypes.int64,
+                  trainable=False),
+              chunk_count=variable_scope.get_variable(
+                  name="chunk_count",
+                  initializer=init_ops.zeros_initializer(),
+                  shape=[],
+                  dtype=dtypes.int64,
+                  trainable=False),
+              inter_observation_duration_sum=variable_scope.get_variable(
+                  name="inter_observation_duration_sum",
+                  initializer=init_ops.zeros_initializer(),
+                  shape=[],
+                  dtype=dtype,
+                  trainable=False),
+              example_count=variable_scope.get_variable(
+                  name="example_count",
+                  shape=[],
+                  dtype=dtypes.int64,
+                  trainable=False),
+              overall_feature_sum=variable_scope.get_variable(
+                  name="overall_feature_sum",
+                  shape=[num_features],
+                  dtype=dtype,
+                  initializer=init_ops.zeros_initializer(),
+                  trainable=False),
+              overall_feature_sum_of_squares=variable_scope.get_variable(
+                  name="overall_feature_sum_of_squares",
+                  shape=[num_features],
+                  dtype=dtype,
+                  initializer=init_ops.zeros_initializer(),
+                  trainable=False))
+
+  def _update_statistics_from_mini_batch(
+      self, statistics, auxiliary_variables, times, values):
+    """Given mini-batch input, update `statistics` and `auxiliary_variables`."""
+    values = math_ops.cast(values, self._dtype)
+    # The density (measured in times per observation) that we see in each part
+    # of the mini-batch.
+    batch_inter_observation_duration = (math_ops.cast(
+        math_ops.reduce_max(times, axis=1) - math_ops.reduce_min(times, axis=1),
+        self._dtype) / math_ops.cast(
+            array_ops.shape(times)[1] - 1, self._dtype))
+    # Co-locate updates with their variables to minimize race conditions when
+    # updating statistics.
+    with ops.colocate_with(auxiliary_variables.max_time_seen):
+      # There is a race condition if this value is being updated from multiple
+      # workers. However, it should eventually reach the correct value if the
+      # last chunk is presented enough times.
+      max_time_seen_assign = state_ops.assign(
+          auxiliary_variables.max_time_seen,
+          gen_math_ops.maximum(auxiliary_variables.max_time_seen,
+                               math_ops.reduce_max(times)))
+    with ops.colocate_with(auxiliary_variables.chunk_count):
+      chunk_count_assign = state_ops.assign_add(auxiliary_variables.chunk_count,
+                                                array_ops.shape(
+                                                    times,
+                                                    out_type=dtypes.int64)[0])
+    with ops.colocate_with(auxiliary_variables.inter_observation_duration_sum):
+      inter_observation_duration_assign = state_ops.assign_add(
+          auxiliary_variables.inter_observation_duration_sum,
+          math_ops.reduce_sum(batch_inter_observation_duration))
+    with ops.colocate_with(auxiliary_variables.example_count):
+      example_count_assign = state_ops.assign_add(
+          auxiliary_variables.example_count,
+          array_ops.size(times, out_type=dtypes.int64))
+    # Note: These mean/variance updates assume that all points are equally
+    # likely, which is not true if _chunks_ are sampled uniformly from the space
+    # of all possible contiguous chunks, since points at the start and end of
+    # the series are then members of fewer chunks. For series which are much
+    # longer than the chunk size (the usual/expected case), this effect becomes
+    # irrelevant.
+    with ops.colocate_with(auxiliary_variables.overall_feature_sum):
+      overall_feature_sum_assign = state_ops.assign_add(
+          auxiliary_variables.overall_feature_sum,
+          math_ops.reduce_sum(values, axis=[0, 1]))
+    with ops.colocate_with(auxiliary_variables.overall_feature_sum_of_squares):
+      overall_feature_sum_of_squares_assign = state_ops.assign_add(
+          auxiliary_variables.overall_feature_sum_of_squares,
+          math_ops.reduce_sum(values**2, axis=[0, 1]))
+    per_chunk_aux_updates = control_flow_ops.group(
+        max_time_seen_assign, chunk_count_assign,
+        inter_observation_duration_assign, example_count_assign,
+        overall_feature_sum_assign, overall_feature_sum_of_squares_assign)
+    with ops.control_dependencies([per_chunk_aux_updates]):
+      example_count_float = math_ops.cast(auxiliary_variables.example_count,
+                                          self._dtype)
+      new_feature_mean = (auxiliary_variables.overall_feature_sum /
+                          example_count_float)
+      overall_feature_mean_update = state_ops.assign(
+          statistics.overall_feature_moments.mean, new_feature_mean)
+      overall_feature_var_update = state_ops.assign(
+          statistics.overall_feature_moments.variance,
+          # De-biased n / (n - 1) variance correction
+          example_count_float / (example_count_float - 1.) *
+          (auxiliary_variables.overall_feature_sum_of_squares /
+           example_count_float - new_feature_mean**2))
+      # TODO(b/35675805): Remove this cast
+      min_time_batch = math_ops.cast(math_ops.argmin(times[:, 0]), dtypes.int32)
+      def series_start_updates():
+        # If this is the lowest-time chunk that we have seen so far, update
+        # series start moments to reflect that. Note that these statistics are
+        # "best effort", as there are race conditions in the update (however,
+        # they should eventually converge if the start of the series is
+        # presented enough times).
+        mean, variance = nn.moments(
+            values[min_time_batch, :self._starting_variance_window_size],
+            axes=[0])
+        return control_flow_ops.group(
+            state_ops.assign(statistics.series_start_moments.mean, mean),
+            state_ops.assign(statistics.series_start_moments.variance,
+                             variance))
+      with ops.colocate_with(statistics.start_time):
+        series_start_update = control_flow_ops.cond(
+            # Update moments whenever we even match the lowest time seen so far,
+            # to ensure that series start statistics are eventually updated to
+            # their correct values, despite race conditions (i.e. eventually
+            # statistics.start_time will reflect the global lowest time, and
+            # given that we will eventually update the series start moments to
+            # their correct values).
+            math_ops.less_equal(times[min_time_batch, 0],
+                                statistics.start_time),
+            series_start_updates,
+            control_flow_ops.no_op)
+        with ops.control_dependencies([series_start_update]):
+          # There is a race condition if this update is performed in parallel on
+          # multiple workers. Since models may be sensitive to being presented
+          # with times before the putative start time, the value of this
+          # variable is post-processed above to guarantee that each worker is
+          # presented with a start time which is at least as low as the lowest
+          # time in its current mini-batch.
+          start_time_update = state_ops.assign(statistics.start_time,
+                                               gen_math_ops.minimum(
+                                                   statistics.start_time,
+                                                   math_ops.reduce_min(times)))
+      inter_observation_duration_estimate = (
+          auxiliary_variables.inter_observation_duration_sum / math_ops.cast(
+              auxiliary_variables.chunk_count, self._dtype))
+      # Estimate the total number of observations as:
+      #   (end time - start time + 1) * average intra-chunk time density
+      total_observation_count_update = state_ops.assign(
+          statistics.total_observation_count,
+          math_ops.cast(
+              gen_math_ops.round(
+                  math_ops.cast(auxiliary_variables.max_time_seen -
+                                statistics.start_time + 1, self._dtype) /
+                  inter_observation_duration_estimate), dtypes.int64))
+      per_chunk_stat_updates = control_flow_ops.group(
+          overall_feature_mean_update, overall_feature_var_update,
+          series_start_update, start_time_update,
+          total_observation_count_update)
+    return per_chunk_stat_updates
+
+  def _create_variable_statistics_object(self):
+    """Creates non-trainable variables representing input statistics."""
+    series_start_moments = Moments(
+        mean=variable_scope.get_variable(
+            name="series_start_mean",
+            shape=[self._num_features],
+            dtype=self._dtype,
+            initializer=init_ops.zeros_initializer(),
+            trainable=False),
+        variance=variable_scope.get_variable(
+            name="series_start_variance",
+            shape=[self._num_features],
+            dtype=self._dtype,
+            initializer=init_ops.ones_initializer(),
+            trainable=False))
+    overall_feature_moments = Moments(
+        mean=variable_scope.get_variable(
+            name="overall_feature_mean",
+            shape=[self._num_features],
+            dtype=self._dtype,
+            initializer=init_ops.zeros_initializer(),
+            trainable=False),
+        variance=variable_scope.get_variable(
+            name="overall_feature_var",
+            shape=[self._num_features],
+            dtype=self._dtype,
+            initializer=init_ops.ones_initializer(),
+            trainable=False))
+    start_time = variable_scope.get_variable(
+        name="start_time",
+        dtype=dtypes.int64,
+        initializer=init_ops.zeros_initializer(),
+        shape=[],
+        trainable=False)
+    total_observation_count = variable_scope.get_variable(
+        name="total_observation_count",
+        shape=[],
+        dtype=dtypes.int64,
+        initializer=init_ops.ones_initializer(),
+        trainable=False)
+    return InputStatistics(
+        series_start_moments=series_start_moments,
+        overall_feature_moments=overall_feature_moments,
+        start_time=start_time,
+        total_observation_count=total_observation_count)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/math_utils_test.py b/tensorflow/contrib/timeseries/python/timeseries/math_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9f8620fd81e9c04ee8e1e80b7849079efea7eee
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/math_utils_test.py
@@ -0,0 +1,344 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for math_utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy
+
+from tensorflow.contrib.timeseries.python.timeseries import input_pipeline
+from tensorflow.contrib.timeseries.python.timeseries import math_utils
+from tensorflow.contrib.timeseries.python.timeseries.feature_keys import TrainEvalFeatures
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import coordinator as coordinator_lib
+from tensorflow.python.training import queue_runner_impl
+
+
+class MathUtilsTest(test.TestCase):
+
+  def setUp(self):
+    numpy.random.seed(10)
+
+  def test_power_sums_tensor(self):
+    transition = numpy.random.normal(size=[4, 4]).astype(numpy.float32)
+    addition = numpy.random.normal(size=[4, 4]).astype(numpy.float32)
+    array_size = 2
+    result = []
+    transition_power = numpy.identity(4)
+    running_sum = numpy.zeros([4, 4], dtype=numpy.float32)
+    for _ in range(array_size + 1):
+      result.append(running_sum)
+      current_contribution = numpy.dot(numpy.dot(transition_power, addition),
+                                       transition_power.T)
+      # pylint: disable=g-no-augmented-assignment
+      # += has different semantics here; want to make a copy
+      running_sum = running_sum + current_contribution
+      # pylint: enable=g-no-augmented-assignment
+      transition_power = numpy.dot(transition, transition_power)
+    with self.test_session():
+      self.assertAllClose(result,
+                          math_utils.power_sums_tensor(
+                              array_size, transition, addition).eval())
+
+  def test_matrix_to_powers(self):
+    matrix = numpy.random.normal(size=[4, 4]).astype(numpy.float32)
+    powers = numpy.random.randint(low=0, high=10, size=20)
+    result = []
+    for i in range(powers.shape[0]):
+      result.append(numpy.linalg.matrix_power(matrix, powers[i]))
+    with self.test_session():
+      self.assertAllClose(result,
+                          math_utils.matrix_to_powers(matrix, powers).eval(),
+                          rtol=1e-5,
+                          atol=1e-5)
+
+  def test_batch_matrix_pow(self):
+    batch = numpy.random.normal(size=[15, 4, 4]).astype(numpy.float32)
+    powers = numpy.random.randint(low=0, high=10, size=batch.shape[0])
+    result = []
+    for i in range(batch.shape[0]):
+      result.append(numpy.linalg.matrix_power(batch[i], powers[i]))
+    with self.test_session():
+      # TODO(allenl): Numerical errors seem to be creeping in. Maybe it can be
+      # made slightly more stable?
+      self.assertAllClose(result,
+                          math_utils.batch_matrix_pow(batch, powers).eval(),
+                          rtol=1e-5,
+                          atol=1e-5)
+
+  def test_batch_times_matrix(self):
+    left = numpy.random.normal(size=[5, 3, 2]).astype(numpy.float32)
+    left_transpose = numpy.transpose(left, [0, 2, 1])
+    right = numpy.random.normal(size=[2, 3]).astype(numpy.float32)
+    expected_result = numpy.dot(left, right)
+    with self.test_session():
+      self.assertAllClose(expected_result,
+                          math_utils.batch_times_matrix(
+                              left, right).eval())
+      self.assertAllClose(expected_result,
+                          math_utils.batch_times_matrix(
+                              left_transpose, right,
+                              adj_x=True).eval())
+      self.assertAllClose(expected_result,
+                          math_utils.batch_times_matrix(
+                              left, right.T,
+                              adj_y=True).eval())
+      self.assertAllClose(expected_result,
+                          math_utils.batch_times_matrix(
+                              left_transpose, right.T,
+                              adj_x=True, adj_y=True).eval())
+
+  def test_matrix_times_batch(self):
+    left = numpy.random.normal(size=[5, 7]).astype(numpy.float32)
+    right = numpy.random.normal(size=[3, 7, 9]).astype(numpy.float32)
+    right_transpose = numpy.transpose(right, [0, 2, 1])
+    expected_result = numpy.transpose(numpy.dot(right_transpose, left.T),
+                                      [0, 2, 1])
+    with self.test_session():
+      self.assertAllClose(expected_result,
+                          math_utils.matrix_times_batch(
+                              left, right).eval())
+      self.assertAllClose(expected_result,
+                          math_utils.matrix_times_batch(
+                              left.T, right,
+                              adj_x=True).eval())
+      self.assertAllClose(expected_result,
+                          math_utils.matrix_times_batch(
+                              left, right_transpose,
+                              adj_y=True).eval())
+      self.assertAllClose(expected_result,
+                          math_utils.matrix_times_batch(
+                              left.T, right_transpose,
+                              adj_x=True, adj_y=True).eval())
+
+  def test_make_diagonal_undefined_shapes(self):
+    with self.test_session():
+      completely_undefined = array_ops.placeholder(dtype=dtypes.float32)
+      partly_undefined = array_ops.placeholder(
+          shape=[None, None], dtype=dtypes.float32)
+      blocked = math_utils.block_diagonal([completely_undefined,
+                                           [[2.]],
+                                           partly_undefined])
+      self.assertEqual([None, None],
+                       blocked.get_shape().as_list())
+      self.assertAllEqual(
+          [[1., 0., 0., 0.],
+           [0., 2., 0., 0.],
+           [0., 0., 3., 4.],
+           [0., 0., 5., 6.]],
+          blocked.eval(feed_dict={
+              completely_undefined: [[1.]],
+              partly_undefined: [[3., 4.],
+                                 [5., 6.]]}))
+
+  def test_make_diagonal_mostly_defined_shapes(self):
+    with self.test_session():
+      mostly_defined = array_ops.placeholder(
+          shape=[None, 2], dtype=dtypes.float32)
+      blocked = math_utils.block_diagonal([[[2.]],
+                                           mostly_defined,
+                                           [[7.]]])
+      self.assertEqual([None, 4],
+                       blocked.get_shape().as_list())
+      self.assertAllEqual(
+          [[2., 0., 0., 0.],
+           [0., 3., 4., 0.],
+           [0., 5., 6., 0.],
+           [0., 0., 0., 7.]],
+          blocked.eval(feed_dict={
+              mostly_defined: [[3., 4.],
+                               [5., 6.]]}))
+
+
+class TestMakeToeplitzMatrix(test.TestCase):
+
+  def test_make_toeplitz_matrix_1(self):
+    inputs = numpy.array([[[1.]], [[2.]], [[3.]]])
+    output_expected = numpy.array([[1., 2, 3], [2, 1, 2], [3, 2, 1]])
+    self._test_make_toeplitz_matrix(inputs, output_expected)
+
+  def test_make_toeplitz_matrix_2(self):
+    inputs = numpy.array(
+        [[[1, 2.], [3, 4]], [[5, 6], [7, 8]], [[8, 9], [10, 11]]])
+
+    output_expected = numpy.array(
+        [[1., 2., 5., 6, 8, 9],
+         [3, 4, 7, 8, 10, 11],
+         [5, 6, 1, 2, 5, 6],
+         [7, 8, 3, 4, 7, 8],
+         [8, 9, 5, 6, 1, 2],
+         [10, 11, 7, 8, 3, 4]])
+    self._test_make_toeplitz_matrix(inputs, output_expected)
+
+  def _test_make_toeplitz_matrix(self, inputs, output_expected):
+    output_tf = math_utils.make_toeplitz_matrix(inputs)
+    with self.test_session() as sess:
+      output_tf_np = sess.run(output_tf)
+    self.assertAllClose(output_tf_np, output_expected)
+
+
+class TestMakeCovarianceMatrix(test.TestCase):
+
+  def test_zero_size_matrix(self):
+    raw = numpy.zeros([0, 0])
+    with self.test_session():
+      constructed = math_utils.sign_magnitude_positive_definite(raw=raw).eval()
+    self.assertEqual((0, 0), constructed.shape)
+
+  def test_sign_magnitude_positive_definite(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        matrix_tensor = math_utils.sign_magnitude_positive_definite(
+            raw=constant_op.constant([[-1., -2.], [3., 4.]], dtype=dtype),
+            off_diagonal_scale=constant_op.constant(-1., dtype=dtype),
+            overall_scale=constant_op.constant(1., dtype=dtype))
+        matrix_evaled = matrix_tensor.eval()
+        self.assertAllClose(matrix_evaled, matrix_evaled.T)
+        self.assertTrue(numpy.all(numpy.linalg.eigvals(matrix_evaled) > 0))
+
+
+class TestLookupTable(test.TestCase):
+
+  def test_tuple_of_tensors_lookup(self):
+    hash_table = math_utils.TupleOfTensorsLookup(
+        key_dtype=dtypes.int64,
+        default_values=[[
+            array_ops.ones([3, 2], dtype=dtypes.float32), array_ops.zeros(
+                [5], dtype=dtypes.float64)
+        ], array_ops.ones([7, 7], dtype=dtypes.int64)],
+        empty_key=-1,
+        name="test_lookup")
+    def stack_tensor(base_tensor):
+      return array_ops.stack([base_tensor + 1, base_tensor + 2])
+    with self.test_session() as session:
+      ((float_output, double_output), int_output) = session.run(
+          hash_table.lookup([2, 1, 0]))
+      def expected_output_before_insert(base_tensor):
+        return [base_tensor,
+                base_tensor,
+                base_tensor]
+      self.assertAllClose(
+          expected_output_before_insert(numpy.ones([3, 2])),
+          float_output)
+      self.assertAllClose(
+          expected_output_before_insert(numpy.zeros([5])),
+          double_output)
+      self.assertAllEqual(
+          expected_output_before_insert(numpy.ones([7, 7], dtype=numpy.int64)),
+          int_output)
+      hash_table.insert(
+          keys=[1, 2],
+          values=[[
+              stack_tensor(array_ops.ones([3, 2], dtype=dtypes.float32)),
+              stack_tensor(array_ops.zeros([5], dtype=dtypes.float64))
+          ], stack_tensor(array_ops.ones([7, 7], dtype=dtypes.int64))]).run()
+      ((float_output, double_output), int_output) = session.run(
+          hash_table.lookup([2, 1, 0]))
+      def expected_output_after_insert(base_tensor):
+        return [base_tensor + 2,
+                base_tensor + 1,
+                base_tensor]
+      self.assertAllClose(
+          expected_output_after_insert(numpy.ones([3, 2])),
+          float_output)
+      self.assertAllClose(
+          expected_output_after_insert(numpy.zeros([5])),
+          double_output)
+      self.assertAllEqual(
+          expected_output_after_insert(numpy.ones([7, 7], dtype=numpy.int64)),
+          int_output)
+
+
+class InputStatisticsTests(test.TestCase):
+
+  def _input_statistics_test_template(
+      self, stat_object, num_features, dtype, give_full_data,
+      warmup_iterations=0, rtol=1e-6, data_length=500, chunk_size=4):
+    graph = ops.Graph()
+    with graph.as_default():
+      numpy_dtype = dtype.as_numpy_dtype
+      values = (
+          (numpy.arange(data_length, dtype=numpy_dtype)[..., None]
+           + numpy.arange(num_features, dtype=numpy_dtype)[None, ...])[None])
+      times = 2 * (numpy.arange(data_length)[None]) - 3
+      if give_full_data:
+        stat_object.set_data((times, values))
+      features = {TrainEvalFeatures.TIMES: times,
+                  TrainEvalFeatures.VALUES: values}
+      input_fn = input_pipeline.RandomWindowInputFn(
+          batch_size=16, window_size=chunk_size,
+          time_series_reader=input_pipeline.NumpyReader(features))
+      statistics = stat_object.initialize_graph(
+          features=input_fn()[0])
+      with self.test_session(graph=graph) as session:
+        variables.global_variables_initializer().run()
+        coordinator = coordinator_lib.Coordinator()
+        queue_runner_impl.start_queue_runners(session, coord=coordinator)
+        for _ in range(warmup_iterations):
+          # A control dependency should ensure that, for queue-based statistics,
+          # a use of any statistic is preceded by an update of all adaptive
+          # statistics.
+          statistics.total_observation_count.eval()
+        self.assertAllClose(
+            range(num_features) + numpy.mean(numpy.arange(chunk_size))[None],
+            statistics.series_start_moments.mean.eval(),
+            rtol=rtol)
+        self.assertAllClose(
+            numpy.tile(numpy.var(numpy.arange(chunk_size))[None],
+                       [num_features]),
+            statistics.series_start_moments.variance.eval(),
+            rtol=rtol)
+        self.assertAllClose(
+            numpy.mean(values[0], axis=0),
+            statistics.overall_feature_moments.mean.eval(),
+            rtol=rtol)
+        self.assertAllClose(
+            numpy.var(values[0], axis=0),
+            statistics.overall_feature_moments.variance.eval(),
+            rtol=rtol)
+        self.assertAllClose(
+            -3,
+            statistics.start_time.eval(),
+            rtol=rtol)
+        self.assertAllClose(
+            data_length,
+            statistics.total_observation_count.eval(),
+            rtol=rtol)
+        coordinator.request_stop()
+        coordinator.join()
+
+  def test_queue(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      for num_features in [1, 2, 3]:
+        self._input_statistics_test_template(
+            math_utils.InputStatisticsFromMiniBatch(
+                num_features=num_features, dtype=dtype),
+            num_features=num_features,
+            dtype=dtype,
+            give_full_data=False,
+            warmup_iterations=1000,
+            rtol=0.1)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/model.py b/tensorflow/contrib/timeseries/python/timeseries/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2ef8d22114be50a10d3b106be5e144cc70b4bfc
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/model.py
@@ -0,0 +1,736 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Base class for time series models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import collections
+
+from tensorflow.contrib import layers
+
+from tensorflow.contrib.timeseries.python.timeseries import math_utils
+from tensorflow.contrib.timeseries.python.timeseries.feature_keys import PredictionFeatures
+from tensorflow.contrib.timeseries.python.timeseries.feature_keys import TrainEvalFeatures
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import variable_scope
+
+from tensorflow.python.util import nest
+
+
+ModelOutputs = collections.namedtuple(  # pylint: disable=invalid-name
+    typename="ModelOutputs",
+    field_names=[
+        "loss",  # The scalar value to be minimized during training.
+        "end_state",  # A nested tuple specifying the model's state after
+                      # running on the specified data
+        "predictions",  # A dictionary of predictions, each with shape prefixed
+                        # by the shape of `prediction_times`.
+        "prediction_times"  # A [batch size x window size] integer Tensor
+                            # indicating times for which values in `predictions`
+                            # were computed.
+    ])
+
+
+class TimeSeriesModel(object):
+  """Base class for creating generative time series models."""
+
+  __metaclass__ = abc.ABCMeta
+
+  def __init__(self,
+               num_features,
+               exogenous_feature_columns=None,
+               dtype=dtypes.float32):
+    """Constructor for generative models.
+
+    Args:
+      num_features: Number of features for the time series
+      exogenous_feature_columns: A list of tf.contrib.layers.FeatureColumn
+          objects (for example tf.contrib.layers.embedding_column) corresponding
+          to exogenous features which provide extra information to the model but
+          are not part of the series to be predicted. Passed to
+          tf.contrib.layers.input_from_feature_columns.
+      dtype: The floating point datatype to use.
+    """
+    if exogenous_feature_columns:
+      self._exogenous_feature_columns = exogenous_feature_columns
+    else:
+      self._exogenous_feature_columns = []
+    self.num_features = num_features
+    self.dtype = dtype
+    self._input_statistics = None
+    self._graph_initialized = False
+
+  # TODO(allenl): Move more of the generic machinery for generating and
+  # predicting into TimeSeriesModel, and possibly share it between generate()
+  # and predict()
+  def generate(self, number_of_series, series_length,
+               model_parameters=None, seed=None):
+    """Sample synthetic data from model parameters, with optional substitutions.
+
+    Returns `number_of_series` possible sequences of future values, sampled from
+    the generative model with each conditioned on the previous. Samples are
+    based on trained parameters, except for those parameters explicitly
+    overridden in `model_parameters`.
+
+    For distributions over future observations, see predict().
+
+    Args:
+      number_of_series: Number of time series to create.
+      series_length: Length of each time series.
+      model_parameters: A dictionary mapping model parameters to values, which
+          replace trained parameters when generating data.
+      seed: If specified, return deterministic time series according to this
+          value.
+    Returns:
+      A dictionary with keys TrainEvalFeatures.TIMES (mapping to an array with
+      shape [number_of_series, series_length]) and TrainEvalFeatures.VALUES
+      (mapping to an array with shape [number_of_series, series_length,
+      num_features]).
+    """
+    raise NotImplementedError("This model does not support generation.")
+
+  def initialize_graph(self, input_statistics=None):
+    """Define ops for the model, not depending on any previously defined ops.
+
+    Args:
+      input_statistics: A math_utils.InputStatistics object containing input
+          statistics. If None, data-independent defaults are used, which may
+          result in longer or unstable training.
+    """
+    self._graph_initialized = True
+    self._input_statistics = input_statistics
+
+  def _check_graph_initialized(self):
+    if not self._graph_initialized:
+      raise ValueError(
+          "TimeSeriesModels require initialize_graph() to be called before "
+          "use. This defines variables and ops in the default graph, and "
+          "allows Tensor-valued input statistics to be specified.")
+
+  def define_loss(self, features, mode):
+    """Default loss definition with state replicated across a batch.
+
+    Time series passed to this model have a batch dimension, and each series in
+    a batch can be operated on in parallel. This loss definition assumes that
+    each element of the batch represents an independent sample conditioned on
+    the same initial state (i.e. it is simply replicated across the batch). A
+    batch size of one provides sequential operations on a single time series.
+
+    More complex processing may operate instead on get_start_state() and
+    get_batch_loss() directly.
+
+    Args:
+      features: A dictionary (such as is produced by a chunker) with at minimum
+        the following key/value pairs (others corresponding to the
+        `exogenous_feature_columns` argument to `__init__` may be included
+        representing exogenous regressors):
+        TrainEvalFeatures.TIMES: A [batch size x window size] integer Tensor
+            with times for each observation. If there is no artificial chunking,
+            the window size is simply the length of the time series.
+        TrainEvalFeatures.VALUES: A [batch size x window size x num features]
+            Tensor with values for each observation.
+      mode: The tf.estimator.ModeKeys mode to use (TRAIN, EVAL). For INFER,
+        see predict().
+    Returns:
+      A ModelOutputs object.
+    """
+    self._check_graph_initialized()
+    start_state = math_utils.replicate_state(
+        start_state=self.get_start_state(),
+        batch_size=array_ops.shape(features[TrainEvalFeatures.TIMES])[0])
+    return self.get_batch_loss(features=features, mode=mode, state=start_state)
+
+  # TODO(vitalyk,allenl): Better documentation surrounding options for chunking,
+  # references to papers, etc.
+  @abc.abstractmethod
+  def get_start_state(self):
+    """Returns a tuple of state for the start of the time series.
+
+    For example, a mean and covariance. State should not have a batch
+    dimension, and will often be TensorFlow Variables to be learned along with
+    the rest of the model parameters.
+    """
+    pass
+
+  @abc.abstractmethod
+  def get_batch_loss(self, features, mode, state):
+    """Return predictions, losses, and end state for a time series.
+
+    Args:
+      features: A dictionary with times, values, and (optionally) exogenous
+          regressors. See `define_loss`.
+      mode: The tf.estimator.ModeKeys mode to use (TRAIN, EVAL, INFER).
+      state: Model-dependent state, each with size [batch size x ...]. The
+          number and type will typically be fixed by the model (for example a
+          mean and variance).
+    Returns:
+      A ModelOutputs object.
+    """
+    pass
+
+  @abc.abstractmethod
+  def predict(self, features):
+    """Returns predictions of future observations given an initial state.
+
+    Computes distributions for future observations. For sampled draws from the
+    model where each is conditioned on the previous, see generate().
+
+    Args:
+      features: A dictionary with at minimum the following key/value pairs
+        (others corresponding to the `exogenous_feature_columns` argument to
+        `__init__` may be included representing exogenous regressors):
+        PredictionFeatures.TIMES: A [batch size x window size] Tensor with
+          times to make predictions for. Times must be increasing within each
+          part of the batch, and must be greater than the last time `state` was
+          updated.
+        PredictionFeatures.STATE_TUPLE: Model-dependent state, each with size
+          [batch size x ...]. The number and type will typically be fixed by the
+          model (for example a mean and variance). Typically these will be the
+          end state returned by get_batch_loss, predicting beyond that data.
+    Returns:
+      A dictionary with model-dependent predictions corresponding to the
+      requested times. Keys indicate the type of prediction, and values have
+      shape [batch size x window size x ...]. For example state space models
+      return a "predicted_mean" and "predicted_covariance".
+    """
+    pass
+
+  def _process_exogenous_features(self, times, features):
+    """Create a single vector from exogenous features.
+
+    Args:
+      times: A [batch size, window size] vector of times for this batch,
+          primarily used to check the shape information of exogenous features.
+      features: A dictionary of exogenous features corresponding to the columns
+          in self._exogenous_feature_columns. Each value should have a shape
+          prefixed by [batch size, window size].
+    Returns:
+      A Tensor with shape [batch size, window size, exogenous dimension], where
+      the size of the exogenous dimension depends on the exogenous feature
+      columns passed to the model's constructor.
+    Raises:
+      ValueError: If an exogenous feature has an unknown rank.
+    """
+    if self._exogenous_feature_columns:
+      exogenous_features_single_batch_dimension = {}
+      for name, tensor in features.items():
+        if tensor.get_shape().ndims is None:
+          # input_from_feature_columns does not support completely unknown
+          # feature shapes, so we save on a bit of logic and provide a better
+          # error message by checking that here.
+          raise ValueError(
+              ("Features with unknown rank are not supported. Got shape {} for "
+               "feature {}.").format(tensor.get_shape(), name))
+        tensor_shape_dynamic = array_ops.shape(tensor)
+        tensor = array_ops.reshape(
+            tensor,
+            array_ops.concat([[tensor_shape_dynamic[0]
+                               * tensor_shape_dynamic[1]],
+                              tensor_shape_dynamic[2:]], axis=0))
+        # Avoid shape warnings when embedding "scalar" exogenous features (those
+        # with only batch and window dimensions); input_from_feature_columns
+        # expects input ranks to match the embedded rank.
+        if tensor.get_shape().ndims == 1:
+          exogenous_features_single_batch_dimension[name] = tensor[:, None]
+        else:
+          exogenous_features_single_batch_dimension[name] = tensor
+      embedded_exogenous_features_single_batch_dimension = (
+          layers.input_from_feature_columns(
+              columns_to_tensors=exogenous_features_single_batch_dimension,
+              feature_columns=self._exogenous_feature_columns,
+              trainable=True))
+      exogenous_regressors = array_ops.reshape(
+          embedded_exogenous_features_single_batch_dimension,
+          array_ops.concat(
+              [
+                  array_ops.shape(times), array_ops.shape(
+                      embedded_exogenous_features_single_batch_dimension)[1:]
+              ],
+              axis=0))
+      exogenous_regressors.set_shape(times.get_shape().concatenate(
+          embedded_exogenous_features_single_batch_dimension.get_shape()[1:]))
+      exogenous_regressors = math_ops.cast(
+          exogenous_regressors, dtype=self.dtype)
+    else:
+      # Not having any exogenous features is a special case so that models can
+      # avoid superfluous updates, which may not be free of side effects due to
+      # bias terms in transformations.
+      exogenous_regressors = None
+    return exogenous_regressors
+
+
+# TODO(allenl): Add a superclass of SequentialTimeSeriesModel which fuses
+# filtering/prediction/exogenous into one step, and move looping constructs to
+# that class.
+class SequentialTimeSeriesModel(TimeSeriesModel):
+  """Base class for recurrent generative models.
+
+  Models implementing this interface have three main functions, corresponding to
+  abstract methods:
+    _filtering_step: Updates state based on observations and computes a loss.
+    _prediction_step: Predicts a batch of observations and new model state.
+    _imputation_step: Updates model state across a gap.
+    _exogenous_input_step: Updates state to account for exogenous regressors.
+
+  Models may also specify a _window_initializer to prepare for a window of data.
+
+  See StateSpaceModel for a concrete example of a model implementing this
+  interface.
+
+  """
+
+  def __init__(self,
+               train_output_names,
+               predict_output_names,
+               num_features,
+               dtype=dtypes.float32,
+               exogenous_feature_columns=None,
+               exogenous_update_condition=None,
+               static_unrolling_window_size_threshold=None):
+    """Initialize a SequentialTimeSeriesModel.
+
+    Args:
+      train_output_names: A list of products/predictions returned from
+          _filtering_step.
+      predict_output_names: A list of products/predictions returned from
+          _prediction_step.
+      num_features: Number of features for the time series
+      dtype: The floating point datatype to use.
+      exogenous_feature_columns: A list of tf.contrib.layers.FeatureColumn
+          objects. See `TimeSeriesModel`.
+      exogenous_update_condition: A function taking two Tensor arguments `times`
+          (shape [batch size]) and `features` (a dictionary mapping exogenous
+          feature keys to Tensors with shapes [batch size, ...]) and returning a
+          boolean Tensor with shape [batch size] indicating whether state should
+          be updated using exogenous features for each part of the batch. Where
+          it is False, no exogenous update is performed. If None (default),
+          exogenous updates are always performed. Useful for avoiding "leaky"
+          frequent exogenous updates when sparse updates are desired. Called
+          only during graph construction.
+      static_unrolling_window_size_threshold: Controls whether a `tf.while_loop`
+          is used when looping over a window of data. If
+          `static_unrolling_window_size_threshold` is None, a `tf.while_loop` is
+          always used. Otherwise it must be an integer, and the graph is
+          replicated for each step taken whenever the window size is less than
+          or equal to this value (if the window size is available in the static
+          shape information of the TrainEvalFeatures.TIMES feature). Static
+          unrolling generally decreases the per-step time for small window/batch
+          sizes, but increases graph construction time.
+    """
+    super(SequentialTimeSeriesModel, self).__init__(
+        num_features=num_features, dtype=dtype,
+        exogenous_feature_columns=exogenous_feature_columns)
+    self._exogenous_update_condition = exogenous_update_condition
+    self._train_output_names = train_output_names
+    self._predict_output_names = predict_output_names
+    self._static_unrolling_window_size_threshold = (
+        static_unrolling_window_size_threshold)
+
+  @abc.abstractmethod
+  def _filtering_step(self, current_times, current_values, state, predictions):
+    """Compute a single-step loss for a batch of data.
+
+    Args:
+      current_times: A [batch size] Tensor of times for each observation.
+      current_values: A [batch size] Tensor of values for each observation.
+      state: Model state, updated to current_times.
+      predictions: The outputs of _prediction_step
+    Returns:
+      A tuple of (updated state, outputs):
+        updated state: Model state taking current_values into account.
+        outputs: A dictionary of Tensors with keys corresponding to
+            self._train_output_names, plus a special "loss" key. The value
+            corresponding to "loss" is minimized during training. Other outputs
+            may include one-step-ahead predictions, for example a predicted
+            location and scale.
+    """
+    pass
+
+  @abc.abstractmethod
+  def _prediction_step(self, current_times, state):
+    """Compute a batch of single-step predictions.
+
+    Args:
+      current_times: A [batch size] Tensor of times for each observation.
+      state: Model state, imputed to one step before current_times.
+    Returns:
+      A tuple of (updated state, outputs):
+        updated state: Model state updated to current_times.
+        outputs: A dictionary of Tensors with keys corresponding to
+            self._predict_output_names.
+    """
+    pass
+
+  @abc.abstractmethod
+  def _imputation_step(self, current_times, state):
+    """Update model state across missing values.
+
+    Called to prepare model state for _filtering_step and _prediction_step.
+
+    Args:
+      current_times: A [batch size] Tensor; state will be imputed up to, but not
+          including, these timesteps.
+      state: The pre-imputation model state, Tensors with shape [batch size x
+          ...].
+    Returns:
+      Updated/imputed model state, corresponding to `state`.
+    """
+    pass
+
+  @abc.abstractmethod
+  def _exogenous_input_step(
+      self, current_times, current_exogenous_regressors, state):
+    """Update state to account for exogenous regressors.
+
+    Args:
+      current_times: A [batch size] Tensor of times for the exogenous values
+          being input.
+      current_exogenous_regressors: A [batch size x exogenous input dimension]
+          Tensor of exogenous values for each part of the batch.
+      state: Model state, a possibly nested list of Tensors, each with shape
+          [batch size x ...].
+    Returns:
+      Updated model state, structure and shapes matching the `state` argument.
+    """
+    pass
+
+  # TODO(allenl): Move regularization to a separate object (optional and
+  # configurable)
+  def _loss_additions(self, times, values, mode):
+    """Additions to per-observation normalized loss, e.g. regularization.
+
+    Args:
+      times: A [batch size x window size] Tensor with times for each
+          observation.
+      values: A [batch size x window size x num features] Tensor with values for
+          each observation.
+      mode: The tf.estimator.ModeKeys mode to use (TRAIN, EVAL, INFER).
+    Returns:
+      A scalar value to add to the per-observation normalized loss.
+    """
+    del times, values, mode
+    return 0.
+
+  def _window_initializer(self, times, state):
+    """Prepare for training or prediction on a window of data.
+
+    Args:
+      times: A [batch size x window size] Tensor with times for each
+          observation.
+      state: Model-dependent state, each with size [batch size x ...]. The
+          number and type will typically be fixed by the model (for example a
+          mean and variance).
+    Returns:
+      Nothing
+    """
+    pass
+
+  def get_batch_loss(self, features, mode, state):
+    """Calls self._filtering_step. See TimeSeriesModel.get_batch_loss."""
+    per_observation_loss, state, outputs = self.per_step_batch_loss(
+        features, mode, state)
+    # per_step_batch_loss returns [batch size, window size, ...] state, whereas
+    # get_batch_loss is expected to return [batch size, ...] state for the last
+    # element of a window
+    state = nest.pack_sequence_as(
+        state,
+        [state_element[:, -1] for state_element in nest.flatten(state)])
+    outputs["observed"] = features[TrainEvalFeatures.VALUES]
+    return ModelOutputs(
+        loss=per_observation_loss,
+        end_state=state,
+        predictions=outputs,
+        prediction_times=features[TrainEvalFeatures.TIMES])
+
+  def _apply_exogenous_update(
+      self, current_times, step_number, state, raw_features,
+      embedded_exogenous_regressors):
+    """Performs a conditional state update based on exogenous features."""
+    if embedded_exogenous_regressors is None:
+      return state
+    else:
+      current_exogenous_regressors = embedded_exogenous_regressors[
+          :, step_number, :]
+      exogenous_updated_state = self._exogenous_input_step(
+          current_times=current_times,
+          current_exogenous_regressors=current_exogenous_regressors,
+          state=state)
+      if self._exogenous_update_condition is not None:
+        current_raw_exogenous_features = {
+            key: value[:, step_number] for key, value in raw_features.items()
+            if key not in [PredictionFeatures.STATE_TUPLE,
+                           TrainEvalFeatures.TIMES,
+                           TrainEvalFeatures.VALUES]}
+        conditionally_updated_state_flat = []
+        for updated_state_element, original_state_element in zip(
+            nest.flatten(exogenous_updated_state),
+            nest.flatten(state)):
+          conditionally_updated_state_flat.append(
+              array_ops.where(
+                  self._exogenous_update_condition(
+                      times=current_times,
+                      features=current_raw_exogenous_features),
+                  updated_state_element,
+                  original_state_element))
+        return nest.pack_sequence_as(state, conditionally_updated_state_flat)
+      else:
+        return exogenous_updated_state
+
+  def per_step_batch_loss(self, features, mode, state):
+    """Computes predictions, losses, and intermediate model states.
+
+    Args:
+      features: A dictionary with times, values, and (optionally) exogenous
+          regressors. See `define_loss`.
+      mode: The tf.estimator.ModeKeys mode to use (TRAIN, EVAL, INFER).
+      state: Model-dependent state, each with size [batch size x ...]. The
+          number and type will typically be fixed by the model (for example a
+          mean and variance).
+    Returns:
+      A tuple of (loss, filtered_states, predictions)
+        loss: Average loss values across the batch.
+        filtered_states: For each Tensor in `state` with shape [batch size x
+            ...], `filtered_states` has a Tensor with shape [batch size x window
+            size x ...] with filtered state for each part of the batch and
+            window.
+        predictions: A dictionary with model-dependent one-step-ahead (or
+            at-least-one-step-ahead with missing values) predictions, with keys
+            indicating the type of prediction and values having shape [batch
+            size x window size x ...]. For example state space models provide
+            "mean", "covariance", and "log_likelihood".
+
+    """
+    self._check_graph_initialized()
+    times = math_ops.cast(features[TrainEvalFeatures.TIMES], dtype=dtypes.int64)
+    values = math_ops.cast(features[TrainEvalFeatures.VALUES], dtype=self.dtype)
+    exogenous_regressors = self._process_exogenous_features(
+        times=times,
+        features={key: value for key, value in features.items()
+                  if key not in [TrainEvalFeatures.TIMES,
+                                 TrainEvalFeatures.VALUES]})
+    def _batch_loss_filtering_step(step_number, current_times, state):
+      """Make a prediction and update it based on data."""
+      current_values = values[:, step_number, :]
+      state = self._apply_exogenous_update(
+          step_number=step_number, current_times=current_times, state=state,
+          raw_features=features,
+          embedded_exogenous_regressors=exogenous_regressors)
+      predicted_state, predictions = self._prediction_step(
+          current_times=current_times,
+          state=state)
+      filtered_state, outputs = self._filtering_step(
+          current_times=current_times,
+          current_values=current_values,
+          state=predicted_state,
+          predictions=predictions)
+      return filtered_state, outputs
+    state, outputs = self._state_update_loop(
+        times=times, state=state, state_update_fn=_batch_loss_filtering_step,
+        outputs=["loss"] + self._train_output_names)
+    outputs["loss"].set_shape(times.get_shape())
+    loss_sum = math_ops.reduce_sum(outputs["loss"])
+    per_observation_loss = (loss_sum / math_ops.cast(
+        math_ops.reduce_prod(array_ops.shape(times)), dtype=self.dtype))
+    per_observation_loss += self._loss_additions(times, values, mode)
+    # Since we have window-level additions to the loss, its per-step value is
+    # misleading, so we avoid returning it.
+    del outputs["loss"]
+    return per_observation_loss, state, outputs
+
+  def predict(self, features):
+    """Calls self._prediction_step in a loop. See TimeSeriesModel.predict."""
+    predict_times = ops.convert_to_tensor(features[PredictionFeatures.TIMES],
+                                          dtypes.int64)
+    start_state = features[PredictionFeatures.STATE_TUPLE]
+    exogenous_regressors = self._process_exogenous_features(
+        times=predict_times,
+        features={
+            key: value
+            for key, value in features.items()
+            if key not in
+            [PredictionFeatures.TIMES, PredictionFeatures.STATE_TUPLE]
+        })
+    def _call_prediction_step(step_number, current_times, state):
+      state = self._apply_exogenous_update(
+          step_number=step_number, current_times=current_times, state=state,
+          raw_features=features,
+          embedded_exogenous_regressors=exogenous_regressors)
+      state, outputs = self._prediction_step(
+          current_times=current_times, state=state)
+      return state, outputs
+    _, predictions = self._state_update_loop(
+        times=predict_times, state=start_state,
+        state_update_fn=_call_prediction_step,
+        outputs=self._predict_output_names)
+    return predictions
+
+  class _FakeTensorArray(object):
+    """An interface for Python lists that is similar to TensorArray.
+
+    Used for easy switching between static and dynamic looping.
+    """
+
+    def __init__(self):
+      self.values = []
+
+    def write(self, unused_position, value):
+      del unused_position
+      self.values.append(value)
+      return self
+
+  def _state_update_loop(self, times, state, state_update_fn, outputs):
+    """Iterates over `times`, calling `state_update_fn` to collect outputs.
+
+    Args:
+      times: A [batch size x window size] Tensor of integers to iterate over.
+      state: A list of model-specific state Tensors, each with shape [batch size
+          x ...].
+      state_update_fn: A callback taking the following arguments
+            step_number; A scalar integer Tensor indicating the current position
+              in the window.
+            current_times; A [batch size] vector of Integers indicating times
+              for each part of the batch.
+            state; Current model state.
+          It returns a tuple of (updated state, output_values), output_values
+          being a dictionary of Tensors with keys corresponding to `outputs`.
+      outputs: A list of strings indicating values which will be saved while
+          iterating. Must match the keys of the dictionary returned by
+          state_update_fn.
+    Returns:
+      A tuple of (state, output_dict)
+      state: The final model state.
+      output_dict: A dictionary of outputs corresponding to those specified in
+        `outputs` and computed in state_update_fn.
+    """
+    times = ops.convert_to_tensor(times, dtype=dtypes.int64)
+    window_static_shape = times.get_shape()[1].value
+    if self._static_unrolling_window_size_threshold is None:
+      static_unroll = False
+    else:
+      # The user has specified a threshold for static loop unrolling.
+      if window_static_shape is None:
+        # We don't have static shape information for the window size, so dynamic
+        # looping is our only option.
+        static_unroll = False
+      elif window_static_shape <= self._static_unrolling_window_size_threshold:
+        # The threshold is satisfied; unroll statically
+        static_unroll = True
+      else:
+        # A threshold was set but not satisfied
+        static_unroll = False
+
+    self._window_initializer(times, state)
+
+    def _run_condition(step_number, *unused):
+      del unused  # not part of while loop run condition
+      return math_ops.less(step_number, window_size)
+
+    def _state_update_step(
+        step_number, state, state_accumulators, output_accumulators,
+        reuse=False):
+      """Impute, then take one state_update_fn step, accumulating outputs."""
+      with variable_scope.variable_scope("state_update_step", reuse=reuse):
+        current_times = times[:, step_number]
+        state = self._imputation_step(current_times=current_times, state=state)
+        output_accumulators_dict = {
+            accumulator_key: accumulator
+            for accumulator_key, accumulator
+            in zip(outputs, output_accumulators)}
+        step_state, output_values = state_update_fn(
+            step_number=step_number,
+            current_times=current_times,
+            state=state)
+        assert set(output_values.keys()) == set(outputs)
+        new_output_accumulators = []
+        for output_key in outputs:
+          accumulator = output_accumulators_dict[output_key]
+          output_value = output_values[output_key]
+          new_output_accumulators.append(
+              accumulator.write(step_number, output_value))
+        flat_step_state = nest.flatten(step_state)
+        assert len(state_accumulators) == len(flat_step_state)
+        new_state_accumulators = []
+        new_state_flat = []
+        for step_state_value, state_accumulator, original_state in zip(
+            flat_step_state, state_accumulators, nest.flatten(state)):
+          # Make sure the static shape information is complete so while_loop
+          # does not complain about shape information changing.
+          step_state_value.set_shape(original_state.get_shape())
+          new_state_flat.append(step_state_value)
+          new_state_accumulators.append(state_accumulator.write(
+              step_number, step_state_value))
+        step_state = nest.pack_sequence_as(state, new_state_flat)
+        return (step_number + 1, step_state,
+                new_state_accumulators, new_output_accumulators)
+
+    window_size = array_ops.shape(times)[1]
+
+    def _window_size_tensor_array(dtype):
+      if static_unroll:
+        return self._FakeTensorArray()
+      else:
+        return tensor_array_ops.TensorArray(
+            dtype=dtype, size=window_size, dynamic_size=False)
+
+    initial_loop_arguments = [
+        array_ops.zeros([], dtypes.int32),
+        state,
+        [_window_size_tensor_array(element.dtype)
+         for element in nest.flatten(state)],
+        [_window_size_tensor_array(self.dtype) for _ in outputs]]
+    if static_unroll:
+      arguments = initial_loop_arguments
+      for step_number in range(times.get_shape()[1].value):
+        arguments = _state_update_step(
+            array_ops.constant(step_number, dtypes.int32), *arguments[1:],
+            reuse=(step_number > 0))  # Variable sharing between steps
+    else:
+      arguments = control_flow_ops.while_loop(
+          cond=_run_condition,
+          body=_state_update_step,
+          loop_vars=initial_loop_arguments)
+    (_, _, state_loop_result, outputs_loop_result) = arguments
+
+    def _stack_and_transpose(tensor_array):
+      """Stack and re-order the dimensions of a TensorArray."""
+      if static_unroll:
+        return array_ops.stack(tensor_array.values, axis=1)
+      else:
+        # TensorArrays from while_loop stack with window size as the first
+        # dimension, so this function swaps it and the batch dimension to
+        # maintain the [batch x window size x ...] convention used elsewhere.
+        stacked = tensor_array.stack()
+        return array_ops.transpose(
+            stacked,
+            perm=array_ops.concat([[1, 0], math_ops.range(
+                2, array_ops.rank(stacked))], 0))
+
+    outputs_dict = {output_key: _stack_and_transpose(output)
+                    for output_key, output
+                    in zip(outputs, outputs_loop_result)}
+    full_state = nest.pack_sequence_as(
+        state,
+        [_stack_and_transpose(state_element)
+         for state_element in state_loop_result])
+    return full_state, outputs_dict
diff --git a/tensorflow/contrib/timeseries/python/timeseries/model_utils.py b/tensorflow/contrib/timeseries/python/timeseries/model_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..addcdb05754c6ccd736f5d21619015acfcfc906c
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/model_utils.py
@@ -0,0 +1,417 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Helper functions for training and constructing time series Models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+
+import numpy
+
+from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.contrib.layers.python.layers import optimizers
+
+from tensorflow.contrib.timeseries.python.timeseries import feature_keys
+
+from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.estimator.export import export_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import nest
+
+
+def _check_feature_shapes_compatible_with(
+    features, compatible_with_name, compatible_with_value, ignore=None):
+  """Checks all features are compatible with the given time-like feature."""
+  if ignore is None:
+    ignore = set()
+  for name, value in features.items():
+    if name in ignore:
+      continue
+    feature_shape = value.get_shape()
+    if feature_shape.ndims is None:
+      continue
+    if feature_shape.ndims < 2:
+      raise ValueError(
+          ("Features must have shape (batch dimension, window size, ...) "
+           "(got rank {} for feature '{}')").format(
+               feature_shape.ndims, name))
+    if not feature_shape[:2].is_compatible_with(
+        compatible_with_value.get_shape()):
+      raise ValueError(
+          ("Features must have shape (batch dimension, window size, ...) "
+           "where batch dimension and window size match the "
+           "'{times_feature}' feature (got shape {feature_shape} for "
+           "feature '{feature_name}' but shape {times_shape} for feature "
+           "'{times_feature}')").format(
+               times_feature=compatible_with_name,
+               feature_shape=feature_shape,
+               feature_name=name,
+               times_shape=compatible_with_value.get_shape()))
+
+
+def _check_predict_features(features):
+  """Raises errors if features are not suitable for prediction."""
+  if feature_keys.PredictionFeatures.TIMES not in features:
+    raise ValueError("Expected a '{}' feature for prediction.".format(
+        feature_keys.PredictionFeatures.TIMES))
+  if feature_keys.PredictionFeatures.STATE_TUPLE not in features:
+    raise ValueError("Expected a '{}' feature for prediction.".format(
+        feature_keys.PredictionFeatures.STATE_TUPLE))
+  times_feature = features[feature_keys.PredictionFeatures.TIMES]
+  if not times_feature.get_shape().is_compatible_with([None, None]):
+    raise ValueError(
+        ("Expected shape (batch dimension, window size) for feature '{}' "
+         "(got shape {})").format(feature_keys.PredictionFeatures.TIMES,
+                                  times_feature.get_shape()))
+  _check_feature_shapes_compatible_with(
+      features=features,
+      compatible_with_name=feature_keys.PredictionFeatures.TIMES,
+      compatible_with_value=times_feature,
+      ignore=set([
+          feature_keys.PredictionFeatures.STATE_TUPLE  # Model-dependent shapes
+      ]))
+
+
+def _check_train_eval_features(features, model):
+  """Raise errors if features are not suitable for training/evaluation."""
+  if feature_keys.TrainEvalFeatures.TIMES not in features:
+    raise ValueError("Expected a '{}' feature for training/evaluation.".format(
+        feature_keys.TrainEvalFeatures.TIMES))
+  if feature_keys.TrainEvalFeatures.VALUES not in features:
+    raise ValueError("Expected a '{}' feature for training/evaluation.".format(
+        feature_keys.TrainEvalFeatures.VALUES))
+  times_feature = features[feature_keys.TrainEvalFeatures.TIMES]
+  if not times_feature.get_shape().is_compatible_with([None, None]):
+    raise ValueError(
+        ("Expected shape (batch dimension, window size) for feature '{}' "
+         "(got shape {})").format(feature_keys.TrainEvalFeatures.TIMES,
+                                  times_feature.get_shape()))
+  values_feature = features[feature_keys.TrainEvalFeatures.VALUES]
+  if not values_feature.get_shape().is_compatible_with(
+      [None, None, model.num_features]):
+    raise ValueError(
+        ("Expected shape (batch dimension, window size, {num_features}) "
+         "for feature '{feature_name}', since the model was configured "
+         "with num_features={num_features} (got shape {got_shape})").format(
+             num_features=model.num_features,
+             feature_name=feature_keys.TrainEvalFeatures.VALUES,
+             got_shape=times_feature.get_shape()))
+  _check_feature_shapes_compatible_with(
+      features=features,
+      compatible_with_name=feature_keys.TrainEvalFeatures.TIMES,
+      compatible_with_value=times_feature,
+      ignore=set([
+          feature_keys.State.STATE_TUPLE  # Model-dependent shapes
+      ]))
+
+
+def _identity_metric_single(name, input_tensor):
+  """A metric which takes on its last updated value.
+
+  This keeps evaluation metrics in sync with one another, since update ops are
+  run separately from their result Tensors. Simply returning (input_tensor,
+  no_op) as a metric with a value but no update means that a metric will come
+  from a different batch of data than metrics which cache values in a Variable
+  (e.g. the default loss metric).
+
+  Args:
+    name: A name for the metric.
+    input_tensor: Any Tensor.
+  Returns:
+    A tuple of (value, update_op).
+  """
+  metric_variable = variable_scope.variable(
+      name="{}_identity_metric".format(name),
+      initial_value=array_ops.zeros([], dtype=input_tensor.dtype),
+      collections=[ops.GraphKeys.LOCAL_VARIABLES],
+      validate_shape=False)
+  update_op = state_ops.assign(metric_variable, input_tensor,
+                               validate_shape=False)
+  # This shape will be correct once the first update runs (but may be
+  # incomplete, so is not helpful for initializing the variable).
+  metric_variable.set_shape(input_tensor.get_shape())
+  return (metric_variable.value(), update_op)
+
+
+def _identity_metric_nested(name, input_tensors):
+  """Create identity metrics for a nested tuple of Tensors."""
+  update_ops = []
+  value_tensors = []
+  for tensor_number, tensor in enumerate(nest.flatten(input_tensors)):
+    value_tensor, update_op = _identity_metric_single(
+        name="{}_{}".format(name, tensor_number),
+        input_tensor=tensor)
+    update_ops.append(update_op)
+    value_tensors.append(value_tensor)
+  return (nest.pack_sequence_as(input_tensors, value_tensors),
+          control_flow_ops.group(*update_ops))
+
+
+def state_to_dictionary(state_tuple):
+  """Flatten model state into a dictionary with string keys."""
+  flattened = {}
+  for state_number, state_value in enumerate(nest.flatten(state_tuple)):
+    prefixed_state_name = "{}_{:02d}".format(feature_keys.State.STATE_PREFIX,
+                                             state_number)
+    flattened[prefixed_state_name] = state_value
+  return flattened
+
+
+def make_model_fn(
+    model, state_manager, optimizer, input_statistics_generator=None):
+  """Returns a model function suitable for use with a tf.estimator.
+
+  Args:
+    model: The object (inheriting from Model) to create a function for.
+    state_manager: A state manager to wrap the model with (or
+        PassthroughStateManager if no state needs to be managed).
+    optimizer: An instance of `tf.train.Optimizer` to use for training.
+    input_statistics_generator: An InputStatisticsFromMiniBatch object from
+        math_utils.py, used for collecting statistics about input data during
+        training.
+  Returns:
+    The model function, suitable for passing to a tf.estimator.Estimator.
+  """
+
+  def _convert_feature_to_tensor(name, value):
+    """Casts features to the correct dtype based on their name."""
+    if name in [
+        feature_keys.TrainEvalFeatures.TIMES,
+        feature_keys.PredictionFeatures.TIMES
+    ]:
+      return math_ops.cast(value, dtypes.int64)
+    if name == feature_keys.TrainEvalFeatures.VALUES:
+      return math_ops.cast(value, model.dtype)
+    if name == feature_keys.PredictionFeatures.STATE_TUPLE:
+      return value  # Correct dtypes are model-dependent
+    return ops.convert_to_tensor(value)
+
+  def _gather_state(features):
+    """Returns `features` with state packed, indicates if packing was done."""
+    prefixed_state_re = re.compile(r"^" + feature_keys.State.STATE_PREFIX +
+                                   r"_(\d+)$")
+    numbered_state = []
+    for key, tensor in features.items():
+      search_result = prefixed_state_re.search(key)
+      if search_result:
+        numbered_state.append((int(search_result.group(1)), key, tensor))
+    if not numbered_state:
+      return features, False
+    features = features.copy()
+    for _, key, _ in numbered_state:
+      del features[key]
+    numbered_state.sort(key=lambda number, *_: number)
+    features[feature_keys.State.STATE_TUPLE] = nest.pack_sequence_as(
+        structure=model.get_start_state(),
+        flat_sequence=[tensor for _, _, tensor in numbered_state])
+    return features, True
+
+  def _train(features):
+    """Add training ops to the graph."""
+    with variable_scope.variable_scope("model"):
+      model_outputs = state_manager.define_loss(model, features,
+                                                estimator_lib.ModeKeys.TRAIN)
+    train_op = optimizers.optimize_loss(
+        model_outputs.loss,
+        global_step=variables.get_global_step(),
+        optimizer=optimizer,
+        # Learning rate is set in the Optimizer object
+        learning_rate=None)
+    return estimator_lib.EstimatorSpec(
+        loss=model_outputs.loss,
+        mode=estimator_lib.ModeKeys.TRAIN,
+        train_op=train_op)
+
+  def _evaluate(features):
+    """Add ops for evaluation (aka filtering) to the graph."""
+    with variable_scope.variable_scope("model"):
+      model_outputs = state_manager.define_loss(model, features,
+                                                estimator_lib.ModeKeys.EVAL)
+    metrics = {}
+    # Just output in-sample predictions for the last chunk seen
+    for prediction_key, prediction_value in model_outputs.predictions.items():
+      metrics[prediction_key] = _identity_metric_single(prediction_key,
+                                                        prediction_value)
+    metrics[feature_keys.FilteringResults.TIMES] = _identity_metric_single(
+        feature_keys.FilteringResults.TIMES, model_outputs.prediction_times)
+    metrics[feature_keys.FilteringResults.STATE_TUPLE] = (
+        _identity_metric_nested(feature_keys.FilteringResults.STATE_TUPLE,
+                                model_outputs.end_state))
+    return estimator_lib.EstimatorSpec(
+        loss=model_outputs.loss,
+        mode=estimator_lib.ModeKeys.EVAL,
+        eval_metric_ops=metrics,
+        predictions={})
+
+  def _predict(features):
+    """Add ops for prediction to the graph."""
+    with variable_scope.variable_scope("model"):
+      prediction = model.predict(features=features)
+    prediction[feature_keys.PredictionResults.TIMES] = features[
+        feature_keys.PredictionFeatures.TIMES]
+    return estimator_lib.EstimatorSpec(
+        predictions=prediction, mode=estimator_lib.ModeKeys.PREDICT)
+
+  def _serving(features):
+    with variable_scope.variable_scope("model"):
+      prediction_outputs = model.predict(features=features)
+    with variable_scope.variable_scope("model", reuse=True):
+      filtering_outputs = state_manager.define_loss(model, features,
+                                                    estimator_lib.ModeKeys.EVAL)
+    return estimator_lib.EstimatorSpec(
+        mode=estimator_lib.ModeKeys.PREDICT,
+        export_outputs={
+            feature_keys.SavedModelLabels.PREDICT:
+                export_lib.PredictOutput(prediction_outputs),
+            feature_keys.SavedModelLabels.FILTER:
+                export_lib.PredictOutput(
+                    state_to_dictionary(filtering_outputs.end_state))
+        },
+        # Likely unused, but it is necessary to return `predictions` to satisfy
+        # the Estimator's error checking.
+        predictions={})
+
+  def _model_fn(features, labels, mode):
+    """Given a time series in `features`, define a loss for `mode`.
+
+    Args:
+      features: A dictionary, the output of a chunker (typically with keys
+          feature_keys.TrainEvalFeatures.TIMES and
+          feature_keys.TrainEvalFeatures.VALUES).
+      labels: Not used; included for compatibility with tf.learn.
+      mode: The tf.estimator.ModeKeys mode to use (TRAIN, EVAL, INFER).
+    Returns:
+      A tuple of predictions, a loss Tensor, and a train op.
+    Raises:
+      ValueError: If the model makes predictions which do not have static shape
+          information.
+    """
+    if labels:
+      raise ValueError("The model received a `labels` dictionary, which is not"
+                       " supported. Pass '{}' and '{}' as features.".format(
+                           feature_keys.TrainEvalFeatures.TIMES,
+                           feature_keys.TrainEvalFeatures.VALUES))
+    del labels
+    features = {name: _convert_feature_to_tensor(name=name, value=value)
+                for name, value in features.items()}
+    if input_statistics_generator is not None:
+      input_statistics = input_statistics_generator.initialize_graph(
+          features, update_statistics=(mode == estimator_lib.ModeKeys.TRAIN))
+    else:
+      input_statistics = None
+    model.initialize_graph(input_statistics=input_statistics)
+    # _gather_state requires the model to have its graph initialized (so it has
+    # access to the structure of the model's state)
+    features, passed_flat_state = _gather_state(features)
+    if (mode == estimator_lib.ModeKeys.TRAIN
+        or mode == estimator_lib.ModeKeys.EVAL):
+      _check_train_eval_features(features, model)
+    elif mode == estimator_lib.ModeKeys.PREDICT:
+      _check_predict_features(features)
+    else:
+      raise ValueError("Unknown mode '{}' passed to model_fn.".format(mode))
+    state_manager.initialize_graph(
+        model=model, input_statistics=input_statistics)
+    if mode == estimator_lib.ModeKeys.TRAIN:
+      return _train(features)
+    elif mode == estimator_lib.ModeKeys.EVAL:
+      return _evaluate(features)
+    elif mode == estimator_lib.ModeKeys.PREDICT and not passed_flat_state:
+      return _predict(features)
+    elif mode == estimator_lib.ModeKeys.PREDICT and passed_flat_state:
+      # The mode is PREDICT, but we're actually in export_savedmodel for
+      # serving. We want to return two graphs: one for filtering (state + data
+      # -> state) and one for predicting (state -> prediction).
+      return _serving(features)
+  return _model_fn
+
+
+# TODO(agarwal): Remove and replace with functionality from tf.slim
+def fully_connected(inp,
+                    inp_size,
+                    layer_size,
+                    name,
+                    activation=nn_ops.relu,
+                    dtype=dtypes.float32):
+  """Helper method to create a fully connected hidden layer."""
+  wt = variable_scope.get_variable(
+      name="{}_weight".format(name), shape=[inp_size, layer_size], dtype=dtype)
+  bias = variable_scope.get_variable(
+      name="{}_bias".format(name),
+      shape=[layer_size],
+      initializer=init_ops.zeros_initializer())
+  output = nn_ops.xw_plus_b(inp, wt, bias)
+  if activation is not None:
+    assert callable(activation)
+    output = activation(output)
+  return output
+
+
+def parameter_switch(parameter_overrides):
+  """Create a function which chooses between overridden and model parameters.
+
+  Args:
+    parameter_overrides: A dictionary with explicit overrides of model
+        parameters, mapping from Tensors to their overridden values.
+  Returns:
+    A function which takes a Tensor and returns the override if it is specified,
+        or otherwise the evaluated value (given current Variable values).
+  """
+  def get_passed_or_trained_value(parameter):
+    return ops.convert_to_tensor(
+        parameter_overrides.get(parameter, parameter)).eval()
+  return get_passed_or_trained_value
+
+
+def canonicalize_times_or_steps_from_output(times, steps,
+                                            previous_model_output):
+  """Canonicalizes either relative or absolute times, with error checking."""
+  if steps is not None and times is not None:
+    raise ValueError("Only one of `steps` and `times` may be specified.")
+  if steps is None and times is None:
+    raise ValueError("One of `steps` and `times` must be specified.")
+  if times is not None:
+    times = numpy.array(times)
+    if len(times.shape) != 2:
+      times = times[None, ...]
+    if (previous_model_output[feature_keys.FilteringResults.TIMES].shape[0] !=
+        times.shape[0]):
+      raise ValueError(
+          ("`times` must have a batch dimension matching"
+           " the previous model output (got a batch dimension of {} for `times`"
+           " and {} for the previous model output).").format(
+               times.shape[0], previous_model_output[
+                   feature_keys.FilteringResults.TIMES].shape[0]))
+    if not (previous_model_output[feature_keys.FilteringResults.TIMES][:, -1] <
+            times[:, 0]).all():
+      raise ValueError("Prediction times must be after the corresponding "
+                       "previous model output.")
+  if steps is not None:
+    predict_times = (
+        previous_model_output[feature_keys.FilteringResults.TIMES][:, -1:] + 1 +
+        numpy.arange(steps)[None, ...])
+  else:
+    predict_times = times
+  return predict_times
diff --git a/tensorflow/contrib/timeseries/python/timeseries/model_utils_test.py b/tensorflow/contrib/timeseries/python/timeseries/model_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..29986895549d16f37c7ff929a30f9a63a56be135
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/model_utils_test.py
@@ -0,0 +1,275 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for model_utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.timeseries.python.timeseries import feature_keys
+from tensorflow.contrib.timeseries.python.timeseries import model
+from tensorflow.contrib.timeseries.python.timeseries import model_utils
+from tensorflow.contrib.timeseries.python.timeseries import state_management
+
+from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import metrics
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import coordinator as coordinator_lib
+from tensorflow.python.training import queue_runner_impl
+from tensorflow.python.training import training as train
+
+
+class ModelUtilsTest(test.TestCase):
+
+  def test_parameter_switching(self):
+    parameter = array_ops.constant(5)
+    overridden_parameter = array_ops.constant(3)
+    with self.test_session():
+      getter = model_utils.parameter_switch({overridden_parameter: 4})
+      self.assertEqual(5, getter(parameter))
+      self.assertEqual(4, getter(overridden_parameter))
+
+  def test_labels_provided_error(self):
+    model_fn = _stub_model_fn()
+    for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL,
+                 estimator_lib.ModeKeys.PREDICT]:
+      with self.assertRaisesRegexp(ValueError, "labels"):
+        model_fn(features={}, labels={"a": "b"}, mode=mode)
+
+  def test_unknown_mode(self):
+    model_fn = _stub_model_fn()
+    with self.assertRaisesRegexp(ValueError, "Unknown mode 'Not a mode'"):
+      model_fn(features={}, labels={}, mode="Not a mode")
+
+
+class _TickerModel(object):
+  num_features = 1
+  dtype = dtypes.float32
+
+  def initialize_graph(self, input_statistics):
+    pass
+
+  def define_loss(self, features, mode):
+    del mode  # unused
+    return model.ModelOutputs(
+        loss=features["ticker"],
+        end_state=(features["ticker"], features["ticker"]),
+        prediction_times=array_ops.zeros(()),
+        predictions={"ticker": features["ticker"]})
+
+
+class EvaluationMetricsTests(test.TestCase):
+
+  def test_metrics_consistent(self):
+    # Tests that the identity metrics used to report in-sample predictions match
+    # the behavior of standard metrics.
+    g = ops.Graph()
+    with g.as_default():
+      features = {
+          feature_keys.TrainEvalFeatures.TIMES:
+              array_ops.zeros((1, 1)),
+          feature_keys.TrainEvalFeatures.VALUES:
+              array_ops.zeros((1, 1, 1)),
+          "ticker":
+              array_ops.reshape(
+                  math_ops.cast(
+                      variables.Variable(
+                          name="ticker",
+                          initial_value=0,
+                          dtype=dtypes.int64,
+                          collections=[ops.GraphKeys.LOCAL_VARIABLES])
+                      .count_up_to(10),
+                      dtype=dtypes.float32), (1, 1, 1))
+      }
+      model_fn = model_utils.make_model_fn(
+          model=_TickerModel(),
+          state_manager=state_management.PassthroughStateManager(),
+          optimizer=train.GradientDescentOptimizer(0.001))
+      outputs = model_fn(
+          features=features, labels=None, mode=estimator_lib.ModeKeys.EVAL)
+      metric_update_ops = [
+          metric[1] for metric in outputs.eval_metric_ops.values()]
+      loss_mean, loss_update = metrics.mean(outputs.loss)
+      metric_update_ops.append(loss_update)
+      with self.test_session() as sess:
+        coordinator = coordinator_lib.Coordinator()
+        queue_runner_impl.start_queue_runners(sess, coord=coordinator)
+        variables.local_variables_initializer().run()
+        sess.run(metric_update_ops)
+        loss_evaled, metric_evaled, nested_metric_evaled = sess.run(
+            (loss_mean, outputs.eval_metric_ops["ticker"][0],
+             outputs.eval_metric_ops[feature_keys.FilteringResults.STATE_TUPLE][
+                 0][0]))
+        # The custom model_utils metrics for in-sample predictions should be in
+        # sync with the Estimator's mean metric for model loss.
+        self.assertAllClose(0., loss_evaled)
+        self.assertAllClose((((0.,),),), metric_evaled)
+        self.assertAllClose((((0.,),),), nested_metric_evaled)
+        coordinator.request_stop()
+        coordinator.join()
+
+
+class _StubModel(object):
+  num_features = 3
+  dtype = dtypes.float64
+
+  def initialize_graph(self, input_statistics):
+    del input_statistics  # unused
+
+
+def _stub_model_fn():
+  return model_utils.make_model_fn(
+      model=_StubModel(),
+      state_manager=state_management.PassthroughStateManager(),
+      optimizer=train.AdamOptimizer(0.001))
+
+
+class TrainEvalFeatureCheckingTests(test.TestCase):
+
+  def test_no_time_feature(self):
+    model_fn = _stub_model_fn()
+    for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]:
+      with self.assertRaisesRegexp(ValueError, "Expected a '{}' feature".format(
+          feature_keys.TrainEvalFeatures.TIMES)):
+        model_fn(
+            features={feature_keys.TrainEvalFeatures.VALUES: [[[1.]]]},
+            labels=None,
+            mode=mode)
+
+  def test_no_value_feature(self):
+    model_fn = _stub_model_fn()
+    for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]:
+      with self.assertRaisesRegexp(ValueError, "Expected a '{}' feature".format(
+          feature_keys.TrainEvalFeatures.VALUES)):
+        model_fn(
+            features={feature_keys.TrainEvalFeatures.TIMES: [[1]]},
+            labels=None,
+            mode=mode)
+
+  def test_bad_time_rank(self):
+    model_fn = _stub_model_fn()
+    for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]:
+      with self.assertRaisesRegexp(ValueError,
+                                   "Expected shape.*for feature '{}'".format(
+                                       feature_keys.TrainEvalFeatures.TIMES)):
+        model_fn(
+            features={
+                feature_keys.TrainEvalFeatures.TIMES: [[[1]]],
+                feature_keys.TrainEvalFeatures.VALUES: [[[1.]]]
+            },
+            labels=None,
+            mode=mode)
+
+  def test_bad_value_rank(self):
+    model_fn = _stub_model_fn()
+    for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]:
+      with self.assertRaisesRegexp(ValueError,
+                                   "Expected shape.*for feature '{}'".format(
+                                       feature_keys.TrainEvalFeatures.VALUES)):
+        model_fn(
+            features={
+                feature_keys.TrainEvalFeatures.TIMES: [[1]],
+                feature_keys.TrainEvalFeatures.VALUES: [[1.]]
+            },
+            labels=None,
+            mode=mode)
+
+  def test_bad_value_num_features(self):
+    model_fn = _stub_model_fn()
+    for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]:
+      with self.assertRaisesRegexp(
+          ValueError, "Expected shape.*, 3.*for feature '{}'".format(
+              feature_keys.TrainEvalFeatures.VALUES)):
+        model_fn(
+            features={
+                feature_keys.TrainEvalFeatures.TIMES: [[1]],
+                feature_keys.TrainEvalFeatures.VALUES: [[[1.]]]
+            },
+            labels=None,
+            mode=mode)
+
+  def test_bad_exogenous_shape(self):
+    model_fn = _stub_model_fn()
+    for mode in [estimator_lib.ModeKeys.TRAIN, estimator_lib.ModeKeys.EVAL]:
+      with self.assertRaisesRegexp(
+          ValueError,
+          "Features must have shape.*for feature 'exogenous'"):
+        model_fn(
+            features={
+                feature_keys.TrainEvalFeatures.TIMES: [[1]],
+                feature_keys.TrainEvalFeatures.VALUES: [[[1., 2., 3.]]],
+                "exogenous": [[1], [2]]
+            },
+            labels=None,
+            mode=mode)
+
+
+class PredictFeatureCheckingTests(test.TestCase):
+
+  def test_no_time_feature(self):
+    model_fn = _stub_model_fn()
+    with self.assertRaisesRegexp(ValueError, "Expected a '{}' feature".format(
+        feature_keys.PredictionFeatures.TIMES)):
+      model_fn(
+          features={
+              feature_keys.PredictionFeatures.STATE_TUPLE: ([[[1.]]], 1.)
+          },
+          labels=None,
+          mode=estimator_lib.ModeKeys.PREDICT)
+
+  def test_no_start_state_feature(self):
+    model_fn = _stub_model_fn()
+    with self.assertRaisesRegexp(ValueError, "Expected a '{}' feature".format(
+        feature_keys.PredictionFeatures.STATE_TUPLE)):
+      model_fn(
+          features={feature_keys.PredictionFeatures.TIMES: [[1]]},
+          labels=None,
+          mode=estimator_lib.ModeKeys.PREDICT)
+
+  def test_bad_time_rank(self):
+    model_fn = _stub_model_fn()
+    with self.assertRaisesRegexp(ValueError,
+                                 "Expected shape.*for feature '{}'".format(
+                                     feature_keys.PredictionFeatures.TIMES)):
+      model_fn(
+          features={
+              feature_keys.PredictionFeatures.TIMES: 1,
+              feature_keys.PredictionFeatures.STATE_TUPLE: (1, (2, 3.))
+          },
+          labels=None,
+          mode=estimator_lib.ModeKeys.PREDICT)
+
+  def test_bad_exogenous_shape(self):
+    model_fn = _stub_model_fn()
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Features must have shape.*for feature 'exogenous'"):
+      model_fn(
+          features={
+              feature_keys.PredictionFeatures.TIMES: [[1]],
+              feature_keys.PredictionFeatures.STATE_TUPLE: (1, (2, 3.)),
+              "exogenous": 1.
+          },
+          labels=None,
+          mode=estimator_lib.ModeKeys.PREDICT)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/saved_model_utils.py b/tensorflow/contrib/timeseries/python/timeseries/saved_model_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..16e29f5e68e4c7c0bbb0b5cd0c547ac57e2faa9f
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/saved_model_utils.py
@@ -0,0 +1,167 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Convenience functions for working with time series saved_models.
+
+@@predict_continuation
+@@filter_continuation
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.timeseries.python.timeseries import feature_keys as _feature_keys
+from tensorflow.contrib.timeseries.python.timeseries import input_pipeline as _input_pipeline
+from tensorflow.contrib.timeseries.python.timeseries import model_utils as _model_utils
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+
+def _colate_features_to_feeds_and_fetches(continue_from, signature, features,
+                                          graph):
+  """Uses a saved model signature to construct feed and fetch dictionaries."""
+  if _feature_keys.FilteringResults.STATE_TUPLE in continue_from:
+    # We're continuing from an evaluation, so we need to unpack/flatten state.
+    state_values = _model_utils.state_to_dictionary(
+        continue_from[_feature_keys.FilteringResults.STATE_TUPLE])
+  else:
+    state_values = continue_from
+  input_feed_tensors_by_name = {
+      input_key: graph.as_graph_element(input_value.name)
+      for input_key, input_value in signature.inputs.items()
+  }
+  output_tensors_by_name = {
+      output_key: graph.as_graph_element(output_value.name)
+      for output_key, output_value in signature.outputs.items()
+  }
+  feed_dict = {}
+  for state_key, state_value in state_values.items():
+    feed_dict[input_feed_tensors_by_name[state_key]] = state_value
+  for feature_key, feature_value in features.items():
+    feed_dict[input_feed_tensors_by_name[feature_key]] = feature_value
+  return output_tensors_by_name, feed_dict
+
+
+def predict_continuation(continue_from,
+                         signatures,
+                         session,
+                         steps=None,
+                         times=None,
+                         exogenous_features=None):
+  """Perform prediction using an exported saved model.
+
+  Analogous to _input_pipeline.predict_continuation_input_fn, but operates on a
+  saved model rather than feeding into Estimator's predict method.
+
+  Args:
+    continue_from: A dictionary containing the results of either an Estimator's
+      evaluate method or filter_continuation. Used to determine the model
+      state to make predictions starting from.
+    signatures: The `MetaGraphDef` protocol buffer returned from
+      `tf.saved_model.loader.load`. Used to determine the names of Tensors to
+      feed and fetch. Must be from the same model as `continue_from`.
+    session: The session to use. The session's graph must be the one into which
+      `tf.saved_model.loader.load` loaded the model.
+    steps: The number of steps to predict (scalar), starting after the
+      evaluation or filtering. If `times` is specified, `steps` must not be; one
+      is required.
+    times: A [batch_size x window_size] array of integers (not a Tensor)
+      indicating times to make predictions for. These times must be after the
+      corresponding evaluation or filtering. If `steps` is specified, `times`
+      must not be; one is required. If the batch dimension is omitted, it is
+      assumed to be 1.
+    exogenous_features: Optional dictionary. If specified, indicates exogenous
+      features for the model to use while making the predictions. Values must
+      have shape [batch_size x window_size x ...], where `batch_size` matches
+      the batch dimension used when creating `continue_from`, and `window_size`
+      is either the `steps` argument or the `window_size` of the `times`
+      argument (depending on which was specified).
+  Returns:
+    A dictionary with model-specific predictions (typically having keys "mean"
+    and "covariance") and a feature_keys.PredictionResults.TIMES key indicating
+    the times for which the predictions were computed.
+  Raises:
+    ValueError: If `times` or `steps` are misspecified.
+  """
+  if exogenous_features is None:
+    exogenous_features = {}
+  predict_times = _model_utils.canonicalize_times_or_steps_from_output(
+      times=times, steps=steps, previous_model_output=continue_from)
+  features = {_feature_keys.PredictionFeatures.TIMES: predict_times}
+  features.update(exogenous_features)
+  predict_signature = signatures.signature_def[
+      _feature_keys.SavedModelLabels.PREDICT]
+  output_tensors_by_name, feed_dict = _colate_features_to_feeds_and_fetches(
+      continue_from=continue_from,
+      signature=predict_signature,
+      features=features,
+      graph=session.graph)
+  output = session.run(output_tensors_by_name, feed_dict=feed_dict)
+  output[_feature_keys.PredictionResults.TIMES] = features[
+      _feature_keys.PredictionFeatures.TIMES]
+  return output
+
+
+def filter_continuation(continue_from, signatures, session, features):
+  """Perform filtering using an exported saved model.
+
+  Filtering refers to updating model state based on new observations.
+  Predictions based on the returned model state will be conditioned on these
+  observations.
+
+  Args:
+    continue_from: A dictionary containing the results of either an Estimator's
+      evaluate method or a previous filter_continuation. Used to determine the
+      model state to start filtering from.
+    signatures: The `MetaGraphDef` protocol buffer returned from
+      `tf.saved_model.loader.load`. Used to determine the names of Tensors to
+      feed and fetch. Must be from the same model as `continue_from`.
+    session: The session to use. The session's graph must be the one into which
+      `tf.saved_model.loader.load` loaded the model.
+    features: A dictionary mapping keys to Numpy arrays, with several possible
+      shapes (requires keys `FilteringFeatures.TIMES` and
+      `FilteringFeatures.VALUES`):
+        Single example; `TIMES` is a scalar and `VALUES` is either a scalar or a
+          vector of length [number of features].
+        Sequence; `TIMES` is a vector of shape [series length], `VALUES` either
+          has shape [series length] (univariate) or [series length x number of
+          features] (multivariate).
+        Batch of sequences; `TIMES` is a vector of shape [batch size x series
+          length], `VALUES` has shape [batch size x series length] or [batch
+          size x series length x number of features].
+      In any case, `VALUES` and any exogenous features must have their shapes
+      prefixed by the shape of the value corresponding to the `TIMES` key.
+  Returns:
+    A dictionary containing model state updated to account for the observations
+    in `features`.
+  """
+  filter_signature = signatures.signature_def[
+      _feature_keys.SavedModelLabels.FILTER]
+  features = _input_pipeline._canonicalize_numpy_data(  # pylint: disable=protected-access
+      data=features,
+      require_single_batch=False)
+  output_tensors_by_name, feed_dict = _colate_features_to_feeds_and_fetches(
+      continue_from=continue_from,
+      signature=filter_signature,
+      features=features,
+      graph=session.graph)
+  output = session.run(output_tensors_by_name, feed_dict=feed_dict)
+  # Make it easier to chain filter -> predict by keeping track of the current
+  # time.
+  output[_feature_keys.FilteringResults.TIMES] = features[
+      _feature_keys.FilteringFeatures.TIMES]
+  return output
+
+remove_undocumented(module_name=__name__)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_management.py b/tensorflow/contrib/timeseries/python/timeseries/state_management.py
new file mode 100644
index 0000000000000000000000000000000000000000..13eecd4d822faaeb9553c7723c6842cdcb38aa3f
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_management.py
@@ -0,0 +1,264 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Classes for wrapping a model to operate on different data shapes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+from tensorflow.contrib.timeseries.python.timeseries import feature_keys
+from tensorflow.contrib.timeseries.python.timeseries import math_utils
+from tensorflow.contrib.timeseries.python.timeseries.model import ModelOutputs
+
+from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.util import nest
+
+
+class PassthroughStateManager(object):
+  """A minimal wrapper for models which do not need state management."""
+
+  def __init__(self):
+    self._input_statistics = None
+    self._graph_initialized = False
+
+  def initialize_graph(self, model, input_statistics=None):
+    """Adds required operations to the graph."""
+    del model  # unused
+    self._graph_initialized = True
+    self._input_statistics = input_statistics
+
+  def define_loss(self, model, features, mode):
+    """Wrap "model" with StateManager-specific operations.
+
+    Args:
+      model: The model (inheriting from TimeSeriesModel) to manage state for.
+      features: A dictionary with the following key/value pairs:
+        feature_keys.TrainEvalFeatures.TIMES: A [batch size x window size]
+            Tensor with times for each observation.
+        feature_keys.TrainEvalFeatures.VALUES: A [batch size x window size x num
+            features] Tensor with values for each observation.
+      mode: The tf.estimator.ModeKeys mode to use (TRAIN or EVAL).
+    Returns:
+      A ModelOutputs object.
+    Raises:
+      ValueError: If start state was specified.
+    """
+    if feature_keys.State.STATE_TUPLE in features:
+      raise ValueError(
+          "Overriding start state is not supported for this model.")
+    return model.define_loss(features, mode)
+
+
+class _OverridableStateManager(PassthroughStateManager):
+  """Base class for state managers which support overriding model state."""
+
+  @abc.abstractmethod
+  def _define_loss_with_saved_state(self, model, features, mode):
+    pass
+
+  def define_loss(self, model, features, mode):
+    """Switches between explicit start state and managed state."""
+    if feature_keys.FilteringFeatures.STATE_TUPLE in features:
+      # Explicit start state has been provided, so we should use that.
+      if mode == estimator_lib.ModeKeys.TRAIN:
+        raise ValueError(
+            "Overriding saved state for training is not supported (but a value "
+            "for feature {} was specified).".format(
+                feature_keys.FilteringFeatures.STATE_TUPLE))
+      start_state = features[feature_keys.FilteringFeatures.STATE_TUPLE]
+      del features[feature_keys.FilteringFeatures.STATE_TUPLE]
+      return model.get_batch_loss(
+          features=features, mode=mode, state=start_state)
+    else:
+      # No explicit start state; use managed state.
+      return self._define_loss_with_saved_state(
+          model=model, features=features, mode=mode)
+
+
+class FilteringOnlyStateManager(_OverridableStateManager):
+  """State manager for models which use state only for filtering.
+
+  Window-based models (ARModel) do not require state to be fed during training
+  (instead requiring a specific window size). Rather than requiring a minimum
+  window size for filtering, these models maintain this window in their state,
+  and so need state to be fed.
+  """
+
+  def _define_loss_with_saved_state(self, model, features, mode):
+    return model.define_loss(features, mode)
+
+
+class ChainingStateManager(_OverridableStateManager):
+  """Maintains state across a batch for SequentialTimeSeriesModel subclasses.
+
+  The batch dimension is treated as indexing sequential chunks of the same
+  timeseries. End state from each chunk is fed as start state to the next chunk
+  during the next timestep. This is an approximation to full-batch training for
+  sequential models, but is typically much faster while still accurately
+  recovering parameters. The speedup comes from reduced scheduling overhead of
+  TensorFlow ops, since each operation can do much more work.
+  """
+
+  def __init__(self, state_saving_interval=20, checkpoint_state=False):
+    """Initialize the state manager.
+
+    Args:
+      state_saving_interval: This state manager saves intermediate model state
+          every `state_saving_interval` times. Larger values save memory, and
+          checkpoint size if `checkpoint_state` is enabled, but models
+          will need to impute across artificial gaps of up to this size
+          (i.e. gaps not appearing in the original data). This imputation may
+          affect training. Set state_saving_interval to 1 to avoid any
+          artificial imputation.
+      checkpoint_state: If True, saved intermediate model state will be
+          written to checkpoints. Checkpoints will then scale with dataset
+          size. If False, state will be freshly imputed from the beginning of a
+          series each time the model is restored, which means it may take a few
+          iterations for state to warm up.
+    """
+    super(ChainingStateManager, self).__init__()
+    self._checkpoint_state = checkpoint_state
+    self._state_saving_interval = state_saving_interval
+    self._start_state = None
+    self._cached_states = None
+
+  def initialize_graph(self, model, input_statistics=None):
+    """Adds required operations to the graph."""
+    super(ChainingStateManager, self).initialize_graph(
+        model=model, input_statistics=input_statistics)
+    self._start_state = model.get_start_state()
+    self._cached_states = math_utils.TupleOfTensorsLookup(
+        key_dtype=dtypes.int64,
+        default_values=self._start_state,
+        empty_key=-1,
+        name="cached_states",
+        checkpoint=self._checkpoint_state)
+
+  def _define_loss_with_saved_state(self, model, features, mode):
+    """Feeds end state from one training iteration into the next.
+
+    Args:
+      model: The model to wrap. Compatible with children of TimeSeriesModel.
+      features: Dictionary with Tensor values defining the data to be
+        processed. The expected key/value pairs are at minimum:
+          feature_keys.TrainEvalFeatures.TIMES: A [number of chunks x window
+            size] Tensor with times for each observation, the result of chunking
+            a single longer time series.
+          feature_keys.TrainEvalFeatures.VALUES: A [number of chunks x window
+            size x num features] Tensor with values for each observation,
+            corresponding to times.
+      mode: The tf.estimator.ModeKeys mode to use. For EVAL and INFER, no
+          batching is performed, which may be slow. This is to avoid giving
+          cached and almost certainly stale values.
+    Returns:
+      A ModelOutputs object.
+    Raises:
+      ValueError: If initialize_graph has not been called.
+    """
+    if not self._graph_initialized:
+      raise ValueError("ChainingStateManager requires initialize_graph() to be "
+                       "called before use.")
+    (loss_op, end_state, batch_predictions) = self._update_cached_states(
+        model=model,
+        features=features,
+        mode=mode)
+    # Add a batch dimension so state can be used directly (e.g. for predictions)
+    # without the user manually reshaping it.
+    last_end_state_flat = [end_state_value[-1][None]
+                           for end_state_value in nest.flatten(end_state)]
+    batch_predictions["observed"] = features[
+        feature_keys.TrainEvalFeatures.VALUES]
+    return ModelOutputs(
+        loss=loss_op,
+        end_state=nest.pack_sequence_as(end_state, last_end_state_flat),
+        predictions=batch_predictions,
+        prediction_times=features[feature_keys.TrainEvalFeatures.TIMES])
+
+  def _get_chunk_number(self, time):
+    return time // self._state_saving_interval
+
+  def _get_cached_states(self, times):
+    """Retrieve cached states for a batch of times."""
+    read_chunk_numbers = self._get_chunk_number(times)
+    looked_up_state = list(self._cached_states.lookup(
+        math_ops.cast(read_chunk_numbers, dtypes.int64)))
+    looked_up_state = tuple(looked_up_state)
+    # We need to special-case the first chunk in a series to explicitly rely on
+    # the model's starting state so that gradients flow back to it. Otherwise it
+    # would affect only initialization, and would not be read from or updated
+    # during training. Not doing this also isolates that part of the graph,
+    # leading to errors on model reload if there are trainable variables
+    # affecting a model's start state.
+    if self._input_statistics is not None:
+      start_time = self._input_statistics.start_time
+    else:
+      start_time = 0
+    set_to_start_state = math_ops.equal(read_chunk_numbers,
+                                        self._get_chunk_number(start_time))
+    new_states = []
+    for start_state_value, cache_variable in zip(
+        nest.flatten(
+            math_utils.replicate_state(self._start_state,
+                                       array_ops.shape(times)[0])),
+        nest.flatten(looked_up_state)):
+
+      new_states.append(
+          array_ops.where(set_to_start_state, start_state_value,
+                          cache_variable))
+    looked_up_state = nest.pack_sequence_as(looked_up_state, new_states)
+    return looked_up_state
+
+  def _update_cached_states(self, model, features, mode):
+    """Read, process, and write chunks to the cache."""
+    times = features[feature_keys.TrainEvalFeatures.TIMES]
+    looked_up_state = self._get_cached_states(times[:, 0])
+    (model_loss, intermediate_states,
+     batch_predictions) = model.per_step_batch_loss(
+         features=features,
+         mode=mode,
+         state=looked_up_state)
+    # We need to at least write to the bucket after the one we read from.
+    min_chunk_numbers = self._get_chunk_number(times) + 1
+    # We write to the bucket that would have been read had the window started at
+    # the next sample (except for the last sample in the window, which gets
+    # written to the next bucket). This assumes fixed missing times (i.e. if we
+    # were presented with times [10, 50] we will never see times [30, 50]).
+    #
+    # TODO(allenl): Retrieve the highest time less than the current time rather
+    # than relying on fixed bucketing.
+    write_chunk_numbers = math_ops.maximum(
+        self._get_chunk_number(array_ops.concat(
+            [times[:, 1:], times[:, -1:] + 1], axis=1)),
+        min_chunk_numbers)
+    # Write once for every computed state; this may mean that we write multiple
+    # times to the same cell, but later writes will take precedence.
+    save_ops = [
+        self._cached_states.insert(
+            keys=write_chunk_numbers,
+            values=intermediate_states)]
+    end_state = nest.pack_sequence_as(
+        intermediate_states,
+        [state_element[:, -1]
+         for state_element in nest.flatten(intermediate_states)])
+    with ops.control_dependencies(save_ops):
+      # Make sure end states get saved at each iteration
+      loss_op = array_ops.identity(model_loss)
+    return loss_op, end_state, batch_predictions
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5dce30fda0353bd70f44ec567ac91acce1e9394
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py
@@ -0,0 +1,313 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for state management."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy
+
+from tensorflow.contrib.timeseries.python.timeseries import feature_keys
+from tensorflow.contrib.timeseries.python.timeseries import input_pipeline
+from tensorflow.contrib.timeseries.python.timeseries import math_utils
+from tensorflow.contrib.timeseries.python.timeseries import model
+from tensorflow.contrib.timeseries.python.timeseries import state_management
+from tensorflow.contrib.timeseries.python.timeseries import test_utils
+
+from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import coordinator as coordinator_lib
+from tensorflow.python.training import queue_runner_impl
+from tensorflow.python.training import training as train
+from tensorflow.python.util import nest
+
+
+class StubTimeSeriesModel(model.TimeSeriesModel):
+
+  def __init__(self, correct_offset=False):
+    self._correct_offset = correct_offset
+    super(StubTimeSeriesModel, self).__init__(1)
+
+  def initialize_graph(self, input_statistics=None):
+    super(StubTimeSeriesModel, self).initialize_graph(
+        input_statistics=input_statistics)
+    self.prior_var = variable_scope.get_variable(
+        "prior", [], initializer=init_ops.constant_initializer(0.))
+
+  def generate(self, *args):
+    pass
+
+  def predict(self, *args):
+    pass
+
+  def get_start_state(self):
+    return (array_ops.zeros([], dtype=dtypes.int64), self.prior_var)
+
+  def get_batch_loss(self, features, mode, state):
+    raise NotImplementedError("This stub only supports managed state.")
+
+  def per_step_batch_loss(self, features, mode, state):
+    times = features[feature_keys.TrainEvalFeatures.TIMES]
+    values = features[feature_keys.TrainEvalFeatures.VALUES]
+    (priors_from_time, prior) = state
+    time_corrected_priors = prior + math_ops.cast(
+        math_utils.batch_start_time(times) - priors_from_time, dtypes.float32)
+    posterior = time_corrected_priors[:, None] + math_ops.cast(
+        times - math_utils.batch_start_time(times)[:, None], dtypes.float32)
+    batch_end_values = array_ops.squeeze(
+        array_ops.slice(values, [0, array_ops.shape(times)[1] - 1, 0],
+                        [-1, 1, -1]),
+        squeeze_dims=[1, 2])
+    # A pretty odd but easy to think about loss: L1 loss on the batch end
+    # values.
+    loss = math_ops.reduce_sum(
+        math_ops.abs(
+            array_ops.reshape(posterior[:, -1], [-1]) - batch_end_values))
+    if self._correct_offset:
+      posterior += batch_end_values[0] - posterior[0, -1]
+    posteriors = (times, posterior)
+    return loss, posteriors, {"dummy_predictions": array_ops.zeros_like(values)}
+
+
+class ChainingStateManagerTest(test.TestCase):
+
+  def _make_test_data(self, length, cut_start, cut_end, offset, step=1):
+    times_full = step * numpy.arange(length, dtype=numpy.int64)
+    values_full = offset + step * numpy.arange(length, dtype=numpy.float32)
+    if cut_start is not None:
+      times = numpy.concatenate((times_full[:cut_start],
+                                 times_full[cut_end:]))
+      values = numpy.concatenate((values_full[:cut_start],
+                                  values_full[cut_end:]))
+    else:
+      times = times_full
+      values = values_full
+    return {
+        feature_keys.TrainEvalFeatures.TIMES: times,
+        feature_keys.TrainEvalFeatures.VALUES: values
+    }
+
+  def _test_initialization(self, warmup_iterations, batch_size):
+    stub_model = StubTimeSeriesModel()
+    data = self._make_test_data(length=20, cut_start=None, cut_end=None,
+                                offset=0.)
+    if batch_size == -1:
+      input_fn = test_utils.AllWindowInputFn(
+          input_pipeline.NumpyReader(data), window_size=10)
+    else:
+      input_fn = input_pipeline.RandomWindowInputFn(
+          input_pipeline.NumpyReader(data),
+          window_size=10,
+          batch_size=batch_size)
+    chainer = state_management.ChainingStateManager(
+        state_saving_interval=1)
+    features, _ = input_fn()
+    stub_model.initialize_graph()
+    chainer.initialize_graph(model=stub_model)
+    model_outputs = chainer.define_loss(
+        model=stub_model, features=features, mode=estimator_lib.ModeKeys.TRAIN)
+    with self.test_session() as session:
+      variables.global_variables_initializer().run()
+      coordinator = coordinator_lib.Coordinator()
+      queue_runner_impl.start_queue_runners(session, coord=coordinator)
+      for _ in range(warmup_iterations):
+        # Warm up saved state
+        model_outputs.loss.eval()
+      outputs = model_outputs.loss.eval()
+      coordinator.request_stop()
+      coordinator.join()
+      return outputs
+
+  def test_zero_initializations(self):
+    # Even with no initialization, we are imputing values up to each chunk,
+    # which in this case gives exact values.
+    self.assertEqual(0., self._test_initialization(
+        warmup_iterations=0, batch_size=-1))
+
+  def test_one_initializations(self):
+    # Further initialization should still be correct, if redundant
+    self.assertEqual(0., self._test_initialization(
+        warmup_iterations=1, batch_size=-1))
+
+  def test_stochastic_batch(self):
+    # It shouldn't matter whether we're using a full deterministic batch or a
+    # smaller stochastic batch.
+    self.assertEqual(0., self._test_initialization(
+        warmup_iterations=1, batch_size=5))
+
+  def _test_pass_to_next(self, read_offset, step, correct_offset):
+    stub_model = StubTimeSeriesModel(correct_offset=correct_offset)
+    data = self._make_test_data(
+        length=100 + read_offset, cut_start=None, cut_end=None, offset=100.,
+        step=step)
+    init_input_fn = input_pipeline.WholeDatasetInputFn(
+        input_pipeline.NumpyReader(
+            {k: v[:-read_offset] for k, v in data.items()}))
+    result_input_fn = input_pipeline.WholeDatasetInputFn(
+        input_pipeline.NumpyReader(
+            {k: v[read_offset:] for k, v in data.items()}))
+
+    chainer = state_management.ChainingStateManager(
+        state_saving_interval=1)
+    stub_model.initialize_graph()
+    chainer.initialize_graph(model=stub_model)
+    init_model_outputs = chainer.define_loss(
+        model=stub_model, features=init_input_fn()[0],
+        mode=estimator_lib.ModeKeys.TRAIN)
+    result_model_outputs = chainer.define_loss(
+        model=stub_model, features=result_input_fn()[0],
+        mode=estimator_lib.ModeKeys.TRAIN)
+    with self.test_session() as session:
+      variables.global_variables_initializer().run()
+      coordinator = coordinator_lib.Coordinator()
+      queue_runner_impl.start_queue_runners(session, coord=coordinator)
+      init_model_outputs.loss.eval()
+      returned_loss = result_model_outputs.loss.eval()
+      coordinator.request_stop()
+      coordinator.join()
+      return returned_loss
+
+  def test_pass_to_next_step_one_no_correction(self):
+    self.assertEqual(100., self._test_pass_to_next(
+        read_offset=1, step=1, correct_offset=False))
+
+  def test_pass_to_next_step_one_with_correction(self):
+    self.assertEqual(0., self._test_pass_to_next(
+        read_offset=1, step=1, correct_offset=True))
+
+  def test_pass_to_next_step_three_with_correction(self):
+    self.assertEqual(0., self._test_pass_to_next(
+        read_offset=1, step=3, correct_offset=True))
+
+  def test_large_read_offset(self):
+    self.assertEqual(0., self._test_pass_to_next(
+        read_offset=50, step=20, correct_offset=True))
+
+  def test_past_init_offset(self):
+    self.assertEqual(100., self._test_pass_to_next(
+        read_offset=100, step=20, correct_offset=True))
+
+  def _test_missing_values(self, cut_start, cut_end, offset):
+    stub_model = StubTimeSeriesModel()
+    data = self._make_test_data(
+        length=100, cut_start=cut_start, cut_end=cut_end, offset=offset)
+    input_fn = test_utils.AllWindowInputFn(
+        input_pipeline.NumpyReader(data), window_size=10)
+    chainer = state_management.ChainingStateManager(
+        state_saving_interval=1)
+    features, _ = input_fn()
+    stub_model.initialize_graph()
+    chainer.initialize_graph(model=stub_model)
+    model_outputs = chainer.define_loss(
+        model=stub_model, features=features, mode=estimator_lib.ModeKeys.TRAIN)
+    with self.test_session() as session:
+      variables.global_variables_initializer().run()
+      coordinator = coordinator_lib.Coordinator()
+      queue_runner_impl.start_queue_runners(session, coord=coordinator)
+      for _ in range(10):
+        model_outputs.loss.eval()
+      returned_loss = model_outputs.loss.eval()
+      coordinator.request_stop()
+      coordinator.join()
+      return returned_loss
+
+  def test_missing_values_ten(self):
+    # Each posterior should be off by 10 from the offset in the values. 90
+    # values with a chunk size of 10 means 90 - 10 + 1 possible chunks.
+    self.assertEqual((90 - 10 + 1) * 10, self._test_missing_values(
+        cut_start=20, cut_end=30, offset=10.))
+
+  def test_missing_values_five(self):
+    self.assertEqual((95 - 10 + 1) * 10, self._test_missing_values(
+        cut_start=15, cut_end=20, offset=10.))
+
+
+class _StateOverrideModel(model.TimeSeriesModel):
+
+  def __init__(self):
+    super(_StateOverrideModel, self).__init__(num_features=1)
+
+  def generate(self, *args):
+    pass
+
+  def predict(self, *args):
+    pass
+
+  def get_start_state(self):
+    return (constant_op.constant([20, 30, 40], dtype=dtypes.int64),
+            (constant_op.constant(-10, dtype=dtypes.int64),
+             constant_op.constant([30., 50.], dtype=dtypes.float64)))
+
+  def get_batch_loss(self, features, mode, state):
+    per_observation_loss, state, outputs = self.per_step_batch_loss(
+        features, mode, state)
+    state = nest.map_structure(lambda element: element[:, -1], state)
+    outputs["observed"] = features[feature_keys.TrainEvalFeatures.VALUES]
+    return model.ModelOutputs(
+        loss=per_observation_loss,
+        end_state=state,
+        predictions=outputs,
+        prediction_times=features[feature_keys.TrainEvalFeatures.TIMES])
+
+  def per_step_batch_loss(self, features, mode, state):
+    return (
+        constant_op.constant(1.),
+        # Assumes only one step: this is the per-step loss.
+        nest.map_structure(
+            lambda element: ops.convert_to_tensor(element)[:, None], state),
+        {
+            "dummy_predictions":
+                array_ops.zeros_like(
+                    features[feature_keys.TrainEvalFeatures.VALUES])
+        })
+
+
+class _StateOverrideTest(test.TestCase):
+
+  def test_state_override(self):
+    test_start_state = (numpy.array([[2, 3, 4]]), (numpy.array([2]),
+                                                   numpy.array([[3., 5.]])))
+    data = {
+        feature_keys.FilteringFeatures.TIMES: numpy.arange(5),
+        feature_keys.FilteringFeatures.VALUES: numpy.zeros(shape=[5, 3])
+    }
+    features, _ = input_pipeline.WholeDatasetInputFn(
+        input_pipeline.NumpyReader(data))()
+    features[feature_keys.FilteringFeatures.STATE_TUPLE] = test_start_state
+    stub_model = _StateOverrideModel()
+    chainer = state_management.ChainingStateManager()
+    stub_model.initialize_graph()
+    chainer.initialize_graph(model=stub_model)
+    model_outputs = chainer.define_loss(
+        model=stub_model, features=features, mode=estimator_lib.ModeKeys.EVAL)
+    with train.MonitoredSession() as session:
+      end_state = session.run(model_outputs.end_state)
+    nest.assert_same_structure(test_start_state, end_state)
+    for expected, received in zip(
+        nest.flatten(test_start_state), nest.flatten(end_state)):
+      self.assertAllEqual(expected, received)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..d0deedc50f8b7953394ab2354fae9133b523d97b
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
@@ -0,0 +1,279 @@
+# State space components and ensembles
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "state_space_model",
+    srcs = ["state_space_model.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":kalman_filter",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/timeseries/python/timeseries:feature_keys",
+        "//tensorflow/contrib/timeseries/python/timeseries:math_utils",
+        "//tensorflow/contrib/timeseries/python/timeseries:model",
+        "//tensorflow/contrib/timeseries/python/timeseries:model_utils",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/estimator:estimator_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "state_space_model_test",
+    timeout = "long",  # Moderate but for asan
+    srcs = ["state_space_model_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":state_space_model",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/timeseries/python/timeseries:estimators",
+        "//tensorflow/contrib/timeseries/python/timeseries:feature_keys",
+        "//tensorflow/contrib/timeseries/python/timeseries:input_pipeline",
+        "//tensorflow/contrib/timeseries/python/timeseries:math_utils",
+        "//tensorflow/contrib/timeseries/python/timeseries:saved_model_utils",
+        "//tensorflow/contrib/timeseries/python/timeseries:state_management",
+        "//tensorflow/contrib/timeseries/python/timeseries:test_utils",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/saved_model:loader",
+        "//tensorflow/python/saved_model:tag_constants",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "kalman_filter",
+    srcs = ["kalman_filter.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/distributions:distributions_py",
+        "//tensorflow/contrib/timeseries/python/timeseries:math_utils",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:numerics",
+    ],
+)
+
+tf_py_test(
+    name = "kalman_filter_test",
+    srcs = ["kalman_filter_test.py"],
+    additional_deps = [
+        ":kalman_filter",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+py_library(
+    name = "level_trend",
+    srcs = ["level_trend.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":state_space_model",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:variable_scope",
+    ],
+)
+
+tf_py_test(
+    name = "level_trend_test",
+    srcs = ["level_trend_test.py"],
+    additional_deps = [
+        ":level_trend",
+        ":test_utils",
+        ":state_space_model",
+        "//tensorflow/contrib/timeseries/python/timeseries:test_utils",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+    ],
+)
+
+py_library(
+    name = "periodic",
+    srcs = ["periodic.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":state_space_model",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+tf_py_test(
+    name = "periodic_test",
+    srcs = ["periodic_test.py"],
+    additional_deps = [
+        ":periodic",
+        ":test_utils",
+        ":state_space_model",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+    ],
+)
+
+py_library(
+    name = "structural_ensemble",
+    srcs = ["structural_ensemble.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":level_trend",
+        ":periodic",
+        ":state_space_model",
+        ":varma",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+    ],
+)
+
+py_test(
+    name = "structural_ensemble_test",
+    timeout = "long",  # Moderate but for asan/tsan timeouts
+    srcs = ["structural_ensemble_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":state_space_model",
+        ":structural_ensemble",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/timeseries/python/timeseries:estimators",
+        "//tensorflow/contrib/timeseries/python/timeseries:feature_keys",
+        "//tensorflow/contrib/timeseries/python/timeseries:input_pipeline",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python/estimator:estimator_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "varma",
+    srcs = ["varma.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":state_space_model",
+        "//tensorflow/contrib/timeseries/python/timeseries:math_utils",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:variable_scope",
+    ],
+)
+
+tf_py_test(
+    name = "varma_test",
+    srcs = ["varma_test.py"],
+    additional_deps = [
+        ":varma",
+        "//tensorflow/contrib/timeseries/python/timeseries:feature_keys",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:estimator_py",
+    ],
+)
+
+py_library(
+    name = "filtering_postprocessor",
+    srcs = ["filtering_postprocessor.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/distributions:distributions_py",
+        "//tensorflow/contrib/timeseries/python/timeseries:math_utils",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:util",
+    ],
+)
+
+tf_py_test(
+    name = "filtering_postprocessor_test",
+    srcs = ["filtering_postprocessor_test.py"],
+    additional_deps = [
+        ":filtering_postprocessor",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+    ],
+)
+
+py_library(
+    name = "test_utils",
+    srcs = ["test_utils.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/contrib/timeseries/python/timeseries:math_utils",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/tensorboard/__init__.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/__init__.py
similarity index 100%
rename from tensorflow/tensorboard/__init__.py
rename to tensorflow/contrib/timeseries/python/timeseries/state_space_models/__init__.py
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fa538a16ecd7dcf39beeb001992fd7927cee70b
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py
@@ -0,0 +1,263 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Filtering postprocessors for SequentialTimeSeriesModels."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+from tensorflow.contrib import distributions
+
+from tensorflow.contrib.timeseries.python.timeseries import math_utils
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.util import nest
+
+
+class FilteringStepPostprocessor(object):
+  """Base class for processors that are applied after each filter step."""
+
+  __metaclass__ = abc.ABCMeta
+
+  @abc.abstractmethod
+  def process_filtering_step(self, current_times, current_values,
+                             predicted_state, filtered_state, outputs):
+    """Extends/modifies a filtering step, altering state and loss.
+
+    Args:
+      current_times: A [batch size] integer Tensor of times.
+      current_values: A [batch size x num features] Tensor of values filtering
+          is being performed on.
+      predicted_state: A (possibly nested) list of Tensors indicating model
+          state which does not take `current_times` and `current_values` into
+          account.
+      filtered_state: Same structure as predicted_state, but updated to take
+          `current_times` and `current_values` into account.
+      outputs: A dictionary of outputs produced by model filtering
+          (SequentialTimeSeriesModel._process_filtering_step).
+    Returns: A tuple of (new_state, updated_outputs);
+      new_state: Updated state with the same structure as `filtered_state` and
+          `predicted_state`.
+      updated_outputs: The `outputs` dictionary, updated with any new outputs
+          from this filtering postprocessor.
+    """
+    pass
+
+  @abc.abstractproperty
+  def output_names(self):
+    return []
+
+
+def cauchy_alternative_to_gaussian(current_times, current_values, outputs):
+  """A Cauchy anomaly distribution, centered at a Gaussian prediction.
+
+  Performs an entropy-matching approximation of the scale parameters of
+  independent Cauchy distributions given the covariance matrix of a multivariate
+  Gaussian in outputs["covariance"], and centers the Cauchy distributions at
+  outputs["mean"]. This requires that the model that we are creating an
+  alternative/anomaly distribution for produces a mean and covariance.
+
+  Args:
+    current_times: A [batch size] Tensor of times, unused.
+    current_values: A [batch size x num features] Tensor of values to evaluate
+        the anomaly distribution at.
+    outputs: A dictionary of Tensors with keys "mean" and "covariance"
+        describing the Gaussian to construct an anomaly distribution from. The
+        value corresponding to "mean" has shape [batch size x num features], and
+        the value corresponding to "covariance" has shape [batch size x num
+        features x num features].
+  Returns:
+    A [batch size] Tensor of log likelihoods; the anomaly log PDF evaluated at
+    `current_values`.
+  """
+  del current_times  # unused
+  cauchy_scale = math_utils.entropy_matched_cauchy_scale(outputs["covariance"])
+  individual_log_pdfs = distributions.StudentT(
+      df=array_ops.ones([], dtype=current_values.dtype),
+      loc=outputs["mean"],
+      scale=cauchy_scale).log_prob(current_values)
+  return math_ops.reduce_sum(individual_log_pdfs, axis=1)
+
+
+def _interpolate_state_linear(first_state, second_state, first_responsibility):
+  """Interpolate between two model states linearly."""
+  interpolated_state_flat = []
+  for first_state_tensor, second_state_tensor in zip(
+      nest.flatten(first_state), nest.flatten(second_state)):
+    assert first_state_tensor.dtype == second_state_tensor.dtype
+    if first_state_tensor.dtype.is_floating:
+      # Pad the responsibility shape with ones up to the state's rank so that it
+      # broadcasts
+      first_responsibility_padded = array_ops.reshape(
+          tensor=first_responsibility,
+          shape=array_ops.concat([
+              array_ops.shape(first_responsibility), array_ops.ones(
+                  [array_ops.rank(first_state_tensor) - 1], dtype=dtypes.int32)
+          ], 0))
+      interpolated_state = (
+          first_responsibility_padded * first_state_tensor
+          + (1. - first_responsibility_padded) * second_state_tensor)
+      interpolated_state.set_shape(first_state_tensor.get_shape())
+      interpolated_state_flat.append(interpolated_state)
+    else:
+      # Integer dtypes are probably representing times, and don't need
+      # interpolation. Make sure they're identical to be sure.
+      with ops.control_dependencies(
+          [check_ops.assert_equal(first_state_tensor, second_state_tensor)]):
+        interpolated_state_flat.append(array_ops.identity(first_state_tensor))
+  return nest.pack_sequence_as(first_state, interpolated_state_flat)
+
+
+class StateInterpolatingAnomalyDetector(FilteringStepPostprocessor):
+  """An anomaly detector which guards model state against outliers.
+
+  Smoothly interpolates between a model's predicted and inferred states, based
+  on the posterior probability of an anomaly, p(anomaly | data). This is useful
+  if anomalies would otherwise lead to model state which is hard to recover
+  from (Gaussian state space models suffer from this, for example).
+
+  Relies on (1) an alternative distribution, typically with heavier tails than
+  the model's normal predictions, and (2) a prior probability of an anomaly. The
+  prior probability acts as a penalty, discouraging the system from marking too
+  many points as anomalies. The alternative distribution indicates the
+  probability of a datapoint given that it is an anomaly, and is a heavy-tailed
+  distribution (Cauchy) centered around the model's predictions by default.
+
+  Specifically, we have:
+
+    p(anomaly | data) = p(data | anomaly) * anomaly_prior_probability
+        / (p(data | not anomaly) * (1 - anomaly_prior_probability)
+           + p(data | anomaly) * anomaly_prior_probability)
+
+  This is simply Bayes' theorem, where p(data | anomaly) is the
+  alternative/anomaly distribution, p(data | not anomaly) is the model's
+  predicted distribution, and anomaly_prior_probability is the prior probability
+  of an anomaly occurring (user-specified, defaulting to 1%).
+
+  Rather than computing p(anomaly | data) directly, we use the odds ratio:
+
+    odds_ratio = p(data | anomaly) * anomaly_prior_probability
+        / (p(data | not anomaly) * (1 - anomaly_prior_probability))
+
+  This has the same information as p(anomaly | data):
+
+    odds_ratio = p(anomaly | data) / p(not anomaly | data)
+
+  A "responsibility" score is computed for the model based on the log odds
+  ratio, and state interpolated based on this responsibility:
+
+    model_responsibility = 1 / (1 + exp(-responsibility_scaling
+                                        * ln(odds_ratio)))
+    model_state = filtered_model_state * model_responsibility
+                  + predicted_model_state * (1 - model_responsibility)
+    loss = model_responsibility
+             * ln(p(data | not anomaly) * (1 - anomaly_prior_probability))
+           + (1 - model_responsibility)
+             * ln(p(data | anomaly) * anomaly_prior_probability)
+
+  """
+
+  output_names = ["anomaly_score"]
+
+  def __init__(self,
+               anomaly_log_likelihood=cauchy_alternative_to_gaussian,
+               anomaly_prior_probability=0.01,
+               responsibility_scaling=1.0):
+    """Configure the anomaly detector.
+
+    Args:
+      anomaly_log_likelihood: A function taking `current_times`,
+          `current_values`, and `outputs` (same as the corresponding arguments
+          to process_filtering_step) and returning a [batch size] Tensor of log
+          likelihoods under an anomaly distribution.
+      anomaly_prior_probability: A scalar value, between 0 and 1, indicating the
+          prior probability of a particular example being an anomaly.
+      responsibility_scaling: A positive scalar controlling how fast
+          interpolation transitions between not-anomaly and anomaly; lower
+          values (closer to 0) create a smoother/slower transition.
+    """
+    self._anomaly_log_likelihood = anomaly_log_likelihood
+    self._responsibility_scaling = responsibility_scaling
+    self._anomaly_prior_probability = anomaly_prior_probability
+
+  def process_filtering_step(self, current_times, current_values,
+                             predicted_state, filtered_state, outputs):
+    """Fall back on `predicted_state` for anomalies.
+
+    Args:
+      current_times: A [batch size] integer Tensor of times.
+      current_values: A [batch size x num features] Tensor of values filtering
+          is being performed on.
+      predicted_state: A (possibly nested) list of Tensors indicating model
+          state which does not take `current_times` and `current_values` into
+          account.
+      filtered_state: Same structure as predicted_state, but updated to take
+          `current_times` and `current_values` into account.
+      outputs: A dictionary of outputs produced by model filtering. Must
+          include `log_likelihood`, a [batch size] Tensor indicating the log
+          likelihood of the observations under the model's predictions.
+    Returns:
+      A tuple of (new_state, updated_outputs);
+        new_state: Updated state with the same structure as `filtered_state` and
+            `predicted_state`; predicted_state for anomalies and filtered_state
+            otherwise (per batch element).
+        updated_outputs: The `outputs` dictionary, updated with a new "loss"
+            (the interpolated negative log likelihoods under the model and
+            anomaly distributions) and "anomaly_score" (the log odds ratio of
+            each part of the batch being an anomaly).
+    """
+    anomaly_log_likelihood = self._anomaly_log_likelihood(
+        current_times=current_times,
+        current_values=current_values,
+        outputs=outputs)
+    anomaly_prior_probability = ops.convert_to_tensor(
+        self._anomaly_prior_probability, dtype=current_values.dtype)
+    # p(data | anomaly) * p(anomaly)
+    data_and_anomaly_log_probability = (
+        anomaly_log_likelihood + math_ops.log(anomaly_prior_probability))
+    # p(data | no anomaly) * p(no anomaly)
+    data_and_no_anomaly_log_probability = (
+        outputs["log_likelihood"] + math_ops.log(1. - anomaly_prior_probability)
+    )
+    # A log odds ratio is slightly nicer here than computing p(anomaly | data),
+    # since it is centered around zero
+    anomaly_log_odds_ratio = (
+        data_and_anomaly_log_probability
+        - data_and_no_anomaly_log_probability)
+    model_responsibility = math_ops.sigmoid(-self._responsibility_scaling *
+                                            anomaly_log_odds_ratio)
+    # Do a linear interpolation between predicted and inferred model state
+    # based on the model's "responsibility". If we knew for sure whether
+    # this was an anomaly or not (binary responsibility), this would be the
+    # correct thing to do, but given that we don't it's just a
+    # (differentiable) heuristic.
+    interpolated_state = _interpolate_state_linear(
+        first_state=filtered_state,
+        second_state=predicted_state,
+        first_responsibility=model_responsibility)
+    # TODO(allenl): Try different responsibility scalings and interpolation
+    # methods (e.g. average in probability space rather than log space).
+    interpolated_log_likelihood = (
+        model_responsibility * data_and_no_anomaly_log_probability
+        + (1. - model_responsibility) * data_and_anomaly_log_probability)
+    outputs["loss"] = -interpolated_log_likelihood
+    outputs["anomaly_score"] = anomaly_log_odds_ratio
+    return (interpolated_state, outputs)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..53d7340e85ae78c1e1550bbd63229a08970f6540
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor_test.py
@@ -0,0 +1,71 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for filtering postprocessors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import filtering_postprocessor
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import test
+
+
+class FilteringStepPostprocessorTest(test.TestCase):
+
+  def test_gaussian_alternative(self):
+    for float_dtype in [dtypes.float32, dtypes.float64]:
+      detector = filtering_postprocessor.StateInterpolatingAnomalyDetector(
+          anomaly_log_likelihood=(filtering_postprocessor
+                                  .cauchy_alternative_to_gaussian),
+          responsibility_scaling=10.)
+      predicted_state = [
+          constant_op.constant(
+              [[40.], [20.]], dtype=float_dtype), constant_op.constant(
+                  [3., 6.], dtype=float_dtype), constant_op.constant([-1, -2])
+      ]
+      filtered_state = [
+          constant_op.constant(
+              [[80.], [180.]], dtype=float_dtype), constant_op.constant(
+                  [1., 2.], dtype=float_dtype), constant_op.constant([-1, -2])
+      ]
+      interpolated_state, updated_outputs = detector.process_filtering_step(
+          current_times=constant_op.constant([1, 2]),
+          current_values=constant_op.constant([[0.], [1.]], dtype=float_dtype),
+          predicted_state=predicted_state,
+          filtered_state=filtered_state,
+          outputs={
+              "mean":
+                  constant_op.constant([[0.1], [10.]], dtype=float_dtype),
+              "covariance":
+                  constant_op.constant([[[1.0]], [[1.0]]], dtype=float_dtype),
+              "log_likelihood":
+                  constant_op.constant([-1., -40.], dtype=float_dtype)
+          })
+      # The first batch element is not anomalous, and so should use the inferred
+      # state. The second is anomalous, and should use the predicted state.
+      expected_state = [[[80.], [20.]],
+                        [1., 6.],
+                        [-1, -2]]
+      with self.test_session():
+        for interpolated, expected in zip(interpolated_state, expected_state):
+          self.assertAllClose(expected, interpolated.eval())
+        self.assertGreater(0., updated_outputs["anomaly_score"][0].eval())
+        self.assertLess(0., updated_outputs["anomaly_score"][1].eval())
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/g3doc/periodic_multires_derivation.md b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/g3doc/periodic_multires_derivation.md
new file mode 100644
index 0000000000000000000000000000000000000000..b174bb6af323da62afda2a74a397f25e977a48d0
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/g3doc/periodic_multires_derivation.md
@@ -0,0 +1,279 @@
+# Derivations for multi-resolution cycle transition matrix powers
+
+This document contains derivations for the special-cased matrix-to-powers and
+power sums used in `state_space_models/periodic.py`'s `ResolutionCycleModel` as
+part of TensorFlow Time Series (TFTS).
+
+## Setting and notation
+
+Let $$M$$ be the number of latent values being cycled through
+(`num_latent_values` in the code).
+
+The `ResolutionCycleModel` transition matrix is based on roots of a matrix which
+cycles through $$M$$ (odd) values and constrains their sum to be
+zero (which when included in a state space model means that the expected sum
+over a complete period is zero). Call this $$M - 1$$ x
+$$M - 1$$ matrix $$C$$ (`cycle_matrix`):
+
+$$ {\boldsymbol C}_{i, j} = \begin{cases} -1 & i = 0\\ 1 & j = i - 1\\ 0 &
+\text{otherwise}\end{cases} $$
+
+`ResolutionCycleModel` takes roots of this matrix using the following
+parameterization:
+
+$$ {\boldsymbol C}^p = \text{cycle_eigenvectors} *
+\text{diag}(\text{cycle_eigenvalues})^{p} * \text{cycle_eigenvectors}^{-1} $$
+
+Where:
+
+$$\text{cycle_eigenvectors}_{i, j} = w_{\lfloor j / 2 \rfloor + 1}^{i (-1)^{j +
+1}} - w_{\lfloor j / 2 \rfloor + 1}^{(i + 1) (-1)^{j + 1}}$$
+
+$$(\text{cycle_eigenvectors}^{-1})_{i, j} = \frac{1}{M}
+\sum_{k=0}^j w_{\lfloor i / 2 \rfloor + 1}^{k (-1)^i}$$
+
+$$\text{cycle_eigenvalues}_{j} = w_{\lfloor j / 2 \rfloor + 1}^{(-1)^j}$$
+
+Where $$w_j$$ is a root of unity:
+
+$$w_j = e^{\frac{2 \pi j \sqrt{-1}}{M}}$$
+
+In Sympy (useful for checking expressions when $$M$$ is small),
+this looks like:
+
+```python
+import sympy
+def root_of_unity(nth, number, to_power=1):
+    return sympy.exp(2 * sympy.pi * number * sympy.I * to_power / nth)
+matsize = 4
+def eigvec_mat_fn(i, j):
+    number = j // 2 + 1
+    powersign = (j % 2) * 2 - 1
+    return (root_of_unity(matsize + 1, number=number,
+                          to_power=powersign * i)
+            - root_of_unity(matsize + 1, number=number,
+                            to_power=powersign * (i + 1)))
+def eigvec_inverse_mat_fn(row, column):
+    number = row // 2 + 1
+    powersign = ((row + 1) % 2) * 2 - 1
+    runningsum = 0
+    for j in range(column + 1):
+        runningsum += root_of_unity(
+          matsize + 1, number, to_power=j * powersign) / (matsize + 1)
+    return runningsum
+def make_eigval_mat_fn(to_power=1):
+    def eigval_mat_fn(i, j):
+        if i == j:
+            number = j // 2 + 1
+            powersign = ((j + 1) % 2) * 2 - 1
+            return root_of_unity(matsize + 1, number=number, 
+                                 to_power=powersign*to_power)
+        else:
+            return 0
+    return eigval_mat_fn
+eigval_power = sympy.Rational(1, 1)
+eigvecs = sympy.Matrix(matsize, matsize, eigvec_mat_fn)
+eigvals = sympy.Matrix(matsize, matsize, make_eigval_mat_fn(eigval_power))
+eigvecs_inv = sympy.Matrix(matsize, matsize, eigvec_inverse_mat_fn)
+print (eigvecs * eigvals * eigvecs_inv).evalf()
+```
+
+## Proof that these are eigenvectors/eigenvalues of `cycle_matrix`
+
+We want to show that:
+
+$${\boldsymbol C} * \text{cycle_eigenvectors}_{\bullet, j} =
+\text{cycle_eigenvalues}_j * \text{cycle_eigenvectors}_{\bullet, j} $$
+
+Where $$\text{cycle_eigenvectors}_{\bullet, j}$$ is a column vector containing
+the $$j^\text{th}$$ eigenvector.
+
+We have telescoping sum in the first entry:
+
+$$({\boldsymbol C} * \text{cycle_eigenvectors}_{\bullet, j})_i =
+\begin{cases} -\sum_{k=0}^{M - 2}
+\text{cycle_eigenvectors}_{k, j} & i = 0\\ \text{cycle_eigenvectors}_{i - 1, j}
+& \text{otherwise} \end{cases}$$
+
+$$ = \begin{cases} w_{\lfloor j / 2 \rfloor + 1}^{(M -
+1)(-1)^{j + 1}} - w_{\lfloor j / 2 \rfloor + 1}^{0(-1)^{j + 1}} & i = 0\\
+\text{cycle_eigenvectors}_{i - 1, j} & \text{otherwise} \end{cases}$$
+
+$$ = \begin{cases} w_{\lfloor j / 2 \rfloor + 1}^{(-1)^{j}} \left (w_{\lfloor j
+/ 2 \rfloor + 1}^{M(-1)^{j + 1}} - w_{\lfloor j / 2
+\rfloor + 1}^{(-1)^{j + 1}} \right ) & i = 0\\ \text{cycle_eigenvectors}_{i - 1,
+j} & \text{otherwise} \end{cases}$$
+
+$$ = \begin{cases} w_{\lfloor j / 2 \rfloor + 1}^{(-1)^{j}} \left (w_{\lfloor j
+/ 2 \rfloor + 1}^{0(-1)^{j + 1}} - w_{\lfloor j / 2 \rfloor + 1}^{(-1)^{j + 1}}
+\right ) & i = 0\\ \text{cycle_eigenvectors}_{i - 1, j} & \text{otherwise}
+\end{cases}$$
+
+$$ = \begin{cases} w_{\lfloor j / 2 \rfloor + 1}^{(-1)^{j}}
+\text{cycle_eigenvectors}_{0, j} ) & i = 0\\ \text{cycle_eigenvectors}_{i - 1,
+j} & \text{otherwise} \end{cases}$$
+
+The remaining cases follow from the fact that:
+
+$$w_{\lfloor j / 2 \rfloor + 1}^{(-1)^{j}} \text{cycle_eigenvectors}_{i, j} =
+\text{cycle_eigenvectors}_{i - 1, j}$$
+
+$$w_{\lfloor j / 2 \rfloor + 1}^{(-1)^{j}} \left( w_{\lfloor j / 2 \rfloor +
+1}^{i (-1)^{j + 1}} - w_{\lfloor j / 2 \rfloor + 1}^{(i + 1) (-1)^{j + 1}}
+\right) = w_{\lfloor j / 2 \rfloor + 1}^{(i - 1) (-1)^{j + 1}} - w_{\lfloor j /
+2 \rfloor + 1}^{i (-1)^{j + 1}}$$
+
+## Proof of eigenvector inverse matrix
+
+We want to show that (for the expressions above):
+
+$$ I = \text{cycle_eigenvectors} * \text{cycle_eigenvectors}^{-1} $$
+
+Multiplying it out, we have:
+
+$$(\text{cycle_eigenvectors} * \text{cycle_eigenvectors}^{-1})_{i, j} =
+\sum_{k=0}^{M - 2} \text{cycle_eigenvectors}_{i, k}
+(\text{cycle_eigenvectors}^{-1})_{k, j} $$
+
+$$ = \frac{1}{M} \sum_{k=0}^{M -
+2} \left[ \left( w_{\lfloor k / 2 \rfloor + 1}^{i (-1)^{k + 1}} - w_{\lfloor k /
+2 \rfloor + 1}^{(i + 1) (-1)^{k + 1}} \right) \sum_{l=0}^j w_{\lfloor k / 2
+\rfloor + 1}^{l (-1)^{k}} \right]$$
+
+$$ = \frac{1}{M} \sum_{k=0}^{M -
+2} \sum_{l=0}^j \left[ \left( w_{\lfloor k / 2 \rfloor + 1}^{i (-1)^{k + 1}} -
+w_{\lfloor k / 2 \rfloor + 1}^{(i + 1) (-1)^{k + 1}} \right) w_{\lfloor k / 2
+\rfloor + 1}^{l (-1)^{k}} \right]$$
+
+$$ = \frac{1}{M} \sum_{k=0}^{M -
+2} \sum_{l=0}^j \left[ w_{\lfloor k / 2 \rfloor + 1}^{(i - l) (-1)^{k + 1}} -
+w_{\lfloor k / 2 \rfloor + 1}^{(i - l + 1) (-1)^{k + 1}} \right]$$
+
+Using telescoping:
+
+$$ = \frac{1}{M} \sum_{k=0}^{M -
+2} \left[ w_{\lfloor k / 2 \rfloor + 1}^{(i - j) (-1)^{k + 1}} - w_{\lfloor k /
+2 \rfloor + 1}^{(i + 1) (-1)^{k + 1}} \right]$$
+
+Since $$e^{-ix} = \text{conj}(e^{ix})$$, the imaginary components cancel out of
+the sum:
+
+$$ = \frac{2}{M} \sum_{k=0}^{(M -
+1) / 2 - 1} \text{Real}\left[ w_{k + 1}^{i - j} - w_{k + 1}^{i + 1} \right]$$
+
+$$ = \frac{2}{M} \sum_{k=0}^{(M -
+1) / 2 - 1} \left[ \text{cos}\left(\frac{2 \pi (i - j) (k +
+1)}{M}\right) - \text{cos}\left(\frac{2 \pi (i + 1) (k +
+1)}{M}\right) \right]$$
+
+Using Lagrange's identity $$\sum_{n=1}^N \text{cos}(n \theta) = -\frac{1}{2} +
+\frac{\text{sin}\left(\left(N + \frac{1}{2}\right) \theta\right)}{2
+\text{sin}\left(\frac{\theta}{2}\right)}$$:
+
+$$ = \frac{2}{M} \left(
+\frac{\text{sin}\left(\left(\left(\frac{M -
+1}{2}\right) + \frac{1}{2}\right) \frac{2 \pi (i -
+j)}{M}\right)}{2 \text{sin}\left(\frac{\pi (i -
+j)}{M}\right)} \\-
+\frac{\text{sin}\left(\left(\left(\frac{M -
+1}{2}\right) + \frac{1}{2}\right) \frac{2 \pi (i +
+1)}{M}\right)}{2 \text{sin}\left(\frac{\pi (i +
+1)}{M}\right)} \right)$$
+
+$$ = \frac{1}{M} \left(\frac{\text{sin}(\pi (i -
+j))}{\text{sin}\left(\frac{\pi (i - j)}{M}\right)} -
+\frac{\text{sin}(\pi (i + 1))}{\text{sin}\left(\frac{\pi (i +
+1)}{M}\right)} \right)$$
+
+The second term will always be zero, since $$i + 1$$ is at most
+$$M - 1$$ ($$i$$ ranges from $$0$$ to
+$$M - 2$$). The first term is also zero unless $$i = j$$
+($$\text{sin}(x)$$ is zero at integer multiples of $$\pi$$). Taking a limit when
+$$i = j$$, the expression evaluates to $$1$$ (L'Hospital's rule gives a ratio of
+cosines, both one, along with a ratio of the arguments from the chain rule).
+
+## Simplification of expression for matrix to a power
+
+Having established that the eigenvalues, eigenvectors, and the inverse
+eigenvector matrix are all correct, we can now use them to derive an expression
+for the matrix raised to a power:
+
+$$ {\boldsymbol C}^p = \text{cycle_eigenvectors} *
+\text{diag}(\text{cycle_eigenvalues})^p * \text{cycle_eigenvectors}^{-1} $$
+
+$$({\boldsymbol C}^p)_{i, j} = \sum_{k=0}^{M - 2}
+\text{cycle_eigenvectors}_{i, k} * \text{cycle_eigenvalues}_k^p *
+(\text{cycle_eigenvectors}^{-1})_{k, j}$$
+
+$$ = \frac{1}{M} \sum_{k=0}^{M -
+2} \left[\left( w_{\lfloor k / 2 \rfloor + 1}^{(p - i) (-1)^{k + 1}} -
+w_{\lfloor k / 2 \rfloor + 1}^{(p - i - 1) (-1)^{k + 1}} \right) \sum_{l=0}^j
+w_{\lfloor k / 2 \rfloor + 1}^{l (-1)^{k}} \right]$$
+
+Following the same logic as for the inverse matrix proof, this leads to
+($$cos(x)$$ being an even function):
+
+$$ = \frac{2}{M} \sum_{k=0}^{(M -
+1) / 2 - 1} \left[ \text{cos}\left(\frac{2 \pi (p - i + j)
+k}{M}\right) - \text{cos}\left(\frac{2 \pi (p - i - 1)
+k)}{M}\right) \right]$$
+
+Applying Lagrange's identity and simplifying, we get:
+
+$$ = \frac{1}{M} \left(\frac{\text{sin}(\pi (p - i +
+j))}{\text{sin}\left(\frac{\pi (p - i + j)}{M}\right)} -
+\frac{\text{sin}(\pi (p - i - 1))}{\text{sin}\left(\frac{\pi (p - i -
+1)}{M}\right)} \right)$$
+
+As a special/limiting case, we get the integer powers of the original matrix
+($$I(b)$$ is 1 if $$b$$ is true and zero otherwise):
+
+$$ ({\boldsymbol C}^p)_{i, j} = \frac{1}{M} \left(
+M * I(p - i + j \equiv 0\pmod {M})
+\\- M * I(p - i - 1 \equiv 0\pmod
+{M}) \right)$$
+
+## Simplification of expression for pre- and post-multiplication of noise matrix to a power
+
+Raising the transition matrix to a power allows us to transform the inferred
+mean state across time (imputing), but performing imputation on the covariance
+of the state estimate requires adding the noise covariance at each step (and
+then transforming by pre- and post-multiplying by the transition
+matrix). However, this noise covariance has a very special form, having only one
+non-zero element in the upper left hand corner.
+
+$$\text{noise_covariance}_{i, j} = \begin{cases} \text{noise_scalar} & i = j =
+0\\ 0 & \text{otherwise}\end{cases} $$
+
+This makes it easy to compute an expression for $${\boldsymbol C}^p *
+\text{noise_covariance} * ({\boldsymbol C}^T)^p$$:
+
+$$\left ({\boldsymbol C}^p * \text{noise_covariance} \right)_{i, j}
+= ({\boldsymbol C}^p)_{i, 0} * \text{noise_scalar} * I(j = 0)$$
+
+$$\left ({\boldsymbol C}^p * \text{noise_covariance} *
+({\boldsymbol C}^T \right)^p)_{i, j} = ({\boldsymbol C}^p)_{i, 0} *
+\text{noise_scalar} * ({\boldsymbol C}^p)_{j, 0}$$
+
+$$ = \frac{1}{M^2} \left(\frac{\text{sin}(\pi (p -
+i))}{\text{sin}\left(\frac{\pi (p - i)}{M}\right)} -
+\frac{\text{sin}(\pi (p - i - 1))}{\text{sin}\left(\frac{\pi (p - i -
+1)}{M}\right)} \right) \\ \left(\frac{\text{sin}(\pi (p -
+j))}{\text{sin}\left(\frac{\pi (p - j)}{M}\right)} -
+\frac{\text{sin}(\pi (p - j - 1))}{\text{sin}\left(\frac{\pi (p - j -
+1)}{M}\right)} \right)$$
+
+This (and the transition-matrix-to-a-power expression above) can be simplified
+slightly using the fact that $$\text{sin}((x + i)\pi) = (-1)^i \text{sin}(x
+\pi)$$ for integers $$i$$.
+
+## Open questions
+
+- It would be nice to have an expression for the elements of $$f(\lambda, N) =
+  \sum_{k=0}^N {\boldsymbol C}^{\lambda k} * \text{noise_covariance} *
+  ({\boldsymbol C}^T)^{\lambda k}$$, especially for $$0 < \lambda < 1$$.
+- It seems that when $$\lambda =
+  \frac{M}{\text{periodicity}}$$, then $$f(\lambda,
+  \text{periodicity}) = \frac{\text{periodicity}}{M} f(1,
+  M)$$, but I'm not exactly sure why (i.e. have not
+  proven it).
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fcd3e391b63c2362d6187da9556e2c71836dbaa
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
@@ -0,0 +1,341 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implements Kalman filtering for linear state space models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib import distributions
+
+from tensorflow.contrib.timeseries.python.timeseries import math_utils
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import numerics
+
+
+# TODO(allenl): support for always-factored covariance matrices
+class KalmanFilter(object):
+  """Inference on linear state models.
+
+  The model for observations in a given state is:
+    observation(t) = observation_model * state(t)
+        + Gaussian(0, observation_noise_covariance)
+
+  State updates take the following form:
+    state(t) = state_transition * state(t-1)
+        + state_noise_transform * Gaussian(0, state_transition_noise_covariance)
+
+  This is a real-valued analog to hidden Markov models, with linear transitions
+  and a Gaussian noise model. Given initial conditions, noise, and state
+  transition, Kalman filtering recursively estimates states and observations,
+  along with their associated uncertainty. When fed observations, future state
+  and uncertainty estimates are conditioned on those observations (in a Bayesian
+  sense).
+
+  Typically some "given"s mentioned above (noises) will be unknown, and so
+  optimizing the Kalman filter's probabilistic predictions with respect to these
+  parameters is a good approach. The state transition and observation models are
+  usually known a priori as a modeling decision.
+
+  """
+
+  def __init__(self, dtype=dtypes.float32,
+               simplified_posterior_covariance_computation=False):
+    """Initialize the Kalman filter.
+
+    Args:
+      dtype: The data type to use for floating point tensors.
+      simplified_posterior_covariance_computation: If True, uses an algebraic
+        simplification of the Kalman filtering posterior covariance update,
+        which is slightly faster at the cost of numerical stability. The
+        simplified update is often stable when using double precision on small
+        models or with fixed transition matrices.
+    """
+    self._simplified_posterior_covariance_computation = (
+        simplified_posterior_covariance_computation)
+    self.dtype = dtype
+
+  def do_filter(
+      self, estimated_state, estimated_state_covariance,
+      predicted_observation, predicted_observation_covariance,
+      observation, observation_model, observation_noise):
+    """Convenience function for scoring predictions.
+
+    Scores a prediction against an observation, and computes the updated
+    posterior over states.
+
+    Shapes given below for arguments are for single-model Kalman filtering
+    (e.g. KalmanFilter). For ensembles, prior_state and prior_state_var are
+    same-length tuples of values corresponding to each model.
+
+    Args:
+      estimated_state: A prior mean over states [batch size x state dimension]
+      estimated_state_covariance: Covariance of state prior [batch size x D x
+          D], with D depending on the Kalman filter implementation (typically
+          the state dimension).
+      predicted_observation: A prediction for the observed value, such as that
+          returned by observed_from_state. A [batch size x num features] Tensor.
+      predicted_observation_covariance: A covariance matrix corresponding to
+          `predicted_observation`, a [batch size x num features x num features]
+          Tensor.
+      observation: The observed value corresponding to the predictions
+          given [batch size x observation dimension]
+      observation_model: The [batch size x observation dimension x model state
+          dimension] Tensor indicating how a particular state is mapped to
+          (pre-noise) observations for each part of the batch.
+      observation_noise: A [batch size x observation dimension x observation
+          dimension] Tensor or [observation dimension x observation dimension]
+          Tensor with covariance matrices to use for each part of the batch (a
+          two-dimensional input will be broadcast).
+    Returns:
+      posterior_state, posterior_state_var: Posterior mean and
+          covariance, updated versions of prior_state and
+          prior_state_var.
+      log_prediction_prob: Log probability of the observations under
+          the priors, suitable for optimization (should be maximized).
+
+    """
+    symmetrized_observation_covariance = 0.5 * (
+        predicted_observation_covariance + array_ops.matrix_transpose(
+            predicted_observation_covariance))
+    instability_message = (
+        "This may occur due to numerically unstable filtering when there is "
+        "a large difference in posterior variances, or when inferences are "
+        "near-deterministic. Considering tuning the "
+        "'filtering_maximum_posterior_variance_ratio' or "
+        "'filtering_minimum_posterior_variance' parameters in your "
+        "StateSpaceModelConfiguration, or tuning the transition matrix.")
+    symmetrized_observation_covariance = numerics.verify_tensor_all_finite(
+        symmetrized_observation_covariance,
+        "Predicted observation covariance was not finite. {}".format(
+            instability_message))
+    diag = array_ops.matrix_diag_part(symmetrized_observation_covariance)
+    min_diag = math_ops.reduce_min(diag)
+    non_negative_assert = control_flow_ops.Assert(
+        min_diag >= 0.,
+        [("The predicted observation covariance "
+          "has a negative diagonal entry. {}").format(instability_message),
+         min_diag])
+    with ops.control_dependencies([non_negative_assert]):
+      observation_covariance_cholesky = linalg_ops.cholesky(
+          symmetrized_observation_covariance)
+    log_prediction_prob = distributions.MultivariateNormalTriL(
+        predicted_observation, observation_covariance_cholesky).log_prob(
+            observation)
+    (posterior_state,
+     posterior_state_var) = self.posterior_from_prior_state(
+         prior_state=estimated_state,
+         prior_state_var=estimated_state_covariance,
+         observation=observation,
+         observation_model=observation_model,
+         predicted_observations=(predicted_observation,
+                                 predicted_observation_covariance),
+         observation_noise=observation_noise)
+    return (posterior_state, posterior_state_var, log_prediction_prob)
+
+  def predict_state_mean(self, prior_state, transition_matrices):
+    """Compute state transitions.
+
+    Args:
+      prior_state: Current estimated state mean [batch_size x state_dimension]
+      transition_matrices: A [batch size, state dimension, state dimension]
+        batch of matrices (dtype matching the `dtype` argument to the
+        constructor) with the transition matrix raised to the power of the
+        number of steps to be taken (not element-wise; use
+        math_utils.matrix_to_powers if there is no efficient special case) if
+        more than one step is desired.
+    Returns:
+      State mean advanced based on `transition_matrices` (dimensions matching
+      first argument).
+    """
+    advanced_state = array_ops.squeeze(
+        math_ops.matmul(
+            transition_matrices,
+            prior_state[..., None]),
+        squeeze_dims=[-1])
+    return advanced_state
+
+  def predict_state_var(
+      self, prior_state_var, transition_matrices, transition_noise_sums):
+    r"""Compute variance for state transitions.
+
+    Computes a noise estimate corresponding to the value returned by
+    predict_state_mean.
+
+    Args:
+      prior_state_var: Covariance matrix specifying uncertainty of current state
+          estimate [batch size x state dimension x state dimension]
+      transition_matrices: A [batch size, state dimension, state dimension]
+        batch of matrices (dtype matching the `dtype` argument to the
+        constructor) with the transition matrix raised to the power of the
+        number of steps to be taken (not element-wise; use
+        math_utils.matrix_to_powers if there is no efficient special case).
+      transition_noise_sums: A [batch size, state dimension, state dimension]
+        Tensor (dtype matching the `dtype` argument to the constructor) with:
+
+          \sum_{i=0}^{num_steps - 1} (
+             state_transition_to_powers_fn(i)
+             * state_transition_noise_covariance
+             * state_transition_to_powers_fn(i)^T
+          )
+
+        for the number of steps to be taken in each part of the batch (this
+        should match `transition_matrices`). Use math_utils.power_sums_tensor
+        with `tf.gather` if there is no efficient special case.
+    Returns:
+      State variance advanced based on `transition_matrices` and
+      `transition_noise_sums` (dimensions matching first argument).
+    """
+    prior_variance_transitioned = math_ops.matmul(
+        math_ops.matmul(transition_matrices, prior_state_var),
+        transition_matrices,
+        adjoint_b=True)
+    return prior_variance_transitioned + transition_noise_sums
+
+  def posterior_from_prior_state(self, prior_state, prior_state_var,
+                                 observation, observation_model,
+                                 predicted_observations,
+                                 observation_noise):
+    """Compute a posterior over states given an observation.
+
+    Args:
+      prior_state: Prior state mean [batch size x state dimension]
+      prior_state_var: Prior state covariance [batch size x state dimension x
+          state dimension]
+      observation: The observed value corresponding to the predictions given
+          [batch size x observation dimension]
+      observation_model: The [batch size x observation dimension x model state
+          dimension] Tensor indicating how a particular state is mapped to
+          (pre-noise) observations for each part of the batch.
+      predicted_observations: An (observation mean, observation variance) tuple
+          computed based on the current state, usually the output of
+          observed_from_state.
+      observation_noise: A [batch size x observation dimension x observation
+          dimension] or [observation dimension x observation dimension] Tensor
+          with covariance matrices to use for each part of the batch (a
+          two-dimensional input will be broadcast).
+    Returns:
+      Posterior mean and covariance (dimensions matching the first two
+      arguments).
+
+    """
+    observed_mean, observed_var = predicted_observations
+    residual = observation - observed_mean
+    # TODO(allenl): Can more of this be done using matrix_solve_ls?
+    kalman_solve_rhs = math_ops.matmul(
+        observation_model, prior_state_var, adjoint_b=True)
+    # This matrix_solve adjoint doesn't make a difference symbolically (since
+    # observed_var is a covariance matrix, and should be symmetric), but
+    # filtering on multivariate series is unstable without it. See
+    # test_multivariate_symmetric_covariance_float64 in kalman_filter_test.py
+    # for an example of the instability (fails with adjoint=False).
+    kalman_gain_transposed = linalg_ops.matrix_solve(
+        matrix=observed_var, rhs=kalman_solve_rhs, adjoint=True)
+    posterior_state = prior_state + array_ops.squeeze(
+        math_ops.matmul(
+            kalman_gain_transposed,
+            array_ops.expand_dims(residual, -1),
+            adjoint_a=True),
+        squeeze_dims=[-1])
+    gain_obs = math_ops.matmul(
+        kalman_gain_transposed, observation_model, adjoint_a=True)
+    identity_extradim = linalg_ops.eye(
+        array_ops.shape(gain_obs)[1], dtype=gain_obs.dtype)[None]
+    identity_minus_factor = identity_extradim - gain_obs
+    if self._simplified_posterior_covariance_computation:
+      # posterior covariance =
+      #   (I - kalman_gain * observation_model) * prior_state_var
+      posterior_state_var = math_ops.matmul(identity_minus_factor,
+                                            prior_state_var)
+    else:
+      observation_noise = ops.convert_to_tensor(observation_noise)
+      # A Joseph form update, which provides better numeric stability than the
+      # simplified optimal Kalman gain update, at the cost of a few extra
+      # operations. Joseph form updates are valid for any gain (not just the
+      # optimal Kalman gain), and so are more forgiving of numerical errors in
+      # computing the optimal Kalman gain.
+      #
+      # posterior covariance =
+      #   (I - kalman_gain * observation_model) * prior_state_var
+      #     * (I - kalman_gain * observation_model)^T
+      #   + kalman_gain * observation_noise * kalman_gain^T
+      left_multiplied_state_var = math_ops.matmul(identity_minus_factor,
+                                                  prior_state_var)
+      multiplied_state_var = math_ops.matmul(
+          identity_minus_factor, left_multiplied_state_var, adjoint_b=True)
+      def _batch_observation_noise_update():
+        return (multiplied_state_var + math_ops.matmul(
+            math_ops.matmul(
+                kalman_gain_transposed, observation_noise, adjoint_a=True),
+            kalman_gain_transposed))
+      def _matrix_observation_noise_update():
+        return (multiplied_state_var + math_ops.matmul(
+            math_utils.batch_times_matrix(
+                kalman_gain_transposed, observation_noise, adj_x=True),
+            kalman_gain_transposed))
+      if observation_noise.get_shape().ndims is None:
+        posterior_state_var = control_flow_ops.cond(
+            math_ops.equal(array_ops.rank(observation_noise), 2),
+            _matrix_observation_noise_update, _batch_observation_noise_update)
+      else:
+        # If static shape information exists, it gets checked in each cond()
+        # branch, so we need a special case to avoid graph-build-time
+        # exceptions.
+        if observation_noise.get_shape().ndims == 2:
+          posterior_state_var = _matrix_observation_noise_update()
+        else:
+          posterior_state_var = _batch_observation_noise_update()
+    return posterior_state, posterior_state_var
+
+  def observed_from_state(self, state_mean, state_var, observation_model,
+                          observation_noise):
+    """Compute an observation distribution given a state distribution.
+
+    Args:
+      state_mean: State mean vector [batch size x state dimension]
+      state_var: State covariance [batch size x state dimension x state
+          dimension]
+      observation_model: The [batch size x observation dimension x model state
+          dimension] Tensor indicating how a particular state is mapped to
+          (pre-noise) observations for each part of the batch.
+      observation_noise: A [batch size x observation dimension x observation
+          dimension] Tensor with covariance matrices to use for each part of the
+          batch. To remove observation noise, pass a Tensor of zeros (or simply
+          0, which will broadcast).
+    Returns:
+      observed_mean: Observation mean vector [batch size x observation
+          dimension]
+      observed_var: Observation covariance [batch size x observation dimension x
+          observation dimension]
+
+    """
+    observed_mean = array_ops.squeeze(
+        math_ops.matmul(
+            array_ops.expand_dims(state_mean, 1),
+            observation_model,
+            adjoint_b=True),
+        squeeze_dims=[1])
+    observed_var = math_ops.matmul(
+        math_ops.matmul(observation_model, state_var),
+        observation_model,
+        adjoint_b=True)
+    observed_var += observation_noise
+    return observed_mean, observed_var
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..57f29f3c7f1a05ddf2cfb55eff174f0b2e274fe8
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter_test.py
@@ -0,0 +1,425 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Kalman filtering."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy
+
+from tensorflow.contrib.timeseries.python.timeseries import math_utils
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import kalman_filter
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+# Two-dimensional state model with "slope" and "level" components.
+STATE_TRANSITION = [
+    [1., 1.],  # Add slope to level
+    [0., 1.]   # Maintain slope
+]
+# Independent noise for each component
+STATE_TRANSITION_NOISE = [[0.1, 0.0], [0.0, 0.2]]
+OBSERVATION_MODEL = [[[0.5, 0.0], [0.0, 1.0]]]
+OBSERVATION_NOISE = [[0.0001, 0.], [0., 0.0002]]
+STATE_NOISE_TRANSFORM = [[1.0, 0.0], [0.0, 1.0]]
+
+
+def _powers_and_sums_from_transition_matrix(
+    state_transition, state_transition_noise_covariance,
+    state_noise_transform, max_gap=1):
+  def _transition_matrix_powers(powers):
+    return math_utils.matrix_to_powers(state_transition, powers)
+  def _power_sums(num_steps):
+    power_sums_tensor = math_utils.power_sums_tensor(
+        max_gap + 1, state_transition,
+        math_ops.matmul(state_noise_transform,
+                        math_ops.matmul(
+                            state_transition_noise_covariance,
+                            state_noise_transform,
+                            adjoint_b=True)))
+    return array_ops.gather(power_sums_tensor, indices=num_steps)
+  return (_transition_matrix_powers, _power_sums)
+
+
+class MultivariateTests(test.TestCase):
+
+  def _multivariate_symmetric_covariance_test_template(
+      self, dtype, simplified_posterior_variance_computation):
+    """Check that errors aren't building up asymmetries in covariances."""
+    kf = kalman_filter.KalmanFilter(dtype=dtype)
+    observation_noise_covariance = constant_op.constant(
+        [[1., 0.5], [0.5, 1.]], dtype=dtype)
+    observation_model = constant_op.constant(
+        [[[1., 0., 0., 0.], [0., 0., 1., 0.]]], dtype=dtype)
+    state = array_ops.placeholder(shape=[1, 4], dtype=dtype)
+    state_var = array_ops.placeholder(shape=[1, 4, 4], dtype=dtype)
+    observation = array_ops.placeholder(shape=[1, 2], dtype=dtype)
+    transition_fn, power_sum_fn = _powers_and_sums_from_transition_matrix(
+        state_transition=constant_op.constant(
+            [[1., 1., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 1.],
+             [0., 0., 0., 1.]],
+            dtype=dtype),
+        state_noise_transform=linalg_ops.eye(4, dtype=dtype),
+        state_transition_noise_covariance=constant_op.constant(
+            [[1., 0., 0.5, 0.], [0., 1., 0., 0.5], [0.5, 0., 1., 0.],
+             [0., 0.5, 0., 1.]],
+            dtype=dtype))
+    pred_state = kf.predict_state_mean(
+        prior_state=state, transition_matrices=transition_fn([1]))
+    pred_state_var = kf.predict_state_var(
+        prior_state_var=state_var, transition_matrices=transition_fn([1]),
+        transition_noise_sums=power_sum_fn([1]))
+    observed_mean, observed_var = kf.observed_from_state(
+        state_mean=pred_state, state_var=pred_state_var,
+        observation_model=observation_model,
+        observation_noise=observation_noise_covariance)
+    post_state, post_state_var = kf.posterior_from_prior_state(
+        prior_state=pred_state, prior_state_var=pred_state_var,
+        observation=observation,
+        observation_model=observation_model,
+        predicted_observations=(observed_mean, observed_var),
+        observation_noise=observation_noise_covariance)
+    with self.test_session() as session:
+      evaled_state = numpy.array([[1., 1., 1., 1.]])
+      evaled_state_var = numpy.eye(4)[None]
+      for i in range(500):
+        evaled_state, evaled_state_var, evaled_observed_var = session.run(
+            [post_state, post_state_var, observed_var],
+            feed_dict={state: evaled_state,
+                       state_var: evaled_state_var,
+                       observation: [[float(i), float(i)]]})
+        self.assertAllClose(evaled_observed_var[0],
+                            evaled_observed_var[0].T)
+        self.assertAllClose(evaled_state_var[0],
+                            evaled_state_var[0].T)
+
+  def test_multivariate_symmetric_covariance_float32(self):
+    self._multivariate_symmetric_covariance_test_template(
+        dtypes.float32, simplified_posterior_variance_computation=False)
+
+  def test_multivariate_symmetric_covariance_float64(self):
+    self._multivariate_symmetric_covariance_test_template(
+        dtypes.float64, simplified_posterior_variance_computation=True)
+
+
+class KalmanFilterNonBatchTest(test.TestCase):
+  """Single-batch KalmanFilter tests."""
+
+  def setUp(self):
+    """The basic model defined above, with unit batches."""
+    self.kalman_filter = kalman_filter.KalmanFilter()
+    self.transition_fn, self.power_sum_fn = (
+        _powers_and_sums_from_transition_matrix(
+            state_transition=STATE_TRANSITION,
+            state_transition_noise_covariance=STATE_TRANSITION_NOISE,
+            state_noise_transform=STATE_NOISE_TRANSFORM,
+            max_gap=5))
+
+  def test_observed_from_state(self):
+    """Compare observation mean and noise to hand-computed values."""
+    with self.test_session():
+      state = constant_op.constant([[2., 1.]])
+      state_var = constant_op.constant([[[4., 0.], [0., 3.]]])
+      observed_mean, observed_var = self.kalman_filter.observed_from_state(
+          state, state_var,
+          observation_model=OBSERVATION_MODEL,
+          observation_noise=OBSERVATION_NOISE)
+      observed_mean_override, observed_var_override = (
+          self.kalman_filter.observed_from_state(
+              state, state_var,
+              observation_model=OBSERVATION_MODEL,
+              observation_noise=100 * constant_op.constant(
+                  OBSERVATION_NOISE)[None]))
+      self.assertAllClose(numpy.array([[1., 1.]]),
+                          observed_mean.eval())
+      self.assertAllClose(numpy.array([[1., 1.]]),
+                          observed_mean_override.eval())
+      self.assertAllClose(numpy.array([[[1.0001, 0.], [0., 3.0002]]]),
+                          observed_var.eval())
+      self.assertAllClose(numpy.array([[[1.01, 0.], [0., 3.02]]]),
+                          observed_var_override.eval())
+
+  def _posterior_from_prior_state_test_template(
+      self, state, state_var, observation, observation_model, observation_noise,
+      expected_state, expected_state_var):
+    """Test that repeated observations converge to the expected value."""
+    predicted_observations = self.kalman_filter.observed_from_state(
+        state, state_var, observation_model,
+        observation_noise=observation_noise)
+    state_update, state_var_update = (
+        self.kalman_filter.posterior_from_prior_state(
+            state, state_var, observation,
+            observation_model=observation_model,
+            predicted_observations=predicted_observations,
+            observation_noise=observation_noise))
+    with self.test_session() as session:
+      evaled_state, evaled_state_var = session.run([state, state_var])
+      for _ in range(300):
+        evaled_state, evaled_state_var = session.run(
+            [state_update, state_var_update],
+            feed_dict={state: evaled_state, state_var: evaled_state_var})
+    self.assertAllClose(expected_state,
+                        evaled_state,
+                        atol=1e-5)
+    self.assertAllClose(
+        expected_state_var,
+        evaled_state_var,
+        atol=1e-5)
+
+  def test_posterior_from_prior_state_univariate(self):
+    self._posterior_from_prior_state_test_template(
+        state=constant_op.constant([[0.3]]),
+        state_var=constant_op.constant([[[1.]]]),
+        observation=constant_op.constant([[1.]]),
+        observation_model=[[[2.]]],
+        observation_noise=[[[0.01]]],
+        expected_state=numpy.array([[0.5]]),
+        expected_state_var=[[[0.]]])
+
+  def test_posterior_from_prior_state_univariate_unit_noise(self):
+    self._posterior_from_prior_state_test_template(
+        state=constant_op.constant([[0.3]]),
+        state_var=constant_op.constant([[[1e10]]]),
+        observation=constant_op.constant([[1.]]),
+        observation_model=[[[2.]]],
+        observation_noise=[[[1.0]]],
+        expected_state=numpy.array([[0.5]]),
+        expected_state_var=[[[1. / (300. * 2. ** 2)]]])
+
+  def test_posterior_from_prior_state_multivariate_2d(self):
+    self._posterior_from_prior_state_test_template(
+        state=constant_op.constant([[1.9, 1.]]),
+        state_var=constant_op.constant([[[1., 0.], [0., 2.]]]),
+        observation=constant_op.constant([[1., 1.]]),
+        observation_model=OBSERVATION_MODEL,
+        observation_noise=OBSERVATION_NOISE,
+        expected_state=numpy.array([[2., 1.]]),
+        expected_state_var=[[[0., 0.], [0., 0.]]])
+
+  def test_posterior_from_prior_state_multivariate_3d(self):
+    self._posterior_from_prior_state_test_template(
+        state=constant_op.constant([[1.9, 1., 5.]]),
+        state_var=constant_op.constant(
+            [[[200., 0., 1.], [0., 2000., 0.], [1., 0., 40000.]]]),
+        observation=constant_op.constant([[1., 1., 3.]]),
+        observation_model=constant_op.constant(
+            [[[0.5, 0., 0.],
+              [0., 10., 0.],
+              [0., 0., 100.]]]),
+        observation_noise=linalg_ops.eye(3) / 10000.,
+        expected_state=numpy.array([[2., .1, .03]]),
+        expected_state_var=numpy.zeros([1, 3, 3]))
+
+  def test_predict_state_mean(self):
+    """Compare state mean transitions with simple hand-computed values."""
+    with self.test_session():
+      state = constant_op.constant([[4., 2.]])
+      state = self.kalman_filter.predict_state_mean(
+          state, self.transition_fn([1]))
+      for _ in range(2):
+        state = self.kalman_filter.predict_state_mean(
+            state, self.transition_fn([1]))
+      self.assertAllClose(
+          numpy.array([[2. * 3. + 4.,  # Slope * time + base
+                        2.]]),
+          state.eval())
+
+  def test_predict_state_var(self):
+    """Compare a variance transition with simple hand-computed values."""
+    with self.test_session():
+      state_var = constant_op.constant([[[1., 0.], [0., 2.]]])
+      state_var = self.kalman_filter.predict_state_var(
+          state_var, self.transition_fn([1]), self.power_sum_fn([1]))
+      self.assertAllClose(
+          numpy.array([[[3.1, 2.0], [2.0, 2.2]]]),
+          state_var.eval())
+
+  def test_do_filter(self):
+    """Tests do_filter.
+
+    Tests that correct values have high probability and incorrect values
+    have low probability when there is low uncertainty.
+    """
+    with self.test_session():
+      state = constant_op.constant([[4., 2.]])
+      state_var = constant_op.constant([[[0.0001, 0.], [0., 0.0001]]])
+      observation = constant_op.constant([[
+          .5 * (
+              4.  # Base
+              + 2.),  # State transition
+          2.
+      ]])
+      estimated_state = self.kalman_filter.predict_state_mean(
+          state, self.transition_fn([1]))
+      estimated_state_covariance = self.kalman_filter.predict_state_var(
+          state_var, self.transition_fn([1]), self.power_sum_fn([1]))
+      (predicted_observation,
+       predicted_observation_covariance) = (
+           self.kalman_filter.observed_from_state(
+               estimated_state, estimated_state_covariance,
+               observation_model=OBSERVATION_MODEL,
+               observation_noise=OBSERVATION_NOISE))
+      (_, _, first_log_prob) = self.kalman_filter.do_filter(
+          estimated_state=estimated_state,
+          estimated_state_covariance=estimated_state_covariance,
+          predicted_observation=predicted_observation,
+          predicted_observation_covariance=predicted_observation_covariance,
+          observation=observation,
+          observation_model=OBSERVATION_MODEL,
+          observation_noise=OBSERVATION_NOISE)
+      self.assertGreater(first_log_prob.eval()[0], numpy.log(0.99))
+
+  def test_predict_n_ahead_mean(self):
+    with self.test_session():
+      original_state = constant_op.constant([[4., 2.]])
+      n = 5
+      iterative_state = original_state
+      for i in range(n):
+        self.assertAllClose(
+            iterative_state.eval(),
+            self.kalman_filter.predict_state_mean(
+                original_state,
+                self.transition_fn([i])).eval())
+        iterative_state = self.kalman_filter.predict_state_mean(
+            iterative_state,
+            self.transition_fn([1]))
+
+  def test_predict_n_ahead_var(self):
+    with self.test_session():
+      original_var = constant_op.constant([[[2., 3.], [4., 5.]]])
+      n = 5
+      iterative_var = original_var
+      for i in range(n):
+        self.assertAllClose(
+            iterative_var.eval(),
+            self.kalman_filter.predict_state_var(
+                original_var,
+                self.transition_fn([i]),
+                self.power_sum_fn([i])).eval())
+        iterative_var = self.kalman_filter.predict_state_var(
+            iterative_var,
+            self.transition_fn([1]),
+            self.power_sum_fn([1]))
+
+
+class KalmanFilterBatchTest(test.TestCase):
+  """KalmanFilter tests with more than one element batches."""
+
+  def test_do_filter_batch(self):
+    """Tests do_filter, in batch mode.
+
+    Tests that correct values have high probability and incorrect values
+    have low probability when there is low uncertainty.
+    """
+    with self.test_session():
+      state = constant_op.constant([[4., 2.], [5., 3.], [6., 4.]])
+      state_var = constant_op.constant(3 * [[[0.0001, 0.], [0., 0.0001]]])
+      observation = constant_op.constant([
+          [
+              .5 * (
+                  4.  # Base
+                  + 2.),  # State transition
+              2.
+          ],
+          [
+              .5 * (
+                  5.  # Base
+                  + 3.),  # State transition
+              3.
+          ],
+          [3.14, 2.71]
+      ])  # Low probability observation
+      kf = kalman_filter.KalmanFilter()
+      transition_fn, power_sum_fn = _powers_and_sums_from_transition_matrix(
+          state_transition=STATE_TRANSITION,
+          state_transition_noise_covariance=STATE_TRANSITION_NOISE,
+          state_noise_transform=STATE_NOISE_TRANSFORM,
+          max_gap=2)
+      estimated_state = kf.predict_state_mean(state, transition_fn(3*[1]))
+      estimated_state_covariance = kf.predict_state_var(
+          state_var, transition_fn(3*[1]), power_sum_fn(3*[1]))
+      observation_model = array_ops.tile(OBSERVATION_MODEL, [3, 1, 1])
+      (predicted_observation,
+       predicted_observation_covariance) = (
+           kf.observed_from_state(
+               estimated_state, estimated_state_covariance,
+               observation_model=observation_model,
+               observation_noise=OBSERVATION_NOISE))
+      (state, state_var, log_prob) = kf.do_filter(
+          estimated_state=estimated_state,
+          estimated_state_covariance=estimated_state_covariance,
+          predicted_observation=predicted_observation,
+          predicted_observation_covariance=predicted_observation_covariance,
+          observation=observation,
+          observation_model=observation_model,
+          observation_noise=OBSERVATION_NOISE)
+      first_log_prob, second_log_prob, third_log_prob = log_prob.eval()
+      self.assertGreater(first_log_prob.sum(), numpy.log(0.99))
+      self.assertGreater(second_log_prob.sum(), numpy.log(0.99))
+      self.assertLess(third_log_prob.sum(), numpy.log(0.01))
+
+  def test_predict_n_ahead_mean(self):
+    with self.test_session():
+      kf = kalman_filter.KalmanFilter()
+      transition_fn, _ = _powers_and_sums_from_transition_matrix(
+          state_transition=STATE_TRANSITION,
+          state_transition_noise_covariance=STATE_TRANSITION_NOISE,
+          state_noise_transform=STATE_NOISE_TRANSFORM,
+          max_gap=2)
+      original_state = constant_op.constant([[4., 2.], [3., 1.], [6., 2.]])
+      state0 = original_state
+      state1 = kf.predict_state_mean(state0, transition_fn(3 * [1]))
+      state2 = kf.predict_state_mean(state1, transition_fn(3 * [1]))
+      batch_eval = kf.predict_state_mean(
+          original_state, transition_fn([1, 0, 2])).eval()
+      self.assertAllClose(state0.eval()[1], batch_eval[1])
+      self.assertAllClose(state1.eval()[0], batch_eval[0])
+      self.assertAllClose(state2.eval()[2], batch_eval[2])
+
+  def test_predict_n_ahead_var(self):
+    with self.test_session():
+      kf = kalman_filter.KalmanFilter()
+      transition_fn, power_sum_fn = _powers_and_sums_from_transition_matrix(
+          state_transition=STATE_TRANSITION,
+          state_transition_noise_covariance=STATE_TRANSITION_NOISE,
+          state_noise_transform=STATE_NOISE_TRANSFORM,
+          max_gap=2)
+      base_var = 2.0 * numpy.identity(2) + numpy.ones([2, 2])
+      original_var = constant_op.constant(
+          numpy.array(
+              [base_var, 2.0 * base_var, 3.0 * base_var], dtype=numpy.float32))
+      var0 = original_var
+      var1 = kf.predict_state_var(
+          var0, transition_fn(3 * [1]), power_sum_fn(3 * [1]))
+      var2 = kf.predict_state_var(
+          var1, transition_fn(3 * [1]), power_sum_fn(3 * [1]))
+      batch_eval = kf.predict_state_var(
+          original_var,
+          transition_fn([1, 0, 2]),
+          power_sum_fn([1, 0, 2])).eval()
+      self.assertAllClose(var0.eval()[1], batch_eval[1])
+      self.assertAllClose(var1.eval()[0], batch_eval[0])
+      self.assertAllClose(var2.eval()[2], batch_eval[2])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/level_trend.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/level_trend.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9d3f55c39d32bb9f14829842fcad85571de6855
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/level_trend.py
@@ -0,0 +1,145 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implements a state space model with level and local linear trends."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import state_space_model
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+
+
+class AdderStateSpaceModel(state_space_model.StateSpaceModel):
+  """A state space model component with level and slope.
+
+  At each timestep, level <- level + slope. Level is observed, slope is not.
+  """
+
+  def __init__(
+      self,
+      use_level_noise=True,
+      configuration=state_space_model.StateSpaceModelConfiguration()):
+    """Configure the model.
+
+    Args:
+      use_level_noise: Whether to model the time series as having level noise.
+      configuration: A StateSpaceModelConfiguration object.
+    """
+    self.use_level_noise = use_level_noise
+    super(AdderStateSpaceModel, self).__init__(
+        configuration=configuration)
+
+  def get_prior_mean(self):
+    """If un-chunked data is available, set initial level to the first value."""
+    with variable_scope.variable_scope(self._variable_scope):
+      if self._input_statistics is not None:
+        # TODO(allenl): Better support for multivariate series here.
+        initial_value = array_ops.stack([
+            math_ops.reduce_mean(
+                self._input_statistics.series_start_moments.mean), 0.
+        ])
+        return initial_value + variable_scope.get_variable(
+            name="prior_state_mean",
+            shape=initial_value.get_shape(),
+            initializer=init_ops.zeros_initializer(),
+            dtype=self.dtype,
+            trainable=self._configuration.trainable_start_state)
+      else:
+        return super(AdderStateSpaceModel, self).get_prior_mean()
+
+  def transition_to_powers(self, powers):
+    """Computes powers of the adder transition matrix efficiently.
+
+    Args:
+      powers: An integer Tensor, shape [...], with powers to raise the
+        transition matrix to.
+    Returns:
+      A floating point Tensor with shape [..., 2, 2] containing:
+        transition^power = [[1., power],
+                            [0., 1.]]
+    """
+    paddings = array_ops.concat(
+        [
+            array_ops.zeros([array_ops.rank(powers), 2], dtype=dtypes.int32),
+            [(0, 1), (1, 0)]
+        ],
+        axis=0)
+    powers_padded = array_ops.pad(powers[..., None, None], paddings=paddings)
+    identity_matrices = linalg_ops.eye(
+        num_rows=2, batch_shape=array_ops.shape(powers), dtype=self.dtype)
+    return identity_matrices + math_ops.cast(powers_padded, self.dtype)
+
+  def transition_power_noise_accumulator(self, num_steps):
+    """Computes power sums in closed form."""
+    def _pack_and_reshape(*values):
+      return array_ops.reshape(
+          array_ops.stack(axis=1, values=values),
+          array_ops.concat(values=[array_ops.shape(num_steps), [2, 2]], axis=0))
+
+    num_steps = math_ops.cast(num_steps, self.dtype)
+    noise_transitions = num_steps - 1
+    noise_transform = ops.convert_to_tensor(self.get_noise_transform(),
+                                            self.dtype)
+    noise_covariance_transformed = math_ops.matmul(
+        math_ops.matmul(noise_transform,
+                        self.state_transition_noise_covariance),
+        noise_transform,
+        adjoint_b=True)
+    # Un-packing the transformed noise as:
+    # [[a b]
+    #  [c d]]
+    a, b, c, d = array_ops.unstack(
+        array_ops.reshape(noise_covariance_transformed, [-1, 4]), axis=1)
+    sum_of_first_n = noise_transitions * (noise_transitions + 1) / 2
+    sum_of_first_n_squares = sum_of_first_n * (2 * noise_transitions + 1) / 3
+    return _pack_and_reshape(
+        num_steps * a + sum_of_first_n * (b + c) + sum_of_first_n_squares * d,
+        num_steps * b + sum_of_first_n * d,
+        num_steps * c + sum_of_first_n * d,
+        num_steps * d)
+
+  def get_state_transition(self):
+    return [[1., 1.],  # Add slope to level
+            [0., 1.]]  # Maintain slope
+
+  def get_noise_transform(self):
+    if self.use_level_noise:
+      return [[1., 0.],
+              [0., 1.]]
+    else:
+      return [[0.],
+              [1.]]
+
+  def get_observation_model(self, times):
+    """Observe level but not slope.
+
+    See StateSpaceModel.get_observation_model.
+
+    Args:
+      times: Unused. See the parent class for details.
+    Returns:
+      A static, univariate observation model for later broadcasting.
+    """
+    del times  # Does not rely on times. Uses broadcasting from the parent.
+    return constant_op.constant([1., 0.], dtype=self.dtype)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/level_trend_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/level_trend_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..19110820860f533ced90d690d55cbd9f611a90e5
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/level_trend_test.py
@@ -0,0 +1,53 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for level and trend state space model components."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import level_trend
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import state_space_model
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import test_utils
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import test
+
+
+class SpecialCaseTests(test.TestCase):
+
+  def test_adder_transition_to_powers(self):
+    num_steps = 3
+    dtype = dtypes.float64
+    adder = level_trend.AdderStateSpaceModel(
+        configuration=state_space_model.StateSpaceModelConfiguration(
+            dtype=dtype))
+    test_utils.transition_power_test_template(
+        test_case=self, model=adder, num_steps=num_steps)
+
+  def test_adder_noise_accumulator(self):
+    num_steps = 3
+    dtype = dtypes.float64
+    use_level_noise = True
+    adder = level_trend.AdderStateSpaceModel(
+        use_level_noise=use_level_noise,
+        configuration=state_space_model.StateSpaceModelConfiguration(
+            dtype=dtype))
+    test_utils.noise_accumulator_test_template(
+        test_case=self, model=adder, num_steps=num_steps)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/periodic.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/periodic.py
new file mode 100644
index 0000000000000000000000000000000000000000..e70db93ea1eb95bfe98ac351a9a2560bd7a0103b
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/periodic.py
@@ -0,0 +1,535 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""State space components for modeling seasonality."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy
+
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import state_space_model
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import math_ops
+
+
+class CycleStateSpaceModel(state_space_model.StateSpaceModel):
+  """A state space model component which cycles between values.
+
+  Stores N values using N - 1 latent values, the Nth being the negative sum of
+  those explicitly stored. At any given timestep one of these values is
+  observed. Noise is assumed to affect only one of the transitions.
+  """
+
+  def __init__(
+      self,
+      periodicity,
+      configuration=state_space_model.StateSpaceModelConfiguration()):
+    self._periodicity = periodicity
+    super(CycleStateSpaceModel, self).__init__(configuration=configuration)
+
+  def get_state_transition(self):
+    return self.transition_to_powers(array_ops.ones([], dtype=dtypes.int32))
+
+  def get_noise_transform(self):
+    # transition_power_noise_accumulator makes assumptions about this
+    # transformation. If the noise transform is modified or overridden,
+    # transition_power_noise_accumulator must be modified as well (or discarded,
+    # as it is simply an optimization).
+    return array_ops.pad(
+        array_ops.ones([1], dtype=self.dtype),
+        paddings=[(0, self._periodicity - 2)])[..., None]
+
+  def transition_to_powers(self, powers):
+    """Computes powers of the cycle transition matrix efficiently.
+
+    Args:
+      powers: An integer Tensor, shape [...], with powers to raise the
+        transition matrix to.
+    Returns:
+      A floating point Tensor with shape [..., self._periodicity - 1,
+      self._periodicity - 1] containing:
+        (transition^power)_{i, j} = {
+           1  if (i - j) % self._periodicity == power % self._periodicity
+          -1  if (i + 1) % self._periodicity == power % self._periodicity
+           0  otherwise}
+    """
+    powers %= self._periodicity
+    range_shape_padded = array_ops.reshape(
+        math_ops.range(self._periodicity - 1, dtype=powers.dtype),
+        array_ops.concat(
+            [
+                array_ops.ones([array_ops.rank(powers)], dtype=dtypes.int32),
+                [self._periodicity - 1]
+            ],
+            axis=0))
+    is_row_negative = math_ops.equal(range_shape_padded + 1, powers[..., None])
+    row_indicator_shape = array_ops.shape(is_row_negative)
+    negative_row_indicator = array_ops.where(is_row_negative, -array_ops.ones(
+        shape=row_indicator_shape, dtype=self.dtype),
+                                             array_ops.zeros(
+                                                 row_indicator_shape,
+                                                 dtype=self.dtype))
+    coord_diff = (range_shape_padded[..., None]
+                  - range_shape_padded[..., None, :])
+    is_one = math_ops.equal(coord_diff % self._periodicity,
+                            powers[..., None, None])
+    positive_ones = array_ops.where(is_one,
+                                    array_ops.ones(
+                                        array_ops.shape(is_one),
+                                        dtype=self.dtype),
+                                    array_ops.zeros(
+                                        array_ops.shape(is_one),
+                                        dtype=self.dtype))
+    return math_ops.cast(positive_ones + negative_row_indicator[..., None],
+                         self.dtype)
+
+  def transition_power_noise_accumulator(
+      self, num_steps, noise_addition_coefficient=1):
+    r"""Sum the transitioned covariance matrix over a number of steps.
+
+    Assumes that state_transition_noise_covariance is a matrix with a single
+    non-zero value in the upper left.
+
+    Args:
+      num_steps: A [...] shape integer Tensor with numbers of steps to compute
+        power sums for.
+      noise_addition_coefficient: A multiplier for the state transition noise
+        covariance (used in ResolutionCycleModel to compute multiples of full
+        period sums).
+    Returns:
+      The computed power sum, with shape [..., state dimension, state
+      dimension] containing:
+
+        [\sum_{p=0}^{num_steps - 1} (
+           state_transition^p
+           * state_transition_noise_covariance
+           * (state_transition^p)^T)]_{i, j} = {
+          -contribution_{j + 1}                   if j == i - 1
+          contribution_{j + 1} + contribution{j}  if j == i
+          -contribution_{j}                       if j == i + 1
+          0                                        otherwise
+        }
+
+        contribution_k = noise_scalar
+          * ((num_steps + self._periodicity - 1 - (k % self._periodicity))
+             // self._periodicity)
+
+      Where contribution_k is the sum of noise_scalar additions to component k
+      of the periodicity.
+    """
+    noise_addition_scalar = array_ops.squeeze(
+        self.state_transition_noise_covariance, axis=[-1, -2])
+    period_range_reshaped = array_ops.reshape(
+        math_ops.range(self._periodicity, dtype=num_steps.dtype),
+        array_ops.concat(
+            [
+                array_ops.ones([array_ops.rank(num_steps)], dtype=dtypes.int32),
+                [self._periodicity]
+            ],
+            axis=0))
+    reversed_remaining_steps = ((period_range_reshaped
+                                 - (num_steps[..., None] - 1))
+                                % self._periodicity)
+    period_additions_reversed = (ops.convert_to_tensor(
+        noise_addition_coefficient,
+        self.dtype)[..., None] * noise_addition_scalar * math_ops.cast(
+            (num_steps[..., None] + reversed_remaining_steps) //
+            self._periodicity,
+            dtype=self.dtype))
+    period_additions_diag = array_ops.matrix_diag(period_additions_reversed)
+    upper_band = array_ops.concat(
+        [
+            array_ops.zeros_like(period_additions_diag[..., :-1, 0:1]),
+            -period_additions_diag[..., :-1, 0:-2]
+        ],
+        axis=-1)
+    lower_band = array_ops.concat(
+        [
+            array_ops.zeros_like(period_additions_diag[..., 0:1, :-1]),
+            -period_additions_diag[..., 0:-2, :-1]
+        ],
+        axis=-2)
+    period_additions_rotated = array_ops.concat(
+        [
+            period_additions_reversed[..., -1:],
+            period_additions_reversed[..., :-2]
+        ],
+        axis=-1)
+    diagonal = array_ops.matrix_diag(period_additions_reversed[..., :-1] +
+                                     period_additions_rotated)
+    return diagonal + lower_band + upper_band
+
+  def get_observation_model(self, times):
+    """Observe only the first of the rotating latent values.
+
+    See StateSpaceModel.get_observation_model.
+    Args:
+      times: Unused. See the parent class for details.
+    Returns:
+      A static, univariate observation model for later broadcasting.
+    """
+    del times  # Does not rely on times. Uses broadcasting from the parent.
+    return array_ops.concat(
+        values=[
+            array_ops.ones([1], dtype=self.dtype), array_ops.zeros(
+                [self._periodicity - 2], dtype=self.dtype)
+        ],
+        axis=0)
+
+
+class ResolutionCycleModel(CycleStateSpaceModel):
+  """A version of CycleStateSpaceModel with variable resolution.
+
+  Cycles between "num_latent_values" latent values over a period of
+  "periodicity", smoothly interpolating. Simply raises the transition matrix
+  from CycleStateSpaceModel to the power (num_latent_values / periodicity).
+
+  Specifically, ResolutionCycleModel uses the following eigendecomposition of
+  the CycleStateSpaceModel matrix (there are several parameterizations, others
+  leading to roots of the matrix with complex values):
+
+    eigenvectors_{i, j}
+        = root_of_unity(floor(j / 2) + 1, i * (-1)^(j + 1))
+          - root_of_unity(floor(j / 2) + 1, (i + 1) * (-1)^(j + 1))
+    eigenvalues_j = root_of_unity(floor(j / 2) + 1, (-1)^j)
+    root_of_unity(root_number, to_power)
+        = exp(to_power * 2 * pi * sqrt(-1) * root_number
+              / num_latent_values)
+
+  The transition matrix for ResolutionCycleModel is then:
+
+    eigenvectors
+    * diag(eigenvalues^(num_latent_values / periodicity))
+    * eigenvectors^-1
+
+  Since the eigenvalues are paired with their conjugates (conj(e^(sqrt(-1)*x)) =
+  e^(-sqrt(-1)*x)), the resulting matrix has real components (this is why only
+  odd numbers of latent values are supported, since the size of the matrix is
+  one less than the number of latent values and there must be an even number of
+  eigenvalues to pair them off).
+
+  See ./g3doc/periodic_multires_derivation.md for details.
+  """
+
+  def __init__(
+      self,
+      num_latent_values,
+      periodicity,
+      near_integer_threshold=1e-8,
+      configuration=state_space_model.StateSpaceModelConfiguration()):
+    """Initialize the ResolutionCycleModel.
+
+    Args:
+      num_latent_values: Controls the representational power and memory usage of
+        the model. The transition matrix has shape [num_latent_values - 1,
+        num_latent_values - 1]. Must be an odd integer (see class docstring for
+        why).
+      periodicity: The number of steps for cyclic behavior. May be a Tensor, and
+        need not be an integer (although integer values greater than
+        num_latent_values have more efficient special cases).
+      near_integer_threshold: When avoiding singularities, controls how close a
+        number should be to that singularity before the special case takes over.
+      configuration: A StateSpaceModelConfiguration object.
+
+    Raises:
+      ValueError: If num_latent_values is not odd.
+    """
+    if num_latent_values % 2 != 1:
+      raise ValueError("Only odd numbers of latent values are supported.")
+    self._num_latent_values = num_latent_values
+    self._true_periodicity = periodicity
+    self._near_integer_threshold = near_integer_threshold
+    super(ResolutionCycleModel, self).__init__(
+        periodicity=num_latent_values,
+        configuration=configuration)
+
+  def _close_to_integer(self, value):
+    value = math_ops.cast(value, self.dtype)
+    return math_ops.less(
+        math_ops.abs(value - gen_math_ops.round(value)),
+        self._near_integer_threshold)
+
+  def transition_to_powers(self, powers):
+    """Computes TransitionMatrix^power efficiently.
+
+    For an n x n transition matrix we have:
+
+      (TransitionMatrix**power)_{i, j) = (-1) ** i * sin(pi * power) / (n + 1)
+          * ((-1) ** j / sin(pi / (n + 1) * (power - i + j))
+             + 1 / sin(pi / (n + 1) * (power - i - 1)))
+
+    The sin(pi * power) term is zero whenever "power" is an integer. However,
+    the 1 / sin(x) terms (cosecants) occasionally (when their arguments are
+    multiples of pi) cancel out this value. The limit as the argument approaches
+    an integer value gives the "correct" result, but computing these separately
+    gives 0 * inf = NaN. Instead, there is a special case for near-integer
+    values.
+
+    Args:
+      powers: A floating point Tensor of powers to raise the transition matrix
+        to.
+    Returns:
+      A [..., self._num_latent_values - 1, self._num_latent_values - 1] floating
+        point Tensor with the transition matrix raised to each power in
+        `powers`.
+
+    """
+    num_latent_values_float = math_ops.cast(self._num_latent_values, self.dtype)
+    latent_values_per_period = (num_latent_values_float / math_ops.cast(
+        self._true_periodicity, dtype=self.dtype))
+    original_matrix_powers = (math_ops.cast(powers, self.dtype) *
+                              latent_values_per_period)
+    global_coeff = (math_ops.sin(original_matrix_powers * numpy.pi) /
+                    num_latent_values_float)[..., None, None]
+    matrix_dimension_range = array_ops.reshape(
+        math_ops.range(self._num_latent_values - 1),
+        array_ops.concat(
+            [
+                array_ops.ones(
+                    [array_ops.rank(original_matrix_powers)],
+                    dtype=dtypes.int32), [self._num_latent_values - 1]
+            ],
+            axis=0))
+    matrix_dimension_range_float = math_ops.cast(matrix_dimension_range,
+                                                 self.dtype)
+    alternating = math_ops.cast(1 - 2 * (matrix_dimension_range % 2),
+                                self.dtype)
+    row_addend = 1. / math_ops.sin(numpy.pi / num_latent_values_float * (
+        original_matrix_powers[..., None] - matrix_dimension_range_float - 1))
+    column_minus_row = (matrix_dimension_range_float[..., None, :]
+                        - matrix_dimension_range_float[..., None])
+    full_matrix_addend = (alternating[..., None, :] / math_ops.sin(
+        numpy.pi / num_latent_values_float *
+        (original_matrix_powers[..., None, None] + column_minus_row)))
+    continuous_construction = global_coeff * alternating[..., None] * (
+        row_addend[..., None] + full_matrix_addend)
+    # For integer powers, the above formula is only correct in the limit,
+    # yielding NaNs as written. We defer to the super-class in such cases, which
+    # computes integer powers exactly.
+    return array_ops.where(
+        self._close_to_integer(original_matrix_powers),
+        super(ResolutionCycleModel, self).transition_to_powers(
+            math_ops.cast(
+                gen_math_ops.round(original_matrix_powers), dtypes.int64)),
+        continuous_construction)
+
+  def transition_power_noise_accumulator(self, num_steps):
+    """Sum the transitioned covariance matrix over a number of steps.
+
+    Args:
+      num_steps: An integer Tensor of any shape [...] indicating the number of
+        steps to compute for each part of the batch.
+
+    Returns:
+      A [..., self._num_latent_values - 1, self._num_latent_values - 1] floating
+      point Tensor corresponding to each requested number of steps, containing:
+
+          sum_{i=1}^{steps} transition^i * noise_covariance
+              * (transition^i)^T
+    """
+
+    def _whole_periods_folded():
+      """A more efficient special casing for integer periods.
+
+      We knock off full periods, leaving at most self._true_periodicity steps to
+      compute.
+
+      Returns:
+        A tuple of (remaining_whole_steps, current_accumulation):
+          remaining_whole_steps: An integer Tensor with the same shape as the
+            `num_steps` argument to `transition_power_noise_accumulator`,
+            indicating the reduced number of steps which must be computed
+            sequentially and added to `current_accumulation`.
+          current_accumulation: A [..., self._num_latent_values - 1,
+            self._num_latent_values - 1] floating point Tensor corresponding to
+            the accumulations for steps which were computed in this function.
+      """
+      original_transition_noise_addition_coefficient = (math_ops.cast(
+          self._true_periodicity, self.dtype) / math_ops.cast(
+              self._num_latent_values, self.dtype))
+      full_period_accumulation = super(
+          ResolutionCycleModel, self).transition_power_noise_accumulator(
+              noise_addition_coefficient=
+              original_transition_noise_addition_coefficient,
+              num_steps=ops.convert_to_tensor(
+                  self._num_latent_values, dtype=num_steps.dtype))
+      periodicity_integer = math_ops.cast(self._true_periodicity,
+                                          num_steps.dtype)
+      full_periods = math_ops.cast(num_steps // periodicity_integer, self.dtype)
+      current_accumulation = full_periods[..., None, None] * array_ops.reshape(
+          full_period_accumulation,
+          array_ops.concat(
+              [
+                  array_ops.ones(
+                      [array_ops.rank(full_periods)], dtype=dtypes.int32),
+                  array_ops.shape(full_period_accumulation)
+              ],
+              axis=0))
+      remaining_whole_steps = num_steps % periodicity_integer
+      return remaining_whole_steps, current_accumulation
+    def _no_whole_period_computation():
+      """A less efficient special casing for real valued periods.
+
+      This special casing is still preferable to computing using sequential
+      matrix multiplies (parallelizable, more numerically stable), but is linear
+      in the number of steps.
+
+      Returns:
+        Same shapes and types as `_whole_periods_folded`, but no folding is done
+        in this function.
+      """
+      current_accumulation = array_ops.zeros(
+          array_ops.concat(
+              [
+                  array_ops.shape(num_steps),
+                  [self._num_latent_values - 1, self._num_latent_values - 1]
+              ],
+              axis=0),
+          dtype=self.dtype)
+      remaining_whole_steps = num_steps
+      return remaining_whole_steps, current_accumulation
+    # Decide whether it's feasible to compute whole periods in closed form,
+    # taking advantage of the fact that a sum over self._true_periodicity steps
+    # in our transition matrix is proportional to a sum over
+    # self._num_latent_values steps in the unmodified matrix (because each
+    # latent value gets the same treatment). This is possible for integer
+    # self._true_periodicity, since we stay aligned to integer steps. For real
+    # valued self._true_periodicity, or when the cyclic behavior is a higher
+    # resolution than 1 per step, taking whole periods leads to misalignment
+    # with integer steps, which would be difficult to recover from.
+    remaining_whole_steps, current_accumulation = control_flow_ops.cond(
+        self._whole_period_folding(), _whole_periods_folded,
+        _no_whole_period_computation)
+    steps_to_compute = math_ops.reduce_max(remaining_whole_steps)
+    remaining_step_noise_additions = self._power_sum_array(steps_to_compute)
+    noise_addition_scalar = array_ops.squeeze(
+        self.state_transition_noise_covariance, axis=[-1, -2])
+    return current_accumulation + noise_addition_scalar * array_ops.gather(
+        remaining_step_noise_additions, indices=remaining_whole_steps)
+
+  def _whole_period_folding(self):
+    """Decides whether computing a whole period maintains alignment."""
+    return math_ops.logical_and(
+        self._close_to_integer(self._true_periodicity),
+        math_ops.greater_equal(self._true_periodicity, self._num_latent_values))
+
+  def _power_sum_array(self, max_remaining_steps):
+    r"""Computes \sum_{i=0}^{N-1} A^i B (A^i)^T for N=0..max_remaining_steps.
+
+    A is the transition matrix and B is the noise covariance.
+
+    This is more efficient in practice than math_utils.power_sums_tensor, since
+    each A^i B (A^i)^T term has a closed-form expression not depending on i - 1.
+    Thus vectorization can replace explicit looping.
+
+    Uses a cumulative sum on the following expression:
+
+      (transition^p * transition_covariance * (transition^p)^T)_{i, j}
+        = (-1)^(i + j) * sin^2(pi * p) / num_latent_values^2
+          * (1/sin(pi / num_latent_values * (p - i))
+             + 1/sin(pi / num_latent_values * (p - i - 1)))
+          * (1/sin(pi / num_latent_values * (p - j))
+             + 1/sin(pi / num_latent_values * (p - j - 1)))
+
+    The expression being derived from the eigenvectors and eigenvalues given in
+    the class docstring (and as with CycleStateSpaceModel taking advantage of
+    the sparsity of the transition covariance).
+
+    Args:
+      max_remaining_steps: A scalar integer Tensor indicating the number of
+        non-trivial values to compute.
+    Returns:
+      A [max_remaining_steps + 1, self._num_latent_values - 1,
+      self._num_latent_values - 1] floating point Tensor S with cumulative power
+      sums.
+
+      S[N] = \sum_{i=0}^{N-1} A^i B (A^i)^T
+        S[0] is the zero matrix
+        S[1] is B
+        S[2] is A B A^T + B
+
+    """
+    num_latent_values_float = math_ops.cast(self._num_latent_values, self.dtype)
+    latent_values_per_period = (num_latent_values_float / math_ops.cast(
+        self._true_periodicity, dtype=self.dtype))
+    original_matrix_powers = (math_ops.cast(
+        math_ops.range(max_remaining_steps),
+        self.dtype) * latent_values_per_period)
+    matrix_dimension_range = math_ops.range(
+        self._num_latent_values - 1)[None, ...]
+    matrix_dimension_range_float = math_ops.cast(matrix_dimension_range,
+                                                 self.dtype)
+    def _cosecant_with_freq(coefficient):
+      return 1. / math_ops.sin(numpy.pi / num_latent_values_float * coefficient)
+    power_minus_index = (original_matrix_powers[..., None]
+                         - matrix_dimension_range_float)
+    mesh_values = (_cosecant_with_freq(power_minus_index)
+                   + _cosecant_with_freq(power_minus_index - 1.))
+    meshed = mesh_values[..., None, :] * mesh_values[..., None]
+    full_matrix_alternating = math_ops.cast(1 - 2 * (
+        (matrix_dimension_range[..., None, :] +
+         matrix_dimension_range[..., None]) % 2), self.dtype)
+    def _sine_discontinuity(value):
+      """A special case for dealing with discontinuities.
+
+      Decides whether `value`  is close to an integer, and if so computes:
+
+        lim x->n |sin(x * pi)| / sin(x * pi) = sign(sin(n * pi))
+                                             = cos(n * pi)
+
+      Args:
+        value: The floating point Tensor value which may lead to a
+            discontinuity.
+      Returns:
+        A tuple of (is_discontinuous, sign):
+          is_discontinuous: A boolean Tensor of the same shape as `value`,
+              indicating whether it is near an integer.
+          sign: A floating point Tensor indicating the sign of the discontinuity
+            (being near 1 or -1 when `is_discontinuous` is True), of the same
+            shape and type as `value`.
+      """
+      normalized = value / num_latent_values_float
+      is_discontinuous = self._close_to_integer(normalized)
+      sign = math_ops.cos(normalized * numpy.pi)
+      return is_discontinuous, sign
+    index_discontinuous, index_sign = _sine_discontinuity(
+        original_matrix_powers[..., None]
+        - matrix_dimension_range_float)
+    index_minus_discontinuous, index_minus_sign = _sine_discontinuity(
+        original_matrix_powers[..., None]
+        - matrix_dimension_range_float
+        - 1)
+    ones_mask_vector = math_ops.logical_or(index_discontinuous,
+                                           index_minus_discontinuous)
+    ones_sign_vector = array_ops.where(index_discontinuous, index_sign,
+                                       index_minus_sign)
+    ones_mask = math_ops.logical_and(ones_mask_vector[..., None],
+                                     ones_mask_vector[..., None, :])
+    zeros_mask = self._close_to_integer(original_matrix_powers)
+    zeroed = array_ops.where(zeros_mask, array_ops.zeros_like(meshed), meshed)
+    global_coefficient = (math_ops.sin(numpy.pi * original_matrix_powers) /
+                          num_latent_values_float)
+    masked_meshed = array_ops.where(
+        ones_mask, ones_sign_vector[..., None] * ones_sign_vector[..., None, :],
+        zeroed * global_coefficient[..., None, None]**2)
+    powers_above_zero = full_matrix_alternating * masked_meshed
+    return array_ops.pad(
+        math_ops.cumsum(powers_above_zero), [(1, 0), (0, 0), (0, 0)])
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/periodic_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/periodic_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bd68e8ad2ef78f40ca382b7bc0b55c2a235aa23
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/periodic_test.py
@@ -0,0 +1,81 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for periodic state space model components."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import periodic
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import state_space_model
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import test_utils
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import test
+
+
+class SpecialCaseTests(test.TestCase):
+
+  def test_cycle_transition_to_powers(self):
+    num_steps = 3
+    dtype = dtypes.float64
+    periodicity = 3
+    cycle = periodic.CycleStateSpaceModel(
+        periodicity=periodicity,
+        configuration=state_space_model.StateSpaceModelConfiguration(
+            dtype=dtype))
+    test_utils.transition_power_test_template(
+        test_case=self, model=cycle, num_steps=num_steps)
+
+  def test_resolution_cycle_transition_to_powers(self):
+    num_steps = 3
+    dtype = dtypes.float64
+    latent_values = 3
+    periodicity = latent_values - 1
+    cycle = periodic.ResolutionCycleModel(
+        num_latent_values=latent_values,
+        periodicity=periodicity,
+        configuration=state_space_model.StateSpaceModelConfiguration(
+            dtype=dtype))
+    test_utils.transition_power_test_template(
+        test_case=self, model=cycle, num_steps=num_steps)
+
+  def test_cycle_noise_accumulator(self):
+    num_steps = 3
+    dtype = dtypes.float64
+    periodicity = 3
+    cycle = periodic.CycleStateSpaceModel(
+        periodicity=periodicity,
+        configuration=state_space_model.StateSpaceModelConfiguration(
+            dtype=dtype))
+    test_utils.noise_accumulator_test_template(
+        test_case=self, model=cycle, num_steps=num_steps)
+
+  def test_resolution_cycle_noise_accumulator(self):
+    num_steps = 3
+    dtype = dtypes.float64
+    latent_values = 3
+    periodicity = latent_values + 0.1
+    cycle = periodic.ResolutionCycleModel(
+        num_latent_values=latent_values,
+        periodicity=periodicity,
+        configuration=state_space_model.StateSpaceModelConfiguration(
+            dtype=dtype))
+    test_utils.noise_accumulator_test_template(
+        test_case=self, model=cycle, num_steps=num_steps)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a9660b400d08a0397103676344ea1969fbc1f7a
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model.py
@@ -0,0 +1,1207 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Abstract base for state space models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import collections
+
+import numpy
+
+from tensorflow.contrib.layers.python.layers import layers
+
+from tensorflow.contrib.timeseries.python.timeseries import math_utils
+from tensorflow.contrib.timeseries.python.timeseries import model
+from tensorflow.contrib.timeseries.python.timeseries import model_utils
+from tensorflow.contrib.timeseries.python.timeseries.feature_keys import TrainEvalFeatures
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import kalman_filter
+
+from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+
+
+class StateSpaceModelConfiguration(
+    collections.namedtuple(
+        typename="StateSpaceModelConfiguration",
+        field_names=[
+            "num_features", "use_observation_noise", "dtype",
+            "covariance_prior_fn", "bayesian_prior_weighting",
+            "filtering_postprocessor", "trainable_start_state",
+            "exogenous_noise_increases", "exogenous_noise_decreases",
+            "exogenous_feature_columns", "exogenous_update_condition",
+            "filtering_maximum_posterior_variance_ratio",
+            "filtering_minimum_posterior_variance",
+            "transition_covariance_initial_log_scale_bias",
+            "static_unrolling_window_size_threshold"])):
+  """Configuration options for StateSpaceModels."""
+
+  def __new__(
+      cls,
+      num_features=1,
+      use_observation_noise=True,
+      dtype=dtypes.float32,
+      covariance_prior_fn=math_utils.log_noninformative_covariance_prior,
+      bayesian_prior_weighting=True,
+      filtering_postprocessor=None,
+      trainable_start_state=False,
+      exogenous_noise_increases=True,
+      exogenous_noise_decreases=False,
+      exogenous_feature_columns=None,
+      exogenous_update_condition=None,
+      filtering_maximum_posterior_variance_ratio=1e6,
+      filtering_minimum_posterior_variance=0.,
+      transition_covariance_initial_log_scale_bias=-5.,
+      static_unrolling_window_size_threshold=None):
+    """Configuration options for StateSpaceModels.
+
+    Args:
+      num_features: Output dimension for model
+      use_observation_noise: If true, observations are modeled as noisy
+        functions of the current state. If false, observations are a
+        deterministic function of the current state. Only applicable to the
+        top-level model in an ensemble. Consider also changing the
+        transition_covariance_initial_log_scale_bias when disabling observation
+        noise, as its default setting assumes that observation noise is part of
+        the model.
+      dtype: The float dtype to use when defining the model.
+      covariance_prior_fn: A function mapping from a covariance matrix to a
+          scalar value (e.g. log likelihood) which can be summed across
+          matrices. Defaults to an independent Jeffreys prior on the diagonal
+          elements (regularizing as log(1. / variance)). To use a flat prior
+          (i.e. no regularization), set to `lambda _: 0.`.  Defaults to
+          relatively uninformative priors on state transition and observation
+          noise, which have the effect of encouraging low-noise solutions which
+          provide confident predictions when possible. Without regularization,
+          transition noise tends to remain high, and multi-step predictions are
+          under-confident.
+      bayesian_prior_weighting: If True, weights the prior (covariance_prior_fn)
+          based on an estimate of the full dataset size. If False, weights it
+          based on the mini-batch window size, which (while statistically
+          improper) can lead to more desirable low-noise solutions in cases
+          where the full dataset is large enough to overwhelm the prior.
+      filtering_postprocessor: A FilteringStepPostprocessor object to use,
+          useful for ignoring anomalies in training data.
+      trainable_start_state: If True, start state may depend on trainable
+          Variables. If False, it will not.
+      exogenous_noise_increases: If True, exogenous regressors can add to model
+          state, increasing uncertainty. If both this parameter and
+          exogenous_noise_decreases are False, exogenous regressors are ignored.
+      exogenous_noise_decreases: If True, exogenous regressors can "set" model
+          state, decreasing uncertainty. If both this parameter and
+          exogenous_noise_increases are False, exogenous regressors are ignored.
+      exogenous_feature_columns: A list of tf.contrib.layers.FeatureColumn
+          objects (for example tf.contrib.layers.embedding_column) corresponding
+          to exogenous features which provide extra information to the model but
+          are not part of the series to be predicted. Passed to
+          tf.contrib.layers.input_from_feature_columns.
+      exogenous_update_condition: A function taking two Tensor arguments `times`
+          (shape [batch size]) and `features` (a dictionary mapping exogenous
+          feature keys to Tensors with shapes [batch size, ...]) and returning a
+          boolean Tensor with shape [batch size] indicating whether state should
+          be updated using exogenous features for each part of the batch. Where
+          it is False, no exogenous update is performed. If None (default),
+          exogenous updates are always performed. Useful for avoiding "leaky"
+          frequent exogenous updates when sparse updates are desired. Called
+          only during graph construction.
+      filtering_maximum_posterior_variance_ratio: The maximum allowed ratio of
+          two diagonal entries in a state covariance matrix just prior to
+          filtering. Lower values mean that filtering will be more numerically
+          stable, at the cost of artificially increasing estimated uncertainty
+          in some cases. This parameter can be important when learning a
+          transition matrix.
+      filtering_minimum_posterior_variance: The minimum diagonal value in a
+          state covariance matrix just prior to filtering, preventing numerical
+          instability due to deterministic beliefs (sometimes an issue when
+          learning transition matrices). This value should be set several orders
+          of magnitude below any expected minimum state uncertainty.
+      transition_covariance_initial_log_scale_bias: Controls the initial
+          tradeoff between the transition noise covariance matrix and the
+          observation noise covariance matrix, on a log scale (the elements of
+          the transition noise covariance matrix are proportional to `e^{X +
+          transition_covariance_initial_log_scale_bias}` where `X` is learned
+          and may depend on input statistics, observation noise covariance is
+          proportional to `e^{Y -
+          transition_covariance_initial_log_scale_bias}`). For models *with*
+          observation noise, -5 is a reasonable value. Models which do not use
+          observation noise, and are not part of an ensemble which does use
+          observation noise, should have this set to 0 or more to avoid
+          numerical issues due to filtering with too little noise.
+      static_unrolling_window_size_threshold: Only relevant for the top-level
+          StateSpaceModel in an ensemble; enables switching between static and
+          dynamic looping (if not None, default, meaning that no static
+          unrolling is performed) based on the window size (windows with this
+          size and smaller will have their graphs unrolled statically). See the
+          SequentialTimeSeriesModel constructor for details.
+    Returns:
+      A StateSpaceModelConfiguration object.
+    """
+    if exogenous_feature_columns is None:
+      exogenous_feature_columns = []
+    return super(StateSpaceModelConfiguration, cls).__new__(
+        cls, num_features, use_observation_noise, dtype,
+        covariance_prior_fn, bayesian_prior_weighting,
+        filtering_postprocessor, trainable_start_state,
+        exogenous_noise_increases, exogenous_noise_decreases,
+        exogenous_feature_columns, exogenous_update_condition,
+        filtering_maximum_posterior_variance_ratio,
+        filtering_minimum_posterior_variance,
+        transition_covariance_initial_log_scale_bias,
+        static_unrolling_window_size_threshold)
+
+
+class StateSpaceModel(model.SequentialTimeSeriesModel):
+  """Base class for linear state space models.
+
+  Sub-classes can specify the model to be learned by overriding
+  get_state_transition, get_noise_transform, and get_observation_model.
+
+  See kalman_filter.py for a detailed description of the class of models covered
+  by StateSpaceModel.
+
+  Briefly, state space models are defined by a state transition equation:
+
+  state[t] = StateTransition * state[t-1] + NoiseTransform * StateNoise[t]
+             + ExogenousNoiseIncreasing[t]
+  StateNoise[t] ~ Gaussian(0, StateNoiseCovariance)
+  ExogenousNoiseIncreasing[t] ~ Gaussian(ExogenousNoiseIncreasingMean[t],
+                                         ExogenousNoiseIncreasingCovariance[t])
+
+  And an observation model:
+
+  observation[t] = ObservationModel * state[t] + ObservationNoise[t]
+  ObservationNoise[t] ~ Gaussian(0, ObservationNoiseCovariance)
+
+  Additionally, exogenous regressors can act as observations, decreasing
+  uncertainty:
+
+  ExogenousNoiseDecreasingObservation[t] ~ Gaussian(
+      ExogenousNoiseDecreasingMean[t], ExogenousNoiseDecreasingCovariance[t])
+
+  Attributes:
+    kalman_filter: If initialize_graph has been called, the initialized
+        KalmanFilter to use for inference. None otherwise.
+    prior_state_mean: If initialize_graph has been called, a
+        Variable-parameterized Tensor with shape [state dimension];
+        the initial prior mean for one or more time series. None otherwise.
+    prior_state_var: If initialize_graph has been called, a
+        Variable-parameterized Tensor with shape [state dimension x state
+        dimension]; the initial prior covariance. None otherwise.
+    state_transition_noise_covariance: If initialize_graph has been called, a
+        Variable-parameterized Tensor with shape [state noise dimension x state
+        noise dimension] indicating the amount of noise added at each
+        transition.
+  """
+
+  def __init__(self, configuration):
+    """Initialize a state space model.
+
+    Args:
+      configuration: A StateSpaceModelConfiguration object.
+    """
+    self._configuration = configuration
+    if configuration.filtering_postprocessor is not None:
+      filtering_postprocessor_names = (
+          configuration.filtering_postprocessor.output_names)
+    else:
+      filtering_postprocessor_names = []
+    super(StateSpaceModel, self).__init__(
+        train_output_names=(["mean", "covariance", "log_likelihood"]
+                            + filtering_postprocessor_names),
+        predict_output_names=["mean", "covariance"],
+        num_features=configuration.num_features,
+        dtype=configuration.dtype,
+        exogenous_feature_columns=configuration.exogenous_feature_columns,
+        exogenous_update_condition=configuration.exogenous_update_condition,
+        static_unrolling_window_size_threshold=
+        configuration.static_unrolling_window_size_threshold)
+    self._kalman_filter = None
+    self.prior_state_mean = None
+    self.prior_state_var = None
+    self.state_transition_noise_covariance = None
+    self._total_observation_count = None
+    self._observation_noise_covariance = None
+    # Capture the current variable scope and use it to define all model
+    # variables. Especially useful for ensembles, where variables may be defined
+    # for every component model in one function call, which would otherwise
+    # prevent the user from separating variables from different models into
+    # different scopes.
+    self._variable_scope = variable_scope.get_variable_scope()
+
+  def transition_power_noise_accumulator(self, num_steps):
+    r"""Sum a transitioned covariance matrix over a number of steps.
+
+    Computes
+
+      \sum_{i=0}^{num_steps - 1} (
+        state_transition^i
+        * state_transition_noise_covariance
+        * (state_transition^i)^T)
+
+    If special cases are available, overriding this function can lead to more
+    efficient inferences.
+
+    Args:
+      num_steps: A [...] shape integer Tensor with numbers of steps to compute
+        power sums for.
+    Returns:
+      The computed power sum, with shape [..., state dimension, state
+      dimension].
+    """
+    # TODO(allenl): This general case should use cumsum if transition_to_powers
+    # can be computed in constant time (important for correlated ensembles,
+    # where transition_power_noise_accumulator special cases cannot be
+    # aggregated from member models).
+    noise_transform = ops.convert_to_tensor(self.get_noise_transform(),
+                                            self.dtype)
+    noise_transformed = math_ops.matmul(
+        math_ops.matmul(noise_transform,
+                        self.state_transition_noise_covariance),
+        noise_transform,
+        transpose_b=True)
+    noise_additions = math_utils.power_sums_tensor(
+        math_ops.reduce_max(num_steps) + 1,
+        ops.convert_to_tensor(self.get_state_transition(), dtype=self.dtype),
+        noise_transformed)
+    return array_ops.gather(noise_additions, indices=num_steps)
+
+  def transition_to_powers(self, powers):
+    """Raise the transition matrix to a batch of powers.
+
+    Computes state_transition^powers. If special cases are available, overriding
+    this function can lead to more efficient inferences.
+
+    Args:
+      powers: A [...] shape integer Tensor with powers to raise the transition
+        matrix to.
+    Returns:
+      The computed matrix powers, with shape [..., state dimension, state
+      dimension].
+    """
+    return math_utils.matrix_to_powers(
+        ops.convert_to_tensor(self.get_state_transition(), dtype=self.dtype),
+        powers)
+
+  def _window_initializer(self, times, state):
+    """Prepare to impute across the gaps in a window."""
+    _, _, priors_from_time = state
+    times = ops.convert_to_tensor(times)
+    priors_from_time = ops.convert_to_tensor(priors_from_time)
+    with ops.control_dependencies([
+        control_flow_ops.Assert(
+            math_ops.reduce_all(priors_from_time <= times[:, 0]),
+            [priors_from_time, times[:, 0]],
+            summarize=100)
+    ]):
+      times = array_ops.identity(times)
+    intra_batch_gaps = array_ops.reshape(times[:, 1:] - times[:, :-1], [-1])
+    starting_gaps = times[:, 0] - priors_from_time
+    # Pre-define transition matrices raised to powers (and their sums) for every
+    # gap in this window. This avoids duplicate computation (for example many
+    # steps will use the transition matrix raised to the first power) and
+    # batches the computation rather than doing it inside the per-step loop.
+    unique_gaps, _ = array_ops.unique(
+        array_ops.concat([intra_batch_gaps, starting_gaps], axis=0))
+    self._window_power_sums = self.transition_power_noise_accumulator(
+        unique_gaps)
+    self._window_transition_powers = self.transition_to_powers(unique_gaps)
+    self._window_gap_sizes = unique_gaps
+
+  def _lookup_window_caches(self, caches, indices):
+    _, window_power_ids = array_ops.unique(
+        array_ops.concat(
+            [
+                self._window_gap_sizes, math_ops.cast(
+                    indices, self._window_gap_sizes.dtype)
+            ],
+            axis=0))
+    all_gathered_indices = []
+    for cache in caches:
+      gathered_indices = array_ops.gather(
+          cache, window_power_ids[-array_ops.shape(indices)[0]:])
+      gathered_indices.set_shape(indices.get_shape().concatenate(
+          gathered_indices.get_shape()[-2:]))
+      all_gathered_indices.append(gathered_indices)
+    return all_gathered_indices
+
+  def _cached_transition_powers_and_sums(self, num_steps):
+    return self._lookup_window_caches(
+        caches=[self._window_transition_powers, self._window_power_sums],
+        indices=num_steps)
+
+  def _imputation_step(self, current_times, state):
+    """Add state transition noise to catch `state` up to `current_times`.
+
+    State space models are inherently sequential, so we need to "predict
+    through" any missing time steps to catch up each element of the batch to its
+    next observation/prediction time.
+
+    Args:
+      current_times: A [batch size] Tensor of times to impute up to, not
+          inclusive.
+      state: A tuple of (mean, covariance, previous_times) having shapes
+          mean; [batch size x state dimension]
+          covariance; [batch size x state dimension x state dimension]
+          previous_times; [batch size]
+    Returns:
+      Imputed model state corresponding to the `state` argument.
+    """
+    estimated_state, estimated_state_var, previous_times = state
+    catchup_times = current_times - previous_times
+    non_negative_assertion = control_flow_ops.Assert(
+        math_ops.reduce_all(catchup_times >= 0), [
+            "Negative imputation interval", catchup_times, current_times,
+            previous_times
+        ],
+        summarize=100)
+    with ops.control_dependencies([non_negative_assertion]):
+      transition_matrices, transition_noise_sums = (  # pylint: disable=unbalanced-tuple-unpacking
+          self._cached_transition_powers_and_sums(catchup_times))
+      estimated_state = self._kalman_filter.predict_state_mean(
+          estimated_state, transition_matrices)
+      estimated_state_var = self._kalman_filter.predict_state_var(
+          estimated_state_var, transition_matrices, transition_noise_sums)
+    return (estimated_state, estimated_state_var,
+            previous_times + catchup_times)
+
+  def _filtering_step(self, current_times, current_values, state, predictions):
+    """Compute posteriors and accumulate one-step-ahead predictions.
+
+    Args:
+      current_times: A [batch size] Tensor for times for each observation.
+      current_values: A [batch size] Tensor of values for each observation.
+      state: A tuple of (mean, covariance, previous_times) having shapes
+          mean; [batch size x state dimension]
+          covariance; [batch size x state dimension x state dimension]
+          previous_times; [batch size]
+      predictions: A dictionary containing mean and covariance Tensors, the
+          output of _prediction_step.
+    Returns:
+      A tuple of (posteriors, outputs):
+        posteriors: Model state updated to take `current_values` into account.
+        outputs: The `predictions` dictionary updated to include "loss" and
+            "log_likelihood" entries (loss simply being negative log
+            likelihood).
+    """
+    estimated_state, estimated_state_covariance, previous_times = state
+    observation_model = self.get_broadcasted_observation_model(current_times)
+    imputed_to_current_step_assert = control_flow_ops.Assert(
+        math_ops.reduce_all(math_ops.equal(current_times, previous_times)),
+        ["Attempted to perform filtering without imputation/prediction"])
+    with ops.control_dependencies([imputed_to_current_step_assert]):
+      estimated_state_covariance = math_utils.clip_covariance(
+          estimated_state_covariance,
+          self._configuration.filtering_maximum_posterior_variance_ratio,
+          self._configuration.filtering_minimum_posterior_variance)
+      (filtered_state, filtered_state_covariance,
+       log_prob) = self._kalman_filter.do_filter(
+           estimated_state=estimated_state,
+           estimated_state_covariance=estimated_state_covariance,
+           predicted_observation=predictions["mean"],
+           predicted_observation_covariance=predictions["covariance"],
+           observation=current_values,
+           observation_model=observation_model,
+           observation_noise=self._observation_noise_covariance)
+    filtered_state = (filtered_state, filtered_state_covariance, current_times)
+    log_prob.set_shape(current_times.get_shape())
+    predictions["loss"] = -log_prob
+    predictions["log_likelihood"] = log_prob
+    if self._configuration.filtering_postprocessor is not None:
+      return self._configuration.filtering_postprocessor.process_filtering_step(
+          current_times=current_times,
+          current_values=current_values,
+          predicted_state=state,
+          filtered_state=filtered_state,
+          outputs=predictions)
+    return (filtered_state, predictions)
+
+  def _prediction_step(self, current_times, state):
+    """Make a prediction based on `state`.
+
+    Computes predictions based on the current `state`, checking that it has
+    already been updated (in `_imputation_step`) to `current_times`.
+
+    Args:
+      current_times: A [batch size] Tensor for times to make predictions for.
+      state: A tuple of (mean, covariance, previous_times) having shapes
+          mean; [batch size x state dimension]
+          covariance; [batch size x state dimension x state dimension]
+          previous_times; [batch size]
+    Returns:
+      A tuple of (updated state, predictions):
+        updated state: Model state with added transition noise.
+        predictions: A dictionary with "mean" and "covariance", having shapes
+            "mean": [batch size x num features]
+            "covariance: [batch size x num features x num features]
+    """
+    estimated_state, estimated_state_var, previous_times = state
+    advanced_to_current_assert = control_flow_ops.Assert(
+        math_ops.reduce_all(math_ops.equal(current_times, previous_times)),
+        ["Attempted to predict without imputation"])
+    with ops.control_dependencies([advanced_to_current_assert]):
+      observation_model = self.get_broadcasted_observation_model(current_times)
+      predicted_obs, predicted_obs_var = (
+          self._kalman_filter.observed_from_state(
+              state_mean=estimated_state,
+              state_var=estimated_state_var,
+              observation_model=observation_model,
+              observation_noise=self._observation_noise_covariance))
+      predicted_obs_var.set_shape(
+          ops.convert_to_tensor(current_times).get_shape()
+          .concatenate([self.num_features, self.num_features]))
+    predicted_obs.set_shape(current_times.get_shape().concatenate(
+        (self.num_features,)))
+    predicted_obs_var.set_shape(current_times.get_shape().concatenate(
+        (self.num_features, self.num_features)))
+    predictions = {
+        "mean": predicted_obs,
+        "covariance": predicted_obs_var}
+    state = (estimated_state, estimated_state_var, current_times)
+    return (state, predictions)
+
+  def _exogenous_noise_decreasing(self, current_times, exogenous_values, state):
+    """Update state with exogenous regressors, decreasing uncertainty.
+
+    Constructs a mean and covariance based on transformations of
+    `exogenous_values`, then performs Bayesian inference on the constructed
+    observation. This has the effect of lowering uncertainty.
+
+    This update refines or overrides previous inferences, useful for modeling
+    exogenous inputs which "set" state, e.g. we dumped boiling water on the
+    thermometer so we're pretty sure it's 100 degrees C.
+
+    Args:
+      current_times: A [batch size] Tensor of times for the exogenous values
+          being input.
+      exogenous_values: A [batch size x exogenous input dimension] Tensor of
+          exogenous values for each part of the batch.
+      state: A tuple of (mean, covariance, previous_times) having shapes
+          mean; [batch size x state dimension]
+          covariance; [batch size x state dimension x state dimension]
+          previous_times; [batch size]
+    Returns:
+      Updated state taking the exogenous regressors into account (with lower
+      uncertainty than the input state).
+
+    """
+    estimated_state, estimated_state_covariance, previous_times = state
+    state_transition = ops.convert_to_tensor(
+        self.get_state_transition(), dtype=self.dtype)
+    state_dimension = state_transition.get_shape()[0].value
+    # Learning the observation model would be redundant since we transform
+    # `exogenous_values` to the state space via a linear transformation anyway.
+    observation_model = linalg_ops.eye(
+        state_dimension,
+        batch_shape=array_ops.shape(exogenous_values)[:-1],
+        dtype=self.dtype)
+    with variable_scope.variable_scope("exogenous_noise_decreasing_covariance"):
+      observation_noise = math_utils.transform_to_covariance_matrices(
+          exogenous_values, state_dimension)
+    with variable_scope.variable_scope(
+        "exogenous_noise_decreasing_observation"):
+      observation = layers.fully_connected(
+          exogenous_values, state_dimension, activation_fn=None)
+    # Pretend that we are making an observation with an observation model equal
+    # to the identity matrix (i.e. a direct observation of the latent state),
+    # with learned observation noise.
+    posterior_state, posterior_state_var = (
+        self._kalman_filter.posterior_from_prior_state(
+            prior_state=estimated_state,
+            prior_state_var=estimated_state_covariance,
+            observation=observation,
+            observation_model=observation_model,
+            predicted_observations=(
+                estimated_state,
+                # The predicted noise covariance is noise due to current state
+                # uncertainty plus noise learned based on the exogenous
+                # observation (a somewhat trivial call to
+                # self._kalman_filter.observed_from_state has been omitted).
+                observation_noise + estimated_state_covariance),
+            observation_noise=observation_noise))
+    return (posterior_state, posterior_state_var, previous_times)
+
+  def _exogenous_noise_increasing(self, current_times, exogenous_values, state):
+    """Update state with exogenous regressors, increasing uncertainty.
+
+    Adds to the state mean a linear transformation of `exogenous_values`, and
+    increases uncertainty by constructing a covariance matrix based on
+    `exogenous_values` and adding it to the state covariance.
+
+    This update is useful for modeling changes relative to current state,
+    e.g. the furnace turned on so the temperature will be increasing at an
+    additional 1 degree per minute with some uncertainty, this uncertainty being
+    added to our current uncertainty in the per-minute change in temperature.
+
+    Args:
+      current_times: A [batch size] Tensor of times for the exogenous values
+          being input.
+      exogenous_values: A [batch size x exogenous input dimension] Tensor of
+          exogenous values for each part of the batch.
+      state: A tuple of (mean, covariance, previous_times) having shapes
+          mean; [batch size x state dimension]
+          covariance; [batch size x state dimension x state dimension]
+          previous_times; [batch size]
+    Returns:
+      Updated state taking the exogenous regressors into account (with higher
+      uncertainty than the input state).
+
+    """
+    start_mean, start_covariance, previous_times = state
+    with variable_scope.variable_scope("exogenous_noise_increasing_mean"):
+      mean_addition = layers.fully_connected(
+          exogenous_values, start_mean.get_shape()[1].value, activation_fn=None)
+    state_dimension = start_covariance.get_shape()[1].value
+    with variable_scope.variable_scope("exogenous_noise_increasing_covariance"):
+      covariance_addition = (
+          math_utils.transform_to_covariance_matrices(
+              exogenous_values, state_dimension))
+    return (start_mean + mean_addition,
+            start_covariance + covariance_addition,
+            previous_times)
+
+  def _exogenous_input_step(
+      self, current_times, current_exogenous_regressors, state):
+    """Update state with exogenous regressors.
+
+    Allows both increases and decreases in uncertainty.
+
+    Args:
+      current_times: A [batch size] Tensor of times for the exogenous values
+          being input.
+      current_exogenous_regressors: A [batch size x exogenous input dimension]
+          Tensor of exogenous values for each part of the batch.
+      state: A tuple of (mean, covariance, previous_times) having shapes
+          mean; [batch size x state dimension]
+          covariance; [batch size x state dimension x state dimension]
+          previous_times; [batch size]
+    Returns:
+      Updated state taking the exogenous regressors into account.
+    """
+    if self._configuration.exogenous_noise_decreases:
+      state = self._exogenous_noise_decreasing(
+          current_times, current_exogenous_regressors, state)
+    if self._configuration.exogenous_noise_increases:
+      state = self._exogenous_noise_increasing(
+          current_times, current_exogenous_regressors, state)
+    return state
+
+  def _loss_additions(self, times, values, mode):
+    """Add regularization during training."""
+    if mode == estimator_lib.ModeKeys.TRAIN:
+      if (self._input_statistics is not None
+          and self._configuration.bayesian_prior_weighting):
+        normalization = 1. / math_ops.cast(
+            self._input_statistics.total_observation_count, self.dtype)
+      else:
+        # If there is no total observation count recorded, or if we are not
+        # doing a Bayesian prior weighting, assumes/pretends that the full
+        # dataset size is the window size.
+        normalization = 1. / math_ops.cast(
+            array_ops.shape(times)[1], self.dtype)
+      transition_contribution = ops.convert_to_tensor(
+          self._configuration.covariance_prior_fn(
+              self.state_transition_noise_covariance),
+          dtype=self.dtype)
+      if (self._configuration.use_observation_noise
+          and self._observation_noise_covariance is not None):
+        observation_contribution = ops.convert_to_tensor(
+            self._configuration.covariance_prior_fn(
+                self._observation_noise_covariance),
+            dtype=self.dtype)
+        regularization_sum = transition_contribution + observation_contribution
+      else:
+        regularization_sum = transition_contribution
+      return -normalization * regularization_sum
+    else:
+      return array_ops.zeros([], dtype=self.dtype)
+
+  def _variable_observation_transition_tradeoff_log(self):
+    """Define a variable to trade off observation and transition noise."""
+    return variable_scope.get_variable(
+        name="observation_transition_tradeoff_log_scale",
+        initializer=constant_op.constant(
+            -self._configuration.transition_covariance_initial_log_scale_bias,
+            dtype=self.dtype),
+        dtype=self.dtype)
+
+  def _define_parameters(self, observation_transition_tradeoff_log=None):
+    """Define extra model-specific parameters.
+
+    Models should wrap any variables defined here in the model's variable scope.
+
+    Args:
+      observation_transition_tradeoff_log: An ensemble-global parameter
+        controlling the tradeoff between observation noise and transition
+        noise. If its value is not None, component transition noise should scale
+        with e^-observation_transition_tradeoff_log.
+    """
+    with variable_scope.variable_scope(self._variable_scope):
+      # A scalar which allows the optimizer to quickly shift from observation
+      # noise to transition noise (this value is subtracted from log transition
+      # noise and added to log observation noise).
+      if observation_transition_tradeoff_log is None:
+        self._observation_transition_tradeoff_log_scale = (
+            self._variable_observation_transition_tradeoff_log())
+      else:
+        self._observation_transition_tradeoff_log_scale = (
+            observation_transition_tradeoff_log)
+      self.state_transition_noise_covariance = (
+          self.get_state_transition_noise_covariance())
+
+  def _set_input_statistics(self, input_statistics=None):
+    super(StateSpaceModel, self).initialize_graph(
+        input_statistics=input_statistics)
+
+  def initialize_graph(self, input_statistics=None):
+    """Define variables and ops relevant to the top-level model in an ensemble.
+
+    For generic model parameters, _define_parameters() is called recursively on
+    all members of an ensemble.
+
+    Args:
+      input_statistics: A math_utils.InputStatistics object containing input
+          statistics. If None, data-independent defaults are used, which may
+          result in longer or unstable training.
+    """
+    self._set_input_statistics(input_statistics=input_statistics)
+    self._define_parameters()
+    with variable_scope.variable_scope(self._variable_scope):
+      self._observation_noise_covariance = ops.convert_to_tensor(
+          self.get_observation_noise_covariance(), dtype=self.dtype)
+    self._kalman_filter = kalman_filter.KalmanFilter(dtype=self.dtype)
+    (self.prior_state_mean,
+     self.prior_state_var) = self._make_priors()
+
+  def _make_priors(self):
+    """Creates and returns model priors."""
+    prior_state_covariance = self.get_prior_covariance()
+    prior_state_mean = self.get_prior_mean()
+    return (prior_state_mean, prior_state_covariance)
+
+  def get_prior_covariance(self):
+    """Constructs a variable prior covariance with data-based initialization.
+
+    Models should wrap any variables defined here in the model's variable scope.
+
+    Returns:
+      A two-dimensional [state dimension, state dimension] floating point Tensor
+      with a (positive definite) prior state covariance matrix.
+    """
+    with variable_scope.variable_scope(self._variable_scope):
+      state_dimension = ops.convert_to_tensor(
+          self.get_state_transition()).get_shape()[0].value
+      if self._configuration.trainable_start_state:
+        base_covariance = math_utils.variable_covariance_matrix(
+            state_dimension, "prior_state_var",
+            dtype=self.dtype)
+      else:
+        return linalg_ops.eye(state_dimension, dtype=self.dtype)
+      if self._input_statistics is not None:
+        # Make sure initial latent value uncertainty is at least on the same
+        # scale as noise in the data.
+        covariance_multiplier = math_ops.reduce_max(
+            self._input_statistics.series_start_moments.variance)
+        return base_covariance * gen_math_ops.maximum(
+            covariance_multiplier, 1.0)
+      else:
+        return base_covariance
+
+  def get_prior_mean(self):
+    """Constructs a Variable-parameterized prior mean.
+
+    Models should wrap any variables defined here in the model's variable scope.
+
+    Returns:
+      A one-dimensional floating point Tensor with shape [state dimension]
+      indicating the prior mean.
+    """
+    with variable_scope.variable_scope(self._variable_scope):
+      state_transition = ops.convert_to_tensor(
+          self.get_state_transition(), dtype=self.dtype)
+      state_dimension = state_transition.get_shape()[0].value
+      return variable_scope.get_variable(
+          name="prior_state_mean",
+          shape=[state_dimension],
+          dtype=self.dtype,
+          trainable=self._configuration.trainable_start_state)
+
+  # TODO(allenl): It would be nice if the generation were done with TensorFlow
+  # ops, and if the model parameters were somehow set instead of being passed
+  # around in a dictionary. Maybe unconditional generation should be through a
+  # special set of initializers?
+  def random_model_parameters(self, seed=None):
+    if self.num_features != 1:
+      raise NotImplementedError("Generation for multivariate state space models"
+                                " is not currently implemented.")
+    if seed:
+      numpy.random.seed(seed)
+    state_dimension, noise_dimension = ops.convert_to_tensor(
+        self.get_noise_transform()).get_shape().as_list()
+    transition_var = 1.0 / numpy.random.gamma(shape=10., scale=10.,
+                                              size=[noise_dimension])
+    initial_state = numpy.random.normal(size=[state_dimension])
+    params_dict = {}
+    if self.prior_state_mean is not None:
+      params_dict[self.prior_state_mean] = initial_state
+    if self.state_transition_noise_covariance is not None:
+      params_dict[self.state_transition_noise_covariance] = numpy.diag(
+          transition_var)
+    if self.prior_state_var is not None:
+      params_dict[self.prior_state_var] = numpy.zeros(
+          [state_dimension, state_dimension])
+    if self._configuration.use_observation_noise:
+      observation_var = 1.0 / numpy.random.gamma(shape=4, scale=4)
+      params_dict[self._observation_noise_covariance] = [[observation_var]]
+    return params_dict
+
+  def generate(self, number_of_series, series_length,
+               model_parameters=None, seed=None, add_observation_noise=None):
+    if seed is not None:
+      numpy.random.seed(seed)
+    if self.num_features != 1:
+      raise NotImplementedError("Generation for multivariate state space models"
+                                " is not currently implemented.")
+    if add_observation_noise is None:
+      add_observation_noise = self._configuration.use_observation_noise
+    if model_parameters is None:
+      model_parameters = {}
+    transitions = ops.convert_to_tensor(
+        self.get_state_transition(), dtype=self.dtype).eval(
+            feed_dict=model_parameters)
+    noise_transform = ops.convert_to_tensor(self.get_noise_transform()).eval(
+        feed_dict=model_parameters)
+
+    noise_dimension = noise_transform.shape[1]
+    get_passed_or_trained_value = model_utils.parameter_switch(model_parameters)
+    transition_var = numpy.diag(get_passed_or_trained_value(
+        self.state_transition_noise_covariance))
+    transition_std = numpy.sqrt(transition_var)
+    if add_observation_noise:
+      observation_var = get_passed_or_trained_value(
+          self._observation_noise_covariance)[0][0]
+      observation_std = numpy.sqrt(observation_var)
+    initial_state = get_passed_or_trained_value(self.prior_state_mean)
+    current_state = numpy.tile(numpy.expand_dims(initial_state, 0),
+                               [number_of_series, 1])
+    observations = numpy.zeros([number_of_series, series_length])
+    observation_models = self.get_broadcasted_observation_model(
+        times=math_ops.range(series_length)).eval(feed_dict=model_parameters)
+    for timestep, observation_model in enumerate(observation_models):
+      current_state = numpy.dot(current_state, transitions.T)
+      current_state += numpy.dot(
+          numpy.random.normal(
+              loc=numpy.zeros([number_of_series, noise_dimension]),
+              scale=numpy.tile(numpy.expand_dims(transition_std, 0),
+                               [number_of_series, 1])),
+          noise_transform.T)
+      observation_mean = numpy.dot(current_state, observation_model[0].T)
+      if add_observation_noise:
+        observations[:, timestep] = numpy.random.normal(loc=observation_mean,
+                                                        scale=observation_std)
+      else:
+        observations[:, timestep] = observation_mean
+    observations = numpy.expand_dims(observations, -1)
+    times = numpy.tile(
+        numpy.expand_dims(numpy.arange(observations.shape[1]), 0),
+        [observations.shape[0], 1])
+    return {TrainEvalFeatures.TIMES: times,
+            TrainEvalFeatures.VALUES: observations}
+
+  @abc.abstractmethod
+  def get_state_transition(self):
+    """Specifies the state transition model to use.
+
+    Returns:
+      A [state dimension x state dimension] Tensor specifying how states
+      transition from one timestep to the next.
+    """
+    pass
+
+  @abc.abstractmethod
+  def get_noise_transform(self):
+    """Specifies the noise transition model to use.
+
+    Returns:
+      A [state dimension x state noise dimension] Tensor specifying how noise
+      (generated with shape [state noise dimension]) affects the model's state.
+    """
+    pass
+
+  @abc.abstractmethod
+  def get_observation_model(self, times):
+    """Specifies the observation model to use.
+
+    Args:
+      times: A [batch dimension] int32 Tensor with times for each part of the
+          batch, on which the observation model can depend.
+    Returns:
+      This function, when overridden, has three possible return values:
+        - A [state dimension] Tensor with a static, univariate observation
+          model.
+        - A [self.num_features x state dimension] static, multivariate model.
+        - A [batch dimension x self.num_features x state dimension] observation
+          model, which may depend on `times`.
+      See get_broadcasted_observation_model for details of the broadcasting.
+    """
+    pass
+
+  def get_broadcasted_observation_model(self, times):
+    """Broadcast this model's observation model if necessary.
+
+    The model can define a univariate observation model which will be broadcast
+    over both self.num_features and the batch dimension of `times`.
+
+    The model can define a multi-variate observation model which does not depend
+    on `times`, and it will be broadcast over the batch dimension of `times`.
+
+    Finally, the model can define a multi-variate observation model with a batch
+    dimension, which will not be broadcast.
+
+    Args:
+      times: A [batch dimension] int32 Tensor with times for each part of the
+          batch, on which the observation model can depend.
+    Returns:
+      A [batch dimension x self.num_features x state dimension] Tensor
+      specifying the observation model to use for each time in `times` and each
+      feature.
+    """
+    unbroadcasted_model = ops.convert_to_tensor(
+        self.get_observation_model(times), dtype=self.dtype)
+    unbroadcasted_shape = (unbroadcasted_model.get_shape()
+                           .with_rank_at_least(1).with_rank_at_most(3))
+    if unbroadcasted_shape.ndims is None:
+      # Pass through fully undefined shapes, but make sure they're rank 3 at
+      # graph eval time
+      assert_op = control_flow_ops.Assert(
+          math_ops.equal(array_ops.rank(unbroadcasted_model), 3),
+          [array_ops.shape(unbroadcasted_model)])
+      with ops.control_dependencies([assert_op]):
+        return array_ops.identity(unbroadcasted_model)
+    if unbroadcasted_shape.ndims == 1:
+      # Unbroadcasted shape [state dimension]
+      broadcasted_model = array_ops.tile(
+          array_ops.reshape(tensor=unbroadcasted_model, shape=[1, 1, -1]),
+          [array_ops.shape(times)[0], self.num_features, 1])
+    elif unbroadcasted_shape.ndims == 2:
+      # Unbroadcasted shape [num features x state dimension]
+      broadcasted_model = array_ops.tile(
+          array_ops.expand_dims(unbroadcasted_model, dim=0),
+          [array_ops.shape(times)[0], 1, 1])
+    elif unbroadcasted_shape.ndims == 3:
+      broadcasted_model = unbroadcasted_model
+    broadcasted_model.get_shape().assert_has_rank(3)
+    return broadcasted_model
+
+  def get_state_transition_noise_covariance(
+      self, minimum_initial_variance=1e-5):
+    state_noise_transform = ops.convert_to_tensor(
+        self.get_noise_transform(), dtype=self.dtype)
+    state_noise_dimension = state_noise_transform.get_shape()[1].value
+    if self._input_statistics is not None:
+      feature_variance = self._input_statistics.series_start_moments.variance
+      initial_transition_noise_scale = math_ops.log(
+          gen_math_ops.maximum(
+              math_ops.reduce_mean(feature_variance) / math_ops.cast(
+                  self._input_statistics.total_observation_count, self.dtype),
+              minimum_initial_variance))
+    else:
+      initial_transition_noise_scale = 0.
+    # Generally high transition noise is undesirable; we want to set it quite
+    # low to start so that we don't need too much training to get to good
+    # solutions (i.e. with confident predictions into the future if possible),
+    # but not so low that training can't yield a high transition noise if the
+    # data demands it.
+    initial_transition_noise_scale -= (
+        self._observation_transition_tradeoff_log_scale)
+    return math_utils.variable_covariance_matrix(
+        state_noise_dimension, "state_transition_noise",
+        dtype=self.dtype,
+        initial_overall_scale_log=initial_transition_noise_scale)
+
+  def get_observation_noise_covariance(self, minimum_initial_variance=1e-5):
+    if self._configuration.use_observation_noise:
+      if self._input_statistics is not None:
+        # Get variance across the first few values in each batch for each
+        # feature, for an initial observation noise (over-)estimate.
+        feature_variance = self._input_statistics.series_start_moments.variance
+      else:
+        feature_variance = None
+      if feature_variance is not None:
+        feature_variance = gen_math_ops.maximum(feature_variance,
+                                                minimum_initial_variance)
+      return math_utils.variable_covariance_matrix(
+          size=self.num_features,
+          dtype=self.dtype,
+          name="observation_noise_covariance",
+          initial_diagonal_values=feature_variance,
+          initial_overall_scale_log=(
+              self._observation_transition_tradeoff_log_scale))
+    else:
+      return array_ops.zeros(
+          shape=[self.num_features, self.num_features],
+          name="observation_noise_covariance",
+          dtype=self.dtype)
+
+  def get_start_state(self):
+    """Defines and returns a non-batched prior state and covariance."""
+    # TODO(allenl,vitalyk): Add an option for non-Gaussian priors once extended
+    # Kalman filtering is implemented (ideally any Distribution object).
+    if self._input_statistics is not None:
+      start_time = self._input_statistics.start_time
+    else:
+      start_time = array_ops.zeros([], dtype=dtypes.int64)
+    return (self.prior_state_mean,
+            self.prior_state_var,
+            start_time - 1)
+
+  def get_features_for_timesteps(self, timesteps):
+    """Get features for a batch of timesteps. Default to no features."""
+    return array_ops.zeros([array_ops.shape(timesteps)[0], 0], dtype=self.dtype)
+
+
+class StateSpaceEnsemble(StateSpaceModel):
+  """Base class for combinations of state space models."""
+
+  def __init__(self, ensemble_members, configuration):
+    """Initialize the ensemble by specifying its members.
+
+    Args:
+      ensemble_members: A list of StateSpaceModel objects which will be included
+          in this ensemble.
+      configuration: A StateSpaceModelConfiguration object.
+    """
+    self._ensemble_members = ensemble_members
+    super(StateSpaceEnsemble, self).__init__(configuration=configuration)
+
+  def _set_input_statistics(self, input_statistics):
+    super(StateSpaceEnsemble, self)._set_input_statistics(input_statistics)
+    for member in self._ensemble_members:
+      member._set_input_statistics(input_statistics)  # pylint: disable=protected-access
+
+  def _loss_additions(self, times, values, mode):
+    # Allow sub-models to regularize
+    return (super(StateSpaceEnsemble, self)._loss_additions(
+        times, values, mode) + math_ops.add_n([
+            member._loss_additions(times, values, mode)  # pylint: disable=protected-access
+            for member in self._ensemble_members
+        ]))
+
+  def _compute_blocked(self, member_fn, name):
+    with variable_scope.variable_scope(self._variable_scope):
+      return math_utils.block_diagonal(
+          [member_fn(member)
+           for member in self._ensemble_members],
+          dtype=self.dtype,
+          name=name)
+
+  def transition_to_powers(self, powers):
+    return self._compute_blocked(
+        member_fn=lambda member: member.transition_to_powers(powers),
+        name="ensemble_transition_to_powers")
+
+  def _define_parameters(self, observation_transition_tradeoff_log=None):
+    with variable_scope.variable_scope(self._variable_scope):
+      if observation_transition_tradeoff_log is None:
+        # Define the tradeoff parameter between observation and transition noise
+        # once for the whole ensemble, and pass it down to members.
+        observation_transition_tradeoff_log = (
+            self._variable_observation_transition_tradeoff_log())
+      for member in self._ensemble_members:
+        member._define_parameters(observation_transition_tradeoff_log=(  # pylint: disable=protected-access
+            observation_transition_tradeoff_log))
+      super(StateSpaceEnsemble, self)._define_parameters(
+          observation_transition_tradeoff_log
+          =observation_transition_tradeoff_log)
+
+  def random_model_parameters(self, seed=None):
+    param_union = {}
+    for i, member in enumerate(self._ensemble_members):
+      member_params = member.random_model_parameters(
+          seed=seed + i if seed else None)
+      param_union.update(member_params)
+    param_union.update(
+        super(StateSpaceEnsemble, self).random_model_parameters(seed=seed))
+    return param_union
+
+  def get_prior_mean(self):
+    return array_ops.concat(
+        values=[member.get_prior_mean() for member in self._ensemble_members],
+        axis=0,
+        name="ensemble_prior_state_mean")
+
+  def get_state_transition(self):
+    return self._compute_blocked(
+        member_fn=
+        lambda member: member.get_state_transition(),
+        name="ensemble_state_transition")
+
+  def get_noise_transform(self):
+    return self._compute_blocked(
+        member_fn=
+        lambda member: member.get_noise_transform(),
+        name="ensemble_noise_transform")
+
+  def get_observation_model(self, times):
+    raise NotImplementedError("No un-broadcasted observation model defined for"
+                              " ensembles.")
+
+  def get_broadcasted_observation_model(self, times):
+    """Computes a combined observation model based on member models.
+
+    The effect is that predicted observations from each model are summed.
+
+    Args:
+      times: A [batch dimension] int32 Tensor with times for each part of the
+          batch, on which member observation models can depend.
+    Returns:
+      A [batch dimension x num features x combined state dimension] Tensor with
+      the combined observation model.
+    """
+    member_observation_models = [
+        ops.convert_to_tensor(
+            member.get_broadcasted_observation_model(times), dtype=self.dtype)
+        for member in self._ensemble_members
+    ]
+    return array_ops.concat(values=member_observation_models, axis=2)
+
+
+class StateSpaceIndependentEnsemble(StateSpaceEnsemble):
+  """Implements ensembles of independent state space models.
+
+  Useful for fitting multiple independent state space models together while
+  keeping their specifications decoupled. The "ensemble" is simply a state space
+  model with the observation models of its members concatenated, and the
+  transition matrices and noise transforms stacked in block-diagonal
+  matrices. This means that the dimensionality of the ensemble's state is the
+  sum of those of its components, which can lead to slow and memory-intensive
+  training and inference as the posterior (shape [state dimension x state
+  dimension]) gets large.
+
+  Each individual model j's state at time t is defined by:
+
+  state[t, j] = StateTransition[j] * state[t-1, j]
+      + NoiseTransform[j] * StateNoise[t, j]
+  StateNoise[t, j] ~ Gaussian(0, StateNoiseCovariance[j])
+
+  and the ensemble observation model is:
+
+  observation[t] = Sum { ObservationModel[j] * state[t, j] }
+      + ObservationNoise[t]
+  ObservationNoise[t] ~ Gaussian(0, ObservationNoiseCovariance)
+  """
+
+  def transition_power_noise_accumulator(self, num_steps):
+    return self._compute_blocked(
+        member_fn=lambda m: m.transition_power_noise_accumulator(num_steps),
+        name="ensemble_power_noise_accumulator")
+
+  def get_prior_covariance(self):
+    """Construct the ensemble prior covariance based on component models."""
+    return self._compute_blocked(
+        member_fn=
+        lambda member: member.get_prior_covariance(),
+        name="ensemble_prior_state_covariance")
+
+  def get_state_transition_noise_covariance(self):
+    """Construct the ensemble transition noise covariance from components."""
+    return self._compute_blocked(
+        member_fn=
+        lambda member: member.state_transition_noise_covariance,
+        name="ensemble_state_transition_noise")
+
+
+# TODO(allenl): It would be nice to have replicated feature models which are
+# identical batched together to reduce the graph size.
+# TODO(allenl): Support for sharing M independent models across N features, with
+# N > M.
+# TODO(allenl): Stack component prior covariances while allowing cross-model
+# correlations to be learned (currently a full covariance prior is learned, but
+# custom component model covariances are not used).
+class StateSpaceCorrelatedFeaturesEnsemble(StateSpaceEnsemble):
+  """An correlated ensemble where each model represents a feature.
+
+  Unlike `StateSpaceIndependentEnsemble`, a full state transition noise
+  covariance matrix is learned for this ensemble; the models are not assumed to
+  be independent. Rather than concatenating observation models (i.e. summing the
+  contributions of each model to each feature),
+  StateSpaceCorrelatedFeaturesEnsemble stacks observation models diagonally,
+  meaning that each model corresponds to one feature of the series.
+
+  Behaves like (and is) a single state space model where:
+
+  StateTransition = Diag(StateTransition[j] for models j)
+  ObservationModel = Diag(ObservationModel[j] for models j)
+
+  Note that each ObservationModel[j] is a [1 x S_j] matrix (S_j being the state
+  dimension of model j), i.e. a univariate model. The combined model is
+  multivariate, the number of features of the series being equal to the number
+  of component models in the ensemble.
+  """
+
+  def __init__(self, ensemble_members, configuration):
+    """Specify the ensemble's configuration and component models.
+
+    Args:
+      ensemble_members: A list of `StateSpaceModel` objects, with length equal
+        to `configuration.num_features`. Each of these models, which must be
+        univariate, corresponds to a single feature of the time series.
+      configuration: A StateSpaceModelConfiguration object.
+    Raises:
+      ValueError: If the length of `ensemble_members` does not equal the number
+        of features in the series, or any component is not univariate.
+    """
+    if len(ensemble_members) != configuration.num_features:
+      raise ValueError(
+          "The number of members in a StateSpaceCorrelatedFeaturesEnsemble "
+          "must equal the number of features in the time series.")
+    for member in ensemble_members:
+      if member.num_features != 1:
+        raise ValueError(
+            "StateSpaceCorrelatedFeaturesEnsemble components must be "
+            "univariate.")
+    super(StateSpaceCorrelatedFeaturesEnsemble, self).__init__(
+        ensemble_members=ensemble_members, configuration=configuration)
+
+  def transition_power_noise_accumulator(self, num_steps):
+    """Use a noise accumulator special case when possible."""
+    if len(self._ensemble_members) == 1:
+      # If this is a univariate series, we should use the special casing built
+      # into the single component model.
+      return self._ensemble_members[0].transition_power_noise_accumulator(
+          num_steps=num_steps)
+    # If we have multiple features, and therefore multiple models, we have
+    # introduced correlations which make noise accumulation more
+    # complicated. Here we fall back to the general case, since we can't just
+    # aggregate member special cases.
+    return super(StateSpaceCorrelatedFeaturesEnsemble,
+                 self).transition_power_noise_accumulator(num_steps=num_steps)
+
+  def get_broadcasted_observation_model(self, times):
+    """Stack observation models diagonally."""
+    def _member_observation_model(member):
+      return ops.convert_to_tensor(
+          member.get_broadcasted_observation_model(times), dtype=self.dtype)
+    return self._compute_blocked(member_fn=_member_observation_model,
+                                 name="feature_ensemble_observation_model")
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c8f81ec5165b8ba7e8a1089953e5755b5a90915
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py
@@ -0,0 +1,758 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for state space model infrastructure."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+import numpy
+
+from tensorflow.contrib import layers
+
+from tensorflow.contrib.timeseries.python.timeseries import estimators
+from tensorflow.contrib.timeseries.python.timeseries import feature_keys
+from tensorflow.contrib.timeseries.python.timeseries import input_pipeline
+from tensorflow.contrib.timeseries.python.timeseries import math_utils
+from tensorflow.contrib.timeseries.python.timeseries import saved_model_utils
+from tensorflow.contrib.timeseries.python.timeseries import state_management
+from tensorflow.contrib.timeseries.python.timeseries import test_utils
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import state_space_model
+
+from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.training import coordinator as coordinator_lib
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import queue_runner_impl
+
+
+class RandomStateSpaceModel(state_space_model.StateSpaceModel):
+
+  def __init__(self,
+               state_dimension,
+               state_noise_dimension,
+               configuration=state_space_model.StateSpaceModelConfiguration()):
+    self.transition = numpy.random.normal(
+        size=[state_dimension, state_dimension]).astype(
+            configuration.dtype.as_numpy_dtype)
+    self.noise_transform = numpy.random.normal(
+        size=(state_dimension, state_noise_dimension)).astype(
+            configuration.dtype.as_numpy_dtype)
+    # Test batch broadcasting
+    self.observation_model = numpy.random.normal(
+        size=(configuration.num_features, state_dimension)).astype(
+            configuration.dtype.as_numpy_dtype)
+    super(RandomStateSpaceModel, self).__init__(
+        configuration=configuration._replace(
+            covariance_prior_fn=lambda _: 0.))
+
+  def get_state_transition(self):
+    return self.transition
+
+  def get_noise_transform(self):
+    return self.noise_transform
+
+  def get_observation_model(self, times):
+    return self.observation_model
+
+
+class ConstructionTests(test.TestCase):
+
+  def test_initialize_graph_error(self):
+    with self.assertRaisesRegexp(ValueError, "initialize_graph"):
+      model = RandomStateSpaceModel(2, 2)
+      outputs = model.define_loss(
+          features={
+              feature_keys.TrainEvalFeatures.TIMES:
+                  constant_op.constant([[1, 2]]),
+              feature_keys.TrainEvalFeatures.VALUES:
+                  constant_op.constant([[[1.], [2.]]])
+          },
+          mode=estimator_lib.ModeKeys.TRAIN)
+      initializer = variables.global_variables_initializer()
+      with self.test_session() as sess:
+        sess.run([initializer])
+        outputs.loss.eval()
+
+  def test_initialize_graph_state_manager_error(self):
+    with self.assertRaisesRegexp(ValueError, "initialize_graph"):
+      model = RandomStateSpaceModel(2, 2)
+      state_manager = state_management.ChainingStateManager()
+      outputs = state_manager.define_loss(
+          model=model,
+          features={
+              feature_keys.TrainEvalFeatures.TIMES:
+                  constant_op.constant([[1, 2]]),
+              feature_keys.TrainEvalFeatures.VALUES:
+                  constant_op.constant([[[1.], [2.]]])
+          },
+          mode=estimator_lib.ModeKeys.TRAIN)
+      initializer = variables.global_variables_initializer()
+      with self.test_session() as sess:
+        sess.run([initializer])
+        outputs.loss.eval()
+
+
+class GapTests(test.TestCase):
+
+  def _gap_test_template(self, times, values):
+    random_model = RandomStateSpaceModel(
+        state_dimension=1, state_noise_dimension=1,
+        configuration=state_space_model.StateSpaceModelConfiguration(
+            num_features=1))
+    random_model.initialize_graph()
+    input_fn = input_pipeline.WholeDatasetInputFn(
+        input_pipeline.NumpyReader({
+            feature_keys.TrainEvalFeatures.TIMES: times,
+            feature_keys.TrainEvalFeatures.VALUES: values
+        }))
+    features, _ = input_fn()
+    times = features[feature_keys.TrainEvalFeatures.TIMES]
+    values = features[feature_keys.TrainEvalFeatures.VALUES]
+    model_outputs = random_model.get_batch_loss(
+        features={
+            feature_keys.TrainEvalFeatures.TIMES: times,
+            feature_keys.TrainEvalFeatures.VALUES: values
+        },
+        mode=None,
+        state=math_utils.replicate_state(
+            start_state=random_model.get_start_state(),
+            batch_size=array_ops.shape(times)[0]))
+    with self.test_session() as session:
+      variables.global_variables_initializer().run()
+      coordinator = coordinator_lib.Coordinator()
+      queue_runner_impl.start_queue_runners(session, coord=coordinator)
+      model_outputs.loss.eval()
+      coordinator.request_stop()
+      coordinator.join()
+
+  def test_start_gap(self):
+    self._gap_test_template(times=[20, 21, 22], values=numpy.arange(3))
+
+  def test_mid_gap(self):
+    self._gap_test_template(times=[2, 60, 61], values=numpy.arange(3))
+
+  def test_end_gap(self):
+    self._gap_test_template(times=[2, 3, 73], values=numpy.arange(3))
+
+  def test_all_gaps(self):
+    self._gap_test_template(times=[2, 4, 8, 16, 32, 64, 128],
+                            values=numpy.arange(7))
+
+
+class StateSpaceEquivalenceTests(test.TestCase):
+
+  def test_savedmodel_state_override(self):
+    random_model = RandomStateSpaceModel(
+        state_dimension=5,
+        state_noise_dimension=4,
+        configuration=state_space_model.StateSpaceModelConfiguration(
+            exogenous_feature_columns=[layers.real_valued_column("exogenous")],
+            dtype=dtypes.float64, num_features=1))
+    estimator = estimators.StateSpaceRegressor(
+        model=random_model,
+        optimizer=gradient_descent.GradientDescentOptimizer(0.1))
+    combined_input_fn = input_pipeline.WholeDatasetInputFn(
+        input_pipeline.NumpyReader({
+            feature_keys.FilteringFeatures.TIMES: [1, 2, 3, 4],
+            feature_keys.FilteringFeatures.VALUES: [1., 2., 3., 4.],
+            "exogenous": [-1., -2., -3., -4.]
+        }))
+    estimator.train(combined_input_fn, steps=1)
+    export_location = estimator.export_savedmodel(
+        self.get_temp_dir(),
+        estimator.build_raw_serving_input_receiver_fn(
+            exogenous_features={
+                "exogenous": numpy.zeros((0, 0), dtype=numpy.float32)}))
+    with ops.Graph().as_default() as graph:
+      random_model.initialize_graph()
+      with self.test_session(graph=graph) as session:
+        variables.global_variables_initializer().run()
+        evaled_start_state = session.run(random_model.get_start_state())
+    evaled_start_state = [
+        state_element[None, ...] for state_element in evaled_start_state]
+    with ops.Graph().as_default() as graph:
+      with self.test_session(graph=graph) as session:
+        signatures = loader.load(
+            session, [tag_constants.SERVING], export_location)
+        first_split_filtering = saved_model_utils.filter_continuation(
+            continue_from={
+                feature_keys.FilteringResults.STATE_TUPLE: evaled_start_state},
+            signatures=signatures,
+            session=session,
+            features={
+                feature_keys.FilteringFeatures.TIMES: [1, 2],
+                feature_keys.FilteringFeatures.VALUES: [1., 2.],
+                "exogenous": [-1., -2.]})
+        second_split_filtering = saved_model_utils.filter_continuation(
+            continue_from=first_split_filtering,
+            signatures=signatures,
+            session=session,
+            features={
+                feature_keys.FilteringFeatures.TIMES: [3, 4],
+                feature_keys.FilteringFeatures.VALUES: [3., 4.],
+                "exogenous": [-3., -4.]
+            })
+        combined_filtering = saved_model_utils.filter_continuation(
+            continue_from={
+                feature_keys.FilteringResults.STATE_TUPLE: evaled_start_state},
+            signatures=signatures,
+            session=session,
+            features={
+                feature_keys.FilteringFeatures.TIMES: [1, 2, 3, 4],
+                feature_keys.FilteringFeatures.VALUES: [1., 2., 3., 4.],
+                "exogenous": [-1., -2., -3., -4.]
+            })
+        split_predict = saved_model_utils.predict_continuation(
+            continue_from=second_split_filtering,
+            signatures=signatures,
+            session=session,
+            steps=1,
+            exogenous_features={
+                "exogenous": [[-5.]]})
+        combined_predict = saved_model_utils.predict_continuation(
+            continue_from=combined_filtering,
+            signatures=signatures,
+            session=session,
+            steps=1,
+            exogenous_features={
+                "exogenous": [[-5.]]})
+    for state_key, combined_state_value in combined_filtering.items():
+      if state_key == feature_keys.FilteringResults.TIMES:
+        continue
+      self.assertAllClose(
+          combined_state_value, second_split_filtering[state_key])
+    for prediction_key, combined_value in combined_predict.items():
+      self.assertAllClose(combined_value, split_predict[prediction_key])
+
+  def _equivalent_to_single_model_test_template(self, model_generator):
+    with self.test_session() as session:
+      random_model = RandomStateSpaceModel(
+          state_dimension=5,
+          state_noise_dimension=4,
+          configuration=state_space_model.StateSpaceModelConfiguration(
+              dtype=dtypes.float64, num_features=1))
+      random_model.initialize_graph()
+      series_length = 10
+      model_data = random_model.generate(
+          number_of_series=1, series_length=series_length,
+          model_parameters=random_model.random_model_parameters())
+      input_fn = input_pipeline.WholeDatasetInputFn(
+          input_pipeline.NumpyReader(model_data))
+      features, _ = input_fn()
+      model_outputs = random_model.get_batch_loss(
+          features=features,
+          mode=None,
+          state=math_utils.replicate_state(
+              start_state=random_model.get_start_state(),
+              batch_size=array_ops.shape(
+                  features[feature_keys.TrainEvalFeatures.TIMES])[0]))
+      variables.global_variables_initializer().run()
+      compare_outputs_evaled_fn = model_generator(
+          random_model, model_data)
+      coordinator = coordinator_lib.Coordinator()
+      queue_runner_impl.start_queue_runners(session, coord=coordinator)
+      compare_outputs_evaled = compare_outputs_evaled_fn(session)
+      model_outputs_evaled = session.run(
+          (model_outputs.end_state, model_outputs.predictions))
+      coordinator.request_stop()
+      coordinator.join()
+      model_posteriors, model_predictions = model_outputs_evaled
+      (_, compare_posteriors,
+       compare_predictions) = compare_outputs_evaled
+      (model_posterior_mean, model_posterior_var,
+       model_from_time) = model_posteriors
+      (compare_posterior_mean, compare_posterior_var,
+       compare_from_time) = compare_posteriors
+      self.assertAllClose(model_posterior_mean, compare_posterior_mean[0])
+      self.assertAllClose(model_posterior_var, compare_posterior_var[0])
+      self.assertAllClose(model_from_time, compare_from_time)
+      self.assertEqual(sorted(model_predictions.keys()),
+                       sorted(compare_predictions.keys()))
+      for prediction_name in model_predictions:
+        if prediction_name == "loss":
+          # Chunking means that losses will be different; skip testing them.
+          continue
+        # Compare the last chunk to their corresponding un-chunked model
+        # predictions
+        last_prediction_chunk = compare_predictions[prediction_name][-1]
+        comparison_values = last_prediction_chunk.shape[0]
+        model_prediction = (
+            model_predictions[prediction_name][0, -comparison_values:])
+        self.assertAllClose(model_prediction,
+                            last_prediction_chunk)
+
+  def _model_equivalent_to_chained_model_test_template(self, chunk_size):
+    def chained_model_outputs(original_model, data):
+      input_fn = test_utils.AllWindowInputFn(
+          input_pipeline.NumpyReader(data), window_size=chunk_size)
+      state_manager = state_management.ChainingStateManager(
+          state_saving_interval=1)
+      features, _ = input_fn()
+      state_manager.initialize_graph(original_model)
+      model_outputs = state_manager.define_loss(
+          model=original_model,
+          features=features,
+          mode=estimator_lib.ModeKeys.TRAIN)
+      def _eval_outputs(session):
+        for _ in range(50):
+          # Warm up saved state
+          model_outputs.loss.eval()
+        (posterior_mean, posterior_var,
+         priors_from_time) = model_outputs.end_state
+        posteriors = ((posterior_mean,), (posterior_var,), priors_from_time)
+        outputs = (model_outputs.loss, posteriors,
+                   model_outputs.predictions)
+        chunked_outputs_evaled = session.run(outputs)
+        return chunked_outputs_evaled
+      return _eval_outputs
+    self._equivalent_to_single_model_test_template(chained_model_outputs)
+
+  def test_model_equivalent_to_chained_model_chunk_size_one(self):
+    numpy.random.seed(2)
+    random_seed.set_random_seed(3)
+    self._model_equivalent_to_chained_model_test_template(1)
+
+  def test_model_equivalent_to_chained_model_chunk_size_five(self):
+    numpy.random.seed(4)
+    random_seed.set_random_seed(5)
+    self._model_equivalent_to_chained_model_test_template(5)
+
+
+class PredictionTests(test.TestCase):
+
+  def _check_predictions(
+      self, predicted_mean, predicted_covariance, window_size):
+    self.assertAllEqual(predicted_covariance.shape,
+                        [1,   # batch
+                         window_size,
+                         1,   # num features
+                         1])  # num features
+    self.assertAllEqual(predicted_mean.shape,
+                        [1,   # batch
+                         window_size,
+                         1])  # num features
+    for position in range(window_size - 2):
+      self.assertGreater(predicted_covariance[0, position + 2, 0, 0],
+                         predicted_covariance[0, position, 0, 0])
+
+  def test_predictions_direct(self):
+    dtype = dtypes.float64
+    with variable_scope.variable_scope(dtype.name):
+      random_model = RandomStateSpaceModel(
+          state_dimension=5, state_noise_dimension=4,
+          configuration=state_space_model.StateSpaceModelConfiguration(
+              dtype=dtype, num_features=1))
+      random_model.initialize_graph()
+      prediction_dict = random_model.predict(features={
+          feature_keys.PredictionFeatures.TIMES: [[1, 3, 5, 6]],
+          feature_keys.PredictionFeatures.STATE_TUPLE:
+              math_utils.replicate_state(
+                  start_state=random_model.get_start_state(), batch_size=1)
+      })
+      with self.test_session():
+        variables.global_variables_initializer().run()
+        predicted_mean = prediction_dict["mean"].eval()
+        predicted_covariance = prediction_dict["covariance"].eval()
+      self._check_predictions(predicted_mean, predicted_covariance,
+                              window_size=4)
+
+  def test_predictions_after_loss(self):
+    dtype = dtypes.float32
+    with variable_scope.variable_scope(dtype.name):
+      random_model = RandomStateSpaceModel(
+          state_dimension=5, state_noise_dimension=4,
+          configuration=state_space_model.StateSpaceModelConfiguration(
+              dtype=dtype, num_features=1))
+      features = {
+          feature_keys.TrainEvalFeatures.TIMES: [[1, 2, 3, 4]],
+          feature_keys.TrainEvalFeatures.VALUES:
+              array_ops.ones([1, 4, 1], dtype=dtype)
+      }
+      passthrough = state_management.PassthroughStateManager()
+      random_model.initialize_graph()
+      passthrough.initialize_graph(random_model)
+      model_outputs = passthrough.define_loss(
+          model=random_model,
+          features=features,
+          mode=estimator_lib.ModeKeys.EVAL)
+      predictions = random_model.predict({
+          feature_keys.PredictionFeatures.TIMES: [[5, 7, 8]],
+          feature_keys.PredictionFeatures.STATE_TUPLE: model_outputs.end_state
+      })
+      with self.test_session():
+        variables.global_variables_initializer().run()
+        predicted_mean = predictions["mean"].eval()
+        predicted_covariance = predictions["covariance"].eval()
+      self._check_predictions(predicted_mean, predicted_covariance,
+                              window_size=3)
+
+
+class ExogenousTests(test.TestCase):
+
+  def test_noise_increasing(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with variable_scope.variable_scope(dtype.name):
+        random_model = RandomStateSpaceModel(
+            state_dimension=5, state_noise_dimension=4,
+            configuration=state_space_model.StateSpaceModelConfiguration(
+                dtype=dtype, num_features=1))
+        original_covariance = array_ops.diag(array_ops.ones(shape=[5]))
+        _, new_covariance, _ = random_model._exogenous_noise_increasing(
+            current_times=[[1]],
+            exogenous_values=[[5.]],
+            state=[
+                array_ops.ones(shape=[1, 5]), original_covariance[None], [0]
+            ])
+        with self.test_session() as session:
+          variables.global_variables_initializer().run()
+          evaled_new_covariance, evaled_original_covariance = session.run(
+              [new_covariance[0], original_covariance])
+          new_variances = numpy.diag(evaled_new_covariance)
+          original_variances = numpy.diag(evaled_original_covariance)
+          for i in range(5):
+            self.assertGreater(new_variances[i], original_variances[i])
+
+  def test_noise_decreasing(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with variable_scope.variable_scope(dtype.name):
+        random_model = RandomStateSpaceModel(
+            state_dimension=5, state_noise_dimension=4,
+            configuration=state_space_model.StateSpaceModelConfiguration(
+                dtype=dtype, num_features=1))
+        random_model.initialize_graph()
+        original_covariance = array_ops.diag(
+            array_ops.ones(shape=[5], dtype=dtype))
+        _, new_covariance, _ = random_model._exogenous_noise_decreasing(
+            current_times=[[1]],
+            exogenous_values=constant_op.constant([[-2.]], dtype=dtype),
+            state=[
+                -array_ops.ones(shape=[1, 5], dtype=dtype),
+                original_covariance[None], [0]
+            ])
+        with self.test_session() as session:
+          variables.global_variables_initializer().run()
+          evaled_new_covariance, evaled_original_covariance = session.run(
+              [new_covariance[0], original_covariance])
+          new_variances = numpy.diag(evaled_new_covariance)
+          original_variances = numpy.diag(evaled_original_covariance)
+          for i in range(5):
+            self.assertLess(new_variances[i], original_variances[i])
+
+
+class StubStateSpaceModel(state_space_model.StateSpaceModel):
+
+  def __init__(self,
+               transition,
+               state_noise_dimension,
+               configuration=state_space_model.StateSpaceModelConfiguration()):
+    self.transition = transition
+    self.noise_transform = numpy.random.normal(
+        size=(transition.shape[0], state_noise_dimension)).astype(numpy.float32)
+    # Test feature + batch broadcasting
+    self.observation_model = numpy.random.normal(
+        size=(transition.shape[0])).astype(numpy.float32)
+    super(StubStateSpaceModel, self).__init__(
+        configuration=configuration)
+
+  def get_state_transition(self):
+    return self.transition
+
+  def get_noise_transform(self):
+    return self.noise_transform
+
+  def get_observation_model(self, times):
+    return self.observation_model
+
+
+GeneratedModel = collections.namedtuple(
+    "GeneratedModel", ["model", "data", "true_parameters"])
+
+
+class PosteriorTests(test.TestCase):
+
+  def _get_cycle_transition(self, period):
+    cycle_transition = numpy.zeros([period - 1, period - 1],
+                                   dtype=numpy.float32)
+    cycle_transition[0, :] = -1
+    cycle_transition[1:, :-1] = numpy.identity(period - 2)
+    return cycle_transition
+
+  _adder_transition = numpy.array([[1, 1],
+                                   [0, 1]], dtype=numpy.float32)
+
+  def _get_single_model(self):
+    numpy.random.seed(8)
+    stub_model = StubStateSpaceModel(
+        transition=self._get_cycle_transition(5), state_noise_dimension=0)
+    series_length = 1000
+    stub_model.initialize_graph()
+    true_params = stub_model.random_model_parameters()
+    data = stub_model.generate(
+        number_of_series=1, series_length=series_length,
+        model_parameters=true_params)
+    return GeneratedModel(
+        model=stub_model, data=data, true_parameters=true_params)
+
+  def test_exact_posterior_recovery_no_transition_noise(self):
+    with self.test_session() as session:
+      stub_model, data, true_params = self._get_single_model()
+      input_fn = input_pipeline.WholeDatasetInputFn(
+          input_pipeline.NumpyReader(data))
+      features, _ = input_fn()
+      model_outputs = stub_model.get_batch_loss(
+          features=features,
+          mode=None,
+          state=math_utils.replicate_state(
+              start_state=stub_model.get_start_state(),
+              batch_size=array_ops.shape(
+                  features[feature_keys.TrainEvalFeatures.TIMES])[0]))
+      variables.global_variables_initializer().run()
+      coordinator = coordinator_lib.Coordinator()
+      queue_runner_impl.start_queue_runners(session, coord=coordinator)
+      posterior_mean, posterior_var, posterior_times = session.run(
+          # Feed the true model parameters so that this test doesn't depend on
+          # the generated parameters being close to the variable initializations
+          # (an alternative would be training steps to fit the noise values,
+          # which would be slow).
+          model_outputs.end_state, feed_dict=true_params)
+      coordinator.request_stop()
+      coordinator.join()
+
+      self.assertAllClose(numpy.zeros([1, 4, 4]), posterior_var,
+                          atol=1e-2)
+      self.assertAllClose(
+          numpy.dot(
+              numpy.linalg.matrix_power(
+                  stub_model.transition,
+                  data[feature_keys.TrainEvalFeatures.TIMES].shape[1]),
+              true_params[stub_model.prior_state_mean]),
+          posterior_mean[0],
+          rtol=1e-1)
+      self.assertAllClose(
+          math_utils.batch_end_time(
+              features[feature_keys.TrainEvalFeatures.TIMES]).eval(),
+          posterior_times)
+
+  def test_chained_exact_posterior_recovery_no_transition_noise(self):
+    with self.test_session() as session:
+      stub_model, data, true_params = self._get_single_model()
+      chunk_size = 10
+      input_fn = test_utils.AllWindowInputFn(
+          input_pipeline.NumpyReader(data), window_size=chunk_size)
+      features, _ = input_fn()
+      state_manager = state_management.ChainingStateManager(
+          state_saving_interval=1)
+      state_manager.initialize_graph(stub_model)
+      model_outputs = state_manager.define_loss(
+          model=stub_model,
+          features=features,
+          mode=estimator_lib.ModeKeys.TRAIN)
+      variables.global_variables_initializer().run()
+      coordinator = coordinator_lib.Coordinator()
+      queue_runner_impl.start_queue_runners(session, coord=coordinator)
+      for _ in range(
+          data[feature_keys.TrainEvalFeatures.TIMES].shape[1] // chunk_size):
+        model_outputs.loss.eval()
+      posterior_mean, posterior_var, posterior_times = session.run(
+          model_outputs.end_state, feed_dict=true_params)
+      coordinator.request_stop()
+      coordinator.join()
+      self.assertAllClose(numpy.zeros([1, 4, 4]), posterior_var,
+                          atol=1e-2)
+      self.assertAllClose(
+          numpy.dot(
+              numpy.linalg.matrix_power(
+                  stub_model.transition,
+                  data[feature_keys.TrainEvalFeatures.TIMES].shape[1]),
+              true_params[stub_model.prior_state_mean]),
+          posterior_mean[0],
+          rtol=1e-1)
+      self.assertAllClose(data[feature_keys.TrainEvalFeatures.TIMES][:, -1],
+                          posterior_times)
+
+
+class TimeDependentStateSpaceModel(state_space_model.StateSpaceModel):
+  """A mostly trivial model which predicts values = times + 1."""
+
+  def __init__(self, static_unrolling_window_size_threshold=None):
+    super(TimeDependentStateSpaceModel, self).__init__(
+        configuration=state_space_model.StateSpaceModelConfiguration(
+            use_observation_noise=False,
+            static_unrolling_window_size_threshold=
+            static_unrolling_window_size_threshold))
+
+  def get_state_transition(self):
+    return array_ops.ones(shape=[1, 1])
+
+  def get_noise_transform(self):
+    return array_ops.ones(shape=[1, 1])
+
+  def get_observation_model(self, times):
+    return array_ops.reshape(
+        tensor=math_ops.cast(times + 1, dtypes.float32), shape=[-1, 1, 1])
+
+  def make_priors(self):
+    return (ops.convert_to_tensor([1.]), ops.convert_to_tensor([[0.]]))
+
+
+class UnknownShapeModel(TimeDependentStateSpaceModel):
+
+  def get_observation_model(self, times):
+    parent_model = super(UnknownShapeModel, self).get_observation_model(times)
+    parent_model._shape = tensor_shape.unknown_shape()
+    assert parent_model.get_shape().ndims is None
+    return parent_model
+
+
+class TimeDependentTests(test.TestCase):
+
+  def _time_dependency_test_template(self, model_type):
+    """Test that a time-dependent observation model influences predictions."""
+    model = model_type()
+    estimator = estimators.StateSpaceRegressor(
+        model=model, optimizer=gradient_descent.GradientDescentOptimizer(0.1))
+    values = numpy.reshape([1., 2., 3., 4.],
+                           newshape=[1, 4, 1])
+    input_fn = input_pipeline.WholeDatasetInputFn(
+        input_pipeline.NumpyReader({
+            feature_keys.TrainEvalFeatures.TIMES: [[0, 1, 2, 3]],
+            feature_keys.TrainEvalFeatures.VALUES: values
+        }))
+    estimator.train(input_fn=input_fn, max_steps=1)
+    predicted_values = estimator.evaluate(input_fn=input_fn, steps=1)["mean"]
+    # Throw out the first value so we don't test the prior
+    self.assertAllEqual(values[1:], predicted_values[1:])
+
+  def test_undefined_shape_time_dependency(self):
+    self._time_dependency_test_template(UnknownShapeModel)
+
+  def test_loop_unrolling(self):
+    """Tests running/restoring from a checkpoint with static unrolling."""
+    model = TimeDependentStateSpaceModel(
+        # Unroll during training, but not evaluation
+        static_unrolling_window_size_threshold=2)
+    estimator = estimators.StateSpaceRegressor(model=model)
+    times = numpy.arange(100)
+    values = numpy.arange(100)
+    dataset = {
+        feature_keys.TrainEvalFeatures.TIMES: times,
+        feature_keys.TrainEvalFeatures.VALUES: values
+    }
+    train_input_fn = input_pipeline.RandomWindowInputFn(
+        input_pipeline.NumpyReader(dataset), batch_size=16, window_size=2)
+    eval_input_fn = input_pipeline.WholeDatasetInputFn(
+        input_pipeline.NumpyReader(dataset))
+    estimator.train(input_fn=train_input_fn, max_steps=1)
+    estimator.evaluate(input_fn=eval_input_fn, steps=1)
+
+
+class LevelOnlyModel(state_space_model.StateSpaceModel):
+
+  def get_state_transition(self):
+    return linalg_ops.eye(1, dtype=self.dtype)
+
+  def get_noise_transform(self):
+    return linalg_ops.eye(1, dtype=self.dtype)
+
+  def get_observation_model(self, times):
+    return [1]
+
+
+class MultivariateLevelModel(
+    state_space_model.StateSpaceCorrelatedFeaturesEnsemble):
+
+  def __init__(self, configuration):
+    univariate_component_configuration = configuration._replace(
+        num_features=1)
+    components = []
+    for feature in range(configuration.num_features):
+      with variable_scope.variable_scope("feature{}".format(feature)):
+        components.append(
+            LevelOnlyModel(configuration=univariate_component_configuration))
+    super(MultivariateLevelModel, self).__init__(
+        ensemble_members=components, configuration=configuration)
+
+
+class MultivariateTests(test.TestCase):
+
+  def test_multivariate(self):
+    dtype = dtypes.float32
+    num_features = 3
+    covariance = numpy.eye(num_features)
+    # A single off-diagonal has a non-zero value in the true transition
+    # noise covariance.
+    covariance[-1, 0] = 1.
+    covariance[0, -1] = 1.
+    dataset_size = 100
+    values = numpy.cumsum(
+        numpy.random.multivariate_normal(
+            mean=numpy.zeros(num_features),
+            cov=covariance,
+            size=dataset_size),
+        axis=0)
+    times = numpy.arange(dataset_size)
+    model = MultivariateLevelModel(
+        configuration=state_space_model.StateSpaceModelConfiguration(
+            num_features=num_features,
+            dtype=dtype,
+            use_observation_noise=False,
+            transition_covariance_initial_log_scale_bias=5.))
+    estimator = estimators.StateSpaceRegressor(
+        model=model, optimizer=gradient_descent.GradientDescentOptimizer(0.1))
+    data = {
+        feature_keys.TrainEvalFeatures.TIMES: times,
+        feature_keys.TrainEvalFeatures.VALUES: values
+    }
+    train_input_fn = input_pipeline.RandomWindowInputFn(
+        input_pipeline.NumpyReader(data), batch_size=16, window_size=16)
+    estimator.train(input_fn=train_input_fn, steps=1)
+    for component in model._ensemble_members:
+      # Check that input statistics propagated to component models
+      self.assertTrue(component._input_statistics)
+
+  def test_ensemble_observation_noise(self):
+    model = MultivariateLevelModel(
+        configuration=state_space_model.StateSpaceModelConfiguration())
+    model.initialize_graph()
+    outputs = model.define_loss(
+        features={
+            feature_keys.TrainEvalFeatures.TIMES:
+                constant_op.constant([[1, 2]]),
+            feature_keys.TrainEvalFeatures.VALUES:
+                constant_op.constant([[[1.], [2.]]])
+        },
+        mode=estimator_lib.ModeKeys.TRAIN)
+    initializer = variables.global_variables_initializer()
+    with self.test_session() as sess:
+      sess.run([initializer])
+      outputs.loss.eval()
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/structural_ensemble.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/structural_ensemble.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7a80a8e3ef81b7a2763ace49153a6106397a611
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/structural_ensemble.py
@@ -0,0 +1,266 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implements a time series model with seasonality, trends, and transients."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import level_trend
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import periodic
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import state_space_model
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import varma
+
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import nest
+
+
+def _replicate_level_trend_models(multivariate_configuration,
+                                  univariate_configuration):
+  """Helper function to construct a multivariate level/trend component."""
+  with variable_scope.variable_scope("adder"):
+    # Construct a level and trend model for each feature, with correlated
+    # transition noise.
+    adder_features = []
+    for feature in range(multivariate_configuration.num_features):
+      with variable_scope.variable_scope("feature{}".format(feature)):
+        adder_features.append(level_trend.AdderStateSpaceModel(
+            configuration=univariate_configuration))
+    adder_part = state_space_model.StateSpaceCorrelatedFeaturesEnsemble(
+        ensemble_members=adder_features,
+        configuration=multivariate_configuration)
+  return adder_part
+
+
+class StructuralEnsemble(state_space_model.StateSpaceIndependentEnsemble):
+  r"""A structural state space time series model.
+
+  In the spirit of:
+
+  Scott, Steven L., and Hal R. Varian. "Predicting the present with bayesian
+    structural time series." International Journal of Mathematical Modelling and
+    Numerical Optimisation 5.1-2 (2014): 4-23.
+
+  Without the spike-and-slab prior, and with point estimates of parameters
+  instead of sampling.
+
+  The model includes level, trend, seasonality, and a transient moving average.
+
+  An observation at time t is drawn according to:
+    observation_t = level_t + seasonality_t + moving_average_t
+        + observation_noise_t
+    level_t = level_{t-1} + trend_{t-1} + level_noise_t
+    trend_t = trend_{t-1} + trend_noise_t
+    seasonality_t = -\sum_{n=1}^{num_seasons-1} seasonality_{t-n} +
+        seasonality_noise_t
+    moving_average_t = transient_t
+        + \sum_{j=1}^{moving_average_order} ma_coefs_j * transient_{t - j}
+
+  `observation_noise`, `level_noise`, `trend noise`, `seasonality_noise`, and
+  `transient` are (typically scalar) Gaussian random variables whose variance is
+  learned from data, and that variance is not time dependent in this
+  implementation. Level noise is optional due to its similarity with observation
+  noise in some cases. Seasonality is enforced by constraining a full cycle of
+  seasonal variables to have zero expectation, allowing seasonality to adapt
+  over time. The moving average coefficients `ma_coefs` are learned.
+
+  When presented with a multivariate series (more than one "feature", here
+  referring to endogenous features of the series), the model is replicated
+  across these features (one copy per feature of each periodic component, and
+  one level/trend model per feature), and correlations in transition noise are
+  learned between these replicated components (see
+  StateSpaceCorrelatedFeaturesEnsemble). This is in addition to the learned
+  correlations in observation noise between features. While this is often the
+  most expressive thing to do with multiple features, it does mean that the
+  model grows quite quickly, creating and computing with square matrices with
+  each dimension equal to num_features * (sum(periodicities) +
+  moving_average_order + 3), meaning that some operations are approximately
+  cubic in this value.
+  """
+  # TODO(allenl): Implement partial model replication/sharing for multivariate
+  # series (to save time/memory when the series presented can be modeled as a
+  # smaller number of underlying series). Likely just a modification of the
+  # observation model so that each feature of the series is a learned linear
+  # combination of the replicated models.
+
+  def __init__(self,
+               periodicities,
+               moving_average_order,
+               autoregressive_order,
+               use_level_noise=True,
+               configuration=state_space_model.StateSpaceModelConfiguration()):
+    """Initialize the Basic Structural Time Series model.
+
+    Args:
+      periodicities: Number of time steps for cyclic behavior. May be a list, in
+          which case one periodic component is created for each element.
+      moving_average_order: The number of moving average coefficients to use,
+          which also defines the number of steps after which transient
+          deviations revert to the mean defined by periodic and level/trend
+          components.
+      autoregressive_order: The number of steps back for autoregression.
+      use_level_noise: Whether to model the time series as having level
+          noise. See level_noise in the model description above.
+      configuration: A StateSpaceModelConfiguration object.
+    """
+    component_model_configuration = configuration._replace(
+        use_observation_noise=False)
+    univariate_component_model_configuration = (
+        component_model_configuration._replace(
+            num_features=1))
+
+    adder_part = _replicate_level_trend_models(
+        multivariate_configuration=component_model_configuration,
+        univariate_configuration=univariate_component_model_configuration)
+    with variable_scope.variable_scope("varma"):
+      varma_part = varma.VARMA(
+          autoregressive_order=autoregressive_order,
+          moving_average_order=moving_average_order,
+          configuration=component_model_configuration)
+
+    cycle_parts = []
+    periodicity_list = nest.flatten(periodicities)
+    for cycle_number, cycle_periodicity in enumerate(periodicity_list):
+      # For each specified periodicity, construct models for each feature with
+      # correlated noise.
+      with variable_scope.variable_scope("cycle{}".format(cycle_number)):
+        cycle_features = []
+        for feature in range(configuration.num_features):
+          with variable_scope.variable_scope("feature{}".format(feature)):
+            cycle_features.append(periodic.CycleStateSpaceModel(
+                periodicity=cycle_periodicity,
+                configuration=univariate_component_model_configuration))
+        cycle_parts.append(
+            state_space_model.StateSpaceCorrelatedFeaturesEnsemble(
+                ensemble_members=cycle_features,
+                configuration=component_model_configuration))
+
+    super(StructuralEnsemble, self).__init__(
+        ensemble_members=[adder_part, varma_part] + cycle_parts,
+        configuration=configuration)
+
+
+# TODO(allenl): Implement a multi-resolution moving average component to
+# decouple model size from the length of transient deviations.
+class MultiResolutionStructuralEnsemble(
+    state_space_model.StateSpaceIndependentEnsemble):
+  """A structural ensemble modeling arbitrary periods with a fixed model size.
+
+  See periodic.ResolutionCycleModel, which allows a fixed number of latent
+  values to cycle at multiple/variable resolutions, for more details on the
+  difference between MultiResolutionStructuralEnsemble and
+  StructuralEnsemble. With `cycle_num_latent_values` (controlling model size)
+  equal to `periodicities` (controlling the time over which these values
+  complete a full cycle), the models are
+  equivalent. MultiResolutionStructuralEnsemble allows `periodicities` to vary
+  while the model size remains fixed. Note that high `periodicities` without a
+  correspondingly high `cycle_num_latent_values` means that the modeled series
+  must have a relatively smooth periodic component.
+
+  Multiple features are handled the same way as in StructuralEnsemble (one
+  replication per feature, with correlations learned between the replicated
+  models). This strategy produces a very flexible model, but means that series
+  with many features may be slow to train.
+
+  Model size (the state dimension) is:
+    num_features * (sum(cycle_num_latent_values)
+      + max(moving_average_order + 1, autoregressive_order) + 2)
+  """
+
+  def __init__(self,
+               cycle_num_latent_values,
+               moving_average_order,
+               autoregressive_order,
+               periodicities,
+               use_level_noise=True,
+               configuration=state_space_model.StateSpaceModelConfiguration()):
+    """Initialize the multi-resolution structural ensemble.
+
+    Args:
+      cycle_num_latent_values: Controls the model size and the number of latent
+          values cycled between (but not the periods over which they cycle).
+          Reducing this parameter can save significant amounts of memory, but
+          the tradeoff is with resolution: cycling between a smaller number of
+          latent values means that only smoother functions can be modeled. For
+          multivariate series, may either be a scalar integer (in which case it
+          is applied to all periodic components) or a list with length matching
+          `periodicities`.
+      moving_average_order: The number of moving average coefficients to use,
+          which also defines the number of steps after which transient
+          deviations revert to the mean defined by periodic and level/trend
+          components. Adds to model size.
+      autoregressive_order: The number of steps back for
+          autoregression. Learning autoregressive coefficients typically
+          requires more steps and a smaller step size than other components.
+      periodicities: Same meaning as for StructuralEnsemble: number of steps for
+          cyclic behavior. Floating point and Tensor values are supported. May
+          be a list of values, in which case one component is created for each
+          periodicity. If `periodicities` is a list while
+          `cycle_num_latent_values` is a scalar, its value is broadcast to each
+          periodic component. Otherwise they should be lists of the same length,
+          in which case they are paired.
+      use_level_noise: See StructuralEnsemble.
+      configuration: A StateSpaceModelConfiguration object.
+    Raises:
+      ValueError: If `cycle_num_latent_values` is neither a scalar nor agrees in
+          size with `periodicities`.
+    """
+    component_model_configuration = configuration._replace(
+        use_observation_noise=False)
+    univariate_component_model_configuration = (
+        component_model_configuration._replace(
+            num_features=1))
+
+    adder_part = _replicate_level_trend_models(
+        multivariate_configuration=component_model_configuration,
+        univariate_configuration=univariate_component_model_configuration)
+    with variable_scope.variable_scope("varma"):
+      varma_part = varma.VARMA(
+          autoregressive_order=autoregressive_order,
+          moving_average_order=moving_average_order,
+          configuration=component_model_configuration)
+
+    cycle_parts = []
+    if periodicities is None:
+      periodicities = []
+    periodicity_list = nest.flatten(periodicities)
+    latent_values_list = nest.flatten(cycle_num_latent_values)
+    if len(periodicity_list) != len(latent_values_list):
+      if len(latent_values_list) != 1:
+        raise ValueError(
+            ("`cycle_num_latent_values` must either be a list with the same "
+             "size as `periodicity` or a scalar. Received length {} "
+             "`cycle_num_latent_values`, while `periodicities` has length {}.")
+            .format(len(latent_values_list), len(periodicity_list)))
+      latent_values_list *= len(periodicity_list)
+    for cycle_number, (cycle_periodicity, num_latent_values) in enumerate(
+        zip(periodicity_list, latent_values_list)):
+      with variable_scope.variable_scope("cycle{}".format(cycle_number)):
+        cycle_features = []
+        for feature in range(configuration.num_features):
+          with variable_scope.variable_scope("feature{}".format(feature)):
+            cycle_features.append(
+                periodic.ResolutionCycleModel(
+                    num_latent_values=num_latent_values,
+                    periodicity=cycle_periodicity,
+                    configuration=univariate_component_model_configuration))
+        cycle_parts.append(
+            state_space_model.StateSpaceCorrelatedFeaturesEnsemble(
+                ensemble_members=cycle_features,
+                configuration=component_model_configuration))
+
+    super(MultiResolutionStructuralEnsemble, self).__init__(
+        ensemble_members=[adder_part, varma_part] + cycle_parts,
+        configuration=configuration)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/structural_ensemble_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/structural_ensemble_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5f6a5341e89ecc50a51950b1f55bb7c4c7206fd
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/structural_ensemble_test.py
@@ -0,0 +1,152 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the structural state space ensembles."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy
+
+from tensorflow.contrib import layers
+from tensorflow.contrib.layers.python.layers import feature_column
+
+from tensorflow.contrib.timeseries.python.timeseries import estimators
+from tensorflow.contrib.timeseries.python.timeseries import input_pipeline
+from tensorflow.contrib.timeseries.python.timeseries.feature_keys import TrainEvalFeatures
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import state_space_model
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import structural_ensemble
+
+from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import test
+
+
+class StructuralEnsembleEstimatorTests(test.TestCase):
+
+  def simple_data(self, sample_every, dtype, period, num_samples, num_features):
+    time = sample_every * numpy.arange(num_samples)
+    noise = numpy.random.normal(
+        scale=0.01, size=[num_samples, num_features])
+    values = noise + numpy.sin(
+        numpy.arange(num_features)[None, ...]
+        + time[..., None] / float(period) * 2.0 * numpy.pi).astype(
+            dtype.as_numpy_dtype)
+    return {TrainEvalFeatures.TIMES: numpy.reshape(time, [1, -1]),
+            TrainEvalFeatures.VALUES: numpy.reshape(
+                values, [1, -1, num_features])}
+
+  def dry_run_train_helper(
+      self, sample_every, period, num_samples, model_type, model_args,
+      num_features=1):
+    numpy.random.seed(1)
+    dtype = dtypes.float32
+    features = self.simple_data(
+        sample_every, dtype=dtype, period=period, num_samples=num_samples,
+        num_features=num_features)
+    model = model_type(
+        configuration=(
+            state_space_model.StateSpaceModelConfiguration(
+                num_features=num_features,
+                dtype=dtype,
+                covariance_prior_fn=lambda _: 0.)),
+        **model_args)
+
+    class _RunConfig(estimator_lib.RunConfig):
+
+      @property
+      def tf_random_seed(self):
+        return 4
+
+    estimator = estimators.StateSpaceRegressor(model, config=_RunConfig())
+    train_input_fn = input_pipeline.RandomWindowInputFn(
+        input_pipeline.NumpyReader(features), num_threads=1, shuffle_seed=1,
+        batch_size=16, window_size=16)
+    eval_input_fn = input_pipeline.WholeDatasetInputFn(
+        input_pipeline.NumpyReader(features))
+    estimator.train(input_fn=train_input_fn, max_steps=1)
+    first_evaluation = estimator.evaluate(input_fn=eval_input_fn, steps=1)
+    estimator.train(input_fn=train_input_fn, max_steps=3)
+    second_evaluation = estimator.evaluate(input_fn=eval_input_fn, steps=1)
+    self.assertLess(second_evaluation["loss"], first_evaluation["loss"])
+
+  def test_structural_multivariate(self):
+    self.dry_run_train_helper(
+        sample_every=3,
+        period=5,
+        num_samples=100,
+        num_features=3,
+        model_type=structural_ensemble.StructuralEnsemble,
+        model_args={
+            "periodicities": 2,
+            "moving_average_order": 2,
+            "autoregressive_order": 1
+        })
+
+  def test_exogenous_input(self):
+    """Test that no errors are raised when using exogenous features."""
+    dtype = dtypes.float64
+    times = [1, 2, 3, 4, 5, 6]
+    values = [[0.01], [5.10], [5.21], [0.30], [5.41], [0.50]]
+    feature_a = [["off"], ["on"], ["on"], ["off"], ["on"], ["off"]]
+    sparse_column_a = feature_column.sparse_column_with_keys(
+        column_name="feature_a", keys=["on", "off"])
+    one_hot_a = layers.one_hot_column(sparse_id_column=sparse_column_a)
+    regressor = estimators.StructuralEnsembleRegressor(
+        periodicities=[],
+        num_features=1,
+        moving_average_order=0,
+        exogenous_feature_columns=[one_hot_a],
+        dtype=dtype)
+    features = {TrainEvalFeatures.TIMES: times,
+                TrainEvalFeatures.VALUES: values,
+                "feature_a": feature_a}
+    train_input_fn = input_pipeline.RandomWindowInputFn(
+        input_pipeline.NumpyReader(features),
+        window_size=6, batch_size=1)
+    regressor.train(input_fn=train_input_fn, steps=1)
+    eval_input_fn = input_pipeline.WholeDatasetInputFn(
+        input_pipeline.NumpyReader(features))
+    evaluation = regressor.evaluate(input_fn=eval_input_fn, steps=1)
+    predict_input_fn = input_pipeline.predict_continuation_input_fn(
+        evaluation, times=[[7, 8, 9]],
+        exogenous_features={"feature_a": [[["on"], ["off"], ["on"]]]})
+    regressor.predict(input_fn=predict_input_fn)
+
+  def test_no_periodicity(self):
+    """Test that no errors are raised when periodicites is None."""
+    dtype = dtypes.float64
+    times = [1, 2, 3, 4, 5, 6]
+    values = [[0.01], [5.10], [5.21], [0.30], [5.41], [0.50]]
+    regressor = estimators.StructuralEnsembleRegressor(
+        periodicities=None,
+        num_features=1,
+        moving_average_order=0,
+        dtype=dtype)
+    features = {TrainEvalFeatures.TIMES: times,
+                TrainEvalFeatures.VALUES: values}
+    train_input_fn = input_pipeline.RandomWindowInputFn(
+        input_pipeline.NumpyReader(features),
+        window_size=6, batch_size=1)
+    regressor.train(input_fn=train_input_fn, steps=1)
+    eval_input_fn = input_pipeline.WholeDatasetInputFn(
+        input_pipeline.NumpyReader(features))
+    evaluation = regressor.evaluate(input_fn=eval_input_fn, steps=1)
+    predict_input_fn = input_pipeline.predict_continuation_input_fn(
+        evaluation, times=[[7, 8, 9]])
+    regressor.predict(input_fn=predict_input_fn)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/test_utils.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f127700d99f1a9cf2549e2fdb57ce6090440ac7
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/test_utils.py
@@ -0,0 +1,108 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for testing state space models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy
+
+from tensorflow.contrib.timeseries.python.timeseries import math_utils
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+
+
+def transition_power_test_template(test_case, model, num_steps):
+  """Tests the transition_to_powers function of a state space model."""
+  transition_matrix = ops.convert_to_tensor(
+      model.get_state_transition(), dtype=model.dtype)
+  step_number = array_ops.placeholder(shape=[], dtype=dtypes.int64)
+  state_dimension = transition_matrix.get_shape()[0].value
+  previous_matrix = array_ops.placeholder(
+      shape=[state_dimension, state_dimension], dtype=transition_matrix.dtype)
+  true_single_step_update = math_ops.matmul(previous_matrix,
+                                            transition_matrix)
+  model_output_tensor = model.transition_to_powers(powers=array_ops.stack(
+      [step_number, step_number]))
+  with test_case.test_session():
+    starting_matrix = linalg_ops.eye(
+        state_dimension, batch_shape=array_ops.shape(num_steps)).eval()
+    evaled_current_matrix = starting_matrix
+    for iteration_number in range(num_steps):
+      model_output = model_output_tensor.eval(
+          feed_dict={step_number: iteration_number})
+      test_case.assertAllClose(
+          evaled_current_matrix,
+          model_output[0],
+          rtol=1e-8 if evaled_current_matrix.dtype == numpy.float64 else 1e-4)
+      evaled_current_matrix = true_single_step_update.eval(
+          feed_dict={previous_matrix: evaled_current_matrix})
+
+
+def noise_accumulator_test_template(test_case, model, num_steps):
+  """Tests `model`'s transition_power_noise_accumulator."""
+  transition_matrix = ops.convert_to_tensor(
+      model.get_state_transition(), dtype=model.dtype)
+  noise_transform = ops.convert_to_tensor(
+      model.get_noise_transform(), dtype=model.dtype)
+  state_dimension = transition_matrix.get_shape()[0].value
+  state_noise_dimension = noise_transform.get_shape()[1].value
+  gen_noise_addition = math_utils.sign_magnitude_positive_definite(
+      raw=random_ops.random_normal(
+          shape=[state_noise_dimension, state_noise_dimension],
+          dtype=model.dtype))
+  gen_starting_noise = math_utils.sign_magnitude_positive_definite(
+      random_ops.random_normal(
+          shape=[state_dimension, state_dimension], dtype=model.dtype))
+  starting_noise = array_ops.placeholder(
+      shape=[state_dimension, state_dimension], dtype=model.dtype)
+  step_number = array_ops.placeholder(shape=[], dtype=dtypes.int64)
+  starting_transitioned = math_ops.matmul(
+      math_ops.matmul(transition_matrix, starting_noise),
+      transition_matrix,
+      adjoint_b=True)
+  with test_case.test_session():
+    evaled_starting_noise = gen_starting_noise.eval()
+    current_starting_noise_transitioned = evaled_starting_noise
+    current_noise = evaled_starting_noise
+    evaled_noise_addition = gen_noise_addition.eval()
+    evaled_noise_addition_transformed = math_ops.matmul(
+        math_ops.matmul(noise_transform, evaled_noise_addition),
+        noise_transform,
+        adjoint_b=True).eval()
+    model.state_transition_noise_covariance = evaled_noise_addition
+    model._window_initializer(  # pylint: disable=protected-access
+        times=math_ops.range(num_steps + 1)[..., None], state=(None, None, 0))
+    model_update = model.transition_power_noise_accumulator(
+        num_steps=step_number)
+    for iteration_number in range(num_steps):
+      model_new_noise = model_update.eval(
+          feed_dict={step_number: iteration_number})
+      test_case.assertAllClose(
+          current_noise,
+          model_new_noise + current_starting_noise_transitioned,
+          rtol=1e-8 if current_noise.dtype == numpy.float64 else 1e-3)
+      current_starting_noise_transitioned = starting_transitioned.eval(
+          feed_dict={starting_noise: current_starting_noise_transitioned})
+      current_noise = (
+          starting_transitioned.eval(
+              feed_dict={starting_noise: current_noise})
+          + evaled_noise_addition_transformed)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma.py
new file mode 100644
index 0000000000000000000000000000000000000000..110ba9738f8c28109282b927fd07ade071bb3e4a
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma.py
@@ -0,0 +1,197 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Multivariate autoregressive model (vector autoregression).
+
+Implements the following model (num_blocks = max(ar_order, ma_order + 1)):
+
+  y(t, 1) = \sum_{i=1}^{ar_order} ar_coefs[i] * y(t - 1, i)
+  y(t, i) = y(t - 1, i - 1) + ma_coefs[i - 1] * e(t) for 1 < i < num_blocks
+  y(t, num_blocks) = y(t - 1, num_blocks - 1) + e(t)
+
+Where e(t) are Gaussian with zero mean and learned covariance.
+
+Each element of ar_coefs and ma_coefs is a [num_features x num_features]
+matrix. Each y(t, i) is a vector of length num_features. Indices in the above
+equations are one-based. Initial conditions y(0, i) come from prior state (which
+may either be learned or left as a constant with high prior covariance).
+
+If ar_order > ma_order, the observation model is:
+  y(t, 1) + observation_noise(t)
+
+If ma_order >= ar_order, it is (to observe the moving average component):
+  y(t, 1) + y(t, num_blocks) + observation_noise(t)
+
+Where observation_noise(t) are Gaussian with zero mean and learned covariance.
+
+This implementation uses a formulation which puts all of the autoregressive
+coefficients in the transition equation for the observed component, which
+enables learning using truncated backpropagation. Noise is not applied directly
+to the observed component (with the exception of standard observation noise),
+which further aids learning of the autoregressive coefficients when VARMA is in
+an ensemble with other models (in which case having an observation noise term is
+usually unavoidable).
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.timeseries.python.timeseries import math_utils
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import state_space_model
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+
+
+class VARMA(state_space_model.StateSpaceModel):
+  """A VARMA model implementation as a special case of the state space model."""
+
+  def __init__(self,
+               autoregressive_order,
+               moving_average_order,
+               configuration=state_space_model.StateSpaceModelConfiguration()):
+    """Construct a VARMA model.
+
+    The size of the latent state for this model is:
+      num_features * max(autoregressive_order, moving_average_order + 1)
+    Square matrices of this size are constructed and multiplied.
+
+    Args:
+      autoregressive_order: The maximum autoregressive lag.
+      moving_average_order: The maximum moving average lag, after which
+        transient deviations are expected to return to their long-term mean.
+      configuration: A StateSpaceModelConfiguration object.
+    """
+    self.ar_order = autoregressive_order
+    self.ma_order = moving_average_order
+    self.state_num_blocks = max(autoregressive_order, moving_average_order + 1)
+    super(VARMA, self).__init__(configuration=configuration)
+    self.state_dimension = self.state_num_blocks * self.num_features
+
+  def _define_parameters(self, observation_transition_tradeoff_log=None):
+    with variable_scope.variable_scope(self._variable_scope):
+      # TODO(allenl): Evaluate parameter transformations for AR/MA coefficients
+      # which improve interpretability/stability.
+      self.ar_coefs = variable_scope.get_variable(
+          name="ar_coefs",
+          shape=[self.num_features, self.num_features, self.ar_order],
+          dtype=self.dtype,
+          initializer=init_ops.zeros_initializer())
+      self.ma_coefs = variable_scope.get_variable(
+          name="ma_coefs",
+          initializer=array_ops.tile(
+              linalg_ops.eye(self.num_features, dtype=self.dtype)[None, :, :],
+              [self.ma_order, 1, 1]),
+          dtype=self.dtype)
+    super(VARMA, self)._define_parameters(
+        observation_transition_tradeoff_log=observation_transition_tradeoff_log)
+
+  def get_state_transition(self):
+    """Construct state transition matrix from VARMA parameters.
+
+    Returns:
+      the state transition matrix. It has shape
+        [self.state_dimendion, self.state_dimension].
+    """
+    # Pad any unused AR blocks with zeros. The extra state is necessary if
+    # ma_order >= ar_order.
+    ar_coefs_padded = array_ops.reshape(
+        array_ops.pad(self.ar_coefs,
+                      [[0, 0], [0, 0],
+                       [0, self.state_num_blocks - self.ar_order]]),
+        [self.num_features, self.state_dimension])
+    shift_matrix = array_ops.pad(
+        linalg_ops.eye(
+            (self.state_num_blocks - 1) * self.num_features, dtype=self.dtype),
+        [[0, 0], [0, self.num_features]])
+    return array_ops.concat([ar_coefs_padded, shift_matrix], axis=0)
+
+  def get_noise_transform(self):
+    """Construct state noise transform matrix from VARMA parameters.
+
+    Returns:
+      the state noise transform matrix. It has shape
+        [self.state_dimendion, self.num_features].
+    """
+    # Noise is broadcast, through the moving average coefficients, to
+    # un-observed parts of the latent state.
+    ma_coefs_padded = array_ops.reshape(
+        array_ops.pad(self.ma_coefs,
+                      [[self.state_num_blocks - 1 - self.ma_order, 0], [0, 0],
+                       [0, 0]]),
+        [(self.state_num_blocks - 1) * self.num_features, self.num_features],
+        name="noise_transform")
+    # Deterministically apply noise to the oldest component.
+    return array_ops.concat(
+        [ma_coefs_padded,
+         linalg_ops.eye(self.num_features, dtype=self.dtype)],
+        axis=0)
+
+  def get_observation_model(self, times):
+    """Construct observation model matrix from VARMA parameters.
+
+    Args:
+      times: A [batch size] vector indicating the times observation models are
+          requested for. Unused.
+    Returns:
+      the observation model matrix. It has shape
+        [self.num_features, self.state_dimension].
+    """
+    del times  # StateSpaceModel will broadcast along the batch dimension
+    if self.ar_order > self.ma_order or self.state_num_blocks < 2:
+      return array_ops.pad(
+          linalg_ops.eye(self.num_features, dtype=self.dtype),
+          [[0, 0], [0, self.num_features * (self.state_num_blocks - 1)]],
+          name="observation_model")
+    else:
+      # Add a second observed component which "catches" the accumulated moving
+      # average errors as they reach the end of the state. If ar_order >
+      # ma_order, this is unnecessary, since accumulated errors cycle naturally.
+      return array_ops.concat(
+          [
+              array_ops.pad(
+                  linalg_ops.eye(self.num_features, dtype=self.dtype),
+                  [[0, 0], [0,
+                            self.num_features * (self.state_num_blocks - 2)]]),
+              linalg_ops.eye(self.num_features, dtype=self.dtype)
+          ],
+          axis=1,
+          name="observation_model")
+
+  def get_state_transition_noise_covariance(
+      self, minimum_initial_variance=1e-5):
+    # Most state space models use only an explicit observation noise term to
+    # model deviations from expectations, and so a low initial transition noise
+    # parameter is helpful there. Since deviations from expectations are also
+    # modeled as transition noise in VARMA, we set its initial value based on a
+    # slight over-estimate empirical observation noise.
+    if self._input_statistics is not None:
+      feature_variance = self._input_statistics.series_start_moments.variance
+      initial_transition_noise_scale = math_ops.log(
+          math_ops.maximum(
+              math_ops.reduce_mean(feature_variance), minimum_initial_variance))
+    else:
+      initial_transition_noise_scale = 0.
+    state_noise_transform = ops.convert_to_tensor(
+        self.get_noise_transform(), dtype=self.dtype)
+    state_noise_dimension = state_noise_transform.get_shape()[1].value
+    return math_utils.variable_covariance_matrix(
+        state_noise_dimension, "state_transition_noise",
+        dtype=self.dtype,
+        initial_overall_scale_log=initial_transition_noise_scale)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..84885d5c9acbb84ed83a2d90b00816c756468118
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma_test.py
@@ -0,0 +1,94 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for VARMA.
+
+Tests VARMA model building and utility functions.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.timeseries.python.timeseries.feature_keys import TrainEvalFeatures
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import state_space_model
+from tensorflow.contrib.timeseries.python.timeseries.state_space_models import varma
+
+from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class MakeModelTest(test.TestCase):
+
+  def test_ar_smaller(self):
+    model = varma.VARMA(
+        autoregressive_order=0,
+        moving_average_order=3)
+    model.initialize_graph()
+    outputs = model.define_loss(
+        features={
+            TrainEvalFeatures.TIMES: constant_op.constant([[1, 2]]),
+            TrainEvalFeatures.VALUES: constant_op.constant([[[1.], [2.]]])
+        },
+        mode=estimator_lib.ModeKeys.TRAIN)
+    initializer = variables.global_variables_initializer()
+    with self.test_session() as sess:
+      sess.run([initializer])
+      outputs.loss.eval()
+
+  def test_ma_smaller(self):
+    model = varma.VARMA(
+        autoregressive_order=6,
+        moving_average_order=3,
+        configuration=state_space_model.StateSpaceModelConfiguration(
+            num_features=7))
+    model.initialize_graph()
+    outputs = model.define_loss(
+        features={
+            TrainEvalFeatures.TIMES: constant_op.constant([[1, 2]]),
+            TrainEvalFeatures.VALUES: constant_op.constant(
+                [[[1.] * 7, [2.] * 7]])
+        },
+        mode=estimator_lib.ModeKeys.TRAIN)
+    initializer = variables.global_variables_initializer()
+    with self.test_session() as sess:
+      sess.run([initializer])
+      outputs.loss.eval()
+
+  def test_make_ensemble_no_errors(self):
+    with variable_scope.variable_scope("model_one"):
+      model_one = varma.VARMA(10, 5)
+    with variable_scope.variable_scope("model_two"):
+      model_two = varma.VARMA(0, 3)
+    configuration = state_space_model.StateSpaceModelConfiguration()
+    ensemble = state_space_model.StateSpaceIndependentEnsemble(
+        ensemble_members=[model_one, model_two],
+        configuration=configuration)
+    ensemble.initialize_graph()
+    outputs = ensemble.define_loss(
+        features={
+            TrainEvalFeatures.TIMES: constant_op.constant([[1, 2]]),
+            TrainEvalFeatures.VALUES: constant_op.constant([[[1.], [2.]]])},
+        mode=estimator_lib.ModeKeys.TRAIN)
+    initializer = variables.global_variables_initializer()
+    with self.test_session() as sess:
+      sess.run([initializer])
+      outputs.loss.eval()
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/test_utils.py b/tensorflow/contrib/timeseries/python/timeseries/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed02960faeff5617ddbab7c80c1a07ebfe271470
--- /dev/null
+++ b/tensorflow/contrib/timeseries/python/timeseries/test_utils.py
@@ -0,0 +1,282 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for testing time series models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.timeseries.python.timeseries import estimators
+from tensorflow.contrib.timeseries.python.timeseries import input_pipeline
+from tensorflow.contrib.timeseries.python.timeseries import state_management
+from tensorflow.contrib.timeseries.python.timeseries.feature_keys import TrainEvalFeatures
+
+from tensorflow.python.client import session
+from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import adam
+from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.training import coordinator as coordinator_lib
+from tensorflow.python.training import queue_runner_impl
+from tensorflow.python.util import nest
+
+
+class AllWindowInputFn(input_pipeline.TimeSeriesInputFn):
+  """Returns all contiguous windows of data from a full dataset.
+
+  In contrast to WholeDatasetInputFn, which does basic shape checking but
+  maintains the flat sequencing of data, this `TimeSeriesInputFn` creates
+  batches of windows. However, unlike `RandomWindowInputFn` these windows are
+  deterministic, starting at every possible offset (i.e. batches of size
+  series_length - window_size + 1 are produced).
+  """
+
+  def __init__(self, time_series_reader, window_size):
+    """Initialize the input_pipeline.
+
+    Args:
+      time_series_reader: A `input_pipeline.TimeSeriesReader` object.
+      window_size: The size of contiguous windows of data to produce.
+    """
+    self._window_size = window_size
+    self._reader = time_series_reader
+    super(AllWindowInputFn, self).__init__()
+
+  def create_batch(self):
+    features = self._reader.read_full()
+    times = features[TrainEvalFeatures.TIMES]
+    num_windows = array_ops.shape(times)[0] - self._window_size + 1
+    indices = array_ops.reshape(math_ops.range(num_windows), [num_windows, 1])
+    # indices contains the starting point for each window. We now extend these
+    # indices to include the elements inside the windows as well by doing a
+    # broadcast addition.
+    increments = array_ops.reshape(math_ops.range(self._window_size), [1, -1])
+    all_indices = array_ops.reshape(indices + increments, [-1])
+    # Select the appropriate elements in the batch and reshape the output to 3D.
+    features = {
+        key: array_ops.reshape(
+            array_ops.gather(value, all_indices),
+            array_ops.concat(
+                [[num_windows, self._window_size], array_ops.shape(value)[1:]],
+                axis=0))
+        for key, value in features.items()
+    }
+    return (features, None)
+
+
+class _SavingTensorHook(basic_session_run_hooks.LoggingTensorHook):
+  """A hook to save Tensors during training."""
+
+  def __init__(self, tensors, every_n_iter=None, every_n_secs=None):
+    self.tensor_values = {}
+    super(_SavingTensorHook, self).__init__(
+        tensors=tensors, every_n_iter=every_n_iter,
+        every_n_secs=every_n_secs)
+
+  def after_run(self, run_context, run_values):
+    del run_context
+    if self._should_trigger:
+      for tag in self._current_tensors.keys():
+        self.tensor_values[tag] = run_values.results[tag]
+      self._timer.update_last_triggered_step(self._iter_count)
+    self._iter_count += 1
+
+
+def _train_on_generated_data(
+    generate_fn, generative_model, train_iterations, seed,
+    learning_rate=0.1, ignore_params_fn=lambda _: (),
+    derived_param_test_fn=lambda _: (),
+    train_input_fn_type=input_pipeline.WholeDatasetInputFn,
+    train_state_manager=state_management.PassthroughStateManager()):
+  """The training portion of parameter recovery tests."""
+  random_seed.set_random_seed(seed)
+  generate_graph = ops.Graph()
+  with generate_graph.as_default():
+    with session.Session(graph=generate_graph):
+      generative_model.initialize_graph()
+      time_series_reader, true_parameters = generate_fn(generative_model)
+      true_parameters = {
+          tensor.name: value for tensor, value in true_parameters.items()}
+  eval_input_fn = input_pipeline.WholeDatasetInputFn(time_series_reader)
+  eval_state_manager = state_management.PassthroughStateManager()
+  true_parameter_eval_graph = ops.Graph()
+  with true_parameter_eval_graph.as_default():
+    generative_model.initialize_graph()
+    ignore_params = ignore_params_fn(generative_model)
+    feature_dict, _ = eval_input_fn()
+    eval_state_manager.initialize_graph(generative_model)
+    feature_dict[TrainEvalFeatures.VALUES] = math_ops.cast(
+        feature_dict[TrainEvalFeatures.VALUES], generative_model.dtype)
+    model_outputs = eval_state_manager.define_loss(
+        model=generative_model,
+        features=feature_dict,
+        mode=estimator_lib.ModeKeys.EVAL)
+    with session.Session(graph=true_parameter_eval_graph) as sess:
+      variables.global_variables_initializer().run()
+      coordinator = coordinator_lib.Coordinator()
+      queue_runner_impl.start_queue_runners(sess, coord=coordinator)
+      true_param_loss = model_outputs.loss.eval(feed_dict=true_parameters)
+      true_transformed_params = {
+          param: param.eval(feed_dict=true_parameters)
+          for param in derived_param_test_fn(generative_model)}
+      coordinator.request_stop()
+      coordinator.join()
+
+  saving_hook = _SavingTensorHook(
+      tensors=true_parameters.keys(),
+      every_n_iter=train_iterations - 1)
+
+  class _RunConfig(estimator_lib.RunConfig):
+
+    @property
+    def tf_random_seed(self):
+      return seed
+
+  estimator = estimators.TimeSeriesRegressor(
+      model=generative_model,
+      config=_RunConfig(),
+      state_manager=train_state_manager,
+      optimizer=adam.AdamOptimizer(learning_rate))
+  train_input_fn = train_input_fn_type(time_series_reader=time_series_reader)
+  trained_loss = (estimator.train(
+      input_fn=train_input_fn,
+      max_steps=train_iterations,
+      hooks=[saving_hook]).evaluate(
+          input_fn=eval_input_fn, steps=1))["loss"]
+  logging.info("Final trained loss: %f", trained_loss)
+  logging.info("True parameter loss: %f", true_param_loss)
+  return (ignore_params, true_parameters, true_transformed_params,
+          trained_loss, true_param_loss, saving_hook,
+          true_parameter_eval_graph)
+
+
+def test_parameter_recovery(
+    generate_fn, generative_model, train_iterations, test_case, seed,
+    learning_rate=0.1, rtol=0.2, atol=0.1, train_loss_tolerance_coeff=0.99,
+    ignore_params_fn=lambda _: (),
+    derived_param_test_fn=lambda _: (),
+    train_input_fn_type=input_pipeline.WholeDatasetInputFn,
+    train_state_manager=state_management.PassthroughStateManager()):
+  """Test that a generative model fits generated data.
+
+  Args:
+    generate_fn: A function taking a model and returning a `TimeSeriesReader`
+        object and dictionary mapping parameters to their
+        values. model.initialize_graph() will have been called on the model
+        before it is passed to this function.
+    generative_model: A timeseries.model.TimeSeriesModel instance to test.
+    train_iterations: Number of training steps.
+    test_case: A tf.test.TestCase to run assertions on.
+    seed: Same as for TimeSeriesModel.unconditional_generate().
+    learning_rate: Step size for optimization.
+    rtol: Relative tolerance for tests.
+    atol: Absolute tolerance for tests.
+    train_loss_tolerance_coeff: Trained loss times this value must be less
+        than the loss evaluated using the generated parameters.
+    ignore_params_fn: Function mapping from a Model to a list of parameters
+        which are not tested for accurate recovery.
+    derived_param_test_fn: Function returning a list of derived parameters
+        (Tensors) which are checked for accurate recovery (comparing the value
+        evaluated with trained parameters to the value under the true
+        parameters).
+
+        As an example, for VARMA, in addition to checking AR and MA parameters,
+        this function can be used to also check lagged covariance. See
+        varma_ssm.py for details.
+    train_input_fn_type: The `TimeSeriesInputFn` type to use when training
+        (likely `WholeDatasetInputFn` or `RandomWindowInputFn`). If None, use
+        `WholeDatasetInputFn`.
+    train_state_manager: The state manager to use when training (likely
+        `PassthroughStateManager` or `ChainingStateManager`). If None, use
+        `PassthroughStateManager`.
+  """
+  (ignore_params, true_parameters, true_transformed_params,
+   trained_loss, true_param_loss, saving_hook, true_parameter_eval_graph
+  ) = _train_on_generated_data(
+      generate_fn=generate_fn, generative_model=generative_model,
+      train_iterations=train_iterations, seed=seed, learning_rate=learning_rate,
+      ignore_params_fn=ignore_params_fn,
+      derived_param_test_fn=derived_param_test_fn,
+      train_input_fn_type=train_input_fn_type,
+      train_state_manager=train_state_manager)
+  trained_parameter_substitutions = {}
+  for param in true_parameters.keys():
+    evaled_value = saving_hook.tensor_values[param]
+    trained_parameter_substitutions[param] = evaled_value
+    true_value = true_parameters[param]
+    logging.info("True %s: %s, learned: %s",
+                 param, true_value, evaled_value)
+  with session.Session(graph=true_parameter_eval_graph):
+    for transformed_param, true_value in true_transformed_params.items():
+      trained_value = transformed_param.eval(
+          feed_dict=trained_parameter_substitutions)
+      logging.info("True %s [transformed parameter]: %s, learned: %s",
+                   transformed_param, true_value, trained_value)
+      test_case.assertAllClose(true_value, trained_value,
+                               rtol=rtol, atol=atol)
+
+  if ignore_params is None:
+    ignore_params = []
+  else:
+    ignore_params = nest.flatten(ignore_params)
+  ignore_params = [tensor.name for tensor in ignore_params]
+  if trained_loss > 0:
+    test_case.assertLess(trained_loss * train_loss_tolerance_coeff,
+                         true_param_loss)
+  else:
+    test_case.assertLess(trained_loss / train_loss_tolerance_coeff,
+                         true_param_loss)
+  for param in true_parameters.keys():
+    if param in ignore_params:
+      continue
+    evaled_value = saving_hook.tensor_values[param]
+    true_value = true_parameters[param]
+    test_case.assertAllClose(true_value, evaled_value,
+                             rtol=rtol, atol=atol)
+
+
+def parameter_recovery_dry_run(
+    generate_fn, generative_model, seed,
+    learning_rate=0.1,
+    train_input_fn_type=input_pipeline.WholeDatasetInputFn,
+    train_state_manager=state_management.PassthroughStateManager()):
+  """Test that a generative model can train on generated data.
+
+  Args:
+    generate_fn: A function taking a model and returning a
+        `input_pipeline.TimeSeriesReader` object and a dictionary mapping
+        parameters to their values. model.initialize_graph() will have been
+        called on the model before it is passed to this function.
+    generative_model: A timeseries.model.TimeSeriesModel instance to test.
+    seed: Same as for TimeSeriesModel.unconditional_generate().
+    learning_rate: Step size for optimization.
+    train_input_fn_type: The type of `TimeSeriesInputFn` to use when training
+        (likely `WholeDatasetInputFn` or `RandomWindowInputFn`). If None, use
+        `WholeDatasetInputFn`.
+    train_state_manager: The state manager to use when training (likely
+        `PassthroughStateManager` or `ChainingStateManager`). If None, use
+        `PassthroughStateManager`.
+  """
+  _train_on_generated_data(
+      generate_fn=generate_fn, generative_model=generative_model,
+      seed=seed, learning_rate=learning_rate,
+      train_input_fn_type=train_input_fn_type,
+      train_state_manager=train_state_manager,
+      train_iterations=2)
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..69e91d05b346307568f73cd85a29749be9dc978f
--- /dev/null
+++ b/tensorflow/contrib/tpu/BUILD
@@ -0,0 +1,259 @@
+# Description: Operations defined for Cloud TPUs
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_custom_op_library",
+    "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+)
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+package(
+    default_visibility = [
+        "//learning/brain:__subpackages__",
+        "//tensorflow:__subpackages__",
+    ],
+)
+
+cc_library(
+    name = "all_ops",
+    deps = [
+        ":cross_replica_ops_op_lib",
+        ":infeed_ops_op_lib",
+        ":outfeed_ops_op_lib",
+        ":replication_ops_op_lib",
+        ":tpu_configuration_ops_op_lib",
+        ":tpu_sendrecv_ops_op_lib",
+    ],
+)
+
+py_library(
+    name = "tpu_estimator",
+    srcs = [
+        "python/tpu/tpu_config.py",
+        "python/tpu/tpu_estimator.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tpu",
+        ":tpu_feed",
+        ":tpu_py",
+        ":training_loop",
+        "//tensorflow/contrib/learn",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:util",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = [
+        "cross_replica_ops",
+        "infeed_ops",
+        "outfeed_ops",
+        "replication_ops",
+        "tpu_configuration_ops",
+        "tpu_sendrecv_ops",
+    ],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_custom_op_library(
+    name = "python/ops/_tpu_ops.so",
+    srcs = [
+        "ops/cross_replica_ops.cc",
+        "ops/infeed_ops.cc",
+        "ops/outfeed_ops.cc",
+        "ops/replication_ops.cc",
+        "ops/tpu_configuration_ops.cc",
+        "ops/tpu_sendrecv_ops.cc",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "tpu_ops",
+    deps = [
+        ":cross_replica_ops_op_lib",
+        ":infeed_ops_op_lib",
+        ":outfeed_ops_op_lib",
+        ":replication_ops_op_lib",
+        ":tpu_configuration_ops_op_lib",
+        ":tpu_sendrecv_ops_op_lib",
+    ],
+)
+
+py_library(
+    name = "profiler",
+    srcs = ["python/profiler/__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/tpu/profiler:trace_events_proto_py",
+    ],
+)
+
+tf_custom_op_py_library(
+    name = "tpu_py",
+    srcs = glob(["python/ops/*.py"]) + ["__init__.py"],
+    dso = [":python/ops/_tpu_ops.so"],
+    kernels = [
+        ":all_ops",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":profiler",
+        ":tpu_ops",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
+py_library(
+    name = "tpu_helper_library",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tpu",
+        ":tpu_feed",
+        ":tpu_function",
+        ":tpu_py",
+        ":tpu_sharding",
+        ":training_loop",
+    ],
+)
+
+py_library(
+    name = "tpu_function",
+    srcs = ["python/tpu/tpu_function.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tpu_feed",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "tpu",
+    srcs = [
+        "python/tpu/__init__.py",
+        "python/tpu/tpu.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":profiler",
+        ":tpu_function",
+        ":tpu_py",
+        ":training_loop",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:variable_scope",
+    ],
+)
+
+py_library(
+    name = "tpu_sharding",
+    srcs = ["python/tpu/tpu_sharding.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
+py_library(
+    name = "tpu_feed",
+    srcs = ["python/tpu/tpu_feed.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tpu_py",
+        ":tpu_sharding",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
+py_library(
+    name = "training_loop",
+    srcs = [
+        "python/tpu/tpu_optimizer.py",
+        "python/tpu/training_loop.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tpu_function",
+        ":tpu_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+    ],
+)
+
+tf_py_test(
+    name = "tpu_sharding_test",
+    size = "small",
+    srcs = ["python/tpu/tpu_sharding_test.py"],
+    additional_deps = [
+        ":tpu_sharding",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+    ],
+)
+
+tf_py_test(
+    name = "tpu_infeed_test",
+    size = "small",
+    srcs = ["python/tpu/tpu_infeed_test.py"],
+    additional_deps = [
+        ":tpu_feed",
+        ":tpu_sharding",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+tf_py_test(
+    name = "tpu_function_test",
+    size = "small",
+    srcs = ["python/tpu/tpu_function_test.py"],
+    additional_deps = [
+        ":tpu_function",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+)
diff --git a/tensorflow/contrib/tpu/README.md b/tensorflow/contrib/tpu/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..69fb11e344f07337955b4ab57960cb853d156a2d
--- /dev/null
+++ b/tensorflow/contrib/tpu/README.md
@@ -0,0 +1,44 @@
+# TPU support for TensorFlow #
+
+This directory contains code required to re-target a TensorFlow model to run
+on TPUs.
+
+## Example usage - TPU Estimator
+
+Below shows example usage of the TPU Estimator for a simple convolutional
+network.
+
+```python
+import tensorflow as tf
+
+from tensorflow.contrib.tpu.python.tpu import tpu_config
+from tensorflow.contrib.tpu.python.tpu import tpu_estimator
+from tensorflow.contrib.tpu.python.tpu import tpu_optimizer
+
+def model_fn(features, labels, mode, params):
+  # Define the model to construct the logits
+  logits = # ...
+  loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)
+  optimizer = tpu_optimizer.CrossShardOptimizer(
+    tf.train.GradientDescentOptimizer(learning_rate=FLAGS.learning_rate))
+  train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
+  return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
+
+def input_fn(params):
+  # ...
+  pass
+
+def main():
+  run_config = tpu_config.RunConfig(
+    master=FLAGS.master,
+    # ...
+  )
+  estimator = tpu_estimator.TpuEstimator(
+    model_fn=model_fn,
+    use_tpu=FLAGS.use_tpu,
+    config=run_config,
+    batch_size=FLAGS.batch_size)
+  estimator.train(input_fn=input_fn, max_steps=FLAGS.train_steps)
+```
+
+For the complete [executable] example, see our open source TPU models.
diff --git a/tensorflow/contrib/tpu/__init__.py b/tensorflow/contrib/tpu/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1abd55b56dc767685ae8b01d6900a1e83efddd73
--- /dev/null
+++ b/tensorflow/contrib/tpu/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Ops related to Tensor Processing Units."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.contrib.tpu.python import profiler
+from tensorflow.contrib.tpu.python.ops.tpu_ops import *
+from tensorflow.contrib.tpu.python.tpu import *
+# pylint: enable=wildcard-import,unused-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = ['profiler']
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/tpu/ops/cross_replica_ops.cc b/tensorflow/contrib/tpu/ops/cross_replica_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cbbd19800eb2e336fc343671fb82bb3ed631c129
--- /dev/null
+++ b/tensorflow/contrib/tpu/ops/cross_replica_ops.cc
@@ -0,0 +1,37 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("CrossReplicaSum")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: {float}")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+An Op to sum inputs across replicated TPU instances. Each
+instance supplies its own input, and the output of each is the sum of
+all the inputs.
+
+input: The local input to the sum.
+output: The sum of all the distributed inputs.
+T: The type of elements to be summed.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/infeed_ops.cc b/tensorflow/contrib/tpu/ops/infeed_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c12e83137aa8f4f74d63f3dac1bb490cd0977648
--- /dev/null
+++ b/tensorflow/contrib/tpu/ops/infeed_ops.cc
@@ -0,0 +1,103 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+REGISTER_OP("InfeedDequeue")
+    .Output("output: dtype")
+    .Attr("dtype: type")
+    .Attr("shape: shape")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      PartialTensorShape shape;
+      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+A placeholder op for a value that will be fed into the computation.
+
+output: A tensor that will be provided using the infeed mechanism.
+dtype: The type of elements in the tensor.
+shape: The shape of the tensor.
+)doc");
+
+REGISTER_OP("InfeedEnqueue")
+    .Input("input: dtype")
+    .Attr("dtype: type")
+    .Attr("shape: shape = {}")
+    .Attr("device_ordinal: int = -1")
+    .SetIsStateful()
+    .Doc(R"doc(
+An op which feeds a single Tensor value into the computation.
+
+input: A tensor that will be provided using the infeed mechanism.
+dtype: The type of elements in the tensor.
+shape: The shape of the tensor.
+device_ordinal: The TPU device to use. This should be -1 when the Op
+is running on a TPU device, and >= 0 when the Op is running on the CPU
+device.
+)doc");
+
+REGISTER_OP("InfeedEnqueueTuple")
+    .Input("inputs: dtypes")
+    .Attr("dtypes: list(type)")
+    .Attr("shapes: list(shape)")
+    .Attr("device_ordinal: int = -1")
+    .SetIsStateful()
+    .Doc(R"doc(
+An op which feeds multiple Tensor values into the computation as an XLA tuple.
+
+inputs: A list of tensors that will be provided using the infeed mechanism.
+dtypes: The element types of each element in `inputs`.
+shapes: The shapes of each tensor in `inputs`.
+device_ordinal: The TPU device to use. This should be -1 when the Op
+is running on a TPU device, and >= 0 when the Op is running on the CPU
+device.
+)doc");
+
+REGISTER_OP("InfeedDequeueTuple")
+    .Output("outputs: dtypes")
+    .Attr("dtypes: list(type)")
+    .Attr("shapes: list(shape)")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      std::vector<PartialTensorShape> shapes;
+      TF_RETURN_IF_ERROR(c->GetAttr("shapes", &shapes));
+      for (int i = 0; i < shapes.size(); ++i) {
+        ShapeHandle out;
+        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shapes[i], &out));
+        c->set_output(i, out);
+      }
+      return Status::OK();
+    })
+    .Doc(R"doc(
+A placeholder op for multiple values that will be fed into the computation
+simultaneously as an XLA tuple.
+
+outputs: A list of tensors that will be provided using the infeed mechanism.
+dtypes: The element types of each element in `outputs`.
+shapes: The shapes of each tensor in `outputs`.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/outfeed_ops.cc b/tensorflow/contrib/tpu/ops/outfeed_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..16c57a1c2b25e4e8c56cd80214759e4d0a66195d
--- /dev/null
+++ b/tensorflow/contrib/tpu/ops/outfeed_ops.cc
@@ -0,0 +1,106 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+REGISTER_OP("OutfeedEnqueue")
+    .Input("input: dtype")
+    .Attr("dtype: type")
+    .SetIsStateful()
+    .Doc(R"doc(
+An op which emits a single Tensor value from an XLA computation.
+
+input: A tensor that will be inserted into the outfeed queue.
+)doc");
+
+REGISTER_OP("OutfeedEnqueueTuple")
+    .Input("inputs: dtypes")
+    .Attr("dtypes: list(type)")
+    .SetIsStateful()
+    .Doc(R"doc(
+An op which emits multiple Tensor values from an XLA computation.
+
+inputs: A list of tensors that will be inserted into the outfeed queue as an 
+XLA tuple.
+)doc");
+
+REGISTER_OP("OutfeedDequeue")
+    .Output("output: dtype")
+    .Attr("dtype: type")
+    .Attr("shape: shape")
+    .Attr("device_ordinal: int = -1")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      PartialTensorShape shape;
+      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Retrieves a single tensor from the computation outfeed.  This operation will
+block indefinitely until data is available.
+
+output: A tensor that will be read from the device outfeed.
+dtype: The type of elements in the tensor.
+shape: The shape of the tensor.
+device_ordinal: The TPU device to use. This should be -1 when the Op
+is running on a TPU device, and >= 0 when the Op is running on the CPU
+device.
+)doc");
+
+REGISTER_OP("OutfeedDequeueTuple")
+    .Output("outputs: dtypes")
+    .Attr("dtypes: list(type)")
+    .Attr("shapes: list(shape)")
+    .Attr("device_ordinal: int = -1")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      std::vector<PartialTensorShape> shapes;
+      std::vector<DataType> dtypes;
+      TF_RETURN_IF_ERROR(c->GetAttr("shapes", &shapes));
+      TF_RETURN_IF_ERROR(c->GetAttr("dtypes", &dtypes));
+      if (shapes.size() != dtypes.size()) {
+        return errors::InvalidArgument(
+            "Incorrect number of output shapes specified");
+      }
+      for (int i = 0; i < shapes.size(); ++i) {
+        ShapeHandle out;
+        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shapes[i], &out));
+        c->set_output(i, out);
+      }
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Retrieve multiple values that will be emitted by the computation as an XLA
+tuple.  This operations will block indefinitely until data is available.
+Output `i` corresponds to XLA tuple element `i`.
+
+outputs: A list of tensors that will be read from the outfeed.
+dtypes: The element types of each element in `outputs`.
+shapes: The shapes of each tensor in `outputs`.
+device_ordinal: The TPU device to use. This should be -1 when the Op
+is running on a TPU device, and >= 0 when the Op is running on the CPU
+device.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/replication_ops.cc b/tensorflow/contrib/tpu/ops/replication_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..282a00b52c6cba12da0648a8814ca82969d78819
--- /dev/null
+++ b/tensorflow/contrib/tpu/ops/replication_ops.cc
@@ -0,0 +1,87 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+REGISTER_OP("TPUReplicatedInput")
+    .Input("inputs: N * T")
+    .Output("output: T")
+    .Attr("N: int >= 1")
+    .Attr("T: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle cur = c->input(c->num_inputs() - 1);
+      for (int i = c->num_inputs() - 2; i >= 0; --i) {
+        TF_RETURN_WITH_CONTEXT_IF_ERROR(c->Merge(c->input(i), cur, &cur),
+                                        "From merging shape ", i,
+                                        " with other shapes.");
+      }
+      c->set_output(0, cur);
+      return Status::OK();
+    })
+    .Doc(
+        "Operator that connects N unreplicated inputs to an N-way "
+        "replicated TPU computation.");
+
+REGISTER_OP("TPUReplicatedOutput")
+    .Input("input: T")
+    .Output("outputs: num_replicas * T")
+    .Attr("num_replicas: int >= 1")
+    .Attr("T: type")
+    .SetShapeFn([](InferenceContext* c) {
+      for (int i = 0; i < c->num_outputs(); ++i) {
+        c->set_output(i, c->input(0));
+      }
+      return Status::OK();
+    })
+    .Doc(
+        "Operator that connects the output of an N-way replicated TPU "
+        "computation to N separate outputs.");
+
+REGISTER_OP("TPUReplicate")
+    .Attr("computation: func")
+    .Attr("num_replicas: int >= 1")
+    .Attr("global_tpu_id: list(int) = []")
+    .Attr("Tinputs: list(type) >= 0")
+    .Attr("Tbroadcast_inputs: list(type) >= 0")
+    .Attr("NumVariables: int >= 0")
+    .Attr("output_types: list(type) >= 0")
+    .Input("inputs: Tinputs")
+    .Input("broadcast_inputs: Tbroadcast_inputs")
+    .Input("variables: NumVariables * resource")
+    .Output("outputs: output_types")
+    .Doc(R"doc(
+Runs replicated computations on a distributed TPU system.
+
+computation: a function containing the computation to run.
+num_replicas: the number of replicas of the computation to run.
+global_tpu_id: map from device to global tpu id.
+Tinputs: the types of the arguments to 'computation'.
+inputs: the inputs to 'computation', flattened, in replica-major order.
+Tbroadcast_inputs: the types of the additional arguments to broadcast to all
+  replicas.
+broadcast_inputs: additional arguments to broadcast to all replicas. The
+  broadcast inputs are appended to the per-replica inputs when calling
+  computation.
+output_types: the types of the outputs of 'computation'.
+outputs: the outputs of 'computation'.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc b/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5dc564ed27a19db6509a1b7c12ecc8dfe924dca1
--- /dev/null
+++ b/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
@@ -0,0 +1,213 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+// Configuring a distributed TPU system is achieved by running
+// the following Ops:
+//
+// 1 Run _DisconnectHostFromDistributedTPUSystem on the CPU of each
+// host. This is needed in case the system had previously been
+// configured. It returns, for each host, the number of TPU chips on
+// the host.
+//
+// 2 Run _ConfigureDistributedTPU on TPU_SYSTEM. Takes as input the
+// number of chips on each host. Validates that all hosts have the
+// same number of chips, and that the chips are consistent with the
+// topology set by flags. Has a single output which is a proto
+// describing the requested system configuration, which is sent to all
+// hosts.
+//
+// 3 Run _InitializeHostForDistributedTPU on the CPU of each host,
+// taking as input the output from ConfigureDistributedTPU. Has a
+// single Tensor output which is a vector of int32 indicating, for
+// each TPU on the host, what its global TPU system id is.
+//
+// 4 Run _WaitForDistributedTPU on TPU_SYSTEM, taking as input the
+// outputs from all the _InitializeHostForDistributedTPU
+// Ops. _WaitForDistributedTPU has an attr host_specs which is a
+// vector<string> giving the partial device spec for each host. These
+// partial specs are combined in the Op with the outputs from the host
+// initialization Ops to construct a mapping from full TPU device
+// specs to global TPU ids. Has a single Tensor output which is a
+// matrix of int32 indicating, for each host (outer dimension) and for
+// each TPU on the host (inner dimension) what that TPU's global id
+// is. _WaitForDistributedTPU also waits for the TPU distributed
+// system to initialize fully, which may take several minutes for a
+// large system.
+//
+// 5 Run _SetGlobalTPUArray on the CPU of each host, taking as input
+// the output from _WaitForDistributedTPU. This Op tells each host the
+// global Id of every TPU on every host.
+//
+// Most user code works by placing the ConfigureDistributedTPU Op on
+// the desired TPU_SYSTEM device, and a graph rewrite replaces it by
+// the subgraph described above.
+//
+//
+// A distributed TPU system can be cleanly shut down by running
+// the following Ops:
+//
+// 1 Run _DisconnectHostFromDistributedTPUSystem on the CPU of each
+// host.
+//
+// 2 Run _ShutdownDistributedTPU on the TPU_SYSTEM where
+// _ConfigureDistributedTPU was run. The Op will return an error if no
+// system is configured.
+//
+//
+// Most user code works by placing the ShutdownDistributedTPU Op on
+// the desired TPU_SYSTEM device, and a graph rewrite replaces it by
+// the subgraph described above.
+
+REGISTER_OP("_ConfigureDistributedTPU")
+    .Input("inputs: N * int32")
+    .Output("output: string")
+    .Attr("N: int >= 1")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input;
+      // Validate that all the inputs are scalars.
+      for (int i = 0; i < c->num_inputs(); ++i) {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 0, &input));
+      }
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+An op that sets up the centralized structures for a distributed TPU
+system.
+
+inputs: A scalar tensor for each host indicating how many TPU chips
+there are on the host.
+output: A tensor containing a TPUHostConfiguration proto serialized to
+a string, containing the information necessary to initialize the chips
+in a host.
+)doc");
+
+REGISTER_OP("_WaitForDistributedTPU")
+    .Input("inputs: N * int32")
+    .Output("global_tpu_array: int32")
+    .Attr("host_specs: list(string)")
+    .Attr("startup_timeout_sec: int = 20")
+    .Attr("N: int")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input;
+      // Validate that all the inputs have the same vector shape.
+      for (int i = 0; i < c->num_inputs(); ++i) {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &input));
+      }
+      c->set_output(0, c->UnknownShapeOfRank(2));
+      return ::tensorflow::Status::OK();
+    })
+    .Doc(R"doc(
+An op that blocks execution until a distributed TPU system has
+started up. This Op must be run on the same TPU_SYSTEM device as
+_ConfigureDistributedTPU, and takes an inputs the outputs from the
+_InitializeHostForDistributedTPU Ops.
+
+inputs: For each initialized host, a vector giving the global TPU id
+of each TPU on the host.
+global_tpu_array: A two-dimensional array. For each host (the outer
+dimension) the array lists the global ids of the TPUs on that host.
+host_specs: For each initialized host, the partial device specification
+indicating job, replica, and task. Combining this spec with
+'/device:TPU:k' gives the full device name of the k'th TPU on the
+host.
+startup_timeout_sec: The number of seconds to wait for the TPU system
+to stabilize.
+)doc");
+
+REGISTER_OP("_SetGlobalTPUArray")
+    .Input("global_tpu_array: int32")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
+      return ::tensorflow::Status::OK();
+    })
+    .Doc(R"doc(
+An op that informs a host of the global ids of all the of TPUs in the
+system.
+
+global_tpu_array: A two-dimensional array. For each host (the outer
+dimension) the array lists the global ids of the TPUs on that host.
+)doc");
+
+REGISTER_OP("_ShutdownDistributedTPU").SetIsStateful().Doc(R"doc(
+An op that shuts down a running distributed TPU system. The Op returns
+an error if no system is running. This Op must be run on the same
+TPU_SYSTEM device as the corresponding _ConfigureDistributedTPU was run
+to start the system, and must be run only after
+_DisconnectHostFromDistributedTPUSystem has completed on every host in
+the system.
+)doc");
+
+REGISTER_OP("_InitializeHostForDistributedTPU")
+    .Input("input: string")
+    .Output("tpu_ids: int32")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &input));
+      c->set_output(0, c->Vector(c->UnknownDim()));
+      return ::tensorflow::Status::OK();
+    })
+    .Doc(R"doc(
+An op that connects each chip on the host to a centralized UberDriver to allow
+them to operate as a distributed system with chips in other hosts.
+
+input: A string containing the address of the UberDriver to connect to.
+tpu_ids: A vector containing the global TPU id of each TPU on the host.
+)doc");
+
+REGISTER_OP("_DisconnectHostFromDistributedTPUSystem")
+    .Output("number_of_tpu_chips: int32")
+    .SetIsStateful()
+    .Doc(R"doc(
+An op that disconnects the TPUs on a host from a running distributed
+TPU system.
+
+number_of_tpu_chips: A scalar tensor containing the number of TPU
+chips on the host.
+)doc");
+
+REGISTER_OP("ConfigureDistributedTPU")
+    .Output("global_tpu_array: int32")
+    .Attr("embedding_config: string = ''")
+    .SetIsStateful()
+    .Doc(R"doc(
+An op that sets up the centralized structures for a distributed TPU
+system.
+
+global_tpu_array: A two-dimensional array. For each host (the outer
+dimension) the array lists the global ids of the TPUs on that host.
+embedding_config: Internal use.
+)doc");
+
+REGISTER_OP("ShutdownDistributedTPU").SetIsStateful().Doc(R"doc(
+An op that shuts down a running distributed TPU system. The Op returns
+an error if no system is running.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/tpu_sendrecv_ops.cc b/tensorflow/contrib/tpu/ops/tpu_sendrecv_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6d7c11a315adbe106b87507b626f5d7478029360
--- /dev/null
+++ b/tensorflow/contrib/tpu/ops/tpu_sendrecv_ops.cc
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+
+namespace tensorflow {
+
+REGISTER_OP("_TPUSend")
+    .Input("tensor: T")
+    .Attr("T: type")
+    .Attr("tensor_name: string")
+    .SetIsStateful()
+    .Doc(R"doc(
+Sends the named tensor over the TPU fabric.
+
+tensor: The tensor to send.
+tensor_name: The name of the tensor to send.
+)doc");
+
+REGISTER_OP("_TPURecv")
+    .Output("tensor: T")
+    .Attr("T: type")
+    .Attr("tensor_name: string")
+    .Attr("shape: shape")
+    .SetIsStateful()
+    .Doc(R"doc(
+Receives the named tensor over the TPU fabric.
+
+tensor: The tensor to receive.
+tensor_name: The name of the tensor to receive.
+shape: The shape of the input tensor.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b806a94b1bbb5f3bedb177d000501d8a470d42e9
--- /dev/null
+++ b/tensorflow/contrib/tpu/profiler/BUILD
@@ -0,0 +1,36 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library_cc")
+
+tf_proto_library_cc(
+    name = "tpu_profiler_proto",
+    srcs = ["tpu_profiler.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    cc_grpc_version = 1,
+    protodeps = [
+        "//tensorflow/core:protos_all",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_binary(
+    name = "capture_tpu_profile",
+    srcs = ["capture_tpu_profile.cc"],
+    visibility = ["//tensorflow/contrib/tpu/profiler:__subpackages__"],
+    deps = [
+        ":tpu_profiler_proto_cc",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
+
+tf_proto_library(
+    name = "trace_events_proto",
+    srcs = ["trace_events.proto"],
+    cc_api_version = 2,
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a510cb5346d08f604be5a504065f910568284ba3
--- /dev/null
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -0,0 +1,129 @@
+/* Copyright 2017 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Usage: capture_tpu_profile --service_addr="localhost:8466" --logdir=/tmp/log
+//
+// Initiates a TPU profiling on the TPUProfiler service at service_addr,
+// receives and dumps the profile data to a tensorboard log directory.
+
+#include "grpc++/grpc++.h"
+
+#include <cstdio>
+#include <ctime>
+#include <vector>
+
+#include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+namespace tpu {
+namespace {
+
+using ::tensorflow::TPUProfiler;
+
+using ::grpc::ClientContext;
+using ::tensorflow::io::JoinPath;
+using ::tensorflow::Env;
+using ::tensorflow::WriteStringToFile;
+
+constexpr char kProfilePluginDirectory[] = "plugins/profile/";
+constexpr char kTraceFileName[] = "trace";
+
+tensorflow::string GetCurrentTimeStampAsString() {
+  char s[128];
+  std::time_t t = std::time(nullptr);
+  CHECK_NE(std::strftime(s, sizeof(s), "%F_%T", std::localtime(&t)), 0);
+  return s;
+}
+
+// The trace will be stored in <logdir>/plugins/profile/<timestamp>/trace.
+void DumpTraceToLogDirectory(const tensorflow::string& logdir,
+                             tensorflow::StringPiece trace) {
+  tensorflow::string run = GetCurrentTimeStampAsString();
+  tensorflow::string run_dir = JoinPath(logdir, kProfilePluginDirectory, run);
+  TF_CHECK_OK(Env::Default()->RecursivelyCreateDir(run_dir));
+  tensorflow::string path = JoinPath(run_dir, kTraceFileName);
+  TF_CHECK_OK(WriteStringToFile(tensorflow::Env::Default(), path, trace));
+  LOG(INFO) << "Dumped trace data to " << path;
+}
+
+ProfileResponse Profile(const tensorflow::string& service_addr,
+                        int duration_ms) {
+  ProfileRequest request;
+  request.set_duration_ms(duration_ms);
+  ProfileResponse response;
+  ClientContext context;
+  ::grpc::ChannelArguments channel_args;
+  // TODO(ioeric): use `SetMaxReceiveMessageSize` instead once it's available.
+  channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH,
+                      std::numeric_limits<int32>::max());
+  std::unique_ptr<TPUProfiler::Stub> stub =
+      TPUProfiler::NewStub(::grpc::CreateCustomChannel(
+          service_addr, ::grpc::InsecureChannelCredentials(), channel_args));
+  TF_QCHECK_OK(FromGrpcStatus(stub->Profile(&context, request, &response)));
+  return response;
+}
+
+}  // namespace
+}  // namespace tpu
+}  // namespace tensorflow
+
+int main(int argc, char** argv) {
+  tensorflow::string FLAGS_service_addr;
+  tensorflow::string FLAGS_logdir;
+  int FLAGS_duration_ms = 2000;
+  std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("service_addr", &FLAGS_service_addr,
+                       "Address of TPU profiler service e.g. localhost:8466"),
+      tensorflow::Flag("logdir", &FLAGS_logdir,
+                       "Path of TensorBoard log directory e.g. /tmp/tb_log"),
+      tensorflow::Flag("duration_ms", &FLAGS_duration_ms,
+                       "Duration of tracing in ms. Default is 2000ms."),
+  };
+
+  tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_ok || FLAGS_service_addr.empty() || FLAGS_logdir.empty()) {
+    std::printf("%s", usage.c_str());
+    return 2;
+  }
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+
+  int duration_ms = FLAGS_duration_ms;
+  tensorflow::ProfileResponse response =
+      tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms);
+  // Ignore computation_graph for now.
+  if (response.encoded_trace().empty()) {
+    LOG(WARNING) << "No trace event is collected during the " << duration_ms
+                 << "ms interval.";
+  } else {
+    tensorflow::tpu::DumpTraceToLogDirectory(FLAGS_logdir,
+                                             response.encoded_trace());
+  }
+  // Print this at the end so that it's not buried in irrelevant LOG messages.
+  std::cout
+      << "NOTE: using the trace duration " << duration_ms << "ms." << std::endl
+      << "Set an appropriate duration (with --duration_ms) if you "
+         "don't see a full step in your trace or the captured trace is too "
+         "large."
+      << std::endl;
+}
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
new file mode 100644
index 0000000000000000000000000000000000000000..0b24da7f1e111ba897a10045859b89c830d34d1e
--- /dev/null
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
@@ -0,0 +1,33 @@
+syntax = "proto3";
+package tensorflow;
+
+import "tensorflow/core/framework/graph.proto";
+
+// The TPUProfiler service retrieves performance information about
+// the programs running on connected TPUs over a period of time.
+service TPUProfiler {
+  // Starts a profiling session, blocks until it completes, and returns data.
+  rpc Profile(ProfileRequest) returns (ProfileResponse) {
+  }
+}
+
+message ProfileRequest {
+  // In future, the caller will be able to customize when profiling starts and
+  // stops. For now, it collects `duration_ms` milliseconds worth of data.
+  uint64 duration_ms = 1;
+
+  // In future, the caller will indicate which TF session is being profiled, and
+  // only data relating to that program will be returned. For now, we assume
+  // all activity during the profiling period is relevant.
+}
+
+message ProfileResponse {
+  uint64 xprof_response_size = 1;  // Placeholder: return something meaningful.
+  // Graphs of programs executed on TPUs during the profiling period.
+  repeated GraphDef computation_graph = 2;
+
+  // Encoded Trace proto message that contains metadata about the trace captured
+  // during the profiling period. Describes the devices and resources that
+  // 'trace_events' refers to.
+  bytes encoded_trace = 3;
+}
diff --git a/tensorflow/contrib/tpu/profiler/trace_events.proto b/tensorflow/contrib/tpu/profiler/trace_events.proto
new file mode 100644
index 0000000000000000000000000000000000000000..0ab553ca96680f7207b419100cf5ca850e80c4f3
--- /dev/null
+++ b/tensorflow/contrib/tpu/profiler/trace_events.proto
@@ -0,0 +1,59 @@
+syntax = "proto3";
+
+package tensorflow.tpu;
+
+// A 'Trace' contains metadata for the individual traces of a system.
+message Trace {
+  // The devices that this trace has information about. Maps from device_id to
+  // more data about the specific device.
+  map<uint64, Device> devices = 1;
+
+  // All trace events capturing in the profiling period.
+  repeated TraceEvent trace_events = 4;
+}
+
+// A 'device' is a physical entity in the system and is comprised of several
+// resources.
+message Device {
+  // The name of the device.
+  string name = 1;
+
+  // The id of this device, unique in a single trace.
+  uint64 device_id = 2;
+
+  // The resources on this device, keyed by resource_id;
+  map<uint64, Resource> resources = 3;
+}
+
+// A 'resource' generally is a specific computation component on a device. These
+// can range from threads on CPUs to specific arithmetic units on hardware
+// devices.
+message Resource {
+  // The name of the resource.
+  string name = 1;
+
+  // The id of the resource. Unique within a device.
+  uint64 resource_id = 2;
+}
+
+message TraceEvent {
+  // The id of the device that this event occurred on. The full dataset should
+  // have this device present in the Trace object.
+  uint64 device_id = 1;
+
+  // The id of the resource that this event occurred on. The full dataset should
+  // have this resource present in the Device object of the Trace object. A
+  // resource_id is unique on a specific device, but not necessarily within the
+  // trace.
+  uint64 resource_id = 2;
+
+  // The name of this trace event.
+  string name = 3;
+
+  // The timestamp that this event occurred at (in picos since tracing started).
+  uint64 timestamp_ps = 9;
+
+  // The duration of the event in picoseconds if applicable.
+  // Events without duration are called instant events.
+  uint64 duration_ps = 10;
+}
diff --git a/tensorflow/contrib/tpu/python/ops/tpu_ops.py b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d3344fac36be24a692f141eee140312d988a932
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
@@ -0,0 +1,38 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Operations for TPUs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import platform
+
+
+if platform.system() != "Windows":
+  # pylint: disable=wildcard-import,unused-import,g-import-not-at-top
+  from tensorflow.contrib.tpu.ops.gen_tpu_ops import *
+
+  from tensorflow.contrib.util import loader
+  from tensorflow.python.platform import resource_loader
+  # pylint: enable=wildcard-import,unused-import,g-import-not-at-top
+
+  _tpu_ops = loader.load_op_library(
+      resource_loader.get_path_to_datafile("_tpu_ops.so"))
+else:
+  # We have already built the appropriate libraries into the binary via CMake
+  # if we have built contrib, so we don't need this
+  pass
diff --git a/tensorflow/contrib/tpu/python/profiler/__init__.py b/tensorflow/contrib/tpu/python/profiler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bde13f0527a1d8c5f71dd9684b93144ae07d60e4
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/profiler/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Classes for TPU trace events."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.contrib.tpu.profiler.trace_events_pb2 import *
+# pylint: enable=wildcard-import,unused-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = ['Trace', 'Resource', 'Device', 'TraceEvent']
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/tensorboard/defs/defs.bzl b/tensorflow/contrib/tpu/python/tpu/__init__.py
similarity index 62%
rename from tensorflow/tensorboard/defs/defs.bzl
rename to tensorflow/contrib/tpu/python/tpu/__init__.py
index 94e2d7c540f0892cc8531e05dc844f78df1a016a..0dffd7064b19f353aed6afa3ad383564643a4a90 100644
--- a/tensorflow/tensorboard/defs/defs.bzl
+++ b/tensorflow/contrib/tpu/python/tpu/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,14 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# =============================================================================
 
-def tensorboard_webcomponent_library(**kwargs):
-  """Rules referencing this will be deleted from the codebase soon."""
-  pass
+"""Ops related to Tensor Processing Units."""
 
-def _legacy_js_impl(target, ctx):
-  return struct()
-
-legacy_js = aspect(
-    implementation=_legacy_js_impl,
-    attr_aspects=["exports"])
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..a65a86514bd99e91b904132100c0be4bf71e81c2
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -0,0 +1,586 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+
+"""Library of TPU helper functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.contrib.tpu.python.tpu import tpu_function
+
+from tensorflow.core.framework import attr_value_pb2
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variable_scope
+
+
+def initialize_system(embedding_config=None, job=None):
+  """Initializes a distributed TPU system for use with TensorFlow.
+
+  Args:
+    embedding_config: If not None, an EmbeddingLayerConfiguration proto
+      describing the desired configuration of the hardware embedding lookup
+      tables. If embedding_config is None, no hardware embeddings can be used.
+    job: The job (the XXX in TensorFlow device specification /job:XXX)
+      that contains the TPU devices that will be initialized. If job=None
+      it is assumed there is only one job in the TensorFlow flock, and an
+      error will be returned if this assumption does not hold.
+  Returns:
+    Op which, when executed, will initialize the system.
+  """
+  if job is None:
+    device_name = "/replica:0/task:0/device:TPU_SYSTEM:0"
+  else:
+    device_name = "/job:%s/replica:0/task:0/device:TPU_SYSTEM:0" % job
+  config_string = ("" if embedding_config is None else
+                   embedding_config.SerializeToString())
+  with ops.device(device_name):
+    init_distributed_tpu = tpu_ops.configure_distributed_tpu(
+        embedding_config=config_string)
+  return init_distributed_tpu
+
+
+def shutdown_system(job=None):
+  """Shuts down a running a distributed TPU system."""
+  if job is None:
+    device_name = "/replica:0/task:0/device:TPU_SYSTEM:0"
+  else:
+    device_name = "/job:%s/replica:0/task:0/device:TPU_SYSTEM:0" % job
+  with ops.device(device_name):
+    shutdown_distributed_tpu = tpu_ops.shutdown_distributed_tpu()
+  return shutdown_distributed_tpu
+
+
+def core(num):
+  """Returns the device name for a core in a replicated TPU computation.
+
+  Args:
+    num: the virtual core number within each replica to which operators should
+    be assigned.
+  Returns:
+    A device name, suitable for passing to tf.device().
+  """
+  return "device:TPU_REPLICATED_CORE:{}".format(num)
+
+
+# Experimental API to 'break out' of a tpu.rewrite() (or shard(), etc.) context.
+# In
+#
+# XXX
+# with tpu.rewrite(...):
+#   YYY
+#   with tpu.outside_all_rewrites():
+#     ZZZ
+#
+# the Ops in ZZZ are added outside the scope of the rewrite().
+# TODO(phawkins): currently outside_all_rewrites() pops out of all nested
+# control flow scopes, for example loops. It would make more sense if it only
+# popped out of a single scope.
+@contextlib.contextmanager
+def outside_all_rewrites():
+  """Experimental API to 'break out' of a tpu.rewrite() (or shard(), etc.)."""
+  with ops.control_dependencies(None):
+    yield
+
+
+class TPUReplicateContext(control_flow_ops.ControlFlowContext):
+  """A ControlFlowContext for nodes inside a TPU computation.
+
+  The primary role of TPUReplicateContext is to mark operators inside a
+  tpu.replicate() computation with attributes:
+  * _tpu_replicate=XYZ, where XYZ is a unique name, and
+  * _tpu_num_replicas=k, where k is the number of replicas.
+
+  We use a ControlFlowContext to perform the annotation since it
+  integrates with Tensorflow constructs like ResourceVariables. For example,
+  if a ResourceVariable is constructed inside a tpu.replicate() block, the
+  ResourceVariable implementation can use "with ops.control_dependencies(None)"
+  to build the variable's definition outside the replicated computation.
+  """
+
+  def __init__(self, name, num_replicas, global_tpu_id=None):
+    control_flow_ops.ControlFlowContext.__init__(self)
+    self._name = name
+    self._num_replicas = num_replicas
+    self._global_tpu_id = [] if global_tpu_id is None else global_tpu_id
+
+  def AddOp(self, op):
+    self._AddOpInternal(op)
+
+  def _AddOpInternal(self, op):
+    # pylint: disable=protected-access
+    if any(x.dtype._is_ref_dtype for x in op.inputs):
+      raise NotImplementedError(
+          "Non-resource Variables are not supported inside TPU computations "
+          "(operator name: %s)" % op.name)
+    # pylint: enable=protected-access
+    if "_tpu_replicate" in op.node_def.attr:
+      raise ValueError("TPU computations cannot be nested")
+    op.node_def.attr["_tpu_replicate"].s = self._name
+    op.node_def.attr["_tpu_num_replicas"].i = self._num_replicas
+    op.node_def.attr["_tpu_global_id"].list.i.extend(self._global_tpu_id)
+    op.graph.prevent_feeding(op)
+    op.graph.prevent_fetching(op)
+
+  def AddValue(self, val):
+    result = val
+    if self._outer_context:
+      result = self._outer_context.AddValue(val)
+    return result
+
+  def AddInnerOp(self, op):
+    self._AddOpInternal(op)
+    if self._outer_context:
+      self._outer_context.AddInnerOp(op)
+
+
+def replicate(computation,
+              inputs=None,
+              infeed_queue=None,
+              global_tpu_id=None,
+              name=None):
+  """Builds a graph operator that runs a replicated TPU computation.
+
+  Args:
+    computation: a Python function that builds the computation to replicate.
+    inputs: a list of lists of input tensors or None (equivalent to
+      [[]]), indexed by [replica_num][input_num]. All replicas must
+      have the same number of inputs.
+    infeed_queue: if not None, the InfeedQueue from which to append a tuple
+      of arguments as inputs to computation.
+    global_tpu_id: if not None, a Numpy 2D array indicating the global
+      id of each TPU device in the system. The outer dimension of the
+      array is host task id, and the inner dimension is device ordinal,
+      so e.g., global_tpu_id[x][y] indicates the global id of device
+      /task:x/device:TPU_NODE:y.
+    name: name of the operator.
+  Returns:
+    A list of lists of output tensors, indexed by [replica_num][output_num].
+  Raises:
+    ValueError: if all replicas do not have equal numbers of input tensors.
+    ValueError: if the number of inputs per replica does not match
+      the number of formal parameters to `computation`.
+  """
+  if name is None:
+    name = "TPUReplicate"
+  inputs = [[]] if inputs is None else inputs
+
+  if global_tpu_id is not None:
+    # Turn the Numpy array into a flattened list.
+    global_tpu_id = global_tpu_id.flatten().tolist()
+
+  if ((not isinstance(inputs, list)) or
+      any(not isinstance(inp, (list, tuple)) for inp in inputs)):
+    raise TypeError("tpu.replicate() inputs must be a list of lists/tuples")
+
+  num_replicas = len(inputs)
+
+  # No replicas? Nothing to do.
+  if num_replicas == 0:
+    return []
+
+  # Converts inputs to Tensors.
+  inputs = [[ops.convert_to_tensor(x) for x in inp] for inp in inputs]
+
+  # Verifies that all replicas have matching numbers and types of inputs
+  input_types = [x.dtype for x in inputs[0]]
+  input_arity = len(input_types)
+  for i in range(num_replicas):
+    if len(inputs[i]) != input_arity:
+      raise ValueError("Replicas must have the same number of inputs. "
+                       "Replica 0 had {} inputs, replica {} had {} "
+                       "inputs.".format(input_arity, i, len(inputs[i])))
+
+    types = [x.dtype for x in inputs[i]]
+    if types != input_types:
+      raise ValueError(
+          "Replicas must have matching input types. Replica 0 had "
+          "input types {}, replica {} had input types {}".format(
+              input_types, i, types))
+
+  arg_error = tpu_function.check_function_argument_count(
+      computation, input_arity, infeed_queue)
+  if arg_error is not None:
+    if infeed_queue is None:
+      raise TypeError(
+          "Supplied computation cannot be called with the specified inputs. "
+          "You specified %d inputs: %s, but the computation needs %s" % (
+              input_arity, str([i.name for i in inputs[0]]), arg_error))
+    else:
+      raise TypeError(
+          "Supplied computation cannot be called with the specified inputs. "
+          "You specified %d inputs: %s and %d additional inputs from infeed,"
+          " but the computation needs %s" % (input_arity, str(
+              [i.name
+               for i in inputs[0]]), infeed_queue.number_of_tuple_elements,
+                                             arg_error))
+
+  graph = ops.get_default_graph()
+
+  with ops.name_scope(name, "replicate"):
+    # Fan-in: Builds a TPUReplicatedInput node for each input.
+    computation_inputs = []
+    for i in range(0, input_arity):
+      replicas = [inputs[replica][i] for replica in xrange(num_replicas)]
+      computation_inputs.append(
+          tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
+
+    context = TPUReplicateContext(
+        name=graph.unique_name("cluster"),
+        num_replicas=num_replicas,
+        global_tpu_id=global_tpu_id)
+    try:
+      context.Enter()
+
+      with tpu_function.tpu_shard_context(num_replicas):
+
+        # The EncapsulateTPUComputations rewrite needs to identify the
+        # replicated arguments inside each computation. Adds identity operators
+        # tagged with an attribute _tpu_replicated_input to identify the
+        # replicated inputs.
+        # pylint: disable=protected-access
+        with graph._attr_scope({"_tpu_replicated_input":
+                                attr_value_pb2.AttrValue(b=True)}):
+          computation_inputs = [
+              array_ops.identity(x, name="replicated_input_{}".format(i))
+              for i, x in enumerate(computation_inputs)]
+        # pylint: enable=protected-access
+
+        # If there is an infeed queue, adds the dequeued values to the
+        # computation's inputs.
+        if infeed_queue is not None:
+          infeed_queue.set_number_of_shards(num_replicas)
+          for t in infeed_queue.generate_dequeue_op():
+            computation_inputs.append(t)
+
+        # Only resource variables work inside a TPU computation, so turn on
+        # resource variables for the computation.
+        # TODO(phawkins): consider removing this code. It will
+        # be less confusing to clients if they knowingly choose to use resource
+        # variables.
+        vscope = variable_scope.get_variable_scope()
+        saved_use_resource = vscope.use_resource
+        vscope.set_use_resource(True)
+
+        outputs = computation(*computation_inputs)
+
+        vscope.set_use_resource(saved_use_resource)
+
+      # If the computation only returned one value, makes it a tuple.
+      if not isinstance(outputs, (list, tuple)):
+        outputs = (outputs,)
+
+      try:
+        with ops.device(core(0)):
+          outputs = [
+              o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
+              for o in outputs
+          ]
+      except Exception as e:
+        raise ValueError(
+            "TPU function return values must all either be Operations or "
+            "convertible to Tensors. Got '%s'" % str(e))
+
+      # Separates the returned Operations and Tensors.
+      output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
+      output_tensors = [o for o in outputs
+                        if not isinstance(o, ops.Operation)]
+
+      if outputs != output_tensors + output_operations:
+        raise ValueError(
+            "TPU functions must return zero-or more Tensor values followed by "
+            "zero or more Operations.")
+      output_arity = len(output_tensors)
+
+      # Wraps outputs in Identity ops. Otherwise a replicated input copied
+      # straight to an output would bypass the replicate(). This would be bad
+      # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
+      # be rewritten away, leading to a runtime error.
+      # TODO(phawkins): extend the rewrite to elide these nodes instead.
+      with ops.device(core(0)):
+        output_tensors = [array_ops.identity(x) for x in output_tensors]
+    finally:
+      context.Exit()
+
+    # Fan-out: Builds a TPUReplicatedOutput node for each output.
+    outputs = [tpu_ops.tpu_replicated_output(output_tensors[i], num_replicas,
+                                             name="output{}".format(i))
+               for i in xrange(output_arity)]
+
+    with ops.control_dependencies(output_operations):
+      if output_arity == 0:
+        # Returns a list of NoOps dependent on the replication Op, indexed by
+        # [replica_num].
+        return [
+            control_flow_ops.no_op(name="%s_shard_%d" % (name, i))
+            for i in range(num_replicas)
+        ]
+      else:
+        # Wraps the outputs in identity operators so the names of any possible
+        # `fetch` nodes are preserved by the replication rewrite.
+        return [
+            [array_ops.identity(outputs[out][replica],
+                                name="output_%d_shard_%d" % (out, replica))
+             for out in xrange(output_arity)]
+            for replica in xrange(num_replicas)
+        ]
+
+
+def shard(computation,
+          inputs=None,
+          num_shards=1,
+          input_shard_axes=None,
+          outputs_from_all_shards=True,
+          output_shard_axes=None,
+          infeed_queue=None,
+          global_tpu_id=None,
+          name=None):
+  """Shards `computation` for parallel execution.
+
+  `inputs` must be a list of Tensors or None (equivalent to an empty
+  list), each of which has a corresponding split axis (from
+  `input_shard_axes`). Each input is split into `num_shards` pieces
+  along the corresponding axis, and computation is applied to each
+  shard in parallel.
+
+  Tensors are broadcast to all shards if they are lexically captured by
+  `computation`. e.g.,
+
+  x = tf.constant(7)
+  def computation():
+    return x + 3
+  ... = shard(computation, ...)
+
+  TODO(phawkins): consider adding support for broadcasting Tensors passed
+  as inputs.
+
+  If `outputs_from_all_shards` is true, the outputs from all shards of
+  `computation` are concatenated back together along their `output_shards_axes`.
+  Otherwise, each output is taken from an arbitrary shard.
+
+  Inputs and outputs of the computation must be at least rank-1 Tensors.
+
+  Args:
+    computation: a Python function that builds a computation to apply to each
+      shard of the input.
+    inputs: a list of input tensors or None (equivalent to an empty
+      list). Each input tensor has a corresponding shard axes, given
+      by `input_shard_axes`, which must have size divisible by
+      `num_shards`.
+    num_shards: the number of shards.
+    input_shard_axes: a list of dimensions along which to shard `inputs`, or
+      `None`. `None` means "shard all inputs along dimension 0". If not `None`,
+      there must be one dimension per input.
+    outputs_from_all_shards: boolean or list of boolean. For each output, if
+      `True`, outputs from all shards are concatenated along the corresponding
+      `output_shard_axes` entry. Otherwise, each output is taken
+      from an arbitrary shard. If the argument is a boolean, the argument's
+      value is used for each output.
+    output_shard_axes: a list of dimensions along which to concatenate the
+      outputs of `computation`, or `None`. `None` means "concatenate all outputs
+      along dimension 0". If not `None`, there must be one dimension per output.
+      Ignored if `outputs_from_all_shards` is False.
+    infeed_queue: if not None, the InfeedQueue to use to augment the inputs of
+      `computation`.
+    global_tpu_id: if not None, a Numpy 2D array indicating the global
+      id of each TPU device in the system. The outer dimension of the
+      array is host task id, and the inner dimension is device ordinal,
+      so e.g., global_tpu_id[x][y] indicates the global id of device
+      /task:x/device:TPU_NODE:y.
+    name: name of the operator.
+  Returns:
+    A list of output tensors.
+  Raises:
+    ValueError: if num_shards <= 0
+    ValueError: if len(input_shard_axes) != len(inputs)
+    ValueError: if len(output_shard_axes) != len(outputs from `computation`)
+  """
+
+  if num_shards <= 0:
+    raise ValueError("num_shards must be a positive integer.")
+
+  # Converts inputs to Tensors.
+  inputs = [] if inputs is None else [ops.convert_to_tensor(x) for x in inputs]
+
+  if input_shard_axes is None:
+    input_shard_axes = [0] * len(inputs)
+  if len(inputs) != len(input_shard_axes):
+    raise ValueError("Length of input_shard_axes must be equal to the number "
+                     "of inputs.")
+
+  if inputs:
+    # Splits the `inputs` along the corresponding `input_shard_axes`, giving
+    # lists with layout [input][shard]
+    split_inputs = [
+        array_ops.split(x, num_shards, axis=axis)
+        for (axis, x) in zip(input_shard_axes, inputs)]
+
+    # Transposes the input lists to have layout [shard][input]
+    transposed_inputs = [list(i) for i in zip(*split_inputs)]
+  else:
+    transposed_inputs = [[]] * num_shards
+
+  outputs = replicate(
+      computation,
+      transposed_inputs,
+      infeed_queue=infeed_queue,
+      global_tpu_id=global_tpu_id,
+      name=name)
+
+  # There must be at least one shard since num_shards > 0.
+  # TODO(b/36647078) remove disable when pylint bug is fixed.
+  # pylint: disable=indexing-exception
+  if isinstance(outputs[0], ops.Operation):
+    # pylint: enable=indexing-exception
+    # There were no outputs from the computation and replicate returned a list
+    # of NoOps with control dependencies on the computation. Return the first
+    # one so it can be used as a control dependency or fetch node.
+    # TODO(b/36647078) remove disable when pylint bug is fixed.
+    # pylint: disable=indexing-exception
+    return [outputs[0]]
+    # pylint: enable=indexing-exception
+
+  # TODO(b/36647078) remove disable when pylint bug is fixed.
+  # pylint: disable=indexing-exception
+  num_outputs = len(outputs[0])
+  # pylint: enable=indexing-exception
+
+  if output_shard_axes is None:
+    output_shard_axes = [0] * num_outputs
+  if num_outputs != len(output_shard_axes):
+    raise ValueError("Length of output_shard_axes must be equal to the number "
+                     "of outputs.")
+
+  if isinstance(outputs_from_all_shards, bool):
+    outputs_from_all_shards = [outputs_from_all_shards] * num_outputs
+
+  if num_outputs != len(outputs_from_all_shards):
+    raise ValueError("Length of outputs_from_all_shards must be equal to the "
+                     "number of outputs.")
+
+  results = []
+  for (axis, all_shards, x) in zip(output_shard_axes, outputs_from_all_shards,
+                                   zip(*outputs)):
+    if all_shards:
+      # Concatenate all of the outputs together (use stack for scalars).
+      shape = x[0].shape
+      is_scalar = shape is not None and (shape.ndims == 0)
+      results.append((array_ops.stack(list(x)) if is_scalar
+                      else array_ops.concat(list(x), axis=axis)))
+    else:
+      # TODO(phawkins): use a smarter policy, e.g., round-robin across shards.
+      results.append(x[0])
+
+  return results
+
+
+def batch_parallel(computation,
+                   inputs=None,
+                   num_shards=1,
+                   infeed_queue=None,
+                   global_tpu_id=None,
+                   name=None):
+  """Shards `computation` along the batch dimension for parallel execution.
+
+  Convenience wrapper around shard().
+
+  `inputs` must be a list of Tensors or None (equivalent to an empty
+  list). Each input is split into `num_shards` pieces along the 0-th
+  dimension, and computation is applied to each shard in parallel.
+
+  Tensors are broadcast to all shards if they are lexically captured by
+  `computation`. e.g.,
+
+  x = tf.constant(7)
+  def computation():
+    return x + 3
+  ... = shard(computation, ...)
+
+  The outputs from all shards are concatenated back together along their 0-th
+  dimension.
+
+  Inputs and outputs of the computation must be at least rank-1 Tensors.
+
+  Args:
+    computation: a Python function that builds a computation to apply to each
+      shard of the input.
+    inputs: a list of input tensors or None (equivalent to an empty
+      list). The 0-th dimension of each Tensor must have size
+      divisible by `num_shards`.
+    num_shards: the number of shards.
+    infeed_queue: if not None, the InfeedQueue from which to append a tuple
+      of arguments as inputs to `computation`.
+    global_tpu_id: if not None, a Numpy 2D array indicating the global
+      id of each TPU device in the system. The outer dimension of the
+      array is host task id, and the inner dimension is device ordinal,
+      so e.g., global_tpu_id[x][y] indicates the global id of device
+      /task:x/device:TPU_NODE:y.
+    name: name of the operator.
+  Returns:
+    A list of output tensors.
+  Raises:
+    ValueError: if num_shards <= 0
+  """
+  return shard(
+      computation,
+      inputs,
+      num_shards=num_shards,
+      infeed_queue=infeed_queue,
+      global_tpu_id=global_tpu_id,
+      name=name)
+
+
+def rewrite(computation,
+            inputs=None,
+            infeed_queue=None,
+            global_tpu_id=None,
+            name=None):
+  """Rewrites `computation` for execution on a TPU system.
+
+  Args:
+    computation: a Python function that builds a computation to apply
+      to the input. If the function takes n inputs, 'inputs' should be
+      a list of n tensors. If the function returns m outputs, rewrite
+      will return a list of m tensors.
+    inputs: a list of input tensors or None (equivalent to an empty list).
+    infeed_queue: if not None, the InfeedQueue from which to append a tuple
+      of arguments as inputs to `computation`.
+    global_tpu_id: if not None, a Numpy 2D array indicating the global
+      id of each TPU device in the system. The outer dimension of the
+      array is host task id, and the inner dimension is device ordinal,
+      so e.g., global_tpu_id[x][y] indicates the global id of device
+      /task:x/device:TPU_NODE:y.
+    name: name of the operator.
+  Returns:
+    A list of output tensors.
+  """
+  if inputs is not None and not isinstance(inputs, (list, tuple)):
+    raise TypeError("tpu.rewrite() inputs must be a list or tuple")
+
+  # TODO(b/36647078) remove disable when pylint bug is fixed.
+  # pylint: disable=indexing-exception
+  return replicate(
+      computation,
+      None if inputs is None else [inputs],
+      infeed_queue=infeed_queue,
+      global_tpu_id=global_tpu_id,
+      name=name)[0]
+  # pylint: enable=indexing-exception
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..524e89f563e042f836cc73b033c97179df0eb6d2
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -0,0 +1,47 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+
+"""A RunConfig subclass with TPU support."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.contrib.learn.python.learn.estimators import run_config as run_config_lib
+
+
+class TPUConfig(collections.namedtuple(
+    'TPUConfig', ['iterations_per_loop', 'num_shards'])):
+  """TPU related configuration required by `TPUEstimator`."""
+
+  def __new__(cls, iterations_per_loop=2, num_shards=2):
+    return super(TPUConfig, cls).__new__(
+        cls,
+        iterations_per_loop=iterations_per_loop,
+        num_shards=num_shards)
+
+
+class RunConfig(run_config_lib.RunConfig):
+  """RunConfig with TPU support."""
+
+  def __init__(self, tpu_config=None, **kwargs):
+    super(RunConfig, self).__init__(**kwargs)
+    self._tpu_config = tpu_config or TPUConfig()
+
+  @property
+  def tpu_config(self):
+    return self._tpu_config
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6fa185709dbd52e8129bc4c5b94ba4e62511560
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -0,0 +1,807 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+
+"""TPUEstimator class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import threading
+from six.moves import queue as Queue  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.tpu.python.tpu import tpu
+from tensorflow.contrib.tpu.python.tpu import tpu_config
+from tensorflow.contrib.tpu.python.tpu import tpu_feed
+from tensorflow.contrib.tpu.python.tpu import training_loop
+
+from tensorflow.python.estimator import estimator as estimator_lib
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator import util
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training
+
+
+_INITIAL_LOSS = 1e7
+_BATCH_SIZE_KEY = 'batch_size'
+_RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY]
+
+
+def _tpu_job(run_config):
+  # The tpu job is determined by the run_config. Right now, this method is
+  # required as tpu_config is not part of the RunConfig.
+  return None if run_config.master in ['', 'local'] else 'tpu_worker'
+
+
+def _per_shard_batch_size(global_batch_size, run_config, use_tpu):
+  """Returns the batch size for each shard."""
+  if use_tpu:
+    return global_batch_size // run_config.tpu_config.num_shards
+  else:
+    return global_batch_size
+
+
+class _SIGNAL(object):
+  """Signal used to control the input thread of infeed."""
+  NEXT_BATCH = 1
+  STOP = 2
+
+
+class InfeedThreadController(object):
+  """This wraps the infeed thread and stops when Estimator train finishes.
+
+  For model_fn wrapper, it is not possible to know when the `train` API will
+  stop. It could be the cases that the `max_steps` is reached or some hook
+  requests the stop in the monitored_session.
+
+  This controller (with coordination with `TPUInfeedSessionHook`) does the
+  following:
+
+  1) It pre-infeeds one `batch` data for current TPU iterations.
+
+  2) When `before_run` of `TPUInfeedSessionHook` is called, one more `batch`
+  data will be infed.
+
+  3) When `end` of `TPUInfeedSessionHook` is called, the thread will end
+  gracefully.
+
+  So, we might need to adjust the algorithrm here if the IO is slower than the
+  computation.
+  """
+
+  def __init__(self, session, enqueue_ops, iterations):
+    self._signal_queue = Queue.Queue()
+    self._input_thd = threading.Thread(target=self._input_thread_fn_for_loading,
+                                       args=(session, enqueue_ops, iterations))
+    self._input_thd.daemon = True
+    self._input_thd.start()
+
+  def _input_thread_fn_for_loading(self, session, enqueue_ops, iterations):
+    count = 0
+    while True:
+      signal = self._signal_queue.get()
+      if signal == _SIGNAL.STOP:
+        logging.info('Stop Infeed input thread.')
+        return
+
+      for i in range(iterations):
+        logging.debug('InfeedEnqueue data for iteration (%d, %d)', count, i)
+        session.run(enqueue_ops)
+      count += 1
+
+  def load_next_batch(self):
+    self._signal_queue.put(_SIGNAL.NEXT_BATCH)
+
+  def join(self):
+    logging.info('Waiting for InputThread to exit.')
+    self._signal_queue.put(_SIGNAL.STOP)
+    self._input_thd.join()
+
+
+class TPUInfeedSessionHook(session_run_hook.SessionRunHook):
+  """A Session hook setting up the TPU initialization and infeed.
+
+  This hook does two major things:
+  1. initialize and shutdown TPU system (maybe a separated hook)
+  2. launch and join the input thread for infeed.
+  """
+
+  def __init__(self, run_config, enqueue_fn):
+    self._iterations = run_config.tpu_config.iterations_per_loop
+    self._enqueue_fn = enqueue_fn
+    self._tpu_job = _tpu_job(run_config)
+
+  def begin(self):
+    self._enqueue_ops = self._enqueue_fn()
+    logging.info('TPU job name %s', self._tpu_job)
+    self._init_op = [tpu.initialize_system(job=self._tpu_job)]
+    self._finalize_op = [tpu.shutdown_system(job=self._tpu_job)]
+
+  def after_create_session(self, session, coord):
+    logging.info('Init TPU system')
+    session.run(self._init_op)
+
+    logging.info('Start infeed input thread controller')
+    self._infeed_thd_controller = InfeedThreadController(
+        session, self._enqueue_ops, self._iterations)
+
+  def before_run(self, run_context):
+    logging.info('Load next batch of data to infeed.')
+    self._infeed_thd_controller.load_next_batch()
+
+  def end(self, session):
+    logging.info('Stop infeed input thread controller')
+    self._infeed_thd_controller.join()
+
+    logging.info('Shutdown TPU system.')
+    session.run(self._finalize_op)
+
+
+class _PerShardOutput(object):
+  """Wraps input_fn's outputs into per-shard outputs.
+
+  Used so that the model_fn can distinguish between sharded input and unsharded
+  inputs (e.g., for export_savedmodel()).
+  """
+
+  def __init__(self, output):
+    self.output = output
+
+  def as_list(self):
+    return self.output
+
+
+class _InputsHolder(object):
+  """A inputs holder holds the `features` and `labels' for all TPU shards.
+
+  Model inputs returned by the `input_fn` can have one of the following forms:
+  1. features
+  2. (features, labels)
+
+  Internally, form 1 is reformed to `(features, None)` as features and labels
+  are passed separatedly to underlying methods. For TPU training, TPUEstimator
+  expects multiple `features` and `labels` tuples one for each shard.
+
+  In addition, TPUEstimator allows various different structures for inputs
+  (namely `features` and `labels`).  `features` can be `Tensor` or dict of
+  string name to `Tensor`, and `labels` could be `None`, `Tensor`, or dict of
+  string name to `Tensor`. TPU infeed/outfeed library expects flattened tensor
+  list. So, `features` and `labels` need to be flattened, before infeed enqueue,
+  and the structure of them needs to be recorded, in order to restore them after
+  infeed dequeue.
+
+  `_InputsHolder` holds the `features` and `labels` tuple for all shards,
+  records the structure details (including presence, dict or single tensor, dict
+  names), validates the structure consistency cross all shards, and encapsulates
+  the flatten/unflatten logic.
+  """
+
+  def __init__(self, sharded_features=None, sharded_labels=None,
+               num_shards=None):
+    """Constructor.
+
+    Args:
+      sharded_features: A list of features one for each shard. Once provided,
+        the corresponding shared_labels should be set also and this
+        `_InputsHolder` is frozen to prevent from future modification. If
+        `None`, it is expected to add features and labels for each shard by
+        calling `append_shard` later.
+      sharded_labels: A list of labels one for each shard.
+      num_shards: Number of shards in the TPU system. Must be provided unless it
+        can be deduced from `sharded_features`.
+
+    Raises:
+      ValueError: If both `sharded_features` and `num_shards` are `None`.
+    """
+    # Holds the features and labels for all shards.
+    self._feature_list = []
+    self._label_list = []
+
+    # Holds the structure of inputs
+    self._feature_names = []
+    self._label_names = []
+    self._has_labels = False
+
+    # Internal state.
+    self._initialized = False
+    self._frozen = False
+
+    if sharded_features is None:
+      if num_shards is None:
+        raise ValueError(
+            '`sharded_features` and `num_shards` cannot be both None')
+      self._num_shards = num_shards
+    else:
+      self._from_sharded_inputs(sharded_features, sharded_labels, num_shards)
+
+  def _from_sharded_inputs(self, sharded_features, sharded_labels, num_shards):
+    """Initializes the inputs with sharded features and labels."""
+    if not isinstance(sharded_features, _PerShardOutput):
+      raise ValueError('`sharded_features` must have type `_PerShardOutput`.')
+    features = sharded_features.as_list()
+
+    if num_shards is not None and num_shards != len(features):
+      raise ValueError(
+          '`num_shards` should be same as the length of sharded_features.')
+
+    self._num_shards = len(features)
+    if not self._num_shards:
+      raise ValueError('`sharded_features` should not be empty.')
+
+    if sharded_labels is not None:
+      if not isinstance(sharded_labels, _PerShardOutput):
+        raise ValueError('sharded_labels` must have type `_PerShardOutput`.')
+
+      self._has_labels = True
+      labels = sharded_labels.as_list()
+      if self._num_shards != len(labels):
+        raise ValueError(
+            'Length of `sharded_features` and `sharded_labels` mismatch.')
+
+    if self._has_labels:
+      for (f, l) in zip(features, labels):
+        self.append_shard((f, l))
+    else:
+      for f in features:
+        self.append_shard(f)
+
+    self._frozen = True
+
+  def _extract_key_names(self, tensor_or_dict):
+    if tensor_or_dict is None:
+      return []
+
+    return tensor_or_dict.keys() if isinstance(tensor_or_dict, dict) else []
+
+  def _validate(self, features, labels):
+    has_labels = labels is not None
+    feature_names = self._extract_key_names(features)
+    label_names = self._extract_key_names(labels)
+
+    if self._initialized:
+      # The following should never happen.
+      assert feature_names == self._feature_names, 'feature keys mismatched'
+      assert label_names == self._label_names, 'label keys mismatched'
+      assert has_labels == self._has_labels, 'label presence mismatched'
+    else:
+      self._initialized = True
+      self._feature_names = feature_names
+      self._label_names = label_names
+      self._has_labels = has_labels
+
+  def append_shard(self, inputs):
+    """Appends `inputs` for one shard into holder.
+
+    Args:
+      inputs: The return from `input_fn`, which could be features or tuple of
+        (features, labels). After the first `inputs` appended into
+        `_InputsHolder`, the structure of `features` and `labels is recorded.
+        Any future invocation should provide the `inputs` with same structure.
+
+    Raises:
+      RuntimeError: If the internal data has been frozen already.
+    """
+    if self._frozen:
+      raise RuntimeError('InputsHolder has frozen, which cannot be mutated.')
+
+    # input_fn may return either features or (features, labels)
+    if isinstance(inputs, tuple):
+      features, labels = inputs
+    else:
+      features, labels = inputs, None
+
+    self._validate(features, labels)
+
+    self._feature_list.append(features)
+    if labels is not None:
+      self._label_list.append(labels)
+
+  def as_features_and_labels_tuple(self):
+    """Returns features and labels as grouped tuple.
+
+    This is intended to be used to pass features and labels for all shards from
+    input_fn to model_fn as the parent class `Estimator` does not have the
+    concept of shards. So, grouped tuple is required.
+
+    Once called, the internal data is frozen and `append_shard` cannot be
+    invoked anymore.
+
+    Returns:
+      A tuple of features and labels. Both have type `_PerShardOutput`, holding
+      the inputs for all shards. `labels` could be `None`.
+
+    Raises:
+      RuntimeError: If the internal data has not been initialized.
+    """
+    self._frozen = True
+    if not self._initialized:
+      raise RuntimeError('InputsHolder has not been initialized.')
+
+    assert len(self._feature_list) == self._num_shards
+    if not self._label_list or all(l is None for l in self._label_list):
+      return _PerShardOutput(self._feature_list), None
+
+    assert len(self._label_list) == self._num_shards
+    return (_PerShardOutput(self._feature_list),
+            _PerShardOutput(self._label_list))
+
+  def as_sharded_flattened_inputs(self):
+    """Flatten the features and label as tensor list for all shards.
+
+    Flattened tensor list contains all tensors in `features` (dict) and `labels`
+    (dict). Conceptually, it has the predicated structure like:
+
+    ```python
+    flatten_list = []
+    for name in features:
+      flatten_list.append(features[name])
+    for name in labels:
+      flatten_list.append(labels[name])
+    ```
+
+    This method handles the label is None case and single tensor case nicely.
+
+    Once called, the internal data is frozen and `append_shard` cannot be
+    invokded anymore.
+
+    Returns:
+      A list of flattened inputs one for each shard.
+
+    Raises:
+      RuntimeError: If the internal data has not been initialized.
+    """
+    self._frozen = True
+    if not self._initialized:
+      raise RuntimeError('InputsHolder has not been initialized.')
+
+    sharded_inputs = []
+
+    for shard in range(self._num_shards):
+      flattened_inputs = []
+      if self._feature_names:
+        # We need a fixed ordering for enqueueing and dequeueing.
+        flattened_inputs.extend([self._feature_list[shard][name] for name in
+                                 self._feature_names])
+      else:
+        flattened_inputs.append(self._feature_list[shard])
+
+      if self._has_labels:
+        if self._label_names:
+          # We need a fixed ordering for enqueueing and dequeueing.
+          flattened_inputs.extend([self._label_list[shard][name] for name in
+                                   self._label_names])
+        else:
+          flattened_inputs.append(self._label_list[shard])
+      sharded_inputs.append(flattened_inputs)
+
+    return sharded_inputs
+
+  def unflatten_features_and_labels(self, flattened_inputs):
+    """Restores the flattened inputs to original features and labels form.
+
+    Once called, the internal data is frozen and `append_shard` cannot be
+    invokded anymore.
+
+    Args:
+      flattened_inputs: Flattened inputs for one each, which should be created
+      by the `as_sharded_flattened_inputs` API.
+
+    Returns:
+      A tuple of (`features`, `labels`), where `labels` could be None.
+      Each one, if present, should have identical structure (single tensor vs
+      dict) as the one returned by input_fn.
+
+    Raises:
+      RuntimeError: If the internal data has not been initialized.
+      ValueError: If the number of expected tensors from `flattened_inputs`
+        mismatches the recorded structure.
+    """
+    self._frozen = True
+    if not self._initialized:
+      raise RuntimeError('InputsHolder has not been initialized.')
+
+    expected_num_features = (len(self._feature_names) if self._feature_names
+                             else 1)
+    if self._has_labels:
+      expected_num_labels = (len(self._label_names) if self._label_names
+                             else 1)
+    else:
+      expected_num_labels = 0
+
+    expected_num_tensors = expected_num_features + expected_num_labels
+
+    if expected_num_tensors != len(flattened_inputs):
+      raise ValueError(
+          'The number of flattened tensors mismatches expected num. '
+          'Expected {}, got {}'.format(expected_num_tensors,
+                                       len(flattened_inputs)))
+    if self._feature_names:
+      unflattened_features = dict(zip(self._feature_names,
+                                      flattened_inputs[:expected_num_features]))
+    else:
+      # Single tensor case
+      unflattened_features = flattened_inputs[0]
+
+    if expected_num_labels == 0:
+      unflattened_label = None
+    elif self._label_names:
+      unflattened_label = dict(zip(self._label_names,
+                                   flattened_inputs[expected_num_features:]))
+    else:
+      # Single tensor case.
+      unflattened_label = flattened_inputs[expected_num_features]
+
+    return unflattened_features, unflattened_label
+
+
+class _ModelFnWrapper(object):
+  """A `model_fn` wrapper.
+
+  This makes calling model_fn on CPU and TPU easier and more consistent and
+  performs necessary check and mutation required by TPU training.
+
+  In addition, this wrapper manages converting the `model_fn` to a single TPU
+  train step.
+  """
+
+  def __init__(self, model_fn, config, params, mode, train_batch_size):
+    self._model_fn = model_fn
+    self._config = config
+    self._params = params
+    self._mode = mode
+    self._train_batch_size = train_batch_size
+
+  def call_without_tpu(self, features, labels):
+    return self._call_model_fn(features, labels, False)
+
+  def convert_to_single_tpu_train_step(self, dequeue_fn):
+    """Converts the `model_fn` as a single train step on TPU."""
+
+    def train_step(loss):
+      """Training step function for use inside a while loop."""
+      del loss  # unused; required in function signature.
+      features, labels = dequeue_fn()
+
+      # Makes deep copy with `config` and params` in case user mutates them.
+      estimator_spec = self._verify_estimator_spec(
+          self._call_model_fn(features, labels, True))
+      loss, train_op = estimator_spec.loss, estimator_spec.train_op
+      with ops.control_dependencies([train_op]):
+        return array_ops.identity(loss)
+    return train_step
+
+  @property
+  def config(self):
+    return self._config
+
+  def _call_model_fn(self, features, labels, use_tpu):
+    """Calls the model_fn with required parameters."""
+    model_fn_args = util.fn_args(self._model_fn)
+    kwargs = {}
+
+    config = copy.deepcopy(self._config)
+    params = copy.deepcopy(self._params)
+
+    if 'labels' in model_fn_args:
+      kwargs['labels'] = labels
+    else:
+      if labels is not None:
+        raise ValueError(
+            'model_fn does not take labels, but input_fn returns labels.')
+    if 'mode' in model_fn_args:
+      kwargs['mode'] = self._mode
+    if 'config' in model_fn_args:
+      kwargs['config'] = config
+    if 'params' in model_fn_args:
+      kwargs['params'] = params
+
+    if 'params' not in model_fn_args:
+      raise ValueError(
+          'model_fn ({}) does not include params argument, '
+          'required by TPUEstimator to pass batch size as '
+          'params[\'batch_size\']'.format(self._model_fn))
+    if self._mode == model_fn_lib.ModeKeys.TRAIN:
+      # For TPU training. `params` is never `None`.
+      params[_BATCH_SIZE_KEY] = _per_shard_batch_size(
+          self._train_batch_size, config, use_tpu)
+
+    return self._model_fn(features=features, **kwargs)
+
+  def _verify_estimator_spec(self, estimator_spec):
+    """Validates the estimator_spec."""
+    err_msg = '{} returned by EstimatorSpec is not supported in TPUEstimator.'
+    if estimator_spec.training_chief_hooks:
+      raise ValueError(err_msg.format('training_chief_hooks'))
+    if estimator_spec.training_hooks:
+      raise ValueError(err_msg.format('training_hooks'))
+    return estimator_spec
+
+
+class TPUEstimator(estimator_lib.Estimator):
+  """Estimator with TPU support.
+
+  TPUEstimator handles many of the details of running on TPU devices, such as
+  replicating inputs and models for each core, and returning to host
+  periodically to run hooks.
+
+  Note: For training (evaluate and predict support on TPU are not yet
+  implemented), TPUEstimator transforms a global batch size in params to a
+  per-shard batch size when calling the `input_fn` and `model_fn`. Users should
+  specify `train_batch_size` in constructor, and then get the batch size for
+  each shard in `input_fn` and `model_fn` by `params['batch_size']`.
+  """
+
+  def __init__(self,
+               model_fn=None,
+               model_dir=None,
+               config=None,
+               params=None,
+               use_tpu=True,
+               train_batch_size=None):
+    """Constructs an `TPUEstimator` instance.
+
+    Args:
+      model_fn: Model function as required by `Estimator`. For training, the
+        returned `EstimatorSpec` cannot have hooks as it is not supported in
+        `TPUEstimator`.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator to
+        continue training a previously saved model. If `None`, the model_dir in
+        `config` will be used if set. If both are set, they must be same. If
+        both are `None`, a temporary directory will be used.
+      config: An `tpu_config.RunConfig` configuration object. Cannot be `None`.
+      params: An optional `dict` of hyper parameters that will be passed into
+        `input_fn` and `model_fn`.  Keys are names of parameters, values are
+        basic python types. There are reserved keys for `TPUEstimator`,
+        including 'batch_size'.
+      use_tpu: A bool indicating whether TPU support is enabled. Currently, only
+        applied to training. Evaluate and predict still happen on CPU.
+      train_batch_size: An int representing the global training batch size.
+        TPUEstimator transforms this global batch size to a per-shard batch
+        size, as params['batch_size'], when calling `input_fn` and `model_fn`.
+        Cannot be `None` if `use_tpu` is `True`. Must be divisible by
+        `config.tpu_config.num_shards`.
+
+    Raises:
+      ValueError: `params` has reserved keys already.
+    """
+    if config is None or not isinstance(config, tpu_config.RunConfig):
+      raise ValueError(
+          '`config` must be provided with type `tpu_config.RunConfig`')
+
+    if params is not None and any(k in params for k in _RESERVED_PARAMS_KEYS):
+      raise ValueError(
+          '{} are reserved keys but existed in params {}.'.format(
+              _RESERVED_PARAMS_KEYS, params))
+
+    if use_tpu:
+      if train_batch_size is None:
+        raise ValueError('`train_batch_size` cannot be `None`')
+      if not isinstance(train_batch_size, int):
+        raise ValueError('`train_batch_size` must be an int')
+      if train_batch_size < 1:
+        raise ValueError('`train_batch_size` must be positive')
+
+      # The specified batch size is the batch size for the entire computation.
+      # The input_fn and model_fn are called per-shard, so we want to calculate
+      # the per-shard batch size and pass that.
+      if train_batch_size % config.tpu_config.num_shards != 0:
+        raise ValueError(
+            'batch size {} must be divisible by number of shards {}'
+            .format(train_batch_size, config.tpu_config.num_shards))
+
+    # Verifies the model_fn signature according to Estimator framework.
+    estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
+    # We cannot store config and params in this constructor as parent
+    # constructor might change them, such as assigning a temp dir for
+    # config.model_dir.
+    model_function = _augment_model_fn(model_fn, train_batch_size, use_tpu)
+
+    super(TPUEstimator, self).__init__(
+        model_fn=model_function,
+        model_dir=model_dir,
+        config=config,
+        params=params)
+    self._use_tpu = use_tpu
+    self._train_batch_size = train_batch_size
+
+  def _create_global_step(self, graph):
+    """Creates a global step suitable for TPUs.
+
+    Args:
+      graph: The graph in which to create the global step.
+
+    Returns:
+      A global step `Tensor`.
+
+    Raises:
+      ValueError: if the global step tensor is already defined.
+    """
+    graph = graph or ops.get_default_graph()
+    if training.get_global_step(graph) is not None:
+      raise ValueError('"global_step" already exists.')
+    # Create in proper graph and base name_scope.
+    with graph.as_default() as g, g.name_scope(None):
+      return variable_scope.get_variable(
+          ops.GraphKeys.GLOBAL_STEP,
+          shape=[],
+          dtype=dtypes.int32,
+          initializer=init_ops.zeros_initializer(),
+          trainable=False,
+          use_resource=True,
+          collections=[ops.GraphKeys.GLOBAL_VARIABLES,
+                       ops.GraphKeys.GLOBAL_STEP])
+
+  def _call_input_fn(self, input_fn, mode):
+    """Calls the input function.
+
+    Args:
+      input_fn: The input function.
+      mode: ModeKeys
+
+    Returns:
+      Either features or (features, labels) where features and labels are:
+        features - `Tensor` or dictionary of string feature name to `Tensor`.
+        labels - `Tensor` or dictionary of `Tensor` with labels.
+
+    Raises:
+      ValueError: if input_fn takes invalid arguments or does not have `params`.
+    """
+    input_fn_args = util.fn_args(input_fn)
+    config = self.config  # a deep copy.
+    kwargs = {}
+    if 'params' in input_fn_args:
+      kwargs['params'] = self.params  # a deep copy.
+    else:
+      raise ValueError('input_fn ({}) does not include params argument, '
+                       'required by TPUEstimator to pass batch size as '
+                       'params["batch_size"]'.format(input_fn))
+    if 'config' in input_fn_args:
+      kwargs['config'] = config
+
+    # Now for TPU training.
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      kwargs['params'][_BATCH_SIZE_KEY] = (
+          _per_shard_batch_size(self._train_batch_size, config, self._use_tpu))
+
+    if not self._use_tpu or mode != model_fn_lib.ModeKeys.TRAIN:
+      with ops.device('/cpu:0'):
+        return input_fn(**kwargs)
+
+    job = _tpu_job(config)
+    def placement_function(index):
+      if job is None:
+        return '/replica:0/task:0/device:CPU:0'
+      else:
+        return '/job:%s/replica:0/task:%d/device:CPU:0' % (job, index / 8)
+
+    num_shards = config.tpu_config.num_shards
+    inputs = _InputsHolder(num_shards=num_shards)
+    for i in range(config.tpu_config.num_shards):
+      with ops.device(placement_function(i)):
+        inputs.append_shard(input_fn(**kwargs))
+
+    return inputs.as_features_and_labels_tuple()
+
+
+def _create_infeed_enqueue_ops_and_dequeue_fn(inputs_holder):
+  """Utility to convert input_fn to enqueue and dequeue fns for TPU.
+
+  Args:
+    inputs_holder: An `_InputsHolder` holding features and labels.
+
+  Returns:
+    A tuple of (dequeue_fn, enqueue_fn)
+  """
+  sharded_inputs = inputs_holder.as_sharded_flattened_inputs()
+
+  infeed_queue = tpu_feed.InfeedQueue(
+      number_of_tuple_elements=len(sharded_inputs[0]))
+  infeed_queue.set_configuration_from_sharded_input_tensors(sharded_inputs)
+
+  def dequeue_fn():
+    """dequeue_fn is used by the train_step in TPU to retrieve the tensors."""
+    values = infeed_queue.generate_dequeue_op()
+    return inputs_holder.unflatten_features_and_labels(values)
+
+  def tpu_ordinal_function(index):
+    """Return the TPU ordinal associated with a shard.
+
+    Required because the enqueue ops are placed on CPU.
+
+    Args:
+      index: the shard index
+
+    Returns:
+      The ordinal of the TPU device the shard's infeed should be placed on.
+    """
+    return index % 8
+
+  def enqueue_fn():
+    """enqueue_fn is used to add ops to the graph to send tensors."""
+    return infeed_queue.generate_enqueue_ops(
+        sharded_inputs, tpu_ordinal_function=tpu_ordinal_function)
+
+  return (dequeue_fn, enqueue_fn)
+
+
+def _augment_model_fn(model_fn, train_batch_size, use_tpu):
+  """Returns a new model_fn, which wraps the TPU support."""
+
+  def _model_fn(features, labels, mode, config, params):
+    """A Estimator `model_fn` for TPUEstimator."""
+    model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, mode,
+                                       train_batch_size)
+
+    # TODO(jhseu): Move to EVAL and PREDICT to TPU.
+    if not use_tpu or mode != model_fn_lib.ModeKeys.TRAIN:
+      return model_fn_wrapper.call_without_tpu(features, labels)
+
+    inputs = _InputsHolder(sharded_features=features, sharded_labels=labels)
+
+    dequeue_fn, enqueue_fn = _create_infeed_enqueue_ops_and_dequeue_fn(inputs)
+
+    loss = _train_on_tpu_system(model_fn_wrapper, dequeue_fn)
+
+    # Gets the variables back from TPU nodes. This means the variables updated
+    # by TPU will now be *synced* to host memory.
+    update_ops = [
+        array_ops.check_numerics(v.read_value(),
+                                 'Gradient for %s is NaN' % v.name).op
+        for v in variables.trainable_variables()
+    ]
+
+    hooks = [
+        TPUInfeedSessionHook(config, enqueue_fn),
+        training.LoggingTensorHook(
+            {'loss': array_ops.identity(loss),
+             'step': training.get_global_step()},
+            every_n_secs=30)
+    ]
+
+    return model_fn_lib.EstimatorSpec(
+        mode,
+        loss=array_ops.identity(loss),
+        training_hooks=hooks,
+        train_op=control_flow_ops.group(*update_ops))
+  return _model_fn
+
+
+def _train_on_tpu_system(model_fn_wrapper, dequeue_fn):
+  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
+  config = model_fn_wrapper.config.tpu_config
+  iterations_per_loop = config.iterations_per_loop
+  num_shards = config.num_shards
+
+  single_tpu_train_step = model_fn_wrapper.convert_to_single_tpu_train_step(
+      dequeue_fn)
+
+  multi_tpu_train_steps_on_single_shard = (lambda: training_loop.repeat(  # pylint: disable=g-long-lambda
+      iterations_per_loop, single_tpu_train_step, [_INITIAL_LOSS], name='loop'))
+
+  (loss,) = tpu.shard(multi_tpu_train_steps_on_single_shard,
+                      inputs=[],
+                      num_shards=num_shards,
+                      outputs_from_all_shards=False)
+  return loss
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d0a532010e508e1239088fa1242e536e5c243df
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
@@ -0,0 +1,620 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+
+"""Helper library for handling infeed between hosts and TPUs.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.contrib.tpu.python.tpu import tpu_sharding
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+
+
+class InfeedQueue(object):
+  """A helper object to build a device infeed queue.
+
+  The InfeedQueue builds the host-side and device-side Ops to enqueue and
+  dequeue elements, respectively, and ensures that their types and
+  shapes match.
+  """
+
+  def __init__(self,
+               number_of_tuple_elements=None,
+               tuple_types=None,
+               tuple_shapes=None,
+               shard_dimensions=None,
+               name=None):
+    """Creates a new InfeedQueue with the given configuration.
+
+    The configuration need not be fully specified at creation since it
+    can be modified subsequently by methods that set the values
+    explicitly or infer them from the shapes of inputs.
+
+    Args:
+      number_of_tuple_elements: the number of Tensors fed atomically through the
+        queue, must be present unless it can be inferred from other arguments.
+      tuple_types: if not None, a list of types of the elements of the queue.
+      tuple_shapes: if not None, a list of shapes of the elements of the queue.
+      shard_dimensions: if not None, a list of dimensions on which the
+        elements of the queue should be sharded during automatic
+        parallelization.
+      name: the name of the queue.
+
+    Raises:
+      ValueError: if number_of_tuple_elements <= 0; or
+        number_of_tuple_arguments, tuple_types, tuple_shapes, and
+        shard_dimensions are all None; or the length of tuple_types,
+        tuple_shapes, or shard_dimensions is not equal to
+        number_of_tuple_elements; or any element of shard_dimensions
+        can't be converted to a Dimension.
+      TypeError: if any element of tuple_types or tuple_shapes can't
+        be converted to a dtype or TensorShape, respectively.
+    """
+    self._frozen = False
+    self._generated_enqueue_ops = False
+    self._generated_dequeue_op = False
+    self._name = "InfeedQueue" if name is None else name
+    if number_of_tuple_elements is None:
+      if tuple_types is not None:
+        number_of_tuple_elements = len(tuple_types)
+      elif tuple_shapes is not None:
+        number_of_tuple_elements = len(tuple_shapes)
+      elif shard_dimensions is not None:
+        number_of_tuple_elements = len(shard_dimensions)
+      else:
+        raise ValueError(
+            "number of tuple elements cannot be inferred from InfeedQueue "
+            "constructor"
+        )
+    if number_of_tuple_elements <= 0:
+      raise ValueError("number_of_tuple_elements %d must be > 0" %
+                       number_of_tuple_elements)
+    # Make an empty sharding policy for each tuple element.
+    self._sharding_policies = [
+        tpu_sharding.ShardingPolicy()
+        for _ in xrange(number_of_tuple_elements)
+    ]
+    if tuple_types is not None:
+      self.set_tuple_types(tuple_types)
+    else:
+      self._tuple_types = None
+    if tuple_shapes is not None:
+      self.set_tuple_shapes(tuple_shapes)
+    else:
+      self._tuple_shapes = None
+    if shard_dimensions is not None:
+      self.set_shard_dimensions(shard_dimensions)
+    self._validate()
+
+  def _validate(self):
+    """Checks that the configuration is self-consistent.
+
+    Raises:
+      ValueError: if the shapes and sharding policies don't match.
+    """
+    if self.tuple_shapes is not None:
+      for (policy, shape) in zip(self._sharding_policies, self._tuple_shapes):
+        # Raise an error if the policy is incompatible with the shape.
+        _ = policy.get_sharded_shape(shape)
+
+  @property
+  def number_of_tuple_elements(self):
+    """Returns the number of InfeedQueue tuple elements."""
+    return len(self._sharding_policies)
+
+  @property
+  def tuple_types(self):
+    """Returns the types of the InfeedQueue tuple elements."""
+    return self._tuple_types
+
+  def set_tuple_types(self, tuple_types):
+    """Sets the type of each element of the queue.
+
+    tuple_types must be a list of length
+    self.number_of_tuple_elements, and each element must be
+    convertible to a dtype.
+
+    Args:
+      tuple_types: the types of each queue element.
+
+    Raises:
+      ValueError: if tuple_types is not of length
+        self.number_of_tuple_elements.
+      TypeError: if an element of tuple_types cannot be converted to a
+        dtype.
+    """
+    if len(tuple_types) != self.number_of_tuple_elements:
+      raise ValueError("tuple_types is %s, but must be a list of length %d" %
+                       (str(tuple_types), self.number_of_tuple_elements))
+    if self._frozen:
+      for (frozen, updated) in zip(self._tuple_types, tuple_types):
+        if frozen != updated:
+          raise ValueError(
+              "Trying to update InfeedQueue with frozen configuration with an "
+              "incompatible type. Frozen types are %s, updated types are %s" % (
+                  str(self._tuple_types), str(tuple_types)))
+    else:
+      try:
+        self._tuple_types = [dtypes.as_dtype(t) for t in tuple_types]
+      except (TypeError) as e:
+        raise TypeError(
+            "tuple_types is %s, but must be a list of elements each "
+            "convertible to dtype: got error %s" % (str(tuple_types), str(e)))
+
+  @property
+  def tuple_shapes(self):
+    """Returns the shapes of the InfeedQueue tuple elements."""
+    return self._tuple_shapes
+
+  def set_tuple_shapes(self, tuple_shapes):
+    """Sets the shape of each element of the queue.
+
+    tuple_shapes must be a list of length
+    self.number_of_tuple_elements, and each element must be
+    convertible to a TensorShape.
+
+    Args:
+      tuple_shapes: the shapes of each queue element.
+
+    Raises:
+      ValueError: if tuple_shapes is not of length
+        self.number_of_tuple_elements.
+      TypeError: if an element of tuple_shapes cannot be converted to
+        a TensorShape.
+    """
+    if len(tuple_shapes) != self.number_of_tuple_elements:
+      raise ValueError("tuple_shapes is %s, but must be a list of length %d" %
+                       (str(tuple_shapes), self.number_of_tuple_elements))
+    try:
+      tuple_shapes = [tensor_shape.as_shape(shape) for shape in tuple_shapes]
+    except (ValueError, TypeError) as e:
+      raise TypeError(
+          "tuple_shapes is %s, but must be a list of elements each "
+          "convertible to TensorShape: got error %s" % (str(tuple_shapes),
+                                                        str(e)))
+    if self._frozen:
+      for (frozen, updated) in zip(self._tuple_shapes, tuple_shapes):
+        if frozen != updated:
+          raise ValueError(
+              "Trying to update InfeedQueue with frozen configuration with an "
+              "incompatible shape. Frozen shapes are %s, updated shapes are %s"
+              % (str(self._tuple_shapes), str(tuple_shapes)))
+    else:
+      self._tuple_shapes = tuple_shapes
+    self._validate()
+
+  @property
+  def sharding_policies(self):
+    """Returns the sharding policies of the InfeedQueue tuple elements."""
+    return self._sharding_policies
+
+  @property
+  def shard_dimensions(self):
+    """Gets the shard dimension of each tuple element.
+
+    Returns:
+      A list of length number_of_tuple_elements, where each list entry
+      is the shard dimension of that tuple element or None if the
+      shard dimension has not been set.
+    """
+    # The number of shards is always the same for all the policies.
+    return [policy.shard_dimension for policy in self._sharding_policies]
+
+  def set_shard_dimensions(self, shard_dimensions):
+    """Sets the shard_dimension of each element of the queue.
+
+    shard_dimensions must be a list of length
+    self.number_of_tuple_elements, and each element must be
+    convertible to a Dimension compatible with self.tuple_shapes.
+
+    Args:
+      shard_dimensions: the dimensions of each queue element.
+
+    Raises:
+      ValueError: if shard_dimensions is not of length
+        self.number_of_tuple_elements; or an element of
+        shard_dimensions cannot be converted to a Dimension; or an
+        element of shard_dimensions is a Dimension that is out of
+        range for the corresponding tuple element shape.
+    """
+    if len(shard_dimensions) != self.number_of_tuple_elements:
+      raise ValueError("shard_dimensions is %s, but must be a list of length %d"
+                       % (str(shard_dimensions),
+                          self.number_of_tuple_elements))
+    for (policy, dimension) in zip(self._sharding_policies, shard_dimensions):
+      policy.set_shard_dimension(dimension)
+    self._validate()
+
+  @property
+  def number_of_shards(self):
+    """Gets the number of shards to use for the InfeedQueue.
+
+    Returns:
+      Number of shards or None if the number of shards has not been set.
+    """
+    # The number of shards is always the same for all the policies.
+    return self._sharding_policies[0].number_of_shards
+
+  def set_number_of_shards(self, number_of_shards):
+    """Sets the number of shards to use for the InfeedQueue.
+
+    Args:
+      number_of_shards: number of ways to shard the InfeedQueue.
+
+    Raises:
+      ValueError: if number_of_shards is not > 0; or the policies have
+        been frozen and number_of_shards was already set to something
+        else.
+    """
+    for policy in self._sharding_policies:
+      policy.set_number_of_shards(number_of_shards)
+    self._validate()
+
+  def set_configuration_from_input_tensors(self, input_tensors):
+    """Sets the shapes and types of the queue tuple elements.
+
+    input_tensors is a list of Tensors whose types and shapes are used
+    to set the queue configuration.
+
+    Args:
+      input_tensors: list of Tensors of the same types and shapes as
+        the desired queue Tuple.
+
+    Raises:
+      ValueError: if input_tensors is not a list of length
+        self.number_of_tuple_elements
+    """
+    if len(input_tensors) != self.number_of_tuple_elements:
+      raise ValueError(
+          "input_tensors is %s, but should be a list of %d Tensors", (
+              str(input_tensors), self.number_of_tuple_elements))
+    self.set_tuple_shapes([t.shape for t in input_tensors])
+    self.set_tuple_types([t.dtype for t in input_tensors])
+
+  def set_configuration_from_sharded_input_tensors(self, input_tensors):
+    """Sets the shapes and types of the queue tuple elements.
+
+    input_tensors is a list of lists of Tensors whose types and shapes are used
+    to set the queue configuration. The length of the outer list is the number
+    of shards required, and each inner list is the tuple of Tensors to use to
+    determine the types and shapes of the corresponding shard. This method
+    depends on the shard dimension, and calling it freezes the shard policy.
+
+    Args:
+      input_tensors: list of lists of Tensors. The outer list length corresponds
+        to the desired number of shards, and each inner list is the size
+        and shape of the desired configuration of the corresponding shard.
+
+    Raises:
+      ValueError: if any inner list is not a list of length
+        self.number_of_tuple_elements; or the inner lists do not combine to
+        form a consistent unsharded shape.
+      TypeError: if the types of the Tensors in the inner lists do not match.
+    """
+    if not self._frozen:
+      # Unset the tuple shapes in case the configuration becomes
+      # transiently inconsistent.
+      self._tuple_shapes = None
+    number_of_shards = len(input_tensors)
+    self.set_number_of_shards(number_of_shards)
+    for t in input_tensors:
+      if len(t) != self.number_of_tuple_elements:
+        raise ValueError(
+            "input_tensors is %s but must be a list of lists, where each inner"
+            " list has length number_of_tuple_elements=%d" % (
+                str(input_tensors), self.number_of_tuple_elements))
+    # Transpose the inputs to make a list of shard shapes for each tuple
+    # element.
+    sharded_shapes = [[t[i].shape for t in input_tensors]
+                      for i in xrange(self.number_of_tuple_elements)]
+    # For each tuple, get the unsharded shape using that tuple's policy.
+    unsharded_shapes = [
+        policy.get_unsharded_shape(s)
+        for (policy, s) in zip(self._sharding_policies, sharded_shapes)
+    ]
+    self.set_tuple_shapes(unsharded_shapes)
+    for i in xrange(1, self.number_of_shards):
+      for (t1, t2) in zip(input_tensors[0], input_tensors[i]):
+        if t1.dtype != t2.dtype:
+          raise TypeError(
+              "types of the tuple elements of input_tensors %s are not "
+              "consistent" % str(input_tensors))
+    self.set_tuple_types([t.dtype for t in input_tensors[0]])
+
+  def freeze(self):
+    """Freezes the InfeedQueue so it can no longer be modified.
+
+    The configuration is implicitly frozen before any host-side or
+    device-side Ops are generated. The configuration cannot be frozen
+    until the types and shapes of the tuple elements have been set.
+
+    Raises:
+      ValueError: if the types or shapes of the tuple elements have not been
+      set.
+    """
+    self._frozen = True
+    if self._tuple_types is None:
+      raise ValueError(
+          "Can't freeze an InfeedQueue without setting all tuple types.")
+    if self._tuple_shapes is None:
+      raise ValueError(
+          "Can't freeze an InfeedQueue without setting all tuple shapes.")
+    for shape in self._tuple_shapes:
+      if shape.dims is None:
+        raise ValueError(
+            "Can't freeze an InfeedQueue without setting all tuple shapes.")
+    for policy in self._sharding_policies:
+      policy.freeze()
+    self._validate()
+
+  def generate_dequeue_op(self):
+    """Generates the device-side Op to dequeue a tuple from the queue.
+
+    Implicitly freezes the queue configuration if it is not already
+    frozen, which will raise errors if the shapes and types have not
+    been fully specified.
+
+    Returns:
+      A list of Outputs corresponding to a shard of infeed dequeued
+      into XLA, suitable for use within a replicated block.
+
+    Raises:
+      ValueError: if the types or shapes of the tuple elements have not been
+      set; or if a dequeue op has already been generated.
+    """
+    self.freeze()
+    if self._generated_dequeue_op:
+      raise ValueError("Can't generate two dequeue Ops from the same queue")
+    self._generated_dequeue_op = True
+    full_name = "%s/dequeue" % self._name
+    sharded_shapes = [
+        policy.get_sharded_shape(shape)
+        for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies)
+    ]
+    return tpu_ops.infeed_dequeue_tuple(
+        dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
+
+  def _generate_enqueue_op(self,
+                           inputs,
+                           name_prefix,
+                           index,
+                           device=None,
+                           tpu_ordinal=-1):
+    """Generate a host-side Op to enqueue a tuple to the queue.
+
+    If device is None the inputs are all required to have the same
+    device specification, and the enqueue Op is colocated with
+    inputs[0]. Otherwise the enqueue Op is placed on 'device'.
+
+    Args:
+      inputs: a list of Tensors with the types and shapes of the tuple elements.
+      name_prefix: the base name for the Op.
+      index: the shard index, used to uniquify the Op name.
+      device: device to place the Op on, or None if it should be
+        colocated with the inputs.
+      tpu_ordinal: ordinal of the TPU device on the host to use for
+      infeed if device is a CPU device. Should be set to -1 if device
+      is a TPU device.
+
+    Returns:
+      An Op corresponding to a shard of infeed enqueued at the host,
+      suitable for use within a replicated block.
+
+    Raises:
+      ValueError: if device is None and inputs do not all have the
+        same device specification.
+    """
+    full_name = "%s/%d" % (name_prefix, index)
+    shapes = [t.shape for t in inputs]
+    if device is None:
+      devices = [t.device for t in inputs]
+      for i in xrange(1, self.number_of_tuple_elements):
+        if devices[0] != devices[i]:
+          raise ValueError(
+              "input devices for shard %d are %s, but should all be the same",
+              index, str(devices))
+      with ops.colocate_with(inputs[0]):
+        return tpu_ops.infeed_enqueue_tuple(
+            inputs=inputs,
+            shapes=shapes,
+            name=full_name,
+            device_ordinal=tpu_ordinal)
+    else:
+      with ops.device(device):
+        return tpu_ops.infeed_enqueue_tuple(
+            inputs=inputs,
+            shapes=shapes,
+            name=full_name,
+            device_ordinal=tpu_ordinal)
+
+  def generate_enqueue_ops(self, sharded_inputs, tpu_ordinal_function=None):
+    """Generates the host-side Ops to enqueue the shards of a tuple.
+
+    sharded_inputs is a list, one for each shard, of lists of
+    Tensors. sharded_inputs[0] is the tuple of Tensors to use to feed
+    shard 0 if the queue. Returns the host-side Ops that must be run to
+    enqueue the sharded tuple. The Op for shard i is colocated with the inputs
+    for shard i.
+
+    Implicitly freezes the queue configuration if it is not already
+    frozen. If the configuration has already been frozen, and is not
+    compatible with the types and shapes of sharded_inputs, an error
+    will be raised.
+
+    Args:
+      sharded_inputs: a list of lists of Tensors. The length of the outer list
+        determines the number of shards. Each inner list indicates the types
+        and shapes of the tuples in the corresponding shard.
+      tpu_ordinal_function: if not None, a function that takes the
+        shard index as input and returns the ordinal of the TPU device
+        the shard's infeed should be placed on. tpu_ordinal_function must be
+        set if the inputs are placed on CPU devices.
+
+    Returns:
+      A list of host-side Ops, one for each shard, that when executed together
+      will enqueue a full-size element of infeed.
+
+    Raises:
+      ValueError: if the queue configuration has previously been frozen and the
+        shapes of the elements of sharded_inputs are not compatible with the
+        frozen configuration; or if the shapes of the elements of sharded_inputs
+        don't form a consistent unsharded tuple; or if the elements of a tuple
+        have different device constraints.
+      TypeError: if the queue configuration has previously been frozen and the
+        types of the elements of sharded_inputs are not compatible with the
+        frozen configuration; or if the types of the elements of sharded_inputs
+        don't form a consistent unsharded tuple.
+    """
+    self.set_configuration_from_sharded_input_tensors(sharded_inputs)
+    self.freeze()
+    if self._generated_enqueue_ops:
+      raise ValueError("Can't generate two enqueue Ops from the same queue")
+    self._generated_enqueue_ops = True
+    if tpu_ordinal_function is None:
+      tpu_ordinal_function = lambda index: -1
+    name_prefix = "%s/enqueue" % self._name
+    return [
+        self._generate_enqueue_op(shard, name_prefix, index,
+                                  tpu_ordinal=tpu_ordinal_function(index))
+        for (shard, index) in zip(sharded_inputs, xrange(self.number_of_shards))
+    ]
+
+  # TODO(misard) Generalize this to the case of systems that don't
+  # have 8 devices per host, and figure out what to do with
+  # model-parallelism.
+  def _default_placement_function(self, index):
+    return "/task:%d/device:CPU:0" % (index / 8)
+
+  def _default_ordinal_function(self, index):
+    return index % 8
+
+  # TODO(b/36470756) remove this from tutorials once we have a better story
+  # for automatic placement of input pipelines.
+  def split_inputs_and_generate_enqueue_ops(self,
+                                            inputs,
+                                            global_tpu_id=None,
+                                            placement_function=None,
+                                            tpu_ordinal_function=None):
+    """POORLY-PERFORMING ON MULTI-HOST SYSTEMS.
+
+    Generates the host-side Ops to enqueue a tuple.
+
+    This method performs poorly because it takes an entire input on a single
+    host, splits it, and distributes it to all of the cores. It is present only
+    to simplify tutorial examples.
+
+    inputs is a list of Tensors to use to feed the queue. Each input is split
+    into self.number_of_shards shards. Returns an Op for each shard to enqueue
+    the shard. The Op for shard i is placed on device placement_function(i).
+
+    Implicitly freezes the queue configuration if it is not already
+    frozen. If the configuration has already been frozen, and is not
+    compatible with the types and shapes of inputs, an error
+    will be raised.
+
+    Args:
+      inputs: a list of Tensors which indicates the types and shapes of the
+        queue tuple.
+     global_tpu_id: if not None, a Numpy 2D array indicating the global
+        id of each TPU device in the system. The outer dimension of the
+        array is host task id, and the inner dimension is device ordinal,
+        so e.g., global_tpu_id[x][y] indicates the global id of device
+        /task:x/device:TPU_NODE:y. If global_tpu_id is not None, but
+        placement_function and ordinal_function are None, then global_tpu_id
+        will be used to place infeed on the TPUs with the first k global ids,
+        where k is the number of shards in the queue.
+      placement_function: if not None, a function that takes the shard
+        index as input and returns a device string indicating which
+        device the shard's infeed should be placed on. If placement_function
+        and tpu_ordinal_function are None, inputs are sharded round-robin
+        across the devices in the system.
+      tpu_ordinal_function: if not None, a function that takes the
+        shard index as input and returns the ordinal of the TPU device
+        the shard's infeed should be placed on. If placement_function
+        and tpu_ordinal_function are None, inputs are sharded round-robin
+        across the devices in the system.
+
+    Returns:
+      A list of host-side Ops, one for each shard, that when executed together
+      will enqueue a full-size element of infeed.
+
+    Raises:
+      ValueError: if the queue configuration has previously been frozen and the
+        shapes of the elements of inputs are not compatible with the frozen
+        configuration.
+      TypeError: if the queue configuration has previously been frozen and the
+        types of the elements of inputs are not compatible with the frozen
+        configuration.
+    """
+    if global_tpu_id is None:
+      if placement_function is None:
+        placement_function = self._default_placement_function
+      if tpu_ordinal_function is None:
+        tpu_ordinal_function = self._default_ordinal_function
+    else:
+      global_id_map = {}
+      for host, devices in enumerate(global_tpu_id):
+        for ordinal, global_id in enumerate(devices):
+          global_id_map[global_id] = (host, ordinal)
+
+      def _placement_function_from_map(index):
+        return "/task:%d/device:CPU:0" % global_id_map[index][0]
+
+      def _ordinal_function_from_map(index):
+        return global_id_map[index][1]
+
+      if placement_function is None:
+        placement_function = _placement_function_from_map
+      if tpu_ordinal_function is None:
+        tpu_ordinal_function = _ordinal_function_from_map
+    self.set_configuration_from_input_tensors(inputs)
+    self.freeze()
+    if self._generated_enqueue_ops:
+      raise ValueError("Can't generate two enqueue Ops from the same queue")
+    self._generated_enqueue_ops = True
+    split_name_prefix = "%s/split" % self._name
+    if self.number_of_shards == 1:
+      transposed_sharded_inputs = [[inp] for inp in inputs]
+    else:
+      transposed_sharded_inputs = [
+          array_ops.split(
+              inp,
+              self.number_of_shards,
+              axis=policy.shard_dimension,
+              name="%s/%d" % (split_name_prefix, index))
+          for (inp, policy, index) in zip(inputs, self._sharding_policies,
+                                          xrange(self.number_of_tuple_elements))
+      ]
+    sharded_inputs = [[shard[i] for shard in transposed_sharded_inputs]
+                      for i in xrange(self.number_of_shards)]
+    name_prefix = "%s/enqueue" % self._name
+    return [
+        self._generate_enqueue_op(
+            shard,
+            name_prefix,
+            index,
+            device=placement_function(index),
+            tpu_ordinal=tpu_ordinal_function(index))
+        for (shard, index) in zip(sharded_inputs, xrange(self.number_of_shards))
+    ]
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_function.py b/tensorflow/contrib/tpu/python/tpu/tpu_function.py
new file mode 100644
index 0000000000000000000000000000000000000000..de16e3b157207ee66844f67ef50b7d0363ef3b8c
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_function.py
@@ -0,0 +1,106 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Helper library for functions used during TPU compilation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+
+from tensorflow.python.util import tf_inspect
+
+
+class TpuContext(object):
+  """A context object holding state about the TPU computation being built."""
+
+  def __init__(self):
+    """Creates a new TpuContext."""
+    self._number_of_shards = None
+
+  @property
+  def number_of_shards(self):
+    return self._number_of_shards
+
+  def set_number_of_shards(self, number_of_shards):
+    self._number_of_shards = number_of_shards
+
+
+# The Tpu context holds the number of shards when a sharded computation is
+# being built, or None if no computation is being built.
+_current_tpu_context = TpuContext()
+
+
+@contextlib.contextmanager
+def tpu_shard_context(number_of_shards):
+  if _current_tpu_context.number_of_shards is not None:
+    raise NotImplementedError("tpu_shard_context cannot be nested.")
+  try:
+    _current_tpu_context.set_number_of_shards(number_of_shards)
+    yield
+  finally:
+    _current_tpu_context.set_number_of_shards(None)
+
+
+def get_tpu_context():
+  return _current_tpu_context
+
+
+def check_function_argument_count(func, input_arity, infeed_queue):
+  """Validate the number of input arguments to a tpu function.
+
+  Args:
+    func: the Python function that will be called to generate the body
+      of a TPUFunction.
+    input_arity: the number of explicit arguments supplied by the
+      caller.
+    infeed_queue: if not None, the infeed queue that will supply
+      additional arguments to the function.
+
+  Returns:
+    None if function can be called with the supplied number of
+      arguments, or an error string if it cannot.
+  """
+  def format_error(complaint, quantity):
+    return "%s %d argument%s" % (complaint, quantity, ""
+                                 if quantity == 1 else "s")
+
+  number_of_arguments_needed = input_arity
+  if infeed_queue is not None:
+    number_of_arguments_needed += infeed_queue.number_of_tuple_elements
+  arg_spec = tf_inspect.getargspec(func)
+  number_of_args = len(arg_spec.args)
+  if arg_spec.defaults is None:
+    number_of_defaults = 0
+  else:
+    number_of_defaults = len(arg_spec.defaults)
+  min_required_arguments = number_of_args - number_of_defaults
+  if number_of_arguments_needed < min_required_arguments:
+    # The required number of arguments is not enough to call the function.
+    if number_of_defaults == 0 and arg_spec.varargs is None:
+      return format_error("exactly", number_of_args)
+    else:
+      return format_error("at least", min_required_arguments)
+  if arg_spec.varargs is None and number_of_arguments_needed > number_of_args:
+    # The required number of arguments is too many to call the function.
+    if number_of_defaults == 0:
+      return format_error("exactly", number_of_args)
+    else:
+      return format_error("at most", number_of_args)
+  # Since there are varargs, func can accept any number of arguments
+  # greater than the minimum.
+  return None
+
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_function_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_function_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..463c249a95c8a07745b6603636f8f799384f2845
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_function_test.py
@@ -0,0 +1,125 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Tests for tpu_function helpers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tpu.python.tpu import tpu_feed
+from tensorflow.contrib.tpu.python.tpu import tpu_function
+
+from tensorflow.python.platform import test
+
+
+class FunctionArgCheckTest(test.TestCase):
+
+  def testSimple(self):
+    """Tests that arg checker works for functions with no varargs or defaults.
+    """
+
+    def func(x, y, z):
+      return x + y + z
+
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 3, None))
+    self.assertEqual("exactly 3 arguments",
+                     tpu_function.check_function_argument_count(func, 2, None))
+    queue = tpu_feed.InfeedQueue(2)
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 1, queue))
+    self.assertEqual("exactly 3 arguments",
+                     tpu_function.check_function_argument_count(func, 2, queue))
+
+  def testDefaultArgs(self):
+    """Tests that arg checker works for a function with no varargs."""
+
+    def func(x, y, z=17):
+      return x + y + z
+
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 3, None))
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 2, None))
+    self.assertEqual("at least 2 arguments",
+                     tpu_function.check_function_argument_count(func, 1, None))
+    self.assertEqual("at most 3 arguments",
+                     tpu_function.check_function_argument_count(func, 4, None))
+    queue = tpu_feed.InfeedQueue(1)
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 2, queue))
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 1, queue))
+    self.assertEqual("at least 2 arguments",
+                     tpu_function.check_function_argument_count(func, 0, queue))
+    self.assertEqual("at most 3 arguments",
+                     tpu_function.check_function_argument_count(func, 4, queue))
+
+  def testVarArgs(self):
+    """Tests that arg checker works for a function with varargs."""
+
+    def func(x, y, *z):
+      return x + y + len(z)
+
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 2, None))
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 3, None))
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 4, None))
+    self.assertEqual("at least 2 arguments",
+                     tpu_function.check_function_argument_count(func, 1, None))
+    queue = tpu_feed.InfeedQueue(1)
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 1, queue))
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 2, queue))
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 3, queue))
+    self.assertEqual("at least 2 arguments",
+                     tpu_function.check_function_argument_count(func, 0, queue))
+
+  def testVarArgsAndDefaults(self):
+    """Tests that arg checker works for a function with varargs and defaults."""
+
+    def func(x, y, z=17, *q):
+      return x + y + z + len(q)
+
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 2, None))
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 3, None))
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 4, None))
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 5, None))
+    self.assertEqual("at least 2 arguments",
+                     tpu_function.check_function_argument_count(func, 1, None))
+    queue = tpu_feed.InfeedQueue(1)
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 1, queue))
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 2, queue))
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 3, queue))
+    self.assertEqual(None,
+                     tpu_function.check_function_argument_count(func, 4, queue))
+    self.assertEqual("at least 2 arguments",
+                     tpu_function.check_function_argument_count(func, 0, queue))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_infeed_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_infeed_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a41ff60d0af6c89fa9825d557aceefc9f6b8098d
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_infeed_test.py
@@ -0,0 +1,130 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Tests for TPU InfeedQueue methods."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tpu.python.tpu import tpu_feed
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import test
+
+
+class InfeedTest(test.TestCase):
+
+  def testConstructor(self):
+    """Tests that the constructor can be called with different arguments."""
+    i = tpu_feed.InfeedQueue(number_of_tuple_elements=2)
+    self.assertEqual(i.number_of_tuple_elements, 2)
+    self.assertEqual(i.tuple_types, None)
+    self.assertEqual(i.tuple_shapes, None)
+    self.assertEqual(i.number_of_shards, None)
+    i = tpu_feed.InfeedQueue(
+        tuple_types=[dtypes.float32, dtypes.int32, dtypes.int32])
+    self.assertEqual(i.number_of_tuple_elements, 3)
+    self.assertEqual(i.tuple_types,
+                     [dtypes.float32, dtypes.int32, dtypes.int32])
+    self.assertEqual(i.tuple_shapes, None)
+    self.assertEqual(i.number_of_shards, None)
+    i = tpu_feed.InfeedQueue(tuple_shapes=[[1], [2, 3]])
+    self.assertEqual(i.number_of_tuple_elements, 2)
+    self.assertEqual(i.tuple_types, None)
+    self.assertEqual(i.tuple_shapes, [[1], [2, 3]])
+    self.assertEqual(i.number_of_shards, None)
+    i = tpu_feed.InfeedQueue(shard_dimensions=[1, 0, 7])
+    self.assertEqual(i.number_of_tuple_elements, 3)
+    self.assertEqual(i.tuple_types, None)
+    self.assertEqual(i.tuple_shapes, None)
+    self.assertEqual([p.shard_dimension
+                      for p in i.sharding_policies], [1, 0, 7])
+    with self.assertRaises(ValueError):
+      i = tpu_feed.InfeedQueue()
+    with self.assertRaises(ValueError):
+      i = tpu_feed.InfeedQueue(
+          number_of_tuple_elements=2, tuple_types=[dtypes.float32])
+    with self.assertRaises(ValueError):
+      i = tpu_feed.InfeedQueue(number_of_tuple_elements=2, tuple_shapes=[[1]])
+    with self.assertRaises(ValueError):
+      i = tpu_feed.InfeedQueue(number_of_tuple_elements=2, shard_dimensions=[1])
+    with self.assertRaises(ValueError):
+      i = tpu_feed.InfeedQueue(tuple_shapes=[[1], [2, 3]], shard_dimensions=[1])
+
+  def testModification(self):
+    """Tests modification of the queue post-construction."""
+    i = tpu_feed.InfeedQueue(number_of_tuple_elements=2)
+    i.set_tuple_types([dtypes.float32, dtypes.int32])
+    self.assertEqual(i.tuple_types, [dtypes.float32, dtypes.int32])
+    i.set_tuple_types([dtypes.float32, dtypes.float32])
+    self.assertEqual(i.tuple_types, [dtypes.float32, dtypes.float32])
+    with self.assertRaises(ValueError):
+      i.set_tuple_types([dtypes.float32])
+    i.set_tuple_shapes([[1], [2, 3]])
+    self.assertEqual(i.tuple_shapes, [[1], [2, 3]])
+    i.set_tuple_shapes([[1, 2], [3, 4]])
+    self.assertEqual(i.tuple_shapes, [[1, 2], [3, 4]])
+    with self.assertRaises(ValueError):
+      i.set_tuple_shapes([[1, 2]])
+    i.set_number_of_shards(2)
+    self.assertEqual(i.number_of_shards, 2)
+    i.set_number_of_shards(3)
+    self.assertEqual(i.number_of_shards, 3)
+    t1 = constant_op.constant(1, dtypes.int32, shape=[6])
+    t2 = constant_op.constant(2.0, dtypes.float32, shape=[3, 18])
+    i.set_configuration_from_input_tensors([t1, t2])
+    self.assertEqual(i.tuple_shapes, [[6], [3, 18]])
+    self.assertEqual(i.tuple_types, [dtypes.int32, dtypes.float32])
+    i.set_configuration_from_sharded_input_tensors([[t2, t1], [t2, t1]])
+    self.assertEqual(i.number_of_shards, 2)
+    self.assertEqual(i.tuple_shapes, [[6, 18], [12]])
+    self.assertEqual(i.tuple_types, [dtypes.float32, dtypes.int32])
+    i.set_shard_dimensions([1, 0])
+    i.set_number_of_shards(3)
+    with self.assertRaises(ValueError):
+      i.set_number_of_shards(4)
+
+  def testFreezing(self):
+    """Tests freezing the queue."""
+    i = tpu_feed.InfeedQueue(number_of_tuple_elements=2)
+    t1 = constant_op.constant(1, dtypes.int32, shape=[2])
+    t2 = constant_op.constant(2.0, dtypes.float32, shape=[2, 4])
+    i.set_configuration_from_sharded_input_tensors([[t2, t1], [t2, t1]])
+    self.assertEqual(i.number_of_shards, 2)
+    self.assertEqual(i.tuple_shapes, [[4, 4], [4]])
+    self.assertEqual(i.tuple_types, [dtypes.float32, dtypes.int32])
+    self.assertEqual(i.shard_dimensions, [0, 0])
+    i.freeze()
+    i.set_number_of_shards(2)
+    i.set_tuple_shapes([[4, 4], [4]])
+    i.set_tuple_types([dtypes.float32, dtypes.int32])
+    i.set_shard_dimensions([0, 0])
+    with self.assertRaises(ValueError):
+      i.set_number_of_shards(1)
+    with self.assertRaises(ValueError):
+      i.set_tuple_shapes([[8, 8], [8]])
+    with self.assertRaises(ValueError):
+      i.set_tuple_types([dtypes.int32, dtypes.float32])
+    with self.assertRaises(ValueError):
+      i.set_shard_dimensions([1, 0])
+    self.assertEqual(i.number_of_shards, 2)
+    self.assertEqual(i.tuple_shapes, [[4, 4], [4]])
+    self.assertEqual(i.tuple_types, [dtypes.float32, dtypes.int32])
+    self.assertEqual(i.shard_dimensions, [0, 0])
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py b/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d12a364c33a6b99f0a63c1b635bf2c4c710d0fe
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
@@ -0,0 +1,106 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Optimizer that implements cross-shard gradient reduction for TPU."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.python.training import optimizer
+
+
+class CrossShardOptimizer(optimizer.Optimizer):
+  """A optimizer sums gradients across TPU shards."""
+
+  def __init__(self, opt, name="CrossShardOptimizer"):
+    super(CrossShardOptimizer, self).__init__(False, name)
+    self._opt = opt
+
+  def compute_gradients(self, *args, **kwargs):
+    """Compute gradients of "loss" for the variables in "var_list".
+
+    This simply wraps the compute_gradients() from the real optimizer. The
+    gradients will be aggregated in the apply_gradients() so that user can
+    modify the gradients like clipping with per replica global norm if needed.
+    The global norm with aggregated gradients can be bad as one replica's huge
+    gradients can hurt the gradients from other replicas.
+
+    Args:
+      *args: Arguments for compute_gradients().
+      **kwargs: Keyword arguments for compute_gradients().
+
+    Returns:
+      A list of (gradient, variable) pairs.
+    """
+    return self._opt.compute_gradients(*args, **kwargs)
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """Apply gradients to variables.
+
+    Calls tpu_ops.cross_replica_sum() to sum gradient contributions across
+    replicas, and then applies the real optimizer.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        compute_gradients().
+      global_step: Optional Variable to increment by one after the
+        variables have been updated.
+      name: Optional name for the returned operation.  Default to the
+        name passed to the Optimizer constructor.
+
+    Returns:
+      An `Operation` that applies the gradients. If `global_step` was not None,
+      that operation also increments `global_step`.
+
+    Raises:
+      ValueError: If the grads_and_vars is malformed.
+    """
+    summed_grads_and_vars = []
+    for (grad, var) in grads_and_vars:
+      if grad is None:
+        summed_grads_and_vars.append((grad, var))
+      else:
+        summed_grads_and_vars.append((tpu_ops.cross_replica_sum(grad), var))
+    return self._opt.apply_gradients(summed_grads_and_vars, global_step, name)
+
+  def get_slot(self, *args, **kwargs):
+    """Return a slot named "name" created for "var" by the Optimizer.
+
+    This simply wraps the get_slot() from the actual optimizer.
+
+    Args:
+      *args: Arguments for get_slot().
+      **kwargs: Keyword arguments for get_slot().
+
+    Returns:
+      The `Variable` for the slot if it was created, `None` otherwise.
+    """
+    return self._opt.get_slot(*args, **kwargs)
+
+  def get_slot_names(self, *args, **kwargs):
+    """Return a list of the names of slots created by the `Optimizer`.
+
+    This simply wraps the get_slot_names() from the actual optimizer.
+
+    Args:
+      *args: Arguments for get_slot().
+      **kwargs: Keyword arguments for get_slot().
+
+    Returns:
+      A list of strings.
+    """
+    return self._opt.get_slot_names(*args, **kwargs)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py b/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py
new file mode 100644
index 0000000000000000000000000000000000000000..d545a94ca6a2fdb3a9df2748b59300fd141dc55d
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py
@@ -0,0 +1,248 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Helper library for sharding during TPU compilation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.framework import tensor_shape
+
+_DEFAULT_NUMBER_OF_SHARDS = 1
+_DEFAULT_SHARD_DIMENSION = 0
+
+
+# TODO(b/36777903) change other parts of tpu.py to use this class.
+class ShardingPolicy(object):
+  """An object use to hold the sharding policy for a Tensor.
+  """
+
+  def __init__(self):
+    self._number_of_shards = None
+    self._shard_dimension = None
+    self._frozen = False
+
+  def __str__(self):
+    if self.number_of_shards is None or self.shard_dimension is None:
+      return "ShardingPolicy(unset)"
+    else:
+      return ("ShardingPolicy(%d shards dimension %d)" %
+              (self.number_of_shards, self.shard_dimension))
+
+  def _fill_default_values(self):
+    if self._number_of_shards is None:
+      self._number_of_shards = _DEFAULT_NUMBER_OF_SHARDS
+    if self._shard_dimension is None:
+      self._shard_dimension = tensor_shape.as_dimension(
+          _DEFAULT_SHARD_DIMENSION)
+
+  def freeze(self):
+    """Prevents further modification to the sharding policy.
+
+    Any values that have not been set when freeze is called are set to
+    defaults. If the ShardingPolicy is already frozen, this is a NoOp.
+    """
+    if not self._frozen:
+      self._fill_default_values()
+      self._frozen = True
+
+  @property
+  def number_of_shards(self):
+    """Returns the number of shards in the policy or None if unspecified."""
+    return self._number_of_shards
+
+  def set_number_of_shards(self, number_of_shards):
+    """Sets the number of shards for the current policy.
+
+    If the policy has been frozen then number_of_shards must match the
+    existing setting.
+
+    Args:
+      number_of_shards: The number of shards to use in the policy.
+
+    Raises:
+      ValueError: If the policy has been frozen and number_of_shards
+        differs from the frozen value; or number_of_shards <= 0.
+    """
+    if self._frozen:
+      if self._number_of_shards != number_of_shards:
+        raise ValueError(
+            "Can't set sharding policy to use %d shards since it has been "
+            "frozen to use %d." % (number_of_shards, self._number_of_shards))
+    else:
+      if number_of_shards > 0:
+        self._number_of_shards = number_of_shards
+      else:
+        raise ValueError(
+            "Can't set sharding policy to use %s shards; value must be >0",
+            str(number_of_shards))
+
+  @property
+  def shard_dimension(self):
+    """Returns the shard dimension of the policy or None if unspecified."""
+    return self._shard_dimension
+
+  def set_shard_dimension(self, shard_dimension):
+    """Sets the shard dimension for the current policy.
+
+    If the policy has been frozen then shard_dimension must match the
+    existing setting.
+
+    Args:
+      shard_dimension: The shard dimension to use in the policy.
+
+    Raises:
+      ValueError: If the policy has been frozen and shard_dimension
+        differs from the frozen value, or shard_dimension can't be
+        interpreted as a Dimension.
+    """
+    if self._frozen:
+      if self._shard_dimension != shard_dimension:
+        raise ValueError(
+            "Can't set shard dimension to %d since it has been frozen to "
+            "use %d." % (shard_dimension, self._shard_dimension))
+    else:
+      self._shard_dimension = tensor_shape.as_dimension(shard_dimension)
+
+  def merge(self, other):
+    """Merges the policy of another policy into the current policy.
+
+    Args:
+      other: The policy to merge into this one.
+
+    Raises:
+      ValueError: If this policy has been frozen and the merge conflicts with
+      the frozen policy.
+    """
+    if other.number_of_shards is not None:
+      self.set_number_of_shards(other.number_of_shards)
+    if other.shard_dimension is not None:
+      self.set_shard_dimension(other.shard_dimension)
+
+  def get_sharded_shape(self, shape, shard_index=None):
+    """Returns the shape of a shard of a full Tensor.
+
+    When given the shape of a 'full-size' Tensor, returns the shape of
+    the sub-Tensor after it has been sharded. Freezes the policy if it
+    has not yet been frozen.
+
+    Args:
+      shape: The shape of the full-size Tensor to be sharded.
+      shard_index: The index of the shard whose shape should be returned.
+        shard_index can be None for sharding policies that use the same
+        shape for every shard.
+      freeze_config:
+
+    Returns:
+      The shape of the sharded version of the Tensor.
+
+    Raises:
+      ValueError: If shard_index is None when shards are of different
+        shapes; or shard_index is not None and
+        !(0<=shard_index<number_of_shards); or shape does not have at
+        least self.shard_dimension+1 dimensions; or the value of
+        shape's shard dimension is not a multiple of
+        self.number_of_shards
+    """
+    if self._shard_dimension is None or self._number_of_shards is None:
+      # Don't raise an error if the config is unset.
+      return None
+    if shard_index is not None:
+      if shard_index < 0 or shard_index >= self.number_of_shards:
+        raise ValueError("shard_index %d, but must be in [0,%d)." %
+                         (shard_index, self._number_of_shards))
+    shape = tensor_shape.as_shape(shape)
+    if self._number_of_shards == 1:
+      # Don't do anything when there's only one shard.
+      return shape
+    ndims = shape.ndims
+    if ndims is None:
+      raise ValueError("shape must be a specified shape not Unknown")
+    if ndims <= self._shard_dimension:
+      raise ValueError("shape %s does not contain shard_dimension %d" %
+                       (shape.as_list(), self._shard_dimension))
+    dims = shape.as_list()
+    if (dims[self._shard_dimension] % self._number_of_shards) != 0:
+      raise ValueError("shape %s cannot be sharded %d ways along dimension %d" %
+                       (shape.as_list(), self._number_of_shards,
+                        self._shard_dimension))
+    dims[self._shard_dimension] /= self._number_of_shards
+    return tensor_shape.as_shape(dims)
+
+  def _unshard_shape(self, shape):
+    """Return the unsharded shape that would generate a given sharded shape.
+
+    Args:
+      shape: the sharded shape to unshard
+
+    Returns:
+      The unsharded shape.
+
+    Raises:
+      ValueError: if shape is unknown or does not contain
+        self.shard_dimension
+      TypeError: if shape is not convertible to a TensorShape
+    """
+    shape = tensor_shape.as_shape(shape)
+    if self._number_of_shards == 1:
+      # Don't do anything when there's only one shard.
+      return shape
+    ndims = shape.ndims
+    if ndims is None:
+      raise ValueError("shape must be a specified shape not Unknown")
+    if ndims <= self._shard_dimension:
+      raise ValueError("shape %s does not contain shard_dimension %d" %
+                       (shape.as_list(), self._shard_dimension))
+    dims = shape.as_list()
+    dims[self._shard_dimension] *= self._number_of_shards
+    return tensor_shape.as_shape(dims)
+
+  def get_unsharded_shape(self, shapes):
+    """Returns the shape of an unsharded Tensor given a list of shards.
+
+    When given a list of shapes of shards, returns the shape of the
+    unsharded Tensor that would generate the shards. Sets defaults for the
+    policy if number_of_shards or shard_dimension is None.
+
+    Args:
+      shapes: The shapes of the Tensor shards to be combined.
+
+    Returns:
+      The shape of the unsharded version of the Tensor.
+
+    Raises:
+      ValueError: if shapes is not a list of length
+        self.number_of_shards; or any element of shapes is not a valid
+        shape consistent with the sharding policy; or the list of
+        shapes is not a valid sharding of a full shape.
+      TypeError: if an element of shapes is not convertible to a
+        TensorShape
+    """
+    self._fill_default_values()
+    if len(shapes) != self.number_of_shards:
+      raise ValueError(
+          "shapes is %s but must be a list of length number_of_shards=%d" % (
+              str(shapes), self.number_of_shards))
+    unsharded_shapes = [self._unshard_shape(s) for s in shapes]
+    for i in xrange(self.number_of_shards - 1):
+      if unsharded_shapes[i] != unsharded_shapes[self.number_of_shards - 1]:
+        raise ValueError(
+            "sharded shapes %s are not consistent shards of a full shape "
+            "sharded %d ways along dimension %d" % (
+                str(shapes), self.number_of_shards, self.shard_dimension))
+    return unsharded_shapes[0]
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_sharding_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_sharding_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0a5511d2d7683a5e0f527e49651df236c7a68d4
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_sharding_test.py
@@ -0,0 +1,138 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Tests for tpu_function helpers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tpu.python.tpu import tpu_sharding
+
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.platform import test
+
+
+class ShardingTest(test.TestCase):
+
+  def testFreeze(self):
+    """Tests that freezing a policy applies default values."""
+    p1 = tpu_sharding.ShardingPolicy()
+    p1.freeze()
+    self.assertEqual(p1.number_of_shards,
+                     tpu_sharding._DEFAULT_NUMBER_OF_SHARDS)
+    self.assertEqual(p1.shard_dimension, tpu_sharding._DEFAULT_SHARD_DIMENSION)
+    p2 = tpu_sharding.ShardingPolicy()
+    p2.set_number_of_shards(17)
+    p2.set_shard_dimension(23)
+    p2.freeze()
+    self.assertEqual(p2.number_of_shards, 17)
+    self.assertEqual(p2.shard_dimension, 23)
+
+  def testFrozen(self):
+    """Tests that frozen policies can't be changed."""
+    p1 = tpu_sharding.ShardingPolicy()
+    p1.freeze()
+    with self.assertRaises(ValueError):
+      p1.set_number_of_shards(17)
+    with self.assertRaises(ValueError):
+      p1.set_shard_dimension(22)
+
+  def testStr(self):
+    """Tests the string representation."""
+    p1 = tpu_sharding.ShardingPolicy()
+    self.assertEqual(str(p1), "ShardingPolicy(unset)")
+    p1.set_number_of_shards(17)
+    self.assertEqual(str(p1), "ShardingPolicy(unset)")
+    p1.set_shard_dimension(8)
+    self.assertEqual(str(p1), "ShardingPolicy(17 shards dimension 8)")
+
+  def testMerge(self):
+    """Tests that merging works."""
+    p1 = tpu_sharding.ShardingPolicy()
+    p1.set_number_of_shards(17)
+    p1.set_shard_dimension(23)
+    p2 = tpu_sharding.ShardingPolicy()
+    p2.merge(p1)
+    self.assertEqual(p2.number_of_shards, 17)
+    self.assertEqual(p2.shard_dimension, 23)
+    p1 = tpu_sharding.ShardingPolicy()
+    p1.set_shard_dimension(12)
+    p2.merge(p1)
+    self.assertEqual(p2.number_of_shards, 17)
+    self.assertEqual(p2.shard_dimension, 12)
+    p2.freeze()
+    p2.merge(p1)
+    self.assertEqual(p2.number_of_shards, 17)
+    self.assertEqual(p2.shard_dimension, 12)
+    p1.set_number_of_shards(1)
+    with self.assertRaises(ValueError):
+      p2.merge(p1)
+    p1 = tpu_sharding.ShardingPolicy()
+    p1.set_number_of_shards(17)
+    p2.merge(p1)
+    p1.set_shard_dimension(2)
+    with self.assertRaises(ValueError):
+      p2.merge(p1)
+
+  def testGetShardedShape(self):
+    """Tests getting a sharded shape."""
+    p = tpu_sharding.ShardingPolicy()
+    p.set_number_of_shards(3)
+    p.set_shard_dimension(1)
+    self.assertEqual(p.get_sharded_shape([4, 9]), [4, 3])
+    p.freeze()
+    with self.assertRaises(ValueError):
+      p.set_shard_dimension(0)
+    with self.assertRaises(ValueError):
+      _ = p.get_sharded_shape([4, 9], shard_index=4)
+    with self.assertRaises(ValueError):
+      _ = p.get_sharded_shape([4, 9], shard_index=-1)
+    with self.assertRaises(TypeError):
+      _ = p.get_sharded_shape("not_a_shape")
+    with self.assertRaises(ValueError):
+      _ = p.get_sharded_shape(tensor_shape.TensorShape(None))
+    with self.assertRaises(ValueError):
+      _ = p.get_sharded_shape([4, 10], shard_index=-1)
+
+  def testGetUnshardedShape(self):
+    """Tests getting an unsharded shape."""
+    p = tpu_sharding.ShardingPolicy()
+    p.set_number_of_shards(2)
+    p.set_shard_dimension(1)
+    self.assertEqual(p.get_unsharded_shape([[4, 3], [4, 3]]), [4, 6])
+    with self.assertRaises(ValueError):
+      _ = p.get_unsharded_shape([[4, 3]])
+    with self.assertRaises(ValueError):
+      _ = p.get_unsharded_shape([[4, 3], [4, 3], [4, 3]])
+    with self.assertRaises(ValueError):
+      _ = p.get_unsharded_shape([[4, 3], [4, 2]])
+    with self.assertRaises(TypeError):
+      _ = p.get_unsharded_shape([[4, 3], "not_a_shape"])
+    with self.assertRaises(ValueError):
+      _ = p.get_unsharded_shape([None, [4, 3]])
+    with self.assertRaises(ValueError):
+      _ = p.get_unsharded_shape([[2], [4, 3]])
+
+  def testScalar(self):
+    """Tests sharding and unsharding scalars."""
+    p = tpu_sharding.ShardingPolicy()
+    p.freeze()
+    self.assertEqual(p.get_sharded_shape([]), [])
+    self.assertEqual(p.get_unsharded_shape([[]]), [])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tpu/python/tpu/training_loop.py b/tensorflow/contrib/tpu/python/tpu/training_loop.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d7896127a99653167f164873331a2cc95f656e8
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/training_loop.py
@@ -0,0 +1,213 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Library for constructing a training loop, suitable for TPUs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tpu.python.tpu import tpu_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+
+
+def while_loop(condition, body, inputs=None, infeed_queue=None, name=None):
+  """Builds a training loop for TPUs.
+
+  The set of loop-carried tensors corresponds to `inputs`.  Both
+  `condition` and `body` take the current value of the loop-carried
+  tensors. 'body' additionally takes a tuple of infeed from
+  infeed_queue if infeed_queue is not None. `condition` must return a
+  single boolean value that determines whether iteration
+  continues. `body` must return an updated list of values for the
+  loop-carried tensors.
+
+  Args:
+    condition: a Python function that builds the loop condition.
+    body: a Python function that builds the loop body.
+    inputs: a list of initial values passed into the training loop, or
+      None (equivalent to an empty list).
+    infeed_queue: if not None, the infeed queue from which to append a tuple
+      of arguments as inputs to condition.
+    name: an optional name for the loop.
+
+  Returns:
+    The final values of the loop-carried tensors.
+
+  Raises:
+    TypeError: if body or condition has the wrong signature.
+  """
+
+  # Converts inputs to Tensors.
+  inputs = [] if inputs is None else [ops.convert_to_tensor(x) for
+                                      x in inputs]
+  input_types = [x.dtype for x in inputs]
+  input_arity = len(inputs)
+
+  body_arg_error = tpu_function.check_function_argument_count(
+      body, input_arity, infeed_queue)
+  if body_arg_error is not None:
+    if infeed_queue is None:
+      raise TypeError(
+          "Supplied loop body function cannot be called with the specified "
+          "inputs. You specified %d inputs: %s, but the loop body needs %s" % (
+              input_arity, str([i.name for i in inputs]), body_arg_error))
+    else:
+      raise TypeError(
+          "Supplied loop body function cannot be called with the specified "
+          "inputs. You specified %d inputs: %s and %d additional inputs from "
+          "infeed, but the computation needs %s" % (input_arity, str(
+              [i.name for i in inputs]), infeed_queue.number_of_tuple_elements,
+                                                    body_arg_error))
+  condition_arg_error = tpu_function.check_function_argument_count(
+      condition, input_arity, None)
+  if condition_arg_error is not None:
+    if infeed_queue is None:
+      raise TypeError(
+          "Supplied loop condition function cannot be called with the "
+          "specified inputs. You specified %d inputs: %s, but the loop "
+          "condition needs %s" % (input_arity, str([i.name for i in inputs]),
+                                  condition_arg_error))
+    else:
+      raise TypeError(
+          "Supplied loop condition function cannot be called with the "
+          "specified inputs. You specified %d inputs: %s, but the loop "
+          "condition needs %s. Note that infeed is not passed to the loop "
+          "condition." % (input_arity, str([i.name for i in inputs]),
+                          condition_arg_error))
+
+  def condition_wrapper(*inputs):
+    # Discards the dummy output added for arity-0 loops.
+    if input_arity == 0:
+      inputs = []
+    return condition(*inputs)
+
+  def body_wrapper(*inputs):
+    """Wrapper around `body` that handles infeed queues and control deps."""
+    inputs = list(inputs)
+
+    # Discards the dummy output added for arity-0 loops.
+    if input_arity == 0:
+      inputs = []
+
+    # Runs `body` with the dequeue_ops appended.
+    if infeed_queue:
+      number_of_shards = tpu_function.get_tpu_context().number_of_shards
+      if number_of_shards is None:
+        raise ValueError("Can't build training loop with infeed when there is "
+                         "no tpu_shard_context. Are you building a loop or "
+                         "graph directly rather than from inside tpu.rewrite, "
+                         "tpu.batch_parallel, tpu.shard, or tpu.replicate?")
+      infeed_queue.set_number_of_shards(number_of_shards)
+      dequeue_ops = [d for d in infeed_queue.generate_dequeue_op()]
+    else:
+      dequeue_ops = []
+    outputs = body(*(inputs + dequeue_ops))
+
+    # If the computation only returned one value, make it a tuple.
+    if not isinstance(outputs, (list, tuple)):
+      outputs = (outputs,)
+
+    outputs = [
+        o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
+        for o in outputs
+    ]
+
+    # Separates the returned Operations and Tensors.
+    output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
+    output_tensors = [o for o in outputs
+                      if not isinstance(o, ops.Operation)]
+
+    if outputs != output_tensors + output_operations:
+      raise ValueError(
+          "TPU training loop body must return zero or more Tensor values "
+          "followed by zero or more Operations.")
+
+    output_types = [op.dtype for op in output_tensors]
+    if input_types != output_types:
+      raise TypeError(
+          "Mismatch between input types and output types for training loop "
+          "body: {} vs {}".format(input_types, output_types))
+
+    # Add the dequeue operations to output_operations to ensure they are run
+    # by the loop, even if the programmer's loop body does not use them.
+    output_operations += dequeue_ops
+
+    # Add a dummy output, if needed.
+    if not output_tensors:
+      output_tensors = array_ops.constant(0)
+
+    if output_operations:
+      # TODO(phawkins): in principle this is too restrictive since it serializes
+      # the training loop steps. In practice it does not matter since this loop
+      # will be compiled by XLA.
+      return control_flow_ops.tuple(output_tensors,
+                                    control_inputs=output_operations)
+    else:
+      return output_tensors
+
+  # If the body has arity 0, add a dummy loop-carried value to which we can add
+  # control dependencies from any side-effecting operations.
+  if input_arity == 0:
+    inputs = [array_ops.constant(0)]
+  return control_flow_ops.while_loop(condition_wrapper, body_wrapper, inputs,
+                                     name=name)
+
+
+def repeat(n, body, inputs=None, infeed_queue=None, name=None):
+  """Builds a training loop that executes a fixed number of interations.
+
+  The set of loop-carried tensors correspond to `inputs`.
+  `body` must be a function that takes and returns the values of the
+  loop-carried tensors.
+
+  Args:
+    n: the number of loop iterations
+    body: a Python function that builds the loop body.
+    inputs: a list of initial values passed into the training loop or
+      None (equivalent to an empty list).
+    infeed_queue: if not None, the infeed queue from which to append a tuple
+      of arguments as inputs to condition.
+    name: an optional name for the loop.
+  Returns:
+    The final values of the loop-carried tensors.
+  Raises:
+    ValueError: if there is a type error.
+  """
+  def _convert_to_list(xs):
+    if not isinstance(xs, (list, tuple)):
+      return [xs]
+    else:
+      return list(xs)
+
+  def cond(i, *args):
+    del args
+    return i < n
+
+  def body_wrapper(i, *args):
+    return [i + 1] + _convert_to_list(body(*args))
+
+  inputs = [0] if inputs is None else [0] + _convert_to_list(inputs)
+  outputs = while_loop(
+      cond, body_wrapper, inputs=inputs, infeed_queue=infeed_queue, name=name)
+  outputs = _convert_to_list(outputs)
+  if len(outputs) == 1:
+    # Returns the Op rather than an empty list.
+    return outputs[0].op
+  else:
+    return outputs[1:]
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index 1180ea929946490092a7b5a8229f8aa7806e2b73..086372019caef14df809a2a75f83a7b9332fe2d2 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -41,24 +41,25 @@ py_library(
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
-        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/estimator:inputs_queues",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -74,7 +75,6 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
@@ -91,7 +91,6 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:string_ops",
@@ -107,12 +106,13 @@ py_test(
     tags = ["manual"],
     deps = [
         ":training_py",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
@@ -128,6 +128,7 @@ py_test(
     deps = [
         ":training_py",
         "//tensorflow/python:client_testlib",
+        "@six_archive//:six",
     ],
 )
 
@@ -142,7 +143,6 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
@@ -160,13 +160,12 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:random_seed",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
@@ -186,11 +185,10 @@ py_test(
         ":training_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:data_flow_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:random_seed",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
@@ -206,10 +204,11 @@ py_test(
         ":training_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
     ],
@@ -223,12 +222,9 @@ py_test(
     tags = ["manual"],
     deps = [
         ":training_py",
-        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
@@ -255,11 +251,10 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:random_seed",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
@@ -278,14 +273,11 @@ py_test(
         ":training_py",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/contrib/losses:losses_py",
-        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:random_seed",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
         "//tensorflow/python/ops/losses",
diff --git a/tensorflow/contrib/training/python/training/bucket_ops.py b/tensorflow/contrib/training/python/training/bucket_ops.py
index 7e293da5511fcfc369eb9cb4fe9c68530619a9d1..5523cc375fc20dc167fee0eaa6f1682dc1892c3f 100644
--- a/tensorflow/contrib/training/python/training/bucket_ops.py
+++ b/tensorflow/contrib/training/python/training/bucket_ops.py
@@ -88,7 +88,7 @@ def bucket(tensors,
   This function is implemented using several queues. A `QueueRunner` for the
   queues is added to the current `Graph`'s `QUEUE_RUNNER` collection.
 
-  As the returned tensors are the result of of a dequeue operation, evaluating
+  As the returned tensors are the result of a dequeue operation, evaluating
   them will throw a `tf.errors.OutOfRangeError` when the input queue is
   exhausted.  If these tensors are feeding another input queue, its queue runner
   will catch this exception, however, if they are used in your main thread
diff --git a/tensorflow/contrib/training/python/training/evaluation.py b/tensorflow/contrib/training/python/training/evaluation.py
index 24b733dd29cf0228ec9c5c87a8721fbfb3929574..a895d90b8e5743b8f50a54403d142223994adf7d 100644
--- a/tensorflow/contrib/training/python/training/evaluation.py
+++ b/tensorflow/contrib/training/python/training/evaluation.py
@@ -226,7 +226,7 @@ def checkpoints_iterator(checkpoint_dir,
 
   This behavior gives control to callers on what to do if checkpoints do not
   come fast enough or stop being generated.  For example, if callers have a way
-  to detect that the training has stopped and know that no new new checkpoints
+  to detect that the training has stopped and know that no new checkpoints
   will be generated, they can provide a `timeout_fn` that returns `True` when
   the training has stopped.  If they know that the training is still going on
   they return `False` instead.
diff --git a/tensorflow/contrib/training/python/training/evaluation_test.py b/tensorflow/contrib/training/python/training/evaluation_test.py
index babd2239b67bc2d1227ce90855d16b5ba01d73c3..b07039916c203940039732c12938e7f342fa72a3 100644
--- a/tensorflow/contrib/training/python/training/evaluation_test.py
+++ b/tensorflow/contrib/training/python/training/evaluation_test.py
@@ -329,7 +329,7 @@ class EvaluateRepeatedlyTest(test.TestCase):
     if not gfile.Exists(checkpoint_dir):
       gfile.MakeDirs(checkpoint_dir)
 
-    # We need a variable that that the saver will try to restore.
+    # We need a variable that the saver will try to restore.
     variables.get_or_create_global_step()
 
     # Run with placeholders. If we actually try to evaluate this, we'd fail
@@ -394,7 +394,7 @@ class EvaluateRepeatedlyTest(test.TestCase):
                                   'evaluate_with_eval_feed_dict')
     self._train_model(checkpoint_dir, num_steps=1)
 
-    # We need a variable that that the saver will try to restore.
+    # We need a variable that the saver will try to restore.
     variables.get_or_create_global_step()
 
     # Create a variable and an eval op that increments it with a placeholder.
diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index c19a36eabcf7590c55bebde76bf6f50c1fd418e0..a13c7d4b4d965f20b0b0802cab3ff60e24a104f8 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Hyperparameter values."""
 from __future__ import absolute_import
 from __future__ import division
@@ -27,6 +26,116 @@ from tensorflow.contrib.training.python.training import hparam_pb2
 from tensorflow.python.framework import ops
 from tensorflow.python.util import compat
 
+# Define the regular expression for parsing a single clause of the input
+# (delimited by commas).  A legal clause looks like:
+#   <variable name>[<index>]? = <rhs>
+# where <rhs> is either a single token or [] enclosed list of tokens.
+# For example:  "var[1] = a" or "x = [1,2,3]"
+PARAM_RE = re.compile(r"""
+  (?P<name>[a-zA-Z][\w]*)      # variable name: "var" or "x"
+  (\[\s*(?P<index>\d+)\s*\])?  # (optional) index: "1" or None
+  \s*=\s*
+  ((?P<val>[^,\[]*)            # single value: "a" or None
+   |
+   \[(?P<vals>[^\]]*)\])       # list of values: None or "1,2,3"
+  ($|,)""", re.VERBOSE)
+
+
+def _parse_fail(name, var_type, value, values):
+  """Helper function for raising a value error for bad assignment."""
+  raise ValueError(
+      'Could not parse hparam \'%s\' of type \'%s\' with value \'%s\' in %s' %
+      (name, var_type.__name__, value, values))
+
+
+def _reuse_fail(name, values):
+  """Helper function for raising a value error for reuse of name."""
+  raise ValueError('Multiple assignments to variable \'%s\' in %s' % (name,
+                                                                      values))
+
+
+def _process_scalar_value(name, parse_fn, var_type, m_dict, values,
+                          results_dictionary):
+  """Update results_dictionary with a scalar value.
+
+  Used to update the results_dictionary to be returned by parse_values when
+  encountering a clause with a scalar RHS (e.g.  "s=5" or "arr[0]=5".)
+
+  Mutates results_dictionary.
+
+  Args:
+    name: Name of variable in assignment ("s" or "arr").
+    parse_fn: Function for parsing the actual value.
+    var_type: Type of named variable.
+    m_dict: Dictionary constructed from regex parsing.
+      m_dict['val']: RHS value (scalar)
+      m_dict['index']: List index value (or None)
+    values: Full expression being parsed
+    results_dictionary: The dictionary being updated for return by the parsing
+      function.
+
+  Raises:
+    ValueError: If the name has already been sued.
+  """
+  try:
+    parsed_value = parse_fn(m_dict['val'])
+  except ValueError:
+    _parse_fail(name, var_type, m_dict['val'], values)
+
+  # If no index is provided
+  if not m_dict['index']:
+    if name in results_dictionary:
+      _reuse_fail(name, values)
+    results_dictionary[name] = parsed_value
+  else:
+    if name in results_dictionary:
+      # The name has already been used as a scalar, then it
+      # will be in this dictionary and map to a non-dictionary.
+      if not isinstance(results_dictionary.get(name), dict):
+        _reuse_fail(name, values)
+    else:
+      results_dictionary[name] = {}
+
+    index = int(m_dict['index'])
+    # Make sure the index position hasn't already been assigned a value.
+    if index in results_dictionary[name]:
+      _reuse_fail('{}[{}]'.format(name, index), values)
+    results_dictionary[name][index] = parsed_value
+
+
+def _process_list_value(name, parse_fn, var_type, m_dict, values,
+                        results_dictionary):
+  """Update results_dictionary from a list of values.
+
+  Used to update results_dictionary to be returned by parse_values when
+  encountering a clause with a list RHS (e.g.  "arr=[1,2,3]".)
+
+  Mutates results_dictionary.
+
+  Args:
+    name: Name of variable in assignment ("arr").
+    parse_fn: Function for parsing individual values.
+    var_type: Type of named variable.
+    m_dict: Dictionary constructed from regex parsing.
+      m_dict['val']: RHS value (scalar)
+    values: Full expression being parsed
+    results_dictionary: The dictionary being updated for return by the parsing
+      function.
+
+  Raises:
+    ValueError: If the name has an index or the values cannot be parsed.
+  """
+  if m_dict['index'] is not None:
+    raise ValueError('Assignment of a list to a list index.')
+  elements = filter(None, re.split('[ ,]', m_dict['vals']))
+  # Make sure the name hasn't already been assigned a value
+  if name in results_dictionary:
+    raise _reuse_fail(name, values)
+  try:
+    results_dictionary[name] = [parse_fn(e) for e in elements]
+  except ValueError:
+    _parse_fail(name, var_type, m_dict['vals'], values)
+
 
 def parse_values(values, type_map):
   """Parses hyperparameter values from a string into a python map..
@@ -35,8 +144,11 @@ def parse_values(values, type_map):
   For each pair, the value of the hyperparameter named `name` is set to
   `value`.
 
-  If a hyperparameter name appears multiple times in `values`, the last
-  value is used.
+  If a hyperparameter name appears multiple times in `values`, a ValueError
+  is raised (e.g. 'a=1,a=2', 'a[1]=1,a[1]=2').
+
+  If a hyperparameter name in both an index assignment and scalar assignment,
+  a ValueError is raised.  (e.g. 'a=[1,2,3],a[0] = 1').
 
   The `value` in `name=value` must follows the syntax according to the
   type of the parameter:
@@ -49,7 +161,11 @@ def parse_values(values, type_map):
   *  Scalar string: A non-empty sequence of characters, excluding comma,
      spaces, and square brackets.  E.g.: foo, bar_1.
   *  List: A comma separated list of scalar values of the parameter type
-     enclosed in square backets.  E.g.: [1,2,3], [1.0,1e-12], [high,low].
+     enclosed in square brackets.  E.g.: [1,2,3], [1.0,1e-12], [high,low].
+
+  When index assignment is used, the corresponding type_map key should be the
+  list name.  E.g. for "arr[1]=0" the type_map must have the key "arr" (not
+  "arr[1]").
 
   Args:
     values: String.  Comma separated list of `name=value` pairs where
@@ -62,19 +178,23 @@ def parse_values(values, type_map):
       'x=[0.1,0.2]' will parse successfully if type_map['x'] = float.
 
   Returns:
-    A python map containing the name, value pairs.
+    A python map mapping each name to either:
+    * A scalar value.
+    * A list of scalar values.
+    * A dictionary mapping index numbers to scalar values.
+    (e.g. "x=5,L=[1,2],arr[1]=3" results in {'x':5,'L':[1,2],'arr':{1:3}}")
 
   Raises:
-    ValueError: If `values` cannot be parsed.
-
+    ValueError: If there is a problem with input.
+    * If `values` cannot be parsed.
+    * If a list is assigned to a list index (e.g. 'a[1] = [1,2,3]').
+    * If the same rvalue is assigned two different values (e.g. 'a=1,a=2',
+      'a[1]=1,a[1]=2', or 'a=1,a=[1]')
   """
-  ret = {}
-  param_re = re.compile(
-      r'(?P<name>[a-zA-Z][\w]*)\s*=\s*'
-      r'((?P<val>[^,\[]*)|\[(?P<vals>[^\]]*)\])($|,)')
+  results_dictionary = {}
   pos = 0
   while pos < len(values):
-    m = param_re.match(values, pos)
+    m = PARAM_RE.match(values, pos)
     if not m:
       raise ValueError('Malformed hyperparameter value: %s' % values[pos:])
     # Check that there is a comma between parameters and move past it.
@@ -85,38 +205,39 @@ def parse_values(values, type_map):
     if name not in type_map:
       raise ValueError('Unknown hyperparameter type for %s' % name)
     type_ = type_map[name]
-    def parse_fail(value):
-      raise ValueError(
-          'Could not parse hparam \'%s\' of type \'%s\' with value \'%s\' in %s'
-          % (name, type_.__name__, value, values))
+
+    # Set up correct parsing function (depending on whether type_ is a bool)
     if type_ == bool:
+
       def parse_bool(value):
-        if value == 'true':
+        if value in ['true', 'True']:
           return True
-        elif value == 'false':
+        elif value in ['false', 'False']:
           return False
         else:
           try:
             return bool(int(value))
-          except (ValueError, TypeError):
-            parse_fail(value)
+          except ValueError:
+            _parse_fail(name, type_, value, values)
+
       parse = parse_bool
     else:
       parse = type_
+
+    # If a singe value is provided
     if m_dict['val'] is not None:
-      try:
-        ret[name] = parse(m_dict['val'])
-      except (ValueError, TypeError):
-        parse_fail(m_dict['val'])
+      _process_scalar_value(name, parse, type_, m_dict, values,
+                            results_dictionary)
+
+    # If the assigned value is a list:
     elif m_dict['vals'] is not None:
-      elements = filter(None, re.split('[ ,]', m_dict['vals']))
-      try:
-        ret[name] = [parse(e) for e in elements]
-      except (ValueError, TypeError):
-        parse_fail(m_dict['vals'])
-    else:
-      parse_fail('')
-  return ret
+      _process_list_value(name, parse, type_, m_dict, values,
+                          results_dictionary)
+
+    else:  # Not assigned a list or value
+      _parse_fail(name, type_, '', values)
+
+  return results_dictionary
 
 
 class HParams(object):
@@ -273,8 +394,8 @@ class HParams(object):
         elif kind.startswith('bytes'):
           # Setting attribute value to be 'str' to ensure the type is compatible
           # with both Python2 and Python3. UTF-8 encoding is assumed.
-          self.add_hparam(name, [compat.as_str(v)
-                                 for v in getattr(value, kind).value])
+          self.add_hparam(
+              name, [compat.as_str(v) for v in getattr(value, kind).value])
         else:
           self.add_hparam(name, [v for v in getattr(value, kind).value])
 
@@ -296,8 +417,8 @@ class HParams(object):
       raise ValueError('Hyperparameter name is reserved: %s' % name)
     if isinstance(value, (list, tuple)):
       if not value:
-        raise ValueError('Multi-valued hyperparameters cannot be empty: %s'
-                         % name)
+        raise ValueError(
+            'Multi-valued hyperparameters cannot be empty: %s' % name)
       self._hparam_types[name] = (type(value[0]), True)
     else:
       self._hparam_types[name] = (type(value), False)
@@ -461,7 +582,8 @@ class HParams(object):
     return HParams(hparam_def=hparam_def)
 
 
-ops.register_proto_function('hparams',
-                            proto_type=hparam_pb2.HParamDef,
-                            to_proto=HParams.to_proto,
-                            from_proto=HParams.from_proto)
+ops.register_proto_function(
+    'hparams',
+    proto_type=hparam_pb2.HParamDef,
+    to_proto=HParams.to_proto,
+    from_proto=HParams.from_proto)
diff --git a/tensorflow/contrib/training/python/training/hparam_test.py b/tensorflow/contrib/training/python/training/hparam_test.py
index 0b900e65d8a54bc648edcf66e75a65893b7ebea5..6f807236c7d4d0d88231cf287ed36b9528ae9a7f 100644
--- a/tensorflow/contrib/training/python/training/hparam_test.py
+++ b/tensorflow/contrib/training/python/training/hparam_test.py
@@ -12,14 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Tests for hparam."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
-
 from tensorflow.contrib.training.python.training import hparam
 
 from tensorflow.python.platform import test
@@ -27,50 +24,49 @@ from tensorflow.python.platform import test
 
 class HParamsTest(test.TestCase):
 
-  def _assertDictEquals(self, d1, d2):
-    self.assertEqual(len(d1), len(d2))
-    for k, v in six.iteritems(d1):
-      self.assertTrue(k in d2, k)
-      self.assertEquals(v, d2[k], d2[k])
-
   def testEmpty(self):
     hparams = hparam.HParams()
-    self._assertDictEquals({}, hparams.values())
+    self.assertDictEqual({}, hparams.values())
     hparams.parse('')
-    self._assertDictEquals({}, hparams.values())
+    self.assertDictEqual({}, hparams.values())
     with self.assertRaisesRegexp(ValueError, 'Unknown hyperparameter'):
       hparams.parse('xyz=123')
 
   def testSomeValues(self):
     hparams = hparam.HParams(aaa=1, b=2.0, c_c='relu6')
-    self._assertDictEquals(
-        {'aaa': 1, 'b': 2.0, 'c_c': 'relu6'}, hparams.values())
+    self.assertDictEqual({'aaa': 1, 'b': 2.0, 'c_c': 'relu6'}, hparams.values())
     expected_str = '[(\'aaa\', 1), (\'b\', 2.0), (\'c_c\', \'relu6\')]'
-    self.assertEquals(expected_str, str(hparams.__str__()))
-    self.assertEquals(expected_str, str(hparams))
-    self.assertEquals(1, hparams.aaa)
-    self.assertEquals(2.0, hparams.b)
-    self.assertEquals('relu6', hparams.c_c)
+    self.assertEqual(expected_str, str(hparams.__str__()))
+    self.assertEqual(expected_str, str(hparams))
+    self.assertEqual(1, hparams.aaa)
+    self.assertEqual(2.0, hparams.b)
+    self.assertEqual('relu6', hparams.c_c)
     hparams.parse('aaa=12')
-    self._assertDictEquals(
-        {'aaa': 12, 'b': 2.0, 'c_c': 'relu6'}, hparams.values())
-    self.assertEquals(12, hparams.aaa)
-    self.assertEquals(2.0, hparams.b)
-    self.assertEquals('relu6', hparams.c_c)
+    self.assertDictEqual({
+        'aaa': 12,
+        'b': 2.0,
+        'c_c': 'relu6'
+    }, hparams.values())
+    self.assertEqual(12, hparams.aaa)
+    self.assertEqual(2.0, hparams.b)
+    self.assertEqual('relu6', hparams.c_c)
     hparams.parse('c_c=relu4,b=-2.0e10')
-    self._assertDictEquals({'aaa': 12, 'b': -2.0e10, 'c_c': 'relu4'},
-                           hparams.values())
-    self.assertEquals(12, hparams.aaa)
-    self.assertEquals(-2.0e10, hparams.b)
-    self.assertEquals('relu4', hparams.c_c)
+    self.assertDictEqual({
+        'aaa': 12,
+        'b': -2.0e10,
+        'c_c': 'relu4'
+    }, hparams.values())
+    self.assertEqual(12, hparams.aaa)
+    self.assertEqual(-2.0e10, hparams.b)
+    self.assertEqual('relu4', hparams.c_c)
     hparams.parse('c_c=,b=0,')
-    self._assertDictEquals({'aaa': 12, 'b': 0, 'c_c': ''}, hparams.values())
-    self.assertEquals(12, hparams.aaa)
-    self.assertEquals(0.0, hparams.b)
-    self.assertEquals('', hparams.c_c)
+    self.assertDictEqual({'aaa': 12, 'b': 0, 'c_c': ''}, hparams.values())
+    self.assertEqual(12, hparams.aaa)
+    self.assertEqual(0.0, hparams.b)
+    self.assertEqual('', hparams.c_c)
     hparams.parse('c_c=2.3",b=+2,')
-    self.assertEquals(2.0, hparams.b)
-    self.assertEquals('2.3"', hparams.c_c)
+    self.assertEqual(2.0, hparams.b)
+    self.assertEqual('2.3"', hparams.c_c)
     with self.assertRaisesRegexp(ValueError, 'Unknown hyperparameter'):
       hparams.parse('x=123')
     with self.assertRaisesRegexp(ValueError, 'Could not parse'):
@@ -83,34 +79,34 @@ class HParamsTest(test.TestCase):
       hparams.parse('b=relu')
     with self.assertRaisesRegexp(ValueError, 'Must not pass a list'):
       hparams.parse('aaa=[123]')
-    self.assertEquals(12, hparams.aaa)
-    self.assertEquals(2.0, hparams.b)
-    self.assertEquals('2.3"', hparams.c_c)
+    self.assertEqual(12, hparams.aaa)
+    self.assertEqual(2.0, hparams.b)
+    self.assertEqual('2.3"', hparams.c_c)
     # Exports to proto.
     hparam_def = hparams.to_proto()
     # Imports from proto.
     hparams2 = hparam.HParams(hparam_def=hparam_def)
     # Verifies that all hparams are restored.
-    self.assertEquals(12, hparams2.aaa)
-    self.assertEquals(2.0, hparams2.b)
-    self.assertEquals('2.3"', hparams2.c_c)
+    self.assertEqual(12, hparams2.aaa)
+    self.assertEqual(2.0, hparams2.b)
+    self.assertEqual('2.3"', hparams2.c_c)
 
   def testBoolParsing(self):
-    for value in 'true', 'false', '1', '0':
+    for value in 'true', 'false', 'True', 'False', '1', '0':
       for initial in False, True:
         hparams = hparam.HParams(use_gpu=initial)
         hparams.parse('use_gpu=' + value)
-        self.assertEqual(hparams.use_gpu, value in ['true', '1'])
+        self.assertEqual(hparams.use_gpu, value in ['True', 'true', '1'])
 
         # Exports to proto.
         hparam_def = hparams.to_proto()
         # Imports from proto.
         hparams2 = hparam.HParams(hparam_def=hparam_def)
-        self.assertEquals(hparams.use_gpu, hparams2.use_gpu)
+        self.assertEqual(hparams.use_gpu, hparams2.use_gpu)
         # Check that hparams2.use_gpu is a bool rather than an int.
-        # The assertEquals() call above won't catch this, since
+        # The assertEqual() call above won't catch this, since
         # (0 == False) and (1 == True) in Python.
-        self.assertEquals(bool, type(hparams2.use_gpu))
+        self.assertEqual(bool, type(hparams2.use_gpu))
 
   def testBoolParsingFail(self):
     hparams = hparam.HParams(use_gpu=True)
@@ -119,24 +115,27 @@ class HParamsTest(test.TestCase):
 
   def testLists(self):
     hparams = hparam.HParams(aaa=[1], b=[2.0, 3.0], c_c=['relu6'])
-    self._assertDictEquals({'aaa': [1], 'b': [2.0, 3.0], 'c_c': ['relu6']},
-                           hparams.values())
-    self.assertEquals([1], hparams.aaa)
-    self.assertEquals([2.0, 3.0], hparams.b)
-    self.assertEquals(['relu6'], hparams.c_c)
+    self.assertDictEqual({
+        'aaa': [1],
+        'b': [2.0, 3.0],
+        'c_c': ['relu6']
+    }, hparams.values())
+    self.assertEqual([1], hparams.aaa)
+    self.assertEqual([2.0, 3.0], hparams.b)
+    self.assertEqual(['relu6'], hparams.c_c)
     hparams.parse('aaa=[12]')
-    self.assertEquals([12], hparams.aaa)
+    self.assertEqual([12], hparams.aaa)
     hparams.parse('aaa=[12,34,56]')
-    self.assertEquals([12, 34, 56], hparams.aaa)
+    self.assertEqual([12, 34, 56], hparams.aaa)
     hparams.parse('c_c=[relu4,relu12],b=[1.0]')
-    self.assertEquals(['relu4', 'relu12'], hparams.c_c)
-    self.assertEquals([1.0], hparams.b)
+    self.assertEqual(['relu4', 'relu12'], hparams.c_c)
+    self.assertEqual([1.0], hparams.b)
     hparams.parse('c_c=[],aaa=[-34]')
-    self.assertEquals([-34], hparams.aaa)
-    self.assertEquals([], hparams.c_c)
+    self.assertEqual([-34], hparams.aaa)
+    self.assertEqual([], hparams.c_c)
     hparams.parse('c_c=[_12,3\'4"],aaa=[+3]')
-    self.assertEquals([3], hparams.aaa)
-    self.assertEquals(['_12', '3\'4"'], hparams.c_c)
+    self.assertEqual([3], hparams.aaa)
+    self.assertEqual(['_12', '3\'4"'], hparams.c_c)
     with self.assertRaisesRegexp(ValueError, 'Unknown hyperparameter'):
       hparams.parse('x=[123]')
     with self.assertRaisesRegexp(ValueError, 'Could not parse'):
@@ -154,31 +153,129 @@ class HParamsTest(test.TestCase):
     # Imports from proto.
     hparams2 = hparam.HParams(hparam_def=hparam_def)
     # Verifies that all hparams are restored.
-    self.assertEquals([3], hparams2.aaa)
-    self.assertEquals([1.0], hparams2.b)
-    self.assertEquals(['_12', '3\'4"'], hparams2.c_c)
+    self.assertEqual([3], hparams2.aaa)
+    self.assertEqual([1.0], hparams2.b)
+    self.assertEqual(['_12', '3\'4"'], hparams2.c_c)
+
+  def testParseValuesWithIndexAssigment1(self):
+    """Assignment to an index position."""
+    parse_dict = hparam.parse_values('arr[1]=10', {'arr': int})
+    self.assertEqual(len(parse_dict), 1)
+    self.assertTrue(isinstance(parse_dict['arr'], dict))
+    self.assertDictEqual(parse_dict['arr'], {1: 10})
+
+  def testParseValuesWithIndexAssigment2(self):
+    """Assignment to multiple index positions."""
+    parse_dict = hparam.parse_values('arr[0]=10,arr[5]=20', {'arr': int})
+    self.assertEqual(len(parse_dict), 1)
+    self.assertTrue(isinstance(parse_dict['arr'], dict))
+    self.assertDictEqual(parse_dict['arr'], {0: 10, 5: 20})
+
+  def testParseValuesWithIndexAssigment3(self):
+    """Assignment to index positions in multiple names."""
+    parse_dict = hparam.parse_values('arr[0]=10,arr[1]=20,L[5]=100,L[10]=200',
+                                     {'arr': int,
+                                      'L': int})
+    self.assertEqual(len(parse_dict), 2)
+    self.assertTrue(isinstance(parse_dict['arr'], dict))
+    self.assertDictEqual(parse_dict['arr'], {0: 10, 1: 20})
+    self.assertTrue(isinstance(parse_dict['L'], dict))
+    self.assertDictEqual(parse_dict['L'], {5: 100, 10: 200})
+
+  def testParseValuesWithIndexAssigment4(self):
+    """Assignment of index positions and scalars."""
+    parse_dict = hparam.parse_values('x=10,arr[1]=20,y=30',
+                                     {'x': int,
+                                      'y': int,
+                                      'arr': int})
+    self.assertEqual(len(parse_dict), 3)
+    self.assertTrue(isinstance(parse_dict['arr'], dict))
+    self.assertDictEqual(parse_dict['arr'], {1: 20})
+    self.assertEqual(parse_dict['x'], 10)
+    self.assertEqual(parse_dict['y'], 30)
+
+  def testParseValuesWithIndexAssigment5(self):
+    """Different variable types."""
+    parse_dict = hparam.parse_values('a[0]=5,b[1]=true,c[2]=abc,d[3]=3.14', {
+        'a': int,
+        'b': bool,
+        'c': str,
+        'd': float
+    })
+    self.assertEqual(set(parse_dict.keys()), {'a', 'b', 'c', 'd'})
+    self.assertTrue(isinstance(parse_dict['a'], dict))
+    self.assertDictEqual(parse_dict['a'], {0: 5})
+    self.assertTrue(isinstance(parse_dict['b'], dict))
+    self.assertDictEqual(parse_dict['b'], {1: True})
+    self.assertTrue(isinstance(parse_dict['c'], dict))
+    self.assertDictEqual(parse_dict['c'], {2: 'abc'})
+    self.assertTrue(isinstance(parse_dict['d'], dict))
+    self.assertDictEqual(parse_dict['d'], {3: 3.14})
+
+  def testParseValuesWithBadIndexAssigment1(self):
+    """Reject assignment of list to variable type."""
+    with self.assertRaisesRegexp(ValueError,
+                                 r'Assignment of a list to a list index.'):
+      hparam.parse_values('arr[1]=[1,2,3]', {'arr': int})
+
+  def testParseValuesWithBadIndexAssigment2(self):
+    """Reject if type missing."""
+    with self.assertRaisesRegexp(ValueError,
+                                 r'Unknown hyperparameter type for arr'):
+      hparam.parse_values('arr[1]=5', {})
+
+  def testParseValuesWithBadIndexAssigment3(self):
+    """Reject type of the form name[index]."""
+    with self.assertRaisesRegexp(ValueError,
+                                 'Unknown hyperparameter type for arr'):
+      hparam.parse_values('arr[1]=1', {'arr[1]': int})
+
+  def testWithReusedVariables(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'Multiple assignments to variable \'x\''):
+      hparam.parse_values('x=1,x=1', {'x': int})
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'Multiple assignments to variable \'arr\''):
+      hparam.parse_values('arr=[100,200],arr[0]=10', {'arr': int})
+
+    with self.assertRaisesRegexp(
+        ValueError, r'Multiple assignments to variable \'arr\[0\]\''):
+      hparam.parse_values('arr[0]=10,arr[0]=20', {'arr': int})
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'Multiple assignments to variable \'arr\''):
+      hparam.parse_values('arr[0]=10,arr=[100]', {'arr': int})
 
   def testJson(self):
     hparams = hparam.HParams(aaa=1, b=2.0, c_c='relu6', d=True)
-    self._assertDictEquals(
-        {'aaa': 1, 'b': 2.0, 'c_c': 'relu6', 'd': True}, hparams.values())
-    self.assertEquals(1, hparams.aaa)
-    self.assertEquals(2.0, hparams.b)
-    self.assertEquals('relu6', hparams.c_c)
+    self.assertDictEqual({
+        'aaa': 1,
+        'b': 2.0,
+        'c_c': 'relu6',
+        'd': True
+    }, hparams.values())
+    self.assertEqual(1, hparams.aaa)
+    self.assertEqual(2.0, hparams.b)
+    self.assertEqual('relu6', hparams.c_c)
     hparams.parse_json('{"aaa": 12, "b": 3.0, "c_c": "relu4", "d": false}')
-    self._assertDictEquals(
-        {'aaa': 12, 'b': 3.0, 'c_c': 'relu4', 'd': False}, hparams.values())
-    self.assertEquals(12, hparams.aaa)
-    self.assertEquals(3.0, hparams.b)
-    self.assertEquals('relu4', hparams.c_c)
+    self.assertDictEqual({
+        'aaa': 12,
+        'b': 3.0,
+        'c_c': 'relu4',
+        'd': False
+    }, hparams.values())
+    self.assertEqual(12, hparams.aaa)
+    self.assertEqual(3.0, hparams.b)
+    self.assertEqual('relu4', hparams.c_c)
 
     json_str = hparams.to_json()
     hparams2 = hparam.HParams(aaa=10, b=20.0, c_c='hello', d=False)
     hparams2.parse_json(json_str)
-    self.assertEquals(12, hparams2.aaa)
-    self.assertEquals(3.0, hparams2.b)
-    self.assertEquals('relu4', hparams2.c_c)
-    self.assertEquals(False, hparams2.d)
+    self.assertEqual(12, hparams2.aaa)
+    self.assertEqual(3.0, hparams2.b)
+    self.assertEqual('relu4', hparams2.c_c)
+    self.assertEqual(False, hparams2.d)
 
   def testNonProtoFails(self):
     with self.assertRaisesRegexp(AssertionError, ''):
diff --git a/tensorflow/contrib/util/BUILD b/tensorflow/contrib/util/BUILD
index 5ad8e3dd358240bb70657e6132534fcb9d576a65..5575fb35702698b753fc5cf5d904b78a2b1e9483 100644
--- a/tensorflow/contrib/util/BUILD
+++ b/tensorflow/contrib/util/BUILD
@@ -27,7 +27,6 @@ cc_binary(
     srcs = ["convert_graphdef_memmapped_format.cc"],
     deps = [
         ":convert_graphdef_memmapped_format_lib",
-        "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
     ],
@@ -69,6 +68,7 @@ py_library(
     deps = [
         "//tensorflow/python:framework",
         "//tensorflow/python:platform",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
     ],
 )
diff --git a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc
index 1f079027efb624b009c26c235a0b1a97ba1c36b4..2992a61ea8186caada394208e9c27ddffe896dd1 100644
--- a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc
+++ b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc
@@ -15,10 +15,13 @@ limitations under the License.
 #include "tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.h"
 
 #include <unordered_set>
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/kernels/immutable_constant_op.h"
 #include "tensorflow/core/platform/env.h"
diff --git a/tensorflow/contrib/verbs/BUILD b/tensorflow/contrib/verbs/BUILD
index e747fa4c9e481064226a2f58356d1d4ade4a740d..173a65a7eb66678e310c67953fe969310c417257 100644
--- a/tensorflow/contrib/verbs/BUILD
+++ b/tensorflow/contrib/verbs/BUILD
@@ -66,9 +66,8 @@ cc_library(
         ":grpc_verbs_service_impl",
         ":rdma_mgr",
         ":verbs_service_proto_cc",
-        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core/distributed_runtime:session_mgr",
-        "//tensorflow/core/distributed_runtime:worker_env",
         "//tensorflow/core/distributed_runtime/rpc:async_service_interface",
         "//tensorflow/core/distributed_runtime/rpc:grpc_call",
         "//tensorflow/core/distributed_runtime/rpc:grpc_util",
@@ -107,8 +106,9 @@ cc_library(
     hdrs = ["rdma_rendezvous_mgr.h"],
     deps = [
         ":rdma_mgr",
+        ":verbs_util",
         "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_runtime",
         "//tensorflow/core:lib",
         "//tensorflow/core/distributed_runtime:base_rendezvous_mgr",
         "//tensorflow/core/distributed_runtime:worker_env",
@@ -122,10 +122,11 @@ cc_library(
     deps = [
         ":grpc_verbs_client",
         ":rdma",
+        ":verbs_service_proto_cc",
         "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core/distributed_runtime:session_mgr",
         "//tensorflow/core/distributed_runtime:worker_env",
         "//tensorflow/core/distributed_runtime/rpc:grpc_channel",
         "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache",
@@ -144,6 +145,7 @@ cc_library(
         ":verbs_util",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
         "//tensorflow/core:gpu_runtime",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -162,6 +164,8 @@ cc_library(
         ":grpc_verbs_service",
         ":rdma_mgr",
         ":rdma_rendezvous_mgr",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
     ],
     alwayslink = 1,
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service_impl.cc b/tensorflow/contrib/verbs/grpc_verbs_service_impl.cc
index e0ba78dbfd537f6ae56627c42d5d302a5fbfbd36..a1fbea57dd1202c1a22e6b3570e9378555fe3498 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service_impl.cc
+++ b/tensorflow/contrib/verbs/grpc_verbs_service_impl.cc
@@ -43,20 +43,22 @@ VerbsService::Stub::Stub(
     const std::shared_ptr< ::grpc::ChannelInterface>& channel)
     : channel_(channel),
       rpcmethod_GetRemoteAddress_(grpcVerbsService_method_names[0],
-                                  ::grpc::RpcMethod::NORMAL_RPC, channel) {}
+                                  ::grpc::internal::RpcMethod::NORMAL_RPC,
+                                  channel) {}
 
 ::grpc::Status VerbsService::Stub::GetRemoteAddress(
     ::grpc::ClientContext* context, const GetRemoteAddressRequest& request,
     GetRemoteAddressResponse* response) {
-  return ::grpc::BlockingUnaryCall(channel_.get(), rpcmethod_GetRemoteAddress_,
-                                   context, request, response);
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_GetRemoteAddress_, context, request, response);
 }
 
 VerbsService::AsyncService::AsyncService() {
   for (int i = 0; i < 1; ++i) {
-    AddMethod(new ::grpc::RpcServiceMethod(grpcVerbsService_method_names[i],
-                                           ::grpc::RpcMethod::NORMAL_RPC,
-                                           nullptr));
+    AddMethod(new ::grpc::internal::RpcServiceMethod(
+        grpcVerbsService_method_names[i],
+        ::grpc::internal::RpcMethod::NORMAL_RPC,
+        nullptr));
     ::grpc::Service::MarkMethodAsync(i);
   }
 }
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service_impl.h b/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
index f7ea774b661e70a1cd63d844f70f77b9c5bd10a2..86431ca030c38c56155801202714ee4a49b764df 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
+++ b/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
@@ -61,7 +61,7 @@ class VerbsService GRPC_FINAL {
 
    private:
     std::shared_ptr< ::grpc::ChannelInterface> channel_;
-    const ::grpc::RpcMethod rpcmethod_GetRemoteAddress_;
+    const ::grpc::internal::RpcMethod rpcmethod_GetRemoteAddress_;
   };
   static std::unique_ptr<Stub> NewStub(
       const std::shared_ptr< ::grpc::ChannelInterface>& channel,
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
index bc687be0abb612825a4e1347fda0456c14a91d00..445cbe290a127dcd13151d1c63921de495319af2 100644
--- a/tensorflow/contrib/verbs/rdma.cc
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
 #include "tensorflow/core/framework/rendezvous.h"
@@ -683,7 +684,6 @@ void RdmaTensorBuffer::SendNextItem() {
                          << " error message: " << status.error_message();
       size_t buffer_size = RdmaMessage::kMessageTotalBytes;
       size_t tensor_bytes = 0;
-      TensorProto proto;
       // Figures out which device the tensor is hosted on.
       Device* src_dev = nullptr;
       Status s = channel_->adapter_->worker_env_->device_mgr->LookupDevice(
@@ -703,21 +703,47 @@ void RdmaTensorBuffer::SendNextItem() {
       CHECK(s.ok()) << "dst device not found";
       AllocatorAttributes dst_alloc_attr;
       dst_alloc_attr.set_on_host(true);
+
+      bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
       // string tensor needs to be serialized
+      Tensor copy;
+      StringPiece copy_buf;
+      TensorProto proto;
       if (src_dev->tensorflow_gpu_device_info() &&
           (!send_args.alloc_attrs.on_host())) {
         CHECK(send_args.device_context)
-            << "send dev name: " << src_dev->name()
-            << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
-        // "val" is on a GPU. Uses GPUUtil to fill the proto.
-        s = VerbsUtil::SetProtoFromGPUSync(
-            in, src_dev, send_args.device_context, &proto, is_dead);
-        CHECK(s.ok()) << "set proto from gpu sync";
+          << "send dev name: " << src_dev->name()
+          << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
+
+        if (can_memcpy) {
+          AllocatorAttributes host_alloc_attrs;
+          host_alloc_attrs.set_gpu_compatible(true);
+          host_alloc_attrs.set_on_host(true);
+          Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
+          copy = Tensor(alloc, in.dtype(), in.shape());
+          s = VerbsUtil::CopyGPUTensorToCPUSync(
+              src_dev, send_args.device_context, &in, &copy);
+          CHECK(s.ok()) << "copy tensor from gpu sync";
+          copy_buf = copy.tensor_data();
+        } else {
+          // "val" is on a GPU. Uses GPUUtil to fill the proto.
+          s = VerbsUtil::SetProtoFromGPUSync(
+              in, src_dev, send_args.device_context, &proto, is_dead);
+          CHECK(s.ok()) << "set proto from gpu sync";
+        }
       } else {
         // tensor is in CPU memory.
-        in.AsProtoTensorContent(&proto);
+        if (can_memcpy) {
+          copy_buf = in.tensor_data();
+        } else {
+          in.AsProtoTensorContent(&proto);
+        }
+      }
+      if (can_memcpy) {
+        tensor_bytes = in.TotalBytes();
+      } else {
+        tensor_bytes = proto.ByteSize();
       }
-      tensor_bytes = proto.ByteSize();
       // maybe some margin for string tensor?
       buffer_size += tensor_bytes;
       // prepare message
@@ -735,8 +761,8 @@ void RdmaTensorBuffer::SendNextItem() {
           (buffer_size > size_ && local_status_ == idle &&
            remote_status_ == idle)) {
         if ((local_status_ != none) && (buffer_size > size_)) {
-          CHECK(rm.data_type_ == DT_STRING)
-              << "Only string tensor allows to change size";
+          VLOG(2) << "Extend RDMA buffer from " << size_ << " to "
+                  << buffer_size;
         }
         CreateCPUBuffer(buffer_size, false);
         mu_.unlock();
@@ -756,11 +782,13 @@ void RdmaTensorBuffer::SendNextItem() {
         // local/remote_status_ won't be set back to idle
         // unitl Write() is successful
         mu_.unlock();
-        CHECK((buffer_size == size_ && rm.data_type_ != DT_STRING) ||
-              (buffer_size <= size_ && rm.data_type_ == DT_STRING))
-            << "tensor and buffer size do not agree!"
-            << " buffer_size = " << size_
-            << " requested tensor size = " << buffer_size << in.DebugString();
+        if (!((buffer_size == size_ && rm.data_type_ != DT_STRING) ||
+              (buffer_size <= size_ && rm.data_type_ == DT_STRING))) {
+          VLOG(2) << "Tensor and buffer size do not agree,"
+                  << " buffer_size = " << size_
+                  << " requested tensor size = "
+                  << buffer_size << in.DebugString();
+        }
         uint32_t imm_data = LookupBufferIndex(key);
         rm.type_ = RDMA_MESSAGE_TENSOR_WRITE;
         string message = RdmaMessage::CreateMessage(rm);
@@ -771,7 +799,16 @@ void RdmaTensorBuffer::SendNextItem() {
               static_cast<void*>(static_cast<char*>(buffer_) +
                                  RdmaMessage::kTensorBufferStartIndex);
           CHECK(tensor_bytes + RdmaMessage::kTensorBufferStartIndex <= size_);
-          proto.SerializeToArray(output, tensor_bytes);
+          if (can_memcpy) {
+            CHECK(copy_buf.size() == tensor_bytes)
+               << "unexpected tensor size: "
+               << copy_buf.size()
+               << " != "
+               << tensor_bytes;
+            memcpy(output, copy_buf.data(), tensor_bytes);
+          } else {
+            proto.SerializeToArray(output, tensor_bytes);
+          }
         } else {
           buffer_size = RdmaMessage::kMessageTotalBytes;
         }
diff --git a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
index 5871400f26aecba3db4fb4ce687c5891c1720df3..3ba65107116355e9aec2660ee32bc18b59bd0a4a 100644
--- a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -30,7 +31,7 @@ namespace tensorflow {
 class RdmaRemoteRendezvous : public BaseRemoteRendezvous {
  public:
   RdmaRemoteRendezvous(const WorkerEnv* env, int64 step_id, RdmaMgr* rdma_mgr)
-      : BaseRemoteRendezvous(env, step_id, true), rdma_mgr_(rdma_mgr) {}
+      : BaseRemoteRendezvous(env, step_id), rdma_mgr_(rdma_mgr) {}
 
  protected:
   void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed,
@@ -99,12 +100,40 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync(
     if (!rm.is_dead_) {
       void* input = static_cast<char*>(rb->buffer_) +
                     RdmaMessage::kTensorBufferStartIndex;
-      TensorProto proto;
-      CHECK(rm.tensor_bytes_ + RdmaMessage::kTensorBufferStartIndex <=
-            rb->size_);
-      CHECK(ParseProtoUnlimited(&proto, input, rm.tensor_bytes_))
-          << "fail to parse proto from array";
-      s = dst_dev->MakeTensorFromProto(proto, recv_args.alloc_attrs, &val);
+      bool can_memcpy = DataTypeCanUseMemcpy(rm.data_type_);
+      if (can_memcpy) {
+        if (dst_dev->tensorflow_gpu_device_info() &&
+            (!recv_args.alloc_attrs.on_host())) {
+          CHECK(recv_args.device_context)
+            << "send dev name: " << src_dev->name()
+            << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
+          Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
+          Tensor copy(alloc, rm.data_type_, rm.tensor_shape_);
+          memcpy(DMAHelper::base(&copy), input, rm.tensor_bytes_);
+
+          Allocator* dst_alloc = dst_dev->GetAllocator(recv_args.alloc_attrs);
+          Tensor gpu_copy(dst_alloc, rm.data_type_, rm.tensor_shape_);
+          s = VerbsUtil::CopyCPUTensorToGPUSync(&copy, recv_args.device_context,
+                                                dst_dev, &gpu_copy);
+          CHECK(s.ok()) << "copy tensor to gpu sync";
+          val = std::move(gpu_copy);
+        } else {
+          AllocatorAttributes host_alloc_attrs;
+          host_alloc_attrs.set_gpu_compatible(true);
+          host_alloc_attrs.set_on_host(true);
+          Allocator* alloc = dst_dev->GetAllocator(host_alloc_attrs);
+          Tensor copy(alloc, rm.data_type_, rm.tensor_shape_);
+          memcpy(DMAHelper::base(&copy), input, rm.tensor_bytes_);
+          val = std::move(copy);
+        }
+      } else {
+        TensorProto proto;
+        CHECK(rm.tensor_bytes_ + RdmaMessage::kTensorBufferStartIndex <=
+              rb->size_);
+        CHECK(ParseProtoUnlimited(&proto, input, rm.tensor_bytes_))
+            << "fail to parse proto from array";
+        s = dst_dev->MakeTensorFromProto(proto, recv_args.alloc_attrs, &val);
+      }
     }
 
     rc->RemoveRecvCallback(key_with_step_id);
diff --git a/tensorflow/contrib/verbs/verbs_util.cc b/tensorflow/contrib/verbs/verbs_util.cc
index c3350f7958ce0a2a740332c765e4d566982ee1cf..76e44d34a9f5e8e391f1e35f0610cd79e9be9c5d 100644
--- a/tensorflow/contrib/verbs/verbs_util.cc
+++ b/tensorflow/contrib/verbs/verbs_util.cc
@@ -20,6 +20,40 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 namespace tensorflow {
 
+// static sync wrapper:
+Status VerbsUtil::CopyGPUTensorToCPUSync(Device* gpu_device,
+                              const DeviceContext* device_context,
+                              const Tensor* gpu_tensor,
+                              Tensor* cpu_tensor) {
+  Notification n;
+  Status status;
+  GPUUtil::CopyGPUTensorToCPU(gpu_device, device_context,
+                              gpu_tensor, cpu_tensor,
+                              [&n, &status](const Status& s) {
+                                status = s;
+                                n.Notify();
+                              });
+  n.WaitForNotification();
+  return status;
+}
+
+// static sync wrapper:
+Status VerbsUtil::CopyCPUTensorToGPUSync(const Tensor* cpu_tensor,
+                                         const DeviceContext* device_context,
+                                         Device* gpu_device,
+                                         Tensor* gpu_tensor) {
+  Notification n;
+  Status status;
+  GPUUtil::CopyCPUTensorToGPU(cpu_tensor, device_context,
+                              gpu_device, gpu_tensor,
+                              [&n, &status](const Status& s) {
+                                status = s;
+                                n.Notify();
+                              });
+  n.WaitForNotification();
+  return status;
+}
+
 // static sync wrapper:
 Status VerbsUtil::SetProtoFromGPUSync(const Tensor& tensor, Device* dev,
                                       const DeviceContext* device_context,
diff --git a/tensorflow/contrib/verbs/verbs_util.h b/tensorflow/contrib/verbs/verbs_util.h
index cbc01adae494da761ced327e2b860d2ee383925f..d9da396228dcd2d7341127bbd98d450523c7184f 100644
--- a/tensorflow/contrib/verbs/verbs_util.h
+++ b/tensorflow/contrib/verbs/verbs_util.h
@@ -28,6 +28,16 @@ class TensorProto;
 
 class VerbsUtil {
  public:
+  // synchronous wrapper of CopyGPUTensorToCPU
+  static Status CopyGPUTensorToCPUSync(Device* gpu_device,
+                                       const DeviceContext* device_context,
+                                       const Tensor* gpu_tensor,
+                                       Tensor* cpu_tensor);
+  // synchronous wrapper of CopyCPUTensorToGPU
+  static Status CopyCPUTensorToGPUSync(const Tensor* cpu_tensor,
+                                       const DeviceContext* device_context,
+                                       Device* gpu_device,
+                                       Tensor* gpu_tensor);
   // synchronous wrapper of SetProtoFromGPU
   static Status SetProtoFromGPUSync(const Tensor& tensor, Device* dev,
                                     const DeviceContext* device_context,
diff --git a/tensorflow/contrib/xla_tf_graph/BUILD b/tensorflow/contrib/xla_tf_graph/BUILD
index 0487e7933476330c95902a5bba917907dd88fd5f..fa917ea849a110973c3ecdff410aff1de60d6b64 100644
--- a/tensorflow/contrib/xla_tf_graph/BUILD
+++ b/tensorflow/contrib/xla_tf_graph/BUILD
@@ -44,6 +44,7 @@ cc_test(
     size = "small",
     srcs = ["xla_tf_graph_util_test.cc"],
     linkstatic = 1,
+    tags = ["nomac"],  # b/63908145
     deps = [
         ":xla_tf_graph_util",
         "//tensorflow/cc:cc_ops",
@@ -51,6 +52,7 @@ cc_test(
         "//tensorflow/cc:scope",
         "//tensorflow/compiler/jit:xla_cpu_jit",
         "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/core:core_cpu_internal",
diff --git a/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util_test.cc b/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util_test.cc
index 270e062db648fee4e44622ad40e3fda8e8e35b06..db811bda36e57d30f3474bc64d160c0f80ff006e 100644
--- a/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util_test.cc
+++ b/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -45,10 +46,10 @@ static std::vector<XlaCompiler::Argument> BuildAddGraphArguments() {
   // Difference of dimension will add extra broadcast_dimensions.
   // broadcast_dimension generates an additional HloInstruction
   // in user_computation.cc
-  args[0].shape = TensorShape({2, 2});
+  args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2, 2});
   args[1].kind = XlaCompiler::Argument::kParameter;
   args[1].type = DT_INT32;
-  args[1].shape = TensorShape({2});
+  args[1].shape = xla::ShapeUtil::MakeShape(xla::S32, {2});
   return args;
 }
 
@@ -107,7 +108,7 @@ TEST(XlaTfGraphUtil, ConvertTfGraphToSessionModule) {
   std::vector<XlaCompiler::Argument> args = BuildAddGraphArguments();
   std::unique_ptr<Graph> graph = BuildAddGraph();
 
-  TF_ASSIGN_OR_ASSERT_OK(
+  TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<xla::SessionModule> session_module,
       ConvertTfGraphToXlaSessionModule(args, std::move(graph)));
 
@@ -121,11 +122,11 @@ TEST(XlaTfGraphUtil, ConvertTfGraphToSessionModule) {
 TEST(XlaTfGraphUtil, ConvertXlaSessionModuleToXlaNodes) {
   std::vector<XlaCompiler::Argument> args = BuildAddGraphArguments();
   std::unique_ptr<Graph> graph = BuildAddGraph();
-  TF_ASSIGN_OR_ASSERT_OK(
+  TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<xla::SessionModule> session_module,
       ConvertTfGraphToXlaSessionModule(args, std::move(graph)));
-  TF_ASSIGN_OR_ASSERT_OK(auto xla_nodes,
-                         ConvertXlaSessionModuleToXlaNodes(*session_module));
+  TF_ASSERT_OK_AND_ASSIGN(auto xla_nodes,
+                          ConvertXlaSessionModuleToXlaNodes(*session_module));
   EXPECT_EQ(session_module->entry().requests_size(), xla_nodes.size());
 }
 
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 1f0b100bbbd2fec5620818052eded2b4522bdedb..11df834c75dd6cb582bfd6631069c17c109d27cd 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -64,8 +64,9 @@ load(
     "//tensorflow:tensorflow.bzl",
     "full_path",
     "if_android",
+    "if_not_android_mips_and_mips64",
     "if_ios",
-    "if_x86",
+    "if_linux_x86_64",
     "if_not_mobile",
     "if_not_windows",
     "tf_copts",
@@ -94,6 +95,8 @@ load(
     "tf_additional_lib_deps",
     "tf_additional_lib_hdrs",
     "tf_additional_lib_srcs",
+    "tf_additional_framework_hdrs",
+    "tf_additional_framework_srcs",
     "tf_additional_minimal_lib_srcs",
     "tf_additional_proto_hdrs",
     "tf_additional_proto_srcs",
@@ -112,6 +115,10 @@ load(
     "tf_lib_proto_parsing_deps",
     "tf_additional_verbs_lib_defines",
     "tf_additional_mpi_lib_defines",
+    "tf_additional_gpu_tracer_srcs",
+    "tf_additional_gpu_tracer_deps",
+    "tf_additional_gpu_tracer_cuda_deps",
+    "tf_pyclif_proto_library",
 )
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
@@ -121,7 +128,6 @@ load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
 )
-load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
 
 # -----------------------------------------------------------------------------
 # Public targets
@@ -283,6 +289,7 @@ cc_library(
         "platform/env_time.h",
         "platform/file_system.h",
         "platform/fingerprint.h",
+        "platform/grpc_response_reader.h",
         "platform/init_main.h",
         "platform/logging.h",
         "platform/macros.h",
@@ -329,6 +336,7 @@ cc_library(
     deps = [
         ":lib",
         ":lib_internal",
+        ":protos_all_cc",
         "//tensorflow/core/platform/default/build_config:gtest",
     ] + tf_additional_test_deps(),
 )
@@ -338,6 +346,8 @@ tf_cuda_library(
     hdrs = [
         "example/feature_util.h",
         "framework/allocator.h",
+        "framework/variant.h",
+        "framework/variant_encode_decode.h",
         "framework/allocator_registry.h",
         "framework/attr_value_util.h",
         "framework/bfloat16.h",
@@ -379,6 +389,7 @@ tf_cuda_library(
         "framework/type_traits.h",
         "framework/types.h",
         "public/version.h",
+        "util/activation_mode.h",
         "util/bcast.h",
         "util/cuda_kernel_helper.h",
         "util/device_name_utils.h",
@@ -401,6 +412,7 @@ tf_cuda_library(
         "util/tensor_slice_reader_cache.h",
         "util/tensor_slice_writer.h",
         "util/use_cudnn.h",
+        "util/matmul_autotune.h",
         "util/util.h",
         "util/work_sharder.h",
     ] + select({
@@ -422,6 +434,7 @@ cc_library(
     hdrs = ["util/overflow.h"],
     deps = [
         ":framework_lite",
+        ":lib",
     ],
 )
 
@@ -433,6 +446,7 @@ cc_library(
     deps = [
         ":framework",
         ":lib",
+        ":protos_all_cc",
     ],
 )
 
@@ -460,6 +474,10 @@ cc_library(
     name = "session_options",
     hdrs = ["public/session_options.h"],
     visibility = ["//visibility:public"],
+    deps = [
+        ":lib",
+        ":protos_all_cc",
+    ],
 )
 
 cc_library(
@@ -494,7 +512,7 @@ cc_library(
 # Generates library per group of ops.
 tf_gen_op_libs(
     op_lib_names = [
-        "array_ops",
+        "bitwise_ops",
         "candidate_sampling_ops",
         "control_flow_ops",
         "ctc_ops",
@@ -527,6 +545,13 @@ tf_gen_op_libs(
     ],
 )
 
+tf_gen_op_libs(
+    op_lib_names = [
+        "array_ops",
+    ],
+    deps = [":protos_all_cc"],
+)
+
 tf_gen_op_libs(
     op_lib_names = [
         "audio_ops",
@@ -534,17 +559,9 @@ tf_gen_op_libs(
     deps = [":lib"],
 )
 
-cc_library(
-    name = "debug_ops_op_lib",
-    srcs = ["ops/debug_ops.cc"],
-    copts = tf_copts(),
-    linkstatic = 1,
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        ":framework",
-        "//tensorflow/core/kernels:debug_ops",
-    ],
-    alwayslink = 1,
+tf_gen_op_libs(
+    op_lib_names = ["debug_ops"],
+    deps = ["//tensorflow/core/kernels:debug_ops"],
 )
 
 # And one for all user ops
@@ -573,6 +590,7 @@ cc_library(
     deps = [
         ":array_ops_op_lib",
         ":audio_ops_op_lib",
+        ":bitwise_ops_op_lib",
         ":candidate_sampling_ops_op_lib",
         ":control_flow_ops_op_lib",
         ":ctc_ops_op_lib",
@@ -590,6 +608,7 @@ cc_library(
         ":no_op_op_lib",
         ":parsing_ops_op_lib",
         ":random_ops_op_lib",
+        ":remote_fused_graph_ops_op_lib",
         ":script_ops_op_lib",
         ":sdca_ops_op_lib",
         ":sendrecv_ops_op_lib",
@@ -665,6 +684,7 @@ tf_cuda_library(
     name = "core_cpu",
     hdrs = [
         "common_runtime/device.h",
+        "common_runtime/optimization_registry.h",
         "common_runtime/shape_refiner.h",
         "graph/algorithm.h",
         "graph/default_device.h",
@@ -809,7 +829,6 @@ cc_library(
         ":test",
         "//tensorflow/core/kernels:constant_op",
         "//tensorflow/core/kernels:ops_util",
-        "//tensorflow/core/platform/default/build_config:gtest",  # + if_sycl([":sycl_runtime"])
     ],
 )
 
@@ -852,8 +871,6 @@ filegroup(
         "//tensorflow/core/platform/default/build_config:android_srcs",
         "//tensorflow/core/util/ctc:android_srcs",
         "//tensorflow/core/util/tensor_bundle:android_srcs",
-        "common_runtime/gpu/gpu_tracer.cc",
-        "common_runtime/gpu/gpu_tracer.h",
     ] + glob(
         [
             "client/**/*.cc",
@@ -917,9 +934,7 @@ filegroup(
 cc_library(
     name = "android_tensorflow_lib_lite",
     srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts() + [
-        "-Os",
-    ],
+    copts = tf_copts() + if_not_android_mips_and_mips64(["-Os"]),
     linkopts = ["-lz"],
     tags = [
         "manual",
@@ -1130,7 +1145,6 @@ cc_library(
         ":framework_internal",
         ":lib",
         ":lib_internal",
-        ":proto_text",
         ":protos_all_cc",
         ":stream_executor",
     ],
@@ -1152,6 +1166,30 @@ cc_library(
     ],
 )
 
+# -----------------------------------------------------------------------------
+# Clif-related proto libraries.
+
+tf_pyclif_proto_library(
+    name = "example/example_pyclif",
+    proto_lib = ":protos_all",
+    proto_srcfile = "example/example.proto",
+    visibility = ["//visibility:public"],
+)
+
+tf_pyclif_proto_library(
+    name = "example/feature_pyclif",
+    proto_lib = ":protos_all",
+    proto_srcfile = "example/feature.proto",
+    visibility = ["//visibility:public"],
+)
+
+tf_pyclif_proto_library(
+    name = "framework/tensor_pyclif",
+    proto_lib = ":protos_all",
+    proto_srcfile = "framework/tensor.proto",
+    visibility = ["//visibility:public"],
+)
+
 # -----------------------------------------------------------------------------
 # Internal targets
 
@@ -1207,6 +1245,9 @@ LIB_INTERNAL_WINDOWS_DEPS = glob(
         "platform/*.cc",
         "platform/profile_utils/**/*.h",
         "platform/profile_utils/**/*.cc",
+    ] + [
+        "framework/resource_handle.h",
+        "framework/resource_handle.cc",
     ],
     exclude = [
         "**/*test*",
@@ -1220,6 +1261,8 @@ LIB_INTERNAL_WINDOWS_DEPS = glob(
         "platform/**/cuda_libdevice_path.cc",
         "platform/**/stream_executor.h",
         "platform/load_library.cc",
+        "platform/variant_coding.cc",
+        "platform/**/variant_cord_coding.cc",
     ],
 )
 
@@ -1236,9 +1279,13 @@ cc_library(
                 "platform/*.cc",
                 "platform/profile_utils/**/*.h",
                 "platform/profile_utils/**/*.cc",
+                "framework/resource_handle.h",
+                "framework/resource_handle.cc",
             ],
             exclude = [
                 "**/*test*",
+                "framework/variant.cc",
+                "platform/variant_coding.cc",
                 "lib/hash/crc32c_accelerate.cc",
                 "lib/gif/**/*",
                 "lib/jpeg/**/*",
@@ -1248,6 +1295,9 @@ cc_library(
                 "platform/**/cuda.h",
                 "platform/**/cuda_libdevice_path.cc",
                 "platform/**/stream_executor.h",
+                "platform/**/gpu_tracer.cc",
+                "platform/variant_coding.cc",
+                "platform/**/variant_cord_coding.cc",
             ],
         ),
     }) + tf_additional_lib_srcs(
@@ -1257,6 +1307,9 @@ cc_library(
             "platform/**/cuda_libdevice_path.cc",
             "platform/**/stream_executor.h",
             "platform/**/env_time.cc",
+            "platform/**/gpu_tracer.cc",
+            "platform/variant_coding.cc",
+            "platform/**/variant_cord_coding.cc",
         ] +
         # Protobuf deps already included through the ":lib_proto_parsing"
         # dependency.
@@ -1331,7 +1384,7 @@ cc_library(
     name = "lib_hash_crc32c_accelerate_internal",
     srcs = ["lib/hash/crc32c_accelerate.cc"],
     # -msse4.2 enables the use of crc32c compiler builtins.
-    copts = tf_copts() + if_x86(["-msse4.2"]),
+    copts = tf_copts() + if_linux_x86_64(["-msse4.2"]),
 )
 
 cc_library(
@@ -1348,6 +1401,7 @@ cc_library(
     }),
     deps = [
         ":lib",
+        ":lib_internal",
         "//tensorflow/core/platform/default/build_config:gif",
     ],
 )
@@ -1370,10 +1424,91 @@ cc_library(
     }),
     deps = [
         ":lib",
+        ":lib_internal",
         "//tensorflow/core/platform/default/build_config:jpeg",
     ],
 )
 
+cc_library(
+    name = "android_jpeg_internal",
+    srcs = [
+        "lib/jpeg/jpeg_handle.cc",
+        "lib/jpeg/jpeg_mem.cc",
+        "platform/jpeg.h",
+    ],
+    hdrs = [
+        "lib/core/stringpiece.h",
+        "lib/jpeg/jpeg_handle.h",
+        "lib/jpeg/jpeg_mem.h",
+        "platform/default/dynamic_annotations.h",
+        "platform/default/integral_types.h",
+        "platform/default/logging.h",
+        "platform/dynamic_annotations.h",
+        "platform/logging.h",
+        "platform/macros.h",
+        "platform/mem.h",
+        "platform/platform.h",
+        "platform/types.h",
+    ],
+    copts = tf_copts(),
+    linkopts = ["-ldl"],
+    deps = [
+        "//tensorflow/core/platform/default/build_config:jpeg",
+    ],
+)
+
+cc_library(
+    name = "android_gif_internal",
+    srcs = [
+        "lib/gif/gif_io.cc",
+        "platform/gif.h",
+    ],
+    hdrs = [
+        "lib/core/stringpiece.h",
+        "lib/gif/gif_io.h",
+        "lib/gtl/cleanup.h",
+        "platform/default/dynamic_annotations.h",
+        "platform/default/integral_types.h",
+        "platform/default/logging.h",
+        "platform/dynamic_annotations.h",
+        "platform/logging.h",
+        "platform/macros.h",
+        "platform/mem.h",
+        "platform/platform.h",
+        "platform/types.h",
+    ],
+    copts = tf_copts(),
+    linkopts = ["-ldl"],
+    deps = [
+        "//tensorflow/core/platform/default/build_config:gif",
+    ],
+)
+
+cc_library(
+    name = "android_png_internal",
+    srcs = [
+        "lib/png/png_io.cc",
+        "platform/png.h",
+    ],
+    hdrs = [
+        "lib/core/casts.h",
+        "lib/core/stringpiece.h",
+        "lib/png/png_io.h",
+        "platform/cpu_info.h",
+        "platform/default/integral_types.h",
+        "platform/default/logging.h",
+        "platform/logging.h",
+        "platform/macros.h",
+        "platform/platform.h",
+        "platform/types.h",
+    ],
+    copts = tf_copts(),
+    linkopts = ["-ldl"],
+    deps = [
+        "@png_archive//:png",
+    ],
+)
+
 proto_text_hdrs_and_srcs = tf_generate_proto_text_sources(
     name = "proto_text_srcs_all",
     srcs = CORE_PROTO_SRCS,
@@ -1411,6 +1546,8 @@ tf_cuda_library(
             "util/**/*.h",
             "util/**/*.cc",
         ] + [
+            "platform/variant_coding.cc",
+            "platform/variant_coding.h",
             "graph/edgeset.h",
             "graph/edgeset.cc",
             "graph/graph.h",
@@ -1425,6 +1562,7 @@ tf_cuda_library(
             "framework/fake_input.*",
             "framework/op_gen_lib.*",
             "framework/reader_base.*",
+            "framework/resource_handle.cc",
             "util/memmapped_file_system.*",
             "util/memmapped_file_system_writer.*",
             "util/version_info.cc",
@@ -1438,20 +1576,22 @@ tf_cuda_library(
             "util/memmapped_file_system_writer.h",
             "util/memmapped_file_system_writer.cc",
         ],
-    }),
+    }) + tf_additional_framework_srcs(),
     hdrs = [
+        "framework/variant.h",
         "framework/op_segment.h",
         "framework/rendezvous.h",  # only needed for tests
         "framework/tensor_reference.h",
         "framework/tracking_allocator.h",  # only needed for tests
         "framework/unique_tensor_references.h",
+        "platform/variant_coding.h",
         "util/command_line_flags.h",
         "util/env_var.h",
         "util/equal_graph_def.h",
         "util/presized_cuckoo_map.h",
         "util/tensor_slice_set.h",
         "util/tensor_slice_util.h",
-    ],
+    ] + tf_additional_framework_hdrs(),
     copts = tf_copts(),
     linkopts = select({
         "//tensorflow:freebsd": [],
@@ -1465,6 +1605,7 @@ tf_cuda_library(
         ":proto_text",
         ":protos_all_cc",
         ":version_lib",
+        "//tensorflow/core/platform/default/build_config:platformlib",
         "//tensorflow/core/kernels:bounds_check",
         "//third_party/eigen3",
     ] + if_mkl(["//third_party/mkl:intel_binary_blob"]),
@@ -1695,7 +1836,7 @@ cc_library(
     ],
     visibility = [
         "//tensorflow/compiler:__subpackages__",
-        "//tensorflow/tools/tfprof:__subpackages__",
+        "//tensorflow/core/profiler:__subpackages__",
     ],
     deps = [":lib_internal"],
 )
@@ -1733,6 +1874,7 @@ cc_library(
     linkstatic = 1,
     visibility = ["//visibility:public"],
     deps = [
+        ":core_cpu",
         ":core_cpu_internal",
         ":framework",
         ":lib",
@@ -1745,19 +1887,17 @@ cc_library(
 
 tf_cuda_library(
     name = "gpu_tracer",
-    srcs = [
-        "common_runtime/gpu/gpu_tracer.cc",
-    ],
+    srcs = tf_additional_gpu_tracer_srcs(),
     hdrs = [
-        "common_runtime/gpu/gpu_tracer.h",
+        "platform/gpu_tracer.h",
     ],
     copts = tf_copts(),
-    cuda_deps = tf_additional_cupti_wrapper_deps(),
+    cuda_deps = tf_additional_cupti_wrapper_deps() + tf_additional_gpu_tracer_cuda_deps(),
     deps = [
         ":core_cpu_internal",
         ":lib",
         ":protos_all_cc",
-    ],
+    ] + tf_additional_gpu_tracer_deps(),
 )
 
 tf_cuda_library(
@@ -1837,7 +1977,7 @@ cc_library(
         "common_runtime/sycl/sycl_device_context.h",
     ]),
     copts = tf_copts(),
-    linkstatic = 1,
+    linkstatic = 0,
     deps = [
         ":core_cpu",
         ":core_cpu_internal",
@@ -1845,11 +1985,11 @@ cc_library(
         ":framework_internal",
         ":lib",
         ":lib_internal",
-        ":protos_all_cc",
+        ":proto_text",
         "//third_party/eigen3",
         "@local_config_sycl//sycl:sycl",
     ],
-    alwayslink = 1,
+    alwayslink = 0,
 )
 
 # -----------------------------------------------------------------------------
@@ -1866,7 +2006,10 @@ cc_library(
         "lib/random/philox_random_test_utils.h",
         "platform/snappy.h",
     ],
-    deps = [":lib_internal"],
+    deps = [
+        ":lib",
+        ":lib_internal",
+    ],
 )
 
 cc_library(
@@ -1892,6 +2035,7 @@ cc_library(
         ":framework",
         ":lib",
         ":lib_internal",
+        ":protos_all_cc",
     ],
 )
 
@@ -2083,6 +2227,17 @@ cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "framework_op_gen_lib_test",
+    size = "small",
+    srcs = ["framework/op_gen_lib_test.cc"],
+    deps = [
+        ":op_gen_lib",
+        ":test",
+        ":test_main",
+    ],
+)
+
 tf_cc_test(
     name = "quantize_training_test",
     srcs = ["graph/quantize_training_test.cc"],
@@ -2111,8 +2266,8 @@ tf_cc_tests(
     srcs = [
         "common_runtime/device_set_test.cc",
         "common_runtime/optimization_registry_test.cc",
-        "common_runtime/resource_variable_read_optimizer_test.cc",
         "common_runtime/pending_counts_test.cc",
+        "common_runtime/resource_variable_read_optimizer_test.cc",
         "common_runtime/session_test.cc",
         "common_runtime/simple_placer_test.cc",
         "example/feature_util_test.cc",
@@ -2133,7 +2288,7 @@ tf_cc_tests(
         "framework/op_kernel_test.cc",
         "framework/op_registration_test.cc",
         "framework/partial_tensor_shape_test.cc",
-        # "framework/rendezvous_test.cc",  # flaky b/30476344
+        "framework/rendezvous_test.cc",
         "framework/resource_mgr_test.cc",
         "framework/resource_op_kernel_test.cc",
         "framework/shape_inference_test.cc",
@@ -2145,6 +2300,7 @@ tf_cc_tests(
         "framework/tracking_allocator_test.cc",
         "framework/types_test.cc",
         "framework/unique_tensor_references_test.cc",
+        "framework/variant_test.cc",
         "graph/algorithm_test.cc",
         "graph/edgeset_test.cc",
         "graph/graph_def_builder_test.cc",
@@ -2166,8 +2322,8 @@ tf_cc_tests(
         "util/presized_cuckoo_map_test.cc",
         "util/reporter_test.cc",
         "util/saved_tensor_slice_util_test.cc",
-        "util/sparse/sparse_tensor_test.cc",
         "util/semver_test.cc",
+        "util/sparse/sparse_tensor_test.cc",
         "util/stat_summarizer_test.cc",
         "util/tensor_slice_reader_test.cc",
         "util/tensor_slice_set_test.cc",
@@ -2833,14 +2989,13 @@ cc_test(
         ":testlib",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/core/kernels:example_parsing_ops",
-        "//tensorflow/core/kernels:ops_util",
     ],
 )
 
 tf_cc_test_gpu(
     name = "gpu_tracer_test",
     size = "small",
-    srcs = ["common_runtime/gpu/gpu_tracer_test.cc"],
+    srcs = ["platform/gpu_tracer_test.cc"],
     args = ["--heap_check=local"],
     linkstatic = tf_kernel_tests_linkstatic(),
     tags = tf_cuda_tests_tags() + ["nomac"],
diff --git a/tensorflow/core/common_runtime/device_mgr.cc b/tensorflow/core/common_runtime/device_mgr.cc
index 31f12d4833793ef80646bd8936b50d4f6e812af1..471463fc8b387f90859c95b5a882e670164a26b5 100644
--- a/tensorflow/core/common_runtime/device_mgr.cc
+++ b/tensorflow/core/common_runtime/device_mgr.cc
@@ -33,6 +33,7 @@ DeviceMgr::DeviceMgr(const std::vector<Device*>& devices)
     string full_name = d->name();
     device_map_[CopyToBackingStore(full_name)] = d;
 
+    // TODO(b/62909072): Upgrade device_map_ to a better data structure.
     DeviceNameUtils::ParsedName parsed_name = d->parsed_name();
     if (parsed_name.has_job && parsed_name.has_replica &&
         parsed_name.has_task && parsed_name.has_type && parsed_name.has_id) {
@@ -40,6 +41,11 @@ DeviceMgr::DeviceMgr(const std::vector<Device*>& devices)
           parsed_name.job, parsed_name.replica, parsed_name.task,
           parsed_name.type, parsed_name.id);
       device_map_[CopyToBackingStore(canonical_name)] = d;
+
+      string legacy_name = DeviceNameUtils::LegacyName(
+          parsed_name.job, parsed_name.replica, parsed_name.task,
+          parsed_name.type, parsed_name.id);
+      device_map_[CopyToBackingStore(legacy_name)] = d;
     }
     string lname = DeviceNameUtils::LocalName(d->name());
     device_map_[CopyToBackingStore(lname)] = d;
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 17d966f77091507ba3136d489ac9958c5e694f68..dc2731c0da0cd6968ab05880c207b6791d2f0ef0 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -34,7 +34,9 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/log_memory.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -60,7 +62,7 @@ limitations under the License.
 #include "tensorflow/core/util/env_var.h"
 
 #if GOOGLE_CUDA
-#include "tensorflow/core/common_runtime/gpu/gpu_tracer.h"
+#include "tensorflow/core/platform/gpu_tracer.h"
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
@@ -85,17 +87,48 @@ thread::ThreadPool* NewThreadPoolFromSessionOptions(
   return new thread::ThreadPool(options.env, "Compute", num_threads);
 }
 
-thread::ThreadPool* NewThreadPoolFromThreadPoolOptions(
+Status NewThreadPoolFromThreadPoolOptions(
     const SessionOptions& options,
-    const ThreadPoolOptionProto& thread_pool_options, int pool_number) {
+    const ThreadPoolOptionProto& thread_pool_options, int pool_number,
+    thread::ThreadPool** pool, bool* owned) {
   int32 num_threads = thread_pool_options.num_threads();
   if (num_threads == 0) {
     num_threads = NumInterOpThreadsFromSessionOptions(options);
   }
-  VLOG(1) << "Direct session inter op parallelism threads for pool "
-          << pool_number << ": " << num_threads;
-  return new thread::ThreadPool(
-      options.env, strings::StrCat("Compute", pool_number), num_threads);
+  const string& name = thread_pool_options.global_name();
+  if (name.empty()) {
+    // Session-local threadpool.
+    VLOG(1) << "Direct session inter op parallelism threads for pool "
+            << pool_number << ": " << num_threads;
+    *pool = new thread::ThreadPool(
+        options.env, strings::StrCat("Compute", pool_number), num_threads);
+    *owned = true;
+    return Status::OK();
+  }
+
+  // Global, named threadpool.
+  typedef std::pair<int32, thread::ThreadPool*> MapValue;
+  static std::map<string, MapValue>* global_pool_map =
+      new std::map<string, MapValue>;
+  static mutex* mu = new mutex();
+  mutex_lock l(*mu);
+  MapValue* mvalue = &(*global_pool_map)[name];
+  if (mvalue->second == nullptr) {
+    mvalue->first = thread_pool_options.num_threads();
+    mvalue->second = new thread::ThreadPool(
+        options.env, strings::StrCat("Compute", pool_number), num_threads);
+  } else {
+    if (mvalue->first != thread_pool_options.num_threads()) {
+      return errors::InvalidArgument(
+          "Pool ", name,
+          " configured previously with num_threads=", mvalue->first,
+          "; cannot re-configure with num_threads=",
+          thread_pool_options.num_threads());
+    }
+  }
+  *owned = false;
+  *pool = mvalue->second;
+  return Status::OK();
 }
 
 thread::ThreadPool* GlobalThreadPool(const SessionOptions& options) {
@@ -232,16 +265,18 @@ DirectSession::DirectSession(const SessionOptions& options,
   if (options_.config.session_inter_op_thread_pool_size() > 0) {
     for (int i = 0; i < options_.config.session_inter_op_thread_pool_size();
          ++i) {
-      thread_pools_.push_back(NewThreadPoolFromThreadPoolOptions(
-          options_, options_.config.session_inter_op_thread_pool(i), i));
+      thread::ThreadPool* pool = nullptr;
+      bool owned = false;
+      init_error_.Update(NewThreadPoolFromThreadPoolOptions(
+          options_, options_.config.session_inter_op_thread_pool(i), i, &pool,
+          &owned));
+      thread_pools_.emplace_back(pool, owned);
     }
-    owns_thread_pools_ = true;
   } else if (options_.config.use_per_session_threads()) {
-    thread_pools_.push_back(NewThreadPoolFromSessionOptions(options_));
-    owns_thread_pools_ = true;
+    thread_pools_.emplace_back(NewThreadPoolFromSessionOptions(options_),
+                               true /* owned */);
   } else {
-    thread_pools_.push_back(GlobalThreadPool(options));
-    owns_thread_pools_ = false;
+    thread_pools_.emplace_back(GlobalThreadPool(options), false /* owned */);
   }
   // The default value of sync_on_finish will be flipped soon and this
   // environment variable will be removed as well.
@@ -290,8 +325,8 @@ DirectSession::~DirectSession() {
     d->op_segment()->RemoveHold(session_handle_);
   }
   delete cancellation_manager_;
-  if (owns_thread_pools_) {
-    for (auto* p : thread_pools_) delete p;
+  for (const auto& p_and_owned : thread_pools_) {
+    if (p_and_owned.second) delete p_and_owned.first;
   }
 
   execution_state_.reset(nullptr);
@@ -330,6 +365,7 @@ Status DirectSession::MaybeInitializeExecutionState(
 }
 
 Status DirectSession::Create(const GraphDef& graph) {
+  TF_RETURN_IF_ERROR(init_error_);
   if (graph.node_size() > 0) {
     mutex_lock l(graph_def_lock_);
     if (graph_created_) {
@@ -354,6 +390,7 @@ Status DirectSession::ExtendLocked(const GraphDef& graph) {
   TF_RETURN_IF_ERROR(
       MaybeInitializeExecutionState(graph, &already_initialized));
   if (already_initialized) {
+    TF_RETURN_IF_ERROR(flib_def_->AddLibrary(graph.library()));
     std::unique_ptr<SimpleGraphExecutionState> state;
     TF_RETURN_IF_ERROR(execution_state_->Extend(graph, &state));
     execution_state_.swap(state);
@@ -423,7 +460,8 @@ Status DirectSession::Run(const RunOptions& run_options,
     return errors::InvalidArgument("Invalid inter_op_thread_pool: ",
                                    run_options.inter_op_thread_pool());
   }
-  thread::ThreadPool* pool = thread_pools_[run_options.inter_op_thread_pool()];
+  thread::ThreadPool* pool =
+      thread_pools_[run_options.inter_op_thread_pool()].first;
 
   // Check if we already have an executor for these arguments.
   ExecutorsAndKeys* executors_and_keys;
@@ -520,7 +558,7 @@ Status DirectSession::Run(const RunOptions& run_options,
 #if GOOGLE_CUDA
   std::unique_ptr<GPUTracer> tracer;
   if (run_options.trace_level() >= RunOptions::HARDWARE_TRACE) {
-    tracer.reset(CreateGPUTracer());
+    tracer = CreateGPUTracer();
     // tracer will be NULL on non-GPU platforms.
     // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
     if (tracer) tracer->Start().IgnoreError();
@@ -582,12 +620,33 @@ Status DirectSession::Run(const RunOptions& run_options,
     } else if (!s.ok()) {
       return s;
     }
+    const bool unique_outputs =
+        output_names.size() == executors_and_keys->output_name_to_index.size();
+    // first_indices[i] = j implies that j is the smallest value for which
+    // output_names[i] == output_names[j].
+    std::vector<int> first_indices;
+    if (!unique_outputs) {
+      first_indices.resize(output_names.size());
+      for (int i = 0; i < output_names.size(); ++i) {
+        for (int j = 0; j <= i; ++j) {
+          if (output_names[i] == output_names[j]) {
+            first_indices[i] = j;
+            break;
+          }
+        }
+      }
+    }
     outputs->clear();
     outputs->reserve(sorted_outputs.size());
-    for (const string& output_name : output_names) {
-      outputs->emplace_back(
-          std::move(sorted_outputs[executors_and_keys
-                                       ->output_name_to_index[output_name]]));
+    for (int i = 0; i < output_names.size(); ++i) {
+      const string& output_name = output_names[i];
+      if (first_indices.empty() || first_indices[i] == i) {
+        outputs->emplace_back(
+            std::move(sorted_outputs[executors_and_keys
+                                         ->output_name_to_index[output_name]]));
+      } else {
+        outputs->push_back((*outputs)[first_indices[i]]);
+      }
     }
   }
 
@@ -618,11 +677,11 @@ Status DirectSession::Run(const RunOptions& run_options,
 
   // If requested via RunOptions, output the partition graphs.
   if (run_options.output_partition_graphs()) {
-    protobuf::RepeatedPtrField<GraphDef>* parition_graph_defs =
+    protobuf::RepeatedPtrField<GraphDef>* partition_graph_defs =
         run_metadata->mutable_partition_graphs();
     for (const PerPartitionExecutorsAndLib& exec_and_lib :
          executors_and_keys->items) {
-      GraphDef* partition_graph_def = parition_graph_defs->Add();
+      GraphDef* partition_graph_def = partition_graph_defs->Add();
       exec_and_lib.graph->ToGraphDef(partition_graph_def);
     }
   }
@@ -644,7 +703,7 @@ Status DirectSession::PRunSetup(const std::vector<string>& input_names,
   }
 
   // RunOptions is not available in PRunSetup, so use thread pool 0.
-  thread::ThreadPool* pool = thread_pools_[0];
+  thread::ThreadPool* pool = thread_pools_[0].first;
 
   // Check if we already have an executor for these arguments.
   ExecutorsAndKeys* executors_and_keys;
@@ -1296,7 +1355,7 @@ Status DirectSession::CreateGraphs(
 
   // Check for valid partitions.
   for (const auto& partition : partitions) {
-    const string& local_partition_name =
+    const string local_partition_name =
         DeviceNameUtils::LocalName(partition.first);
     if (std::count(device_names.begin(), device_names.end(),
                    local_partition_name) == 0) {
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index b14a5171886dca0cf82d495d4069ad97752b78d4..cfc029132ae6252b9ca7ecc85ffec88998671e97 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -98,14 +98,16 @@ class DirectSession : public Session {
   ::tensorflow::Status ListDevices(
       std::vector<DeviceAttributes>* response) override;
   ::tensorflow::Status Close() override;
+  ::tensorflow::Status LocalDeviceManager(const DeviceMgr** output) override {
+    *output = device_mgr_.get();
+    return ::tensorflow::Status::OK();
+  }
 
   void ExportCostModels(CostModelManager::CostModelMap* cost_models) {
     cost_model_manager_.ExportCostModels(cost_models);
   }
 
  private:
-  typedef DirectSession ME;
-
   // We create one executor and its dependent library runtime for
   // every partition.
   struct PerPartitionExecutorsAndLib {
@@ -266,9 +268,11 @@ class DirectSession : public Session {
   mutex graph_def_lock_;
   GraphDef graph_def_ GUARDED_BY(graph_def_lock_);
 
-  // The thread-pools to use for running ops.
-  std::vector<thread::ThreadPool*> thread_pools_;
-  bool owns_thread_pools_ = false;
+  // The thread-pools to use for running ops, with a bool indicating if the pool
+  // is owned.
+  std::vector<std::pair<thread::ThreadPool*, bool>> thread_pools_;
+
+  Status init_error_;  // Set to an error if construction failed.
 
   // If true, blocks until device has finished all queued operations in a step.
   bool sync_on_finish_ = true;
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index 107c84e39a22e134f4e95ffb76f8929f89b491b5..097dab8406f41c71298e028a7903b2cf82826653 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -406,6 +407,34 @@ TEST(DirectSessionTest, MultipleFeedTest) {
   EXPECT_TRUE(StringPiece(s.error_message()).contains("fed more than once"));
 }
 
+TEST(DirectSessionTest, FetchMultipleTimes) {
+  Graph g(OpRegistry::Global());
+  Tensor seven_tensor(DT_INT32, TensorShape());
+  seven_tensor.flat<int32>()(0) = 7;
+  Node* seven_node = test::graph::Constant(&g, seven_tensor);
+
+  GraphDef def;
+  test::graph::ToGraphDef(&g, &def);
+
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def));
+
+  const std::vector<std::pair<string, Tensor>> inputs;
+  std::vector<Tensor> outputs;
+
+  auto seven = seven_node->name();
+  Status s = session->Run(inputs, {seven, seven}, {}, &outputs);
+  TF_ASSERT_OK(s);
+
+  EXPECT_EQ(2, outputs.size());
+  for (int i = 0; i < outputs.size(); ++i) {
+    const Tensor& t = outputs[i];
+    ASSERT_TRUE(t.IsInitialized()) << i;
+    EXPECT_EQ(7, t.flat<int32>()(0)) << i;
+  }
+}
+
 REGISTER_OP("Darth")
     .Input("x: float")
     .Output("y: float")
@@ -882,7 +911,8 @@ class BlockingOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("BlockingOp").Device(DEVICE_CPU), BlockingOp);
 REGISTER_OP("BlockingOp").Input("x: float").Output("y: float").Doc("");
 
-static void TestSessionInterOpThreadsImpl(bool use_function_lib) {
+static void TestSessionInterOpThreadsImpl(bool use_function_lib,
+                                          bool use_global_pools) {
   FunctionDefLibrary library_graph_def;
   if (use_function_lib) {
     const string lib = R"proto(
@@ -921,24 +951,45 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib) {
   (*options.config.mutable_device_count())["GPU"] = 0;
   (*options.config.mutable_device_count())["SYCL"] = 0;
 
-  options.config.add_session_inter_op_thread_pool();
   auto* p = options.config.add_session_inter_op_thread_pool();
+  if (use_global_pools) p->set_global_name("large pool");
+  p = options.config.add_session_inter_op_thread_pool();
+  if (use_global_pools) p->set_global_name("small pool");
   p->set_num_threads(1);
   const int kLargePool = 0;
   const int kSmallPool = 1;
 
-  std::unique_ptr<Session> session(NewSession(options));
-  ASSERT_TRUE(session != nullptr);
-  TF_ASSERT_OK(session->Create(def));
+  std::vector<std::unique_ptr<Session>> sessions;
+  if (!use_global_pools) {
+    sessions.emplace_back(NewSession(options));
+    TF_ASSERT_OK(sessions.back()->Create(def));
+  }
+  mutex sessions_mu;
 
   std::atomic<int32> num_done(0);
   // Runs session to compute <node>:0 using inter_op thread pool <pool>.
-  auto add_session_run_call = [&session, &num_done](
-      thread::ThreadPool* tp, Node* node, int inter_op_pool) {
-    auto fn = [&session, inter_op_pool, node, &num_done]() {
+  auto add_session_run_call = [use_global_pools, &def, &options, &sessions,
+                               &sessions_mu,
+                               &num_done](thread::ThreadPool* tp, Node* node,
+                                          int inter_op_pool) {
+    auto fn = [use_global_pools, &def, &options, &sessions, &sessions_mu,
+               inter_op_pool, node, &num_done]() {
       RunOptions run_options;
       run_options.set_inter_op_thread_pool(inter_op_pool);
       std::vector<Tensor> outputs;
+
+      Session* session;
+      if (use_global_pools) {
+        std::unique_ptr<Session> s(NewSession(options));
+        TF_ASSERT_OK(s->Create(def));
+        session = s.get();
+
+        mutex_lock l(sessions_mu);
+        sessions.emplace_back(std::move(s));
+      } else {
+        session = sessions[0].get();
+      }
+
       Status s = session->Run(run_options, {} /* inputs */,
                               {node->name() + ":0"} /* output_names */, {},
                               &outputs, nullptr /* run_metadata */);
@@ -999,11 +1050,23 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib) {
 }
 
 TEST(DirectSessionTest, TestSessionInterOpThreads) {
-  TestSessionInterOpThreadsImpl(false /* use_function_lib */);
+  TestSessionInterOpThreadsImpl(false /* use_function_lib */,
+                                false /*use_global_pools */);
 }
 
 TEST(DirectSessionTest, TestSessionInterOpThreadsWithFunctions) {
-  TestSessionInterOpThreadsImpl(true /* use_function_lib */);
+  TestSessionInterOpThreadsImpl(true /* use_function_lib */,
+                                false /*use_global_pools */);
+}
+
+TEST(DirectSessionTest, TestSessionInterOpGlobalPools) {
+  TestSessionInterOpThreadsImpl(false /* use_function_lib */,
+                                true /*use_global_pools */);
+}
+
+TEST(DirectSessionTest, TestSessionInterOpGlobalPoolsWithFunctions) {
+  TestSessionInterOpThreadsImpl(true /* use_function_lib */,
+                                true /*use_global_pools */);
 }
 
 TEST(DirectSessionTest, TestSessionInterOpThreadsInvalidOptions) {
@@ -1023,19 +1086,42 @@ TEST(DirectSessionTest, TestSessionInterOpThreadsInvalidOptions) {
   options.config.add_session_inter_op_thread_pool();
 
   // Wrong pool number on Run call.
-  std::unique_ptr<Session> session(NewSession(options));
-  ASSERT_TRUE(session != nullptr);
-  TF_ASSERT_OK(session->Create(def));
-  for (int pool_num = -1; pool_num <= 1; pool_num += 2) {
-    RunOptions run_options;
-    run_options.set_inter_op_thread_pool(pool_num);
-    std::vector<Tensor> outputs;
-    Status s = session->Run(run_options, {} /* inputs */,
-                            {x->name() + ":0"} /* output_names */, {}, &outputs,
-                            nullptr /* run_metadata */);
-    EXPECT_EQ(strings::StrCat(
-                  "Invalid argument: Invalid inter_op_thread_pool: ", pool_num),
-              s.ToString());
+  {
+    std::unique_ptr<Session> session(NewSession(options));
+    TF_ASSERT_OK(session->Create(def));
+    for (int pool_num = -1; pool_num <= 1; pool_num += 2) {
+      RunOptions run_options;
+      run_options.set_inter_op_thread_pool(pool_num);
+      std::vector<Tensor> outputs;
+      Status s = session->Run(run_options, {} /* inputs */,
+                              {x->name() + ":0"} /* output_names */, {},
+                              &outputs, nullptr /* run_metadata */);
+      EXPECT_EQ(
+          strings::StrCat("Invalid argument: Invalid inter_op_thread_pool: ",
+                          pool_num),
+          s.ToString());
+    }
+  }
+
+  // Global name changes thread count.
+  std::vector<std::unique_ptr<Session>> sessions;
+  auto* pool_config = options.config.mutable_session_inter_op_thread_pool(0);
+  pool_config->set_num_threads(0);
+  pool_config->set_global_name("foo");
+  sessions.emplace_back(NewSession(options));
+  TF_ASSERT_OK(sessions.back()->Create(def));
+  sessions.emplace_back(NewSession(options));  // repeat creation, okay.
+  TF_ASSERT_OK(sessions.back()->Create(def));
+  for (int pass = 0; pass < 2; ++pass) {
+    for (int i = 1; i < 128; ++i) {
+      pool_config->set_num_threads(i);
+      sessions.emplace_back(NewSession(options));
+      auto status = sessions.back()->Create(def);
+      ASSERT_FALSE(status.ok()) << status;
+    }
+
+    // Clear existing sessions before second pass; error still happens.
+    sessions.clear();
   }
 }
 
@@ -1163,6 +1249,16 @@ TEST(DirectSessionTest, TestDirectSessionReset) {
   EXPECT_EQ("Cancelled: Session has been closed.", s.ToString());
 }
 
+TEST(DirectSessionTest, LocalDeviceManager) {
+  SessionOptions options;
+  std::unique_ptr<Session> session(NewSession(options));
+
+  const DeviceMgr* mgr = nullptr;
+  TF_ASSERT_OK(session->LocalDeviceManager(&mgr));
+  ASSERT_TRUE(mgr != nullptr);
+  EXPECT_GT(mgr->ListDevices().size(), 0);
+}
+
 // A simple benchmark for the overhead of `DirectSession::Run()` calls
 // with varying numbers of feeds/fetches.
 void FeedFetchBenchmarkHelper(int num_feeds, int iters) {
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 24b519fb0780a6c8866ca457dc1d741f6720a933..dae58720edb00ad042b4aabff5b9c80082c3e0c3 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -363,7 +363,7 @@ class ExecutorImpl : public Executor {
   friend class ExecutorState;
 
   struct ControlFlowInfo {
-    gtl::FlatSet<string, HashStr> unique_frame_names;
+    gtl::FlatSet<string> unique_frame_names;
     std::vector<string> frame_names;
   };
 
@@ -423,7 +423,7 @@ class ExecutorImpl : public Executor {
   // Mapping from frame name to static information about the frame.
   // TODO(yuanbyu): We could cache it along with the graph so to avoid
   // the overhead of constructing it for each executor instance.
-  gtl::FlatMap<string, FrameInfo*, HashStr> frame_info_;
+  gtl::FlatMap<string, FrameInfo*> frame_info_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(ExecutorImpl);
 };
@@ -1209,8 +1209,7 @@ class ExecutorState {
   // child frame is composed of the name of the parent frame, the iteration
   // number at which the parent frame is creating the new frame, and the
   // name of the new frame from nodedef.
-  gtl::FlatMap<string, FrameState*, HashStr> outstanding_frames_
-      GUARDED_BY(mu_);
+  gtl::FlatMap<string, FrameState*> outstanding_frames_ GUARDED_BY(mu_);
 
   // The unique name of a frame.
   inline string MakeFrameName(FrameState* frame, int64 iter_id,
@@ -1515,6 +1514,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
   params.input_device_contexts = &input_device_contexts;
   params.input_alloc_attrs = &input_alloc_attrs;
   params.runner = &runner_;
+  params.stats_collector = stats_collector_;
 
   Status s;
   NodeExecStats* stats = nullptr;
@@ -1743,7 +1743,7 @@ Status ExecutorState::PrepareInputs(const NodeItem& item, Entry* first_input,
         if (!entry->ref->IsInitialized() && !IsInitializationOp(item.node)) {
           return AttachDef(errors::FailedPrecondition(
                                "Attempting to use uninitialized value ",
-                               item.kernel->def().input(i)),
+                               item.kernel->requested_input(i)),
                            item.kernel->def());
         }
       }
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 4970c2d252a0e3b42a791e7178cb528acab9c2a6..c9a4b476813b0fb203a0072b6974e71cb88eb62f 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -24,9 +24,11 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/memory_types.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/gradients.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -258,7 +260,10 @@ class CallOp : public AsyncOpKernel {
                       done);
     FunctionLibraryRuntime::Options opts;
     opts.step_id = ctx->step_id();
+    opts.rendezvous = ctx->rendezvous();
+    opts.cancellation_manager = ctx->cancellation_manager();
     opts.step_container = ctx->step_container();
+    opts.stats_collector = ctx->stats_collector();
     opts.runner = ctx->runner();
     std::vector<Tensor> args;
     args.reserve(ctx->num_inputs());
@@ -326,13 +331,14 @@ Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef,
   const FunctionBody* fbody = GetFunctionBody(handle);
   CHECK_NOTNULL(fbody);
 
-  // TODO(zhifengc): For now, we assume int32 is always on host memory
-  // and other types are always on device memory. We should do type
-  // inference over function body to derive the correct input/output
-  // memory types.
+  // TODO(zhifengc): For now, we assume int32 and resources are always on host
+  // memory and other types are always on device memory. We should do type
+  // inference over function body to derive the correct input/output memory
+  // types.
   MemoryTypeVector input_memory_types;
   for (const auto& t : fbody->arg_types) {
-    input_memory_types.push_back(t == DT_INT32 ? HOST_MEMORY : DEVICE_MEMORY);
+    input_memory_types.push_back(
+        (t == DT_INT32 || t == DT_RESOURCE) ? HOST_MEMORY : DEVICE_MEMORY);
   }
   MemoryTypeVector output_memory_types;
   for (const auto& t : fbody->ret_types) {
@@ -541,21 +547,18 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
   Executor::Args exec_args;
   // Inherit the step_id from the caller.
   exec_args.step_id = opts.step_id;
-  exec_args.step_container = opts.step_container;
+  exec_args.rendezvous = opts.rendezvous;
+  exec_args.stats_collector = opts.stats_collector;
   exec_args.call_frame = frame;
   exec_args.cancellation_manager = opts.cancellation_manager;
+  exec_args.step_container = opts.step_container;
   exec_args.runner = *opts.runner;
-  // TODO(zhifengc): we can avoid creating rendez here if we know
-  // there is no send/recv nodes in the graph.
-  auto* rendez = new IntraProcessRendezvous(device_mgr_);
-  exec_args.rendezvous = rendez;
   item->exec->RunAsync(
       // Executor args
       exec_args,
       // Done callback.
-      [item, frame, rets, rendez, done](const Status& status) {
+      [item, frame, rets, done](const Status& status) {
         item->Unref();
-        rendez->Unref();
         Status s = status;
         if (s.ok()) {
           s = frame->GetRetvals(rets);
@@ -1046,10 +1049,12 @@ void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty) {
     // to be unique and stable after optimization rewrites. Therefore,
     // we use "n<node id>" instead.
     for (const Edge* e : inputs) {
-      const string srcname = NewName(e->src(), pretty);
       if (e == nullptr) {
         ndef->add_input("unknown");
-      } else if (!e->src()->IsOp()) {
+        continue;
+      }
+      const string srcname = NewName(e->src(), pretty);
+      if (!e->src()->IsOp()) {
       } else if (e->IsControlEdge()) {
         ndef->add_input(strings::StrCat("^", srcname));
       } else if (e->src_output() == 0) {
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index dec6ca996aa4606338ab23fed8c02e57848bd673..b00bb453b1c85f2dfe1866326f807fa8e25a20dd 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 7eda5c90a1229987657e1ca17a45bb1bee6aa2f7..082e33b71304fa64868c8ecfe0e52699a9cd9b95 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -41,6 +41,7 @@ limitations under the License.
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/types.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -391,7 +392,7 @@ void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
 
   if (vlog_1) {
     VLOG(1) << "GpuDevice::Compute " << op_kernel->name() << " op "
-            << op_kernel->def().op() << " on GPU" << gpu_id_ << " stream["
+            << op_kernel->type_string() << " on GPU" << gpu_id_ << " stream["
             << stream_id << "]";
   }
 
@@ -462,16 +463,18 @@ void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
     gpu_device_context =
         static_cast<GPUDeviceContext*>(context->op_device_context());
   }
+  gpu::Stream* stream = gpu_device_context->stream();
   const auto stream_id = gpu_device_context->stream_id();
 
   VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
-          << op_kernel->def().op() << " on GPU" << gpu_id_ << " stream["
+          << op_kernel->type_string() << " on GPU" << gpu_id_ << " stream["
           << stream_id << "]";
 
   // When TraceMe profiling is off (which is the default), the
   // following TraceMe constructor is simply a conditional test of
   // false value. Measurements show that its overhead is negligible.
   port::Tracing::TraceMe activity(op_kernel->name(), op_kernel->type_string());
+  gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
   op_kernel->ComputeAsync(context, done);
 }
 
@@ -602,14 +605,28 @@ int64 MinSystemMemory(int64 available_memory) {
   // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory.
   //
   // In the future we could be more sophisticated by using a table of devices.
+  int64 min_system_memory;
   if (available_memory < (1LL << 31)) {
     // 225MiB
-    return 225 * 1024 * 1024;
+    min_system_memory = 225 * 1024 * 1024;
   } else {
     // max(300 MiB, 0.05 * available_memory)
-    return std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
+    min_system_memory =
+        std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
   }
+#if defined(__GNUC__) && defined(__OPTIMIZE__)
+// Do nothing
+#elif !defined(__GNUC__) && defined(NDEBUG)
+// Do nothing
+#else
+  // Double the amount of available GPU memory in non-opt builds (debug
+  // builds in windows); because in non-opt builds more system memory
+  // is necessary.
+  min_system_memory *= 2;
+#endif
+  return min_system_memory;
 }
+
 }  // namespace
 
 static string GetShortDeviceDescription(int device_id,
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
index ae9e5aeaa3dda644cae4efcb51425e629a35f330..b69c1ae8fec21fa01d32809a03a343fe07b4f2c6 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/process_state.h"
 #include "tensorflow/core/common_runtime/gpu_device_context.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_reference.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.cc b/tensorflow/core/common_runtime/gpu/pool_allocator.cc
index 700ac347163cc660d0503cfe9c285bce99308287..66fff16e8f79d16e9077f9583e029d871f603295 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator.cc
@@ -239,11 +239,9 @@ void PoolAllocator::EvictOne() {
         (alloc_request_count == 0)
             ? 0.0
             : allocated_count_ / static_cast<double>(alloc_request_count);
-    static int log_counter = 0;
-    // (counter increment not thread safe but it's just for logging, so we
-    // don't care).
-    bool should_log = ((log_counter++ % 10) == 0);
-    if (should_log) {
+    // Can turn on for debugging purposes.
+    const bool kShouldLog = false;
+    if (kShouldLog) {
       LOG(INFO) << "PoolAllocator: After " << alloc_request_count
                 << " get requests, put_count=" << put_count_
                 << " evicted_count=" << evicted_count_
@@ -255,7 +253,7 @@ void PoolAllocator::EvictOne() {
       size_t new_size_limit = (pool_size_limit_ < kMinPoolSize)
                                   ? kMinPoolSize
                                   : (kIncreaseFactor * pool_size_limit_);
-      if (should_log) {
+      if (kShouldLog) {
         LOG(INFO) << "Raising pool_size_limit_ from " << pool_size_limit_
                   << " to " << new_size_limit;
       }
diff --git a/tensorflow/core/common_runtime/graph_runner.cc b/tensorflow/core/common_runtime/graph_runner.cc
index 74b2252c7c6a4530cce3ecf59294d1f2b8798933..2ce1e8b48302811161a8ee39ffb8d4c625723d94 100644
--- a/tensorflow/core/common_runtime/graph_runner.cc
+++ b/tensorflow/core/common_runtime/graph_runner.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
index 4a5b88d5fda5954de2368857f4dba6bf22585ad7..420dfe338efb473e36eb02a757fa957d15ba64df 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/op_segment.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/notification.h"
diff --git a/tensorflow/core/common_runtime/memory_types.cc b/tensorflow/core/common_runtime/memory_types.cc
index 21ed73df77da46a903c5b14dd1d982e4b591828e..76b926ba40053288360f0e4e6fe2a37bd44ff0b4 100644
--- a/tensorflow/core/common_runtime/memory_types.cc
+++ b/tensorflow/core/common_runtime/memory_types.cc
@@ -104,10 +104,21 @@ Status ValidateMemoryTypes(const DeviceType& device_type, const Graph* g) {
       });
 }
 
-static Node* Send(Graph* g, const string& device_name, bool host,
-                  const Edge* edge) {
-  const string tensor_name =
-      strings::StrCat("edge_", edge->id(), "_", edge->src()->name());
+// Given an Edge whose two endpoints have different memory types and
+// are gonna to insert a pair of HostSend/Recv or Send/HostRecv nodes,
+// GetTensorName() returns a unique string that we can use as part of
+// the rendezvous key. The return string is guaranteed to be unique
+// within this process. That is sufficient because EnsureMemoryTypes
+// is only used on a TensorFlow graph that is gonna to be executed in
+// a single tf device (hence within a single process).
+static string GetTensorName(const Edge* edge) {
+  static std::atomic<int64> counter(0);
+  return strings::StrCat("memtype_", counter.fetch_add(1), "_",
+                         edge->src()->name());
+}
+
+static Node* Send(Graph* g, const string& tensor_name,
+                  const string& device_name, bool host, const Edge* edge) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), host ? "_HostSend" : "_Send")
                   .Input(edge->src(), edge->src_output())
@@ -115,14 +126,13 @@ static Node* Send(Graph* g, const string& device_name, bool host,
                   .Attr("send_device", device_name)
                   .Attr("send_device_incarnation", 0)  // Do not care.
                   .Attr("recv_device", device_name)
+                  .Attr("_hostmem_sendrecv", true)
                   .Finalize(g, &ret));
   return ret;
 }
 
-static Node* Recv(Graph* g, const string& device_name, bool host,
-                  const Edge* edge) {
-  const string tensor_name =
-      strings::StrCat("edge_", edge->id(), "_", edge->src()->name());
+static Node* Recv(Graph* g, const string& tensor_name,
+                  const string& device_name, bool host, const Edge* edge) {
   Node* ret;
   TF_CHECK_OK(
       NodeBuilder(g->NewName("n"), host ? "_HostRecv" : "_Recv")
@@ -131,6 +141,7 @@ static Node* Recv(Graph* g, const string& device_name, bool host,
           .Attr("send_device", device_name)
           .Attr("send_device_incarnation", 0)
           .Attr("recv_device", device_name)
+          .Attr("_hostmem_sendrecv", true)
           .Finalize(g, &ret));
   return ret;
 }
@@ -171,8 +182,10 @@ Status EnsureMemoryTypes(const DeviceType& device_type,
       Endpoint key{e->src()->id(), e->src_output()};
       auto iter = recv_nodes.find(key);
       if (iter == recv_nodes.end()) {
-        Node* send = Send(g, device_name, (item.sm == HOST_MEMORY), e);
-        recv = Recv(g, device_name, (item.dm == HOST_MEMORY), e);
+        const string tensor_name = GetTensorName(e);
+        Node* send =
+            Send(g, tensor_name, device_name, (item.sm == HOST_MEMORY), e);
+        recv = Recv(g, tensor_name, device_name, (item.dm == HOST_MEMORY), e);
         if (!has_ref) {
           // We only cache if there is no ref is involved.
           recv_nodes[key] = recv;
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 41bf23be27083e56c341f7273109b9b2eb984f89..005aabf9b822ce93dc292004e239eec8c37b7f08 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -26,7 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/mem.h"
 
-#include "third_party/mkl/include/i_malloc.h"
+#include "i_malloc.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index 5103e852218cce7b8895dacfeb31373da5319628..e61ea9d84cd717d5f4ae8cc2cd19a580d38c2bd6 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
@@ -39,6 +40,10 @@ ShapeRefiner::ShapeRefiner(int graph_def_version,
       ops_registry_(ops),
       graph_runner_(Env::Default()) {}
 
+ShapeRefiner::ShapeRefiner(const VersionDef& versions,
+                           const OpRegistryInterface* ops)
+    : ShapeRefiner(versions.producer(), ops) {}
+
 ShapeRefiner::~ShapeRefiner() {
   // The lifetime of the tensors are bound to the GraphRunner, so the tensors
   // should be deleted before it.
@@ -139,7 +144,7 @@ Status ShapeRefiner::SetShape(const Node* node, int output_port,
   return Status::OK();
 }
 
-Status ShapeRefiner::UpdateNode(const Node* node, bool* refined) {
+Status ShapeRefiner::UpdateNode(const Node* node, bool relax, bool* refined) {
   auto it = node_to_context_.find(node);
   if (it == node_to_context_.end()) {
     *refined = true;
@@ -155,29 +160,55 @@ Status ShapeRefiner::UpdateNode(const Node* node, bool* refined) {
   for (const Edge* e : node->in_edges()) {
     if (e->IsControlEdge()) continue;
 
+    int dst_input = e->dst_input();
+    int src_output = e->src_output();
+
     Node* input = e->src();
     auto iter = node_to_context_.find(input);
     if (iter == node_to_context_.end()) {
       return errors::FailedPrecondition(
-          "Input ", e->dst_input(), " ('", input->name(), "') for '",
-          node->name(), "' was not previously added to ShapeRefiner.");
+          "Input ", dst_input, " ('", input->name(), "') for '", node->name(),
+          "' was not previously added to ShapeRefiner.");
     }
 
     InferenceContext* c = iter->second.get();
-    DCHECK_GE(e->dst_input(), 0);
-    if (node_context->MergeInput(e->dst_input(), c->output(e->src_output()))) {
+    DCHECK_GE(dst_input, 0);
+    ShapeHandle existing_input = node_context->input(dst_input);
+    if (!relax && node_context->MergeInput(dst_input, c->output(src_output))) {
       *refined = true;
+    } else if (relax) {
+      if (node_context->RelaxInput(dst_input, c->output(src_output))) {
+        if (!SameDefinedShape(node_context, node_context->input(dst_input),
+                              existing_input)) {
+          *refined = true;
+        }
+      }
     }
 
     // Also propagate handle shape and dtype of edges which are carrying
     // resource handles.
-    if (e->src()->output_type(e->src_output()) == DT_RESOURCE) {
-      auto* shapes_and_types =
-          c->output_handle_shapes_and_types(e->src_output());
-      if (shapes_and_types != nullptr &&
-          node_context->MergeInputHandleShapesAndTypes(e->dst_input(),
-                                                       *shapes_and_types)) {
+    if (e->src()->output_type(src_output) == DT_RESOURCE) {
+      auto* outputs = c->output_handle_shapes_and_types(src_output);
+      if (!outputs) continue;
+
+      if (!relax &&
+          node_context->MergeInputHandleShapesAndTypes(dst_input, *outputs)) {
         *refined = true;
+      } else if (relax) {
+        std::vector<ShapeAndType> existing_inputs;
+        const std::vector<ShapeAndType>* inputs =
+            node_context->input_handle_shapes_and_types(dst_input);
+        if (inputs) {
+          existing_inputs = *inputs;
+        }
+        if (node_context->RelaxInputHandleShapesAndMergeTypes(dst_input,
+                                                              *outputs)) {
+          if (IsUpdatedShapesOrTypes(
+                  node_context, existing_inputs,
+                  *node_context->input_handle_shapes_and_types(dst_input))) {
+            *refined = true;
+          }
+        }
       }
     }
   }
@@ -268,7 +299,7 @@ Status ShapeRefiner::TryToInferTensorOutputFromInputShapes(const Edge* edge,
   }
   InferenceContext* c = it->second.get();
 
-  if (node->def().op() == "Shape") {
+  if (node->type_string() == "Shape") {
     // If input shapes to the shape op are fully defined,
     // we can infer the shape op's output tensor.
     bool fully_defined_inputs = c->FullyDefined(c->input(0));
@@ -298,7 +329,7 @@ Status ShapeRefiner::TryToInferTensorOutputFromInputShapes(const Edge* edge,
       *output = t;
       *success = true;
     }
-  } else if (node->def().op() == "Rank") {
+  } else if (node->type_string() == "Rank") {
     bool rank_known = c->RankKnown(c->input(0));
     if (rank_known) {
       int32 input_rank = c->Rank(c->input(0));
@@ -307,7 +338,7 @@ Status ShapeRefiner::TryToInferTensorOutputFromInputShapes(const Edge* edge,
       *output = t;
       *success = true;
     }
-  } else if (node->def().op() == "Size") {
+  } else if (node->type_string() == "Size") {
     bool fully_defined_inputs = c->FullyDefined(c->input(0));
     if (fully_defined_inputs) {
       int32 rank = c->Rank(c->input(0));
@@ -638,4 +669,36 @@ Status ShapeRefiner::RunShapeFn(const Node* node,
   return Status::OK();
 }
 
+bool ShapeRefiner::SameDefinedShape(InferenceContext* c, ShapeHandle s0,
+                                    ShapeHandle s1) {
+  if (!c->RankKnown(s0)) {
+    return !c->RankKnown(s1);
+  } else if (!c->RankKnown(s1) || c->Rank(s0) != c->Rank(s1)) {
+    return false;
+  }
+
+  for (int i = 0; i < c->Rank(s0); ++i) {
+    if (c->Value(c->Dim(s0, i)) != c->Value(c->Dim(s1, i))) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool ShapeRefiner::IsUpdatedShapesOrTypes(
+    InferenceContext* c, const std::vector<ShapeAndType>& existing,
+    const std::vector<ShapeAndType>& updated) {
+  if (existing.size() != updated.size()) {
+    return true;
+  }
+  for (int i = 0; i < existing.size(); i++) {
+    if (!SameDefinedShape(c, existing[i].shape, updated[i].shape) ||
+        existing[i].dtype != updated[i].dtype) {
+      return true;
+    }
+  }
+  return false;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/shape_refiner.h b/tensorflow/core/common_runtime/shape_refiner.h
index 603659d54e2c0f3373094293c84124270148b6e2..21e58381a5f28ee59cae2ae435d3e9c0694cf88b 100644
--- a/tensorflow/core/common_runtime/shape_refiner.h
+++ b/tensorflow/core/common_runtime/shape_refiner.h
@@ -24,6 +24,9 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
+namespace grappler {
+class GraphProperties;
+}
 
 // ShapeRefiner performs shape inference for TensorFlow Graphs.  It is
 // responsible for instantiating InferenceContext objects for each
@@ -33,6 +36,10 @@ namespace tensorflow {
 class ShapeRefiner {
  public:
   ShapeRefiner(int graph_def_version, const OpRegistryInterface* ops);
+
+  // Same as ShapeRefiner(versions.producer(), ops)
+  ShapeRefiner(const VersionDef& versions, const OpRegistryInterface* ops);
+
   ~ShapeRefiner();
 
   // Performs validation of 'node' and runs 'node's shape function,
@@ -57,8 +64,13 @@ class ShapeRefiner {
 
   // Update the input shapes of node in case the shapes of the fan-ins of 'node'
   // have themselves been modified (For example, in case of incremental shape
-  // refinement). Sets refined to true if any of the node shape has changed.
-  Status UpdateNode(const Node* node, bool* refined);
+  // refinement). If 'relax' is true, a new shape with the broadest set of
+  // information will be set as the new input (see InferenceContext::RelaxInput
+  // for full details and examples). Sets refined to true if any shapes have
+  // changed (in their string representations). Note that shapes may have been
+  // updated to newer versions (but with identical string representations) even
+  // if <*refined> is set to false.
+  Status UpdateNode(const Node* node, bool relax, bool* refined);
 
   // Returns the InferenceContext for 'node', if present.
   shape_inference::InferenceContext* GetContext(const Node* node) const {
@@ -78,6 +90,22 @@ class ShapeRefiner {
   }
 
  private:
+  friend class ShapeRefinerTest;
+  friend class ::tensorflow::grappler::GraphProperties;
+
+  // Returns true if the ranks and all dimensions of <s0> and <s1> are either
+  // equal in value or both unknown.
+  static bool SameDefinedShape(shape_inference::InferenceContext* c,
+                               shape_inference::ShapeHandle s0,
+                               shape_inference::ShapeHandle s1);
+
+  // Returns true if the shapes and types stored in <*existing> are identical in
+  // value to the shapes and types in <*updated>.
+  static bool IsUpdatedShapesOrTypes(
+      shape_inference::InferenceContext* c,
+      const std::vector<shape_inference::ShapeAndType>& existing,
+      const std::vector<shape_inference::ShapeAndType>& updated);
+
   // Tries to infer tensor output based on the input shapes of the node. In some
   // cases, the shapes of the inputs are sufficient for inferring the contents
   // of the output tensor. For example, a Shape op with fully defined input
diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc
index 55485dc979a9d3dce6c42b92234db71a1950bba6..7ffab38ba2a96b49c02516644de807562d926e2e 100644
--- a/tensorflow/core/common_runtime/shape_refiner_test.cc
+++ b/tensorflow/core/common_runtime/shape_refiner_test.cc
@@ -26,6 +26,35 @@ limitations under the License.
 #include "tensorflow/core/public/version.h"
 
 namespace tensorflow {
+
+class ShapeRefinerTest : public ::testing::Test {
+ protected:
+  // These give access to private functions of DimensionHandle and ShapeHandle.
+  bool SameHandle(shape_inference::DimensionHandle a,
+                  shape_inference::DimensionHandle b) {
+    return a.SameHandle(b);
+  }
+
+  bool SameHandle(shape_inference::ShapeHandle a,
+                  shape_inference::ShapeHandle b) {
+    return a.SameHandle(b);
+  }
+
+  // These give access to private functions of ShapeRefiner.
+  bool SameDefinedShape(shape_inference::InferenceContext* c,
+                        shape_inference::ShapeHandle s0,
+                        shape_inference::ShapeHandle s1) {
+    return ShapeRefiner::SameDefinedShape(c, s0, s1);
+  }
+
+  bool IsUpdatedShapesOrTypes(
+      shape_inference::InferenceContext* c,
+      const std::vector<shape_inference::ShapeAndType>& existing,
+      const std::vector<shape_inference::ShapeAndType>& updated) {
+    return ShapeRefiner::IsUpdatedShapesOrTypes(c, existing, updated);
+  }
+};
+
 namespace {
 
 #define EXPECT_SHAPE(EXPECTED, M, OP, IDX)                            \
@@ -34,7 +63,7 @@ namespace {
     EXPECT_EQ(EXPECTED, ctx->DebugString(ctx->output(IDX)));          \
   } while (0);
 
-TEST(ShapeRefinerTest, Constant) {
+TEST_F(ShapeRefinerTest, Constant) {
   // Create a constant node and validate that adding it is successful
   // and that its shape is correct.
   Scope root = Scope::NewRootScope();
@@ -45,7 +74,7 @@ TEST(ShapeRefinerTest, Constant) {
   EXPECT_SHAPE("[]", m, c, 0);
 }
 
-TEST(ShapeRefinerTest, MatMul) {
+TEST_F(ShapeRefinerTest, MatMul) {
   ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global());
 
   Scope root = Scope::NewRootScope();
@@ -62,7 +91,7 @@ TEST(ShapeRefinerTest, MatMul) {
   EXPECT_SHAPE("[2,2]", m, mm, 0);
 }
 
-TEST(ShapeRefinerTest, InvalidOrder) {
+TEST_F(ShapeRefinerTest, InvalidOrder) {
   ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global());
   Scope root = Scope::NewRootScope();
   auto a = ops::Const(root, {{1.0f}, {2.0f}});
@@ -77,7 +106,7 @@ TEST(ShapeRefinerTest, InvalidOrder) {
       s.error_message());
 }
 
-TEST(ShapeRefinerTest, BadShapes) {
+TEST_F(ShapeRefinerTest, BadShapes) {
   ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global());
   Scope root = Scope::NewRootScope();
   auto a = ops::Const(root, {{1.0f}, {2.0f}});
@@ -94,7 +123,7 @@ TEST(ShapeRefinerTest, BadShapes) {
                   .contains("Dimensions must be equal, but are 1 and 2"));
 }
 
-TEST(ShapeRefinerTest, SetShape) {
+TEST_F(ShapeRefinerTest, SetShape) {
   ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global());
 
   Scope root = Scope::NewRootScope();
@@ -133,7 +162,7 @@ REGISTER_OP("TestOpWithNoShapeFn").Input("a: int32").Output("o: int32");
 
 }  // namespace
 
-TEST(ShapeRefinerTest, MissingShapeInferenceFns) {
+TEST_F(ShapeRefinerTest, MissingShapeInferenceFns) {
   Scope root = Scope::NewRootScope();
   auto a = ops::Const(root, 42);
   Node* b;
@@ -147,7 +176,7 @@ TEST(ShapeRefinerTest, MissingShapeInferenceFns) {
   TF_EXPECT_OK(m.AddNode(b));
 }
 
-TEST(ShapeRefinerTest, PropagateConstants) {
+TEST_F(ShapeRefinerTest, PropagateConstants) {
   // Reduction dimension is a variable, so we don't know its value.
   // So the output shape value is unknown (though its rank is known).
   {
@@ -220,7 +249,7 @@ REGISTER_OP("TestOp")
 
 }  // namespace
 
-TEST(ShapeRefinerTest, InputTensorDependencies) {
+TEST_F(ShapeRefinerTest, InputTensorDependencies) {
   ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global());
   Graph graph(OpRegistry::Global());
   Node* node;
@@ -289,7 +318,7 @@ REGISTER_OP("ShapeDataInt64")
 
 }  // namespace
 
-TEST(ShapeRefinerTest, PropagateShapeAcrossTensorContent) {
+TEST_F(ShapeRefinerTest, PropagateShapeAcrossTensorContent) {
   Scope root = Scope::NewRootScope();
 
   // Create variable 2x4 tensor.
@@ -320,7 +349,7 @@ TEST(ShapeRefinerTest, PropagateShapeAcrossTensorContent) {
   EXPECT_EQ("[4]", ctx->DebugString(ctx->output(0)));
 }
 
-TEST(ShapeRefinerTest, PropagateShapeAcrossTensorContentInt64) {
+TEST_F(ShapeRefinerTest, PropagateShapeAcrossTensorContentInt64) {
   Scope root = Scope::NewRootScope();
 
   // Create variable 2x4 tensor.
@@ -354,7 +383,7 @@ TEST(ShapeRefinerTest, PropagateShapeAcrossTensorContentInt64) {
   EXPECT_EQ("[4]", ctx->DebugString(ctx->output(0)));
 }
 
-TEST(ShapeRefinerTest, PropagateShapeAcrossTensorContentInt32Overflow) {
+TEST_F(ShapeRefinerTest, PropagateShapeAcrossTensorContentInt32Overflow) {
   Scope root = Scope::NewRootScope();
 
   // Create variable 2x4 tensor.
@@ -386,7 +415,7 @@ TEST(ShapeRefinerTest, PropagateShapeAcrossTensorContentInt32Overflow) {
   EXPECT_FALSE(m.AddNode(shape_data).ok());
 }
 
-TEST(ShapeRefinerTest, PropagateRankAcrossTensorContent) {
+TEST_F(ShapeRefinerTest, PropagateRankAcrossTensorContent) {
   Scope root = Scope::NewRootScope();
 
   // Create variable 2x4x3 tensor.
@@ -412,7 +441,7 @@ TEST(ShapeRefinerTest, PropagateRankAcrossTensorContent) {
   EXPECT_EQ("[3]", ctx->DebugString(ctx->output(0)));
 }
 
-TEST(ShapeRefinerTest, PropagateSizeAcrossTensorContent) {
+TEST_F(ShapeRefinerTest, PropagateSizeAcrossTensorContent) {
   Scope root = Scope::NewRootScope();
 
   // Create variable.
@@ -438,7 +467,7 @@ TEST(ShapeRefinerTest, PropagateSizeAcrossTensorContent) {
   EXPECT_EQ("[120]", ctx->DebugString(ctx->output(0)));
 }
 
-TEST(ShapeRefinerTest, PropagateSizeAcrossTensorContentInt64) {
+TEST_F(ShapeRefinerTest, PropagateSizeAcrossTensorContentInt64) {
   Scope root = Scope::NewRootScope();
 
   // Create variable.
@@ -469,7 +498,7 @@ TEST(ShapeRefinerTest, PropagateSizeAcrossTensorContentInt64) {
   EXPECT_EQ("[515396075280]", ctx->DebugString(ctx->output(0)));
 }
 
-TEST(ShapeRefinerTest, PropagateSizeAcrossTensorContentInt32Overflow) {
+TEST_F(ShapeRefinerTest, PropagateSizeAcrossTensorContentInt32Overflow) {
   Scope root = Scope::NewRootScope();
 
   // Create variable.
@@ -496,7 +525,7 @@ TEST(ShapeRefinerTest, PropagateSizeAcrossTensorContentInt32Overflow) {
   EXPECT_FALSE(m.AddNode(shape_data).ok());
 }
 
-TEST(ShapeRefinerTest, PropagateShape) {
+TEST_F(ShapeRefinerTest, PropagateShape) {
   Scope root = Scope::NewRootScope();
   // 3x2 input
   auto input = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}});
@@ -518,7 +547,7 @@ TEST(ShapeRefinerTest, PropagateShape) {
   EXPECT_EQ("[3,2]", ctx->DebugString(ctx->output(0)));
 }
 
-TEST(ShapeRefinerTest, PropagateSize) {
+TEST_F(ShapeRefinerTest, PropagateSize) {
   Scope root = Scope::NewRootScope();
   // 3x2 input
   auto input = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}});
@@ -539,7 +568,7 @@ TEST(ShapeRefinerTest, PropagateSize) {
   EXPECT_EQ("[6]", ctx->DebugString(ctx->output(0)));
 }
 
-TEST(ShapeRefinerTest, PropagateRank) {
+TEST_F(ShapeRefinerTest, PropagateRank) {
   Scope root = Scope::NewRootScope();
   // 3x2 input
   auto input = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}});
@@ -560,7 +589,7 @@ TEST(ShapeRefinerTest, PropagateRank) {
   EXPECT_EQ("[2]", ctx->DebugString(ctx->output(0)));
 }
 
-TEST(ShapeRefinerTest, PropagateRange) {
+TEST_F(ShapeRefinerTest, PropagateRange) {
   Scope root = Scope::NewRootScope();
   auto begin = ops::Const(root, 1);
   auto limit = ops::Const(root, 11);
@@ -583,7 +612,7 @@ TEST(ShapeRefinerTest, PropagateRange) {
   EXPECT_EQ("[1,4,7,10]", ctx->DebugString(ctx->output(0)));
 }
 
-TEST(ShapeRefinerTest, ConstantValueTwoInputsToSameNode) {
+TEST_F(ShapeRefinerTest, ConstantValueTwoInputsToSameNode) {
   Scope root = Scope::NewRootScope();
   // This node is used as two inputs to 'range'.
   auto begin_and_delta = ops::Const(root, 1);
@@ -607,7 +636,7 @@ TEST(ShapeRefinerTest, ConstantValueTwoInputsToSameNode) {
 
 // Creates a graph where 'begin' is attempted to be visited during
 // constant value evaluation after having been processed once.
-TEST(ShapeRefinerTest, ConstantValueVisitNodeTwice) {
+TEST_F(ShapeRefinerTest, ConstantValueVisitNodeTwice) {
   Scope root = Scope::NewRootScope();
   auto begin = ops::Const(root, 1);
   auto limit = ops::Const(root, 8);
@@ -716,7 +745,7 @@ REGISTER_OP("WithUnknownShape")
 
 }  // namespace
 
-TEST(ShapeRefinerTest, ConstantValueAsShape_EmptyVector) {
+TEST_F(ShapeRefinerTest, ConstantValueAsShape_EmptyVector) {
   Scope root = Scope::NewRootScope();
   Node* input;
   TF_ASSERT_OK(
@@ -734,7 +763,7 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_EmptyVector) {
   EXPECT_EQ("[]", ctx->DebugString(ctx->output(0)));
 }
 
-TEST(ShapeRefinerTest, ConstantValueAsShape_Shape) {
+TEST_F(ShapeRefinerTest, ConstantValueAsShape_Shape) {
   for (int pass = 0; pass < 2; ++pass) {
     Scope root = Scope::NewRootScope();
     Node* input;
@@ -761,7 +790,7 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_Shape) {
   }
 }
 
-TEST(ShapeRefinerTest, ConstantValueAsShape_PackInt32) {
+TEST_F(ShapeRefinerTest, ConstantValueAsShape_PackInt32) {
   Scope root = Scope::NewRootScope();
   Node* scalar_non_const;
   TF_ASSERT_OK(NodeBuilder("in", "NonConstScalarInt32")
@@ -793,7 +822,7 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_PackInt32) {
   EXPECT_EQ("[10,20,?,40]", ctx->DebugString(ctx->output(0)));
 }
 
-TEST(ShapeRefinerTest, ConstantValueAsShape_PackInt64) {
+TEST_F(ShapeRefinerTest, ConstantValueAsShape_PackInt64) {
   Scope root = Scope::NewRootScope();
   Node* scalar_non_const;
   TF_ASSERT_OK(NodeBuilder("in", "NonConstScalarInt64")
@@ -825,7 +854,7 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_PackInt64) {
   EXPECT_EQ("[10,20,?,1099511627776]", ctx->DebugString(ctx->output(0)));
 }
 
-TEST(ShapeRefinerTest, ConstantValueAsShape_PackUnknownDim) {
+TEST_F(ShapeRefinerTest, ConstantValueAsShape_PackUnknownDim) {
   Scope root = Scope::NewRootScope();
 
   InputList inputs{
@@ -851,7 +880,7 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_PackUnknownDim) {
   EXPECT_EQ("[10,?]", ctx->DebugString(ctx->output(0)));
 }
 
-TEST(ShapeRefinerTest, ConstantValueAsShape_PackInvalidInput) {
+TEST_F(ShapeRefinerTest, ConstantValueAsShape_PackInvalidInput) {
   Scope root = Scope::NewRootScope();
 
   // Inputs are length 2 vectors instead of scalars.
@@ -876,7 +905,7 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_PackInvalidInput) {
       StringPiece(m.AddNode(result).error_message()).contains("but is rank 2"));
 }
 
-TEST(ShapeRefinerTest, ConstantValueAsShape_Concat) {
+TEST_F(ShapeRefinerTest, ConstantValueAsShape_Concat) {
   Scope root = Scope::NewRootScope();
   Graph* g = root.graph();
   Node* partial_1;
@@ -913,7 +942,7 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_Concat) {
   EXPECT_EQ("[1,?,3,?,5,6,?,8,9,10,11]", ctx->DebugString(ctx->output(0)));
 }
 
-TEST(ShapeRefinerTest, ConstantValueAsShape_ConcatWithUnknown) {
+TEST_F(ShapeRefinerTest, ConstantValueAsShape_ConcatWithUnknown) {
   Scope root = Scope::NewRootScope();
   Graph* g = root.graph();
   Node* scalar_non_const;
@@ -956,7 +985,7 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_ConcatWithUnknown) {
   EXPECT_EQ("?", ctx->DebugString(ctx->output(0)));
 }
 
-TEST(ShapeRefinerTest, ConstantValueAsShape_ConcatInvalidDimValue) {
+TEST_F(ShapeRefinerTest, ConstantValueAsShape_ConcatInvalidDimValue) {
   Scope root = Scope::NewRootScope();
   Graph* g = root.graph();
   Node* scalar_non_const;
@@ -995,7 +1024,78 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_ConcatInvalidDimValue) {
             m.AddNode(result).error_message());
 }
 
-TEST(ShapeRefinerTest, IncrementalUpdates) {
+namespace {
+
+// Dummy op to test ShapeRefiner util functions
+REGISTER_OP("Dummy");
+
+}  // namespace
+
+TEST_F(ShapeRefinerTest, SameDefinedShape) {
+  Scope root = Scope::NewRootScope();
+  Graph* g = root.graph();
+  Node* test;
+  TF_CHECK_OK(NodeBuilder("test", "Dummy").Finalize(g, &test));
+  ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global());
+  m.set_require_shape_inference_fns(false);
+  TF_ASSERT_OK(m.AddNode(test));
+  shape_inference::InferenceContext* ctx = m.GetContext(test);
+
+  auto unknown = ctx->UnknownShape();
+  auto unknown_b = ctx->UnknownShape();
+  auto s_1_2 = ctx->MakeShape({1, 2});
+  auto s_1_2_b = ctx->MakeShape({1, 2});
+  auto s_2_2 = ctx->MakeShape({2, 2});
+  auto s_unknown_2 = ctx->MakeShape({-1, 2});
+  auto s_unknown_2_b = ctx->MakeShape({-1, 2});
+
+  EXPECT_TRUE(SameDefinedShape(ctx, unknown, unknown_b));
+  EXPECT_FALSE(SameDefinedShape(ctx, unknown, s_1_2));
+  EXPECT_TRUE(SameDefinedShape(ctx, s_1_2, s_1_2_b));
+  EXPECT_FALSE(SameDefinedShape(ctx, s_1_2, s_2_2));
+  EXPECT_TRUE(SameDefinedShape(ctx, s_unknown_2, s_unknown_2_b));
+}
+
+TEST_F(ShapeRefinerTest, IsUpdatedShapesOrTypes) {
+  Scope root = Scope::NewRootScope();
+  Graph* g = root.graph();
+  Node* test;
+  TF_CHECK_OK(NodeBuilder("test", "Dummy").Finalize(g, &test));
+  ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global());
+  m.set_require_shape_inference_fns(false);
+  TF_ASSERT_OK(m.AddNode(test));
+  shape_inference::InferenceContext* ctx = m.GetContext(test);
+
+  std::vector<shape_inference::ShapeAndType> t0{
+      {ctx->MakeShape({1, 2, 3}), DT_FLOAT},
+      {ctx->UnknownShape(), DT_INVALID},
+      {ctx->MakeShape({4, 3, 2, 1}), DT_INT32}};
+
+  std::vector<shape_inference::ShapeAndType> t1{
+      {ctx->MakeShape({1, 2, 3}), DT_FLOAT},
+      {ctx->UnknownShape(), DT_INVALID},
+      {ctx->MakeShape({4, 3, 2, 1}), DT_INT32}};
+
+  std::vector<shape_inference::ShapeAndType> t2{
+      {ctx->MakeShape({1, 2, 4}), DT_FLOAT},
+      {ctx->UnknownShape(), DT_INVALID},
+      {ctx->MakeShape({4, 3, 2, 1}), DT_INT32}};
+
+  std::vector<shape_inference::ShapeAndType> t3{
+      {ctx->MakeShape({1, 2, 3}), DT_INT32},
+      {ctx->UnknownShape(), DT_INVALID},
+      {ctx->MakeShape({4, 3, 2, 1}), DT_INT32}};
+
+  EXPECT_FALSE(IsUpdatedShapesOrTypes(ctx, t0, t1));
+
+  // A shape has been modified
+  EXPECT_TRUE(IsUpdatedShapesOrTypes(ctx, t0, t2));
+
+  // A type has been modified
+  EXPECT_TRUE(IsUpdatedShapesOrTypes(ctx, t0, t3));
+}
+
+TEST_F(ShapeRefinerTest, IncrementalUpdates) {
   Scope root = Scope::NewRootScope();
   Graph* g = root.graph();
   Node* queue;
@@ -1020,12 +1120,34 @@ TEST(ShapeRefinerTest, IncrementalUpdates) {
   shape_inference::ShapeHandle shp = ctx->MakeShape({3, 7});
   ctx->set_output_handle_shapes_and_types(
       0, std::vector<shape_inference::ShapeAndType>{{shp, DT_FLOAT}});
-
   bool refined = false;
-  TF_ASSERT_OK(m.UpdateNode(dequeue, &refined));
+  TF_ASSERT_OK(m.UpdateNode(dequeue, false /* relax */, &refined));
   EXPECT_TRUE(refined);
   ctx = m.GetContext(dequeue);
   EXPECT_EQ("[3,7]", ctx->DebugString(ctx->output(0)));
+
+  // Inject another shape, but relax instead of merge.
+  ctx = m.GetContext(queue);
+  shp = ctx->MakeShape({2, 7});
+  ctx->set_output_handle_shapes_and_types(
+      0, std::vector<shape_inference::ShapeAndType>{{shp, DT_FLOAT}});
+  refined = false;
+  TF_ASSERT_OK(m.UpdateNode(dequeue, true /* relax */, &refined));
+  EXPECT_TRUE(refined);
+  ctx = m.GetContext(dequeue);
+  EXPECT_EQ("[?,7]", ctx->DebugString(ctx->output(0)));
+
+  // Inject another partially unknown shape and attempt to relax it.
+  ctx = m.GetContext(queue);
+  shp = ctx->MakeShape({shape_inference::InferenceContext::kUnknownDim, 7});
+  ctx->set_output_handle_shapes_and_types(
+      0, std::vector<shape_inference::ShapeAndType>{{shp, DT_FLOAT}});
+  refined = false;
+  TF_ASSERT_OK(m.UpdateNode(dequeue, true /* relax */, &refined));
+  EXPECT_FALSE(refined);
+  ctx = m.GetContext(dequeue);
+  EXPECT_EQ("[?,7]", ctx->DebugString(ctx->output(0)));
+  ASSERT_FALSE(SameHandle(ctx->Dim(ctx->output(0), 0), ctx->Dim(shp, 0)));
 }
 
 }  // namespace
diff --git a/tensorflow/core/common_runtime/simple_graph_execution_state.cc b/tensorflow/core/common_runtime/simple_graph_execution_state.cc
index 8206a678b4bae8102b4510e7041553628f14914b..41e685bdc7f053ec278e45379788f39141f8b592 100644
--- a/tensorflow/core/common_runtime/simple_graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/simple_graph_execution_state.cc
@@ -25,9 +25,12 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/simple_placer.h"
 #include "tensorflow/core/framework/graph.pb_text.h"
 #include "tensorflow/core/framework/graph_def_util.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/subgraph.h"
+#include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/graph/validate.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -114,16 +117,21 @@ SimpleGraphExecutionState::~SimpleGraphExecutionState() {
 Status SimpleGraphExecutionState::Extend(
     const GraphDef& extension_def,
     std::unique_ptr<SimpleGraphExecutionState>* out) const {
+  GraphDef gdef;
+
+  // 1. Copy the function library.
+  TF_RETURN_IF_ERROR(flib_def_->AddLibrary(extension_def.library()));
+  *gdef.mutable_library() = flib_def_->ToProto();
+
+  // 2. Build an index of the new node names.
   std::unordered_set<string> new_names;
-  // 1. Build an index of the new node names.
   for (const NodeDef& node : extension_def.node()) {
     new_names.insert(node.name());
   }
 
-  // 2. Add the non-duplicates from the old graph to the new graph.
+  // 3. Add the non-duplicates from the old graph to the new graph.
   //    Return an error if the same node name appears in both the
   //    old graph and the extension.
-  GraphDef gdef;
   for (const NodeDef& node : original_graph_def_.node()) {
     if (new_names.count(node.name()) == 0) {
       *gdef.add_node() = node;
@@ -135,7 +143,7 @@ Status SimpleGraphExecutionState::Extend(
     }
   }
 
-  // 3. Merge the versions field.
+  // 4. Merge the versions field.
   int old_node_size = gdef.node_size();
   gdef.mutable_node()->MergeFrom(extension_def.node());
   TF_RETURN_IF_ERROR(
@@ -171,12 +179,6 @@ Status SimpleGraphExecutionState::Extend(
     gdef.mutable_versions()->CopyFrom(extension_def.versions());
   }
 
-  // 4. Copy the function library from this execution state.
-  // NOTE(mrry): To match the previous behavior, the first GraphDef
-  // passed to a session will contain the function library that is
-  // used for all subsequent execution states.
-  *gdef.mutable_library() = flib_def_->ToProto();
-
   // 5. Validate that the final graphdef is valid.
   if (gdef.versions().producer() >= 5) {
     // Validate the graph: we assume that merging two valid graphs
@@ -236,60 +238,6 @@ Status SimpleGraphExecutionState::InitBaseGraph(
     const BuildGraphOptions& options) {
   const GraphDef* graph_def = &original_graph_def_;
 
-#ifndef IS_MOBILE_PLATFORM
-  GraphDef optimized_graph;
-
-  const RewriterConfig& rewrite_options =
-      session_options_->config.graph_options().rewrite_options();
-
-  if (grappler::MetaOptimizerEnabled(rewrite_options)) {
-    // Adding this functionality in steps. The first step is to make sure
-    // we don't break dependencies. The second step will be to turn the
-    // functionality on by default.
-    grappler::GrapplerItem item;
-    item.id = "tf_graph";
-    item.graph = original_graph_def_;
-
-    item.fetch = options.fetch_endpoints;
-    item.fetch.insert(item.fetch.end(), options.target_nodes.begin(),
-                      options.target_nodes.end());
-
-    Status s;
-    if (!options.feed_endpoints.empty()) {
-      std::unordered_set<string> feeds(options.feed_endpoints.begin(),
-                                       options.feed_endpoints.end());
-      for (const NodeDef& node : original_graph_def_.node()) {
-        if (feeds.find(node.name()) == feeds.end()) {
-          continue;
-        }
-        if (node.attr().count("dtype") == 0 ||
-            node.attr().count("shape") == 0) {
-          s = errors::InvalidArgument("Missing node shape or type");
-          break;
-        }
-        TensorShape shape(node.attr().at("shape").shape());
-        DataType type = node.attr().at("dtype").type();
-        Tensor fake_input(type, shape);
-        item.feed.emplace_back(node.name(), fake_input);
-      }
-    }
-
-    if (s.ok()) {
-      std::unordered_map<string, DeviceProperties> device_map;
-      for (const auto& device : device_set_->devices()) {
-        device_map[device->name()] =
-            grappler::GetDeviceInfo(device->parsed_name());
-      }
-      grappler::VirtualCluster cluster(device_map);
-      s = grappler::RunMetaOptimizer(item, rewrite_options, &cluster,
-                                     &optimized_graph);
-    }
-    if (s.ok()) {
-      graph_def = &optimized_graph;
-    }
-  }
-#endif  // IS_MOBILE_PLATFORM
-
   std::unique_ptr<Graph> new_graph(new Graph(OpRegistry::Global()));
   GraphConstructorOptions opts;
   TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, *graph_def, new_graph.get()));
@@ -339,6 +287,92 @@ Status SimpleGraphExecutionState::InitBaseGraph(
   return Status::OK();
 }
 
+Status SimpleGraphExecutionState::OptimizeGraph(
+    const BuildGraphOptions& options, std::unique_ptr<Graph>* optimized_graph) {
+#ifndef IS_MOBILE_PLATFORM
+  const RewriterConfig& rewrite_options =
+      session_options_->config.graph_options().rewrite_options();
+
+  if (grappler::MetaOptimizerEnabled(rewrite_options)) {
+    // Adding this functionality in steps. The first step is to make sure
+    // we don't break dependencies. The second step will be to turn the
+    // functionality on by default.
+    grappler::GrapplerItem item;
+    item.id = "tf_graph";
+    graph_->ToGraphDef(&item.graph);
+
+    item.fetch = options.fetch_endpoints;
+    item.fetch.insert(item.fetch.end(), options.target_nodes.begin(),
+                      options.target_nodes.end());
+
+    if (!options.feed_endpoints.empty()) {
+      std::unordered_set<string> feeds;
+      for (const string& feed : options.feed_endpoints) {
+        TensorId id = ParseTensorName(feed);
+        if (id.second != 0) {
+          return errors::InvalidArgument("Unsupported feed: ", feed);
+        }
+        feeds.insert(id.first.ToString());
+      }
+      for (const NodeDef& node : original_graph_def_.node()) {
+        if (feeds.find(node.name()) == feeds.end()) {
+          continue;
+        }
+        if (node.attr().count("dtype") == 0 ||
+            node.attr().count("shape") == 0) {
+          return errors::InvalidArgument("Missing node shape or type");
+        }
+        TensorShapeProto shape_proto(node.attr().at("shape").shape());
+        // If the shape of the placeholder value is only partially known, we're
+        // free to use any dimension we want to feed the placeholder. We choose
+        // 1 to minimize the memory impact. Note that this only matters if an
+        // optimizer choose to run the graph to build its cost model, which
+        // doesn't happen (yet)
+        if (shape_proto.unknown_rank()) {
+          shape_proto.set_unknown_rank(false);
+        }
+        for (auto& dim : *shape_proto.mutable_dim()) {
+          if (dim.size() < 0) {
+            dim.set_size(1);
+          }
+        }
+        TensorShape shape(shape_proto);
+        DataType type = node.attr().at("dtype").type();
+        Tensor fake_input(type, shape);
+        item.feed.emplace_back(node.name(), fake_input);
+      }
+    }
+
+    std::unordered_map<string, DeviceProperties> device_map;
+    for (const auto& device : device_set_->devices()) {
+      device_map[device->name()] =
+          grappler::GetDeviceInfo(device->parsed_name());
+    }
+    grappler::VirtualCluster cluster(device_map);
+    GraphDef new_graph;
+    TF_RETURN_IF_ERROR(grappler::RunMetaOptimizer(item, rewrite_options,
+                                                  &cluster, &new_graph));
+    GraphConstructorOptions opts;
+    opts.allow_internal_ops = true;
+    optimized_graph->reset(new Graph(OpRegistry::Global()));
+    TF_RETURN_IF_ERROR(
+        ConvertGraphDefToGraph(opts, new_graph, optimized_graph->get()));
+    // The graph conversion sets the requested device names but not the assigned
+    // device names. However, since at this point the graph is placed TF expects
+    // an assigned device name for every node. Therefore we copy the requested
+    // device into the assigned device field.
+    for (Node* node : optimized_graph->get()->nodes()) {
+      node->set_assigned_device_name(node->requested_device());
+    }
+    return Status::OK();
+  } else {
+    return errors::InvalidArgument("Meta Optimizer disabled");
+  }
+#else
+  return errors::InvalidArgument("Mobile platforms not supported");
+#endif  // IS_MOBILE_PLATFORM
+}
+
 Status SimpleGraphExecutionState::BuildGraph(
     const BuildGraphOptions& options, std::unique_ptr<SimpleClientGraph>* out) {
   VLOG(1) << "BuildGraph";
@@ -348,8 +382,14 @@ Status SimpleGraphExecutionState::BuildGraph(
     return errors::Internal(
         "Attempted to prune a graph that has not been fully initialized.");
   }
-  std::unique_ptr<Graph> ng(new Graph(flib_def_.get()));
-  CopyGraph(*graph_, ng.get());
+
+  std::unique_ptr<Graph> ng;
+  Status s = OptimizeGraph(options, &ng);
+  if (!s.ok()) {
+    // Simply copy the original graph if we couldn't optimize it.
+    ng.reset(new Graph(flib_def_.get()));
+    CopyGraph(*graph_, ng.get());
+  }
 
   subgraph::RewriteGraphMetadata rewrite_metadata;
   if (session_options_ == nullptr ||
diff --git a/tensorflow/core/common_runtime/simple_graph_execution_state.h b/tensorflow/core/common_runtime/simple_graph_execution_state.h
index 00b5509fd78209727adaeeb4eea3275a5616077c..c7f34a42d61689ea90da5a1fef84f2a56f535fd4 100644
--- a/tensorflow/core/common_runtime/simple_graph_execution_state.h
+++ b/tensorflow/core/common_runtime/simple_graph_execution_state.h
@@ -186,6 +186,9 @@ class SimpleGraphExecutionState {
   void SaveStatefulNodes(Graph* graph);
   void RestoreStatefulNodes(Graph* graph);
 
+  Status OptimizeGraph(const BuildGraphOptions& options,
+                       std::unique_ptr<Graph>* optimized_graph);
+
   GraphDef original_graph_def_;            // Immutable after ctor.
   const DeviceSet* device_set_;            // Not owned
   const SessionOptions* session_options_;  // Not owned
diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc
index 9b43385d6f754cc5eee880b48061d3933207dc11..d410a164eac00a01f6a6c967e4cf4637a31a3d37 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.cc
+++ b/tensorflow/core/common_runtime/step_stats_collector.cc
@@ -15,7 +15,9 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/common_runtime/costmodel_manager.h"
+#include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/tensor_description.pb.h"
 #include "tensorflow/core/graph/costmodel.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/strings/scanner.h"
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index 2fc49d4412e9cf75dfa7a1e6c4c57cc0cade874a..971576698e621fbb7f95d033f3e642249ed062e3 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -15,7 +15,6 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-# Google-internal rules omitted.
 load(
     "//tensorflow:tensorflow.bzl",
     "check_deps",
@@ -28,6 +27,7 @@ load(
 load(
     "//tensorflow/core:platform/default/build_config.bzl",
     "tf_kernel_tests_linkstatic",
+    "tf_proto_library",
     "tf_proto_library_cc",
 )
 load(
@@ -42,13 +42,25 @@ check_deps(
     deps = ["//tensorflow/core:tensorflow"],
 )
 
-tf_proto_library_cc(
+tf_proto_library(
     name = "debug_service_proto",
-    srcs = ["debug_service.proto"],
+    srcs = [
+        "debug_service.proto",
+    ],
     has_services = 1,
     cc_api_version = 2,
     cc_grpc_version = 1,
-    protodeps = ["//tensorflow/core:protos_all"],
+    protodeps = [
+        ":debugger_event_metadata_proto",
+        "//tensorflow/core:protos_all",
+    ],
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+tf_proto_library(
+    name = "debugger_event_metadata_proto",
+    srcs = ["debugger_event_metadata.proto"],
+    cc_api_version = 2,
 )
 
 cc_library(
@@ -123,6 +135,7 @@ tf_cuda_library(
     linkstatic = 1,
     deps = [
         ":debug_service_proto_cc",
+        ":debugger_event_metadata_proto_cc",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -144,6 +157,7 @@ tf_cuda_library(
         ":debug_graph_utils",
         ":debug_io_utils",
         ":debug_service_proto_cc",
+        ":debugger_event_metadata_proto_cc",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -194,6 +208,8 @@ tf_cc_test(
     deps = [
         ":debug_grpc_testlib",
         ":debug_io_utils",
+        ":debug_service_proto_cc",
+        ":debugger_event_metadata_proto_cc",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
@@ -204,6 +220,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/platform/default/build_config:platformlib",
     ],
 )
 
@@ -227,7 +244,10 @@ tf_cc_test(
     size = "medium",
     srcs = ["grpc_session_debug_test.cc"],
     linkstatic = tf_kernel_tests_linkstatic(),
-    tags = ["nomac"],  # b/38276817
+    tags = [
+        "no_oss",  # b/62956105: port conflicts.
+        "nomac",  # b/38276817
+    ],
     deps = [
         ":debug_grpc_testlib",
         ":debug_io_utils",
@@ -247,6 +267,25 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "debug_grpc_io_utils_test",
+    size = "small",
+    srcs = ["debug_grpc_io_utils_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":debug_graph_utils",
+        ":debug_grpc_testlib",
+        ":debug_io_utils",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 # TODO(cais): Add the following back in when tfdbg is supported on Android.
 # filegroup(
 #     name = "android_srcs",
diff --git a/tensorflow/core/debug/debug_grpc_io_utils_test.cc b/tensorflow/core/debug/debug_grpc_io_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..65104241820b609cb80b6d354761084432bb7386
--- /dev/null
+++ b/tensorflow/core/debug/debug_grpc_io_utils_test.cc
@@ -0,0 +1,432 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/debug/debug_graph_utils.h"
+#include "tensorflow/core/debug/debug_grpc_testlib.h"
+#include "tensorflow/core/debug/debug_io_utils.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/tracing.h"
+
+namespace tensorflow {
+
+class GrpcDebugTest : public ::testing::Test {
+ protected:
+  struct ServerData {
+    int port;
+    string url;
+    std::unique_ptr<test::TestEventListenerImpl> server;
+    std::unique_ptr<thread::ThreadPool> thread_pool;
+  };
+
+  void SetUp() override {
+    ClearEnabledWatchKeys();
+    SetUpInProcessServer(&server_data_, 0);
+  }
+
+  void TearDown() override { TearDownInProcessServer(&server_data_); }
+
+  void SetUpInProcessServer(ServerData* server_data,
+                            int64 server_start_delay_micros) {
+    server_data->port = testing::PickUnusedPortOrDie();
+    server_data->url = strings::StrCat("grpc://localhost:", server_data->port);
+    server_data->server.reset(new test::TestEventListenerImpl());
+
+    server_data->thread_pool.reset(
+        new thread::ThreadPool(Env::Default(), "test_server", 1));
+    server_data->thread_pool->Schedule(
+        [server_data, server_start_delay_micros]() {
+          Env::Default()->SleepForMicroseconds(server_start_delay_micros);
+          server_data->server->RunServer(server_data->port);
+        });
+  }
+
+  void TearDownInProcessServer(ServerData* server_data) {
+    server_data->server->StopServer();
+    server_data->thread_pool.reset();
+  }
+
+  void ClearEnabledWatchKeys() { DebugGrpcIO::ClearEnabledWatchKeys(); }
+
+  void CreateEmptyEnabledSet(const string& grpc_debug_url) {
+    DebugGrpcIO::CreateEmptyEnabledSet(grpc_debug_url);
+  }
+
+  const int64 GetChannelConnectionTimeoutMicros() {
+    return DebugGrpcIO::channel_connection_timeout_micros;
+  }
+
+  void SetChannelConnectionTimeoutMicros(const int64 timeout) {
+    DebugGrpcIO::channel_connection_timeout_micros = timeout;
+  }
+
+  ServerData server_data_;
+};
+
+TEST_F(GrpcDebugTest, ConnectionTimeoutWorks) {
+  // Use a short timeout so the test won't take too long.
+  const int64 kOriginalTimeoutMicros = GetChannelConnectionTimeoutMicros();
+  const int64 kShortTimeoutMicros = 500 * 1000;
+  SetChannelConnectionTimeoutMicros(kShortTimeoutMicros);
+  ASSERT_EQ(kShortTimeoutMicros, GetChannelConnectionTimeoutMicros());
+
+  const string& kInvalidGrpcUrl =
+      strings::StrCat("grpc://localhost:", testing::PickUnusedPortOrDie());
+  Tensor tensor(DT_FLOAT, TensorShape({1, 1}));
+  tensor.flat<float>()(0) = 42.0;
+  Status publish_status = DebugIO::PublishDebugTensor(
+      DebugNodeKey("/job:localhost/replica:0/task:0/cpu:0", "foo_tensor", 0,
+                   "DebugIdentity"),
+      tensor, Env::Default()->NowMicros(), {kInvalidGrpcUrl});
+  SetChannelConnectionTimeoutMicros(kOriginalTimeoutMicros);
+  TF_ASSERT_OK(DebugIO::CloseDebugURL(kInvalidGrpcUrl));
+
+  ASSERT_FALSE(publish_status.ok());
+  const string expected_error_msg = strings::StrCat(
+      "Failed to connect to gRPC channel at ", kInvalidGrpcUrl.substr(7),
+      " within a timeout of ", kShortTimeoutMicros / 1e6, " s");
+  ASSERT_NE(string::npos,
+            publish_status.error_message().find(expected_error_msg));
+}
+
+TEST_F(GrpcDebugTest, ConnectionToDelayedStartingServerWorks) {
+  ServerData server_data;
+  // Server start will be delayed for 1 second.
+  SetUpInProcessServer(&server_data, 1 * 1000 * 1000);
+
+  Tensor tensor(DT_FLOAT, TensorShape({1, 1}));
+  tensor.flat<float>()(0) = 42.0;
+  const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0",
+                                   "foo_tensor", 0, "DebugIdentity");
+  Status publish_status = DebugIO::PublishDebugTensor(
+      kDebugNodeKey, tensor, Env::Default()->NowMicros(), {server_data.url});
+  ASSERT_TRUE(publish_status.ok());
+  TF_ASSERT_OK(DebugIO::CloseDebugURL(server_data.url));
+
+  ASSERT_EQ(1, server_data.server->node_names.size());
+  ASSERT_EQ(1, server_data.server->output_slots.size());
+  ASSERT_EQ(1, server_data.server->debug_ops.size());
+  EXPECT_EQ(kDebugNodeKey.device_name, server_data.server->device_names[0]);
+  EXPECT_EQ(kDebugNodeKey.node_name, server_data.server->node_names[0]);
+  EXPECT_EQ(kDebugNodeKey.output_slot, server_data.server->output_slots[0]);
+  EXPECT_EQ(kDebugNodeKey.debug_op, server_data.server->debug_ops[0]);
+  TearDownInProcessServer(&server_data);
+}
+
+TEST_F(GrpcDebugTest, SendSingleDebugTensorViaGrpcTest) {
+  Tensor tensor(DT_FLOAT, TensorShape({1, 1}));
+  tensor.flat<float>()(0) = 42.0;
+  const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0",
+                                   "foo_tensor", 0, "DebugIdentity");
+  TF_ASSERT_OK(DebugIO::PublishDebugTensor(
+      kDebugNodeKey, tensor, Env::Default()->NowMicros(), {server_data_.url}));
+  TF_ASSERT_OK(DebugIO::CloseDebugURL(server_data_.url));
+
+  // Verify that the expected debug tensor sending happened.
+  ASSERT_EQ(1, server_data_.server->node_names.size());
+  ASSERT_EQ(1, server_data_.server->output_slots.size());
+  ASSERT_EQ(1, server_data_.server->debug_ops.size());
+  EXPECT_EQ(kDebugNodeKey.device_name, server_data_.server->device_names[0]);
+  EXPECT_EQ(kDebugNodeKey.node_name, server_data_.server->node_names[0]);
+  EXPECT_EQ(kDebugNodeKey.output_slot, server_data_.server->output_slots[0]);
+  EXPECT_EQ(kDebugNodeKey.debug_op, server_data_.server->debug_ops[0]);
+}
+
+TEST_F(GrpcDebugTest, SendDebugTensorWithLargeStringAtIndex0ViaGrpcTest) {
+  Tensor tensor(DT_STRING, TensorShape({1, 1}));
+  tensor.flat<string>()(0) = string(5000 * 1024, 'A');
+  const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0",
+                                   "foo_tensor", 0, "DebugIdentity");
+  const Status status = DebugIO::PublishDebugTensor(
+      kDebugNodeKey, tensor, Env::Default()->NowMicros(), {server_data_.url});
+  ASSERT_FALSE(status.ok());
+  ASSERT_NE(status.error_message().find("string value at index 0 from debug "
+                                        "node foo_tensor:0:DebugIdentity does "
+                                        "not fit gRPC message size limit"),
+            string::npos);
+  TF_ASSERT_OK(DebugIO::CloseDebugURL(server_data_.url));
+}
+
+TEST_F(GrpcDebugTest, SendDebugTensorWithLargeStringAtIndex1ViaGrpcTest) {
+  Tensor tensor(DT_STRING, TensorShape({1, 2}));
+  tensor.flat<string>()(0) = "A";
+  tensor.flat<string>()(1) = string(5000 * 1024, 'A');
+  const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0",
+                                   "foo_tensor", 0, "DebugIdentity");
+  const Status status = DebugIO::PublishDebugTensor(
+      kDebugNodeKey, tensor, Env::Default()->NowMicros(), {server_data_.url});
+  ASSERT_FALSE(status.ok());
+  ASSERT_NE(status.error_message().find("string value at index 1 from debug "
+                                        "node foo_tensor:0:DebugIdentity does "
+                                        "not fit gRPC message size limit"),
+            string::npos);
+  TF_ASSERT_OK(DebugIO::CloseDebugURL(server_data_.url));
+}
+
+TEST_F(GrpcDebugTest, SendMultipleDebugTensorsSynchronizedViaGrpcTest) {
+  const int32 kSends = 4;
+
+  // Prepare the tensors to sent.
+  std::vector<Tensor> tensors;
+  for (int i = 0; i < kSends; ++i) {
+    Tensor tensor(DT_INT32, TensorShape({1, 1}));
+    tensor.flat<int>()(0) = i * i;
+    tensors.push_back(tensor);
+  }
+
+  thread::ThreadPool* tp =
+      new thread::ThreadPool(Env::Default(), "grpc_debug_test", kSends);
+
+  mutex mu;
+  Notification all_done;
+  int tensor_count GUARDED_BY(mu) = 0;
+  std::vector<Status> statuses GUARDED_BY(mu);
+
+  const std::vector<string> urls({server_data_.url});
+
+  // Set up the concurrent tasks of sending Tensors via an Event stream to the
+  // server.
+  auto fn = [this, &mu, &tensor_count, &tensors, &statuses, &all_done,
+             &urls]() {
+    int this_count;
+    {
+      mutex_lock l(mu);
+      this_count = tensor_count++;
+    }
+
+    // Different concurrent tasks will send different tensors.
+    const uint64 wall_time = Env::Default()->NowMicros();
+    Status publish_status = DebugIO::PublishDebugTensor(
+        DebugNodeKey("/job:localhost/replica:0/task:0/cpu:0",
+                     strings::StrCat("synchronized_node_", this_count), 0,
+                     "DebugIdentity"),
+        tensors[this_count], wall_time, urls);
+
+    {
+      mutex_lock l(mu);
+      statuses.push_back(publish_status);
+      if (this_count == kSends - 1 && !all_done.HasBeenNotified()) {
+        all_done.Notify();
+      }
+    }
+  };
+
+  // Schedule the concurrent tasks.
+  for (int i = 0; i < kSends; ++i) {
+    tp->Schedule(fn);
+  }
+
+  // Wait for all client tasks to finish.
+  all_done.WaitForNotification();
+  delete tp;
+
+  // Close the debug gRPC stream.
+  Status close_status = DebugIO::CloseDebugURL(server_data_.url);
+  ASSERT_TRUE(close_status.ok());
+
+  // Check all statuses from the PublishDebugTensor calls().
+  for (const Status& status : statuses) {
+    TF_ASSERT_OK(status);
+  }
+
+  // One prep tensor plus kSends concurrent tensors are expected.
+  ASSERT_EQ(kSends, server_data_.server->node_names.size());
+  for (size_t i = 0; i < server_data_.server->node_names.size(); ++i) {
+    std::vector<string> items =
+        str_util::Split(server_data_.server->node_names[i], '_');
+    int tensor_index;
+    strings::safe_strto32(items[2], &tensor_index);
+
+    ASSERT_EQ(TensorShape({1, 1}),
+              server_data_.server->debug_tensors[i].shape());
+    ASSERT_EQ(tensor_index * tensor_index,
+              server_data_.server->debug_tensors[i].flat<int>()(0));
+  }
+}
+
+TEST_F(GrpcDebugTest, SendeDebugTensorsThroughMultipleRoundsUsingGrpcGating) {
+  // Prepare the tensor to send.
+  const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0",
+                                   "test_namescope/test_node", 0,
+                                   "DebugIdentity");
+  Tensor tensor(DT_INT32, TensorShape({1, 1}));
+  tensor.flat<int>()(0) = 42;
+
+  const std::vector<string> urls({server_data_.url});
+  for (int i = 0; i < 3; ++i) {
+    server_data_.server->ClearReceivedDebugData();
+    const uint64 wall_time = Env::Default()->NowMicros();
+
+    // On the 1st send (i == 0), gating is disabled, so data should be sent.
+    // On the 2nd send (i == 1), gating is enabled, and the server has enabled
+    //   the watch key in the previous send, so data should be sent.
+    // On the 3rd send (i == 2), gating is enabled, but the server has disabled
+    //   the watch key in the previous send, so data should not be sent.
+    const bool enable_gated_grpc = (i != 0);
+    TF_ASSERT_OK(DebugIO::PublishDebugTensor(kDebugNodeKey, tensor, wall_time,
+                                             urls, enable_gated_grpc));
+
+    server_data_.server->RequestDebugOpStateChangeAtNextStream(i == 0,
+                                                               kDebugNodeKey);
+
+    // Close the debug gRPC stream.
+    Status close_status = DebugIO::CloseDebugURL(server_data_.url);
+    ASSERT_TRUE(close_status.ok());
+
+    // Check dumped files according to the expected gating results.
+    if (i < 2) {
+      ASSERT_EQ(1, server_data_.server->node_names.size());
+      ASSERT_EQ(1, server_data_.server->output_slots.size());
+      ASSERT_EQ(1, server_data_.server->debug_ops.size());
+      EXPECT_EQ(kDebugNodeKey.device_name,
+                server_data_.server->device_names[0]);
+      EXPECT_EQ(kDebugNodeKey.node_name, server_data_.server->node_names[0]);
+      EXPECT_EQ(kDebugNodeKey.output_slot,
+                server_data_.server->output_slots[0]);
+      EXPECT_EQ(kDebugNodeKey.debug_op, server_data_.server->debug_ops[0]);
+    } else {
+      ASSERT_EQ(0, server_data_.server->node_names.size());
+    }
+  }
+}
+
+TEST_F(GrpcDebugTest, TestGateDebugNodeOnEmptyEnabledSet) {
+  CreateEmptyEnabledSet("grpc://localhost:3333");
+
+  ASSERT_FALSE(DebugIO::IsDebugNodeGateOpen("foo:0:DebugIdentity",
+                                            {"grpc://localhost:3333"}));
+
+  // file:// debug URLs are not subject to grpc gating.
+  ASSERT_TRUE(DebugIO::IsDebugNodeGateOpen(
+      "foo:0:DebugIdentity", {"grpc://localhost:3333", "file:///tmp/tfdbg_1"}));
+}
+
+TEST_F(GrpcDebugTest, TestGateDebugNodeOnNonEmptyEnabledSet) {
+  const string kGrpcUrl1 = "grpc://localhost:3333";
+  const string kGrpcUrl2 = "grpc://localhost:3334";
+
+  DebugGrpcIO::EnableWatchKey(kGrpcUrl1, "foo:0:DebugIdentity");
+  DebugGrpcIO::EnableWatchKey(kGrpcUrl1, "bar:0:DebugIdentity");
+
+  ASSERT_FALSE(
+      DebugIO::IsDebugNodeGateOpen("foo:1:DebugIdentity", {kGrpcUrl1}));
+  ASSERT_FALSE(
+      DebugIO::IsDebugNodeGateOpen("foo:1:DebugNumericSummary", {kGrpcUrl1}));
+  ASSERT_FALSE(
+      DebugIO::IsDebugNodeGateOpen("qux:0:DebugIdentity", {kGrpcUrl1}));
+  ASSERT_TRUE(DebugIO::IsDebugNodeGateOpen("foo:0:DebugIdentity", {kGrpcUrl1}));
+  ASSERT_TRUE(DebugIO::IsDebugNodeGateOpen("bar:0:DebugIdentity", {kGrpcUrl1}));
+
+  // Wrong grpc:// debug URLs.
+  ASSERT_FALSE(
+      DebugIO::IsDebugNodeGateOpen("foo:0:DebugIdentity", {kGrpcUrl2}));
+  ASSERT_FALSE(
+      DebugIO::IsDebugNodeGateOpen("bar:0:DebugIdentity", {kGrpcUrl2}));
+
+  // file:// debug URLs are not subject to grpc gating.
+  ASSERT_TRUE(DebugIO::IsDebugNodeGateOpen("qux:0:DebugIdentity",
+                                           {"file:///tmp/tfdbg_1", kGrpcUrl1}));
+}
+
+TEST_F(GrpcDebugTest, TestGateDebugNodeOnMultipleEmptyEnabledSets) {
+  const string kGrpcUrl1 = "grpc://localhost:3333";
+  const string kGrpcUrl2 = "grpc://localhost:3334";
+  const string kGrpcUrl3 = "grpc://localhost:3335";
+
+  DebugGrpcIO::EnableWatchKey(kGrpcUrl1, "foo:0:DebugIdentity");
+  DebugGrpcIO::EnableWatchKey(kGrpcUrl2, "bar:0:DebugIdentity");
+  CreateEmptyEnabledSet(kGrpcUrl3);
+
+  ASSERT_TRUE(DebugIO::IsDebugNodeGateOpen("foo:0:DebugIdentity", {kGrpcUrl1}));
+  ASSERT_TRUE(DebugIO::IsDebugNodeGateOpen("bar:0:DebugIdentity", {kGrpcUrl2}));
+  ASSERT_FALSE(
+      DebugIO::IsDebugNodeGateOpen("foo:0:DebugIdentity", {kGrpcUrl2}));
+  ASSERT_FALSE(
+      DebugIO::IsDebugNodeGateOpen("bar:0:DebugIdentity", {kGrpcUrl1}));
+  ASSERT_FALSE(
+      DebugIO::IsDebugNodeGateOpen("foo:0:DebugIdentity", {kGrpcUrl3}));
+  ASSERT_FALSE(
+      DebugIO::IsDebugNodeGateOpen("bar:0:DebugIdentity", {kGrpcUrl3}));
+  ASSERT_TRUE(DebugIO::IsDebugNodeGateOpen("foo:0:DebugIdentity",
+                                           {kGrpcUrl1, kGrpcUrl2}));
+  ASSERT_TRUE(DebugIO::IsDebugNodeGateOpen("bar:0:DebugIdentity",
+                                           {kGrpcUrl1, kGrpcUrl2}));
+  ASSERT_TRUE(DebugIO::IsDebugNodeGateOpen("foo:0:DebugIdentity",
+                                           {kGrpcUrl1, kGrpcUrl3}));
+  ASSERT_FALSE(DebugIO::IsDebugNodeGateOpen("bar:0:DebugIdentity",
+                                            {kGrpcUrl1, kGrpcUrl3}));
+}
+
+TEST_F(GrpcDebugTest, TestGateDebugNodeOnNonEmptyEnabledSetAndEmptyURLs) {
+  DebugGrpcIO::EnableWatchKey("grpc://localhost:3333", "foo:0:DebugIdentity");
+
+  std::vector<string> debug_urls_1;
+  ASSERT_FALSE(
+      DebugIO::IsDebugNodeGateOpen("foo:1:DebugIdentity", debug_urls_1));
+}
+
+TEST_F(GrpcDebugTest, TestGateCopyNodeOnEmptyEnabledSet) {
+  const string kGrpcUrl1 = "grpc://localhost:3333";
+  const string kWatch1 = "foo:0:DebugIdentity";
+  CreateEmptyEnabledSet(kGrpcUrl1);
+
+  ASSERT_FALSE(DebugIO::IsCopyNodeGateOpen(
+      {DebugWatchAndURLSpec(kWatch1, kGrpcUrl1, true)}));
+  ASSERT_TRUE(DebugIO::IsCopyNodeGateOpen(
+      {DebugWatchAndURLSpec(kWatch1, kGrpcUrl1, false)}));
+
+  // file:// debug URLs are not subject to grpc gating.
+  ASSERT_TRUE(DebugIO::IsCopyNodeGateOpen(
+      {DebugWatchAndURLSpec("foo:0:DebugIdentity", kGrpcUrl1, true),
+       DebugWatchAndURLSpec("foo:0:DebugIdentity", "file:///tmp/tfdbg_1",
+                            false)}));
+}
+
+TEST_F(GrpcDebugTest, TestGateCopyNodeOnNonEmptyEnabledSet) {
+  const string kGrpcUrl1 = "grpc://localhost:3333";
+  const string kGrpcUrl2 = "grpc://localhost:3334";
+  const string kWatch1 = "foo:0:DebugIdentity";
+  const string kWatch2 = "foo:1:DebugIdentity";
+  CreateEmptyEnabledSet(kGrpcUrl1);
+  CreateEmptyEnabledSet(kGrpcUrl2);
+  DebugGrpcIO::EnableWatchKey(kGrpcUrl1, kWatch1);
+
+  ASSERT_TRUE(DebugIO::IsCopyNodeGateOpen(
+      {DebugWatchAndURLSpec(kWatch1, kGrpcUrl1, true)}));
+
+  ASSERT_FALSE(DebugIO::IsCopyNodeGateOpen(
+      {DebugWatchAndURLSpec(kWatch1, kGrpcUrl2, true)}));
+  ASSERT_TRUE(DebugIO::IsCopyNodeGateOpen(
+      {DebugWatchAndURLSpec(kWatch1, kGrpcUrl2, false)}));
+
+  ASSERT_FALSE(DebugIO::IsCopyNodeGateOpen(
+      {DebugWatchAndURLSpec(kWatch2, kGrpcUrl1, true)}));
+  ASSERT_TRUE(DebugIO::IsCopyNodeGateOpen(
+      {DebugWatchAndURLSpec(kWatch2, kGrpcUrl1, false)}));
+
+  ASSERT_TRUE(DebugIO::IsCopyNodeGateOpen(
+      {DebugWatchAndURLSpec(kWatch1, kGrpcUrl1, true),
+       DebugWatchAndURLSpec(kWatch1, kGrpcUrl2, true)}));
+  ASSERT_TRUE(DebugIO::IsCopyNodeGateOpen(
+      {DebugWatchAndURLSpec(kWatch1, kGrpcUrl1, true),
+       DebugWatchAndURLSpec(kWatch2, kGrpcUrl2, true)}));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/debug/debug_grpc_testlib.cc b/tensorflow/core/debug/debug_grpc_testlib.cc
index c19842a2f6cb019c707d39c74e174b95cc5d7482..7317aa03727a58bbd54ebeda681e2005117c2148 100644
--- a/tensorflow/core/debug/debug_grpc_testlib.cc
+++ b/tensorflow/core/debug/debug_grpc_testlib.cc
@@ -16,10 +16,12 @@ limitations under the License.
 #include "tensorflow/core/debug/debug_grpc_testlib.h"
 
 #include "tensorflow/core/debug/debug_graph_utils.h"
+#include "tensorflow/core/debug/debugger_event_metadata.pb.h"
 #include "tensorflow/core/framework/summary.pb.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/tracing.h"
 
 namespace tensorflow {
@@ -44,8 +46,6 @@ namespace test {
           tensorflow::str_util::Split(val.node_name(), ':');
 
       const string node_name = name_items[0];
-      int32 output_slot = 0;
-      tensorflow::strings::safe_strto32(name_items[1], &output_slot);
       const string debug_op = name_items[2];
 
       const TensorProto& tensor_proto = val.tensor();
@@ -54,9 +54,24 @@ namespace test {
         return ::grpc::Status::CANCELLED;
       }
 
-      device_names.push_back(val.tag());
+      // Obtain the device name, which is encoded in JSON.
+      third_party::tensorflow::core::debug::DebuggerEventMetadata metadata;
+      for (int i = 0; i < val.metadata().plugin_data_size(); i++) {
+        if (val.metadata().plugin_data(i).plugin_name() != "debugger") {
+          // This plugin data was meant for another plugin.
+          continue;
+        }
+        auto status = tensorflow::protobuf::util::JsonStringToMessage(
+            val.metadata().plugin_data(i).content(), &metadata);
+        if (status.ok()) {
+          // The device name has been determined.
+          break;
+        }
+      }
+
+      device_names.push_back(metadata.device());
       node_names.push_back(node_name);
-      output_slots.push_back(output_slot);
+      output_slots.push_back(metadata.output_slot());
       debug_ops.push_back(debug_op);
       debug_tensors.push_back(tensor);
     }
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index 54366ce2490941adde5f7312e10c10ad6ec1f309..f4208a0bbca5458d49f48a336fe2dbfc71fd2f5e 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -17,59 +17,204 @@ limitations under the License.
 
 #include <vector>
 
-#if defined(PLATFORM_GOOGLE)
+#ifndef PLATFORM_WINDOWS
 #include "grpc++/create_channel.h"
-#endif
-
-#if defined(PLATFORM_WINDOWS)
+#else
 // winsock2.h is used in grpc, so Ws2_32.lib is needed
 #pragma comment(lib,"Ws2_32.lib")
-#endif
+#endif  // #ifndef PLATFORM_WINDOWS
 
+#include "tensorflow/core/debug/debugger_event_metadata.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/util/event.pb.h"
 
-#define GRPC_OSS_UNIMPLEMENTED_ERROR \
-  return errors::Unimplemented(      \
-      kGrpcURLScheme,                \
-      " debug URL scheme is not implemented in open source yet.")
+#define GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR \
+  return errors::Unimplemented(              \
+      kGrpcURLScheme, " debug URL scheme is not implemented on Windows yet.")
 
 namespace tensorflow {
 
 namespace {
 
-// Encapsulate the tensor value inside a Summary proto, and then inside an
-// Event proto.
-Event WrapTensorAsEvent(const DebugNodeKey& debug_node_key,
-                        const Tensor& tensor, const uint64 wall_time_us) {
+// Creates an Event proto representing a chunk of a Tensor. This method only
+// populates the field of the Event proto that represent the envelope
+// informaion (e.g., timestmap, device_name, num_chunks, chunk_index, dtype,
+// shape). It does not set the value.tensor field, which should be set by the
+// caller separately.
+Event PrepareChunkEventProto(const DebugNodeKey& debug_node_key,
+                             const uint64 wall_time_us, const size_t num_chunks,
+                             const size_t chunk_index,
+                             const DataType& tensor_dtype,
+                             const TensorShapeProto& tensor_shape) {
   Event event;
   event.set_wall_time(static_cast<double>(wall_time_us));
-  Summary::Value* summ_val = event.mutable_summary()->add_value();
+  Summary::Value* value = event.mutable_summary()->add_value();
 
   // Create the debug node_name in the Summary proto.
   // For example, if tensor_name = "foo/node_a:0", and the debug_op is
   // "DebugIdentity", the debug node_name in the Summary proto will be
   // "foo/node_a:0:DebugIdentity".
-  summ_val->set_node_name(debug_node_key.debug_node_name);
-  summ_val->set_tag(debug_node_key.device_name);
+  value->set_node_name(debug_node_key.debug_node_name);
+
+  // Tag by the node name. This allows TensorBoard to quickly fetch data
+  // per op.
+  value->set_tag(debug_node_key.node_name);
+
+  // Store data within debugger metadata to be stored for each event.
+  third_party::tensorflow::core::debug::DebuggerEventMetadata metadata;
+  metadata.set_device(debug_node_key.device_name);
+  metadata.set_output_slot(debug_node_key.output_slot);
+  metadata.set_num_chunks(num_chunks);
+  metadata.set_chunk_index(chunk_index);
+
+  // Encode the data in JSON.
+  string json_output;
+  tensorflow::protobuf::util::JsonPrintOptions json_options;
+  json_options.always_print_primitive_fields = true;
+  auto status = tensorflow::protobuf::util::MessageToJsonString(
+      metadata, &json_output, json_options);
+  if (status.ok()) {
+    // Store summary metadata. Set the plugin to use this data as "debugger".
+    SummaryMetadata::PluginData* plugin_data =
+        value->mutable_metadata()->add_plugin_data();
+    plugin_data->set_plugin_name(DebugIO::kDebuggerPluginName);
+    plugin_data->set_content(json_output);
+  } else {
+    LOG(WARNING) << "Failed to convert DebuggerEventMetadata proto to JSON. "
+                 << "The debug_node_name is " << debug_node_key.debug_node_name
+                 << ".";
+  }
+
+  value->mutable_tensor()->set_dtype(tensor_dtype);
+  *value->mutable_tensor()->mutable_tensor_shape() = tensor_shape;
+
+  return event;
+}
+
+// Translates the length of a string to number of bytes when the string is
+// encoded as bytes in protobuf. Note that this makes a conservative estimate
+// (i.e., an estimate that is usually too large, but never too small under the
+// gRPC message size limit) of the Varint-encoded length, to workaround the lack
+// of a portable length function.
+const size_t StringValMaxBytesInProto(const string& str) {
+#if defined(PLATFORM_GOOGLE)
+  return str.size() + DebugGrpcIO::kGrpcMaxVarintLengthSize;
+#else
+  return str.size();
+#endif
+}
+
+// Breaks a string Tensor (represented as a TensorProto) as a vector of Event
+// protos.
+Status WrapStringTensorAsEvents(const DebugNodeKey& debug_node_key,
+                                const uint64 wall_time_us,
+                                const size_t chunk_size_limit,
+                                TensorProto* tensor_proto,
+                                std::vector<Event>* events) {
+  const protobuf::RepeatedPtrField<string>& strs = tensor_proto->string_val();
+  const size_t num_strs = strs.size();
+  const size_t chunk_size_ub = chunk_size_limit > 0
+                                   ? chunk_size_limit
+                                   : std::numeric_limits<size_t>::max();
+
+  // E.g., if cutoffs is {j, k, l}, the chunks will have index ranges:
+  //   [0:a), [a:b), [c:<end>].
+  std::vector<size_t> cutoffs;
+  size_t chunk_size = 0;
+  for (size_t i = 0; i < num_strs; ++i) {
+    // Take into account the extra bytes in proto buffer.
+    if (StringValMaxBytesInProto(strs[i]) > chunk_size_ub) {
+      return errors::FailedPrecondition(
+          "string value at index ", i, " from debug node ",
+          debug_node_key.debug_node_name,
+          " does not fit gRPC message size limit (", chunk_size_ub, ")");
+    }
+    if (chunk_size + StringValMaxBytesInProto(strs[i]) > chunk_size_ub) {
+      cutoffs.push_back(i);
+      chunk_size = 0;
+    }
+    chunk_size += StringValMaxBytesInProto(strs[i]);
+  }
+  cutoffs.push_back(num_strs);
+  const size_t num_chunks = cutoffs.size();
+
+  for (size_t i = 0; i < num_chunks; ++i) {
+    Event event = PrepareChunkEventProto(debug_node_key, wall_time_us,
+                                         num_chunks, i, tensor_proto->dtype(),
+                                         tensor_proto->tensor_shape());
+    Summary::Value* value = event.mutable_summary()->mutable_value(0);
+
+    if (cutoffs.size() == 1) {
+      value->mutable_tensor()->mutable_string_val()->Swap(
+          tensor_proto->mutable_string_val());
+    } else {
+      const size_t begin = (i == 0) ? 0 : cutoffs[i - 1];
+      const size_t end = cutoffs[i];
+      for (size_t j = begin; j < end; ++j) {
+        value->mutable_tensor()->add_string_val(strs[j]);
+      }
+    }
+
+    events->push_back(std::move(event));
+  }
 
+  return Status::OK();
+}
+
+// Encapsulates the tensor value inside a vector of Event protos. Large tensors
+// are broken up to multiple protos to fit the chunk_size_limit. In each Event
+// proto the field summary.tensor carries the content of the tensor.
+// If chunk_size_limit <= 0, the tensor will not be broken into chunks, i.e., a
+// length-1 vector will be returned, regardless of the size of the tensor.
+Status WrapTensorAsEvents(const DebugNodeKey& debug_node_key,
+                          const Tensor& tensor, const uint64 wall_time_us,
+                          const size_t chunk_size_limit,
+                          std::vector<Event>* events) {
+  TensorProto tensor_proto;
   if (tensor.dtype() == DT_STRING) {
-    // Treat DT_STRING specially, so that tensor_util.MakeNdarray can convert
-    // the TensorProto to string-type numpy array. MakeNdarray does not work
-    // with strings encoded by AsProtoTensorContent() in tensor_content.
-    tensor.AsProtoField(summ_val->mutable_tensor());
+    // Treat DT_STRING specially, so that tensor_util.MakeNdarray in Python can
+    // convert the TensorProto to string-type numpy array. MakeNdarray does not
+    // work with strings encoded by AsProtoTensorContent() in tensor_content.
+    tensor.AsProtoField(&tensor_proto);
+
+    TF_RETURN_IF_ERROR(WrapStringTensorAsEvents(
+        debug_node_key, wall_time_us, chunk_size_limit, &tensor_proto, events));
   } else {
-    tensor.AsProtoTensorContent(summ_val->mutable_tensor());
+    tensor.AsProtoTensorContent(&tensor_proto);
+
+    const size_t total_length = tensor_proto.tensor_content().size();
+    const size_t chunk_size_ub =
+        chunk_size_limit > 0 ? chunk_size_limit : total_length;
+    const size_t num_chunks =
+        (total_length == 0)
+            ? 1
+            : (total_length + chunk_size_ub - 1) / chunk_size_ub;
+    for (size_t i = 0; i < num_chunks; ++i) {
+      const size_t pos = i * chunk_size_ub;
+      const size_t len =
+          (i == num_chunks - 1) ? (total_length - pos) : chunk_size_ub;
+      Event event = PrepareChunkEventProto(debug_node_key, wall_time_us,
+                                           num_chunks, i, tensor_proto.dtype(),
+                                           tensor_proto.tensor_shape());
+      event.mutable_summary()
+          ->mutable_value(0)
+          ->mutable_tensor()
+          ->set_tensor_content(tensor_proto.tensor_content().substr(pos, len));
+      events->push_back(std::move(event));
+    }
   }
 
-  return event;
+  return Status::OK();
 }
 
-// Append an underscore and a timestamp to a file path. If the path already
+// Appends an underscore and a timestamp to a file path. If the path already
 // exists on the file system, append a hyphen and a 1-up index. Consecutive
 // values of the index will be tried until the first unused one is found.
 // TOCTOU race condition is not of concern here due to the fact that tfdbg
@@ -86,24 +231,29 @@ string AppendTimestampToFilePath(const string& in, const uint64 timestamp) {
   return out;
 }
 
-#if defined(PLATFORM_GOOGLE)
+#ifndef PLATFORM_WINDOWS
+// Publishes encoded GraphDef through a gRPC debugger stream, in chunks,
+// conforming to the gRPC message size limit.
 Status PublishEncodedGraphDefInChunks(const string& encoded_graph_def,
                                       const string& device_name,
                                       const int64 wall_time,
                                       const string& debug_url) {
-  static const size_t kChunkSizeLimitBytes = 4000 * 1024;
   const uint64 hash = ::tensorflow::Hash64(encoded_graph_def);
   const size_t total_length = encoded_graph_def.size();
-  const size_t num_chunks = static_cast<size_t>(
-      std::ceil(static_cast<float>(total_length) / kChunkSizeLimitBytes));
+  const size_t num_chunks =
+      static_cast<size_t>(std::ceil(static_cast<float>(total_length) /
+                                    DebugGrpcIO::kGrpcMessageSizeLimitBytes));
   for (size_t i = 0; i < num_chunks; ++i) {
-    const size_t pos = i * kChunkSizeLimitBytes;
-    const size_t len =
-        (i == num_chunks - 1) ? (total_length - pos) : kChunkSizeLimitBytes;
+    const size_t pos = i * DebugGrpcIO::kGrpcMessageSizeLimitBytes;
+    const size_t len = (i == num_chunks - 1)
+                           ? (total_length - pos)
+                           : DebugGrpcIO::kGrpcMessageSizeLimitBytes;
     Event event;
     event.set_wall_time(static_cast<double>(wall_time));
     // Prefix the chunk with
     //   <hash64>,<device_name>,<wall_time>|<index>|<num_chunks>|.
+    // TODO(cais): Use DebuggerEventMetadata to store device_name, num_chunks
+    // and chunk_index, instead.
     event.set_graph_def(strings::StrCat(hash, ",", device_name, ",", wall_time,
                                         "|", i, "|", num_chunks, "|",
                                         encoded_graph_def.substr(pos, len)));
@@ -115,10 +265,13 @@ Status PublishEncodedGraphDefInChunks(const string& encoded_graph_def,
   }
   return Status::OK();
 }
-#endif
+#endif  // #ifndef PLATFORM_WINDOWS
 
 }  // namespace
 
+// static
+const char* const DebugIO::kDebuggerPluginName = "debugger";
+
 // static
 const char* const DebugIO::kMetadataFilePrefix = "_tfdbg_";
 
@@ -131,6 +284,9 @@ const char* const DebugIO::kDeviceTag = "device_";
 // static
 const char* const DebugIO::kGraphTag = "graph_";
 
+// static
+const char* const DebugIO::kHashTag = "hash";
+
 DebugNodeKey::DebugNodeKey(const string& device_name, const string& node_name,
                            const int32 output_slot, const string& debug_op)
     : device_name(device_name),
@@ -184,6 +340,7 @@ const char* const DebugIO::kFileURLScheme = "file://";
 // static
 const char* const DebugIO::kGrpcURLScheme = "grpc://";
 
+// Publishes debug metadata to a set of debug URLs.
 // static
 Status DebugIO::PublishDebugMetadata(
     const int64 global_step, const int64 session_run_index,
@@ -233,7 +390,7 @@ Status DebugIO::PublishDebugMetadata(
   Status status;
   for (const string& url : debug_urls) {
     if (str_util::Lowercase(url).find(kGrpcURLScheme) == 0) {
-#if defined(PLATFORM_GOOGLE)
+#ifndef PLATFORM_WINDOWS
       Event grpc_event;
 
       // Determine the path (if any) in the grpc:// URL, and add it as a field
@@ -251,7 +408,7 @@ Status DebugIO::PublishDebugMetadata(
       status.Update(
           DebugGrpcIO::SendEventProtoThroughGrpcStream(grpc_event, url));
 #else
-      GRPC_OSS_UNIMPLEMENTED_ERROR;
+      GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR;
 #endif
     } else if (str_util::Lowercase(url).find(kFileURLScheme) == 0) {
       const string dump_root_dir = url.substr(strlen(kFileURLScheme));
@@ -290,7 +447,7 @@ Status DebugIO::PublishDebugTensor(const DebugNodeKey& debug_node_key,
         fail_statuses.push_back(s);
       }
     } else if (str_util::Lowercase(url).find(kGrpcURLScheme) == 0) {
-#if defined(PLATFORM_GOOGLE)
+#ifndef PLATFORM_WINDOWS
       Status s = DebugGrpcIO::SendTensorThroughGrpcStream(
           debug_node_key, tensor, wall_time_us, url, gated_grpc);
 
@@ -299,7 +456,7 @@ Status DebugIO::PublishDebugTensor(const DebugNodeKey& debug_node_key,
         fail_statuses.push_back(s);
       }
 #else
-      GRPC_OSS_UNIMPLEMENTED_ERROR;
+      GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR;
 #endif
     } else {
       return Status(error::UNAVAILABLE,
@@ -351,17 +508,19 @@ Status DebugIO::PublishGraph(const Graph& graph, const string& device_name,
       const string dump_root_dir =
           io::JoinPath(debug_url.substr(strlen(kFileURLScheme)),
                        DebugNodeKey::DeviceNameToDevicePath(device_name));
-      const string file_name = strings::StrCat(DebugIO::kMetadataFilePrefix,
-                                               DebugIO::kGraphTag, now_micros);
+      const uint64 graph_hash = ::tensorflow::Hash64(buf);
+      const string file_name =
+          strings::StrCat(DebugIO::kMetadataFilePrefix, DebugIO::kGraphTag,
+                          DebugIO::kHashTag, graph_hash, "_", now_micros);
 
       status.Update(
           DebugFileIO::DumpEventProtoToFile(event, dump_root_dir, file_name));
     } else if (debug_url.find(kGrpcURLScheme) == 0) {
-#if defined(PLATFORM_GOOGLE)
+#ifndef PLATFORM_WINDOWS
       status.Update(PublishEncodedGraphDefInChunks(buf, device_name, now_micros,
                                                    debug_url));
 #else
-      GRPC_OSS_UNIMPLEMENTED_ERROR;
+      GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR;
 #endif
     }
   }
@@ -372,7 +531,7 @@ Status DebugIO::PublishGraph(const Graph& graph, const string& device_name,
 // static
 bool DebugIO::IsCopyNodeGateOpen(
     const std::vector<DebugWatchAndURLSpec>& specs) {
-#if defined(PLATFORM_GOOGLE)
+#ifndef PLATFORM_WINDOWS
   for (const DebugWatchAndURLSpec& spec : specs) {
     if (!spec.gated_grpc || spec.url.compare(0, strlen(DebugIO::kGrpcURLScheme),
                                              DebugIO::kGrpcURLScheme)) {
@@ -392,7 +551,7 @@ bool DebugIO::IsCopyNodeGateOpen(
 // static
 bool DebugIO::IsDebugNodeGateOpen(const string& watch_key,
                                   const std::vector<string>& debug_urls) {
-#if defined(PLATFORM_GOOGLE)
+#ifndef PLATFORM_WINDOWS
   for (const string& debug_url : debug_urls) {
     if (debug_url.compare(0, strlen(DebugIO::kGrpcURLScheme),
                           DebugIO::kGrpcURLScheme)) {
@@ -412,7 +571,7 @@ bool DebugIO::IsDebugNodeGateOpen(const string& watch_key,
 // static
 bool DebugIO::IsDebugURLGateOpen(const string& watch_key,
                                  const string& debug_url) {
-#if defined(PLATFORM_GOOGLE)
+#ifndef PLATFORM_WINDOWS
   if (debug_url.find(kGrpcURLScheme) != 0) {
     return true;
   } else {
@@ -426,10 +585,10 @@ bool DebugIO::IsDebugURLGateOpen(const string& watch_key,
 // static
 Status DebugIO::CloseDebugURL(const string& debug_url) {
   if (debug_url.find(DebugIO::kGrpcURLScheme) == 0) {
-#if defined(PLATFORM_GOOGLE)
+#ifndef PLATFORM_WINDOWS
     return DebugGrpcIO::CloseGrpcStream(debug_url);
 #else
-    GRPC_OSS_UNIMPLEMENTED_ERROR;
+    GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR;
 #endif
   } else {
     // No-op for non-gRPC URLs.
@@ -499,9 +658,11 @@ Status DebugFileIO::DumpTensorToEventFile(const DebugNodeKey& debug_node_key,
                                           const Tensor& tensor,
                                           const uint64 wall_time_us,
                                           const string& file_path) {
-  return DumpEventProtoToFile(
-      WrapTensorAsEvent(debug_node_key, tensor, wall_time_us),
-      io::Dirname(file_path).ToString(), io::Basename(file_path).ToString());
+  std::vector<Event> events;
+  TF_RETURN_IF_ERROR(
+      WrapTensorAsEvents(debug_node_key, tensor, wall_time_us, 0, &events));
+  return DumpEventProtoToFile(events[0], io::Dirname(file_path).ToString(),
+                              io::Basename(file_path).ToString());
 }
 
 // static
@@ -539,7 +700,7 @@ Status DebugFileIO::RecursiveCreateDir(Env* env, const string& dir) {
   }
 }
 
-#if defined(PLATFORM_GOOGLE)
+#ifndef PLATFORM_WINDOWS
 DebugGrpcChannel::DebugGrpcChannel(const string& server_stream_addr)
     : server_stream_addr_(server_stream_addr),
       url_(strings::StrCat(DebugIO::kGrpcURLScheme, server_stream_addr)) {}
@@ -608,6 +769,12 @@ mutex DebugGrpcIO::streams_mu;
 int64 DebugGrpcIO::channel_connection_timeout_micros = 900 * 1000 * 1000;
 // TODO(cais): Make this configurable?
 
+// static
+const size_t DebugGrpcIO::kGrpcMessageSizeLimitBytes = 4000 * 1024;
+
+// static
+const size_t DebugGrpcIO::kGrpcMaxVarintLengthSize = 6;
+
 // static
 std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>*
 DebugGrpcIO::GetStreamChannels() {
@@ -625,9 +792,14 @@ Status DebugGrpcIO::SendTensorThroughGrpcStream(
   if (gated && !IsGateOpen(debug_node_key.debug_node_name, grpc_stream_url)) {
     return Status::OK();
   } else {
-    return SendEventProtoThroughGrpcStream(
-        WrapTensorAsEvent(debug_node_key, tensor, wall_time_us),
-        grpc_stream_url);
+    std::vector<Event> events;
+    TF_RETURN_IF_ERROR(WrapTensorAsEvents(debug_node_key, tensor, wall_time_us,
+                                          kGrpcMessageSizeLimitBytes, &events));
+    for (const Event& event : events) {
+      TF_RETURN_IF_ERROR(
+          SendEventProtoThroughGrpcStream(event, grpc_stream_url));
+    }
+    return Status::OK();
   }
 }
 
@@ -751,6 +923,6 @@ void DebugGrpcIO::CreateEmptyEnabledSet(const string& grpc_debug_url) {
   }
 }
 
-#endif  // #if defined(PLATFORM_GOOGLE)
+#endif  // #ifndef PLATFORM_WINDOWS
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/debug/debug_io_utils.h b/tensorflow/core/debug/debug_io_utils.h
index 69d8c7bd4e014166edc68a828171fb7f0279afeb..caf9f5341d378df3be9bb075f6032f65535a4ed7 100644
--- a/tensorflow/core/debug/debug_io_utils.h
+++ b/tensorflow/core/debug/debug_io_utils.h
@@ -44,6 +44,9 @@ struct DebugNodeKey {
   DebugNodeKey(const string& device_name, const string& node_name,
                const int32 output_slot, const string& debug_op);
 
+  // Converts a device name string to a device path string.
+  // E.g., /job:localhost/replica:0/task:0/cpu:0 will be converted to
+  //   ,job_localhost,replica_0,task_0,cpu_0.
   static const string DeviceNameToDevicePath(const string& device_name);
 
   const string device_name;
@@ -56,6 +59,17 @@ struct DebugNodeKey {
 
 class DebugIO {
  public:
+  static const char* const kDebuggerPluginName;
+
+  static const char* const kMetadataFilePrefix;
+  static const char* const kCoreMetadataTag;
+  static const char* const kDeviceTag;
+  static const char* const kGraphTag;
+  static const char* const kHashTag;
+
+  static const char* const kFileURLScheme;
+  static const char* const kGrpcURLScheme;
+
   static Status PublishDebugMetadata(
       const int64 global_step, const int64 session_run_index,
       const int64 executor_step_index, const std::vector<string>& input_names,
@@ -63,7 +77,7 @@ class DebugIO {
       const std::vector<string>& target_nodes,
       const std::unordered_set<string>& debug_urls);
 
-  // Publish a tensor to a debug target URL.
+  // Publishes a tensor to a debug target URL.
   //
   // Args:
   //   debug_node_key: A DebugNodeKey identifying the debug node.
@@ -84,7 +98,7 @@ class DebugIO {
                                    const uint64 wall_time_us,
                                    const gtl::ArraySlice<string>& debug_urls);
 
-  // Publish a graph to a set of debug URLs.
+  // Publishes a graph to a set of debug URLs.
   //
   // Args:
   //   graph: The graph to be published.
@@ -92,7 +106,7 @@ class DebugIO {
   static Status PublishGraph(const Graph& graph, const string& device_name,
                              const std::unordered_set<string>& debug_urls);
 
-  // Determine whether a copy node needs to perform deep-copy of input tensor.
+  // Determines whether a copy node needs to perform deep-copy of input tensor.
   //
   // The input arguments contain sufficient information about the attached
   // downstream debug ops for this method to determine whether all the said
@@ -109,7 +123,7 @@ class DebugIO {
   static bool IsCopyNodeGateOpen(
       const std::vector<DebugWatchAndURLSpec>& specs);
 
-  // Determine whether a debug node needs to proceed given the current gRPC
+  // Determines whether a debug node needs to proceed given the current gRPC
   // gating status.
   //
   // Args:
@@ -122,7 +136,7 @@ class DebugIO {
   static bool IsDebugNodeGateOpen(const string& watch_key,
                                   const std::vector<string>& debug_urls);
 
-  // Determine whether debug information should be sent through a grpc://
+  // Determines whether debug information should be sent through a grpc://
   // debug URL given the current gRPC gating status.
   //
   // Args:
@@ -138,20 +152,12 @@ class DebugIO {
                                  const string& debug_url);
 
   static Status CloseDebugURL(const string& debug_url);
-
-  static const char* const kMetadataFilePrefix;
-  static const char* const kCoreMetadataTag;
-  static const char* const kDeviceTag;
-  static const char* const kGraphTag;
-
-  static const char* const kFileURLScheme;
-  static const char* const kGrpcURLScheme;
 };
 
 // Helper class for debug ops.
 class DebugFileIO {
  public:
-  // Encapsulate the Tensor in an Event protobuf and write it to a directory.
+  // Encapsulates the Tensor in an Event protobuf and write it to a directory.
   // The actual path of the dump file will be a contactenation of
   // dump_root_dir, tensor_name, along with the wall_time.
   //
@@ -188,12 +194,18 @@ class DebugFileIO {
                                 const DebugNodeKey& debug_node_key,
                                 const uint64 wall_time_us);
 
+  // Dumps an Event proto to a file.
+  //
+  // Args:
+  //   event_prot: The Event proto to be dumped.
+  //   dir_name: Directory path.
+  //   file_name: Base file name.
   static Status DumpEventProtoToFile(const Event& event_proto,
                                      const string& dir_name,
                                      const string& file_name);
 
  private:
-  // Encapsulate the Tensor in an Event protobuf and write it to file.
+  // Encapsulates the Tensor in an Event protobuf and write it to file.
   static Status DumpTensorToEventFile(const DebugNodeKey& debug_node_key,
                                       const Tensor& tensor,
                                       const uint64 wall_time_us,
@@ -209,7 +221,7 @@ class DebugFileIO {
 
 // TODO(cais): Support grpc:// debug URLs in open source once Python grpc
 //   genrule becomes available. See b/23796275.
-#if defined(PLATFORM_GOOGLE)
+#ifndef PLATFORM_WINDOWS
 #include "tensorflow/core/debug/debug_service.grpc.pb.h"
 
 namespace tensorflow {
@@ -266,20 +278,23 @@ class DebugGrpcChannel {
 
 class DebugGrpcIO {
  public:
-  // Send a tensor through a debug gRPC stream.
+  static const size_t kGrpcMessageSizeLimitBytes;
+  static const size_t kGrpcMaxVarintLengthSize;
+
+  // Sends a tensor through a debug gRPC stream.
   static Status SendTensorThroughGrpcStream(const DebugNodeKey& debug_node_key,
                                             const Tensor& tensor,
                                             const uint64 wall_time_us,
                                             const string& grpc_stream_url,
                                             const bool gated);
 
-  // Send an Event proto through a debug gRPC stream.
+  // Sends an Event proto through a debug gRPC stream.
   // Thread-safety: Safe with respect to other calls to the same method and
   // calls to CloseGrpcStream().
   static Status SendEventProtoThroughGrpcStream(const Event& event_proto,
                                                 const string& grpc_stream_url);
 
-  // Check whether a debug watch key is allowed to send data to a given grpc://
+  // Checks whether a debug watch key is allowed to send data to a given grpc://
   // debug URL given the current gating status.
   //
   // Args:
@@ -292,16 +307,16 @@ class DebugGrpcIO {
   //     proceed.
   static bool IsGateOpen(const string& watch_key, const string& grpc_debug_url);
 
-  // Close a gRPC stream to the given address, if it exists.
+  // Closes a gRPC stream to the given address, if it exists.
   // Thread-safety: Safe with respect to other calls to the same method and
   // calls to SendTensorThroughGrpcStream().
   static Status CloseGrpcStream(const string& grpc_stream_url);
 
-  // Enable a debug watch key at a grpc:// debug URL.
+  // Enables a debug watch key at a grpc:// debug URL.
   static void EnableWatchKey(const string& grpc_debug_url,
                              const string& watch_key);
 
-  // Disable a debug watch key at a grpc:// debug URL.
+  // Disables a debug watch key at a grpc:// debug URL.
   static void DisableWatchKey(const string& grpc_debug_url,
                               const string& watch_key);
 
@@ -330,6 +345,6 @@ class DebugGrpcIO {
 };
 
 }  // namespace tensorflow
-#endif  // #if defined(PLATFORM_GOOGLE)
+#endif  // #ifndef(PLATFORM_WINDOWS)
 
 #endif  // TENSORFLOW_DEBUG_IO_UTILS_H_
diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc
index 35c95fb98c430f369871f94f86dbb208ef400310..08ef4001bc1617236608328a68bc349c3c507deb 100644
--- a/tensorflow/core/debug/debug_io_utils_test.cc
+++ b/tensorflow/core/debug/debug_io_utils_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/debug/debug_io_utils.h"
 
+#include "tensorflow/core/debug/debugger_event_metadata.pb.h"
 #include "tensorflow/core/framework/summary.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/notification.h"
@@ -124,10 +125,18 @@ TEST_F(DebugIOUtilsTest, DumpStringTensorToFileSunnyDay) {
 
   ASSERT_GE(wall_time, event.wall_time());
   ASSERT_EQ(1, event.summary().value().size());
-  ASSERT_EQ(kDebugNodeKey.device_name, event.summary().value(0).tag());
+  ASSERT_EQ(kDebugNodeKey.node_name, event.summary().value(0).tag());
   ASSERT_EQ(kDebugNodeKey.debug_node_name,
             event.summary().value(0).node_name());
 
+  // Determine and validate some information from the metadata.
+  third_party::tensorflow::core::debug::DebuggerEventMetadata metadata;
+  auto status = tensorflow::protobuf::util::JsonStringToMessage(
+      event.summary().value(0).metadata().plugin_data(0).content(), &metadata);
+  ASSERT_TRUE(status.ok());
+  ASSERT_EQ(kDebugNodeKey.device_name, metadata.device());
+  ASSERT_EQ(kDebugNodeKey.output_slot, metadata.output_slot());
+
   Tensor b_prime(DT_STRING);
   ASSERT_TRUE(b_prime.FromProto(event.summary().value(0).tensor()));
 
@@ -229,10 +238,19 @@ TEST_F(DebugIOUtilsTest, PublishTensorToMultipleFileURLs) {
 
     ASSERT_GE(wall_time, event.wall_time());
     ASSERT_EQ(1, event.summary().value().size());
-    ASSERT_EQ(kDebugNodeKey.device_name, event.summary().value(0).tag());
+    ASSERT_EQ(kDebugNodeKey.node_name, event.summary().value(0).tag());
     ASSERT_EQ(kDebugNodeKey.debug_node_name,
               event.summary().value(0).node_name());
 
+    // Determine and validate some information from the metadata.
+    third_party::tensorflow::core::debug::DebuggerEventMetadata metadata;
+    auto status = tensorflow::protobuf::util::JsonStringToMessage(
+        event.summary().value(0).metadata().plugin_data(0).content(),
+        &metadata);
+    ASSERT_TRUE(status.ok());
+    ASSERT_EQ(kDebugNodeKey.device_name, metadata.device());
+    ASSERT_EQ(kDebugNodeKey.output_slot, metadata.output_slot());
+
     Tensor a_prime(DT_FLOAT);
     ASSERT_TRUE(a_prime.FromProto(event.summary().value(0).tensor()));
 
@@ -333,10 +351,19 @@ TEST_F(DebugIOUtilsTest, PublishTensorConcurrentlyToPartiallyOverlappingPaths) {
 
       ASSERT_GE(wall_time, event.wall_time());
       ASSERT_EQ(1, event.summary().value().size());
-      ASSERT_EQ(kDebugNodeKey.device_name, event.summary().value(0).tag());
+      ASSERT_EQ(kDebugNodeKey.node_name, event.summary().value(0).tag());
       ASSERT_EQ(kDebugNodeKey.debug_node_name,
                 event.summary().value(0).node_name());
 
+      // Determine and validate some information from the metadata.
+      third_party::tensorflow::core::debug::DebuggerEventMetadata metadata;
+      auto status = tensorflow::protobuf::util::JsonStringToMessage(
+          event.summary().value(0).metadata().plugin_data(0).content(),
+          &metadata);
+      ASSERT_TRUE(status.ok());
+      ASSERT_EQ(kDebugNodeKey.device_name, metadata.device());
+      ASSERT_EQ(kDebugNodeKey.output_slot, metadata.output_slot());
+
       Tensor a_prime(DT_FLOAT);
       ASSERT_TRUE(a_prime.FromProto(event.summary().value(0).tensor()));
 
diff --git a/tensorflow/core/debug/debugger_event_metadata.proto b/tensorflow/core/debug/debugger_event_metadata.proto
new file mode 100644
index 0000000000000000000000000000000000000000..8bdedb1a508b7c579eadb7c02757c551ca5dcfbb
--- /dev/null
+++ b/tensorflow/core/debug/debugger_event_metadata.proto
@@ -0,0 +1,11 @@
+syntax = "proto3";
+
+package third_party.tensorflow.core.debug;
+
+// Encapsulates per-event data related to debugging.
+message DebuggerEventMetadata {
+  string device = 1;
+  int32 output_slot = 2;
+  int32 num_chunks = 3;
+  int32 chunk_index = 4;
+};
diff --git a/tensorflow/core/debug/grpc_session_debug_test.cc b/tensorflow/core/debug/grpc_session_debug_test.cc
index 3827596a67635ef3315c53cd2a788c73b64aae17..d6f35fe24c385c507681f4ceb4a3ee75b40ac9fb 100644
--- a/tensorflow/core/debug/grpc_session_debug_test.cc
+++ b/tensorflow/core/debug/grpc_session_debug_test.cc
@@ -279,9 +279,12 @@ TEST_F(GrpcSessionDebugTest, MultiDevices_String) {
 
         DeleteDumpDir();
       } else {
+        // CUDA and SYCL devices do not have an Identity op for strings
         LOG(ERROR) << "Error: " << s;
         ASSERT_TRUE((a_dev.device_type() == DEVICE_GPU) ||
-                    (b_dev.device_type() == DEVICE_GPU));
+                    (a_dev.device_type() == DEVICE_SYCL) ||
+                    (b_dev.device_type() == DEVICE_GPU) ||
+                    (b_dev.device_type() == DEVICE_SYCL));
         ASSERT_FALSE(s.ok());
       }
     }
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index efc08e4c9d05c0993d2eff50131e90a98d8c1384..d9ed40c50a2d2fe6eb1c54075544c50a31259f0d 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -153,6 +153,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:worker_proto_cc",
     ],
 )
@@ -205,6 +206,7 @@ cc_test(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -357,6 +359,7 @@ cc_library(
     srcs = ["graph_mgr.cc"],
     hdrs = ["graph_mgr.h"],
     deps = [
+        ":message_wrappers",
         ":rendezvous_mgr_interface",
         ":worker_env",
         "//tensorflow/core:core_cpu_internal",
@@ -399,7 +402,6 @@ cc_library(
     srcs = ["server_lib.cc"],
     hdrs = ["server_lib.h"],
     deps = [
-        "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -450,7 +452,11 @@ tf_cuda_cc_test(
     size = "small",
     srcs = ["remote_device_test.cc"],
     linkstatic = tf_kernel_tests_linkstatic(),
-    tags = tf_cuda_tests_tags() + ["manual"],
+    tags = tf_cuda_tests_tags() + [
+        "manual",
+        "notap",  # Memory leak due to b/62910646
+        "noguitar",  # Memory leak due to b/62910646
+    ],
     deps = [
         ":master",
         ":remote_device",
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
index e68aea46ecd436d557d8394c3544684965a81878..cb2fde7dba7de717251d129aae8aab97a0127ea9 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
@@ -127,11 +127,10 @@ void BaseRendezvousMgr::CleanupAll() {
   }
 }
 
-BaseRemoteRendezvous::BaseRemoteRendezvous(const WorkerEnv* env, int64 step_id,
-                                           bool tolerate_dup_recv)
+BaseRemoteRendezvous::BaseRemoteRendezvous(const WorkerEnv* env, int64 step_id)
     : env_(env),
       step_id_(step_id),
-      local_(NewLocalRendezvous(tolerate_dup_recv)),
+      local_(NewLocalRendezvous()),
       session_(nullptr) {}
 
 BaseRemoteRendezvous::~BaseRemoteRendezvous() {
@@ -248,14 +247,15 @@ void BaseRemoteRendezvous::SameWorkerRecvDone(
     return;
   }
 
+  WorkerSession* sess = session();
   Device* src_device;
-  Status s = env_->device_mgr->LookupDevice(parsed.src_device, &src_device);
+  Status s = sess->device_mgr->LookupDevice(parsed.src_device, &src_device);
   if (!s.ok()) {
     done(s);
     return;
   }
   Device* dst_device;
-  s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
+  s = sess->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
   if (!s.ok()) {
     done(s);
     return;
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
index b252f45fe96354f8e2a91a5aa3a05f1a937e3939..c5a56f324ab2d8f6a2e5c587875def47fd8ed9cf 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
@@ -118,8 +118,7 @@ class BaseRendezvousMgr : public RendezvousMgrInterface {
 // functionality to coordinate with remote workers.
 class BaseRemoteRendezvous : public RemoteRendezvous {
  public:
-  BaseRemoteRendezvous(const WorkerEnv* env, int64 step_id,
-                       bool tolerate_dup_recv);
+  BaseRemoteRendezvous(const WorkerEnv* env, int64 step_id);
 
   // Upgrades the BaseRemoteRendezvous to full initialization.
   Status Initialize(WorkerSession* session) override;
diff --git a/tensorflow/core/distributed_runtime/executor_test.cc b/tensorflow/core/distributed_runtime/executor_test.cc
index 17843ff6b060b6f7130a73fbfd279e72d58bac45..1a4980a61b208a2df6dd877d597b3ddf3d448b93 100644
--- a/tensorflow/core/distributed_runtime/executor_test.cc
+++ b/tensorflow/core/distributed_runtime/executor_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc
index f4bf9dcd3b92f85af161694ceab9377851d2834a..7f77bf8b4efeade512f6307f9caa64175d81cc33 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.cc
+++ b/tensorflow/core/distributed_runtime/graph_mgr.cc
@@ -30,7 +30,9 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/framework/cancellation.h"
 #include "tensorflow/core/framework/log_memory.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_partition.h"
@@ -440,10 +442,9 @@ void GraphMgr::RecvOutputsAsync(const int64 step_id, NamedTensors* out,
 }
 
 void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id,
-                            WorkerSession* session,
-                            const ExecutorOpts& /*opts*/,
+                            WorkerSession* session, const ExecutorOpts& opts,
                             StepStatsCollector* collector,
-                            CostGraphDef* cost_graph,
+                            MutableRunGraphResponseWrapper* response,
                             CancellationManager* cancellation_manager,
                             const NamedTensors& in, StatusCallback done) {
   // Lookup an item. Holds one ref while executing.
@@ -462,6 +463,18 @@ void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id,
     return;
   }
 
+  CostGraphDef* cost_graph = nullptr;
+  if (response != nullptr) {
+    cost_graph = response->mutable_cost_graph();
+    if (opts.record_partition_graphs()) {
+      for (const ExecutionUnit& unit : item->units) {
+        GraphDef graph_def;
+        unit.graph->ToGraphDef(&graph_def);
+        response->AddPartitionGraph(graph_def);
+      }
+    }
+  }
+
   RemoteRendezvous* rendezvous = worker_env_->rendezvous_mgr->Find(step_id);
   Status s = rendezvous->Initialize(session);
 
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.h b/tensorflow/core/distributed_runtime/graph_mgr.h
index 4ee3711d02861cb7e22f4ede21964658079be711..fb83720d2ac19830e44d21f89a937eee089f48c5 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.h
+++ b/tensorflow/core/distributed_runtime/graph_mgr.h
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/costmodel_manager.h"
 #include "tensorflow/core/common_runtime/executor.h"
+#include "tensorflow/core/distributed_runtime/message_wrappers.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
 #include "tensorflow/core/framework/cancellation.h"
 #include "tensorflow/core/framework/cost_graph.pb.h"
@@ -31,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/protobuf/debug.pb.h"
+#include "tensorflow/core/protobuf/worker.pb.h"
 
 namespace tensorflow {
 
@@ -80,7 +82,8 @@ class GraphMgr {
   typedef std::function<void(const Status&)> StatusCallback;
   void ExecuteAsync(const string& handle, const int64 step_id,
                     WorkerSession* session, const ExecutorOpts& opts,
-                    StepStatsCollector* collector, CostGraphDef* cost_graph,
+                    StepStatsCollector* collector,
+                    MutableRunGraphResponseWrapper* response,
                     CancellationManager* cancellation_manager,
                     const NamedTensors& in, StatusCallback done);
 
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index e3f23ef0dd011da4c21c94e4ae3a0e06bd4c8b97..4ff2d0f5e3ddfb3781dfdb0ea46409fe8962baa9 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -372,7 +372,7 @@ void Master::CreateSession(const CreateSessionRequest* req,
         DeviceNameUtils::ParsedName name = d->parsed_name();
         if (name.job == *worker_cache_factory_options.job_name &&
             name.task == worker_cache_factory_options.task_index &&
-            name.type == "CPU") {
+            name.type == "CPU" && name.id == 0) {
           device_set->set_client_device(d.get());
         }
       }
@@ -399,7 +399,8 @@ void Master::CreateSession(const CreateSessionRequest* req,
       }
     }
 
-    CHECK(device_set->client_device());
+    CHECK(device_set->client_device()) << "No client device found. Missing "
+                                       << "CPU:0 device?";
 
     SessionOptions options;
     options.config = req->config();
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 94fec4f6d00532d2cee605abb889d5fe723fcf5e..361e89290d28acb86732cf9f6653cecf3f5bc733 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -26,10 +26,13 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/scheduler.h"
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
+#include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_description.pb.h"
 #include "tensorflow/core/graph/graph_partition.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
@@ -510,6 +513,9 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
   if (pss->collect_rpcs) {
     SetRPCLogging(true);
   }
+  if (pss->collect_partition_graphs) {
+    exec_opts.set_record_partition_graphs(true);
+  }
   if (pss->collect_costs || pss->collect_timeline) {
     pss->step_stats.resize(partitions_.size());
   }
@@ -612,30 +618,39 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
   if (status.ok()) {
     for (int i = 0; i < num; ++i) {
       const Part& part = partitions_[i];
-      for (size_t j = 0; j < calls.get(i)->resp->num_recvs(); ++j) {
-        auto iter = part.key_fetch.find(calls.get(i)->resp->recv_key(j));
+      MutableRunGraphResponseWrapper* run_graph_resp = calls.get(i)->resp.get();
+      for (size_t j = 0; j < run_graph_resp->num_recvs(); ++j) {
+        auto iter = part.key_fetch.find(run_graph_resp->recv_key(j));
         if (iter == part.key_fetch.end()) {
           status.Update(errors::Internal("Unexpected fetch key: ",
-                                         calls.get(i)->resp->recv_key(j)));
+                                         run_graph_resp->recv_key(j)));
           break;
         }
         const string& fetch = iter->second;
-        status.Update(resp->AddTensorFromRunGraphResponse(
-            fetch, calls.get(i)->resp.get(), j));
+        status.Update(
+            resp->AddTensorFromRunGraphResponse(fetch, run_graph_resp, j));
         if (!status.ok()) {
           break;
         }
       }
       if (pss->collect_timeline) {
-        pss->step_stats[i].Swap(calls.get(i)->resp->mutable_step_stats());
+        pss->step_stats[i].Swap(run_graph_resp->mutable_step_stats());
       }
       if (pss->collect_costs) {
-        CostGraphDef* cost_graph = calls.get(i)->resp->mutable_cost_graph();
+        CostGraphDef* cost_graph = run_graph_resp->mutable_cost_graph();
         for (int j = 0; j < cost_graph->node_size(); ++j) {
           resp->mutable_metadata()->mutable_cost_graph()->add_node()->Swap(
               cost_graph->mutable_node(j));
         }
       }
+      if (pss->collect_partition_graphs) {
+        protobuf::RepeatedPtrField<GraphDef>* partition_graph_defs =
+            resp->mutable_metadata()->mutable_partition_graphs();
+        for (size_t i = 0; i < run_graph_resp->num_partition_graphs(); i++) {
+          partition_graph_defs->Add()->Swap(
+              run_graph_resp->mutable_partition_graph(i));
+        }
+      }
     }
   }
   return status;
@@ -993,8 +1008,7 @@ MasterSession::MasterSession(
           << " #remote " << remote_devs_->size();
 
   LOG(INFO) << "Start master session " << handle_
-            << " with config: " << std::endl
-            << session_opts_.config.DebugString();
+            << " with config: " << session_opts_.config.ShortDebugString();
 }
 
 MasterSession::~MasterSession() {
@@ -1165,7 +1179,6 @@ WorkerCacheInterface* MasterSession::get_worker_cache() const {
 Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
                                 ReffedClientGraph** rcg, bool is_partial) {
   const uint64 hash = HashBuildGraphOptions(opts);
-  ReffedClientGraph* to_unref = nullptr;
   {
     mutex_lock l(mu_);
     // Keep track of how many times this subgraph has been executed in
@@ -1196,7 +1209,6 @@ Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
     *rcg = iter->second;
     (*rcg)->Ref();
   }
-  if (to_unref) to_unref->Unref();
   return Status::OK();
 }
 
@@ -1361,6 +1373,7 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
     pss.collect_costs =
         build_cost_model_every > 0 &&
         ((count + 1 - build_cost_model_after) % build_cost_model_every == 0);
+    pss.collect_partition_graphs = req.options().output_partition_graphs();
 
     std::unique_ptr<ProfileHandler> ph = run_state->rcg->GetProfileHandler(
         run_state->step_id, count, req.options());
@@ -1517,6 +1530,7 @@ Status MasterSession::DoRunWithLocalExecution(
   pss.collect_costs =
       build_cost_model_every > 0 &&
       ((count + 1 - build_cost_model_after) % build_cost_model_every == 0);
+  pss.collect_partition_graphs = req.options().output_partition_graphs();
 
   std::unique_ptr<ProfileHandler> ph =
       rcg->GetProfileHandler(step_id, count, req.options());
diff --git a/tensorflow/core/distributed_runtime/master_session.h b/tensorflow/core/distributed_runtime/master_session.h
index 10fc4868caa31e177dea513a3efa9a2c676b5c04..33b9bfe631a5156f0070ac7dcc45b6e66675b1f4 100644
--- a/tensorflow/core/distributed_runtime/master_session.h
+++ b/tensorflow/core/distributed_runtime/master_session.h
@@ -145,6 +145,7 @@ class MasterSession : public core::RefCounted {
     bool collect_costs = false;
     bool collect_timeline = false;
     bool collect_rpcs = false;
+    bool collect_partition_graphs = false;
     Microseconds start_micros = Microseconds(0);
     Microseconds end_micros = Microseconds(0);
     std::vector<StepStats> step_stats;  // per partition
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.cc b/tensorflow/core/distributed_runtime/message_wrappers.cc
index b5b564375db3fa8398cab228c4e4791ab32321ba..a4a88e6e3b9ec734c0720b715dc9b3e30850c0ae 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.cc
+++ b/tensorflow/core/distributed_runtime/message_wrappers.cc
@@ -523,6 +523,19 @@ RunGraphResponse* InMemoryRunGraphResponse::get_proto() {
   return nullptr;
 }
 
+size_t InMemoryRunGraphResponse::num_partition_graphs() const {
+  return partition_graphs_.size();
+}
+
+GraphDef* InMemoryRunGraphResponse::mutable_partition_graph(size_t i) {
+  return &partition_graphs_[i];
+}
+
+void InMemoryRunGraphResponse::AddPartitionGraph(
+    const GraphDef& partition_graph) {
+  partition_graphs_.push_back(partition_graph);
+}
+
 size_t OwnedProtoRunGraphResponse::num_recvs() const {
   return response_.recv_size();
 }
@@ -563,6 +576,20 @@ CostGraphDef* OwnedProtoRunGraphResponse::mutable_cost_graph() {
 
 RunGraphResponse* OwnedProtoRunGraphResponse::get_proto() { return &response_; }
 
+size_t OwnedProtoRunGraphResponse::num_partition_graphs() const {
+  return response_.partition_graph_size();
+}
+
+GraphDef* OwnedProtoRunGraphResponse::mutable_partition_graph(size_t i) {
+  return response_.mutable_partition_graph(i);
+}
+
+void OwnedProtoRunGraphResponse::AddPartitionGraph(
+    const GraphDef& partition_graph) {
+  GraphDef* graph_def = response_.mutable_partition_graph()->Add();
+  *graph_def = partition_graph;
+}
+
 NonOwnedProtoRunGraphResponse::NonOwnedProtoRunGraphResponse(
     RunGraphResponse* response)
     : response_(response) {}
@@ -609,6 +636,20 @@ RunGraphResponse* NonOwnedProtoRunGraphResponse::get_proto() {
   return response_;
 }
 
+size_t NonOwnedProtoRunGraphResponse::num_partition_graphs() const {
+  return response_->partition_graph_size();
+}
+
+GraphDef* NonOwnedProtoRunGraphResponse::mutable_partition_graph(size_t i) {
+  return response_->mutable_partition_graph(i);
+}
+
+void NonOwnedProtoRunGraphResponse::AddPartitionGraph(
+    const GraphDef& partition_graph) {
+  GraphDef* graph_def = response_->add_partition_graph();
+  *graph_def = partition_graph;
+}
+
 MutableRunStepResponseWrapper::~MutableRunStepResponseWrapper() {}
 
 size_t InMemoryRunStepResponse::num_tensors() const { return tensors_.size(); }
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index f247b50dd5aa9fc8c9b496308a2746433f3a4c96..0e3f5b98cb58bb76f599ca67938a420c9b3ffdce 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/cost_graph.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor.pb_text.h"
@@ -424,6 +425,9 @@ class MutableRunGraphResponseWrapper {
   // execution, if necessary.
   virtual StepStats* mutable_step_stats() = 0;
   virtual CostGraphDef* mutable_cost_graph() = 0;
+  virtual size_t num_partition_graphs() const = 0;
+  virtual GraphDef* mutable_partition_graph(size_t i) = 0;
+  virtual void AddPartitionGraph(const GraphDef& partition_graph) = 0;
 
  protected:
   // Returns a mutable protobuf message that represents the contents of
@@ -451,6 +455,9 @@ class InMemoryRunGraphResponse : public MutableRunGraphResponseWrapper {
   void AddRecv(const string& key, const Tensor& value) override;
   StepStats* mutable_step_stats() override;
   CostGraphDef* mutable_cost_graph() override;
+  size_t num_partition_graphs() const override;
+  GraphDef* mutable_partition_graph(size_t i) override;
+  void AddPartitionGraph(const GraphDef& partition_graph) override;
 
  protected:
   // NOTE: This method is not implemented. See
@@ -461,6 +468,7 @@ class InMemoryRunGraphResponse : public MutableRunGraphResponseWrapper {
   gtl::InlinedVector<std::pair<string, Tensor>, 4> recvs_;
   StepStats step_stats_;
   CostGraphDef cost_graph_;
+  std::vector<GraphDef> partition_graphs_;
 };
 
 // Proto-based message wrapper for use on the client side of the RunGraph RPC.
@@ -474,6 +482,9 @@ class OwnedProtoRunGraphResponse : public MutableRunGraphResponseWrapper {
   void AddRecv(const string& key, const Tensor& value) override;
   StepStats* mutable_step_stats() override;
   CostGraphDef* mutable_cost_graph() override;
+  size_t num_partition_graphs() const override;
+  GraphDef* mutable_partition_graph(size_t i) override;
+  void AddPartitionGraph(const GraphDef& partition_graph) override;
 
  protected:
   RunGraphResponse* get_proto() override;
@@ -495,6 +506,9 @@ class NonOwnedProtoRunGraphResponse : public MutableRunGraphResponseWrapper {
   void AddRecv(const string& key, const Tensor& value) override;
   StepStats* mutable_step_stats() override;
   CostGraphDef* mutable_cost_graph() override;
+  size_t num_partition_graphs() const override;
+  GraphDef* mutable_partition_graph(size_t i) override;
+  void AddPartitionGraph(const GraphDef& partition_graph) override;
 
  protected:
   RunGraphResponse* get_proto() override;
diff --git a/tensorflow/core/distributed_runtime/message_wrappers_test.cc b/tensorflow/core/distributed_runtime/message_wrappers_test.cc
index 5b0c2945b991968696aa06b6a4686d475e58fbb7..f64d476d6dde3eeb0e3d2f1fa31d77a437a85122 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers_test.cc
+++ b/tensorflow/core/distributed_runtime/message_wrappers_test.cc
@@ -23,20 +23,21 @@ limitations under the License.
 #include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
+namespace {
 
-static Tensor TensorA() {
+Tensor TensorA() {
   Tensor a_tensor(DT_INT32, TensorShape({2, 2}));
   test::FillValues<int32>(&a_tensor, {3, 2, -1, 0});
   return a_tensor;
 }
 
-static Tensor TensorB() {
+Tensor TensorB() {
   Tensor b_tensor(DT_INT32, TensorShape({1, 2}));
   test::FillValues<int32>(&b_tensor, {1, 2});
   return b_tensor;
 }
 
-static void BuildRunStepRequest(MutableRunStepRequestWrapper* request) {
+void BuildRunStepRequest(MutableRunStepRequestWrapper* request) {
   request->set_session_handle("handle");
   request->set_partial_run_handle("partial_handle");
   request->add_feed("feed_a:0", TensorA());
@@ -48,7 +49,7 @@ static void BuildRunStepRequest(MutableRunStepRequestWrapper* request) {
   request->mutable_options()->set_timeout_in_ms(37);
 }
 
-static void CheckRunStepRequest(const RunStepRequestWrapper& request) {
+void CheckRunStepRequest(const RunStepRequestWrapper& request) {
   EXPECT_EQ("handle", request.session_handle());
   EXPECT_EQ("partial_handle", request.partial_run_handle());
   EXPECT_EQ(2, request.num_feeds());
@@ -68,9 +69,8 @@ static void CheckRunStepRequest(const RunStepRequestWrapper& request) {
   EXPECT_EQ(37, request.options().timeout_in_ms());
 }
 
-static void BuildRunGraphRequest(
-    const RunStepRequestWrapper& run_step_request,
-    MutableRunGraphRequestWrapper* run_graph_request) {
+void BuildRunGraphRequest(const RunStepRequestWrapper& run_step_request,
+                          MutableRunGraphRequestWrapper* run_graph_request) {
   run_graph_request->set_graph_handle("graph_handle");
   run_graph_request->set_step_id(13);
   run_graph_request->mutable_exec_opts()->set_record_timeline(true);
@@ -83,11 +83,12 @@ static void BuildRunGraphRequest(
   run_graph_request->set_is_partial(true);
 }
 
-static void CheckRunGraphRequest(const RunGraphRequestWrapper& request) {
+void CheckRunGraphRequest(const RunGraphRequestWrapper& request) {
   EXPECT_EQ("graph_handle", request.graph_handle());
   EXPECT_EQ(13, request.step_id());
   EXPECT_FALSE(request.exec_opts().record_costs());
   EXPECT_TRUE(request.exec_opts().record_timeline());
+  EXPECT_FALSE(request.exec_opts().record_partition_graphs());
   EXPECT_EQ(2, request.num_sends());
   Tensor val;
   TF_EXPECT_OK(request.SendValue(0, &val));
@@ -98,17 +99,20 @@ static void CheckRunGraphRequest(const RunGraphRequestWrapper& request) {
   EXPECT_FALSE(request.is_last_partial_run());
 }
 
-static void BuildRunGraphResponse(
-    MutableRunGraphResponseWrapper* run_graph_response) {
+void BuildRunGraphResponse(MutableRunGraphResponseWrapper* run_graph_response) {
   run_graph_response->AddRecv("recv_2", TensorA());
   run_graph_response->AddRecv("recv_3", TensorB());
   run_graph_response->mutable_step_stats()->add_dev_stats()->set_device(
       "/cpu:0");
   run_graph_response->mutable_cost_graph()->add_node()->set_name("cost_node");
+  GraphDef graph_def;
+  graph_def.mutable_versions()->set_producer(1234);
+  graph_def.mutable_versions()->set_min_consumer(1234);
+  run_graph_response->AddPartitionGraph(graph_def);
 }
 
-static void CheckRunGraphResponse(MutableRunGraphResponseWrapper* response) {
-  EXPECT_EQ(2, response->num_recvs());
+void CheckRunGraphResponse(MutableRunGraphResponseWrapper* response) {
+  ASSERT_EQ(2, response->num_recvs());
   EXPECT_EQ("recv_2", response->recv_key(0));
   EXPECT_EQ("recv_3", response->recv_key(1));
   Tensor val;
@@ -116,26 +120,34 @@ static void CheckRunGraphResponse(MutableRunGraphResponseWrapper* response) {
   test::ExpectTensorEqual<int32>(TensorA(), val);
   TF_EXPECT_OK(response->RecvValue(1, &val));
   test::ExpectTensorEqual<int32>(TensorB(), val);
-  EXPECT_EQ(1, response->mutable_step_stats()->dev_stats_size());
+  ASSERT_EQ(1, response->mutable_step_stats()->dev_stats_size());
   EXPECT_EQ("/cpu:0", response->mutable_step_stats()->dev_stats(0).device());
-  EXPECT_EQ(1, response->mutable_cost_graph()->node_size());
+  ASSERT_EQ(1, response->mutable_cost_graph()->node_size());
   EXPECT_EQ("cost_node", response->mutable_cost_graph()->node(0).name());
+  ASSERT_EQ(1, response->num_partition_graphs());
+  EXPECT_EQ(1234, response->mutable_partition_graph(0)->versions().producer());
+  EXPECT_EQ(1234,
+            response->mutable_partition_graph(0)->versions().min_consumer());
 }
 
-static void BuildRunStepResponse(
-    MutableRunGraphResponseWrapper* run_graph_response,
-    MutableRunStepResponseWrapper* run_step_response) {
+void BuildRunStepResponse(MutableRunGraphResponseWrapper* run_graph_response,
+                          MutableRunStepResponseWrapper* run_step_response) {
   TF_EXPECT_OK(run_step_response->AddTensorFromRunGraphResponse(
       "fetch_x:0", run_graph_response, 0));
   TF_EXPECT_OK(run_step_response->AddTensorFromRunGraphResponse(
       "fetch_y:0", run_graph_response, 1));
   *run_step_response->mutable_metadata()->mutable_step_stats() =
       *run_graph_response->mutable_step_stats();
+  protobuf::RepeatedPtrField<GraphDef>* partition_graph_defs =
+      run_step_response->mutable_metadata()->mutable_partition_graphs();
+  for (size_t i = 0; i < run_graph_response->num_partition_graphs(); i++) {
+    partition_graph_defs->Add()->Swap(
+        run_graph_response->mutable_partition_graph(i));
+  }
 }
 
-static void CheckRunStepResponse(
-    const MutableRunStepResponseWrapper& response) {
-  EXPECT_EQ(2, response.num_tensors());
+void CheckRunStepResponse(const MutableRunStepResponseWrapper& response) {
+  ASSERT_EQ(2, response.num_tensors());
   EXPECT_EQ("fetch_x:0", response.tensor_name(0));
   EXPECT_EQ("fetch_y:0", response.tensor_name(1));
   Tensor val;
@@ -143,8 +155,13 @@ static void CheckRunStepResponse(
   test::ExpectTensorEqual<int32>(TensorA(), val);
   TF_EXPECT_OK(response.TensorValue(1, &val));
   test::ExpectTensorEqual<int32>(TensorB(), val);
-  EXPECT_EQ(1, response.metadata().step_stats().dev_stats_size());
+  ASSERT_EQ(1, response.metadata().step_stats().dev_stats_size());
   EXPECT_EQ("/cpu:0", response.metadata().step_stats().dev_stats(0).device());
+  ASSERT_EQ(1, response.metadata().partition_graphs_size());
+  EXPECT_EQ(1234,
+            response.metadata().partition_graphs(0).versions().producer());
+  EXPECT_EQ(1234,
+            response.metadata().partition_graphs(0).versions().min_consumer());
 }
 
 TEST(MessageWrappers, RunStepRequest_Basic) {
@@ -323,4 +340,5 @@ TEST(MessageWrappers, RunStepResponse_Basic) {
   }
 }
 
+}  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 3ebc11614def34c6d76486004e8d5c60c1d0299d..61534ae3c982523c78a5446e39eb5cb373cd0a97 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -49,11 +49,12 @@ package(default_visibility = [
 
 cc_library(
     name = "grpc_util",
-    srcs = [],
+    srcs = ["grpc_util.cc"],
     hdrs = ["grpc_util.h"],
     deps = [
         "//tensorflow/core:lib",
         "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc_unsecure",
     ],
 )
 
@@ -108,6 +109,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:worker_proto_cc",
         "@grpc//:grpc++_unsecure",
     ],
@@ -119,6 +121,7 @@ cc_library(
     hdrs = ["grpc_call.h"],
     deps = [
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "@grpc//:grpc++_unsecure",
     ],
 )
@@ -138,6 +141,7 @@ cc_library(
         ":grpc_channel",
         ":grpc_client_cq_tag",
         ":grpc_remote_worker",
+        ":grpc_util",
         "//tensorflow/core:lib",
         "//tensorflow/core/distributed_runtime:worker_cache",
         "//tensorflow/core/distributed_runtime:worker_cache_logger",
@@ -192,6 +196,7 @@ cc_library(
         ":grpc_master_service_impl",
         ":grpc_util",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:master_proto_cc",
         "//tensorflow/core/distributed_runtime:call_options",
         "//tensorflow/core/distributed_runtime:master_interface",
@@ -209,6 +214,7 @@ cc_library(
         ":grpc_master_service_impl",
         ":grpc_util",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:master_proto_cc",
         "//tensorflow/core/distributed_runtime:master",
         "@grpc//:grpc++_unsecure",
@@ -381,6 +387,7 @@ cc_library(
         "//tensorflow/core/distributed_runtime:call_options",
         "//tensorflow/core/distributed_runtime:local_master",
         "//tensorflow/core/distributed_runtime:master_interface",
+        "//tensorflow/core/distributed_runtime:message_wrappers",
     ],
     alwayslink = 1,
 )
@@ -437,12 +444,26 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "grpc_util_test",
+    size = "small",
+    srcs = ["grpc_util_test.cc"],
+    deps = [
+        ":grpc_util",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:worker_proto_cc",
+        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc_unsecure",
+    ],
+)
+
 tf_cuda_cc_test(
     name = "grpc_session_test",
     size = "medium",
     srcs = ["grpc_session_test.cc"],
     linkstatic = tf_kernel_tests_linkstatic(),
-    tags = tf_cuda_tests_tags() + ["manual"],
+    tags = tf_cuda_tests_tags() + ["no_oss"],  # b/62956105: port conflicts.
     deps = [
         ":grpc_channel",
         ":grpc_server_lib",
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_call.h b/tensorflow/core/distributed_runtime/rpc/grpc_call.h
index e85b8ccbd39ac213406903397be5f064600c6cef..cb133737dd6f65a8fd1582745ae40ac93686710d 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_call.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_call.h
@@ -233,6 +233,11 @@ class Call : public UntypedCall<Service> {
   RequestMessage request;
   ResponseMessage response;
 
+  const std::multimap<::grpc::string_ref, ::grpc::string_ref>& client_metadata()
+      const {
+    return ctx_.client_metadata();
+  }
+
  private:
   // Creates a completion queue tag for handling cancellation by the client.
   // NOTE: This method must be called before this call is enqueued on a
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
index bcd2c71f841f783ea19c5565de9b71729bc112a1..7efc0ba6d8510fb0d462df13f7b3ebf68e939313 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
@@ -67,7 +67,7 @@ Status NewHostPortGrpcChannel(const string& target,
   // on connection failure, which makes our tests time out.
   args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
   *channel_pointer = ::grpc::CreateCustomChannel(
-      target, ::grpc::InsecureChannelCredentials(), args);
+      "dns:///" + target, ::grpc::InsecureChannelCredentials(), args);
   return Status::OK();
 }
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
index 07205bb2c2b88711e41f6061f894d2d43d69a2e7..41ee81c01d6ebb9085d3271eae86484bb786ecfb 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 
 namespace tensorflow {
@@ -172,6 +173,7 @@ class GrpcMasterService : public AsyncServiceInterface {
 
   // RPC handler for running one step in a session.
   void RunStepHandler(MasterCall<RunStepRequest, RunStepResponse>* call) {
+    auto* trace = TraceRpc("RunStep/Server", call->client_metadata());
     CallOptions* call_opts = new CallOptions;
     if (call->request.options().timeout_in_ms() > 0) {
       call_opts->SetTimeout(call->request.options().timeout_in_ms());
@@ -184,11 +186,12 @@ class GrpcMasterService : public AsyncServiceInterface {
         new NonOwnedProtoRunStepResponse(&call->response);
     call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
     master_impl_->RunStep(call_opts, wrapped_request, wrapped_response,
-                          [call, call_opts, wrapped_request,
-                           wrapped_response](const Status& status) {
+                          [call, call_opts, wrapped_request, wrapped_response,
+                           trace](const Status& status) {
                             call->ClearCancelCallback();
                             delete call_opts;
                             delete wrapped_request;
+                            delete trace;
                             call->SendResponse(ToGrpcStatus(status));
                           });
     ENQUEUE_REQUEST(RunStep, true);
@@ -224,6 +227,18 @@ class GrpcMasterService : public AsyncServiceInterface {
   }
 #undef ENQUEUE_REQUEST
 
+  // Start tracing, including the ID attached to the RPC.
+  port::Tracing::TraceMe* TraceRpc(
+      StringPiece name,
+      const std::multimap<::grpc::string_ref, ::grpc::string_ref>& metadata) {
+    StringPiece id;
+    auto it = metadata.find(GrpcIdKey());
+    if (it != metadata.end()) {
+      id = StringPiece(it->second.data(), it->second.size());
+    }
+    return new port::Tracing::TraceMe(name, id);
+  }
+
   TF_DISALLOW_COPY_AND_ASSIGN(GrpcMasterService);
 };
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
index c42622dd50703ab7a771e44494ebc977473a96c1..17d0047eb2c83dd32cfa655fe4560ae33fd938d2 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
@@ -49,74 +49,75 @@ MasterService::Stub::Stub(
     const std::shared_ptr< ::grpc::ChannelInterface>& channel)
     : channel_(channel),
       rpcmethod_CreateSession_(grpcMasterService_method_names[0],
-                               ::grpc::RpcMethod::NORMAL_RPC, channel),
+                               ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
       rpcmethod_ExtendSession_(grpcMasterService_method_names[1],
-                               ::grpc::RpcMethod::NORMAL_RPC, channel),
+                               ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
       rpcmethod_PartialRunSetup_(grpcMasterService_method_names[2],
-                                 ::grpc::RpcMethod::NORMAL_RPC, channel),
+                                 ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
       rpcmethod_RunStep_(grpcMasterService_method_names[3],
-                         ::grpc::RpcMethod::NORMAL_RPC, channel),
+                         ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
       rpcmethod_CloseSession_(grpcMasterService_method_names[4],
-                              ::grpc::RpcMethod::NORMAL_RPC, channel),
+                              ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
       rpcmethod_ListDevices_(grpcMasterService_method_names[5],
-                             ::grpc::RpcMethod::NORMAL_RPC, channel),
+                             ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
       rpcmethod_Reset_(grpcMasterService_method_names[6],
-                       ::grpc::RpcMethod::NORMAL_RPC, channel) {}
+                       ::grpc::internal::RpcMethod::NORMAL_RPC, channel) {}
 
 ::grpc::Status MasterService::Stub::CreateSession(
     ::grpc::ClientContext* context, const CreateSessionRequest& request,
     CreateSessionResponse* response) {
-  return ::grpc::BlockingUnaryCall(channel_.get(), rpcmethod_CreateSession_,
+  return ::grpc::internal::BlockingUnaryCall(channel_.get(), rpcmethod_CreateSession_,
                                    context, request, response);
 }
 
 ::grpc::Status MasterService::Stub::ExtendSession(
     ::grpc::ClientContext* context, const ExtendSessionRequest& request,
     ExtendSessionResponse* response) {
-  return ::grpc::BlockingUnaryCall(channel_.get(), rpcmethod_ExtendSession_,
+  return ::grpc::internal::BlockingUnaryCall(channel_.get(), rpcmethod_ExtendSession_,
                                    context, request, response);
 }
 
 ::grpc::Status MasterService::Stub::PartialRunSetup(
     ::grpc::ClientContext* context, const PartialRunSetupRequest& request,
     PartialRunSetupResponse* response) {
-  return ::grpc::BlockingUnaryCall(channel_.get(), rpcmethod_PartialRunSetup_,
+  return ::grpc::internal::BlockingUnaryCall(channel_.get(), rpcmethod_PartialRunSetup_,
                                    context, request, response);
 }
 
 ::grpc::Status MasterService::Stub::RunStep(::grpc::ClientContext* context,
                                             const RunStepRequest& request,
                                             RunStepResponse* response) {
-  return ::grpc::BlockingUnaryCall(channel_.get(), rpcmethod_RunStep_, context,
+  return ::grpc::internal::BlockingUnaryCall(channel_.get(), rpcmethod_RunStep_, context,
                                    request, response);
 }
 
 ::grpc::Status MasterService::Stub::CloseSession(
     ::grpc::ClientContext* context, const CloseSessionRequest& request,
     CloseSessionResponse* response) {
-  return ::grpc::BlockingUnaryCall(channel_.get(), rpcmethod_CloseSession_,
+  return ::grpc::internal::BlockingUnaryCall(channel_.get(), rpcmethod_CloseSession_,
                                    context, request, response);
 }
 
 ::grpc::Status MasterService::Stub::ListDevices(
     ::grpc::ClientContext* context, const ListDevicesRequest& request,
     ListDevicesResponse* response) {
-  return ::grpc::BlockingUnaryCall(channel_.get(), rpcmethod_ListDevices_,
+  return ::grpc::internal::BlockingUnaryCall(channel_.get(), rpcmethod_ListDevices_,
                                    context, request, response);
 }
 
 ::grpc::Status MasterService::Stub::Reset(::grpc::ClientContext* context,
                                           const ResetRequest& request,
                                           ResetResponse* response) {
-  return ::grpc::BlockingUnaryCall(channel_.get(), rpcmethod_Reset_, context,
+  return ::grpc::internal::BlockingUnaryCall(channel_.get(), rpcmethod_Reset_, context,
                                    request, response);
 }
 
 MasterService::AsyncService::AsyncService() {
   for (int i = 0; i < 7; ++i) {
-    AddMethod(new ::grpc::RpcServiceMethod(grpcMasterService_method_names[i],
-                                           ::grpc::RpcMethod::NORMAL_RPC,
-                                           nullptr));
+    AddMethod(new ::grpc::internal::RpcServiceMethod(
+        grpcMasterService_method_names[i],
+        ::grpc::internal::RpcMethod::NORMAL_RPC,
+        nullptr));
     ::grpc::Service::MarkMethodAsync(i);
   }
 }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
index a3a2ac80200203cc7c90b6f76af9143bb8b41af1..412395c52635d5c3cda95dddea50f7cd2d8c8e4f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
@@ -53,7 +53,7 @@ namespace grpc {
 // definition in "//tensorflow/core/protobuf/master_service.proto",
 // and the gRPC generated stub and service classes.
 // See that file for the definition of methods and messages.
-class MasterService GRPC_FINAL {
+class MasterService final {
  public:
   class StubInterface {
    public:
@@ -80,40 +80,40 @@ class MasterService GRPC_FINAL {
                                  const ResetRequest& request,
                                  ResetResponse* response) = 0;
   };
-  class Stub GRPC_FINAL : public StubInterface {
+  class Stub final : public StubInterface {
    public:
     Stub(const std::shared_ptr< ::grpc::ChannelInterface>& channel);
     ::grpc::Status CreateSession(::grpc::ClientContext* context,
                                  const CreateSessionRequest& request,
-                                 CreateSessionResponse* response) GRPC_OVERRIDE;
+                                 CreateSessionResponse* response) override;
     ::grpc::Status ExtendSession(::grpc::ClientContext* context,
                                  const ExtendSessionRequest& request,
-                                 ExtendSessionResponse* response) GRPC_OVERRIDE;
+                                 ExtendSessionResponse* response) override;
     ::grpc::Status PartialRunSetup(
         ::grpc::ClientContext* context, const PartialRunSetupRequest& request,
-        PartialRunSetupResponse* response) GRPC_OVERRIDE;
+        PartialRunSetupResponse* response) override;
     ::grpc::Status RunStep(::grpc::ClientContext* context,
                            const RunStepRequest& request,
-                           RunStepResponse* response) GRPC_OVERRIDE;
+                           RunStepResponse* response) override;
     ::grpc::Status CloseSession(::grpc::ClientContext* context,
                                 const CloseSessionRequest& request,
-                                CloseSessionResponse* response) GRPC_OVERRIDE;
+                                CloseSessionResponse* response) override;
     ::grpc::Status ListDevices(::grpc::ClientContext* context,
                                const ListDevicesRequest& request,
-                               ListDevicesResponse* response) GRPC_OVERRIDE;
+                               ListDevicesResponse* response) override;
     ::grpc::Status Reset(::grpc::ClientContext* context,
                          const ResetRequest& request,
-                         ResetResponse* response) GRPC_OVERRIDE;
+                         ResetResponse* response) override;
 
    private:
     std::shared_ptr< ::grpc::ChannelInterface> channel_;
-    const ::grpc::RpcMethod rpcmethod_CreateSession_;
-    const ::grpc::RpcMethod rpcmethod_ExtendSession_;
-    const ::grpc::RpcMethod rpcmethod_PartialRunSetup_;
-    const ::grpc::RpcMethod rpcmethod_RunStep_;
-    const ::grpc::RpcMethod rpcmethod_CloseSession_;
-    const ::grpc::RpcMethod rpcmethod_ListDevices_;
-    const ::grpc::RpcMethod rpcmethod_Reset_;
+    const ::grpc::internal::RpcMethod rpcmethod_CreateSession_;
+    const ::grpc::internal::RpcMethod rpcmethod_ExtendSession_;
+    const ::grpc::internal::RpcMethod rpcmethod_PartialRunSetup_;
+    const ::grpc::internal::RpcMethod rpcmethod_RunStep_;
+    const ::grpc::internal::RpcMethod rpcmethod_CloseSession_;
+    const ::grpc::internal::RpcMethod rpcmethod_ListDevices_;
+    const ::grpc::internal::RpcMethod rpcmethod_Reset_;
   };
   static std::unique_ptr<Stub> NewStub(
       const std::shared_ptr< ::grpc::ChannelInterface>& channel,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
index bf72d9a7fcdb5e027be968e94c85970b6b127c14..c04aa449413ad5817bf8312cffcb124b04c66262 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
@@ -23,6 +23,8 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 
 namespace tensorflow {
@@ -66,6 +68,7 @@ class GrpcRemoteMaster : public MasterInterface {
   Status RunStep(CallOptions* call_options, RunStepRequestWrapper* request,
                  MutableRunStepResponseWrapper* response) override {
     ::grpc::ClientContext ctx;
+    auto trace = TraceRpc("RunStep/Client", &ctx);
     ctx.set_fail_fast(false);
     SetDeadline(&ctx, call_options->GetTimeout());
     return FromGrpcStatus(stub_->RunStep(&ctx, request->ToProto(),
@@ -99,6 +102,14 @@ class GrpcRemoteMaster : public MasterInterface {
   }
 
  private:
+  // Start tracing, attaching a unique ID to both the trace and the RPC.
+  port::Tracing::TraceMe TraceRpc(StringPiece name,
+                                  ::grpc::ClientContext* ctx) {
+    string trace_id = strings::StrCat(port::Tracing::UniqueId());
+    ctx->AddMetadata(GrpcIdKey(), trace_id);
+    return port::Tracing::TraceMe(name, trace_id);
+  }
+
   std::unique_ptr<grpc::MasterService::Stub> stub_;
 
   void SetDeadline(::grpc::ClientContext* ctx, int64 time_in_ms) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index 1e8c30bad552ad7a5763b1b47a8ca12bf068a379..e0523baccd956660c0a0c8a325481d2f2e5ce827 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -17,10 +17,12 @@ limitations under the License.
 
 #include <utility>
 
+#include "grpc++/generic/generic_stub.h"
 #include "grpc++/grpc++.h"
 
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/distributed_runtime/worker_cache_logger.h"
@@ -28,18 +30,40 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/grpc_response_reader.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
 
 namespace tensorflow {
 
+// Overload of GrpcParseProto so we can decode a TensorResponse without
+// extra copying.
+bool GrpcParseProto(const ::grpc::ByteBuffer& src, TensorResponse* dst) {
+  struct ByteSource : public TensorResponse::Source {
+    const ::grpc::ByteBuffer* buffer;
+    GrpcByteBufferSource src;
+    bool ok;
+
+    ::tensorflow::protobuf::io::ZeroCopyInputStream* contents() override {
+      ok = src.Init(*buffer);
+      return &src;
+    }
+  };
+  ByteSource bs;
+  bs.buffer = &src;
+  return dst->ParseFrom(&bs).ok() && bs.ok;
+}
+
 class GrpcRemoteWorker : public WorkerInterface {
  public:
-  explicit GrpcRemoteWorker(SharedGrpcChannelPtr channel,
+  explicit GrpcRemoteWorker(GrpcCounter* live_rpc_counter,
+                            SharedGrpcChannelPtr channel,
                             ::grpc::CompletionQueue* completion_queue,
                             WorkerCacheLogger* logger)
-      : channel_(std::move(channel)),
+      : counter_(live_rpc_counter),
+        channel_(std::move(channel)),
+        stub_(channel_),
         cq_(completion_queue),
         getstatus_(Method(GrpcWorkerMethod::kGetStatus)),
         createworkersession_(Method(GrpcWorkerMethod::kCreateWorkerSession)),
@@ -188,82 +212,139 @@ class GrpcRemoteWorker : public WorkerInterface {
 
  private:
   // Object allocated per active RPC.
-  template <class RequestMessage, class ResponseMessage>
-  class RPCState final : public GrpcClientCQTag {
+  template <class ResponseMessage>
+  class RPCState : public GrpcClientCQTag {
    public:
-    RPCState(::grpc::ChannelInterface* channel, ::grpc::CompletionQueue* cq,
-             const ::grpc::RpcMethod& method, const RequestMessage& request,
+    RPCState(GrpcCounter* counter, ::grpc::GenericStub* stub,
+             ::grpc::CompletionQueue* cq, const ::grpc::string& method,
+             const protobuf::Message& request, ResponseMessage* response,
              StatusCallback done, CallOptions* call_opts)
-        : call_opts_(call_opts),
-          reader_(channel, cq, method, InitContext(call_opts), request),
-          done_(std::move(done)) {}
-
-    ~RPCState() override {}
+        : counter_(counter), call_opts_(call_opts), done_(std::move(done)) {
+      // TODO(sanjay): The counter will no longer be needed once we
+      // get a GenericStub API which allows us to manage an entire
+      // RPC with a single completion event instead of four events.
+      counter_->Increment();
+      // The initialization and recovery protocols rely on blocking
+      // until we get a response.
+      context_.set_fail_fast(false);
+      if (call_opts) {
+        call_opts->SetCancelCallback([this]() { context_.TryCancel(); });
+      }
 
-    void StartRPC(ResponseMessage* response) {
-      reader_.Finish(response, &status_, this);
+      failure_.store(false);
+      remaining_callbacks_.store(4);  // Init/Read/Write/Finish callbacks
+      response_ = response;
+      GrpcUnparseProto(request, &request_buf_);
+      // TODO(sanjay): When new enough grpc is available, enable the following:
+      //   context_.set_initial_metadata_corked(true);
+      // We can then skip the extra state transition for init callback.
+      call_ = std::move(stub->Call(&context_, method, cq, this));
+      call_initialized_.Notify();
     }
 
+    // Called multiple times: when init done, read done, write done, call done.
     void OnCompleted(bool ok) override {
-      if (!ok) {
-        VLOG(2) << "Call returned with non-ok status: "
-                << status_.error_message();
-      }
-      if (call_opts_) {
-        call_opts_->ClearCancelCallback();
+      if (!ok) failure_.store(true);
+      const int old_count = remaining_callbacks_.fetch_sub(1);
+      if (old_count > 1) {
+        if (old_count == 4) {
+          // Init callback finished.  Issue remaining ops.
+
+          // Annoyingly enough, the way the generic call API works is
+          // inherently racy.  We can get the following sequence of events:
+          //  1. stub->Call() starts.
+          //  2. some stuff happens inside grpc
+          //  3. grpc delivers the completion event
+          //  4. tensorflow event handling thread calls init metadata callback
+          //  5. stub->Call() finishes
+          //  6. the result of stub->Call() is stored in call_
+          // We are currently inside the callback and therefore need to
+          // wait for step 6 to finish before attempting to touch call_.
+          call_initialized_.WaitForNotification();
+
+          if (ok) {
+            // TODO(sanjay): Use WriteLast() when grpc version we are using
+            // is new enough.
+            call_->Write(request_buf_, this);
+            call_->Read(&response_buf_, this);
+          } else {
+            // Skip Write and Read.
+            remaining_callbacks_.fetch_sub(2);
+          }
+          call_->Finish(&status_, this);
+        }
+        // Still waiting for some more callbacks to finish.
+        return;
+      } else {  // old_count == 1, i.e., all callbacks have finished
+        // Last callback finished; clean up.
+        if (call_opts_) {
+          call_opts_->ClearCancelCallback();
+        }
+        Status s = FromGrpcStatus(status_);
+        if (s.ok() && failure_.load()) {
+          s.Update(errors::Internal("callback error"));
+        }
+        if (s.ok() && !GrpcParseProto(response_buf_, response_)) {
+          s.Update(errors::Internal("could not parse rpc response"));
+        }
+        if (!s.ok()) {
+          VLOG(2) << "Call returned with non-ok status: " << s;
+        }
+        done_(s);
+        counter_->Decrement();
+        delete this;
       }
-      done_(FromGrpcStatus(status_));
-      delete this;
     }
 
    private:
+    GrpcCounter* const counter_;
     CallOptions* call_opts_;
     ::grpc::ClientContext context_;
-    ::grpc::ClientAsyncResponseReader<ResponseMessage> reader_;
+    std::unique_ptr<::grpc::GenericClientAsyncReaderWriter> call_;
+    ResponseMessage* response_;
+    ::grpc::ByteBuffer request_buf_;
+    ::grpc::ByteBuffer response_buf_;
     ::grpc::Status status_;
     StatusCallback done_;
-
-    ::grpc::ClientContext* InitContext(CallOptions* call_opts) {
-      // The initialization and recovery protocols rely on blocking
-      // until we get a response.
-      context_.set_fail_fast(false);
-      if (call_opts) {
-        call_opts->SetCancelCallback([this]() { context_.TryCancel(); });
-      }
-      return &context_;
-    }
+    std::atomic<bool> failure_;
+    std::atomic<int> remaining_callbacks_;
+    Notification call_initialized_;
   };
 
   // Utility method for issuing a generic asynchronous request. The
   // given callback, `done`, will be called when the RPC completes.
-  template <class RequestMessage, class ResponseMessage>
-  void IssueRequest(const RequestMessage* request, ResponseMessage* response,
-                    const ::grpc::RpcMethod& method, StatusCallback done,
+  void IssueRequest(const protobuf::Message* request,
+                    protobuf::Message* response, const ::grpc::string& method,
+                    StatusCallback done, CallOptions* call_opts = nullptr) {
+    new RPCState<protobuf::Message>(counter_, &stub_, cq_, method, *request,
+                                    response, std::move(done), call_opts);
+  }
+  void IssueRequest(const protobuf::Message* request, TensorResponse* response,
+                    const ::grpc::string& method, StatusCallback done,
                     CallOptions* call_opts = nullptr) {
-    auto state = new RPCState<RequestMessage, ResponseMessage>(
-        channel_.get(), cq_, method, *request, std::move(done), call_opts);
-    state->StartRPC(response);
+    new RPCState<TensorResponse>(counter_, &stub_, cq_, method, *request,
+                                 response, std::move(done), call_opts);
   }
 
   // Helper function for initializing the RpcMethod objects below.
-  ::grpc::RpcMethod Method(GrpcWorkerMethod id) {
-    return ::grpc::RpcMethod(GrpcWorkerMethodName(id),
-                             ::grpc::RpcMethod::NORMAL_RPC, channel_);
-  }
+  const char* Method(GrpcWorkerMethod id) { return GrpcWorkerMethodName(id); }
 
+  GrpcCounter* const counter_;
   SharedGrpcChannelPtr channel_;
+  ::grpc::GenericStub stub_;
+
   ::grpc::CompletionQueue* cq_;
 
-  const ::grpc::RpcMethod getstatus_;
-  const ::grpc::RpcMethod createworkersession_;
-  const ::grpc::RpcMethod registergraph_;
-  const ::grpc::RpcMethod deregistergraph_;
-  const ::grpc::RpcMethod rungraph_;
-  const ::grpc::RpcMethod cleanupgraph_;
-  const ::grpc::RpcMethod cleanupall_;
-  const ::grpc::RpcMethod recvtensor_;
-  const ::grpc::RpcMethod logging_;
-  const ::grpc::RpcMethod tracing_;
+  const ::grpc::string getstatus_;
+  const ::grpc::string createworkersession_;
+  const ::grpc::string registergraph_;
+  const ::grpc::string deregistergraph_;
+  const ::grpc::string rungraph_;
+  const ::grpc::string cleanupgraph_;
+  const ::grpc::string cleanupall_;
+  const ::grpc::string recvtensor_;
+  const ::grpc::string logging_;
+  const ::grpc::string tracing_;
 
   // Support for logging.
   WorkerCacheLogger* logger_;
@@ -271,10 +352,12 @@ class GrpcRemoteWorker : public WorkerInterface {
   TF_DISALLOW_COPY_AND_ASSIGN(GrpcRemoteWorker);
 };
 
-WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel,
+WorkerInterface* NewGrpcRemoteWorker(GrpcCounter* live_rpc_counter,
+                                     SharedGrpcChannelPtr channel,
                                      ::grpc::CompletionQueue* completion_queue,
                                      WorkerCacheLogger* logger) {
-  return new GrpcRemoteWorker(std::move(channel), completion_queue, logger);
+  return new GrpcRemoteWorker(live_rpc_counter, std::move(channel),
+                              completion_queue, logger);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
index 8ad41335409e0a7f7576134ed12b1a233aa341e0..174dfcc7072f49c3831b74a90f602ebcfd87b453 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
@@ -26,10 +26,12 @@ class CompletionQueue;
 
 namespace tensorflow {
 
+class GrpcCounter;
 class WorkerCacheLogger;
 class WorkerInterface;
 
-WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel,
+WorkerInterface* NewGrpcRemoteWorker(GrpcCounter* live_rpc_counter,
+                                     SharedGrpcChannelPtr channel,
                                      ::grpc::CompletionQueue* completion_queue,
                                      WorkerCacheLogger* logger);
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
index e7eca62fdfc5890fd35ebafc5f028cd3b1eef6b7..b35d4843e8482dc15c6013f9cd0486f8feea754a 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
 
 #include "grpc++/impl/codegen/proto_utils.h"
+#include "grpc++/support/slice.h"
 
 namespace grpc {
 
@@ -24,7 +25,7 @@ namespace tensorflow_helper {
 
 const int kGrpcBufferWriterMaxBufferLength = 8192;
 
-class GrpcBufferWriter GRPC_FINAL
+class GrpcBufferWriter final
     : public ::grpc::protobuf::io::ZeroCopyOutputStream {
  public:
   explicit GrpcBufferWriter(grpc_byte_buffer** bp, int block_size)
@@ -33,52 +34,56 @@ class GrpcBufferWriter GRPC_FINAL
     slice_buffer_ = &(*bp)->data.raw.slice_buffer;
   }
 
-  ~GrpcBufferWriter() GRPC_OVERRIDE {
+  ~GrpcBufferWriter() override {
     if (have_backup_) {
-      g_core_codegen_interface->gpr_slice_unref(backup_slice_);
+      g_core_codegen_interface->grpc_slice_unref(backup_slice_);
     }
   }
 
-  bool Next(void** data, int* size) GRPC_OVERRIDE {
+  bool Next(void** data, int* size) override {
     if (have_backup_) {
       slice_ = backup_slice_;
       have_backup_ = false;
     } else {
-      slice_ = g_core_codegen_interface->gpr_slice_malloc(block_size_);
+      slice_ = g_core_codegen_interface->grpc_slice_malloc(block_size_);
     }
-    *data = GPR_SLICE_START_PTR(slice_);
+    *data = GRPC_SLICE_START_PTR(slice_);
     // On win x64, int is only 32bit
-    GPR_CODEGEN_ASSERT(GPR_SLICE_LENGTH(slice_) <= INT_MAX);
-    byte_count_ += * size = (int)GPR_SLICE_LENGTH(slice_);
-    g_core_codegen_interface->gpr_slice_buffer_add(slice_buffer_, slice_);
+    GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
+    byte_count_ += * size = (int)GRPC_SLICE_LENGTH(slice_);
+    g_core_codegen_interface->grpc_slice_buffer_add(slice_buffer_, slice_);
     return true;
   }
 
-  void BackUp(int count) GRPC_OVERRIDE {
-    g_core_codegen_interface->gpr_slice_buffer_pop(slice_buffer_);
+  void BackUp(int count) override {
+    g_core_codegen_interface->grpc_slice_buffer_pop(slice_buffer_);
     if (count == block_size_) {
       backup_slice_ = slice_;
     } else {
-      backup_slice_ = g_core_codegen_interface->gpr_slice_split_tail(
-          &slice_, GPR_SLICE_LENGTH(slice_) - count);
-      g_core_codegen_interface->gpr_slice_buffer_add(slice_buffer_, slice_);
+      backup_slice_ = g_core_codegen_interface->grpc_slice_split_tail(
+          &slice_, GRPC_SLICE_LENGTH(slice_) - count);
+      g_core_codegen_interface->grpc_slice_buffer_add(slice_buffer_, slice_);
     }
-    have_backup_ = true;
+    // It's dangerous to keep an inlined grpc_slice as the backup slice, since
+    // on a following Next() call, a reference will be returned to this slice
+    // via GRPC_SLICE_START_PTR, which will not be an adddress held by
+    // slice_buffer_.
+    have_backup_ = backup_slice_.refcount != NULL;
     byte_count_ -= count;
   }
 
-  grpc::protobuf::int64 ByteCount() const GRPC_OVERRIDE { return byte_count_; }
+  grpc::protobuf::int64 ByteCount() const override { return byte_count_; }
 
  private:
   const int block_size_;
   int64_t byte_count_;
-  gpr_slice_buffer* slice_buffer_;
+  grpc_slice_buffer* slice_buffer_;
   bool have_backup_;
-  gpr_slice backup_slice_;
-  gpr_slice slice_;
+  grpc_slice backup_slice_;
+  grpc_slice slice_;
 };
 
-class GrpcBufferReader GRPC_FINAL
+class GrpcBufferReader final
     : public ::grpc::protobuf::io::ZeroCopyInputStream {
   typedef void (CoreCodegenInterface::*OldReaderInitAPI)(
       grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer);
@@ -100,13 +105,13 @@ class GrpcBufferReader GRPC_FINAL
     ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_,
                buffer);
   }
-  ~GrpcBufferReader() GRPC_OVERRIDE {
+  ~GrpcBufferReader() override {
     g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_);
   }
 
-  bool Next(const void** data, int* size) GRPC_OVERRIDE {
+  bool Next(const void** data, int* size) override {
     if (backup_count_ > 0) {
-      *data = GPR_SLICE_START_PTR(slice_) + GPR_SLICE_LENGTH(slice_) -
+      *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) -
               backup_count_;
       GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX);
       *size = (int)backup_count_;
@@ -117,17 +122,17 @@ class GrpcBufferReader GRPC_FINAL
                                                                 &slice_)) {
       return false;
     }
-    g_core_codegen_interface->gpr_slice_unref(slice_);
-    *data = GPR_SLICE_START_PTR(slice_);
+    g_core_codegen_interface->grpc_slice_unref(slice_);
+    *data = GRPC_SLICE_START_PTR(slice_);
     // On win x64, int is only 32bit
-    GPR_CODEGEN_ASSERT(GPR_SLICE_LENGTH(slice_) <= INT_MAX);
-    byte_count_ += * size = (int)GPR_SLICE_LENGTH(slice_);
+    GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
+    byte_count_ += * size = (int)GRPC_SLICE_LENGTH(slice_);
     return true;
   }
 
-  void BackUp(int count) GRPC_OVERRIDE { backup_count_ = count; }
+  void BackUp(int count) override { backup_count_ = count; }
 
-  bool Skip(int count) GRPC_OVERRIDE {
+  bool Skip(int count) override {
     const void* data;
     int size;
     while (Next(&data, &size)) {
@@ -142,7 +147,7 @@ class GrpcBufferReader GRPC_FINAL
     return false;
   }
 
-  grpc::protobuf::int64 ByteCount() const GRPC_OVERRIDE {
+  grpc::protobuf::int64 ByteCount() const override {
     return byte_count_ - backup_count_;
   }
 
@@ -150,7 +155,7 @@ class GrpcBufferReader GRPC_FINAL
   int64_t byte_count_;
   int64_t backup_count_;
   grpc_byte_buffer_reader reader_;
-  gpr_slice slice_;
+  grpc_slice slice_;
 };
 
 }  // namespace tensorflow_helper
@@ -171,12 +176,12 @@ class UnlimitedSizeProtoSerializationTraits {
       return Status(StatusCode::INTERNAL, "Message length was negative");
     } else if (byte_size <=
                tensorflow_helper::kGrpcBufferWriterMaxBufferLength) {
-      gpr_slice slice = g_core_codegen_interface->gpr_slice_malloc(byte_size);
+      grpc_slice slice = g_core_codegen_interface->grpc_slice_malloc(byte_size);
       GPR_CODEGEN_ASSERT(
-          GPR_SLICE_END_PTR(slice) ==
-          msg.SerializeWithCachedSizesToArray(GPR_SLICE_START_PTR(slice)));
+          GRPC_SLICE_END_PTR(slice) ==
+          msg.SerializeWithCachedSizesToArray(GRPC_SLICE_START_PTR(slice)));
       *bp = g_core_codegen_interface->grpc_raw_byte_buffer_create(&slice, 1);
-      g_core_codegen_interface->gpr_slice_unref(slice);
+      g_core_codegen_interface->grpc_slice_unref(slice);
       return g_core_codegen_interface->ok();
     } else {
       tensorflow_helper::GrpcBufferWriter writer(
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
index fbde7aa9240fe479bfb8d71433ab43d7311a6434..9a08335c1c93c56e8bbd61a76bae211482555e62 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
@@ -31,16 +31,14 @@ limitations under the License.
 
 namespace tensorflow {
 
+const char* const kSchemePrefix = "grpc://";
+const size_t kSchemePrefixLength = strlen(kSchemePrefix);
+
 GrpcSession::GrpcSession(const SessionOptions& options)
     : options_(options), current_graph_version_(-1) {}
 
 GrpcSession::~GrpcSession() {}
 
-namespace {
-const char* kSchemePrefix = "grpc://";
-const size_t kSchemePrefixLength = strlen(kSchemePrefix);
-}  // namespace
-
 /* static */
 Status GrpcSession::Create(const SessionOptions& options,
                            std::unique_ptr<GrpcSession>* out_session) {
@@ -76,7 +74,7 @@ void ReEncodeConsts(GraphDef* gdef) {
         }
       }
       if (proto != nullptr && proto->tensor_content().empty() &&
-          proto->ByteSize() > 64) {
+          proto->ByteSizeLong() > 64) {
         // If the constant is encoded with repeated proto fields and
         // it is moderate large, we re-encode it in tensor_content as
         // a Cord. This is mildly helpful for reducing the peak memory
@@ -192,12 +190,14 @@ Status GrpcSession::RunHelper(
     req->add_feed(it.first, it.second);
   }
 
-  // Build an index from fetch tensor name to offset.
+  // Build an index from fetch tensor name to first index in
+  // output_tensor_names.
   std::unordered_map<string, int> output_name_to_offset;
-  for (const string& output_name : output_tensor_names) {
-    req->add_fetch(output_name);
-    output_name_to_offset.insert(
-        std::make_pair(output_name, output_name_to_offset.size()));
+  for (int i = 0; i < output_tensor_names.size(); ++i) {
+    const string& name = output_tensor_names[i];
+    if (output_name_to_offset.insert(std::make_pair(name, i)).second) {
+      req->add_fetch(name);
+    }
   }
   for (const string& target : target_node_names) {
     req->add_target(target);
@@ -223,6 +223,17 @@ Status GrpcSession::RunHelper(
     TF_RETURN_IF_ERROR(resp->TensorValue(i, &output));
     (*outputs)[fetch_it->second] = output;
   }
+  // In the unlikely event that output_tensor_names contains duplicates, fill in
+  // the duplicate values.
+  if (output_name_to_offset.size() != output_tensor_names.size()) {
+    for (int i = 0; i < output_tensor_names.size(); ++i) {
+      const string& name = output_tensor_names[i];
+      int offset = output_name_to_offset[name];
+      if (offset != i) {
+        (*outputs)[i] = (*outputs)[offset];
+      }
+    }
+  }
 
   if (run_metadata) {
     run_metadata->Swap(resp->mutable_metadata());
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
index 405b2939ebd5c03a5c8bfefc1e0dae031e3cb669..b673f200ccaaccbdab7b0f589af3d3450a6c44b6 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
@@ -183,6 +183,33 @@ TEST(GrpcSessionTest, NonLocalWithFilters) {
   }
 }
 
+TEST(GrpcSessionTest, FetchMultipleTimes) {
+  GraphDef graph;
+  string node_names[3];
+  CreateGraphDef(&graph, node_names);
+
+  std::unique_ptr<test::TestCluster> cluster;
+  TF_CHECK_OK(test::TestCluster::MakeTestCluster(Devices(1, 0), 2, &cluster));
+
+  std::unique_ptr<Session> session(
+      NewRemote(Options(cluster->targets()[0], 1)));
+  ASSERT_TRUE(session != nullptr);
+
+  TF_CHECK_OK(session->Create(graph));
+  const std::vector<std::pair<string, Tensor>> inputs;
+  std::vector<Tensor> outputs;
+
+  const string node = node_names[2] + ":0";
+  TF_CHECK_OK(session->Run(inputs, {node, node}, {}, &outputs));
+  EXPECT_EQ(2, outputs.size());
+  for (int i = 0; i < outputs.size(); ++i) {
+    const Tensor& t = outputs[i];
+    ASSERT_TRUE(t.IsInitialized()) << i;
+    ASSERT_EQ(4.0, t.flat<float>()(0)) << i;
+  }
+  TF_CHECK_OK(session->Close());
+}
+
 // A = [3 2; -1 0]; x = rand(2, 1); We want to compute the largest
 // eigenvalue for A, which is 2.0. Iteratively, we do
 //   repeat x = y / y.norm(); y = A * x; end
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc b/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
index 90e311a493079526c10c12d44cbeac609bfa6847..9121303b0e38f84802db585118c43842bac835b0 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
@@ -18,7 +18,9 @@ limitations under the License.
 #include "grpc++/support/slice.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_reference.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/io/proto_encode_helper.h"
 #include "tensorflow/core/platform/env.h"
@@ -35,10 +37,10 @@ static void unref_tensorbuffer(void* raw) {
 
 void EncodeRecvTensorResponseToByteBuffer(const RecvTensorResponse& proto,
                                           ::grpc::ByteBuffer* result) {
-  size_t len = proto.ByteSize();
-  gpr_slice s = gpr_slice_malloc(len);
+  size_t len = proto.ByteSizeLong();
+  grpc_slice s = grpc_slice_malloc(len);
   proto.SerializeWithCachedSizesToArray(
-      reinterpret_cast<uint8*>(GPR_SLICE_START_PTR(s)));
+      reinterpret_cast<uint8*>(GRPC_SLICE_START_PTR(s)));
   ::grpc::Slice slice(s, ::grpc::Slice::STEAL_REF);
   *result = ::grpc::ByteBuffer(&slice, 1);
 }
@@ -66,12 +68,12 @@ void EncodeRecvTensorResponseToByteBuffer(const RecvTensorResponse& proto,
 // E:   <actual data for val's representation>
 //
 // If the tensor data is up to "kLargeTensorBytes", then A
-// through E will all be encoded into "*result" in a single gpr_slice.
+// through E will all be encoded into "*result" in a single grpc_slice.
 //
 // If the tensor data is larger than "kLargeTensorBytes", then A through
-// D2 will be encoded in one gpr_slice, and E will be encoded in a second
-// gpr_slice that points to the backing store for the tensor data, to avoid
-// copying the tensor data (and the gpr_slice setup will be arrange so as
+// D2 will be encoded in one grpc_slice, and E will be encoded in a second
+// grpc_slice that points to the backing store for the tensor data, to avoid
+// copying the tensor data (and the grpc_slice setup will be arrange so as
 // to dereference the underlying tensor data buffer when it is no longer
 // needed in the "*result" ByteBuffer).
 static int VarLengthEncodingSize(uint32 tag, size_t bytes) {
@@ -207,11 +209,11 @@ void EncodeTensorToByteBuffer(bool is_dead, const Tensor& val,
     int num_slices = 0;
     {
       size_t slice_len = e.size() + (tensor_data_is_large ? 0 : tdata.size());
-      gpr_slice s0 = gpr_slice_malloc(slice_len);
-      memcpy(GPR_SLICE_START_PTR(s0), e.data(), e.size());
+      grpc_slice s0 = grpc_slice_malloc(slice_len);
+      memcpy(GRPC_SLICE_START_PTR(s0), e.data(), e.size());
       if (!tensor_data_is_large) {
         // (E)
-        memcpy(GPR_SLICE_START_PTR(s0) + e.size(), tdata.data(), tdata.size());
+        memcpy(GRPC_SLICE_START_PTR(s0) + e.size(), tdata.data(), tdata.size());
       }
       slices[0] = ::grpc::Slice(s0, ::grpc::Slice::STEAL_REF);
       num_slices += 1;
@@ -228,7 +230,7 @@ void EncodeTensorToByteBuffer(bool is_dead, const Tensor& val,
       // hypothetical grpc_slice-related changes (e.g. the
       // implementation could decide to destroy 0-length slices
       // eagerly).  In practice, this does not happen with the current
-      // implementation, and the gpr_slice interface at the moment does
+      // implementation, and the grpc_slice interface at the moment does
       // not allow us to do the Tensor-unreferencing in the right way
       // (since the Tensor pointer is different than the backing store
       // array pointer).
@@ -243,13 +245,13 @@ void EncodeTensorToByteBuffer(bool is_dead, const Tensor& val,
 
       const TensorBuffer* buf = DMAHelper::buffer(&val);
       buf->Ref();
-      gpr_slice s1 = gpr_slice_new(
+      grpc_slice s1 = grpc_slice_new(
           const_cast<void*>(static_cast<const void*>(tdata.data())),
           tdata.size(), do_nothing);
       slices[1] = ::grpc::Slice(s1, ::grpc::Slice::STEAL_REF);
 
-      gpr_slice s2 =
-          gpr_slice_new(const_cast<TensorBuffer*>(buf), 0, unref_tensorbuffer);
+      grpc_slice s2 =
+          grpc_slice_new(const_cast<TensorBuffer*>(buf), 0, unref_tensorbuffer);
       slices[2] = ::grpc::Slice(s2, ::grpc::Slice::STEAL_REF);
       num_slices += 2;
     }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f5d04549b41066b3aee583671171efda716c8d1c
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
@@ -0,0 +1,120 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+
+namespace tensorflow {
+
+GrpcByteBufferSource::GrpcByteBufferSource() {}
+
+bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) {
+  cur_ = -1;
+  left_ = 0;
+  ptr_ = nullptr;
+  byte_count_ = 0;
+  bool ok = src.Dump(&slices_).ok();
+  if (!ok) {
+    slices_.clear();
+  }
+  return ok;
+}
+
+bool GrpcByteBufferSource::Next(const void** data, int* size) {
+  // Use loop instead of if in case buffer contained empty slices.
+  while (left_ == 0) {
+    // Advance to next slice.
+    cur_++;
+    if (cur_ >= slices_.size()) {
+      return false;
+    }
+    const ::grpc::Slice& s = slices_[cur_];
+    left_ = s.size();
+    ptr_ = reinterpret_cast<const char*>(s.begin());
+  }
+
+  *data = ptr_;
+  *size = left_;
+  byte_count_ += left_;
+  ptr_ += left_;
+  left_ = 0;
+  return true;
+}
+
+void GrpcByteBufferSource::BackUp(int count) {
+  ptr_ -= count;
+  left_ += count;
+  byte_count_ -= count;
+}
+
+bool GrpcByteBufferSource::Skip(int count) {
+  const void* data;
+  int size;
+  while (Next(&data, &size)) {
+    if (size >= count) {
+      BackUp(size - count);
+      return true;
+    }
+    // size < count;
+    count -= size;
+  }
+  // error or we have too large count;
+  return false;
+}
+
+grpc::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
+  return byte_count_;
+}
+
+void GrpcUnparseProto(const protobuf::Message& src, grpc::ByteBuffer* dst) {
+  // TODO(sanjay): For bigger protos, serialize into a ZeroCopyOutputStream.
+  size_t len = src.ByteSizeLong();
+  grpc_slice s = grpc_slice_malloc(len);
+  src.SerializeWithCachedSizesToArray(
+      reinterpret_cast<uint8*>(GRPC_SLICE_START_PTR(s)));
+  ::grpc::Slice slice(s, ::grpc::Slice::STEAL_REF);
+  ::grpc::ByteBuffer buffer(&slice, 1);
+  // TODO(sanjay): Use Swap() when grpc version we are using is new enough.
+  // dst->Swap(&buffer);
+  *dst = buffer;
+}
+
+bool GrpcParseProto(const grpc::ByteBuffer& src, protobuf::Message* dst) {
+  GrpcByteBufferSource stream;
+  if (!stream.Init(src)) return false;
+  return dst->ParseFromZeroCopyStream(&stream);
+}
+
+void GrpcCounter::Increment() {
+  mutex_lock l(mu_);
+  counter_++;
+}
+
+void GrpcCounter::Decrement() {
+  mutex_lock l(mu_);
+  DCHECK_GT(counter_, 0);
+  counter_--;
+  if (counter_ == 0) {
+    empty_.notify_all();
+  }
+}
+
+void GrpcCounter::WaitUntilUnused() {
+  mutex_lock l(mu_);
+  while (counter_ != 0) {
+    empty_.wait(l);
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.h b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
index 44473d115054f3974e94a38837d73292e6206d48..64bc960536d3f99058c33e50fae09b58c5663382 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
@@ -19,7 +19,11 @@ limitations under the License.
 #include <memory>
 
 #include "grpc++/grpc++.h"
+#include "grpc++/impl/codegen/proto_utils.h"
+#include "grpc++/support/byte_buffer.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
 
@@ -43,6 +47,55 @@ inline ::grpc::Status ToGrpcStatus(const ::tensorflow::Status& s) {
 
 typedef std::shared_ptr<::grpc::Channel> SharedGrpcChannelPtr;
 
+inline string GrpcIdKey() { return "tf-rpc"; }
+
+// Serialize src and store in *dst.
+void GrpcUnparseProto(const protobuf::Message& src, ::grpc::ByteBuffer* dst);
+
+// Parse contents of src and initialize *dst with them.
+bool GrpcParseProto(const ::grpc::ByteBuffer& src, protobuf::Message* dst);
+
+// A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
+class GrpcByteBufferSource : public ::grpc::protobuf::io::ZeroCopyInputStream {
+ public:
+  GrpcByteBufferSource();
+  bool Init(const ::grpc::ByteBuffer& src);  // Can be called multiple times.
+  bool Next(const void** data, int* size) override;
+  void BackUp(int count) override;
+  bool Skip(int count) override;
+  ::grpc::protobuf::int64 ByteCount() const override;
+
+ private:
+  std::vector<::grpc::Slice> slices_;
+  int cur_;          // Current slice index.
+  int left_;         // Number of bytes in slices_[cur_] left to yield.
+  const char* ptr_;  // Address of next byte in slices_[cur_] to yield.
+  ::grpc::protobuf::int64 byte_count_;
+};
+
+// GrpcCounter is used to delay shutdown until all active RPCs are done.
+class GrpcCounter {
+ public:
+  GrpcCounter() {}
+
+  GrpcCounter(const GrpcCounter&) = delete;
+  GrpcCounter& operator=(const GrpcCounter&) = delete;
+
+  // Increment the count of live RPCs.
+  void Increment();
+
+  // Decrement the count of live RPCs.
+  void Decrement();
+
+  // Wait until count of live RPCs is zero.
+  void WaitUntilUnused();
+
+ private:
+  mutex mu_;
+  condition_variable empty_;
+  int counter_ = 0;
+};
+
 }  // namespace tensorflow
 
 #endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_UTIL_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d2bc50324d18cd92b6dea0377aedb56b387adf97
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc
@@ -0,0 +1,158 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/protobuf/worker.pb.h"
+
+namespace tensorflow {
+
+namespace {
+string ToString(const grpc::ByteBuffer& buf) {
+  std::vector<grpc::Slice> slices;
+  CHECK(buf.Dump(&slices).ok());
+  string result;
+  for (const grpc::Slice& s : slices) {
+    result.append(reinterpret_cast<const char*>(s.begin()), s.size());
+  }
+  return result;
+}
+
+// Return a ByteBuffer that contains str split up into num_slices slices.
+grpc::ByteBuffer MakeBuffer(const string& str, int num_slices) {
+  // Convert to a ByteBuffer.
+  std::vector<::grpc::Slice> slices;
+  const size_t per_slice = (str.size() + num_slices - 1) / num_slices;
+  for (size_t pos = 0; pos < str.size();) {
+    const size_t n = std::min(str.size() - pos, per_slice);
+    auto slice = grpc_slice_from_copied_buffer(&str[pos], n);
+    slices.push_back(::grpc::Slice(slice, ::grpc::Slice::STEAL_REF));
+    pos += n;
+  }
+  if (slices.empty()) {
+    slices.push_back(::grpc::Slice());
+  }
+  return ::grpc::ByteBuffer(&slices[0], slices.size());
+}
+
+// Make a proto with approximately the specified length.
+CleanupAllRequest MakeProto(int size) {
+  int approx_size = 0;
+  CleanupAllRequest proto;
+  int index = 0;
+  while (approx_size < size) {
+    int item_size = std::min(size - approx_size, 1024);
+    proto.add_container(string(item_size, 'a' + static_cast<char>(index % 26)));
+    approx_size += item_size + 3;  // +3 for encoding overhead.
+    index++;
+  }
+  return proto;
+}
+}  // namespace
+
+TEST(GrpcProto, Unparse) {
+  CleanupAllRequest proto;
+  proto.add_container("hello");
+  proto.add_container("world");
+  grpc::ByteBuffer buf;
+  GrpcUnparseProto(proto, &buf);
+  CleanupAllRequest parsed;
+  ASSERT_TRUE(parsed.ParseFromString(ToString(buf)));
+  ASSERT_EQ(proto.DebugString(), parsed.DebugString());
+}
+
+TEST(GrpcProto, Parse) {
+  // Test with serialization broken up into a bunch of slices.
+  struct Case {
+    int length;
+    int slices;
+  };
+  for (Case c : std::vector<Case>{
+           {0, 1},
+           {20, 1},
+           {100, 1},
+           {1 << 20, 1},
+           {100, 5},
+           {10000, 50},
+       }) {
+    CleanupAllRequest proto = MakeProto(c.length);
+    ::grpc::ByteBuffer src = MakeBuffer(proto.SerializeAsString(), c.slices);
+    CleanupAllRequest parsed;
+    ASSERT_TRUE(GrpcParseProto(src, &parsed)) << c.length << " " << c.slices;
+    ASSERT_EQ(proto.DebugString(), parsed.DebugString());
+  }
+}
+
+static void BM_UnparseGrpc(int iters, int size) {
+  testing::StopTiming();
+  auto proto = MakeProto(size);
+  testing::StartTiming();
+  for (int i = 0; i < iters; i++) {
+    grpc::ByteBuffer buf;
+    GrpcUnparseProto(proto, &buf);
+  }
+  testing::StopTiming();
+}
+BENCHMARK(BM_UnparseGrpc)->Arg(1)->Arg(1 << 10)->Arg(1 << 20);
+
+static void BM_UnparseString(int iters, int size) {
+  testing::StopTiming();
+  auto proto = MakeProto(size);
+  testing::StartTiming();
+
+  for (int i = 0; i < iters; i++) {
+    string buf;
+    proto.SerializeToString(&buf);
+  }
+
+  testing::StopTiming();
+}
+BENCHMARK(BM_UnparseString)->Arg(1)->Arg(1 << 10)->Arg(1 << 20);
+
+static void BM_ParseGrpc(int iters, int size, int num_slices) {
+  testing::StopTiming();
+  CleanupAllRequest proto = MakeProto(size);
+  auto buf = MakeBuffer(proto.SerializeAsString(), num_slices);
+  testing::StartTiming();
+
+  for (int i = 0; i < iters; i++) {
+    CHECK(GrpcParseProto(buf, &proto));
+  }
+
+  testing::StopTiming();
+}
+BENCHMARK(BM_ParseGrpc)
+    ->ArgPair(1, 1)
+    ->ArgPair(1 << 10, 1)
+    ->ArgPair(1 << 10, 4)
+    ->ArgPair(1 << 20, 1)
+    ->ArgPair(1 << 20, 4);
+
+static void BM_ParseString(int iters, int size) {
+  testing::StopTiming();
+  CleanupAllRequest proto = MakeProto(size);
+  string serial = proto.SerializeAsString();
+  testing::StartTiming();
+
+  for (int i = 0; i < iters; i++) {
+    CHECK(proto.ParseFromString(serial));
+  }
+
+  testing::StopTiming();
+}
+BENCHMARK(BM_ParseString)->Arg(1)->Arg(1 << 10)->Arg(1 << 20);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
index 29c812408dcfe579063049afaf901a8b7fc328c2..29acad34e9c98b0def36deda7eba018ef67abb84 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/worker_cache_logger.h"
 #include "tensorflow/core/distributed_runtime/worker_cache_partial.h"
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
@@ -50,6 +51,9 @@ class GrpcWorkerCache : public WorkerCachePartial {
 
   // Explicit destructor to control destruction order.
   ~GrpcWorkerCache() override {
+    // Wait until all live rpcs are done since otherwise the completion
+    // queue shutdown will interfere with rpc operation.
+    live_rpc_counter_.WaitUntilUnused();
     completion_queue_.Shutdown();
     delete polling_thread_;  // Blocks until thread exits.
     delete channel_cache_;
@@ -65,8 +69,8 @@ class GrpcWorkerCache : public WorkerCachePartial {
     } else {
       SharedGrpcChannelPtr channel = channel_cache_->FindWorkerChannel(target);
       if (!channel) return nullptr;
-      WorkerInterface* ret =
-          NewGrpcRemoteWorker(channel, &completion_queue_, &logger_);
+      WorkerInterface* ret = NewGrpcRemoteWorker(&live_rpc_counter_, channel,
+                                                 &completion_queue_, &logger_);
       return ret;
     }
   }
@@ -91,6 +95,7 @@ class GrpcWorkerCache : public WorkerCachePartial {
  private:
   const string local_target_;
   WorkerInterface* const local_worker_;  // Not owned.
+  GrpcCounter live_rpc_counter_;
   GrpcChannelCache* channel_cache_;  // Owned.
   ::grpc::CompletionQueue completion_queue_;
   Thread* polling_thread_;  // Owned.
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc
index 80a2f89337c6914dd871c4df346016d70d0f4093..348c6dc98bd5bf8a4e6c0a1def8593a858fe6062 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc
@@ -58,9 +58,9 @@ namespace grpc {
 
 WorkerService::AsyncService::AsyncService() {
   for (int i = 0; i < kGrpcNumWorkerMethods; ++i) {
-    AddMethod(new ::grpc::RpcServiceMethod(
+    AddMethod(new ::grpc::internal::RpcServiceMethod(
         GrpcWorkerMethodName(static_cast<GrpcWorkerMethod>(i)),
-        ::grpc::RpcMethod::NORMAL_RPC, nullptr));
+        ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
     ::grpc::Service::MarkMethodAsync(i);
   }
 }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
index bfdd58d46d79c83a71cf2aff26b6fa63c039083c..e9862a61a3f4ece2218b281d9a78b8ff4d59594f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
@@ -130,7 +130,7 @@ namespace grpc {
 // definition in "//tensorflow/core/protobuf/worker_service.proto",
 // and the gRPC generated stub and service classes.
 // See the proto file for the definition of methods and messages.
-class WorkerService GRPC_FINAL {
+class WorkerService final {
  public:
   class AsyncService : public ::grpc::Service {
    public:
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
index 8265100061e4cb0a1a3ea1da96abb5b563f010c8..72dfe5c062177de7039980ece31778e7cac06592 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
@@ -39,7 +39,7 @@ namespace {
 class RpcRemoteRendezvous : public BaseRemoteRendezvous {
  public:
   RpcRemoteRendezvous(const WorkerEnv* env, int64 step_id)
-      : BaseRemoteRendezvous(env, step_id, false) {}
+      : BaseRemoteRendezvous(env, step_id) {}
 
  protected:
   void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed,
diff --git a/tensorflow/core/distributed_runtime/scheduler.cc b/tensorflow/core/distributed_runtime/scheduler.cc
index 0b628205c3c79f4755c17a8db7603c5e910565c2..844a0643e62f63719027ad3c922a26a0f7b92505 100644
--- a/tensorflow/core/distributed_runtime/scheduler.cc
+++ b/tensorflow/core/distributed_runtime/scheduler.cc
@@ -33,7 +33,7 @@ void InitializePending(const Graph* graph, std::vector<int>* pending) {
     const int id = node->id();
     int num_in_edges = 0;
     if (IsMerge(node)) {
-      // For forward executon order, Merge nodes are special. We process
+      // For forward execution order, Merge nodes are special. We process
       // them only once when one of its inputs is processed.
       for (const Edge* edge : node->in_edges()) {
         if (edge->IsControlEdge()) {
@@ -122,7 +122,7 @@ Microseconds SlackAnalysis::ComputeAlap(std::vector<Microseconds>* alap_times) {
   std::vector<int> pending_count;
   pending_count.resize(graph_->num_node_ids());
   for (const Node* n : graph_->nodes()) {
-    // For reverse executon order, Switch nodes are special. We process
+    // For reverse execution order, Switch nodes are special. We process
     // them only once when one of its outputs is processed.
     if (IsSwitch(n)) {
       int32 num_control_edges = 0;
diff --git a/tensorflow/core/distributed_runtime/tensor_coding.cc b/tensorflow/core/distributed_runtime/tensor_coding.cc
index f98bd17ab9307a730608280ee046e631fb0ec9f2..94d54a2b16bb38c44f656455749579c364bb6424 100644
--- a/tensorflow/core/distributed_runtime/tensor_coding.cc
+++ b/tensorflow/core/distributed_runtime/tensor_coding.cc
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include "google/protobuf/any.pb.h"
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/distributed_runtime/tensor_coding_test.cc b/tensorflow/core/distributed_runtime/tensor_coding_test.cc
index 540b76ada68eb8b957aeb4db78ccc30936895b63..52a057bdb2f95febf83da4ec7ddd040a7dea3f1f 100644
--- a/tensorflow/core/distributed_runtime/tensor_coding_test.cc
+++ b/tensorflow/core/distributed_runtime/tensor_coding_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 
+#include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 16e450abb0083e27e586d2f1068a55b1f0ba4329..34b53e965f829e886c57f0f8249a12dd21d17472 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -156,10 +156,9 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
       return;
     }
   }
-  CostGraphDef* cost_graph = response->mutable_cost_graph();
   session->graph_mgr->ExecuteAsync(
       request->graph_handle(), step_id, session, request->exec_opts(),
-      collector, cost_graph, cm, in,
+      collector, response, cm, in,
       [this, step_id, response, session, cm, out, token, collector, opts,
        done](Status s) {
         if (s.ok()) {
@@ -230,7 +229,7 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
     }
     session->graph_mgr->ExecuteAsync(
         graph_handle, step_id, session, request->exec_opts(),
-        nullptr /* collector */, nullptr /* cost_graph */, cm, in,
+        nullptr /* collector */, nullptr /* response */, cm, in,
         [this, token, step_id, cm](Status s) {
           {
             mutex_lock l(mu_);
diff --git a/tensorflow/core/distributed_runtime/worker_cache_logger.cc b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
index 5ca1d92a81b61cace09a752dc48e5ad17ab31b98..ffcbc50a2a4e75648115d8d6d078c7a3679f9fb8 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_logger.cc
+++ b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
@@ -88,27 +88,39 @@ void WorkerCacheLogger::RecordRecvTensor(int64 step_id, int64 start_usecs,
                                          const string& src_device,
                                          const string& dst_device,
                                          int64 bytes) {
-  NodeExecStats* ns = new NodeExecStats;
-  ns->set_node_name("RecvTensor");
-  string byte_string = strings::StrCat("[", bytes, "B] ");
-  if (bytes >= 0.1 * 1048576.0) {
-    byte_string = strings::Printf("[%.1fMB] ", bytes / 1048576.0);
-  }
-  ns->set_timeline_label(strings::StrCat(byte_string, tensor_name, " from ",
-                                         src_device, " to ", dst_device));
-  ns->set_all_start_micros(start_usecs);
-  ns->set_op_start_rel_micros(0);
-  int64 elapsed = end_usecs - start_usecs;
-  ns->set_op_end_rel_micros(elapsed);
-  ns->set_all_end_rel_micros(elapsed);
-  NodeOutput* no = ns->add_output();
-  no->set_slot(0);
-  // TODO(tucker): Maybe set the dimensions too, but then they'll
-  // need to be passed in.
-  no->mutable_tensor_description()
-      ->mutable_allocation_description()
-      ->set_requested_bytes(bytes);
-  Save(dst_device, step_id, ns);
+  RecordDataTransfer(step_id, start_usecs, end_usecs, tensor_name, src_device,
+    dst_device, bytes, "", "RecvTensor");
 }
 
+void WorkerCacheLogger::RecordDataTransfer(int64 step_id, int64 start_usecs,
+                                           int64 end_usecs,
+                                           const string& tensor_name,
+                                           const string& src_device,
+                                           const string& dst_device,
+                                           int64 bytes,
+                                           const string& details,
+                                           const string& transfer_method_name){
+  NodeExecStats* ns = new NodeExecStats;
+    ns->set_node_name(transfer_method_name);
+    string byte_string = strings::StrCat("[", bytes, "B] ");
+    if (bytes >= 0.1 * 1048576.0) {
+      byte_string = strings::Printf("[%.1fMB] ", bytes / 1048576.0);
+    }
+    ns->set_timeline_label(strings::StrCat(byte_string, tensor_name, " from ",
+                                           src_device, " to ", dst_device,
+                                           details));
+    ns->set_all_start_micros(start_usecs);
+    ns->set_op_start_rel_micros(0);
+    int64 elapsed = end_usecs - start_usecs;
+    ns->set_op_end_rel_micros(elapsed);
+    ns->set_all_end_rel_micros(elapsed);
+    NodeOutput* no = ns->add_output();
+    no->set_slot(0);
+    // TODO(tucker): Maybe set the dimensions too, but then they'll
+    // need to be passed in.
+    no->mutable_tensor_description()
+        ->mutable_allocation_description()
+        ->set_requested_bytes(bytes);
+    Save(dst_device, step_id, ns);
+  }
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/worker_cache_logger.h b/tensorflow/core/distributed_runtime/worker_cache_logger.h
index a92590a176e17ccb1beac5d1a0140209daee4b78..00846c25fece7bfce795f52d55767cb2e58e3750 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_logger.h
+++ b/tensorflow/core/distributed_runtime/worker_cache_logger.h
@@ -60,6 +60,14 @@ class WorkerCacheLogger {
                         const string& tensor_name, const string& src_device,
                         const string& dst_device, int64 bytes);
 
+  // Generates a NodeExecStats record with the given data, and saves for
+  // later retrieval by RetrieveLogs().
+  void RecordDataTransfer(int64 step_id, int64 start_usecs, int64 end_usecs,
+                          const string& tensor_name, const string& src_device,
+                          const string& dst_device, int64 bytes,
+                          const string& details,
+                          const string& transfer_method_name);
+
  private:
   mutex count_mu_;
   int32 want_logging_count_ GUARDED_BY(count_mu_) = 0;
diff --git a/tensorflow/core/distributed_runtime/worker_env.h b/tensorflow/core/distributed_runtime/worker_env.h
index f09bea328fd99426d07a853791df46cf579d93fd..793d58c8a1c6c5e0974d6746b36784ca9c674633 100644
--- a/tensorflow/core/distributed_runtime/worker_env.h
+++ b/tensorflow/core/distributed_runtime/worker_env.h
@@ -48,6 +48,10 @@ struct WorkerEnv {
 
   // device_mgr manages local devices (cpu and gpu). The WorkerService
   // is the network interface for managed devices.
+  //
+  // Note: Please use the device_mgr associated with your session if appropriate
+  // instead of this one. Using this device_mgr does not support ClusterSpec
+  // propagated sessions.
   DeviceMgr* device_mgr = nullptr;
 
   // A set of rendezvous keyed by step ids.
diff --git a/tensorflow/core/example/example_parser_configuration.cc b/tensorflow/core/example/example_parser_configuration.cc
index e4a3f26209d4eaa20d6eb9bd53886340a24b6870..5660465c51adbd7bcb0c1d54ee3800976ac5937a 100644
--- a/tensorflow/core/example/example_parser_configuration.cc
+++ b/tensorflow/core/example/example_parser_configuration.cc
@@ -17,8 +17,11 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/example/feature.pb_text.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 943dcab36269dbc82af238f0c375f027428bdbda..e7092f549b21e9a3c72950bfb637b966936ef5ab 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -48,6 +48,14 @@ constexpr size_t Allocator::kAllocatorAlignment;
 
 Allocator::~Allocator() {}
 
+void RunResourceCtor(ResourceHandle* p, size_t n) {
+  for (size_t i = 0; i < n; ++p, ++i) new (p) ResourceHandle();
+}
+
+void RunResourceDtor(ResourceHandle* p, size_t n) {
+  for (size_t i = 0; i < n; ++p, ++i) p->~ResourceHandle();
+}
+
 // If true, cpu allocator collects more stats.
 static bool cpu_allocator_collect_stats = false;
 // If true, cpu allocator collects full stats.
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index cb58896f49288012bd3208d43691268b4531a524..868335b0729d6d701b06bd64344b86292da3646c 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -21,8 +21,9 @@ limitations under the License.
 #include <limits>
 
 #include "tensorflow/core/framework/numeric_types.h"
-#include "tensorflow/core/framework/resource_handle.pb.h"
+#include "tensorflow/core/framework/resource_handle.h"
 #include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -229,6 +230,14 @@ class Allocator {
     for (size_t i = 0; i < n; ++p, ++i) p->~ResourceHandle();
   }
 
+  virtual void RunVariantCtor(Variant* p, size_t n) {
+    for (size_t i = 0; i < n; ++p, ++i) new (p) Variant();
+  }
+
+  virtual void RunVariantDtor(Variant* p, size_t n) {
+    for (size_t i = 0; i < n; ++p, ++i) p->~Variant();
+  }
+
   // TODO(jeff): Maybe provide some interface to give info about
   // current allocation state (total number of bytes available for
   // allocation, number of bytes free on device, etc.)
@@ -256,6 +265,16 @@ inline void Allocator::RunDtor(ResourceHandle* p, size_t n) {
   RunResourceDtor(p, n);
 }
 
+template <>
+inline void Allocator::RunCtor(Variant* p, size_t n) {
+  RunVariantCtor(p, n);
+}
+
+template <>
+inline void Allocator::RunDtor(Variant* p, size_t n) {
+  RunVariantDtor(p, n);
+}
+
 // An implementation of Allocator that delegates all calls to another Allocator.
 //
 // Useful to clients who want to override part of the functionality of another
diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc
index b18ce3decc0268c11e16812a22ecf98275320946..95cafa24b194655ba977d16a41c7629e4a70c20b 100644
--- a/tensorflow/core/framework/attr_value_util.cc
+++ b/tensorflow/core/framework/attr_value_util.cc
@@ -15,9 +15,12 @@ limitations under the License.
 
 #include "tensorflow/core/framework/attr_value_util.h"
 
+#include <string>
 #include <vector>
+
 #include "tensorflow/core/framework/attr_value.pb_text.h"
 #include "tensorflow/core/framework/tensor.pb_text.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb_text.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -26,7 +29,6 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
-
 namespace {
 
 string SummarizeString(const string& str) {
@@ -287,6 +289,8 @@ bool ParseAttrValue(StringPiece type, StringPiece text, AttrValue* out) {
   return ProtoParseFromString(to_parse, out);
 }
 
+void SetAttrValue(const AttrValue& value, AttrValue* out) { *out = value; }
+
 #define DEFINE_SET_ATTR_VALUE_ONE(ARG_TYPE, FIELD) \
   void SetAttrValue(ARG_TYPE value, AttrValue* out) { out->set_##FIELD(value); }
 
@@ -457,7 +461,8 @@ bool HasPlaceHolder(const AttrValue& val) {
   return false;
 }
 
-bool SubstitutePlaceholders(SubstituteFunc substitute, AttrValue* value) {
+bool SubstitutePlaceholders(const SubstituteFunc& substitute,
+                            AttrValue* value) {
   switch (value->value_case()) {
     case AttrValue::kList: {
       for (NameAttrList& func : *value->mutable_list()->mutable_func()) {
diff --git a/tensorflow/core/framework/attr_value_util.h b/tensorflow/core/framework/attr_value_util.h
index 0e25cec4abc078b1d3dab6a7f462e4de42b21a43..08d813bb6f9a3fcd8fe55e54abc0a619934b196f 100644
--- a/tensorflow/core/framework/attr_value_util.h
+++ b/tensorflow/core/framework/attr_value_util.h
@@ -16,9 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_FRAMEWORK_ATTR_VALUE_UTIL_H_
 #define TENSORFLOW_FRAMEWORK_ATTR_VALUE_UTIL_H_
 
+#include <functional>
 #include <string>
 #include <vector>
-#include "tensorflow/core/framework/attr_value.pb.h"
+
+#include "tensorflow/core/framework/attr_value.pb.h"  // TODO(62899350): Remove
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -29,6 +31,10 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Forward declare protos so their symbols can be removed from .so exports
+class AttrValue;
+class NameAttrList;
+
 // A human-readable rendering of attr_value, that is more concise than a
 // text-format proto.
 string SummarizeAttrValue(const AttrValue& attr_value);
@@ -80,9 +86,7 @@ void SetAttrValue(gtl::ArraySlice<Tensor> value, AttrValue* out);
 void SetAttrValue(gtl::ArraySlice<TensorProto> value, AttrValue* out);
 void SetAttrValue(gtl::ArraySlice<NameAttrList> value, AttrValue* out);
 
-inline void SetAttrValue(const AttrValue& value, AttrValue* out) {
-  *out = value;
-}
+void SetAttrValue(const AttrValue& value, AttrValue* out);
 
 // Returns true if a and b have the same value.
 // NOTE: May return false negatives for tensor values.
@@ -98,8 +102,8 @@ bool HasPlaceHolder(const AttrValue& val);
 // SubstituteFunc is given a placeholder string. If the placeholder is
 // unknown, SubstituteFunc returns false. Otherwise, overwrites the
 // attr value and returns true.
-typedef std::function<bool(const string&, AttrValue*)> SubstituteFunc;
-bool SubstitutePlaceholders(SubstituteFunc substitute, AttrValue* value);
+using SubstituteFunc = std::function<bool(const string&, AttrValue*)>;
+bool SubstitutePlaceholders(const SubstituteFunc& substitute, AttrValue* value);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/framework/attr_value_util_test.cc b/tensorflow/core/framework/attr_value_util_test.cc
index c14ea9b322a2449e1f71c7d1a1b8691e89ff2161..5d30d327ae111ab0b8f960b9aa59e8593fb9c4aa 100644
--- a/tensorflow/core/framework/attr_value_util_test.cc
+++ b/tensorflow/core/framework/attr_value_util_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value_util.h"
 
 #include <vector>
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 035bceb640f247c9dbdb0a5b910996d18e70ad31..38024fcf68b56af3c442b7ed7196234a94a009d4 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -201,32 +201,117 @@ Status BiasAddGradShape(shape_inference::InferenceContext* c) {
   return Status::OK();
 }
 
+// input, filter, bias, output
+Status FusedConvBiasActivationShape(shape_inference::InferenceContext* c) {
+  TF_RETURN_IF_ERROR(Conv2DShape(c));
+
+  ShapeHandle bias_shape;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(2), 1, &bias_shape));
+  DimensionHandle bias_dim = c->Dim(bias_shape, 0);
+
+  ShapeHandle filter_shape;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &filter_shape));
+  DimensionHandle output_depth_dim = c->Dim(filter_shape, 3);
+
+  int64 output_depth_dim_val = c->Value(output_depth_dim);
+  int64 bias_dim_val = c->Value(bias_dim);
+
+  if (output_depth_dim_val != bias_dim_val) {
+    return errors::InvalidArgument(
+        "Output depth dimension (", output_depth_dim_val,
+        ") and bias dimension (", bias_dim_val, ") do not match.");
+  }
+
+  return Status::OK();
+}
+
+Status DimensionsFromShape(ShapeHandle shape, TensorFormat format,
+                           DimensionHandle* batch_dim,
+                           gtl::MutableArraySlice<DimensionHandle> spatial_dims,
+                           DimensionHandle* filter_dim,
+                           InferenceContext* context) {
+  const int32 rank = GetTensorDimsFromSpatialDims(spatial_dims.size(), format);
+  // Batch.
+  *batch_dim = context->Dim(shape, GetTensorBatchDimIndex(rank, format));
+  // Spatial.
+  for (int spatial_dim_index = 0; spatial_dim_index < spatial_dims.size();
+       ++spatial_dim_index) {
+    spatial_dims[spatial_dim_index] = context->Dim(
+        shape, GetTensorSpatialDimIndex(rank, format, spatial_dim_index));
+  }
+  // Channel.
+  *filter_dim = context->Dim(shape, GetTensorFeatureDimIndex(rank, format));
+  if (format == FORMAT_NCHW_VECT_C) {
+    TF_RETURN_IF_ERROR(context->Multiply(
+        *filter_dim,
+        context->Dim(shape, GetTensorInnerFeatureDimIndex(rank, format)),
+        filter_dim));
+  }
+  return Status::OK();
+}
+
+Status ShapeFromDimensions(DimensionHandle batch_dim,
+                           gtl::ArraySlice<DimensionHandle> spatial_dims,
+                           DimensionHandle filter_dim, TensorFormat format,
+                           InferenceContext* context, ShapeHandle* shape) {
+  const int32 rank = GetTensorDimsFromSpatialDims(spatial_dims.size(), format);
+  std::vector<DimensionHandle> out_dims(rank);
+
+  // Batch.
+  out_dims[tensorflow::GetTensorBatchDimIndex(rank, format)] = batch_dim;
+  // Spatial.
+  for (int spatial_dim_index = 0; spatial_dim_index < spatial_dims.size();
+       ++spatial_dim_index) {
+    out_dims[tensorflow::GetTensorSpatialDimIndex(
+        rank, format, spatial_dim_index)] = spatial_dims[spatial_dim_index];
+  }
+  // Channel.
+  if (format == tensorflow::FORMAT_NCHW_VECT_C) {
+    // When format is NCHW_VECT_C, factor the feature map count
+    // into the outer feature count and the inner feature count (=4).
+    TF_RETURN_IF_ERROR(context->Divide(
+        filter_dim, 4, /*evenly_divisible=*/true,
+        &out_dims[tensorflow::GetTensorFeatureDimIndex(rank, format)]));
+    out_dims[GetTensorInnerFeatureDimIndex(rank, format)] = context->MakeDim(4);
+  } else {
+    out_dims[tensorflow::GetTensorFeatureDimIndex(rank, format)] = filter_dim;
+  }
+
+  *shape = context->MakeShape(out_dims);
+  return tensorflow::Status::OK();
+}
+
 Status Conv2DShape(shape_inference::InferenceContext* c) {
+  string data_format_str;
+  Status s = c->GetAttr("data_format", &data_format_str);
+  if (!s.ok()) {
+    data_format_str = "NHWC";
+  }
+
+  TensorFormat data_format;
+  if (!FormatFromString(data_format_str, &data_format)) {
+    return errors::InvalidArgument("Invalid data format string: ",
+                                   data_format_str);
+  }
+
+  const int rank = GetTensorDimsFromSpatialDims(2, data_format);
   ShapeHandle input_shape;
-  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), rank, &input_shape));
+  // The filter of a 2D convolution is always 4D.
   ShapeHandle filter_shape;
   TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &filter_shape));
 
-  string data_format;
-  Status s = c->GetAttr("data_format", &data_format);
-
   std::vector<int32> strides;
   TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides));
 
-  if (strides.size() != 4) {
-    return errors::InvalidArgument(
-        "Conv2D requires the stride attribute to contain 4 values, but got: ",
-        strides.size());
+  if (strides.size() != rank) {
+    return errors::InvalidArgument("Conv2D on data format ", data_format_str,
+                                   " requires the stride attribute to contain ",
+                                   rank, " values, but got: ", strides.size());
   }
 
   int32 stride_rows, stride_cols;
-
-  if (s.ok() && data_format == "NCHW") {
-    // Convert input shape to default NHWC for inference
-    auto dim = [&](char dimension) {
-      return c->Dim(input_shape, GetTensorDimIndex<2>(FORMAT_NCHW, dimension));
-    };
-    input_shape = c->MakeShape({{dim('N'), dim('0'), dim('1'), dim('C')}});
+  if (data_format == FORMAT_NCHW || data_format == FORMAT_NCHW_VECT_C) {
     stride_rows = strides[2];
     stride_cols = strides[3];
   } else {
@@ -234,35 +319,38 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
     stride_cols = strides[2];
   }
 
-  DimensionHandle batch_size_dim = c->Dim(input_shape, 0);
-  DimensionHandle in_rows_dim = c->Dim(input_shape, 1);
-  DimensionHandle in_cols_dim = c->Dim(input_shape, 2);
+  DimensionHandle batch_size_dim;
+  DimensionHandle input_depth_dim;
+  gtl::InlinedVector<DimensionHandle, 2> input_spatial_dims(2);
+  TF_RETURN_IF_ERROR(DimensionsFromShape(input_shape, data_format,
+                                         &batch_size_dim, &input_spatial_dims,
+                                         &input_depth_dim, c));
+
   DimensionHandle filter_rows_dim = c->Dim(filter_shape, 0);
   DimensionHandle filter_cols_dim = c->Dim(filter_shape, 1);
   DimensionHandle output_depth_dim = c->Dim(filter_shape, 3);
 
+  // Check that the input tensor and the filter tensor agree on the input
+  // channel count.
   DimensionHandle unused;
   TF_RETURN_IF_ERROR(
-      c->Merge(c->Dim(input_shape, 3), c->Dim(filter_shape, 2), &unused));
+      c->Merge(input_depth_dim, c->Dim(filter_shape, 2), &unused));
 
   Padding padding;
   TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding));
 
   DimensionHandle output_rows, output_cols;
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(
-      c, in_rows_dim, filter_rows_dim, stride_rows, padding, &output_rows));
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(
-      c, in_cols_dim, filter_cols_dim, stride_cols, padding, &output_cols));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(c, input_spatial_dims[0],
+                                                   filter_rows_dim, stride_rows,
+                                                   padding, &output_rows));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(c, input_spatial_dims[1],
+                                                   filter_cols_dim, stride_cols,
+                                                   padding, &output_cols));
 
   ShapeHandle output_shape;
-  if (data_format == "NCHW") {
-    output_shape = c->MakeShape(
-        {batch_size_dim, output_depth_dim, output_rows, output_cols});
-  } else {
-    output_shape = c->MakeShape(
-        {batch_size_dim, output_rows, output_cols, output_depth_dim});
-  }
-
+  TF_RETURN_IF_ERROR(
+      ShapeFromDimensions(batch_size_dim, {output_rows, output_cols},
+                          output_depth_dim, data_format, c, &output_shape));
   c->set_output(0, output_shape);
   return Status::OK();
 }
@@ -361,7 +449,8 @@ Status DepthwiseConv2DNativeShape(shape_inference::InferenceContext* c) {
   int32 stride_rows;
   int32 stride_cols;
   if (s.ok() && data_format == "NCHW") {
-    // Convert input shape to default NHWC for inference
+    // Canonicalize input shape to NHWC so the shape inference code below can
+    // process it.
     input_shape =
         c->MakeShape({{c->Dim(input_shape, 0), c->Dim(input_shape, 2),
                        c->Dim(input_shape, 3), c->Dim(input_shape, 1)}});
@@ -441,7 +530,8 @@ Status AvgPoolShape(shape_inference::InferenceContext* c) {
   int32 kernel_rows, kernel_cols;
 
   if (s.ok() && data_format == "NCHW") {
-    // Convert input shape to default NHWC for inference.
+    // Canonicalize input shape to NHWC so the shape inference code below can
+    // process it.
     auto dim = [&](char dimension) {
       return c->Dim(input_shape, GetTensorDimIndex<2>(FORMAT_NCHW, dimension));
     };
@@ -516,7 +606,8 @@ Status MaxPoolShape(shape_inference::InferenceContext* c) {
   int32 kernel_rows, kernel_cols, kernel_depth;
 
   if (s.ok() && data_format == "NCHW") {
-    // Convert input shape to default NHWC for inference.
+    // Canonicalize input shape to NHWC so the shape inference code below can
+    // process it.
     auto dim = [&](char dimension) {
       return c->Dim(input_shape, GetTensorDimIndex<2>(FORMAT_NCHW, dimension));
     };
@@ -922,5 +1013,67 @@ Status ValidateSparseTensor(InferenceContext* c, ShapeHandle indices_shape,
   return Status::OK();
 }
 
+Status ScatterNdUpdateShape(InferenceContext* c) {
+  ShapeHandle input_shape = c->input(0);
+  ShapeHandle indices_shape;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &indices_shape));
+  ShapeHandle updates_shape;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(2), 1, &updates_shape));
+
+  if (c->Value(c->NumElements(input_shape)) == 0 &&
+      (c->Value(c->NumElements(indices_shape)) > 0 ||
+       c->Value(c->NumElements(updates_shape)) > 0)) {
+    return errors::InvalidArgument(
+        "Indices and updates specified for empty output shape");
+  }
+
+  if (c->RankKnown(indices_shape) && c->RankKnown(updates_shape)) {
+    const int64 num_outer_dims = c->Rank(indices_shape) - 1;
+    const DimensionHandle index_size = c->Dim(indices_shape, -1);
+
+    // We can only do more validation if the last dimension of indices
+    // is a known value.
+    if (c->ValueKnown(index_size)) {
+      const int64 ix = c->Value(index_size);
+      ShapeHandle unused;
+      ShapeHandle prefix_indices;
+      TF_RETURN_IF_ERROR(
+          c->Subshape(indices_shape, 0, num_outer_dims, &prefix_indices));
+      ShapeHandle prefix_updates;
+      TF_RETURN_IF_ERROR(
+          c->Subshape(updates_shape, 0, num_outer_dims, &prefix_updates));
+
+      Status s = c->Merge(prefix_indices, prefix_updates, &unused);
+      if (!s.ok()) {
+        return errors::InvalidArgument(
+            "The outer ", num_outer_dims,
+            " dimensions of indices.shape=", c->DebugString(indices_shape),
+            " must match the outer ", num_outer_dims,
+            " dimensions of updates.shape=", c->DebugString(updates_shape),
+            ": ", s.error_message());
+      }
+
+      ShapeHandle input_suffix;
+      TF_RETURN_IF_ERROR(c->Subshape(input_shape, ix, &input_suffix));
+      ShapeHandle suffix_updates;
+      TF_RETURN_IF_ERROR(
+          c->Subshape(updates_shape, num_outer_dims, &suffix_updates));
+      s = c->Merge(input_suffix, suffix_updates, &unused);
+      if (!s.ok()) {
+        return errors::InvalidArgument(
+            "The inner ", c->Rank(input_shape) - ix,
+            " dimensions of input.shape=", c->DebugString(input_shape),
+            " must match the inner ", c->Rank(updates_shape) - num_outer_dims,
+            " dimensions of updates.shape=", c->DebugString(updates_shape),
+            ": ", s.error_message());
+      }
+    }
+  }
+
+  c->set_output(0, input_shape);
+  return Status::OK();
+}
+
 }  // namespace shape_inference
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index dc99e48adb97abc6948e34da7d5bb239d208756e..73b915652f6da2373ca9d85c814d06814519372d 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -167,6 +167,9 @@ Status Conv2DShape(shape_inference::InferenceContext* c);
 // Shape function for Conv3D-like operations.
 Status Conv3DShape(shape_inference::InferenceContext* c);
 
+// Shape function for FusedConvBiasActivation operation.
+Status FusedConvBiasActivationShape(shape_inference::InferenceContext* c);
+
 // Shape function for DepthwiseConv2D-like operations.
 Status DepthwiseConv2DNativeShape(shape_inference::InferenceContext* c);
 
@@ -207,6 +210,9 @@ Status RandomShape(shape_inference::InferenceContext* c);
 Status ValidateSparseTensor(InferenceContext* c, ShapeHandle indices_shape,
                             ShapeHandle values_shape, ShapeHandle shape_shape);
 
+// Shape function for ScatterNd update/add/sub/... operations.
+Status ScatterNdUpdateShape(InferenceContext* c);
+
 }  // namespace shape_inference
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc
index d14e1dfee09cff972953bed6e0351e04a5b9d16e..37e211ad683d27814f3dd28f1519b1341a6325fc 100644
--- a/tensorflow/core/framework/common_shape_fns_test.cc
+++ b/tensorflow/core/framework/common_shape_fns_test.cc
@@ -26,19 +26,11 @@ namespace shape_inference {
 
 namespace {
 
-TensorShapeProto S(std::initializer_list<int64> dims) {
-  PartialTensorShape shape(dims);
-  TensorShapeProto ret;
-  shape.AsProto(&ret);
-  return ret;
+PartialTensorShape S(std::initializer_list<int64> dims) {
+  return PartialTensorShape(dims);
 }
 
-TensorShapeProto Unknown() {
-  PartialTensorShape shape;
-  TensorShapeProto ret;
-  shape.AsProto(&ret);
-  return ret;
-}
+PartialTensorShape Unknown() { return PartialTensorShape(); }
 
 OpDef MakeOpDef(int num_inputs, int num_outputs) {
   OpRegistrationData op_reg_data;
@@ -487,6 +479,27 @@ TEST(CommonShapeFnsTest, Conv2DShapeTest) {
   set_op({{1, 1, 1, 2}}, "VALID", "NCHW");
   INFER_OK(op, "[1,1,4,4];[2,1,1,1]", "[d0_0,d1_3,3,2]");
 
+  // Tests for NCHW_VECT_C
+  // 1x1 filter
+  set_op({{1, 1, 1, 1, 1}}, "VALID", "NCHW_VECT_C");
+  INFER_OK(op, "[1,1,2,2,4];[1,1,4,4]", "[d0_0,1,2,2,4]");
+
+  // 2x2 filter
+  set_op({{1, 1, 1, 1, 1}}, "VALID", "NCHW_VECT_C");
+  INFER_OK(op, "[1,1,2,2,4];[2,2,4,4]", "[d0_0,1,1,1,4]");
+
+  // 3x3 input, 1x1 filter, 2x2 stride
+  set_op({{1, 1, 2, 2, 1}}, "VALID", "NCHW_VECT_C");
+  INFER_OK(op, "[1,1,3,3,4];[1,1,4,8]", "[d0_0,2,2,2,4]");
+
+  // 3x3 input, 1x1 filter, 2x1 stride
+  set_op({{1, 1, 2, 1, 1}}, "VALID", "NCHW_VECT_C");
+  INFER_OK(op, "[1,1,3,3,4];[1,1,4,4]", "[d0_0,1,2,3,4]");
+
+  // 4x4 input, 2x1 filter, 1x2 stride
+  set_op({{1, 1, 1, 2, 1}}, "VALID", "NCHW_VECT_C");
+  INFER_OK(op, "[1,1,4,4,4];[2,1,4,4]", "[d0_0,1,3,2,4]");
+
   // Some tests for "SAME" padding
 
   // 4x4 input, 1x1 filter, 1x1 stride
diff --git a/tensorflow/core/framework/device_base.cc b/tensorflow/core/framework/device_base.cc
index ea0ed3ccbbd977510b6878a4a951ae61e138845f..f5bc24aafe3cdf55aad2f89a33c71cd717c303f8 100644
--- a/tensorflow/core/framework/device_base.cc
+++ b/tensorflow/core/framework/device_base.cc
@@ -19,4 +19,8 @@ namespace tensorflow {
 
 DeviceBase::~DeviceBase() {}
 
+const DeviceAttributes& DeviceBase::attributes() const {
+  LOG(FATAL) << "Device does not implement attributes()";
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 27fe28fe60a9bd020f9db16c49506741336c9863..e1eb387d88b4c38932f63bc555207dd544a95666 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -19,9 +19,9 @@ limitations under the License.
 #include <memory>
 #include <unordered_map>
 
-#include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -44,10 +44,12 @@ class Stream;
 namespace tensorflow {
 
 class Device;
+class DeviceAttributes;
 class Env;
 class EventMgr;
 class OpKernelContext;
 class ResourceMgr;
+class TensorProto;
 
 namespace thread {
 class ThreadPool;
@@ -194,11 +196,8 @@ class DeviceBase {
                                      DeviceContext* /*dc*/,
                                      Allocator* /*allocator*/) {}
 
-  virtual const DeviceAttributes& attributes() const {
-    LOG(FATAL) << "Device does not implement attributes()";
-    static DeviceAttributes dummy;
-    return dummy;
-  }
+  // Unimplemented by default
+  virtual const DeviceAttributes& attributes() const;
 
   // Materializes the given TensorProto into 'tensor' stored in Device
   // memory.  Most devices will want to override this.
diff --git a/tensorflow/core/framework/fake_input.cc b/tensorflow/core/framework/fake_input.cc
index 7a21dd5066c341dbdb397709c98e6a31df09b1ef..ad301a8aa4ba4be5b7031d00984d8e6febf1583e 100644
--- a/tensorflow/core/framework/fake_input.cc
+++ b/tensorflow/core/framework/fake_input.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/fake_input.h"
 
 #include <vector>
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/op_def_util.h"
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index 9026075a2f0f7cd7b46e3ee2d95c92cd0d37bf30..1774f74ca8f66dfc885f4a52a2624bef485f05ea 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/function.pb_text.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/graph/graph.h"
@@ -722,6 +724,23 @@ string DebugStringWhole(const GraphDef& gdef) {
   return ret;
 }
 
+namespace {
+
+// Returns the name -> attr mapping of fdef's attrs that have a value set. In
+// Python, it's possible to access unset attrs, which returns a default value
+// and adds an unset attr to the map.
+std::map<string, AttrValue> GetSetAttrs(const FunctionDef& fdef) {
+  std::map<string, AttrValue> set_attrs;
+  for (auto pair : fdef.attr()) {
+    if (pair.second.value_case() != AttrValue::VALUE_NOT_SET) {
+      set_attrs[pair.first] = pair.second;
+    }
+  }
+  return set_attrs;
+}
+
+}  // end namespace
+
 bool FunctionDefsEqual(const FunctionDef& f1, const FunctionDef& f2) {
   // NOTE(skyewm): Using MessageDifferencer would be better here, but that is
   // currently not included in tensorflow/core/platform/default/protobuf.h, so
@@ -734,10 +753,12 @@ bool FunctionDefsEqual(const FunctionDef& f1, const FunctionDef& f2) {
   f2.signature().SerializeToString(&sig2);
   if (sig1 != sig2) return false;
 
-  if (f1.attr().size() != f2.attr().size()) return false;
-  for (auto iter1 : f1.attr()) {
-    auto iter2 = f2.attr().find(iter1.first);
-    if (iter2 == f2.attr().end()) return false;
+  std::map<string, AttrValue> f1_attrs = GetSetAttrs(f1);
+  std::map<string, AttrValue> f2_attrs = GetSetAttrs(f2);
+  if (f1_attrs.size() != f2_attrs.size()) return false;
+  for (auto iter1 : f1_attrs) {
+    auto iter2 = f2_attrs.find(iter1.first);
+    if (iter2 == f2_attrs.end()) return false;
     if (!AreAttrValuesEqual(iter1.second, iter2->second)) return false;
   }
 
@@ -881,11 +902,17 @@ const FunctionDef* FunctionLibraryDefinition::Find(const string& name) const {
 }
 
 Status FunctionLibraryDefinition::AddFunctionDef(const FunctionDef& fdef) {
-  auto& ptr = function_defs_[fdef.signature().name()];
-  if (ptr != nullptr) {
-    return errors::InvalidArgument("Function with name: ",
-                                   fdef.signature().name(),
-                                   " already exists in function library.");
+  std::unique_ptr<FunctionDefAndOpRegistration>* entry =
+      &function_defs_[fdef.signature().name()];
+  if (*entry != nullptr) {
+    if (!FunctionDefsEqual((*entry)->fdef, fdef)) {
+      return errors::InvalidArgument(
+          "Cannot add function '", fdef.signature().name(),
+          "' because a different function with the same name already "
+          "exists.");
+    }
+    // Ignore duplicate FunctionDefs
+    return Status::OK();
   }
   const OpDef* op_def;
   if (default_registry_->LookUpOpDef(fdef.signature().name(), &op_def).ok()) {
@@ -893,19 +920,27 @@ Status FunctionLibraryDefinition::AddFunctionDef(const FunctionDef& fdef) {
         "Cannot add function '", fdef.signature().name(),
         "' because an op with the same name already exists.");
   }
-  ptr.reset(new FunctionDefAndOpRegistration(fdef));
+  entry->reset(new FunctionDefAndOpRegistration(fdef));
   return Status::OK();
 }
 
 Status FunctionLibraryDefinition::AddGradientDef(const GradientDef& grad) {
-  if (func_grad_.count(grad.function_name()) > 0) {
-    return errors::InvalidArgument("Gradient for function '",
-                                   grad.function_name(), "' already exists.");
+  string* entry = &func_grad_[grad.function_name()];
+  if (!entry->empty()) {
+    if (*entry != grad.gradient_func()) {
+      return errors::InvalidArgument(
+          "Cannot assign gradient function '", grad.gradient_func(), "' to '",
+          grad.function_name(), "' because it already has gradient function ",
+          "'", *entry, "'");
+    }
+    // Ignore duplicate GradientDefs
+    return Status::OK();
   }
-  func_grad_[grad.function_name()] = grad.gradient_func();
+  *entry = grad.gradient_func();
   return Status::OK();
 }
 
+// TODO(skyewm): don't modify FunctionLibraryDefinition in case of error
 Status FunctionLibraryDefinition::AddLibrary(
     const FunctionLibraryDefinition& other) {
   for (auto iter : other.function_defs_) {
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 6c2da84790c021587b011c6b71ce25f833205c2b..d840d2f001d8cfe4ad7dc0e27ac02fbf00b14597 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -17,9 +17,10 @@ limitations under the License.
 #define TENSORFLOW_FRAMEWORK_FUNCTION_H_
 
 #include <vector>
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/function.pb.h"
-#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/selective_registration.h"
@@ -33,9 +34,12 @@ limitations under the License.
 namespace tensorflow {
 
 class CancellationManager;
+class GraphDef;
 class OpKernel;
 class ResourceMgr;
+class Rendezvous;
 class ScopedStepContainer;
+class StepStatsCollector;
 class Node;
 
 // FunctionDefHelper::Create is a convenient helper to construct a
@@ -285,20 +289,24 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   const FunctionDef* Find(const string& func) const;
 
   // Adds function definition 'fdef' to this function library.
-  // Returns status 'ok' on success, or error otherwise.
+  // Returns status 'ok' on success, or error otherwise. This is a no-op if
+  // 'fdef' already exists in this function library.
   // If 'fdef' is successfully added to the library, it will be accessible
   // from 'LookUp' and included in the proto returned by 'ToProto'.
   Status AddFunctionDef(const FunctionDef& fdef);
 
   // Adds gradient definition 'grad' to this function library.
+  // This is a no-op if 'grad' already exists in this function library.
   // If 'grad' is successfully added, it will be accessible via 'FindGradient'
   // and included in the proto returned by 'ToProto'.
   Status AddGradientDef(const GradientDef& grad);
 
   // Adds the functions and gradients in 'other' to this function library.
+  // Duplicate functions and gradients are ignored.
   Status AddLibrary(const FunctionLibraryDefinition& other);
 
   // Adds the functions and gradients in 'lib_def' to this function library.
+  // Duplicate functions and gradients are ignored.
   Status AddLibrary(const FunctionDefLibrary& lib_def);
 
   // If the gradient function for 'func' is specified explicitly in
@@ -348,9 +356,9 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   };
 
   const OpRegistryInterface* const default_registry_;
-  gtl::FlatMap<string, std::unique_ptr<FunctionDefAndOpRegistration>, HashStr>
+  gtl::FlatMap<string, std::unique_ptr<FunctionDefAndOpRegistration>>
       function_defs_;
-  gtl::FlatMap<string, string, HashStr> func_grad_;
+  gtl::FlatMap<string, string> func_grad_;
 
   // Helper function for GetAttr. Returns the FunctionDef* to get the
   // attr from.
@@ -391,12 +399,12 @@ class FunctionLibraryRuntime {
   //
   // Does not take ownership of "rets".
   struct Options {
-    CancellationManager* cancellation_manager = nullptr;
     // The id of the step that is calling this function.
     int64 step_id = 0;
-
-    // Per-step container.
-    ScopedStepContainer* step_container;
+    Rendezvous* rendezvous = nullptr;
+    CancellationManager* cancellation_manager = nullptr;
+    ScopedStepContainer* step_container = nullptr;
+    StepStatsCollector* stats_collector = nullptr;
 
     std::function<void(std::function<void()>)>* runner = nullptr;
   };
diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc
index 2ecdc36c111b7d331ab9e23221e70aff9e2ddbcc..140dbd89329cf778dd99712fc42d0aef73cdd633 100644
--- a/tensorflow/core/framework/function_test.cc
+++ b/tensorflow/core/framework/function_test.cc
@@ -971,6 +971,10 @@ TEST(FunctionLibraryDefinitionTest, AddFunctionDef) {
   EXPECT_EQ(s.error_message(),
             "Cannot add function 'Add' because an op with the same name "
             "already exists.");
+
+  // Already-added functions don't produce error
+  TF_EXPECT_OK(lib_def.AddFunctionDef(test::function::XTimesTwo()));
+  TF_EXPECT_OK(lib_def.AddFunctionDef(test::function::WXPlusB()));
 }
 
 TEST(FunctionLibraryDefinitionTest, AddGradientDef) {
@@ -984,12 +988,16 @@ TEST(FunctionLibraryDefinitionTest, AddGradientDef) {
   grad.set_gradient_func(test::function::XTimesFour().signature().name());
   TF_EXPECT_OK(lib_def.AddGradientDef(grad));
 
+  // Already-added gradients don't produce error
+  TF_EXPECT_OK(lib_def.AddGradientDef(grad));
+
   // Test that adding a duplicate gradient fails
   grad.set_gradient_func(test::function::XTimes16().signature().name());
   Status s = lib_def.AddGradientDef(grad);
   EXPECT_EQ(s.code(), error::Code::INVALID_ARGUMENT);
   EXPECT_EQ(s.error_message(),
-            "Gradient for function 'XTimesTwo' already exists.");
+            "Cannot assign gradient function 'XTimes16' to 'XTimesTwo' because "
+            "it already has gradient function 'XTimesFour'");
 }
 
 TEST(FunctionLibraryDefinitionTest, AddLibrary) {
@@ -998,35 +1006,46 @@ TEST(FunctionLibraryDefinitionTest, AddLibrary) {
   *proto.add_function() = test::function::XTimesTwo();
   FunctionLibraryDefinition lib_def(OpRegistry::Global(), proto);
 
-  // Error if you try to add the same function twice
-  Status s = lib_def.AddLibrary(lib_def);
-  EXPECT_EQ(s.code(), error::Code::INVALID_ARGUMENT);
-  EXPECT_EQ(s.error_message(),
-            "Function with name: XTimesTwo already exists in function "
-            "library.");
-
   // Add gradient
   GradientDef grad;
   grad.set_function_name(test::function::XTimesTwo().signature().name());
   grad.set_gradient_func(test::function::XTimesFour().signature().name());
   TF_EXPECT_OK(lib_def.AddGradientDef(grad));
 
-  // Error if you try to add the same library function twice
+  // Error if you try to add conflicting function
   proto.Clear();
-  *proto.add_gradient() = grad;
+  FunctionDef fdef = test::function::XTimesFour();
+  fdef.mutable_signature()->set_name(
+      test::function::XTimesTwo().signature().name());
+  *proto.add_function() = fdef;
   FunctionLibraryDefinition lib_def2(OpRegistry::Global(), proto);
-  s = lib_def.AddLibrary(lib_def2);
+  Status s = lib_def.AddLibrary(lib_def2);
+  EXPECT_EQ(s.code(), error::Code::INVALID_ARGUMENT);
+  EXPECT_EQ(s.error_message(),
+            "Cannot add function 'XTimesTwo' because a different function with "
+            "the same name already exists.");
+
+  // Error if you try to add conflicting gradient
+  proto.Clear();
+  grad.set_gradient_func(test::function::XTimes16().signature().name());
+  *proto.add_gradient() = grad;
+  FunctionLibraryDefinition lib_def3(OpRegistry::Global(), proto);
+  s = lib_def.AddLibrary(lib_def3);
   EXPECT_EQ(s.code(), error::Code::INVALID_ARGUMENT);
   EXPECT_EQ(s.error_message(),
-            "Gradient for function 'XTimesTwo' already exists.");
+            "Cannot assign gradient function 'XTimes16' to 'XTimesTwo' because "
+            "it already has gradient function 'XTimesFour'");
 
   // No conflicting functions or gradients OK
   proto.Clear();
   *proto.add_function() = test::function::XTimesFour();
   grad.set_function_name(test::function::XTimes16().signature().name());
   *proto.add_gradient() = grad;
-  FunctionLibraryDefinition lib_def3(OpRegistry::Global(), proto);
-  TF_EXPECT_OK(lib_def.AddLibrary(lib_def3));
+  FunctionLibraryDefinition lib_def4(OpRegistry::Global(), proto);
+  TF_EXPECT_OK(lib_def.AddLibrary(lib_def4));
+
+  // OK to add the same functions and gradients twice
+  TF_EXPECT_OK(lib_def.AddLibrary(lib_def));
 }
 
 TEST(FunctionLibraryDefinitionTest, ToProto) {
@@ -1137,7 +1156,7 @@ TEST(FunctionLibraryDefinitionTest, GetAttr_Gradient) {
 // TODO(skyewm): this could be more thorough
 TEST(FunctionDefsEqualTest, TestFunctionDefsEqual) {
   // Equal functions
-  FunctionDef fdef1 = test::function::XTimesTwo();
+  const FunctionDef fdef1 = test::function::XTimesTwo();
   FunctionDef fdef2 = test::function::XTimesTwo();
   EXPECT_TRUE(FunctionDefsEqual(fdef1, fdef2));
 
@@ -1164,6 +1183,22 @@ TEST(FunctionDefsEqualTest, TestFunctionDefsEqual) {
   fdef2 = test::function::XTimesTwo();
   (*fdef2.mutable_ret())["y"] = "y:z:1";  // originally is "y:z:0"
   EXPECT_FALSE(FunctionDefsEqual(fdef1, fdef2));
+
+  // Different attributes
+  fdef2 = test::function::XTimesTwo();
+  SetAttrValue(&fdef2, "ExtraAttr", true);
+  EXPECT_FALSE(FunctionDefsEqual(fdef1, fdef2));
+
+  // Multiple equivalent attributes; the two functions should be equal.
+  fdef2 = test::function::XTimesTwo();
+  FunctionDef fdef3 = test::function::XTimesTwo();
+  SetAttrValue(&fdef2, "Foo", true);
+  SetAttrValue(&fdef3, "Foo", true);
+  SetAttrValue(&fdef2, "Bar", 123);
+  SetAttrValue(&fdef3, "Bar", 123);
+  SetAttrValue(&fdef2, "Baz", "abc");
+  SetAttrValue(&fdef3, "Baz", "abc");
+  EXPECT_TRUE(FunctionDefsEqual(fdef2, fdef3));
 }
 
 }  // end namespace
diff --git a/tensorflow/core/framework/function_testlib.cc b/tensorflow/core/framework/function_testlib.cc
index 7caddf3cb86452bc2159b8f4c5e2369b44ae9b2c..4ee23226daae0239c18a549ba21f780495924346 100644
--- a/tensorflow/core/framework/function_testlib.cc
+++ b/tensorflow/core/framework/function_testlib.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/function_testlib.h"
 
 #include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/public/version.h"
diff --git a/tensorflow/core/framework/function_testlib.h b/tensorflow/core/framework/function_testlib.h
index 56b8f0aa1bf70d17de1d66033875e071d82c6f5a..49e5b0c99d95bbce0070b1fb87fa35743b4ed1a6 100644
--- a/tensorflow/core/framework/function_testlib.h
+++ b/tensorflow/core/framework/function_testlib.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/core/framework/graph_def_util.cc b/tensorflow/core/framework/graph_def_util.cc
index 8496774793d65c284610784a13db5510d3db8e9f..bd018b7243897a5b45aa35d7fb94ca1ee1b12e75 100644
--- a/tensorflow/core/framework/graph_def_util.cc
+++ b/tensorflow/core/framework/graph_def_util.cc
@@ -20,7 +20,10 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/framework/versions.pb_text.h"
diff --git a/tensorflow/core/framework/graph_def_util.h b/tensorflow/core/framework/graph_def_util.h
index 56355eaf367b264e4db4c5ae19581a7c69a4e6ed..838c9fd4ce319034cfdcf834c1af0e869a74c5d2 100644
--- a/tensorflow/core/framework/graph_def_util.h
+++ b/tensorflow/core/framework/graph_def_util.h
@@ -17,13 +17,15 @@ limitations under the License.
 #define TENSORFLOW_FRAMEWORK_GRAPH_DEF_UTIL_H_
 
 #include <set>
-
-#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 
+// Forward declare proto so that it's symbols can be removed from .so exports
+class GraphDef;
+
 // Produce a human-readable version of a GraphDef that is more concise
 // than a text-format proto.
 string SummarizeGraphDef(const GraphDef& graph_def);
@@ -62,7 +64,7 @@ Status AddDefaultAttrsToGraphDef(GraphDef* graph_def,
 // attr with a default was added). Note that this will not affect
 // attrs with non-default values, so you must run a
 // ValidateGraphDef...() function to see if the result is in fact
-// compatible. If not nulllptr, the op/attr pairs that were removed
+// compatible. If not nullptr, the op/attr pairs that were removed
 // are added to '*op_attr_removed'.
 //
 // Expected usage, for a producer that wants to prepare a graph for
diff --git a/tensorflow/core/framework/kernel_def_builder.cc b/tensorflow/core/framework/kernel_def_builder.cc
index 6366ac5bebb304b0c0c978e1c3b939e361848288..eb86f18ff06c38860e0c24e60b42326317ddecfb 100644
--- a/tensorflow/core/framework/kernel_def_builder.cc
+++ b/tensorflow/core/framework/kernel_def_builder.cc
@@ -13,9 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/kernel_def.pb_text.h"
-#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
 
 namespace tensorflow {
 
@@ -24,6 +25,10 @@ KernelDefBuilder::KernelDefBuilder(const char* op_name) {
   kernel_def_->set_op(op_name);
 }
 
+KernelDefBuilder::~KernelDefBuilder() {
+  DCHECK(kernel_def_ == nullptr) << "Did not call Build()";
+}
+
 KernelDefBuilder& KernelDefBuilder::Device(const char* device_type) {
   kernel_def_->set_device_type(device_type);
   return *this;
@@ -61,4 +66,10 @@ KernelDefBuilder& KernelDefBuilder::Label(const char* label) {
   return *this;
 }
 
+const KernelDef* KernelDefBuilder::Build() {
+  KernelDef* r = kernel_def_;
+  kernel_def_ = nullptr;
+  return r;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/kernel_def_builder.h b/tensorflow/core/framework/kernel_def_builder.h
index 84657f8dbf339b61eb7e820531cbc77c6cd13a4d..27f768c72fd1e556dd65709d6ef3397546095d51 100644
--- a/tensorflow/core/framework/kernel_def_builder.h
+++ b/tensorflow/core/framework/kernel_def_builder.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_FRAMEWORK_KERNEL_DEF_BUILDER_H_
 #define TENSORFLOW_FRAMEWORK_KERNEL_DEF_BUILDER_H_
 
-#include "tensorflow/core/framework/kernel_def.pb.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/macros.h"
@@ -24,16 +24,16 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Forward declare proto so that kernels don't need to depend on it
+class KernelDef;
+
 // Builder class passed to the REGISTER_KERNEL_BUILDER() macro.
 class KernelDefBuilder {
  public:
   // Starts with just the name field set.
   // Caller MUST call Build() and take ownership of the result.
   explicit KernelDefBuilder(const char* op_name);
-
-  ~KernelDefBuilder() {
-    DCHECK(kernel_def_ == nullptr) << "Did not call Build()";
-  }
+  ~KernelDefBuilder();
 
   // Required: specify the type of device this kernel supports.
   // Returns *this.
@@ -68,11 +68,7 @@ class KernelDefBuilder {
   // Returns a pointer to a KernelDef with fields set based on the
   // above calls to this instance.
   // Caller takes ownership of the result.
-  const KernelDef* Build() {
-    KernelDef* r = kernel_def_;
-    kernel_def_ = nullptr;
-    return r;
-  }
+  const KernelDef* Build();
 
  private:
   KernelDef* kernel_def_;
diff --git a/tensorflow/core/framework/memory_types.cc b/tensorflow/core/framework/memory_types.cc
index c1dde1504a7cf647455c174b659bab3fb3792789..6a2eed94b94971d20faffa1608627290c1109d66 100644
--- a/tensorflow/core/framework/memory_types.cc
+++ b/tensorflow/core/framework/memory_types.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/core/framework/kernel_def.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/core/framework/memory_types.h b/tensorflow/core/framework/memory_types.h
index e35e22f5907b099afa8722e291fe408cf9c96fc5..a82aea9f0763bb940c14bad7f5f2204fa5b51559 100644
--- a/tensorflow/core/framework/memory_types.h
+++ b/tensorflow/core/framework/memory_types.h
@@ -16,12 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_FRAMEWORK_MEMORY_TYPES_H_
 #define TENSORFLOW_FRAMEWORK_MEMORY_TYPES_H_
 
-#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/types.h"
 
 namespace tensorflow {
 
+class NodeDef;
+
 // Returns into *{input,output}_memory_types the memory type of each
 // {input,output} tensor.
 //
diff --git a/tensorflow/core/framework/node_def_builder.cc b/tensorflow/core/framework/node_def_builder.cc
index 9385d1266a90e5d43a928f38b4f371a71c97c878..f9cf6ce87359d6e4df03306629b53a73e5673181 100644
--- a/tensorflow/core/framework/node_def_builder.cc
+++ b/tensorflow/core/framework/node_def_builder.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 
 #include <vector>
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -83,6 +84,25 @@ NodeDefBuilder& NodeDefBuilder::Input(FakeInputFunctor fake_input) {
   return *this;
 }
 
+NodeDefBuilder& NodeDefBuilder::Input(StringPiece src_node, int src_index,
+                                      DataType dt) {
+  const OpDef::ArgDef* arg = NextArgDef();
+  if (arg != nullptr) SingleInput(arg, src_node, src_index, dt);
+  return *this;
+}
+
+NodeDefBuilder& NodeDefBuilder::Input(const NodeOut& src) {
+  Input(src.node, src.index, src.data_type);
+  return *this;
+}
+
+// For inputs that take a list of tensors.
+NodeDefBuilder& NodeDefBuilder::Input(gtl::ArraySlice<NodeOut> src_list) {
+  const OpDef::ArgDef* arg = NextArgDef();
+  if (arg != nullptr) ListInput(arg, src_list);
+  return *this;
+}
+
 void NodeDefBuilder::SingleInput(const OpDef::ArgDef* input_arg,
                                  StringPiece src_node, int src_index,
                                  DataType dt) {
@@ -228,14 +248,51 @@ Status NodeDefBuilder::Finalize(NodeDef* node_def) const {
   }
 }
 
-void NodeDefBuilder::CheckInconsistency(StringPiece attr_name,
-                                        const AttrValue& found,
-                                        const AttrValue& attr_value) {
-  if (!AreAttrValuesEqual(found, attr_value)) {
-    errors_.push_back(strings::StrCat(
-        "Inconsistent values for attr '", attr_name, "' ",
-        SummarizeAttrValue(found), " vs. ", SummarizeAttrValue(attr_value)));
+NodeDefBuilder& NodeDefBuilder::Attr(StringPiece name, const AttrValue& value) {
+  if (const AttrValue* found = AttrSlice(node_def_).Find(name)) {
+    if (!AreAttrValuesEqual(*found, value)) {
+      errors_.push_back(strings::StrCat("Inconsistent values for attr '", name,
+                                        "' ", SummarizeAttrValue(*found),
+                                        " vs. ", SummarizeAttrValue(value)));
+    }
+  } else {
+    AddNodeAttr(name, value, &node_def_);
   }
+  return *this;
 }
 
+#define ATTR(T)                                                     \
+  NodeDefBuilder& NodeDefBuilder::Attr(StringPiece name, T value) { \
+    AttrValue attr_value;                                           \
+    SetAttrValue(value, &attr_value);                               \
+    return Attr(name, attr_value);                                  \
+  }
+ATTR(StringPiece)
+ATTR(const char*)
+ATTR(int32)
+ATTR(int64)
+ATTR(float)
+ATTR(double)
+ATTR(bool)
+ATTR(DataType)
+ATTR(const PartialTensorShape&)
+ATTR(const Tensor&)
+ATTR(const TensorProto&)
+ATTR(const NameAttrList&)
+ATTR(gtl::ArraySlice<StringPiece>)
+ATTR(gtl::ArraySlice<const char*>)
+ATTR(gtl::ArraySlice<string>)
+ATTR(gtl::ArraySlice<int32>)
+ATTR(gtl::ArraySlice<int64>)
+ATTR(gtl::ArraySlice<float>)
+ATTR(gtl::ArraySlice<bool>)
+ATTR(const std::vector<bool>&)
+ATTR(gtl::ArraySlice<DataType>)
+ATTR(gtl::ArraySlice<TensorShape>)
+ATTR(gtl::ArraySlice<PartialTensorShape>)
+ATTR(gtl::ArraySlice<TensorShapeProto>)
+ATTR(gtl::ArraySlice<Tensor>)
+ATTR(gtl::ArraySlice<NameAttrList>)
+#undef ATTR
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/node_def_builder.h b/tensorflow/core/framework/node_def_builder.h
index c09d96bfa6ddc3cd87763ee2e41cb2cf290e0598..d7f1d36540ad87090e86b7de2d9c7928f7cf274c 100644
--- a/tensorflow/core/framework/node_def_builder.h
+++ b/tensorflow/core/framework/node_def_builder.h
@@ -19,7 +19,8 @@ limitations under the License.
 #include <functional>
 #include <vector>
 #include "tensorflow/core/framework/attr_value_util.h"
-#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"  // TODO(b/62899350): Remove
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
@@ -71,22 +72,11 @@ class NodeDefBuilder {
   // *and in the same order as the input_args appear in the OpDef.*
 
   // For inputs that take a single tensor.
-  NodeDefBuilder& Input(StringPiece src_node, int src_index, DataType dt) {
-    const OpDef::ArgDef* arg = NextArgDef();
-    if (arg != nullptr) SingleInput(arg, src_node, src_index, dt);
-    return *this;
-  }
-  NodeDefBuilder& Input(const NodeOut& src) {
-    Input(src.node, src.index, src.data_type);
-    return *this;
-  }
+  NodeDefBuilder& Input(StringPiece src_node, int src_index, DataType dt);
+  NodeDefBuilder& Input(const NodeOut& src);
 
   // For inputs that take a list of tensors.
-  NodeDefBuilder& Input(gtl::ArraySlice<NodeOut> src_list) {
-    const OpDef::ArgDef* arg = NextArgDef();
-    if (arg != nullptr) ListInput(arg, src_list);
-    return *this;
-  }
+  NodeDefBuilder& Input(gtl::ArraySlice<NodeOut> src_list);
 
   // To create inputs in tests, see fake_input.h.
   NodeDefBuilder& Input(FakeInputFunctor fake_input);
@@ -99,13 +89,39 @@ class NodeDefBuilder {
 
   // Sets the attr, if not already set.  If already set with a different
   // value, an error will be returned from Finalize().
+  NodeDefBuilder& Attr(StringPiece name, const AttrValue& value);
+  NodeDefBuilder& Attr(StringPiece name, StringPiece value);
+  NodeDefBuilder& Attr(StringPiece name, const char* value);
+  NodeDefBuilder& Attr(StringPiece name, int32 value);
+  NodeDefBuilder& Attr(StringPiece name, int64 value);
+  NodeDefBuilder& Attr(StringPiece name, float value);
+  NodeDefBuilder& Attr(StringPiece name, double value);
+  NodeDefBuilder& Attr(StringPiece name, bool value);
+  NodeDefBuilder& Attr(StringPiece name, DataType value);
+  NodeDefBuilder& Attr(StringPiece name, const PartialTensorShape& value);
+  NodeDefBuilder& Attr(StringPiece name, const Tensor& value);
+  NodeDefBuilder& Attr(StringPiece name, const TensorProto& value);
+  NodeDefBuilder& Attr(StringPiece name, const NameAttrList& value);
+  NodeDefBuilder& Attr(StringPiece name, gtl::ArraySlice<StringPiece> value);
+  NodeDefBuilder& Attr(StringPiece name, gtl::ArraySlice<const char*> value);
+  NodeDefBuilder& Attr(StringPiece name, gtl::ArraySlice<string> value);
+  NodeDefBuilder& Attr(StringPiece name, gtl::ArraySlice<int32> value);
+  NodeDefBuilder& Attr(StringPiece name, gtl::ArraySlice<int64> value);
+  NodeDefBuilder& Attr(StringPiece name, gtl::ArraySlice<float> value);
+  NodeDefBuilder& Attr(StringPiece name, gtl::ArraySlice<bool> value);
+  NodeDefBuilder& Attr(StringPiece name, const std::vector<bool>& value);
+  NodeDefBuilder& Attr(StringPiece name, gtl::ArraySlice<DataType> value);
+  NodeDefBuilder& Attr(StringPiece name, gtl::ArraySlice<TensorShape> value);
+  NodeDefBuilder& Attr(StringPiece name,
+                       gtl::ArraySlice<PartialTensorShape> value);
+  NodeDefBuilder& Attr(StringPiece name,
+                       gtl::ArraySlice<TensorShapeProto> value);
+  NodeDefBuilder& Attr(StringPiece name, gtl::ArraySlice<Tensor> value);
+  NodeDefBuilder& Attr(StringPiece name, gtl::ArraySlice<NameAttrList> value);
+
   template <class T>
-  NodeDefBuilder& Attr(StringPiece attr_name, T&& value);
-  // Note: overload needed to allow {...} expressions for value.
-  template <class T>
-  NodeDefBuilder& Attr(StringPiece attr_name, std::initializer_list<T> value) {
-    Attr<std::initializer_list<T>>(attr_name, std::move(value));
-    return *this;
+  NodeDefBuilder& Attr(StringPiece name, std::initializer_list<T> value) {
+    return Attr(name, gtl::ArraySlice<T>(value));
   }
 
   // Finish building the NodeDef, returning any errors or setting
@@ -151,9 +167,6 @@ class NodeDefBuilder {
     return input_arg->is_ref() ? MakeRefType(dt) : dt;
   }
 
-  void CheckInconsistency(StringPiece attr_name, const AttrValue& found,
-                          const AttrValue& attr_value);
-
   const OpDef* op_def_;
   NodeDef node_def_;
   int inputs_specified_;
@@ -161,21 +174,6 @@ class NodeDefBuilder {
   std::vector<string> errors_;
 };
 
-// IMPLEMENTATION -------------------------------------------------------------
-
-template <class T>
-NodeDefBuilder& NodeDefBuilder::Attr(StringPiece attr_name, T&& value) {
-  const AttrValue* found = AttrSlice(node_def_).Find(attr_name);
-  if (found == nullptr) {
-    AddNodeAttr(attr_name, std::forward<T>(value), &node_def_);
-  } else {
-    AttrValue attr_value;
-    SetAttrValue(std::forward<T>(value), &attr_value);
-    CheckInconsistency(attr_name, *found, attr_value);
-  }
-  return *this;
-}
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_FRAMEWORK_NODE_DEF_BUILDER_H_
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index 9b737e1f72d26f0c1db64553e24df65575d4b5b4..b98a6033d0a28b9bc3781bb61b6c3032d0e9eb1b 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -21,10 +21,12 @@ limitations under the License.
 
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/graph.pb_text.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb_text.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/framework/tensor.pb_text.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -611,4 +613,56 @@ Status AttachDef(const Status& status, const Node& node) {
   return AttachDef(status, node.def());
 }
 
+void AddNodeAttr(StringPiece name, const AttrValue& value, NodeDef* node_def) {
+  node_def->mutable_attr()->insert(
+      AttrValueMap::value_type(name.ToString(), value));
+}
+
+#define ADD_NODE_ATTR(T)                                           \
+  void AddNodeAttr(StringPiece name, T value, NodeDef* node_def) { \
+    AttrValue attr_value;                                          \
+    SetAttrValue(value, &attr_value);                              \
+    AddNodeAttr(name, attr_value, node_def);                       \
+  }
+ADD_NODE_ATTR(StringPiece)
+ADD_NODE_ATTR(const char*)
+ADD_NODE_ATTR(int32)
+ADD_NODE_ATTR(int64)
+ADD_NODE_ATTR(float)
+ADD_NODE_ATTR(double)
+ADD_NODE_ATTR(bool)
+ADD_NODE_ATTR(DataType)
+ADD_NODE_ATTR(const PartialTensorShape&)
+ADD_NODE_ATTR(const Tensor&)
+ADD_NODE_ATTR(const TensorProto&)
+ADD_NODE_ATTR(const NameAttrList&)
+ADD_NODE_ATTR(gtl::ArraySlice<StringPiece>)
+ADD_NODE_ATTR(gtl::ArraySlice<const char*>)
+ADD_NODE_ATTR(gtl::ArraySlice<string>)
+ADD_NODE_ATTR(gtl::ArraySlice<int32>)
+ADD_NODE_ATTR(gtl::ArraySlice<int64>)
+ADD_NODE_ATTR(gtl::ArraySlice<float>)
+ADD_NODE_ATTR(gtl::ArraySlice<bool>)
+ADD_NODE_ATTR(const std::vector<bool>&)
+ADD_NODE_ATTR(gtl::ArraySlice<DataType>)
+ADD_NODE_ATTR(gtl::ArraySlice<TensorShape>)
+ADD_NODE_ATTR(gtl::ArraySlice<PartialTensorShape>)
+ADD_NODE_ATTR(gtl::ArraySlice<TensorShapeProto>)
+ADD_NODE_ATTR(gtl::ArraySlice<Tensor>)
+ADD_NODE_ATTR(gtl::ArraySlice<NameAttrList>)
+#undef ADD_NODE_ATTR
+
+void AddAttr(StringPiece name, const AttrValue& value, AttrValueMap* map) {
+  map->insert(AttrValueMap::value_type(name.ToString(), value));
+}
+
+#define ADD_ATTR(T)                                            \
+  void AddAttr(StringPiece name, T value, AttrValueMap* map) { \
+    AttrValue attr_value;                                      \
+    SetAttrValue(value, &attr_value);                          \
+    AddAttr(name, attr_value, map);                            \
+  }
+ADD_ATTR(bool)
+#undef ADD_ATTR
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h
index 1438abdec606442246baba00cc6ca818c4cec7d5..a829243a75a024afafc8a628097b05d29e0aabc2 100644
--- a/tensorflow/core/framework/node_def_util.h
+++ b/tensorflow/core/framework/node_def_util.h
@@ -21,8 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/attr_value_util.h"
-#include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_def.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -31,6 +30,10 @@ namespace tensorflow {
 
 class Node;
 
+// We forward declare protos so that kernels don't need to depend on them
+class NodeDef;
+class OpDef;
+
 // Name of the attribute used to encode node colocation constraints.
 //
 // Nodes can be co-located on the same device. Desire for explicit co-location
@@ -50,32 +53,61 @@ typedef protobuf::Map<string, AttrValue> AttrValueMap;
 
 // Adds an attr with name <name> and value <value> to *node_def.
 // The type of the attr is based on the type of value.
-template <class T>
-void AddNodeAttr(StringPiece name, T&& value, NodeDef* node_def) {
-  AttrValue attr_value;
-  SetAttrValue(std::forward<T>(value), &attr_value);
-  node_def->mutable_attr()->insert(
-      AttrValueMap::value_type(name.ToString(), attr_value));
-}
+void AddNodeAttr(StringPiece name, const AttrValue& value, NodeDef* node_def);
+void AddNodeAttr(StringPiece name, StringPiece value, NodeDef* node_def);
+void AddNodeAttr(StringPiece name, const char* value, NodeDef* node_def);
+void AddNodeAttr(StringPiece name, int32 value, NodeDef* node_def);
+void AddNodeAttr(StringPiece name, int64 value, NodeDef* node_def);
+void AddNodeAttr(StringPiece name, float value, NodeDef* node_def);
+void AddNodeAttr(StringPiece name, double value, NodeDef* node_def);
+void AddNodeAttr(StringPiece name, bool value, NodeDef* node_def);
+void AddNodeAttr(StringPiece name, DataType value, NodeDef* node_def);
+void AddNodeAttr(StringPiece name, const PartialTensorShape& value,
+                 NodeDef* node_def);
+void AddNodeAttr(StringPiece name, const Tensor& value, NodeDef* node_def);
+void AddNodeAttr(StringPiece name, const TensorProto& value, NodeDef* node_def);
+void AddNodeAttr(StringPiece name, const NameAttrList& value,
+                 NodeDef* node_def);
+void AddNodeAttr(StringPiece name, gtl::ArraySlice<StringPiece> value,
+                 NodeDef* node_def);
+void AddNodeAttr(StringPiece name, gtl::ArraySlice<const char*> value,
+                 NodeDef* node_def);
+void AddNodeAttr(StringPiece name, gtl::ArraySlice<string> value,
+                 NodeDef* node_def);
+void AddNodeAttr(StringPiece name, gtl::ArraySlice<int32> value,
+                 NodeDef* node_def);
+void AddNodeAttr(StringPiece name, gtl::ArraySlice<int64> value,
+                 NodeDef* node_def);
+void AddNodeAttr(StringPiece name, gtl::ArraySlice<float> value,
+                 NodeDef* node_def);
+void AddNodeAttr(StringPiece name, gtl::ArraySlice<bool> value,
+                 NodeDef* node_def);
+void AddNodeAttr(StringPiece name, const std::vector<bool>& value,
+                 NodeDef* node_def);
+void AddNodeAttr(StringPiece name, gtl::ArraySlice<DataType> value,
+                 NodeDef* node_def);
+void AddNodeAttr(StringPiece name, gtl::ArraySlice<TensorShape> value,
+                 NodeDef* node_def);
+void AddNodeAttr(StringPiece name, gtl::ArraySlice<PartialTensorShape> value,
+                 NodeDef* node_def);
+void AddNodeAttr(StringPiece name, gtl::ArraySlice<TensorShapeProto> value,
+                 NodeDef* node_def);
+void AddNodeAttr(StringPiece name, gtl::ArraySlice<Tensor> value,
+                 NodeDef* node_def);
+void AddNodeAttr(StringPiece name, gtl::ArraySlice<NameAttrList> value,
+                 NodeDef* node_def);
 
 // Version to workaround C++'s "perfect" forwarding not being able to
 // forward {...} initialization.
 template <class T>
 void AddNodeAttr(StringPiece name, std::initializer_list<T> value,
                  NodeDef* node_def) {
-  AttrValue attr_value;
-  SetAttrValue(value, &attr_value);
-  node_def->mutable_attr()->insert(
-      AttrValueMap::value_type(name.ToString(), attr_value));
+  AddNodeAttr(name, gtl::ArraySlice<T>(value), node_def);
 }
 
 // Adds an attr to an attr value map.
-template <class T>
-void AddAttr(StringPiece name, T&& value, AttrValueMap* map) {
-  AttrValue attr_value;
-  SetAttrValue(value, &attr_value);
-  map->insert(AttrValueMap::value_type(name.ToString(), attr_value));
-}
+void AddAttr(StringPiece name, const AttrValue& value, AttrValueMap* map);
+void AddAttr(StringPiece name, bool value, AttrValueMap* map);
 
 class AttrSlice {
  public:
diff --git a/tensorflow/core/framework/node_def_util_test.cc b/tensorflow/core/framework/node_def_util_test.cc
index b60709c5a4e18a894e994f8461872ad175fcc766..bfd598a97202e4bcbf1f869b2687f7cbca36b36b 100644
--- a/tensorflow/core/framework/node_def_util_test.cc
+++ b/tensorflow/core/framework/node_def_util_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/node_def_util.h"
 
+#include "tensorflow/core/framework/attr_value.pb.h"  // NOLINT
 #include "tensorflow/core/framework/fake_input.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op_def_builder.h"
diff --git a/tensorflow/core/framework/op.cc b/tensorflow/core/framework/op.cc
index fe333dc9ffa3eae6904cc8bf543c93b189de22b5..4f5a1f80a025744f4b2189aa3216304a36b99044 100644
--- a/tensorflow/core/framework/op.cc
+++ b/tensorflow/core/framework/op.cc
@@ -77,9 +77,10 @@ Status OpRegistry::LookUp(const string& op_type_name,
     if (first_unregistered) {
       OpList op_list;
       Export(true, &op_list);
-      VLOG(1) << "All registered Ops:";
-      for (const auto& op : op_list.op()) {
-        VLOG(1) << SummarizeOpDef(op);
+      if (VLOG_IS_ON(3)) {
+         LOG(INFO) << "All registered Ops:";
+         for (const auto& op : op_list.op())
+            LOG(INFO) << SummarizeOpDef(op);
       }
       first_unregistered = false;
     }
diff --git a/tensorflow/core/framework/op.h b/tensorflow/core/framework/op.h
index a4dd06de4538f5a829fdeee1538120d6fc895a52..1c63a6f4c0e1e4947eab2dfdb83050cfbce18c14 100644
--- a/tensorflow/core/framework/op.h
+++ b/tensorflow/core/framework/op.h
@@ -20,7 +20,7 @@ limitations under the License.
 #include <unordered_map>
 
 #include <vector>
-#include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_def.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/op_def_builder.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/framework/selective_registration.h"
diff --git a/tensorflow/core/framework/op_def_builder.cc b/tensorflow/core/framework/op_def_builder.cc
index 58a30a87a8f5695c2923380955485a0097841375..62b504691b2b549d4454dd8c5f8654fe24250605 100644
--- a/tensorflow/core/framework/op_def_builder.cc
+++ b/tensorflow/core/framework/op_def_builder.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <limits>
 #include <vector>
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/core/framework/op_def_util.cc b/tensorflow/core/framework/op_def_util.cc
index c36e6dd653b48163f6a0f4c9d7f446906be84c76..2f25b6e18fc05d25539727c30a4e642750cb9807 100644
--- a/tensorflow/core/framework/op_def_util.cc
+++ b/tensorflow/core/framework/op_def_util.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <set>
 #include <unordered_map>
 #include <unordered_set>
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/op_def.pb_text.h"
 #include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc
index da623ae5b25b3e5271c3fbe24783323a3323fe55..143da996a1e2144a44101b004ea4130abb8e5af6 100644
--- a/tensorflow/core/framework/op_gen_lib.cc
+++ b/tensorflow/core/framework/op_gen_lib.cc
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include <vector>
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_gen_overrides.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -71,6 +73,181 @@ bool ConsumeEquals(StringPiece* description) {
   return false;
 }
 
+// Split `*orig` into two pieces at the first occurrence of `split_ch`.
+// Returns whether `split_ch` was found. Afterwards, `*before_split`
+// contains the maximum prefix of the input `*orig` that doesn't
+// contain `split_ch`, and `*orig` contains everything after the
+// first `split_ch`.
+static bool SplitAt(char split_ch, StringPiece* orig,
+                    StringPiece* before_split) {
+  auto pos = orig->find(split_ch);
+  if (pos == StringPiece::npos) {
+    *before_split = *orig;
+    orig->clear();
+    return false;
+  } else {
+    *before_split = orig->substr(0, pos);
+    orig->remove_prefix(pos + 1);
+    return true;
+  }
+}
+
+// Does this line start with "<spaces><field>:" where "<field>" is
+// in multi_line_fields? Sets *colon_pos to the position of the colon.
+static bool StartsWithFieldName(StringPiece line,
+                                const std::vector<string>& multi_line_fields) {
+  StringPiece up_to_colon;
+  if (!SplitAt(':', &line, &up_to_colon)) return false;
+  while (up_to_colon.Consume(" "))
+    ;  // Remove leading spaces.
+  for (const auto& field : multi_line_fields) {
+    if (up_to_colon == field) {
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool ConvertLine(StringPiece line,
+                        const std::vector<string>& multi_line_fields,
+                        string* ml) {
+  // Is this a field we should convert?
+  if (!StartsWithFieldName(line, multi_line_fields)) {
+    return false;
+  }
+  // Has a matching field name, so look for "..." after the colon.
+  StringPiece up_to_colon;
+  StringPiece after_colon = line;
+  SplitAt(':', &after_colon, &up_to_colon);
+  while (after_colon.Consume(" "))
+    ;  // Remove leading spaces.
+  if (!after_colon.Consume("\"")) {
+    // We only convert string fields, so don't convert this line.
+    return false;
+  }
+  auto last_quote = after_colon.rfind('\"');
+  if (last_quote == StringPiece::npos) {
+    // Error: we don't see the expected matching quote, abort the conversion.
+    return false;
+  }
+  StringPiece escaped = after_colon.substr(0, last_quote);
+  StringPiece suffix = after_colon.substr(last_quote + 1);
+  // We've now parsed line into '<up_to_colon>: "<escaped>"<suffix>'
+
+  string unescaped;
+  if (!str_util::CUnescape(escaped, &unescaped, nullptr)) {
+    // Error unescaping, abort the conversion.
+    return false;
+  }
+  // No more errors possible at this point.
+
+  // Find a string to mark the end that isn't in unescaped.
+  string end = "END";
+  for (int s = 0; unescaped.find(end) != string::npos; ++s) {
+    end = strings::StrCat("END", s);
+  }
+
+  // Actually start writing the converted output.
+  strings::StrAppend(ml, up_to_colon, ": <<", end, "\n", unescaped, "\n", end);
+  if (!suffix.empty()) {
+    // Output suffix, in case there was a trailing comment in the source.
+    strings::StrAppend(ml, suffix);
+  }
+  strings::StrAppend(ml, "\n");
+  return true;
+}
+
+string PBTxtToMultiline(StringPiece pbtxt,
+                        const std::vector<string>& multi_line_fields) {
+  string ml;
+  // Probably big enough, since the input and output are about the
+  // same size, but just a guess.
+  ml.reserve(pbtxt.size() * (17. / 16));
+  StringPiece line;
+  while (!pbtxt.empty()) {
+    // Split pbtxt into its first line and everything after.
+    SplitAt('\n', &pbtxt, &line);
+    // Convert line or output it unchanged
+    if (!ConvertLine(line, multi_line_fields, &ml)) {
+      strings::StrAppend(&ml, line, "\n");
+    }
+  }
+  return ml;
+}
+
+// Given a single line of text `line` with first : at `colon`, determine if
+// there is an "<<END" expression after the colon and if so return true and set
+// `*end` to everything after the "<<".
+static bool FindMultiline(StringPiece line, size_t colon, string* end) {
+  if (colon == StringPiece::npos) return false;
+  line.remove_prefix(colon + 1);
+  while (line.Consume(" ")) {
+  }
+  if (line.Consume("<<")) {
+    *end = line.ToString();
+    return true;
+  }
+  return false;
+}
+
+string PBTxtFromMultiline(StringPiece multiline_pbtxt) {
+  string pbtxt;
+  // Probably big enough, since the input and output are about the
+  // same size, but just a guess.
+  pbtxt.reserve(multiline_pbtxt.size() * (33. / 32));
+  StringPiece line;
+  while (!multiline_pbtxt.empty()) {
+    // Split multiline_pbtxt into its first line and everything after.
+    if (!SplitAt('\n', &multiline_pbtxt, &line)) {
+      strings::StrAppend(&pbtxt, line);
+      break;
+    }
+
+    string end;
+    auto colon = line.find(':');
+    if (!FindMultiline(line, colon, &end)) {
+      // Normal case: not a multi-line string, just output the line as-is.
+      strings::StrAppend(&pbtxt, line, "\n");
+      continue;
+    }
+
+    // Multi-line case:
+    //     something: <<END
+    // xx
+    // yy
+    // END
+    // Should be converted to:
+    //     something: "xx\nyy"
+
+    // Output everything up to the colon ("    something:").
+    strings::StrAppend(&pbtxt, line.substr(0, colon + 1));
+
+    // Add every line to unescaped until we see the "END" string.
+    string unescaped;
+    bool first = true;
+    string suffix;
+    while (!multiline_pbtxt.empty()) {
+      SplitAt('\n', &multiline_pbtxt, &line);
+      if (line.Consume(end)) break;
+      if (first) {
+        first = false;
+      } else {
+        unescaped.push_back('\n');
+      }
+      strings::StrAppend(&unescaped, line);
+      line.clear();
+    }
+
+    // Escape what we extracted and then output it in quotes.
+    strings::StrAppend(&pbtxt, " \"", str_util::CEscape(unescaped), "\"", line,
+                       "\n");
+  }
+  return pbtxt;
+}
+
+OpGenOverrideMap::OpGenOverrideMap() {}
+OpGenOverrideMap::~OpGenOverrideMap() {}
+
 Status OpGenOverrideMap::LoadFileList(Env* env, const string& filenames) {
   std::vector<string> v = str_util::Split(filenames, ",");
   for (const string& f : v) {
@@ -86,7 +263,7 @@ Status OpGenOverrideMap::LoadFile(Env* env, const string& filename) {
   OpGenOverrides all;
   protobuf::TextFormat::ParseFromString(contents, &all);
   for (const auto& one : all.op()) {
-    map_[one.name()] = one;
+    map_[one.name()].reset(new OpGenOverride(one));
   }
   return Status::OK();
 }
@@ -142,7 +319,7 @@ const OpGenOverride* OpGenOverrideMap::ApplyOverride(OpDef* op_def) const {
   // Look up
   const auto iter = map_.find(op_def->name());
   if (iter == map_.end()) return nullptr;
-  const OpGenOverride& proto = iter->second;
+  const OpGenOverride& proto = *iter->second;
 
   // Apply overrides from `proto`.
   if (!proto.rename_to().empty()) {
diff --git a/tensorflow/core/framework/op_gen_lib.h b/tensorflow/core/framework/op_gen_lib.h
index e92dc8d92417e9a9d94148f5a4a4b7ee4c459d4c..70c140ca54145fc976ae1d81e9acefe4b81bca39 100644
--- a/tensorflow/core/framework/op_gen_lib.h
+++ b/tensorflow/core/framework/op_gen_lib.h
@@ -18,14 +18,18 @@ limitations under the License.
 
 #include <string>
 #include <unordered_map>
-#include "tensorflow/core/framework/op_def.pb.h"
-#include "tensorflow/core/framework/op_gen_overrides.pb.h"
+#include "tensorflow/core/framework/op_def.pb.h"  // TODO(b/62899350): Remove
+#include "tensorflow/core/framework/op_gen_overrides.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/env.h"
 
 namespace tensorflow {
 
+// Forward declare protos so their symbols can be removed from .so exports
+class OpDef;
+class OpGenOverride;
+
 inline string Spaces(int n) { return string(n, ' '); }
 
 // Wrap prefix + str to be at most width characters, indenting every line
@@ -39,10 +43,18 @@ string WordWrap(StringPiece prefix, StringPiece str, int width);
 // returns false.
 bool ConsumeEquals(StringPiece* description);
 
+// Convert text-serialized protobufs to/from multiline format.
+string PBTxtToMultiline(StringPiece pbtxt,
+                        const std::vector<string>& multi_line_fields);
+string PBTxtFromMultiline(StringPiece multiline_pbtxt);
+
 // Takes a list of files with OpGenOverrides text protos, and allows you to
 // look up the specific override for any given op.
 class OpGenOverrideMap {
  public:
+  OpGenOverrideMap();
+  ~OpGenOverrideMap();
+
   // `filenames` is a comma-separated list of file names.  If an op
   // is mentioned in more than one file, the last one takes priority.
   Status LoadFileList(Env* env, const string& filenames);
@@ -61,7 +73,7 @@ class OpGenOverrideMap {
   const OpGenOverride* ApplyOverride(OpDef* op_def) const;
 
  private:
-  std::unordered_map<string, OpGenOverride> map_;
+  std::unordered_map<string, std::unique_ptr<OpGenOverride>> map_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/op_gen_lib_test.cc b/tensorflow/core/framework/op_gen_lib_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cc1d117f38466f02fc83c95384f6f13449e46fb3
--- /dev/null
+++ b/tensorflow/core/framework/op_gen_lib_test.cc
@@ -0,0 +1,131 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op_gen_lib.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(OpGenLibTest, MultilinePBTxt) {
+  // Non-multiline pbtxt
+  const string pbtxt = R"(foo: "abc"
+foo: ""
+foo: "\n\n"
+foo: "abc\nEND"
+  foo: "ghi\njkl\n"
+bar: "quotes:\""
+)";
+
+  // Field "foo" converted to multiline but not "bar".
+  const string ml_foo = R"(foo: <<END
+abc
+END
+foo: <<END
+
+END
+foo: <<END
+
+
+
+END
+foo: <<END0
+abc
+END
+END0
+  foo: <<END
+ghi
+jkl
+
+END
+bar: "quotes:\""
+)";
+
+  // Both fields "foo" and "bar" converted to multiline.
+  const string ml_foo_bar = R"(foo: <<END
+abc
+END
+foo: <<END
+
+END
+foo: <<END
+
+
+
+END
+foo: <<END0
+abc
+END
+END0
+  foo: <<END
+ghi
+jkl
+
+END
+bar: <<END
+quotes:"
+END
+)";
+
+  // ToMultiline
+  EXPECT_EQ(ml_foo, PBTxtToMultiline(pbtxt, {"foo"}));
+  EXPECT_EQ(pbtxt, PBTxtToMultiline(pbtxt, {"baz"}));
+  EXPECT_EQ(ml_foo_bar, PBTxtToMultiline(pbtxt, {"foo", "bar"}));
+
+  // FromMultiline
+  EXPECT_EQ(pbtxt, PBTxtFromMultiline(pbtxt));
+  EXPECT_EQ(pbtxt, PBTxtFromMultiline(ml_foo));
+  EXPECT_EQ(pbtxt, PBTxtFromMultiline(ml_foo_bar));
+}
+
+TEST(OpGenLibTest, PBTxtToMultilineErrorCases) {
+  // Everything correct.
+  EXPECT_EQ("f: <<END\n7\nEND\n", PBTxtToMultiline("f: \"7\"\n", {"f"}));
+
+  // In general, if there is a problem parsing in PBTxtToMultiline, it leaves
+  // the line alone.
+
+  // No colon
+  EXPECT_EQ("f \"7\"\n", PBTxtToMultiline("f \"7\"\n", {"f"}));
+  // Only converts strings.
+  EXPECT_EQ("f: 7\n", PBTxtToMultiline("f: 7\n", {"f"}));
+  // No quote after colon.
+  EXPECT_EQ("f: 7\"\n", PBTxtToMultiline("f: 7\"\n", {"f"}));
+  // Only one quote
+  EXPECT_EQ("f: \"7\n", PBTxtToMultiline("f: \"7\n", {"f"}));
+  // Illegal escaping
+  EXPECT_EQ("f: \"7\\\"\n", PBTxtToMultiline("f: \"7\\\"\n", {"f"}));
+}
+
+TEST(OpGenLibTest, PBTxtToMultilineComments) {
+  const string pbtxt = R"(f: "bar"  # Comment 1
+    f: "\n"  # Comment 2
+)";
+  const string ml = R"(f: <<END
+bar
+END  # Comment 1
+    f: <<END
+
+
+END  # Comment 2
+)";
+
+  EXPECT_EQ(ml, PBTxtToMultiline(pbtxt, {"f"}));
+  EXPECT_EQ(pbtxt, PBTxtFromMultiline(ml));
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index dec987e1ed9ebbf3580990d85422b34c9e2a8248..067d30d21a8a842fff32b529c322f00c71a91ffc 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -20,10 +20,12 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/framework/graph.pb_text.h"
 #include "tensorflow/core/framework/kernel_def.pb_text.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/memory_types.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/framework/types.h"
@@ -77,7 +79,7 @@ Status MatchSignatureHelper(const DataTypeSlice expected_inputs,
 // OpKernel ------------------------------------------------------------------
 
 OpKernel::OpKernel(OpKernelConstruction* context)
-    : def_(context->def()),
+    : def_(new NodeDef(context->def())),
       input_types_(context->input_types().begin(),
                    context->input_types().end()),
       input_memory_types_(context->input_memory_types().begin(),
@@ -91,7 +93,7 @@ OpKernel::OpKernel(OpKernelConstruction* context)
       input_name_map_(context->num_inputs()),
       output_name_map_(context->num_outputs()) {
   OP_REQUIRES_OK(context,
-                 NameRangesForNode(def_, *context->op_def_, &input_name_map_,
+                 NameRangesForNode(*def_, *context->op_def_, &input_name_map_,
                                    &output_name_map_));
   OP_REQUIRES_OK(context, CheckOpDeprecation(*context->op_def_,
                                              context->graph_def_version()));
@@ -103,6 +105,11 @@ OpKernel::OpKernel(OpKernelConstruction* context)
 
 OpKernel::~OpKernel() {}
 
+const string& OpKernel::name() const { return def_->name(); }
+const string& OpKernel::type_string() const { return def_->op(); }
+const string& OpKernel::requested_device() const { return def_->device(); }
+const string& OpKernel::requested_input(int i) const { return def_->input(i); }
+
 Status OpKernel::InputRange(StringPiece input_name, int* start,
                             int* stop) const {
   const auto result = input_name_map_.find(input_name.ToString());
@@ -165,6 +172,26 @@ Tensor* PersistentTensor::AccessTensor(OpKernelContext* context) {
 
 // OpKernelConstruction ------------------------------------------------------
 
+OpKernelConstruction::OpKernelConstruction(
+    DeviceType device_type, DeviceBase* device, Allocator* allocator,
+    const NodeDef* node_def, const OpDef* op_def, FunctionLibraryRuntime* flib,
+    const DataTypeSlice& input_types, const MemoryTypeSlice& input_memory_types,
+    const DataTypeSlice& output_types,
+    const MemoryTypeSlice& output_memory_types, int graph_def_version,
+    Status* status)
+    : device_type_(std::move(device_type)),
+      device_(device),
+      allocator_(allocator),
+      def_(node_def),
+      op_def_(op_def),
+      flib_(flib),
+      input_types_(input_types),
+      input_memory_types_(input_memory_types),
+      output_types_(output_types),
+      output_memory_types_(output_memory_types),
+      graph_def_version_(graph_def_version),
+      status_(status) {}
+
 void OpKernelConstruction::SetStatus(const Status& status) {
   status_->Update(status);
 }
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 465395d858c612e0c2eb12ae21a9558e9ca60c08..f8f61df872ed2a9c293405bd728645a810107856 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -24,19 +24,19 @@ limitations under the License.
 #include "tensorflow/core/framework/cancellation.h"
 #include "tensorflow/core/framework/control_flow.h"
 #include "tensorflow/core/framework/device_base.h"
-#include "tensorflow/core/framework/function.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/kernel_def.pb.h"
+#include "tensorflow/core/framework/function.h"  // TODO(b/62899350): Remove
+#include "tensorflow/core/framework/graph.pb.h"  // TODO(b/62899350): Remove
+#include "tensorflow/core/framework/kernel_def.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
-#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/framework/selective_registration.h"
 #include "tensorflow/core/framework/session_state.h"
-#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/step_stats.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/tracking_allocator.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
@@ -65,9 +65,14 @@ class TensorSliceReaderCacheWrapper;
 }  // namespace checkpoint
 
 class AsyncOpKernel;
+class FunctionCallFrame;
+class FunctionLibraryRuntime;
 class OpKernelConstruction;  // declared below
 class OpKernelContext;       // declared below
+class OpRegistryInterface;
 class ResourceMgr;
+class ScopedStepContainer;
+class StepStatsCollector;
 
 class OpKernel {
  public:
@@ -109,9 +114,10 @@ class OpKernel {
   virtual bool IsExpensive() { return expensive_; }
 
   // Accessors.
-  const NodeDef& def() const { return def_; }
-  const string& name() const { return def_.name(); }
-  const string& type_string() const { return def_.op(); }
+  const NodeDef& def() const { return *def_; }
+  const string& name() const;              // Same as def().name()
+  const string& type_string() const;       // Same as def().op()
+  const string& requested_device() const;  // Same as def().device()
   bool is_internal() const { return is_internal_; }
 
   int num_inputs() const { return input_types_.size(); }
@@ -120,6 +126,7 @@ class OpKernel {
   const MemoryTypeVector& input_memory_types() const {
     return input_memory_types_;
   }
+  const string& requested_input(int i) const;  // Same as def().input(i)
 
   int num_outputs() const { return output_types_.size(); }
   DataType output_type(int o) const { return output_types_[o]; }
@@ -157,7 +164,7 @@ class OpKernel {
   Status MakeShape(const Tensor& shape, TensorShape* out) const;
 
  private:
-  const NodeDef def_;
+  const std::unique_ptr<const NodeDef> def_;
   const DataTypeVector input_types_;
   const MemoryTypeVector input_memory_types_;
   const DataTypeVector output_types_;
@@ -227,19 +234,7 @@ class OpKernelConstruction {
                        const MemoryTypeSlice& input_memory_types,
                        const DataTypeSlice& output_types,
                        const MemoryTypeSlice& output_memory_types,
-                       int graph_def_version, Status* status)
-      : device_type_(std::move(device_type)),
-        device_(device),
-        allocator_(allocator),
-        def_(node_def),
-        op_def_(op_def),
-        flib_(flib),
-        input_types_(input_types),
-        input_memory_types_(input_memory_types),
-        output_types_(output_types),
-        output_memory_types_(output_memory_types),
-        graph_def_version_(graph_def_version),
-        status_(status) {}
+                       int graph_def_version, Status* status);
 
   Env* env() const { return device_->env(); }
 
@@ -557,6 +552,7 @@ class OpKernelContext {
     FunctionCallFrame* call_frame = nullptr;
     FunctionLibraryRuntime* function_library = nullptr;
     std::function<void(std::function<void()>)>* runner = nullptr;
+    StepStatsCollector* stats_collector = nullptr;
 
     // TensorSliceReaderCache support.
     checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache = nullptr;
@@ -682,7 +678,7 @@ class OpKernelContext {
   void forward_ref_input_to_ref_output(int input_index, int output_index);
 
   // Returns true when an alias to input[input_index], reshaped to output_shape,
-  // which is is safe to use for in-place computation was written to *output.
+  // which is safe to use for in-place computation was written to *output.
   // Returns false if input[input_index] has a refcount greater than one, or if
   // its type does not match the expected output type of output[output_index],
   // or the number of elements in input[input_index] does not equal the number
@@ -722,7 +718,7 @@ class OpKernelContext {
       StringPiece output_name, const TensorShape& output_shape,
       Tensor** output) TF_MUST_USE_RESULT;
 
-  // Tries to reuse one of of the inputs given in input_indices as a temporary.
+  // Tries to reuse one of the inputs given in input_indices as a temporary.
   // If none of the given inputs can be forwarded, calls
   // allocate_temp() to allocate a new temporary buffer.
   Status forward_input_or_allocate_temp(
@@ -946,6 +942,9 @@ class OpKernelContext {
   std::function<void(std::function<void()>)>* runner() const {
     return params_->runner;
   }
+  StepStatsCollector* stats_collector() const {
+    return params_->stats_collector;
+  }
 
   // Shared resources accessible to this kernel.
   ResourceMgr* resource_manager() const { return params_->resource_manager; }
diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc
index f87b7178449c65a339038cbbd35257fb3ee2e367..47523358bed40898cf82c531dc1a89fea0de88a3 100644
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@@ -19,10 +19,12 @@ limitations under the License.
 #include <utility>
 #include <vector>
 #include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/fake_input.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
diff --git a/tensorflow/core/framework/partial_tensor_shape_test.cc b/tensorflow/core/framework/partial_tensor_shape_test.cc
index f8ebd99bf88a492d858df17f888b883c27211453..54ae019f9b48128aab86b4d6a6154ec5dd60366a 100644
--- a/tensorflow/core/framework/partial_tensor_shape_test.cc
+++ b/tensorflow/core/framework/partial_tensor_shape_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/core/framework/queue_interface.h b/tensorflow/core/framework/queue_interface.h
index baddf0bbfa21b863aff0ab1032f1a5df4a3a1f6d..4aeaab3d9b00a46752279a296f13e67370776357 100644
--- a/tensorflow/core/framework/queue_interface.h
+++ b/tensorflow/core/framework/queue_interface.h
@@ -77,6 +77,9 @@ class QueueInterface : public ResourceBase {
   virtual void Close(OpKernelContext* ctx, bool cancel_pending_enqueues,
                      DoneCallback callback) = 0;
 
+  // Returns true if a given queue is closed and false if it is open.
+  virtual bool is_closed() const = 0;
+
   // Assuming *this represents a shared queue, verify that it matches
   // another instantiation indicated by node_def.
   virtual Status MatchesNodeDef(const NodeDef& node_def) = 0;
diff --git a/tensorflow/core/framework/reader_base.cc b/tensorflow/core/framework/reader_base.cc
index ebed957d99df368fa126ef78a632b9c2ca8a3d47..b8c771a0a1955b29f78478f60972b22d804351b2 100644
--- a/tensorflow/core/framework/reader_base.cc
+++ b/tensorflow/core/framework/reader_base.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/reader_base.h"
 
+#include "tensorflow/core/framework/reader_base.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
diff --git a/tensorflow/core/framework/reader_base.h b/tensorflow/core/framework/reader_base.h
index 0528841814b8227e4bcc293f2f6b8da0c2fb5808..2b5052e959b799cf83a1657a26a9a1b5da2e8d5b 100644
--- a/tensorflow/core/framework/reader_base.h
+++ b/tensorflow/core/framework/reader_base.h
@@ -19,12 +19,14 @@ limitations under the License.
 #include <memory>
 #include <string>
 #include "tensorflow/core/framework/queue_interface.h"
-#include "tensorflow/core/framework/reader_base.pb.h"
+#include "tensorflow/core/framework/reader_base.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/reader_interface.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 
 namespace tensorflow {
 
+class ReaderBaseState;
+
 // Default implementation of ReaderInterface.
 class ReaderBase : public ReaderInterface {
  public:
diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h
index d79468afa122495ca7aac12ae52afa39b03dff6b..973ad4544a290f51ae42c641e47508132631160c 100644
--- a/tensorflow/core/framework/register_types.h
+++ b/tensorflow/core/framework/register_types.h
@@ -18,7 +18,7 @@ limitations under the License.
 // This file is used by cuda code and must remain compilable by nvcc.
 
 #include "tensorflow/core/framework/numeric_types.h"
-#include "tensorflow/core/framework/resource_handle.pb.h"
+#include "tensorflow/core/framework/resource_handle.h"
 #include "tensorflow/core/platform/types.h"
 
 // Two sets of macros:
@@ -167,10 +167,15 @@ limitations under the License.
 // Call "m" on POD and string types.
 #define TF_CALL_POD_STRING_TYPES(m) TF_CALL_POD_TYPES(m) TF_CALL_string(m)
 
-// Call "m" on all types supported on GPU.
+// Call "m" on all number types supported on GPU.
 #define TF_CALL_GPU_NUMBER_TYPES(m) \
   TF_CALL_half(m) TF_CALL_float(m) TF_CALL_double(m)
 
+// Call "m" on all types supported on GPU.
+#define TF_CALL_GPU_ALL_TYPES(m) \
+  TF_CALL_GPU_NUMBER_TYPES(m)    \
+  TF_CALL_bool(m) TF_CALL_complex64(m) TF_CALL_complex128(m)
+
 #define TF_CALL_GPU_NUMBER_TYPES_NO_HALF(m) TF_CALL_float(m) TF_CALL_double(m)
 
 // Call "m" on all quantized types.
@@ -178,4 +183,18 @@ limitations under the License.
 #define TF_CALL_QUANTIZED_TYPES(m) \
   TF_CALL_qint8(m) TF_CALL_quint8(m) TF_CALL_qint32(m)
 
+#ifdef TENSORFLOW_SYCL_NO_DOUBLE
+#define TF_CALL_SYCL_double(m)
+#else  // TENSORFLOW_SYCL_NO_DOUBLE
+#define TF_CALL_SYCL_double(m) TF_CALL_double(m)
+#endif // TENSORFLOW_SYCL_NO_DOUBLE
+
+#ifdef __ANDROID_TYPES_SLIM__
+#define TF_CALL_SYCL_NUMBER_TYPES(m)  TF_CALL_float(m)
+#else  // __ANDROID_TYPES_SLIM__
+#define TF_CALL_SYCL_NUMBER_TYPES(m)    \
+    TF_CALL_float(m)                    \
+    TF_CALL_SYCL_double(m)
+#endif // __ANDROID_TYPES_SLIM__
+
 #endif  // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_H_
diff --git a/tensorflow/core/framework/rendezvous.cc b/tensorflow/core/framework/rendezvous.cc
index 6e578cdbab41be9fce58fb680295a312b63d647a..90426defa0834ea8d8c02e8fc6d52ec12f00a5b9 100644
--- a/tensorflow/core/framework/rendezvous.cc
+++ b/tensorflow/core/framework/rendezvous.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/rendezvous.h"
 
+#include <deque>
 #include <functional>
 #include <utility>
 #include <vector>
@@ -147,76 +148,48 @@ Status Rendezvous::Recv(const ParsedKey& key, const Args& args, Tensor* val,
 
 class LocalRendezvousImpl : public Rendezvous {
  public:
-  explicit LocalRendezvousImpl(bool tolerate_dup_recv)
-      : tolerate_dup_recv_(tolerate_dup_recv) {}
+  explicit LocalRendezvousImpl() {}
 
   Status Send(const ParsedKey& key, const Args& send_args, const Tensor& val,
               const bool is_dead) override {
-    DoneCallback waiter = nullptr;
-    Args recv_args;
     uint64 key_hash = KeyHash(key.FullKey());
     VLOG(2) << "Send " << this << " " << key_hash << " " << key.FullKey();
-    {
-      mutex_lock l(mu_);
-      if (!status_.ok()) {
-        return status_;
-      }
-      Item* item = nullptr;
-      Table::iterator iter = table_.find(key_hash);
-      if (iter == table_.end()) {
-        // There is no waiter for this message. Insert the message
-        // into the waiters table. The waiter will pick it up when
-        // arrives.
-        item = new Item;
-        item->waiter = nullptr;
-        item->value = val;
-        item->is_dead = is_dead;
-        if (send_args.device_context) {
-          send_args.device_context->Ref();
-          item->send_dev_context = send_args.device_context;
-        }
-        item->recv_dev_context = nullptr;
-
-        // The allocator attributes of item->value.
-        item->send_alloc_attrs = send_args.alloc_attrs;
 
-        CHECK(table_.insert({key_hash, item}).second);
-        return Status::OK();
-      } else {
-        item = iter->second;
+    mu_.lock();
+    if (!status_.ok()) {
+      // Rendezvous has been aborted.
+      Status s = status_;
+      mu_.unlock();
+      return s;
+    }
 
-        if (item->waiter == nullptr) {
-          // There is already a message in the table under the key.
-          // Should not happen unless it has a waiter.
-          return errors::Aborted("Duplicated send: ", key.FullKey());
-        }
-        // Mark item as complete.
-        item->has_been_recvd = true;
-
-        // Get item->waiter function into waiter and set item->waiter to null
-        std::swap(item->waiter, waiter);
-        DCHECK(item->waiter == nullptr);
-        DCHECK(waiter != nullptr);
-
-        // The ref on recv_dev_context transfers below.
-        recv_args.device_context = item->recv_dev_context;
-        recv_args.alloc_attrs = item->recv_alloc_attrs;
-        item->recv_dev_context = nullptr;
-        if (tolerate_dup_recv_) {
-          item->value = val;
-          item->is_dead = is_dead;
-          if (send_args.device_context) {
-            send_args.device_context->Ref();
-            item->send_dev_context = send_args.device_context;
-          }
-          item->send_alloc_attrs = send_args.alloc_attrs;
-        }
+    ItemQueue* queue = &table_[key_hash];
+    if (queue->empty() || queue->front()->IsSendValue()) {
+      // There is no waiter for this message. Append the message
+      // into the queue. The waiter will pick it up when arrives.
+      // Only send-related fields need to be filled.
+      Item* item = new Item;
+      item->value = val;
+      item->is_dead = is_dead;
+      item->send_args = send_args;
+      if (item->send_args.device_context) {
+        item->send_args.device_context->Ref();
       }
-    }  // mutex
-    // Notify the waiter by invoking its done closure, outside scope
-    // of the table lock.
-    waiter(Status::OK(), send_args, recv_args, val, is_dead);
-    if (recv_args.device_context) recv_args.device_context->Unref();
+      queue->push_back(item);
+      mu_.unlock();
+      return Status::OK();
+    }
+
+    // There is an earliest waiter to consume this message.
+    Item* item = queue->front();
+    queue->pop_front();
+    mu_.unlock();
+
+    // Notify the waiter by invoking its done closure, outside the
+    // lock.
+    DCHECK(!item->IsSendValue());
+    item->waiter(Status::OK(), send_args, item->recv_args, val, is_dead);
+    delete item;
     return Status::OK();
   }
 
@@ -224,6 +197,7 @@ class LocalRendezvousImpl : public Rendezvous {
                  DoneCallback done) override {
     uint64 key_hash = KeyHash(key.FullKey());
     VLOG(2) << "Recv " << this << " " << key_hash << " " << key.FullKey();
+
     mu_.lock();
     if (!status_.ok()) {
       // Rendezvous has been aborted.
@@ -232,124 +206,102 @@ class LocalRendezvousImpl : public Rendezvous {
       done(s, Args(), recv_args, Tensor(), false);
       return;
     }
-    Table::iterator iter = table_.find(key_hash);
-    if (iter != table_.end()) {
-      Item* item = iter->second;
-      if (item->has_been_recvd && !tolerate_dup_recv_) {
-        mu_.unlock();
-        done(errors::Aborted("Duplicated recv: ", key.FullKey()), Args(),
-             recv_args, Tensor(), false);
-      } else if (item->waiter == nullptr || tolerate_dup_recv_) {
-        // A message has already arrived and is stored in the table
-        // under this key.  Consumes the message and invokes the done
-        // closure.
-        Tensor v = item->value;
-        if (!tolerate_dup_recv_) {
-          item->value = Tensor();
-        }
-        item->has_been_recvd = true;
-        // Before dropping the table lock, capture the item values.
-        // DeviceContext is only non-null for non-CPU devices.
-        // If we capture the send_dev_context, we need to hold a ref on
-        // it.  Our caller will have a ref on the recv_dev_context,
-        // which is not in our table.
-        DeviceContext* send_dev_context = item->send_dev_context;
-        if (send_dev_context) send_dev_context->Ref();
-        bool is_dead = item->is_dead;
-        Args send_args;
-        send_args.device_context = item->send_dev_context;
-        send_args.alloc_attrs = item->send_alloc_attrs;
-        mu_.unlock();
-        done(Status::OK(), send_args, recv_args, v, is_dead);
-        if (send_dev_context) send_dev_context->Unref();
-      } else {
-        // Already have a waiter in the waiters table under this key,
-        // which should not happen.
-        mu_.unlock();
-        done(errors::Aborted("Duplicated recv: ", key.FullKey()), Args(),
-             recv_args, Tensor(), false);
+
+    ItemQueue* queue = &table_[key_hash];
+    if (queue->empty() || !queue->front()->IsSendValue()) {
+      // There is no message to pick up.
+      // Only recv-related fileds need to be filled.
+      Item* item = new Item;
+      item->waiter = std::move(done);
+      item->recv_args = recv_args;
+      if (item->recv_args.device_context) {
+        item->recv_args.device_context->Ref();
       }
+      queue->push_back(item);
+      mu_.unlock();
       return;
     }
-    // Waiting for a message that has not arrived yet. Insert into the
-    // waiting table. The done closure will be invoked when the
-    // message arrives.
-    Item* item = new Item;
-    item->waiter = std::move(done);
-    item->recv_alloc_attrs = recv_args.alloc_attrs;
-    if (recv_args.device_context) {
-      item->recv_dev_context = recv_args.device_context;
-      item->recv_dev_context->Ref();
-    }
-    CHECK(table_.insert({key_hash, item}).second);
+
+    // A message has already arrived and is queued in the table under
+    // this key.  Consumes the message and invokes the done closure.
+    Item* item = queue->front();
+    queue->pop_front();
     mu_.unlock();
+
+    // Invokes the done() by invoking its done closure, outside scope
+    // of the table lock.
+    DCHECK(item->IsSendValue());
+    done(Status::OK(), item->send_args, recv_args, item->value, item->is_dead);
+    delete item;
   }
 
   void StartAbort(const Status& status) override {
     CHECK(!status.ok());
-    std::vector<Item*> items;
+    Table table;
     {
       mutex_lock l(mu_);
-      if (!status_.ok()) return;
-      status_ = status;
-      items.reserve(table_.size());
-      for (const auto& p : table_) items.push_back(p.second);
-      table_.clear();
+      status_.Update(status);
+      table_.swap(table);
     }
-    for (Item* item : items) {
-      if (item->waiter != nullptr) {
-        item->waiter(status, Args(), Args(), Tensor(), false);
+    for (auto& p : table) {
+      for (Item* item : p.second) {
+        if (!item->IsSendValue()) {
+          item->waiter(status, Args(), Args(), Tensor(), false);
+        }
+        delete item;
       }
-      delete item;
     }
   }
 
  private:
   typedef LocalRendezvousImpl ME;
-  const bool tolerate_dup_recv_;
 
   struct Item {
     DoneCallback waiter = nullptr;
     Tensor value;
     bool is_dead = false;
-    bool has_been_recvd = false;
-    DeviceContext* send_dev_context = nullptr;
-    DeviceContext* recv_dev_context = nullptr;
-    AllocatorAttributes send_alloc_attrs;
-    AllocatorAttributes recv_alloc_attrs;
+    Args send_args;
+    Args recv_args;
 
     ~Item() {
-      if (send_dev_context) {
-        send_dev_context->Unref();
+      if (send_args.device_context) {
+        send_args.device_context->Unref();
       }
-      if (recv_dev_context) {
-        recv_dev_context->Unref();
+      if (recv_args.device_context) {
+        recv_args.device_context->Unref();
       }
     }
+
+    // Returns true iff this item represents a value being sent.
+    bool IsSendValue() const { return this->waiter == nullptr; }
   };
+
   // We key the hash table by KeyHash of the Rendezvous::CreateKey string
   static uint64 KeyHash(const StringPiece& k) {
     return Hash64(k.data(), k.size());
   }
 
-  typedef gtl::FlatMap<uint64, Item*> Table;
+  // By invariant, the item queue under each key is of the form
+  //   [item.IsSendValue()]* meaning each item is a sent message.
+  // or
+  //   [!item.IsSendValue()]* meaning each item is a waiter.
+  //
+  // TODO(zhifengc): consider a better queue impl than std::deque.
+  typedef std::deque<Item*> ItemQueue;
+  typedef gtl::FlatMap<uint64, ItemQueue> Table;
 
   // TODO(zhifengc): shard table_.
   mutex mu_;
   Table table_ GUARDED_BY(mu_);
-  Status status_;
+  Status status_ GUARDED_BY(mu_);
 
   ~LocalRendezvousImpl() override {
-    for (auto i : table_) {
-      delete i.second;
-    }
+    StartAbort(errors::Cancelled("LocalRendezvousImpl deleted"));
   }
 
   TF_DISALLOW_COPY_AND_ASSIGN(LocalRendezvousImpl);
 };
 
-Rendezvous* NewLocalRendezvous(bool tolerate_dup_recv) {
-  return new LocalRendezvousImpl(tolerate_dup_recv);
-}
+Rendezvous* NewLocalRendezvous() { return new LocalRendezvousImpl(); }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/framework/rendezvous.h b/tensorflow/core/framework/rendezvous.h
index c9bfe4f0175be7c4dd2ab4fb75e8b11433594fa8..01e43e44e3f71503015557e3f6b30457d89a54b8 100644
--- a/tensorflow/core/framework/rendezvous.h
+++ b/tensorflow/core/framework/rendezvous.h
@@ -27,16 +27,22 @@ limitations under the License.
 
 namespace tensorflow {
 
-// A Rendezvous is an abstraction for passing a Tensor
-// from a producer to a consumer, where the consumer may safely
-// request the Tensor before or after it has been produced.  A
-// producer never blocks when using a Rendezvous.  A consumer has the
-// choice of making a blocking call or providing a callback: in either
-// case, the consumer receives the Tensor as soon as it is available.
+// A Rendezvous is an abstraction for passing tensors from producers
+// to consumers. A rendezvous is a table of channels. Each channel is
+// keyed by a rendezvous key. The key encodes a pair of <producer,
+// consumer>, where the producer and the consumer are tensorflow
+// devices.
 //
-// A Rendezvous key encodes a single <producer, consumer> pair.  It is
-// an error to call Send() or Recv*() more than once with the same
-// key.
+// The producer calls the Send() method to send one tensor over one
+// named channel. The consumer calls the Recv() method to receive one
+// tensor from a named channel. A sequence of tensors can be passed
+// from the producer to the consumer.  The consumer receives them in
+// the order as the producer sends them.
+//
+// A consumer may safely request the tensor before or after it has
+// been produced.  A consumer has the choice of making a blocking call
+// or providing a callback: in either case, the consumer receives the
+// Tensor as soon as it is available.  A producer never blocks.
 class Rendezvous : public core::RefCounted {
  public:
   struct Args {
@@ -121,12 +127,7 @@ class Rendezvous : public core::RefCounted {
 // Returns a Rendezvous instance that is limited to use only by
 // producers and consumers in the local process.  The caller assumes
 // ownership of one Ref() on the returned object.
-//
-// If "tolerate_dup_recv" is true, then the Rendezvous will retain
-// already Recv'd values and make them available to duplicate Recv
-// calls.  This may be useful if the RPC layer is not reliable, but
-// comes at the cost of higher memory consumption.
-Rendezvous* NewLocalRendezvous(bool tolerate_dup_recv = false);
+Rendezvous* NewLocalRendezvous();
 
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/framework/rendezvous_test.cc b/tensorflow/core/framework/rendezvous_test.cc
index 663c449dbedb4cb2fbbd736df1ef3a2427b67fe4..b2fe144de6dbbe4a06accf27777f807799b05197 100644
--- a/tensorflow/core/framework/rendezvous_test.cc
+++ b/tensorflow/core/framework/rendezvous_test.cc
@@ -108,14 +108,19 @@ Rendezvous::ParsedKey MakeKey(const string& name) {
   return k;
 }
 
-Rendezvous::ParsedKey KeyFoo() { return MakeKey("foo"); }
-Rendezvous::ParsedKey KeyBar() { return MakeKey("bar"); }
+const Rendezvous::ParsedKey& KeyFoo() {
+  static auto key = MakeKey("foo");
+  return key;
+}
+
+const Rendezvous::ParsedKey& KeyBar() {
+  static auto key = MakeKey("bar");
+  return key;
+}
 
 TEST_F(LocalRendezvousTest, SendRecv) {
   Rendezvous::Args args;
   TF_ASSERT_OK(rendez_->Send(KeyFoo(), args, V("hello"), false));
-  EXPECT_TRUE(
-      errors::IsAborted(rendez_->Send(KeyFoo(), args, V("hello"), false)));
   Tensor val(DT_STRING);
   bool is_dead = false;
   TF_ASSERT_OK(rendez_->Recv(KeyFoo(), args, &val, &is_dead));
@@ -135,26 +140,7 @@ TEST_F(LocalRendezvousTest, RecvSend) {
   EXPECT_EQ("hello", V(val));
 }
 
-TEST_F(LocalRendezvousTest, DuplicateWaiterRecv) {
-  SchedClosure([this]() {
-    Tensor t(DT_STRING);
-    bool is_dead = false;
-    Rendezvous::Args args;
-    TF_ASSERT_OK(rendez_->Recv(KeyFoo(), args, &t, &is_dead));
-    TF_ASSERT_OK(rendez_->Send(KeyBar(), args, t, is_dead));
-  });
-  Env::Default()->SleepForMicroseconds(1000000);
-  Tensor val(DT_STRING);
-  bool val_dead = false;
-  Rendezvous::Args args;
-  EXPECT_TRUE(
-      errors::IsAborted(rendez_->Recv(KeyFoo(), args, &val, &val_dead)));
-  TF_ASSERT_OK(rendez_->Send(KeyFoo(), args, V("secret msg"), val_dead));
-  TF_ASSERT_OK(rendez_->Recv(KeyBar(), args, &val, &val_dead));
-  EXPECT_EQ("secret msg", V(val));
-}
-
-TEST_F(LocalRendezvousTest, DuplicateSerialRecv) {
+TEST_F(LocalRendezvousTest, PingPong) {
   SchedClosure([this]() {
     Tensor t(DT_STRING);
     bool is_dead = false;
@@ -169,8 +155,6 @@ TEST_F(LocalRendezvousTest, DuplicateSerialRecv) {
   TF_ASSERT_OK(rendez_->Send(KeyFoo(), args, V("secret msg"), val_dead));
   TF_ASSERT_OK(rendez_->Recv(KeyBar(), args, &val, &val_dead));
   EXPECT_EQ("secret msg", V(val));
-  EXPECT_TRUE(
-      errors::IsAborted(rendez_->Recv(KeyFoo(), args, &val, &val_dead)));
 }
 
 // A simple structure that behaves a bit like a blocking counter.  The
@@ -178,32 +162,33 @@ TEST_F(LocalRendezvousTest, DuplicateSerialRecv) {
 // thread waits for done to be notified.
 struct BlockingState {
   mutex lock;
-  int counter;
+  int counter = 0;
   Notification done;
 };
 
 TEST_F(LocalRendezvousTest, RandomSendRecv) {
-  static const int N = 1000;
+  // We are scheduling 2*N closures in the this->threads_, which is
+  // configured with only 16 threads. Furthermore, because the
+  // threadpool may execute the closures in an arbitrary order, we
+  // must use RecvAsync below. Otherwise, blocking Recv() may run
+  // before all all the Send() and deadlock.
+  static const int N = 100;
+  random::PhiloxRandom philox(testing::RandomSeed(), 17);
+  random::SimplePhilox rnd(&philox);
   BlockingState state;
   state.counter = N;
   for (int i = 0; i < N; ++i) {
-    SchedClosure([this, i]() {
-      random::PhiloxRandom philox(testing::RandomSeed() + i, 17);
-      random::SimplePhilox rnd(&philox);
-      Env::Default()->SleepForMicroseconds(1000 + rnd.Uniform(10000));
+    int micros = 100 + rnd.Uniform(1000);
+    SchedClosure([this, i, micros]() {
+      Env::Default()->SleepForMicroseconds(micros);
       Rendezvous::Args args;
       TF_ASSERT_OK(rendez_->Send(MakeKey(strings::StrCat(i)), args,
                                  V(strings::StrCat(i)), false));
     });
-    SchedClosure([this, &state, i]() {
-      random::PhiloxRandom philox(testing::RandomSeed() + N + i, 17);
-      random::SimplePhilox rnd(&philox);
-      Env::Default()->SleepForMicroseconds(1000 + rnd.Uniform(10000));
-      Tensor val(DT_STRING);
-      bool val_dead = false;
-      Rendezvous::Args args;
-      TF_ASSERT_OK(
-          rendez_->Recv(MakeKey(strings::StrCat(i)), args, &val, &val_dead));
+    auto recv_done = [this, &state, i](const Status& status,
+                                       const Rendezvous::Args& sender_args,
+                                       const Rendezvous::Args& recver_args,
+                                       const Tensor& val, const bool val_dead) {
       EXPECT_EQ(strings::StrCat(i), V(val));
       bool done = false;
       {
@@ -216,12 +201,42 @@ TEST_F(LocalRendezvousTest, RandomSendRecv) {
       if (done) {
         state.done.Notify();
       }
+    };
+    micros = 100 + rnd.Uniform(1000);
+    SchedClosure([this, i, micros, recv_done]() {
+      Env::Default()->SleepForMicroseconds(micros);
+      rendez_->RecvAsync(MakeKey(strings::StrCat(i)), Rendezvous::Args(),
+                         recv_done);
     });
   }
 
   state.done.WaitForNotification();
 }
 
+static void RandomSleep() {
+  if (std::rand() % 10 == 0) {
+    Env::Default()->SleepForMicroseconds(1000);
+  }
+}
+
+TEST_F(LocalRendezvousTest, MultiSends) {
+  static const int N = 100;
+  const auto& key_foo = KeyFoo();
+  Rendezvous::Args args;
+  SchedClosure([=]() {
+    for (int i = 0; i < N; ++i) {
+      TF_ASSERT_OK(rendez_->Send(key_foo, args, V(strings::StrCat(i)), false));
+      RandomSleep();
+    }
+  });
+  Tensor val;
+  bool val_dead;
+  for (int i = 0; i < N; ++i) {
+    TF_ASSERT_OK(rendez_->Recv(key_foo, args, &val, &val_dead));
+    RandomSleep();
+  }
+}
+
 TEST_F(LocalRendezvousTest, RecvAbort) {
   rendez_->Ref();
   SchedClosure([this]() {
@@ -280,15 +295,15 @@ TEST_F(LocalRendezvousTest, TransferDummyDeviceContext) {
   Notification n;
   Rendezvous::Args args1;
   args1.device_context = new DummyDeviceContext(1);
-  rendez_->RecvAsync(KeyFoo(), args1, [&n](const Status& s,
-                                           const Rendezvous::Args& send_args,
-                                           const Rendezvous::Args& recv_args,
-                                           const Tensor& val, bool is_dead) {
-    CHECK_EQ(123,
-             dynamic_cast<const DummyDeviceContext*>(send_args.device_context)
-                 ->stream_id());
-    n.Notify();
-  });
+  rendez_->RecvAsync(
+      KeyFoo(), args1,
+      [&n](const Status& s, const Rendezvous::Args& send_args,
+           const Rendezvous::Args& recv_args, const Tensor& val, bool is_dead) {
+        CHECK_EQ(123, dynamic_cast<const DummyDeviceContext*>(
+                          send_args.device_context)
+                          ->stream_id());
+        n.Notify();
+      });
 
   n.WaitForNotification();
   args.device_context->Unref();
@@ -304,8 +319,8 @@ static void BM_SendRecv(int iters) {
   Status s;
   if (iters > 0) {
     while (iters--) {
-      s = rendez->Send(KeyFoo(), args, orig, is_dead);
-      s = rendez->Recv(KeyFoo(), args, &val, &is_dead);
+      TF_CHECK_OK(rendez->Send(KeyFoo(), args, orig, is_dead));
+      TF_CHECK_OK(rendez->Recv(KeyFoo(), args, &val, &is_dead));
     }
     CHECK_EQ(V(val), V(orig));
   }
@@ -313,12 +328,13 @@ static void BM_SendRecv(int iters) {
 }
 BENCHMARK(BM_SendRecv);
 
-static void BM_RecvSend(int iters) {
+static void BM_PingPong(int iters) {
+  CHECK_GT(iters, 0);
   thread::ThreadPool* pool = new thread::ThreadPool(Env::Default(), "test", 1);
 
-  // The main thread sends "foo" for iters/2 times and receives "bar"
-  // for iters/2 times.  The other thread sends "bar" for iters/2
-  // times and receives "foo" for iters/2 times.
+  // The main thread sends "foo" for iters times and receives "bar"
+  // for iters times.  The other thread sends "bar" for iters times
+  // and receives "foo" for iters times.
   Rendezvous* rendez = NewLocalRendezvous();
   pool->Schedule([rendez, iters]() {
     Tensor bar = V("bar");
@@ -326,9 +342,9 @@ static void BM_RecvSend(int iters) {
     bool is_dead = false;
     Rendezvous::Args args;
     Status s;
-    for (int i = 0; i < iters / 2; ++i) {
-      s = rendez->Recv(KeyFoo(), args, &foo, &is_dead);
-      s = rendez->Send(KeyBar(), args, bar, is_dead);
+    for (int i = 0; i < iters; ++i) {
+      TF_CHECK_OK(rendez->Recv(KeyFoo(), args, &foo, &is_dead));
+      TF_CHECK_OK(rendez->Send(KeyBar(), args, bar, is_dead));
     }
     CHECK_EQ("foo", V(foo));
   });
@@ -337,13 +353,13 @@ static void BM_RecvSend(int iters) {
   bool is_dead = false;
   Rendezvous::Args args;
   Status s;
-  for (int i = 0; i < iters / 2; ++i) {
-    s = rendez->Send(KeyFoo(), args, foo, is_dead);
-    s = rendez->Recv(KeyBar(), args, &bar, &is_dead);
+  for (int i = 0; i < iters; ++i) {
+    TF_CHECK_OK(rendez->Send(KeyFoo(), args, foo, is_dead));
+    TF_CHECK_OK(rendez->Recv(KeyBar(), args, &bar, &is_dead));
   }
   CHECK_EQ("bar", V(bar));
   delete pool;
 }
-BENCHMARK(BM_RecvSend);
+BENCHMARK(BM_PingPong);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/resource_handle.cc b/tensorflow/core/framework/resource_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..39ef82765f5deda70d68c4ccab77ac7258a3a4e6
--- /dev/null
+++ b/tensorflow/core/framework/resource_handle.cc
@@ -0,0 +1,69 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/resource_handle.h"
+#include "tensorflow/core/framework/resource_handle.pb.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace tensorflow {
+
+ResourceHandle::ResourceHandle() {}
+
+ResourceHandle::ResourceHandle(const ResourceHandleProto& proto) {
+  FromProto(proto);
+}
+
+ResourceHandle::~ResourceHandle() {}
+
+void ResourceHandle::AsProto(ResourceHandleProto* proto) const {
+  proto->set_device(device());
+  proto->set_container(container());
+  proto->set_name(name());
+  proto->set_hash_code(hash_code());
+  proto->set_maybe_type_name(maybe_type_name());
+}
+
+void ResourceHandle::FromProto(const ResourceHandleProto& proto) {
+  set_device(proto.device());
+  set_container(proto.container());
+  set_name(proto.name());
+  set_hash_code(proto.hash_code());
+  set_maybe_type_name(proto.maybe_type_name());
+}
+
+string ResourceHandle::SerializeAsString() const {
+  ResourceHandleProto proto;
+  AsProto(&proto);
+  return proto.SerializeAsString();
+}
+
+bool ResourceHandle::ParseFromString(const string& s) {
+  ResourceHandleProto proto;
+  const bool status = proto.ParseFromString(s);
+  if (status) FromProto(proto);
+  return status;
+}
+
+string ResourceHandle::DebugString() const {
+  return strings::StrCat("device: ", device(), " container: ", container(),
+                         " name: ", name(), " hash_code: ", hash_code(),
+                         " maybe_type_name: ", maybe_type_name());
+}
+
+string ProtoDebugString(const ResourceHandle& handle) {
+  return handle.DebugString();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/resource_handle.h b/tensorflow/core/framework/resource_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..06df1b9046da2c99ef30b1a69763f4e18e404f94
--- /dev/null
+++ b/tensorflow/core/framework/resource_handle.h
@@ -0,0 +1,82 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_FRAMEWORK_RESOURCE_HANDLE_H_
+#define TENSORFLOW_FRAMEWORK_RESOURCE_HANDLE_H_
+
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+class ResourceHandleProto;
+
+// Class representing a handle to a tensorflow resource. Handles are
+// not valid across executions, but can be serialized back and forth from within
+// a single run.
+//
+// This is the native C++ class equivalent of ResourceHandleProto.  They are
+// separate so that kernels do not need to depend on protos.
+class ResourceHandle {
+ public:
+  ResourceHandle();
+  ResourceHandle(const ResourceHandleProto& proto);
+  ~ResourceHandle();
+
+  // Unique name for the device containing the resource.
+  const string& device() const { return device_; }
+  void set_device(const string& device) { device_ = device; }
+
+  // Container in which this resource is placed.
+  const string& container() const { return container_; }
+  void set_container(const string& container) { container_ = container; }
+
+  // Unique name of this resource.
+  const string& name() const { return name_; }
+  void set_name(const string& name) { name_ = name; }
+
+  // Hash code for the type of the resource. Is only valid in the same device
+  // and in the same execution.
+  uint64 hash_code() const { return hash_code_; }
+  void set_hash_code(uint64 hash_code) { hash_code_ = hash_code; }
+
+  // For debug-only, the name of the type pointed to by this handle, if
+  // available.
+  const string& maybe_type_name() const { return maybe_type_name_; }
+  void set_maybe_type_name(const string& value) { maybe_type_name_ = value; }
+
+  // Conversion to and from ResourceHandleProto
+  void AsProto(ResourceHandleProto* proto) const;
+  void FromProto(const ResourceHandleProto& proto);
+
+  // Serialization via ResourceHandleProto
+  string SerializeAsString() const;
+  bool ParseFromString(const string& s);
+
+  string DebugString() const;
+
+ public:
+  string device_;
+  string container_;
+  string name_;
+  uint64 hash_code_ = 0;
+  string maybe_type_name_;
+};
+
+// For backwards compatibility for when this was a proto
+string ProtoDebugString(const ResourceHandle& handle);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_FRAMEWORK_RESOURCE_HANDLE_H_
diff --git a/tensorflow/core/framework/resource_handle.proto b/tensorflow/core/framework/resource_handle.proto
index f9f19ca5b49363a72abcbd3bd339b31ed82470eb..b1921337f5fd0b434e256ae85c6baffe95df286a 100644
--- a/tensorflow/core/framework/resource_handle.proto
+++ b/tensorflow/core/framework/resource_handle.proto
@@ -2,14 +2,14 @@ syntax = "proto3";
 
 package tensorflow;
 option cc_enable_arenas = true;
-option java_outer_classname = "ResourceHandleProto";
+option java_outer_classname = "ResourceHandle";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
 
 // Protocol buffer representing a handle to a tensorflow resource. Handles are
 // not valid across executions, but can be serialized back and forth from within
 // a single run.
-message ResourceHandle {
+message ResourceHandleProto {
   // Unique name for the device containing the resource.
   string device = 1;
 
diff --git a/tensorflow/core/framework/resource_mgr.cc b/tensorflow/core/framework/resource_mgr.cc
index 4365a861e52ea7d36c1eff8e051198b01f0bc193..bc3ba914e095be3eae447cb782ddefe8f3909fd8 100644
--- a/tensorflow/core/framework/resource_mgr.cc
+++ b/tensorflow/core/framework/resource_mgr.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/framework/resource_mgr.h"
 
+#include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h
index 0e1a5a82d3fa4b5d96dfd0bb899c12d65fa87574..c0128cc77da893084e1cf8e0ae556dcce03a3f78 100644
--- a/tensorflow/core/framework/resource_mgr.h
+++ b/tensorflow/core/framework/resource_mgr.h
@@ -22,8 +22,9 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_handle.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -491,9 +492,16 @@ template <typename T>
 void IsResourceInitialized<T>::Compute(OpKernelContext* ctx) {
   Tensor* output;
   OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {}, &output));
-  T* unused;
-  output->flat<bool>()(0) =
-      LookupResource(ctx, HandleFromInput(ctx, 0), &unused).ok();
+  T* object;
+  bool found;
+  if (LookupResource(ctx, HandleFromInput(ctx, 0), &object).ok()) {
+    found = true;
+    object->Unref();
+  } else {
+    found = false;
+  }
+
+  output->flat<bool>()(0) = found;
 }
 
 template <typename T>
diff --git a/tensorflow/core/framework/resource_mgr_test.cc b/tensorflow/core/framework/resource_mgr_test.cc
index cc7613b97d5f3286cd1bc04c494291fba2daa5db..07272e2374cbf4fb46c5b8da5df73ef4d6858c62 100644
--- a/tensorflow/core/framework/resource_mgr_test.cc
+++ b/tensorflow/core/framework/resource_mgr_test.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/framework/resource_mgr.h"
 
+#include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
diff --git a/tensorflow/core/framework/selective_registration.h b/tensorflow/core/framework/selective_registration.h
index af86092ac7314922106a80024b8bef53802fbffc..503947969d3fd330fcbfcedd605abf193922fb54 100644
--- a/tensorflow/core/framework/selective_registration.h
+++ b/tensorflow/core/framework/selective_registration.h
@@ -31,8 +31,8 @@ limitations under the License.
 //    functions should be defined as valid constexpr functions, so that they are
 //    evaluated at compile time: this is needed to make symbols referenced by
 //    un-registered objects unused, and therefore allow the linker to strip them
-//    out.  See tools/print_required_ops/print_selective_registration_header.py
-//    for a tool that can be used to generate ops_to_register.h.
+//    out.  See python/tools/print_selective_registration_header.py for a tool
+//    that can be used to generate ops_to_register.h.
 //
 // ops_to_register.h should define macros for:
 //   // Ops for which this is false will not be registered.
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index 1f9e98551f14de7a9b7ccf4f593b2a9772a036e0..6947f680021a75891491ce53b166db73ec8d57a4 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/node_def.pb_text.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -79,6 +80,58 @@ InferenceContext::InferenceContext(
   PostInputInit(std::move(handle_data));
 }
 
+// Same as above, but with PartialTensorShape instead of TensorShapeProto
+InferenceContext::InferenceContext(
+    int graph_def_version, const NodeDef* node_def, const OpDef& op_def,
+    const std::vector<PartialTensorShape>& input_shapes,
+    const std::vector<const Tensor*>& input_tensors,
+    const std::vector<PartialTensorShape>& input_tensors_as_shapes,
+    const std::vector<
+        std::unique_ptr<std::vector<std::pair<PartialTensorShape, DataType>>>>&
+        input_handle_shapes_and_types)
+    : graph_def_version_(graph_def_version),
+      node_def_(*CHECK_NOTNULL(node_def)) {
+  std::vector<ShapeHandle> input_tensors_as_shape_handles;
+  for (const PartialTensorShape& p : input_tensors_as_shapes) {
+    ShapeHandle shape;
+    construction_status_.Update(MakeShapeFromPartialTensorShape(p, &shape));
+    if (!construction_status_.ok()) {
+      return;
+    }
+    input_tensors_as_shape_handles.push_back(shape);
+  }
+  PreInputInit(op_def, input_tensors, input_tensors_as_shape_handles);
+  if (!construction_status_.ok()) return;
+  for (const PartialTensorShape& p : input_shapes) {
+    ShapeHandle shape;
+    construction_status_.Update(MakeShapeFromPartialTensorShape(p, &shape));
+    if (!construction_status_.ok()) {
+      return;
+    }
+    inputs_.push_back(shape);
+  }
+  std::vector<std::unique_ptr<std::vector<ShapeAndType>>> handle_data(
+      input_shapes.size());
+  for (int i = 0; i < input_handle_shapes_and_types.size(); ++i) {
+    const auto& v = input_handle_shapes_and_types[i];
+    if (v == nullptr) {
+      continue;
+    }
+    handle_data[i].reset(new std::vector<ShapeAndType>(v->size()));
+    auto& new_v = *handle_data[i];
+    for (int j = 0; j < v->size(); ++j) {
+      const auto& p = (*v)[j];
+      construction_status_.Update(
+          MakeShapeFromPartialTensorShape(p.first, &new_v[j].shape));
+      if (!construction_status_.ok()) {
+        return;
+      }
+      new_v[j].dtype = p.second;
+    }
+  }
+  PostInputInit(std::move(handle_data));
+}
+
 InferenceContext::InferenceContext(
     int graph_def_version, const NodeDef* node_def, const OpDef& op_def,
     const std::vector<ShapeHandle>& input_shapes,
@@ -97,6 +150,21 @@ InferenceContext::InferenceContext(
 
 InferenceContext::~InferenceContext() {}
 
+Status InferenceContext::Run(
+    const std::function<Status(shape_inference::InferenceContext* c)>& fn) {
+  Status s = fn(this);
+  if (!s.ok()) {
+    return AttachContext(s);
+  }
+#ifndef NDEBUG
+  for (int i = 0; i < num_outputs(); ++i) {
+    DCHECK(output(i).IsSet())
+        << i << " for " << node_def_.name() << " of type " << node_def_.op();
+  }
+#endif  // NDEBUG
+  return s;
+}
+
 Status InferenceContext::set_output(StringPiece output_name,
                                     const std::vector<ShapeHandle>& shapes) {
   const auto result = output_name_map_.find(output_name.ToString());
@@ -314,6 +382,19 @@ Status InferenceContext::WithValue(DimensionHandle dim, int64 value,
                                  existing);
 }
 
+void InferenceContext::Relax(DimensionHandle d0, DimensionHandle d1,
+                             DimensionHandle* out) {
+  if (d0.SameHandle(d1)) {
+    *out = d0;
+  } else if (!ValueKnown(d0) || !ValueKnown(d1)) {
+    *out = UnknownDim();
+  } else if (Value(d0) == Value(d1)) {
+    *out = d0;
+  } else {
+    *out = UnknownDim();
+  }
+}
+
 Status InferenceContext::Merge(DimensionHandle d0, DimensionHandle d1,
                                DimensionHandle* out) {
   if (d0.SameHandle(d1) || !ValueKnown(d1)) {
@@ -356,6 +437,48 @@ Status InferenceContext::MergePrefix(ShapeHandle s, ShapeHandle prefix,
   return Status::OK();
 }
 
+void InferenceContext::Relax(ShapeHandle s0, ShapeHandle s1, ShapeHandle* out) {
+  if (s0.SameHandle(s1)) {
+    *out = s0;
+    return;
+  } else if (!RankKnown(s0) || !RankKnown(s1)) {
+    *out = UnknownShape();
+    return;
+  }
+
+  const int32 rank = Rank(s0);
+  if (rank != Rank(s1)) {
+    *out = UnknownShape();
+    return;
+  }
+
+  bool return_s0 = true;
+  for (int i = 0; i < rank; ++i) {
+    auto d0 = Dim(s0, i);
+    auto d1 = Dim(s1, i);
+    if (d0.SameHandle(d1)) continue;
+
+    auto v0 = Value(d0);
+    auto v1 = Value(d1);
+    if (v0 == kUnknownDim || v1 == kUnknownDim || v0 != v1) {
+      return_s0 = false;
+      break;
+    }
+  }
+  if (return_s0) {
+    *out = s0;
+    return;
+  }
+
+  // Relax dims.
+  std::vector<DimensionHandle> dims(rank);
+  for (int i = 0; i < rank; ++i) {
+    // Invariant for relax was checked earlier, so CHECK is ok.
+    Relax(Dim(s0, i), Dim(s1, i), &dims[i]);
+  }
+  *out = MakeShape(dims);
+}
+
 Status InferenceContext::Merge(ShapeHandle s0, ShapeHandle s1,
                                ShapeHandle* out) {
   if (s0.SameHandle(s1) || !RankKnown(s1)) {
@@ -895,9 +1018,15 @@ bool InferenceContext::MergeHandleShapesAndTypes(
   bool refined = false;
   for (int i = 0; i < shapes_and_types.size(); ++i) {
     const ShapeAndType& existing = (*to_update)[i];
-    new_values[i].dtype = shapes_and_types[i].dtype;
-    if (new_values[i].dtype != existing.dtype && existing.dtype == DT_INVALID) {
-      refined = true;
+    if (shapes_and_types[i].dtype == existing.dtype) {
+      new_values[i].dtype = existing.dtype;
+    } else {
+      if (existing.dtype != DT_INVALID) {
+        return false;
+      } else {
+        new_values[i].dtype = shapes_and_types[i].dtype;
+        refined = true;
+      }
     }
     if (!Merge(existing.shape, shapes_and_types[i].shape, &new_values[i].shape)
              .ok()) {
@@ -939,6 +1068,62 @@ bool InferenceContext::MergeInputHandleShapesAndTypes(
                                    input_handle_shapes_and_types_[idx].get());
 }
 
+bool InferenceContext::RelaxHandleShapesAndMergeTypes(
+    const std::vector<ShapeAndType>& shapes_and_types,
+    std::vector<ShapeAndType>* to_update) {
+  if (shapes_and_types.size() != to_update->size()) {
+    return false;
+  }
+  std::vector<ShapeAndType> new_values(shapes_and_types.size());
+  bool refined = false;
+  for (int i = 0; i < shapes_and_types.size(); ++i) {
+    const ShapeAndType& existing = (*to_update)[i];
+    if (shapes_and_types[i].dtype == existing.dtype) {
+      new_values[i].dtype = existing.dtype;
+    } else {
+      if (existing.dtype != DT_INVALID) {
+        return false;
+      } else {
+        new_values[i].dtype = shapes_and_types[i].dtype;
+        refined = true;
+      }
+    }
+    Relax(existing.shape, shapes_and_types[i].shape, &new_values[i].shape);
+    if (!existing.shape.SameHandle(new_values[i].shape)) {
+      refined = true;
+    }
+  }
+  if (!refined) {
+    return false;
+  }
+  for (int i = 0; i < new_values.size(); ++i) {
+    (*to_update)[i] = new_values[i];
+  }
+  return true;
+}
+
+bool InferenceContext::RelaxOutputHandleShapesAndMergeTypes(
+    int idx, const std::vector<ShapeAndType>& shapes_and_types) {
+  if (output_handle_shapes_and_types_[idx] == nullptr) {
+    output_handle_shapes_and_types_[idx].reset(
+        new std::vector<ShapeAndType>(shapes_and_types));
+    return true;
+  }
+  return RelaxHandleShapesAndMergeTypes(
+      shapes_and_types, output_handle_shapes_and_types_[idx].get());
+}
+
+bool InferenceContext::RelaxInputHandleShapesAndMergeTypes(
+    int idx, const std::vector<ShapeAndType>& shapes_and_types) {
+  if (input_handle_shapes_and_types_[idx] == nullptr) {
+    input_handle_shapes_and_types_[idx].reset(
+        new std::vector<ShapeAndType>(shapes_and_types));
+    return true;
+  }
+  return RelaxHandleShapesAndMergeTypes(
+      shapes_and_types, input_handle_shapes_and_types_[idx].get());
+}
+
 // -----------------------------------------------------------------------------
 // ShapeManager
 // -----------------------------------------------------------------------------
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index 119bed4071fad6507d61d673270afb90c6b549f5..716cec5c4a52c65f5c9bb328c4d052ee100a0ecc 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <vector>
 
-#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -26,6 +26,13 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
+
+class ShapeRefinerTest;
+
+namespace grappler {
+class GraphProperties;
+}
+
 namespace shape_inference {
 
 struct DimensionOrConstant;
@@ -62,6 +69,7 @@ class DimensionHandle {
   friend class InferenceContext;
   friend class ShapeInferenceTest;
   friend class ShapeInferenceTestutil;
+  friend class ::tensorflow::ShapeRefinerTest;
   friend class ShapeManager;
 
   // Intentionally copyable.
@@ -98,6 +106,7 @@ class ShapeHandle {
   friend class InferenceContext;
   friend class ShapeInferenceTest;
   friend class ShapeInferenceTestutil;
+  friend class ::tensorflow::ShapeRefinerTest;
   friend class ShapeManager;
 
   // Intentionally copyable.
@@ -180,6 +189,26 @@ class InferenceContext {
           std::unique_ptr<std::vector<std::pair<TensorShapeProto, DataType>>>>&
           input_handle_shapes_and_types);
 
+  // <input_tensors> is NULL-padded to be the same size as <input_shapes>.
+  //
+  // Elements of <input_tensors_as_shapes> are used for when a shape
+  // function makes a call to MakeShapeFromShapeTensor; in particular, when
+  // the input_tensors[i] is nullptr but the shape represented by it is
+  // partially known from analysis of the graph. <input_tensors_as_shapes>
+  // can have fewer elements than <input_shapes>. Values of
+  // <input_tensors_as_shapes> do not need to outlive the context.
+  //
+  // REQUIRES: <node_def> is not NULL, and must outlive the
+  // InferenceContext.
+  InferenceContext(
+      int graph_def_version, const NodeDef* node_def, const OpDef& op_def,
+      const std::vector<PartialTensorShape>& input_shapes,
+      const std::vector<const Tensor*>& input_tensors,
+      const std::vector<PartialTensorShape>& input_tensors_as_shapes,
+      const std::vector<std::unique_ptr<
+          std::vector<std::pair<PartialTensorShape, DataType>>>>&
+          input_handle_shapes_and_types);
+
   ~InferenceContext();
 
   // Runs the shape inference function 'fn' with 'this' as the
@@ -187,24 +216,27 @@ class InferenceContext {
   //
   // On error, additional context is provided in the error message.
   Status Run(
-      const std::function<Status(shape_inference::InferenceContext* c)>& fn) {
-    Status s = fn(this);
-    if (!s.ok()) {
-      return AttachContext(s);
-    }
-#ifndef NDEBUG
-    for (int i = 0; i < num_outputs(); ++i) {
-      DCHECK(output(i).IsSet())
-          << i << " for " << node_def_.name() << " of type " << node_def_.op();
-    }
-#endif  // NDEBUG
-    return s;
-  }
+      const std::function<Status(shape_inference::InferenceContext* c)>& fn);
 
-  // Merge the stored shape of the input in position idx with the specified
-  // shape. This requires idx to be in the [0, num_inputs) range. If the merge
-  // is successful and the new shape differs from the old one, store the new
-  // shape and return true. Return false otherwise.
+  // Merge the stored shape of the input in position idx with <shape> according
+  // to the following rules:
+  //
+  // - If the ShapeHandles are the same or <shape> is unknown, there will be no
+  //   change. Otherwise if the stored shape is unknown, the new shape will be
+  //   <shape>.
+  // - If both shapes are known, then they must have the same rank.
+  // - For any one dimension, if the values for that dimension in both shapes
+  //   are known, then the values must match.
+  // - If one shape has equal or more information than the other shape in every
+  //   dimension, the shape with more information will be returned. Otherwise a
+  //   new shape holding the combined information of the input shapes will be
+  //   returned.
+  // - Example: merging [2,?] and [?,2] results in [2,2]
+  // - Example: [2,2] cannot be merged with [1,2]
+  //
+  // This requires idx to be in the [0, num_inputs) range. If the merge is
+  // successful and the new shape differs from the old one, store the new shape
+  // and return true. Return false otherwise.
   bool MergeInput(int idx, ShapeHandle shape) {
     ShapeHandle new_shape;
     if (!Merge(inputs_[idx], shape, &new_shape).ok() ||
@@ -214,6 +246,41 @@ class InferenceContext {
     inputs_[idx] = new_shape;
     return true;
   }
+  // Relax the stored shape of the input in position idx with <shape> according
+  // to the following rules:
+  //
+  // - If the ShapeHandles are the same then the stored shape will be returned.
+  // - If either of the ShapeHandles are unknown, then a new UnknownShape will
+  //   be returned. A new shape must be returned because we cannot claim that
+  //   the resulting shape is necessarily the same as either of the input
+  //   shapes.
+  // - If the shapes both have known ranks but their ranks are different, a new
+  //   UnknownShape will be returned.
+  // - For any one dimension, if the value for that dimension in either of the
+  //   shapes is unknown, a new shape will be returned with a new UnknownDim in
+  //   that dimension.
+  // - For any one dimension, if the values for that dimension in both shapes
+  //   are known but do not match, a new shape will be returned with a new
+  //   UnknownDim in that dimension.
+  // - If both shapes have the same known rank and match in every dimension,
+  //   the stored shape will be returned.
+  // - Example: relaxing [2,?] and [?,2] results in [?,?]
+  // - Example: relaxing [2,2] and [3,2] results in [?,2]
+  // - Example: relaxing [2,2] with [1,2,3] results in ?
+  //
+  // This requires idx to be in the [0, num_inputs) range. If the relax is
+  // successful and the new shape differs from the old one, store the new
+  // shape and return true. Return false otherwise.
+  bool RelaxInput(int idx, ShapeHandle shape) {
+    ShapeHandle new_shape;
+    Relax(inputs_[idx], shape, &new_shape);
+    if (inputs_[idx].SameHandle(new_shape)) {
+      return false;
+    }
+    inputs_[idx] = new_shape;
+    return true;
+  }
+
   ShapeHandle input(int64 idx) const { return inputs_[idx]; }
   Status input(StringPiece input_name, std::vector<ShapeHandle>* output) const;
   int num_inputs() const { return inputs_.size(); }
@@ -313,12 +380,9 @@ class InferenceContext {
   Status WithValue(DimensionHandle dim, int64 value,
                    DimensionHandle* out) TF_MUST_USE_RESULT;
 
-  // Merges <in0> and <in1> and returns the merged shape in <*out>. If <in0> and
-  // <in1> are incompatible in rank, or in the value of any dimension, returns
-  // an error.
-  //
-  // Note that <*out> may be set to <in0> or <in1>.
-  Status Merge(ShapeHandle in0, ShapeHandle in1,
+  // Merges <s0> and <s1> and returns the merged shape in <*out>. See
+  // 'MergeInput' function for full details and examples.
+  Status Merge(ShapeHandle s0, ShapeHandle s1,
                ShapeHandle* out) TF_MUST_USE_RESULT;
 
   // Asserts that <s>'s rank >= <prefix>'s rank, and the first
@@ -471,13 +535,34 @@ class InferenceContext {
   // If the merge is successful and any of the new shapes differs from the old
   // one, or any of the old dtypes was DT_INVALID, store the new shapes and
   // return true.  Return false otherwise.
+  //
+  // See 'MergeInput' function for full details and examples.
   bool MergeInputHandleShapesAndTypes(
       int idx,
       const std::vector<ShapeAndType>& shapes_and_types) TF_MUST_USE_RESULT;
 
   // As MergeInputHandleShapesAndTypes, but for an output.
   bool MergeOutputHandleShapesAndTypes(
-      int idx, const std::vector<ShapeAndType>& shapes) TF_MUST_USE_RESULT;
+      int idx,
+      const std::vector<ShapeAndType>& shapes_and_types) TF_MUST_USE_RESULT;
+
+  // Relaxes the stored shapes and types corresponding to the input handle in
+  // position idx with the specified shapes and types. This requires idx to be
+  // in the [0, num_inputs) range.
+  //
+  // If the relax is successful and any of the new shapes differs from the old
+  // one, or any of the old dtypes was DT_INVALID, store the new shapes and
+  // return true.  Return false otherwise.
+  //
+  // See 'RelaxInput' function for full details and examples.
+  bool RelaxInputHandleShapesAndMergeTypes(
+      int idx,
+      const std::vector<ShapeAndType>& shapes_and_types) TF_MUST_USE_RESULT;
+
+  // As RelaxInputHandleShapesAndTypes, but for an output.
+  bool RelaxOutputHandleShapesAndMergeTypes(
+      int idx,
+      const std::vector<ShapeAndType>& shapes_and_types) TF_MUST_USE_RESULT;
 
   // Returns the output handle shapes and types, for the resource tensor output
   // at index <idx>. Returns NULL if the shape and types were never set.
@@ -538,6 +623,8 @@ class InferenceContext {
     std::vector<Dimension*> all_dims_;  // values are owned.
   };
 
+  friend class ::tensorflow::grappler::GraphProperties;
+  friend class ShapeInferenceTest;      // For testing Relax functions.
   friend class ShapeInferenceTestutil;  // For testing shapes.
 
   // Shared initialization across the two constructors.  Remove
@@ -563,11 +650,25 @@ class InferenceContext {
   // Adds additional context to the given status.
   Status AttachContext(const Status& status);
 
+  // Relaxes <d0> and <d1> and returns the relaxed dimension in <*out>. If <d0>
+  // and <d1> have incompatible values, returns an error.
+  //
+  // Note that <*out> may be set to <d0> or <d1>.
+  void Relax(DimensionHandle d0, DimensionHandle d1, DimensionHandle* out);
+  // Relaxes <s0> and <s1> and returns the relaxed shape in <*out>. See
+  // 'RelaxInput' function for full details and examples.
+  void Relax(ShapeHandle s0, ShapeHandle s1, ShapeHandle* out);
+
   // Used to implement MergeInputHandleShapesAndTypes and
   // MergeOutputHandleShapesAndTypes.
   bool MergeHandleShapesAndTypes(
       const std::vector<ShapeAndType>& shapes_and_types,
       std::vector<ShapeAndType>* to_update) TF_MUST_USE_RESULT;
+  // Used to implement RelaxInputHandleShapesAndMergeTypes and
+  // RelaxOutputHandleShapesAndMergeTypes.
+  bool RelaxHandleShapesAndMergeTypes(
+      const std::vector<ShapeAndType>& shapes_and_types,
+      std::vector<ShapeAndType>* to_update) TF_MUST_USE_RESULT;
 
   ShapeManager shape_manager_;
 
diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc
index a9c0303d4cb97de42185a310816e4d00a4ad5abe..57d8dc9353cd2743d9ec0868bbd7c55a37378960 100644
--- a/tensorflow/core/framework/shape_inference_test.cc
+++ b/tensorflow/core/framework/shape_inference_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/core/framework/fake_input.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op_def_builder.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -36,19 +37,11 @@ OpDef MakeOpDefWithLists() {
   return op_reg_data.op_def;
 }
 
-TensorShapeProto S(std::initializer_list<int64> dims) {
-  PartialTensorShape shape(dims);
-  TensorShapeProto ret;
-  shape.AsProto(&ret);
-  return ret;
+PartialTensorShape S(std::initializer_list<int64> dims) {
+  return PartialTensorShape(dims);
 }
 
-TensorShapeProto Unknown() {
-  PartialTensorShape shape;
-  TensorShapeProto ret;
-  shape.AsProto(&ret);
-  return ret;
-}
+PartialTensorShape Unknown() { return PartialTensorShape(); }
 
 }  // namespace
 
@@ -61,7 +54,16 @@ class ShapeInferenceTest : public ::testing::Test {
   bool SameHandle(ShapeHandle a, ShapeHandle b) { return a.SameHandle(b); }
   bool IsSet(DimensionHandle d) { return d.IsSet(); }
   bool IsSet(ShapeHandle s) { return s.IsSet(); }
+  void Relax(InferenceContext* c, DimensionHandle d0, DimensionHandle d1,
+             DimensionHandle* out) {
+    c->Relax(d0, d1, out);
+  }
+  void Relax(InferenceContext* c, ShapeHandle s0, ShapeHandle s1,
+             ShapeHandle* out) {
+    c->Relax(s0, s1, out);
+  }
   void TestMergeHandles(bool input_not_output);
+  void TestRelaxHandles(bool input_not_output);
 
   static const int kVersion = 0;  // used for graph-def version.
 };
@@ -495,7 +497,7 @@ TEST_F(ShapeInferenceTest, MergeDim) {
   EXPECT_TRUE(c.Merge(d2_b, d2, &out).ok());
   EXPECT_TRUE(SameHandle(d2_b, out));
 
-  // Merging inequal values is an error.
+  // Merging unequal values is an error.
   EXPECT_TRUE(
       StringPiece(c.Merge(d2, d1, &out).ToString())
           .contains(
@@ -510,6 +512,122 @@ TEST_F(ShapeInferenceTest, MergeDim) {
   EXPECT_FALSE(IsSet(out));
 }
 
+TEST_F(ShapeInferenceTest, RelaxDim) {
+  NodeDef def;
+  InferenceContext c(kVersion, &def, MakeOpDef(1, 2),
+                     {S({2, InferenceContext::kUnknownDim, 2, 1,
+                         InferenceContext::kUnknownDim})},
+                     {}, {}, {});
+
+  auto d2 = c.Dim(c.input(0), 0);
+  auto d_unknown = c.Dim(c.input(0), 1);
+  auto d2_b = c.Dim(c.input(0), 2);
+  auto d1 = c.Dim(c.input(0), 3);
+  auto d_unknown_b = c.Dim(c.input(0), 4);
+  DimensionHandle out;
+
+  // Relaxing anything with unknown returns a new unknown.
+  Relax(&c, d2, d_unknown, &out);
+  EXPECT_FALSE(SameHandle(d_unknown, out));
+  EXPECT_FALSE(SameHandle(d_unknown_b, out));
+  EXPECT_EQ(InferenceContext::kUnknownDim, c.Value(out));
+  Relax(&c, d_unknown, d2, &out);
+  EXPECT_FALSE(SameHandle(d_unknown, out));
+  EXPECT_EQ(InferenceContext::kUnknownDim, c.Value(out));
+  Relax(&c, d_unknown, d_unknown_b, &out);
+  EXPECT_FALSE(SameHandle(d_unknown, out));
+  EXPECT_FALSE(SameHandle(d_unknown_b, out));
+  EXPECT_EQ(InferenceContext::kUnknownDim, c.Value(out));
+
+  // Relaxing with self returns self.
+  Relax(&c, d2, d2, &out);
+  EXPECT_TRUE(SameHandle(d2, out));
+  Relax(&c, d_unknown, d_unknown, &out);
+  EXPECT_TRUE(SameHandle(d_unknown, out));
+
+  // Relaxing equal values returns first one.
+  Relax(&c, d2, d2_b, &out);
+  EXPECT_TRUE(SameHandle(d2, out));
+  Relax(&c, d2_b, d2, &out);
+  EXPECT_TRUE(SameHandle(d2_b, out));
+
+  // Relaxing unequal values returns a new unknown.
+  Relax(&c, d2, d1, &out);
+  EXPECT_EQ(InferenceContext::kUnknownDim, c.Value(out));
+  Relax(&c, d1, d2, &out);
+  EXPECT_EQ(InferenceContext::kUnknownDim, c.Value(out));
+}
+
+TEST_F(ShapeInferenceTest, RelaxShape) {
+  NodeDef def;
+  InferenceContext c(
+      kVersion, &def, MakeOpDef(7, 2),
+      {Unknown(), S({1, 2}), S({InferenceContext::kUnknownDim, 2}),
+       S({1, InferenceContext::kUnknownDim}), S({1, 3}), Unknown(), S({1})},
+      {}, {}, {});
+
+  auto s_unknown = c.input(0);
+  auto s_1_2 = c.input(1);
+  auto s_u_2 = c.input(2);
+  auto s_1_u = c.input(3);
+  auto s_1_3 = c.input(4);
+  auto s_unknown_b = c.input(5);
+  auto s_1 = c.input(6);
+  ShapeHandle out;
+
+  // Relaxing any shape with unknown returns a new unknown.
+  Relax(&c, s_unknown, s_1_2, &out);
+  EXPECT_FALSE(SameHandle(s_u_2, s_unknown));
+  EXPECT_EQ("?", c.DebugString(out));
+  Relax(&c, s_u_2, s_unknown, &out);
+  EXPECT_FALSE(SameHandle(s_u_2, out));
+  EXPECT_EQ("?", c.DebugString(out));
+  Relax(&c, s_unknown, s_unknown_b, &out);
+  EXPECT_FALSE(SameHandle(s_unknown, out));
+  EXPECT_FALSE(SameHandle(s_unknown_b, out));
+  EXPECT_EQ("?", c.DebugString(out));
+
+  // Relaxing with self returns self.
+  Relax(&c, s_1_2, s_1_2, &out);
+  EXPECT_TRUE(SameHandle(out, s_1_2));
+
+  // Relaxing where one of the inputs has less information.
+  out = ShapeHandle();
+  Relax(&c, s_1_2, s_u_2, &out);
+  EXPECT_FALSE(SameHandle(s_u_2, out));
+  EXPECT_EQ("[?,2]", c.DebugString(out));
+  out = ShapeHandle();
+  Relax(&c, s_u_2, s_1_2, &out);
+  EXPECT_FALSE(SameHandle(s_u_2, out));
+  EXPECT_EQ("[?,2]", c.DebugString(out));
+
+  // Relaxing where each input has one distinct unknown dimension.
+  Relax(&c, s_u_2, s_1_u, &out);
+  EXPECT_EQ("[?,?]", c.DebugString(out));
+  EXPECT_FALSE(SameHandle(c.Dim(s_u_2, 0), c.Dim(out, 0)));
+  EXPECT_FALSE(SameHandle(c.Dim(s_1_u, 1), c.Dim(out, 1)));
+  auto s_u1 = c.UnknownShapeOfRank(1);
+  auto s_u2 = c.UnknownShapeOfRank(1);
+  Relax(&c, s_u1, s_u2, &out);
+  EXPECT_FALSE(SameHandle(s_u1, out));
+
+  // Relaxing with mismatched values in a dimension returns a shape with that
+  // dimension unknown.
+  out = s_unknown;
+  Relax(&c, s_u_2, s_1_3, &out);
+  EXPECT_FALSE(SameHandle(c.Dim(s_u_2, 0), c.Dim(out, 0)));
+  EXPECT_EQ("[?,?]", c.DebugString(out));
+  out = s_unknown;
+  Relax(&c, s_1_3, s_u_2, &out);
+  EXPECT_FALSE(SameHandle(c.Dim(s_u_2, 0), c.Dim(out, 0)));
+  EXPECT_EQ("[?,?]", c.DebugString(out));
+  out = s_unknown;
+
+  // Relaxing with mismatched ranks returns a new unknown.
+  Relax(&c, s_1, s_1_2, &out);
+  EXPECT_EQ("?", c.DebugString(out));
+}
+
 TEST_F(ShapeInferenceTest, MergeShape) {
   NodeDef def;
   InferenceContext c(kVersion, &def, MakeOpDef(7, 2),
@@ -1412,7 +1530,7 @@ void ShapeInferenceTest::TestMergeHandles(bool input_not_output) {
                      {});
   auto make_shape = [&c](std::initializer_list<int64> dim_sizes) {
     ShapeHandle s;
-    TF_CHECK_OK(c.MakeShapeFromShapeProto(S(dim_sizes), &s));
+    TF_CHECK_OK(c.MakeShapeFromPartialTensorShape(S(dim_sizes), &s));
     return s;
   };
   auto get_shapes_and_types_from_context = [&](int idx) {
@@ -1473,8 +1591,8 @@ void ShapeInferenceTest::TestMergeHandles(bool input_not_output) {
     EXPECT_EQ(t[i].dtype, v[i].dtype);
   }
 
-  // Only difference is in a mismatched dtype. That is ignored,
-  // and there are no other changes, so nothing is done.
+  // Only difference is in a mismatched dtype, but that cannot be
+  // updated unless original dtype is DT_INVALID.
   t2 = t;
   t2[2].dtype = DT_FLOAT;
   ASSERT_FALSE(merge_shapes_and_types_to_context(0, t2));
@@ -1510,11 +1628,111 @@ void ShapeInferenceTest::TestMergeHandles(bool input_not_output) {
 }
 
 TEST_F(ShapeInferenceTest, MergeInputHandleShapesAndTypes) {
-  TestMergeHandles(true);
+  TestMergeHandles(true /* input_not_output */);
 }
 
 TEST_F(ShapeInferenceTest, MergeOutputHandleShapesAndTypes) {
-  TestMergeHandles(false);
+  TestMergeHandles(false /* input_not_output */);
+}
+
+void ShapeInferenceTest::TestRelaxHandles(bool input_not_output) {
+  NodeDef def;
+  InferenceContext c(kVersion, &def, MakeOpDef(2, 2), {S({}), S({})}, {}, {},
+                     {});
+  auto make_shape = [&c](std::initializer_list<int64> dim_sizes) {
+    ShapeHandle s;
+    TF_CHECK_OK(c.MakeShapeFromPartialTensorShape(S(dim_sizes), &s));
+    return s;
+  };
+  auto get_shapes_and_types_from_context = [&](int idx) {
+    if (input_not_output) {
+      return c.input_handle_shapes_and_types(idx);
+    } else {
+      return c.output_handle_shapes_and_types(idx);
+    }
+  };
+  auto relax_shapes_and_types_to_context =
+      [&](int idx, const std::vector<ShapeAndType>& shapes_and_types) {
+        if (input_not_output) {
+          return c.RelaxInputHandleShapesAndMergeTypes(idx, shapes_and_types);
+        } else {
+          return c.RelaxOutputHandleShapesAndMergeTypes(idx, shapes_and_types);
+        }
+      };
+
+  EXPECT_TRUE(get_shapes_and_types_from_context(0) == nullptr);
+  EXPECT_TRUE(get_shapes_and_types_from_context(1) == nullptr);
+
+  // First relax will take the input completely.
+  std::vector<ShapeAndType> t{{make_shape({1, 2, 3}), DT_FLOAT},
+                              {c.UnknownShape(), DT_INVALID},
+                              {make_shape({4, 3, 2, 1}), DT_INT32}};
+  ASSERT_TRUE(relax_shapes_and_types_to_context(0, t));
+  ASSERT_TRUE(get_shapes_and_types_from_context(0) != nullptr);
+  std::vector<ShapeAndType> v = *get_shapes_and_types_from_context(0);
+  ASSERT_EQ(3, v.size());
+  for (int i = 0; i < v.size(); ++i) {
+    EXPECT_TRUE(SameHandle(t[i].shape, v[i].shape)) << i;
+    EXPECT_EQ(t[i].dtype, v[i].dtype);
+  }
+
+  // Relax that fails because wrong number of values passed.
+  // Fails, and no changes made.
+  ASSERT_FALSE(relax_shapes_and_types_to_context(
+      0, std::vector<ShapeAndType>{{make_shape({1, 2, 3}), DT_FLOAT}}));
+  v = *get_shapes_and_types_from_context(0);
+  ASSERT_EQ(3, v.size());
+  for (int i = 0; i < v.size(); ++i) {
+    EXPECT_TRUE(SameHandle(t[i].shape, v[i].shape)) << i;
+    EXPECT_EQ(t[i].dtype, v[i].dtype);
+  }
+
+  // Only difference is in a mismatched shape. This should replace
+  // the mismatched dimension with an UnknownDim.
+  auto t2 = t;
+  t2[2].shape = make_shape({4, 3, 4, 1});
+  ASSERT_TRUE(relax_shapes_and_types_to_context(0, t2));
+  v = *get_shapes_and_types_from_context(0);
+  EXPECT_EQ("[4,3,?,1]", c.DebugString(v[2].shape));
+  for (int i = 0; i < v.size(); ++i) {
+    EXPECT_EQ(t[i].dtype, v[i].dtype);
+  }
+
+  // Only difference is in a mismatched dtype, but that cannot be
+  // updated unless original dtype is DT_INVALID.
+  t2 = t;
+  t2[2].dtype = DT_FLOAT;
+  ASSERT_FALSE(relax_shapes_and_types_to_context(0, t2));
+  v = *get_shapes_and_types_from_context(0);
+  ASSERT_EQ(3, v.size());
+  for (int i = 0; i < v.size(); ++i) {
+    EXPECT_EQ(t[i].dtype, v[i].dtype);
+  }
+
+  // Difference is a new shape, which will result in a new UnknownShape.
+  t[1].shape = make_shape({1, 10});
+  ASSERT_TRUE(relax_shapes_and_types_to_context(0, t));
+  v = *get_shapes_and_types_from_context(0);
+  ASSERT_EQ(3, v.size());
+  EXPECT_FALSE(SameHandle(t[1].shape, v[1].shape));
+  EXPECT_EQ("?", c.DebugString(v[1].shape));
+  for (int i = 0; i < v.size(); ++i) {
+    EXPECT_EQ(t[i].dtype, v[i].dtype);
+  }
+
+  // Difference is relaxable (new type).
+  t[1].dtype = DT_DOUBLE;
+  ASSERT_TRUE(relax_shapes_and_types_to_context(0, t));
+  v = *get_shapes_and_types_from_context(0);
+  EXPECT_EQ(t[1].dtype, v[1].dtype);
+}
+
+TEST_F(ShapeInferenceTest, RelaxInputHandleShapesAndTypes) {
+  TestRelaxHandles(true /* input_not_output */);
+}
+
+TEST_F(ShapeInferenceTest, RelaxOutputHandleShapesAndTypes) {
+  TestRelaxHandles(false /* input_not_output */);
 }
 
 }  // namespace shape_inference
diff --git a/tensorflow/core/framework/shape_inference_testutil.cc b/tensorflow/core/framework/shape_inference_testutil.cc
index 7b3cd07429b6c75b6d108811fe6f752a51b97df6..b4765ab0b2c41a1b510364d755984b6ae68dd07a 100644
--- a/tensorflow/core/framework/shape_inference_testutil.cc
+++ b/tensorflow/core/framework/shape_inference_testutil.cc
@@ -43,9 +43,26 @@ Status ShapeInferenceTestutil::InferShapes(ShapeInferenceTestOp op,
     in_shapes.push_back(shape);
   }
 
-  shape_inference::InferenceContext c(op.graph_def_version, &op.node_def,
-                                      op_reg_data->op_def, in_shapes,
-                                      op.input_tensors, {}, {});
+  std::vector<std::unique_ptr<std::vector<shape_inference::ShapeAndType>>>
+      input_resource_handle_shapes_and_types;
+  for (const auto p : op.input_resource_handle_shapes_and_types) {
+    if (p == nullptr) {
+      input_resource_handle_shapes_and_types.push_back(nullptr);
+    } else {
+      std::unique_ptr<std::vector<ShapeAndType>> v(
+          new std::vector<ShapeAndType>());
+      for (const auto& shape_and_type : *p) {
+        ShapeHandle shape;
+        TF_RETURN_IF_ERROR(
+            MakeShapeFromString(&manager, shape_and_type.first, &shape));
+        v->emplace_back(shape, shape_and_type.second);
+      }
+      input_resource_handle_shapes_and_types.emplace_back(v.release());
+    }
+  }
+  shape_inference::InferenceContext c(
+      op.graph_def_version, &op.node_def, op_reg_data->op_def, in_shapes,
+      op.input_tensors, {}, std::move(input_resource_handle_shapes_and_types));
   TF_RETURN_IF_ERROR(c.construction_status());
   if (op_reg_data->shape_inference_fn == nullptr) {
     return errors::InvalidArgument(
diff --git a/tensorflow/core/framework/shape_inference_testutil.h b/tensorflow/core/framework/shape_inference_testutil.h
index 996281e70e6aa69a83248aabd8d8bc9b72909795..fbfd24538bc7a5b1f3ee3805d4a803a0e7239fca 100644
--- a/tensorflow/core/framework/shape_inference_testutil.h
+++ b/tensorflow/core/framework/shape_inference_testutil.h
@@ -16,7 +16,7 @@ limitations under the License.
 #define THIRD_PARTY_TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_
 
 #include <vector>
-#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -27,14 +27,16 @@ limitations under the License.
 
 namespace tensorflow {
 
-class NodeDef;
 class Tensor;
 
 struct ShapeInferenceTestOp {
+  typedef std::pair<string, DataType> ShapeAndType;
   explicit ShapeInferenceTestOp(StringPiece name) : name(name.ToString()) {}
   string name;
   NodeDef node_def;
   std::vector<const Tensor*> input_tensors;
+  std::vector<std::vector<ShapeAndType>*>
+      input_resource_handle_shapes_and_types;
   int graph_def_version = TF_GRAPH_DEF_VERSION;
 };
 
diff --git a/tensorflow/core/framework/summary.proto b/tensorflow/core/framework/summary.proto
index 12274d5e135ac43abf4ca193fb5851907de2159c..b6101fda3d07da3ea9818668aa5e420a85f446d4 100644
--- a/tensorflow/core/framework/summary.proto
+++ b/tensorflow/core/framework/summary.proto
@@ -51,6 +51,12 @@ message SummaryMetadata {
   // A list of plugin data. A single summary value instance may be used by more
   // than 1 plugin.
   repeated PluginData plugin_data = 1;
+
+  // Display name for viewing in TensorBoard.
+  string display_name = 2;
+
+  // Longform readable description of the summary sequence. Markdown supported.
+  string summary_description = 3;
 };
 
 // A Summary is a set of named values to be displayed by the
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index 243b30c007f1e4a9495cc226e2640efa9ea7dc63..0a85894071de390a3e0d51204c02e0b58a0538de 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -29,10 +29,15 @@ limitations under the License.
 
 #include "tensorflow/core/framework/tensor.h"
 
+#include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/log_memory.h"
+#include "tensorflow/core/framework/resource_handle.pb.h"
 #include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_description.pb.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -44,6 +49,7 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/tensor_coding.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/platform/variant_coding.h"
 
 namespace tensorflow {
 namespace {
@@ -217,6 +223,36 @@ struct Helper<ResourceHandle> {
   }
 };
 
+template <>
+struct Helper<Variant> {
+  // Encodes "n" elements of type Variant stored in "in" into destination
+  // "out", which is usually the TensorProto::tensor_content.
+  template <typename Destination>
+  static void Encode(TensorBuffer* in, int64 n, Destination* out) {
+    port::EncodeVariantList(in->base<const Variant>(), n, out);
+  }
+
+  // Decodes "n" elements of type Variant from "in" and constructs a
+  // buffer out of it. Returns nullptr if the decoding fails. "in" is
+  // usually the TensorProto::tensor_content.
+  template <typename Source>
+  static TensorBuffer* Decode(Allocator* a, const Source& in, int64 n) {
+    auto* buf = new Buffer<Variant>(a, n);
+    Variant* ps = buf->template base<Variant>();
+    if (ps == nullptr || !port::DecodeVariantList(in, ps, n)) {
+      buf->Unref();
+      return nullptr;
+    }
+    return buf;
+  }
+
+  // Returns the estimated memory usage of "n" elements of type T
+  // stored in buffer "in".
+  static int64 TotalBytes(TensorBuffer* in, int n) {
+    return n * sizeof(Variant);
+  }
+};
+
 template <typename T>
 struct ProtoHelper {};
 
@@ -270,7 +306,7 @@ struct ProtoHelper<int64> {
 
 template <>
 struct ProtoHelper<ResourceHandle> {
-  static protobuf::RepeatedPtrField<ResourceHandle>::const_iterator Begin(
+  static protobuf::RepeatedPtrField<ResourceHandleProto>::const_iterator Begin(
       const TensorProto& proto) {
     return proto.resource_handle_val().begin();
   }
@@ -278,8 +314,31 @@ struct ProtoHelper<ResourceHandle> {
     return proto.resource_handle_val().size();
   }
   static void Fill(const ResourceHandle* data, size_t n, TensorProto* proto) {
-    protobuf::RepeatedPtrField<ResourceHandle> copy(data, data + n);
-    proto->mutable_resource_handle_val()->Swap(&copy);
+    auto* handles = proto->mutable_resource_handle_val();
+    handles->Clear();
+    for (size_t i = 0; i < n; i++) {
+      data[i].AsProto(handles->Add());
+    }
+  }
+};
+
+template <>
+struct ProtoHelper<Variant> {
+  static protobuf::RepeatedPtrField<VariantTensorDataProto>::const_iterator
+  Begin(const TensorProto& proto) {
+    return proto.variant_val().begin();
+  }
+  static size_t NumElements(const TensorProto& proto) {
+    return proto.variant_val().size();
+  }
+  static void Fill(const Variant* data, size_t n, TensorProto* proto) {
+    auto* variant_values = proto->mutable_variant_val();
+    variant_values->Clear();
+    for (size_t i = 0; i < n; ++i) {
+      VariantTensorData tmp;
+      data[i].Encode(&tmp);
+      tmp.ToProto(variant_values->Add());
+    }
   }
 };
 
@@ -370,10 +429,10 @@ Buffer<T>::Buffer(Allocator* a, int64 n,
 
 template <typename T>
 Buffer<T>::~Buffer() {
-  if (LogMemory::IsEnabled()) {
-    RecordDeallocation();
-  }
   if (data_) {
+    if (LogMemory::IsEnabled()) {
+      RecordDeallocation();
+    }
     alloc_->Deallocate<T>(data_, elem_);
   }
 }
@@ -415,6 +474,30 @@ TensorBuffer* FromProtoField(Allocator* a, const TensorProto& in, int64 n) {
   return buf;
 }
 
+template <>
+TensorBuffer* FromProtoField<Variant>(Allocator* a, const TensorProto& in,
+                                      int64 n) {
+  CHECK_GT(n, 0);
+  Buffer<Variant>* buf = new Buffer<Variant>(a, n);
+  Variant* data = buf->template base<Variant>();
+  if (data == nullptr) {
+    buf->Unref();
+    return nullptr;
+  }
+  const int64 in_n = ProtoHelper<Variant>::NumElements(in);
+  if (in_n <= 0) {
+    std::fill_n(data, n, Variant());
+  } else {
+    for (int64 i = 0; i < in_n; ++i) {
+      data[i] = in.variant_val(i);
+    }
+    for (int64 i = in_n; i < n; ++i) {
+      data[i] = Variant();
+    }
+  }
+  return buf;
+}
+
 // fp16 is opaque to the protobuf, so we deserialize these identical to uint16
 // but with data stored in half_val instead of int_val (ie., we don't use
 // ProtoHelper<uint16>).
@@ -565,6 +648,7 @@ bool Tensor::RefCountIsOne() const {
     CASE(bfloat16, SINGLE_ARG(STMTS))                          \
     CASE(Eigen::half, SINGLE_ARG(STMTS))                       \
     CASE(ResourceHandle, SINGLE_ARG(STMTS))                    \
+    CASE(Variant, SINGLE_ARG(STMTS))                           \
     case DT_INVALID:                                           \
       INVALID;                                                 \
       break;                                                   \
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index a164fe61b5f001c1898281ad66ca782c85e7a75f..a8f9d215114da97b29d3d96017079d41a4471334 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -17,10 +17,10 @@ limitations under the License.
 #define TENSORFLOW_CORE_FRAMEWORK_TENSOR_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/allocation_description.pb.h"
+#include "tensorflow/core/framework/allocation_description.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/allocator.h"
-#include "tensorflow/core/framework/tensor.pb.h"
-#include "tensorflow/core/framework/tensor_description.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"  // TODO(b/62899350): Remove
+#include "tensorflow/core/framework/tensor_description.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
@@ -35,8 +35,13 @@ limitations under the License.
 
 namespace tensorflow {
 
-class TensorBuffer;  // Forward declaration.
+// Forward declarations.  In particular, we forward declare protos so that their
+// symbols can be removed from .so exports.
+class AllocationDescription;
+class TensorBuffer;
 class TensorCApi;
+class TensorDescription;
+class TensorProto;
 
 /// @ingroup core
 /// Represents an n-dimensional array of values.
diff --git a/tensorflow/core/framework/tensor.proto b/tensorflow/core/framework/tensor.proto
index 5d383bcc663a5aff05deb72cc8e02b0d8a3f11e6..7e4af7a645cd98f54954400c9b4f65daad45b940 100644
--- a/tensorflow/core/framework/tensor.proto
+++ b/tensorflow/core/framework/tensor.proto
@@ -71,5 +71,18 @@ message TensorProto {
   repeated double dcomplex_val = 12 [packed = true];
 
   // DT_RESOURCE
-  repeated ResourceHandle resource_handle_val = 14;
+  repeated ResourceHandleProto resource_handle_val = 14;
+
+  // DT_VARIANT
+  repeated VariantTensorDataProto variant_val = 15;
 };
+
+// Protocol buffer representing the serialization format of DT_VARIANT tensors.
+message VariantTensorDataProto {
+  // Name of the type of objects being serialized.
+  string type_name = 1;
+  // Portions of the object that are not Tensors.
+  bytes metadata = 2;
+  // Tensors contained within objects being serialized.
+  repeated TensorProto tensors = 3;
+}
diff --git a/tensorflow/core/framework/tensor_reference.h b/tensorflow/core/framework/tensor_reference.h
index 186820785dd164ed24847ebb5d7ee1eee99f5b8d..37e588d4f108987f3f03ed503e9c6b66dfd7e5c7 100644
--- a/tensorflow/core/framework/tensor_reference.h
+++ b/tensorflow/core/framework/tensor_reference.h
@@ -16,7 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_FRAMEWORK_TENSOR_REFERENCE_H_
 #define TENSORFLOW_FRAMEWORK_TENSOR_REFERENCE_H_
 
-#include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 
diff --git a/tensorflow/core/framework/tensor_shape.cc b/tensorflow/core/framework/tensor_shape.cc
index 1284214952cd8096de78bee1bbfea8848d525834..14d9cea20ea955470eea5f0d4d2c14a1593acc3a 100644
--- a/tensorflow/core/framework/tensor_shape.cc
+++ b/tensorflow/core/framework/tensor_shape.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/tensor_shape.h"
 
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
diff --git a/tensorflow/core/framework/tensor_shape.h b/tensorflow/core/framework/tensor_shape.h
index b2016074614297a9a7f68bcecde562cf8b943a19..e56c3d7b930681cafa7193124f31c2a11a6f886d 100644
--- a/tensorflow/core/framework/tensor_shape.h
+++ b/tensorflow/core/framework/tensor_shape.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <string>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -35,6 +35,7 @@ namespace tensorflow {
 template <class Shape>
 class TensorShapeIter;
 class TensorShape;
+class TensorShapeProto;
 class PartialTensorShape;
 // END_SKIP_DOXYGEN
 
diff --git a/tensorflow/core/framework/tensor_shape_test.cc b/tensorflow/core/framework/tensor_shape_test.cc
index d6fe9a1511b3de282fb77e2f9f735a7242f1b715..51a7b14fed23672ce476909ec9f5f5e164306186 100644
--- a/tensorflow/core/framework/tensor_shape_test.cc
+++ b/tensorflow/core/framework/tensor_shape_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/tensor_shape.h"
 
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/lib/strings/str_util.h"
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index 369f64e9e2d467d81a69e2660bda21043cd1d2d1..9aaf00853dcae40bef44ef92c4b37275bcfef27d 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -15,8 +15,10 @@ limitations under the License.
 
 #include "tensorflow/core/framework/tensor.h"
 
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
@@ -36,6 +38,35 @@ inline bool operator==(const ResourceHandle& a, const ResourceHandle& b) {
          a.maybe_type_name() == b.maybe_type_name();
 }
 
+inline bool operator==(const Variant& a, const Variant& b) {
+  if (a.is_empty()) {
+    return b.is_empty();
+  }
+
+  if (a.TypeId() != b.TypeId()) return false;
+  if (a.TypeName() != b.TypeName()) return false;
+
+  VariantTensorData a_data, b_data;
+  a.Encode(&a_data);
+  b.Encode(&b_data);
+
+  if (a_data.metadata != b_data.metadata) return false;
+
+  if (a_data.tensors.size() != b_data.tensors.size()) return false;
+
+  for (int i = 0; i < a_data.tensors.size(); ++i) {
+    TensorProto a_proto, b_proto;
+    a_data.tensors[i].AsProtoTensorContent(&a_proto);
+    b_data.tensors[i].AsProtoTensorContent(&b_proto);
+    string a_str, b_str;
+    a_proto.SerializeToString(&a_str);
+    b_proto.SerializeToString(&b_str);
+    if (a_str != b_str) return false;
+  }
+
+  return true;
+}
+
 TEST(TensorTest, Default) {
   Tensor t;
   EXPECT_EQ(t.dtype(), DT_FLOAT);
@@ -158,6 +189,74 @@ TEST(Tensor_ResourceHandle, Simple) {
   TestCopies<ResourceHandle>(t);
 }
 
+TEST(Tensor_Variant, Simple) {
+  Tensor t(DT_VARIANT, TensorShape({}));
+  Tensor value(DT_FLOAT, TensorShape({}));
+  value.flat<float>()(0) = 42.0f;
+  t.flat<Variant>()(0) = value;
+  // All the tests in TestCopies except the ones that serialize and deserialize
+  // the tensor. The consumer of a serialized Variant Tensor should know what
+  // type is stored in the Tensor, so not testing the generic
+  // serialize/deserialize case here.
+  {
+    LOG(INFO) << "CopyFrom()";
+    Tensor t2(t.dtype());
+    EXPECT_TRUE(t2.CopyFrom(t, t.shape()));
+    test::ExpectTensorEqual<Variant>(t, t2);
+  }
+  {
+    LOG(INFO) << "operator=()";
+    Tensor t2(t.dtype());
+    t2 = t;
+    test::ExpectTensorEqual<Variant>(t, t2);
+  }
+  {
+    LOG(INFO) << "deep copy";
+    Tensor t2(t.dtype(), t.shape());
+    t2.flat<Variant>() = t.flat<Variant>();
+    test::ExpectTensorEqual<Variant>(t, t2);
+  }
+  {
+    LOG(INFO) << "AsTensor";
+    gtl::ArraySlice<Variant> values(t.flat<Variant>().data(), t.NumElements());
+    Tensor t2 = test::AsTensor(values, t.shape());
+    test::ExpectTensorEqual<Variant>(t, t2);
+  }
+  {
+    LOG(INFO) << "Move constructor";
+    Tensor t2 = t;
+    Tensor t3(std::move(t2));
+    test::ExpectTensorEqual<Variant>(t, t3);
+    EXPECT_TRUE(t3.IsInitialized());
+    EXPECT_FALSE(t2.IsInitialized());
+  }
+  {
+    LOG(INFO) << "Move assignment";
+    Tensor t2 = t;
+    Tensor t3 = std::move(t2);
+    Tensor* t4 = &t3;
+    *t4 = std::move(t3);
+    test::ExpectTensorEqual<Variant>(t, t3);
+    EXPECT_TRUE(t3.IsInitialized());
+    EXPECT_FALSE(t2.IsInitialized());
+  }
+}
+
+TEST(Tensor_Variant, Marshal) {
+  Tensor t(DT_VARIANT, TensorShape({}));
+
+  Tensor internal(DT_FLOAT, TensorShape({}));
+  internal.flat<float>()(0) = 42.0f;
+  t.flat<Variant>()(0) = internal;
+
+  LOG(INFO) << "AsProtoField()";
+  TensorProto proto;
+  t.AsProtoField(&proto);
+
+  Tensor t2(t.dtype());
+  EXPECT_TRUE(t2.FromProto(proto));
+}
+
 TEST(Tensor_UInt16, Simple) {
   Tensor t(DT_UINT16, TensorShape({2, 2}));
   EXPECT_TRUE(t.shape().IsSameSize(TensorShape({2, 2})));
diff --git a/tensorflow/core/framework/types.cc b/tensorflow/core/framework/types.cc
index dc396e468ae8ebfc357b95ff6419b20d3ac3b5ff..39dd5b435e06c27c52f6f2746c649fb902387c76 100644
--- a/tensorflow/core/framework/types.cc
+++ b/tensorflow/core/framework/types.cc
@@ -87,6 +87,8 @@ string DataTypeString(DataType dtype) {
       return "half";
     case DT_RESOURCE:
       return "resource";
+    case DT_VARIANT:
+      return "variant";
     default:
       LOG(ERROR) << "Unrecognized DataType enum value " << dtype;
       return strings::StrCat("unknown dtype enum (", dtype, ")");
@@ -165,6 +167,9 @@ bool DataTypeFromString(StringPiece sp, DataType* dt) {
   } else if (sp == "resource") {
     *dt = DT_RESOURCE;
     return true;
+  } else if (sp == "variant") {
+    *dt = DT_VARIANT;
+    return true;
   }
   return false;
 }
@@ -186,7 +191,7 @@ DataTypeVector AllTypes() {
   return {DT_FLOAT,   DT_DOUBLE, DT_INT32,  DT_UINT8,     DT_INT16,
           DT_UINT16,  DT_INT8,   DT_STRING, DT_COMPLEX64, DT_COMPLEX128,
           DT_INT64,   DT_BOOL,   DT_QINT8,  DT_QUINT8,    DT_QINT16,
-          DT_QUINT16, DT_QINT32, DT_HALF,   DT_RESOURCE};
+          DT_QUINT16, DT_QINT32, DT_HALF,   DT_RESOURCE,  DT_VARIANT};
 }
 
 #if !defined(IS_MOBILE_PLATFORM) || defined(SUPPORT_SELECTIVE_REGISTRATION)
@@ -313,6 +318,11 @@ int DataTypeSize(DataType dt) {
   switch (dt) {
     TF_CALL_POD_TYPES(CASE);
     TF_CALL_QUANTIZED_TYPES(CASE);
+    // TF_CALL_QUANTIZED_TYPES() macro does no cover quint16 and qint16, since
+    // they are not supported widely, but are explicitly listed here for
+    // bitcast.
+    TF_CALL_qint16(CASE);
+    TF_CALL_quint16(CASE);
     default:
       return 0;
   }
diff --git a/tensorflow/core/framework/types.h b/tensorflow/core/framework/types.h
index f562880e7cf7120618ec240820e8b58bf7a39804..9127750d68b19f9126706516757665763cbf8357 100644
--- a/tensorflow/core/framework/types.h
+++ b/tensorflow/core/framework/types.h
@@ -28,8 +28,9 @@ limitations under the License.
 // clang-format on
 #include "tensorflow/core/framework/bfloat16.h"
 #include "tensorflow/core/framework/numeric_types.h"
-#include "tensorflow/core/framework/resource_handle.pb.h"
+#include "tensorflow/core/framework/resource_handle.h"
 #include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -181,6 +182,7 @@ MATCH_TYPE_AND_ENUM(qint32, DT_QINT32);
 MATCH_TYPE_AND_ENUM(bfloat16, DT_BFLOAT16);
 MATCH_TYPE_AND_ENUM(Eigen::half, DT_HALF);
 MATCH_TYPE_AND_ENUM(ResourceHandle, DT_RESOURCE);
+MATCH_TYPE_AND_ENUM(Variant, DT_VARIANT);
 
 #undef MATCH_TYPE_AND_ENUM
 
diff --git a/tensorflow/core/framework/types.proto b/tensorflow/core/framework/types.proto
index b80e2b31dc8b0822d4913ebc1ffeb98ac10d969e..1beb2a1aa25dc17485abb2decc7fa8758b217282 100644
--- a/tensorflow/core/framework/types.proto
+++ b/tensorflow/core/framework/types.proto
@@ -34,6 +34,7 @@ enum DataType {
   DT_COMPLEX128 = 18;  // Double-precision complex
   DT_HALF = 19;
   DT_RESOURCE = 20;
+  DT_VARIANT = 21;  // Arbitrary C++ data types
 
   // TODO(josh11b): DT_GENERIC_PROTO = ??;
   // TODO(jeff,josh11b): DT_UINT64?  DT_UINT32?
@@ -60,5 +61,6 @@ enum DataType {
   DT_COMPLEX128_REF = 118;
   DT_HALF_REF = 119;
   DT_RESOURCE_REF = 120;
+  DT_VARIANT_REF = 121;
 }
 // LINT.ThenChange(https://www.tensorflow.org/code/tensorflow/c/c_api.h,https://www.tensorflow.org/code/tensorflow/go/tensor.go)
diff --git a/tensorflow/core/framework/variant.cc b/tensorflow/core/framework/variant.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9f03a60892c13edff07977b556f35fa3787d6319
--- /dev/null
+++ b/tensorflow/core/framework/variant.cc
@@ -0,0 +1,87 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+
+namespace tensorflow {
+
+template <>
+void* Variant::get() {
+  if (is_empty()) {
+    return nullptr;
+  }
+  return value_->RawPtr();
+}
+
+template <>
+const void* Variant::get() const {
+  if (is_empty()) {
+    return nullptr;
+  }
+  return value_->RawPtr();
+}
+
+void VariantTensorData::ToProto(VariantTensorDataProto* proto) const {
+  proto->set_type_name(type_name);
+  proto->set_metadata(metadata);
+  proto->clear_tensors();
+  for (int i = 0; i < tensors.size(); ++i) {
+    tensors[i].AsProtoField(proto->mutable_tensors()->Add());
+  }
+}
+
+bool VariantTensorData::FromProto(const VariantTensorDataProto& proto) {
+  type_name = proto.type_name();
+  metadata = proto.metadata();
+  tensors.clear();
+  for (int i = 0; i < proto.tensors_size(); ++i) {
+    Tensor tmp;
+    if (!tmp.FromProto(proto.tensors(i))) return false;
+    tensors.push_back(tmp);
+  }
+  return true;
+}
+
+template <>
+string TypeNameVariant(const VariantTensorDataProto& value) {
+  return value.GetTypeName();
+}
+
+template <>
+void EncodeVariant(const VariantTensorDataProto& value, VariantTensorData* data) {
+  data->FromProto(value);
+}
+
+template <>
+bool DecodeVariant(const VariantTensorData& data,
+                   VariantTensorDataProto* value) {
+  data.ToProto(value);
+  return true;
+}
+
+template <>
+void EncodeVariant(const VariantTensorDataProto& value, string* buf) {
+  value.SerializeToString(buf);
+}
+
+template <>
+bool DecodeVariant(const string& buf, VariantTensorDataProto* value) {
+  return value->ParseFromString(buf);
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/framework/variant.h b/tensorflow/core/framework/variant.h
new file mode 100644
index 0000000000000000000000000000000000000000..f5c64aea0a0c8355bb28ff38cad0b74c2068a169
--- /dev/null
+++ b/tensorflow/core/framework/variant.h
@@ -0,0 +1,287 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_FRAMEWORK_VARIANT_H_
+#define TENSORFLOW_FRAMEWORK_VARIANT_H_
+
+#include <functional>
+#include <iostream>
+#include <memory>
+#include <type_traits>
+#include <unordered_map>
+#include <utility>
+
+#include "tensorflow/core/framework/type_index.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+struct VariantTensorData;
+
+template <typename T>
+string TypeNameVariant(const T& value);
+
+template <typename T>
+void EncodeVariant(const T& value, VariantTensorData* data);
+
+template <typename T>
+bool DecodeVariant(const VariantTensorData& data, T* value);
+
+template <typename T>
+void EncodeVariant(const T& value, string* buf);
+
+template <typename T>
+bool DecodeVariant(const string& buf, T* value);
+
+// This is an implementation of a type-erased container that can store an
+// object of any type. The implementation is very similar to std::any, but has
+// restrictions on the types of objects that can be stored, and eschews some of
+// the fancier constructors available for std::any. An object of
+// tensorflow::Variant is intended to be used as the value that will be stored
+// in a tensorflow::Tensor object when its type is DT_VARIANT.
+//
+// tensorflow::Variant can store an object of a class that satisfies the
+// following constraints:
+//
+// * The class is CopyConstructible.
+// * The class has a default constructor.
+// * It's either a protocol buffer, a tensorflow::Tensor, or defines the
+// following functions:
+//
+//   string TypeName() const;
+//   void Encode(VariantTensorData* data) const;
+//   void Decode(const VariantTensorData& data);
+//
+// Simple POD types can elide the Encode/Decode functions, they are provided by
+// helper methods.
+// Here are some typical usage patterns:
+//
+//   Variant x = 10;
+//   EXPECT_EQ(*x.get<int>(), 10);
+//
+//   Tensor t(DT_FLOAT, TensorShape({}));
+//   t.flat<float>()(0) = 42.0f;
+//   Variant x = t;
+//   EXPECT_EQ(x.get<Tensor>()->flat<float>()(0), 42.0f);
+//
+// Accessing the stored object:
+//
+// The get<T> function is the main mechanism to access the object stored in the
+// contained. It is type-safe, that is, calling get<T> when the stored object's
+// type is not T, returns a nullptr. A raw pointer to the stored object can be
+// obtained by calling get<void>().
+//
+// Serializing/deserializing Variant object:
+//
+// The Variant class delegates serializing and deserializing operations to the
+// contained object. Helper functions to do these operations are provided for
+// POD data types, tensorflow::Tensor, and protocol buffer objects. However,
+// other classes have to provide Encode/Decode functions to do handle
+// serialization.
+//
+// Objects stored in a Variant object often contain references to other
+// tensorflow::Tensors of primitive types (Eg., a list of tensorflow::Tensors).
+// To efficiently support those use cases, a structure is imposed on the
+// serialization format. Namely, classes should serialize their contents in to a
+// VariantTensorData object:
+//
+// struct VariantTensorData {
+//   string type_name;
+//   string metadata;
+//   std::vector<Tensor> tensors;
+// };
+//
+// Objects with references to other Tensors can simply store those tensors in
+// the `tensors` field, and serialize other metadata content in to the
+// `metadata` field.
+//
+// Serialization example:
+//
+// Foo f = Foo {...};
+// Variant x = f;
+// string serialized_f;
+// x.Encode(&serialized_f);
+//
+// Variant y = Foo(); // default constructed Foo.
+// y.Decode(&serialized_f);
+// EXPECT_EQ(*x.get<Foo>(), *y.get<Foo>());
+//
+class Variant {
+ public:
+  constexpr Variant() noexcept = default;
+
+  Variant(const Variant& other)
+      : value_(other.is_empty() ? std::unique_ptr<ValueInterface>()
+                                : other.value_->Clone()) {}
+
+  Variant(Variant&& other) noexcept = default;
+
+  // Make sure that the type is CopyConstructible and not a tensorflow::Variant
+  // object itself. We want the copy constructor to be chosen for the
+  // tensorflow::Variant case.
+  template <typename T, typename VT = typename std::decay<T>::type,
+            typename std::enable_if<!std::is_same<Variant, VT>::value &&
+                                        std::is_copy_constructible<VT>::value,
+                                    void>::type* = nullptr>
+  Variant(T&& value)  // NOLINT
+      : value_(new Value<VT>(in_place, std::forward<T>(value))) {}
+
+  Variant& operator=(const Variant& rhs) {
+    Variant(rhs).swap(*this);
+    return *this;
+  }
+
+  Variant& operator=(Variant&& rhs) noexcept {
+    Variant(std::move(rhs)).swap(*this);
+    return *this;
+  }
+
+  bool is_empty() const { return value_ == nullptr; }
+
+  void clear() noexcept { value_.reset(); }
+
+  void swap(Variant& other) noexcept { value_.swap(other.value_); }
+
+  TypeIndex TypeId() const {
+    const TypeIndex VoidTypeIndex = MakeTypeIndex<void>();
+    if (is_empty()) {
+      return VoidTypeIndex;
+    }
+    return value_->TypeId();
+  }
+
+  template <typename T>
+  T* get() {
+    const TypeIndex TTypeIndex = MakeTypeIndex<T>();
+    if (is_empty() || (TTypeIndex != TypeId())) {
+      return nullptr;
+    }
+    return std::addressof(static_cast<Variant::Value<T>*>(value_.get())->value);
+  }
+
+  template <typename T>
+  const T* get() const {
+    const TypeIndex TTypeIndex = MakeTypeIndex<T>();
+    if (is_empty() || (TTypeIndex != TypeId())) {
+      return nullptr;
+    }
+    return std::addressof(
+        static_cast<const Variant::Value<T>*>(value_.get())->value);
+  }
+
+  string TypeName() const {
+    if (is_empty()) {
+      return "";
+    }
+    return value_->TypeName();
+  }
+
+  // Serialize the contents of the stored object into `data`.
+  void Encode(VariantTensorData* data) const {
+    if (!is_empty()) {
+      value_->Encode(data);
+    }
+  }
+
+  // Deserialize `data` and update the stored object.
+  bool Decode(const VariantTensorData& data) {
+    if (!is_empty()) {
+      return value_->Decode(data);
+    }
+    return true;
+  }
+
+  // Helper methods to directly serialize/deserialize from strings.
+  void Encode(string* buf) const {
+    if (!is_empty()) {
+      value_->Encode(buf);
+    }
+  }
+  bool Decode(const string& buf) {
+    if (!is_empty()) {
+      return value_->Decode(buf);
+    }
+    return true;
+  }
+
+ private:
+  struct in_place_t {};
+  static constexpr in_place_t in_place{};
+
+  struct ValueInterface {
+    virtual ~ValueInterface() = default;
+    virtual TypeIndex TypeId() const = 0;
+    virtual void* RawPtr() = 0;
+    virtual const void* RawPtr() const = 0;
+    virtual std::unique_ptr<ValueInterface> Clone() const = 0;
+    virtual string TypeName() const = 0;
+    virtual void Encode(VariantTensorData* data) const = 0;
+    virtual bool Decode(const VariantTensorData& data) = 0;
+    virtual void Encode(string* buf) const = 0;
+    virtual bool Decode(const string& data) = 0;
+  };
+
+  template <typename T>
+  struct Value : ValueInterface {
+    template <class... Args>
+    explicit Value(in_place_t /*tag*/, Args&&... args)
+        : value(std::forward<Args>(args)...) {}
+
+    TypeIndex TypeId() const override {
+      const TypeIndex value_type_index =
+          MakeTypeIndex<typename std::decay<T>::type>();
+      return value_type_index;
+    }
+
+    void* RawPtr() override { return &value; }
+
+    const void* RawPtr() const override { return &value; }
+
+    std::unique_ptr<ValueInterface> Clone() const override {
+      return std::unique_ptr<ValueInterface>(new Value(in_place, value));
+    }
+
+    string TypeName() const override { return TypeNameVariant(value); }
+
+    void Encode(VariantTensorData* data) const override {
+      EncodeVariant(value, data);
+    }
+
+    bool Decode(const VariantTensorData& data) override {
+      return DecodeVariant(data, &value);
+    }
+
+    void Encode(string* buf) const override { EncodeVariant(value, buf); }
+
+    bool Decode(const string& buf) override {
+      return DecodeVariant(buf, &value);
+    }
+
+    T value;
+  };
+
+  std::unique_ptr<ValueInterface> value_;
+};
+
+template <>
+void* Variant::get();
+
+template <>
+const void* Variant::get() const;
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_FRAMEWORK_VARIANT_H_
diff --git a/tensorflow/core/framework/variant_encode_decode.h b/tensorflow/core/framework/variant_encode_decode.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ec64448ebb5fddc3b6ad51bb67595d021c75e2f
--- /dev/null
+++ b/tensorflow/core/framework/variant_encode_decode.h
@@ -0,0 +1,222 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_FRAMEWORK_VARIANT_ENCODE_DECODE_H_
+#define TENSORFLOW_FRAMEWORK_VARIANT_ENCODE_DECODE_H_
+
+#include <iostream>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+// The serialization format for Variant objects. Objects with references to
+// other Tensors can simply store those tensors in the `tensors` field, and
+// serialize other metadata content in to the `metadata` field. Objects can
+// optionally set the `type_name` for type-checking before deserializing an
+// object.
+struct VariantTensorData {
+  string type_name;
+  string metadata;
+  std::vector<Tensor> tensors;
+  void ToProto(VariantTensorDataProto* proto) const;
+  bool FromProto(const VariantTensorDataProto& proto);
+};
+
+// Type used for tag-dispatch of the Encode/Decode Variant implementations. This
+// template can determine whether the first type parameter `T` is one of the
+// following:
+//
+// * A POD type (TypeResolver<T, true>)
+// * A tensorflow::Tensor (TypeResolver<T, false, true>)
+// * A protocol buffer (TypeResolver<T, false, false, true>)
+// * None of the above (TypeResolver<T, false, false, false>)
+//
+template <typename T, bool = std::is_pod<typename std::decay<T>::type>::value,
+          bool = std::is_same<typename std::decay<T>::type,
+                              ::tensorflow::Tensor>::value,
+          bool = std::is_base_of<protobuf::MessageLite,
+                                 typename std::decay<T>::type>::value>
+struct TypeResolver {};
+
+// Specialization for POD type
+template <typename T>
+void EncodeVariantImpl(const T& value, TypeResolver<T, true /* is_pod */>,
+                       VariantTensorData* data) {
+  data->metadata.assign(reinterpret_cast<const char*>(&value), sizeof(value));
+}
+
+// Specialization for tensorflow::Tensor
+template <typename T>
+void EncodeVariantImpl(const T& value,
+                       TypeResolver<T, false /* is_pod */, true /* Tensor */>,
+                       VariantTensorData* data) {
+  data->tensors.clear();
+  data->tensors.push_back(value);
+}
+
+// Specialization for protobuf
+template <typename T>
+void EncodeVariantImpl(const T& value,
+                       TypeResolver<T, false /* is_pod */, false /* Tensor */,
+                                    true /* protobuf */>,
+                       VariantTensorData* data) {
+  value.SerializeToString(&data->metadata);
+}
+
+// Specialization for other types
+template <typename T>
+void EncodeVariantImpl(const T& value,
+                       TypeResolver<T, false /* is_pod */, false /* Tensor */,
+                                    false /* protobuf */>,
+                       VariantTensorData* data) {
+  value.Encode(data);
+}
+
+// Specialization for POD type
+template <typename T>
+bool DecodeVariantImpl(const VariantTensorData& data,
+                       TypeResolver<T, true /* is_pod */>, T* value) {
+  std::copy_n(data.metadata.data(), sizeof(*value),
+              reinterpret_cast<char*>(value));
+  return true;
+}
+
+// Specialization for tensorflow::Tensor
+template <typename T>
+bool DecodeVariantImpl(const VariantTensorData& data,
+                       TypeResolver<T, false /* is_pod */, true /* Tensor */>,
+                       T* value) {
+  *value = data.tensors[0];
+  return true;
+}
+
+// Specialization for protobuf
+template <typename T>
+bool DecodeVariantImpl(const VariantTensorData& data,
+                       TypeResolver<T, false /* is_pod */, false /* Tensor */,
+                                    true /* protobuf */>,
+                       T* value) {
+  return value->ParseFromString(data.metadata);
+}
+
+// Specialization for other types
+template <typename T>
+bool DecodeVariantImpl(const VariantTensorData& data,
+                       TypeResolver<T, false /* is_pod */, false /* Tensor */,
+                                    false /* protobuf */>,
+                       T* value) {
+  return value->Decode(data);
+}
+
+template <typename C, typename = void>
+struct has_type_name : std::false_type {};
+
+template <typename C>
+struct has_type_name<
+    C, typename std::enable_if<std::is_same<
+           decltype(std::declval<C>().TypeName()), string>::value>::type>
+    : std::true_type {};
+
+template <typename T, bool = has_type_name<typename std::decay<T>::type>::value,
+          bool = std::is_same<typename std::decay<T>::type,
+                              ::tensorflow::Tensor>::value,
+          bool = std::is_base_of<protobuf::MessageLite,
+                                 typename std::decay<T>::type>::value>
+struct TypeNameResolver {};
+
+template <typename T>
+string TypeNameVariantImpl(const T& value,
+                           TypeNameResolver<T, true /* has_type_name */>) {
+  return value.TypeName();
+}
+
+template <typename T>
+string TypeNameVariantImpl(
+    const T& value,
+    TypeNameResolver<T, false /* has_type_name */, true /* Tensor */>) {
+  return "tensorflow::Tensor";
+}
+
+template <typename T>
+string TypeNameVariantImpl(
+    const T& value, TypeNameResolver<T, false /* has_type_name */,
+                                     false /* Tensor */, true /* protobuf */>) {
+  return value.GetTypeName();
+}
+
+template <typename T>
+string TypeNameVariantImpl(
+    const T& value,
+    TypeNameResolver<T, false /* has_type_name */, false /* Tensor */,
+                     false /* protobuf */>) {
+  return value.TypeName();
+}
+
+template <typename T>
+string TypeNameVariant(const T& value) {
+  return TypeNameVariantImpl(value, TypeNameResolver<T>());
+}
+
+template <typename T>
+void EncodeVariant(const T& value, VariantTensorData* data) {
+  EncodeVariantImpl(value, TypeResolver<T>(), data);
+}
+
+template <typename T>
+bool DecodeVariant(const VariantTensorData& data, T* value) {
+  return DecodeVariantImpl(data, TypeResolver<T>(), value);
+}
+
+template <typename T>
+void EncodeVariant(const T& value, string* buf) {
+  VariantTensorData data;
+  EncodeVariantImpl(value, TypeResolver<T>(), &data);
+  VariantTensorDataProto proto;
+  data.ToProto(&proto);
+  proto.SerializeToString(buf);
+}
+
+template <typename T>
+bool DecodeVariant(const string& buf, T* value) {
+  VariantTensorDataProto proto;
+  if (!proto.ParseFromString(buf)) return false;
+  VariantTensorData data;
+  if (!data.FromProto(proto)) return false;
+  if (!DecodeVariantImpl(data, TypeResolver<T>(), value)) return false;
+  return true;
+}
+
+// Specializations for VariantTensorDataProto
+template <>
+string TypeNameVariant(const VariantTensorDataProto& value);
+template <>
+void EncodeVariant(const VariantTensorDataProto& value,
+                   VariantTensorData* data);
+template <>
+bool DecodeVariant(const VariantTensorData& data,
+                   VariantTensorDataProto* value);
+template <>
+void EncodeVariant(const VariantTensorDataProto& value, string* buf);
+template <>
+bool DecodeVariant(const string& buf, VariantTensorDataProto* value);
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_FRAMEWORK_VARIANT_ENCODE_DECODE_H_
diff --git a/tensorflow/core/framework/variant_test.cc b/tensorflow/core/framework/variant_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c7ffdd28f4463faf29fd82fe1954a95de5ddae6e
--- /dev/null
+++ b/tensorflow/core/framework/variant_test.cc
@@ -0,0 +1,249 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/coding.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+namespace {
+
+template <typename T>
+struct Wrapper {
+  T value;
+  string TypeName() const { return "POD"; }
+};
+
+using Int = Wrapper<int>;
+using Float = Wrapper<float>;
+
+}  // end namespace
+
+TEST(VariantTest, Basic) {
+  Variant x;
+  EXPECT_EQ(x.get<void>(), nullptr);
+
+  x = Int{42};
+
+  EXPECT_NE(x.get<void>(), nullptr);
+  EXPECT_NE(x.get<Int>(), nullptr);
+  EXPECT_EQ(x.get<Int>()->value, 42);
+  EXPECT_EQ(x.TypeName(), "POD");
+}
+
+TEST(VariantTest, ConstGet) {
+  Variant x;
+  EXPECT_EQ(x.get<void>(), nullptr);
+
+  x = Int{42};
+
+  const Variant y = x;
+
+  EXPECT_NE(y.get<void>(), nullptr);
+  EXPECT_NE(y.get<Int>(), nullptr);
+  EXPECT_EQ(y.get<Int>()->value, 42);
+}
+
+TEST(VariantTest, Clear) {
+  Variant x;
+  EXPECT_EQ(x.get<void>(), nullptr);
+
+  x = Int{42};
+
+  EXPECT_NE(x.get<void>(), nullptr);
+  EXPECT_NE(x.get<Int>(), nullptr);
+  EXPECT_EQ(x.get<Int>()->value, 42);
+
+  x.clear();
+  EXPECT_EQ(x.get<void>(), nullptr);
+}
+
+TEST(VariantTest, Tensor) {
+  Variant x;
+  Tensor t(DT_FLOAT, {});
+  t.flat<float>()(0) = 42.0f;
+  x = t;
+
+  EXPECT_NE(x.get<Tensor>(), nullptr);
+  EXPECT_EQ(x.get<Tensor>()->flat<float>()(0), 42.0f);
+  x.get<Tensor>()->flat<float>()(0) += 1.0f;
+  EXPECT_EQ(x.get<Tensor>()->flat<float>()(0), 43.0f);
+  EXPECT_EQ(x.TypeName(), "tensorflow::Tensor");
+}
+
+TEST(VariantTest, TensorProto) {
+  Variant x;
+  TensorProto t;
+  t.set_dtype(DT_FLOAT);
+  t.mutable_tensor_shape()->set_unknown_rank(true);
+  x = t;
+
+  EXPECT_EQ(x.TypeName(), "tensorflow.TensorProto");
+  EXPECT_NE(x.get<TensorProto>(), nullptr);
+  EXPECT_EQ(x.get<TensorProto>()->dtype(), DT_FLOAT);
+  EXPECT_EQ(x.get<TensorProto>()->tensor_shape().unknown_rank(), true);
+}
+
+TEST(VariantTest, CopyValue) {
+  Variant x, y;
+  x = Int{10};
+  y = x;
+
+  EXPECT_EQ(x.get<Int>()->value, 10);
+  EXPECT_EQ(x.get<Int>()->value, y.get<Int>()->value);
+}
+
+TEST(VariantTest, MoveValue) {
+  Variant x;
+  x = []() -> Variant {
+    Variant y;
+    y = Int{10};
+    return y;
+  }();
+  EXPECT_EQ(x.get<Int>()->value, 10);
+}
+
+TEST(VariantTest, TypeMismatch) {
+  Variant x;
+  x = Int{10};
+  EXPECT_EQ(x.get<float>(), nullptr);
+  EXPECT_EQ(x.get<int>(), nullptr);
+  EXPECT_NE(x.get<Int>(), nullptr);
+}
+
+struct TensorList {
+  void Encode(VariantTensorData* data) const { data->tensors = vec; }
+
+  bool Decode(const VariantTensorData& data) {
+    vec = data.tensors;
+    return true;
+  }
+
+  string TypeName() const { return "TensorList"; }
+
+  std::vector<Tensor> vec;
+};
+
+TEST(VariantTest, TensorListTest) {
+  Variant x;
+
+  TensorList vec;
+  for (int i = 0; i < 4; ++i) {
+    Tensor elem(DT_INT32, {1});
+    elem.flat<int>()(0) = i;
+    vec.vec.push_back(elem);
+  }
+
+  for (int i = 0; i < 4; ++i) {
+    Tensor elem(DT_FLOAT, {1});
+    elem.flat<float>()(0) = 2 * i;
+    vec.vec.push_back(elem);
+  }
+
+  x = vec;
+
+  EXPECT_EQ(x.TypeName(), "TensorList");
+  const TensorList& stored_vec = *x.get<TensorList>();
+  for (int i = 0; i < 4; ++i) {
+    EXPECT_EQ(stored_vec.vec[i].flat<int>()(0), i);
+  }
+  for (int i = 0; i < 4; ++i) {
+    EXPECT_EQ(stored_vec.vec[i + 4].flat<float>()(0), 2 * i);
+  }
+
+  VariantTensorData serialized;
+  x.Encode(&serialized);
+
+  Variant y = TensorList();
+  y.Decode(serialized);
+
+  const TensorList& decoded_vec = *x.get<TensorList>();
+  for (int i = 0; i < 4; ++i) {
+    EXPECT_EQ(decoded_vec.vec[i].flat<int>()(0), i);
+  }
+  for (int i = 0; i < 4; ++i) {
+    EXPECT_EQ(decoded_vec.vec[i + 4].flat<float>()(0), 2 * i);
+  }
+}
+
+TEST(VariantTest, VariantArray) {
+  Variant x[2];
+  x[0] = Int{2};
+  x[1] = Float{2.0f};
+
+  EXPECT_EQ(x[0].get<Int>()->value, 2);
+  EXPECT_EQ(x[1].get<Float>()->value, 2.0f);
+}
+
+TEST(VariantTest, PodUpdate) {
+  struct Pod {
+    int x;
+    float y;
+
+    string TypeName() const { return "POD"; }
+  };
+
+  Variant x = Pod{10, 20.f};
+  EXPECT_NE(x.get<Pod>(), nullptr);
+  EXPECT_EQ(x.TypeName(), "POD");
+
+  x.get<Pod>()->x += x.get<Pod>()->y;
+  EXPECT_EQ(x.get<Pod>()->x, 30);
+}
+
+TEST(VariantTest, EncodeDecodePod) {
+  struct Pod {
+    int x;
+    float y;
+
+    string TypeName() const { return "POD"; }
+  };
+
+  Variant x;
+  Pod p{10, 20.0f};
+  x = p;
+
+  VariantTensorData serialized;
+  x.Encode(&serialized);
+
+  Variant y;
+  y = Pod();
+  y.Decode(serialized);
+
+  EXPECT_EQ(p.x, y.get<Pod>()->x);
+  EXPECT_EQ(p.y, y.get<Pod>()->y);
+}
+
+TEST(VariantTest, EncodeDecodeTensor) {
+  Variant x;
+  Tensor t(DT_INT32, {});
+  t.flat<int>()(0) = 42;
+  x = t;
+
+  VariantTensorData serialized;
+  x.Encode(&serialized);
+
+  Variant y = Tensor();
+  y.Decode(serialized);
+  EXPECT_EQ(x.get<Tensor>()->flat<int>()(0), y.get<Tensor>()->flat<int>()(0));
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/framework/versions.cc b/tensorflow/core/framework/versions.cc
index 58937556d9ba56798dc88d02e8a57081edc51e9c..3ff0723ceec2576948f7e840ab0b45d2a741f215 100644
--- a/tensorflow/core/framework/versions.cc
+++ b/tensorflow/core/framework/versions.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/framework/versions.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/public/version.h"
 
diff --git a/tensorflow/core/framework/versions.h b/tensorflow/core/framework/versions.h
index 01429b26a633d66007228d4c741a135bf7429feb..e8f07f9016a03ef0d634d781405d55980a8a1e55 100644
--- a/tensorflow/core/framework/versions.h
+++ b/tensorflow/core/framework/versions.h
@@ -16,11 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_FRAMEWORK_VERSIONS_H_
 #define TENSORFLOW_FRAMEWORK_VERSIONS_H_
 
-#include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/framework/versions.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 
+class VersionDef;
+
 // Check whether data with the given versions is compatible with the given
 // consumer and min producer.  upper_name and lower_name are used to form
 // error messages upon failure.  Example usage:
diff --git a/tensorflow/core/graph/costmodel.cc b/tensorflow/core/graph/costmodel.cc
index f798af85e15e36da96fadc664dd036ad421f843b..3ed32068ae19b73f93b2b2bd12d77712a1273cfb 100644
--- a/tensorflow/core/graph/costmodel.cc
+++ b/tensorflow/core/graph/costmodel.cc
@@ -16,8 +16,10 @@ limitations under the License.
 #include "tensorflow/core/graph/costmodel.h"
 
 #include <vector>
+#include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/tensor_description.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/core/graph/gradients.cc b/tensorflow/core/graph/gradients.cc
index 09c3d8d5679bf3c9b11d7b0b896219ef989fb6ac..d3e7ff781c270489f755913edf5ea9c8d3113c8d 100644
--- a/tensorflow/core/graph/gradients.cc
+++ b/tensorflow/core/graph/gradients.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/executor.h"
 #include "tensorflow/core/common_runtime/graph_optimizer.h"
 #include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index dcb8520cf73f20f459df8f2c0995ad763bad98b6..f6586f0519792f617c5c19299229798bc9ac80a6 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -16,8 +16,11 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 
 #include <vector>
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -28,6 +31,21 @@ namespace tensorflow {
 
 const int Graph::kControlSlot = -1;
 
+class NodeProperties {
+ public:
+  NodeProperties(const OpDef* op_def, const NodeDef& node_def,
+                 const DataTypeSlice inputs, const DataTypeSlice outputs)
+      : op_def(op_def),
+        node_def(node_def),
+        input_types(inputs.begin(), inputs.end()),
+        output_types(outputs.begin(), outputs.end()) {}
+
+  const OpDef* op_def;  // not owned
+  NodeDef node_def;
+  const DataTypeVector input_types;
+  const DataTypeVector output_types;
+};
+
 // Node
 
 #define REF_CLASS(key, value) \
@@ -93,26 +111,17 @@ Node::Node()
       props_(nullptr),
       assigned_device_name_index_(0) {}
 
-Node::~Node() {
-  if (props_) {
-    props_->Unref();
-  }
-}
-
-void Node::Initialize(int id, int cost_id, Properties* props) {
+void Node::Initialize(int id, int cost_id,
+                      std::shared_ptr<NodeProperties> props) {
   DCHECK_EQ(id_, -1);
   DCHECK(in_edges_.empty());
   DCHECK(out_edges_.empty());
   id_ = id;
   cost_id_ = cost_id;
 
-  // Unref the old, assign the new properties.
-  if (props_) {
-    props_->Unref();
-  }
-  props_ = props;
+  props_ = std::move(props);
   // Initialize the class_ based on the type string
-  class_ = GetNodeClassForOp(props->node_def_.op());
+  class_ = GetNodeClassForOp(props_->node_def.op());
 }
 
 void Node::Clear() {
@@ -121,15 +130,33 @@ void Node::Clear() {
   id_ = -1;
   cost_id_ = -1;
   class_ = NC_UNINITIALIZED;
+  props_.reset();
+  assigned_device_name_index_ = 0;
+}
 
-  if (props_) {
-    props_->Unref();
-    props_ = nullptr;
-  }
+const string& Node::name() const { return props_->node_def.name(); }
+const string& Node::type_string() const { return props_->node_def.op(); }
+const NodeDef& Node::def() const { return props_->node_def; }
+const OpDef& Node::op_def() const { return *props_->op_def; }
 
-  assigned_device_name_index_ = 0;
+int32 Node::num_inputs() const { return props_->input_types.size(); }
+DataType Node::input_type(int32 i) const { return props_->input_types[i]; }
+const DataTypeVector& Node::input_types() const { return props_->input_types; }
+
+int32 Node::num_outputs() const { return props_->output_types.size(); }
+DataType Node::output_type(int32 o) const { return props_->output_types[o]; }
+const DataTypeVector& Node::output_types() const {
+  return props_->output_types;
 }
 
+AttrSlice Node::attrs() const { return AttrSlice(def()); }
+
+const protobuf::RepeatedPtrField<string>& Node::requested_inputs() const {
+  return def().input();
+}
+
+const string& Node::requested_device() const { return def().device(); }
+
 gtl::iterator_range<NeighborIter> Node::out_nodes() const {
   return gtl::make_range(NeighborIter(out_edges_.begin(), false),
                          NeighborIter(out_edges_.end(), false));
@@ -141,19 +168,25 @@ gtl::iterator_range<NeighborIter> Node::in_nodes() const {
 }
 
 void Node::MaybeCopyOnWrite() {
-  // Properties may be shared between Nodes. Make a copy if so.
-  if (!props_->RefCountIsOne()) {
-    Properties* new_props =
-        new Properties(props_->op_def_, props_->node_def_, props_->input_types_,
-                       props_->output_types_);
-    props_->Unref();
-    props_ = new_props;
+  // NodeProperties may be shared between Nodes. Make a copy if so.
+  if (!props_.unique()) {
+    props_ = std::make_shared<NodeProperties>(*props_);
   }
 }
 
+AttrValue* Node::AddAttrHelper(const string& name) {
+  MaybeCopyOnWrite();
+  return &((*props_->node_def.mutable_attr())[name]);
+}
+
 void Node::ClearAttr(const string& name) {
   MaybeCopyOnWrite();
-  (*props_->node_def_.mutable_attr()).erase(name);
+  (*props_->node_def.mutable_attr()).erase(name);
+}
+
+void Node::set_requested_device(const string& device) {
+  MaybeCopyOnWrite();
+  props_->node_def.set_device(device);
 }
 
 Status Node::input_edge(int idx, const Edge** e) const {
@@ -225,24 +258,15 @@ Status Node::input_node(int idx, const Node** const_n) const {
   return Status::OK();
 }
 
-// Node::Properties
-
-Node::Properties::Properties(const OpDef* op_def, const NodeDef& node_def,
-                             const DataTypeSlice inputs,
-                             const DataTypeSlice outputs)
-    : op_def_(op_def),
-      node_def_(node_def),
-      input_types_(inputs.begin(), inputs.end()),
-      output_types_(outputs.begin(), outputs.end()) {}
-
-Node::Properties::~Properties() {}
 
 // Graph
 
 Graph::Graph(const OpRegistryInterface* ops)
-    : ops_(ops, FunctionDefLibrary()), arena_(8 << 10 /* 8kB */) {
-  versions_.set_producer(TF_GRAPH_DEF_VERSION);
-  versions_.set_min_consumer(TF_GRAPH_DEF_VERSION_MIN_CONSUMER);
+    : ops_(ops, FunctionDefLibrary()),
+      versions_(new VersionDef),
+      arena_(8 << 10 /* 8kB */) {
+  versions_->set_producer(TF_GRAPH_DEF_VERSION);
+  versions_->set_min_consumer(TF_GRAPH_DEF_VERSION_MIN_CONSUMER);
 
   // Initialize the name interning table for assigned_device_name.
   device_names_.push_back("");
@@ -286,6 +310,9 @@ Graph::~Graph() {
   // destroy them.
 }
 
+const VersionDef& Graph::versions() const { return *versions_; }
+void Graph::set_versions(const VersionDef& versions) { *versions_ = versions; }
+
 Node* Graph::AddNode(const NodeDef& node_def, Status* status) {
   const OpDef* op_def;
   status->Update(ops_.LookUpOpDef(node_def.op(), &op_def));
@@ -300,16 +327,15 @@ Node* Graph::AddNode(const NodeDef& node_def, Status* status) {
   }
 
   Node* node = AllocateNode(
-      new Node::Properties(op_def, node_def, inputs, outputs), nullptr);
+      std::make_shared<NodeProperties>(op_def, node_def, inputs, outputs),
+      nullptr);
   return node;
 }
 
 Node* Graph::CopyNode(Node* node) {
   DCHECK(!node->IsSource());
   DCHECK(!node->IsSink());
-  Node::Properties* props = node->properties();
-  props->Ref();
-  Node* copy = AllocateNode(props, node);
+  Node* copy = AllocateNode(node->props_, node);
   copy->set_assigned_device_name(node->assigned_device_name());
 
   // Since the OpDef of a function may be owned by the Graph that owns 'node',
@@ -317,9 +343,9 @@ Node* Graph::CopyNode(Node* node) {
   // node properties with the updated OpDef.
   const OpDef* op_def;
   TF_CHECK_OK(ops_.LookUpOpDef(node->type_string(), &op_def));
-  if (op_def != props->op_def_) {
+  if (op_def != node->props_->op_def) {
     copy->MaybeCopyOnWrite();
-    copy->props_->op_def_ = op_def;
+    copy->props_->op_def = op_def;
   }
 
   return copy;
@@ -392,35 +418,7 @@ void Graph::RemoveEdge(const Edge* e) {
 }
 
 Status Graph::AddFunctionLibrary(const FunctionDefLibrary& fdef_lib) {
-  for (const FunctionDef& fdef : fdef_lib.function()) {
-    const FunctionDef* preexisting_fdef = ops_.Find(fdef.signature().name());
-    if (preexisting_fdef != nullptr) {
-      if (!FunctionDefsEqual(*preexisting_fdef, fdef)) {
-        return errors::InvalidArgument(
-            "Cannot add function '", fdef.signature().name(),
-            "' because a different function with the same name already "
-            "exists.");
-      }
-      // Ignore duplicate FunctionDefs
-      continue;
-    }
-    TF_RETURN_IF_ERROR(ops_.AddFunctionDef(fdef));
-  }
-  for (const GradientDef& grad : fdef_lib.gradient()) {
-    string preexisting_grad_func = ops_.FindGradient(grad.function_name());
-    if (!preexisting_grad_func.empty()) {
-      if (preexisting_grad_func != grad.gradient_func()) {
-        return errors::InvalidArgument(
-            "Cannot assign gradient function '", grad.gradient_func(), "' to '",
-            grad.function_name(), "' because it already has gradient function ",
-            "'", preexisting_grad_func, "'");
-      }
-      // Ignore duplicate GradientDefs
-      continue;
-    }
-    TF_RETURN_IF_ERROR(ops_.AddGradientDef(grad));
-  }
-  return Status::OK();
+  return ops_.AddLibrary(fdef_lib);
 }
 
 namespace {
@@ -481,7 +479,11 @@ void Graph::ToGraphDefSubRange(GraphDef* graph_def, int from_node_id) const {
     for (size_t i = 0; i < inputs.size(); ++i) {
       const Edge* edge = inputs[i];
       if (edge == nullptr) {
-        node_def->add_input(node->requested_inputs()[i]);
+        if (i < node->requested_inputs().size()) {
+          node_def->add_input(node->requested_inputs()[i]);
+        } else {
+          node_def->add_input("");
+        }
       } else {
         const Node* src = edge->src();
         if (!src->IsOp()) continue;
@@ -502,7 +504,8 @@ bool Graph::IsValidNode(Node* node) const {
   return nodes_[id] == node;
 }
 
-Node* Graph::AllocateNode(Node::Properties* props, const Node* cost_node) {
+Node* Graph::AllocateNode(std::shared_ptr<NodeProperties> props,
+                          const Node* cost_node) {
   Node* node = nullptr;
   if (free_nodes_.empty()) {
     node = new (arena_.Alloc(sizeof(Node))) Node;  // placement new
@@ -513,7 +516,7 @@ Node* Graph::AllocateNode(Node::Properties* props, const Node* cost_node) {
   node->graph_ = this;
   const int id = nodes_.size();
   int cost_id = cost_node ? cost_node->cost_id() : id;
-  node->Initialize(id, cost_id, props);
+  node->Initialize(id, cost_id, std::move(props));
   nodes_.push_back(node);
   ++num_nodes_;
   return node;
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 8cb270170e944a614142ebae7fb7ec63df08ae8f..78a0e8fd79f8e89487a6b477337c864157d2d273 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -41,10 +41,10 @@ limitations under the License.
 #include <string>
 #include <vector>
 #include "tensorflow/core/framework/function.h"
-#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/framework/versions.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/graph/edgeset.h"
 #include "tensorflow/core/lib/core/arena.h"
 #include "tensorflow/core/lib/core/refcount.h"
@@ -59,18 +59,21 @@ namespace tensorflow {
 class Edge;
 class EdgeSetTest;
 class Graph;
+class GraphDef;
 class Node;
+class VersionDef;
 
 class NeighborIter;  // Declared below
 class NodeIter;      // Declared below
+class NodeProperties;  // Defined in .cc
 
 class Node {
  public:
   string DebugString() const;
   int id() const { return id_; }
   int cost_id() const { return cost_id_; }
-  const string& name() const { return props_->node_def_.name(); }
-  const string& type_string() const { return props_->node_def_.op(); }
+  const string& name() const;
+  const string& type_string() const;
 
   // def() provides the NodeDef the user supplied, but the specifics
   // of this Node may have changed due to placement, optimization, etc.
@@ -82,21 +85,25 @@ class Node {
   //   the actual assigned device, see assigned_device_name() below;
   // * def().attr() is authoritative.
   // TODO(irving): Replace with NodeInfo.
-  const NodeDef& def() const { return props_->node_def_; }
-  const OpDef& op_def() const { return *props_->op_def_; }
+  const NodeDef& def() const;
+  const OpDef& op_def() const;
 
   // input and output types
-  int32 num_inputs() const { return props_->input_types_.size(); }
-  DataType input_type(int32 i) const { return props_->input_types_[i]; }
-  const DataTypeVector& input_types() const { return props_->input_types_; }
+  int32 num_inputs() const;
+  DataType input_type(int32 i) const;
+  const DataTypeVector& input_types() const;
 
-  int32 num_outputs() const { return props_->output_types_.size(); }
-  DataType output_type(int32 o) const { return props_->output_types_[o]; }
-  const DataTypeVector& output_types() const { return props_->output_types_; }
+  int32 num_outputs() const;
+  DataType output_type(int32 o) const;
+  const DataTypeVector& output_types() const;
 
   // The device requested by the user.  For the actual assigned device,
   // use assigned_device_name() below.
-  const string& requested_device() const { return def().device(); }
+  const string& requested_device() const;
+
+  // This changes the user requested device but not necessarily the device that
+  // on which the operation will run.
+  void set_requested_device(const string& device);
 
   // This gives the device the runtime has assigned this node to.  If
   // you want the device the user requested, use def().device() instead.
@@ -113,12 +120,10 @@ class Node {
   void set_assigned_device_name_index(int index);
 
   // Read only access to attributes
-  AttrSlice attrs() const { return AttrSlice(def()); }
+  AttrSlice attrs() const;
 
   // Inputs requested by the NodeDef.  For the actual inputs, use in_edges.
-  const protobuf::RepeatedPtrField<string>& requested_inputs() const {
-    return def().input();
-  }
+  const protobuf::RepeatedPtrField<string>& requested_inputs() const;
 
   // Get the neighboring nodes via edges either in or out of this node.
   gtl::iterator_range<NeighborIter> in_nodes() const;
@@ -162,8 +167,7 @@ class Node {
 
   template <typename T>
   void AddAttr(const string& name, const T& val) {
-    MaybeCopyOnWrite();
-    SetAttrValue(val, &((*props_->node_def_.mutable_attr())[name]));
+    SetAttrValue(val, AddAttrHelper(name));
   }
 
   void ClearAttr(const string& name);
@@ -183,38 +187,22 @@ class Node {
  private:
   friend class Graph;
   Node();
-  ~Node();
-
-  class Properties : public core::RefCounted {
-   public:
-    Properties(const OpDef* op_def, const NodeDef& node_def,
-               const DataTypeSlice inputs, const DataTypeSlice outputs);
-
-    const OpDef* op_def_;  // not owned
-    NodeDef node_def_;
-    const DataTypeVector input_types_;
-    const DataTypeVector output_types_;
 
-   private:
-    // Destructor invoked when last reference goes away via Unref()
-    virtual ~Properties();
-    TF_DISALLOW_COPY_AND_ASSIGN(Properties);
-  };
+  NodeProperties* properties() const { return props_.get(); }
 
-  Properties* properties() const { return props_; }
+  void Initialize(int id, int cost_id, std::shared_ptr<NodeProperties> props);
 
-  // Initialize() adopts a reference to props, and so is suitable if props was
-  // just allocated or you call props->Ref() to increment the reference
-  // count for a props being held by another Node.
-  void Initialize(int id, int cost_id, Properties* props);
   // Releases memory from props_, in addition to restoring *this to its
   // uninitialized state.
   void Clear();
+
   // Make a copy of the Node's props_ if props_ is shared with
   // other nodes. This must be called before mutating properties,
   // e.g. in AddAttr.
   void MaybeCopyOnWrite();
 
+  AttrValue* AddAttrHelper(const string& name);
+
   // A set of mutually exclusive classes for different kinds of nodes,
   // class_ is initialized in the Node::Initialize routine based on the
   // node's type_string().
@@ -252,7 +240,10 @@ class Node {
   EdgeSet in_edges_;
   EdgeSet out_edges_;
 
-  Properties* props_;
+  // NOTE(skyewm): inheriting from core::RefCounted may have a slight
+  // performance benefit over using shared_ptr, at the cost of manual ref
+  // counting
+  std::shared_ptr<NodeProperties> props_;
 
   // Index within Graph::device_names_ of the name of device assigned
   // to perform this computation.
@@ -385,8 +376,8 @@ class Graph {
   static const int kControlSlot;
 
   // The GraphDef version range of this graph (see graph.proto).
-  const VersionDef& versions() const { return versions_; }
-  void set_versions(const VersionDef& versions) { versions_ = versions; }
+  const VersionDef& versions() const;
+  void set_versions(const VersionDef& versions);
 
   // Adds a new node to this graph, and returns it. Infers the Op and
   // input/output types for the node. *this owns the returned instance.
@@ -519,14 +510,17 @@ class Graph {
   // If cost_node is non-null, then cost accounting (in CostModel)
   // will be associated with that node rather than the new one being
   // created.
-  Node* AllocateNode(Node::Properties* props, const Node* cost_node);
+  //
+  // Ownership of the returned Node is not transferred to caller.
+  Node* AllocateNode(std::shared_ptr<NodeProperties> props,
+                     const Node* cost_node);
   void ReleaseNode(Node* node);
 
   // Registry of all known ops, including functions.
   FunctionLibraryDefinition ops_;
 
   // GraphDef versions
-  VersionDef versions_;
+  const std::unique_ptr<VersionDef> versions_;
 
   // Allocator which will give us good locality.
   core::Arena arena_;
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index 10f110686fba2e1cd76e56f0a816392c5155237e..582c8727c6dc697d0d48319e6d69fbf656b8434b 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -25,9 +25,12 @@ limitations under the License.
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/versions.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/tensor_id.h"
@@ -44,6 +47,11 @@ inline bool IsMerge(const NodeDef& node_def) {
   return node_def.op() == "Merge" || node_def.op() == "RefMerge";
 }
 
+inline bool IsNextIteration(const NodeDef& node_def) {
+  return node_def.op() == "NextIteration" ||
+         node_def.op() == "RefNextIteration";
+}
+
 bool IsValidNodeName(StringPiece s, bool allow_internal_ops) {
   using ::tensorflow::strings::Scanner;
   return Scanner(s)
@@ -364,24 +372,54 @@ Status GraphConstructor::BuildNodeIndex() {
   return Status::OK();
 }
 
+std::unordered_set<string> GetNextIterationNodes(
+    const GraphConstructor::NodeDefSlice& node_defs) {
+  std::unordered_set<string> next_iteration_nodes;
+
+  for (int n = 0; n < node_defs.size(); ++n) {
+    const NodeDef& node_def = *node_defs[n];
+    if (IsNextIteration(node_def)) {
+      next_iteration_nodes.insert(node_def.name());
+    }
+  }
+
+  return next_iteration_nodes;
+}
+
 Status GraphConstructor::InitFromEdges() {
   const int num_nodes = node_defs_.size();
   pending_count_.reserve(num_nodes);
   outputs_.resize(num_nodes);
+  std::unordered_set<string> next_iteration_nodes_ =
+      GetNextIterationNodes(node_defs_);
 
   // Parse the inputs for each node.
   for (int n = 0; n < num_nodes; ++n) {
     const NodeDef& node_def = *node_defs_[n];
     if (IsMerge(node_def)) {
-      // for merge only wait for one non-control input.
+      // Cycles in the graph are only allowed for while loops. A while loop is
+      // identified by an edge from a NextIteration node to a Merge node. For
+      // such Merge nodes, only wait for one non-control input before
+      // considering the node ready to process in Convert().
       int32 num_control_edges = 0;
+      bool has_loop_back_edge = false;
       for (int i = 0; i < node_def.input_size(); ++i) {
         StringPiece input_name(node_def.input(i));
         if (input_name.starts_with("^")) {
           num_control_edges++;
+        } else {
+          TensorId id(ParseTensorName(input_name));
+          if (next_iteration_nodes_.find(id.first.ToString()) !=
+              next_iteration_nodes_.end()) {
+            has_loop_back_edge = true;
+          }
         }
       }
-      pending_count_.push_back(num_control_edges + 1);
+      if (has_loop_back_edge) {
+        pending_count_.push_back(num_control_edges + 1);
+      } else {
+        pending_count_.push_back(node_def.input_size());
+      }
     } else {
       pending_count_.push_back(node_def.input_size());
     }
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index 8abf21235e036dd0119030090ff92686a270b40c..f222b9b5f1dd9615f904910cd58ec4f7121529fd 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -1871,25 +1872,29 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ControlDepsWithCycle) {
   // new_input
   opts.input_map[TensorId("new_input", 0)] = TensorId("input", 0);
 
-  // ImportGraphDef only allows backedges into merge nodes (since backedges are
-  // only expected in while loops)
+  // ImportGraphDef only allows backedges into merge nodes that are part of
+  // while loops (since backedges are only expected in while loops)
   ExpectOK(
       R"EOF(
       node { name: 'new_input' op: 'TestInput' }
-      node { name: 'merge' op: 'Merge' input: [ 'new_input:0', 't1:0' ]
+      node { name: 'merge' op: 'Merge' input: [ 'new_input:0', 'next:0' ]
              attr { key: "N" value: { i: 2 } }
              attr { key: "T" value: { type: DT_FLOAT } } }
       node { name: 't1' op: 'TestMul' input: [ 'merge:0', 'merge:0' ] }
+      node { name: 'next' op: 'NextIteration' input: ['t1:0']
+             attr { key: "T" value: { type: DT_FLOAT } } }
       )EOF",
       opts, &refiner);
 
   EXPECT_TRUE(HasNode("new_input"));
   EXPECT_TRUE(HasNode("merge"));
   EXPECT_TRUE(HasNode("t1"));
+  EXPECT_TRUE(HasNode("next"));
 
   // Sanity check we created cycle
   EXPECT_TRUE(HasEdge("merge", 0, "t1", 0));
-  EXPECT_TRUE(HasEdge("t1", 0, "merge", 1));
+  EXPECT_TRUE(HasEdge("t1", 0, "next", 0));
+  EXPECT_TRUE(HasEdge("next", 0, "merge", 1));
 
   // Test that control dep was added to exactly one node of cycle
   EXPECT_TRUE(HasControlEdge("W1", "merge"));
@@ -1899,13 +1904,17 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ControlDepsWithCycle) {
   Node* merge = FindNode("merge");
   ASSERT_EQ(merge->requested_inputs().size(), 3);
   EXPECT_EQ(merge->requested_inputs()[0], "input:0");
-  EXPECT_EQ(merge->requested_inputs()[1], "t1:0");
+  EXPECT_EQ(merge->requested_inputs()[1], "next:0");
   EXPECT_EQ(merge->requested_inputs()[2], "^W1");
 
   Node* t1 = FindNode("t1");
   ASSERT_EQ(t1->requested_inputs().size(), 2);
   EXPECT_EQ(t1->requested_inputs()[0], "merge:0");
   EXPECT_EQ(t1->requested_inputs()[1], "merge:0");
+
+  Node* next = FindNode("next");
+  ASSERT_EQ(next->requested_inputs().size(), 1);
+  EXPECT_EQ(next->requested_inputs()[0], "t1:0");
 }
 
 TEST_F(GraphConstructorTest, ImportGraphDef_ControlDepsErrors) {
diff --git a/tensorflow/core/graph/graph_def_builder_test.cc b/tensorflow/core/graph/graph_def_builder_test.cc
index 867eca0c41f46bb170accc2e4464fce6faf1cf6f..e85de71ef79988199cd194274f2ef9986e86d350 100644
--- a/tensorflow/core/graph/graph_def_builder_test.cc
+++ b/tensorflow/core/graph/graph_def_builder_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/graph/graph_def_builder.h"
 
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index f8c6895dfa164f9c30d2da925f64f8c2aab24d9e..750e18a9ca03bd4a3a8401beae2a92223a7427ef 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -16,18 +16,23 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_partition.h"
 
 #include <deque>
+#include <queue>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/graph/costmodel.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
@@ -37,6 +42,15 @@ namespace tensorflow {
 
 namespace {
 
+inline bool IsMerge(const NodeDef& node_def) {
+  return node_def.op() == "Merge" || node_def.op() == "RefMerge";
+}
+
+inline bool IsNextIteration(const NodeDef& node_def) {
+  return node_def.op() == "NextIteration" ||
+         node_def.op() == "RefNextIteration";
+}
+
 struct DupRecvKey {
   int src_node_id;           // Edge's src node id
   int src_output_slot;       // Edge's src node output slot
@@ -721,7 +735,106 @@ Status AddControlFlow(const PartitionOptions& opts, Graph* g,
   return Status::OK();
 }
 
-}  // end namespace
+struct PriorityTopoSortNode {
+  PriorityTopoSortNode(const NodeDef* n, int64 st) : node(n), start_time(st) {}
+
+  const NodeDef* node;
+  int64 start_time;
+};
+
+struct PriorityTopoSortNodeGreater {
+  bool operator()(const PriorityTopoSortNode& left,
+                  const PriorityTopoSortNode& right) {
+    return left.start_time > right.start_time;
+  }
+};
+
+}  // namespace
+
+// Returns in <nodes> the nodes that should participate in epoch-based recv
+// scheduling, along with their times; <nodes> is ordered by increasing
+// start_time. Returns in <node_to_start_time_out> the timing for all nodes,
+// even those not in <nodes>.
+//
+// Comparing to sorting on the node's start time only, this also processes the
+// nodes in dependency order, and updates start times to ensure a node's
+// start_time > the start time for all dependencies.
+//
+// Note that graph_partition_test.cc accesses this function for testing, even
+// though it's not declared in the header.
+Status TopologicalSortNodesWithTimePriority(
+    const GraphDef* gdef, std::vector<std::pair<const NodeDef*, int64>>* nodes,
+    std::unordered_map<const NodeDef*, int64>* node_to_start_time_out) {
+  // Queue of nodes to process; lowest start time is returned first.
+  std::priority_queue<PriorityTopoSortNode, std::vector<PriorityTopoSortNode>,
+                      PriorityTopoSortNodeGreater>
+      q;
+  std::unordered_map<const NodeDef*, int64> node_to_start_time;
+  auto enqueue = [&q, &node_to_start_time](const NodeDef* node) {
+    const int64 start_time = node_to_start_time[node];
+    q.emplace(node, start_time);
+  };
+
+  // Build initial structures, initial contents of queue.
+  std::unordered_map<string, std::vector<const NodeDef*>> node_to_output_nodes;
+  std::unordered_map<const NodeDef*, int> inputs_needed;
+  for (int n = 0; n < gdef->node_size(); ++n) {
+    const NodeDef* ndef = &gdef->node(n);
+    for (int i = 0; i < ndef->input_size(); ++i) {
+      node_to_output_nodes[ParseTensorName(ndef->input(i)).first.ToString()]
+          .push_back(ndef);
+    }
+    int64 start_time;
+    TF_RETURN_IF_ERROR(GetNodeAttr(*ndef, "_start_time", &start_time));
+    node_to_start_time[ndef] = start_time;
+    inputs_needed[ndef] = ndef->input_size();
+    if (ndef->input_size() == 0) {
+      enqueue(ndef);
+    }
+  }
+
+  // Determine which merge nodes are parts of loops; these
+  // need to happen in the traversal after all non-NextIteration inputs
+  // are run.
+  for (int n = 0; n < gdef->node_size(); ++n) {
+    const NodeDef* ndef = &gdef->node(n);
+    if (IsNextIteration(*ndef)) {
+      for (const NodeDef* n : node_to_output_nodes[ndef->name()]) {
+        if (IsMerge(*n)) {
+          // n is a merge that is part of a loop structure.
+          // It doesn't need to wait for this NextIteration loop
+          // when doing the traversal.
+          --inputs_needed[n];
+        }
+      }
+    }
+  }
+
+  // Traverse.
+  std::vector<std::pair<const NodeDef*, int64>> start_times;
+  start_times.reserve(gdef->node_size());
+  while (!q.empty()) {
+    PriorityTopoSortNode cur = q.top();
+    q.pop();
+
+    start_times.emplace_back(cur.node, cur.start_time);
+
+    for (const NodeDef* n : node_to_output_nodes[cur.node->name()]) {
+      auto& output_start_time = node_to_start_time[n];
+      if (output_start_time <= cur.start_time) {
+        output_start_time = cur.start_time + 1;
+      }
+      if (--inputs_needed[n] == 0) {
+        enqueue(n);
+      }
+    }
+  }
+
+  // Done.
+  nodes->swap(start_times);
+  node_to_start_time_out->swap(node_to_start_time);
+  return Status::OK();
+}
 
 Status AddControlEdges(const PartitionOptions& opts,
                        std::unordered_map<string, GraphDef>* partitions) {
@@ -730,27 +843,16 @@ Status AddControlEdges(const PartitionOptions& opts,
   const int num_epochs = 100;
   const int prefetch = 6;
 
-  typedef std::pair<const NodeDef*, int64> NodeStartTime;
   for (auto& part : *partitions) {
     GraphDef* gdef = &part.second;
-
-    std::vector<NodeStartTime> start_times;
-    start_times.resize(gdef->node_size());
-    for (int n = 0; n < gdef->node_size(); ++n) {
-      const NodeDef& ndef = gdef->node(n);
-      int64 start_time;
-      status = GetNodeAttr(ndef, "_start_time", &start_time);
-      if (!status.ok()) {
-        return status;
-      }
-      start_times[n] = std::make_pair(&ndef, start_time);
+    std::vector<std::pair<const NodeDef*, int64>> start_times;
+    std::unordered_map<const NodeDef*, int64> node_to_start_time;
+    status = TopologicalSortNodesWithTimePriority(gdef, &start_times,
+                                                  &node_to_start_time);
+    if (!status.ok()) {
+      return status;
     }
 
-    // Sort the nodes based on their start times.
-    std::sort(
-        start_times.begin(), start_times.end(),
-        [](NodeStartTime x, NodeStartTime y) { return x.second < y.second; });
-
     // Add a dummy node for every epoch, and add a control edge from the
     // "last" node in the preceding epoch to the dummy node.
     string device_name = gdef->node(0).device();
@@ -782,12 +884,8 @@ Status AddControlEdges(const PartitionOptions& opts,
     for (int n = 0; n < gdef->node_size(); ++n) {
       NodeDef* ndef = gdef->mutable_node(n);
       if (ndef->op() == "_Recv") {
-        int64 start_time;
-        status = GetNodeAttr(*ndef, "_start_time", &start_time);
-        if (!status.ok()) {
-          return status;
-        }
-        int recv_epoch = start_time / resolution;
+        const int64 start_time = node_to_start_time[ndef];
+        const int recv_epoch = start_time / resolution;
         if (recv_epoch >= prefetch) {
           NodeDef* dummy = dummys[recv_epoch - prefetch];
           AddInput(ndef, dummy->name(), Graph::kControlSlot);
@@ -798,6 +896,36 @@ Status AddControlEdges(const PartitionOptions& opts,
   return Status::OK();
 }
 
+// If 'ndef' is a Send or Recv, fills its attr send_device_incarnation
+// if possible.
+void SetIncarnation(const PartitionOptions& opts, NodeDef* ndef) {
+  StringPiece op(ndef->op());
+  if (op != "_Send" && op != "_Recv") {
+    // Not related to send/recv.
+    return;
+  }
+  string send_device;
+  if (!GetNodeAttr(*ndef, "send_device", &send_device).ok()) {
+    // No known send_device. The runtime will detect it later.
+    return;
+  }
+  int64 incarnation = opts.get_incarnation(send_device);
+  AddNodeAttr("send_device_incarnation", incarnation, ndef);
+}
+
+// Sets attribute send_device_incarnation of all Send/Recv nodes in
+// 'gdef', if possible.
+void SetIncarnation(const PartitionOptions& opts, GraphDef* gdef) {
+  for (NodeDef& ndef : *gdef->mutable_node()) {
+    SetIncarnation(opts, &ndef);
+  }
+  for (FunctionDef& fdef : *gdef->mutable_library()->mutable_function()) {
+    for (NodeDef& ndef : *fdef.mutable_node_def()) {
+      SetIncarnation(opts, &ndef);
+    }
+  }
+}
+
 Status Partition(const PartitionOptions& opts, Graph* g,
                  std::unordered_map<string, GraphDef>* partitions) {
   Status status;
@@ -839,8 +967,14 @@ Status Partition(const PartitionOptions& opts, Graph* g,
     dst_def->set_device(dst->assigned_device_name());
     dst_def->clear_input();  // Inputs are filled below
     if (opts.need_to_record_start_times) {
-      int64 start_time = opts.start_times[dst->id()].value();
-      AddNodeAttr("_start_time", start_time, dst_def);
+      int64 start_time;
+      status = GetNodeAttr(*dst_def, "_start_time", &start_time);
+      if (errors::IsNotFound(status)) {
+        start_time = opts.start_times[dst->id()].value();
+        AddNodeAttr("_start_time", start_time, dst_def);
+      } else if (!status.ok()) {
+        return status;
+      }
     }
 
     // Arrange the incoming edges to dst so that input[i] holds the
@@ -897,18 +1031,18 @@ Status Partition(const PartitionOptions& opts, Graph* g,
       int64 send_start_time = 0;
       int64 recv_start_time = 0;
       if (opts.scheduling_for_recvs) {
-        if (opts.need_to_record_start_times) {
+        status = GetNodeAttr(src->attrs(), "_start_time", &send_start_time);
+        if (errors::IsNotFound(status) && opts.need_to_record_start_times) {
           send_start_time = opts.start_times[src->id()].value();
+        } else if (!status.ok()) {
+          return status;
+        }
+
+        status = GetNodeAttr(dst->attrs(), "_start_time", &recv_start_time);
+        if (errors::IsNotFound(status) && opts.need_to_record_start_times) {
           recv_start_time = opts.start_times[dst->id()].value();
-        } else {
-          status = GetNodeAttr(src->attrs(), "_start_time", &send_start_time);
-          if (!status.ok()) {
-            return status;
-          }
-          status = GetNodeAttr(dst->attrs(), "_start_time", &recv_start_time);
-          if (!status.ok()) {
-            return status;
-          }
+        } else if (!status.ok()) {
+          return status;
         }
       }
 
@@ -1026,10 +1160,15 @@ Status Partition(const PartitionOptions& opts, Graph* g,
     }
   }
 
-  // Set versions and function library
+  // Set versions, function library and send/recv incarnation.
   for (auto& it : *partitions) {
-    it.second.mutable_versions()->CopyFrom(g->versions());
-    *it.second.mutable_library() = g->flib_def().ToProto();
+    GraphDef* gdef = &it.second;
+    *gdef->mutable_versions() = g->versions();
+    *gdef->mutable_library() = g->flib_def().ToProto();
+
+    // Traverse the graph to fill every send/recv op's incarnation
+    // information.
+    SetIncarnation(opts, gdef);
   }
 
   // Set the start times for recvs at the very end.
diff --git a/tensorflow/core/graph/graph_partition_test.cc b/tensorflow/core/graph/graph_partition_test.cc
index ca49ea0ac49a80e0d04665b8191c9dc90f439f59..9c49b0b67b125cd96d53c50004eae4d77f0eee31 100644
--- a/tensorflow/core/graph/graph_partition_test.cc
+++ b/tensorflow/core/graph/graph_partition_test.cc
@@ -22,10 +22,12 @@ limitations under the License.
 #include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/control_flow_ops.h"
 #include "tensorflow/cc/ops/control_flow_ops_internal.h"
+#include "tensorflow/cc/ops/math_ops.h"
 #include "tensorflow/cc/ops/random_ops.h"
 #include "tensorflow/cc/ops/sendrecv_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
@@ -38,6 +40,14 @@ limitations under the License.
 #include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
+
+using strings::StrCat;
+
+// from graph_partition.cc
+extern Status TopologicalSortNodesWithTimePriority(
+    const GraphDef* gdef, std::vector<std::pair<const NodeDef*, int64>>* nodes,
+    std::unordered_map<const NodeDef*, int64>* node_to_start_time_out);
+
 namespace {
 
 const char gpu_device[] = "/job:a/replica:0/task:0/gpu:0";
@@ -51,7 +61,7 @@ string DeviceName(const Node* node) {
   } else {
     const string cpu_prefix = "/job:a/replica:0/task:0/cpu:";
     int index = first - 'A';
-    return strings::StrCat(cpu_prefix, index);
+    return StrCat(cpu_prefix, index);
   }
 }
 
@@ -435,5 +445,187 @@ TEST_F(GraphPartitionTest, Functions) {
   ExpectFunctions(partitions_[b].library(), {"XTimesTwo", "XTimesFour"});
 }
 
+TEST(TopologicalSortNodesWithTimePriorityTest, NoDependencies) {
+  // Create placeholders, shuffle them so the order in the graph is not strictly
+  // increasing.
+  Scope root = Scope::NewRootScope().ExitOnError();
+  std::vector<int> indexes;
+  for (int i = 0; i < 20; ++i) {
+    indexes.push_back((i + 2001) % 20);
+  }
+  std::vector<ops::Placeholder> placeholders;
+  for (int i : indexes) {
+    placeholders.emplace_back(root.WithOpName(StrCat("p", i)), DT_FLOAT);
+    placeholders.back().node()->AddAttr("_start_time", i + 1);
+  }
+
+  GraphDef gdef;
+  TF_EXPECT_OK(root.ToGraphDef(&gdef));
+
+  std::vector<std::pair<const NodeDef*, int64>> nodes;
+  std::unordered_map<const NodeDef*, int64> node_to_start_time;
+  TF_CHECK_OK(
+      TopologicalSortNodesWithTimePriority(&gdef, &nodes, &node_to_start_time));
+  ASSERT_EQ(nodes.size(), 20);
+  for (int i = 0; i < nodes.size(); ++i) {
+    EXPECT_EQ(StrCat("p", i), nodes[i].first->name());
+    EXPECT_EQ(i + 1, nodes[i].second);
+  }
+}
+
+TEST(TopologicalSortNodesWithTimePriority, Dependencies) {
+  // Create placeholders, shuffle them so the order in the graph is not strictly
+  // increasing.
+  Scope root = Scope::NewRootScope().ExitOnError();
+  std::vector<int> indexes;
+  std::vector<ops::Placeholder> placeholders_in_order;
+  const int num_leaves = 20;
+  for (int i = 0; i < num_leaves; ++i) {
+    indexes.push_back((i + 2001) % num_leaves);
+    placeholders_in_order.emplace_back(root.WithOpName(StrCat("p", i)),
+                                       DT_FLOAT);
+    placeholders_in_order.back().node()->AddAttr("_start_time", i + 1);
+  }
+  std::vector<ops::Placeholder> placeholders;
+  for (int i : indexes) {
+    placeholders.push_back(placeholders_in_order[i]);
+  }
+
+  // Create ops that depend on the placeholders. We give start times to these
+  // that are in descending order (e.g., the op that depends on the first
+  // placeholder runs last).
+  std::vector<ops::Square> squares;
+  for (int i : indexes) {
+    squares.emplace_back(root.WithOpName(StrCat("s", i)), placeholders[i]);
+    squares.back().node()->AddAttr("_start_time", 50 - (i + 1));
+  }
+
+  // Create addn to sum all squares.
+  std::vector<Input> inputs;
+  for (const auto& s : squares) inputs.push_back(s);
+  ops::AddN addn = ops::AddN(root.WithOpName("addn"),
+                             tensorflow::gtl::ArraySlice<Input>(inputs));
+  // Start times is actually listed earlier than the nodes it depends on.
+  // But because of dependency ordering, it is last in the list.
+  addn.node()->AddAttr("_start_time", 1);
+
+  GraphDef gdef;
+  TF_EXPECT_OK(root.ToGraphDef(&gdef));
+
+  std::vector<std::pair<const NodeDef*, int64>> nodes;
+  std::unordered_map<const NodeDef*, int64> node_to_start_time;
+  TF_CHECK_OK(
+      TopologicalSortNodesWithTimePriority(&gdef, &nodes, &node_to_start_time));
+  ASSERT_EQ(1 + squares.size() + placeholders.size(), nodes.size());
+  for (int i = 0; i < placeholders.size(); ++i) {
+    const NodeDef* node = nodes[i].first;
+    EXPECT_EQ(StrCat("p", i), node->name());
+    EXPECT_EQ(i + 1, nodes[i].second);
+    EXPECT_EQ(i + 1, node_to_start_time[node]);
+  }
+  for (int i = 0; i < squares.size(); ++i) {
+    int node_index = placeholders.size() + i;
+    int square_index = num_leaves - 1 - i;
+    const NodeDef* node = nodes[node_index].first;
+    EXPECT_EQ(StrCat("s", square_index), node->name());
+    EXPECT_EQ(50 - (square_index + 1), nodes[node_index].second);
+    EXPECT_EQ(50 - (square_index + 1), node_to_start_time[node]);
+  }
+  EXPECT_EQ("addn", nodes.back().first->name());
+  EXPECT_EQ(50, nodes.back().second);
+  EXPECT_EQ(50, node_to_start_time[nodes.back().first]);
+}
+
+TEST(TopologicalSortNodesWithTimePriority, WhileLoop) {
+  using namespace ::tensorflow::ops;            // NOLINT(build/namespaces)
+  using namespace ::tensorflow::ops::internal;  // NOLINT(build/namespaces)
+
+  // Create placeholders.
+  Scope root = Scope::NewRootScope().ExitOnError();
+  std::vector<int> indexes;
+  std::vector<Placeholder> placeholders_in_order;
+  const int num_leaves = 20;
+  for (int i = 0; i < num_leaves; ++i) {
+    indexes.push_back((i + 2001) % num_leaves);
+    placeholders_in_order.emplace_back(root.WithOpName(StrCat("p", i)),
+                                       DT_FLOAT);
+    placeholders_in_order.back().node()->AddAttr("_start_time", i + 1);
+  }
+  std::vector<Placeholder> placeholders;
+  placeholders.reserve(indexes.size());
+  for (int i : indexes) {
+    placeholders.push_back(placeholders_in_order[i]);
+  }
+
+  // Add a while loop above each placeholder.
+  std::vector<Exit> while_exits;
+  const int nodes_per_loop = 8;
+  for (int i : indexes) {
+    Scope scope = root.NewSubScope(StrCat("while", i));
+    auto dummy = Placeholder(scope, DT_FLOAT);
+
+    Enter enter(scope, placeholders[i], StrCat("frame", i));
+    Merge merge(scope, std::initializer_list<Input>{enter, dummy});
+    auto cv = Const(scope.WithControlDependencies({merge.output}), false);
+    LoopCond loop_cond(scope, cv);
+    Switch switch_node(scope, merge.output, loop_cond);
+    Identity identity(scope, switch_node.output_true);
+    NextIteration next_iteration(scope, identity);
+    while_exits.emplace_back(scope.WithOpName("exit"),
+                             switch_node.output_false);
+
+    // Complete loop by removing dummy node and attaching NextIteration to
+    // that input of the merge node.
+    scope.graph()->RemoveNode(dummy.node());
+    scope.graph()->AddEdge(next_iteration.node(), 0, merge.output.node(), 1);
+
+    int base_start_time = i * 10 + 100;
+    for (const auto& op : std::initializer_list<Output>{
+             enter, merge.output, cv, loop_cond, switch_node.output_false,
+             identity, next_iteration, while_exits.back()}) {
+      op.node()->AddAttr("_start_time", base_start_time++);
+    }
+  }
+
+  // Create ops that depend on the loop exits.
+  std::vector<Square> squares;
+  squares.reserve(indexes.size());
+  for (int i : indexes) {
+    squares.emplace_back(root.WithOpName(StrCat("s", i)), while_exits[i]);
+    squares.back().node()->AddAttr("_start_time", 500 - (i + 1));
+  }
+
+  GraphDef gdef;
+  TF_EXPECT_OK(root.ToGraphDef(&gdef));
+
+  // Run the sort. The while loop nodes do not appear in the output <nodes>.
+  std::vector<std::pair<const NodeDef*, int64>> nodes;
+  std::unordered_map<const NodeDef*, int64> node_to_start_time;
+  TF_CHECK_OK(
+      TopologicalSortNodesWithTimePriority(&gdef, &nodes, &node_to_start_time));
+  ASSERT_LT(while_exits.size() + squares.size() + placeholders.size(),
+            nodes.size());
+  int node_index = 0;
+  for (int i = 0; i < placeholders.size(); ++i, ++node_index) {
+    const NodeDef* node = nodes[i].first;
+    EXPECT_EQ(StrCat("p", i), node->name());
+    EXPECT_EQ(i + 1, nodes[i].second);
+    EXPECT_EQ(i + 1, node_to_start_time[node]);
+  }
+  for (int i = 0; i < while_exits.size(); ++i, node_index += nodes_per_loop) {
+    const NodeDef* node = nodes[node_index].first;
+    EXPECT_EQ(StrCat("while", i, "/Enter"), node->name());
+    EXPECT_EQ(100 + i * 10, nodes[node_index].second);
+    EXPECT_EQ(100 + i * 10, node_to_start_time[node]);
+  }
+  for (int i = 0; i < squares.size(); ++i, ++node_index) {
+    int square_index = num_leaves - 1 - i;
+    const NodeDef* node = nodes[node_index].first;
+    EXPECT_EQ(StrCat("s", square_index), node->name());
+    EXPECT_EQ(500 - (square_index + 1), nodes[node_index].second);
+    EXPECT_EQ(500 - (square_index + 1), node_to_start_time[node]);
+  }
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 94741a11ffa0ca5eb00ff2e9e5834e153f25b4b4..625780e7c91d5901ec240c848c73970bafa332f1 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -247,16 +247,10 @@ namespace tensorflow {
 //
 //           P = Conv2DWithBiasBackpropBias(O, O_m)
 //
-// 'Distance' between input of BiasAddGrad and _MklConv2D in terms of hops is
-// the context matching depth. If _MklConv2DWithBias is not within the context
-// matching depth, then we do not rewrite BiasAddGrad.
-
-// How many hops do we search for matching node in the backward dataflow graph?
-// We use maxhop of 10 based on empirical observations. Also, these are
-// maxhops in backward data-flow graph. Since input of forward nodes (Conv2D)
-// directly goes to backward nodes, we do not expect the hop-distance
-// would be more than few nodes.
-static size_t kNodeMergeContextMaxDepth = 10;
+// Rewrite of BiasAddGrad into Conv2DWithBiasBackpropBias takes place depending
+// on the matching 'context'. The term context is loosely related to which
+// forward op is _associated_ to BiasAddGrad. If it is _MklConv2DWithBias then
+// we consider it Conv2D context; if it is MatMul, then it is MatMul context.
 
 class MklLayoutRewritePass : public GraphOptimizationPass {
  public:
@@ -280,6 +274,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.max_pool = "MaxPool";
     csinfo_.max_pool_grad = "MaxPoolGrad";
     csinfo_.mkl_conv2d = "_MklConv2D";
+    csinfo_.mkl_conv2d_grad_input = "_MklConv2DBackpropInput";
+    csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter";
     csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
     csinfo_.mkl_conv2d_with_bias_backprop_bias =
                                    "_MklConv2DWithBiasBackpropBias";
@@ -360,16 +356,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     minfo_.push_back({csinfo_.mkl_conv2d, csinfo_.bias_add, 0,
                       csinfo_.mkl_conv2d_with_bias});
 
-    // We use maxhop of 10 based on empirical observations. Also, these are
-    // maxhops in backward data-flow graph. Since input of forward nodes
-    // (Conv2D) directly goes to backward nodes, we do not expect the
-    // hop-distance would be more than few nodes.
     biasaddgrad_matmul_context_ = {csinfo_.bias_add_grad, csinfo_.matmul,
-                                   kNodeMergeContextMaxDepth};
+                                   IsBiasAddGradInMatMulContext};
 
     biasaddgrad_conv2dwithbias_context_ = {csinfo_.bias_add_grad,
                                    csinfo_.mkl_conv2d_with_bias,
-                                   kNodeMergeContextMaxDepth};
+                                   IsBiasAddGradInConv2DWithBiasContext};
 
     cinfo_.push_back(&biasaddgrad_matmul_context_);
     cinfo_.push_back(&biasaddgrad_conv2dwithbias_context_);
@@ -392,9 +384,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string node;     // Name of the node to be rewritten
     string fwd;      // Name of the node in the forward pass that this node
                      // corresponds to
-    size_t max_hop;  // Maximum number of hops the fwd is located
-                     // from this node. If the fwd is farther than max_hop
-                     // then we do not rewrite the node.
+    std::function<bool(const Node*, const Node**, void* c)> context_match_fn;
   } ContextInfo;
 
   /// Structure to specify the name of an original node, its new name after
@@ -438,7 +428,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
   /// Structure to store all constant strings
   /// NOTE: names are alphabetically sorted.
-  struct {
+  typedef struct {
     string avg_pool;
     string avg_pool_grad;
     string bias_add;
@@ -457,13 +447,15 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string max_pool;
     string max_pool_grad;
     string mkl_conv2d;
+    string mkl_conv2d_grad_input;
+    string mkl_conv2d_grad_filter;
     string mkl_conv2d_with_bias;
     string mkl_conv2d_with_bias_backprop_bias;
     string relu;
     string relu_grad;
     string reshape;
     string split;
-  } csinfo_;
+  } ConstStringsInfo;
 
  private:
   /// Maintain info about nodes to rewrite
@@ -478,6 +470,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   /// Maintain info about nodes to rewrite
   static std::vector<ContextInfo*> cinfo_;
 
+  /// Maintain structure of constant strings
+  static ConstStringsInfo csinfo_;
+
   /// Context variables used in referencing rules
   static ContextInfo biasaddgrad_matmul_context_;
   static ContextInfo biasaddgrad_conv2dwithbias_context_;
@@ -629,6 +624,173 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     return false;
   }
 
+  // Is BiasAddGrad node in 'n' is associated with Conv2DWithBias node
+  // specified in contextinfo 'ci'. Function updates fwd_node to point
+  // to Conv2DWithBias node if 'n' is associated with Conv2DWithBias.
+  //
+  // Association checks for one of the following graphs:
+  //
+  // Graph A:
+  //
+  // _ = Conv2DWithBias(F, I, _)
+  // ..
+  // _ = Conv2DBackpropFilter(F, _, G)
+  // _ = Conv2DBackpropInput(_, I, G)
+  // _ = BiasAddGrad(G)
+  //
+  // OR
+  //
+  // Graph B:
+  //
+  // _ = Conv2DWithBias(F, _, _)
+  // ..
+  // _ = Conv2DBackpropFilter(F, _, G)
+  // _ = BiasAddGrad(G)
+  //
+  // Here F, G, and I are graph nodes; _ represents graph nodes that we
+  // don't care here.
+  //
+  // @return - true (if BiasAddGrad is associated with Conv2DWithBias);
+  //           false otherwise.
+  static bool IsBiasAddGradInConv2DWithBiasContext(const Node* n,
+                                                   const Node** fwd_node,
+                                                   void* ci) {
+    CHECK_NOTNULL(n);
+    CHECK_NOTNULL(fwd_node);
+    CHECK_NOTNULL(ci);
+    *fwd_node = nullptr;
+
+    CHECK_EQ(n->type_string(), csinfo_.bias_add_grad);
+
+    // Get the only 1 input of BiasAddGrad.
+    CHECK_EQ(n->num_inputs(), 1);
+    const Node* bias_add_grad_inp = nullptr;
+    TF_CHECK_OK(n->input_node(0, &bias_add_grad_inp));
+    CHECK_NOTNULL(bias_add_grad_inp);
+
+    // Check if this input also goes to BackpropFilter and BackpropInput
+    // as 3rd input.
+    bool found_backprop_input = false;
+    bool found_backprop_filter = false;
+    Node* backprop_filter_node = nullptr;
+    Node* backprop_input_node = nullptr;
+
+    for (const Edge* e : bias_add_grad_inp->out_edges()) {
+      Node* third_input = nullptr;
+      if (e->dst()->type_string() == csinfo_.conv2d_grad_input ||
+          e->dst()->type_string() == csinfo_.mkl_conv2d_grad_input) {
+        // Third input (index 2) of BackpropInput
+        TF_CHECK_OK(e->dst()->input_node(2, &third_input));
+        // Third input (index 2) of BackpropInput must be same as the input
+        // of BiasAddGrad.
+        if (third_input == bias_add_grad_inp) {
+          found_backprop_input = true;
+          backprop_input_node = e->dst();
+        }
+      }
+
+      if (e->dst()->type_string() == csinfo_.conv2d_grad_filter ||
+          e->dst()->type_string() == csinfo_.mkl_conv2d_grad_filter) {
+        // Third input (index 2) of BackpropFilter
+        TF_CHECK_OK(e->dst()->input_node(2, &third_input));
+        // Third input (index 2) of BackpropFilter must be same as the input
+        // of BiasAddGrad.
+        if (third_input == bias_add_grad_inp) {
+          found_backprop_filter = true;
+          backprop_filter_node = e->dst();
+        }
+      }
+
+      // If we found both the nodes, then we can stop the search.
+      if (found_backprop_input && found_backprop_filter) {
+        break;
+      }
+    }
+
+    // If BackpropFilter node is not found, then this is not
+    // Conv2DWithBias context. For 2nd graph in the example above, only
+    // BackpropFilter would be present.
+    if (!found_backprop_filter) {
+      return false;
+    }
+
+    // Otherwise, we found the nodes.
+    CHECK_NOTNULL(backprop_filter_node);
+    if (found_backprop_input) {
+      CHECK_NOTNULL(backprop_input_node);
+    }
+
+    // Now that we confirmed that this is Conv2DWithBias context, we need to
+    // get access to the forward node (Conv2DWithBias). 2nd input of
+    // Conv2DWithBias is same as the 2nd input of Conv2DBackpropInput; 1st
+    // input of Conv2DWithBias is same as the 1st input of Conv2DBackpropFilter
+    // (This comes from definition of gradient computation for Conv2D).
+    if (found_backprop_input) {
+      // Graph A in the example.
+      Node* second_inp_of_input = nullptr;
+      Node* first_inp_of_filter = nullptr;
+      TF_CHECK_OK(backprop_input_node->input_node(1, &second_inp_of_input));
+      TF_CHECK_OK(backprop_filter_node->input_node(0, &first_inp_of_filter));
+      CHECK_NOTNULL(second_inp_of_input);
+      CHECK_NOTNULL(first_inp_of_filter);
+
+      // Now we need to find out Conv2DWithBias node from these input nodes.
+      // Conv2DWithBias node is the node that accepts both the nodes
+      // second_inp_of_input and first_inp_of_filter in 2nd and 1st input slots.
+      for (const Edge* fe : first_inp_of_filter->out_edges()) {
+        if (fe->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
+            fe->dst_input() == 0) {
+          for (const Edge* ie : second_inp_of_input->out_edges()) {
+            if (ie->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
+                ie->dst_input() == 1 && fe->dst() == ie->dst()) {
+              VLOG(1) << "MklLayoutRewritePass: found "
+                      << fe->dst()->DebugString()
+                      << " as the forward node for matching context, backward"
+                      << " node is: " << n->DebugString();
+              *fwd_node = fe->dst();
+              return true;
+            }
+          }
+        }
+      }
+    } else {
+      // We did not find BackpropInput, so we work with BackpropFilter only.
+      // Graph B in the example.
+      Node* first_inp_of_filter = nullptr;
+      TF_CHECK_OK(backprop_filter_node->input_node(0, &first_inp_of_filter));
+      CHECK_NOTNULL(first_inp_of_filter);
+
+      // Now we need to find out Conv2DWithBias node from first input of
+      // BackpropFIlter. Conv2DWithBias node is the node that accepts
+      // first_inp_of_filter in 1st input slot.
+      for (const Edge* fe : first_inp_of_filter->out_edges()) {
+        if (fe->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
+            fe->dst_input() == 0) {
+          VLOG(1) << "MklLayoutRewritePass: found "
+                  << fe->dst()->DebugString()
+                  << " as the forward node for matching context, backward"
+                  << " node is: " << n->DebugString();
+          *fwd_node = fe->dst();
+          return true;
+        }
+      }
+    }
+
+    return false;
+  }
+
+  // Is BiasAddGrad node in 'n' is associated with MatMul node
+  // specified in contextinfo 'ci'. Function does not update fwd_node.
+  //
+  // @return - true (if BiasAddGrad is associated with MatMul);
+  //           false otherwise.
+  static bool IsBiasAddGradInMatMulContext(const Node* n,
+                                           const Node** fwd_node,
+                                           void* ci) {
+    return (!IsBiasAddGradInConv2DWithBiasContext(n, fwd_node, ci));
+  }
+
+
   // Rewrite rule that uses context-information for matching,
   // used in scenario 2.
   //
@@ -639,8 +801,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   static bool ContextMatchRewrite(const Node* n, const ContextInfo* c);
 
   // Helper function that searches the matching contextinfo for the node.
-  // Implements depth-first search in the data dependence graph for the
-  // gradient op in the backward direction.
   //
   // @input n - Node (gradient op) whose contextinfo is to be searched,
   //        fwd_node - pointer to node from the forward pass that this node
@@ -788,6 +948,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                                    Node* orig_node);
 };
 
+MklLayoutRewritePass::ConstStringsInfo MklLayoutRewritePass::csinfo_;
 MklLayoutRewritePass::ContextInfo
   MklLayoutRewritePass::biasaddgrad_conv2dwithbias_context_;
 MklLayoutRewritePass::ContextInfo
@@ -1667,12 +1828,12 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
   const ContextInfo* ci = nullptr;
   bool is_context_based_rewrite = false;
   if ((ci = SearchMatchingContext(orig_node, &fwd_node)) != nullptr) {
-    CHECK_NOTNULL(fwd_node);
     is_context_based_rewrite = true;
 
     // Sanity checks for context-based rewrite (if any)
     if (orig_node->type_string() == csinfo_.bias_add_grad &&
         ri->new_name == csinfo_.mkl_conv2d_with_bias_backprop_bias) {
+      CHECK_NOTNULL(fwd_node);
       DataType orig_T, ctx_T;
       string orig_data_format, ctx_data_format;
       TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &orig_T));
@@ -1784,69 +1945,17 @@ MklLayoutRewritePass::SearchMatchingContext(const Node* n,
   CHECK_NOTNULL(fwd_node);
   *fwd_node = nullptr;
 
-  // Search for matching contextinfo based on node name.
-  // There could be more than one matching contextinfos.
-  bool is_matching_cinfo_found = false;
-  std::vector<const ContextInfo*> mci;
+  // Search for matching contextinfo based on node name and call
+  // callback function using matching contextinfo.
+  // There could be more than one matching contextinfos but whichever
+  // matches first is returned.
   for (auto ci = cinfo_.cbegin(); ci != cinfo_.cend(); ++ci) {
-    if (n->type_string() == (*ci)->node) {
-      mci.push_back(*ci);
-      is_matching_cinfo_found = true;
+    if (n->type_string() == (*ci)->node &&
+        (*ci)->context_match_fn(n, fwd_node, *ci)) {
+      VLOG(1) << "Found context as matching: " << (*ci)->fwd;
+      return *ci;
     }
   }
-  // If no matching contextinfo is found, return immediately.
-  if (!is_matching_cinfo_found) {
-    return nullptr;
-  }
-
-  VLOG(1) << "MklLayoutRewritePass: Searching graph for: " << n->type_string()
-          << " in backwards.";
-
-  // Now we will check for forward op name for context info in data
-  // flow graph. Get the max hops we should search for the fwd node.
-  // We are now going to search (breadth-first) backwards in data
-  // dependence graph (for up to max hops) from n for the node
-  // specified in fwd.
-  // queue to maintain nodes to be visited and depth info for
-  // breadth-first search
-  std::queue<std::pair<const Node*, int>> nqueue;
-  const Node* curr_node = n;
-  size_t curr_depth = 0;
-  nqueue.push(std::make_pair(curr_node, curr_depth));
-
-  while (curr_depth < kNodeMergeContextMaxDepth && !nqueue.empty()) {
-    std::pair<const Node*, int> curr_pair = nqueue.front();
-    nqueue.pop();
-
-    std::set<const Node*> visited_nodes;
-    curr_node = curr_pair.first;
-    curr_depth = curr_pair.second;
-    CHECK_NOTNULL(curr_node);
-
-    VLOG(1) << "MklLayoutRewritePass: Visiting node: "
-            << curr_node->type_string() << " at depth: " << curr_depth
-            << " for node: " << n->type_string();
-
-    // If we find a match, we return immediately.
-    for (const ContextInfo* ci : mci) {
-      if (curr_node->type_string() == ci->fwd) {
-        *fwd_node = curr_node;
-        return ci;
-      }
-    }
-
-    // Else we explore backward edges from current node.
-    // Add the source nodes of all incoming edges of the node to the queue.
-    for (const Edge* e : curr_node->in_edges()) {
-      // We do not visit already visited node.
-      if (visited_nodes.find(e->src()) == visited_nodes.end()) {
-        // Depth of these nodes is 1 more than the depth of current node.
-        nqueue.push(std::make_pair(e->src(), curr_depth + 1));
-        visited_nodes.insert(e->src());
-      }
-    }
-  } /* while */
-
   return nullptr;
 }
 
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 3c4a5263afd3817907ede7f14c9b433de5fce83c..efbe2134e0fbf0489c51949c777e91637b070c6b 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -345,7 +345,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
 // Test set 2: _MklConv2D..BiasAddGrad -> _MklConv2DWithBiasBackpropBias
 // rewrite tests
 
-// D=_MklConv2D(A,M,B,N,C,O); E=Sub(D,A); F=BiasAddGrad(E)
+// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter
+// and BackpropInput
 TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Positive) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
@@ -364,16 +365,255 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Positive) {
       "node { name: 'E' op: 'Sub'"
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
       " input: ['D', 'A']}"
-      "node { name: 'F' op: 'BiasAddGrad'"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
+      "node { name: 'H' op: 'Int32Input'}"
+      "node { name: 'I' op: '_MklConv2DBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['H', 'B', 'E', 'M', 'N', 'O']}"
+      "node { name: 'J' op: 'BiasAddGrad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
       " input: ['E'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);DMT/_0(Const);"
-            "E(Sub);F(_MklConv2DWithBiasBackpropBias);M(_MklInput);"
-            "N(_MklInput);O(_MklInput)|A->D;A->E:1;B->D:1;C->D:2;D->E;"
-            "DMT/_0->F:1;E->F;E:control->DMT/_0:control;M->D:3;N->D:4;"
-            "O->D:5");
+            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
+            "I(_MklConv2DBackpropInput);J(_MklConv2DWithBiasBackpropBias);"
+            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G;B->D:1;"
+            "B->I:1;C->D:2;D->E;DMT/_0->J:1;E->G:2;E->I:2;E->J;"
+            "E:control->DMT/_0:control;F->G:1;H->I;M->D:3;M->G:3;M->I:3;"
+            "N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
+}
+
+// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter
+// and BackpropInput. But nodes do not match criteria for rewrite. So
+// rewrite should not happen.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['E', 'F', 'A', 'M', 'N', 'O'] }"
+      "node { name: 'H' op: 'Int32Input'}"
+      "node { name: 'I' op: '_MklConv2DBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['H', 'B', 'E', 'M', 'N', 'O']}"
+      "node { name: 'J' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
+            "I(_MklConv2DBackpropInput);J(BiasAddGrad);"
+            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;"
+            "B->I:1;C->D:2;D->E;E->G;E->I:2;E->J;F->G:1;H->I;M->D:3;M->G:3;"
+            "M->I:3;N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
+}
+
+// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter
+// and BackpropInput. But nodes do not match criteria for rewrite. So
+// rewrite should not happen.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['B', 'A', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
+      "node { name: 'H' op: 'Int32Input'}"
+      "node { name: 'I' op: '_MklConv2DBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['H', 'B', 'E', 'M', 'N', 'O']}"
+      "node { name: 'J' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
+            "I(_MklConv2DBackpropInput);J(BiasAddGrad);"
+            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D:1;A->E:1;A->G;B->D;"
+            "B->I:1;C->D:2;D->E;E->G:2;E->I:2;E->J;F->G:1;H->I;M->D:3;M->G:3;"
+            "M->I:3;N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
+}
+
+
+// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
+      "node { name: 'H' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);DMT/_0(Const);"
+            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);"
+            "H(_MklConv2DWithBiasBackpropBias);M(_MklInput);N(_MklInput);"
+            "O(_MklInput)|A->D;A->E:1;A->G;B->D:1;C->D:2;D->E;DMT/_0->H:1;"
+            "E->G:2;E->H;E:control->DMT/_0:control;F->G:1;M->D:3;M->G:3;"
+            "N->D:4;N->G:4;O->D:5;O->G:5");
+}
+
+// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
+// But BackpropFilter node inputs do not satisfy criteria for rewrite.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['E', 'F', 'A', 'M', 'N', 'O'] }"
+      "node { name: 'H' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
+            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;"
+            "C->D:2;D->E;E->G;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;"
+            "O->G:5");
+}
+
+// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
+// But BackpropFilter node inputs do not satisfy criteria for rewrite.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['B', 'A', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
+      "node { name: 'H' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+            "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
+            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D:1;A->E:1;A->G;B->D;"
+            "C->D:2;D->E;E->G:2;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;"
+            "O->G:5");
 }
 
 // No _MklConv2DWithBias in context, but _MklConv2D in context.
diff --git a/tensorflow/core/graph/node_builder.cc b/tensorflow/core/graph/node_builder.cc
index 500ac129e8b5d00ca2e392049bb6bb1ab138115f..138952dcb33e7b1e57cf013147581d20f509e85d 100644
--- a/tensorflow/core/graph/node_builder.cc
+++ b/tensorflow/core/graph/node_builder.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <vector>
 #include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/graph/optimizer_cse.cc b/tensorflow/core/graph/optimizer_cse.cc
index a22a9b3fa31ff45fa7372e9270ac4ef8968b8f66..47337ce8a26e2de10584f6c70a28ce9fa0bb842c 100644
--- a/tensorflow/core/graph/optimizer_cse.cc
+++ b/tensorflow/core/graph/optimizer_cse.cc
@@ -42,6 +42,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/hash/hash.h"
@@ -187,6 +188,12 @@ bool OptimizerCSE::Optimize(
   for (Node* n : order) {
     if (!n->IsOp()) continue;
 
+    // Don't prune placeholder nodes.
+    if (n->def().op() == "Placeholder" || n->def().op() == "PlaceholderV2" ||
+        n->def().op() == "PlaceholderWithDefault") {
+      continue;
+    }
+
     // See if we should consider this node at all
     if (consider_fn != nullptr && !consider_fn(n)) continue;
 
@@ -204,6 +211,7 @@ bool OptimizerCSE::Optimize(
       for (const Edge* e : n->out_edges()) {
         g_->AddEdge(*candidate, e->src_output(), e->dst(), e->dst_input());
       }
+
       g_->RemoveNode(n);
       changed = true;
     }
diff --git a/tensorflow/core/graph/quantize_training.cc b/tensorflow/core/graph/quantize_training.cc
index 48b6b2a49738bcdd2cc814b714c9fa82458c5bce..b74fa2127e4a4f539e008d96970045904757030e 100644
--- a/tensorflow/core/graph/quantize_training.cc
+++ b/tensorflow/core/graph/quantize_training.cc
@@ -653,28 +653,38 @@ Status DoQuantizeTraining(int32 num_bits, const string& quant_op_type,
   return Status::OK();
 }
 
-Status DoQuantizeTrainingOnSerializedGraphDef(const string& input_graph,
-                                              int32 num_bits,
-                                              const string& quant_op_type,
-                                              string* result_graph) {
-  // First create the graph from the GraphDef.
+Status DoQuantizeTrainingOnGraphDef(const GraphDef& input_graphdef,
+                                    int32 num_bits, const string& quant_op_type,
+                                    GraphDef* result_graphdef) {
   Graph graph(OpRegistry::Global());
   GraphConstructorOptions opts;
-  GraphDef input_graphdef;
-  if (!ParseProtoUnlimited(&input_graphdef, input_graph)) {
-    return errors::InvalidArgument("Invalid input graph");
-  }
   TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, input_graphdef, &graph));
 
   // Call the rewriter on the graph.
   TF_RETURN_IF_ERROR(DoQuantizeTraining(num_bits, quant_op_type, &graph));
 
   // Convert the result graph back to a GraphDef.
+  graph.ToGraphDef(result_graphdef);
+  return Status::OK();
+}
+
+Status DoQuantizeTrainingOnSerializedGraphDef(const string& input_graph_string,
+                                              int32 num_bits,
+                                              const string& quant_op_type,
+                                              string* result_graph_string) {
+  // First create the graph from the GraphDef.
+  GraphDef input_graphdef;
+  if (!ParseProtoUnlimited(&input_graphdef, input_graph_string)) {
+    return errors::InvalidArgument(
+        "input_graph_string is not a serialized GraphDef protocol buffer");
+  }
   GraphDef output_graphdef;
-  graph.ToGraphDef(&output_graphdef);
+  TF_RETURN_IF_ERROR(DoQuantizeTrainingOnGraphDef(
+      input_graphdef, num_bits, quant_op_type, &output_graphdef));
 
-  if (!output_graphdef.SerializeToString(result_graph)) {
-    return errors::InvalidArgument("Invalid output graph");
+  if (!output_graphdef.SerializeToString(result_graph_string)) {
+    return errors::Internal(
+        "quantize training transformation resulted in invalid GraphDef");
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/graph/quantize_training.h b/tensorflow/core/graph/quantize_training.h
index 2c1a7e6ae3618904ef37b5ec0ed38c61c6180455..2bb4ee1cf058a1791cc4a8704c126ec0e4999916 100644
--- a/tensorflow/core/graph/quantize_training.h
+++ b/tensorflow/core/graph/quantize_training.h
@@ -38,12 +38,19 @@ namespace tensorflow {
 Status DoQuantizeTraining(int32 num_bits, const string& quant_op_type,
                           Graph* g);
 
-// Converts a input GraphDef and returns a rewritten GraphDef with the
-// quantized training.
+// Converts the input serialized GraphDef and returns a rewritten serialized
+// GraphDef for quantized training.
 Status DoQuantizeTrainingOnSerializedGraphDef(const string& input_graph,
                                               int32 num_bits,
                                               const string& quant_op_type,
                                               string* result_graph);
+
+// Converts the input GraphDef and returns a rewritten GraphDef for quantized
+// training.
+Status DoQuantizeTrainingOnGraphDef(const GraphDef& input_graphdef,
+                                    int32 num_bits, const string& quant_op_type,
+                                    GraphDef* result_graphdef);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_GRAPH_QUANTIZE_TRAINING_H_
diff --git a/tensorflow/core/graph/quantize_training_test.cc b/tensorflow/core/graph/quantize_training_test.cc
index d817d980de90aad7df91eecbf92de50c3dd1b243..2ad69dbd0c608fa79354c73e01167c3b02ff4fc2 100644
--- a/tensorflow/core/graph/quantize_training_test.cc
+++ b/tensorflow/core/graph/quantize_training_test.cc
@@ -282,7 +282,7 @@ TEST_F(QuantizeTrainingTest, WithBackwardNodes_FakeQuant) {
       g, strings::StrCat(c->name(), "/FakeQuantWithMinMaxVars"), &found_node));
 }
 
-TEST_F(QuantizeTrainingTest, QuantizeGraphDef) {
+TEST_F(QuantizeTrainingTest, QuantizeSerializedGraphDef) {
   // Construct a simple graph with 5 nodes.
   Reset();
   Graph* graph = g_.get();
@@ -310,8 +310,40 @@ TEST_F(QuantizeTrainingTest, QuantizeGraphDef) {
   GraphDef result_graphdef;
   EXPECT_TRUE(ParseProtoUnlimited(&result_graphdef, result_string));
 
+  // Ensure that quantizing the serialized graph_def results in a graph with the
+  // same number of nodes as quantizing the graph.
+  GraphConstructorOptions opts;
+  Graph result_graph(OpRegistry::Global());
+  TF_ASSERT_OK(ConvertGraphDefToGraph(opts, result_graphdef, &result_graph));
+  TF_ASSERT_OK(DoQuantizeTraining(num_bits, "QuantizeAndDequantizeV2", graph));
+  EXPECT_EQ(graph->num_nodes(), result_graph.num_nodes());
+}
+
+TEST_F(QuantizeTrainingTest, QuantizeGraphDef) {
+  // Construct a simple graph with 5 nodes.
+  Reset();
+  Graph* graph = g_.get();
+  Node* const_a = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
+  Node* const_b = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
+  graph->AddControlEdge(graph->source_node(), const_a);
+  graph->AddControlEdge(graph->source_node(), const_b);
+  Node* relu = test::graph::Relu(graph, const_a);
+  Node* identity = test::graph::Identity(graph, const_b);
+  Node* matmul = test::graph::Matmul(graph, relu, identity, false, false);
+  graph->AddControlEdge(matmul, graph->sink_node());
+
+  int num_bits = 8;
+
+  // Convert the graph to the graphdef string.
+  GraphDef input_graphdef;
+  graph->ToGraphDef(&input_graphdef);
+
+  GraphDef result_graphdef;
+  TF_ASSERT_OK(DoQuantizeTrainingOnGraphDef(
+      input_graphdef, num_bits, "QuantizeAndDequantizeV2", &result_graphdef));
+
   // Ensure that quantizing the graph_def results in a graph with the same
-  // number of nodes.
+  // number of nodes as the graph_def.
   GraphConstructorOptions opts;
   Graph result_graph(OpRegistry::Global());
   TF_ASSERT_OK(ConvertGraphDefToGraph(opts, result_graphdef, &result_graph));
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index c59c44c80edc08a16d98d08761df98f7b3c9d783..be524387474e8863223ef0201fac8072fa5ad83e 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
@@ -420,11 +421,12 @@ Node* Cast(Graph* g, Node* in, DataType dst) {
   return ret;
 }
 
-Node* Gather(Graph* g, Node* in0, Node* in1) {
+Node* Gather(Graph* g, Node* in0, Node* in1, Node* axis) {
   Node* ret;
-  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Gather")
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "GatherV2")
                   .Input(in0)
                   .Input(in1)
+                  .Input(axis)
                   .Finalize(g, &ret));
   return ret;
 }
diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h
index 48250fef0fa44ee9fe25d7751c067d3c1257d4b7..a38809e6b4c04fbc98c27d3fdaa43dc43f37bf56 100644
--- a/tensorflow/core/graph/testlib.h
+++ b/tensorflow/core/graph/testlib.h
@@ -171,8 +171,8 @@ Node* Select(Graph* g, Node* c, Node* inx, Node* iny);
 // Casts "in" into data type "dst".
 Node* Cast(Graph* g, Node* in, DataType dst);
 
-// Perform gather op on params "in0" with indices "in1".
-Node* Gather(Graph* g, Node* in0, Node* in1);
+// Perform gather op on params "in0" with indices "in1" and axis "axis".
+Node* Gather(Graph* g, Node* in0, Node* in1, Node* axis);
 
 // Gets a tensor stored in the session state.
 Node* GetSessionTensor(Graph* g, Node* in);
diff --git a/tensorflow/core/graph/validate.cc b/tensorflow/core/graph/validate.cc
index bfdc5cab0db4f6d12ecfd0262816d5a9e6d27857..bd905651d22aa374d79ada5d9f93a8ff99b57095 100644
--- a/tensorflow/core/graph/validate.cc
+++ b/tensorflow/core/graph/validate.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/graph/validate.h"
 
 #include "tensorflow/core/framework/graph_def_util.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/framework/versions.pb.h"
diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD
index fd2f2b32492150da6e047d865e6bc6b9f3dfb5ba..e7230b37543110cd463222bea3ba5f173ffe3686 100644
--- a/tensorflow/core/grappler/clusters/BUILD
+++ b/tensorflow/core/grappler/clusters/BUILD
@@ -65,7 +65,23 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":cluster",
+        "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler/costs:op_level_cost_estimator",
+        "//tensorflow/core/grappler/costs:virtual_scheduler",
+    ],
+)
+
+cc_test(
+    name = "virtual_cluster_test",
+    srcs = ["virtual_cluster_test.cc"],
+    deps = [
+        ":virtual_cluster",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
     ],
 )
 
@@ -94,10 +110,13 @@ cc_test(
     name = "single_machine_test",
     srcs = ["single_machine_test.cc"],
     args = ["--heap_check=local"],  # The GPU tracer leaks memory
+    tags = ["no_gpu"],
     deps = [
         ":single_machine",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:resource_variable_ops",
         "//tensorflow/cc:scope",
+        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
diff --git a/tensorflow/core/grappler/clusters/cluster.cc b/tensorflow/core/grappler/clusters/cluster.cc
index 8690d9f24adb562f92c4a39160c84024e8d2f252..3205d67517fe23432e4d6fed01429a01c11f16ab 100644
--- a/tensorflow/core/grappler/clusters/cluster.cc
+++ b/tensorflow/core/grappler/clusters/cluster.cc
@@ -14,27 +14,16 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/clusters/cluster.h"
-#include <atomic>
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
 namespace grappler {
 
-static std::atomic<bool> already_created(false);
-
 Cluster::Cluster(int timeout_s) : timeout_s_(timeout_s) {
-  // This is really ugly: to avoid leaking variables, we need to reset the tf
-  // session every time we're done processing a grappler item. However,
-  // variables are global, and therefore we can't have more than 1 session alive
-  // at a time. This check detects when more that one cluster is created.
-  CHECK(!already_created);
-  already_created = true;
-
   DisableDetailedStats(false);
 }
 
 Cluster::~Cluster() {
-  CHECK(already_created);
-  already_created = false;
 }
 
 void Cluster::AllowSoftPlacement(bool soft_placement_state) {
@@ -61,8 +50,21 @@ void Cluster::DisableOptimizer(bool disable) {
       options_.config.mutable_graph_options()->mutable_optimizer_options();
   if (disable) {
     options->set_opt_level(OptimizerOptions::L0);
+    // Disable Grappler optimizations.
+    auto rewriter_config =
+        options_.config.mutable_graph_options()->mutable_rewrite_options();
+    rewriter_config->set_optimize_tensor_layout(false);
+    rewriter_config->set_disable_model_pruning(true);
+    rewriter_config->set_constant_folding(RewriterConfig::OFF);
+    rewriter_config->set_memory_optimization(RewriterConfig::NO_MEM_OPT);
+    rewriter_config->mutable_auto_parallel()->set_enable(false);
+    rewriter_config->clear_optimizers();
   } else {
     options->set_opt_level(OptimizerOptions::L1);
+    auto rewriter_config =
+        options_.config.mutable_graph_options()->mutable_rewrite_options();
+    rewriter_config->set_constant_folding(RewriterConfig::DEFAULT);
+    rewriter_config->set_memory_optimization(RewriterConfig::DEFAULT_MEM_OPT);
   }
 }
 
diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index 22ccf5208c1e0a8799907a6b86f93fd4a1ca44b9..3481b2b158d2bfff1be3ef2067caf130ae7f91e5 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/clusters/single_machine.h"
 
+#include <atomic>
 #include <memory>
 
 #include "tensorflow/cc/training/queue_runner.h"
@@ -31,11 +32,22 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+static std::atomic<bool> already_created(false);
+
 SingleMachine::SingleMachine(int timeout_s, int num_cpu_cores, int num_gpus)
     : Cluster(timeout_s),
       num_gpus_(num_gpus),
       expected_init_time_s_(0),
       closing_(false) {
+  // This is really ugly: to avoid leaking variables, we need to reset the tf
+  // session every time we're done processing a grappler item. However,
+  // variables are global, and therefore we can't have more than 1 session alive
+  // at a time. This check detects when more that one cluster is created.
+  CHECK(!already_created);
+  already_created = true;
+
+  VLOG(1) << "Number of CPU cores: " << num_cpu_cores
+          << " Number of GPUs: " << num_gpus;
   thread_pool_.reset(new thread::ThreadPool(
       Env::Default(), SanitizeThreadSuffix("single_machine"), 2));
 
@@ -61,7 +73,8 @@ SingleMachine::~SingleMachine() {
   // when we delete the session.
   thread_pool_.reset();
 
-  Reset(options_, {}).IgnoreError();
+  CHECK(already_created);
+  already_created = false;
 }
 
 Status SingleMachine::Provision() {
@@ -73,9 +86,12 @@ Status SingleMachine::Provision() {
   DeviceProperties attr = GetLocalCPUInfo();
   devices_["/job:localhost/replica:0/task:0/cpu:0"] = GetLocalCPUInfo();
 
+  VLOG(1) << "Number of GPUs: " << num_gpus_;
   for (int i = 0; i < num_gpus_; ++i) {
-    devices_[strings::StrCat("/job:localhost/replica:0/task:0/gpu:", i)] =
-        GetLocalGPUInfo(i);
+    string device_name =
+        strings::StrCat("/job:localhost/replica:0/task:0/gpu:", i);
+    VLOG(1) << "Adding GPU device " << device_name;
+    devices_[device_name] = GetLocalGPUInfo(i);
   }
   return Status::OK();
 }
@@ -140,7 +156,7 @@ Status SingleMachine::Run(const GraphDef& graph_def,
         // Also clear the timeline to save memory
         init_metadata_.clear_step_stats();
       }
-      for (int i = 0; i < queue_runner_defs_.size(); ++i) {
+      for (size_t i = 0; i < queue_runner_defs_.size(); ++i) {
         std::unique_ptr<QueueRunner> queue_runner;
         TF_RETURN_IF_ERROR(QueueRunner::New(queue_runner_defs_[i],
                                             coordinator_.get(), &queue_runner));
@@ -259,11 +275,9 @@ Status SingleMachine::ResetSession() {
     // Make sure the session is properly closed
     TF_RETURN_IF_ERROR(Shutdown());
 
-    // We need to Reset the session to ensure that all the variables are
-    // deleted. But first we need to delete the session since Reset()
-    // deletes some of the containers referenced by the session.
+    // Destroying the object deletes all its varibles as well. This is only true
+    // for DirectSession.
     session_.reset();
-    TF_RETURN_IF_ERROR(Reset(options_, {}));
   }
 
   LOG(INFO) << "Starting new session";
diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc
index 84e796c96016efd64b9bfc9dbdef509cd2f99658..d8660e11a28785b01fc59f6d0fed21f0c9fd4885 100644
--- a/tensorflow/core/grappler/clusters/single_machine_test.cc
+++ b/tensorflow/core/grappler/clusters/single_machine_test.cc
@@ -15,14 +15,19 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/clusters/single_machine.h"
 #include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/resource_variable_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/queue_runner.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -348,6 +353,7 @@ TEST_F(SingleMachineTest, InitializationMemory) {
 }
 
 namespace {
+
 template <class T>
 inline void SetNodeAttr(const string& key, const T& value, NodeDef* node) {
   AttrValue attr_value;
@@ -462,6 +468,124 @@ TEST_F(SingleMachineTest, PersistentMemory) {
   EXPECT_TRUE(found_hashtable);
 }
 
+#if defined(PLATFORM_GOOGLE)
+namespace {
+
+SessionOptions GetSessionOption(int num_cpu_cores, int num_gpus) {
+  SessionOptions options;
+  // Copied from single_machine.h
+  (*options.config.mutable_device_count())["CPU"] = 1;
+  if (num_gpus > 0) {
+    (*options.config.mutable_device_count())["GPU"] = num_gpus;
+  }
+  CHECK_GE(num_cpu_cores, 1);
+  options.config.set_intra_op_parallelism_threads(num_cpu_cores);
+  options.config.add_session_inter_op_thread_pool()->set_num_threads(
+      num_cpu_cores);
+  return options;
+}
+
+Status GetDeviceMemoryStats(
+    const SessionOptions& session_option,
+    std::unordered_map<string, AllocatorStats>* allocator_stats_by_device) {
+  std::vector<Device*> devices;
+  TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(session_option,
+                                               "" /* name_prefix */, &devices));
+  allocator_stats_by_device->clear();
+  for (Device* device : devices) {
+    AllocatorStats stats;
+    auto* allocator = device->GetAllocator(AllocatorAttributes());
+    if (!allocator->TracksAllocationSizes()) {
+      return Status(error::INVALID_ARGUMENT,
+                    "Tracking allocation is not enabled.");
+    }
+    allocator->GetStats(&stats);
+    (*allocator_stats_by_device)[device->name()] = stats;
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+TEST_F(SingleMachineTest, ReleaseMemoryAfterDestruction) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  // Add a variable and initializer.
+  Output a = ops::Variable(s.WithOpName("a"), TensorShape({128, 256}),
+                           DataType::DT_FLOAT);
+  Output a_init =
+      ops::RandomNormal(s.WithOpName("a/init"), {128, 256}, DataType::DT_FLOAT);
+  Output a_init_assign = ops::Assign(s.WithOpName("a/init/assign"), a, a_init);
+
+  // Add a resource variable.
+  Output b =
+      ops::VarHandleOp(s.WithOpName("b"), DataType::DT_FLOAT, {256, 512});
+  Output b_read =
+      ops::ReadVariableOp(s.WithOpName("b/read"), b, DataType::DT_FLOAT);
+  Output b_init =
+      ops::RandomNormal(s.WithOpName("b/init"), {256, 512}, DataType::DT_FLOAT);
+  auto b_init_assign =
+      ops::AssignVariableOp(s.WithOpName("b/init/assign"), b, b_init);
+
+  // Add a queue.
+  ops::FIFOQueue queue(s.WithOpName("queue"), {DataType::DT_STRING});
+  Output some_string =
+      ops::Const(s.WithOpName("some_string"), string("nothing"));
+  ops::QueueEnqueue enqueue(s.WithOpName("enqueue"), queue, {some_string});
+  ops::QueueDequeue dequeue(s.WithOpName("dequeue"), queue,
+                            {DataType::DT_STRING});
+
+  // Add a IdentityReader.
+  ops::IdentityReader reader(s.WithOpName("identity_reader"));
+  ops::ReaderRead read(s.WithOpName("read_from_queue"), reader, queue);
+
+  Output var_mul = ops::MatMul(s.WithOpName("var_matmul"), a, b_read);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  QueueRunnerDef queue_runner;
+  queue_runner.set_queue_name("queue");
+  *queue_runner.add_enqueue_op_name() = "enqueue";
+  item.queue_runners.push_back(queue_runner);
+
+  item.init_ops.push_back("a/init/assign");
+  item.init_ops.push_back("b/init/assign");
+  item.fetch.push_back("var_matmul");
+  item.fetch.push_back("dequeue");
+
+  // Run the graph
+  TF_CHECK_OK(cluster_->Initialize(item));
+  EnableCPUAllocatorStats(true);
+
+  SessionOptions options =
+      GetSessionOption(3 /* cpu cores */, 0 /* num gpus */);
+  std::unordered_map<string, AllocatorStats> device_memory_before;
+  TF_CHECK_OK(GetDeviceMemoryStats(options, &device_memory_before));
+  EXPECT_EQ(device_memory_before.size(), 1);
+
+  RunMetadata metadata;
+  TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata));
+
+  // Check there is memory that is not released.
+  std::unordered_map<string, AllocatorStats> device_memory;
+  TF_CHECK_OK(GetDeviceMemoryStats(options, &device_memory));
+  EXPECT_EQ(device_memory.size(), 1);
+  EXPECT_GT(device_memory.begin()->second.bytes_in_use, 0);
+
+  // Reset cluster_ would release all memory.
+  cluster_.reset();
+  std::unordered_map<string, AllocatorStats> device_memory_after;
+  TF_CHECK_OK(GetDeviceMemoryStats(options, &device_memory_after));
+
+  // Check memory used by resources are released after cluster destruction.
+  EXPECT_EQ(device_memory_before.size(), 1);
+  EXPECT_EQ(device_memory_after.size(), 1);
+  EXPECT_EQ(device_memory_before.begin()->second.bytes_in_use, 0);
+  EXPECT_EQ(device_memory_after.begin()->second.bytes_in_use, 0);
+}
+#endif
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.cc b/tensorflow/core/grappler/clusters/virtual_cluster.cc
index 4ca4c03dbb6dd7c2c578b0d86de2ecbe16f8e652..e717f6e761f22a62d099e0f8a1da5080728a7abc 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster.cc
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.cc
@@ -14,16 +14,28 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/framework/cost_graph.pb.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
+#include "tensorflow/core/grappler/costs/virtual_scheduler.h"
 
 namespace tensorflow {
 namespace grappler {
 
 VirtualCluster::VirtualCluster(
     const std::unordered_map<string, DeviceProperties>& devices)
-    : Cluster(0) {
+    : Cluster(0), node_estimator_(new OpLevelCostEstimator()) {
   devices_ = devices;
 }
 
+VirtualCluster::VirtualCluster(
+    const std::unordered_map<string, DeviceProperties>& devices,
+    OpLevelCostEstimator* node_estimator)
+    : Cluster(0), node_estimator_(node_estimator) {
+  devices_ = devices;
+}
 VirtualCluster::~VirtualCluster() {}
 
 Status VirtualCluster::Provision() { return Status::OK(); }
@@ -32,12 +44,61 @@ Status VirtualCluster::Initialize(const GrapplerItem& item) {
   return Status::OK();
 }
 
-Status VirtualCluster::Run(const GraphDef& item,
+Status VirtualCluster::Run(const GraphDef& graph,
                            const std::vector<std::pair<string, Tensor>>& feed,
                            const std::vector<string>& fetch,
                            RunMetadata* metadata) {
-  return Status::OK();
+  // Initialize a virtual scheduler to process the graph. Make sure to use
+  // static shape inference to prevent the schedulrer from calling the Run
+  // method on the cluster, and create an infinite loop.
+  GrapplerItem item;
+  item.graph = graph;
+  item.feed = feed;
+  item.fetch = fetch;
+  VirtualScheduler scheduler(&item, true, this);
+  TF_RETURN_IF_ERROR(scheduler.Init());
 
+  if (metadata) {
+    metadata->clear_step_stats();
+    metadata->clear_cost_graph();
+    metadata->clear_partition_graphs();
+  }
+
+  Costs node_costs;
+  do {
+    NodeInfo node_info = scheduler.GetCurrNodeInfo();
+    const auto& op_info = node_info.op_info;
+    node_costs = node_estimator_->PredictCosts(op_info);
+    if (metadata) {
+      CostGraphDef::Node* cost_node =
+          metadata->mutable_cost_graph()->add_node();
+      const string& op_name = node_info.name;
+      cost_node->set_name(op_name);
+      cost_node->set_device(node_info.device_name);
+      cost_node->set_compute_cost(
+          node_costs.execution_time.asMicroSeconds().count());
+      cost_node->set_compute_time(
+          node_costs.compute_time.asMicroSeconds().count());
+      cost_node->set_memory_time(
+          node_costs.memory_time.asMicroSeconds().count());
+      for (const auto& output : node_info.op_info.outputs()) {
+        auto output_info = cost_node->add_output_info();
+        output_info->set_dtype(output.dtype());
+        *output_info->mutable_shape() = output.shape();
+
+        int64 size = DataTypeSize(output.dtype());
+        for (const auto& dim : output.shape().dim()) {
+          size *= std::max<int64>(1, dim.size());
+        }
+        output_info->set_size(size);
+      }
+    }
+  } while (scheduler.MarkCurrNodeExecuted(node_costs));
+
+  if (metadata) {
+    scheduler.Summary(metadata);
+  }
+  return Status::OK();
 }
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.h b/tensorflow/core/grappler/clusters/virtual_cluster.h
index cd8436a9870e97457b67474870ad6b46215cf9ee..a74911cb23a3fcdb8f41de624c4e5c9a01602577 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster.h
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.h
@@ -18,18 +18,20 @@ limitations under the License.
 
 #include <unordered_map>
 #include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
 
 namespace tensorflow {
 namespace grappler {
 
 // Create a simple cluster that lists the devices (and their properties)
-// available in a TensorFlow session. This cluster doesn't allow running an
-// actual graph. It is useful however when used in conjusction with costs models
-// that aren't based on the execution of the graph.
+// available in a TensorFlow session. This cluster simulates the execution of
+// actual graphs.
 class VirtualCluster : public Cluster {
  public:
   VirtualCluster(const std::unordered_map<string, DeviceProperties>& devices);
+  VirtualCluster(const std::unordered_map<string, DeviceProperties>& devices,
+                 OpLevelCostEstimator* node_estimator);
 
   ~VirtualCluster() override;
 
@@ -38,6 +40,9 @@ class VirtualCluster : public Cluster {
   Status Run(const GraphDef& item,
              const std::vector<std::pair<string, Tensor>>& feed,
              const std::vector<string>& fetch, RunMetadata* metadata) override;
+
+ private:
+  std::unique_ptr<OpLevelCostEstimator> node_estimator_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster_test.cc b/tensorflow/core/grappler/clusters/virtual_cluster_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ec21f5f4260d86129b63158d0d389052a8d7e82f
--- /dev/null
+++ b/tensorflow/core/grappler/clusters/virtual_cluster_test.cc
@@ -0,0 +1,96 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/framework/cost_graph.pb.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class VirtualClusterTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    // Invent a CPU so that predictions remain the same from machine to machine.
+    DeviceProperties cpu_device;
+    cpu_device.set_type("CPU");
+    cpu_device.set_frequency(1000);
+    cpu_device.set_num_cores(4);
+    cpu_device.set_bandwidth(32);
+    cpu_device.set_l1_cache_size(32 * 1024);
+    cpu_device.set_l2_cache_size(256 * 1024);
+    cpu_device.set_l3_cache_size(4 * 1024 * 1024);
+    std::unordered_map<string, DeviceProperties> devices;
+    devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
+    cluster_.reset(new VirtualCluster(devices));
+    TF_CHECK_OK(cluster_->Provision());
+  }
+
+  void TearDown() override { cluster_.reset(); }
+
+ protected:
+  std::unique_ptr<VirtualCluster> cluster_;
+};
+
+TEST_F(VirtualClusterTest, CostModel) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false,
+                                          cluster_->GetDeviceNames());
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  TF_CHECK_OK(cluster_->Initialize(item));
+
+  RunMetadata metadata;
+  TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata));
+
+  // There should be at least 4 nodes corresponding to the 4 stages we created
+  // in the fake input.
+  EXPECT_LE(4, metadata.cost_graph().node_size());
+  for (const auto& node : metadata.cost_graph().node()) {
+    // Skip the constant node that configures the random number generator.
+    if (node.name().find("Const/Const") != string::npos) {
+      continue;
+    }
+    EXPECT_EQ(1, node.output_info_size());
+    EXPECT_EQ(40, node.output_info(0).size());
+    const TensorShapeProto& shape = node.output_info(0).shape();
+    EXPECT_EQ(2, shape.dim_size());
+    EXPECT_EQ(10, shape.dim(0).size());
+    EXPECT_EQ(1, shape.dim(1).size());
+    if (node.name() == "x") {
+      EXPECT_EQ(1500, node.compute_cost());
+    } else {
+      EXPECT_EQ(2500, node.compute_cost());
+    }
+  }
+
+  for (const auto& dev_stat : metadata.step_stats().dev_stats()) {
+    EXPECT_EQ("/job:localhost/replica:0/task:0/cpu:0", dev_stat.device());
+    for (const auto& node : dev_stat.node_stats()) {
+      if (node.node_name() == "AddN") {
+        EXPECT_EQ(2500, node.op_end_rel_micros());
+      }
+    }
+  }
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index 2b30facd84d92cac03936f0c3e9274d84c494375..37623f8997201f62fec729756fd80c062de37e6a 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -42,6 +42,7 @@ cc_library(
         ":op_performance_data_cc",
         ":utils",
         "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler/clusters:cluster",
@@ -59,6 +60,7 @@ cc_test(
         "//tensorflow/cc:scope",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -160,7 +162,7 @@ cc_test(
     srcs = ["virtual_placer_test.cc"],
     deps = [
         ":virtual_placer",
-        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -195,6 +197,7 @@ cc_test(
         ":virtual_placer",
         ":virtual_scheduler",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -231,6 +234,7 @@ cc_library(
         ":cost_estimator",
         ":op_performance_data_cc",
         "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler/clusters:utils",
         "//third_party/eigen3",
     ],
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
index e530f66415be2f209bcff9c0c2c8793eab390e3d..569efaf96d68ab4641c8a03c807f31ae57c8bb08 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <limits>
 #include <unordered_map>
 
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/graph/types.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 21b73b6618dad9ac828d3f47f544868417824c77..e29f32b27022995751d7d07833b1da9384a3d7bc 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -32,9 +32,108 @@ using shape_inference::ShapeHandle;
 
 namespace {
 
-// Merges shapes <shapes_and_types>, determined from an EnqueueV2 node, into
-// <*queue_shapes_and_types>.
-Status MergeEnqueueShapesAndTypes(
+// If a Merge node has a NextIteration node as an input then that input will
+// try to forward an UnknownShape at graph construction time. However, the
+// Merge shape function will always propagate an UnknownShape if any of its
+// inputs are UnknownShapes. So we need to ignore the input from NextIteration
+// nodes to propagate any known shape from the Merge node.
+Status ShapeOfMergeNode(const Node* node, InferenceContext* c) {
+  ShapeHandle out = c->input(0);
+  if (!c->RankKnown(out)) {
+    out = c->UnknownShape();
+  } else {
+    int32 rank = c->Rank(out);
+    for (const Edge* e : node->in_edges()) {
+      if (e->src()->IsNextIteration() || e->dst_input() <= 0) {
+        continue;
+      }
+      ShapeHandle input = c->input(e->dst_input());
+      if (!c->RankKnown(input) || c->Rank(input) != rank) {
+        out = c->UnknownShape();
+        break;
+      }
+
+      for (int d = 0; d < rank; ++d) {
+        if (c->Value(c->Dim(input, d)) != c->Value(c->Dim(out, d))) {
+          TF_RETURN_IF_ERROR(c->ReplaceDim(out, d, c->UnknownDim(), &out));
+        }
+      }
+    }
+  }
+  c->set_output(0, out);
+  c->set_output(1, c->Scalar());
+  return Status::OK();
+}
+
+// Manually propagate the input shape for Enter nodes and update any Merge node
+// outputs.
+Status UpdateEnter(ShapeRefiner* shape_refiner, const Node* node, bool relax,
+                   std::queue<const Node*>* new_shapes) {
+  auto enter_ctx = shape_refiner->GetContext(node);
+  for (int i = 0; i < enter_ctx->num_outputs(); i++) {
+    TF_RETURN_IF_ERROR(shape_refiner->SetShape(node, i, enter_ctx->input(0)));
+  }
+  for (const Edge* e : node->out_edges()) {
+    Node* dst = e->dst();
+    if (dst->IsMerge()) {
+      bool updated = false;
+      TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(dst, relax, &updated));
+      if (!updated) {
+        continue;
+      }
+      InferenceContext* merge_ctx = shape_refiner->GetContext(dst);
+      DCHECK_NE(merge_ctx, nullptr);
+      TF_RETURN_IF_ERROR(ShapeOfMergeNode(dst, merge_ctx));
+      new_shapes->push(dst);
+    }
+  }
+  return Status::OK();
+}
+
+// Propagates the shapes in the transitive fan-out of <new_shapes>.
+Status PropagateShapes(ShapeRefiner* shape_refiner, bool relax,
+                       std::queue<const Node*>* new_shapes) {
+  while (!new_shapes->empty()) {
+    const Node* n = new_shapes->front();
+    new_shapes->pop();
+    for (const Node* fanout : n->out_nodes()) {
+      bool updated = false;
+      TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(fanout, relax, &updated));
+      if (fanout->IsEnter()) {
+        TF_RETURN_IF_ERROR(
+            UpdateEnter(shape_refiner, fanout, relax, new_shapes));
+      } else if (updated) {
+        // We want to avoid propagating through loops on the merge pass because
+        // the shapes are not guaranteed to converge.
+        if (!relax && fanout->IsNextIteration()) {
+          continue;
+        }
+        new_shapes->push(fanout);
+      }
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+void GraphProperties::Relax(InferenceContext* c, ShapeHandle s0, ShapeHandle s1,
+                            ShapeHandle* out) {
+  c->Relax(s0, s1, out);
+}
+
+bool GraphProperties::SameDefinedShape(InferenceContext* c, ShapeHandle s0,
+                                       ShapeHandle s1) {
+  return ShapeRefiner::SameDefinedShape(c, s0, s1);
+}
+
+bool GraphProperties::IsUpdatedShapesOrTypes(
+    InferenceContext* c, const std::vector<ShapeAndType>& existing,
+    const std::vector<ShapeAndType>& updated) {
+  return ShapeRefiner::IsUpdatedShapesOrTypes(c, existing, updated);
+}
+
+Status GraphProperties::MergeEnqueueShapesAndTypes(
     const std::vector<ShapeAndType>& shapes_and_types, InferenceContext* qctx,
     std::vector<ShapeAndType>* queue_shapes_and_types) {
   if (shapes_and_types.size() != queue_shapes_and_types->size()) {
@@ -42,7 +141,7 @@ Status MergeEnqueueShapesAndTypes(
         "Enqueue nodes mixed number of tensors: ", shapes_and_types.size(),
         "  vs ", queue_shapes_and_types->size());
   }
-  for (int i = 0; i < shapes_and_types.size(); ++i) {
+  for (size_t i = 0; i < shapes_and_types.size(); ++i) {
     const ShapeAndType& a = shapes_and_types[i];
     ShapeAndType& b = (*queue_shapes_and_types)[i];
     if (a.dtype != b.dtype) {
@@ -56,18 +155,41 @@ Status MergeEnqueueShapesAndTypes(
   return Status::OK();
 }
 
-}  // namespace
+Status GraphProperties::RelaxEnqueueShapesAndMergeTypes(
+    const std::vector<ShapeAndType>& shapes_and_types, InferenceContext* qctx,
+    std::vector<ShapeAndType>* queue_shapes_and_types) {
+  if (shapes_and_types.size() != queue_shapes_and_types->size()) {
+    return errors::InvalidArgument(
+        "Enqueue nodes mixed number of tensors: ", shapes_and_types.size(),
+        "  vs ", queue_shapes_and_types->size());
+  }
+  for (size_t i = 0; i < shapes_and_types.size(); ++i) {
+    const ShapeAndType& a = shapes_and_types[i];
+    ShapeAndType& b = (*queue_shapes_and_types)[i];
+    if (a.dtype != b.dtype) {
+      return errors::InvalidArgument("Enqueue nodes mixed dtypes for tensor ",
+                                     i, ": ", DataTypeString(a.dtype), " vs ",
+                                     DataTypeString(b.dtype));
+    }
+
+    Relax(qctx, a.shape, b.shape, &b.shape);
+  }
+  return Status::OK();
+}
 
 Status GraphProperties::InferStatically() {
   Graph graph(OpRegistry::Global());
-  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
   shape_refiner.set_require_shape_inference_fns(false);
   ImportGraphDefOptions options;
   Status s = ImportGraphDef(options, item_.graph, &graph, &shape_refiner);
   TF_RETURN_IF_ERROR(s);
 
-  // List the resources and the nodes using them
+  // List the resources and the nodes using them. Also collect the Enter and
+  // Merge nodes.
   std::unordered_map<const Node*, std::unordered_set<const Node*>> resources;
+  std::unordered_set<const Node*> enter_nodes;
+  std::unordered_set<const Node*> merge_nodes;
   for (const Node* const node : graph.nodes()) {
     for (int i = 0; i < node->num_inputs(); ++i) {
       if (node->input_type(i) == DataType::DT_RESOURCE) {
@@ -76,82 +198,146 @@ Status GraphProperties::InferStatically() {
         resources[resource].insert(node);
       }
     }
+    if (node->IsEnter()) {
+      enter_nodes.insert(node);
+    } else if (node->IsNextIteration()) {
+      for (const Node* output : node->out_nodes()) {
+        if (output->IsMerge()) {
+          merge_nodes.insert(output);
+        }
+      }
+    }
   }
 
-  // If we found a resource, try to propagate the shapes through it.
-  bool done = true;
-  do {
-    std::queue<const Node*> new_shapes;
-    for (const auto& resource_data : resources) {
-      const Node* qnode = resource_data.first;
-      StringPiece type(qnode->type_string());
-      if (!type.ends_with("QueueV2")) {
-        continue;
+  // Propagate the initial shapes of Enter nodes manually (the Enter shape
+  // function always forwards an UnknownShape).
+  std::queue<const Node*> new_shapes;
+  for (const Node* node : enter_nodes) {
+    TF_RETURN_IF_ERROR(
+        UpdateEnter(&shape_refiner, node, false /* relax */, &new_shapes));
+  }
+  TF_RETURN_IF_ERROR(
+      PropagateShapes(&shape_refiner, false /* relax */, &new_shapes));
+
+  // We propagate shapes through the graph in two phases. In the first phase, we
+  // exclusively merge shapes but we do not propagate shapes through loops. Then
+  // on the second phase, we exclusively relax shapes and propagate shapes
+  // through loops until reaching fixed point.
+  for (int relax = 0; relax < 2; relax++) {
+    // We don't update Merge nodes with the input of NextIteration nodes on the
+    // merge pass. So we do that at the beginning of the relax pass instead.
+    if (relax) {
+      bool updated = false;
+      for (const Node* node : merge_nodes) {
+        TF_RETURN_IF_ERROR(
+            shape_refiner.UpdateNode(node, false /* relax */, &updated));
       }
-      auto qctx = shape_refiner.GetContext(qnode);
-      if (!qctx) {
-        continue;
+    }
+
+    bool done = true;
+    do {
+      if (relax) {
+        // Propagate shapes through any loops in the graph by relaxing.
+        for (const Node* node : merge_nodes) {
+          new_shapes.push(node);
+        }
+        TF_RETURN_IF_ERROR(PropagateShapes(&shape_refiner, relax, &new_shapes));
       }
 
-      // Check to see if the shape is fully defined.
-      auto* queue_handle_data = qctx->output_handle_shapes_and_types(0);
-      if (queue_handle_data != nullptr) {
-        bool fully_defined = true;
-        for (const auto& shape_and_type : *queue_handle_data) {
-          if (!qctx->FullyDefined(shape_and_type.shape) ||
-              shape_and_type.dtype == DT_INVALID) {
-            fully_defined = false;
-          }
+      // If we found a resource, try to propagate the shapes through it.
+      new_shapes = std::queue<const Node*>();
+      for (const auto& resource_data : resources) {
+        const Node* qnode = resource_data.first;
+        StringPiece type(qnode->type_string());
+        if (!type.ends_with("QueueV2") && !qnode->IsEnter()) {
+          continue;
         }
-        if (fully_defined) {
+        auto qctx = shape_refiner.GetContext(qnode);
+        if (!qctx) {
           continue;
         }
-      }
 
-      std::vector<ShapeAndType> queue_shapes_and_types;
-      if (queue_handle_data != nullptr) {
-        queue_shapes_and_types = *queue_handle_data;
-      }
-      for (const auto& node : resource_data.second) {
-        auto ctx = shape_refiner.GetContext(node);
-        if (!ctx) {
-          continue;
+        // Check to see if the shape is fully defined.
+        auto* queue_handle_data = qctx->output_handle_shapes_and_types(0);
+        if (queue_handle_data != nullptr) {
+          bool fully_defined = true;
+          for (const auto& shape_and_type : *queue_handle_data) {
+            if (!qctx->FullyDefined(shape_and_type.shape) ||
+                shape_and_type.dtype == DT_INVALID) {
+              fully_defined = false;
+            }
+          }
+          // If we are merging, then we are done. If we are relaxing, then we
+          // could potentially propagate a less specific shape.
+          if (fully_defined && !relax) {
+            continue;
+          }
         }
-        // TODO(bsteiner): handle EnqueueMany as well.
-        if (node->type_string().find("Enqueue") != std::string::npos &&
-            node->type_string().find("EnqueueMany") == std::string::npos) {
-          std::vector<ShapeAndType> shapes_and_types;
-          for (int i = 1; i < ctx->num_inputs(); ++i) {
-            shapes_and_types.push_back({ctx->input(i), node->input_type(i)});
+
+        // Merge all inputs into the enqueue node, regardless of which phase we
+        // are in.
+        std::vector<ShapeAndType> queue_shapes_and_types;
+        for (const auto& node : resource_data.second) {
+          auto ctx = shape_refiner.GetContext(node);
+          if (!ctx) {
+            continue;
           }
+          // TODO(bsteiner): handle EnqueueMany as well.
+          if (node->type_string().find("Enqueue") != std::string::npos &&
+              node->type_string().find("EnqueueMany") == std::string::npos) {
+            std::vector<ShapeAndType> shapes_and_types;
+            for (int i = 1; i < ctx->num_inputs(); ++i) {
+              shapes_and_types.push_back({ctx->input(i), node->input_type(i)});
+            }
 
-          if (queue_shapes_and_types.empty()) {
-            queue_shapes_and_types = shapes_and_types;
+            if (queue_shapes_and_types.empty()) {
+              queue_shapes_and_types = shapes_and_types;
+            } else {
+              TF_RETURN_IF_ERROR(MergeEnqueueShapesAndTypes(
+                  shapes_and_types, qctx, &queue_shapes_and_types));
+            }
+          }
+        }
+        // Combine the input shapes with the existing output shape. We either
+        // merge or relax depending on which phase we are in.
+        if (queue_handle_data != nullptr) {
+          if (relax) {
+            TF_RETURN_IF_ERROR(RelaxEnqueueShapesAndMergeTypes(
+                *queue_handle_data, qctx, &queue_shapes_and_types));
           } else {
             TF_RETURN_IF_ERROR(MergeEnqueueShapesAndTypes(
-                shapes_and_types, qctx, &queue_shapes_and_types));
+                *queue_handle_data, qctx, &queue_shapes_and_types));
           }
         }
-      }
-      if (!queue_shapes_and_types.empty() &&
-          qctx->MergeOutputHandleShapesAndTypes(0, queue_shapes_and_types)) {
-        new_shapes.push(qnode);
-      }
-    }
-    // Propagate the shapes in the transitive fan-out of the queue.
-    done = new_shapes.empty();
-    while (!new_shapes.empty()) {
-      const Node* n = new_shapes.front();
-      new_shapes.pop();
-      for (const Node* fanout : n->out_nodes()) {
-        bool updated = false;
-        TF_RETURN_IF_ERROR(shape_refiner.UpdateNode(fanout, &updated));
-        if (updated) {
-          new_shapes.push(fanout);
+        // Set the output ShapeAndType handles. If we successfully update the
+        // resource node, add its fan-out to the queue.
+        const std::vector<ShapeAndType>* outputs =
+            qctx->output_handle_shapes_and_types(0);
+        std::vector<ShapeAndType> existing_outputs;
+        if (outputs) {
+          existing_outputs = *outputs;
+        }
+        if (!queue_shapes_and_types.empty()) {
+          if (!relax && qctx->MergeOutputHandleShapesAndTypes(
+                            0, queue_shapes_and_types)) {
+            new_shapes.push(qnode);
+          } else if (relax && qctx->RelaxOutputHandleShapesAndMergeTypes(
+                                  0, queue_shapes_and_types)) {
+            if (IsUpdatedShapesOrTypes(
+                    qctx, existing_outputs,
+                    *qctx->output_handle_shapes_and_types(0))) {
+              new_shapes.push(qnode);
+            }
+          }
         }
       }
-    }
-  } while (!done);
+      // Propagate the shapes in the transitive fan-out of the queue.
+      done = new_shapes.empty();
+      if (!done) {
+        TF_RETURN_IF_ERROR(PropagateShapes(&shape_refiner, relax, &new_shapes));
+      }
+    } while (!done);
+  }
 
   for (const Node* const node : graph.nodes()) {
     VLOG(1) << "<Node> " << node->name();
@@ -246,26 +432,30 @@ Status GraphProperties::InferFromCostGraph(const CostGraphDef& cost_graph) {
   return Status::OK();
 }
 
+bool GraphProperties::HasInputProperties(const string& name) const {
+  return input_properties_.find(name) != input_properties_.end();
+}
+
 bool GraphProperties::HasOutputProperties(const string& name) const {
   return output_properties_.find(name) != output_properties_.end();
 }
 
-std::vector<OpInfo::TensorProperties> GraphProperties::GetInputProperties(
-    const string& node_name) const {
+const std::vector<OpInfo::TensorProperties>&
+GraphProperties::GetInputProperties(const string& node_name) const {
   auto it = input_properties_.find(node_name);
   if (it != input_properties_.end()) {
     return it->second;
   }
-  return std::vector<OpInfo::TensorProperties>();
+  return missing_properties_;
 }
 
-std::vector<OpInfo::TensorProperties> GraphProperties::GetOutputProperties(
-    const string& node_name) const {
+const std::vector<OpInfo::TensorProperties>&
+GraphProperties::GetOutputProperties(const string& node_name) const {
   auto it = output_properties_.find(node_name);
   if (it != output_properties_.end()) {
     return it->second;
   }
-  return std::vector<OpInfo::TensorProperties>();
+  return missing_properties_;
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index b849c4b3f048343df1678b13cd0aa7f6fcb3bfdb..d4dc7161bf321548d32b8500be2b69c6655407b9 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <unordered_map>
 #include <vector>
+#include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -38,10 +39,11 @@ class GraphProperties {
   Status InferDynamically(Cluster* cluster);
   Status InferFromCostGraph(const CostGraphDef& cost_graph);
 
+  bool HasInputProperties(const string& name) const;
   bool HasOutputProperties(const string& name) const;
-  std::vector<OpInfo::TensorProperties> GetInputProperties(
+  const std::vector<OpInfo::TensorProperties>& GetInputProperties(
       const string& node_name) const;
-  std::vector<OpInfo::TensorProperties> GetOutputProperties(
+  const std::vector<OpInfo::TensorProperties>& GetOutputProperties(
       const string& node_name) const;
 
  private:
@@ -49,6 +51,35 @@ class GraphProperties {
   GrapplerItem item_;
   std::map<string, std::vector<OpInfo::TensorProperties>> input_properties_;
   std::map<string, std::vector<OpInfo::TensorProperties>> output_properties_;
+  const std::vector<OpInfo::TensorProperties> missing_properties_;
+
+  // Merges shapes <shapes_and_types>, determined from an EnqueueV2 node, into
+  // <*queue_shapes_and_types>.
+  Status MergeEnqueueShapesAndTypes(
+      const std::vector<shape_inference::ShapeAndType>& shapes_and_types,
+      shape_inference::InferenceContext* qctx,
+      std::vector<shape_inference::ShapeAndType>* queue_shapes_and_types);
+  // Relaxes shapes <shapes_and_types>, determined from an EnqueueV2 node, into
+  // <*queue_shapes_and_types>.
+  Status RelaxEnqueueShapesAndMergeTypes(
+      const std::vector<shape_inference::ShapeAndType>& shapes_and_types,
+      shape_inference::InferenceContext* qctx,
+      std::vector<shape_inference::ShapeAndType>* queue_shapes_and_types);
+
+  // This gives access to private function of InferenceContext.
+  static void Relax(shape_inference::InferenceContext* c,
+                    shape_inference::ShapeHandle s0,
+                    shape_inference::ShapeHandle s1,
+                    shape_inference::ShapeHandle* out);
+
+  // These give access to private functions of ShapeRefiner.
+  static bool SameDefinedShape(shape_inference::InferenceContext* c,
+                               shape_inference::ShapeHandle s0,
+                               shape_inference::ShapeHandle s1);
+  static bool IsUpdatedShapesOrTypes(
+      shape_inference::InferenceContext* c,
+      const std::vector<shape_inference::ShapeAndType>& existing,
+      const std::vector<shape_inference::ShapeAndType>& updated);
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 10a88b59a2f59f147530ffde9acd95fdfca3e6f0..109f973956e8e89906e422e8e0c3a1d640fa1c53 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/clusters/single_machine.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -309,38 +310,2603 @@ TEST_F(GraphPropertiesTest, Queues) {
   EXPECT_EQ("float: [1,2,3]", PropToString(props5[2]));
 }
 
-TEST_F(GraphPropertiesTest, Loops) {
+TEST_F(GraphPropertiesTest, MergeWithoutLoops) {
+  // Python code used to generate the graph is below.
+  const string gdef_ascii = R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 7
+      }
+    }
+  }
+}
+node {
+  name: "Const_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 5
+      }
+    }
+  }
+}
+node {
+  name: "ones"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "Less"
+  op: "Less"
+  input: "Const"
+  input: "Const_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "cond/Switch"
+  op: "Switch"
+  input: "Less"
+  input: "Less"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+}
+node {
+  name: "cond/switch_t"
+  op: "Identity"
+  input: "cond/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+}
+node {
+  name: "cond/switch_f"
+  op: "Identity"
+  input: "cond/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+}
+node {
+  name: "cond/pred_id"
+  op: "Identity"
+  input: "Less"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+}
+node {
+  name: "cond/concat/axis"
+  op: "Const"
+  input: "^cond/switch_t"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "cond/concat/Switch"
+  op: "Switch"
+  input: "ones"
+  input: "cond/pred_id"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@ones"
+      }
+    }
+  }
+}
+node {
+  name: "cond/concat"
+  op: "ConcatV2"
+  input: "cond/concat/Switch:1"
+  input: "cond/concat/Switch:1"
+  input: "cond/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "cond/concat_1/axis"
+  op: "Const"
+  input: "^cond/switch_f"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "cond/concat_1/Switch"
+  op: "Switch"
+  input: "ones"
+  input: "cond/pred_id"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@ones"
+      }
+    }
+  }
+}
+node {
+  name: "cond/concat_1"
+  op: "ConcatV2"
+  input: "cond/concat_1/Switch"
+  input: "cond/concat_1/Switch"
+  input: "cond/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "cond/Merge"
+  op: "Merge"
+  input: "cond/concat"
+  input: "cond/concat_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "cond/Merge"
+  input: "cond/Merge"
+  input: "concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+versions {
+  producer: 21
+}
+  )EOF";
+
+  // Test graph produced in python using:
+  /*
+    with tf.Graph().as_default():
+      x = tf.constant(2)
+      y = tf.constant(5)
+      z = tf.ones([1,1,1])
+      def f1(): return tf.concat([z, z], axis=0)
+      def f2(): return tf.concat([z, z], axis=1)
+      r = tf.cond(tf.less(x, y), f1, f2)
+      tf.concat([r, r], axis=2)
+      with open('/tmp/graph.pbtxt', 'w') as f:
+        f.write(str(tf.get_default_graph().as_graph_def()))
+   */
+
+  GrapplerItem item;
+  CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &item.graph));
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically());
+
+  std::vector<string> nodes{"cond/Merge", "cond/concat", "cond/concat_1"};
+  std::vector<string> expected_outputs{"float: [-1,-1,1]", "float: [2,1,1]",
+                                       "float: [1,2,1]"};
+  for (int i = 0; i < nodes.size(); i++) {
+    const auto props = properties.GetOutputProperties(nodes[i]);
+    const OpInfo::TensorProperties& prop = props[0];
+    EXPECT_EQ(DT_FLOAT, prop.dtype());
+    EXPECT_EQ(expected_outputs[i], PropToString(prop));
+  }
+}
+
+TEST_F(GraphPropertiesTest, WhileLoop) {
+  // Python code used to generate the graph is below.
+  const string gdef_ascii = R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "ones"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 2
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "while/Enter"
+  op: "Enter"
+  input: "Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Enter_1"
+  op: "Enter"
+  input: "ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Merge"
+  op: "Merge"
+  input: "while/Enter"
+  input: "while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Merge_1"
+  op: "Merge"
+  input: "while/Enter_1"
+  input: "while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/Less/y"
+  op: "Const"
+  input: "^while/Merge"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "while/Less"
+  op: "Less"
+  input: "while/Merge"
+  input: "while/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/LoopCond"
+  op: "LoopCond"
+  input: "while/Less"
+}
+node {
+  name: "while/Switch"
+  op: "Switch"
+  input: "while/Merge"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge"
+      }
+    }
+  }
+}
+node {
+  name: "while/Switch_1"
+  op: "Switch"
+  input: "while/Merge_1"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge_1"
+      }
+    }
+  }
+}
+node {
+  name: "while/Identity"
+  op: "Identity"
+  input: "while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Identity_1"
+  op: "Identity"
+  input: "while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/add/y"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/add"
+  op: "Add"
+  input: "while/Identity"
+  input: "while/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/concat/axis"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/concat"
+  op: "ConcatV2"
+  input: "while/Identity_1"
+  input: "while/Identity_1"
+  input: "while/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration"
+  op: "NextIteration"
+  input: "while/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration_1"
+  op: "NextIteration"
+  input: "while/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/Exit"
+  op: "Exit"
+  input: "while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Exit_1"
+  op: "Exit"
+  input: "while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+versions {
+  producer: 21
+}
+  )EOF";
+
+  // Test graph produced in python using:
+  /*
+     with tf.Graph().as_default():
+       i0 = tf.constant(0)
+       m0 = tf.ones([2, 2])
+       c = lambda i, m: i < 10
+       b = lambda i, m: [i+1, tf.concat([m, m], axis=0)]
+       r = tf.while_loop(
+              c, b, loop_vars=[i0, m0],
+              shape_invariants=[i0.get_shape(), tf.TensorShape([None, 2])])
+       with open('/tmp/graph.pbtxt', 'w') as f:
+         f.write(str(tf.get_default_graph().as_graph_def()))
+  */
+
+  GrapplerItem item;
+  CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &item.graph));
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically());
+
+  std::vector<string> nodes{"while/Merge_1", "while/NextIteration_1",
+                            "while/Exit_1"};
+  for (const string& node : nodes) {
+    const auto props = properties.GetOutputProperties(node);
+    const OpInfo::TensorProperties& prop = props[0];
+    EXPECT_EQ(DT_FLOAT, prop.dtype());
+    EXPECT_EQ("float: [-1,2]", PropToString(prop));
+  }
+}
+
+TEST_F(GraphPropertiesTest, NestedLoop) {
+  // Python code used to generate the graph is below.
+  const string gdef_ascii = R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "ones"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "while/Enter"
+  op: "Enter"
+  input: "Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Enter_1"
+  op: "Enter"
+  input: "ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Merge"
+  op: "Merge"
+  input: "while/Enter"
+  input: "while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Merge_1"
+  op: "Merge"
+  input: "while/Enter_1"
+  input: "while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/Less/y"
+  op: "Const"
+  input: "^while/Merge"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "while/Less"
+  op: "Less"
+  input: "while/Merge"
+  input: "while/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/LoopCond"
+  op: "LoopCond"
+  input: "while/Less"
+}
+node {
+  name: "while/Switch"
+  op: "Switch"
+  input: "while/Merge"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge"
+      }
+    }
+  }
+}
+node {
+  name: "while/Switch_1"
+  op: "Switch"
+  input: "while/Merge_1"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge_1"
+      }
+    }
+  }
+}
+node {
+  name: "while/Identity"
+  op: "Identity"
+  input: "while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Identity_1"
+  op: "Identity"
+  input: "while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/while/Const"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/while/Enter"
+  op: "Enter"
+  input: "while/while/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/while/Enter_1"
+  op: "Enter"
+  input: "while/Identity_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/while/Merge"
+  op: "Merge"
+  input: "while/while/Enter"
+  input: "while/while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/while/Merge_1"
+  op: "Merge"
+  input: "while/while/Enter_1"
+  input: "while/while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/while/Less/y"
+  op: "Const"
+  input: "^while/while/Merge"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "while/while/Less"
+  op: "Less"
+  input: "while/while/Merge"
+  input: "while/while/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/while/LoopCond"
+  op: "LoopCond"
+  input: "while/while/Less"
+}
+node {
+  name: "while/while/Switch"
+  op: "Switch"
+  input: "while/while/Merge"
+  input: "while/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/while/Merge"
+      }
+    }
+  }
+}
+node {
+  name: "while/while/Switch_1"
+  op: "Switch"
+  input: "while/while/Merge_1"
+  input: "while/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/while/Merge_1"
+      }
+    }
+  }
+}
+node {
+  name: "while/while/Identity"
+  op: "Identity"
+  input: "while/while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/while/Identity_1"
+  op: "Identity"
+  input: "while/while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/while/add/y"
+  op: "Const"
+  input: "^while/while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/while/add"
+  op: "Add"
+  input: "while/while/Identity"
+  input: "while/while/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/while/concat/axis"
+  op: "Const"
+  input: "^while/while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "while/while/concat"
+  op: "ConcatV2"
+  input: "while/while/Identity_1"
+  input: "while/while/Identity_1"
+  input: "while/while/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/while/NextIteration"
+  op: "NextIteration"
+  input: "while/while/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/while/NextIteration_1"
+  op: "NextIteration"
+  input: "while/while/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/while/Exit"
+  op: "Exit"
+  input: "while/while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/while/Exit_1"
+  op: "Exit"
+  input: "while/while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/add/y"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/add"
+  op: "Add"
+  input: "while/Identity"
+  input: "while/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/concat/axis"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/concat"
+  op: "ConcatV2"
+  input: "while/Identity_1"
+  input: "while/Identity_1"
+  input: "while/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration"
+  op: "NextIteration"
+  input: "while/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration_1"
+  op: "NextIteration"
+  input: "while/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/Exit"
+  op: "Exit"
+  input: "while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Exit_1"
+  op: "Exit"
+  input: "while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+versions {
+  producer: 21
+}
+  )EOF";
+
+  // Test graph produced in python using:
+  /*
+    with tf.Graph().as_default():
+      i0 = tf.constant(0)
+
+      def inner(j, y):
+        def inner_cond(j, y):
+          return j < 3
+
+        def inner_body(j, y):
+          return j+1, tf.concat([y, y], axis=2)
+
+        return tf.while_loop(inner_cond, inner_body, loop_vars=[j, y],
+                             shape_invariants=[i0.get_shape(),
+                                              tf.TensorShape([None, 1, None])])
+
+      def outer_cond(i, x):
+        return i < 3
+
+      def outer_body(i, x):
+        j, y = inner(0, x)
+        return i+1, tf.concat([x, x], axis=0)
+
+      r = tf.while_loop(outer_cond, outer_body,
+                        loop_vars=[i0, tf.ones([1, 1, 1])],
+                        shape_invariants=[i0.get_shape(),
+                                          tf.TensorShape([None, 1, None])])
+
+      with open('/tmp/graph.pbtxt', 'w') as f:
+        f.write(str(tf.get_default_graph().as_graph_def()))
+  */
+
+  GrapplerItem item;
+  CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &item.graph));
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically());
+
+  std::vector<string> outer_nodes{"while/Merge_1", "while/NextIteration_1",
+                                  "while/Exit_1"};
+  std::vector<string> inner_nodes{"while/while/Merge_1",
+                                  "while/while/NextIteration_1",
+                                  "while/while/Exit_1"};
+  for (const string& node : outer_nodes) {
+    const auto props = properties.GetOutputProperties(node);
+    const OpInfo::TensorProperties& prop = props[0];
+    EXPECT_EQ(DT_FLOAT, prop.dtype());
+    EXPECT_EQ("float: [-1,1,1]", PropToString(prop));
+  }
+  for (const string& node : inner_nodes) {
+    const auto props = properties.GetOutputProperties(node);
+    const OpInfo::TensorProperties& prop = props[0];
+    EXPECT_EQ(DT_FLOAT, prop.dtype());
+    EXPECT_EQ("float: [-1,1,-1]", PropToString(prop));
+  }
+}
+
+TEST_F(GraphPropertiesTest, LoopsAndQueues) {
+  // Python code used to generate the graph is below.
+  const string gdef_ascii = R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "fifo_queue"
+  op: "FIFOQueueV2"
+  attr {
+    key: "capacity"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "ones"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "while/Enter"
+  op: "Enter"
+  input: "Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Enter_1"
+  op: "Enter"
+  input: "ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Merge"
+  op: "Merge"
+  input: "while/Enter"
+  input: "while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Merge_1"
+  op: "Merge"
+  input: "while/Enter_1"
+  input: "while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/Less/y"
+  op: "Const"
+  input: "^while/Merge"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "while/Less"
+  op: "Less"
+  input: "while/Merge"
+  input: "while/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/LoopCond"
+  op: "LoopCond"
+  input: "while/Less"
+}
+node {
+  name: "while/Switch"
+  op: "Switch"
+  input: "while/Merge"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge"
+      }
+    }
+  }
+}
+node {
+  name: "while/Switch_1"
+  op: "Switch"
+  input: "while/Merge_1"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge_1"
+      }
+    }
+  }
+}
+node {
+  name: "while/Identity"
+  op: "Identity"
+  input: "while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Identity_1"
+  op: "Identity"
+  input: "while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/fifo_queue_enqueue/Enter"
+  op: "Enter"
+  input: "fifo_queue"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/fifo_queue_enqueue"
+  op: "QueueEnqueueV2"
+  input: "while/fifo_queue_enqueue/Enter"
+  input: "while/Identity_1"
+  attr {
+    key: "Tcomponents"
+    value {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "while/concat/axis"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "while/concat"
+  op: "ConcatV2"
+  input: "while/Identity_1"
+  input: "while/Identity_1"
+  input: "while/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/fifo_queue_Dequeue"
+  op: "QueueDequeueV2"
+  input: "while/fifo_queue_enqueue/Enter"
+  input: "^while/Identity"
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "while/while/Const"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/while/Enter"
+  op: "Enter"
+  input: "while/while/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/while/Enter_1"
+  op: "Enter"
+  input: "while/fifo_queue_Dequeue"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/while/Merge"
+  op: "Merge"
+  input: "while/while/Enter"
+  input: "while/while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/while/Merge_1"
+  op: "Merge"
+  input: "while/while/Enter_1"
+  input: "while/while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/while/Less/y"
+  op: "Const"
+  input: "^while/while/Merge"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "while/while/Less"
+  op: "Less"
+  input: "while/while/Merge"
+  input: "while/while/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/while/LoopCond"
+  op: "LoopCond"
+  input: "while/while/Less"
+}
+node {
+  name: "while/while/Switch"
+  op: "Switch"
+  input: "while/while/Merge"
+  input: "while/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/while/Merge"
+      }
+    }
+  }
+}
+node {
+  name: "while/while/Switch_1"
+  op: "Switch"
+  input: "while/while/Merge_1"
+  input: "while/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/while/Merge_1"
+      }
+    }
+  }
+}
+node {
+  name: "while/while/Identity"
+  op: "Identity"
+  input: "while/while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/while/Identity_1"
+  op: "Identity"
+  input: "while/while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/while/add/y"
+  op: "Const"
+  input: "^while/while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/while/add"
+  op: "Add"
+  input: "while/while/Identity"
+  input: "while/while/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/while/concat/axis"
+  op: "Const"
+  input: "^while/while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/while/concat"
+  op: "ConcatV2"
+  input: "while/while/Identity_1"
+  input: "while/while/Identity_1"
+  input: "while/while/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/while/NextIteration"
+  op: "NextIteration"
+  input: "while/while/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/while/NextIteration_1"
+  op: "NextIteration"
+  input: "while/while/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/while/Exit"
+  op: "Exit"
+  input: "while/while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/while/Exit_1"
+  op: "Exit"
+  input: "while/while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/add/y"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/add"
+  op: "Add"
+  input: "while/Identity"
+  input: "while/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration"
+  op: "NextIteration"
+  input: "while/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration_1"
+  op: "NextIteration"
+  input: "while/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/Exit"
+  op: "Exit"
+  input: "while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Exit_1"
+  op: "Exit"
+  input: "while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+versions {
+  producer: 21
+}
+  )EOF";
+
   // Test graph produced in python using:
   /*
-     with tf.Graph().as_default():
-       i = tf.constant(0)
-       c = lambda i: tf.less(i, 10)
-       b = lambda i: tf.add(i, 1)
-       r = tf.while_loop(c, b, [i])
-       with open('/tmp/graph.txt', 'w') as f:
-         f.write(str(tf.get_default_graph().as_graph_def()))
-  */
+    with tf.Graph().as_default():
+      i0 = tf.constant(0)
+      q = tf.FIFOQueue(1, "float")
+
+      def inner(j, y):
+        def inner_cond(j, y):
+          return j < 3
+
+        def inner_body(j, y):
+          return j+1, tf.concat([y, y], axis=0)
+
+        return tf.while_loop(inner_cond, inner_body,
+                             loop_vars=[j, y],
+                             shape_invariants=[i0.get_shape(),
+                                               tf.TensorShape(None)])
+
+      def outer_cond(i, x):
+        return i < 3
+
+      def outer_body(i, x):
+        q.enqueue(x)
+        y = tf.concat([x, x], axis=2)
+        inner(0, q.dequeue())
+        return i+1, y
+
+      i, z = tf.while_loop(outer_cond, outer_body,
+                           loop_vars=[i0, tf.ones([1, 1, 1])],
+                           shape_invariants=[i0.get_shape(),
+                                             tf.TensorShape([None, 1, None])])
+
+      with open('/tmp/graph.pbtxt', 'w') as f:
+        f.write(str(tf.get_default_graph().as_graph_def()))
+   */
+
+  GrapplerItem item;
+  CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &item.graph));
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically());
+
+  std::vector<string> outer_nodes{"while/Merge_1", "while/NextIteration_1",
+                                  "while/Exit_1"};
+  std::vector<string> inner_nodes{"while/while/Merge_1",
+                                  "while/while/NextIteration_1",
+                                  "while/while/Exit_1"};
+  for (const string& node : outer_nodes) {
+    const auto props = properties.GetOutputProperties(node);
+    const OpInfo::TensorProperties& prop = props[0];
+    EXPECT_EQ(DT_FLOAT, prop.dtype());
+    EXPECT_EQ("float: [1,1,-1]", PropToString(prop));
+  }
+  for (const string& node : inner_nodes) {
+    const auto props = properties.GetOutputProperties(node);
+    const OpInfo::TensorProperties& prop = props[0];
+    EXPECT_EQ(DT_FLOAT, prop.dtype());
+    EXPECT_EQ("float: [-1,1,-1]", PropToString(prop));
+  }
+}
+
+TEST_F(GraphPropertiesTest, QueuesAndLoops) {
+  // Python code used to generate the graph is below.
   const string gdef_ascii = R"EOF(
 node {
-  name: "Const"
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "fifo_queue"
+  op: "FIFOQueueV2"
+  attr {
+    key: "capacity"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "ones"
   op: "Const"
   attr {
     key: "dtype"
     value {
-      type: DT_INT32
+      type: DT_FLOAT
     }
   }
   attr {
     key: "value"
     value {
       tensor {
-        dtype: DT_INT32
+        dtype: DT_FLOAT
         tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 2
+          }
         }
-        int_val: 0
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "fifo_queue_enqueue"
+  op: "QueueEnqueueV2"
+  input: "fifo_queue"
+  input: "ones"
+  attr {
+    key: "Tcomponents"
+    value {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "fifo_queue_1"
+  op: "FIFOQueueV2"
+  attr {
+    key: "capacity"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "fifo_queue_Dequeue"
+  op: "QueueDequeueV2"
+  input: "fifo_queue"
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
       }
     }
   }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
 }
 node {
   name: "while/Enter"
@@ -371,6 +2937,35 @@ node {
     }
   }
 }
+node {
+  name: "while/Enter_1"
+  op: "Enter"
+  input: "fifo_queue_Dequeue"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
 node {
   name: "while/Merge"
   op: "Merge"
@@ -389,6 +2984,24 @@ node {
     }
   }
 }
+node {
+  name: "while/Merge_1"
+  op: "Merge"
+  input: "while/Enter_1"
+  input: "while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
 node {
   name: "while/Less/y"
   op: "Const"
@@ -448,6 +3061,26 @@ node {
     }
   }
 }
+node {
+  name: "while/Switch_1"
+  op: "Switch"
+  input: "while/Merge_1"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge_1"
+      }
+    }
+  }
+}
 node {
   name: "while/Identity"
   op: "Identity"
@@ -460,7 +3093,18 @@ node {
   }
 }
 node {
-  name: "while/Add/y"
+  name: "while/Identity_1"
+  op: "Identity"
+  input: "while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/add/y"
   op: "Const"
   input: "^while/Identity"
   attr {
@@ -482,12 +3126,59 @@ node {
   }
 }
 node {
-  name: "while/Add"
+  name: "while/add"
   op: "Add"
   input: "while/Identity"
-  input: "while/Add/y"
+  input: "while/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/concat/axis"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/concat"
+  op: "ConcatV2"
+  input: "while/Identity_1"
+  input: "while/Identity_1"
+  input: "while/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
   attr {
     key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
     value {
       type: DT_INT32
     }
@@ -496,7 +3187,7 @@ node {
 node {
   name: "while/NextIteration"
   op: "NextIteration"
-  input: "while/Add"
+  input: "while/add"
   attr {
     key: "T"
     value {
@@ -504,6 +3195,17 @@ node {
     }
   }
 }
+node {
+  name: "while/NextIteration_1"
+  op: "NextIteration"
+  input: "while/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
 node {
   name: "while/Exit"
   op: "Exit"
@@ -515,21 +3217,151 @@ node {
     }
   }
 }
+node {
+  name: "while/Exit_1"
+  op: "Exit"
+  input: "while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "fifo_queue_1_enqueue"
+  op: "QueueEnqueueV2"
+  input: "fifo_queue_1"
+  input: "while/Exit_1"
+  attr {
+    key: "Tcomponents"
+    value {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "fifo_queue_1_Dequeue"
+  op: "QueueDequeueV2"
+  input: "fifo_queue_1"
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "fifo_queue_1_Dequeue"
+  input: "fifo_queue_1_Dequeue"
+  input: "concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
 versions {
-  producer: 11
+  producer: 21
 }
   )EOF";
 
+  // Test graph produced in python using:
+  /*
+    with tf.Graph().as_default():
+      i0 = tf.constant(0)
+      q0 = tf.FIFOQueue(1, "float")
+      q0.enqueue(tf.ones([2, 2]))
+      q1 = tf.FIFOQueue(1, "float")
+
+      def c(i, m):
+        return i < 10
+
+      def b(i, m):
+        return i+1, tf.concat([m, m], axis=0)
+
+      i, m = tf.while_loop(
+          c, b, loop_vars=[i0,  q0.dequeue()],
+          shape_invariants=[i0.get_shape(), tf.TensorShape(None)])
+
+      q1.enqueue(m)
+      v = q1.dequeue();
+      tf.concat([v, v], axis=1)
+      with open('/tmp/graph.pbtxt', 'w') as f:
+        f.write(str(tf.get_default_graph().as_graph_def()))
+  */
+
   GrapplerItem item;
   CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &item.graph));
   GraphProperties properties(item);
   TF_CHECK_OK(properties.InferStatically());
 
-  const auto props = properties.GetOutputProperties("while/Exit");
-  EXPECT_EQ(1, props.size());
+  std::vector<string> nodes{"while/Merge_1", "while/NextIteration_1",
+                            "while/Exit_1"};
+
+  for (const string& node : nodes) {
+    const auto props = properties.GetOutputProperties(node);
+    const OpInfo::TensorProperties& prop = props[0];
+    EXPECT_EQ(DT_FLOAT, prop.dtype());
+    EXPECT_EQ("float: [-1,2]", PropToString(prop));
+  }
+
+  const auto props = properties.GetOutputProperties("concat");
   const OpInfo::TensorProperties& prop = props[0];
-  EXPECT_EQ(DT_INT32, prop.dtype());
-  EXPECT_TRUE(prop.shape().unknown_rank());
+  EXPECT_EQ(DT_FLOAT, prop.dtype());
+  EXPECT_EQ("float: [-1,4]", PropToString(prop));
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
index e4a0d6f1b860d5c21d062835ac968b808252e0b7..8fd1801863ad9aadd6e9f1bbde4b90600189d77c 100644
--- a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
@@ -101,6 +101,7 @@ Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
   }
 
   // Run "measurement_steps_" and measure the time.
+  VLOG(1) << "Number of measurement steps: " << measurement_steps_;
   if (measurement_threads_ > 0) {
     for (int i = 0; i < measurement_steps_; ++i) {
       thread_pool_->Schedule([i, &measurement_fn]() { measurement_fn(i); });
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index d8b8a12eb2942f25bfa5457ded40e4688143d321..f13b426b3ca518163ac8e6707e3da383729d66af 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -16,7 +16,9 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
 
 #include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/clusters/utils.h"
 
 namespace tensorflow {
@@ -29,12 +31,18 @@ constexpr char kConv2dBackPropInput[] = "Conv2DBackpropInput";
 constexpr char kMatMul[] = "MatMul";
 constexpr char kSparseMatMul[] = "SparseMatMul";
 constexpr char kIdentity[] = "Identity";
+constexpr char kRefIdentity[] = "RefIdentity";
 constexpr char kNoOp[] = "NoOp";
 constexpr char kReshape[] = "Reshape";
 constexpr char kRecv[] = "_Recv";
 constexpr char kBatchMatMul[] = "BatchMatMul";
 constexpr char kVariable[] = "Variable";
 constexpr char kVariableV2[] = "VariableV2";
+constexpr char kRank[] = "Rank";
+constexpr char kShape[] = "Shape";
+constexpr char kSize[] = "Size";
+constexpr char kStopGradient[] = "StopGradient";
+constexpr char kPreventGradient[] = "PreventGradient";
 
 namespace {
 
@@ -150,12 +158,18 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
       {kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)},
+      {kRefIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)},
+      {kStopGradient, wrap(&OpLevelCostEstimator::PredictNoOp)},
+      {kPreventGradient, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kReshape, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kRecv, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kVariable, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kVariableV2, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)}};
+      {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
+      {kRank, wrap(&OpLevelCostEstimator::PredictMetadata)},
+      {kShape, wrap(&OpLevelCostEstimator::PredictMetadata)},
+      {kSize, wrap(&OpLevelCostEstimator::PredictMetadata)}};
 
   elementwise_ops_ = {
       // Unary ops alphabetically sorted
@@ -254,6 +268,9 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
                           Eigen::internal::scalar_quotient_op<float>>::Cost},
       {"TruncateMod", Eigen::internal::functor_traits<
                           Eigen::internal::scalar_mod_op<float>>::Cost}};
+
+  // By default, use sum of memory_time and compute_time for execution_time.
+  compute_memory_overlap_ = false;
 }
 
 Costs OpLevelCostEstimator::PredictCosts(const OpInfo& op_features) const {
@@ -314,6 +331,8 @@ std::pair<double, double> OpLevelCostEstimator::GetDeviceInfo(
       bandwidth = 100;
     }
   }
+  VLOG(1) << "Device: " << device.type() << " GFLOPS: " << gflops
+          << " Bandwidth: " << bandwidth;
 
   return std::make_pair(gflops, bandwidth);
 }
@@ -379,7 +398,11 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
   Costs costs;
   costs.compute_time = compute_cost;
   costs.memory_time = memory_cost;
-  costs.execution_time = compute_cost + memory_cost;
+  if (compute_memory_overlap_) {
+    costs.execution_time = std::max(compute_cost, memory_cost);
+  } else {
+    costs.execution_time = compute_cost + memory_cost;
+  }
   costs.inaccurate = found_unknown_shapes;
   return costs;
 }
@@ -461,7 +484,7 @@ int64 OpLevelCostEstimator::CountConv2DOperations(
   ops *= conv_dims.kx * conv_dims.ky;
   ops *= conv_dims.iz * conv_dims.oz;
   ops *= kOpsPerMac;
-  VLOG(1) << "Operations for Conv2D" << ops;
+  VLOG(1) << "Operations for Conv2D " << ops;
 
   if (conv_info != nullptr) {
     *conv_info = conv_dims;
@@ -679,7 +702,7 @@ int64 OpLevelCostEstimator::CountConv2DBackPropInputOperations(
   ops *= conv_dims.iz * conv_dims.oz;
   ops *= kOpsPerMac;
 
-  VLOG(1) << "Operations for Conv2DBackPropInput" << ops;
+  VLOG(1) << "Operations for Conv2DBackPropInput " << ops;
 
   if (returned_conv_dims != nullptr) {
     *returned_conv_dims = conv_dims;
@@ -842,5 +865,17 @@ Costs OpLevelCostEstimator::PredictBatchMatMul(
   return costs;
 }
 
+Costs OpLevelCostEstimator::PredictMetadata(const OpInfo& op_features) const {
+  Costs costs;
+  costs.max_memory = CalculateOutputSize(op_features, &costs.inaccurate);
+  // Metadata operations are so cheap we assume they take the minimum amount of
+  // time we can represent (1 ns).
+  costs.execution_time = 1;
+  costs.compute_time = 1;
+  costs.memory_time = 0;
+
+  return costs;
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index 28d49a77037ab841da5d67ff384e60156c075d36..36ef6a5c6167373932e0da51b9d0ce8c0018e766 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -129,6 +129,7 @@ class OpLevelCostEstimator {
   Costs PredictMatMul(const OpInfo& op_features) const;
   Costs PredictNoOp(const OpInfo& op_features) const;
   Costs PredictBatchMatMul(const OpInfo& op_features) const;
+  Costs PredictMetadata(const OpInfo& op_features) const;
 
   // Utility function for safe division. Returns 0
   // if rhs is 0 or negative.
@@ -149,6 +150,9 @@ class OpLevelCostEstimator {
   std::map<string, int> elementwise_ops_;
   typedef std::function<Costs(const OpInfo& op_feature)> CostImpl;
   std::map<string, CostImpl> device_cost_impl_;
+  // If true, assume compute and memory overlap; hence, the op cost is max of
+  // compute_time and memory_time, insteaf of sum of those two.
+  bool compute_memory_overlap_;
 
  private:
   friend class OpLevelCostEstimatorTest;
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index 1f0e02c16050c92f9006972952514e7c85d617cc..0cbfb10017528e438db5eef349c46bb5df64446f 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -157,6 +157,10 @@ class OpLevelCostEstimatorTest : public ::testing::Test {
                                                  found_unknown_shapes);
   }
 
+  void SetComputeMemoryOverlap(bool value) {
+    estimator_.compute_memory_overlap_ = value;
+  }
+
   OpLevelCostEstimator estimator_;
 };
 
@@ -168,6 +172,16 @@ TEST_F(OpLevelCostEstimatorTest, DummyExecutionTime) {
   EXPECT_TRUE(cost.inaccurate);
 }
 
+TEST_F(OpLevelCostEstimatorTest, ExecutionTimeSumOrMax) {
+  SetComputeMemoryOverlap(true);
+  auto cost = PredictCosts(DescribeOp("Dummy", 1000, 1));
+  EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(200), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(2000), cost.execution_time);  // max(2000, 200)
+  EXPECT_TRUE(cost.inaccurate);
+  SetComputeMemoryOverlap(false);  // Set it back to default.
+}
+
 TEST_F(OpLevelCostEstimatorTest, MulExecutionTime) {
   auto cost = PredictCosts(DescribeOp("Mul", 1000, 1));
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index 2fbd54d7591879de2e5a76706c23631c5f66d15e..4135d9b3313d31896cee07d6ec8e64dd6c310e07 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/graph/graph.h"
diff --git a/tensorflow/core/grappler/costs/virtual_placer.cc b/tensorflow/core/grappler/costs/virtual_placer.cc
index 0291bd04909dbeeed0ac9d5cea2749d71ee046f2..64e2778fc9d72b7579a5f9a33a55d591c7bcc42b 100644
--- a/tensorflow/core/grappler/costs/virtual_placer.cc
+++ b/tensorflow/core/grappler/costs/virtual_placer.cc
@@ -36,17 +36,19 @@ VirtualPlacer::VirtualPlacer(const Cluster* cluster) {
 
   } else {
     default_device_ = devices_.begin()->first;
+    VLOG(1) << "Number of devices: " << devices_.size();
     for (const auto& device : devices_) {
       if (str_util::Lowercase(device.first).find("gpu") != string::npos) {
         default_device_ = device.first;
+        break;
       }
-      break;
     }
   }
 }
 
 const DeviceProperties& VirtualPlacer::get_device(const NodeDef& node) const {
   string device = get_canonical_device_name(node);
+  VLOG(3) << "Device name: " << device;
   auto it = devices_.find(device);
   DCHECK(it != devices_.end());
   return it->second;
@@ -75,6 +77,9 @@ string VirtualPlacer::get_canonical_device_name(const NodeDef& node) const {
     if (!parsed) {
       return get_default_device_name();
     } else {
+      if (parsed_name.job.empty()) {
+        parsed_name.job = "localhost";
+      }
       device = strings::StrCat(
           "/job:", parsed_name.job, "/replica:", parsed_name.replica,
           "/task:", parsed_name.task, "/",
diff --git a/tensorflow/core/grappler/costs/virtual_placer_test.cc b/tensorflow/core/grappler/costs/virtual_placer_test.cc
index 31592cd822fa7d586b9eeeb09df6d9f565c5d7b3..65a03fb55753500bab6d41ed9fe0a6908f26a0da 100644
--- a/tensorflow/core/grappler/costs/virtual_placer_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_placer_test.cc
@@ -51,6 +51,29 @@ TEST(VirtualPlacerTest, LocalDevices) {
             placer.get_canonical_device_name(node));
 }
 
+TEST(VirtualPlacerTest, EmptyJobBecomesLocalhost) {
+  // Virtual placer should use "localhost" if device is empty.
+  // First create a cluster with only localhost devices.
+  std::unordered_map<string, DeviceProperties> devices;
+  DeviceProperties cpu_device;
+  cpu_device.set_type("CPU");
+  devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
+  DeviceProperties gpu_device;
+  gpu_device.set_type("GPU");
+  devices["/job:localhost/replica:0/task:0/gpu:0"] = gpu_device;
+  VirtualCluster cluster(devices);
+  VirtualPlacer placer(&cluster);
+
+  NodeDef node;
+  node.set_op("Conv2D");
+  node.set_device("/device:CPU:0");
+  EXPECT_EQ("/job:localhost/replica:0/task:0/cpu:0",
+            placer.get_canonical_device_name(node));
+  node.set_device("/device:GPU:0");
+  EXPECT_EQ("/job:localhost/replica:0/task:0/gpu:0",
+            placer.get_canonical_device_name(node));
+}
+
 TEST(VirtualPlacerTest, FallBackUnknown) {
   // Virtual placer falls back to "UNKNOWN" only if there are no devices in the
   // cluster.
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index c68d4e31c465e32b2e3f244d10a9d694621d3e8c..6b0b869df5979661048f6f5493fc0c2ab2c3445b 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -19,6 +19,9 @@ limitations under the License.
 
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_description.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/clusters/utils.h"
 #include "tensorflow/core/grappler/costs/utils.h"
 #include "tensorflow/core/grappler/op_types.h"
@@ -53,12 +56,40 @@ Costs CombineCosts(const Costs& left, const Costs& right) {
           << " max_per_op_streaming=" << result.max_per_op_streaming;
   return result;
 }
+
+// Key to the cached _Recv ops map, and its hash and predicate structures.
+struct RecvNodeDescriptor {
+  const NodeDef* node;
+  const int port_num;
+  const string& device;
+
+  RecvNodeDescriptor(const NodeDef* node_, const int port_num_,
+                     const string& device_)
+      : node(node_), port_num(port_num_), device(device_) {}
+};
+
+struct RecvNodeDescritorHash {
+  std::size_t operator()(const RecvNodeDescriptor& recv_node) const {
+    return std::hash<const NodeDef*>()(recv_node.node) ^
+           std::hash<int>()(recv_node.port_num) ^
+           std::hash<string>()(recv_node.device);
+  }
+};
+
+struct RecvNodeDescriptorEqual {
+  bool operator()(const RecvNodeDescriptor& a,
+                  const RecvNodeDescriptor& b) const {
+    return a.node == b.node && a.port_num == b.port_num && a.device == b.device;
+  }
+};
 }  // namespace
 
 VirtualScheduler::VirtualScheduler(const GrapplerItem* grappler_item,
                                    const bool use_static_shapes,
                                    Cluster* cluster)
-    :  // TODO(dyoon): Use a better way than FIFO.
+    :  // Allow LIFO as well as FIFO. LIFO allows an output node of an node to
+       // follow it in execution, saving addition memory time from having to
+       // write and read. For default cases, use FIFO for performance.
       ready_nodes_(new FIFOManager()),
       graph_costs_(Costs::ZeroCosts()),
       graph_properties_(*grappler_item),
@@ -104,6 +135,11 @@ Status VirtualScheduler::Init() {
     name_to_node[node->name()] = node;
   }
 
+  // To reuse _Recv ops.
+  std::unordered_map<RecvNodeDescriptor, const NodeDef*, RecvNodeDescritorHash,
+                     RecvNodeDescriptorEqual>
+      cached_recv_nodes;
+
   // Build node_map; for each node, create its NodeState and connect its inputs
   // and outputs.
   for (const auto* curr_node : nodes) {
@@ -126,12 +162,13 @@ Status VirtualScheduler::Init() {
         auto& input_node_state = GetNodeStateOrCreateIt(input_node);
         input_node_state.outputs[input_node_port_num].push_back(curr_node);
       } else {
-        if (cached_recv_nodes_.count(input_node) > 0 &&
-            cached_recv_nodes_[input_node].count(curr_node_device) > 0) {
+        RecvNodeDescriptor recv_node(input_node, input_node_port_num,
+                                     curr_node_device);
+        auto it = cached_recv_nodes.find(recv_node);
+        if (it != cached_recv_nodes.end()) {
           // Different device, but found an already-cached copy (a _Recv op);
           // connect the _Recv to curr_node.
-          const auto* recv_op =
-              cached_recv_nodes_[input_node][curr_node_device];
+          const NodeDef* recv_op = it->second;
           // recv_op's output port is hard-coded to zero.
           curr_node_state.inputs.push_back(std::make_pair(recv_op, 0));
           auto& input_node_state = node_map_.at(recv_op);
@@ -151,7 +188,7 @@ Status VirtualScheduler::Init() {
           input_node_state.outputs[input_node_port_num].push_back(send);
 
           // Cache the _Recv op for future use.
-          cached_recv_nodes_[input_node][curr_node_device] = recv;
+          cached_recv_nodes[recv_node] = recv;
         }
       }
     }
@@ -244,8 +281,8 @@ string VirtualScheduler::DeviceName(const NodeDef* node) const {
 string VirtualScheduler::ChannelDeviceName(const NodeDef* from,
                                            const NodeDef* to) const {
   CHECK(!initialized_) << "ChannelDeviceName is called after Init().";
-
-  return kChannelDevice + ": " + DeviceName(from) + " to " + DeviceName(to);
+  return kChannelDevice + ": from " + DeviceName(from) + " to " +
+         DeviceName(to);
 }
 
 std::pair<const NodeDef*, const NodeDef*> VirtualScheduler::CreateSendRecv(
@@ -264,10 +301,16 @@ std::pair<const NodeDef*, const NodeDef*> VirtualScheduler::CreateSendRecv(
   // input names, attrs, etc.
 
   auto input_node_port_num = NodePosition(input_name);
+  string src_name;
+  if (input_node_port_num >= 0) {
+    src_name = strings::StrCat(from->name(), ":", input_node_port_num);
+  } else {
+    src_name = strings::StrCat(from->name(), ":minus1");
+  }
 
   // _Send op.
   auto* send = new NodeDef();
-  send->set_name("Send " + from->name() + " from " + DeviceName(from) + " to " +
+  send->set_name("Send " + src_name + " from " + DeviceName(from) + " to " +
                  DeviceName(to));
   send->set_op("_Send");
   send->add_input(from->name());
@@ -279,7 +322,7 @@ std::pair<const NodeDef*, const NodeDef*> VirtualScheduler::CreateSendRecv(
 
   // _Recv op.
   auto* recv = new NodeDef();
-  recv->set_name("Recv " + from->name() + " on " + DeviceName(to));
+  recv->set_name("Recv " + src_name + " on " + DeviceName(to));
   recv->set_op("_Recv");
   recv->add_input(send->name());
   recv->set_device(DeviceName(to));
@@ -318,8 +361,8 @@ NodeInfo VirtualScheduler::GetCurrNodeInfo() const {
   }
 
   // Construct NodeInfo.
-  const auto& node_state = node_map_.at(node);
   NodeInfo node_info;
+  const auto& node_state = node_map_.at(node);
   node_info.name = node->name();
   node_info.device_name = node_state.device_name;
   auto& op_info = node_info.op_info;
@@ -360,7 +403,7 @@ NodeState& VirtualScheduler::GetNodeStateOrCreateIt(const NodeDef* node) {
     // Initialize output port related data:
     // Assume the size of OutputProperties represents the number of output ports
     // of this node.
-    for (int i = 0; i < node_state.output_properties.size(); ++i) {
+    for (size_t i = 0; i < node_state.output_properties.size(); ++i) {
       node_state.time_no_references[i] = Costs::Duration::max();
       node_state.num_outputs_executed[i] = 0;
       // Populate an empty vector for each port. The caller will add nodes
@@ -479,7 +522,11 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
     for (auto* output_node : port_num_output_pair.second) {
       auto& output_state = node_map_[output_node];
       output_state.num_inputs_ready++;
-      if (output_state.num_inputs_ready == output_state.inputs.size()) {
+      // Execute a node as soon as all its inputs are ready. Merge nodes are
+      // special since they run as soon as one of their inputs becomes
+      // available.
+      if (output_state.num_inputs_ready == output_state.inputs.size() ||
+          IsMerge(*output_node)) {
         // This output node is now ready.
         output_state.time_ready = curr_time;
         ready_nodes_->AddNode(output_node);
@@ -577,6 +624,7 @@ Costs VirtualScheduler::Summary() const {
             << " GB, at the end: " << state.memory_usage << " B";
 
     VLOG(1) << "Per-op execution time (and memory usage at peak memory usage):";
+
     // Profile non-persistent op memory usage.
     for (const auto& node_port : state.mem_usage_snapshot_at_peak) {
       const auto* node = node_port.first;
@@ -617,5 +665,42 @@ Costs VirtualScheduler::Summary() const {
   return critical_path_costs;
 }
 
+Costs VirtualScheduler::Summary(RunMetadata* metadata) {
+  if (metadata != nullptr) {
+    StepStats* stepstats = metadata->mutable_step_stats();
+    for (const auto& device : device_) {
+      GraphDef* device_partition_graph =
+          metadata->mutable_partition_graphs()->Add();
+      DeviceStepStats* device_stepstats = stepstats->add_dev_stats();
+      device_stepstats->set_device(device.first);
+      for (const auto& node_def : device.second.nodes_executed) {
+        const NodeState& nodestate = node_map_.at(node_def);
+        NodeExecStats* node_stats = device_stepstats->add_node_stats();
+        for (int slot = 0; slot < nodestate.output_properties.size(); slot++) {
+          const auto& properties = nodestate.output_properties[slot];
+          NodeOutput* no = node_stats->add_output();
+          no->set_slot(slot);
+          TensorDescription* tensor_descr = no->mutable_tensor_description();
+          tensor_descr->set_dtype(properties.dtype());
+          *tensor_descr->mutable_shape() = properties.shape();
+        }
+        node_stats->set_timeline_label(node_def->op());
+        node_stats->set_node_name(node_def->name());
+        node_stats->set_op_start_rel_micros(0);
+        node_stats->set_all_start_micros(
+            nodestate.time_scheduled.asMicroSeconds().count());
+        node_stats->set_op_end_rel_micros(
+            nodestate.time_finished.asMicroSeconds().count() -
+            nodestate.time_scheduled.asMicroSeconds().count());
+        node_stats->set_all_end_rel_micros(
+            nodestate.time_finished.asMicroSeconds().count() -
+            nodestate.time_scheduled.asMicroSeconds().count());
+        *device_partition_graph->mutable_node()->Add() = *node_def;
+      }
+    }
+  }
+  return Summary();
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 472ba90f7c50e3462a50f2ac0ecb4f5809f1fafc..e9abecb12234f780a9fa053301472a68fa06a59b 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -19,7 +19,10 @@ limitations under the License.
 #include <list>
 #include <memory>
 #include <unordered_map>
+#include <unordered_set>
 
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/grappler/costs/cost_estimator.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/costs/virtual_placer.h"
@@ -80,16 +83,27 @@ struct DeviceState {
   // Nodes executed on this device in execution order.
   std::vector<const NodeDef*> nodes_executed;
 
+  struct NodePairHash {
+   public:
+    const std::size_t operator()(
+        const std::pair<const NodeDef*, int>& element) const {
+      return std::hash<const NodeDef*>()(element.first);
+    }
+  };
+
   // Nodes currently allocated in memory: set of NodeDef* and port_num pairs
   // so that we can track which output of the node is in memory.
-  std::set<std::pair<const NodeDef*, int>> nodes_in_memory;
+  std::unordered_set<std::pair<const NodeDef*, int>, NodePairHash>
+      nodes_in_memory;
 
   // Nodes allocated in memory persistently: e.g., Variables.
-  std::set<std::pair<const NodeDef*, int>> persistent_nodes;
+  std::unordered_set<std::pair<const NodeDef*, int>, NodePairHash>
+      persistent_nodes;
 
   // Snapshot of nodes_in_memory, when memory usage is at peak.
   // Same to nodes_in_memory, it's a set of NodeDef* and port_num pairs.
-  std::set<std::pair<const NodeDef*, int>> mem_usage_snapshot_at_peak;
+  std::unordered_set<std::pair<const NodeDef*, int>, NodePairHash>
+      mem_usage_snapshot_at_peak;
 
   Costs device_costs;
   std::map<string, Costs> op_to_cost;    // Per-op cost.
@@ -113,7 +127,7 @@ class ReadyNodeManager {
   ReadyNodeManager() {}
   virtual ~ReadyNodeManager() {}
   virtual void AddNode(const NodeDef* node) = 0;
-  virtual const NodeDef* GetCurrNode() const = 0;
+  virtual const NodeDef* GetCurrNode() = 0;
   virtual void RemoveCurrNode() = 0;
   virtual bool Empty() const = 0;
 };
@@ -123,7 +137,7 @@ class FIFOManager : public ReadyNodeManager {
   FIFOManager() : ReadyNodeManager() {}
   ~FIFOManager() override {}
   void AddNode(const NodeDef* node) override { nodes_.push_back(node); }
-  const NodeDef* GetCurrNode() const override { return nodes_.front(); }
+  const NodeDef* GetCurrNode() override { return nodes_.front(); }
   void RemoveCurrNode() override { nodes_.pop_front(); }
   bool Empty() const override { return nodes_.empty(); }
 
@@ -131,6 +145,40 @@ class FIFOManager : public ReadyNodeManager {
   std::list<const NodeDef*> nodes_;
 };
 
+// The LIFOManager schedules nodes by returning the last one added to the
+// scheduler. A node is executed and then its ready outputs are newly added to
+// the scheduler, so the LIFOManager will return outputs to a node following
+// that node's execution.
+class LIFOManager : public ReadyNodeManager {
+ public:
+  LIFOManager() : ReadyNodeManager() {}
+  ~LIFOManager() override {}
+  void AddNode(const NodeDef* node) override { nodes_.push_back(node); }
+  const NodeDef* GetCurrNode() override {
+    curr_pos_ = nodes_.end();
+    curr_pos_--;
+    return nodes_.back();
+  }
+  void RemoveCurrNode() override {
+    if (curr_pos_ != nodes_.end()) {
+      nodes_.erase(curr_pos_);
+    } else if (!nodes_.empty()) {
+      nodes_.pop_back();
+    }
+    curr_pos_ = nodes_.end();
+    curr_pos_--;
+  }
+  bool Empty() const override { return nodes_.empty(); }
+
+ private:
+  std::list<const NodeDef*> nodes_;
+  // Keep track of the current node being executed by saving its position.
+  // Necessary because nodes may be added to the end of the list while a node is
+  // executing, and we want to remove the correct node (the one that is
+  // executing) rather than the new ones being added.
+  std::list<const NodeDef*>::iterator curr_pos_ = nodes_.end();
+};
+
 // A wrapper struct to OpInfo proto.
 // TODO(dyoon): once we extend OpInfo or implement a better interface, and  then
 // delete this wrapper struct.
@@ -158,6 +206,9 @@ class VirtualScheduler {
 
   // Prints out summary of execution (timing, memory usage, etc.)
   Costs Summary() const;
+  // Like the above, but writes detailed stats to RunMetadata.
+  // If metadata is nullptr, then just calls and return Summary().
+  Costs Summary(RunMetadata* metadata);
 
  protected:
   // GetDeviceStates and GetNodeStates are currently for testing purpuse only.
@@ -203,9 +254,6 @@ class VirtualScheduler {
 
   // Pool of NodeDefs for SendRecv and Identity ops created.
   std::vector<std::unique_ptr<NodeDef>> additional_nodes_;
-  // Cache of nodes transferred to another device.
-  std::unordered_map<const NodeDef*, std::unordered_map<string, const NodeDef*>>
-      cached_recv_nodes_;
 
   // Stats:
   std::map<string, int> op_counts_;  // Op counts with key with input shape.
@@ -216,6 +264,7 @@ class VirtualScheduler {
   // Auxilliary data structures for constructing NodeState and DeviceState.
   GraphProperties graph_properties_;
   Cluster* cluster_;                   // Not owned.
+
   const GrapplerItem* grappler_item_;  // Not owned.
   bool use_static_shapes_;
   bool initialized_;
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
index 9e48c411dc0c9e13f823ab500d1cdd00950f1258..a4f72b0f97c4baf2e3b1321d5e13dbe7dfce57b8 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
@@ -14,8 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/costs/virtual_scheduler.h"
-
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/tensor_description.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/costs/virtual_placer.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -35,41 +36,66 @@ class TestVirtualScheduler : public VirtualScheduler {
   FRIEND_TEST(VirtualSchedulerTest, ControlDependency);
   FRIEND_TEST(VirtualSchedulerTest, ComplexDependency);
   FRIEND_TEST(VirtualSchedulerTest, Variable);
+  FRIEND_TEST(VirtualSchedulerTest, InterDeviceTransfer);
 };
 
 class VirtualSchedulerTest : public ::testing::Test {
  protected:
+  NodeDef node1_, node2_, node3_, node4_, node5_, node6_;
+
   const string kCPU0 = "/job:localhost/replica:0/task:0/cpu:0";
+  const string kCPU1 = "/job:localhost/replica:0/task:0/cpu:1";
+
+  DeviceProperties GetDummyCPUDevice() {
+    // Create CPU with 2 cores, 4 Ghz freq, 2 GB/s mem bandwidth.
+    // - 8 Gflops
+    // - 2 GB/s
+    DeviceProperties cpu_device;
+    cpu_device.set_type("CPU");
+    cpu_device.set_frequency(4000);
+    cpu_device.set_num_cores(2);
+    cpu_device.set_bandwidth(2000000);
+    return cpu_device;
+  }
 
   void SetUp() override {
+    // Initializes nodes for manager
+    node1_.set_name("Node1");
+    node2_.set_name("Node2");
+    node3_.set_name("Node3");
+    node4_.set_name("Node4");
+    node5_.set_name("Node5");
+    node6_.set_name("Node6");
+
     // Initializes cluster_ and placer_.
     std::unordered_map<string, DeviceProperties> devices;
-    DeviceProperties cpu_device;
-    cpu_device.set_type("CPU");
-    devices[kCPU0] = cpu_device;
 
+    // Set some dummy CPU properties
+    DeviceProperties cpu_device = GetDummyCPUDevice();
+
+    // IMPORTANT: Device is not actually ever used in the test case since
+    // force_cpu_type is defaulted to "Haswell"
+    devices[kCPU0] = cpu_device;
+    devices[kCPU1] = cpu_device;
     cluster_.reset(new VirtualCluster(devices));
     placer_.reset(new VirtualPlacer(cluster_.get()));
   }
 
   // Three Conv2Ds with only two in fetch nodes.
   void CreateGrapplerItemWithConv2Ds() {
-    tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0);
-    auto x = tensorflow::ops::RandomUniform(
+    Scope s = Scope::NewRootScope().WithDevice(kCPU0);
+    auto x = ops::RandomUniform(
         s.WithOpName("x"), {batch_size_, width_, height_, depth_in_}, DT_FLOAT);
-    auto y = tensorflow::ops::RandomUniform(
+    auto y = ops::RandomUniform(
         s.WithOpName("y"), {batch_size_, width_, height_, depth_in_}, DT_FLOAT);
-    auto z = tensorflow::ops::RandomUniform(
+    auto z = ops::RandomUniform(
         s.WithOpName("z"), {batch_size_, width_, height_, depth_in_}, DT_FLOAT);
-    auto f = tensorflow::ops::RandomUniform(
+    auto f = ops::RandomUniform(
         s.WithOpName("f"), {kernel_, kernel_, depth_in_, depth_out_}, DT_FLOAT);
     std::vector<int> strides = {1, 1, 1, 1};
-    auto c0 =
-        tensorflow::ops::Conv2D(s.WithOpName("c0"), x, f, strides, "SAME");
-    auto c1 =
-        tensorflow::ops::Conv2D(s.WithOpName("c1"), y, f, strides, "SAME");
-    auto c2 =
-        tensorflow::ops::Conv2D(s.WithOpName("c2"), z, f, strides, "SAME");
+    auto c0 = ops::Conv2D(s.WithOpName("c0"), x, f, strides, "SAME");
+    auto c1 = ops::Conv2D(s.WithOpName("c1"), y, f, strides, "SAME");
+    auto c2 = ops::Conv2D(s.WithOpName("c2"), z, f, strides, "SAME");
     GraphDef def;
     TF_CHECK_OK(s.ToGraphDef(&def));
 
@@ -84,13 +110,13 @@ class VirtualSchedulerTest : public ::testing::Test {
 
   // A Conv2D with a variable.
   void CreateGrapplerItemWithConv2DAndVariable() {
-    tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0);
-    auto x = tensorflow::ops::RandomUniform(
+    Scope s = Scope::NewRootScope().WithDevice(kCPU0);
+    auto x = ops::RandomUniform(
         s.WithOpName("x"), {batch_size_, width_, height_, depth_in_}, DT_FLOAT);
-    auto f = tensorflow::ops::Variable(
-        s.WithOpName("f"), {kernel_, kernel_, depth_in_, depth_out_}, DT_FLOAT);
+    auto f = ops::Variable(s.WithOpName("f"),
+                           {kernel_, kernel_, depth_in_, depth_out_}, DT_FLOAT);
     std::vector<int> strides = {1, 1, 1, 1};
-    auto y = tensorflow::ops::Conv2D(s.WithOpName("y"), x, f, strides, "SAME");
+    auto y = ops::Conv2D(s.WithOpName("y"), x, f, strides, "SAME");
     GraphDef def;
     TF_CHECK_OK(s.ToGraphDef(&def));
 
@@ -102,19 +128,48 @@ class VirtualSchedulerTest : public ::testing::Test {
     dependency_["y"] = {"x", "f"};
   }
 
+  void CreateGrapplerItemWithMatmulChain() {
+    Scope s = Scope::NewRootScope().WithDevice(kCPU0);
+    // Add control dependencies to ensure tests do not rely on specific
+    // manager and the order remains consistent for the test.
+    auto a = ops::RandomUniform(s.WithOpName("a"), {3200, 3200}, DT_FLOAT);
+    auto b = ops::RandomUniform(s.WithOpName("b").WithControlDependencies(a),
+                                {3200, 3200}, DT_FLOAT);
+    auto c = ops::RandomUniform(s.WithOpName("c").WithControlDependencies(b),
+                                {3200, 3200}, DT_FLOAT);
+    auto d = ops::RandomUniform(s.WithOpName("d").WithControlDependencies(c),
+                                {3200, 3200}, DT_FLOAT);
+    auto e = ops::RandomUniform(s.WithOpName("e").WithControlDependencies(d),
+                                {3200, 3200}, DT_FLOAT);
+
+    auto ab = ops::MatMul(s.WithOpName("ab").WithControlDependencies(e), a, b);
+    auto abc = ops::MatMul(s.WithOpName("abc"), ab, c);
+    auto abcd = ops::MatMul(s.WithOpName("abcd"), abc, d);
+    auto abcde = ops::MatMul(s.WithOpName("abcde"), abcd, e);
+
+    GraphDef def;
+    TF_CHECK_OK(s.ToGraphDef(&def));
+
+    grappler_item_.reset(new GrapplerItem);
+    grappler_item_->id = "test_matmul_sequence_graph";
+    grappler_item_->graph = def;
+    grappler_item_->fetch = {"abcde"};
+
+    dependency_["ab"] = {"a", "b"};
+    dependency_["abc"] = {"ab", "c"};
+    dependency_["abcd"] = {"abc", "d"};
+    dependency_["abcde"] = {"abcd", "e"};
+  }
+
   // AddN that takes 4 tensors with 10x10x10x10.
   void CreateGrapplerItemWithAddN() {
-    tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0);
-    auto x = tensorflow::ops::RandomUniform(s.WithOpName("x"), {10, 10, 10, 10},
-                                            DT_FLOAT);
-    auto y = tensorflow::ops::RandomUniform(s.WithOpName("y"), {10, 10, 10, 10},
-                                            DT_FLOAT);
-    auto z = tensorflow::ops::RandomUniform(s.WithOpName("z"), {10, 10, 10, 10},
-                                            DT_FLOAT);
-    auto w = tensorflow::ops::RandomUniform(s.WithOpName("w"), {10, 10, 10, 10},
-                                            DT_FLOAT);
-    tensorflow::OutputList input_tensors = {x, y, z, w};
-    auto out = tensorflow::ops::AddN(s.WithOpName("out"), input_tensors);
+    Scope s = Scope::NewRootScope().WithDevice(kCPU0);
+    auto x = ops::RandomUniform(s.WithOpName("x"), {10, 10, 10, 10}, DT_FLOAT);
+    auto y = ops::RandomUniform(s.WithOpName("y"), {10, 10, 10, 10}, DT_FLOAT);
+    auto z = ops::RandomUniform(s.WithOpName("z"), {10, 10, 10, 10}, DT_FLOAT);
+    auto w = ops::RandomUniform(s.WithOpName("w"), {10, 10, 10, 10}, DT_FLOAT);
+    OutputList input_tensors = {x, y, z, w};
+    auto out = ops::AddN(s.WithOpName("out"), input_tensors);
     GraphDef def;
     TF_CHECK_OK(s.ToGraphDef(&def));
 
@@ -128,15 +183,15 @@ class VirtualSchedulerTest : public ::testing::Test {
 
   // NoOp that takes 7 NoOps as control dependency.
   void CreateGrapplerItemWithControlDependency() {
-    tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0);
+    Scope s = Scope::NewRootScope().WithDevice(kCPU0);
     std::vector<string> input_noop_names = {"x", "y", "z", "w", "u", "v", "t"};
-    std::vector<tensorflow::Operation> input_tensors;
+    std::vector<Operation> input_tensors;
     for (const auto& input : input_noop_names) {
-      auto x = tensorflow::ops::NoOp(s.WithOpName(input));
+      auto x = ops::NoOp(s.WithOpName(input));
       input_tensors.push_back(x.operation);
     }
-    auto out = tensorflow::ops::NoOp(
-        s.WithControlDependencies(input_tensors).WithOpName("out"));
+    auto out =
+        ops::NoOp(s.WithControlDependencies(input_tensors).WithOpName("out"));
     GraphDef def;
     TF_CHECK_OK(s.ToGraphDef(&def));
 
@@ -151,33 +206,33 @@ class VirtualSchedulerTest : public ::testing::Test {
   // FusedBN [an op with multiple outputs] with multiple consumers (including
   // control dependency).
   void CreateGrapplerItemWithBatchNorm() {
-    tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0);
-    auto x = tensorflow::ops::RandomUniform(
+    Scope s = Scope::NewRootScope().WithDevice(kCPU0);
+    auto x = ops::RandomUniform(
         s.WithOpName("x"), {batch_size_, width_, height_, depth_in_}, DT_FLOAT);
-    auto scale = tensorflow::ops::RandomUniform(s.WithOpName("scale"),
-                                                {depth_in_}, DT_FLOAT);
-    auto offset = tensorflow::ops::RandomUniform(s.WithOpName("offset"),
-                                                 {depth_in_}, DT_FLOAT);
-    auto mean =
-        tensorflow::ops::RandomUniform(s.WithOpName("mean"), {0}, DT_FLOAT);
-    auto var =
-        tensorflow::ops::RandomUniform(s.WithOpName("var"), {0}, DT_FLOAT);
-
-    auto batch_norm = tensorflow::ops::FusedBatchNorm(
+    auto scale =
+        ops::RandomUniform(s.WithOpName("scale"), {depth_in_}, DT_FLOAT);
+    auto offset =
+        ops::RandomUniform(s.WithOpName("offset"), {depth_in_}, DT_FLOAT);
+    auto mean = ops::RandomUniform(s.WithOpName("mean"), {0}, DT_FLOAT);
+    auto var = ops::RandomUniform(s.WithOpName("var"), {0}, DT_FLOAT);
+
+    auto batch_norm = ops::FusedBatchNorm(
         s.WithOpName("bn"), x, scale, offset, mean, var,
         ops::FusedBatchNorm::IsTraining(true).Epsilon(0.1f));
     auto y = batch_norm.y;
     auto batch_mean = batch_norm.batch_mean;
     auto batch_var = batch_norm.batch_variance;
 
-    auto z1 = tensorflow::ops::Add(s.WithOpName("z1"), x, y);
-    auto z2 = tensorflow::ops::Add(s.WithOpName("z2"), batch_var, batch_var);
-    auto z3 = tensorflow::ops::Add(s.WithOpName("z3"), batch_var, batch_var);
-    std::vector<tensorflow::Operation> input_tensors = {
-        batch_mean.op(), z1.z.op(), z2.z.op(), z3.z.op(),
+    auto z1 = ops::Add(s.WithOpName("z1"), x, y);
+    auto z2 = ops::Add(s.WithOpName("z2"), batch_var, batch_var);
+    auto z3 = ops::Add(s.WithOpName("z3"), batch_var, batch_var);
+    std::vector<Operation> input_tensors = {
+        batch_mean.op(),
+        z1.z.op(),
+        z2.z.op(),
+        z3.z.op(),
     };
-    auto z4 = tensorflow::ops::NoOp(
-        s.WithControlDependencies(batch_var).WithOpName("z4"));
+    auto z4 = ops::NoOp(s.WithControlDependencies(batch_var).WithOpName("z4"));
 
     GraphDef def;
     TF_CHECK_OK(s.ToGraphDef(&def));
@@ -194,6 +249,452 @@ class VirtualSchedulerTest : public ::testing::Test {
     dependency_["z4"] = {"bn"};
   }
 
+  // A simple while loop
+  void CreateGrapplerItemWithLoop() {
+    // Test graph produced in python using:
+    /*
+      with tf.Graph().as_default():
+      i0 = tf.constant(0)
+      m0 = tf.ones([2, 2])
+      c = lambda i, m: i < 10
+      b = lambda i, m: [i+1, tf.concat([m, m], axis=0)]
+      r = tf.while_loop(
+      c, b, loop_vars=[i0, m0],
+      shape_invariants=[i0.get_shape(), tf.TensorShape([None, 2])])
+      with open('/tmp/graph.pbtxt', 'w') as f:
+      f.write(str(tf.get_default_graph().as_graph_def()))
+    */
+    const string gdef_ascii = R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "ones"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 2
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "while/Enter"
+  op: "Enter"
+  input: "Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Enter_1"
+  op: "Enter"
+  input: "ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Merge"
+  op: "Merge"
+  input: "while/Enter"
+  input: "while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Merge_1"
+  op: "Merge"
+  input: "while/Enter_1"
+  input: "while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/Less/y"
+  op: "Const"
+  input: "^while/Merge"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "while/Less"
+  op: "Less"
+  input: "while/Merge"
+  input: "while/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/LoopCond"
+  op: "LoopCond"
+  input: "while/Less"
+}
+node {
+  name: "while/Switch"
+  op: "Switch"
+  input: "while/Merge"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge"
+      }
+    }
+  }
+}
+node {
+  name: "while/Switch_1"
+  op: "Switch"
+  input: "while/Merge_1"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge_1"
+      }
+    }
+  }
+}
+node {
+  name: "while/Identity"
+  op: "Identity"
+  input: "while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Identity_1"
+  op: "Identity"
+  input: "while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/add/y"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/add"
+  op: "Add"
+  input: "while/Identity"
+  input: "while/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/concat/axis"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/concat"
+  op: "ConcatV2"
+  input: "while/Identity_1"
+  input: "while/Identity_1"
+  input: "while/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration"
+  op: "NextIteration"
+  input: "while/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration_1"
+  op: "NextIteration"
+  input: "while/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/Exit"
+  op: "Exit"
+  input: "while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Exit_1"
+  op: "Exit"
+  input: "while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+versions {
+  producer: 21
+}
+  )EOF";
+
+    grappler_item_.reset(new GrapplerItem);
+    CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii,
+                                                &grappler_item_->graph));
+    grappler_item_->id = "test_graph";
+    grappler_item_->fetch = {"while/Exit", "while/Exit_1"};
+  }
+
+  void CreateGrapplerItemWithInterDeviceTransfers() {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0);
+
+    // Create a FusedBatchNorm op that has multiple output ports.
+    auto x = ops::RandomUniform(
+        s.WithOpName("x"), {batch_size_, width_, height_, depth_in_}, DT_FLOAT);
+    auto scale =
+        ops::RandomUniform(s.WithOpName("scale"), {depth_in_}, DT_FLOAT);
+    auto offset =
+        ops::RandomUniform(s.WithOpName("offset"), {depth_in_}, DT_FLOAT);
+    auto mean = ops::RandomUniform(s.WithOpName("mean"), {0}, DT_FLOAT);
+    auto var = ops::RandomUniform(s.WithOpName("var"), {0}, DT_FLOAT);
+
+    auto batch_norm = ops::FusedBatchNorm(
+        s.WithOpName("bn"), x, scale, offset, mean, var,
+        ops::FusedBatchNorm::IsTraining(true).Epsilon(0.1f));
+    auto y = batch_norm.y;
+    auto batch_mean = batch_norm.batch_mean;
+    auto batch_var = batch_norm.batch_variance;
+    // y1 and y2 take the same tensor, so there should be only 1 Send and Recv.
+    auto y1 = ops::Identity(s.WithOpName("y1").WithDevice(kCPU1), y);
+    auto y2 = ops::Identity(s.WithOpName("y2").WithDevice(kCPU1), y);
+    // batch_mean1 and batch_var1 take different output ports, so each will
+    // initiate Send/Recv.
+    auto batch_mean1 = ops::Identity(
+        s.WithOpName("batch_mean1").WithDevice(kCPU1), batch_mean);
+    auto batch_var1 =
+        ops::Identity(s.WithOpName("batch_var1").WithDevice(kCPU1), batch_var);
+    // This is control dependency.
+    auto control_dep = ops::NoOp(s.WithOpName("control_dep")
+                                     .WithControlDependencies(y)
+                                     .WithDevice(kCPU1));
+
+    GraphDef def;
+    TF_CHECK_OK(s.ToGraphDef(&def));
+
+    grappler_item_.reset(new GrapplerItem);
+    grappler_item_->id = "test_conv2d_graph";
+    grappler_item_->graph = def;
+    grappler_item_->fetch = {"y1", "y2", "batch_mean1", "batch_var1",
+                             "control_dep"};
+
+    dependency_["bn"] = {"x", "mean", "var"};
+    dependency_["y1"] = {"bn"};
+    dependency_["y2"] = {"bn"};
+    dependency_["batch_mean1"] = {"bn"};
+    dependency_["batch_var1"] = {"bn"};
+    dependency_["control_dep"] = {"bn"};
+  }
+
   // Call this after creating grappler_item_ and setting up dependency_.
   void InitScheduler() {
     scheduler_.reset(new TestVirtualScheduler(
@@ -201,6 +702,21 @@ class VirtualSchedulerTest : public ::testing::Test {
     TF_CHECK_OK(scheduler_->Init());
   }
 
+  // Returns cost based on op.
+  Costs SimplePredictCosts(const NodeInfo& info) const {
+    Costs c;
+    int64 exec_cost = 0;
+    if (info.op_info.op() == "MatMul") {
+      exec_cost = 2000000000;
+    } else if (info.op_info.op() == "RandomUniform") {
+      exec_cost = 1000000000;
+    } else {
+      exec_cost = 1000;
+    }
+    c.execution_time = Costs::NanoSeconds(exec_cost);
+    return c;
+  }
+
   // Call this after init scheduler_. Scheduler stops after executing
   // target_node.
   std::unordered_map<string, NodeInfo> RunScheduler(const string& target_node) {
@@ -211,6 +727,8 @@ class VirtualSchedulerTest : public ::testing::Test {
       NodeInfo node_info = scheduler_->GetCurrNodeInfo();
       ops_executed[node_info.name] = node_info;
 
+      Costs node_costs = SimplePredictCosts(node_info);
+
       // Check scheduling order.
       auto it = dependency_.find(node_info.name);
       if (it != dependency_.end()) {
@@ -218,7 +736,7 @@ class VirtualSchedulerTest : public ::testing::Test {
           EXPECT_GT(ops_executed.count(preceding_node), 0);
         }
       }
-      more_nodes = scheduler_->MarkCurrNodeExecuted(zero_costs);
+      more_nodes = scheduler_->MarkCurrNodeExecuted(node_costs);
 
       if (node_info.name == target_node) {
         // Scheduler has the state after executing the target node.
@@ -263,7 +781,8 @@ class VirtualSchedulerTest : public ::testing::Test {
   // Helper method tthat checks name - port pairs.
   void ValidateMemoryUsageSnapshot(
       const std::vector<string>& expected_names, const int port_num_expected,
-      const std::set<std::pair<const NodeDef*, int>>& mem_usage_snapshot) {
+      const std::unordered_set<std::pair<const NodeDef*, int>,
+                               DeviceState::NodePairHash>& mem_usage_snapshot) {
     std::set<std::pair<string, int>> nodes_at_peak_mem_usage;
     std::transform(
         mem_usage_snapshot.begin(), mem_usage_snapshot.end(),
@@ -311,6 +830,234 @@ class VirtualSchedulerTest : public ::testing::Test {
   const int depth_out_ = 16;
 };
 
+// Test that FIFOManager correctly returns the current node with only 1 node.
+TEST_F(VirtualSchedulerTest, GetSingleNodeFIFOManager) {
+  // Init.
+  FIFOManager manager = FIFOManager();
+
+  // Add the node to FIFOManager.
+  manager.AddNode(&node1_);
+  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
+}
+
+// Test that FIFOManager removes the only node contained within.
+TEST_F(VirtualSchedulerTest, RemoveSingleNodeFIFOManager) {
+  // Init.
+  FIFOManager manager = FIFOManager();
+
+  // Add the node to FIFOManager.
+  manager.AddNode(&node1_);
+
+  // Remove the only node in FIFOManager.
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+// Test that FIFOManager can remove multiple nodes and returns the current node
+// in the right order
+TEST_F(VirtualSchedulerTest, GetAndRemoveMultipleFIFOManager) {
+  // Init.
+  FIFOManager manager = FIFOManager();
+
+  // Add the nodes to FIFOManager.
+  manager.AddNode(&node1_);
+  manager.AddNode(&node2_);
+  manager.AddNode(&node3_);
+  manager.AddNode(&node4_);
+
+  // Keep checking current node while removing nodes from manager.
+  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+// Test that FIFOManager can remove multiple nodes and add more nodes, still
+// returning the current node in the right order
+TEST_F(VirtualSchedulerTest, AddAndRemoveMultipleFIFOManager) {
+  // Init.
+  FIFOManager manager = FIFOManager();
+
+  // Add the nodes to FIFOManager.
+  manager.AddNode(&node1_);
+  manager.AddNode(&node2_);
+  manager.AddNode(&node3_);
+  manager.AddNode(&node4_);
+
+  // Keep checking current node as nodes are removed and added.
+  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
+  manager.AddNode(&node5_);
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
+  manager.AddNode(&node6_);
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node5", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node6", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+// Test that LIFOManager correctly returns the current node with only 1 node.
+TEST_F(VirtualSchedulerTest, GetSingleNodeLIFOManager) {
+  // Init.
+  LIFOManager manager = LIFOManager();
+
+  // Add the node to LIFOManager.
+  manager.AddNode(&node1_);
+  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
+}
+
+// Test that LIFOManager removes the only node contained within.
+TEST_F(VirtualSchedulerTest, RemoveSingleNodeLIFOManager) {
+  // Init.
+  LIFOManager manager = LIFOManager();
+
+  // Add the node to LIFOManager.
+  manager.AddNode(&node1_);
+
+  // Remove the only node in LIFOManager.
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+// Test that LIFOManager can remove multiple nodes and returns the current node
+// in the right order
+TEST_F(VirtualSchedulerTest, GetAndRemoveMultipleLIFOManager) {
+  // Init.
+  LIFOManager manager = LIFOManager();
+
+  // Add the nodes to LIFOManager.
+  manager.AddNode(&node1_);
+  manager.AddNode(&node2_);
+  manager.AddNode(&node3_);
+  manager.AddNode(&node4_);
+
+  // Keep checking current node while removing nodes from manager.
+  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+// Test that LIFOManager can remove multiple nodes (must be removing the current
+// node) and add more nodes, still returning the current node in the right order
+TEST_F(VirtualSchedulerTest, AddAndRemoveMultipleLIFOManager) {
+  // Init.
+  LIFOManager manager = LIFOManager();
+
+  // Add the nodes to LIFOManager.
+  manager.AddNode(&node1_);
+  manager.AddNode(&node2_);
+  manager.AddNode(&node3_);
+  manager.AddNode(&node4_);
+
+  // Keep checking current node as nodes are removed and added.
+  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
+  manager.AddNode(&node5_);
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node5", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
+  manager.AddNode(&node6_);
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node6", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+// Create small graph, run predict costs on it, make sure the costs from the
+// summary match the hand-calculated costs.
+TEST_F(VirtualSchedulerTest, SummaryCostTest) {
+  // Run matmul test.
+  CreateGrapplerItemWithMatmulChain();
+  InitScheduler();
+  auto ops_executed = RunScheduler("");
+  Costs c = scheduler_->Summary();
+
+  // RandomUniform - 5 * 1s
+  // Matmuls - 4 * 2s = 8
+  // Misc - 5 * 1us
+  // Total: 13000005
+  EXPECT_EQ(13000005, c.execution_time.asMicroSeconds().count());
+}
+
+// Like the above SummaryCostTest, but makes sure the stepstats timeline is
+// correct.
+TEST_F(VirtualSchedulerTest, SummaryCostStepStatsTest) {
+  // Run matmul test.
+  CreateGrapplerItemWithMatmulChain();
+  InitScheduler();
+  auto ops_executed = RunScheduler("");
+  RunMetadata metadata;
+  Costs c = scheduler_->Summary(&metadata);
+  StepStats stepstats = metadata.step_stats();
+  EXPECT_EQ(13000005, c.execution_time.asMicroSeconds().count());
+
+  // Should only be 1 device!
+  EXPECT_EQ(1, stepstats.dev_stats().size());
+
+  // Create a map of op name -> start and end times (micros).
+  std::map<string, std::pair<int64, int64>> start_end_times;
+  for (const auto& device_step_stats : stepstats.dev_stats()) {
+    for (const auto& stats : device_step_stats.node_stats()) {
+      int64 start = stats.all_start_micros();
+      int64 end = start + stats.all_end_rel_micros();
+      start_end_times[stats.node_name()] = std::pair<int64, int64>(start, end);
+
+      // Make sure that the output properties are correct for
+      // MatMul and RandomUniform operations.
+      // We only check for dtype, and shape (excluding alloc)
+      // since alloc is not set by the virtual scheduler.
+      if (stats.timeline_label() == "MatMul" ||
+          stats.timeline_label() == "RandomUniform") {
+        EXPECT_EQ(1, stats.output().size());
+        for (const auto& output : stats.output()) {
+          EXPECT_EQ(DT_FLOAT, output.tensor_description().dtype());
+          EXPECT_EQ(2, output.tensor_description().shape().dim().size());
+          for (const auto& dim : output.tensor_description().shape().dim()) {
+            EXPECT_EQ(3200, dim.size());
+          }
+        }
+      }
+    }
+  }
+
+  // The base start_time is the time to compute RandomUniforms
+  int64 cur_time = static_cast<int64>(5000005);
+  // The increment is the execution time of one matmul. See
+  // CreateGrapplerItemWithMatmulChain for details.
+  int64 increment = static_cast<int64>(2000000);
+  auto op_names = {"ab", "abc", "abcd", "abcde"};
+  for (const auto& op_name : op_names) {
+    int64 actual_start = start_end_times[op_name].first;
+    int64 actual_end = start_end_times[op_name].second;
+    int64 expected_start = cur_time;
+    int64 expected_end = cur_time + increment;
+    EXPECT_EQ(expected_start, actual_start);
+    EXPECT_EQ(expected_end, actual_end);
+    cur_time += increment;
+  }
+}
+
 TEST_F(VirtualSchedulerTest, InitAndBasicScheduling) {
   // Init.
   CreateGrapplerItemWithConv2Ds();
@@ -498,5 +1245,120 @@ TEST_F(VirtualSchedulerTest, Variable) {
   ValidateMemoryUsageSnapshot({"x"}, 0 /* port_num_expected */,
                               cpu_state.mem_usage_snapshot_at_peak);
 }
+
+TEST_F(VirtualSchedulerTest, WhileLoop) {
+  // Init.
+  CreateGrapplerItemWithLoop();
+  InitScheduler();
+
+  // Run the scheduler.
+  RunScheduler("");
+
+  // Check the timeline
+  RunMetadata metadata;
+  scheduler_->Summary(&metadata);
+
+  int num_next_iteration = 0;
+  int num_next_iteration_1 = 0;
+  int num_exit = 0;
+  int num_exit_1 = 0;
+  for (const auto& device_step_stats : metadata.step_stats().dev_stats()) {
+    for (const auto& stats : device_step_stats.node_stats()) {
+      std::cout << stats.DebugString() << std::endl;
+      if (stats.node_name() == "while/NextIteration") {
+        ++num_next_iteration;
+        EXPECT_EQ(19, stats.all_start_micros());
+      } else if (stats.node_name() == "while/NextIteration_1") {
+        ++num_next_iteration_1;
+        EXPECT_EQ(20, stats.all_start_micros());
+      } else if (stats.node_name() == "while/Exit") {
+        ++num_exit;
+        EXPECT_EQ(14, stats.all_start_micros());
+      } else if (stats.node_name() == "while/Exit_1") {
+        ++num_exit_1;
+        EXPECT_EQ(12, stats.all_start_micros());
+      }
+    }
+  }
+
+  // Make sure we went though the body of the loop once, and that the output of
+  // the loop was scheduled as well.
+  EXPECT_EQ(1, num_next_iteration);
+  EXPECT_EQ(1, num_next_iteration_1);
+  EXPECT_EQ(1, num_exit);
+  EXPECT_EQ(1, num_exit_1);
+}
+
+TEST_F(VirtualSchedulerTest, InterDeviceTransfer) {
+  // Init.
+  CreateGrapplerItemWithInterDeviceTransfers();
+  InitScheduler();
+
+  // Run the scheduler.
+  auto ops_executed = RunScheduler("");
+
+  // Helper lambda to extract port num from _Send and _Recv op name.
+  auto get_port_num = [](const string& name) -> int {
+    if (name.find("bn:0") != std::string::npos) {
+      return 0;
+    } else if (name.find("bn:1") != std::string::npos) {
+      return 1;
+    } else if (name.find("bn:2") != std::string::npos) {
+      return 2;
+    } else if (name.find("bn:minus1") != std::string::npos) {
+      return -1;
+    }
+    return -999;
+  };
+
+  // Reorganize ops_executed for further testing.
+  std::unordered_map<string, int> op_count;
+  std::unordered_map<int, string> recv_op_names;
+  std::unordered_map<int, string> send_op_names;
+  for (const auto& x : ops_executed) {
+    const auto& name = x.first;
+    const auto& node_info = x.second;
+    const auto& op = node_info.op_info.op();
+    if (op == "_Recv") {
+      recv_op_names[get_port_num(name)] = name;
+    } else if (op == "_Send") {
+      send_op_names[get_port_num(name)] = name;
+    }
+    op_count[op]++;
+  }
+
+  // Same number of _Send and _Recv.
+  EXPECT_EQ(op_count.at("_Send"), op_count.at("_Recv"));
+
+  // Expect 4 Send and Recvs each: port 0, 1, and, 2, and control dependency.
+  EXPECT_EQ(op_count.at("_Recv"), 4);
+  EXPECT_EQ(op_count.at("_Send"), 4);
+
+  // Helper lambda for extracting output Tensor size.
+  auto get_output_size = [this, ops_executed](const string& name) -> int64 {
+    const auto& output_properties_ = ops_executed.at(name).op_info.outputs();
+    std::vector<OpInfo::TensorProperties> output_properties;
+    for (const auto& output_property : output_properties_) {
+      output_properties.push_back(output_property);
+    }
+    return scheduler_->CalculateOutputSize(output_properties, 0);
+
+  };
+
+  // Validate transfer size.
+  // Batchnorm output y is 4D vector: batch x width x width x depth.
+  int input_size = 4 * batch_size_ * width_ * height_ * depth_in_;
+  EXPECT_EQ(get_output_size(recv_op_names[0]), input_size);
+  EXPECT_EQ(get_output_size(send_op_names[0]), input_size);
+  // Mean and vars are 1-D vector with size depth_in_.
+  EXPECT_EQ(get_output_size(recv_op_names[1]), 4 * depth_in_);
+  EXPECT_EQ(get_output_size(send_op_names[1]), 4 * depth_in_);
+  EXPECT_EQ(get_output_size(recv_op_names[2]), 4 * depth_in_);
+  EXPECT_EQ(get_output_size(send_op_names[2]), 4 * depth_in_);
+  // Control dependency size is 4B.
+  EXPECT_EQ(get_output_size(recv_op_names[-1]), 4);
+  EXPECT_EQ(get_output_size(send_op_names[-1]), 4);
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/grappler_item.cc b/tensorflow/core/grappler/grappler_item.cc
index 312a457abf447be6ce291563505d77a4d7b30768..d27ddb17f11bb15d35391d2fd040fb4d71b5fa35 100644
--- a/tensorflow/core/grappler/grappler_item.cc
+++ b/tensorflow/core/grappler/grappler_item.cc
@@ -30,6 +30,16 @@ std::vector<const NodeDef*> GrapplerItem::MainOpsFanin() const {
   return ComputeTransitiveFanin(graph, fetch);
 }
 
+std::vector<const NodeDef*> GrapplerItem::EnqueueOpsFanin() const {
+  std::vector<string> enqueue_ops;
+  for (const auto& queue_runner : queue_runners) {
+    for (const string& enqueue_op : queue_runner.enqueue_op_name()) {
+      enqueue_ops.push_back(enqueue_op);
+    }
+  }
+  return ComputeTransitiveFanin(graph, enqueue_ops);
+}
+
 std::vector<const NodeDef*> GrapplerItem::InitOpsFanin() const {
   return ComputeTransitiveFanin(graph, init_ops);
 }
@@ -55,7 +65,7 @@ std::vector<const NodeDef*> ComputeTransitiveFanin(
   std::vector<const NodeDef*> queue;
   for (const string& root : terminal_nodes) {
     const NodeDef* node = name_to_node[NodeName(root)];
-    CHECK(node);
+    CHECK(node) << "Unknown root " << root;
     queue.push_back(node);
   }
 
diff --git a/tensorflow/core/grappler/grappler_item.h b/tensorflow/core/grappler/grappler_item.h
index e0709c682b003cd961d13902d1b7192f85c3f2b9..1e7a9dfaf5d1f5fab72b7d63ad9fadb605209a24 100644
--- a/tensorflow/core/grappler/grappler_item.h
+++ b/tensorflow/core/grappler/grappler_item.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variable.pb.h"
 #include "tensorflow/core/protobuf/queue_runner.pb.h"
 
 namespace tensorflow {
@@ -50,6 +51,8 @@ struct GrapplerItem {
 
   // Return the set of node evaluated during a regular train/inference step.
   std::vector<const NodeDef*> MainOpsFanin() const;
+  // Return the set of node run to populate the queues (if any).
+  std::vector<const NodeDef*> EnqueueOpsFanin() const;
   // Return the set nodes used by TensorFlow to initialize the graph.
   std::vector<const NodeDef*> InitOpsFanin() const;
   // Return the set of variables accessed during a regular train/inference step.
diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc
index bb36152bd87be2019d2ac36ba04fe2ad5fe42b3a..0c2801e8bc378242039380a7a986729117d08256 100644
--- a/tensorflow/core/grappler/grappler_item_builder.cc
+++ b/tensorflow/core/grappler/grappler_item_builder.cc
@@ -26,10 +26,13 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/framework/variable.pb.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/grappler/inputs/utils.h"
 #include "tensorflow/core/grappler/op_types.h"
@@ -70,11 +73,16 @@ void InitializeTensor(DataType type, Tensor* tensor) {
 // correct optimizations.
 Status OptimizeGraph(const GraphDef& graph_def, GraphDef* output_graph_def,
                      const ItemConfig& cfg) {
+  if (!cfg.apply_optimizations && !cfg.inline_functions) {
+    return Status::OK();
+  }
+
   // Create a session option for a single GPU device.
   SessionOptions options;
 
   // Inline all functions.
   GraphDef inlined_graph_def(graph_def);
+
   for (int i = 0; i < inlined_graph_def.library().function().size(); i++) {
     FunctionDef* fdef =
         inlined_graph_def.mutable_library()->mutable_function(i);
@@ -110,6 +118,10 @@ Status OptimizeGraph(const GraphDef& graph_def, GraphDef* output_graph_def,
   graph_ctor_opts.allow_internal_ops = true;
   graph_ctor_opts.expect_device_spec = false;
   std::unique_ptr<Graph> graphptr(new Graph(function_library));
+  // Populate default attrs to the NodeDefs in the GraphDef.
+  TF_RETURN_IF_ERROR(AddDefaultAttrsToGraphDef(&inlined_graph_def,
+                                               *graphptr->op_registry(), 0));
+
   TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(graph_ctor_opts, inlined_graph_def,
                                             graphptr.get()));
 
@@ -183,7 +195,7 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
           shape_proto.add_dim()->set_size(
               cfg.placeholder_unknown_output_shape_dim);
         } else {
-          dims.push_back(dim_proto.size());
+          dims.push_back(std::max<int32>(1, dim_proto.size()));
           shape_proto.add_dim()->set_size(dim_proto.size());
         }
       }
@@ -328,10 +340,33 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
   Status optimize_status =
       OptimizeGraph(new_item->graph, &new_item->graph, cfg);
   if (!optimize_status.ok()) {
-    LOG(ERROR) << "Function optimization failed: " << optimize_status;
+    LOG(ERROR) << "Graph preprocessing failed: " << optimize_status;
     return nullptr;
   }
 
+  // Validate feed, fetch and init nodes
+  std::unordered_set<string> nodes;
+  for (const auto& node : new_item->graph.node()) {
+    nodes.insert(node.name());
+  }
+  for (const auto& feed : new_item->feed) {
+    if (nodes.find(feed.first) == nodes.end()) {
+      LOG(ERROR) << "Feed node " << feed.first << " doesn't exist in graph";
+      return nullptr;
+    }
+  }
+  for (const auto& fetch : new_item->fetch) {
+    if (nodes.find(fetch) == nodes.end()) {
+      LOG(ERROR) << "Fetch node " << fetch << " doesn't exist in graph";
+      return nullptr;
+    }
+  }
+  for (const auto& init : new_item->init_ops) {
+    if (nodes.find(init) == nodes.end()) {
+      LOG(ERROR) << "Init node " << init << " doesn't exist in graph";
+      return nullptr;
+    }
+  }
   return new_item;
 }
 
diff --git a/tensorflow/core/grappler/grappler_item_builder.h b/tensorflow/core/grappler/grappler_item_builder.h
index 3aa1d2027f585af4c439438fd013e5b8040195cf..d385a1916effefa353b612adbb148d93f1eeca95 100644
--- a/tensorflow/core/grappler/grappler_item_builder.h
+++ b/tensorflow/core/grappler/grappler_item_builder.h
@@ -31,8 +31,8 @@ struct ItemConfig {
       : ignore_user_placement(true),
         ignore_colocation(true),
         placeholder_unknown_output_shape_dim(-1),
-        apply_optimizations(true),
-        inline_functions(true) {}
+        apply_optimizations(false),
+        inline_functions(false) {}
 
   // If true, ignore all user specified node placement.
   bool ignore_user_placement;
diff --git a/tensorflow/core/grappler/grappler_item_builder_test.cc b/tensorflow/core/grappler/grappler_item_builder_test.cc
index 92225ffb1b48ffac07e8dbfa83a7a4f778b16474..048870f9e51e8e3daa422e6f8ceab77bd42e83cd 100644
--- a/tensorflow/core/grappler/grappler_item_builder_test.cc
+++ b/tensorflow/core/grappler/grappler_item_builder_test.cc
@@ -51,7 +51,11 @@ void SampleSumSymbolicGradientGraphdef(
   auto g0 = SymbolicGradient(scope, std::initializer_list<Input>{x, y, z},
                              {DT_FLOAT, DT_INT32}, fn);
 
-  fetches->mutable_node_list()->add_value(g0[0].name());
+  // TODO(bsteiner): we should rewrite the feed/fetch nodes to reflect the
+  // inlining that's done in the item builder
+  // fetches->mutable_node_list()->add_value(g0[0].name());
+  fetches->mutable_node_list()->add_value("SymbolicGradient/dx");
+  fetches->mutable_node_list()->add_value("SymbolicGradient/dy_reshaped");
 
   TF_CHECK_OK(scope.ToGraphDef(def));
 
@@ -109,11 +113,12 @@ TEST_F(GrapplerItemBuilderTest, SymbolicGradientInlining) {
   std::unique_ptr<GrapplerItem> with_inline = CreateGrapplerItem(def, fetches);
 
   // For the inlined graph, there should be 0 symbolic gradient ops.
-  CHECK_EQ(0, CountSymbolicGradientOps(with_inline));
+  EXPECT_EQ(0, CountSymbolicGradientOps(with_inline));
 
   // For the inlined graph, make sure all the required expanded op’s are in the
   // graph.
-  CHECK_EQ(ops_of_inline.size(), CountOpsWithNames(with_inline, ops_of_inline));
+  EXPECT_EQ(ops_of_inline.size(),
+            CountOpsWithNames(with_inline, ops_of_inline));
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/inputs/BUILD b/tensorflow/core/grappler/inputs/BUILD
index 176b3e982fbef44a25bf48cde09a265696c41483..5c70f409697dc422b4a523198c65ca8e48f593f9 100644
--- a/tensorflow/core/grappler/inputs/BUILD
+++ b/tensorflow/core/grappler/inputs/BUILD
@@ -22,7 +22,6 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
     ],
@@ -37,7 +36,6 @@ cc_test(
     deps = [
         ":utils",
         "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
diff --git a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc
index 446ae2df643aab7679313e431d4c8c0d47095d88..b1ec35e2687c3c2bb0b7f52a4eba9048354ca8fe 100644
--- a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc
+++ b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc
@@ -48,9 +48,17 @@ GraphDef CreateGraphDef(int num_stages, int width, int tensor_size,
   for (int i = 0; i < num_stages; i++) {
     std::vector<Output> this_stage;
     for (int j = 0; j < width; j++) {
-      Output combine = AddN(
-          s.WithDevice(device_names[use_multiple_devices ? j : 0]), last_stage);
-      this_stage.push_back(combine);
+      if (last_stage.size() == 1) {
+        Output unary_op =
+            Square(s.WithDevice(device_names[use_multiple_devices ? j : 0]),
+                   last_stage[0]);
+        this_stage.push_back(unary_op);
+      } else {
+        Output combine =
+            AddN(s.WithDevice(device_names[use_multiple_devices ? j : 0]),
+                 last_stage);
+        this_stage.push_back(combine);
+      }
     }
     last_stage = this_stage;
   }
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 51146011b011c77adca413ebb459d6a5c2684059..8584681220d92c5a049d63b7d498ad9ab38455ff 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -18,6 +18,11 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+bool IsAddN(const NodeDef& node) {
+  const auto op = node.op();
+  return op == "AddN";
+}
+
 bool IsConcat(const NodeDef& node) {
   const auto op = node.op();
   return op == "Concat" || op == "ConcatV2";
@@ -37,7 +42,7 @@ bool IsDequeueOp(const NodeDef& node) {
 
 bool IsIdentity(const NodeDef& node) {
   const auto& op = node.op();
-  return op == "Identity";
+  return op == "Identity" || op == "RefIdentity";
 }
 
 bool IsMerge(const NodeDef& node) {
@@ -50,6 +55,11 @@ bool IsNoOp(const NodeDef& node) {
   return op == "NoOp";
 }
 
+bool IsNextIteration(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "NextIteration" || op == "RefNextIteration";
+}
+
 bool IsPlaceholder(const NodeDef& node) {
   const auto op = node.op();
   return op == "Placeholder" || op == "PlaceholderV2" ||
@@ -67,11 +77,18 @@ bool IsReduction(const NodeDef& node) {
          op == "Mean" || op == "Any" || op == "All";
 }
 
+bool IsReshape(const NodeDef& node) { return (node.op() == "Reshape"); }
+
 bool IsSend(const NodeDef& node) {
   const auto op = node.op();
   return op == "_Send";
 }
 
+bool IsStopGradient(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "StopGradient" || op == "PreventGradient";
+}
+
 bool IsSwitch(const NodeDef& node) {
   const auto& op = node.op();
   return op == "Switch";
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index b2102c688df0a6dea2700c02305c12d2284f788e..d83cb777ed546a73ad6e203f1e61260597af7abf 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -21,16 +21,20 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+bool IsAddN(const NodeDef& node);
 bool IsConcat(const NodeDef& node);
 bool IsConstant(const NodeDef& node);
 bool IsDequeueOp(const NodeDef& node);
 bool IsIdentity(const NodeDef& node);
 bool IsMerge(const NodeDef& node);
+bool IsNextIteration(const NodeDef& node);
 bool IsNoOp(const NodeDef& node);
 bool IsPlaceholder(const NodeDef& node);
 bool IsRecv(const NodeDef& node);
 bool IsReduction(const NodeDef& node);
+bool IsReshape(const NodeDef& node);
 bool IsSend(const NodeDef& node);
+bool IsStopGradient(const NodeDef& node);
 bool IsSwitch(const NodeDef& node);
 bool IsTranspose(const NodeDef& node);
 bool IsVariable(const NodeDef& node);
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 6bb3d50b76ddc9b9637e83f3a1261988cd877b90..edd3fae7b2675c356eceb4c05f299e52a3441148 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -59,7 +59,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":graph_optimizer",
-        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:devices",
@@ -78,11 +77,11 @@ cc_test(
         ":auto_parallel",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
-        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
     ],
 )
 
@@ -165,6 +164,7 @@ cc_library(
         ":graph_rewriter",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
     ],
 )
@@ -198,8 +198,10 @@ cc_library(
         ":static_schedule",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
 
@@ -229,7 +231,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":graph_optimizer",
-        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:devices",
@@ -254,7 +255,6 @@ cc_test(
         "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
-        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
     ],
 )
 
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel.cc b/tensorflow/core/grappler/optimizers/auto_parallel.cc
index d4326a022f465d8e11503b7bbae61747f8b0bb21..3f58a2abeac9f776e995f5086d7f01b76c334f55 100644
--- a/tensorflow/core/grappler/optimizers/auto_parallel.cc
+++ b/tensorflow/core/grappler/optimizers/auto_parallel.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -167,6 +169,11 @@ Status AutoParallel::Initialize(const GrapplerItem& item) {
   for (const auto& variable : item.MainVariables()) {
     dont_replicate_nodes.insert(variable->name());
   }
+
+  for (const auto& init : item.init_ops) {
+    dont_replicate_nodes.insert(NodeName(init));
+  }
+
   // Don't replicate all input nodes, except the dequeue node.
   for (const auto& input_node : input_nodes) {
     if (input_node->name() != dequeue_node->name()) {
@@ -248,7 +255,8 @@ void AutoParallel::BuildGraph(GraphDef* graph) {
   for (const auto& fetch : item_->fetch) {
     AddNodeControl(fetch, {control->name()}, graph);
   }
-  *(graph->mutable_library()) = item_->graph.library();
+  *graph->mutable_library() = item_->graph.library();
+  *graph->mutable_versions() = item_->graph.versions();
   LOG(INFO) << "Parallelized graph size: " << graph->node_size();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel.h b/tensorflow/core/grappler/optimizers/auto_parallel.h
index ad90bbe02892276d0e3bb28eb872c950640164a2..c5d2d47782f0d5515e65e1f99b212315dcc13c0e 100644
--- a/tensorflow/core/grappler/optimizers/auto_parallel.h
+++ b/tensorflow/core/grappler/optimizers/auto_parallel.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
 
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+#include "tensorflow/core/framework/variable.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel_test.cc b/tensorflow/core/grappler/optimizers/auto_parallel_test.cc
index 3d1b4a34bfce69817d327b72b3643f3d391c10ec..9a41b5e0b5158082ea6272deecef4bab5132f0db 100644
--- a/tensorflow/core/grappler/optimizers/auto_parallel_test.cc
+++ b/tensorflow/core/grappler/optimizers/auto_parallel_test.cc
@@ -33,6 +33,7 @@ TEST_F(AutoParallelTest, SimpleParallel) {
   Output constant_b = ops::Const(s.WithOpName("constant_b"), 1, {1});
   Output var = ops::Variable(s.WithOpName("var"), {1}, DT_FLOAT);
   Output assign = ops::Assign(s.WithOpName("assign"), {var}, {constant_a});
+  Output identity = ops::Identity(s.WithOpName("identity"), {var});
   Output fifo_queue = ops::FIFOQueue(s.WithOpName("fifo_queue"), {DT_FLOAT});
   auto dequeue = ops::QueueDequeueMany(s.WithOpName("dequeue"), {fifo_queue},
                                        {constant_b}, {DT_FLOAT});
@@ -44,13 +45,14 @@ TEST_F(AutoParallelTest, SimpleParallel) {
   GrapplerItem item;
   item.init_ops.push_back("assign");
   item.fetch.push_back("apply_gradient");
+  item.init_ops.push_back("assign");
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   AutoParallel parallel(2);
   GraphDef output;
   Status status = parallel.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
-  EXPECT_EQ(20, output.node_size());
+  EXPECT_EQ(21, output.node_size());
 
   const NodeDef& node_assign = output.node(0);
   EXPECT_EQ("assign", node_assign.name());
@@ -62,60 +64,64 @@ TEST_F(AutoParallelTest, SimpleParallel) {
   const NodeDef& node_fifo_queue = output.node(2);
   EXPECT_EQ("fifo_queue", node_fifo_queue.name());
 
-  const NodeDef& node_var = output.node(3);
+  const NodeDef& node_identity = output.node(3);
+  EXPECT_EQ("identity", node_identity.name());
+  EXPECT_EQ("var", node_identity.input(0));
+
+  const NodeDef& node_var = output.node(4);
   EXPECT_EQ("var", node_var.name());
 
-  const NodeDef& node_div_const0 = output.node(4);
+  const NodeDef& node_div_const0 = output.node(5);
   EXPECT_EQ("AutoParallel-Replica-0/AutoParallel-Div-Const",
             node_div_const0.name());
 
-  const NodeDef& node_div0 = output.node(5);
+  const NodeDef& node_div0 = output.node(6);
   EXPECT_EQ("AutoParallel-Replica-0/AutoParallel-Div-apply_gradient",
             node_div0.name());
-  const NodeDef& node_add0 = output.node(6);
+  const NodeDef& node_add0 = output.node(7);
   EXPECT_EQ("AutoParallel-Replica-0/add", node_add0.name());
 
-  const NodeDef& node_gradient0 = output.node(7);
+  const NodeDef& node_gradient0 = output.node(8);
   EXPECT_EQ("AutoParallel-Replica-0/apply_gradient", node_gradient0.name());
 
-  const NodeDef& node_constant_a0 = output.node(8);
+  const NodeDef& node_constant_a0 = output.node(9);
   EXPECT_EQ("AutoParallel-Replica-0/constant_a", node_constant_a0.name());
 
-  const NodeDef& node_dequeue0 = output.node(9);
+  const NodeDef& node_dequeue0 = output.node(10);
   EXPECT_EQ("AutoParallel-Replica-0/dequeue", node_dequeue0.name());
 
-  const NodeDef& node_learning_rate0 = output.node(10);
+  const NodeDef& node_learning_rate0 = output.node(11);
   EXPECT_EQ("AutoParallel-Replica-0/learning_rate", node_learning_rate0.name());
 
-  const NodeDef& node_div_const1 = output.node(11);
+  const NodeDef& node_div_const1 = output.node(12);
   EXPECT_EQ("AutoParallel-Replica-1/AutoParallel-Div-Const",
             node_div_const1.name());
 
-  const NodeDef& node_div1 = output.node(12);
+  const NodeDef& node_div1 = output.node(13);
   EXPECT_EQ("AutoParallel-Replica-1/AutoParallel-Div-apply_gradient",
             node_div1.name());
 
-  const NodeDef& node_add1 = output.node(13);
+  const NodeDef& node_add1 = output.node(14);
   EXPECT_EQ("AutoParallel-Replica-1/add", node_add1.name());
 
-  const NodeDef& node_gradient1 = output.node(14);
+  const NodeDef& node_gradient1 = output.node(15);
   EXPECT_EQ("AutoParallel-Replica-1/apply_gradient", node_gradient1.name());
 
-  const NodeDef& node_constant_a1 = output.node(15);
+  const NodeDef& node_constant_a1 = output.node(16);
   EXPECT_EQ("AutoParallel-Replica-1/constant_a", node_constant_a1.name());
 
-  const NodeDef& node_dequeue1 = output.node(16);
+  const NodeDef& node_dequeue1 = output.node(17);
   EXPECT_EQ("AutoParallel-Replica-1/dequeue", node_dequeue1.name());
 
-  const NodeDef& node_learning_rate1 = output.node(17);
+  const NodeDef& node_learning_rate1 = output.node(18);
   EXPECT_EQ("AutoParallel-Replica-1/learning_rate", node_learning_rate1.name());
 
-  const NodeDef& node_fetch = output.node(18);
+  const NodeDef& node_fetch = output.node(19);
   EXPECT_EQ("AutoParallel-Control-Fetch", node_fetch.name());
   EXPECT_EQ("^AutoParallel-Replica-0/apply_gradient", node_fetch.input(0));
   EXPECT_EQ("^AutoParallel-Replica-1/apply_gradient", node_fetch.input(1));
 
-  const NodeDef& node_gradient = output.node(19);
+  const NodeDef& node_gradient = output.node(20);
   EXPECT_EQ("apply_gradient", node_gradient.name());
   EXPECT_EQ("^AutoParallel-Control-Fetch", node_gradient.input(0));
 }
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index ac04be6d331216fbe4b5f65377061f09b890439f..72bbaa78afc1666f429447dc9db8de82e919517c 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -18,15 +18,19 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -55,7 +59,7 @@ class EigenThreadPoolWrapper : public Eigen::ThreadPoolInterface {
 class DeviceSimple : public DeviceBase {
  public:
   DeviceSimple() : DeviceBase(Env::Default()) {
-    eigen_worker_threads_.num_threads = 1;
+    eigen_worker_threads_.num_threads = port::NumSchedulableCPUs();
     eigen_worker_threads_.workers = new thread::ThreadPool(
         Env::Default(), "constant_folding", eigen_worker_threads_.num_threads);
     eigen_threadpool_wrapper_.reset(
@@ -97,9 +101,13 @@ string AsControlDependency(const NodeDef& node) {
 }  // namespace
 
 ConstantFolding::ConstantFolding() {
+  resource_mgr_.reset(new ResourceMgr());
+
   ops_to_preserve_ = std::regex(
-      "Placeholder.*|Const|.*Save.*|.*Restore.*|.*Reader|Enter|Exit|"
-      "NextIteration");
+      "Placeholder.*|Const|.*Save.*|.*Restore.*|.*Reader|"
+      "Enter|RefEnter|Exit|RefExit|NextIteration|RefNextIteration|"
+      ".*Quantized.*",
+      std::regex_constants::optimize);
 }
 
 string ConstantFolding::AddControlDependency(const string& input_name) {
@@ -133,6 +141,8 @@ string ConstantFolding::AddControlDependency(const string& input_name) {
     NodeDef* added_node = graph_.add_node();
     added_node->set_name(ctrl_dep_name);
     added_node->set_op("Identity");
+    added_node->set_device(node->device());
+
     (*added_node->mutable_attr())["T"].set_type(output_type);
     *added_node->add_input() = input_name;
     node_map_->AddNode(added_node->name(), added_node);
@@ -141,9 +151,8 @@ string ConstantFolding::AddControlDependency(const string& input_name) {
   }
 }
 
-Status ConstantFolding::MaterializeShapes(const GrapplerItem& item) {
-  GraphProperties properties(item);
-  TF_RETURN_IF_ERROR(properties.InferStatically());
+Status ConstantFolding::MaterializeShapes(const GrapplerItem& item,
+                                          const GraphProperties& properties) {
   // We may add some nodes to the graph to encode control dependencies: there is
   // no need to process these, so only iterate over the nodes of the input
   // graph.
@@ -234,13 +243,18 @@ Status ConstantFolding::MaterializeShapes(const GrapplerItem& item) {
 }
 
 bool ConstantFolding::IsFoldable(const NodeDef& node) const {
+  // Folding not applicable to ops with no inputs.
+  if (node.input().empty()) {
+    return false;
+  }
+
   // Skips nodes that must be preserved, and op_types that don't benefit from
   // folding
   if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
     return false;
   }
-  std::cmatch match;
-  if (std::regex_match(node.op().c_str(), match, ops_to_preserve_)) {
+  if (std::regex_match(node.op().c_str(), ops_to_preserve_,
+                       std::regex_constants::match_any)) {
     return false;
   }
 
@@ -258,22 +272,6 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
     return false;
   }
 
-  DeviceTypeVector device_types;
-  status = SupportedDeviceTypesForNode({DeviceType(DEVICE_CPU)}, node,
-                                       &device_types);
-  if (!status.ok()) {
-    return false;
-  }
-  // Only fold ops with a CPU implementation available.
-  if (device_types[0] != DeviceType(DEVICE_CPU)) {
-    return false;
-  }
-
-  // Folding not applicable to ops with no inputs.
-  if (node.input().empty()) {
-    return false;
-  }
-
   // No need to (and don't) fold nodes that have no outgoing edges. Such nodes
   // could be introduced by an earlier constant folding pass and are preserved
   // in case users want to fetch their values; re-processing them would
@@ -283,15 +281,35 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
     return false;
   }
 
+  // We can only fold nodes if all their inputs are known statically, except in
+  // the case of a merge node that propagate the first inputs that becomes
+  // available, and therefore only requires a single constant input to be
+  // foldable.
+  bool has_constant_input = false;
+  const bool is_merge = IsMerge(node);
   for (const auto& input : node.input()) {
     if (IsControlInput(input)) {
       continue;
     }
-    bool is_const = IsConstant(*node_map_->GetNode(input));
-    if (!is_const) {
+    const NodeDef* input_node = node_map_->GetNode(input);
+    if (!input_node) {
       return false;
     }
+    bool is_const = IsConstant(*input_node);
+    if (!is_const && !is_merge) {
+      return false;
+    }
+    // Don't fold strings constants for now since this causes problems with
+    // checkpointing.
+    if (is_const && input_node->attr().at("dtype").type() == DT_STRING) {
+      return false;
+    }
+    has_constant_input |= is_const;
+  }
+  if (is_merge) {
+    return has_constant_input;
   }
+
   return true;
 }
 
@@ -330,6 +348,7 @@ Status ConstantFolding::EvaluateNode(const NodeDef& node,
   params.frame_iter = FrameAndIter(0, 0);
   params.inputs = &inputs;
   params.op_kernel = op_kernel.get();
+  params.resource_manager = resource_mgr_.get();
 
   gtl::InlinedVector<AllocatorAttributes, 4> output_attrs;
   const int num_outputs = op_kernel->num_outputs();
@@ -345,31 +364,42 @@ Status ConstantFolding::EvaluateNode(const NodeDef& node,
   for (int i = 0; i < num_outputs; i++) {
     output->push_back(op_context.release_output(i));
   }
-  return Status::OK();
+  return op_context.status();
 }
 
 Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
                                             std::vector<NodeDef>* outputs) {
   TensorVector inputs;
+  auto inputs_cleanup = gtl::MakeCleanup([&inputs] {
+    for (const auto& input : inputs) {
+      delete input.tensor;
+    }
+  });
+
   for (const auto& input : node.input()) {
-    if (IsControlInput(input)) {
+    int position = 0;
+    ParseNodeName(input, &position);
+    if (position < 0) {
+      // Control dependency
       break;
     }
-    TensorVector output;
-    TF_RETURN_IF_ERROR(
-        EvaluateNode(*node_map_->GetNode(input), TensorVector(), &output));
-    inputs.push_back(output[0]);
+    const NodeDef* input_node = node_map_->GetNode(input);
+    if (!IsConstant(*input_node)) {
+      return Status(error::INVALID_ARGUMENT,
+                    strings::StrCat("Can't fold ", node.name(), ", its ", input,
+                                    " isn't constant"));
+    }
+    Tensor* value = new Tensor(input_node->attr().at("dtype").type());
+    CHECK(value->FromProto(input_node->attr().at("value").tensor()));
+    inputs.emplace_back(value);
   }
 
   TensorVector output_tensors;
   TF_RETURN_IF_ERROR(EvaluateNode(node, inputs, &output_tensors));
-  for (const auto& input : inputs) {
-    delete input.tensor;
-  }
   if (output_tensors.empty()) {
     Status(error::INVALID_ARGUMENT, "Expected at least one output.");
   }
-  for (int i = 0; i < output_tensors.size(); i++) {
+  for (size_t i = 0; i < output_tensors.size(); i++) {
     string node_name = AddPrefixToNodeName(node.name(), kConstantFoldingConst);
     if (output_tensors.size() > 1) {
       node_name = strings::StrCat(node_name, "-", i);
@@ -387,6 +417,95 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
 }
 
 Status ConstantFolding::FoldNode(const NodeDef& node, GraphDef* output) {
+  if (IsMerge(node)) {
+    // Merge nodes are special, in the sense that they execute as soon as one of
+    // their input is ready. We can therefore fold a merge node iff it has at
+    // least one constant input without control dependency.
+    // We still need to ensure that the nodes in the fanin of the merge node are
+    // scheduled. We'll therefore add a control dependency from the merge node
+    // to the folded constant. We end up with:
+    //  * the merge node and its inputs are preserved as is
+    //  * a new constant node C1, driven by the merge node through a control
+    //  dependency, initialized to the value of the folded input
+    //  * a new constant node C2, driven by the merge node through a control
+    //  dependency, initialized to the index of the folded input
+    //  * the fanout of the merge nodes is rewired to be driven by either C1 or
+    //  C2.
+    for (int input_index = 0; input_index < node.input_size(); ++input_index) {
+      const auto& input = node.input(input_index);
+      if (IsControlInput(input)) {
+        // Try the next input.
+        continue;
+      }
+      NodeDef* input_node = node_map_->GetNode(input);
+      if (!IsConstant(*input_node)) {
+        continue;
+      }
+      bool valid_input = true;
+      for (const string& fanin_of_input : input_node->input()) {
+        if (IsControlInput(fanin_of_input)) {
+          valid_input = false;
+          break;
+        }
+      }
+      if (!valid_input) {
+        // Try the next input
+        continue;
+      }
+
+      string const_out_name =
+          AddPrefixToNodeName(node.name(), kConstantFoldingConst);
+      string const_index_name = AddPrefixToNodeName(
+          strings::StrCat(node.name(), "_index"), kConstantFoldingConst);
+      if (node_map_->GetNode(const_out_name) ||
+          node_map_->GetNode(const_index_name)) {
+        // Intended name already exists.
+        return errors::AlreadyExists(
+            strings::StrCat(const_out_name, " or ", const_index_name,
+                            "already present in the graph"));
+      }
+
+      NodeDef* const_out = output->add_node();
+      *const_out = *input_node;
+      const_out->set_name(const_out_name);
+      const_out->set_device(node.device());
+      *const_out->add_input() = AsControlDependency(node);
+      node_map_->AddNode(const_out->name(), const_out);
+
+      NodeDef* const_index = output->add_node();
+      const_index->set_op("Const");
+      Tensor index(DT_INT32, TensorShape({}));
+      index.flat<int32>()(0) = input_index;
+      (*const_index->mutable_attr())["dtype"].set_type(DT_INT32);
+      index.AsProtoTensorContent(
+          (*const_index->mutable_attr())["value"].mutable_tensor());
+      const_index->set_name(const_index_name);
+      const_index->set_device(node.device());
+      *const_index->add_input() = AsControlDependency(node);
+      node_map_->AddNode(const_index->name(), const_index);
+
+      auto outputs = node_map_->GetOutputs(node.name());
+      for (auto& output : outputs) {
+        for (int i = 0; i < output->input_size(); i++) {
+          int position;
+          string node_name = ParseNodeName(output->input(i), &position);
+          if (node_name == node.name()) {
+            if (position == 0) {
+              *output->mutable_input(i) = const_out->name();
+            } else if (position == 1) {
+              *output->mutable_input(i) = const_index->name();
+            } else {
+              // This is a control dependency (or an invalid edge since the
+              // merge node has only 2 inputs): preserve them.
+            }
+          }
+        }
+      }
+      return Status::OK();
+    }
+    return Status::OK();
+  }
+
   std::vector<NodeDef> const_nodes;
   TF_RETURN_IF_ERROR(EvaluateOneFoldable(node, &const_nodes));
 
@@ -398,8 +517,15 @@ Status ConstantFolding::FoldNode(const NodeDef& node, GraphDef* output) {
       // below to preserve the overall behavior of the graph wrt dead edges.
       continue;
     }
+
+    if (node_map_->GetNode(const_node.name())) {
+      // Intended name already exists.
+      return errors::AlreadyExists(
+          strings::StrCat(const_node.name(), "already present in the graph"));
+    }
     NodeDef* added_node = output->add_node();
     *added_node = const_node;
+    added_node->set_device(node.device());
     node_map_->AddNode(added_node->name(), added_node);
 
     for (const auto& input : node.input()) {
@@ -451,21 +577,30 @@ Status ConstantFolding::FoldNode(const NodeDef& node, GraphDef* output) {
 }
 
 Status ConstantFolding::FoldGraph(GraphDef* output) {
-  std::set<string> processed_nodes;
-  while (1) {
-    int previous_processed = processed_nodes.size();
-    for (const auto& node : graph_.node()) {
-      if (IsFoldable(node) &&
-          processed_nodes.find(node.name()) == processed_nodes.end()) {
-        TF_RETURN_IF_ERROR(FoldNode(node, output));
-        processed_nodes.insert(node.name());
-      }
+  std::unordered_set<string> processed_nodes;
+  std::deque<const NodeDef*> queue;
+  for (const auto& node : graph_.node()) {
+    if (IsFoldable(node)) {
+      queue.push_back(&node);
     }
-    int current_processed = processed_nodes.size();
-    LOG(INFO) << "Previous number of processed nodes: " << previous_processed
-              << "; Current number of processed nodes: " << current_processed;
-    if (current_processed == previous_processed) {
-      break;
+  }
+  while (!queue.empty()) {
+    const NodeDef* node = queue.front();
+    queue.pop_front();
+    if (processed_nodes.count(node->name())) {
+      continue;
+    }
+    Status s = FoldNode(*node, output);
+    processed_nodes.insert(node->name());
+    if (!s.ok()) {
+      VLOG(1) << "Failed to fold node " << node->name() << ": " << s;
+    } else {
+      auto outputs = node_map_->GetOutputs(node->name());
+      for (auto& output : outputs) {
+        if (IsFoldable(*output)) {
+          queue.push_back(output);
+        }
+      }
     }
   }
 
@@ -502,7 +637,65 @@ bool ConstantFolding::IsSimplifiableReduction(const NodeDef& node) const {
   return false;
 }
 
-Status ConstantFolding::SimplifyGraph(GraphDef* output) {
+bool ConstantFolding::IsSimplifiableReshape(
+    const NodeDef& node, const GraphProperties& properties) const {
+  if (!IsReshape(node)) {
+    return false;
+  }
+  CHECK_LE(2, node.input_size());
+  const NodeDef* new_shape = node_map_->GetNode(node.input(1));
+  if (!IsConstant(*new_shape)) {
+    return false;
+  }
+  TensorVector outputs;
+  auto outputs_cleanup = gtl::MakeCleanup([&outputs] {
+    for (const auto& output : outputs) {
+      delete output.tensor;
+    }
+  });
+
+  Status s = EvaluateNode(*new_shape, TensorVector(), &outputs);
+  if (!s.ok()) {
+    return false;
+  }
+  CHECK_EQ(1, outputs.size());
+
+  const std::vector<OpInfo::TensorProperties>& props =
+      properties.GetInputProperties(node.name());
+  if (props.empty()) {
+    return false;
+  }
+  const OpInfo::TensorProperties& prop = props[0];
+  if (prop.dtype() == DT_INVALID) {
+    return false;
+  }
+  const PartialTensorShape shape(prop.shape());
+  if (!shape.IsFullyDefined()) {
+    return false;
+  }
+
+  PartialTensorShape new_dims;
+  if (outputs[0]->dtype() == DT_INT32) {
+    std::vector<int32> shp;
+    for (int i = 0; i < outputs[0]->NumElements(); ++i) {
+      int32 dim = outputs[0]->flat<int32>()(i);
+      shp.push_back(dim);
+    }
+    TF_CHECK_OK(TensorShapeUtils::MakeShape(shp, &new_dims));
+  } else {
+    std::vector<int64> shp;
+    for (int i = 0; i < outputs[0]->NumElements(); ++i) {
+      int64 dim = outputs[0]->flat<int64>()(i);
+      shp.push_back(dim);
+    }
+    TF_CHECK_OK(TensorShapeUtils::MakeShape(shp, &new_dims));
+  }
+
+  return shape.IsCompatibleWith(new_dims);
+}
+
+Status ConstantFolding::SimplifyGraph(GraphDef* output,
+                                      const GraphProperties& properties) {
   for (auto& node : *output->mutable_node()) {
     if (IsSimplifiableReduction(node)) {
       // Replace the reduction node with an identity node, that can be further
@@ -523,9 +716,23 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output) {
       }
       node.mutable_input()->RemoveLast();
       for (const auto& input : reductions_indices->input()) {
-        if (IsControlInput(input)) {
-          *node.add_input() = input;
-        }
+        DCHECK(IsControlInput(input));
+        *node.add_input() = input;
+      }
+    }
+    if (IsSimplifiableReshape(node, properties)) {
+      const NodeDef* new_shape = node_map_->GetNode(node.input(1));
+      DataType output_type = node.attr().at("T").type();
+      node.set_op("Identity");
+      node.clear_attr();
+      (*node.mutable_attr())["T"].set_type(output_type);
+      if (node.input_size() > 2) {
+        node.mutable_input()->SwapElements(1, node.input_size() - 1);
+      }
+      node.mutable_input()->RemoveLast();
+      for (const auto& input : new_shape->input()) {
+        DCHECK(IsControlInput(input));
+        *node.add_input() = input;
       }
     }
   }
@@ -535,7 +742,6 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output) {
 Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* output) {
   graph_ = item.graph;
-  LOG(INFO) << "Initial graph size: " << item.graph.node_size();
   node_map_.reset(new NodeMap(&graph_));
   for (const auto& node : item.fetch) {
     nodes_to_preserve_.insert(NodeName(node));
@@ -545,10 +751,21 @@ Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
   }
   device_.reset(new DeviceSimple());
   *output = GraphDef();
-  TF_RETURN_IF_ERROR(MaterializeShapes(item));
+
+  GraphProperties properties(item);
+  Status s = properties.InferStatically();
+  if (!s.ok()) {
+    VLOG(1) << "Failed to infer graph shapes: " << s;
+  } else {
+    TF_RETURN_IF_ERROR(MaterializeShapes(item, properties));
+  }
+
   TF_RETURN_IF_ERROR(FoldGraph(output));
-  TF_RETURN_IF_ERROR(SimplifyGraph(output));
-  LOG(INFO) << "Optimized graph size: " << output->node_size();
+  TF_RETURN_IF_ERROR(SimplifyGraph(output, properties));
+
+  *output->mutable_library() = item.graph.library();
+  *output->mutable_versions() = item.graph.versions();
+
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index dca8f174682ad5fe74b6d75d31e2c9fca127da01..88475e4e757663a5d41202b559329ed8d918f71b 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -19,6 +19,8 @@ limitations under the License.
 #include <regex>
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
 
@@ -28,7 +30,7 @@ namespace grappler {
 const char kConstantFoldingConst[] = "ConstantFolding";
 const char kConstantFoldingCtrl[] = "ConstantFoldingCtrl";
 
-// Contant folding optimization for a graph.
+// Constant folding optimization for a graph.
 class ConstantFolding : public GraphOptimizer {
  public:
   ConstantFolding();
@@ -45,7 +47,8 @@ class ConstantFolding : public GraphOptimizer {
 
  private:
   string AddControlDependency(const string& input_name);
-  Status MaterializeShapes(const GrapplerItem& item);
+  Status MaterializeShapes(const GrapplerItem& item,
+                           const GraphProperties& properties);
 
   bool IsFoldable(const NodeDef& node) const;
 
@@ -63,9 +66,12 @@ class ConstantFolding : public GraphOptimizer {
   Status FoldGraph(GraphDef* output);
 
   bool IsSimplifiableReduction(const NodeDef& node) const;
-  Status SimplifyGraph(GraphDef* output);
+  bool IsSimplifiableReshape(const NodeDef& node,
+                             const GraphProperties& properties) const;
+  Status SimplifyGraph(GraphDef* output, const GraphProperties& properties);
 
   std::unique_ptr<DeviceBase> device_;
+  std::unique_ptr<ResourceMgr> resource_mgr_;
   GraphDef graph_;
   std::unique_ptr<NodeMap> node_map_;
   std::set<string> nodes_to_preserve_;
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 566d3cd9a391846ee5363c6adeb83a5597601350..e2150f9e0ffb1518033c6484da951601ffb2517d 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -50,7 +50,7 @@ TEST_F(ConstantFoldingTest, SimpleFolding) {
 
   Output a = ops::Const(s.WithOpName("a"), 1.0f, {1});
   Output b = ops::Const(s.WithOpName("b"), 2.0f, {1});
-  Output c = ops::AddN(s.WithOpName("c"), {a, b});
+  Output c = ops::AddN(s.WithOpName("c").WithDevice("/CPU:0"), {a, b});
   Output d = ops::AddN(s.WithOpName("d"), {b, c});
 
   GrapplerItem item;
@@ -67,6 +67,7 @@ TEST_F(ConstantFoldingTest, SimpleFolding) {
   const NodeDef& new_c = output.node(0);
   EXPECT_EQ("ConstantFolding/c", new_c.name());
   EXPECT_EQ("Const", new_c.op());
+  EXPECT_EQ("/CPU:0", new_c.device());
 
   const NodeDef& new_a = output.node(1);
   EXPECT_EQ("a", new_a.name());
@@ -80,6 +81,7 @@ TEST_F(ConstantFoldingTest, SimpleFolding) {
   const NodeDef& new_d = output.node(4);
   EXPECT_EQ("d", new_d.name());
   EXPECT_EQ("ConstantFolding/c", new_d.input(1));
+  EXPECT_EQ("", new_d.device());
 
   std::vector<string> fetch = {"a", "b", "c", "d"};
   auto tensors_expected = EvaluateNodes(item.graph, fetch);
@@ -149,18 +151,16 @@ TEST_F(ConstantFoldingTest, ControlDependencies) {
   Output dflt = ops::Const(scope.WithOpName("dflt"), 3.14f, {1});
   Output p1 = ops::PlaceholderWithDefault(scope.WithOpName("p1"), dflt, {1});
   Output p2 = ops::PlaceholderWithDefault(scope.WithOpName("p2"), dflt, {1});
-  Output c = ops::Const(scope.WithOpName("c"), 10, {3});
+  Output c =
+      ops::Const(scope.WithOpName("c").WithControlDependencies(p1), 10, {3});
   Output i1 = ops::Identity(scope.WithOpName("i1"), {c});
-  Output i2 = ops::Identity(scope.WithOpName("i2"), {i1});
+  Output i2 =
+      ops::Identity(scope.WithOpName("i2").WithControlDependencies(p2), {i1});
   Output i3 = ops::Identity(scope.WithOpName("e"), {i2});
 
   GrapplerItem item;
   item.fetch.push_back("i3");
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
-  ASSERT_EQ("c", item.graph.node(3).name());
-  (*item.graph.mutable_node(3)->add_input()) = "^p1";
-  ASSERT_EQ("i2", item.graph.node(5).name());
-  (*item.graph.mutable_node(5)->add_input()) = "^p2";
 
   ConstantFolding fold;
   GraphDef output;
@@ -383,6 +383,97 @@ TEST_F(ConstantFoldingTest, SwitchNodes) {
   }
 }
 
+TEST_F(ConstantFoldingTest, MergeNodes) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Output x =
+      ops::RandomNormal(scope.WithOpName("x"), {3, 5}, DataType::DT_FLOAT);
+  Output y =
+      ops::RandomNormal(scope.WithOpName("y"), {3, 5}, DataType::DT_FLOAT);
+  Output const1 =
+      ops::Const(scope.WithOpName("const1").WithControlDependencies(x), 2.7f,
+                 TensorShape({3, 5}));
+  Output const2 =
+      ops::Const(scope.WithOpName("const2"), 3.14f, TensorShape({3, 5}));
+  Output const3 =
+      ops::Const(scope.WithOpName("const3").WithControlDependencies(x), 3.14f,
+                 TensorShape({3, 5}));
+
+  // Create 3 merge nodes: m1 is foldable, m2 and m3 aren't.
+  ops::Merge m1(scope.WithOpName("m1"), {x, const1, const2});
+  ops::Merge m2(scope.WithOpName("m2"), {const1, const3});
+  ops::Merge m3(scope.WithOpName("m3"), {x, y});
+
+  ops::Identity out1(scope.WithOpName("out1"), m1.output);
+  ops::Identity idx1(scope.WithOpName("idx1"), m1.value_index);
+  ops::Identity out2(scope.WithOpName("out2"), m2.output);
+  ops::Identity idx2(scope.WithOpName("idx2"), m2.value_index);
+  ops::Identity out3(scope.WithOpName("out3"), m3.output);
+  ops::Identity idx3(scope.WithOpName("idx3"), m3.value_index);
+
+  GrapplerItem item;
+  item.fetch.push_back("out1, idx1, out2, idx2, out3, idx3");
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding fold;
+  GraphDef output;
+  Status status = fold.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  int found_nodes = 0;
+  for (const auto& node : output.node()) {
+    if (node.name() == "out1") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("ConstantFolding/m1", node.input(0));
+      ++found_nodes;
+    } else if (node.name() == "idx1") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("ConstantFolding/m1_index", node.input(0));
+      ++found_nodes;
+    } else if (node.name() == "ConstantFolding/m1") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^m1", node.input(0));
+      ++found_nodes;
+    } else if (node.name() == "ConstantFolding/m1_index") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^m1", node.input(0));
+      ++found_nodes;
+    } else if (node.name() == "out2") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("m2", node.input(0));
+      ++found_nodes;
+    } else if (node.name() == "idx2") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("m2:1", node.input(0));
+      ++found_nodes;
+    } else if (node.name() == "out3") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("m3", node.input(0));
+      ++found_nodes;
+    } else if (node.name() == "idx3") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("m3:1", node.input(0));
+      ++found_nodes;
+    }
+  }
+  // Make sure the graph contains all the nodes we're expecting.
+  EXPECT_EQ(8, found_nodes);
+
+  std::vector<string> fetch = {"out1", "idx1"};
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(2, tensors.size());
+  const Tensor& out_value = tensors[0];
+  EXPECT_EQ(3 * 5, out_value.NumElements());
+  for (int i = 0; i < 3 * 5; ++i) {
+    EXPECT_EQ(3.14f, out_value.flat<float>()(i));
+  }
+  const Tensor& out_idx = tensors[1];
+  EXPECT_EQ(1, out_idx.NumElements());
+  EXPECT_EQ(2, out_idx.flat<int32>()(0));
+}
+
 TEST_F(ConstantFoldingTest, NoOpReduction) {
   // Build a simple graph with a reduction that can be reduced to the identity.
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
@@ -424,6 +515,98 @@ TEST_F(ConstantFoldingTest, NoOpReduction) {
   EXPECT_TRUE(found);
 }
 
+TEST_F(ConstantFoldingTest, NoOpReshape) {
+  // Build a simple graph with a reshape that can be reduced to the identity.
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  // A reshape than can be optimized
+  Output d1 = ops::Const(scope.WithOpName("d1"), 3.14f, {17});
+  Output v1 = ops::PlaceholderWithDefault(scope.WithOpName("v1"), d1, {17});
+  Output c1 =
+      ops::Const(scope.WithOpName("c1").WithControlDependencies(v1), 17, {1});
+  Output i1 = ops::Identity(scope.WithOpName("i1"), c1);
+  Output r1 =
+      ops::Reshape(scope.WithOpName("r1").WithControlDependencies(d1), v1, i1);
+  Output s1 = ops::Square(scope.WithOpName("s1"), r1);
+
+  // A multi dimensional reshape than can be optimized
+  Output d3 = ops::Const(scope.WithOpName("d3"), 3.14f, {5, 5, 5});
+  Output v3 =
+      ops::PlaceholderWithDefault(scope.WithOpName("v3"), d3, {5, 5, 5});
+  Output c3 =
+      ops::Const(scope.WithOpName("c3").WithControlDependencies(v3), 5, {3});
+  Output i3 = ops::Identity(scope.WithOpName("i3"), c3);
+  Output r3 = ops::Reshape(scope.WithOpName("r3"), v3, i3);
+  Output s3 = ops::Square(scope.WithOpName("s3"), r3);
+
+  // A multi dimensional partially defined reshape than can be optimized
+  Output d4 = ops::Const(scope.WithOpName("d4"), 3.14f, {5, 5, 5});
+  Output v4 =
+      ops::PlaceholderWithDefault(scope.WithOpName("v4"), d4, {5, 5, 5});
+  Output c4 = ops::Const(scope.WithOpName("c4").WithControlDependencies(v4),
+                         {5, -1, 5}, {3});
+  Output i4 = ops::Identity(scope.WithOpName("i4"), c4);
+  Output r4 = ops::Reshape(scope.WithOpName("r4"), v4, i4);
+  Output s4 = ops::Square(scope.WithOpName("s4"), r4);
+
+  // A reshape that can't be optimized
+  Output d2 = ops::Const(scope.WithOpName("d2"), 2.7f, {17, 1});
+  Output v2 = ops::PlaceholderWithDefault(scope.WithOpName("v2"), d2, {17, 1});
+  Output c2 =
+      ops::Const(scope.WithOpName("c2").WithControlDependencies(v2), 17, {1});
+  Output r2 = ops::Reshape(scope.WithOpName("r2"), v2, c2);
+  Output s2 = ops::Square(scope.WithOpName("s2"), r2);
+
+  GrapplerItem item;
+  item.fetch = {"s1", "s2", "s3", "s4"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding fold;
+  GraphDef output;
+  Status status = fold.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  auto expected = EvaluateNodes(item.graph, item.fetch);
+  auto optimized = EvaluateNodes(output, item.fetch);
+  ASSERT_EQ(4, expected.size());
+  ASSERT_EQ(4, optimized.size());
+  test::ExpectTensorEqual<float>(expected[0], optimized[0]);
+  test::ExpectTensorEqual<float>(expected[1], optimized[1]);
+  test::ExpectTensorEqual<float>(expected[2], optimized[2]);
+  test::ExpectTensorEqual<float>(expected[3], optimized[3]);
+
+  int found = 0;
+  for (const auto& node : output.node()) {
+    if (node.name() == "r1") {
+      ++found;
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(3, node.input_size());
+      EXPECT_EQ("v1", node.input(0));
+      EXPECT_EQ("^d1", node.input(1));
+      EXPECT_EQ("^v1", node.input(2));
+    } else if (node.name() == "r3") {
+      ++found;
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(2, node.input_size());
+      EXPECT_EQ("v3", node.input(0));
+      EXPECT_EQ("^v3", node.input(1));
+    } else if (node.name() == "r4") {
+      ++found;
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(2, node.input_size());
+      EXPECT_EQ("v4", node.input(0));
+      EXPECT_EQ("^v4", node.input(1));
+    } else if (node.name() == "r2") {
+      ++found;
+      EXPECT_EQ("Reshape", node.op());
+      ASSERT_EQ(2, node.input_size());
+      EXPECT_EQ("v2", node.input(0));
+      EXPECT_EQ("c2", node.input(1));
+    }
+  }
+  EXPECT_EQ(4, found);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/graph_rewriter.cc b/tensorflow/core/grappler/optimizers/graph_rewriter.cc
index d1ab5a1d9b45e7051e97f777634c852a715e5005..5273f11ca038b16bf5a2a80552a132398bf8c1f8 100644
--- a/tensorflow/core/grappler/optimizers/graph_rewriter.cc
+++ b/tensorflow/core/grappler/optimizers/graph_rewriter.cc
@@ -16,7 +16,9 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/graph_rewriter.h"
 #include <unordered_map>
 #include <unordered_set>
+#include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
 
@@ -28,17 +30,13 @@ GraphRewriter::GraphRewriter(const GrapplerItem& item) {
     nodes_[node.name()] = &node;
   }
 
+  std::unordered_set<string> function_names;
+  for (const auto& function : item.graph.library().function()) {
+    function_names.insert(function.signature().name());
+  }
+
   for (auto& node : item.graph.node()) {
-    for (const auto& input : node.input()) {
-      int position = 0;
-      string input_node_name = ParseNodeName(input, &position);
-      if (position < 0) {
-        // This is a control edge
-        auto itr = nodes_.find(input_node_name);
-        CHECK(itr != nodes_.end());
-        control_dependency_drivers_.insert(itr->second);
-      }
-    }
+    RecordConnectivity(node, function_names);
   }
 }
 
@@ -46,16 +44,9 @@ void GraphRewriter::ForwardInputs(
     const NodeDef& original_node,
     const std::unordered_set<const NodeDef*>& nodes_to_delete,
     NodeDef* new_node) {
-  for (const auto& input : original_node.input()) {
-    string input_node_name = NodeName(input);
-    auto itr = nodes_.find(input_node_name);
-    CHECK(itr != nodes_.end());
-    const NodeDef* input_node = itr->second;
-    if (nodes_to_delete.find(input_node) != nodes_to_delete.end()) {
-      ForwardInputs(*input_node, nodes_to_delete, new_node);
-    } else {
-      *new_node->add_input() = input;
-    }
+  ForwardInputsInternal(original_node, nodes_to_delete, new_node);
+  if (!new_node->name().empty()) {
+    optimized_nodes_[new_node->name()] = new_node;
   }
 }
 
@@ -74,5 +65,74 @@ bool GraphRewriter::IsDrivenByControlDependency(const NodeDef& node) const {
   return false;
 }
 
+bool GraphRewriter::IsConnectedToFunction(const NodeDef& node) const {
+  return function_neighbors_.find(&node) != function_neighbors_.end();
+}
+
+bool GraphRewriter::IsDrivenByAnotherDevice(const NodeDef& node) const {
+  return cross_device_receivers_.find(&node) != cross_device_receivers_.end();
+}
+
+void GraphRewriter::RecordConnectivity(
+    const NodeDef& node, const std::unordered_set<string>& function_names) {
+  const bool is_function =
+      function_names.find(node.op()) != function_names.end();
+
+  for (const auto& input : node.input()) {
+    int position = 0;
+    string input_node_name = ParseNodeName(input, &position);
+    auto itr = nodes_.find(input_node_name);
+    if (itr == nodes_.end()) {
+      continue;
+    }
+    const NodeDef* fanin = itr->second;
+    if (position < 0) {
+      // This is a control edge
+      control_dependency_drivers_.insert(fanin);
+    } else {
+      // This is a regular edge
+      if (function_names.find(fanin->op()) != function_names.end()) {
+        function_neighbors_.insert(&node);
+      }
+      if (is_function) {
+        function_neighbors_.insert(fanin);
+      }
+    }
+    if (fanin->device() != node.device()) {
+      cross_device_receivers_.insert(&node);
+    }
+  }
+}
+
+void GraphRewriter::ForwardInputsInternal(
+    const NodeDef& node,
+    const std::unordered_set<const NodeDef*>& nodes_to_delete,
+    NodeDef* new_node) {
+  // To speed things up, use the optimized version of the node if
+  // available.
+  auto itr = optimized_nodes_.find(node.name());
+  if (itr != optimized_nodes_.end()) {
+    for (const string& input : itr->second->input()) {
+      *new_node->add_input() = input;
+    }
+    return;
+  }
+  for (const auto& input : node.input()) {
+    string input_node_name = NodeName(input);
+    auto itr = nodes_.find(input_node_name);
+    if (itr == nodes_.end()) {
+      // Invalid input, preserve it as is.
+      *new_node->add_input() = input;
+      continue;
+    }
+    const NodeDef* input_node = itr->second;
+    if (nodes_to_delete.find(input_node) != nodes_to_delete.end()) {
+      ForwardInputsInternal(*input_node, nodes_to_delete, new_node);
+    } else {
+      *new_node->add_input() = input;
+    }
+  }
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/graph_rewriter.h b/tensorflow/core/grappler/optimizers/graph_rewriter.h
index adbe5a24c863876c66cefc13a009324f607bf492..4bdb063d586d50368777751d495b778a1d37acb5 100644
--- a/tensorflow/core/grappler/optimizers/graph_rewriter.h
+++ b/tensorflow/core/grappler/optimizers/graph_rewriter.h
@@ -47,9 +47,27 @@ class GraphRewriter {
   // edge.
   bool IsDrivenByControlDependency(const NodeDef& node) const;
 
+  // Returns true if at least one of the nodes in the direct fanin or the direct
+  // fanout (excluding control dependencies) of 'node' is a function.
+  bool IsConnectedToFunction(const NodeDef& node) const;
+
+  // Returns true if the node is driven by at least one node placed on another
+  // device.
+  bool IsDrivenByAnotherDevice(const NodeDef& node) const;
+
  private:
+  void RecordConnectivity(const NodeDef& node,
+                          const std::unordered_set<string>& function_names);
+  void ForwardInputsInternal(
+      const NodeDef& original_node,
+      const std::unordered_set<const NodeDef*>& nodes_to_delete,
+      NodeDef* new_node);
+
   std::unordered_map<string, const NodeDef*> nodes_;
+  std::unordered_map<string, const NodeDef*> optimized_nodes_;
   std::unordered_set<const NodeDef*> control_dependency_drivers_;
+  std::unordered_set<const NodeDef*> function_neighbors_;
+  std::unordered_set<const NodeDef*> cross_device_receivers_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 0a802fa0f5a9a39b59e8ada189dd8f07eadcfef7..5a4e1b76e219872669da6743ba42a5b850a1d7aa 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/devices.h"
@@ -33,6 +35,7 @@ namespace grappler {
 const char kConcatConst[] = "LayoutOptimizerConcatConst";
 const char kPermNHWCToNCHW[] = "LayoutOptimizerPermConstNHWCToNCHW";
 const char kPermNCHWToNHWC[] = "LayoutOptimizerPermConstNCHWToNHWC";
+const char kGatherAxisConst[] = "LayoutOptimizerGatherAxisConst";
 const char kTransposeNHWCToNCHW[] = "LayoutOptimizerTransposeNHWCToNCHW";
 const char kTransposeNCHWToNHWC[] = "LayoutOptimizerTransposeNCHWToNHWC";
 const char kPermVecNHWCToNCHW[] = "LayoutOptimizerPermVecNHWCToNCHW";
@@ -225,35 +228,40 @@ class NodeProcessor {
     return input_pos;
   }
 
-  void AddNodeTranspose(const string& node_name, const string& input_name,
-                        DataType data_type, const TensorShapeProto& input_shape,
-                        bool NHWCToNCHW) {
+  NodeDef* AddNodeTranspose(const string& node_name, const string& input_name,
+                            DataType data_type,
+                            const TensorShapeProto& input_shape,
+                            bool NHWCToNCHW) {
     NodeDef* node = graph_->add_node();
     node_map_->AddNode(node_name, node);
     node->set_name(node_name);
     *node->add_input() = input_name;
     *node->add_input() = NHWCToNCHW ? kPermNHWCToNCHW : kPermNCHWToNHWC;
     node->set_op("Transpose");
+    node->set_device(node_->device());
     AttrValue attr_data_type;
     attr_data_type.set_type(data_type);
     node->mutable_attr()->insert({"T", attr_data_type});
     AttrValue attr_data_type_perm;
     attr_data_type_perm.set_type(DT_INT32);
     node->mutable_attr()->insert({"Tperm", attr_data_type_perm});
-    AttrValue attr_output_shape;
-    auto output_shape = attr_output_shape.mutable_list()->add_shape();
-    if (NHWCToNCHW) {
-      output_shape->add_dim()->set_size(input_shape.dim(0).size());
-      output_shape->add_dim()->set_size(input_shape.dim(3).size());
-      output_shape->add_dim()->set_size(input_shape.dim(1).size());
-      output_shape->add_dim()->set_size(input_shape.dim(2).size());
-    } else {
-      output_shape->add_dim()->set_size(input_shape.dim(0).size());
-      output_shape->add_dim()->set_size(input_shape.dim(2).size());
-      output_shape->add_dim()->set_size(input_shape.dim(3).size());
-      output_shape->add_dim()->set_size(input_shape.dim(1).size());
+    if (!input_shape.unknown_rank()) {
+      AttrValue attr_output_shape;
+      auto output_shape = attr_output_shape.mutable_list()->add_shape();
+      if (NHWCToNCHW) {
+        output_shape->add_dim()->set_size(input_shape.dim(0).size());
+        output_shape->add_dim()->set_size(input_shape.dim(3).size());
+        output_shape->add_dim()->set_size(input_shape.dim(1).size());
+        output_shape->add_dim()->set_size(input_shape.dim(2).size());
+      } else {
+        output_shape->add_dim()->set_size(input_shape.dim(0).size());
+        output_shape->add_dim()->set_size(input_shape.dim(2).size());
+        output_shape->add_dim()->set_size(input_shape.dim(3).size());
+        output_shape->add_dim()->set_size(input_shape.dim(1).size());
+      }
+      node->mutable_attr()->insert({"_output_shapes", attr_output_shape});
     }
-    node->mutable_attr()->insert({"_output_shapes", attr_output_shape});
+    return node;
   }
 
   virtual Status AddLayoutTransposeToInputs() {
@@ -590,11 +598,12 @@ class BinaryOpProcessor : public AgnosticNodeProcessor {
 
   bool Is4DOperateWithVector() const { return Is4DOperateWithND(1); }
 
-  void AddNodeShapeConst(const string& name, int num_channels) {
+  NodeDef* AddNodeShapeConst(const string& name, int num_channels) {
     NodeDef* node = graph_->add_node();
     node_map_->AddNode(name, node);
     node->set_name(name);
     node->set_op("Const");
+    node->set_device(node_->device());
     AttrValue attr_data_type;
     attr_data_type.set_type(DT_INT32);
     node->mutable_attr()->insert({"dtype", attr_data_type});
@@ -607,16 +616,19 @@ class BinaryOpProcessor : public AgnosticNodeProcessor {
     }
     tensor.AsProtoTensorContent(attr_tensor.mutable_tensor());
     node->mutable_attr()->insert({"value", attr_tensor});
+    return node;
   }
 
-  void AddNodeReshape(const string& node_name, const string& input_name,
-                      const string& shape_const_node_name, DataType data_type) {
+  NodeDef* AddNodeReshape(const string& node_name, const string& input_name,
+                          const string& shape_const_node_name,
+                          DataType data_type) {
     NodeDef* node = graph_->add_node();
     node_map_->AddNode(node_name, node);
     node->set_name(node_name);
     *node->add_input() = input_name;
     *node->add_input() = shape_const_node_name;
     node->set_op("Reshape");
+    node->set_device(node_->device());
 
     AttrValue attr_type_indices;
     attr_type_indices.set_type(DT_INT32);
@@ -625,6 +637,7 @@ class BinaryOpProcessor : public AgnosticNodeProcessor {
     AttrValue attr_type_params;
     attr_type_params.set_type(data_type);
     node->mutable_attr()->insert({"T", attr_type_params});
+    return node;
   }
 
   Status CustomizedProcessing() override {
@@ -741,12 +754,17 @@ class SliceProcessor : public AgnosticNodeProcessor {
     node->set_name(node_name);
     *node->add_input() = input_name;
     *node->add_input() = NHWCToNCHW ? kPermNHWCToNCHW : kPermNCHWToNHWC;
-    node->set_op("Gather");
+    *node->add_input() = kGatherAxisConst;
+    node->set_op("GatherV2");
 
     AttrValue attr_type_indices;
     attr_type_indices.set_type(DT_INT32);
     node->mutable_attr()->insert({"Tindices", attr_type_indices});
 
+    AttrValue attr_type_axis;
+    attr_type_axis.set_type(DT_INT32);
+    node->mutable_attr()->insert({"Taxis", attr_type_axis});
+
     AttrValue attr_type_params;
     attr_type_params.set_type(data_type);
     node->mutable_attr()->insert({"Tparams", attr_type_params});
@@ -927,14 +945,18 @@ struct TuningConfig {
   // Conv2DBackpropFilter will use a specialized GEMM implementation, which is
   // usually faster than the NCHW implementation. The downside is that this
   // might result in more non-cancellable layout conversion nodes (implemented
-  // by the Tranpose op).
+  // by the Transpose op).
   bool no_gemm;
 };
 
 class DataLayoutOptimizer {
  public:
-  explicit DataLayoutOptimizer(GraphDef* graph, TuningConfig config)
-      : graph_(graph), node_map_(graph_), config_(config) {}
+  explicit DataLayoutOptimizer(const string& default_device, GraphDef* graph,
+                               TuningConfig config)
+      : default_device_(default_device),
+        graph_(graph),
+        node_map_(graph_),
+        config_(config) {}
 
   Status Optimize() {
     LOG(INFO) << "Number of nodes for original graph: " << graph_->node_size();
@@ -946,12 +968,13 @@ class DataLayoutOptimizer {
   }
 
  private:
-  void AddNodePermConst(const string& name,
-                        const std::vector<int>& permutation) {
+  NodeDef* AddNodePermConst(const string& name,
+                            const std::vector<int>& permutation) {
     NodeDef* node = graph_->add_node();
     node_map_.AddNode(name, node);
     node->set_name(name);
     node->set_op("Const");
+    node->set_device(default_device_);
     AttrValue attr_data_type;
     attr_data_type.set_type(DT_INT32);
     node->mutable_attr()->insert({"dtype", attr_data_type});
@@ -962,28 +985,40 @@ class DataLayoutOptimizer {
     }
     tensor.AsProtoTensorContent(attr_tensor.mutable_tensor());
     node->mutable_attr()->insert({"value", attr_tensor});
+    return node;
   }
 
-  void AddNodeConcatConst() {
+  NodeDef* AddConstScalar(const char* name, DataType dtype, int value) {
     NodeDef* node = graph_->add_node();
-    node_map_.AddNode(kConcatConst, node);
-    node->set_name(kConcatConst);
+    node_map_.AddNode(name, node);
+    node->set_name(name);
     node->set_op("Const");
+    node->set_device(default_device_);
     AttrValue attr_data_type;
-    attr_data_type.set_type(DT_INT32);
+    attr_data_type.set_type(dtype);
     node->mutable_attr()->insert({"dtype", attr_data_type});
     AttrValue attr_tensor;
-    Tensor tensor(DT_INT32, TensorShape({}));
-    tensor.scalar<int>()() = 1;
+    Tensor tensor(dtype, TensorShape({}));
+    tensor.scalar<int>()() = value;
     tensor.AsProtoTensorContent(attr_tensor.mutable_tensor());
     node->mutable_attr()->insert({"value", attr_tensor});
+    return node;
   }
 
-  void AddNodeReductionConst() {
+  NodeDef* AddNodeConcatConst() {
+    return AddConstScalar(kConcatConst, DT_INT32, 1);
+  }
+
+  NodeDef* AddGatherAxisConst() {
+    return AddConstScalar(kGatherAxisConst, DT_INT32, 0);
+  }
+
+  NodeDef* AddNodeReductionConst() {
     NodeDef* node = graph_->add_node();
     node_map_.AddNode(kReductionConst, node);
     node->set_name(kReductionConst);
     node->set_op("Const");
+    node->set_device(default_device_);
     AttrValue attr_data_type;
     attr_data_type.set_type(DT_INT32);
     node->mutable_attr()->insert({"dtype", attr_data_type});
@@ -996,6 +1031,7 @@ class DataLayoutOptimizer {
     }
     tensor.AsProtoTensorContent(attr_tensor.mutable_tensor());
     node->mutable_attr()->insert({"value", attr_tensor});
+    return node;
   }
 
   // Expand all nodes which is in NHWC, but supports NCHW or is layout agnostic.
@@ -1040,10 +1076,11 @@ class DataLayoutOptimizer {
     // only needs to be performed if at least one node in the previous pass is
     // expanded.
     if (graph_->node_size() > node_size_original) {
-      AddNodePermConst(kPermNHWCToNCHW, {0, 3, 1, 2});
-      AddNodePermConst(kPermNCHWToNHWC, {0, 2, 3, 1});
-      AddNodeConcatConst();
-      AddNodeReductionConst();
+      NodeDef* n = AddNodePermConst(kPermNHWCToNCHW, {0, 3, 1, 2});
+      n = AddNodePermConst(kPermNCHWToNHWC, {0, 2, 3, 1});
+      n = AddNodeConcatConst();
+      n = AddGatherAxisConst();
+      n = AddNodeReductionConst();
       std::set<string> ops_format_agnostic = GetOpsFormatAgnostic();
       for (int i = 0; i < graph_->node_size(); i++) {
         if (ops_format_agnostic.find(graph_->node(i).op()) !=
@@ -1132,6 +1169,7 @@ class DataLayoutOptimizer {
     return Status::OK();
   }
 
+  string default_device_;
   GraphDef* graph_;
   NodeMap node_map_;
   TuningConfig config_;
@@ -1158,7 +1196,7 @@ Status LayoutOptimizer::InferOutputShapes(GrapplerItem* item) {
     for (const auto& tensor_property : tensor_properties) {
       *attr_output_shape.mutable_list()->add_shape() = tensor_property.shape();
     }
-    node->mutable_attr()->insert({"_output_shapes", attr_output_shape});
+    (*node->mutable_attr())["_output_shapes"] = attr_output_shape;
   }
   return Status::OK();
 }
@@ -1184,21 +1222,30 @@ Status LayoutOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   *output = new_item.graph;
   TuningConfig config;
   config.no_gemm = false;
-  DataLayoutOptimizer layout_optimizer(output, config);
-  status = layout_optimizer.Optimize();
+  string default_device = "/job:localhost/replica:0/task:0/cpu:0";
+  if (cluster) {
+    if (!cluster->GetDevices().empty()) {
+      default_device = cluster->GetDevices().begin()->first;
+    }
+  }
+  std::unique_ptr<DataLayoutOptimizer> layout_optimizer(
+      new DataLayoutOptimizer(default_device, output, config));
+  status = layout_optimizer->Optimize();
   // This is based on an empirical observation that if the introduced Transpose
   // nodes is more than 30, not using GEMM implementation would result in better
   // performance.
   if (status.ok() && GetNumTranspose(*output) > 30) {
     *output = new_item.graph;
     config.no_gemm = true;
-    DataLayoutOptimizer layout_optimizer(output, config);
-    status = layout_optimizer.Optimize();
+    layout_optimizer.reset(
+        new DataLayoutOptimizer(default_device, output, config));
+    status = layout_optimizer->Optimize();
   }
 
   if (!status.ok()) {
     *output = item.graph;
   }
+
   return status;
 }
 
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index 1ed7cab4abfdc5281f3906780527eb06e6f93f03..dc1567c60a712df903aebf099d20b4935f7d8367 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -15,21 +15,343 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
 
+#include <algorithm>
+#include <queue>
+#include <unordered_map>
 #include <unordered_set>
 #include <vector>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/graph_rewriter.h"
 #include "tensorflow/core/grappler/optimizers/static_schedule.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
 namespace grappler {
 
+// Prefix added to nodes which are recomputed.
 const char* kRecomputedNodePrefix = "Recomputed";
+const char* kRecomputeTriggerNodePrefix = "RecomputeTrigger";
+// Attribute which may be added to nodes to manually allow them to be
+// recomputed.
+const char* kRecomputeHint = "_recompute_hint";
+const char* kRecomputationTargetNamePrefix = "gradients/";
+
+// Ops which we wouldn't mind recomputing to save memory.
+// TODO(allenl): Replace this list with a cost model.
+std::unordered_set<string> GetCheapToRecomputeOps() {
+  std::unordered_set<string> cheap_ops = {
+      "Add",  "AddN",     "BiasAdd",           "Cast",
+      "Fill", "FloorDiv", "FloorMod",          "FusedBatchNorm",
+      "Mul",  "Neg",      "RealDiv",           "Reciprocal",
+      "Relu", "Relu6",    "Reshape",           "Rsqrt",
+      "Sqrt", "Square",   "SquaredDifference", "Sub",
+      "Tile", "Transpose"};
+  return cheap_ops;
+}
+
+// Nodes whose inputs we may want to recompute (i.e. gradients).
+// TODO(allenl): Rather than blindly recomputing gradient inputs, use a static
+// schedule (grappler::EstimateEarliestExecutionTimes) to recompute only nodes
+// whose outputs will sit around for a while.
+bool IsTargetOp(const NodeDef& node) {
+  return node.name().find(kRecomputationTargetNamePrefix) == 0;
+}
+
+// Find recomputable ops which feed into target nodes.
+std::unordered_set<const NodeDef*> FindCandidateRecomputeNodes(
+    const NodeMap& node_map, const GraphDef* graph,
+    const std::function<bool(const NodeDef&)>& is_candidate) {
+  std::unordered_set<const NodeDef*> candidate_recompute_nodes;
+  for (const auto& node : graph->node()) {
+    if (!is_candidate(node)) {
+      continue;
+    }
+    bool has_target_output = false;
+    for (const NodeDef* output : node_map.GetOutputs(node.name())) {
+      // It only makes sense to recompute this if it feeds into a target
+      // node. We expand this to dependencies in GetOpGroupsToRecompute.
+      if (IsTargetOp(*output)) {
+        has_target_output = true;
+        break;
+      }
+    }
+    if (!has_target_output) {
+      continue;
+    }
+    bool has_target_input = false;
+    for (const string& input_name : node.input()) {
+      // Don't recompute nodes which depend on target nodes.
+      const NodeDef* input_node = node_map.GetNode(input_name);
+      if (IsTargetOp(*input_node)) {
+        has_target_input = true;
+        break;
+      }
+    }
+    if (has_target_input) {
+      continue;
+    }
+    candidate_recompute_nodes.insert(&node);
+  }
+  return candidate_recompute_nodes;
+}
+
+void connected_subgraph(const NodeMap& node_map, bool collect_inputs,
+                        bool collect_outputs,
+                        const std::function<bool(const NodeDef&)>& is_candidate,
+                        std::unordered_set<const NodeDef*>* expanded_nodes) {
+  std::queue<const NodeDef*> to_visit;
+  for (const NodeDef* starting_node : *expanded_nodes) {
+    to_visit.push(starting_node);
+  }
+  expanded_nodes->clear();
+  while (!to_visit.empty()) {
+    const NodeDef* current_node = to_visit.front();
+    to_visit.pop();
+    if (!expanded_nodes->insert(current_node).second) {
+      // We already visited this node
+      continue;
+    }
+    if (collect_inputs) {
+      // Add inputs and outputs to this subgraph if they are candidates
+      for (const string& input_name_raw : current_node->input()) {
+        const NodeDef* input_node = node_map.GetNode(input_name_raw);
+        if (expanded_nodes->count(input_node) == 0 &&
+            is_candidate(*input_node)) {
+          to_visit.push(input_node);
+        }
+      }
+    }
+    if (collect_outputs) {
+      for (const NodeDef* output : node_map.GetOutputs(current_node->name())) {
+        if (expanded_nodes->count(output) == 0 && is_candidate(*output)) {
+          to_visit.push(output);
+        }
+      }
+    }
+  }
+}
+
+struct RecomputedSubGraph {
+  std::unordered_set<const NodeDef*> recomputed_source_nodes;
+  std::unordered_set<NodeDef*> target_nodes;
+};
+
+// Find groups of ops to recompute together based on `should_recompute`.
+std::vector<RecomputedSubGraph> GetOpGroupsToRecompute(
+    const GraphDef* graph, const NodeMap& node_map,
+    const std::function<bool(const NodeDef&)>& should_recompute) {
+  std::unordered_set<const NodeDef*> visited_nodes;
+  std::vector<RecomputedSubGraph> subgraphs_to_recompute;
+  std::unordered_set<const NodeDef*> candidate_recompute_nodes =
+      FindCandidateRecomputeNodes(node_map, graph, should_recompute);
+  for (const NodeDef* recompute_node : candidate_recompute_nodes) {
+    if (visited_nodes.count(recompute_node) > 0) {
+      continue;
+    }
+    RecomputedSubGraph current_recomputation;
+    // Build out recomputation groups by expanding to inexpensive-to-recompute
+    // nodes which do not feed target nodes. The goal is to capture some
+    // intermediate activations within this graph.
+    std::unordered_set<const NodeDef*> unpruned_recompute_nodes;
+    unpruned_recompute_nodes.insert(recompute_node);
+    connected_subgraph(node_map,
+                       true,  // Collect inputs
+                       true,  // Collect outputs
+                       should_recompute, &unpruned_recompute_nodes);
+    visited_nodes.insert(unpruned_recompute_nodes.begin(),
+                         unpruned_recompute_nodes.end());
+    for (const NodeDef* recompute_node : unpruned_recompute_nodes) {
+      bool inserted_feed = false;
+      for (NodeDef* output : node_map.GetOutputs(recompute_node->name())) {
+        if (IsTargetOp(*output)) {
+          current_recomputation.target_nodes.insert(output);
+          if (!inserted_feed) {
+            // Keep track of nodes which feed directly into a target node. These
+            // and nodes which feed into them will define the recomputed
+            // subgraph.
+            current_recomputation.recomputed_source_nodes.insert(
+                recompute_node);
+            inserted_feed = true;
+          }
+        }
+      }
+    }
+    // Recompute only nodes which eventually feed into a target node.
+    connected_subgraph(node_map,
+                       true,   // Collect inputs
+                       false,  // Collect outputs
+                       [&unpruned_recompute_nodes](const NodeDef& node) {
+                         return unpruned_recompute_nodes.count(&node) != 0;
+                       },
+                       &current_recomputation.recomputed_source_nodes);
+    if (current_recomputation.target_nodes.empty()) {
+      continue;
+    }
+    subgraphs_to_recompute.push_back(current_recomputation);
+  }
+  return subgraphs_to_recompute;
+}
+
+// Computes the maximum topological numbers of (1) target node components
+// (gradient nodes being fed by the recomputation), and (2) child recompute node
+// components for each recomputed node. We will not attach any control
+// dependencies to a recomputation unless they have component numbers greater
+// than this value (to prevent cycles).
+std::unordered_map<const NodeDef*, int> GetMaxDownstreamComponents(
+    const std::unordered_set<const NodeDef*>& recomputed_source_nodes,
+    const std::unordered_set<NodeDef*>& target_nodes, const NodeMap& node_map,
+    const std::unordered_map<const NodeDef*, int>& components) {
+  std::unordered_map<const NodeDef*, int> recomputed_node_components;
+  // Start by setting component numbers to the maximum among target nodes.
+  for (const NodeDef* original_recompute_node : recomputed_source_nodes) {
+    int max_target_component = -1;
+    for (NodeDef* output :
+         node_map.GetOutputs(original_recompute_node->name())) {
+      if (target_nodes.count(output) != 0) {
+        int current_target_component = components.find(output)->second;
+        if (current_target_component > max_target_component) {
+          max_target_component = current_target_component;
+        }
+      }
+    }
+    if (max_target_component > -1) {
+      recomputed_node_components[original_recompute_node] =
+          max_target_component;
+    }
+  }
+  // Sort recomputed nodes topologically (based on the original graph) so we can
+  // efficiently assign to each node the maximum of its recomputed child
+  // components and its own targets.
+  std::vector<const NodeDef*> recomputed_source_nodes_topological(
+      recomputed_source_nodes.begin(), recomputed_source_nodes.end());
+  std::sort(recomputed_source_nodes_topological.begin(),
+            recomputed_source_nodes_topological.end(),
+            [&components](const NodeDef* first, const NodeDef* second) {
+              return components.find(first)->second <
+                     components.find(second)->second;
+            });
+  for (const NodeDef* original_recompute_node :
+       recomputed_source_nodes_topological) {
+    int max_component;
+    auto recomputed_component_iterator =
+        recomputed_node_components.find(original_recompute_node);
+    if (recomputed_component_iterator != recomputed_node_components.end()) {
+      max_component = recomputed_component_iterator->second;
+    } else {
+      max_component = -1;
+    }
+    for (NodeDef* output :
+         node_map.GetOutputs(original_recompute_node->name())) {
+      if (recomputed_source_nodes.count(output) == 0) {
+        continue;
+      }
+      auto child_component_iterator = recomputed_node_components.find(output);
+      CHECK(child_component_iterator != recomputed_node_components.end());
+      int child_component = child_component_iterator->second;
+      if (child_component > max_component) {
+        max_component = child_component;
+      }
+    }
+    CHECK_GE(max_component, 0);
+    recomputed_node_components[original_recompute_node] = max_component;
+  }
+  return recomputed_node_components;
+}
+
+// Modifies `graph`, adding trigger nodes and returning a mapping from
+// `recomputed_source_nodes` to trigger nodes which will not create loops in the
+// graph (using the component numberings in `components` and
+// `recomputed_node_max_feed_components`). The copied nodes (not the nodes in
+// recomputed_source_nodes, which are the originals) eventually get these
+// control dependencies.
+std::unordered_map<const NodeDef*, const NodeDef*>
+AddRecomputeControlDependencyNodes(
+    const std::unordered_set<const NodeDef*>& recomputed_source_nodes,
+    const std::unordered_set<NodeDef*>& target_nodes, const NodeMap& node_map,
+    const std::unordered_map<const NodeDef*, int>& components,
+    const std::unordered_map<const NodeDef*, int>&
+        recomputed_node_max_feed_components,
+    GraphDef* graph) {
+  // Sort recomputed nodes based on max downstream components.
+  std::vector<const NodeDef*> recomputed_source_nodes_topological(
+      recomputed_source_nodes.begin(), recomputed_source_nodes.end());
+  std::sort(recomputed_source_nodes_topological.begin(),
+            recomputed_source_nodes_topological.end(),
+            [&recomputed_node_max_feed_components](const NodeDef* first,
+                                                   const NodeDef* second) {
+              int first_component =
+                  recomputed_node_max_feed_components.find(first)->second;
+              int second_component =
+                  recomputed_node_max_feed_components.find(second)->second;
+              return first_component > second_component
+                     // Ensure a consistent ordering. This is necessary because
+                     // we're working not with node component numbers (which are
+                     // unique) but with the maximum across nodes they feed into
+                     // (very much not unique).
+                     || (first_component == second_component &&
+                         first->name() > second->name());
+            });
+  // Create merged control dependency nodes by sorting target inputs
+  // topologically and zipper merging with the sorted recomputed nodes.
+  std::vector<const NodeDef*> target_inputs_topological;
+  for (const NodeDef* target_node : target_nodes) {
+    for (const string& target_input_name_raw : target_node->input()) {
+      const NodeDef* target_input = node_map.GetNode(target_input_name_raw);
+      // If this node has already had one of its inputs recomputed during this
+      // rewriting pass, we ignore that recomputed node here (it will not be in
+      // the NodeMap).
+      if (target_input == nullptr ||
+          recomputed_source_nodes.count(target_input) != 0 ||
+          components.find(target_node)->second ==
+              components.find(target_input)->second) {
+        continue;
+      }
+      target_inputs_topological.push_back(target_input);
+    }
+  }
+  std::sort(target_inputs_topological.begin(), target_inputs_topological.end(),
+            [&components](const NodeDef* first, const NodeDef* second) {
+              return components.find(first)->second >
+                     components.find(second)->second;
+            });
+  auto target_input_iterator = target_inputs_topological.begin();
+  NodeDef* current_trigger_node = nullptr;
+  std::unordered_map<const NodeDef*, const NodeDef*> triggers;
+  for (const NodeDef* original_recomputed_node :
+       recomputed_source_nodes_topological) {
+    NodeDef* new_trigger_node = graph->add_node();
+    new_trigger_node->set_name(AddPrefixToNodeName(
+        original_recomputed_node->name(), kRecomputeTriggerNodePrefix));
+    new_trigger_node->set_op("NoOp");
+    new_trigger_node->set_device(original_recomputed_node->device());
+    if (current_trigger_node != nullptr) {
+      *new_trigger_node->add_input() =
+          strings::StrCat("^", current_trigger_node->name());
+    }
+    current_trigger_node = new_trigger_node;
+    triggers[original_recomputed_node] = current_trigger_node;
+    for (;
+         target_input_iterator != target_inputs_topological.end() &&
+         components.find(*target_input_iterator)->second >
+             recomputed_node_max_feed_components.find(original_recomputed_node)
+                 ->second;
+         ++target_input_iterator) {
+      *current_trigger_node->add_input() =
+          strings::StrCat("^", (*target_input_iterator)->name());
+      VLOG(2) << "  Recomputation trigger " << current_trigger_node->name()
+              << " depends on " << (*target_input_iterator)->name();
+    }
+  }
+  return triggers;
+}
 
 string RecomputedOrOriginalNodeName(
     const std::unordered_set<string>& recomputed_node_names,
@@ -42,14 +364,28 @@ string RecomputedOrOriginalNodeName(
   }
 }
 
+// Helper function to recompute a sub-graph (recomputed_source_nodes). Edges
+// from recomputed_source_nodes to target_nodes are changed to start from the
+// recomputed nodes.
 void RecomputeSubgraph(
-    const std::vector<const NodeDef*>& recomputed_source_nodes,
-    const string& recompute_trigger_node_name,
-    const std::vector<NodeDef*>& target_nodes, GraphDef* graph) {
+    const std::unordered_set<const NodeDef*>& recomputed_source_nodes,
+    const std::unordered_set<NodeDef*>& target_nodes, const NodeMap& node_map,
+    const std::unordered_map<const NodeDef*, int>& components,
+    GraphDef* graph) {
   std::unordered_set<string> recomputed_node_names;
-  for (const NodeDef* to_recompute : recomputed_source_nodes) {
-    recomputed_node_names.insert(to_recompute->name());
+  VLOG(1) << "Recomputing a " << recomputed_source_nodes.size()
+          << " node subgraph";
+  std::unordered_map<const NodeDef*, int> recomputed_node_components =
+      GetMaxDownstreamComponents(recomputed_source_nodes, target_nodes,
+                                 node_map, components);
+  for (const NodeDef* original_node : recomputed_source_nodes) {
+    VLOG(2) << "  " << original_node->name();
+    recomputed_node_names.insert(original_node->name());
   }
+  std::unordered_map<const NodeDef*, const NodeDef*> triggers =
+      AddRecomputeControlDependencyNodes(recomputed_source_nodes, target_nodes,
+                                         node_map, components,
+                                         recomputed_node_components, graph);
   // Create the recomputed sub-graph
   for (const NodeDef* original_node : recomputed_source_nodes) {
     NodeDef* copied_node = graph->add_node();
@@ -64,10 +400,10 @@ void RecomputeSubgraph(
       *copied_node->add_input() = RecomputedOrOriginalNodeName(
           recomputed_node_names, original_input_name);
     }
-    // Set control dependencies on the recomputed nodes so that they are not run
-    // until the specified trigger runs.
+    // Each recomputed node gets a control dependency to prevent it from being
+    // recomputed immediately.
     *copied_node->add_input() =
-        strings::StrCat("^", recompute_trigger_node_name);
+        strings::StrCat("^", triggers[original_node]->name());
   }
   // Set the inputs of nodes in the target subgraph to the recomputed nodes
   // where applicable.
@@ -79,6 +415,60 @@ void RecomputeSubgraph(
   }
 }
 
+void RecomputationRewritingPass(RewriterConfig::MemOptType optimization_level,
+                                GraphDef* graph, const GrapplerItem& item) {
+  // The topological numberings and NodeMap will be stale as soon as we start
+  // modifying the graph in RecomputeSubgraph. However, RecomputeSubgraph only
+  // looks up nodes which were in the original graph, and preserves the graph
+  // topology it's interested in.
+  // We don't use the results of this topological sort until later, but this
+  // call invalidates all NodeDef pointers, so it needs to be done before we
+  // start collecting those.
+  TopologicalSort(graph);
+  NodeMap node_map(graph);
+  std::vector<RecomputedSubGraph> recomputed_subgraphs;
+  // Do not recompute nodes which are fed, since the recomputed node would not
+  // take on the fed value (i.e. gradients would be incorrect).
+  std::unordered_set<string> feeds;
+  for (const auto& feed : item.feed) {
+    feeds.insert(NodeName(feed.first));
+  }
+  if (optimization_level == RewriterConfig::HEURISTICS) {
+    // TODO(allenl): Handle ResNet-like architectures better. Right now all of
+    // the cheap forward ops get grouped into a single subgraph which must
+    // execute before gradients start executing (unless layers are manually
+    // separated by identity ops).
+    std::unordered_set<string> cheap_to_recompute_ops =
+        GetCheapToRecomputeOps();
+    recomputed_subgraphs = GetOpGroupsToRecompute(
+        graph, node_map,
+        [&cheap_to_recompute_ops, &feeds](const NodeDef& node) {
+          return !IsTargetOp(node) && feeds.count(node.name()) == 0 &&
+                 (cheap_to_recompute_ops.count(node.op()) > 0 ||
+                  node.attr().count(kRecomputeHint) > 0);
+        });
+  } else if (optimization_level == RewriterConfig::MANUAL) {
+    recomputed_subgraphs =
+        GetOpGroupsToRecompute(graph, node_map, [&feeds](const NodeDef& node) {
+          return !IsTargetOp(node) && feeds.count(node.name()) == 0 &&
+                 node.attr().count(kRecomputeHint) > 0;
+        });
+  }
+  if (!recomputed_subgraphs.empty()) {
+    std::unordered_map<const NodeDef*, int> topological_numbering;
+    for (int node_number = 0; node_number < graph->node().size();
+         ++node_number) {
+      topological_numbering[graph->mutable_node(node_number)] =
+          graph->node().size() - node_number - 1;
+    }
+    // Duplicate the indicated sub-graphs and set up control dependencies
+    for (const RecomputedSubGraph& subgraph : recomputed_subgraphs) {
+      RecomputeSubgraph(subgraph.recomputed_source_nodes, subgraph.target_nodes,
+                        node_map, topological_numbering, graph);
+    }
+  }
+}
+
 std::pair<NodeDef*, NodeDef*> BuildSwapPair(NodeDef* node, int input_to_swap,
                                             GraphDef* graph) {
   string tensor_to_swap = strings::StrCat(node->name(), "_", input_to_swap);
@@ -100,6 +490,9 @@ std::pair<NodeDef*, NodeDef*> BuildSwapPair(NodeDef* node, int input_to_swap,
   (*swap_in_node->mutable_attr())["_class"].mutable_list()->add_s(coloc_group);
   (*node->mutable_attr())["_class"].mutable_list()->add_s(coloc_group);
 
+  const DataType input_type = node->attr().at("T").type();
+  (*swap_in_node->mutable_attr())["T"].set_type(input_type);
+  (*swap_out_node->mutable_attr())["T"].set_type(input_type);
   return std::make_pair(swap_out_node, swap_in_node);
 }
 
@@ -175,8 +568,8 @@ static const NodeDef* FindSwapTrigger(
     // Don't jump over frames, since adding a control dependency from one frame
     // to the next isn't supported. Don't go through branches, since we don't
     // know whether they'll be executed or not.
-    if (input_node->op() == "NextIteration" || input_node->op() == "Switch" ||
-        input_node->op() == "Merge") {
+    if (IsNextIteration(*input_node) || IsSwitch(*input_node) ||
+        IsMerge(*input_node)) {
       continue;
     }
     auto it2 = execution_times.find(input_node);
@@ -205,6 +598,8 @@ Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* optimized_graph) {
   *optimized_graph = item.graph;
 
+  RecomputationRewritingPass(optimization_level_, optimized_graph, item);
+
   // Figure out what needs to be swapped;
   std::unordered_map<NodeDef*, SwapInfo> nodes_to_swap;
   for (auto& node : *optimized_graph->mutable_node()) {
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.h b/tensorflow/core/grappler/optimizers/memory_optimizer.h
index dfb24c05c99c3292647833db058591839b1a1d15..5b7ba4001f0d0747aa7358e1d0d902c2dcfb5da4 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.h
@@ -16,9 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_MEMORY_OPTIMIZER_H_
 #define TENSORFLOW_GRAPPLER_OPTIMIZERS_MEMORY_OPTIMIZER_H_
 
-#include <vector>
-
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -26,7 +25,8 @@ namespace grappler {
 // Swap tensors in and out of device memory.
 class MemoryOptimizer : public GraphOptimizer {
  public:
-  MemoryOptimizer() {}
+  explicit MemoryOptimizer(RewriterConfig::MemOptType optimization_level)
+      : optimization_level_(optimization_level) {}
   ~MemoryOptimizer() override {}
 
   string name() const override { return "memory_optimizer"; };
@@ -36,15 +36,10 @@ class MemoryOptimizer : public GraphOptimizer {
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& pruned_graph, double result) override;
-};
 
-// Helper function to recompute a sub-graph (recomputed_source_nodes) on a
-// trigger. Edges from recomputed_source_nodes to target_nodes are changed to
-// start from the recomputed nodes.
-void RecomputeSubgraph(
-    const std::vector<const NodeDef*>& recomputed_source_nodes,
-    const string& recompute_trigger_node_name,
-    const std::vector<NodeDef*>& target_nodes, GraphDef* graph);
+ private:
+  RewriterConfig::MemOptType optimization_level_;
+};
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
index a4f8e22e1d8306ac2f1499cf8031e8fc669d8855..0d5d302f4ad20002a62105b0d3a77c55838d28b9 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
@@ -34,90 +34,166 @@ class RecomputeSubgraphTest : public ::testing::Test {};
 TEST_F(RecomputeSubgraphTest, SimpleSubgraph) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
-  Output a = ops::Const(s.WithOpName("a"), 1.f, {2, 3, 4});
-  Output b = ops::AddN(s.WithOpName("b"), {a});  // Recomputed
-  Output c = ops::AddN(s.WithOpName("c"), {b});
-  Output d = ops::AddN(s.WithOpName("d"), {c});
-  Output e = ops::AddN(s.WithOpName("e"), {d, b});
-  Output f = ops::AddN(s.WithOpName("f"), {e, a});
+  Output a = ops::Variable(s.WithOpName("a"), {2, 3, 4}, DT_FLOAT);
+  Output b = ops::Identity(s.WithOpName("b"), a);  // Recomputed
+  Output c = ops::Identity(s.WithOpName("c"), b);
+  Output d = ops::AddN(s.WithOpName("gradients/d"), {c});
+  Output e = ops::AddN(s.WithOpName("gradients/e"), {d, b});
+  Output f = ops::AddN(s.WithOpName("gradients/f"), {e, a});
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   EXPECT_EQ(6, item.graph.node_size());
   NodeMap pre_transform_node_map(&item.graph);
-  std::vector<const NodeDef*> recomputed_source_nodes;
-  recomputed_source_nodes.push_back(pre_transform_node_map.GetNode(b.name()));
-  std::vector<NodeDef*> target_nodes;
-  target_nodes.push_back(pre_transform_node_map.GetNode(e.name()));
-  RecomputeSubgraph(recomputed_source_nodes, d.name(), target_nodes,
-                    &item.graph);
-  NodeMap post_transform_node_map(&item.graph);
-  EXPECT_EQ(7, item.graph.node_size());
+  (*pre_transform_node_map.GetNode("b")->mutable_attr())["_recompute_hint"]
+      .set_i(0);
+
+  MemoryOptimizer optimizer(RewriterConfig::MANUAL);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+
+  TF_EXPECT_OK(status);
+  NodeMap post_transform_node_map(&output);
+  EXPECT_EQ(8, output.node_size());
   NodeDef* transformed_e = post_transform_node_map.GetNode(e.name());
   EXPECT_EQ(2, transformed_e->input_size());
-  EXPECT_EQ("d", transformed_e->input(0));
+  EXPECT_EQ("gradients/d", transformed_e->input(0));
   EXPECT_EQ("Recomputed/b", transformed_e->input(1));
   NodeDef* recomputed_b = post_transform_node_map.GetNode("Recomputed/b");
   EXPECT_EQ(2, recomputed_b->input_size());
   EXPECT_EQ("a", recomputed_b->input(0));
-  EXPECT_EQ("^d", recomputed_b->input(1).substr(0, 2));
+  EXPECT_EQ("^RecomputeTrigger/b", recomputed_b->input(1));
+  NodeDef* recompute_trigger =
+      post_transform_node_map.GetNode("RecomputeTrigger/b");
+  EXPECT_EQ(1, recompute_trigger->input_size());
+  EXPECT_EQ("^gradients/d", recompute_trigger->input(0));
+}
+
+TEST_F(RecomputeSubgraphTest, NoFeedsRecomputed) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  Output a = ops::Variable(s.WithOpName("a"), {2, 3, 4}, DT_FLOAT);
+  Output b = ops::Identity(s.WithOpName("b"), a);  // Would be recomputed, but
+                                                   // for being fed
+  Output c = ops::Identity(s.WithOpName("c"), b);
+  Output d = ops::AddN(s.WithOpName("gradients/d"), {c});
+  Output e = ops::AddN(s.WithOpName("gradients/e"), {d, b});
+  Output f = ops::AddN(s.WithOpName("gradients/f"), {e, a});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.feed.emplace_back("b", Tensor());
+  EXPECT_EQ(6, item.graph.node_size());
+  NodeMap pre_transform_node_map(&item.graph);
+  (*pre_transform_node_map.GetNode("b")->mutable_attr())["_recompute_hint"]
+      .set_i(0);
+
+  MemoryOptimizer optimizer(RewriterConfig::MANUAL);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+
+  TF_EXPECT_OK(status);
+  EXPECT_EQ(6, output.node_size());
+}
+
+TEST_F(RecomputeSubgraphTest, TwoInputSubgraphs) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  Output a = ops::Variable(s.WithOpName("a"), {2, 3, 4}, DT_FLOAT);
+  Output b = ops::Variable(s.WithOpName("b"), {2, 3, 4}, DT_FLOAT);
+  Output d = ops::AddN(s.WithOpName("gradients/two_subgraph_inputs"), {a, b});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  EXPECT_EQ(3, item.graph.node_size());
+  NodeMap pre_transform_node_map(&item.graph);
+  (*pre_transform_node_map.GetNode("a")->mutable_attr())["_recompute_hint"]
+      .set_i(0);
+  (*pre_transform_node_map.GetNode("b")->mutable_attr())["_recompute_hint"]
+      .set_i(0);
+
+  MemoryOptimizer optimizer(RewriterConfig::MANUAL);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+
+  TF_EXPECT_OK(status);
+  NodeMap post_transform_node_map(&output);
+  // Mostly checking that this case does not crash.
+  EXPECT_EQ(7, output.node_size());
+  EXPECT_NE(post_transform_node_map.GetNode("Recomputed/a"), nullptr);
+  EXPECT_NE(post_transform_node_map.GetNode("Recomputed/b"), nullptr);
+  EXPECT_NE(post_transform_node_map.GetNode("RecomputeTrigger/a"), nullptr);
+  EXPECT_NE(post_transform_node_map.GetNode("RecomputeTrigger/b"), nullptr);
 }
 
 TEST_F(RecomputeSubgraphTest, MultiNode) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
-  Output a = ops::Const(s.WithOpName("Conv"), 1.f, {2, 3, 4});
-  Output b = ops::AddN(s.WithOpName("BN"), {a});    // Recomputed
-  Output c = ops::AddN(s.WithOpName("ReLU"), {b});  // Recomputed
-  Output d = ops::AddN(s.WithOpName("Conv1"), {c});
+  Output a = ops::Variable(s.WithOpName("Conv"), {2, 3, 4}, DT_FLOAT);
+  Output b = ops::Identity(s.WithOpName("BN"), a);    // Recomputed
+  Output c = ops::Identity(s.WithOpName("ReLU"), b);  // Recomputed
+  Output d = ops::Identity(s.WithOpName("Conv1"), c);
 
-  Output trigger = ops::Const(s.WithOpName("BN1Grad"), 0.f, {2, 3, 4});
-  Output e = ops::AddN(s.WithOpName("Conv1Grad"), {trigger, c});
-  Output f = ops::AddN(s.WithOpName("ReLUGrad"), {e, c});
-  Output g = ops::AddN(s.WithOpName("BNGrad"), {f, a});
-  Output h = ops::AddN(s.WithOpName("ConvGrad"), {g});
+  // The "gradients/" prefix means the heuristic will pick these up as
+  // candidates to have their inputs recomputed.
+  Output trigger = ops::AddN(s.WithOpName("gradients/BN1Grad"), {d});
+  Output e = ops::AddN(s.WithOpName("gradients/Conv1Grad"), {trigger, c});
+  Output f = ops::AddN(s.WithOpName("gradients/ReLUGrad"), {e, c});
+  Output g = ops::AddN(s.WithOpName("gradients/BNGrad"), {f, a});
+  Output h = ops::AddN(s.WithOpName("gradients/ConvGrad"), {g});
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   EXPECT_EQ(9, item.graph.node_size());
   NodeMap pre_transform_node_map(&item.graph);
-  std::vector<const NodeDef*> recomputed_source_nodes;
-  recomputed_source_nodes.push_back(pre_transform_node_map.GetNode(b.name()));
-  recomputed_source_nodes.push_back(pre_transform_node_map.GetNode(c.name()));
-  std::vector<NodeDef*> target_nodes;
-  target_nodes.push_back(pre_transform_node_map.GetNode(e.name()));
-  target_nodes.push_back(pre_transform_node_map.GetNode(f.name()));
-  target_nodes.push_back(pre_transform_node_map.GetNode(g.name()));
-  RecomputeSubgraph(recomputed_source_nodes, trigger.name(), target_nodes,
-                    &item.graph);
-  NodeMap post_transform_node_map(&item.graph);
-  EXPECT_EQ(11, item.graph.node_size());
+  // Set op types so that the heuristic will pick these nodes up to be
+  // recomputed
+  pre_transform_node_map.GetNode("BN")->set_op("FusedBatchNorm");
+  pre_transform_node_map.GetNode("ReLU")->set_op("Relu");
+
+  MemoryOptimizer optimizer(RewriterConfig::HEURISTICS);
+  GraphDef first_pass_output;
+  Status first_pass_status =
+      optimizer.Optimize(nullptr, item, &first_pass_output);
+  TF_EXPECT_OK(first_pass_status);
+
+  NodeMap post_transform_node_map(&first_pass_output);
+  EXPECT_EQ(13, first_pass_output.node_size());
   NodeDef* transformed_e = post_transform_node_map.GetNode(e.name());
   EXPECT_EQ(2, transformed_e->input_size());
-  EXPECT_EQ("BN1Grad", transformed_e->input(0));
+  EXPECT_EQ("gradients/BN1Grad", transformed_e->input(0));
   EXPECT_EQ("Recomputed/ReLU", transformed_e->input(1));
   NodeDef* transformed_f = post_transform_node_map.GetNode(f.name());
   EXPECT_EQ(2, transformed_f->input_size());
-  EXPECT_EQ("Conv1Grad", transformed_f->input(0));
+  EXPECT_EQ("gradients/Conv1Grad", transformed_f->input(0));
   EXPECT_EQ("Recomputed/ReLU", transformed_f->input(1));
   NodeDef* transformed_g = post_transform_node_map.GetNode(g.name());
   EXPECT_EQ(2, transformed_g->input_size());
-  EXPECT_EQ("ReLUGrad", transformed_g->input(0));
+  EXPECT_EQ("gradients/ReLUGrad", transformed_g->input(0));
   EXPECT_EQ("Conv", transformed_g->input(1));
 
   NodeDef* recomputed_b = post_transform_node_map.GetNode("Recomputed/BN");
   EXPECT_EQ(2, recomputed_b->input_size());
   EXPECT_EQ("Conv", recomputed_b->input(0));
-  EXPECT_EQ("^BN1Grad", recomputed_b->input(1).substr(0, 8));
+  EXPECT_EQ("^RecomputeTrigger/BN", recomputed_b->input(1));
+  NodeDef* recompute_trigger_b =
+      post_transform_node_map.GetNode("RecomputeTrigger/BN");
+  EXPECT_EQ(1, recompute_trigger_b->input_size());
+  EXPECT_EQ("^RecomputeTrigger/ReLU", recompute_trigger_b->input(0));
+
   NodeDef* recomputed_c = post_transform_node_map.GetNode("Recomputed/ReLU");
   EXPECT_EQ(2, recomputed_c->input_size());
   EXPECT_EQ("Recomputed/BN", recomputed_c->input(0));
-  EXPECT_EQ("^BN1Grad", recomputed_c->input(1).substr(0, 8));
+  EXPECT_EQ("^RecomputeTrigger/ReLU", recomputed_c->input(1));
+  NodeDef* recompute_trigger_c =
+      post_transform_node_map.GetNode("RecomputeTrigger/ReLU");
+  EXPECT_EQ(1, recompute_trigger_c->input_size());
+  EXPECT_EQ("^gradients/BN1Grad", recompute_trigger_c->input(0));
 }
 
 class MemoryOptimizerTest : public ::testing::Test {
  public:
-  static VirtualCluster CreateVirtualCluster() {
+  static std::unique_ptr<VirtualCluster> CreateVirtualCluster() {
     DeviceProperties cpu_device;
     cpu_device.set_type("CPU");
     cpu_device.set_frequency(1000);
@@ -125,7 +201,7 @@ class MemoryOptimizerTest : public ::testing::Test {
     cpu_device.set_bandwidth(32);
     std::unordered_map<string, DeviceProperties> devices;
     devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
-    return VirtualCluster(devices);
+    return std::unique_ptr<VirtualCluster>(new VirtualCluster(devices));
   }
 };
 
@@ -133,7 +209,7 @@ TEST_F(MemoryOptimizerTest, SimpleSwapping) {
   // Build a simple graph with an op that's marked for swapping.
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
-  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
+  Output a = ops::Variable(s.WithOpName("a"), {10, 10}, DT_FLOAT);
   Output b = ops::AddN(s.WithOpName("b"), {a});
   Output c = ops::AddN(s.WithOpName("c"), {b});
   Output d = ops::AddN(s.WithOpName("d"), {c});
@@ -148,11 +224,11 @@ TEST_F(MemoryOptimizerTest, SimpleSwapping) {
       (*item.graph.mutable_node(4)->mutable_attr())["_swap_to_host"];
   val.mutable_list()->add_i(0);
 
-  VirtualCluster cluster(CreateVirtualCluster());
+  std::unique_ptr<VirtualCluster> cluster(CreateVirtualCluster());
 
-  MemoryOptimizer optimizer;
+  MemoryOptimizer optimizer(RewriterConfig::MANUAL);
   GraphDef output;
-  Status status = optimizer.Optimize(&cluster, item, &output);
+  Status status = optimizer.Optimize(cluster.get(), item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(7, output.node_size());
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 8bb7800df4e204c420e15898bc04ac941b8fbdeb..5527500af95319d4c584f9a9935ac8ca53fb275a 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+#include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/optimizers/auto_parallel.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
@@ -41,7 +42,7 @@ std::unique_ptr<GraphOptimizer> MetaOptimizer::NewOptimizer(
     graph_optimizer.reset(new LayoutOptimizer());
   }
   if (optimizer == "memory") {
-    graph_optimizer.reset(new MemoryOptimizer());
+    graph_optimizer.reset(new MemoryOptimizer(RewriterConfig::MANUAL));
   }
   if (optimizer == "autoparallel") {
     graph_optimizer.reset(
@@ -57,7 +58,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     if (!cfg_.disable_model_pruning()) {
       optimizers.push_back(std::unique_ptr<GraphOptimizer>(new ModelPruner()));
     }
-    if (cfg_.constant_folding()) {
+    if (cfg_.constant_folding() == RewriterConfig::ON) {
       optimizers.push_back(
           std::unique_ptr<GraphOptimizer>(new ConstantFolding()));
     }
@@ -65,9 +66,9 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       optimizers.push_back(
           std::unique_ptr<GraphOptimizer>(new LayoutOptimizer()));
     }
-    if (cfg_.memory_optimization() > 0) {
-      optimizers.push_back(
-          std::unique_ptr<GraphOptimizer>(new MemoryOptimizer()));
+    if (cfg_.memory_optimization() > 1) {
+      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
+          new MemoryOptimizer(cfg_.memory_optimization())));
     }
     if (cfg_.auto_parallel().enable()) {
       optimizers.push_back(std::unique_ptr<GraphOptimizer>(
@@ -101,8 +102,14 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     }
   }
   TopologicalSort(optimized_graph);
-  // Copy the graph version.
-  *optimized_graph->mutable_versions() = item.graph.versions();
+
+  // Make sure that the optimizers preserved the graph version and library.
+  DCHECK_GE(optimized_graph->library().function_size(),
+            item.graph.library().function_size());
+  DCHECK_GE(optimized_graph->library().gradient_size(),
+            item.graph.library().gradient_size());
+  DCHECK_EQ(optimized_graph->versions().producer(),
+            item.graph.versions().producer());
 
   return Status::OK();
 }
@@ -113,8 +120,10 @@ void MetaOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
 }
 
 bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
-  return cfg.optimize_tensor_layout() || cfg.constant_folding() ||
-         cfg.auto_parallel().enable() || !cfg.optimizers().empty();
+  return !cfg.disable_model_pruning() || cfg.optimize_tensor_layout() ||
+         cfg.constant_folding() == RewriterConfig::ON ||
+         cfg.auto_parallel().enable() || cfg.memory_optimization() > 1 ||
+         !cfg.optimizers().empty();
 }
 
 Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg,
diff --git a/tensorflow/core/grappler/optimizers/model_pruner.cc b/tensorflow/core/grappler/optimizers/model_pruner.cc
index efa216383696eb035291e7f9251ec9516fd0ebb4..e313155563ad7bc7f3f96b2513108a170c1f6bf2 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner.cc
@@ -15,14 +15,40 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
 #include <unordered_set>
+#include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/graph_rewriter.h"
 #include "tensorflow/core/grappler/utils.h"
 
 namespace tensorflow {
 namespace grappler {
 
+int NumNonControlInputs(const NodeDef& node) {
+  int num_inputs = node.input_size();
+  for (int i = 0; i < node.input_size(); ++i) {
+    if (!node.input(i).empty() && node.input(i)[0] == '^') {
+      num_inputs--;
+    }
+  }
+  return num_inputs;
+}
+
+bool IsTrivialOp(const NodeDef& node) {
+  // Remove the stop gradient nodes since they serve no purpose once the graph
+  // is built. Also remove Identity ops.
+  if (IsStopGradient(node) || IsIdentity(node)) {
+    return true;
+  }
+  if (IsAddN(node) && NumNonControlInputs(node) <= 1) {
+    return true;
+  }
+
+  return false;
+}
+
 Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
                              GraphDef* pruned_graph) {
   GraphRewriter rewriter(item);
@@ -31,37 +57,46 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
   for (const auto& node : item.fetch) {
     nodes_to_preserve.insert(NodeName(node));
   }
+  for (const auto& feed : item.feed) {
+    nodes_to_preserve.insert(NodeName(feed.first));
+  }
   for (const auto& node : item.init_ops) {
     nodes_to_preserve.insert(NodeName(node));
   }
 
   std::unordered_set<const NodeDef*> nodes_to_delete;
   for (auto& node : item.graph.node()) {
-    // Remove the stop gradient nodes since they serve no purpose once the graph
-    // is built. Also remove Identity ops.
-    if (node.op() != "StopGradient" && node.op() != "Identity") {
+    if (!IsTrivialOp(node)) {
       continue;
     }
     // Don't remove nodes that must be preserved.
     if (nodes_to_preserve.find(node.name()) != nodes_to_preserve.end()) {
       continue;
     }
-    // Don't remove nodes that are explicitly placed.
-    if (!node.device().empty()) {
-      continue;
-    }
+
     // Don't remove nodes that drive control dependencies.
     // Don't remove nodes that are driven by control dependencies either since
     // we can't ensure (yet) that we won't increase the number of control
     // dependency edges by deleting them (for example, removing a node driven by
     // 10 control edges and driving 10 control edges would result in the
     // creation of 100 edges).
+    // Don't modify nodes that are connected to functions since that can result
+    // in inlining failures later on.
+    // Don't prune nodes that are driven by another device since these could be
+    // used to reduce cross device communication.
     if (!rewriter.DrivesControlDependency(node) &&
-        !rewriter.IsDrivenByControlDependency(node)) {
+        !rewriter.IsDrivenByControlDependency(node) &&
+        !rewriter.IsConnectedToFunction(node) &&
+        !rewriter.IsDrivenByAnotherDevice(node)) {
       nodes_to_delete.insert(&node);
     }
   }
 
+  if (nodes_to_delete.empty()) {
+    *pruned_graph = item.graph;
+    return Status::OK();
+  }
+
   for (auto& node : item.graph.node()) {
     NodeDef* new_node = pruned_graph->add_node();
     *new_node = node;
@@ -73,6 +108,9 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
           << " nodes from the graph. The graph now contains "
           << pruned_graph->node_size() << " nodes.";
 
+  *pruned_graph->mutable_library() = item.graph.library();
+  *pruned_graph->mutable_versions() = item.graph.versions();
+
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/model_pruner_test.cc b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
index 67954d291461084925b1ad1b44b2a1bf7dbc0f5b..72d9c7bf275c8e53b036efc04751307af59a6388 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner_test.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
@@ -57,10 +57,10 @@ TEST_F(ModelPrunerTest, StopGradientPruning) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
   Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
-  Output b = ops::AddN(s.WithOpName("b"), {a});
+  Output b = ops::Sqrt(s.WithOpName("b"), {a});
   Output c = ops::StopGradient(s.WithOpName("c"), b);
   Output d = ops::StopGradient(s.WithOpName("d"), c);
-  Output e = ops::AddN(s.WithOpName("e"), {d});
+  Output e = ops::Sqrt(s.WithOpName("e"), {d});
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -93,18 +93,14 @@ TEST_F(ModelPrunerTest, IdentityPruning) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
   Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
-  Output b = ops::AddN(s.WithOpName("b"), {a});
+  Output b = ops::Sqrt(s.WithOpName("b"), {a});
   Output c = ops::Identity(s.WithOpName("c"), b);
   Output d = ops::Identity(s.WithOpName("d"), c);
-  Output e = ops::AddN(s.WithOpName("e"), {d});
+  Output e = ops::Sqrt(s.WithOpName("e"), {d});
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  // Force the placement of c. This should ensure it is preserved.
-  EXPECT_EQ("c", item.graph.node(2).name());
-  item.graph.mutable_node(2)->set_device("CPU");
-
   ModelPruner pruner;
   GraphDef output;
   Status status = pruner.Optimize(nullptr, item, &output);
@@ -123,29 +119,63 @@ TEST_F(ModelPrunerTest, IdentityPruning) {
   EXPECT_EQ(NodeName(e.name()), new_e.name());
 
   EXPECT_EQ(1, new_e.input_size());
-  EXPECT_EQ(NodeName(c.name()), new_e.input(0));
+  EXPECT_EQ(NodeName(b.name()), new_e.input(0));
   EXPECT_EQ(1, new_d.input_size());
-  EXPECT_EQ(NodeName(c.name()), new_d.input(0));
+  EXPECT_EQ(NodeName(b.name()), new_d.input(0));
+  EXPECT_EQ(1, new_c.input_size());
+  EXPECT_EQ(NodeName(b.name()), new_c.input(0));
 }
 
-TEST_F(ModelPrunerTest, PruningSkipsCtrlDependencies) {
+TEST_F(ModelPrunerTest, NoOpPruning) {
   // Build a simple graph with a few trivially prunable ops.
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
   Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
   Output b = ops::AddN(s.WithOpName("b"), {a});
-  Output c = ops::Identity(s.WithOpName("c"), b);
-  Output d = ops::Identity(s.WithOpName("d"), c);
+  Output c = ops::AddN(s.WithOpName("c"), {b});
+  Output d = ops::AddN(s.WithOpName("d").WithControlDependencies(b), {c});
   Output e = ops::AddN(s.WithOpName("e"), {d});
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  // Add a control dependency between c and e. This should ensure c is
-  // preserved.
-  EXPECT_EQ("c", item.graph.node(2).name());
-  EXPECT_EQ("e", item.graph.node(4).name());
-  *item.graph.mutable_node(4)->add_input() = "^c";
+  ModelPruner pruner;
+  GraphDef output;
+  Status status = pruner.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(5, output.node_size());
+  const NodeDef& new_a = output.node(0);
+  EXPECT_EQ(NodeName(a.name()), new_a.name());
+  const NodeDef& new_b = output.node(1);
+  EXPECT_EQ(NodeName(b.name()), new_b.name());
+  const NodeDef& new_c = output.node(2);
+  EXPECT_EQ(NodeName(c.name()), new_c.name());
+  const NodeDef& new_d = output.node(3);
+  EXPECT_EQ(NodeName(d.name()), new_d.name());
+  const NodeDef& new_e = output.node(4);
+  EXPECT_EQ(NodeName(e.name()), new_e.name());
+
+  EXPECT_EQ(1, new_e.input_size());
+  EXPECT_EQ(NodeName(d.name()), new_e.input(0));
+  EXPECT_EQ(2, new_d.input_size());
+  EXPECT_EQ(NodeName(b.name()), new_d.input(0));
+  EXPECT_EQ(1, new_c.input_size());
+  EXPECT_EQ(NodeName(b.name()), new_c.input(0));
+}
+
+TEST_F(ModelPrunerTest, PruningSkipsCtrlDependencies) {
+  // Build a simple graph with a few trivially prunable ops.
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
+  Output b = ops::Sqrt(s.WithOpName("b"), {a});
+  Output c = ops::Identity(s.WithOpName("c"), b);
+  Output d = ops::Identity(s.WithOpName("d"), c);
+  Output e = ops::Sqrt(s.WithOpName("e").WithControlDependencies(c), {d});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   ModelPruner pruner;
   GraphDef output;
@@ -174,11 +204,11 @@ TEST_F(ModelPrunerTest, PruningPerservesCtrlDependencies) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
   Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
-  Output b = ops::AddN(s.WithOpName("b"), {a});
-  Output c = ops::AddN(s.WithOpName("c"), {a});
+  Output b = ops::Sqrt(s.WithOpName("b"), {a});
+  Output c = ops::Sqrt(s.WithOpName("c"), {a});
   Output d = ops::Identity(s.WithOpName("d"), c);
   Output e = ops::Identity(s.WithOpName("e"), d);
-  Output f = ops::AddN(s.WithOpName("f"), {e});
+  Output f = ops::Sqrt(s.WithOpName("f"), {e});
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -224,7 +254,7 @@ TEST_F(ModelPrunerTest, PruningPerservesFetch) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
   Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
-  Output b = ops::AddN(s.WithOpName("b"), {a});
+  Output b = ops::Sqrt(s.WithOpName("b"), {a});
   Output c = ops::Identity(s.WithOpName("c"), b);
 
   GrapplerItem item;
@@ -245,6 +275,38 @@ TEST_F(ModelPrunerTest, PruningPerservesFetch) {
   EXPECT_EQ(NodeName(c.name()), new_c.name());
 }
 
+TEST_F(ModelPrunerTest, PruningPerservesCrossDeviceIdentity) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output c = ops::Const(s.WithOpName("c").WithDevice("/cpu:0"), 0.0f, {10, 10});
+
+  // Node i1 should be preserved.
+  Output i1 = ops::Identity(s.WithOpName("i1").WithDevice("/gpu:0"), c);
+  Output a1 = ops::Sqrt(s.WithOpName("a1").WithDevice("/gpu:0"), {i1});
+  Output a2 = ops::Sqrt(s.WithOpName("a2").WithDevice("/gpu:0"), {i1});
+
+  // Node i2 should be pruned since it resides on the sender's device.
+  Output i2 = ops::Identity(s.WithOpName("i2").WithDevice("/cpu:0"), c);
+  Output a3 = ops::Sqrt(s.WithOpName("a3").WithDevice("/gpu:0"), {i2});
+  Output a4 = ops::Sqrt(s.WithOpName("a4").WithDevice("/gpu:0"), {i2});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"a1", "a2", "a3", "a4"};
+
+  ModelPruner pruner;
+  GraphDef output;
+  Status status = pruner.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  for (const auto& node : output.node()) {
+    if (node.name() == "a1" || node.name() == "a2") {
+      EXPECT_EQ("i1", node.input(0));
+    } else if (node.name() == "a3" || node.name() == "a4") {
+      EXPECT_EQ("c", node.input(0));
+    }
+  }
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/static_schedule_test.cc b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
index c932c90765e5dddc4251f025ce3fa6a69b497542..95a745be21add64dfad7a50b3c8e568a63a953e5 100644
--- a/tensorflow/core/grappler/optimizers/static_schedule_test.cc
+++ b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
@@ -29,7 +29,7 @@ namespace {
 
 class StaticScheduleTest : public ::testing::Test {
  public:
-  VirtualCluster CreateVirtualCluster() const {
+  std::unique_ptr<VirtualCluster> CreateVirtualCluster() const {
     // Invent a CPU so that predictions remain the same from machine to machine.
     DeviceProperties cpu_device;
     cpu_device.set_type("CPU");
@@ -41,7 +41,7 @@ class StaticScheduleTest : public ::testing::Test {
     cpu_device.set_l3_cache_size(4 * 1024 * 1024);
     std::unordered_map<string, DeviceProperties> devices;
     devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
-    return VirtualCluster(devices);
+    return std::unique_ptr<VirtualCluster>(new VirtualCluster(devices));
   }
 };
 
@@ -51,11 +51,11 @@ TEST_F(StaticScheduleTest, BasicGraph) {
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
 
-  VirtualCluster cluster(CreateVirtualCluster());
+  std::unique_ptr<VirtualCluster> cluster(CreateVirtualCluster());
 
   std::unordered_map<const NodeDef*, Costs::NanoSeconds> completion_times;
   Status status =
-      EstimateEarliestExecutionTimes(item, &cluster, &completion_times);
+      EstimateEarliestExecutionTimes(item, cluster.get(), &completion_times);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(item.graph.node_size(), completion_times.size());
@@ -97,11 +97,11 @@ TEST_F(StaticScheduleTest, BasicGraphWithCtrlDependencies) {
   EXPECT_EQ("e", item.graph.node(4).name());
   *item.graph.mutable_node(4)->add_input() = "^c";
 
-  VirtualCluster cluster(CreateVirtualCluster());
+  std::unique_ptr<VirtualCluster> cluster(CreateVirtualCluster());
 
   std::unordered_map<const NodeDef*, Costs::NanoSeconds> completion_times;
   Status status =
-      EstimateEarliestExecutionTimes(item, &cluster, &completion_times);
+      EstimateEarliestExecutionTimes(item, cluster.get(), &completion_times);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(item.graph.node_size(), completion_times.size());
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index 8839f07bc5e06ecf91975cc209a21c80d12c9a60..fd3894553b9feb002e5863c083a8415e18ee13e5 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -48,6 +48,7 @@ cc_library(
     deps = [
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
     ],
 )
diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc
index 131756fc5c2b2f7090934e791d6dfa7acf7ccfa7..9c5d27f3c5fb1b8c0d0df711df3d212fa126b109 100644
--- a/tensorflow/core/grappler/utils/topological_sort.cc
+++ b/tensorflow/core/grappler/utils/topological_sort.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <deque>
 #include <unordered_map>
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 
 namespace tensorflow {
@@ -35,7 +36,7 @@ void TopologicalSort(GraphDef* graph) {
     if (node.op() == "Merge") {
       ready_inputs[&node] = 0;
       for (const auto& input : node.input()) {
-        if (node_map.GetNode(input)->op() == "NextIteration") {
+        if (IsNextIteration(*node_map.GetNode(input))) {
           ready_inputs[&node]++;
         }
       }
@@ -56,7 +57,7 @@ void TopologicalSort(GraphDef* graph) {
     ready_nodes.pop_front();
   }
   if (sorted_graph.node_size() == graph->node_size()) {
-    *graph = sorted_graph;
+    graph->mutable_node()->Swap(sorted_graph.mutable_node());
   }
 }
 
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index b8370a96a8566d7384449468111b5538b44c9a62..f45bb72c38e41d9b78ebbcd1af0e3a43c7984c4b 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -19,6 +19,7 @@ package_group(
     name = "friends",
     packages = [
         "//learning/brain/contrib/...",
+        "//learning/brain/research/sparse_matrix/...",
         "//tensorflow/...",
     ],
 )
@@ -32,6 +33,7 @@ load(
     "tf_kernel_library",
     "tf_mkl_kernel_library",
     "cc_header_only_library",
+    "if_not_windows",
 )
 load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
@@ -63,7 +65,6 @@ config_setting(
     # will also need appropriate -mavx*, as required by specific op you use.
     name = "xsmm_backward",
     values = {
-        "define": "tensorflow_xsmm=1",
         "define": "tensorflow_xsmm_backward=1",
     },
 )
@@ -93,13 +94,11 @@ tf_kernel_library(
         "strided_slice_op_inst_7.cc",
     ],
     hdrs = [
-        "dense_update_ops.h",
         "slice_op.h",
         "strided_slice_op.h",
         "strided_slice_op_impl.h",
     ],
     gpu_srcs = [
-        "dense_update_ops.h",
         "slice_op.h",
         "strided_slice_op.h",
         "strided_slice_op_impl.h",
@@ -108,6 +107,7 @@ tf_kernel_library(
     ],
     deps = [
         ":bounds_check",
+        ":dense_update_functor",
         ":ops_util",
         ":variable_ops",
         "//tensorflow/core:framework",
@@ -156,11 +156,21 @@ cc_library(
     hdrs = ["conv_2d.h"],
     deps = [
         ":eigen_helpers",
+        ":gpu_util_hdrs",
         "//tensorflow/core:framework",
         "//third_party/eigen3",
     ],
 )
 
+cc_library(
+    name = "conv_2d_hdrs",
+    hdrs = ["conv_2d.h"],
+    deps = [
+        ":eigen_helpers",
+        "//third_party/eigen3",
+    ],
+)
+
 tf_kernel_library(
     name = "extract_image_patches_op",
     prefix = "extract_image_patches_op",
@@ -231,7 +241,7 @@ cc_library(
     name = "ops_util",
     srcs = ["ops_util.cc"],
     hdrs = ["ops_util.h"],
-    copts = ["-Wno-sign-compare"],
+    copts = if_not_windows(["-Wno-sign-compare"]),
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -247,6 +257,24 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "conv_ops_gpu_hdrs",
+    hdrs = ["conv_ops_gpu.h"],
+    deps = [
+        ":eigen_helpers",
+        "//third_party/eigen3",
+    ],
+)
+
+cc_library(
+    name = "gpu_util_hdrs",
+    hdrs = ["gpu_utils.h"],
+    deps = [
+        ":eigen_helpers",
+        "//third_party/eigen3",
+    ],
+)
+
 tf_cc_test(
     name = "ops_util_test",
     size = "small",
@@ -300,6 +328,7 @@ cc_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
 
@@ -322,6 +351,7 @@ cc_library(
         ":typed_queue",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
 
@@ -342,7 +372,7 @@ cc_library(
     name = "save_restore_tensor",
     srcs = ["save_restore_tensor.cc"],
     hdrs = ["save_restore_tensor.h"],
-    copts = ["-Wno-sign-compare"],
+    copts = if_not_windows(["-Wno-sign-compare"]),
     deps = [
         ":bounds_check",
         "//tensorflow/core:framework",
@@ -413,6 +443,7 @@ cc_library(
     hdrs = ["warn_about_ints.h"],
     deps = [
         "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
 
@@ -670,6 +701,39 @@ tf_kernel_library(
     deps = ARRAY_DEPS,
 )
 
+tf_kernel_library(
+    name = "compare_and_bitpack_op",
+    srcs = ["compare_and_bitpack_op.cc"],
+    hdrs = ["compare_and_bitpack_op.h"],
+    gpu_srcs = [
+        "compare_and_bitpack_op.h",
+        "compare_and_bitpack_op_gpu.cu.cc",
+    ],
+    deps = ARRAY_DEPS,
+)
+
+# TODO(ebrevdo): Add benchmarks once the op is in the autogen array namespace.
+# tf_cuda_cc_test(
+#     name = "compare_and_bitpack_op_test",
+#     srcs = ["compare_and_bitpack_op_test.cc"],
+#     deps = [
+#         ":array",
+#         ":ops_testutil",
+#         ":ops_util",
+#         "//third_party/eigen3",
+#         "//tensorflow/cc:cc_ops",
+#         "//tensorflow/cc:cc_ops_internal",
+#         "//tensorflow/core:core_cpu",
+#         "//tensorflow/core:core_cpu_internal",
+#         "//tensorflow/core:framework",
+#         "//tensorflow/core:lib",
+#         "//tensorflow/core:protos_all_cc",
+#         "//tensorflow/core:test",
+#         "//tensorflow/core:test_main",
+#         "//tensorflow/core:testlib",
+#     ],
+# )
+
 tf_kernel_library(
     name = "reshape_op",
     prefix = "reshape_op",
@@ -722,6 +786,12 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "tile_ops",
+    srcs = ["tile_functor_cpu.cc"],
+    hdrs = ["tile_functor.h"],
+    gpu_srcs = [
+        "tile_functor.h",
+        "tile_functor_gpu.cu.cc",
+    ],
     prefix = "tile_ops",
     deps = ARRAY_DEPS,
 )
@@ -754,7 +824,10 @@ tf_kernel_library(
 tf_kernel_library(
     name = "where_op",
     prefix = "where_op",
-    deps = ARRAY_DEPS,
+    deps = if_cuda([
+        ":cuda_solvers",
+        "@cub_archive//:cub",
+    ]) + ARRAY_DEPS,
 )
 
 tf_cc_test(
@@ -783,6 +856,7 @@ tf_cc_test(
         ":identity_op",
         ":ops_testutil",
         ":ops_util",
+        ":variable_ops",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -998,6 +1072,22 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "dense_update_functor",
+    srcs = ["dense_update_functor.cc"],
+    hdrs = ["dense_update_functor.h"],
+    gpu_srcs = [
+        "dense_update_functor.h",
+        "dense_update_functor_gpu.cu.cc",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//third_party/eigen3",
+    ],
+    alwayslink = 0,
+)
+
 tf_cuda_cc_test(
     name = "gather_op_test",
     size = "small",
@@ -1210,6 +1300,7 @@ tf_kernel_library(
     ],
     visibility = ["//visibility:private"],
     deps = [
+        ":ops_util",
         "//tensorflow/core:framework",
         "//tensorflow/core/kernels:conv_ops",
         "//third_party/eigen3",
@@ -1413,7 +1504,7 @@ tf_kernel_library(
 tf_kernel_library(
     name = "random_shuffle_queue_op",
     prefix = "random_shuffle_queue_op",
-    deps = DATA_FLOW_DEPS,
+    deps = DATA_FLOW_DEPS + ["//tensorflow/core:protos_all_cc"],
 )
 
 tf_kernel_library(
@@ -1514,6 +1605,7 @@ cc_library(
         ":typed_queue",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
 
@@ -1528,6 +1620,7 @@ cc_library(
         ":typed_queue",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
 
@@ -1590,7 +1683,7 @@ tf_kernel_library(
     srcs = ["resource_variable_ops.cc"],
     deps = [
         ":bounds_check",
-        ":dense_update_ops",
+        ":dense_update_functor",
         ":gather_functor",
         ":scatter_functor",
         ":state",
@@ -1936,15 +2029,6 @@ tf_kernel_library(
     deps = IO_DEPS,
 )
 
-# TODO(jhseu): Restore after merge.
-#tf_kernel_library(
-#    name = "lmdb_reader_op",
-#    prefix = "lmdb_reader_op",
-#    deps = IO_DEPS + [
-#        "@lmdb",
-#    ],
-#)
-
 tf_kernel_library(
     name = "lmdb_reader_op",
     prefix = "lmdb_reader_op",
@@ -2072,7 +2156,7 @@ tf_kernel_library(
         "//tensorflow:darwin": [],
         "//conditions:default": ["-Wl,-z,muldefs"],
     }),
-    visibility = ["//visibility:private"],
+    visibility = [":friends"],
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -2293,10 +2377,12 @@ cc_library(
         ":bucketize_op",
         ":cast_op",
         ":check_numerics_op",
+        ":compare_and_bitpack_op",
         ":cross_op",
         ":cwise_op",
         ":fft_ops",
         ":matmul_op",
+        ":population_count_op",
         ":reduction_ops",
         ":scan_ops",
         ":segment_reduction_ops",
@@ -2355,6 +2441,12 @@ tf_kernel_library(
 tf_kernel_library(
     name = "cwise_op",
     prefix = "cwise_op",
+    deps = MATH_DEPS + ["//tensorflow/core:bitwise_ops_op_lib"],
+)
+
+tf_kernel_library(
+    name = "population_count_op",
+    prefix = "population_count_op",
     deps = MATH_DEPS,
 )
 
@@ -2383,7 +2475,9 @@ tf_kernel_library(
         ],
         "//conditions:default": [],
     }),
-    deps = MATH_DEPS + select({
+    deps = MATH_DEPS + [
+        ":gpu_util_hdrs",
+    ] + select({
         ":xsmm": [
             "@libxsmm_archive//:xsmm_avx",
         ],
@@ -2468,7 +2562,7 @@ tf_cc_tests(
         ":ops_util",
         ":sparse_add_op",
         ":sparse_dense_binary_op_shared",
-        ":sparse_reduce_sum_op",
+        ":sparse_reduce_op",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -2800,7 +2894,7 @@ tf_kernel_library(
 tf_kernel_library(
     name = "topk_op",
     prefix = "topk_op",
-    deps = NN_DEPS,
+    deps = NN_DEPS + if_cuda(["@cub_archive//:cub"]),
 )
 
 tf_kernel_library(
@@ -3211,9 +3305,10 @@ cc_library(
         ":sparse_cross_op",
         ":sparse_dense_binary_op_shared",
         ":sparse_fill_empty_rows_op",
-        ":sparse_reduce_sum_op",
+        ":sparse_reduce_op",
         ":sparse_reorder_op",
         ":sparse_reshape_op",
+        ":sparse_slice_op",
         ":sparse_softmax",
         ":sparse_sparse_binary_op_shared",
         ":sparse_split_op",
@@ -3267,8 +3362,8 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "sparse_reduce_sum_op",
-    prefix = "sparse_reduce_sum_op",
+    name = "sparse_reduce_op",
+    prefix = "sparse_reduce_op",
     deps = SPARSE_DEPS,
 )
 
@@ -3296,6 +3391,12 @@ tf_kernel_library(
     deps = SPARSE_DEPS,
 )
 
+tf_kernel_library(
+    name = "sparse_slice_op",
+    prefix = "sparse_slice_op",
+    deps = SPARSE_DEPS,
+)
+
 tf_kernel_library(
     name = "sparse_softmax",
     prefix = "sparse_softmax",
@@ -3335,7 +3436,7 @@ tf_kernel_library(
 tf_kernel_library(
     name = "serialize_sparse_op",
     prefix = "serialize_sparse_op",
-    deps = SPARSE_DEPS,
+    deps = SPARSE_DEPS + ["//tensorflow/core:protos_all_cc"],
 )
 
 tf_kernel_library(
@@ -3468,7 +3569,7 @@ tf_kernel_library(
 tf_kernel_library(
     name = "dense_update_ops",
     prefix = "dense_update_ops",
-    deps = STATE_DEPS,
+    deps = STATE_DEPS + [":dense_update_functor"],
 )
 
 tf_kernel_library(
@@ -3479,8 +3580,24 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "scatter_nd_op",
-    prefix = "scatter_nd_op",
-    deps = STATE_DEPS,
+    srcs = [
+        "scatter_nd_op.cc",
+        "scatter_nd_op_cpu_impl_0.cc",
+        "scatter_nd_op_cpu_impl_1.cc",
+        "scatter_nd_op_cpu_impl_2.cc",
+        "scatter_nd_op_cpu_impl_3.cc",
+        "scatter_nd_op_cpu_impl_4.cc",
+        "scatter_nd_op_cpu_impl_5.cc",
+    ],
+    hdrs = [
+        "scatter_nd_op.h",
+        "scatter_nd_op_cpu_impl.h",
+    ],
+    gpu_srcs = [
+        "scatter_nd_op.h",
+        "scatter_nd_op_gpu.cu.cc",
+    ],
+    deps = STATE_DEPS + [":dense_update_functor"],
 )
 
 tf_kernel_library(
@@ -4022,8 +4139,9 @@ filegroup(
         "cwise_ops_common.cc",
         "cwise_ops_common.h",
         "cwise_ops_gradients.h",
+        "dense_update_functor.cc",
+        "dense_update_functor.h",
         "dense_update_ops.cc",
-        "dense_update_ops.h",
         "example_parsing_ops.cc",
         "fill_functor.cc",
         "fill_functor.h",
@@ -4110,6 +4228,7 @@ filegroup(
     srcs = [
         "argmax_op.h",
         "avgpooling_op.h",
+        "batch_matmul_op_impl.h",
         "batch_norm_op.h",
         "control_flow_ops.h",
         "conv_2d.h",
@@ -4137,8 +4256,10 @@ filegroup(
         "spacetobatch_functor.h",
         "spacetodepth_op.h",
         "tensor_array.h",
+        "tile_functor.h",
         "tile_ops_cpu_impl.h",
         "tile_ops_impl.h",
+        "topk_op.h",
         "training_op_helpers.h",
         "training_ops.h",
         "transpose_functor.h",
@@ -4154,6 +4275,7 @@ filegroup(
     srcs = [
         "argmax_op.cc",
         "avgpooling_op.cc",
+        "batch_matmul_op_real.cc",
         "batch_norm_op.cc",
         "bcast_ops.cc",
         "check_numerics_op.cc",
@@ -4171,6 +4293,9 @@ filegroup(
         "cwise_op_abs.cc",
         "cwise_op_add_1.cc",
         "cwise_op_add_2.cc",
+        "cwise_op_bitwise_and.cc",
+        "cwise_op_bitwise_or.cc",
+        "cwise_op_bitwise_xor.cc",
         "cwise_op_div.cc",
         "cwise_op_equal_to_1.cc",
         "cwise_op_equal_to_2.cc",
@@ -4179,11 +4304,14 @@ filegroup(
         "cwise_op_floor_div.cc",
         "cwise_op_greater.cc",
         "cwise_op_greater_equal.cc",
+        "cwise_op_invert.cc",
         "cwise_op_isfinite.cc",
         "cwise_op_less.cc",
+        "cwise_op_less_equal.cc",
         "cwise_op_log.cc",
         "cwise_op_logical_and.cc",
         "cwise_op_logical_not.cc",
+        "cwise_op_logical_or.cc",
         "cwise_op_maximum.cc",
         "cwise_op_minimum.cc",
         "cwise_op_mul_1.cc",
@@ -4207,6 +4335,8 @@ filegroup(
         "fake_quant_ops.cc",
         "fifo_queue.cc",
         "fused_batch_norm_op.cc",
+        "population_count_op.cc",
+        "population_count_op.h",
         "winograd_transform.h",
         ":android_extended_ops_headers",
     ] + select({
@@ -4270,6 +4400,7 @@ filegroup(
         "summary_op.cc",
         "tensor_array.cc",
         "tensor_array_ops.cc",
+        "tile_functor_cpu.cc",
         "tile_ops.cc",
         "tile_ops_cpu_impl_1.cc",
         "tile_ops_cpu_impl_2.cc",
@@ -4405,6 +4536,32 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "android_tensorflow_image_op",
+    srcs = [
+        "decode_image_op.cc",
+    ],
+    copts = tf_copts(),
+    linkopts = select({
+        "//tensorflow:android": [
+            "-ldl",
+        ],
+        "//conditions:default": [],
+    }),
+    tags = [
+        "manual",
+        "notap",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:android_gif_internal",
+        "//tensorflow/core:android_jpeg_internal",
+        "//tensorflow/core:android_png_internal",
+        "//tensorflow/core:android_tensorflow_lib_lite",
+    ],
+    alwayslink = 1,
+)
+
 #   Quantization-specific OpKernels
 
 tf_kernel_library(
@@ -4990,7 +5147,6 @@ tf_kernel_library(
     name = "remote_fused_graph_ops",
     prefix = "remote_fused_graph_execute_op",
     deps = [
-        ":remote_fused_graph_execute_op",
         ":remote_fused_graph_execute_utils",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -5000,23 +5156,15 @@ tf_kernel_library(
     ],
 )
 
-cc_library(
-    name = "remote_fused_graph_execute_op",
-    srcs = ["remote_fused_graph_execute_op.cc"],
-    deps = [
-        ":remote_fused_graph_execute_utils",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
 cc_library(
     name = "remote_fused_graph_execute_utils",
-    srcs = ["remote_fused_graph_execute_utils.cc"],
+    srcs = [
+        "i_remote_fused_graph_ops_definitions.cc",
+        "remote_fused_graph_execute_utils.cc",
+    ],
     hdrs = [
         "i_remote_fused_graph_executor.h",
+        "i_remote_fused_graph_ops_definitions.h",
         "remote_fused_graph_execute_utils.h",
     ],
     deps = [
@@ -5035,6 +5183,7 @@ cc_library(
     srcs = ["remote_fused_graph_execute_op_test_utils.cc"],
     hdrs = ["remote_fused_graph_execute_op_test_utils.h"],
     deps = [
+        ":remote_fused_graph_execute_utils",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:scope",
@@ -5080,9 +5229,9 @@ tf_cc_test(
     deps = [
         ":ops_testutil",
         ":ops_util",
-        ":remote_fused_graph_execute_op",
         ":remote_fused_graph_execute_op_test_utils",
         ":remote_fused_graph_execute_utils",
+        ":remote_fused_graph_ops",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:scope",
@@ -5124,6 +5273,7 @@ tf_cc_test(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -5381,6 +5531,20 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "interleave_dataset_op",
+    srcs = ["interleave_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
 tf_kernel_library(
     name = "repeat_dataset_op",
     srcs = ["repeat_dataset_op.cc"],
@@ -5417,6 +5581,18 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "ignore_errors_dataset_op",
+    srcs = ["ignore_errors_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
 tf_kernel_library(
     name = "range_dataset_op",
     srcs = ["range_dataset_op.cc"],
@@ -5489,6 +5665,18 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "concatenate_dataset_op",
+    srcs = ["concatenate_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
 tf_kernel_library(
     name = "reader_dataset_ops",
     srcs = ["reader_dataset_ops.cc"],
@@ -5533,10 +5721,13 @@ tf_kernel_library(
     deps = [
         ":batch_dataset_op",
         ":cache_dataset_ops",
+        ":concatenate_dataset_op",
         ":dense_to_sparse_batch_dataset_op",
         ":filter_dataset_op",
         ":flat_map_dataset_op",
         ":group_by_window_dataset_op",
+        ":ignore_errors_dataset_op",
+        ":interleave_dataset_op",
         ":iterator_ops",
         ":map_dataset_op",
         ":padded_batch_dataset_op",
diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc
index c8f12f91a6cb7423226dad5cc9a9903c51f6b9ba..37976f71837cb365cd9d232c7c1e102ec5bfe338 100644
--- a/tensorflow/core/kernels/adjust_contrast_op.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op.cc
@@ -31,6 +31,9 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif
 
 // AdjustContrastOp is deprecated as of GraphDef version >= 2
 
@@ -410,4 +413,25 @@ REGISTER_KERNEL_BUILDER(Name("AdjustContrastv2").Device(DEVICE_GPU),
                         AdjustContrastOpv2<GPUDevice>);
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+template <>
+class AdjustContrastOpv2<SYCLDevice> : public AdjustContrastOpV2Base {
+ public:
+  explicit AdjustContrastOpv2(OpKernelConstruction* context)
+      : AdjustContrastOpV2Base(context) {}
+
+  void DoCompute(OpKernelContext* context,
+                 const ComputeOptions& options) override {
+    const int64 shape[4] = {options.batch, options.height, options.width,
+                            options.channels};
+    functor::AdjustContrastv2<SYCLDevice>()(
+        context->eigen_device<SYCLDevice>(),
+        options.input->shaped<float, 4>(shape), options.factor->scalar<float>(),
+        options.output->shaped<float, 4>(shape));
+  }
+};
+REGISTER_KERNEL_BUILDER(Name("AdjustContrastv2").Device(DEVICE_SYCL),
+                        AdjustContrastOpv2<SYCLDevice>);
+#endif  // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc b/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc
index ffd47406eb639fea61f1a93920aa6885c4819d88..c485f148448c9e524f8cafa145d2a32456b56fef 100644
--- a/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc
@@ -56,6 +56,11 @@ static Graph* BM_AdjustContrast(int batches, int width, int height) {
 // BM_AdjustContrast_cpu_1_299_299     179084     340186  2181  751.9M items/s
 // BM_AdjustContrast_gpu_32_299_299     85276     123665  4189  2.9G items/s
 BM_AdjustContrastDev(cpu, 1, 299, 299);
+#if GOOGLE_CUDA
 BM_AdjustContrastDev(gpu, 32, 299, 299);
+#endif  // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_AdjustContrastDev(sycl, 32, 299, 299);
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/argmax_op.cc b/tensorflow/core/kernels/argmax_op.cc
index 4d16601b5b3efda1efb06e34806ee9685ff46040..49cd997fed544c221a2cd32598b050a02d271f86 100644
--- a/tensorflow/core/kernels/argmax_op.cc
+++ b/tensorflow/core/kernels/argmax_op.cc
@@ -40,7 +40,7 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-template <typename Device, typename T, typename ArgFunctor>
+template <typename Device, typename T, typename Tout, typename ArgFunctor>
 class ArgOp : public OpKernel {
  public:
   explicit ArgOp(OpKernelConstruction* context) : OpKernel(context) {}
@@ -76,11 +76,11 @@ class ArgOp : public OpKernel {
     Tensor* output = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
 
-#define HANDLE_DIM(NDIM)                                         \
-  case NDIM:                                                     \
-    ArgFunctor::Reduce##NDIM(context->eigen_device<Device>(),    \
-                             input.tensor<T, NDIM>(), axis,      \
-                             output->tensor<int64, NDIM - 1>()); \
+#define HANDLE_DIM(NDIM)                                        \
+  case NDIM:                                                    \
+    ArgFunctor::Reduce##NDIM(context->eigen_device<Device>(),   \
+                             input.tensor<T, NDIM>(), axis,     \
+                             output->tensor<Tout, NDIM - 1>()); \
     break;
 
     switch (input_dims) {
@@ -102,31 +102,47 @@ class ArgOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(ArgOp);
 };
 
-template <typename Device, typename T>
-class ArgMaxOp : public ArgOp<Device, T, functor::ArgMax<Device, T> > {
+template <typename Device, typename T, typename Tout>
+class ArgMaxOp
+    : public ArgOp<Device, T, Tout, functor::ArgMax<Device, T, Tout> > {
  public:
   explicit ArgMaxOp(OpKernelConstruction* context)
-      : ArgOp<Device, T, functor::ArgMax<Device, T> >(context) {}
+      : ArgOp<Device, T, Tout, functor::ArgMax<Device, T, Tout> >(context) {}
 };
 
-template <typename Device, typename T>
-class ArgMinOp : public ArgOp<Device, T, functor::ArgMin<Device, T> > {
+template <typename Device, typename T, typename Tout>
+class ArgMinOp
+    : public ArgOp<Device, T, Tout, functor::ArgMin<Device, T, Tout> > {
  public:
   explicit ArgMinOp(OpKernelConstruction* context)
-      : ArgOp<Device, T, functor::ArgMin<Device, T> >(context) {}
+      : ArgOp<Device, T, Tout, functor::ArgMin<Device, T, Tout> >(context) {}
 };
 
-#define REGISTER_ARGMAX(type)                            \
-  REGISTER_KERNEL_BUILDER(Name("ArgMax")                 \
-                              .Device(DEVICE_CPU)        \
-                              .TypeConstraint<type>("T") \
-                              .HostMemory("dimension"),  \
-                          ArgMaxOp<CPUDevice, type>);    \
-  REGISTER_KERNEL_BUILDER(Name("ArgMin")                 \
-                              .Device(DEVICE_CPU)        \
-                              .TypeConstraint<type>("T") \
-                              .HostMemory("dimension"),  \
-                          ArgMinOp<CPUDevice, type>);
+#define REGISTER_ARGMAX(type)                                       \
+  REGISTER_KERNEL_BUILDER(Name("ArgMax")                            \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .TypeConstraint<int64>("output_type") \
+                              .HostMemory("dimension"),             \
+                          ArgMaxOp<CPUDevice, type, int64>);        \
+  REGISTER_KERNEL_BUILDER(Name("ArgMin")                            \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .TypeConstraint<int64>("output_type") \
+                              .HostMemory("dimension"),             \
+                          ArgMinOp<CPUDevice, type, int64>);        \
+  REGISTER_KERNEL_BUILDER(Name("ArgMax")                            \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .TypeConstraint<int32>("output_type") \
+                              .HostMemory("dimension"),             \
+                          ArgMaxOp<CPUDevice, type, int32>);        \
+  REGISTER_KERNEL_BUILDER(Name("ArgMin")                            \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .TypeConstraint<int32>("output_type") \
+                              .HostMemory("dimension"),             \
+                          ArgMinOp<CPUDevice, type, int32>);
 
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_ARGMAX);
 
@@ -135,26 +151,33 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_ARGMAX);
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
 
-#define DECLARE_GPU_SPEC(T, Dims)                                              \
-  template <>                                                                  \
-  void ArgMax<GPUDevice, T>::Reduce##Dims(                                     \
-      const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input,         \
-      const int32 dimension, typename TTypes<int64, Dims - 1>::Tensor output); \
-  template <>                                                                  \
-  void ArgMin<GPUDevice, T>::Reduce##Dims(                                     \
-      const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input,         \
-      const int32 dimension, typename TTypes<int64, Dims - 1>::Tensor output);
-
-#define DECLARE_GPU_SPECS(T) \
-  DECLARE_GPU_SPEC(T, 1);    \
-  DECLARE_GPU_SPEC(T, 2);    \
-  DECLARE_GPU_SPEC(T, 3);    \
-  DECLARE_GPU_SPEC(T, 4);    \
-  DECLARE_GPU_SPEC(T, 5);
-
-#define DECLARE_GPU_CLASS(T)                   \
-  extern template struct ArgMax<GPUDevice, T>; \
-  extern template struct ArgMin<GPUDevice, T>;
+#define DECLARE_GPU_SPEC(T, Tout, Dims)                                       \
+  template <>                                                                 \
+  void ArgMax<GPUDevice, T, Tout>::Reduce##Dims(                              \
+      const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input,        \
+      const int32 dimension, typename TTypes<Tout, Dims - 1>::Tensor output); \
+  template <>                                                                 \
+  void ArgMin<GPUDevice, T, Tout>::Reduce##Dims(                              \
+      const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input,        \
+      const int32 dimension, typename TTypes<Tout, Dims - 1>::Tensor output);
+
+#define DECLARE_GPU_SPECS(T)     \
+  DECLARE_GPU_SPEC(T, int64, 1); \
+  DECLARE_GPU_SPEC(T, int64, 2); \
+  DECLARE_GPU_SPEC(T, int64, 3); \
+  DECLARE_GPU_SPEC(T, int64, 4); \
+  DECLARE_GPU_SPEC(T, int64, 5); \
+  DECLARE_GPU_SPEC(T, int32, 1); \
+  DECLARE_GPU_SPEC(T, int32, 2); \
+  DECLARE_GPU_SPEC(T, int32, 3); \
+  DECLARE_GPU_SPEC(T, int32, 4); \
+  DECLARE_GPU_SPEC(T, int32, 5);
+
+#define DECLARE_GPU_CLASS(T)                          \
+  extern template struct ArgMax<GPUDevice, T, int64>; \
+  extern template struct ArgMin<GPUDevice, T, int64>; \
+  extern template struct ArgMax<GPUDevice, T, int32>; \
+  extern template struct ArgMin<GPUDevice, T, int32>;
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_CLASS);
@@ -165,19 +188,35 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_CLASS);
 }  // namespace functor
 
 // Registration of the GPU implementations.
-#define REGISTER_ARGMAX_GPU(type)                            \
-  REGISTER_KERNEL_BUILDER(Name("ArgMax")                     \
-                              .Device(DEVICE_GPU)            \
-                              .TypeConstraint<type>("T")     \
-                              .TypeConstraint<int32>("Tidx") \
-                              .HostMemory("dimension"),      \
-                          ArgMaxOp<GPUDevice, type>);        \
-  REGISTER_KERNEL_BUILDER(Name("ArgMin")                     \
-                              .Device(DEVICE_GPU)            \
-                              .TypeConstraint<type>("T")     \
-                              .TypeConstraint<int32>("Tidx") \
-                              .HostMemory("dimension"),      \
-                          ArgMinOp<GPUDevice, type>);
+#define REGISTER_ARGMAX_GPU(type)                                   \
+  REGISTER_KERNEL_BUILDER(Name("ArgMax")                            \
+                              .Device(DEVICE_GPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .TypeConstraint<int64>("output_type") \
+                              .TypeConstraint<int32>("Tidx")        \
+                              .HostMemory("dimension"),             \
+                          ArgMaxOp<GPUDevice, type, int64>);        \
+  REGISTER_KERNEL_BUILDER(Name("ArgMin")                            \
+                              .Device(DEVICE_GPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .TypeConstraint<int64>("output_type") \
+                              .TypeConstraint<int32>("Tidx")        \
+                              .HostMemory("dimension"),             \
+                          ArgMinOp<GPUDevice, type, int64>);        \
+  REGISTER_KERNEL_BUILDER(Name("ArgMax")                            \
+                              .Device(DEVICE_GPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .TypeConstraint<int32>("output_type") \
+                              .TypeConstraint<int32>("Tidx")        \
+                              .HostMemory("dimension"),             \
+                          ArgMaxOp<GPUDevice, type, int32>);        \
+  REGISTER_KERNEL_BUILDER(Name("ArgMin")                            \
+                              .Device(DEVICE_GPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .TypeConstraint<int32>("output_type") \
+                              .TypeConstraint<int32>("Tidx")        \
+                              .HostMemory("dimension"),             \
+                          ArgMinOp<GPUDevice, type, int32>);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_ARGMAX_GPU);
 
diff --git a/tensorflow/core/kernels/argmax_op.h b/tensorflow/core/kernels/argmax_op.h
index 850a0390bc87a214c63020231944cad9f4d6d687..b8bc41e089f27324be0a7d14f10d4ee8be9ae570 100644
--- a/tensorflow/core/kernels/argmax_op.h
+++ b/tensorflow/core/kernels/argmax_op.h
@@ -25,14 +25,13 @@ namespace tensorflow {
 
 namespace functor {
 
-template <typename Device, typename T>
+template <typename Device, typename T, typename Tout>
 struct ArgMax {
-#define DECLARE_COMPUTE_SPEC(Dims)                                     \
-  EIGEN_ALWAYS_INLINE static void Reduce##Dims(                        \
-      const Device& d, typename TTypes<T, Dims>::ConstTensor input,    \
-      const int32 dimension,                                           \
-      typename TTypes<int64, Dims - 1>::Tensor output) {               \
-    output.device(d) = input.argmax(dimension).template cast<int64>(); \
+#define DECLARE_COMPUTE_SPEC(Dims)                                             \
+  EIGEN_ALWAYS_INLINE static void Reduce##Dims(                                \
+      const Device& d, typename TTypes<T, Dims>::ConstTensor input,            \
+      const int32 dimension, typename TTypes<Tout, Dims - 1>::Tensor output) { \
+    output.device(d) = input.argmax(dimension).template cast<Tout>();          \
   }
 
   DECLARE_COMPUTE_SPEC(1);
@@ -44,14 +43,13 @@ struct ArgMax {
 #undef DECLARE_COMPUTE_SPEC
 };
 
-template <typename Device, typename T>
+template <typename Device, typename T, typename Tout>
 struct ArgMin {
-#define DECLARE_COMPUTE_SPEC(Dims)                                     \
-  EIGEN_ALWAYS_INLINE static void Reduce##Dims(                        \
-      const Device& d, typename TTypes<T, Dims>::ConstTensor input,    \
-      const int32 dimension,                                           \
-      typename TTypes<int64, Dims - 1>::Tensor output) {               \
-    output.device(d) = input.argmin(dimension).template cast<int64>(); \
+#define DECLARE_COMPUTE_SPEC(Dims)                                             \
+  EIGEN_ALWAYS_INLINE static void Reduce##Dims(                                \
+      const Device& d, typename TTypes<T, Dims>::ConstTensor input,            \
+      const int32 dimension, typename TTypes<Tout, Dims - 1>::Tensor output) { \
+    output.device(d) = input.argmin(dimension).template cast<Tout>();          \
   }
 
   DECLARE_COMPUTE_SPEC(1);
diff --git a/tensorflow/core/kernels/argmax_op_gpu.cu.cc b/tensorflow/core/kernels/argmax_op_gpu.cu.cc
index 2915da4c08eee4bdbe7cdaa0610a9d921dcbd208..52e523003b401786676f5e6b3821bb2ae3215faf 100644
--- a/tensorflow/core/kernels/argmax_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/argmax_op_gpu.cu.cc
@@ -24,9 +24,11 @@ namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-#define DEFINE_GPU_SPEC(T)                       \
-  template struct functor::ArgMax<GPUDevice, T>; \
-  template struct functor::ArgMin<GPUDevice, T>;
+#define DEFINE_GPU_SPEC(T)                              \
+  template struct functor::ArgMax<GPUDevice, T, int64>; \
+  template struct functor::ArgMin<GPUDevice, T, int64>; \
+  template struct functor::ArgMax<GPUDevice, T, int32>; \
+  template struct functor::ArgMin<GPUDevice, T, int32>;
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC);
 
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index 83633a1dd98f172aab66088826282b28a8fb217b..3b880a963538b05789e73a9100ec5d5472d3c249 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -413,7 +413,7 @@ class Barrier : public ResourceBase {
     }
     queue_closed_ = true;
     if (cancel_pending_enqueues) queue_cancelled_ = true;
-    if (!ready_queue_->closed()) {
+    if (!ready_queue_->is_closed()) {
       ready_queue_->Close(ctx, cancel_pending_enqueues, callback);
     }
   }
diff --git a/tensorflow/core/kernels/batch_dataset_op.cc b/tensorflow/core/kernels/batch_dataset_op.cc
index 67eff44a5d48e79639686a47f877cfa1268e167d..64a8e83f842b72739fc026ca1adf178207a66e56 100644
--- a/tensorflow/core/kernels/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/batch_dataset_op.cc
@@ -31,7 +31,7 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
-    int64 batch_size;
+    int64 batch_size = 0;
     OP_REQUIRES_OK(ctx,
                    ParseScalarArgument<int64>(ctx, "batch_size", &batch_size));
     OP_REQUIRES(
diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h
index 1ee0796ac1472d76ec5a31582a5d678a0ea52fcf..6309e4a4dc6f3ae094e5a310ca237474afeeca14 100644
--- a/tensorflow/core/kernels/cast_op_impl.h
+++ b/tensorflow/core/kernels/cast_op_impl.h
@@ -45,20 +45,23 @@ struct CastFunctor<Eigen::SyclDevice, O, I> {
 
 }  // namespace functor
 
-#define CURRY_TYPES3(FN, arg0, arg1)   \
-  FN(arg0, arg1, bool);                \
-  FN(arg0, arg1, uint8);               \
-  FN(arg0, arg1, int8);                \
-  FN(arg0, arg1, uint16);              \
-  FN(arg0, arg1, int16);               \
-  FN(arg0, arg1, int32);               \
-  FN(arg0, arg1, int64);               \
-  FN(arg0, arg1, Eigen::half);         \
-  FN(arg0, arg1, float);               \
-  FN(arg0, arg1, double);              \
-  FN(arg0, arg1, std::complex<float>); \
+#define CURRY_TYPES3_NO_HALF(FN, arg0, arg1)   \
+  FN(arg0, arg1, bool);                        \
+  FN(arg0, arg1, uint8);                       \
+  FN(arg0, arg1, int8);                        \
+  FN(arg0, arg1, uint16);                      \
+  FN(arg0, arg1, int16);                       \
+  FN(arg0, arg1, int32);                       \
+  FN(arg0, arg1, int64);                       \
+  FN(arg0, arg1, float);                       \
+  FN(arg0, arg1, double);                      \
+  FN(arg0, arg1, std::complex<float>);         \
   FN(arg0, arg1, std::complex<double>)
 
+#define CURRY_TYPES3(FN, arg0, arg1)           \
+  CURRY_TYPES3_NO_HALF(FN, arg0, arg1)         \
+  FN(arg0, arg1, Eigen::half);
+
 #define CAST_CASE(DEVICE, IN, OUT)                                         \
   if (DataTypeToEnum<OUT>::value == dst_dtype) {                           \
     return [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {      \
@@ -154,6 +157,15 @@ GetGpuCastFromBfloat(DataType dst_dtype);
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetSyclCastFromBool(DataType dst_dtype);
 
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromUint8(DataType dst_dtype);
+
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromUint16(DataType dst_dtype);
+
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromInt16(DataType dst_dtype);
+
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetSyclCastFromInt32(DataType dst_dtype);
 
@@ -165,10 +177,8 @@ GetSyclCastFromFloat(DataType dst_dtype);
 
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetSyclCastFromDouble(DataType dst_dtype);
-
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
 
 #endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_
-
diff --git a/tensorflow/core/kernels/cast_op_impl_bool.cc b/tensorflow/core/kernels/cast_op_impl_bool.cc
index a13f16300928ff62c3d8f80e386ee428df04bad9..5cd63f2458d2a01ec3d2c1621e6f86ecc6f5347f 100644
--- a/tensorflow/core/kernels/cast_op_impl_bool.cc
+++ b/tensorflow/core/kernels/cast_op_impl_bool.cc
@@ -38,10 +38,9 @@ GetGpuCastFromBool(DataType dst_dtype) {
 typedef Eigen::SyclDevice SYCLDevice;
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetSyclCastFromBool(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, SYCLDevice, bool);
+  CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, bool);
   return nullptr;
 }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
-
diff --git a/tensorflow/core/kernels/cast_op_impl_double.cc b/tensorflow/core/kernels/cast_op_impl_double.cc
index fdc8d51158f4261a7f72f4c6c83691d67e96bfc6..1203f066a2db366d38ae99c5ff1e1f385979a8af 100644
--- a/tensorflow/core/kernels/cast_op_impl_double.cc
+++ b/tensorflow/core/kernels/cast_op_impl_double.cc
@@ -38,10 +38,9 @@ GetGpuCastFromDouble(DataType dst_dtype) {
 typedef Eigen::SyclDevice SYCLDevice;
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetSyclCastFromDouble(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, SYCLDevice, double);
+  CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, double);
   return nullptr;
 }
-#endif // TENSORFLOW_USE_SYC
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
-
diff --git a/tensorflow/core/kernels/cast_op_impl_float.cc b/tensorflow/core/kernels/cast_op_impl_float.cc
index 1241dcd8f2e0ebabfead16a014bab1c38539d736..2ff9af21f2413b16e1d38a64594e0dcf89e14bcb 100644
--- a/tensorflow/core/kernels/cast_op_impl_float.cc
+++ b/tensorflow/core/kernels/cast_op_impl_float.cc
@@ -53,10 +53,9 @@ GetGpuCastFromFloat(DataType dst_dtype) {
 typedef Eigen::SyclDevice SYCLDevice;
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetSyclCastFromFloat(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, SYCLDevice, float);
+  CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, float);
   return nullptr;
 }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
-
diff --git a/tensorflow/core/kernels/cast_op_impl_int16.cc b/tensorflow/core/kernels/cast_op_impl_int16.cc
index 3c2d6185e3eb56887623f2749a58a87c3435a4e9..f12d852e957550e17521a4dc5b70d3807dacecd3 100644
--- a/tensorflow/core/kernels/cast_op_impl_int16.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int16.cc
@@ -34,4 +34,13 @@ GetGpuCastFromInt16(DataType dst_dtype) {
 }
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromInt16(DataType dst_dtype) {
+  CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, int16);
+  return nullptr;
+}
+#endif  // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cast_op_impl_int32.cc b/tensorflow/core/kernels/cast_op_impl_int32.cc
index 69ed760455885ab9ea7cbf8d08255a1784e10125..2a4b27a12dcba71f904f980c13d82fbc0dfc3ae2 100644
--- a/tensorflow/core/kernels/cast_op_impl_int32.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int32.cc
@@ -38,9 +38,9 @@ GetGpuCastFromInt32(DataType dst_dtype) {
 typedef Eigen::SyclDevice SYCLDevice;
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetSyclCastFromInt32(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, SYCLDevice, int32);
+  CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, int32);
   return nullptr;
 }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cast_op_impl_int64.cc b/tensorflow/core/kernels/cast_op_impl_int64.cc
index 7a8363ca39c26ed3c70406e2dd7e5a09fd4a9f6a..065defabbba6c926640b01eb68e4c0b20a8e241d 100644
--- a/tensorflow/core/kernels/cast_op_impl_int64.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int64.cc
@@ -38,9 +38,9 @@ GetGpuCastFromInt64(DataType dst_dtype) {
 typedef Eigen::SyclDevice SYCLDevice;
 std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
 GetSyclCastFromInt64(DataType dst_dtype) {
-  CURRY_TYPES3(CAST_CASE, SYCLDevice, int64);
+  CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, int64);
   return nullptr;
 }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cast_op_impl_int8.cc b/tensorflow/core/kernels/cast_op_impl_int8.cc
index 62971fa95cbdc0cb2fc7b85e4cdbf9de5556cf19..8d678c47335f6f985c8e93bb72f61c95fbda834b 100644
--- a/tensorflow/core/kernels/cast_op_impl_int8.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int8.cc
@@ -34,4 +34,13 @@ GetGpuCastFromInt8(DataType dst_dtype) {
 }
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromInt8(DataType dst_dtype) {
+  CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, int8);
+  return nullptr;
+}
+#endif  // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cast_op_impl_uint16.cc b/tensorflow/core/kernels/cast_op_impl_uint16.cc
index 529d9758f07480295bb4bb6dd4e2f9a0daa38b4d..c917aaf7bde6eccf9acb556152ee854c897143fa 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint16.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint16.cc
@@ -34,4 +34,13 @@ GetGpuCastFromUint16(DataType dst_dtype) {
 }
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromUint16(DataType dst_dtype) {
+  CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, uint16);
+  return nullptr;
+}
+#endif  // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cast_op_impl_uint8.cc b/tensorflow/core/kernels/cast_op_impl_uint8.cc
index 1a5025b05470a3c68a4e7306b344912c95dfeacd..377c8ca9536d69a49364e8f5f2b6ffb969ce234e 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint8.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint8.cc
@@ -34,4 +34,13 @@ GetGpuCastFromUint8(DataType dst_dtype) {
 }
 #endif  // GOOGLE_CUDA
 
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromUint8(DataType dst_dtype) {
+  CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, uint8);
+  return nullptr;
+}
+#endif  // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/colorspace_op.cc b/tensorflow/core/kernels/colorspace_op.cc
index d65a34fd735ef43b885b16e455a6ec9c7f2e4fcd..ba100b32e7d8cfcd6a0138a09062910743d6d2eb 100644
--- a/tensorflow/core/kernels/colorspace_op.cc
+++ b/tensorflow/core/kernels/colorspace_op.cc
@@ -35,6 +35,9 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif
 
 template <typename Device, typename T>
 class RGBToHSVOp : public OpKernel {
@@ -146,4 +149,16 @@ TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL(T)                                       \
+  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_SYCL) \
+                              .TypeConstraint<T>("T"),         \
+                          RGBToHSVOp<SYCLDevice, T>);          \
+  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_SYCL) \
+                              .TypeConstraint<T>("T"),         \
+                          HSVToRGBOp<SYCLDevice, T>);
+TF_CALL_float(REGISTER_SYCL);
+TF_CALL_double(REGISTER_SYCL);
+#endif
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/compare_and_bitpack_op.cc b/tensorflow/core/kernels/compare_and_bitpack_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9f626a274a4d36b568cc6e25af2e572a35ae3694
--- /dev/null
+++ b/tensorflow/core/kernels/compare_and_bitpack_op.cc
@@ -0,0 +1,185 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/math_ops.cc
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/compare_and_bitpack_op.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class CompareAndBitpackOp : public OpKernel {
+ public:
+  explicit CompareAndBitpackOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* c) override {
+    const Tensor& input_t = c->input(0);
+    const Tensor& threshold_t = c->input(1);
+    OP_REQUIRES(
+        c, TensorShapeUtils::IsScalar(threshold_t.shape()),
+        errors::InvalidArgument("Compare must be a scalar, but saw shape: ",
+                                threshold_t.shape().DebugString()));
+    const TensorShape& input_shape = input_t.shape();
+    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input_shape),
+                errors::InvalidArgument(
+                    "Input should be at least a vector, but saw a scalar."));
+    OP_REQUIRES(c, input_shape.dim_size(input_shape.dims() - 1) % 8 == 0,
+                errors::InvalidArgument(
+                    "Inner dimension of input should be "
+                    "divisible by ",
+                    8, ", but saw shape: ", input_shape.DebugString()));
+
+    TensorShape output_shape = input_shape;
+    int rank = input_shape.dims();
+    output_shape.set_dim(rank - 1, input_shape.dim_size(rank - 1) / 8);
+
+    Tensor* output_t;
+    OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output_t));
+
+    auto input = input_t.flat_inner_dims<T>();
+    auto threshold = threshold_t.scalar<T>();
+    auto output = output_t->flat_inner_dims<uint8>();
+
+    functor::CompareAndBitpack<Device, T> func;
+    func(c, input, threshold, output);
+  }
+};
+
+#define REGISTER_COMPARE_AND_BITPACK(type)                                    \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("CompareAndBitpack").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      CompareAndBitpackOp<CPUDevice, type>);
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_COMPARE_AND_BITPACK);
+TF_CALL_bool(REGISTER_COMPARE_AND_BITPACK);
+
+#undef REGISTER_COMPARE_AND_BITPACK
+
+namespace functor {
+
+template <typename T, class = void, class = void>
+struct ComputeShard {
+  static EIGEN_STRONG_INLINE void Compute(typename TTypes<T>::ConstMatrix input,
+                                          typename TTypes<uint8>::Matrix output,
+                                          const T& thresh, int64 start,
+                                          int64 limit) {
+    for (int64 i = start; i < limit; ++i) {
+      uint8* out = output.data() + i;
+      const T* block = input.data() + 8 * i;
+      *out = ((((block[0] > thresh) << 7)) | (((block[1] > thresh) << 6)) |
+              (((block[2] > thresh) << 5)) | (((block[3] > thresh) << 4)) |
+              (((block[4] > thresh) << 3)) | (((block[5] > thresh) << 2)) |
+              (((block[6] > thresh) << 1)) | (((block[7] > thresh))));
+    }
+  }
+};
+
+// Specialization for bool on systems where sizeof(bool) == 1.
+template <typename T>
+struct ComputeShard<T,
+                    typename std::enable_if<std::is_same<T, bool>::value>::type,
+                    typename std::enable_if<sizeof(T) == 1>::type> {
+  static EIGEN_STRONG_INLINE void Compute(
+      typename TTypes<bool>::ConstMatrix input,
+      typename TTypes<uint8>::Matrix output, bool /*thresh*/, int64 start,
+      int64 limit) {
+    // NOTE(ebrevdo): This assumes memory is little-endian.
+    for (int64 i = start; i < limit; ++i) {
+      uint8* out = output.data() + i;
+      const int64 block = *reinterpret_cast<const int64*>(input.data() + 8 * i);
+      *out =
+          ((((block & (1LL << (7 * 8))) >> (7 * 8 - 0))) |
+           (((block & (1LL << (6 * 8))) >> (6 * 8 - 1))) |
+           (((block & (1LL << (5 * 8))) >> (5 * 8 - 2))) |
+           (((block & (1LL << (4 * 8))) >> (4 * 8 - 3))) |
+           (((block & (1LL << (3 * 8))) >> (3 * 8 - 4))) |
+           (((block & (1LL << (2 * 8))) >> (2 * 8 - 5))) |
+           (((block & (1LL << 8)) >> (1 * 8 - 6))) | (((block & (1LL)) << 7)));
+    }
+  }
+};
+
+template <typename T>
+struct CompareAndBitpack<CPUDevice, T> {
+  void operator()(OpKernelContext* c, typename TTypes<T>::ConstMatrix input,
+                  typename TTypes<T>::ConstScalar threshold,
+                  TTypes<uint8>::Matrix output) {
+    const T thresh = threshold();
+    auto shard = [&, thresh](int64 start, int64 limit) {
+      ComputeShard<T>::Compute(input, output, thresh, start, limit);
+    };
+    int64 total_shards = output.size();  // Approximate cmp as an add and
+                                         // bitwise-or + shift as an add.
+    const double total_cost = 8 * (Eigen::TensorOpCost::AddCost<T>() +
+                                   Eigen::TensorOpCost::AddCost<uint8>());
+    const int64 shard_cost = (total_cost >= static_cast<double>(kint64max))
+                                 ? kint64max
+                                 : static_cast<int64>(total_cost);
+
+    auto worker_threads = *(c->device()->tensorflow_cpu_worker_threads());
+    Shard(worker_threads.num_threads, worker_threads.workers, total_shards,
+          shard_cost, shard);
+  }
+};
+
+}  // namespace functor
+
+#if GOOGLE_CUDA
+
+#define REGISTER_COMPARE_AND_BITPACK(type)                                    \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("CompareAndBitpack").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+      CompareAndBitpackOp<GPUDevice, type>);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_COMPARE_AND_BITPACK);
+TF_CALL_bool(REGISTER_COMPARE_AND_BITPACK);
+
+#undef REGISTER_COMPARE_AND_BITPACK
+
+namespace functor {
+
+#define DECLARE_GPU_SPEC(T)                                      \
+  template <>                                                    \
+  void CompareAndBitpack<GPUDevice, T>::operator()(              \
+      OpKernelContext* c, typename TTypes<T>::ConstMatrix input, \
+      typename TTypes<T>::ConstScalar threshold,                 \
+      TTypes<uint8>::Matrix output);                             \
+  extern template struct CompareAndBitpack<GPUDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC)
+TF_CALL_bool(DECLARE_GPU_SPEC)
+
+#undef DECLARE_GPU_SPEC
+
+}  // namespace functor
+
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/compare_and_bitpack_op.h b/tensorflow/core/kernels/compare_and_bitpack_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e020249c106f28a8aada2cef6c31c6796b6d332
--- /dev/null
+++ b/tensorflow/core/kernels/compare_and_bitpack_op.h
@@ -0,0 +1,42 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+struct CompareAndBitpack {
+  void operator()(OpKernelContext* c, typename TTypes<T>::ConstMatrix input,
+                  typename TTypes<T>::ConstScalar threshold,
+                  TTypes<uint8>::Matrix output);
+};
+
+}  // namespace functor
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_
diff --git a/tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc b/tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..345405e3fe6f89c5f6bbf0721cf1d6e25b6077d1
--- /dev/null
+++ b/tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc
@@ -0,0 +1,141 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/compare_and_bitpack_op.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+template <typename T>
+__global__ void CompareAndBitpackKernel(const int size, const T* threshold,
+                                        const T* input, uint8* output) {
+  // TODO(ebrevdo): Erich said: to get a better memory access pattern
+  // you could have 8 threads load this data and do a comparison, then
+  // use the ballot instruction to combine the values from each thread
+  // in the warp in one instruction (so each thread will have the
+  // result for 4 blocks) followed by an appropriate shift and mask to
+  // get the 8-bits of interest.
+  const T thresh = ldg(threshold);
+  CUDA_1D_KERNEL_LOOP(i, size) {
+    const T* block = input + 8 * i;
+    output[i] =
+        ((((ldg(block) > thresh) << 7)) | (((ldg(block + 1) > thresh) << 6)) |
+         (((ldg(block + 2) > thresh) << 5)) |
+         (((ldg(block + 3) > thresh) << 4)) |
+         (((ldg(block + 4) > thresh) << 3)) |
+         (((ldg(block + 5) > thresh) << 2)) |
+         (((ldg(block + 6) > thresh) << 1)) | (((ldg(block + 7) > thresh))));
+  }
+}
+
+template <>
+__global__ void CompareAndBitpackKernel<bool>(const int size,
+                                              const bool* threshold,
+                                              const bool* input,
+                                              uint8* output) {
+  // TODO(ebrevdo): Erich said: I think you could again have multiple
+  // threads work on one block and use the ballot instruction to the
+  // bit packing in one instruction.
+  CUDA_1D_KERNEL_LOOP(i, size) {
+    const int64 block = ldg(reinterpret_cast<const int64*>(input + 8 * i));
+    // NOTE(ebrevdo): This assumes memory is little-endian.
+    output[i] =
+        ((((block & (1LL << (7 * 8))) >> (7 * 8 - 0))) |
+         (((block & (1LL << (6 * 8))) >> (6 * 8 - 1))) |
+         (((block & (1LL << (5 * 8))) >> (5 * 8 - 2))) |
+         (((block & (1LL << (4 * 8))) >> (4 * 8 - 3))) |
+         (((block & (1LL << (3 * 8))) >> (3 * 8 - 4))) |
+         (((block & (1LL << (2 * 8))) >> (2 * 8 - 5))) |
+         (((block & (1LL << 8)) >> (1 * 8 - 6))) | (((block & (1LL)) << 7)));
+  }
+}
+
+template <>
+__global__ void CompareAndBitpackKernel<float>(const int size,
+                                               const float* threshold,
+                                               const float* input,
+                                               uint8* output) {
+  const float thresh = ldg(threshold);
+  CUDA_1D_KERNEL_LOOP(i, size) {
+    const float4 block0 = ldg(reinterpret_cast<const float4*>(input + 8 * i));
+    const float4 block1 =
+        ldg(reinterpret_cast<const float4*>(input + 8 * i + 4));
+    output[i] = ((((block0.x > thresh) << 7)) | (((block0.y > thresh) << 6)) |
+                 (((block0.z > thresh) << 5)) | (((block0.w > thresh) << 4)) |
+                 (((block1.x > thresh) << 3)) | (((block1.y > thresh) << 2)) |
+                 (((block1.z > thresh) << 1)) | (((block1.w > thresh))));
+  }
+}
+
+template <>
+__global__ void CompareAndBitpackKernel<double>(const int size,
+                                                const double* threshold,
+                                                const double* input,
+                                                uint8* output) {
+  const double thresh = ldg(threshold);
+  CUDA_1D_KERNEL_LOOP(i, size) {
+    const double2 block0 = ldg(reinterpret_cast<const double2*>(input + 8 * i));
+    const double2 block1 =
+        ldg(reinterpret_cast<const double2*>(input + 8 * i + 2));
+    const double2 block2 =
+        ldg(reinterpret_cast<const double2*>(input + 8 * i + 4));
+    const double2 block3 =
+        ldg(reinterpret_cast<const double2*>(input + 8 * i + 6));
+    output[i] = ((((block0.x > thresh) << 7)) | (((block0.y > thresh) << 6)) |
+                 (((block1.x > thresh) << 5)) | (((block1.y > thresh) << 4)) |
+                 (((block2.x > thresh) << 3)) | (((block2.y > thresh) << 2)) |
+                 (((block3.x > thresh) << 1)) | (((block3.y > thresh))));
+  }
+}
+
+#define DEFINE_GPU_SPECS(T)                                               \
+  template <>                                                             \
+  void CompareAndBitpack<GPUDevice, T>::operator()(                       \
+      OpKernelContext* c, typename TTypes<T>::ConstMatrix input,          \
+      typename TTypes<T>::ConstScalar threshold,                          \
+      TTypes<uint8>::Matrix output) {                                     \
+    const GPUDevice& d = c->eigen_device<GPUDevice>();                    \
+    int64 total_count = output.size();                                    \
+    CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);        \
+                                                                          \
+    CompareAndBitpackKernel<T>                                            \
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>( \
+            total_count, threshold.data(), input.data(), output.data());  \
+  }
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS)
+TF_CALL_bool(DEFINE_GPU_SPECS)
+
+#undef DECLARE_GPU_SPECS
+
+}  // namespace functor
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/concat_lib_gpu.cc b/tensorflow/core/kernels/concat_lib_gpu.cc
index cd0414ef4096a2c1e9bbb7b9d90412e5492aca28..5159cdaa6ec07784550cd50e3a71729268eff830 100644
--- a/tensorflow/core/kernels/concat_lib_gpu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu.cc
@@ -115,6 +115,7 @@ void ConcatGPU(
 TF_CALL_GPU_NUMBER_TYPES(REGISTER);
 TF_CALL_complex64(REGISTER);
 TF_CALL_complex128(REGISTER);
+TF_CALL_int64(REGISTER);
 REGISTER(bfloat16);
 
 #undef REGISTER
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index 3ed6241b7a746030d41f4e62cb60480587f48bea..f971637d5db498f78ed3d3c1ccd05462018b95fe 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -88,7 +88,8 @@ __global__ void concat_variable_kernel(
   // do an initial binary search and then scan linearly from there
   // works well when there are many small segments and when the
   // segments are much longer
-  IntType segment = gpu::upper_bound<IntType>(col_scan, num_inputs, gidx) - 1;
+  IntType segment =
+      cuda_helper::upper_bound<IntType>(col_scan, num_inputs, gidx) - 1;
 
   IntType curr_offset = col_scan[segment];
   IntType curr_segment = segment;
@@ -142,10 +143,10 @@ void ConcatGPUImpl(const Eigen::GpuDevice& gpu_device,
                                       output->dimension(0), gpu_device);
 
   if (fixed_size) {
-    concat_fixed_kernel<T, IntType><<<
-        config.block_count, config.thread_per_block, 0, gpu_device.stream()>>>(
-        input_ptrs, split_size, output->dimension(0), output->dimension(1),
-        output->data());
+    concat_fixed_kernel<T, IntType>
+        <<<config.block_count, config.thread_per_block, 0,
+           gpu_device.stream()>>>(input_ptrs, split_size, output->dimension(0),
+                                  output->dimension(1), output->data());
   } else {
     IntType smem_max = gpu_device.sharedMemPerBlock();
     IntType smem_usage = output_scan.size * sizeof(IntType);
@@ -155,17 +156,17 @@ void ConcatGPUImpl(const Eigen::GpuDevice& gpu_device,
     // 4096 inputs is a lot, most code will take the smem path
     const int32 kMaxSmemBytesPerformance = 16384;
     if (smem_usage < smem_max && smem_usage < kMaxSmemBytesPerformance)
-      concat_variable_kernel<
-          T, IntType, true><<<config.block_count, config.thread_per_block,
-                              smem_usage, gpu_device.stream()>>>(
-          input_ptrs, output_scan, output->dimension(0), output->dimension(1),
-          output->data());
+      concat_variable_kernel<T, IntType, true>
+          <<<config.block_count, config.thread_per_block, smem_usage,
+             gpu_device.stream()>>>(input_ptrs, output_scan,
+                                    output->dimension(0), output->dimension(1),
+                                    output->data());
     else
-      concat_variable_kernel<
-          T, IntType, false><<<config.block_count, config.thread_per_block, 0,
-                               gpu_device.stream()>>>(
-          input_ptrs, output_scan, output->dimension(0), output->dimension(1),
-          output->data());
+      concat_variable_kernel<T, IntType, false>
+          <<<config.block_count, config.thread_per_block, 0,
+             gpu_device.stream()>>>(input_ptrs, output_scan,
+                                    output->dimension(0), output->dimension(1),
+                                    output->data());
   }
 }
 
@@ -200,21 +201,25 @@ void ConcatGPUImpl(const Eigen::GpuDevice& gpu_device,
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT32);
 TF_CALL_complex64(REGISTER_GPUCONCAT32);
 TF_CALL_complex128(REGISTER_GPUCONCAT32);
+TF_CALL_int64(REGISTER_GPUCONCAT32);
 REGISTER_GPUCONCAT32(bfloat16);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT64);
 TF_CALL_complex64(REGISTER_GPUCONCAT64);
 TF_CALL_complex128(REGISTER_GPUCONCAT64);
+TF_CALL_int64(REGISTER_GPUCONCAT64);
 REGISTER_GPUCONCAT64(bfloat16);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU32);
 TF_CALL_complex64(REGISTER_GPU32);
 TF_CALL_complex128(REGISTER_GPU32);
+TF_CALL_int64(REGISTER_GPU32);
 REGISTER_GPU32(bfloat16);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU64);
 TF_CALL_complex64(REGISTER_GPU64);
 TF_CALL_complex128(REGISTER_GPU64);
+TF_CALL_int64(REGISTER_GPU64);
 REGISTER_GPU64(bfloat16);
 
 #undef REGISTER_GPUCONCAT32
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index e7848a7e2604c6a0a52d842f010afa3ded3116c0..01a744dc7ecdb72887f9b565b65414de508a17b1 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -195,6 +195,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 REGISTER_GPU(bfloat16);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU);
+TF_CALL_int64(REGISTER_GPU);
 #undef REGISTER_GPU
 
 // A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/concatenate_dataset_op.cc b/tensorflow/core/kernels/concatenate_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0a206e365353df5d6020a70bc69c119e66df9727
--- /dev/null
+++ b/tensorflow/core/kernels/concatenate_dataset_op.cc
@@ -0,0 +1,135 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
+ public:
+  explicit ConcatenateDatasetOp(OpKernelConstruction* ctx)
+      : BinaryDatasetOpKernel(ctx) {}
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase* to_concatenate, DatasetBase** output) override {
+    OP_REQUIRES(ctx, input->output_dtypes() == to_concatenate->output_dtypes(),
+                errors::InvalidArgument(
+                    "input dataset and dataset to concatenate"
+                    " have different output_types %s and %s",
+                    (DataTypeVectorString(input->output_dtypes()),
+                     DataTypeVectorString(to_concatenate->output_dtypes()))));
+    *output = new Dataset(input, to_concatenate);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    explicit Dataset(const DatasetBase* input,
+                     const DatasetBase* to_concatenate)
+        : input_(input), to_concatenate_(to_concatenate) {
+      input_->Ref();
+      to_concatenate_->Ref();
+
+      auto os_input = input->output_shapes();
+      auto os_concatenate = to_concatenate->output_shapes();
+      for (int i = 0; i < os_input.size(); i++) {
+        output_shapes_.push_back(
+            MostSpecificCompatibleShape(os_input[i], os_concatenate[i]));
+      }
+    }
+    ~Dataset() override {
+      input_->Unref();
+      to_concatenate_->Unref();
+    }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override { return "ConcatenateDatasetOp::Dataset"; }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset),
+            i_(0),
+            input_impl_(dataset->input_->MakeIterator()) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        while (i_ < 2) {
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
+          if (!*end_of_sequence) {
+            return Status::OK();
+          }
+          if (++i_ < 2) {
+            input_impl_ = dataset()->to_concatenate_->MakeIterator();
+          }
+        }
+        *end_of_sequence = true;
+        input_impl_.reset();
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      int64 i_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+    };
+
+    static PartialTensorShape MostSpecificCompatibleShape(
+        const PartialTensorShape& ts1, const PartialTensorShape& ts2) {
+      PartialTensorShape output_tensorshape;
+      if (ts1.dims() != ts2.dims() || ts1.unknown_rank() || ts2.unknown_rank())
+        return output_tensorshape;
+      auto dims1 = ts1.dim_sizes();
+      auto dims2 = ts2.dim_sizes();
+      for (int d = 0; d < ts1.dims(); d++) {
+        if (dims1[d] == dims2[d])
+          output_tensorshape.Concatenate(dims1[d]);
+        else
+          output_tensorshape.Concatenate(-1);
+      }
+      return output_tensorshape;
+    }
+
+    const DatasetBase* input_;
+    const DatasetBase* to_concatenate_;
+    std::vector<PartialTensorShape> output_shapes_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("ConcatenateDataset").Device(DEVICE_CPU),
+                        ConcatenateDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/constant_op.h b/tensorflow/core/kernels/constant_op.h
index 8531a7a4b4372935a91ad28124d741c5efefb764..b98153e3470d498121c7058b719206491e21cd13 100644
--- a/tensorflow/core/kernels/constant_op.h
+++ b/tensorflow/core/kernels/constant_op.h
@@ -56,7 +56,7 @@ class PlaceholderOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override;
 
  private:
-  TensorShape expected_shape_;
+  PartialTensorShape expected_shape_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index b268f8dbd2edfe2fafad8af14671296343ee5b7a..83ef4f01ca66a8014b017f19554323b68ddc4177 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -21,26 +21,12 @@ limitations under the License.
 #include <tuple>
 #include <unordered_map>
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/gpu_utils.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/hash/hash.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/stream_executor.h"
 
 namespace tensorflow {
 
-// TODO(zhengxq): move this to gpu_util.h. The use of such wrappers is wide
-// spread.
-template <typename T>
-inline perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
-                                                           uint64 size) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
-                                                size * sizeof(T));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
-  return typed;
-}
 
 // Get the Cudnn workspace limit from the environment variable, which is in MB.
 // Return the workspace memory limit in bytes. If no value is set, return the
@@ -56,12 +42,10 @@ class CudnnScratchAllocator : public perftools::gputools::ScratchAllocator {
   virtual ~CudnnScratchAllocator() {}
   CudnnScratchAllocator(int64 memory_limit, OpKernelContext* context)
       : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
-  virtual int64 GetMemoryLimitInBytes(
-      perftools::gputools::Stream* stream) override {
+  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
     return memory_limit_;
   }
-  virtual perftools::gputools::port::StatusOr<
-      perftools::gputools::DeviceMemory<uint8>>
+  perftools::gputools::port::StatusOr<perftools::gputools::DeviceMemory<uint8>>
   AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override {
     Tensor temporary_memory;
     if (byte_size > memory_limit_) {
@@ -185,112 +169,6 @@ class ConvParameters {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-// A helper class that looks up the best autotuned config from parameters.
-// Due to the noisy nature of autotune, especially with multiple devices, it
-// only accepts a config if its margin exceeds a threshold.
-// For the same shape configs, if a new best config matches the previous best,
-// they get promoted; otherwise, the winner gets demoted. This process stops
-// when the winner's score exceeds the threshold.
-// In a bad case when two configs are very close to each other and flips
-// back and forth randomly, the expected number of experiments before autotune
-// settles is O(threshold ^ 2). So we recommend that number of warmup runs
-// for any benchmarks.
-template <typename Parameters, typename Config>
-class AutoTuneMap {
- public:
-  bool Find(const Parameters& params, Config* config) const {
-    mutex_lock lock(mu_);
-    auto iter = params_config_map_.find(params);
-    if (iter == params_config_map_.end() ||
-        iter->second.score < min_score_threshold_) {
-      return false;
-    }
-    *config = iter->second.config;
-    return true;
-  }
-  void Insert(const ConvParameters& params, const Config& config) {
-    mutex_lock lock(mu_);
-    auto iter = params_config_map_.find(params);
-    int new_score = 0;
-    if (iter == params_config_map_.end()) {
-      // Create a new entry if params is new.
-      VLOG(1) << GetActionSummary("creates", params, config);
-      params_config_map_.insert(std::make_pair(params, ValueType{config, 1}));
-      new_score = 1;
-    } else if (iter->second.score < min_score_threshold_) {
-      DCHECK(iter->second.score > 0);
-      if (iter->second.config != config) {
-        // If it is different from the current winner, demotes the winner.
-        VLOG(1) << GetActionSummary("demotes", params, config);
-        new_score = --iter->second.score;
-        if (new_score <= 0) {
-          VLOG(1) << GetActionSummary("erases", params, config);
-          params_config_map_.erase(iter);
-        }
-      } else {
-        // If it is the same as the current winner, promotes the winner.
-        VLOG(1) << GetActionSummary("promotes", params, config);
-        new_score = ++iter->second.score;
-      }
-    }
-    if (new_score >= min_score_threshold_) {
-      VLOG(1) << GetActionSummary("accepts", params, config);
-    }
-  }
-
- private:
-  AutoTuneMap(const string& name) : name_(name) {
-    min_score_threshold_ = 1;
-    const char* threshold_str = getenv("TF_AUTOTUNE_THRESHOLD");
-    if (threshold_str != nullptr) {
-      strings::safe_strto32(threshold_str, &min_score_threshold_);
-    }
-    min_score_threshold_ = std::max(min_score_threshold_, 1);
-  }
-
-  template <class Group, class Params, class Cfg>
-  friend class AutoTuneSingleton;
-
-  struct Hasher {
-    std::size_t operator()(const Parameters& parameter) const {
-      return parameter.hash();
-    }
-  };
-
-  string GetActionSummary(StringPiece action, const Parameters& params,
-                          const Config& config) {
-    return strings::Printf("autotune_map %s %s: %s -> (%s)", name_.c_str(),
-                           action.ToString().c_str(), params.ToString().c_str(),
-                           config.ToString().c_str());
-  }
-
-  mutable mutex mu_;
-  struct ValueType {
-    Config config;
-    int32 score;
-  };
-  std::unordered_map<Parameters, ValueType, Hasher> params_config_map_
-      GUARDED_BY(mu_);
-  string name_;
-  int32 min_score_threshold_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(AutoTuneMap);
-};
-
-// A Singleton helper that manages the global autotune results by groups.
-// The caller specified arbitrary Group type that can distinguish between
-// different autotune results, even if their Parameters and Configs are the
-// same.
-template <class Group, typename Parameters, typename Config>
-class AutoTuneSingleton {
- public:
-  typedef AutoTuneMap<Parameters, Config> AutoTuneType;
-  static AutoTuneType* GetInstance() {
-    static AutoTuneType* instance = new AutoTuneType(Group::name());
-    return instance;
-  }
-};
-
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc
index 914627992b614674d35ec37fa202db9e1e127ff4..358ff7d0e04071e67846f5d4f57cdfb61b076c87 100644
--- a/tensorflow/core/kernels/cuda_solvers.cc
+++ b/tensorflow/core/kernels/cuda_solvers.cc
@@ -252,6 +252,37 @@ static inline Status PotrfImpl(BufSizeFnT bufsize, SolverFnT solver,
 
 TF_CALL_LAPACK_TYPES(POTRF_INSTANCE);
 
+template <typename Scalar, typename SolverFnT>
+static inline Status GeamImpl(SolverFnT solver, cublasHandle_t cublas_handle,
+                              cublasOperation_t transa,
+                              cublasOperation_t transb, int m, int n,
+                              const Scalar* alpha, /* host or device pointer */
+                              const Scalar* A, int lda,
+                              const Scalar* beta, /* host or device pointer */
+                              const Scalar* B, int ldb, Scalar* C, int ldc) {
+  using CudaScalar = typename CUDAComplexT<Scalar>::type;
+  TF_RETURN_IF_CUBLAS_ERROR(
+      solver(cublas_handle, transa, transb, m, n, (const CudaScalar*)alpha,
+             (const CudaScalar*)A, lda, (const CudaScalar*)beta,
+             (const CudaScalar*)B, ldb, (CudaScalar*)C, ldc));
+  return Status::OK();
+}
+
+#define GEAM_INSTANCE(Scalar, lapack_prefix)                              \
+  template <>                                                             \
+  Status CudaSolver::Geam<Scalar>(                                        \
+      cublasOperation_t transa, cublasOperation_t transb, int m, int n,   \
+      const Scalar* alpha, /* host or device pointer */                   \
+      const Scalar* A, int lda,                                           \
+      const Scalar* beta, /* host or device pointer */                    \
+      const Scalar* B, int ldb, Scalar* C, int ldc) const {               \
+    return GeamImpl(BLAS_SOLVER_FN(geam, lapack_prefix), cublas_handle_,  \
+                    transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, \
+                    ldc);                                                 \
+  }
+
+TF_CALL_LAPACK_TYPES(GEAM_INSTANCE);
+
 //=============================================================================
 // Wrappers of cuBlas computational methods begin here.
 //
diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h
index 5d1c807e66eaaf1adb5cb4272b875d7e44effdb8..26a169b93b3e74374ca70a5229139ab34be6c815 100644
--- a/tensorflow/core/kernels/cuda_solvers.h
+++ b/tensorflow/core/kernels/cuda_solvers.h
@@ -116,9 +116,10 @@ class CudaSolver {
   // Launches a memcpy of solver status data specified by dev_lapack_info from
   // device to the host, and asynchronously invokes the given callback when the
   // copy is complete. The first Status argument to the callback will be
-  // Status::OK if all lapack infos retrieved are zero, otherwise an error status
-  // is given. The second argument contains a host-side copy of the entire set
-  // of infos retrieved, and can be used for generating detailed error messages.
+  // Status::OK if all lapack infos retrieved are zero, otherwise an error
+  // status is given. The second argument contains a host-side copy of the
+  // entire set of infos retrieved, and can be used for generating detailed
+  // error messages.
   Status CopyLapackInfoToHostAsync(
       const std::vector<DeviceLapackInfo>& dev_lapack_info,
       std::function<void(const Status&, const std::vector<HostLapackInfo>&)>
@@ -131,8 +132,20 @@ class CudaSolver {
   // to those in cuSolverDN and cuBlas, which follow the naming convention in
   // LAPACK see, e.g., http://docs.nvidia.com/cuda/cusolver/#naming-convention
 
+  // This function performs the matrix-matrix addition/transposition
+  //   C = alpha * op(A) + beta * op(B).
+  // Returns Status::OK() if the kernel was launched successfully.  See:
+  // http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-geam
+  // NOTE(ebrevdo): Does not support in-place transpose of non-square matrices.
+  template <typename Scalar>
+  Status Geam(cublasOperation_t transa, cublasOperation_t transb, int m, int n,
+              const Scalar* alpha, /* host or device pointer */
+              const Scalar* A, int lda,
+              const Scalar* beta, /* host or device pointer */
+              const Scalar* B, int ldb, Scalar* C, int ldc) const;
+
   // Computes the Cholesky factorization A = L * L^T for a single matrix.
-  // Returns Status::OK(), if the kernel was launched successfully. See:
+  // Returns Status::OK() if the kernel was launched successfully. See:
   // http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-potrf
   template <typename Scalar>
   Status Potrf(cublasFillMode_t uplo, int n, Scalar* dev_A, int lda,
diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc
index 8cf1eac41ede6eaa93d1e44d2e02e32545a68e43..5fd38d9dc25c13e20766d1fed86c3f7af9912905 100644
--- a/tensorflow/core/kernels/cwise_op_abs.cc
+++ b/tensorflow/core/kernels/cwise_op_abs.cc
@@ -22,17 +22,6 @@ REGISTER5(UnaryOp, CPU, "Abs", functor::abs, float, Eigen::half, double, int32,
 REGISTER2(UnaryOp, CPU, "ComplexAbs", functor::abs, complex64, complex128);
 #endif
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Abs")                                 \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::abs<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER4(UnaryOp, GPU, "Abs", functor::abs, float, Eigen::half, double, int64);
 REGISTER2(UnaryOp, GPU, "ComplexAbs", functor::abs, complex64, complex128);
@@ -48,4 +37,13 @@ REGISTER_KERNEL_BUILDER(Name("Abs")
                         UnaryOp<CPUDevice, functor::abs<int32>>);
 #endif
 
+#if TENSORFLOW_USE_SYCL
+REGISTER3(UnaryOp, SYCL, "Abs", functor::abs, float, double, int64);
+REGISTER_KERNEL_BUILDER(Name("Abs")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        UnaryOp<CPUDevice, functor::abs<int32>>);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_acos.cc b/tensorflow/core/kernels/cwise_op_acos.cc
index 65801da3c7c9c47f5779f480fe88fdfc1ce71c78..12cc6c8bdd43b64aa1be2860b54e90aaf5e4c05e 100644
--- a/tensorflow/core/kernels/cwise_op_acos.cc
+++ b/tensorflow/core/kernels/cwise_op_acos.cc
@@ -18,19 +18,11 @@ limitations under the License.
 namespace tensorflow {
 REGISTER2(UnaryOp, CPU, "Acos", functor::acos, float, double);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Acos")                                \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::acos<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Acos", functor::acos, float, double);
 #endif
+
+#if TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Acos", functor::acos, float, double);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_acosh.cc b/tensorflow/core/kernels/cwise_op_acosh.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7bdd8d22a3e9f7b064f147ee31b562b68f281c65
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_acosh.cc
@@ -0,0 +1,38 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+#include "tensorflow/core/kernels/cwise_ops_gradients.h"
+
+namespace tensorflow {
+REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double,
+          complex64, complex128);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE)                                    \
+  REGISTER_KERNEL_BUILDER(                                            \
+                          Name("Acosh")                               \
+                          .Device(DEVICE_SYCL)                        \
+                          .TypeConstraint<TYPE>("T"),                 \
+                          UnaryOp<SYCLDevice, functor::acosh<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Acosh", functor::acosh, float, double);
+#endif
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_add_1.cc b/tensorflow/core/kernels/cwise_op_add_1.cc
index f6e9b59cf8dfe7941bba74e2bf2996e67afb4cd6..c0fe81ef553028c427bc145afb47641e291bf778 100644
--- a/tensorflow/core/kernels/cwise_op_add_1.cc
+++ b/tensorflow/core/kernels/cwise_op_add_1.cc
@@ -19,26 +19,6 @@ namespace tensorflow {
 REGISTER5(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32,
           int64);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Add")                                 \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          BinaryOp<SYCLDevice, functor::add<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-
-REGISTER_KERNEL_BUILDER(Name("Add")
-                            .Device(DEVICE_SYCL)
-                            .HostMemory("x")
-                            .HostMemory("y")
-                            .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::add<int32>>);
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER3(BinaryOp, GPU, "Add", functor::add, float, Eigen::half, double);
 
@@ -54,4 +34,17 @@ REGISTER_KERNEL_BUILDER(Name("Add")
                         BinaryOp<CPUDevice, functor::add<int32>>);
 #endif
 
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_KERNEL(type) REGISTER(BinaryOp, SYCL, "Add", functor::add, type);
+TF_CALL_SYCL_NUMBER_TYPES(REGISTER_KERNEL);
+
+REGISTER_KERNEL_BUILDER(Name("Add")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("z")
+                            .TypeConstraint<int32>("T"),
+                        BinaryOp<CPUDevice, functor::add<int32>>);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_asin.cc b/tensorflow/core/kernels/cwise_op_asin.cc
index c9ebfe759b1f98117439ceeb0be3f49a54ab41d6..c28e27d95ae661bdc02a905bb6efd5bdd79f23e5 100644
--- a/tensorflow/core/kernels/cwise_op_asin.cc
+++ b/tensorflow/core/kernels/cwise_op_asin.cc
@@ -18,19 +18,11 @@ limitations under the License.
 namespace tensorflow {
 REGISTER2(UnaryOp, CPU, "Asin", functor::asin, float, double);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Asin")                                \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::asin<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Asin", functor::asin, float, double);
 #endif
+
+#if TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Asin", functor::asin, float, double);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_asinh.cc b/tensorflow/core/kernels/cwise_op_asinh.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e0644323c0052e50e766cdb49c6ed172b88cd326
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_asinh.cc
@@ -0,0 +1,38 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+#include "tensorflow/core/kernels/cwise_ops_gradients.h"
+
+namespace tensorflow {
+REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double,
+          complex64, complex128);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE)                                    \
+  REGISTER_KERNEL_BUILDER(                                            \
+                          Name("Asinh")                               \
+                          .Device(DEVICE_SYCL)                        \
+                          .TypeConstraint<TYPE>("T"),                 \
+                          UnaryOp<SYCLDevice, functor::asinh<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYC
+
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Asinh", functor::asinh, float, double);
+#endif
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_atan.cc b/tensorflow/core/kernels/cwise_op_atan.cc
index 72645b303fc4040001a0147fa585cb32754e2a60..7d73de48102189f5c0d92ce811fa639ce6ba2cf4 100644
--- a/tensorflow/core/kernels/cwise_op_atan.cc
+++ b/tensorflow/core/kernels/cwise_op_atan.cc
@@ -18,19 +18,11 @@ limitations under the License.
 namespace tensorflow {
 REGISTER2(UnaryOp, CPU, "Atan", functor::atan, float, double);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Atan")                                \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::atan<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Atan", functor::atan, float, double);
 #endif
+
+#if TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Atan", functor::atan, float, double);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_atanh.cc b/tensorflow/core/kernels/cwise_op_atanh.cc
new file mode 100644
index 0000000000000000000000000000000000000000..058f5140c5bed7f312bf220d665da8628ca657e1
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_atanh.cc
@@ -0,0 +1,38 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+#include "tensorflow/core/kernels/cwise_ops_gradients.h"
+
+namespace tensorflow {
+REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double,
+          complex64, complex128);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE)                                    \
+  REGISTER_KERNEL_BUILDER(                                            \
+                          Name("Atanh")                               \
+                          .Device(DEVICE_SYCL)                        \
+                          .TypeConstraint<TYPE>("T"),                 \
+                          UnaryOp<SYCLDevice, functor::atanh<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYC
+
+#if GOOGLE_CUDA
+REGISTER2(UnaryOp, GPU, "Atanh", functor::atanh, float, double);
+#endif
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_bitwise_and.cc b/tensorflow/core/kernels/cwise_op_bitwise_and.cc
new file mode 100644
index 0000000000000000000000000000000000000000..017a2182dcff0f0121dd6343f1c012802cdf28d1
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_bitwise_and.cc
@@ -0,0 +1,42 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER6(BinaryOp, CPU, "BitwiseAnd", functor::bitwise_and, int8, int16, int32,
+          int64, uint8, uint16);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE)                                      \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("BitwiseAnd").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      BinaryOp<SYCLDevice, functor::bitwise_and<TYPE>>);
+REGISTER_SYCL_KERNEL(int8);
+REGISTER_SYCL_KERNEL(int16);
+REGISTER_SYCL_KERNEL(int32);
+REGISTER_SYCL_KERNEL(int64);
+REGISTER_SYCL_KERNEL(uint8);
+REGISTER_SYCL_KERNEL(uint16);
+#undef REGISTER_SYCL_KERNEL
+
+#endif  // TENSORFLOW_USE_SYCL
+
+#if GOOGLE_CUDA
+REGISTER6(BinaryOp, GPU, "BitwiseAnd", functor::bitwise_and, int8, int16, int32,
+          int64, uint8, uint16);
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_bitwise_or.cc b/tensorflow/core/kernels/cwise_op_bitwise_or.cc
new file mode 100644
index 0000000000000000000000000000000000000000..36f45fe92dfce44c68a778b6c719c45d24bcaa90
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_bitwise_or.cc
@@ -0,0 +1,42 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER6(BinaryOp, CPU, "BitwiseOr", functor::bitwise_or, int8, int16, int32,
+          int64, uint8, uint16);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE)                                     \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("BitwiseOr").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      BinaryOp<SYCLDevice, functor::bitwise_or<TYPE>>);
+REGISTER_SYCL_KERNEL(int8);
+REGISTER_SYCL_KERNEL(int16);
+REGISTER_SYCL_KERNEL(int32);
+REGISTER_SYCL_KERNEL(int64);
+REGISTER_SYCL_KERNEL(uint8);
+REGISTER_SYCL_KERNEL(uint16);
+#undef REGISTER_SYCL_KERNEL
+
+#endif  // TENSORFLOW_USE_SYCL
+
+#if GOOGLE_CUDA
+REGISTER6(BinaryOp, GPU, "BitwiseOr", functor::bitwise_or, int8, int16, int32,
+          int64, uint8, uint16);
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_bitwise_xor.cc b/tensorflow/core/kernels/cwise_op_bitwise_xor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..36432d851d99f20706b7e7f8535e6ac241b00937
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_bitwise_xor.cc
@@ -0,0 +1,42 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER6(BinaryOp, CPU, "BitwiseXor", functor::bitwise_xor, int8, int16, int32,
+          int64, uint8, uint16);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE)                                      \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("BitwiseXor").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      BinaryOp<SYCLDevice, functor::bitwise_xor<TYPE>>);
+REGISTER_SYCL_KERNEL(int8);
+REGISTER_SYCL_KERNEL(int16);
+REGISTER_SYCL_KERNEL(int32);
+REGISTER_SYCL_KERNEL(int64);
+REGISTER_SYCL_KERNEL(uint8);
+REGISTER_SYCL_KERNEL(uint16);
+#undef REGISTER_SYCL_KERNEL
+
+#endif  // TENSORFLOW_USE_SYCL
+
+#if GOOGLE_CUDA
+REGISTER6(BinaryOp, GPU, "BitwiseXor", functor::bitwise_xor, int8, int16, int32,
+          int64, uint8, uint16);
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc
index c74e10576d5f106ac0a5bf8a7020b40c64523495..0111e9d5fd18f1d94e8d39c5e67d16e04f21e854 100644
--- a/tensorflow/core/kernels/cwise_op_ceil.cc
+++ b/tensorflow/core/kernels/cwise_op_ceil.cc
@@ -18,19 +18,11 @@ limitations under the License.
 namespace tensorflow {
 REGISTER3(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, double);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Ceil")                                \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::ceil<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double);
 #endif
+
+#if TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Ceil", functor::ceil, float, double);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc
index 634c90adc63c620ef429cf5cde1d970c7331dfdc..d4b3b0e3935deeded3a0e07bd04056476c4cc29c 100644
--- a/tensorflow/core/kernels/cwise_op_cos.cc
+++ b/tensorflow/core/kernels/cwise_op_cos.cc
@@ -19,19 +19,11 @@ namespace tensorflow {
 REGISTER5(UnaryOp, CPU, "Cos", functor::cos, float, Eigen::half, double,
           complex64, complex128);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Cos")                                 \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::cos<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Cos", functor::cos, float, Eigen::half, double);
 #endif
+
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Cos", functor::cos, float, double);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc
index 1e2300832fcfc3a1b6e2e9d28eff43a5417b6839..d44c1bf473e2e778a7d31890a25359e782e1dc94 100644
--- a/tensorflow/core/kernels/cwise_op_div.cc
+++ b/tensorflow/core/kernels/cwise_op_div.cc
@@ -24,32 +24,6 @@ REGISTER5(BinaryOp, CPU, "TruncateDiv", functor::safe_div, uint8, uint16, int16,
           int32, int64);
 REGISTER5(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double,
           complex64, complex128);
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Div")                                 \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          BinaryOp<SYCLDevice, functor::div<TYPE>>);  \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("RealDiv")                             \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          BinaryOp<SYCLDevice, functor::div<TYPE>>);
-REGISTER_SYCL_KERNEL(float)
-REGISTER_SYCL_KERNEL(double)
-#undef REGISTER_SYCL_KERNEL
-// A special GPU kernel for int32.
-// TODO(b/25387198): Also enable int32 in device memory. This kernel
-// registration requires all int32 inputs and outputs to be in host memory.
-REGISTER_KERNEL_BUILDER(Name("Div")
-                            .Device(DEVICE_SYCL)
-                            .HostMemory("x")
-                            .HostMemory("y")
-                            .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::safe_div<int32>>);
-#endif // TENSORFLOW_USE_SYCL
 #if GOOGLE_CUDA
 REGISTER9(BinaryOp, GPU, "Div", functor::div, float, Eigen::half, double, uint8,
           uint16, int16, int64, complex64, complex128);
@@ -70,4 +44,15 @@ REGISTER_KERNEL_BUILDER(Name("Div")
                         BinaryOp<CPUDevice, functor::safe_div<int32>>);
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(BinaryOp, SYCL, "Div", functor::div, float, double);
+REGISTER2(BinaryOp, SYCL, "RealDiv", functor::div, float, double);
+REGISTER_KERNEL_BUILDER(Name("Div")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("z")
+                            .TypeConstraint<int32>("T"),
+                        BinaryOp<CPUDevice, functor::safe_div<int32>>);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_equal_to_1.cc b/tensorflow/core/kernels/cwise_op_equal_to_1.cc
index 7049305deb03a1a0f90875ec4124add0bb5b465d..ea10ebe9a0eecaedabcdfea487400b7d3ef56102 100644
--- a/tensorflow/core/kernels/cwise_op_equal_to_1.cc
+++ b/tensorflow/core/kernels/cwise_op_equal_to_1.cc
@@ -47,8 +47,8 @@ REGISTER_KERNEL_BUILDER(Name("Equal")
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER2(BinaryOp, SYCL, "Equal", functor::equal_to, float, double);
-
+REGISTER5(BinaryOp, SYCL, "Equal", functor::equal_to, float, double, uint8,
+          int8, int16);
 REGISTER_KERNEL_BUILDER(Name("Equal")
                             .Device(DEVICE_SYCL)
                             .HostMemory("x")
diff --git a/tensorflow/core/kernels/cwise_op_exp.cc b/tensorflow/core/kernels/cwise_op_exp.cc
index 2e3a60cf794139a8f20c42d3de34702a7c08018b..9d4d65442762b88bb418bc0266b41ae37259e43f 100644
--- a/tensorflow/core/kernels/cwise_op_exp.cc
+++ b/tensorflow/core/kernels/cwise_op_exp.cc
@@ -19,19 +19,11 @@ namespace tensorflow {
 REGISTER5(UnaryOp, CPU, "Exp", functor::exp, float, Eigen::half, double,
           complex64, complex128);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Exp")                                 \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::exp<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double);
 #endif
+
+#if TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Exp", functor::exp, float, double);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_expm1.cc b/tensorflow/core/kernels/cwise_op_expm1.cc
index 5573c2bcc2f7aafda4b91d69023ce570a305215c..4f723080060041f1223dbd86aa95f1cc64f5452c 100644
--- a/tensorflow/core/kernels/cwise_op_expm1.cc
+++ b/tensorflow/core/kernels/cwise_op_expm1.cc
@@ -22,6 +22,6 @@ REGISTER5(UnaryOp, CPU, "Expm1", functor::expm1, float, Eigen::half, double,
 REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER(UnaryOp, SYCL, "Expm1", functor::expm1, float);
+REGISTER2(UnaryOp, SYCL, "Expm1", functor::expm1, float, double);
 #endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc
index 59e32d7f6f4cf6fd642d49efbff76f1f2cb45bb3..5a142b9ce9f8a32fe0569a78452cf710b2317760 100644
--- a/tensorflow/core/kernels/cwise_op_floor.cc
+++ b/tensorflow/core/kernels/cwise_op_floor.cc
@@ -18,19 +18,10 @@ limitations under the License.
 namespace tensorflow {
 REGISTER3(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, double);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Floor")                               \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::floor<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double);
 #endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Floor", functor::floor, float, double);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_ops_gpu.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_acosh.cu.cc
similarity index 58%
rename from tensorflow/core/kernels/dense_update_ops_gpu.cu.cc
rename to tensorflow/core/kernels/cwise_op_gpu_acosh.cu.cc
index 0f61506afeaa2be62b335673591c8eee12fa9bb1..a29c9a374d9c0a7f213f67020a851b221a5648eb 100644
--- a/tensorflow/core/kernels/dense_update_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_acosh.cu.cc
@@ -15,23 +15,13 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 
-#define EIGEN_USE_GPU
-
-#include "tensorflow/core/kernels/dense_update_ops.h"
-
-#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#include "tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h"
 
 namespace tensorflow {
-
-typedef Eigen::GpuDevice GPUDevice;
-
-#define DEFINE_GPU_KERNELS(T)                              \
-  template struct functor::DenseUpdate<GPUDevice, T, ADD>; \
-  template struct functor::DenseUpdate<GPUDevice, T, SUB>; \
-  template struct functor::DenseUpdate<GPUDevice, T, ASSIGN>;
-TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
-#undef DEFINE_GPU_KERNELS
-
-}  // end namespace tensorflow
+namespace functor {
+DEFINE_UNARY2(acosh, float, double);
+}  // namespace functor
+}  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/tensorboard/components/tf_graph_common/test/hierarchy-test.ts b/tensorflow/core/kernels/cwise_op_gpu_asinh.cu.cc
similarity index 56%
rename from tensorflow/tensorboard/components/tf_graph_common/test/hierarchy-test.ts
rename to tensorflow/core/kernels/cwise_op_gpu_asinh.cu.cc
index fa62ffe2c7048a50d51a57894976820a720d636b..c78f09e5e9acbe34fa263418919d9d87f2fea3a5 100644
--- a/tensorflow/tensorboard/components/tf_graph_common/test/hierarchy-test.ts
+++ b/tensorflow/core/kernels/cwise_op_gpu_asinh.cu.cc
@@ -1,23 +1,27 @@
 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 
-Licensed under the Apache License, Version 2.0 (the 'License');
+Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
     http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
+distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-suite('graph', () => {
-  let assert = chai.assert;
+#if GOOGLE_CUDA
 
-  test('graphlib exists', () => { assert.isTrue(graphlib != null); });
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#include "tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h"
 
-  // TODO(bp): write tests.
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(asinh, float, double);
+}  // namespace functor
+}  // namespace tensorflow
 
-});
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_atanh.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_atanh.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..895dcbff020350ce6d549d5adf186a3ebac99644
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_atanh.cu.cc
@@ -0,0 +1,27 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#include "tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY2(atanh, float, double);
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_bitwise_and.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_bitwise_and.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..27f973c90d73a1d7828ce180254363a0b7b4be76
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_bitwise_and.cu.cc
@@ -0,0 +1,26 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY6(bitwise_and, int8, int16, int32, int64, uint8, uint16);
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/tensorboard/components/tf_imports/graphlib.html b/tensorflow/core/kernels/cwise_op_gpu_bitwise_or.cu.cc
similarity index 56%
rename from tensorflow/tensorboard/components/tf_imports/graphlib.html
rename to tensorflow/core/kernels/cwise_op_gpu_bitwise_or.cu.cc
index 664b855f17f807231b77d094c82b93f7e1801112..a34c3a52cd6253527c67d2d1f8c1498756ff5be8 100644
--- a/tensorflow/tensorboard/components/tf_imports/graphlib.html
+++ b/tensorflow/core/kernels/cwise_op_gpu_bitwise_or.cu.cc
@@ -1,6 +1,4 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,8 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
--->
+==============================================================================*/
 
-<link rel="import" href="lodash.html">
+#if GOOGLE_CUDA
 
-<script jscomp-nocompile src="graphlib.core.js"></script>
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY6(bitwise_or, int8, int16, int32, int64, uint8, uint16);
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_bitwise_xor.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_bitwise_xor.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a4531ab7c6f283f8e732dbc87b3c64d93a8a5bef
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_bitwise_xor.cu.cc
@@ -0,0 +1,26 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY6(bitwise_xor, int8, int16, int32, int64, uint8, uint16);
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tensorboard-color.html b/tensorflow/core/kernels/cwise_op_gpu_invert.cu.cc
similarity index 56%
rename from tensorflow/tensorboard/components/tf_dashboard_common/tensorboard-color.html
rename to tensorflow/core/kernels/cwise_op_gpu_invert.cu.cc
index 7f9ca6461485ad9b6356b05fac48544b4a995dfb..62f33612db079377729d8d0edde0c37d43fb9cfb 100644
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tensorboard-color.html
+++ b/tensorflow/core/kernels/cwise_op_gpu_invert.cu.cc
@@ -1,6 +1,4 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,20 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
--->
+==============================================================================*/
 
-<link rel="import" href="../polymer/polymer.html">
+#if GOOGLE_CUDA
 
-<style is="custom-style">
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
 
-  :root {
-    --tb-orange-weak: #ffa726;
-    --tb-orange-strong: #f57c00;
-    --tb-grey-darker: #e2e2e2;
-    --tb-grey-lighter: #f3f3f3;
-    --tb-ui-dark-accent: #757575;
-    --tb-ui-light-accent: #e0e0e0;
-    --tb-graph-faded: #e0d4b3;
-  }
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY6(invert, int8, int16, int32, int64, uint8, uint16);
+}  // namespace functor
+}  // namespace tensorflow
 
-</style>
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_greater.cc b/tensorflow/core/kernels/cwise_op_greater.cc
index 6b5a806aa21945ccdf062292e0eb087908a26f99..ba89899fb323c58f0a0045f3ef32a897f5f2680a 100644
--- a/tensorflow/core/kernels/cwise_op_greater.cc
+++ b/tensorflow/core/kernels/cwise_op_greater.cc
@@ -34,11 +34,8 @@ REGISTER_KERNEL_BUILDER(Name("Greater")
                         BinaryOp<CPUDevice, functor::greater<int32>>);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER(BinaryOp, SYCL, "Greater", functor::greater, float);
+REGISTER2(BinaryOp, SYCL, "Greater", functor::greater, float, double);
 
-// A special GPU kernel for int32.
-// TODO(b/25387198): Also enable int32 in device memory. This kernel
-// registration requires all int32 inputs and outputs to be in host memory.
 REGISTER_KERNEL_BUILDER(Name("Greater")
                             .Device(DEVICE_SYCL)
                             .HostMemory("x")
@@ -47,5 +44,4 @@ REGISTER_KERNEL_BUILDER(Name("Greater")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::greater<int32>>);
 #endif // TENSORFLOW_USE_SYCL
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_greater_equal.cc b/tensorflow/core/kernels/cwise_op_greater_equal.cc
index ac215282561a50bc55476569bcdf131482d4203e..8f0c483aecd7f84bbb8ac47e4c8b5877b40335d4 100644
--- a/tensorflow/core/kernels/cwise_op_greater_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_greater_equal.cc
@@ -35,7 +35,7 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float);
+REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float, double);
 
 REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
                             .Device(DEVICE_SYCL)
diff --git a/tensorflow/core/kernels/cwise_op_invert.cc b/tensorflow/core/kernels/cwise_op_invert.cc
new file mode 100644
index 0000000000000000000000000000000000000000..df2c02e42e17f5bbcb74b637adcfb1dbd5cac3c1
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_invert.cc
@@ -0,0 +1,32 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER6(UnaryOp, CPU, "Invert", functor::invert, int8, int16, int32, int64,
+          uint8, uint16);
+
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER6(UnaryOp, SYCL, "Invert", functor::invert, int8, int16, int32, int64,
+         uint8, uint16);
+#endif  // TENSORFLOW_USE_SYCL
+
+#if GOOGLE_CUDA
+REGISTER6(UnaryOp, GPU, "Invert", functor::invert, int8, int16, int32, int64,
+          uint8, uint16);
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc
index 0faeffa95ca51e5c3dc598819e672569723b1dcc..53ec1c1c63f17a03218535c974e591b4eec62a72 100644
--- a/tensorflow/core/kernels/cwise_op_isfinite.cc
+++ b/tensorflow/core/kernels/cwise_op_isfinite.cc
@@ -19,20 +19,12 @@ namespace tensorflow {
 REGISTER3(UnaryOp, CPU, "IsFinite", functor::isfinite, float, Eigen::half,
           double);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("IsFinite")                            \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::isfinite<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "IsFinite", functor::isfinite, float, Eigen::half,
           double);
 #endif
+
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "IsFinite", functor::isfinite, float, double);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc
index df63006b3fd79efebedf8a7b756fd1f3f087f54d..4b34744304f6c856fb98d39fbadc1e1958c84238 100644
--- a/tensorflow/core/kernels/cwise_op_isinf.cc
+++ b/tensorflow/core/kernels/cwise_op_isinf.cc
@@ -18,19 +18,11 @@ limitations under the License.
 namespace tensorflow {
 REGISTER3(UnaryOp, CPU, "IsInf", functor::isinf, float, Eigen::half, double);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("IsInf")                               \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::isinf<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "IsInf", functor::isinf, float, Eigen::half, double);
 #endif
+
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "IsInf", functor::isinf, float, double);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc
index e1cf7a86375f5db75b8bdd456111a3df0f9a370e..ad2dd3f722cebba926dd04748ca146c2ecfc0848 100644
--- a/tensorflow/core/kernels/cwise_op_isnan.cc
+++ b/tensorflow/core/kernels/cwise_op_isnan.cc
@@ -18,19 +18,11 @@ limitations under the License.
 namespace tensorflow {
 REGISTER3(UnaryOp, CPU, "IsNan", functor::isnan, float, Eigen::half, double);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("IsNan")                               \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::isnan<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "IsNan", functor::isnan, float, Eigen::half, double);
 #endif
+
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "IsNan", functor::isnan, float, double);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc
index a38f1024a9a9c47ba1c2d83bf21305f83d68e78e..136c3666dfc351fa0485eeff060a6ea3a7d48c08 100644
--- a/tensorflow/core/kernels/cwise_op_less.cc
+++ b/tensorflow/core/kernels/cwise_op_less.cc
@@ -35,7 +35,6 @@ REGISTER_KERNEL_BUILDER(Name("Less")
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER3(BinaryOp, SYCL, "Less", functor::less, float, double, int64);
-
 REGISTER_KERNEL_BUILDER(Name("Less")
                             .Device(DEVICE_SYCL)
                             .HostMemory("x")
diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc
index 3a2cc2ae0e8c957e3e6c4954ad965aea7ebdb9dd..97a2508d1290c5afe758db9ff54a22a22b6dcac0 100644
--- a/tensorflow/core/kernels/cwise_op_less_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_less_equal.cc
@@ -35,8 +35,8 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual")
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER(BinaryOp, SYCL, "LessEqual", functor::less_equal, float);
-
+REGISTER6(BinaryOp, SYCL, "LessEqual", functor::less_equal, float, double,
+          int64, uint8, int8, int16);
 REGISTER_KERNEL_BUILDER(Name("LessEqual")
                             .Device(DEVICE_SYCL)
                             .HostMemory("x")
@@ -45,5 +45,4 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::less_equal<int32>>);
 #endif // TENSORFLOW_USE_SYCL
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc
index 5e74e778c76cc4498483764de4bb8581235928dd..7fdfdff0e38ea2bfe18acac86b148a4e1e944117 100644
--- a/tensorflow/core/kernels/cwise_op_log.cc
+++ b/tensorflow/core/kernels/cwise_op_log.cc
@@ -19,19 +19,11 @@ namespace tensorflow {
 REGISTER5(UnaryOp, CPU, "Log", functor::log, float, Eigen::half, double,
           complex64, complex128);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Log")                                 \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::log<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Log", functor::log, float, Eigen::half, double);
 #endif
+
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Log", functor::log, float, double);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_log1p.cc b/tensorflow/core/kernels/cwise_op_log1p.cc
index edb821318e8929ad7cbdcd9af9c325f1e3483a84..25ad7b24bb1cee3a09c4ea81cccf79b6a4dabeb9 100644
--- a/tensorflow/core/kernels/cwise_op_log1p.cc
+++ b/tensorflow/core/kernels/cwise_op_log1p.cc
@@ -19,19 +19,11 @@ namespace tensorflow {
 REGISTER5(UnaryOp, CPU, "Log1p", functor::log1p, float, Eigen::half, double,
           complex64, complex128);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Log1p")                               \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::log1p<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Log1p", functor::log1p, float, Eigen::half, double);
 #endif
+
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Log1p", functor::log1p, float, double);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc
index 7311f25ec0cd033fe40e80f8bcb6e844fe441316..87d54e380b4b923f72aff1eb33d56dd7d8a0dd11 100644
--- a/tensorflow/core/kernels/cwise_op_maximum.cc
+++ b/tensorflow/core/kernels/cwise_op_maximum.cc
@@ -35,11 +35,7 @@ REGISTER_KERNEL_BUILDER(Name("Maximum")
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER(BinaryOp, SYCL, "Maximum", functor::maximum, float);
-
-// A special GPU kernel for int32.
-// TODO(b/25387198): Also enable int32 in device memory. This kernel
-// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER3(BinaryOp, SYCL, "Maximum", functor::maximum, float, double, int64);
 REGISTER_KERNEL_BUILDER(Name("Maximum")
                             .Device(DEVICE_SYCL)
                             .HostMemory("x")
@@ -48,5 +44,4 @@ REGISTER_KERNEL_BUILDER(Name("Maximum")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::maximum<int32>>);
 #endif // TENSORFLOW_USE_SYCL
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc
index 99e5a76620362a7cf2f4b2441c39f1bd50b86730..442171193bfeb41e8594bf708590fc4d52291685 100644
--- a/tensorflow/core/kernels/cwise_op_minimum.cc
+++ b/tensorflow/core/kernels/cwise_op_minimum.cc
@@ -35,8 +35,7 @@ REGISTER_KERNEL_BUILDER(Name("Minimum")
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER(BinaryOp, SYCL, "Minimum", functor::minimum, float);
-
+REGISTER3(BinaryOp, SYCL, "Minimum", functor::minimum, float, double, int64);
 REGISTER_KERNEL_BUILDER(Name("Minimum")
                             .Device(DEVICE_SYCL)
                             .HostMemory("x")
diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc
index a3cdfa5f84dac354b91379cdb1a3da9f1eb4b22f..023eb07ca3f52f49c95b5b6450e3417b7cbeabe4 100644
--- a/tensorflow/core/kernels/cwise_op_mul_1.cc
+++ b/tensorflow/core/kernels/cwise_op_mul_1.cc
@@ -26,24 +26,6 @@ REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double,
 REGISTER(BinaryOp, CPU, "Mul", functor::mul, int32);
 #endif  // __ANDROID_TYPES_SLIM__
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Mul")                                 \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          BinaryOp<SYCLDevice, functor::mul<TYPE>>);
-REGISTER_SYCL_KERNEL(float)
-REGISTER_SYCL_KERNEL(double)
-#undef REGISTER_SYCL_KERNEL
-REGISTER_KERNEL_BUILDER(Name("Mul")
-                            .Device(DEVICE_SYCL)
-                            .HostMemory("x")
-                            .HostMemory("y")
-                            .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::mul<int32>>);
-#endif // TENSORFLOW_USE_SYCL
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "Mul", functor::mul, float, Eigen::half, double,
            uint8);
@@ -59,4 +41,14 @@ REGISTER_KERNEL_BUILDER(Name("Mul")
                         BinaryOp<CPUDevice, functor::mul<int32>>);
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER3(BinaryOp, SYCL, "Mul", functor::mul, float, double, uint8);
+REGISTER_KERNEL_BUILDER(Name("Mul")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("z")
+                            .TypeConstraint<int32>("T"),
+                        BinaryOp<CPUDevice, functor::mul<int32>>);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg.cc
index eb7e3764d9d51401b6544c81ecf146b5bd8e11d1..536891b548f043cb25726d70bfdd362ed0294512 100644
--- a/tensorflow/core/kernels/cwise_op_neg.cc
+++ b/tensorflow/core/kernels/cwise_op_neg.cc
@@ -19,27 +19,14 @@ namespace tensorflow {
 REGISTER7(UnaryOp, CPU, "Neg", functor::neg, float, Eigen::half, double, int32,
           complex64, int64, complex128);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Neg")                                 \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::neg<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-
-// A special GPU kernel for int32.
-// TODO(b/25387198): Also enable int32 in device memory. This kernel
-// registration requires all int32 inputs and outputs to be in host memory.
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER3(UnaryOp, SYCL, "Neg", functor::neg, float, double, int64);
 REGISTER_KERNEL_BUILDER(Name("Neg")
                             .Device(DEVICE_SYCL)
                             .HostMemory("x")
                             .HostMemory("y")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::neg<int32>>);
-
-#undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_pow.cc b/tensorflow/core/kernels/cwise_op_pow.cc
index f1780168e456876545a41789683b2a531422199d..5fb0735ac19ba9eb057dd68c7f2d849c65d5edaa 100644
--- a/tensorflow/core/kernels/cwise_op_pow.cc
+++ b/tensorflow/core/kernels/cwise_op_pow.cc
@@ -19,20 +19,11 @@ namespace tensorflow {
 REGISTER7(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, double, int32,
           int64, complex64, complex128);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Pow")                                 \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          BinaryOp<SYCLDevice, functor::pow<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "Pow", functor::pow, float, Eigen::half, double,
           int64);
 #endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(BinaryOp, SYCL, "Pow", functor::pow, float, double);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_round.cc b/tensorflow/core/kernels/cwise_op_round.cc
index e192f89782dbdfdfd86fa5fed5f446e8f3df4ee5..163814aac4bcdbd4f4362164ae694fc3d15edd72 100644
--- a/tensorflow/core/kernels/cwise_op_round.cc
+++ b/tensorflow/core/kernels/cwise_op_round.cc
@@ -21,9 +21,6 @@ REGISTER5(UnaryOp, CPU, "Round", functor::round, Eigen::half, float, double,
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Round", functor::round, float, double);
-namespace functor {
-DEFINE_UNARY2(round, float, double);
-}  // namespace functor
 #endif
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_rsqrt.cc b/tensorflow/core/kernels/cwise_op_rsqrt.cc
index f23725f48e3865511e817f1d78d4d08007c966b5..a434538fbf886a24a09117cfc457d72bf3b6c19e 100644
--- a/tensorflow/core/kernels/cwise_op_rsqrt.cc
+++ b/tensorflow/core/kernels/cwise_op_rsqrt.cc
@@ -19,21 +19,12 @@ namespace tensorflow {
 REGISTER5(UnaryOp, CPU, "Rsqrt", functor::rsqrt, float, Eigen::half, double,
           complex64, complex128);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Rsqrt")                               \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::rsqrt<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Rsqrt", functor::rsqrt, float, Eigen::half, double);
 #endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Rsqrt", functor::rsqrt, float, double);
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "RsqrtGrad", functor::rsqrt_grad, float,
           Eigen::half, double, complex64, complex128);
@@ -41,4 +32,8 @@ REGISTER5(SimpleBinaryOp, CPU, "RsqrtGrad", functor::rsqrt_grad, float,
 REGISTER3(SimpleBinaryOp, GPU, "RsqrtGrad", functor::rsqrt_grad, float,
           Eigen::half, double);
 #endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(SimpleBinaryOp, SYCL, "RsqrtGrad", functor::rsqrt_grad, float,
+          double);
+#endif  //  TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
index 709628da136191d52f1d6d726557123ffbd53061..3dd9de8d897479456c462ea068c5eda6354b199b 100644
--- a/tensorflow/core/kernels/cwise_op_select.cc
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -181,7 +181,9 @@ REGISTER_SELECT_GPU(complex128);
       SelectOp<SYCLDevice, type>);
 
 REGISTER_SELECT_SYCL(float);
+REGISTER_SELECT_SYCL(double);
 REGISTER_SELECT_SYCL(int32);
+REGISTER_SELECT_SYCL(int64);
 #undef REGISTER_SELECT_SYCL
 #endif // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc
index dedd414db551b43cb7b6d2f014801f56cd65173c..a4084d5ad1796f5af1ce1a62e76c9dc6b473586d 100644
--- a/tensorflow/core/kernels/cwise_op_sign.cc
+++ b/tensorflow/core/kernels/cwise_op_sign.cc
@@ -34,10 +34,7 @@ REGISTER_KERNEL_BUILDER(Name("Sign")
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER(UnaryOp, SYCL, "Sign", functor::sign, float);
-// A special GPU kernel for int32.
-// TODO(b/25387198): Also enable int32 in device memory. This kernel
-// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER3(UnaryOp, SYCL, "Sign", functor::sign, float, double, int64);
 REGISTER_KERNEL_BUILDER(Name("Sign")
                             .Device(DEVICE_SYCL)
                             .HostMemory("x")
diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc
index ab54c61b56defbaa9021eb11cbd43dc3f7707554..b91ff1ac30ba8e7259223e011aa1e70b0a05f623 100644
--- a/tensorflow/core/kernels/cwise_op_sin.cc
+++ b/tensorflow/core/kernels/cwise_op_sin.cc
@@ -19,19 +19,11 @@ namespace tensorflow {
 REGISTER5(UnaryOp, CPU, "Sin", functor::sin, float, Eigen::half, double,
           complex64, complex128);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Sin")                                 \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::sin<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYC
-
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Sin", functor::sin, float, Eigen::half, double);
 #endif
+
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Sin", functor::sin, float, double);
+#endif // TENSORFLOW_USE_SYC
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc
index 55acf648db00f4cc2017a0fff5de24697a0f04b8..00efbb00f1501669b221682c565b4843c0497128 100644
--- a/tensorflow/core/kernels/cwise_op_sqrt.cc
+++ b/tensorflow/core/kernels/cwise_op_sqrt.cc
@@ -19,26 +19,22 @@ namespace tensorflow {
 REGISTER5(UnaryOp, CPU, "Sqrt", functor::sqrt, float, Eigen::half, double,
           complex64, complex128);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Sqrt")                                \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::sqrt<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double);
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Sqrt", functor::sqrt, float, double);
+#endif // TENSORFLOW_USE_SYCL
+
 REGISTER5(SimpleBinaryOp, CPU, "SqrtGrad", functor::sqrt_grad, float,
           Eigen::half, double, complex64, complex128);
 #if GOOGLE_CUDA
 REGISTER3(SimpleBinaryOp, GPU, "SqrtGrad", functor::sqrt_grad, float,
           Eigen::half, double);
 #endif
+
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(SimpleBinaryOp, SYCL, "SqrtGrad", functor::sqrt_grad, float, double);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc
index afcacfec1c722149c2fc8e440ee1a0ac9e252545..07a4b0b084d804c46a8a4a0bc272f78b22d7e845 100644
--- a/tensorflow/core/kernels/cwise_op_square.cc
+++ b/tensorflow/core/kernels/cwise_op_square.cc
@@ -19,18 +19,6 @@ namespace tensorflow {
 REGISTER7(UnaryOp, CPU, "Square", functor::square, float, Eigen::half, double,
           int32, int64, complex64, complex128);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Square")                              \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::square<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYC
-
 #if GOOGLE_CUDA
 REGISTER4(UnaryOp, GPU, "Square", functor::square, float, Eigen::half, double,
           int64);
@@ -45,4 +33,14 @@ REGISTER_KERNEL_BUILDER(Name("Square")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::square<int32>>);
 #endif
+
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER3(UnaryOp, SYCL, "Square", functor::square, float, double, int64);
+REGISTER_KERNEL_BUILDER(Name("Square")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        UnaryOp<CPUDevice, functor::square<int32>>);
+#endif // TENSORFLOW_USE_SYC
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_squared_difference.cc b/tensorflow/core/kernels/cwise_op_squared_difference.cc
index edd6c071467630ce2ced35fd1182d1f063c85586..78fefc69c776e2f7b7c44c941e0a1afefdbaf143 100644
--- a/tensorflow/core/kernels/cwise_op_squared_difference.cc
+++ b/tensorflow/core/kernels/cwise_op_squared_difference.cc
@@ -35,4 +35,17 @@ REGISTER_KERNEL_BUILDER(
         .TypeConstraint<int32>("T"),
     BinaryOp<CPUDevice, functor::squared_difference<int32>>);
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER3(BinaryOp, SYCL, "SquaredDifference", functor::squared_difference,
+          float, double, int64);
+REGISTER_KERNEL_BUILDER(
+    Name("SquaredDifference")
+        .Device(DEVICE_SYCL)
+        .HostMemory("x")
+        .HostMemory("y")
+        .HostMemory("z")
+        .TypeConstraint<int32>("T"),
+    BinaryOp<CPUDevice, functor::squared_difference<int32>>);
+#endif  // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc
index eab1e2a09c901e509e36d792a6ddb7ef7ac34e4e..eb173c7040d435879abdb4d5cbae5f20a720199f 100644
--- a/tensorflow/core/kernels/cwise_op_sub.cc
+++ b/tensorflow/core/kernels/cwise_op_sub.cc
@@ -24,28 +24,7 @@ REGISTER7(BinaryOp, CPU, "Sub", functor::sub, float, Eigen::half, double, int32,
 // int32 version of this op is needed, so explicitly include it.
 REGISTER(BinaryOp, CPU, "Sub", functor::sub, int32);
 #endif  // __ANDROID_TYPES_SLIM__
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Sub")                                 \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          BinaryOp<SYCLDevice, functor::sub<TYPE>>);
-  REGISTER_SYCL_KERNEL(float);
-  REGISTER_SYCL_KERNEL(double);
 
-// A special GPU kernel for int32.
-// TODO(b/25387198): Also enable int32 in device memory. This kernel
-// registration requires all int32 inputs and outputs to be in host memory.
-REGISTER_KERNEL_BUILDER(Name("Sub")
-                            .Device(DEVICE_SYCL)
-                            .HostMemory("x")
-                            .HostMemory("y")
-                            .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::sub<int32>>);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
 #if GOOGLE_CUDA
 REGISTER6(BinaryOp, GPU, "Sub", functor::sub, float, Eigen::half, double, int64,
           complex64, complex128);
@@ -62,4 +41,14 @@ REGISTER_KERNEL_BUILDER(Name("Sub")
                         BinaryOp<CPUDevice, functor::sub<int32>>);
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER3(BinaryOp, SYCL, "Sub", functor::sub, float, double, int64);
+REGISTER_KERNEL_BUILDER(Name("Sub")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("z")
+                            .TypeConstraint<int32>("T"),
+                        BinaryOp<CPUDevice, functor::sub<int32>>);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc
index 9c850c942077d45c1c56dfbc26e0c70f465120b6..7891b1183dd56b9809ef7f5dc76c3f04fe605b02 100644
--- a/tensorflow/core/kernels/cwise_op_tan.cc
+++ b/tensorflow/core/kernels/cwise_op_tan.cc
@@ -18,19 +18,11 @@ limitations under the License.
 namespace tensorflow {
 REGISTER2(UnaryOp, CPU, "Tan", functor::tan, float, double);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Tan")                                 \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::tan<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYC
-
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Tan", functor::tan, float, double);
 #endif
+
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Tan", functor::tan, float, double);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc
index 1dbc13061ba02839cc1189576bab2698a4f60f4c..8b3900892c300ee266b1a7fb066ef79c88c3d087 100644
--- a/tensorflow/core/kernels/cwise_op_tanh.cc
+++ b/tensorflow/core/kernels/cwise_op_tanh.cc
@@ -20,22 +20,14 @@ namespace tensorflow {
 REGISTER5(UnaryOp, CPU, "Tanh", functor::tanh, float, Eigen::half, double,
           complex64, complex128);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Tanh")                                \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::tanh<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYC
-
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Tanh", functor::tanh, float, Eigen::half, double);
 #endif
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Tanh", functor::tanh, float, double);
+#endif // TENSORFLOW_USE_SYCL
+
 REGISTER5(SimpleBinaryOp, CPU, "TanhGrad", functor::tanh_grad, float,
           Eigen::half, double, complex64, complex128);
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index 6d80e4bfc1d226c108be2747c8bc5889b4543b04..65a60720ddcfb77248d41291581847acf116ed46 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -43,6 +43,42 @@ struct functor_traits<scalar_fmod2_op<T>> {
   };
 };
 
+template <typename T>
+struct scalar_asinh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
+    return std::asinh(a);
+  }
+};
+template <typename T>
+struct functor_traits<scalar_asinh_op<T>> {
+  enum { Cost = 5 * NumTraits<T>::MulCost, PacketAccess = false };
+};
+
+template <typename T>
+struct scalar_acosh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
+    return std::acosh(a);
+  }
+};
+template <typename T>
+struct functor_traits<scalar_acosh_op<T>> {
+  enum { Cost = 5 * NumTraits<T>::MulCost, PacketAccess = false };
+};
+
+template <typename T>
+struct scalar_atanh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
+    return std::atanh(a);
+  }
+};
+template <typename T>
+struct functor_traits<scalar_atanh_op<T>> {
+  enum { Cost = 5 * NumTraits<T>::MulCost, PacketAccess = false };
+};
+
 // TODO(rmlarsen): This is a workaround for upstream change
 // https://bitbucket.org/eigen/eigen/commits/f339468d04d0f87caeb6cab9aef568627e9f6ea9
 // that renamed scalar_binary_pow_op to scalar_pow_op and deleted the unary
@@ -121,7 +157,7 @@ struct scalar_left : private Binary {
 };
 
 template <typename Tout, typename Tin, typename Binary>
-struct functor_traits<scalar_left<Tout, Tin, Binary> > {
+struct functor_traits<scalar_left<Tout, Tin, Binary>> {
   enum {
     Cost = functor_traits<Binary>::Cost,
     PacketAccess = functor_traits<Binary>::PacketAccess,
@@ -151,7 +187,7 @@ struct scalar_right : private Binary {
 };
 
 template <typename Tout, typename Tin, typename Binary>
-struct functor_traits<scalar_right<Tout, Tin, Binary> > {
+struct functor_traits<scalar_right<Tout, Tin, Binary>> {
   enum {
     Cost = functor_traits<Binary>::Cost,
     PacketAccess = functor_traits<Binary>::PacketAccess,
@@ -368,6 +404,25 @@ struct functor_traits<scalar_round_op_google<Scalar>> {
 #undef ENABLE_FLOAT_EQUALITY_WARNING
 #undef DISABLE_FLOAT_EQUALITY_WARNING
 
+template <typename Scalar>
+struct bitwise_xor_op {
+  EIGEN_EMPTY_STRUCT_CTOR(bitwise_xor_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
+  operator()(const Scalar& x, const Scalar& y) const {
+    return x ^ y;
+  }
+  typedef typename Eigen::internal::packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a,
+                                                        const Packet& b) const {
+    return Eigen::internal::pxor(a, b);
+  }
+};
+
+template <typename Scalar>
+struct functor_traits<bitwise_xor_op<Scalar>> {
+  enum { Cost = Eigen::NumTraits<Scalar>::AddCost, PacketAccess = true };
+};
+
 }  // end namespace internal
 }  // end namespace Eigen
 
@@ -454,22 +509,22 @@ struct abs : base<T, Eigen::internal::scalar_abs_op<T>,
                   typename Eigen::internal::scalar_abs_op<T>::result_type> {};
 
 template <typename T>
-struct neg : base<T, Eigen::internal::scalar_opposite_op<T> > {};
+struct neg : base<T, Eigen::internal::scalar_opposite_op<T>> {};
 
 template <typename T>
-struct inverse : base<T, Eigen::internal::scalar_inverse_op<T> > {};
+struct inverse : base<T, Eigen::internal::scalar_inverse_op<T>> {};
 
 template <typename T>
-struct square : base<T, Eigen::internal::scalar_square_op<T> > {};
+struct square : base<T, Eigen::internal::scalar_square_op<T>> {};
 
 template <typename T>
-struct sqrt : base<T, Eigen::internal::scalar_sqrt_op<T> > {};
+struct sqrt : base<T, Eigen::internal::scalar_sqrt_op<T>> {};
 
 template <typename T>
-struct rsqrt : base<T, Eigen::internal::scalar_rsqrt_op<T> > {};
+struct rsqrt : base<T, Eigen::internal::scalar_rsqrt_op<T>> {};
 
 template <typename T>
-struct exp : base<T, Eigen::internal::scalar_exp_op<T> > {};
+struct exp : base<T, Eigen::internal::scalar_exp_op<T>> {};
 
 template <typename T>
 struct expm1 : base<T, Eigen::internal::scalar_expm1_op<T>> {};
@@ -478,56 +533,76 @@ template <typename T>
 struct log : base<T, Eigen::internal::scalar_log_op<T>> {};
 
 template <typename T>
-struct log1p : base<T, Eigen::internal::scalar_log1p_op<T> > {};
+struct log1p : base<T, Eigen::internal::scalar_log1p_op<T>> {};
+
+template <typename T>
+struct sign : base<T, Eigen::internal::scalar_sign_op<T>> {};
+
+template <typename T>
+struct sinh : base<T, Eigen::internal::scalar_sinh_op<T>> {};
 
 template <typename T>
-struct sign : base<T, Eigen::internal::scalar_sign_op<T> > {};
+struct cosh : base<T, Eigen::internal::scalar_cosh_op<T>> {};
 
 template <typename T>
-struct sinh : base<T, Eigen::internal::scalar_sinh_op<T> > {};
+struct tanh : base<T, Eigen::internal::scalar_tanh_op<T>> {};
 
 template <typename T>
-struct cosh : base<T, Eigen::internal::scalar_cosh_op<T> > {};
+struct asinh : base<T, Eigen::internal::scalar_asinh_op<T>> {};
 
 template <typename T>
-struct tanh : base<T, Eigen::internal::scalar_tanh_op<T> > {};
+struct acosh : base<T, Eigen::internal::scalar_acosh_op<T>> {};
 
 template <typename T>
-struct lgamma : base<T, Eigen::internal::scalar_lgamma_op<T> > {};
+struct atanh : base<T, Eigen::internal::scalar_atanh_op<T>> {};
+
+template <typename T>
+struct lgamma : base<T, Eigen::internal::scalar_lgamma_op<T>> {};
 
 template <typename T>
 struct digamma : base<T, Eigen::internal::scalar_digamma_op<T>> {};
 
 template <typename T>
-struct erf : base<T, Eigen::internal::scalar_erf_op<T> > {};
+struct erf : base<T, Eigen::internal::scalar_erf_op<T>> {};
 
 template <typename T>
-struct erfc : base<T, Eigen::internal::scalar_erfc_op<T> > {};
+struct erfc : base<T, Eigen::internal::scalar_erfc_op<T>> {};
 
 template <typename T>
-struct sigmoid : base<T, Eigen::internal::scalar_sigmoid_op<T> > {};
+struct sigmoid : base<T, Eigen::internal::scalar_sigmoid_op<T>> {};
 
 template <typename T>
-struct sin : base<T, Eigen::internal::scalar_sin_op<T> > {};
+struct sin : base<T, Eigen::internal::scalar_sin_op<T>> {};
 
 template <typename T>
-struct cos : base<T, Eigen::internal::scalar_cos_op<T> > {};
+struct cos : base<T, Eigen::internal::scalar_cos_op<T>> {};
 
 template <typename T>
-struct tan : base<T, Eigen::internal::scalar_tan_op<T> > {};
+struct tan : base<T, Eigen::internal::scalar_tan_op<T>> {};
 
 template <typename T>
-struct asin : base<T, Eigen::internal::scalar_asin_op<T> > {};
+struct asin : base<T, Eigen::internal::scalar_asin_op<T>> {};
 
 template <typename T>
-struct acos : base<T, Eigen::internal::scalar_acos_op<T> > {};
+struct acos : base<T, Eigen::internal::scalar_acos_op<T>> {};
 
 template <typename T>
-struct atan : base<T, Eigen::internal::scalar_atan_op<T> > {};
+struct atan : base<T, Eigen::internal::scalar_atan_op<T>> {};
+
+struct logical_not : base<bool, Eigen::internal::scalar_boolean_not_op<bool>> {
+};
 
-struct logical_not : base<bool, Eigen::internal::scalar_boolean_not_op<bool> > {
+// Flip all bits. Named invert to be consistent with numpy.
+template <typename T>
+struct invert_op {
+  EIGEN_EMPTY_STRUCT_CTOR(invert_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
+    return ~a;
+  }
 };
 
+template <typename T>
+struct invert : base<T, invert_op<T>> {};
 
 // NOTE: std::isinf, std::isnan, std::isfinite are plain function.
 // Therefore we need to wrap them in functors to be used with Eigen's
@@ -551,8 +626,8 @@ template <typename T>
 struct ceil : base<T, Eigen::internal::scalar_ceil_op<T>> {};
 
 /** this should go in Eigen
-  * \brief Template functor to compute the round to int value of a scalar
-  */
+ * \brief Template functor to compute the round to int value of a scalar
+ */
 template <typename Scalar>
 struct scalar_rint_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_rint_op)
@@ -589,20 +664,20 @@ struct rint : base<T, scalar_rint_op<T>> {};
 // squared_difference(x, y) = (x - y) * (x - y)
 
 template <typename T>
-struct add : base<T, Eigen::internal::scalar_sum_op<T> > {
+struct add : base<T, Eigen::internal::scalar_sum_op<T>> {
   static const bool use_bcast_optimization = true;
 };
 
 template <typename T>
-struct sub : base<T, Eigen::internal::scalar_difference_op<T> > {
+struct sub : base<T, Eigen::internal::scalar_difference_op<T>> {
   static const bool use_bcast_optimization = true;
 };
 
 template <typename T>
-struct mul : base<T, Eigen::internal::scalar_product_op<T> > {};
+struct mul : base<T, Eigen::internal::scalar_product_op<T>> {};
 
 template <typename T>
-struct div : base<T, Eigen::internal::scalar_quotient_op<T> > {};
+struct div : base<T, Eigen::internal::scalar_quotient_op<T>> {};
 
 template <typename T>
 struct safe_div : base<T, Eigen::internal::safe_div_or_mod_op<
@@ -611,7 +686,7 @@ struct safe_div : base<T, Eigen::internal::safe_div_or_mod_op<
 };
 
 template <typename T>
-struct fmod : base<T, Eigen::internal::scalar_fmod2_op<T> > {};
+struct fmod : base<T, Eigen::internal::scalar_fmod2_op<T>> {};
 
 template <typename T>
 struct mod : base<T, Eigen::internal::scalar_mod2_op<T>> {};
@@ -647,10 +722,10 @@ template <typename T>
 struct pow : base<T, Eigen::internal::scalar_binary_pow_op_google<T, T>> {};
 
 template <typename T>
-struct maximum : base<T, Eigen::internal::scalar_max_op<T> > {};
+struct maximum : base<T, Eigen::internal::scalar_max_op<T>> {};
 
 template <typename T>
-struct minimum : base<T, Eigen::internal::scalar_min_op<T> > {};
+struct minimum : base<T, Eigen::internal::scalar_min_op<T>> {};
 
 template <typename T>
 struct igamma : base<T, Eigen::internal::scalar_igamma_op<T>> {};
@@ -708,6 +783,33 @@ struct logical_and : base<bool, Eigen::internal::scalar_boolean_and_op> {};
 
 struct logical_or : base<bool, Eigen::internal::scalar_boolean_or_op> {};
 
+template <typename T>
+struct bitwise_and_op {
+  EIGEN_EMPTY_STRUCT_CTOR(bitwise_and_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& x,
+                                                           const T& y) const {
+    return x & y;
+  }
+};
+
+template <typename T>
+struct bitwise_or_op {
+  EIGEN_EMPTY_STRUCT_CTOR(bitwise_or_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& x,
+                                                           const T& y) const {
+    return x | y;
+  }
+};
+
+template <typename T>
+struct bitwise_and : base<T, bitwise_and_op<T>> {};
+
+template <typename T>
+struct bitwise_or : base<T, bitwise_or_op<T>> {};
+
+template <typename T>
+struct bitwise_xor : base<T, Eigen::internal::bitwise_xor_op<T>> {};
+
 template <typename T>
 struct make_complex_func {
   typedef std::complex<T> result_type;
@@ -718,7 +820,7 @@ struct make_complex_func {
 };
 
 template <typename T>
-struct make_complex : base<T, make_complex_func<T>, std::complex<T> > {};
+struct make_complex : base<T, make_complex_func<T>, std::complex<T>> {};
 
 template <typename T>
 struct get_real
@@ -729,7 +831,7 @@ struct get_imag
     : base<T, Eigen::internal::scalar_imag_op<T>, typename T::value_type> {};
 
 template <typename T>
-struct conj : base<T, Eigen::internal::scalar_conjugate_op<T> > {};
+struct conj : base<T, Eigen::internal::scalar_conjugate_op<T>> {};
 
 ////////////////////////////////////////////////////////////////////////////////
 // Functors takes 1 or 2 tensors, computes the base functor on
diff --git a/tensorflow/core/kernels/dataset.cc b/tensorflow/core/kernels/dataset.cc
index 925cbda56ea64aa7e000e2ec1cfd2e8ec8f0d6e1..f99684b1ca32848caf2f255811be65dc61e72490 100644
--- a/tensorflow/core/kernels/dataset.cc
+++ b/tensorflow/core/kernels/dataset.cc
@@ -39,4 +39,17 @@ void UnaryDatasetOpKernel::MakeDataset(OpKernelContext* ctx,
   MakeDataset(ctx, input, output);
 }
 
+void BinaryDatasetOpKernel::MakeDataset(OpKernelContext* ctx,
+                                        DatasetBase** output) {
+  DatasetBase* input;
+  DatasetBase* another_input;
+  OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &input));
+  OP_REQUIRES_OK(ctx,
+                 LookupResource(ctx, HandleFromInput(ctx, 1), &another_input));
+  core::ScopedUnref unref_input(input);
+  core::ScopedUnref unref_another_input(another_input);
+
+  MakeDataset(ctx, input, another_input, output);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/dataset.h b/tensorflow/core/kernels/dataset.h
index da56844dbe12942f106469fb01e9c0343d8b2056..b4d69657d85ee73bb9a4e7a5d470bf7faec2c585 100644
--- a/tensorflow/core/kernels/dataset.h
+++ b/tensorflow/core/kernels/dataset.h
@@ -199,6 +199,19 @@ class UnaryDatasetOpKernel : public DatasetOpKernel {
                            DatasetBase** output) = 0;
 };
 
+// Encapsulates the work required to plug binary Datasets into the core
+// TensorFlow graph execution engine.
+class BinaryDatasetOpKernel : public DatasetOpKernel {
+ public:
+  BinaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {}
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final;
+  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                           DatasetBase* another_input,
+                           DatasetBase** output) = 0;
+};
+
 }  // namespace tensorflow
 
 #endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_H_
diff --git a/tensorflow/core/kernels/decode_image_op.cc b/tensorflow/core/kernels/decode_image_op.cc
index 76f8c225432dd7ddb36933722f3cf0c9404c48ad..f5a74048af4e8b987abb1dfcaefc8d7aee4ce1e3 100644
--- a/tensorflow/core/kernels/decode_image_op.cc
+++ b/tensorflow/core/kernels/decode_image_op.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gif/gif_io.h"
 #include "tensorflow/core/lib/jpeg/jpeg_mem.h"
 #include "tensorflow/core/lib/png/png_io.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/decode_wav_op_test.cc b/tensorflow/core/kernels/decode_wav_op_test.cc
index c282d53a5a1ef20ba6d042d67367f3b90d064260..fc323a5e04205b81bc64e2335df4b9fcee5db8b7 100644
--- a/tensorflow/core/kernels/decode_wav_op_test.cc
+++ b/tensorflow/core/kernels/decode_wav_op_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/audio_ops.h"
 #include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/math_ops.h"
+#include "tensorflow/core/framework/shape_inference_testutil.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
@@ -83,4 +84,41 @@ TEST(DecodeWavOpTest, DecodeWavTest) {
   EXPECT_EQ(14099, sample_rate);
 }
 
+TEST(DecodeWavOpTest, DecodeWav_ShapeFn) {
+  ShapeInferenceTestOp op("DecodeWav");
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "[1]");
+
+  // audio shape is unknown when desired_{samples,channels} are default.
+  TF_ASSERT_OK(NodeDefBuilder("test", "DecodeWav")
+                   .Input({"a", 0, DT_STRING})
+                   .Finalize(&op.node_def));
+  INFER_OK(op, "[]", "[?,?];[]");
+
+  TF_ASSERT_OK(NodeDefBuilder("test", "DecodeWav")
+                   .Input({"a", 0, DT_STRING})
+                   .Attr("desired_samples", 42)
+                   .Finalize(&op.node_def));
+  INFER_OK(op, "[]", "[42,?];[]");
+
+  // Negative sample value is rejected.
+  TF_ASSERT_OK(NodeDefBuilder("test", "DecodeWav")
+                   .Input({"a", 0, DT_STRING})
+                   .Attr("desired_samples", -2)
+                   .Finalize(&op.node_def));
+  INFER_ERROR("samples must be non-negative, got -2", op, "[]");
+
+  TF_ASSERT_OK(NodeDefBuilder("test", "DecodeWav")
+                   .Input({"a", 0, DT_STRING})
+                   .Attr("desired_channels", 2)
+                   .Finalize(&op.node_def));
+  INFER_OK(op, "[]", "[?,2];[]");
+
+  // Negative channel value is rejected.
+  TF_ASSERT_OK(NodeDefBuilder("test", "DecodeWav")
+                   .Input({"a", 0, DT_STRING})
+                   .Attr("desired_channels", -2)
+                   .Finalize(&op.node_def));
+  INFER_ERROR("channels must be non-negative, got -2", op, "[]");
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_functor.cc b/tensorflow/core/kernels/dense_update_functor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a878fe9a97059e2d55164600923c4a2e1312161b
--- /dev/null
+++ b/tensorflow/core/kernels/dense_update_functor.cc
@@ -0,0 +1,73 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/dense_update_functor.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+template <>
+struct DenseUpdate<CPUDevice, string, ASSIGN> {
+  void operator()(const CPUDevice& d, typename TTypes<string>::Flat params,
+                  typename TTypes<string>::ConstFlat update) {
+    if (params.dimension(0) == 1) {
+      params.data()->resize(update.data()->size());
+      auto work = [&params, &update](int64 start, int64 end) {
+        memmove(const_cast<char*>(params.data()->data()) + start,
+                update.data()->data() + start, end - start);
+      };
+      d.parallelFor(update.data()->size(),
+                    Eigen::TensorOpCost(.1,  // chosen to force large chunks
+                                        .1, 0),
+                    work);
+    } else {
+      auto work = [&params, &update](int64 start, int64 end) {
+        for (int i = start; i < end; ++i) {
+          params.data()[i].resize(update.data()[i].size());
+          memmove(const_cast<char*>(params.data()[i].data()),
+                  update.data()[i].data(), update.data()[i].size());
+        }
+      };
+      int64 estimated_string_size;
+      if (update.size() > 0) {
+        // first element of the tensor seems as good a guess as any of the sizes
+        // of the strings contained within...
+        estimated_string_size =
+            std::max(update.data()[0].size(), sizeof(string));
+      } else {
+        estimated_string_size = sizeof(string);
+      }
+      d.parallelFor(
+          params.dimension(0),
+          Eigen::TensorOpCost(estimated_string_size, estimated_string_size, 0),
+          work);
+    }
+  }
+};
+
+}  // namespace functor
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_functor.h b/tensorflow/core/kernels/dense_update_functor.h
new file mode 100644
index 0000000000000000000000000000000000000000..4aefe26c545ee5eaf3868b73cd9ace38fd135f53
--- /dev/null
+++ b/tensorflow/core/kernels/dense_update_functor.h
@@ -0,0 +1,94 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_DENSE_UPDATE_FUNCTOR_H_
+#define TENSORFLOW_KERNELS_DENSE_UPDATE_FUNCTOR_H_
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif  // TENSORFLOW_USE_SYCL
+
+enum DenseUpdateType { ADD, SUB, ASSIGN };
+
+namespace functor {
+
+template <typename Device, typename T, DenseUpdateType OP>
+struct DenseUpdate {
+  void operator()(const Device& d, typename TTypes<T>::Flat params,
+                  typename TTypes<T>::ConstFlat update);
+};
+
+template <typename T>
+struct DenseUpdate<CPUDevice, T, ADD> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat params,
+                  typename TTypes<T>::ConstFlat update) {
+    params.device(d) += update;
+  }
+};
+
+template <typename T>
+struct DenseUpdate<CPUDevice, T, SUB> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat params,
+                  typename TTypes<T>::ConstFlat update) {
+    params.device(d) -= update;
+  }
+};
+
+template <typename T>
+struct DenseUpdate<CPUDevice, T, ASSIGN> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat params,
+                  typename TTypes<T>::ConstFlat update) {
+    params.device(d) = update;
+  }
+};
+
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+struct DenseUpdate<SYCLDevice, T, ADD> {
+  void operator()(const SYCLDevice& d, typename TTypes<T>::Flat params,
+                  typename TTypes<T>::ConstFlat update) {
+    params.device(d) += update;
+  }
+};
+
+template <typename T>
+struct DenseUpdate<SYCLDevice, T, SUB> {
+  void operator()(const SYCLDevice& d, typename TTypes<T>::Flat params,
+                  typename TTypes<T>::ConstFlat update) {
+    params.device(d) -= update;
+  }
+};
+
+template <typename T>
+struct DenseUpdate<SYCLDevice, T, ASSIGN> {
+  void operator()(const SYCLDevice& d, typename TTypes<T>::Flat params,
+                  typename TTypes<T>::ConstFlat update) {
+    params.device(d) = update;
+  }
+};
+#endif  // TENSORFLOW_USE_SYCL
+
+}  // end namespace functor
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_DENSE_UPDATE_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..208401cb24e9c7ebf28e42ccb2762764474a5377
--- /dev/null
+++ b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
@@ -0,0 +1,69 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/dense_update_functor.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+template <typename T>
+struct DenseUpdate<GPUDevice, T, ASSIGN> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat params,
+                  typename TTypes<T>::ConstFlat update) {
+    params.device(d) = update;
+  }
+};
+
+template <typename T>
+struct DenseUpdate<GPUDevice, T, ADD> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat params,
+                  typename TTypes<T>::ConstFlat update) {
+    params.device(d) += update;
+  }
+};
+
+template <typename T>
+struct DenseUpdate<GPUDevice, T, SUB> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat params,
+                  typename TTypes<T>::ConstFlat update) {
+    params.device(d) -= update;
+  }
+};
+
+}  // namespace functor
+
+#define DEFINE_GPU_KERNELS(T)                              \
+  template struct functor::DenseUpdate<GPUDevice, T, ADD>; \
+  template struct functor::DenseUpdate<GPUDevice, T, SUB>;
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
+#undef DEFINE_GPU_KERNELS
+
+#define DEFINE_GPU_KERNELS(T) \
+  template struct functor::DenseUpdate<GPUDevice, T, ASSIGN>;
+TF_CALL_GPU_ALL_TYPES(DEFINE_GPU_KERNELS);
+#undef DEFINE_GPU_KERNELS
+
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc
index 33991fa1f9d5b10f89b85cf2c7b709d111656cd1..6d44a92fa3c2d22ade6293d30b4f008a62eb8e0f 100644
--- a/tensorflow/core/kernels/dense_update_ops.cc
+++ b/tensorflow/core/kernels/dense_update_ops.cc
@@ -15,59 +15,20 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/core/kernels/dense_update_ops.h"
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif
+
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/assign_op.h"
+#include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
-namespace functor {
-
-template <>
-struct DenseUpdate<Eigen::ThreadPoolDevice, string, ASSIGN> {
-  void operator()(const Eigen::ThreadPoolDevice& d,
-                  typename TTypes<string>::Flat params,
-                  typename TTypes<string>::ConstFlat update) {
-    if (params.dimension(0) == 1) {
-      params.data()->resize(update.data()->size());
-      auto work = [&params, &update](int64 start, int64 end) {
-        memmove(const_cast<char*>(params.data()->data()) + start,
-                update.data()->data() + start, end - start);
-      };
-      d.parallelFor(update.data()->size(),
-                    Eigen::TensorOpCost(.1,  // chosen to force large chunks
-                                        .1, 0),
-                    work);
-    } else {
-      auto work = [&params, &update](int64 start, int64 end) {
-        for (int i = start; i < end; ++i) {
-          params.data()[i].resize(update.data()[i].size());
-          memmove(const_cast<char*>(params.data()[i].data()),
-                  update.data()[i].data(), update.data()[i].size());
-        }
-      };
-      int64 estimated_string_size;
-      if (update.size() > 0) {
-        // first element of the tensor seems as good a guess as any of the sizes
-        // of the strings contained within...
-        estimated_string_size =
-            std::max(update.data()[0].size(), sizeof(string));
-      } else {
-        estimated_string_size = sizeof(string);
-      }
-      d.parallelFor(
-          params.dimension(0),
-          Eigen::TensorOpCost(estimated_string_size, estimated_string_size, 0),
-          work);
-    }
-  }
-};
-
-}  // namespace functor
 
 template <typename Device, typename T>
 class AssignOpT : public AssignOp {
@@ -111,13 +72,13 @@ class DenseUpdateOp : public OpKernel {
     OP_REQUIRES(context, Tparams.IsInitialized(),
                 errors::FailedPrecondition("Attempting to use uninitialized "
                                            "parameters: ",
-                                           def().input(0)));
+                                           requested_input(0)));
     OP_REQUIRES(
         context, Tparams.IsSameSize(Tupdate),
         errors::InvalidArgument("Parameters and update must be the same size"));
 
     functor::DenseUpdate<Device, T, OP> update_functor;
-    update_functor(context->eigen_device<Device>(), Tparams.flat<T>(),
+    update_functor(context->template eigen_device<Device>(), Tparams.flat<T>(),
                    Tupdate.flat<T>());
   }
 
@@ -143,18 +104,11 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS);
 // Only register 'Assign' on GPU for the subset of types also supported by
 // 'Variable' (see variable_ops.cc.)
 #define REGISTER_GPU_KERNELS(type)                                 \
-  namespace functor {                                              \
-  template <>                                                      \
-  void DenseUpdate<GPUDevice, type, ASSIGN>::operator()(           \
-      const GPUDevice& d, typename TTypes<type>::Flat lhs,         \
-      typename TTypes<type>::ConstFlat rhs);                       \
-  extern template struct DenseUpdate<GPUDevice, type, ASSIGN>;     \
-  }                                                                \
   REGISTER_KERNEL_BUILDER(                                         \
       Name("Assign").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
       AssignOpT<GPUDevice, type>);
 
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 #endif  // GOOGLE_CUDA
 
@@ -180,22 +134,6 @@ TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
 #if GOOGLE_CUDA
-// Forward declarations of the functor specializations for GPU.
-namespace functor {
-#define DECLARE_GPU_SPEC_FOR_OP(T, OP)                     \
-  template <>                                              \
-  void DenseUpdate<GPUDevice, T, OP>::operator()(          \
-      const GPUDevice& d, typename TTypes<T>::Flat params, \
-      typename TTypes<T>::ConstFlat update);               \
-  extern template struct DenseUpdate<GPUDevice, T, OP>;
-#define DECLARE_GPU_SPEC(T)                         \
-  DECLARE_GPU_SPEC_FOR_OP(T, DenseUpdateType::ADD); \
-  DECLARE_GPU_SPEC_FOR_OP(T, DenseUpdateType::SUB)
-TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
-#undef DECLARE_GPU_SPEC
-#undef DECLARE_GPU_SPEC_FOR_OP
-}  // namespace functor
-
 #define REGISTER_GPU_KERNELS(type)                                    \
   REGISTER_KERNEL_BUILDER(                                            \
       Name("AssignAdd").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
diff --git a/tensorflow/core/kernels/dense_update_ops.h b/tensorflow/core/kernels/dense_update_ops.h
deleted file mode 100644
index ec7e9dff116e55b79ecb9ac0ae014397754ffcef..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/dense_update_ops.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_KERNELS_DENSE_UPDATE_OPS_H_
-#define TENSORFLOW_KERNELS_DENSE_UPDATE_OPS_H_
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/tensor_types.h"
-
-namespace tensorflow {
-
-enum DenseUpdateType { ADD, SUB, ASSIGN };
-
-namespace functor {
-
-template <typename Device, typename T, DenseUpdateType OP>
-struct DenseUpdate;
-
-template <typename Device, typename T>
-struct DenseUpdate<Device, T, ADD> {
-  void operator()(const Device& d, typename TTypes<T>::Flat params,
-                  typename TTypes<T>::ConstFlat update) {
-    params.device(d) += update;
-  }
-};
-
-template <typename Device, typename T>
-struct DenseUpdate<Device, T, SUB> {
-  void operator()(const Device& d, typename TTypes<T>::Flat params,
-                  typename TTypes<T>::ConstFlat update) {
-    params.device(d) -= update;
-  }
-};
-
-template <typename Device, typename T>
-struct DenseUpdate<Device, T, ASSIGN> {
-  void operator()(const Device& d, typename TTypes<T>::Flat params,
-                  typename TTypes<T>::ConstFlat update) {
-    params.device(d) = update;
-  }
-};
-
-}  // end namespace functor
-}  // end namespace tensorflow
-
-#endif  // TENSORFLOW_KERNELS_DENSE_UPDATE_OPS_H_
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index be9fc5de693b3482d98aa9e0808cccae69d95adf..f63a99a73088a3c5c339cf36e960b4771694013e 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -59,6 +59,10 @@ EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(
          args.filter_rows * args.filter_cols <= args.in_cols * block_rows;
 }
 
+// The DepthwiseConv2dGPUKernels perform either forward or backprop input
+// convolution depending on a template argument of this enum.
+enum DepthwiseConv2dDirection { DIRECTION_FORWARD, DIRECTION_BACKWARD };
+
 // A Cuda kernel to compute the depthwise convolution forward pass
 // in NHWC format.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
@@ -148,8 +152,11 @@ __global__ void __launch_bounds__(1024, 2)
 // Tiles of the input and filter tensors are loaded into shared memory before
 // performing the convolution. Each thread handles two elements per iteration,
 // one each in the lower and upper half of a tile.
-template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
-          int kBlockSlices, bool kKnownEvenRows>
+// Backprop input direction is the same as forward direction with the filter
+// rotated by 180°.
+template <typename T, DepthwiseConv2dDirection kDirection,
+          int kKnownFilterWidth, int kKnownFilterHeight, int kBlockSlices,
+          bool kKnownEvenRows>
 __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
     const DepthwiseArgs args, const T* input, const T* filter, T* output) {
   assert(CanLaunchDepthwiseConv2dGPUSmall(args));
@@ -217,7 +224,9 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
   const int max_depth = in_depth - thread_depth;
   const int filter_write_offset =
       thread_pix < filter_pixels ? tile_size + thread_idx : 0;
-  const int filter_read_offset = tile_size + thread_depth;
+  const int filter_read_offset =
+      tile_size + thread_depth +
+      (kDirection == DIRECTION_FORWARD ? 0 : filter_pixels * kBlockSlices);
   const bool skip_second =
       !kKnownEvenRows && thread_row + (in_rows & 1) == block_rows;
 
@@ -253,12 +262,17 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
       const T* filter_ptr = filter_read_offset + shared_data;
       UNROLL for (int r = 0; r < filter_rows; ++r) {
         UNROLL for (int c = 0; c < filter_cols; ++c) {
+          if (kDirection == DIRECTION_BACKWARD) {
+            filter_ptr -= kBlockSlices;
+          }
           const T filter_value = *filter_ptr;
           const T* const tile_ptr = shared_offset + shared_data;
           sum1 += filter_value * tile_ptr[0];
           sum2 += filter_value * tile_ptr[tile_offset];
           shared_offset += kBlockSlices;
-          filter_ptr += kBlockSlices;
+          if (kDirection == DIRECTION_FORWARD) {
+            filter_ptr += kBlockSlices;
+          }
         }
         shared_offset += in_increment;
       }
@@ -408,8 +422,11 @@ __global__ void __launch_bounds__(1024, 2)
 // Tiles of the input and filter tensors are loaded into shared memory before
 // performing the convolution. Each thread handles two elements per iteration,
 // one each in the lower and upper half of a tile.
-template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
-          int kBlockSlices, bool kKnownEvenRows>
+// Backprop input direction is the same as forward direction with the filter
+// rotated by 180°.
+template <typename T, DepthwiseConv2dDirection kDirection,
+          int kKnownFilterWidth, int kKnownFilterHeight, int kBlockSlices,
+          bool kKnownEvenRows>
 __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
     const DepthwiseArgs args, const T* input, const T* filter, T* output) {
   assert(CanLaunchDepthwiseConv2dGPUSmall(args));
@@ -480,7 +497,9 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
   const int max_slice = in_slices - thread_depth;
   const int filter_write_offset =
       filter_pix < filter_pixels ? tile_size + thread_idx : 0;
-  const int filter_read_offset = tile_size + thread_depth;
+  const int filter_read_offset =
+      tile_size + thread_depth +
+      (kDirection == DIRECTION_FORWARD ? 0 : filter_pixels * kBlockSlices);
   const bool skip_second =
       !kKnownEvenRows && thread_row + (in_rows & 1) == block_rows;
 
@@ -514,12 +533,17 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
       const T* filter_ptr = filter_read_offset + shared_data;
       UNROLL for (int r = 0; r < filter_rows; ++r) {
         UNROLL for (int c = 0; c < filter_cols; ++c) {
+          if (kDirection == DIRECTION_BACKWARD) {
+            filter_ptr -= kBlockSlices;
+          }
           const T filter_value = *filter_ptr;
           const T* const tile_ptr = shared_offset + shared_data;
           sum1 += filter_value * tile_ptr[0];
           sum2 += filter_value * tile_ptr[tile_offset];
           ++shared_offset;
-          filter_ptr += kBlockSlices;
+          if (kDirection == DIRECTION_FORWARD) {
+            filter_ptr += kBlockSlices;
+          }
         }
         shared_offset += in_increment;
       }
@@ -535,83 +559,80 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
   }
 }
 
-template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
-          int kBlockSlices, bool kKnownEvenRows>
+template <typename T, DepthwiseConv2dDirection kDirection,
+          int kKnownFilterWidth, int kKnownFilterHeight, int kBlockSlices,
+          bool kKnownEvenRows>
 void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
                                    const T* input, const T* filter, T* output,
                                    TensorFormat data_format) {
   const int block_rows = (args.in_rows + 1) / 2;
+  dim3 block_dim;
+  void (*kernel)(const DepthwiseArgs, const T*, const T*, T*);
+  if (data_format == FORMAT_NHWC) {
+    block_dim = dim3(kBlockSlices, args.in_cols, block_rows);
+    kernel = DepthwiseConv2dGPUKernelNHWCSmall<T, kDirection, kKnownFilterWidth,
+                                               kKnownFilterHeight, kBlockSlices,
+                                               kKnownEvenRows>;
+  } else if (data_format == FORMAT_NCHW) {
+    block_dim = dim3(args.in_cols, block_rows, kBlockSlices);
+    kernel = DepthwiseConv2dGPUKernelNCHWSmall<T, kDirection, kKnownFilterWidth,
+                                               kKnownFilterHeight, kBlockSlices,
+                                               kKnownEvenRows>;
+  } else {
+    assert(false && "Incorrect data format");
+    return;
+  }
   const int tile_cols = args.in_cols + args.filter_cols - 1;
   const int tile_rows = block_rows * 2 + args.filter_rows - 1;
   const int tile_pixels = tile_rows * tile_cols;
   const int filter_pixels = args.filter_rows * args.filter_cols;
-
   const int shared_memory_size =
       kBlockSlices * (tile_pixels + filter_pixels) * sizeof(T);
   const int num_outputs =
       args.batch * args.out_rows * args.out_cols * args.out_depth;
-
-  if (data_format == FORMAT_NHWC) {
-    dim3 block_dim = dim3(kBlockSlices, args.in_cols, block_rows);
-    CudaLaunchConfig config = GetCudaLaunchConfig(
-        num_outputs, d,
-        DepthwiseConv2dGPUKernelNHWCSmall<T, kKnownFilterWidth,
-                                          kKnownFilterHeight, kBlockSlices,
-                                          kKnownEvenRows>,
-        shared_memory_size, block_dim.x * block_dim.y * block_dim.z);
-    DepthwiseConv2dGPUKernelNHWCSmall<T, kKnownFilterWidth, kKnownFilterHeight,
-                                      kBlockSlices, kKnownEvenRows>
-        <<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
-            args, input, filter, output);
-  } else if (data_format == FORMAT_NCHW) {
-    dim3 block_dim = dim3(args.in_cols, block_rows, kBlockSlices);
-    CudaLaunchConfig config = GetCudaLaunchConfig(
-        num_outputs, d,
-        DepthwiseConv2dGPUKernelNCHWSmall<T, kKnownFilterWidth,
-                                          kKnownFilterHeight, kBlockSlices,
-                                          kKnownEvenRows>,
-        shared_memory_size, block_dim.x * block_dim.y * block_dim.z);
-    DepthwiseConv2dGPUKernelNCHWSmall<T, kKnownFilterWidth, kKnownFilterHeight,
-                                      kBlockSlices, kKnownEvenRows>
-        <<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
-            args, input, filter, output);
-  } else {
-    assert(false && "Incorrect data format");
-  }
+  CudaLaunchConfig config =
+      GetCudaLaunchConfig(num_outputs, d, kernel, shared_memory_size,
+                          block_dim.x * block_dim.y * block_dim.z);
+  kernel<<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
+      args, input, filter, output);
 }
 
-template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
-          int kBlockSlices>
+template <typename T, DepthwiseConv2dDirection kDirection,
+          int kKnownFilterWidth, int kKnownFilterHeight, int kBlockSlices>
 void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
                                    const T* input, const T* filter, T* output,
                                    TensorFormat data_format) {
   if (args.in_rows & 1) {
-    LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight,
-                                  kBlockSlices, false>(d, args, input, filter,
-                                                       output, data_format);
+    LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth,
+                                  kKnownFilterHeight, kBlockSlices, false>(
+        d, args, input, filter, output, data_format);
   } else {
-    LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight,
-                                  kBlockSlices, true>(d, args, input, filter,
-                                                      output, data_format);
+    LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth,
+                                  kKnownFilterHeight, kBlockSlices, true>(
+        d, args, input, filter, output, data_format);
   }
 }
 
-template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
+template <typename T, DepthwiseConv2dDirection kDirection,
+          int kKnownFilterWidth, int kKnownFilterHeight>
 void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
                                    const T* input, const T* filter, T* output,
                                    TensorFormat data_format) {
   // Maximize (power of two) kBlockSlices while keeping a block within 1024
   // threads (2 pixels per thread).
-  const int in_pixels = args.in_rows * args.in_cols;
-  if (in_pixels > 512) {
-    LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight, 2>(
-        d, args, input, filter, output, data_format);
-  } else if (in_pixels > 256) {
-    LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight, 4>(
-        d, args, input, filter, output, data_format);
+  const int block_pixels = (args.in_rows + 1) / 2 * args.in_cols;
+  if (block_pixels > 256) {
+    LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth,
+                                  kKnownFilterHeight, 2>(d, args, input, filter,
+                                                         output, data_format);
+  } else if (block_pixels > 128) {
+    LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth,
+                                  kKnownFilterHeight, 4>(d, args, input, filter,
+                                                         output, data_format);
   } else {
-    LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight, 8>(
-        d, args, input, filter, output, data_format);
+    LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth,
+                                  kKnownFilterHeight, 8>(d, args, input, filter,
+                                                         output, data_format);
   }
 }
 
@@ -620,38 +641,30 @@ template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
 void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
                               const T* input, const T* filter, T* output,
                               TensorFormat data_format) {
+  void (*kernel)(const DepthwiseArgs, const T*, const T*, T*, int);
+  if (data_format == FORMAT_NHWC) {
+    kernel =
+        DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
+                                     kKnownDepthMultiplier>;
+  } else if (data_format == FORMAT_NCHW) {
+    kernel =
+        DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
+                                     kKnownDepthMultiplier>;
+  } else {
+    assert(false && "Incorrect data format");
+    return;
+  }
   const int num_outputs =
       args.batch * args.out_rows * args.out_cols * args.out_depth;
+  CudaLaunchConfig config = GetCudaLaunchConfig(num_outputs, d, kernel, 0, 0);
   // The compile-time constant version runs faster with a single block.
   const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 ||
                                       kKnownDepthMultiplier < 0
                                   ? std::numeric_limits<int>::max()
                                   : d.getNumCudaMultiProcessors();
-  if (data_format == FORMAT_NHWC) {
-    CudaLaunchConfig config = GetCudaLaunchConfig(
-        num_outputs, d,
-        DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
-                                     kKnownDepthMultiplier>,
-        0, 0);
-    DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
-                                 kKnownDepthMultiplier>
-        <<<std::min(max_block_count, config.block_count),
-           config.thread_per_block, 0, d.stream()>>>(args, input, filter,
-                                                     output, num_outputs);
-  } else if (data_format == FORMAT_NCHW) {
-    CudaLaunchConfig config = GetCudaLaunchConfig(
-        num_outputs, d,
-        DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
-                                     kKnownDepthMultiplier>,
-        0, 0);
-    DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
-                                 kKnownDepthMultiplier>
-        <<<std::min(max_block_count, config.block_count),
+  kernel<<<std::min(max_block_count, config.block_count),
            config.thread_per_block, 0, d.stream()>>>(args, input, filter,
                                                      output, num_outputs);
-  } else {
-    assert(false && "Incorrect data format");
-  }
 }
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
@@ -660,8 +673,9 @@ void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
                               TensorFormat data_format) {
   if (args.depth_multiplier == 1) {
     if (CanLaunchDepthwiseConv2dGPUSmall(args)) {
-      LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight>(
-          d, args, input, filter, output, data_format);
+      LaunchDepthwiseConv2dGPUSmall<T, DIRECTION_FORWARD, kKnownFilterWidth,
+                                    kKnownFilterHeight>(d, args, input, filter,
+                                                        output, data_format);
       return;
     }
 
@@ -756,145 +770,6 @@ __global__ void __launch_bounds__(640, 2)
   }
 }
 
-// CUDA kernel to compute the depthwise convolution backward w.r.t. input in
-// NHWC format, tailored for small images up to 32x32. Stride and depth
-// multiplier must be 1. Padding must be 'SAME', which allows to reuse the index
-// computation. Only use this kernel if CanLaunchDepthwiseConv2dGPUSmall(args)
-// returns true.
-// Implementation is the same as the forward pass, except that the filter is
-// rotate by 180°, see filter_read_offset and filter_ptr.
-// Tiles of the input and filter tensors are loaded into shared memory before
-// performing the convolution. Each thread handles two elements per iteration,
-// one each in the lower and upper half of a tile.
-template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
-          int kBlockSlices, bool kKnownEvenRows>
-__global__
-__launch_bounds__(1024, 2) void DepthwiseConv2dBackpropInputGPUKernelNHWCSmall(
-    const DepthwiseArgs args, const T* input, const T* filter, T* output) {
-  assert(CanLaunchDepthwiseConv2dGPUSmall(args));
-  // Holds block plus halo and filter data for blockDim.x depths.
-  extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
-  T* const shared_data = reinterpret_cast<T*>(shared_memory);
-
-  const int batches = args.batch;
-  const int in_rows = args.in_rows;
-  const int in_cols = args.in_cols;
-  const int in_depth = args.in_depth;
-  const int filter_rows =
-      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
-  const int filter_cols =
-      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
-  const int pad_rows = args.pad_rows;
-  const int pad_cols = args.pad_cols;
-
-  // Fixed blockDim.x, corresponding to Pascal's global load granularity of 32B.
-  const int block_rows = blockDim.z;
-
-  // These values are the same for all threads and could
-  // be precomputed on the CPU.
-  const int block_size = block_rows * in_cols * kBlockSlices;
-  const int in_row_size = in_cols * in_depth;
-  const int in_size = in_rows * in_row_size;
-  const int in_increment = (in_cols - 1) * kBlockSlices;
-  const int filter_pixels = filter_rows * filter_cols;
-  const int tile_cols = in_cols + filter_cols - 1;
-  const int even_rows = kKnownEvenRows || (1 & ~in_rows);
-  const int tile_rows = in_rows + filter_rows - even_rows;
-  const int tile_row_size = tile_cols * kBlockSlices;
-  const int tile_size = tile_rows * tile_row_size;
-  const int tile_offset = block_rows * tile_row_size;
-  const int pad_offset = pad_rows * tile_cols + pad_cols;
-  const int batch_blocks = (in_depth + kBlockSlices - 1) / kBlockSlices;
-  const int in_blocks = batch_blocks * batches;
-  const int tensor_offset =
-      kKnownEvenRows ? in_size / 2 : block_rows * in_row_size;
-
-  const int thread_depth = threadIdx.x;
-  const int thread_col = threadIdx.y;
-  const int thread_row = threadIdx.z;
-
-  // Position in block.
-  const int thread_pix = thread_row * in_cols + thread_col;
-  const int thread_idx = thread_pix * kBlockSlices + thread_depth;
-
-  // Initialize tile, in particular the padding.
-  for (int i = thread_idx; i < tile_size; i += block_size) {
-    shared_data[i] = T(0);
-  }
-  __syncthreads();
-
-  // Position in tensors.
-  const int tensor_idx = thread_pix * in_depth + thread_depth;
-
-  // Position in (padded) shared memory.
-  const int data_pix = thread_row * tile_cols + thread_col;
-  const int data_idx = data_pix * kBlockSlices + thread_depth;
-
-  // Position in shared memory, offset by pad_rows / pad_cols.
-  const int tile_pix = data_pix + pad_offset;
-  const int tile_idx = tile_pix * kBlockSlices + thread_depth;
-
-  const int max_depth = in_depth - thread_depth;
-  const int filter_write_offset =
-      thread_pix < filter_pixels ? tile_size + thread_idx : 0;
-  const int filter_read_offset =
-      tile_size + filter_pixels * kBlockSlices + thread_depth;
-  const bool skip_second =
-      !kKnownEvenRows && thread_row + (in_rows & 1) == block_rows;
-
-  for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) {
-    const int batch = b / batch_blocks;
-    const int stack = b - batch * batch_blocks;
-
-    const int start_depth = stack * kBlockSlices;
-    const int filter_offset = tensor_idx + start_depth;
-    const int inout_offset = batch * in_size + filter_offset;
-    const bool depth_in_range = start_depth < max_depth;
-
-    if (depth_in_range) {
-      const T* const in_ptr = inout_offset + input;
-      T* const tile_ptr = tile_idx + shared_data;
-      tile_ptr[0] = ldg(in_ptr);
-      if (!skip_second) {
-        tile_ptr[tile_offset] = ldg(tensor_offset + in_ptr);
-      }
-
-      if (filter_write_offset != 0) {
-        shared_data[filter_write_offset] = ldg(filter_offset + filter);
-      }
-    }
-
-    // Note: the condition to reach this is uniform across the entire block.
-    __syncthreads();
-
-    if (depth_in_range) {
-      T sum1 = 0;
-      T sum2 = 0;
-      int shared_offset = data_idx;
-      const T* filter_ptr = filter_read_offset + shared_data;
-      UNROLL for (int r = 0; r < filter_rows; ++r) {
-        UNROLL for (int c = 0; c < filter_cols; ++c) {
-          filter_ptr -= kBlockSlices;
-          const T filter_value = *filter_ptr;
-          const T* const tile_ptr = shared_offset + shared_data;
-          sum1 += filter_value * tile_ptr[0];
-          sum2 += filter_value * tile_ptr[tile_offset];
-          shared_offset += kBlockSlices;
-        }
-        shared_offset += in_increment;
-      }
-      T* const out_ptr = inout_offset + output;
-      out_ptr[0] = sum1;
-      if (!skip_second) {
-        out_ptr[tensor_offset] = sum2;
-      }
-    }
-
-    // Note: the condition to reach this is uniform across the entire block.
-    __syncthreads();
-  }
-}
-
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kKnownDepthMultiplier>
 __global__ void __launch_bounds__(640, 2)
@@ -966,234 +841,6 @@ __global__ void __launch_bounds__(640, 2)
   }
 }
 
-// CUDA kernel to compute the depthwise convolution backward w.r.t. input in
-// NHWC format, tailored for small images up to 32x32. Stride and depth
-// multiplier must be 1. Padding must be 'SAME', which allows to reuse the index
-// computation. Only use this kernel if CanLaunchDepthwiseConv2dGPUSmall(args)
-// returns true.
-// Implementation is the same as the forward pass, except that the filter is
-// rotate by 180°, see filter_read_offset and filter_ptr.
-// Tiles of the input and filter tensors are loaded into shared memory before
-// performing the convolution. Each thread handles two elements per iteration,
-// one each in the lower and upper half of a tile.
-template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
-          int kBlockSlices, bool kKnownEvenRows>
-__global__
-__launch_bounds__(1024, 2) void DepthwiseConv2dBackpropInputGPUKernelNCHWSmall(
-    const DepthwiseArgs args, const T* input, const T* filter, T* output) {
-  assert(CanLaunchDepthwiseConv2dGPUSmall(args));
-  // Holds block plus halo and filter data for blockDim.z depths.
-  extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
-  T* const shared_data = reinterpret_cast<T*>(shared_memory);
-
-  const int batches = args.batch;
-  const int in_rows = args.in_rows;
-  const int in_cols = args.in_cols;
-  const int in_depth = args.in_depth;
-  const int filter_rows =
-      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
-  const int filter_cols =
-      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
-  const int pad_rows = args.pad_rows;
-  const int pad_cols = args.pad_cols;
-
-  // Fixed blockDim.z, tailored for maximum grid size for images of size 16x16.
-  const int block_rows = blockDim.y;
-
-  // These values are the same for all threads and could
-  // be precomputed on the CPU.
-  const int block_pixels = in_cols * block_rows;
-  const int block_size = block_pixels * kBlockSlices;
-  const int in_pixels = in_cols * in_rows;
-  const int in_increment = in_cols - 1;
-  const int filter_pixels = filter_rows * filter_cols;
-  const int tile_cols = in_cols + filter_cols - 1;
-  const int even_rows = kKnownEvenRows || (1 & ~in_rows);
-  const int tile_rows = in_rows + filter_rows - even_rows;
-  const int tile_pixels = tile_cols * tile_rows;
-  const int tile_size = tile_pixels * kBlockSlices;
-  const int tile_offset = block_rows * tile_cols;
-  const int pad_offset = pad_rows * tile_cols + pad_cols;
-  const int in_slices = in_depth * batches;
-  const int in_blocks = (in_slices + kBlockSlices - 1) / kBlockSlices;
-
-  const int thread_col = threadIdx.x;
-  const int thread_row = threadIdx.y;
-  const int thread_depth = threadIdx.z;
-
-  // Position in block.
-  const int thread_pix = thread_row * in_cols + thread_col;
-  const int thread_idx = thread_depth * block_pixels + thread_pix;
-
-  // Initialize tile, in particular the padding.
-  for (int i = thread_idx; i < tile_size; i += block_size) {
-    shared_data[i] = T(0);
-  }
-  __syncthreads();
-
-  // Position in tensors.
-  const int tensor_idx = thread_depth * in_pixels + thread_pix;
-
-  // Position in (padded) shared memory.
-  const int data_pix = thread_row * tile_cols + thread_col;
-  const int data_idx = thread_depth * tile_pixels + data_pix;
-
-  // Position in shared memory, offset by pad_rows / pad_cols.
-  const int tile_idx = data_idx + pad_offset;
-
-  // Filter is always in HWCK format, irrespective of the input/output format.
-  const int filter_pix = thread_idx / kBlockSlices;
-  const int filter_depth = thread_idx % kBlockSlices;
-  const int filter_idx = filter_pix * in_depth;
-
-  const int max_slice = in_slices - thread_depth;
-  const int filter_write_offset =
-      filter_pix < filter_pixels ? tile_size + thread_idx : 0;
-  const int filter_read_offset =
-      tile_size + filter_pixels * kBlockSlices + thread_depth;
-  const bool skip_second =
-      !kKnownEvenRows && thread_row + (in_rows & 1) == block_rows;
-
-  for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) {
-    const int slice = b * kBlockSlices;
-
-    const int inout_offset = slice * in_pixels + tensor_idx;
-    const bool slice_in_range = slice < max_slice;
-
-    if (slice_in_range) {
-      const T* const in_ptr = inout_offset + input;
-      T* const tile_ptr = tile_idx + shared_data;
-      tile_ptr[0] = ldg(in_ptr);
-      if (!skip_second) {
-        tile_ptr[tile_offset] = ldg(block_pixels + in_ptr);
-      }
-    }
-
-    if (filter_write_offset != 0) {
-      const int filter_offset = filter_idx + (slice + filter_depth) % in_depth;
-      shared_data[filter_write_offset] = ldg(filter_offset + filter);
-    }
-
-    // Note: the condition to reach this is uniform across the entire block.
-    __syncthreads();
-
-    if (slice_in_range) {
-      T sum1 = 0;
-      T sum2 = 0;
-      int shared_offset = data_idx;
-      const T* filter_ptr = filter_read_offset + shared_data;
-      UNROLL for (int r = 0; r < filter_rows; ++r) {
-        UNROLL for (int c = 0; c < filter_cols; ++c) {
-          filter_ptr -= kBlockSlices;
-          const T filter_value = *filter_ptr;
-          const T* const tile_ptr = shared_offset + shared_data;
-          sum1 += filter_value * tile_ptr[0];
-          sum2 += filter_value * tile_ptr[tile_offset];
-          ++shared_offset;
-        }
-        shared_offset += in_increment;
-      }
-      T* const out_ptr = inout_offset + output;
-      out_ptr[0] = sum1;
-      if (!skip_second) {
-        out_ptr[block_pixels] = sum2;
-      }
-    }
-
-    // Note: the condition to reach this is uniform across the entire block.
-    __syncthreads();
-  }
-}
-
-template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
-          int kBlockSlices, bool kKnownEvenRows>
-void LaunchDepthwiseConv2dBackpropInputGPUSmall(const GpuDevice& d,
-                                                const DepthwiseArgs args,
-                                                const T* out_backprop,
-                                                const T* filter, T* in_backprop,
-                                                TensorFormat data_format) {
-  const int block_rows = (args.in_rows + 1) / 2;
-  const int tile_cols = args.in_cols + args.filter_cols - 1;
-  const int tile_rows = block_rows * 2 + args.filter_rows - 1;
-  const int tile_pixels = tile_rows * tile_cols;
-  const int filter_pixels = args.filter_rows * args.filter_cols;
-
-  const int shared_memory_size =
-      kBlockSlices * (tile_pixels + filter_pixels) * sizeof(T);
-  const int num_outputs =
-      args.batch * args.out_rows * args.out_cols * args.out_depth;
-
-  if (data_format == FORMAT_NHWC) {
-    dim3 block_dim = dim3(kBlockSlices, args.in_cols, block_rows);
-    CudaLaunchConfig config = GetCudaLaunchConfig(
-        num_outputs, d,
-        DepthwiseConv2dBackpropInputGPUKernelNHWCSmall<
-            T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices,
-            kKnownEvenRows>,
-        shared_memory_size, block_dim.x * block_dim.y * block_dim.z);
-    DepthwiseConv2dBackpropInputGPUKernelNHWCSmall<
-        T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices, kKnownEvenRows>
-        <<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
-            args, out_backprop, filter, in_backprop);
-  } else if (data_format == FORMAT_NCHW) {
-    dim3 block_dim = dim3(args.in_cols, block_rows, kBlockSlices);
-    CudaLaunchConfig config = GetCudaLaunchConfig(
-        num_outputs, d,
-        DepthwiseConv2dBackpropInputGPUKernelNCHWSmall<
-            T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices,
-            kKnownEvenRows>,
-        shared_memory_size, block_dim.x * block_dim.y * block_dim.z);
-    DepthwiseConv2dBackpropInputGPUKernelNCHWSmall<
-        T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices, kKnownEvenRows>
-        <<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
-            args, out_backprop, filter, in_backprop);
-  } else {
-    assert(false && "Incorrect data format");
-  }
-}
-
-template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
-          int kBlockSlices>
-void LaunchDepthwiseConv2dBackpropInputGPUSmall(const GpuDevice& d,
-                                                const DepthwiseArgs args,
-                                                const T* out_backprop,
-                                                const T* filter, T* in_backprop,
-                                                TensorFormat data_format) {
-  if (args.in_rows & 1) {
-    LaunchDepthwiseConv2dBackpropInputGPUSmall<
-        T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices, false>(
-        d, args, out_backprop, filter, in_backprop, data_format);
-  } else {
-    LaunchDepthwiseConv2dBackpropInputGPUSmall<
-        T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices, true>(
-        d, args, out_backprop, filter, in_backprop, data_format);
-  }
-}
-
-template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
-void LaunchDepthwiseConv2dBackpropInputGPUSmall(const GpuDevice& d,
-                                                const DepthwiseArgs args,
-                                                const T* input, const T* filter,
-                                                T* output,
-                                                TensorFormat data_format) {
-  // Maximize (power of two) kBlockSlices while keeping a block within 1024
-  // threads (2 pixels per thread).
-  const int in_pixels = args.in_rows * args.in_cols;
-  if (in_pixels > 512) {
-    LaunchDepthwiseConv2dBackpropInputGPUSmall<T, kKnownFilterWidth,
-                                               kKnownFilterHeight, 2>(
-        d, args, input, filter, output, data_format);
-  } else if (in_pixels > 256) {
-    LaunchDepthwiseConv2dBackpropInputGPUSmall<T, kKnownFilterWidth,
-                                               kKnownFilterHeight, 4>(
-        d, args, input, filter, output, data_format);
-  } else {
-    LaunchDepthwiseConv2dBackpropInputGPUSmall<T, kKnownFilterWidth,
-                                               kKnownFilterHeight, 8>(
-        d, args, input, filter, output, data_format);
-  }
-}
-
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kKnownDepthMultiplier>
 void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
@@ -1201,31 +848,23 @@ void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
                                            const T* out_backprop,
                                            const T* filter, T* in_backprop,
                                            TensorFormat data_format) {
-  const int num_in_backprop =
-      args.batch * args.in_rows * args.in_cols * args.in_depth;
+  void (*kernel)(const DepthwiseArgs, const T*, const T*, T*, int);
   if (data_format == FORMAT_NHWC) {
-    CudaLaunchConfig config = GetCudaLaunchConfig(
-        num_in_backprop, d,
-        DepthwiseConv2dBackpropInputGPUKernelNHWC<
-            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
-        0, 0);
-    DepthwiseConv2dBackpropInputGPUKernelNHWC<
-        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            args, out_backprop, filter, in_backprop, num_in_backprop);
+    kernel = DepthwiseConv2dBackpropInputGPUKernelNHWC<
+        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>;
   } else if (data_format == FORMAT_NCHW) {
-    CudaLaunchConfig config = GetCudaLaunchConfig(
-        num_in_backprop, d,
-        DepthwiseConv2dBackpropInputGPUKernelNCHW<
-            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
-        0, 0);
-    DepthwiseConv2dBackpropInputGPUKernelNCHW<
-        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            args, out_backprop, filter, in_backprop, num_in_backprop);
+    kernel = DepthwiseConv2dBackpropInputGPUKernelNCHW<
+        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>;
   } else {
     assert(false && "Incorrect data format");
+    return;
   }
+  const int num_in_backprop =
+      args.batch * args.in_rows * args.in_cols * args.in_depth;
+  CudaLaunchConfig config =
+      GetCudaLaunchConfig(num_in_backprop, d, kernel, 0, 0);
+  kernel<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+      args, out_backprop, filter, in_backprop, num_in_backprop);
 }
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
@@ -1236,8 +875,8 @@ void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
                                            TensorFormat data_format) {
   if (args.depth_multiplier == 1) {
     if (CanLaunchDepthwiseConv2dGPUSmall(args)) {
-      LaunchDepthwiseConv2dBackpropInputGPUSmall<T, kKnownFilterWidth,
-                                                 kKnownFilterHeight>(
+      LaunchDepthwiseConv2dGPUSmall<T, DIRECTION_BACKWARD, kKnownFilterWidth,
+                                    kKnownFilterHeight>(
           d, args, out_backprop, filter, in_backprop, data_format);
       return;
     }
@@ -1783,17 +1422,9 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kBlockSlices, int kAccumPixels>
 bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
-    const GpuDevice& d, const DepthwiseArgs args, const T* out_backprop,
-    const T* input, T* filter_backprop, TensorFormat data_format) {
-  int block_rows = (args.in_rows + 1) / 2;
-  // args.in_cols * block_rows * kBlockSlices must be multiple of 32.
-  for (int round_mask = 1; args.in_cols * block_rows * kBlockSlices & 31;
-       round_mask = round_mask * 2 + 1) {
-    block_rows = block_rows + round_mask & ~round_mask;
-  }
-  if (!CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, block_rows)) {
-    return false;
-  }
+    const GpuDevice& d, const DepthwiseArgs args, const int block_rows,
+    const T* out_backprop, const T* input, T* filter_backprop,
+    TensorFormat data_format) {
   const int tile_cols = args.in_cols + args.filter_cols - 1;
   const int tile_rows = block_rows * 2 + args.filter_rows - 1;
   const int tile_pixels = tile_rows * tile_cols;
@@ -1804,58 +1435,51 @@ bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
     return false;
   }
 
-  const int num_out_backprop =
-      args.batch * args.out_rows * args.out_cols * args.out_depth;
+  dim3 block_dim;
+  void (*kernel)(const DepthwiseArgs, const T*, const T*, T*);
   if (data_format == FORMAT_NHWC) {
-    dim3 block_dim = dim3(kBlockSlices, args.in_cols, block_rows);
-    CudaLaunchConfig config = GetCudaLaunchConfig(
-        num_out_backprop, d,
-        DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall<
-            T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices,
-            kAccumPixels>,
-        shared_memory_size, block_dim.x * block_dim.y * block_dim.z);
-    DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall<
-        T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices, kAccumPixels>
-        <<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
-            args, out_backprop, input, filter_backprop);
+    block_dim = dim3(kBlockSlices, args.in_cols, block_rows);
+    kernel = DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall<
+        T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices, kAccumPixels>;
   } else if (data_format == FORMAT_NCHW) {
-    dim3 block_dim = dim3(args.in_cols, block_rows, kBlockSlices);
-    CudaLaunchConfig config = GetCudaLaunchConfig(
-        num_out_backprop, d,
-        DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall<
-            T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices,
-            kAccumPixels>,
-        shared_memory_size, block_dim.x * block_dim.y * block_dim.z);
-    DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall<
-        T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices, kAccumPixels>
-        <<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
-            args, out_backprop, input, filter_backprop);
+    block_dim = dim3(args.in_cols, block_rows, kBlockSlices);
+    kernel = DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall<
+        T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices, kAccumPixels>;
   } else {
     assert(false && "Incorrect data format");
+    return false;
   }
+  const int num_out_backprop =
+      args.batch * args.out_rows * args.out_cols * args.out_depth;
+  CudaLaunchConfig config =
+      GetCudaLaunchConfig(num_out_backprop, d, kernel, shared_memory_size,
+                          block_dim.x * block_dim.y * block_dim.z);
+  kernel<<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
+      args, out_backprop, input, filter_backprop);
   return true;
 }
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kBlockSlices>
 bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
-    const GpuDevice& d, const DepthwiseArgs args, const T* out_backprop,
-    const T* input, T* filter_backprop, TensorFormat data_format) {
+    const GpuDevice& d, const DepthwiseArgs args, const int block_rows,
+    const T* out_backprop, const T* input, T* filter_backprop,
+    TensorFormat data_format) {
   // Minimize (power of two) kAccumPixels, while satisfying
-  // kAccumPixels * 64 >= in_pixels * kBlockSlices.
-  const int block_pixels = args.in_rows * args.in_cols * kBlockSlices;
-  if (block_pixels > 1024) {
+  // kAccumPixels * 32 >= block_rows * in_cols * kBlockSlices.
+  const int block_pixels = block_rows * args.in_cols * kBlockSlices;
+  if (block_pixels > 512) {
     return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
         T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices, 32>(
-        d, args, out_backprop, input, filter_backprop, data_format);
-  } else if (block_pixels > 512) {
+        d, args, block_rows, out_backprop, input, filter_backprop, data_format);
+  } else if (block_pixels > 256) {
     return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
         T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices, 16>(
-        d, args, out_backprop, input, filter_backprop, data_format);
+        d, args, block_rows, out_backprop, input, filter_backprop, data_format);
   } else {
     return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
         T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices, 8>(
-        d, args, out_backprop, input, filter_backprop, data_format);
+        d, args, block_rows, out_backprop, input, filter_backprop, data_format);
   }
 }
 
@@ -1865,19 +1489,43 @@ bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
     const T* input, T* filter_backprop, TensorFormat data_format) {
   // Maximize (power of two) kBlockSlices while keeping a block within 1024
   // threads (2 pixels per thread).
-  const int in_pixels = args.in_rows * args.in_cols;
-  if (in_pixels > 512) {
-    return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
-        T, kKnownFilterWidth, kKnownFilterHeight, 2>(
-        d, args, out_backprop, input, filter_backprop, data_format);
-  } else if (in_pixels > 256) {
-    return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
-        T, kKnownFilterWidth, kKnownFilterHeight, 4>(
-        d, args, out_backprop, input, filter_backprop, data_format);
-  } else {
-    return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
-        T, kKnownFilterWidth, kKnownFilterHeight, 8>(
-        d, args, out_backprop, input, filter_backprop, data_format);
+  int block_slices = 8;
+  int block_rows = (args.in_rows + 1) / 2;
+  int round_mask = 1;
+  for (; block_slices > 1; block_slices /= 2) {
+    // args.in_cols * block_rows * kBlockSlices must be multiple of 32.
+    for (; block_rows * args.in_cols * block_slices & 31;
+         round_mask = round_mask * 2 + 1) {
+      block_rows = block_rows + round_mask & ~round_mask;
+    }
+    int block_size = block_rows * args.in_cols * block_slices;
+    if (block_size <= 1024) {
+      break;
+    }
+  }
+
+  if (!CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, block_rows)) {
+    return false;
+  }
+
+  switch (block_slices) {
+    case 8:
+      return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
+          T, kKnownFilterWidth, kKnownFilterHeight, 8>(
+          d, args, block_rows, out_backprop, input, filter_backprop,
+          data_format);
+    case 4:
+      return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
+          T, kKnownFilterWidth, kKnownFilterHeight, 4>(
+          d, args, block_rows, out_backprop, input, filter_backprop,
+          data_format);
+    case 2:
+      return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
+          T, kKnownFilterWidth, kKnownFilterHeight, 2>(
+          d, args, block_rows, out_backprop, input, filter_backprop,
+          data_format);
+    default:
+      return false;
   }
 }
 
@@ -1888,31 +1536,23 @@ void LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& d,
                                             const T* out_backprop,
                                             const T* input, T* filter_backprop,
                                             TensorFormat data_format) {
-  const int num_out_backprop =
-      args.batch * args.out_rows * args.out_cols * args.out_depth;
+  void (*kernel)(const DepthwiseArgs, const T*, const T*, T*, int);
   if (data_format == FORMAT_NHWC) {
-    CudaLaunchConfig config = GetCudaLaunchConfig(
-        num_out_backprop, d,
-        DepthwiseConv2dBackpropFilterGPUKernelNHWC<
-            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
-        0, 0);
-    DepthwiseConv2dBackpropFilterGPUKernelNHWC<
-        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            args, out_backprop, input, filter_backprop, num_out_backprop);
+    kernel = DepthwiseConv2dBackpropFilterGPUKernelNHWC<
+        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>;
   } else if (data_format == FORMAT_NCHW) {
-    CudaLaunchConfig config = GetCudaLaunchConfig(
-        num_out_backprop, d,
-        DepthwiseConv2dBackpropFilterGPUKernelNCHW<
-            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
-        0, 0);
-    DepthwiseConv2dBackpropFilterGPUKernelNCHW<
-        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            args, out_backprop, input, filter_backprop, num_out_backprop);
+    kernel = DepthwiseConv2dBackpropFilterGPUKernelNCHW<
+        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>;
   } else {
     assert(false && "Incorrect data format");
+    return;
   }
+  const int num_out_backprop =
+      args.batch * args.out_rows * args.out_cols * args.out_depth;
+  CudaLaunchConfig config =
+      GetCudaLaunchConfig(num_out_backprop, d, kernel, 0, 0);
+  kernel<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+      args, out_backprop, input, filter_backprop, num_out_backprop);
 }
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
diff --git a/tensorflow/core/kernels/determinant_op.cc b/tensorflow/core/kernels/determinant_op.cc
index d51563580b0444952a0a211b8df02d8dffd468bd..d7e55a8ba246e8ebccde7a7706a5f0ae29fb36d8 100644
--- a/tensorflow/core/kernels/determinant_op.cc
+++ b/tensorflow/core/kernels/determinant_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "third_party/eigen3/Eigen/LU"
 #include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/linalg_ops_common.h"
@@ -62,7 +63,14 @@ class DeterminantOp : public LinearAlgebraOp<Scalar> {
 
 REGISTER_LINALG_OP("MatrixDeterminant", (DeterminantOp<float>), float);
 REGISTER_LINALG_OP("MatrixDeterminant", (DeterminantOp<double>), double);
+REGISTER_LINALG_OP("MatrixDeterminant", (DeterminantOp<complex64>), complex64);
+REGISTER_LINALG_OP("MatrixDeterminant", (DeterminantOp<complex128>),
+                   complex128);
 REGISTER_LINALG_OP("BatchMatrixDeterminant", (DeterminantOp<float>), float);
 REGISTER_LINALG_OP("BatchMatrixDeterminant", (DeterminantOp<double>), double);
+REGISTER_LINALG_OP("BatchMatrixDeterminant", (DeterminantOp<complex64>),
+                   complex64);
+REGISTER_LINALG_OP("BatchMatrixDeterminant", (DeterminantOp<complex128>),
+                   complex128);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/encode_jpeg_op.cc b/tensorflow/core/kernels/encode_jpeg_op.cc
index 8e021b92563d891b83ad7e6d9ddf574e65f10878..4fcae25aa6eac8b31f78e1d5ae964aed427fc0f4 100644
--- a/tensorflow/core/kernels/encode_jpeg_op.cc
+++ b/tensorflow/core/kernels/encode_jpeg_op.cc
@@ -55,8 +55,6 @@ class EncodeJpegOp : public OpKernel {
         context, context->GetAttr("optimize_size", &flags_.optimize_jpeg_size));
     OP_REQUIRES_OK(context, context->GetAttr("chroma_downsampling",
                                              &flags_.chroma_downsampling));
-    OP_REQUIRES_OK(context, context->GetAttr("chroma_downsampling",
-                                             &flags_.chroma_downsampling));
 
     string density_unit;
     OP_REQUIRES_OK(context, context->GetAttr("density_unit", &density_unit));
diff --git a/tensorflow/core/kernels/example_parsing_ops.cc b/tensorflow/core/kernels/example_parsing_ops.cc
index e0cc08f101cf23461a64cb0f03103b327abe4f91..2db844e410cea679291aec67748ed15297a0e36a 100644
--- a/tensorflow/core/kernels/example_parsing_ops.cc
+++ b/tensorflow/core/kernels/example_parsing_ops.cc
@@ -216,7 +216,7 @@ class SingleSequenceExampleParserOp : public OpKernel {
                   TensorShapeUtils::IsScalar(context_dense_keys[di].shape()),
                   errors::InvalidArgument(
                       "Expected context_dense_keys[", di,
-                      "] to be a vector, got shape: ",
+                      "] to be a scalar, got shape: ",
                       context_dense_keys[di].shape().DebugString()));
       context_dense_keys_t[di] = context_dense_keys[di].scalar<string>()();
     }
@@ -225,7 +225,7 @@ class SingleSequenceExampleParserOp : public OpKernel {
                   TensorShapeUtils::IsScalar(context_sparse_keys[di].shape()),
                   errors::InvalidArgument(
                       "Expected context_sparse_keys[", di,
-                      "] to be a vector, got shape: ",
+                      "] to be a scalar, got shape: ",
                       context_sparse_keys[di].shape().DebugString()));
       context_sparse_keys_t[di] = context_sparse_keys[di].scalar<string>()();
     }
@@ -234,7 +234,7 @@ class SingleSequenceExampleParserOp : public OpKernel {
           ctx, TensorShapeUtils::IsScalar(feature_list_dense_keys[di].shape()),
           errors::InvalidArgument(
               "Expected feature_list_dense_keys[", di,
-              "] to be a vector, got shape: ",
+              "] to be a scalar, got shape: ",
               feature_list_dense_keys[di].shape().DebugString()));
       feature_list_dense_keys_t[di] =
           feature_list_dense_keys[di].scalar<string>()();
@@ -244,7 +244,7 @@ class SingleSequenceExampleParserOp : public OpKernel {
           ctx, TensorShapeUtils::IsScalar(feature_list_sparse_keys[di].shape()),
           errors::InvalidArgument(
               "Expected feature_list_sparse_keys[", di,
-              "] to be a vector, got shape: ",
+              "] to be a scalar, got shape: ",
               feature_list_sparse_keys[di].shape().DebugString()));
       feature_list_sparse_keys_t[di] =
           feature_list_sparse_keys[di].scalar<string>()();
diff --git a/tensorflow/core/kernels/fft_ops.cc b/tensorflow/core/kernels/fft_ops.cc
index 32936d65c8e3eb922228cf03efc1875b0184030b..ab5af8caada587805f3cf936c0bbef95ead98076 100644
--- a/tensorflow/core/kernels/fft_ops.cc
+++ b/tensorflow/core/kernels/fft_ops.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 // See docs in ../ops/spectral_ops.cc.
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -24,8 +25,8 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/env_var.h"
 #include "tensorflow/core/util/work_sharder.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
@@ -128,7 +129,7 @@ class FFTCPU : public FFTBase {
     auto device = ctx->eigen_device<CPUDevice>();
 
     if (!IsReal()) {
-      auto input = (Tensor(in)).flat_inner_dims<complex64, FFTRank + 1>();
+      auto input = Tensor(in).flat_inner_dims<complex64, FFTRank + 1>();
       // Compute the FFT using eigen.
       auto output = out->flat_inner_dims<complex64, FFTRank + 1>();
       constexpr auto direction =
@@ -137,7 +138,7 @@ class FFTCPU : public FFTBase {
           input.template fft<Eigen::BothParts, direction>(axes);
     } else {
       if (IsForward()) {
-        auto input = (Tensor(in)).flat_inner_dims<float, FFTRank + 1>();
+        auto input = Tensor(in).flat_inner_dims<float, FFTRank + 1>();
         const auto input_dims = input.dimensions();
 
         // Slice input to fft_shape on its inner-most dimensions.
@@ -166,7 +167,7 @@ class FFTCPU : public FFTBase {
             full_fft.slice(zero_start_indices, output.dimensions());
       } else {
         // Reconstruct the full FFT and take the inverse.
-        auto input = ((Tensor)in).flat_inner_dims<complex64, FFTRank + 1>();
+        auto input = Tensor(in).flat_inner_dims<complex64, FFTRank + 1>();
         auto output = out->flat_inner_dims<float, FFTRank + 1>();
         const auto input_dims = input.dimensions();
 
@@ -276,22 +277,93 @@ REGISTER_KERNEL_BUILDER(Name("IRFFT3D").Device(DEVICE_CPU).Label(FFT_LABEL),
 #undef FFT_LABEL
 
 #if GOOGLE_CUDA
+namespace gpu = ::perftools::gputools;
 
 namespace {
-// TODO(vrv/zhifengc): Refactor AsDeviceMemory() into GPUUtil.
 template <typename T>
-perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
+gpu::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
+  gpu::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
+  gpu::DeviceMemory<T> typed(wrapped);
+  return typed;
+}
+
+template <typename T>
+gpu::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
+  gpu::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
+  gpu::DeviceMemory<T> typed(wrapped);
   return typed;
 }
+
+// A class to provide scratch-space allocator for Stream-Executor Cufft
+// callback. Tensorflow is responsible for releasing the temporary buffers after
+// the kernel finishes.
+// TODO(yangzihao): Refactor redundant code in subclasses of ScratchAllocator
+// into base class.
+class CufftScratchAllocator : public gpu::ScratchAllocator {
+ public:
+  ~CufftScratchAllocator() override {}
+  CufftScratchAllocator(int64 memory_limit, OpKernelContext* context)
+      : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
+  int64 GetMemoryLimitInBytes(gpu::Stream* stream) override {
+    return memory_limit_;
+  }
+  gpu::port::StatusOr<gpu::DeviceMemory<uint8>> AllocateBytes(
+      gpu::Stream* stream, int64 byte_size) override {
+    Tensor temporary_memory;
+    if (byte_size > memory_limit_) {
+      return gpu::port::StatusOr<gpu::DeviceMemory<uint8>>();
+    }
+    AllocationAttributes allocation_attr;
+    allocation_attr.no_retry_on_failure = true;
+    Status allocation_status(context_->allocate_temp(
+        DT_UINT8, TensorShape({byte_size}), &temporary_memory,
+        AllocatorAttributes(), allocation_attr));
+    if (!allocation_status.ok()) {
+      return gpu::port::StatusOr<gpu::DeviceMemory<uint8>>();
+    }
+    // Hold the reference of the allocated tensors until the end of the
+    // allocator.
+    allocated_tensors_.push_back(temporary_memory);
+    total_byte_size_ += byte_size;
+    return gpu::port::StatusOr<gpu::DeviceMemory<uint8>>(
+        AsDeviceMemory(temporary_memory.flat<uint8>().data(),
+                       temporary_memory.flat<uint8>().size()));
+  }
+  int64 TotalByteSize() { return total_byte_size_; }
+
+ private:
+  int64 memory_limit_;
+  int64 total_byte_size_;
+  OpKernelContext* context_;
+  std::vector<Tensor> allocated_tensors_;
+};
+
 }  // end namespace
 
+int64 GetCufftWorkspaceLimit(const string& envvar_in_mb,
+                             int64 default_value_in_bytes) {
+  const char* workspace_limit_in_mb_str = getenv(envvar_in_mb.c_str());
+  if (workspace_limit_in_mb_str != nullptr &&
+      strcmp(workspace_limit_in_mb_str, "") != 0) {
+    int64 scratch_limit_in_mb = -1;
+    Status status = ReadInt64FromEnvVar(envvar_in_mb, default_value_in_bytes,
+                                        &scratch_limit_in_mb);
+    if (!status.ok()) {
+      LOG(WARNING) << "Invalid value for env-var " << envvar_in_mb << ": "
+                   << workspace_limit_in_mb_str;
+    } else {
+      return scratch_limit_in_mb * (1 << 20);
+    }
+  }
+  return default_value_in_bytes;
+}
+
 class FFTGPUBase : public FFTBase {
  public:
   using FFTBase::FFTBase;
 
  protected:
+  static int64 CufftScratchSize;
   void DoFFT(OpKernelContext* ctx, const Tensor& in, uint64* fft_shape,
              Tensor* out) override {
     auto* stream = ctx->op_device_context()->stream();
@@ -306,10 +378,10 @@ class FFTGPUBase : public FFTBase {
       batch_size *= input_shape.dim_size(i);
     }
     uint64 input_embed[3];
-    uint64 input_stride = 1;
+    const uint64 input_stride = 1;
     uint64 input_distance = 1;
     uint64 output_embed[3];
-    uint64 output_stride = 1;
+    const uint64 output_stride = 1;
     uint64 output_distance = 1;
 
     for (int i = 0; i < fft_rank; ++i) {
@@ -322,15 +394,16 @@ class FFTGPUBase : public FFTBase {
 
     constexpr bool kInPlaceFft = false;
     const auto kFftType =
-        IsReal() ? (IsForward() ? perftools::gputools::fft::Type::kR2C
-                                : perftools::gputools::fft::Type::kC2R)
-                 : (IsForward() ? perftools::gputools::fft::Type::kC2CForward
-                                : perftools::gputools::fft::Type::kC2CInverse);
+        IsReal() ? (IsForward() ? gpu::fft::Type::kR2C : gpu::fft::Type::kC2R)
+                 : (IsForward() ? gpu::fft::Type::kC2CForward
+                                : gpu::fft::Type::kC2CInverse);
 
-    auto plan = stream->parent()->AsFft()->CreateBatchedPlan(
-        stream, fft_rank, fft_shape, input_embed, input_stride, input_distance,
-        output_embed, output_stride, output_distance, kFftType, kInPlaceFft,
-        batch_size);
+    CufftScratchAllocator scratch_allocator(CufftScratchSize, ctx);
+    auto plan =
+        stream->parent()->AsFft()->CreateBatchedPlanWithScratchAllocator(
+            stream, fft_rank, fft_shape, input_embed, input_stride,
+            input_distance, output_embed, output_stride, output_distance,
+            kFftType, kInPlaceFft, batch_size, &scratch_allocator);
 
     if (IsReal()) {
       if (IsForward()) {
@@ -375,6 +448,11 @@ class FFTGPUBase : public FFTBase {
   }
 };
 
+int64 FFTGPUBase::CufftScratchSize = GetCufftWorkspaceLimit(
+    // default value is in bytes despite the name of the environment variable
+    "TF_CUFFT_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
+);
+
 template <bool Forward, bool _Real, int FFTRank>
 class FFTGPU : public FFTGPUBase {
  public:
diff --git a/tensorflow/core/kernels/fifo_queue.cc b/tensorflow/core/kernels/fifo_queue.cc
index 030cf8a49dbde532a655bf8d4c86786dbfbbc3bf..ea86b04762d52bd1debe80c2d404cff7bd276406 100644
--- a/tensorflow/core/kernels/fifo_queue.cc
+++ b/tensorflow/core/kernels/fifo_queue.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <deque>
 #include <vector>
 
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/core/kernels/filter_dataset_op.cc b/tensorflow/core/kernels/filter_dataset_op.cc
index 3503c45f9af390c78722cc681009cbfabb045986..c0e909d73ac527e01f750864cc0d33f5951a4138 100644
--- a/tensorflow/core/kernels/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/filter_dataset_op.cc
@@ -124,6 +124,10 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
                 "Filter predicate `f` must return a scalar bool.");
           }
           matched = result[0].scalar<bool>()();
+          if (!matched) {
+            // Clear the output tensor list since it didn't match.
+            out_tensors->clear();
+          }
         } while (!matched);
         *end_of_sequence = false;
         return Status::OK();
diff --git a/tensorflow/core/kernels/fixed_length_record_reader_op.cc b/tensorflow/core/kernels/fixed_length_record_reader_op.cc
index ce7fb9c332bc263d2c560d5bfb346f5585f40b3c..298a4e36780d539067fc44988fb72e9d707b6b69 100644
--- a/tensorflow/core/kernels/fixed_length_record_reader_op.cc
+++ b/tensorflow/core/kernels/fixed_length_record_reader_op.cc
@@ -19,83 +19,118 @@ limitations under the License.
 #include "tensorflow/core/framework/reader_base.h"
 #include "tensorflow/core/framework/reader_op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/io/inputbuffer.h"
+#include "tensorflow/core/lib/io/buffered_inputstream.h"
+#include "tensorflow/core/lib/io/random_inputstream.h"
+#include "tensorflow/core/lib/io/zlib_compression_options.h"
+#include "tensorflow/core/lib/io/zlib_inputstream.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 
 namespace tensorflow {
 
+// In the constructor hop_bytes_ is set to record_bytes_ if it was 0,
+// so that we will always "hop" after each read (except first).
 class FixedLengthRecordReader : public ReaderBase {
  public:
   FixedLengthRecordReader(const string& node_name, int64 header_bytes,
                           int64 record_bytes, int64 footer_bytes,
-                          int64 hop_bytes, Env* env)
+                          int64 hop_bytes, const string& encoding, Env* env)
       : ReaderBase(
             strings::StrCat("FixedLengthRecordReader '", node_name, "'")),
         header_bytes_(header_bytes),
         record_bytes_(record_bytes),
         footer_bytes_(footer_bytes),
-        hop_bytes_(hop_bytes),
+        hop_bytes_(hop_bytes == 0 ? record_bytes : hop_bytes),
         env_(env),
-        file_pos_limit_(-1),
-        record_number_(0) {}
+        record_number_(0),
+        encoding_(encoding) {}
 
   // On success:
-  // * input_buffer_ != nullptr,
-  // * input_buffer_->Tell() == header_bytes_
-  // * file_pos_limit_ == file size - footer_bytes_
+  // * buffered_inputstream_ != nullptr,
+  // * buffered_inputstream_->Tell() == header_bytes_
   Status OnWorkStartedLocked() override {
     record_number_ = 0;
-    uint64 file_size = 0;
-    TF_RETURN_IF_ERROR(env_->GetFileSize(current_work(), &file_size));
-    file_pos_limit_ = file_size - footer_bytes_;
+
+    lookahead_cache_.clear();
 
     TF_RETURN_IF_ERROR(env_->NewRandomAccessFile(current_work(), &file_));
+    if (encoding_ == "ZLIB" || encoding_ == "GZIP") {
+      const io::ZlibCompressionOptions zlib_options =
+          encoding_ == "ZLIB" ? io::ZlibCompressionOptions::DEFAULT()
+                              : io::ZlibCompressionOptions::GZIP();
+      file_stream_.reset(new io::RandomAccessInputStream(file_.get()));
+      buffered_inputstream_.reset(
+          new io::ZlibInputStream(file_stream_.get(), (size_t)kBufferSize,
+                                  (size_t)kBufferSize, zlib_options));
+    } else {
+      buffered_inputstream_.reset(
+          new io::BufferedInputStream(file_.get(), kBufferSize));
+    }
+    // header_bytes_ is always skipped.
+    TF_RETURN_IF_ERROR(buffered_inputstream_->SkipNBytes(header_bytes_));
 
-    input_buffer_.reset(new io::InputBuffer(file_.get(), kBufferSize));
-    TF_RETURN_IF_ERROR(input_buffer_->SkipNBytes(header_bytes_));
     return Status::OK();
   }
 
   Status OnWorkFinishedLocked() override {
-    input_buffer_.reset(nullptr);
+    buffered_inputstream_.reset(nullptr);
     return Status::OK();
   }
 
   Status ReadLocked(string* key, string* value, bool* produced,
                     bool* at_end) override {
-    // The condition `input_buffer_->Tell() + record_bytes_ > file_pos_limit_`
-    // is to confirm that none of record bytes is out of the range of
-    // file_pos_limit_.
-    // This is necessary for the condition `hop_bytes > 0`. For example.
-    // File: "0123456"
-    // Reader setting: `record_bytes=3`, `hop_bytes=2`, `footer_bytes=0`,
-    //     `header_bytes=0`
-    // Without this checking condition, the forth time the reader will at
-    // this position: "012345|6" and the reading operation will result in
-    // an error.
-    if (input_buffer_->Tell() >= file_pos_limit_ ||
-        input_buffer_->Tell() + record_bytes_ > file_pos_limit_) {
+    // We will always "hop" the hop_bytes_ except the first record
+    // where record_number_ == 0
+    if (record_number_ != 0) {
+      if (hop_bytes_ <= lookahead_cache_.size()) {
+        // If hop_bytes_ is smaller than the cached data we skip the
+        // hop_bytes_ from the cache.
+        lookahead_cache_ = lookahead_cache_.substr(hop_bytes_);
+      } else {
+        // If hop_bytes_ is larger than the cached data, we clean up
+        // the cache, then skip hop_bytes_ - cache_size from the file
+        // as the cache_size has been skipped through cache.
+        int64 cache_size = lookahead_cache_.size();
+        lookahead_cache_.clear();
+        Status s = buffered_inputstream_->SkipNBytes(hop_bytes_ - cache_size);
+        if (!s.ok()) {
+          if (!errors::IsOutOfRange(s)) {
+            return s;
+          }
+          *at_end = true;
+          return Status::OK();
+        }
+      }
+    }
+
+    // Fill up lookahead_cache_ to record_bytes_ + footer_bytes_
+    int bytes_to_read = record_bytes_ + footer_bytes_ - lookahead_cache_.size();
+    Status s = buffered_inputstream_->ReadNBytes(bytes_to_read, value);
+    if (!s.ok()) {
+      value->clear();
+      if (!errors::IsOutOfRange(s)) {
+        return s;
+      }
       *at_end = true;
       return Status::OK();
     }
-    const int64 pos_before_read = input_buffer_->Tell();
-    TF_RETURN_IF_ERROR(input_buffer_->ReadNBytes(record_bytes_, value));
+    lookahead_cache_.append(*value, 0, bytes_to_read);
+    value->clear();
+
+    // Copy first record_bytes_ from cache to value
+    *value = lookahead_cache_.substr(0, record_bytes_);
+
     *key = strings::StrCat(current_work(), ":", record_number_);
     *produced = true;
     ++record_number_;
 
-    if (hop_bytes_ > 0) {
-      input_buffer_->Seek(pos_before_read + hop_bytes_).IgnoreError();
-    }
-
     return Status::OK();
   }
 
   Status ResetLocked() override {
-    file_pos_limit_ = -1;
     record_number_ = 0;
-    input_buffer_.reset(nullptr);
+    buffered_inputstream_.reset(nullptr);
+    lookahead_cache_.clear();
     return ReaderBase::ResetLocked();
   }
 
@@ -107,11 +142,21 @@ class FixedLengthRecordReader : public ReaderBase {
   const int64 record_bytes_;
   const int64 footer_bytes_;
   const int64 hop_bytes_;
+  // The purpose of lookahead_cache_ is to allows "one-pass" processing
+  // without revisit previous processed data of the stream. This is needed
+  // because certain compression like zlib does not allow random access
+  // or even obtain the uncompressed stream size before hand.
+  // The max size of the lookahead_cache_ could be
+  // record_bytes_ + footer_bytes_
+  string lookahead_cache_;
   Env* const env_;
-  int64 file_pos_limit_;
   int64 record_number_;
-  std::unique_ptr<RandomAccessFile> file_;  // must outlive input_buffer_
-  std::unique_ptr<io::InputBuffer> input_buffer_;
+  string encoding_;
+  // must outlive buffered_inputstream_
+  std::unique_ptr<RandomAccessFile> file_;
+  // must outlive buffered_inputstream_
+  std::unique_ptr<io::RandomAccessInputStream> file_stream_;
+  std::unique_ptr<io::InputStreamInterface> buffered_inputstream_;
 };
 
 class FixedLengthRecordReaderOp : public ReaderOpKernel {
@@ -137,11 +182,14 @@ class FixedLengthRecordReaderOp : public ReaderOpKernel {
         context, hop_bytes >= 0,
         errors::InvalidArgument("hop_bytes must be >= 0 not ", hop_bytes));
     Env* env = context->env();
-    SetReaderFactory(
-        [this, header_bytes, record_bytes, footer_bytes, hop_bytes, env]() {
-          return new FixedLengthRecordReader(name(), header_bytes, record_bytes,
-                                             footer_bytes, hop_bytes, env);
-        });
+    string encoding;
+    TF_CHECK_OK(context->GetAttr("encoding", &encoding));
+    SetReaderFactory([this, header_bytes, record_bytes, footer_bytes, hop_bytes,
+                      encoding, env]() {
+      return new FixedLengthRecordReader(name(), header_bytes, record_bytes,
+                                         footer_bytes, hop_bytes, encoding,
+                                         env);
+    });
   }
 };
 
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index 8c3137ece9fa902c12452f262c9d647afce9d231..71f1b4e063aac287c3cd65ab2217f0954e923218 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -122,6 +122,12 @@ TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name("_Arg")
                                                ArgOp);
 #undef REGISTER
 
+REGISTER_KERNEL_BUILDER(Name("_Arg")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("output")
+                            .TypeConstraint<ResourceHandle>("T"),
+                        ArgOp);
+
 #define REGISTER(type)     \
   REGISTER_KERNEL_BUILDER( \
       Name("_Retval").Device(DEVICE_GPU).TypeConstraint<type>("T"), RetvalOp);
@@ -231,6 +237,8 @@ class SymbolicGradientOp : public AsyncOpKernel {
 
     FunctionLibraryRuntime::Options opts;
     opts.step_id = ctx->step_id();
+    opts.rendezvous = ctx->rendezvous();
+    opts.cancellation_manager = ctx->cancellation_manager();
     opts.runner = ctx->runner();
     std::vector<Tensor> args;
     args.reserve(ctx->num_inputs());
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index 37758e82ebd77f2547ff2b65b098a65f1ba4a117..81551ee26f0e6e319a7ae8eaeafbb28f48cd72b4 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -149,7 +149,7 @@ struct FusedBatchNormGrad<CPUDevice, T> {
     typename TTypes<T>::Vec scale_backprop(scale_backprop_output->vec<T>());
     typename TTypes<T>::Vec offset_backprop(offset_backprop_output->vec<T>());
 
-    // Note: the following formulas are used to to compute the gradients for
+    // Note: the following formulas are used to compute the gradients for
     // back propagation.
     // x_backprop = scale * rsqrt(variance + epsilon) *
     //              [y_backprop - mean(y_backprop) - (x - mean(x)) *
diff --git a/tensorflow/core/kernels/fuzzing/BUILD b/tensorflow/core/kernels/fuzzing/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3e7f9bdb447f067ce979b2444e37be0ed7399643
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/BUILD
@@ -0,0 +1,55 @@
+licenses(["notice"])  # Apache 2.0
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "fuzz_session",
+    hdrs = ["fuzz_session.h"],
+    deps = [
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:direct_session",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:tensorflow",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+load("//tensorflow/core/kernels/fuzzing:tf_ops_fuzz_target_lib.bzl", "tf_ops_fuzz_target_lib")
+
+tf_ops_fuzz_target_lib("identity")
+
+tf_ops_fuzz_target_lib("string_to_number")
+
+tf_ops_fuzz_target_lib("string_split")
+
+tf_ops_fuzz_target_lib("encode_base64")
+
+tf_ops_fuzz_target_lib("decode_base64")
+
+tf_ops_fuzz_target_lib("encode_jpeg")
+
+tf_ops_fuzz_target_lib("decode_png")
+
+tf_ops_fuzz_target_lib("decode_jpeg")
+
+tf_ops_fuzz_target_lib("example_proto_fast_parsing")
+
+tf_ops_fuzz_target_lib("parse_tensor_op")
+
+tf_ops_fuzz_target_lib("decode_json_example")
diff --git a/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6d4a9dfdef4609a45d3a38e49a32492408043617
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc
@@ -0,0 +1,29 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzDecodeBase64 : public FuzzStringInputOp {
+  SINGLE_INPUT_OP_BUILDER(DT_STRING, DecodeBase64);
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzDecodeBase64);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b084a972049cc2b1997df64a2f43a6d79b6b4e6d
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc
@@ -0,0 +1,29 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzDecodeJpeg : public FuzzStringInputOp {
+  SINGLE_INPUT_OP_BUILDER(DT_STRING, DecodeJpeg);
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzDecodeJpeg);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9dd795b94e82c48ad037df67f3218ed62feb722e
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc
@@ -0,0 +1,29 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzDecodeJSONExample : public FuzzStringInputOp {
+  SINGLE_INPUT_OP_BUILDER(DT_STRING, DecodeJSONExample);
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzDecodeJSONExample);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4a68a5b5803f363ab93bf280df54fa8f14206a84
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc
@@ -0,0 +1,29 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzDecodePng : public FuzzStringInputOp {
+  SINGLE_INPUT_OP_BUILDER(DT_STRING, DecodePng);
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzDecodePng);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc b/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2d6c82826cf9dad1ca67d6e5ee1d13a059f9c8ea
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc
@@ -0,0 +1,29 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzEncodeBase64 : public FuzzSession {
+  SINGLE_INPUT_OP_BUILDER(DT_STRING, EncodeBase64);
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzEncodeBase64);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc b/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..81b6e491248fda37f602c0365c1e90d4b08f7c2a
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc
@@ -0,0 +1,63 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzEncodeJpeg : public FuzzSession {
+  SINGLE_INPUT_OP_BUILDER(DT_UINT8, EncodeJpeg);
+
+  void FuzzImpl(const uint8_t* data, size_t size) final {
+    if (size < 6) return;
+
+    // Pick random channels and aspect ratio, and then set the
+    // input based upon the aspect ratio and size.
+    int64 channels = (data[0] % 2) * 2 + 1;  // 1, 3
+    int64 height = data[1] + (data[2] << 8);
+    int64 width = data[2] + (data[3] << 8);
+    if (width == 0) return;
+
+    // TODO(dga): kcc@ notes: better to use actual supplied h, w and then
+    // trim them if needed to ensure w*h <= size-4.
+    double hw_ratio = height / width;
+    int64 remaining_bytes = size - 5;
+    int64 pixels = remaining_bytes / channels;
+    height = static_cast<int64>(floor(sqrt(hw_ratio * pixels)));
+    if (height == 0) return;
+    width = static_cast<int64>(floor(pixels / height));
+    if (width == 0) return;
+    size_t actual_pixels = height * width * channels;
+    if (actual_pixels == 0) return;
+
+    // TODO(dga):  Generalize this by borrowing the AsTensor logic
+    // from tf testing, once we have a few more fuzzers written.
+    Tensor input_tensor(tensorflow::DT_UINT8,
+                        TensorShape({height, width, channels}));
+    auto flat_tensor = input_tensor.flat<uint8>();
+    for (size_t i = 0; i < actual_pixels; i++) {
+      flat_tensor(i) = data[i];
+    }
+    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
+    RunOneInput(input_tensor).IgnoreError();
+  }
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzEncodeJpeg);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d91a351c5969e71385348b76376202c14e86daac
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
@@ -0,0 +1,64 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+// Fuzz inputs to the example proto decoder.
+// TODO(dga):  Make this more comprehensive.
+// Right now, it's just a quick PoC to show how to attach the
+// plumbing, but it needs some real protos to chew on as a
+// corpus, and the sparse/dense parts should be made more rich
+// to achieve higher code coverage.
+
+class FuzzExampleProtoFastParsing : public FuzzSession {
+  void BuildGraph(const Scope& scope) final {
+    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+    // The serialized proto.
+    auto input = Placeholder(scope.WithOpName("input1"), DT_STRING);
+
+    auto in_expanded = ExpandDims(scope, input, Const<int>(scope, 0));
+
+    auto names = Const(scope, {"noname"});
+    std::vector<Output> dense_keys = {Const(scope, {"a"})};
+    std::vector<Output> sparse_keys;  // Empty.
+    std::vector<Output> dense_defaults = {Const(scope, {1.0f})};
+
+    DataTypeSlice sparse_types = {};
+    std::vector<PartialTensorShape> dense_shapes;
+    dense_shapes.push_back(PartialTensorShape());
+
+    std::ignore = ParseExample(scope.WithOpName("output"), in_expanded, names,
+                               sparse_keys, dense_keys, dense_defaults,
+                               sparse_types, dense_shapes);
+  }
+
+  void FuzzImpl(const uint8_t* data, size_t size) final {
+    // TODO(dga):  Test the batch case also.
+    Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
+    input_tensor.scalar<string>()() =
+        string(reinterpret_cast<const char*>(data), size);
+    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
+    RunOneInput(input_tensor).IgnoreError();
+  }
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzExampleProtoFastParsing);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/fuzz_session.h b/tensorflow/core/kernels/fuzzing/fuzz_session.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb518798b2b9406b1e4776efe0234f74e5c2735c
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/fuzz_session.h
@@ -0,0 +1,156 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_
+#define LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_
+
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/public/session.h"
+
+// Standard invoking function macro to dispatch to a fuzzer class.
+#ifndef PLATFORM_WINDOWS
+#define STANDARD_TF_FUZZ_FUNCTION(FuzzerClass)                              \
+  extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { \
+    static FuzzerClass* fuzzer = new FuzzerClass();                         \
+    return fuzzer->Fuzz(data, size);                                        \
+  }
+#else
+// We don't compile this for Windows, MSVC doesn't like it as pywrap in Windows
+// links all the code into one big object file and there are conflicting
+// function names.
+#define STANDARD_TF_FUZZ_FUNCTION(FuzzerClass)
+#endif
+
+// Standard builder for hooking one placeholder to one op.
+#define SINGLE_INPUT_OP_BUILDER(dtype, opName)                           \
+  void BuildGraph(const Scope& scope) override {                         \
+    auto op_node =                                                       \
+        tensorflow::ops::Placeholder(scope.WithOpName("input1"), dtype); \
+    std::ignore =                                                        \
+        tensorflow::ops::opName(scope.WithOpName("output"), op_node);    \
+  }
+
+namespace tensorflow {
+namespace fuzzing {
+
+// Create a TensorFlow session using a specific GraphDef created
+// by BuildGraph(), and make it available for fuzzing.
+// Users must override BuildGraph and FuzzImpl to specify
+// (1) which operations are being fuzzed; and
+// (2) How to translate the uint8_t* buffer from the fuzzer
+//     to a Tensor or Tensors that are semantically appropriate
+//     for the op under test.
+// For the simple cases of testing a single op that takes a single
+// input Tensor, use the SINGLE_INPUT_OP_BUILDER(dtype, opName) macro in place
+// of defining BuildGraphDef.
+//
+// Typical use:
+// class FooFuzzer : public FuzzSession {
+//   SINGLE_INPUT_OP_BUILDER(DT_INT8, Identity);
+//   void FuzzImpl(const uint8_t* data, size_t size) {
+//      ... convert data and size to a Tensor, pass it to:
+//      RunOneInput(input_tensor);
+//
+class FuzzSession {
+ public:
+  FuzzSession() : initialized_(false) {}
+  virtual ~FuzzSession() {}
+
+  // Constructs a Graph using the supplied Scope.
+  // By convention, the graph should have inputs named "input1", ...
+  // "inputN", and one output node, named "output".
+  // Users of FuzzSession should override this method to create their graph.
+  virtual void BuildGraph(const Scope& scope) {}
+
+  // Implements the logic that converts an opaque byte buffer
+  // from the fuzzer to Tensor inputs to the graph.  Users must override.
+  virtual void FuzzImpl(const uint8_t* data, size_t size) {}
+
+  // Initializes the FuzzSession.  Not safe for multithreading.
+  // Separate init function because the call to virtual BuildGraphDef
+  // can't be put into the constructor.
+  Status InitIfNeeded() {
+    if (initialized_) {
+      return Status::OK();
+    }
+    initialized_ = true;
+
+    Scope root = Scope::NewRootScope().ExitOnError();
+    SessionOptions options;
+    session_ = std::unique_ptr<Session>(NewSession(options));
+
+    BuildGraph(root);
+
+    GraphDef graph_def;
+    TF_CHECK_OK(root.ToGraphDef(&graph_def));
+
+    Status status = session_->Create(graph_def);
+    if (!status.ok()) {
+      // This is FATAL, because this code is designed to fuzz an op
+      // within a session.  Failure to create the session means we
+      // can't send any data to the op.
+      LOG(FATAL) << "Could not create session: " << status.error_message();
+    }
+    return status;
+  }
+
+  // Runs the TF session by pulling on the "output" node, attaching
+  // the supplied input_tensor to the "input1" node, and discarding
+  // any returned output.
+  Status RunOneInput(const Tensor& input_tensor) {
+    return session_->Run({{"input1", input_tensor}}, {}, {"output"}, nullptr);
+  }
+
+  Status RunTwoInputs(const Tensor& input1, const Tensor& input2) {
+    return session_->Run({{"input1", input1}, {"input2", input2}}, {},
+                         {"output"}, nullptr);
+  }
+
+  // Dispatches to FuzzImpl;  small amount of sugar to keep the code
+  // of the per-op fuzzers tiny.
+  int Fuzz(const uint8_t* data, size_t size) {
+    Status status = InitIfNeeded();
+    TF_CHECK_OK(status) << "Fuzzer graph initialization failed: "
+                        << status.error_message();
+    // No return value from fuzzing:  Success is defined as "did not
+    // crash".  The actual application results are irrelevant.
+    FuzzImpl(data, size);
+    return 0;
+  }
+
+ private:
+  bool initialized_;
+  std::unique_ptr<Session> session_;
+};
+
+// A specialized fuzz implementation for ops that take
+// a single string.  Caller must still define the op
+// to plumb by overriding BuildGraph or using
+// a plumbing macro.
+class FuzzStringInputOp : public FuzzSession {
+  void FuzzImpl(const uint8_t* data, size_t size) final {
+    Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
+    input_tensor.scalar<string>()() =
+        string(reinterpret_cast<const char*>(data), size);
+    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
+    RunOneInput(input_tensor).IgnoreError();
+  }
+};
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
+
+#endif  // LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_
diff --git a/tensorflow/core/kernels/fuzzing/identity_fuzz.cc b/tensorflow/core/kernels/fuzzing/identity_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ac3a12aa399a3efe532c71c49a092b6cecd6059b
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/identity_fuzz.cc
@@ -0,0 +1,45 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzIdentity : public FuzzSession {
+  SINGLE_INPUT_OP_BUILDER(DT_INT8, Identity);
+
+  void FuzzImpl(const uint8_t* data, size_t size) final {
+    Tensor input_tensor(tensorflow::DT_INT8,
+                        TensorShape({static_cast<int64>(size)}));
+    auto flat_tensor = input_tensor.flat<int8>();
+    for (size_t i = 0; i < size; i++) {
+      flat_tensor(i) = data[i];
+    }
+
+    Status s = RunOneInput(input_tensor);
+    // Note:  For many ops, we don't care about this success -- but when
+    // testing to make sure the harness actually works, it's useful.
+    if (!s.ok()) {
+      LOG(ERROR) << "Execution failed: " << s.error_message();
+    }
+  }
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzIdentity);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..978fcd102822a6a2690478eaca473eabc6ae83ab
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
@@ -0,0 +1,45 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+// Fuzz inputs to the serialized Tensor decoder.
+
+class FuzzParseTensor : public FuzzSession {
+  void BuildGraph(const Scope& scope) final {
+    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+    // The serialized proto.
+    auto input = Placeholder(scope.WithOpName("input1"), DT_STRING);
+
+    std::ignore = ParseTensor(scope.WithOpName("output"), input, DT_FLOAT);
+  }
+
+  void FuzzImpl(const uint8_t* data, size_t size) final {
+    Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
+    input_tensor.scalar<string>()() =
+        string(reinterpret_cast<const char*>(data), size);
+    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
+    RunOneInput(input_tensor).IgnoreError();
+  }
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzParseTensor);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7d1aa1fbf3a149d25e82b454543a5add522145af
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
@@ -0,0 +1,60 @@
+/* Copyright 2017 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzStringSplit : public FuzzSession {
+  void BuildGraph(const Scope& scope) override {
+    auto input =
+        tensorflow::ops::Placeholder(scope.WithOpName("input1"), DT_STRING);
+    auto delimeter =
+        tensorflow::ops::Placeholder(scope.WithOpName("input2"), DT_STRING);
+    std::ignore = tensorflow::ops::StringSplit(scope.WithOpName("output"),
+                                               input, delimeter);
+  }
+
+  void FuzzImpl(const uint8_t* data, size_t size) final {
+    Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
+    Tensor delimeter_tensor(tensorflow::DT_STRING, TensorShape({}));
+
+    if (size > 0) {
+      // The spec for split is that the delimeter should be 0 or 1 characters.
+      // Naturally, fuzz it with something larger.  (This omits the possibility
+      // of handing it a > int32_max size string, which should be tested for in
+      // an
+      // explicit test).
+      size_t delim_len = static_cast<size_t>(data[0]);
+      if (delim_len > size) {
+        delim_len = size - 1;
+      }
+      delimeter_tensor.scalar<string>()() =
+          string(reinterpret_cast<const char*>(data), delim_len);
+      input_tensor.scalar<string>()() = string(
+          reinterpret_cast<const char*>(data + delim_len), size - delim_len);
+    }
+
+    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
+    RunTwoInputs(input_tensor, delimeter_tensor).IgnoreError();
+  }
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzStringSplit);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..94255d215e5292bf77ab1104eb1d36c0cc1d661c
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc
@@ -0,0 +1,32 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzStringToNumber : public FuzzStringInputOp {
+  SINGLE_INPUT_OP_BUILDER(DT_STRING, StringToNumber);
+};
+
+// TODO(dga):  Generalize this to hit both the float and int
+// variants of StringToNumber - requires an update to the
+// plumbing code to specify the output dtype.
+STANDARD_TF_FUZZ_FUNCTION(FuzzStringToNumber);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/tf_ops_fuzz_target_lib.bzl b/tensorflow/core/kernels/fuzzing/tf_ops_fuzz_target_lib.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..f752b59568a74f56c9b581651e54d1cab2af227f
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/tf_ops_fuzz_target_lib.bzl
@@ -0,0 +1,13 @@
+"""Fuzzing template for TensorFlow ops."""
+
+def tf_ops_fuzz_target_lib(name):
+  native.cc_library(
+      name = name + "_fuzz_lib",
+      srcs = [name + "_fuzz.cc"],
+      deps = [
+          "//tensorflow/core/kernels/fuzzing:fuzz_session",
+          "//tensorflow/cc:cc_ops",
+      ],
+      tags = ["no_windows"],
+      alwayslink = 1,
+  )
diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc
index c1d58733a2aec6dc1483071935866f3e6e4a7dd1..1b8be9b2cea879ca992bf484f247bb830ac6ba1f 100644
--- a/tensorflow/core/kernels/gather_functor.cc
+++ b/tensorflow/core/kernels/gather_functor.cc
@@ -25,12 +25,12 @@ typedef Eigen::GpuDevice GPUDevice;
 namespace functor {
 
 // Forward declarations of the functor specializations for GPU.
-#define DECLARE_GPU_SPECS_INDEX(T, Index)                          \
-  template <>                                                      \
-  int64 GatherFunctor<GPUDevice, T, Index>::operator()(            \
-      const GPUDevice& d, typename TTypes<T>::ConstMatrix Tparams, \
-      typename TTypes<Index>::ConstFlat Tindices,                  \
-      typename TTypes<T>::Matrix Tout);                            \
+#define DECLARE_GPU_SPECS_INDEX(T, Index)                             \
+  template <>                                                         \
+  int64 GatherFunctor<GPUDevice, T, Index>::operator()(               \
+      const GPUDevice& d, typename TTypes<T, 3>::ConstTensor Tparams, \
+      typename TTypes<Index>::ConstFlat Tindices,                     \
+      typename TTypes<T, 3>::Tensor Tout);                            \
   extern template struct GatherFunctor<GPUDevice, T, Index>;
 
 #define DECLARE_GPU_SPECS(T)         \
diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h
index 1ad4d2da8d754e2bfc0f7317d40d2b542f8bb6b2..dfa1a5f1f90b498bf28ace363437fe4ea4e51ce9 100644
--- a/tensorflow/core/kernels/gather_functor.h
+++ b/tensorflow/core/kernels/gather_functor.h
@@ -32,40 +32,51 @@ namespace functor {
 // Helper method to copy using memcpy.
 template <typename T, typename Index, typename SliceIndex,
           SliceIndex static_slice_elems>
-SliceIndex HandleCopies(typename TTypes<T>::ConstMatrix params,
+SliceIndex HandleCopies(typename TTypes<T, 3>::ConstTensor params,
                         typename TTypes<Index>::ConstFlat indices,
                         SliceIndex slice_elems,
-                        typename TTypes<T>::Matrix out) {
-  const SliceIndex first_dim_size =
-      static_cast<SliceIndex>(indices.dimension(0));
-  const Index limit = static_cast<Index>(params.dimension(0));
-  T* out_base = &out(0, 0);
-  const T* params_base = &params(0, 0);
+                        typename TTypes<T, 3>::Tensor out) {
+  const SliceIndex indices_size = static_cast<SliceIndex>(indices.dimension(0));
+  const SliceIndex batch_size = static_cast<SliceIndex>(params.dimension(0));
+  const Index limit = static_cast<Index>(params.dimension(1));
+  T* out_base = &out(0, 0, 0);
+  const T* params_base = &params(0, 0, 0);
   if (static_slice_elems >= 0) {
     // Give compiler static knowledge of the number of elements/bytes
     slice_elems = static_slice_elems;
   }
   // Compute slice_bytes here so that static knowledge is available
   const size_t slice_bytes = slice_elems * sizeof(T);
-  for (SliceIndex i = 0; i < first_dim_size; i++) {
-    const SliceIndex j = i + 1;
-    if (j < first_dim_size) {
-      port::prefetch<port::PREFETCH_HINT_T0>(&params(indices(j), 0));
-      port::prefetch<port::PREFETCH_HINT_T0>(&out(j, 0));
-    }
-    // Grab the index and check its validity.  An earlier version of the
-    // code checked it and then grabbed it from memory a second time, which
-    // was a security risk since it could have changed in between.
-    const Index index = internal::SubtleMustCopy(indices(i));
-    if (!FastBoundsCheck(index, limit)) return i;
-    // Copy using memcpy if possible, otherwise an Eigen loop
-    // TODO(cwhipkey): avoid linking to framework to get Allocator (to improve
-    // ahead-of-time compilation binary size).
-    if (is_simple_type<T>::value) {
-      memcpy(out_base + i * slice_elems, params_base + index * slice_elems,
-             slice_bytes);
-    } else {
-      out.template chip<0>(i) = params.template chip<0>(index);
+  for (SliceIndex b = 0; b < batch_size; b++) {
+    for (SliceIndex i = 0; i < indices_size; i++) {
+      const SliceIndex i_next = i + 1;
+      const SliceIndex b_next = b + 1;
+      if (i_next < indices_size) {
+        port::prefetch<port::PREFETCH_HINT_T0>(&params(b, indices(i_next), 0));
+        port::prefetch<port::PREFETCH_HINT_T0>(&out(b, i_next, 0));
+      } else if (b_next < batch_size) {
+        port::prefetch<port::PREFETCH_HINT_T0>(&params(b_next, indices(0), 0));
+        port::prefetch<port::PREFETCH_HINT_T0>(&out(b_next, 0, 0));
+      }
+      // Grab the index and check its validity.  An earlier version of the
+      // code checked it and then grabbed it from memory a second time, which
+      // was a security risk since it could have changed in between.
+      const Index index = internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Copy using memcpy if possible, otherwise an Eigen loop
+      // TODO(cwhipkey): avoid linking to framework to get Allocator (to improve
+      // ahead-of-time compilation binary size).
+      if (is_simple_type<T>::value) {
+        // Avoid auto-promotion to Index from SliceIndex by casting.
+        memcpy(out_base + (b * indices_size + i) * slice_elems,
+               params_base + (b * static_cast<SliceIndex>(limit) +
+                              static_cast<SliceIndex>(index)) *
+                                 slice_elems,
+               slice_bytes);
+      } else {
+        // For non-"simple" types (e.g. strings).
+        out.template chip<1>(i) = params.template chip<1>(index);
+      }
     }
   }
   return -1;
@@ -73,11 +84,11 @@ SliceIndex HandleCopies(typename TTypes<T>::ConstMatrix params,
 
 template <typename T, typename Index>
 struct GatherFunctorCPU {
-  int64 operator()(typename TTypes<T>::ConstMatrix params,
+  int64 operator()(typename TTypes<T, 3>::ConstTensor params,
                    typename TTypes<Index>::ConstFlat indices,
-                   typename TTypes<T>::Matrix out) {
+                   typename TTypes<T, 3>::Tensor out) {
     const int64 N = indices.size();
-    const int64 slice_size = out.size() / N;
+    const int64 slice_size = out.dimension(2);
     int64 bad_i;
 
     bool use_large = (slice_size > std::numeric_limits<int32>::max() ||
@@ -109,16 +120,17 @@ struct GatherFunctorCPU {
 
 template <typename Device, typename T, typename Index>
 struct GatherFunctor {
-  int64 operator()(const Device& d, typename TTypes<T>::ConstMatrix params,
+  int64 operator()(const Device& d, typename TTypes<T, 3>::ConstTensor params,
                    typename TTypes<Index>::ConstFlat indices,
-                   typename TTypes<T>::Matrix out);
+                   typename TTypes<T, 3>::Tensor out);
 };
 
 template <typename T, typename Index>
 struct GatherFunctor<CPUDevice, T, Index> {
-  int64 operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix params,
+  int64 operator()(const CPUDevice& d,
+                   typename TTypes<T, 3>::ConstTensor params,
                    typename TTypes<Index>::ConstFlat indices,
-                   typename TTypes<T>::Matrix out) {
+                   typename TTypes<T, 3>::Tensor out) {
     return GatherFunctorCPU<T, Index>()(params, indices, out);
   }
 };
diff --git a/tensorflow/core/kernels/gather_functor_gpu.cu.h b/tensorflow/core/kernels/gather_functor_gpu.cu.h
index ff6779b6481e4e9e031e740f6988b1831b291b9f..e2384ef01151e3c2c31b9607542f6f5cc4e9588d 100644
--- a/tensorflow/core/kernels/gather_functor_gpu.cu.h
+++ b/tensorflow/core/kernels/gather_functor_gpu.cu.h
@@ -29,21 +29,41 @@ namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-template <typename T, typename Index>
+template <typename T, typename Index, bool is_axis_zero>
 __global__ void GatherOpKernel(const T* params, const Index* indices, T* out,
-                               int64 first_dim_size, int64 indices_size,
-                               int64 out_size) {
-  const int32 slice_size = out_size / indices_size;
+                               int64 gather_dim_size, int64 indices_size,
+                               int64 slice_size, int64 out_size) {
   CUDA_1D_KERNEL_LOOP(i, out_size) {
-    Index indices_i = i / slice_size;
-    Index indices_slice_i = i - indices_i * slice_size;
-    Index params_first_index = ldg(indices + indices_i);
-    if (!(params_first_index >= 0 && params_first_index < first_dim_size)) {
+    Index batch_i = 0;
+    Index indices_i = 0;
+    Index slice_i = 0;
+    if (is_axis_zero) {
+      indices_i = i / slice_size;
+      slice_i = i - indices_i * slice_size;
+    } else {
+      Index batch_indices_i = i / slice_size;
+      // The batch index into params to use for i.
+      batch_i = batch_indices_i / indices_size;
+      // The index into indices to use for i.
+      indices_i = batch_indices_i - batch_i * indices_size;
+      // Index into the current slice in params to use for i.
+      slice_i = i - batch_indices_i * slice_size;
+    }
+
+    // Index into the gather axis to use for i.
+    Index gather_i = ldg(indices + indices_i);
+
+    // Check gather_i is in [0, gather_dim_size).
+    if (!FastBoundsCheck(gather_i, gather_dim_size)) {
       // Set indices out of range to zero
       // TODO(fpmc): Log an error for transfer back to host.
       out[i] = T(0);
     } else {
-      Index params_i = params_first_index * slice_size + indices_slice_i;
+      // params is a [batch_size, gather_dim_size, slice_size] tensor. Read
+      // params[batch_i, gather_i, slice_i] and write it to the i'th position in
+      // out.
+      Index params_i =
+          (batch_i * gather_dim_size + gather_i) * slice_size + slice_i;
       out[i] = ldg(params + params_i);
     }
   }
@@ -52,9 +72,10 @@ __global__ void GatherOpKernel(const T* params, const Index* indices, T* out,
 namespace functor {
 template <typename T, typename Index>
 struct GatherFunctor<GPUDevice, T, Index> {
-  int64 operator()(const GPUDevice& d, typename TTypes<T>::ConstMatrix params,
+  int64 operator()(const GPUDevice& d,
+                   typename TTypes<T, 3>::ConstTensor params,
                    typename TTypes<Index>::ConstFlat indices,
-                   typename TTypes<T>::Matrix out) {
+                   typename TTypes<T, 3>::Tensor out) {
     const int64 out_size = out.size();
     if (out_size == 0) {
       // We need a check here since the CPU version does useful error checking
@@ -63,15 +84,27 @@ struct GatherFunctor<GPUDevice, T, Index> {
       // checking, so we skip the loop entirely.
       return -1;
     }
-    const int64 first_dim_size = params.dimension(0);
+    const bool is_axis_zero = params.dimension(0) == 1;
+    const int64 gather_dim_size = params.dimension(1);
     const int64 indices_size = indices.size();
+    const int64 slice_size = params.dimension(2);
+
     CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
-    // clang-format off
-    GatherOpKernel<T, Index>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            params.data(), indices.data(), out.data(), first_dim_size,
-            indices_size, out_size);
-    // clang-format on
+    if (is_axis_zero) {
+      // clang-format off
+      GatherOpKernel<T, Index, true>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              params.data(), indices.data(), out.data(), gather_dim_size,
+              indices_size, slice_size, out_size);
+      // clang-format on
+    } else {
+      // clang-format off
+      GatherOpKernel<T, Index, false>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              params.data(), indices.data(), out.data(), gather_dim_size,
+              indices_size, slice_size, out_size);
+      // clang-format on
+    }
     // TODO(fpmc): enable indices validation on GPU.
     // Right now checking for indicies out of bound in the kernel would
     // require copying code between GPU/CPU, and thus slow.
diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc
index 73f30cdae37ffb79c80b7f3bfb6163ed08b09fa0..9526f1119b73e040e391e473af0513b3171e6397 100644
--- a/tensorflow/core/kernels/gather_nd_op.cc
+++ b/tensorflow/core/kernels/gather_nd_op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index dd25f589574cd023f3f08d9b5be689fc8de3b9d2..e649c54fa80a7b3f1c18a1ac6fa453105580cdf8 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -37,15 +37,7 @@ class GatherOp : public OpKernel {
   //   we have the framework do some sort of integer promotion
   //   automatically, or should that be something that users have to
   //   do explicitly with a conversion operator in the graph?
-  explicit GatherOp(OpKernelConstruction* c) : OpKernel(c) {
-    const DataType dt = DataTypeToEnum<T>::v();
-    const DataType index_t = DataTypeToEnum<Index>::v();
-    OP_REQUIRES_OK(c, c->MatchSignature({dt, index_t}, {dt}));
-    // We used to grab the validate_indices attribute here, but now we
-    // always validate indices since the speed difference was only 1.5%.
-    // TODO(irving): Remove the validate_indices attribute once we have
-    // support for removing attrs in a backwards compatible way.
-  }
+  explicit GatherOp(OpKernelConstruction* c) : OpKernel(c) {}
 
   void Compute(OpKernelContext* c) override {
     const Tensor& params = c->input(0);
@@ -54,27 +46,64 @@ class GatherOp : public OpKernel {
         c, TensorShapeUtils::IsVectorOrHigher(params.shape()),
         errors::InvalidArgument("params must be at least 1 dimensional"));
 
+    // GatherV2 added an axis argument. For backwards compatibility with Gather,
+    // fall back to axis 0 if the op does not have an axis input.
+    int64 axis = 0;
+    if (c->num_inputs() == 3) {
+      const Tensor& axis_tensor = c->input(2);
+      OP_REQUIRES(c, TensorShapeUtils::IsScalar(axis_tensor.shape()),
+                  errors::InvalidArgument("axis must be scalar"));
+
+      if (axis_tensor.dtype() == DT_INT32) {
+        axis = axis_tensor.scalar<int32>()();
+      } else if (axis_tensor.dtype() == DT_INT64) {
+        axis = axis_tensor.scalar<int64>()();
+      } else {
+        OP_REQUIRES(c, false,
+                    errors::InvalidArgument("axis must be int32 or int64."));
+      }
+    }
+
+    OP_REQUIRES(
+        c, axis >= -params.dims() && axis < params.dims(),
+        errors::InvalidArgument("Expected axis in the range [", -params.dims(),
+                                ", ", params.dims(), "), but got ", axis));
+    if (axis < 0) {
+      axis = params.dims() + axis;
+    }
+
     // Check that we have enough index space
+    const int64 gather_dim_size = params.dim_size(axis);
     const int64 N = indices.NumElements();
     OP_REQUIRES(
-        c, params.dim_size(0) <= std::numeric_limits<Index>::max(),
-        errors::InvalidArgument("params.shape[0] too large for ",
+        c, gather_dim_size <= std::numeric_limits<Index>::max(),
+        errors::InvalidArgument("params.shape[", axis, "] too large for ",
                                 DataTypeString(DataTypeToEnum<Index>::v()),
-                                " indexing: ", params.dim_size(0), " > ",
+                                " indexing: ", gather_dim_size, " > ",
                                 std::numeric_limits<Index>::max()));
 
-    // The result shape is indices.shape + params.shape[1:].
-    TensorShape result_shape = indices.shape();
-    for (int i = 1; i < params.dims(); i++) {
+    // The result shape is params.shape[0:axis] + indices.shape +
+    // params.shape[axis + 1:].
+    TensorShape result_shape;
+    int64 outer_size = 1;
+    int64 inner_size = 1;
+    for (int i = 0; i < axis; i++) {
+      result_shape.AddDim(params.dim_size(i));
+      outer_size *= params.dim_size(i);
+    }
+    result_shape.AppendShape(indices.shape());
+    for (int i = axis + 1; i < params.dims(); i++) {
       result_shape.AddDim(params.dim_size(i));
+      inner_size *= params.dim_size(i);
     }
 
     Tensor* out = nullptr;
     OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out));
-    if (N > 0) {
-      auto params_flat = params.flat_outer_dims<T>();
+    if (N > 0 && outer_size > 0 && inner_size > 0) {
+      auto params_flat =
+          params.shaped<T, 3>({outer_size, gather_dim_size, inner_size});
       auto indices_flat = indices.flat<Index>();
-      auto out_flat = out->shaped<T, 2>({N, out->NumElements() / N});
+      auto out_flat = out->shaped<T, 3>({outer_size, N, inner_size});
 
       functor::GatherFunctor<Device, T, Index> functor;
       int64 bad_i = functor(c->eigen_device<Device>(), params_flat,
@@ -84,7 +113,7 @@ class GatherOp : public OpKernel {
           c, bad_i < 0,
           errors::InvalidArgument(
               "indices", SliceDebugString(indices.shape(), bad_i), " = ",
-              indices_flat(bad_i), " is not in [0, ", params.dim_size(0), ")"));
+              indices_flat(bad_i), " is not in [0, ", gather_dim_size, ")"));
     }
   }
 };
@@ -94,6 +123,12 @@ class GatherOp : public OpKernel {
                               .Device(DEVICE_##dev)                    \
                               .TypeConstraint<type>("Tparams")         \
                               .TypeConstraint<index_type>("Tindices"), \
+                          GatherOp<dev##Device, type, index_type>);    \
+  REGISTER_KERNEL_BUILDER(Name("GatherV2")                             \
+                              .Device(DEVICE_##dev)                    \
+                              .TypeConstraint<type>("Tparams")         \
+                              .TypeConstraint<index_type>("Tindices")  \
+                              .HostMemory("axis"),                     \
                           GatherOp<dev##Device, type, index_type>)
 
 #define REGISTER_GATHER_ALL_INDICES(dev, type) \
diff --git a/tensorflow/core/kernels/gather_op_test.cc b/tensorflow/core/kernels/gather_op_test.cc
index 10d5aefe437ac33a9a27e17eb36ef591c60152ab..3edcb34bca3eaa28249cffaa8b0a79f90cdfb7dd 100644
--- a/tensorflow/core/kernels/gather_op_test.cc
+++ b/tensorflow/core/kernels/gather_op_test.cc
@@ -40,9 +40,10 @@ namespace {
 class GatherOpTest : public OpsTestBase {
  protected:
   void MakeOp(DataType data_type, DataType index_type) {
-    TF_ASSERT_OK(NodeDefBuilder("myop", "Gather")
+    TF_ASSERT_OK(NodeDefBuilder("myop", "GatherV2")
                      .Input(FakeInput(data_type))
                      .Input(FakeInput(index_type))
+                     .Input(FakeInput(index_type))
                      .Finalize(node_def()));
     TF_ASSERT_OK(InitOp());
   }
@@ -54,6 +55,7 @@ TEST_F(GatherOpTest, ScalarIndices) {
   // Feed and run
   AddInputFromArray<float>(TensorShape({5}), {0, 1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({}), {3});
+  AddInputFromArray<int32>(TensorShape({}), {0});
   TF_ASSERT_OK(RunOpKernel());
 
   // Check the output.
@@ -71,6 +73,7 @@ TEST_F(GatherOpTest, ScalarIndices_Complex) {
                          std::complex<float>(2, 12), std::complex<float>(3, 13),
                          std::complex<float>(4, 14)});
   AddInputFromArray<int32>(TensorShape({}), {3});
+  AddInputFromArray<int32>(TensorShape({}), {0});
   TF_ASSERT_OK(RunOpKernel());
 
   // Check the output.
@@ -80,13 +83,14 @@ TEST_F(GatherOpTest, ScalarIndices_Complex) {
   test::ExpectTensorEqual<std::complex<float>>(expected, *GetOutput(0));
 }
 
-TEST_F(GatherOpTest, Simple_TwoD32) {
+TEST_F(GatherOpTest, Simple_TwoD32_Axis0) {
   MakeOp(DT_FLOAT, DT_INT32);
 
   // Feed and run
   AddInputFromArray<float>(TensorShape({5, 3}),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
   AddInputFromArray<int32>(TensorShape({4}), {0, 4, 0, 2});
+  AddInputFromArray<int32>(TensorShape({}), {0});
   TF_ASSERT_OK(RunOpKernel());
 
   // Check the output.
@@ -95,12 +99,30 @@ TEST_F(GatherOpTest, Simple_TwoD32) {
   test::ExpectTensorEqual<float>(expected, *GetOutput(0));
 }
 
+TEST_F(GatherOpTest, Simple_TwoD32_Axis1) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({5, 3}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+  AddInputFromArray<int32>(TensorShape({4}), {0, 1, 0, 2});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 4}));
+  test::FillValues<float>(&expected, {0, 1, 0, 2,  3, 4,  3,  5,  6,  7,
+                                      6, 8, 9, 10, 9, 11, 12, 13, 12, 14});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
 TEST_F(GatherOpTest, ZeroSize_TwoD32) {
   MakeOp(DT_FLOAT, DT_INT32);
 
   // Feed and run
   AddInputFromArray<float>(TensorShape({5, 0}), {});
   AddInputFromArray<int32>(TensorShape({4}), {0, 4, 0, 2});
+  AddInputFromArray<int32>(TensorShape({}), {0});
   TF_ASSERT_OK(RunOpKernel());
 
   // Check the output.
@@ -115,6 +137,7 @@ TEST_F(GatherOpTest, Simple_TwoD64) {
   AddInputFromArray<float>(TensorShape({5, 3}),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
   AddInputFromArray<int64>(TensorShape({4}), {0, 4, 0, 2});
+  AddInputFromArray<int64>(TensorShape({}), {0});
   TF_ASSERT_OK(RunOpKernel());
 
   // Check the output.
@@ -129,6 +152,7 @@ TEST_F(GatherOpTest, HighRank) {
   // Feed and run
   AddInputFromArray<float>(TensorShape({4}), {0, 1, 2, 3});
   AddInputFromArray<int32>(TensorShape({2, 3}), {1, 2, 0, 2, 3, 0});
+  AddInputFromArray<int32>(TensorShape({}), {0});
   TF_ASSERT_OK(RunOpKernel());
 
   // Check the output
@@ -144,6 +168,7 @@ TEST_F(GatherOpTest, Error_IndexOutOfRange) {
   AddInputFromArray<float>(TensorShape({5, 3}),
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
   AddInputFromArray<int32>(TensorShape({4}), {0, 4, 99, 2});
+  AddInputFromArray<int32>(TensorShape({}), {0});
   Status s = RunOpKernel();
   EXPECT_TRUE(
       StringPiece(s.ToString()).contains("indices[2] = 99 is not in [0, 5)"))
@@ -172,8 +197,12 @@ static Graph* Gather(int dim) {
     indices.flat<Index>()(i) = indices_vec[i];
   }
 
+  Tensor axis(DataTypeToEnum<Index>::value, TensorShape({}));
+  axis.scalar<Index>()() = 0;
+
   test::graph::Gather(g, test::graph::Constant(g, params),
-                      test::graph::Constant(g, indices));
+                      test::graph::Constant(g, indices),
+                      test::graph::HostConstant(g, axis));
   return g;
 }
 
diff --git a/tensorflow/core/kernels/gpu_utils.h b/tensorflow/core/kernels/gpu_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..366877bcf5f57139a5600c4e198a7862d8ed9ef7
--- /dev/null
+++ b/tensorflow/core/kernels/gpu_utils.h
@@ -0,0 +1,165 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
+
+#if GOOGLE_CUDA
+
+#include <unordered_map>
+
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor.h"
+
+namespace tensorflow {
+
+template <typename T>
+inline perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
+                                                           uint64 size) {
+  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
+                                                size * sizeof(T));
+  perftools::gputools::DeviceMemory<T> typed(wrapped);
+  return typed;
+}
+
+// A helper class that looks up the best autotuned config from parameters.
+// Due to the noisy nature of autotune, especially with multiple devices, it
+// only accepts a config if its margin exceeds a threshold.
+// For the same shape configs, if a new best config matches the previous best,
+// they get promoted; otherwise, the winner gets demoted. This process stops
+// when the winner's score exceeds the threshold.
+// In a bad case when two configs are very close to each other and flips
+// back and forth randomly, the expected number of experiments before autotune
+// settles is O(threshold ^ 2). So we recommend that number of warmup runs
+// for any benchmarks.
+template <typename Parameters, typename Config>
+class AutoTuneMap {
+ public:
+  bool Find(const Parameters& params, Config* config) const {
+    mutex_lock lock(mu_);
+    auto iter = params_config_map_.find(params);
+    if (iter == params_config_map_.end() ||
+        (iter->second.score < min_score_threshold_ &&
+         iter->second.count <= max_autotune_count_)) {
+      return false;
+    }
+    *config = iter->second.config;
+    return true;
+  }
+  void Insert(const Parameters& params, const Config& config) {
+    mutex_lock lock(mu_);
+    auto iter = params_config_map_.find(params);
+    int new_score = 0;
+    if (iter == params_config_map_.end()) {
+      // Create a new entry if params is new.
+      VLOG(1) << GetActionSummary("creates", params, config);
+      params_config_map_.insert(
+          std::make_pair(params, ValueType{config, 1, 1}));
+      new_score = 1;
+    } else if (iter->second.score < min_score_threshold_ &&
+               iter->second.count <= max_autotune_count_) {
+      DCHECK_GT(iter->second.score, 0);
+      if (iter->second.config != config) {
+        // If it is different from the current winner, demotes the winner.
+        VLOG(1) << GetActionSummary("demotes", params, config);
+        new_score = --iter->second.score;
+        ++iter->second.count;
+        if (new_score <= 0) {
+          VLOG(1) << GetActionSummary("erases", params, config);
+          params_config_map_.erase(iter);
+        }
+      } else {
+        // If it is the same as the current winner, promotes the winner.
+        VLOG(1) << GetActionSummary("promotes", params, config);
+        new_score = ++iter->second.score;
+        ++iter->second.count;
+      }
+    }
+    if (new_score >= min_score_threshold_) {
+      VLOG(1) << GetActionSummary("accepts", params, config);
+    }
+  }
+
+ private:
+  AutoTuneMap(const string& name) : name_(name) {
+    min_score_threshold_ = 1;
+    int min_warmup_iterations = 10;
+    const char* threshold_str = getenv("TF_AUTOTUNE_THRESHOLD");
+    if (threshold_str != nullptr) {
+      strings::safe_strto32(threshold_str, &min_score_threshold_);
+    }
+    const char* min_warmup_iteration_str =
+        getenv("TF_AUTOTUNE_MIN_WARMUP_ITERATIONS");
+    if (min_warmup_iteration_str != nullptr) {
+      strings::safe_strto32(min_warmup_iteration_str, &min_warmup_iterations);
+    }
+    min_score_threshold_ = std::max(min_score_threshold_, 1);
+    max_autotune_count_ = std::max(
+        5 * min_score_threshold_ * min_score_threshold_, min_warmup_iterations);
+  }
+
+  template <class Group, class Params, class Cfg>
+  friend class AutoTuneSingleton;
+
+  struct Hasher {
+    std::size_t operator()(const Parameters& parameter) const {
+      return parameter.hash();
+    }
+  };
+
+  string GetActionSummary(StringPiece action, const Parameters& params,
+                          const Config& config) {
+    return strings::Printf("autotune_map %s %s: %s -> (%s)", name_.c_str(),
+                           action.ToString().c_str(), params.ToString().c_str(),
+                           config.ToString().c_str());
+  }
+
+  mutable mutex mu_;
+  struct ValueType {
+    Config config;
+    int32 score;
+    int32 count;
+  };
+  std::unordered_map<Parameters, ValueType, Hasher> params_config_map_
+      GUARDED_BY(mu_);
+  string name_;
+  int32 min_score_threshold_;
+  int32 max_autotune_count_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(AutoTuneMap);
+};
+
+// A Singleton helper that manages the global autotune results by groups.
+// The caller specified arbitrary Group type that can distinguish between
+// different autotune results, even if their Parameters and Configs are the
+// same.
+template <class Group, typename Parameters, typename Config>
+class AutoTuneSingleton {
+ public:
+  typedef AutoTuneMap<Parameters, Config> AutoTuneType;
+  static AutoTuneType* GetInstance() {
+    static AutoTuneType* instance = new AutoTuneType(Group::name());
+    return instance;
+  }
+};
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
diff --git a/tensorflow/core/kernels/group_by_window_dataset_op.cc b/tensorflow/core/kernels/group_by_window_dataset_op.cc
index 948e83390e195f0ca0d480137da1470e9dae7d8c..94591a26af0e6dd2d97c538c2dac7cf240545c82 100644
--- a/tensorflow/core/kernels/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/group_by_window_dataset_op.cc
@@ -42,7 +42,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
-    int64 window_size;
+    int64 window_size = 0;
     OP_REQUIRES_OK(
         ctx, ParseScalarArgument<int64>(ctx, "window_size", &window_size));
     OP_REQUIRES(
diff --git a/tensorflow/core/kernels/hexagon/BUILD b/tensorflow/core/kernels/hexagon/BUILD
index e9391c8954414a6feb58adbc3679cb3bb254c605..a01a4c40dae3cf003ccd423bdaeedd5b67bb113c 100644
--- a/tensorflow/core/kernels/hexagon/BUILD
+++ b/tensorflow/core/kernels/hexagon/BUILD
@@ -26,23 +26,6 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
-tf_cc_test(
-    name = "quantized_matmul_op_for_hexagon_test",
-    size = "small",
-    srcs = ["quantized_matmul_op_for_hexagon_test.cc"],
-    tags = ["nomsan"],  # http://b/32242946
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:ops_testutil",
-        "//tensorflow/core/kernels:ops_util",
-        "//tensorflow/core/kernels:quantized_ops",
-    ],
-)
-
 tf_cc_test(
     name = "graph_transferer_test",
     size = "small",
@@ -79,14 +62,14 @@ tf_kernel_library(
         "graph_transferer.cc",
         "hexagon_control_wrapper.cc",
         "hexagon_ops_definitions.cc",
-        "i_graph_transfer_ops_definitions.cc",
+        "soc_interface.cc",
     ],
     hdrs = [
         "graph_transfer_utils.h",
         "graph_transferer.h",
         "hexagon_control_wrapper.h",
         "hexagon_ops_definitions.h",
-        "i_graph_transfer_ops_definitions.h",
+        "soc_interface.h",
     ],
     deps = [
         "//tensorflow/cc:cc_ops",
@@ -111,6 +94,7 @@ cc_library(
         ":graph_transferer",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:remote_fused_graph_ops",
+        "//tensorflow/core/kernels:remote_fused_graph_execute_utils",
         "//tensorflow/tools/graph_transforms:transform_utils",
     ],
     alwayslink = 1,
@@ -121,14 +105,17 @@ tf_cc_test(
     size = "small",
     srcs = ["hexagon_rewriter_transform_test.cc"],
     deps = [
+        ":graph_transferer",
         ":hexagon_rewriter_transform",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:remote_fused_graph_execute_utils",
         "//tensorflow/tools/graph_transforms:transform_utils",
     ],
 )
diff --git a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
index 04697c3b15f477d27aa7b5a348f08f4168ee16be..f0d7c670a62bf0a520cb37f01beda530d157d5c7 100644
--- a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/platform/logging.h"
 namespace tensorflow {
@@ -95,7 +96,7 @@ GraphTransferUtils::BuildRemoteFusedGraphExecuteInfo(
 }
 
 /* static */ GraphDef GraphTransferUtils::BuildFusedGraphDef(
-    const IGraphTransferOpsDefinitions& ops_definitions,
+    const IRemoteFusedGraphOpsDefinitions& ops_definitions,
     const string& remote_graph_execute_name,
     const std::vector<std::pair<string, Tensor>>& inputs,
     const std::vector<string>& outputs, GraphDef* original_def) {
diff --git a/tensorflow/core/kernels/hexagon/graph_transfer_utils.h b/tensorflow/core/kernels/hexagon/graph_transfer_utils.h
index a11e2e6eb1bef1901a86b3803dbf20e01f55845c..352d548bd3e08dcf5d73691e64535ead1be34983 100644
--- a/tensorflow/core/kernels/hexagon/graph_transfer_utils.h
+++ b/tensorflow/core/kernels/hexagon/graph_transfer_utils.h
@@ -39,7 +39,7 @@ class GraphTransferUtils {
                                    const int element_count, const int top_n);
 
   static GraphDef BuildFusedGraphDef(
-      const IGraphTransferOpsDefinitions& ops_definitions,
+      const IRemoteFusedGraphOpsDefinitions& ops_definitions,
       const string& remote_graph_execute_name,
       const std::vector<std::pair<string, Tensor>>& inputs,
       const std::vector<string>& outputs, GraphDef* original_def);
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.cc b/tensorflow/core/kernels/hexagon/graph_transferer.cc
index d927ef3efa08bf7f0fdb255e21b59b0620475a83..7768acc771482e0a6f646cd60da649ed4788e162 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session.h"
@@ -43,10 +44,14 @@ const char INPUTS_NODE_PREFIX[] = "inputs_for_";
 const char OUTPUTS_NODE_PREFIX[] = "outputs_for_";
 const char DATA_NODE_PREFIX[] = "data_for_op_";
 const char CONST_SHAPE_PREFIX[] = "const_shape_";
+const char CONST_VAL_PREFIX[] = "const_val_";
+const char CONST_TENSOR_PREFIX[] = "const_tensor_";
 const char PADDING_ATTR_NAME[] = "padding";
 const char STRIDES_ATTR_NAME[] = "strides";
+const char KEEP_DIMS_ATTR_NAME[] = "keep_dims";
 const char KSIZE_ATTR_NAME[] = "ksize";
 const char NULL_OUTPUT_NAME[] = "NULL";
+const char AGGREGATED_INPUT_NODE_NAME[] = "graph_transfer_aggregated_input";
 const int PADDING_NA_ID = 0;  // VALID = 1, SAME = 2
 
 // This is a temporary workaround to support android build
@@ -58,6 +63,16 @@ static string ToString(T val) {
   return stream.str();
 }
 
+static Node* FindMutableNodeByName(const string& name, Graph* graph) {
+  const TensorId tid = ParseTensorName(name);
+  for (Node* node : graph->nodes()) {
+    if (node != nullptr && node->name() == tid.first) {
+      return node;
+    }
+  }
+  return nullptr;
+}
+
 /**
  * graph loading functions
  * - LoadGraphFromProto
@@ -66,13 +81,13 @@ static string ToString(T val) {
  * of node to transfer the graph to SOC.
  */
 Status GraphTransferer::LoadGraphFromProto(
-    const IGraphTransferOpsDefinitions& ops_definitions,
+    const IRemoteFusedGraphOpsDefinitions& ops_definitions,
     const GraphDef& graph_def,
     const std::vector<std::pair<string, Tensor>>& input_node_info_list,
     const std::vector<string>& output_node_names,
     const bool shape_inference_for_unknown_shape) {
   Graph graph(OpRegistry::Global());
-  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
   Status status = ImportGraphDef({}, graph_def, &graph, &shape_refiner);
   if (!status.ok()) {
     return status;
@@ -86,13 +101,22 @@ Status GraphTransferer::LoadGraphFromProto(
     }
   }
 
+  TF_RETURN_IF_ERROR(TransformGraphToAddAggregatedInputNode(
+      input_node_info_list, &graph, &shape_refiner));
+
   std::unordered_multimap<string, const Node*> op_name_to_node_multimap(
       graph.num_nodes());
   for (const Node* const node : graph.nodes()) {
+    if (node == nullptr) {
+      continue;
+    }
     CacheNode(*node);
   }
 
   for (const Node* const node : graph.nodes()) {
+    if (node == nullptr) {
+      continue;
+    }
     VLOG(1) << "<Node> " << node->name();
     for (const Node* const input_node : node->in_nodes()) {
       const string& name = input_node->name();
@@ -102,6 +126,9 @@ Status GraphTransferer::LoadGraphFromProto(
   }
 
   for (const Node* const node : graph.nodes()) {
+    if (node == nullptr) {
+      continue;
+    }
     status = RegisterNodeIfAllInputsAreCached(
         ops_definitions, shape_refiner, *node, false, input_node_info_list,
         output_node_names);
@@ -150,9 +177,6 @@ Status GraphTransferer::LoadGraphFromProto(
     }
   }
 
-  graph_transfer_info_.set_destination(
-      ops_definitions.GetTransferDestination());
-
   ClearCache();
   if (DBG_DUMP_PARAMS) {
     DumpNodeTransferParams();
@@ -164,7 +188,7 @@ Status GraphTransferer::LoadGraphFromProto(
 }
 
 Status GraphTransferer::LoadGraphFromProtoFile(
-    const IGraphTransferOpsDefinitions& ops_definitions,
+    const IRemoteFusedGraphOpsDefinitions& ops_definitions,
     const string& graph_def_path,
     const std::vector<std::pair<string, Tensor>>& input_node_info_list,
     const std::vector<string>& output_node_names, const bool is_text_proto,
@@ -265,19 +289,16 @@ GraphTransferInfo& GraphTransferer::GetMutableGraphTransferInfo() {
   return graph_transfer_info_;
 }
 
-int GraphTransferer::CacheNode(const Node& node) {
+void GraphTransferer::CacheNode(const Node& node) {
   if (node_name_to_id_cache_map_.count(node.name()) > 0) {
-    VLOG(1) << "Emplace node to cache failed";
-    // TODO(satok): check here?
-    return -1;
+    return;
   }
-  VLOG(1) << "Cache node: " << node.name() << ", " << node.op_def().name();
   node_name_cache_list_.emplace_back(&node);
+  const int node_id = node_name_cache_list_.size() - 1;
   bool emplace_succeeded = false;
-  std::tie(std::ignore, emplace_succeeded) = node_name_to_id_cache_map_.emplace(
-      node.name(), node_name_cache_list_.size() - 1);
+  std::tie(std::ignore, emplace_succeeded) =
+      node_name_to_id_cache_map_.emplace(node.name(), node_id);
   CHECK(emplace_succeeded);
-  return node_name_cache_list_.size() - 1;
 }
 
 bool GraphTransferer::AreAllInputsCached(const Node& node) const {
@@ -291,26 +312,130 @@ bool GraphTransferer::AreAllInputsCached(const Node& node) const {
   return true;
 }
 
+Status GraphTransferer::TransformGraphToAddAggregatedInputNode(
+    const std::vector<std::pair<string, Tensor>>& input_node_info_list,
+    Graph* graph, ShapeRefiner* shape_refiner) {
+  // Transform a remote fused graph to add an aggregated input node which takes
+  // all inputs of the remote graph.
+  DataTypeVector input_data_types;
+  std::vector<DataType> data_types;
+  std::vector<TensorShape> shapes;
+  std::vector<string> input_nodes;
+  for (int i = 0; i < input_node_info_list.size(); ++i) {
+    Node* node = FindMutableNodeByName(input_node_info_list.at(i).first, graph);
+    CHECK_NOTNULL(node);
+    input_nodes.emplace_back(node->name());
+    input_data_types.emplace_back(input_node_info_list.at(i).second.dtype());
+    data_types.emplace_back(input_node_info_list.at(i).second.dtype());
+    shapes.emplace_back(input_node_info_list.at(i).second.shape());
+  }
+
+  NodeDef input_node_def;
+  auto builder =
+      NodeBuilder(AGGREGATED_INPUT_NODE_NAME, "RemoteFusedGraphExecute")
+          .Input(std::vector<NodeBuilder::NodeOut>{})
+          .Attr("Tinputs", DataTypeVector{})
+          .Attr("Toutputs", input_data_types)
+          .Attr("serialized_remote_fused_graph_execute_info", "")
+          .Attr(RemoteFusedGraphExecuteUtils::ATTR_OUTPUT_DATA_TYPES,
+                data_types)
+          .Attr(RemoteFusedGraphExecuteUtils::ATTR_OUTPUT_SHAPES, shapes);
+
+  Node* input_node;
+  TF_RETURN_IF_ERROR(builder.Finalize(graph, &input_node));
+  CHECK_NOTNULL(input_node);
+
+  bool refined;
+  TF_RETURN_IF_ERROR(
+      shape_refiner->UpdateNode(input_node, false /* relax */, &refined));
+
+  shape_inference::InferenceContext* context =
+      shape_refiner->GetContext(input_node);
+  for (int i = 0; i < input_node_info_list.size(); ++i) {
+    shape_inference::ShapeHandle handle;
+    TF_RETURN_IF_ERROR(context->MakeShapeFromTensorShape(
+        input_node_info_list.at(i).second.shape(), &handle));
+    TF_RETURN_IF_ERROR(shape_refiner->SetShape(input_node, i, handle));
+  }
+
+  // Cache the aggregate input node first as it's consumed first.
+  CacheNode(*input_node);
+
+  std::vector<Node*> original_input_nodes(input_nodes.size());
+
+  for (int i = 0; i < input_nodes.size(); ++i) {
+    const string& node_name = input_nodes.at(i);
+    Node* original_input_node = FindMutableNodeByName(node_name, graph);
+    CHECK_NOTNULL(original_input_node);
+    CHECK_EQ(1, original_input_node->num_outputs());  // replaced by identity.
+    Node* created_node;
+    TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::BuildIdentityOpNode(
+        node_name, AGGREGATED_INPUT_NODE_NAME, i, data_types.at(i), graph,
+        &created_node));
+    CHECK_NOTNULL(created_node);
+    std::vector<DataType> data_types;
+    std::vector<TensorShape> shapes;
+    Status status = RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
+        original_input_node->def(), &data_types, &shapes);
+    if (status.ok()) {
+      created_node->AddAttr(
+          RemoteFusedGraphExecuteUtils::ATTR_OUTPUT_DATA_TYPES, data_types);
+      created_node->AddAttr(RemoteFusedGraphExecuteUtils::ATTR_OUTPUT_SHAPES,
+                            shapes);
+    }
+    for (const Edge* out_edge : original_input_node->out_edges()) {
+      Node* dst = out_edge->dst();
+      int dst_port = out_edge->dst_input();
+      // Unused edge will be removed when removing node.
+      graph->AddEdge(created_node, 0, dst, dst_port);
+    }
+    original_input_nodes[i] = original_input_node;
+
+    TF_RETURN_IF_ERROR(
+        shape_refiner->UpdateNode(created_node, false /* relax */, &refined));
+
+    shape_inference::InferenceContext* context =
+        shape_refiner->GetContext(created_node);
+    CHECK_NOTNULL(context);
+
+    // Cache replaced input node next to the aggregated input node.
+    CacheNode(*created_node);
+  }
+
+  // Remove original input nodes after adding new input nodes to avoid
+  // reusing same pointer in Graph.
+  for (Node* original_input_node : original_input_nodes) {
+    graph->RemoveNode(original_input_node);
+  }
+
+  return Status::OK();
+}
+
 Status GraphTransferer::RegisterNode(
-    const IGraphTransferOpsDefinitions& ops_definitions,
+    const IRemoteFusedGraphOpsDefinitions& ops_definitions,
     const ShapeRefiner& shape_refiner, const Node& node,
     const std::vector<std::pair<string, Tensor>>& input_node_info_list,
     const std::vector<string>& output_node_names) {
-  VLOG(1) << "Register node: " << node.name();
+  VLOG(1) << "Register node: " << node.name() << ", " << std::hex
+          << node_name_to_id_cache_map_.at(node.name());
   if (node.name() == SOURCE_NODE_NAME || node.name() == SINK_NODE_NAME) {
     // Just ignore sink and source
-    return Status();
-  } else if (RemoteFusedGraphExecuteUtils::IsInputNode(input_node_info_list,
-                                                       node.name())) {
+    return Status::OK();
+  } else if (node.name() == AGGREGATED_INPUT_NODE_NAME) {
     RegisterInputNode(ops_definitions, shape_refiner, node);
+    return Status::OK();
   } else if (node.IsConstant()) {
     RegisterConstantNode(shape_refiner, node);
+  } else if (IsPadNode(node)) {
+    RegisterPadNode(ops_definitions, shape_refiner, node);
   } else if (HasPaddingAndStrides(node)) {
     RegisterNodeWithPaddingAndStrides(ops_definitions, shape_refiner, node);
+  } else if (NeedsToAddRank(node)) {
+    RegisterNodeWithRank(ops_definitions, shape_refiner, node);
   } else if (IsNodeFlattenReshape(node, shape_refiner)) {
     RegisterFlattenNode(ops_definitions, shape_refiner, node);
   } else if (ops_definitions.GetOpIdFor(node.type_string(), {}) !=
-             IGraphTransferOpsDefinitions::INVALID_OP_ID) {
+             IRemoteFusedGraphOpsDefinitions::INVALID_OP_ID) {
     // TODO(satok): Set correct data type if it's given.
     RegisterGenericNode(ops_definitions, shape_refiner, node);
   } else {
@@ -318,7 +443,7 @@ Status GraphTransferer::RegisterNode(
                                    " has not been implemented yet.");
   }
 
-  return Status();
+  return Status::OK();
 }
 
 void GraphTransferer::RegisterConstantNode(const ShapeRefiner& shape_refiner,
@@ -361,8 +486,7 @@ void GraphTransferer::RegisterConstantNode(const ShapeRefiner& shape_refiner,
   const TensorProto* proto = nullptr;
   TF_CHECK_OK(GetNodeAttr(node.attrs(), "value", &proto));
   Tensor const_tensor;
-  // TODO(b/32704451): Don't just ignore this status!
-  MakeTensorFromProto(*proto, &const_tensor).IgnoreError();
+  TF_CHECK_OK(MakeTensorFromProto(*proto, &const_tensor));
 
   const_node_info.set_dtype(const_tensor.dtype());
   if (data_size > 0) {
@@ -394,12 +518,82 @@ int GraphTransferer::RegisterConstantShape(const std::vector<int>& shape) {
   return node_name_to_id_cache_map_[shape_name];
 }
 
+int GraphTransferer::RegisterConstTensor(const Tensor& tensor,
+                                         const string& suffix) {
+  VLOG(1) << "Cache const tensor.";
+  const int dims = tensor.shape().dims();
+  CHECK(dims <= 4);
+  const string node_name = strings::StrCat(CONST_TENSOR_PREFIX, "_", suffix);
+  if (node_name_to_id_cache_map_.count(node_name) <= 0) {
+    node_name_cache_list_.emplace_back(nullptr);
+    const int id = node_name_cache_list_.size() - 1;
+    node_name_to_id_cache_map_.emplace(node_name, id);
+    GraphTransferInfo::ConstNodeInfo& const_node_info =
+        *graph_transfer_info_.add_const_node_info();
+    const_node_info.set_name(node_name);
+    const_node_info.set_node_id(id);
+    CHECK_EQ(4, SHAPE_ARRAY_SIZE);
+    for (int i = 0; i < SHAPE_ARRAY_SIZE; ++i) {
+      if (i < SHAPE_ARRAY_SIZE - dims) {
+        const_node_info.add_shape(1);
+      } else {
+        const_node_info.add_shape(
+            tensor.shape().dim_size(i - (SHAPE_ARRAY_SIZE - dims)));
+      }
+    }
+    const_node_info.set_dtype(tensor.dtype());
+    const_node_info.set_data(tensor.tensor_data().data(),
+                             tensor.tensor_data().size());
+  }
+  return node_name_to_id_cache_map_[node_name];
+}
+
+int GraphTransferer::RegisterConstScalar(const DataType dt, const int val,
+                                         const int dst_id,
+                                         const int dst_input_count) {
+  VLOG(1) << "Cache const.";
+  const string val_name =
+      CONST_VAL_PREFIX + ToString(dst_id) + '_' + ToString(dst_input_count);
+  if (node_name_to_id_cache_map_.count(val_name) <= 0) {
+    node_name_cache_list_.emplace_back(nullptr);
+    const int id = node_name_cache_list_.size() - 1;
+    node_name_to_id_cache_map_.emplace(val_name, id);
+    GraphTransferInfo::ConstNodeInfo& const_node_info =
+        *graph_transfer_info_.add_const_node_info();
+    const_node_info.set_name(val_name);
+    const_node_info.set_node_id(id);
+    // TODO(satok): Do not assume rank is 4 here.
+    const_node_info.add_shape(static_cast<int64>(1));
+    const_node_info.add_shape(static_cast<int64>(1));
+    const_node_info.add_shape(static_cast<int64>(1));
+    const_node_info.add_shape(static_cast<int64>(1));
+    const_node_info.set_data(&val, DataTypeSize(dt));
+  }
+  return node_name_to_id_cache_map_[val_name];
+}
+
 bool GraphTransferer::HasPaddingAndStrides(const Node& node) {
   auto attrs = node.attrs();
   return attrs.Find(PADDING_ATTR_NAME) != nullptr &&
          attrs.Find(STRIDES_ATTR_NAME) != nullptr;
 }
 
+bool GraphTransferer::NeedsToAddRank(const Node& node) {
+  const string& op_type = node.def().op();
+  if (op_type == "Transpose" || op_type == "ExpandDims") {
+    return true;
+  }
+  return false;
+}
+
+bool GraphTransferer::IsPadNode(const Node& node) {
+  const string& op_type = node.def().op();
+  if (op_type == "Pad") {
+    return true;
+  }
+  return false;
+}
+
 bool GraphTransferer::IsNodeFlattenReshape(const Node& node,
                                            const ShapeRefiner& shape_refiner) {
   // Check if node is reshape op
@@ -440,7 +634,7 @@ bool GraphTransferer::IsNodeFlattenReshape(const Node& node,
 }
 
 void GraphTransferer::RegisterNodeWithPaddingAndStrides(
-    const IGraphTransferOpsDefinitions& ops_definitions,
+    const IRemoteFusedGraphOpsDefinitions& ops_definitions,
     const ShapeRefiner& shape_refiner, const Node& node) {
   CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1);
   const int id = node_name_to_id_cache_map_[node.name()];
@@ -473,15 +667,123 @@ void GraphTransferer::RegisterNodeWithPaddingAndStrides(
       node.num_outputs(), true /* append_input */, true /* append_output */);
 }
 
-void GraphTransferer::RegisterInputNode(
-    const IGraphTransferOpsDefinitions& ops_definitions,
+void GraphTransferer::RegisterNodeWithRank(
+    const IRemoteFusedGraphOpsDefinitions& ops_definitions,
     const ShapeRefiner& shape_refiner, const Node& node) {
-  VLOG(1) << "Register input node: " << node.name();
   CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1);
   const int id = node_name_to_id_cache_map_[node.name()];
+  shape_inference::InferenceContext* context = shape_refiner.GetContext(&node);
+  const Node* input0_node;
+  TF_CHECK_OK(node.input_node(0, &input0_node));
+  CHECK_NOTNULL(input0_node);
+  std::vector<TensorShape> shapes;
+  Status status = RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
+      input0_node->def(), nullptr, &shapes);
+  CHECK_EQ(1, shapes.size()) << "Output size should be 1.";
+  const int const_val_id =
+      RegisterConstScalar(DT_INT32, shapes.at(0).dims(), id, node.num_inputs());
+  std::vector<int> extra_inputs{const_val_id};
+  // TODO(satok): Set correct data type if it's given.
+  const int op_type_id = ops_definitions.GetOpIdFor(node.type_string(), {});
+  CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount())
+      << "Op " << node.type_string() << " not found in map(id = " << op_type_id
+      << ")";
+  bool keep_dims = false;
+  int padding_id = PADDING_NA_ID;
+  if (context->GetAttr(KEEP_DIMS_ATTR_NAME, &keep_dims).ok()) {
+    padding_id = keep_dims ? Padding::SAME : Padding::VALID;
+  }
+
+  AppendNodeParamsWithIoParams(
+      shape_refiner, node, node.name(), id, node.type_string(), op_type_id,
+      padding_id, node.num_inputs(), extra_inputs, node.num_outputs(),
+      true /* append_input */, true /* append_output */);
+}
+
+void GraphTransferer::RegisterPadNode(
+    const IRemoteFusedGraphOpsDefinitions& ops_definitions,
+    const ShapeRefiner& shape_refiner, const Node& node) {
+  static constexpr int PAD_WIDTH = 4;
+  static constexpr int PAD_HEIGHT = 2;
+  VLOG(1) << "Register generic node: " << node.name();
+  CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1);
+  const int id = node_name_to_id_cache_map_[node.name()];
+
+  // TODO(satok): Set correct data type if it's given.
+  const int op_type_id = ops_definitions.GetOpIdFor(node.type_string(), {});
+  CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount());
+
+  CHECK_EQ(2, node.num_inputs());
+
+  GraphTransferInfo::NodeInputInfo& node_input_info =
+      *graph_transfer_info_.add_node_input_info();
+  node_input_info.set_node_id(id);
+
+  AddNodeInputByInputIndex(node, 0, &node_input_info);
+
+  const Edge* edge = nullptr;
+  TF_CHECK_OK(node.input_edge(1, &edge));
+  const Node* input_node = edge->src();
+  CHECK_NOTNULL(input_node);
+  CHECK(input_node->IsConstant());
+
+  const TensorProto* tensor_proto = nullptr;
+  TF_CHECK_OK(GetNodeAttr(input_node->def(), "value", &tensor_proto));
+  CHECK_NOTNULL(tensor_proto);
+  Tensor const_tensor;
+  TF_CHECK_OK(MakeTensorFromProto(*tensor_proto, &const_tensor));
+  CHECK_EQ(2, const_tensor.shape().dims());
+  CHECK_EQ(PAD_HEIGHT, const_tensor.shape().dim_size(1));
+  if (const_tensor.shape().dim_size(0) == PAD_WIDTH) {
+    AddNodeInputByInputIndex(node, 1, &node_input_info);
+  } else if (const_tensor.shape().dim_size(0) < PAD_WIDTH) {
+    const int width = const_tensor.shape().dim_size(0);
+    const TensorProto* proto = nullptr;
+    TF_CHECK_OK(GetNodeAttr(input_node->def(), "value", &proto));
+    Tensor const_tensor;
+    TF_CHECK_OK(MakeTensorFromProto(*proto, &const_tensor));
+    CHECK_EQ(DT_INT32, const_tensor.dtype());
+    // reshape tensor input to be rank 4.
+    // TODO(satok): Never assume rank is 4.
+    Tensor new_const_tensor(const_tensor.dtype(), TensorShape{4, 2});
+    for (int i = 0; i < PAD_HEIGHT; ++i) {
+      for (int j = 0; j < PAD_WIDTH; ++j) {
+        if (j < PAD_WIDTH - width) {
+          new_const_tensor.matrix<int32>()(j, i) = 0;
+        } else {
+          new_const_tensor.matrix<int32>()(j, i) =
+              const_tensor.matrix<int32>()(j - (PAD_WIDTH - width), i);
+        }
+      }
+    }
+
+    const int id = RegisterConstTensor(
+        new_const_tensor,
+        strings::StrCat(input_node->name(), "_", node.name(), "_1"));
+
+    GraphTransferInfo::NodeInput& node_input =
+        *node_input_info.add_node_input();
+    node_input.set_node_id(id);
+    node_input.set_output_port(0);
+  } else {
+    CHECK(false);
+  }
+
+  AppendNodeParamsWithIoParams(
+      shape_refiner, node, node.name(), id, node.type_string(), op_type_id,
+      PADDING_NA_ID, node.num_inputs(), {}, node.num_outputs(),
+      false /* append_input */, true /* append_output */);
+}
+
+void GraphTransferer::RegisterInputNode(
+    const IRemoteFusedGraphOpsDefinitions& ops_definitions,
+    const ShapeRefiner& shape_refiner, const Node& node) {
   const string op_type = node.type_string();
+  VLOG(1) << "Register input node: " << node.name() << ", " << op_type;
+  CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1);
+  const int id = node_name_to_id_cache_map_[node.name()];
   // TODO(satok): Set correct data type if it's given.
-  const int op_type_id = ops_definitions.GetOpIdFor(op_type, {});
+  const int op_type_id = ops_definitions.GetOpIdFor("INPUT", {});
   CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount())
       << "Op" << node.name() << ", " << op_type << " is not supported,"
       << op_type_id;
@@ -492,12 +794,13 @@ void GraphTransferer::RegisterInputNode(
 }
 
 void GraphTransferer::RegisterFlattenNode(
-    const IGraphTransferOpsDefinitions& ops_definitions,
+    const IRemoteFusedGraphOpsDefinitions& ops_definitions,
     const ShapeRefiner& shape_refiner, const Node& node) {
   VLOG(1) << "Register flatten node: " << node.name();
   CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1);
   const int id = node_name_to_id_cache_map_[node.name()];
-  const string op_type = IGraphTransferOpsDefinitions::FLATTEN_OP_NAME;
+  // TODO(satok): Remove dependency to specific type
+  const string op_type = "FLATTEN";
   // TODO(satok): Set correct data type if it's given.
   const int op_type_id = ops_definitions.GetOpIdFor(op_type, {});
   CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount());
@@ -509,7 +812,7 @@ void GraphTransferer::RegisterFlattenNode(
 }
 
 void GraphTransferer::RegisterGenericNode(
-    const IGraphTransferOpsDefinitions& ops_definitions,
+    const IRemoteFusedGraphOpsDefinitions& ops_definitions,
     const ShapeRefiner& shape_refiner, const Node& node) {
   VLOG(1) << "Register generic node: " << node.name();
   CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1);
@@ -527,7 +830,7 @@ void GraphTransferer::RegisterGenericNode(
 // TODO(satok): Remove this function.
 // TODO(satok): Remove only_register_const_node.
 Status GraphTransferer::RegisterNodeIfAllInputsAreCached(
-    const IGraphTransferOpsDefinitions& ops_definitions,
+    const IRemoteFusedGraphOpsDefinitions& ops_definitions,
     const ShapeRefiner& shape_refiner, const Node& node,
     const bool only_register_const_node,
     const std::vector<std::pair<string, Tensor>>& input_node_info_list,
@@ -546,7 +849,6 @@ void GraphTransferer::AppendNodeParams(const string& name, const int id,
                                        const int padding, const int inputs_size,
                                        const std::vector<int>& extra_inputs,
                                        const int outputs_size) {
-  VLOG(1) << "Append node params: " << name;
   GraphTransferInfo::NodeInfo& node_info =
       *graph_transfer_info_.add_node_info();
   node_info.set_name(name);
@@ -559,6 +861,23 @@ void GraphTransferer::AppendNodeParams(const string& name, const int id,
   node_info.set_output_count(static_cast<int>(outputs_size));
 }
 
+void GraphTransferer::AddNodeInputByInputIndex(
+    const Node& node, const int idx,
+    GraphTransferInfo::NodeInputInfo* node_input_info) {
+  const Edge* edge = nullptr;
+  TF_CHECK_OK(node.input_edge(idx, &edge));
+  const Node* input_node = edge->src();
+  CHECK_NOTNULL(input_node);
+  const int port = edge->src_output();
+
+  const std::string& op_name = input_node->name();
+  CHECK_GT(node_name_to_id_cache_map_.count(op_name), 0) << op_name;
+  const int src_id = node_name_to_id_cache_map_[op_name];
+  GraphTransferInfo::NodeInput& node_input = *node_input_info->add_node_input();
+  node_input.set_node_id(src_id);
+  node_input.set_output_port(port);
+}
+
 void GraphTransferer::AppendNodeInputParams(
     const int id, const Node& node, const std::vector<int>& extra_inputs) {
   VLOG(1) << "Append input params: " << node.name() << ", " << node.num_inputs()
@@ -567,18 +886,7 @@ void GraphTransferer::AppendNodeInputParams(
       *graph_transfer_info_.add_node_input_info();
   node_input_info.set_node_id(id);
   for (int i = 0; i < node.num_inputs(); ++i) {
-    const Edge* edge = nullptr;
-    TF_CHECK_OK(node.input_edge(i, &edge));
-    const Node* input_node = edge->src();
-    const int port = edge->src_output();
-
-    const std::string& op_name = input_node->name();
-    CHECK_GT(node_name_to_id_cache_map_.count(op_name), 0) << op_name;
-    const int src_id = node_name_to_id_cache_map_[op_name];
-    GraphTransferInfo::NodeInput& node_input =
-        *node_input_info.add_node_input();
-    node_input.set_node_id(src_id);
-    node_input.set_output_port(port);
+    AddNodeInputByInputIndex(node, i, &node_input_info);
   }
   for (const int extra_input : extra_inputs) {
     GraphTransferInfo::NodeInput& node_input =
@@ -596,9 +904,10 @@ void GraphTransferer::AppendNodeOutputParams(const ShapeRefiner& shape_refiner,
       *graph_transfer_info_.add_node_output_info();
   node_output_info.set_node_id(id);
 
+  std::vector<DataType> data_types;
   std::vector<TensorShape> shapes;
   Status status = RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
-      node.attrs(), nullptr, &shapes);
+      node.attrs(), &data_types, &shapes);
 
   for (int i = 0; i < node.num_outputs(); ++i) {
     int data_size = -1;
@@ -608,16 +917,20 @@ void GraphTransferer::AppendNodeOutputParams(const ShapeRefiner& shape_refiner,
 
     shape_inference::InferenceContext* context =
         shape_refiner.GetContext(&node);
-    shape_inference::ShapeHandle shape_handle = context->output(output_index);
-    const shape_inference::DimensionHandle num_elements_dim =
-        context->NumElements(shape_handle);
-    if (context->ValueKnown(num_elements_dim)) {
+
+    if (context != nullptr && context->ValueKnown(context->NumElements(
+                                  context->output(output_index)))) {
+      const shape_inference::DimensionHandle num_elements_dim =
+          context->NumElements(context->output(output_index));
       const int64 num_output_elements = context->Value(num_elements_dim);
       data_size = max_bytes_per_data * num_output_elements;
+      if (status.ok()) {
+        TF_CHECK_OK(status);
+        CHECK_EQ(shapes.at(i).num_elements(), num_output_elements);
+      }
     } else {
       TF_CHECK_OK(status);
       // Use attribute attached to node
-      CHECK_EQ(node.num_outputs(), shapes.size()) << node.name();
       data_size = max_bytes_per_data * shapes.at(i).num_elements();
     }
     CHECK_GE(data_size, 0);
@@ -722,11 +1035,11 @@ bool GraphTransferer::TransferParamsComparator::operator()(
   const int node_id0 = obj0.node_id();
   const int node_id1 = obj1.node_id();
   bool obj0_uses_obj1 = false;
-  if (dependency_map_.count(node_id0)) {
+  if (dependency_map_.count(node_id0) > 0) {
     obj0_uses_obj1 = dependency_map_.at(node_id0).count(node_id1) > 0;
   }
   bool obj1_uses_obj0 = false;
-  if (dependency_map_.count(node_id1)) {
+  if (dependency_map_.count(node_id1) > 0) {
     obj1_uses_obj0 = dependency_map_.at(node_id1).count(node_id0) > 0;
   }
   CHECK(!obj0_uses_obj1 || !obj1_uses_obj0);
@@ -735,7 +1048,9 @@ bool GraphTransferer::TransferParamsComparator::operator()(
   } else if (obj1_uses_obj0) {
     return true;
   }
-  return node_id0 > node_id1;
+  // If there is no dependency between two nodes, it expects that
+  // the execution order follows node id order.
+  return node_id0 < node_id1;
 }
 
 /* static */ void GraphTransferer::FillDependencyRec(
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.h b/tensorflow/core/kernels/hexagon/graph_transferer.h
index fa12b22d75d0069a4d45cb45ea54d16175504519..125d1fd200719de195da2ac3339576decde1ba46 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.h
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.h
@@ -26,7 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph_transfer_info.pb.h"
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h"
+#include "tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h"
 #include "tensorflow/core/kernels/remote_fused_graph_execute_utils.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -53,7 +53,7 @@ class GraphTransferer {
   // TODO(satok): Pass a pair of TensorShape and DataType instead of
   // Tensor as input_node_info_list.
   Status LoadGraphFromProto(
-      const IGraphTransferOpsDefinitions& ops_definitions,
+      const IRemoteFusedGraphOpsDefinitions& ops_definitions,
       const GraphDef& graph_def,
       const std::vector<std::pair<string, Tensor>>& input_node_info_list,
       const std::vector<string>& output_node_names,
@@ -63,7 +63,7 @@ class GraphTransferer {
   // TODO(satok): Pass a pair of TensorShape and DataType instead of
   // Tensor as input_node_info_list.
   Status LoadGraphFromProtoFile(
-      const IGraphTransferOpsDefinitions& ops_definitions,
+      const IRemoteFusedGraphOpsDefinitions& ops_definitions,
       const string& graph_def_path,
       const std::vector<std::pair<string, Tensor>>& input_node_info_list,
       const std::vector<string>& output_node_names, const bool is_text_proto,
@@ -88,6 +88,9 @@ class GraphTransferer {
   // Dump verification string of parameters to verify with offline tools
   void DumpVerificationStringOfNodeTransferParams() const;
 
+  static std::array<int64, SHAPE_ARRAY_SIZE> ToTensorShapeArray(
+      const TensorShape& shape);
+
  private:
   class TransferParamsComparator {
    public:
@@ -98,12 +101,18 @@ class GraphTransferer {
     const std::unordered_map<int, std::unordered_set<int>>& dependency_map_;
   };
 
-  int CacheNode(const Node& node);
+  void CacheNode(const Node& node);
 
   bool AreAllInputsCached(const Node& node) const;
 
+  // Transform a remote fused graph to add an aggregated input node which takes
+  // all inputs of the remote graph.
+  Status TransformGraphToAddAggregatedInputNode(
+      const std::vector<std::pair<string, Tensor>>& input_node_info_list,
+      Graph* graph, ShapeRefiner* shape_refiner);
+
   Status RegisterNode(
-      const IGraphTransferOpsDefinitions& ops_definitions,
+      const IRemoteFusedGraphOpsDefinitions& ops_definitions,
       const ShapeRefiner& shape_refiner, const Node& node,
       const std::vector<std::pair<string, Tensor>>& input_node_info_list,
       const std::vector<string>& output_node_names);
@@ -113,8 +122,17 @@ class GraphTransferer {
 
   int RegisterConstantShape(const std::vector<int>& shape);
 
+  int RegisterConstTensor(const Tensor& tensor, const string& suffix);
+
+  int RegisterConstScalar(const DataType dt, const int val, const int dst_id,
+                          const int dst_input_count);
+
   bool HasPaddingAndStrides(const Node& node);
 
+  bool NeedsToAddRank(const Node& node);
+
+  bool IsPadNode(const Node& node);
+
   // Return true if the node is a reshape op which just flattens input
   // TODO(satok): Remove this method once generic reshape op is implemented in
   // SOC
@@ -122,23 +140,29 @@ class GraphTransferer {
                             const ShapeRefiner& shape_refiner);
 
   void RegisterNodeWithPaddingAndStrides(
-      const IGraphTransferOpsDefinitions& ops_definitions,
+      const IRemoteFusedGraphOpsDefinitions& ops_definitions,
       const ShapeRefiner& shape_refiner, const Node& node);
 
-  void RegisterInputNode(const IGraphTransferOpsDefinitions& ops_definitions,
-                         const ShapeRefiner& shape_refiner,
-                         const Node& node);
+  void RegisterNodeWithRank(
+      const IRemoteFusedGraphOpsDefinitions& ops_definitions,
+      const ShapeRefiner& shape_refiner, const Node& node);
+
+  void RegisterPadNode(const IRemoteFusedGraphOpsDefinitions& ops_definitions,
+                       const ShapeRefiner& shape_refiner, const Node& node);
 
-  void RegisterFlattenNode(const IGraphTransferOpsDefinitions& ops_definitions,
-                           const ShapeRefiner& shape_refiner,
-                           const Node& node);
+  void RegisterInputNode(const IRemoteFusedGraphOpsDefinitions& ops_definitions,
+                         const ShapeRefiner& shape_refiner, const Node& node);
 
-  void RegisterGenericNode(const IGraphTransferOpsDefinitions& ops_definitions,
-                           const ShapeRefiner& shape_refiner,
-                           const Node& node);
+  void RegisterFlattenNode(
+      const IRemoteFusedGraphOpsDefinitions& ops_definitions,
+      const ShapeRefiner& shape_refiner, const Node& node);
+
+  void RegisterGenericNode(
+      const IRemoteFusedGraphOpsDefinitions& ops_definitions,
+      const ShapeRefiner& shape_refiner, const Node& node);
 
   Status RegisterNodeIfAllInputsAreCached(
-      const IGraphTransferOpsDefinitions& ops_definitions,
+      const IRemoteFusedGraphOpsDefinitions& ops_definitions,
       const ShapeRefiner& shape_refiner, const Node& node,
       const bool only_register_const_node,
       const std::vector<std::pair<string, Tensor>>& input_node_info_list,
@@ -150,6 +174,10 @@ class GraphTransferer {
                         const std::vector<int>& extra_inputs,
                         const int outputs_size);
 
+  void AddNodeInputByInputIndex(
+      const Node& node, const int idx,
+      GraphTransferInfo::NodeInputInfo* node_input_info);
+
   void AppendNodeInputParams(const int id, const Node& node,
                              const std::vector<int>& extra_inputs);
 
@@ -167,9 +195,6 @@ class GraphTransferer {
       const int outputs_size, const bool append_input_params,
       const bool append_output_params);
 
-  static std::array<int64, SHAPE_ARRAY_SIZE> ToTensorShapeArray(
-      const TensorShape& shape);
-
   static string ToPaddingDebugString(int padding);
 
   // Create dependency map
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
index ebd4a90330155958da4c1324f368116a2e8f48e8..536d295506c9669b0434059e26094cb70a4f1e87 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
@@ -22,9 +22,10 @@ limitations under the License.
 #include "tensorflow/core/kernels/hexagon/graph_transfer_utils.h"
 #include "tensorflow/core/kernels/hexagon/graph_transferer.h"
 #include "tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h"
-#include "tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h"
 #include "tensorflow/core/kernels/i_remote_fused_graph_executor.h"
+#include "tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
@@ -47,28 +48,25 @@ class GraphTransfererTest : public ::testing::Test {
   GraphTransferer gt_;
 };
 
-static const std::vector<string> OP_TYPES{
-    "INPUT", "OUTPUT", "Conv2D", "MaxPool", "NoOp", "Add", "Const", "Softmax"};
 const RemoteFusedGraphExecuteUtils::TensorShapeMap EMPTY_OUTPUT_TENSOR_MAP;
 
-class TestGraphTransferOpsDefinitions : public IGraphTransferOpsDefinitions {
+class TestGraphTransferOpsDefinitions : public IRemoteFusedGraphOpsDefinitions {
  public:
-  int GetTotalOpsCount() const final { return OP_TYPES.size(); }
+  int GetTotalOpsCount() const final { return op_types_.size(); }
 
-int GetOpIdFor(const string& op_type, const DataTypeVector&) const final {
-  for (int i = 0; i < OP_TYPES.size(); ++i) {
-    if (OP_TYPES[i] == op_type) {
-      return i;
+  int GetOpIdFor(const string& op_type, const DataTypeVector&) const final {
+    for (int i = 0; i < op_types_.size(); ++i) {
+      if (op_types_[i] == op_type) {
+        return i;
+      }
     }
-  }
-  return -1;
+    return -1;
 }
 
-GraphTransferInfo::Destination GetTransferDestination() const final {
-  return GraphTransferInfo::NOP;
-  }
-
  private:
+  const std::vector<string> op_types_{"INPUT",   "OUTPUT",  "Conv2D",
+                                      "MaxPool", "NoOp",    "Add",
+                                      "Const",   "Softmax", "Identity"};
 } TEST_GRAPH_TRANSFER_OPS_DEFINITIONS;
 
 static Output BuildAddOps(const Scope& scope, const Input& x, const Input& y) {
@@ -312,7 +310,7 @@ TEST_F(GraphTransfererTest, LoadAddGraphWithOutputTensorMap) {
   const std::vector<string> output_node_names = {NAME_A_PLUS_B};
   status = gt_.LoadGraphFromProto(TEST_GRAPH_TRANSFER_OPS_DEFINITIONS, def,
                                   inputs, output_node_names, false);
-  ASSERT_TRUE(status.ok());
+  TF_ASSERT_OK(status);
 }
 
 TEST_F(GraphTransfererTest, LoadConvGraph) {
@@ -330,7 +328,7 @@ TEST_F(GraphTransfererTest, LoadConvGraph) {
       gt_.GetGraphTransferInfo().const_node_info_size();
   ASSERT_EQ(2, const_node_count);
   const int op_node_count = gt_.GetGraphTransferInfo().node_info_size();
-  ASSERT_EQ(3, op_node_count);
+  ASSERT_EQ(4, op_node_count);
   const GraphTransferInfo::NodeInfo* params_conv = FindNodeInfo(gt_, "conv");
   ASSERT_TRUE(params_conv != nullptr);
   const int id = params_conv->node_id();
@@ -356,7 +354,7 @@ TEST_F(GraphTransfererTest, LoadMaxPoolGraph) {
       gt_.GetGraphTransferInfo().const_node_info_size();
   ASSERT_EQ(2, const_node_count);
   const int op_node_count = gt_.GetGraphTransferInfo().node_info_size();
-  ASSERT_EQ(3, op_node_count);
+  ASSERT_EQ(4, op_node_count);
   const GraphTransferInfo::NodeInfo* params_max_pool =
       FindNodeInfo(gt_, "maxpool");
   ASSERT_TRUE(params_max_pool != nullptr);
@@ -369,14 +367,14 @@ TEST_F(GraphTransfererTest, LoadMaxPoolGraph) {
 }
 
 TEST(HexagonOpsDefinitions, CheckOpsDefinitions) {
-  const IGraphTransferOpsDefinitions& ops_definitions =
+  const IRemoteFusedGraphOpsDefinitions& ops_definitions =
       HexagonOpsDefinitions::getInstance();
   const int total_ops_count = ops_definitions.GetTotalOpsCount();
   EXPECT_GT(total_ops_count, 0);
 }
 
 TEST(GraphTransferer, LoadGraphFromProtoFile) {
-  const IGraphTransferOpsDefinitions* ops_definitions =
+  const IRemoteFusedGraphOpsDefinitions* ops_definitions =
       &TEST_GRAPH_TRANSFER_OPS_DEFINITIONS;
   string filename =
       io::JoinPath(testing::TensorFlowSrcRoot(),
@@ -439,7 +437,7 @@ void CompareGraphTransferInfo(const GraphTransferInfo& a,
 }  // anonymous namespace
 
 TEST(GraphTransferer, LoadGraphFromProtoFileShapeInferenceSimple) {
-  const IGraphTransferOpsDefinitions* ops_definitions =
+  const IRemoteFusedGraphOpsDefinitions* ops_definitions =
       &TEST_GRAPH_TRANSFER_OPS_DEFINITIONS;
   string filename =
       io::JoinPath(testing::TensorFlowSrcRoot(),
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
index 518b399c37482dd7b5ad1ef333f86c6e97f75631..ab791d66f6cf5d4352dab6d88e65dab817d4e3ce 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
@@ -16,16 +16,19 @@ limitations under the License.
 #include "tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h"
 
 #include "tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h"
-
-#ifdef USE_HEXAGON_LIBS
-#include "tensorflow/core/platform/hexagon/soc_interface.h"
+#include "tensorflow/core/kernels/hexagon/soc_interface.h"
 #include "tensorflow/core/platform/profile_utils/cpu_utils.h"
-#endif
 
 namespace tensorflow {
 
-constexpr const char* const INPUT_OP_NAME = "INPUT";
 constexpr const char* const OUTPUT_OP_NAME = "OUTPUT";
+constexpr const char* const REMOTE_FUSED_GRAPH_NODE_NAME_PREFIX =
+    "hexagon_remote_fused_graph";
+/* static */ constexpr const char* const
+    HexagonControlWrapper::REMOTE_FUSED_GRAPH_EXECUTOR_NAME;
+
+constexpr int ALIGNMENT_BYTES = 16;
+constexpr int MAX_IN_OUT_COUNT = 128;
 
 const bool DBG_DUMP_VERIFICATION_STRING = false;
 const int DBG_LEVEL = 0;  // -2: verbose, -1: debug, 0: info
@@ -34,6 +37,22 @@ const bool DBG_USE_SAMPLE_INPUT = false;
 const int64 FLAG_ENABLE_PANDA_BINARY_INPUT = 0x01;
 const bool DBG_DUMP_INPUT_TENSOR_AS_FLOAT_DATA = false;
 
+static string AddPort(const string& node_name) {
+  if (node_name.find(':') != string::npos) {
+    return node_name;
+  } else {
+    return strings::StrCat(node_name, ":", 0);
+  }
+}
+
+static uint8* FindAlignedPointer(uint8* ptr) {
+  const uintptr_t data_ptr_int = reinterpret_cast<uintptr_t>(ptr);
+  const int shift_count =
+      (ALIGNMENT_BYTES - data_ptr_int % ALIGNMENT_BYTES) % ALIGNMENT_BYTES;
+  uint8* data_ptr = ptr + shift_count;
+  return data_ptr;
+}
+
 /* static */ GraphTransferInfo::NodeInfo* HexagonControlWrapper::FindNodeInfo(
     const string& name, GraphTransferInfo* graph_transfer_info) {
   for (GraphTransferInfo::NodeInfo& node_info :
@@ -45,7 +64,6 @@ const bool DBG_DUMP_INPUT_TENSOR_AS_FLOAT_DATA = false;
   return nullptr;
 }
 
-#ifdef USE_HEXAGON_LIBS
 int HexagonControlWrapper::GetVersion() {
   return soc_interface_GetSocControllerVersion();
 }
@@ -60,18 +78,56 @@ bool HexagonControlWrapper::Init(const RemoteFusedGraphExecuteInfo& info) {
     std::vector<string> outputs;
     RemoteFusedGraphExecuteUtils::BuildRemoteGraphInputsAndOutputsFromProto(
         info, &inputs, &outputs);
-    graph_transferer_.LoadGraphFromProto(
+    Status status = graph_transferer_.LoadGraphFromProto(
         HexagonOpsDefinitions::getInstance(), info.remote_graph(), inputs,
         outputs,
         false  // shape_inference_for_unknown_shape
-        );
+    );
+    TF_CHECK_OK(status) << status;
   } else {
     // If graph transfer info is attached, just import it.
     graph_transferer_.SetSerializedGraphTransferInfo(
         info.serialized_executor_parameters());
   }
   execute_info_ = &info;
-  return soc_interface_Init();
+  bool success = soc_interface_Init();
+  if (!success) {
+    LOG(ERROR) << "Hexagon initialization was failed.  See log output.";
+    return false;
+  }
+  std::vector<int> input_sizes;
+  std::vector<int> output_sizes;
+  CHECK_NOTNULL(execute_info_);
+  for (int i = 0; i < execute_info_->graph_input_node_name_size(); ++i) {
+    const string& input = execute_info_->graph_input_node_name(i);
+    LOG(INFO) << "Add input: " << input << ", " << i;
+    CHECK(input_port_map_.emplace(AddPort(input), i).second);
+    const RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& shape_type =
+        execute_info_->default_graph_input_tensor_shape(i);
+    int64 buf_size = DataTypeSize(shape_type.dtype());
+    for (const TensorShapeProto::Dim& dim : shape_type.shape().dim()) {
+      buf_size *= dim.size();
+    }
+    input_sizes.emplace_back(static_cast<int>(buf_size));
+  }
+  for (int i = 0; i < execute_info_->graph_output_node_name_size(); ++i) {
+    const string& output = execute_info_->graph_output_node_name(i);
+    CHECK(output_port_map_.emplace(AddPort(output), i).second);
+    const RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& shape_type =
+        execute_info_->default_graph_output_tensor_shape(i);
+
+    int64 buf_size = DataTypeSize(shape_type.dtype());
+    for (const TensorShapeProto::Dim& dim : shape_type.shape().dim()) {
+      buf_size *= dim.size();
+    }
+    output_sizes.emplace_back(static_cast<int>(buf_size));
+  }
+
+  LOG(INFO) << "Allocate inout buffer";
+  success &= soc_interface_AllocateInOutNodeBuffers(
+      input_sizes.size(), input_sizes.data(), output_sizes.size(),
+      output_sizes.data());
+  return success;
 }
 
 bool HexagonControlWrapper::Finalize() { return soc_interface_Finalize(); }
@@ -86,9 +142,6 @@ bool HexagonControlWrapper::SetupGraph() {
     GraphTransferInfo::NodeInfo* node_info =
         FindNodeInfo(graph_input.name(), &graph_transfer_info);
     CHECK_NE(node_info, nullptr);
-    node_info->set_type_name(INPUT_OP_NAME);
-    node_info->set_soc_op_id(
-        HexagonOpsDefinitions::getInstance().GetOpIdFor(INPUT_OP_NAME, {}));
   }
 
   // Generate a new output node which is connected to graph output node
@@ -153,8 +206,9 @@ bool HexagonControlWrapper::SetupGraph() {
   for (const GraphTransferInfo::NodeInputInfo& input_params :
        graph_transfer_info.node_input_info()) {
     const int count = input_params.node_input_size();
-    int node_ids[count];
-    int ports[count];
+    CHECK(count <= MAX_IN_OUT_COUNT);
+    int node_ids[MAX_IN_OUT_COUNT];
+    int ports[MAX_IN_OUT_COUNT];
     for (int i = 0; i < count; ++i) {
       const GraphTransferInfo::NodeInput& node_input =
           input_params.node_input(i);
@@ -172,7 +226,8 @@ bool HexagonControlWrapper::SetupGraph() {
   for (const GraphTransferInfo::NodeOutputInfo& output_params :
        graph_transfer_info.node_output_info()) {
     const int count = output_params.max_byte_size_size();
-    int sizes[count];
+    CHECK(count <= MAX_IN_OUT_COUNT);
+    int sizes[MAX_IN_OUT_COUNT];
     for (int i = 0; i < count; ++i) {
       const int size = output_params.max_byte_size(i);
       sizes[i] = size;
@@ -202,12 +257,8 @@ bool HexagonControlWrapper::SetupGraph() {
     auto data = dummy_const_data_.emplace(
         std::piecewise_construct, std::make_tuple(node_id), std::make_tuple());
     CHECK(data.second);
-    const int additional_bytes_for_alignment = 16;
-    data.first->second.resize(data_size + additional_bytes_for_alignment - 1);
-    const uintptr_t data_ptr_int =
-        reinterpret_cast<uintptr_t>(data.first->second.data());
-    const int shift_count = (16 - data_ptr_int % 16) % 16;
-    uint8* data_ptr = data.first->second.data() + shift_count;
+    data.first->second.resize(data_size + ALIGNMENT_BYTES - 1);
+    uint8* data_ptr = FindAlignedPointer(data.first->second.data());
     std::memcpy(data_ptr, params.data().data(), data_size);
     soc_interface_AppendConstNode(params.name().c_str(),
                                   node_id + NODE_ID_OFFSET, shape_0, shape_1,
@@ -267,27 +318,37 @@ bool HexagonControlWrapper::TeardownGraph() {
   return soc_interface_TeardownGraph();
 }
 
-bool HexagonControlWrapper::FillInputNode(const string& node_name,
-                                          const ConstByteArray bytes) {
-  uint64 byte_size;
-  const int x = 1;
-  const int y = 299;
-  const int z = 299;
-  const int d = 3;
+bool HexagonControlWrapper::FillInputNode(
+    const string& node_name,
+    const std::array<int64, GraphTransferer::SHAPE_ARRAY_SIZE>& shape,
+    const ConstByteArray bytes) {
+  const string tensor_name = AddPort(node_name);
+  CHECK(input_port_map_.count(tensor_name) > 0);
+  const int port = input_port_map_.at(tensor_name);
+  if (input_tensor_data_.count(port) <= 0) {
+    input_tensor_data_.emplace(port, std::vector<uint8>{});
+  }
+  std::vector<uint8>& input_tensor_data = input_tensor_data_.at(port);
+
+  // hexagon only supports 32bit dimension
+  const int x = static_cast<int>(shape[0]);
+  const int y = static_cast<int>(shape[1]);
+  const int z = static_cast<int>(shape[2]);
+  const int d = static_cast<int>(shape[3]);
+
+  const uint64 byte_size = x * y * z * d * DataTypeSize(std::get<2>(bytes));
+  CHECK_EQ(byte_size, std::get<1>(bytes));
+  input_tensor_data.resize(byte_size + ALIGNMENT_BYTES);
+  uint8* data_ptr = FindAlignedPointer(input_tensor_data.data());
+
   if (DBG_USE_DUMMY_INPUT) {
-    const int array_length = x * y * z * d;
-    byte_size = array_length * sizeof(float);
-    dummy_input_float_.resize(array_length);
-    std::memset(dummy_input_float_.data(), 0, byte_size);
+    std::memset(data_ptr, 0, byte_size);
   } else {
-    CHECK(std::get<2>(bytes) == DT_FLOAT);
-    byte_size = std::get<1>(bytes);
-    dummy_input_float_.resize(byte_size / sizeof(float));
-    std::memcpy(dummy_input_float_.data(), std::get<0>(bytes), byte_size);
+    std::memcpy(data_ptr, std::get<0>(bytes), byte_size);
   }
-  return soc_interface_FillInputNodeFloat(
-      x, y, z, d, reinterpret_cast<uint8*>(dummy_input_float_.data()),
-      byte_size);
+
+  return soc_interface_FillInputNodeWithPort(port, x, y, z, d, data_ptr,
+                                             byte_size);
 }
 
 bool HexagonControlWrapper::ReadOutputNode(
@@ -304,30 +365,49 @@ bool HexagonControlWrapper::ReadOutputNode(
       break;
     }
   }
-  std::vector<IRemoteFusedGraphExecutor::ByteArray> outputs;
+  std::vector<ByteArray> outputs;
   ReadOutputNode(node_name, &outputs);
   CHECK_EQ(1, outputs.size());
-  IRemoteFusedGraphExecutor::ByteArray& output = outputs[0];
+  ByteArray& output = outputs[0];
   Tensor* output_tensor = tensor_allocator(output_shape);
   CHECK(output_tensor->TotalBytes() >= std::get<1>(output))
       << output_tensor->TotalBytes() << ", " << std::get<1>(output);
-  // TODO(satok): Avoid specifying float
-  std::memcpy(output_tensor->flat<float>().data(), std::get<0>(output),
-              std::get<1>(output));
+  TF_CHECK_OK(RemoteFusedGraphExecuteUtils::CopyByteArrayToTensor(
+      std::get<0>(output), std::get<1>(output), output_tensor));
+  return true;
 }
 
 bool HexagonControlWrapper::ReadOutputNode(
     const string& node_name, std::vector<ByteArray>* const outputs) {
   CHECK(outputs != nullptr);
   ByteArray output;
-  soc_interface_ReadOutputNodeFloat(node_name.c_str(), &std::get<0>(output),
-                                    &std::get<1>(output));
+  const string tensor_name = AddPort(node_name);
+  CHECK(output_port_map_.count(tensor_name) > 0);
+  const int port = output_port_map_.at(tensor_name);
+  soc_interface_ReadOutputNodeWithPort(
+      port, &std::get<0>(output),
+      reinterpret_cast<uint64_t*>(&std::get<1>(output)));
   // TODO: Accept all results
-  std::get<2>(output) = DT_FLOAT;
+  // std::get<2>(output) = DT_FLOAT;
   outputs->emplace_back(output);
   return true;
 }
 
+Status HexagonControlWrapper::FuseRemoteGraph(
+    const GraphDef& original_graph_def, const std::vector<string>& inputs,
+    const std::vector<string>& outputs, GraphDef* fused_graph_def) {
+  const std::unordered_set<string> fused_node_names =
+      RemoteFusedGraphExecuteUtils::BuildNodeMapFromOpsDefinitions(
+          original_graph_def, HexagonOpsDefinitions::getInstance());
+  // TODO(satok): We may want to place shape and type inside this function
+  // if they are not placed in the given graph.
+  TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::FuseRemoteGraphByNodeNames(
+      original_graph_def, inputs, outputs, REMOTE_FUSED_GRAPH_NODE_NAME_PREFIX,
+      fused_node_names, REMOTE_FUSED_GRAPH_EXECUTOR_NAME,
+      /*require_shape_type=*/true, fused_graph_def));
+  return Status::OK();
+}
+
 bool HexagonControlWrapper::FillInputNode(const string& node_name,
                                           const Tensor& tensor) {
   StringPiece tensor_data = tensor.tensor_data();
@@ -347,33 +427,11 @@ bool HexagonControlWrapper::FillInputNode(const string& node_name,
       }
     }
   }
-  FillInputNode(node_name, ba);
+  const std::array<int64, GraphTransferer::SHAPE_ARRAY_SIZE> shape =
+      GraphTransferer::ToTensorShapeArray(tensor.shape());
+  FillInputNode(node_name, shape, ba);
   return true;
 }
 
-#else
-int HexagonControlWrapper::GetVersion() { return -1; }
-bool HexagonControlWrapper::Init(const RemoteFusedGraphExecuteInfo&) {
-  return false;
-}
-bool HexagonControlWrapper::Finalize() { return false; }
-bool HexagonControlWrapper::SetupGraph() { return false; }
-bool HexagonControlWrapper::ExecuteGraph() { return false; }
-bool HexagonControlWrapper::TeardownGraph() { return false; }
-bool HexagonControlWrapper::FillInputNode(const string&, const ConstByteArray) {
-  return false;
-}
-bool HexagonControlWrapper::FillInputNode(const string&, const Tensor&) {
-  return false;
-}
-bool HexagonControlWrapper::ReadOutputNode(
-    const string& node_name, TensorAllocatorFunc tensor_allocator) {
-  return false;
-}
-bool HexagonControlWrapper::ReadOutputNode(const string&,
-                                           std::vector<ByteArray>* const) {
-  return false;
-}
-#endif
-
+bool HexagonControlWrapper::IsEnabled() const { return true; };
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
index 97448884e1d03b19a10cd77553e26af940736639..8eb3995fc4f7974e382eb1370e05bec4a2f4a3f2 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
 #define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
 
+#include <unordered_map>
 #include <vector>
 
 #include "tensorflow/core/framework/types.h"
@@ -32,6 +33,11 @@ namespace tensorflow {
  */
 class HexagonControlWrapper final : public IRemoteFusedGraphExecutor {
  public:
+  using ByteArray =
+      std::tuple<uint8* /* data */, uint64 /* size */, DataType /* type */>;
+  static constexpr const char* const REMOTE_FUSED_GRAPH_EXECUTOR_NAME =
+      "build_hexagon_remote_fused_graph_executor";
+
   HexagonControlWrapper() = default;
   int GetVersion() final;
   bool Init(const RemoteFusedGraphExecuteInfo& info) final;
@@ -42,10 +48,21 @@ class HexagonControlWrapper final : public IRemoteFusedGraphExecutor {
   bool FillInputNode(const string& node_name, const Tensor& tensor) final;
   bool ReadOutputNode(const string& node_name,
                       TensorAllocatorFunc tensor_allocator) final;
+  Status FuseRemoteGraph(const GraphDef& original_graph_def,
+                         const std::vector<string>& inputs,
+                         const std::vector<string>& outputs,
+                         GraphDef* fused_graph_def) final;
+  bool IsEnabled() const final;
   bool ReadOutputNode(const string& node_name, std::vector<ByteArray>* outputs);
 
  private:
-  bool FillInputNode(const string& node_name, const ConstByteArray bytes);
+  using ConstByteArray = std::tuple<const uint8* /* data */, uint64 /* size */,
+                                    DataType /* type */>;
+
+  bool FillInputNode(
+      const string& node_name,
+      const std::array<int64, GraphTransferer::SHAPE_ARRAY_SIZE>& shape,
+      const ConstByteArray bytes);
 
   // CAVEAT: Need offset as HVX library reserves some ids
   static constexpr int NODE_ID_OFFSET = 0x10000;
@@ -57,11 +74,15 @@ class HexagonControlWrapper final : public IRemoteFusedGraphExecutor {
   GraphTransferer graph_transferer_{};
   // Dummy float array for input node.
   // TODO(satok): Use actual data passed by FillInputNode and remove
-  std::vector<float> dummy_input_float_{};
+  // std::vector<float> dummy_input_float_{};
+  std::unordered_map<int, std::vector<uint8>> input_tensor_data_{};
   // Dummy byte array for cosnt node.
   // TODO(satok): Remove
   std::unordered_map<int, std::vector<uint8>> dummy_const_data_{};
 
+  std::unordered_map<string, int> input_port_map_{};
+  std::unordered_map<string, int> output_port_map_{};
+
   TF_DISALLOW_COPY_AND_ASSIGN(HexagonControlWrapper);
 };
 
diff --git a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
index 54ba101501f4134a672e4cf2f87a5df558f82589..71bc4187b74cd6501d203aa3779c6d01e01f0d38 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
@@ -24,15 +24,20 @@ https://storage.googleapis.com/download.tensorflow.org/models/imagenet_comp_grap
 adb push /tmp/imagenet_comp_graph_label_strings.txt /data/local/tmp
 */
 
+// define EIGEN_USE_THREADS to include quantization_utils.h
+#define EIGEN_USE_THREADS
+
 #include <memory>
 
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/kernels/hexagon/graph_transfer_utils.h"
 #include "tensorflow/core/kernels/hexagon/graph_transferer.h"
 #include "tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h"
 #include "tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h"
-#include "tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h"
 #include "tensorflow/core/kernels/i_remote_fused_graph_executor.h"
+#include "tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h"
+#include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -46,12 +51,15 @@ adb push /tmp/imagenet_comp_graph_label_strings.txt /data/local/tmp
 
 namespace tensorflow {
 
-using ByteArray = IRemoteFusedGraphExecutor::ByteArray;
-using ConstByteArray = IRemoteFusedGraphExecutor::ConstByteArray;
+using ByteArray = HexagonControlWrapper::ByteArray;
 
 constexpr const char* const IMAGE_FILENAME = "/data/local/tmp/img_299x299.bmp";
 constexpr const char* const MODEL_FILENAME =
     "/data/local/tmp/tensorflow_inception_v3_stripped_optimized_quantized.pb";
+constexpr const char* const MODEL_WITH_QUANTIZED_INPUT_FILENAME =
+    "/data/local/tmp/"
+    "tensorflow_inception_v3_stripped_optimized_quantized_with_quantized_input."
+    "pb";
 constexpr const char* const FUSED_MODEL_FILENAME =
     "/data/local/tmp/"
     "tensorflow_inception_v3_stripped_optimized_quantized_fused_hexagon.pb";
@@ -64,7 +72,7 @@ const int WIDTH = 299;
 const int HEIGHT = 299;
 const int DEPTH = 3;
 const int EXPECTED_FIRST_RESULT_ID = 59;
-const int EXECUTION_REPEAT_COUNT = 3;
+const int EXECUTION_REPEAT_COUNT = 10;
 
 static void CheckHexagonControllerVersion() {
   HexagonControlWrapper hexagon_control_wrapper;
@@ -87,8 +95,7 @@ static void DumpTop10Results(const int byte_size,
       10 /* show top_n results */);
 }
 
-static void DumpTop10Results(
-    const std::vector<IRemoteFusedGraphExecutor::ByteArray>& outputs) {
+static void DumpTop10Results(const std::vector<ByteArray>& outputs) {
   CHECK(outputs.size() == 1);
   const int byte_size = std::get<1>(outputs.at(0));
   const float* float_array =
@@ -96,9 +103,8 @@ static void DumpTop10Results(
   DumpTop10Results(byte_size, float_array);
 }
 
-static void CheckFirstResult(
-    const std::vector<IRemoteFusedGraphExecutor::ByteArray>& outputs,
-    const int expected_first_id) {
+static void CheckFirstResult(const std::vector<ByteArray>& outputs,
+                             const int expected_first_id) {
   EXPECT_GE(outputs.size(), 1);
   const int byte_size = std::get<1>(outputs.at(0));
   const int element_count = byte_size / sizeof(float);
@@ -167,8 +173,16 @@ static void LoadImage(std::vector<float>* img_floats_ptr) {
   }
 }
 
+static void QuantizeImage(const std::vector<float>& float_vec,
+                          std::vector<quint8>* quint8_vec) {
+  quint8_vec->resize(float_vec.size());
+  for (int i = 0; i < float_vec.size(); ++i) {
+    quint8_vec->at(i) = FloatToQuantized<quint8>(float_vec[i], -1.0f, 1.0f);
+  }
+}
+
 static Tensor BuildImageTensor(const std::vector<float>& img_floats) {
-  LOG(INFO) << "Ioading image finished.";
+  LOG(INFO) << "Loading image finished.";
   Tensor img_tensor(DT_FLOAT, {1, WIDTH, HEIGHT, DEPTH});
   CHECK_EQ(WIDTH * HEIGHT * DEPTH, img_floats.size());
   CHECK_EQ(img_tensor.TotalBytes(), img_floats.size() * sizeof(float));
@@ -178,6 +192,18 @@ static Tensor BuildImageTensor(const std::vector<float>& img_floats) {
   return img_tensor;
 }
 
+static Tensor BuildQuantizedImageTensor(
+    const std::vector<quint8>& quantized_img) {
+  LOG(INFO) << "Loading image finished.";
+  Tensor img_tensor(DT_QUINT8, {1, WIDTH, HEIGHT, DEPTH});
+  CHECK_EQ(WIDTH * HEIGHT * DEPTH, quantized_img.size());
+  CHECK_EQ(img_tensor.TotalBytes(), quantized_img.size() * sizeof(quint8));
+  LOG(INFO) << "Copy data to tensor.";
+  std::memcpy(img_tensor.flat<quint8>().data(), quantized_img.data(),
+              img_tensor.TotalBytes());
+  return img_tensor;
+}
+
 /* static */ RemoteFusedGraphExecuteInfo
 BuildRemoteFusedGraphExecuteInfoWithGraphTransferInfo(
     const GraphTransferInfo& graph_transfer_info) {
@@ -212,10 +238,8 @@ BuildRemoteFusedGraphExecuteInfoWithGraphTransferInfo(
   return execute_info;
 }
 
-static void RunInferenceByHexagonControlWrapper(
-    const GraphTransferer& gt, const std::vector<float>& img_floats) {
-  const Tensor img_tensor = BuildImageTensor(img_floats);
-
+static void RunInferenceByHexagonControlWrapper(const GraphTransferer& gt,
+                                                const Tensor& img_tensor) {
   const RemoteFusedGraphExecuteInfo execute_info =
       BuildRemoteFusedGraphExecuteInfoWithGraphTransferInfo(
           gt.GetGraphTransferInfo());
@@ -231,22 +255,21 @@ static void RunInferenceByHexagonControlWrapper(
   hexagon_control_wrapper.FillInputNode("Mul", img_tensor);
 
   // 4. Execute graph
-  profile_utils::CpuUtils::EnableClockCycleProfiling(true);
-  ClockCycleProfiler prof;
+  const int64 start_time_us = Env::Default()->NowMicros();
   for (int i = 0; i < EXECUTION_REPEAT_COUNT; ++i) {
-    prof.Start();
     hexagon_control_wrapper.ExecuteGraph();
-    prof.Stop();
   }
+  const int64 end_time_us = Env::Default()->NowMicros();
 
   // 5-1. Read output node's outputs
-  std::vector<IRemoteFusedGraphExecutor::ByteArray> outputs;
+  std::vector<ByteArray> outputs;
   hexagon_control_wrapper.ReadOutputNode("softmax", &outputs);
 
   // 5-2. Dump results
   DumpTop10Results(outputs);
   CheckFirstResult(outputs, EXPECTED_FIRST_RESULT_ID);
-  prof.DumpStatistics("Graph Execution");
+  LOG(INFO) << "Average execution time = "
+            << (end_time_us - start_time_us) / EXECUTION_REPEAT_COUNT << "us";
 
   // 6. Teardown graph in hexagon
   hexagon_control_wrapper.TeardownGraph();
@@ -305,7 +328,7 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
     const GraphTransferInfo::NodeInfo& ni0 = gfi0.node_info(i);
     const GraphTransferInfo::NodeInfo& ni1 = gfi1.node_info(i);
     EXPECT_EQ(ni0.DebugString(), ni1.DebugString());
-    EXPECT_EQ(ni0.ByteSize(), ni1.ByteSize());
+    EXPECT_EQ(ni0.ByteSizeLong(), ni1.ByteSizeLong());
   }
 
   // 2. check const_node_info
@@ -317,7 +340,7 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
     for (int j = 0; j < cni0.shape_size(); ++j) {
       EXPECT_EQ(cni0.shape(j), cni1.shape(j));
     }
-    EXPECT_EQ(cni0.ByteSize(), cni1.ByteSize());
+    EXPECT_EQ(cni0.ByteSizeLong(), cni1.ByteSizeLong());
     EXPECT_EQ(cni0.DebugString(), cni1.DebugString());
   }
 
@@ -326,7 +349,7 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
   for (int i = 0; i < gfi0.node_input_info_size(); ++i) {
     const GraphTransferInfo::NodeInputInfo& nii0 = gfi0.node_input_info(i);
     const GraphTransferInfo::NodeInputInfo& nii1 = gfi1.node_input_info(i);
-    EXPECT_EQ(nii0.ByteSize(), nii1.ByteSize());
+    EXPECT_EQ(nii0.ByteSizeLong(), nii1.ByteSizeLong());
     EXPECT_EQ(nii0.DebugString(), nii1.DebugString());
   }
 
@@ -339,7 +362,7 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
     for (int j = 0; j < noi0.max_byte_size_size(); ++j) {
       EXPECT_EQ(noi0.max_byte_size(j), noi1.max_byte_size(j));
     }
-    EXPECT_EQ(noi0.ByteSize(), noi1.ByteSize());
+    EXPECT_EQ(noi0.ByteSizeLong(), noi1.ByteSizeLong());
     EXPECT_EQ(noi0.DebugString(), noi1.DebugString());
   }
 
@@ -351,7 +374,7 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
         gfi0.graph_input_node_info(i);
     const GraphTransferInfo::GraphInputNodeInfo& gini1 =
         gfi0.graph_input_node_info(i);
-    EXPECT_EQ(gini0.ByteSize(), gini1.ByteSize());
+    EXPECT_EQ(gini0.ByteSizeLong(), gini1.ByteSizeLong());
     EXPECT_EQ(gini0.DebugString(), gini1.DebugString());
   }
 
@@ -363,12 +386,9 @@ static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
         gfi0.graph_output_node_info(i);
     const GraphTransferInfo::GraphOutputNodeInfo& goni1 =
         gfi0.graph_output_node_info(i);
-    EXPECT_EQ(goni0.ByteSize(), goni1.ByteSize());
+    EXPECT_EQ(goni0.ByteSizeLong(), goni1.ByteSizeLong());
     EXPECT_EQ(goni0.DebugString(), goni1.DebugString());
   }
-
-  // 7. check destination
-  EXPECT_EQ(gfi0.destination(), gfi1.destination());
 }
 
 // CAVEAT: This test only runs when you specify hexagon library using
@@ -384,7 +404,7 @@ TEST(GraphTransferer,
   LOG(INFO) << "Run inception v3 on hexagon with hexagon controller";
   CheckHexagonControllerVersion();
 
-  const IGraphTransferOpsDefinitions* ops_definitions =
+  const IRemoteFusedGraphOpsDefinitions* ops_definitions =
       &HexagonOpsDefinitions::getInstance();
   std::vector<std::pair<string, Tensor>> inputs;
   inputs.emplace_back("Mul", Tensor(DT_FLOAT, {1, WIDTH, HEIGHT, DEPTH}));
@@ -407,7 +427,43 @@ TEST(GraphTransferer,
 
   std::vector<float> img_floats;
   LoadImage(&img_floats);
-  RunInferenceByHexagonControlWrapper(gt, img_floats);
+  const Tensor img_tensor = BuildImageTensor(img_floats);
+  RunInferenceByHexagonControlWrapper(gt, img_tensor);
+}
+
+TEST(GraphTransferer,
+     DISABLED_RunInceptionV3OnHexagonExampleWithHexagonWrapperQuantizedInput) {
+  LOG(INFO) << "Run inception v3 on hexagon with hexagon controller "
+            << "with quantized input";
+  CheckHexagonControllerVersion();
+
+  const IRemoteFusedGraphOpsDefinitions* ops_definitions =
+      &HexagonOpsDefinitions::getInstance();
+  std::vector<std::pair<string, Tensor>> inputs;
+  inputs.emplace_back("Mul", Tensor(DT_QUINT8, {1, WIDTH, HEIGHT, DEPTH}));
+  std::vector<string> output_node_names = {"softmax"};
+
+  GraphTransferer gt;
+  gt.EnableStrictCheckMode(false);
+  profile_utils::CpuUtils::EnableClockCycleProfiling(true);
+  ClockCycleProfiler prof;
+  prof.Start();
+  Status status = gt.LoadGraphFromProtoFile(
+      *ops_definitions, MODEL_WITH_QUANTIZED_INPUT_FILENAME, inputs,
+      output_node_names,
+      /*is_text_proto=*/false,
+      /*shape_inference_for_unknown_shape=*/false,
+      /*dry_run_for_unknown_shape=*/true);
+  ASSERT_TRUE(status.ok()) << status;
+  prof.Stop();
+  prof.DumpStatistics("LoadGraphFromProtoFile");
+
+  std::vector<float> img_floats;
+  LoadImage(&img_floats);
+  std::vector<quint8> quantized_img;
+  QuantizeImage(img_floats, &quantized_img);
+  const Tensor img_tensor = BuildQuantizedImageTensor(quantized_img);
+  RunInferenceByHexagonControlWrapper(gt, img_tensor);
 }
 
 TEST(GraphTransferer,
@@ -415,7 +471,7 @@ TEST(GraphTransferer,
   LOG(INFO) << "Run inception v3 on hexagon with hexagon controller";
   CheckHexagonControllerVersion();
 
-  const IGraphTransferOpsDefinitions* ops_definitions =
+  const IRemoteFusedGraphOpsDefinitions* ops_definitions =
       &HexagonOpsDefinitions::getInstance();
   std::vector<std::pair<string, Tensor>> inputs;
   inputs.emplace_back("Mul", Tensor(DT_FLOAT, {1, WIDTH, HEIGHT, DEPTH}));
@@ -438,14 +494,15 @@ TEST(GraphTransferer,
 
   std::vector<float> img_floats;
   LoadImage(&img_floats);
-  RunInferenceByHexagonControlWrapper(gt, img_floats);
+  const Tensor img_tensor = BuildImageTensor(img_floats);
+  RunInferenceByHexagonControlWrapper(gt, img_tensor);
 }
 
 TEST(GraphTransferer, RunInceptionV3OnHexagonExampleWithTfRuntime) {
   LOG(INFO) << "Fuse and run inception v3 on hexagon with tf runtime";
   CheckHexagonControllerVersion();
 
-  const IGraphTransferOpsDefinitions* ops_definitions =
+  const IRemoteFusedGraphOpsDefinitions* ops_definitions =
       &HexagonOpsDefinitions::getInstance();
   std::vector<std::pair<string, Tensor>> inputs;
   inputs.emplace_back("Mul", Tensor(DT_FLOAT, {1, WIDTH, HEIGHT, DEPTH}));
@@ -483,7 +540,7 @@ TEST(GraphTransferer, DISABLED_CheckShapeInferencePerformance) {
   CheckHexagonControllerVersion();
   profile_utils::CpuUtils::EnableClockCycleProfiling(true);
 
-  const IGraphTransferOpsDefinitions* ops_definitions =
+  const IRemoteFusedGraphOpsDefinitions* ops_definitions =
       &HexagonOpsDefinitions::getInstance();
   std::vector<std::pair<string, Tensor>> inputs;
   inputs.emplace_back("Mul", Tensor(DT_FLOAT, {1, WIDTH, HEIGHT, DEPTH}));
diff --git a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc
index a4b79e6ec4f1053f307c56f7b079f76559144998..8e517dc93c2a2f3ddae04835bbc30de3abe0f307 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h"
 
-#include "tensorflow/core/framework/graph_transfer_info.pb.h"
 #include "tensorflow/core/framework/types.h"
 
 // CAVEAT: Comment-out the following macro if you want to use experimental
@@ -304,8 +303,8 @@ HexagonOpsDefinitions::BuildOpNameToSocOpTypeMap() {
   EmplaceOpType("INPUT", {}, SupportedOpType::INPUT, &op_map);
   EmplaceOpType("OUTPUT", {}, SupportedOpType::OUTPUT, &op_map);
   EmplaceOpType("NoOp", {}, SupportedOpType::NOP, &op_map);
-  EmplaceOpType(IGraphTransferOpsDefinitions::FLATTEN_OP_NAME, {},
-                SupportedOpType::FLATTEN, &op_map);
+  // Special op type for hexagon
+  EmplaceOpType("FLATTEN", {}, SupportedOpType::FLATTEN, &op_map);
   // Tensorflow op name
   // CAVEAT: Keep order of SupportedOpType
   EmplaceOpType("Identity", {}, SupportedOpType::NOP, &op_map);
@@ -350,6 +349,8 @@ HexagonOpsDefinitions::BuildOpNameToSocOpTypeMap() {
 #ifdef ENABLE_EXPERIMENTAL_HEXNN_OPS
   EmplaceOpType("QuantizedMul", {}, SupportedOpType::QUANTIZED_MUL_8x8to32,
                 &op_map);
+  EmplaceOpType("QuantizedAdd", {}, SupportedOpType::QUANTIZED_ADD_8p8to32,
+                &op_map);
   EmplaceOpType("Pad", {}, SupportedOpType::PAD_F, &op_map);
   EmplaceOpType("SpaceToBatchND", {}, SupportedOpType::SPACE_TO_BATCH_ND_F,
                 &op_map),
@@ -359,6 +360,11 @@ HexagonOpsDefinitions::BuildOpNameToSocOpTypeMap() {
                 &op_map);
   EmplaceOpType("ConcatV2", {}, SupportedOpType::CONCAT_V2_F, &op_map);
   EmplaceOpType("Conv2DBackpropInput", {}, SupportedOpType::DECONV_F, &op_map);
+
+  EmplaceOpType("Tanh", {}, SupportedOpType::TANH_F, &op_map);
+  EmplaceOpType("Split", {}, SupportedOpType::SPLIT_F, &op_map);
+  EmplaceOpType("Transpose", {}, SupportedOpType::TRANSPOSE_F, &op_map);
+  EmplaceOpType("Concat", {}, SupportedOpType::CONCAT_F, &op_map);
 #endif
   return op_map;
 };
@@ -366,7 +372,7 @@ HexagonOpsDefinitions::BuildOpNameToSocOpTypeMap() {
 HexagonOpsDefinitions::HexagonOpsDefinitions()
     : op_name_to_soc_op_type_map_(BuildOpNameToSocOpTypeMap()) {}
 
-/* static */ const IGraphTransferOpsDefinitions&
+/* static */ const IRemoteFusedGraphOpsDefinitions&
 HexagonOpsDefinitions::getInstance() {
   const static HexagonOpsDefinitions instance{};
   return instance;
@@ -386,17 +392,17 @@ int HexagonOpsDefinitions::GetOpIdFor(const string& op_type,
     if (dt_vec.empty()) {
       return static_cast<int>(std::get<1>(dt_to_op_vec.front()));
     }
+    // If there is only one op_id registered for empty op_vec, we assume
+    // that the op supports any data types.
+    if (dt_to_op_vec.size() == 1 && std::get<0>(dt_to_op_vec.front()).empty()) {
+      return static_cast<int>(std::get<1>(dt_to_op_vec.front()));
+    }
     for (const DataTypeToOp& data_type_to_op : dt_to_op_vec) {
       if (std::get<0>(data_type_to_op) == dt_vec) {
         return static_cast<int>(std::get<1>(data_type_to_op));
       }
     }
   }
-  return IGraphTransferOpsDefinitions::INVALID_OP_ID;
-}
-
-GraphTransferInfo::Destination HexagonOpsDefinitions::GetTransferDestination()
-    const {
-  return GraphTransferInfo::HEXAGON;
+  return IRemoteFusedGraphOpsDefinitions::INVALID_OP_ID;
 }
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
index bd1120e1df64ca72b2a3a95d7af91fabf693af98..993a5f9a3a81d1bfc00b59ec1364209d11ceeaa7 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
+++ b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
@@ -18,21 +18,20 @@ limitations under the License.
 
 #include <unordered_map>
 
-#include "i_graph_transfer_ops_definitions.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
 
-// HexagonOpsDefinitions provides ops definitons supported in hexagon library
+// HexagonOpsDefinitions provides ops definitions supported in hexagon library
 // TODO(satok): add a functionality to call functions in hexagon library
-class HexagonOpsDefinitions final : public IGraphTransferOpsDefinitions {
+class HexagonOpsDefinitions final : public IRemoteFusedGraphOpsDefinitions {
  public:
-  static const IGraphTransferOpsDefinitions& getInstance();
+  static const IRemoteFusedGraphOpsDefinitions& getInstance();
 
   int GetTotalOpsCount() const final;
   int GetOpIdFor(const string& op_type, const DataTypeVector& dt) const final;
-  GraphTransferInfo::Destination GetTransferDestination() const final;
 
  private:
   enum class SupportedOpType;
diff --git a/tensorflow/core/kernels/hexagon/hexagon_remote_fused_graph_executor_build.cc b/tensorflow/core/kernels/hexagon/hexagon_remote_fused_graph_executor_build.cc
index d22bfa1be2c2c76101f265c332bc1ad0c87d8e2c..595baaa5d0666a8d2c0d6f190fce8171ebb08b0a 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_remote_fused_graph_executor_build.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_remote_fused_graph_executor_build.cc
@@ -27,7 +27,7 @@ Status BuildRemoteFusedGraphExecutor(
 
 static RemoteFusedGraphExecuteUtils::ExecutorBuildRegistrar
     k_hexagon_remote_fused_graph_executor_build(
-        "build_hexagon_remote_fused_graph_executor",
+        HexagonControlWrapper::REMOTE_FUSED_GRAPH_EXECUTOR_NAME,
         BuildRemoteFusedGraphExecutor);
 
 }  // namespace hexagon_remote_fused_graph_executor_build
diff --git a/tensorflow/core/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc b/tensorflow/core/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc
deleted file mode 100644
index 285993aaaa7db6c056c712dd016e2753032f8b47..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Tests in this file are designed to evaluate hexagon DSP operations.
-
-#define EIGEN_USE_THREADS
-
-#include "tensorflow/core/framework/allocator.h"
-#include "tensorflow/core/framework/fake_input.h"
-#include "tensorflow/core/framework/node_def_builder.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_testutil.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/kernels/ops_testutil.h"
-#include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/kernels/quantization_utils.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/platform/test.h"
-
-#ifdef USE_HEXAGON_LIBS
-#include "tensorflow/core/platform/hexagon/soc_interface.h"
-#include "tensorflow/core/platform/profile_utils/cpu_utils.h"
-#endif
-
-namespace tensorflow {
-
-class QuantizedMatMulOpForHexagonTest : public OpsTestBase {
- protected:
-  void SetUp() final {
-#ifdef USE_HEXAGON_LIBS
-    profile_utils::CpuUtils::EnableClockCycleProfiling(true);
-    LOG(INFO) << "Hexagon libs are linked (wrapper version = "
-              << soc_interface_GetWrapperVersion()
-              << ", hexagon binary version = "
-              << soc_interface_GetSocControllerVersion() << ")";
-    LOG(INFO) << "Cpu frequency = "
-              << profile_utils::CpuUtils::GetCycleCounterFrequency();
-#else
-    LOG(WARNING) << "Hexagon libs are not linked.";
-#endif
-  }
-};
-
-// Shows some statistics of hexagon dsp using hexagon specific APIs
-#ifdef USE_HEXAGON_LIBS
-TEST_F(QuantizedMatMulOpForHexagonTest, EvaluateSharedLibOverhead) {
-  const uint64 overhead_shared_lib_start =
-      profile_utils::CpuUtils::GetCurrentClockCycle();
-  const int wrapper_version = soc_interface_GetWrapperVersion();
-  const uint64 overhead_shared_lib_end =
-      profile_utils::CpuUtils::GetCurrentClockCycle();
-  const uint64 overhead_shared_lib_diff =
-      (overhead_shared_lib_end - overhead_shared_lib_start);
-  const uint64 overhead_hexagon_rpc_start =
-      profile_utils::CpuUtils::GetCurrentClockCycle();
-  const int hexagon_binary_version = soc_interface_GetSocControllerVersion();
-  const uint64 overhead_hexagon_rpc_end =
-      profile_utils::CpuUtils::GetCurrentClockCycle();
-  const uint64 overhead_hexagon_rpc_diff =
-      (overhead_hexagon_rpc_end - overhead_hexagon_rpc_start);
-  LOG(INFO) << "Shared lib (ver = " << wrapper_version << ") overhead is "
-            << overhead_shared_lib_diff << " cycles, time = "
-            << std::chrono::duration_cast<std::chrono::microseconds>(
-                   profile_utils::CpuUtils::ConvertClockCycleToTime(
-                       overhead_shared_lib_diff))
-                   .count()
-            << " usec";
-  LOG(INFO) << "hexagon rpc (ver = " << hexagon_binary_version
-            << ") overhead is " << overhead_hexagon_rpc_diff
-            << " cycles, time = "
-            << std::chrono::duration_cast<std::chrono::microseconds>(
-                   profile_utils::CpuUtils::ConvertClockCycleToTime(
-                       overhead_hexagon_rpc_diff))
-                   .count()
-            << " usec";
-}
-#endif
-
-// Runs two small matrices through the operator, and leaves all the parameters
-// at their default values.
-// This test is a sample to execute matmul on hexagon.
-TEST_F(QuantizedMatMulOpForHexagonTest, Small_NoParams) {
-  TF_ASSERT_OK(NodeDefBuilder("quantized_mat_mul_op", "QuantizedMatMul")
-                   .Input(FakeInput(DT_QUINT8))
-                   .Input(FakeInput(DT_QUINT8))
-                   .Input(FakeInput(DT_FLOAT))
-                   .Input(FakeInput(DT_FLOAT))
-                   .Input(FakeInput(DT_FLOAT))
-                   .Input(FakeInput(DT_FLOAT))
-                   .Attr("Toutput", DataTypeToEnum<qint32>::v())
-                   .Finalize(node_def()));
-  TF_ASSERT_OK(InitOp());
-  // A matrix is:
-  // |  1 |  2 |  3 |
-  // |  4 |  5 |  6 |
-  AddInputFromArray<quint8>(TensorShape({2, 3}), {1, 2, 3, 4, 5, 6});
-  // B matrix is:
-  // |  7 |  8 |  9 | 10 |
-  // | 11 | 12 | 13 | 14 |
-  // | 15 | 16 | 17 | 18 |
-  AddInputFromArray<quint8>(TensorShape({3, 4}),
-                            {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18});
-  AddInputFromArray<float>(TensorShape({1}), {0});
-  AddInputFromArray<float>(TensorShape({1}), {255.0f});
-  AddInputFromArray<float>(TensorShape({1}), {0});
-  AddInputFromArray<float>(TensorShape({1}), {255.0f});
-
-  TF_ASSERT_OK(RunOpKernel());
-  // Here are the results we expect, from hand calculations:
-  // (1 * 7) + (2 * 11) + (3 * 15) = 74
-  // (1 * 8) + (2 * 12) + (3 * 16) = 80
-  // (1 * 9) + (2 * 13) + (3 * 17) = 86
-  // (1 * 10) + (2 * 14) + (3 * 18) = 92
-  // (4 * 7) + (5 * 11) + (6 * 15) = 173
-  // (4 * 8) + (5 * 12) + (6 * 16) = 188
-  // (4 * 9) + (5 * 13) + (6 * 17) = 203
-  // (4 * 10) + (5 * 14) + (6 * 18) = 218
-  Tensor expected(allocator(), DT_QINT32, TensorShape({2, 4}));
-  test::FillValues<qint32>(&expected, {74, 80, 86, 92, 173, 188, 203, 218});
-  test::ExpectTensorEqual<qint32>(expected, *GetOutput(0));
-}
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/hexagon/soc_interface.cc b/tensorflow/core/kernels/hexagon/soc_interface.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a37571a36e48e715eabba797b2bd338ed7ea55f8
--- /dev/null
+++ b/tensorflow/core/kernels/hexagon/soc_interface.cc
@@ -0,0 +1,83 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+vcyou may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/hexagon/soc_interface.h"
+
+// Dummy implementation of soc_interface.
+
+int soc_interface_GetWrapperVersion() { return -1; }
+int soc_interface_GetSocControllerVersion() { return -1; }
+bool soc_interface_Init() { return false; }
+bool soc_interface_Finalize() { return false; }
+bool soc_interface_ExecuteGraph() { return false; }
+bool soc_interface_TeardownGraph() { return false; }
+bool soc_interface_AllocateInOutNodeBuffers(int /*input_count*/,
+                                            int* /*input_sizes*/,
+                                            int /*output_count*/,
+                                            int* /*output_sizes*/) {
+  return false;
+}
+bool soc_interface_FillInputNodeWithPort(int /*port*/, int /*x*/, int /*y*/,
+                                         int /*z*/, int /*d*/,
+                                         const uint8_t* const /*buf*/,
+                                         uint64_t /*buf_byte_size*/) {
+  return false;
+}
+bool soc_interface_FillInputNodeFloat(int /*x*/, int /*y*/, int /*z*/,
+                                      int /*d*/, const uint8_t* const /*buf*/,
+                                      uint64_t /*buf_byte_size*/) {
+  return false;
+}
+bool soc_interface_ReadOutputNodeWithPort(int /*port*/, uint8_t** /*buf*/,
+                                          uint64_t* /*buf_byte_size*/) {
+  return false;
+}
+bool soc_interface_ReadOutputNodeFloat(const char* const /*node_name*/,
+                                       uint8_t** /*buf*/,
+                                       uint64_t* /*buf_byte_size*/) {
+  return false;
+}
+bool soc_interface_setupDummyGraph(int /*version*/) { return false; }
+bool soc_interface_AllocateNodeInputAndNodeOutputArray(
+    int /*total_input_count*/, int /*total_output_count*/) {
+  return false;
+}
+bool soc_interface_ReleaseNodeInputAndNodeOutputArray() { return false; }
+void* soc_interface_SetOneNodeInputs(int /*input_count*/,
+                                     const int* const /*node_id*/,
+                                     const int* const /*port*/) {
+  return nullptr;
+}
+void* soc_interface_SetOneNodeOutputs(int /*output_count*/, int* /*max_size*/) {
+  return nullptr;
+}
+bool soc_interface_AppendConstNode(const char* const /*name*/, int /*node_id*/,
+                                   int /*batch*/, int /*height*/, int /*width*/,
+                                   int /*depth*/, const uint8_t* const /*data*/,
+                                   int /*data_length*/) {
+  return false;
+}
+bool soc_interface_AppendNode(const char* const /*name*/, int /*node_id*/,
+                              int /*op_id*/, int /*padding_id*/,
+                              const void* const /*inputs*/,
+                              int /*inputs_count*/,
+                              const void* const /*outputs*/,
+                              int /*outputs_count*/) {
+  return false;
+}
+bool soc_interface_InstantiateGraph() { return false; }
+bool soc_interface_ConstructGraph() { return false; }
+void soc_interface_SetLogLevel(int /*log_level*/) {}
+void soc_interface_SetDebugFlag(uint64_t /*flag*/) {}
diff --git a/tensorflow/core/platform/hexagon/soc_interface.h b/tensorflow/core/kernels/hexagon/soc_interface.h
similarity index 82%
rename from tensorflow/core/platform/hexagon/soc_interface.h
rename to tensorflow/core/kernels/hexagon/soc_interface.h
index f4a3cdf4bdab4d98996783a704d2158e58a06afc..062103ed988c704253a63d851b3410d99fcfc736 100644
--- a/tensorflow/core/platform/hexagon/soc_interface.h
+++ b/tensorflow/core/kernels/hexagon/soc_interface.h
@@ -21,7 +21,10 @@ limitations under the License.
 // All functions defined here must have prefix "soc_interface" to avoid
 // naming conflicts.
 #ifdef __cplusplus
+#include <cstdint>
 extern "C" {
+#else
+#include <stdbool.h>
 #endif  // __cplusplus
 // Returns the version of loaded hexagon wrapper shared library.
 // You should assert that the version matches the expected version before
@@ -39,13 +42,30 @@ bool soc_interface_Finalize();
 bool soc_interface_ExecuteGraph();
 // Teardown graph setup
 bool soc_interface_TeardownGraph();
+
+// Allocate buffers for input node and output node
+bool soc_interface_AllocateInOutNodeBuffers(int input_count, int* input_sizes,
+                                            int output_count,
+                                            int* output_sizes);
+
+// Send input data to SOC with port
+bool soc_interface_FillInputNodeWithPort(int port, int x, int y, int z, int d,
+                                         const uint8_t* const buf,
+                                         uint64_t buf_byte_size);
+
 // Send input data to SOC
 bool soc_interface_FillInputNodeFloat(int x, int y, int z, int d,
                                       const uint8_t* const buf,
-                                      uint64_t buf_size);
+                                      uint64_t buf_byte_size);
+
+// Load output data from SOC with port
+bool soc_interface_ReadOutputNodeWithPort(int port, uint8_t** buf,
+                                          uint64_t* buf_byte_size);
+
 // Load output data from SOC
 bool soc_interface_ReadOutputNodeFloat(const char* const node_name,
-                                       uint8_t** buf, uint64_t* buf_size);
+                                       uint8_t** buf, uint64_t* buf_byte_size);
+
 // Setup graph
 // TODO(satok): Remove and use runtime version
 bool soc_interface_setupDummyGraph(int version);
diff --git a/tensorflow/core/kernels/i_remote_fused_graph_executor.h b/tensorflow/core/kernels/i_remote_fused_graph_executor.h
index fe62a259de8d432dbad5b47aa88e6224b028a0e8..05b76172b203673917f65f048f8132c2fb0de173 100644
--- a/tensorflow/core/kernels/i_remote_fused_graph_executor.h
+++ b/tensorflow/core/kernels/i_remote_fused_graph_executor.h
@@ -25,10 +25,6 @@ namespace tensorflow {
 
 class IRemoteFusedGraphExecutor {
  public:
-  using ByteArray =
-      std::tuple<uint8* /* data */, uint64 /* size */, DataType /* type */>;
-  using ConstByteArray = std::tuple<const uint8* /* data */, uint64 /* size */,
-                                    DataType /* type */>;
   using TensorAllocatorFunc = std::function<Tensor*(const TensorShape& shape)>;
 
   IRemoteFusedGraphExecutor() = default;
@@ -63,6 +59,13 @@ class IRemoteFusedGraphExecutor {
   virtual bool ReadOutputNode(const string& node_name,
                               TensorAllocatorFunc tensor_allocator) = 0;
 
+  virtual Status FuseRemoteGraph(const GraphDef& original_graph_def,
+                                 const std::vector<string>& inputs,
+                                 const std::vector<string>& outputs,
+                                 GraphDef* fused_graph_def) = 0;
+
+  virtual bool IsEnabled() const = 0;
+
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(IRemoteFusedGraphExecutor);
 };
diff --git a/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.cc b/tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.cc
similarity index 74%
rename from tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.cc
rename to tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.cc
index 36d48a46cc12c7860e837f2a1555f08d09e1aa59..335b912d1f4ae3e89a93cbb57d498a317bf1cc2d 100644
--- a/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.cc
+++ b/tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.cc
@@ -13,11 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "i_graph_transfer_ops_definitions.h"
+#include "tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h"
 
 namespace tensorflow {
-/* static */ constexpr int IGraphTransferOpsDefinitions::INVALID_OP_ID;
-// TODO(satok): Remove
-/* static */ constexpr const char* const
-    IGraphTransferOpsDefinitions::FLATTEN_OP_NAME;
+/* static */ constexpr int IRemoteFusedGraphOpsDefinitions::INVALID_OP_ID;
 }
diff --git a/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h b/tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h
similarity index 56%
rename from tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h
rename to tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h
index d5b4cf745106849006b2924316a1eb6ea718b0ae..7d3329f490713c243cb23d2e3232d6e343c55187 100644
--- a/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h
+++ b/tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h
@@ -13,39 +13,34 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_I_GRAPH_TRANSFER_OPS_DEFINITIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_I_GRAPH_TRANSFER_OPS_DEFINITIONS_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_OPS_DEFINITIONS_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_OPS_DEFINITIONS_H_
 
-#include "tensorflow/core/framework/graph_transfer_info.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
 
-// IGraphTransferOpsDefinitions is an interface class which provides interfaces
-// about ops supported by SOC.
+// IRemoteFusedGraphOpsDefinitions is an interface class which provides
+// APIs to provide information about op types supported by SOC.
 // TODO(satok): Provide ways to transfer graph definitions into SOC
-class IGraphTransferOpsDefinitions {
+class IRemoteFusedGraphOpsDefinitions {
  public:
   // op id which is not supported by SOC
   static constexpr int INVALID_OP_ID = -1;
-  // Custom op name for flatten node
-  static constexpr const char* const FLATTEN_OP_NAME = "FLATTEN";
 
-  IGraphTransferOpsDefinitions() = default;
-  virtual ~IGraphTransferOpsDefinitions() = default;
+  IRemoteFusedGraphOpsDefinitions() = default;
+  virtual ~IRemoteFusedGraphOpsDefinitions() = default;
   // Return total ops count supported by SOC
   virtual int GetTotalOpsCount() const = 0;
   // Return op id for given string op name
-  virtual int GetOpIdFor(const string& op_name,
+  virtual int GetOpIdFor(const string& op_type,
                          const DataTypeVector& dt) const = 0;
-  // Return destination of transfer
-  virtual GraphTransferInfo::Destination GetTransferDestination() const = 0;
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(IGraphTransferOpsDefinitions);
+  TF_DISALLOW_COPY_AND_ASSIGN(IRemoteFusedGraphOpsDefinitions);
 };
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_I_GRAPH_TRANSFER_OPS_DEFINITIONS_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_OPS_DEFINITIONS_H_
diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc
index ead2db453811b916ac8bd7cdeb5cf1a92afc5cca..0a445b7ea2c5a31fbe20af34cb42734111646390 100644
--- a/tensorflow/core/kernels/identity_op.cc
+++ b/tensorflow/core/kernels/identity_op.cc
@@ -38,6 +38,11 @@ REGISTER_KERNEL_BUILDER(Name("PlaceholderWithDefault").Device(DEVICE_CPU),
 
 REGISTER_KERNEL_BUILDER(Name("RefIdentity").Device(DEVICE_CPU), IdentityOp);
 
+// Identity op for gradients debugging in TensorFlow Debugger (hidden op in
+// Python).
+REGISTER_KERNEL_BUILDER(Name("DebugGradientIdentity").Device(DEVICE_CPU),
+                        IdentityOp);
+
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(type)                                           \
   REGISTER_KERNEL_BUILDER(                                                   \
@@ -90,7 +95,11 @@ REGISTER_SYCL_HOST_KERNEL(bool);
       IdentityOp);                                                          \
   REGISTER_KERNEL_BUILDER(                                                  \
       Name("StopGradient").Device(DEVICE_GPU).TypeConstraint<type>("T"),    \
-      IdentityOp)
+      IdentityOp);                                                          \
+  REGISTER_KERNEL_BUILDER(Name("DebugGradientIdentity")                     \
+                              .Device(DEVICE_GPU)                           \
+                              .TypeConstraint<type>("T"),                   \
+                          IdentityOp)
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
 REGISTER_GPU_KERNEL(bfloat16);
diff --git a/tensorflow/core/kernels/identity_reader_op.cc b/tensorflow/core/kernels/identity_reader_op.cc
index ddd012b910810a6906395eae1644931d16078c76..6e5714b313887e18df1a9c8fa22a36ebe36da753 100644
--- a/tensorflow/core/kernels/identity_reader_op.cc
+++ b/tensorflow/core/kernels/identity_reader_op.cc
@@ -17,8 +17,10 @@ limitations under the License.
 
 #include <memory>
 #include "tensorflow/core/framework/reader_base.h"
+#include "tensorflow/core/framework/reader_base.pb.h"
 #include "tensorflow/core/framework/reader_op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/protobuf.h"
 
diff --git a/tensorflow/core/kernels/ignore_errors_dataset_op.cc b/tensorflow/core/kernels/ignore_errors_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..83d5bb605c496cc13cbb0f840f339455419e9229
--- /dev/null
+++ b/tensorflow/core/kernels/ignore_errors_dataset_op.cc
@@ -0,0 +1,90 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/random/random.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit IgnoreErrorsDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    *output = new Dataset(input);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    explicit Dataset(const DatasetBase* input) : input_(input) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() override { return "IgnoreErrorsDatasetOp::Dataset"; }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset),
+            input_impl_(dataset->input_->MakeIterator()) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        Status s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+        while (!s.ok()) {
+          out_tensors->clear();
+          s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+        }
+        return Status::OK();
+      }
+
+     private:
+      const std::unique_ptr<IteratorBase> input_impl_;
+    };
+
+    const DatasetBase* const input_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("IgnoreErrorsDataset").Device(DEVICE_CPU),
+                        IgnoreErrorsDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/in_topk_op.cc b/tensorflow/core/kernels/in_topk_op.cc
index 13890e5b7ffeffab5bf371333c7f5f6a686eddb6..e2861ae090ccd48c0408b83a7bc7c0230bf2c1a5 100644
--- a/tensorflow/core/kernels/in_topk_op.cc
+++ b/tensorflow/core/kernels/in_topk_op.cc
@@ -17,11 +17,11 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
@@ -29,12 +29,29 @@ template <typename T, typename TARGET_T>
 class InTopK : public OpKernel {
  public:
   explicit InTopK(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("k", &k_));
+    if (context->num_inputs() == 2) {
+      OP_REQUIRES_OK(context, context->GetAttr("k", &k_));
+    }
   }
 
   void Compute(OpKernelContext* context) override {
     const auto& predictions_in = context->input(0);
     const auto& targets_in = context->input(1);
+    int64 k_val = k_;
+    if (context->num_inputs() == 3) {
+      const auto& k_in = context->input(2);
+
+      OP_REQUIRES(context, TensorShapeUtils::IsScalar(k_in.shape()),
+                  errors::InvalidArgument("k must be 0-D, got shape ",
+                                          k_in.shape().DebugString()));
+
+      if (k_in.dtype() == DT_INT32) {
+        k_val = k_in.scalar<int32>()();
+      } else {
+        k_val = k_in.scalar<int64>()();
+      }
+    }
+
     OP_REQUIRES(context, predictions_in.dims() == 2,
                 errors::InvalidArgument("predictions must be 2-dimensional"));
     OP_REQUIRES(context, targets_in.dims() == 1,
@@ -73,7 +90,7 @@ class InTopK : public OpKernel {
           }
         }
       }
-      out(b) = cannot_say ? false : (more_probable_classes < k_);
+      out(b) = cannot_say ? false : (more_probable_classes < k_val);
     }
   }
 
@@ -82,10 +99,35 @@ class InTopK : public OpKernel {
 };
 
 REGISTER_KERNEL_BUILDER(
-    Name("InTopK").Device(DEVICE_CPU).TypeConstraint<int32>("T"),
+    Name("InTopK").Device(DEVICE_CPU)
+    .HostMemory("predictions")
+    .HostMemory("targets")
+    .HostMemory("precision")
+    .TypeConstraint<int32>("T"),
+    InTopK<float, int32>);
+REGISTER_KERNEL_BUILDER(
+    Name("InTopK").Device(DEVICE_CPU)
+    .HostMemory("predictions")
+    .HostMemory("targets")
+    .HostMemory("precision")
+    .TypeConstraint<int64>("T"),
+    InTopK<float, int64>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("InTopKV2").Device(DEVICE_CPU)
+    .HostMemory("predictions")
+    .HostMemory("targets")
+    .HostMemory("k")
+    .HostMemory("precision")
+    .TypeConstraint<int32>("T"),
     InTopK<float, int32>);
 REGISTER_KERNEL_BUILDER(
-    Name("InTopK").Device(DEVICE_CPU).TypeConstraint<int64>("T"),
+    Name("InTopKV2").Device(DEVICE_CPU)
+    .HostMemory("predictions")
+    .HostMemory("targets")
+    .HostMemory("k")
+    .HostMemory("precision")
+    .TypeConstraint<int64>("T"),
     InTopK<float, int64>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/interleave_dataset_op.cc b/tensorflow/core/kernels/interleave_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bfbb5340d2e92ac7f4c054917f3fe51178e81940
--- /dev/null
+++ b/tensorflow/core/kernels/interleave_dataset_op.cc
@@ -0,0 +1,270 @@
+
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/random/random.h"
+
+#include "tensorflow/core/kernels/captured_function.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class InterleaveDatasetOp : public OpKernel {
+ public:
+  explicit InterleaveDatasetOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    DatasetBase* input;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &input));
+    core::ScopedUnref unref_input(input);
+
+    OpInputList inputs;
+    OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs));
+    std::vector<Tensor> other_arguments;
+    other_arguments.reserve(inputs.size());
+    for (const Tensor& t : inputs) {
+      other_arguments.push_back(t);
+    }
+
+    const Tensor* cycle_length_t;
+    OP_REQUIRES_OK(ctx, ctx->input("cycle_length", &cycle_length_t));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(cycle_length_t->shape()),
+                errors::InvalidArgument("cycle_length must be a scalar."));
+    const int64 cycle_length = cycle_length_t->flat<int64>()(0);
+    OP_REQUIRES(
+        ctx, cycle_length > 0,
+        errors::InvalidArgument("cycle_length must be greater than zero."));
+
+    const Tensor* block_length_t;
+    OP_REQUIRES_OK(ctx, ctx->input("block_length", &block_length_t));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(block_length_t->shape()),
+                errors::InvalidArgument("block_length must be a scalar."));
+    const int64 block_length = block_length_t->flat<int64>()(0);
+    OP_REQUIRES(
+        ctx, block_length > 0,
+        errors::InvalidArgument("block_length must be greater than zero."));
+
+    std::unique_ptr<CapturedFunction> captured_func;
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, func_, graph_def_version_,
+                                                 std::move(other_arguments),
+                                                 &captured_func));
+
+    DatasetBase* dataset =
+        new Dataset(input, std::move(captured_func), cycle_length, block_length,
+                    output_types_, output_shapes_);
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->flat<ResourceHandle>()(0) = handle;
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(const DatasetBase* input,
+            std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
+            int64 block_length, const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : input_(input),
+          captured_func_(std::move(captured_func)),
+          cycle_length_(cycle_length),
+          block_length_(block_length),
+          output_types_(output_types),
+          output_shapes_(output_shapes) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override { return "InterleaveDatasetOp::Dataset"; }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset),
+            input_impl_(dataset->input_->MakeIterator()),
+            current_elements_(dataset->cycle_length_) {}
+
+      void AdvanceToNextInCycle() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        block_index_ = 0;
+        cycle_index_ = (cycle_index_ + 1) % dataset()->cycle_length_;
+      }
+
+      void AdvancePosition() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        ++block_index_;
+        if (block_index_ == dataset()->block_length_) {
+          AdvanceToNextInCycle();
+        }
+      }
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        while (!end_of_input_ || num_open_ > 0) {
+          if (current_elements_[cycle_index_]) {
+            // We are currently processing a mapped element, so try to get the
+            // next subelement.
+            bool end_of_element;
+            TF_RETURN_IF_ERROR(current_elements_[cycle_index_]->GetNext(
+                ctx, out_tensors, &end_of_element));
+            if (!end_of_element) {
+              // Produce the subelement as output.
+              AdvancePosition();
+              *end_of_sequence = false;
+              return Status::OK();
+            }
+            // We have reached the end of the current element, so move
+            // on to the next element in the cycle.
+            current_elements_[cycle_index_].reset();
+            --num_open_;
+            AdvanceToNextInCycle();
+          } else if (!end_of_input_) {
+            // Get the next element from the input dataset, and create
+            // an iterator from it.
+            std::vector<Tensor> args;
+            TF_RETURN_IF_ERROR(
+                input_impl_->GetNext(ctx, &args, &end_of_input_));
+            if (!end_of_input_) {
+              TF_RETURN_IF_ERROR(MakeIteratorFromInputElement(
+                  ctx, args, &current_elements_[cycle_index_]));
+              ++num_open_;
+            }
+          } else {
+            AdvanceToNextInCycle();
+          }
+        }
+
+        *end_of_sequence = true;
+        return Status::OK();
+      }
+
+     private:
+      Status MakeIteratorFromInputElement(
+          IteratorContext* ctx, const std::vector<Tensor>& input_element,
+          std::unique_ptr<IteratorBase>* out_iterator) {
+        FunctionLibraryRuntime::Options opts;
+        opts.runner = ctx->runner();
+        // Choose a step ID that is guaranteed not to clash with any
+        // Session-generated step ID. DirectSession only generates
+        // non-negative step IDs (contiguous, starting from 0), and
+        // MasterSession generates 56-bit random step IDs whose MSB
+        // is always 0, so a negative random step ID should suffice.
+        opts.step_id = -std::abs(static_cast<int64>(random::New64()));
+        ScopedStepContainer step_container(
+            opts.step_id, [this, ctx](const string& name) {
+              dataset()
+                  ->captured_func_->resource_manager()
+                  ->Cleanup(name)
+                  .IgnoreError();
+            });
+        opts.step_container = &step_container;
+        std::vector<Tensor> return_values;
+        TF_RETURN_IF_ERROR(dataset()->captured_func_->Run(opts, input_element,
+                                                          &return_values));
+
+        if (!(return_values.size() == 1 &&
+              return_values[0].dtype() == DT_RESOURCE &&
+              TensorShapeUtils::IsScalar(return_values[0].shape()))) {
+          return errors::InvalidArgument(
+              "`f` must return a single scalar of dtype DT_RESOURCE.");
+        }
+
+        // Retrieve the dataset that was created in `f`.
+        DatasetBase* returned_dataset;
+        const ResourceHandle& dataset_resource =
+            return_values[0].scalar<ResourceHandle>()();
+
+        // NOTE(mrry): We cannot use the core `LookupResource()` or
+        // `DeleteResource()` functions, because we have an
+        // `IteratorContext*` and not an `OpKernelContext*`, so we
+        // replicate the necessary functionality here.
+        auto type_index = MakeTypeIndex<DatasetBase>();
+        if (type_index.hash_code() != dataset_resource.hash_code()) {
+          return errors::InvalidArgument("`f` must return a Dataset resource.");
+        }
+        TF_RETURN_IF_ERROR(
+            dataset()->captured_func_->resource_manager()->Lookup(
+                dataset_resource.container(), dataset_resource.name(),
+                &returned_dataset));
+        core::ScopedUnref unref_dataset(returned_dataset);
+
+        // Create an iterator for the dataset that was returned by
+        // `f`. This transfers ownership of the dataset to the
+        // iterator, so we can delete it from the resource manager.
+        *out_iterator = returned_dataset->MakeIterator();
+        TF_RETURN_IF_ERROR(
+            dataset()->captured_func_->resource_manager()->Delete<DatasetBase>(
+                dataset_resource.container(), dataset_resource.name()));
+        return Status::OK();
+      }
+
+      mutex mu_;
+      const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::vector<std::unique_ptr<IteratorBase>> current_elements_
+          GUARDED_BY(mu_);
+      size_t cycle_index_ GUARDED_BY(mu_) = 0;
+      int64 block_index_ GUARDED_BY(mu_) = 0;
+      bool end_of_input_ GUARDED_BY(mu_) = false;
+      size_t num_open_ GUARDED_BY(mu_) = 0;
+    };
+
+    const DatasetBase* const input_;
+    const std::unique_ptr<CapturedFunction> captured_func_;
+    const int64 cycle_length_;
+    const int64 block_length_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+  };
+
+  const int graph_def_version_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  const NameAttrList* func_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("InterleaveDataset").Device(DEVICE_CPU),
+                        InterleaveDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/iterator_ops.cc b/tensorflow/core/kernels/iterator_ops.cc
index ed350d98331f1a825cc743603b6935209fffcc1e..a340aacf14c5a0b0fcb8836921b8ed0e9b779859 100644
--- a/tensorflow/core/kernels/iterator_ops.cc
+++ b/tensorflow/core/kernels/iterator_ops.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
@@ -160,16 +161,25 @@ class MakeIteratorOp : public OpKernel {
   }
 };
 
-class OneShotIteratorOp : public OpKernel {
+class OneShotIteratorOp : public AsyncOpKernel {
  public:
-  explicit OneShotIteratorOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+  explicit OneShotIteratorOp(OpKernelConstruction* ctx)
+      : AsyncOpKernel(ctx),
+        thread_pool_(new thread::ThreadPool(
+            ctx->env(), ThreadOptions(),
+            strings::StrCat("one_shot_iterator_initialization_thread_",
+                            SanitizeThreadSuffix(name())),
+            1 /* num_threads */, false /* low_latency_hint */))
+
+  {
     string shared_name;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &shared_name));
     OP_REQUIRES(ctx, shared_name.empty(),
                 errors::InvalidArgument("OneShotIteratorOp does not currently "
                                         "support the 'shared_name' attr."));
-    OP_REQUIRES_OK(ctx,
-                   ctx->GetAttr("dataset_factory", &dataset_factory_func_));
+    const NameAttrList* dataset_factory_func;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dataset_factory", &dataset_factory_func));
+    dataset_factory_func_ = *dataset_factory_func;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
   }
@@ -187,102 +197,159 @@ class OneShotIteratorOp : public OpKernel {
 
   // NOTE(mrry): This is based on `ResourceOpKernel<T>::Compute()`,
   // but due to the fact that `ResourceOpKernel<T>::CreateResource()`
-  // does not provide access to the `OpKernelContext*` and we need this
-  // to invoke the factory function, it's not possible to implement
-  // this kernel by implementing `CreateResource()`.
-  void Compute(OpKernelContext* ctx) override {
-    mutex_lock l(mu_);
-    if (iterator_resource_ == nullptr) {
-      ResourceMgr* mgr = ctx->resource_manager();
-      OP_REQUIRES_OK(ctx, cinfo_.Init(mgr, def()));
-
-      // Create an IteratorResource that will hold the iterator for this op.
-      IteratorResource* resource;
-      OP_REQUIRES_OK(
-          ctx,
-          mgr->LookupOrCreate<IteratorResource>(
-              cinfo_.container(), cinfo_.name(), &resource,
-              [this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                *ret = new IteratorResource(output_dtypes_, output_shapes_);
-                return Status::OK();
-              }));
-      Status s = VerifyTypesMatch(output_dtypes_, resource->output_dtypes());
-      s.Update(
-          VerifyShapesCompatible(output_shapes_, resource->output_shapes()));
-      if (TF_PREDICT_FALSE(!s.ok())) {
-        resource->Unref();
-        ctx->SetStatus(s);
+  // does not provide access to the `OpKernelContext*` and we need
+  // this to invoke the factory function, it's not possible to
+  // implement this kernel by implementing `CreateResource()`.
+  // Furthermore, due to the fact that this kernel might block when
+  // running the initialization function, we must implement this
+  // kernel as an async kernel.
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    {
+      mutex_lock l(mu_);
+      if (iterator_resource_ == nullptr && initialization_status_.ok()) {
+        // The initialization thread will call `done`.
+        if (!initialization_started_) {
+          // TODO(mrry): Convert the initialization code to use
+          // callbacks instead of wasting a thread.
+          thread_pool_->Schedule([this, ctx, done]() { Init(ctx, done); });
+          initialization_started_ = true;
+        } else {
+          done_callbacks_.emplace_back(ctx, std::move(done));
+        }
         return;
       }
-      iterator_resource_ = resource;
-
-      // Call the dataset_factory_func_ to create a new dataset,
-      // over which this op will iterate.
-      FunctionLibraryRuntime::Handle f_handle;
-      OP_REQUIRES_OK(ctx,
-                     ctx->function_library()->Instantiate(
-                         dataset_factory_func_->name(),
-                         AttrSlice(&dataset_factory_func_->attr()), &f_handle));
-      FunctionLibraryRuntime::Options opts;
-      opts.cancellation_manager = ctx->cancellation_manager();
-      // Choose a step ID that is guaranteed not to clash with any
-      // Session-generated step ID. DirectSession only generates
-      // non-negative step IDs (contiguous, starting from 0), and
-      // MasterSession generates 56-bit random step IDs whose MSB is
-      // always 0, so a negative random step ID should suffice.
-      opts.step_id = -std::abs(static_cast<int64>(random::New64()));
-      ScopedStepContainer step_container(
-          opts.step_id, [ctx](const string& name) {
-            ctx->resource_manager()->Cleanup(name).IgnoreError();
-          });
-      opts.step_container = &step_container;
-      opts.runner = ctx->runner();
-      Notification n;
-      Status factory_status;
-      std::vector<Tensor> return_values;
-      ctx->function_library()->Run(opts, f_handle, {}, &return_values,
-                                   [&n, &factory_status](Status s) {
-                                     factory_status.Update(s);
-                                     n.Notify();
-                                   });
-      n.WaitForNotification();
-      OP_REQUIRES_OK(ctx, factory_status);
-      OP_REQUIRES(
-          ctx,
-          return_values.size() == 1 &&
-              return_values[0].dtype() == DT_RESOURCE &&
-              TensorShapeUtils::IsScalar(return_values[0].shape()),
-          errors::InvalidArgument("The `dataset_factory` function must return "
-                                  "a single scalar of dtype DT_RESOURCE."));
-
-      // Retrieve the dataset that was created in the factory function.
-      DatasetBase* dataset;
-      const ResourceHandle& dataset_resource =
-          return_values[0].flat<ResourceHandle>()(0);
-      OP_REQUIRES_OK(ctx, LookupResource(ctx, dataset_resource, &dataset));
-      core::ScopedUnref unref_dataset(dataset);
-
-      // Create an iterator for the dataset that was created in the
-      // factory function. This transfers ownership of the dataset to
-      // the iterator, so we can delete it from the resource manager.
-      OP_REQUIRES_OK(ctx,
-                     iterator_resource_->set_iterator(dataset->MakeIterator()));
-      OP_REQUIRES_OK(ctx, DeleteResource<DatasetBase>(ctx, dataset_resource));
     }
-    Tensor* handle;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
-    handle->scalar<ResourceHandle>()() = MakeResourceHandle<IteratorResource>(
-        ctx, cinfo_.container(), cinfo_.name());
+    ProduceOutput(ctx, std::move(done));
   }
 
  private:
-  const NameAttrList* dataset_factory_func_;
+  void Init(OpKernelContext* ctx, DoneCallback done) {
+    IteratorResource* iterator = nullptr;
+    ContainerInfo cinfo;
+    Status s = TryInit(ctx, &iterator, &cinfo);
+
+    std::vector<std::pair<OpKernelContext*, DoneCallback>> callbacks_to_run;
+    {
+      mutex_lock l(mu_);
+      if (s.ok()) {
+        iterator_resource_ = iterator;
+        cinfo_ = cinfo;
+      }
+      initialization_status_ = s;
+      std::swap(done_callbacks_, callbacks_to_run);
+    }
+
+    for (auto&& ctx_done : callbacks_to_run) {
+      ProduceOutput(ctx_done.first, std::move(ctx_done.second));
+    }
+    ProduceOutput(ctx, std::move(done));
+  }
+
+  Status TryInit(OpKernelContext* ctx, IteratorResource** iterator,
+                 ContainerInfo* cinfo) {
+    TF_RETURN_IF_ERROR(cinfo->Init(ctx->resource_manager(), def()));
+
+    // Create an IteratorResource that will hold the iterator for this op.
+    TF_RETURN_IF_ERROR(
+        ctx->resource_manager()->LookupOrCreate<IteratorResource>(
+            cinfo->container(), cinfo->name(), iterator,
+            [this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+              *ret = new IteratorResource(output_dtypes_, output_shapes_);
+              return Status::OK();
+            }));
+
+    core::ScopedUnref unref_iterator(*iterator);
+
+    TF_RETURN_IF_ERROR(
+        VerifyTypesMatch(output_dtypes_, (*iterator)->output_dtypes()));
+    TF_RETURN_IF_ERROR(
+        VerifyShapesCompatible(output_shapes_, (*iterator)->output_shapes()));
+
+    // Call the dataset_factory_func_ to create a new dataset,
+    // over which this op will iterate.
+    FunctionLibraryRuntime::Handle f_handle;
+    TF_RETURN_IF_ERROR(ctx->function_library()->Instantiate(
+        dataset_factory_func_.name(), AttrSlice(&dataset_factory_func_.attr()),
+        &f_handle));
+    FunctionLibraryRuntime::Options opts;
+    opts.cancellation_manager = ctx->cancellation_manager();
+    // Choose a step ID that is guaranteed not to clash with any
+    // Session-generated step ID. DirectSession only generates
+    // non-negative step IDs (contiguous, starting from 0), and
+    // MasterSession generates 56-bit random step IDs whose MSB is
+    // always 0, so a negative random step ID should suffice.
+    opts.step_id = -std::abs(static_cast<int64>(random::New64()));
+    ScopedStepContainer step_container(opts.step_id, [ctx](const string& name) {
+      ctx->resource_manager()->Cleanup(name).IgnoreError();
+    });
+    opts.step_container = &step_container;
+    opts.runner = ctx->runner();
+    Notification n;
+    Status factory_status;
+    std::vector<Tensor> return_values;
+    ctx->function_library()->Run(opts, f_handle, {}, &return_values,
+                                 [&n, &factory_status](Status s) {
+                                   factory_status.Update(s);
+                                   n.Notify();
+                                 });
+    n.WaitForNotification();
+    TF_RETURN_IF_ERROR(factory_status);
+    if (return_values.size() != 1 || return_values[0].dtype() != DT_RESOURCE ||
+        !TensorShapeUtils::IsScalar(return_values[0].shape())) {
+      return errors::InvalidArgument(
+          "The `dataset_factory` function must return "
+          "a single scalar of dtype DT_RESOURCE.");
+    }
+
+    // Retrieve the dataset that was created in the factory function.
+    DatasetBase* dataset;
+    const ResourceHandle& dataset_resource =
+        return_values[0].flat<ResourceHandle>()(0);
+    TF_RETURN_IF_ERROR(LookupResource(ctx, dataset_resource, &dataset));
+    core::ScopedUnref unref_dataset(dataset);
+
+    // Create an iterator for the dataset that was created in the
+    // factory function. This transfers ownership of the dataset to
+    // the iterator, so we can delete it from the resource manager.
+    TF_RETURN_IF_ERROR((*iterator)->set_iterator(dataset->MakeIterator()));
+    TF_RETURN_IF_ERROR(DeleteResource<DatasetBase>(ctx, dataset_resource));
+
+    (*iterator)->Ref();
+    return Status::OK();
+  }
+
+  void ProduceOutput(OpKernelContext* ctx, DoneCallback done) {
+    Tensor* handle;
+    OP_REQUIRES_OK_ASYNC(ctx, ctx->allocate_output(0, TensorShape({}), &handle),
+                         done);
+    Status s;
+    {
+      mutex_lock l(mu_);
+      s = initialization_status_;
+      if (s.ok()) {
+        handle->scalar<ResourceHandle>()() =
+            MakeResourceHandle<IteratorResource>(ctx, cinfo_.container(),
+                                                 cinfo_.name());
+      }
+    }
+    OP_REQUIRES_OK_ASYNC(ctx, s, done);
+    done();
+  }
+
+  NameAttrList dataset_factory_func_;
   DataTypeVector output_dtypes_;
   std::vector<PartialTensorShape> output_shapes_;
 
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
+
   mutex mu_;
   ContainerInfo cinfo_ GUARDED_BY(mu_);
-  IteratorResource* iterator_resource_ = nullptr;
+  IteratorResource* iterator_resource_ GUARDED_BY(mu_) = nullptr;
+
+  bool initialization_started_ GUARDED_BY(mu_) = false;
+  Status initialization_status_ GUARDED_BY(mu_);
+  std::vector<std::pair<OpKernelContext*, DoneCallback>> done_callbacks_
+      GUARDED_BY(mu_);
 };
 
 class IteratorGetNextOp : public AsyncOpKernel {
@@ -292,7 +359,7 @@ class IteratorGetNextOp : public AsyncOpKernel {
         thread_pool_(new thread::ThreadPool(
             ctx->env(), ThreadOptions(),
             strings::StrCat("iterator_get_next_thread_",
-                            SanitizeThreadSuffix(def().name())),
+                            SanitizeThreadSuffix(name())),
             1 /* num_threads */, false /* low_latency_hint */)) {}
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
@@ -348,6 +415,91 @@ class IteratorDisposeOp : public OpKernel {
   }
 };
 
+class IteratorToStringHandleOp : public OpKernel {
+ public:
+  explicit IteratorToStringHandleOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& resource_handle_t = ctx->input(0);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(resource_handle_t.shape()),
+                errors::InvalidArgument("resource_handle must be a scalar"));
+
+    // Validate that the handle corresponds to a real resource, and
+    // that it is an IteratorResource.
+    IteratorResource* iterator_resource;
+    OP_REQUIRES_OK(
+        ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator_resource));
+    iterator_resource->Unref();
+
+    Tensor* string_handle_t;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_output(0, TensorShape({}), &string_handle_t));
+    string_handle_t->scalar<string>()() =
+        resource_handle_t.scalar<ResourceHandle>()().SerializeAsString();
+  }
+};
+
+class IteratorFromStringHandleOp : public OpKernel {
+ public:
+  explicit IteratorFromStringHandleOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES(
+        ctx,
+        output_dtypes_.empty() || output_shapes_.empty() ||
+            output_dtypes_.size() == output_shapes_.size(),
+        errors::InvalidArgument("If both 'output_types' and 'output_shapes' "
+                                "are set, they must have the same length."));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& string_handle_t = ctx->input(0);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(string_handle_t.shape()),
+                errors::InvalidArgument("string_handle must be a scalar"));
+
+    ResourceHandle resource_handle;
+    OP_REQUIRES(
+        ctx,
+        resource_handle.ParseFromString(string_handle_t.scalar<string>()()),
+        errors::InvalidArgument(
+            "Could not parse string_handle as a valid ResourceHandle"));
+
+    OP_REQUIRES(
+        ctx, resource_handle.device() == ctx->device()->attributes().name(),
+        errors::InvalidArgument("Attempted create an iterator on device \"",
+                                ctx->device()->attributes().name(),
+                                "\" from handle defined on device \"",
+                                resource_handle.device(), "\""));
+
+    // Validate that the handle corresponds to a real resource, and
+    // that it is an IteratorResource.
+    IteratorResource* iterator_resource;
+    OP_REQUIRES_OK(ctx,
+                   LookupResource(ctx, resource_handle, &iterator_resource));
+    core::ScopedUnref unref_iterator(iterator_resource);
+    if (!output_dtypes_.empty()) {
+      OP_REQUIRES_OK(ctx, VerifyTypesMatch(output_dtypes_,
+                                           iterator_resource->output_dtypes()));
+    }
+    if (!output_shapes_.empty()) {
+      OP_REQUIRES_OK(
+          ctx, VerifyShapesCompatible(output_shapes_,
+                                      iterator_resource->output_shapes()));
+    }
+
+    Tensor* resource_handle_t;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output(0, TensorShape({}), &resource_handle_t));
+    resource_handle_t->scalar<ResourceHandle>()() = resource_handle;
+  }
+
+ private:
+  DataTypeVector output_dtypes_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
 REGISTER_KERNEL_BUILDER(Name("Iterator").Device(DEVICE_CPU), IteratorHandleOp);
 REGISTER_KERNEL_BUILDER(Name("MakeIterator").Device(DEVICE_CPU),
                         MakeIteratorOp);
@@ -357,6 +509,10 @@ REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE_CPU),
                         IteratorGetNextOp);
 REGISTER_KERNEL_BUILDER(Name("IteratorDispose").Device(DEVICE_CPU),
                         IteratorDisposeOp);
+REGISTER_KERNEL_BUILDER(Name("IteratorToStringHandle").Device(DEVICE_CPU),
+                        IteratorToStringHandleOp);
+REGISTER_KERNEL_BUILDER(Name("IteratorFromStringHandle").Device(DEVICE_CPU),
+                        IteratorFromStringHandleOp);
 
 }  // namespace
 
diff --git a/tensorflow/core/kernels/lmdb_reader_op.cc b/tensorflow/core/kernels/lmdb_reader_op.cc
index 23cabe7b547ce723002a341e4304bc8b04997138..3bb07301b5adc8a2a30990fbf2dff24c70705d63 100755
--- a/tensorflow/core/kernels/lmdb_reader_op.cc
+++ b/tensorflow/core/kernels/lmdb_reader_op.cc
@@ -13,18 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "lmdb.h"
 #include "tensorflow/core/framework/reader_op_kernel.h"
 #include "tensorflow/core/framework/reader_base.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 #include <sys/stat.h>
+#include "lmdb.h"
 
 namespace tensorflow {
 
-inline void MDB_CHECK(int mdb_status) {
-  CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status);
-}
+#define MDB_CHECK(val) CHECK_EQ(val, MDB_SUCCESS) << mdb_strerror(val)
 
 class LMDBReader : public ReaderBase {
  public:
@@ -131,4 +129,4 @@ class LMDBReaderOp : public ReaderOpKernel {
 REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU),
                         LMDBReaderOp);
 
-}
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
index 11ce2a71dcb5f60f2c5274120cacb186f2076424..d721b3d542761539e2dfccfdeda7facbcdb2f3c6 100644
--- a/tensorflow/core/kernels/lookup_table_op.cc
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -124,6 +124,20 @@ class MutableHashTableOfScalars final : public LookupInterface {
 
   TensorShape value_shape() const override { return TensorShape(); }
 
+  int64 MemoryUsed() const override {
+    int64 ret = 0;
+    mutex_lock l(mu_);
+    for (unsigned i = 0; i < table_.bucket_count(); ++i) {
+      size_t bucket_size = table_.bucket_size(i);
+      if (bucket_size == 0) {
+        ret++;
+      } else {
+        ret += bucket_size;
+      }
+    }
+    return sizeof(MutableHashTableOfScalars) + ret;
+  }
+
  private:
   // TODO(andreasst): consider using a read/write lock or a concurrent map
   mutable mutex mu_;
@@ -239,6 +253,20 @@ class MutableHashTableOfTensors final : public LookupInterface {
 
   TensorShape value_shape() const override { return value_shape_; }
 
+  int64 MemoryUsed() const override {
+    int64 ret = 0;
+    mutex_lock l(mu_);
+    for (unsigned i = 0; i < table_.bucket_count(); ++i) {
+      size_t bucket_size = table_.bucket_size(i);
+      if (bucket_size == 0) {
+        ret++;
+      } else {
+        ret += bucket_size;
+      }
+    }
+    return sizeof(MutableHashTableOfTensors) + ret;
+  }
+
  private:
   TensorShape value_shape_;
   // TODO(andreasst): consider using a read/write lock or a concurrent map
@@ -467,6 +495,12 @@ class MutableDenseHashTable final : public LookupInterface {
 
   TensorShape value_shape() const override { return value_shape_; }
 
+  int64 MemoryUsed() const override {
+    mutex_lock l(mu_);
+    return sizeof(MutableDenseHashTable) + key_buckets_.AllocatedBytes() +
+           value_buckets_.AllocatedBytes() + empty_key_.AllocatedBytes();
+  }
+
  private:
   Status DoInsert(OpKernelContext* ctx, const Tensor& key, const Tensor& value,
                   bool ignore_empty_key) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
@@ -790,6 +824,7 @@ REGISTER_KERNEL(string, int32);
 REGISTER_KERNEL(string, int64);
 REGISTER_KERNEL(int64, string);
 REGISTER_KERNEL(int64, int64);
+REGISTER_KERNEL(int64, float);
 REGISTER_KERNEL(string, string);
 REGISTER_KERNEL(string, bool);
 
diff --git a/tensorflow/core/kernels/map_stage_op.cc b/tensorflow/core/kernels/map_stage_op.cc
index 46eaf3d9e7aef43b681d94d1af3ec85d7933b1f5..0168b57d3552a9fb46e354e7197b5af700134c01 100644
--- a/tensorflow/core/kernels/map_stage_op.cc
+++ b/tensorflow/core/kernels/map_stage_op.cc
@@ -550,12 +550,17 @@ REGISTER_KERNEL_BUILDER(Name("OrderedMapStage")
 #endif // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("MapStage").HostMemory("key").Device(DEVICE_SYCL),
+REGISTER_KERNEL_BUILDER(Name("MapStage")
+                            .HostMemory("key")
+                            .HostMemory("indices")
+                            .Device(DEVICE_SYCL),
                         MapStageOp<false>);
-REGISTER_KERNEL_BUILDER(
-    Name("OrderedMapStage").HostMemory("key").Device(DEVICE_SYCL),
-    MapStageOp<true>);
-#endif // TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("OrderedMapStage")
+                            .HostMemory("key")
+                            .HostMemory("indices")
+                            .Device(DEVICE_SYCL),
+                        MapStageOp<true>);
+#endif  // TENSORFLOW_USE_SYCL
 
 template <bool Ordered>
 class MapUnstageOp : public OpKernel {
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index 8003f7ff67fd71c06ed137799eadf2de62f4a923..62c5ecfe81133101f46eb3a860b8116e7803d23a 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -23,27 +23,15 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/fill_functor.h"
-
+#include "tensorflow/core/util/matmul_autotune.h"
 #if GOOGLE_CUDA
 #include "cuda/include/cuda.h"
+#include "tensorflow/core/kernels/gpu_utils.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
 
-#if GOOGLE_CUDA
-
-namespace {
-template <typename T>
-perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
-  return typed;
-}
-}  // namespace
-
-#endif  // GOOGLE_CUDA
-
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
@@ -123,10 +111,16 @@ bool ExplicitVectorMatrixOptimization<Eigen::half>(
 
 template <typename Device, typename T>
 struct LaunchMatMulBase {
+#if GOOGLE_CUDA
+  typedef perftools::gputools::blas::AlgorithmType AlgorithmType;
+#else
+  typedef int64 AlgorithmType;
+#endif  // GOOGLE_CUDA
+
   static void launch(
-      OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
+      OpKernelContext* ctx, const Tensor& a, const Tensor& b,
       const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
-      Tensor* out) {
+      std::vector<AlgorithmType>* algorithms, bool use_aututone, Tensor* out) {
 #ifndef TENSORFLOW_USE_SYCL
     // An explicit vector-matrix multiply is much better optimized than an
     // implicit one and this is a bottleneck during non-batched inference.
@@ -140,6 +134,10 @@ struct LaunchMatMulBase {
     }
 #endif  // TENSORFLOW_USE_SYCL
   }
+
+  static void GetBlasGemmAlgorithm(OpKernelConstruction* ctx,
+                                   std::vector<int64>* algorithms,
+                                   bool* algorithm_set_flag) {}
 };
 // On CPUs, we ignore USE_CUBLAS
 template <typename T>
@@ -159,24 +157,39 @@ struct LaunchMatMul<SYCLDevice, T, USE_CUBLAS> : public LaunchMatMulSYCL<T> {};
 #if GOOGLE_CUDA
 
 namespace {
+
 template <typename T>
 struct LaunchBlasGemv {
-  static void Compute(OpKernelContext* ctx, perftools::gputools::Stream* stream,
-                      bool trans, uint64 m, uint64 n,
-                      const perftools::gputools::DeviceMemory<T>& a,
-                      const perftools::gputools::DeviceMemory<T>& b,
-                      perftools::gputools::DeviceMemory<T>* c) {
+  static void Compute(
+      OpKernelContext* ctx, perftools::gputools::Stream* stream, bool trans,
+      uint64 m, uint64 n, const perftools::gputools::DeviceMemory<T>& a,
+      const perftools::gputools::DeviceMemory<T>& b,
+      perftools::gputools::DeviceMemory<T>* c,
+      perftools::gputools::blas::ProfileResult* output_profile) {
     const auto blas_trans =
         trans ? perftools::gputools::blas::Transpose::kTranspose
               : perftools::gputools::blas::Transpose::kNoTranspose;
-    bool blas_launch_status =
-        stream
-            ->ThenBlasGemv(blas_trans, m, n, static_cast<T>(1.0), a, m, b, 1,
-                           static_cast<T>(0.0), c, 1)
-            .ok();
-    if (!blas_launch_status) {
-      ctx->SetStatus(
-          errors::Internal("Blas GEMV launch failed:  m=", m, ", n=", n));
+    if (output_profile == nullptr) {
+      bool blas_launch_status =
+          stream
+              ->ThenBlasGemv(blas_trans, m, n, static_cast<T>(1.0), a, m, b, 1,
+                             static_cast<T>(0.0), c, 1)
+              .ok();
+      if (!blas_launch_status) {
+        ctx->SetStatus(
+            errors::Internal("Blas GEMV launch failed:  m=", m, ", n=", n));
+      }
+    } else {
+      bool blas_launch_status =
+          stream
+              ->ThenBlasGemvWithProfiling(blas_trans, m, n, static_cast<T>(1.0),
+                                          a, m, b, 1, static_cast<T>(0.0), c, 1,
+                                          output_profile)
+              .ok();
+      if (!blas_launch_status) {
+        ctx->SetStatus(errors::Internal(
+            "Blas GEMV with profiling launch failed:  m=", m, ", n=", n));
+      }
     }
   }
 
@@ -188,7 +201,8 @@ void LaunchBlasGemv<Eigen::half>::Compute(
     OpKernelContext* ctx, perftools::gputools::Stream* stream, bool trans,
     uint64 m, uint64 n, const perftools::gputools::DeviceMemory<Eigen::half>& a,
     const perftools::gputools::DeviceMemory<Eigen::half>& b,
-    perftools::gputools::DeviceMemory<Eigen::half>* c) {
+    perftools::gputools::DeviceMemory<Eigen::half>* c,
+    perftools::gputools::blas::ProfileResult* output_profile) {
   ctx->SetStatus(errors::Internal(
       "Blas GEMV launch failed: GEMV is not implemented for float16."));
 }
@@ -200,15 +214,55 @@ bool LaunchBlasGemv<Eigen::half>::IsSupported() {
 
 }  // namespace
 
+bool GetCublasAutotuneComputationType(
+    const DataType& dtype,
+    perftools::gputools::blas::ComputationType* compute_type) {
+  using perftools::gputools::blas::ComputationType;
+  bool use_f32_for_f16_computation = MatmulDoFP32ComputationFP16Input();
+  switch (dtype) {
+    case DT_HALF:
+    case DT_BFLOAT16:
+      if (use_f32_for_f16_computation) {
+        *compute_type = ComputationType::kF32;
+      } else {
+        *compute_type = ComputationType::kF16;
+      }
+      return false;
+    case DT_FLOAT:
+      *compute_type = ComputationType::kF32;
+      return true;
+    case DT_DOUBLE:
+      *compute_type = ComputationType::kF64;
+      return true;
+    default:
+      // Unsupported compute_type, return false.
+      return false;
+  }
+}
+
+// A dummy type to group matmul autotune results together.
+struct MatmulAutoTuneGroup {
+  static string name() { return "Matmul"; }
+};
+typedef AutoTuneSingleton<MatmulAutoTuneGroup, MatmulParameters,
+                          perftools::gputools::blas::AlgorithmConfig>
+    AutoTuneMatmul;
+
 template <typename T>
 struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> {
   static void launch(
-      OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
+      OpKernelContext* ctx, const Tensor& a, const Tensor& b,
       const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
-      Tensor* out) {
-    perftools::gputools::blas::Transpose trans[] = {
-        perftools::gputools::blas::Transpose::kNoTranspose,
-        perftools::gputools::blas::Transpose::kTranspose};
+      std::vector<int64>* algorithms, bool use_autotune, Tensor* out) {
+    using perftools::gputools::blas::AlgorithmConfig;
+    using perftools::gputools::blas::ComputationType;
+    using perftools::gputools::blas::ProfileResult;
+    using perftools::gputools::blas::Transpose;
+    using perftools::gputools::blas::kDefaultAlgorithm;
+    using perftools::gputools::blas::kDefaultBlasGemm;
+    using perftools::gputools::blas::kDefaultBlasGemv;
+    using perftools::gputools::blas::kNoAlgorithm;
+    Transpose trans[] = {Transpose::kNoTranspose, Transpose::kTranspose};
     const uint64 m = a.dim_size(1 - dim_pair[0].first);
     const uint64 k = a.dim_size(dim_pair[0].first);
     const uint64 n = b.dim_size(1 - dim_pair[0].second);
@@ -220,35 +274,156 @@ struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> {
     auto* stream = ctx->op_device_context()->stream();
     OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
 
-    auto a_ptr = AsDeviceMemory(a.template flat<T>().data());
-    auto b_ptr = AsDeviceMemory(b.template flat<T>().data());
-    auto c_ptr = AsDeviceMemory(out->template flat<T>().data());
-    // Cublas does
-    // C = A x B
-    // where A, B and C are assumed to be in column major.
-    // We want the output to be in row-major, so we can compute
-    // C' = B' x A' (' stands for transpose)
-    if (LaunchBlasGemv<T>::IsSupported() && n == 1) {
-      // This is a matrix*vector multiply so use GEMV to compute A * b.
-      // Here we are multiplying in the natural order, so we have to flip
-      // the transposition flag to compensate for the tensor being stored
-      // row-major.
-      LaunchBlasGemv<T>::Compute(ctx, stream, !transpose_a, transpose_a ? m : k,
-                                 transpose_a ? k : m, a_ptr, b_ptr, &c_ptr);
-    } else {
-      bool blas_launch_status =
-          stream
-              ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k, 1.0f,
-                             b_ptr, transpose_b ? k : n, a_ptr,
-                             transpose_a ? m : k, 0.0f, &c_ptr, n)
-              .ok();
-      if (!blas_launch_status) {
-        ctx->SetStatus(errors::Internal(
-            "Blas GEMM launch failed : a.shape=(", a.dim_size(0), ", ",
-            a.dim_size(1), "), b.shape=(", b.dim_size(0), ", ", b.dim_size(1),
-            "), m=", m, ", n=", n, ", k=", k));
+    auto a_ptr = AsDeviceMemory(a.template flat<T>().data(),
+                                a.template flat<T>().size());
+    auto b_ptr = AsDeviceMemory(b.template flat<T>().data(),
+                                b.template flat<T>().size());
+    auto c_ptr = AsDeviceMemory(out->template flat<T>().data(),
+                                out->template flat<T>().size());
+    auto alpha = static_cast<T>(1.0);
+    auto beta = static_cast<T>(0.0);
+
+    int device_id = stream->parent()->device_ordinal();
+    DataType dtype = a.dtype();
+    MatmulParameters matmul_parameters = {
+        transpose_a, transpose_b, m, n, k, dtype, device_id,
+    };
+    AlgorithmConfig algorithm_config(kNoAlgorithm);
+
+    ComputationType computation_type;
+    bool compute_type_supported =
+        GetCublasAutotuneComputationType(dtype, &computation_type);
+    if (use_autotune && compute_type_supported && !algorithms->empty()) {
+      ProfileResult best_result;
+      // TODO(yangzihao): Unify this code with conv autotuning.
+      if (!AutoTuneMatmul::GetInstance()->Find(matmul_parameters,
+                                               &algorithm_config)) {
+        ProfileResult profile_result;
+        for (auto profile_algorithm : (*algorithms)) {
+          // Cublas does
+          // C = A x B
+          // where A, B and C are assumed to be in column major.
+          // We want the output to be in row-major, so we can compute
+          // C' = B' x A' (' stands for transpose)
+          bool cublas_launch_status =
+              stream
+                  ->ThenBlasGemmWithAlgorithm(
+                      blas_transpose_b, blas_transpose_a, n, m, k, alpha, b_ptr,
+                      transpose_b ? k : n, a_ptr, transpose_a ? m : k, beta,
+                      &c_ptr, n, computation_type, profile_algorithm,
+                      &profile_result)
+                  .ok();
+          if (cublas_launch_status) {
+            if (profile_result.is_valid()) {
+              if (profile_result.elapsed_time_in_ms() <
+                  best_result.elapsed_time_in_ms()) {
+                best_result = profile_result;
+              }
+            }
+          }
+        }
+        // Try BlasGemmWithProfiling
+        bool cublas_launch_status =
+            stream
+                ->ThenBlasGemmWithProfiling(
+                    blas_transpose_b, blas_transpose_a, n, m, k, 1.0, b_ptr,
+                    transpose_b ? k : n, a_ptr, transpose_a ? m : k, 0.0,
+                    &c_ptr, n, &profile_result)
+                .ok();
+        if (cublas_launch_status) {
+          if (profile_result.is_valid()) {
+            if (profile_result.elapsed_time_in_ms() <
+                best_result.elapsed_time_in_ms()) {
+              best_result = profile_result;
+            }
+          }
+        }
+        // Try BlasGemvWithProfiling
+        if (LaunchBlasGemv<T>::IsSupported() && n == 1) {
+          LaunchBlasGemv<T>::Compute(ctx, stream, !transpose_a,
+                                     transpose_a ? m : k, transpose_a ? k : m,
+                                     a_ptr, b_ptr, &c_ptr, &profile_result);
+          if (profile_result.is_valid()) {
+            if (profile_result.elapsed_time_in_ms() <
+                best_result.elapsed_time_in_ms()) {
+              best_result = profile_result;
+            }
+          }
+        }
+      }
+      // We make sure that each matmul parameter set only gets one pass of
+      // autotune. If the best result is found, assign it to algorithm_type
+      // and insert it to autotune map. If all internal kernels of
+      // cublasGemmEx() returns invalid results, we add kNoAlgorithm to the
+      // autotune map.
+      if (best_result.is_valid()) {
+        algorithm_config.set_algorithm(best_result.algorithm());
+      }
+      AutoTuneMatmul::GetInstance()->Insert(matmul_parameters,
+                                            algorithm_config);
+      if (algorithm_config.algorithm() != kNoAlgorithm &&
+          algorithm_config.algorithm() != kDefaultBlasGemm &&
+          algorithm_config.algorithm() != kDefaultBlasGemv) {
+        bool cublas_launch_status =
+            stream
+                ->ThenBlasGemmWithAlgorithm(
+                    blas_transpose_b, blas_transpose_a, n, m, k, alpha, b_ptr,
+                    transpose_b ? k : n, a_ptr, transpose_a ? m : k, beta,
+                    &c_ptr, n, computation_type, algorithm_config.algorithm(),
+                    nullptr)
+                .ok();
+        if (!cublas_launch_status) {
+          ctx->SetStatus(errors::Internal(
+              "Blas GEMM with algorithm launch failed : a.shape=(",
+              a.dim_size(0), ", ", a.dim_size(1), "), b.shape=(", b.dim_size(0),
+              ", ", b.dim_size(1), "), m=", m, ", n=", n, ", k=", k));
+        }
       }
     }
+    // For the following case, we use normal BlasGemm():
+    //  1) We didn't set the use_autotune flag;
+    //  2) compute type does not support autotune;
+    //  3) no algorithm is found;
+    //  4) all internal kernels in autotune return invalid results.
+    if (!use_autotune || !compute_type_supported || algorithms->empty() ||
+        algorithm_config.algorithm() == kNoAlgorithm ||
+        algorithm_config.algorithm() == kDefaultBlasGemm ||
+        algorithm_config.algorithm() == kDefaultBlasGemv) {
+      if (algorithm_config.algorithm() == kDefaultBlasGemv) {
+        // This is a matrix*vector multiply so use GEMV to compute A * b.
+        // Here we are multiplying in the natural order, so we have to flip
+        // the transposition flag to compensate for the tensor being stored
+        // row-major.
+        // TODO(yangzihao): Add Gemv as an autotuning option too.
+        LaunchBlasGemv<T>::Compute(ctx, stream, !transpose_a,
+                                   transpose_a ? m : k, transpose_a ? k : m,
+                                   a_ptr, b_ptr, &c_ptr, nullptr);
+      } else {
+        // Use C' = B' x A' (' stands for transpose)
+        bool blas_launch_status =
+            stream
+                ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k,
+                               1.0f, b_ptr, transpose_b ? k : n, a_ptr,
+                               transpose_a ? m : k, 0.0f, &c_ptr, n)
+                .ok();
+        if (!blas_launch_status) {
+          ctx->SetStatus(errors::Internal(
+              "Blas GEMM launch failed : a.shape=(", a.dim_size(0), ", ",
+              a.dim_size(1), "), b.shape=(", b.dim_size(0), ", ", b.dim_size(1),
+              "), m=", m, ", n=", n, ", k=", k));
+        }
+      }
+    }
+  }
+
+  static void GetBlasGemmAlgorithm(OpKernelConstruction* ctx,
+                                   std::vector<int64>* algorithms,
+                                   bool* algorithm_set_flag) {
+    if (*algorithm_set_flag == false) {
+      auto* stream = ctx->device()->tensorflow_gpu_device_info()->stream;
+      stream->parent()->GetBlasGemmAlgorithms(algorithms);
+      *algorithm_set_flag = true;
+    }
   }
 };
 
@@ -257,9 +432,14 @@ struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> {
 template <typename Device, typename T, bool USE_CUBLAS>
 class MatMulOp : public OpKernel {
  public:
-  explicit MatMulOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+  explicit MatMulOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx), algorithms_set_already_(false) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_));
+
+    LaunchMatMul<Device, T, USE_CUBLAS>::GetBlasGemmAlgorithm(
+        ctx, &algorithms_, &algorithms_set_already_);
+    use_autotune_ = MatmulAutotuneEnable();
   }
 
   void Compute(OpKernelContext* ctx) override {
@@ -302,10 +482,14 @@ class MatMulOp : public OpKernel {
       return;
     }
 
-    LaunchMatMul<Device, T, USE_CUBLAS>::launch(ctx, this, a, b, dim_pair, out);
+    LaunchMatMul<Device, T, USE_CUBLAS>::launch(
+        ctx, a, b, dim_pair, &algorithms_, use_autotune_, out);
   }
 
  private:
+  std::vector<int64> algorithms_;
+  bool algorithms_set_already_;
+  bool use_autotune_;
   bool transpose_a_;
   bool transpose_b_;
 };
diff --git a/tensorflow/core/kernels/matmul_op.h b/tensorflow/core/kernels/matmul_op.h
index 5a8db6da1997ac20eaa66e5e35caa851bf7cdf95..6398da2fb959b0bded9afad8c92be923e44c755c 100644
--- a/tensorflow/core/kernels/matmul_op.h
+++ b/tensorflow/core/kernels/matmul_op.h
@@ -17,7 +17,9 @@ limitations under the License.
 #define TENSORFLOW_KERNELS_MATMUL_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/hash/hash.h"
 
 namespace tensorflow {
 namespace functor {
@@ -50,6 +52,68 @@ struct MatMulFunctor {
 };
 
 }  // end namespace functor
+
+#if GOOGLE_CUDA
+// Encapsulate all the shape information that is used in matmul operations.
+class MatmulParameters {
+ public:
+  MatmulParameters(bool transa, bool transb, uint64 m, uint64 n, uint64 k,
+                   DataType dtype, int device_id)
+      : transa_(transa),
+        transb_(transb),
+        m_(m),
+        n_(n),
+        k_(k),
+        dtype_(dtype),
+        device_id_(device_id) {
+    hash_code_ = transa;
+    hash_code_ = Hash64Combine(hash_code_, transb);
+    hash_code_ = Hash64Combine(hash_code_, m);
+    hash_code_ = Hash64Combine(hash_code_, n);
+    hash_code_ = Hash64Combine(hash_code_, k);
+    hash_code_ = Hash64Combine(hash_code_, dtype);
+    hash_code_ = Hash64Combine(hash_code_, device_id);
+  }
+  bool operator==(const MatmulParameters& other) const {
+    return this->get_data_as_tuple() == other.get_data_as_tuple();
+  }
+
+  bool operator!=(const MatmulParameters& other) const {
+    return !(*this == other);
+  }
+  uint64 hash() const { return hash_code_; }
+
+  string ToString() const {
+    // clang-format off
+    return strings::StrCat(
+        transa_, ", ", transb_, ", ",
+        m_, ", ", n_, ", ", k_,
+        dtype_, ", ", device_id_);
+    // clang-format on
+  }
+
+ private:
+  typedef std::tuple<bool, bool, int64, int64, int64, DataType, int>
+      ParameterDataType;
+
+  ParameterDataType get_data_as_tuple() const {
+    return std::make_tuple(transa_, transb_, m_, n_, k_, dtype_, device_id_);
+  }
+
+  bool transa_;
+  bool transb_;
+  uint64 m_;
+  uint64 n_;
+  uint64 k_;
+  DataType dtype_;
+  int device_id_;
+  uint64 hash_code_;
+};
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#endif  // GOOGLE_CUDA
+
 }  // end namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_MATMUL_OP_H_
diff --git a/tensorflow/core/kernels/meta_support.cc b/tensorflow/core/kernels/meta_support.cc
index 0e899402a2861447c9ff2a1462eb80790b3ed0f5..b29feb003242500548d1a4bf83a31c8c2d1c57d0 100644
--- a/tensorflow/core/kernels/meta_support.cc
+++ b/tensorflow/core/kernels/meta_support.cc
@@ -98,8 +98,9 @@ typedef gemmlowp::meta::SimpleContext<gemmlowp::WorkersPool> LocalContext;
 template <typename Context, typename Params>
 void MultiThreadGemm(Context* context, const Params& params) {
   if (params.m <= 4) {
-    gemmlowp::meta::Gemm<gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>,
-                         Params, 1, 8, 8>(params);
+      gemmlowp::meta::MultiThreadGemm<
+          Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params,
+          1, 8, 8>(context, params);
   } else {
     if (params.m >= params.n) {
       gemmlowp::meta::MultiThreadGemm<
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index d8a8cc74bfae08e2eeeba65e40a8e77e6ac8a1fd..e6673b2ffb7dc4a2e0127c363b4402c98a023b17 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -29,8 +29,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
-#include "third_party/mkl/include/mkl_dnn.h"
-#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
index d4364d31e41790241454050750ecb58d31a0e941..0f1a218fe62dd91160320254342828811e3aa458 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
@@ -39,8 +39,8 @@ limitations under the License.
 #include "tensorflow/core/util/work_sharder.h"
 
 #include "tensorflow/core/util/mkl_util.h"
-#include "third_party/mkl/include/mkl_dnn.h"
-#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index ddcf2412770258c7a8c7ac2d19ceed7cb4f4b05c..3b23c72f0f1ef491307dcf213d57488521bd7687 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -38,8 +38,8 @@ limitations under the License.
 #include "tensorflow/core/util/work_sharder.h"
 
 #include "tensorflow/core/util/mkl_util.h"
-#include "third_party/mkl/include/mkl_dnn.h"
-#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index 23827ceea50f7b0af19640a049a530ef0798536a..ef7338e0e0d634e70e2830048623f2d67d8e272f 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -40,8 +40,8 @@ limitations under the License.
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
-#include "third_party/mkl/include/mkl_dnn.h"
-#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index df49e03f31c379113d8d4ed2ec6bdc06d9376fd9..135dd254a4cd29be1ffde2a6f69fca44453590e4 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -37,8 +37,8 @@ limitations under the License.
 #include "tensorflow/core/util/tensor_format.h"
 
 #include "tensorflow/core/util/mkl_util.h"
-#include "third_party/mkl/include/mkl_dnn.h"
-#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index d53353680247bac3228629d2901b2ca8592d96d5..bc9e906c39a9a7f5f4b2ae83afc6774aecb38c48 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -21,8 +21,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#include "third_party/mkl/include/mkl_dnn.h"
-#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
 // TODO(inteltf) Address comments from PR 8968.
diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc
index cb7ea7e7f90546ceb23564d09c9e064b80347148..ca20294a2683059488d9d2b3c7fe9f232b093dfb 100644
--- a/tensorflow/core/kernels/mkl_identity_op.cc
+++ b/tensorflow/core/kernels/mkl_identity_op.cc
@@ -24,8 +24,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
-#include "third_party/mkl/include/mkl_dnn.h"
-#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index 07a7e6b5dafd43f3bc39f97ca5f584a7893984a4..aa08e93924c588cfb5b4a22a20055e5c74a43b3a 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -31,8 +31,8 @@ limitations under the License.
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/tensor_format.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "third_party/mkl/include/mkl_dnn.h"
-#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/util/work_sharder.h"
diff --git a/tensorflow/core/kernels/mkl_matmul_op.cc b/tensorflow/core/kernels/mkl_matmul_op.cc
index 16143191a34ae62704691f4916ac8f30d897f1d4..47598f443f76f17a6c0b4005327a4e7d00a6beba 100644
--- a/tensorflow/core/kernels/mkl_matmul_op.cc
+++ b/tensorflow/core/kernels/mkl_matmul_op.cc
@@ -25,7 +25,7 @@ limitations under the License.
 
 #if defined(INTEL_MKL)
 
-#include "third_party/mkl/include/mkl_cblas.h"
+#include "mkl_cblas.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index fabecc39a885bd17d3bc75341503e9cb43012042..86a77d769a52d7592d15627b504ae60278b45058 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -25,8 +25,8 @@ limitations under the License.
 
 #include "tensorflow/core/platform/default/logging.h"
 #include "tensorflow/core/util/mkl_util.h"
-#include "third_party/mkl/include/mkl_dnn.h"
-#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index 593aa3a2fd6052f275015b1acd2e6f5271a837dd..b3763f17bc1393ba42ace07f21db36568eaae6cb 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -24,8 +24,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
-#include "third_party/mkl/include/mkl_dnn.h"
-#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.cc b/tensorflow/core/kernels/mkl_tfconv_op.cc
index 48bd2e88bac31ad3674190ad8e6a13e834fa036a..c8e5df32ce56ba6f1eb433048589f7c00afa3e9a 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.cc
+++ b/tensorflow/core/kernels/mkl_tfconv_op.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include <algorithm>
 #include <vector>
-#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -25,12 +24,13 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#include "third_party/mkl/include/mkl_dnn.h"
-#include "third_party/mkl/include/mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -92,12 +92,12 @@ class MklToTfOp : public OpKernel {
     if (input_fmt_nhwc && ndims == 4 && has_avx512f_) {
       size_t strides_nchw[4];
       GetStridesFromSizes(FORMAT_NCHW, strides_nchw, in_sizes);
-      CHECK_EQ(dnnLayoutCreate_F32(&lt_trans_input, ndims, in_sizes,
-                                           strides_nchw), E_SUCCESS);
+      CHECK_EQ(
+          dnnLayoutCreate_F32(&lt_trans_input, ndims, in_sizes, strides_nchw),
+          E_SUCCESS);
       AllocTmpBuffer(context, &mkl_tmp_trans_input_buf_tensor, lt_trans_input,
                      &buf_trans_input);
-    }
-    else {
+    } else {
       lt_trans_input = static_cast<dnnLayout_t>(input_shape.GetTfLayout());
       buf_trans_input =
           static_cast<void*>(const_cast<T*>(output_tensor->flat<T>().data()));
@@ -111,13 +111,13 @@ class MklToTfOp : public OpKernel {
     // NCHW -> NHWC, if data format is NHWC
     if (input_fmt_nhwc && ndims == 4 && has_avx512f_) {
       dnnLayoutDelete_F32(lt_trans_input);
-      TensorShape nhwc_shape = ShapeFromFormat(FORMAT_NHWC,
-          in_sizes[MklDims::N], in_sizes[MklDims::H],
+      TensorShape nhwc_shape = ShapeFromFormat(
+          FORMAT_NHWC, in_sizes[MklDims::N], in_sizes[MklDims::H],
           in_sizes[MklDims::W], in_sizes[MklDims::C]);
       MklNCHWToNHWC(mkl_tmp_trans_input_buf_tensor, &output_tensor);
     }
 
-    delete in_sizes;
+    delete[] in_sizes;
 
     VLOG(1) << "MKLToTFConversion complete successfully.";
   }
diff --git a/tensorflow/core/kernels/mkl_transpose_op.cc b/tensorflow/core/kernels/mkl_transpose_op.cc
index fcbf105f8faa665a185f8499a9bc13ea0fdc4a40..50d25ac511885eb11c990e4f511ba81ffffd1ebd 100644
--- a/tensorflow/core/kernels/mkl_transpose_op.cc
+++ b/tensorflow/core/kernels/mkl_transpose_op.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #ifdef INTEL_MKL
 #define EIGEN_USE_THREADS
 
-#include "third_party/mkl/include/mkl_trans.h"
+#include "mkl_trans.h"
 #include "tensorflow/core/kernels/transpose_functor.h"
 #include "tensorflow/core/kernels/transpose_op.h"
 
diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc
index 9ffe71e031e762b8563877ca846f36833fa1d000..ba597f5c678db4d767f5aa73e2bcdcb79625af71 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cc
@@ -65,9 +65,11 @@ static inline void DecreasingArgSort(const std::vector<float>& values,
       [&values](const int i, const int j) { return values[i] > values[j]; });
 }
 
-// Compute intersection-over-union overlap between boxes i and j.
-static inline float ComputeIOU(typename TTypes<float, 2>::ConstTensor boxes,
-                               int i, int j) {
+// Return true if intersection-over-union overlap between boxes i and j
+// is greater than iou_threshold.
+static inline bool IOUGreaterThanThreshold(
+    typename TTypes<float, 2>::ConstTensor boxes, int i, int j,
+    float iou_threshold) {
   const float ymin_i = std::min<float>(boxes(i, 0), boxes(i, 2));
   const float xmin_i = std::min<float>(boxes(i, 1), boxes(i, 3));
   const float ymax_i = std::max<float>(boxes(i, 0), boxes(i, 2));
@@ -76,7 +78,6 @@ static inline float ComputeIOU(typename TTypes<float, 2>::ConstTensor boxes,
   const float xmin_j = std::min<float>(boxes(j, 1), boxes(j, 3));
   const float ymax_j = std::max<float>(boxes(j, 0), boxes(j, 2));
   const float xmax_j = std::max<float>(boxes(j, 1), boxes(j, 3));
-
   const float area_i = (ymax_i - ymin_i) * (xmax_i - xmin_i);
   const float area_j = (ymax_j - ymin_j) * (xmax_j - xmin_j);
   if (area_i <= 0 || area_j <= 0) return 0.0;
@@ -87,14 +88,17 @@ static inline float ComputeIOU(typename TTypes<float, 2>::ConstTensor boxes,
   const float intersection_area =
       std::max<float>(intersection_ymax - intersection_ymin, 0.0) *
       std::max<float>(intersection_xmax - intersection_xmin, 0.0);
-  return intersection_area / (area_i + area_j - intersection_area);
+  const float iou = intersection_area / (area_i + area_j - intersection_area);
+  return iou > iou_threshold;
 }
 
-void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
-                           const Tensor& scores, const Tensor& max_output_size,
+void DoNonMaxSuppressionOp(OpKernelContext* context,
+                           const Tensor& boxes,
+                           const Tensor& scores,
+                           const Tensor& max_output_size,
                            const float iou_threshold) {
   OP_REQUIRES(context, iou_threshold >= 0 && iou_threshold <= 1,
-              errors::InvalidArgument("iou_threshold must be in [0, 1]"));
+      errors::InvalidArgument("iou_threshold must be in [0, 1]"));
 
   int num_boxes = 0;
   ParseAndCheckBoxSizes(context, boxes, scores, &num_boxes);
@@ -102,34 +106,34 @@ void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
     return;
   }
 
-  const int output_size = std::min(max_output_size.scalar<int>()(), num_boxes);
-  typename TTypes<float, 2>::ConstTensor boxes_data = boxes.tensor<float, 2>();
+  const int output_size =
+      std::min(max_output_size.scalar<int>()(), num_boxes);
+  typename TTypes<float, 2>::ConstTensor boxes_data =
+      boxes.tensor<float, 2>();
 
   std::vector<float> scores_data(num_boxes);
   std::copy_n(scores.flat<float>().data(), num_boxes, scores_data.begin());
   std::vector<int> sorted_indices;
   DecreasingArgSort(scores_data, &sorted_indices);
 
-  std::vector<bool> active(num_boxes, true);
   std::vector<int> selected;
-  int num_active = active.size();
+  std::vector<int> selected_indices(output_size, 0);
+  int num_selected = 0;
   for (int i = 0; i < num_boxes; ++i) {
-    if (num_active == 0 || selected.size() >= output_size) break;
-    if (active[i]) {
-      selected.push_back(sorted_indices[i]);
-    } else {
-      continue;
-    }
-    for (int j = i + 1; j < num_boxes; ++j) {
-      if (active[j]) {
-        float iou =
-            ComputeIOU(boxes_data, sorted_indices[i], sorted_indices[j]);
-        if (iou > iou_threshold) {
-          active[j] = false;
-          num_active--;
-        }
+    if (selected.size() >= output_size) break;
+    bool should_select = true;
+    for (int j = 0; j < num_selected; ++j) {
+      if (IOUGreaterThanThreshold(boxes_data, sorted_indices[i],
+                                  sorted_indices[selected_indices[j]],
+                                  iou_threshold)) {
+        should_select = false;
+        break;
       }
     }
+    if (should_select) {
+      selected.push_back(sorted_indices[i]);
+      selected_indices[num_selected++] = i;
+    }
   }
 
   // Allocate output tensor
@@ -175,7 +179,8 @@ template <typename Device>
 class NonMaxSuppressionV2Op : public OpKernel {
  public:
   explicit NonMaxSuppressionV2Op(OpKernelConstruction* context)
-      : OpKernel(context) {}
+      : OpKernel(context) {
+  }
 
   void Compute(OpKernelContext* context) override {
     // boxes: [num_boxes, 4]
@@ -190,9 +195,10 @@ class NonMaxSuppressionV2Op : public OpKernel {
                                 max_output_size.shape().DebugString()));
     // iou_threshold: scalar
     const Tensor& iou_threshold = context->input(3);
-    OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
-                errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
-                                        iou_threshold.shape().DebugString()));
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
+        errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
+                                iou_threshold.shape().DebugString()));
 
     const float iou_threshold_val = iou_threshold.scalar<float>()();
 
diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc
index e0e8c87f95324a7a33fc5f7f6a3ae5d8d97b4e22..fdbcf05b89ddf122eee9e0133651355edbb1ba5a 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc
@@ -143,8 +143,8 @@ TEST_F(NonMaxSuppressionOpTest, TestInconsistentBoxAndScoreShapes) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-              StringPiece(s.ToString()).contains("scores has incompatible shape"))
-    << s;
+      StringPiece(s.ToString()).contains("scores has incompatible shape"))
+      << s;
 }
 
 TEST_F(NonMaxSuppressionOpTest, TestInvalidIOUThreshold) {
@@ -156,8 +156,8 @@ TEST_F(NonMaxSuppressionOpTest, TestInvalidIOUThreshold) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-              StringPiece(s.ToString()).contains("iou_threshold must be in [0, 1]"))
-    << s;
+      StringPiece(s.ToString()).contains("iou_threshold must be in [0, 1]"))
+      << s;
 }
 
 TEST_F(NonMaxSuppressionOpTest, TestEmptyInput) {
diff --git a/tensorflow/core/kernels/ops_testutil.h b/tensorflow/core/kernels/ops_testutil.h
index 96de4094feebc5e1a831333fa0a50eaf3bf886ad..4795cba98f3b2f77bc9779f0051f870fcd4bb74e 100644
--- a/tensorflow/core/kernels/ops_testutil.h
+++ b/tensorflow/core/kernels/ops_testutil.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -169,6 +170,32 @@ class OpsTestBase : public ::testing::Test {
     }
   }
 
+  // Adds a Resource type as input. If <container> is empty, uses the default
+  // container name.
+  template <typename T>
+  void AddResourceInput(const string& container, const string& name,
+                        T* resource) {
+    CHECK_GT(input_types_.size(), inputs_.size())
+        << "Adding more inputs than types; perhaps you need to call MakeOp";
+    ResourceMgr* rm = device_->resource_manager();
+    EXPECT_TRUE(
+        rm->Create(container == "" ? rm->default_container() : container, name,
+                   resource)
+            .ok());
+    TypeIndex type_index = MakeTypeIndex<T>();
+    ResourceHandle handle;
+    handle.set_device(device_->name());
+    handle.set_container(container);
+    handle.set_name(name);
+    handle.set_hash_code(type_index.hash_code());
+    handle.set_maybe_type_name(type_index.name());
+    Tensor* input = new Tensor(device_->GetAllocator(AllocatorAttributes()),
+                               DT_RESOURCE, TensorShape({}));
+    input->scalar<ResourceHandle>()() = handle;
+    tensors_.push_back(input);
+    inputs_.push_back({nullptr, input});
+  }
+
   // Runs an operation producing 'num_outputs' outputs.
   //
   // Returns the context's status after running the operation.
diff --git a/tensorflow/core/kernels/ops_testutil_test.cc b/tensorflow/core/kernels/ops_testutil_test.cc
index 906fd890e3128db47fc6cff7e45e83a570b5ee99..239e460825bab15be027142294ec3e8a90bbe5da 100644
--- a/tensorflow/core/kernels/ops_testutil_test.cc
+++ b/tensorflow/core/kernels/ops_testutil_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/variable_ops.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -33,4 +34,17 @@ TEST_F(OpsTestBase, ScopedStepContainer) {
   EXPECT_TRUE(step_container_ != nullptr);
 }
 
+// Verify that a Resource input can be added to the test kernel.
+TEST_F(OpsTestBase, ResourceVariableInput) {
+  TF_EXPECT_OK(NodeDefBuilder("identity", "Identity")
+                   .Input(FakeInput(DT_RESOURCE))
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  Var* var = new Var(DT_STRING);
+  AddResourceInput("" /* container */, "Test" /* name */, var);
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor* output = GetOutput(0);
+  EXPECT_EQ(output->dtype(), DT_RESOURCE);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/ops_util.h b/tensorflow/core/kernels/ops_util.h
index 68a9c37406a93f13808db78a6adad568c36f24c1..d3d1b56c9d568487c768f1b1620d2880a3afc531 100644
--- a/tensorflow/core/kernels/ops_util.h
+++ b/tensorflow/core/kernels/ops_util.h
@@ -84,6 +84,20 @@ bool IsDim0SliceAligned(const TensorShape& s, int64 start, int64 end_or_size) {
 // Returns <suffix> sanitized to have only [a-zA-Z0-9-_].
 string SanitizeThreadSuffix(string suffix);
 
+// Helper to compute 'strides' given a tensor 'shape'. I.e.,
+// strides[i] = prod(shape.dim_size[(i+1):])
+template <typename T>
+gtl::InlinedVector<T, 8> ComputeStride(const TensorShape& shape) {
+  const int ndims = shape.dims();
+  gtl::InlinedVector<T, 8> strides(ndims);
+  T stride = 1;
+  for (int i = ndims - 1; i >= 0; --i) {
+    strides[i] = stride;
+    stride *= static_cast<T>(shape.dim_size(i));
+  }
+  return strides;
+}
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_OPS_UTIL_H_
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index edaa10761ebcdbb2bd25485f8a557496480235c6..75820e3106f77ca3b8d9ed6d72ed59f87e248c9d 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -157,6 +157,7 @@ REGISTER_PACK(string);
       PackOp<GPUDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+TF_CALL_int64(REGISTER_GPU);
 #undef REGISTER_GPU
 
 // A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc
index 4c4319357948a2b7b6e8b732cc656709ae4bbb33..6e8b09d05003ef64c7acb896e84e8887a503d75b 100644
--- a/tensorflow/core/kernels/pad_op.cc
+++ b/tensorflow/core/kernels/pad_op.cc
@@ -70,6 +70,16 @@ class PadOp : public OpKernel {
             "The first dimension of paddings must be the rank of inputs",
             in1.shape().DebugString(), " ", in0.shape().DebugString()));
 
+    T pad_value(0);
+    if (context->num_inputs() == 3) {
+      const Tensor& constant_values = context->input(2);
+      OP_REQUIRES(
+          context, TensorShapeUtils::IsScalar(constant_values.shape()),
+          errors::InvalidArgument("constant_values must be a scalar. Found: ",
+                                  constant_values.shape().DebugString()));
+      pad_value = context->input(2).scalar<T>()();
+    }
+
     // Compute the shape of the output tensor, and allocate it.
     TensorShape output_shape;
     TTypes<int32>::ConstMatrix paddings = in1.matrix<int32>();
@@ -99,27 +109,27 @@ class PadOp : public OpKernel {
     // Invoke the dims-specific implementation.
     switch (fixed_dims) {
       case 0:
-        Operate<0>(context, in0.tensor<T, 0>(), paddings, output);
+        Operate<0>(context, in0.tensor<T, 0>(), paddings, pad_value, output);
         break;
       case 1:
         // TODO(irving): Once Pad doesn't need a scalar special case,
         // change flat to tensor.  That is, once !allow_legacy_scalars().
-        Operate<1>(context, in0.flat<T>(), paddings, output);
+        Operate<1>(context, in0.flat<T>(), paddings, pad_value, output);
         break;
       case 2:
-        Operate<2>(context, in0.tensor<T, 2>(), paddings, output);
+        Operate<2>(context, in0.tensor<T, 2>(), paddings, pad_value, output);
         break;
       case 3:
-        Operate<3>(context, in0.tensor<T, 3>(), paddings, output);
+        Operate<3>(context, in0.tensor<T, 3>(), paddings, pad_value, output);
         break;
       case 4:
-        Operate<4>(context, in0.tensor<T, 4>(), paddings, output);
+        Operate<4>(context, in0.tensor<T, 4>(), paddings, pad_value, output);
         break;
       case 5:
-        Operate<5>(context, in0.tensor<T, 5>(), paddings, output);
+        Operate<5>(context, in0.tensor<T, 5>(), paddings, pad_value, output);
         break;
       case 6:
-        Operate<6>(context, in0.tensor<T, 6>(), paddings, output);
+        Operate<6>(context, in0.tensor<T, 6>(), paddings, pad_value, output);
         break;
       default:
         OP_REQUIRES(context, false,
@@ -132,7 +142,8 @@ class PadOp : public OpKernel {
   template <int Dims>
   void Operate(OpKernelContext* context,
                typename TTypes<T, Dims>::ConstTensor input,
-               TTypes<int32>::ConstMatrix paddings, Tensor* output) {
+               TTypes<int32>::ConstMatrix paddings, T pad_value,
+               Tensor* output) {
     CHECK_EQ(Dims, paddings.dimension(0));
     CHECK_EQ(2, paddings.dimension(1));
     Eigen::array<std::pair<int32, int32>, Dims> paddings_array;
@@ -141,16 +152,22 @@ class PadOp : public OpKernel {
     }
     functor::Pad<Device, T, Dims> functor;
     functor(context->eigen_device<Device>(), output->tensor<T, Dims>(), input,
-            paddings_array);
+            paddings_array, pad_value);
   }
 };
 
-#define REGISTER_KERNEL(type)                            \
-  REGISTER_KERNEL_BUILDER(Name("Pad")                    \
-                              .Device(DEVICE_CPU)        \
-                              .TypeConstraint<type>("T") \
-                              .HostMemory("paddings"),   \
-                          PadOp<CPUDevice, type>)
+#define REGISTER_KERNEL(type)                                 \
+  REGISTER_KERNEL_BUILDER(Name("Pad")                         \
+                              .Device(DEVICE_CPU)             \
+                              .TypeConstraint<type>("T")      \
+                              .HostMemory("paddings"),        \
+                          PadOp<CPUDevice, type>);            \
+  REGISTER_KERNEL_BUILDER(Name("PadV2")                       \
+                              .Device(DEVICE_CPU)             \
+                              .TypeConstraint<type>("T")      \
+                              .HostMemory("paddings")         \
+                              .HostMemory("constant_values"), \
+                          PadOp<CPUDevice, type>);
 
 TF_CALL_POD_TYPES(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
@@ -158,12 +175,12 @@ TF_CALL_POD_TYPES(REGISTER_KERNEL);
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T, Dims)                                  \
-  template <>                                                      \
-  void Pad<GPUDevice, T, Dims>::operator()(                        \
-      const GPUDevice& d, typename TTypes<T, Dims>::Tensor output, \
-      typename TTypes<T, Dims>::ConstTensor input,                 \
-      Eigen::array<std::pair<int32, int32>, Dims> paddings);       \
+#define DECLARE_GPU_SPEC(T, Dims)                                         \
+  template <>                                                             \
+  void Pad<GPUDevice, T, Dims>::operator()(                               \
+      const GPUDevice& d, typename TTypes<T, Dims>::Tensor output,        \
+      typename TTypes<T, Dims>::ConstTensor input,                        \
+      Eigen::array<std::pair<int32, int32>, Dims> paddings, T pad_value); \
   extern template struct Pad<GPUDevice, T, Dims>;
 
 #define DECLARE_GPU_SPECS(T) \
@@ -185,6 +202,13 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
                               .TypeConstraint<T>("T")             \
                               .TypeConstraint<int32>("Tpaddings") \
                               .HostMemory("paddings"),            \
+                          PadOp<GPUDevice, T>);                   \
+  REGISTER_KERNEL_BUILDER(Name("PadV2")                           \
+                              .Device(DEVICE_GPU)                 \
+                              .TypeConstraint<T>("T")             \
+                              .TypeConstraint<int32>("Tpaddings") \
+                              .HostMemory("paddings")             \
+                              .HostMemory("constant_values"),     \
                           PadOp<GPUDevice, T>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
@@ -200,6 +224,15 @@ REGISTER_KERNEL_BUILDER(Name("Pad")
                             .HostMemory("paddings")
                             .HostMemory("output"),
                         PadOp<CPUDevice, int32>);
+REGISTER_KERNEL_BUILDER(Name("PadV2")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("Tpaddings")
+                            .HostMemory("input")
+                            .HostMemory("paddings")
+                            .HostMemory("constant_values")
+                            .HostMemory("output"),
+                        PadOp<CPUDevice, int32>);
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
@@ -210,6 +243,13 @@ REGISTER_KERNEL_BUILDER(Name("Pad")
                               .TypeConstraint<T>("T")             \
                               .TypeConstraint<int32>("Tpaddings") \
                               .HostMemory("paddings"),            \
+                          PadOp<SYCLDevice, T>);                  \
+  REGISTER_KERNEL_BUILDER(Name("PadV2")                           \
+                              .Device(DEVICE_SYCL)                \
+                              .TypeConstraint<T>("T")             \
+                              .TypeConstraint<int32>("Tpaddings") \
+                              .HostMemory("paddings")             \
+                              .HostMemory("constant_values"),     \
                           PadOp<SYCLDevice, T>)
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNEL);
@@ -221,6 +261,15 @@ REGISTER_KERNEL_BUILDER(Name("Pad")
                             .HostMemory("paddings")
                             .HostMemory("output"),
                         PadOp<CPUDevice, int32>);
+REGISTER_KERNEL_BUILDER(Name("PadV2")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("Tpaddings")
+                            .HostMemory("input")
+                            .HostMemory("paddings")
+                            .HostMemory("constant_values")
+                            .HostMemory("output"),
+                        PadOp<CPUDevice, int32>);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/pad_op.h b/tensorflow/core/kernels/pad_op.h
index 733e0f3083ca3ed7ce5dd4fffb4a36fad51a4bae..6a973833e2d31961309e5bd1a6e4c15363862aff 100644
--- a/tensorflow/core/kernels/pad_op.h
+++ b/tensorflow/core/kernels/pad_op.h
@@ -27,16 +27,17 @@ namespace functor {
 // Functor used by PadOp to do the computations.
 template <typename Device, typename T, int Dims>
 struct Pad {
-  // Pad "input" into "output", as specified by "paddings".  See pad_op.cc for
-  // details.
+  // Pad "input" into "output", as specified by "paddings" and "pad_value".
+  // See pad_op.cc for details.
   void operator()(const Device& d, typename TTypes<T, Dims>::Tensor output,
                   typename TTypes<T, Dims>::ConstTensor input,
-                  Eigen::array<std::pair<int32, int32>, Dims> paddings) {
+                  Eigen::array<std::pair<int32, int32>, Dims> paddings,
+                  T pad_value) {
     if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value &&
         (output.size() <= std::numeric_limits<int32>::max())) {
-      To32Bit(output).device(d) = To32Bit(input).pad(paddings);
+      To32Bit(output).device(d) = To32Bit(input).pad(paddings, pad_value);
     } else {
-      output.device(d) = input.pad(paddings);
+      output.device(d) = input.pad(paddings, pad_value);
     }
   }
 };
@@ -46,7 +47,7 @@ struct Pad<Device, T, 0> {
   // In the scalar case we simply copy the input.
   void operator()(const Device& d, typename TTypes<T, 0>::Tensor output,
                   typename TTypes<T, 0>::ConstTensor input,
-                  Eigen::array<std::pair<int32, int32>, 0>) {
+                  Eigen::array<std::pair<int32, int32>, 0>, T) {
     output.device(d) = input;
   }
 };
diff --git a/tensorflow/core/kernels/padding_fifo_queue.cc b/tensorflow/core/kernels/padding_fifo_queue.cc
index f4626d4a5d4dd6382e1f1963732e1fdc1c593aa7..d0f7683f3dd8d520339dfd132af8a101da3abd5a 100644
--- a/tensorflow/core/kernels/padding_fifo_queue.cc
+++ b/tensorflow/core/kernels/padding_fifo_queue.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <deque>
 #include <vector>
 
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
index 8b85bd4ebe930caebdfa5cd2e2239c21bd593ef3..933de65c15a772154ce439cc54489c4a29c42ea5 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
@@ -32,7 +32,7 @@ limitations under the License.
 #ifdef COMPILER_MSVC
 // msvc does not support unroll. One could try the loop pragma but we need to
 // take a closer look if this generates better code in this case. For now let
-// the compiler take care of of it.
+// the compiler take care of it.
 #define UNROLL
 #else
 #define UNROLL _Pragma("unroll")
diff --git a/tensorflow/core/kernels/population_count_op.cc b/tensorflow/core/kernels/population_count_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..12ff6b69f87271cee82f39d4b681a46551c26028
--- /dev/null
+++ b/tensorflow/core/kernels/population_count_op.cc
@@ -0,0 +1,163 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/math_ops.cc
+
+#define EIGEN_USE_THREADS
+
+#include <bitset>
+
+#include "tensorflow/core/kernels/population_count_op.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class PopulationCountOp : public OpKernel {
+ public:
+  explicit PopulationCountOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* c) override {
+    const Tensor& input_t = c->input(0);
+    Tensor* output_t;
+    OP_REQUIRES_OK(c, c->allocate_output(0, input_t.shape(), &output_t));
+
+    auto input = input_t.flat<T>();
+    auto output = output_t->flat<uint8>();
+
+    functor::PopulationCount<Device, T> popcnt;
+    popcnt(c, input, output);
+  }
+};
+
+#define REGISTER_POPULATION_COUNT(type)                                     \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("PopulationCount").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      PopulationCountOp<CPUDevice, type>);
+
+TF_CALL_uint8(REGISTER_POPULATION_COUNT);
+TF_CALL_int8(REGISTER_POPULATION_COUNT);
+TF_CALL_uint16(REGISTER_POPULATION_COUNT);
+TF_CALL_int16(REGISTER_POPULATION_COUNT);
+TF_CALL_int32(REGISTER_POPULATION_COUNT);
+TF_CALL_int64(REGISTER_POPULATION_COUNT);
+
+#undef REGISTER_POPULATION_COUNT
+
+namespace functor {
+
+namespace {
+
+template <typename T>
+inline uint8 PopCnt(const T v);
+
+#define POPCNT(T, N)                  \
+  template <>                         \
+  uint8 PopCnt<T>(const T v) {        \
+    return std::bitset<N>(v).count(); \
+  }
+
+POPCNT(int8, 8);
+POPCNT(uint8, 8);
+POPCNT(int16, 16);
+POPCNT(uint16, 16);
+POPCNT(int32, 32);
+POPCNT(int64, 64);
+
+#undef POPCNT
+
+}  // namespace
+
+template <typename T>
+struct PopulationCount<CPUDevice, T> {
+  void operator()(OpKernelContext* c, typename TTypes<T>::ConstFlat input,
+                  TTypes<uint8>::Flat output) {
+    const T* input_ptr = input.data();
+    uint8* output_ptr = output.data();
+    auto shard = [input_ptr, output_ptr](int64 start, int64 limit) {
+      for (int64 i = start; i < limit; ++i) {
+        output_ptr[i] = PopCnt<T>(input_ptr[i]);
+      }
+    };
+    int64 total_shards = input.size();
+    // Approximating cost of popcnt: convert T to int64
+    // (std::bitset constructor) and convert int64 to uint8
+    // (bitset.count() -> output).  The .count() itself is relatively cheap.
+    const double total_cost = (Eigen::TensorOpCost::CastCost<T, uint8>() +
+                               Eigen::TensorOpCost::CastCost<int64, uint8>());
+    const int64 shard_cost = (total_cost >= static_cast<double>(kint64max))
+                                 ? kint64max
+                                 : static_cast<int64>(total_cost);
+
+    auto worker_threads = *(c->device()->tensorflow_cpu_worker_threads());
+    Shard(worker_threads.num_threads, worker_threads.workers, total_shards,
+          shard_cost, shard);
+  }
+};
+
+}  // namespace functor
+
+#if GOOGLE_CUDA
+
+#define REGISTER_POPULATION_COUNT(type)                                     \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("PopulationCount").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+      PopulationCountOp<GPUDevice, type>)
+
+TF_CALL_uint8(REGISTER_POPULATION_COUNT);
+TF_CALL_int8(REGISTER_POPULATION_COUNT);
+TF_CALL_uint16(REGISTER_POPULATION_COUNT);
+TF_CALL_int16(REGISTER_POPULATION_COUNT);
+TF_CALL_int32(REGISTER_POPULATION_COUNT);
+TF_CALL_int64(REGISTER_POPULATION_COUNT);
+
+#undef REGISTER_POPULATION_COUNT
+
+namespace functor {
+
+#define DECLARE_GPU_SPEC(T)                                    \
+  template <>                                                  \
+  void PopulationCount<GPUDevice, T>::operator()(              \
+      OpKernelContext* c, typename TTypes<T>::ConstFlat input, \
+      TTypes<uint8>::Flat output);                             \
+  extern template struct PopulationCount<GPUDevice, T>
+
+TF_CALL_uint8(DECLARE_GPU_SPEC);
+TF_CALL_int8(DECLARE_GPU_SPEC);
+TF_CALL_uint16(DECLARE_GPU_SPEC);
+TF_CALL_int16(DECLARE_GPU_SPEC);
+TF_CALL_int32(DECLARE_GPU_SPEC);
+TF_CALL_int64(DECLARE_GPU_SPEC);
+
+#undef DECLARE_GPU_SPEC
+
+}  // namespace functor
+
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/population_count_op.h b/tensorflow/core/kernels/population_count_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..de89582e139b03de48719749ef29a0d3bb638e0e
--- /dev/null
+++ b/tensorflow/core/kernels/population_count_op.h
@@ -0,0 +1,38 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <typename Device, typename T>
+struct PopulationCount {
+  void operator()(OpKernelContext* c, typename TTypes<T>::ConstFlat input,
+                  TTypes<uint8>::Flat output);
+};
+
+}  // namespace functor
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_
diff --git a/tensorflow/core/kernels/population_count_op_gpu.cu.cc b/tensorflow/core/kernels/population_count_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..27a687ba409fcc359e7fb3c6be2b4917b40fe60e
--- /dev/null
+++ b/tensorflow/core/kernels/population_count_op_gpu.cu.cc
@@ -0,0 +1,92 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/population_count_op.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+template <typename T>
+__global__ void PopulationCountKernel(const int size, const T* input,
+                                      uint8* output) {
+  CUDA_1D_KERNEL_LOOP(i, size) { output[i] = __popc(ldg(input + i)); }
+}
+
+template <>
+__global__ void PopulationCountKernel(const int size, const int8* input,
+                                      uint8* output) {
+  // For some reason, __popc on a negative int8 gets confused.
+  CUDA_1D_KERNEL_LOOP(i, size) {
+    output[i] = __popc(ldg(reinterpret_cast<const uint8*>(input + i)));
+  }
+}
+
+template <>
+__global__ void PopulationCountKernel(const int size, const int16* input,
+                                      uint8* output) {
+  // For some reason, __popc on a negative int16 gets confused.
+  CUDA_1D_KERNEL_LOOP(i, size) {
+    output[i] = __popc(ldg(reinterpret_cast<const uint16*>(input + i)));
+  }
+}
+
+template <>
+__global__ void PopulationCountKernel<int64>(const int size, const int64* input,
+                                             uint8* output) {
+  CUDA_1D_KERNEL_LOOP(i, size) { output[i] = __popcll(ldg(input + i)); }
+}
+
+#define DEFINE_GPU_SPECS(T)                                               \
+  template <>                                                             \
+  void PopulationCount<GPUDevice, T>::operator()(                         \
+      OpKernelContext* c, typename TTypes<T>::ConstFlat input,            \
+      TTypes<uint8>::Flat output) {                                       \
+    const GPUDevice& d = c->eigen_device<GPUDevice>();                    \
+    int64 total_count = input.size();                                     \
+    CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);        \
+    PopulationCountKernel<T>                                              \
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>( \
+            total_count, input.data(), output.data());                    \
+  }
+
+TF_CALL_uint8(DEFINE_GPU_SPECS);
+TF_CALL_int8(DEFINE_GPU_SPECS);
+TF_CALL_uint16(DEFINE_GPU_SPECS);
+TF_CALL_int16(DEFINE_GPU_SPECS);
+TF_CALL_int32(DEFINE_GPU_SPECS);
+TF_CALL_int64(DEFINE_GPU_SPECS);
+
+#undef DEFINE_GPU_SPECS
+
+}  // namespace functor
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/priority_queue.cc b/tensorflow/core/kernels/priority_queue.cc
index 9cbd832957c44a5aebcddf9412635f0cbcfa13d7..4c406fc1ed9f86477a7c0eb7c88f7dd7833f796c 100644
--- a/tensorflow/core/kernels/priority_queue.cc
+++ b/tensorflow/core/kernels/priority_queue.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <queue>
 #include <vector>
 
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/core/kernels/quantization_utils.h b/tensorflow/core/kernels/quantization_utils.h
index cb4fcbd78831bea16e97edf55bf5e4e6f8c047e4..c5dc2e7194dd28b7c3b7f258b0ce91114c1ba845 100644
--- a/tensorflow/core/kernels/quantization_utils.h
+++ b/tensorflow/core/kernels/quantization_utils.h
@@ -823,9 +823,9 @@ void QuantizedAddUsingEigen(const Eigen::ThreadPoolDevice& device,
   const int64 input_element_count = input.NumElements();
   const int64 smaller_input_element_count = smaller_input.NumElements();
 
-  QuantizedToFloatStruct<T1> smaller_input_q2f(smaller_input_min,
+  QuantizedToFloatStruct<T1> input_q2f(input_min, input_max);
+  QuantizedToFloatStruct<T2> smaller_input_q2f(smaller_input_min,
                                                smaller_input_max);
-  QuantizedToFloatStruct<T2> input_q2f(input_min, input_max);
   FloatToQuantizedStruct<T3> f2q(*output_min, *output_max);
 
   auto smaller_input_float =
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.cc b/tensorflow/core/kernels/quantize_and_dequantize_op.cc
index 4dd5e24e5a162d332a6b50000dbe8e982ac68237..dadc15b69ee67b51be1647a1e8a6794e684bcff2 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op.cc
@@ -85,6 +85,64 @@ class QuantizeAndDequantizeV2Op : public OpKernel {
   bool range_given_;
 };
 
+// Simulate quantization precision loss in a float tensor by:
+// 1. Quantize the tensor to fixed point numbers, which should match the target
+//    quantization method when it is used in inference.
+// 2. Dequantize it back to floating point numbers for the following ops, most
+//    likely matmul.
+// Almost identical to QuantizeAndDequantizeV2Op, except that num_bits is a
+// tensor.
+template <typename Device, typename T>
+class QuantizeAndDequantizeV3Op : public OpKernel {
+ public:
+  explicit QuantizeAndDequantizeV3Op(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("signed_input", &signed_input_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("range_given", &range_given_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& input = ctx->input(0);
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output));
+
+    Tensor num_bits_tensor;
+    num_bits_tensor = ctx->input(3);
+    int num_bits_val = num_bits_tensor.scalar<int32>()();
+
+    OP_REQUIRES(
+        ctx, num_bits_val > 0 && num_bits_val < (signed_input_ ? 62 : 63),
+        errors::InvalidArgument("num_bits is out of range: ", num_bits_val,
+                                " with signed_input_ ", signed_input_));
+
+    Tensor input_min_tensor;
+    Tensor input_max_tensor;
+    if (range_given_) {
+      input_min_tensor = ctx->input(1);
+      input_max_tensor = ctx->input(2);
+      auto min_val = input_min_tensor.scalar<T>()();
+      auto max_val = input_max_tensor.scalar<T>()();
+      OP_REQUIRES(ctx, min_val <= max_val,
+                  errors::InvalidArgument("Invalid range: input_min ", min_val,
+                                          " > input_max ", max_val));
+    } else {
+      OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
+                                             TensorShape(), &input_min_tensor));
+      OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
+                                             TensorShape(), &input_max_tensor));
+    }
+
+    functor::QuantizeAndDequantizeOneScaleFunctor<Device, T> f;
+    f(ctx->eigen_device<Device>(), input.flat<T>(), signed_input_, num_bits_val,
+      range_given_, &input_min_tensor, &input_max_tensor, output->flat<T>());
+  }
+
+ private:
+  bool signed_input_;
+  bool range_given_;
+};
+
 // DEPRECATED: Use QuantizeAndDequantizeV2Op.
 template <typename Device, typename T>
 class QuantizeAndDequantizeOp : public OpKernel {
@@ -153,6 +211,10 @@ struct QuantizeAndDequantizeOneScaleFunctor<CPUDevice, T> {
                               .Device(DEVICE_CPU)                              \
                               .TypeConstraint<T>("T"),                         \
                           QuantizeAndDequantizeV2Op<CPUDevice, T>);            \
+  REGISTER_KERNEL_BUILDER(Name("QuantizeAndDequantizeV3")                      \
+                              .Device(DEVICE_CPU)                              \
+                              .TypeConstraint<T>("T"),                         \
+                          QuantizeAndDequantizeV3Op<CPUDevice, T>);            \
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("QuantizeAndDequantize").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
       QuantizeAndDequantizeOp<CPUDevice, T>);
@@ -168,6 +230,13 @@ TF_CALL_double(REGISTER_CPU_KERNEL);
                               .HostMemory("input_min")                         \
                               .TypeConstraint<T>("T"),                         \
                           QuantizeAndDequantizeV2Op<GPUDevice, T>);            \
+  REGISTER_KERNEL_BUILDER(Name("QuantizeAndDequantizeV3")                      \
+                              .Device(DEVICE_GPU)                              \
+                              .HostMemory("input_max")                         \
+                              .HostMemory("input_min")                         \
+                              .HostMemory("num_bits")                          \
+                              .TypeConstraint<T>("T"),                         \
+                          QuantizeAndDequantizeV3Op<GPUDevice, T>);            \
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("QuantizeAndDequantize").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
       QuantizeAndDequantizeOp<GPUDevice, T>);
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
index ccd0d7203aae6144bf3e9c7ae6fb870425cc262e..7c55958cc2835aea51bbb3e96bc1da5f6c17f64a 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
@@ -62,6 +62,32 @@ TEST_F(QuantizeAndDequantizeTest, Convert_scalar_tensor) {
   EXPECT_EQ(inputs_[2]->scalar<float>()(), 0.0);
 }
 
+TEST_F(QuantizeAndDequantizeTest, Convert_scalar_tensor_V3) {
+  TF_ASSERT_OK(
+      NodeDefBuilder("quantize_and_dequantize_op", "QuantizeAndDequantizeV3")
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_INT32))
+          .Attr("signed_input", true)
+          .Attr("range_given", false)
+          .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  AddInputFromArray<float>(TensorShape({1}), {-3.5});
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
+  AddInputFromArray<int32>(TensorShape({}), {8});    // num_bits
+
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({1}));
+  test::FillValues<float>(&expected, {-3.5});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+
+  // Ensure that the inputs haven't been changed.
+  EXPECT_EQ(inputs_[1]->scalar<float>()(), 0.0);
+  EXPECT_EQ(inputs_[2]->scalar<float>()(), 0.0);
+}
+
 // Convert a 1D tensor with signed 8 bits.
 TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int8) {
   TF_ASSERT_OK(
@@ -92,6 +118,37 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int8) {
   EXPECT_EQ(inputs_[2]->scalar<float>()(), 0.0);
 }
 
+// Convert a 1D tensor with signed 8 bits.
+TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int8_V3) {
+  TF_ASSERT_OK(
+      NodeDefBuilder("quantize_and_dequantize_op", "QuantizeAndDequantizeV3")
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_INT32))
+          .Attr("signed_input", true)
+          .Attr("range_given", false)
+          .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  AddInputFromArray<float>(TensorShape({6}), {-1, -0.5, 0, 0.3, 0.8, 0.555});
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
+  AddInputFromArray<int32>(TensorShape({}), {8});    // num_bits
+
+  // With int8, the tensor is quantized to {-127, -63, 0, 38, 102, 70}.
+  // Scale is: 1/127
+  // Then it is dequantized to {-1, -63.0/127, 0, 38.0/127, 102.0/127, 70.0/127}
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
+  test::FillValues<float>(
+      &expected, {-1, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127, 70.0 / 127});
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+
+  // Ensure that the inputs haven't been changed.
+  EXPECT_EQ(inputs_[1]->scalar<float>()(), 0.0);
+  EXPECT_EQ(inputs_[2]->scalar<float>()(), 0.0);
+}
+
 // Convert a 1D tensor with signed 4 bits.
 TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4) {
   TF_ASSERT_OK(
@@ -121,6 +178,36 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4) {
   EXPECT_EQ(inputs_[2]->scalar<float>()(), 0.0);
 }
 
+// Convert a 1D tensor with signed 4 bits.
+TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4_V3) {
+  TF_ASSERT_OK(
+      NodeDefBuilder("quantize_and_dequantize_op", "QuantizeAndDequantizeV3")
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_INT32))
+          .Attr("signed_input", true)
+          .Attr("range_given", false)
+          .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  AddInputFromArray<float>(TensorShape({6}), {-1, -0.5, 0, 0.3, 0.8, 0.555});
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
+  AddInputFromArray<int32>(TensorShape({}), {4});    // num_bits
+
+  // With int4, the tensor is quantized to {-7, -3, 0, 2, 6, 4}.
+  // Scale is: 1/7
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
+  test::FillValues<float>(&expected,
+                          {-1, -3.0 / 7, 0, 2.0 / 7, 6.0 / 7, 4.0 / 7});
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+
+  // Ensure that the inputs haven't been changed.
+  EXPECT_EQ(inputs_[1]->scalar<float>()(), 0.0);
+  EXPECT_EQ(inputs_[2]->scalar<float>()(), 0.0);
+}
+
 // Convert a 2D tensor with signed 8 bits with given range.
 TEST_F(QuantizeAndDequantizeTest, Convert_2D_tensor_with_int8_range_given) {
   TF_ASSERT_OK(
@@ -150,6 +237,36 @@ TEST_F(QuantizeAndDequantizeTest, Convert_2D_tensor_with_int8_range_given) {
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 }
 
+// Convert a 2D tensor with signed 8 bits with given range.
+TEST_F(QuantizeAndDequantizeTest, Convert_2D_tensor_with_int8_range_given_V3) {
+  TF_ASSERT_OK(
+      NodeDefBuilder("quantize_and_dequantize_op", "QuantizeAndDequantizeV3")
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_INT32))
+          .Attr("signed_input", true)
+          .Attr("range_given", true)
+          .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  // Note that the last two values are saturated.
+  AddInputFromArray<float>(TensorShape({2, 4}),
+                           {-0.8, -0.5, 0, 0.3, 0.8, 0.555, -2, 33});
+  AddInputFromArray<float>(TensorShape({}), {-1.0});  // Min
+  AddInputFromArray<float>(TensorShape({}), {1.0});   // Max
+  AddInputFromArray<int32>(TensorShape({}), {8});     // num_bits
+
+  // Note that the range is given as [-1, 1].
+  // With int8, the tensor is quantized to {-102, -63, 0, 38, 102, 70, -127,
+  // 127}.
+  // Scale is: 1/127
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 4}));
+  test::FillValues<float>(&expected, {-102.0 / 127, -63.0 / 127, 0, 38.0 / 127,
+                                      102.0 / 127, 70.0 / 127, -1, 1});
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+}
+
 // Convert a 4D tensor with unsigned 8 bits with given range.
 TEST_F(QuantizeAndDequantizeTest, Convert_4D_tensor_with_uint8_range_given) {
   TF_ASSERT_OK(
@@ -175,6 +292,32 @@ TEST_F(QuantizeAndDequantizeTest, Convert_4D_tensor_with_uint8_range_given) {
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 }
 
+// Convert a 4D tensor with unsigned 8 bits with given range.
+TEST_F(QuantizeAndDequantizeTest, Convert_4D_tensor_with_uint8_range_given_V3) {
+  TF_ASSERT_OK(
+      NodeDefBuilder("quantize_and_dequantize_op", "QuantizeAndDequantizeV3")
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_INT32))
+          .Attr("signed_input", false)
+          .Attr("range_given", true)
+          .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  AddInputFromArray<float>(TensorShape({2, 2, 1, 1}), {-0.5, 0, 0.3, 0.8});
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
+  AddInputFromArray<float>(TensorShape({}), {1.0});  // Max
+  AddInputFromArray<int32>(TensorShape({}), {8});    // num_bits
+
+  // Note that the range is given as [0, 1].
+  // With int8, the tensor is quantized to {0, 0, 77, 204}
+  // Scale is: 1/255
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 1, 1}));
+  test::FillValues<float>(&expected, {0, 0, 77.0 / 255, 204.0 / 255});
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+}
+
 // Convert a tensor with all 0.
 TEST_F(QuantizeAndDequantizeTest, Convert_tensor_with_all_0) {
   TF_ASSERT_OK(
@@ -197,6 +340,29 @@ TEST_F(QuantizeAndDequantizeTest, Convert_tensor_with_all_0) {
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 }
 
+// Convert a tensor with all 0.
+TEST_F(QuantizeAndDequantizeTest, Convert_tensor_with_all_0_V3) {
+  TF_ASSERT_OK(
+      NodeDefBuilder("quantize_and_dequantize_op", "QuantizeAndDequantizeV3")
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_INT32))
+          .Attr("signed_input", false)
+          .Attr("range_given", false)
+          .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  AddInputFromArray<float>(TensorShape({2, 2, 1, 1}), {0, 0, 0, 0});
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
+  AddInputFromArray<int32>(TensorShape({}), {8});    // num_bits
+
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 1, 1}));
+  test::FillValues<float>(&expected, {0, 0, 0, 0});
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+}
+
 // Range is invalid
 TEST_F(QuantizeAndDequantizeTest, Invalid_range_given) {
   TF_ASSERT_OK(
@@ -218,6 +384,28 @@ TEST_F(QuantizeAndDequantizeTest, Invalid_range_given) {
       << s;
 }
 
+// Range is invalid
+TEST_F(QuantizeAndDequantizeTest, Invalid_range_given_V3) {
+  TF_ASSERT_OK(
+      NodeDefBuilder("quantize_and_dequantize_Op", "QuantizeAndDequantizeV3")
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_INT32))
+          .Attr("range_given", true)
+          .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  AddInputFromArray<float>(TensorShape({2, 2, 1, 1}), {-0.5, 0, 0.3, 0.8});
+  AddInputFromArray<float>(TensorShape({}), {1.0});  // Min
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
+  AddInputFromArray<int32>(TensorShape({}), {8});    // num_bits
+
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString())
+                  .contains("Invalid range: input_min 1 > input_max 0"))
+      << s;
+}
+
 #define BM_SIMPLE_QUAN_DEQUAN(DEVICE)                           \
   static void BM_SIMPLE_QUAN_DEQUAN_##DEVICE(int iters) {       \
     auto root = Scope::NewRootScope().ExitOnError();            \
diff --git a/tensorflow/core/kernels/quantized_conv_ops.cc b/tensorflow/core/kernels/quantized_conv_ops.cc
index 56a7e161df442de2a00441807ff10d2b759c4479..3b0764bb9bf9ff00c71173c53cdb78b6ab3ac6ca 100644
--- a/tensorflow/core/kernels/quantized_conv_ops.cc
+++ b/tensorflow/core/kernels/quantized_conv_ops.cc
@@ -211,7 +211,7 @@ class Im2ColConvFunctor {
         ++warning_count;
         LOG(WARNING)
             << "For kernel '" << context->op_kernel().name() << "' from input '"
-            << context->op_kernel().def().input(0)
+            << context->op_kernel().requested_input(0)
             << "': Zero is not representable in the quantized range used by the"
             << " input. This means QuantizedConv2d has to fall back to a slow"
             << " implementation, since the border of zero values can't be"
@@ -381,7 +381,7 @@ class Im2ColConvFunctor {
       if (meta::IsSupportedAndEnabled() && std::is_same<T1, quint8>() &&
           std::is_same<T2, quint8>() && std::is_same<T3, qint32>() &&
           (output_offset == 0) && (output_mult == 1) && (output_shift == 0) &&
-          (transpose_c == false)) {
+          (transpose_c == false) && (k <= 2048)) {
         meta::QuantizedGemm(context, transpose_a, transpose_b, im2col_buffer,
                             filter_data, chunk_output_data, m, n, k,
                             -input_offset, -filter_offset, lda, ldb, ldc);
diff --git a/tensorflow/core/kernels/quantized_matmul_op.cc b/tensorflow/core/kernels/quantized_matmul_op.cc
index 86c722e5875cf93b7924e89ed2db5ace36105a16..afb30d5f627feab1a009ec84c5f0bb9f851766e0 100644
--- a/tensorflow/core/kernels/quantized_matmul_op.cc
+++ b/tensorflow/core/kernels/quantized_matmul_op.cc
@@ -135,7 +135,7 @@ class QuantizedMatMulOp : public OpKernel {
     if (meta::IsSupportedAndEnabled() && std::is_same<T1, quint8>() &&
         std::is_same<T2, quint8>() && std::is_same<Toutput, qint32>() &&
         (offset_c == 0) && (mult_c == 1) && (shift_c == 0) &&
-        (transpose_c == false)) {
+        (transpose_c == false) && (k <= 2048)) {
       // Gemmlowp/meta code path works on 32 & 64 bit Arm with NEON Simd and
       // allows optimized quantized 8bit to 32bit gemm.
       meta::QuantizedGemm(context, transpose_a_, transpose_b_, a_data, b_data,
diff --git a/tensorflow/core/kernels/queue_base.cc b/tensorflow/core/kernels/queue_base.cc
index 07ff70a87524232d38d96528fa49092e70221e82..8a9af39e1f7af5483bc72023915dfd408907a99a 100644
--- a/tensorflow/core/kernels/queue_base.cc
+++ b/tensorflow/core/kernels/queue_base.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/queue_base.h"
 
 #include <vector>
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/mutex.h"
diff --git a/tensorflow/core/kernels/queue_base.h b/tensorflow/core/kernels/queue_base.h
index 0a0e51a7c3c847631326694feb27e220149cfbd9..c101fb35791eafa109f1a360fe63051398d48de5 100644
--- a/tensorflow/core/kernels/queue_base.h
+++ b/tensorflow/core/kernels/queue_base.h
@@ -69,7 +69,7 @@ class QueueBase : public QueueInterface {
 
   int32 capacity() const { return capacity_; }
 
-  bool closed() {
+  bool is_closed() const override {
     mutex_lock lock(mu_);
     return closed_;
   }
diff --git a/tensorflow/core/kernels/queue_ops.cc b/tensorflow/core/kernels/queue_ops.cc
index f2ac09c4e6df55e5794bee929f66e0ea9f8f8ac3..d51dc4ecb00f9501d544dbbbfbd4e92ebf515682 100644
--- a/tensorflow/core/kernels/queue_ops.cc
+++ b/tensorflow/core/kernels/queue_ops.cc
@@ -425,6 +425,27 @@ class QueueSizeOp : public QueueOpKernel {
 REGISTER_KERNEL_BUILDER(Name("QueueSize").Device(DEVICE_CPU), QueueSizeOp);
 REGISTER_KERNEL_BUILDER(Name("QueueSizeV2").Device(DEVICE_CPU), QueueSizeOp);
 
+class QueueIsClosedOp : public QueueOpKernel {
+ public:
+  explicit QueueIsClosedOp(OpKernelConstruction* context)
+     : QueueOpKernel(context) {}
+ 
+ protected:
+  void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
+                    DoneCallback callback) override {
+    Tensor* Tqueue_is_closed = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &Tqueue_is_closed));
+    Tqueue_is_closed->flat<bool>().setConstant(queue->is_closed());
+    callback();
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(QueueIsClosedOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("QueueIsClosed").Device(DEVICE_CPU), QueueIsClosedOp);
+REGISTER_KERNEL_BUILDER(Name("QueueIsClosedV2").Device(DEVICE_CPU), QueueIsClosedOp);
+
 class FakeQueueOp : public OpKernel {
  public:
   explicit FakeQueueOp(OpKernelConstruction* context) : OpKernel(context) {
diff --git a/tensorflow/core/kernels/random_poisson_op.cc b/tensorflow/core/kernels/random_poisson_op.cc
index 66123e47c6eaee57bb5a6166b748789ce188ba0f..b3957cbed60fc76c544cc5c2452cec08586f2e48 100644
--- a/tensorflow/core/kernels/random_poisson_op.cc
+++ b/tensorflow/core/kernels/random_poisson_op.cc
@@ -303,10 +303,6 @@ class RandomPoissonOp : public OpKernel {
 
     const auto rate_flat = rate_t.flat<T>().data();
     const int64 num_rate = rate_t.NumElements();
-    OP_REQUIRES(
-        ctx, num_rate > 0,
-        errors::InvalidArgument(
-            "Input rate should have non-zero element count, got: ", num_rate));
     auto samples_flat = samples_t->flat<T>().data();
     random::PhiloxRandom rng = generator_.ReserveRandomOutputs(
         num_samples * num_rate, kReservedSamplesPerOutput);
diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc
index d9efb5fe7dca3a0b863ef717d752ef52fd6e7291..30bbbd4aed6924972f914c42eb8b0a7b9239f7ae 100644
--- a/tensorflow/core/kernels/random_shuffle_queue_op.cc
+++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <deque>
 #include <vector>
 
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/core/kernels/reader_dataset_ops.cc b/tensorflow/core/kernels/reader_dataset_ops.cc
index e7f65c39cb94d8757cf9de042908b4228b2a3d36..c9da8b149e4060522df9be2c56a82e525514fad4 100644
--- a/tensorflow/core/kernels/reader_dataset_ops.cc
+++ b/tensorflow/core/kernels/reader_dataset_ops.cc
@@ -16,8 +16,12 @@ limitations under the License.
 
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/io/buffered_inputstream.h"
 #include "tensorflow/core/lib/io/inputbuffer.h"
+#include "tensorflow/core/lib/io/random_inputstream.h"
 #include "tensorflow/core/lib/io/record_reader.h"
+#include "tensorflow/core/lib/io/zlib_compression_options.h"
+#include "tensorflow/core/lib/io/zlib_inputstream.h"
 
 namespace tensorflow {
 
@@ -37,13 +41,39 @@ class TextLineDatasetOp : public OpKernel {
         ctx, filenames_tensor->dims() <= 1,
         errors::InvalidArgument("`filenames` must be a scalar or a vector."));
 
+    const Tensor* compression_type_tensor;
+    OP_REQUIRES_OK(ctx,
+                   ctx->input("compression_type", &compression_type_tensor));
+    OP_REQUIRES(
+        ctx, compression_type_tensor->dims() == 0,
+        errors::InvalidArgument("`compression_type` must be a scalar."));
+    const string& compression_type =
+        compression_type_tensor->scalar<string>()();
+
+    io::ZlibCompressionOptions zlib_compression_options =
+        io::ZlibCompressionOptions::DEFAULT();
+    bool use_compression = false;
+    if (compression_type.empty()) {
+      use_compression = false;
+    } else if (compression_type == "ZLIB") {
+      use_compression = true;
+      zlib_compression_options = io::ZlibCompressionOptions::DEFAULT();
+    } else if (compression_type == "GZIP") {
+      use_compression = true;
+      zlib_compression_options = io::ZlibCompressionOptions::GZIP();
+    } else {
+      OP_REQUIRES(ctx, compression_type.empty(),
+                  errors::InvalidArgument("Unsupported compression_type."));
+    }
+
     std::vector<string> filenames;
     filenames.reserve(filenames_tensor->NumElements());
     for (int i = 0; i < filenames_tensor->NumElements(); ++i) {
       filenames.push_back(filenames_tensor->flat<string>()(i));
     }
 
-    DatasetBase* dataset = new Dataset(std::move(filenames));
+    DatasetBase* dataset = new Dataset(std::move(filenames), use_compression,
+                                       zlib_compression_options);
     Tensor* output = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
     ResourceHandle handle = MakeResourceHandle<DatasetBase>(
@@ -55,8 +85,11 @@ class TextLineDatasetOp : public OpKernel {
  private:
   class Dataset : public DatasetBase {
    public:
-    explicit Dataset(std::vector<string> filenames)
-        : filenames_(std::move(filenames)) {}
+    explicit Dataset(std::vector<string> filenames, bool use_compression,
+                     io::ZlibCompressionOptions options)
+        : filenames_(std::move(filenames)),
+          use_compression_(use_compression),
+          options_(options) {}
 
     std::unique_ptr<IteratorBase> MakeIterator() const override {
       return std::unique_ptr<IteratorBase>(new Iterator(this));
@@ -86,9 +119,10 @@ class TextLineDatasetOp : public OpKernel {
         mutex_lock l(mu_);
         do {
           // We are currently processing a file, so try to read the next line.
-          if (input_buffer_) {
+          if (processing_file_) {
             string line_contents;
-            Status s = input_buffer_->ReadLine(&line_contents);
+            Status s = buffered_input_stream_->ReadLine(&line_contents);
+
             if (s.ok()) {
               // Produce the line as output.
               Tensor line_tensor(cpu_allocator(), DT_STRING, {});
@@ -103,7 +137,10 @@ class TextLineDatasetOp : public OpKernel {
 
             // We have reached the end of the current file, so maybe
             // move on to next file.
-            input_buffer_.reset();
+            processing_file_ = false;
+            input_stream_.reset();
+            zlib_input_stream_.reset();
+            buffered_input_stream_.reset();
             file_.reset();
             ++current_file_index_;
           }
@@ -117,7 +154,19 @@ class TextLineDatasetOp : public OpKernel {
           // Actually move on to next file.
           TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile(
               dataset()->filenames_[current_file_index_], &file_));
-          input_buffer_.reset(new io::InputBuffer(file_.get(), kBufferSize));
+          processing_file_ = true;
+          input_stream_.reset(
+              new io::RandomAccessInputStream(file_.get(), false));
+          if (dataset()->use_compression_) {
+            zlib_input_stream_.reset(
+                new io::ZlibInputStream(input_stream_.get(), kBufferSize,
+                                        kBufferSize, dataset()->options_));
+            buffered_input_stream_.reset(new io::BufferedInputStream(
+                zlib_input_stream_.get(), kBufferSize, false));
+          } else {
+            buffered_input_stream_.reset(new io::BufferedInputStream(
+                input_stream_.get(), kBufferSize, false));
+          }
         } while (true);
       }
 
@@ -127,13 +176,20 @@ class TextLineDatasetOp : public OpKernel {
       enum { kBufferSize = 256 << 10 /* 256 kB */ };
 
       mutex mu_;
+      bool processing_file_ GUARDED_BY(mu_) = false;
+      std::unique_ptr<io::RandomAccessInputStream> input_stream_
+          GUARDED_BY(mu_);
+      std::unique_ptr<io::ZlibInputStream> zlib_input_stream_ GUARDED_BY(mu_);
+      std::unique_ptr<io::BufferedInputStream> buffered_input_stream_
+          GUARDED_BY(mu_);
       size_t current_file_index_ GUARDED_BY(mu_) = 0;
       std::unique_ptr<RandomAccessFile> file_
-          GUARDED_BY(mu_);  // must outlive input_buffer_
-      std::unique_ptr<io::InputBuffer> input_buffer_ GUARDED_BY(mu_);
+          GUARDED_BY(mu_);  // must outlive input_stream_
     };
 
     const std::vector<string> filenames_;
+    bool use_compression_;
+    io::ZlibCompressionOptions options_;
   };
 };
 
diff --git a/tensorflow/core/kernels/reader_ops.cc b/tensorflow/core/kernels/reader_ops.cc
index e2eb40677b44bdfa1da67662d3f290a044422272..abd16de6a1c4301699baadacd57a7f1818204eee 100644
--- a/tensorflow/core/kernels/reader_ops.cc
+++ b/tensorflow/core/kernels/reader_ops.cc
@@ -50,8 +50,7 @@ class ReaderVerbAsyncOpKernel : public AsyncOpKernel {
       : AsyncOpKernel(context),
         thread_pool_(new thread::ThreadPool(
             context->env(), ThreadOptions(),
-            strings::StrCat("reader_thread_",
-                            SanitizeThreadSuffix(def().name())),
+            strings::StrCat("reader_thread_", SanitizeThreadSuffix(name())),
             1 /* num_threads */, false /* low_latency_hint */)) {}
 
   void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
diff --git a/tensorflow/core/kernels/record_yielder.cc b/tensorflow/core/kernels/record_yielder.cc
index 3af555da1aaed43eacceb0b8f3f5907ef6dccacc..e4fa0ed322df57789f95efe584fe91a3efe561ec 100644
--- a/tensorflow/core/kernels/record_yielder.cc
+++ b/tensorflow/core/kernels/record_yielder.cc
@@ -46,7 +46,7 @@ RecordYielder::~RecordYielder() {
 
 Status RecordYielder::YieldOne(string* value) {
   mutex_lock l(mu_);
-  while (!BufEnough()) {
+  while (!BufEnough() && status_.ok()) {
     buf_enough_.wait(l);
   }
   if (status_.ok()) {
@@ -98,17 +98,22 @@ void RecordYielder::MainLoop() {
   while (true) {
     ++epoch_;
     num_records_yielded_in_epoch_ = 0;
+    num_records_added_in_epoch_ = 0;
 
     // Finds all files.
     std::vector<string> filenames;
     Status s = MatchFiles(opts_.file_pattern, &filenames);
-    if (ShouldFinish(s)) break;
 
     if (filenames.empty()) {
       s = errors::NotFound("Found no files at ", opts_.file_pattern);
-      if (ShouldFinish(s)) break;
+      if (ShouldFinish(s)) {
+        buf_enough_.notify_all();
+        break;
+      }
     }
 
+    if (ShouldFinish(s)) break;
+
     // Shuffles these files according to the epoch # and random seed.
     std::mt19937_64 shuffle_rnd(
         Hash64(reinterpret_cast<char*>(&epoch_), sizeof(epoch_), opts_.seed));
@@ -139,7 +144,16 @@ void RecordYielder::MainLoop() {
       shards[i].done.WaitForNotification();
       s.Update(shards[i].status);
     }
-    if (ShouldFinish(s)) break;
+
+    if (num_records_added_in_epoch_ < opts_.bufsize) {
+      mutex_lock l(mu_);
+      opts_.bufsize = num_records_added_in_epoch_;
+    }
+
+    if (ShouldFinish(s)) {
+      buf_enough_.notify_all();
+      break;
+    }
 
     // Starts the next epoch once all buffered records are consumed.
     {
@@ -173,6 +187,7 @@ bool RecordYielder::Add(std::vector<string>* values) {
       buf_[index] = std::move(values->back());
     }
     values->pop_back();
+    num_records_added_in_epoch_++;
   }
   if (BufEnough()) {
     buf_enough_.notify_all();
diff --git a/tensorflow/core/kernels/record_yielder.h b/tensorflow/core/kernels/record_yielder.h
index 44f7c9511f1d39aba5a24435b5028c6d93bedf3d..d86cb75c15fabdcb7ab3e794e6f8f09c52b2c745 100644
--- a/tensorflow/core/kernels/record_yielder.h
+++ b/tensorflow/core/kernels/record_yielder.h
@@ -119,6 +119,7 @@ class RecordYielder {
   // True iff we are draining an epoch.
   bool epoch_end_ = false;
 
+  int64 num_records_added_in_epoch_ = 0;
   int64 num_records_yielded_in_epoch_ = 0;
 
   // Trigger when the main loop has exited.
diff --git a/tensorflow/core/kernels/reduction_ops_common.cc b/tensorflow/core/kernels/reduction_ops_common.cc
index 0cb63abf2a276afb4a4da99720ba762ac340fad5..5eba4288acccfb2465e9c483e9fefda4adf68185 100644
--- a/tensorflow/core/kernels/reduction_ops_common.cc
+++ b/tensorflow/core/kernels/reduction_ops_common.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/reduction_ops_common.h"
 
+#include "tensorflow/core/lib/strings/str_util.h"
+
 namespace tensorflow {
 
 TensorShape ReductionHelper::out_reshape() const {
diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc
index d8d30e87e2237cfb15933a661600cd2e5409632f..afad288cc00e0c3934318834d8dae8c181541212 100644
--- a/tensorflow/core/kernels/relu_op.cc
+++ b/tensorflow/core/kernels/relu_op.cc
@@ -50,15 +50,21 @@ typedef Eigen::SyclDevice SYCLDevice;
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_RELU_KERNELS);
 #undef REGISTER_RELU_KERNELS
 
-#define REGISTER_ELU_KERNELS(type)                                  \
-  REGISTER_KERNEL_BUILDER(                                          \
-      Name("Elu").Device(DEVICE_CPU).TypeConstraint<type>("T"),     \
-      EluOp<CPUDevice, type>);                                      \
-  REGISTER_KERNEL_BUILDER(                                          \
-      Name("EluGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
-      EluGradOp<CPUDevice, type>)
-
-// Elu only makes sense with float or double.
+#define REGISTER_ELU_KERNELS(type)                                   \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("Elu").Device(DEVICE_CPU).TypeConstraint<type>("T"),      \
+      EluOp<CPUDevice, type>);                                       \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("EluGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"),  \
+      EluGradOp<CPUDevice, type>);                                   \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("Selu").Device(DEVICE_CPU).TypeConstraint<type>("T"),     \
+      SeluOp<CPUDevice, type>);                                      \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("SeluGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      SeluGradOp<CPUDevice, type>)
+
+// Elu and Selu only make sense with float or double.
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_ELU_KERNELS);
 #undef REGISTER_ELU_KERNELS
 
@@ -103,7 +109,23 @@ namespace functor {
       const GPUDevice& d, typename TTypes<T>::ConstTensor gradients,           \
       typename TTypes<T>::ConstTensor activations,                             \
       typename TTypes<T>::Tensor backprops);                                   \
-  extern template struct EluGrad<GPUDevice, T>;
+  extern template struct EluGrad<GPUDevice, T>;                                \
+                                                                               \
+  template <>                                                                  \
+  void Selu<GPUDevice, T>::operator()(                                         \
+      const GPUDevice& d,                                                      \
+      typename TTypes<T>::ConstTensor features,                                \
+      typename TTypes<T>::Tensor activations);                                 \
+  extern template struct Selu<GPUDevice, T>;                                   \
+                                                                               \
+  template <>                                                                  \
+  void SeluGrad<GPUDevice, T>::operator()(                                     \
+      const GPUDevice& d, typename TTypes<T>::ConstTensor gradients,           \
+      typename TTypes<T>::ConstTensor activations,                             \
+      typename TTypes<T>::Tensor backprops);                                   \
+  extern template struct SeluGrad<GPUDevice, T>;
+
+
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 }  // namespace functor
@@ -127,7 +149,15 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
       EluOp<GPUDevice, type>);                                        \
   REGISTER_KERNEL_BUILDER(                                            \
       Name("EluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),   \
-      EluGradOp<GPUDevice, type>)
+      EluGradOp<GPUDevice, type>);                                    \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("Selu").Device(DEVICE_GPU).TypeConstraint<type>("T"),      \
+      SeluOp<GPUDevice, type>);                                       \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("SeluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),  \
+      SeluGradOp<GPUDevice, type>)
+
+
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
@@ -154,7 +184,15 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
       EluOp<SYCLDevice, type>);                                        \
   REGISTER_KERNEL_BUILDER(                                             \
       Name("EluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),   \
-      EluGradOp<SYCLDevice, type>)
+      EluGradOp<SYCLDevice, type>);                                    \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("Selu").Device(DEVICE_SYCL).TypeConstraint<type>("T"),      \
+      SeluOp<SYCLDevice, type>);                                       \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("SeluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),  \
+      SeluGradOp<SYCLDevice, type>)
+
+
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS);
 #undef REGISTER_SYCL_KERNELS
diff --git a/tensorflow/core/kernels/relu_op.h b/tensorflow/core/kernels/relu_op.h
index 365c6201a54951be0cfb26c7a7e9a4ecccf296ed..e712b02bd7849be968e8e3d429e45ca81efd247f 100644
--- a/tensorflow/core/kernels/relu_op.h
+++ b/tensorflow/core/kernels/relu_op.h
@@ -173,6 +173,48 @@ void EluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
           output->flat<T>());
 }
 
+template <typename Device, typename T>
+class SeluOp : public UnaryElementWiseOp<T, SeluOp<Device, T>> {
+ public:
+  using UnaryElementWiseOp<T, SeluOp<Device, T>>::UnaryElementWiseOp;
+
+  void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
+    functor::Selu<Device, T> functor;
+    functor(context->eigen_device<Device>(), input.flat<T>(),
+            output->flat<T>());
+  }
+};
+
+template <typename Device, typename T>
+class SeluGradOp : public BinaryElementWiseOp<T, SeluGradOp<Device, T>> {
+ public:
+  using BinaryElementWiseOp<T, SeluGradOp<Device, T>>::BinaryElementWiseOp;
+
+  void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
+                         const Tensor& a, Tensor* output);
+
+  // INPUTS:
+  //   g (gradients): backpropagated gradients
+  //   a (outputs): outputs of the SeluOp()
+  // OUTPUT:
+  //   gradients to backprop
+  template <int NDIMS>
+  void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
+               Tensor* output) {
+    OperateNoTemplate(context, g, a, output);
+  }
+};
+
+template <typename Device, typename T>
+void SeluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
+                                              const Tensor& g, const Tensor& a,
+                                              Tensor* output) {
+  if (!ReluHelpers::ValidateSameSize(context, g, a)) return;
+  functor::SeluGrad<Device, T> functor;
+  functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
+          output->flat<T>());
+}
+
 }  // namespace tensorflow
 
 #undef EIGEN_USE_THREADS
diff --git a/tensorflow/core/kernels/relu_op_functor.h b/tensorflow/core/kernels/relu_op_functor.h
index 633522920c84254e4e87cf64b7bd8e8c3d94341a..9577b963c6b6783f3b040078d0b042dd4e7e31bd 100644
--- a/tensorflow/core/kernels/relu_op_functor.h
+++ b/tensorflow/core/kernels/relu_op_functor.h
@@ -125,6 +125,46 @@ struct EluGrad {
   }
 };
 
+// Functor used by SeluOp to do the computations.
+template <typename Device, typename T>
+struct Selu {
+  // Computes Selu activation.
+  //
+  // features: any shape.
+  // activations: same shape as "features".
+  void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
+                  typename TTypes<T>::Tensor activations) {
+    // features.constant(?)
+    const auto scale = static_cast<T>(1.0507009873554804934193349852946);
+    const auto scale_alpha = static_cast<T>(1.7580993408473768599402175208123);
+    const auto one = static_cast<T>(1);
+    const auto zero = static_cast<T>(0);
+    activations.device(d) =
+        (features < zero)
+            .select(scale_alpha * (features.exp() - features.constant(one)),
+                    scale * features);
+  }
+};
+
+// Functor used by SeluGradOp to do the computations.
+template <typename Device, typename T>
+struct SeluGrad {
+  // Computes SeluGrad backprops.
+  //
+  // gradients: gradients backpropagated to the Selu op.
+  // activations: outputs of the Selu op.
+  // backprops: gradients to backpropagate to the Selu inputs.
+  void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
+                  typename TTypes<T>::ConstTensor activations,
+                  typename TTypes<T>::Tensor backprops) {
+    const auto scale = static_cast<T>(1.0507009873554804934193349852946);
+    const auto scale_alpha = static_cast<T>(1.7580993408473768599402175208123);
+    backprops.device(d) =
+        (activations < static_cast<T>(0)).select(
+            gradients * (activations + scale_alpha), gradients * scale);
+  }
+};
+
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc
index 30c4a289f7f34ddbc837249e402c9755775026af..ec09d8dfea519a70474dca7d3167ba20d3d16d69 100644
--- a/tensorflow/core/kernels/relu_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc
@@ -35,7 +35,9 @@ typedef Eigen::GpuDevice GPUDevice;
   template struct functor::Relu6<GPUDevice, T>;     \
   template struct functor::Relu6Grad<GPUDevice, T>; \
   template struct functor::Elu<GPUDevice, T>;       \
-  template struct functor::EluGrad<GPUDevice, T>;
+  template struct functor::EluGrad<GPUDevice, T>;   \
+  template struct functor::Selu<GPUDevice, T>;      \
+  template struct functor::SeluGrad<GPUDevice, T>;
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_op.cc b/tensorflow/core/kernels/remote_fused_graph_execute_op.cc
index aa3835ecc569af95126b1799d8475cbad75e424a..8a25f5832972e311a656c0c059198e7cd8eb4084 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_op.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_op.cc
@@ -41,7 +41,8 @@ class RemoteFusedGraphExecuteOp : public OpKernel {
           RemoteFusedGraphExecuteUtils::GetExecutorBuildFunc(
               execute_info_.executor_name());
       if (build_func != nullptr) {
-        Status status = (*build_func)(&remote_fused_graph_executor_);
+        TF_CHECK_OK((*build_func)(&remote_fused_graph_executor_));
+        CHECK(remote_fused_graph_executor_->IsEnabled());
       } else {
         LOG(ERROR) << "Executor not found for "
                    << execute_info_.executor_name();
@@ -109,6 +110,12 @@ class RemoteFusedGraphExecuteOp : public OpKernel {
               TF_CHECK_OK(ctx->allocate_output(i, shape, &output));
               return output;
             });
+      } else {
+        // For compatibility purpose, returns an empty tensor with specified
+        // data type as output if no executor is used.
+        Tensor* output = nullptr;
+        TensorShape ts({});
+        TF_CHECK_OK(ctx->allocate_output(i, ts, &output));
       }
     }
   }
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_op_test.cc b/tensorflow/core/kernels/remote_fused_graph_execute_op_test.cc
index 655de2f98f38b37dd102265a629472a8f2fc2ac4..ec769d41f96aa956ac7fedc8929e707a89d2e78d 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_op_test.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_op_test.cc
@@ -159,8 +159,8 @@ static RemoteFusedGraphExecuteInfo BuildRemoteFusedGraphExecuteInfo(
   return execute_info;
 }
 
-// 1. Create TestRemoteFusedGraphExecutor to execute your fused graph
-class TestRemoteFusedGraphExecutor final : public IRemoteFusedGraphExecutor {
+// 1. Create SampleRemoteFusedGraphExecutor to execute your fused graph
+class SampleRemoteFusedGraphExecutor final : public IRemoteFusedGraphExecutor {
  public:
   int GetVersion() final { return 1; }
   bool Init(const RemoteFusedGraphExecuteInfo& info) final {
@@ -214,6 +214,16 @@ class TestRemoteFusedGraphExecutor final : public IRemoteFusedGraphExecutor {
     return true;
   }
 
+  Status FuseRemoteGraph(const GraphDef& original_graph_def,
+                         const std::vector<string>& /*inputs*/,
+                         const std::vector<string>& /*outputs*/,
+                         GraphDef* fused_graph_def) final {
+    *fused_graph_def = original_graph_def;
+    return Status::OK();
+  }
+
+  bool IsEnabled() const final { return true; }
+
  private:
   const RemoteFusedGraphExecuteInfo* info_;
   std::unordered_map<string, Tensor> input_tensor_cache_;
@@ -225,7 +235,7 @@ class TestRemoteFusedGraphExecutor final : public IRemoteFusedGraphExecutor {
 namespace remote_fused_graph_execute_op {
 Status BuildRemoteFusedGraphExecutor(
     std::unique_ptr<IRemoteFusedGraphExecutor>* executor) {
-  executor->reset(new TestRemoteFusedGraphExecutor());
+  executor->reset(new SampleRemoteFusedGraphExecutor());
   return Status::OK();
 }
 
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.cc b/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.cc
index 31c48082dd9715feec54e1b441afade6b489123a..cca77adcffa5265c9f44625304c0b9da33cb9ac4 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/math_ops.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/kernels/remote_fused_graph_execute_utils.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -92,4 +93,36 @@ namespace tensorflow {
   return Status::OK();
 }
 
+TestRemoteFusedGraphExecutor::TestRemoteFusedGraphExecutor(
+    const std::unordered_set<string>& fused_op_types,
+    const string& executor_name)
+    : fused_op_types_(fused_op_types), executor_name_(executor_name) {}
+
+int TestRemoteFusedGraphExecutor::GetVersion() { return 0; }
+bool TestRemoteFusedGraphExecutor::Init(const RemoteFusedGraphExecuteInfo&) {
+  return true;
+}
+bool TestRemoteFusedGraphExecutor::Finalize() { return true; }
+bool TestRemoteFusedGraphExecutor::SetupGraph() { return true; }
+bool TestRemoteFusedGraphExecutor::ExecuteGraph() { return true; }
+bool TestRemoteFusedGraphExecutor::TeardownGraph() { return true; }
+bool TestRemoteFusedGraphExecutor::FillInputNode(const string&, const Tensor&) {
+  return true;
+}
+bool TestRemoteFusedGraphExecutor::ReadOutputNode(const string&,
+                                                  TensorAllocatorFunc) {
+  return true;
+}
+Status TestRemoteFusedGraphExecutor::FuseRemoteGraph(
+    const GraphDef& original_graph_def, const std::vector<string>& inputs,
+    const std::vector<string>& outputs, GraphDef* fused_graph_def) {
+  return RemoteFusedGraphExecuteUtils::FuseRemoteGraphByOpTypes(
+      original_graph_def, inputs, outputs, "remote_fused_graph_node_names",
+      fused_op_types_, executor_name_,
+      /*require_shape_type=*/false, fused_graph_def);
+  return Status::OK();
+}
+
+bool TestRemoteFusedGraphExecutor::IsEnabled() const { return true; }
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h b/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h
index a0df50162b6adbb7cd0d2e71535735edba36d824..3fa052108ec2d466caead1cb3c14e2ecc00a45f9 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/i_remote_fused_graph_executor.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
@@ -59,6 +60,30 @@ class RemoteFusedGraphExecuteOpTestUtils {
   TF_DISALLOW_COPY_AND_ASSIGN(RemoteFusedGraphExecuteOpTestUtils);
 };
 
+class TestRemoteFusedGraphExecutor final : public IRemoteFusedGraphExecutor {
+ public:
+  TestRemoteFusedGraphExecutor(const std::unordered_set<string>& fused_op_types,
+                               const string& executor_name);
+
+  int GetVersion() final;
+  bool Init(const RemoteFusedGraphExecuteInfo&) final;
+  bool Finalize() final;
+  bool SetupGraph() final;
+  bool ExecuteGraph() final;
+  bool TeardownGraph() final;
+  bool FillInputNode(const string&, const Tensor&) final;
+  bool ReadOutputNode(const string&, TensorAllocatorFunc) final;
+  Status FuseRemoteGraph(const GraphDef& original_graph_def,
+                         const std::vector<string>& inputs,
+                         const std::vector<string>& outputs,
+                         GraphDef* fused_graph_def) final;
+  bool IsEnabled() const final;
+
+ private:
+  const std::unordered_set<string> fused_op_types_;
+  const string executor_name_;
+};
+
 }  // namespace tensorflow
 
 #endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_OP_TEST_UTILS_H_
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
index 103b2be6914ea55c2ceddb8eb223e7e754aa2206..aba755b5c8ba501509bade99a4e27f5b4dcac9b4 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
@@ -21,6 +21,8 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/public/session.h"
@@ -102,7 +104,7 @@ void ConvertMapToVector(const std::unordered_map<int, string>& in,
                         std::vector<string>* out) {
   CHECK_NOTNULL(out);
   out->resize(in.size());
-  for (int i = 0; i < in.size(); ++i) {
+  for (size_t i = 0; i < in.size(); ++i) {
     CHECK(in.count(i) > 0);
     out->at(i) = in.at(i);
   }
@@ -157,6 +159,10 @@ string DumpCluster(const RemoteFusedGraphExecuteUtils::ClusterInfo& cluster) {
     RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_BORDER_INPUTS;
 /* static */ constexpr const char* const
     RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_BORDER_OUTPUTS;
+/* static */ constexpr const char* const
+    RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_FUSED_OP_TYPES;
+/* static */ constexpr const char* const
+    RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_FUSE_BY_EXECUTOR;
 /* static */ constexpr const char* const
     RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_INPUT_TYPES;
 /* static */ constexpr const char* const
@@ -578,8 +584,7 @@ RemoteFusedGraphExecuteUtils::BuildRemoteGraphInputsAndOutputsFromProto(
   } else {
     ImportGraphDefOptions opts;
     Graph graph(OpRegistry::Global());
-    ShapeRefiner shape_refiner(graph.versions().producer(),
-                               graph.op_registry());
+    ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
     TF_RETURN_IF_ERROR(
         ImportGraphDef(opts, *graph_def, &graph, &shape_refiner));
     TF_RETURN_IF_ERROR(PropagateShapeInference(*graph_def, input_tensors,
@@ -722,7 +727,7 @@ RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode(
     const std::unordered_set<string>& node_names, const GraphDef& graph_def,
     std::vector<ClusterInfo>* cluster_infos) {
   Graph graph(OpRegistry::Global());
-  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
   TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, &shape_refiner));
   std::unordered_set<string> remaining_nodes = node_names;
 
@@ -782,7 +787,9 @@ RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode(
           ++input_count;
         }
       }
-      CHECK(input_count == 0 || input_count == node->in_edges().size());
+      CHECK(input_count == 0 || input_count == node->in_edges().size())
+          << "Invalid input_count(" << input_count << ", "
+          << node->in_edges().size() << ") " << node_name;
 
       for (const Edge* out_edge : node->out_edges()) {
         const Node* dst_node = out_edge->dst();
@@ -825,7 +832,7 @@ RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode(
       BuildNodeSetFromNodeNamesAndPorts(std::get<1>(cluster));
 
   Graph graph(OpRegistry::Global());
-  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
   TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, &shape_refiner));
 
   for (Node* node : graph.nodes()) {
@@ -879,7 +886,7 @@ RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode(
     const std::vector<string>& border_outputs, const GraphDef& graph_def,
     ClusterInfo* cluster) {
   Graph graph(OpRegistry::Global());
-  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
   TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, &shape_refiner));
 
   std::unordered_set<const Node*> visited;
@@ -951,7 +958,7 @@ RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode(
       BuildClusterSubgraphDef(cluster, input_graph_def, &subgraph_def));
 
   Graph graph(OpRegistry::Global());
-  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
   TF_RETURN_IF_ERROR(
       ImportGraphDef({}, input_graph_def, &graph, &shape_refiner));
 
@@ -988,7 +995,7 @@ RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode(
   for (const string& output : outputs) {
     const TensorId output_tid = ParseTensorName(output);
     const string output_name = output_tid.first.ToString();
-    for (int i = 0; i < border_outputs.size(); ++i) {
+    for (size_t i = 0; i < border_outputs.size(); ++i) {
       const TensorId subgraph_output_tid =
           ParseTensorName(border_outputs.at(i));
       const string& subgraph_output_name = subgraph_output_tid.first.ToString();
@@ -1035,7 +1042,7 @@ RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode(
   TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::ClusterizeNodes(
       subgraph_nodes, input_graph_def, &ci_vec));
 
-  for (int i = 0; i < ci_vec.size(); ++i) {
+  for (size_t i = 0; i < ci_vec.size(); ++i) {
     const string remote_fused_graph_node_name =
         strings::StrCat(remote_fused_graph_node_name_prefix, "/", i);
     TF_RETURN_IF_ERROR(FuseCluster(input_graph_def, inputs, outputs,
@@ -1063,18 +1070,59 @@ RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode(
       remote_graph_executor_name, require_shape_type, output_graph_def);
 }
 
+/* static */ Status RemoteFusedGraphExecuteUtils::FuseRemoteGraphByOpTypes(
+    const GraphDef& input_graph_def, const std::vector<string>& inputs,
+    const std::vector<string>& outputs,
+    const string& remote_fused_graph_node_name_prefix,
+    const std::unordered_set<string>& fused_op_types,
+    const string& remote_fused_graph_executor_name,
+    const bool require_shape_type, GraphDef* output_graph_def) {
+  const std::unordered_set<string> fused_nodes_filtered_by_op_types =
+      BuildNodeMapFromOpTypes(input_graph_def, fused_op_types);
+
+  return FuseRemoteGraphByNodeNames(
+      input_graph_def, inputs, outputs, remote_fused_graph_node_name_prefix,
+      fused_nodes_filtered_by_op_types, remote_fused_graph_executor_name,
+      require_shape_type, output_graph_def);
+}
+
+/* static */ Status RemoteFusedGraphExecuteUtils::FuseRemoteGraphByExecutor(
+    const GraphDef& input_graph_def, const std::vector<string>& inputs,
+    const std::vector<string>& outputs, const string& executor_name,
+    GraphDef* output_graph_def) {
+  const ExecutorBuildFunc* build_func = GetExecutorBuildFunc(executor_name);
+  if (build_func == nullptr) {
+    return errors::InvalidArgument("Unknown executor name: " + executor_name);
+  }
+  std::unique_ptr<IRemoteFusedGraphExecutor> executor;
+  TF_RETURN_IF_ERROR((*build_func)(&executor));
+  CHECK_NOTNULL(executor.get());
+  if (!executor->IsEnabled()) {
+    // As this executor is not enabled, just return original graph as is.
+    *output_graph_def = input_graph_def;
+    return Status::OK();
+  }
+  return executor->FuseRemoteGraph(input_graph_def, inputs, outputs,
+                                   output_graph_def);
+}
+
 /* static */ Status RemoteFusedGraphExecuteUtils::PlaceRemoteGraphArguments(
     const std::vector<string>& inputs, const std::vector<string>& outputs,
     const std::unordered_set<string>& fused_node_names,
     const std::vector<string>& border_inputs,
     const std::vector<string>& border_outputs,
+    const std::unordered_set<string>& fused_op_types,
     const string& remote_fused_graph_node_name,
     const string& remote_graph_executor_name, GraphDef* graph_def) {
   CHECK_NOTNULL(graph_def);
+
+  const std::unordered_set<string> fused_nodes_filtered_by_op_types =
+      BuildNodeMapFromOpTypes(*graph_def, fused_op_types);
+
   for (NodeDef& node_def : *graph_def->mutable_node()) {
     string attr_str;
     TensorId tid;
-    for (int i = 0; i < inputs.size(); ++i) {
+    for (size_t i = 0; i < inputs.size(); ++i) {
       if (IsSameNodeName(node_def, inputs.at(i), &tid)) {
         AppendDeliminator(&attr_str);
         attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::GRAPH_INPUT,
@@ -1082,7 +1130,7 @@ RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode(
                                       remote_fused_graph_node_name);
       }
     }
-    for (int i = 0; i < outputs.size(); ++i) {
+    for (size_t i = 0; i < outputs.size(); ++i) {
       if (IsSameNodeName(node_def, outputs.at(i), &tid)) {
         AppendDeliminator(&attr_str);
         attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::GRAPH_OUTPUT,
@@ -1095,14 +1143,20 @@ RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode(
         attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::FUSED_NODE);
       }
     }
-    for (int i = 0; i < border_inputs.size(); ++i) {
+    for (const string& fused_node_name : fused_nodes_filtered_by_op_types) {
+      if (fused_node_name == node_def.name()) {
+        AppendDeliminator(&attr_str);
+        attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::FUSED_NODE);
+      }
+    }
+    for (size_t i = 0; i < border_inputs.size(); ++i) {
       if (IsSameNodeName(node_def, border_inputs.at(i), &tid)) {
         AppendDeliminator(&attr_str);
         attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::BORDER_INPUT,
                                       tid.second, i);
       }
     }
-    for (int i = 0; i < border_outputs.size(); ++i) {
+    for (size_t i = 0; i < border_outputs.size(); ++i) {
       if (IsSameNodeName(node_def, border_outputs.at(i), &tid)) {
         AppendDeliminator(&attr_str);
         attr_str += BuildNodeTypeAttr(
@@ -1280,6 +1334,103 @@ RemoteFusedGraphExecuteUtils::FuseRemoteGraphByPlacedArguments(
   return true;
 }
 
+/* static */ Status RemoteFusedGraphExecuteUtils::CopyByteArrayToTensor(
+    const void* src_ptr, const int src_size, Tensor* tensor) {
+  CHECK(tensor->TotalBytes() >= src_size)
+      << tensor->TotalBytes() << ", " << src_size;
+  void* dst_ptr;
+  switch (tensor->dtype()) {
+    case DT_FLOAT:
+      dst_ptr = tensor->flat<float>().data();
+      break;
+    case DT_DOUBLE:
+      dst_ptr = tensor->flat<double>().data();
+      break;
+    case DT_INT32:
+      dst_ptr = tensor->flat<int32>().data();
+      break;
+    case DT_UINT8:
+      dst_ptr = tensor->flat<uint8>().data();
+      break;
+    case DT_INT16:
+      dst_ptr = tensor->flat<int16>().data();
+      break;
+    case DT_INT8:
+      dst_ptr = tensor->flat<int8>().data();
+      break;
+    case DT_STRING:
+      dst_ptr = tensor->flat<string>().data();
+      break;
+    case DT_INT64:
+      dst_ptr = tensor->flat<int64>().data();
+      break;
+    case DT_BOOL:
+      dst_ptr = tensor->flat<bool>().data();
+      break;
+    case DT_QINT8:
+      dst_ptr = tensor->flat<qint8>().data();
+      break;
+    case DT_QUINT8:
+      dst_ptr = tensor->flat<quint8>().data();
+      break;
+    case DT_QINT32:
+      dst_ptr = tensor->flat<qint32>().data();
+      break;
+    case DT_BFLOAT16:
+      dst_ptr = tensor->flat<bfloat16>().data();
+      break;
+    case DT_QINT16:
+      dst_ptr = tensor->flat<qint16>().data();
+      break;
+    case DT_QUINT16:
+      dst_ptr = tensor->flat<quint16>().data();
+      break;
+    case DT_UINT16:
+      dst_ptr = tensor->flat<uint16>().data();
+      break;
+    default:
+      CHECK(false) << "type " << tensor->dtype() << " is not supported.";
+      break;
+  }
+  CHECK_NOTNULL(dst_ptr);
+  std::memcpy(dst_ptr, src_ptr, src_size);
+  return Status::OK();
+}
+
+/* static */ std::unordered_set<string>
+RemoteFusedGraphExecuteUtils::BuildNodeMapFromOpTypes(
+    const GraphDef& graph_def, const std::unordered_set<string>& op_types) {
+  std::unordered_set<string> retval;
+  for (const NodeDef& node_def : graph_def.node()) {
+    if (op_types.count(node_def.op()) > 0) {
+      retval.emplace(node_def.name());
+    }
+  }
+  return retval;
+}
+
+/* static */ std::unordered_set<string>
+RemoteFusedGraphExecuteUtils::BuildNodeMapFromOpsDefinitions(
+    const GraphDef& graph_def,
+    const IRemoteFusedGraphOpsDefinitions& ops_definitions) {
+  std::unordered_set<string> retval;
+  for (const NodeDef& node_def : graph_def.node()) {
+    std::vector<DataType> dt_vec;
+    std::vector<TensorShape> shape_vec;
+    const Status status =
+        GetOutputTensorShapeType(node_def, &dt_vec, &shape_vec);
+    if (!status.ok()) {
+      shape_vec.clear();
+    }
+    if (ops_definitions.GetOpIdFor(
+            node_def.op(), DataTypeVector(dt_vec.begin(), dt_vec.end())) !=
+        IRemoteFusedGraphOpsDefinitions::INVALID_OP_ID) {
+      retval.emplace(node_def.name());
+    }
+  }
+  return retval;
+}
+
 /* static */ Status RemoteFusedGraphExecuteUtils::ReplaceInputNodeByPlaceHolder(
     const string& input, const DataType type, const TensorShape& shape,
     GraphDef* graph_def) {
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
index a80fc797841dcd6cf3dae71852e46ac901cc0fc9..541c26baaf999d6ad7b34aaf65bf43cb788da582 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/kernels/i_remote_fused_graph_executor.h"
+#include "tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/macros.h"
 
@@ -57,6 +58,10 @@ class RemoteFusedGraphExecuteUtils {
       "border_inputs";
   static constexpr const char* const TRANSFORM_ARG_BORDER_OUTPUTS =
       "border_outputs";
+  static constexpr const char* const TRANSFORM_ARG_FUSED_OP_TYPES =
+      "fused_op_types";
+  static constexpr const char* const TRANSFORM_ARG_FUSE_BY_EXECUTOR =
+      "fuse_by_executor";
   static constexpr const char* const TRANSFORM_ARG_INPUT_TYPES = "input_types";
   static constexpr const char* const TRANSFORM_ARG_INPUT_SHAPES =
       "input_shapes";
@@ -157,7 +162,7 @@ class RemoteFusedGraphExecuteUtils {
       const std::vector<std::pair<string, Tensor>>& input_tensors,
       const bool dry_run_inference, GraphDef* graph_def);
 
-  // Build remote fused graph execute info
+  // Build remote fused graph execute info.
   static Status BuildRemoteFusedGraphExecuteInfo(
       const string& executor_name, const GraphDef& subgraph_def,
       const std::vector<string>& inputs, const std::vector<string>& outputs,
@@ -165,31 +170,31 @@ class RemoteFusedGraphExecuteUtils {
       DataTypeVector* input_types, DataTypeVector* output_types);
 
   // Build remote fused graph execute op node by fusing specified subgraph
-  // as remote fused graph execute info
+  // as remote fused graph execute info.
   static Status BuildRemoteFusedGraphExecuteOpNode(
       const string& node_name, const string& executor_name,
       const GraphDef& subgraph_def, const std::vector<string>& inputs,
       const std::vector<string>& outputs, const bool require_shape_type,
       Graph* graph, Node** created_node);
 
-  // Build Identity node to forward remote graph node output
+  // Build Identity node to forward remote graph node output.
   static Status BuildIdentityOpNode(const string& node_name,
                                     const string& input_node_name,
                                     const int input_node_port,
                                     const DataType dt, Graph* graph,
                                     Node** created_node);
 
-  // Create clusters of given nodes
+  // Create clusters of given nodes.
   static Status ClusterizeNodes(const std::unordered_set<string>& node_names,
                                 const GraphDef& graph_def,
                                 std::vector<ClusterInfo>* cluster_infos);
 
-  // Build GraphDef of a given cluster
+  // Build GraphDef of a given cluster.
   static Status BuildClusterSubgraphDef(const ClusterInfo& cluster,
                                         const GraphDef& graph_def,
                                         GraphDef* subgraph_def);
 
-  // Build a cluster by given border
+  // Build a cluster by given border.
   // CAVEAT: The border must be consistent for one cluster.
   static Status BuildClusterByBorder(const std::vector<string>& border_inputs,
                                      const std::vector<string>& border_outputs,
@@ -211,7 +216,7 @@ class RemoteFusedGraphExecuteUtils {
                             const bool require_shape_type,
                             GraphDef* output_graph_def);
 
-  // Fuse subgraph of specified nodes
+  // Fuse subgraph of specified nodes.
   static Status FuseRemoteGraphByNodeNames(
       const GraphDef& input_graph_def, const std::vector<string>& inputs,
       const std::vector<string>& outputs,
@@ -220,7 +225,7 @@ class RemoteFusedGraphExecuteUtils {
       const string& remote_fused_graph_executor_name,
       const bool require_shape_type, GraphDef* output_graph_def);
 
-  // Fuse subgraph of specified border
+  // Fuse subgraph of specified border.
   static Status FuseRemoteGraphByBorder(
       const GraphDef& input_graph_def, const std::vector<string>& inputs,
       const std::vector<string>& outputs,
@@ -230,25 +235,57 @@ class RemoteFusedGraphExecuteUtils {
       const string& remote_graph_executor_name, const bool require_shape_type,
       GraphDef* output_graph_def);
 
-  // Place arguments to fuse remote graph
+  // Fuse subgraph of specified op types.
+  static Status FuseRemoteGraphByOpTypes(
+      const GraphDef& input_graph_def, const std::vector<string>& inputs,
+      const std::vector<string>& outputs,
+      const string& remote_fused_graph_node_name_prefix,
+      const std::unordered_set<string>& fused_op_types,
+      const string& remote_fused_graph_executor_name,
+      const bool require_shape_type, GraphDef* output_graph_def);
+
+  // Place arguments to fuse remote graph.
   static Status PlaceRemoteGraphArguments(
       const std::vector<string>& inputs, const std::vector<string>& outputs,
       const std::unordered_set<string>& fused_node_names,
       const std::vector<string>& border_inputs,
       const std::vector<string>& border_outputs,
+      const std::unordered_set<string>& fused_op_types,
       const string& remote_fused_graph_node_name,
       const string& remote_graph_executor_name, GraphDef* graph_def);
 
-  // Fuse remote graph by placed arguments
+  // Fuse remote graph by placed arguments.
   static Status FuseRemoteGraphByPlacedArguments(
       const GraphDef& input_graph_def,
       const std::vector<std::pair<string, Tensor>>& input_tensors,
       GraphDef* output_graph_def);
 
+  static Status FuseRemoteGraphByExecutor(const GraphDef& input_graph_def,
+                                          const std::vector<string>& inputs,
+                                          const std::vector<string>& outputs,
+                                          const string& executor_name,
+                                          GraphDef* output_graph_def);
+
   static bool IsFuseReady(
       const GraphDef& input_graph_def,
       const std::vector<std::pair<string, Tensor>>& input_tensors);
 
+  // Copy a byte array to a tensor data.  Though tensor data must be
+  // updated with typed information in general, we can't guarantee that
+  // returned values from a remote processor has typed information because
+  // a logic running in the remote processor possibly be in a separate binary
+  // which may not link tensorflow libraries.  To deal with this situation,
+  // remote fused graph needs to overwrite the tensor data by a byte array.
+  static Status CopyByteArrayToTensor(const void* src_ptr, const int src_size,
+                                      Tensor* tensor);
+
+  static std::unordered_set<string> BuildNodeMapFromOpTypes(
+      const GraphDef& graph_def, const std::unordered_set<string>& op_types);
+
+  static std::unordered_set<string> BuildNodeMapFromOpsDefinitions(
+      const GraphDef& graph_def,
+      const IRemoteFusedGraphOpsDefinitions& ops_definitions);
+
  private:
   static void EmplaceTensorShapeType(const string& name, const Tensor& tensor,
                                      TensorShapeMap* tensor_shape_map);
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc b/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc
index b24482f2d5423c60b321f817bcae5fc776fd36ba..aca8ddfae9a91d0de40aafc7d5df43867bc9c7af 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/remote_fused_graph_execute_utils.h"
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -32,6 +33,10 @@ constexpr const char* const NAME_A_PLUS_B = "A_PLUS_B";
 constexpr float NODE_A_VAL = 2.0f;
 constexpr float NODE_B_VAL = 3.0f;
 constexpr float VALUE_TOLERANCE_FLOAT = 1e-8f;
+constexpr const char* const REMOTE_FUSED_EXECUTOR_NAME0 =
+    "fuse_test_remote_fused_graph_executor0";
+constexpr const char* const REMOTE_FUSED_EXECUTOR_NAME1 =
+    "fuse_test_remote_fused_graph_executor1";
 
 static NodeDef* GetNodeDef(const string& name, GraphDef* def) {
   CHECK_NE(def, nullptr);
@@ -43,17 +48,38 @@ static NodeDef* GetNodeDef(const string& name, GraphDef* def) {
   return nullptr;
 }
 
+Status BuildRemoteFusedGraphExecutor0(
+    std::unique_ptr<IRemoteFusedGraphExecutor>* executor) {
+  executor->reset(
+      new TestRemoteFusedGraphExecutor({"Mul"}, REMOTE_FUSED_EXECUTOR_NAME0));
+  return Status::OK();
+}
+
+Status BuildRemoteFusedGraphExecutor1(
+    std::unique_ptr<IRemoteFusedGraphExecutor>* executor) {
+  executor->reset(new TestRemoteFusedGraphExecutor(
+      {"Const", "Mul"}, REMOTE_FUSED_EXECUTOR_NAME1));
+  return Status::OK();
+}
+
 class FuseRemoteGraphMultipleAddOpsTest : public ::testing::Test {
  protected:
   void SetUp() final {
     TF_ASSERT_OK(
         RemoteFusedGraphExecuteOpTestUtils::BuildMultipleAddGraph(&graph_def_));
     RemoteFusedGraphExecuteUtils::ExecutorBuildRegistrar
-        k_hexagon_remote_fused_graph_executor_build(
+        hexagon_remote_fused_graph_executor_build(
             "remote_graph_executor_name",
             [](std::unique_ptr<IRemoteFusedGraphExecutor>* executor) -> Status {
               return Status::OK();
             });
+    RemoteFusedGraphExecuteUtils::ExecutorBuildRegistrar
+        test_remote_fused_graph_executor_build0(REMOTE_FUSED_EXECUTOR_NAME0,
+                                                BuildRemoteFusedGraphExecutor0);
+
+    RemoteFusedGraphExecuteUtils::ExecutorBuildRegistrar
+        test_remote_fused_graph_executor_build1(REMOTE_FUSED_EXECUTOR_NAME1,
+                                                BuildRemoteFusedGraphExecutor1);
   }
 
   void TearDown() final {}
@@ -79,6 +105,25 @@ class FuseRemoteGraphMultipleAddOpsTest : public ::testing::Test {
         /*require_shape_type=*/false, &result_graph_def_);
   }
 
+  Status FuseByOpTypes() {
+    return RemoteFusedGraphExecuteUtils::FuseRemoteGraphByOpTypes(
+        graph_def_, inputs_, outputs_, "remote_fused_graph_node_names",
+        subgraph_op_types_, "remote_graph_executor_name",
+        /*require_shape_type=*/false, &result_graph_def_);
+  }
+
+  Status FuseByExecutor0() {
+    return RemoteFusedGraphExecuteUtils::FuseRemoteGraphByExecutor(
+        graph_def_, inputs_, outputs_, REMOTE_FUSED_EXECUTOR_NAME0,
+        &result_graph_def_);
+  }
+
+  Status FuseByExecutor1() {
+    return RemoteFusedGraphExecuteUtils::FuseRemoteGraphByExecutor(
+        graph_def_, inputs_, outputs_, REMOTE_FUSED_EXECUTOR_NAME1,
+        &result_graph_def_);
+  }
+
   Status BuildAndAddTensorShape() {
     return RemoteFusedGraphExecuteUtils::BuildAndAddTensorShapes(
         input_tensors_, /*dry_run_inference=*/true, &graph_def_);
@@ -87,8 +132,9 @@ class FuseRemoteGraphMultipleAddOpsTest : public ::testing::Test {
   Status PlaceRemoteGraphArguments() {
     return RemoteFusedGraphExecuteUtils::PlaceRemoteGraphArguments(
         inputs_, outputs_, subgraph_node_names_, subgraph_input_names_,
-        subgraph_output_names_, "remote_fused_graph_node_names",
-        "remote_graph_executor_name", &graph_def_);
+        subgraph_output_names_, subgraph_op_types_,
+        "remote_fused_graph_node_names", "remote_graph_executor_name",
+        &graph_def_);
   }
 
   Status FuseByPlacedArguments() {
@@ -104,6 +150,15 @@ class FuseRemoteGraphMultipleAddOpsTest : public ::testing::Test {
                                                      input_tensors_);
   }
 
+  void ReplaceOpType(const std::unordered_set<string>& op_name,
+                     const string& new_op_type) {
+    for (NodeDef& node_def : *graph_def_.mutable_node()) {
+      if (op_name.count(node_def.name()) > 0) {
+        node_def.set_op(new_op_type);
+      }
+    }
+  }
+
  public:
   const std::vector<std::pair<string, Tensor>> input_tensors_{
       {"A", {DT_FLOAT, {1, 1, 1, 1}}}};
@@ -114,6 +169,7 @@ class FuseRemoteGraphMultipleAddOpsTest : public ::testing::Test {
   std::vector<string> subgraph_input_names_;
   std::vector<string> subgraph_output_names_;
   std::unordered_set<string> subgraph_node_names_;
+  std::unordered_set<string> subgraph_op_types_;
 };
 
 void SetSubgraphArguments(const std::vector<string>& input_names,
@@ -289,7 +345,7 @@ TEST(RemoteFusedGraphExecuteUtils, PropagateAndBuildTensorShapeMap) {
       NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B, &def));
   ImportGraphDefOptions opts;
   Graph graph(OpRegistry::Global());
-  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
   Status status = ImportGraphDef(opts, def, &graph, &shape_refiner);
   ASSERT_TRUE(RemoteFusedGraphExecuteUtils::PropagateShapeInference(
                   def, inputs, &graph, &shape_refiner)
@@ -408,7 +464,7 @@ TEST(RemoteFusedGraphExecuteUtils, BuildRemoteFusedGraphExecuteOpNode) {
       NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B, &def));
 
   Graph graph(OpRegistry::Global());
-  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
   TF_ASSERT_OK(ImportGraphDef({}, def, &graph, &shape_refiner));
 
   Node* node;
@@ -649,6 +705,56 @@ TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByNodes_ABCDEFGHIJK) {
       << SummarizeGraphDef(result_graph_def_);
 }
 
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByOpTypes_HIJ) {
+  subgraph_op_types_ = {"Mul"};
+  ReplaceOpType({"H", "I", "J"}, "Mul");
+
+  TF_ASSERT_OK(FuseByOpTypes());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(9, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByOpTypes_FGHIJ) {
+  subgraph_op_types_ = {"Const", "Mul"};
+  ReplaceOpType({"F", "G", "H", "I", "J"}, "Mul");
+
+  TF_ASSERT_OK(FuseByOpTypes());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(3, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByExecutor_HIJ) {
+  ReplaceOpType({"H", "I", "J"}, "Mul");
+
+  TF_ASSERT_OK(FuseByExecutor0());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(9, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByExecutor_FGHIJ) {
+  ReplaceOpType({"F", "G", "H", "I", "J"}, "Mul");
+
+  TF_ASSERT_OK(FuseByExecutor1());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(3, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
 TEST_F(FuseRemoteGraphMultipleAddOpsTest, PlaceAndFuse_H) {
   subgraph_node_names_ = {"H"};
 
@@ -755,5 +861,41 @@ TEST_F(FuseRemoteGraphMultipleAddOpsTest, PlaceAndFuse_ABCDE_K) {
       << SummarizeGraphDef(result_graph_def_);
 }
 
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, PlaceAndFuse_MUL_HIJ) {
+  ReplaceOpType({"H", "I", "J"}, "Mul");
+  subgraph_op_types_ = {"Mul"};
+
+  TF_ASSERT_OK(PlaceRemoteGraphArguments());
+  ASSERT_TRUE(IsFuseReady());
+  TF_ASSERT_OK(BuildAndAddTensorShape());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+
+  TF_ASSERT_OK(FuseByPlacedArguments());
+
+  EXPECT_EQ(9, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, PlaceAndFuse_CONST_MUL_FGHIJ) {
+  ReplaceOpType({"F", "G", "H", "I", "J"}, "Mul");
+  subgraph_op_types_ = {"Const", "Mul"};
+
+  TF_ASSERT_OK(PlaceRemoteGraphArguments());
+  ASSERT_TRUE(IsFuseReady());
+  TF_ASSERT_OK(BuildAndAddTensorShape());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+
+  TF_ASSERT_OK(FuseByPlacedArguments());
+
+  EXPECT_EQ(3, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform.cc b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform.cc
index ac0503088c491ddf5a31401b895d380438a46010..0822061b14bd45027019999b637d17920b7c98f8 100644
--- a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform.cc
@@ -66,6 +66,7 @@ static Status ParseArguments(const TransformFuncContext& context,
                              string* input_types_str, string* input_shapes_str,
                              string* fused_nodes_str, string* border_inputs_str,
                              string* border_outputs_str,
+                             string* fused_op_types_str, bool* fuse_by_executor,
                              string* remote_fused_graph_node_name,
                              string* remote_graph_executor_name) {
   TF_RETURN_IF_ERROR(context.GetOneStringParameter(
@@ -83,6 +84,12 @@ static Status ParseArguments(const TransformFuncContext& context,
   TF_RETURN_IF_ERROR(context.GetOneStringParameter(
       RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_BORDER_OUTPUTS, "",
       border_outputs_str));
+  TF_RETURN_IF_ERROR(context.GetOneStringParameter(
+      RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_FUSED_OP_TYPES, "",
+      fused_op_types_str));
+  TF_RETURN_IF_ERROR(context.GetOneBoolParameter(
+      RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_FUSE_BY_EXECUTOR, false,
+      fuse_by_executor));
   TF_RETURN_IF_ERROR(context.GetOneStringParameter(
       RemoteFusedGraphExecuteUtils::
           TRANSFORM_ARG_REMOTE_FUSED_GRAPH_EXECUTOR_NAME,
@@ -107,7 +114,7 @@ static Status PlaceShapeType(const std::vector<string>& inputs,
   CHECK_EQ(inputs.size(), input_types_strs.size());
   CHECK_EQ(inputs.size(), input_shapes_strs.size());
   std::vector<std::pair<string, Tensor>> input_tensors;
-  for (int i = 0; i < inputs.size(); ++i) {
+  for (size_t i = 0; i < inputs.size(); ++i) {
     const string& name = inputs.at(i);
     std::vector<int64> dims;
     CHECK(str_util::SplitAndParseAsInts(input_shapes_strs.at(i), ',', &dims));
@@ -135,11 +142,14 @@ Status FuseRemoteGraph(const GraphDef& input_graph_def,
   string fused_nodes_str;
   string border_inputs_str;
   string border_outputs_str;
+  string fused_op_types_str;
+  bool fuse_by_executor = false;
   string remote_fused_graph_node_name;
   string remote_graph_executor_name;
   TF_RETURN_IF_ERROR(ParseArguments(
       context, &input_types_str, &input_shapes_str, &fused_nodes_str,
-      &border_inputs_str, &border_outputs_str, &remote_fused_graph_node_name,
+      &border_inputs_str, &border_outputs_str, &fused_op_types_str,
+      &fuse_by_executor, &remote_fused_graph_node_name,
       &remote_graph_executor_name));
 
   if (!input_types_str.empty()) {
@@ -163,16 +173,29 @@ Status FuseRemoteGraph(const GraphDef& input_graph_def,
         str_util::Split(border_inputs_str, ",");
     const std::vector<string> border_outputs =
         str_util::Split(border_outputs_str, ",");
-    for (int i = 0; i < border_inputs.size(); ++i) {
+    for (size_t i = 0; i < border_inputs.size(); ++i) {
       VLOG(2) << "Border Input(" << i << "): " << border_inputs.at(i);
     }
-    for (int i = 0; i < border_outputs.size(); ++i) {
+    for (size_t i = 0; i < border_outputs.size(); ++i) {
       VLOG(2) << "Border Output(" << i << "): " << border_outputs.at(i);
     }
     TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::FuseRemoteGraphByBorder(
         mutable_input_graph_def, inputs, outputs, remote_fused_graph_node_name,
         border_inputs, border_outputs, remote_graph_executor_name,
         require_shape_type, output_graph_def));
+  } else if (!fused_op_types_str.empty()) {
+    const std::vector<string> fused_op_type_vector =
+        str_util::Split(fused_op_types_str, ",");
+    const std::unordered_set<string> fused_op_types(
+        fused_op_type_vector.begin(), fused_op_type_vector.end());
+    TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::FuseRemoteGraphByOpTypes(
+        mutable_input_graph_def, inputs, outputs, remote_fused_graph_node_name,
+        fused_op_types, remote_graph_executor_name, require_shape_type,
+        output_graph_def));
+  } else if (fuse_by_executor) {
+    TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::FuseRemoteGraphByExecutor(
+        mutable_input_graph_def, inputs, outputs, remote_graph_executor_name,
+        output_graph_def));
   } else {
     CHECK(false) << "Fuse targets are not specified.";
   }
@@ -193,11 +216,14 @@ Status PlaceRemoteGraphArguments(const GraphDef& input_graph_def,
   string fused_nodes_str;
   string border_inputs_str;
   string border_outputs_str;
+  string fused_op_types_str;
+  bool fuse_by_executor = false;
   string remote_fused_graph_node_name;
   string remote_graph_executor_name;
   TF_RETURN_IF_ERROR(ParseArguments(
       context, &input_types_str, &input_shapes_str, &fused_nodes_str,
-      &border_inputs_str, &border_outputs_str, &remote_fused_graph_node_name,
+      &border_inputs_str, &border_outputs_str, &fused_op_types_str,
+      &fuse_by_executor, &remote_fused_graph_node_name,
       &remote_graph_executor_name));
 
   if (!input_types_str.empty()) {
@@ -213,9 +239,14 @@ Status PlaceRemoteGraphArguments(const GraphDef& input_graph_def,
       str_util::Split(border_inputs_str, ",");
   const std::vector<string> border_outputs =
       str_util::Split(border_outputs_str, ",");
+  const std::vector<string> fused_op_type_vector =
+      str_util::Split(fused_op_types_str, ",");
+  const std::unordered_set<string> fused_op_types(fused_op_type_vector.begin(),
+                                                  fused_op_type_vector.end());
+
   TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::PlaceRemoteGraphArguments(
       inputs, outputs, fused_node_names, border_inputs, border_outputs,
-      remote_fused_graph_node_name, remote_graph_executor_name,
+      fused_op_types, remote_fused_graph_node_name, remote_graph_executor_name,
       output_graph_def));
 
   return Status::OK();
diff --git a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
index 143cdd7c0ae1c83a0efaa73029a22e43f2c9c31f..d5b37b1ce1279ef69f103bf2968801b641dfc1f5 100644
--- a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/graph/default_device.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/kernels/i_remote_fused_graph_executor.h"
 #include "tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h"
 #include "tensorflow/core/kernels/remote_fused_graph_execute_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -43,11 +44,28 @@ Status PlaceRemoteGraphArguments(const GraphDef& input_graph_def,
                                  GraphDef* output_graph_def);
 
 namespace {
-
 constexpr const char* const REMOTE_FUSED_GRAPH_EXECUTOR_NAME =
     "remote_fused_graph_executor_name";
 constexpr const char* const REMOTE_FUSED_GRAPH_NODE_NAME =
     "remote_fused_graph_node_name";
+constexpr const char* const REMOTE_FUSED_EXECUTOR_NAME0 =
+    "fuse_test_remote_fused_graph_executor0";
+constexpr const char* const REMOTE_FUSED_EXECUTOR_NAME1 =
+    "fuse_test_remote_fused_graph_executor1";
+
+Status BuildRemoteFusedGraphExecutor0(
+    std::unique_ptr<IRemoteFusedGraphExecutor>* executor) {
+  executor->reset(
+      new TestRemoteFusedGraphExecutor({"Mul"}, REMOTE_FUSED_EXECUTOR_NAME0));
+  return Status::OK();
+}
+
+Status BuildRemoteFusedGraphExecutor1(
+    std::unique_ptr<IRemoteFusedGraphExecutor>* executor) {
+  executor->reset(new TestRemoteFusedGraphExecutor(
+      {"Const", "Mul"}, REMOTE_FUSED_EXECUTOR_NAME1));
+  return Status::OK();
+}
 
 class FuseRemoteGraphMultipleAddOpsRewriterTest : public ::testing::Test {
  protected:
@@ -55,11 +73,18 @@ class FuseRemoteGraphMultipleAddOpsRewriterTest : public ::testing::Test {
     TF_ASSERT_OK(RemoteFusedGraphExecuteOpTestUtils::BuildMultipleAddGraph(
         &input_graph_def_));
     RemoteFusedGraphExecuteUtils::ExecutorBuildRegistrar
-        k_hexagon_remote_fused_graph_executor_build(
+        hexagon_remote_fused_graph_executor_build(
             REMOTE_FUSED_GRAPH_EXECUTOR_NAME,
             [](std::unique_ptr<IRemoteFusedGraphExecutor>* executor) -> Status {
               return Status::OK();
             });
+    RemoteFusedGraphExecuteUtils::ExecutorBuildRegistrar
+        test_remote_fused_graph_executor_build0(REMOTE_FUSED_EXECUTOR_NAME0,
+                                                BuildRemoteFusedGraphExecutor0);
+
+    RemoteFusedGraphExecuteUtils::ExecutorBuildRegistrar
+        test_remote_fused_graph_executor_build1(REMOTE_FUSED_EXECUTOR_NAME1,
+                                                BuildRemoteFusedGraphExecutor1);
   }
 
   void TearDown() final {}
@@ -107,10 +132,22 @@ class FuseRemoteGraphMultipleAddOpsRewriterTest : public ::testing::Test {
            {border_outputs_str_}}));
     }
 
+    if (!fused_op_types_str_.empty()) {
+      context.params.insert(std::pair<string, std::vector<string>>(
+          {RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_FUSED_OP_TYPES,
+           {fused_op_types_str_}}));
+    }
+
+    if (fuse_by_executor_) {
+      context.params.insert(std::pair<string, std::vector<string>>(
+          {RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_FUSE_BY_EXECUTOR,
+           {"true"}}));
+    }
+
     context.params.insert(std::pair<string, std::vector<string>>(
         {RemoteFusedGraphExecuteUtils::
              TRANSFORM_ARG_REMOTE_FUSED_GRAPH_EXECUTOR_NAME,
-         {REMOTE_FUSED_GRAPH_EXECUTOR_NAME}}));
+         {remote_fused_graph_executor_name_}}));
     context.params.insert(std::pair<string, std::vector<string>>(
         {RemoteFusedGraphExecuteUtils::
              TRANSFORM_ARG_REMOTE_FUSED_GRAPH_NODE_NAME,
@@ -129,6 +166,15 @@ class FuseRemoteGraphMultipleAddOpsRewriterTest : public ::testing::Test {
     input_shapes_ = "1,1,1,1";
   }
 
+  void ReplaceOpType(const std::unordered_set<string>& op_name,
+                     const string& new_op_type) {
+    for (NodeDef& node_def : *input_graph_def_.mutable_node()) {
+      if (op_name.count(node_def.name()) > 0) {
+        node_def.set_op(new_op_type);
+      }
+    }
+  }
+
   void CheckGraph(int expected_node_count, int expected_cluster_count) {
     EXPECT_EQ(expected_node_count, output_graph_def_.node_size());
 
@@ -145,7 +191,7 @@ class FuseRemoteGraphMultipleAddOpsRewriterTest : public ::testing::Test {
                             ATTR_SERIALIZED_REMOTE_FUSED_GRAPH_EXECUTE_INFO,
                         &serialized_proto));
         info.ParseFromString(serialized_proto);
-        CHECK_EQ(REMOTE_FUSED_GRAPH_EXECUTOR_NAME, info.executor_name());
+        CHECK_EQ(remote_fused_graph_executor_name_, info.executor_name());
       }
     }
     EXPECT_EQ(expected_cluster_count, cluster_count);
@@ -162,6 +208,9 @@ class FuseRemoteGraphMultipleAddOpsRewriterTest : public ::testing::Test {
   string fused_node_names_str_;
   string border_inputs_str_;
   string border_outputs_str_;
+  string fused_op_types_str_;
+  string remote_fused_graph_executor_name_{REMOTE_FUSED_GRAPH_EXECUTOR_NAME};
+  bool fuse_by_executor_{false};
 };
 
 TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest,
@@ -228,6 +277,40 @@ TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest,
   CheckGraph(7, 1);
 }
 
+TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest,
+       FuseRemoteGraphByOpTypes_HIJ) {
+  ReplaceOpType({"H", "I", "J"}, "Mul");
+  fused_op_types_str_ = "Mul";
+  TF_ASSERT_OK(Fuse());
+  CheckGraph(9, 1);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest,
+       FuseRemoteGraphByOpTypes_FGHIJ) {
+  ReplaceOpType({"F", "G", "H", "I", "J"}, "Mul");
+  fused_op_types_str_ = "Const,Mul";
+  TF_ASSERT_OK(Fuse());
+  CheckGraph(3, 1);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest,
+       FuseRemoteGraphByExecutor_HIJ) {
+  ReplaceOpType({"H", "I", "J"}, "Mul");
+  remote_fused_graph_executor_name_ = REMOTE_FUSED_EXECUTOR_NAME0;
+  fuse_by_executor_ = true;
+  TF_ASSERT_OK(Fuse());
+  CheckGraph(9, 1);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest,
+       FuseRemoteGraphByExecutor_FGHIJ) {
+  ReplaceOpType({"F", "G", "H", "I", "J"}, "Mul");
+  remote_fused_graph_executor_name_ = REMOTE_FUSED_EXECUTOR_NAME1;
+  fuse_by_executor_ = true;
+  TF_ASSERT_OK(Fuse());
+  CheckGraph(3, 1);
+}
+
 TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest, PlaceAndFuse_HIJ) {
   fused_node_names_str_ = "H,I,J";
   TF_ASSERT_OK(PlaceFuseArgs());
@@ -259,6 +342,27 @@ TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest, PlaceAndFuse_ABCDE_K) {
   CheckGraph(7, 1);
 }
 
+TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest, PlaceAndFuse_MUL_HIJ) {
+  SetInputShapeType();
+  ReplaceOpType({"H", "I", "J"}, "Mul");
+  fused_op_types_str_ = "Mul";
+
+  TF_ASSERT_OK(PlaceFuseArgs());
+  TF_ASSERT_OK(FuseWithPlacedArgs());
+  CheckGraph(9, 1);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest,
+       PlaceAndFuse_CONST_MUL_FGHIJ) {
+  SetInputShapeType();
+  ReplaceOpType({"F", "G", "H", "I", "J"}, "Mul");
+  fused_op_types_str_ = "Const,Mul";
+
+  TF_ASSERT_OK(PlaceFuseArgs());
+  TF_ASSERT_OK(FuseWithPlacedArgs());
+  CheckGraph(3, 1);
+}
+
 }  // namespace
 }  // namespace graph_transforms
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/resize_area_op.cc b/tensorflow/core/kernels/resize_area_op.cc
index ad94de89dba10d044ef46417b2be68009ae199cf..ada50dfb70de447d9be9f735c6b973a25933cfa5 100644
--- a/tensorflow/core/kernels/resize_area_op.cc
+++ b/tensorflow/core/kernels/resize_area_op.cc
@@ -33,7 +33,6 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
 namespace {
-
 struct CachedInterpolation {
   int64 start;
   int64 end;
@@ -41,7 +40,7 @@ struct CachedInterpolation {
   float end_minus_one_scale;
   bool needs_bounding;
 };
-};
+}  // namespace
 
 template <typename Device, typename T>
 class ResizeAreaOp : public OpKernel {
@@ -170,7 +169,7 @@ class ResizeAreaOp : public OpKernel {
                    : (v + 1 > in_x1 ? in_x1 - v : 1.0);
 
       v = ceil(in_x1);
-      x_interp.end = ceil(in_x1);
+      x_interp.end = v;
       v = x_interp.end - 1;
       x_interp.end_minus_one_scale =
           v < in_x ? (v + 1 > in_x1 ? st.width_scale : v + 1 - in_x)
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index fadfb1e637a8096e49ebfed5eae5a8797eea8050..e731971beac82a1ab152a5ca413d505b82c15c19 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -15,12 +15,16 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif
+
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/dense_update_ops.h"
+#include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/gather_functor.h"
 #include "tensorflow/core/kernels/scatter_functor.h"
 #include "tensorflow/core/kernels/variable_ops.h"
@@ -94,7 +98,7 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS);
                               .HostMemory("resource"),         \
                           ReadVariableOp<GPUDevice, type>);
 
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 #endif  // GOOGLE_CUDA
 
@@ -217,20 +221,13 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS);
 
 #if GOOGLE_CUDA
 #define REGISTER_GPU_KERNELS(type)                             \
-  namespace functor {                                          \
-  template <>                                                  \
-  void DenseUpdate<GPUDevice, type, ASSIGN>::operator()(       \
-      const GPUDevice& d, typename TTypes<type>::Flat lhs,     \
-      typename TTypes<type>::ConstFlat rhs);                   \
-  extern template struct DenseUpdate<GPUDevice, type, ASSIGN>; \
-  }                                                            \
   REGISTER_KERNEL_BUILDER(Name("AssignVariableOp")             \
                               .Device(DEVICE_GPU)              \
                               .TypeConstraint<type>("dtype")   \
                               .HostMemory("resource"),         \
                           AssignVariableOp<GPUDevice, type>);
 
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 #endif  // GOOGLE_CUDA
 
@@ -275,20 +272,6 @@ TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
 
 #if GOOGLE_CUDA
 #define REGISTER_GPU_KERNELS(type)                                       \
-  namespace functor {                                                    \
-  template <>                                                            \
-  void DenseUpdate<GPUDevice, type, ADD>::operator()(                    \
-      const GPUDevice& d, typename TTypes<type>::Flat lhs,               \
-      typename TTypes<type>::ConstFlat rhs);                             \
-  extern template struct DenseUpdate<GPUDevice, type, ADD>;              \
-  }                                                                      \
-  namespace functor {                                                    \
-  template <>                                                            \
-  void DenseUpdate<GPUDevice, type, SUB>::operator()(                    \
-      const GPUDevice& d, typename TTypes<type>::Flat lhs,               \
-      typename TTypes<type>::ConstFlat rhs);                             \
-  extern template struct DenseUpdate<GPUDevice, type, SUB>;              \
-  }                                                                      \
   REGISTER_KERNEL_BUILDER(Name("AssignAddVariableOp")                    \
                               .Device(DEVICE_GPU)                        \
                               .HostMemory("resource")                    \
@@ -348,9 +331,14 @@ class ResourceGatherOp : public OpKernel {
     Tensor* out = nullptr;
     OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out));
     if (N > 0) {
-      auto params_flat = params.flat_outer_dims<T>();
+      const int64 gather_dim_size = params.dim_size(0);
+      int64 inner_size = 1;
+      for (int i = 1; i < params.dims(); i++) {
+        inner_size *= params.dim_size(i);
+      }
+      auto params_flat = params.shaped<T, 3>({1, gather_dim_size, inner_size});
       auto indices_flat = indices.flat<Index>();
-      auto out_flat = out->shaped<T, 2>({N, out->NumElements() / N});
+      auto out_flat = out->shaped<T, 3>({1, N, out->NumElements() / N});
 
       functor::GatherFunctor<Device, T, Index> functor;
       int64 bad_i = functor(c->eigen_device<Device>(), params_flat,
@@ -395,6 +383,7 @@ class ResourceScatterUpdateOp : public OpKernel {
   void Compute(OpKernelContext* c) override {
     Var* v = nullptr;
     OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
+    core::ScopedUnref unref_v(v);
     mutex_lock ml(*v->mu());
     Tensor* params = v->tensor();
     const Tensor& indices = c->input(1);
@@ -456,6 +445,17 @@ class ResourceScatterUpdateOp : public OpKernel {
 
 TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ARITHEMTIC_CPU);
 
+// Registers GPU kernels.
+#if GOOGLE_CUDA
+#define REGISTER_SCATTER_ARITHEMTIC_GPU(type) \
+  REGISTER_SCATTER_ARITHEMTIC(type, GPU);
+
+#define REGISTER_SCATTER_UPDATE_GPU(type) REGISTER_SCATTER_UPDATE(type, GPU);
+
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHEMTIC_GPU);
+
+#endif  // GOOGLE_CUDA
+
 #undef REGISTER_SCATTER_ARITHEMTIC
 #undef REGISTER_SCATTER_ARITHEMTIC_CPU
 #undef REGISTER_SCATTER_KERNEL
diff --git a/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc b/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc
index 4dae5da6356e05f250f99869e0f8b4b2db44fb2c..44a817a5c76d31aa8bde25a5f608b75b81116355 100644
--- a/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc
+++ b/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc
@@ -192,18 +192,20 @@ bool GenerateRandomCrop(int original_width, int original_height,
 }  // namespace
 
 template <typename T>
-class SampleDistortedBoundingBoxOp : public OpKernel {
+class SampleDistortedBoundingBoxV2Op : public OpKernel {
  public:
-  explicit SampleDistortedBoundingBoxOp(OpKernelConstruction* context)
+  explicit SampleDistortedBoundingBoxV2Op(OpKernelConstruction* context)
       : OpKernel(context) {
     OP_REQUIRES_OK(context, generator_.Init(context));
 
-    OP_REQUIRES_OK(
-        context, context->GetAttr("min_object_covered", &min_object_covered_));
-    OP_REQUIRES(
-        context, min_object_covered_ >= 0,
-        errors::InvalidArgument("Min object covered must be non-negative: ",
-                                min_object_covered_));
+    if (context->num_inputs() == 2) {
+      OP_REQUIRES_OK(context, context->GetAttr("min_object_covered",
+                                               &min_object_covered_));
+      OP_REQUIRES(
+          context, min_object_covered_ >= 0,
+          errors::InvalidArgument("Min object covered must be non-negative: ",
+                                  min_object_covered_));
+    }
 
     OP_REQUIRES_OK(context, context->GetAttr("use_image_if_no_bounding_boxes",
                                              &use_image_if_no_bounding_boxes_));
@@ -275,6 +277,25 @@ class SampleDistortedBoundingBoxOp : public OpKernel {
                     "bounding boxes must have shape [4] or [*, 4], got ",
                     input_boxes.shape().DebugString()));
 
+    float min_object_covered_val = 0.0;
+    if (context->num_inputs() == 3) {
+      const Tensor& min_object_covered = context->input(2);
+
+      OP_REQUIRES(
+          context, TensorShapeUtils::IsScalar(min_object_covered.shape()),
+          errors::InvalidArgument("min_object_covered must be 0-D, got shape ",
+                                  min_object_covered.shape().DebugString()));
+
+      min_object_covered_val = min_object_covered.scalar<float>()();
+
+      OP_REQUIRES(
+          context, min_object_covered_val >= 0,
+          errors::InvalidArgument("Min object covered must be non-negative: ",
+                                  min_object_covered_val));
+    } else {
+      min_object_covered_val = min_object_covered_;
+    }
+
     std::vector<Rectangle> bounding_boxes;
     if (input_boxes.NumElements() > 0) {
       TTypes<float>::ConstMatrix boxes = input_boxes.flat_inner_dims<float>();
@@ -325,7 +346,7 @@ class SampleDistortedBoundingBoxOp : public OpKernel {
 
       if (GenerateRandomCrop(width, height, min_sample_area, max_sample_area,
                              sample_aspect_ratio, &random, &crop_rect)) {
-        if (SatisfiesOverlapConstraints(crop_rect, min_object_covered_,
+        if (SatisfiesOverlapConstraints(crop_rect, min_object_covered_val,
                                         bounding_boxes)) {
           sample_generated = true;
           break;
@@ -399,11 +420,15 @@ class SampleDistortedBoundingBoxOp : public OpKernel {
   bool use_image_if_no_bounding_boxes_;
 };
 
-#define REGISTER_KERNELS(type)                               \
-  REGISTER_KERNEL_BUILDER(                                   \
-      Name("SampleDistortedBoundingBox").Device(DEVICE_CPU)   \
-           .TypeConstraint<type>("T"),                       \
-      SampleDistortedBoundingBoxOp<type>)
+#define REGISTER_KERNELS(type)                                  \
+  REGISTER_KERNEL_BUILDER(Name("SampleDistortedBoundingBox")    \
+                              .Device(DEVICE_CPU)               \
+                              .TypeConstraint<type>("T"),       \
+                          SampleDistortedBoundingBoxV2Op<type>) \
+  REGISTER_KERNEL_BUILDER(Name("SampleDistortedBoundingBoxV2")  \
+                              .Device(DEVICE_CPU)               \
+                              .TypeConstraint<type>("T"),       \
+                          SampleDistortedBoundingBoxV2Op<type>)
 
 TF_CALL_INTEGRAL_TYPES(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index 48565d8cb97e955cb8200e7101abbfc91be1ba64..59f690e7aabf8ca9ac02ab3c1f85b4604b8e5f79 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -16,20 +16,26 @@ limitations under the License.
 // See docs in ../ops/state_ops.cc.
 #define EIGEN_USE_THREADS
 
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
+
 #include "tensorflow/core/kernels/scatter_nd_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/util.h"
 
 #ifdef TENSORFLOW_USE_SYCL
 #include "tensorflow/core/common_runtime/sycl/sycl_util.h"
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace tensorflow {
 
@@ -37,7 +43,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Check whether updates.shape = indices.shape[:batch_dim] +
 // params_shape[slice_dim:]
@@ -91,11 +97,13 @@ static void PrepareAndValidateInputs(OpKernelContext* c,
       errors::InvalidArgument("Output must be at least 1-D, ",
                               "got shape: ", params_shape.DebugString()));
 
-  OP_REQUIRES(c,
-              params_shape.num_elements() >= 0 ||
-                  (indices.NumElements() == 0 && updates.NumElements() == 0),
-              errors::InvalidArgument(
-                  "Indices and updates specified for empty output", " shape"));
+  OP_REQUIRES(
+      c,
+      params_shape.num_elements() > 0 ||
+          (indices.NumElements() == 0 && updates.NumElements() == 0),
+      errors::InvalidArgument(
+          "Indices and updates specified for empty output.  indices shape: ",
+          indices.shape().DebugString()));
 
   OP_REQUIRES(c, updates.dim_size(0) == indices.dim_size(0),
               errors::InvalidArgument(
@@ -147,9 +155,9 @@ static void PrepareAndValidateInputs(OpKernelContext* c,
 
 template <typename Device, typename Index>
 class IndexFlattener {
-public:
-  inline typename TTypes<Index, 2>::ConstTensor
-  operator()(OpKernelContext*, const Tensor& indices) {
+ public:
+  inline typename TTypes<Index, 2>::ConstTensor operator()(
+      OpKernelContext*, const Tensor& indices) {
     return indices.flat_inner_dims<Index>();
   }
 };
@@ -157,12 +165,12 @@ public:
 #ifdef TENSORFLOW_USE_SYCL
 template <typename Index>
 class IndexFlattener<SYCLDevice, Index> {
-public:
+ public:
   IndexFlattener() { indices_host_ = nullptr; }
   ~IndexFlattener() { delete[] indices_host_; }
 
-  inline typename TTypes<Index, 2>::ConstTensor
-  operator()(OpKernelContext* c, const Tensor& indices) {
+  inline typename TTypes<Index, 2>::ConstTensor operator()(
+      OpKernelContext* c, const Tensor& indices) {
     size_t num_indices = indices.NumElements();
     indices_host_ = new Index[num_indices];
     auto device = c->eigen_sycl_device();
@@ -170,11 +178,11 @@ public:
     auto src_ptr = GetBase(&indices);
     device.memcpyDeviceToHost(indices_host_, static_cast<const Index*>(src_ptr),
                               size);
-    return typename TTypes<Index, 2>::ConstTensor(indices_host_,
-           indices.shape().AsEigenDSizes<2>());
+    return typename TTypes<Index, 2>::ConstTensor(
+        indices_host_, indices.shape().AsEigenDSizes<2>());
   }
 
-private:
+ private:
   Index* indices_host_;
 };
 #endif
@@ -213,6 +221,9 @@ class ScatterNdOp : public OpKernel {
 
     Tensor* out = nullptr;
     OP_REQUIRES_OK(c, c->allocate_output(0, shape, &out));
+
+    if (shape.num_elements() == 0) return;
+
     functor::SetZeroFunctor<Device, T> fill;
     fill(c->eigen_device<Device>(), out->flat<T>());
     auto output_matrix = out->template shaped<T, 2>(
@@ -271,12 +282,19 @@ class ScatterNdUpdateOp : public OpKernel {
     const DataType dt = DataTypeToEnum<T>::v();
     const DataType dt_ref = DataTypeToEnum<T>::ref();
     const DataType index_t = DataTypeToEnum<Index>::v();
-    OP_REQUIRES_OK(c, c->MatchSignature({dt_ref, index_t, dt}, {dt_ref}));
-    OP_REQUIRES_OK(c, c->GetAttr("use_locking", &use_exclusive_lock_));
+    if (IsRefType(c->input_type(0))) {
+      OP_REQUIRES_OK(c, c->MatchSignature({dt_ref, index_t, dt}, {dt_ref}));
+      OP_REQUIRES_OK(c, c->GetAttr("use_locking", &use_exclusive_lock_));
+    } else {
+      OP_REQUIRES_OK(c, c->MatchSignature({dt, index_t, dt}, {dt}));
+      use_exclusive_lock_ = false;
+    }
   }
 
   void Compute(OpKernelContext* c) override {
     if (use_exclusive_lock_) {
+      // If we're here, it means the input type is a ref.
+      DCHECK(IsRefType(c->input_dtype(0)));
       // Hold mutex while we apply updates
       mutex_lock l(*c->input_ref_mutex(0));
       DoCompute(c);
@@ -289,20 +307,43 @@ class ScatterNdUpdateOp : public OpKernel {
   bool use_exclusive_lock_;
 
   void DoCompute(OpKernelContext* c) {
-    Tensor params = c->mutable_input(0, use_exclusive_lock_);
     const Tensor& indices = c->input(1);
     const Tensor& updates = c->input(2);
-    const TensorShape& params_shape(params.shape());
 
     int64 slice_dim;
     Index num_updates;
     Index slice_size;
 
-    OP_REQUIRES(c, params.IsInitialized(),
-                errors::FailedPrecondition("Null ref for params"));
+    Tensor params;
+    TensorShape params_shape;
+
+    if (IsRefType(c->input_dtype(0))) {
+      params = c->mutable_input(0, use_exclusive_lock_);
+      params_shape = params.shape();
+      c->forward_ref_input_to_ref_output(0, 0);
+      OP_REQUIRES(c, params.IsInitialized(),
+                  errors::FailedPrecondition("Null ref for params"));
+    } else {
+      Tensor* params_ptr;
+      params_shape = c->input(0).shape();
+      if (!c->forward_input_to_output_with_shape(0, 0, params_shape,
+                                                 &params_ptr)) {
+        // We weren't able to forward the input to output, so just
+        // allocate a new output tensor and copy the values over.
+        OP_REQUIRES_OK(c, c->allocate_output(0, params_shape, &params_ptr));
+        params = *params_ptr;
+        functor::DenseUpdate<Device, T, ASSIGN> copy;
+        const Tensor& input_copy = c->input(0);
+        copy(c->eigen_device<Device>(), params.flat<T>(), input_copy.flat<T>());
+      } else {
+        params = *params_ptr;
+      }
+    }
+
     PrepareAndValidateInputs<Index>(c, params_shape, indices, updates,
                                     &slice_dim, &num_updates, &slice_size);
     if (!c->status().ok()) return;
+    if (params_shape.num_elements() == 0) return;
 
     IndexFlattener<Device, Index> index_flattener;
     auto indices_flat = index_flattener(c, indices);
@@ -310,7 +351,6 @@ class ScatterNdUpdateOp : public OpKernel {
     auto params_matrix = params.template shaped<T, 2>(
         {params_shape.num_elements() / slice_size, slice_size});
     Index bad_i = -1;
-    c->forward_ref_input_to_ref_output(0, 0);
 
     switch (slice_dim) {
 #define PARAMS_CASE(IXDIM)                                                  \
@@ -376,10 +416,12 @@ class ScatterNdUpdateOp : public OpKernel {
   REGISTER_SCATTER_ND_UPDATE_KERNEL_INDEX(type, int32, dev, name, op); \
   REGISTER_SCATTER_ND_UPDATE_KERNEL_INDEX(type, int64, dev, name, op)
 
-#define REGISTER_SCATTER_ND_ADD_SUB(type, dev)                     \
-  REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdAdd",     \
-                                    scatter_nd_op::UpdateOp::ADD); \
-  REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdSub",     \
+#define REGISTER_SCATTER_ND_ADD_SUB(type, dev)                            \
+  REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdAdd",            \
+                                    scatter_nd_op::UpdateOp::ADD);        \
+  REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdNonAliasingAdd", \
+                                    scatter_nd_op::UpdateOp::ADD);        \
+  REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdSub",            \
                                     scatter_nd_op::UpdateOp::SUB);
 // TODO(simister): Find a way to reduce amount of templated generated code
 // to reduce build size, then re-enable these additional operations.
@@ -421,9 +463,31 @@ TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_CPU);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ADD_SUB_GPU);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_GPU);
-
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_GPU);
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SCATTER_ND_ADD_SUB_SYCL(type) \
+  REGISTER_SCATTER_ND_ADD_SUB(type, SYCL);
+
+#define REGISTER_SCATTER_ND_UPDATE_SYCL(type) \
+  REGISTER_SCATTER_ND_UPDATE(type, SYCL);
+
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ADD_SUB_SYCL);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL);
+#undef REGISTER_SCATTER_ND_ADD_SUB_SYCL
+#undef REGISTER_SCATTER_ND_UPDATE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
+
+#undef REGISTER_SCATTER_ND_ADD
+#undef REGISTER_SCATTER_ND_ADD_SUB
+#undef REGISTER_SCATTER_ND_ADD_SUB_CPU
+#undef REGISTER_SCATTER_ND_ADD_SUB_GPU
+#undef REGISTER_SCATTER_ND_UPDATE
+#undef REGISTER_SCATTER_ND_UPDATE_CPU
+#undef REGISTER_SCATTER_ND_UPDATE_GPU
+#undef REGISTER_SCATTER_ND_KERNEL
+#undef REGISTER_SCATTER_ND_KERNEL_INDEX
+
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
 #define DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, IXDIM)           \
@@ -458,31 +522,9 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DECLARE_GPU_SPECS);
 #undef DECLARE_GPU_SPECS
 #undef DECLARE_GPU_SPECS_INDEX
 #undef DECLARE_GPU_SPECS_INDEX_OP
+
 }  // namespace functor
 
 #endif  // GOOGLE_CUDA
 
-#ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SCATTER_ND_ADD_SUB_SYCL(type) \
-  REGISTER_SCATTER_ND_ADD_SUB(type, SYCL);
-
-#define REGISTER_SCATTER_ND_UPDATE_SYCL(type) \
-  REGISTER_SCATTER_ND_UPDATE(type, SYCL);
-
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ADD_SUB_SYCL);
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL);
-#undef REGISTER_SCATTER_ND_ADD_SUB_SYCL
-#undef REGISTER_SCATTER_ND_UPDATE_SYCL
-#endif // TENSORFLOW_USE_SYCL
-
-#undef REGISTER_SCATTER_ND_ADD
-#undef REGISTER_SCATTER_ND_ADD_SUB
-#undef REGISTER_SCATTER_ND_ADD_SUB_CPU
-#undef REGISTER_SCATTER_ND_ADD_SUB_GPU
-#undef REGISTER_SCATTER_ND_UPDATE
-#undef REGISTER_SCATTER_ND_UPDATE_CPU
-#undef REGISTER_SCATTER_ND_UPDATE_GPU
-#undef REGISTER_SCATTER_ND_KERNEL
-#undef REGISTER_SCATTER_ND_KERNEL_INDEX
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc
index 2a98a6530cfc70095d357027cfac694edbaa3f5d..9c242052f7ccb0b44720b09dd00ef7db0a982a4b 100644
--- a/tensorflow/core/kernels/sendrecv_ops.cc
+++ b/tensorflow/core/kernels/sendrecv_ops.cc
@@ -39,6 +39,19 @@ static void GetRendezvousKey(const string& key_prefix,
                      frame_iter.iter_id);
 }
 
+static FrameAndIter GetFrameAndIter(OpKernelContext* ctx,
+                                    bool hostmem_sendrecv) {
+  if (hostmem_sendrecv && ctx->call_frame() != nullptr) {
+    // Host memory send/recv pairs are added by
+    // common_runtime/memory_types.cc.  When the pair of nodes are
+    // added inside a function, we need to use the function call frame
+    // to formulate the unique rendezvous key.
+    return FrameAndIter(reinterpret_cast<uint64>(ctx->call_frame()), 0);
+  } else {
+    return ctx->frame_iter();
+  }
+}
+
 SendOp::SendOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
   string send_device;
   OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device));
@@ -56,6 +69,9 @@ SendOp::SendOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
   // proactively cache the rendezvous key for the top-level.
   GetRendezvousKey(key_prefix_, {0, 0}, &parsed_key_.buf_);
   OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key_.buf_, &parsed_key_));
+  if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) {
+    hostmem_sendrecv_ = false;
+  }
 }
 
 void SendOp::Compute(OpKernelContext* ctx) {
@@ -71,7 +87,8 @@ void SendOp::Compute(OpKernelContext* ctx) {
   args.device_context = ctx->op_device_context();
   args.alloc_attrs = ctx->input_alloc_attr(0);
 
-  if (ctx->frame_iter() == FrameAndIter(0, 0)) {
+  FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_);
+  if (frame_iter == FrameAndIter(0, 0)) {
     // Use the cached rendezvous key.
     VLOG(2) << "Send " << parsed_key_.buf_;
     OP_REQUIRES_OK(ctx,
@@ -79,7 +96,7 @@ void SendOp::Compute(OpKernelContext* ctx) {
                                            ctx->is_input_dead()));
   } else {
     Rendezvous::ParsedKey in_loop_parsed;
-    GetRendezvousKey(key_prefix_, ctx->frame_iter(), &in_loop_parsed.buf_);
+    GetRendezvousKey(key_prefix_, frame_iter, &in_loop_parsed.buf_);
     VLOG(2) << "Send " << in_loop_parsed.buf_;
     OP_REQUIRES_OK(ctx,
                    Rendezvous::ParseKey(in_loop_parsed.buf_, &in_loop_parsed));
@@ -120,6 +137,9 @@ RecvOp::RecvOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
   // proactively cache the rendezvous key for the top-level.
   GetRendezvousKey(key_prefix_, {0, 0}, &parsed_key_.buf_);
   OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key_.buf_, &parsed_key_));
+  if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) {
+    hostmem_sendrecv_ = false;
+  }
 }
 
 void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
@@ -151,12 +171,13 @@ void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
       },
       std::move(done), _1, _2, _3, _4, _5);
 
-  if (ctx->frame_iter() == FrameAndIter(0, 0)) {
+  FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_);
+  if (frame_iter == FrameAndIter(0, 0)) {
     VLOG(2) << "Recv " << parsed_key_.buf_;
     ctx->rendezvous()->RecvAsync(parsed_key_, args, std::move(done_cb));
   } else {
     Rendezvous::ParsedKey in_loop_parsed;
-    GetRendezvousKey(key_prefix_, ctx->frame_iter(), &in_loop_parsed.buf_);
+    GetRendezvousKey(key_prefix_, frame_iter, &in_loop_parsed.buf_);
     VLOG(2) << "Recv " << in_loop_parsed.buf_;
     OP_REQUIRES_OK_ASYNC(
         ctx, Rendezvous::ParseKey(in_loop_parsed.buf_, &in_loop_parsed), done);
diff --git a/tensorflow/core/kernels/sendrecv_ops.h b/tensorflow/core/kernels/sendrecv_ops.h
index 67867e330867e7add6762be0d2f9804020692701..1ff8eff13f77a0d779629110b0210c0818a0a08e 100644
--- a/tensorflow/core/kernels/sendrecv_ops.h
+++ b/tensorflow/core/kernels/sendrecv_ops.h
@@ -29,6 +29,7 @@ class SendOp : public OpKernel {
  private:
   string key_prefix_;
   Rendezvous::ParsedKey parsed_key_;
+  bool hostmem_sendrecv_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(SendOp);
 };
@@ -41,6 +42,7 @@ class RecvOp : public AsyncOpKernel {
  private:
   string key_prefix_;
   Rendezvous::ParsedKey parsed_key_;
+  bool hostmem_sendrecv_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(RecvOp);
 };
diff --git a/tensorflow/core/kernels/serialize_sparse_op.cc b/tensorflow/core/kernels/serialize_sparse_op.cc
index 67234e2a401cfe5096308eecb594547808f43aad..2c7ad5bab08c403351f8a832c5ffe5bdbf4e860e 100644
--- a/tensorflow/core/kernels/serialize_sparse_op.cc
+++ b/tensorflow/core/kernels/serialize_sparse_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
diff --git a/tensorflow/core/kernels/session_ops.cc b/tensorflow/core/kernels/session_ops.cc
index 27ad2fcd87eae35a67d43b083585eabb5beb4859..185c5b248fca8f5a4e8edf6d46e9447f8a0b4750 100644
--- a/tensorflow/core/kernels/session_ops.cc
+++ b/tensorflow/core/kernels/session_ops.cc
@@ -43,21 +43,21 @@ class GetSessionHandleOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const Tensor& val = ctx->input(0);
     int64 id = ctx->session_state()->GetNewId();
-    TensorStore::TensorAndKey tk{val, id, def().device()};
-    OP_REQUIRES_OK(ctx, ctx->tensor_store()->AddTensor(def().name(), tk));
+    TensorStore::TensorAndKey tk{val, id, requested_device()};
+    OP_REQUIRES_OK(ctx, ctx->tensor_store()->AddTensor(name(), tk));
 
     Tensor* handle = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
     if (ctx->expected_output_dtype(0) == DT_RESOURCE) {
       ResourceHandle resource_handle = MakeResourceHandle<Tensor>(
           ctx, SessionState::kTensorHandleResourceTypeName,
-          tk.GetHandle(def().name()));
+          tk.GetHandle(name()));
       resource_handle.set_maybe_type_name(
           SessionState::kTensorHandleResourceTypeName);
       handle->scalar<ResourceHandle>()() = resource_handle;
     } else {
       // Legacy behavior in V1.
-      handle->flat<string>().setConstant(tk.GetHandle(def().name()));
+      handle->flat<string>().setConstant(tk.GetHandle(name()));
     }
   }
 
diff --git a/tensorflow/core/kernels/sparse_cross_op.cc b/tensorflow/core/kernels/sparse_cross_op.cc
index c7bf250fad79deb405b05004f28a811ecddb4fde..07d935d55fe06150309736ba0fec88091ed007c6 100644
--- a/tensorflow/core/kernels/sparse_cross_op.cc
+++ b/tensorflow/core/kernels/sparse_cross_op.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/fingerprint.h"
 #include "tensorflow/core/util/work_sharder.h"
 
diff --git a/tensorflow/core/kernels/sparse_fill_empty_rows_op.cc b/tensorflow/core/kernels/sparse_fill_empty_rows_op.cc
index 3b915e419bc66f1737dbb9d8f05c2182f05d09fb..81eead11d1aef456ddb6415f4279269fbe0f2f76 100644
--- a/tensorflow/core/kernels/sparse_fill_empty_rows_op.cc
+++ b/tensorflow/core/kernels/sparse_fill_empty_rows_op.cc
@@ -74,6 +74,7 @@ class SparseFillEmptyRowsOp : public OpKernel {
 
     const int64 N = indices_t->shape().dim_size(0);
     const int64 dense_rows = dense_shape(0);
+
     Tensor* empty_row_indicator_t;
     OP_REQUIRES_OK(context, context->allocate_output("empty_row_indicator",
                                                      TensorShape({dense_rows}),
@@ -84,6 +85,29 @@ class SparseFillEmptyRowsOp : public OpKernel {
         context, context->allocate_output("reverse_index_map", TensorShape({N}),
                                           &reverse_index_map_t));
     auto reverse_index_map = reverse_index_map_t->vec<int64>();
+
+    int rank = indices_t->shape().dim_size(1);
+
+    if (dense_rows == 0) {
+      OP_REQUIRES(
+          context, N == 0,
+          errors::InvalidArgument("Received SparseTensor with dense_shape[0] = "
+                                  "0 but indices.shape[0] = ",
+                                  N));
+      Tensor* output_indices_t;
+      TensorShape output_indices_shape({0, rank});
+      OP_REQUIRES_OK(context, context->allocate_output("output_indices",
+                                                       output_indices_shape,
+                                                       &output_indices_t));
+      Tensor* output_values_t;
+      OP_REQUIRES_OK(context,
+                     context->allocate_output("output_values", TensorShape({0}),
+                                              &output_values_t));
+
+      // Exit early, nothing more to do.
+      return;
+    }
+
     Tensor scratch_t;
     OP_REQUIRES_OK(context,
                    context->allocate_temp(DT_INT64, TensorShape({dense_rows}),
@@ -115,7 +139,6 @@ class SparseFillEmptyRowsOp : public OpKernel {
       }
     }
     Tensor* output_indices_t;
-    int rank = indices_t->shape().dim_size(1);
     const int64 N_full = scratch(dense_rows - 1);
     TensorShape output_indices_shape({N_full, rank});
     OP_REQUIRES_OK(context, context->allocate_output("output_indices",
diff --git a/tensorflow/core/kernels/sparse_reduce_sum_op.cc b/tensorflow/core/kernels/sparse_reduce_op.cc
similarity index 83%
rename from tensorflow/core/kernels/sparse_reduce_sum_op.cc
rename to tensorflow/core/kernels/sparse_reduce_op.cc
index 074aab9f9e2865a20a2bea4efc6936b22506f922..9e60791f973a2dd0658b160a65fe16ba5e4704d0 100644
--- a/tensorflow/core/kernels/sparse_reduce_sum_op.cc
+++ b/tensorflow/core/kernels/sparse_reduce_op.cc
@@ -130,10 +130,30 @@ Status ValidateInputs(const Tensor *shape_t, const Tensor *reduction_axes_t) {
   return Status::OK();
 }
 
-template <typename T>
-class SparseReduceSumOp : public OpKernel {
+struct SumOp {
+  template <typename T>
+  static void Run(OpKernelContext *ctx, typename TTypes<T>::Scalar &s, const typename TTypes<T>::UnalignedVec &v) {
+      s.device(ctx->eigen_cpu_device()) = v.sum();
+  }
+  static StringPiece Name() {
+      return "sum";
+  }
+};
+
+struct MaxOp {
+  template <typename T>
+  static void Run(OpKernelContext *ctx, typename TTypes<T>::Scalar &s, const typename TTypes<T>::UnalignedVec &v) {
+      s.device(ctx->eigen_cpu_device()) = v.maximum();
+  }
+  static StringPiece Name() {
+      return "max";
+  }
+};
+
+template <typename T, typename Op>
+class SparseReduceOp : public OpKernel {
  public:
-  explicit SparseReduceSumOp(OpKernelConstruction *ctx) : OpKernel(ctx) {
+  explicit SparseReduceOp(OpKernelConstruction *ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_));
   }
 
@@ -163,10 +183,10 @@ class SparseReduceSumOp : public OpKernel {
     auto out_flat = out_values->flat<T>();
     out_flat.setZero();
 
-    Tensor tmp_group_sum;
+    Tensor tmp_reduced_val;
     OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
-                                           TensorShape({}), &tmp_group_sum));
-    auto group_sum = tmp_group_sum.scalar<T>();
+                                           TensorShape({}), &tmp_reduced_val));
+    auto reduced_val = tmp_reduced_val.scalar<T>();
 
     // Compute strides, and use it to convert coords to flat index.  The
     // coordinates returned by .group() have the same ndims as group_by_dims.
@@ -196,11 +216,12 @@ class SparseReduceSumOp : public OpKernel {
     // g.group() provides the coordinates of a particular reduced value.
     sp.Reorder<T>(reduction.reorder_dims);
     for (const auto &g : sp.group(reduction.group_by_dims)) {
-      group_sum.device(ctx->eigen_cpu_device()) = g.template values<T>().sum();
+      Op::template Run<T>(ctx, reduced_val, g.template values<T>());
       const int64 idx = CoordinatesToFlatIndex(g.group(), output_strides);
-      out_flat(idx) = group_sum();
+      out_flat(idx) = reduced_val();
       VLOG(2) << "coords: " << str_util::Join(g.group(), ",")
-              << "; idx: " << idx << "; group sum: " << group_sum();
+              << "; idx: " << idx << "; group " << Op::Name() << ": "
+              << reduced_val();
     }
   }
 
@@ -212,14 +233,21 @@ class SparseReduceSumOp : public OpKernel {
 #define REGISTER_KERNELS(T)                                              \
   REGISTER_KERNEL_BUILDER(                                               \
       Name("SparseReduceSum").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
-      SparseReduceSumOp<T>)
+      SparseReduceOp<T, SumOp>)
 TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
-template <typename T>
-class SparseReduceSumSparseOp : public OpKernel {
+#define REGISTER_KERNELS(T)                                              \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("SparseReduceMax").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      SparseReduceOp<T, MaxOp>)
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+template <typename T, typename Op>
+class SparseReduceSparseOp : public OpKernel {
  public:
-  explicit SparseReduceSumSparseOp(OpKernelConstruction *ctx) : OpKernel(ctx) {
+  explicit SparseReduceSparseOp(OpKernelConstruction *ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_));
   }
 
@@ -260,13 +288,13 @@ class SparseReduceSumSparseOp : public OpKernel {
                    ctx->allocate_output(1, TensorShape({nnz}), &out_values_t));
     auto out_flat = out_values_t->flat<T>();
 
-    Tensor tmp_group_sum;
+    Tensor tmp_reduced_val;
     OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
-                                           TensorShape({}), &tmp_group_sum));
-    auto group_sum = tmp_group_sum.scalar<T>();
+                                           TensorShape({}), &tmp_reduced_val));
+    auto reduced_val = tmp_reduced_val.scalar<T>();
     int64 i = 0;
     for (const auto &g : sp.group(reduction.group_by_dims)) {
-      group_sum.device(ctx->eigen_cpu_device()) = g.template values<T>().sum();
+      Op::template Run<T>(ctx, reduced_val, g.template values<T>());
       std::vector<int64> group = g.group();
       for (int64 j = 0; j < group.size(); j++) {
         if (keep_dims_) {
@@ -275,10 +303,11 @@ class SparseReduceSumSparseOp : public OpKernel {
           out_indices_mat(i, j) = group[j];
         }
       }
-      out_flat(i) = group_sum();
+      out_flat(i) = reduced_val();
       i++;
       VLOG(2) << "coords: " << str_util::Join(g.group(), ",")
-              << "; group sum: " << group_sum();
+              << "; group " << Op::Name() << ": "
+              << reduced_val();
     }
 
     Tensor *out_shape_t;
@@ -298,8 +327,15 @@ class SparseReduceSumSparseOp : public OpKernel {
 #define REGISTER_KERNELS(T)                                                    \
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("SparseReduceSumSparse").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
-      SparseReduceSumSparseOp<T>)
+      SparseReduceSparseOp<T, SumOp>)
 TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
+#define REGISTER_KERNELS(T)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("SparseReduceMaxSparse").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      SparseReduceSparseOp<T, MaxOp>)
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/sparse_slice_op.cc b/tensorflow/core/kernels/sparse_slice_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..10dc208ab67bffd5b081a3f0d87598f03b1ac9f5
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_slice_op.cc
@@ -0,0 +1,104 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include <vector>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/util/sparse/sparse_tensor.h"
+
+namespace tensorflow {
+
+template <typename T>
+class SparseSliceOp : public OpKernel {
+ public:
+  explicit SparseSliceOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input_indices = context->input(0);
+    const Tensor& input_values = context->input(1);
+    const Tensor& input_shape = context->input(2);
+    const Tensor& input_start = context->input(3);
+    const Tensor& input_size = context->input(4);
+
+    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_indices.shape()),
+                errors::InvalidArgument(
+                    "Input indices should be a matrix but received shape ",
+                    input_indices.shape().DebugString()));
+    OP_REQUIRES(context, TensorShapeUtils::IsVector(input_values.shape()),
+                errors::InvalidArgument(
+                    "Input values should be a vector but received shape ",
+                    input_indices.shape().DebugString()));
+    OP_REQUIRES(context, TensorShapeUtils::IsVector(input_shape.shape()),
+                errors::InvalidArgument(
+                    "Input shape should be a vector but received shape ",
+                    input_shape.shape().DebugString()));
+    OP_REQUIRES(context, TensorShapeUtils::IsVector(input_start.shape()),
+                errors::InvalidArgument(
+                    "Input start should be a vector but received shape ",
+                    input_start.shape().DebugString()));
+    OP_REQUIRES(context, TensorShapeUtils::IsVector(input_size.shape()),
+                errors::InvalidArgument(
+                    "Input size should be a vector but received shape ",
+                    input_size.shape().DebugString()));
+
+    const int input_dims = input_shape.NumElements();
+    OP_REQUIRES(context, input_dims == input_start.NumElements(),
+                errors::InvalidArgument(
+                    "Expected start to be a vector of length ", input_dims,
+                    " but got length ", input_start.NumElements()));
+
+    OP_REQUIRES(context, input_dims == input_size.NumElements(),
+                errors::InvalidArgument(
+                    "Expected size to be a vector of length ", input_dims,
+                    " but got length ", input_size.NumElements()));
+
+    sparse::SparseTensor sparse_tensor(input_indices, input_values,
+                                       TensorShape(input_shape.vec<int64>()));
+
+    const gtl::ArraySlice<int64> start(input_start.flat<int64>().data(),
+                                       input_dims);
+    const gtl::ArraySlice<int64> size(input_size.flat<int64>().data(),
+                                      input_dims);
+
+    const sparse::SparseTensor output =
+        sparse::SparseTensor::Slice<T>(sparse_tensor, start, size);
+
+    context->set_output(0, output.indices());
+    context->set_output(1, output.values());
+
+    const TensorShape output_shape(output.shape());
+
+    Tensor* shape = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(2, {output_shape.dims()}, &shape));
+    for (int dim = 0; dim < output_shape.dims(); ++dim) {
+      shape->vec<int64>()(dim) = output_shape.dim_size(dim);
+    }
+  }
+
+ private:
+};
+
+#define REGISTER_KERNELS(type)                                          \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("SparseSlice").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      SparseSliceOp<type>)
+
+TF_CALL_ALL_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc
index 3c0b5d113b0a899e98b3463662216bd3ca9b7ff3..dd6fc6115f7b5bce60f5373c8556e7b1642afd6a 100644
--- a/tensorflow/core/kernels/split_lib_gpu.cu.cc
+++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc
@@ -138,7 +138,8 @@ __global__ void split_v_kernel(const T* input_ptr,
   // do an initial binary search and then scan linearly from there
   // works well when there are many small segments and when the
   // segments are much longer
-  IntType segment = gpu::upper_bound<IntType>(col_scan, num_outputs, gidx) - 1;
+  IntType segment =
+      cuda_helper::upper_bound<IntType>(col_scan, num_outputs, gidx) - 1;
 
   IntType curr_offset = col_scan[segment];
   IntType curr_segment = segment;
@@ -195,10 +196,10 @@ struct SplitOpGPULaunch {
     CudaLaunchConfig config = GetCudaLaunchConfig(
         prefix_dim_size * split_dim_size * suffix_dim_size, d);
 
-    SplitOpKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        input, prefix_dim_size, split_dim_size, suffix_dim_size,
-        output_ptr_data);
+    SplitOpKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            input, prefix_dim_size, split_dim_size, suffix_dim_size,
+            output_ptr_data);
   }
 };
 
@@ -224,15 +225,15 @@ struct SplitVOpGPULaunch {
       // 4096 inputs is a lot, most code will take the smem path
       const int32 kMaxSmemBytesPerformance = 16384;
       if (smem_usage < smem_max && smem_usage < kMaxSmemBytesPerformance)
-        split_v_kernel<T, IntType,
-                       true><<<config.block_count, config.thread_per_block,
-                               smem_usage, gpu_device.stream()>>>(
-            input_ptr, output_scan, total_rows, total_cols, output_ptr_data);
+        split_v_kernel<T, IntType, true>
+            <<<config.block_count, config.thread_per_block, smem_usage,
+               gpu_device.stream()>>>(input_ptr, output_scan, total_rows,
+                                      total_cols, output_ptr_data);
       else
-        split_v_kernel<T, IntType,
-                       false><<<config.block_count, config.thread_per_block, 0,
-                                gpu_device.stream()>>>(
-            input_ptr, output_scan, total_rows, total_cols, output_ptr_data);
+        split_v_kernel<T, IntType, false>
+            <<<config.block_count, config.thread_per_block, 0,
+               gpu_device.stream()>>>(input_ptr, output_scan, total_rows,
+                                      total_cols, output_ptr_data);
     }
   }
 };
diff --git a/tensorflow/core/kernels/stack_ops.cc b/tensorflow/core/kernels/stack_ops.cc
index b4698a805305723a4d4a22295cbd33f5ec231a29..a474e75d6af3b32c6c0b5bc5de92b894180b510e 100644
--- a/tensorflow/core/kernels/stack_ops.cc
+++ b/tensorflow/core/kernels/stack_ops.cc
@@ -40,6 +40,9 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
 
 class Stack : public ResourceBase {
  public:
@@ -51,12 +54,19 @@ class Stack : public ResourceBase {
     bool swapped_to_cpu;
   };
 
-  Stack(const DataType& elem_type, const Tensor& handle)
-      : elem_type_(elem_type), handle_(handle), closed_(false) {}
+  Stack(const DataType& elem_type, const string& stack_name, int max_size)
+      : elem_type_(elem_type),
+        stack_name_(stack_name),
+        max_size_(max_size),
+        closed_(false) {}
 
   Status Push(const TensorAndAllocation& value) {
     mutex_lock l(mu_);
     TF_RETURN_IF_ERROR(CheckNotClosed());
+    if (max_size_ >= 0 && stack_.size() >= max_size_) {
+      return errors::InvalidArgument("Stack[", stack_name_, "] overflowed ",
+                                     "its max_size (", max_size_, ")");
+    }
     stack_.push_back(value);
     return Status::OK();
   }
@@ -65,8 +75,7 @@ class Stack : public ResourceBase {
     mutex_lock l(mu_);
     TF_RETURN_IF_ERROR(CheckNotClosed());
     if (stack_.empty()) {
-      const string& stack_name = handle_.vec<string>()(1);
-      return errors::InvalidArgument("Stack[", stack_name,
+      return errors::InvalidArgument("Stack[", stack_name_,
                                      "] is empty when calling Pop().");
     }
     *value = stack_.back();
@@ -95,25 +104,26 @@ class Stack : public ResourceBase {
 
   string DebugString() override {
     mutex_lock l(mu_);
-    const string& stack_name = handle_.vec<string>()(1);
-    return strings::StrCat("Stack[", stack_name, "]");
+    return strings::StrCat("Stack[", stack_name_, "]");
   }
 
+  const string& stack_name() { return stack_name_; }
+
  private:
   friend class StackOp;
   mutex* mu() { return &mu_; }
-  Tensor* handle() { return &handle_; }
 
   mutable mutex mu_;
   DataType elem_type_;
+  const string stack_name_;
   Tensor handle_;
+  int max_size_;
   bool closed_ GUARDED_BY(mu_);
   std::vector<TensorAndAllocation> stack_ GUARDED_BY(mu_);
 
   Status CheckNotClosed() const EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     if (closed_) {
-      const string& stack_name = handle_.vec<string>()(1);
-      return errors::InvalidArgument("Stack[", stack_name,
+      return errors::InvalidArgument("Stack[", stack_name_,
                                      "] has already been closed.");
     }
     return Status::OK();
@@ -121,20 +131,26 @@ class Stack : public ResourceBase {
 };
 
 Status GetStack(OpKernelContext* ctx, Stack** stack) {
-  Tensor Tstack_handle = ctx->mutable_input(0, false);
-  if (Tstack_handle.NumElements() != 2) {
-    return errors::InvalidArgument(
-        "Stack handle must have two elements, but had shape: ",
-        Tstack_handle.shape().DebugString());
+  string key;
+  if (ctx->input_dtype(0) == DT_RESOURCE) {
+    auto resource = ctx->input(0).flat<ResourceHandle>()(0);
+    key = resource.name();
+  } else {
+    Tensor Tstack_handle = ctx->mutable_input(0, false);
+    if (Tstack_handle.NumElements() != 2) {
+      return errors::InvalidArgument(
+          "Stack handle must have two elements, but had shape: ",
+          Tstack_handle.shape().DebugString());
+    }
+    const string& container = Tstack_handle.flat<string>()(0);
+    const string& stack_name = Tstack_handle.flat<string>()(1);
+    key = strings::StrCat(container, stack_name);
   }
-  const string& container = Tstack_handle.flat<string>()(0);
-  const string& stack_name = Tstack_handle.flat<string>()(1);
   ResourceMgr* rm = ctx->resource_manager();
   if (rm == nullptr) {
     return errors::Internal("No resource manager.");
   }
-  TF_RETURN_IF_ERROR(rm->Lookup(ctx->step_container()->name(),
-                                strings::StrCat(container, stack_name), stack));
+  TF_RETURN_IF_ERROR(rm->Lookup(ctx->step_container()->name(), key, stack));
   return Status::OK();
 }
 
@@ -151,25 +167,48 @@ class StackOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    // Create the stack handle.
-    Tensor stack_handle;
-    AllocatorAttributes alloc_attr;
-    alloc_attr.set_on_host(true);
-    OP_REQUIRES_OK(ctx, ctx->allocate_temp(tensorflow::DT_STRING,
-                                           tensorflow::TensorShape({2}),
-                                           &stack_handle, alloc_attr));
+    int32 size = std::numeric_limits<int32>::max();
+    if (ctx->num_inputs() > 0) {
+      const Tensor* tensor_size;
+      OP_REQUIRES_OK(ctx, ctx->input("max_size", &tensor_size));
+
+      OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(tensor_size->shape()),
+                  errors::InvalidArgument(
+                      "Stack size must be a scalar, but had shape: ",
+                      tensor_size->shape().DebugString()));
+
+      int32 size_value = tensor_size->scalar<int32>()();
+      if (size_value >= 0) {
+        size = size_value;
+      }
+    }
+
+    static const char kContainer[] = "_stacks";
     auto stack_id = Stack::stack_counter.fetch_add(1);
-    auto handle = stack_handle.flat<string>();
-    handle(0) = "_stacks";
-    handle(1) = strings::StrCat(stack_name_, "_", stack_id);
+    string stack_name = strings::StrCat(stack_name_, "_", stack_id);
     // Store the handle in a per-step container.
     ResourceMgr* rm = ctx->resource_manager();
     OP_REQUIRES(ctx, rm != nullptr, errors::Internal("No resource manager."));
-    Stack* stack = new Stack(elem_type_, stack_handle);
-    OP_REQUIRES_OK(ctx,
-                   rm->Create(ctx->step_container()->name(),
-                              strings::StrCat(handle(0), handle(1)), stack));
-    ctx->set_output_ref(0, stack->mu(), stack->handle());
+    string key = strings::StrCat(kContainer, stack_name);
+    Stack* stack = new Stack(elem_type_, stack_name, size);
+    OP_REQUIRES_OK(ctx, rm->Create(ctx->step_container()->name(), key, stack));
+    if (IsRefType(ctx->expected_output_dtype(0))) {
+      // Create the stack handle.
+      AllocatorAttributes alloc_attr;
+      alloc_attr.set_on_host(true);
+      OP_REQUIRES_OK(ctx, ctx->allocate_temp(tensorflow::DT_STRING,
+                                             tensorflow::TensorShape({2}),
+                                             &stack->handle_, alloc_attr));
+      auto handle = stack->handle_.flat<string>();
+      handle(0) = kContainer;
+      handle(1) = std::move(stack_name);
+      ctx->set_output_ref(0, stack->mu(), &stack->handle_);
+    } else {
+      Tensor* handle;
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
+      handle->flat<ResourceHandle>()(0) =
+          MakePerStepResourceHandle<Stack>(ctx, key);
+    }
   }
 
  private:
@@ -182,6 +221,21 @@ class StackOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_CPU), StackOp);
 REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_GPU).HostMemory("handle"),
                         StackOp);
+REGISTER_KERNEL_BUILDER(Name("StackV2").Device(DEVICE_CPU), StackOp);
+REGISTER_KERNEL_BUILDER(Name("StackV2")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("max_size")
+                            .HostMemory("handle"),
+                        StackOp);
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_SYCL).HostMemory("handle"),
+                        StackOp);
+REGISTER_KERNEL_BUILDER(Name("StackV2")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("max_size")
+                            .HostMemory("handle"),
+                        StackOp);
+#endif // TENSORFLOW_USE_SYCL
 
 template <typename Device>
 class StackPushOp : public AsyncOpKernel {
@@ -213,7 +267,11 @@ class StackPushOp : public AsyncOpKernel {
     static constexpr int kCopyThreshold = 2048;
     static constexpr double kOccupancy = 0.7;
     if (swap_memory_ && !alloc_attrs.on_host() &&
-        std::is_same<Device, GPUDevice>::value &&
+        ( std::is_same<Device, GPUDevice>::value
+#ifdef TENSORFLOW_USE_SYCL
+          || std::is_same<Device, SYCLDevice>::value
+#endif // TENSORFLOW_USE_SYCL
+        ) &&
         tensor.TotalBytes() > kCopyThreshold && stack->IsUsefulToSwap(tensor)) {
       DeviceContext* device_ctxt = ctx->op_device_context();
       auto device = static_cast<tensorflow::Device*>(ctx->device());
@@ -261,12 +319,19 @@ class StackPushOp : public AsyncOpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("StackPush").Device(DEVICE_CPU),
                         StackPushOp<CPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("StackPushV2").Device(DEVICE_CPU),
+                        StackPushOp<CPUDevice>);
 
 #define REGISTER_GPU_KERNEL(type)                         \
   REGISTER_KERNEL_BUILDER(Name("StackPush")               \
                               .Device(DEVICE_GPU)         \
                               .HostMemory("handle")       \
                               .TypeConstraint<type>("T"), \
+                          StackPushOp<GPUDevice>);        \
+  REGISTER_KERNEL_BUILDER(Name("StackPushV2")             \
+                              .Device(DEVICE_GPU)         \
+                              .HostMemory("handle")       \
+                              .TypeConstraint<type>("T"), \
                           StackPushOp<GPUDevice>);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
@@ -282,13 +347,45 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
                               .HostMemory("elem")         \
                               .HostMemory("output")       \
                               .TypeConstraint<type>("T"), \
-                          StackPushOp<GPUDevice>)
+                          StackPushOp<GPUDevice>);        \
+  REGISTER_KERNEL_BUILDER(Name("StackPushV2")             \
+                              .Device(DEVICE_GPU)         \
+                              .HostMemory("handle")       \
+                              .HostMemory("elem")         \
+                              .HostMemory("output")       \
+                              .TypeConstraint<type>("T"), \
+                          StackPushOp<GPUDevice>);
 
 REGISTER_GPU_HOST_KERNEL(int32);
 REGISTER_GPU_HOST_KERNEL(bool);
 
 #undef REGISTER_GPU_HOST_KERNEL
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(type)                        \
+  REGISTER_KERNEL_BUILDER(Name("StackPush")               \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("handle")       \
+                              .TypeConstraint<type>("T"), \
+                          StackPushOp<SYCLDevice>);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
+
+#define REGISTER_SYCL_HOST_KERNEL(type)                   \
+  REGISTER_KERNEL_BUILDER(Name("StackPush")               \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("handle")       \
+                              .HostMemory("elem")         \
+                              .HostMemory("output")       \
+                              .TypeConstraint<type>("T"), \
+                          StackPushOp<SYCLDevice>)
+
+REGISTER_SYCL_HOST_KERNEL(int32);
+REGISTER_SYCL_HOST_KERNEL(bool);
+#undef REGISTER_SYCL_KERNEL
+#undef REGISTER_SYCL_HOST_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
 class StackPopOp : public AsyncOpKernel {
  public:
   explicit StackPopOp(OpKernelConstruction* context) : AsyncOpKernel(context) {}
@@ -332,13 +429,19 @@ class StackPopOp : public AsyncOpKernel {
 };
 
 REGISTER_KERNEL_BUILDER(Name("StackPop").Device(DEVICE_CPU), StackPopOp);
+REGISTER_KERNEL_BUILDER(Name("StackPopV2").Device(DEVICE_CPU), StackPopOp);
 
 #define REGISTER_GPU_KERNEL(type)                                 \
   REGISTER_KERNEL_BUILDER(Name("StackPop")                        \
                               .Device(DEVICE_GPU)                 \
                               .HostMemory("handle")               \
                               .TypeConstraint<type>("elem_type"), \
-                          StackPopOp)
+                          StackPopOp);                            \
+  REGISTER_KERNEL_BUILDER(Name("StackPopV2")                      \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("handle")               \
+                              .TypeConstraint<type>("elem_type"), \
+                          StackPopOp);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
@@ -352,13 +455,44 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
                               .HostMemory("handle")               \
                               .HostMemory("elem")                 \
                               .TypeConstraint<type>("elem_type"), \
-                          StackPopOp)
+                          StackPopOp);                            \
+  REGISTER_KERNEL_BUILDER(Name("StackPopV2")                      \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("handle")               \
+                              .HostMemory("elem")                 \
+                              .TypeConstraint<type>("elem_type"), \
+                          StackPopOp);
 
 REGISTER_GPU_HOST_KERNEL(int32);
 REGISTER_GPU_HOST_KERNEL(bool);
 
 #undef REGISTER_GPU_HOST_KERNEL
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(type)                                \
+  REGISTER_KERNEL_BUILDER(Name("StackPop")                        \
+                              .Device(DEVICE_SYCL)                \
+                              .HostMemory("handle")               \
+                              .TypeConstraint<type>("elem_type"), \
+                          StackPopOp)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
+
+#define REGISTER_SYCL_HOST_KERNEL(type)                           \
+  REGISTER_KERNEL_BUILDER(Name("StackPop")                        \
+                              .Device(DEVICE_SYCL)                \
+                              .HostMemory("handle")               \
+                              .HostMemory("elem")                 \
+                              .TypeConstraint<type>("elem_type"), \
+                          StackPopOp)
+
+REGISTER_SYCL_HOST_KERNEL(int32);
+REGISTER_SYCL_HOST_KERNEL(bool);
+
+#undef REGISTER_SYCL_KERNEL
+#undef REGISTER_SYCL_HOST_KERNEL
+#endif // TENSORFLOW_USE_SYCL
+
 class StackCloseOp : public OpKernel {
  public:
   explicit StackCloseOp(OpKernelConstruction* context) : OpKernel(context) {}
@@ -376,5 +510,15 @@ class StackCloseOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("StackClose").Device(DEVICE_CPU), StackCloseOp);
 REGISTER_KERNEL_BUILDER(
     Name("StackClose").Device(DEVICE_GPU).HostMemory("handle"), StackCloseOp);
+REGISTER_KERNEL_BUILDER(Name("StackCloseV2").Device(DEVICE_CPU), StackCloseOp);
+REGISTER_KERNEL_BUILDER(
+    Name("StackCloseV2").Device(DEVICE_GPU).HostMemory("handle"), StackCloseOp);
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+    Name("StackClose").Device(DEVICE_SYCL).HostMemory("handle"), StackCloseOp);
+REGISTER_KERNEL_BUILDER(
+    Name("StackCloseV2").Device(DEVICE_SYCL).HostMemory("handle"),
+    StackCloseOp);
+#endif // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 47eb85999e16c6f8154af3129fba6edd30e463bc..4655503e265b6f0b7e0eb377eb3b02a0eac97ba0 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #endif  // GOOGLE_CUDA
 
 #include "tensorflow/core/kernels/strided_slice_op.h"
-#include "tensorflow/core/kernels/dense_update_ops.h"
+#include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/slice_op.h"
 #include "tensorflow/core/kernels/strided_slice_op_impl.h"
 
@@ -96,17 +96,13 @@ class StridedSliceOp : public OpKernel {
     gtl::InlinedVector<int64, 4> end;
     gtl::InlinedVector<int64, 4> strides;
 
-    ShapeReadWriteFromTensorShape wrapped_processing_shape(&processing_shape);
-    ShapeReadWriteFromTensorShape wrapped_final_shape(&final_shape);
     OP_REQUIRES_OK(
         context, ValidateStridedSliceOp(
                      &context->input(1), &context->input(2), context->input(3),
-                     ShapeReadWriteFromTensorShape(&context->input(0).shape()),
-                     begin_mask, end_mask, ellipsis_mask, new_axis_mask,
-                     shrink_axis_mask, &wrapped_processing_shape,
-                     &wrapped_final_shape, &is_identity, &is_simple_slice,
-                     &slice_dim0, &begin, &end, &strides));
-
+                     context->input(0).shape(), begin_mask, end_mask,
+                     ellipsis_mask, new_axis_mask, shrink_axis_mask,
+                     &processing_shape, &final_shape, &is_identity,
+                     &is_simple_slice, &slice_dim0, &begin, &end, &strides));
     const Tensor& input = context->input(0);
 
     // Optimization #1, slice is a no-op plus reshape
@@ -218,15 +214,12 @@ class StridedSliceGradOp : public OpKernel {
       LOG(FATAL) << "shape must have type int32 or int64.";
     }
 
-    ShapeReadWriteFromTensorShape wrapped_processing_shape(&processing_shape);
-    ShapeReadWriteFromTensorShape wrapped_final_shape(&final_shape);
     OP_REQUIRES_OK(
         context,
         ValidateStridedSliceOp(
             &context->input(1), &context->input(2), context->input(3),
-            ShapeReadWriteFromTensorShape(&input_shape), begin_mask, end_mask,
-            ellipsis_mask, new_axis_mask, shrink_axis_mask,
-            &wrapped_processing_shape, &wrapped_final_shape, &is_identity,
+            input_shape, begin_mask, end_mask, ellipsis_mask, new_axis_mask,
+            shrink_axis_mask, &processing_shape, &final_shape, &is_identity,
             &is_simple_slice, &slice_dim0, &begin, &end, &strides));
 
     // Check to make sure dy is consistent with the original slice
@@ -306,15 +299,12 @@ class StridedSliceAssignOp : public OpKernel {
       old_lhs = context->mutable_input(0, true);
     }
 
-    ShapeReadWriteFromTensorShape wrapped_processing_shape(&processing_shape);
-    ShapeReadWriteFromTensorShape wrapped_final_shape(&final_shape);
     OP_REQUIRES_OK(
         context,
         ValidateStridedSliceOp(
             &context->input(1), &context->input(2), context->input(3),
-            ShapeReadWriteFromTensorShape(&old_lhs.shape()), begin_mask,
-            end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask,
-            &wrapped_processing_shape, &wrapped_final_shape, &is_identity,
+            old_lhs.shape(), begin_mask, end_mask, ellipsis_mask, new_axis_mask,
+            shrink_axis_mask, &processing_shape, &final_shape, &is_identity,
             &is_simple_slice, &slice_dim0, &begin, &end, &strides));
 
     if (processing_shape.num_elements()) {
diff --git a/tensorflow/core/kernels/strided_slice_op.h b/tensorflow/core/kernels/strided_slice_op.h
index 7893e07949b36f936482c46cddb6324c8ba9c738..0f72c4b771025458a1403ce13842787249a2718f 100644
--- a/tensorflow/core/kernels/strided_slice_op.h
+++ b/tensorflow/core/kernels/strided_slice_op.h
@@ -19,7 +19,7 @@ limitations under the License.
 // Functor definition for StridedSliceOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/resource_handle.pb.h"
+#include "tensorflow/core/framework/resource_handle.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index d0ccd5c6521f5ad7e2a4027b876ed034395983d4..de6514757242c1e1079752427b444e31a80bc5ef 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -27,7 +27,7 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types_traits.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/dense_update_ops.h"
+#include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
diff --git a/tensorflow/core/kernels/strided_slice_op_test.cc b/tensorflow/core/kernels/strided_slice_op_test.cc
index ca66ccad8ba6ac5ba40ad5a6d69a8d22898ca513..281ca0f58fe8148d8ad5ba959b88fbe16950c31d 100644
--- a/tensorflow/core/kernels/strided_slice_op_test.cc
+++ b/tensorflow/core/kernels/strided_slice_op_test.cc
@@ -115,13 +115,11 @@ static void BM_ValidateStridedSliceOp(int iters) {
     const int32 new_axis_mask = 0;
     const int32 shrink_axis_mask = 0;
 
-    ShapeReadWriteFromTensorShape wrapped_processing_shape(&processing_shape);
-    ShapeReadWriteFromTensorShape wrapped_final_shape(&final_shape);
     TF_CHECK_OK(ValidateStridedSliceOp(
-        &begin, &end, strides, ShapeReadWriteFromTensorShape(&input_shape),
-        begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask,
-        &wrapped_processing_shape, &wrapped_final_shape, &is_identity,
-        &is_simple_slice, &slice_dim0, &begin_out, &end_out, &strides_out));
+        &begin, &end, strides, input_shape, begin_mask, end_mask, ellipsis_mask,
+        new_axis_mask, shrink_axis_mask, &processing_shape, &final_shape,
+        &is_identity, &is_simple_slice, &slice_dim0, &begin_out, &end_out,
+        &strides_out));
   }
 }
 
diff --git a/tensorflow/core/kernels/summary_tensor_op_test.cc b/tensorflow/core/kernels/summary_tensor_op_test.cc
index 0006a71bd7b35f1f215d27b45ce3e865ddbd8769..010ff443fa7d2f92aeab5575ee60bff1460256d0 100644
--- a/tensorflow/core/kernels/summary_tensor_op_test.cc
+++ b/tensorflow/core/kernels/summary_tensor_op_test.cc
@@ -85,8 +85,15 @@ TEST_F(SummaryTensorOpV2Test, BasicPluginData) {
   ASSERT_EQ(0, out_tensor->dims());
   Summary summary;
   ParseProtoUnlimited(&summary, out_tensor->scalar<string>()());
-
   ASSERT_EQ(1, summary.value_size());
+
+  // Check the content of the tensor stored in the summary.
+  Tensor string_content_tensor;
+  CHECK(string_content_tensor.FromProto(summary.value(0).tensor()));
+  ASSERT_EQ("some string tensor content",
+            string_content_tensor.scalar<string>()());
+
+  // Check plugin-related data.
   ASSERT_EQ("tag_foo", summary.value(0).tag());
   ASSERT_EQ(2, summary.value(0).metadata().plugin_data_size());
   ASSERT_EQ("foo", summary.value(0).metadata().plugin_data(0).plugin_name());
diff --git a/tensorflow/core/kernels/tile_functor.h b/tensorflow/core/kernels/tile_functor.h
new file mode 100644
index 0000000000000000000000000000000000000000..28af2dace3a523a32ec7be78580f68965d8663cd
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor.h
@@ -0,0 +1,100 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_TILE_FUNCTOR_H_
+#define TENSORFLOW_KERNELS_TILE_FUNCTOR_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+namespace internal {
+
+// Device-specific naive implementation for tile.
+template <typename Device, typename T>
+void TileSimple(const Device& d, Tensor* out, const Tensor& in);
+
+template <typename Device, typename T, int NDIM>
+void TileUsingEigen(const Device& d, Tensor* out, const Tensor& in,
+                    const gtl::ArraySlice<int32>& broadcast_array) {
+  auto x = in.tensor<T, NDIM>();
+  auto y = out->tensor<T, NDIM>();
+
+  Eigen::array<int32, NDIM> b;
+  for (int i = 0; i < NDIM; ++i) b[i] = broadcast_array[i];
+  if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
+    // Use 32bit indexing to speed up the computations
+    To32Bit(y).device(d) = To32Bit(x).broadcast(b);
+  } else {
+    y.device(d) = x.broadcast(b);
+  }
+}
+
+template <typename Device, typename T>
+void TileUsingEigen(const Device& d, Tensor* out, const Tensor& in,
+                    const gtl::ArraySlice<int32>&) {
+  auto x = in.tensor<T, 0>();
+  auto y = out->tensor<T, 0>();
+  // In the scalar case we simply copy the input.
+  y.device(d) = x;
+}
+
+}  // end namespace internal
+
+namespace functor {
+
+template <typename Device, typename T>
+struct Tile {
+  void operator()(const Device& d, Tensor* out, const Tensor& in,
+                  const gtl::ArraySlice<int32> broadcast_array) const {
+    switch (in.dims()) {
+      case 0:
+        internal::TileUsingEigen<Device, T>(d, out, in, broadcast_array);
+        break;
+      case 1:
+        internal::TileUsingEigen<Device, T, 1>(d, out, in, broadcast_array);
+        break;
+      case 2:
+        internal::TileUsingEigen<Device, T, 2>(d, out, in, broadcast_array);
+        break;
+      case 3:
+        internal::TileUsingEigen<Device, T, 3>(d, out, in, broadcast_array);
+        break;
+      case 4:
+        internal::TileUsingEigen<Device, T, 4>(d, out, in, broadcast_array);
+        break;
+      case 5:
+        internal::TileUsingEigen<Device, T, 5>(d, out, in, broadcast_array);
+        break;
+      case 6:
+        internal::TileUsingEigen<Device, T, 6>(d, out, in, broadcast_array);
+        break;
+      case 7:
+        internal::TileUsingEigen<Device, T, 7>(d, out, in, broadcast_array);
+        break;
+      default:
+        internal::TileSimple<Device, T>(d, out, in);
+        break;
+    }
+  }
+};
+
+}  // end namespace functor
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_TILE_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/tile_functor_cpu.cc b/tensorflow/core/kernels/tile_functor_cpu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c5418fa1421419a7c0407517df54530500c56cd4
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_cpu.cc
@@ -0,0 +1,86 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/ops_util.h"
+
+namespace tensorflow {
+
+namespace internal {
+
+template <typename Device, typename T>
+void TileSimple(const Device& d, Tensor* out, const Tensor& in) {
+  const int ndims = in.dims();
+  const int64 nelem = out->NumElements();
+  gtl::InlinedVector<int64, 8> in_strides = ComputeStride<int64>(in.shape());
+  gtl::InlinedVector<int64, 8> out_strides = ComputeStride<int64>(out->shape());
+  const T* p = in.flat<T>().data();
+  T* q = out->flat<T>().data();
+
+  for (int64 o_idx = 0; o_idx < nelem; ++o_idx) {
+    int64 i_idx = 0;
+    int64 t = o_idx;
+    for (int i = 0; i < ndims; ++i) {
+      i_idx += t / out_strides[i] % in.dim_size(i) * in_strides[i];
+      t %= out_strides[i];
+    }
+    q[o_idx] = p[i_idx];
+  }
+}
+
+}  // end namespace internal
+
+namespace functor {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+// Register functors used for Tile functor.
+#define DEFINE_TYPE(T) template struct Tile<CPUDevice, T>;
+
+TF_CALL_bool(DEFINE_TYPE);
+TF_CALL_float(DEFINE_TYPE);
+TF_CALL_double(DEFINE_TYPE);
+TF_CALL_uint8(DEFINE_TYPE);
+TF_CALL_int32(DEFINE_TYPE);
+TF_CALL_int16(DEFINE_TYPE);
+TF_CALL_int64(DEFINE_TYPE);
+TF_CALL_half(DEFINE_TYPE);
+TF_CALL_complex64(DEFINE_TYPE);
+TF_CALL_complex128(DEFINE_TYPE);
+TF_CALL_string(DEFINE_TYPE);
+
+#undef DEFINE_TYPE
+
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+
+#define DEFINE_TYPE(T) template struct Tile<SYCLDevice, T>;
+
+TF_CALL_bool(DEFINE_TYPE);
+TF_CALL_float(DEFINE_TYPE);
+TF_CALL_double(DEFINE_TYPE);
+TF_CALL_uint8(DEFINE_TYPE);
+TF_CALL_int32(DEFINE_TYPE);
+TF_CALL_int16(DEFINE_TYPE);
+TF_CALL_int64(DEFINE_TYPE);
+
+#undef DEFINE_TYPE
+#endif // TENSORFLOW_USE_SYCL
+
+}  // end namespace functor
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/tile_functor_gpu.cu.cc b/tensorflow/core/kernels/tile_functor_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1c61c3030ae10492d5a1ba0fb1aac23ec1da84c4
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu.cu.cc
@@ -0,0 +1,102 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+namespace internal {
+
+template <typename T>
+__global__ void TileKernel(int nthreads, const T* src, const int32* buf,
+                           const int32 ndims, T* dst) {
+  const int32* in_strides = buf;
+  const int32* out_strides = buf + ndims;
+  const int32* in_dim_sizes = buf + ndims * 2;
+  CUDA_1D_KERNEL_LOOP(o_idx, nthreads) {
+    int32 i_idx = 0;
+    int32 t = o_idx;
+    for (int i = 0; i < ndims; ++i) {
+      i_idx += t / out_strides[i] % in_dim_sizes[i] * in_strides[i];
+      t %= out_strides[i];
+    }
+    dst[o_idx] = ldg(src + i_idx);
+  }
+}
+
+template <typename Device, typename T>
+void TileSimple(const Device& d, Tensor* out, const Tensor& in) {
+  // Ensures we can use 32-bit index.
+  const int64 in_nelem = in.NumElements();
+  CHECK_LT(in_nelem, kint32max) << "Tensor too large to transpose on GPU";
+  const int64 out_nelem = out->NumElements();
+  CHECK_LT(out_nelem, kint32max) << "Tensor too large to transpose on GPU";
+  // Pack strides and input dimension sizes into one buffer.
+  const int32 ndims = in.dims();
+  gtl::InlinedVector<int32, 24> host_buf(ndims * 3);
+  gtl::InlinedVector<int32, 8> in_strides = ComputeStride<int32>(in.shape());
+  gtl::InlinedVector<int32, 8> out_strides = ComputeStride<int32>(out->shape());
+  for (int i = 0; i < ndims; ++i) {
+    host_buf[i] = in_strides[i];
+    host_buf[ndims + i] = out_strides[i];
+    host_buf[ndims * 2 + i] = in.dim_size(i);
+  }
+  // Copies the input strides, output strides and input dimension sizes to the device.
+  auto num_bytes = sizeof(int64) * host_buf.size();
+  auto dev_buf = d.allocate(num_bytes);
+  // NOTE: host_buf is not allocated by CudaHostAllocator, and
+  // therefore we are doing a sync copy effectively.
+  d.memcpyHostToDevice(dev_buf, host_buf.data(), num_bytes);
+  // Launch kernel to q[...] = p[...].
+  const T* p = in.flat<T>().data();
+  T* q = out->flat<T>().data();
+  CudaLaunchConfig cfg = GetCudaLaunchConfig(out_nelem, d);
+  TileKernel<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+      cfg.virtual_thread_count, p, reinterpret_cast<const int32*>(dev_buf),
+      ndims, q);
+  // Safe to deallocate immediately after the kernel launch.
+  d.deallocate(dev_buf);
+}
+
+}  // end namespace internal
+
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Register functors used for Tile functor.
+#define DEFINE_TYPE(T) template struct Tile<GPUDevice, T>;
+
+TF_CALL_int16(DEFINE_TYPE);
+TF_CALL_int32(DEFINE_TYPE);
+TF_CALL_int64(DEFINE_TYPE);
+TF_CALL_float(DEFINE_TYPE);
+TF_CALL_double(DEFINE_TYPE);
+TF_CALL_half(DEFINE_TYPE);
+TF_CALL_complex64(DEFINE_TYPE);
+TF_CALL_complex128(DEFINE_TYPE);
+
+#undef DEFINE_TYPE
+
+}  // end namespace functor
+}  // namespace tensorflow
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
index 7c72487d3f2d7595a5efd06e7b77e6e62ad1597d..f1da3c8afb422980ece39d9a1ad9ec1537d541fc 100644
--- a/tensorflow/core/kernels/tile_ops.cc
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -44,21 +44,12 @@ typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::SyclDevice SYCLDevice;
 #endif // TENSORFLOW_USE_SYCL
 
-// Forward declarations of functors that will be defined in
-// tile_ops_cpu_impl*.cc and tile_ops_gpu.cu.cc.
+// Forward declarations of functors that will be defined in tile_ops_impl.h
 namespace functor {
-template <typename Device, typename T, int NDIM>
-struct Tile {
-  void operator()(const Device& d, typename TTypes<T, NDIM>::Tensor out,
-                  typename TTypes<T, NDIM>::ConstTensor in,
-                  const Eigen::array<int32, NDIM>& broadcast_array) const;
-};
-
 template <typename Device, typename T>
-struct Tile<Device, T, 0> {
-  void operator()(const Device& d, typename TTypes<T, 0>::Tensor out,
-                  typename TTypes<T, 0>::ConstTensor in,
-                  const Eigen::array<int32, 0>&) const;
+struct Tile {
+  void operator()(const Device& d, Tensor* out, const Tensor& in,
+                  const gtl::ArraySlice<int32> broadcast_array) const;
 };
 
 template <typename Device, typename T, int NDIM>
@@ -134,21 +125,12 @@ class TileOp : public OpKernel {
     // If there's no output, there's nothing to do.
     if (output_shape.num_elements() == 0) return;
 
-#define HANDLE_DIM(DT, NDIM)                                   \
-  if (context->input(0).dtype() == DT && input_dims == NDIM) { \
-    HandleCase<DT, NDIM>(context, multiples_array, result);    \
+#define HANDLE_TYPE(DT)                                        \
+  if (context->input(0).dtype() == DT) {                       \
+    HandleCase<DT>(context, multiples_array, result);          \
     return;                                                    \
   }
 
-#define HANDLE_TYPE(T) \
-  HANDLE_DIM(T, 1)     \
-  HANDLE_DIM(T, 2)     \
-  HANDLE_DIM(T, 3)     \
-  HANDLE_DIM(T, 4)     \
-  HANDLE_DIM(T, 5)     \
-  HANDLE_DIM(T, 6)     \
-  HANDLE_DIM(T, 7)
-
 #define HANDLE_TYPE_NAME(T) HANDLE_TYPE(DataTypeToEnum<T>::value)
 
     // Invoke macro using TF_CALL_* so type-filtering for platform applies.
@@ -166,7 +148,6 @@ class TileOp : public OpKernel {
 
 #undef HANDLE_TYPE_NAME
 #undef HANDLE_TYPE
-#undef HANDLE_DIM
 
     OP_REQUIRES(context, false,
                 errors::Unimplemented(
@@ -175,21 +156,17 @@ class TileOp : public OpKernel {
   }
 
  private:
-  template <DataType DT, int NDIM>
+  template <DataType DT>
   void HandleCaseImpl(OpKernelContext* context,
                       const gtl::ArraySlice<int32>& multiples_array,
                       Tensor* result) {
     typedef typename EnumToDataType<DT>::Type T;
-    Eigen::array<int32, NDIM> broadcast_array;
-    for (int i = 0; i < NDIM; ++i) {
-      broadcast_array[i] = multiples_array[i];
-    }
-    functor::Tile<Device, T, NDIM>()(
-        context->eigen_device<Device>(), result->tensor<T, NDIM>(),
-        context->input(0).tensor<T, NDIM>(), broadcast_array);
+    functor::Tile<Device, T>() (
+        context->eigen_device<Device>(), result,
+        context->input(0), multiples_array);
   }
 
-  template <DataType DT, int NDIM>
+  template <DataType DT>
   void HandleCase(OpKernelContext* context,
                   const gtl::ArraySlice<int32>& multiples_array,
                   Tensor* result);
@@ -198,45 +175,35 @@ class TileOp : public OpKernel {
 };
 
 template <typename Device>
-template <DataType DT, int NDIM>
+template <DataType DT>
 inline void TileOp<Device>::HandleCase(
     OpKernelContext* context, const gtl::ArraySlice<int32>& multiples_array,
     Tensor* result) {
   // TODO(vrv): print out the device name if useful. Currently disabled to avoid
   // having to use RTTI.
-  LOG(FATAL) << "TileOp: Invalid combination of Device, DT and NDIM: "
+  LOG(FATAL) << "TileOp: Invalid combination of Device, DT: "
              // << typeid(Device).name() << ", "
-             << DataTypeString(DT) << ", " << NDIM;
+             << DataTypeString(DT);
 }
 
-#define HANDLE_CASE(device, T, dtype, ndim)                            \
+#define HANDLE_CASE(device, dtype)                                     \
   template <>                                                          \
   template <>                                                          \
-  void TileOp<device>::HandleCase<dtype, ndim>(                        \
+  void TileOp<device>::HandleCase<dtype>(                              \
       OpKernelContext * context,                                       \
       const gtl::ArraySlice<int32>& multiples_array, Tensor* result) { \
-    HandleCaseImpl<dtype, ndim>(context, multiples_array, result);     \
+    HandleCaseImpl<dtype>(context, multiples_array, result);           \
   }
 
-// 0-D handled above
-#define HANDLE_CASE_DIM(device, T, dtype) \
-  HANDLE_CASE(device, T, dtype, 1);       \
-  HANDLE_CASE(device, T, dtype, 2);       \
-  HANDLE_CASE(device, T, dtype, 3);       \
-  HANDLE_CASE(device, T, dtype, 4);       \
-  HANDLE_CASE(device, T, dtype, 5);       \
-  HANDLE_CASE(device, T, dtype, 6);       \
-  HANDLE_CASE(device, T, dtype, 7);
-
 #define HANDLE_TYPE_NAME_CPU(T) \
-  HANDLE_CASE_DIM(CPUDevice, T, DataTypeToEnum<T>::value);
+  HANDLE_CASE(CPUDevice, DataTypeToEnum<T>::value);
 
 #define HANDLE_TYPE_NAME_GPU(T) \
-  HANDLE_CASE_DIM(GPUDevice, T, DataTypeToEnum<T>::value);
+  HANDLE_CASE(GPUDevice, DataTypeToEnum<T>::value);
 
 #ifdef TENSORFLOW_USE_SYCL
 #define HANDLE_TYPE_NAME_SYCL(T) \
-  HANDLE_CASE_DIM(SYCLDevice, T, DataTypeToEnum<T>::value);
+  HANDLE_CASE(SYCLDevice, DataTypeToEnum<T>::value);
 #endif // TENSORFLOW_USE_SYCL
 
 TF_CALL_bool(HANDLE_TYPE_NAME_CPU);
@@ -275,7 +242,6 @@ TF_CALL_int64(HANDLE_TYPE_NAME_SYCL);
 #ifdef TENSORFLOW_USE_SYCL
 #undef HANDLE_TYPE_NAME_SYCL
 #endif // TENSORFLOW_USE_SYCL
-#undef HANDLE_CASE_DIM
 #undef HANDLE_CASE
 
 // --------------------------------------------------------------------------
diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h
index db3f046439102143a1cf1a64a8a7ecb9caf59c6f..a6eed4935d5c4a2aaa8618bab88998d4ce060ecb 100644
--- a/tensorflow/core/kernels/tile_ops_cpu_impl.h
+++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h
@@ -21,29 +21,11 @@ limitations under the License.
 #include "tensorflow/core/kernels/tile_ops_impl.h"
 
 namespace tensorflow {
+
 namespace functor {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-// Register functors used for TileOp.
-#define DEFINE_DIM(T, NDIM) template struct Tile<CPUDevice, T, NDIM>;
-#define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM)
-
-TF_CALL_bool(DEFINE_TYPE);
-TF_CALL_float(DEFINE_TYPE);
-TF_CALL_double(DEFINE_TYPE);
-TF_CALL_uint8(DEFINE_TYPE);
-TF_CALL_int32(DEFINE_TYPE);
-TF_CALL_int16(DEFINE_TYPE);
-TF_CALL_int64(DEFINE_TYPE);
-TF_CALL_half(DEFINE_TYPE);
-TF_CALL_complex64(DEFINE_TYPE);
-TF_CALL_complex128(DEFINE_TYPE);
-TF_CALL_string(DEFINE_TYPE);
-
-#undef DEFINE_DIM
-#undef DEFINE_TYPE
-
 // Register functors used for TileGradientOp.
 #define DEFINE_DIM(T, NDIM)                     \
   template struct TileGrad<CPUDevice, T, NDIM>; \
@@ -65,21 +47,6 @@ TF_CALL_complex128(DEFINE_TYPE);
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
 
-// Register functors used for TileOp.
-#define DEFINE_DIM(T, NDIM) template struct Tile<SYCLDevice, T, NDIM>;
-#define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM)
-
-TF_CALL_bool(DEFINE_TYPE);
-TF_CALL_float(DEFINE_TYPE);
-TF_CALL_double(DEFINE_TYPE);
-TF_CALL_uint8(DEFINE_TYPE);
-TF_CALL_int32(DEFINE_TYPE);
-TF_CALL_int16(DEFINE_TYPE);
-TF_CALL_int64(DEFINE_TYPE);
-
-#undef DEFINE_DIM
-#undef DEFINE_TYPE
-
 // Register functors used for TileGradientOp.
 #define DEFINE_DIM(T, NDIM)                      \
   template struct TileGrad<SYCLDevice, T, NDIM>; \
diff --git a/tensorflow/core/kernels/tile_ops_gpu_impl.h b/tensorflow/core/kernels/tile_ops_gpu_impl.h
index ed893811036b142eb4e5be4e5b67b733e8696dad..592f99e9b7b5c928c7e522b734186ab0225cd1d0 100644
--- a/tensorflow/core/kernels/tile_ops_gpu_impl.h
+++ b/tensorflow/core/kernels/tile_ops_gpu_impl.h
@@ -39,7 +39,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/tile_ops_impl.h"
 
 #define DEFINE_DIM(T, NDIM)                            \
-  template struct Tile<Eigen::GpuDevice, T, NDIM>;     \
   template struct TileGrad<Eigen::GpuDevice, T, NDIM>; \
   template struct ReduceAndReshape<Eigen::GpuDevice, T, NDIM, 1>;
 
diff --git a/tensorflow/core/kernels/tile_ops_impl.h b/tensorflow/core/kernels/tile_ops_impl.h
index c41e4bd74b0f1b35602298cc9a8fcdaa6aa5557e..9861717a0b81ef71faaf2720abb396a8ea20eac2 100644
--- a/tensorflow/core/kernels/tile_ops_impl.h
+++ b/tensorflow/core/kernels/tile_ops_impl.h
@@ -21,31 +21,8 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
-namespace functor {
 
-template <typename Device, typename T, int NDIM>
-struct Tile {
-  void operator()(const Device& d, typename TTypes<T, NDIM>::Tensor out,
-                  typename TTypes<T, NDIM>::ConstTensor in,
-                  const Eigen::array<int32, NDIM>& broadcast_array) const {
-    if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
-      // Use 32bit indexing to speed up the computations
-      To32Bit(out).device(d) = To32Bit(in).broadcast(broadcast_array);
-    } else {
-      out.device(d) = in.broadcast(broadcast_array);
-    }
-  }
-};
-
-template <typename Device, typename T>
-struct Tile<Device, T, 0> {
-  void operator()(const Device& d, typename TTypes<T, 0>::Tensor out,
-                  typename TTypes<T, 0>::ConstTensor in,
-                  const Eigen::array<int32, 0>&) const {
-    // In the scalar case we simply copy the input.
-    out.device(d) = in;
-  }
-};
+namespace functor {
 
 template <typename Device, typename T, int NDIM>
 struct TileGrad {
diff --git a/tensorflow/core/kernels/topk_op.cc b/tensorflow/core/kernels/topk_op.cc
index 630fcb76f36b618c8002b10616ec3d84a956e70b..05fee56335c0e3ec08f9244a24b2dfdf68a506c8 100644
--- a/tensorflow/core/kernels/topk_op.cc
+++ b/tensorflow/core/kernels/topk_op.cc
@@ -17,6 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
+#include "tensorflow/core/kernels/topk_op.h"
+
 #include <algorithm>
 #include <numeric>
 #include <vector>
@@ -25,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/gtl/top_n.h"
 #include "tensorflow/core/util/work_sharder.h"
 
@@ -33,7 +36,7 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-template <typename T>
+template <typename Device, typename T>
 class TopK : public OpKernel {
  public:
   explicit TopK(OpKernelConstruction* context) : OpKernel(context) {
@@ -82,7 +85,25 @@ class TopK : public OpKernel {
 
     auto values = values_out->flat_inner_dims<T>();
     auto indices = indices_out->flat_inner_dims<int32>();
+    Status s = functor::TopKFunctor<Device, T>::Compute(
+        context, sorted_, k, input, num_rows, num_cols, values, indices);
+    OP_REQUIRES_OK(context, s);
+  }
 
+ private:
+  int k_;
+  bool sorted_;
+};
+
+namespace functor {
+
+template <typename T>
+struct TopKFunctor<CPUDevice, T> {
+  static EIGEN_ALWAYS_INLINE Status
+  Compute(OpKernelContext* context, bool sorted, int k,
+          const typename TTypes<T, 2>::ConstTensor& input, const int64 num_rows,
+          const int64 num_cols, typename TTypes<T, 2>::Tensor values,
+          typename TTypes<int, 2>::Tensor indices) {
     const CPUDevice& d = context->eigen_device<CPUDevice>();
 
     // Special case for k == 1.
@@ -93,7 +114,7 @@ class TopK : public OpKernel {
       rows_by_one.set(0, num_rows);
 #else
       Eigen::array<int, 1> reduce_on_cols = {1};
-      Eigen::array<int, 1> rows_by_one = {static_cast<int>(num_rows), 1};
+      Eigen::array<int, 2> rows_by_one = {static_cast<int>(num_rows), 1};
 #endif
 
       values.device(d) =
@@ -108,7 +129,7 @@ class TopK : public OpKernel {
         }
       }
 
-      return;
+      return Status::OK();
     }
 
     auto SortIndices = [&, context](int start_batch, int limit_batch) {
@@ -117,7 +138,6 @@ class TopK : public OpKernel {
         const auto comp = [input_data](const int32 a, const int32 b) {
           return input_data[a] > input_data[b];
         };
-        gtl::TopN<int32, decltype(comp)> filter(k, comp);
         // TODO(ebrevdo): For large k < num_cols, instead of using
         // TopN, it may be faster to create a temporary vector of
         // values 0..num_cols - 1 and then use std::partial_sort_copy
@@ -130,13 +150,14 @@ class TopK : public OpKernel {
           std::sort(&indices(b, 0), &indices(b, k), comp);
         } else {
           // Use the TopN heap object to sort.
+          gtl::TopN<int32, decltype(comp)> filter(k, comp);
           filter.reserve(num_cols);
           for (int32 c = 0; c < num_cols; ++c) {
             filter.push(c);
           }
 
           int32 i = 0;
-          if (sorted_) {
+          if (sorted) {
             std::unique_ptr<std::vector<int32>> top_k(filter.Extract());
             for (auto top_k_it = top_k->begin(); top_k_it != top_k->end();
                  ++top_k_it, ++i) {
@@ -158,35 +179,75 @@ class TopK : public OpKernel {
 
     // Guesstimate of cost; 4*N*log(K) where N == num_cols.
     // If K == N, assume the cost is N*log(K + 1).
-    const int64 cmp_cost = 3 * Eigen::TensorOpCost::AddCost<int32>() +
-                           Eigen::TensorOpCost::AddCost<T>();
-    const int64 base_cost =
+    const double cmp_cost = 3 * Eigen::TensorOpCost::AddCost<int32>() +
+                            Eigen::TensorOpCost::AddCost<T>();
+    const double base_cost =
         cmp_cost *
-        static_cast<int64>(num_cols *
-                           Eigen::numext::log2(static_cast<float>(k + 1)));
-    const int64 sort_cost = (k == num_cols) ? base_cost : 4 * base_cost;
-    const int64 copy_cost = 2 * k * Eigen::TensorOpCost::AddCost<T>();
-    const int64 total_cost = sort_cost + copy_cost;
+        static_cast<double>(num_cols *
+                            Eigen::numext::log2(static_cast<float>(k + 1)));
+    const double sort_cost = (k == num_cols) ? base_cost : 4 * base_cost;
+    const double copy_cost = 2 * k * Eigen::TensorOpCost::AddCost<T>();
+    const double total_cost = sort_cost + copy_cost;
+    const int64 final_cost = (total_cost >= static_cast<double>(kint64max))
+                                 ? kint64max
+                                 : static_cast<int64>(total_cost);
     auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
     Shard(worker_threads.num_threads, worker_threads.workers, num_rows,
-          total_cost, SortIndices);
-  }
+          final_cost, SortIndices);
 
- private:
-  int k_;
-  bool sorted_;
+    return Status::OK();
+  }
 };
 
-#define REGISTER_KERNELS_NAME(name, type) \
-  REGISTER_KERNEL_BUILDER(                \
-      Name(#name).Device(DEVICE_CPU).TypeConstraint<type>("T"), TopK<type>)
+}  // namespace functor
+
+#define REGISTER_KERNELS_NAME(name, type)                       \
+  REGISTER_KERNEL_BUILDER(                                      \
+      Name(#name).Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      TopK<CPUDevice, type>)
 
 #define REGISTER_KERNELS(type)       \
   REGISTER_KERNELS_NAME(TopK, type); \
   REGISTER_KERNELS_NAME(TopKV2, type)
 
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
-#undef REGISTER_KERNELS_TO_NAME
+#undef REGISTER_KERNELS_NAME
 #undef REGISTER_KERNELS
 
-}  // namespace tensorflow
+#ifdef GOOGLE_CUDA
+
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                                  \
+  template <>                                                                \
+  Status TopKFunctor<GPUDevice, T>::Compute(                                 \
+      OpKernelContext* context, bool sorted, int k,                          \
+      const typename TTypes<T, 2>::ConstTensor& input, const int64 num_rows, \
+      const int64 num_cols, typename TTypes<T, 2>::Tensor values,            \
+      typename TTypes<int, 2>::Tensor indices);                              \
+  extern template struct functor::TopKFunctor<GPUDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
+TF_CALL_INTEGRAL_TYPES(DECLARE_GPU_SPEC);
+
+#undef DECLARE_GPU_SPEC
+
+}  // namespace functor
+
+#define REGISTER_KERNELS(type)                                   \
+  REGISTER_KERNEL_BUILDER(                                       \
+      Name("TopK").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+      TopK<GPUDevice, type>)                                     \
+  REGISTER_KERNEL_BUILDER(Name("TopKV2")                         \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("k"),                  \
+                          TopK<GPUDevice, type>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS);
+TF_CALL_INTEGRAL_TYPES(REGISTER_KERNELS);
+
+#undef REGISTER_KERNELS
+
+#endif  // end GOOGLE_CUDA
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/topk_op.h b/tensorflow/core/kernels/topk_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..a53e3ec8d4fb71337cedf9c8babcbc2685747279
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op.h
@@ -0,0 +1,42 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_TOPK_OP_H_
+#define TENSORFLOW_TOPK_OP_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <typename Device, typename T>
+struct TopKFunctor {
+  static Status Compute(OpKernelContext* context, bool sorted, int k,
+                        const typename TTypes<T, 2>::ConstTensor& input,
+                        const int64 num_rows, const int64 num_cols,
+                        typename TTypes<T, 2>::Tensor values,
+                        typename TTypes<int, 2>::Tensor indices);
+};
+
+}  // end namespace functor
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_TOPK_OP_H_
diff --git a/tensorflow/core/kernels/topk_op_gpu.cu.cc b/tensorflow/core/kernels/topk_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4b3f5ccacc439dc4c7fc6ae200d8fbcab12d0d57
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu.cu.cc
@@ -0,0 +1,574 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <cmath>
+#include <vector>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "external/cub_archive/cub/device/device_segmented_radix_sort.cuh"
+#include "external/cub_archive/cub/iterator/counting_input_iterator.cuh"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/lib/gtl/top_n.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+// Required for sorting Eigen::half
+namespace cub {
+template <>
+struct NumericTraits<Eigen::half>
+    : BaseTraits<FLOATING_POINT, true, false, unsigned short int, Eigen::half> {
+};
+}  // namespace cub
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace impl {
+
+enum class HeapType { kMinHeap, kMaxHeap };
+enum class PreferIndices { kLower, kHigher };
+
+template <typename T>
+struct Entry {
+  int index;
+  T value;
+
+  // Test-only.
+  static bool greater(const Entry<T>& a, const Entry<T>& b) {
+    if (a.value == b.value) {
+      return a.index < b.index;
+    }
+    return a.value > b.value;
+  }
+};
+
+template <typename T>
+struct LinearData {
+  typedef impl::Entry<T> Entry;
+
+  __device__ Entry& operator[](std::size_t index) const { return data[index]; }
+
+  __device__ int get_index(int i) const { return data[i].index; }
+  __device__ T get_value(int i) const { return data[i].value; }
+
+  Entry* const data;
+};
+
+template <typename T>
+struct IndirectLinearData {
+  typedef impl::Entry<T> Entry;
+
+  __device__ Entry& operator[](std::size_t index) const { return data[index]; }
+
+  __device__ int get_index(int i) const {
+    return backing_data[data[i].index].index;
+  }
+  __device__ T get_value(int i) const { return data[i].value; }
+
+  Entry* const data;
+  Entry* const backing_data;
+};
+
+#if GOOGLE_CUDA
+template <typename T>
+struct StridedData {
+  typedef impl::Entry<T> Entry;
+
+  __device__ Entry& operator[](std::size_t index) const {
+    return data[index * blockDim.x + threadIdx.x];
+  }
+
+  __device__ int get_index(int i) const { return (*this)[i].index; }
+  __device__ T get_value(int i) const { return (*this)[i].value; }
+
+  Entry* const data;
+};
+#endif
+
+// A heap of Entry<T> that can either work as a min-heap or as a max-heap.
+template <HeapType heapType, PreferIndices preferIndices,
+          template <typename> class Data, typename T>
+struct IndexedHeap {
+  typedef typename Data<T>::Entry Entry;
+  const Data<T> data;
+
+  __device__ bool is_above(int left, int right) {
+    T left_value = data.get_value(left);
+    T right_value = data.get_value(right);
+    if (left_value == right_value) {
+      if (preferIndices == PreferIndices::kLower) {
+        return data.get_index(left) < data.get_index(right);
+      } else {
+        return data.get_index(left) > data.get_index(right);
+      }
+    }
+    if (heapType == HeapType::kMinHeap) {
+      return left_value < right_value;
+    } else {
+      return left_value > right_value;
+    }
+  }
+
+  __device__ void assign(int i, const Entry& entry) { data[i] = entry; }
+
+  __device__ void push_up(int i) {
+    int child = i;
+    int parent;
+    for (; child > 0; child = parent) {
+      parent = (child - 1) / 2;
+      if (!is_above(child, parent)) {
+        // Heap property satisfied.
+        break;
+      }
+      swap(child, parent);
+    }
+  }
+
+  __device__ void swap(int a, int b) {
+    auto tmp = data[b];
+    data[b] = data[a];
+    data[a] = tmp;
+  }
+
+  __device__ void push_root_down(int k) { push_down(0, k); }
+
+  // MAX-HEAPIFY in Cormen
+  __device__ void push_down(int node, int k) {
+    while (true) {
+      const int left = 2 * node + 1;
+      const int right = left + 1;
+      int smallest = node;
+      if (left < k && is_above(left, smallest)) {
+        smallest = left;
+      }
+      if (right < k && is_above(right, smallest)) {
+        smallest = right;
+      }
+      if (smallest == node) {
+        break;
+      }
+      swap(smallest, node);
+      node = smallest;
+    }
+  }
+
+  // BUILD-MAX-HEAPIFY in Cormen
+  __device__ void build(int k) {
+    for (int node = (k - 1) / 2; node >= 0; node--) {
+      push_down(node, k);
+    }
+  }
+
+  // HEAP-EXTRACT-MAX in Cormen
+  __device__ void remove_root(int k) {
+    data[0] = data[k - 1];
+    push_root_down(k - 1);
+  }
+
+  // in-place HEAPSORT in Cormen
+  // This method destroys the heap property.
+  __device__ void sort(int k) {
+    for (int slot = k - 1; slot > 0; slot--) {
+      // This is like remove_root but we insert the element at the end.
+      swap(slot, 0);
+      // Heap is now an element smaller.
+      push_root_down(/*k=*/slot);
+    }
+  }
+
+  __device__ void replace_root(const Entry& entry, int k) {
+    data[0] = entry;
+    push_root_down(k);
+  }
+
+  __device__ const Entry& root() { return data[0]; }
+};
+
+template <HeapType heapType, PreferIndices preferIndices,
+          template <typename> class Data, typename T>
+__device__ IndexedHeap<heapType, preferIndices, Data, T> make_indexed_heap(
+    typename Data<T>::Entry* data) {
+  return IndexedHeap<heapType, preferIndices, Data, T>{Data<T>{data}};
+}
+
+// heapTopK walks over [input, input+length) with `step_size` stride starting at
+// `start_index`.
+// It builds a top-`k` heap that is stored in `heap_entries` using `Accessor` to
+// access elements in `heap_entries`. If sorted=true, the elements will be
+// sorted at the end.
+template <typename T, template <typename> class Data = LinearData>
+__device__ void heapTopK(const T* __restrict__ input, int length, int k,
+                         Entry<T>* __restrict__ heap_entries,
+                         bool sorted = false, int start_index = 0,
+                         int step_size = 1) {
+  assert(k <= length);
+
+  auto heap =
+      make_indexed_heap<HeapType::kMinHeap, PreferIndices::kHigher, Data, T>(
+          heap_entries);
+
+  int heap_end_index = start_index + k * step_size;
+  if (heap_end_index > length) {
+    heap_end_index = length;
+  }
+  // Initialize the min-heap.
+  for (int index = start_index, slot = 0; index < heap_end_index;
+       index += step_size, slot++) {
+    heap.assign(slot, {index, input[index]});
+  }
+
+  heap.build(k);
+
+  // Now iterate over the remaining items.
+  // If an item is smaller than the min element, it is not amongst the top k.
+  // Otherwise, replace the min element with it and push upwards.
+  for (int index = heap_end_index; index < length; index += step_size) {
+    // We prefer elements with lower indices. This is given here.
+    // Later elements automatically have higher indices, so can be discarded.
+    if (input[index] > heap.root().value) {
+      // This element should replace the min.
+      heap.replace_root({index, input[index]}, k);
+    }
+  }
+
+  // Sort if wanted.
+  if (sorted) {
+    heap.sort(k);
+  }
+}
+
+// mergeShards performs a top-k merge on `num_shards` many sorted streams that
+// are sorted and stored in `entries` in a strided way:
+// |s_1 1st|s_2 1st|...s_{num_shards} 1st|s_1 2nd|s_2 2nd|...
+// The overall top k elements are written to `top_k_values` and their indices
+// to top_k_indices.
+// `top_k_heap` is used as temporary storage for the merge heap.
+template <typename T>
+__device__ void mergeShards(int num_shards, int k,
+                            Entry<T>* __restrict__ entries,
+                            Entry<T>* __restrict__ top_k_heap, T* top_k_values,
+                            int* top_k_indices) {
+  // If k < num_shards, we can use a min-heap with k elements to get the top k
+  // of the sorted blocks.
+  // If k > num_shards, we can initialize a min-heap with the top element from
+  // each sorted block.
+  const int heap_size = k < num_shards ? k : num_shards;
+
+  // Min-heap part.
+  {
+    auto min_heap = IndexedHeap<HeapType::kMinHeap, PreferIndices::kHigher,
+                                IndirectLinearData, T>{
+        IndirectLinearData<T>{top_k_heap, entries}};
+    // Initialize the heap as a min-heap.
+    for (int slot = 0; slot < heap_size; slot++) {
+      min_heap.assign(slot, {slot, entries[slot].value});
+    }
+    min_heap.build(heap_size);
+
+    // Now perform top k with the remaining shards (if num_shards > heap_size).
+    for (int shard = heap_size; shard < num_shards; shard++) {
+      const auto entry = entries[shard];
+      const auto root = min_heap.root();
+      if (entry.value < root.value) {
+        continue;
+      }
+      if (entry.value == root.value &&
+          entry.index > entries[root.index].index) {
+        continue;
+      }
+      // This element should replace the min.
+      min_heap.replace_root({shard, entry.value}, heap_size);
+    }
+  }
+
+  // Max-part.
+  {
+    // Turn the min-heap into a max-heap in-place.
+    auto max_heap = IndexedHeap<HeapType::kMaxHeap, PreferIndices::kLower,
+                                IndirectLinearData, T>{
+        IndirectLinearData<T>{top_k_heap, entries}};
+    // Heapify into a max heap.
+    max_heap.build(heap_size);
+
+    // Now extract the minimum k-1 times.
+    // k is treated specially.
+    const int last_k = k - 1;
+    for (int rank = 0; rank < last_k; rank++) {
+      const Entry<T>& max_element = max_heap.root();
+      top_k_values[rank] = max_element.value;
+      int shard_index = max_element.index;
+      top_k_indices[rank] = entries[shard_index].index;
+      int next_shard_index = shard_index + num_shards;
+      // For rank < k-1, each top k heap still contains at least 1 element,
+      // so we can draw a replacement.
+      max_heap.replace_root({next_shard_index, entries[next_shard_index].value},
+                            heap_size);
+    }
+
+    // rank == last_k.
+    const Entry<T>& max_element = max_heap.root();
+    top_k_values[last_k] = max_element.value;
+    int shard_index = max_element.index;
+    top_k_indices[last_k] = entries[shard_index].index;
+  }
+}
+
+extern __shared__ char shared_memory[];
+
+template <typename T>
+__global__ void TopKKernel(const T* input, int length, int k, bool sorted,
+                           T* output, int* indices) {
+  const int batch_index = blockIdx.x;
+  const T* batch_input = input + batch_index * length;
+
+  const int thread_index = threadIdx.x;
+  const int thread_count = blockDim.x;
+
+  Entry<T>* shared_entries = (Entry<T>*)shared_memory;
+
+  heapTopK<T, StridedData>(batch_input, length, k, shared_entries, true,
+                           thread_index, thread_count);
+
+  __syncthreads();
+  if (thread_index == 0) {
+    const int offset = batch_index * k;
+    auto batch_output = output + offset;
+    auto batch_indices = indices + offset;
+    Entry<T>* top_k_heap = shared_entries + thread_count * k;
+
+    // TODO(blackhc): Erich says: Performance can likely be improved
+    // significantly by having the merge be done by multiple threads rather than
+    // just one.  ModernGPU has some nice primitives that could help with this.
+    mergeShards(thread_count, k, shared_entries, top_k_heap, batch_output,
+                batch_indices);
+  }
+}
+
+template <typename T>
+cudaError LaunchTopKKernel(const cudaStream_t& stream, int num_shards,
+                           const T* input, int batch_size, int length, int k,
+                           bool sorted, T* output, int* indices) {
+  // This code assumes that k is small enough that the computation
+  // fits inside shared memory (hard coded to 48KB).  In practice this
+  // means k <= 3072 for T=float/int32 and k <= 2048 for T=double/int64.
+  // The calculation is:
+  //   shared_memory_size / (2 * (sizeof(int) + sizeof(T))) < k.
+
+  // Use as many shards as possible.
+  if (num_shards <= 0) {
+    constexpr auto shared_memory_size = 48 << 10;  // 48 KB
+    const auto heap_size = k * (sizeof(int) + sizeof(T));
+    // shared_memory_size = (num_shards + 1) * heap_size <=>
+    num_shards = shared_memory_size / heap_size - 1;
+    if (num_shards <= 0) {
+      num_shards = 1;
+    }
+    auto shard_size = length / num_shards;
+    auto min_shard_size = 2 * k;
+    if (shard_size < min_shard_size) {
+      num_shards = length / min_shard_size;
+    }
+    if (num_shards <= 0) {
+      num_shards = 1;
+    } else if (num_shards > 1024) {
+      num_shards = 1024;
+    }
+  }
+  // We are limited by the amount of shared memory we have per block.
+  auto shared_memory_size = (num_shards + 1) * k * sizeof(Entry<T>);
+
+  TopKKernel<<<batch_size, num_shards, shared_memory_size, stream>>>(
+      input, length, k, sorted, output, indices);
+  return cudaGetLastError();
+}
+
+struct SegmentOffsetCreator {
+  SegmentOffsetCreator(int num_cols) : num_cols_(num_cols) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(
+      const Eigen::array<int, 1>& ix) const {
+    return ix[0] * num_cols_;
+  };
+  int num_cols_;
+};
+
+struct ColumnIndexCreator {
+  ColumnIndexCreator(int num_cols) : num_cols_(num_cols) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(
+      const Eigen::array<int, 1>& ix) const {
+    return ix[0] % num_cols_;
+  }
+
+  int num_cols_;
+};
+
+template <typename T>
+Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows,
+                        int num_cols, int k,
+                        typename TTypes<T, 2>::Tensor values,
+                        TTypes<int, 2>::Tensor indices) {
+  const GPUDevice& d = ctx->eigen_device<GPUDevice>();
+  const cudaStream_t& cu_stream = GetCudaStream(ctx);
+  size_t temp_storage_bytes = -1;
+
+  // TODO(ebrevdo): Once cub supports iterators for the ValueT and
+  // segment_offsets, replace these tensors with iterators that
+  // directly return the correct value.
+  Tensor input_indices;
+  TF_RETURN_IF_ERROR(ctx->allocate_temp(
+      DT_INT32, TensorShape({num_rows, num_cols}), &input_indices));
+  auto input_indices_t = To32Bit(input_indices.flat<int32>());
+  input_indices_t.device(d) =
+      input_indices_t.generate(ColumnIndexCreator(num_cols));
+
+  Tensor segment_offsets;
+  TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT32, TensorShape({num_rows + 1}),
+                                        &segment_offsets));
+  auto segment_offsets_t = To32Bit(segment_offsets.flat<int32>());
+  segment_offsets_t.device(d) =
+      segment_offsets_t.generate(SegmentOffsetCreator(num_cols));
+
+  Tensor temp_values;
+  Tensor temp_indices;
+  T* sorted_values_ptr;
+  int* sorted_indices_ptr;
+  if (k == num_cols) {
+    // Doing a full sort, no intermediate values needed.
+    sorted_values_ptr = values.data();
+    sorted_indices_ptr = indices.data();
+  } else {
+    // Need to create intermediate values for sorting.
+    TF_RETURN_IF_ERROR(ctx->allocate_temp(
+        DT_INT32, TensorShape({num_rows, num_cols}), &temp_indices));
+    TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum<T>::value,
+                                          TensorShape({num_rows, num_cols}),
+                                          &temp_values));
+    sorted_indices_ptr = temp_indices.flat<int32>().data();
+    sorted_values_ptr = temp_values.flat<T>().data();
+  }
+
+  auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
+      /* d_temp_storage */ nullptr,
+      /* temp_storage_bytes */ temp_storage_bytes,
+      /* d_keys_in */ input,
+      /* d_keys_out */ sorted_values_ptr,
+      /* d_values_in */ input_indices_t.data(),
+      /* d_values_out */ sorted_indices_ptr,
+      /* num_items */ num_cols * num_rows,
+      /* num_segments */ num_rows,
+      /* d_begin_offsets */ segment_offsets_t.data(),
+      /* d_end_offsets */ segment_offsets_t.data() + 1,
+      /* begin_bit */ 0,
+      /* end_bit */ sizeof(T) * 8,
+      /* stream */ cu_stream);
+  if (err != cudaSuccess) {
+    return errors::Internal(
+        "TopKOp: Could not launch "
+        "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate "
+        "temp_storage_bytes, status: ",
+        cudaGetErrorString(err));
+  }
+  Tensor temp_storage;
+  TF_RETURN_IF_ERROR(ctx->allocate_temp(
+      DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+      &temp_storage));
+  err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
+      /* d_temp_storage */ temp_storage.flat<int8>().data(),
+      /* temp_storage_bytes */ temp_storage_bytes,
+      /* d_keys_in */ input,
+      /* d_keys_out */ sorted_values_ptr,
+      /* d_values_in */ input_indices_t.data(),
+      /* d_values_out */ sorted_indices_ptr,
+      /* num_items */ num_cols * num_rows,
+      /* num_segments */ num_rows,
+      /* d_begin_offsets */ segment_offsets_t.data(),
+      /* d_end_offsets */ segment_offsets_t.data() + 1,
+      /* begin_bit */ 0,
+      /* end_bit */ sizeof(T) * 8,
+      /* stream */ cu_stream);
+  if (err != cudaSuccess) {
+    return errors::Internal(
+        "TopKOp: Could not launch "
+        "cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, "
+        "temp_storage_bytes: ",
+        temp_storage_bytes, ", status: ", cudaGetErrorString(err));
+  }
+  if (k < num_cols) {
+    // Need to copy subsets of sorted_indices and sorted_outputs to
+    // indices and outputs.
+    const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, 0};
+    const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, k};
+    To32Bit(indices).device(d) =
+        To32Bit(temp_indices.matrix<int32>()).slice(slice_indices, slice_sizes);
+    To32Bit(values).device(d) =
+        To32Bit(temp_values.matrix<T>()).slice(slice_indices, slice_sizes);
+  }
+  return Status::OK();
+}
+
+}  // end namespace impl
+
+namespace functor {
+
+template <typename T>
+struct TopKFunctor<GPUDevice, T> {
+  static EIGEN_ALWAYS_INLINE Status
+  Compute(OpKernelContext* context, bool sorted, int k,
+          const typename TTypes<T, 2>::ConstTensor& input, const int64 num_rows,
+          const int64 num_cols, typename TTypes<T, 2>::Tensor values,
+          typename TTypes<int, 2>::Tensor indices) {
+    // For small k, use the heap implementation.  For larger k, use
+    // the in-place cub sort.  For k == num_cols, always use the
+    // in-place cub sort.  The thresholds for n and k were determined
+    // empirically.
+    if (num_cols <= 1000 || k == num_cols || k >= 100) {
+      return impl::LaunchSortKernel(context, input.data(), num_rows, num_cols,
+                                    k, values, indices);
+    } else {
+      const cudaStream_t& cu_stream = GetCudaStream(context);
+      auto err = impl::LaunchTopKKernel(cu_stream, /* num_shards */ 0,
+                                        input.data(), num_rows, num_cols, k,
+                                        sorted, values.data(), indices.data());
+      if (err != cudaSuccess) {
+        return errors::Internal(
+            "Could not launch TopKKernel: ", cudaGetErrorString(err), ".");
+      } else {
+        return Status::OK();
+      }
+    }
+  }
+};
+
+}  // end namespace functor
+
+#define INSTANTIATE_TEMPLATE(type) \
+  template struct functor::TopKFunctor<GPUDevice, type>;
+
+TF_CALL_GPU_NUMBER_TYPES(INSTANTIATE_TEMPLATE);
+TF_CALL_INTEGRAL_TYPES(INSTANTIATE_TEMPLATE);
+#undef INSTANTIATE_TEMPLATE
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/training_op_helpers.cc b/tensorflow/core/kernels/training_op_helpers.cc
index 11d51188fcc21d5ba23f1583bee452b6ed22babe..01ea6877b1094e411bc19abe285f395dce3737d2 100644
--- a/tensorflow/core/kernels/training_op_helpers.cc
+++ b/tensorflow/core/kernels/training_op_helpers.cc
@@ -71,6 +71,7 @@ Status GetInputTensorFromVariable(OpKernelContext* ctx, int input,
   if (ctx->input_dtype(input) == DT_RESOURCE) {
     Var* var;
     if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) {
+      core::ScopedUnref unref_var(var);
       if (lock_held) {
         *out = *var->tensor();
       } else {
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index f6b6194f0abb10b24d90e3e935b8564d55d491b9..79654938d241a83645bf45c01d8e040368b8acdf 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -25,7 +25,7 @@ limitations under the License.
 
 #ifdef TENSORFLOW_USE_SYCL
 #include "tensorflow/core/common_runtime/sycl/sycl_util.h"
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace tensorflow {
 
@@ -40,7 +40,7 @@ inline T sgn(const T x) {
   T one(1);
   return (x == zero ? zero : (x < zero ? -one : one));
 }
-}
+}  // namespace
 
 namespace functor {
 template <typename T>
@@ -55,8 +55,8 @@ struct ApplyGradientDescent<CPUDevice, T> {
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct ApplyGradientDescentSYCL {
-  void operator()(const SYCLDevice& d, typename TTypes<T>::Flat var,
-                  T lr, typename TTypes<T>::ConstFlat grad) {
+  void operator()(const SYCLDevice& d, typename TTypes<T>::Flat var, T lr,
+                  typename TTypes<T>::ConstFlat grad) {
     var.device(d) -= grad * lr;
   }
 };
@@ -196,6 +196,47 @@ struct ApplyProximalAdagrad<CPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyFtrlV2<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat accum,
+                  typename TTypes<T>::Flat linear,
+                  typename TTypes<T>::ConstFlat grad,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar l1,
+                  typename TTypes<T>::ConstScalar l2,
+                  typename TTypes<T>::ConstScalar l2_shrinkage,
+                  typename TTypes<T>::ConstScalar lr_power) {
+    auto grad_with_shrinkage = grad + static_cast<T>(2) * l2_shrinkage() * var;
+    auto new_accum = accum + grad_with_shrinkage.square();
+    // special case for which lr_power=-0.5.
+    if (lr_power() == static_cast<T>(-0.5)) {
+      linear.device(d) +=
+          grad_with_shrinkage - (new_accum.sqrt() - accum.sqrt()) / lr() * var;
+    } else {
+      linear.device(d) +=
+          grad_with_shrinkage -
+          (new_accum.pow(-lr_power()) - accum.pow(-lr_power())) / lr() * var;
+    }
+    auto x = (linear.constant(l1()) * linear.sign() - linear);
+    if (lr_power() == static_cast<T>(-0.5)) {
+      auto y = new_accum.sqrt() / new_accum.constant(lr()) +
+               linear.constant(static_cast<T>(2) * l2());
+      auto pre_shrink = x / y;
+      var.device(d) = (linear.abs() > linear.constant(l1()))
+                          .select(pre_shrink, var.constant(static_cast<T>(0)));
+
+    } else {
+      auto y = new_accum.pow(-lr_power()) / new_accum.constant(lr()) +
+               linear.constant(static_cast<T>(2) * l2());
+      auto pre_shrink = x / y;
+      var.device(d) = (linear.abs() > linear.constant(l1()))
+                          .select(pre_shrink, var.constant(static_cast<T>(0)));
+    }
+    accum.device(d) += grad_with_shrinkage.square();
+  }
+};
+
 template <typename T>
 struct ApplyFtrl<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
@@ -284,16 +325,16 @@ template <typename T>
 struct ApplyAdamSYCL {
   void operator()(const SYCLDevice& d, typename TTypes<T>::Flat var,
                   typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
-                  T beta1_power, T beta2_power, T lr, T beta1, T beta2, T epsilon,
-                  typename TTypes<T>::ConstFlat grad) {
-    const T alpha = lr * Eigen::numext::sqrt(T(1) - beta2_power) /
-                    (T(1) - beta1_power);
+                  T beta1_power, T beta2_power, T lr, T beta1, T beta2,
+                  T epsilon, typename TTypes<T>::ConstFlat grad) {
+    const T alpha =
+        lr * Eigen::numext::sqrt(T(1) - beta2_power) / (T(1) - beta1_power);
     m.device(d) += (grad - m) * (T(1) - beta1);
     v.device(d) += (grad.square() - v) * (T(1) - beta2);
     var.device(d) -= (m * alpha) / (v.sqrt() + epsilon);
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename T>
 struct ApplyAdam<CPUDevice, T> : ApplyAdamNonCuda<CPUDevice, T> {};
@@ -334,7 +375,6 @@ struct ApplyCenteredRMSProp<CPUDevice, T> {
 
 }  // namespace functor
 
-
 template <typename Device, typename T>
 class ApplyGradientDescentOp : public OpKernel {
  public:
@@ -352,7 +392,7 @@ class ApplyGradientDescentOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     const Tensor& alpha = ctx->input(1);
     OP_REQUIRES(ctx, IsLegacyScalar(alpha.shape()),
                 errors::InvalidArgument("alpha is not a scalar: ",
@@ -377,21 +417,23 @@ class ApplyGradientDescentOp : public OpKernel {
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-class ApplyGradientDescentOp < SYCLDevice, T > : public OpKernel {
+class ApplyGradientDescentOp<SYCLDevice, T> : public OpKernel {
  public:
   explicit ApplyGradientDescentOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     const Tensor& alpha_dev = ctx->input(1);
     OP_REQUIRES(ctx, IsLegacyScalar(alpha_dev.shape()),
                 errors::InvalidArgument("alpha is not a scalar: ",
@@ -407,10 +449,10 @@ class ApplyGradientDescentOp < SYCLDevice, T > : public OpKernel {
     auto size = sizeof(T);
     T alpha = T(0);
     auto src_ptr = GetBase(&alpha_dev);
-    device.memcpyDeviceToHost(&alpha, static_cast<const T *>(src_ptr), size);
+    device.memcpyDeviceToHost(&alpha, static_cast<const T*>(src_ptr), size);
 
-    functor::ApplyGradientDescentSYCL<T>()(device, var.flat<T>(),
-        alpha, delta.flat<T>());
+    functor::ApplyGradientDescentSYCL<T>()(device, var.flat<T>(), alpha,
+                                           delta.flat<T>());
 
     MaybeForwardRefInputToRefOutput(ctx, 0, 0);
   }
@@ -418,7 +460,7 @@ class ApplyGradientDescentOp < SYCLDevice, T > : public OpKernel {
  private:
   bool use_exclusive_lock_;
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_KERNELS(D, T)                                                \
   REGISTER_KERNEL_BUILDER(                                                    \
@@ -461,7 +503,7 @@ REGISTER_KERNELS(GPU, double);
 TF_CALL_float(REGISTER_SYCL_KERNELS);
 TF_CALL_double(REGISTER_SYCL_KERNELS);
 #undef REGISTER_SYCL_KERNELS
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
@@ -469,18 +511,21 @@ TF_CALL_double(REGISTER_SYCL_KERNELS);
 template <typename Device, typename T>
 class ApplyDelayCompensatedGradientDescentOp : public OpKernel {
  public:
-  explicit ApplyDelayCompensatedGradientDescentOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+  explicit ApplyDelayCompensatedGradientDescentOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 4});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 4});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     const Tensor& alpha = ctx->input(1);
     OP_REQUIRES(ctx, IsLegacyScalar(alpha.shape()),
                 errors::InvalidArgument("alpha is not a scalar: ",
@@ -496,7 +541,8 @@ class ApplyDelayCompensatedGradientDescentOp : public OpKernel {
                 errors::InvalidArgument("lambda is not a scalar: ",
                                         lambda.shape().DebugString()));
     Tensor shadow;
-    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 4, use_exclusive_lock_, &shadow));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 4, use_exclusive_lock_, &shadow));
     OP_REQUIRES(
         ctx, shadow.shape().IsSameSize(var.shape()),
         errors::InvalidArgument("shadow and var do not have the same shape",
@@ -506,8 +552,7 @@ class ApplyDelayCompensatedGradientDescentOp : public OpKernel {
     const Device& device = ctx->template eigen_device<Device>();
     functor::ApplyDelayCompensatedGradientDescent<Device, T>()(
         device, var.flat<T>(), alpha.scalar<T>(), delta.flat<T>(),
-        lambda.scalar<T>(), shadow.flat<T>()
-    );
+        lambda.scalar<T>(), shadow.flat<T>());
 
     MaybeForwardRefInputToRefOutput(ctx, 0, 0);
   }
@@ -516,13 +561,13 @@ class ApplyDelayCompensatedGradientDescentOp : public OpKernel {
   bool use_exclusive_lock_;
 };
 
-#define REGISTER_KERNELS(D, T)                                 \
-  REGISTER_KERNEL_BUILDER(                                     \
-      Name("ApplyDelayCompensatedGradientDescent")             \
-          .Device(DEVICE_##D)                                  \
-          .HostMemory("var")                                   \
-          .HostMemory("shadow")                                \
-          .TypeConstraint<T>("T"),                             \
+#define REGISTER_KERNELS(D, T)                     \
+  REGISTER_KERNEL_BUILDER(                         \
+      Name("ApplyDelayCompensatedGradientDescent") \
+          .Device(DEVICE_##D)                      \
+          .HostMemory("var")                       \
+          .HostMemory("shadow")                    \
+          .TypeConstraint<T>("T"),                 \
       ApplyDelayCompensatedGradientDescentOp<D##Device, T>);
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
@@ -575,15 +620,15 @@ class ApplyAdadeltaOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, accum.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(1)));
+            "Attempting to use uninitialized variables: ", requested_input(1)));
     OP_REQUIRES(
         ctx, accum_update.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(2)));
+            "Attempting to use uninitialized variables: ", requested_input(2)));
 
     const Tensor& lr = ctx->input(3);
     const Tensor& rho = ctx->input(4);
@@ -711,15 +756,15 @@ class SparseApplyAdadeltaOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, accum_grad.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(1)));
+            "Attempting to use uninitialized variables: ", requested_input(1)));
     OP_REQUIRES(
         ctx, accum_update.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(2)));
+            "Attempting to use uninitialized variables: ", requested_input(2)));
     OP_REQUIRES(
         ctx, var.shape().IsSameSize(accum_grad.shape()),
         errors::InvalidArgument("var and accum_grad do not have the same shape",
@@ -851,7 +896,7 @@ class ApplyProximalGradientDescentOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     const Tensor& alpha = ctx->input(1);
     OP_REQUIRES(ctx, IsLegacyScalar(alpha.shape()),
                 errors::InvalidArgument("alpha is not a scalar: ",
@@ -1066,11 +1111,11 @@ class ApplyAdagradOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, accum.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(1)));
+            "Attempting to use uninitialized variables: ", requested_input(1)));
     const Tensor& lr = ctx->input(2);
     OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()),
                 errors::InvalidArgument("lr is not a scalar: ",
@@ -1159,11 +1204,11 @@ class ApplyProximalAdagradOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, accum.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(1)));
+            "Attempting to use uninitialized variables: ", requested_input(1)));
     OP_REQUIRES(
         ctx, var.shape().IsSameSize(accum.shape()),
         errors::InvalidArgument("var and accum do not have the same shape",
@@ -1266,11 +1311,11 @@ class SparseApplyAdagradOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, accum.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(1)));
+            "Attempting to use uninitialized variables: ", requested_input(1)));
     OP_REQUIRES(
         ctx, var.shape().IsSameSize(accum.shape()),
         errors::InvalidArgument("var and accum do not have the same shape",
@@ -1400,11 +1445,11 @@ class SparseApplyProximalAdagradOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, accum.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(1)));
+            "Attempting to use uninitialized variables: ", requested_input(1)));
     OP_REQUIRES(
         ctx, var.shape().IsSameSize(accum.shape()),
         errors::InvalidArgument("var and accum do not have the same shape",
@@ -1575,15 +1620,15 @@ class ApplyAdagradDAOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, gradient_accum.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(1)));
+            "Attempting to use uninitialized variables: ", requested_input(1)));
     OP_REQUIRES(
         ctx, gradient_squared_accum.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(2)));
+            "Attempting to use uninitialized variables: ", requested_input(2)));
     OP_REQUIRES(
         ctx, var.shape().IsSameSize(gradient_accum.shape()),
         errors::InvalidArgument("var and accum do not have the same shape",
@@ -1677,15 +1722,15 @@ class SparseApplyAdagradDAOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, gradient_accum.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(1)));
+            "Attempting to use uninitialized variables: ", requested_input(1)));
     OP_REQUIRES(
         ctx, gradient_squared_accum.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(2)));
+            "Attempting to use uninitialized variables: ", requested_input(2)));
     OP_REQUIRES(
         ctx, var.shape().IsSameSize(gradient_accum.shape()),
         errors::InvalidArgument("var and accum do not have the same shape",
@@ -1851,7 +1896,7 @@ REGISTER_KERNELS(double, int32);
 REGISTER_KERNELS(double, int64);
 #undef REGISTER_KERNELS
 
-template <typename Device, typename T>
+template <typename Device, typename T, bool has_l2_shrinkage>
 class ApplyFtrlOp : public OpKernel {
  public:
   explicit ApplyFtrlOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -1874,15 +1919,15 @@ class ApplyFtrlOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, accum.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(1)));
+            "Attempting to use uninitialized variables: ", requested_input(1)));
     OP_REQUIRES(
         ctx, linear.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(2)));
+            "Attempting to use uninitialized variables: ", requested_input(2)));
 
     const Tensor& grad = ctx->input(3);
     OP_REQUIRES(
@@ -1921,7 +1966,8 @@ class ApplyFtrlOp : public OpKernel {
                 errors::InvalidArgument("l2 regularization strength is not a "
                                         "non-negative scalar: ",
                                         l2.shape().DebugString()));
-    const Tensor& lr_power = ctx->input(7);
+    const int lr_power_index = has_l2_shrinkage ? 8 : 7;
+    const Tensor& lr_power = ctx->input(lr_power_index);
     OP_REQUIRES(ctx,
                 TensorShapeUtils::IsScalar(lr_power.shape()) &&
                     lr_power.scalar<T>()() <= static_cast<T>(0),
@@ -1930,10 +1976,25 @@ class ApplyFtrlOp : public OpKernel {
                                         lr_power.shape().DebugString()));
 
     const Device& device = ctx->template eigen_device<Device>();
-    functor::ApplyFtrl<Device, T>()(device, var.flat<T>(), accum.flat<T>(),
-                                    linear.flat<T>(), grad.flat<T>(),
-                                    lr.scalar<T>(), l1.scalar<T>(),
-                                    l2.scalar<T>(), lr_power.scalar<T>());
+    if (has_l2_shrinkage) {
+      const Tensor& l2_shrinkage = ctx->input(7);
+      OP_REQUIRES(
+          ctx,
+          TensorShapeUtils::IsScalar(l2_shrinkage.shape()) &&
+              l2_shrinkage.scalar<T>()() >= static_cast<T>(0),
+          errors::InvalidArgument("l2 shrinkage regularization strength "
+                                  "is not a non-negative scalar: ",
+                                  l2_shrinkage.shape().DebugString()));
+      functor::ApplyFtrlV2<Device, T>()(
+          device, var.flat<T>(), accum.flat<T>(), linear.flat<T>(),
+          grad.flat<T>(), lr.scalar<T>(), l1.scalar<T>(), l2.scalar<T>(),
+          l2_shrinkage.scalar<T>(), lr_power.scalar<T>());
+    } else {
+      functor::ApplyFtrl<Device, T>()(device, var.flat<T>(), accum.flat<T>(),
+                                      linear.flat<T>(), grad.flat<T>(),
+                                      lr.scalar<T>(), l1.scalar<T>(),
+                                      l2.scalar<T>(), lr_power.scalar<T>());
+    }
 
     MaybeForwardRefInputToRefOutput(ctx, 0, 0);
   }
@@ -1948,14 +2009,36 @@ using GPUDevice = Eigen::GpuDevice;
 #define REGISTER_KERNELS(D, T)                                     \
   REGISTER_KERNEL_BUILDER(                                         \
       Name("ApplyFtrl").Device(DEVICE_##D).TypeConstraint<T>("T"), \
-      ApplyFtrlOp<D##Device, T>);                                  \
-  REGISTER_KERNEL_BUILDER(Name("ResourceApplyFtrl")                \
-                              .HostMemory("var")                   \
-                              .HostMemory("accum")                 \
-                              .HostMemory("linear")                \
-                              .Device(DEVICE_##D)                  \
-                              .TypeConstraint<T>("T"),             \
-                          ApplyFtrlOp<D##Device, T>);
+      ApplyFtrlOp<D##Device, T, /*has_l2_shrinkage=*/false>);      \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("ResourceApplyFtrl")                                    \
+          .HostMemory("var")                                       \
+          .HostMemory("accum")                                     \
+          .HostMemory("linear")                                    \
+          .Device(DEVICE_##D)                                      \
+          .TypeConstraint<T>("T"),                                 \
+      ApplyFtrlOp<D##Device, T, /*has_l2_shrinkage=*/false>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
+#define REGISTER_KERNELS(D, T)                                       \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("ApplyFtrlV2").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+      ApplyFtrlOp<D##Device, T, /*has_l2_shrinkage=*/true>);         \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("ResourceApplyFtrlV2")                                    \
+          .HostMemory("var")                                         \
+          .HostMemory("accum")                                       \
+          .HostMemory("linear")                                      \
+          .Device(DEVICE_##D)                                        \
+          .TypeConstraint<T>("T"),                                   \
+      ApplyFtrlOp<D##Device, T, /*has_l2_shrinkage=*/true>);
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
@@ -1966,7 +2049,7 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #undef REGISTER_KERNELS
 
 // Note, this op works on cpu only.
-template <typename Device, typename T, typename Tindex>
+template <typename Device, typename T, typename Tindex, bool has_l2_shrinkage>
 class SparseApplyFtrlOp : public OpKernel {
  public:
   explicit SparseApplyFtrlOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -1988,15 +2071,15 @@ class SparseApplyFtrlOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, accum.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(1)));
+            "Attempting to use uninitialized variables: ", requested_input(1)));
     OP_REQUIRES(
         ctx, linear.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(2)));
+            "Attempting to use uninitialized variables: ", requested_input(2)));
     OP_REQUIRES(
         ctx, var.shape().IsSameSize(accum.shape()),
         errors::InvalidArgument("var and accum do not have the same shape",
@@ -2036,14 +2119,14 @@ class SparseApplyFtrlOp : public OpKernel {
                 errors::InvalidArgument("l2 regularization strength is not a "
                                         "non-negative scalar: ",
                                         l2.shape().DebugString()));
-    const Tensor& lr_power = ctx->input(8);
+    const int lr_power_index = has_l2_shrinkage ? 9 : 8;
+    const Tensor& lr_power = ctx->input(lr_power_index);
     OP_REQUIRES(ctx,
                 TensorShapeUtils::IsScalar(lr_power.shape()) &&
                     lr_power.scalar<T>()() <= static_cast<T>(0),
                 errors::InvalidArgument("lr_power is not a "
                                         "non-positive scalar: ",
                                         lr_power.shape().DebugString()));
-
     int64 inner_dim = 1;
     for (int d = 1; d < var.dims(); d++) {
       OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d),
@@ -2061,6 +2144,18 @@ class SparseApplyFtrlOp : public OpKernel {
                 errors::InvalidArgument(
                     "Inner dimension should be greater than zero."));
 
+    const Tensor* l2_shrinkage;
+    if (has_l2_shrinkage) {
+      l2_shrinkage = &ctx->input(8);
+      OP_REQUIRES(
+          ctx,
+          TensorShapeUtils::IsScalar(l2_shrinkage->shape()) &&
+              l2_shrinkage->scalar<T>()() >= static_cast<T>(0),
+          errors::InvalidArgument("l2 shrinkage regularization strength "
+                                  "is not a non-negative scalar: ",
+                                  l2_shrinkage->shape().DebugString()));
+    }
+
     if (N > 0) {
       if (inner_dim > 1) {
         const Tindex first_dim_size = var.dim_size(0);
@@ -2072,6 +2167,10 @@ class SparseApplyFtrlOp : public OpKernel {
         T lr_scalar = lr.scalar<T>()();
         T l1_scalar = l1.scalar<T>()();
         T l2_scalar = l2.scalar<T>()();
+        T l2_shrinkage_scalar;
+        if (has_l2_shrinkage) {
+          l2_shrinkage_scalar = l2_shrinkage->scalar<T>()();
+        }
         T lr_power_scalar = lr_power.scalar<T>()();
 
         for (Tindex i = 0; i < N; i++) {
@@ -2085,41 +2184,56 @@ class SparseApplyFtrlOp : public OpKernel {
           auto grad = grad_flat.template chip<0>(i);
           auto var = var_flat.template chip<0>(index);
 
-          auto new_accum = accum + grad.square();
-          if (lr_power_scalar == static_cast<T>(-0.5)) {
-            linear +=
-                grad - (new_accum.sqrt() - accum.sqrt()) / lr_scalar * var;
-          } else {
-            linear += grad -
-                      (new_accum.pow(-lr_power_scalar) -
-                       accum.pow(-lr_power_scalar)) /
-                          lr_scalar * var;
-          }
-          auto x = (linear.constant(l1_scalar) * linear.sign() - linear);
-          if (lr_power_scalar == static_cast<T>(-0.5)) {
-            auto y = new_accum.sqrt() / new_accum.constant(lr_scalar) +
-                     linear.constant(static_cast<T>(2) * l2_scalar);
-            var = x / y;
+// Use a macro to implement the computation here due to the templating of the
+// eigen tensor library.
+#define COMPUTE_FTRL(grad_to_use)                                              \
+  auto new_accum = accum + grad_to_use.square();                               \
+  if (lr_power_scalar == static_cast<T>(-0.5)) {                               \
+    linear +=                                                                  \
+        grad_to_use - (new_accum.sqrt() - accum.sqrt()) / lr_scalar * var;     \
+  } else {                                                                     \
+    linear += grad_to_use - (new_accum.pow(-lr_power_scalar) -                 \
+                             accum.pow(-lr_power_scalar)) /                    \
+                                lr_scalar * var;                               \
+  }                                                                            \
+  auto x = (linear.constant(l1_scalar) * linear.sign() - linear);              \
+  if (lr_power_scalar == static_cast<T>(-0.5)) {                               \
+    auto y = new_accum.sqrt() / new_accum.constant(lr_scalar) +                \
+             linear.constant(static_cast<T>(2) * l2_scalar);                   \
+    var = x / y;                                                               \
+  } else {                                                                     \
+    auto y = new_accum.pow(-lr_power_scalar) / new_accum.constant(lr_scalar) + \
+             linear.constant(static_cast<T>(2) * l2_scalar);                   \
+    var = x / y;                                                               \
+  }                                                                            \
+  var = (linear.abs() > linear.constant(l1_scalar))                            \
+            .select(var, var.constant(static_cast<T>(0)));                     \
+  accum += grad_to_use.square();
+
+          if (has_l2_shrinkage) {
+            auto grad_with_shrinkage =
+                grad + static_cast<T>(2) * l2_shrinkage_scalar * var;
+            COMPUTE_FTRL(grad_with_shrinkage);
           } else {
-            auto y = new_accum.pow(-lr_power_scalar) /
-                         new_accum.constant(lr_scalar) +
-                     linear.constant(static_cast<T>(2) * l2_scalar);
-            var = x / y;
+            COMPUTE_FTRL(grad);
           }
-          var = (linear.abs() > linear.constant(l1_scalar))
-                    .select(var, var.constant(static_cast<T>(0)));
-          accum += grad.square();
         }
+#undef COMPUTE_FTRL
       } else {
+        T lr_scalar = lr.scalar<T>()();
+        T l1_scalar = l1.scalar<T>()();
+        T l2_scalar = l2.scalar<T>()();
+        T lr_power_scalar = lr_power.scalar<T>()();
+        T l2_shrinkage_scalar;
+        if (has_l2_shrinkage) {
+          l2_shrinkage_scalar = l2_shrinkage->scalar<T>()();
+        }
+
         auto indices_vec = indices.vec<Tindex>();
         auto var_flat = var.flat<T>();
         auto accum_flat = accum.flat<T>();
         auto linear_flat = linear.flat<T>();
         auto grad_flat = grad.flat<T>();
-        T lr_scalar = lr.scalar<T>()();
-        T l1_scalar = l1.scalar<T>()();
-        T l2_scalar = l2.scalar<T>()();
-        T lr_power_scalar = lr_power.scalar<T>()();
         const Tindex first_dim_size = accum_flat.size();
 
         for (Tindex i = 0; i < N; i++) {
@@ -2131,7 +2245,13 @@ class SparseApplyFtrlOp : public OpKernel {
           T& a = accum_flat(index);
           T& l = linear_flat(index);
           T& v = var_flat(index);
-          const T& g = grad_flat(i);
+          T g;
+          if (has_l2_shrinkage) {
+            g = grad_flat(i) +
+                (static_cast<T>(2) * l2_shrinkage_scalar * var_flat(i));
+          } else {
+            g = grad_flat(i);
+          }
 
           T updated_a = a + g * g;
           using Eigen::numext::pow;
@@ -2153,17 +2273,43 @@ class SparseApplyFtrlOp : public OpKernel {
   bool use_exclusive_lock_;
 };
 
-#define REGISTER_KERNELS(T, Tindices)                                 \
-  REGISTER_KERNEL_BUILDER(Name("SparseApplyFtrl")                     \
-                              .Device(DEVICE_CPU)                     \
-                              .TypeConstraint<T>("T")                 \
-                              .TypeConstraint<Tindices>("Tindices"),  \
-                          SparseApplyFtrlOp<CPUDevice, T, Tindices>); \
-  REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyFtrl")             \
-                              .Device(DEVICE_CPU)                     \
-                              .TypeConstraint<T>("T")                 \
-                              .TypeConstraint<Tindices>("Tindices"),  \
-                          SparseApplyFtrlOp<CPUDevice, T, Tindices>);
+#define REGISTER_KERNELS(T, Tindices)                                         \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("SparseApplyFtrl")                                                 \
+          .Device(DEVICE_CPU)                                                 \
+          .TypeConstraint<T>("T")                                             \
+          .TypeConstraint<Tindices>("Tindices"),                              \
+      SparseApplyFtrlOp<CPUDevice, T, Tindices, /*has_l2_shrinkage=*/false>); \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("ResourceSparseApplyFtrl")                                         \
+          .Device(DEVICE_CPU)                                                 \
+          .TypeConstraint<T>("T")                                             \
+          .TypeConstraint<Tindices>("Tindices"),                              \
+      SparseApplyFtrlOp<CPUDevice, T, Tindices, /*has_l2_shrinkage=*/false>);
+#define REGISTER_CPU_KERNELS(T) \
+  REGISTER_KERNELS(T, int32);   \
+  REGISTER_KERNELS(T, int64);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
+#define REGISTER_KERNELS(T, Tindices)                                        \
+  REGISTER_KERNEL_BUILDER(                                                   \
+      Name("SparseApplyFtrlV2")                                              \
+          .Device(DEVICE_CPU)                                                \
+          .TypeConstraint<T>("T")                                            \
+          .TypeConstraint<Tindices>("Tindices"),                             \
+      SparseApplyFtrlOp<CPUDevice, T, Tindices, /*has_l2_shrinkage=*/true>); \
+  REGISTER_KERNEL_BUILDER(                                                   \
+      Name("ResourceSparseApplyFtrlV2")                                      \
+          .Device(DEVICE_CPU)                                                \
+          .TypeConstraint<T>("T")                                            \
+          .TypeConstraint<Tindices>("Tindices"),                             \
+      SparseApplyFtrlOp<CPUDevice, T, Tindices, /*has_l2_shrinkage=*/true>);
 #define REGISTER_CPU_KERNELS(T) \
   REGISTER_KERNELS(T, int32);   \
   REGISTER_KERNELS(T, int64);
@@ -2196,11 +2342,11 @@ class ApplyMomentumOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, accum.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(1)));
+            "Attempting to use uninitialized variables: ", requested_input(1)));
     const Tensor& lr = ctx->input(2);
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
                 errors::InvalidArgument("lr is not a scalar: ",
@@ -2299,11 +2445,11 @@ class SparseApplyMomentumOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, accum.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(1)));
+            "Attempting to use uninitialized variables: ", requested_input(1)));
     OP_REQUIRES(
         ctx, var.shape().IsSameSize(accum.shape()),
         errors::InvalidArgument("var and accum do not have the same shape",
@@ -2419,15 +2565,15 @@ class ApplyAdamOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, m.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(1)));
+            "Attempting to use uninitialized variables: ", requested_input(1)));
     OP_REQUIRES(
         ctx, v.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(2)));
+            "Attempting to use uninitialized variables: ", requested_input(2)));
 
     const Tensor& beta1_power = ctx->input(3);
     const Tensor& beta2_power = ctx->input(4);
@@ -2487,33 +2633,37 @@ class ApplyAdamOp : public OpKernel {
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-class ApplyAdamOp < SYCLDevice, T> : public OpKernel {
+class ApplyAdamOp<SYCLDevice, T> : public OpKernel {
  public:
   explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2});
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2});
 
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor m;
-    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &m));
+    OP_REQUIRES_OK(ctx,
+                   GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &m));
     Tensor v;
-    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_, &v));
+    OP_REQUIRES_OK(ctx,
+                   GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_, &v));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, m.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(1)));
+            "Attempting to use uninitialized variables: ", requested_input(1)));
     OP_REQUIRES(
         ctx, v.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(2)));
+            "Attempting to use uninitialized variables: ", requested_input(2)));
 
     const Tensor& beta1_power_dev = ctx->input(3);
     const Tensor& beta2_power_dev = ctx->input(4);
@@ -2532,23 +2682,24 @@ class ApplyAdamOp < SYCLDevice, T> : public OpKernel {
     auto device = ctx->eigen_sycl_device();
     auto size = sizeof(T);
     auto src_ptr = GetBase(&beta1_power_dev);
-    device.memcpyDeviceToHost(&beta1_power, static_cast<const T *>(src_ptr), size);
+    device.memcpyDeviceToHost(&beta1_power, static_cast<const T*>(src_ptr),
+                              size);
 
     src_ptr = GetBase(&beta2_power_dev);
-    device.memcpyDeviceToHost(&beta2_power, static_cast<const T *>(src_ptr), size);
+    device.memcpyDeviceToHost(&beta2_power, static_cast<const T*>(src_ptr),
+                              size);
 
     src_ptr = GetBase(&lr_dev);
-    device.memcpyDeviceToHost(&lr, static_cast<const T *>(src_ptr), size);
+    device.memcpyDeviceToHost(&lr, static_cast<const T*>(src_ptr), size);
 
     src_ptr = GetBase(&beta1_dev);
-    device.memcpyDeviceToHost(&beta1, static_cast<const T *>(src_ptr), size);
+    device.memcpyDeviceToHost(&beta1, static_cast<const T*>(src_ptr), size);
 
     src_ptr = GetBase(&beta2_dev);
-    device.memcpyDeviceToHost(&beta2, static_cast<const T *>(src_ptr), size);
+    device.memcpyDeviceToHost(&beta2, static_cast<const T*>(src_ptr), size);
 
     src_ptr = GetBase(&epsilon_dev);
-    device.memcpyDeviceToHost(&epsilon, static_cast<const T *>(src_ptr), size);
-
+    device.memcpyDeviceToHost(&epsilon, static_cast<const T*>(src_ptr), size);
 
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power_dev.shape()),
                 errors::InvalidArgument("beta1_power is not a scalar: ",
@@ -2585,11 +2736,9 @@ class ApplyAdamOp < SYCLDevice, T> : public OpKernel {
                                 var.shape().DebugString(), " ",
                                 grad.shape().DebugString()));
 
-    functor::ApplyAdamSYCL<T>()(device, var.flat<T>(), m.flat<T>(),
-                                    v.flat<T>(), beta1_power,
-                                    beta2_power, lr,
-                                    beta1, beta2,
-                                    epsilon, grad.flat<T>());
+    functor::ApplyAdamSYCL<T>()(device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
+                                beta1_power, beta2_power, lr, beta1, beta2,
+                                epsilon, grad.flat<T>());
 
     MaybeForwardRefInputToRefOutput(ctx, 0, 0);
   }
@@ -2597,7 +2746,7 @@ class ApplyAdamOp < SYCLDevice, T> : public OpKernel {
  private:
   bool use_exclusive_lock_;
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 using CPUDevice = Eigen::ThreadPoolDevice;
 using GPUDevice = Eigen::GpuDevice;
@@ -2679,15 +2828,15 @@ class ApplyRMSPropOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, ms.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(1)));
+            "Attempting to use uninitialized variables: ", requested_input(1)));
     OP_REQUIRES(
         ctx, mom.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(2)));
+            "Attempting to use uninitialized variables: ", requested_input(2)));
 
     const Tensor& lr = ctx->input(3);
     const Tensor& rho = ctx->input(4);
@@ -2764,19 +2913,19 @@ class ApplyCenteredRMSPropOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, mg.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(1)));
+            "Attempting to use uninitialized variables: ", requested_input(1)));
     OP_REQUIRES(
         ctx, ms.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(2)));
+            "Attempting to use uninitialized variables: ", requested_input(2)));
     OP_REQUIRES(
         ctx, mom.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(3)));
+            "Attempting to use uninitialized variables: ", requested_input(3)));
 
     const Tensor& lr = ctx->input(4);
     const Tensor& rho = ctx->input(5);
@@ -2922,15 +3071,15 @@ class SparseApplyRMSPropOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, ms.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(1)));
+            "Attempting to use uninitialized variables: ", requested_input(1)));
     OP_REQUIRES(
         ctx, mom.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(2)));
+            "Attempting to use uninitialized variables: ", requested_input(2)));
 
     const Tensor& lr = ctx->input(3);
     const Tensor& rho = ctx->input(4);
@@ -3054,15 +3203,15 @@ class SparseApplyCenteredRMSPropOp : public OpKernel {
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(0)));
+            "Attempting to use uninitialized variables: ", requested_input(0)));
     OP_REQUIRES(
         ctx, ms.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(2)));
+            "Attempting to use uninitialized variables: ", requested_input(2)));
     OP_REQUIRES(
         ctx, mom.IsInitialized(),
         errors::FailedPrecondition(
-            "Attempting to use uninitialized variables: ", def().input(3)));
+            "Attempting to use uninitialized variables: ", requested_input(3)));
 
     const Tensor& lr = ctx->input(4);
     const Tensor& rho = ctx->input(5);
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index 0a3c5d361ed688fb5ca5723344ea957b0383b1df..8b0cde19aadbb3039ce09d440237dd16625815bd 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -113,6 +113,19 @@ struct ApplyFtrl {
                   typename TTypes<T>::ConstScalar lr_power);
 };
 
+template <typename Device, typename T>
+struct ApplyFtrlV2 {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat accum,
+                  typename TTypes<T>::Flat linear,
+                  typename TTypes<T>::ConstFlat grad,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar l1,
+                  typename TTypes<T>::ConstScalar l2,
+                  typename TTypes<T>::ConstScalar l2_shrinkage,
+                  typename TTypes<T>::ConstScalar lr_power);
+};
+
 template <typename Device, typename T>
 struct ApplyMomentum {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
diff --git a/tensorflow/core/kernels/transpose_functor.h b/tensorflow/core/kernels/transpose_functor.h
index 124cf14dd2c9912205dfc8ef06054c044ecff7f8..498030fdfee0408039f81e121d139f2f271337df 100644
--- a/tensorflow/core/kernels/transpose_functor.h
+++ b/tensorflow/core/kernels/transpose_functor.h
@@ -38,18 +38,6 @@ namespace internal {
 typedef gtl::InlinedVector<int64, 8> TransposeDimsVec;
 typedef gtl::InlinedVector<int32, 8> TransposePermsVec;
 
-// Helper to compute 'strides' given a tensor 'shape'. I.e.,
-// strides[i] = prod(shape.dim_size[(i+1):])
-template <typename Index>
-void ComputeStride(const TensorShape& shape, Index* strides) {
-  const int ndims = shape.dims();
-  Index stride = 1;
-  for (int i = ndims - 1; i >= 0; --i) {
-    strides[i] = stride;
-    stride *= static_cast<Index>(shape.dim_size(i));
-  }
-}
-
 // Helper function that takes a tensor shape, a permutation, combines the
 // neighboring shapes if their indices in the permutation are consecutive.
 // The function outputs the combined shape and new permutation.
@@ -130,8 +118,25 @@ void TransposeSimple(const Device& d, const Tensor& in,
 // Uses Eigen to transpose.
 template <typename Device, typename T, int NDIMS>
 void TransposeUsingEigen(const Device& d, const Tensor& in,
-                         const gtl::ArraySlice<int32> perm, Tensor* out);
+                         const gtl::ArraySlice<int32> perm, Tensor* out) {
+  Eigen::array<int, NDIMS> p;
+  for (int i = 0; i < NDIMS; ++i) p[i] = perm[i];
+  auto x = typename TTypes<T, NDIMS>::ConstTensor(
+      reinterpret_cast<const T*>(in.tensor_data().data()),
+      in.shape().AsEigenDSizes<NDIMS>());
+  auto y = typename TTypes<T, NDIMS>::Tensor(
+      reinterpret_cast<T*>(const_cast<char*>(out->tensor_data().data())),
+      out->shape().AsEigenDSizes<NDIMS>());
+  y.device(d) = x.shuffle(p);
+}
 
+
+#ifdef TENSORFLOW_USE_SYCL
+// For SYCL lets always go through Eigen
+template <typename Device, typename T>
+void TransposeSYCL(const Device& d, const Tensor& in,
+                   const gtl::ArraySlice<int32> perm, Tensor* out);
+#endif // TENSORFLOW_USE_SYCL
 }  // namespace internal
 
 template <typename Device, typename T>
diff --git a/tensorflow/core/kernels/transpose_functor_cpu.cc b/tensorflow/core/kernels/transpose_functor_cpu.cc
index 248c11976e70c4adea11f74d4c634cbb02f75115..a004cb2293cf05b105edd54f80298d7d4363b953 100644
--- a/tensorflow/core/kernels/transpose_functor_cpu.cc
+++ b/tensorflow/core/kernels/transpose_functor_cpu.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "tensorflow/core/kernels/transpose_functor.h"
+#include "tensorflow/core/kernels/ops_util.h"
 
 namespace tensorflow {
 namespace internal {
@@ -24,10 +25,8 @@ template <typename Device, typename T>
 void TransposeSimple(const Device& d, const Tensor& in,
                      const gtl::ArraySlice<int32> perm, Tensor* out) {
   const int ndims = in.dims();
-  gtl::InlinedVector<int64, 8> in_strides(ndims);
-  ComputeStride(in.shape(), in_strides.data());
-  gtl::InlinedVector<int64, 8> out_strides(ndims);
-  ComputeStride(out->shape(), out_strides.data());
+  gtl::InlinedVector<int64, 8> in_strides = ComputeStride<int64>(in.shape());
+  gtl::InlinedVector<int64, 8> out_strides = ComputeStride<int64>(out->shape());
   const int64 nelem = in.NumElements();
   const T* p = reinterpret_cast<const T*>(in.tensor_data().data());
   T* q = reinterpret_cast<T*>(const_cast<char*>((out->tensor_data().data())));
@@ -45,20 +44,6 @@ void TransposeSimple(const Device& d, const Tensor& in,
   }
 }
 
-template <typename Device, typename T, int NDIMS>
-void TransposeUsingEigen(const Device& d, const Tensor& in,
-                         const gtl::ArraySlice<int32> perm, Tensor* out) {
-  Eigen::array<int, NDIMS> p;
-  for (int i = 0; i < NDIMS; ++i) p[i] = perm[i];
-  auto x = typename TTypes<T, NDIMS>::ConstTensor(
-      reinterpret_cast<const T*>(in.tensor_data().data()),
-      in.shape().AsEigenDSizes<NDIMS>());
-  auto y = typename TTypes<T, NDIMS>::Tensor(
-      reinterpret_cast<T*>(const_cast<char*>(out->tensor_data().data())),
-      out->shape().AsEigenDSizes<NDIMS>());
-  y.device(d) = x.shuffle(p);
-}
-
 }  // end namespace internal
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -182,7 +167,35 @@ template <typename T>
 struct Transpose<SYCLDevice, T> {
   static void run(const SYCLDevice& d, const Tensor& in,
                   const gtl::ArraySlice<int32> perm, Tensor* out) {
-    // Should add a specialized implementation for SYCLDevice here.
+    switch (in.dims()) {
+      case 1:
+        internal::TransposeUsingEigen<SYCLDevice, T, 1>(d, in, perm, out);
+        break;
+      case 2:
+        internal::TransposeUsingEigen<SYCLDevice, T, 2>(d, in, perm, out);
+        break;
+      case 3:
+        internal::TransposeUsingEigen<SYCLDevice, T, 3>(d, in, perm, out);
+        break;
+      case 4:
+        internal::TransposeUsingEigen<SYCLDevice, T, 4>(d, in, perm, out);
+        break;
+      case 5:
+        internal::TransposeUsingEigen<SYCLDevice, T, 5>(d, in, perm, out);
+        break;
+      case 6:
+        internal::TransposeUsingEigen<SYCLDevice, T, 6>(d, in, perm, out);
+        break;
+      case 7:
+        internal::TransposeUsingEigen<SYCLDevice, T, 7>(d, in, perm, out);
+        break;
+      case 8:
+        internal::TransposeUsingEigen<SYCLDevice, T, 8>(d, in, perm, out);
+        break;
+      default:
+        LOG(FATAL) << "Unsupported TransposeUsingEigen for: " << in.dims();
+        break;
+    }
   }
 };
 
diff --git a/tensorflow/core/kernels/transpose_functor_gpu.cu.cc b/tensorflow/core/kernels/transpose_functor_gpu.cu.cc
index bc72bfb2fdd0a4140695ea51a9102156cbf67d69..a118cc80c969409e97b22f158af21c5940bf4ecc 100644
--- a/tensorflow/core/kernels/transpose_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/transpose_functor_gpu.cu.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/kernels/transpose_functor.h"
+#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
 // TODO(yangzihao): Remove the dependency of conv_2d.h once we move all
@@ -53,13 +54,13 @@ void TransposeSimple(const Device& d, const Tensor& in,
   CHECK_LT(nelem, kint32max) << "Tensor too large to transpose on GPU";
   // Pack strides and permutation into one buffer.
   const int32 ndims = in.dims();
-  gtl::InlinedVector<int32, 16> host_buf(ndims * 3);
-  // Input strides.
-  ComputeStride(in.shape(), &host_buf[0]);
-  // Output strides.
-  ComputeStride(out->shape(), &host_buf[ndims]);
+  gtl::InlinedVector<int32, 24> host_buf(ndims * 3);
+  gtl::InlinedVector<int32, 8> in_strides = ComputeStride<int32>(in.shape());
+  gtl::InlinedVector<int32, 8> out_strides = ComputeStride<int32>(out->shape());
   // Dimension permutation.
   for (int i = 0; i < ndims; ++i) {
+    host_buf[i] = in_strides[i];
+    host_buf[ndims + i] = out_strides[i];
     host_buf[ndims * 2 + i] = perm[i];
   }
   // Copies the input strides, output strides and permutation to the device.
@@ -79,20 +80,6 @@ void TransposeSimple(const Device& d, const Tensor& in,
   d.deallocate(dev_buf);
 }
 
-template <typename Device, typename T, int NDIMS>
-void TransposeUsingEigen(const Device& d, const Tensor& in,
-                         const gtl::ArraySlice<int32> perm, Tensor* out) {
-  Eigen::array<int, NDIMS> p;
-  for (int i = 0; i < NDIMS; ++i) p[i] = perm[i];
-  auto x = typename TTypes<T, NDIMS>::ConstTensor(
-      reinterpret_cast<const T*>(in.tensor_data().data()),
-      in.shape().AsEigenDSizes<NDIMS>());
-  auto y = typename TTypes<T, NDIMS>::Tensor(
-      reinterpret_cast<T*>(const_cast<char*>(out->tensor_data().data())),
-      out->shape().AsEigenDSizes<NDIMS>());
-  y.device(d) = x.shuffle(p);
-}
-
 // TransposeUsingTile tries to reduce the dimension of the input tensor to 3 and
 // then call special kernels to swap either dimension 1 and dimension 2 or
 // dimension 0 and dimension 2. It returns true if the operation is success,
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index 75ed76a6979ed5f3ccf68455b76b6fe737278f05..d3305fb83a77ed329ee24fd40b893e7f52b7b848 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -233,10 +233,7 @@ Status TransposeSyclOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
                               .TypeConstraint<int32>("Tperm") \
                               .HostMemory("perm"),            \
                           TransposeSyclOp);
-REGISTER(float);
-REGISTER(bool);
-REGISTER(int32);
+TF_CALL_POD_TYPES(REGISTER);
 #undef REGISTER
 #endif
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index b57e13a28c39bccd050c1415c1acedab180909a8..6e51696d6f45cc032b47a82fe2eeea2a50de32f0 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <unordered_map>
 #include <utility>
 
 #include "tensorflow/core/framework/op_kernel.h"
@@ -20,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace tensorflow {
 
@@ -50,7 +50,8 @@ class UniqueOp : public OpKernel {
                                 {0}, 1, input.shape(), &idx));
     auto idx_vec = idx->template vec<int32>();
 
-    gtl::FlatMap<T, int32> uniq(N);
+    std::unordered_map<T, int32> uniq;
+    uniq.reserve(2 * N);
     for (int64 i = 0, j = 0; i < N; ++i) {
       auto it = uniq.insert(std::make_pair(Tin(i), j));
       idx_vec(i) = it.first->second;
diff --git a/tensorflow/core/kernels/unique_op_test.cc b/tensorflow/core/kernels/unique_op_test.cc
index 0dc9066273f28225b20fac5b127d8995f6cf9ee2..0273f08090b20764bf60b6076ea3c14c059ab91e 100644
--- a/tensorflow/core/kernels/unique_op_test.cc
+++ b/tensorflow/core/kernels/unique_op_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/graph/node_builder.h"
@@ -34,12 +35,39 @@ namespace {
 
 const int kMaxStrLen = 40;
 
-static void BM_Unique_INT32(int iters, int dim) {
+TensorProto GetRandomInt32TensorProto(int dim, int max_int) {
+  TensorProto tensor_proto;
+  tensor_proto.set_dtype(DT_INT32);
+  tensor_proto.mutable_tensor_shape()->add_dim()->set_size(dim);
+  tensor_proto.mutable_tensor_shape()->set_unknown_rank(false);
+  for (int i = 0; i < dim; ++i) {
+    const int int_val = std::rand() % max_int;
+    tensor_proto.add_int_val(int_val);
+  }
+  return tensor_proto;
+}
+
+TensorProto GetRandomInt32TensorProtoWithRepeat(int dim, int repeat,
+                                                int max_int) {
+  TensorProto tensor_proto;
+  tensor_proto.set_dtype(DT_INT32);
+  tensor_proto.mutable_tensor_shape()->add_dim()->set_size(dim);
+  tensor_proto.mutable_tensor_shape()->set_unknown_rank(false);
+  for (int i = 0; i < dim; ++i) {
+    const int int_val = std::rand() % max_int;
+    for (int j = 0; j < repeat; ++j) {
+      tensor_proto.add_int_val(int_val);
+    }
+  }
+  return tensor_proto;
+}
+
+static void BM_Unique_INT32(int iters, int dim, int max_int) {
   testing::StopTiming();
   Graph* g = new Graph(OpRegistry::Global());
 
   Tensor input(DT_INT32, TensorShape({dim}));
-  input.flat<int32>().setRandom();
+  CHECK(input.FromProto(GetRandomInt32TensorProto(dim, max_int)));
 
   Node* node;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Unique")
@@ -53,6 +81,27 @@ static void BM_Unique_INT32(int iters, int dim) {
   test::Benchmark("cpu", g).Run(iters);
 }
 
+static void BM_Unique_INT32_Repeat(int iters, int dim, int max_int) {
+  testing::StopTiming();
+  Graph* g = new Graph(OpRegistry::Global());
+
+  Tensor input(DT_INT32, TensorShape({dim * 200}));
+  CHECK(
+      input.FromProto(GetRandomInt32TensorProtoWithRepeat(dim, 200, max_int)));
+
+  Node* node;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Unique")
+                  .Input(test::graph::Constant(g, input))
+                  .Attr("T", DT_INT32)
+                  .Finalize(g, &node));
+
+  testing::BytesProcessed(static_cast<int64>(iters) * dim * 200 *
+                          sizeof(int32));
+  testing::UseRealTime();
+  testing::StartTiming();
+  test::Benchmark("cpu", g).Run(iters);
+}
+
 TensorProto GetRandomStringsTensorProto(int dim, int max_str_len) {
   TensorProto tensor_proto;
   tensor_proto.set_dtype(DT_STRING);
@@ -90,13 +139,46 @@ static void BM_Unique_STRING(int iters, int dim) {
 }
 
 BENCHMARK(BM_Unique_INT32)
-    ->Arg(32)
-    ->Arg(256)
-    ->Arg(1024)
-    ->Arg(4 * 1024)
-    ->Arg(16 * 1024)
-    ->Arg(64 * 1024)
-    ->Arg(256 * 1024);
+    ->ArgPair(32, 1024 * 1024)
+    ->ArgPair(256, 1024 * 1024)
+    ->ArgPair(1024, 1024 * 1024)
+    ->ArgPair(4 * 1024, 1024 * 1024)
+    ->ArgPair(16 * 1024, 1024 * 1024)
+    ->ArgPair(64 * 1024, 1024 * 1024)
+    ->ArgPair(1024 * 1024, 1024 * 1024)
+    ->ArgPair(4 * 1024 * 1024, 1024 * 1024)
+    ->ArgPair(32, 64 * 1024 * 1024)
+    ->ArgPair(256, 64 * 1024 * 1024)
+    ->ArgPair(1024, 64 * 1024 * 1024)
+    ->ArgPair(4 * 1024, 64 * 1024 * 1024)
+    ->ArgPair(16 * 1024, 64 * 1024 * 1024)
+    ->ArgPair(64 * 1024, 64 * 1024 * 1024)
+    ->ArgPair(1024 * 1024, 64 * 1024 * 1024)
+    ->ArgPair(4 * 1024 * 1024, 64 * 1024 * 1024);
+
+BENCHMARK(BM_Unique_INT32_Repeat)
+    ->ArgPair(32, 1024 * 1024)
+    ->ArgPair(256, 1024 * 1024)
+    ->ArgPair(1024, 1024 * 1024)
+    ->ArgPair(4 * 1024, 1024 * 1024)
+    ->ArgPair(16 * 1024, 1024 * 1024)
+    ->ArgPair(64 * 1024, 1024 * 1024)
+    ->ArgPair(1024 * 1024, 1024 * 1024)
+    ->ArgPair(4 * 1024 * 1024, 1024 * 1024)
+    ->ArgPair(32, 32 * 1024 * 1024)
+    ->ArgPair(256, 32 * 1024 * 1024)
+    ->ArgPair(1024, 32 * 1024 * 1024)
+    ->ArgPair(4 * 1024, 32 * 1024 * 1024)
+    ->ArgPair(16 * 1024, 32 * 1024 * 1024)
+    ->ArgPair(64 * 1024, 32 * 1024 * 1024)
+    ->ArgPair(1024 * 1024, 32 * 1024 * 1024)
+    ->ArgPair(32, 64 * 1024 * 1024)
+    ->ArgPair(256, 64 * 1024 * 1024)
+    ->ArgPair(1024, 64 * 1024 * 1024)
+    ->ArgPair(4 * 1024, 64 * 1024 * 1024)
+    ->ArgPair(16 * 1024, 64 * 1024 * 1024)
+    ->ArgPair(64 * 1024, 64 * 1024 * 1024)
+    ->ArgPair(1024 * 1024, 64 * 1024 * 1024);
 
 BENCHMARK(BM_Unique_STRING)
     ->Arg(32)
diff --git a/tensorflow/core/kernels/warn_about_ints.cc b/tensorflow/core/kernels/warn_about_ints.cc
index fd0a889c99df47454a5eff1acd646b070d3a4280..75ecdf2ae4b6581e77b8c4813851671bf8fcbe71 100644
--- a/tensorflow/core/kernels/warn_about_ints.cc
+++ b/tensorflow/core/kernels/warn_about_ints.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/warn_about_ints.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/where_op.cc b/tensorflow/core/kernels/where_op.cc
index e56a49884554873402add58568c20812359af36c..59b474e41cf7c308ffe69f730ddd9e2d7e24a658 100644
--- a/tensorflow/core/kernels/where_op.cc
+++ b/tensorflow/core/kernels/where_op.cc
@@ -17,9 +17,14 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
+
 #include "tensorflow/core/kernels/where_op.h"
 
 #include <memory>
+#include <numeric>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -27,43 +32,116 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+#include "tensorflow/core/kernels/cuda_solvers.h"
+#include "tensorflow/core/platform/cuda.h"
+
+using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
+#endif  // GOOGLE_CUDA
+
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-template <typename Device>
-class WhereOp : public OpKernel {
+namespace functor {
+
+template <>
+struct NumTrue<CPUDevice, int64> {
+  static Status Compute(OpKernelContext* ctx, const CPUDevice& d,
+                        TTypes<bool>::ConstFlat input,
+                        TTypes<int64>::Scalar num_true) {
+    *num_true.data() =
+        std::accumulate(input.data(), input.data() + input.size(), 0);
+    return Status::OK();
+  }
+};
+
+template <int DIMS, typename TIndex>
+struct Where<CPUDevice, DIMS, TIndex> {
+  EIGEN_ALWAYS_INLINE static void WriteIndexRowMajor(
+      typename TTypes<int64>::Matrix output,
+      const typename Eigen::DSizes<TIndex, DIMS>& strides, TIndex true_n,
+      TIndex index) {
+    for (int i = 0; i < DIMS; ++i) {
+      output(true_n, i) = index / strides[i];
+      index -= output(true_n, i) * strides[i];
+    }
+  }
+
+  EIGEN_ALWAYS_INLINE static Status Compute(
+      OpKernelContext* ctx, const CPUDevice& d,
+      typename TTypes<bool, DIMS>::ConstTensor input,
+      typename TTypes<int64>::Matrix output, TIndex* found_true) {
+    Eigen::DSizes<Eigen::DenseIndex, DIMS> dims = input.dimensions();
+    Eigen::DSizes<TIndex, DIMS> strides;
+
+    EIGEN_STATIC_ASSERT((static_cast<int>(decltype(input)::Layout) ==
+                         static_cast<int>(Eigen::RowMajor)),
+                        INTERNAL_ERROR_INPUT_SHOULD_BE_ROWMAJOR);
+
+    strides[DIMS - 1] = 1;
+    for (int i = DIMS - 2; i >= 0; --i) {
+      strides[i] = strides[i + 1] * dims[i + 1];
+    }
+
+    Eigen::DenseIndex output_size = output.dimension(0);
+    for (Eigen::DenseIndex n = 0; n < input.size(); ++n) {
+      if (input.data()[n]) {
+        if (FastBoundsCheck(*found_true, output_size)) {
+          WriteIndexRowMajor(output, strides, *found_true, n);
+        }
+        ++*found_true;
+      }
+    }
+    return Status::OK();
+  }
+};
+
+}  // namespace functor
+
+class WhereCPUOp : public OpKernel {
  public:
-  explicit WhereOp(OpKernelConstruction* context) : OpKernel(context) {}
+  explicit WhereCPUOp(OpKernelConstruction* context) : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);
 
     const int input_dims = input.dims();
+
     Tensor num_true;
     OP_REQUIRES_OK(
         context, context->allocate_temp(DT_INT64, TensorShape({}), &num_true));
     auto num_true_t = num_true.scalar<int64>();
 
-    functor::NumTrue<Device>::Compute(context->eigen_device<Device>(),
-                                      input.flat<bool>(), num_true_t);
+    Status s = functor::NumTrue<CPUDevice, int64>::Compute(
+        context, context->eigen_device<CPUDevice>(), input.flat<bool>(),
+        num_true_t);
+    OP_REQUIRES_OK(context, s);
     TensorShape output_shape({num_true_t(), input_dims});
     Tensor* output = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
 
-#define HANDLE_DIM(NDIM)                                             \
-  case NDIM:                                                         \
-    found_true = functor::Where<Device, NDIM>::Compute(              \
-        context->eigen_device<Device>(), input.tensor<bool, NDIM>(), \
-        output->matrix<int64>());                                    \
-    break;
-
+    // TODO(ebrevdo): Replace single-threaded copy with a
+    // multithreaded block copy by getting block counts above instead
+    // of a global NumTrue, then having each block filled in in
+    // separate threads below.
     int64 found_true = 0;
+
+#define HANDLE_DIM(NDIM)                                                   \
+  case NDIM: {                                                             \
+    Status s = functor::Where<CPUDevice, NDIM, int64>::Compute(            \
+        context, context->eigen_device<CPUDevice>(),                       \
+        input.tensor<bool, NDIM>(), output->matrix<int64>(), &found_true); \
+    OP_REQUIRES_OK(context, s);                                            \
+  } break;
+
     switch (input_dims) {
       HANDLE_DIM(1);
       HANDLE_DIM(2);
@@ -79,7 +157,7 @@ class WhereOp : public OpKernel {
 #undef HANDLE_DIM
 
     OP_REQUIRES(
-        context, num_true_t() == found_true,
+        context, found_true == num_true_t(),
         errors::InvalidArgument(
             "WhereOp: Race condition between counting the number of true "
             "elements and writing them.  When counting, saw ",
@@ -88,12 +166,166 @@ class WhereOp : public OpKernel {
   }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(WhereOp);
+  TF_DISALLOW_COPY_AND_ASSIGN(WhereCPUOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("Where").Device(DEVICE_CPU), WhereCPUOp);
+
+#if GOOGLE_CUDA
+
+namespace functor {
+
+#define DECLARE_GPU_NUMTRUE(Tindex)                                            \
+  template <>                                                                  \
+  Status NumTrue<GPUDevice, Tindex>::Compute(                                  \
+      OpKernelContext* ctx, const GPUDevice& d, TTypes<bool>::ConstFlat input, \
+      TTypes<Tindex>::Scalar num_true);                                        \
+  extern template struct NumTrue<GPUDevice, Tindex>
+
+DECLARE_GPU_NUMTRUE(int32);
+DECLARE_GPU_NUMTRUE(int64);
+#undef DECLARE_GPU_NUMTRUE
+
+#define DECLARE_GPU_WHERE_INDEX(Dims, Tindex)                     \
+  template <>                                                     \
+  Status Where<GPUDevice, Dims, Tindex>::Compute(                 \
+      OpKernelContext* ctx, const GPUDevice& d,                   \
+      typename TTypes<bool, Dims>::ConstTensor input,             \
+      typename TTypes<int64>::Matrix output, Tindex* found_true); \
+  extern template struct Where<GPUDevice, Dims, Tindex>;
+#define DECLARE_GPU_WHERE(Dims)         \
+  DECLARE_GPU_WHERE_INDEX(Dims, int32); \
+  DECLARE_GPU_WHERE_INDEX(Dims, int64);
+
+DECLARE_GPU_WHERE(1);
+DECLARE_GPU_WHERE(2);
+DECLARE_GPU_WHERE(3);
+DECLARE_GPU_WHERE(4);
+DECLARE_GPU_WHERE(5);
+#undef DECLARE_GPU_WHERE
+#undef DECLARE_GPU_WHERE_INDEX
+
+}  // namespace functor
+
+class WhereGPUOp : public AsyncOpKernel {
+ public:
+  explicit WhereGPUOp(OpKernelConstruction* context) : AsyncOpKernel(context) {}
+
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
+    const Tensor& input = context->input(0);
+    const int input_dims = input.dims();
+
+    if (input.NumElements() < std::numeric_limits<int32>::max()) {
+      ComputeAsyncType<int32>(input, input_dims, context, done);
+    } else {
+      ComputeAsyncType<int64>(input, input_dims, context, done);
+    }
+  }
+
+  template <typename Tindex>
+  void ComputeAsyncType(const Tensor& input, const int input_dims,
+                        OpKernelContext* context, DoneCallback done) {
+    // Step 0: alloc nnz
+    // Step 1: call nnz kernel
+    // Step 2: copy nnz to host
+    // Step 3: call create_output
+    // Step 4: call where kernel
+    Tensor num_true;
+    OP_REQUIRES_OK_ASYNC(context,
+                         context->allocate_temp(DataTypeToEnum<Tindex>::v(),
+                                                TensorShape({}), &num_true),
+                         done);
+
+    auto num_true_t = num_true.scalar<Tindex>();
+
+    perftools::gputools::DeviceMemoryBase num_true_ptr(
+        static_cast<void*>(num_true_t.data()));
+    // Push kernel to stream to get number of true elements.
+    const GPUDevice& d = context->eigen_device<GPUDevice>();
+    Status s = functor::NumTrue<GPUDevice, Tindex>::Compute(
+        context, d, input.flat<bool>(), num_true_t);
+    OP_REQUIRES_OK_ASYNC(context, s, done);
+
+    // Copy num_true to host;
+    ScratchSpace<Tindex> num_true_host(context, 1, /* on_host */ true);
+
+    auto stream = context->op_device_context()->stream();
+    OP_REQUIRES_ASYNC(
+        context,
+        stream
+            ->ThenMemcpy(num_true_host.mutable_data(), num_true_ptr,
+                         sizeof(Tindex))
+            .ok(),
+        errors::Internal("WhereOp: failed to copy num_true from device"), done);
+
+    auto create_and_check_output = [context, &d, &input, input_dims,
+                                    num_true_host, done]() {
+      // Ensure that within the callback, the proper GPU settings are
+      // configured.
+      auto stream = context->op_device_context()->stream();
+      ScopedActivateExecutorContext scoped_activation{stream->parent()};
+
+      Tindex num_true = *num_true_host.data();
+
+      // TODO(ebrevdo): Properly copy back found_true value to CPU for
+      // validation checking.  Currently Where<GPUDevice>::Compute()
+      // does not perform this copy back to CPU.
+      Tindex found_true = -1;
+
+      // Step 1: Allocate the output and perform the selection/copy.
+      Tensor* output;
+      OP_REQUIRES_OK_ASYNC(context,
+                           context->allocate_output(
+                               0, TensorShape({num_true, input_dims}), &output),
+                           done);
+
+#define HANDLE_DIM(NDIM)                                                 \
+  case NDIM: {                                                           \
+    Status s = functor::Where<GPUDevice, NDIM, Tindex>::Compute(         \
+        context, d, input.tensor<bool, NDIM>(), output->matrix<int64>(), \
+        &found_true);                                                    \
+    OP_REQUIRES_OK_ASYNC(context, s, done);                              \
+  } break;
+
+      switch (input_dims) {
+        HANDLE_DIM(1);
+        HANDLE_DIM(2);
+        HANDLE_DIM(3);
+        HANDLE_DIM(4);
+        HANDLE_DIM(5);
+
+        default:
+          OP_REQUIRES_ASYNC(
+              context, false,
+              errors::InvalidArgument("WhereOp: Unhandled input dimensions: ",
+                                      input_dims),
+              done);
+      }
+#undef HANDLE_DIM
+
+      // TODO(ebrevdo): Fix the copy back to host.
+
+      // OP_REQUIRES_ASYNC(
+      //     context, found_true == num_true,
+      //     errors::InvalidArgument(
+      //         "WhereOp: Race condition between counting the number of true "
+      //         "elements and writing them.  When counting, saw ",
+      //         num_true, " elements; but when writing their indices, saw ",
+      //         found_true, " elements."),
+      //     done);
+
+      done();
+    };
+    context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
+        stream, create_and_check_output);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(WhereGPUOp);
 };
 
-#define REGISTER_WHERE() \
-  REGISTER_KERNEL_BUILDER(Name("Where").Device(DEVICE_CPU), WhereOp<CPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("Where").Device(DEVICE_GPU), WhereGPUOp);
 
-REGISTER_WHERE();
+#endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/where_op.h b/tensorflow/core/kernels/where_op.h
index aa2712371463c82b08106f5ae3039bba68d2be84..e040325e3de26fd9b1ba05eecdd9340a92f054e3 100644
--- a/tensorflow/core/kernels/where_op.h
+++ b/tensorflow/core/kernels/where_op.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_KERNELS_WHERE_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -25,55 +26,25 @@ namespace tensorflow {
 
 namespace functor {
 
-template <typename Device>
+template <typename Device, typename TIndex>
 struct NumTrue {
-  EIGEN_ALWAYS_INLINE static void Compute(
-      const Device& d, typename TTypes<bool>::ConstFlat input,
-      TTypes<int64>::Scalar num_true) {
-    num_true.device(d) = input.template cast<int64>().sum();
-  }
+  EIGEN_ALWAYS_INLINE static Status Compute(
+      OpKernelContext* ctx, const Device& d, TTypes<bool>::ConstFlat input,
+      typename TTypes<TIndex>::Scalar num_true);
 };
 
-template <typename Device, int NDIM>
+template <typename Device, int NDIM, typename TIndex>
 struct Where {
-  EIGEN_ALWAYS_INLINE static int64 Compute(
-      const Device& d, typename TTypes<bool, NDIM>::ConstTensor input,
-      typename TTypes<int64>::Matrix output) {
-    Eigen::DenseIndex true_n = 0;
-    Eigen::DSizes<Eigen::DenseIndex, NDIM> dims = input.dimensions();
-    Eigen::DSizes<Eigen::DenseIndex, NDIM> strides;
-
-    // Calculate strides for RowMajor order.
-    EIGEN_STATIC_ASSERT((static_cast<int>(decltype(input)::Layout) ==
-                         static_cast<int>(Eigen::RowMajor)),
-                        INTERNAL_ERROR_INPUT_SHOULD_BE_ROWMAJOR);
-
-    strides[NDIM - 1] = 1;
-    for (int i = NDIM - 2; i >= 0; --i) {
-      strides[i] = strides[i + 1] * dims[i + 1];
-    }
-
-    Eigen::DenseIndex output_size = output.dimension(0);
-    for (Eigen::DenseIndex n = 0; n < input.size(); ++n) {
-      if (input.data()[n]) {
-        if (TF_PREDICT_TRUE(true_n < output_size)) {
-          WriteIndexRowMajor(output, strides, true_n, n);
-        }
-        ++true_n;
-      }
-    }
-    return true_n;
-  }
-
-  EIGEN_ALWAYS_INLINE static void WriteIndexRowMajor(
-      typename TTypes<int64>::Matrix output,
-      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& strides,
-      Eigen::DenseIndex true_n, Eigen::DenseIndex index) {
-    for (int i = 0; i < NDIM; ++i) {
-      output(true_n, i) = index / strides[i];
-      index %= strides[i];
-    }
-  }
+  // Copies indices of true values in input into output.  The pointer
+  // found_true should sit on the host.  Compute should copy the
+  // number of true elements found into it.  At the end, if
+  //   *found_true != output.dimension(0),
+  // then the input may have changed between the initial counting of
+  // the true values and the call to Where.
+  EIGEN_ALWAYS_INLINE static Status Compute(
+      OpKernelContext* ctx, const Device& d,
+      typename TTypes<bool, NDIM>::ConstTensor input,
+      typename TTypes<int64>::Matrix output, TIndex* found_true);
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/where_op_gpu.cu.cc b/tensorflow/core/kernels/where_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c7c54ccbb493f02cad979011747fd12d33fc0f02
--- /dev/null
+++ b/tensorflow/core/kernels/where_op_gpu.cu.cc
@@ -0,0 +1,256 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "external/cub_archive/cub/device/device_reduce.cuh"
+#include "external/cub_archive/cub/device/device_select.cuh"
+#include "external/cub_archive/cub/iterator/counting_input_iterator.cuh"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/where_op.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+template <int NDIM, typename TIndex>
+__global__ void PropagateWhereIndicesKernel(
+    const TIndex output_rows, const typename Eigen::array<TIndex, NDIM> strides,
+    int64* output) {
+  // TODO(ebrevdo): Use a multi-dimensional loop, increasing the
+  // dimensions of individual indices manually, instead of relying on
+  // a scalar loop variable and using integer division.
+  CUDA_1D_KERNEL_LOOP(i, output_rows) {
+    TIndex index_value = ldg(output + NDIM * i);
+#pragma unroll
+    for (int c = 0; c < NDIM; ++c) {
+      *(output + NDIM * i + c) = index_value / strides[c];
+      index_value %= strides[c];
+    }
+  }
+}
+
+template <typename TIndex>
+struct NumTrue<GPUDevice, TIndex> {
+  EIGEN_ALWAYS_INLINE static Status Compute(
+      OpKernelContext* ctx, const GPUDevice& d, TTypes<bool>::ConstFlat input,
+      typename TTypes<TIndex>::Scalar num_true) {
+    const cudaStream_t& cu_stream = GetCudaStream(ctx);
+
+    std::size_t temp_storage_bytes = 0;
+    const bool* input_data = input.data();
+    TIndex* num_true_data = num_true.data();
+
+    auto first_success =
+        cub::DeviceReduce::Sum(/*temp_storage*/ nullptr, temp_storage_bytes,
+                               /*d_in*/ input_data,
+                               /*d_out*/ num_true_data,
+                               /*num_items*/ input.size(),
+                               /*stream*/ cu_stream);
+
+    if (first_success != cudaSuccess) {
+      return errors::Internal(
+          "WhereOp: Could not launch cub::DeviceReduce::Sum to calculate "
+          "temp_storage_bytes, status: ",
+          cudaGetErrorString(first_success));
+    }
+
+    Tensor temp_storage;
+    TF_RETURN_IF_ERROR(ctx->allocate_temp(
+        DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+        &temp_storage));
+
+    auto second_success = cub::DeviceReduce::Sum(
+        /*temp_storage*/ temp_storage.flat<int8>().data(), temp_storage_bytes,
+        /*d_in*/ input_data,
+        /*d_out*/ num_true_data,
+        /*num_items*/ input.size(),
+        /*stream*/ cu_stream);
+
+    if (second_success != cudaSuccess) {
+      return errors::Internal(
+          "WhereOp: Could not launch cub::DeviceReduce::Sum to count "
+          "number of true indices.  temp_storage_bytes: ",
+          temp_storage_bytes, ", status: ", cudaGetErrorString(second_success));
+    }
+
+    return Status::OK();
+  }
+};
+
+template struct NumTrue<GPUDevice, int32>;
+template struct NumTrue<GPUDevice, int64>;
+
+template <int NDIM>
+class WhereOutputIterator {
+ public:
+  // Required iterator traits
+  typedef WhereOutputIterator self_type;
+  typedef std::ptrdiff_t difference_type;
+  typedef void value_type;
+  typedef void pointer;
+  typedef int64& reference;
+
+#if (THRUST_VERSION >= 100700)
+  // Use Thrust's iterator categories so we can use these iterators in Thrust
+  // 1.7 (or newer) methods
+  typedef typename thrust::detail::iterator_facade_category<
+      thrust::device_system_tag, thrust::random_access_traversal_tag,
+      value_type,
+      reference>::type iterator_category;  ///< The iterator category
+#else
+  typedef std::random_access_iterator_tag
+      iterator_category;  ///< The iterator category
+#endif  // THRUST_VERSION
+
+  WhereOutputIterator(int64* ptr, const Eigen::DenseIndex max_row)
+      : ptr_(ptr), max_row_(max_row) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int64& operator[](int n) const {
+    // If the selection mechanism finds too many true values (because
+    // the input tensor changed between allocation of output and now),
+    // we may accidentally try to write past the allowable memory.  If
+    // valid is false, then we don't do this.  Instead, we'll read off
+    // the number of items found in Flagged()'s d_num_selected_out at
+    // the end and confirm that it matches the number of rows of output.
+    const bool valid = FastBoundsCheck(n, max_row_);
+    return *(ptr_ + (valid ? (NDIM * n) : 0));
+  }
+
+ private:
+  int64* ptr_;
+  const Eigen::DenseIndex max_row_;
+};
+
+template <typename TIndex, int NDIM>
+Eigen::array<TIndex, NDIM> CalculateStrides(
+    typename TTypes<bool, NDIM>::ConstTensor input) {
+  const Eigen::DSizes<Eigen::DenseIndex, NDIM> dims = input.dimensions();
+  Eigen::array<TIndex, NDIM> strides;
+  EIGEN_STATIC_ASSERT((static_cast<int>(decltype(input)::Layout) ==
+                       static_cast<int>(Eigen::RowMajor)),
+                      INTERNAL_ERROR_INPUT_SHOULD_BE_ROWMAJOR);
+  strides[NDIM - 1] = 1;
+  for (int i = NDIM - 2; i >= 0; --i) {
+    strides[i] = strides[i + 1] * dims[i + 1];
+  }
+  return strides;
+}
+
+template <int NDIM, typename Tindex>
+struct Where<GPUDevice, NDIM, Tindex> {
+  EIGEN_ALWAYS_INLINE static Status Compute(
+      OpKernelContext* ctx, const GPUDevice& d,
+      typename TTypes<bool, NDIM>::ConstTensor input,
+      typename TTypes<int64>::Matrix output, Tindex* found_true_host) {
+    if (output.dimension(0) == 0) {
+      // Nothing to do.
+      return Status::OK();
+    }
+
+    const cudaStream_t& cu_stream = GetCudaStream(ctx);
+
+    std::size_t temp_storage_bytes = 0;
+
+    cub::CountingInputIterator<Tindex> select_counter(0);
+
+    Tensor found_true_t;
+    TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum<Tindex>::v(),
+                                          TensorShape({}), &found_true_t));
+    Tindex* found_true_device = found_true_t.scalar<Tindex>().data();
+
+    WhereOutputIterator<NDIM> output_iterator(
+        output.data(),
+        /* max_row */ output.dimension(0));
+
+    auto first_success =
+        cub::DeviceSelect::Flagged(/*temp_storage*/ nullptr, temp_storage_bytes,
+                                   /*d_in*/ select_counter,
+                                   /*d_flags*/ input.data(),
+                                   /*d_out*/ output_iterator,
+                                   /*d_num_selected_out*/ found_true_device,
+                                   /*num_items*/ input.size(),
+                                   /*stream*/ cu_stream);
+    if (first_success != cudaSuccess) {
+      return errors::Internal(
+          "WhereOp: Could not launch cub::DeviceSelect::Flagged to calculate "
+          "temp_storage_bytes, status: ",
+          cudaGetErrorString(first_success));
+    }
+
+    Tensor temp_storage;
+    TF_RETURN_IF_ERROR(ctx->allocate_temp(
+        DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+        &temp_storage));
+
+    auto second_success = cub::DeviceSelect::Flagged(
+        /*temp_storage*/ temp_storage.flat<int8>().data(), temp_storage_bytes,
+        /*d_in*/ select_counter,
+        /*d_flags*/ input.data(),
+        /*d_out*/ output_iterator,
+        /*d_num_selected_out*/ found_true_device,
+        /*num_items*/ input.size(),
+        /*stream*/ cu_stream);
+
+    if (second_success != cudaSuccess) {
+      return errors::Internal(
+          "WhereOp: Could not launch cub::DeviceSelect::Flagged to copy "
+          "indices out, status: ",
+          cudaGetErrorString(second_success));
+    }
+
+    // TODO(ebrevdo): Find a way to synchronously copy back data from
+    // found_true_device to *found_true_host.
+
+    const Eigen::array<Tindex, NDIM> strides =
+        CalculateStrides<Tindex, NDIM>(input);
+    const Tindex output_rows = output.dimension(0);
+    CudaLaunchConfig config = GetCudaLaunchConfig(output_rows, d);
+    PropagateWhereIndicesKernel<NDIM, Tindex>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            output_rows, strides, output.data());
+
+    return Status::OK();
+  }
+};
+
+#define DECLARE_GPU_SPEC_INDEX(Dims, Tindex) \
+  template struct Where<GPUDevice, Dims, Tindex>
+#define DECLARE_GPU_SPEC(Dims)         \
+  DECLARE_GPU_SPEC_INDEX(Dims, int32); \
+  DECLARE_GPU_SPEC_INDEX(Dims, int64)
+
+DECLARE_GPU_SPEC(1);
+DECLARE_GPU_SPEC(2);
+DECLARE_GPU_SPEC(3);
+DECLARE_GPU_SPEC(4);
+DECLARE_GPU_SPEC(5);
+
+#undef DECLARE_GPU_SPEC
+#undef DECLARE_GPU_SPEC_INDEX
+
+}  // namespace functor
+
+}  // namespace tensorflow
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/whole_file_read_ops.cc b/tensorflow/core/kernels/whole_file_read_ops.cc
index 8f42bb28324ecc3f7d82ebbbcb3aebaff6b2c47a..17a39ce29b4ea88cdf99fafa2838ce7433ba0564 100644
--- a/tensorflow/core/kernels/whole_file_read_ops.cc
+++ b/tensorflow/core/kernels/whole_file_read_ops.cc
@@ -18,11 +18,14 @@ limitations under the License.
 #include <memory>
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/reader_base.h"
+#include "tensorflow/core/framework/reader_base.pb.h"
 #include "tensorflow/core/framework/reader_op_kernel.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/buffered_inputstream.h"
+#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/io/random_inputstream.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -130,10 +133,14 @@ class WriteFileOp : public OpKernel {
                 errors::InvalidArgument(
                     "Contents tensor must be scalar, but had shape: ",
                     contents_input->shape().DebugString()));
-    OP_REQUIRES_OK(
-        context,
-        WriteStringToFile(context->env(), filename_input->scalar<string>()(),
-                          contents_input->scalar<string>()()));
+    const string& filename = filename_input->scalar<string>()();
+    const string dir = io::Dirname(filename).ToString();
+    if (!context->env()->FileExists(dir).ok()) {
+      OP_REQUIRES_OK(context, context->env()->RecursivelyCreateDir(dir));
+    }
+    OP_REQUIRES_OK(context,
+                   WriteStringToFile(context->env(), filename,
+                                     contents_input->scalar<string>()()));
   }
 };
 
diff --git a/tensorflow/core/lib/gtl/array_slice.h b/tensorflow/core/lib/gtl/array_slice.h
index 30ef19ebe8a73069b17bb4bf3aca1d6f0965cd1c..002d166c724c68bb2f6230c0cf3f3fc6f0b4d0e5 100644
--- a/tensorflow/core/lib/gtl/array_slice.h
+++ b/tensorflow/core/lib/gtl/array_slice.h
@@ -191,7 +191,7 @@ class ArraySlice {
   void pop_front() { remove_prefix(1); }
 
   // These relational operators have the same semantics as the
-  // std::vector<T> relational operators: they do deep (elementwise)
+  // std::vector<T> relational operators: they do deep (element-wise)
   // comparisons.  Array slices are equal iff their size is the same
   // and all their elements are equal.
   bool operator==(ArraySlice<T> other) const { return impl_ == other.impl_; }
diff --git a/tensorflow/core/lib/gtl/flatmap.h b/tensorflow/core/lib/gtl/flatmap.h
index 19e1eb5c3bbed2486d2953ff2cd00fa4284ba845..e92083fecfc5f2f86b6df4ab394112a6d8fc2f2a 100644
--- a/tensorflow/core/lib/gtl/flatmap.h
+++ b/tensorflow/core/lib/gtl/flatmap.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <iterator>
 #include <utility>
 #include "tensorflow/core/lib/gtl/flatrep.h"
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -33,7 +34,7 @@ namespace gtl {
 // The map is implemented using an open-addressed hash table.  A
 // single array holds entire map contents and collisions are resolved
 // by probing at a sequence of locations in the array.
-template <typename Key, typename Val, class Hash = std::hash<Key>,
+template <typename Key, typename Val, class Hash = hash<Key>,
           class Eq = std::equal_to<Key>>
 class FlatMap {
  private:
diff --git a/tensorflow/core/lib/gtl/flatmap_test.cc b/tensorflow/core/lib/gtl/flatmap_test.cc
index d8e50830e6a130b959286397a130afc31d72c5d4..bb65e5357a845ebc132a8518fc28fec94b669bde 100644
--- a/tensorflow/core/lib/gtl/flatmap_test.cc
+++ b/tensorflow/core/lib/gtl/flatmap_test.cc
@@ -588,7 +588,7 @@ TEST(FlatMap, ForwardIterator) {
 // or destructions will show up as errors under a sanitizer or
 // heap checker.
 TEST(FlatMap, ConstructDestruct) {
-  FlatMap<string, string, HashStr> map;
+  FlatMap<string, string> map;
   string k1 = "the quick brown fox jumped over the lazy dog";
   string k2 = k1 + k1;
   string k3 = k1 + k2;
diff --git a/tensorflow/core/lib/gtl/flatrep.h b/tensorflow/core/lib/gtl/flatrep.h
index f5e318be1f38931e4a95fec67d78d3f487d741ee..bb405b327aa86983a171727b76a63109d7028431 100644
--- a/tensorflow/core/lib/gtl/flatrep.h
+++ b/tensorflow/core/lib/gtl/flatrep.h
@@ -29,7 +29,7 @@ namespace internal {
 //
 // The representation is an open-addressed hash table.  Conceptually,
 // the representation is a flat array of entries.  However we
-// structure it as an array of of buckets where each bucket holds
+// structure it as an array of buckets where each bucket holds
 // kWidth entries along with metadata for the kWidth entries.  The
 // metadata marker is
 //
diff --git a/tensorflow/core/lib/gtl/flatset.h b/tensorflow/core/lib/gtl/flatset.h
index c5dd45457fa8b9492383aadc7988a433368c7f06..74940880da7cea103e9843c10c48d1dc4db85feb 100644
--- a/tensorflow/core/lib/gtl/flatset.h
+++ b/tensorflow/core/lib/gtl/flatset.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <iterator>
 #include <utility>
 #include "tensorflow/core/lib/gtl/flatrep.h"
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -33,8 +34,7 @@ namespace gtl {
 // The map is implemented using an open-addressed hash table.  A
 // single array holds entire map contents and collisions are resolved
 // by probing at a sequence of locations in the array.
-template <typename Key, class Hash = std::hash<Key>,
-          class Eq = std::equal_to<Key>>
+template <typename Key, class Hash = hash<Key>, class Eq = std::equal_to<Key>>
 class FlatSet {
  private:
   // Forward declare some internal types needed in public section.
diff --git a/tensorflow/core/lib/gtl/flatset_test.cc b/tensorflow/core/lib/gtl/flatset_test.cc
index 54def603c3562c04beddf80f0f3da52fe0bffee5..09fbbb1fb6c6670d24345c0043c56df0ed2c7bb0 100644
--- a/tensorflow/core/lib/gtl/flatset_test.cc
+++ b/tensorflow/core/lib/gtl/flatset_test.cc
@@ -485,7 +485,7 @@ TEST(FlatSet, ForwardIterator) {
 // or destructions will show up as errors under a sanitizer or
 // heap checker.
 TEST(FlatSet, ConstructDestruct) {
-  FlatSet<string, HashStr> set;
+  FlatSet<string> set;
   string k1 = "the quick brown fox jumped over the lazy dog";
   string k2 = k1 + k1;
   string k3 = k1 + k2;
diff --git a/tensorflow/core/lib/gtl/optional.h b/tensorflow/core/lib/gtl/optional.h
index 8ba4b091434c74c6062f7ac91aa7902b71a533e2..2ff8b9c7d1adbbc206e0429142389e9730efa33c 100644
--- a/tensorflow/core/lib/gtl/optional.h
+++ b/tensorflow/core/lib/gtl/optional.h
@@ -656,7 +656,7 @@ class optional : private internal_optional::optional_data<T>,
   constexpr const T& reference() const { return *this->pointer(); }
   T& reference() { return *(this->pointer()); }
 
-  // T constaint checks.  You can't have an optional of nullopt_t, in_place_t or
+  // T constraint checks.  You can't have an optional of nullopt_t, in_place_t or
   // a reference.
   static_assert(
       !std::is_same<nullopt_t, typename std::remove_cv<T>::type>::value,
diff --git a/tensorflow/core/lib/hash/hash.h b/tensorflow/core/lib/hash/hash.h
index 6f965928c7516d11895abf0f77cbf79f4f162b5c..77b8031598bc82bdd5475ea2d6502ee74024074f 100644
--- a/tensorflow/core/lib/hash/hash.h
+++ b/tensorflow/core/lib/hash/hash.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include <string>
 
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -42,18 +43,44 @@ inline uint64 Hash64Combine(uint64 a, uint64 b) {
   return a ^ (b + 0x9e3779b97f4a7800ULL + (a << 10) + (a >> 4));
 }
 
-// Convenience Hash functors
-struct HashStr {
+// Hash functor suitable for use with power-of-two sized hashtables.  Use
+// instead of std::hash<T>.
+//
+// In particular, tensorflow::hash is not the identity function for pointers.
+// This is important for power-of-two sized hashtables like FlatMap and FlatSet,
+// because otherwise they waste the majority of their hash buckets.
+template <typename T>
+struct hash {
+  size_t operator()(const T& t) const { return std::hash<T>()(t); }
+};
+
+template <typename T>
+struct hash<T*> {
+  size_t operator()(const T* t) const {
+    // Hash pointers as integers, but bring more entropy to the lower bits.
+    size_t k = static_cast<size_t>(reinterpret_cast<uintptr_t>(t));
+    return k + (k >> 6);
+  }
+};
+
+template <>
+struct hash<string> {
   size_t operator()(const string& s) const {
     return static_cast<size_t>(Hash64(s));
   }
 };
-template <typename PTR>
-struct HashPtr {
-  size_t operator()(const PTR p) const {
-    // Hash pointers as integers, but bring more entropy to the lower bits.
-    size_t k = static_cast<size_t>(reinterpret_cast<uintptr_t>(p));
-    return k + (k >> 6);
+
+template <>
+struct hash<StringPiece> {
+  size_t operator()(StringPiece sp) const {
+    return static_cast<size_t>(Hash64(sp.data(), sp.size()));
+  }
+};
+
+template <typename T, typename U>
+struct hash<std::pair<T, U>> {
+  size_t operator()(const std::pair<T, U>& p) const {
+    return Hash64Combine(hash<T>()(p.first), hash<U>()(p.second));
   }
 };
 
diff --git a/tensorflow/core/lib/hash/hash_test.cc b/tensorflow/core/lib/hash/hash_test.cc
index a335fddc4c81d0b932402fec3967501df1883ee6..0e5f6c6803389d78d648142992e9ad5b0d487d26 100644
--- a/tensorflow/core/lib/hash/hash_test.cc
+++ b/tensorflow/core/lib/hash/hash_test.cc
@@ -65,6 +65,11 @@ TEST(Hash, SignedUnsignedIssue) {
   }
 }
 
+TEST(Hash, HashPtrIsNotIdentityFunction) {
+  int* ptr = reinterpret_cast<int*>(0xcafe0000);
+  EXPECT_NE(hash<int*>()(ptr), size_t{0xcafe0000});
+}
+
 static void BM_Hash32(int iters, int len) {
   std::string input(len, 'x');
   uint32 h = 0;
diff --git a/tensorflow/core/lib/io/snappy/snappy_buffers_test.cc b/tensorflow/core/lib/io/snappy/snappy_buffers_test.cc
index a27ab73bf7f2d8389f92f89d04b05f909277627a..e0918c70a7902be7e514d2f2b1126d33e05d4d43 100644
--- a/tensorflow/core/lib/io/snappy/snappy_buffers_test.cc
+++ b/tensorflow/core/lib/io/snappy/snappy_buffers_test.cc
@@ -117,7 +117,7 @@ Status TestMultipleWrites(size_t compress_input_buf_size,
   io::SnappyInputBuffer in(file_reader.get(), uncompress_input_buf_size,
                            uncompress_output_buf_size);
 
-  // Run the test twice, reseting the stream after the first attempt.
+  // Run the test twice, resetting the stream after the first attempt.
   for (int attempt = 0; attempt < 2; ++attempt) {
     string actual_result;
     for (int i = 0; i < num_writes; i++) {
diff --git a/tensorflow/core/lib/io/table_format.txt b/tensorflow/core/lib/io/table_format.txt
index e37c627f5b1140ae6bc483a1583f7df35c7677b7..dd846e012955f396cb0c9f59939f4fae1f7700b9 100644
--- a/tensorflow/core/lib/io/table_format.txt
+++ b/tensorflow/core/lib/io/table_format.txt
@@ -5,4 +5,4 @@ The table format is similar to the table format for the LevelDB
 open source key/value store, with the exception that our tables
 do not support "filter" meta blocks (Bloom Filters).  See:
 
-https://github.com/google/leveldb/blob/master/doc/table_format.txt
+https://github.com/google/leveldb/blob/master/doc/table_format.md
diff --git a/tensorflow/core/lib/random/random.cc b/tensorflow/core/lib/random/random.cc
index a30bad2b1b8aa63f78f4b2c2f019478d1eb6fa55..723c1100f8e49f31e1e656649472eb72cec790a9 100644
--- a/tensorflow/core/lib/random/random.cc
+++ b/tensorflow/core/lib/random/random.cc
@@ -22,17 +22,28 @@ limitations under the License.
 namespace tensorflow {
 namespace random {
 
-std::mt19937_64* InitRng() {
+namespace {
+std::mt19937_64* InitRngWithRandomSeed() {
   std::random_device device("/dev/urandom");
   return new std::mt19937_64(device());
 }
+std::mt19937_64 InitRngWithDefaultSeed() { return std::mt19937_64(); }
+
+}  // anonymous namespace
 
 uint64 New64() {
-  static std::mt19937_64* rng = InitRng();
+  static std::mt19937_64* rng = InitRngWithRandomSeed();
   static mutex mu;
   mutex_lock l(mu);
   return (*rng)();
 }
 
+uint64 New64DefaultSeed() {
+  static std::mt19937_64 rng = InitRngWithDefaultSeed();
+  static mutex mu;
+  mutex_lock l(mu);
+  return rng();
+}
+
 }  // namespace random
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/random/random.h b/tensorflow/core/lib/random/random.h
index 7262f2dbb14af80acbcd515bcc85e4a532a3e23f..5335c8cc3c9f30283f758dd38ddb51c7673f0b9d 100644
--- a/tensorflow/core/lib/random/random.h
+++ b/tensorflow/core/lib/random/random.h
@@ -25,6 +25,10 @@ namespace random {
 // in different processes.
 uint64 New64();
 
+// Return a 64-bit random value. Uses
+// std::mersenne_twister_engine::default_seed as seed value.
+uint64 New64DefaultSeed();
+
 }  // namespace random
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/lib/strings/strcat.cc b/tensorflow/core/lib/strings/strcat.cc
index 3e864c4f2821a00c89eaf5b7e0a0da275bc770a1..46a45a66783af3444589cd66eab16c427ae1b890 100644
--- a/tensorflow/core/lib/strings/strcat.cc
+++ b/tensorflow/core/lib/strings/strcat.cc
@@ -38,7 +38,7 @@ AlphaNum::AlphaNum(Hex hex) {
   // We accomplish minimum width by OR'ing in 0x10000 to the user's value,
   // where 0x10000 is the smallest hex number that is as wide as the user
   // asked for.
-  uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value;
+  uint64 mask = (static_cast<uint64>(1) << (width - 1) * 4) | value;
   static const char hexdigits[] = "0123456789abcdef";
   do {
     *--writer = hexdigits[value & 0xF];
diff --git a/tensorflow/core/lib/wav/wav_io.cc b/tensorflow/core/lib/wav/wav_io.cc
index 028ff26ffb9b23d57be908494005c6ed1745c981..79918690dbb4b464219f43ab4ef3ff041e25fa50 100644
--- a/tensorflow/core/lib/wav/wav_io.cc
+++ b/tensorflow/core/lib/wav/wav_io.cc
@@ -147,7 +147,7 @@ Status EncodeAudioAsS16LEWav(const float* audio, size_t sample_rate,
     return errors::InvalidArgument("num_frames must be positive.");
   }
 
-  const size_t bytes_per_second = sample_rate * kBytesPerSample;
+  const size_t bytes_per_second = sample_rate * kBytesPerSample * num_channels;
   const size_t num_samples = num_frames * num_channels;
   const size_t data_size = num_samples * kBytesPerSample;
   const size_t file_size = kHeaderSize + num_samples * kBytesPerSample;
@@ -242,8 +242,7 @@ Status DecodeLin16WaveAsFloatVector(const string& wav_string,
         "Bad bytes per sample in WAV header: Expected ",
         expected_bytes_per_sample, " but got ", bytes_per_sample);
   }
-  const uint32 expected_bytes_per_second =
-      (bytes_per_sample * (*sample_rate)) / *channel_count;
+  const uint32 expected_bytes_per_second = bytes_per_sample * *sample_rate;
   if (bytes_per_second != expected_bytes_per_second) {
     return errors::InvalidArgument(
         "Bad bytes per second in WAV header: Expected ",
diff --git a/tensorflow/core/lib/wav/wav_io_test.cc b/tensorflow/core/lib/wav/wav_io_test.cc
index e54b9445abc30b804366392455eba31693c6b2b5..40ddd94abef649285528b6ee177620eedb07df20 100644
--- a/tensorflow/core/lib/wav/wav_io_test.cc
+++ b/tensorflow/core/lib/wav/wav_io_test.cc
@@ -97,5 +97,63 @@ TEST(WavIO, EncodeThenDecode) {
   }
 }
 
+TEST(WavIO, BasicMono) {
+  std::vector<uint8> wav_data = {
+      'R', 'I', 'F', 'F',  // ChunkID
+      44, 0, 0, 0,         // ChunkSize: 36 + SubChunk2Size
+      'W', 'A', 'V', 'E',  // Format
+      'f', 'm', 't', ' ',  // Subchunk1ID
+      16, 0, 0, 0,         // Subchunk1Size
+      1, 0,                // AudioFormat: 1=PCM
+      1, 0,                // NumChannels
+      0x44, 0xac, 0, 0,    // SampleRate: 44100
+      0x88, 0x58, 0x1, 0,  // BytesPerSecond: SampleRate * NumChannels *
+                           //                 BitsPerSample/8
+      2, 0,                // BytesPerSample: NumChannels * BitsPerSample/8
+      16, 0,               // BitsPerSample
+      'd', 'a', 't', 'a',  // Subchunk2ID
+      8, 0, 0, 0,          // Subchunk2Size: NumSamples * NumChannels *
+                           //                BitsPerSample/8
+      0, 0,                // Sample 1: 0
+      0xff, 0x7f,          // Sample 2: 32767 (saturated)
+      0, 0,                // Sample 3: 0
+      0x00, 0x80,          // Sample 4: -32768 (saturated)
+  };
+  string expected(wav_data.begin(), wav_data.end());
+  float audio[] = {0.0f, 1.0f, 0.0f, -1.0f};
+  string result;
+  TF_EXPECT_OK(EncodeAudioAsS16LEWav(audio, 44100, 1, 4, &result));
+  EXPECT_EQ(expected, result);
+}
+
+TEST(WavIO, BasicStereo) {
+  std::vector<uint8> wav_data = {
+      'R', 'I', 'F', 'F',  // ChunkID
+      44, 0, 0, 0,         // ChunkSize: 36 + SubChunk2Size
+      'W', 'A', 'V', 'E',  // Format
+      'f', 'm', 't', ' ',  // Subchunk1ID
+      16, 0, 0, 0,         // Subchunk1Size
+      1, 0,                // AudioFormat: 1=PCM
+      2, 0,                // NumChannels
+      0x44, 0xac, 0, 0,    // SampleRate: 44100
+      0x10, 0xb1, 0x2, 0,  // BytesPerSecond: SampleRate * NumChannels *
+                           //                 BitsPerSample/8
+      4, 0,                // BytesPerSample: NumChannels * BitsPerSample/8
+      16, 0,               // BitsPerSample
+      'd', 'a', 't', 'a',  // Subchunk2ID
+      8, 0, 0, 0,          // Subchunk2Size: NumSamples * NumChannels *
+                           //                BitsPerSample/8
+      0, 0,                // Sample 1: 0
+      0xff, 0x7f,          // Sample 2: 32767 (saturated)
+      0, 0,                // Sample 3: 0
+      0x00, 0x80,          // Sample 4: -32768 (saturated)
+  };
+  string expected(wav_data.begin(), wav_data.end());
+  float audio[] = {0.0f, 1.0f, 0.0f, -1.0f};
+  string result;
+  TF_EXPECT_OK(EncodeAudioAsS16LEWav(audio, 44100, 2, 2, &result));
+  EXPECT_EQ(expected, result);
+}
+
 }  // namespace wav
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 85a6cfcac91785e620fa00b97cdf794d60c2e7ee..d587683a2932b7740771928725669de12d8ff64d 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/util/mirror_pad_mode.h"
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/strided_slice_op.h"
@@ -172,11 +173,11 @@ REGISTER_OP("ParallelConcat")
     .Attr("shape: shape")
     .SetShapeFn([](InferenceContext* c) {
       // Validate that the shape attr is correct.
-      TensorShapeProto passed_shape_proto;
-      TF_RETURN_IF_ERROR(c->GetAttr("shape", &passed_shape_proto));
+      PartialTensorShape shape;
+      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
       ShapeHandle passed_shape;
       TF_RETURN_IF_ERROR(
-          c->MakeShapeFromShapeProto(passed_shape_proto, &passed_shape));
+          c->MakeShapeFromPartialTensorShape(shape, &passed_shape));
       if (!c->FullyDefined(passed_shape)) {
         return errors::InvalidArgument("shape attr must be fully defined.");
       }
@@ -637,11 +638,9 @@ REGISTER_OP("ImmutableConst")
     .SetShapeFn([](InferenceContext* c) {
       TensorShape shape_from_attr;
       TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape_from_attr));
-      TensorShapeProto shape_proto;
-      shape_from_attr.AsProto(&shape_proto);
       ShapeHandle output_shape;
       TF_RETURN_IF_ERROR(
-          c->MakeShapeFromShapeProto(shape_proto, &output_shape));
+          c->MakeShapeFromPartialTensorShape(shape_from_attr, &output_shape));
       c->set_output(0, output_shape);
       return Status::OK();
     })
@@ -1149,7 +1148,8 @@ reverse(t, dims) ==> [[[[8, 9, 10, 11],
 ```
 
 tensor: Up to 8-D.
-axis: 1-D. The indices of the dimensions to reverse.
+axis: 1-D. The indices of the dimensions to reverse. Must be in the range
+  `[-rank(tensor), rank(tensor))`.
 output: The same shape as `tensor`.
 )Doc");
 
@@ -1306,11 +1306,11 @@ REGISTER_OP("_ParallelConcatStart")
     .Attr("dtype: type")
     .SetIsStateful()
     .SetShapeFn([](InferenceContext* c) {
-      TensorShapeProto shape_proto;
-      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape_proto));
+      PartialTensorShape shape;
+      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
       ShapeHandle output_shape;
       TF_RETURN_IF_ERROR(
-          c->MakeShapeFromShapeProto(shape_proto, &output_shape));
+          c->MakeShapeFromPartialTensorShape(shape, &output_shape));
       c->set_output(0, output_shape);
       return Status::OK();
     })
@@ -1398,6 +1398,105 @@ raising an error.
 </div>
 )doc");
 
+// --------------------------------------------------------------------------
+REGISTER_OP("GatherV2")
+    .Input("params: Tparams")
+    .Input("indices: Tindices")
+    .Input("axis: Taxis")
+    .Output("output: Tparams")
+    .Attr("Tparams: type")
+    .Attr("Tindices: {int32,int64}")
+    .Attr("Taxis: {int32,int64}")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle params_shape;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &params_shape));
+
+      ShapeHandle indices_shape = c->input(1);
+      ShapeHandle unused_axis_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused_axis_shape));
+      const Tensor* axis_t = c->input_tensor(2);
+
+      // If axis is unknown, we can only infer that the result is params_rank +
+      // indices_rank - 1.
+      if (axis_t == nullptr) {
+        if (c->RankKnown(params_shape) && c->RankKnown(indices_shape)) {
+          c->set_output(0, c->UnknownShapeOfRank(c->Rank(params_shape) +
+                                                 c->Rank(indices_shape) - 1));
+        } else {
+          c->set_output(0, c->UnknownShape());
+        }
+        return Status::OK();
+      }
+
+      // Note, axis can be negative.
+      int64 axis = 0;
+      if (axis_t->dtype() == DT_INT32) {
+        axis = axis_t->scalar<int32>()();
+      } else {
+        axis = axis_t->scalar<int64>()();
+      }
+
+      // Check that params has rank of at least axis + 1.
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(
+          params_shape, axis < 0 ? -axis : axis + 1, &unused));
+
+      ShapeHandle params_outer_subshape;
+      TF_RETURN_IF_ERROR(
+          c->Subshape(params_shape, 0, axis, &params_outer_subshape));
+
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(
+          c->Concatenate(params_outer_subshape, indices_shape, &out));
+
+      // Slice from axis + 1 to the end of params_shape to collect the inner
+      // dimensions of the result. Special case -1 here since -1 + 1 wraps, and
+      // we slice from 0 to the end of shape. Subshape() handles all other
+      // out-of-bounds checking.
+      if (axis != -1) {
+        ShapeHandle params_inner_subshape;
+        TF_RETURN_IF_ERROR(
+            c->Subshape(params_shape, axis + 1, &params_inner_subshape));
+        TF_RETURN_IF_ERROR(c->Concatenate(out, params_inner_subshape, &out));
+      }
+
+      c->set_output(0, out);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Gather slices from `params` axis `axis` according to `indices`.
+
+`indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+Produces an output tensor with shape `params.shape[:axis] + indices.shape +
+params.shape[axis + 1:]` where:
+
+```python
+    # Scalar indices (output is rank(params) - 1).
+    output[a_0, ..., a_n, b_0, ..., b_n] =
+      params[a_0, ..., a_n, indices, b_0, ..., b_n]
+
+    # Vector indices (output is rank(params)).
+    output[a_0, ..., a_n, i, b_0, ..., b_n] =
+      params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
+
+    # Higher rank indices (output is rank(params) + rank(indices) - 1).
+    output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
+      params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
+```
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+</div>
+
+params: The tensor from which to gather values. Must be at least rank
+  `axis + 1`.
+indices: Index tensor. Must be in range `[0, params.shape[axis])`.
+axis: The axis in `params` to gather `indices` from. Defaults to the first
+  dimension. Supports negative indexes.
+output: Values from `params` gathered from indices given by `indices`, with
+  shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
+)doc");
+
 // --------------------------------------------------------------------------
 REGISTER_OP("GatherNd")
     .Input("params: Tparams")
@@ -1434,15 +1533,25 @@ REGISTER_OP("GatherNd")
       return Status::OK();
     })
     .Doc(R"doc(
-Gather values or slices from `params` according to `indices`.
+Gather slices from `params` into a Tensor with shape specified by `indices`.
 
-`indices` is an integer tensor containing indices into `params`.  The last
-dimension of `indices` can be at most the rank of `params`:
+`indices` is an K-dimensional integer tensor, best thought of as a
+(K-1)-dimensional tensor of indices into `params`, where each element defines a
+slice of `params`:
+
+    output[i_0, ..., i_{K-2}] = params[indices[i0, ..., i_{K-2}]]
+
+Whereas in @{tf.gather} `indices` defines slices into the first
+dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the
+first `N` dimensions of `params`, where `N = indices.shape[-1]`.
+
+The last dimension of `indices` can be at most the rank of
+`params`:
 
     indices.shape[-1] <= params.rank
 
 The last dimension of `indices` corresponds to elements
-(if `indices.shape[-1] = params.rank`) or slices
+(if `indices.shape[-1] == params.rank`) or slices
 (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
 of `params`.  The output tensor has shape
 
@@ -1578,6 +1687,20 @@ REGISTER_OP("RefIdentity")
 Return the same ref tensor as the input ref tensor.
 )Doc");
 
+// --------------------------------------------------------------------------
+REGISTER_OP("DebugGradientIdentity")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: type")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .SetAllowsUninitializedInput()
+    .Doc(R"Doc(
+Identity op for gradient debugging.
+
+This op is hidden from public in Python. It is used by TensorFlow Debugger to
+register gradient tensors for gradient debugging.
+)Doc");
+
 // --------------------------------------------------------------------------
 REGISTER_OP("StopGradient")
     .Input("input: T")
@@ -2266,11 +2389,10 @@ REGISTER_OP("StridedSlice")
         return Status::OK();
       }
 
-      TensorShapeProto input_shape_proto;
+      PartialTensorShape input_shape({});
       for (int i = 0; i < c->Rank(input); ++i) {
         auto dim = c->Dim(input, i);
-        input_shape_proto.add_dim()->set_size(c->ValueKnown(dim) ? c->Value(dim)
-                                                                 : -1);
+        input_shape.AddDim(c->ValueKnown(dim) ? c->Value(dim) : -1);
       }
 
       int32 begin_mask, end_mask, ellipsis_mask, new_axis_mask,
@@ -2284,21 +2406,17 @@ REGISTER_OP("StridedSlice")
       const Tensor* begin_value = c->input_tensor(1);
       const Tensor* end_value = c->input_tensor(2);
 
-      TensorShapeProto processing_shape, final_shape;
-      ShapeReadWriteFromTensorShapeProto wrapped_processing_shape(
-          &processing_shape);
-      ShapeReadWriteFromTensorShapeProto wrapped_final_shape(&final_shape);
+      PartialTensorShape processing_shape, final_shape;
       bool is_identity, is_simple_slice, slice_dim0;
       gtl::InlinedVector<int64, 4> begin, end, strides;
       TF_RETURN_IF_ERROR(ValidateStridedSliceOp(
-          begin_value, end_value, *strides_value,
-          ShapeReadWriteFromTensorShapeProto(&input_shape_proto), begin_mask,
+          begin_value, end_value, *strides_value, input_shape, begin_mask,
           end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask,
-          &wrapped_processing_shape, &wrapped_final_shape, &is_identity,
-          &is_simple_slice, &slice_dim0, &begin, &end, &strides));
+          &processing_shape, &final_shape, &is_identity, &is_simple_slice,
+          &slice_dim0, &begin, &end, &strides));
 
       ShapeHandle out;
-      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeProto(final_shape, &out));
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(final_shape, &out));
       c->set_output(0, out);
 
       return Status::OK();
@@ -2715,6 +2833,45 @@ pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
 
 )doc");
 
+// --------------------------------------------------------------------------
+REGISTER_OP("PadV2")
+    .Input("input: T")
+    .Input("paddings: Tpaddings")
+    .Input("constant_values: T")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tpaddings: {int32, int64} = DT_INT32")
+    .SetShapeFn(PadShapeFn)
+    .Doc(R"doc(
+Pads a tensor.
+
+This operation pads `input` according to the `paddings` and `constant_values`
+you specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is
+the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+how many padding values to add before the contents of `input` in that dimension,
+and `paddings[D, 1]` indicates how many padding values to add after the contents
+of `input` in that dimension. `constant_values` is a scalar tensor of the same
+type as `input` that indicates the value to use for padding `input`.
+
+The padded size of each dimension D of the output is:
+
+`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+
+For example:
+
+```
+# 't' is [[1, 1], [2, 2]]
+# 'paddings' is [[1, 1], [2, 2]]
+# 'constant_values' is 0
+# rank of 't' is 2
+pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+                      [0, 0, 1, 1, 0, 0]
+                      [0, 0, 2, 2, 0, 0]
+                      [0, 0, 0, 0, 0, 0]]
+```
+
+)doc");
+
 // --------------------------------------------------------------------------
 REGISTER_OP("MirrorPad")
     .Input("input: T")
@@ -2870,10 +3027,8 @@ REGISTER_OP("Placeholder")
         return shape_inference::UnknownShape(c);
       }
 
-      TensorShapeProto shape_proto;
-      shape.AsProto(&shape_proto);
       ShapeHandle out;
-      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeProto(shape_proto, &out));
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &out));
       c->set_output(0, out);
       return Status::OK();
     })
@@ -2898,10 +3053,10 @@ REGISTER_OP("PlaceholderV2")
     .Attr("dtype: type")
     .Attr("shape: shape")
     .SetShapeFn([](InferenceContext* c) {
-      TensorShapeProto shape;
+      PartialTensorShape shape;
       TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
       ShapeHandle output;
-      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeProto(shape, &output));
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &output));
       c->set_output(0, output);
       return Status::OK();
     })
@@ -2929,10 +3084,8 @@ REGISTER_OP("PlaceholderWithDefault")
       ShapeHandle input = c->input(0);
       PartialTensorShape shape;
       TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
-      TensorShapeProto shape_proto;
-      shape.AsProto(&shape_proto);
       ShapeHandle out;
-      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeProto(shape_proto, &out));
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &out));
 
       // We merge for compatibility checking, but return the output,
       // since output_shape may be less precise than input_shape.
@@ -3034,7 +3187,8 @@ This operation is related to `squeeze()`, which removes dimensions of
 size 1.
 
 dim: 0-D (scalar). Specifies the dimension index at which to
-  expand the shape of `input`.
+  expand the shape of `input`. Must be in the range
+  `[-rank(input) - 1, rank(input)]`.
 output: Contains the same data as `input`, but its shape has an additional
   dimension of size 1 added.
 )doc");
@@ -3130,7 +3284,8 @@ shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
 
 input: The `input` to squeeze.
 squeeze_dims: If specified, only squeezes the dimensions listed. The dimension
-  index starts at 0. It is an error to squeeze a dimension that is not 1.
+  index starts at 0. It is an error to squeeze a dimension that is not 1. Must
+  be in the range `[-rank(input), rank(input))`.
 output: Contains the same data as `input`, but has one or more dimensions of
   size 1 removed.
 )doc");
@@ -3903,9 +4058,8 @@ REGISTER_OP("SpaceToDepth")
       TF_RETURN_IF_ERROR(c->Multiply(c->Dim(input, 3), block_size * block_size,
                                      &output_depth));
 
-      c->set_output(0,
-                    c->MakeShape({c->Dim(input, 0), output_height, output_width,
-                                  output_depth}));
+      c->set_output(0, c->MakeShape({c->Dim(input, 0), output_height,
+                                     output_width, output_depth}));
       return Status::OK();
     })
     .Doc(R"doc(
@@ -4009,9 +4163,8 @@ REGISTER_OP("DepthToSpace")
       TF_RETURN_IF_ERROR(c->Divide(c->Dim(input, 3), block_size * block_size,
                                    true /* evenly_divisible */, &output_depth));
 
-      c->set_output(0,
-                    c->MakeShape({c->Dim(input, 0), output_height, output_width,
-                                  output_depth}));
+      c->set_output(0, c->MakeShape({c->Dim(input, 0), output_height,
+                                     output_width, output_depth}));
       return Status::OK();
     })
     .Doc(R"doc(
@@ -4191,7 +4344,8 @@ Extract `patches` from `images` and put them in the "depth" output dimension.
 images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
 patches: 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
   ksize_cols * depth]` containing image patches with size
-  `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension.
+  `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
+  `out_rows` and `out_cols` are the dimensions of the output patches.
 ksizes: The size of the sliding window for each dimension of `images`.
 strides: 1-D of length 4. How far the centers of two consecutive patches are in
   the images. Must be: `[1, stride_rows, stride_cols, 1]`.
@@ -4199,7 +4353,8 @@ rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the
   input stride, specifying how far two consecutive patch samples are in the
   input. Equivalent to extracting patches with
   `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
-  subsampling them spatially by a factor of `rates`.
+  subsampling them spatially by a factor of `rates`. This is equivalent to
+  `rate` in dilated (a.k.a. Atrous) convolutions.
 padding: The type of padding algorithm to use.
 
 We specify the size-related attributes as:
@@ -4216,8 +4371,15 @@ We specify the size-related attributes as:
 REGISTER_OP("Bitcast")
     .Input("input: T")
     .Output("output: type")
-    .Attr("T: numbertype")
-    .Attr("type: numbertype")
+    // All supported dtypes are listed here to include qint16 and quint16.
+    .Attr(
+        "T: {float, double, int64, int32, uint8, uint16, int8, int16,"
+        " complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
+        " half}")
+    .Attr(
+        "type: {float, double, int64, int32, uint8, uint16, int8, int16,"
+        " complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
+        " half}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
       if (!c->RankKnown(input)) {
@@ -4435,6 +4597,7 @@ REGISTER_OP("QuantizeAndDequantize")
 Use QuantizeAndDequantizeV2 instead.
 )doc");
 
+// TODO(suharshs): Deprecate QuantizeAndDequantizeV2.
 REGISTER_OP("QuantizeAndDequantizeV2")
     .Input("input: T")
     .Input("input_min: T")
@@ -4514,6 +4677,30 @@ input_max: If range_given, this is the max of the range, otherwise this input
            will be ignored.
 )doc");
 
+REGISTER_OP("QuantizeAndDequantizeV3")
+    .Input("input: T")
+    .Input("input_min: T")
+    .Input("input_max: T")
+    .Input("num_bits: int32")
+    .Attr("signed_input: bool = true")
+    .Attr("range_given: bool = true")
+    .Output("output: T")
+    .Attr("T: {float, double}")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      c->set_output(0, c->input(0));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Quantizes then dequantizes a tensor.
+
+This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
+tensor, so its value can change during training.
+)doc");
+
 REGISTER_OP("QuantizeV2")
     .Input("input: float")
     .Input("min_range: float")
@@ -4839,8 +5026,8 @@ Scatter `updates` into a new (initially zero) tensor according to `indices`.
 
 Creates a new tensor by applying sparse `updates` to individual
 values or slices within a zero tensor of the given `shape` according to
-indices.  This operator is the inverse of the [tf.gather_nd](#gather_nd)
-operator which extracts values or slices from a given tensor.
+indices.  This operator is the inverse of the @{tf.gather_nd} operator which
+extracts values or slices from a given tensor.
 
 **WARNING**: The order in which updates are applied is nondeterministic, so the
 output will be nondeterministic if `indices` contains duplicates.
@@ -4916,6 +5103,61 @@ output: A new tensor with the given shape and updates applied according
   to the indices.
 )doc");
 
+REGISTER_OP("ScatterNdNonAliasingAdd")
+    .Input("input: T")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Output("output: T")
+    .Attr("T: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(shape_inference::ScatterNdUpdateShape)
+    .Doc(R"doc(
+Applies sparse addition to `input` using individual values or slices
+from `updates` according to indices `indices`.  The updates are non-aliasing:
+`input` is only modified in-place if no other operations will use it.
+Otherwise, a copy of `input` is made.  This operation has a gradient with
+respect to both `input` and `updates`.
+
+`input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+`indices` must be integer tensor, containing indices into `input`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+The innermost dimension of `indices` (with length `K`) corresponds to
+indices into elements (if `K = P`) or `(P-K)`-dimensional slices
+(if `K < P`) along the `K`th dimension of `input`.
+
+`updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+```
+[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].
+```
+
+For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
+elements. In Python, that addition would look like this:
+
+    input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
+    indices = tf.constant([[4], [3], [1], [7]])
+    updates = tf.constant([9, 10, 11, 12])
+    output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
+    with tf.Session() as sess:
+      print(sess.run(output))
+
+The resulting value `output` would look like this:
+
+    [1, 13, 3, 14, 14, 6, 7, 20]
+
+See @{tf.scatter_nd} for more details about how to make updates to slices.
+
+input: A Tensor.
+indices: A Tensor. Must be one of the following types: `int32`, `int64`.
+  A tensor of indices into `input`.
+updates: A Tensor. Must have the same type as ref. A tensor of updated values
+  to add to `input`.
+output: A `Tensor` with the same shape as `input`, containing values of `input`
+  updated with `updates`.
+)doc");
+
 REGISTER_OP("FakeQuantWithMinMaxArgs")
     .Attr("min: float = -6.0")
     .Attr("max: float = 6.0")
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index b1d334e4545f7f753a391abb440d60b3c272a009..a5d7a32e05f4688a3a7a7a21eaa9b18d44a21b15 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/framework/shape_inference_testutil.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
@@ -262,6 +264,61 @@ TEST(ArrayOpsTest, Gather_ShapeFn) {
   INFER_ERROR("Shape must be at least rank 1 but is rank 0", op, "[];[1,2,3]");
 }
 
+TEST(ArrayOpsTest, GatherV2_ShapeFn) {
+  ShapeInferenceTestOp op("GatherV2");
+
+  // Tests when axis is unknown.
+  INFER_OK(op, "?;?;?", "?");
+  INFER_OK(op, "[1,2,3];[3];[]", "[?,?,?]");
+  INFER_ERROR("Shape must be at least rank 1 but is rank 0", op,
+              "[];[1,2,3];[]");
+
+  // Non-scalar axis.
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "[1];[1,2,3];[1]");
+
+  // Test when axis dim is known.
+  Tensor axis_dim_t;
+  op.input_tensors.resize(3);
+  op.input_tensors[2] = &axis_dim_t;
+
+  // Out of range axis.
+  axis_dim_t = test::AsScalar(1);
+  INFER_ERROR("Shape must be at least rank 2 but is rank 1", op,
+              "[1];[1,2];[]");
+
+  // Rank 0 indices.
+  axis_dim_t = test::AsScalar(0);
+  INFER_OK(op, "[1,2,3];[];[]", "[d0_1,d0_2]");
+  axis_dim_t = test::AsScalar(1);
+  INFER_OK(op, "[1,2,3];[];[]", "[d0_0,d0_2]");
+  axis_dim_t = test::AsScalar(2);
+  INFER_OK(op, "[1,2,3];[];[]", "[d0_0,d0_1]");
+
+  // Rank 1 indices.
+  axis_dim_t = test::AsScalar(0);
+  INFER_OK(op, "[1,2,3];[5];[]", "[d1_0,d0_1,d0_2]");
+  axis_dim_t = test::AsScalar(1);
+  INFER_OK(op, "[1,2,3];[5];[]", "[d0_0,d1_0,d0_2]");
+  axis_dim_t = test::AsScalar(2);
+  INFER_OK(op, "[1,2,3];[5];[]", "[d0_0,d0_1,d1_0]");
+
+  // Rank 2 indices.
+  axis_dim_t = test::AsScalar(0);
+  INFER_OK(op, "[1,2,3];[5,6];[]", "[d1_0,d1_1,d0_1,d0_2]");
+  axis_dim_t = test::AsScalar(1);
+  INFER_OK(op, "[1,2,3];[5,6];[]", "[d0_0,d1_0,d1_1,d0_2]");
+  axis_dim_t = test::AsScalar(2);
+  INFER_OK(op, "[1,2,3];[5,6];[]", "[d0_0,d0_1,d1_0,d1_1]");
+
+  // Negative axis.
+  axis_dim_t = test::AsScalar(-3);
+  INFER_OK(op, "[1,2,3];[5,6];[]", "[d1_0,d1_1,d0_1,d0_2]");
+  axis_dim_t = test::AsScalar(-2);
+  INFER_OK(op, "[1,2,3];[5,6];[]", "[d0_0,d1_0,d1_1,d0_2]");
+  axis_dim_t = test::AsScalar(-1);
+  INFER_OK(op, "[1,2,3];[5,6];[]", "[d0_0,d0_1,d1_0,d1_1]");
+}
+
 TEST(ArrayOpsTest, GatherNd_ShapeFn) {
   ShapeInferenceTestOp op("GatherNd");
 
@@ -347,6 +404,36 @@ TEST(ArrayOpsTest, PadD_ShapeFn) {
   }
 }
 
+TEST(ArrayOpsTest, PadV2_ShapeFn) {
+  ShapeInferenceTestOp op("PadV2");
+  op.input_tensors.resize(3);
+
+  // Inputs are input, paddings and constant_values.
+
+  INFER_OK(op, "?;?;?", "?");
+
+  // Check shape of paddings.
+  INFER_ERROR("Shape must be rank 2 but is rank 3", op, "?;[1,2,3];?");
+  INFER_ERROR("Dimension must be 2 but is 4", op, "?;[1,4];?");
+
+  // input.rank and paddings.dim(0) are equal. This is the number of dims in
+  // output.
+  INFER_ERROR("Shape must be rank 4 but is rank 3", op, "[1,2,3];[4,2];[]");
+  INFER_OK(op, "[1,2,3];?;[]", "[?,?,?]");
+  INFER_OK(op, "?;[3,2];[]", "[?,?,?]");
+
+  // Make the paddings tensor known and verify padding values get added.
+  // E.g., if padding is ((1,10),(2,20),(3,30)) then values 11,22,23 are added
+  // to input dims to get output.
+  Tensor paddings_t(DT_INT64, TensorShape{3, 2});
+  test::FillValues<int64>(&paddings_t, {1, 10, 2, 20, 3, 30});
+  op.input_tensors[1] = &paddings_t;
+  INFER_OK(op, "[100,200,300];[3,2];[]", "[111,222,333]");
+  INFER_OK(op, "[100,?,300];[3,2];[]", "[111,?,333]");
+  INFER_OK(op, "?;[3,2];[]", "[?,?,?]");
+  INFER_OK(op, "?;?;[]", "[?,?,?]");
+}
+
 TEST(ArrayOpsTest, MirrorPadGrad_ShapeFn) {
   ShapeInferenceTestOp op("MirrorPadGrad");
   op.input_tensors.resize(2);
diff --git a/tensorflow/core/ops/audio_ops.cc b/tensorflow/core/ops/audio_ops.cc
index 02b13a455ceaa4c0d8c91ba63136083a47658c45..91e81f2579ce89c2729ac1b16f9d6593e27b9ae6 100644
--- a/tensorflow/core/ops/audio_ops.cc
+++ b/tensorflow/core/ops/audio_ops.cc
@@ -33,7 +33,7 @@ Status DecodeWavShapeFn(InferenceContext* c) {
   DimensionHandle channels_dim;
   int32 desired_channels;
   TF_RETURN_IF_ERROR(c->GetAttr("desired_channels", &desired_channels));
-  if (desired_channels == 0) {
+  if (desired_channels == -1) {
     channels_dim = c->UnknownDim();
   } else {
     if (desired_channels < 0) {
@@ -45,7 +45,7 @@ Status DecodeWavShapeFn(InferenceContext* c) {
   DimensionHandle samples_dim;
   int32 desired_samples;
   TF_RETURN_IF_ERROR(c->GetAttr("desired_samples", &desired_samples));
-  if (desired_samples == 0) {
+  if (desired_samples == -1) {
     samples_dim = c->UnknownDim();
   } else {
     if (desired_samples < 0) {
diff --git a/tensorflow/core/ops/bitwise_ops.cc b/tensorflow/core/ops/bitwise_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3ffc4ab74af71abf515c838d73fe9fe0b8863070
--- /dev/null
+++ b/tensorflow/core/ops/bitwise_ops.cc
@@ -0,0 +1,80 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("Invert")
+    .Input("x: T")
+    .Output("y: T")
+    .Attr("T: {int8, int16, int32, int64, uint8, uint16}")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Flips all bits elementwise.
+
+The result will have exactly those bits set, that are not set in `x`. The
+computation is performed on the underlying representation of x.
+)doc");
+
+#define BINARY_BITWISE()                                     \
+  Input("x: T")                                              \
+      .Input("y: T")                                         \
+      .Output("z: T")                                        \
+      .SetIsCommutative()                                    \
+      .Attr("T: {int8, int16, int32, int64, uint8, uint16}") \
+      .SetShapeFn(shape_inference::UnchangedShape)
+
+REGISTER_OP("PopulationCount")
+    .Input("x: T")
+    .Output("y: uint8")
+    .Attr("T: {int8, int16, int32, int64, uint8, uint16}")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Computes element-wise population count (a.k.a. popcount, bitsum, bitcount).
+
+For each entry in `x`, calculates the number of `1` (on) bits in the binary
+representation of that entry.
+
+**NOTE**: It is more efficient to first `tf.bitcast` your tensors into
+`int32` or `int64` and perform the bitcount on the result, than to feed in
+8- or 16-bit inputs and then aggregate the resulting counts.
+)doc");
+
+REGISTER_OP("BitwiseAnd").BINARY_BITWISE().Doc(R"doc(
+Elementwise computes the bitwise AND of `x` and `y`.
+
+The result will have those bits set, that are set in both `x` and `y`. The
+computation is performed on the underlying representations of `x` and `y`.
+)doc");
+
+REGISTER_OP("BitwiseOr").BINARY_BITWISE().Doc(R"doc(
+Elementwise computes the bitwise OR of `x` and `y`.
+
+The result will have those bits set, that are set in `x`, `y` or both. The
+computation is performed on the underlying representations of `x` and `y`.
+)doc");
+
+REGISTER_OP("BitwiseXor").BINARY_BITWISE().Doc(R"doc(
+Elementwise computes the bitwise XOR of `x` and `y`.
+
+The result will have those bits set, that are different in `x` and `y`. The
+computation is performed on the underlying representations of `x` and `y`.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 4b10e5b79e36cd2ba68ea08e1e36a929e353d28c..b82035bfc327846e5fc4e787d287b976e250dfcb 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -165,6 +165,30 @@ op {
     }
   }
 }
+op {
+  name: "Acosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Add"
   input_arg {
@@ -1136,6 +1160,82 @@ op {
     }
   }
 }
+op {
+  name: "ApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ApplyGradientDescent"
   input_arg {
@@ -1540,6 +1640,69 @@ op {
     }
   }
 }
+op {
+  name: "ArgMax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "ArgMin"
   input_arg {
@@ -1590,6 +1753,69 @@ op {
     }
   }
 }
+op {
+  name: "ArgMin"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "AsString"
   input_arg {
@@ -1677,6 +1903,30 @@ op {
     }
   }
 }
+op {
+  name: "Asinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Assert"
   input_arg {
@@ -1881,6 +2131,30 @@ op {
     }
   }
 }
+op {
+  name: "Atanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "AudioSpectrogram"
   input_arg {
@@ -2942,6 +3216,32 @@ op {
     version: 13
   }
 }
+op {
+  name: "BatchMatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 13
+  }
+}
 op {
   name: "BatchMatrixDiag"
   input_arg {
@@ -3577,47 +3877,190 @@ op {
         type: DT_INT32
         type: DT_UINT8
         type: DT_UINT16
-        type: DT_INT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "BiasAddV1"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Bincount"
+  input_arg {
+    name: "arr"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "weights"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "bins"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Bitcast"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Bitcast"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT8
+        type: DT_INT16
         type: DT_COMPLEX64
         type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
         type: DT_QINT32
         type: DT_HALF
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
-  }
-}
-op {
-  name: "BiasAddV1"
-  input_arg {
-    name: "value"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "bias"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
+    name: "type"
     type: "type"
     allowed_values {
       list {
@@ -3627,12 +4070,14 @@ op {
         type: DT_INT32
         type: DT_UINT8
         type: DT_UINT16
-        type: DT_INT16
         type: DT_INT8
+        type: DT_INT16
         type: DT_COMPLEX64
         type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
         type: DT_QINT32
         type: DT_HALF
       }
@@ -3640,21 +4085,17 @@ op {
   }
 }
 op {
-  name: "Bincount"
+  name: "BitwiseAnd"
   input_arg {
-    name: "arr"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "weights"
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "bins"
+    name: "z"
     type_attr: "T"
   }
   attr {
@@ -3662,68 +4103,76 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_INT8
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_UINT16
       }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "Bitcast"
+  name: "BitwiseOr"
   input_arg {
-    name: "input"
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "type"
+    name: "z"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
+        type: DT_INT8
+        type: DT_INT16
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
         type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
+  is_commutative: true
+}
+op {
+  name: "BitwiseXor"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
-    name: "type"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
+        type: DT_INT8
+        type: DT_INT16
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
         type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
+  is_commutative: true
 }
 op {
   name: "BroadcastArgs"
@@ -4148,6 +4597,37 @@ op {
     }
   }
 }
+op {
+  name: "CompareAndBitpack"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "threshold"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Complex"
   input_arg {
@@ -4355,6 +4835,34 @@ op {
     }
   }
 }
+op {
+  name: "ConcatenateDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "another_dataset"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "ConditionalAccumulator"
   output_arg {
@@ -5160,6 +5668,30 @@ op {
     }
   }
 }
+op {
+  name: "Cosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "CountUpTo"
   input_arg {
@@ -5501,6 +6033,22 @@ op {
     }
   }
 }
+op {
+  name: "DebugGradientIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  allows_uninitialized_input: true
+}
 op {
   name: "DebugIdentity"
   input_arg {
@@ -8562,11 +9110,58 @@ op {
   is_stateful: true
 }
 op {
-  name: "FixedLengthRecordReader"
+  name: "FixedLengthRecordReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "record_bytes"
+    type: "int"
+  }
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "hop_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordReaderV2"
   output_arg {
     name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
+    type: DT_RESOURCE
   }
   attr {
     name: "header_bytes"
@@ -8586,13 +9181,6 @@ op {
       i: 0
     }
   }
-  attr {
-    name: "hop_bytes"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
   attr {
     name: "container"
     type: "string"
@@ -8633,6 +9221,13 @@ op {
       i: 0
     }
   }
+  attr {
+    name: "hop_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
   attr {
     name: "container"
     type: "string"
@@ -8694,6 +9289,13 @@ op {
       s: ""
     }
   }
+  attr {
+    name: "encoding"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
   is_stateful: true
 }
 op {
@@ -9577,6 +10179,49 @@ op {
     }
   }
 }
+op {
+  name: "GatherV2"
+  input_arg {
+    name: "params"
+    type_attr: "Tparams"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Taxis"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tparams"
+  }
+  attr {
+    name: "Tparams"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Taxis"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "GetSessionHandle"
   input_arg {
@@ -10096,6 +10741,30 @@ op {
     }
   }
 }
+op {
+  name: "IgnoreErrorsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Imag"
   input_arg {
@@ -10368,6 +11037,51 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "InterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Inv"
   input_arg {
@@ -10428,6 +11142,31 @@ op {
     version: 17
   }
 }
+op {
+  name: "Invert"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+}
 op {
   name: "InvertPermutation"
   input_arg {
@@ -10571,6 +11310,48 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "IteratorFromStringHandle"
+  input_arg {
+    name: "string_handle"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "resource_handle"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorFromStringHandle"
+  input_arg {
+    name: "string_handle"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "resource_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  is_stateful: true
+}
 op {
   name: "IteratorGetNext"
   input_arg {
@@ -10595,6 +11376,18 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "IteratorToStringHandle"
+  input_arg {
+    name: "resource_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "string_handle"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "L2Loss"
   input_arg {
@@ -11989,6 +12782,29 @@ op {
     }
   }
 }
+op {
+  name: "MatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "MatrixDiag"
   input_arg {
@@ -14778,6 +15594,42 @@ op {
     }
   }
 }
+op {
+  name: "PadV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  input_arg {
+    name: "constant_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "PaddedBatchDataset"
   input_arg {
@@ -15446,6 +16298,31 @@ op {
     }
   }
 }
+op {
+  name: "PopulationCount"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+}
 op {
   name: "Pow"
   input_arg {
@@ -15998,6 +16875,53 @@ op {
     }
   }
 }
+op {
+  name: "QuantizeAndDequantizeV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "QuantizeDownAndShrinkRange"
   input_arg {
@@ -17461,6 +18385,30 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "QueueIsClosed"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "is_closed"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "QueueIsClosedV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_closed"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
 op {
   name: "QueueSize"
   input_arg {
@@ -18798,6 +19746,31 @@ op {
     }
   }
 }
+op {
+  name: "RemoteFusedGraphExecute"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "Tinputs"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "Toutputs"
+  }
+  attr {
+    name: "Tinputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Toutputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "serialized_remote_fused_graph_execute_info"
+    type: "string"
+  }
+}
 op {
   name: "RepeatDataset"
   input_arg {
@@ -19638,6 +20611,75 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceApplyGradientDescent"
   input_arg {
@@ -20219,7 +21261,86 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyFtrl"
+  name: "ResourceSparseApplyFtrl"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyFtrlV2"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -20252,6 +21373,10 @@ op {
     name: "l2"
     type_attr: "T"
   }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
   input_arg {
     name: "lr_power"
     type_attr: "T"
@@ -21220,6 +22345,95 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "SampleDistortedBoundingBoxV2"
+  input_arg {
+    name: "image_size"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bounding_boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_object_covered"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "begin"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "size"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "bboxes"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "aspect_ratio_range"
+    type: "list(float)"
+    default_value {
+      list {
+        f: 0.75
+        f: 1.33
+      }
+    }
+  }
+  attr {
+    name: "area_range"
+    type: "list(float)"
+    default_value {
+      list {
+        f: 0.05
+        f: 1
+      }
+    }
+  }
+  attr {
+    name: "max_attempts"
+    type: "int"
+    default_value {
+      i: 100
+    }
+  }
+  attr {
+    name: "use_image_if_no_bounding_boxes"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Save"
   input_arg {
@@ -21670,6 +22884,57 @@ op {
     }
   }
 }
+op {
+  name: "ScatterNdNonAliasingAdd"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "ScatterNdSub"
   input_arg {
@@ -22681,6 +23946,30 @@ op {
     }
   }
 }
+op {
+  name: "Sinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Size"
   input_arg {
@@ -23579,51 +24868,137 @@ op {
   }
 }
 op {
-  name: "SparseApplyCenteredRMSProp"
+  name: "SparseApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyFtrl"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mg"
+    name: "accum"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "ms"
+    name: "linear"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mom"
+    name: "grad"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "rho"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "momentum"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "lr_power"
     type_attr: "T"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
   output_arg {
     name: "out"
     type_attr: "T"
@@ -23670,7 +25045,7 @@ op {
   }
 }
 op {
-  name: "SparseApplyFtrl"
+  name: "SparseApplyFtrlV2"
   input_arg {
     name: "var"
     type_attr: "T"
@@ -23706,6 +25081,10 @@ op {
     name: "l2"
     type_attr: "T"
   }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
   input_arg {
     name: "lr_power"
     type_attr: "T"
@@ -24522,6 +25901,108 @@ op {
     }
   }
 }
+op {
+  name: "SparseReduceMax"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceMaxSparse"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
 op {
   name: "SparseReduceSum"
   input_arg {
@@ -24909,6 +26390,45 @@ op {
     }
   }
 }
+op {
+  name: "SparseSlice"
+  input_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "start"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
 op {
   name: "SparseSoftmax"
   input_arg {
@@ -25647,6 +27167,14 @@ op {
     is_ref: true
   }
 }
+op {
+  name: "StackCloseV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
 op {
   name: "StackPop"
   input_arg {
@@ -25663,6 +27191,22 @@ op {
     type: "type"
   }
 }
+op {
+  name: "StackPopV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "elem"
+    type_attr: "elem_type"
+  }
+  attr {
+    name: "elem_type"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "StackPush"
   input_arg {
@@ -25690,6 +27234,56 @@ op {
     }
   }
 }
+op {
+  name: "StackPushV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "elem"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "swap_memory"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StackV2"
+  input_arg {
+    name: "max_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "elem_type"
+    type: "type"
+  }
+  attr {
+    name: "stack_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Stage"
   input_arg {
@@ -27861,6 +29455,27 @@ op {
     name: "filenames"
     type: DT_STRING
   }
+  input_arg {
+    name: "compression_type"
+    description: "A scalar containing either (i) the empty string (no\ncompression), (ii) \"ZLIB\", or (iii) \"GZIP\"."
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "TextLineDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
   output_arg {
     name: "handle"
     type: DT_RESOURCE
diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc
index f0fcd02835056d6153dd1f2ba40a0d50ec25f136..fe709405f45773bd1555d5ce069da2df80084434 100644
--- a/tensorflow/core/ops/data_flow_ops.cc
+++ b/tensorflow/core/ops/data_flow_ops.cc
@@ -24,6 +24,25 @@ using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
+namespace {
+
+Status DequeueManyV2Shape(InferenceContext* c, ShapeHandle n_shape) {
+  auto* t = c->input_handle_shapes_and_types(0);
+  if (t != nullptr && t->size() == c->num_outputs()) {
+    for (int i = 0; i < c->num_outputs(); ++i) {
+      ShapeHandle combined_shape;
+      TF_RETURN_IF_ERROR(
+          c->Concatenate(n_shape, (*t)[i].shape, &combined_shape));
+      c->set_output(i, combined_shape);
+    }
+    return Status::OK();
+  } else {
+    return shape_inference::UnknownShape(c);
+  }
+}
+
+}  // namespace
+
 // --------------------------------------------------------------------------
 
 REGISTER_OP("DynamicPartition")
@@ -711,7 +730,19 @@ REGISTER_OP("QueueDequeueManyV2")
     .Output("components: component_types")
     .Attr("component_types: list(type) >= 1")
     .Attr("timeout_ms: int = -1")
-    .SetShapeFn(shape_inference::UnknownShape)
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle n_shape;
+      if (c->input_tensor(1) == nullptr) {
+        n_shape = c->Vector(InferenceContext::kUnknownDim);
+      } else {
+        const int32 n = c->input_tensor(1)->scalar<int32>()();
+        if (n < 0) {
+          return errors::InvalidArgument("Input 'n' must be >= 0, but is ", n);
+        }
+        n_shape = c->Vector(n);
+      }
+      return DequeueManyV2Shape(c, n_shape);
+    })
     .Doc(R"doc(
 Dequeues `n` tuples of one or more tensors from the given queue.
 
@@ -781,7 +812,9 @@ REGISTER_OP("QueueDequeueUpToV2")
     .Output("components: component_types")
     .Attr("component_types: list(type) >= 1")
     .Attr("timeout_ms: int = -1")
-    .SetShapeFn(shape_inference::UnknownShape)
+    .SetShapeFn([](InferenceContext* c) {
+      return DequeueManyV2Shape(c, c->Vector(InferenceContext::kUnknownDim));
+    })
     .Doc(R"doc(
 Dequeues `n` tuples of one or more tensors from the given queue.
 
@@ -848,6 +881,32 @@ cancel_pending_enqueues: If true, all pending enqueue requests that are
   blocked on the given queue will be canceled.
 )doc");
 
+REGISTER_OP("QueueIsClosed")
+    .Input("handle: Ref(string)")
+    .Output("is_closed: bool")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Returns true if queue is closed.
+
+This operation returns true if the queue is closed and false if the queue
+is open.
+
+handle: The handle to a queue.
+)doc");
+
+REGISTER_OP("QueueIsClosedV2")
+    .Input("handle: resource")
+    .Output("is_closed: bool")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Returns true if queue is closed.
+
+This operation returns true if the queue is closed and false if the queue
+is open.
+
+handle: The handle to a queue.
+)doc");
+
 REGISTER_OP("QueueSize")
     .Input("handle: Ref(string)")
     .Output("size: int32")
@@ -1081,8 +1140,9 @@ dtype: The data type of accumulated gradients. Needs to correspond to the type
 
 // --------------------------------------------------------------------------
 
-REGISTER_OP("Stack")
-    .Output("handle: Ref(string)")
+REGISTER_OP("StackV2")
+    .Input("max_size: int32")
+    .Output("handle: resource")
     .Attr("elem_type: type")
     .Attr("stack_name: string = ''")
     .SetIsStateful()
@@ -1090,14 +1150,16 @@ REGISTER_OP("Stack")
     .Doc(R"doc(
 A stack that produces elements in first-in last-out order.
 
+max_size: The maximum size of the stack if non-negative. If negative, the stack
+  size is unlimited.
 handle: The handle to the stack.
 elem_type: The type of the elements on the stack.
 stack_name: Overrides the name used for the temporary stack resource. Default
 value is the name of the 'Stack' op (which is guaranteed unique).
 )doc");
 
-REGISTER_OP("StackPush")
-    .Input("handle: Ref(string)")
+REGISTER_OP("StackPushV2")
+    .Input("handle: resource")
     .Input("elem: T")
     .Output("output: T")
     .Attr("T: type")
@@ -1115,8 +1177,8 @@ output: The same tensor as the input 'elem'.
 swap_memory: Swap `elem` to CPU. Default to false.
 )doc");
 
-REGISTER_OP("StackPop")
-    .Input("handle: Ref(string)")
+REGISTER_OP("StackPopV2")
+    .Input("handle: resource")
     .Output("elem: elem_type")
     .Attr("elem_type: type")
     .SetShapeFn(shape_inference::UnknownShape)
@@ -1128,8 +1190,8 @@ elem: The tensor that is popped from the top of the stack.
 elem_type: The type of the elem that is popped.
 )doc");
 
-REGISTER_OP("StackClose")
-    .Input("handle: Ref(string)")
+REGISTER_OP("StackCloseV2")
+    .Input("handle: resource")
     .SetShapeFn(TwoElementVectorInputsAndScalarOutputs)
     .Doc(R"doc(
 Delete the stack from its resource container.
@@ -1137,6 +1199,48 @@ Delete the stack from its resource container.
 handle: The handle to a stack.
 )doc");
 
+// Deprecated ref-typed variants of stack.
+
+REGISTER_OP("Stack")
+    .Output("handle: Ref(string)")
+    .Attr("elem_type: type")
+    .Attr("stack_name: string = ''")
+    .SetIsStateful()
+    .SetShapeFn(TwoElementOutput)
+    .Doc(R"doc(
+Deprecated, use StackV2.
+)doc");
+
+REGISTER_OP("StackPush")
+    .Input("handle: Ref(string)")
+    .Input("elem: T")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("swap_memory: bool = false")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->input(1));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Deprecated, use StackPushV2.
+)doc");
+
+REGISTER_OP("StackPop")
+    .Input("handle: Ref(string)")
+    .Output("elem: elem_type")
+    .Attr("elem_type: type")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Deprecated, use StackPopV2.
+)doc");
+
+REGISTER_OP("StackClose")
+    .Input("handle: Ref(string)")
+    .SetShapeFn(TwoElementVectorInputsAndScalarOutputs)
+    .Doc(R"doc(
+Deprecated, use StackCloseV2.
+)doc");
+
 // --------------------------------------------------------------------------
 
 REGISTER_OP("TensorArrayV3")
diff --git a/tensorflow/core/ops/data_flow_ops_test.cc b/tensorflow/core/ops/data_flow_ops_test.cc
index 53c843eb60b64adb4e18be243b5b2948804df67a..6f59db3a1b4c59eaebdc972e8d6274ffe61b3392 100644
--- a/tensorflow/core/ops/data_flow_ops_test.cc
+++ b/tensorflow/core/ops/data_flow_ops_test.cc
@@ -149,4 +149,99 @@ TEST(DataFlowOpsTest, TensorArrayV3) {
   INFER_ERROR("Shape must be rank 0 but is rank 1", op, "[2]");
 }
 
+TEST(DataFlowOpsTest, QueueDequeueV2ShapeFn) {
+  ShapeInferenceTestOp op("QueueDequeueV2");
+  TF_ASSERT_OK(NodeDefBuilder("test", op.name)
+                   .Input("handle", 0, DT_RESOURCE)
+                   .Attr("component_types", {DT_FLOAT, DT_INT32})
+                   .Finalize(&op.node_def));
+
+  INFER_OK(op, "?", "?;?");
+
+  std::vector<ShapeInferenceTestOp::ShapeAndType> shapes_and_types;
+  op.input_resource_handle_shapes_and_types.push_back(&shapes_and_types);
+  INFER_OK(op, "?", "?;?");
+
+  // Wrong number of shapes provided by handle.
+  shapes_and_types.emplace_back("[1,?,3]", DT_FLOAT);
+  INFER_OK(op, "?", "?;?");
+
+  // Correct number of shapes provided by handle.
+  shapes_and_types.emplace_back("[?,2]", DT_FLOAT);
+  INFER_OK(op, "?", "[1,?,3];[?,2]");
+}
+
+TEST(DataFlowOpsTest, QueueDequeueManyV2ShapeFn) {
+  ShapeInferenceTestOp op("QueueDequeueManyV2");
+  TF_ASSERT_OK(NodeDefBuilder("test", op.name)
+                   .Input("handle", 0, DT_RESOURCE)
+                   .Input("n", 0, DT_INT32)
+                   .Attr("component_types", {DT_FLOAT, DT_INT32})
+                   .Finalize(&op.node_def));
+
+  ////////////////////////////
+  // Input n is not a constant.
+  INFER_OK(op, "?;?", "?;?");
+  std::vector<ShapeInferenceTestOp::ShapeAndType> shapes_and_types;
+  op.input_resource_handle_shapes_and_types.push_back(&shapes_and_types);
+  op.input_resource_handle_shapes_and_types.push_back(nullptr);
+  // Wrong number of shapes provided by handle.
+  shapes_and_types.emplace_back("[1,?,3]", DT_FLOAT);
+  INFER_OK(op, "?;?", "?;?");
+  // Correct number of shapes provided by handle.
+  shapes_and_types.emplace_back("[?,2]", DT_FLOAT);
+  INFER_OK(op, "?;?", "[?,1,?,3];[?,?,2]");
+
+  ////////////////////////////
+  // Input n is a constant. (set up test and repeat the cases from above).
+  Tensor n_tensor = test::AsScalar(12);
+  op.input_tensors.push_back(nullptr);
+  op.input_tensors.push_back(&n_tensor);
+  op.input_resource_handle_shapes_and_types.clear();
+  shapes_and_types.clear();
+
+  INFER_OK(op, "?;?", "?;?");
+  op.input_resource_handle_shapes_and_types.push_back(&shapes_and_types);
+  op.input_resource_handle_shapes_and_types.push_back(nullptr);
+  // Wrong number of shapes provided by handle.
+  shapes_and_types.emplace_back("[1,?,3]", DT_FLOAT);
+  INFER_OK(op, "?;?", "?;?");
+  // Correct number of shapes provided by handle.
+  shapes_and_types.emplace_back("[?,2]", DT_FLOAT);
+  INFER_OK(op, "?;?", "[12,1,?,3];[12,?,2]");
+
+  n_tensor = test::AsScalar<int32>(-1);  // invalid value of n.
+  INFER_ERROR("must be >= 0", op, "?;?");
+}
+
+TEST(DataFlowOpsTest, QueueDequeueUpToV2ShapeFn) {
+  // Results are the same regardless of what value is passed for n.
+  for (int pass = 0; pass < 2; ++pass) {
+    ShapeInferenceTestOp op("QueueDequeueUpToV2");
+    TF_ASSERT_OK(NodeDefBuilder("test", op.name)
+                     .Input("handle", 0, DT_RESOURCE)
+                     .Input("n", 0, DT_INT32)
+                     .Attr("component_types", {DT_FLOAT, DT_INT32})
+                     .Finalize(&op.node_def));
+
+    Tensor n_tensor = test::AsScalar(12);
+    if (pass == 1) {
+      // Second pass, pass value of <n> as a constant.
+      op.input_tensors.push_back(nullptr);
+      op.input_tensors.push_back(&n_tensor);
+    }
+
+    INFER_OK(op, "?;?", "?;?");
+    std::vector<ShapeInferenceTestOp::ShapeAndType> shapes_and_types;
+    op.input_resource_handle_shapes_and_types.push_back(&shapes_and_types);
+    op.input_resource_handle_shapes_and_types.push_back(nullptr);
+    // Wrong number of shapes provided by handle.
+    shapes_and_types.emplace_back("[1,?,3]", DT_FLOAT);
+    INFER_OK(op, "?;?", "?;?");
+    // Correct number of shapes provided by handle.
+    shapes_and_types.emplace_back("[?,2]", DT_FLOAT);
+    INFER_OK(op, "?;?", "[?,1,?,3];[?,?,2]");
+  }
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 4fb554ea16ced08b84cc5f563df15092ddf41222..3f2de6f06690d85e217b207b63e8f5506fdc6f21 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -75,6 +75,17 @@ REGISTER_OP("ZipDataset")
 Creates a dataset that zips together `input_datasets`.
 )doc");
 
+REGISTER_OP("ConcatenateDataset")
+    .Input("input_dataset: resource")
+    .Input("another_dataset: resource")
+    .Output("handle: resource")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that concatenates `input_dataset` with `another_dataset`.
+)doc");
+
 REGISTER_OP("RepeatDataset")
     .Input("input_dataset: resource")
     .Input("count: int64")
@@ -119,6 +130,16 @@ count: A scalar representing the number of elements from the `input_dataset`
   that should be skipped.  If count is -1, skips everything.
 )doc");
 
+REGISTER_OP("IgnoreErrorsDataset")
+    .Input("input_dataset: resource")
+    .Output("handle: resource")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that contains the elements of `input_dataset` ignoring errors.
+)doc");
+
 REGISTER_OP("MapDataset")
     .Input("input_dataset: resource")
     .Input("other_arguments: Targuments")
@@ -177,6 +198,31 @@ f: A function mapping elements of `input_dataset`, concatenated with
   `output_types` and `output_shapes`.
 )doc");
 
+REGISTER_OP("InterleaveDataset")
+    .Input("input_dataset: resource")
+    .Input("other_arguments: Targuments")
+    .Input("cycle_length: int64")
+    .Input("block_length: int64")
+    .Output("handle: resource")
+    .Attr("f: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that applies `f` to the outputs of `input_dataset`.
+
+Unlike MapDataset, the `f` in InterleaveDataset is expected to return
+a Dataset resource, and InterleaveDataset will flatten successive
+results into a single Dataset. Unlike FlatMapDataset,
+InterleaveDataset will interleave sequences of up to `block_length`
+consecutive elements from `cycle_length` input elements.
+
+f: A function mapping elements of `input_dataset`, concatenated with
+  `other_arguments`, to a Dataset resource that contains elements matching
+  `output_types` and `output_shapes`.
+)doc");
+
 REGISTER_OP("GroupByWindowDataset")
     .Input("input_dataset: resource")
     .Input("key_func_other_arguments: Tkey_func_other_arguments")
@@ -347,6 +393,7 @@ filename: A path on the filesystem where we should cache the dataset. Note: this
 
 REGISTER_OP("TextLineDataset")
     .Input("filenames: string")
+    .Input("compression_type: string")
     .Output("handle: resource")
     .SetShapeFn(shape_inference::ScalarShape)  // TODO(mrry): validate
                                                // that `filenames` is
@@ -357,6 +404,8 @@ Creates a dataset that emits the lines of one or more text files.
 
 filenames: A scalar or a vector containing the name(s) of the file(s) to be
   read.
+compression_type: A scalar containing either (i) the empty string (no
+  compression), (ii) "ZLIB", or (iii) "GZIP".
 )doc");
 
 REGISTER_OP("FixedLengthRecordDataset")
@@ -487,4 +536,32 @@ REGISTER_OP("IteratorDispose")
 Releases any resources used by the given iterator.
 )doc");
 
+REGISTER_OP("IteratorToStringHandle")
+    .Input("resource_handle: resource")
+    .Output("string_handle: string")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Converts the given `resource_handle` representing an iterator to a string.
+
+resource_handle: A handle to an iterator resource.
+string_handle: A string representation of the given handle.
+)doc");
+
+REGISTER_OP("IteratorFromStringHandle")
+    .Input("string_handle: string")
+    .Output("resource_handle: resource")
+    .Attr("output_types: list(type) >= 0 = []")
+    .Attr("output_shapes: list(shape) >= 0 = []")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Converts the given string representing a handle to an iterator to a resource.
+
+string_handle: A string representation of the given handle.
+resource_handle: A handle to an iterator resource.
+output_types: If specified, defines the type of each tuple component in an
+  element produced by the resulting iterator.
+output_shapes: If specified, defines the shape of each tuple component in an
+  element produced by the resulting iterator.
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/debug_ops.cc b/tensorflow/core/ops/debug_ops.cc
index f7a96b58da38c976256a93b435874ab648493e68..bd7f7c2c018000656a048c815702a90bf24f5426 100644
--- a/tensorflow/core/ops/debug_ops.cc
+++ b/tensorflow/core/ops/debug_ops.cc
@@ -14,7 +14,9 @@ limitations under the License.
 ==============================================================================*/
 // This file registers all TensorFlow Debugger (tfdbg) ops.
 
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
 
 namespace tensorflow {
 
@@ -88,6 +90,7 @@ REGISTER_OP("DebugIdentity")
     .Attr("debug_urls: list(string) = []")
     .Attr("gated_grpc: bool = false")
     .SetAllowsUninitializedInput()
+    .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Debug Identity Op.
 
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index 68f48630264f46c3f4693477fde5056a9d34ed52..0f8563a2dafe6b1127ae7b14d742030cb53e282e 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -82,9 +82,8 @@ Status DecodeImageShapeFn(InferenceContext* c) {
     channels_dim = c->MakeDim(channels);
   }
 
-  c->set_output(0,
-                c->MakeShape({InferenceContext::kUnknownDim,
-                              InferenceContext::kUnknownDim, channels_dim}));
+  c->set_output(0, c->MakeShape({InferenceContext::kUnknownDim,
+                                 InferenceContext::kUnknownDim, channels_dim}));
   return Status::OK();
 }
 
@@ -628,10 +627,9 @@ REGISTER_OP("DecodeGif")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
-      c->set_output(0,
-                    c->MakeShape({InferenceContext::kUnknownDim,
-                                  InferenceContext::kUnknownDim,
-                                  InferenceContext::kUnknownDim, 3}));
+      c->set_output(0, c->MakeShape({InferenceContext::kUnknownDim,
+                                     InferenceContext::kUnknownDim,
+                                     InferenceContext::kUnknownDim, 3}));
       return Status::OK();
     })
     .Doc(R"doc(
@@ -813,6 +811,98 @@ use_image_if_no_bounding_boxes: Controls behavior if no bounding boxes supplied.
   raise an error.
 )doc");
 
+REGISTER_OP("SampleDistortedBoundingBoxV2")
+  .Input("image_size: T")
+  .Input("bounding_boxes: float")
+  .Input("min_object_covered: float")
+  .Output("begin: T")
+  .Output("size: T")
+  .Output("bboxes: float")
+  .Attr("T: {uint8, int8, int16, int32, int64}")
+  .Attr("seed: int = 0")
+  .Attr("seed2: int = 0")
+  .Attr("aspect_ratio_range: list(float) = [0.75, 1.33]")
+  .Attr("area_range: list(float) = [0.05, 1.0]")
+  .Attr("max_attempts: int = 100")
+  .Attr("use_image_if_no_bounding_boxes: bool = false")
+  .SetIsStateful()
+  .SetShapeFn([](InferenceContext* c) {
+    c->set_output(0, c->Vector(3));
+    c->set_output(1, c->Vector(3));
+    c->set_output(2, c->MakeShape({1, 1, 4}));
+    return Status::OK();
+  })
+  .Doc(R"doc(
+Generate a single randomly distorted bounding box for an image.
+
+Bounding box annotations are often supplied in addition to ground-truth labels
+in image recognition or object localization tasks. A common technique for
+training such a system is to randomly distort an image while preserving
+its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+localization of an object, i.e. bounding box, given an `image_size`,
+`bounding_boxes` and a series of constraints.
+
+The output of this Op is a single bounding box that may be used to crop the
+original image. The output is returned as 3 tensors: `begin`, `size` and
+`bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+what the bounding box looks like.
+
+Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+height of the underlying image.
+
+For example,
+
+```python
+    # Generate a single distorted bounding box.
+    begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+        tf.shape(image),
+        bounding_boxes=bounding_boxes)
+
+    # Draw the bounding box in an image summary.
+    image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+                                                  bbox_for_draw)
+    tf.image_summary('images_with_box', image_with_box)
+
+    # Employ the bounding box to distort the image.
+    distorted_image = tf.slice(image, begin, size)
+```
+
+Note that if no bounding box information is available, setting
+`use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+false and no bounding boxes are supplied, an error is raised.
+
+image_size: 1-D, containing `[height, width, channels]`.
+bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+  associated with the image.
+min_object_covered: The cropped area of the image must contain at least this
+  fraction of any bounding box supplied. The value of this parameter should be
+  non-negative. In the case of 0, the cropped area does not need to overlap
+  any of the bounding boxes supplied.
+begin: 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+  `tf.slice`.
+size: 1-D, containing `[target_height, target_width, -1]`. Provide as input to
+  `tf.slice`.
+bboxes: 3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+  Provide as input to `tf.image.draw_bounding_boxes`.
+seed: If either `seed` or `seed2` are set to non-zero, the random number
+  generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+  seed.
+seed2: A second seed to avoid seed collision.
+aspect_ratio_range: The cropped area of the image must have an aspect ratio =
+  width / height within this range.
+area_range: The cropped area of the image must contain a fraction of the
+  supplied image within in this range.
+max_attempts: Number of attempts at generating a cropped region of the image
+  of the specified constraints. After `max_attempts` failures, return the entire
+  image.
+use_image_if_no_bounding_boxes: Controls behavior if no bounding boxes supplied.
+  If true, assume an implicit bounding box covering the whole input. If false,
+  raise an error.
+)doc");
+
 // --------------------------------------------------------------------------
 
 // glimpse = extract_glimpse(input, size, offsets) extract the glimpse
diff --git a/tensorflow/core/ops/io_ops.cc b/tensorflow/core/ops/io_ops.cc
index fa12816c92c71f1ee7664f665648e8a7cf856a62..de287334b8128106b6d241eeb32adb5495b3a1e9 100644
--- a/tensorflow/core/ops/io_ops.cc
+++ b/tensorflow/core/ops/io_ops.cc
@@ -109,8 +109,7 @@ REGISTER_OP("RestoreV2")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &shape1));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &shape2));
       TF_RETURN_IF_ERROR(c->Merge(shape1, shape2, &shape0));
-      c->set_output(0, c->UnknownShape());
-      return Status::OK();
+      return UnknownShape(c);
     })
     .Doc(R"doc(
 Restores tensors from a V2 checkpoint.
@@ -475,6 +474,7 @@ REGISTER_OP("FixedLengthRecordReaderV2")
     .Attr("hop_bytes: int = 0")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
+    .Attr("encoding: string = ''")
     .SetIsStateful()
     .SetShapeFn(shape_inference::ScalarShape)
     .Doc(R"doc(
@@ -490,6 +490,8 @@ container: If non-empty, this reader is placed in the given container.
         Otherwise, a default container is used.
 shared_name: If non-empty, this reader is named in the given bucket
              with this shared_name. Otherwise, the node name is used instead.
+encoding: The type of encoding for the file. Currently ZLIB and GZIP
+        are supported. Defaults to none.
 )doc");
 
 // TODO(cwhipkey): mark this deprecated in favor of V2.
@@ -837,7 +839,8 @@ REGISTER_OP("WriteFile")
       return Status::OK();
     })
     .Doc(R"doc(
-Writes contents to the file at input filename. Creates file if not existing.
+Writes contents to the file at input filename. Creates file and recursively
+creates directory if not existing.
 
 filename: scalar. The name of the file to which we write the contents.
 contents: scalar. The content to be written to the output file.
diff --git a/tensorflow/core/ops/io_ops_test.cc b/tensorflow/core/ops/io_ops_test.cc
index a915cdbe12c64fa98e9a3fe2f697800d0ae49bf7..785fb96c641ea376e1f3c858e89b59a35a86af4f 100644
--- a/tensorflow/core/ops/io_ops_test.cc
+++ b/tensorflow/core/ops/io_ops_test.cc
@@ -79,6 +79,26 @@ TEST(IoOpsTest, Restore_ShapeFn) {
   INFER_ERROR("Shape must be rank 0 but is rank 1", op, "[];[?]");
 }
 
+TEST(IoOpsTest, RestoreV2_ShapeFn) {
+  ShapeInferenceTestOp op("RestoreV2");
+
+  TF_ASSERT_OK(NodeDefBuilder("test", op.name)
+                   .Input({"prefix", 0, DT_STRING})
+                   .Input({"tensor_names", 0, DT_STRING})
+                   .Input({"shapes_and_slices", 0, DT_STRING})
+                   .Attr("dtypes", {DT_FLOAT, DT_INT64})
+                   .Finalize(&op.node_def));
+
+  INFER_OK(op, "?;?;?", "?;?");
+  INFER_OK(op, "[];[10];[10]", "?;?");
+
+  // Input shape validation.
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "[?];[?];[?]");
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "[];[?,?];[?]");
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "[];[?];[?,?]");
+  INFER_ERROR("in both shapes must be equal", op, "[];[10];[20]");
+}
+
 TEST(IoOpsTest, RestoreSlice_ShapeFn) {
   ShapeInferenceTestOp op("RestoreSlice");
 
diff --git a/tensorflow/core/ops/linalg_ops.cc b/tensorflow/core/ops/linalg_ops.cc
index 6e1f2dc0529261b4128b8c5b8b307847a76bebaf..b0f95c91fdf6a83227f48f771d1e33a093f3f5d4 100644
--- a/tensorflow/core/ops/linalg_ops.cc
+++ b/tensorflow/core/ops/linalg_ops.cc
@@ -189,7 +189,7 @@ Status SvdShapeFn(InferenceContext* c) {
 REGISTER_OP("MatrixDeterminant")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {float, double, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input;
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 2, &input));
@@ -560,7 +560,7 @@ REGISTER_OP("BatchSelfAdjointEig")
 REGISTER_OP("BatchMatrixDeterminant")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {float, double, complex64, complex128}")
     .Deprecated(13, "Use MatrixDeterminant instead.");
 
 REGISTER_OP("BatchMatrixInverse")
diff --git a/tensorflow/core/ops/math_grad.cc b/tensorflow/core/ops/math_grad.cc
index 9a58a31757f65ddfb3d36dbbe5f66b6ba83bdecb..5e082ce8f507119caa8b35e94a42e5c00fbcd6fe 100644
--- a/tensorflow/core/ops/math_grad.cc
+++ b/tensorflow/core/ops/math_grad.cc
@@ -189,6 +189,42 @@ Status TanhGrad(const AttrSlice& attrs, FunctionDef* g) {
 }
 REGISTER_OP_GRADIENT("Tanh", TanhGrad);
 
+Status AsinhGrad(const AttrSlice& attrs, FunctionDef* g) {
+  // clang-format off
+  return GradForUnaryCwise(g, {
+      {{"y"}, "Asinh", {"x"}},
+      {{"cosh"}, "Cosh", {"y"}},
+      {{"dx"}, "Mul", {"dy", "cosh"}},  // dy * cosh(y)
+  });
+  // clang-format on
+}
+REGISTER_OP_GRADIENT("Asinh", AsinhGrad);
+
+Status AcoshGrad(const AttrSlice& attrs, FunctionDef* g) {
+  // clang-format off
+  return GradForUnaryCwise(g, {
+      {{"y"}, "Acosh", {"x"}},
+      {{"sinh"}, "Sinh", {"y"}},
+      {{"dx"}, "Mul", {"dy", "sinh"}},  // dy * sinh(y)
+  });
+  // clang-format on
+}
+REGISTER_OP_GRADIENT("Acosh", AcoshGrad);
+
+Status AtanhGrad(const AttrSlice& attrs, FunctionDef* g) {
+  // clang-format off
+  return GradForUnaryCwise(g, {
+    {{"x2"}, "Square", {"x"}},
+    FDH::Const("const", 1.0f),
+    {{"one"}, "Cast", {"const"}, {{"SrcT", DT_FLOAT}, {"DstT", "$T"}}},
+    {{"a"}, "Sub", {"one", "x2"}}, // 1 - x^2
+    {{"inv"}, "Reciprocal", {"a"}},
+    {{"dx"}, "Mul", {"dy", "inv"}}
+  });
+  // clang-format on
+}
+REGISTER_OP_GRADIENT("Atanh", AtanhGrad);
+
 Status SigmoidGrad(const AttrSlice& attrs, FunctionDef* g) {
   // clang-format off
   return GradForUnaryCwise(g, {
diff --git a/tensorflow/core/ops/math_grad_test.cc b/tensorflow/core/ops/math_grad_test.cc
index aa9706a3286f34fd4d0ca4986b4817d2481cd4af..1393bffb914acec5ad2826445947dc16d3094e96 100644
--- a/tensorflow/core/ops/math_grad_test.cc
+++ b/tensorflow/core/ops/math_grad_test.cc
@@ -528,6 +528,44 @@ TEST_F(MathGradTest, Tanh) {
   test::ExpectClose(ans, dx);
 }
 
+TEST_F(MathGradTest, Asinh) {
+  auto x = test::AsTensor<float>({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f},
+                                 TensorShape({2, 3}));
+  auto g = [](float x) {
+    auto y = std::asinh(x);
+    return std::cosh(y);
+  };
+  auto dx = test::AsTensor<float>(
+      {g(-3.f), g(-2.f), g(-1.f), g(1.f), g(2.f), g(3.f)}, TensorShape({2, 3}));
+  auto ans = SymGrad("Asinh", x);
+  test::ExpectClose(ans, dx);
+}
+
+TEST_F(MathGradTest, Acosh) {
+  auto x = test::AsTensor<float>({6.f, 5.f, 4.f, 1.f, 2.f, 3.f},
+                                 TensorShape({2, 3}));
+  auto g = [](float x) {
+    auto y = std::acosh(x);
+    return std::sinh(y);
+  };
+  auto dx = test::AsTensor<float>(
+      {g(6.f), g(5.f), g(4.f), g(1.f), g(2.f), g(3.f)}, TensorShape({2, 3}));
+  auto ans = SymGrad("Acosh", x);
+  test::ExpectClose(ans, dx);
+}
+
+TEST_F(MathGradTest, Atanh) {
+  auto x = test::AsTensor<float>({-0.3f, -0.2f, -0.1f, 0.1f, 0.2f, 0.3f},
+                                 TensorShape({2, 3}));
+  auto g = [](float x) {
+    return 1.f / (1.f - x * x);
+  };
+  auto dx = test::AsTensor<float>(
+      {g(-0.3f), g(-0.2f), g(-0.1f), g(0.1f), g(0.2f), g(0.3f)}, TensorShape({2, 3}));
+  auto ans = SymGrad("Atanh", x);
+  test::ExpectClose(ans, dx);
+}
+
 TEST_F(MathGradTest, Sigmoid) {
   auto x = test::AsTensor<float>({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f},
                                  TensorShape({2, 3}));
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 8b9dd3953605648e2ee688588d153858cf3b90e9..36f999ff607c91f7c4b16c7862c137b528b1c5ce 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -305,6 +305,18 @@ REGISTER_OP("Tanh").UNARY_COMPLEX().Doc(R"doc(
 Computes hyperbolic tangent of `x` element-wise.
 )doc");
 
+REGISTER_OP("Asinh").UNARY_COMPLEX().Doc(R"doc(
+Computes inverse hyperbolic sine of x element-wise.
+)doc");
+
+REGISTER_OP("Acosh").UNARY_COMPLEX().Doc(R"doc(
+Computes inverse hyperbolic cosine of x element-wise.
+)doc");
+
+REGISTER_OP("Atanh").UNARY_COMPLEX().Doc(R"doc(
+Computes inverse hyperbolic tangent of x element-wise.
+)doc");
+
 REGISTER_OP("TanhGrad").UNARY_GRADIENT_COMPLEX().Doc(R"doc(
 Computes the gradient for the tanh of `x` wrt its input.
 
@@ -1134,7 +1146,8 @@ Reduces `input` along the dimensions given in `reduction_indices`. Unless
 retained with length 1.
 
 input: The tensor to reduce.
-reduction_indices: The dimensions to reduce.
+reduction_indices: The dimensions to reduce. Must be in the range
+  `[-rank(input), rank(input))`.
 keep_dims: If true, retain reduced dimensions with length 1.
 output: The reduced tensor.
 )doc");
@@ -1156,7 +1169,8 @@ Reduces `input` along the dimensions given in `reduction_indices`. Unless
 retained with length 1.
 
 input: The tensor to reduce.
-reduction_indices: The dimensions to reduce.
+reduction_indices: The dimensions to reduce. Must be in the range
+  `[-rank(input), rank(input))`.
 keep_dims: If true, retain reduced dimensions with length 1.
 output: The reduced tensor.
 )doc");
@@ -1178,7 +1192,8 @@ Reduces `input` along the dimensions given in `reduction_indices`. Unless
 retained with length 1.
 
 input: The tensor to reduce.
-reduction_indices: The dimensions to reduce.
+reduction_indices: The dimensions to reduce. Must be in the range
+  `[-rank(input), rank(input))`.
 keep_dims: If true, retain reduced dimensions with length 1.
 output: The reduced tensor.
 )doc");
@@ -1200,7 +1215,8 @@ Reduces `input` along the dimensions given in `reduction_indices`. Unless
 retained with length 1.
 
 input: The tensor to reduce.
-reduction_indices: The dimensions to reduce.
+reduction_indices: The dimensions to reduce. Must be in the range
+  `[-rank(input), rank(input))`.
 keep_dims: If true, retain reduced dimensions with length 1.
 output: The reduced tensor.
 )doc");
@@ -1222,7 +1238,8 @@ Reduces `input` along the dimensions given in `reduction_indices`. Unless
 retained with length 1.
 
 input: The tensor to reduce.
-reduction_indices: The dimensions to reduce.
+reduction_indices: The dimensions to reduce. Must be in the range
+  `[-rank(input), rank(input))`.
 keep_dims: If true, retain reduced dimensions with length 1.
 output: The reduced tensor.
 )doc");
@@ -1289,33 +1306,37 @@ Status ArgOpShape(shape_inference::InferenceContext* c) {
 REGISTER_OP("ArgMax")
     .Input("input: T")
     .Input("dimension: Tidx")
-    .Output("output: int64")
+    .Output("output: output_type")
     .Attr("T: numbertype")
     .Attr("Tidx: {int32, int64} = DT_INT32")
+    .Attr("output_type: {int32, int64} = DT_INT64")
     .SetShapeFn(ArgOpShape)
     .Doc(R"doc(
 Returns the index with the largest value across dimensions of a tensor.
 
 Note that in case of ties the identity of the return value is not guaranteed.
 
-dimension: int32, 0 <= dimension < rank(input).  Describes which dimension
-  of the input Tensor to reduce across. For vectors, use dimension = 0.
+dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+  Describes which dimension of the input Tensor to reduce across. For vectors,
+  use dimension = 0.
 )doc");
 
 REGISTER_OP("ArgMin")
     .Input("input: T")
     .Input("dimension: Tidx")
-    .Output("output: int64")
+    .Output("output: output_type")
     .Attr("T: numbertype")
     .Attr("Tidx: {int32, int64} = DT_INT32")
+    .Attr("output_type: {int32, int64} = DT_INT64")
     .SetShapeFn(ArgOpShape)
     .Doc(R"doc(
 Returns the index with the smallest value across dimensions of a tensor.
 
 Note that in case of ties the identity of the return value is not guaranteed.
 
-dimension: int32, 0 <= dimension < rank(input).  Describes which dimension
-  of the input Tensor to reduce across. For vectors, use dimension = 0.
+dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+  Describes which dimension of the input Tensor to reduce across. For vectors,
+  use dimension = 0.
 )doc");
 
 namespace {
@@ -1642,7 +1663,7 @@ If the maximum is empty for a given segment ID `i`, it outputs the smallest poss
  `output[i] = numeric_limits<T>::min()`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
 </div>
 
 segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
@@ -1809,7 +1830,8 @@ Reduces `input` along the dimensions given in `reduction_indices`. Unless
 retained with length 1.
 
 input: The tensor to reduce.
-reduction_indices: The dimensions to reduce.
+reduction_indices: The dimensions to reduce. Must be in the range
+  `[-rank(input), rank(input))`.
 keep_dims: If true, retain reduced dimensions with length 1.
 output: The reduced tensor.
 )doc");
@@ -1830,7 +1852,8 @@ Reduces `input` along the dimensions given in `reduction_indices`. Unless
 retained with length 1.
 
 input: The tensor to reduce.
-reduction_indices: The dimensions to reduce.
+reduction_indices: The dimensions to reduce. Must be in the range
+  `[-rank(input), rank(input))`.
 keep_dims: If true, retain reduced dimensions with length 1.
 output: The reduced tensor.
 )doc");
@@ -2162,6 +2185,14 @@ The `reverse` and `exclusive` kwargs can also be combined:
 ```python
 tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
 ```
+
+x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+  `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+  `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+  `[-rank(x), rank(x))`.
+exclusive: If `True`, perform exclusive cumsum.
+reverse: A `bool` (default: False).
 )doc");
 
 REGISTER_OP("Cumprod")
@@ -2204,6 +2235,14 @@ The `reverse` and `exclusive` kwargs can also be combined:
 ```python
 tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
 ```
+
+x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+  `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+  `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+  `[-rank(x), rank(x))`.
+exclusive: If `True`, perform exclusive cumprod.
+reverse: A `bool` (default: False).
 )doc");
 
 REGISTER_OP("QuantizedMatMul")
@@ -2271,7 +2310,12 @@ REGISTER_OP("QuantizedMul")
     .Attr("T2: quantizedtype")
     .Attr("Toutput: quantizedtype = DT_QINT32")
     .SetIsCommutative()
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::BroadcastBinaryOpShapeFn(c));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    })
     .Doc(R"doc(
 Returns x * y element-wise, working on quantized buffers.
 
@@ -2300,7 +2344,12 @@ REGISTER_OP("QuantizedAdd")
     .Attr("T2: quantizedtype")
     .Attr("Toutput: quantizedtype = DT_QINT32")
     .SetIsCommutative()
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::BroadcastBinaryOpShapeFn(c));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    })
     .Doc(R"doc(
 Returns x + y element-wise, working on quantized buffers.
 
@@ -2409,6 +2458,64 @@ out_type: The type of the output. Should be a lower bit depth than Tinput.
 
 )doc");
 
+REGISTER_OP("CompareAndBitpack")
+    .Input("input: T")
+    .Input("threshold: T")
+    .Output("output: uint8")
+    .Attr("T: {bool, float16, float32, float64, int8, int16, int32, int64}")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input;
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &input));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      ShapeHandle output = input;
+      if (c->RankKnown(input)) {
+        int rank = c->Rank(input);
+        auto inner_dim = c->Dim(input, rank - 1);
+        DimensionHandle inferred_dim;
+        TF_RETURN_IF_ERROR(c->Divide(inner_dim, 8,
+                                     /* evenly_divisible */ true,
+                                     &inferred_dim));
+        TF_RETURN_IF_ERROR(
+            c->ReplaceDim(output, rank - 1, inferred_dim, &output));
+      }
+      c->set_output(0, output);
+
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
+
+Each comparison returns a boolean `true` (if `input_value > threshold`)
+or and `false` otherwise.
+
+This operation is useful for Locality-Sensitive-Hashing (LSH) and other
+algorithms that use hashing approximations of cosine and `L2` distances;
+codes can be generated from an input via:
+
+```python
+codebook_size = 50
+codebook_bits = codebook_size * 32
+codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
+                           dtype=x.dtype,
+                           initializer=tf.orthogonal_initializer())
+codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
+codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
+# now codes has shape x.shape[:-1] + [codebook_size]
+```
+
+**NOTE**: Currently, the innermost dimension of the tensor must be divisible
+by 8.
+
+Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
+a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
+
+input: Values to compare against `threshold` and bitpack.
+threshold: Threshold to compare against.
+T: The type of the input and threshold.
+output: The bitpacked comparisons.
+)doc");
+
 REGISTER_OP("RequantizationRange")
     .Input("input: Tinput")
     .Input("input_min: float")
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index c10e667f564ee690c81b9db565236af2dea3ab31..28f9969de56c93556f4746acae1a2887c27b5b98 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference_testutil.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 70302c38861f6ce26677eb400fd11589f3014d5b..10187425214e8cb22d13366f83c55bed5a5fc351 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -831,11 +831,13 @@ a different filter to each input channel (expanding from 1 channel to
 `channel_multiplier` channels for each), then concatenates the results
 together. Thus, the output has `in_channels * channel_multiplier` channels.
 
+```
 for k in 0..in_channels-1
   for q in 0..channel_multiplier-1
     output[b, i, j, k * channel_multiplier + q] =
       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
                         filter[di, dj, k, q]
+```
 
 Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
 horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
@@ -1777,6 +1779,33 @@ backprops: The gradients: `gradients * (outputs + 1)` if outputs < 0,
 `gradients` otherwise.
 )doc");
 
+REGISTER_OP("Selu")
+    .Input("features: T")
+    .Output("activations: T")
+    .Attr("T: {half, float, double}")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
+if < 0, `scale * features` otherwise.
+
+See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+)doc");
+
+REGISTER_OP("SeluGrad")
+    .Input("gradients: T")
+    .Input("outputs: T")
+    .Output("backprops: T")
+    .Attr("T: {half, float, double}")
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
+    .Doc(R"doc(
+Computes gradients for the scaled exponential linear (Selu) operation.
+
+gradients: The backpropagated gradients to the corresponding Selu operation.
+outputs: The outputs of the corresponding Selu operation.
+backprops: The gradients: `gradients * (outputs + scale * alpha)`
+if outputs < 0, `scale * gradients` otherwise.
+)doc");
+
 REGISTER_OP("Softplus")
     .Input("features: T")
     .Output("activations: T")
@@ -1977,6 +2006,49 @@ precision: Computed Precision at `k` as a `bool Tensor`.
 
 )doc");
 
+// This is the same as `InTopK`, but takes `k` as in input rather than an attr.
+REGISTER_OP("InTopKV2")
+    .Input("predictions: float")
+    .Input("targets: T")
+    .Input("k: T")
+    .Output("precision: bool")
+    .Attr("T: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle predictions;
+      ShapeHandle targets;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &predictions));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &targets));
+      DimensionHandle batch_size;
+      TF_RETURN_IF_ERROR(
+          c->Merge(c->Dim(predictions, 0), c->Dim(targets, 0), &batch_size));
+      c->set_output(0, c->Vector(batch_size));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Says whether the targets are in the top `K` predictions.
+
+This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+prediction for the target class is among the top `k` predictions among
+all predictions for example `i`. Note that the behavior of `InTopK` differs
+from the `TopK` op in its handling of ties; if multiple classes have the
+same prediction value and straddle the top-`k` boundary, all of those
+classes are considered to be in the top `k`.
+
+More formally, let
+
+  \\(predictions_i\\) be the predictions for all classes for example `i`,
+  \\(targets_i\\) be the target class for example `i`,
+  \\(out_i\\) be the output for example `i`,
+
+$$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+
+predictions: A `batch_size` x `classes` tensor.
+targets: A `batch_size` vector of class ids.
+k: Number of top elements to look at for computing precision.
+precision: Computed precision at `k` as a `bool Tensor`.
+
+)doc");
+
 namespace {
 
 Status TopKShapeFn(InferenceContext* c) {
diff --git a/tensorflow/core/ops/nn_ops_test.cc b/tensorflow/core/ops/nn_ops_test.cc
index a60b1c37880c24f0a4f33e9a2fe3c914cb32aacf..51e4f8bffe05ea11d08f4807b7b49c5c7d2d9ed7 100644
--- a/tensorflow/core/ops/nn_ops_test.cc
+++ b/tensorflow/core/ops/nn_ops_test.cc
@@ -412,7 +412,8 @@ TEST(NNOpsTest, Dilation2DBackpropFilter_ShapeFn) {
 
 TEST(NNOpsTest, MergeBothInputs_ShapeFn) {
   for (const char* op_name :
-       {"ReluGrad", "Relu6Grad", "EluGrad", "SoftplusGrad", "SoftsignGrad"}) {
+       {"ReluGrad", "Relu6Grad", "EluGrad", "SeluGrad", "SoftplusGrad",
+        "SoftsignGrad"}) {
     ShapeInferenceTestOp op(op_name);
 
     INFER_OK(op, "?;?", "in0|in1");
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 37d4379d48d595c81d9b0ac7c6f1cb0839eed120..2839575ec7276e562fd833e9834bc4959ffd4a4d 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -190,6 +190,31 @@ op {
   }
   summary: "Computes acos of x element-wise."
 }
+op {
+  name: "Acosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  summary: "Computes inverse hyperbolic cosine of x element-wise."
+}
 op {
   name: "Add"
   input_arg {
@@ -474,7 +499,7 @@ op {
   }
   input_arg {
     name: "reduction_indices"
-    description: "The dimensions to reduce."
+    description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`."
     type_attr: "Tidx"
   }
   output_arg {
@@ -576,7 +601,7 @@ op {
   }
   input_arg {
     name: "reduction_indices"
-    description: "The dimensions to reduce."
+    description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`."
     type_attr: "Tidx"
   }
   output_arg {
@@ -1169,6 +1194,94 @@ op {
   summary: "Update \'*var\' according to the Ftrl-proximal scheme."
   description: "accum_new = accum + grad * grad\nlinear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
 }
+op {
+  name: "ApplyFtrlV2"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    description: "L1 regulariation. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    description: "L2 shrinkage regulariation. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    description: "Same as \"var\"."
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
+  }
+  summary: "Update \'*var\' according to the Ftrl-proximal scheme."
+  description: "grad_with_shrinkage = grad + 2 * l2_shrinkage * var\naccum_new = accum + grad_with_shrinkage * grad_with_shrinkage\nlinear += grad_with_shrinkage +\n    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
+}
 op {
   name: "ApplyGradientDescent"
   input_arg {
@@ -1578,12 +1691,12 @@ op {
   }
   input_arg {
     name: "dimension"
-    description: "int32, 0 <= dimension < rank(input).  Describes which dimension\nof the input Tensor to reduce across. For vectors, use dimension = 0."
+    description: "int32 or int64, must be in the range `[-rank(input), rank(input))`.\nDescribes which dimension of the input Tensor to reduce across. For vectors,\nuse dimension = 0."
     type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    type: DT_INT64
+    type_attr: "output_type"
   }
   attr {
     name: "T"
@@ -1620,6 +1733,19 @@ op {
       }
     }
   }
+  attr {
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   summary: "Returns the index with the largest value across dimensions of a tensor."
   description: "Note that in case of ties the identity of the return value is not guaranteed."
 }
@@ -1631,12 +1757,12 @@ op {
   }
   input_arg {
     name: "dimension"
-    description: "int32, 0 <= dimension < rank(input).  Describes which dimension\nof the input Tensor to reduce across. For vectors, use dimension = 0."
+    description: "int32 or int64, must be in the range `[-rank(input), rank(input))`.\nDescribes which dimension of the input Tensor to reduce across. For vectors,\nuse dimension = 0."
     type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    type: DT_INT64
+    type_attr: "output_type"
   }
   attr {
     name: "T"
@@ -1673,6 +1799,19 @@ op {
       }
     }
   }
+  attr {
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   summary: "Returns the index with the smallest value across dimensions of a tensor."
   description: "Note that in case of ties the identity of the return value is not guaranteed."
 }
@@ -1771,6 +1910,31 @@ op {
   }
   summary: "Computes asin of x element-wise."
 }
+op {
+  name: "Asinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  summary: "Computes inverse hyperbolic sine of x element-wise."
+}
 op {
   name: "Assert"
   input_arg {
@@ -2002,6 +2166,31 @@ op {
   summary: "Computes arctangent of `y/x` element-wise, respecting signs of the arguments."
   description: "This is the angle \\( \\theta \\in [-\\pi, \\pi] \\) such that\n\\[ x = r \\cos(\\theta) \\]\nand\n\\[ y = r \\sin(\\theta) \\]\nwhere \\(r = \\sqrt(x^2 + y^2) \\)."
 }
+op {
+  name: "Atanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  summary: "Computes inverse hyperbolic tangent of x element-wise."
+}
 op {
   name: "AudioSpectrogram"
   input_arg {
@@ -2835,6 +3024,8 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
@@ -3661,12 +3852,14 @@ op {
         type: DT_INT32
         type: DT_UINT8
         type: DT_UINT16
-        type: DT_INT16
         type: DT_INT8
+        type: DT_INT16
         type: DT_COMPLEX64
         type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
         type: DT_QINT32
         type: DT_HALF
       }
@@ -3683,12 +3876,14 @@ op {
         type: DT_INT32
         type: DT_UINT8
         type: DT_UINT16
-        type: DT_INT16
         type: DT_INT8
+        type: DT_INT16
         type: DT_COMPLEX64
         type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
         type: DT_QINT32
         type: DT_HALF
       }
@@ -3698,103 +3893,199 @@ op {
   description: "Given a tensor `input`, this operation returns a tensor that has the same buffer\ndata as `input` with datatype `type`.\n\nIf the input datatype `T` is larger than the output datatype `type` then the\nshape changes from [...] to [..., sizeof(`T`)/sizeof(`type`)].\n\nIf `T` is smaller than `type`, the operator requires that the rightmost\ndimension be equal to sizeof(`type`)/sizeof(`T`). The shape then goes from\n[..., sizeof(`type`)/sizeof(`T`)] to [...].\n\n*NOTE*: Bitcast is implemented as a low-level cast, so machines with different\nendian orderings will give different results."
 }
 op {
-  name: "BroadcastArgs"
+  name: "BitwiseAnd"
   input_arg {
-    name: "s0"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "s1"
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "r0"
+    name: "z"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_INT8
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
       }
     }
   }
-  summary: "Return the shape of s0 op s1 with broadcast."
-  description: "Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the\nbroadcasted shape. `s0`, `s1` and `r0` are all integer vectors."
+  summary: "Elementwise computes the bitwise AND of `x` and `y`."
+  description: "The result will have those bits set, that are set in both `x` and `y`. The\ncomputation is performed on the underlying representations of `x` and `y`."
+  is_commutative: true
 }
 op {
-  name: "BroadcastGradientArgs"
+  name: "BitwiseOr"
   input_arg {
-    name: "s0"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "s1"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "r0"
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "r1"
+    name: "z"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_INT8
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
       }
     }
   }
-  summary: "Return the reduction indices for computing gradients of s0 op s1 with broadcast."
-  description: "This is typically used by gradient computations for a broadcasting operation."
+  summary: "Elementwise computes the bitwise OR of `x` and `y`."
+  description: "The result will have those bits set, that are set in `x`, `y` or both. The\ncomputation is performed on the underlying representations of `x` and `y`."
+  is_commutative: true
 }
 op {
-  name: "Bucketize"
+  name: "BitwiseXor"
   input_arg {
-    name: "input"
-    description: "Any shape of Tensor contains with int or float type."
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    description: "Same shape with \'input\', each value of input replaced with bucket index.\n\n@compatibility(numpy)\nEquivalent to np.digitize.\n@end_compatibility"
-    type: DT_INT32
+    name: "z"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_INT8
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_UINT16
       }
     }
   }
-  attr {
-    name: "boundaries"
-    type: "list(float)"
-    description: "A sorted list of floats gives the boundary of the buckets."
-  }
-  summary: "Bucketizes \'input\' based on \'boundaries\'."
-  description: "For example, if the inputs are\n    boundaries = [0, 10, 100]\n    input = [[-5, 10000]\n             [150,   10]\n             [5,    100]]\n\nthen the output will be\n    output = [[0, 3]\n              [3, 2]\n              [1, 3]]"
+  summary: "Elementwise computes the bitwise XOR of `x` and `y`."
+  description: "The result will have those bits set, that are different in `x` and `y`. The\ncomputation is performed on the underlying representations of `x` and `y`."
+  is_commutative: true
 }
 op {
-  name: "CTCBeamSearchDecoder"
+  name: "BroadcastArgs"
+  input_arg {
+    name: "s0"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "s1"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r0"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "Return the shape of s0 op s1 with broadcast."
+  description: "Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the\nbroadcasted shape. `s0`, `s1` and `r0` are all integer vectors."
+}
+op {
+  name: "BroadcastGradientArgs"
+  input_arg {
+    name: "s0"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "s1"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r0"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r1"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "Return the reduction indices for computing gradients of s0 op s1 with broadcast."
+  description: "This is typically used by gradient computations for a broadcasting operation."
+}
+op {
+  name: "Bucketize"
+  input_arg {
+    name: "input"
+    description: "Any shape of Tensor contains with int or float type."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    description: "Same shape with \'input\', each value of input replaced with bucket index.\n\n@compatibility(numpy)\nEquivalent to np.digitize.\n@end_compatibility"
+    type: DT_INT32
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "boundaries"
+    type: "list(float)"
+    description: "A sorted list of floats gives the boundary of the buckets."
+  }
+  summary: "Bucketizes \'input\' based on \'boundaries\'."
+  description: "For example, if the inputs are\n    boundaries = [0, 10, 100]\n    input = [[-5, 10000]\n             [150,   10]\n             [5,    100]]\n\nthen the output will be\n    output = [[0, 3]\n              [3, 2]\n              [1, 3]]"
+}
+op {
+  name: "CTCBeamSearchDecoder"
   input_arg {
     name: "inputs"
     description: "3-D, shape: `(max_time x batch_size x num_classes)`, the logits."
@@ -4115,6 +4406,43 @@ op {
   summary: "Computes the reverse mode backpropagated gradient of the Cholesky algorithm."
   description: "For an explanation see \"Differentiation of the Cholesky algorithm\" by\nIain Murray http://arxiv.org/abs/1602.07527."
 }
+op {
+  name: "CompareAndBitpack"
+  input_arg {
+    name: "input"
+    description: "Values to compare against `threshold` and bitpack."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "threshold"
+    description: "Threshold to compare against."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    description: "The bitpacked comparisons."
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    description: "The type of the input and threshold."
+    allowed_values {
+      list {
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "Compare values of `input` to `threshold` and pack resulting bits into a `uint8`."
+  description: "Each comparison returns a boolean `true` (if `input_value > threshold`)\nor and `false` otherwise.\n\nThis operation is useful for Locality-Sensitive-Hashing (LSH) and other\nalgorithms that use hashing approximations of cosine and `L2` distances;\ncodes can be generated from an input via:\n\n```python\ncodebook_size = 50\ncodebook_bits = codebook_size * 32\ncodebook = tf.get_variable(\'codebook\', [x.shape[-1].value, codebook_bits],\n                           dtype=x.dtype,\n                           initializer=tf.orthogonal_initializer())\ncodes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)\ncodes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32\n# now codes has shape x.shape[:-1] + [codebook_size]\n```\n\n**NOTE**: Currently, the innermost dimension of the tensor must be divisible\nby 8.\n\nGiven an `input` shaped `[s0, s1, ..., s_n]`, the output is\na `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`."
+}
 op {
   name: "Complex"
   input_arg {
@@ -4349,6 +4677,35 @@ op {
   }
   summary: "Concatenates tensors along one dimension."
 }
+op {
+  name: "ConcatenateDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "another_dataset"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that concatenates `input_dataset` with `another_dataset`."
+  is_stateful: true
+}
 op {
   name: "ConditionalAccumulator"
   output_arg {
@@ -5077,6 +5434,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -5335,10 +5693,12 @@ op {
   name: "Cumprod"
   input_arg {
     name: "x"
+    description: "A `Tensor`. Must be one of the following types: `float32`, `float64`,\n`int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,\n`complex128`, `qint8`, `quint8`, `qint32`, `half`."
     type_attr: "T"
   }
   input_arg {
     name: "axis"
+    description: "A `Tensor` of type `int32` (default: 0). Must be in the range\n`[-rank(x), rank(x))`."
     type_attr: "Tidx"
   }
   output_arg {
@@ -5351,6 +5711,7 @@ op {
     default_value {
       b: false
     }
+    description: "If `True`, perform exclusive cumprod."
   }
   attr {
     name: "reverse"
@@ -5358,6 +5719,7 @@ op {
     default_value {
       b: false
     }
+    description: "A `bool` (default: False)."
   }
   attr {
     name: "T"
@@ -5401,10 +5763,12 @@ op {
   name: "Cumsum"
   input_arg {
     name: "x"
+    description: "A `Tensor`. Must be one of the following types: `float32`, `float64`,\n`int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,\n`complex128`, `qint8`, `quint8`, `qint32`, `half`."
     type_attr: "T"
   }
   input_arg {
     name: "axis"
+    description: "A `Tensor` of type `int32` (default: 0). Must be in the range\n`[-rank(x), rank(x))`."
     type_attr: "Tidx"
   }
   output_arg {
@@ -5417,6 +5781,7 @@ op {
     default_value {
       b: false
     }
+    description: "If `True`, perform exclusive cumsum."
   }
   attr {
     name: "reverse"
@@ -5424,6 +5789,7 @@ op {
     default_value {
       b: false
     }
+    description: "A `bool` (default: False)."
   }
   attr {
     name: "T"
@@ -5463,6 +5829,24 @@ op {
   summary: "Compute the cumulative sum of the tensor `x` along `axis`."
   description: "By default, this op performs an inclusive cumsum, which means that the first\nelement of the input is identical to the first element of the output:\n\n```python\ntf.cumsum([a, b, c])  # => [a, a + b, a + b + c]\n```\n\nBy setting the `exclusive` kwarg to `True`, an exclusive cumsum is\nperformed instead:\n\n```python\ntf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]\n```\n\nBy setting the `reverse` kwarg to `True`, the cumsum is performed in the\nopposite direction:\n\n```python\ntf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]\n```\n\nThis is more efficient than using separate `tf.reverse` ops.\n\nThe `reverse` and `exclusive` kwargs can also be combined:\n\n```python\ntf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]\n```"
 }
+op {
+  name: "DebugGradientIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  summary: "Identity op for gradient debugging."
+  description: "This op is hidden from public in Python. It is used by TensorFlow Debugger to\nregister gradient tensors for gradient debugging."
+  allows_uninitialized_input: true
+}
 op {
   name: "DebugIdentity"
   input_arg {
@@ -6179,7 +6563,7 @@ op {
     }
   }
   summary: "Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors."
-  description: "Given an input tensor of shape `[batch, in_height, in_width, in_channels]`\nand a filter / kernel tensor of shape\n`[filter_height, filter_width, in_channels, channel_multiplier]`, containing\n`in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies\na different filter to each input channel (expanding from 1 channel to\n`channel_multiplier` channels for each), then concatenates the results\ntogether. Thus, the output has `in_channels * channel_multiplier` channels.\n\nfor k in 0..in_channels-1\n  for q in 0..channel_multiplier-1\n    output[b, i, j, k * channel_multiplier + q] =\n      sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *\n                        filter[di, dj, k, q]\n\nMust have `strides[0] = strides[3] = 1`.  For the most common case of the same\nhorizontal and vertices strides, `strides = [1, stride, stride, 1]`."
+  description: "Given an input tensor of shape `[batch, in_height, in_width, in_channels]`\nand a filter / kernel tensor of shape\n`[filter_height, filter_width, in_channels, channel_multiplier]`, containing\n`in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies\na different filter to each input channel (expanding from 1 channel to\n`channel_multiplier` channels for each), then concatenates the results\ntogether. Thus, the output has `in_channels * channel_multiplier` channels.\n\n```\nfor k in 0..in_channels-1\n  for q in 0..channel_multiplier-1\n    output[b, i, j, k * channel_multiplier + q] =\n      sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *\n                        filter[di, dj, k, q]\n```\n\nMust have `strides[0] = strides[3] = 1`.  For the most common case of the same\nhorizontal and vertices strides, `strides = [1, stride, stride, 1]`."
 }
 op {
   name: "DepthwiseConv2dNativeBackpropFilter"
@@ -7276,7 +7660,7 @@ op {
   }
   input_arg {
     name: "dim"
-    description: "0-D (scalar). Specifies the dimension index at which to\nexpand the shape of `input`."
+    description: "0-D (scalar). Specifies the dimension index at which to\nexpand the shape of `input`. Must be in the range\n`[-rank(input) - 1, rank(input)]`."
     type_attr: "Tdim"
   }
   output_arg {
@@ -7388,7 +7772,7 @@ op {
   }
   output_arg {
     name: "patches"
-    description: "4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *\nksize_cols * depth]` containing image patches with size\n`ksize_rows x ksize_cols x depth` vectorized in the \"depth\" dimension."
+    description: "4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *\nksize_cols * depth]` containing image patches with size\n`ksize_rows x ksize_cols x depth` vectorized in the \"depth\" dimension. Note\n`out_rows` and `out_cols` are the dimensions of the output patches."
     type_attr: "T"
   }
   attr {
@@ -7408,7 +7792,7 @@ op {
   attr {
     name: "rates"
     type: "list(int)"
-    description: "1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the\ninput stride, specifying how far two consecutive patch samples are in the\ninput. Equivalent to extracting patches with\n`patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by\nsubsampling them spatially by a factor of `rates`."
+    description: "1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the\ninput stride, specifying how far two consecutive patch samples are in the\ninput. Equivalent to extracting patches with\n`patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by\nsubsampling them spatially by a factor of `rates`. This is equivalent to\n`rate` in dilated (a.k.a. Atrous) convolutions."
     has_minimum: true
     minimum: 4
   }
@@ -8077,6 +8461,14 @@ op {
     }
     description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
   }
+  attr {
+    name: "encoding"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "The type of encoding for the file. Currently ZLIB and GZIP\nare supported. Defaults to none."
+  }
   summary: "A Reader that outputs fixed-length records from a file."
   is_stateful: true
 }
@@ -8978,8 +9370,57 @@ op {
       }
     }
   }
-  summary: "Gather values or slices from `params` according to `indices`."
-  description: "`indices` is an integer tensor containing indices into `params`.  The last\ndimension of `indices` can be at most the rank of `params`:\n\n    indices.shape[-1] <= params.rank\n\nThe last dimension of `indices` corresponds to elements\n(if `indices.shape[-1] = params.rank`) or slices\n(if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`\nof `params`.  The output tensor has shape\n\n    indices.shape[:-1] + params.shape[indices.shape[-1]:]\n\nSome examples below.\n\nSimple indexing into a matrix:\n\n```python\n    indices = [[0, 0], [1, 1]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [\'a\', \'d\']\n```\n\nSlice indexing into a matrix:\n\n```python\n    indices = [[1], [0]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[\'c\', \'d\'], [\'a\', \'b\']]\n```\n\nIndexing into a 3-tensor:\n\n```python\n    indices = [[1]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n\n\n    indices = [[0, 1], [1, 0]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[\'c0\', \'d0\'], [\'a1\', \'b1\']]\n\n\n    indices = [[0, 0, 1], [1, 0, 1]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [\'b0\', \'b1\']\n```\n\nBatched indexing into a matrix:\n\n```python\n    indices = [[[0, 0]], [[0, 1]]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[\'a\'], [\'b\']]\n```\n\nBatched slice indexing into a matrix:\n\n```python\n    indices = [[[1]], [[0]]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[[\'c\', \'d\']], [[\'a\', \'b\']]]\n```\n\nBatched indexing into a 3-tensor:\n\n```python\n    indices = [[[1]], [[0]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[[\'a1\', \'b1\'], [\'c1\', \'d1\']]],\n              [[[\'a0\', \'b0\'], [\'c0\', \'d0\']]]]\n\n    indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[\'c0\', \'d0\'], [\'a1\', \'b1\']],\n              [[\'a0\', \'b0\'], [\'c1\', \'d1\']]]\n\n\n    indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[\'b0\', \'b1\'], [\'d0\', \'c1\']]\n```"
+  summary: "Gather slices from `params` into a Tensor with shape specified by `indices`."
+  description: "`indices` is an K-dimensional integer tensor, best thought of as a\n(K-1)-dimensional tensor of indices into `params`, where each element defines a\nslice of `params`:\n\n    output[i_0, ..., i_{K-2}] = params[indices[i0, ..., i_{K-2}]]\n\nWhereas in @{tf.gather} `indices` defines slices into the first\ndimension of `params`, in `tf.gather_nd`, `indices` defines slices into the\nfirst `N` dimensions of `params`, where `N = indices.shape[-1]`.\n\nThe last dimension of `indices` can be at most the rank of\n`params`:\n\n    indices.shape[-1] <= params.rank\n\nThe last dimension of `indices` corresponds to elements\n(if `indices.shape[-1] == params.rank`) or slices\n(if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`\nof `params`.  The output tensor has shape\n\n    indices.shape[:-1] + params.shape[indices.shape[-1]:]\n\nSome examples below.\n\nSimple indexing into a matrix:\n\n```python\n    indices = [[0, 0], [1, 1]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [\'a\', \'d\']\n```\n\nSlice indexing into a matrix:\n\n```python\n    indices = [[1], [0]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[\'c\', \'d\'], [\'a\', \'b\']]\n```\n\nIndexing into a 3-tensor:\n\n```python\n    indices = [[1]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n\n\n    indices = [[0, 1], [1, 0]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[\'c0\', \'d0\'], [\'a1\', \'b1\']]\n\n\n    indices = [[0, 0, 1], [1, 0, 1]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [\'b0\', \'b1\']\n```\n\nBatched indexing into a matrix:\n\n```python\n    indices = [[[0, 0]], [[0, 1]]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[\'a\'], [\'b\']]\n```\n\nBatched slice indexing into a matrix:\n\n```python\n    indices = [[[1]], [[0]]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[[\'c\', \'d\']], [[\'a\', \'b\']]]\n```\n\nBatched indexing into a 3-tensor:\n\n```python\n    indices = [[[1]], [[0]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[[\'a1\', \'b1\'], [\'c1\', \'d1\']]],\n              [[[\'a0\', \'b0\'], [\'c0\', \'d0\']]]]\n\n    indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[\'c0\', \'d0\'], [\'a1\', \'b1\']],\n              [[\'a0\', \'b0\'], [\'c1\', \'d1\']]]\n\n\n    indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[\'b0\', \'b1\'], [\'d0\', \'c1\']]\n```"
+}
+op {
+  name: "GatherV2"
+  input_arg {
+    name: "params"
+    description: "The tensor from which to gather values. Must be at least rank\n`axis + 1`."
+    type_attr: "Tparams"
+  }
+  input_arg {
+    name: "indices"
+    description: "Index tensor. Must be in range `[0, params.shape[axis])`."
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "axis"
+    description: "The axis in `params` to gather `indices` from. Defaults to the first\ndimension. Supports negative indexes."
+    type_attr: "Taxis"
+  }
+  output_arg {
+    name: "output"
+    description: "Values from `params` gathered from indices given by `indices`, with\nshape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`."
+    type_attr: "Tparams"
+  }
+  attr {
+    name: "Tparams"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Taxis"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "Gather slices from `params` axis `axis` according to `indices`."
+  description: "`indices` must be an integer tensor of any dimension (usually 0-D or 1-D).\nProduces an output tensor with shape `params.shape[:axis] + indices.shape +\nparams.shape[axis + 1:]` where:\n\n```python\n    # Scalar indices (output is rank(params) - 1).\n    output[a_0, ..., a_n, b_0, ..., b_n] =\n      params[a_0, ..., a_n, indices, b_0, ..., b_n]\n\n    # Vector indices (output is rank(params)).\n    output[a_0, ..., a_n, i, b_0, ..., b_n] =\n      params[a_0, ..., a_n, indices[i], b_0, ..., b_n]\n\n    # Higher rank indices (output is rank(params) + rank(indices) - 1).\n    output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =\n      params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]\n```\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/Gather.png\" alt>\n</div>"
 }
 op {
   name: "GetSessionHandle"
@@ -9551,6 +9992,31 @@ op {
   summary: "Compute the upper regularized incomplete Gamma function `Q(a, x)`."
   description: "The upper regularized incomplete Gamma function is defined as:\n\n\\\\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\\\)\n\nwhere\n\n\\\\(Gamma(a, x) = int_{x}^{\\infty} t^{a-1} exp(-t) dt\\\\)\n\nis the upper incomplete Gama function.\n\nNote, above `P(a, x)` (`Igamma`) is the lower regularized complete\nGamma function."
 }
+op {
+  name: "IgnoreErrorsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that contains the elements of `input_dataset` ignoring errors."
+  is_stateful: true
+}
 op {
   name: "Imag"
   input_arg {
@@ -9868,17 +10334,65 @@ op {
   is_stateful: true
 }
 op {
-  name: "Inv"
+  name: "InterleaveDataset"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
   }
   output_arg {
-    name: "y"
-    type_attr: "T"
+    name: "handle"
+    type: DT_RESOURCE
   }
   attr {
-    name: "T"
+    name: "f"
+    type: "func"
+    description: "A function mapping elements of `input_dataset`, concatenated with\n`other_arguments`, to a Dataset resource that contains elements matching\n`output_types` and `output_shapes`."
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
+  description: "Unlike MapDataset, the `f` in InterleaveDataset is expected to return\na Dataset resource, and InterleaveDataset will flatten successive\nresults into a single Dataset. Unlike FlatMapDataset,\nInterleaveDataset will interleave sequences of up to `block_length`\nconsecutive elements from `cycle_length` input elements."
+  is_stateful: true
+}
+op {
+  name: "Inv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
     type: "type"
     allowed_values {
       list {
@@ -9933,6 +10447,33 @@ op {
     explanation: "Use ReciprocalGrad"
   }
 }
+op {
+  name: "Invert"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+  summary: "Flips all bits elementwise."
+  description: "The result will have exactly those bits set, that are not set in `x`. The\ncomputation is performed on the underlying representation of x."
+}
 op {
   name: "InvertPermutation"
   input_arg {
@@ -10093,6 +10634,41 @@ op {
   summary: "Releases any resources used by the given iterator."
   is_stateful: true
 }
+op {
+  name: "IteratorFromStringHandle"
+  input_arg {
+    name: "string_handle"
+    description: "A string representation of the given handle."
+    type: DT_STRING
+  }
+  output_arg {
+    name: "resource_handle"
+    description: "A handle to an iterator resource."
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    description: "If specified, defines the type of each tuple component in an\nelement produced by the resulting iterator."
+    has_minimum: true
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    description: "If specified, defines the shape of each tuple component in an\nelement produced by the resulting iterator."
+    has_minimum: true
+  }
+  summary: "Converts the given string representing a handle to an iterator to a resource."
+  is_stateful: true
+}
 op {
   name: "IteratorGetNext"
   input_arg {
@@ -10118,6 +10694,21 @@ op {
   summary: "Gets the next output from the given iterator."
   is_stateful: true
 }
+op {
+  name: "IteratorToStringHandle"
+  input_arg {
+    name: "resource_handle"
+    description: "A handle to an iterator resource."
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "string_handle"
+    description: "A string representation of the given handle."
+    type: DT_STRING
+  }
+  summary: "Converts the given `resource_handle` representing an iterator to a string."
+  is_stateful: true
+}
 op {
   name: "L2Loss"
   input_arg {
@@ -11536,6 +12127,8 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
@@ -11777,7 +12370,7 @@ op {
   }
   input_arg {
     name: "reduction_indices"
-    description: "The dimensions to reduce."
+    description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`."
     type_attr: "Tidx"
   }
   output_arg {
@@ -12554,7 +13147,7 @@ op {
   }
   input_arg {
     name: "reduction_indices"
-    description: "The dimensions to reduce."
+    description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`."
     type_attr: "Tidx"
   }
   output_arg {
@@ -12745,7 +13338,7 @@ op {
   }
   input_arg {
     name: "reduction_indices"
-    description: "The dimensions to reduce."
+    description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`."
     type_attr: "Tidx"
   }
   output_arg {
@@ -14133,6 +14726,44 @@ op {
   summary: "Pads a tensor with zeros."
   description: "This operation pads a `input` with zeros according to the `paddings` you\nspecify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the\nrank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates\nhow many zeros to add before the contents of `input` in that dimension, and\n`paddings[D, 1]` indicates how many zeros to add after the contents of `input`\nin that dimension.\n\nThe padded size of each dimension D of the output is:\n\n`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`\n\nFor example:\n\n```\n# \'t\' is [[1, 1], [2, 2]]\n# \'paddings\' is [[1, 1], [2, 2]]\n# rank of \'t\' is 2\npad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]\n                      [0, 0, 1, 1, 0, 0]\n                      [0, 0, 2, 2, 0, 0]\n                      [0, 0, 0, 0, 0, 0]]\n```"
 }
+op {
+  name: "PadV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  input_arg {
+    name: "constant_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "Pads a tensor."
+  description: "This operation pads `input` according to the `paddings` and `constant_values`\nyou specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is\nthe rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates\nhow many padding values to add before the contents of `input` in that dimension,\nand `paddings[D, 1]` indicates how many padding values to add after the contents\nof `input` in that dimension. `constant_values` is a scalar tensor of the same\ntype as `input` that indicates the value to use for padding `input`.\n\nThe padded size of each dimension D of the output is:\n\n`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`\n\nFor example:\n\n```\n# \'t\' is [[1, 1], [2, 2]]\n# \'paddings\' is [[1, 1], [2, 2]]\n# \'constant_values\' is 0\n# rank of \'t\' is 2\npad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]\n                      [0, 0, 1, 1, 0, 0]\n                      [0, 0, 2, 2, 0, 0]\n                      [0, 0, 0, 0, 0, 0]]\n```"
+}
 op {
   name: "PaddedBatchDataset"
   input_arg {
@@ -14850,6 +15481,33 @@ op {
   summary: "Compute the polygamma function \\\\(\\psi^{(n)}(x)\\\\)."
   description: "The polygamma function is defined as:\n\n\n\\\\(\\psi^{(n)}(x) = \\frac{d^n}{dx^n} \\psi(x)\\\\)\n\nwhere \\\\(\\psi(x)\\\\) is the digamma function."
 }
+op {
+  name: "PopulationCount"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+  summary: "Computes element-wise population count (a.k.a. popcount, bitsum, bitcount)."
+  description: "For each entry in `x`, calculates the number of `1` (on) bits in the binary\nrepresentation of that entry.\n\n**NOTE**: It is more efficient to first `tf.bitcast` your tensors into\n`int32` or `int64` and perform the bitcount on the result, than to feed in\n8- or 16-bit inputs and then aggregate the resulting counts."
+}
 op {
   name: "Pow"
   input_arg {
@@ -15076,7 +15734,7 @@ op {
   }
   input_arg {
     name: "reduction_indices"
-    description: "The dimensions to reduce."
+    description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`."
     type_attr: "Tidx"
   }
   output_arg {
@@ -15348,6 +16006,55 @@ op {
   summary: "Quantizes then dequantizes a tensor."
   description: "This op simulates the precision loss from the quantized forward pass by:\n1. Quantizing the tensor to fixed point numbers, which should match the target\n   quantization method when it is used in inference.\n2. Dequantizing it back to floating point numbers for the following ops, most\n   likely matmul.\n\nThere are different ways to quantize. This version does not use the full range\nof the output type, choosing to elide the lowest possible value for symmetry\n(e.g., output range is -127 to 127, not -128 to 127 for signed 8 bit\nquantization), so that 0.0 maps to 0.\n\nTo perform this op, we first find the range of values in our tensor. The range\nwe use is always centered on 0, so we find m such that\n\n1. m = max(abs(input_min), abs(input_max)) if range_given is true,\n2. m = max(abs(min_elem(input)), abs(max_elem(input))) otherwise.\n\nOur input tensor range is then [-m, m].\n\nNext, we choose our fixed-point quantization buckets, [min_fixed, max_fixed].\nIf signed_input is true, this is\n\n  [min_fixed, max_fixed ] =\n      [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1].\n\nOtherwise, if signed_input is false, the fixed-point range is\n\n  [min_fixed, max_fixed] = [0, (1 << num_bits) - 1].\n\nFrom this we compute our scaling factor, s:\n\n  s = (max_fixed - min_fixed) / (2 * m).\n\nNow we can quantize and dequantize the elements of our tensor.  An element e\nis transformed into e\':\n\n  e\' = (e * s).round_to_nearest() / s.\n\nNote that we have a different number of buckets in the signed vs. unsigned\ncases.  For example, if num_bits == 8, we get 254 buckets in the signed case\nvs. 255 in the unsigned case.\n\nFor example, suppose num_bits = 8 and m = 1.  Then\n\n  [min_fixed, max_fixed] = [-127, 127], and\n  s = (127 + 127) / 2 = 127.\n\nGiven the vector {-1, -0.5, 0, 0.3}, this is quantized to\n{-127, -63, 0, 38}, and dequantized to {-1, -63.0/127, 0, 38.0/127}."
 }
+op {
+  name: "QuantizeAndDequantizeV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  summary: "Quantizes then dequantizes a tensor."
+  description: "This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a\ntensor, so its value can change during training."
+}
 op {
   name: "QuantizeDownAndShrinkRange"
   input_arg {
@@ -17029,6 +17736,36 @@ op {
   description: "The components input has k elements, which correspond to the components of\ntuples stored in the given queue.\n\nN.B. If the queue is full, this operation will block until the given\nelement has been enqueued (or \'timeout_ms\' elapses, if specified)."
   is_stateful: true
 }
+op {
+  name: "QueueIsClosed"
+  input_arg {
+    name: "handle"
+    description: "The handle to a queue."
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "is_closed"
+    type: DT_BOOL
+  }
+  summary: "Returns true if queue is closed."
+  description: "This operation returns true if the queue is closed and false if the queue\nis open."
+}
+op {
+  name: "QueueIsClosedV2"
+  input_arg {
+    name: "handle"
+    description: "The handle to a queue."
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_closed"
+    type: DT_BOOL
+  }
+  summary: "Returns true if queue is closed."
+  description: "This operation returns true if the queue is closed and false if the queue\nis open."
+  is_stateful: true
+}
 op {
   name: "QueueSize"
   input_arg {
@@ -18591,6 +19328,36 @@ op {
   }
   summary: "Computes rectified linear gradients for a Relu operation."
 }
+op {
+  name: "RemoteFusedGraphExecute"
+  input_arg {
+    name: "inputs"
+    description: "Arbitrary number of tensors with arbitrary data types"
+    type_list_attr: "Tinputs"
+  }
+  output_arg {
+    name: "outputs"
+    description: "Arbitrary number of tensors with arbitrary data types"
+    type_list_attr: "Toutputs"
+  }
+  attr {
+    name: "Tinputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Toutputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "serialized_remote_fused_graph_execute_info"
+    type: "string"
+    description: "Serialized protocol buffer\nof RemoteFusedGraphExecuteInfo which contains graph specifications."
+  }
+  summary: "Execute a sub graph on a remote processor."
+  description: "The graph specifications(such as graph itself, input tensors and output names)\nare stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo\nas serialized_remote_fused_graph_execute_info.\nThe specifications will be passed to a dedicated registered\nremote fused graph executor.  The executor will send the graph specifications\nto a remote processor and execute that graph.  The execution results\nwill be passed to consumer nodes as outputs of this node."
+}
 op {
   name: "RepeatDataset"
   input_arg {
@@ -19477,23 +20244,103 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyGradientDescent"
+  name: "ResourceApplyFtrlV2"
   input_arg {
     name: "var"
     description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
-    name: "alpha"
-    description: "Scaling factor. Must be a scalar."
-    type_attr: "T"
+    name: "accum"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "delta"
-    description: "The change."
-    type_attr: "T"
-  }
-  attr {
+    name: "linear"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    description: "L1 regulariation. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    description: "L2 shrinkage regulariation. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
+  }
+  summary: "Update \'*var\' according to the Ftrl-proximal scheme."
+  description: "grad_with_shrinkage = grad + 2 * l2_shrinkage * var\naccum_new = accum + grad_with_shrinkage * grad_with_shrinkage\nlinear += grad_with_shrinkage +\n    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyGradientDescent"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    description: "The change."
+    type_attr: "T"
+  }
+  attr {
     name: "T"
     type: "type"
     allowed_values {
@@ -20228,6 +21075,101 @@ op {
   description: "That is for rows we have grad for, we update var, accum and linear as follows:\naccum_new = accum + grad * grad\nlinear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
   is_stateful: true
 }
+op {
+  name: "ResourceSparseApplyFtrlV2"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    description: "A vector of indices into the first dimension of var and accum."
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    description: "L1 regularization. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    description: "L2 shrinkage regulariation. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
+  }
+  summary: "Update relevant entries in \'*var\' according to the Ftrl-proximal scheme."
+  description: "That is for rows we have grad for, we update var, accum and linear as follows:\ngrad_with_shrinkage = grad + 2 * l2_shrinkage * var\naccum_new = accum + grad_with_shrinkage * grad_with_shrinkage\nlinear += grad_with_shrinkage +\n    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
+  is_stateful: true
+}
 op {
   name: "ResourceSparseApplyMomentum"
   input_arg {
@@ -20838,7 +21780,7 @@ op {
   }
   input_arg {
     name: "axis"
-    description: "1-D. The indices of the dimensions to reverse."
+    description: "1-D. The indices of the dimensions to reverse. Must be in the range\n`[-rank(tensor), rank(tensor))`."
     type_attr: "Tidx"
   }
   output_arg {
@@ -21094,6 +22036,109 @@ op {
   description: "Bounding box annotations are often supplied in addition to ground-truth labels\nin image recognition or object localization tasks. A common technique for\ntraining such a system is to randomly distort an image while preserving\nits content, i.e. *data augmentation*. This Op outputs a randomly distorted\nlocalization of an object, i.e. bounding box, given an `image_size`,\n`bounding_boxes` and a series of constraints.\n\nThe output of this Op is a single bounding box that may be used to crop the\noriginal image. The output is returned as 3 tensors: `begin`, `size` and\n`bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the\nimage. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize\nwhat the bounding box looks like.\n\nBounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The\nbounding box coordinates are floats in `[0.0, 1.0]` relative to the width and\nheight of the underlying image.\n\nFor example,\n\n```python\n    # Generate a single distorted bounding box.\n    begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(\n        tf.shape(image),\n        bounding_boxes=bounding_boxes)\n\n    # Draw the bounding box in an image summary.\n    image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),\n                                                  bbox_for_draw)\n    tf.image_summary(\'images_with_box\', image_with_box)\n\n    # Employ the bounding box to distort the image.\n    distorted_image = tf.slice(image, begin, size)\n```\n\nNote that if no bounding box information is available, setting\n`use_image_if_no_bounding_boxes = true` will assume there is a single implicit\nbounding box covering the whole image. If `use_image_if_no_bounding_boxes` is\nfalse and no bounding boxes are supplied, an error is raised."
   is_stateful: true
 }
+op {
+  name: "SampleDistortedBoundingBoxV2"
+  input_arg {
+    name: "image_size"
+    description: "1-D, containing `[height, width, channels]`."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bounding_boxes"
+    description: "3-D with shape `[batch, N, 4]` describing the N bounding boxes\nassociated with the image."
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_object_covered"
+    description: "The cropped area of the image must contain at least this\nfraction of any bounding box supplied. The value of this parameter should be\nnon-negative. In the case of 0, the cropped area does not need to overlap\nany of the bounding boxes supplied."
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "begin"
+    description: "1-D, containing `[offset_height, offset_width, 0]`. Provide as input to\n`tf.slice`."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "size"
+    description: "1-D, containing `[target_height, target_width, -1]`. Provide as input to\n`tf.slice`."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "bboxes"
+    description: "3-D with shape `[1, 1, 4]` containing the distorted bounding box.\nProvide as input to `tf.image.draw_bounding_boxes`."
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    description: "If either `seed` or `seed2` are set to non-zero, the random number\ngenerator is seeded by the given `seed`.  Otherwise, it is seeded by a random\nseed."
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    description: "A second seed to avoid seed collision."
+  }
+  attr {
+    name: "aspect_ratio_range"
+    type: "list(float)"
+    default_value {
+      list {
+        f: 0.75
+        f: 1.33
+      }
+    }
+    description: "The cropped area of the image must have an aspect ratio =\nwidth / height within this range."
+  }
+  attr {
+    name: "area_range"
+    type: "list(float)"
+    default_value {
+      list {
+        f: 0.05
+        f: 1
+      }
+    }
+    description: "The cropped area of the image must contain a fraction of the\nsupplied image within in this range."
+  }
+  attr {
+    name: "max_attempts"
+    type: "int"
+    default_value {
+      i: 100
+    }
+    description: "Number of attempts at generating a cropped region of the image\nof the specified constraints. After `max_attempts` failures, return the entire\nimage."
+  }
+  attr {
+    name: "use_image_if_no_bounding_boxes"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "Controls behavior if no bounding boxes supplied.\nIf true, assume an implicit bounding box covering the whole input. If false,\nraise an error."
+  }
+  summary: "Generate a single randomly distorted bounding box for an image."
+  description: "Bounding box annotations are often supplied in addition to ground-truth labels\nin image recognition or object localization tasks. A common technique for\ntraining such a system is to randomly distort an image while preserving\nits content, i.e. *data augmentation*. This Op outputs a randomly distorted\nlocalization of an object, i.e. bounding box, given an `image_size`,\n`bounding_boxes` and a series of constraints.\n\nThe output of this Op is a single bounding box that may be used to crop the\noriginal image. The output is returned as 3 tensors: `begin`, `size` and\n`bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the\nimage. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize\nwhat the bounding box looks like.\n\nBounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The\nbounding box coordinates are floats in `[0.0, 1.0]` relative to the width and\nheight of the underlying image.\n\nFor example,\n\n```python\n    # Generate a single distorted bounding box.\n    begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(\n        tf.shape(image),\n        bounding_boxes=bounding_boxes)\n\n    # Draw the bounding box in an image summary.\n    image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),\n                                                  bbox_for_draw)\n    tf.image_summary(\'images_with_box\', image_with_box)\n\n    # Employ the bounding box to distort the image.\n    distorted_image = tf.slice(image, begin, size)\n```\n\nNote that if no bounding box information is available, setting\n`use_image_if_no_bounding_boxes = true` will assume there is a single implicit\nbounding box covering the whole image. If `use_image_if_no_bounding_boxes` is\nfalse and no bounding boxes are supplied, an error is raised."
+  is_stateful: true
+}
 op {
   name: "Save"
   input_arg {
@@ -21460,7 +22505,7 @@ op {
     }
   }
   summary: "Scatter `updates` into a new (initially zero) tensor according to `indices`."
-  description: "Creates a new tensor by applying sparse `updates` to individual\nvalues or slices within a zero tensor of the given `shape` according to\nindices.  This operator is the inverse of the [tf.gather_nd](#gather_nd)\noperator which extracts values or slices from a given tensor.\n\n**WARNING**: The order in which updates are applied is nondeterministic, so the\noutput will be nondeterministic if `indices` contains duplicates.\n\n`indices` is an integer tensor containing indices into a new tensor of shape\n`shape`.  The last dimension of `indices` can be at most the rank of `shape`:\n\n    indices.shape[-1] <= shape.rank\n\nThe last dimension of `indices` corresponds to indices into elements\n(if `indices.shape[-1] = shape.rank`) or slices\n(if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of\n`shape`.  `updates` is a tensor with shape\n\n    indices.shape[:-1] + shape[indices.shape[-1]:]\n\nThe simplest form of scatter is to insert individual elements in a tensor by\nindex. For example, say we want to insert 4 scattered elements in a rank-1\ntensor with 8 elements.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterNd1.png\" alt>\n</div>\n\nIn Python, this scatter operation would look like this:\n\n```python\n    indices = tf.constant([[4], [3], [1], [7]])\n    updates = tf.constant([9, 10, 11, 12])\n    shape = tf.constant([8])\n    scatter = tf.scatter_nd(indices, updates, shape)\n    with tf.Session() as sess:\n      print(sess.run(scatter))\n```\n\nThe resulting tensor would look like this:\n\n    [0, 11, 0, 10, 9, 0, 0, 12]\n\nWe can also, insert entire slices of a higher rank tensor all at once. For\nexample, if we wanted to insert two slices in the first dimension of a\nrank-3 tensor with two matrices of new values.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterNd2.png\" alt>\n</div>\n\nIn Python, this scatter operation would look like this:\n\n```python\n    indices = tf.constant([[0], [2]])\n    updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],\n                            [7, 7, 7, 7], [8, 8, 8, 8]],\n                           [[5, 5, 5, 5], [6, 6, 6, 6],\n                            [7, 7, 7, 7], [8, 8, 8, 8]]])\n    shape = tf.constant([4, 4, 4])\n    scatter = tf.scatter_nd(indices, updates, shape)\n    with tf.Session() as sess:\n      print(sess.run(scatter))\n```\n\nThe resulting tensor would look like this:\n\n    [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],\n     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],\n     [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],\n     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]"
+  description: "Creates a new tensor by applying sparse `updates` to individual\nvalues or slices within a zero tensor of the given `shape` according to\nindices.  This operator is the inverse of the @{tf.gather_nd} operator which\nextracts values or slices from a given tensor.\n\n**WARNING**: The order in which updates are applied is nondeterministic, so the\noutput will be nondeterministic if `indices` contains duplicates.\n\n`indices` is an integer tensor containing indices into a new tensor of shape\n`shape`.  The last dimension of `indices` can be at most the rank of `shape`:\n\n    indices.shape[-1] <= shape.rank\n\nThe last dimension of `indices` corresponds to indices into elements\n(if `indices.shape[-1] = shape.rank`) or slices\n(if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of\n`shape`.  `updates` is a tensor with shape\n\n    indices.shape[:-1] + shape[indices.shape[-1]:]\n\nThe simplest form of scatter is to insert individual elements in a tensor by\nindex. For example, say we want to insert 4 scattered elements in a rank-1\ntensor with 8 elements.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterNd1.png\" alt>\n</div>\n\nIn Python, this scatter operation would look like this:\n\n```python\n    indices = tf.constant([[4], [3], [1], [7]])\n    updates = tf.constant([9, 10, 11, 12])\n    shape = tf.constant([8])\n    scatter = tf.scatter_nd(indices, updates, shape)\n    with tf.Session() as sess:\n      print(sess.run(scatter))\n```\n\nThe resulting tensor would look like this:\n\n    [0, 11, 0, 10, 9, 0, 0, 12]\n\nWe can also, insert entire slices of a higher rank tensor all at once. For\nexample, if we wanted to insert two slices in the first dimension of a\nrank-3 tensor with two matrices of new values.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterNd2.png\" alt>\n</div>\n\nIn Python, this scatter operation would look like this:\n\n```python\n    indices = tf.constant([[0], [2]])\n    updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],\n                            [7, 7, 7, 7], [8, 8, 8, 8]],\n                           [[5, 5, 5, 5], [6, 6, 6, 6],\n                            [7, 7, 7, 7], [8, 8, 8, 8]]])\n    shape = tf.constant([4, 4, 4])\n    scatter = tf.scatter_nd(indices, updates, shape)\n    with tf.Session() as sess:\n      print(sess.run(scatter))\n```\n\nThe resulting tensor would look like this:\n\n    [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],\n     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],\n     [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],\n     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]"
 }
 op {
   name: "ScatterNdAdd"
@@ -21527,7 +22572,64 @@ op {
     description: "An optional bool. Defaults to True. If True, the assignment will\nbe protected by a lock; otherwise the behavior is undefined,\nbut may exhibit less contention."
   }
   summary: "Applies sparse addition between `updates` and individual values or slices"
-  description: "within a given variable according to `indices`.\n\n`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.\n\n`indices` must be integer tensor, containing indices into `ref`.\nIt must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.\n\nThe innermost dimension of `indices` (with length `K`) corresponds to\nindices into elements (if `K = P`) or slices (if `K < P`) along the `K`th\ndimension of `ref`.\n\n`updates` is `Tensor` of rank `Q-1+P-K` with shape:\n\n```\n[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].\n```\n\nFor example, say we want to add 4 scattered elements to a rank-1 tensor to 8\nelements. In Python, that addition would look like this:\n\n    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])\n    indices = tf.constant([[4], [3], [1], [7]])\n    updates = tf.constant([9, 10, 11, 12])\n    add = tf.scatter_nd_add(ref, indices, updates)\n    with tf.Session() as sess:\n      print sess.run(add)\n\nThe resulting update to ref would look like this:\n\n    [1, 13, 3, 14, 14, 6, 7, 20]\n\nSee [tf.scatter_nd](#scatter_nd) for more details about how to make updates to\nslices."
+  description: "within a given variable according to `indices`.\n\n`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.\n\n`indices` must be integer tensor, containing indices into `ref`.\nIt must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.\n\nThe innermost dimension of `indices` (with length `K`) corresponds to\nindices into elements (if `K = P`) or slices (if `K < P`) along the `K`th\ndimension of `ref`.\n\n`updates` is `Tensor` of rank `Q-1+P-K` with shape:\n\n```\n[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].\n```\n\nFor example, say we want to add 4 scattered elements to a rank-1 tensor to 8\nelements. In Python, that addition would look like this:\n\n    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])\n    indices = tf.constant([[4], [3], [1], [7]])\n    updates = tf.constant([9, 10, 11, 12])\n    add = tf.scatter_nd_add(ref, indices, updates)\n    with tf.Session() as sess:\n      print sess.run(add)\n\nThe resulting update to ref would look like this:\n\n    [1, 13, 3, 14, 14, 6, 7, 20]\n\nSee @{tf.scatter_nd} for more details about how to make updates to\nslices."
+}
+op {
+  name: "ScatterNdNonAliasingAdd"
+  input_arg {
+    name: "input"
+    description: "A Tensor."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    description: "A Tensor. Must be one of the following types: `int32`, `int64`.\nA tensor of indices into `input`."
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    description: "A Tensor. Must have the same type as ref. A tensor of updated values\nto add to `input`."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    description: "A `Tensor` with the same shape as `input`, containing values of `input`\nupdated with `updates`."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "Applies sparse addition to `input` using individual values or slices"
+  description: "from `updates` according to indices `indices`.  The updates are non-aliasing:\n`input` is only modified in-place if no other operations will use it.\nOtherwise, a copy of `input` is made.  This operation has a gradient with\nrespect to both `input` and `updates`.\n\n`input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.\n\n`indices` must be integer tensor, containing indices into `input`.\nIt must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.\n\nThe innermost dimension of `indices` (with length `K`) corresponds to\nindices into elements (if `K = P`) or `(P-K)`-dimensional slices\n(if `K < P`) along the `K`th dimension of `input`.\n\n`updates` is `Tensor` of rank `Q-1+P-K` with shape:\n\n```\n[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].\n```\n\nFor example, say we want to add 4 scattered elements to a rank-1 tensor to 8\nelements. In Python, that addition would look like this:\n\n    input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])\n    indices = tf.constant([[4], [3], [1], [7]])\n    updates = tf.constant([9, 10, 11, 12])\n    output = tf.scatter_nd_non_aliasing_add(input, indices, updates)\n    with tf.Session() as sess:\n      print(sess.run(output))\n\nThe resulting value `output` would look like this:\n\n    [1, 13, 3, 14, 14, 6, 7, 20]\n\nSee @{tf.scatter_nd} for more details about how to make updates to slices."
 }
 op {
   name: "ScatterNdSub"
@@ -21594,7 +22696,7 @@ op {
     description: "An optional bool. Defaults to True. If True, the assignment will\nbe protected by a lock; otherwise the behavior is undefined,\nbut may exhibit less contention."
   }
   summary: "Applies sparse subtraction between `updates` and individual values or slices"
-  description: "within a given variable according to `indices`.\n\n`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.\n\n`indices` must be integer tensor, containing indices into `ref`.\nIt must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.\n\nThe innermost dimension of `indices` (with length `K`) corresponds to\nindices into elements (if `K = P`) or slices (if `K < P`) along the `K`th\ndimension of `ref`.\n\n`updates` is `Tensor` of rank `Q-1+P-K` with shape:\n\n```\n[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].\n```\n\nFor example, say we want to subtract 4 scattered elements from a rank-1 tensor\nwith 8 elements. In Python, that subtraction would look like this:\n\n    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])\n    indices = tf.constant([[4], [3], [1], [7]])\n    updates = tf.constant([9, 10, 11, 12])\n    sub = tf.scatter_nd_sub(ref, indices, updates)\n    with tf.Session() as sess:\n      print sess.run(sub)\n\nThe resulting update to ref would look like this:\n\n    [1, -9, 3, -6, -4, 6, 7, -4]\n\nSee [tf.scatter_nd](#scatter_nd) for more details about how to make updates to\nslices."
+  description: "within a given variable according to `indices`.\n\n`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.\n\n`indices` must be integer tensor, containing indices into `ref`.\nIt must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.\n\nThe innermost dimension of `indices` (with length `K`) corresponds to\nindices into elements (if `K = P`) or slices (if `K < P`) along the `K`th\ndimension of `ref`.\n\n`updates` is `Tensor` of rank `Q-1+P-K` with shape:\n\n```\n[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].\n```\n\nFor example, say we want to subtract 4 scattered elements from a rank-1 tensor\nwith 8 elements. In Python, that subtraction would look like this:\n\n    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])\n    indices = tf.constant([[4], [3], [1], [7]])\n    updates = tf.constant([9, 10, 11, 12])\n    sub = tf.scatter_nd_sub(ref, indices, updates)\n    with tf.Session() as sess:\n      print sess.run(sub)\n\nThe resulting update to ref would look like this:\n\n    [1, -9, 3, -6, -4, 6, 7, -4]\n\nSee @{tf.scatter_nd} for more details about how to make updates to\nslices."
 }
 op {
   name: "ScatterNdUpdate"
@@ -21643,7 +22745,7 @@ op {
     description: "An optional bool. Defaults to True. If True, the assignment will\nbe protected by a lock; otherwise the behavior is undefined,\nbut may exhibit less contention."
   }
   summary: "Applies sparse `updates` to individual values or slices within a given"
-  description: "variable according to `indices`.\n\n`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.\n\n`indices` must be integer tensor, containing indices into `ref`.\nIt must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.\n\nThe innermost dimension of `indices` (with length `K`) corresponds to\nindices into elements (if `K = P`) or slices (if `K < P`) along the `K`th\ndimension of `ref`.\n\n`updates` is `Tensor` of rank `Q-1+P-K` with shape:\n\n```\n[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].\n```\n\nFor example, say we want to update 4 scattered elements to a rank-1 tensor to\n8 elements. In Python, that update would look like this:\n\n```python\n    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])\n    indices = tf.constant([[4], [3], [1] ,[7]])\n    updates = tf.constant([9, 10, 11, 12])\n    update = tf.scatter_nd_update(ref, indices, updates)\n    with tf.Session() as sess:\n      print sess.run(update)\n```\n\nThe resulting update to ref would look like this:\n\n    [1, 11, 3, 10, 9, 6, 7, 12]\n\nSee [tf.scatter_nd](#scatter_nd) for more details about how to make updates to\nslices."
+  description: "variable according to `indices`.\n\n`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.\n\n`indices` must be integer tensor, containing indices into `ref`.\nIt must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.\n\nThe innermost dimension of `indices` (with length `K`) corresponds to\nindices into elements (if `K = P`) or slices (if `K < P`) along the `K`th\ndimension of `ref`.\n\n`updates` is `Tensor` of rank `Q-1+P-K` with shape:\n\n```\n[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].\n```\n\nFor example, say we want to update 4 scattered elements to a rank-1 tensor to\n8 elements. In Python, that update would look like this:\n\n```python\n    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])\n    indices = tf.constant([[4], [3], [1] ,[7]])\n    updates = tf.constant([9, 10, 11, 12])\n    update = tf.scatter_nd_update(ref, indices, updates)\n    with tf.Session() as sess:\n      print sess.run(update)\n```\n\nThe resulting update to ref would look like this:\n\n    [1, 11, 3, 10, 9, 6, 7, 12]\n\nSee @{tf.scatter_nd} for more details about how to make updates to\nslices."
 }
 op {
   name: "ScatterSub"
@@ -22273,13 +23375,67 @@ op {
       list {
         type: DT_DOUBLE
         type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  summary: "Computes the eigen decomposition of one or more square self-adjoint matrices."
+  description: "Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in\n`input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.\n\n```python\n# a is a tensor.\n# e is a tensor of eigenvalues.\n# v is a tensor of eigenvectors.\ne, v = self_adjoint_eig(a)\ne = self_adjoint_eig(a, compute_v=False)\n```"
+}
+op {
+  name: "Selu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  summary: "Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)` if < 0, `scale * features` otherwise."
+  description: "See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)"
+}
+op {
+  name: "SeluGrad"
+  input_arg {
+    name: "gradients"
+    description: "The backpropagated gradients to the corresponding Selu operation."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "outputs"
+    description: "The outputs of the corresponding Selu operation."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    description: "The gradients: `gradients * (outputs + scale * alpha)` if outputs < 0,\n`scale * gradients` otherwise."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
-  summary: "Computes the eigen decomposition of one or more square self-adjoint matrices."
-  description: "Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in\n`input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.\n\n```python\n# a is a tensor.\n# e is a tensor of eigenvalues.\n# v is a tensor of eigenvectors.\ne, v = self_adjoint_eig(a)\ne = self_adjoint_eig(a, compute_v=False)\n```"
+  summary: "Computes gradients for the scaled exponential linear (Selu) operation."
 }
 op {
   name: "SerializeManySparse"
@@ -22652,6 +23808,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -23880,6 +25037,109 @@ op {
   summary: "Update relevant entries in \'*var\' according to the Ftrl-proximal scheme."
   description: "That is for rows we have grad for, we update var, accum and linear as follows:\naccum_new = accum + grad * grad\nlinear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
 }
+op {
+  name: "SparseApplyFtrlV2"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    description: "A vector of indices into the first dimension of var and accum."
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    description: "L1 regularization. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    description: "L2 shrinkage regulariation. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    description: "Same as \"var\"."
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
+  }
+  summary: "Update relevant entries in \'*var\' according to the Ftrl-proximal scheme."
+  description: "That is for rows we have grad for, we update var, accum and linear as follows:\ngrad_with_shrinkage = grad + 2 * l2_shrinkage * var\naccum_new = accum + grad_with_shrinkage * grad_with_shrinkage\nlinear += grad_with_shrinkage +\n    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
+}
 op {
   name: "SparseApplyMomentum"
   input_arg {
@@ -24658,104 +25918,221 @@ op {
   description: "The input `SparseTensor` is represented via the tuple of inputs\n(`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the\nsame `dense_shape` but with indices `output_indices` and values\n`output_values`.\n\nThis op inserts a single entry for every row that doesn\'t have any values.\nThe index is created as `[row, 0, ..., 0]` and the inserted value\nis `default_value`.\n\nFor example, suppose `sp_input` has shape `[5, 6]` and non-empty values:\n\n    [0, 1]: a\n    [0, 3]: b\n    [2, 0]: c\n    [3, 1]: d\n\nRows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:\n\n    [0, 1]: a\n    [0, 3]: b\n    [1, 0]: default_value\n    [2, 0]: c\n    [3, 1]: d\n    [4, 0]: default_value\n\nThe output `SparseTensor` will be in row-major order and will have the\nsame shape as the input.\n\nThis op also returns an indicator vector shaped `[dense_shape[0]]` such that\n\n    empty_row_indicator[i] = True iff row i was an empty row.\n\nAnd a reverse index map vector shaped `[indices.shape[0]]` that is used during\nbackpropagation,\n\n    reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]"
 }
 op {
-  name: "SparseFillEmptyRowsGrad"
+  name: "SparseFillEmptyRowsGrad"
+  input_arg {
+    name: "reverse_index_map"
+    description: "1-D.  The reverse index map from SparseFillEmptyRows."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "grad_values"
+    description: "1-D.  The gradients from backprop."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "d_values"
+    description: "1-D.  The backprop into values."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "d_default_value"
+    description: "0-D.  The backprop into default_value."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  summary: "The gradient of SparseFillEmptyRows."
+  description: "Takes vectors reverse_index_map, shaped `[N]`, and grad_values,\nshaped `[N_full]`, where `N_full >= N` and copies data into either\n`d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and\n`d_default_value` is a scalar.\n\n  d_values[j] = grad_values[reverse_index_map[j]]\n  d_default_value = sum_{k : 0 .. N_full - 1} (\n     grad_values[k] * 1{k not in reverse_index_map})"
+}
+op {
+  name: "SparseMatMul"
+  input_arg {
+    name: "a"
+    type_attr: "Ta"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "Tb"
+  }
+  output_arg {
+    name: "product"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "a_is_sparse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "b_is_sparse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "Ta"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tb"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  summary: "Multiply matrix \"a\" by matrix \"b\"."
+  description: "The inputs must be two-dimensional matrices and the inner dimension of \"a\" must\nmatch the outer dimension of \"b\". This op is optimized for the case where at\nleast one of \"a\" or \"b\" is sparse. The breakeven for using this versus a dense\nmatrix multiply on one platform was 30% zero values in the sparse matrix.\n\nThe gradient computation of this operation will only take advantage of sparsity\nin the input gradient when that gradient comes from a Relu."
+}
+op {
+  name: "SparseReduceMax"
+  input_arg {
+    name: "input_indices"
+    description: "2-D.  `N x R` matrix with the indices of non-empty values in a\nSparseTensor, possibly not in canonical ordering."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    description: "1-D.  `N` non-empty values corresponding to `input_indices`."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    description: "1-D.  Shape of the input SparseTensor."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    description: "1-D.  Length-`K` vector containing the reduction axes."
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    description: "`R-K`-D.  The reduced Tensor."
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If true, retain reduced dimensions with length 1."
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  summary: "Computes the max of elements across dimensions of a SparseTensor."
+  description: "This Op takes a SparseTensor and is the sparse counterpart to\n`tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`\ninstead of a sparse one.\n\nReduces `sp_input` along the dimensions given in `reduction_axes`.  Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained\nwith length 1.\n\nIf `reduction_axes` has no entries, all dimensions are reduced, and a tensor\nwith a single element is returned.  Additionally, the axes can be negative,\nwhich are interpreted according to the indexing rules in Python."
+}
+op {
+  name: "SparseReduceMaxSparse"
   input_arg {
-    name: "reverse_index_map"
-    description: "1-D.  The reverse index map from SparseFillEmptyRows."
+    name: "input_indices"
+    description: "2-D.  `N x R` matrix with the indices of non-empty values in a\nSparseTensor, possibly not in canonical ordering."
     type: DT_INT64
   }
   input_arg {
-    name: "grad_values"
-    description: "1-D.  The gradients from backprop."
-    type_attr: "T"
-  }
-  output_arg {
-    name: "d_values"
-    description: "1-D.  The backprop into values."
-    type_attr: "T"
-  }
-  output_arg {
-    name: "d_default_value"
-    description: "0-D.  The backprop into default_value."
+    name: "input_values"
+    description: "1-D.  `N` non-empty values corresponding to `input_indices`."
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  summary: "The gradient of SparseFillEmptyRows."
-  description: "Takes vectors reverse_index_map, shaped `[N]`, and grad_values,\nshaped `[N_full]`, where `N_full >= N` and copies data into either\n`d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and\n`d_default_value` is a scalar.\n\n  d_values[j] = grad_values[reverse_index_map[j]]\n  d_default_value = sum_{k : 0 .. N_full - 1} (\n     grad_values[k] * 1{k not in reverse_index_map})"
-}
-op {
-  name: "SparseMatMul"
   input_arg {
-    name: "a"
-    type_attr: "Ta"
+    name: "input_shape"
+    description: "1-D.  Shape of the input SparseTensor."
+    type: DT_INT64
   }
   input_arg {
-    name: "b"
-    type_attr: "Tb"
+    name: "reduction_axes"
+    description: "1-D.  Length-`K` vector containing the reduction axes."
+    type: DT_INT32
   }
   output_arg {
-    name: "product"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "transpose_a"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "output_indices"
+    type: DT_INT64
   }
-  attr {
-    name: "transpose_b"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
   }
-  attr {
-    name: "a_is_sparse"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
   }
   attr {
-    name: "b_is_sparse"
+    name: "keep_dims"
     type: "bool"
     default_value {
       b: false
     }
+    description: "If true, retain reduced dimensions with length 1."
   }
   attr {
-    name: "Ta"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_BFLOAT16
-      }
-    }
-  }
-  attr {
-    name: "Tb"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_BFLOAT16
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
-  summary: "Multiply matrix \"a\" by matrix \"b\"."
-  description: "The inputs must be two-dimensional matrices and the inner dimension of \"a\" must\nmatch the outer dimension of \"b\". This op is optimized for the case where at\nleast one of \"a\" or \"b\" is sparse. The breakeven for using this versus a dense\nmatrix multiply on one platform was 30% zero values in the sparse matrix.\n\nThe gradient computation of this operation will only take advantage of sparsity\nin the input gradient when that gradient comes from a Relu."
+  summary: "Computes the max of elements across dimensions of a SparseTensor."
+  description: "This Op takes a SparseTensor and is the sparse counterpart to\n`tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a\nSparseTensor.\n\nReduces `sp_input` along the dimensions given in `reduction_axes`.  Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained\nwith length 1.\n\nIf `reduction_axes` has no entries, all dimensions are reduced, and a tensor\nwith a single element is returned.  Additionally, the axes can be negative,\nwhich are interpreted according to the indexing rules in Python."
 }
 op {
   name: "SparseReduceSum"
@@ -25200,6 +26577,54 @@ op {
   summary: "Computes the sum along sparse segments of a tensor."
   description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nLike `SegmentSum`, but `segment_ids` can have rank less than `data`\'s first\ndimension, selecting a subset of dimension 0, specified by `indices`.\n\nFor example:\n\n```python\nc = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])\n\n# Select two rows, one segment.\ntf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))\n# => [[0 0 0 0]]\n\n# Select two rows, two segment.\ntf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))\n# => [[ 1  2  3  4]\n#     [-1 -2 -3 -4]]\n\n# Select all rows, two segments.\ntf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))\n# => [[0 0 0 0]\n#     [5 6 7 8]]\n\n# Which is equivalent to:\ntf.segment_sum(c, tf.constant([0, 0, 1]))\n```"
 }
+op {
+  name: "SparseSlice"
+  input_arg {
+    name: "indices"
+    description: "2-D tensor represents the indices of the sparse tensor."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "values"
+    description: "1-D tensor represents the values of the sparse tensor."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shape"
+    description: "1-D. tensor represents the shape of the sparse tensor."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "start"
+    description: "1-D. tensor represents the start of the slice."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "size"
+    description: "1-D. tensor represents the size of the slice.\noutput indices: A list of 1-D tensors represents the indices of the output\nsparse tensors."
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    description: "A list of 1-D tensors represents the values of the output sparse\ntensors."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_shape"
+    description: "A list of 1-D tensors represents the shape of the output sparse\ntensors."
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  summary: "Slice a `SparseTensor` based on the `start` and `size`."
+  description: "For example, if the input is\n\n    input_tensor = shape = [2, 7]\n    [    a   d e  ]\n    [b c          ]\n\nGraphically the output tensors are:\n\n    sparse_slice([0, 0], [2, 4]) = shape = [2, 4]\n    [    a  ]\n    [b c    ]\n\n    sparse_slice([0, 4], [2, 3]) = shape = [2, 3]\n    [ d e  ]\n    [      ]"
+}
 op {
   name: "SparseSoftmax"
   input_arg {
@@ -25961,7 +27386,7 @@ op {
       list {
       }
     }
-    description: "If specified, only squeezes the dimensions listed. The dimension\nindex starts at 0. It is an error to squeeze a dimension that is not 1."
+    description: "If specified, only squeezes the dimensions listed. The dimension\nindex starts at 0. It is an error to squeeze a dimension that is not 1. Must\nbe in the range `[-rank(input), rank(input))`."
     has_minimum: true
   }
   summary: "Removes dimensions of size 1 from the shape of a tensor."
@@ -25971,14 +27396,12 @@ op {
   name: "Stack"
   output_arg {
     name: "handle"
-    description: "The handle to the stack."
     type: DT_STRING
     is_ref: true
   }
   attr {
     name: "elem_type"
     type: "type"
-    description: "The type of the elements on the stack."
   }
   attr {
     name: "stack_name"
@@ -25986,29 +27409,53 @@ op {
     default_value {
       s: ""
     }
-    description: "Overrides the name used for the temporary stack resource. Default\nvalue is the name of the \'Stack\' op (which is guaranteed unique)."
   }
-  summary: "A stack that produces elements in first-in last-out order."
+  summary: "Deprecated, use StackV2."
   is_stateful: true
 }
 op {
   name: "StackClose"
   input_arg {
     name: "handle"
-    description: "The handle to a stack."
     type: DT_STRING
     is_ref: true
   }
+  summary: "Deprecated, use StackCloseV2."
+}
+op {
+  name: "StackCloseV2"
+  input_arg {
+    name: "handle"
+    description: "The handle to a stack."
+    type: DT_RESOURCE
+  }
   summary: "Delete the stack from its resource container."
+  is_stateful: true
 }
 op {
   name: "StackPop"
   input_arg {
     name: "handle"
-    description: "The handle to a stack."
     type: DT_STRING
     is_ref: true
   }
+  output_arg {
+    name: "elem"
+    type_attr: "elem_type"
+  }
+  attr {
+    name: "elem_type"
+    type: "type"
+  }
+  summary: "Deprecated, use StackPopV2."
+}
+op {
+  name: "StackPopV2"
+  input_arg {
+    name: "handle"
+    description: "The handle to a stack."
+    type: DT_RESOURCE
+  }
   output_arg {
     name: "elem"
     description: "The tensor that is popped from the top of the stack."
@@ -26020,15 +27467,43 @@ op {
     description: "The type of the elem that is popped."
   }
   summary: "Pop the element at the top of the stack."
+  is_stateful: true
 }
 op {
   name: "StackPush"
   input_arg {
     name: "handle"
-    description: "The handle to a stack."
     type: DT_STRING
     is_ref: true
   }
+  input_arg {
+    name: "elem"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "swap_memory"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  summary: "Deprecated, use StackPushV2."
+}
+op {
+  name: "StackPushV2"
+  input_arg {
+    name: "handle"
+    description: "The handle to a stack."
+    type: DT_RESOURCE
+  }
   input_arg {
     name: "elem"
     description: "The tensor to be pushed onto the stack."
@@ -26052,6 +27527,35 @@ op {
     description: "Swap `elem` to CPU. Default to false."
   }
   summary: "Push an element onto the stack."
+  is_stateful: true
+}
+op {
+  name: "StackV2"
+  input_arg {
+    name: "max_size"
+    description: "The maximum size of the stack if non-negative. If negative, the stack\nsize is unlimited."
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    description: "The handle to the stack."
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "elem_type"
+    type: "type"
+    description: "The type of the elements on the stack."
+  }
+  attr {
+    name: "stack_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "Overrides the name used for the temporary stack resource. Default\nvalue is the name of the \'Stack\' op (which is guaranteed unique)."
+  }
+  summary: "A stack that produces elements in first-in last-out order."
+  is_stateful: true
 }
 op {
   name: "Stage"
@@ -26872,7 +28376,7 @@ op {
   }
   input_arg {
     name: "reduction_indices"
-    description: "The dimensions to reduce."
+    description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`."
     type_attr: "Tidx"
   }
   output_arg {
@@ -27127,33 +28631,6 @@ op {
   summary: "A Reader that outputs the records from a TensorFlow Records file."
   is_stateful: true
 }
-op {
-  name: "LMDBReader"
-  output_arg {
-    name: "reader_handle"
-    description: "The handle to reference the Reader."
-    type: DT_STRING
-    is_ref: true
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-    description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used."
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-    description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
-  }
-  summary: "A Reader that outputs the records from a LMDB database."
-  is_stateful: true
-}
 op {
   name: "TakeDataset"
   input_arg {
@@ -28438,6 +29915,11 @@ op {
     description: "A scalar or a vector containing the name(s) of the file(s) to be\nread."
     type: DT_STRING
   }
+  input_arg {
+    name: "compression_type"
+    description: "A scalar containing either (i) the empty string (no\ncompression), (ii) \"ZLIB\", or (iii) \"GZIP\"."
+    type: DT_STRING
+  }
   output_arg {
     name: "handle"
     type: DT_RESOURCE
@@ -29129,7 +30611,7 @@ op {
     }
   }
   summary: "Computes the Max along segments of a tensor."
-  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nThis operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).\nInstead of computing the sum over segments, it computes the maximum\nsuch that:\n\n\\\\(output_i = \\max_j data_j\\\\) where max is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,\n `output[i] = numeric_limits<T>::min()`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/UnsortedSegmentSum.png\" alt>\n</div>"
+  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nThis operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).\nInstead of computing the sum over segments, it computes the maximum\nsuch that:\n\n\\\\(output_i = \\max_j data_j\\\\) where max is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,\n `output[i] = numeric_limits<T>::min()`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/UnsortedSegmentMax.png\" alt>\n</div>"
 }
 op {
   name: "UnsortedSegmentSum"
@@ -29382,7 +30864,8 @@ op {
     description: "scalar. The content to be written to the output file."
     type: DT_STRING
   }
-  summary: "Writes contents to the file at input filename. Creates file if not existing."
+  summary: "Writes contents to the file at input filename. Creates file and recursively"
+  description: "creates directory if not existing."
 }
 op {
   name: "ZerosLike"
diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc
index 22f87f5fdfee8f55b461108f7470aa7343c1ee49..2e605fdffcfbb2514a58af5b3f13adce95356e72 100644
--- a/tensorflow/core/ops/parsing_ops.cc
+++ b/tensorflow/core/ops/parsing_ops.cc
@@ -85,11 +85,10 @@ REGISTER_OP("ParseExample")
       }
 
       // Output dense_shapes.
-      TensorShapeProto shape_proto;
       for (int i = 0; i < attrs.num_dense; ++i) {
-        attrs.dense_shapes[i].AsProto(&shape_proto);
         ShapeHandle dense;
-        TF_RETURN_IF_ERROR(c->MakeShapeFromShapeProto(shape_proto, &dense));
+        TF_RETURN_IF_ERROR(
+            c->MakeShapeFromPartialTensorShape(attrs.dense_shapes[i], &dense));
         TF_RETURN_IF_ERROR(c->Concatenate(input, dense, &dense));
         c->set_output(output_idx++, dense);
       }
@@ -196,11 +195,10 @@ REGISTER_OP("ParseSingleSequenceExample")
       }
 
       // Output context_dense_shapes.
-      TensorShapeProto shape_proto;
       for (int i = 0; i < attrs.num_context_dense; ++i) {
-        attrs.context_dense_shapes[i].AsProto(&shape_proto);
         ShapeHandle s;
-        TF_RETURN_IF_ERROR(c->MakeShapeFromShapeProto(shape_proto, &s));
+        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
+            attrs.context_dense_shapes[i], &s));
         c->set_output(output_idx++, s);
       }
 
@@ -218,9 +216,9 @@ REGISTER_OP("ParseSingleSequenceExample")
 
       // Output feature_list_dense_shapes.
       for (int i = 0; i < attrs.num_feature_list_dense; ++i) {
-        attrs.feature_list_dense_shapes[i].AsProto(&shape_proto);
         ShapeHandle s;
-        TF_RETURN_IF_ERROR(c->MakeShapeFromShapeProto(shape_proto, &s));
+        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
+            attrs.feature_list_dense_shapes[i], &s));
         TF_RETURN_IF_ERROR(
             c->Concatenate(c->Vector(InferenceContext::kUnknownDim), s, &s));
         c->set_output(output_idx++, s);
diff --git a/tensorflow/core/ops/parsing_ops_test.cc b/tensorflow/core/ops/parsing_ops_test.cc
index 5c29e21d00b059949cd69a20092410532575bd9f..c6e521e33e98017ee7cfd96c88ee82d3d338967f 100644
--- a/tensorflow/core/ops/parsing_ops_test.cc
+++ b/tensorflow/core/ops/parsing_ops_test.cc
@@ -59,25 +59,24 @@ TEST(ParsingOpsTest, DecodeCSV_ShapeFn) {
   INFER_ERROR("Shape of a default must be", op, "?;[2];?");
 }
 
-static std::vector<TensorShapeProto> MakeDenseShapes(int size,
-                                                     bool add_extra_shape,
-                                                     int unknown_outer_dims) {
-  std::vector<TensorShapeProto> shapes(size);
+static std::vector<PartialTensorShape> MakeDenseShapes(int size,
+                                                       bool add_extra_shape,
+                                                       int unknown_outer_dims) {
+  std::vector<PartialTensorShape> shapes(size);
   for (int i = 0; i < size; ++i) {
     // Make shapes be the sequence [?,1]; [?,1,2], [?,1,2,3]...
     // where the number of prefixed ? depends on unknown_outer_dims.
     if (i == 0) {
+      shapes[i].Clear();
       for (int d = 0; d < unknown_outer_dims; ++d) {
-        shapes[i].add_dim()->set_size(-1);
+        shapes[i].AddDim(-1);
       }
     } else {
       shapes[i] = shapes[i - 1];
     }
-    shapes[i].add_dim()->set_size(i + 1);
-  }
-  if (add_extra_shape) {
-    shapes.resize(shapes.size() + 1);
+    shapes[i].AddDim(i + 1);
   }
+  if (add_extra_shape) shapes.push_back(PartialTensorShape({}));
   return shapes;
 }
 
diff --git a/tensorflow/core/ops/remote_fused_graph_ops.cc b/tensorflow/core/ops/remote_fused_graph_ops.cc
index 6e9f37a6152b50f0d2e6385125f4f7b51073033f..85370e648c4d43e9595ac16402eb99aa851382d1 100644
--- a/tensorflow/core/ops/remote_fused_graph_ops.cc
+++ b/tensorflow/core/ops/remote_fused_graph_ops.cc
@@ -19,19 +19,40 @@ limitations under the License.
 
 namespace tensorflow {
 
-// TODO(satok): Implement shape_inference
+namespace {
+using shape_inference::InferenceContext;
+
+Status RemoteFusedGraphExecuteShapeFn(InferenceContext* c) {
+  for (int i = 0; i < c->num_outputs(); ++i) {
+    c->set_output(i, c->UnknownShape());
+  }
+  return Status::OK();
+}
+}  // namespace
+
 REGISTER_OP("RemoteFusedGraphExecute")
     .Input("inputs: Tinputs")
     .Output("outputs: Toutputs")
     .Attr("Tinputs: list(type) >= 0")
     .Attr("Toutputs: list(type) >= 0")
     .Attr("serialized_remote_fused_graph_execute_info: string")
-    .SetShapeFn(shape_inference::UnknownShape)
+    .SetShapeFn(RemoteFusedGraphExecuteShapeFn)
     .Doc(R"doc(
-Execute a sub graph on a remote processor transferred by GraphTransferer.
-The graph specifications are serialized by protobuf as graph_transfer_info.
-The implementation / limitations may differ for each platform
-and each available peripheral.
+Execute a sub graph on a remote processor.
+
+The graph specifications(such as graph itself, input tensors and output names)
+are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
+as serialized_remote_fused_graph_execute_info.
+The specifications will be passed to a dedicated registered
+remote fused graph executor.  The executor will send the graph specifications
+to a remote processor and execute that graph.  The execution results
+will be passed to consumer nodes as outputs of this node.
+
+inputs: Arbitrary number of tensors with arbitrary data types
+outputs: Arbitrary number of tensors with arbitrary data types
+serialized_remote_fused_graph_execute_info: Serialized protocol buffer
+of RemoteFusedGraphExecuteInfo which contains graph specifications.
+
 )doc");
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/resource_variable_ops.cc b/tensorflow/core/ops/resource_variable_ops.cc
index 3b48559b1fc3f925b07fbd9c03c99e24d47b9105..034946f17adb2c8cab4aae256fce5fad54945c8b 100644
--- a/tensorflow/core/ops/resource_variable_ops.cc
+++ b/tensorflow/core/ops/resource_variable_ops.cc
@@ -68,10 +68,10 @@ REGISTER_OP("VarHandleOp")
       c->set_output(0, c->Scalar());
       DataType t;
       TF_RETURN_IF_ERROR(c->GetAttr("dtype", &t));
-      TensorShapeProto p;
+      PartialTensorShape p;
       TF_RETURN_IF_ERROR(c->GetAttr("shape", &p));
       ShapeHandle s;
-      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeProto(p, &s));
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(p, &s));
       c->set_output_handle_shapes_and_types(0,
                                             std::vector<ShapeAndType>{{s, t}});
 
diff --git a/tensorflow/core/ops/sparse_ops.cc b/tensorflow/core/ops/sparse_ops.cc
index 9722f0ee9aedfbbd0c763d70d71d21f11f07aba2..646c37958662b1791af6d54e914d20d058feef6c 100644
--- a/tensorflow/core/ops/sparse_ops.cc
+++ b/tensorflow/core/ops/sparse_ops.cc
@@ -597,6 +597,60 @@ output_shape: A list of 1-D tensors represents the shape of the output sparse
   tensors.
 )doc");
 
+REGISTER_OP("SparseSlice")
+    .Input("indices: int64")
+    .Input("values: T")
+    .Input("shape: int64")
+    .Input("start: int64")
+    .Input("size: int64")
+    .Output("output_indices: int64")
+    .Output("output_values: T")
+    .Output("output_shape: int64")
+    .Attr("T: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input_shape = c->input(2);
+      ShapeHandle output_indices =
+          c->Matrix(InferenceContext::kUnknownDim, c->NumElements(input_shape));
+      ShapeHandle output_values = c->Vector(InferenceContext::kUnknownDim);
+      ShapeHandle output_shape = input_shape;
+
+      c->set_output(0, output_indices);
+      c->set_output(1, output_values);
+      c->set_output(2, output_shape);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Slice a `SparseTensor` based on the `start` and `size`.
+
+For example, if the input is
+
+    input_tensor = shape = [2, 7]
+    [    a   d e  ]
+    [b c          ]
+
+Graphically the output tensors are:
+
+    sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
+    [    a  ]
+    [b c    ]
+
+    sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
+    [ d e  ]
+    [      ]
+
+indices: 2-D tensor represents the indices of the sparse tensor.
+values: 1-D tensor represents the values of the sparse tensor.
+shape: 1-D. tensor represents the shape of the sparse tensor.
+start: 1-D. tensor represents the start of the slice.
+size: 1-D. tensor represents the size of the slice.
+output indices: A list of 1-D tensors represents the indices of the output
+sparse tensors.
+output_values: A list of 1-D tensors represents the values of the output sparse
+  tensors.
+output_shape: A list of 1-D tensors represents the shape of the output sparse
+  tensors.
+)doc");
+
 REGISTER_OP("SparseReorder")
     .Input("input_indices: int64")
     .Input("input_values: T")
@@ -710,6 +764,75 @@ a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
 b: `ndims`-D Tensor.  With shape `a_shape`.
 )doc");
 
+REGISTER_OP("SparseReduceMax")
+    .Input("input_indices: int64")
+    .Input("input_values: T")
+    .Input("input_shape: int64")
+    .Input("reduction_axes: int32")
+    .Attr("keep_dims: bool = False")
+    .Output("output: T")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Computes the max of elements across dimensions of a SparseTensor.
+
+This Op takes a SparseTensor and is the sparse counterpart to
+`tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+instead of a sparse one.
+
+Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+with length 1.
+
+If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+with a single element is returned.  Additionally, the axes can be negative,
+which are interpreted according to the indexing rules in Python.
+
+input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+  SparseTensor, possibly not in canonical ordering.
+input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+input_shape: 1-D.  Shape of the input SparseTensor.
+reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+keep_dims: If true, retain reduced dimensions with length 1.
+output: `R-K`-D.  The reduced Tensor.
+)doc");
+
+REGISTER_OP("SparseReduceMaxSparse")
+    .Input("input_indices: int64")
+    .Input("input_values: T")
+    .Input("input_shape: int64")
+    .Input("reduction_axes: int32")
+    .Attr("keep_dims: bool = False")
+    .Output("output_indices: int64")
+    .Output("output_values: T")
+    .Output("output_shape: int64")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Computes the max of elements across dimensions of a SparseTensor.
+
+This Op takes a SparseTensor and is the sparse counterpart to
+`tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
+SparseTensor.
+
+Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+with length 1.
+
+If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+with a single element is returned.  Additionally, the axes can be negative,
+which are interpreted according to the indexing rules in Python.
+
+input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+  SparseTensor, possibly not in canonical ordering.
+input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+input_shape: 1-D.  Shape of the input SparseTensor.
+reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+keep_dims: If true, retain reduced dimensions with length 1.
+)doc");
+
 REGISTER_OP("SparseReduceSum")
     .Input("input_indices: int64")
     .Input("input_values: T")
@@ -793,7 +916,9 @@ keep_dims: If true, retain reduced dimensions with length 1.
         return Status::OK();                                     \
       })
 
-REGISTER_OP("SparseDenseCwiseMul").SPARSE_DENSE_CWISE_SIGNATURE().Doc(R"doc(
+REGISTER_OP("SparseDenseCwiseMul")
+    .SPARSE_DENSE_CWISE_SIGNATURE()
+    .Doc(R"doc(
 Component-wise multiplies a SparseTensor by a dense Tensor.
 
 The output locations corresponding to the implicitly zero elements in the sparse
@@ -811,7 +936,9 @@ dense: `R`-D.  The dense Tensor operand.
 output: 1-D.  The `N` values that are operated on.
 )doc");
 
-REGISTER_OP("SparseDenseCwiseDiv").SPARSE_DENSE_CWISE_SIGNATURE().Doc(R"doc(
+REGISTER_OP("SparseDenseCwiseDiv")
+    .SPARSE_DENSE_CWISE_SIGNATURE()
+    .Doc(R"doc(
 Component-wise divides a SparseTensor by a dense Tensor.
 
 *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
@@ -825,7 +952,9 @@ dense: `R`-D.  The dense Tensor operand.
 output: 1-D.  The `N` values that are operated on.
 )doc");
 
-REGISTER_OP("SparseDenseCwiseAdd").SPARSE_DENSE_CWISE_SIGNATURE().Doc(R"doc(
+REGISTER_OP("SparseDenseCwiseAdd")
+    .SPARSE_DENSE_CWISE_SIGNATURE()
+    .Doc(R"doc(
 Adds up a SparseTensor and a dense Tensor, using these special rules:
 
 (1) Broadcasts the dense side to have the same shape as the sparse side, if
diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc
index 0890d5fc7c77ac4e930f69680345d17ef9bff364..7cf5dfcca83dc0d8776d5b5fd9ec38baa1960765 100644
--- a/tensorflow/core/ops/state_ops.cc
+++ b/tensorflow/core/ops/state_ops.cc
@@ -18,7 +18,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
@@ -30,11 +29,11 @@ REGISTER_OP("VariableV2")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
     .SetShapeFn([](InferenceContext* c) {
-      TensorShapeProto shape_proto;
-      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape_proto));
+      PartialTensorShape shape;
+      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
       ShapeHandle output_shape;
       TF_RETURN_IF_ERROR(
-          c->MakeShapeFromShapeProto(shape_proto, &output_shape));
+          c->MakeShapeFromPartialTensorShape(shape, &output_shape));
       c->set_output(0, output_shape);
       return Status::OK();
     })
@@ -72,10 +71,8 @@ REGISTER_OP("Variable")
         return shape_inference::UnknownShape(c);
       }
 
-      TensorShapeProto shape_proto;
-      shape.AsProto(&shape_proto);
       ShapeHandle out;
-      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeProto(shape_proto, &out));
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &out));
       c->set_output(0, out);
       return Status::OK();
     })
@@ -103,10 +100,10 @@ REGISTER_OP("TemporaryVariable")
     .Attr("var_name: string = ''")
     .SetIsStateful()
     .SetShapeFn([](InferenceContext* c) {
-      TensorShapeProto shape_proto;
-      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape_proto));
+      PartialTensorShape shape;
+      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
       ShapeHandle output;
-      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeProto(shape_proto, &output));
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &output));
       c->set_output(0, output);
       return Status::OK();
     })
@@ -472,63 +469,6 @@ use_locking: If True, the operation will be protected by a lock;
   otherwise the behavior is undefined, but may exhibit less contention.
 )doc");
 
-namespace {
-
-Status ScatterNdUpdateShape(InferenceContext* c) {
-  ShapeHandle ref_shape = c->input(0);
-  ShapeHandle indices_shape;
-  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &indices_shape));
-  ShapeHandle updates_shape;
-  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(2), 1, &updates_shape));
-
-  if (c->RankKnown(indices_shape) && c->RankKnown(updates_shape)) {
-    const int64 outer_dims = c->Rank(indices_shape) - 1;
-    const DimensionHandle ixdim = c->Dim(indices_shape, -1);
-
-    // We can only do more validation if the last dimension of indices
-    // is a known value.
-    if (c->ValueKnown(ixdim)) {
-      int64 ix = c->Value(ixdim);
-      ShapeHandle unused;
-      ShapeHandle prefix_indices;
-      TF_RETURN_IF_ERROR(
-          c->Subshape(indices_shape, 0, outer_dims, &prefix_indices));
-      ShapeHandle prefix_updates;
-      TF_RETURN_IF_ERROR(
-          c->Subshape(updates_shape, 0, outer_dims, &prefix_updates));
-
-      Status s = c->Merge(prefix_indices, prefix_updates, &unused);
-      if (!s.ok()) {
-        return errors::InvalidArgument(
-            "The outer ", outer_dims, " dimensions of indices.shape=",
-            c->DebugString(indices_shape), "must match the outer ", outer_dims,
-            " dimensions of updates.shape=", c->DebugString(updates_shape),
-            ": ", s.error_message());
-      }
-
-      ShapeHandle suffix_ref;
-      TF_RETURN_IF_ERROR(c->Subshape(ref_shape, ix, &suffix_ref));
-      ShapeHandle suffix_updates;
-      TF_RETURN_IF_ERROR(
-          c->Subshape(updates_shape, outer_dims, &suffix_updates));
-      s = c->Merge(suffix_ref, suffix_updates, &unused);
-      if (!s.ok()) {
-        return errors::InvalidArgument(
-            "The inner ", c->Rank(ref_shape) - ix, " dimensions of ref.shape=",
-            c->DebugString(ref_shape), "must match the inner ",
-            c->Rank(updates_shape) - outer_dims,
-            " dimensions of updates.shape=", c->DebugString(updates_shape),
-            ": ", s.error_message());
-      }
-    }
-  }
-
-  c->set_output(0, ref_shape);
-  return Status::OK();
-}
-
-}  // namespace
-
 REGISTER_OP("ScatterNdUpdate")
     .Input("ref: Ref(T)")
     .Input("indices: Tindices")
@@ -537,7 +477,7 @@ REGISTER_OP("ScatterNdUpdate")
     .Attr("T: type")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = true")
-    .SetShapeFn(ScatterNdUpdateShape)
+    .SetShapeFn(shape_inference::ScatterNdUpdateShape)
     .Doc(R"doc(
 Applies sparse `updates` to individual values or slices within a given
 variable according to `indices`.
@@ -573,7 +513,7 @@ The resulting update to ref would look like this:
 
     [1, 11, 3, 10, 9, 6, 7, 12]
 
-See [tf.scatter_nd](#scatter_nd) for more details about how to make updates to
+See @{tf.scatter_nd} for more details about how to make updates to
 slices.
 
 ref: A mutable Tensor. Should be from a Variable node.
@@ -596,7 +536,7 @@ REGISTER_OP("ScatterNdAdd")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = false")
-    .SetShapeFn(ScatterNdUpdateShape)
+    .SetShapeFn(shape_inference::ScatterNdUpdateShape)
     .Doc(R"doc(
 Applies sparse addition between `updates` and individual values or slices
 within a given variable according to `indices`.
@@ -630,7 +570,7 @@ The resulting update to ref would look like this:
 
     [1, 13, 3, 14, 14, 6, 7, 20]
 
-See [tf.scatter_nd](#scatter_nd) for more details about how to make updates to
+See @{tf.scatter_nd} for more details about how to make updates to
 slices.
 
 ref: A mutable Tensor. Should be from a Variable node.
@@ -653,7 +593,7 @@ REGISTER_OP("ScatterNdSub")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = false")
-    .SetShapeFn(ScatterNdUpdateShape)
+    .SetShapeFn(shape_inference::ScatterNdUpdateShape)
     .Doc(R"doc(
 Applies sparse subtraction between `updates` and individual values or slices
 within a given variable according to `indices`.
@@ -687,7 +627,7 @@ The resulting update to ref would look like this:
 
     [1, -9, 3, -6, -4, 6, 7, -4]
 
-See [tf.scatter_nd](#scatter_nd) for more details about how to make updates to
+See @{tf.scatter_nd} for more details about how to make updates to
 slices.
 
 ref: A mutable Tensor. Should be from a Variable node.
@@ -713,7 +653,7 @@ output_ref: Same as ref. Returned as a convenience for operations that want
 //     .Attr("T: numbertype")
 //     .Attr("Tindices: {int32, int64}")
 //     .Attr("use_locking: bool = false")
-//     .SetShapeFn(ScatterNdUpdateShape)
+//     .SetShapeFn(shape_inference::ScatterNdUpdateShape)
 //     .Doc(
 //         R"doc(Applies sparse subtraction between `updates` and individual
 //         values or slices within a given variable according to `indices`.
@@ -747,7 +687,7 @@ output_ref: Same as ref. Returned as a convenience for operations that want
 
 //     [1, 22, 3, 40, 45, 6, 7, 96]
 
-// See [tf.scatter_nd](#scatter_nd) for more details about how to make updates
+// See @{tf.scatter_nd} for more details about how to make updates
 // to slices.
 
 // ref: A mutable Tensor. Should be from a Variable node.
@@ -769,7 +709,7 @@ output_ref: Same as ref. Returned as a convenience for operations that want
 //     .Attr("T: numbertype")
 //     .Attr("Tindices: {int32, int64}")
 //     .Attr("use_locking: bool = false")
-//     .SetShapeFn(ScatterNdUpdateShape)
+//     .SetShapeFn(shape_inference::ScatterNdUpdateShape)
 //     .Doc(
 //         R"doc(Applies sparse subtraction between `updates` and individual
 //         values or slices within a given variable according to `indices`.
@@ -803,7 +743,7 @@ output_ref: Same as ref. Returned as a convenience for operations that want
 
 //     [10, 5, 30, 13, 25, 60, 70, 16]
 
-// See [tf.scatter_nd](#scatter_nd) for more details about how to make updates
+// See @{tf.scatter_nd} for more details about how to make updates
 // to slices.
 
 // ref: A mutable Tensor. Should be from a Variable node.
diff --git a/tensorflow/core/ops/state_ops_test.cc b/tensorflow/core/ops/state_ops_test.cc
index bcc1c924937ec0aa0998f6788c49e2cfab9d49fc..6d05dd0b96c3c5b50b342fe3b97b97390c940fed 100644
--- a/tensorflow/core/ops/state_ops_test.cc
+++ b/tensorflow/core/ops/state_ops_test.cc
@@ -63,49 +63,41 @@ TEST(StateOpsTest, ScatterUpdate_ShapeFn) {
 TEST(StateOpsTest, TemporaryVariable_ShapeFn) {
   ShapeInferenceTestOp op("TemporaryVariable");
   TensorShape shape({1, 2, 3});
-  TensorShapeProto shape_proto;
-  shape.AsProto(&shape_proto);
   TF_ASSERT_OK(NodeDefBuilder("test", "TemporaryVariable")
-                   .Attr("shape", shape_proto)
+                   .Attr("shape", shape)
                    .Finalize(&op.node_def));
   INFER_OK(op, "", "[1,2,3]");
 }
 
 TEST(StateOpsTest, Variable_ShapeFn) {
   ShapeInferenceTestOp op("Variable");
-  TensorShapeProto shape_proto;
 
   // Unknown rank.
-  PartialTensorShape().AsProto(&shape_proto);
   TF_ASSERT_OK(NodeDefBuilder("test", "Variable")
-                   .Attr("shape", shape_proto)
+                   .Attr("shape", PartialTensorShape())
                    .Finalize(&op.node_def));
   INFER_OK(op, "", "?");
 
   // For historical reasons an empty TensorShapeProto can be either an unknown
   // rank or a scalar, so the shape function conservatively says "unknown"
-  shape_proto.Clear();
   TF_ASSERT_OK(NodeDefBuilder("test", "Variable")
-                   .Attr("shape", shape_proto)
+                   .Attr("shape", TensorShape({}))
                    .Finalize(&op.node_def));
   INFER_OK(op, "", "?");
 
   // Specified shape.
-  TensorShape({1, 2, 3}).AsProto(&shape_proto);
   TF_ASSERT_OK(NodeDefBuilder("test", "Variable")
-                   .Attr("shape", shape_proto)
+                   .Attr("shape", TensorShape({1, 2, 3}))
                    .Finalize(&op.node_def));
   INFER_OK(op, "", "[1,2,3]");
 }
 
 TEST(StateOpsTest, VariableV2_ShapeFn) {
   ShapeInferenceTestOp op("VariableV2");
-  TensorShapeProto shape_proto;
 
   // Unknown rank.
-  shape_proto.set_unknown_rank(true);
   TF_ASSERT_OK(NodeDefBuilder("test", "VariableV2")
-                   .Attr("shape", shape_proto)
+                   .Attr("shape", PartialTensorShape())
                    .Finalize(&op.node_def));
   INFER_OK(op, "", "?");
 
@@ -116,9 +108,8 @@ TEST(StateOpsTest, VariableV2_ShapeFn) {
   INFER_OK(op, "", "[]");
 
   // Specified shape.
-  TensorShape({1, 2, 3}).AsProto(&shape_proto);
   TF_ASSERT_OK(NodeDefBuilder("test", "VariableV2")
-                   .Attr("shape", shape_proto)
+                   .Attr("shape", TensorShape({1, 2, 3}))
                    .Finalize(&op.node_def));
   INFER_OK(op, "", "[1,2,3]");
 }
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index 5bb93daea23be9ec164ea3c51a5249f06d181fb5..f77817fbfc54c92823f86d5b502716ab12c35013 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -983,6 +983,178 @@ use_locking: If `True`, updating of the var and accum tensors will be protected
   contention.
 )doc");
 
+REGISTER_OP("ApplyFtrlV2")
+    .Input("var: Ref(T)")
+    .Input("accum: Ref(T)")
+    .Input("linear: Ref(T)")
+    .Input("grad: T")
+    .Input("lr: T")
+    .Input("l1: T")
+    .Input("l2: T")
+    .Input("l2_shrinkage: T")
+    .Input("lr_power: T")
+    .Output("out: Ref(T)")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyFtrlShapeFn(c, false /* sparse */);
+    })
+    .Doc(R"doc(
+Update '*var' according to the Ftrl-proximal scheme.
+
+grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+linear += grad_with_shrinkage +
+    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+accum = accum_new
+
+var: Should be from a Variable().
+accum: Should be from a Variable().
+linear: Should be from a Variable().
+grad: The gradient.
+lr: Scaling factor. Must be a scalar.
+l1: L1 regulariation. Must be a scalar.
+l2: online L2 regulariation. Must be a scalar.
+l2: L2 shrinkage regulariation. Must be a scalar.
+lr_power: Scaling factor. Must be a scalar.
+out: Same as "var".
+use_locking: If `True`, updating of the var and accum tensors will be protected
+  by a lock; otherwise the behavior is undefined, but may exhibit less
+  contention.
+)doc");
+
+REGISTER_OP("SparseApplyFtrlV2")
+    .Input("var: Ref(T)")
+    .Input("accum: Ref(T)")
+    .Input("linear: Ref(T)")
+    .Input("grad: T")
+    .Input("indices: Tindices")
+    .Input("lr: T")
+    .Input("l1: T")
+    .Input("l2: T")
+    .Input("l2_shrinkage: T")
+    .Input("lr_power: T")
+    .Output("out: Ref(T)")
+    .Attr("T: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyFtrlShapeFn(c, true /* sparse */);
+    })
+    .Doc(R"doc(
+Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+
+That is for rows we have grad for, we update var, accum and linear as follows:
+grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+linear += grad_with_shrinkage +
+    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+accum = accum_new
+
+var: Should be from a Variable().
+accum: Should be from a Variable().
+linear: Should be from a Variable().
+grad: The gradient.
+indices: A vector of indices into the first dimension of var and accum.
+lr: Scaling factor. Must be a scalar.
+l1: L1 regularization. Must be a scalar.
+l2: onine L2 regularization. Must be a scalar.
+l2: L2 shrinkage regulariation. Must be a scalar.
+lr_power: Scaling factor. Must be a scalar.
+out: Same as "var".
+use_locking: If `True`, updating of the var and accum tensors will be protected
+  by a lock; otherwise the behavior is undefined, but may exhibit less
+  contention.
+)doc");
+
+REGISTER_OP("ResourceApplyFtrlV2")
+    .Input("var: resource")
+    .Input("accum: resource")
+    .Input("linear: resource")
+    .Input("grad: T")
+    .Input("lr: T")
+    .Input("l1: T")
+    .Input("l2: T")
+    .Input("l2_shrinkage: T")
+    .Input("lr_power: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyFtrlShapeFn(c, false /* sparse */);
+    })
+    .Doc(R"doc(
+Update '*var' according to the Ftrl-proximal scheme.
+
+grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+linear += grad_with_shrinkage +
+    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+accum = accum_new
+
+var: Should be from a Variable().
+accum: Should be from a Variable().
+linear: Should be from a Variable().
+grad: The gradient.
+lr: Scaling factor. Must be a scalar.
+l1: L1 regulariation. Must be a scalar.
+l2: onine L2 regularization. Must be a scalar.
+l2: L2 shrinkage regulariation. Must be a scalar.
+lr_power: Scaling factor. Must be a scalar.
+use_locking: If `True`, updating of the var and accum tensors will be protected
+  by a lock; otherwise the behavior is undefined, but may exhibit less
+  contention.
+)doc");
+
+REGISTER_OP("ResourceSparseApplyFtrlV2")
+    .Input("var: resource")
+    .Input("accum: resource")
+    .Input("linear: resource")
+    .Input("grad: T")
+    .Input("indices: Tindices")
+    .Input("lr: T")
+    .Input("l1: T")
+    .Input("l2: T")
+    .Input("l2_shrinkage: T")
+    .Input("lr_power: T")
+    .Attr("T: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyFtrlShapeFn(c, true /* sparse */);
+    })
+    .Doc(R"doc(
+Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+
+That is for rows we have grad for, we update var, accum and linear as follows:
+grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+linear += grad_with_shrinkage +
+    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+accum = accum_new
+
+var: Should be from a Variable().
+accum: Should be from a Variable().
+linear: Should be from a Variable().
+grad: The gradient.
+indices: A vector of indices into the first dimension of var and accum.
+lr: Scaling factor. Must be a scalar.
+l1: L1 regularization. Must be a scalar.
+l2: onine L2 regularization. Must be a scalar.
+l2: L2 shrinkage regulariation. Must be a scalar.
+lr_power: Scaling factor. Must be a scalar.
+use_locking: If `True`, updating of the var and accum tensors will be protected
+  by a lock; otherwise the behavior is undefined, but may exhibit less
+  contention.
+)doc");
+
 static Status ApplyMomentumShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
   ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD
index 4df88b8d37439ceebb529c07db2c0dc7299d268a..67cd1bb2c6914aa8821dc14ba5dfa8802ae726c5 100644
--- a/tensorflow/core/platform/cloud/BUILD
+++ b/tensorflow/core/platform/cloud/BUILD
@@ -24,6 +24,14 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
+cc_library(
+    name = "file_block_cache",
+    srcs = ["file_block_cache.cc"],
+    hdrs = ["file_block_cache.h"],
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = ["//tensorflow/core:lib"],
+)
+
 cc_library(
     name = "gcs_file_system",
     srcs = [
@@ -35,6 +43,7 @@ cc_library(
     linkstatic = 1,  # Needed since alwayslink is broken in bazel b/27630669
     visibility = ["//visibility:public"],
     deps = [
+        ":file_block_cache",
         ":google_auth_provider",
         ":http_request",
         ":retrying_file_system",
@@ -155,6 +164,18 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "file_block_cache_test",
+    size = "small",
+    srcs = ["file_block_cache_test.cc"],
+    deps = [
+        ":file_block_cache",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 tf_cc_test(
     name = "gcs_file_system_test",
     size = "small",
diff --git a/tensorflow/core/platform/cloud/file_block_cache.cc b/tensorflow/core/platform/cloud/file_block_cache.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e4970a4188ef07b47d9da633fa1e5baba1c211ec
--- /dev/null
+++ b/tensorflow/core/platform/cloud/file_block_cache.cc
@@ -0,0 +1,163 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/cloud/file_block_cache.h"
+#include <cstring>
+#include <memory>
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+
+Status FileBlockCache::Read(const string& filename, size_t offset, size_t n,
+                            std::vector<char>* out) {
+  out->clear();
+  if (n == 0) {
+    return Status::OK();
+  }
+  if (block_size_ == 0 || max_bytes_ == 0) {
+    // The cache is effectively disabled, so we pass the read through to the
+    // fetcher without breaking it up into blocks.
+    return block_fetcher_(filename, offset, n, out);
+  }
+  // Calculate the block-aligned start and end of the read.
+  size_t start = block_size_ * (offset / block_size_);
+  size_t finish = block_size_ * ((offset + n) / block_size_);
+  if (finish < offset + n) {
+    finish += block_size_;
+  }
+  mutex_lock lock(mu_);
+  // Now iterate through the blocks, reading them one at a time. Reads are
+  // locked so that only one block_fetcher call is active at any given time.
+  for (size_t pos = start; pos < finish; pos += block_size_) {
+    Key key = std::make_pair(filename, pos);
+    auto entry = block_map_.find(key);
+    // If we're enforcing max staleness and the block is stale, remove all of
+    // the file's cached blocks so we reload them.
+    if (entry != block_map_.end() && max_staleness_ > 0 &&
+        env_->NowSeconds() - entry->second->timestamp > max_staleness_) {
+      RemoveFile_Locked(filename);
+      entry = block_map_.end();
+    }
+    if (entry == block_map_.end()) {
+      // We need to fetch the block from the remote filesystem. Trim the LRU
+      // cache if needed - we do this up front in order to avoid any period of
+      // time during which the cache size exceeds its desired limit. The
+      // tradeoff is that if the fetcher fails, the cache may evict blocks
+      // prematurely.
+      while (!lru_list_.empty() && cache_size_ + block_size_ > max_bytes_) {
+        RemoveBlock(block_map_.find(lru_list_.back()));
+      }
+      std::unique_ptr<Block> block(new Block);
+      TF_RETURN_IF_ERROR(
+          block_fetcher_(filename, pos, block_size_, &block->data));
+      // Sanity check to detect interrupted reads leading to partial blocks: a
+      // partial block must have a higher key than the highest existing key in
+      // the block map for the file.
+      if (block->data.size() < block_size_ && !block_map_.empty()) {
+        Key fmax = std::make_pair(filename, std::numeric_limits<size_t>::max());
+        auto fcmp = block_map_.upper_bound(fmax);
+        if (fcmp != block_map_.begin() && key < (--fcmp)->first) {
+          // We expected to read a full block at this position.
+          return errors::Internal("File contents are inconsistent");
+        }
+      }
+      // Record the block timestamp, update the cache size, and add the block to
+      // the cache.
+      block->timestamp = env_->NowSeconds();
+      lra_list_.push_front(key);
+      block->lra_iterator = lra_list_.begin();
+      cache_size_ += block->data.size();
+      entry = block_map_.emplace(std::make_pair(key, std::move(block))).first;
+    } else {
+      // Cache hit. Remove the block from the LRU list at its prior location.
+      lru_list_.erase(entry->second->lru_iterator);
+    }
+    // Push the block to the front of the LRU list.
+    lru_list_.push_front(key);
+    entry->second->lru_iterator = lru_list_.begin();
+    // Copy the relevant portion of the block into the result buffer.
+    const auto& data = entry->second->data;
+    if (offset >= pos + data.size()) {
+      // The requested offset is at or beyond the end of the file. This can
+      // happen if `offset` is not block-aligned, and the read returns the last
+      // block in the file, which does not extend all the way out to `offset`.
+      return errors::OutOfRange("EOF at offset ", offset);
+    }
+    auto begin = data.begin();
+    if (offset > pos) {
+      // The block begins before the slice we're reading.
+      begin += offset - pos;
+    }
+    auto end = data.end();
+    if (pos + data.size() > offset + n) {
+      // The block extends past the end of the slice we're reading.
+      end -= (pos + data.size()) - (offset + n);
+    }
+    if (begin < end) {
+      out->insert(out->end(), begin, end);
+    }
+    if (data.size() < block_size_) {
+      // The block was a partial block and thus signals EOF at its upper bound.
+      break;
+    }
+  }
+  return Status::OK();
+}
+
+size_t FileBlockCache::CacheSize() const {
+  mutex_lock lock(mu_);
+  return cache_size_;
+}
+
+void FileBlockCache::Prune() {
+  while (!WaitForNotificationWithTimeout(&stop_pruning_thread_, 1000000)) {
+    mutex_lock lock(mu_);
+    uint64 now = env_->NowSeconds();
+    while (!lra_list_.empty()) {
+      auto it = block_map_.find(lra_list_.back());
+      if (now - it->second->timestamp <= max_staleness_) {
+        // The oldest block is not yet expired. Come back later.
+        break;
+      }
+      // We need to make a copy of the filename here, since it could otherwise
+      // be used within RemoveFile_Locked after `it` is deleted.
+      RemoveFile_Locked(std::string(it->first.first));
+    }
+  }
+}
+
+void FileBlockCache::RemoveFile(const string& filename) {
+  mutex_lock lock(mu_);
+  RemoveFile_Locked(filename);
+}
+
+void FileBlockCache::RemoveFile_Locked(const string& filename) {
+  Key begin = std::make_pair(filename, 0);
+  auto it = block_map_.lower_bound(begin);
+  while (it != block_map_.end() && it->first.first == filename) {
+    auto next = std::next(it);
+    RemoveBlock(it);
+    it = next;
+  }
+}
+
+void FileBlockCache::RemoveBlock(BlockMap::iterator entry) {
+  lru_list_.erase(entry->second->lru_iterator);
+  lra_list_.erase(entry->second->lra_iterator);
+  cache_size_ -= entry->second->data.size();
+  block_map_.erase(entry);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/file_block_cache.h b/tensorflow/core/platform/cloud/file_block_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..0429228a2bac27240607ebb951f5eed4ddec92aa
--- /dev/null
+++ b/tensorflow/core/platform/cloud/file_block_cache.h
@@ -0,0 +1,175 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_FILE_BLOCK_CACHE_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_FILE_BLOCK_CACHE_H_
+
+#include <functional>
+#include <list>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/notification.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+/// \brief An LRU block cache of file contents, keyed by {filename, offset}.
+///
+/// This class should be shared by read-only random access files on a remote
+/// filesystem (e.g. GCS).
+class FileBlockCache {
+ public:
+  /// The callback executed when a block is not found in the cache, and needs to
+  /// be fetched from the backing filesystem. This callback is provided when the
+  /// cache is constructed. The returned Status should be OK as long as the
+  /// read from the remote filesystem succeeded (similar to the semantics of the
+  /// read(2) system call).
+  typedef std::function<Status(const string&, size_t, size_t,
+                               std::vector<char>*)>
+      BlockFetcher;
+
+  FileBlockCache(size_t block_size, size_t max_bytes, uint64 max_staleness,
+                 BlockFetcher block_fetcher, Env* env = Env::Default())
+      : block_size_(block_size),
+        max_bytes_(max_bytes),
+        max_staleness_(max_staleness),
+        block_fetcher_(block_fetcher),
+        env_(env) {
+    if (max_staleness_ > 0) {
+      pruning_thread_.reset(env_->StartThread(ThreadOptions(), "TF_prune_FBC",
+                                              [this] { Prune(); }));
+    }
+  }
+
+  ~FileBlockCache() {
+    if (pruning_thread_) {
+      stop_pruning_thread_.Notify();
+      // Destroying pruning_thread_ will block until Prune() receives the above
+      // notification and returns.
+      pruning_thread_.reset();
+    }
+  }
+
+  /// Read `n` bytes from `filename` starting at `offset` into `out`. This
+  /// method will return:
+  ///
+  /// 1) The error from the remote filesystem, if the read from the remote
+  ///    filesystem failed.
+  /// 2) PRECONDITION_FAILED if the read from the remote filesystem succeeded,
+  ///    but the read returned a partial block, and the LRU cache contained a
+  ///    block at a higher offset (indicating that the partial block should have
+  ///    been a full block).
+  /// 3) OUT_OF_RANGE if the read from the remote filesystem succeeded, but
+  ///    the file contents do not extend past `offset` and thus nothing was
+  ///    placed in `out`.
+  /// 4) OK otherwise (i.e. the read succeeded, and at least one byte was placed
+  ///    in `out`).
+  Status Read(const string& filename, size_t offset, size_t n,
+              std::vector<char>* out);
+
+  /// Remove all cached blocks for `filename`.
+  void RemoveFile(const string& filename) LOCKS_EXCLUDED(mu_);
+
+  /// Accessors for cache parameters.
+  size_t block_size() const { return block_size_; }
+  size_t max_bytes() const { return max_bytes_; }
+  uint64 max_staleness() const { return max_staleness_; }
+
+  /// The current size (in bytes) of the cache.
+  size_t CacheSize() const LOCKS_EXCLUDED(mu_);
+
+ private:
+  /// The size of the blocks stored in the LRU cache, as well as the size of the
+  /// reads from the underlying filesystem.
+  const size_t block_size_;
+  /// The maximum number of bytes (sum of block sizes) allowed in the LRU cache.
+  const size_t max_bytes_;
+  /// The maximum staleness of any block in the LRU cache, in seconds.
+  const uint64 max_staleness_;
+  /// The callback to read a block from the underlying filesystem.
+  const BlockFetcher block_fetcher_;
+  /// The Env from which we read timestamps.
+  Env* const env_;  // not owned
+
+  /// \brief The key type for the file block cache.
+  ///
+  /// The file block cache key is a {filename, offset} pair.
+  typedef std::pair<string, size_t> Key;
+
+  /// \brief A block of a file.
+  ///
+  /// A file block consists of the block data, the block's current position in
+  /// the LRU cache, and the timestamp (seconds since epoch) at which the block
+  /// was cached.
+  struct Block {
+    /// The block data.
+    std::vector<char> data;
+    /// A list iterator pointing to the block's position in the LRU list.
+    std::list<Key>::iterator lru_iterator;
+    /// A list iterator pointing to the block's position in the LRA list.
+    std::list<Key>::iterator lra_iterator;
+    /// The timestamp (seconds since epoch) at which the block was cached.
+    uint64 timestamp;
+  };
+
+  /// \brief The block map type for the file block cache.
+  ///
+  /// The block map is an ordered map from Key to Block.
+  typedef std::map<Key, std::unique_ptr<Block>> BlockMap;
+
+  /// Prune the cache by removing files with expired blocks.
+  void Prune() LOCKS_EXCLUDED(mu_);
+
+  /// Remove all blocks of a file, with mu_ already held.
+  void RemoveFile_Locked(const string& filename) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  /// Remove the block `entry` from the block map and LRU list, and update the
+  /// cache size accordingly.
+  void RemoveBlock(BlockMap::iterator entry) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  /// The cache pruning thread that removes files with expired blocks.
+  std::unique_ptr<Thread> pruning_thread_;
+
+  /// Notification for stopping the cache pruning thread.
+  Notification stop_pruning_thread_;
+
+  /// Guards access to the block map, LRU list, and cached byte count.
+  mutable mutex mu_;
+
+  /// The block map (map from Key to Block).
+  BlockMap block_map_ GUARDED_BY(mu_);
+
+  /// The LRU list of block keys. The front of the list identifies the most
+  /// recently accessed block.
+  std::list<Key> lru_list_ GUARDED_BY(mu_);
+
+  /// The LRA (least recently added) list of block keys. The front of the list
+  /// identifies the most recently added block.
+  std::list<Key> lra_list_ GUARDED_BY(mu_);
+
+  /// The combined number of bytes in all of the cached blocks.
+  size_t cache_size_ GUARDED_BY(mu_) = 0;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_FILE_BLOCK_CACHE_H_
diff --git a/tensorflow/core/platform/cloud/file_block_cache_test.cc b/tensorflow/core/platform/cloud/file_block_cache_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d01181daebaee1d32b0cb8efd0142174c182eece
--- /dev/null
+++ b/tensorflow/core/platform/cloud/file_block_cache_test.cc
@@ -0,0 +1,422 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/cloud/file_block_cache.h"
+#include <cstring>
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+// This Env wrapper lets us control the NowSeconds() return value.
+class FakeEnv : public EnvWrapper {
+ public:
+  FakeEnv() : EnvWrapper(Env::Default()) {}
+
+  uint64 NowSeconds() override {
+    mutex_lock lock(mu_);
+    return now_;
+  }
+
+  void SetNowSeconds(uint64 now) {
+    mutex_lock lock(mu_);
+    now_ = now;
+  }
+
+  mutex mu_;
+  uint64 now_ = 1;
+};
+
+TEST(FileBlockCacheTest, PassThrough) {
+  const string want_filename = "foo/bar";
+  const size_t want_offset = 42;
+  const size_t want_n = 1024;
+  int calls = 0;
+  auto fetcher = [&calls, want_filename, want_offset, want_n](
+                     const string& got_filename, size_t got_offset,
+                     size_t got_n, std::vector<char>* out) {
+    EXPECT_EQ(got_filename, want_filename);
+    EXPECT_EQ(got_offset, want_offset);
+    EXPECT_EQ(got_n, want_n);
+    calls++;
+    out->resize(got_n, 'x');
+    return Status::OK();
+  };
+  // If block_size, max_bytes, or both are zero, the cache is a pass-through.
+  FileBlockCache cache1(1, 0, 0, fetcher);
+  FileBlockCache cache2(0, 1, 0, fetcher);
+  FileBlockCache cache3(0, 0, 0, fetcher);
+  std::vector<char> out;
+  TF_EXPECT_OK(cache1.Read(want_filename, want_offset, want_n, &out));
+  EXPECT_EQ(calls, 1);
+  TF_EXPECT_OK(cache2.Read(want_filename, want_offset, want_n, &out));
+  EXPECT_EQ(calls, 2);
+  TF_EXPECT_OK(cache3.Read(want_filename, want_offset, want_n, &out));
+  EXPECT_EQ(calls, 3);
+}
+
+TEST(FileBlockCacheTest, BlockAlignment) {
+  // Initialize a 256-byte buffer.  This is the file underlying the reads we'll
+  // do in this test.
+  const size_t size = 256;
+  std::vector<char> buf;
+  for (int i = 0; i < size; i++) {
+    buf.push_back(i);
+  }
+  // The fetcher just fetches slices of the buffer.
+  auto fetcher = [&buf](const string& filename, size_t offset, size_t n,
+                        std::vector<char>* out) {
+    if (offset < buf.size()) {
+      if (offset + n > buf.size()) {
+        out->insert(out->end(), buf.begin() + offset, buf.end());
+      } else {
+        out->insert(out->end(), buf.begin() + offset, buf.begin() + offset + n);
+      }
+    }
+    return Status::OK();
+  };
+  for (size_t block_size = 2; block_size <= 4; block_size++) {
+    // Make a cache of N-byte block size (1 block) and verify that reads of
+    // varying offsets and lengths return correct data.
+    FileBlockCache cache(block_size, block_size, 0, fetcher);
+    for (size_t offset = 0; offset < 10; offset++) {
+      for (size_t n = block_size - 2; n <= block_size + 2; n++) {
+        std::vector<char> got;
+        TF_EXPECT_OK(cache.Read("", offset, n, &got));
+        // Verify the size of the read.
+        if (offset + n <= size) {
+          // Expect a full read.
+          EXPECT_EQ(got.size(), n) << "block size = " << block_size
+                                   << ", offset = " << offset << ", n = " << n;
+        } else {
+          // Expect a partial read.
+          EXPECT_EQ(got.size(), size - offset)
+              << "block size = " << block_size << ", offset = " << offset
+              << ", n = " << n;
+        }
+        // Verify the contents of the read.
+        std::vector<char>::const_iterator begin = buf.begin() + offset;
+        std::vector<char>::const_iterator end =
+            offset + n > buf.size() ? buf.end() : begin + n;
+        std::vector<char> want(begin, end);
+        EXPECT_EQ(got, want) << "block size = " << block_size
+                             << ", offset = " << offset << ", n = " << n;
+      }
+    }
+  }
+}
+
+TEST(FileBlockCacheTest, CacheHits) {
+  const size_t block_size = 16;
+  std::set<size_t> calls;
+  auto fetcher = [&calls, block_size](const string& filename, size_t offset,
+                                      size_t n, std::vector<char>* out) {
+    EXPECT_EQ(n, block_size);
+    EXPECT_EQ(offset % block_size, 0);
+    EXPECT_EQ(calls.find(offset), calls.end()) << "at offset " << offset;
+    calls.insert(offset);
+    out->resize(n, 'x');
+    return Status::OK();
+  };
+  const uint32 block_count = 256;
+  FileBlockCache cache(block_size, block_count * block_size, 0, fetcher);
+  std::vector<char> out;
+  // The cache has space for `block_count` blocks. The loop with i = 0 should
+  // fill the cache, and the loop with i = 1 should be all cache hits. The
+  // fetcher checks that it is called once and only once for each offset (to
+  // fetch the corresponding block).
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < block_count; j++) {
+      TF_EXPECT_OK(cache.Read("", block_size * j, block_size, &out));
+    }
+  }
+}
+
+TEST(FileBlockCacheTest, OutOfRange) {
+  // Tests reads of a 24-byte file with block size 16.
+  const size_t block_size = 16;
+  const size_t file_size = 24;
+  bool first_block = false;
+  bool second_block = false;
+  auto fetcher = [block_size, file_size, &first_block, &second_block](
+                     const string& filename, size_t offset, size_t n,
+                     std::vector<char>* out) {
+    EXPECT_EQ(n, block_size);
+    EXPECT_EQ(offset % block_size, 0);
+    if (offset == 0) {
+      // The first block (16 bytes) of the file.
+      out->resize(n, 'x');
+      first_block = true;
+    } else if (offset == block_size) {
+      // The second block (8 bytes) of the file.
+      out->resize(file_size - block_size, 'x');
+      second_block = true;
+    }
+    return Status::OK();
+  };
+  FileBlockCache cache(block_size, block_size, 0, fetcher);
+  std::vector<char> out;
+  // Reading the first 16 bytes should be fine.
+  TF_EXPECT_OK(cache.Read("", 0, block_size, &out));
+  EXPECT_TRUE(first_block);
+  EXPECT_EQ(out.size(), block_size);
+  // Reading at offset file_size + 4 will read the second block (since the read
+  // at file_size + 4 = 28 will be aligned to an offset of 16) but will return
+  // OutOfRange because the offset is past the end of the 24-byte file.
+  Status status = cache.Read("", file_size + 4, 4, &out);
+  EXPECT_EQ(status.code(), error::OUT_OF_RANGE);
+  EXPECT_TRUE(second_block);
+  EXPECT_EQ(out.size(), 0);
+  // Reading the second full block will return 8 bytes, from a cache hit.
+  second_block = false;
+  TF_EXPECT_OK(cache.Read("", block_size, block_size, &out));
+  EXPECT_FALSE(second_block);
+  EXPECT_EQ(out.size(), file_size - block_size);
+}
+
+TEST(FileBlockCacheTest, Inconsistent) {
+  // Tests the detection of interrupted reads leading to partially filled blocks
+  // where we expected complete blocks.
+  const size_t block_size = 16;
+  // This fetcher returns OK but only fills in one byte for any offset.
+  auto fetcher = [block_size](const string& filename, size_t offset, size_t n,
+                              std::vector<char>* out) {
+    EXPECT_EQ(n, block_size);
+    EXPECT_EQ(offset % block_size, 0);
+    out->resize(1, 'x');
+    return Status::OK();
+  };
+  FileBlockCache cache(block_size, 2 * block_size, 0, fetcher);
+  std::vector<char> out;
+  // Read the second block; this should yield an OK status and a single byte.
+  TF_EXPECT_OK(cache.Read("", block_size, block_size, &out));
+  EXPECT_EQ(out.size(), 1);
+  // Now read the first block; this should yield an INTERNAL error because we
+  // had already cached a partial block at a later position.
+  Status status = cache.Read("", 0, block_size, &out);
+  EXPECT_EQ(status.code(), error::INTERNAL);
+}
+
+TEST(FileBlockCacheTest, LRU) {
+  const size_t block_size = 16;
+  std::list<size_t> calls;
+  auto fetcher = [&calls, block_size](const string& filename, size_t offset,
+                                      size_t n, std::vector<char>* out) {
+    EXPECT_EQ(n, block_size);
+    EXPECT_FALSE(calls.empty()) << "at offset = " << offset;
+    if (!calls.empty()) {
+      EXPECT_EQ(offset, calls.front());
+      calls.pop_front();
+    }
+    out->resize(n, 'x');
+    return Status::OK();
+  };
+  const uint32 block_count = 2;
+  FileBlockCache cache(block_size, block_count * block_size, 0, fetcher);
+  std::vector<char> out;
+  // Read blocks from the cache, and verify the LRU behavior based on the
+  // fetcher calls that the cache makes.
+  calls.push_back(0);
+  // Cache miss - drains an element from `calls`.
+  TF_EXPECT_OK(cache.Read("", 0, 1, &out));
+  // Cache hit - does not drain an element from `calls`.
+  TF_EXPECT_OK(cache.Read("", 0, 1, &out));
+  calls.push_back(block_size);
+  // Cache miss followed by cache hit.
+  TF_EXPECT_OK(cache.Read("", block_size, 1, &out));
+  TF_EXPECT_OK(cache.Read("", block_size, 1, &out));
+  calls.push_back(2 * block_size);
+  // Cache miss followed by cache hit.  Causes eviction of LRU element.
+  TF_EXPECT_OK(cache.Read("", 2 * block_size, 1, &out));
+  TF_EXPECT_OK(cache.Read("", 2 * block_size, 1, &out));
+  // LRU element was at offset 0.  Cache miss.
+  calls.push_back(0);
+  TF_EXPECT_OK(cache.Read("", 0, 1, &out));
+  // Element at 2 * block_size is still in cache, and this read should update
+  // its position in the LRU list so it doesn't get evicted by the next read.
+  TF_EXPECT_OK(cache.Read("", 2 * block_size, 1, &out));
+  // Element at block_size was evicted.  Reading this element will also cause
+  // the LRU element (at 0) to be evicted.
+  calls.push_back(block_size);
+  TF_EXPECT_OK(cache.Read("", block_size, 1, &out));
+  // Element at 0 was evicted again.
+  calls.push_back(0);
+  TF_EXPECT_OK(cache.Read("", 0, 1, &out));
+}
+
+TEST(FileBlockCacheTest, MaxStaleness) {
+  int calls = 0;
+  auto fetcher = [&calls](const string& filename, size_t offset, size_t n,
+                          std::vector<char>* out) {
+    calls++;
+    out->resize(n, 'x');
+    return Status::OK();
+  };
+  std::vector<char> out;
+  std::unique_ptr<FakeEnv> env(new FakeEnv);
+  // Create a cache with max staleness of 2 seconds, and verify that it works as
+  // expected.
+  FileBlockCache cache1(8, 16, 2 /* max staleness */, fetcher, env.get());
+  // Execute the first read to load the block.
+  TF_EXPECT_OK(cache1.Read("", 0, 1, &out));
+  EXPECT_EQ(calls, 1);
+  // Now advance the clock one second at a time and redo the read. The call
+  // count should advance every 3 seconds (i.e. every time the staleness is
+  // greater than 2).
+  for (int i = 1; i <= 10; i++) {
+    env->SetNowSeconds(i + 1);
+    TF_EXPECT_OK(cache1.Read("", 0, 1, &out));
+    EXPECT_EQ(calls, 1 + i / 3);
+  }
+  // Now create a cache with max staleness of 0, and verify that it also works
+  // as expected.
+  calls = 0;
+  env->SetNowSeconds(0);
+  FileBlockCache cache2(8, 16, 0 /* max staleness */, fetcher, env.get());
+  // Execute the first read to load the block.
+  TF_EXPECT_OK(cache2.Read("", 0, 1, &out));
+  EXPECT_EQ(calls, 1);
+  // Advance the clock by a huge amount and verify that the cached block is
+  // used to satisfy the read.
+  env->SetNowSeconds(365 * 24 * 60 * 60);  // ~1 year, just for fun.
+  TF_EXPECT_OK(cache2.Read("", 0, 1, &out));
+  EXPECT_EQ(calls, 1);
+}
+
+TEST(FileBlockCacheTest, RemoveFile) {
+  int calls = 0;
+  auto fetcher = [&calls](const string& filename, size_t offset, size_t n,
+                          std::vector<char>* out) {
+    calls++;
+    char c = (filename == "a") ? 'a' : (filename == "b") ? 'b' : 'x';
+    if (offset > 0) {
+      // The first block is lower case and all subsequent blocks are upper case.
+      c = toupper(c);
+    }
+    out->clear();
+    out->resize(n, c);
+    return Status::OK();
+  };
+  // This cache has space for 4 blocks; we'll read from two files.
+  const size_t n = 3;
+  FileBlockCache cache(8, 32, 0, fetcher);
+  std::vector<char> out;
+  std::vector<char> a(n, 'a');
+  std::vector<char> b(n, 'b');
+  std::vector<char> A(n, 'A');
+  std::vector<char> B(n, 'B');
+  // Fill the cache.
+  TF_EXPECT_OK(cache.Read("a", 0, n, &out));
+  EXPECT_EQ(out, a);
+  EXPECT_EQ(calls, 1);
+  TF_EXPECT_OK(cache.Read("a", 8, n, &out));
+  EXPECT_EQ(out, A);
+  EXPECT_EQ(calls, 2);
+  TF_EXPECT_OK(cache.Read("b", 0, n, &out));
+  EXPECT_EQ(out, b);
+  EXPECT_EQ(calls, 3);
+  TF_EXPECT_OK(cache.Read("b", 8, n, &out));
+  EXPECT_EQ(out, B);
+  EXPECT_EQ(calls, 4);
+  // All four blocks should be in the cache now.
+  TF_EXPECT_OK(cache.Read("a", 0, n, &out));
+  EXPECT_EQ(out, a);
+  TF_EXPECT_OK(cache.Read("a", 8, n, &out));
+  EXPECT_EQ(out, A);
+  TF_EXPECT_OK(cache.Read("b", 0, n, &out));
+  EXPECT_EQ(out, b);
+  TF_EXPECT_OK(cache.Read("b", 8, n, &out));
+  EXPECT_EQ(out, B);
+  EXPECT_EQ(calls, 4);
+  // Remove the blocks from "a".
+  cache.RemoveFile("a");
+  // Both blocks from "b" should still be there.
+  TF_EXPECT_OK(cache.Read("b", 0, n, &out));
+  EXPECT_EQ(out, b);
+  TF_EXPECT_OK(cache.Read("b", 8, n, &out));
+  EXPECT_EQ(out, B);
+  EXPECT_EQ(calls, 4);
+  // The blocks from "a" should not be there.
+  TF_EXPECT_OK(cache.Read("a", 0, n, &out));
+  EXPECT_EQ(out, a);
+  EXPECT_EQ(calls, 5);
+  TF_EXPECT_OK(cache.Read("a", 8, n, &out));
+  EXPECT_EQ(out, A);
+  EXPECT_EQ(calls, 6);
+}
+
+TEST(FileBlockCacheTest, Prune) {
+  int calls = 0;
+  auto fetcher = [&calls](const string& filename, size_t offset, size_t n,
+                          std::vector<char>* out) {
+    calls++;
+    out->clear();
+    out->resize(n, 'x');
+    return Status::OK();
+  };
+  std::vector<char> out;
+  // Our fake environment is initialized with the current timestamp.
+  std::unique_ptr<FakeEnv> env(new FakeEnv);
+  uint64 now = Env::Default()->NowSeconds();
+  env->SetNowSeconds(now);
+  FileBlockCache cache(8, 32, 1 /* max staleness */, fetcher, env.get());
+  // Read three blocks into the cache, and advance the timestamp by one second
+  // with each read. Start with a block of "a" at the current timestamp `now`.
+  TF_EXPECT_OK(cache.Read("a", 0, 1, &out));
+  // Now load a block of a different file "b" at timestamp `now` + 1
+  env->SetNowSeconds(now + 1);
+  TF_EXPECT_OK(cache.Read("b", 0, 1, &out));
+  // Now load a different block of file "a" at timestamp `now` + 1. When the
+  // first block of "a" expires, this block should also be removed because it
+  // also belongs to file "a".
+  TF_EXPECT_OK(cache.Read("a", 8, 1, &out));
+  // Ensure that all blocks are in the cache (i.e. reads are cache hits).
+  EXPECT_EQ(cache.CacheSize(), 24);
+  EXPECT_EQ(calls, 3);
+  TF_EXPECT_OK(cache.Read("a", 0, 1, &out));
+  TF_EXPECT_OK(cache.Read("b", 0, 1, &out));
+  TF_EXPECT_OK(cache.Read("a", 8, 1, &out));
+  EXPECT_EQ(calls, 3);
+  // Advance the fake timestamp so that "a" becomes stale via its first block.
+  env->SetNowSeconds(now + 2);
+  // The pruning thread periodically compares env->NowSeconds() with the oldest
+  // block's timestamp to see if it should evict any files. At the current fake
+  // timestamp of `now` + 2, file "a" is stale because its first block is stale,
+  // but file "b" is not stale yet. Thus, once the pruning thread wakes up (in
+  // one second of wall time), it should remove "a" and leave "b" alone.
+  uint64 start = Env::Default()->NowSeconds();
+  do {
+    Env::Default()->SleepForMicroseconds(100000);
+  } while (cache.CacheSize() == 24 && Env::Default()->NowSeconds() - start < 3);
+  // There should be one block left in the cache, and it should be the first
+  // block of "b".
+  EXPECT_EQ(cache.CacheSize(), 8);
+  TF_EXPECT_OK(cache.Read("b", 0, 1, &out));
+  EXPECT_EQ(calls, 3);
+  // Advance the fake time to `now` + 3, at which point "b" becomes stale.
+  env->SetNowSeconds(now + 3);
+  // Wait for the pruner to remove "b".
+  start = Env::Default()->NowSeconds();
+  do {
+    Env::Default()->SleepForMicroseconds(100000);
+  } while (cache.CacheSize() == 8 && Env::Default()->NowSeconds() - start < 3);
+  // The cache should now be empty.
+  EXPECT_EQ(cache.CacheSize(), 0);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 97e4c207d86fc6c8c12595be2de8707e9b3832f6..f1b54311082c167f49fb627b0de7b6b23ad9eb0d 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/cloud/file_block_cache.h"
 #include "tensorflow/core/platform/cloud/google_auth_provider.h"
 #include "tensorflow/core/platform/cloud/retrying_utils.h"
 #include "tensorflow/core/platform/cloud/time_util.h"
@@ -53,8 +54,21 @@ constexpr uint64 kUploadRetryDelayMicros = 1000000L;
 // The HTTP response code "308 Resume Incomplete".
 constexpr uint64 HTTP_CODE_RESUME_INCOMPLETE = 308;
 // The environment variable that overrides the size of the readahead buffer.
+// DEPRECATED. Use GCS_BLOCK_SIZE_MB instead.
 constexpr char kReadaheadBufferSize[] = "GCS_READAHEAD_BUFFER_SIZE_BYTES";
-
+// The environment variable that overrides the block size for aligned reads from
+// GCS. Specified in MB (e.g. "16" = 16 x 1024 x 1024 = 16777216 bytes).
+constexpr char kBlockSize[] = "GCS_READ_CACHE_BLOCK_SIZE_MB";
+constexpr size_t kDefaultBlockSize = 256 * 1024 * 1024;
+// The environment variable that overrides the max size of the LRU cache of
+// blocks read from GCS. Specified in MB.
+constexpr char kMaxCacheSize[] = "GCS_READ_CACHE_MAX_SIZE_MB";
+constexpr size_t kDefaultMaxCacheSize = kDefaultBlockSize;
+// The environment variable that overrides the maximum staleness of cached file
+// contents. Once any block of a file reaches this staleness, all cached blocks
+// will be evicted on the next read.
+constexpr char kMaxStaleness[] = "GCS_READ_CACHE_MAX_STALENESS";
+constexpr uint64 kDefaultMaxStaleness = 0;
 // The file statistics returned by Stat() for directories.
 const FileStatistics DIRECTORY_STAT(0, 0, true);
 
@@ -206,55 +220,20 @@ Status GetBoolValue(const Json::Value& parent, const string& name,
   return Status::OK();
 }
 
-/// A GCS-based implementation of a random access file with a read-ahead buffer.
+/// A GCS-based implementation of a random access file with an LRU block cache.
 class GcsRandomAccessFile : public RandomAccessFile {
  public:
-  GcsRandomAccessFile(const string& bucket, const string& object,
-                      AuthProvider* auth_provider,
-                      HttpRequest::Factory* http_request_factory,
-                      size_t read_ahead_bytes)
-      : bucket_(bucket),
-        object_(object),
-        auth_provider_(auth_provider),
-        http_request_factory_(http_request_factory),
-        read_ahead_bytes_(read_ahead_bytes) {}
+  GcsRandomAccessFile(const string& filename, FileBlockCache* file_block_cache)
+      : filename_(filename), file_block_cache_(file_block_cache) {}
 
-  /// The implementation of reads with a read-ahead buffer. Thread-safe.
+  /// The implementation of reads with an LRU block cache. Thread safe.
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
-    mutex_lock lock(mu_);
-    const bool range_start_included = offset >= buffer_start_offset_;
-    const bool range_end_included =
-        offset + n <= buffer_start_offset_ + buffer_.size();
-    if (range_start_included && range_end_included) {
-      // The requested range can be filled from the buffer.
-      const size_t offset_in_buffer =
-          std::min<uint64>(offset - buffer_start_offset_, buffer_.size());
-      const auto copy_size = std::min(n, buffer_.size() - offset_in_buffer);
-      std::copy(buffer_.begin() + offset_in_buffer,
-                buffer_.begin() + offset_in_buffer + copy_size, scratch);
-      *result = StringPiece(scratch, copy_size);
-    } else {
-      // Update the buffer content based on the new requested range.
-      const size_t desired_buffer_size = n + read_ahead_bytes_;
-      if (n > buffer_.capacity() ||
-          desired_buffer_size > 2 * buffer_.capacity()) {
-        // Re-allocate only if buffer capacity increased significantly.
-        buffer_.reserve(desired_buffer_size);
-      }
-
-      // Shift the offset and clear the buffer so that the state stays
-      // consistent if loading from GCS fails.
-      buffer_start_offset_ = offset;
-      buffer_.clear();
-
-      TF_RETURN_IF_ERROR(LoadBufferFromGCS());
-
-      // Set the results.
-      std::memcpy(scratch, buffer_.data(), std::min(buffer_.size(), n));
-      *result = StringPiece(scratch, std::min(buffer_.size(), n));
-    }
-
+    result->clear();
+    std::vector<char> out;
+    TF_RETURN_IF_ERROR(file_block_cache_->Read(filename_, offset, n, &out));
+    std::memcpy(scratch, out.data(), std::min(out.size(), n));
+    *result = StringPiece(scratch, std::min(out.size(), n));
     if (result->size() < n) {
       // This is not an error per se. The RandomAccessFile interface expects
       // that Read returns OutOfRange if fewer bytes were read than requested.
@@ -266,38 +245,10 @@ class GcsRandomAccessFile : public RandomAccessFile {
   }
 
  private:
-  /// A helper function to actually read the data from GCS. This function loads
-  /// buffer_ from GCS based on its current capacity.
-  Status LoadBufferFromGCS() const EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    string auth_token;
-    TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_, &auth_token));
-
-    std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-    TF_RETURN_IF_ERROR(request->Init());
-    TF_RETURN_IF_ERROR(
-        request->SetUri(strings::StrCat("https://", kStorageHost, "/", bucket_,
-                                        "/", request->EscapeString(object_))));
-    TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
-    TF_RETURN_IF_ERROR(request->SetRange(
-        buffer_start_offset_, buffer_start_offset_ + buffer_.capacity() - 1));
-    TF_RETURN_IF_ERROR(request->SetResultBuffer(&buffer_));
-    TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when reading gs://",
-                                    bucket_, "/", object_);
-    return Status::OK();
-  }
-
-  string bucket_;
-  string object_;
-  AuthProvider* auth_provider_;
-  HttpRequest::Factory* http_request_factory_;
-  const size_t read_ahead_bytes_;
-
-  // The buffer-related members need to be mutable, because they are modified
-  // by the const Read() method.
-  mutable mutex mu_;
-  mutable std::vector<char> buffer_ GUARDED_BY(mu_);
-  // The original file offset of the first byte in the buffer.
-  mutable size_t buffer_start_offset_ GUARDED_BY(mu_) = 0;
+  /// The filename of this file.
+  const string filename_;
+  /// The LRU block cache for this file.
+  mutable FileBlockCache* file_block_cache_;  // not owned
 };
 
 /// \brief GCS-based implementation of a writeable file.
@@ -309,11 +260,13 @@ class GcsWritableFile : public WritableFile {
   GcsWritableFile(const string& bucket, const string& object,
                   AuthProvider* auth_provider,
                   HttpRequest::Factory* http_request_factory,
+                  std::function<void()> file_cache_erase,
                   int64 initial_retry_delay_usec)
       : bucket_(bucket),
         object_(object),
         auth_provider_(auth_provider),
         http_request_factory_(http_request_factory),
+        file_cache_erase_(std::move(file_cache_erase)),
         sync_needed_(true),
         initial_retry_delay_usec_(initial_retry_delay_usec) {
     if (GetTmpFilename(&tmp_content_filename_).ok()) {
@@ -331,11 +284,13 @@ class GcsWritableFile : public WritableFile {
                   AuthProvider* auth_provider,
                   const string& tmp_content_filename,
                   HttpRequest::Factory* http_request_factory,
+                  std::function<void()> file_cache_erase,
                   int64 initial_retry_delay_usec)
       : bucket_(bucket),
         object_(object),
         auth_provider_(auth_provider),
         http_request_factory_(http_request_factory),
+        file_cache_erase_(std::move(file_cache_erase)),
         sync_needed_(true),
         initial_retry_delay_usec_(initial_retry_delay_usec) {
     tmp_content_filename_ = tmp_content_filename;
@@ -403,6 +358,8 @@ class GcsWritableFile : public WritableFile {
             TF_RETURN_IF_ERROR(RequestUploadSessionStatus(
                 session_uri, &completed, &already_uploaded));
             if (completed) {
+              // Erase the file from the file cache on every successful write.
+              file_cache_erase_();
               // It's unclear why UploadToSession didn't return OK in the
               // previous attempt, but GCS reports that the file is fully
               // uploaded, so succeed.
@@ -555,6 +512,8 @@ class GcsWritableFile : public WritableFile {
         request->SetPutFromFile(tmp_content_filename_, start_offset));
     TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when uploading ",
                                     GetGcsPath());
+    // Erase the file from the file cache on every successful write.
+    file_cache_erase_();
     return Status::OK();
   }
 
@@ -568,6 +527,7 @@ class GcsWritableFile : public WritableFile {
   string tmp_content_filename_;
   std::ofstream outfile_;
   HttpRequest::Factory* http_request_factory_;
+  std::function<void()> file_cache_erase_;
   bool sync_needed_;  // whether there is buffered data that needs to be synced
   int64 initial_retry_delay_usec_;
 };
@@ -583,37 +543,95 @@ class GcsReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
   std::unique_ptr<char[]> data_;
   uint64 length_;
 };
+
+// Helper function to extract an environment variable and convert it into a
+// value of type T.
+template <typename T>
+bool GetEnvVar(const char* varname, bool (*convert)(StringPiece, T*),
+               T* value) {
+  const char* env_value = std::getenv(varname);
+  if (!env_value) {
+    return false;
+  }
+  return convert(env_value, value);
+}
+
 }  // namespace
 
 GcsFileSystem::GcsFileSystem()
     : auth_provider_(new GoogleAuthProvider()),
       http_request_factory_(new HttpRequest::Factory()) {
+  uint64 value;
+  size_t block_size = kDefaultBlockSize;
+  size_t max_bytes = kDefaultMaxCacheSize;
+  uint64 max_staleness = kDefaultMaxStaleness;
   // Apply the sys env override for the readahead buffer size if it's provided.
-  const char* readahead_buffer_size = std::getenv(kReadaheadBufferSize);
-  if (readahead_buffer_size) {
-    uint64 value;
-    if (strings::safe_strtou64(readahead_buffer_size, &value)) {
-      read_ahead_bytes_ = value;
-    }
+  if (GetEnvVar(kReadaheadBufferSize, strings::safe_strtou64, &value)) {
+    block_size = value;
   }
+  // Apply the overrides for the block size (MB), max bytes (MB), and max
+  // staleness (seconds) if provided.
+  if (GetEnvVar(kBlockSize, strings::safe_strtou64, &value)) {
+    block_size = value * 1024 * 1024;
+  }
+  if (GetEnvVar(kMaxCacheSize, strings::safe_strtou64, &value)) {
+    max_bytes = value * 1024 * 1024;
+  }
+  if (GetEnvVar(kMaxStaleness, strings::safe_strtou64, &value)) {
+    max_staleness = value;
+  }
+  file_block_cache_ = MakeFileBlockCache(block_size, max_bytes, max_staleness);
 }
 
 GcsFileSystem::GcsFileSystem(
     std::unique_ptr<AuthProvider> auth_provider,
     std::unique_ptr<HttpRequest::Factory> http_request_factory,
-    size_t read_ahead_bytes, int64 initial_retry_delay_usec)
+    size_t block_size, size_t max_bytes, uint64 max_staleness,
+    int64 initial_retry_delay_usec)
     : auth_provider_(std::move(auth_provider)),
       http_request_factory_(std::move(http_request_factory)),
-      read_ahead_bytes_(read_ahead_bytes),
+      file_block_cache_(
+          MakeFileBlockCache(block_size, max_bytes, max_staleness)),
       initial_retry_delay_usec_(initial_retry_delay_usec) {}
 
 Status GcsFileSystem::NewRandomAccessFile(
     const string& fname, std::unique_ptr<RandomAccessFile>* result) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
-  result->reset(new GcsRandomAccessFile(bucket, object, auth_provider_.get(),
-                                        http_request_factory_.get(),
-                                        read_ahead_bytes_));
+  result->reset(new GcsRandomAccessFile(fname, file_block_cache_.get()));
+  return Status::OK();
+}
+
+// A helper function to build a FileBlockCache for GcsFileSystem.
+std::unique_ptr<FileBlockCache> GcsFileSystem::MakeFileBlockCache(
+    size_t block_size, size_t max_bytes, uint64 max_staleness) {
+  std::unique_ptr<FileBlockCache> file_block_cache(
+      new FileBlockCache(block_size, max_bytes, max_staleness,
+                         [this](const string& filename, size_t offset, size_t n,
+                                std::vector<char>* out) {
+                           return LoadBufferFromGCS(filename, offset, n, out);
+                         }));
+  return file_block_cache;
+}
+
+// A helper function to actually read the data from GCS.
+Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset,
+                                        size_t n, std::vector<char>* out) {
+  string bucket, object;
+  TF_RETURN_IF_ERROR(ParseGcsPath(filename, false, &bucket, &object));
+  string auth_token;
+  TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_.get(), &auth_token));
+
+  std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
+  TF_RETURN_IF_ERROR(request->Init());
+  TF_RETURN_IF_ERROR(
+      request->SetUri(strings::StrCat("https://", kStorageHost, "/", bucket,
+                                      "/", request->EscapeString(object))));
+  TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
+  TF_RETURN_IF_ERROR(request->SetRange(offset, offset + n - 1));
+  TF_RETURN_IF_ERROR(request->SetResultBuffer(out));
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when reading gs://",
+                                  bucket, "/", object);
   return Status::OK();
 }
 
@@ -621,9 +639,10 @@ Status GcsFileSystem::NewWritableFile(const string& fname,
                                       std::unique_ptr<WritableFile>* result) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
-  result->reset(new GcsWritableFile(bucket, object, auth_provider_.get(),
-                                    http_request_factory_.get(),
-                                    initial_retry_delay_usec_));
+  result->reset(new GcsWritableFile(
+      bucket, object, auth_provider_.get(), http_request_factory_.get(),
+      [this, fname]() { file_block_cache_->RemoveFile(fname); },
+      initial_retry_delay_usec_));
   return Status::OK();
 }
 
@@ -663,7 +682,9 @@ Status GcsFileSystem::NewAppendableFile(const string& fname,
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
   result->reset(new GcsWritableFile(
       bucket, object, auth_provider_.get(), old_content_filename,
-      http_request_factory_.get(), initial_retry_delay_usec_));
+      http_request_factory_.get(),
+      [this, fname]() { file_block_cache_->RemoveFile(fname); },
+      initial_retry_delay_usec_));
   return Status::OK();
 }
 
@@ -1005,6 +1026,7 @@ Status GcsFileSystem::DeleteFile(const string& fname) {
   TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
   TF_RETURN_IF_ERROR(request->SetDeleteRequest());
   TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when deleting ", fname);
+  file_block_cache_->RemoveFile(fname);
   return Status::OK();
 }
 
@@ -1100,7 +1122,9 @@ Status GcsFileSystem::RenameObject(const string& src, const string& target) {
   TF_RETURN_IF_ERROR(request->SetResultBuffer(&output_buffer));
   TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when renaming ", src,
                                   " to ", target);
-
+  // Flush the target from the block cache.  The source will be flushed in the
+  // DeleteFile call below.
+  file_block_cache_->RemoveFile(target);
   Json::Value root;
   StringPiece response_piece =
       StringPiece(output_buffer.data(), output_buffer.size());
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h
index 18d2de482bb27298bea7f45ad8c6f167fab6c286..9b284722e1546cf16888f40a2dc20f2b66a09496 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.h
+++ b/tensorflow/core/platform/cloud/gcs_file_system.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/cloud/auth_provider.h"
+#include "tensorflow/core/platform/cloud/file_block_cache.h"
 #include "tensorflow/core/platform/cloud/http_request.h"
 #include "tensorflow/core/platform/cloud/retrying_file_system.h"
 #include "tensorflow/core/platform/file_system.h"
@@ -35,7 +36,8 @@ class GcsFileSystem : public FileSystem {
   GcsFileSystem();
   GcsFileSystem(std::unique_ptr<AuthProvider> auth_provider,
                 std::unique_ptr<HttpRequest::Factory> http_request_factory,
-                size_t read_ahead_bytes, int64 initial_retry_delay_usec);
+                size_t block_size, size_t max_bytes, uint64 max_staleness,
+                int64 initial_retry_delay_usec);
 
   Status NewRandomAccessFile(
       const string& filename,
@@ -74,7 +76,9 @@ class GcsFileSystem : public FileSystem {
 
   Status DeleteRecursively(const string& dirname, int64* undeleted_files,
                            int64* undeleted_dirs) override;
-  size_t get_readahead_buffer_size() const { return read_ahead_bytes_; }
+  size_t block_size() const { return file_block_cache_->block_size(); }
+  size_t max_bytes() const { return file_block_cache_->max_bytes(); }
+  uint64 max_staleness() const { return file_block_cache_->max_staleness(); }
 
  private:
   /// \brief Checks if the bucket exists. Returns OK if the check succeeded.
@@ -108,14 +112,19 @@ class GcsFileSystem : public FileSystem {
                        FileStatistics* stat);
   Status RenameObject(const string& src, const string& target);
 
+  std::unique_ptr<FileBlockCache> MakeFileBlockCache(size_t block_size,
+                                                     size_t max_bytes,
+                                                     uint64 max_staleness);
+
+  /// Loads file contents from GCS for a given filename, offset, and length.
+  Status LoadBufferFromGCS(const string& filename, size_t offset, size_t n,
+                           std::vector<char>* out);
+
   std::unique_ptr<AuthProvider> auth_provider_;
   std::unique_ptr<HttpRequest::Factory> http_request_factory_;
+  std::unique_ptr<FileBlockCache> file_block_cache_;
 
-  // The number of bytes to read ahead for buffering purposes in the
-  // RandomAccessFile implementation. Defaults to 256Mb.
-  size_t read_ahead_bytes_ = 256 * 1024 * 1024;
-
-  // The initial delay for exponential backoffs when retrying failed calls.
+  /// The initial delay for exponential backoffs when retrying failed calls.
   const int64 initial_retry_delay_usec_ = 1000000L;
 
   TF_DISALLOW_COPY_AND_ASSIGN(GcsFileSystem);
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index c3a8678fbc6fce15354a2b50a742f02413c46ace..ba08c7414f178f34553abc455c8932109a7ce3b6 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -30,7 +30,7 @@ class FakeAuthProvider : public AuthProvider {
   }
 };
 
-TEST(GcsFileSystemTest, NewRandomAccessFile_NoReadAhead) {
+TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
@@ -45,7 +45,8 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoReadAhead) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -64,7 +65,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoReadAhead) {
   EXPECT_EQ("6789", result);
 }
 
-TEST(GcsFileSystemTest, NewRandomAccessFile_NoReadAhead_differentN) {
+TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache_differentN) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
@@ -79,7 +80,8 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoReadAhead_differentN) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -101,7 +103,9 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoReadAhead_differentN) {
   EXPECT_EQ("3456789", result);
 }
 
-TEST(GcsFileSystemTest, NewRandomAccessFile_WithReadAhead) {
+TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
+  // Our underlying file in this test is a 15 byte file with contents
+  // "0123456789abcde".
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
@@ -111,32 +115,23 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithReadAhead) {
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 6-14\n",
-           "6789abcde"),
+           "Range: 9-17\n",
+           "9abcde"),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 6-20\n",
-           "6789abcd"),
-       new FakeHttpRequest(
-           "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
-           "Auth Token: fake_token\n"
-           "Range: 7-21\n",
-           "789abcdef"),
-       new FakeHttpRequest(
-           "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
-           "Auth Token: fake_token\n"
-           "Range: 20-34\n",
+           "Range: 18-26\n",
            ""),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-14\n",
-           "01234567")});
+           "Range: 0-8\n",
+           "012345678")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   5 /* read ahead bytes */, 0 /* initial retry delay */);
+                   9 /* block size */, 18 /* max bytes */,
+                   0 /* max staleness */, 0 /* initial retry delay */);
 
   char scratch[100];
   StringPiece result;
@@ -147,57 +142,100 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithReadAhead) {
     TF_EXPECT_OK(
         fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
 
-    // Read the first chunk. The buffer will be updated with 4 + 5 = 9 bytes.
+    // Read the first chunk. The cache will be populated with the first block of
+    // 9 bytes.
     scratch[5] = 'x';
     TF_EXPECT_OK(file->Read(0, 4, &result, scratch));
     EXPECT_EQ("0123", result);
     EXPECT_EQ(scratch[5], 'x');  // Make sure we only copied 4 bytes.
 
-    // The second chunk will be fully loaded from the buffer, no requests are
+    // The second chunk will be fully loaded from the cache, no requests are
     // made.
     TF_EXPECT_OK(file->Read(4, 4, &result, scratch));
     EXPECT_EQ("4567", result);
 
-    // The chunk is only partially buffered -- the request will be made to
-    // reload the buffer. 9 bytes will be requested (same as initial buffer
-    // size).
+    // The chunk is only partially cached -- the request will be made to fetch
+    // the next block. 9 bytes will be requested, starting at offset 9.
     TF_EXPECT_OK(file->Read(6, 5, &result, scratch));
     EXPECT_EQ("6789a", result);
 
-    // The range can only be partially satisfied. An attempt to fill the buffer
-    // with 10 + 5 = 15 bytes will be made (buffer is resized for this request).
+    // The range can only be partially satisfied, as the second block contains
+    // only 6 bytes for a total of 9 + 6 = 15 bytes in the file.
     EXPECT_EQ(errors::Code::OUT_OF_RANGE,
               file->Read(6, 10, &result, scratch).code());
-    EXPECT_EQ("6789abcd", result);
-
-    // The range cannot be satisfied, and the requested offset lies within the
-    // buffer, but the end of the range is outside of the buffer.
-    // A new request will be made to read 10 + 5 = 15 bytes.
-    EXPECT_EQ(errors::Code::OUT_OF_RANGE,
-              file->Read(7, 10, &result, scratch).code());
-    EXPECT_EQ("789abcdef", result);
+    EXPECT_EQ("6789abcde", result);
 
-    // The range cannot be satisfied, and the requested offset is greater than
-    // the
-    // buffered range. A new request will be made to read 10 + 5 = 15 bytes.
+    // The range cannot be satisfied, and the requested offset is past the end
+    // of the cache. A new request will be made to read 9 bytes starting at
+    // offset 18. This request will return an empty response, and there will not
+    // be another request.
     EXPECT_EQ(errors::Code::OUT_OF_RANGE,
               file->Read(20, 10, &result, scratch).code());
     EXPECT_TRUE(result.empty());
 
+    // The beginning of the file has been evicted from the LRU cache.  This will
+    // result in another request. The buffer size is still 15.
     TF_EXPECT_OK(file->Read(0, 4, &result, scratch));
   }
 
-  // The beginning of the file is not in the buffer. This call will result
-  // in another request. The buffer size is still 15.
   EXPECT_EQ("0123", result);
 }
 
+TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_MaxStaleness) {
+  // Our underlying file in this test is a 16 byte file with contents
+  // "0123456789abcdef".
+  std::vector<HttpRequest*> requests(
+      {new FakeHttpRequest("Uri: https://storage.googleapis.com/bucket/object\n"
+                           "Auth Token: fake_token\n"
+                           "Range: 0-7\n",
+                           "01234567"),
+       new FakeHttpRequest("Uri: https://storage.googleapis.com/bucket/object\n"
+                           "Auth Token: fake_token\n"
+                           "Range: 8-15\n",
+                           "89abcdef")});
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   8 /* block size */, 16 /* max bytes */,
+                   3600 /* max staleness */, 0 /* initial retry delay */);
+  char scratch[100];
+  StringPiece result;
+  // There should only be two HTTP requests issued to GCS even though we iterate
+  // this loop 10 times.  This shows that the underlying FileBlockCache persists
+  // across file close/open boundaries.
+  for (int i = 0; i < 10; i++) {
+    // Create two files. Since these files have the same name name and the max
+    // staleness of the filesystem is > 0, they will share the same blocks.
+    std::unique_ptr<RandomAccessFile> file1;
+    std::unique_ptr<RandomAccessFile> file2;
+    TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/object", &file1));
+    TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/object", &file2));
+    // Reading the first block from file1 should load it once.
+    TF_EXPECT_OK(file1->Read(0, 8, &result, scratch));
+    EXPECT_EQ("01234567", result);
+    // Reading the first block from file2 should not trigger a request to load
+    // the first block again, because the FileBlockCache shared by file1 and
+    // file2 already has the first block.
+    TF_EXPECT_OK(file2->Read(0, 8, &result, scratch));
+    EXPECT_EQ("01234567", result);
+    // Reading the second block from file2 should load it once.
+    TF_EXPECT_OK(file2->Read(8, 8, &result, scratch));
+    EXPECT_EQ("89abcdef", result);
+    // Reading the second block from file1 should not trigger a request to load
+    // the second block again, because the FileBlockCache shared by file1 and
+    // file2 already has the second block.
+    TF_EXPECT_OK(file1->Read(8, 8, &result, scratch));
+    EXPECT_EQ("89abcdef", result);
+  }
+}
+
 TEST(GcsFileSystemTest, NewRandomAccessFile_NoObjectName) {
   std::vector<HttpRequest*> requests;
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   5 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* read ahead bytes */, 0 /* max bytes */,
+                   0 /* max staleness */, 0 /* initial retry delay */);
 
   std::unique_ptr<RandomAccessFile> file;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -207,8 +245,13 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoObjectName) {
 TEST(GcsFileSystemTest, NewWritableFile) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/path%2Fwriteable\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-7\n",
+           "01234567"),
+       new FakeHttpRequest(
            "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
-           "uploadType=resumable&name=path%2Fwriteable.txt\n"
+           "uploadType=resumable&name=path%2Fwriteable\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 17\n"
            "Post: yes\n",
@@ -217,23 +260,39 @@ TEST(GcsFileSystemTest, NewWritableFile) {
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 0-16/17\n"
                            "Put body: content1,content2\n",
-                           "")});
+                           ""),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/path%2Fwriteable\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-7\n",
+           "01234567")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   8 /* block size */, 8 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
-  std::unique_ptr<WritableFile> file;
-  TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable.txt", &file));
-
-  TF_EXPECT_OK(file->Append("content1,"));
-  TF_EXPECT_OK(file->Append("content2"));
-  TF_EXPECT_OK(file->Flush());
+  // Read from the file first, to fill the block cache.
+  std::unique_ptr<RandomAccessFile> rfile;
+  TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/path/writeable", &rfile));
+  char scratch[100];
+  StringPiece result;
+  TF_EXPECT_OK(rfile->Read(0, 4, &result, scratch));
+  EXPECT_EQ("0123", result);
+  // Open the writable file.
+  std::unique_ptr<WritableFile> wfile;
+  TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable", &wfile));
+  TF_EXPECT_OK(wfile->Append("content1,"));
+  TF_EXPECT_OK(wfile->Append("content2"));
+  TF_EXPECT_OK(wfile->Flush());
+  // Re-reading the file should trigger another HTTP request to GCS.
+  TF_EXPECT_OK(rfile->Read(0, 4, &result, scratch));
+  EXPECT_EQ("0123", result);
   // The calls to flush, sync, and close below should not cause uploads because
   // the file is not dirty.
-  TF_EXPECT_OK(file->Flush());
-  TF_EXPECT_OK(file->Sync());
-  TF_EXPECT_OK(file->Close());
+  TF_EXPECT_OK(wfile->Flush());
+  TF_EXPECT_OK(wfile->Sync());
+  TF_EXPECT_OK(wfile->Close());
 }
 
 TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceeds) {
@@ -275,7 +334,8 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceeds) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::unique_ptr<WritableFile> file;
   TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable.txt", &file));
@@ -286,10 +346,18 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceeds) {
 }
 
 TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceedsOnGetStatus) {
+  // This test also verifies that a file's blocks are purged from the cache when
+  // the file is written, even when the write takes the "succeeds on get status"
+  // path.
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/path%2Fwriteable\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-7\n",
+           "01234567"),
+       new FakeHttpRequest(
            "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
-           "uploadType=resumable&name=path%2Fwriteable.txt\n"
+           "uploadType=resumable&name=path%2Fwriteable\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 17\n"
            "Post: yes\n",
@@ -303,18 +371,41 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceedsOnGetStatus) {
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes */17\n"
                            "Put: yes\n",
-                           "", Status::OK(), nullptr, {}, 201)});
+                           "", Status::OK(), nullptr, {}, 201),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/path%2Fwriteable\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-7\n",
+           "01234567")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
-
-  std::unique_ptr<WritableFile> file;
-  TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable.txt", &file));
-
-  TF_EXPECT_OK(file->Append("content1,"));
-  TF_EXPECT_OK(file->Append("content2"));
-  TF_EXPECT_OK(file->Close());
+                   8 /* block size */, 8 /* max bytes */,
+                   3600 /* max staleness */, 0 /* initial retry delay */);
+  // Pull the file's first block into the cache. This will trigger the first
+  // HTTP request to GCS.
+  std::unique_ptr<RandomAccessFile> rfile;
+  TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/path/writeable", &rfile));
+  char scratch[100];
+  StringPiece result;
+  TF_EXPECT_OK(rfile->Read(0, 4, &result, scratch));
+  EXPECT_EQ("0123", result);
+  // Now write to the same file. Once the write succeeds, the cached block will
+  // be flushed.
+  std::unique_ptr<WritableFile> wfile;
+  TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable", &wfile));
+  TF_EXPECT_OK(wfile->Append("content1,"));
+  TF_EXPECT_OK(wfile->Append("content2"));
+  // Appending doesn't invalidate the read cache - only flushing does. This read
+  // will not trigger an HTTP request to GCS.
+  TF_EXPECT_OK(rfile->Read(4, 4, &result, scratch));
+  EXPECT_EQ("4567", result);
+  // Closing the file triggers HTTP requests to GCS and invalidates the read
+  // cache for the file.
+  TF_EXPECT_OK(wfile->Close());
+  // Reading the first block of the file goes to GCS again.
+  TF_EXPECT_OK(rfile->Read(0, 8, &result, scratch));
+  EXPECT_EQ("01234567", result);
 }
 
 TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
@@ -364,7 +455,8 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 2 /* max upload attempts */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   2 /* initial retry delay */);
 
   std::unique_ptr<WritableFile> file;
   TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable.txt", &file));
@@ -411,7 +503,8 @@ TEST(GcsFileSystemTest, NewWritableFile_UploadReturns410) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::unique_ptr<WritableFile> file;
   TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable.txt", &file));
@@ -436,7 +529,8 @@ TEST(GcsFileSystemTest, NewWritableFile_NoObjectName) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   5 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::unique_ptr<WritableFile> file;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -446,13 +540,13 @@ TEST(GcsFileSystemTest, NewWritableFile_NoObjectName) {
 TEST(GcsFileSystemTest, NewAppendableFile) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
-           "Uri: https://storage.googleapis.com/bucket/path%2Fappendable.txt\n"
+           "Uri: https://storage.googleapis.com/bucket/path%2Fappendable\n"
            "Auth Token: fake_token\n"
-           "Range: 0-1048575\n",
+           "Range: 0-31\n",
            "content1,"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
-           "uploadType=resumable&name=path%2Fappendable.txt\n"
+           "uploadType=resumable&name=path%2Fappendable\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 17\n"
            "Post: yes\n",
@@ -461,17 +555,38 @@ TEST(GcsFileSystemTest, NewAppendableFile) {
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 0-16/17\n"
                            "Put body: content1,content2\n",
-                           "")});
+                           ""),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/path%2Fappendable\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-31\n",
+           "01234567")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
-
-  std::unique_ptr<WritableFile> file;
-  TF_EXPECT_OK(fs.NewAppendableFile("gs://bucket/path/appendable.txt", &file));
-
-  TF_EXPECT_OK(file->Append("content2"));
-  TF_EXPECT_OK(file->Close());
+                   32 /* block size */, 32 /* max bytes */,
+                   0 /* max staleness */, 0 /* initial retry delay */);
+
+  // Create an appendable file. This should read the file from GCS, and pull its
+  // contents into the block cache.
+  std::unique_ptr<WritableFile> wfile;
+  TF_EXPECT_OK(fs.NewAppendableFile("gs://bucket/path/appendable", &wfile));
+  TF_EXPECT_OK(wfile->Append("content2"));
+  // Verify that the file contents are in the block cache. This read should not
+  // trigger an HTTP request to GCS.
+  std::unique_ptr<RandomAccessFile> rfile;
+  TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/path/appendable", &rfile));
+  char scratch[100];
+  StringPiece result;
+  TF_EXPECT_OK(rfile->Read(0, 8, &result, scratch));
+  EXPECT_EQ("content1", result);
+  // Closing the appendable file will flush its contents to GCS, triggering HTTP
+  // requests.
+  TF_EXPECT_OK(wfile->Close());
+  // Redo the read. The block should be reloaded from GCS, causing one more HTTP
+  // request to load it.
+  TF_EXPECT_OK(rfile->Read(0, 4, &result, scratch));
+  EXPECT_EQ("0123", result);
 }
 
 TEST(GcsFileSystemTest, NewAppendableFile_NoObjectName) {
@@ -479,7 +594,8 @@ TEST(GcsFileSystemTest, NewAppendableFile_NoObjectName) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   5 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::unique_ptr<WritableFile> file;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -505,7 +621,8 @@ TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::unique_ptr<ReadOnlyMemoryRegion> region;
   TF_EXPECT_OK(fs.NewReadOnlyMemoryRegionFromFile(
@@ -520,7 +637,8 @@ TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile_NoObjectName) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   5 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::unique_ptr<ReadOnlyMemoryRegion> region;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -537,7 +655,8 @@ TEST(GcsFileSystemTest, FileExists_YesAsObject) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   TF_EXPECT_OK(fs.FileExists("gs://bucket/path/file1.txt"));
 }
@@ -559,7 +678,8 @@ TEST(GcsFileSystemTest, FileExists_YesAsFolder) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   TF_EXPECT_OK(fs.FileExists("gs://bucket/path/subfolder"));
 }
@@ -577,7 +697,8 @@ TEST(GcsFileSystemTest, FileExists_YesAsBucket) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   TF_EXPECT_OK(fs.FileExists("gs://bucket1"));
   TF_EXPECT_OK(fs.FileExists("gs://bucket1/"));
@@ -599,7 +720,8 @@ TEST(GcsFileSystemTest, FileExists_NotAsObjectOrFolder) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   EXPECT_EQ(errors::Code::NOT_FOUND,
             fs.FileExists("gs://bucket/path/file1.txt").code());
@@ -618,7 +740,8 @@ TEST(GcsFileSystemTest, FileExists_NotAsBucket) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
             fs.FileExists("gs://bucket2/").code());
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -635,7 +758,8 @@ TEST(GcsFileSystemTest, GetChildren_NoItems) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -656,7 +780,8 @@ TEST(GcsFileSystemTest, GetChildren_ThreeFiles) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -678,7 +803,8 @@ TEST(GcsFileSystemTest, GetChildren_SelfDirectoryMarker) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -699,7 +825,8 @@ TEST(GcsFileSystemTest, GetChildren_ThreeFiles_NoSlash) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path", &children));
@@ -717,7 +844,8 @@ TEST(GcsFileSystemTest, GetChildren_Root) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket-a-b-c", &children));
@@ -735,7 +863,8 @@ TEST(GcsFileSystemTest, GetChildren_Empty) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -768,7 +897,8 @@ TEST(GcsFileSystemTest, GetChildren_Pagination) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path", &children));
@@ -788,7 +918,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_NoWildcard) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::vector<string> result;
   TF_EXPECT_OK(
@@ -809,7 +940,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_BucketAndWildcard) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/*/*", &result));
@@ -831,7 +963,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_FolderAndWildcard_Matches) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/*/file2.txt", &result));
@@ -850,7 +983,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_SelfDirectoryMarker) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/*", &result));
@@ -869,7 +1003,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_FolderAndWildcard_NoMatches) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/*/file3.txt", &result));
@@ -881,7 +1016,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_OnlyWildcard) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   std::vector<string> result;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -890,17 +1026,40 @@ TEST(GcsFileSystemTest, GetMatchingPaths_OnlyWildcard) {
 
 TEST(GcsFileSystemTest, DeleteFile) {
   std::vector<HttpRequest*> requests(
-      {new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
+      {new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/path%2Ffile1.txt\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-15\n",
+           "01234567"),
+       new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Ffile1.txt\n"
                            "Auth Token: fake_token\n"
                            "Delete: yes\n",
-                           "")});
+                           ""),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/path%2Ffile1.txt\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-15\n",
+           "76543210")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   16 /* block size */, 16 /* max bytes */,
+                   0 /* max staleness */, 0 /* initial retry delay */);
 
+  // Do an initial read of the file to load its contents into the block cache.
+  char scratch[100];
+  StringPiece result;
+  std::unique_ptr<RandomAccessFile> file;
+  TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/path/file1.txt", &file));
+  TF_EXPECT_OK(file->Read(0, 8, &result, scratch));
+  EXPECT_EQ("01234567", result);
+  // Deleting the file triggers the next HTTP request to GCS.
   TF_EXPECT_OK(fs.DeleteFile("gs://bucket/path/file1.txt"));
+  // Re-reading the file causes its contents to be reloaded from GCS and not
+  // from the block cache.
+  TF_EXPECT_OK(file->Read(0, 8, &result, scratch));
+  EXPECT_EQ("76543210", result);
 }
 
 TEST(GcsFileSystemTest, DeleteFile_NoObjectName) {
@@ -908,7 +1067,8 @@ TEST(GcsFileSystemTest, DeleteFile_NoObjectName) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   5 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
             fs.DeleteFile("gs://bucket/").code());
@@ -923,7 +1083,8 @@ TEST(GcsFileSystemTest, DeleteDir_Empty) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   TF_EXPECT_OK(fs.DeleteDir("gs://bucket/path/"));
 }
@@ -944,7 +1105,8 @@ TEST(GcsFileSystemTest, DeleteDir_OnlyDirMarkerLeft) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   TF_EXPECT_OK(fs.DeleteDir("gs://bucket/path/"));
 }
@@ -957,7 +1119,8 @@ TEST(GcsFileSystemTest, DeleteDir_BucketOnly) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   TF_EXPECT_OK(fs.DeleteDir("gs://bucket"));
 }
@@ -972,7 +1135,8 @@ TEST(GcsFileSystemTest, DeleteDir_NonEmpty) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   EXPECT_EQ(error::Code::FAILED_PRECONDITION,
             fs.DeleteDir("gs://bucket/path/").code());
@@ -988,7 +1152,8 @@ TEST(GcsFileSystemTest, GetFileSize) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   uint64 size;
   TF_EXPECT_OK(fs.GetFileSize("gs://bucket/file.txt", &size));
@@ -1000,7 +1165,8 @@ TEST(GcsFileSystemTest, GetFileSize_NoObjectName) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   5 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   uint64 size;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -1072,14 +1238,25 @@ TEST(GcsFileSystemTest, RenameFile_Folder) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   TF_EXPECT_OK(fs.RenameFile("gs://bucket/path1", "gs://bucket/path2/"));
 }
 
 TEST(GcsFileSystemTest, RenameFile_Object) {
   std::vector<HttpRequest*> requests(
-      {// IsDirectory is checking whether there are children objects.
+      {new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/path%2Fsrc.txt\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-15\n",
+           "01234567"),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/path%2Fdst.txt\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-15\n",
+           "76543210"),
+       // IsDirectory is checking whether there are children objects.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsrc.txt%2F"
@@ -1106,14 +1283,42 @@ TEST(GcsFileSystemTest, RenameFile_Object) {
            "path%2Fsrc.txt\n"
            "Auth Token: fake_token\n"
            "Delete: yes\n",
-           "")});
+           ""),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/path%2Fsrc.txt\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-15\n",
+           "89abcdef"),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/path%2Fdst.txt\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-15\n",
+           "fedcba98")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
-
+                   16 /* block size */, 64 /* max bytes */,
+                   0 /* max staleness */, 0 /* initial retry delay */);
+  // Do an initial read of the source and destination files to load their
+  // contents into the block cache.
+  char scratch[100];
+  StringPiece result;
+  std::unique_ptr<RandomAccessFile> src;
+  std::unique_ptr<RandomAccessFile> dst;
+  TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/path/src.txt", &src));
+  TF_EXPECT_OK(src->Read(0, 8, &result, scratch));
+  EXPECT_EQ("01234567", result);
+  TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/path/dst.txt", &dst));
+  TF_EXPECT_OK(dst->Read(0, 8, &result, scratch));
+  EXPECT_EQ("76543210", result);
+  // Now rename src to dst. This should flush the block cache for both files.
   TF_EXPECT_OK(
       fs.RenameFile("gs://bucket/path/src.txt", "gs://bucket/path/dst.txt"));
+  // Re-read both files. This should reload their contents from GCS.
+  TF_EXPECT_OK(src->Read(0, 8, &result, scratch));
+  EXPECT_EQ("89abcdef", result);
+  TF_EXPECT_OK(dst->Read(0, 8, &result, scratch));
+  EXPECT_EQ("fedcba98", result);
 }
 
 /// Tests the scenario when deletion returns a failure, but actually succeeds.
@@ -1157,7 +1362,8 @@ TEST(GcsFileSystemTest, RenameFile_Object_DeletionRetried) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   TF_EXPECT_OK(
       fs.RenameFile("gs://bucket/path/src.txt", "gs://bucket/path/dst.txt"));
@@ -1190,7 +1396,8 @@ TEST(GcsFileSystemTest, RenameFile_Object_Incomplete) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   EXPECT_EQ(
       errors::Code::UNIMPLEMENTED,
@@ -1208,7 +1415,8 @@ TEST(GcsFileSystemTest, Stat_Object) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("gs://bucket/file.txt", &stat));
@@ -1234,7 +1442,8 @@ TEST(GcsFileSystemTest, Stat_Folder) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("gs://bucket/subfolder", &stat));
@@ -1259,7 +1468,8 @@ TEST(GcsFileSystemTest, Stat_ObjectOrFolderNotFound) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   FileStatistics stat;
   EXPECT_EQ(error::Code::NOT_FOUND, fs.Stat("gs://bucket/path", &stat).code());
@@ -1273,7 +1483,8 @@ TEST(GcsFileSystemTest, Stat_Bucket) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("gs://bucket/", &stat));
@@ -1290,7 +1501,8 @@ TEST(GcsFileSystemTest, Stat_BucketNotFound) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   FileStatistics stat;
   EXPECT_EQ(error::Code::NOT_FOUND, fs.Stat("gs://bucket/", &stat).code());
@@ -1312,7 +1524,8 @@ TEST(GcsFileSystemTest, IsDirectory_NotFound) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   EXPECT_EQ(error::Code::NOT_FOUND,
             fs.IsDirectory("gs://bucket/file.txt").code());
@@ -1335,7 +1548,8 @@ TEST(GcsFileSystemTest, IsDirectory_NotDirectoryButObject) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   EXPECT_EQ(error::Code::FAILED_PRECONDITION,
             fs.IsDirectory("gs://bucket/file.txt").code());
@@ -1358,7 +1572,8 @@ TEST(GcsFileSystemTest, IsDirectory_Yes) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket/subfolder"));
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket/subfolder/"));
@@ -1377,7 +1592,8 @@ TEST(GcsFileSystemTest, IsDirectory_Bucket) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket"));
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket/"));
@@ -1391,7 +1607,8 @@ TEST(GcsFileSystemTest, IsDirectory_BucketNotFound) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   EXPECT_EQ(error::Code::NOT_FOUND, fs.IsDirectory("gs://bucket/").code());
 }
@@ -1423,7 +1640,8 @@ TEST(GcsFileSystemTest, CreateDir_Folder) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   TF_EXPECT_OK(fs.CreateDir("gs://bucket/subpath"));
   TF_EXPECT_OK(fs.CreateDir("gs://bucket/subpath/"));
@@ -1442,7 +1660,8 @@ TEST(GcsFileSystemTest, CreateDir_Bucket) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   TF_EXPECT_OK(fs.CreateDir("gs://bucket/"));
   TF_EXPECT_OK(fs.CreateDir("gs://bucket"));
@@ -1501,7 +1720,8 @@ TEST(GcsFileSystemTest, DeleteRecursively_Ok) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   int64 undeleted_files, undeleted_dirs;
   TF_EXPECT_OK(fs.DeleteRecursively("gs://bucket/path", &undeleted_files,
@@ -1579,7 +1799,8 @@ TEST(GcsFileSystemTest, DeleteRecursively_DeletionErrors) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   int64 undeleted_files, undeleted_dirs;
   TF_EXPECT_OK(fs.DeleteRecursively("gs://bucket/path", &undeleted_files,
@@ -1606,7 +1827,8 @@ TEST(GcsFileSystemTest, DeleteRecursively_NotAFolder) {
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
-                   0 /* read ahead bytes */, 0 /* initial retry delay */);
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* initial retry delay */);
 
   int64 undeleted_files, undeleted_dirs;
   EXPECT_EQ(error::Code::NOT_FOUND,
@@ -1617,13 +1839,32 @@ TEST(GcsFileSystemTest, DeleteRecursively_NotAFolder) {
   EXPECT_EQ(1, undeleted_dirs);
 }
 
-TEST(GcsFileSystemTest, OverrideReadaheadBufferSize) {
+TEST(GcsFileSystemTest, OverrideCacheParameters) {
+  // Verify defaults are propagated correctly.
   GcsFileSystem fs1;
-  EXPECT_EQ(256 * 1024 * 1024, fs1.get_readahead_buffer_size());
+  EXPECT_EQ(256 * 1024 * 1024, fs1.block_size());
+  EXPECT_EQ(fs1.block_size(), fs1.max_bytes());
+  EXPECT_EQ(0, fs1.max_staleness());
 
+  // Verify legacy readahead buffer override sets block size.
   setenv("GCS_READAHEAD_BUFFER_SIZE_BYTES", "123456789", 1);
   GcsFileSystem fs2;
-  EXPECT_EQ(123456789L, fs2.get_readahead_buffer_size());
+  EXPECT_EQ(123456789L, fs2.block_size());
+
+  // Verify block size override.
+  setenv("GCS_READ_CACHE_BLOCK_SIZE_MB", "1", 1);
+  GcsFileSystem fs3;
+  EXPECT_EQ(1048576L, fs3.block_size());
+
+  // Verify max size override.
+  setenv("GCS_READ_CACHE_MAX_SIZE_MB", "16", 1);
+  GcsFileSystem fs4;
+  EXPECT_EQ(16 * 1024 * 1024, fs4.max_bytes());
+
+  // Verify max staleness override.
+  setenv("GCS_READ_CACHE_MAX_STALENESS", "60", 1);
+  GcsFileSystem fs5;
+  EXPECT_EQ(60, fs5.max_staleness());
 }
 
 }  // namespace
diff --git a/tensorflow/core/platform/cloud/http_request.cc b/tensorflow/core/platform/cloud/http_request.cc
index 2d0141e50e786b8275272cce29be269c6fe8afe0..829fcf1e8b36b6c6a2e836f8da7a62601c0439d5 100644
--- a/tensorflow/core/platform/cloud/http_request.cc
+++ b/tensorflow/core/platform/cloud/http_request.cc
@@ -252,6 +252,8 @@ Status HttpRequest::SetPutFromFile(const string& body_filepath, size_t offset) {
   libcurl_->curl_easy_setopt(curl_, CURLOPT_PUT, 1);
   libcurl_->curl_easy_setopt(curl_, CURLOPT_READDATA,
                              reinterpret_cast<void*>(put_body_));
+  // Using the default CURLOPT_READFUNCTION, which is doing an fread() on the
+  // FILE * userdata set with CURLOPT_READDATA.
   return Status::OK();
 }
 
@@ -263,6 +265,8 @@ Status HttpRequest::SetPutEmptyBody() {
   libcurl_->curl_easy_setopt(curl_, CURLOPT_PUT, 1);
   curl_headers_ =
       libcurl_->curl_slist_append(curl_headers_, "Content-Length: 0");
+  libcurl_->curl_easy_setopt(curl_, CURLOPT_READDATA,
+                             reinterpret_cast<void*>(this));
   libcurl_->curl_easy_setopt(curl_, CURLOPT_READFUNCTION,
                              &HttpRequest::ReadCallback);
   return Status::OK();
@@ -292,6 +296,8 @@ Status HttpRequest::SetPostEmptyBody() {
   libcurl_->curl_easy_setopt(curl_, CURLOPT_POST, 1);
   curl_headers_ =
       libcurl_->curl_slist_append(curl_headers_, "Content-Length: 0");
+  libcurl_->curl_easy_setopt(curl_, CURLOPT_READDATA,
+                             reinterpret_cast<void*>(this));
   libcurl_->curl_easy_setopt(curl_, CURLOPT_READFUNCTION,
                              &HttpRequest::ReadCallback);
   return Status::OK();
diff --git a/tensorflow/core/platform/cloud/http_request.h b/tensorflow/core/platform/cloud/http_request.h
index afcbb9f35cfba478746b7e9beff6135aba32fa1d..584e52565763296bacb03661edff7e5ea2adb25d 100644
--- a/tensorflow/core/platform/cloud/http_request.h
+++ b/tensorflow/core/platform/cloud/http_request.h
@@ -143,7 +143,6 @@ class HttpRequest {
   size_t post_body_read_ = 0;
 
   std::vector<char>* response_buffer_ = nullptr;
-  size_t response_buffer_size_ = 0;
   CURL* curl_ = nullptr;
   curl_slist* curl_headers_ = nullptr;
 
diff --git a/tensorflow/core/platform/cloud/http_request_test.cc b/tensorflow/core/platform/cloud/http_request_test.cc
index 6d66dfdee18c4251944189aca36da0d08d86d51a..dfca7a616414518301e4242efeec34db7384b769 100644
--- a/tensorflow/core/platform/cloud/http_request_test.cc
+++ b/tensorflow/core/platform/cloud/http_request_test.cc
@@ -138,7 +138,7 @@ class FakeLibCurl : public LibCurl {
     return CURLE_OK;
   }
   CURLcode curl_easy_perform(CURL* curl) override {
-    if (read_data_) {
+    if (is_post_ || is_put_) {
       char buffer[3];
       int bytes_read;
       posted_content_ = "";
@@ -148,7 +148,7 @@ class FakeLibCurl : public LibCurl {
             strings::StrCat(posted_content_, StringPiece(buffer, bytes_read));
       } while (bytes_read > 0);
     }
-    if (write_data_) {
+    if (write_data_ || write_callback_) {
       write_callback_(response_content_.c_str(), 1, response_content_.size(),
                       write_data_);
     }
@@ -437,6 +437,27 @@ TEST(HttpRequestTest, PutRequest_WithBody_FromFile_NonZeroOffset) {
   std::remove(content_filename.c_str());
 }
 
+TEST(HttpRequestTest, PutRequest_WithoutBody) {
+  FakeLibCurl libcurl("", 200);
+  HttpRequest http_request(&libcurl);
+  TF_EXPECT_OK(http_request.Init());
+
+  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
+  TF_EXPECT_OK(http_request.AddAuthBearerHeader("fake-bearer"));
+  TF_EXPECT_OK(http_request.SetPutEmptyBody());
+  TF_EXPECT_OK(http_request.Send());
+
+  // Check interactions with libcurl.
+  EXPECT_TRUE(libcurl.is_initialized_);
+  EXPECT_EQ("http://www.testuri.com", libcurl.url_);
+  EXPECT_EQ("", libcurl.custom_request_);
+  EXPECT_EQ(2, libcurl.headers_->size());
+  EXPECT_EQ("Authorization: Bearer fake-bearer", (*libcurl.headers_)[0]);
+  EXPECT_EQ("Content-Length: 0", (*libcurl.headers_)[1]);
+  EXPECT_TRUE(libcurl.is_put_);
+  EXPECT_EQ("", libcurl.posted_content_);
+}
+
 TEST(HttpRequestTest, PostRequest_WithBody_FromMemory) {
   FakeLibCurl libcurl("", 200);
   HttpRequest http_request(&libcurl);
diff --git a/tensorflow/core/platform/cloud/retrying_utils.cc b/tensorflow/core/platform/cloud/retrying_utils.cc
index 096c77c6e3def2fed5f433b57a657935adecb3e3..99691ecfb9d5a51d1d2965e5215144536a6bb756 100644
--- a/tensorflow/core/platform/cloud/retrying_utils.cc
+++ b/tensorflow/core/platform/cloud/retrying_utils.cc
@@ -89,7 +89,7 @@ Status RetryingUtils::DeleteWithRetries(
   bool is_retried = false;
   return RetryingUtils::CallWithRetries(
       [delete_func, &is_retried]() {
-        const auto& status = delete_func();
+        const Status status = delete_func();
         if (is_retried && status.code() == error::NOT_FOUND) {
           return Status::OK();
         }
diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc
index 1cfeb2580fa4918f7e96ee118481ae75f391d6b7..4941bc12393b9c2ece78b852b9a07dcbaa1d1d9f 100644
--- a/tensorflow/core/platform/cpu_feature_guard.cc
+++ b/tensorflow/core/platform/cpu_feature_guard.cc
@@ -96,6 +96,14 @@ std::once_flag g_cpu_feature_guard_warn_once_flag;
 
 void WarnAboutUnusedCPUFeatures() {
   std::call_once(g_cpu_feature_guard_warn_once_flag, [] {
+#ifdef PLATFORM_WINDOWS
+#ifndef __AVX__
+    WarnIfFeatureUnused(CPUFeature::AVX, "AVX");
+#endif  // __AVX__
+#ifndef __AVX2__
+    WarnIfFeatureUnused(CPUFeature::AVX2, "AVX2");
+#endif  // __AVX2__
+#else   // ifdef platform windows
 #ifndef __SSE__
     WarnIfFeatureUnused(CPUFeature::SSE, "SSE");
 #endif  // __SSE__
@@ -123,6 +131,7 @@ void WarnAboutUnusedCPUFeatures() {
 #ifndef __FMA__
     WarnIfFeatureUnused(CPUFeature::FMA, "FMA");
 #endif  // __FMA__
+#endif  // else of ifdef platform windows
   });
 }
 
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 94f255663ebc94b5584299a6e2e58abc26a41f4a..73e8cb8c50d32b2a57651a7b646404be55ead7c2 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -1,8 +1,9 @@
 # Platform-specific build configurations.
 
-load("@protobuf//:protobuf.bzl", "cc_proto_library")
-load("@protobuf//:protobuf.bzl", "py_proto_library")
+load("@protobuf_archive//:protobuf.bzl", "cc_proto_library")
+load("@protobuf_archive//:protobuf.bzl", "py_proto_library")
 load("//tensorflow:tensorflow.bzl", "if_not_mobile")
+load("//tensorflow:tensorflow.bzl", "if_not_windows")
 
 # Appends a suffix to a list of deps.
 def tf_deps(deps, suffix):
@@ -43,15 +44,15 @@ def tf_proto_library_cc(name, srcs = [], has_services = None,
   cc_proto_library(
       name = name + "_cc",
       srcs = srcs,
-      deps = tf_deps(protodeps, "_cc") + ["@protobuf//:cc_wkt_protos"],
-      cc_libs = cc_libs + ["@protobuf//:protobuf"],
-      copts = [
+      deps = tf_deps(protodeps, "_cc") + ["@protobuf_archive//:cc_wkt_protos"],
+      cc_libs = cc_libs + ["@protobuf_archive//:protobuf"],
+      copts = if_not_windows([
           "-Wno-unknown-warning-option",
           "-Wno-unused-but-set-variable",
           "-Wno-sign-compare",
-      ],
-      protoc = "@protobuf//:protoc",
-      default_runtime = "@protobuf//:protobuf",
+      ]),
+      protoc = "@protobuf_archive//:protoc",
+      default_runtime = "@protobuf_archive//:protobuf",
       use_grpc_plugin = use_grpc_plugin,
       testonly = testonly,
       visibility = visibility,
@@ -64,9 +65,9 @@ def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[],
       name = name + "_py",
       srcs = srcs,
       srcs_version = srcs_version,
-      deps = deps + tf_deps(protodeps, "_py") + ["@protobuf//:protobuf_python"],
-      protoc = "@protobuf//:protoc",
-      default_runtime = "@protobuf//:protobuf_python",
+      deps = deps + tf_deps(protodeps, "_py") + ["@protobuf_archive//:protobuf_python"],
+      protoc = "@protobuf_archive//:protoc",
+      default_runtime = "@protobuf_archive//:protobuf_python",
       visibility = visibility,
       testonly = testonly,
   )
@@ -74,7 +75,8 @@ def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[],
 def tf_proto_library(name, srcs = [], has_services = None,
                      protodeps = [], visibility = [], testonly = 0,
                      cc_libs = [],
-                     cc_api_version = 2, go_api_version = 2,
+                     cc_api_version = 2, cc_grpc_version = None,
+                     go_api_version = 2,
                      j2objc_api_version = 1,
                      java_api_version = 2, py_api_version = 2,
                      js_api_version = 2, js_codegen = "jspb"):
@@ -83,6 +85,7 @@ def tf_proto_library(name, srcs = [], has_services = None,
       name = name,
       srcs = srcs,
       protodeps = protodeps,
+      cc_grpc_version = cc_grpc_version,
       cc_libs = cc_libs,
       testonly = testonly,
       visibility = visibility,
@@ -127,6 +130,14 @@ def tf_additional_lib_srcs(exclude = []):
       ], exclude = exclude),
   })
 
+# pylint: disable=unused-argument
+def tf_additional_framework_hdrs(exclude = []):
+  return []
+
+def tf_additional_framework_srcs(exclude = []):
+  return []
+# pylint: enable=unused-argument
+
 def tf_additional_minimal_lib_srcs():
   return [
       "platform/default/integral_types.h",
@@ -171,6 +182,15 @@ def tf_additional_stream_executor_srcs():
 def tf_additional_cupti_wrapper_deps():
   return ["//tensorflow/core/platform/default/gpu:cupti_wrapper"]
 
+def tf_additional_gpu_tracer_srcs():
+  return ["platform/default/gpu_tracer.cc"]
+
+def tf_additional_gpu_tracer_cuda_deps():
+  return []
+
+def tf_additional_gpu_tracer_deps():
+  return []
+
 def tf_additional_libdevice_data():
   return []
 
@@ -266,3 +286,7 @@ def tf_additional_mpi_lib_defines():
       "//tensorflow:with_mpi_support": ["TENSORFLOW_USE_MPI"],
       "//conditions:default": [],
   })
+
+def tf_pyclif_proto_library(name, proto_lib, proto_srcfile="", visibility=None,
+                            **kwargs):
+  pass
diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl
index fa4ac4ba73f7093c58b87f458d14abac2ddefba1..04bf2aeca65355083777db390ce9ea91d921935f 100644
--- a/tensorflow/core/platform/default/build_config_root.bzl
+++ b/tensorflow/core/platform/default/build_config_root.bzl
@@ -3,10 +3,10 @@
 # be separate to avoid cyclic references.
 
 def tf_cuda_tests_tags():
-  return ["local"]
+  return ["requires-gpu"]
 
 def tf_sycl_tests_tags():
-  return ["local"]
+  return ["requires-gpu"]
 
 def tf_additional_plugin_deps():
   return select({
diff --git a/tensorflow/core/common_runtime/gpu/gpu_tracer.cc b/tensorflow/core/platform/default/gpu_tracer.cc
similarity index 98%
rename from tensorflow/core/common_runtime/gpu/gpu_tracer.cc
rename to tensorflow/core/platform/default/gpu_tracer.cc
index fb76348ddd245d5683213f8320f40d3d602b9548..7375287e1b6c93f44109efc03285883da879bdda 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_tracer.cc
+++ b/tensorflow/core/platform/default/gpu_tracer.cc
@@ -13,11 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/common_runtime/gpu/gpu_tracer.h"
+#include "tensorflow/core/platform/gpu_tracer.h"
 
 #if GOOGLE_CUDA
 
 #include <stdlib.h>
+#include <memory>
 
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
@@ -255,7 +256,7 @@ CUPTIManager *GetCUPTIManager() {
 }
 
 #ifdef _MSC_VER
-#define __thread __declspec(thread) 
+#define __thread __declspec(thread)
 #endif
 
 // TODO(pbar) Move this to platform specific header file?
@@ -619,7 +620,10 @@ Status GPUTracerImpl::Collect(StepStatsCollector *collector) {
 
 }  // namespace gputracer
 
-GPUTracer *CreateGPUTracer() { return new gputracer::GPUTracerImpl(); }
+std::unique_ptr<GPUTracer> CreateGPUTracer() {
+  std::unique_ptr<GPUTracer> tracer(new gputracer::GPUTracerImpl());
+  return tracer;
+}
 
 }  // namespace tensorflow
 
@@ -627,7 +631,7 @@ GPUTracer *CreateGPUTracer() { return new gputracer::GPUTracerImpl(); }
 
 namespace tensorflow {
 
-GPUTracer *CreateGPUTracer() { return nullptr; }
+std::unique_ptr<GPUTracer> CreateGPUTracer() { return nullptr; }
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/default/grpc_response_reader.h b/tensorflow/core/platform/default/grpc_response_reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c175345fc4a7c9068bec81821b50fbf59e275ed
--- /dev/null
+++ b/tensorflow/core/platform/default/grpc_response_reader.h
@@ -0,0 +1,36 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_DEFAULT_GRPC_RESPONSE_READER_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_DEFAULT_GRPC_RESPONSE_READER_H_
+
+#include "grpc++/grpc++.h"
+
+namespace tensorflow {
+
+template <class ResponseMessage, class RequestMessage>
+::grpc::ClientAsyncResponseReader<ResponseMessage>*
+CreateClientAsyncResponseReader(::grpc::ChannelInterface* channel,
+                                ::grpc::CompletionQueue* cq,
+                                const ::grpc::internal::RpcMethod& method,
+                                ::grpc::ClientContext* context,
+                                const RequestMessage& request) {
+  return new ::grpc::ClientAsyncResponseReader<ResponseMessage>(
+      channel, cq, method, context, request);
+}
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_DEFAULT_GRPC_RESPONSE_READER_H_
diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc
index 2fdd989c9b97497c94bb035472df910a701b2692..44f11aef968d8daa5ed60585e89e8fc039d9772d 100644
--- a/tensorflow/core/platform/env.cc
+++ b/tensorflow/core/platform/env.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <sys/stat.h>
 #include <deque>
 #include <utility>
 #include <vector>
@@ -30,7 +31,10 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/env_time.h"
+#include "tensorflow/core/platform/host_info.h"
 #include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
@@ -273,6 +277,39 @@ string Env::GetExecutablePath() {
   return exe_path;
 }
 
+bool Env::LocalTempFilename(string* filename) {
+  std::vector<string> dirs;
+  GetLocalTempDirectories(&dirs);
+
+  // Try each directory, as they might be full, have inappropriate
+  // permissions or have different problems at times.
+  for (const string& dir : dirs) {
+#ifdef __APPLE__
+    uint64_t tid64;
+    pthread_threadid_np(nullptr, &tid64);
+    int32 tid = static_cast<int32>(tid64);
+    int32 pid = static_cast<int32>(getpid());
+#elif defined(PLATFORM_WINDOWS)
+    int32 tid = static_cast<int32>(GetCurrentThreadId());
+    int32 pid = static_cast<int32>(GetCurrentProcessId());
+#else
+    int32 tid = static_cast<int32>(pthread_self());
+    int32 pid = static_cast<int32>(getpid());
+#endif
+    uint64 now_microsec = NowMicros();
+
+    *filename = io::JoinPath(
+        dir, strings::Printf("tempfile-%s-%x-%d-%llx", port::Hostname().c_str(),
+                             tid, pid, now_microsec));
+    if (FileExists(*filename).ok()) {
+      filename->clear();
+    } else {
+      return true;
+    }
+  }
+  return false;
+}
+
 Thread::~Thread() {}
 
 EnvWrapper::~EnvWrapper() {}
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index 1b7e024b0f4c1d4e7821a4f31bc274f95135eea4..da8c3e2d7e8a50c9d441cd371078fa86aae13179 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -215,6 +215,9 @@ class Env {
   /// symlinks if there is any.
   string GetExecutablePath();
 
+  /// Creates a local unique temporary file name. Returns true if success.
+  bool LocalTempFilename(string* filename);
+
   // TODO(jeff,sanjay): Add back thread/thread-pool support if needed.
   // TODO(jeff,sanjay): if needed, tighten spec so relative to epoch, or
   // provide a routine to get the absolute time.
@@ -279,6 +282,9 @@ class Env {
       const string& version) = 0;
 
  private:
+  // Returns a possible list of local temporary directories.
+  void GetLocalTempDirectories(std::vector<string>* list);
+
   std::unique_ptr<FileSystemRegistry> file_system_registry_;
   TF_DISALLOW_COPY_AND_ASSIGN(Env);
   EnvTime* envTime = EnvTime::Default();
diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc
index 7bc1882c86d056eefbed90fae1d2d67e71b4a6a7..50dd0cd58b88a478a0c8656e98cfa5eb5458ff85 100644
--- a/tensorflow/core/platform/env_test.cc
+++ b/tensorflow/core/platform/env_test.cc
@@ -298,4 +298,32 @@ TEST_F(DefaultEnvTest, GetExecutablePath) {
   TF_EXPECT_OK(env->FileExists(env->GetExecutablePath()));
 }
 
+TEST_F(DefaultEnvTest, LocalTempFilename) {
+  Env* env = Env::Default();
+  string filename;
+  EXPECT_TRUE(env->LocalTempFilename(&filename));
+  EXPECT_FALSE(env->FileExists(filename).ok());
+
+  // Write something to the temporary file.
+  std::unique_ptr<WritableFile> file_to_write;
+  TF_CHECK_OK(env->NewWritableFile(filename, &file_to_write));
+  TF_CHECK_OK(file_to_write->Append("Null"));
+  TF_CHECK_OK(file_to_write->Close());
+  TF_CHECK_OK(env->FileExists(filename));
+
+  // Read from the temporary file and check content.
+  std::unique_ptr<RandomAccessFile> file_to_read;
+  TF_CHECK_OK(env->NewRandomAccessFile(filename, &file_to_read));
+  StringPiece content;
+  char scratch[1024];
+  CHECK_EQ(error::OUT_OF_RANGE,
+           file_to_read->Read(0 /* offset */, 1024 /* n */, &content, scratch)
+               .code());
+  EXPECT_EQ("Null", content.ToString());
+
+  // Delete the temporary file.
+  TF_CHECK_OK(env->DeleteFile(filename));
+  EXPECT_FALSE(env->FileExists(filename).ok());
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/gif.h b/tensorflow/core/platform/gif.h
index 8a719a8cf1edc8811f2b2d891f5dd030111370cc..9c72d34ff518abcabf773af607589fe8114beebf 100644
--- a/tensorflow/core/platform/gif.h
+++ b/tensorflow/core/platform/gif.h
@@ -20,7 +20,7 @@ limitations under the License.
 
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/core/platform/google/build_config/gif.h"
-#elif (defined(PLATFORM_POSIX) && !defined(IS_MOBILE_PLATFORM)) || defined(PLATFORM_WINDOWS)
+#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID)
 #include <gif_lib.h>
 #else
 #error Define the appropriate PLATFORM_<foo> macro for this platform
diff --git a/tensorflow/core/common_runtime/gpu/gpu_tracer.h b/tensorflow/core/platform/gpu_tracer.h
similarity index 91%
rename from tensorflow/core/common_runtime/gpu/gpu_tracer.h
rename to tensorflow/core/platform/gpu_tracer.h
index d83508191ba3ce287369a3b4392a884e1fc6007f..3373d974e3815939989b5abd3fa294025082212b 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_tracer.h
+++ b/tensorflow/core/platform/gpu_tracer.h
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_TRACER_H_
-#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_TRACER_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_GPU_TRACER_H_
+#define TENSORFLOW_CORE_PLATFORM_GPU_TRACER_H_
+
+#include <memory>
 
 #include "tensorflow/core/lib/core/status.h"
 
@@ -70,8 +72,8 @@ class GPUTracer {
 
 // Creates a platform-specific GPUTracer.
 // Returns 'nullptr' on platforms where tracing is not supported.
-GPUTracer* CreateGPUTracer();
+std::unique_ptr<GPUTracer> CreateGPUTracer();
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_TRACER_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_GPU_TRACER_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_tracer_test.cc b/tensorflow/core/platform/gpu_tracer_test.cc
similarity index 99%
rename from tensorflow/core/common_runtime/gpu/gpu_tracer_test.cc
rename to tensorflow/core/platform/gpu_tracer_test.cc
index 7ca5d4aa5b5b1d0e729f452c6438cd8195fc806d..713282c1fd8bddd49a3cfdbe497958e68d5c1b10 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_tracer_test.cc
+++ b/tensorflow/core/platform/gpu_tracer_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/common_runtime/gpu/gpu_tracer.h"
+#include "tensorflow/core/platform/gpu_tracer.h"
 
 #include <map>
 #include <memory>
diff --git a/tensorflow/core/platform/grpc_response_reader.h b/tensorflow/core/platform/grpc_response_reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..5dc78d0dcbe211ba39a9057aadb199bc064d3600
--- /dev/null
+++ b/tensorflow/core/platform/grpc_response_reader.h
@@ -0,0 +1,47 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_GRPC_RESPONSE_READER_H_
+#define TENSORFLOW_CORE_PLATFORM_GRPC_RESPONSE_READER_H_
+
+#include "grpc++/grpc++.h"
+#include "tensorflow/core/platform/platform.h"
+
+// Include platform-dependent grpc ClientAsyncResponseReader constructors.
+// TODO(b/62910646): Remove this level of indirection once this is resolved.
+#if defined(PLATFORM_GOOGLE)
+#include "tensorflow/core/platform/google/grpc_response_reader.h"
+#else
+#include "tensorflow/core/platform/default/grpc_response_reader.h"
+#endif
+
+namespace tensorflow {
+
+// Start a call and write the request out.
+// The returned pointer is owned by the caller.
+// See
+// https://grpc.io/grpc/cpp/classgrpc_1_1_client_async_response_reader.html#ace2c5bae351f67dd7dd603fc39513e0a
+// for more information.
+template <class ResponseMessage, class RequestMessage>
+::grpc::ClientAsyncResponseReader<ResponseMessage>*
+CreateClientAsyncResponseReader(::grpc::ChannelInterface* channel,
+                                ::grpc::CompletionQueue* cq,
+                                const ::grpc::internal::RpcMethod& method,
+                                ::grpc::ClientContext* context,
+                                const RequestMessage& request);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_PLATFORM_MUTEX_H_
diff --git a/tensorflow/core/platform/jpeg.h b/tensorflow/core/platform/jpeg.h
index f5b4deed559074796bff1cdf6a40e051c2fa0507..edbcbd960a7d61970119bfb385f075e1d3ffb96f 100644
--- a/tensorflow/core/platform/jpeg.h
+++ b/tensorflow/core/platform/jpeg.h
@@ -20,7 +20,7 @@ limitations under the License.
 
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/core/platform/google/build_config/jpeg.h"
-#elif (defined(PLATFORM_POSIX) && !defined(IS_MOBILE_PLATFORM)) || defined(PLATFORM_WINDOWS)
+#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID)
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
diff --git a/tensorflow/core/platform/macros.h b/tensorflow/core/platform/macros.h
index eaf0171e72df03b295bd626ac497a28d9e3f08e8..47523c7d2b09275be3747e684df1b656534ed6ea 100644
--- a/tensorflow/core/platform/macros.h
+++ b/tensorflow/core/platform/macros.h
@@ -20,6 +20,7 @@ limitations under the License.
 #if (defined(__GNUC__) || defined(__APPLE__)) && !defined(SWIG)
 // Compiler supports GCC-style attributes
 #define TF_ATTRIBUTE_NORETURN __attribute__((noreturn))
+#define TF_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
 #define TF_ATTRIBUTE_NOINLINE __attribute__((noinline))
 #define TF_ATTRIBUTE_UNUSED __attribute__((unused))
 #define TF_ATTRIBUTE_COLD __attribute__((cold))
@@ -33,6 +34,7 @@ limitations under the License.
 #elif defined(COMPILER_MSVC)
 // Non-GCC equivalents
 #define TF_ATTRIBUTE_NORETURN __declspec(noreturn)
+#define TF_ATTRIBUTE_ALWAYS_INLINE
 #define TF_ATTRIBUTE_NOINLINE
 #define TF_ATTRIBUTE_UNUSED
 #define TF_ATTRIBUTE_COLD
@@ -43,6 +45,7 @@ limitations under the License.
 #else
 // Non-GCC equivalents
 #define TF_ATTRIBUTE_NORETURN
+#define TF_ATTRIBUTE_ALWAYS_INLINE
 #define TF_ATTRIBUTE_NOINLINE
 #define TF_ATTRIBUTE_UNUSED
 #define TF_ATTRIBUTE_COLD
diff --git a/tensorflow/core/platform/platform.h b/tensorflow/core/platform/platform.h
index 7c44b22eff6cd30173ff7843ad1b96ea6a842af1..12120c4ab96ae8327864c46a8e0dc434b900e67e 100644
--- a/tensorflow/core/platform/platform.h
+++ b/tensorflow/core/platform/platform.h
@@ -43,9 +43,10 @@ limitations under the License.
 #elif defined(__arm__)
 #define PLATFORM_POSIX
 
-// Since there's no macro for the Raspberry Pi, assume we're on a mobile
-// platform if we're compiling for the ARM CPU.
+// Require an outside macro to tell us if we're building for Raspberry Pi.
+#if !defined(RASPBERRY_PI)
 #define IS_MOBILE_PLATFORM
+#endif  // !defined(RASPBERRY_PI)
 
 #else
 // If no platform specified, use:
diff --git a/tensorflow/core/platform/png.h b/tensorflow/core/platform/png.h
index 4500d9abe251bec941d93aaf35dec6766354c68c..5b0203c343e6b1764a9cc8a7908919422d826bcb 100644
--- a/tensorflow/core/platform/png.h
+++ b/tensorflow/core/platform/png.h
@@ -20,7 +20,7 @@ limitations under the License.
 
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/core/platform/google/build_config/png.h"
-#elif (defined(PLATFORM_POSIX) && !defined(IS_MOBILE_PLATFORM)) || defined(PLATFORM_WINDOWS)
+#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID)
 #include <png.h>
 #else
 #error Define the appropriate PLATFORM_<foo> macro for this platform
diff --git a/tensorflow/core/platform/posix/env.cc b/tensorflow/core/platform/posix/env.cc
index 8cfb39ae18afe7b1b0b8cae8349eebad6c4e0e46..ba3c4e709078adf8c60cf49ab06c7194cf887cc1 100644
--- a/tensorflow/core/platform/posix/env.cc
+++ b/tensorflow/core/platform/posix/env.cc
@@ -131,4 +131,39 @@ Env* Env::Default() {
 }
 #endif
 
+void Env::GetLocalTempDirectories(std::vector<string>* list) {
+  list->clear();
+  // Directories, in order of preference. If we find a dir that
+  // exists, we stop adding other less-preferred dirs
+  const char* candidates[] = {
+      // Non-null only during unittest/regtest
+      getenv("TEST_TMPDIR"),
+
+      // Explicitly-supplied temp dirs
+      getenv("TMPDIR"),
+      getenv("TMP"),
+
+      // If all else fails
+      "/tmp",
+  };
+
+  for (const char* d : candidates) {
+    if (!d || d[0] == '\0') continue;  // Empty env var
+
+    // Make sure we don't surprise anyone who's expecting a '/'
+    string dstr = d;
+    if (dstr[dstr.size() - 1] != '/') {
+      dstr += "/";
+    }
+
+    struct stat statbuf;
+    if (!stat(d, &statbuf) && S_ISDIR(statbuf.st_mode) &&
+        !access(dstr.c_str(), 0)) {
+      // We found a dir that exists and is accessible - we're done.
+      list->push_back(dstr);
+      return;
+    }
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/posix/error.cc b/tensorflow/core/platform/posix/error.cc
index df5c80087920f03c35cf6646c18838cba7e15168..e9baad5422694bb01f8d5e2e61114e723f693bf7 100644
--- a/tensorflow/core/platform/posix/error.cc
+++ b/tensorflow/core/platform/posix/error.cc
@@ -171,11 +171,7 @@ error::Code ErrnoToCode(int err_number) {
 
 Status IOError(const string& context, int err_number) {
   auto code = ErrnoToCode(err_number);
-  if (code == error::UNKNOWN) {
-    return Status(code, strings::StrCat(context, "; ", strerror(err_number)));
-  } else {
-    return Status(code, context);
-  }
+  return Status(code, strings::StrCat(context, "; ", strerror(err_number)));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.cc b/tensorflow/core/platform/profile_utils/cpu_utils.cc
index 22400565d67a91c6df4d0eed0403e7f2bc59fb13..52df84e81ce3a368b9be1d6665c8150b4db791b6 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils.cc
+++ b/tensorflow/core/platform/profile_utils/cpu_utils.cc
@@ -28,10 +28,17 @@ namespace profile_utils {
 
 static ICpuUtilsHelper* cpu_utils_helper_instance_ = nullptr;
 
-/* static */ int64 CpuUtils::GetCycleCounterFrequency() {
-  static const int64 cpu_frequency = GetCycleCounterFrequencyImpl();
-  return cpu_frequency;
+#if defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+   /* static */ uint64 CpuUtils::GetCycleCounterFrequency() {
+     static const uint64 cpu_frequency = GetCycleCounterFrequencyImpl();
+     return cpu_frequency;
 }
+#else
+   /* static */ int64 CpuUtils::GetCycleCounterFrequency() {
+     static const int64 cpu_frequency = GetCycleCounterFrequencyImpl();
+     return cpu_frequency;
+}
+#endif
 
 /* static */ double CpuUtils::GetMicroSecPerClock() {
   static const double micro_sec_per_clock =
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.h b/tensorflow/core/platform/profile_utils/cpu_utils.h
index 19471ec8585c2336a6994e4d15b60af9354e0d0c..8979a40ea10ca8254c75daab4a7b2197e46b6c33 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils.h
+++ b/tensorflow/core/platform/profile_utils/cpu_utils.h
@@ -97,7 +97,11 @@ class CpuUtils {
   // Return cycle counter frequency.
   // As this method caches the cpu frequency internally,
   // the first call will incur overhead, but not subsequent calls.
-  static int64 GetCycleCounterFrequency();
+  #if defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+     static uint64 GetCycleCounterFrequency();
+  #else
+     static int64 GetCycleCounterFrequency();
+  #endif
 
   // Return micro secound per each clock
   // As this method caches the cpu frequency internally,
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
index ca487965a0c9f9d42eb431d504d2d8ed1a5dff1e..e1ec4aaac0fce16ee8203bad14f767bd32d68d35 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
+++ b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
@@ -53,9 +53,15 @@ TEST_F(CpuUtilsTest, CheckGetCurrentClockCycle) {
 }
 
 TEST_F(CpuUtilsTest, CheckCycleCounterFrequency) {
-  const int64 cpu_frequency = CpuUtils::GetCycleCounterFrequency();
-  CHECK_GT(cpu_frequency, 0);
-  CHECK_NE(cpu_frequency, CpuUtils::INVALID_FREQUENCY);
+  #if defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+     const uint64 cpu_frequency = CpuUtils::GetCycleCounterFrequency();
+     CHECK_GT(cpu_frequency, 0);
+     CHECK_NE(cpu_frequency, unsigned(CpuUtils::INVALID_FREQUENCY));
+  #else
+     const int64 cpu_frequency = CpuUtils::GetCycleCounterFrequency();
+     CHECK_GT(cpu_frequency, 0);
+     CHECK_NE(cpu_frequency, CpuUtils::INVALID_FREQUENCY);
+  #endif
   if (DBG) {
     LOG(INFO) << "Cpu frequency = " << cpu_frequency;
   }
diff --git a/tensorflow/core/platform/tensor_coding.cc b/tensorflow/core/platform/tensor_coding.cc
index 73ccd68e5d3731ef810ddd68d4c52369b92fbcff..17dc81f7e0f49e50178f70df3da31d5745387f3a 100644
--- a/tensorflow/core/platform/tensor_coding.cc
+++ b/tensorflow/core/platform/tensor_coding.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/platform/tensor_coding.h"
 
 #include <vector>
+#include "tensorflow/core/framework/resource_handle.pb.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 
@@ -67,12 +68,14 @@ void CopyFromArray(string* s, const char* base, size_t bytes) {
 
 void EncodeResourceHandleList(const ResourceHandle* p, int64 n, string* out) {
   out->clear();
+  string rest;
+  ResourceHandleProto proto;
   for (int i = 0; i < n; ++i) {
-    core::PutVarint32(out, p[i].ByteSize());
-  }
-  for (int i = 0; i < n; ++i) {
-    p[i].AppendToString(out);
+    p[i].AsProto(&proto);
+    core::PutVarint32(out, proto.ByteSize());
+    proto.AppendToString(&rest);
   }
+  *out += rest;
 }
 
 bool DecodeResourceHandleList(const string& in, ResourceHandle* ps, int64 n) {
@@ -86,10 +89,12 @@ bool DecodeResourceHandleList(const string& in, ResourceHandle* ps, int64 n) {
   if (total != static_cast<int64>(reader.size())) {
     return false;
   }
+  ResourceHandleProto proto;
   for (int i = 0; i < n; ++i) {
-    if (!ps[i].ParseFromArray(reader.data(), sizes[i])) {
+    if (!proto.ParseFromArray(reader.data(), sizes[i])) {
       return false;
     }
+    ps[i].FromProto(proto);
     reader.remove_prefix(sizes[i]);
   }
   return true;
diff --git a/tensorflow/core/platform/tensor_coding.h b/tensorflow/core/platform/tensor_coding.h
index 2dd65fb97ac44f6793afdb92cc5215703dc9c87b..19f53e6374f4d0baf68c7c9991661e09fe065ee2 100644
--- a/tensorflow/core/platform/tensor_coding.h
+++ b/tensorflow/core/platform/tensor_coding.h
@@ -18,7 +18,7 @@ limitations under the License.
 #define TENSORFLOW_PLATFORM_TENSOR_CODING_H_
 
 #include <string>
-#include "tensorflow/core/framework/resource_handle.pb.h"
+#include "tensorflow/core/framework/resource_handle.h"
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/platform.h"
diff --git a/tensorflow/core/platform/variant_coding.cc b/tensorflow/core/platform/variant_coding.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4bcde4f58163c012b859248f9f030f941b609cd9
--- /dev/null
+++ b/tensorflow/core/platform/variant_coding.cc
@@ -0,0 +1,63 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/variant_coding.h"
+
+#include <vector>
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/lib/core/coding.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace tensorflow {
+namespace port {
+
+void EncodeVariantList(const Variant* variant_array, int64 n, string* out) {
+  out->clear();
+  string rest;
+  for (int i = 0; i < n; ++i) {
+    string s;
+    variant_array[i].Encode(&s);
+    core::PutVarint32(out, s.length());
+    strings::StrAppend(&rest, s);
+  }
+  strings::StrAppend(out, rest);
+}
+
+bool DecodeVariantList(const string& in, Variant* variant_array, int64 n) {
+  std::vector<uint32> sizes(n);
+  StringPiece reader(in);
+  int64 total = 0;
+  for (auto& size : sizes) {
+    if (!core::GetVarint32(&reader, &size)) return false;
+    total += size;
+  }
+  if (total != static_cast<int64>(reader.size())) {
+    return false;
+  }
+
+  for (int i = 0; i < n; ++i) {
+    if (variant_array[i].is_empty()) {
+      variant_array[i] = VariantTensorDataProto();
+    }
+    string str(reader.data(), sizes[i]);
+    if (!variant_array[i].Decode(str)) return false;
+    reader.remove_prefix(sizes[i]);
+  }
+  return true;
+}
+
+}  // end namespace port
+}  // end namespace tensorflow
diff --git a/tensorflow/core/platform/variant_coding.h b/tensorflow/core/platform/variant_coding.h
new file mode 100644
index 0000000000000000000000000000000000000000..34c24811496f56a13c5e3195e61edbad1effca8c
--- /dev/null
+++ b/tensorflow/core/platform/variant_coding.h
@@ -0,0 +1,39 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_PLATFORM_VARIANT_CODING_H_
+#define TENSORFLOW_PLATFORM_VARIANT_CODING_H_
+
+#include "tensorflow/core/framework/variant.h"
+
+#ifdef PLATFORM_GOOGLE
+#include "tensorflow/core/platform/google/variant_cord_coding.h"
+#endif
+
+namespace tensorflow {
+namespace port {
+
+// Encodes an array of Variant objects in to the given string.
+// `variant_array` is assumed to point to an array of `n` Variant objects.
+void EncodeVariantList(const Variant* variant_array, int64 n, string* out);
+
+// Decodes an array of Variant objects from the given string.
+// `variant_array` is assumed to point to an array of `n` Variant objects.
+bool DecodeVariantList(const string& in, Variant* variant_array, int64 n);
+
+}  // end namespace port
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_PLATFORM_VARIANT_CODING_H_
diff --git a/tensorflow/core/platform/windows/env.cc b/tensorflow/core/platform/windows/env.cc
index b8e1c91616a37c93ebb7c90e2ad17b85177b5393..98fcf927ac4a34062a80baef07d0f49dd3c3b057 100644
--- a/tensorflow/core/platform/windows/env.cc
+++ b/tensorflow/core/platform/windows/env.cc
@@ -82,22 +82,42 @@ class WindowsEnv : public Env {
     return new StdThread(thread_options, name, fn);
   }
 
+  static VOID CALLBACK SchedClosureCallback(PTP_CALLBACK_INSTANCE Instance,
+                                            PVOID Context, PTP_WORK Work) {
+    CloseThreadpoolWork(Work);
+    std::function<void()>* f = (std::function<void()>*)Context;
+    (*f)();
+    delete f;
+  }
   void SchedClosure(std::function<void()> closure) override {
-    // TODO(b/27290852): Spawning a new thread here is wasteful, but
-    // needed to deal with the fact that many `closure` functions are
-    // blocking in the current codebase.
-    std::thread closure_thread(closure);
-    closure_thread.detach();
+    PTP_WORK work = CreateThreadpoolWork(
+        SchedClosureCallback, new std::function<void()>(std::move(closure)),
+        nullptr);
+    SubmitThreadpoolWork(work);
+  }
+
+  static VOID CALLBACK SchedClosureAfterCallback(PTP_CALLBACK_INSTANCE Instance,
+                                                 PVOID Context,
+                                                 PTP_TIMER Timer) {
+    CloseThreadpoolTimer(Timer);
+    std::function<void()>* f = (std::function<void()>*)Context;
+    (*f)();
+    delete f;
   }
 
   void SchedClosureAfter(int64 micros, std::function<void()> closure) override {
-    // TODO(b/27290852): Consuming a thread here is wasteful, but this
-    // code is (currently) only used in the case where a step fails
-    // (AbortStep). This could be replaced by a timer thread
-    SchedClosure([this, micros, closure]() {
-      SleepForMicroseconds(micros);
-      closure();
-    });
+    PTP_TIMER timer = CreateThreadpoolTimer(
+        SchedClosureAfterCallback,
+        new std::function<void()>(std::move(closure)), nullptr);
+    // in 100 nanosecond units
+    FILETIME FileDueTime;
+    ULARGE_INTEGER ulDueTime;
+    // Negative indicates the amount of time to wait is relative to the current
+    // time.
+    ulDueTime.QuadPart = (ULONGLONG) - (10 * micros);
+    FileDueTime.dwHighDateTime = ulDueTime.HighPart;
+    FileDueTime.dwLowDateTime = ulDueTime.LowPart;
+    SetThreadpoolTimer(timer, &FileDueTime, 0, 0);
   }
 
   Status LoadLibrary(const char *library_filename, void** handle) override {
@@ -152,4 +172,21 @@ Env* Env::Default() {
   return default_env;
 }
 
+void Env::GetLocalTempDirectories(std::vector<string>* list) {
+  list->clear();
+  // On windows we'll try to find a directory in this order:
+  //   C:/Documents & Settings/whomever/TEMP (or whatever GetTempPath() is)
+  //   C:/TMP/
+  //   C:/TEMP/
+  //   C:/WINDOWS/ or C:/WINNT/
+  //   .
+  char tmp[MAX_PATH];
+  // GetTempPath can fail with either 0 or with a space requirement > bufsize.
+  // See http://msdn.microsoft.com/en-us/library/aa364992(v=vs.85).aspx
+  DWORD n = GetTempPathA(MAX_PATH, tmp);
+  if (n > 0 && n <= MAX_PATH) list->push_back(tmp);
+  list->push_back("C:\\tmp\\");
+  list->push_back("C:\\temp\\");
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/BUILD b/tensorflow/core/profiler/BUILD
similarity index 79%
rename from tensorflow/tools/tfprof/BUILD
rename to tensorflow/core/profiler/BUILD
index 57cccd8921be6b16127e1e759287bd50c4649dac..c930ae3bd2d479acd7dde94e318d7dce50e69f68 100644
--- a/tensorflow/tools/tfprof/BUILD
+++ b/tensorflow/core/profiler/BUILD
@@ -20,8 +20,8 @@ filegroup(
 )
 
 cc_binary(
-    name = "tfprof",
-    srcs = ["tfprof_main.cc"],
+    name = "profiler",
+    srcs = ["profiler.cc"],
     deps = [
         ":protos_all_cc",
         "//tensorflow/c:c_api",
@@ -30,9 +30,10 @@ cc_binary(
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/tools/tfprof/internal:tfprof_options",
-        "//tensorflow/tools/tfprof/internal:tfprof_stats",
-        "//tensorflow/tools/tfprof/internal:tfprof_utils",
+        "//tensorflow/core/profiler/internal:tfprof_options",
+        "//tensorflow/core/profiler/internal:tfprof_stats",
+        "//tensorflow/core/profiler/internal:tfprof_utils",
+        "//tensorflow/core/profiler/internal/advisor:tfprof_advisor",
         "@linenoise//:linenoise",
     ],
 )
diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d55824e6f269e539bcf7b137d121cb518e168329
--- /dev/null
+++ b/tensorflow/core/profiler/README.md
@@ -0,0 +1,210 @@
+<h1>TensorFlow Profiler and Advisor</h1>
+
+* [Features](#features)
+* [Interfaces](#interfaces)
+* [Tutorials](#tutorials)
+* [Demo](#demo)
+* [Feature Request and Bug Report](#feature-request-and-bug-report)
+
+Contact for bug report and feature request (xpan@google.com)
+
+### Features
+
+* Profile model architectures.
+  * parameters, tensor shapes, float operations, device placement, etc.
+* Profile multiple-steps model performance.
+  * execution time, memory consumption.
+* Auto profile and advise.
+  * accelerator utilization check
+  * expensive operation check
+  * operation configuration check
+  * distributed runtime check (Not OSS)
+
+### Interfaces
+
+* Python API
+* Command Line
+* Visualization
+* C++ API (Not public, contact us if needed.)
+
+tfprof provides 4 different views to organize the profiles.
+
+    *  code view: operations are grouped by Python codes that generate them.
+    *  op view: operations are grouped by operation type (E.g. MatMul, Conv2D).
+    *  scope view: operations are organized based on name scope hierarchies.
+    *  graph view: operations are organized based on input/output.
+
+tfprof provides options to help user select, filter and order statistics.
+See [Options](g3doc/options.md) for detail instructions.
+
+```
+-max_depth                  10
+-min_bytes                  0
+-min_micros                 0
+-min_params                 0
+-min_float_ops              0
+-min_occurrence             0
+-step                       -1
+-order_by                   name
+-account_type_regexes       .*
+-start_name_regexes         .*
+-trim_name_regexes
+-show_name_regexes          .*
+-hide_name_regexes
+-account_displayed_op_only  false
+-select                     params
+-output                     stdout:
+```
+
+### Tutorials
+
+*  [Python API](g3doc/python_api.md)
+*  [Command Line Interface](g3doc/command_line.md)
+*  [Profile Time](g3doc/profile_time.md)
+*  [Profile Memory](g3doc/profile_memory.md)
+*  [Profile Model Architecture](g3doc/profile_model_architecture.md)
+*  [Auto Detect and Advise](g3doc/advise.md)
+*  [Options](g3doc/options.md)
+
+## Demo
+
+### Attribute TensorFlow graph running time to your Python codes.
+```shell
+tfprof> code -max_depth 1000 -show_name_regexes .*model_analyzer.*py.* -select micros -account_type_regexes .* -order_by micros
+_TFProfRoot (0us/22.44ms)
+  model_analyzer_test.py:149:run_filename_as_m...:none (0us/22.44ms)
+    model_analyzer_test.py:33:_run_code_in_main:none (0us/22.44ms)
+      model_analyzer_test.py:208:<module>:test.main() (0us/22.44ms)
+        model_analyzer_test.py:132:testComplexCodeView:x = lib.BuildFull... (0us/22.44ms)
+          model_analyzer_testlib.py:63:BuildFullModel:return sgd_op.min... (0us/21.83ms)
+          model_analyzer_testlib.py:58:BuildFullModel:cell, array_ops.c... (0us/333us)
+          model_analyzer_testlib.py:54:BuildFullModel:seq.append(array_... (0us/254us)
+            model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0us/134us)
+            model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0us/40us)
+            ...
+          model_analyzer_testlib.py:61:BuildFullModel:loss = nn_ops.l2_... (0us/28us)
+          model_analyzer_testlib.py:60:BuildFullModel:target = array_op... (0us/0us)
+        model_analyzer_test.py:134:testComplexCodeView:sess.run(variable... (0us/0us)
+```
+
+### Show your model variables and the number of parameters.
+```
+tfprof> scope -account_type_regexes VariableV2 -max_depth 4 -select params
+_TFProfRoot (--/930.58k params)
+  global_step (1/1 params)
+  init/init_conv/DW (3x3x3x16, 432/864 params)
+  pool_logit/DW (64x10, 640/1.28k params)
+    pool_logit/DW/Momentum (64x10, 640/640 params)
+  pool_logit/biases (10, 10/20 params)
+    pool_logit/biases/Momentum (10, 10/10 params)
+  unit_last/final_bn/beta (64, 64/128 params)
+  unit_last/final_bn/gamma (64, 64/128 params)
+  unit_last/final_bn/moving_mean (64, 64/64 params)
+  unit_last/final_bn/moving_variance (64, 64/64 params)
+```
+
+### Show the most expensive operation types.
+```
+tfprof> op -select micros,bytes,occurrence -order_by micros
+node name | output bytes | total execution time | accelerator execution time | cpu execution time | op occurrence (run|defined)
+SoftmaxCrossEntropyWithLogits      36.58MB (100.00%, 0.05%),      1.37sec (100.00%, 26.68%),           0us (100.00%, 0.00%),      1.37sec (100.00%, 30.75%),      30|30
+MatMul                        2720.57MB (99.95%, 3.66%),      708.14ms (73.32%, 13.83%),     280.76ms (100.00%, 41.42%),       427.39ms (69.25%, 9.62%),  2694|3450
+ConcatV2                       741.37MB (96.29%, 1.00%),       389.63ms (59.49%, 7.61%),        31.80ms (58.58%, 4.69%),       357.83ms (59.63%, 8.05%),  4801|6098
+Mul                           3957.24MB (95.29%, 5.33%),       338.02ms (51.88%, 6.60%),       80.88ms (53.88%, 11.93%),       257.14ms (51.58%, 5.79%),  7282|9427
+Add                            740.05MB (89.96%, 1.00%),       321.76ms (45.28%, 6.28%),        13.50ms (41.95%, 1.99%),       308.26ms (45.79%, 6.94%),  1699|2180
+Sub                             32.46MB (88.97%, 0.04%),       216.20ms (39.00%, 4.22%),          241us (39.96%, 0.04%),       215.96ms (38.85%, 4.86%),  1780|4372
+Slice                          708.07MB (88.92%, 0.95%),       179.88ms (34.78%, 3.51%),        25.38ms (39.92%, 3.74%),       154.50ms (33.99%, 3.48%),  5800|7277
+AddN                           733.21MB (87.97%, 0.99%),       158.36ms (31.26%, 3.09%),        50.10ms (36.18%, 7.39%),       108.26ms (30.51%, 2.44%),  4567|5481
+Fill                           954.27MB (86.98%, 1.28%),       138.29ms (28.17%, 2.70%),        16.21ms (28.79%, 2.39%),       122.08ms (28.08%, 2.75%),  3278|9686
+Select                         312.33MB (85.70%, 0.42%),       104.75ms (25.47%, 2.05%),        18.30ms (26.40%, 2.70%),        86.45ms (25.33%, 1.95%),  2880|5746
+ApplyAdam                      231.65MB (85.28%, 0.31%),        92.66ms (23.43%, 1.81%),            0us (23.70%, 0.00%),        92.66ms (23.38%, 2.09%),      27|27
+```
+
+### Auto-profile.
+
+```
+tfprof> advise
+Not running under xxxx. Skip JobChecker.
+
+AcceleratorUtilizationChecker:
+device: /job:worker/replica:0/task:0/gpu:0 low utilization: 0.03
+device: /job:worker/replica:0/task:0/gpu:1 low utilization: 0.08
+device: /job:worker/replica:0/task:0/gpu:2 low utilization: 0.04
+device: /job:worker/replica:0/task:0/gpu:3 low utilization: 0.21
+
+OperationChecker:
+Found operation using NHWC data_format on GPU. Maybe NCHW is faster.
+
+JobChecker:
+
+ExpensiveOperationChecker:
+top 1 operation type: SoftmaxCrossEntropyWithLogits, cpu: 1.37sec, accelerator: 0us, total: 1.37sec (26.68%)
+top 2 operation type: MatMul, cpu: 427.39ms, accelerator: 280.76ms, total: 708.14ms (13.83%)
+top 3 operation type: ConcatV2, cpu: 357.83ms, accelerator: 31.80ms, total: 389.63ms (7.61%)
+top 1 graph node: seq2seq/loss/sampled_sequence_loss/sequence_loss_by_example/SoftmaxCrossEntropyWithLogits_11, cpu: 89.92ms, accelerator: 0us, total: 89.92ms
+top 2 graph node: train_step/update_seq2seq/output_projection/w/ApplyAdam, cpu: 84.52ms, accelerator: 0us, total: 84.52ms
+top 3 graph node: seq2seq/loss/sampled_sequence_loss/sequence_loss_by_example/SoftmaxCrossEntropyWithLogits_19, cpu: 73.02ms, accelerator: 0us, total: 73.02ms
+seq2seq_attention_model.py:360:build_graph:self._add_seq2seq(), cpu: 3.16sec, accelerator: 214.84ms, total: 3.37sec
+  seq2seq_attention_model.py:293:_add_seq2seq:decoder_outputs, ..., cpu: 2.46sec, accelerator: 3.25ms, total: 2.47sec
+    seq2seq_lib.py:181:sampled_sequence_...:average_across_ti..., cpu: 2.46sec, accelerator: 3.24ms, total: 2.47sec
+      seq2seq_lib.py:147:sequence_loss_by_...:crossent = loss_f..., cpu: 2.46sec, accelerator: 3.06ms, total: 2.46sec
+      seq2seq_lib.py:148:sequence_loss_by_...:log_perp_list.app..., cpu: 1.33ms, accelerator: 120us, total: 1.45ms
+  seq2seq_attention_model.py:192:_add_seq2seq:sequence_length=a..., cpu: 651.56ms, accelerator: 158.92ms, total: 810.48ms
+    seq2seq_lib.py:104:bidirectional_rnn:sequence_length, ..., cpu: 306.58ms, accelerator: 73.54ms, total: 380.12ms
+      core_rnn.py:195:static_rnn:state_size=cell.s..., cpu: 306.52ms, accelerator: 73.54ms, total: 380.05ms
+    seq2seq_lib.py:110:bidirectional_rnn:initial_state_bw,..., cpu: 296.21ms, accelerator: 73.54ms, total: 369.75ms
+      core_rnn.py:195:static_rnn:state_size=cell.s..., cpu: 296.11ms, accelerator: 73.54ms, total: 369.65ms
+    seq2seq_lib.py:113:bidirectional_rnn:outputs = [tf.con..., cpu: 46.88ms, accelerator: 3.87ms, total: 50.75ms
+  seq2seq_attention_model.py:253:_add_seq2seq:initial_state_att..., cpu: 32.48ms, accelerator: 50.01ms, total: 82.50ms
+    seq2seq.py:693:attention_decoder:attns = attention..., cpu: 11.73ms, accelerator: 38.41ms, total: 50.14ms
+      seq2seq.py:653:attention:s = math_ops.redu..., cpu: 2.62ms, accelerator: 17.80ms, total: 20.41ms
+      seq2seq.py:658:attention:array_ops.reshape..., cpu: 1.90ms, accelerator: 12.08ms, total: 13.98ms
+      seq2seq.py:655:attention:a = nn_ops.softma..., cpu: 4.15ms, accelerator: 4.25ms, total: 8.40ms
+    seq2seq.py:686:attention_decoder:cell_output, stat..., cpu: 14.43ms, accelerator: 4.85ms, total: 19.27ms
+    seq2seq.py:696:attention_decoder:output = linear([..., cpu: 3.04ms, accelerator: 2.88ms, total: 5.93ms
+      core_rnn_cell_impl.py:1009:_linear:res = math_ops.ma..., cpu: 2.33ms, accelerator: 2.71ms, total: 5.04ms
+seq2seq_attention_model.py:363:build_graph:self._add_train_o..., cpu: 1.28sec, accelerator: 462.93ms, total: 1.74sec
+  seq2seq_attention_model.py:307:_add_train_op:tf.gradients(self..., cpu: 967.84ms, accelerator: 462.88ms, total: 1.43sec
+    gradients_impl.py:563:gradients:grad_scope, op, f..., cpu: 692.60ms, accelerator: 390.75ms, total: 1.08sec
+    gradients_impl.py:554:gradients:out_grads[i] = co..., cpu: 164.71ms, accelerator: 16.21ms, total: 180.92ms
+      control_flow_ops.py:1314:ZerosLikeOutsideL...:return array_ops...., cpu: 121.85ms, accelerator: 16.21ms, total: 138.05ms
+      control_flow_ops.py:1313:ZerosLikeOutsideL...:zeros_shape = arr..., cpu: 22.85ms, accelerator: 0us, total: 22.85ms
+      control_flow_ops.py:1312:ZerosLikeOutsideL...:switch_val = swit..., cpu: 20.02ms, accelerator: 0us, total: 20.02ms
+    gradients_impl.py:515:gradients:out_grads = _Aggr..., cpu: 108.69ms, accelerator: 51.92ms, total: 160.61ms
+      gradients_impl.py:846:_AggregatedGrads:out_grads[i] = _M..., cpu: 107.99ms, accelerator: 50.05ms, total: 158.04ms
+      gradients_impl.py:856:_AggregatedGrads:array_ops.concat(..., cpu: 340us, accelerator: 1.87ms, total: 2.21ms
+  seq2seq_attention_model.py:322:_add_train_op:zip(grads, tvars)..., cpu: 307.56ms, accelerator: 0us, total: 307.56ms
+    optimizer.py:456:apply_gradients:update_ops.append..., cpu: 307.43ms, accelerator: 0us, total: 307.43ms
+      optimizer.py:102:update_op:return optimizer...., cpu: 222.66ms, accelerator: 0us, total: 222.66ms
+      optimizer.py:97:update_op:return optimizer...., cpu: 84.76ms, accelerator: 0us, total: 84.76ms
+```
+
+### Visualize time and memory.
+```
+tfprof> graph -step 0 -max_depth 100000 -output timeline:outfile=<filename>
+
+generating trace file.
+
+******************************************************
+Timeline file is written to <filename>.
+Open a Chrome browser, enter URL chrome://tracing and load the timeline file.
+******************************************************
+```
+<left>
+[CodeTimeline](g3doc/graph_timeline.png)
+</left>
+
+
+### Feature Request and Bug Report
+
+Contact: xpan@google.com
+
+Providing `GraphDef` and `RunMetadata` file will greatly help
+bug fix. `OpLogProto` is a good plus if it is used.
+
+
+#### Teams
+
+* Xin Pan (xpan@google.com, github: panyx0718)
+* Yao Zhang
+* Jon Shlens
diff --git a/tensorflow/core/profiler/g3doc/advise.md b/tensorflow/core/profiler/g3doc/advise.md
new file mode 100644
index 0000000000000000000000000000000000000000..cc16c8fdffde996b0f76ed74e6f7096adb33df4c
--- /dev/null
+++ b/tensorflow/core/profiler/g3doc/advise.md
@@ -0,0 +1,108 @@
+## Auto Detect and Advise
+
+tfprof analyzes profiles and generates advises for common issues.
+
+### Run Advise.
+
+```python
+# First create a profiler. See profiler tutorials for more details.
+profiler = tf.profiler.Profiler(sess.graph)
+run_meta = config_pb2.RunMetadata()
+_ = sess.run(r1,
+             options=config_pb2.RunOptions(
+                 trace_level=config_pb2.RunOptions.FULL_TRACE),
+             run_metadata=run_meta)
+profiler.add_step(1, run_meta)
+
+# Then Start advise.
+profiler.advise()
+
+# For one-shot API
+tf.profiler.advise(
+    sess.graph, run_meta=run_metadata)
+```
+
+```shell
+# Run advisor on CLI
+# See CLI tutorial on generating the files.
+tfprof --graph_path=graph.pbtxt \
+       --run_meta_path=run_metadata \
+       --op_log_path=tfprof_log
+
+tfprof> advise
+AcceleratorUtilizationChecker:
+device: /job:worker/replica:0/task:0/gpu:0 low utilization: 0.03
+device: /job:worker/replica:0/task:0/gpu:1 low utilization: 0.08
+device: /job:worker/replica:0/task:0/gpu:2 low utilization: 0.04
+device: /job:worker/replica:0/task:0/gpu:3 low utilization: 0.21
+
+OperationChecker:
+Found operation using NHWC data_format on GPU. Maybe NCHW is faster.
+
+ExpensiveOperationChecker:
+top 1 operation type: SoftmaxCrossEntropyWithLogits, cpu: 1.37sec, accelerator: 0us, total: 1.37sec (26.68%)
+top 2 operation type: MatMul, cpu: 427.39ms, accelerator: 280.76ms, total: 708.14ms (13.83%)
+top 3 operation type: ConcatV2, cpu: 357.83ms, accelerator: 31.80ms, total: 389.63ms (7.61%)
+seq2seq_attention_model.py:360:build_graph:self._add_seq2seq(), cpu: 3.16sec, accelerator: 214.84ms, total: 3.37sec
+  seq2seq_attention_model.py:293:_add_seq2seq:decoder_outputs, ..., cpu: 2.46sec, accelerator: 3.25ms, total: 2.47sec
+    seq2seq_lib.py:181:sampled_sequence_...:average_across_ti..., cpu: 2.46sec, accelerator: 3.24ms, total: 2.47sec
+      seq2seq_lib.py:147:sequence_loss_by_...:crossent = loss_f..., cpu: 2.46sec, accelerator: 3.06ms, total: 2.46sec
+        seq2seq_attention_model.py:289:sampled_loss_func:num_classes=vsize), cpu: 2.46sec, accelerator: 3.06ms, total: 2.46sec
+        seq2seq_attention_model.py:282:sampled_loss_func:labels = tf.resha..., cpu: 164us, accelerator: 0us, total: 164us
+      seq2seq_lib.py:148:sequence_loss_by_...:log_perp_list.app..., cpu: 1.33ms, accelerator: 120us, total: 1.45ms
+      seq2seq_lib.py:151:sequence_loss_by_...:total_size = tf.a..., cpu: 154us, accelerator: 23us, total: 177us
+    seq2seq_lib.py:184:sampled_sequence_...:return cost / tf...., cpu: 97us, accelerator: 8us, total: 105us
+      math_ops.py:690:cast:return gen_math_o..., cpu: 62us, accelerator: 3us, total: 65us
+      math_ops.py:839:binary_op_wrapper:return func(x, y,..., cpu: 35us, accelerator: 5us, total: 40us
+  seq2seq_attention_model.py:192:_add_seq2seq:sequence_length=a..., cpu: 651.56ms, accelerator: 158.92ms, total: 810.48ms
+    seq2seq_lib.py:104:bidirectional_rnn:sequence_length, ..., cpu: 306.58ms, accelerator: 73.54ms, total: 380.12ms
+      core_rnn.py:195:static_rnn:state_size=cell.s..., cpu: 306.52ms, accelerator: 73.54ms, total: 380.05ms
+        rnn.py:218:_rnn_step:_maybe_copy_some_..., cpu: 303.76ms, accelerator: 73.54ms, total: 377.30ms
+        rnn.py:216:_rnn_step:time >= max_seque..., cpu: 2.75ms, accelerator: 0us, total: 2.75ms
+      core_rnn.py:179:static_rnn:max_sequence_leng..., cpu: 67us, accelerator: 0us, total: 67us
+    seq2seq_lib.py:110:bidirectional_rnn:initial_state_bw,..., cpu: 296.21ms, accelerator: 73.54ms, total: 369.75ms
+      core_rnn.py:195:static_rnn:state_size=cell.s..., cpu: 296.11ms, accelerator: 73.54ms, total: 369.65ms
+        rnn.py:218:_rnn_step:_maybe_copy_some_..., cpu: 292.04ms, accelerator: 73.54ms, total: 365.58ms
+        rnn.py:216:_rnn_step:time >= max_seque..., cpu: 4.07ms, accelerator: 0us, total: 4.07ms
+      core_rnn.py:178:static_rnn:min_sequence_leng..., cpu: 85us, accelerator: 0us, total: 85us
+      core_rnn.py:179:static_rnn:max_sequence_leng..., cpu: 16us, accelerator: 0us, total: 16us
+    seq2seq_lib.py:113:bidirectional_rnn:outputs = [tf.con..., cpu: 46.88ms, accelerator: 3.87ms, total: 50.75ms
+ ...(omitted)
+top 1 graph node: seq2seq/loss/sampled_sequence_loss/sequence_loss_by_example/SoftmaxCrossEntropyWithLogits_11, cpu: 89.92ms, accelerator: 0us, total: 89.92ms
+top 2 graph node: train_step/update_seq2seq/output_projection/w/ApplyAdam, cpu: 84.52ms, accelerator: 0us, total: 84.52ms
+top 3 graph node: seq2seq/loss/sampled_sequence_loss/sequence_loss_by_example/SoftmaxCrossEntropyWithLogits_19, cpu: 73.02ms, accelerator: 0us, total: 73.02ms
+```
+
+### Checker
+
+There is no magic behind advise mode. tfprof builds the profiles first, then
+it runs through a list of `Checkers`, each one responsible for checking one
+area with the profile and report issues. A `Checker` is like a plugin.
+
+For example:
+
+#### JobChecker (Not Available OSS)
+
+*   Checks RecvTensor RPC latency and bandwidth.
+*   Checks CPU/Memory utilization of the job.
+
+####AcceleratorUtilization Checker
+* Checks what percentage of time the accelerator spends on computation.
+
+#### OperationChecker
+
+*   Checks whether the operation runs with optimal options.
+*   Checks if there is a better implementation to replace the current operation.
+
+#### ExpensiveOperationChecker
+
+*   Checks the most expensive operation type.
+*   Checks the most expensive graph nodes.
+*   Checks the most expensive graph-building Python codes.
+
+####Contribute Your Checker
+
+Follow examples of accelerator_utilization_checker.h
+
+
+
diff --git a/tensorflow/tools/tfprof/g3doc/code_timeline.png b/tensorflow/core/profiler/g3doc/code_timeline.png
similarity index 100%
rename from tensorflow/tools/tfprof/g3doc/code_timeline.png
rename to tensorflow/core/profiler/g3doc/code_timeline.png
diff --git a/tensorflow/tools/tfprof/g3doc/command_line.md b/tensorflow/core/profiler/g3doc/command_line.md
similarity index 88%
rename from tensorflow/tools/tfprof/g3doc/command_line.md
rename to tensorflow/core/profiler/g3doc/command_line.md
index 9f0de72e07ee20722cf17390a3f54a85d273b06c..857b5e64590db193baa6e7f836634745f35eb5dc 100644
--- a/tensorflow/tools/tfprof/g3doc/command_line.md
+++ b/tensorflow/core/profiler/g3doc/command_line.md
@@ -16,12 +16,12 @@
 
 tfprof command line tool uses the following inputs:
 
-<b>--graph_path:</b> GraphDef text file (required). Used to build in-memory
-architecture of the model. For example, graph.pbtxt written by tf.Supervisor
+<b>--graph_path:</b> GraphDef proto file (required). Used to build in-memory
+data structure of the model. For example, graph.pbtxt written by tf.Supervisor
 can be passed to --graph_path. You can also easily get GraphDef using
 tf.get_default_graph().as_graph_def(add_shapes=True) or other API.
 
-<b>--run_meta_path:</b> tensorflow::RunMetadata (optional).
+<b>--run_meta_path:</b> RunMetadata proto file (optional).
 Used to get the memory consumption and execution time of
 each op of the model.
 
@@ -36,18 +36,19 @@ with tf.gfile.Open(os.path.join(output_dir, "run_meta"), "w") as f:
 ```
 
 <b>--op_log_path:</b>
-tensorflow::tfprof::OpLog (optional). A proto used to provide extra operation
+tensorflow.tfprof.OpLogProto (optional). A proto used to provide extra operation
 information. 1) float operations. 2) code traces. 3) define customized operation
-type for -account_type_regexes option.
+type for `-account_type_regexes` option.
 
-The following code snippet writes a OpLog file.
+The following code snippet writes a OpLogProto file.
 
 ```python
-tf.contrib.tfprof.tfprof_logger.write_op_log(graph, log_dir, op_log=None)
+tf.profiler.write_op_log(graph, log_dir, op_log=None)
 ```
 
 <b>--checkpoint_path:</b> TensorFlow checkpoint (optional).
 It defines _checkpoint_variable op type. It also provides checkpointed tensors' values.
+Note: this feature is not well maintained now.
 
 
 ###Start `tfprof`
@@ -56,10 +57,10 @@ It defines _checkpoint_variable op type. It also provides checkpointed tensors'
 
 ```shell
 # Build the tool.
-bazel build --config opt tensorflow/tools/tfprof/...
+bazel build --config opt third_party/tensorflow/core/profiler/...
 
 # Help information, including detail 'option' instructions.
-bazel-bin/tensorflow/tools/tfprof/tfprof help
+bazel-bin/tensorflow/core/profiler/profiler help
 ```
 
 #### Start `tfprof` Interactive Mode
@@ -72,17 +73,17 @@ bazel-bin/tensorflow/tools/tfprof/tfprof help
 # --checkpoint_path contains the model checkpoint data.
 #
 # Only includes model architecture, parameters and shapes.
-bazel-bin/tensorflow/tools/tfprof/tfprof \
+bazel-bin/tensorflow/core/profiler/profiler \
     --graph_path=graph.pbtxt
 #
 # Additionally profile ops memory and timing.
-bazel-bin/tensorflow/tools/tfprof/tfprof \
+bazel-bin/tensorflow/core/profiler/profiler \
     --graph_path=graph.pbtxt \
     --run_meta_path=run_meta \
 #
 # tfprof_log is used to define customized op types, float ops and code traces.
 # Use tfprof_logger.write_op_log() to create tfprof_log.
-bazel-bin/tensorflow/tools/tfprof/tfprof \
+bazel-bin/tensorflow/core/profiler/profiler \
     --graph_path=graph.pbtxt \
     --run_meta_path=run_meta \
     --op_log_path=tfprof_log \
@@ -90,7 +91,7 @@ bazel-bin/tensorflow/tools/tfprof/tfprof \
 # Additionally profile checkpoint statistics and values.
 # Use '-account_type_regexes _checkpoint_variables' to select
 # checkpoint tensors.
-bazel-bin/tensorflow/tools/tfprof/tfprof \
+bazel-bin/tensorflow/core/profiler/profiler \
     --graph_path=graph.pbtxt \
     --run_meta_path=run_meta \
     --op_log_path=tfprof_log \
@@ -101,7 +102,7 @@ bazel-bin/tensorflow/tools/tfprof/tfprof \
 
 ```python
 # Runs tfprof in one-shot.
-bazel-bin/tensorflow/tools/tfprof/tfprof scope \
+bazel-bin/tensorflow/core/profiler/profiler scope \
     --graph_path=graph.pbtxt \
     --max_depth=3
 ```
@@ -211,7 +212,7 @@ _TFProfRoot (--/464.15k params)
 ```
 
 Where does `_trainable_variables` come from? It is customized operation type
-defined through the OpLog file.
+defined through the OpLogProto file.
 Users can [Define Customized Operation Type](#define-customized-operation-type)
 
 <b>Following example shows importance of defining customized operation type.</b>
@@ -243,7 +244,7 @@ In tfprof, 'device' is an op_type. For example, if op1 and op2 are placed on
 gpu:0. They share an operation type.
 
 ```shell
-bazel-bin/tensorflow/tools/tfprof/tfprof \
+bazel-bin/tensorflow/core/profiler/profiler \
   --graph_path=/tmp/graph.pbtxt  \
   --run_meta_path=/tmp/run_meta
 
@@ -256,12 +257,11 @@ _TFProfRoot (--/58.84m params)
 
 #### Define Customized Operation Type
 
-First, in Python code, create an `OpLog` proto and add op type
+First, in Python code, create an `OpLogProto` proto and add op type
 information to it:
 
 ```python
-
-op_log = tfprof_log_pb2.OpLog()
+op_log = tfprof_log_pb2.OpLogProto()
 entry = op_log.log_entries.add()
 entry.name = 'pool_logit/DW'
 entry.types.append('pool_logit')
@@ -270,19 +270,19 @@ entry.name = 'pool_logit/biases'
 entry.types.append('pool_logit')
 ```
 
-Second, call write_op_log to write the OpLog proto.
+Second, call write_op_log to write the OpLogProto proto.
 
 ```python
-tf.contrib.tfprof.tfprof_logger.write_op_log(
+tf.profiler.write_op_log(
     sess.graph, /tmp/my_op_log_dir, op_log)
 
 # Get run-time shape information in order to fill shapes and get flops.
-tf.contrib.tfprof.tfprof_logger.write_op_log(
+tf.profiler.write_op_log(
     sess.graph, /tmp/my_op_log_dir, op_log, run_meta)
 ```
 
 Third, when starting the tfprof tool, specify
-"--op_log_path /tmp/my_op_log_dir/op_log"
+"--op_log_path=/tmp/my_op_log_dir/op_log"
 
 ```shell
 tfprof> scope -account_type_regexes pool_logit -max_depth 4 -select params
@@ -291,7 +291,7 @@ _TFProfRoot (--/650 params)
   pool_logit/biases (10, 10/10 params)
 ```
 
-Note that `tf.contrib.tfprof.tfprof_logger.write_op_log(...)` automatically
+Note that `tf.profiler.write_op_log(...)` automatically
 assigns all `Variables` inside `tf.trainable_variables()` a customized
 operation type: `_trainable_variables`.
 
diff --git a/tensorflow/tools/tfprof/g3doc/graph_timeline.png b/tensorflow/core/profiler/g3doc/graph_timeline.png
similarity index 100%
rename from tensorflow/tools/tfprof/g3doc/graph_timeline.png
rename to tensorflow/core/profiler/g3doc/graph_timeline.png
diff --git a/tensorflow/core/profiler/g3doc/options.md b/tensorflow/core/profiler/g3doc/options.md
new file mode 100644
index 0000000000000000000000000000000000000000..950837932456f9cfbce4f54baab7a2a72f2925f2
--- /dev/null
+++ b/tensorflow/core/profiler/g3doc/options.md
@@ -0,0 +1,89 @@
+##Options
+
+###Overview
+
+For all tfprof views, the profiles are processed with the following procedures
+
+1) An in-memory data structure is built represent the view.
+
+   *  graph view. Graph. Each profiler node corresponds to a
+      TensorFlow graph node.
+   *  scope view. Tree. Each profiler node corresponds to a
+      TensorFlow graph node.
+   *  code view. Tree. Each profiler node includes to all TensorFlow
+      graph nodes created by the profiler node (python code).
+   *  op view. List. Each profiler node includes to all TensorFlow
+      graph nodes belonging to an operation type.
+
+2) `-account_type_regexes` is used to first select the nodes that includes
+   the specified operation types. An operation has its default type
+   (e.g. MatMul, Conv2D). `tfprof` also considers device as operation type.
+   User can also define customized operation type. Hence, an operation has
+   multiple types. Profiler nodes containing matched
+   types are selected for display and their statistics are aggregated by the
+   parents of the in-memory data structure.
+
+3) Various `-xxx_name_regexes`,  `-min_xxx`, `-max_depth` etc options are then
+   applied to further filter based on profiler node names and statistics.
+   It's no limited operation name. In code view,
+   it's the code string. In op view, it's the operation type name. Different
+   from `-account_type_regexes`, Statistics are used even if a profiler node is not displayed.
+   For example, in code view, a callee might be hidden, but its statistics is
+   still aggregated by it's caller. `-account_displayed_op_only`, however,
+   breaks the rule and only aggregates statistics of displayed names.
+
+4) Finally, the filtered data structure is output in a format depending
+   on the `-output` option.
+
+####Option Semantics In Different View
+options usually have the same semantics in different views. However, some
+can vary. For example `-max_depth` in scope view means the depth of
+name scope <b>tree</b>. In op view, it means the length of operation <b>list</b>.
+In graph view, in means the number of hops in the <b>graph</b>.
+
+
+###Docs
+
+`-max_depth`: Show nodes that are at most this number of hops from starting node in the data structure.
+
+`-min_bytes`: Show nodes that request at least this number of bytes.
+
+`-min_micros`: Show nodes that spend at least this number of microseconds to run.
+
+`-min_params`: Show nodes that contains at least this number of parameters.
+
+`-min_float_ops`: Show nodes that contain at least this number of float operations. Only available if an node has op.RegisterStatistics() defined and OpLogProto is provided
+
+`-min_occurrence`: Show nodes that appear at least this number of times..
+
+`-step`: Show the stats of the this step when multiple steps of RunMetadata were added. By default, show the average of all steps."
+
+`-order_by`: Order the results by [name|depth|bytes|micros|accelerator_micros|cpu_micros|params|float_ops|occurrence]
+
+`-account_type_regexes`: Account and display the nodes whose types match one of the type regexes specified. tfprof allow user to define extra operation types for graph nodes through tensorflow.tfprof.OpLogProto proto. regexes are comma-sperated.
+
+`-start_name_regexes`: Show node starting from the node that matches the regexes, recursively. regexes are comma-separated.
+
+`-trim_name_regexes`: Hide node starting from the node that matches the regexes, recursively, regexes are comma-seprated.
+
+`-show_name_regexes`: Show node that match the regexes. regexes are comma-seprated.
+
+`-hide_name_regexes`: Hide node that match the regexes. regexes are comma-seprated.
+
+`-account_displayed_op_only`: If True, only account the statistics of ops eventually displayed. If False, account all op statistics matching -account_type_regexes recursively.
+
+
+Notes: See <b>overview</b> sesion on how does above options play with each other to decide the output and counting.
+
+`-select`: Comma-separated list of attributes to show. Supported attributes:
+[bytes|micros|accelerator_micros|cpu_micros|params|float_ops|occurrence|tensor_value|device|op_types|input_shapes].
+
+`-output`: Output results as stdout, file or timeline.
+The format is ```output_type:key=value,key=value```.
+For example: ```-output timeline:outfile=<filename>```.
+
+```shell
+timeline: key=outfile, value=<filename>.
+stdout: none.
+file: key=outfile, value=<filename>.
+```
diff --git a/tensorflow/tools/tfprof/g3doc/profile_memory.md b/tensorflow/core/profiler/g3doc/profile_memory.md
similarity index 100%
rename from tensorflow/tools/tfprof/g3doc/profile_memory.md
rename to tensorflow/core/profiler/g3doc/profile_memory.md
diff --git a/tensorflow/tools/tfprof/g3doc/profile_model_architecture.md b/tensorflow/core/profiler/g3doc/profile_model_architecture.md
similarity index 84%
rename from tensorflow/tools/tfprof/g3doc/profile_model_architecture.md
rename to tensorflow/core/profiler/g3doc/profile_model_architecture.md
index 5ad5a56513b8c60f0bfe2e2ef426f09b8d61ea72..a42b2e918da0c40e3c8caff1400b25927b46d9c9 100644
--- a/tensorflow/tools/tfprof/g3doc/profile_model_architecture.md
+++ b/tensorflow/core/profiler/g3doc/profile_model_architecture.md
@@ -30,12 +30,12 @@ _TFProfRoot (--/930.58k params)
 
 # The Python API profiles tf.trainable_variables() instead of VariableV2.
 #
-# By default, it's printed to stdout. User can update tfprof_options['output']
+# By default, it's printed to stdout. User can update options['output']
 # to write to file. The result is always returned as a proto buffer.
-param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
+param_stats = tf.profiler.profile(
     tf.get_default_graph(),
-    tfprof_options=tf.contrib.tfprof.model_analyzer.
-        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
+    options=tf.profiler.ProfileOptionBuilder
+        .trainable_variables_parameter())
 sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
 ```
 
@@ -52,10 +52,15 @@ use the definition to calculate float operations. Contributes are welcome.
 to calculate the statistics. It is suggested to pass in `-run_meta_path` if
 shape is only known during runtime. tfprof can fill in the missing shape with
 the runtime shape information from RunMetadata.
-
-Hence, it is suggested to use `-account_displayed_name_only`
+Hence, it is suggested to use `-account_displayed_op_only`
 option so that you know the statistics are only for the operations printed out.
 
+* If no RunMetadata provided, tfprof count float_ops of each graph node once,
+even if it is defined in tf.while_loop. This is because tfprof doesn't know
+how many times are run statically. If RunMetadata provided, tfprof calculate
+float_ops as float_ops * run_count.
+
+
 
 ```python
 # To profile float opertions in commandline, you need to pass --graph_path
@@ -81,7 +86,7 @@ MatMul                   491.52k float_ops (0.00%, 0.00%)
 BiasAdd                  1.28k float_ops (0.00%, 0.00%)
 
 # You can also do that in Python API.
-tf.contrib.tfprof.model_analyzer.print_model_analysis(
+tf.profiler.profile(
     tf.get_default_graph(),
-    tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)
+    options=tf.profiler.ProfileOptionBuilder.float_operation())
 ```
diff --git a/tensorflow/tools/tfprof/g3doc/profile_time.md b/tensorflow/core/profiler/g3doc/profile_time.md
similarity index 83%
rename from tensorflow/tools/tfprof/g3doc/profile_time.md
rename to tensorflow/core/profiler/g3doc/profile_time.md
index c89d7b0b03f38e461cc2371d584316c7b2bc58ac..db555b3617410b2a3e1ea9992ea1b88effa54f5e 100644
--- a/tensorflow/tools/tfprof/g3doc/profile_time.md
+++ b/tensorflow/core/profiler/g3doc/profile_time.md
@@ -1,10 +1,42 @@
 ##Profile Time
 
+* [Times in TensorFlow and tfprof](#times-in-tensorflow-and-tfprof)
 * [Profile by Python Code](#profile-by-python-code)
 * [Profile by Operation Type](#profile-by-operation-type)
 * [Profile by Graph](#profile-by-graph)
 * [Profile by Name Scope](#profile-by-name-scope)
 
+
+###Times in TensorFlow and tfprof
+When we run a model, Tensorflow schedules and runs the nodes (operations)
+in the graph. An operation can be placed on an accelerator or on CPU.
+
+
+#### On Accelerator
+When an operation is placed on accelerator, it will first be scheduled
+by TensorFlow on CPU. Normally, it's the code in OpKernel::Compute.
+OpKernel::Compute can decide to dispatch some of the computations on the
+accelerator. While some computation (e.g. pre-processing) is still done
+in CPU. OpKernel::Compute can dispatch computation on accelerator
+and return, or it can also wait for the accelerator to finish.
+
+tfprof reports 3 execution times:
+
+  * <b>accelerator_micros</b>, which is the part of computation time spent on accelerator.
+  * <b>cpu_micros</b>, which is the part of computation time spent on cpu, including
+    any wait times that might happen if OpKernel::Compute decides to wait.
+  * <b>exec_micros</b>, which is the sum of accelerator_micros and cpu_micros.
+
+Since accelerator, such as GPU, usually runs operation asynchronously, you
+might notice an operation finishes on cpu before it starts running on
+accelerator.
+
+#### On CPU
+When an operation is placed on CPU, it will completely run on CPU. Hence,
+<b>exec_micros</b> is equal to <b>cpu_micros</b> and <b>accelerator_micros</b>
+should be 0.
+
+
 ###Profile by Python Code
 ```python
 # In code view, the time of each line of Python code is the aggregated
@@ -70,7 +102,7 @@ opts['account_displayed_op_only'] = False
 opts['select'] = ['micros']
 
 tfprof_node = model_analyzer.print_model_analysis(
-    sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
+    sess.graph, run_meta, cmd='code', options=opts)
 ```
 
 You can generate some visualization in code view:
diff --git a/tensorflow/tools/tfprof/g3doc/python_api.md b/tensorflow/core/profiler/g3doc/python_api.md
similarity index 66%
rename from tensorflow/tools/tfprof/g3doc/python_api.md
rename to tensorflow/core/profiler/g3doc/python_api.md
index 581e66baa29169b587c2dd14992154c221f981ec..0daac655c4eac5cb8beade335492e580ad856a10 100644
--- a/tensorflow/tools/tfprof/g3doc/python_api.md
+++ b/tensorflow/core/profiler/g3doc/python_api.md
@@ -11,21 +11,24 @@
 ### Parameters and Shapes.
 ```python
 # Print trainable variable parameter statistics to stdout.
-param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
+ProfileOptionBuilder = tf.profiler.ProfileOptionBuilder
+
+param_stats = tf.profiler.profile(
     tf.get_default_graph(),
-    tfprof_options=tf.contrib.tfprof.model_analyzer.
-        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
+    options=ProfileOptionBuilder.trainable_variables_parameter())
 
 # Use code view to associate statistics with Python codes.
-opts = tf.contrib.tfprof.model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
-opts['show_name_regexes'] = ['.*my_code1.py.*', '.*my_code2.py.*']
-param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
+opts = ProfileOptionBuilder(
+    ProfileOptionBuilder.trainable_variables_parameter()
+    ).with_node_names(show_name_regexes=['.*my_code1.py.*', '.*my_code2.py.*']
+    ).build()
+param_stats = tf.profiler.profile(
     tf.get_default_graph(),
-    tfprof_cmd='code'
-    tfprof_options=opts)
+    cmd='code',
+    options=opts)
 
-# param_stats can be tensorflow.tfprof.TFGraphNodeProto or
-# tensorflow.tfprof.TFMultiGraphNodeProto, depending on the view.
+# param_stats can be tensorflow.tfprof.GraphNodeProto or
+# tensorflow.tfprof.MultiGraphNodeProto, depending on the view.
 # Let's print the root below.
 sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
 ```
@@ -36,9 +39,9 @@ sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
 ``` python
 # Print to stdout an analysis of the number of floating point operations in the
 # model broken down by individual operations.
-tf.contrib.tfprof.model_analyzer.print_model_analysis(
+tf.profiler.profile(
     tf.get_default_graph(),
-    tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)
+    options=tf.profiler.ProfileOptionBuilder.float_operation())
 ```
 
 ### Time and Memory
@@ -48,8 +51,11 @@ compute the memory and timing statistics.
 ```python
 # Generate the RunMetadata that contains the memory and timing information.
 #
-# Note: When run on GPU, a kernel is first scheduled (enqueued) and then
-#       executed asynchronously. tfprof only tracks the execution time.
+# Note: When run on accelerator (e.g. GPU), an operation might perform some
+#       cpu computation, enqueue the accelerator computation. The accelerator
+#       computation is then run asynchronously. The profiler considers 3
+#       times: 1) accelerator computation. 2) cpu computation (might wait on
+#       accelerator). 3) the sum of 1 and 2.
 #
 run_metadata = tf.RunMetadata()
 with tf.Session() as sess:
@@ -58,39 +64,40 @@ with tf.Session() as sess:
                run_metadata=run_metadata)
 ```
 
-Finally, you may run `print_model_analysis` to explore the timing and memory
+Finally, you may run `tf.profiler.profile` to explore the timing and memory
 information of the model.
 
 ``` python
-# See model_analyzer_test.py for more examples.
-#
 # Print to stdout an analysis of the memory usage and the timing information
 # broken down by python codes.
-opts = tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY.copy()
-opts['show_name_regexes'] = ['.*my_code.py.*']
-tf.contrib.tfprof.model_analyzer.print_model_analysis(
+ProfileOptionBuilder = tf.profiler.ProfileOptionBuilder
+opts = ProfileOptionBuilder(ProfileOptionBuilder.time_and_memory()
+    ).with_node_names(show_name_regexes=['.*my_code.py.*']).build()
+
+tf.profiler.profile(
     tf.get_default_graph(),
     run_meta=run_metadata,
-    tfprof_cmd='code',
-    tfprof_options=opts)
+    cmd='code',
+    options=opts)
 
 # Print to stdout an analysis of the memory usage and the timing information
-# broken down by operations.
-tf.contrib.tfprof.model_analyzer.print_model_analysis(
+# broken down by operation types.
+tf.profiler.profile(
     tf.get_default_graph(),
     run_meta=run_metadata,
-    tfprof_options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY)
+    cmd='op',
+    options=tf.profiler.ProfileOptionBuilder.time_and_memory())
 ```
 
 ### Visualize
 
 ```
 To visualize the result of Python API results:
-Set opts['output'] = 'timeline:outfile=<filename>' to generate a timeline json file.
-Open a Chrome Browser, open URL chrome://tracing, and load the json file.
+Call `with_step(0).with_timeline_output(filename)` to generate a timeline json file.
+Open a Chrome Browser, type URL `chrome://tracing`, and load the json file.
 ```
 
-Below are 2 examples of graph view and scope view. See code view example in later examples.
+Below are 2 examples of graph view and scope view.
 
 <left>
 ![CodeTimeline](graph_timeline.png)
@@ -141,4 +148,4 @@ with session.Session() as sess:
   # Add run_meta of step 3.
   profiler.add_step(3, run_meta3)
   pb3 = profiler.profile_name_scope(opts)
-```
\ No newline at end of file
+```
diff --git a/tensorflow/tools/tfprof/g3doc/scope_timeline.png b/tensorflow/core/profiler/g3doc/scope_timeline.png
similarity index 100%
rename from tensorflow/tools/tfprof/g3doc/scope_timeline.png
rename to tensorflow/core/profiler/g3doc/scope_timeline.png
diff --git a/tensorflow/tools/tfprof/internal/BUILD b/tensorflow/core/profiler/internal/BUILD
similarity index 87%
rename from tensorflow/tools/tfprof/internal/BUILD
rename to tensorflow/core/profiler/internal/BUILD
index 9b77b0fb3f26a60f7c30c32e96136af6145a52bd..129a42deebb33b97075ccbf27a6b64061b371c98 100644
--- a/tensorflow/tools/tfprof/internal/BUILD
+++ b/tensorflow/core/profiler/internal/BUILD
@@ -9,6 +9,7 @@ package(
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "if_not_windows")
 
 cc_library(
     name = "tfprof_stats",
@@ -28,7 +29,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:regexp_internal",
-        "//tensorflow/tools/tfprof:protos_all_cc",
+        "//tensorflow/core/profiler:protos_all_cc",
     ],
 )
 
@@ -41,7 +42,6 @@ cc_library(
         ":tfprof_utils",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/tools/tfprof:protos_all_cc",
         "@jsoncpp_git//:jsoncpp",
     ],
 )
@@ -56,6 +56,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:regexp_internal",
+        "//tensorflow/core/profiler:protos_all_cc",
     ],
 )
 
@@ -76,7 +77,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:regexp_internal",
-        "//tensorflow/tools/tfprof:protos_all_cc",
+        "//tensorflow/core/profiler:protos_all_cc",
     ],
 )
 
@@ -92,11 +93,12 @@ cc_library(
         ":tfprof_tensor",
         ":tfprof_utils",
         "//tensorflow/c:c_api",
+        "//tensorflow/c:checkpoint_reader",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:regexp_internal",
-        "//tensorflow/tools/tfprof:protos_all_cc",
+        "//tensorflow/core/profiler:protos_all_cc",
     ],
 )
 
@@ -117,7 +119,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:regexp_internal",
-        "//tensorflow/tools/tfprof:protos_all_cc",
+        "//tensorflow/core/profiler:protos_all_cc",
     ],
 )
 
@@ -136,7 +138,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:regexp_internal",
-        "//tensorflow/tools/tfprof:protos_all_cc",
+        "//tensorflow/core/profiler:protos_all_cc",
     ],
 )
 
@@ -151,7 +153,7 @@ cc_library(
         ":tfprof_utils",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/tools/tfprof:protos_all_cc",
+        "//tensorflow/core/profiler:protos_all_cc",
     ],
 )
 
@@ -171,7 +173,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:regexp_internal",
-        "//tensorflow/tools/tfprof:protos_all_cc",
+        "//tensorflow/core/profiler:protos_all_cc",
     ],
 )
 
@@ -185,6 +187,7 @@ cc_library(
         ":tfprof_node_show",
         ":tfprof_options",
         ":tfprof_scope",
+        ":tfprof_show",
         ":tfprof_tensor",
         ":tfprof_timeline",
         ":tfprof_utils",
@@ -192,7 +195,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:regexp_internal",
-        "//tensorflow/tools/tfprof:protos_all_cc",
+        "//tensorflow/core/profiler:protos_all_cc",
     ],
 )
 
@@ -215,7 +218,7 @@ tf_cc_test(
         ":tfprof_tf_testlib",
         ":tfprof_utils",
         "//tensorflow/c:checkpoint_reader",
-        "//tensorflow/tools/tfprof:protos_all_cc",
+        "//tensorflow/core/profiler:protos_all_cc",
     ],
 )
 
@@ -233,7 +236,7 @@ tf_cc_test(
         ":tfprof_stats",
         ":tfprof_tf_testlib",
         ":tfprof_utils",
-        "//tensorflow/tools/tfprof:protos_all_cc",
+        "//tensorflow/core/profiler:protos_all_cc",
     ],
 )
 
@@ -241,7 +244,7 @@ cc_library(
     name = "tfprof_utils",
     srcs = ["tfprof_utils.cc"],
     hdrs = ["tfprof_utils.h"],
-    copts = ["-Wno-sign-compare"],
+    copts = if_not_windows(["-Wno-sign-compare"]),
     deps = [
         ":tfprof_options",
         "//tensorflow/core:lib",
@@ -257,15 +260,13 @@ cc_library(
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib",
-        "//tensorflow/tools/tfprof:protos_all_cc",
+        "//tensorflow/core/profiler:protos_all_cc",
     ],
 )
 
 cc_library(
     name = "print_model_analysis_hdr",
-    hdrs = [
-        "print_model_analysis.h",
-    ],
+    hdrs = ["print_model_analysis.h"],
     deps = [
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:protos_all_cc",
@@ -282,8 +283,8 @@ cc_library(
         "//tensorflow/c:checkpoint_reader",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/tools/tfprof:protos_all_cc",
-        "//tensorflow/tools/tfprof/internal/advisor:tfprof_advisor",
+        "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler/internal/advisor:tfprof_advisor",
     ],
     alwayslink = 1,
 )
@@ -307,7 +308,7 @@ tf_cc_test(
         ":tfprof_tf_testlib",
         ":tfprof_utils",
         "//tensorflow/c:checkpoint_reader",
-        "//tensorflow/tools/tfprof:protos_all_cc",
+        "//tensorflow/core/profiler:protos_all_cc",
     ],
 )
 
@@ -315,11 +316,11 @@ cc_library(
     name = "tfprof_tensor",
     srcs = ["tfprof_tensor.cc"],
     hdrs = ["tfprof_tensor.h"],
-    copts = ["-Wno-sign-compare"],
+    copts = if_not_windows(["-Wno-sign-compare"]),
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/tools/tfprof:protos_all_cc",
+        "//tensorflow/core/profiler:protos_all_cc",
     ],
 )
 
@@ -339,7 +340,7 @@ tf_cc_test(
         ":tfprof_tf_testlib",
         ":tfprof_utils",
         "//tensorflow/c:checkpoint_reader",
-        "//tensorflow/tools/tfprof:protos_all_cc",
+        "//tensorflow/core/profiler:protos_all_cc",
     ],
 )
 
diff --git a/tensorflow/tools/tfprof/internal/advisor/BUILD b/tensorflow/core/profiler/internal/advisor/BUILD
similarity index 79%
rename from tensorflow/tools/tfprof/internal/advisor/BUILD
rename to tensorflow/core/profiler/internal/advisor/BUILD
index 30012fa7b14e3a3e88ea1ac1b8a62b1232be9499..e50145bf46576d3bf68023e88aa3b7a42baf17e5 100644
--- a/tensorflow/tools/tfprof/internal/advisor/BUILD
+++ b/tensorflow/core/profiler/internal/advisor/BUILD
@@ -14,7 +14,8 @@ cc_library(
     name = "checker",
     hdrs = ["checker.h"],
     deps = [
-        "//tensorflow/tools/tfprof/internal:tfprof_stats",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/profiler/internal:tfprof_stats",
     ],
 )
 
@@ -23,7 +24,7 @@ cc_library(
     srcs = ["internal_checker_runner_dummy.cc"],
     hdrs = ["internal_checker_runner.h"],
     deps = [
-        ":checker",
+        "//tensorflow/core/profiler/internal:tfprof_utils",
     ],
 )
 
@@ -43,12 +44,21 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "expensive_operation_checker",
+    hdrs = ["expensive_operation_checker.h"],
+    deps = [
+        ":checker",
+    ],
+)
+
 cc_library(
     name = "tfprof_advisor",
     hdrs = ["tfprof_advisor.h"],
     deps = [
         ":accelerator_utilization_checker",
         ":checker",
+        ":expensive_operation_checker",
         ":internal_checker_runner_dummy",
         ":operation_checker",
     ],
@@ -59,7 +69,7 @@ tf_cc_test(
     srcs = ["tfprof_advisor_test.cc"],
     deps = [
         ":tfprof_advisor",
-        "//tensorflow/tools/tfprof/internal:tfprof_tf_testlib",
+        "//tensorflow/core/profiler/internal:tfprof_tf_testlib",
     ],
 )
 
diff --git a/tensorflow/tools/tfprof/internal/advisor/accelerator_utilization_checker.h b/tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h
similarity index 71%
rename from tensorflow/tools/tfprof/internal/advisor/accelerator_utilization_checker.h
rename to tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h
index 8f256584f7b18cc2558a155f2e0c2499613035cf..c6544fe0b02df1b317db2ce4ab73130f9f155e56 100644
--- a/tensorflow/tools/tfprof/internal/advisor/accelerator_utilization_checker.h
+++ b/tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 // This checker checks the accelerator's utilization.
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
 
-#include "tensorflow/tools/tfprof/internal/advisor/checker.h"
+#include "tensorflow/core/profiler/internal/advisor/checker.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -33,10 +33,11 @@ struct ExecStats {
 
 class AcceleratorUtilizationChecker : public Checker {
  public:
-  string name() override { return "AcceleratorUtilizationChecker"; }
+  string name() const override { return kCheckers[0]; }
 
  private:
-  std::vector<string> Check(const TFStats* stats) override {
+  AdviceProto::Checker Check(const AdvisorOptionsProto::CheckerOption& options,
+                             const TFStats* stats) override {
     if (!stats) {
       fprintf(stderr, "Missing profiles (e.g. graph, run_meta). Skip %s\n",
               name().c_str());
@@ -48,24 +49,21 @@ class AcceleratorUtilizationChecker : public Checker {
     return CheckInternal();
   }
 
-  std::vector<string> CheckInternal() {
+  AdviceProto::Checker CheckInternal() {
     for (const auto& s : accelerator_exec_stats_) {
       const ExecStats& stat = s.second;
       int64 total_micros = stat.end_micros - stat.start_micros;
       if (total_micros <= 0) continue;
       double utilization = 1.0 * stat.exec_micros / total_micros;
       if (utilization >= 0.5) {
-        reports_.push_back(strings::Printf("%s: device: %s utilization: %.2f",
-                                           kLevel[0], s.first.c_str(),
-                                           utilization));
+        reports_.add_reports(strings::Printf("device: %s utilization: %.2f",
+                                             s.first.c_str(), utilization));
       } else if (utilization < 0.5 && utilization > 0.2) {
-        reports_.push_back(
-            strings::Printf("%s: device: %s low utilization: %.2f", kLevel[1],
-                            s.first.c_str(), utilization));
+        reports_.add_reports(strings::Printf("device: %s low utilization: %.2f",
+                                             s.first.c_str(), utilization));
       } else if (utilization <= 0.2) {
-        reports_.push_back(
-            strings::Printf("%s: device: %s low utilization: %.2f", kLevel[2],
-                            s.first.c_str(), utilization));
+        reports_.add_reports(strings::Printf("device: %s low utilization: %.2f",
+                                             s.first.c_str(), utilization));
       }
     }
     return reports_;
@@ -76,7 +74,7 @@ class AcceleratorUtilizationChecker : public Checker {
     if (execs.empty()) {
       return;
     }
-    if (!IsAcceleratorDevice(node->canonical_device())) {
+    if (!IsPlacedOnAccelerator(node->canonical_device())) {
       return;
     }
 
@@ -102,10 +100,10 @@ class AcceleratorUtilizationChecker : public Checker {
 
   std::map<string, ExecStats> accelerator_exec_stats_;
   std::map<string, int64> ps_placement_;
-  std::vector<string> reports_;
+  AdviceProto::Checker reports_;
 };
 
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
diff --git a/tensorflow/core/profiler/internal/advisor/checker.h b/tensorflow/core/profiler/internal/advisor/checker.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b5ebcf9e83742c8aa3cff072f490c6ca0243061
--- /dev/null
+++ b/tensorflow/core/profiler/internal/advisor/checker.h
@@ -0,0 +1,52 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_
+
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/profiler/internal/tfprof_stats.h"
+#include "tensorflow/core/profiler/tfprof_options.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+// Append only.
+static const char* const kCheckers[] = {
+    "AcceleratorUtilizationChecker", "OperationChecker",
+    "ExpensiveOperationChecker",
+    "JobChecker",  // Internal checker.
+};
+
+class Checker {
+ public:
+  virtual ~Checker() {}
+
+  virtual string name() const = 0;
+
+  AdviceProto::Checker Run(const AdvisorOptionsProto::CheckerOption& options,
+                           const TFStats* stats) {
+    return Check(options, stats);
+  }
+
+ protected:
+  virtual AdviceProto::Checker Check(
+      const AdvisorOptionsProto::CheckerOption& options,
+      const TFStats* stats) = 0;
+};
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_
diff --git a/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h b/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h
new file mode 100644
index 0000000000000000000000000000000000000000..85b99dc951958bbab2201c38f0ae04823092c828
--- /dev/null
+++ b/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h
@@ -0,0 +1,141 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// This checker checks the most expensive operations.
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OPERATION_CHECKER_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OPERATION_CHECKER_H_
+
+#include "tensorflow/core/profiler/internal/advisor/checker.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+class ExpensiveOperationChecker : public Checker {
+ public:
+  string name() const override { return kCheckers[2]; }
+
+ private:
+  AdviceProto::Checker Check(const AdvisorOptionsProto::CheckerOption& options,
+                             const TFStats* stats) override {
+    if (!stats) {
+      fprintf(stderr, "Missing profiles (e.g. graph, run_meta). Skip %s\n",
+              name().c_str());
+      return reports_;
+    }
+    if (stats->steps().empty()) {
+      fprintf(stderr, "Missing RunMetadata info. Skip %s\n", name().c_str());
+    }
+    CheckOpView(stats);
+    CheckScopeView(stats);
+    CheckCodeView(stats);
+    return reports_;
+  }
+
+  void CheckOpView(const TFStats* stats) {
+    if (stats->steps().empty()) {
+      fprintf(stderr, "Missing run_meta for %s\n", name().c_str());
+      return;
+    }
+    Options opts(3, 0, 1, 0, 0, 0, -1, "micros", {".*"}, {".*"}, {}, {".*"}, {},
+                 false, {"micros", "occurrence"}, "none", {});
+    const MultiGraphNodeProto root = stats->ShowMultiGraphNode("op", opts);
+    if (root.children_size() == 0) {
+      return;
+    }
+    const MultiGraphNodeProto* node = &root;
+    std::vector<string> outputs;
+    for (int i = 0; i < 3 && node->children_size() > 0; ++i) {
+      node = &node->children(0);
+      outputs.push_back(strings::Printf(
+          "top %d operation type: %s, "
+          "cpu: %s, accelerator: %s, total: %s (%.2f%%)",
+          i + 1, node->name().c_str(),
+          FormatTime(node->cpu_exec_micros()).c_str(),
+          FormatTime(node->accelerator_exec_micros()).c_str(),
+          FormatTime(node->exec_micros()).c_str(),
+          100.0 * node->exec_micros() / (root.total_exec_micros() + 1e-10)));
+    }
+    reports_.add_reports(str_util::Join(outputs, "\n"));
+  }
+
+  void CheckCodeView(const TFStats* stats) {
+    if (!stats->has_code_traces()) {
+      fprintf(stderr, "Missing op_log (code traces) for %s\n", name().c_str());
+      return;
+    }
+    Options opts(100, 0, 1, 0, 0, 0, -1, "micros", {".*"}, {".*"}, {}, {".*"},
+                 {}, false, {"micros"}, "none", {});
+    const MultiGraphNodeProto root = stats->ShowMultiGraphNode("code", opts);
+    const MultiGraphNodeProto* node = &root;
+    // A trick here is: Usually, codes in library file are usually referenced
+    // only once, while user's own code are referenced multiple times.
+    while (node->children_size() == 1) {
+      node = &node->children(0);
+    }
+    if (node->children_size() == 0) {
+      return;
+    }
+
+    std::vector<string> outputs;
+    CodeViewHelper(node, 0, &outputs);
+    reports_.add_reports(str_util::Join(outputs, "\n"));
+  }
+
+  void CheckScopeView(const TFStats* stats) {
+    Options opts(100, 0, 100, 0, 0, 0, -1, "micros", {".*"}, {".*"}, {}, {".*"},
+                 {}, false, {"micros"}, "none", {});
+    const GraphNodeProto root = stats->ShowGraphNode("scope", opts);
+    if (root.children_size() == 0) {
+      return;
+    }
+    std::vector<string> outputs;
+    const GraphNodeProto* node = &root;
+    for (int i = 0; i < 3 && i < root.children_size(); ++i) {
+      const GraphNodeProto& node = root.children(i);
+      outputs.push_back(strings::Printf(
+          "top %d graph node: %s, cpu: %s, accelerator: %s, total: %s", i + 1,
+          node.name().c_str(), FormatTime(node.cpu_exec_micros()).c_str(),
+          FormatTime(node.accelerator_exec_micros()).c_str(),
+          FormatTime(node.exec_micros()).c_str()));
+    }
+    reports_.add_reports(str_util::Join(outputs, "\n"));
+  }
+
+  void CodeViewHelper(const MultiGraphNodeProto* node, int depth,
+                      std::vector<string>* outputs) {
+    if (node->children_size() <= 1 || depth > 3) {
+      return;
+    }
+    for (int j = 0; j < 3 && j < node->children_size(); ++j) {
+      const MultiGraphNodeProto* c = &node->children(j);
+      if (c->total_exec_micros() < 1000) {
+        continue;
+      }
+      outputs->push_back(strings::Printf(
+          "%s%s, cpu: %s, accelerator: %s, total: %s",
+          string(depth * 2, ' ').c_str(), c->name().c_str(),
+          FormatTime(c->total_cpu_exec_micros()).c_str(),
+          FormatTime(c->total_accelerator_exec_micros()).c_str(),
+          FormatTime(c->total_exec_micros()).c_str()));
+      CodeViewHelper(c, depth + 1, outputs);
+    }
+  }
+
+  AdviceProto::Checker reports_;
+};
+
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OP_CHECKER_H_
diff --git a/tensorflow/tools/tfprof/internal/advisor/internal_checker_runner.h b/tensorflow/core/profiler/internal/advisor/internal_checker_runner.h
similarity index 58%
rename from tensorflow/tools/tfprof/internal/advisor/internal_checker_runner.h
rename to tensorflow/core/profiler/internal/advisor/internal_checker_runner.h
index 1238b57f20bcda1230990b417bc4d93761a3098c..ec52741b19e6769ec9d571666c063524857dd199 100644
--- a/tensorflow/tools/tfprof/internal/advisor/internal_checker_runner.h
+++ b/tensorflow/core/profiler/internal/advisor/internal_checker_runner.h
@@ -13,19 +13,22 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
 
-#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_options.pb.h"
+#include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
 
 class TFStats;
 
-std::map<string, std::vector<string>> RunInternalCheckers(const TFStats* stats);
+AdviceProto RunInternalCheckers(const AdvisorOptionsProto& options,
+                                const TFStats* stats);
 
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
diff --git a/tensorflow/tools/tfprof/internal/advisor/internal_checker_runner_dummy.cc b/tensorflow/core/profiler/internal/advisor/internal_checker_runner_dummy.cc
similarity index 77%
rename from tensorflow/tools/tfprof/internal/advisor/internal_checker_runner_dummy.cc
rename to tensorflow/core/profiler/internal/advisor/internal_checker_runner_dummy.cc
index 8204d2b04e43b0f27bbe433813f6776a9ec4399c..436d943cdd2ae069178ed25ee034244e369501de 100644
--- a/tensorflow/tools/tfprof/internal/advisor/internal_checker_runner_dummy.cc
+++ b/tensorflow/core/profiler/internal/advisor/internal_checker_runner_dummy.cc
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/tools/tfprof/internal/advisor/internal_checker_runner.h"
+#include "tensorflow/core/profiler/internal/advisor/internal_checker_runner.h"
 
 namespace tensorflow {
 namespace tfprof {
 
-std::map<string, std::vector<string>> RunInternalCheckers(
-    const TFStats* stats) {
-  return std::map<string, std::vector<string>>();
+AdviceProto RunInternalCheckers(const AdvisorOptionsProto& options,
+                                const TFStats* stats) {
+  return AdviceProto();
 }
 
 }  // namespace tfprof
diff --git a/tensorflow/tools/tfprof/internal/advisor/operation_checker.h b/tensorflow/core/profiler/internal/advisor/operation_checker.h
similarity index 69%
rename from tensorflow/tools/tfprof/internal/advisor/operation_checker.h
rename to tensorflow/core/profiler/internal/advisor/operation_checker.h
index 78132e3a460aa25467568520fe59008e054a4ddf..08f6d3c38925e73434b1e4476eccd395b9449e0b 100644
--- a/tensorflow/tools/tfprof/internal/advisor/operation_checker.h
+++ b/tensorflow/core/profiler/internal/advisor/operation_checker.h
@@ -14,20 +14,21 @@ limitations under the License.
 ==============================================================================*/
 // This checker checks common wrong configurations of operations.
 //
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
 
-#include "tensorflow/tools/tfprof/internal/advisor/checker.h"
+#include "tensorflow/core/profiler/internal/advisor/checker.h"
 
 namespace tensorflow {
 namespace tfprof {
 
 class OperationChecker : public Checker {
  public:
-  string name() override { return "OperationChecker"; }
+  string name() const override { return kCheckers[1]; }
 
  private:
-  std::vector<string> Check(const TFStats* stats) override {
+  AdviceProto::Checker Check(const AdvisorOptionsProto::CheckerOption& options,
+                             const TFStats* stats) override {
     if (!stats) {
       fprintf(stderr, "Missing profiles (e.g. graph, run_meta). Skip %s\n",
               name().c_str());
@@ -47,31 +48,29 @@ class OperationChecker : public Checker {
       if (node->op_attrs().find("data_format") != node->op_attrs().end()) {
         const AttrValue* attr_val = node->op_attrs().at("data_format");
         if (attr_val->s() == "NHWC" &&
-            IsAcceleratorDevice(node->canonical_device())) {
+            IsPlacedOnAccelerator(node->canonical_device())) {
           recommend_nchw = true;
         }
       }
     }
     if (use_batch_norm && !use_fused_batch_norm) {
-      reports_.push_back(strings::Printf(
-          "%s: Maybe use faster FusedBatchNorm instead of BatchNorm",
-          kLevel[1]));
+      reports_.add_reports(
+          "Maybe use faster FusedBatchNorm instead of BatchNorm");
     }
     if (recommend_nchw) {
       // TODO(xpan): Maybe print which Op supports NCHW.
-      reports_.push_back(strings::Printf(
-          "%s: Found operation using NHWC data_format on GPU. Maybe "
-          "NCHW is faster.",
-          kLevel[1]));
+      reports_.add_reports(
+          "Found operation using NHWC data_format on GPU. Maybe "
+          "NCHW is faster.");
     }
     return reports_;
   }
 
  private:
-  std::vector<string> reports_;
+  AdviceProto::Checker reports_;
 };
 
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h b/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h
new file mode 100644
index 0000000000000000000000000000000000000000..42bd6d54381d50a0670ac23a6ae686bcf0b13c81
--- /dev/null
+++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h
@@ -0,0 +1,81 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
+
+#include "tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h"
+#include "tensorflow/core/profiler/internal/advisor/checker.h"
+#include "tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h"
+#include "tensorflow/core/profiler/internal/advisor/internal_checker_runner.h"
+#include "tensorflow/core/profiler/internal/advisor/operation_checker.h"
+#include "tensorflow/core/profiler/tfprof_options.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+// The Advisor runs a list of Checkers, each checks a specific area.
+class Advisor {
+ public:
+  Advisor(const TFStats* stats) : stats_(stats) {}
+
+  static AdvisorOptionsProto DefaultOptions() {
+    AdvisorOptionsProto options;
+    std::vector<string> checkers(
+        kCheckers, kCheckers + sizeof(kCheckers) / sizeof(*kCheckers));
+    for (const string& checker : checkers) {
+      (*options.mutable_checkers())[checker];
+    }
+    return options;
+  }
+
+  AdviceProto Advise(const AdvisorOptionsProto& options) {
+    // Note: Release a checker's memory ASAP.
+    AdviceProto ret = RunInternalCheckers(options, stats_);
+
+    if (options.checkers().find(kCheckers[0]) != options.checkers().end()) {
+      AcceleratorUtilizationChecker au_checker;
+      (*ret.mutable_checkers())[kCheckers[0]].MergeFrom(
+          au_checker.Run(options.checkers().at(kCheckers[0]), stats_));
+    }
+    if (options.checkers().find(kCheckers[1]) != options.checkers().end()) {
+      OperationChecker op_checker;
+      (*ret.mutable_checkers())[kCheckers[1]].MergeFrom(
+          op_checker.Run(options.checkers().at(kCheckers[1]), stats_));
+    }
+    if (options.checkers().find(kCheckers[2]) != options.checkers().end()) {
+      ExpensiveOperationChecker expensive_op_checker;
+      (*ret.mutable_checkers())[kCheckers[2]].MergeFrom(
+          expensive_op_checker.Run(options.checkers().at(kCheckers[2]),
+                                   stats_));
+    }
+    for (const auto& checker : ret.checkers()) {
+      fprintf(stdout, "\n%s:\n", checker.first.c_str());
+      for (const string& r : checker.second.reports()) {
+        fprintf(stdout, "%s\n", r.c_str());
+      }
+    }
+    fflush(stdout);
+    return ret;
+  }
+
+ private:
+  const TFStats* stats_;
+};
+
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
diff --git a/tensorflow/tools/tfprof/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
similarity index 56%
rename from tensorflow/tools/tfprof/internal/advisor/tfprof_advisor_test.cc
rename to tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
index b41d0770dc7045d84973b4f94c47bc51d3266847..096c1d915ca59481dfe2c2c6f7b36f7edf65320f 100644
--- a/tensorflow/tools/tfprof/internal/advisor/tfprof_advisor_test.cc
+++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/tools/tfprof/internal/advisor/tfprof_advisor.h"
+#include "tensorflow/core/profiler/internal/advisor/tfprof_advisor.h"
 
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/env.h"
@@ -29,15 +29,16 @@ class TFProfAdvisorTest : public ::testing::Test {
                              nullptr, nullptr));
 
     stats_->AddNodeForTest(
-        "n1", CreateNode("n1", "Conv2D", {{"data_format", "NHWC"}}, 10, 2));
-    stats_->AddNodeForTest("n2", CreateNode("n2", "Conv2D", {}, 20, 2));
+        0, CreateNode("n1", "Conv2D", {{"data_format", "NHWC"}}, 0, 10, 2));
+    stats_->AddNodeForTest(0, CreateNode("n2", "Conv2D", {}, 0, 20, 2));
+    stats_->BuildAllViews();
     advisor_.reset(new Advisor(stats_.get()));
   }
 
   std::unique_ptr<TFGraphNode> CreateNode(const string& name,
                                           const string& type,
                                           std::map<string, string> attrs,
-                                          int64 start_miros,
+                                          int64 step, int64 start_miros,
                                           int64 end_rel_micros) {
     node_defs_.push_back(std::unique_ptr<NodeDef>(new NodeDef()));
     NodeDef* def = node_defs_.back().get();
@@ -52,10 +53,10 @@ class TFProfAdvisorTest : public ::testing::Test {
     NodeExecStats node_stat;
     node_stat.set_all_start_micros(start_miros);
     node_stat.set_op_end_rel_micros(end_rel_micros);
-    node->AddStepStat(0, "/job:localhost/replica:0/task:0/gpu:0", node_stat);
-    node->AddStepStat(0, "/job:localhost/replica:0/task:0/gpu:0:stream:all",
+    node->AddStepStat(step, "/job:localhost/replica:0/task:0/gpu:0", node_stat);
+    node->AddStepStat(step, "/job:localhost/replica:0/task:0/gpu:0:stream:all",
                       node_stat);
-    node->AddStepStat(0, "/job:localhost/replica:0/task:0/gpu:0:stream:0",
+    node->AddStepStat(step, "/job:localhost/replica:0/task:0/gpu:0:stream:0",
                       node_stat);
     return node;
   }
@@ -66,23 +67,38 @@ class TFProfAdvisorTest : public ::testing::Test {
 };
 
 TEST_F(TFProfAdvisorTest, Basics) {
-  std::map<string, std::vector<string>> reports = advisor_->Advise();
-  EXPECT_TRUE(reports.find("AcceleratorUtilizationChecker") != reports.end());
-  EXPECT_TRUE(reports.find("OperationChecker") != reports.end());
+  AdvisorOptionsProto options = Advisor::DefaultOptions();
+  AdviceProto advice = advisor_->Advise(options);
+  EXPECT_TRUE(advice.checkers().find(kCheckers[0]) != advice.checkers().end());
+  EXPECT_TRUE(advice.checkers().find(kCheckers[1]) != advice.checkers().end());
+  EXPECT_TRUE(advice.checkers().find(kCheckers[2]) != advice.checkers().end());
 }
 
 TEST_F(TFProfAdvisorTest, OperationChecker) {
-  std::map<string, std::vector<string>> reports = advisor_->Advise();
-  EXPECT_EQ(reports["OperationChecker"].size(), 1);
-  EXPECT_TRUE(StringPiece(reports["OperationChecker"][0]).contains("NCHW"));
+  AdvisorOptionsProto options;
+  (*options.mutable_checkers())[kCheckers[1]];
+  AdviceProto advice = advisor_->Advise(options);
+  EXPECT_EQ(advice.checkers().at(kCheckers[1]).reports_size(), 1);
+  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[1]).reports(0))
+                  .contains("NCHW"));
 }
 
 TEST_F(TFProfAdvisorTest, UtilizationChecker) {
-  std::map<string, std::vector<string>> reports = advisor_->Advise();
-  EXPECT_EQ(reports["AcceleratorUtilizationChecker"].size(), 1);
-  EXPECT_TRUE(StringPiece(reports["AcceleratorUtilizationChecker"][0])
+  AdvisorOptionsProto options;
+  (*options.mutable_checkers())[kCheckers[0]];
+  AdviceProto advice = advisor_->Advise(options);
+  EXPECT_EQ(advice.checkers().at(kCheckers[0]).reports_size(), 1);
+  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[0]).reports(0))
                   .contains("low utilization"));
 }
 
+TEST_F(TFProfAdvisorTest, ExpensiveOperationChecker) {
+  AdvisorOptionsProto options;
+  (*options.mutable_checkers())[kCheckers[2]];
+  AdviceProto advice = advisor_->Advise(options);
+  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[2]).reports(0))
+                  .contains("top 1 operation type: Conv2D"));
+}
+
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/print_model_analysis.cc b/tensorflow/core/profiler/internal/print_model_analysis.cc
similarity index 80%
rename from tensorflow/tools/tfprof/internal/print_model_analysis.cc
rename to tensorflow/core/profiler/internal/print_model_analysis.cc
index 37d01db3a1cbb0482be90b53f6309697c12b4cf7..65b54f01aa11b1b4f7e61f60b74e83ffd43e6be5 100644
--- a/tensorflow/tools/tfprof/internal/print_model_analysis.cc
+++ b/tensorflow/core/profiler/internal/print_model_analysis.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/tools/tfprof/internal/print_model_analysis.h"
+#include "tensorflow/core/profiler/internal/print_model_analysis.h"
 
 #include <stdio.h>
 #include <memory>
@@ -23,11 +23,12 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/protobuf/config.pb.h"
-#include "tensorflow/tools/tfprof/internal/advisor/tfprof_advisor.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_stats.h"
-#include "tensorflow/tools/tfprof/tfprof_log.pb.h"
-#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/profiler/internal/advisor/tfprof_advisor.h"
+#include "tensorflow/core/profiler/internal/tfprof_options.h"
+#include "tensorflow/core/profiler/internal/tfprof_stats.h"
+#include "tensorflow/core/profiler/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/tfprof_options.pb.h"
+#include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -36,6 +37,18 @@ TFStats* tf_stat = nullptr;
 
 string RunProfile(const string& command, const string& options,
                   TFStats* tf_stats) {
+  if (command == kCmds[4]) {
+    AdvisorOptionsProto option_pb;
+    if (!option_pb.ParseFromString(options)) {
+      fprintf(stderr, "Cannot parse AdvisorOptionsProto\n");
+      return "";
+    }
+    tf_stats->BuildAllViews();
+    return Advisor(tf_stats).Advise(option_pb).SerializeAsString();
+  } else {
+    tf_stats->BuildView(command);
+  }
+
   Options opts;
   tensorflow::Status s = Options::FromProtoStr(options, &opts);
   if (!s.ok()) {
@@ -76,9 +89,9 @@ bool NewProfiler(const string* graph, const string* op_log) {
   std::unique_ptr<GraphDef> graph_ptr(new GraphDef());
   graph_ptr->ParseFromString(*graph);
 
-  std::unique_ptr<OpLog> op_log_ptr;
+  std::unique_ptr<OpLogProto> op_log_ptr;
   if (op_log && !op_log->empty()) {
-    op_log_ptr.reset(new OpLog());
+    op_log_ptr.reset(new OpLogProto());
     op_log_ptr->ParseFromString(*op_log);
   }
   tf_stat = new TFStats(std::move(graph_ptr), nullptr, std::move(op_log_ptr),
@@ -97,14 +110,14 @@ void AddStep(int64 step, const string* run_meta, const string* op_log) {
   // TODO(xpan): Better error handling.
   std::unique_ptr<RunMetadata> run_meta_ptr(new RunMetadata());
   run_meta_ptr->ParseFromString(*run_meta);
-  tf_stat->ParseRunMeta(step, std::move(run_meta_ptr));
+  tf_stat->AddRunMeta(step, std::move(run_meta_ptr));
 
-  std::unique_ptr<OpLog> op_log_ptr;
+  std::unique_ptr<OpLogProto> op_log_ptr;
   if (op_log && !op_log->empty()) {
-    op_log_ptr.reset(new OpLog());
+    op_log_ptr.reset(new OpLogProto());
     op_log_ptr->ParseFromString(*op_log);
   }
-  tf_stat->ParseOpLog(std::move(op_log_ptr));
+  tf_stat->AddOpLogProto(std::move(op_log_ptr));
 }
 
 string Profile(const string* command, const string* options) {
@@ -129,9 +142,9 @@ string PrintModelAnalysis(const string* graph, const string* run_meta,
     run_meta_ptr->ParseFromString(*run_meta);
   }
 
-  std::unique_ptr<OpLog> op_log_ptr;
+  std::unique_ptr<OpLogProto> op_log_ptr;
   if (op_log && !op_log->empty()) {
-    op_log_ptr.reset(new OpLog());
+    op_log_ptr.reset(new OpLogProto());
     op_log_ptr->ParseFromString(*op_log);
   }
 
@@ -144,7 +157,5 @@ string PrintModelAnalysis(const string* graph, const string* run_meta,
   return RunProfile(*command, *options, &tf_stats);
 }
 
-void Advise() { Advisor(tf_stat).Advise(); }
-
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/print_model_analysis.h b/tensorflow/core/profiler/internal/print_model_analysis.h
similarity index 85%
rename from tensorflow/tools/tfprof/internal/print_model_analysis.h
rename to tensorflow/core/profiler/internal/print_model_analysis.h
index 84165e542d76df65d4301c57f5ebac2b7d06619d..76a436f71c6f83c5dfb01a35b28e5224f7b8e635 100644
--- a/tensorflow/tools/tfprof/internal/print_model_analysis.h
+++ b/tensorflow/core/profiler/internal/print_model_analysis.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_PRINT_MODEL_ANALYSIS_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_PRINT_MODEL_ANALYSIS_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_
 
 #include <string>
 
@@ -39,13 +39,11 @@ void AddStep(int64 step, const string* run_meta, const string* op_log);
 
 string Profile(const string* command, const string* options);
 
-void Advise();
-
 // Single-step Profiler.
 //
 // Interface defined for Python API swig. Calls the tfprof core API.
 // 'graph', 'run_meta', 'op_log' are serialized GraphDef, RunMetadata,
-// OpLog strings, respectively.
+// OpLogProto strings, respectively.
 // 'graph', 'command' and 'options' are required. Others can be nullptr
 // if not available.
 string PrintModelAnalysis(const string* graph, const string* run_meta,
@@ -55,4 +53,4 @@ string PrintModelAnalysis(const string* graph, const string* run_meta,
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_PRINT_MODEL_ANALYSIS_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_
diff --git a/tensorflow/tools/tfprof/internal/testdata/ckpt.data-00000-of-00001 b/tensorflow/core/profiler/internal/testdata/ckpt.data-00000-of-00001
similarity index 100%
rename from tensorflow/tools/tfprof/internal/testdata/ckpt.data-00000-of-00001
rename to tensorflow/core/profiler/internal/testdata/ckpt.data-00000-of-00001
diff --git a/tensorflow/tools/tfprof/internal/testdata/ckpt.index b/tensorflow/core/profiler/internal/testdata/ckpt.index
similarity index 100%
rename from tensorflow/tools/tfprof/internal/testdata/ckpt.index
rename to tensorflow/core/profiler/internal/testdata/ckpt.index
diff --git a/tensorflow/tools/tfprof/internal/testdata/ckpt.meta b/tensorflow/core/profiler/internal/testdata/ckpt.meta
similarity index 100%
rename from tensorflow/tools/tfprof/internal/testdata/ckpt.meta
rename to tensorflow/core/profiler/internal/testdata/ckpt.meta
diff --git a/tensorflow/tools/tfprof/internal/testdata/graph.pbtxt b/tensorflow/core/profiler/internal/testdata/graph.pbtxt
similarity index 100%
rename from tensorflow/tools/tfprof/internal/testdata/graph.pbtxt
rename to tensorflow/core/profiler/internal/testdata/graph.pbtxt
diff --git a/tensorflow/tools/tfprof/internal/testdata/run_meta b/tensorflow/core/profiler/internal/testdata/run_meta
similarity index 100%
rename from tensorflow/tools/tfprof/internal/testdata/run_meta
rename to tensorflow/core/profiler/internal/testdata/run_meta
diff --git a/tensorflow/tools/tfprof/internal/testdata/tfprof_log b/tensorflow/core/profiler/internal/testdata/tfprof_log
similarity index 100%
rename from tensorflow/tools/tfprof/internal/testdata/tfprof_log
rename to tensorflow/core/profiler/internal/testdata/tfprof_log
diff --git a/tensorflow/tools/tfprof/internal/tfprof_code.cc b/tensorflow/core/profiler/internal/tfprof_code.cc
similarity index 92%
rename from tensorflow/tools/tfprof/internal/tfprof_code.cc
rename to tensorflow/core/profiler/internal/tfprof_code.cc
index f328e3b0cd3de19c436e61f9dcf732bb22b0d606..1746076f7937a661f0345fbec0b49eb90cc76528 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_code.cc
+++ b/tensorflow/core/profiler/internal/tfprof_code.cc
@@ -13,30 +13,26 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/tools/tfprof/internal/tfprof_code.h"
+#include "tensorflow/core/profiler/internal/tfprof_code.h"
 
 #include <stdio.h>
 #include <utility>
 
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/regexp.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
+#include "tensorflow/core/profiler/internal/tfprof_constants.h"
 
 namespace tensorflow {
 namespace tfprof {
 namespace {
 // Convert to Trace proto into a short readable string.
 string GetTraceString(const CodeDef::Trace& trace) {
-  string ntrace = "";
-  if (trace.file().find_last_of('/') != trace.file().npos) {
-    ntrace += trace.file().substr(trace.file().find_last_of('/') + 1);
-  } else {
-    ntrace += trace.file();
-  }
+  string ntrace = io::Basename(trace.file()).ToString();
   ntrace += strings::StrCat(":", trace.lineno());
   if (trace.function().length() < 20) {
     ntrace += ":" + trace.function();
@@ -134,10 +130,10 @@ const ShowMultiNode* TFCode::ShowInternal(const Options& opts,
 }
 
 void TFCode::Format(const std::vector<CodeNode*> roots, string* display_str,
-                    TFMultiGraphNodeProto* proto) {
+                    MultiGraphNodeProto* proto) {
   for (CodeNode* node : roots) {
     display_str->append(node->formatted_str);
-    TFMultiGraphNodeProto* child = proto->add_children();
+    MultiGraphNodeProto* child = proto->add_children();
     child->MergeFrom(node->proto());
     Format(node->show_children, display_str, child);
   }
@@ -235,6 +231,17 @@ std::vector<CodeNode*> TFCode::Account(const std::vector<CodeNode*>& roots,
 
 string TFCode::FormatNode(CodeNode* node, const Options& opts, int64 indent) {
   std::vector<string> attrs;
+  if (opts.select.find(kShown[0]) != opts.select.end()) {
+    string memory = FormatMemory(node->proto().total_requested_bytes());
+    if (node->account) {
+      memory = FormatMemory(node->proto().requested_bytes()) + "/" + memory;
+    } else {
+      memory = "--/" + memory;
+    }
+    attrs.push_back(memory);
+  }
+  std::vector<string> time_attrs = FormatTimes(node, opts);
+  attrs.insert(attrs.end(), time_attrs.begin(), time_attrs.end());
 
   if (opts.select.find(kShown[2]) != opts.select.end()) {
     string params = FormatNumber(node->proto().total_parameters()) + " params";
@@ -255,24 +262,7 @@ string TFCode::FormatNode(CodeNode* node, const Options& opts, int64 indent) {
     }
     attrs.push_back(fops);
   }
-  if (opts.select.find(kShown[0]) != opts.select.end()) {
-    string memory = FormatMemory(node->proto().total_requested_bytes());
-    if (node->account) {
-      memory = FormatMemory(node->proto().requested_bytes()) + "/" + memory;
-    } else {
-      memory = "--/" + memory;
-    }
-    attrs.push_back(memory);
-  }
-  if (opts.select.find(kShown[1]) != opts.select.end()) {
-    string time = FormatTime(node->proto().total_exec_micros());
-    if (node->account) {
-      time = FormatTime(node->proto().exec_micros()) + "/" + time;
-    } else {
-      time = "--/" + time;
-    }
-    attrs.push_back(time);
-  }
+
   if (opts.select.find(kShown[5]) != opts.select.end() &&
       !node->node->devices().empty()) {
     attrs.push_back(str_util::Join(node->node->devices(), "|"));
@@ -281,7 +271,10 @@ string TFCode::FormatNode(CodeNode* node, const Options& opts, int64 indent) {
     std::set<string> op_types = node->node->op_types();
     attrs.push_back(str_util::Join(op_types, "|"));
   }
-
+  if (opts.select.find(kShown[7]) != opts.select.end()) {
+    // TODO(xpan): Make op count available in code view?
+    attrs.push_back(strings::Printf("%s N/A in code view", kShown[7]));
+  }
   if (opts.select.find(kShown[8]) != opts.select.end()) {
     attrs.push_back(strings::Printf("%s N/A in code view", kShown[8]));
   }
diff --git a/tensorflow/tools/tfprof/internal/tfprof_code.h b/tensorflow/core/profiler/internal/tfprof_code.h
similarity index 77%
rename from tensorflow/tools/tfprof/internal/tfprof_code.h
rename to tensorflow/core/profiler/internal/tfprof_code.h
index 613f6267393aa360e01f63e093bf72a87c73b77f..733829722199735b6b94999fd16b0215e7a56f8c 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_code.h
+++ b/tensorflow/core/profiler/internal/tfprof_code.h
@@ -16,8 +16,8 @@ limitations under the License.
 // Build a tree structure based on the TensorFlow model's python code stacks.
 // Stats are aggregated from descendants from ancestors.
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_CODE_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_CODE_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_
 
 #include <map>
 #include <memory>
@@ -27,13 +27,13 @@ limitations under the License.
 #include "tensorflow/c/checkpoint_reader.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_node.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_show_multi.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
-#include "tensorflow/tools/tfprof/tfprof_log.pb.h"
-#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/profiler/internal/tfprof_node.h"
+#include "tensorflow/core/profiler/internal/tfprof_options.h"
+#include "tensorflow/core/profiler/internal/tfprof_show_multi.h"
+#include "tensorflow/core/profiler/internal/tfprof_timeline.h"
+#include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -64,7 +64,7 @@ class TFCode : public TFMultiShow {
                                  const Options& opts);
 
   void Format(const std::vector<CodeNode*> roots, string* display_str,
-              TFMultiGraphNodeProto* proto);
+              MultiGraphNodeProto* proto);
 
   string FormatNode(CodeNode* node, const Options& opts, int64 indent);
 
@@ -77,4 +77,4 @@ class TFCode : public TFMultiShow {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_CODE_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_constants.h b/tensorflow/core/profiler/internal/tfprof_constants.h
similarity index 86%
rename from tensorflow/tools/tfprof/internal/tfprof_constants.h
rename to tensorflow/core/profiler/internal/tfprof_constants.h
index e495128728ba95763397f5cbe1206df5e90c2912..6a4eaaa890c51a1c2a730cfbb96d6d45316789c6 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_constants.h
+++ b/tensorflow/core/profiler/internal/tfprof_constants.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_CONSTANTS_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_CONSTANTS_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_
 
 namespace tensorflow {
 namespace tfprof {
@@ -34,4 +34,4 @@ static const char* const kCkptVarType = "_checkpoint_variables";
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_CONSTANTS_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_graph.cc b/tensorflow/core/profiler/internal/tfprof_graph.cc
similarity index 95%
rename from tensorflow/tools/tfprof/internal/tfprof_graph.cc
rename to tensorflow/core/profiler/internal/tfprof_graph.cc
index 4c562ae840662cdf0c5759b45d2267f1a2079a26..8d7c44b219e5094693d3d180adf071a9ddb3b608 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_graph.cc
+++ b/tensorflow/core/profiler/internal/tfprof_graph.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/tools/tfprof/internal/tfprof_graph.h"
+#include "tensorflow/core/profiler/internal/tfprof_graph.h"
 
 #include <stdio.h>
 #include <utility>
@@ -21,8 +21,8 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/regexp.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_tensor.h"
+#include "tensorflow/core/profiler/internal/tfprof_constants.h"
+#include "tensorflow/core/profiler/internal/tfprof_tensor.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -94,7 +94,7 @@ const ShowNode* TFGraph::ShowInternal(const Options& opts, Timeline* timeline) {
   }
 
   // 3. Trim the nodes not matching show/hide/trim_name_regexes.
-  // If account_displayed_name_only=true, redo the accounting.
+  // If account_displayed_op_only=true, redo the accounting.
   visits.clear();
   root_->show_children.assign(roots.begin(), roots.end());
   GraphNode* root = PrintGraph({root_}, opts, 1, 0, &visits)[0];
@@ -141,10 +141,10 @@ std::vector<GraphNode*> TFGraph::SearchRoot(
 }
 
 void TFGraph::Format(const std::vector<GraphNode*> roots, string* display_str,
-                     TFGraphNodeProto* proto) {
+                     GraphNodeProto* proto) {
   for (GraphNode* node : roots) {
     display_str->append(node->formatted_str);
-    TFGraphNodeProto* child = proto->add_children();
+    GraphNodeProto* child = proto->add_children();
     child->MergeFrom(node->proto());
     Format(node->show_children, display_str, child);
   }
diff --git a/tensorflow/tools/tfprof/internal/tfprof_graph.h b/tensorflow/core/profiler/internal/tfprof_graph.h
similarity index 79%
rename from tensorflow/tools/tfprof/internal/tfprof_graph.h
rename to tensorflow/core/profiler/internal/tfprof_graph.h
index fbeae8673dda89936f580de5f042429cb69bd1d5..8dac4aee77a456f9bb43d1fea255d8d4655c255b 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_graph.h
+++ b/tensorflow/core/profiler/internal/tfprof_graph.h
@@ -16,8 +16,8 @@ limitations under the License.
 // Build a graph structure based on op inputs/outputs. The graph is a directed
 // acyclic graph pointing *from outputs to inputs*.
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_GRAPH_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_GRAPH_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_
 
 #include <deque>
 #include <map>
@@ -29,11 +29,11 @@ limitations under the License.
 #include "tensorflow/c/checkpoint_reader.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_node.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_show.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
-#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/profiler/internal/tfprof_node.h"
+#include "tensorflow/core/profiler/internal/tfprof_options.h"
+#include "tensorflow/core/profiler/internal/tfprof_show.h"
+#include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -54,8 +54,8 @@ class TFGraph : public TFShow {
   const ShowNode* ShowInternal(const Options& opts,
                                Timeline* timeline) override;
 
-  bool ShouldShowIfExtra(ShowNode* node, const Options& opts,
-                         int depth) override {
+  bool ShouldShowIfExtra(const ShowNode* node, const Options& opts,
+                         int depth) const override {
     return true;
   }
 
@@ -74,7 +74,7 @@ class TFGraph : public TFShow {
                                   std::set<string>* visits);
 
   void Format(const std::vector<GraphNode*> roots, string* display_str,
-              TFGraphNodeProto* proto);
+              GraphNodeProto* proto);
 
   MemoryTracker memory_tracker_;
   GraphNode* root_;
@@ -86,4 +86,4 @@ class TFGraph : public TFShow {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_GRAPH_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc
similarity index 78%
rename from tensorflow/tools/tfprof/internal/tfprof_node.cc
rename to tensorflow/core/profiler/internal/tfprof_node.cc
index 6353813a26c6a3775f8c1f29c0cc544cbde79edb..732576d29cbf826ec6a4f3455ce8b5ffe686557c 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_node.cc
+++ b/tensorflow/core/profiler/internal/tfprof_node.cc
@@ -13,12 +13,24 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/tools/tfprof/internal/tfprof_node.h"
+#include "tensorflow/core/profiler/internal/tfprof_node.h"
 
-#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/internal/tfprof_utils.h"
 
 namespace tensorflow {
 namespace tfprof {
+namespace {
+bool CountAsAcceleratorTime(const string& device) {
+  return device.find("stream:all") != device.npos;
+}
+
+bool CountAsCPUTime(const string& device) {
+  return RE2::FullMatch(device, ".*/(gpu|cpu):\\d+");
+}
+
+bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); }
+
+}  // namespace
 // Notes about start and end time from the NodeExecStats proto:
 // For GPU, there is no difference between op_end_rel_micros and
 // all_end_rel_micros. All are kernel times.
@@ -52,13 +64,17 @@ void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
     latest_end_micros_ = std::max(
         latest_end_micros_, step_stat.all_start_micros() + op_end_rel_micros);
 
-    op_execs_[dev].push_back(
-        std::make_pair(step_stat.all_start_micros(), op_end_rel_micros));
-
-    // TODO(xpan): Can a stream only in stream:all or doesn't in stream at all?
-    if (dev.find("stream") != dev.npos && dev.find("stream:all") == dev.npos) {
-      gpu_kernel_execs_[dev].push_back(
-          std::make_pair(step_stat.all_start_micros(), op_end_rel_micros));
+    const std::pair<int64, int64> pair =
+        std::make_pair(step_stat.all_start_micros(), op_end_rel_micros);
+    if (CountAsAcceleratorTime(dev)) {
+      accelerator_execs_[dev].push_back(pair);
+      op_execs_[dev].push_back(pair);
+    } else if (CountAsCPUTime(dev)) {
+      cpu_execs_[dev].push_back(pair);
+      op_execs_[dev].push_back(pair);
+      // In while-loop, a graph node is executed multiple times under
+      // the same name.
+      run_count_ += 1;
     }
   }
 }
@@ -113,8 +129,11 @@ void TFGraphNode::AddStepStat(int64 step, const string& device,
                               const NodeExecStats& step_stat) {
   string dev = str_util::Lowercase(device);
 
-  // TODO(xpan): Test it.
-  if (RE2::FullMatch(dev, "/job:.*/replica:\\d+/task:\\d+/[a-z]+:\\d+")) {
+  // TODO(xpan): Make this more robust?
+  // See run_metadata_test.py
+  // It can be /job:0/replica:0/xxxx/gpu:0, or simply /gpu:0.
+  // It can has some ad-hoc suffix, such as /stream:xx or /memcpy:xx.
+  if (IsCanonicalDevice(device)) {
     if (!canonical_device_.empty()) {
       if (canonical_device_ != dev) {
         fprintf(stderr, "Unexpected: graph node changed device: %s->%s.\n",
@@ -143,16 +162,16 @@ void TFGraphNode::AddStepStat(int64 step, const string& device,
 }
 
 int64 ExecStep::exec_micros() const {
-  int64 total = accelerator_exec_micros();
-  if (total > 0) return total;
-
-  // If there is no gpu kernel time, fall back to assume it runs on cpu.
-  // TODO(xpan): No way to track CPU async op timing accurately?
-  if (op_execs_.size() > 1) {
-    fprintf(stderr, "Op: %s has over 1 no-gpu assignment\n",
-            node->name().c_str());
-  }
-  for (const auto& execs : op_execs_) {
+  return accelerator_exec_micros() + cpu_exec_micros();
+}
+
+int64 ExecStep::accelerator_exec_micros() const {
+  int64 total = 0;
+  // Normally, an op should only be scheduled on 1 accelerator device.
+  // Hence there should generally be 1 element in accelerator_execs_.
+  for (const auto& execs : accelerator_execs_) {
+    // An op can fire multiple kernels or
+    // being scheduled multiple times in while-loop.
     for (const auto& exec : execs.second) {
       total += exec.second;
     }
@@ -160,9 +179,11 @@ int64 ExecStep::exec_micros() const {
   return total;
 }
 
-int64 ExecStep::accelerator_exec_micros() const {
+int64 ExecStep::cpu_exec_micros() const {
   int64 total = 0;
-  for (const auto& execs : gpu_kernel_execs_) {
+  // Normally, an op can only be scheduled on 1 device.
+  for (const auto& execs : cpu_execs_) {
+    // An op can be scheduled multiple times in while-loop.
     for (const auto& exec : execs.second) {
       total += exec.second;
     }
@@ -170,14 +191,6 @@ int64 ExecStep::accelerator_exec_micros() const {
   return total;
 }
 
-bool IsCombinedGPUStream(const string& device) {
-  return device.find("stream:all") != device.npos;
-}
-
-bool IsCPUDevice(const string& device) {
-  return device.find("cpu:0") != device.npos;
-}
-
 std::vector<int64> ShapeProtoToVec(const TensorShapeProto& shape_pb) {
   std::vector<int64> shape_vec;
   if (shape_pb.dim_size() == 0 && !shape_pb.unknown_rank()) {
@@ -203,7 +216,7 @@ TensorShapeProto VecToShapeProto(const std::vector<int64> shape_vec) {
   return shape_pb;
 }
 
-bool IsAcceleratorDevice(const string& device) {
+bool IsPlacedOnAccelerator(const string& device) {
   return device.find("gpu") != device.npos;
 }
 }  // namespace tfprof
diff --git a/tensorflow/tools/tfprof/internal/tfprof_node.h b/tensorflow/core/profiler/internal/tfprof_node.h
similarity index 76%
rename from tensorflow/tools/tfprof/internal/tfprof_node.h
rename to tensorflow/core/profiler/internal/tfprof_node.h
index d788f2acf4d1091e3bd8a269b0cd52ff50c8f110..9142ad51c553a07ea1982ed91128ca757d76de62 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_node.h
+++ b/tensorflow/core/profiler/internal/tfprof_node.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
 
 #include <map>
 #include <set>
@@ -31,8 +31,8 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/regexp.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
-#include "tensorflow/tools/tfprof/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/internal/tfprof_options.h"
+#include "tensorflow/core/profiler/tfprof_log.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -46,6 +46,7 @@ class ExecStep {
  public:
   ExecStep(TFGraphNode* node)
       : node(node),
+        run_count_(0),
         all_start_micros_(0),
         latest_end_micros_(0),
         mem_initiated_(false),
@@ -60,12 +61,14 @@ class ExecStep {
 
   void AddMemoryStats(const string& dev, const NodeExecStats& step_stat);
 
+  int64 run_count() const { return run_count_; }
   // The execution time of an op. If it runs on accelerator, then it's
   // accelerator_exec_micros(). Otherwise, it's CPU time.
   int64 exec_micros() const;
-
-  // The execution time of an op. 0 if it runs on cpu.
+  // The accelerator execution time of an op. 0 if not run on accelerator.
   int64 accelerator_exec_micros() const;
+  // The cpu execution time of an op.
+  int64 cpu_exec_micros() const;
 
   const std::map<string, std::vector<std::pair<int64, int64>>>& op_execs()
       const {
@@ -88,14 +91,20 @@ class ExecStep {
 
  private:
   TFGraphNode* node;
+  // Can be larger than 1 if run multiple times in loop.
+  int64 run_count_;
   // The earliest/latest time including scheduling and execution.
   int64 all_start_micros_;
   int64 latest_end_micros_;
   // device -> vector of {op_start_micros, op_exec_micros} pairs.
-  // For accelerator op, op_start_micros and op_exec_micros are kernel time.
-  // For cpu op, op_start_micros and op_exec_micros are scheduling time. (
-  // might include compute time if it's sync op).
-  std::map<string, std::vector<std::pair<int64, int64>>> gpu_kernel_execs_;
+  // accelerator_execs: gpu:id/stream:all -> {op_start_micros, op_exec_micros}
+  // For accelerator, vector size can be larger than 1, multiple kernel fires
+  // or in tf.while_loop.
+  std::map<string, std::vector<std::pair<int64, int64>>> accelerator_execs_;
+  // cpu_execs: cpu/gpu:id -> {op_start_micros, op_exec_micros}
+  // For cpu, vector size can be larger than 1 if in tf.while_loop.
+  std::map<string, std::vector<std::pair<int64, int64>>> cpu_execs_;
+  // combines accelerator_execs_ and cpu_execs_.
   std::map<string, std::vector<std::pair<int64, int64>>> op_execs_;
   // All devices the op is associated with (e.g. gpu:0 (scheduling),
   // gpu:0:stream:xx (kernel exec), cpu:0 host)
@@ -184,8 +193,27 @@ class TFGraphNode {
     return src_output_idx_;
   }
 
-  // This is time spent in kernel execution.
-  int64 kernel_exec_micros(int64 step) const {
+  // Number of times the graph node is executed. When step < 0, the
+  // average number of times executed across all steps.
+  int64 run_count(int64 step) const {
+    if (execs_.empty()) {
+      return 0;
+    }
+    if (step >= 0) {
+      auto exec = execs_.find(step);
+      CHECK(exec != execs_.end());
+      return exec->second.run_count();
+    }
+    int64 total_run_count = 0;
+    for (const auto& exec : execs_) {
+      total_run_count += exec.second.run_count();
+    }
+    return total_run_count / execs_.size();
+  }
+  // This is overall computation time, including both cpu and accelerator.
+  // Note, cpu and accelerator might or might not run in parallel.
+  int64 exec_micros(int64 step) const {
+    // Empty when no RunMetadata is provided.
     if (execs_.empty()) {
       return 0;
     }
@@ -202,6 +230,46 @@ class TFGraphNode {
     return total_micros / execs_.size();
   }
 
+  // This is accelerator computation time of a step, or average of
+  // multiple step, when step < 0.
+  int64 accelerator_exec_micros(int64 step) const {
+    // Empty when no RunMetadata is provided.
+    if (execs_.empty()) {
+      return 0;
+    }
+    if (step >= 0) {
+      auto exec = execs_.find(step);
+      CHECK(exec != execs_.end());
+      return exec->second.accelerator_exec_micros();
+    }
+
+    int64 total_micros = 0;
+    for (const auto& exec : execs_) {
+      total_micros += exec.second.accelerator_exec_micros();
+    }
+    return total_micros / execs_.size();
+  }
+
+  // This is cpu computation time of a step, or average of
+  // multiple step, when step < 0.
+  int64 cpu_exec_micros(int64 step) const {
+    // Empty when no RunMetadata is provided.
+    if (execs_.empty()) {
+      return 0;
+    }
+    if (step >= 0) {
+      auto exec = execs_.find(step);
+      CHECK(exec != execs_.end());
+      return exec->second.cpu_exec_micros();
+    }
+
+    int64 total_micros = 0;
+    for (const auto& exec : execs_) {
+      total_micros += exec.second.cpu_exec_micros();
+    }
+    return total_micros / execs_.size();
+  }
+
   int64 requested_bytes(int64 step) const {
     if (execs_.empty()) {
       return 0;
@@ -272,7 +340,14 @@ class TFGraphNode {
     return exec->second.allocator_bytes_in_use();
   }
 
-  int64 float_ops() const { return float_ops_; }
+  int64 float_ops(int64 step) const {
+    // If not run, return static analysis.
+    if (execs_.empty()) {
+      return float_ops_;
+    }
+    // Otherwise, return dynamic float_ops.
+    return float_ops_ * run_count(step);
+  }
   const CodeDef& code() { return code_; }
   string canonical_device() const { return canonical_device_; }
   string host_device() const { return host_device_; }
@@ -325,12 +400,19 @@ class TFMultiGraphNode {
  public:
   TFMultiGraphNode(const string& name)
       : name_(name),
-        kernel_exec_micros_(0),
+        run_count_(0),
+        exec_micros_(0),
+        accelerator_exec_micros_(0),
+        cpu_exec_micros_(0),
         requested_bytes_(0),
         float_ops_(0) {}
 
   bool SnapshotNodes(int64 step, const std::vector<string>& type_regexes) {
-    kernel_exec_micros_ = 0;
+    run_count_ = 0;
+    exec_micros_ = 0;
+    accelerator_exec_micros_ = 0;
+    cpu_exec_micros_ = 0;
+
     requested_bytes_ = 0;
     float_ops_ = 0;
     op_types_.clear();
@@ -347,9 +429,13 @@ class TFMultiGraphNode {
     for (const TFGraphNode* node : nodes) {
       op_types_.insert(node->op_types().begin(), node->op_types().end());
 
-      kernel_exec_micros_ += node->kernel_exec_micros(step);
+      run_count_ += node->run_count(step);
+      exec_micros_ += node->exec_micros(step);
+      accelerator_exec_micros_ += node->accelerator_exec_micros(step);
+      cpu_exec_micros_ += node->cpu_exec_micros(step);
+
       requested_bytes_ += node->requested_bytes(step);
-      float_ops_ += node->float_ops();
+      float_ops_ += node->float_ops(step);
       if (node->shape().size() > 0) {
         shapes_.push_back(node->shape());
       }
@@ -382,7 +468,10 @@ class TFMultiGraphNode {
 
   const string& name() const { return name_; }
 
-  int64 kernel_exec_micros() const { return kernel_exec_micros_; }
+  int64 run_count() const { return run_count_; }
+  int64 exec_micros() const { return exec_micros_; }
+  int64 accelerator_exec_micros() const { return accelerator_exec_micros_; }
+  int64 cpu_exec_micros() const { return cpu_exec_micros_; }
 
   int64 requested_bytes() const { return requested_bytes_; }
 
@@ -424,7 +513,11 @@ class TFMultiGraphNode {
   const string name_;
   // Snapshot based on type_regexes
   std::set<string> op_types_;
-  int64 kernel_exec_micros_;
+  int64 run_count_;
+  int64 exec_micros_;
+  int64 accelerator_exec_micros_;
+  int64 cpu_exec_micros_;
+
   int64 requested_bytes_;
   int64 float_ops_;
   std::set<string> devices_;
@@ -436,10 +529,8 @@ class TFMultiGraphNode {
   std::map<string, std::unique_ptr<TFMultiGraphNode>> children_;
 };
 
-bool IsCombinedGPUStream(const string& device);
-bool IsCPUDevice(const string& device);
-bool IsAcceleratorDevice(const string& device);
+bool IsPlacedOnAccelerator(const string& device);
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_node_show.cc b/tensorflow/core/profiler/internal/tfprof_node_show.cc
similarity index 67%
rename from tensorflow/tools/tfprof/internal/tfprof_node_show.cc
rename to tensorflow/core/profiler/internal/tfprof_node_show.cc
index 7b604e091a7c120c9d739f8b0453147c8abee0d9..5ca57412e5245065aa98bbb6cce42bad77e025a2 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_node_show.cc
+++ b/tensorflow/core/profiler/internal/tfprof_node_show.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/tools/tfprof/internal/tfprof_node_show.h"
+#include "tensorflow/core/profiler/internal/tfprof_node_show.h"
 
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -31,9 +31,14 @@ void ShowNode::ReInit(int64 step) {
   if (!node->canonical_device().empty()) {
     mutable_proto()->add_devices(node->canonical_device());
   }
-  mutable_proto()->set_exec_micros(node->kernel_exec_micros(step));
+  mutable_proto()->set_run_count(node->run_count(step));
+  mutable_proto()->set_exec_micros(node->exec_micros(step));
+  mutable_proto()->set_accelerator_exec_micros(
+      node->accelerator_exec_micros(step));
+  mutable_proto()->set_cpu_exec_micros(node->cpu_exec_micros(step));
+
   mutable_proto()->set_requested_bytes(node->requested_bytes(step));
-  mutable_proto()->set_float_ops(node->float_ops());
+  mutable_proto()->set_float_ops(node->float_ops(step));
 
   mutable_proto()->clear_input_shapes();
   for (const auto& inp : node->input_shapes()) {
@@ -61,14 +66,24 @@ void ShowNode::ReInit(int64 step) {
   }
 }
 
-TFGraphNodeProto* ShowNode::mutable_proto() { return &proto_; }
+GraphNodeProto* ShowNode::mutable_proto() { return &proto_; }
 
-const TFGraphNodeProto& ShowNode::proto() const { return proto_; }
+const GraphNodeProto& ShowNode::proto() const { return proto_; }
 
 void ShowNode::AggregateTotalStats(ShowNode* node) {
-  TFGraphNodeProto* node_pb = node->mutable_proto();
+  GraphNodeProto* node_pb = node->mutable_proto();
+  mutable_proto()->set_total_run_count(proto().total_run_count() +
+                                       node_pb->total_run_count());
+  mutable_proto()->set_total_definition_count(
+      proto().total_definition_count() + node_pb->total_definition_count());
   mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
                                          node_pb->total_exec_micros());
+  mutable_proto()->set_total_accelerator_exec_micros(
+      proto().total_accelerator_exec_micros() +
+      node_pb->total_accelerator_exec_micros());
+  mutable_proto()->set_total_cpu_exec_micros(proto().total_cpu_exec_micros() +
+                                             node_pb->total_cpu_exec_micros());
+
   mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
                                              node_pb->total_requested_bytes());
   mutable_proto()->set_total_parameters(proto().total_parameters() +
@@ -78,8 +93,18 @@ void ShowNode::AggregateTotalStats(ShowNode* node) {
 }
 
 void ShowNode::AddSelfToTotalStats() {
+  mutable_proto()->set_total_definition_count(proto().total_definition_count() +
+                                              1);
+  mutable_proto()->set_total_run_count(proto().total_run_count() +
+                                       proto().run_count());
   mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
                                          proto().exec_micros());
+  mutable_proto()->set_total_accelerator_exec_micros(
+      proto().total_accelerator_exec_micros() +
+      proto().accelerator_exec_micros());
+  mutable_proto()->set_total_cpu_exec_micros(proto().total_cpu_exec_micros() +
+                                             proto().cpu_exec_micros());
+
   mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
                                              proto().requested_bytes());
   mutable_proto()->set_total_parameters(proto().total_parameters() +
@@ -89,7 +114,12 @@ void ShowNode::AddSelfToTotalStats() {
 }
 
 void ShowNode::ResetTotalStats() {
+  mutable_proto()->set_total_definition_count(0);
+  mutable_proto()->set_total_run_count(0);
   mutable_proto()->set_total_exec_micros(0);
+  mutable_proto()->set_total_accelerator_exec_micros(0);
+  mutable_proto()->set_total_cpu_exec_micros(0);
+
   mutable_proto()->set_total_requested_bytes(0);
   mutable_proto()->set_total_parameters(0);
   mutable_proto()->set_total_float_ops(0);
@@ -116,7 +146,10 @@ bool ShowMultiNode::ReInit(int64 step,
   }
 
   mutable_proto()->set_name(name());
-  mutable_proto()->set_exec_micros(node->kernel_exec_micros());
+  mutable_proto()->set_exec_micros(node->exec_micros());
+  mutable_proto()->set_accelerator_exec_micros(node->accelerator_exec_micros());
+  mutable_proto()->set_cpu_exec_micros(node->cpu_exec_micros());
+
   mutable_proto()->set_requested_bytes(node->requested_bytes());
   mutable_proto()->set_float_ops(node->float_ops());
 
@@ -143,14 +176,20 @@ bool ShowMultiNode::ReInit(int64 step,
   return has_matched_type;
 }
 
-TFMultiGraphNodeProto* ShowMultiNode::mutable_proto() { return &proto_; }
+MultiGraphNodeProto* ShowMultiNode::mutable_proto() { return &proto_; }
 
-const TFMultiGraphNodeProto& ShowMultiNode::proto() const { return proto_; }
+const MultiGraphNodeProto& ShowMultiNode::proto() const { return proto_; }
 
 void ShowMultiNode::AggregateTotalStats(ShowMultiNode* node) {
-  TFMultiGraphNodeProto* node_pb = node->mutable_proto();
+  MultiGraphNodeProto* node_pb = node->mutable_proto();
   mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
                                          node_pb->total_exec_micros());
+  mutable_proto()->set_total_accelerator_exec_micros(
+      proto().total_accelerator_exec_micros() +
+      node_pb->total_accelerator_exec_micros());
+  mutable_proto()->set_total_cpu_exec_micros(proto().total_cpu_exec_micros() +
+                                             node_pb->total_cpu_exec_micros());
+
   mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
                                              node_pb->total_requested_bytes());
   mutable_proto()->set_total_parameters(proto().total_parameters() +
@@ -162,6 +201,12 @@ void ShowMultiNode::AggregateTotalStats(ShowMultiNode* node) {
 void ShowMultiNode::AddSelfToTotalStats() {
   mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
                                          proto().exec_micros());
+  mutable_proto()->set_total_accelerator_exec_micros(
+      proto().total_accelerator_exec_micros() +
+      proto().accelerator_exec_micros());
+  mutable_proto()->set_total_cpu_exec_micros(proto().total_cpu_exec_micros() +
+                                             proto().cpu_exec_micros());
+
   mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
                                              proto().requested_bytes());
   mutable_proto()->set_total_parameters(proto().total_parameters() +
@@ -172,6 +217,9 @@ void ShowMultiNode::AddSelfToTotalStats() {
 
 void ShowMultiNode::ResetTotalStats() {
   mutable_proto()->set_total_exec_micros(0);
+  mutable_proto()->set_total_accelerator_exec_micros(0);
+  mutable_proto()->set_total_cpu_exec_micros(0);
+
   mutable_proto()->set_total_requested_bytes(0);
   mutable_proto()->set_total_parameters(0);
   mutable_proto()->set_total_float_ops(0);
diff --git a/tensorflow/tools/tfprof/internal/tfprof_node_show.h b/tensorflow/core/profiler/internal/tfprof_node_show.h
similarity index 79%
rename from tensorflow/tools/tfprof/internal/tfprof_node_show.h
rename to tensorflow/core/profiler/internal/tfprof_node_show.h
index a1091f4d6616ce69e00b2849d67e91a5314f580e..fea19bdca3a39c71a0dfe3a2434d7109aed97a28 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_node_show.h
+++ b/tensorflow/core/profiler/internal/tfprof_node_show.h
@@ -21,8 +21,8 @@ limitations under the License.
 // ScopeNode and GraphNode each maps to one TFGraphNode.
 // CodeNode and OpNode each maps to one TFMultiGraphNode.
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_SHOW_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_SHOW_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_
 
 #include <algorithm>
 #include <string>
@@ -30,11 +30,11 @@ limitations under the License.
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_node.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
-#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/profiler/internal/tfprof_constants.h"
+#include "tensorflow/core/profiler/internal/tfprof_node.h"
+#include "tensorflow/core/profiler/internal/tfprof_options.h"
+#include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -45,8 +45,8 @@ class ShowNode {
   virtual ~ShowNode() {}
 
   const string& name() const { return node->name(); }
-  TFGraphNodeProto* mutable_proto();
-  const TFGraphNodeProto& proto() const;
+  GraphNodeProto* mutable_proto();
+  const GraphNodeProto& proto() const;
 
   void ReInit(int64 step);
 
@@ -61,7 +61,7 @@ class ShowNode {
   string formatted_str;
 
  protected:
-  TFGraphNodeProto proto_;
+  GraphNodeProto proto_;
 };
 
 class GraphNode : public ShowNode {
@@ -91,8 +91,8 @@ class ShowMultiNode {
   bool ReInit(int64 step, const std::vector<string>& type_regexes);
 
   const string& name() const { return node->name(); }
-  TFMultiGraphNodeProto* mutable_proto();
-  const TFMultiGraphNodeProto& proto() const;
+  MultiGraphNodeProto* mutable_proto();
+  const MultiGraphNodeProto& proto() const;
 
   void AggregateTotalStats(ShowMultiNode* node);
 
@@ -106,7 +106,7 @@ class ShowMultiNode {
   string formatted_str;
 
  protected:
-  TFMultiGraphNodeProto proto_;
+  MultiGraphNodeProto proto_;
 };
 
 class CodeNode : public ShowMultiNode {
@@ -127,4 +127,4 @@ class OpNode : public ShowMultiNode {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_SHOW_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_op.cc b/tensorflow/core/profiler/internal/tfprof_op.cc
similarity index 69%
rename from tensorflow/tools/tfprof/internal/tfprof_op.cc
rename to tensorflow/core/profiler/internal/tfprof_op.cc
index 655569f1a2804b9b6b4e11fa823e519eca947659..46a81da80fd7b92f5b00350b93122251655499a5 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_op.cc
+++ b/tensorflow/core/profiler/internal/tfprof_op.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/tools/tfprof/internal/tfprof_op.h"
+#include "tensorflow/core/profiler/internal/tfprof_op.h"
 
 #include <stdio.h>
 #include <utility>
@@ -21,11 +21,65 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/regexp.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_tensor.h"
+#include "tensorflow/core/profiler/internal/tfprof_constants.h"
+#include "tensorflow/core/profiler/internal/tfprof_tensor.h"
 
 namespace tensorflow {
 namespace tfprof {
+namespace {
+string FormatToalExecTime(const ShowMultiNode* node,
+                          const ShowMultiNode* root) {
+  double accu_pct = 0.0;
+  double pct = 0.0;
+  if (node->proto().total_exec_micros() > 0) {
+    accu_pct = 100.0 * node->proto().total_exec_micros() /
+               root->proto().total_exec_micros();
+    pct =
+        100.0 * node->proto().exec_micros() / root->proto().total_exec_micros();
+  }
+
+  return strings::Printf(
+      "%30s", strings::Printf("%s (%.2f%%, %.2f%%)",
+                              FormatTime(node->proto().exec_micros()).c_str(),
+                              accu_pct, pct)
+                  .c_str());
+}
+string FormatCPUExecTime(const ShowMultiNode* node, const ShowMultiNode* root) {
+  double accu_pct = 0.0;
+  double pct = 0.0;
+  if (node->proto().total_cpu_exec_micros() > 0) {
+    accu_pct = 100.0 * node->proto().total_cpu_exec_micros() /
+               root->proto().total_cpu_exec_micros();
+    pct = 100.0 * node->proto().cpu_exec_micros() /
+          root->proto().total_cpu_exec_micros();
+  }
+
+  return strings::Printf(
+      "%30s",
+      strings::Printf("%s (%.2f%%, %.2f%%)",
+                      FormatTime(node->proto().cpu_exec_micros()).c_str(),
+                      accu_pct, pct)
+          .c_str());
+}
+string FormatAcceleratorExecTime(const ShowMultiNode* node,
+                                 const ShowMultiNode* root) {
+  double accu_pct = 0.0;
+  double pct = 0.0;
+  if (node->proto().total_accelerator_exec_micros() > 0) {
+    accu_pct = 100.0 * node->proto().total_accelerator_exec_micros() /
+               root->proto().total_accelerator_exec_micros();
+    pct = 100.0 * node->proto().accelerator_exec_micros() /
+          root->proto().total_accelerator_exec_micros();
+  }
+
+  return strings::Printf(
+      "%30s", strings::Printf(
+                  "%s (%.2f%%, %.2f%%)",
+                  FormatTime(node->proto().accelerator_exec_micros()).c_str(),
+                  accu_pct, pct)
+                  .c_str());
+}
+}  // namespace
 
 void TFOp::AddNode(TFGraphNode* node) {
   const string& op = node->op();
@@ -72,6 +126,7 @@ const ShowMultiNode* TFOp::ShowInternal(const Options& opts,
   }
   nodes = SortNodes(nodes, opts);
 
+  // pre keeps track of previous visited node.
   OpNode* pre = nullptr;
   std::vector<OpNode*> account_nodes;
   for (auto it = nodes.rbegin(); it != nodes.rend(); ++it) {
@@ -116,16 +171,20 @@ const ShowMultiNode* TFOp::ShowInternal(const Options& opts,
     root_->ResetTotalStats();
     if (pre) {
       root_->AggregateTotalStats(pre);
-      root_->mutable_proto()->add_children()->MergeFrom(pre->proto());
-      pre->mutable_proto()->clear_children();
     }
   }
+  if (pre) {
+    root_->mutable_proto()->add_children()->MergeFrom(pre->proto());
+    pre->mutable_proto()->clear_children();
+  }
 
   if (opts.output_type == kOutput[1] || opts.output_type == kOutput[2]) {
     string display_str = FormatLegend(opts);
     for (OpNode* node : show_nodes) {
       display_str += FormatNode(node, root_.get(), opts);
     }
+    // In op view, we don't show root (total). But it will still in proto.
+    // TODO(xpan): Is it the right choice?
     root_->formatted_str = display_str;
   }
   return root_.get();
@@ -147,7 +206,7 @@ int64 TFOp::SearchRoot(const std::vector<OpNode*> nodes,
   return i;
 }
 
-string TFOp::FormatNode(OpNode* node, OpNode* root, const Options& opts) {
+string TFOp::FormatNode(OpNode* node, OpNode* root, const Options& opts) const {
   std::vector<string> attrs;
 
   if (opts.select.find(kShown[0]) != opts.select.end()) {
@@ -168,22 +227,18 @@ string TFOp::FormatNode(OpNode* node, OpNode* root, const Options& opts) {
   }
 
   if (opts.select.find(kShown[1]) != opts.select.end()) {
-    double accu_pct = 0.0;
-    double pct = 0.0;
-    if (node->proto().total_exec_micros() > 0) {
-      accu_pct = 100.0 * node->proto().total_exec_micros() /
-          root->proto().total_exec_micros();
-      pct = 100.0 * node->proto().exec_micros() /
-          root->proto().total_exec_micros();
-    }
-
-    attrs.push_back(strings::Printf(
-        "%30s", strings::Printf("%s (%.2f%%, %.2f%%)",
-                                FormatTime(node->proto().exec_micros()).c_str(),
-                                accu_pct, pct)
-                    .c_str()));
+    attrs.push_back(FormatToalExecTime(node, root));
+    attrs.push_back(FormatAcceleratorExecTime(node, root));
+    attrs.push_back(FormatCPUExecTime(node, root));
+  }
+  if (opts.select.find(kShown[9]) != opts.select.end() &&
+      opts.select.find(kShown[1]) == opts.select.end()) {
+    attrs.push_back(FormatAcceleratorExecTime(node, root));
+  }
+  if (opts.select.find(kShown[10]) != opts.select.end() &&
+      opts.select.find(kShown[1]) == opts.select.end()) {
+    attrs.push_back(FormatCPUExecTime(node, root));
   }
-
   if (opts.select.find(kShown[2]) != opts.select.end()) {
     double accu_pct = 0.0;
     double pct = 0.0;
@@ -228,9 +283,14 @@ string TFOp::FormatNode(OpNode* node, OpNode* root, const Options& opts) {
   }
 
   if (opts.select.find(kShown[7]) != opts.select.end()) {
+    int64 total_runs = 0;
+    for (const auto& gnode : node->proto().graph_nodes()) {
+      total_runs += gnode.run_count();
+    }
     attrs.push_back(strings::Printf(
         "%10s",
-        strings::Printf("%d", node->proto().graph_nodes_size()).c_str()));
+        strings::Printf("%lld|%d", total_runs, node->proto().graph_nodes_size())
+            .c_str()));
   }
 
   string node_str = strings::Printf("%-25s%s\n", node->name().c_str(),
diff --git a/tensorflow/tools/tfprof/internal/tfprof_op.h b/tensorflow/core/profiler/internal/tfprof_op.h
similarity index 74%
rename from tensorflow/tools/tfprof/internal/tfprof_op.h
rename to tensorflow/core/profiler/internal/tfprof_op.h
index 5b16490363020ba3f18457a297495cb03c46b4c8..9e20f5c3f496e32f474ba612e66d6c657695ac4f 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_op.h
+++ b/tensorflow/core/profiler/internal/tfprof_op.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Build a flat structure of ops.
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_OP_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_OP_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_
 
 #include <deque>
 #include <map>
@@ -28,11 +28,11 @@ limitations under the License.
 #include "tensorflow/c/checkpoint_reader.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_node.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_show_multi.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
-#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/profiler/internal/tfprof_node.h"
+#include "tensorflow/core/profiler/internal/tfprof_options.h"
+#include "tensorflow/core/profiler/internal/tfprof_show_multi.h"
+#include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -56,15 +56,15 @@ class TFOp : public TFMultiShow {
   int64 SearchRoot(const std::vector<OpNode*> nodes,
                    const std::vector<string>& regexes);
 
-  bool ShouldShowIfExtra(ShowMultiNode* node, const Options& opts,
-                         int depth) override {
+  bool ShouldShowIfExtra(const ShowMultiNode* node, const Options& opts,
+                         int depth) const override {
     if (opts.min_occurrence > node->node->graph_nodes().size()) {
       return false;
     }
     return true;
   }
 
-  string FormatNode(OpNode* node, OpNode* root, const Options& opts);
+  string FormatNode(OpNode* node, OpNode* root, const Options& opts) const;
 
   std::unique_ptr<OpNode> root_;
   std::map<string, std::unique_ptr<OpNode>> cnodes_map_;
@@ -74,4 +74,4 @@ class TFOp : public TFMultiShow {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_OP_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_options.cc b/tensorflow/core/profiler/internal/tfprof_options.cc
similarity index 98%
rename from tensorflow/tools/tfprof/internal/tfprof_options.cc
rename to tensorflow/core/profiler/internal/tfprof_options.cc
index 83c275756bfecdd2f3ad56038bbaecf2c0f88e30..53e401dd600b9bb30159257c52167a36757ec2b5 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_options.cc
+++ b/tensorflow/core/profiler/internal/tfprof_options.cc
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/core/profiler/internal/tfprof_options.h"
 
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/tools/tfprof/tfprof_options.pb.h"
+#include "tensorflow/core/profiler/tfprof_options.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
diff --git a/tensorflow/tools/tfprof/internal/tfprof_options.h b/tensorflow/core/profiler/internal/tfprof_options.h
similarity index 88%
rename from tensorflow/tools/tfprof/internal/tfprof_options.h
rename to tensorflow/core/profiler/internal/tfprof_options.h
index d8c172e0a2ca0e7ffc7a7a3f6b4e9aff22ea425c..6d0c213b3d8e99120b357850d8a1376f9fbff8ec 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_options.h
+++ b/tensorflow/core/profiler/internal/tfprof_options.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_OPTIONS_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_OPTIONS_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
 
 #include <set>
 #include <string>
@@ -46,21 +46,23 @@ static const char* const kOptions[] = {
 };
 
 static const char* const kOrderBy[] = {
-    "name", "bytes", "micros", "params", "float_ops", "occurrence",
+    "name",       "bytes",  "micros",    "accelerator_micros",
+    "cpu_micros", "params", "float_ops", "occurrence",
 };
 
 // Append Only.
 // TODO(xpan): As we are adding more fields to be selected, we
 // need to have a way to tell users what fields are available in which view.
 static const char* const kShown[] = {
-    "bytes",  "micros",   "params",     "float_ops",   "tensor_value",
-    "device", "op_types", "occurrence", "input_shapes"};
+    "bytes",     "micros",   "params",     "float_ops",    "tensor_value",
+    "device",    "op_types", "occurrence", "input_shapes", "accelerator_micros",
+    "cpu_micros"};
 
 static const char* const kCmds[] = {
-    "scope", "graph", "code", "op", "set", "help",
+    "scope", "graph", "code", "op", "advise", "set", "help",
 };
 
-static const char* const kOutput[] = {"timeline", "stdout", "file"};
+static const char* const kOutput[] = {"timeline", "stdout", "file", "none"};
 
 static const char* const kTimelineOpts[] = {
     "outfile",
@@ -149,4 +151,4 @@ tensorflow::Status ParseOutput(const string& output_opt, string* output_type,
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_OPTIONS_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_scope.cc b/tensorflow/core/profiler/internal/tfprof_scope.cc
similarity index 95%
rename from tensorflow/tools/tfprof/internal/tfprof_scope.cc
rename to tensorflow/core/profiler/internal/tfprof_scope.cc
index 2b9e56e4517dc4c9292729900250f89bd9ae4e94..0f7e079098fdd3256a1a136e5952af4c6f1f27f7 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_scope.cc
+++ b/tensorflow/core/profiler/internal/tfprof_scope.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/tools/tfprof/internal/tfprof_scope.h"
+#include "tensorflow/core/profiler/internal/tfprof_scope.h"
 
 #include <stdio.h>
 #include <utility>
@@ -22,8 +22,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/regexp.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_tensor.h"
+#include "tensorflow/core/profiler/internal/tfprof_constants.h"
+#include "tensorflow/core/profiler/internal/tfprof_tensor.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -103,10 +103,10 @@ const ShowNode* TFScope::ShowInternal(const Options& opts, Timeline* timeline) {
 }
 
 void TFScope::Format(const std::vector<ScopeNode*> roots, string* display_str,
-                     TFGraphNodeProto* proto) {
+                     GraphNodeProto* proto) {
   for (ScopeNode* node : roots) {
     display_str->append(node->formatted_str);
-    TFGraphNodeProto* child = proto->add_children();
+    GraphNodeProto* child = proto->add_children();
     child->MergeFrom(node->proto());
     Format(node->show_children, display_str, child);
   }
diff --git a/tensorflow/tools/tfprof/internal/tfprof_scope.h b/tensorflow/core/profiler/internal/tfprof_scope.h
similarity index 80%
rename from tensorflow/tools/tfprof/internal/tfprof_scope.h
rename to tensorflow/core/profiler/internal/tfprof_scope.h
index ca5cabe4579f6d932fdeb37d9fd2c6b209791fa4..5e1fa2a32ad17207bd62169d9ad5503d9e6475c1 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_scope.h
+++ b/tensorflow/core/profiler/internal/tfprof_scope.h
@@ -17,8 +17,8 @@ limitations under the License.
 // For example, 'name1/name2' is a child of 'name1'.
 // Stats are aggregated from descendants from ancestors.
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_SCOPE_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_SCOPE_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_
 
 #include <map>
 #include <memory>
@@ -28,11 +28,11 @@ limitations under the License.
 #include "tensorflow/c/checkpoint_reader.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_node.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_show.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
-#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/profiler/internal/tfprof_node.h"
+#include "tensorflow/core/profiler/internal/tfprof_options.h"
+#include "tensorflow/core/profiler/internal/tfprof_show.h"
+#include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -64,7 +64,7 @@ class TFScope : public TFShow {
                                   const Options& opts);
 
   void Format(const std::vector<ScopeNode*> roots, string* display_str,
-              TFGraphNodeProto* proto);
+              GraphNodeProto* proto);
 
   ScopeNode* root_;
   std::vector<std::unique_ptr<NodeDef>> node_defs_;
@@ -74,4 +74,4 @@ class TFScope : public TFShow {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_SCOPE_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show.cc b/tensorflow/core/profiler/internal/tfprof_show.cc
similarity index 74%
rename from tensorflow/tools/tfprof/internal/tfprof_show.cc
rename to tensorflow/core/profiler/internal/tfprof_show.cc
index 517a09f0c74ab0a1eadc2e62810060613591bcca..2828217d3db567a959769a5871b5105f5748ba59 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show.cc
+++ b/tensorflow/core/profiler/internal/tfprof_show.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/tools/tfprof/internal/tfprof_show.h"
+#include "tensorflow/core/profiler/internal/tfprof_show.h"
 
 #include <memory>
 #include <set>
@@ -25,8 +25,10 @@ limitations under the License.
 namespace tensorflow {
 namespace tfprof {
 
-const TFGraphNodeProto& TFShow::Show(const Options& opts) {
-  if (opts.output_type == kOutput[0]) {
+const GraphNodeProto& TFShow::Show(const Options& opts) {
+  if (opts.output_type == kOutput[3]) {
+    return ShowInternal(opts, nullptr)->proto();
+  } else if (opts.output_type == kOutput[0]) {
     Timeline timeline(opts.step, opts.output_options.at(kTimelineOpts[0]));
     return ShowInternal(opts, &timeline)->proto();
   } else if (opts.output_type == kOutput[2]) {
@@ -64,7 +66,8 @@ bool TFShow::LookUpCheckPoint(const string& name,
   return true;
 }
 
-bool TFShow::ShouldShow(ShowNode* node, const Options& opts, int depth) {
+bool TFShow::ShouldShow(const ShowNode* node, const Options& opts,
+                        int depth) const {
   // Always show kTFProfRoot.
   if (node->name() == kTFProfRoot) return true;
 
@@ -72,6 +75,7 @@ bool TFShow::ShouldShow(ShowNode* node, const Options& opts, int depth) {
       node->proto().exec_micros() < opts.min_micros ||
       node->proto().parameters() < opts.min_params ||
       node->proto().float_ops() < opts.min_float_ops ||
+      node->proto().run_count() < opts.min_occurrence ||
       depth > opts.max_depth || !ShouldShowIfExtra(node, opts, depth)) {
     return false;
   }
@@ -96,7 +100,8 @@ bool TFShow::ShouldShow(ShowNode* node, const Options& opts, int depth) {
   return true;
 }
 
-bool TFShow::ShouldTrim(ShowNode* node, const std::vector<string>& regexes) {
+bool TFShow::ShouldTrim(const ShowNode* node,
+                        const std::vector<string>& regexes) const {
   for (const string& regex : regexes) {
     if (RE2::FullMatch(node->name(), regex)) {
       return true;
@@ -121,7 +126,7 @@ bool TFShow::ReAccount(ShowNode* node, const Options& opts) {
   return false;
 }
 
-string TFShow::FormatNode(ShowNode* node, const Options& opts) {
+string TFShow::FormatNode(ShowNode* node, const Options& opts) const {
   std::vector<string> info;
   if (opts.select.find(kShown[2]) != opts.select.end()) {
     const string shape = FormatShapes(node->node->shape());
@@ -156,13 +161,17 @@ string TFShow::FormatNode(ShowNode* node, const Options& opts) {
     info.push_back(memory);
   }
   if (opts.select.find(kShown[1]) != opts.select.end()) {
-    string time = FormatTime(node->proto().total_exec_micros());
-    if (node->account) {
-      time = FormatTime(node->proto().exec_micros()) + "/" + time;
-    } else {
-      time = "--/" + time;
-    }
-    info.push_back(time);
+    info.push_back(FormatTotalExecTime(node, opts));
+    info.push_back(FormatAcceleratorExecTime(node, opts));
+    info.push_back(FormatCPUExecTime(node, opts));
+  }
+  if (opts.select.find(kShown[9]) != opts.select.end() &&
+      opts.select.find(kShown[1]) == opts.select.end()) {
+    info.push_back(FormatAcceleratorExecTime(node, opts));
+  }
+  if (opts.select.find(kShown[10]) != opts.select.end() &&
+      opts.select.find(kShown[1]) == opts.select.end()) {
+    info.push_back(FormatCPUExecTime(node, opts));
   }
   if (opts.select.find(kShown[5]) != opts.select.end()) {
     if (node->proto().devices_size() > 0) {
@@ -173,6 +182,21 @@ string TFShow::FormatNode(ShowNode* node, const Options& opts) {
     const std::set<string>& op_types = node->node->op_types();
     info.push_back(str_util::Join(op_types, "|"));
   }
+  if (opts.select.find(kShown[7]) != opts.select.end()) {
+    string run = FormatNumber(node->proto().total_run_count());
+    if (node->account) {
+      run = FormatNumber(node->proto().run_count()) + "/" + run;
+    } else {
+      run = "--/" + run;
+    }
+    string definition = FormatNumber(node->proto().total_definition_count());
+    if (node->account) {
+      definition = "1/" + definition;
+    } else {
+      definition = "--/" + definition;
+    }
+    info.push_back(run + "|" + definition);
+  }
   if (opts.select.find(kShown[8]) != opts.select.end()) {
     std::vector<string> shape_vec;
     for (const auto& s : node->node->input_shapes()) {
@@ -190,7 +214,7 @@ string TFShow::FormatNode(ShowNode* node, const Options& opts) {
                          str_util::Join(info, ", ").c_str());
 }
 
-string TFShow::FormatLegend(const Options& opts) {
+string TFShow::FormatLegend(const Options& opts) const {
   std::vector<string> legends;
   if (opts.select.find(kShown[2]) != opts.select.end()) {
     legends.push_back("# parameters");
@@ -202,7 +226,17 @@ string TFShow::FormatLegend(const Options& opts) {
     legends.push_back("output bytes");
   }
   if (opts.select.find(kShown[1]) != opts.select.end()) {
-    legends.push_back("execution time");
+    legends.push_back("total execution time");
+    legends.push_back("accelerator execution time");
+    legends.push_back("cpu execution time");
+  }
+  if (opts.select.find(kShown[9]) != opts.select.end() &&
+      opts.select.find(kShown[1]) == opts.select.end()) {
+    legends.push_back("accelerator execution time");
+  }
+  if (opts.select.find(kShown[10]) != opts.select.end() &&
+      opts.select.find(kShown[1]) == opts.select.end()) {
+    legends.push_back("cpu execution time");
   }
   if (opts.select.find(kShown[5]) != opts.select.end()) {
     legends.push_back("assigned devices");
@@ -210,6 +244,9 @@ string TFShow::FormatLegend(const Options& opts) {
   if (opts.select.find(kShown[6]) != opts.select.end()) {
     legends.push_back("op types");
   }
+  if (opts.select.find(kShown[7]) != opts.select.end()) {
+    legends.push_back("op count (run|defined)");
+  }
   if (opts.select.find(kShown[8]) != opts.select.end()) {
     legends.push_back("input shapes");
   }
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show.h b/tensorflow/core/profiler/internal/tfprof_show.h
similarity index 55%
rename from tensorflow/tools/tfprof/internal/tfprof_show.h
rename to tensorflow/core/profiler/internal/tfprof_show.h
index a337a584f7ae6d8b3de91f8633dae39fc5e25be9..2f7e0e62119dbb4f83cb43d1ec0bfbf2b32cb79f 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show.h
+++ b/tensorflow/core/profiler/internal/tfprof_show.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Parent class and utilities for tfprof_graph and tfprof_scope.
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_SHOW_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_SHOW_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_
 
 #include <algorithm>
 #include <string>
@@ -26,14 +26,14 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_node.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_node_show.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_tensor.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
-#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/profiler/internal/tfprof_constants.h"
+#include "tensorflow/core/profiler/internal/tfprof_node.h"
+#include "tensorflow/core/profiler/internal/tfprof_node_show.h"
+#include "tensorflow/core/profiler/internal/tfprof_options.h"
+#include "tensorflow/core/profiler/internal/tfprof_tensor.h"
+#include "tensorflow/core/profiler/internal/tfprof_timeline.h"
+#include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -44,7 +44,7 @@ class TFShow {
   virtual ~TFShow() {}
   virtual void AddNode(TFGraphNode* node) = 0;
   virtual void Build() = 0;
-  const TFGraphNodeProto& Show(const Options& opts);
+  const GraphNodeProto& Show(const Options& opts);
 
  protected:
   virtual const ShowNode* ShowInternal(const Options& opts,
@@ -54,20 +54,21 @@ class TFShow {
                         std::unique_ptr<TFProfTensor>* tensor);
 
   // Overridden by subclass if extra requirements need to be met.
-  virtual bool ShouldShowIfExtra(ShowNode* node, const Options& opts,
-                                 int depth) {
+  virtual bool ShouldShowIfExtra(const ShowNode* node, const Options& opts,
+                                 int depth) const {
     return true;
   }
 
-  bool ShouldShow(ShowNode* node, const Options& opts, int depth);
+  bool ShouldShow(const ShowNode* node, const Options& opts, int depth) const;
 
-  bool ShouldTrim(ShowNode* node, const std::vector<string>& regexes);
+  bool ShouldTrim(const ShowNode* node,
+                  const std::vector<string>& regexes) const;
 
   bool ReAccount(ShowNode* node, const Options& opts);
 
-  string FormatNode(ShowNode* node, const Options& opts);
+  string FormatNode(ShowNode* node, const Options& opts) const;
 
-  string FormatLegend(const Options& opts);
+  string FormatLegend(const Options& opts) const;
 
   template <typename T>
   std::vector<T*> SortNodes(const std::vector<T*>& nodes, const Options& opts) {
@@ -89,8 +90,14 @@ class TFShow {
         return n1->proto().total_exec_micros() >
                n2->proto().total_exec_micros();
       } else if (opts.order_by == kOrderBy[3]) {
-        return n1->proto().total_parameters() > n2->proto().total_parameters();
+        return n1->proto().total_accelerator_exec_micros() >
+               n2->proto().total_accelerator_exec_micros();
       } else if (opts.order_by == kOrderBy[4]) {
+        return n1->proto().total_cpu_exec_micros() >
+               n2->proto().total_cpu_exec_micros();
+      } else if (opts.order_by == kOrderBy[5]) {
+        return n1->proto().total_parameters() > n2->proto().total_parameters();
+      } else if (opts.order_by == kOrderBy[6]) {
         return n1->proto().total_float_ops() > n2->proto().total_float_ops();
       }
       return name_cmp;
@@ -101,7 +108,37 @@ class TFShow {
   checkpoint::CheckpointReader* ckpt_reader_;
 };
 
+template <typename T>
+string FormatTotalExecTime(const T* node, const Options& opts) {
+  string time = FormatTime(node->proto().total_exec_micros());
+  if (node->account) {
+    time = FormatTime(node->proto().exec_micros()) + "/" + time;
+  } else {
+    time = "--/" + time;
+  }
+  return time;
+}
+template <typename T>
+string FormatCPUExecTime(const T* node, const Options& opts) {
+  string time = FormatTime(node->proto().total_cpu_exec_micros());
+  if (node->account) {
+    time = FormatTime(node->proto().cpu_exec_micros()) + "/" + time;
+  } else {
+    time = "--/" + time;
+  }
+  return time;
+}
+template <typename T>
+string FormatAcceleratorExecTime(const T* node, const Options& opts) {
+  string time = FormatTime(node->proto().total_accelerator_exec_micros());
+  if (node->account) {
+    time = FormatTime(node->proto().accelerator_exec_micros()) + "/" + time;
+  } else {
+    time = "--/" + time;
+  }
+  return time;
+}
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_SHOW_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show_multi.cc b/tensorflow/core/profiler/internal/tfprof_show_multi.cc
similarity index 59%
rename from tensorflow/tools/tfprof/internal/tfprof_show_multi.cc
rename to tensorflow/core/profiler/internal/tfprof_show_multi.cc
index 4714ffc33a659c3cf299b889fa150df25e702e31..199aef7685912990a9bbd98033f1c61088b62b2e 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show_multi.cc
+++ b/tensorflow/core/profiler/internal/tfprof_show_multi.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/tools/tfprof/internal/tfprof_show_multi.h"
+#include "tensorflow/core/profiler/internal/tfprof_show_multi.h"
 
 #include <memory>
 #include <set>
@@ -22,13 +22,15 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/regexp.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_scope.h"
+#include "tensorflow/core/profiler/internal/tfprof_scope.h"
 
 namespace tensorflow {
 namespace tfprof {
 
-const TFMultiGraphNodeProto& TFMultiShow::Show(const Options& opts) {
-  if (opts.output_type == kOutput[0]) {
+const MultiGraphNodeProto& TFMultiShow::Show(const Options& opts) {
+  if (opts.output_type == kOutput[3]) {
+    return ShowInternal(opts, nullptr)->proto();
+  } else if (opts.output_type == kOutput[0]) {
     Timeline timeline(opts.step, opts.output_options.at(kTimelineOpts[0]));
     return ShowInternal(opts, &timeline)->proto();
   } else if (opts.output_type == kOutput[2]) {
@@ -48,8 +50,8 @@ const TFMultiGraphNodeProto& TFMultiShow::Show(const Options& opts) {
   }
 }
 
-bool TFMultiShow::ShouldShow(ShowMultiNode* node, const Options& opts,
-                            int depth) {
+bool TFMultiShow::ShouldShow(const ShowMultiNode* node, const Options& opts,
+                             int depth) const {
   // Always show kTFProfRoot.
   if (node->name() == kTFProfRoot) return true;
 
@@ -88,8 +90,8 @@ bool TFMultiShow::ShouldShow(ShowMultiNode* node, const Options& opts,
   return true;
 }
 
-bool TFMultiShow::ShouldTrim(ShowMultiNode* node,
-                            const std::vector<string>& regexes) {
+bool TFMultiShow::ShouldTrim(const ShowMultiNode* node,
+                             const std::vector<string>& regexes) const {
   for (const string& regex : regexes) {
     if (RE2::FullMatch(node->name(), regex)) {
       return true;
@@ -102,13 +104,23 @@ bool TFMultiShow::ReAccount(ShowMultiNode* node, const Options& opts) {
   return node->ReInit(opts.step, opts.account_type_regexes);
 }
 
-string TFMultiShow::FormatLegend(const Options& opts) {
+string TFMultiShow::FormatLegend(const Options& opts) const {
   std::vector<string> legends;
   if (opts.select.find(kShown[0]) != opts.select.end()) {
     legends.push_back("output bytes");
   }
   if (opts.select.find(kShown[1]) != opts.select.end()) {
-    legends.push_back("execution time");
+    legends.push_back("total execution time");
+    legends.push_back("accelerator execution time");
+    legends.push_back("cpu execution time");
+  }
+  if (opts.select.find(kShown[9]) != opts.select.end() &&
+      opts.select.find(kShown[1]) == opts.select.end()) {
+    legends.push_back("accelerator execution time");
+  }
+  if (opts.select.find(kShown[10]) != opts.select.end() &&
+      opts.select.find(kShown[1]) == opts.select.end()) {
+    legends.push_back("cpu execution time");
   }
   if (opts.select.find(kShown[2]) != opts.select.end()) {
     legends.push_back("# parameters");
@@ -123,7 +135,7 @@ string TFMultiShow::FormatLegend(const Options& opts) {
     legends.push_back("op types");
   }
   if (opts.select.find(kShown[7]) != opts.select.end()) {
-    legends.push_back("op occurrence");
+    legends.push_back("op occurrence (run|defined)");
   }
   if (opts.select.find(kShown[8]) != opts.select.end()) {
     legends.push_back("input shapes");
@@ -132,11 +144,11 @@ string TFMultiShow::FormatLegend(const Options& opts) {
                          str_util::Join(legends, " | ").c_str());
 }
 
-string TFMultiShow::FormatInputShapes(const TFMultiGraphNodeProto& proto) {
-  std::map<string, int> input_shapes_str;
-  std::map<string, int> input_time_str;
+string TFMultiShow::FormatInputShapes(const MultiGraphNodeProto& proto) const {
+  // input_shape string -> (static defined count, run count, run_micros)
+  std::map<string, std::tuple<int64, int64, int64>> input_shapes_attr;
   for (int i = 0; i < proto.graph_nodes_size(); ++i) {
-    const TFGraphNodeProto& gnode = proto.graph_nodes(i);
+    const GraphNodeProto& gnode = proto.graph_nodes(i);
     // Convert and sort by input_idx.
     std::map<int, std::vector<int64>> input_shapes;
     for (const auto& inp : gnode.input_shapes()) {
@@ -154,30 +166,58 @@ string TFMultiShow::FormatInputShapes(const TFMultiGraphNodeProto& proto) {
     }
     string shape_type_str = strings::Printf(
         "input_type: %s", str_util::Join(input_vec, ",\t").c_str());
-    input_shapes_str[shape_type_str] += 1;
-    input_time_str[shape_type_str] += gnode.exec_micros();
+    auto t = input_shapes_attr.find(shape_type_str);
+    if (t == input_shapes_attr.end()) {
+      input_shapes_attr.insert(
+          std::make_pair(shape_type_str, std::make_tuple(0, 0, 0)));
+      t = input_shapes_attr.find(shape_type_str);
+    }
+    input_shapes_attr[shape_type_str] = std::make_tuple(
+        std::get<0>(t->second) + 1, std::get<1>(t->second) + gnode.run_count(),
+        std::get<2>(t->second) + gnode.exec_micros());
   }
-  if (input_shapes_str.empty()) {
+  if (input_shapes_attr.empty()) {
     return "";
   }
 
-  std::vector<std::pair<string, int>> shape_count_vec(input_shapes_str.begin(),
-                                                      input_shapes_str.end());
-  std::sort(shape_count_vec.begin(), shape_count_vec.end(),
-            [](const std::pair<const string, int>& a,
-               const std::pair<const string, int>& b) {
-              return a.second > b.second;
-            });
+  std::vector<std::pair<string, std::tuple<int64, int64, int64>>>
+      shape_count_vec(input_shapes_attr.begin(), input_shapes_attr.end());
+  std::sort(
+      shape_count_vec.begin(), shape_count_vec.end(),
+      [](const std::pair<const string, std::tuple<int64, int64, int64>>& a,
+         const std::pair<const string, std::tuple<int64, int64, int64>>& b) {
+        return std::get<1>(a.second) > std::get<1>(b.second);
+      });
 
   std::vector<string> input_types;
   input_types.reserve(shape_count_vec.size());
   for (const auto& s : shape_count_vec) {
-    input_types.push_back(
-        strings::Printf("%s\t(*%d)\texec_time: %s", s.first.c_str(), s.second,
-                        FormatTime(input_time_str[s.first]).c_str()));
+    std::tuple<int64, int64, int64> t = s.second;
+    input_types.push_back(strings::Printf(
+        "%s\t(run*%lld|defined*%lld)\texec_time: %s", s.first.c_str(),
+        std::get<1>(t), std::get<0>(t), FormatTime(std::get<2>(t)).c_str()));
   }
   return str_util::Join(input_types, "\n");
 }
 
+std::vector<string> TFMultiShow::FormatTimes(const ShowMultiNode* node,
+                                             const Options& opts) const {
+  std::vector<string> attrs;
+  if (opts.select.find(kShown[1]) != opts.select.end()) {
+    attrs.push_back(FormatTotalExecTime(node, opts));
+    attrs.push_back(FormatAcceleratorExecTime(node, opts));
+    attrs.push_back(FormatCPUExecTime(node, opts));
+  }
+  if (opts.select.find(kShown[9]) != opts.select.end() &&
+      opts.select.find(kShown[1]) == opts.select.end()) {
+    attrs.push_back(FormatAcceleratorExecTime(node, opts));
+  }
+  if (opts.select.find(kShown[10]) != opts.select.end() &&
+      opts.select.find(kShown[1]) == opts.select.end()) {
+    attrs.push_back(FormatCPUExecTime(node, opts));
+  }
+  return attrs;
+}
+
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show_multi.h b/tensorflow/core/profiler/internal/tfprof_show_multi.h
similarity index 64%
rename from tensorflow/tools/tfprof/internal/tfprof_show_multi.h
rename to tensorflow/core/profiler/internal/tfprof_show_multi.h
index 1181e45ee185a61887dccc38bc29eea5cc5dfff7..f731f6afbb3d3082884a0b3d2a86055199466572 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show_multi.h
+++ b/tensorflow/core/profiler/internal/tfprof_show_multi.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Parent class and utilities for tfprof_code.
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_SHOW_MULTI_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_SHOW_MULTI_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_
 
 #include <algorithm>
 #include <string>
@@ -26,14 +26,15 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_node.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_node_show.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_tensor.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
-#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/profiler/internal/tfprof_constants.h"
+#include "tensorflow/core/profiler/internal/tfprof_node.h"
+#include "tensorflow/core/profiler/internal/tfprof_node_show.h"
+#include "tensorflow/core/profiler/internal/tfprof_options.h"
+#include "tensorflow/core/profiler/internal/tfprof_show.h"
+#include "tensorflow/core/profiler/internal/tfprof_tensor.h"
+#include "tensorflow/core/profiler/internal/tfprof_timeline.h"
+#include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -44,7 +45,7 @@ class TFMultiShow {
   virtual ~TFMultiShow() {}
   virtual void AddNode(TFGraphNode* node) = 0;
   virtual void Build() = 0;
-  const TFMultiGraphNodeProto& Show(const Options& opts);
+  const MultiGraphNodeProto& Show(const Options& opts);
 
  protected:
   virtual const ShowMultiNode* ShowInternal(const Options& opts,
@@ -54,19 +55,23 @@ class TFMultiShow {
                         std::unique_ptr<TFProfTensor>* tensor);
 
   // Overridden by subclass if extra requirements need to be met.
-  virtual bool ShouldShowIfExtra(ShowMultiNode* node, const Options& opts,
-                                 int depth) {
+  virtual bool ShouldShowIfExtra(const ShowMultiNode* node, const Options& opts,
+                                 int depth) const {
     return true;
   }
 
-  bool ShouldShow(ShowMultiNode* node, const Options& opts, int depth);
+  bool ShouldShow(const ShowMultiNode* node, const Options& opts,
+                  int depth) const;
 
-  bool ShouldTrim(ShowMultiNode* node, const std::vector<string>& regexes);
+  bool ShouldTrim(const ShowMultiNode* node,
+                  const std::vector<string>& regexes) const;
 
   bool ReAccount(ShowMultiNode* node, const Options& opts);
 
-  string FormatLegend(const Options& opts);
-  string FormatInputShapes(const TFMultiGraphNodeProto& proto);
+  string FormatLegend(const Options& opts) const;
+  string FormatInputShapes(const MultiGraphNodeProto& proto) const;
+  std::vector<string> FormatTimes(const ShowMultiNode* node,
+                                  const Options& opts) const;
 
   template <typename T>
   std::vector<T*> SortNodes(const std::vector<T*>& nodes, const Options& opts) {
@@ -88,12 +93,18 @@ class TFMultiShow {
                   return n1->proto().total_exec_micros() >
                          n2->proto().total_exec_micros();
                 } else if (opts.order_by == kOrderBy[3]) {
+                  return n1->proto().total_accelerator_exec_micros() >
+                         n2->proto().total_accelerator_exec_micros();
+                } else if (opts.order_by == kOrderBy[4]) {
+                  return n1->proto().total_cpu_exec_micros() >
+                         n2->proto().total_cpu_exec_micros();
+                } else if (opts.order_by == kOrderBy[5]) {
                   return n1->proto().total_parameters() >
                          n2->proto().total_parameters();
-                } else if (opts.order_by == kOrderBy[4]) {
+                } else if (opts.order_by == kOrderBy[6]) {
                   return n1->proto().total_float_ops() >
                          n2->proto().total_float_ops();
-                } else if (opts.order_by == kOrderBy[5]) {
+                } else if (opts.order_by == kOrderBy[7]) {
                   return n1->node->graph_nodes().size() >
                          n2->node->graph_nodes().size();
                 }
@@ -106,4 +117,4 @@ class TFMultiShow {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_SHOW_MULTI_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_show_test.cc b/tensorflow/core/profiler/internal/tfprof_show_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e2ba113e9bdf65e8cb65b0f18e95fbdf6983cdf8
--- /dev/null
+++ b/tensorflow/core/profiler/internal/tfprof_show_test.cc
@@ -0,0 +1,182 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/internal/tfprof_stats.h"
+
+#include <utility>
+
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/profiler/internal/tfprof_constants.h"
+#include "tensorflow/core/profiler/internal/tfprof_options.h"
+#include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/tfprof_output.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+class TFProfShowTest : public ::testing::Test {
+ protected:
+  TFProfShowTest() {
+    string graph_path =
+        io::JoinPath(testing::TensorFlowSrcRoot(),
+                     "core/profiler/internal/testdata/graph.pbtxt");
+    std::unique_ptr<tensorflow::GraphDef> graph_pb(new tensorflow::GraphDef());
+    TF_CHECK_OK(
+        ReadProtoFile(Env::Default(), graph_path, graph_pb.get(), false));
+
+    std::unique_ptr<tensorflow::RunMetadata> run_meta_pb(
+        new tensorflow::RunMetadata());
+    string run_meta_path =
+        io::JoinPath(testing::TensorFlowSrcRoot(),
+                     "core/profiler/internal/testdata/run_meta");
+    TF_CHECK_OK(
+        ReadProtoFile(Env::Default(), run_meta_path, run_meta_pb.get(), true));
+
+    std::unique_ptr<OpLogProto> op_log_pb(new OpLogProto());
+    string op_log_path =
+        io::JoinPath(testing::TensorFlowSrcRoot(),
+                     "core/profiler/internal/testdata/tfprof_log");
+    TF_CHECK_OK(ReadBinaryProto(Env::Default(), op_log_path, op_log_pb.get()));
+
+    string ckpt_path = io::JoinPath(testing::TensorFlowSrcRoot(),
+                                    "core/profiler/internal/testdata/ckpt");
+    TF_Status* status = TF_NewStatus();
+    std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader(
+        new checkpoint::CheckpointReader(ckpt_path, status));
+    CHECK(TF_GetCode(status) == TF_OK);
+    TF_DeleteStatus(status);
+
+    tf_stats_.reset(new TFStats(std::move(graph_pb), std::move(run_meta_pb),
+                                std::move(op_log_pb), std::move(ckpt_reader)));
+    tf_stats_->BuildAllViews();
+  }
+
+  std::unique_ptr<TFStats> tf_stats_;
+};
+
+TEST_F(TFProfShowTest, DumpScopeMode) {
+  string dump_file = io::JoinPath(testing::TmpDir(), "dump");
+  Options opts(5, 0, 0, 0, 0, 0, -1, "name",
+               {"VariableV2"},  // accout_type_regexes
+               {".*"}, {""}, {".*"}, {""}, false,
+               {"params", "bytes", "micros", "float_ops"}, "file",
+               {{"outfile", dump_file}});
+  tf_stats_->ShowGraphNode("scope", opts);
+
+  string dump_str;
+  TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file, &dump_str));
+  EXPECT_EQ(
+      "node name | # parameters | # float_ops | output bytes | total execution "
+      "time | accelerator execution time | cpu execution time\n_TFProfRoot "
+      "(--/370 params, --/0 flops, --/1.48KB, --/5us, --/0us, --/5us)\n  "
+      "conv2d (--/140 params, --/0 flops, --/560B, --/2us, --/0us, --/2us)\n   "
+      " conv2d/bias (5, 5/5 params, 0/0 flops, 20B/20B, 1us/1us, 0us/0us, "
+      "1us/1us)\n    conv2d/kernel (3x3x3x5, 135/135 params, 0/0 flops, "
+      "540B/540B, 1us/1us, 0us/0us, 1us/1us)\n  conv2d_1 (--/230 params, --/0 "
+      "flops, --/920B, --/3us, --/0us, --/3us)\n    conv2d_1/bias (5, 5/5 "
+      "params, 0/0 flops, 20B/20B, 1us/1us, 0us/0us, 1us/1us)\n    "
+      "conv2d_1/kernel (3x3x5x5, 225/225 params, 0/0 flops, 900B/900B, "
+      "2us/2us, 0us/0us, 2us/2us)\n",
+      dump_str);
+}
+
+TEST_F(TFProfShowTest, DumpAcceleratorAndCPUMicros) {
+  string dump_file = io::JoinPath(testing::TmpDir(), "dump");
+  Options opts(
+      5, 0, 0, 0, 0, 0, -1, "cpu_micros", {".*"},  // accout_type_regexes
+      {".*"}, {""}, {".*"}, {""}, false, {"accelerator_micros", "cpu_micros"},
+      "file", {{"outfile", dump_file}});
+  tf_stats_->ShowGraphNode("scope", opts);
+
+  string dump_str;
+  TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file, &dump_str));
+  EXPECT_EQ(
+      "node name | accelerator execution time | cpu execution "
+      "time\n_TFProfRoot (--/0us, --/97us)\n  conv2d (0us/0us, 0us/76us)\n    "
+      "conv2d/convolution (0us/0us, 60us/60us)\n      conv2d/convolution/Shape "
+      "(0us/0us, 0us/0us)\n      conv2d/convolution/dilation_rate (0us/0us, "
+      "0us/0us)\n    conv2d/BiasAdd (0us/0us, 12us/12us)\n    conv2d/bias "
+      "(0us/0us, 1us/2us)\n      conv2d/bias/read (0us/0us, 1us/1us)\n      "
+      "conv2d/bias/Assign (0us/0us, 0us/0us)\n      conv2d/bias/Initializer "
+      "(0us/0us, 0us/0us)\n        conv2d/bias/Initializer/Const (0us/0us, "
+      "0us/0us)\n    conv2d/kernel (0us/0us, 1us/2us)\n      "
+      "conv2d/kernel/read (0us/0us, 1us/1us)\n      conv2d/kernel/Assign "
+      "(0us/0us, 0us/0us)\n      conv2d/kernel/Initializer (0us/0us, "
+      "0us/0us)\n        conv2d/kernel/Initializer/random_uniform (0us/0us, "
+      "0us/0us)\n  conv2d_2 (0us/0us, 0us/15us)\n    conv2d_2/convolution "
+      "(0us/0us, 13us/13us)\n      conv2d_2/convolution/Shape (0us/0us, "
+      "0us/0us)\n      conv2d_2/convolution/dilation_rate (0us/0us, 0us/0us)\n "
+      "   conv2d_2/BiasAdd (0us/0us, 2us/2us)\n  conv2d_1 (0us/0us, 0us/5us)\n "
+      "   conv2d_1/kernel (0us/0us, 2us/3us)\n      conv2d_1/kernel/read "
+      "(0us/0us, 1us/1us)\n      conv2d_1/kernel/Assign (0us/0us, 0us/0us)\n   "
+      "   conv2d_1/kernel/Initializer (0us/0us, 0us/0us)\n        "
+      "conv2d_1/kernel/Initializer/random_uniform (0us/0us, 0us/0us)\n    "
+      "conv2d_1/bias (0us/0us, 1us/2us)\n      conv2d_1/bias/read (0us/0us, "
+      "1us/1us)\n      conv2d_1/bias/Assign (0us/0us, 0us/0us)\n      "
+      "conv2d_1/bias/Initializer (0us/0us, 0us/0us)\n        "
+      "conv2d_1/bias/Initializer/Const (0us/0us, 0us/0us)\n  zeros (0us/0us, "
+      "1us/1us)\n  init (0us/0us, 0us/0us)\n  save (0us/0us, 0us/0us)\n    "
+      "save/Assign (0us/0us, 0us/0us)\n    save/Assign_1 (0us/0us, 0us/0us)\n  "
+      "  save/Assign_2 (0us/0us, 0us/0us)\n    save/Assign_3 (0us/0us, "
+      "0us/0us)\n    save/Const (0us/0us, 0us/0us)\n    save/RestoreV2 "
+      "(0us/0us, 0us/0us)\n      save/RestoreV2/shape_and_slices (0us/0us, "
+      "0us/0us)\n      save/RestoreV2/tensor_names (0us/0us, 0us/0us)\n    "
+      "save/RestoreV2_1 (0us/0us, 0us/0us)\n      "
+      "save/RestoreV2_1/shape_and_slices (0us/0us, 0us/0us)\n      "
+      "save/RestoreV2_1/tensor_names (0us/0us, 0us/0us)\n    save/RestoreV2_2 "
+      "(0us/0us, 0us/0us)\n      save/RestoreV2_2/shape_and_slices (0us/0us, "
+      "0us/0us)\n      save/RestoreV2_2/tensor_names (0us/0us, 0us/0us)\n    "
+      "save/RestoreV2_3 (0us/0us, 0us/0us)\n      "
+      "save/RestoreV2_3/shape_and_slices (0us/0us, 0us/0us)\n      "
+      "save/RestoreV2_3/tensor_names (0us/0us, 0us/0us)\n    save/SaveV2 "
+      "(0us/0us, 0us/0us)\n      save/SaveV2/shape_and_slices (0us/0us, "
+      "0us/0us)\n      save/SaveV2/tensor_names (0us/0us, 0us/0us)\n    "
+      "save/control_dependency (0us/0us, 0us/0us)\n    save/restore_all "
+      "(0us/0us, 0us/0us)\n",
+      dump_str);
+}
+
+TEST_F(TFProfShowTest, DumpOpMode) {
+  string dump_file = io::JoinPath(testing::TmpDir(), "dump");
+  Options opts(
+      5, 0, 0, 0, 0, 4, -1, "params", {".*"},  // accout_type_regexes
+      {".*"}, {""}, {".*"}, {""}, false,
+      {"params", "bytes", "micros", "float_ops", "occurrence", "input_shapes"},
+      "file", {{"outfile", dump_file}});
+  tf_stats_->ShowMultiGraphNode("op", opts);
+
+  string dump_str;
+  TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file, &dump_str));
+  EXPECT_EQ(
+      "nodename|outputbytes|totalexecutiontime|acceleratorexecutiontime|"
+      "cpuexecutiontime|#parameters|#float_ops|opoccurrence(run|defined)|"
+      "inputshapes\nVariableV21.48KB(100.00%,17.10%),5us(100.00%,5.15%),0us(0."
+      "00%,0.00%),5us(100.00%,5.15%),370params(100.00%,100.00%),0float_ops(100."
+      "00%,0.00%),4|4\n\ninput_type:\t(run*4|defined*4)\texec_time:"
+      "5us\n\nAssign0B(0.00%,0.00%),0us(94.85%,0.00%),0us(0.00%,0.00%),0us(94."
+      "85%,0.00%),0params(0.00%,0.00%),0float_ops(100.00%,0.00%),0|8\n\ninput_"
+      "type:0:unknown,\t1:unknown\t(run*0|defined*8)\texec_time:0us\n\nConst1."
+      "54KB(58.87%,17.74%),1us(80.41%,1.03%),0us(0.00%,0.00%),1us(80.41%,1.03%)"
+      ",0params(0.00%,0.00%),0float_ops(98.49%,0.00%),1|24\n\ninput_type:\t("
+      "run*1|defined*24)\texec_time:1us\n\n",
+      StringReplace(dump_str, " ", ""));
+}
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_stats.cc b/tensorflow/core/profiler/internal/tfprof_stats.cc
similarity index 70%
rename from tensorflow/tools/tfprof/internal/tfprof_stats.cc
rename to tensorflow/core/profiler/internal/tfprof_stats.cc
index f5b8dad4e263ed8f55a018d9b5e66efa1332745c..f0db8edd4ab554006989ed3c78457eebc9d95a88 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_stats.cc
+++ b/tensorflow/core/profiler/internal/tfprof_stats.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/tools/tfprof/internal/tfprof_stats.h"
+#include "tensorflow/core/profiler/internal/tfprof_stats.h"
 
 #include <stdio.h>
 #include <utility>
@@ -21,24 +21,25 @@ limitations under the License.
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/regexp.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
+#include "tensorflow/core/profiler/internal/tfprof_timeline.h"
 
 namespace tensorflow {
 namespace tfprof {
 TFStats::TFStats(std::unique_ptr<GraphDef> graph,
                  std::unique_ptr<RunMetadata> run_meta,
-                 std::unique_ptr<OpLog> op_log,
+                 std::unique_ptr<OpLogProto> op_log,
                  std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader)
-    : graph_(std::move(graph)),
+    : has_code_traces_(false),
+      graph_(std::move(graph)),
       ckpt_reader_(std::move(ckpt_reader)) {
   CHECK(graph_) << "Must at least have GraphDef";
 
   printf("Parsing Inputs...\n");
   ParseGraph();
   if (run_meta && run_meta->has_step_stats()) {
-    ParseRunMeta(0, std::move(run_meta));
+    AddRunMeta(0, std::move(run_meta));
   }
-  ParseOpLog(std::move(op_log));
+  AddOpLogProto(std::move(op_log));
 
   if (ckpt_reader_) {
     for (const auto& v : ckpt_reader_->GetVariableToShapeMap()) {
@@ -48,27 +49,48 @@ TFStats::TFStats(std::unique_ptr<GraphDef> graph,
       }
     }
   }
+}
 
-  printf("Preparing Views...\n");
-  scope_view_ = std::unique_ptr<TFScope>(new TFScope(ckpt_reader_.get()));
-  graph_view_ = std::unique_ptr<TFGraph>(new TFGraph(ckpt_reader_.get()));
-  code_view_ = std::unique_ptr<TFCode>(new TFCode());
-  op_view_ = std::unique_ptr<TFOp>(new TFOp());
+void TFStats::BuildView(const string& cmd) {
+  if (cmd == kCmds[0] && !scope_view_) {
+    scope_view_.reset(new TFScope(ckpt_reader_.get()));
+    for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
+      scope_view_->AddNode(it->second.get());
+    }
+    scope_view_->Build();
+  }
+  if (cmd == kCmds[1] && !graph_view_) {
+    graph_view_.reset(new TFGraph(ckpt_reader_.get()));
+    for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
+      graph_view_->AddNode(it->second.get());
+    }
+    graph_view_->Build();
+  }
+  if (cmd == kCmds[2] && !code_view_) {
+    code_view_.reset(new TFCode());
+    for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
+      code_view_->AddNode(it->second.get());
+    }
+    code_view_->Build();
+  }
+  if (cmd == kCmds[3] && !op_view_) {
+    op_view_.reset(new TFOp());
+    for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
+      op_view_->AddNode(it->second.get());
+    }
+    op_view_->Build();
+  }
+}
 
-  for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
-    scope_view_->AddNode(it->second.get());
-    graph_view_->AddNode(it->second.get());
-    code_view_->AddNode(it->second.get());
-    op_view_->AddNode(it->second.get());
-  }
-  scope_view_->Build();
-  graph_view_->Build();
-  code_view_->Build();
-  op_view_->Build();
+void TFStats::BuildAllViews() {
+  std::vector<string> cmds_str(kCmds, kCmds + sizeof(kCmds) / sizeof(*kCmds));
+  for (const string& cmd : cmds_str) {
+    BuildView(cmd);
+  }
 }
 
-const TFGraphNodeProto& TFStats::ShowGraphNode(const string& cmd,
-                                               const Options& opts) {
+const GraphNodeProto& TFStats::ShowGraphNode(const string& cmd,
+                                             const Options& opts) const {
   if (!Validate(opts)) {
     return empty_graph_node_;
   }
@@ -82,8 +104,8 @@ const TFGraphNodeProto& TFStats::ShowGraphNode(const string& cmd,
   }
 }
 
-const TFMultiGraphNodeProto& TFStats::ShowMultiGraphNode(const string& cmd,
-                                                         const Options& opts) {
+const MultiGraphNodeProto& TFStats::ShowMultiGraphNode(
+    const string& cmd, const Options& opts) const {
   if (!Validate(opts)) {
     return empty_multi_graph_node_;
   }
@@ -130,7 +152,7 @@ void TFStats::ParseGraph() {
   }
 }
 
-void TFStats::ParseOpLog(std::unique_ptr<OpLog> op_log) {
+void TFStats::AddOpLogProto(std::unique_ptr<OpLogProto> op_log) {
   if (!op_log) {
     return;
   }
@@ -144,12 +166,13 @@ void TFStats::ParseOpLog(std::unique_ptr<OpLog> op_log) {
       node->second->AddFloatOps(entry.float_ops());
     }
     if (entry.has_code_def()) {
+      has_code_traces_ = true;
       node->second->AddCode(entry.code_def());
     }
   }
 }
 
-void TFStats::ParseRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) {
+void TFStats::AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) {
   if (!run_meta || !run_meta->has_step_stats()) {
     fprintf(stderr, "Invalid RunMetadata for step %lld\n", step);
     return;
@@ -176,7 +199,7 @@ void TFStats::ParseRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) {
   }
 }
 
-bool TFStats::Validate(const Options& opts) {
+bool TFStats::Validate(const Options& opts) const {
   if (opts.step >= 0 && steps_.find(opts.step) == steps_.end()) {
     fprintf(stderr, "Options -step=%lld not found\n", opts.step);
     return false;
@@ -184,9 +207,9 @@ bool TFStats::Validate(const Options& opts) {
   return true;
 }
 
-void TFStats::AddNodeForTest(const string& name,
-                             std::unique_ptr<TFGraphNode> node) {
-  nodes_map_[name] = std::move(node);
+void TFStats::AddNodeForTest(int64 step, std::unique_ptr<TFGraphNode> node) {
+  steps_.insert(step);
+  nodes_map_[node->name()] = std::move(node);
 }
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_stats.h b/tensorflow/core/profiler/internal/tfprof_stats.h
similarity index 57%
rename from tensorflow/tools/tfprof/internal/tfprof_stats.h
rename to tensorflow/core/profiler/internal/tfprof_stats.h
index dfb190e703dc488df251b232b0eb5e44f1c59eb2..8dbab16fe4f1ec1f0f8a6f64a46a5ce46cec2d55 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_stats.h
+++ b/tensorflow/core/profiler/internal/tfprof_stats.h
@@ -20,8 +20,8 @@ limitations under the License.
 // 3. Accept command and options to selectively aggregate stats for analysis
 //    and print out the results.
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_STATS_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_STATS_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_
 
 #include <map>
 #include <memory>
@@ -35,16 +35,16 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/protobuf/config.pb.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_code.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_graph.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_node.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_op.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_scope.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_show.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
-#include "tensorflow/tools/tfprof/tfprof_log.pb.h"
-#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/profiler/internal/tfprof_code.h"
+#include "tensorflow/core/profiler/internal/tfprof_graph.h"
+#include "tensorflow/core/profiler/internal/tfprof_node.h"
+#include "tensorflow/core/profiler/internal/tfprof_op.h"
+#include "tensorflow/core/profiler/internal/tfprof_options.h"
+#include "tensorflow/core/profiler/internal/tfprof_scope.h"
+#include "tensorflow/core/profiler/internal/tfprof_show.h"
+#include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -52,35 +52,46 @@ namespace tfprof {
 class TFStats {
  public:
   TFStats(std::unique_ptr<GraphDef> graph,
-          std::unique_ptr<RunMetadata> run_meta, std::unique_ptr<OpLog> op_log,
+          std::unique_ptr<RunMetadata> run_meta,
+          std::unique_ptr<OpLogProto> op_log,
           std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader);
   ~TFStats() {}
 
   const std::map<string, std::unique_ptr<TFGraphNode>>& nodes() const {
     return nodes_map_;
   }
+  const std::set<int64>& steps() const { return steps_; }
+  bool has_code_traces() const { return has_code_traces_; }
 
+  void BuildView(const string& cmd);
+  void BuildAllViews();
+
+  // Note: Must first BuildView(view_foo) before ShowXXX(view_foo) methods.
+  //
   // Organize the TensorFlow model as different types of views, and generate
   // outputs for profiling.
-  const TFGraphNodeProto& ShowGraphNode(const string& cmd, const Options& opts);
-  const TFMultiGraphNodeProto& ShowMultiGraphNode(const string& cmd,
-                                                  const Options& opts);
+  // TODO(xpan): Should it return reference here?
+  const GraphNodeProto& ShowGraphNode(const string& cmd,
+                                      const Options& opts) const;
+  const MultiGraphNodeProto& ShowMultiGraphNode(const string& cmd,
+                                                const Options& opts) const;
 
   // Add a step of run time meta data.
-  void ParseRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta);
+  void AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta);
   // Add tfprof operation meta data, such as customized op type, float_ops,
   // and code traces.
-  void ParseOpLog(std::unique_ptr<OpLog> op_log);
+  void AddOpLogProto(std::unique_ptr<OpLogProto> op_log);
 
   // For test purpose only.
-  void AddNodeForTest(const string& name, std::unique_ptr<TFGraphNode> node);
+  void AddNodeForTest(int64 step, std::unique_ptr<TFGraphNode> node);
 
  private:
-  bool Validate(const Options& opts);
+  bool Validate(const Options& opts) const;
 
   void ParseGraph();
 
   std::set<int64> steps_;
+  bool has_code_traces_;
   std::unique_ptr<GraphDef> graph_;
   std::unique_ptr<TFScope> scope_view_;
   std::unique_ptr<TFGraph> graph_view_;
@@ -90,11 +101,11 @@ class TFStats {
   // Store TFGraphNode instead of TFGraphNode* to avoid large number of
   // dynamic alloc.
   std::map<string, std::unique_ptr<TFGraphNode>> nodes_map_;
-  TFGraphNodeProto empty_graph_node_;
-  TFMultiGraphNodeProto empty_multi_graph_node_;
+  GraphNodeProto empty_graph_node_;
+  MultiGraphNodeProto empty_multi_graph_node_;
 };
 
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_STATS_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_stats_test.cc b/tensorflow/core/profiler/internal/tfprof_stats_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8744f5be2854d973129b7f7e9e13ba7ad06215cb
--- /dev/null
+++ b/tensorflow/core/profiler/internal/tfprof_stats_test.cc
@@ -0,0 +1,315 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/internal/tfprof_stats.h"
+
+#include <utility>
+
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/profiler/internal/tfprof_constants.h"
+#include "tensorflow/core/profiler/internal/tfprof_options.h"
+#include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/tfprof_output.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+class TFProfStatsTest : public ::testing::Test {
+ protected:
+  TFProfStatsTest() {
+    string graph_path =
+        io::JoinPath(testing::TensorFlowSrcRoot(),
+                     "core/profiler/internal/testdata/graph.pbtxt");
+    std::unique_ptr<tensorflow::GraphDef> graph_pb(new tensorflow::GraphDef());
+    TF_CHECK_OK(
+        ReadProtoFile(Env::Default(), graph_path, graph_pb.get(), false));
+
+    std::unique_ptr<tensorflow::RunMetadata> run_meta_pb(
+        new tensorflow::RunMetadata());
+    string run_meta_path =
+        io::JoinPath(testing::TensorFlowSrcRoot(),
+                     "core/profiler/internal/testdata/run_meta");
+    TF_CHECK_OK(
+        ReadProtoFile(Env::Default(), run_meta_path, run_meta_pb.get(), true));
+
+    std::unique_ptr<OpLogProto> op_log_pb(new OpLogProto());
+    string op_log_path =
+        io::JoinPath(testing::TensorFlowSrcRoot(),
+                     "core/profiler/internal/testdata/tfprof_log");
+    TF_CHECK_OK(ReadBinaryProto(Env::Default(), op_log_path, op_log_pb.get()));
+
+    string ckpt_path = io::JoinPath(testing::TensorFlowSrcRoot(),
+                                    "core/profiler/internal/testdata/ckpt");
+    TF_Status* status = TF_NewStatus();
+    std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader(
+        new checkpoint::CheckpointReader(ckpt_path, status));
+    CHECK(TF_GetCode(status) == TF_OK);
+    TF_DeleteStatus(status);
+
+    tf_stats_.reset(new TFStats(std::move(graph_pb), std::move(run_meta_pb),
+                                std::move(op_log_pb), std::move(ckpt_reader)));
+    tf_stats_->BuildAllViews();
+  }
+
+  std::unique_ptr<TFStats> tf_stats_;
+};
+
+TEST_F(TFProfStatsTest, CustomOpType) {
+  Options opts(3, 0, 0, 0, 0, 0, -1, "name",
+               {kTrainableVarType},  // accout_type_regexes
+               {".*"}, {""}, {".*"}, {""}, false,
+               {"params", "bytes", "micros", "float_ops"}, "", {});
+  const GraphNodeProto& root = tf_stats_->ShowGraphNode("scope", opts);
+
+  GraphNodeProto expected;
+  CHECK(protobuf::TextFormat::ParseFromString(
+      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
+      "0\ntotal_exec_micros: 5\ntotal_requested_bytes: 1480\ntotal_parameters: "
+      "370\nchildren {\n  name: \"conv2d\"\n  exec_micros: 0\n  "
+      "requested_bytes: 0\n  total_exec_micros: 2\n  total_requested_bytes: "
+      "560\n  total_parameters: 140\n  children {\n    name: \"conv2d/bias\"\n "
+      "   exec_micros: 1\n    requested_bytes: 20\n    parameters: 5\n    "
+      "total_exec_micros: 1\n    total_requested_bytes: 20\n    "
+      "total_parameters: 5\n    devices: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n    float_ops: 0\n    "
+      "total_float_ops: 0\n    accelerator_exec_micros: 0\n    "
+      "cpu_exec_micros: 1\n    total_accelerator_exec_micros: 0\n    "
+      "total_cpu_exec_micros: 1\n    run_count: 1\n    total_run_count: 1\n    "
+      "total_definition_count: 1\n  }\n  children {\n    name: "
+      "\"conv2d/kernel\"\n    exec_micros: 1\n    requested_bytes: 540\n    "
+      "parameters: 135\n    total_exec_micros: 1\n    total_requested_bytes: "
+      "540\n    total_parameters: 135\n    devices: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n    float_ops: 0\n    "
+      "total_float_ops: 0\n    accelerator_exec_micros: 0\n    "
+      "cpu_exec_micros: 1\n    total_accelerator_exec_micros: 0\n    "
+      "total_cpu_exec_micros: 1\n    run_count: 1\n    total_run_count: 1\n    "
+      "total_definition_count: 1\n  }\n  float_ops: 0\n  total_float_ops: 0\n  "
+      "accelerator_exec_micros: 0\n  cpu_exec_micros: 0\n  "
+      "total_accelerator_exec_micros: 0\n  total_cpu_exec_micros: 2\n  "
+      "run_count: 0\n  total_run_count: 2\n  total_definition_count: "
+      "3\n}\nchildren {\n  name: \"conv2d_1\"\n  exec_micros: 0\n  "
+      "requested_bytes: 0\n  total_exec_micros: 3\n  total_requested_bytes: "
+      "920\n  total_parameters: 230\n  children {\n    name: "
+      "\"conv2d_1/bias\"\n    exec_micros: 1\n    requested_bytes: 20\n    "
+      "parameters: 5\n    total_exec_micros: 1\n    total_requested_bytes: "
+      "20\n    total_parameters: 5\n    devices: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n    float_ops: 0\n    "
+      "total_float_ops: 0\n    accelerator_exec_micros: 0\n    "
+      "cpu_exec_micros: 1\n    total_accelerator_exec_micros: 0\n    "
+      "total_cpu_exec_micros: 1\n    run_count: 1\n    total_run_count: 1\n    "
+      "total_definition_count: 1\n  }\n  children {\n    name: "
+      "\"conv2d_1/kernel\"\n    exec_micros: 2\n    requested_bytes: 900\n    "
+      "parameters: 225\n    total_exec_micros: 2\n    total_requested_bytes: "
+      "900\n    total_parameters: 225\n    devices: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n    float_ops: 0\n    "
+      "total_float_ops: 0\n    accelerator_exec_micros: 0\n    "
+      "cpu_exec_micros: 2\n    total_accelerator_exec_micros: 0\n    "
+      "total_cpu_exec_micros: 2\n    run_count: 1\n    total_run_count: 1\n    "
+      "total_definition_count: 1\n  }\n  float_ops: 0\n  total_float_ops: 0\n  "
+      "accelerator_exec_micros: 0\n  cpu_exec_micros: 0\n  "
+      "total_accelerator_exec_micros: 0\n  total_cpu_exec_micros: 3\n  "
+      "run_count: 0\n  total_run_count: 2\n  total_definition_count: "
+      "3\n}\nfloat_ops: 0\ntotal_float_ops: 0\naccelerator_exec_micros: "
+      "0\ncpu_exec_micros: 0\ntotal_accelerator_exec_micros: "
+      "0\ntotal_cpu_exec_micros: 5\nrun_count: 0\ntotal_run_count: "
+      "4\ntotal_definition_count: 6\n",
+      &expected));
+  EXPECT_EQ(expected.DebugString(), root.DebugString());
+}
+
+TEST_F(TFProfStatsTest, CheckPointOpType) {
+  Options opts(3, 0, 0, 0, 0, 0, -1, "name",
+               {kCkptVarType},  // accout_type_regexes
+               {".*"}, {""}, {".*"}, {""}, false,
+               {"params", "bytes", "micros", "float_ops"}, "", {});
+  const GraphNodeProto& root = tf_stats_->ShowGraphNode("scope", opts);
+
+  GraphNodeProto expected;
+  CHECK(protobuf::TextFormat::ParseFromString(
+      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
+      "0\ntotal_exec_micros: 5\ntotal_requested_bytes: 1480\ntotal_parameters: "
+      "370\nchildren {\n  name: \"conv2d\"\n  exec_micros: 0\n  "
+      "requested_bytes: 0\n  total_exec_micros: 2\n  total_requested_bytes: "
+      "560\n  total_parameters: 140\n  children {\n    name: \"conv2d/bias\"\n "
+      "   exec_micros: 1\n    requested_bytes: 20\n    parameters: 5\n    "
+      "total_exec_micros: 1\n    total_requested_bytes: 20\n    "
+      "total_parameters: 5\n    devices: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n    float_ops: 0\n    "
+      "total_float_ops: 0\n    accelerator_exec_micros: 0\n    "
+      "cpu_exec_micros: 1\n    total_accelerator_exec_micros: 0\n    "
+      "total_cpu_exec_micros: 1\n    run_count: 1\n    total_run_count: 1\n    "
+      "total_definition_count: 1\n  }\n  children {\n    name: "
+      "\"conv2d/kernel\"\n    exec_micros: 1\n    requested_bytes: 540\n    "
+      "parameters: 135\n    total_exec_micros: 1\n    total_requested_bytes: "
+      "540\n    total_parameters: 135\n    devices: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n    float_ops: 0\n    "
+      "total_float_ops: 0\n    accelerator_exec_micros: 0\n    "
+      "cpu_exec_micros: 1\n    total_accelerator_exec_micros: 0\n    "
+      "total_cpu_exec_micros: 1\n    run_count: 1\n    total_run_count: 1\n    "
+      "total_definition_count: 1\n  }\n  float_ops: 0\n  total_float_ops: 0\n  "
+      "accelerator_exec_micros: 0\n  cpu_exec_micros: 0\n  "
+      "total_accelerator_exec_micros: 0\n  total_cpu_exec_micros: 2\n  "
+      "run_count: 0\n  total_run_count: 2\n  total_definition_count: "
+      "3\n}\nchildren {\n  name: \"conv2d_1\"\n  exec_micros: 0\n  "
+      "requested_bytes: 0\n  total_exec_micros: 3\n  total_requested_bytes: "
+      "920\n  total_parameters: 230\n  children {\n    name: "
+      "\"conv2d_1/bias\"\n    exec_micros: 1\n    requested_bytes: 20\n    "
+      "parameters: 5\n    total_exec_micros: 1\n    total_requested_bytes: "
+      "20\n    total_parameters: 5\n    devices: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n    float_ops: 0\n    "
+      "total_float_ops: 0\n    accelerator_exec_micros: 0\n    "
+      "cpu_exec_micros: 1\n    total_accelerator_exec_micros: 0\n    "
+      "total_cpu_exec_micros: 1\n    run_count: 1\n    total_run_count: 1\n    "
+      "total_definition_count: 1\n  }\n  children {\n    name: "
+      "\"conv2d_1/kernel\"\n    exec_micros: 2\n    requested_bytes: 900\n    "
+      "parameters: 225\n    total_exec_micros: 2\n    total_requested_bytes: "
+      "900\n    total_parameters: 225\n    devices: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n    float_ops: 0\n    "
+      "total_float_ops: 0\n    accelerator_exec_micros: 0\n    "
+      "cpu_exec_micros: 2\n    total_accelerator_exec_micros: 0\n    "
+      "total_cpu_exec_micros: 2\n    run_count: 1\n    total_run_count: 1\n    "
+      "total_definition_count: 1\n  }\n  float_ops: 0\n  total_float_ops: 0\n  "
+      "accelerator_exec_micros: 0\n  cpu_exec_micros: 0\n  "
+      "total_accelerator_exec_micros: 0\n  total_cpu_exec_micros: 3\n  "
+      "run_count: 0\n  total_run_count: 2\n  total_definition_count: "
+      "3\n}\nfloat_ops: 0\ntotal_float_ops: 0\naccelerator_exec_micros: "
+      "0\ncpu_exec_micros: 0\ntotal_accelerator_exec_micros: "
+      "0\ntotal_cpu_exec_micros: 5\nrun_count: 0\ntotal_run_count: "
+      "4\ntotal_definition_count: 6\n",
+      &expected));
+  EXPECT_EQ(expected.DebugString(), root.DebugString());
+}
+
+TEST_F(TFProfStatsTest, TestGraph) {
+  Options opts(100, 0, 10000, 0, 0, 0, -1, "name", {".*"},
+               {"cost.*"},  // start_name_regexes
+               {""}, {".*"}, {""}, false,
+               {"params", "bytes", "micros", "float_ops"}, "", {});
+  const GraphNodeProto& root = tf_stats_->ShowGraphNode("graph", opts);
+
+  GraphNodeProto expected;
+  CHECK(protobuf::TextFormat::ParseFromString(
+      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
+      "0\ntotal_exec_micros: 97\ntotal_requested_bytes: "
+      "8656\ntotal_parameters: 370\nfloat_ops: 0\ntotal_float_ops: "
+      "34360\naccelerator_exec_micros: 0\ncpu_exec_micros: "
+      "0\ntotal_accelerator_exec_micros: 0\ntotal_cpu_exec_micros: "
+      "97\nrun_count: 0\ntotal_run_count: 13\ntotal_definition_count: 60\n",
+      &expected));
+  EXPECT_EQ(expected.DebugString(), root.DebugString());
+}
+
+TEST_F(TFProfStatsTest, TestFloatOps) {
+  Options opts(10, 0, 0, 0, 1, 0, -1, "name", {".*"}, {".*"}, {""}, {".*"},
+               {""}, false, {"float_ops"}, "", {});
+  const GraphNodeProto& root = tf_stats_->ShowGraphNode("scope", opts);
+
+  GraphNodeProto expected;
+  CHECK(protobuf::TextFormat::ParseFromString(
+      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
+      "0\ntotal_exec_micros: 97\ntotal_requested_bytes: "
+      "8656\ntotal_parameters: 370\nchildren {\n  name: \"conv2d/BiasAdd\"\n  "
+      "exec_micros: 12\n  requested_bytes: 1440\n  total_exec_micros: 12\n  "
+      "total_requested_bytes: 1440\n  total_parameters: 0\n  devices: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 360\n  "
+      "total_float_ops: 360\n  input_shapes {\n    key: 0\n    value {\n      "
+      "unknown_rank: true\n    }\n  }\n  input_shapes {\n    key: 1\n    value "
+      "{\n      unknown_rank: true\n    }\n  }\n  accelerator_exec_micros: 0\n "
+      " cpu_exec_micros: 12\n  total_accelerator_exec_micros: 0\n  "
+      "total_cpu_exec_micros: 12\n  run_count: 1\n  total_run_count: 1\n  "
+      "total_definition_count: 1\n}\nchildren {\n  name: "
+      "\"conv2d/convolution\"\n  exec_micros: 60\n  requested_bytes: 1440\n  "
+      "total_exec_micros: 60\n  total_requested_bytes: 1440\n  "
+      "total_parameters: 0\n  devices: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 19440\n  "
+      "total_float_ops: 19440\n  input_shapes {\n    key: 0\n    value {\n     "
+      " unknown_rank: true\n    }\n  }\n  input_shapes {\n    key: 1\n    "
+      "value {\n      unknown_rank: true\n    }\n  }\n  "
+      "accelerator_exec_micros: 0\n  cpu_exec_micros: 60\n  "
+      "total_accelerator_exec_micros: 0\n  total_cpu_exec_micros: 60\n  "
+      "run_count: 1\n  total_run_count: 1\n  total_definition_count: "
+      "3\n}\nchildren {\n  name: \"conv2d_2/BiasAdd\"\n  exec_micros: 2\n  "
+      "requested_bytes: 640\n  total_exec_micros: 2\n  total_requested_bytes: "
+      "640\n  total_parameters: 0\n  devices: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 160\n  "
+      "total_float_ops: 160\n  input_shapes {\n    key: 0\n    value {\n      "
+      "unknown_rank: true\n    }\n  }\n  input_shapes {\n    key: 1\n    value "
+      "{\n      unknown_rank: true\n    }\n  }\n  accelerator_exec_micros: 0\n "
+      " cpu_exec_micros: 2\n  total_accelerator_exec_micros: 0\n  "
+      "total_cpu_exec_micros: 2\n  run_count: 1\n  total_run_count: 1\n  "
+      "total_definition_count: 1\n}\nchildren {\n  name: "
+      "\"conv2d_2/convolution\"\n  exec_micros: 13\n  requested_bytes: 640\n  "
+      "total_exec_micros: 13\n  total_requested_bytes: 640\n  "
+      "total_parameters: 0\n  devices: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 14400\n  "
+      "total_float_ops: 14400\n  input_shapes {\n    key: 0\n    value {\n     "
+      " unknown_rank: true\n    }\n  }\n  input_shapes {\n    key: 1\n    "
+      "value {\n      unknown_rank: true\n    }\n  }\n  "
+      "accelerator_exec_micros: 0\n  cpu_exec_micros: 13\n  "
+      "total_accelerator_exec_micros: 0\n  total_cpu_exec_micros: 13\n  "
+      "run_count: 1\n  total_run_count: 1\n  total_definition_count: "
+      "3\n}\nfloat_ops: 0\ntotal_float_ops: 34360\naccelerator_exec_micros: "
+      "0\ncpu_exec_micros: 0\ntotal_accelerator_exec_micros: "
+      "0\ntotal_cpu_exec_micros: 97\nrun_count: 0\ntotal_run_count: "
+      "13\ntotal_definition_count: 68\n",
+      &expected));
+  EXPECT_EQ(expected.DebugString(), root.DebugString());
+}
+
+TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
+  Options opts(100, 0, 0, 0, 0, 0, -1, "name", {".*"}, {".*"}, {""},
+               {"unit_2_1.*DW"},  // show_name_regexes.
+               {""}, true,        // account_displayed_op_only.
+               {"params"}, "", {});
+  const GraphNodeProto& root = tf_stats_->ShowGraphNode("scope", opts);
+
+  GraphNodeProto expected;
+  CHECK(protobuf::TextFormat::ParseFromString(
+      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
+      "0\ntotal_exec_micros: 0\ntotal_requested_bytes: 0\ntotal_parameters: "
+      "0\nfloat_ops: 0\ntotal_float_ops: 0\naccelerator_exec_micros: "
+      "0\ncpu_exec_micros: 0\ntotal_accelerator_exec_micros: "
+      "0\ntotal_cpu_exec_micros: 0\nrun_count: 0\ntotal_run_count: "
+      "0\ntotal_definition_count: 1\n",
+      &expected));
+  EXPECT_EQ(expected.DebugString(), root.DebugString());
+}
+
+TEST_F(TFProfStatsTest, TestShowTensorValue) {
+  Options opts(10, 0, 0, 0, 0, 0, -1, "name", {".*"}, {".*"}, {""},
+               {"unit_1_0.*gamma"}, {""}, false,
+               {"tensor_value"},  // Show tensor value from checkpoint.
+               "", {});
+  const GraphNodeProto& root = tf_stats_->ShowGraphNode("scope", opts);
+  GraphNodeProto expected;
+  CHECK(protobuf::TextFormat::ParseFromString(
+      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
+      "0\ntotal_exec_micros: 97\ntotal_requested_bytes: "
+      "8656\ntotal_parameters: 370\nfloat_ops: 0\ntotal_float_ops: "
+      "34360\naccelerator_exec_micros: 0\ncpu_exec_micros: "
+      "0\ntotal_accelerator_exec_micros: 0\ntotal_cpu_exec_micros: "
+      "97\nrun_count: 0\ntotal_run_count: 13\ntotal_definition_count: 68\n",
+      &expected));
+  EXPECT_EQ(expected.DebugString(), root.DebugString());
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_tensor.cc b/tensorflow/core/profiler/internal/tfprof_tensor.cc
similarity index 97%
rename from tensorflow/tools/tfprof/internal/tfprof_tensor.cc
rename to tensorflow/core/profiler/internal/tfprof_tensor.cc
index 297258fee11f7d714c771ac4e5976fcafd0ef99e..d8ec086315f13502ffe907e6d9a0686b4546ad79 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_tensor.cc
+++ b/tensorflow/core/profiler/internal/tfprof_tensor.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/tools/tfprof/internal/tfprof_tensor.h"
+#include "tensorflow/core/profiler/internal/tfprof_tensor.h"
 
 namespace tensorflow {
 namespace tfprof {
diff --git a/tensorflow/tools/tfprof/internal/tfprof_tensor.h b/tensorflow/core/profiler/internal/tfprof_tensor.h
similarity index 94%
rename from tensorflow/tools/tfprof/internal/tfprof_tensor.h
rename to tensorflow/core/profiler/internal/tfprof_tensor.h
index 5804837ffb688896f0bb518f7f1f8d97dc37a547..d6c4ae131175f04829600531e4098d80a042a974 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_tensor.h
+++ b/tensorflow/core/profiler/internal/tfprof_tensor.h
@@ -19,8 +19,8 @@ limitations under the License.
 //    is not supported by TensorFlow CheckPointReader library, though it is
 //    supported in current code.
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_TENSOR_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_TENSOR_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_
 
 #include <typeinfo>
 
@@ -28,7 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -118,4 +118,4 @@ class TFProfTensor {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_TENSOR_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc b/tensorflow/core/profiler/internal/tfprof_tensor_test.cc
similarity index 86%
rename from tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc
rename to tensorflow/core/profiler/internal/tfprof_tensor_test.cc
index 3dd721cbcc810c8eee600a9487d7c13f7ba36bc8..50ef82abc919d796e30e45c1b5ab00f655a70624 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_tensor_test.cc
@@ -19,11 +19,11 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/config.pb.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_stats.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
-#include "tensorflow/tools/tfprof/tfprof_log.pb.h"
-#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/profiler/internal/tfprof_options.h"
+#include "tensorflow/core/profiler/internal/tfprof_stats.h"
+#include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -32,15 +32,16 @@ class TFProfTensorTest : public ::testing::Test {
   TFProfTensorTest() {
     string graph_path =
         io::JoinPath(testing::TensorFlowSrcRoot(),
-                     "tools/tfprof/internal/testdata/graph.pbtxt");
+                     "core/profiler/internal/testdata/graph.pbtxt");
     std::unique_ptr<tensorflow::GraphDef> graph_pb(new tensorflow::GraphDef());
-    TF_CHECK_OK(ReadGraphDef(Env::Default(), graph_path, graph_pb.get()));
+    TF_CHECK_OK(
+        ReadProtoFile(Env::Default(), graph_path, graph_pb.get(), false));
 
     std::unique_ptr<tensorflow::RunMetadata> run_meta_pb;
-    std::unique_ptr<OpLog> op_log_pb;
+    std::unique_ptr<OpLogProto> op_log_pb;
 
     string ckpt_path = io::JoinPath(testing::TensorFlowSrcRoot(),
-                                    "tools/tfprof/internal/testdata/ckpt");
+                                    "core/profiler/internal/testdata/ckpt");
     TF_Status* status = TF_NewStatus();
     std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader(
         new checkpoint::CheckpointReader(ckpt_path, status));
@@ -49,6 +50,7 @@ class TFProfTensorTest : public ::testing::Test {
 
     tf_stats_.reset(new TFStats(std::move(graph_pb), std::move(run_meta_pb),
                                 std::move(op_log_pb), std::move(ckpt_reader)));
+    tf_stats_->BuildAllViews();
   }
 
   std::unique_ptr<TFStats> tf_stats_;
@@ -58,9 +60,9 @@ TEST_F(TFProfTensorTest, Basics) {
   Options opts(3, 0, 0, 0, 0, 0, -1, "name", {"VariableV2"}, {".*"}, {""},
                {".*"}, {""}, false, {"tensor_value"},  // show the tensor value.
                "", {});
-  const TFGraphNodeProto& root = tf_stats_->ShowGraphNode("scope", opts);
+  const GraphNodeProto& root = tf_stats_->ShowGraphNode("scope", opts);
 
-  TFGraphNodeProto expected;
+  GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
       "0\ntotal_exec_micros: 0\ntotal_requested_bytes: 0\ntotal_parameters: "
@@ -72,7 +74,10 @@ TEST_F(TFProfTensorTest, Basics) {
       "total_parameters: 5\n    float_ops: 0\n    total_float_ops: 0\n    "
       "tensor_value {\n      dtype: DT_FLOAT\n      value_double: 0\n      "
       "value_double: 0\n      value_double: 0\n      value_double: 0\n      "
-      "value_double: 0\n    }\n  }\n  children {\n    name: "
+      "value_double: 0\n    }\n    accelerator_exec_micros: 0\n    "
+      "cpu_exec_micros: 0\n    total_accelerator_exec_micros: 0\n    "
+      "total_cpu_exec_micros: 0\n    run_count: 0\n    total_run_count: 0\n    "
+      "total_definition_count: 1\n  }\n  children {\n    name: "
       "\"conv2d/kernel\"\n    exec_micros: 0\n    requested_bytes: 0\n    "
       "parameters: 135\n    total_exec_micros: 0\n    total_requested_bytes: "
       "0\n    total_parameters: 135\n    float_ops: 0\n    total_float_ops: "
@@ -143,8 +148,14 @@ TEST_F(TFProfTensorTest, Basics) {
       "value_double: 0.19068\n      value_double: 0.220352\n      "
       "value_double: -0.255741\n      value_double: 0.110853\n      "
       "value_double: 0.146625\n      value_double: 0.167754\n      "
-      "value_double: 0.249554\n    }\n  }\n  float_ops: 0\n  total_float_ops: "
-      "0\n}\nchildren {\n  name: \"conv2d_1\"\n  exec_micros: 0\n  "
+      "value_double: 0.249554\n    }\n    accelerator_exec_micros: 0\n    "
+      "cpu_exec_micros: 0\n    total_accelerator_exec_micros: 0\n    "
+      "total_cpu_exec_micros: 0\n    run_count: 0\n    total_run_count: 0\n    "
+      "total_definition_count: 1\n  }\n  float_ops: 0\n  total_float_ops: 0\n  "
+      "accelerator_exec_micros: 0\n  cpu_exec_micros: 0\n  "
+      "total_accelerator_exec_micros: 0\n  total_cpu_exec_micros: 0\n  "
+      "run_count: 0\n  total_run_count: 0\n  total_definition_count: "
+      "3\n}\nchildren {\n  name: \"conv2d_1\"\n  exec_micros: 0\n  "
       "requested_bytes: 0\n  total_exec_micros: 0\n  total_requested_bytes: "
       "0\n  total_parameters: 230\n  children {\n    name: \"conv2d_1/bias\"\n "
       "   exec_micros: 0\n    requested_bytes: 0\n    parameters: 5\n    "
@@ -152,7 +163,10 @@ TEST_F(TFProfTensorTest, Basics) {
       "total_parameters: 5\n    float_ops: 0\n    total_float_ops: 0\n    "
       "tensor_value {\n      dtype: DT_FLOAT\n      value_double: 0\n      "
       "value_double: 0\n      value_double: 0\n      value_double: 0\n      "
-      "value_double: 0\n    }\n  }\n  children {\n    name: "
+      "value_double: 0\n    }\n    accelerator_exec_micros: 0\n    "
+      "cpu_exec_micros: 0\n    total_accelerator_exec_micros: 0\n    "
+      "total_cpu_exec_micros: 0\n    run_count: 0\n    total_run_count: 0\n    "
+      "total_definition_count: 1\n  }\n  children {\n    name: "
       "\"conv2d_1/kernel\"\n    exec_micros: 0\n    requested_bytes: 0\n    "
       "parameters: 225\n    total_exec_micros: 0\n    total_requested_bytes: "
       "0\n    total_parameters: 225\n    float_ops: 0\n    total_float_ops: "
@@ -268,9 +282,17 @@ TEST_F(TFProfTensorTest, Basics) {
       "value_double: 0.237298\n      value_double: -0.0896481\n      "
       "value_double: -0.0605349\n      value_double: 0.231679\n      "
       "value_double: -0.123842\n      value_double: 0.0858642\n      "
-      "value_double: 0.23111\n      value_double: 0.0491742\n    }\n  }\n  "
-      "float_ops: 0\n  total_float_ops: 0\n}\nfloat_ops: 0\ntotal_float_ops: "
-      "0\n",
+      "value_double: 0.23111\n      value_double: 0.0491742\n    }\n    "
+      "accelerator_exec_micros: 0\n    cpu_exec_micros: 0\n    "
+      "total_accelerator_exec_micros: 0\n    total_cpu_exec_micros: 0\n    "
+      "run_count: 0\n    total_run_count: 0\n    total_definition_count: 1\n  "
+      "}\n  float_ops: 0\n  total_float_ops: 0\n  accelerator_exec_micros: 0\n "
+      " cpu_exec_micros: 0\n  total_accelerator_exec_micros: 0\n  "
+      "total_cpu_exec_micros: 0\n  run_count: 0\n  total_run_count: 0\n  "
+      "total_definition_count: 3\n}\nfloat_ops: 0\ntotal_float_ops: "
+      "0\naccelerator_exec_micros: 0\ncpu_exec_micros: "
+      "0\ntotal_accelerator_exec_micros: 0\ntotal_cpu_exec_micros: "
+      "0\nrun_count: 0\ntotal_run_count: 0\ntotal_definition_count: 6\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 }
diff --git a/tensorflow/tools/tfprof/internal/tfprof_timeline.cc b/tensorflow/core/profiler/internal/tfprof_timeline.cc
similarity index 98%
rename from tensorflow/tools/tfprof/internal/tfprof_timeline.cc
rename to tensorflow/core/profiler/internal/tfprof_timeline.cc
index c98aa940c8c7fbb826fd9c3354cbbe8f0e1f87d0..cfd80b875a5c7e6a2e4d1fd6092c9c8b55d6ff60 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_timeline.cc
+++ b/tensorflow/core/profiler/internal/tfprof_timeline.cc
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
+#include "tensorflow/core/profiler/internal/tfprof_timeline.h"
 
 #include <utility>
 
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/internal/tfprof_utils.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -192,9 +192,6 @@ void Timeline::AllocateTimeNodes(GraphNode* gnode) {
     const TFGraphNode* node = gnode->node;
     for (const auto& kernel_execs : node->op_execs(step_)) {
       const string& device = kernel_execs.first;
-      if (!IsCombinedGPUStream(device) && !IsCPUDevice(device)) {
-        continue;
-      }
 
       if (process_.find(device) == process_.end()) {
         int64 pid = AllocatePID();
@@ -330,7 +327,7 @@ void Timeline::AllocateLanes() {
       int64 start_time = tnode.second->start_micros;
       int64 end_time = tnode.second->start_micros + tnode.second->exec_micros;
       int64 l = -1;
-      for (int i = 0; i < p->lanes.size(); ++i) {
+      for (int64 i = 0; i < p->lanes.size(); ++i) {
         const auto& lane = p->lanes[i];
         l = i;
         for (auto cur_it = lane.rbegin(); cur_it != lane.rend(); ++cur_it) {
diff --git a/tensorflow/tools/tfprof/internal/tfprof_timeline.h b/tensorflow/core/profiler/internal/tfprof_timeline.h
similarity index 95%
rename from tensorflow/tools/tfprof/internal/tfprof_timeline.h
rename to tensorflow/core/profiler/internal/tfprof_timeline.h
index 0bba67066f0c1c253885e28e7242d908a90c615a..6c62d1046faa58cac2f3371b353671f7ac45ed17 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_timeline.h
+++ b/tensorflow/core/profiler/internal/tfprof_timeline.h
@@ -13,15 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_TIMELINE_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_TIMELINE_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_
 
 #include "include/json/json.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/protobuf/config.pb.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_node_show.h"
+#include "tensorflow/core/profiler/internal/tfprof_node_show.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -191,4 +191,4 @@ class Timeline {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_TIMELINE_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_timeline_test.cc b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc
similarity index 81%
rename from tensorflow/tools/tfprof/internal/tfprof_timeline_test.cc
rename to tensorflow/core/profiler/internal/tfprof_timeline_test.cc
index bcf2bf05946beab94d2ece84701d203d46c0e3bc..6842f262c63eec0cbf9edd42b2adc7938a571213 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_timeline_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/tools/tfprof/internal/tfprof_stats.h"
+#include "tensorflow/core/profiler/internal/tfprof_stats.h"
 
 #include <utility>
 
@@ -24,11 +24,11 @@ limitations under the License.
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/config.pb.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
-#include "tensorflow/tools/tfprof/tfprof_log.pb.h"
-#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/profiler/internal/tfprof_constants.h"
+#include "tensorflow/core/profiler/internal/tfprof_options.h"
+#include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -37,20 +37,22 @@ class TFProfTimelineTest : public ::testing::Test {
   TFProfTimelineTest() {
     string graph_path =
         io::JoinPath(testing::TensorFlowSrcRoot(),
-                     "tools/tfprof/internal/testdata/graph.pbtxt");
+                     "core/profiler/internal/testdata/graph.pbtxt");
     std::unique_ptr<tensorflow::GraphDef> graph_pb(new tensorflow::GraphDef());
-    TF_CHECK_OK(ReadGraphDef(Env::Default(), graph_path, graph_pb.get()));
+    TF_CHECK_OK(
+        ReadProtoFile(Env::Default(), graph_path, graph_pb.get(), false));
 
     std::unique_ptr<tensorflow::RunMetadata> run_meta_pb(
         new tensorflow::RunMetadata());
     string run_meta_path =
         io::JoinPath(testing::TensorFlowSrcRoot(),
-                     "tools/tfprof/internal/testdata/run_meta");
+                     "core/profiler/internal/testdata/run_meta");
     TF_CHECK_OK(
-        ReadBinaryProto(Env::Default(), run_meta_path, run_meta_pb.get()));
+        ReadProtoFile(Env::Default(), run_meta_path, run_meta_pb.get(), true));
 
     tf_stats_.reset(new TFStats(std::move(graph_pb), std::move(run_meta_pb),
                                 nullptr, nullptr));
+    tf_stats_->BuildAllViews();
   }
 
   std::unique_ptr<TFStats> tf_stats_;
diff --git a/tensorflow/tools/tfprof/internal/tfprof_utils.cc b/tensorflow/core/profiler/internal/tfprof_utils.cc
similarity index 66%
rename from tensorflow/tools/tfprof/internal/tfprof_utils.cc
rename to tensorflow/core/profiler/internal/tfprof_utils.cc
index 0bc12170125584620c956a56bc683d25017b2a61..464a13f7dfce97e4e5b805a4c78c5dd7088cf1fc 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_utils.cc
+++ b/tensorflow/core/profiler/internal/tfprof_utils.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/internal/tfprof_utils.h"
 
 #include <stdio.h>
 #include <algorithm>
@@ -72,19 +72,6 @@ string StringReplace(const string& str, const string& oldsub,
   return out;
 }
 
-Status ReadGraphDef(Env* env, const string& fname, GraphDef* graph_def) {
-  string out;
-  Status s = ReadFileToString(env, fname, &out);
-  if (!s.ok()) return s;
-  if (protobuf::TextFormat::ParseFromString(out, graph_def)) {
-    return Status();
-  } else if (ReadBinaryProto(tensorflow::Env::Default(), fname, graph_def)
-                 .ok()) {
-    return Status();
-  }
-  return errors::InvalidArgument("Cannot parse proto string.");
-}
-
 namespace {
 string StripQuote(const string& s) {
   int start = s.find_first_not_of("\"\'");
@@ -273,80 +260,24 @@ tensorflow::Status ParseCmdLine(const string& line, string* cmd,
 
 void PrintHelp() {
   printf(
-      "\nSee go/tfprof for detail tutorial.\n"
-      "\nCommands\n\n"
-      "  scope: Each op has its op name in TensorFlow, such as 'n1', 'n1/n2', "
-      "'n1/n2/n3'. 'n1/n2' is a child of 'n1'. 'scope' command builds "
-      "a name scope tree and aggregates statistics based on it.\n\n"
-      "  graph: ops in TensorFlow are organized as a graph based on their "
-      "the source (inputs) and sink (outputs). 'graph' command builds "
-      "a graph pointing *from output to input*, and aggregates "
-      "statistics based on it.\n\n"
-      "  set: Set options that will be default for follow up commands.\n\n"
-      "  help: Show helps.\n"
-      "\nOptions\n\n"
-      "Press Enter in CLI to see default option values.\n\n"
-      "  -max_depth: Show ops that are at most this number of hops from "
-      "starting op in the tree/graph structure.\n\n"
-      "  -min_bytes: Show ops that request at least this number of bytes.\n\n"
-      "  -min_micros: Show ops that spend at least this number of micros to "
-      "run.\n\n"
-      "  -min_params: Show ops that contains at least this number of "
-      "parameters.\n\n"
-      "  -min_float_ops: Show ops that contain at least this number of "
-      "float operations. Only available if an op has "
-      "op.RegisterStatistics() defined and OpLog is "
-      "provided\n\n"
-      "  -min_occurrence: Show the op types that are at least used this number "
-      "of times. Only available in op view.\n\n"
-      "  -step: Show the stats of a step when multiple steps of "
-      "RunMetadata were added. By default (-1), show the average of all steps."
-      "  -order_by: Order the results by [name|depth|bytes|micros|params|"
-      "float_ops]\n\n"
-      "  -account_type_regexes: Account and display the ops whose types match "
-      "one of the type regexes specified. tfprof "
-      "allow user to define extra op types for ops "
-      "through tensorflow.tfprof.OpLog proto. regexes "
-      "are comma-sperated.\n\n"
-      "  -start_name_regexes: Show ops starting from the ops that matches the "
-      "regexes, recursively. regexes are "
-      "comma-separated.\n\n"
-      "  -trim_name_regexes: Hide ops starting from the ops that matches the "
-      "regexes, recursively, regexes are comma-seprated. "
-      "\n\n"
-      "  -show_name_regexes: Show ops that match the regexes. regexes are "
-      "comma-seprated.\n\n"
-      "  -hide_name_regexes: Hide ops that match the regexes. regexes are "
-      "comma-seprated.\n\n"
-      ""
-      "  Notes: For each op, -acount_type_regexes is first evaluated, "
-      "only ops with types matching the specified regexes are accounted and "
-      "selected for displayed. -start/trim/show/hide_name_regexes are used "
-      "to further filter ops for display. -start_name_regexes is evaluated "
-      "first to search the starting ops to display. Descendants of starting "
-      "ops are then evaluated against show/hide_name_regexes to make display "
-      "decision. If an op matches trim_name_regexes, all its descendants are "
-      "hidden.\n"
-      "Ops statistics are *accounted even if they are hidden* as long as "
-      "they match the -account_xxx options.\n\n"
-      "  -account_displayed_op_only: If True, only account the statistics of "
-      "ops eventually displayed. If False, account all "
-      "op statistics matching -account_type_regexes recursively.\n\n"
-      "  -select: Comma-separated list of metrics to show: [bytes|micros|"
-      "params|float_ops|tensor_value|device|op_types]."
-      "\n\n"
-      "  -dump_to_file: Dump the output to a file, instead of terminal.\n\n"
-      ""
-      "Examples\n"
-      "  Assuming a toy model:\n"
-      "    intput(typeB)->conv2d_1(typeA)->conv2d_2(typeA)->"
-      "fc(typeA)->cost(typeA)->summarize(typeC)\n"
-      "  Command:\n"
-      "    tfprof> graph -account_type_regexes typeA -start_name_regexes "
-      "cost.* -show_name_regexes conv2d.* -max_depth 10\n\n"
-      "  The above command only aggregate statistics of all ops of typeA ("
-      "hence ignoring input(typeB)). It will start looking for candidate to "
-      "display from cost.* and finally displays conv2d_1 and conv2d_2.\n\n");
+      "See https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/"
+      "README.md for profiler tutorial.\n");
+  printf(
+      "See https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/"
+      "g3doc/command_line.md for command line tool tutorial.\n");
+  printf(
+      "profiler --graph_path=<GraphDef proto file>  # required\n"
+      "         --run_meta_patn=<RunMetadata proto file>  # optional\n"
+      "         --run_log_path=<OpLogProto proto file>  # optional\n\n");
+  printf(
+      "\nCommands:\n"
+      "  scope: Organize profiles based on name scopes.\n"
+      "  graph: Organize profiles based on graph node input/output.\n"
+      "  op: Organize profiles based on operation type.\n"
+      "  code: Organize profiles based on python codes (need op_log_path).\n"
+      "  advise: Auto-profile and advise.\n"
+      "  set: Set options that will be default for follow up commands.\n"
+      "  help: Show helps.\n");
   fflush(stdout);
 }
 
diff --git a/tensorflow/tools/tfprof/internal/tfprof_utils.h b/tensorflow/core/profiler/internal/tfprof_utils.h
similarity index 56%
rename from tensorflow/tools/tfprof/internal/tfprof_utils.h
rename to tensorflow/core/profiler/internal/tfprof_utils.h
index afa7a58acd399ca8ebcf85c3d839a9fb814086b8..3407517ce01bbccd5fd82b03f9251fef5015c461 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_utils.h
+++ b/tensorflow/core/profiler/internal/tfprof_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_UTILS_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_
 
 #include <string>
 #include <vector>
@@ -22,7 +22,8 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/env.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/profiler/internal/tfprof_options.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -40,11 +41,32 @@ tensorflow::Status ParseCmdLine(const string& line, string* cmd,
 string StringReplace(const string& str, const string& oldsub,
                      const string& newsub);
 
-Status ReadGraphDef(Env* env, const string& fname, GraphDef* graph_def);
+template <typename T>
+Status ReadProtoFile(Env* env, const string& fname, T* proto,
+                     bool binary_first) {
+  string out;
+  Status s = ReadFileToString(env, fname, &out);
+  if (!s.ok()) return s;
+
+  if (binary_first) {
+    if (ReadBinaryProto(tensorflow::Env::Default(), fname, proto).ok()) {
+      return Status();
+    } else if (protobuf::TextFormat::ParseFromString(out, proto)) {
+      return Status();
+    }
+  } else {
+    if (protobuf::TextFormat::ParseFromString(out, proto)) {
+      return Status();
+    } else if (ReadBinaryProto(tensorflow::Env::Default(), fname, proto).ok()) {
+      return Status();
+    }
+  }
+  return errors::InvalidArgument("Cannot parse proto file.");
+}
 
 void PrintHelp();
 
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_UTILS_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_
diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ade478367e981c260fadc71ad306327a76acde5e
--- /dev/null
+++ b/tensorflow/core/profiler/profiler.cc
@@ -0,0 +1,274 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "linenoise.h"
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/core/profiler/internal/advisor/tfprof_advisor.h"
+#include "tensorflow/core/profiler/internal/tfprof_options.h"
+#include "tensorflow/core/profiler/internal/tfprof_stats.h"
+#include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_log.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+void completion(const char* buf, linenoiseCompletions* lc) {
+  string buf_str = buf;
+  if (buf_str.find(" ") == buf_str.npos) {
+    for (const char* opt : kCmds) {
+      if (string(opt).find(buf_str) == 0) {
+        linenoiseAddCompletion(lc, opt);
+      }
+    }
+    return;
+  }
+
+  string prefix;
+  int last_dash = buf_str.find_last_of(' ');
+  if (last_dash != string::npos) {
+    prefix = buf_str.substr(0, last_dash + 1);
+    buf_str = buf_str.substr(last_dash + 1, kint32max);
+  }
+  for (const char* opt : kOptions) {
+    if (string(opt).find(buf_str) == 0) {
+      linenoiseAddCompletion(lc, (prefix + opt).c_str());
+    }
+  }
+}
+
+int Run(int argc, char** argv) {
+  string FLAGS_graph_path = "";
+  string FLAGS_run_meta_path = "";
+  string FLAGS_op_log_path = "";
+  string FLAGS_checkpoint_path = "";
+  int32 FLAGS_max_depth = 10;
+  int64 FLAGS_min_bytes = 0;
+  int64 FLAGS_min_micros = 0;
+  int64 FLAGS_min_params = 0;
+  int64 FLAGS_min_float_ops = 0;
+  int64 FLAGS_min_occurrence = 0;
+  int64 FLAGS_step = -1;
+  string FLAGS_order_by = "name";
+  string FLAGS_account_type_regexes = ".*";
+  string FLAGS_start_name_regexes = ".*";
+  string FLAGS_trim_name_regexes = "";
+  string FLAGS_show_name_regexes = ".*";
+  string FLAGS_hide_name_regexes;
+  bool FLAGS_account_displayed_op_only = false;
+  string FLAGS_select = "micros";
+  string FLAGS_output = "";
+  for (int i = 0; i < argc; i++) {
+    fprintf(stderr, "%s\n", argv[i]);
+  }
+
+  std::vector<Flag> flag_list = {
+      Flag("graph_path", &FLAGS_graph_path, "GraphDef proto text file name"),
+      Flag("run_meta_path", &FLAGS_run_meta_path,
+           "Comma-separated list of RunMetadata proto binary "
+           "files. Each file is given step number 0,1,2,etc"),
+      Flag("op_log_path", &FLAGS_op_log_path,
+           "tensorflow::tfprof::OpLogProto proto binary file name"),
+      Flag("checkpoint_path", &FLAGS_checkpoint_path,
+           "TensorFlow Checkpoint file name"),
+      Flag("max_depth", &FLAGS_max_depth, "max depth"),
+      Flag("min_bytes", &FLAGS_min_bytes, "min_bytes"),
+      Flag("min_micros", &FLAGS_min_micros, "min micros"),
+      Flag("min_params", &FLAGS_min_params, "min params"),
+      Flag("min_float_ops", &FLAGS_min_float_ops, "min float ops"),
+      Flag("min_occurrence", &FLAGS_min_occurrence, "min occurrence"),
+      Flag("step", &FLAGS_step,
+           "The stats of which step to use. By default average"),
+      Flag("order_by", &FLAGS_order_by, "order by"),
+      Flag("account_type_regexes", &FLAGS_start_name_regexes,
+           "start name regexes"),
+      Flag("trim_name_regexes", &FLAGS_trim_name_regexes, "trim name regexes"),
+      Flag("show_name_regexes", &FLAGS_show_name_regexes, "show name regexes"),
+      Flag("hide_name_regexes", &FLAGS_hide_name_regexes, "hide name regexes"),
+      Flag("account_displayed_op_only", &FLAGS_account_displayed_op_only,
+           "account displayed op only"),
+      Flag("select", &FLAGS_select, "select"),
+      Flag("output", &FLAGS_output, "output"),
+  };
+  string usage = Flags::Usage(argv[0], flag_list);
+  bool parse_ok = Flags::Parse(&argc, argv, flag_list);
+  if (!parse_ok) {
+    printf("%s", usage.c_str());
+    return (2);
+  }
+  port::InitMain(argv[0], &argc, &argv);
+
+  std::vector<string> account_type_regexes =
+      str_util::Split(FLAGS_account_type_regexes, ',', str_util::SkipEmpty());
+  std::vector<string> start_name_regexes =
+      str_util::Split(FLAGS_start_name_regexes, ',', str_util::SkipEmpty());
+  std::vector<string> trim_name_regexes =
+      str_util::Split(FLAGS_trim_name_regexes, ',', str_util::SkipEmpty());
+  std::vector<string> show_name_regexes =
+      str_util::Split(FLAGS_show_name_regexes, ',', str_util::SkipEmpty());
+  std::vector<string> hide_name_regexes =
+      str_util::Split(FLAGS_hide_name_regexes, ',', str_util::SkipEmpty());
+  std::vector<string> select =
+      str_util::Split(FLAGS_select, ',', str_util::SkipEmpty());
+
+  string output_type;
+  std::map<string, string> output_options;
+  Status s = ParseOutput(FLAGS_output, &output_type, &output_options);
+  CHECK(s.ok()) << s.ToString();
+
+  string cmd = "";
+  if (argc == 1 && FLAGS_graph_path.empty()) {
+    PrintHelp();
+    return 0;
+  } else if (argc > 1) {
+    if (string(argv[1]) == kCmds[6]) {
+      PrintHelp();
+      return 0;
+    }
+    if (string(argv[1]) == kCmds[0] || string(argv[1]) == kCmds[1] ||
+        string(argv[1]) == kCmds[2] || string(argv[1]) == kCmds[3] ||
+        string(argv[1]) == kCmds[4]) {
+      cmd = argv[1];
+    }
+  }
+
+  printf("Reading Files...\n");
+  std::unique_ptr<GraphDef> graph(new GraphDef());
+  TF_CHECK_OK(
+      ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false));
+
+  std::unique_ptr<OpLogProto> op_log(new OpLogProto());
+  if (!FLAGS_op_log_path.empty()) {
+    string op_log_str;
+    s = ReadFileToString(Env::Default(), FLAGS_op_log_path, &op_log_str);
+    if (!s.ok()) {
+      fprintf(stderr, "Failed to read op_log_path: %s\n", s.ToString().c_str());
+      return 1;
+    }
+    if (!ParseProtoUnlimited(op_log.get(), op_log_str)) {
+      fprintf(stderr, "Failed to parse op_log_path\n");
+      return 1;
+    }
+  }
+
+  std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader;
+  TF_Status* status = TF_NewStatus();
+  if (!FLAGS_checkpoint_path.empty()) {
+    ckpt_reader.reset(
+        new checkpoint::CheckpointReader(FLAGS_checkpoint_path, status));
+    if (TF_GetCode(status) != TF_OK) {
+      fprintf(stderr, "%s\n", TF_Message(status));
+      TF_DeleteStatus(status);
+      return 1;
+    }
+    TF_DeleteStatus(status);
+  }
+
+  TFStats tf_stat(std::move(graph), nullptr, std::move(op_log),
+                  std::move(ckpt_reader));
+
+  std::vector<string> run_meta_files =
+      str_util::Split(FLAGS_run_meta_path, ',', str_util::SkipEmpty());
+  for (int i = 0; i < run_meta_files.size(); ++i) {
+    std::unique_ptr<RunMetadata> run_meta(new RunMetadata());
+    s = ReadProtoFile(Env::Default(), run_meta_files[i], run_meta.get(), true);
+    if (!s.ok()) {
+      fprintf(stderr, "Failed to read run_meta_path %s. Status: %s\n",
+              run_meta_files[i].c_str(), s.ToString().c_str());
+      return 1;
+    }
+    tf_stat.AddRunMeta(i, std::move(run_meta));
+  }
+
+  if (cmd == kCmds[4]) {
+    tf_stat.BuildAllViews();
+    Advisor(&tf_stat).Advise(Advisor::DefaultOptions());
+    return 0;
+  }
+
+  Options opts(FLAGS_max_depth, FLAGS_min_bytes, FLAGS_min_micros,
+               FLAGS_min_params, FLAGS_min_float_ops, FLAGS_min_occurrence,
+               FLAGS_step, FLAGS_order_by, account_type_regexes,
+               start_name_regexes, trim_name_regexes, show_name_regexes,
+               hide_name_regexes, FLAGS_account_displayed_op_only, select,
+               output_type, output_options);
+
+  if (cmd == kCmds[2] || cmd == kCmds[3]) {
+    tf_stat.BuildView(cmd);
+    tf_stat.ShowMultiGraphNode(cmd, opts);
+    return 0;
+  } else if (cmd == kCmds[0] || cmd == kCmds[1]) {
+    tf_stat.BuildView(cmd);
+    tf_stat.ShowGraphNode(cmd, opts);
+    return 0;
+  }
+
+  linenoiseSetCompletionCallback(completion);
+  linenoiseHistoryLoad(".tfprof_history.txt");
+
+  for (char* line = nullptr; (line = linenoise("tfprof> ")) != nullptr;) {
+    string line_s = line;
+    free(line);
+
+    if (line_s.empty()) {
+      printf("%s", opts.ToString().c_str());
+      continue;
+    }
+    linenoiseHistoryAdd(line_s.c_str());
+    linenoiseHistorySave(".tfprof_history.txt");
+
+    Options new_opts = opts;
+    Status s = ParseCmdLine(line_s, &cmd, &new_opts);
+    if (!s.ok()) {
+      fprintf(stderr, "E: %s\n", s.ToString().c_str());
+      continue;
+    }
+    if (cmd == kCmds[5]) {
+      opts = new_opts;
+    } else if (cmd == kCmds[6]) {
+      PrintHelp();
+    } else if (cmd == kCmds[2] || cmd == kCmds[3]) {
+      tf_stat.BuildView(cmd);
+      tf_stat.ShowMultiGraphNode(cmd, new_opts);
+    } else if (cmd == kCmds[0] || cmd == kCmds[1]) {
+      tf_stat.BuildView(cmd);
+      tf_stat.ShowGraphNode(cmd, new_opts);
+    } else if (cmd == kCmds[4]) {
+      tf_stat.BuildAllViews();
+      Advisor(&tf_stat).Advise(Advisor::DefaultOptions());
+    }
+  }
+  return 0;
+}
+}  // namespace tfprof
+}  // namespace tensorflow
+
+int main(int argc, char** argv) { return tensorflow::tfprof::Run(argc, argv); }
diff --git a/tensorflow/tools/tfprof/tfprof_log.proto b/tensorflow/core/profiler/tfprof_log.proto
similarity index 70%
rename from tensorflow/tools/tfprof/tfprof_log.proto
rename to tensorflow/core/profiler/tfprof_log.proto
index 5c47142e0ab6e3f647d869016a8ab4f9f9eb9e99..048449b4371d12e855bd5181227f2b62615fadde 100644
--- a/tensorflow/tools/tfprof/tfprof_log.proto
+++ b/tensorflow/core/profiler/tfprof_log.proto
@@ -1,4 +1,4 @@
-syntax = "proto2";
+syntax = "proto3";
 
 package tensorflow.tfprof;
 
@@ -6,27 +6,27 @@ package tensorflow.tfprof;
 message CodeDef {
   repeated Trace traces = 1;
   message Trace {
-    optional string file = 1;
-    optional int32 lineno = 2;
-    optional string function = 3;
-    optional string line = 4;
+    string file = 1;
+    int32 lineno = 2;
+    string function = 3;
+    string line = 4;
   }
 }
 
 message OpLogEntry {
   // op name.
-  optional string name = 1;
+  string name = 1;
   // float_ops is filled by tfprof Python API when called. It requires the
   // op has RegisterStatistics defined. Currently, Conv2D, MatMul, etc, are
   // implemented.
-  optional int64 float_ops = 2;
+  int64 float_ops = 2;
   // User can define extra op type information for an op. This allows the user
   // to select a group of ops precisely using op_type as a key.
   repeated string types = 3;
   // Used to support tfprof "code" view.
-  optional CodeDef code_def = 4;
+  CodeDef code_def = 4;
 }
 
-message OpLog {
+message OpLogProto {
   repeated OpLogEntry log_entries = 1;
-}
\ No newline at end of file
+}
diff --git a/tensorflow/core/profiler/tfprof_options.proto b/tensorflow/core/profiler/tfprof_options.proto
new file mode 100644
index 0000000000000000000000000000000000000000..58828330398ed67d73c4b4b354a6db14de4af818
--- /dev/null
+++ b/tensorflow/core/profiler/tfprof_options.proto
@@ -0,0 +1,34 @@
+syntax = "proto3";
+
+package tensorflow.tfprof;
+
+// Refers to tfprof_options.h/cc for documentation.
+// Only used to pass tfprof options from Python to C++.
+message OptionsProto {
+  int64 max_depth = 1;
+  int64 min_bytes = 2;
+  int64 min_micros = 3;
+  int64 min_params = 4;
+  int64 min_float_ops = 5;
+  int64 min_occurrence = 17;
+  int64 step = 18;
+
+  string order_by = 7;
+  repeated string account_type_regexes = 8;
+  repeated string start_name_regexes = 9;
+  repeated string trim_name_regexes = 10;
+  repeated string show_name_regexes = 11;
+  repeated string hide_name_regexes = 12;
+  bool account_displayed_op_only = 13;
+  repeated string select = 14;
+  string output = 15;
+  string dump_to_file = 16;
+}
+
+message AdvisorOptionsProto {
+  // checker name -> a dict of key-value options.
+  map<string, CheckerOption> checkers = 1;
+  message CheckerOption {
+    map<string, string> options = 1;
+  }
+}
diff --git a/tensorflow/tools/tfprof/tfprof_output.proto b/tensorflow/core/profiler/tfprof_output.proto
similarity index 50%
rename from tensorflow/tools/tfprof/tfprof_output.proto
rename to tensorflow/core/profiler/tfprof_output.proto
index d00e93c939f99927c54c4b7a7d0df315218fd806..5c9f132243ad79935638fc7328975c703d8d4141 100644
--- a/tensorflow/tools/tfprof/tfprof_output.proto
+++ b/tensorflow/core/profiler/tfprof_output.proto
@@ -1,4 +1,4 @@
-syntax = "proto2";
+syntax = "proto3";
 
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/types.proto";
@@ -6,7 +6,7 @@ import "tensorflow/core/framework/types.proto";
 package tensorflow.tfprof;
 
 message TFProfTensorProto {
-  optional DataType dtype = 1;
+  DataType dtype = 1;
   // Flatten tensor in row-major.
   // Only one of the following array is set.
   repeated double value_double = 2;
@@ -15,30 +15,42 @@ message TFProfTensorProto {
 }
 
 // A node in TensorFlow graph. Used by scope/graph view.
-message TFGraphNodeProto {
+message GraphNodeProto {
   // op name.
-  optional string name = 1;
+  string name = 1;
   // tensor value restored from checkpoint.
-  optional TFProfTensorProto tensor_value = 15;
+  TFProfTensorProto tensor_value = 15;
   // op execution time.
-  optional int64 exec_micros = 2;
+  // A node can be defined once but run multiple times in tf.while_loop.
+  // the times sum up all different runs.
+  int64 run_count = 21;
+  int64 exec_micros = 2;
+  int64 accelerator_exec_micros = 17;
+  int64 cpu_exec_micros = 18;
+
   // Total requested bytes by the op.
-  optional int64 requested_bytes = 3;
+  int64 requested_bytes = 3;
   // Number of parameters if available.
-  optional int64 parameters = 4;
+  int64 parameters = 4;
   // Number of float operations.
-  optional int64 float_ops = 13;
+  int64 float_ops = 13;
   // Device the op is assigned to.
   // Since an op can fire multiple kernel calls, there can be multiple devices.
   repeated string devices = 10;
 
-  // The following are the aggregated stats from all accounted children and
-  // the node itself. The actual children depend on the data structure used
-  // (scope, graph).
-  optional int64 total_exec_micros = 6;
-  optional int64 total_requested_bytes = 7;
-  optional int64 total_parameters = 8;
-  optional int64 total_float_ops = 14;
+  // The following are the aggregated stats from all *accounted* children and
+  // the node itself. The actual children depend on the data structure used.
+  // In graph view, children are inputs recursively.
+  // In scope view, children are nodes under the name scope.
+  int64 total_definition_count = 23;
+  int64 total_run_count = 22;
+  int64 total_exec_micros = 6;
+  int64 total_accelerator_exec_micros = 19;
+  int64 total_cpu_exec_micros = 20;
+
+  int64 total_requested_bytes = 7;
+  int64 total_parameters = 8;
+  int64 total_float_ops = 14;
 
   // shape information, if available.
   // TODO(xpan): Why is this repeated?
@@ -48,39 +60,53 @@ message TFGraphNodeProto {
 
   // Descendants of the graph. The actual descendants depend on the data
   // structure used (scope, graph).
-  repeated TFGraphNodeProto children = 12;
+  repeated GraphNodeProto children = 12;
 }
 
-// A node that groups multiple TFGraphNodeProto.
+// A node that groups multiple GraphNodeProto.
 // Depending on the 'view', the semantics of the TFmultiGraphNodeProto
 // is different:
 // code view: A node groups all TensorFlow graph nodes created by the
 //            Python code.
 // op view:   A node groups all TensorFlow graph nodes that are of type
 //            of the op (e.g. MatMul, Conv2D).
-message TFMultiGraphNodeProto {
+message MultiGraphNodeProto {
   // Name of the node.
-  optional string name = 1;
+  string name = 1;
 
   // code execution time.
-  optional int64 exec_micros = 2;
+  int64 exec_micros = 2;
+  int64 accelerator_exec_micros = 12;
+  int64 cpu_exec_micros = 13;
+
   // Total requested bytes by the code.
-  optional int64 requested_bytes = 3;
+  int64 requested_bytes = 3;
   // Number of parameters if available.
-  optional int64 parameters = 4;
+  int64 parameters = 4;
   // Number of float operations.
-  optional int64 float_ops = 5;
+  int64 float_ops = 5;
 
   // The following are the aggregated stats from descendants.
   // The actual descendants depend on the data structure used.
-  optional int64 total_exec_micros = 6;
-  optional int64 total_requested_bytes = 7;
-  optional int64 total_parameters = 8;
-  optional int64 total_float_ops = 9;
+  int64 total_exec_micros = 6;
+  int64 total_accelerator_exec_micros = 14;
+  int64 total_cpu_exec_micros = 15;
+
+  int64 total_requested_bytes = 7;
+  int64 total_parameters = 8;
+  int64 total_float_ops = 9;
 
-  // TensorFlow graph nodes contained by the TFMultiGraphNodeProto.
-  repeated TFGraphNodeProto graph_nodes = 10;
+  // TensorFlow graph nodes contained by the MultiGraphNodeProto.
+  repeated GraphNodeProto graph_nodes = 10;
   // Descendants of the node. The actual descendants depend on the data
   // structure used.
-  repeated TFMultiGraphNodeProto children = 11;
-}
\ No newline at end of file
+  repeated MultiGraphNodeProto children = 11;
+}
+
+message AdviceProto {
+  // checker name -> a list of reports from the checker.
+  map<string, Checker> checkers = 1;
+  message Checker {
+    repeated string reports = 2;
+  }
+}
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 630f47633f87d1dfddb6eddbb18ea13a3575ddc4..69311e3a7f31f6a69607c4294f9af14431c6697c 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -102,6 +102,8 @@ message OptimizerOptions {
     L0 = -1;
   }
 
+  // Overall optimization level. The actual optimizations applied will be the
+  // logical OR of the flags that this level implies and any flags already set.
   Level opt_level = 3;
 
   // Control the use of the compiler/jit.  Experimental.
@@ -160,6 +162,8 @@ message GraphOptions {
   int32 timeline_step = 8;
 
   // Options that control the type and amount of graph rewriting.
+  // Not currently configurable via the public Python API (i.e. there is no API
+  // stability guarantee if you import RewriterConfig explicitly).
   RewriterConfig rewrite_options = 10;
 };
 
@@ -169,6 +173,23 @@ message ThreadPoolOptionProto {
   // 0 means the system picks a value based on where this option proto is used
   // (see the declaration of the specific field for more info).
   int32 num_threads = 1;
+
+  // The global name of the threadpool.
+  //
+  // If empty, then the threadpool is made and used according to the scope it's
+  // in - e.g., for a session threadpool, it is used by that session only.
+  //
+  // If non-empty, then:
+  // - a global threadpool associated with this name is looked
+  //   up or created. This allows, for example, sharing one threadpool across
+  //   many sessions (e.g., like the default behavior, if
+  //   inter_op_parallelism_threads is not configured), but still partitioning
+  //   into a large and small pool.
+  // - if the threadpool for this global_name already exists, then it is an
+  //   error if the existing pool was created using a different num_threads
+  //   value as is specified on this call.
+  // - threadpools created this way are never garbage collected.
+  string global_name = 2;
 };
 
 message RPCOptions {
@@ -216,13 +237,24 @@ message ConfigProto {
   bool use_per_session_threads = 9;
 
   // This option is experimental - it may be replaced with a different mechanism
-  // in the future. The intended use is for when some session invocations need
-  // to run in a background pool limited to a small number of threads.
+  // in the future.
   //
   // Configures session thread pools. If this is configured, then RunOptions for
   // a Run call can select the thread pool to use.
   //
-  // If a pool's num_threads is 0, then inter_op_parallelism_threads is used.
+  // The intended use is for when some session invocations need to run in a
+  // background pool limited to a small number of threads:
+  // - For example, a session may be configured to have one large pool (for
+  // regular compute) and one small pool (for periodic, low priority work);
+  // using the small pool is currently the mechanism for limiting the inter-op
+  // parallelism of the low priority work.  Note that it does not limit the
+  // parallelism of work spawned by a single op kernel implementation.
+  // - Using this setting is normally not needed in training, but may help some
+  // serving use cases.
+  // - It is also generally recommended to set the global_name field of this
+  // proto, to avoid creating multiple large pools. It is typically better to
+  // run the non-low-priority work, even across sessions, in a single large
+  // pool.
   repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12;
 
   // Assignment of Nodes to Devices is recomputed every placement_period
diff --git a/tensorflow/core/protobuf/device_properties.proto b/tensorflow/core/protobuf/device_properties.proto
index 8ef78649900a4ac8e18bc60fd87fcdb67284831d..9b1497c710d40c4c5a989f80ae0d98ee2a2dc3a8 100644
--- a/tensorflow/core/protobuf/device_properties.proto
+++ b/tensorflow/core/protobuf/device_properties.proto
@@ -49,11 +49,3 @@ message DeviceProperties {
   // Memory bandwidth in KB/s
   int64 bandwidth = 13;
 }
-
-message DeviceMap {
-  message NamedDevice {
-    string name = 1;
-    DeviceProperties device = 2;
-  };
-  repeated NamedDevice name_and_device = 1;
-}
\ No newline at end of file
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index 753edba4b848e32dd6f71233452ec682342a15c3..1f9e2472a4925cca76520f1fd52a78d9abca0d2b 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -12,21 +12,55 @@ message AutoParallelOptions {
 }
 
 message RewriterConfig {
+  // Graph rewriting is experimental and subject to change, not covered by any
+  // API stability guarantees.
+
+  // Configuration options for the meta-optimizer. Unless otherwise noted, these
+  // configuration options do not apply to explicitly triggered optimization
+  // passes in the optimizers field.
+
+  enum Toggle {
+    DEFAULT = 0;
+    ON = 1;
+    OFF = 2;
+  }
+
+  // Optimize tensor layouts
   bool optimize_tensor_layout = 1;
+  // Fold constants (default is OFF)
+  Toggle constant_folding = 3;
+
+  // If true, don't remove unecessary ops from the graph
   bool disable_model_pruning = 2;
-  bool constant_folding = 3;
 
   enum MemOptType {
-    // Fully disabled
-    NO_MEM_OPT = 0;
-    // Driven by manual annotations
-    MANUAL = 1;
+    // The default setting (currently disabled)
+    DEFAULT_MEM_OPT = 0;
+    // Disabled in the meta-optimizer.
+    NO_MEM_OPT = 1;
+    // Driven by manual op-level annotations.
+    MANUAL = 2;
+    // Driven by heuristics. The behavior of these heuristics is subject to
+    // change. Currently includes an experimental recomputation heuristic.
+    HEURISTICS = 3;
   }
+  // Configures memory optimization passes through the meta-optimizer. Has no
+  // effect on manually requested memory optimization passes in the optimizers
+  // field.
   MemOptType memory_optimization = 4;
 
+  // Configures AutoParallel optimization passes either through the
+  // meta-optimizer or when manually specified through the optimizers field.
   AutoParallelOptions auto_parallel = 5;
 
   // If non-empty, will use this as an alternative way to specify a list of
-  // optimizations to turn on and the order of the optimizations.
+  // optimizations to turn on and the order of the optimizations (replacing the
+  // meta-optimizer).
+  //
+  // Of the RewriterConfig options, only the AutoParallel configuration options
+  // (the auto_parallel field) apply to manually requested optimization passes
+  // ("autoparallel"). Memory optimization passes ("memory") invoked here are
+  // not configurable (in contrast to memory optimization passes through the
+  // meta-optimizer) and act only on manual op annotations.
   repeated string optimizers = 100;
 }
diff --git a/tensorflow/core/protobuf/saver.proto b/tensorflow/core/protobuf/saver.proto
index 65fe9c4c98efd1c0c04c5d62dc13d72c479862d6..a757d3f756ab731352b0a77d321d83891766f235 100644
--- a/tensorflow/core/protobuf/saver.proto
+++ b/tensorflow/core/protobuf/saver.proto
@@ -37,9 +37,9 @@ message SaverDef {
   enum CheckpointFormatVersion {
     // Internal legacy format.
     LEGACY = 0;
-    // Current format: tf.Saver() which works with tensorflow::table::Table.
+    // Deprecated format: tf.Saver() which works with tensorflow::table::Table.
     V1 = 1;
-    // Experimental format under development.
+    // Current format: more efficient.
     V2 = 2;
   }
   CheckpointFormatVersion version = 7;
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index e476a84a137ba4e113606d912f5184342901c6d0..137f9bc216dcd0edc9c967a17c65710f5619edb6 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -141,7 +141,7 @@ message DeregisterGraphResponse {
 message CleanupAllRequest {
   // A list of container names.
   //
-  // If 'container' is not empty, releases resoures in the given
+  // If 'container' is not empty, releases resources in the given
   // containers in all devices.
   //
   // If 'container' is empty, releases resources in the default
@@ -168,6 +168,7 @@ message CleanupAllResponse {
 message ExecutorOpts {
   bool record_costs = 1;
   bool record_timeline = 3;
+  bool record_partition_graphs = 4;
 };
 
 message RunGraphRequest {
@@ -212,10 +213,12 @@ message RunGraphResponse {
   // `RunGraphRequest.recv_key`.
   repeated NamedTensorProto recv = 1;
 
-  // If the request asked for execution stats or cost graph, these are returned
-  // here.
+  // If the request asked for execution stats, the cost graph, or the partition
+  // graphs, these are returned here.
+  // TODO(suharshs): Package these in a RunMetadata instead.
   StepStats step_stats = 2;
   CostGraphDef cost_graph = 3;
+  repeated GraphDef partition_graph = 4;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/tensorflow/core/public/session.h b/tensorflow/core/public/session.h
index c1f097c7c6854f8e9a72d1852a5386bae6a4a856..bca384e59fe9412a77398a81f0c8abbfd512e51a 100644
--- a/tensorflow/core/public/session.h
+++ b/tensorflow/core/public/session.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
+class DeviceMgr;
 
 /// \brief A Session instance lets a caller drive a TensorFlow graph
 /// computation.
@@ -177,12 +178,23 @@ class Session {
   /// *response. This API is optional. If it is unimplemented, Status will
   /// return a corresponding error message, and *response will be unmodified.
   virtual Status ListDevices(std::vector<DeviceAttributes>* response) = 0;
+
   /// \brief Closes this session.
   ///
   /// Closing a session releases the resources used by this session
   /// on the TensorFlow runtime (specified during session creation by
   /// the `SessionOptions::target` field).
   virtual Status Close() = 0;
+
+  // NOTE(ashankar): As of July 2017, this method was added to faciliate some
+  // experimentation. Reconsider/re-evaluate after September 2017.
+  //
+  // Sets `*output` to the `DeviceMgr` that owns accessible devices in the
+  // address-space of the caller.
+  virtual Status LocalDeviceManager(const DeviceMgr** output) {
+    return errors::Unimplemented(
+        "LocalDeviceManager is not supported for this session.");
+  }
 };
 
 /// \brief Create a new session with the given options.
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index d30d7819fce25f3550ca3fe17ff689d7c4c4611b..9ba6b0ed5a58905aee5d50bec49e1742e1e8891d 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,12 +19,12 @@ limitations under the License.
 // TensorFlow uses semantic versioning, see http://semver.org/.
 
 #define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 2
+#define TF_MINOR_VERSION 3
 #define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc2"
+#define TF_VERSION_SUFFIX "-rc0"
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
@@ -89,6 +89,7 @@ limitations under the License.
 //     produced at version 22 or later.  (04/10/2016)
 // 23. Remove NonMaxSuppression in favor of NonMaxSuppressionV2.
 // 24. Deprecate lookup ops (v1) ops in favor of v2 (30may2017)
+// 25. Deprecate stack (v1) ops in favor of v2 (2017/6/15).
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
diff --git a/tensorflow/core/util/activation_mode.cc b/tensorflow/core/util/activation_mode.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4bf947a0a9abd12fa73898591fec9b066f1d5a8a
--- /dev/null
+++ b/tensorflow/core/util/activation_mode.cc
@@ -0,0 +1,43 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/activation_mode.h"
+
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+Status GetActivationModeFromString(const string& str_value,
+                                   ActivationMode* value) {
+  if (str_value == "Sigmoid") {
+    *value = SIGMOID;
+  } else if (str_value == "Relu") {
+    *value = RELU;
+  } else if (str_value == "Relu6") {
+    *value = RELU6;
+  } else if (str_value == "ReluX") {
+    *value = RELUX;
+  } else if (str_value == "Tanh") {
+    *value = TANH;
+  } else if (str_value == "BandPass") {
+    *value = BANDPASS;
+  } else {
+    return errors::NotFound(str_value, " is not an allowed activation mode");
+  }
+  return Status::OK();
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/util/activation_mode.h b/tensorflow/core/util/activation_mode.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a8564847dd5b6020ca0f779c17974b07ee0d51b
--- /dev/null
+++ b/tensorflow/core/util/activation_mode.h
@@ -0,0 +1,45 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_UTIL_ACTIVATION_MODE_H_
+#define TENSORFLOW_UTIL_ACTIVATION_MODE_H_
+
+// This file contains helper routines to deal with activation mode in various
+// ops and kernels.
+
+#include <string>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// ActivationMode: the activation function we apply to the input tensor:
+enum ActivationMode {
+  SIGMOID = 1,
+  RELU = 2,
+  RELU6 = 3,
+  RELUX = 4,
+  TANH = 5,
+  BANDPASS = 6,
+};
+
+// Specialization to parse an attribute directly into a ActivationMode enum.
+Status GetActivationModeFromString(const string& str_value,
+                                   ActivationMode* value);
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_UTIL_ACTIVATION_MODE_H_
diff --git a/tensorflow/core/util/command_line_flags.cc b/tensorflow/core/util/command_line_flags.cc
index 8373eb1f9e7e13cb7097904a075302164aaf5d80..3efc703faf7b23958eb49d59fd0dd4565f090bbe 100644
--- a/tensorflow/core/util/command_line_flags.cc
+++ b/tensorflow/core/util/command_line_flags.cc
@@ -25,10 +25,11 @@ namespace tensorflow {
 namespace {
 
 bool ParseStringFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
-                     string* dst, bool* value_parsing_ok) {
+                     const std::function<bool(string)>& hook,
+                     bool* value_parsing_ok) {
   *value_parsing_ok = true;
   if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
-    *dst = arg.ToString();
+    *value_parsing_ok = hook(arg.ToString());
     return true;
   }
 
@@ -36,14 +37,18 @@ bool ParseStringFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
 }
 
 bool ParseInt32Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
-                    tensorflow::int32* dst, bool* value_parsing_ok) {
+                    const std::function<bool(int32)>& hook,
+                    bool* value_parsing_ok) {
   *value_parsing_ok = true;
   if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
     char extra;
-    if (sscanf(arg.data(), "%d%c", dst, &extra) != 1) {
+    int32 parsed_int32;
+    if (sscanf(arg.data(), "%d%c", &parsed_int32, &extra) != 1) {
       LOG(ERROR) << "Couldn't interpret value " << arg << " for flag " << flag
                  << ".";
       *value_parsing_ok = false;
+    } else {
+      *value_parsing_ok = hook(parsed_int32);
     }
     return true;
   }
@@ -52,14 +57,18 @@ bool ParseInt32Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
 }
 
 bool ParseInt64Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
-                    tensorflow::int64* dst, bool* value_parsing_ok) {
+                    const std::function<bool(int64)>& hook,
+                    bool* value_parsing_ok) {
   *value_parsing_ok = true;
   if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
     char extra;
-    if (sscanf(arg.data(), "%lld%c", dst, &extra) != 1) {
+    int64 parsed_int64;
+    if (sscanf(arg.data(), "%lld%c", &parsed_int64, &extra) != 1) {
       LOG(ERROR) << "Couldn't interpret value " << arg << " for flag " << flag
                  << ".";
       *value_parsing_ok = false;
+    } else {
+      *value_parsing_ok = hook(parsed_int64);
     }
     return true;
   }
@@ -68,19 +77,20 @@ bool ParseInt64Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
 }
 
 bool ParseBoolFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
-                   bool* dst, bool* value_parsing_ok) {
+                   const std::function<bool(bool)>& hook,
+                   bool* value_parsing_ok) {
   *value_parsing_ok = true;
   if (arg.Consume("--") && arg.Consume(flag)) {
     if (arg.empty()) {
-      *dst = true;
+      *value_parsing_ok = hook(true);
       return true;
     }
 
     if (arg == "=true") {
-      *dst = true;
+      *value_parsing_ok = hook(true);
       return true;
     } else if (arg == "=false") {
-      *dst = false;
+      *value_parsing_ok = hook(false);
       return true;
     } else {
       LOG(ERROR) << "Couldn't interpret value " << arg << " for flag " << flag
@@ -94,14 +104,18 @@ bool ParseBoolFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
 }
 
 bool ParseFloatFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
-                    float* dst, bool* value_parsing_ok) {
+                    const std::function<bool(float)>& hook,
+                    bool* value_parsing_ok) {
   *value_parsing_ok = true;
   if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
     char extra;
-    if (sscanf(arg.data(), "%f%c", dst, &extra) != 1) {
+    float parsed_float;
+    if (sscanf(arg.data(), "%f%c", &parsed_float, &extra) != 1) {
       LOG(ERROR) << "Couldn't interpret value " << arg << " for flag " << flag
                  << ".";
       *value_parsing_ok = false;
+    } else {
+      *value_parsing_ok = hook(parsed_float);
     }
     return true;
   }
@@ -112,44 +126,107 @@ bool ParseFloatFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
 }  // namespace
 
 Flag::Flag(const char* name, tensorflow::int32* dst, const string& usage_text)
-    : name_(name), type_(TYPE_INT), int_value_(dst), usage_text_(usage_text) {}
+    : name_(name),
+      type_(TYPE_INT32),
+      int32_hook_([dst](int32 value) {
+        *dst = value;
+        return true;
+      }),
+      int32_default_for_display_(*dst),
+      usage_text_(usage_text) {}
 
 Flag::Flag(const char* name, tensorflow::int64* dst, const string& usage_text)
     : name_(name),
       type_(TYPE_INT64),
-      int64_value_(dst),
+      int64_hook_([dst](int64 value) {
+        *dst = value;
+        return true;
+      }),
+      int64_default_for_display_(*dst),
+      usage_text_(usage_text) {}
+
+Flag::Flag(const char* name, float* dst, const string& usage_text)
+    : name_(name),
+      type_(TYPE_FLOAT),
+      float_hook_([dst](float value) {
+        *dst = value;
+        return true;
+      }),
+      float_default_for_display_(*dst),
       usage_text_(usage_text) {}
 
 Flag::Flag(const char* name, bool* dst, const string& usage_text)
     : name_(name),
       type_(TYPE_BOOL),
-      bool_value_(dst),
+      bool_hook_([dst](bool value) {
+        *dst = value;
+        return true;
+      }),
+      bool_default_for_display_(*dst),
       usage_text_(usage_text) {}
 
 Flag::Flag(const char* name, string* dst, const string& usage_text)
     : name_(name),
       type_(TYPE_STRING),
-      string_value_(dst),
+      string_hook_([dst](string value) {
+        *dst = std::move(value);
+        return true;
+      }),
+      string_default_for_display_(*dst),
       usage_text_(usage_text) {}
 
-Flag::Flag(const char* name, float* dst, const string& usage_text)
+Flag::Flag(const char* name, std::function<bool(int32)> int32_hook,
+           int32 default_value_for_display, const string& usage_text)
+    : name_(name),
+      type_(TYPE_INT32),
+      int32_hook_(std::move(int32_hook)),
+      int32_default_for_display_(default_value_for_display),
+      usage_text_(usage_text) {}
+
+Flag::Flag(const char* name, std::function<bool(int64)> int64_hook,
+           int64 default_value_for_display, const string& usage_text)
+    : name_(name),
+      type_(TYPE_INT64),
+      int64_hook_(std::move(int64_hook)),
+      int64_default_for_display_(default_value_for_display),
+      usage_text_(usage_text) {}
+
+Flag::Flag(const char* name, std::function<bool(float)> float_hook,
+           float default_value_for_display, const string& usage_text)
     : name_(name),
       type_(TYPE_FLOAT),
-      float_value_(dst),
+      float_hook_(std::move(float_hook)),
+      float_default_for_display_(default_value_for_display),
+      usage_text_(usage_text) {}
+
+Flag::Flag(const char* name, std::function<bool(bool)> bool_hook,
+           bool default_value_for_display, const string& usage_text)
+    : name_(name),
+      type_(TYPE_BOOL),
+      bool_hook_(std::move(bool_hook)),
+      bool_default_for_display_(default_value_for_display),
+      usage_text_(usage_text) {}
+
+Flag::Flag(const char* name, std::function<bool(string)> string_hook,
+           string default_value_for_display, const string& usage_text)
+    : name_(name),
+      type_(TYPE_STRING),
+      string_hook_(std::move(string_hook)),
+      string_default_for_display_(std::move(default_value_for_display)),
       usage_text_(usage_text) {}
 
 bool Flag::Parse(string arg, bool* value_parsing_ok) const {
   bool result = false;
-  if (type_ == TYPE_INT) {
-    result = ParseInt32Flag(arg, name_, int_value_, value_parsing_ok);
+  if (type_ == TYPE_INT32) {
+    result = ParseInt32Flag(arg, name_, int32_hook_, value_parsing_ok);
   } else if (type_ == TYPE_INT64) {
-    result = ParseInt64Flag(arg, name_, int64_value_, value_parsing_ok);
+    result = ParseInt64Flag(arg, name_, int64_hook_, value_parsing_ok);
   } else if (type_ == TYPE_BOOL) {
-    result = ParseBoolFlag(arg, name_, bool_value_, value_parsing_ok);
+    result = ParseBoolFlag(arg, name_, bool_hook_, value_parsing_ok);
   } else if (type_ == TYPE_STRING) {
-    result = ParseStringFlag(arg, name_, string_value_, value_parsing_ok);
+    result = ParseStringFlag(arg, name_, string_hook_, value_parsing_ok);
   } else if (type_ == TYPE_FLOAT) {
-    result = ParseFloatFlag(arg, name_, float_value_, value_parsing_ok);
+    result = ParseFloatFlag(arg, name_, float_hook_, value_parsing_ok);
   }
   return result;
 }
@@ -203,26 +280,28 @@ bool Flag::Parse(string arg, bool* value_parsing_ok) const {
   for (const Flag& flag : flag_list) {
     const char* type_name = "";
     string flag_string;
-    if (flag.type_ == Flag::TYPE_INT) {
+    if (flag.type_ == Flag::TYPE_INT32) {
       type_name = "int32";
-      flag_string =
-          strings::Printf("--%s=%d", flag.name_.c_str(), *flag.int_value_);
+      flag_string = strings::Printf("--%s=%d", flag.name_.c_str(),
+                                    flag.int32_default_for_display_);
     } else if (flag.type_ == Flag::TYPE_INT64) {
       type_name = "int64";
-      flag_string = strings::Printf("--%s=%lld", flag.name_.c_str(),
-                                    static_cast<long long>(*flag.int64_value_));
+      flag_string = strings::Printf(
+          "--%s=%lld", flag.name_.c_str(),
+          static_cast<long long>(flag.int64_default_for_display_));
     } else if (flag.type_ == Flag::TYPE_BOOL) {
       type_name = "bool";
-      flag_string = strings::Printf("--%s=%s", flag.name_.c_str(),
-                                    *flag.bool_value_ ? "true" : "false");
+      flag_string =
+          strings::Printf("--%s=%s", flag.name_.c_str(),
+                          flag.bool_default_for_display_ ? "true" : "false");
     } else if (flag.type_ == Flag::TYPE_STRING) {
       type_name = "string";
       flag_string = strings::Printf("--%s=\"%s\"", flag.name_.c_str(),
-                                    flag.string_value_->c_str());
+                                    flag.string_default_for_display_.c_str());
     } else if (flag.type_ == Flag::TYPE_FLOAT) {
       type_name = "float";
-      flag_string =
-          strings::Printf("--%s=%f", flag.name_.c_str(), *flag.float_value_);
+      flag_string = strings::Printf("--%s=%f", flag.name_.c_str(),
+                                    flag.float_default_for_display_);
     }
     strings::Appendf(&usage_text, "\t%-33s\t%s\t%s\n", flag_string.c_str(),
                      type_name, flag.usage_text_.c_str());
diff --git a/tensorflow/core/util/command_line_flags.h b/tensorflow/core/util/command_line_flags.h
index f349df16fd478c533a36c8503a711b768d49eea0..121c7063c9ebf6d447d0077f612386e316e05624 100644
--- a/tensorflow/core/util/command_line_flags.h
+++ b/tensorflow/core/util/command_line_flags.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef THIRD_PARTY_TENSORFLOW_CORE_UTIL_COMMAND_LINE_FLAGS_H
 #define THIRD_PARTY_TENSORFLOW_CORE_UTIL_COMMAND_LINE_FLAGS_H
 
+#include <functional>
 #include <string>
 #include <vector>
 #include "tensorflow/core/platform/types.h"
@@ -61,24 +62,58 @@ namespace tensorflow {
 // text, and a pointer to the corresponding variable.
 class Flag {
  public:
-  Flag(const char* name, int32* dst1, const string& usage_text);
-  Flag(const char* name, int64* dst1, const string& usage_text);
+  Flag(const char* name, int32* dst, const string& usage_text);
+  Flag(const char* name, int64* dst, const string& usage_text);
   Flag(const char* name, bool* dst, const string& usage_text);
   Flag(const char* name, string* dst, const string& usage_text);
   Flag(const char* name, float* dst, const string& usage_text);
 
+  // These constructors invoke a hook on a match instead of writing to a
+  // specific memory location.  The hook may return false to signal a malformed
+  // or illegal value, which will then fail the command line parse.
+  //
+  // "default_value_for_display" is shown as the default value of this flag in
+  // Flags::Usage().
+  Flag(const char* name, std::function<bool(int32)> int32_hook,
+       int32 default_value_for_display, const string& usage_text);
+  Flag(const char* name, std::function<bool(int64)> int64_hook,
+       int64 default_value_for_display, const string& usage_text);
+  Flag(const char* name, std::function<bool(float)> float_hook,
+       float default_value_for_display, const string& usage_text);
+  Flag(const char* name, std::function<bool(bool)> bool_hook,
+       bool default_value_for_display, const string& usage_text);
+  Flag(const char* name, std::function<bool(string)> string_hook,
+       string default_value_for_display, const string& usage_text);
+
  private:
   friend class Flags;
 
   bool Parse(string arg, bool* value_parsing_ok) const;
 
   string name_;
-  enum { TYPE_INT, TYPE_INT64, TYPE_BOOL, TYPE_STRING, TYPE_FLOAT } type_;
-  int* int_value_;
-  int64* int64_value_;
-  bool* bool_value_;
-  string* string_value_;
-  float* float_value_;
+  enum {
+    TYPE_INT32,
+    TYPE_INT64,
+    TYPE_BOOL,
+    TYPE_STRING,
+    TYPE_FLOAT,
+  } type_;
+
+  std::function<bool(int32)> int32_hook_;
+  int32 int32_default_for_display_;
+
+  std::function<bool(int64)> int64_hook_;
+  int64 int64_default_for_display_;
+
+  std::function<bool(float)> float_hook_;
+  float float_default_for_display_;
+
+  std::function<bool(bool)> bool_hook_;
+  bool bool_default_for_display_;
+
+  std::function<bool(string)> string_hook_;
+  string string_default_for_display_;
+
   string usage_text_;
 };
 
diff --git a/tensorflow/core/util/command_line_flags_test.cc b/tensorflow/core/util/command_line_flags_test.cc
index c86a70ec9d0fbf81f211f3760b9de4b7b907eb9f..6139c8e7bcd1015e17b796896404ccf33064123f 100644
--- a/tensorflow/core/util/command_line_flags_test.cc
+++ b/tensorflow/core/util/command_line_flags_test.cc
@@ -36,32 +36,85 @@ std::vector<char *> CharPointerVectorFromStrings(
 }  // namespace
 
 TEST(CommandLineFlagsTest, BasicUsage) {
-  int some_int = 10;
-  int64 some_int64 = 21474836470;  // max int32 is 2147483647
-  bool some_switch = false;
-  string some_name = "something";
-  float some_float = -23.23f;
-  int argc = 6;
+  int some_int32_set_directly = 10;
+  int some_int32_set_via_hook = 20;
+  int64 some_int64_set_directly = 21474836470;  // max int32 is 2147483647
+  int64 some_int64_set_via_hook = 21474836479;  // max int32 is 2147483647
+  bool some_switch_set_directly = false;
+  bool some_switch_set_via_hook = true;
+  string some_name_set_directly = "something_a";
+  string some_name_set_via_hook = "something_b";
+  float some_float_set_directly = -23.23f;
+  float some_float_set_via_hook = -25.23f;
   std::vector<string> argv_strings = {"program_name",
-                                      "--some_int=20",
-                                      "--some_int64=214748364700",
-                                      "--some_switch",
-                                      "--some_name=somethingelse",
-                                      "--some_float=42.0"};
+                                      "--some_int32_set_directly=20",
+                                      "--some_int32_set_via_hook=50",
+                                      "--some_int64_set_directly=214748364700",
+                                      "--some_int64_set_via_hook=214748364710",
+                                      "--some_switch_set_directly",
+                                      "--some_switch_set_via_hook=false",
+                                      "--some_name_set_directly=somethingelse",
+                                      "--some_name_set_via_hook=anythingelse",
+                                      "--some_float_set_directly=42.0",
+                                      "--some_float_set_via_hook=43.0"};
+  int argc = argv_strings.size();
   std::vector<char *> argv_array = CharPointerVectorFromStrings(argv_strings);
-  bool parsed_ok =
-      Flags::Parse(&argc, argv_array.data(),
-                   {Flag("some_int", &some_int, "some int"),
-                    Flag("some_int64", &some_int64, "some int64"),
-                    Flag("some_switch", &some_switch, "some switch"),
-                    Flag("some_name", &some_name, "some name"),
-                    Flag("some_float", &some_float, "some float")});
+  bool parsed_ok = Flags::Parse(
+      &argc, argv_array.data(),
+      {
+          Flag("some_int32_set_directly", &some_int32_set_directly,
+               "some int32 set directly"),
+          Flag("some_int32_set_via_hook",
+               [&](int32 value) {
+                 some_int32_set_via_hook = value;
+                 return true;
+               },
+               some_int32_set_via_hook, "some int32 set via hook"),
+          Flag("some_int64_set_directly", &some_int64_set_directly,
+               "some int64 set directly"),
+          Flag("some_int64_set_via_hook",
+               [&](int64 value) {
+                 some_int64_set_via_hook = value;
+                 return true;
+               },
+               some_int64_set_via_hook, "some int64 set via hook"),
+          Flag("some_switch_set_directly", &some_switch_set_directly,
+               "some switch set directly"),
+          Flag("some_switch_set_via_hook",
+               [&](bool value) {
+                 some_switch_set_via_hook = value;
+                 return true;
+               },
+               some_switch_set_via_hook, "some switch set via hook"),
+          Flag("some_name_set_directly", &some_name_set_directly,
+               "some name set directly"),
+          Flag("some_name_set_via_hook",
+               [&](string value) {
+                 some_name_set_via_hook = std::move(value);
+                 return true;
+               },
+               some_name_set_via_hook, "some name set via hook"),
+          Flag("some_float_set_directly", &some_float_set_directly,
+               "some float set directly"),
+          Flag("some_float_set_via_hook",
+               [&](float value) {
+                 some_float_set_via_hook = value;
+                 return true;
+               },
+               some_float_set_via_hook, "some float set via hook"),
+      });
+
   EXPECT_EQ(true, parsed_ok);
-  EXPECT_EQ(20, some_int);
-  EXPECT_EQ(214748364700, some_int64);
-  EXPECT_EQ(true, some_switch);
-  EXPECT_EQ("somethingelse", some_name);
-  EXPECT_NEAR(42.0f, some_float, 1e-5f);
+  EXPECT_EQ(20, some_int32_set_directly);
+  EXPECT_EQ(50, some_int32_set_via_hook);
+  EXPECT_EQ(214748364700, some_int64_set_directly);
+  EXPECT_EQ(214748364710, some_int64_set_via_hook);
+  EXPECT_EQ(true, some_switch_set_directly);
+  EXPECT_EQ(false, some_switch_set_via_hook);
+  EXPECT_EQ("somethingelse", some_name_set_directly);
+  EXPECT_EQ("anythingelse", some_name_set_via_hook);
+  EXPECT_NEAR(42.0f, some_float_set_directly, 1e-5f);
+  EXPECT_NEAR(43.0f, some_float_set_via_hook, 1e-5f);
   EXPECT_EQ(argc, 1);
 }
 
@@ -107,6 +160,70 @@ TEST(CommandLineFlagsTest, BadFloatValue) {
   EXPECT_EQ(argc, 1);
 }
 
+TEST(CommandLineFlagsTest, FailedInt32Hook) {
+  int argc = 2;
+  std::vector<string> argv_strings = {"program_name", "--some_int32=200"};
+  std::vector<char *> argv_array = CharPointerVectorFromStrings(argv_strings);
+  bool parsed_ok =
+      Flags::Parse(&argc, argv_array.data(),
+                   {Flag("some_int32", [](int32 value) { return false; }, 30,
+                         "some int32")});
+
+  EXPECT_EQ(false, parsed_ok);
+  EXPECT_EQ(argc, 1);
+}
+
+TEST(CommandLineFlagsTest, FailedInt64Hook) {
+  int argc = 2;
+  std::vector<string> argv_strings = {"program_name", "--some_int64=200"};
+  std::vector<char *> argv_array = CharPointerVectorFromStrings(argv_strings);
+  bool parsed_ok =
+      Flags::Parse(&argc, argv_array.data(),
+                   {Flag("some_int64", [](int64 value) { return false; }, 30,
+                         "some int64")});
+
+  EXPECT_EQ(false, parsed_ok);
+  EXPECT_EQ(argc, 1);
+}
+
+TEST(CommandLineFlagsTest, FailedFloatHook) {
+  int argc = 2;
+  std::vector<string> argv_strings = {"program_name", "--some_float=200.0"};
+  std::vector<char *> argv_array = CharPointerVectorFromStrings(argv_strings);
+  bool parsed_ok =
+      Flags::Parse(&argc, argv_array.data(),
+                   {Flag("some_float", [](float value) { return false; }, 30.0f,
+                         "some float")});
+
+  EXPECT_EQ(false, parsed_ok);
+  EXPECT_EQ(argc, 1);
+}
+
+TEST(CommandLineFlagsTest, FailedBoolHook) {
+  int argc = 2;
+  std::vector<string> argv_strings = {"program_name", "--some_switch=true"};
+  std::vector<char *> argv_array = CharPointerVectorFromStrings(argv_strings);
+  bool parsed_ok =
+      Flags::Parse(&argc, argv_array.data(),
+                   {Flag("some_switch", [](bool value) { return false; }, false,
+                         "some switch")});
+
+  EXPECT_EQ(false, parsed_ok);
+  EXPECT_EQ(argc, 1);
+}
+
+TEST(CommandLineFlagsTest, FailedStringHook) {
+  int argc = 2;
+  std::vector<string> argv_strings = {"program_name", "--some_name=true"};
+  std::vector<char *> argv_array = CharPointerVectorFromStrings(argv_strings);
+  bool parsed_ok = Flags::Parse(
+      &argc, argv_array.data(),
+      {Flag("some_name", [](string value) { return false; }, "", "some name")});
+
+  EXPECT_EQ(false, parsed_ok);
+  EXPECT_EQ(argc, 1);
+}
+
 // Return whether str==pat, but allowing any whitespace in pat
 // to match zero or more whitespace characters in str.
 static bool MatchWithAnyWhitespace(const string &str, const string &pat) {
diff --git a/tensorflow/core/util/ctc/BUILD b/tensorflow/core/util/ctc/BUILD
index 357b2535515432a97f5a24b8670e4e3a03db6359..c955b280146f109dc957c0490d57c3c8d169a1dd 100644
--- a/tensorflow/core/util/ctc/BUILD
+++ b/tensorflow/core/util/ctc/BUILD
@@ -102,5 +102,4 @@ cc_library(
     hdrs = [
         "ctc_loss_util.h",
     ],
-    deps = ["//tensorflow/core:lib"],
 )
diff --git a/tensorflow/core/util/cuda_kernel_helper.h b/tensorflow/core/util/cuda_kernel_helper.h
index c86c6e4a5d160196a07c5df7ebb18c6fbfc7c3d0..ee651944052b58bb724bb57725614c266458fb5e 100644
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/util/cuda_kernel_helper.h
@@ -20,9 +20,11 @@ limitations under the License.
 
 #include <algorithm>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 // Usage of GetCudaLaunchConfig, GetCuda2DLaunchConfig, and
 // GetCuda3DLaunchConfig:
@@ -95,7 +97,8 @@ void MyDriverFunc(const GPUDevice &d) {
 }
 
 // See the test for this for more example:
-// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+//
+https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
 
 */
 
@@ -107,7 +110,7 @@ void MyDriverFunc(const GPUDevice &d) {
   for (int i = blockIdx.axis * blockDim.axis + threadIdx.axis; i < n.axis; \
        i += blockDim.axis * gridDim.axis)
 
-#define DIV_UP(a, b) (((a) + (b) - 1) / (b))
+#define DIV_UP(a, b) (((a) + (b)-1) / (b))
 
 namespace tensorflow {
 
@@ -277,7 +280,19 @@ inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(
                                dynamic_shared_memory_size, block_size_limit);
 }
 
-namespace gpu {
+// Returns a raw reference to the current cuda stream.  Required by a
+// number of kernel calls (for which StreamInterface* does not work), i.e.
+// CUB and certain cublas primitives.
+inline const cudaStream_t& GetCudaStream(OpKernelContext* context) {
+  const cudaStream_t* ptr = CHECK_NOTNULL(
+      reinterpret_cast<const cudaStream_t*>(context->op_device_context()
+                                                ->stream()
+                                                ->implementation()
+                                                ->CudaStreamMemberHack()));
+  return *ptr;
+}
+
+namespace cuda_helper {
 
 template <typename IntType>
 __device__ IntType upper_bound(IntType* first, IntType count, IntType val) {
@@ -299,7 +314,7 @@ __device__ IntType upper_bound(IntType* first, IntType count, IntType val) {
   return first - orig;
 }
 
-}  // namespace gpu
+}  // namespace cuda_helper
 
 template <typename T>
 __device__ __host__ inline T ldg(const T* address) {
diff --git a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
index abd72b7d77fb279948b8130e2370897e0f750a90..623f7bab9058aba9f14b149a934ca87cb0151dc6 100644
--- a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+++ b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
@@ -164,7 +164,7 @@ TEST_F(CudaLaunchConfigTest, GetCudaLaunchConfig) {
   TEST_LAUNCH_PARAMETER(8191);
   TEST_LAUNCH_PARAMETER(8192);
   TEST_LAUNCH_PARAMETER(123456);
-  TEST_LAUNCH_PARAMETER(1 << 31 - 1);  // max value of int
+  TEST_LAUNCH_PARAMETER(1 << 30);
   #undef TEST_LAUNCH_PARAMETER
 }
 
@@ -243,8 +243,8 @@ TEST_F(CudaLaunchConfigTest, GetCuda2DLaunchConfig) {
   TEST_LAUNCH_PARAMETER(8191, 1);
   TEST_LAUNCH_PARAMETER(8192, 10);
   TEST_LAUNCH_PARAMETER(123456, 12);
-  TEST_LAUNCH_PARAMETER(1, (1 << 31 - 1));
-  TEST_LAUNCH_PARAMETER((1 << 31 - 1), 1);
+  TEST_LAUNCH_PARAMETER(1, 1 << 30);
+  TEST_LAUNCH_PARAMETER(1 << 30, 1);
   #undef TEST_LAUNCH_PARAMETER
 }
 
@@ -292,9 +292,9 @@ TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) {
   TEST_LAUNCH_PARAMETER(8191, 1, 1024);
   TEST_LAUNCH_PARAMETER(8192, 10, 32);
   TEST_LAUNCH_PARAMETER(123456, 12, 21);
-  TEST_LAUNCH_PARAMETER(1, 1, (1 << 31 - 1));
-  TEST_LAUNCH_PARAMETER(1, (1 << 31 - 1), 1);
-  TEST_LAUNCH_PARAMETER((1 << 31 - 1), 1, 1);
+  TEST_LAUNCH_PARAMETER(1, 1, 1 << 30);
+  TEST_LAUNCH_PARAMETER(1, 1 << 30, 1);
+  TEST_LAUNCH_PARAMETER(1 << 30, 1, 1);
   #undef TEST_LAUNCH_PARAMETER
 }
 
diff --git a/tensorflow/core/util/device_name_utils.cc b/tensorflow/core/util/device_name_utils.cc
index 64aa0ac2097b599ac03f3f92723c58351b9c6f55..530a1737e5a5704e778a3b76e6010db1ea738501 100644
--- a/tensorflow/core/util/device_name_utils.cc
+++ b/tensorflow/core/util/device_name_utils.cc
@@ -85,16 +85,29 @@ static bool ConsumeNumber(StringPiece* in, int* val) {
   }
 }
 
-/* static */
-string DeviceNameUtils::FullName(const string& job, int replica, int task,
-                                 const string& type, int id) {
+// Returns a fully qualified device name given the parameters.
+static string DeviceName(const string& job, int replica, int task,
+                         const string& device_prefix, const string& device_type,
+                         int id) {
   CHECK(IsJobName(job)) << job;
   CHECK_LE(0, replica);
   CHECK_LE(0, task);
-  CHECK(!type.empty());
+  CHECK(!device_type.empty());
   CHECK_LE(0, id);
   return strings::StrCat("/job:", job, "/replica:", replica, "/task:", task,
-                         "/device:", type, ":", id);
+                         device_prefix, device_type, ":", id);
+}
+
+/* static */
+string DeviceNameUtils::FullName(const string& job, int replica, int task,
+                                 const string& type, int id) {
+  return DeviceName(job, replica, task, "/device:", type, id);
+}
+
+/* static */
+string DeviceNameUtils::LegacyName(const string& job, int replica, int task,
+                                   const string& type, int id) {
+  return DeviceName(job, replica, task, "/", str_util::Lowercase(type), id);
 }
 
 bool DeviceNameUtils::ParseFullName(StringPiece fullname, ParsedName* p) {
diff --git a/tensorflow/core/util/device_name_utils.h b/tensorflow/core/util/device_name_utils.h
index 4228af8b8cae9b3a1dec1afa1a236b1a5b86f1f2..1f32828bae7461db76c5fba1e89468dfcb80c318 100644
--- a/tensorflow/core/util/device_name_utils.h
+++ b/tensorflow/core/util/device_name_utils.h
@@ -48,6 +48,9 @@ class DeviceNameUtils {
   // Returns a fully qualified device name given the parameters.
   static string FullName(const string& job, int replica, int task,
                          const string& type, int id);
+  // Returns a fully qualified device name given the parameters in legacy style.
+  static string LegacyName(const string& job, int replica, int task,
+                           const string& type, int id);
 
   struct ParsedName {
     void Clear() {
diff --git a/tensorflow/core/util/device_name_utils_test.cc b/tensorflow/core/util/device_name_utils_test.cc
index ed511629b6ed9b95b2841ba7f272e5403fac6700..e44b840967283580e1b0bc6161448dae9014b56c 100644
--- a/tensorflow/core/util/device_name_utils_test.cc
+++ b/tensorflow/core/util/device_name_utils_test.cc
@@ -69,6 +69,9 @@ TEST(DeviceNameUtilsTest, Basic) {
   EXPECT_EQ(DeviceNameUtils::FullName("hello", 1, 2, "CPU", 3),
             "/job:hello/replica:1/task:2/device:CPU:3");
 
+  EXPECT_EQ(DeviceNameUtils::LegacyName("hello", 1, 2, "CPU", 3),
+            "/job:hello/replica:1/task:2/cpu:3");
+
   {
     DeviceNameUtils::ParsedName p;
     EXPECT_FALSE(DeviceNameUtils::ParseFullName("foobar", &p));
diff --git a/tensorflow/core/util/equal_graph_def.cc b/tensorflow/core/util/equal_graph_def.cc
index 2db026da56c8bb1dd1f563a240068d01daa5b00b..919a46bfb85044a71fc22c114b74b703301f5ba5 100644
--- a/tensorflow/core/util/equal_graph_def.cc
+++ b/tensorflow/core/util/equal_graph_def.cc
@@ -17,7 +17,10 @@ limitations under the License.
 
 #include <unordered_map>
 #include <unordered_set>
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/protobuf.h"
diff --git a/tensorflow/core/util/equal_graph_def.h b/tensorflow/core/util/equal_graph_def.h
index 1ce6181c2e7e412f9f059e711538b3e689bfcd65..0e7f2950cbcb8e085e5ea59c9ec5d207662ebcb5 100644
--- a/tensorflow/core/util/equal_graph_def.h
+++ b/tensorflow/core/util/equal_graph_def.h
@@ -16,13 +16,16 @@ limitations under the License.
 #ifndef TENSORFLOW_GRAPH_EQUAL_GRAPH_DEF_H_
 #define TENSORFLOW_GRAPH_EQUAL_GRAPH_DEF_H_
 
-#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
+class GraphDef;
+class NodeDef;
+
 struct EqualGraphDefOptions {
   // Should internal attributes (attribute names that start with '_') be
   // ignored?
@@ -53,11 +56,13 @@ bool EqualRepeatedNodeDef(const protobuf::RepeatedPtrField<NodeDef>& actual,
                           string* diff,
                           const EqualGraphDefOptions& options = {});
 
-#define TF_EXPECT_GRAPH_EQ(expected, actual)                  \
-  do {                                                        \
-    string diff;                                              \
-    EXPECT_TRUE(EqualGraphDef(actual, expected, &diff))       \
-        << diff << "\nActual: " << SummarizeGraphDef(actual); \
+#define TF_EXPECT_GRAPH_EQ(expected, actual)            \
+  do {                                                  \
+    string diff;                                        \
+    EXPECT_TRUE(EqualGraphDef(actual, expected, &diff)) \
+        << diff << "\nExpected:\n"                      \
+        << SummarizeGraphDef(expected) << "\nActual:\n" \
+        << SummarizeGraphDef(actual);                   \
   } while (false)
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/matmul_autotune.cc b/tensorflow/core/util/matmul_autotune.cc
new file mode 100644
index 0000000000000000000000000000000000000000..741a78a193f362576e3ddf0d8b8ca76d49462b79
--- /dev/null
+++ b/tensorflow/core/util/matmul_autotune.cc
@@ -0,0 +1,51 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/matmul_autotune.h"
+
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/util/env_var.h"
+
+namespace tensorflow {
+bool MatmulAutotuneEnable() {
+  bool value;
+  Status status =
+      ReadBoolFromEnvVar("TF_MATMUL_AUTOTUNE_ENABLE", false, &value);
+  if (!status.ok()) {
+    LOG(ERROR) << status.error_message();
+  }
+  return value;
+}
+
+bool MatmulDoFP32ComputationFP16Input() {
+  bool value;
+  // Feedback from NVIDIA: the "true floating point 16" compute capability is
+  // absent from compute capability SM 5.2. The native 16 bit floating point
+  // computation was introduced in SM 5.3 and higher compute capability. So
+  // for compatibility, set this to be true by default for now.
+  // TODO(yangzihao): In the future, we need to return three possibilities:
+  // user-set-true, user-set-false, user-no-setting. In the calling sites,
+  // check the compatibilities. Note that user-set-false with compute
+  // capability <= 5.2 will cause an error in the later cublasGemmEx() call.
+  Status status =
+      ReadBoolFromEnvVar("TF_FP16_MATMUL_USE_FP32_COMPUTE", true, &value);
+  if (!status.ok()) {
+    LOG(ERROR) << status.error_message();
+  }
+  return value;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/matmul_autotune.h b/tensorflow/core/util/matmul_autotune.h
new file mode 100644
index 0000000000000000000000000000000000000000..53666238836b89db3198adce9620fcbd7c59a12c
--- /dev/null
+++ b/tensorflow/core/util/matmul_autotune.h
@@ -0,0 +1,28 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// The utility to check matmul autotune related flags.
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_
+
+namespace tensorflow {
+
+bool MatmulAutotuneEnable();
+bool MatmulDoFP32ComputationFP16Input();
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_
diff --git a/tensorflow/core/util/memmapped_file_system_test.cc b/tensorflow/core/util/memmapped_file_system_test.cc
index 1d01c6b0839bf17c3e9ca813b41220f2081f97c9..a5f24b08b32f097ed80d5059d57ab799d380bbd4 100644
--- a/tensorflow/core/util/memmapped_file_system_test.cc
+++ b/tensorflow/core/util/memmapped_file_system_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/util/memmapped_file_system.h"
 
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -109,7 +110,7 @@ TEST(MemmappedFileSystemTest, SimpleTest) {
             memmapped_env.FileExists("bla-bla-bla").code());
 }
 
-TEST(MemmappedFileSystemTest, NotInitalized) {
+TEST(MemmappedFileSystemTest, NotInitialized) {
   MemmappedEnv memmapped_env(Env::Default());
   std::unique_ptr<ReadOnlyMemoryRegion> memory_region;
   EXPECT_EQ(
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 67468bdc3fc8764b83383b9226a6362fb647214a..35aca709d92f063fadd619f94ebce9163692842e 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -20,10 +20,10 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "third_party/mkl/include/mkl_dnn.h"
-#include "third_party/mkl/include/mkl_dnn_types.h"
-#include "third_party/mkl/include/mkl_service.h"
-#include "third_party/mkl/include/mkl_trans.h"
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
+#include "mkl_service.h"
+#include "mkl_trans.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/util/tensor_format.h"
diff --git a/tensorflow/core/util/padding.h b/tensorflow/core/util/padding.h
index 6f56d9c25a0c5f3e285c57b68d4d5acc0701aa45..2e6003226c6508533696301517de77043564e09b 100644
--- a/tensorflow/core/util/padding.h
+++ b/tensorflow/core/util/padding.h
@@ -21,11 +21,13 @@ limitations under the License.
 
 #include <string>
 
-#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"  // TODO(b/62899350): Remove
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 
+class NodeDef;
+
 // Padding: the padding we apply to the input tensor along the rows and columns
 // dimensions. This is usually used to make sure that the spatial dimensions do
 // not shrink when we progress with convolutions. Two types of padding are
diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h
index d8e5d901427028a3129037b1c5af1ac1ea0d2a76..0ea74c38b1916f777eaaf7b0907b614e680ea6e7 100644
--- a/tensorflow/core/util/sparse/sparse_tensor.h
+++ b/tensorflow/core/util/sparse/sparse_tensor.h
@@ -20,7 +20,6 @@ limitations under the License.
 #include <numeric>
 #include <vector>
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
@@ -32,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/sparse/dim_comparator.h"
 #include "tensorflow/core/util/sparse/group_iterator.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 namespace sparse {
@@ -59,8 +59,8 @@ class SparseTensor {
         shape_(shape.begin(), shape.end()),
         order_(order.begin(), order.end()),
         dims_(GetDimsFromIx(ix)) {
-    CHECK_EQ(ix.dtype(), DT_INT64)
-        << "indices must be type int64 but got: " << ix.dtype();
+    CHECK_EQ(ix.dtype(), DT_INT64) << "indices must be type int64 but got: "
+                                   << ix.dtype();
     CHECK(TensorShapeUtils::IsVector(vals.shape()))
         << "vals must be a vec, but got: " << vals.shape().DebugString();
     CHECK_EQ(ix.shape().dim_size(0), vals.shape().dim_size(0))
@@ -155,6 +155,15 @@ class SparseTensor {
                                          const int split_dim,
                                          const int num_split);
 
+  // Slice() will slice the input SparseTensor into a SparseTensor based on
+  // specified start and size. Both start and size are 1-D array with each
+  // element of the array representing one dimension. The start is the start
+  // index at each dimension and the size is the size at each dimension.
+  template <typename T>
+  static SparseTensor Slice(const SparseTensor& tensor,
+                            const gtl::ArraySlice<int64>& start,
+                            const gtl::ArraySlice<int64>& size);
+
   // Picks out the dimensions according to `dim_indices`.
   std::vector<int64> PickDims(gtl::ArraySlice<int64> dim_indices) const {
     std::vector<int64> res(dim_indices.size());
@@ -541,6 +550,81 @@ std::vector<SparseTensor> SparseTensor::Split(const SparseTensor& input_tensor,
   return output_tensors;
 }
 
+template <typename T>
+SparseTensor SparseTensor::Slice(const SparseTensor& input_tensor,
+                                 const gtl::ArraySlice<int64>& start,
+                                 const gtl::ArraySlice<int64>& size) {
+  TensorShape output_shape(input_tensor.shape());
+
+  const int dims = input_tensor.dims();
+  for (int dim = 0; dim < dims; dim++) {
+    int64 dim_size = start[dim] + size[dim] < output_shape.dim_size(dim)
+                         ? size[dim]
+                         : output_shape.dim_size(dim) - start[dim];
+    output_shape.set_dim(dim, dim_size);
+  }
+
+  auto input_indices_t = input_tensor.indices().matrix<int64>();
+  auto input_values_t = input_tensor.values().vec<T>();
+
+  // Find the number of indices that fall inside start and size.
+  int count = 0;
+  for (int i = 0; i < input_tensor.indices().dim_size(0); i++) {
+    // The following will check to see if an input is within the
+    // range specified by start and size.
+    // The for loop below iterates through all dimensions. In case
+    // the index falls outside of the start and size at any dimension,
+    // it will be considered as a "no hit" (hit = false). In this
+    // case, it will not be counted as the index that fall inside
+    // the range specified by start and size.
+    bool hit = true;
+    for (int dim = 0; dim < dims; dim++) {
+      if (!(start[dim] <= input_indices_t(i, dim) &&
+            input_indices_t(i, dim) < start[dim] + size[dim])) {
+        hit = false;
+        break;
+      }
+    }
+    if (!hit) {
+      continue;
+    }
+    count++;
+  }
+
+  Tensor output_values(DataTypeToEnum<T>::v(), TensorShape({count}));
+  Tensor output_indices(DT_INT64, TensorShape({count, dims}));
+
+  auto output_values_t = output_values.vec<T>();
+  auto output_indices_t = output_indices.matrix<int64>();
+
+  // Obtain the output indices that fall inside start and size.
+  int index = 0;
+  for (int i = 0; i < input_tensor.indices().dim_size(0) && index < count;
+       i++) {
+    // The logic here is similiar as the above except that the above
+    // only count the number of indices while here we actually generate
+    // the output.
+    bool hit = true;
+    for (int dim = 0; dim < dims; dim++) {
+      if (!(start[dim] <= input_indices_t(i, dim) &&
+            input_indices_t(i, dim) < start[dim] + size[dim])) {
+        hit = false;
+        break;
+      }
+    }
+    if (!hit) {
+      continue;
+    }
+    output_values_t(index) = input_values_t(i);
+    for (int dim = 0; dim < dims; dim++) {
+      output_indices_t(index, dim) = input_indices_t(i, dim) - start[dim];
+    }
+    index++;
+  }
+
+  return SparseTensor(output_indices, output_values, output_shape);
+}
+
 }  // namespace sparse
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/util/sparse/sparse_tensor_test.cc b/tensorflow/core/util/sparse/sparse_tensor_test.cc
index 5edd6cb1d8d1390bc7462441c4f34c8256c74bd7..efdd97fd3d6ffa5c1f66f2a0950d7bd44ba01eb1 100644
--- a/tensorflow/core/util/sparse/sparse_tensor_test.cc
+++ b/tensorflow/core/util/sparse/sparse_tensor_test.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -26,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 namespace sparse {
@@ -612,6 +612,50 @@ TEST(SparseTensorTest, Split) {
   EXPECT_EQ(st_list[1].indices().matrix<int64>()(0, 1), 0);
 }
 
+TEST(SparseTensorTest, Slice) {
+  const int N = 4;
+  const int DIM = 2;
+
+  Tensor ids(DT_INT64, TensorShape({N, DIM}));
+  Tensor vals(DT_INT64, TensorShape({N}));
+
+  ids.matrix<int64>()(0, 0) = 0;
+  ids.matrix<int64>()(0, 1) = 0;
+  ids.matrix<int64>()(1, 0) = 1;
+  ids.matrix<int64>()(1, 1) = 1;
+  ids.matrix<int64>()(2, 0) = 1;
+  ids.matrix<int64>()(2, 1) = 2;
+  ids.matrix<int64>()(3, 0) = 3;
+  ids.matrix<int64>()(3, 1) = 0;
+
+  vals.vec<int64>()(0) = 1;
+  vals.vec<int64>()(1) = 2;
+  vals.vec<int64>()(2) = 3;
+  vals.vec<int64>()(3) = 4;
+
+  SparseTensor st(ids, vals, TensorShape({4, 3}));
+
+  std::vector<int64> start(2, 0);
+  std::vector<int64> size(2);
+  size[0] = 2;
+  size[1] = 3;
+
+  SparseTensor slice = SparseTensor::Slice<int64>(st, start, size);
+
+  EXPECT_EQ(TensorShape(slice.shape()), TensorShape({2, 3}));
+  EXPECT_EQ(slice.values().NumElements(), 3);
+  EXPECT_EQ(slice.values().vec<int64>()(0), 1);
+  EXPECT_EQ(slice.values().vec<int64>()(1), 2);
+  EXPECT_EQ(slice.values().vec<int64>()(2), 3);
+  EXPECT_EQ(slice.indices().NumElements(), 6);
+  EXPECT_EQ(slice.indices().matrix<int64>()(0, 0), 0);
+  EXPECT_EQ(slice.indices().matrix<int64>()(0, 1), 0);
+  EXPECT_EQ(slice.indices().matrix<int64>()(1, 0), 1);
+  EXPECT_EQ(slice.indices().matrix<int64>()(1, 1), 1);
+  EXPECT_EQ(slice.indices().matrix<int64>()(2, 0), 1);
+  EXPECT_EQ(slice.indices().matrix<int64>()(2, 1), 2);
+}
+
 TEST(SparseTensorTest, Dim0SparseTensorToDenseTensor) {
   Tensor ix(DT_INT64, TensorShape({1, 0}));
   Tensor vals(DT_INT32, TensorShape({1}));
diff --git a/tensorflow/core/util/stat_summarizer.cc b/tensorflow/core/util/stat_summarizer.cc
index fa59f735818c2416d0a5f3aaa9a11c70f495c52c..8447028e3824383beea287f16cfab9fef37dae3e 100644
--- a/tensorflow/core/util/stat_summarizer.cc
+++ b/tensorflow/core/util/stat_summarizer.cc
@@ -22,6 +22,8 @@ limitations under the License.
 #include <string>
 
 #include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/tensor_description.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -35,6 +37,14 @@ StatSummarizer::StatSummarizer(const StatSummarizerOptions& options)
 StatSummarizer::StatSummarizer(const tensorflow::GraphDef& tensorflow_graph)
     : StatSummarizer(StatSummarizerOptions()) {}
 
+StatSummarizer::~StatSummarizer() {}
+
+void StatSummarizer::Reset() {
+  run_total_us_.Reset();
+  memory_.Reset();
+  details_.clear();
+}
+
 void StatSummarizer::Validate(const Detail* detail,
                               const NodeExecStats& ns) const {
   if (detail->outputs.size() != ns.output_size()) {
diff --git a/tensorflow/core/util/stat_summarizer.h b/tensorflow/core/util/stat_summarizer.h
index 6111e276ea69b94e71a3825893014a7520ed20b8..f7b63e86869c2713e08439ed1b0c0d343ad07451 100644
--- a/tensorflow/core/util/stat_summarizer.h
+++ b/tensorflow/core/util/stat_summarizer.h
@@ -154,6 +154,8 @@ class StatSummarizer {
   // GraphDef is not needed by the StatSummarizer.
   explicit StatSummarizer(const tensorflow::GraphDef& tensorflow_graph);
 
+  ~StatSummarizer();
+
   // Adds another run's StepStats output to the aggregate counts.
   void ProcessStepStats(const StepStats& step_stats);
 
@@ -181,11 +183,7 @@ class StatSummarizer {
                                SortingMetric sorting_metric,
                                int num_stats) const;
 
-  void Reset() {
-    run_total_us_.Reset();
-    memory_.Reset();
-    details_.clear();
-  }
+  void Reset();
 
   // Returns number of runs.
   int num_runs() const { return run_total_us_.count(); }
diff --git a/tensorflow/core/util/strided_slice_op.cc b/tensorflow/core/util/strided_slice_op.cc
index dc78a5a01f890ed19da14e9c6f2c70ed3adcc295..cfe9275a09189b0d72e57a79cd860de9ab5d82b8 100644
--- a/tensorflow/core/util/strided_slice_op.cc
+++ b/tensorflow/core/util/strided_slice_op.cc
@@ -16,37 +16,10 @@ limitations under the License.
 #include "tensorflow/core/util/strided_slice_op.h"
 
 #include <array>
-
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
-
-int ShapeReadWriteFromTensorShape::dims() const { return const_shape_->dims(); }
-
-int64 ShapeReadWriteFromTensorShape::dim_size(int idx) const {
-  return const_shape_->dim_size(idx);
-}
-
-void ShapeReadWriteFromTensorShape::add_dim(int64 size) {
-  DCHECK_NE(size, -1);
-  DCHECK(shape_ != nullptr) << "add_dim can only be called on non-const shape";
-  shape_->AddDim(size);
-}
-
-int ShapeReadWriteFromTensorShapeProto::dims() const {
-  return const_shape_->dim_size();
-}
-
-int64 ShapeReadWriteFromTensorShapeProto::dim_size(int idx) const {
-  return const_shape_->dim(idx).size();
-}
-
-void ShapeReadWriteFromTensorShapeProto::add_dim(int64 size) {
-  DCHECK(shape_ != nullptr) << "add_dim can only be called on non-const shape";
-  shape_->add_dim()->set_size(size);
-}
-
 namespace {
 
 /// Constants
@@ -173,12 +146,11 @@ static Status TF_MUST_USE_RESULT BuildDenseSpec(
 
 Status ValidateStridedSliceOp(
     const Tensor* begin_tensor, const Tensor* end_tensor,
-    const Tensor& strides_tensor, const ShapeReadWriteInterface& input_shape,
+    const Tensor& strides_tensor, const PartialTensorShape& input_shape,
     int32 begin_mask_spec, int32 end_mask_spec, const int32 ellipsis_mask,
     int32 new_axis_mask, int32 shrink_axis_mask,
-    ShapeReadWriteInterface* processing_shape,
-    ShapeReadWriteInterface* final_shape, bool* is_identity,
-    bool* is_simple_slice, bool* slice_dim0,
+    PartialTensorShape* processing_shape, PartialTensorShape* final_shape,
+    bool* is_identity, bool* is_simple_slice, bool* slice_dim0,
     gtl::InlinedVector<int64, 4>* begin, gtl::InlinedVector<int64, 4>* end,
     gtl::InlinedVector<int64, 4>* strides) {
   const bool begin_is_wrong =
@@ -275,6 +247,7 @@ Status ValidateStridedSliceOp(
   *is_identity = true;
   *slice_dim0 = true;
   *is_simple_slice = true;
+  processing_shape->Clear();
   for (int i = 0; i < input_shape.dims(); ++i) {
     int64& begin_i = (*begin)[i];
     int64& end_i = (*end)[i];
@@ -285,7 +258,7 @@ Status ValidateStridedSliceOp(
     }
     bool shrink_i = (dense_spec.shrink_axis_mask & (1 << i));
     if (dim_i == -1) {
-      processing_shape->add_dim(shrink_i ? 1 : -1);
+      processing_shape->AddDim(shrink_i ? 1 : -1);
       continue;
     }
 
@@ -372,9 +345,9 @@ Status ValidateStridedSliceOp(
         size_i = interval_length / stride_i +
                  (interval_length % stride_i != 0 ? 1 : 0);
       }
-      processing_shape->add_dim(size_i);
+      processing_shape->AddDim(size_i);
     } else {
-      processing_shape->add_dim(-1);
+      processing_shape->AddDim(-1);
     }
   }
 
@@ -383,14 +356,41 @@ Status ValidateStridedSliceOp(
   // new_axis will increase dimension by 1 (with a one-size dimension)
   // slices like foo[3,...] will reduce dimension by 1.
   // This cannot be done earlier, because it depends on Step 3.
+  final_shape->Clear();
   for (auto gather_index : dense_spec.final_shape_gather_indices) {
     if (gather_index >= 0) {
-      final_shape->add_dim(processing_shape->dim_size(gather_index));
+      final_shape->AddDim(processing_shape->dim_size(gather_index));
     } else if (gather_index == kNewAxis) {
-      final_shape->add_dim(1);
+      final_shape->AddDim(1);
     }
   }
   return Status::OK();
 }
 
+Status ValidateStridedSliceOp(
+    const Tensor* begin_tensor, const Tensor* end_tensor,
+    const Tensor& strides_tensor, const PartialTensorShape& input_shape,
+    int32 begin_mask_spec, int32 end_mask_spec, const int32 ellipsis_mask,
+    int32 new_axis_mask, int32 shrink_axis_mask, TensorShape* processing_shape,
+    TensorShape* final_shape, bool* is_identity, bool* is_simple_slice,
+    bool* slice_dim0, gtl::InlinedVector<int64, 4>* begin,
+    gtl::InlinedVector<int64, 4>* end, gtl::InlinedVector<int64, 4>* strides) {
+  // Validate with PartialTensorShape output
+  PartialTensorShape partial_processing_shape, partial_final_shape;
+  TF_RETURN_IF_ERROR(ValidateStridedSliceOp(
+      begin_tensor, end_tensor, strides_tensor, input_shape, begin_mask_spec,
+      end_mask_spec, ellipsis_mask, new_axis_mask, shrink_axis_mask,
+      &partial_processing_shape, &partial_final_shape, is_identity,
+      is_simple_slice, slice_dim0, begin, end, strides));
+
+  // Verify that the output shapes are fully known
+  if (!partial_processing_shape.AsTensorShape(processing_shape) ||
+      !partial_final_shape.AsTensorShape(final_shape)) {
+    return errors::Internal("ValidateStridedSliceOp returned partial shapes ",
+                            partial_processing_shape.DebugString(), " and ",
+                            partial_final_shape.DebugString());
+  }
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/strided_slice_op.h b/tensorflow/core/util/strided_slice_op.h
index 72dc5756fd9bf7604aad6ae96f2508fc98734133..abca98f27b534ea3c4fc2bb7832a38ea6f47df0c 100644
--- a/tensorflow/core/util/strided_slice_op.h
+++ b/tensorflow/core/util/strided_slice_op.h
@@ -23,60 +23,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-// This class and its subclasses allow ValidateStridedSliceOp to be called with
-// different implementations of partial tensors.
-class ShapeReadWriteInterface {
- public:
-  virtual ~ShapeReadWriteInterface() {}
-  virtual int dims() const = 0;
-  // Returns -1 for unknown size.
-  virtual int64 dim_size(int idx) const = 0;
-  // Passes -1 for unknown dim size.
-  virtual void add_dim(int64 size) = 0;
-};
-
-// Implementation of ShapeReadWriteInterface that modifies the given TensorShape
-// <shape> in-place. Does not support adding unknown dims in add_dim.
-class ShapeReadWriteFromTensorShape : public ShapeReadWriteInterface {
- public:
-  ShapeReadWriteFromTensorShape(TensorShape* shape)
-      : const_shape_(shape), shape_(shape) {}
-  ShapeReadWriteFromTensorShape(const TensorShape* shape)
-      : const_shape_(shape) {}
-  ~ShapeReadWriteFromTensorShape() override {}
-  int dims() const override;
-  int64 dim_size(int idx) const override;
-  void add_dim(int64 size) override;
-
- private:
-  const TensorShape* const const_shape_;
-  // same as const_shape_, or nullptr if the non-const ctr is used.
-  TensorShape* const shape_ = nullptr;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(ShapeReadWriteFromTensorShape);
-};
-
-// Implementation of ShapeReadWriteInterface that modifies the given
-// TensorShapeProto in place.
-class ShapeReadWriteFromTensorShapeProto : public ShapeReadWriteInterface {
- public:
-  ShapeReadWriteFromTensorShapeProto(TensorShapeProto* shape)
-      : const_shape_(shape), shape_(shape) {}
-  ShapeReadWriteFromTensorShapeProto(const TensorShapeProto* shape)
-      : const_shape_(shape) {}
-  ~ShapeReadWriteFromTensorShapeProto() override {}
-  int dims() const override;
-  int64 dim_size(int idx) const override;
-  void add_dim(int64 size) override;
-
- private:
-  const TensorShapeProto* const const_shape_;
-  // same as shape_, or nullptr if the non-const ctr is used.
-  TensorShapeProto* const shape_ = nullptr;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(ShapeReadWriteFromTensorShapeProto);
-};
-
 // Runs validation on the strided slice op parameters.
 //
 // Is a separate translation unit from the kernel so that:
@@ -96,15 +42,24 @@ class ShapeReadWriteFromTensorShapeProto : public ShapeReadWriteInterface {
 // performed.
 Status ValidateStridedSliceOp(
     const Tensor* begin_tensor, const Tensor* end_tensor,
-    const Tensor& strides_tensor, const ShapeReadWriteInterface& input_shape,
+    const Tensor& strides_tensor, const PartialTensorShape& input_shape,
     int32 begin_mask_spec, int32 end_mask_spec, const int32 ellipsis_mask,
     int32 new_axis_mask, int32 shrink_axis_mask,
-    ShapeReadWriteInterface* processing_shape,
-    ShapeReadWriteInterface* final_shape, bool* is_identity,
-    bool* is_simple_slice, bool* slice_dim0,
+    PartialTensorShape* processing_shape, PartialTensorShape* final_shape,
+    bool* is_identity, bool* is_simple_slice, bool* slice_dim0,
     gtl::InlinedVector<int64, 4>* begin, gtl::InlinedVector<int64, 4>* end,
     gtl::InlinedVector<int64, 4>* strides);
 
+// Same as above, but the outputs are TensorShape, not PartialTensorShape
+Status ValidateStridedSliceOp(
+    const Tensor* begin_tensor, const Tensor* end_tensor,
+    const Tensor& strides_tensor, const PartialTensorShape& input_shape,
+    int32 begin_mask_spec, int32 end_mask_spec, const int32 ellipsis_mask,
+    int32 new_axis_mask, int32 shrink_axis_mask, TensorShape* processing_shape,
+    TensorShape* final_shape, bool* is_identity, bool* is_simple_slice,
+    bool* slice_dim0, gtl::InlinedVector<int64, 4>* begin,
+    gtl::InlinedVector<int64, 4>* end, gtl::InlinedVector<int64, 4>* strides);
+
 }  // namespace tensorflow
 
 #endif  // THIRD_PARTY_TENSORFLOW_CORE_UTIL_STRIDED_SLICE_OP_H_
diff --git a/tensorflow/core/util/tensor_bundle/BUILD b/tensorflow/core/util/tensor_bundle/BUILD
index bae00f740032c347157faf73507cb4cb10b30c43..1045cf84b4f8708afa63b704af640a68f3e0c8b2 100644
--- a/tensorflow/core/util/tensor_bundle/BUILD
+++ b/tensorflow/core/util/tensor_bundle/BUILD
@@ -11,6 +11,7 @@ licenses(["notice"])  # Apache 2.0
 load(
     "//tensorflow:tensorflow.bzl",
     "cc_header_only_library",
+    "if_not_windows",
     "tf_copts",
 )
 
@@ -34,12 +35,11 @@ cc_library(
     name = "tensor_bundle",
     srcs = ["tensor_bundle.cc"],
     hdrs = ["tensor_bundle.h"],
-    copts = tf_copts() + ["-Wno-sign-compare"],
+    copts = tf_copts() + if_not_windows(["-Wno-sign-compare"]),
     deps = [
         ":naming",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
index 301eae2c4de11a67edb9986b5d7b231a0722295d..41dc9f8a78210cb733272380d95198f543bba292 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb_text.h"
 #include "tensorflow/core/framework/versions.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
diff --git a/tensorflow/core/util/tensor_format.h b/tensorflow/core/util/tensor_format.h
index 9923428a347f3bb9dba95eda8fc97548f6140c9a..83f95004907456692784d4f65b81e94a284f028c 100644
--- a/tensorflow/core/util/tensor_format.h
+++ b/tensorflow/core/util/tensor_format.h
@@ -54,6 +54,17 @@ inline int GetTensorSpatialDims(int num_dims, TensorFormat format) {
   }
 }
 
+// Returns the rank of a tensor with 'num_spatial_dims' spatial dimensions and
+// tensor format 'format'. This is the inverse of GetTensorSpatialDims.
+inline int GetTensorDimsFromSpatialDims(int num_spatial_dims,
+                                        TensorFormat format) {
+  if (format == FORMAT_NCHW_VECT_C) {
+    return num_spatial_dims + 3;  // Include N,C,InnerC.
+  } else {
+    return num_spatial_dims + 2;  // Include N,C.
+  }
+}
+
 // Returns the index of the batch dimension.
 inline int GetTensorBatchDimIndex(int num_dims, TensorFormat format) {
   switch (format) {
diff --git a/tensorflow/core/util/tensor_slice_writer_test.cc b/tensorflow/core/util/tensor_slice_writer_test.cc
index be636c04c47bbf82fcf8ba85ff532bfeef64b02c..ff5bfd65aef360cd89908a94bee7d8bb052f1905 100644
--- a/tensorflow/core/util/tensor_slice_writer_test.cc
+++ b/tensorflow/core/util/tensor_slice_writer_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <array>
 
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -274,13 +275,13 @@ size_t BytesPerElementHelper(DT value) {
   std::fill(lo_data.begin(), lo_data.end(), value);
   TF_EXPECT_OK(
       TensorSliceWriter::SaveData(lo_data.data(), lo_data.size(), &ss));
-  int lo_byte_size = ss.ByteSize();
+  size_t lo_byte_size = ss.ByteSizeLong();
 
   std::array<DT, 1001> hi_data;
   std::fill(hi_data.begin(), hi_data.end(), value);
   TF_EXPECT_OK(
       TensorSliceWriter::SaveData(hi_data.data(), hi_data.size(), &ss));
-  int hi_byte_size = ss.ByteSize();
+  size_t hi_byte_size = ss.ByteSizeLong();
 
   return (hi_byte_size - lo_byte_size) / (hi_data.size() - lo_data.size());
 }
diff --git a/tensorflow/docs_src/about/roadmap.md b/tensorflow/docs_src/about/roadmap.md
index 1789e050faca201daef67f6e6b1ac286c95ae965..3ee825ed400de93553bf69fee065fcf8ef13be4d 100644
--- a/tensorflow/docs_src/about/roadmap.md
+++ b/tensorflow/docs_src/about/roadmap.md
@@ -5,7 +5,7 @@ TensorFlow is a fast moving project. In order for the community to better
 understand what the near future will bring, this document shares what we are
 working on internally. Many of these features were requested by the community,
 and we welcome
-[contributions](https://github.com/tensorflow/tensorflow/labels/contributions%20welcome).
+[contributions](https://github.com/tensorflow/tensorflow/labels/stat%3Acontributions%20welcome).
 
 The features on this list are targeted for the next few months. At this point,
 we do not have timelines for these features.
diff --git a/tensorflow/docs_src/api_guides/cc/guide.md b/tensorflow/docs_src/api_guides/cc/guide.md
index c5473cad9735a8951ff2498fdf5bc671e46643c9..f30bf3797edf4c345eeb29e4268229154fce11b0 100644
--- a/tensorflow/docs_src/api_guides/cc/guide.md
+++ b/tensorflow/docs_src/api_guides/cc/guide.md
@@ -59,7 +59,8 @@ cc_binary(
 )
 ```
 
-You should be able to build and run the example using the following command:
+You should be able to build and run the example using the following command
+(be sure to run `./configure` in your build sandbox first):
 
 ```shell
 bazel run -c opt //tensorflow/cc/example:example
diff --git a/tensorflow/docs_src/api_guides/python/contrib.losses.md b/tensorflow/docs_src/api_guides/python/contrib.losses.md
index 8c289dd55631a94546aeab129edf4d530eecaeda..d7f862625e02a50cd716118f882344c1d16ffe1c 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.losses.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.losses.md
@@ -1,8 +1,12 @@
 # Losses (contrib)
 
+## Deprecated
+
+This module is deprecated. Instructions for updating: Use @{tf.losses} instead.
+
 ## Loss operations for use in neural networks.
 
-Note: By default all the losses are collected into the `GraphKeys.LOSSES`
+Note: By default, all the losses are collected into the `GraphKeys.LOSSES`
 collection.
 
 All of the loss functions take a pair of predictions and ground truth labels,
@@ -13,8 +17,8 @@ of samples in the batch and `d1` ... `dN` are the remaining dimensions.
 It is common, when training with multiple loss functions, to adjust the relative
 strengths of individual losses. This is performed by rescaling the losses via
 a `weight` parameter passed to the loss functions. For example, if we were
-training with both log_loss and mean_square_error, and we wished that the
-log_loss penalty be twice as severe as the mean_square_error, we would
+training with both log_loss and mean_squared_error, and we wished that the
+log_loss penalty be twice as severe as the mean_squared_error, we would
 implement this as:
 
 ```python
@@ -22,7 +26,7 @@ implement this as:
   tf.contrib.losses.log(predictions, labels, weight=2.0)
 
   # Uses default weight of 1.0
-  tf.contrib.losses.mean_square_error(predictions, labels)
+  tf.contrib.losses.mean_squared_error(predictions, labels)
 
   # All the losses are collected into the `GraphKeys.LOSSES` collection.
   losses = tf.get_collection(tf.GraphKeys.LOSSES)
@@ -74,7 +78,7 @@ these predictions.
   predictions = MyModelPredictions(images)
 
   weight = tf.cast(tf.greater(depths, 0), tf.float32)
-  loss  = tf.contrib.losses.mean_square_error(predictions, depths, weight)
+  loss  = tf.contrib.losses.mean_squared_error(predictions, depths, weight)
 ```
 
 Note that when using weights for the losses, the final average is computed
@@ -100,7 +104,7 @@ weighted average over the individual prediction errors:
 
   weight = MyComplicatedWeightingFunction(labels)
   weight = tf.div(weight, tf.size(weight))
-  loss = tf.contrib.losses.mean_square_error(predictions, depths, weight)
+  loss = tf.contrib.losses.mean_squared_error(predictions, depths, weight)
 ```
 
 @{tf.contrib.losses.absolute_difference}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md b/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
index 2522e50c266db24dcea424b882ebba5509a4605f..b56a4884b4c46ecefb426039524923da997c4b97 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
@@ -40,7 +40,7 @@ depth.
 
 ### Attention Wrappers
 
-The basic attention wrapper is @{tf.contrib.seq2seq.DynamicAttentionWrapper}.
+The basic attention wrapper is @{tf.contrib.seq2seq.AttentionWrapper}.
 This wrapper accepts an `RNNCell` instance, an instance of `AttentionMechanism`,
 and an attention depth parameter (`attention_size`); as well as several
 optional arguments that allow one to customize intermediate calculations.
@@ -54,7 +54,7 @@ score = attention_mechanism(cell_output)
 alignments = softmax(score)
 context = matmul(alignments, attention_mechanism.values)
 attention = tf.layers.Dense(attention_size)(concat([cell_output, context], 1))
-next_state = DynamicAttentionWrapperState(
+next_state = AttentionWrapperState(
   cell_state=next_cell_state,
   attention=attention)
 output = attention
@@ -68,14 +68,14 @@ be replaced with alternative options when calculating `alignments` from the
 `score`.  Finally, the outputs returned by the wrapper can be configured to
 be the value `cell_output` instead of `attention`.
 
-The benefit of using a `DynamicAttentionWrapper` is that it plays nicely with
+The benefit of using a `AttentionWrapper` is that it plays nicely with
 other wrappers and the dynamic decoder described below.  For example, one can
 write:
 
 ```python
 cell = tf.contrib.rnn.DeviceWrapper(LSTMCell(512), "/gpu:0")
 attention_mechanism = tf.contrib.seq2seq.LuongAttention(512, encoder_outputs)
-attn_cell = tf.contrib.seq2seq.DynamicAttentionWrapper(
+attn_cell = tf.contrib.seq2seq.AttentionWrapper(
   cell, attention_mechanism, attention_size=256)
 attn_cell = tf.contrib.rnn.DeviceWrapper(attn_cell, "/gpu:1")
 top_cell = tf.contrib.rnn.DeviceWrapper(LSTMCell(512), "/gpu:1")
diff --git a/tensorflow/docs_src/api_guides/python/math_ops.md b/tensorflow/docs_src/api_guides/python/math_ops.md
index 3d9f203297a41704e0a78e533744f11a01b39eb0..b3c7a0c010364eea0a5857a8ae675b2b341f1890 100644
--- a/tensorflow/docs_src/api_guides/python/math_ops.md
+++ b/tensorflow/docs_src/api_guides/python/math_ops.md
@@ -61,6 +61,9 @@ mathematical functions to your graph.
 *   @{tf.atan}
 *   @{tf.cosh}
 *   @{tf.sinh}
+*   @{tf.asinh}
+*   @{tf.acosh}
+*   @{tf.atanh}
 *   @{tf.lgamma}
 *   @{tf.digamma}
 *   @{tf.erf}
diff --git a/tensorflow/docs_src/api_guides/python/nn.md b/tensorflow/docs_src/api_guides/python/nn.md
index 44a2696e5cf081df3de14905d02e543aadb12617..75dbb04e7df6f5fef00363bab548fc04bd3c9694 100644
--- a/tensorflow/docs_src/api_guides/python/nn.md
+++ b/tensorflow/docs_src/api_guides/python/nn.md
@@ -8,7 +8,7 @@ Note: Functions taking `Tensor` arguments can also take anything accepted by
 ## Activation Functions
 
 The activation ops provide different types of nonlinearities for use in neural
-networks.  These include smooth nonlinearities (`sigmoid`, `tanh`, `elu`,
+networks. These include smooth nonlinearities (`sigmoid`, `tanh`, `elu`, `selu`,
 `softplus`, and `softsign`), continuous but not everywhere differentiable
 functions (`relu`, `relu6`, `crelu` and `relu_x`), and random regularization
 (`dropout`).
@@ -20,6 +20,7 @@ shape as the input tensor.
 *   @{tf.nn.relu6}
 *   @{tf.nn.crelu}
 *   @{tf.nn.elu}
+*   @{tf.nn.selu}
 *   @{tf.nn.softplus}
 *   @{tf.nn.softsign}
 *   @{tf.nn.dropout}
@@ -47,28 +48,39 @@ strided according to the `strides` argument.  `strides = [1, 1, 1, 1]` applies
 the filter to a patch at every offset, `strides = [1, 2, 2, 1]` applies the
 filter to every other image patch in each dimension, etc.
 
-Ignoring channels for the moment, and assume that the 4-D `input` has shape
+Ignoring channels for the moment, assume that the 4-D `input` has shape
 `[batch, in_height, in_width, ...]` and the 4-D `filter` has shape
-`[filter_height, filter_width, ...]`, then the spatial semantics of the
-convolution ops are as follows: first, according to the padding scheme chosen
-as `'SAME'` or `'VALID'`, the output size and the padding pixels are computed.
-For the `'SAME'` padding, the output height and width are computed as:
+`[filter_height, filter_width, ...]`. The spatial semantics of the
+convolution ops depend on the padding scheme chosen: `'SAME'` or `'VALID'`.
+Note that the padding values are always zero.
+
+First, consider the `'SAME'` padding scheme. A detailed explanation of the
+reasoning behind it is given in
+[these notes](#Notes_on_SAME_Convolution_Padding). Here, we summarize the
+mechanics of this padding scheme. When using `'SAME'`, the output height and
+width are computed as:
 
     out_height = ceil(float(in_height) / float(strides[1]))
     out_width  = ceil(float(in_width) / float(strides[2]))
 
-and the padding on the top and left are computed as:
+The total padding applied along the height and width is computed as:
+
+    if (in_height % strides[1] == 0):
+      pad_along_height = max(filter_height - strides[1], 0)
+    else:
+      pad_along_height = max(filter_height - (in_height % strides[1]), 0)
+    if (in_width % strides[2] == 0):
+      pad_along_width = max(filter_width - strides[2], 0)
+    else:
+      pad_along_width = max(filter_width - (in_width % strides[2]), 0)
+    
+Finally, the padding on the top, bottom, left and right are:
 
-    pad_along_height = max((out_height - 1) * strides[1] +
-                        filter_height - in_height, 0)
-    pad_along_width = max((out_width - 1) * strides[2] +
-                       filter_width - in_width, 0)
     pad_top = pad_along_height // 2
     pad_bottom = pad_along_height - pad_top
     pad_left = pad_along_width // 2
     pad_right = pad_along_width - pad_left
 
-
 Note that the division by 2 means that there might be cases when the padding on
 both sides (top vs bottom, right vs left) are off by one. In this case, the
 bottom and right sides always get the one additional padded pixel. For example,
@@ -77,12 +89,14 @@ bottom. Note that this is different from existing libraries such as cuDNN and
 Caffe, which explicitly specify the number of padded pixels and always pad the
 same number of pixels on both sides.
 
-For the `'VALID`' padding, the output height and width are computed as:
+For the `'VALID`' scheme, the output height and width are computed as:
 
     out_height = ceil(float(in_height - filter_height + 1) / float(strides[1]))
     out_width  = ceil(float(in_width - filter_width + 1) / float(strides[2]))
 
-and the padding values are always zero. The output is then computed as
+and no padding is used.
+
+Given the output size and the padding, the output can be computed as
 
     output[b, i, j, :] =
         sum_{di, dj} input[b, strides[1] * i + di - pad_top,
@@ -288,3 +302,115 @@ classes when using one of the sampled loss functions above.
 *   @{tf.nn.quantized_relu_x}
 *   @{tf.nn.quantized_max_pool}
 *   @{tf.nn.quantized_avg_pool}
+
+## Notes on SAME Convolution Padding
+
+In these notes, we provide more background on the use of the `'SAME'` padding
+scheme for convolution operations.
+
+Tensorflow uses the smallest possible padding to achieve the desired output
+size. To understand what is done, consider the \\(1\\)-dimensional case. Denote
+\\(n_i\\) and \\(n_o\\) the input and output sizes, respectively, and denote the
+kernel size \\(k\\) and stride \\(s\\). As discussed in the
+[Convolution section](#Convolution), for `'SAME'`,
+\\(n_o = \left \lceil{\frac{n_i}{s}}\right \rceil\\).
+
+To achieve a desired output size \\(n_o\\), we need to pad the input such that the
+output size after a `'VALID'` convolution is \\(n_o\\). In other words, we need to
+have padding \\(p_i\\) such that:
+
+\begin{equation}
+\left \lceil{\frac{n_i + p_i - k + 1}{s}}\right \rceil = n_o
+\label{eq:tf_pad_1}
+\end{equation}
+
+What is the smallest \\(p_i\\) that we could possibly use? In general, \\(\left
+\lceil{\frac{x}{a}}\right \rceil = b\\) (with \\(a > 0\\)) means that \\(b-1 <
+\frac{x}{a} \leq b\\), and the smallest integer \\(x\\) we can choose to satisfy
+this is \\(x = a\cdot (b-1) + 1\\). The same applies to our problem; we need
+\\(p_i\\) such that:
+
+\begin{equation}
+n_i + p_i - k + 1 = s\cdot (n_o - 1) + 1
+\label{eq:tf_pad_2}
+\end{equation}
+
+which leads to:
+
+\begin{equation}
+p_i = s\cdot (n_o - 1) + k - n_i
+\label{eq:tf_pad_3}
+\end{equation}
+
+Note that this might lead to negative \\(p_i\\), since in some cases we might
+already have more input samples than we actually need. Thus,
+
+\begin{equation}
+p_i = max(s\cdot (n_o - 1) + k - n_i, 0)
+\label{eq:tf_pad_4}
+\end{equation}
+
+Remember that, for `'SAME'` padding,
+\\(n_o = \left \lceil{\frac{n_i}{s}}\right \rceil\\), as mentioned above. 
+We need to analyze in detail two cases:
+
+- \\(n_i \text{ mod } s = 0\\)
+
+In this simple case, \\(n_o = \frac{n_i}{s}\\), and the expression for \\(p_i\\)
+becomes:
+
+\begin{equation}
+p_i = max(k - s, 0)
+\label{eq:tf_pad_5}
+\end{equation}
+
+- \\(n_i \text{ mod } s \neq 0\\)
+
+This case is more involved to parse. First, we write:
+
+\begin{equation}
+n_i = s\cdot\left \lceil{\frac{n_i}{s}}\right \rceil
+- s \left(\left \lceil{\frac{n_i}{s}}\right \rceil -
+          \left \lfloor{\frac{n_i}{s}}\right \rfloor\right)
++ (n_i \text{ mod } s)
+\label{eq:tf_pad_6}
+\end{equation}
+
+For the case where \\((n_i \text{ mod } s) \neq 0\\), we have \\(\left
+\lceil{\frac{n_i}{s}}\right \rceil -\left \lfloor{\frac{n_i}{s}}\right \rfloor =
+1\\), leading to:
+
+\begin{equation}
+n_i = s\cdot\left \lceil{\frac{n_i}{s}}\right \rceil
+- s
++ (n_i \text{ mod } s)
+\label{eq:tf_pad_7}
+\end{equation}
+
+We can use this expression to substitute \\(n_o = \left
+\lceil{\frac{n_i}{s}}\right \rceil\\) and get:
+
+$$\begin{align}
+p_i &= max\left(s\cdot \left(\frac{n_i + s - (n_i \text{ mod } s)}{s}
+  - 1\right) + k - n_i, 0\right) \nonumber\\
+&= max(n_i + s - (n_i \text{ mod } s) - s + k - n_i,0) \nonumber \\
+&= max(k - (n_i \text{ mod } s),0)
+\label{eq:tf_pad_8}
+\end{align}$$
+
+### Final expression
+
+Putting all together, the total padding used by tensorflow's convolution with
+`'SAME'` mode is:
+
+$$\begin{align}
+p_i =
+ \begin{cases}
+ max(k - s, 0),  & \text{if $(n_i \text{ mod } s) = 0$} \\
+ max(k - (n_i \text{ mod } s),0), & \text{if $(n_i \text{ mod } s) \neq 0$}
+ \end{cases}
+ \label{eq:tf_pad_9}
+\end{align}$$
+
+This expression is exactly equal to the ones presented for `pad_along_height`
+and `pad_along_width` in the [Convolution section](#Convolution).
diff --git a/tensorflow/docs_src/api_guides/python/test.md b/tensorflow/docs_src/api_guides/python/test.md
index 93a1d50b29800f925a1fa33d39c89f66e6967395..5dc88124e7e1c26237c5c150b624486ab0df1283 100644
--- a/tensorflow/docs_src/api_guides/python/test.md
+++ b/tensorflow/docs_src/api_guides/python/test.md
@@ -31,6 +31,9 @@ methods.  See @{tf.test.TestCase} for details.
 
 ## Utilities
 
+Note: `tf.test.mock` is an alias to the python `mock` or `unittest.mock`
+depending on the python version.
+
 *   @{tf.test.assert_equal_graph_def}
 *   @{tf.test.get_temp_dir}
 *   @{tf.test.is_built_with_cuda}
diff --git a/tensorflow/docs_src/community/documentation.md b/tensorflow/docs_src/community/documentation.md
index 31a10d1f15d437810a9acb04d34bdae97195223c..655506b09824a9f0911155269a869058a572f4e2 100644
--- a/tensorflow/docs_src/community/documentation.md
+++ b/tensorflow/docs_src/community/documentation.md
@@ -31,11 +31,11 @@ TensorFlow.
 
 However, most developers will contribute documentation into the master Github
 branch, which is published, occasionally,
-at [tensorflow.org/versions/master](https://tensorflow.org/versions/master).
+at [tensorflow.org/versions/master](https://www.tensorflow.org/versions/master).
 
 If you want documentation changes to appear at root, you will need to also
 contribute that change to the current stable binary branch (and/or
-[cherrypick](https://www.google.com/url?sa=D&q=http%3A%2F%2Fstackoverflow.com%2Fquestions%2F9339429%2Fwhat-does-cherry-picking-a-commit-with-git-mean)).
+[cherrypick](https://stackoverflow.com/questions/9339429/what-does-cherry-picking-a-commit-with-git-mean)).
 
 ## Reference vs. non-reference documentation
 
@@ -276,7 +276,7 @@ __init__.py:
     # Otherwise import symbols directly
     from tensorflow.some_module.some_other_file import some_symbol
 
-    from tensorflow.platform.all_util import remove_undocumented
+    from tensorflow.python.util.all_util import remove_undocumented
 
     _allowed_symbols = [‘some_symbol’, ‘some_other_symbol’]
 
diff --git a/tensorflow/docs_src/community/index.md b/tensorflow/docs_src/community/index.md
index f4c57725a05ac41b829d697fa336dfa0ddfa9083..c293e61c3ecb973697195a57e2f9a849ecef926f 100644
--- a/tensorflow/docs_src/community/index.md
+++ b/tensorflow/docs_src/community/index.md
@@ -10,3 +10,5 @@ This section contains the following documents:
     TensorFlow source code or documentation, please read this guide.
   * @{$style_guide$TensorFlow Style Guide}, which identifies coding style
     conventions that TensorFlow developers and users should follow.
+  * @{$benchmarks$Benchmarks}, Benchmarks, a guide for defining and 
+    running a TensorFlow benchmark.
diff --git a/tensorflow/docs_src/community/welcome.md b/tensorflow/docs_src/community/welcome.md
index 4c8c4e1a97249ef766233b9aa97a1c676c0b2737..194649a304d236147176201ebe3e99a2ad3b31c5 100644
--- a/tensorflow/docs_src/community/welcome.md
+++ b/tensorflow/docs_src/community/welcome.md
@@ -25,6 +25,10 @@ The TensorFlow community has created many great projects around TensorFlow, incl
 * [Bitfusion's` GPU-enabled AWS EC2 TensorFlow AMI](https://github.com/bitfusionio/amis/tree/master/awsmrkt-bfboost-ubuntu14-cuda75-tensorflow) ([Launch AMI](https://aws.amazon.com/marketplace/pp/B01EYKBEQ0))
 * [Rust language bindings](https://github.com/google/tensorflow-rust)
 * [Operator Vectorization Library](https://github.com/opveclib/opveclib)
+* [Swift language bindings](https://github.com/PerfectlySoft/Perfect-TensorFlow)
+* [Sublime Tensorflow - A plugin for Sublime Text](https://github.com/baptisteArnaud/Sublime-Tensorflow)
+* [Edward - A library for probabilistic modeling, inference, and criticism](http://edwardlib.org) ([Github](https://github.com/blei-lab/edward), [Forum](https://discourse.edwardlib.org))
+* [GPflow - Gaussian processes in TensorFlow](https://github.com/GPflow/GPflow)
 
 ## TensorFlow Communities Around the World
 
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index a8c28e98c9b20c35f30192907b84abdbf4860c81..4b1d1b6e3430b2a9bf6e444cfab38450712fde0c 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -155,7 +155,7 @@ REGISTER_KERNEL_BUILDER(Name("ZeroOut").Device(DEVICE_CPU), ZeroOutOp);
 ### Multi-threaded CPU kernels
 
 To write a multi-threaded CPU kernel, the Shard function in
-[`work_sharder.h`](https://www.tensorflow.org/code/tensorflow/core/framework/work_sharder.h)
+[`work_sharder.h`](https://www.tensorflow.org/code/tensorflow/core/util/work_sharder.h)
 can be used. This function shards a computation function across the
 threads configured to be used for intra-op threading (see
 intra_op_parallelism_threads in
@@ -1100,7 +1100,7 @@ In general, changes to existing, checked-in specifications must be
 backwards-compatible: changing the specification of an op must not break prior
 serialized `GraphDef` protocol buffers constructed from older specifications.
 The details of `GraphDef` compatibility are
-@{$version_semantics#graphs$described here}.
+@{$version_compat#compatibility_of_graphs_and_checkpoints$described here}.
 
 There are several ways to preserve backwards-compatibility.
 
@@ -1150,7 +1150,7 @@ callers.  The Python API may be kept compatible by careful changes in a
 hand-written Python wrapper, by keeping the old signature except possibly adding
 new optional arguments to the end.  Generally incompatible changes may only be
 made when TensorFlow's changes major versions, and must conform to the
-@{$version_semantics#graphs$`GraphDef` version semantics}.
+@{$version_compat#compatibility_of_graphs_and_checkpoints$`GraphDef` version semantics}.
 
 ### GPU Support {#gpu-support}
 
diff --git a/tensorflow/docs_src/extend/estimators.md b/tensorflow/docs_src/extend/estimators.md
index 6bd21be019319d197c408aa3dd0d9ea666394cc4..5265e5889be8d69b60c161c5feb6424b4e2e08f8 100644
--- a/tensorflow/docs_src/extend/estimators.md
+++ b/tensorflow/docs_src/extend/estimators.md
@@ -1,52 +1,55 @@
-# Creating Estimators in tf.contrib.learn
+# Creating Estimators in tf.estimator
 
-The tf.contrib.learn framework makes it easy to construct and train machine
-learning models via its high-level
-@{$python/contrib.learn#estimators$Estimator} API. `Estimator`
+The tf.estimator framework makes it easy to construct and train machine
+learning models via its high-level Estimator API. `Estimator`
 offers classes you can instantiate to quickly configure common model types such
 as regressors and classifiers:
 
-*   @{tf.contrib.learn.LinearClassifier}:
+*   @{tf.estimator.LinearClassifier}:
     Constructs a linear classification model.
-*   @{tf.contrib.learn.LinearRegressor}:
+*   @{tf.estimator.LinearRegressor}:
     Constructs a linear regression model.
-*   @{tf.contrib.learn.DNNClassifier}:
+*   @{tf.estimator.DNNClassifier}:
     Construct a neural network classification model.
-*   @{tf.contrib.learn.DNNRegressor}:
-    Construct a neural network regressions model.
-
-But what if none of `tf.contrib.learn`'s predefined model types meets your
-needs? Perhaps you need more granular control over model configuration, such as
+*   @{tf.estimator.DNNRegressor}:
+    Construct a neural network regression model.
+*   @{tf.estimator.DNNLinearCombinedClassifier}:
+    Construct a neural network and linear combined classification model.
+*   @{tf.estimator.DNNRegressor}:
+    Construct a neural network and linear combined regression model.
+
+But what if none of `tf.estimator`'s predefined model types meets your needs?
+Perhaps you need more granular control over model configuration, such as
 the ability to customize the loss function used for optimization, or specify
 different activation functions for each neural network layer. Or maybe you're
 implementing a ranking or recommendation system, and neither a classifier nor a
 regressor is appropriate for generating predictions.
 
 This tutorial covers how to create your own `Estimator` using the building
-blocks provided in `tf.contrib.learn`, which will predict the ages of
+blocks provided in `tf.estimator`, which will predict the ages of
 [abalones](https://en.wikipedia.org/wiki/Abalone) based on their physical
 measurements. You'll learn how to do the following:
 
 *   Instantiate an `Estimator`
 *   Construct a custom model function
-*   Configure a neural network using `tf.contrib.layers`
+*   Configure a neural network using `tf.feature_column` and `tf.layers`
 *   Choose an appropriate loss function from `tf.losses`
 *   Define a training op for your model
 *   Generate and return predictions
 
 ## Prerequisites
 
-This tutorial assumes you already know tf.contrib.learn API basics, such as
-feature columns, input functions, and `fit()`/`evaluate()`/`predict()`
-operations. If you've never used tf.contrib.learn before, or need a refresher,
+This tutorial assumes you already know tf.estimator API basics, such as
+feature columns, input functions, and `train()`/`evaluate()`/`predict()`
+operations. If you've never used tf.estimator before, or need a refresher,
 you should first review the following tutorials:
 
-*   @{$tflearn$tf.contrib.learn Quickstart}: Quick introduction to
-    training a neural network using tf.contrib.learn.
+*   @{$estimator$tf.estimator Quickstart}: Quick introduction to
+    training a neural network using tf.estimator.
 *   @{$wide$TensorFlow Linear Model Tutorial}: Introduction to
     feature columns, and an overview on building a linear classifier in
-    tf.contrib.learn.
-*   @{$input_fn$Building Input Functions with tf.contrib.learn}: Overview of how
+    tf.estimator.
+*   @{$input_fn$Building Input Functions with tf.estimator}: Overview of how
     to construct an input_fn to preprocess and feed data into your models.
 
 ## An Abalone Age Predictor {#abalone-predictor}
@@ -65,8 +68,7 @@ for abalone:
 | Feature        | Description                                               |
 | -------------- | --------------------------------------------------------- |
 | Length         | Length of abalone (in longest direction; in mm)           |
-| Diameter       | Diameter of abalone (measurement perpendicular to length; |
-:                : in mm)                                                    :
+| Diameter       | Diameter of abalone (measurement perpendicular to length; in mm)|
 | Height         | Height of abalone (with its meat inside shell; in mm)     |
 | Whole Weight   | Weight of entire abalone (in grams)                       |
 | Shucked Weight | Weight of abalone meat only (in grams)                    |
@@ -75,8 +77,8 @@ for abalone:
 
 The label to predict is number of rings, as a proxy for abalone age.
 
-![Abalone shell](https://www.tensorflow.org/abalone_shell.jpg) **[“Abalone
-shell”](https://www.flickr.com/photos/thenickster/16641048623/) (by [Nicki Dugan
+![Abalone shell](https://www.tensorflow.org/images/abalone_shell.jpg)
+**[“Abalone shell”](https://www.flickr.com/photos/thenickster/16641048623/) (by [Nicki Dugan
 Pogue](https://www.flickr.com/photos/thenickster/), CC BY-SA 2.0)**
 
 ## Setup
@@ -113,7 +115,6 @@ from six.moves import urllib
 
 import numpy as np
 import tensorflow as tf
-from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
 
 FLAGS = None
 ```
@@ -207,17 +208,17 @@ if __name__ == "__main__":
 
 ## Instantiating an Estimator
 
-When defining a model using one of tf.contrib.learn's provided classes, such as
+When defining a model using one of tf.estimator's provided classes, such as
 `DNNClassifier`, you supply all the configuration parameters right in the
 constructor, e.g.:
 
 ```python
-my_nn = tf.contrib.learn.DNNClassifier(feature_columns=[age, height, weight],
-                                       hidden_units=[10, 10, 10],
-                                       activation_fn=tf.nn.relu,
-                                       dropout=0.2,
-                                       n_classes=3,
-                                       optimizer="Adam")
+my_nn = tf.estimator.DNNClassifier(feature_columns=[age, height, weight],
+                                   hidden_units=[10, 10, 10],
+                                   activation_fn=tf.nn.relu,
+                                   dropout=0.2,
+                                   n_classes=3,
+                                   optimizer="Adam")
 ```
 
 You don't need to write any further code to instruct TensorFlow how to train the
@@ -229,8 +230,7 @@ constructor accepts just two high-level parameters for model configuration,
 `model_fn` and `params`:
 
 ```python
-nn = tf.contrib.learn.Estimator(
-    model_fn=model_fn, params=model_params)
+nn = tf.estimator.Estimator(model_fn=model_fn, params=model_params)
 ```
 
 *   `model_fn`: A function object that contains all the aforementioned logic to
@@ -242,7 +242,7 @@ nn = tf.contrib.learn.Estimator(
 *   `params`: An optional dict of hyperparameters (e.g., learning rate, dropout)
     that will be passed into the `model_fn`.
 
-Note: Just like `tf.contrib.learn`'s predefined regressors and classifiers, the
+Note: Just like `tf.estimator`'s predefined regressors and classifiers, the
 `Estimator` initializer also accepts the general configuration arguments
 `model_dir` and `config`.
 
@@ -266,7 +266,7 @@ containing the learning rate and instantiates the `Estimator`:
 model_params = {"learning_rate": LEARNING_RATE}
 
 # Instantiate Estimator
-nn = tf.contrib.learn.Estimator(model_fn=model_fn, params=model_params)
+nn = tf.estimator.Estimator(model_fn=model_fn, params=model_params)
 ```
 
 ## Constructing the `model_fn` {#constructing-modelfn}
@@ -274,31 +274,31 @@ nn = tf.contrib.learn.Estimator(model_fn=model_fn, params=model_params)
 The basic skeleton for an `Estimator` API model function looks like this:
 
 ```python
-def model_fn(features, targets, mode, params):
+def model_fn(features, labels, mode, params):
    # Logic to do the following:
    # 1. Configure the model via TensorFlow operations
    # 2. Define the loss function for training/evaluation
    # 3. Define the training operation/optimizer
    # 4. Generate predictions
-   # 5. Return predictions/loss/train_op/eval_metric_ops in ModelFnOps object
-   return ModelFnOps(mode, predictions, loss, train_op, eval_metric_ops)
+   # 5. Return predictions/loss/train_op/eval_metric_ops in EstimatorSpec object
+   return EstimatorSpec(mode, predictions, loss, train_op, eval_metric_ops)
 ```
 
 The `model_fn` must accept three arguments:
 
-*   `features`: A dict containing the features passed to the model via `fit()`,
-    `evaluate()`, or `predict()`.
-*   `targets`: A `Tensor` containing the labels passed to the model via `fit()`,
-    `evaluate()`, or `predict()`. Will be empty for `predict()` calls, as these
-    are the values the model will infer.
-*   `mode`: One of the following @{tf.contrib.learn.ModeKeys} string values
+*   `features`: A dict containing the features passed to the model via
+    `input_fn`.
+*   `labels`: A `Tensor` containing the labels passed to the model via
+    `input_fn`. Will be empty for `predict()` calls, as these are the values the
+    model will infer.
+*   `mode`: One of the following @{tf.estimator.ModeKeys} string values
     indicating the context in which the model_fn was invoked:
-    *   `tf.contrib.learn.ModeKeys.TRAIN` The `model_fn` was invoked in training
-        mode—e.g., via a `fit()` call.
-    *   `tf.contrib.learn.ModeKeys.EVAL`. The `model_fn` was invoked in
-        evaluation mode—e.g., via an `evaluate()` call.
-    *   `tf.contrib.learn.ModeKeys.INFER`. The `model_fn` was invoked in
-        inference mode—e.g., via a `predict()` call.
+    *   `tf.estimator.ModeKeys.TRAIN` The `model_fn` was invoked in training
+        mode, namely via a `train()` call.
+    *   `tf.estimator.ModeKeys.EVAL`. The `model_fn` was invoked in
+        evaluation mode, namely via an `evaluate()` call.
+    *   `tf.estimator.ModeKeys.PREDICT`. The `model_fn` was invoked in
+        predict mode, namely via a `predict()` call.
 
 `model_fn` may also accept a `params` argument containing a dict of
 hyperparameters used for training (as shown in the skeleton above).
@@ -313,28 +313,23 @@ sections that follow):
 *   Defining the training operation that specifies the `optimizer` algorithm to
     minimize the loss values calculated by the loss function.
 
-The `model_fn` must return a @{tf.contrib.learn.ModelFnOps}
+The `model_fn` must return a @{tf.estimator.EstimatorSpec}
 object, which contains the following values:
 
 *   `mode` (required). The mode in which the model was run. Typically, you will
     return the `mode` argument of the `model_fn` here.
 
-*   `predictions` (required in `INFER` and `EVAL` modes). A dict that maps key
-    names of your choice to `Tensor`s containing the predictions from the model,
-    e.g.:
+*   `predictions` (required in `PREDICT` mode). A dict that maps key names of
+    your choice to `Tensor`s containing the predictions from the model, e.g.:
 
     ```python
     predictions = {"results": tensor_of_predictions}
     ```
 
-    In `INFER` mode, the dict that you return in `ModelFnOps` will then be
+    In `PREDICT` mode, the dict that you return in `EstimatorSpec` will then be
     returned by `predict()`, so you can construct it in the format in which
     you'd like to consume it.
 
-    In `EVAL` mode, the dict is used by
-    @{$python/contrib.metrics#Metric_Ops_$metric functions}
-    to compute metrics.
-    
 
 *   `loss` (required in `EVAL` and `TRAIN` mode). A `Tensor` containing a scalar
     loss value: the output of the model's loss function (discussed in more depth
@@ -362,7 +357,7 @@ object, which contains the following values:
     If you do not specify `eval_metric_ops`, only `loss` will be calculated
     during evaluation.
 
-### Configuring a neural network with `tf.contrib.layers`
+### Configuring a neural network with `tf.feature_column` and `tf.layers`
 
 Constructing a [neural
 network](https://en.wikipedia.org/wiki/Artificial_neural_network) entails
@@ -372,23 +367,21 @@ layer.
 The input layer is a series of nodes (one for each feature in the model) that
 will accept the feature data that is passed to the `model_fn` in the `features`
 argument. If `features` contains an n-dimensional `Tensor` with all your feature
-data (which is the case if `x` and `y` `Dataset`s are passed to `fit()`,
-`evaluate()`, and `predict()` directly), then it can serve as the input layer.
+data, then it can serve as the input layer.
 If `features` contains a dict of @{$linear#feature-columns-and-transformations$feature columns} passed to
 the model via an input function, you can convert it to an input-layer `Tensor`
-with the @{tf.contrib.layers.input_from_feature_columns} function in
-@{tf.contrib.layers}.
+with the @{tf.feature_column.input_layer} function.
 
 ```python
-input_layer = tf.contrib.layers.input_from_feature_columns(
-    columns_to_tensors=features, feature_columns=[age, height, weight])
+input_layer = tf.feature_column.input_layer(
+    features=features, feature_columns=[age, height, weight])
 ```
 
-As shown above, `input_from_feature_columns()` takes two required arguments:
+As shown above, `input_layer()` takes two required arguments:
 
-*   `columns_to_tensors`. A mapping of the model's `FeatureColumns` to the
-    `Tensors` containing the corresponding feature data. This is exactly what is
-    passed to the `model_fn` in the `features` argument.
+*   `features`. A mapping from string keys to the `Tensors` containing the
+    corresponding feature data. This is exactly what is passed to the `model_fn`
+    in the `features` argument.
 *   `feature_columns`. A list of all the `FeatureColumns` in the model—`age`,
     `height`, and `weight` in the above example.
 
@@ -397,44 +390,44 @@ hidden layers via an [activation
 function](https://en.wikipedia.org/wiki/Activation_function) that performs a
 nonlinear transformation on the data from the previous layer. The last hidden
 layer is then connected to the output layer, the final layer in the model.
-tf.contrib.layers provides the following convenience functions for constructing
-fully connected layers:
+`tf.layers` provides the `tf.layers.dense` function for constructing fully
+connected layers. The activation is controlled by the `activation` argument.
+Some options to pass to the `activation` argument are:
 
-*   `relu(inputs, num_outputs)`. Create a layer of `num_outputs` nodes fully
-    connected to the previous layer `inputs` with a [ReLU activation
-    function](https://en.wikipedia.org/wiki/Rectifier_\(neural_networks\))
+*   `tf.nn.relu`. The following code creates a layer of `units` nodes fully
+    connected to the previous layer `input_layer` with a
+    [ReLU activation function](https://en.wikipedia.org/wiki/Rectifier_\(neural_networks\))
     (@{tf.nn.relu}):
 
     ```python
-    hidden_layer = tf.contrib.layers.relu(inputs=input_layer, num_outputs=10)
+    hidden_layer = tf.layers.dense(
+        inputs=input_layer, units=10, activation=tf.nn.relu)
     ```
 
-*   `relu6(inputs, num_outputs)`. Create a layer of `num_outputs` nodes fully
+*   `tf.nn.relu6`. The following code creates a layer of `units` nodes fully
     connected to the previous layer `hidden_layer` with a ReLU 6 activation
     function (@{tf.nn.relu6}):
 
     ```python
-    second_hidden_layer = tf.contrib.layers.relu6(inputs=hidden_layer, num_outputs=20)
+    second_hidden_layer = tf.layers.dense(
+        inputs=hidden_layer, units=20, activation=tf.nn.relu)
     ```
 
-*   `linear(inputs, num_outputs)`. Create a layer of `num_outputs` nodes fully
-    connected to the previous layer `second_hidden_layer` with *no* activation
-    function, just a linear transformation:
+*   `None`. The following code creates a layer of `units` nodes fully connected
+    to the previous layer `second_hidden_layer` with *no* activation function,
+    just a linear transformation:
 
     ```python
-    output_layer = tf.contrib.layers.linear(inputs=second_hidden_layer, num_outputs=3)
+    output_layer = tf.layers.dense(
+        inputs=second_hidden_layer, units=3, activation=None)
     ```
 
-All these functions are
-[partials](https://docs.python.org/2/library/functools.html#functools.partial)
-of the more general @{tf.contrib.layers.fully_connected}
-function, which can be used to add fully connected layers with other activation
-functions, e.g.:
+Other activation functions are possible, e.g.:
 
 ```python
-output_layer = tf.contrib.layers.fully_connected(inputs=second_hidden_layer,
-                                                 num_outputs=10,
-                                                 activation_fn=tf.sigmoid)
+output_layer = tf.layers.dense(inputs=second_hidden_layer,
+                               units=10,
+                               activation_fn=tf.sigmoid)
 ```
 
 The above code creates the neural network layer `output_layer`, which is fully
@@ -446,18 +439,19 @@ Putting it all together, the following code constructs a full neural network for
 the abalone predictor, and captures its predictions:
 
 ```python
-def model_fn(features, targets, mode, params):
+def model_fn(features, labels, mode, params):
   """Model function for Estimator."""
 
   # Connect the first hidden layer to input layer
-  # (features) with relu activation
-  first_hidden_layer = tf.contrib.layers.relu(features, 10)
+  # (features["x"]) with relu activation
+  first_hidden_layer = tf.layers.dense(features["x"], 10, activation=tf.nn.relu)
 
   # Connect the second hidden layer to first hidden layer with relu
-  second_hidden_layer = tf.contrib.layers.relu(first_hidden_layer, 10)
+  second_hidden_layer = tf.layers.dense(
+      first_hidden_layer, 10, activation=tf.nn.relu)
 
   # Connect the output layer to second hidden layer (no activation fn)
-  output_layer = tf.contrib.layers.linear(second_hidden_layer, 1)
+  output_layer = tf.layers.dense(second_hidden_layer, 1)
 
   # Reshape output layer to 1-dim Tensor to return predictions
   predictions = tf.reshape(output_layer, [-1])
@@ -465,9 +459,9 @@ def model_fn(features, targets, mode, params):
   ...
 ```
 
-Here, because you'll be passing the abalone `Datasets` directly to `fit()`,
-`evaluate()`, and `predict()` via `x` and `y` arguments, the input layer is the
-`features` `Tensor` passed to the `model_fn`. The network contains two hidden
+Here, because you'll be passing the abalone `Datasets` using `numpy_input_fn`
+as shown below, `features` is a dict `{"x": data_tensor}`, so
+`features["x"]` is the input layer. The network contains two hidden
 layers, each with 10 nodes and a ReLU activation function. The output layer
 contains no activation function, and is
 @{tf.reshape} to a one-dimensional
@@ -476,47 +470,49 @@ tensor to capture the model's predictions, which are stored in
 
 ### Defining loss for the model {#defining-loss}
 
-The `ModelFnOps` returned by the `model_fn` must contain `loss`: a `Tensor`
+The `EstimatorSpec` returned by the `model_fn` must contain `loss`: a `Tensor`
 representing the loss value, which quantifies how well the model's predictions
-reflect the target values during training and evaluation runs. The @{tf.losses}
+reflect the label values during training and evaluation runs. The @{tf.losses}
 module provides convenience functions for calculating loss using a variety of
 metrics, including:
 
-*   `absolute_difference(predictions, targets)`. Calculates loss using the
+*   `absolute_difference(labels, predictions)`. Calculates loss using the
     [absolute-difference
     formula](https://en.wikipedia.org/wiki/Deviation_\(statistics\)#Unsigned_or_absolute_deviation)
     (also known as L<sub>1</sub> loss).
 
-*   `log_loss(predictions, targets)`. Calculates loss using the [logistic loss
+*   `log_loss(labels, predictions)`. Calculates loss using the [logistic loss
     forumula](https://en.wikipedia.org/wiki/Loss_functions_for_classification#Logistic_loss)
     (typically used in logistic regression).
 
-*   `mean_squared_error(predictions, targets)`. Calculates loss using the [mean
+*   `mean_squared_error(labels, predictions)`. Calculates loss using the [mean
     squared error](https://en.wikipedia.org/wiki/Mean_squared_error) (MSE; also
     known as L<sub>2</sub> loss).
 
 The following example adds a definition for `loss` to the abalone `model_fn`
 using `mean_squared_error()` (in bold):
 
-<pre class="prettyprint"><code class="lang-python">def model_fn(features, targets, mode, params):
+<pre class="prettyprint"><code class="lang-python">def model_fn(features, labels, mode, params):
   """Model function for Estimator."""
 
   # Connect the first hidden layer to input layer
-  # (features) with relu activation
-  first_hidden_layer = tf.contrib.layers.relu(features, 10)
+  # (features["x"]) with relu activation
+  first_hidden_layer = tf.layers.dense(features["x"], 10, activation=tf.nn.relu)
 
   # Connect the second hidden layer to first hidden layer with relu
-  second_hidden_layer = tf.contrib.layers.relu(first_hidden_layer, 10)
+  second_hidden_layer = tf.layers.dense(
+      first_hidden_layer, 10, activation=tf.nn.relu)
 
   # Connect the output layer to second hidden layer (no activation fn)
-  output_layer = tf.contrib.layers.linear(second_hidden_layer, 1)
+  output_layer = tf.layers.dense(second_hidden_layer, 1)
 
   # Reshape output layer to 1-dim Tensor to return predictions
   predictions = tf.reshape(output_layer, [-1])
   predictions_dict = {"ages": predictions}
 
+
   <strong># Calculate loss using mean squared error
-  loss = tf.losses.mean_squared_error(targets, predictions)</strong>
+  loss = tf.losses.mean_squared_error(labels, predictions)</strong>
   ...</code></pre>
 
 See the @{$python/contrib.losses$API guide} for a
@@ -524,14 +520,14 @@ full list of loss functions and more details on supported arguments and usage.
 
 Supplementary metrics for evaluation can be added to an `eval_metric_ops` dict.
 The following code defines an `rmse` metric, which calculates the root mean
-squared error for the model predictions. Note that the `targets` tensor is cast
+squared error for the model predictions. Note that the `labels` tensor is cast
 to a `float64` type to match the data type of the `predictions` tensor, which
 will contain real values:
 
 ```python
 eval_metric_ops = {
     "rmse": tf.metrics.root_mean_squared_error(
-        tf.cast(targets, tf.float64), predictions)
+        tf.cast(labels, tf.float64), predictions)
 }
 ```
 
@@ -539,109 +535,73 @@ eval_metric_ops = {
 
 The training op defines the optimization algorithm TensorFlow will use when
 fitting the model to the training data. Typically when training, the goal is to
-minimize loss. The tf.contrib.layers API provides the function `optimize_loss`,
-which returns a training op that will do just that. `optimize_loss` has four
-required arguments:
-
-*   `loss`. The loss value calculated by the `model_fn` (see [Defining Loss for
-    the Model](#defining-loss)).
-*   `global_step`. An integer
-    @{tf.Variable} representing the
-    step counter to increment for each model training run. Can easily be
-    created/incremented in TensorFlow via the
-    @{tf.train.get_global_step}
-    function.
-*   `learning_rate`. The [learning
-    rate](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Background)
-    (also known as _step size_) hyperparameter that the optimization algorithm
-    uses when training.
-*   `optimizer`. The optimization algorithm to use during training. `optimizer`
-    can accept any of the following string values, representing an optimization
-    algorithm predefined in `tf.contrib.layers.optimizers`:
-    *   `SGD`. Implementation of [gradient
-        descent](https://en.wikipedia.org/wiki/Gradient_descent)
-        (@{tf.train.GradientDescentOptimizer})
-    *   `Adagrad`. Implementation of the [AdaGrad optimization
-        algorithm](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
-        (@{tf.train.AdagradOptimizer})
-    *   `Adam`. Implementation of the [Adam optimization
-        algorithm](http://arxiv.org/pdf/1412.6980.pdf)
-        (@{tf.train.AdamOptimizer})
-    *   `Ftrl`. Implementation of the
-        [FTRL-Proximal](https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
-        ("Follow The (Proximally) Regularized Leader") algorithm
-        (@{tf.train.FtrlOptimizer})
-    *   `Momentum`. Implementation of stochastic gradient descent with
-        [momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum)
-        (@{tf.train.MomentumOptimizer})
-    *   `RMSProp`. Implementation of the
-        [RMSprop](http://sebastianruder.com/optimizing-gradient-descent/index.html#rmsprop)
-        algorithm
-        (@{tf.train.RMSPropOptimizer})
-
-Note: The `optimize_loss` function supports additional optional arguments to
-further configure the optimizer, such as for implementing decay. See the
-@{tf.contrib.layers.optimize_loss$API docs} for more info.
+minimize loss. A simple way to create the training op is to instantiate a
+`tf.train.Optimizer` subclass and call the `minimize` method.
 
 The following code defines a training op for the abalone `model_fn` using the
 loss value calculated in [Defining Loss for the Model](#defining-loss), the
-learning rate passed to the function in `params`, and the SGD optimizer. For
-`global_step`, the convenience function
-@{tf.train.get_global_step}
-in tf.contrib.framework takes care of generating an integer variable:
+learning rate passed to the function in `params`, and the gradient descent
+optimizer. For `global_step`, the convenience function
+@{tf.train.get_global_step} takes care of generating an integer variable:
 
 ```python
-train_op = tf.contrib.layers.optimize_loss(
-    loss=loss,
-    global_step=tf.contrib.framework.get_global_step(),
-    learning_rate=params["learning_rate"],
-    optimizer="SGD")
+optimizer = tf.train.GradientDescentOptimizer(
+    learning_rate=params["learning_rate"])
+train_op = optimizer.minimize(
+    loss=loss, global_step=tf.train.get_global_step())
 ```
 
+For a full list of optimizers, and other details, see the
+@{$python/train#optimizers$API guide}.
+
 ### The complete abalone `model_fn`
 
 Here's the final, complete `model_fn` for the abalone age predictor. The
 following code configures the neural network; defines loss and the training op;
-and returns a `ModelFnOps` object containing `mode`, `predictions_dict`, `loss`,
+and returns a `EstimatorSpec` object containing `mode`, `predictions_dict`, `loss`,
 and `train_op`:
 
 ```python
-def model_fn(features, targets, mode, params):
+def model_fn(features, labels, mode, params):
   """Model function for Estimator."""
 
   # Connect the first hidden layer to input layer
-  # (features) with relu activation
-  first_hidden_layer = tf.contrib.layers.relu(features, 10)
+  # (features["x"]) with relu activation
+  first_hidden_layer = tf.layers.dense(features["x"], 10, activation=tf.nn.relu)
 
   # Connect the second hidden layer to first hidden layer with relu
-  second_hidden_layer = tf.contrib.layers.relu(first_hidden_layer, 10)
+  second_hidden_layer = tf.layers.dense(
+      first_hidden_layer, 10, activation=tf.nn.relu)
 
   # Connect the output layer to second hidden layer (no activation fn)
-  output_layer = tf.contrib.layers.linear(second_hidden_layer, 1)
+  output_layer = tf.layers.dense(second_hidden_layer, 1)
 
   # Reshape output layer to 1-dim Tensor to return predictions
   predictions = tf.reshape(output_layer, [-1])
-  predictions_dict = {"ages": predictions}
+
+  # Provide an estimator spec for `ModeKeys.PREDICT`.
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    return tf.estimator.EstimatorSpec(
+        mode=mode,
+        predictions={"ages": predictions})
 
   # Calculate loss using mean squared error
-  loss = tf.losses.mean_squared_error(targets, predictions)
+  loss = tf.losses.mean_squared_error(labels, predictions)
 
   # Calculate root mean squared error as additional eval metric
   eval_metric_ops = {
-      "rmse":
-          tf.metrics.root_mean_squared_error(
-              tf.cast(targets, tf.float64), predictions)
+      "rmse": tf.metrics.root_mean_squared_error(
+          tf.cast(labels, tf.float64), predictions)
   }
 
-  train_op = tf.contrib.layers.optimize_loss(
-      loss=loss,
-      global_step=tf.contrib.framework.get_global_step(),
-      learning_rate=params["learning_rate"],
-      optimizer="SGD")
+  optimizer = tf.train.GradientDescentOptimizer(
+      learning_rate=params["learning_rate"])
+  train_op = optimizer.minimize(
+      loss=loss, global_step=tf.train.get_global_step())
 
-  return model_fn_lib.ModelFnOps(
+  # Provide an estimator spec for `ModeKeys.EVAL` and `ModeKeys.TRAIN` modes.
+  return tf.estimator.EstimatorSpec(
       mode=mode,
-      predictions=predictions_dict,
       loss=loss,
       train_op=train_op,
       eval_metric_ops=eval_metric_ops)
@@ -657,29 +617,31 @@ Add the following code to the end of `main()` to fit the neural network to the
 training data and evaluate accuracy:
 
 ```python
-def get_train_inputs():
-  x = tf.constant(training_set.data)
-  y = tf.constant(training_set.target)
-  return x, y
+train_input_fn = tf.estimator.inputs.numpy_input_fn(
+    x={"x": np.array(training_set.data)},
+    y=np.array(training_set.target),
+    num_epochs=None,
+    shuffle=True)
 
-# Fit
-nn.fit(input_fn=get_train_inputs, steps=5000)
-
-def get_test_inputs():
-  x = tf.constant(test_set.data)
-  y = tf.constant(test_set.target)
-  return x, y
+# Train
+nn.train(input_fn=train_input_fn, steps=5000)
 
 # Score accuracy
-ev = nn.evaluate(input_fn=get_test_inputs, steps=1)
+test_input_fn = tf.estimator.inputs.numpy_input_fn(
+    x={"x": np.array(test_set.data)},
+    y=np.array(test_set.target),
+    num_epochs=1,
+    shuffle=False)
+
+ev = nn.evaluate(input_fn=test_input_fn)
 print("Loss: %s" % ev["loss"])
 print("Root Mean Squared Error: %s" % ev["rmse"])
 ```
 
 Note: The above code uses input functions to feed feature (`x`) and label (`y`)
-`Tensor`s into the model for both training (`get_train_inputs()`) and evaluation
-(`get_test_inputs()`). To learn more about input functions, see the tutorial
-@{$input_fn$Building Input Functions with tf.contrib.learn}.
+`Tensor`s into the model for both training (`train_input_fn`) and evaluation
+(`test_input_fn`). To learn more about input functions, see the tutorial
+@{$input_fn$Building Input Functions with tf.estimator}.
 
 Then run the code. You should see output like the following:
 
@@ -701,7 +663,11 @@ To predict ages for the `ABALONE_PREDICT` data set, add the following to
 
 ```python
 # Print out predictions
-predictions = nn.predict(x=prediction_set.data, as_iterable=True)
+predict_input_fn = tf.estimator.inputs.numpy_input_fn(
+    x={"x": prediction_set.data},
+    num_epochs=1,
+    shuffle=False)
+predictions = nn.predict(input_fn=predict_input_fn)
 for i, p in enumerate(predictions):
   print("Prediction %s: %s" % (i + 1, p["ages"]))
 ```
@@ -723,11 +689,10 @@ Prediction 7: 11.1289
 
 ## Additional Resources
 
-Congrats! You've successfully built a tf.contrib.learn `Estimator` from scratch.
+Congrats! You've successfully built a tf.estimator `Estimator` from scratch.
 For additional reference materials on building `Estimator`s, see the following
 sections of the API guides:
 
-*   @{$python/contrib.learn#Estimators$Estimators}
 *   @{$python/contrib.layers$Layers}
 *   @{$python/contrib.losses$Losses}
 *   @{$python/contrib.layers#optimization$Optimization}
diff --git a/tensorflow/docs_src/extend/index.md b/tensorflow/docs_src/extend/index.md
index 590023b0654c66f86e6072e6c8431b9b6a4963db..5812caaffc9c12d3719c0830002bb932d7f5a996 100644
--- a/tensorflow/docs_src/extend/index.md
+++ b/tensorflow/docs_src/extend/index.md
@@ -20,8 +20,8 @@ TensorFlow:
 
 Python is currently the only language supported by TensorFlow's API stability
 promises.  However, TensorFlow also provides functionality in C++, Java, and Go,
-plus community support for [Haskell](https://github.com/tensorflow/haskell)
-and [Rust](https://github.com/tensorflow/rust).  If you'd like to create or
+plus community support for [Haskell](https://github.com/tensorflow/haskell) and 
+[Rust](https://github.com/tensorflow/rust).  If you'd like to create or
 develop TensorFlow features in a language other than these languages, read the
 following guide:
 
diff --git a/tensorflow/docs_src/extend/tool_developers/index.md b/tensorflow/docs_src/extend/tool_developers/index.md
index 06fc5e70dd0e191730cc8469f4f9a457ba0abc23..f02cd23be88ddb61e79dc8168a0fa998fcdc54b0 100644
--- a/tensorflow/docs_src/extend/tool_developers/index.md
+++ b/tensorflow/docs_src/extend/tool_developers/index.md
@@ -63,7 +63,7 @@ There are actually two different formats that a ProtoBuf can be saved in.
 TextFormat is a human-readable form, which makes it nice for debugging and
 editing, but can get large when there's numerical data like weights stored in
 it. You can see a small example of that in
-[graph_run_run2.pbtxt](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/demo/data/graph_run_run2.pbtxt).
+[graph_run_run2.pbtxt](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/demo/data/graph_run_run2.pbtxt).
 
 Binary format files are a lot smaller than their text equivalents, even though
 they're not as readable for us. In this script, we ask the user to supply a
diff --git a/tensorflow/docs_src/get_started/tflearn.md b/tensorflow/docs_src/get_started/estimator.md
similarity index 70%
rename from tensorflow/docs_src/get_started/tflearn.md
rename to tensorflow/docs_src/get_started/estimator.md
index 002118073ce1dc741e4d02d8abe1cf9aa219a8cf..a55454f8af362cd97d1ef18ab750e2ee95291bd0 100644
--- a/tensorflow/docs_src/get_started/tflearn.md
+++ b/tensorflow/docs_src/get_started/estimator.md
@@ -1,8 +1,8 @@
-# tf.contrib.learn Quickstart
+# tf.estimator Quickstart
 
-TensorFlow’s high-level machine learning API (tf.contrib.learn) makes it easy to
+TensorFlow’s high-level machine learning API (tf.estimator) makes it easy to
 configure, train, and evaluate a variety of machine learning models. In this
-tutorial, you’ll use tf.contrib.learn to construct a
+tutorial, you’ll use tf.estimator to construct a
 [neural network](https://en.wikipedia.org/wiki/Artificial_neural_network)
 classifier and train it on the
 [Iris data set](https://en.wikipedia.org/wiki/Iris_flower_data_set) to
@@ -10,8 +10,8 @@ predict flower species based on sepal/petal geometry. You'll write code to
 perform the following five steps:
 
 1.  Load CSVs containing Iris training/test data into a TensorFlow `Dataset`
-2.  Construct a @{tf.contrib.learn.DNNClassifier$neural network classifier}
-3.  Fit the model using the training data
+2.  Construct a @{tf.estimator.DNNClassifier$neural network classifier}
+3.  Train the model using the training data
 4.  Evaluate the accuracy of the model
 5.  Classify new samples
 
@@ -64,47 +64,50 @@ def main():
       features_dtype=np.float32)
 
   # Specify that all features have real-value data
-  feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]
+  feature_columns = [tf.feature_column.numeric_column("x", shape=[4])]
 
   # Build 3 layer DNN with 10, 20, 10 units respectively.
-  classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
-                                              hidden_units=[10, 20, 10],
-                                              n_classes=3,
-                                              model_dir="/tmp/iris_model")
+  classifier = tf.estimator.DNNClassifier(feature_columns=feature_columns,
+                                          hidden_units=[10, 20, 10],
+                                          n_classes=3,
+                                          model_dir="/tmp/iris_model")
   # Define the training inputs
-  def get_train_inputs():
-    x = tf.constant(training_set.data)
-    y = tf.constant(training_set.target)
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={"x": np.array(training_set.data)},
+      y=np.array(training_set.target),
+      num_epochs=None,
+      shuffle=True)
 
-    return x, y
-
-  # Fit model.
-  classifier.fit(input_fn=get_train_inputs, steps=2000)
+  # Train model.
+  classifier.train(input_fn=train_input_fn, steps=2000)
 
   # Define the test inputs
-  def get_test_inputs():
-    x = tf.constant(test_set.data)
-    y = tf.constant(test_set.target)
-
-    return x, y
+  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={"x": np.array(test_set.data)},
+      y=np.array(test_set.target),
+      num_epochs=1,
+      shuffle=False)
 
   # Evaluate accuracy.
-  accuracy_score = classifier.evaluate(input_fn=get_test_inputs,
-                                       steps=1)["accuracy"]
+  accuracy_score = classifier.evaluate(input_fn=test_input_fn)["accuracy"]
 
   print("\nTest Accuracy: {0:f}\n".format(accuracy_score))
 
   # Classify two new flower samples.
-  def new_samples():
-    return tf.constant(
+  new_samples = np.array(
       [[6.4, 3.2, 4.5, 1.5],
-       [5.8, 3.1, 5.0, 1.7]], dtype=tf.float32)
+       [5.8, 3.1, 5.0, 1.7]], dtype=np.float32)
+  predict_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={"x": new_samples},
+      num_epochs=1,
+      shuffle=False)
 
-  predictions = list(classifier.predict(input_fn=new_samples))
+  predictions = list(classifier.predict(input_fn=predict_input_fn))
+  predicted_classes = [p["classes"] for p in predictions]
 
   print(
       "New Samples, Class Predictions:    {}\n"
-      .format(predictions))
+      .format(predicted_classes))
 
 if __name__ == "__main__":
     main()
@@ -237,31 +240,30 @@ you'll use `training_set.data` and
 
 ## Construct a Deep Neural Network Classifier
 
-tf.contrib.learn offers a variety of predefined models, called
-@{$python/contrib.learn#estimators$`Estimator`s}, which you can
-use "out of the box" to run training and evaluation operations on your data.
+tf.estimator offers a variety of predefined models, called `Estimator`s, which
+you can use "out of the box" to run training and evaluation operations on your
+data.
 Here, you'll configure a Deep Neural Network Classifier model to fit the Iris
-data. Using tf.contrib.learn, you can instantiate your
-@{tf.contrib.learn.DNNClassifier} with
-just a couple lines of code:
+data. Using tf.estimator, you can instantiate your
+@{tf.estimator.DNNClassifier} with just a couple lines of code:
 
 ```python
 # Specify that all features have real-value data
-feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]
+feature_columns = [tf.feature_column.numeric_column("x", shape=[4])]
 
 # Build 3 layer DNN with 10, 20, 10 units respectively.
-classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
-                                            hidden_units=[10, 20, 10],
-                                            n_classes=3,
-                                            model_dir="/tmp/iris_model")
+classifier = tf.estimator.DNNClassifier(feature_columns=feature_columns,
+                                        hidden_units=[10, 20, 10],
+                                        n_classes=3,
+                                        model_dir="/tmp/iris_model")
 ```
 
 The code above first defines the model's feature columns, which specify the data
 type for the features in the data set. All the feature data is continuous, so
-`tf.contrib.layers.real_valued_column` is the appropriate function to use to
+`tf.feature_column.numeric_column` is the appropriate function to use to
 construct the feature columns. There are four features in the data set (sepal
-width, sepal height, petal width, and petal height), so accordingly `dimension`
-must be set to `4` to hold all the data.
+width, sepal height, petal width, and petal height), so accordingly `shape`
+must be set to `[4]` to hold all the data.
 
 Then, the code creates a `DNNClassifier` model using the following arguments:
 
@@ -272,34 +274,34 @@ Then, the code creates a `DNNClassifier` model using the following arguments:
 *   `n_classes=3`. Three target classes, representing the three Iris species.
 *   `model_dir=/tmp/iris_model`. The directory in which TensorFlow will save
     checkpoint data during model training. For more on logging and monitoring
-    with TensorFlow, see @{$monitors$Logging and Monitoring Basics with     tf.contrib.learn}.
+    with TensorFlow, see
+    @{$monitors$Logging and Monitoring Basics with tf.estimator}.
 
 ## Describe the training input pipeline {#train-input}
 
-The `tf.contrib.learn` API uses input functions, which create the TensorFlow
-operations that generate data for the model. In this case, the data is small
-enough that it can be stored in @{tf.constant$TensorFlow constants}. The
-following code produces the simplest possible input pipeline:
+The `tf.estimator` API uses input functions, which create the TensorFlow
+operations that generate data for the model.
+We can use `tf.estimator.inputs.numpy_input_fn` to produce the input pipeline:
 
 ```python
 # Define the training inputs
-def get_train_inputs():
-  x = tf.constant(training_set.data)
-  y = tf.constant(training_set.target)
-
-  return x, y
+train_input_fn = tf.estimator.inputs.numpy_input_fn(
+    x={"x": np.array(training_set.data)},
+    y=np.array(training_set.target),
+    num_epochs=None,
+    shuffle=True)
 ```
 
 ## Fit the DNNClassifier to the Iris Training Data {#fit-dnnclassifier}
 
 Now that you've configured your DNN `classifier` model, you can fit it to the
-Iris training data using the @{tf.contrib.learn.BaseEstimator.fit$`fit`} method.
-Pass `get_train_inputs` as the `input_fn`, and the number of steps to train
+Iris training data using the @{tf.estimator.Estimator.train$`train`} method.
+Pass `train_input_fn` as the `input_fn`, and the number of steps to train
 (here, 2000):
 
 ```python
-# Fit model.
-classifier.fit(input_fn=get_train_inputs, steps=2000)
+# Train model.
+classifier.train(input_fn=train_input_fn, steps=2000)
 ```
 
 The state of the model is preserved in the `classifier`, which means you can
@@ -307,46 +309,44 @@ train iteratively if you like. For example, the above is equivalent to the
 following:
 
 ```python
-classifier.fit(x=training_set.data, y=training_set.target, steps=1000)
-classifier.fit(x=training_set.data, y=training_set.target, steps=1000)
+classifier.train(input_fn=train_input_fn, steps=1000)
+classifier.train(input_fn=train_input_fn, steps=1000)
 ```
 
 However, if you're looking to track the model while it trains, you'll likely
-want to instead use a TensorFlow @{tf.contrib.learn.monitors$`monitor`}
+want to instead use a TensorFlow @{tf.train.SessionRunHook$`SessionRunHook`}
 to perform logging operations. See the tutorial
-@{$monitors$&ldquo;Logging and Monitoring Basics with tf.contrib.learn&rdquo;}
+@{$monitors$Logging and Monitoring Basics with tf.estimator}
 for more on this topic.
 
 ## Evaluate Model Accuracy {#evaluate-accuracy}
 
-You've fit your `DNNClassifier` model on the Iris training data; now, you can
-check its accuracy on the Iris test data using the
-@{tf.contrib.learn.BaseEstimator.evaluate$`evaluate`} method. Like `fit`,
+You've trained your `DNNClassifier` model on the Iris training data; now, you
+can check its accuracy on the Iris test data using the
+@{tf.estimator.Estimator.evaluate$`evaluate`} method. Like `train`,
 `evaluate` takes an input function that builds its input pipeline. `evaluate`
-returns a `dict` with the evaluation results. The following code passes the Iris
-test data&mdash;`test_set.data` and `test_set.target`&mdash;to `evaluate` and
-prints the `accuracy` from the results:
+returns a `dict`s with the evaluation results. The following code passes the
+Iris test data&mdash;`test_set.data` and `test_set.target`&mdash;to `evaluate`
+and prints the `accuracy` from the results:
 
 ```python
 # Define the test inputs
-def get_test_inputs():
-  x = tf.constant(test_set.data)
-  y = tf.constant(test_set.target)
-
-  return x, y
+test_input_fn = tf.estimator.inputs.numpy_input_fn(
+    x={"x": np.array(test_set.data)},
+    y=np.array(test_set.target),
+    num_epochs=1,
+    shuffle=False)
 
 # Evaluate accuracy.
-accuracy_score = classifier.evaluate(input_fn=get_test_inputs,
-                                     steps=1)["accuracy"]
+accuracy_score = classifier.evaluate(input_fn=test_input_fn)["accuracy"]
 
 print("\nTest Accuracy: {0:f}\n".format(accuracy_score))
 ```
 
-Note: The `steps` argument to `evaluate` is important here.
-@{tf.contrib.learn.Evaluable.evaluate$`evaluate`} normally runs until it reaches
-the end of the input. This is perfect for evaluating over a set of files, but
-the constants being used here will never throw the `OutOfRangeError` or
-`StopIteration` that it is expecting.
+Note: The `num_epochs=1` argument to `numpy_input_fn` is important here.
+`test_input_fn` will iterate over the data once, and then raise
+`OutOfRangeError`. This error signals the classifier to stop evaluating, so it
+will evaluate over the input once.
 
 When you run the full script, it will print something close to:
 
@@ -368,21 +368,25 @@ Sepal Length | Sepal Width | Petal Length | Petal Width
 5.8          | 3.1         | 5.0          | 1.7
 
 You can predict their species using the `predict()` method. `predict` returns a
-generator, which can easily be converted to a list. The following code retrieves
-and prints the class predictions:
+generator of dicts, which can easily be converted to a list. The following code
+retrieves and prints the class predictions:
 
 ```python
 # Classify two new flower samples.
-def new_samples():
-  return np.array(
+new_samples = np.array(
     [[6.4, 3.2, 4.5, 1.5],
      [5.8, 3.1, 5.0, 1.7]], dtype=np.float32)
+predict_input_fn = tf.estimator.inputs.numpy_input_fn(
+    x={"x": new_samples},
+    num_epochs=1,
+    shuffle=False)
 
-predictions = list(classifier.predict(input_fn=new_samples))
+predictions = list(classifier.predict(input_fn=predict_input_fn))
+predicted_classes = [p["classes"] for p in predictions]
 
 print(
     "New Samples, Class Predictions:    {}\n"
-    .format(predictions))
+    .format(predicted_classes))
 ```
 
 Your results should look as follows:
@@ -396,14 +400,11 @@ second sample is *Iris virginica*.
 
 ## Additional Resources
 
-*   For further reference materials on tf.contrib.learn, see the official
-    @{$python/contrib.learn$API docs}.
-
-*   To learn more about using tf.contrib.learn to create linear models, see
+*   To learn more about using tf.estimator to create linear models, see
     @{$linear$Large-scale Linear Models with TensorFlow}.
 
-*   To build your own Estimator using tf.contrib.learn APIs, check out
-    @{$estimators$Creating Estimators in tf.contrib.learn}.
+*   To build your own Estimator using tf.estimator APIs, check out
+    @{$estimators$Creating Estimators in tf.estimator}.
 
 *   To experiment with neural network modeling and visualization in the browser,
     check out [Deep Playground](http://playground.tensorflow.org/).
diff --git a/tensorflow/docs_src/get_started/export.md b/tensorflow/docs_src/get_started/export.md
new file mode 100644
index 0000000000000000000000000000000000000000..77e628699e210ae2dae7697ab0f27a8f2b427509
--- /dev/null
+++ b/tensorflow/docs_src/get_started/export.md
@@ -0,0 +1,297 @@
+# Exporting a Trained Model for Serving
+
+Once you have trained an `Estimator` model, you may want to create a service
+from that model that takes requests and returns a result.  You can run such a
+service locally on your machine or deploy it scalably in the cloud.
+
+To prepare a trained Estimator for serving, you must export it in the standard
+[`SavedModel`](https://www.tensorflow.org/code/tensorflow/python/saved_model/README.md)
+format, which wraps the TensorFlow graph, the trained variable values, any
+required assets, and metadata together in a hermetic package.
+
+In this tutorial, we will discuss how to:
+
+* Add graph nodes that accept and prepare inference requests
+* Specify the output nodes and the corresponding [APIs](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/prediction_service.proto)
+  that can be served (Classify, Regress, or Predict)
+* Export your model to the `SavedModel` format
+* Deploy the model in Google Cloud ML Engine and request predictions
+* Serve the model from a local server and request predictions
+
+
+## The exported graph and its signatures
+
+The export procedure assembles a new TensorFlow graph from two main components:
+1) a Serving Input Receiver that defines the format of the inputs to be
+   accepted, and
+2) the trained model itself.
+
+An exported `SavedModel` contains that combined graph packaged together with one
+or more *signatures*.  Like a function signature in any programming language, a
+graph signature specifies the required inputs (arguments) and the
+expected outputs (return values) of performing the computation.  In the typical
+case, a single signature is present, corresponding to the predictions that the
+model has learned to make.
+
+The *input* portion of the signature is determined by the Serving Input
+Receiver.  To specify the inputs that your deployed model will accept, you must
+provide a `serving_input_receiver_fn()` to `estimator.export_savedmodel()` (see
+below).
+
+The *output* portion of the signature is determined by the model.  For instance,
+canned Estimators know the nature of the outputs they produce (e.g. whether the
+output is a classification or a regression, and the type and shape of those
+outputs).  Custom Estimators must provide this information via `export_outputs`
+(see [below](#specifying_the_outputs_of_a_custom_model)).
+
+> Note: A *multi-headed model* provides multiple signatures, each corresponding
+> to a different "head", i.e. a set of predictions that can be made from the
+> same inputs by executing a subgraph of the complete trained graph.  The
+> *output* portions of these signatures are determined by the model.
+
+
+![Overview of exporting a SavedModel from Estimator](../images/export_savedmodel_overview.png)
+
+## Preparing serving inputs
+
+During training, an @{$input_fn$`input_fn()`} ingests data and prepares it for
+use by the model.  At serving time, similarly, a `serving_input_receiver_fn()`
+accepts inference requests and prepares them for the model.  The purpose of this
+function is to add placeholders to the graph which the serving system will feed
+with inference requests, as well as to add any additional ops needed to convert
+data from the input format into the feature `Tensor`s expected by the model.
+The function returns a @{tf.estimator.export.ServingInputReceiver} object, which
+packages the placeholders and the resulting feature `Tensor`s together.
+
+A typical pattern is that inference requests arrive in the form of serialized
+`tf.Example`s, so the `serving_input_receiver_fn()` creates a single string
+placeholder to receive them.  The `serving_input_receiver_fn()` is then also
+responsible for parsing the `tf.Example`s by adding a @{tf.parse_example} op to
+the graph.
+
+When writing such a `serving_input_receiver_fn()`, you must pass a parsing
+specification to @{tf.parse_example} to tell the parser what feature names to
+expect and how to map them to `Tensor`s. A parsing specification takes the
+form of a dict from feature names to @{tf.FixedLenFeature}, @{tf.VarLenFeature},
+and @{tf.SparseFeature}.  (Note this parsing specification should not include
+any label or weight columns, since those will not be available at serving
+time&mdash;in contrast to a parsing specification used in the `input_fn()` at
+training time.)
+
+In combination, then:
+
+```py
+feature_spec = {'foo': tf.FixedLenFeature(...),
+                'bar': tf.VarLenFeature(...)}
+
+def serving_input_receiver_fn():
+  """An input receiver that expects a serialized tf.Example."""
+  serialized_tf_example = tf.placeholder(dtype=tf.string,
+                                         shape=[default_batch_size],
+                                         name='input_example_tensor')
+  receiver_tensors = {'examples': serialized_tf_example}
+  features = tf.parse_example(serialized_tf_example, feature_spec)
+  return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)
+```
+
+The @{tf.estimator.export.build_parsing_serving_input_receiver_fn} utility
+function provides that input receiver for the common case.
+
+> Note: when training a model to be served using Google Cloud ML Engine (see
+> below), the parsing step is not needed, because the model will receive raw
+> feature data.  This is also true when using the Predict API with a local
+> server.
+
+Even if you require no parsing or other input processing&mdash;i.e., if the
+serving system will feed feature `Tensor`s directly&mdash;you must still provide
+a `serving_input_receiver_fn()` that creates placeholders for the feature
+`Tensor`s and passes them through.  The
+@{tf.estimator.export.build_raw_serving_input_receiver_fn} utility provides for
+this.
+
+If these utilities do not meet your needs, you are free to write your own
+`serving_input_receiver_fn()`.  One case where this may be needed is if your
+training `input_fn()` incorporates some preprocessing logic that must be
+recapitulated at serving time.  To reduce the risk of training-serving skew, we
+recommend encapsulating such processing in a function which is then called
+from both `input_fn()` and `serving_input_receiver_fn()`.
+
+
+## Performing the export
+
+To export your trained Estimator, call
+@{tf.estimator.Estimator.export_savedmodel} with the export base path, together
+with the `serving_input_receiver_fn`.
+
+```py
+estimator.export_savedmodel(export_dir_base, serving_input_receiver_fn)
+```
+
+This method builds a new graph by first calling the
+`serving_input_receiver_fn()` to obtain feature `Tensor`s, and then calling
+this `Estimator`'s `model_fn()` to generate the model graph based on those
+features. It starts a fresh `Session`, and, by default, restores the most recent
+checkpoint into it.  (A different checkpoint may be passed, if needed.)
+Finally it creates a timestamped export directory below the given
+`export_dir_base` (i.e., `export_dir_base/<timestamp>`), and writes a
+`SavedModel` into it containing a single `MetaGraphDef` saved from this
+Session.
+
+> Note: there is currently no built-in mechanism to garbage-collect old exports,
+> so successive exports will accumulate under `export_dir_base` unless deleted
+> by some external means.
+
+## Specifying the outputs of a custom model
+
+When writing a custom `model_fn`, you must populate the `export_outputs` element
+of the @{tf.estimator.EstimatorSpec} return value. This is a dict of
+`{name: output}` describing the output signatures to be exported and used during
+serving.
+
+In the usual case of making a single prediction, this dict contains
+one element, and the `name` is immaterial.  In a multi-headed model, each head
+is represented by an entry in this dict.  In this case the `name` is a string
+of your choice that can be used to request a specific head at serving time.
+
+Each `output` value must be an `ExportOutput` object  such as
+@{tf.estimator.export.ClassificationOutput},
+@{tf.estimator.export.RegressionOutput}, or
+@{tf.estimator.export.PredictOutput}.
+
+These output types map straightforwardly to the
+[TensorFlow Serving APIs](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/prediction_service.proto),
+and so determine which request types will be honored.
+
+> Note: In the multi-headed case, a `SignatureDef` will be generated for each
+> element of the `export_outputs` dict returned from the model_fn, named using
+> the same keys.  These signatures differ only in their outputs, as provided by
+> the corresponding `ExportOutput` entry.  The inputs are always those provided
+> by the `serving_input_receiver_fn`.
+> An inference request may specify the head by name.  One head must be named
+> using [`signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY`](https://www.tensorflow.org/code/tensorflow/python/saved_model/signature_constants.py)
+> indicating which signature will be served when an inference request does not
+> specify one.
+
+
+## Serving the exported model on Google Cloud ML Engine
+
+[Google Cloud ML Engine](https://cloud.google.com/ml-engine/) provides a fully
+managed, scalable environment for serving your trained SavedModels to make
+online or batch predictions.
+
+Please see [Deploying Models](https://cloud.google.com/ml-engine/docs/how-tos/deploying-models)
+to learn how to deploy your SavedModel on Cloud ML Engine.
+
+> Note: Cloud ML Engine accepts inference requests in JSON, CSV, or TFRecords
+> formats, depending on the circumstance.  Parsing these formats is not the
+> responsibility of the graph.  Cloud ML Engine does the parsing for you, and
+> feeds raw feature data directly into the graph.  Thus, when targeting Cloud ML
+> Engine, you should use a `serving_input_receiver_fn()` of the passthrough form
+> that simply creates placeholders for each feature.
+
+
+## Requesting predictions from Google Cloud ML Engine
+
+To learn how to request predictions from a model deployed in Cloud ML Engine,
+please see:
+
+* [Prediction Basics](https://cloud.google.com/ml-engine/docs/concepts/prediction-overview)
+* [Getting Online Predictions](https://cloud.google.com/ml-engine/docs/how-tos/online-predict)
+* [Getting Batch Predictions](https://cloud.google.com/ml-engine/docs/how-tos/batch-predict)
+
+
+## Serving the exported model locally
+
+For local deployment, you can serve your model using
+@{$deploy/tfserve$Tensorflow Serving}, an open-source project that loads a
+`SavedModel` and exposes it as a [gRPC](http://www.grpc.io/) service.
+
+First, [install TensorFlow Serving](https://tensorflow.github.io/serving/setup#prerequisites).
+
+Then build and run the local model server, substituting `$export_dir_base` with
+the path to the `SavedModel` you exported above:
+
+```sh
+bazel build //tensorflow_serving/model_servers:tensorflow_model_server
+bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server --port=9000 --model_base_path=$export_dir_base
+```
+
+Now you have a server listening for inference requests via gRPC on port 9000!
+
+
+## Requesting predictions from a local server
+
+The server responds to gRPC requests according to the [PredictionService](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/prediction_service.proto#L15)
+gRPC API service definition.  (The nested protocol buffers are defined in
+various [neighboring files](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis)).
+
+From the API service definition, the gRPC framework generates client libraries
+in various languages providing remote access to the API.  In a project using the
+Bazel build tool, these libraries are built automatically and provided via
+dependencies like these (using Python for example):
+
+```build
+  deps = [
+    "//tensorflow_serving/apis:classification_proto_py_pb2",
+    "//tensorflow_serving/apis:regression_proto_py_pb2",
+    "//tensorflow_serving/apis:predict_proto_py_pb2",
+    "//tensorflow_serving/apis:prediction_service_proto_py_pb2"
+  ]
+```
+
+Python client code can then import the libraries thus:
+
+```py
+from tensorflow_serving.apis import classification_pb2
+from tensorflow_serving.apis import regression_pb2
+from tensorflow_serving.apis import predict_pb2
+from tensorflow_serving.apis import prediction_service_pb2
+```
+
+> Note: `prediction_service_pb2` defines the service as a whole and so
+> is always required.  However a typical client will need only one of
+> `classification_pb2`, `regression_pb2`, and `predict_pb2`, depending on the
+> type of requests being made.
+
+Sending a gRPC request is then accomplished by assembling a protocol buffer
+containing the request data and passing it to the service stub.  Note how the
+request protocol buffer is created empty and then populated via the
+[generated protocol buffer API](https://developers.google.com/protocol-buffers/docs/reference/python-generated).
+
+```py
+from grpc.beta import implementations
+
+channel = implementations.insecure_channel(host, int(port))
+stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
+
+request = classification_pb2.ClassificationRequest()
+example = request.input.example_list.examples.add()
+example.features.feature['x'].float_list.value.extend(image[0].astype(float))
+
+result = stub.Classify(request, 10.0)  # 10 secs timeout
+```
+
+The returned result in this example is a `ClassificationResponse` protocol
+buffer.
+
+This is a skeletal example; please see the @{$deploy$Tensorflow Serving}
+documentation and [examples](https://github.com/tensorflow/serving/tree/master/tensorflow_serving/example)
+for more details.
+
+> Note: `ClassificationRequest` and `RegressionRequest` contain a
+> `tensorflow.serving.Input` protocol buffer, which in turn contains a list of
+> `tensorflow.Example` protocol buffers.  `PredictRequest`, by contrast,
+> contains a mapping from feature names to values encoded via `TensorProto`.
+> Correspondingly: When using the `Classify` and `Regress` APIs, TensorFlow
+> Serving feeds serialized `tf.Example`s to the graph, so your
+> `serving_input_receiver_fn()` should include a `tf.parse_example()` Op.
+> When using the generic `Predict` API, however, TensorFlow Serving feeds raw
+> feature data to the graph, so a passthrough `serving_input_receiver_fn()`
+> should be used.
+
+
+<!-- TODO(soergel): give examples of making requests against this server, using
+the different Tensorflow Serving APIs, selecting the signature by key, etc. -->
+
+<!-- TODO(soergel): document ExportStrategy here once Experiment moves
+from contrib to core. -->
diff --git a/tensorflow/docs_src/get_started/get_started.md b/tensorflow/docs_src/get_started/get_started.md
index c1c68f9c12b9d5505be902123fc02153256c265d..815b83e5fb34d24a877b94f15c827441581f84d9 100644
--- a/tensorflow/docs_src/get_started/get_started.md
+++ b/tensorflow/docs_src/get_started/get_started.md
@@ -16,14 +16,11 @@ machine learning researchers and others who require fine levels of control over
 their models. The higher level APIs are built on top of TensorFlow Core. These
 higher level APIs are typically easier to learn and use than TensorFlow Core. In
 addition, the higher level APIs make repetitive tasks easier and more consistent
-between different users. A high-level API like tf.contrib.learn helps you manage
-data sets, estimators, training and inference. Note that a few of the high-level
-TensorFlow APIs--those whose method names contain `contrib`-- are still in
-development. It is possible that some `contrib` methods will change or become
-obsolete in subsequent TensorFlow releases.
+between different users. A high-level API like tf.estimator helps you manage
+data sets, estimators, training and inference.
 
 This guide begins with a tutorial on TensorFlow Core. Later, we
-demonstrate how to implement the same model in tf.contrib.learn. Knowing
+demonstrate how to implement the same model in tf.estimator. Knowing
 TensorFlow Core principles will give you a great mental model of how things are
 working internally when you use the more compact higher level API.
 
@@ -35,8 +32,8 @@ tensor's **rank** is its number of dimensions. Here are some examples of
 tensors:
 
 ```python
-3 # a rank 0 tensor; this is a scalar with shape []
-[1. ,2., 3.] # a rank 1 tensor; this is a vector with shape [3]
+3 # a rank 0 tensor; a scalar with shape []
+[1., 2., 3.] # a rank 1 tensor; a vector with shape [3]
 [[1., 2., 3.], [4., 5., 6.]] # a rank 2 tensor; a matrix with shape [2, 3]
 [[[1., 2., 3.]], [[7., 8., 9.]]] # a rank 3 tensor with shape [2, 1, 3]
 ```
@@ -103,20 +100,20 @@ we see the expected values of 3.0 and 4.0:
 ```
 
 We can build more complicated computations by combining `Tensor` nodes with
-operations (Operations are also nodes.). For example, we can add our two
+operations (Operations are also nodes). For example, we can add our two
 constant nodes and produce a new graph as follows:
 
 ```python
 node3 = tf.add(node1, node2)
-print("node3: ", node3)
-print("sess.run(node3): ",sess.run(node3))
+print("node3:", node3)
+print("sess.run(node3):", sess.run(node3))
 ```
 
 The last two print statements produce
 
 ```
-node3:  Tensor("Add:0", shape=(), dtype=float32)
-sess.run(node3):  7.0
+node3: Tensor("Add:0", shape=(), dtype=float32)
+sess.run(node3): 7.0
 ```
 
 TensorFlow provides a utility called TensorBoard that can display a picture of
@@ -143,8 +140,8 @@ the [run method](https://www.tensorflow.org/api_docs/python/tf/Session#run)
 to feed concrete values to the placeholders:
 
 ```python
-print(sess.run(adder_node, {a: 3, b:4.5}))
-print(sess.run(adder_node, {a: [1,3], b: [2, 4]}))
+print(sess.run(adder_node, {a: 3, b: 4.5}))
+print(sess.run(adder_node, {a: [1, 3], b: [2, 4]}))
 ```
 resulting in the output
 
@@ -162,7 +159,7 @@ For example,
 
 ```python
 add_and_triple = adder_node * 3.
-print(sess.run(add_and_triple, {a: 3, b:4.5}))
+print(sess.run(add_and_triple, {a: 3, b: 4.5}))
 ```
 produces the output
 ```
@@ -184,7 +181,7 @@ initial value:
 W = tf.Variable([.3], dtype=tf.float32)
 b = tf.Variable([-.3], dtype=tf.float32)
 x = tf.placeholder(tf.float32)
-linear_model = W * x + b
+linear_model = W*x + b
 ```
 
 Constants are initialized when you call `tf.constant`, and their value can never
@@ -205,7 +202,7 @@ Since `x` is a placeholder, we can evaluate `linear_model` for several values of
 `x` simultaneously as follows:
 
 ```python
-print(sess.run(linear_model, {x:[1,2,3,4]}))
+print(sess.run(linear_model, {x: [1, 2, 3, 4]}))
 ```
 to produce the output
 ```
@@ -228,7 +225,7 @@ that abstracts the error of all examples using `tf.reduce_sum`:
 y = tf.placeholder(tf.float32)
 squared_deltas = tf.square(linear_model - y)
 loss = tf.reduce_sum(squared_deltas)
-print(sess.run(loss, {x:[1,2,3,4], y:[0,-1,-2,-3]}))
+print(sess.run(loss, {x: [1, 2, 3, 4], y: [0, -1, -2, -3]}))
 ```
 producing the loss value
 ```
@@ -245,7 +242,7 @@ perfect values of -1 and 1. A variable is initialized to the value provided to
 fixW = tf.assign(W, [-1.])
 fixb = tf.assign(b, [1.])
 sess.run([fixW, fixb])
-print(sess.run(loss, {x:[1,2,3,4], y:[0,-1,-2,-3]}))
+print(sess.run(loss, {x: [1, 2, 3, 4], y: [0, -1, -2, -3]}))
 ```
 The final print shows the loss now is zero.
 ```
@@ -276,7 +273,7 @@ train = optimizer.minimize(loss)
 ```python
 sess.run(init) # reset values to incorrect defaults.
 for i in range(1000):
-  sess.run(train, {x:[1,2,3,4], y:[0,-1,-2,-3]})
+  sess.run(train, {x: [1, 2, 3, 4], y: [0, -1, -2, -3]})
 
 print(sess.run([W, b]))
 ```
@@ -298,7 +295,6 @@ next section.
 The completed trainable linear regression model is shown here:
 
 ```python
-import numpy as np
 import tensorflow as tf
 
 # Model parameters
@@ -306,25 +302,27 @@ W = tf.Variable([.3], dtype=tf.float32)
 b = tf.Variable([-.3], dtype=tf.float32)
 # Model input and output
 x = tf.placeholder(tf.float32)
-linear_model = W * x + b
+linear_model = W*x + b
 y = tf.placeholder(tf.float32)
+
 # loss
 loss = tf.reduce_sum(tf.square(linear_model - y)) # sum of the squares
 # optimizer
 optimizer = tf.train.GradientDescentOptimizer(0.01)
 train = optimizer.minimize(loss)
+
 # training data
-x_train = [1,2,3,4]
-y_train = [0,-1,-2,-3]
+x_train = [1, 2, 3, 4]
+y_train = [0, -1, -2, -3]
 # training loop
 init = tf.global_variables_initializer()
 sess = tf.Session()
 sess.run(init) # reset values to wrong
 for i in range(1000):
-  sess.run(train, {x:x_train, y:y_train})
+  sess.run(train, {x: x_train, y: y_train})
 
 # evaluate training accuracy
-curr_W, curr_b, curr_loss = sess.run([W, b, loss], {x:x_train, y:y_train})
+curr_W, curr_b, curr_loss = sess.run([W, b, loss], {x: x_train, y: y_train})
 print("W: %s b: %s loss: %s"%(curr_W, curr_b, curr_loss))
 ```
 When run, it produces
@@ -332,45 +330,43 @@ When run, it produces
 W: [-0.9999969] b: [ 0.99999082] loss: 5.69997e-11
 ```
 
-Notice that the loss is a very small number (close to zero). If you run this
-program your loss will not be exactly the same, because the model is initialized
-with random values.
+Notice that the loss is a very small number (very close to zero). If you run 
+this program, your loss may not be exactly the same as the aforementioned loss 
+because the model is initialized with pseudorandom values.
 
 This more complicated program can still be visualized in TensorBoard
 ![TensorBoard final model visualization](https://www.tensorflow.org/images/getting_started_final.png)
 
-## `tf.contrib.learn`
+## `tf.estimator`
 
-`tf.contrib.learn` is a high-level TensorFlow library that simplifies the
+`tf.estimator` is a high-level TensorFlow library that simplifies the
 mechanics of machine learning, including the following:
 
 *   running training loops
 *   running evaluation loops
 *   managing data sets
-*   managing feeding
 
-tf.contrib.learn defines many common models.
+tf.estimator defines many common models.
 
 ### Basic usage
 
 Notice how much simpler the linear regression program becomes with
-`tf.contrib.learn`:
+`tf.estimator`:
 
 ```python
 import tensorflow as tf
 # NumPy is often used to load, manipulate and preprocess data.
 import numpy as np
 
-# Declare list of features. We only have one real-valued feature. There are many
+# Declare list of features. We only have one numeric feature. There are many
 # other types of columns that are more complicated and useful.
-features = [tf.contrib.layers.real_valued_column("x", dimension=1)]
+feature_columns = [tf.feature_column.numeric_column("x", shape=[1])]
 
 # An estimator is the front end to invoke training (fitting) and evaluation
 # (inference). There are many predefined types like linear regression,
-# logistic regression, linear classification, logistic classification, and
-# many neural network classifiers and regressors. The following code
-# provides an estimator that does linear regression.
-estimator = tf.contrib.learn.LinearRegressor(feature_columns=features)
+# linear classification, and many neural network classifiers and regressors.
+# The following code provides an estimator that does linear regression.
+estimator = tf.estimator.LinearRegressor(feature_columns=feature_columns)
 
 # TensorFlow provides many helper methods to read and set up data sets.
 # Here we use two data sets: one for training and one for evaluation
@@ -380,51 +376,53 @@ x_train = np.array([1., 2., 3., 4.])
 y_train = np.array([0., -1., -2., -3.])
 x_eval = np.array([2., 5., 8., 1.])
 y_eval = np.array([-1.01, -4.1, -7, 0.])
-input_fn = tf.contrib.learn.io.numpy_input_fn({"x":x_train}, y_train,
-                                              batch_size=4,
-                                              num_epochs=1000)
-eval_input_fn = tf.contrib.learn.io.numpy_input_fn(
-    {"x":x_eval}, y_eval, batch_size=4, num_epochs=1000)
+input_fn = tf.estimator.inputs.numpy_input_fn(
+    {"x": x_train}, y_train, batch_size=4, num_epochs=None, shuffle=True)
+train_input_fn = tf.estimator.inputs.numpy_input_fn(
+    {"x": x_train}, y_train, batch_size=4, num_epochs=1000, shuffle=False)
+eval_input_fn = tf.estimator.inputs.numpy_input_fn(
+    {"x": x_eval}, y_eval, batch_size=4, num_epochs=1000, shuffle=False)
 
 # We can invoke 1000 training steps by invoking the  method and passing the
 # training data set.
-estimator.fit(input_fn=input_fn, steps=1000)
+estimator.train(input_fn=input_fn, steps=1000)
 
 # Here we evaluate how well our model did.
-train_loss = estimator.evaluate(input_fn=input_fn)
-eval_loss = estimator.evaluate(input_fn=eval_input_fn)
-print("train loss: %r"% train_loss)
-print("eval loss: %r"% eval_loss)
+train_metrics = estimator.evaluate(input_fn=train_input_fn)
+eval_metrics = estimator.evaluate(input_fn=eval_input_fn)
+print("train metrics: %r"% train_metrics)
+print("eval metrics: %r"% eval_metrics)
 ```
 When run, it produces
 ```
-    train loss: {'global_step': 1000, 'loss': 4.3049088e-08}
-    eval loss: {'global_step': 1000, 'loss': 0.0025487561}
+train metrics: {'loss': 1.2712867e-09, 'global_step': 1000}
+eval metrics: {'loss': 0.0025279333, 'global_step': 1000}
 ```
 Notice how our eval data has a higher loss, but it is still close to zero.
 That means we are learning properly.
 
 ### A custom model
 
-`tf.contrib.learn` does not lock you into its predefined models. Suppose we
+`tf.estimator` does not lock you into its predefined models. Suppose we
 wanted to create a custom model that is not built into TensorFlow. We can still
 retain the high level abstraction of data set, feeding, training, etc. of
-`tf.contrib.learn`. For illustration, we will show how to implement our own
+`tf.estimator`. For illustration, we will show how to implement our own
 equivalent model to `LinearRegressor` using our knowledge of the lower level
 TensorFlow API.
 
-To define a custom model that works with `tf.contrib.learn`, we need to use
-`tf.contrib.learn.Estimator`. `tf.contrib.learn.LinearRegressor` is actually
-a sub-class of `tf.contrib.learn.Estimator`. Instead of sub-classing
+To define a custom model that works with `tf.estimator`, we need to use
+`tf.estimator.Estimator`. `tf.estimator.LinearRegressor` is actually
+a sub-class of `tf.estimator.Estimator`. Instead of sub-classing
 `Estimator`, we simply provide `Estimator` a function `model_fn` that tells
-`tf.contrib.learn` how it can evaluate predictions, training steps, and
+`tf.estimator` how it can evaluate predictions, training steps, and
 loss. The code is as follows:
 
 ```python
 import numpy as np
 import tensorflow as tf
+
 # Declare list of features, we only have one real-valued feature
-def model(features, labels, mode):
+def model_fn(features, labels, mode):
   # Build a linear model and predict values
   W = tf.get_variable("W", [1], dtype=tf.float64)
   b = tf.get_variable("b", [1], dtype=tf.float64)
@@ -436,36 +434,42 @@ def model(features, labels, mode):
   optimizer = tf.train.GradientDescentOptimizer(0.01)
   train = tf.group(optimizer.minimize(loss),
                    tf.assign_add(global_step, 1))
-  # ModelFnOps connects subgraphs we built to the
+  # EstimatorSpec connects subgraphs we built to the
   # appropriate functionality.
-  return tf.contrib.learn.ModelFnOps(
-      mode=mode, predictions=y,
+  return tf.estimator.EstimatorSpec(
+      mode=mode,
+      predictions=y,
       loss=loss,
       train_op=train)
 
-estimator = tf.contrib.learn.Estimator(model_fn=model)
+estimator = tf.estimator.Estimator(model_fn=model_fn)
 # define our data sets
 x_train = np.array([1., 2., 3., 4.])
 y_train = np.array([0., -1., -2., -3.])
 x_eval = np.array([2., 5., 8., 1.])
 y_eval = np.array([-1.01, -4.1, -7, 0.])
-input_fn = tf.contrib.learn.io.numpy_input_fn({"x": x_train}, y_train, 4, num_epochs=1000)
+input_fn = tf.estimator.inputs.numpy_input_fn(
+    {"x": x_train}, y_train, batch_size=4, num_epochs=None, shuffle=True)
+train_input_fn = tf.estimator.inputs.numpy_input_fn(
+    {"x": x_train}, y_train, batch_size=4, num_epochs=1000, shuffle=False)
+eval_input_fn = tf.estimator.inputs.numpy_input_fn(
+    {"x": x_eval}, y_eval, batch_size=4, num_epochs=1000, shuffle=False)
 
 # train
-estimator.fit(input_fn=input_fn, steps=1000)
-# Here we evaluate how well our model did. 
-train_loss = estimator.evaluate(input_fn=input_fn)
-eval_loss = estimator.evaluate(input_fn=eval_input_fn)
-print("train loss: %r"% train_loss)
-print("eval loss: %r"% eval_loss)
+estimator.train(input_fn=input_fn, steps=1000)
+# Here we evaluate how well our model did.
+train_metrics = estimator.evaluate(input_fn=train_input_fn)
+eval_metrics = estimator.evaluate(input_fn=eval_input_fn)
+print("train metrics: %r"% train_metrics)
+print("eval metrics: %r"% eval_metrics)
 ```
 When run, it produces
 ```
-train loss: {'global_step': 1000, 'loss': 4.9380226e-11}
-eval loss: {'global_step': 1000, 'loss': 0.01010081}
+train metrics: {'loss': 1.227995e-11, 'global_step': 1000}
+eval metrics: {'loss': 0.01010036, 'global_step': 1000}
 ```
 
-Notice how the contents of the custom `model()` function are very similar
+Notice how the contents of the custom `model_fn()` function are very similar
 to our manual model training loop from the lower level API.
 
 ## Next steps
diff --git a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md
index 241263c72c050eb031241c29859a35a5dcd7930d..dd69408cba4659c1e9be193419f30c1f9ec11b95 100644
--- a/tensorflow/docs_src/get_started/index.md
+++ b/tensorflow/docs_src/get_started/index.md
@@ -11,21 +11,23 @@ to training an MNIST model on TensorFlow:
 
   * @{$mnist/beginners$MNIST for ML Beginners}, which introduces MNIST through
     the high-level API.
-  * @{$mnist/pros$Deep MNIST for Experts}, which is more-in depth than 
-    "MNIST for ML Beginners," and assumes some familiarity with machine 
+  * @{$mnist/pros$Deep MNIST for Experts}, which is more-in depth than
+    "MNIST for ML Beginners," and assumes some familiarity with machine
     learning concepts.
   * @{$mnist/mechanics$TensorFlow Mechanics 101}, which introduces MNIST through
     the low-level API.
 
-For developers new to TensorFlow, the high-level API is a good place to start. 
+For developers new to TensorFlow, the high-level API is a good place to start.
 To learn about the high-level API, read the following guides:
 
-  * @{$get_started/tflearn$tf.contrib.learn Quickstart}, which introduces this
+  * @{$get_started/estimator$tf.estimator Quickstart}, which introduces this
     API.
   * @{$get_started/input_fn$Building Input Functions with tf.contrib.learn},
     which takes you into a somewhat more sophisticated use of this API.
   * @{$get_started/monitors$Logging and Monitoring Basics with tf.contrib.learn},
     which explains how to audit the progress of model training.
+  * @{$get_started/export$Exporting a Trained Model for Serving}, which shows
+    how to save a trained model in a form that is ready to deploy.
 
 TensorBoard is a utility to visualize different aspects of machine learning.
 The following guides explain how to use TensorBoard:
diff --git a/tensorflow/docs_src/get_started/input_fn.md b/tensorflow/docs_src/get_started/input_fn.md
index a053617b5895bd2a92784e64b4dfd6f1ac35ab53..422f45c586aa587c22f9d72eab23833f85b5a2eb 100644
--- a/tensorflow/docs_src/get_started/input_fn.md
+++ b/tensorflow/docs_src/get_started/input_fn.md
@@ -1,6 +1,6 @@
-# Building Input Functions with tf.contrib.learn
+# Building Input Functions with tf.estimator
 
-This tutorial introduces you to creating input functions in tf.contrib.learn.
+This tutorial introduces you to creating input functions in tf.estimator.
 You'll get an overview of how to construct an `input_fn` to preprocess and feed
 data into your models. Then, you'll implement an `input_fn` that feeds training,
 evaluation, and prediction data into a neural network regressor for predicting
@@ -8,26 +8,25 @@ median house values.
 
 ## Custom Input Pipelines with input_fn
 
-When training a neural network using tf.contrib.learn, it's possible to pass
-your feature and target data directly into your `fit`, `evaluate`, or `predict`
-operations. Here's an example taken from the @{$tflearn$tf.contrib.learn quickstart tutorial}:
+The `input_fn` is used to pass feature and target data to the `train`,
+`evaluate`, and `predict` methods of the `Estimator`.
+The user can do feature engineering or pre-processing inside the `input_fn`.
+Here's an example taken from the @{$estimator$tf.estimator Quickstart tutorial}:
 
 ```python
+import numpy as np
+
 training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
     filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32)
-test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-    filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float32)
-...
 
-classifier.fit(x=training_set.data,
-               y=training_set.target,
-               steps=2000)
-```
+train_input_fn = tf.estimator.inputs.numpy_input_fn(
+    x={"x": np.array(training_set.data)},
+    y=np.array(training_set.target),
+    num_epochs=None,
+    shuffle=True)
 
-This approach works well when little to no manipulation of source data is
-required. But in cases where more feature engineering is needed,
-`tf.contrib.learn` supports using a custom input function (`input_fn`) to
-encapsulate the logic for preprocessing and piping data into your models.
+classifier.train(input_fn=train_input_fn, steps=2000)
+```
 
 ### Anatomy of an input_fn
 
@@ -43,8 +42,9 @@ def my_input_fn():
     return feature_cols, labels
 ```
 
-The body of the input function contains the specific logic for preprocessing your
-input data, such as scrubbing out bad examples or [feature scaling](https://en.wikipedia.org/wiki/Feature_scaling).
+The body of the input function contains the specific logic for preprocessing
+your input data, such as scrubbing out bad examples or
+[feature scaling](https://en.wikipedia.org/wiki/Feature_scaling).
 
 Input functions must return the following two values containing the final
 feature and label data to be fed into your model (as shown in the above code
@@ -61,15 +61,27 @@ data.</dd>
 
 ### Converting Feature Data to Tensors
 
-If your feature/label data is stored in [_pandas_](http://pandas.pydata.org/)
-dataframes or [numpy](http://www.numpy.org/) arrays, you'll need to convert it
-to `Tensor`s before returning it from your `input_fn`.
+If your feature/label data is a python array or stored in
+[_pandas_](http://pandas.pydata.org/) dataframes or
+[numpy](http://www.numpy.org/) arrays, you can use the following methods to
+construct `input_fn`:
 
-For continuous data, you can create and populate a `Tensor` using `tf.constant`:
+```python
+import numpy as np
+# numpy input_fn.
+my_input_fn = tf.estimator.inputs.numpy_input_fn(
+    x={"x": np.array(x_data)},
+    y=np.array(y_data),
+    ...)
+```
 
 ```python
-feature_column_data = [1, 2.4, 0, 9.9, 3, 120]
-feature_tensor = tf.constant(feature_column_data)
+import pandas as pd
+# pandas input_fn.
+my_input_fn = tf.estimator.inputs.pandas_input_fn(
+    x=pd.DataFrame({"x": x_data}),
+    y=pd.Series(y_data),
+    ...)
 ```
 
 For [sparse, categorical data](https://en.wikipedia.org/wiki/Sparse_matrix)
@@ -103,33 +115,26 @@ This corresponds to the following dense tensor:
  [0, 0, 0, 0, 0.5]]
 ```
 
-For more on `SparseTensor`, see the
-@{tf.SparseTensor}.
+For more on `SparseTensor`, see @{tf.SparseTensor}.
 
 ### Passing input_fn Data to Your Model
 
 To feed data to your model for training, you simply pass the input function
-you've created to your `fit` operation as the value of the `input_fn` parameter,
-e.g.:
+you've created to your `train` operation as the value of the `input_fn`
+parameter, e.g.:
 
 ```python
-classifier.fit(input_fn=my_input_fn, steps=2000)
+classifier.train(input_fn=my_input_fn, steps=2000)
 ```
 
-Note that the `input_fn` is responsible for supplying both feature and label
-data to the model, and replaces both the `x` and `y` parameters in `fit`. If you
-supply an `input_fn` value to `fit` that is not `None` in conjunction with
-either an `x` or `y` parameter that is not `None`, it will result in a
-`ValueError`.
-
-Also note that the `input_fn` parameter must receive a function object (i.e.,
+Note that the `input_fn` parameter must receive a function object (i.e.,
 `input_fn=my_input_fn`), not the return value of a function call
-(`input_fn=my_input_fn()`). This means that if you try to pass parameters to the input
-function in your `fit` call, as in the following code, it will result in a
+(`input_fn=my_input_fn()`). This means that if you try to pass parameters to the
+`input_fn` in your `train` call, as in the following code, it will result in a
 `TypeError`:
 
 ```python
-classifier.fit(input_fn=my_input_fn(training_set), steps=2000)
+classifier.train(input_fn=my_input_fn(training_set), steps=2000)
 ```
 
 However, if you'd like to be able to parameterize your input function, there are
@@ -138,29 +143,33 @@ arguments as your `input_fn` and use it to invoke your input function
 with the desired parameters. For example:
 
 ```python
-def my_input_function_training_set():
-  return my_input_function(training_set)
+def my_input_fn(data_set):
+  ...
 
-classifier.fit(input_fn=my_input_fn_training_set, steps=2000)
+def my_input_fn_training_set():
+  return my_input_fn(training_set)
+
+classifier.train(input_fn=my_input_fn_training_set, steps=2000)
 ```
 
 Alternatively, you can use Python's [`functools.partial`](https://docs.python.org/2/library/functools.html#functools.partial)
 function to construct a new function object with all parameter values fixed:
 
 ```python
-classifier.fit(input_fn=functools.partial(my_input_function,
-                                          data_set=training_set), steps=2000)
+classifier.train(
+    input_fn=functools.partial(my_input_fn, data_set=training_set),
+    steps=2000)
 ```
 
-A third option is to wrap your input_fn invocation in a
+A third option is to wrap your `input_fn` invocation in a
 [`lambda`](https://docs.python.org/3/tutorial/controlflow.html#lambda-expressions)
 and pass it to the `input_fn` parameter:
 
 ```python
-classifier.fit(input_fn=lambda: my_input_fn(training_set), steps=2000)
+classifier.train(input_fn=lambda: my_input_fn(training_set), steps=2000)
 ```
 
-One big advantage of architecting your input pipeline as shown above—to accept a
+One big advantage of designing your input pipeline as shown above—to accept a
 parameter for data set—is that you can pass the same `input_fn` to `evaluate`
 and `predict` operations by just changing the data set argument, e.g.:
 
@@ -168,9 +177,36 @@ and `predict` operations by just changing the data set argument, e.g.:
 classifier.evaluate(input_fn=lambda: my_input_fn(test_set), steps=2000)
 ```
 
-This approach enhances code maintainability: no need to capture `x` and `y`
-values in separate variables (e.g., `x_train`, `x_test`, `y_train`, `y_test`)
-for each type of operation.
+This approach enhances code maintainability: no need to define multiple
+`input_fn` (e.g. `input_fn_train`, `input_fn_test`, `input_fn_predict`) for each
+type of operation.
+
+Finally, you can use the methods in `tf.estimator.inputs` to create `input_fn`
+from numpy or pandas data sets. The additional benefit is that you can use
+more arguments, such as `num_epochs` and `shuffle` to control how the `input_fn`
+iterates over the data:
+
+```python
+import pandas as pd
+
+def get_input_fn_from_pandas(data_set, num_epochs=None, shuffle=True):
+  return tf.estimator.inputs.pandas_input_fn(
+      x=pdDataFrame(...),
+      y=pd.Series(...),
+      num_epochs=num_epochs,
+      shuffle=shuffle)
+```
+
+```python
+import numpy as np
+
+def get_input_fn_from_numpy(data_set, num_epochs=None, shuffle=True):
+  return tf.estimator.inputs.numpy_input_fn(
+      x={...},
+      y=np.array(...),
+      num_epochs=num_epochs,
+      shuffle=shuffle)
+```
 
 ### A Neural Network Model for Boston House Values
 
@@ -259,8 +295,7 @@ housing data set contain continuous values, you can create their
 `FeatureColumn`s using the `tf.contrib.layers.real_valued_column()` function:
 
 ```python
-feature_cols = [tf.contrib.layers.real_valued_column(k)
-                  for k in FEATURES]
+feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURES]
 ```
 
 NOTE: For a more in-depth overview of feature columns, see
@@ -275,36 +310,47 @@ with 10 nodes each), and `feature_columns`, containing the list of
 `FeatureColumns` you just defined:
 
 ```python
-regressor = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
-                                          hidden_units=[10, 10],
-                                          model_dir="/tmp/boston_model")
+regressor = tf.estimator.DNNRegressor(feature_columns=feature_cols,
+                                      hidden_units=[10, 10],
+                                      model_dir="/tmp/boston_model")
 ```
 
 ### Building the input_fn
 
-To pass input data into the `regressor`, create an input function, which will
-accept a _pandas_ `Dataframe` and return feature column and label values as
-`Tensor`s:
+To pass input data into the `regressor`, write a factory method that accepts a
+_pandas_ `Dataframe` and returns an `input_fn`:
 
 ```python
-def input_fn(data_set):
-  feature_cols = {k: tf.constant(data_set[k].values)
-                  for k in FEATURES}
-  labels = tf.constant(data_set[LABEL].values)
-  return feature_cols, labels
+def get_input_fn(data_set, num_epochs=None, shuffle=True):
+  return tf.estimator.inputs.pandas_input_fn(
+      x=pd.DataFrame({k: data_set[k].values for k in FEATURES}),
+      y = pd.Series(data_set[LABEL].values),
+      num_epochs=num_epochs,
+      shuffle=shuffle)
 ```
 
 Note that the input data is passed into `input_fn` in the `data_set` argument,
 which means the function can process any of the `DataFrame`s you've imported:
 `training_set`, `test_set`, and `prediction_set`.
 
+Two additional arguments are provided:
+* `num_epochs`: controls the number of
+  epochs to iterate over data. For training, set this to `None`, so the
+  `input_fn` keeps returning data until the required number of train steps is
+  reached. For evaluate and predict, set this to 1, so the `input_fn` will
+  iterate over the data once and then raise `OutOfRangeError`. That error will
+  signal the `Estimator` to stop evaluate or predict.
+* `shuffle`: Whether to shuffle the data. For evaluate and predict, set this to
+  `False`, so the `input_fn` iterates over the data sequentially. For train,
+  set this to `True`.
+
 ### Training the Regressor
 
-To train the neural network regressor, run `fit` with the `training_set` passed
-to the `input_fn` as follows:
+To train the neural network regressor, run `train` with the `training_set`
+passed to the `input_fn` as follows:
 
 ```python
-regressor.fit(input_fn=lambda: input_fn(training_set), steps=5000)
+regressor.train(input_fn=get_input_fn(training_set), steps=5000)
 ```
 
 You should see log output similar to the following, which reports training loss
@@ -330,7 +376,8 @@ Next, see how the trained model performs against the test data set. Run
 `evaluate`, and this time pass the `test_set` to the `input_fn`:
 
 ```python
-ev = regressor.evaluate(input_fn=lambda: input_fn(test_set), steps=1)
+ev = regressor.evaluate(
+    input_fn=get_input_fn(test_set, num_epochs=1, shuffle=False))
 ```
 
 Retrieve the loss from the `ev` results and print it to output:
@@ -354,10 +401,12 @@ Finally, you can use the model to predict median house values for the
 `prediction_set`, which contains feature data but no labels for six examples:
 
 ```python
-y = regressor.predict(input_fn=lambda: input_fn(prediction_set))
-# .predict() returns an iterator; convert to a list and print predictions
-predictions = list(itertools.islice(y, 6))
-print ("Predictions: {}".format(str(predictions)))
+y = regressor.predict(
+    input_fn=get_input_fn(prediction_set, num_epochs=1, shuffle=False))
+# .predict() returns an iterator of dicts; convert to a list and print
+# predictions
+predictions = list(p["predictions"] for p in itertools.islice(y, 6))
+print("Predictions: {}".format(str(predictions)))
 ```
 
 Your results should contain six house-value predictions in thousands of dollars,
diff --git a/tensorflow/docs_src/get_started/leftnav_files b/tensorflow/docs_src/get_started/leftnav_files
index 812f248d3ebfdf7439d9324b47825c2facf951c2..656727fbfe0942f69cf9a17a2313d6dcedff119b 100644
--- a/tensorflow/docs_src/get_started/leftnav_files
+++ b/tensorflow/docs_src/get_started/leftnav_files
@@ -3,7 +3,7 @@ get_started.md
 mnist/beginners.md
 mnist/pros.md
 mnist/mechanics.md
-tflearn.md
+estimator.md
 input_fn.md
 monitors.md
 summaries_and_tensorboard.md
diff --git a/tensorflow/docs_src/get_started/mnist/beginners.md b/tensorflow/docs_src/get_started/mnist/beginners.md
index 624d91647484bb0adf85b47179c2ac686ffc890f..193dd41b2abece3ee2cccc6c48a5b98b5c4f7670 100644
--- a/tensorflow/docs_src/get_started/mnist/beginners.md
+++ b/tensorflow/docs_src/get_started/mnist/beginners.md
@@ -95,7 +95,7 @@ We can flatten this array into a vector of 28x28 = 784 numbers. It doesn't
 matter how we flatten the array, as long as we're consistent between images.
 From this perspective, the MNIST images are just a bunch of points in a
 784-dimensional vector space, with a
-[very rich structure](http://colah.github.io/posts/2014-10-Visualizing-MNIST/)
+[very rich structure](https://colah.github.io/posts/2014-10-Visualizing-MNIST/)
 (warning: computationally intensive visualizations).
 
 Flattening the data throws away information about the 2D structure of the image.
@@ -231,7 +231,7 @@ Now let's turn that into something that TensorFlow can use.
 
 
 To do efficient numerical computing in Python, we typically use libraries like
-[NumPy](http://www.numpy.org/) that do expensive operations such as matrix
+[NumPy](http://www.numpy.org) that do expensive operations such as matrix
 multiplication outside Python, using highly efficient code implemented in
 another language.  Unfortunately, there can still be a lot of overhead from
 switching back to Python every operation. This overhead is especially bad if you
@@ -324,7 +324,7 @@ distribution (the one-hot vector with the digit labels).  In some rough sense, t
 cross-entropy is measuring how inefficient our predictions are for describing
 the truth. Going into more detail about cross-entropy is beyond the scope of
 this tutorial, but it's well worth
-[understanding](http://colah.github.io/posts/2015-09-Visual-Information/).
+[understanding](https://colah.github.io/posts/2015-09-Visual-Information).
 
 To implement cross-entropy we need to first add a new placeholder to input the
 correct answers:
@@ -356,13 +356,13 @@ instead.
 Now that we know what we want our model to do, it's very easy to have TensorFlow
 train it to do so.  Because TensorFlow knows the entire graph of your
 computations, it can automatically use the
-[backpropagation algorithm](http://colah.github.io/posts/2015-08-Backprop/) to
+[backpropagation algorithm](https://colah.github.io/posts/2015-08-Backprop) to
 efficiently determine how your variables affect the loss you ask it to
 minimize. Then it can apply your choice of optimization algorithm to modify the
 variables and reduce the loss.
 
 ```python
-train_step = tf.train.GradientDescentOptimizer(0.05).minimize(cross_entropy)
+train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
 ```
 
 In this case, we ask TensorFlow to minimize `cross_entropy` using the
@@ -447,7 +447,7 @@ Is that good? Well, not really. In fact, it's pretty bad. This is because we're
 using a very simple model. With some small changes, we can get to 97%. The best
 models can get to over 99.7% accuracy! (For more information, have a look at
 this
-[list of results](http://rodrigob.github.io/are_we_there_yet/build/classification_datasets_results.html).)
+[list of results](https://rodrigob.github.io/are_we_there_yet/build/classification_datasets_results).)
 
 What matters is that we learned from this model. Still, if you're feeling a bit
 down about these results, check out
diff --git a/tensorflow/docs_src/get_started/mnist/pros.md b/tensorflow/docs_src/get_started/mnist/pros.md
index d50e874d521edeffae999fc9215c6caf2385668d..4933dd28cd37e695a10ab28832f26a613589d01a 100644
--- a/tensorflow/docs_src/get_started/mnist/pros.md
+++ b/tensorflow/docs_src/get_started/mnist/pros.md
@@ -250,6 +250,12 @@ section, we'll fix that, jumping from a very simple model to something
 moderately sophisticated: a small convolutional neural network. This will get us
 to around 99.2% accuracy -- not state of the art, but respectable.
 
+Here is a diagram, created with TensorBoard, of the model we will build:
+
+<div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img src="https://www.tensorflow.org/images/mnist_deep.png">
+</div>
+
 ### Weight Initialization
 
 To create this model, we're going to need to create a lot of weights and biases.
diff --git a/tensorflow/docs_src/get_started/monitors.md b/tensorflow/docs_src/get_started/monitors.md
index d9c605b013cca5e4bad21fd7167a0cca345c3251..5606e95365812a7287b844a86172287c1aafa766 100644
--- a/tensorflow/docs_src/get_started/monitors.md
+++ b/tensorflow/docs_src/get_started/monitors.md
@@ -4,14 +4,14 @@ When training a model, it’s often valuable to track and evaluate progress in
 real time. In this tutorial, you’ll learn how to use TensorFlow’s logging
 capabilities and the `Monitor` API to audit the in-progress training of a neural
 network classifier for categorizing irises. This tutorial builds on the code
-developed in @{$tflearn$tf.contrib.learn Quickstart} so if you
+developed in @{$estimator$tf.estimator Quickstart} so if you
 haven't yet completed that tutorial, you may want to explore it first,
 especially if you're looking for an intro/refresher on tf.contrib.learn basics.
 
 ## Setup {#setup}
 
 For this tutorial, you'll be building upon the following code from
-@{$tflearn$tf.contrib.learn Quickstart}:
+@{$estimator$tf.estimator Quickstart}:
 
 ```python
 from __future__ import absolute_import
@@ -75,7 +75,7 @@ here](https://www.tensorflow.org/code/tensorflow/examples/tutorials/monitors/iri
 
 ## Overview
 
-The @{$tflearn$tf.contrib.learn Quickstart tutorial} walked through
+The @{$estimator$tf.estimator Quickstart tutorial} walked through
 how to implement a neural net classifier to categorize iris examples into one of
 three species.
 
diff --git a/tensorflow/docs_src/get_started/summaries_and_tensorboard.md b/tensorflow/docs_src/get_started/summaries_and_tensorboard.md
index 45d43e7a6e76ef9adc95cf2ebe5fe346de22caee..ece8fbf43c3c999306f998710b2b021e10867d8d 100644
--- a/tensorflow/docs_src/get_started/summaries_and_tensorboard.md
+++ b/tensorflow/docs_src/get_started/summaries_and_tensorboard.md
@@ -17,7 +17,7 @@ TensorBoard is fully configured, it looks like this:
 </div>
 
 This tutorial is intended to get you started with simple TensorBoard usage.
-There are other resources available as well! The [TensorBoard README](https://www.tensorflow.org/code/tensorflow/tensorboard/README.md)
+There are other resources available as well! The [TensorBoard's GitHub](https://github.com/tensorflow/tensorboard)
 has a lot more information on TensorBoard usage, including tips & tricks, and
 debugging information.
 
@@ -216,5 +216,4 @@ corner. Each tab represents a set of serialized data that can be visualized.
 For in depth information on how to use the *graph* tab to visualize your graph,
 see @{$graph_viz$TensorBoard: Graph Visualization}.
 
-For more usage information on TensorBoard in general, see the [TensorBoard
-README](https://www.tensorflow.org/code/tensorflow/tensorboard/README.md).
+For more usage information on TensorBoard in general, see the [TensorBoard's GitHub](https://github.com/tensorflow/tensorboard).
diff --git a/tensorflow/docs_src/get_started/tensorboard_histograms.md b/tensorflow/docs_src/get_started/tensorboard_histograms.md
index b3dd13497eb598d7e86efae5529396bd472edc31..918deda190a930e504b7d1b213a2af611b2e919e 100644
--- a/tensorflow/docs_src/get_started/tensorboard_histograms.md
+++ b/tensorflow/docs_src/get_started/tensorboard_histograms.md
@@ -34,6 +34,8 @@ tf.summary.histogram("normal/moving_mean", mean_moving_normal)
 sess = tf.Session()
 writer = tf.summary.FileWriter("/tmp/histogram_example")
 
+summaries = tf.summary.merge_all()
+
 # Setup a loop and write the summaries to disk
 N = 400
 for step in range(N):
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 91189f199da20397addbf131da3f67b298f29b72..b83113438c7976e97979ce856bed8e6f30e1b7b2 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -1,7 +1,7 @@
 # Installing TensorFlow for C
 
 TensorFlow provides a C API defined in
-[`c_api.h`](https://github.com/tensorflow/tensorflow/tree/master/c/c_api.h),
+[`c_api.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/c/c_api.h),
 which is suitable for
 [building bindings for other languages](https://www.tensorflow.org/extend/language_bindings).
 The API leans towards simplicity and uniformity rather than convenience.
@@ -35,7 +35,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for Mac OS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.2.0-rc2.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.3.0-rc0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index c9b8dffadb1e8b13a5b04e40f38c0dc2d9bbecd4..068a42d16bf2153ff0450ded79a074406e1bd275 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -35,7 +35,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.2.0-rc2.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.3.0-rc0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 612c4c94f232cbc5c31029b6388ab64b92380cb3..bf0d03903d4adac9a3a015e20f24acf1d18e1f8c 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -34,7 +34,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.2.0-rc2</version>
+  <version>1.3.0-rc0</version>
 </dependency>
 ```
 
@@ -63,7 +63,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.2.0-rc2</version>
+                 <version>1.3.0-rc0</version>
                </dependency>
              </dependencies>
          </project>
@@ -122,7 +122,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or Mac OS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.2.0-rc2.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.3.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -141,7 +141,7 @@ Take the following steps to install TensorFlow for Java on Linux or Mac OS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.2.0-rc2.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.3.0-rc0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -149,10 +149,10 @@ Take the following steps to install TensorFlow for Java on Linux or Mac OS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.2.0-rc2.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.3.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.2.0-rc2.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.3.0-rc0.zip).
   3. Extract this .zip file.
 
 
@@ -200,7 +200,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.2.0-rc2.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.3.0-rc0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -214,11 +214,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and Mac OS X:
 
-<pre><b>java -cp libtensorflow-1.2.0-rc2.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.3.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.2.0-rc2.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.3.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 8ce4acda13100ec54e8e3aad2779a4bd5addb23d..55e1a35a4db4abd24cfe88f7d6cd0b67792d6cd7 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -167,12 +167,12 @@ Take the following steps to install TensorFlow with Virtualenv:
      Python version, and GPU support. Find the appropriate value for
      <code><em>tfBinaryURL</em></code> for your system
      [here](#the_url_of_the_tensorflow_python_package).  For example, if you
-     are installing TensorFlow for Linux, Python 2.7, and CPU-only support,
+     are installing TensorFlow for Linux, Python 3.4, and CPU-only support,
      issue the following command to install TensorFlow in the active
      virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -272,12 +272,12 @@ take the following steps:
      Python version, and GPU support. Find the appropriate value for
      <code><em>tfBinaryURL</em></code>
      [here](#the_url_of_the_tensorflow_python_package).  For example, to
-     install TensorFlow for Linux, Python 2.7, and CPU-only support, issue
+     install TensorFlow for Linux, Python 3.4, and CPU-only support, issue
      the following command:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc0-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -460,11 +460,11 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      where <code><em>tfBinaryURL</em></code> is the
      [URL of the TensorFlow Python package](#the_url_of_the_tensorflow_python_package).
      For example, the following command installs the CPU-only version of
-     TensorFlow for Python 2.7:
+     TensorFlow for Python 3.4:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -632,14 +632,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0rc2-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -651,14 +651,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0rc2-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -670,14 +670,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0rc2-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -689,14 +689,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.2.0rc2-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.2.0rc2-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index f85ecefb8373faacb8e8822435a4a24196715541..0a17b6bf67b71747339209e8abc405eb3f4775fd 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -109,7 +109,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0rc2-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0rc0-py2-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -230,7 +230,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0rc2-py2-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0rc0-py2-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -339,7 +339,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0rc2-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0rc0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -512,7 +512,7 @@ This section documents the relevant values for Mac OS installations.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0rc2-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0rc0-py2-none-any.whl
 </pre>
 
 
@@ -520,7 +520,7 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0rc2-py2-none-a
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.2.0rc2-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0rc0-py3-none-any.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index c4554923879343ba1fc754e46762b54e29840c79..63c8c625b0a0eadee5a7911c201003cb183ce000 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -342,10 +342,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.2.0rc2 on Linux:
+for TensorFlow 1.3.0rc0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.2.0rc2-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.3.0rc0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md
index 42412b9d89c0e8684def0d2aea021240e3f8f144..58749a53f18d638f43541b20ccb043984529c030 100644
--- a/tensorflow/docs_src/install/install_windows.md
+++ b/tensorflow/docs_src/install/install_windows.md
@@ -74,7 +74,7 @@ Use that package at your own risk.
 If the following version of Python is not installed on your machine,
 install it now:
 
-  * [Python 3.5.x from python.org](https://www.python.org/downloads/release/python-352/)
+  * [Python 3.5.x 64-bit from python.org](https://www.python.org/downloads/release/python-352/)
 
 TensorFlow only supports version 3.5.x of Python on Windows.
 Note that Python 3.5.x comes with the pip3 package manager, which is the
@@ -115,12 +115,12 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      environment. To install the CPU-only version of TensorFlow, enter the
      following command:
 
-     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/cpu/tensorflow-1.2.0rc2-cp35-cp35m-win_amd64.whl</b> </pre>
+     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/cpu/tensorflow-1.3.0rc0-cp35-cp35m-win_amd64.whl</b> </pre>
 
      To install the GPU version of TensorFlow, enter the following command
      (on a single line):
 
-     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/gpu/tensorflow_gpu-1.2.0rc2-cp35-cp35m-win_amd64.whl</b> </pre>
+     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/gpu/tensorflow_gpu-1.3.0rc0-cp35-cp35m-win_amd64.whl</b> </pre>
 
 ## Validate your installation
 
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index 07c5d3087f35e6a3dbe7369006d1a4d84517e9e4..9ac60024a1de265aaffb6e9d854a702c260d7e59 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -52,7 +52,8 @@ bazel build -c opt --copt=-march="broadwell" --config=cuda //tensorflow/tools/pi
     (pascal): 6.2, Titan X (maxwell): 5.2, and K80: 3.7.
 *   Install the latest CUDA platform and cuDNN libraries.
 *   Make sure to use a version of gcc that supports all of the optimizations of
-    the target CPU. The recommended minimum gcc version is 4.8.3.
+    the target CPU. The recommended minimum gcc version is 4.8.3.  On OS X upgrade
+    to the latest Xcode version and use the version of clang that comes with Xcode.
 *   TensorFlow checks on startup whether it has been compiled with the
     optimizations available on the CPU. If the optimizations are not included,
     TensorFlow will emit warnings, e.g. AVX, AVX2, and FMA instructions not
@@ -103,7 +104,7 @@ with tf.device('/cpu:0'):
 Under some circumstances, both the CPU and GPU can be starved for data by the
 I/O system. If you are using many small files to form your input data set, you
 may be limited by the speed of your filesystem. If your training loop runs
-faster when using SSDs vs HDDs for storing your input data, you could could be
+faster when using SSDs vs HDDs for storing your input data, you could be
 I/O bottlenecked.
 
 If this is the case, you should pre-process your input data, creating a few
@@ -122,6 +123,11 @@ format.
 The best practice is to build models that work with both `NCHW` and `NHWC` as it
 is common to train using `NCHW` on GPU, and then do inference with NHWC on CPU.
 
+There are edge cases where `NCHW` can be slower on GPU than `NHWC`. One
+[case](https://github.com/tensorflow/tensorflow/issues/7551#issuecomment-280421351)
+is using non-fused batch norm on WRN-16-4 without dropout. In that case using
+fused batch norm, which is also recommended, is the optimal solution.
+
 The very brief history of these two formats is that TensorFlow started by using
 `NHWC` because it was a little faster on CPUs. Then the TensorFlow team
 discovered that `NCHW` performs better when using the NVIDIA cuDNN library.  The
diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md
index a37748d0c91df8e1232e4ed8718ee03529df128e..d050fc5c56df2bbc5e7fedc3f92317dff0f99b0f 100644
--- a/tensorflow/docs_src/performance/quantization.md
+++ b/tensorflow/docs_src/performance/quantization.md
@@ -93,7 +93,7 @@ curl http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.t
 tar xzf /tmp/inceptionv3.tgz -C /tmp/
 bazel build tensorflow/tools/graph_transforms:transform_graph
 bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
-  --in_graph=/tmp/classify_image_graph_def.pb \
+  --inputs="Mul" --in_graph=/tmp/classify_image_graph_def.pb \
   --outputs="softmax" --out_graph=/tmp/quantized_graph.pb \
   --transforms='add_default_attributes strip_unused_nodes(type=float, shape="1,299,299,3")
     remove_nodes(op=Identity, op=CheckNumerics) fold_constants(ignore_errors=true)
@@ -108,12 +108,6 @@ versus 91MB). You can still run this model using exactly the same inputs and
 outputs though, and you should get equivalent results. Here's an example:
 
 ```sh
-# Note: You need to add the dependencies of the quantization operation to the
-#       cc_binary in the BUILD file of the label_image program:
-#
-#     //tensorflow/contrib/quantization:cc_ops
-#     //tensorflow/contrib/quantization/kernels:quantized_ops
-
 bazel build tensorflow/examples/label_image:label_image
 bazel-bin/tensorflow/examples/label_image/label_image \
 --image=<input-image> \
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index b970aa5f5fe4b8fa6d31f3b68f3a41b97c22b2de..9cb27c7e956dd9643d21450a30e17a9369b664cd 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -61,6 +61,42 @@ Invokes a computation with the given arguments.
 The arity and types of the `args` must match the parameters of the
 `computation`. It is allowed to have no `args`.
 
+## Clamp
+
+See also
+[`ComputationBuilder::Clamp`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+
+Clamps an operand to within the range between a minimum and maximum value.
+
+<b> `Clamp(computation, args...)` </b>
+
+| Arguments     | Type                    | Semantics                        |
+| ------------- | ----------------------- | -------------------------------- |
+| `computation` | `Computation`           | computation of type `T_0, T_1,   |
+:               :                         : ..., T_N -> S` with N parameters :
+:               :                         : of arbitrary type                :
+| `operand`     | `ComputationDataHandle` | array of type T                  |
+| `min`         | `ComputationDataHandle` | array of type T                  |
+| `max`         | `ComputationDataHandle` | array of type T                  |
+
+Given an operand and minimum and maximum values, returns the operand if it is in
+the range between the minimum and maximum, else returns the minimum value if the
+operand is below this range or the maximum value if the operand is above this
+range.  That is, `clamp(x, a, b) =  max(min(x, a), b)`.
+
+All three arrays must be the same shape. Alternately, as a restricted form of
+[broadcasting](broadcasting.md), `min` and/or `max` can be a scalar of type `T`.
+
+Example with scalar `min` and `max`:
+
+```
+let operand: s32[3] = {-1, 5, 9};
+let min: s32 = 0;
+let max: s32 = 6;
+==>
+Clamp(operand, min, max) = s32[3]{0, 5, 6};
+```
+
 ## Collapse
 
 See also
@@ -547,6 +583,8 @@ ComputationBuilder supports these element-wise unary functions:
 
 <b>`Ceil(operand)`</b> Element-wise ceil `x -> ⌈x⌉`.
 
+<b>`Cos(operand)`</b> Element-wise cosine `x -> cos(x)`.
+
 <b>`Exp(operand)`</b> Element-wise natural exponential `x -> e^x`.
 
 <b>`Floor(operand)`</b> Element-wise floor `x -> ⌊x⌋`.
@@ -586,7 +624,7 @@ See also
 [`the original batch normalization paper`](https://arxiv.org/abs/1502.03167)
 for a detailed description of the algorithm.
 
-<b> Warning: Not implemented yet </b>
+<b> Warning: Not implemented on GPU backend yet. </b>
 
 Normalizes an array across batch and spatial dimensions.
 
@@ -643,7 +681,7 @@ spatial dimensions using the formulars above.
 See also
 [`ComputationBuilder::BatchNormInference`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-<b> Warning: Not implemented yet </b>
+<b> Warning: Not implemented yet. </b>
 
 Normalizes an array across batch and spatial dimensions.
 
@@ -680,11 +718,11 @@ The output is a n dimensional, normalized array with the same shape as input
 See also
 [`ComputationBuilder::BatchNormGrad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-<b> Warning: Not implemented yet </b>
+<b> Warning: Not implemented yet. </b>
 
 Calculates gradients of batch norm.
 
-<b> `BatchNormGrad(x, scale, mean, variance, epsilon, grad_y, feature_index)` </b>
+<b> `BatchNormGrad(operand, scale, mean, variance, grad_output, epsilon, feature_index)` </b>
 
 | Arguments       | Type                    | Semantics                        |
 | --------------  | ----------------------- | -------------------------------- |
diff --git a/tensorflow/docs_src/programmers_guide/data_versions.md b/tensorflow/docs_src/programmers_guide/data_versions.md
deleted file mode 100644
index 006aa2a8267d58a1b1e7e6eba06ddd85e92e69f8..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/programmers_guide/data_versions.md
+++ /dev/null
@@ -1,132 +0,0 @@
-# TensorFlow Data Versioning: GraphDefs and Checkpoints
-
-As described in
-@{$version_semantics#compatibility-for-graphs-and-checkpoints$Compatibility for Graphs and Checkpoints},
-TensorFlow marks each kind of data with version information in order to maintain
-backward compatibility. This document provides additional details about the
-versioning mechanism, and how to use it to safely change data formats.
-
-## Backward and partial forward compatibility
-
-The two core artifacts exported from and imported into TensorFlow are
-checkpoints (serialized variable states) and `GraphDef`s (serialized computation
-graphs). Any approach to versioning these artifacts must take into account the
-following requirements:
-
-*   **Backward compatibility** to support loading `GraphDefs` created with older
-    versions of TensorFlow.
-*   **Forward compatibility** to support scenarios where the producer of a
-    `GraphDef` is upgraded to a newer version of TensorFlow before the consumer.
-*   Enable evolving TensorFlow in incompatible ways. For example, removing Ops,
-    adding attributes, and removing attributes.
-
-For `GraphDef`s, backward compatibility is enforced within a major version. This
-means functionality can only be removed between major versions. Forward
-compatibility is enforced within Patch releases (1.x.1 -> 1.x.2, for example).
-
-
-In order to achieve backward and forward compatibility as well as know when to
-enforce changes in formats, the serialized representations of graphs and
-variable state need to have metadata that describes when they were produced. The
-sections below detail the TensorFlow implementation and guidelines for evolving
-`GraphDef` versions.
-
-### Independent data version schemes
-
-There are data versions for `GraphDef`s and checkpoints. Both data formats
-evolve at different rates, and also at different speeds than the version of
-TensorFlow. Both versioning systems are defined in
-[`core/public/version.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/public/version.h).
-Whenever a new version is added a note is added to the header detailing what
-changed and the date.
-
-### Data, producers, and consumers
-
-This section discusses version information for **data**, binaries that produce
-data (**producers**), and binaries that consume data (**consumers**):
-
-*   Producer binaries have a version (`producer`) and a minimum consumer version
-    that they are compatible with (`min_consumer`).
-*   Consumer binaries have a version (`consumer`) and a minimum producer version
-    that they are compatible with (`min_producer`).
-*   Each piece of versioned data has a [`VersionDef
-    versions`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/versions.proto)
-    field which records the `producer` that made the data, the `min_consumer`
-    that it is compatible with, and a list of `bad_consumers` versions that are
-    disallowed.
-
-By default, when a producer makes some data, the data inherits the producer's
-`producer` and `min_consumer` versions. `bad_consumers` can be set if specific
-consumer versions are known to contain bugs and must be avoided. A consumer can
-accept a piece of data if
-
-*   `consumer` >= data's `min_consumer`
-*   data's `producer` >= consumer's `min_producer`
-*   `consumer` not in data's `bad_consumers`
-
-Since both producers and consumers come from the same TensorFlow code base,
-[`core/public/version.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/public/version.h)
-contains a main binary version which is treated as either `producer` or
-`consumer` depending on context and both `min_consumer` and `min_producer`
-(needed by producers and consumers, respectively). Specifically,
-
-*   For `GraphDef` versions, we have `TF_GRAPH_DEF_VERSION`,
-    `TF_GRAPH_DEF_VERSION_MIN_CONSUMER`, and
-    `TF_GRAPH_DEF_VERSION_MIN_PRODUCER`.
-*   For checkpoint versions, we have `TF_CHECKPOINT_VERSION`,
-    `TF_CHECKPOINT_VERSION_MIN_CONSUMER`, and
-    `TF_CHECKPOINT_VERSION_MIN_PRODUCER`.
-
-### Evolving GraphDef versions
-
-This section presents examples of using this versioning mechanism to make
-changes to the `GraphDef` format.
-
-**Adding a new Op:**
-
-1.  Add the new Op to both consumers and producers at the same time, and do not
-    change any `GraphDef` versions. This type of change is automatically
-    backward compatible, and does not impact forward compatibility plan since
-    existing producer scripts will not suddenly use the new functionality.
-
-**Adding a new Op and switching existing Python wrappers to use it:**
-
-1.  Implement new consumer functionality and increment the binary version.
-2.  If it is possible to make the wrappers use the new functionality only in
-    cases that did not work before, the wrappers can be updated now.
-3.  Change Python wrappers to use the new functionality. Do not increment
-    `min_consumer`, since models which do not use this Op should not break.
-
-**Removing an Op or restricting the functionality of an Op:**
-
-1.  Fix all producer scripts (not TensorFlow itself) to not use the banned Op or
-    functionality.
-2.  Increment the binary version and implement new consumer functionality that
-    bans the removed Op or functionality for GraphDefs at the new version and
-    above. If possible, make TensorFlow stop producing `GraphDefs` with the
-    banned functionality. This can be done with
-    [`REGISTER_OP(...).Deprecated(deprecated_at_version,
-    message)`](https://github.com/tensorflow/tensorflow/blob/b289bc7a50fc0254970c60aaeba01c33de61a728/tensorflow/core/ops/array_ops.cc#L1009).
-3.  Wait for a major release for backward compatibility purposes.
-4.  Increase `min_producer` to the GraphDef version from (2) and remove the
-    functionality entirely.
-
-**Changing the functionality of an Op:**
-
-1.  Add a new similar Op named `SomethingV2` or similar and go through the
-    process of adding it and switching existing Python wrappers to use it (may
-    take 3 weeks if forward compatibility is desired).
-2.  Remove the old Op (Can only take place with a major version change due to
-    backward compatibility).
-3.  Increase `min_consumer` to rule out consumers with the old Op, add back the
-    old Op as an alias for `SomethingV2`, and go through the process to switch
-    existing Python wrappers to use it.
-4.  Go through the process to remove `SomethingV2`.
-
-**Banning a single consumer version that cannot run safely:**
-
-1.  Bump the binary version and add the bad version to `bad_consumers` for all
-    new GraphDefs. If possible, add to `bad_consumers` only for GraphDefs which
-    contain a certain Op or similar.
-2.  If existing consumers have the bad version, push them out as soon as
-    possible.
diff --git a/tensorflow/docs_src/programmers_guide/datasets.md b/tensorflow/docs_src/programmers_guide/datasets.md
new file mode 100644
index 0000000000000000000000000000000000000000..68ed4bcd47c5252b2442890686ef6ad287aedb5d
--- /dev/null
+++ b/tensorflow/docs_src/programmers_guide/datasets.md
@@ -0,0 +1,750 @@
+# Using the `Dataset` API for TensorFlow Input Pipelines
+
+The `Dataset` API enables you to build complex input pipelines from
+simple, reusable pieces. For example, the pipeline for an image model might
+aggregate data from files in a distributed file system, apply random
+perturbations to each image, and merge randomly selected images into a batch
+for training. The pipeline for a text model might involve extracting symbols
+from raw text data, converting them to embedding identifiers with a lookup
+table, and batching together sequences of different lengths. The `Dataset` API
+makes it easy to deal with large amounts of data, different data formats, and
+complicated transformations.
+
+The `Dataset` API introduces two new abstractions to TensorFlow:
+
+* A `tf.contrib.data.Dataset` represents a sequence of elements, in which
+  each element contains one or more `Tensor` objects. For example, in an image
+  pipeline, an element might be a single training example, with a pair of
+  tensors representing the image data and a label. There are two distinct
+  ways to create a dataset:
+
+  * Creating a **source** (e.g. `Dataset.from_tensor_slices()`) constructs a
+    dataset from
+    one or more `tf.Tensor` objects.
+
+  * Applying a **transformation** (e.g. `Dataset.batch()`) constructs a dataset
+    from one or more `tf.contrib.data.Dataset` objects.
+
+* A `tf.contrib.data.Iterator` provides the main way to extract elements from a
+  dataset. The operation returned by `Iterator.get_next()` yields the next
+  element of a `Dataset` when executed, and typically acts as the interface
+  between input pipeline code and your model. The simplest iterator is a
+  "one-shot iterator", which is associated with a particular `Dataset` and
+  iterates through it once. For more sophisticated uses, the
+  `Iterator.initializer` operation enables you to reinitialize and parameterize
+  an iterator with different datasets, so that you can, for example, iterate
+  over training and validation data multiple times in the same program.
+
+## Basic mechanics
+
+This section of the guide describes the fundamentals of creating different kinds
+of `Dataset` and `Iterator` objects, and how to extract data from them.
+
+To start an input pipeline, you must define a *source*. For example, to
+construct a `Dataset` from some tensors in memory, you can use
+`tf.contrib.data.Dataset.from_tensors()` or
+`tf.contrib.data.Dataset.from_tensor_slices()`. Alternatively, if your input
+data are on disk in the recommend TFRecord format, you can construct a
+`tf.contrib.data.TFRecordDataset`.
+
+Once you have a `Dataset` object, you can *transform* it into a new `Dataset` by
+chaining method calls on the `tf.contrib.data.Dataset` object. For example, you
+can apply per-element transformations such as `Dataset.map()` (to apply a
+function to each element), and multi-element transformations such as
+`Dataset.batch()`. See the documentation for @{tf.contrib.data.Dataset}
+for a complete list of transformations.
+
+The most common way to consume values from a `Dataset` is to make an
+**iterator** object that provides access to one element of the dataset at a time
+(for example, by calling `Dataset.make_one_shot_iterator()`). A
+`tf.contrib.data.Iterator` provides two operations: `Iterator.initializer`,
+which enables you to (re)initialize the iterator's state; and
+`Iterator.get_next()`, which returns `tf.Tensor` objects that correspond to the
+symbolic next element. Depending on your use case, you might choose a different
+type of iterator, and the options are outlined below.
+
+### Dataset structure
+
+A dataset comprises elements that each have the same structure. An element
+contains one or more `tf.Tensor` objects, called *components*. Each component
+has a `tf.DType` representing the type of elements in the tensor, and a
+`tf.TensorShape` representing the (possibly partially specified) static shape of
+each element. The `Dataset.output_types` and `Dataset.output_shapes` properties
+allow you to inspect the inferred types and shapes of each component of a
+dataset element. The *nested structure* of these properties map to the structure
+of an element, which may be a single tensor, a tuple of tensors, or a nested
+tuple of tensors. For example:
+
+```python
+dataset1 = tf.contrib.data.Dataset.from_tensor_slices(tf.random_uniform([4, 10]))
+print(dataset1.output_types)  # ==> "tf.float32"
+print(dataset1.output_shapes)  # ==> "(10,)"
+
+dataset2 = tf.contrib.data.Dataset.from_tensor_slices(
+   (tf.random_uniform([4]),
+    tf.random_uniform([4, 100], maxval=100, dtype=tf.int32)))
+print(dataset2.output_types)  # ==> "(tf.float32, tf.int32)"
+print(dataset2.output_shapes)  # ==> "((), (100,))"
+
+dataset3 = tf.contrib.data.Dataset.zip((dataset1, dataset2))
+print(dataset3.output_types)  # ==> (tf.float32, (tf.float32, tf.int32))
+print(dataset3.output_shapes)  # ==> "(10, ((), (100,)))"
+```
+
+It is often convenient to give names to each component of an element, for
+example if they represent different features of a training example. In addition
+to tuples, you can use `collections.namedtuple` or a dictionary mapping strings
+to tensors to represent a single element of a `Dataset`.
+
+```python
+dataset = tf.contrib.data.Dataset.from_tensor_slices(
+   {"a": tf.random_uniform([4]),
+    "b": tf.random_uniform([4, 100], maxval=100, dtype=tf.int32)})
+print(dataset.output_types)  # ==> "{'a': tf.float32, 'b': tf.int32}"
+print(dataset.output_shapes)  # ==> "{'a': (), 'b': (100,)}"
+```
+
+The `Dataset` transformations support datasets of any structure. When using the
+`Dataset.map()`, `Dataset.flat_map()`, and `Dataset.filter()` transformations,
+which apply a function to each element, the element structure determines the
+arguments of the function:
+
+```python
+dataset1 = dataset1.map(lambda x: ...)
+
+dataset2 = dataset2.flat_map(lambda x, y: ...)
+
+# Note: Argument destructuring is not available in Python 3.
+dataset3 = dataset3.filter(lambda x, (y, z): ...)
+```
+
+### Creating an iterator
+
+One you have built a `Dataset` to represent your input data, the next step is to
+create an `Iterator` to access elements from that dataset.  The `Dataset` API
+currently supports three kinds of iterator, in increasing level of
+sophistication:
+
+* **one-shot**,
+* **initializable**,
+* **reinitializable**, and
+* **feedable**.
+
+A **one-shot** iterator is the simplest form of iterator, which only supports
+iterating once through a dataset, with no need for explicit initialization.
+One-shot iterators handle almost all of the cases that the existing queue-based
+input pipelines support, but they do not support parameterization. Using the
+example of `Dataset.range()`:
+
+```python
+dataset = tf.contrib.data.Dataset.range(100)
+iterator = dataset.make_one_shot_iterator()
+next_element = iterator.get_next()
+
+for i in range(100):
+  value = sess.run(next_element)
+  assert i == value
+```
+
+An **initializable** iterator requires you to run an explicit
+`iterator.initializer` operation before using it. In exchange for this
+inconvenience, it enables you to *parameterize* the definition of the dataset,
+using one or more `tf.placeholder()` tensors that can be fed when you
+initialize the iterator. Continuing the `Dataset.range()` example:
+
+```python
+max_value = tf.placeholder(tf.int64, shape=[])
+dataset = tf.contrib.data.Dataset.range(max_value)
+iterator = dataset.make_initializable_iterator()
+next_element = iterator.get_next()
+
+# Initialize an iterator over a dataset with 10 elements.
+sess.run(iterator.initializer, feed_dict={max_value: 10})
+for i in range(10):
+  value = sess.run(next_element)
+  assert i == value
+
+# Initialize the same iterator over a dataset with 100 elements.
+sess.run(iterator.initializer, feed_dict={max_value: 100})
+for i in range(100):
+  value = sess.run(next_element)
+  assert i == value
+```
+
+A **reinitializable** iterator can be initialized from multiple different
+`Dataset` objects. For example, you might have a training input pipeline that
+uses random perturbations to the input images to improve generalization, and
+a validation input pipeline that evaluates predictions on unmodified data. These
+pipelines will typically use different `Dataset` objects that have the same
+structure (i.e. the same types and compatible shapes for each component).
+
+```python
+# Define training and validation datasets with the same structure.
+training_dataset = tf.contrib.data.Dataset.range(100).map(
+    lambda x: x + tf.random_uniform([], -10, 10, tf.int64))
+validation_dataset = tf.contrib.data.Dataset.range(50)
+
+# A reinitializable iterator is defined by its structure. We could use the
+# `output_types` and `output_shapes` properties of either `training_dataset`
+# or `validation_dataset` here, because they are compatible.
+iterator = Iterator.from_structure(training_dataset.output_types,
+                                   training_dataset.output_shapes)
+next_element = iterator.get_next()
+
+training_init_op = iterator.make_initializer(training_dataset)
+validation_init_op = iterator.make_initializer(validation_dataset)
+
+# Run 20 epochs in which the training dataset is traversed, followed by the
+# validation dataset.
+for _ in range(20):
+  # Initialize an iterator over the training dataset.
+  sess.run(training_init_op)
+  for _ in range(100):
+    sess.run(next_element)
+
+  # Initialize an iterator over the validation dataset.
+  sess.run(validation_init_op)
+  for _ in range(50):
+    sess.run(next_element)
+```
+
+A **feedable** iterator can be used together with @{tf.placeholder} to select
+what `Iterator` to use in each call to @{tf.Session.run}, via the familiar
+`feed_dict` mechanism. It offers the same functionality as a reinitializable
+iterator, but it does not require you to initialize the iterator from the start
+of a dataset when you switch between iterators. For example, using the same
+training and validation example from above, you can use
+@{tf.contrib.data.Iterator.from_string_handle} to define a feedable iterator
+that allows you to switch between the two datasets:
+
+```python
+# Define training and validation datasets with the same structure.
+training_dataset = tf.contrib.data.Dataset.range(100).map(
+    lambda x: x + tf.random_uniform([], -10, 10, tf.int64)).repeat()
+validation_dataset = tf.contrib.data.Dataset.range(50)
+
+# A feedable iterator is defined by a handle placeholder and its structure. We
+# could use the `output_types` and `output_shapes` properties of either
+# `training_dataset` or `validation_dataset` here, because they have
+# identical structure.
+handle = tf.placeholder(tf.string, shape=[])
+iterator = tf.contrib.data.Iterator.from_string_handle(
+    handle, training_dataset.output_types, training_dataset.output_shapes)
+next_element = iterator.get_next()
+
+# You can use feedable iterators with a variety of different kinds of iterator
+# (such as one-shot and initializable iterators).
+training_iterator = training_dataset.make_one_shot_iterator()
+validation_iterator = validation_dataset.make_initializable_iterator()
+
+# The `Iterator.string_handle()` method returns a tensor that can be evaluated
+# and used to feed the `handle` placeholder.
+training_handle = sess.run(training_iterator.string_handle())
+validation_handle = sess.run(validation_iterator.string_handle())
+
+# Loop forever, alternating between training and validation.
+while True:
+  # Run 200 steps using the training dataset. Note that the training dataset is
+  # infinite, and we resume from where we left off in the previous `while` loop
+  # iteration.
+  for _ in range(200):
+    sess.run(next_element, feed_dict={handle: training_handle})
+
+  # Run one pass over the validation dataset.
+  sess.run(validation_iterator.initializer)
+  for _ in range(50):
+    sess.run(next_element, feed_dict={handle: validation_handle})
+```
+
+### Consuming values from an iterator
+
+The `Iterator.get_next()` method returns one or more `tf.Tensor` objects that
+correspond to the symbolic next element of an iterator. Each time these tensors
+are evaluated, they take the value of the next element in the underlying
+dataset. (Note that, like other stateful objects in TensorFlow, calling
+`Iterator.get_next()` does not immediately advance the iterator. Instead you
+must use the returned `tf.Tensor` objects in a TensorFlow expression, and pass
+the result of that expression to `tf.Session.run()` to get the next elements and
+advance the iterator.)
+
+If the iterator reaches the end of the dataset, executing
+the `Iterator.get_next()` operation will raise a `tf.errors.OutOfRangeError`.
+After this point the iterator will be in an unusable state, and you must
+initialize it again if you want to use it further.
+
+```python
+dataset = tf.contrib.data.Dataset.range(5)
+iterator = dataset.make_initializable_iterator()
+next_element = iterator.get_next()
+
+# Typically `result` will be the output of a model, or an optimizer's
+# training operation.
+result = tf.add(next_element, next_element)
+
+sess.run(iterator.initializer)
+print(sess.run(result))  # ==> "0"
+print(sess.run(result))  # ==> "2"
+print(sess.run(result))  # ==> "4"
+print(sess.run(result))  # ==> "6"
+print(sess.run(result))  # ==> "8"
+try:
+  sess.run(result)
+except tf.errors.OutOfRangeError:
+  print("End of dataset")  # ==> "End of dataset"
+```
+
+A common pattern is to wrap the "training loop" in a `try`-`except` block:
+
+```python
+sess.run(iterator.initializer)
+while True:
+  try:
+    sess.run(result)
+  except tf.errors.OutOfRangeError:
+    break
+```
+
+If each element of the dataset has a nested structure, the return value of
+`Iterator.get_next()` will be one or more `tf.Tensor` objects in the same
+nested structure:
+
+```python
+dataset1 = tf.contrib.data.Dataset.from_tensor_slices(tf.random_uniform([4, 10]))
+dataset2 = tf.contrib.data.Dataset.from_tensor_slices((tf.random_uniform([4]), tf.random_uniform([4, 100])))
+dataset3 = tf.contrib.data.Dataset.zip((dataset1, dataset2))
+
+iterator = dataset3.make_initializable_iterator()
+
+sess.run(iterator.initializer)
+next1, (next2, next3) = iterator.get_next()
+```
+
+Note that evaluating *any* of `next1`, `next2`, or `next3` will advance the
+iterator for all components. A typical consumer of an iterator will include all
+components in a single expression.
+
+## Reading input data
+
+### Consuming NumPy arrays
+
+If all of your input data fit in memory, the simplest way to create a `Dataset`
+from them is to convert them to `tf.Tensor` objects and use
+`Dataset.from_tensor_slices()`.
+
+```python
+# Load the training data into two NumPy arrays, for example using `np.load()`.
+with np.load("/var/data/training_data.npy") as data:
+  features = data["features"]
+  labels = data["labels"]
+
+# Assume that each row of `features` corresponds to the same row as `labels`.
+assert features.shape[0] == labels.shape[0]
+
+dataset = tf.contrib.data.Dataset.from_tensor_slices((features, labels))
+```
+
+Note that the above code snippet will embed the `features` and `labels` arrays
+in your TensorFlow graph as `tf.constant()` operations. This works well for a
+small dataset, but wastes memory---because the contents of the array will be
+copied multiple times---and can run into the 2GB limit for the `tf.GraphDef`
+protocol buffer.
+
+As an alternative, you can define the `Dataset` in terms of `tf.placeholder()`
+tensors, and *feed* the NumPy arrays when you initialize an `Iterator` over the
+dataset.
+
+```python
+# Load the training data into two NumPy arrays, for example using `np.load()`.
+with np.load("/var/data/training_data.npy") as data:
+  features = data["features"]
+  labels = data["labels"]
+
+# Assume that each row of `features` corresponds to the same row as `labels`.
+assert features.shape[0] == labels.shape[0]
+
+features_placeholder = tf.placeholder(features.dtype, features.shape)
+labels_placeholder = tf.placeholder(labels.dtype, labels.shape)
+
+dataset = tf.contrib.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder))
+# [Other transformations on `dataset`...]
+dataset = ...
+iterator = dataset.make_initializable_iterator()
+
+sess.run(iterator.initializer, feed_dict={features_placeholder: features,
+                                          labels_placeholder: labels})
+```
+
+### Consuming TFRecord data
+
+The `Dataset` API supports a variety of file formats so that you can process
+large datasets that do not fit in memory. For example, the TFRecord file format
+is a simple record-oriented binary format that many TensorFlow applications use
+for training data. The `tf.contrib.data.TFRecordDataset` class enables you to
+stream over the contents of one or more TFRecord files as part of an input
+pipeline.
+
+```python
+# Creates a dataset that reads all of the examples from two files.
+filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
+dataset = tf.contrib.data.TFRecordDataset(filenames)
+```
+
+The `filenames` argument to the `TFRecordDataset` initializer can either be a
+string, a list of strings, or a `tf.Tensor` of strings. Therefore if you have
+two sets of files for training and validation purposes, you can use a
+`tf.placeholder(tf.string)` to represent the filenames, and initialize an
+iterator from the appropriate filenames:
+
+```python
+filenames = tf.placeholder(tf.string, shape=[None])
+dataset = tf.contrib.data.TFRecordDataset(filenames)
+dataset = dataset.map(...)  # Parse the record into tensors.
+dataset = dataset.repeat()  # Repeat the input indefinitely.
+dataset = dataset.batch(32)
+iterator = dataset.make_initializable_iterator()
+
+# You can feed the initializer with the appropriate filenames for the current
+# phase of execution, e.g. training vs. validation.
+
+# Initialize `iterator` with training data.
+training_filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
+sess.run(iterator.initializer, feed_dict={filenames: training_filenames})
+
+# Initialize `iterator` with validation data.
+validation_filenames = ["/var/data/validation1.tfrecord", ...]
+sess.run(iterator.initializer, feed_dict={filenames: validation_filenames})
+```
+
+### Consuming text data
+
+Many datasets are distributed as one or more text files. The
+`tf.contrib.data.TextLineDataset` provides an easy way to extract lines from
+one or more text files. Given one or more filenames, a `TextLineDataset` will
+produce one string-valued element per line of those files. Like a
+`TFRecordDataset`, `TextLineDataset` accepts `filenames` as a `tf.Tensor`, so
+you can parameterize it by passing a `tf.placeholder(tf.string)`.
+
+```python
+filenames = ["/var/data/file1.txt", "/var/data/file2.txt"]
+dataset = tf.contrib.data.TextLineDataset(filenames)
+```
+
+By default, a `TextLineDataset` yields *every* line of each file, which may
+not be desirable, for example if the file starts with a header line, or contains
+comments. These lines can be removed using the `Dataset.skip()` and
+`Dataset.filter()` transformations. To apply these transformations to each
+file separately, we use `Dataset.flat_map()` to create a nested `Dataset` for
+each file.
+
+```python
+filenames = ["/var/data/file1.txt", "/var/data/file2.txt"]
+
+dataset = tf.contrib.data.Dataset.from_tensor_slices(filenames)
+
+# Use `Dataset.flat_map()` to transform each file as a separate nested dataset,
+# and then concatenate their contents sequentially into a single "flat" dataset.
+# * Skip the first line (header row).
+# * Filter out lines beginning with "#" (comments).
+dataset = dataset.flat_map(
+    lambda filename: (
+        tf.contrib.data.TextLineDataset(filename)
+        .skip(1)
+        .filter(lambda line: tf.not_equal(tf.substr(line, 0, 1), "#"))))
+```
+
+<!--
+TODO(mrry): Add these sections.
+
+### Consuming from a Python generator
+-->
+
+## Preprocessing data with `Dataset.map()`
+
+The `Dataset.map(f)` transformation produces a new dataset by applying a given
+function `f` to each element of the input dataset. It is based on
+the
+[`map()` function](https://en.wikipedia.org/wiki/Map_(higher-order_function))
+that is commonly applied to lists (and other structures) in functional
+programming languages.  The function `f` takes the `tf.Tensor` objects that
+represent a single element in the input, and returns the `tf.Tensor` objects
+that will represent a single element in the new dataset. Its implementation uses
+standard TensorFlow operations to transform one element into another.
+
+This section covers common examples of how to use `Dataset.map()`.
+
+### Parsing `tf.Example` protocol buffer messages
+
+Many input pipelines extract `tf.train.Example` protocol buffer messages from a
+TFRecord-format file (written, for example, using
+`tf.python_io.TFRecordWriter`). Each `tf.train.Example` record contains one or
+more "features", and the input pipeline typically converts these features into
+tensors.
+
+```python
+# Transforms a scalar string `example_proto` into a pair of a scalar string and
+# a scalar integer, representing an image and its label, respectively.
+def _parse_function(example_proto):
+  features = {"image": tf.FixedLenFeature((), tf.string, default_value=""),
+              "label": tf.FixedLenFeature((), tf.int32, default_value=0)}
+  parsed_features = tf.parse_single_example(example_proto, features)
+  return parsed_features["image"], parsed_features["label"]
+
+# Creates a dataset that reads all of the examples from two files, and extracts
+# the image and label features.
+filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
+dataset = tf.contrib.data.TFRecordDataset(filenames)
+dataset = dataset.map(_parse_function)
+```
+
+### Decoding image data and resizing it
+
+When training a neural network on real-world image data, it is often necessary
+to convert images of different sizes to a common size, so that they may be
+batched into a fixed size.
+
+```python
+# Reads an image from a file, decodes it into a dense tensor, and resizes it
+# to a fixed shape.
+def _parse_function(filename, label):
+  image_string = tf.read_file(filename)
+  image_decoded = tf.image.decode_image(image_string)
+  image_resized = tf.image.resize_images(image_decoded, [28, 28])
+  return image_resized, label
+
+# A vector of filenames.
+filenames = tf.constant(["/var/data/image1.jpg", "/var/data/image2.jpg", ...])
+
+# `labels[i]` is the label for the image in `filenames[i].
+labels = tf.constant([0, 37, ...])
+
+dataset = tf.contrib.data.Dataset.from_tensor_slices((filenames, labels))
+dataset = dataset.map(_parse_function)
+```
+
+### Applying arbitrary Python logic with `tf.py_func()`
+
+For performance reasons, we encourage you to use TensorFlow operations for
+preprocessing your data whenever possible. However, it is sometimes useful to
+call upon external Python libraries when parsing your input data. To do so,
+invoke, the `tf.py_func()` operation in a `Dataset.map()` transformation.
+
+```python
+import cv2
+
+# Use a custom OpenCV function to read the image, instead of the standard
+# TensorFlow `tf.read_file()` operation.
+def _read_py_function(filename, label):
+  image_decoded = cv2.imread(image_string, cv2.IMREAD_GRAYSCALE)
+  return image_decoded, label
+
+# Use standard TensorFlow operations to resize the image to a fixed shape.
+def _resize_function(image_decoded, label):
+  image_decoded.set_shape([None, None, None])
+  image_resized = tf.image.resize_images(image_decoded, [28, 28])
+  return image_resized, label
+
+filenames = ["/var/data/image1.jpg", "/var/data/image2.jpg", ...]
+labels = [0, 37, 29, 1, ...]
+
+dataset = tf.contrib.data.Dataset.from_tensor_slices((filenames, labels))
+dataset = dataset.map(
+    lambda filename, label: tf.py_func(
+        _read_py_function, [filename, label], [tf.uint8, label.dtype]))
+dataset = dataset.map(_resize_function)
+```
+
+<!--
+TODO(mrry): Add this section.
+
+### Handling text data with unusual sizes
+-->
+
+## Batching dataset elements
+
+### Simple batching
+
+The simplest form of batching stacks `n` consecutive elements of a dataset into
+a single element. The `Dataset.batch()` transformation does exactly this, with
+the same constraints as the `tf.stack()` operator, applied to each component
+of the elements: i.e. for each component *i*, all elements must have a tensor
+of the exact same shape.
+
+```python
+inc_dataset = tf.contrib.data.Dataset.range(100)
+dec_dataset = tf.contrib.data.Dataset.range(0, -100, -1)
+dataset = tf.contrib.data.Dataset.zip((inc_dataset, dec_dataset))
+batched_dataset = dataset.batch(4)
+
+iterator = batched_dataset.make_one_shot_iterator()
+next_element = iterator.get_next()
+
+print(sess.run(next_element))  # ==> ([0, 1, 2,   3],   [ 0, -1,  -2,  -3])
+print(sess.run(next_element))  # ==> ([4, 5, 6,   7],   [-4, -5,  -6,  -7])
+print(sess.run(next_element))  # ==> ([8, 9, 10, 11],   [-8, -9, -10, -11])
+```
+
+### Batching tensors with padding
+
+The above recipe works for tensors that all have the same size. However, many
+models (e.g. sequence models) work with input data that can have varying size
+(e.g. sequences of different lengths). To handle this case, the
+`Dataset.padded_batch()` transformation enables you to batch tensors of
+different shape by specifying one or more dimensions in which they may be
+padded.
+
+```python
+dataset = tf.contrib.data.Dataset.range(100)
+dataset = dataset.map(lambda x: tf.fill([tf.cast(x, tf.int32)], x))
+dataset = dataset.padded_batch(4, padded_shapes=[None])
+
+iterator = dataset.make_one_shot_iterator()
+next_element = iterator.get_next()
+
+print(sess.run(next_element))  # ==> [[0, 0, 0], [1, 0, 0], [2, 2, 0], [3, 3, 3]]
+print(sess.run(next_element))  # ==> [[4, 4, 4, 4, 0, 0, 0],
+                               #      [5, 5, 5, 5, 5, 0, 0],
+                               #      [6, 6, 6, 6, 6, 6, 0],
+                               #      [7, 7, 7, 7, 7, 7, 7]]
+```
+
+The `Dataset.padded_batch()` transformation allows you to set different padding
+for each dimension of each component, and it may be variable-length (signified
+by `None` in the example above) or constant-length. It is also possible to
+override the padding value, which defaults to 0.
+
+<!--
+TODO(mrry): Add this section.
+
+### Dense ragged -> tf.SparseTensor
+-->
+
+## Training workflows
+
+### Processing multiple epochs
+
+The `Dataset` API offers two main ways to process multiple epochs of the same
+data.
+
+The simplest way to iterate over a dataset in multiple epochs is to use the
+`Dataset.repeat()` transformation. For example, to create a dataset that repeats
+its input for 10 epochs:
+
+```python
+filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
+dataset = tf.contrib.data.TFRecordDataset(filenames)
+dataset = dataset.map(...)
+dataset = dataset.repeat(10)
+dataset = dataset.batch(32)
+```
+
+Applying the `Dataset.repeat()` transformation with no arguments will repeat
+the input indefinitely. The `Dataset.repeat()` transformation concatenates its
+arguments without signaling the end of one epoch and the beginning of the next
+epoch.
+
+If you want to receive a signal at the end of each epoch, you can write a
+training loop that catches the `tf.errors.OutOfRangeError` at the end of a
+dataset. At that point you might collect some statistics (e.g. the validation
+error) for the epoch.
+
+```python
+filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
+dataset = tf.contrib.data.TFRecordDataset(filenames)
+dataset = dataset.map(...)
+dataset = dataset.batch(32)
+iterator = dataset.make_initializable_iterator()
+next_element = iterator.get_next()
+
+# Compute for 100 epochs.
+for _ in range(100):
+  sess.run(iterator.initializer)
+  while True:
+    try:
+      sess.run(next_element)
+    except tf.errors.OutOfRangeError:
+      break
+
+  # [Perform end-of-epoch calculations here.]
+```
+
+### Randomly shuffling input data
+
+The `Dataset.shuffle()` transformation randomly shuffles the input dataset
+using a similar algorithm to `tf.RandomShuffleQueue`: it maintains a fixed-size
+buffer and chooses the next element uniformly at random from that buffer.
+
+```python
+filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
+dataset = tf.contrib.data.TFRecordDataset(filenames)
+dataset = dataset.map(...)
+dataset = dataset.shuffle(buffer_size=10000)
+dataset = dataset.batch(32)
+dataset = dataset.repeat()
+```
+
+### Using high-level APIs
+
+The @{tf.train.MonitoredTrainingSession} API simplifies many aspects of running
+TensorFlow in a distributed setting. `MonitoredTrainingSession` uses the
+@{tf.errors.OutOfRangeError} to signal that training has completed, so to use it
+with the `Dataset` API, we recommend using
+`Dataset.make_one_shot_iterator()`. For example:
+
+```python
+filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
+dataset = tf.contrib.data.TFRecordDataset(filenames)
+dataset = dataset.map(...)
+dataset = dataset.shuffle(buffer_size=10000)
+dataset = dataset.batch(32)
+dataset = dataset.repeat(num_epochs)
+iterator = dataset.make_one_shot_iterator()
+
+next_example, next_label = iterator.get_next()
+loss = model_function(next_example, next_label)
+
+training_op = tf.train.AdagradOptimizer(...).minimize(loss)
+
+with tf.train.MonitoredTrainingSession(...) as sess:
+  while not sess.should_stop():
+    sess.run(training_op)
+```
+
+To use a `Dataset` in the `input_fn` of a @{tf.estimator.Estimator}, we also
+recommend using `Dataset.make_one_shot_iterator()`. For example:
+
+```python
+def dataset_input_fn():
+  filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
+  dataset = tf.contrib.data.TFRecordDataset(filenames)
+
+  # Use `tf.parse_single_example()` to extract data from a `tf.Example`
+  # protocol buffer, and perform any additional per-record preprocessing.
+  def parser(record):
+    keys_to_features = {
+        "image_data": tf.FixedLenFeature((), tf.string, default_value=""),
+        "date_time": tf.FixedLenFeature((), tf.int64, default_value=""),
+        "label": tf.FixedLenFeature((), tf.int64,
+                                    default_value=tf.zeros([], dtype=tf.int64)),
+    }
+    parsed = tf.parse_single_example(record, keys_to_features)
+
+    # Perform additional preprocessing on the parsed data.
+    image = tf.decode_jpeg(parsed["image_data"])
+    image = tf.reshape(image, [299, 299, 1])
+    label = tf.cast(parsed["label"], tf.int32)
+
+    return {"image_data": image, "date_time": parsed["date_time"]}, label
+
+  # Use `Dataset.map()` to build a pair of a feature dictionary and a label 
+  # tensor for each example.
+  dataset = dataset.map(parser)
+  dataset = dataset.shuffle(buffer_size=10000)
+  dataset = dataset.batch(32)
+  dataset = dataset.repeat(num_epochs)
+  iterator = dataset.make_one_shot_iterator()
+
+  # `features` is a dictionary in which each value is a batch of values for
+  # that feature; `labels` is a batch of labels.
+  features, labels = iterator.get_next()
+  return features, labels
+```
diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index 994633dad7963b03c88b5f1f0d5b150f12ab9642..14b8143d2670b590a489844714d4bf26c7ebc630 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -1,6 +1,6 @@
 # Debugging TensorFlow Programs
 
-[comment]: TODO(barryr): Links to and from sections on "Graphs" & "Monitoring Learning".
+<!-- [comment]: TODO(barryr): Links to and from sections on "Graphs" & "Monitoring Learning". -->
 
 [TOC]
 
@@ -35,15 +35,18 @@ This code trains a simple neural network for MNIST digit image recognition.
 Notice that the accuracy increases slightly after the first training step, but
 then gets stuck at a low (near-chance) level:
 
-> Accuracy at step 0: 0.1113
-> Accuracy at step 1: 0.3183
-> Accuracy at step 2: 0.098
-> Accuracy at step 3: 0.098
-> Accuracy at step 4: 0.098
+```none
+Accuracy at step 0: 0.1113
+Accuracy at step 1: 0.3183
+Accuracy at step 2: 0.098
+Accuracy at step 3: 0.098
+Accuracy at step 4: 0.098
+```
 
 Wondering what might have gone wrong, you suspect that certain nodes in the
-training graph generated bad numeric values such as `inf`s and `nan`s. Let's
-use tfdbg to debug this issue and pinpoint the exact graph node where this
+training graph generated bad numeric values such as `inf`s and `nan`s, because
+this is a common cause of this type of training failure.
+Let's use tfdbg to debug this issue and pinpoint the exact graph node where this
 numeric problem first surfaced.
 
 ## Wrapping TensorFlow Sessions with tfdbg
@@ -68,8 +71,8 @@ This wrapper has the same interface as Session, so enabling debugging requires
 no other changes to the code. The wrapper provides additional features,
 including:
 
-* Bringing up a CLI before and after `run()` calls, to let you control the
-execution and inspect the graph's internal state.
+* Bringing up a CLI before and after `Session.run()` calls, to let you
+control the execution and inspect the graph's internal state.
 * Allowing you to register special `filters` for tensor values, to facilitate
 the diagnosis of issues.
 
@@ -83,7 +86,7 @@ we ship it with the
 @{$python/tfdbg#Classes_for_debug_dump_data_and_directories$`debug_data`}
 module.
 
-TIP: You can also write your own custom filters. See
+Note: You can also write your own custom filters. See
 the @{tfdbg.DebugDumpDir.find$API documentation}
 of `DebugDumpDir.find()` for additional information.
 
@@ -97,15 +100,20 @@ python -m tensorflow.python.debug.examples.debug_mnist --debug
 ```
 
 The debug wrapper session will prompt you when it is about to execute the first
-`run()` call, with information regarding the fetched tensor and feed
+`Session.run()` call, with information regarding the fetched tensor and feed
 dictionaries displayed on the screen.
 
 ![tfdbg run-start UI](https://www.tensorflow.org/images/tfdbg_screenshot_run_start.png)
 
-This is what we refer to as the *run-start CLI*. If the screen size is
-too small to display the content of the message in its entirety, you can resize
-it or use the **PageUp** / **PageDown** / **Home** / **End** keys to navigate
-the screen output.
+This is what we refer to as the *run-start CLI*. It lists the feeds and fetches
+to the current `Session.run` call, before executing anything.
+
+If the screen size is too small to display the content of the message in its
+entirety, you can resize it.
+
+Use the **PageUp** / **PageDown** / **Home** / **End** keys to navigate the
+screen output. On most keyboards lacking those keys **Fn + Up** /
+**Fn + Down** / **Fn + Right** / **Fn + Left** will work.
 
 Enter the `run` command (or just `r`) at the command prompt:
 
@@ -113,8 +121,11 @@ Enter the `run` command (or just `r`) at the command prompt:
 tfdbg> run
 ```
 
-tfdbg calculates the accuracy using a test data set and then displays all dumped
-intermediate tensors from the run in the *run-end CLI*. For example:
+The `run` command causes tfdbg to execute until the end of the next
+`Session.run()` call, which calculates the model's accuracy using a test data
+set. tfdbg augments the runtime Graph to dump all intermediate tensors.
+After the run ends, tfdbg displays all the dumped tensors values in the
+*run-end CLI*. For example:
 
 ![tfdbg run-end UI: accuracy](https://www.tensorflow.org/images/tfdbg_screenshot_run_end_accuracy.png)
 
@@ -129,7 +140,7 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
 | Command            | Syntax or Option | Explanation  | Example                   |
 |:-------------------|:---------------- |:------------ |:------------------------- |
 | **`lt`** | | **List dumped tensors.** | `lt` |
-| | `-n <name_pattern>` | List dumped tensors with names matching given regular-expression pattern. | `lt -n softmax.*` |
+| | `-n <name_pattern>` | List dumped tensors with names matching given regular-expression pattern. | `lt -n Softmax.*` |
 | | `-t <op_pattern>` | List dumped tensors with op types matching given regular-expression pattern. | `lt -t MatMul` |
 | | `s <sort_key>` | Sort the output by given `sort_key`, whose possible values are `timestamp` (default), `dump_size`, `op_type` and `tensor_name`. | `lt -s dump_size` |
 | | `-r` | Sort in reverse order. | `lt -r -s dump_size` |
@@ -138,9 +149,12 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
 | | `pt <tensor>[slicing]` | Print a subarray of tensor, using [numpy](http://www.numpy.org/)-style array slicing. | `pt hidden/Relu:0[0:50,:]` |
 | | `-a` | Print the entirety of a large tensor, without using ellipses. (May take a long time for large tensors.) | `pt -a hidden/Relu:0[0:50,:]` |
 | | `-r <range>` | Highlight elements falling into specified numerical range. Multiple ranges can be used in conjunction. | `pt hidden/Relu:0 -a -r [[-inf,-1],[1,inf]]` |
+| | `-s` | Include a summary of the numeric values of the tensor (applicable only to non-empty tensors with Boolean and numeric types such as `int*` and `float*`.) | `pt -s hidden/Relu:0[0:50,:]` |
 | **`@[coordinates]`** | | Navigate to specified element in `pt` output. | `@[10,0]` or `@10,0` |
 | **`/regex`** | |  [less](https://linux.die.net/man/1/less)-style search for given regular expression. | `/inf` |
 | **`/`** | | Scroll to the next line with matches to the searched regex (if any). | `/` |
+| **`pf`** | | **Print a value in the feed_dict to `Session.run` ** | |
+| | `pf <feed_tensor_name>` | Print the value of the feed. Also note that the `pf` command has the `-a`, `-r` and `-s` flags (not listed below), which have the same syntax and semantics as the identically-named flags of `pt`. | `pf input_xs:0` |
 | **`ni`** | | **Display node information.** | |
 | | `-a` | Include node attributes in the output. | `ni -a hidden/Relu` |
 | | `-d` | List the debug dumps available from the node. | `ni -d hidden/Relu` |
@@ -155,7 +169,7 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
 | | `-c` | Include recipients via control edges. | `lo -c -r hidden/Relu:0` |
 | **`ls`** | | **List Python source files involved in node creation.** | |
 | | `-p <path_pattern>` | Limit output to source files matching given regular-expression path pattern. | `ls -p .*debug_mnist.*` |
-| | `-n` | Limit output to node names matching given regular-expression pattern. | `ls -n softmax.*` |
+| | `-n` | Limit output to node names matching given regular-expression pattern. | `ls -n Softmax.*` |
 | **`ps`** | | **Print Python source file.** | |
 | | `ps <file_path>` | Print given Python source file source.py, with the lines annotated with the nodes created at each of them (if any). | `ps /path/to/source.py` |
 | | `-t` | Perform annotation with respect to Tensors, instead of the default, nodes. | `ps -t /path/to/source.py` |
@@ -164,14 +178,19 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
 | **`run`** | | **Proceed to the next Session.run()** | `run` |
 | | `-n` | Execute through the next `Session.run` without debugging, and drop to CLI right before the run after that. | `run -n` |
 | | `-t <T>` | Execute `Session.run` `T - 1` times without debugging, followed by a run with debugging. Then drop to CLI right after the debugged run. | `run -t 10` |
-| | `-f <filter_name>` | Continue executing `Session.run` until any intermediate tensors passes the specified Tensor filter (causes the filter to return `True`). | `run -f has_inf_or_nan` |
-| | `--node_name_filter <pattern>` | Execute the next `Session.run`, watching only nodes with names matching the given regular-expression pattern. | `run --node_name_filter softmax.*` |
+| | `-f <filter_name>` | Continue executing `Session.run` until any intermediate tensor triggers the specified Tensor filter (causes the filter to return `True`). | `run -f has_inf_or_nan` |
+| | `--node_name_filter <pattern>` | Execute the next `Session.run`, watching only nodes with names matching the given regular-expression pattern. | `run --node_name_filter Softmax.*` |
 | | `--op_type_filter <pattern>` | Execute the next `Session.run`, watching only nodes with op types matching the given regular-expression pattern. | `run --op_type_filter Variable.*` |
 | | `--tensor_dtype_filter <pattern>` | Execute the next `Session.run`, dumping only Tensors with data types (`dtype`s) matching the given regular-expression pattern. | `run --tensor_dtype_filter int.*` |
 | **`ri`** | | **Display information about the run the current run, including fetches and feeds.** | `ri` |
 | **`help`** | | **Print general help information** | `help` |
 | | `help <command>` | Print help for given command. | `help lt` |
 
+Note that each time you enter a command, a new screen output
+will appear. This is somewhat analogous to web pages in a browser. You can
+navigate between these screens by clicking the `<--` and
+`-->` text arrows near the top-left corner of the CLI.
+
 ### Other Features of the tfdbg CLI
 
 In addition to the commands listed above, the tfdbg CLI provides the following
@@ -197,25 +216,27 @@ addditional features:
 
 ### Finding `nan`s and `inf`s
 
-In this first `run()` call, there happen to be no problematic numerical values.
-You can move on to the next run by using the command `run` or its shorthand `r`.
+In this first `Session.run()` call, there happen to be no problematic numerical
+values. You can move on to the next run by using the command `run` or its
+shorthand `r`.
 
-> TIP: If you enter `run` or `r` repeatedly, you will be able to move through the
-> `run()` calls in a sequential manner.
+> TIP: If you enter `run` or `r` repeatedly, you will be able to move through
+> the `Session.run()` calls in a sequential manner.
 >
-> You can also use the `-t` flag to move ahead a number of `run()` calls at a time, for example:
+> You can also use the `-t` flag to move ahead a number of `Session.run()` calls
+> at a time, for example:
 >
 > ```
 > tfdbg> run -t 10
 > ```
 
 Instead of entering `run` repeatedly and manually searching for `nan`s and
-`inf`s in the run-end UI after every `run()` call (for example, by using the `pt`
-command shown in the table above) , you can use the following
-command to let the debugger repeatedly execute `run()` calls without stopping at
-the run-start or run-end prompt, until the first `nan` or `inf` value shows up
-in the graph. This is analogous to *conditional breakpoints* in some
-procedural-language debuggers:
+`inf`s in the run-end UI after every `Session.run()` call (for example, by using
+the `pt` command shown in the table above) , you can use the following
+command to let the debugger repeatedly execute `Session.run()` calls without
+stopping at the run-start or run-end prompt, until the first `nan` or `inf`
+value shows up in the graph. This is analogous to *conditional breakpoints* in
+some procedural-language debuggers:
 
 ```none
 tfdbg> run -f has_inf_or_nan
@@ -224,21 +245,28 @@ tfdbg> run -f has_inf_or_nan
 > NOTE: The preceding command works properly because we have registered a filter
 > for `nan`s and `inf`s called `has_inf_or_nan` (as explained previously).
 > If you have registered any other filters, you can
-> let tfdbg run till any tensors pass that filter (cause the filter to return True)
-> as well, for example,
+> use "run -f" to have tfdbg run until any tensor triggers that filter (cause
+> the filter to return True).
 >
-> ```
-> # In python code:
+> ``` python
 > sess.add_tensor_filter('my_filter', my_filter_callable)
+> ```
+>
+> Then at the tfdbg run-start prompt run until your filter is triggered:
 >
-> # Run at tfdbg run-start prompt:
+> ```
 > tfdbg> run -f my_filter
 > ```
 
+See [this API document](https://www.tensorflow.org/api_docs/python/tfdbg/DebugDumpDir#find)
+for more information on the expected signature and return value of the predicate
+`Callable` used with `add_tensor_filter()`.
+
 ![tfdbg run-end UI: infs and nans](https://www.tensorflow.org/images/tfdbg_screenshot_run_end_inf_nan.png)
 
-As the screen display indicates, the `has_inf_or_nan` filter is first passed
-during the fourth `run()` call: an [Adam optimizer](https://arxiv.org/abs/1412.6980)
+As the screen display indicates on the first line, the `has_inf_or_nan` filter is first triggered
+during the fourth `Session.run()` call: an
+[Adam optimizer](https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer)
 forward-backward training pass on the graph. In this run, 36 (out of the total
 95) intermediate tensors contain `nan` or `inf` values. These tensors are listed
 in chronological order, with their timestamps displayed on the left. At the top
@@ -266,9 +294,19 @@ Or, alternatively:
 tfdbg> /(inf|nan)
 ```
 
+You can also use the `-s` or `--numeric_summary` command to get a quick summary
+of the types of numeric values in the tensor:
+
+``` none
+tfdbg> pt -s cross_entropy/Log:0
+```
+
+From the summary, you can see that several of the 1000 elements of the
+`cross_entropy/Log:0` tensor are `-inf`s (negative infinities).
+
 Why did these infinities appear? To further debug, display more information
 about the node `cross_entropy/Log` by clicking the underlined `node_info` menu
-item on the top or entering the equivalent command:
+item on the top or entering the equivalent node_info (`ni`) command:
 
 ```none
 tfdbg> ni cross_entropy/Log
@@ -310,7 +348,7 @@ line:
 diff = y_ * tf.log(y)
 ```
 
-***tfdbg** has a feature that makes it ease to trace Tensors and ops back to
+**tfdbg** has a feature that makes it easy to trace Tensors and ops back to
 lines in Python source files. It can annotate lines of a Python file with
 the ops or Tensors created by them. To use this feature,
 simply click the underlined line numbers in the stack trace output of the
@@ -325,13 +363,13 @@ of a `ps` command.
 To fix the problem, edit `debug_mnist.py`, changing the original line:
 
 ```python
-diff = y_ * tf.log(y)
+diff = -(y_ * tf.log(y))
 ```
 
-to the following:
+to the built-in, numerically-stable implementation of softmax cross-entropy:
 
 ```python
-diff = y_ * tf.log(tf.clip_by_value(y, 1e-8, 1.0))
+diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits)
 ```
 
 Rerun with the `--debug` flag as follows:
@@ -351,7 +389,7 @@ accuracy now continues to rise rather than getting stuck. Success!
 
 ## Debugging tf-learn Estimators and Experiments
 
-This section explains how to debug TensorFlow programs that use the `Estimators`
+This section explains how to debug TensorFlow programs that use the `Estimator`
 and `Experiment` APIs. Part of the convenience provided by these APIs is that
 they manage `Session`s internally. This makes the `LocalCLIDebugWrapperSession`
 described in the preceding sections inapplicable. Fortunately, you can still
@@ -363,7 +401,7 @@ Currently, `tfdbg` can debug the
 @{tf.contrib.learn.BaseEstimator.fit$`fit()`}
 @{tf.contrib.learn.BaseEstimator.evaluate$`evaluate()`}
 methods of tf-learn `Estimator`s. To debug `Estimator.fit()`,
-create a `LocalCLIDebugHook` and supply it as the `monitors` argument. For example:
+create a `LocalCLIDebugHook` and supply it in the `monitors` argument. For example:
 
 ```python
 # First, let your BUILD target depend on "//tensorflow/python/debug:debug_py"
@@ -371,9 +409,9 @@ create a `LocalCLIDebugHook` and supply it as the `monitors` argument. For examp
 #  install of open-source TensorFlow.)
 from tensorflow.python import debug as tf_debug
 
+# Create a LocalCLIDebugHook and use it as a monitor when calling fit().
 hooks = [tf_debug.LocalCLIDebugHook()]
 
-# Create a local CLI debug hook and use it as a monitor when calling fit().
 classifier.fit(x=training_set.data,
                y=training_set.target,
                steps=1000,
@@ -441,22 +479,38 @@ calls, as a function of the `fetches` and `feed_dict` and other states. See
 @{tfdbg.DumpingDebugWrapperSession.__init__$this API doc}
 for more details.
 
+## Debugging Keras Models with TFDBG
+
+To use TFDBG with [Keras](https://keras.io/), let the Keras backend use
+a TFDBG-wrapped Session object. For example, to use the CLI wrapper:
+
+``` python
+import tensorflow as tf
+from keras import backend as keras_backend
+from tensorflow.python import debug as tf_debug
+
+keras_backend.set_session(tf_debug.LocalCLIDebugWrapperSession(tf.Session()))
+
+# Define your keras model, called "model".
+model.fit(...)  # This will break into the TFDBG CLI.
+```
+
 ## Offline Debugging of Remotely-Running Sessions
 
-Oftentimes, your model is running in a remote machine or process that you don't
+Often, your model is running on a remote machine or a process that you don't
 have terminal access to. To perform model debugging in such cases, you can use
-the `offline_analyzer` binary of `tfdbg`. It operates on dumped data
-directories. This can be done to both the lower-level `Session` API and the
-higher-level `Estimator` and `Experiment` APIs.
+the `offline_analyzer` binary of `tfdbg` (described below). It operates on
+dumped data directories. This can be done to both the lower-level `Session` API
+and the higher-level `Estimator` and `Experiment` APIs.
 
-### Debugging Remotely-Running tf.Sessions
+### Debugging Remote tf.Sessions
 
 If you interact directly with the `tf.Session` API in `python`, you can
 configure the `RunOptions` proto that you call your `Session.run()` method
 with, by using the method @{tfdbg.watch_graph}.
 This will cause the intermediate tensors and runtime graphs to be dumped to a
-shared storage location of your choice when the `Session.run()` call occurs.
-For example:
+shared storage location of your choice when the `Session.run()` call occurs
+(at the cost of slower performance). For example:
 
 ```python
 from tensorflow.python import debug as tf_debug
@@ -485,8 +539,8 @@ python -m tensorflow.python.debug.cli.offline_analyzer \
 
 The `Session` wrapper `DumpingDebugWrapperSession` offers an easier and more
 flexible way to generate file-system dumps that can be analyzed offline.
-To use it, simply call the `tf_debug.DumpingDebugWrapperSession` method in the
-program being debugged. For example:
+To use it, simply wrap your session in a `tf_debug.DumpingDebugWrapperSession`.
+For example:
 
 ```python
 # Let your BUILD target depend on "//tensorflow/python/debug:debug_py
@@ -498,15 +552,13 @@ sess = tf_debug.DumpingDebugWrapperSession(
     sess, "/shared/storage/location/tfdbg_dumps_1/", watch_fn=my_watch_fn)
 ```
 
-`watch_fn=my_watch_fn` is a `Callable` that allows you to configure what
+The `watch_fn` argument accepts a `Callable` that allows you to configure what
 `tensor`s to watch on different `Session.run()` calls, as a function of the
-`fetches` and `feed_dict` to the `run()` call and other states. See
-@{tfdbg.DumpingDebugWrapperSession.__init__$the API doc of DumpingDebugWrapperSession}
-for more details.
+`fetches` and `feed_dict` to the `run()` call and other states.
 
 ### C++ and other languages
 
-If you model code is written in C++ or other languages, you can also
+If your model code is written in C++ or other languages, you can also
 modify the `debug_options` field of `RunOptions` to generate debug dumps that
 can be inspected offline. See
 [the proto definition](https://www.tensorflow.org/code/tensorflow/core/protobuf/debug.proto)
@@ -547,10 +599,10 @@ python -m tensorflow.python.debug.cli.offline_analyzer \
        performance in a non-debugging session?_
 
 **A**: No. The debugger inserts additional special-purpose debug nodes to the
-       graph to record the values of intermediate tensors. These nodes certainly
+       graph to record the values of intermediate tensors. These nodes
        slow down the graph execution. If you are interested in profiling your
        model, check out
-       [tfprof](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tfprof)
+       [tfprof](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler)
        and other profiling tools for TensorFlow.
 
 **Q**: _How do I link tfdbg against my `Session` in Bazel? Why do I see an
@@ -591,7 +643,7 @@ only from the main thread?_
 **A**:
 This is a common use case, in which the `Session` object is used from multiple
 threads concurrently. Typically, the child threads take care of background tasks
-such as running enqueue operations. Oftentimes, you want to debug only the main
+such as running enqueue operations. Often, you want to debug only the main
 thread (or less frequently, only one of the child threads). You can use the
 `thread_name_filter` keyword argument of `LocalCLIDebugWrapperSession` to
 achieve this type of thread-selective debugging. For example, to debug from the
@@ -640,6 +692,12 @@ There are three possible workarounds or solutions:
    tfdbg> run --tensor_dtype_filter int.*
    ```
 
+   The first command above watches only nodes whose name match the
+   regular-expression pattern `.*hidden.*`. The second command watches only
+   operations whose name match the pattern `Variable.*`. The third one watches
+   only the tensors whose dtype match the pattern `int.*` (e.g., `int32`).
+
+
 **Q**: _Why can't I select text in the tfdbg CLI?_
 
 **A**: This is because the tfdbg CLI enables mouse events in the terminal by
@@ -664,9 +722,8 @@ sess.run(b)
        tensor `b` is effectively also a constant tensor. TensorFlow's graph
        optimization folds the graph that contains `a` and `b` into a single
        node to speed up future runs of the graph, which is why `tfdbg` does
-       not generate any intermedate-tensor dumps. If `a` were a
-       @{tf.Variable}, the constant-folding would not occur and `tfdbg`
-       should show the intermeidate-tensor dumps. For example:
+       not generate any intermediate tensor dumps. However, if `a` were a
+       @{tf.Variable}, as in the following example:
 
 ``` python
 import numpy as np
@@ -678,3 +735,6 @@ sess.run(tf.global_variables_initializer())
 sess = tf_debug.LocalCLIDebugWrapperSession(sess)
 sess.run(b)
 ```
+
+the constant-folding would not occur and `tfdbg` should show the intermediate
+tensor dumps.
diff --git a/tensorflow/docs_src/programmers_guide/embedding.md b/tensorflow/docs_src/programmers_guide/embedding.md
index 975850349f0e29f698dd36060ecdf51cf29e87bb..4095c6c97a4703bdf16e8feceaacdefaa50488b3 100644
--- a/tensorflow/docs_src/programmers_guide/embedding.md
+++ b/tensorflow/docs_src/programmers_guide/embedding.md
@@ -1,12 +1,16 @@
 # Embeddings
 
-[TOC]
+This document introduces the concept of embeddings, gives a simple example of
+how to train an embedding in TensorFlow, and explains how to view embeddings
+with the TensorBoard Embedding Projector. The first two parts target newcomers
+to machine learning or TensorFlow, and the Embedding Projector how-to is for
+users at all levels.
 
-## Introduction
+[TOC]
 
-An embedding is a mapping from discrete objects, such as words, to vectors of
-real numbers. For example, a 300-dimensional embedding for English words could
-include:
+An **embedding** is a mapping from discrete objects, such as words, to vectors
+of real numbers. For example, a 300-dimensional embedding for English words
+could include:
 
 ```
 blue:  (0.01359, 0.00075997, 0.24608, ..., -0.2524, 1.0048, 0.06259)
@@ -15,19 +19,24 @@ orange:  (-0.24776, -0.12359, 0.20986, ..., 0.079717, 0.23865, -0.014213)
 oranges:  (-0.35609, 0.21854, 0.080944, ..., -0.35413, 0.38511, -0.070976)
 ```
 
-Embeddings let you apply machine learning to discrete inputs. Classifiers, and
-neural networks more generally, are designed to work with dense continuous
-vectors, where all values contribute to define what an object is.  If discrete
-objects are naively encoded as discrete atoms, e.g., unique id numbers, they
-hinder learning and generalization. One way to think of embeddings is as a way
-to transform non-vector objects into useful inputs for machine learning.
-
-Embeddings are also useful as outputs of machine learning. Because embeddings
-map objects to vectors, applications can use similarity in vector space (e.g.,
-Euclidean distance or the angle between vectors) as a robust and flexible
-measure of object similarity. One common use is to find nearest neighbors.
-Using the same word embeddings above, for instance, here are the three nearest
-neighbors for each word and the corresponding angles (in degrees):
+The individual dimensions in these vectors typically have no inherent meaning.
+Instead, it's the overall patterns of location and distance between vectors
+that machine learning takes advantage of.
+
+Embeddings are important for input to machine learning. Classifiers, and neural
+networks more generally, work on vectors of real numbers. They train best on
+dense vectors, where all values contribute to define an object. However, many
+important inputs to machine learning, such as words of text, do not have a
+natural vector representation. Embedding functions are the standard and
+effective way to transform such discrete input objects into useful
+continuous vectors.
+
+Embeddings are also valuable as outputs of machine learning. Because embeddings
+map objects to vectors, applications can use similarity in vector space (for
+instance, Euclidean distance or the angle between vectors) as a robust and
+flexible measure of object similarity. One common use is to find nearest
+neighbors.  Using the same word embeddings as above, for instance, here are the
+three nearest neighbors for each word and the corresponding angles:
 
 ```
 blue:  (red, 47.6°), (yellow, 51.9°), (purple, 52.4°)
@@ -39,138 +48,151 @@ oranges:  (apples, 45.3°), (lemons, 48.3°), (mangoes, 50.4°)
 This would tell an application that apples and oranges are in some way more
 similar (45.3° apart) than lemons and oranges (48.3° apart).
 
-## Training an Embedding
+## Embeddings in TensorFlow
 
-To train word embeddings in TensorFlow, we first need to split the text into
-words and assign an integer to every word in the vocabulary. Let us assume that
+To create word embeddings in TensorFlow, we first split the text into words
+and then assign an integer to every word in the vocabulary. Let us assume that
 this has already been done, and that `word_ids` is a vector of these integers.
 For example, the sentence “I have a cat.” could be split into
 `[“I”, “have”, “a”, “cat”, “.”]` and then the corresponding `word_ids` tensor
-would have shape `[5]` and consist of 5 integers. To get these word ids
-embedded, we need to create the embedding variable and use the `tf.gather`
-function as follows:
+would have shape `[5]` and consist of 5 integers. To map these word ids
+to vectors, we need to create the embedding variable and use the
+`tf.nn.embedding_lookup` function as follows:
 
 ```
 word_embeddings = tf.get_variable(“word_embeddings”,
     [vocabulary_size, embedding_size])
-embedded_word_ids = tf.gather(word_embeddings, word_ids)
+embedded_word_ids = tf.nn.embedding_lookup(word_embeddings, word_ids)
 ```
 
 After this, the tensor `embedded_word_ids` will have shape `[5, embedding_size]`
 in our example and contain the embeddings (dense vectors) for each of the 5
-words. The variable `word_embeddings` will be learned and at the end of the
-training it will contain the embeddings for all words in the vocabulary.
-The embeddings can be trained in many ways, depending on the data available.
-For example, one could use a recurrent neural network to predict the next word
-from the previous one given a large corpus of sentences, or one could train
-two networks to do multi-lingual translation. These methods are described in
-[Vector Representations of Words](../tutorials/word2vec.md) tutorial, but in
-all cases there is an embedding variable like above and words are embedded
-using `tf.gather`, as shown.
+words. At the end of training, `word_embeddings` will contain the embeddings
+for all words in the vocabulary.
 
-## Visualizing Embeddings
-
-TensorBoard has a built-in visualizer, called the <i>Embedding Projector</i>,
-for interactive visualization of embeddings. The embedding projector will read
-the embeddings from your checkpoint file and project them into 3 dimensions using
-[principal component analysis](https://en.wikipedia.org/wiki/Principal_component_analysis).
-For a visual explanation of PCA, see
-[this article](http://setosa.io/ev/principal-component-analysis/). Another
-very useful projection you can use is
-[t-SNE](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding).
-
-If you are working with an embedding, you'll probably want to attach
-labels/images to the data points. You can do this by generating a
-[metadata file](#metadata) containing the labels for each point and configuring
-the projector either by using our Python API, or manually constructing and
-saving a
-<code>[projector_config.pbtxt](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/plugins/projector/projector_config.proto)</code>
-in the same directory as your checkpoint file.
-
-### Setup
-
-For in depth information on how to run TensorBoard and make sure you are
-logging all the necessary information, see
-[TensorBoard: Visualizing Learning](../get_started/summaries_and_tensorboard.md).
-
-To visualize your embeddings, there are 3 things you need to do:
-
-1) Setup a 2D tensor that holds your embedding(s).
-
-```python
-embedding_var = tf.get_variable(....)
-```
+Embeddings can be trained in many network types, and with various loss
+functions and data sets. For example, one could use a recurrent neural network
+to predict the next word from the previous one given a large corpus of
+sentences, or one could train two networks to do multi-lingual translation.
+These methods are described in the @{$word2vec$Vector Representations of Words}
+tutorial.
 
-2) Periodically save your model variables in a checkpoint in
-<code>LOG_DIR</code>.
-
-```python
-saver = tf.train.Saver()
-saver.save(session, os.path.join(LOG_DIR, "model.ckpt"), step)
-```
-
-3) (Optional) Associate metadata with your embedding.
-
-If you have any metadata (labels, images) associated with your embedding, you
-can tell TensorBoard about it either by directly storing a
-<code>[projector_config.pbtxt](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/plugins/projector/projector_config.proto)</code>
-in the <code>LOG_DIR</code>, or use our python API.
-
-For instance, the following <code>projector_config.ptxt</code> associates the
-<code>word_embedding</code> tensor with metadata stored in <code>$LOG_DIR/metadata.tsv</code>:
+## Visualizing Embeddings
 
-```
-embeddings {
-  tensor_name: 'word_embedding'
-  metadata_path: '$LOG_DIR/metadata.tsv'
-}
-```
+TensorBoard includes the **Embedding Projector**, a tool that lets you
+interactively visualize embeddings. This tool can read embeddings from your
+model and render them in two or three dimensions.
 
-The same config can be produced programmatically using the following code snippet:
+The Embedding Projector has three panels:
 
-```python
-from tensorflow.contrib.tensorboard.plugins import projector
+- *Data panel* on the top left, where you can choose the run, the embedding
+  variable and data columns to color and label points by.
+- *Projections panel* on the bottom left, where you can choose the type of
+  projection.
+- *Inspector panel* on the right side, where you can search for particular
+  points and see a list of nearest neighbors.
 
-# Create randomly initialized embedding weights which will be trained.
-vocabulary_size = 10000
-embedding_size = 200
-embedding_var = tf.get_variable('word_embedding', [vocabulary_size, embedding_size])
+### Projections
+The Embedding Projector provides three ways to reduce the dimensionality of a
+data set.
+
+- *[t-SNE](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding)*:
+  a nonlinear nondeterministic algorithm (T-distributed stochastic neighbor
+  embedding) that tries to preserve local neighborhoods in the data, often at
+  the expense of distorting global structure. You can choose whether to compute
+  two- or three-dimensional projections.
+
+- *[PCA](https://en.wikipedia.org/wiki/Principal_component_analysis)*:
+  a linear deterministic algorithm (principal component analysis) that tries to
+  capture as much of the data variability in as few dimensions as possible. PCA
+  tends to highlight large-scale structure in the data, but can distort local
+  neighborhoods. The Embedding Projector computes the top 10 principal
+  components, from which you can choose two or three to view.
+
+- *Custom*: a linear projection onto horizontal and vertical axes that you
+  specify using labels in the data. You define the horizontal axis, for
+  instance, by giving text patterns for "Left" and "Right". The Embedding
+  Projector finds all points whose label matches the "Left" pattern and
+  computes the centroid of that set; similarly for "Right".  The line passing
+  through these two centroids defines the horizontal axis. The vertical axis is
+  likewise computed from the centroids for points matching the "Up" and "Down"
+  text patterns.
+
+Further useful articles are
+[How to Use t-SNE Effectively](distill.pub/2016/misread-tsne/) and
+[Principal Component Analysis Explained Visually](http://setosa.io/ev/principal-component-analysis/).
+
+### Exploration
+
+You can explore visually by zooming, rotating, and panning using natural
+click-and-drag gestures. Hovering your mouse over a point will show any
+[metadata](#metadata) for that point.  You can also inspect nearest-neighbor
+subsets.  Clicking on a point causes the right pane to list the nearest
+neighbors, along with distances to the current point. The nearest-neighbor
+points are also highlighted in the projection.
+
+It is sometimes useful to restrict the view to a subset of points and perform
+projections only on those points. To do so, you can select points in multiple
+ways:
+
+- After clicking on a point, its nearest neighbors are also selected.
+- After a search, the points matching the query are selected.
+- Enabling selection, clicking on a point and dragging defines a selection
+  sphere.
+
+Then click the "Isolate *nnn* points" button at the top of the Inspector pane
+on the right hand side. The following image shows 101 points selected and ready
+for the user to click "Isolate 101 points":
 
-# Format: tensorflow/tensorboard/plugins/projector/projector_config.proto
-config = projector.ProjectorConfig()
+![Selection of nearest neighbors](https://www.tensorflow.org/images/embedding-nearest-points.png "Selection of nearest neighbors")
 
-# You can add multiple embeddings. Here we add only one.
-embedding = config.embeddings.add()
-embedding.tensor_name = embedding_var.name
-# Link this tensor to its metadata file (e.g. labels).
-embedding.metadata_path = os.path.join(LOG_DIR, 'metadata.tsv')
+*Selection of the nearest neighbors of “important” in a word embedding dataset.*
 
-# Use the same LOG_DIR where you stored your checkpoint.
-summary_writer = tf.summary.FileWriter(LOG_DIR)
+Advanced tip: filtering with custom projection can be powerful. Below, we
+filtered the 100 nearest neighbors of “politics” and projected them onto the
+“worst” - “best” vector as an x axis. The y axis is random. As a result, one
+finds on the right side “ideas”, “science”, “perspective”, “journalism” but on
+the left “crisis”, “violence” and “conflict”.
 
-# The next line writes a projector_config.pbtxt in the LOG_DIR. TensorBoard will
-# read this file during startup.
-projector.visualize_embeddings(summary_writer, config)
-```
+<table width="100%;">
+  <tr>
+    <td style="width: 30%;">
+      <img src="https://www.tensorflow.org/images/embedding-custom-controls.png" alt="Custom controls panel" title="Custom controls panel" />
+    </td>
+    <td style="width: 70%;">
+      <img src="https://www.tensorflow.org/images/embedding-custom-projection.png" alt="Custom projection" title="Custom projection" />
+    </td>
+  </tr>
+  <tr>
+    <td style="width: 30%;">
+      Custom projection controls.
+    </td>
+    <td style="width: 70%;">
+      Custom projection of neighbors of "politics" onto "best" - "worst" vector.
+    </td>
+  </tr>
+</table>
 
-After running your model and training your embeddings, run TensorBoard and point
-it to the <code>LOG_DIR</code> of the job.
+To share your findings, you can use the bookmark panel in the bottom right
+corner and save the current state (including computed coordinates of any
+projection) as a small file. The Projector can then be pointed to a set of one
+or more of these files, producing the panel below. Other users can then walk
+through a sequence of bookmarks.
 
-```python
-tensorboard --logdir=LOG_DIR
-```
+<img src="https://www.tensorflow.org/images/embedding-bookmark.png" alt="Bookmark panel" style="width:300px;">
 
-Then click on the *Embeddings* tab on the top pane
-and select the appropriate run (if there are more than one run).
+### Metadata
 
+If you are working with an embedding, you'll probably want to attach
+labels/images to the data points. You can do this by generating a metadata file
+containing the labels for each point and clicking "Load data" in the data panel
+of the Embedding Projector.
 
-### Metadata
-Usually embeddings have metadata associated with it (e.g. labels, images). The
-metadata should be stored in a separate file outside of the model checkpoint
-since the metadata is not a trainable parameter of the model. The format should
+The metadata can be either labels or images, which are
+stored in a separate file. For labels, the format should
 be a [TSV file](https://en.wikipedia.org/wiki/Tab-separated_values)
-(tab characters shown in red) with the first line containing column headers
-(shown in bold) and subsequent lines contain the metadata values:
+(tab characters shown in red) whose first line contains column headers
+(shown in bold) and subsequent lines contain the metadata values. For example:
 
 <code>
 <b>Word<span style="color:#800;">\t</span>Frequency</b><br/>
@@ -179,24 +201,20 @@ be a [TSV file](https://en.wikipedia.org/wiki/Tab-separated_values)
   ...
 </code>
 
-There is no explicit key shared with the main data file; instead, the order in
-the metadata file is assumed to match the order in the embedding tensor. In
-other words, the first line is the header information and the (i+1)-th line in
-the metadata file corresponds to the i-th row of the embedding tensor stored in
-the checkpoint.
-
-Note: If the TSV metadata file has only a single column, then we don’t expect a
-header row, and assume each row is the label of the embedding. We include this
-exception because it matches the commonly-used "vocab file" format.
-
-### Images
-If you have images associated with your embeddings, you will need to
-produce a single image consisting of small thumbnails of each data point.
-This is known as the
-[sprite image](https://www.google.com/webhp#q=what+is+a+sprite+image).
-The sprite should have the same number of rows and columns with thumbnails
-stored in row-first order: the first data point placed in the top left and the
-last data point in the bottom right:
+The order of lines in the metadata file is assumed to match the order of
+vectors in the embedding variable, except for the header.  Consequently, the
+(i+1)-th line in the metadata file corresponds to the i-th row of the embedding
+variable.  If the TSV metadata file has only a single column, then we don’t
+expect a header row, and assume each row is the label of the embedding. We
+include this exception because it matches the commonly-used "vocab file"
+format.
+
+To use images as metadata, you must produce a single
+[sprite image](https://www.google.com/webhp#q=what+is+a+sprite+image),
+consisting of small thumbnails, one for each vector in the embedding.  The
+sprite should store thumbnails in row-first order: the first data point placed
+in the top left and the last data point in the bottom right, though the last
+row doesn't have to be filled, as shown below.
 
 <table style="border: none;">
 <tr style="background-color: transparent;">
@@ -216,120 +234,8 @@ last data point in the bottom right:
 </tr>
 </table>
 
-Note in the example above that the last row doesn't have to be filled. For a
-concrete example of a sprite, see
-[this sprite image](https://www.tensorflow.org/images/mnist_10k_sprite.png) of 10,000 MNIST digits
-(100x100).
-
-Note: We currently support sprites up to 8192px X 8192px.
-
-After constructing the sprite, you need to tell the Embedding Projector where
-to find it:
-
-
-```python
-embedding.sprite.image_path = PATH_TO_SPRITE_IMAGE
-# Specify the width and height of a single thumbnail.
-embedding.sprite.single_image_dim.extend([w, h])
-```
-
-### Interaction
-
-The Embedding Projector has three panels:
-
-1. *Data panel* on the top left, where you can choose the run, the embedding
-   tensor and data columns to color and label points by.
-2. *Projections panel* on the bottom left, where you choose the type of
-    projection (e.g. PCA, t-SNE).
-3. *Inspector panel* on the right side, where you can search for particular
-   points and see a list of nearest neighbors.
-
-### Projections
-The Embedding Projector has three methods of reducing the dimensionality of a
-data set: two linear and one nonlinear. Each method can be used to create either
-a two- or three-dimensional view.
-
-**Principal Component Analysis** A straightforward technique for reducing
-dimensions is Principal Component Analysis (PCA). The Embedding Projector
-computes the top 10 principal components. The menu lets you project those
-components onto any combination of two or three. PCA is a linear projection,
-often effective at examining global geometry.
-
-**t-SNE** A popular non-linear dimensionality reduction technique is t-SNE.
-The Embedding Projector offers both two- and three-dimensional t-SNE views.
-Layout is performed client-side animating every step of the algorithm. Because
-t-SNE often preserves some local structure, it is useful for exploring local
-neighborhoods and finding clusters. Although extremely useful for visualizing
-high-dimensional data, t-SNE plots can sometimes be mysterious or misleading.
-See this [great article](http://distill.pub/2016/misread-tsne/) for how to use
-t-SNE effectively.
-
-**Custom** You can also construct specialized linear projections based on text
-searches for finding meaningful directions in space. To define a projection
-axis, enter two search strings or regular expressions. The program computes the
-centroids of the sets of points whose labels match these searches, and uses the
-difference vector between centroids as a projection axis.
-
-### Navigation
-
-To explore a data set, you can navigate the views in either a 2D or a 3D mode,
-zooming, rotating, and panning using natural click-and-drag gestures.
-Clicking on a point causes the right pane to show an explicit textual list of
-nearest neighbors, along with distances to the current point. The
-nearest-neighbor points themselves are highlighted on the projection.
-
-Zooming into the cluster gives some information, but it is sometimes more
-helpful to restrict the view to a subset of points and perform projections only
-on those points. To do so, you can select points in multiple ways:
-
-1. After clicking on a point, its nearest neighbors are also selected.
-2. After a search, the points matching the query are selected.
-3. Enabling selection, clicking on a point and dragging defines a selection
-   sphere.
-
-After selecting a set of points, you can isolate those points for
-further analysis on their own with the "Isolate Points" button in the Inspector
-pane on the right hand side.
-
-
-![Selection of nearest neighbors](https://www.tensorflow.org/images/embedding-nearest-points.png "Selection of nearest neighbors")
-*Selection of the nearest neighbors of “important” in a word embedding dataset.*
-
-The combination of filtering with custom projection can be powerful. Below, we filtered
-the 100 nearest neighbors of “politics” and projected them onto the
-“best” - “worst” vector as an x axis. The y axis is random.
-
-You can see that on the right side we have “ideas”, “science”, “perspective”,
-“journalism” while on the left we have “crisis”, “violence” and “conflict”.
-
-<table width="100%;">
-  <tr>
-    <td style="width: 30%;">
-      <img src="https://www.tensorflow.org/images/embedding-custom-controls.png" alt="Custom controls panel" title="Custom controls panel" />
-    </td>
-    <td style="width: 70%;">
-      <img src="https://www.tensorflow.org/images/embedding-custom-projection.png" alt="Custom projection" title="Custom projection" />
-    </td>
-  </tr>
-  <tr>
-    <td style="width: 30%;">
-      Custom projection controls.
-    </td>
-    <td style="width: 70%;">
-      Custom projection of neighbors of "politics" onto "best" - "worst" vector.
-    </td>
-  </tr>
-</table>
-
-### Collaborative Features
-
-To share your findings, you can use the bookmark panel in the bottom right
-corner and save the current state (including computed coordinates of any
-projection) as a small file. The Projector can then be pointed to a set of one
-or more of these files, producing the panel below. Other users can then walk
-through a sequence of bookmarks.
-
-<img src="https://www.tensorflow.org/images/embedding-bookmark.png" alt="Bookmark panel" style="width:300px;">
+Follow [this link]("https://www.tensorflow.org/images/embedding-mnist.mp4" )
+to see a fun example of thumbnail images in the Embedding Projector.
 
 
 ## Mini-FAQ
@@ -348,5 +254,5 @@ displaying many properties that are dramatically different from what our human
 intuition has learned about 2- and 3-dimensional spaces.
 
 **Is an embedding the same as an embedding layer?**
-No; an embedding layer is a part of neural network, but an embedding is a more
+No. An *embedding layer* is a part of neural network, but an *embedding* is a more
 general concept.
diff --git a/tensorflow/docs_src/programmers_guide/estimators.md b/tensorflow/docs_src/programmers_guide/estimators.md
new file mode 100644
index 0000000000000000000000000000000000000000..a5724ea294e1bfbbceb2d70509de507774ddf15f
--- /dev/null
+++ b/tensorflow/docs_src/programmers_guide/estimators.md
@@ -0,0 +1,153 @@
+# Estimators
+
+This document introduces **Estimators**--a high-level TensorFlow API that
+greatly simplifies machine learning programming. Estimators encapsulate
+the following actions:
+
+*   training
+*   evaluation
+*   prediction
+*   export for serving
+
+You may either use the pre-made Estimators we provide or write your
+own custom Estimators.  All Estimators--whether pre-made or custom--are
+classes based on the `tf.estimator.Estimator` class.
+
+Note: TensorFlow also provides an Estimator class at
+`tf.contrib.learn.Estimator`, which you should not use.</aside>
+
+
+## Advantages of Estimators
+
+Estimators provide the following benefits:
+
+*   You can run Estimators-based models on a local host or on a
+    distributed multi-server environment without changing your model.
+    Furthermore, you can run Estimators-based models on CPUs, GPUs,
+    or TPUs without recoding your model.
+*   Estimators simplify sharing implementations between model developers.
+*   You can develop a state of the art model with high-level intuitive code,
+    In short, it is generally much easier to create models with Estimators
+    than with the low-level TensorFlow APIs.
+*   Estimators are themselves built on tf.layers, which
+    simplifies customization.
+*   Estimators build the graph for you.  In other words, you don't have to
+    build the graph.
+*   Estimators provide a safe distributed training loop that controls how and
+    when to:
+    *   build the graph
+    *   initialize variables
+    *   start queues
+    *   handle exceptions
+    *   create checkpoint files and recover from failures
+    *   save summaries for TensorBoard
+
+When writing an application with Estimators, you must separate the data input
+pipeline from the model.  This separation simplifies experiments with
+different data sets.
+
+
+## Pre-made Estimators
+
+Pre-made Estimators enable you to work at a much higher conceptual level
+than the base TensorFlow APIs. You no longer have to worry about creating
+the computational graph or sessions since Estimators handle all
+the "plumbing" for you.  That is, pre-made Estimators create and manage
+`Graph` and `Session` objects for you.  Furthermore, pre-made Estimators
+let you experiment with different model architectures by making only minimal
+code changes.  `DNNClassifier`, for example, is a pre-made Estimator class that
+trains classification models through dense, feed-forward neural networks.
+
+
+### Structure of a pre-made Estimators program
+
+A TensorFlow program relying on a pre-made Estimator typically consists
+of the following four steps:
+
+1.  **Write one or more dataset importing functions.** For example, you might
+    create one function to import the training set and another function to
+    import the test set. Each dataset importing function must return two
+    objects:
+
+    *   a dictionary in which the keys are feature column names and the
+        values are Tensors (or SparseTensors) containing the corresponding
+        feature data
+    *   a Tensor containing one or more labels
+
+    For example, the following code illustrates the basic skeleton for
+    an input function:
+
+        def input_fn(dataset):
+           ...  # manipulate dataset, extracting feature names and the label
+           return feature_dict, label
+
+    See @{$datasets$Using the `Dataset` API for TensorFlow Input Pipelines}
+    for full details.)
+
+2.  **Define the feature columns.** Each @{tf.feature_column}
+    identifies a feature name, its type, and any input pre-processing.
+    For example, the following snippet creates three feature
+    columns that hold integer or floating-point data.  The first two
+    feature columns simply identify the feature's name and type. The
+    third feature column also specifies a lambda the program will invoke
+    to scale the raw data:
+
+        # Define three numeric feature columns.
+        population = tf.feature_column.numeric_column('population')
+        crime_rate = tf.feature_column.numeric_column('crime_rate')
+        median_education = tf.feature_column.numeric_column('median_education',
+                            normalizer_fn='lambda x: x - global_education_mean')
+
+3.  **Instantiate the relevant pre-made Estimator.**  For example, here's
+    a sample instantiation of a pre-made Estimator named `LinearClassifier`:
+
+        # Instantiate an estimator, passing the feature columns.
+        estimator = tf.estimator.Estimator.LinearClassifier(
+            feature_columns=[population, crime_rate, median_education],
+            )
+
+4.  **Call a training, evaluation, or inference method.**
+    For example, all Estimators provide a `train` method, which trains a model.
+
+        # my_training_set is the function created in Step 1
+        estimator.train(input_fn=my_training_set, steps=2000)
+
+
+### Benefits of pre-made Estimators
+
+Pre-made Estimators encode best practices, providing the following benefits:
+
+*   Best practices for determining where different parts of the computational
+    graph should run, implementing strategies on a single machine or on a
+    cluster.
+*   Best practices for event (summary) writing and universally useful
+    summaries.
+
+If you don't use pre-made Estimators, you must implement the preceding
+features yourself.
+
+
+## Custom Estimators
+
+The heart of every Estimator--whether pre-made or custom--is its
+**model function**, which is a method that builds graphs for training,
+evaluation, and prediction. When you are using a pre-made Estimator,
+someone else has already implemented the model function. When relying
+on a custom Estimator, you must write the model function yourself. A
+${$extend/estimators$companion document)
+explains how to write the model function.
+
+
+## Recommended workflow
+
+We recommend the following workflow:
+
+1.  Assuming a suitable pre-made Estimator exists, use it to build your
+    first model and use its results to establish a baseline.
+2.  Build and test your overall pipeline, including the integrity and
+    reliability of your data with this pre-made Estimator.
+3.  If suitable alternative pre-made Estimators are available, run
+    experiments to determine which pre-made Estimator produces the
+    best results.
+4.  Possibly, further improve your model by building your own custom Estimator.
+
diff --git a/tensorflow/docs_src/programmers_guide/faq.md b/tensorflow/docs_src/programmers_guide/faq.md
index e31d2717a669bfaf7599dbdfa73f5f951490b954..56486a48b7adabcf785b80e36248dd2071cc538c 100644
--- a/tensorflow/docs_src/programmers_guide/faq.md
+++ b/tensorflow/docs_src/programmers_guide/faq.md
@@ -149,6 +149,8 @@ TensorFlow also has a
 to help build support for more client languages.  We invite contributions of new
 language bindings.
 
+Bindings for various other languages (such as [C#](https://github.com/migueldeicaza/TensorFlowSharp), [Julia](https://github.com/malmaud/TensorFlow.jl), [Ruby](https://github.com/somaticio/tensorflow.rb) and [Scala](https://github.com/eaplatanios/tensorflow_scala)) created and supported by the opensource community build on top of the C API supported by the TensorFlow maintainers.
+
 #### Does TensorFlow make use of all the devices (GPUs and CPUs) available on my machine?
 
 TensorFlow supports multiple GPUs and CPUs. See the how-to documentation on
@@ -174,8 +176,7 @@ for more information on how to use them.
 
 ## Variables
 
-See also the how-to documentation on @{$variables$variables}
-and @{$variable_scope$variable scopes}, and
+See also the how-to documentation on @{$variables$variables} and
 @{$python/state_ops$the API documentation for variables}.
 
 #### What is the lifetime of a variable?
diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
new file mode 100644
index 0000000000000000000000000000000000000000..b2313a4a638df73d65ae57a9683822c0246d9cd3
--- /dev/null
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -0,0 +1,619 @@
+# Graphs and Sessions
+
+TensorFlow uses a **dataflow graph** to represent your computation in terms of
+the dependencies between individual operations. This leads to a low-level
+programming model in which you first define the dataflow graph, then create a
+TensorFlow **session** to run parts of the graph across a set of local and
+remote devices.
+
+This guide will be most useful if you intend to use the low-level programming
+model directly. Higher-level APIs such as @{tf.estimator.Estimator} and Keras
+hide the details of graphs and sessions from the end user, but this guide may
+also be useful if you want to understand how these APIs are implemented.
+
+## Why dataflow graphs?
+
+![](../images/tensors_flowing.gif)
+
+[Dataflow](https://en.wikipedia.org/wiki/Dataflow_programming) is a common
+programming model for parallel computing. In a dataflow graph, the nodes
+represent units of computation, and the edges represent the data consumed or
+produced by a computation. For example, in a TensorFlow graph, the @{tf.matmul}
+operation would correspond to a single node with two incoming edges (the
+matrices to be multiplied) and one outgoing edge (the result of the
+multiplication).
+
+<!-- TODO(barryr): Add a diagram to illustrate the @{tf.matmul} graph. -->
+
+Dataflow has several advantages that TensorFlow leverages when executing your
+programs:
+
+* **Parallelism.** By using explicit edges to represent dependencies between
+  operations, it is easy for the system to identify operations that can execute
+  in parallel.
+
+* **Distributed execution.** By using explicit edges to represent the values
+  that flow between operations, it is possible for TensorFlow to partition your
+  program across multiple devices (CPUs, GPUs, and TPUs) attached to different
+  machines. TensorFlow inserts the necessary communication and coordination
+  between devices.
+
+* **Compilation.** TensorFlow's @{$performance/xla$XLA compiler} can
+  use the information in your dataflow graph to generate faster code, for
+  example, by fusing together adjacent operations.
+
+* **Portability.** The dataflow graph is a language-independent representation
+  of the code in your model. You can build a dataflow graph in Python, store it
+  in a [SavedModel](TODO), and restore it in a C++ program for low-latency
+  inference.
+
+## Building a @{tf.Graph}
+
+Most TensorFlow programs start with a dataflow graph construction phase. In this
+phase, you invoke TensorFlow API functions that construct new @{tf.Operation}
+(node) and @{tf.Tensor} (edge) objects and add them to a @{tf.Graph}
+instance. TensorFlow provides a **default graph** that is an implicit argument
+to all API functions in the same context.  For example:
+
+* Calling `tf.constant(42.0)` creates a single @{tf.Operation} that produces the
+  value `42.0`, adds it to the default graph, and returns a @{tf.Tensor} that
+  represents the value of the constant.
+
+* Calling `tf.matmul(x, y)` creates a single @{tf.Operation} that multiplies
+  the values of @{tf.Tensor} objects `x` and `y`, adds it to the default graph,
+  and returns a @{tf.Tensor} that represents the result of the multiplication.
+
+* Executing `v = tf.Variable(0)` adds to the graph a @{tf.Operation} that will
+  store a writeable tensor value that persists between @{tf.Session.run} calls.
+  The @{tf.Variable} object wraps this operation, and can be used [like a
+  tensor](#tensor-like-objects), which will read the current value of the
+  stored value. The @{tf.Variable} object also has methods such as
+  @{tf.Variable.assign$`assign`} and @{tf.Variable.assign_add$`assign_add`} that
+  create @{tf.Operation} objects that, when executed, update the stored value.
+  (See @{$programmers_guide/variables} for more information about variables.)
+
+* Calling @{tf.train.Optimizer.minimize} will add operations and tensors to the
+  default graph that calculate gradients, and return a @{tf.Operation} that,
+  when run, will apply those gradients to a set of variables.
+
+Most programs rely solely on the default graph. However,
+see [Dealing with multiple graphs](#dealing-with-multiple-graphs) for more
+advanced use cases. High-level APIs such as the @{tf.estimator.Estimator} API
+manage the default graph on your behalf, and--for example--may create different
+graphs for training and evaluation.
+
+Note: Calling most functions in the TensorFlow API merely adds operations
+and tensors to the default graph, but **does not** perform the actual
+computation. Instead, you compose these functions until you have a @{tf.Tensor}
+or @{tf.Operation} that represents the overall computation--such as performing
+one step of gradient descent--and then pass that object to a @{tf.Session} to
+perform the computation. See the section "Executing a graph in a @{tf.Session}"
+for more details.
+
+## Naming operations
+
+A @{tf.Graph} object defines a **namespace** for the @{tf.Operation} objects it
+contains. TensorFlow automatically chooses a unique name for each operation in
+your graph, but giving operations descriptive names can make your program easier
+to read and debug. The TensorFlow API provides two ways to override the name of
+an operation:
+
+* Each API function that creates a new @{tf.Operation} or returns a new
+  @{tf.Tensor} accepts an optional `name` argument. For example,
+  `tf.constant(42.0, name="answer")` creates a new @{tf.Operation} named
+  `"answer"` and returns a @{tf.Tensor} named `"answer:0"`. If the default graph
+  already contained an operation named `"answer"`, the TensorFlow would append
+  `"_1"`, `"_2"`, and so on to the name, in order to make it unique.
+
+* The @{tf.name_scope} function makes it possible to add a **name scope** prefix
+  to all operations created in a particular context. The current name scope
+  prefix is a `"/"`-delimited list of the names of all active @{tf.name_scope}
+  context managers. If a name scope has already been used in the current
+  context, TensorFlow appens `"_1"`, `"_2"`, and so on. For example: 
+
+  ```python
+  c_0 = tf.constant(0, name="c")  # => operation named "c"
+
+  # Already-used names will be "uniquified".
+  c_1 = tf.constant(2, name="c")  # => operation named "c_1"
+
+  # Name scopes add a prefix to all operations created in the same context.
+  with tf.name_scope("outer"):
+    c_2 = tf.constant(2, name="c")  # => operation named "outer/c"
+
+    # Name scopes nest like paths in a hierarchical file system.
+    with tf.name_scope("inner"):
+      c_3 = tf.constant(3, name="c")  # => operation named "outer/inner/c"
+
+    # Exiting a name scope context will return to the previous prefix.
+    c_4 = tf.constant(4, name="c")  # => operation named "outer/c_1"
+
+    # Already-used name scopes will be "uniquified".
+    with tf.name_scope("inner"):
+      c_5 = tf.constant(5, name="c")  # => operation named "outer/inner_1/c"
+  ```
+
+The graph visualizer uses name scopes to group operations and reduce the visual
+complexity of a graph. See [Visualizing your graph](#visualizing-your-graph) for
+more information.
+
+Note that @{tf.Tensor} objects are implicitly named after the @{tf.Operation}
+that produces the tensor as output. A tensor name has the form `"<OP_NAME>:<i>"`
+where:
+
+* `"<OP_NAME>"` is the name of the operation that produces it.
+* `"<i>"` is an integer representing the index of that tensor among the
+  operation's outputs.
+
+## Placing operations on different devices
+
+If you want your TensorFlow program to use multiple different devices, the
+@{tf.device} function provides a convenient way to request that all operations
+created in a particular context are placed on the same device (or type of
+device).
+
+A **device specification** has the following form:
+
+```
+/job:<JOB_NAME>/task:<TASK_INDEX>/device:<DEVICE_TYPE>:<DEVICE_INDEX>
+```
+
+where:
+
+* `<JOB_NAME>` is an alpha-numeric string that does not start with a number.
+* `<DEVICE_TYPE>` is a registered device type (such as `GPU` or `CPU`).
+* `<TASK_INDEX>` is a non-negative integer representing the index of the task
+  in the job named `<JOB_NAME>`. See @{tf.train.ClusterSpec} for an explanation
+  of jobs and tasks.
+* `<DEVICE_INDEX>` is a non-negative integer representing the index of the
+  device, for example, to distinguish between different GPU devices used in the
+  same process.
+
+You do not need to specify every part of a device specification. For example,
+if you are running in a single-machine configuration with a single GPU, you
+might use @{tf.device} to pin some operations to the CPU and GPU:
+
+```python
+# Operations created outside either context will run on the "best possible"
+# device. For example, if you have a GPU and a CPU available, and the operation
+# has a GPU implementation, TensorFlow will choose the GPU.
+weights = tf.random_normal(...)
+
+with tf.device("/device:CPU:0"):
+  # Operations created in this context will be pinned to the CPU.
+  img = tf.decode_jpeg(tf.read_file("img.jpg"))
+
+with tf.device("/device:GPU:0"):
+  # Operations created in this context will be pinned to the GPU.
+  result = tf.matmul(weights, img)
+```
+
+If you are deploying TensorFlow in a @{$deploy/distributed$typical distributed
+configuration}, you might specify the job name and task ID to place variables on
+a task in the parameter server job (`"/job:ps"`), and the other operations on
+task in the worker job (`"/job:worker"`):
+
+```python
+with tf.device("/job:ps/task:0"):
+  weights_1 = tf.Variable(tf.truncated_normal([784, 100]))
+  biases_1 = tf.Variable(tf.zeroes([100]))
+
+with tf.device("/job:ps/task:1"):
+  weights_2 = tf.Variable(tf.truncated_normal([100, 10]))
+  biases_2 = tf.Variable(tf.zeroes([10]))
+
+with tf.device("/job:worker"):
+  layer_1 = tf.matmul(train_batch, weights_1) + biases_1
+  layer_2 = tf.matmul(train_batch, weights_2) + biases_2
+```
+
+@{tf.device} gives you a lot of flexibility to choose placements for individual
+operations or broad regions of a TensorFlow graph. In many cases, there are
+simple heuristics that work well. For example, the
+@{tf.train.replica_device_setter} API can be used with @{tf.device} to place
+operations for **data-parallel distributed training**. For example, the
+following code fragment shows how @{tf.train.replica_device_setter} applies
+different placement policies to @{tf.Variable} objects and other operations:
+
+```python
+with tf.device(tf.train.replica_device_setter(ps_tasks=3)):
+  # tf.Variable objects are, by default, placed on tasks in "/job:ps" in a
+  # round-robin fashion.
+  w_0 = tf.Variable(...)  # placed on "/job:ps/task:0"
+  b_0 = tf.Variable(...)  # placed on "/job:ps/task:1"
+  w_1 = tf.Variable(...)  # placed on "/job:ps/task:2"
+  b_1 = tf.Variable(...)  # placed on "/job:ps/task:0"
+
+  input_data = tf.placeholder(tf.float32)     # placed on "/job:worker"
+  layer_0 = tf.matmul(input_data, w_0) + b_0  # placed on "/job:worker"
+  layer_1 = tf.matmul(layer_0, w_1) + b_1     # placed on "/job:worker"
+```
+
+## Tensor-like objects
+
+Many TensorFlow operations take one or more @{tf.Tensor} objects as arguments.
+For example, @{tf.matmul} takes two @{tf.Tensor} objects, and @{tf.add_n} takes
+a list of `n` @{tf.Tensor} objects. For convenience, these functions will accept
+a **tensor-like object** in place of a @{tf.Tensor}, and implicitly convert it
+to a @{tf.Tensor} using the @{tf.convert_to_tensor} method. Tensor-like objects
+include elements of the following types:
+
+* @{tf.Tensor}
+* @{tf.Variable}
+* [`numpy.ndarray`](https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.html)
+* `list` (and lists of tensor-like objects)
+* Scalar Python types: `bool`, `float`, `int`, `str`
+
+You can register additional tensor-like types using
+@{tf.register_tensor_conversion_function}.
+
+Note: By default, TensorFlow will create a new @{tf.Tensor} each time you use
+the same tensor-like object. If the tensor-like object is large (e.g. a
+`numpy.ndarray` containing a set of training examples) and you use it multiple
+times, you may run out of memory. To avoid this, manually call
+@{tf.convert_to_tensor} on the tensor-like object once and use the returned
+@{tf.Tensor} instead.
+
+## Executing a graph in a @{tf.Session}
+
+TensorFlow uses the @{tf.Session} class to represent a connection between the
+client program---typically a Python program, although a similar interface is
+available in other languages---and the C++ runtime. A @{tf.Session} object
+provides access to devices in the local machine, and remote devices using the
+distributed TensorFlow runtime. It also caches information about your
+@{tf.Graph} so that you can efficiently run the same computation multiple times.
+
+### Creating a @{tf.Session}
+
+If you are using the low-level TensorFlow API, you can create a @{tf.Session}
+for the current default graph as follows:
+
+```python
+# Create a default in-process session.
+with tf.Session() as sess:
+  # ...
+
+# Create a remote session.
+with tf.Session("grpc://example.org:2222"):
+  # ...
+```
+
+Since a @{tf.Session} owns physical resources (such as GPUs and
+network connections), it is typically used as a context manager (in a `with`
+block) that automatically closes the session when you exit the block. It is
+also possible to create a session without using a `with` block, but you should
+explicitly call @{tf.Session.close} when you are finished with it to free the
+resources.
+
+Note: Higher-level APIs such as @{tf.train.MonitoredTrainingSession} or
+@{tf.estimator.Estimator} will create and manage a @{tf.Session} for you. These
+APIs accept optional `target` and `config` arguments (either directly, or as
+part of a @{tf.estimator.RunConfig} object), with the same meaning as
+described below.
+
+@{tf.Session.__init__} accepts three optional arguments:
+
+* **`target`.** If this argument is left empty (the default), the session will
+  only use devices in the local machine. However, you may also specify a
+  `grpc://` URL to specify the address of a TensorFlow server, which gives the
+  session access to all devices on machines that that server controls. See 
+  @{tf.train.Server} for details of how to create a TensorFlow
+  server. For example, in the common **between-graph replication**
+  configuration, the @{tf.Session} connects to a @{tf.train.Server} in the same
+  process as the client. The [distributed TensorFlow](../deploy/distributed.md)
+  deployment guide describes other common scenarios.
+
+* **`graph`.** By default, a new @{tf.Session} will be bound to---and only able
+  to run operations in---the current default graph. If you are using multiple
+  graphs in your program (see [Programming with multiple
+  graphs](programming-with-multiple-graphs) for more details), you can specify
+  an explicit @{tf.Graph} when you construct the session.
+
+* **`config`.** This argument allows you to specify a @{tf.ConfigProto} that
+  controls the behavior of the session. For example, some of the configuration
+  options include:
+
+  * `allow_soft_placement`. Set this to `True` to enable a "soft" device
+    placement algorithm, which ignores @{tf.device} annotations that attempt
+    to place CPU-only operations on a GPU device, and places them on the CPU
+    instead.
+
+  * `cluster_def`. When using distributed TensorFlow, this option allows you
+    to specify what machines to use in the computation, and provide a mapping
+    between job names, task indices, and network addresses. See
+    @{tf.train.ClusterSpec.as_cluster_def} for details.
+
+  * `graph_options.optimizer_options`. Provides control over the optimizations
+    that TensorFlow performs on your graph before executing it.
+
+  * `gpu_options.allow_growth`. Set this to `True` to change the GPU memory
+    allocator so that it gradually increases the amount of memory allocated,
+    rather than allocating most of the memory at startup.
+
+
+### Using @{tf.Session.run} to execute operations
+
+The @{tf.Session.run} method is the main mechanism for running a @{tf.Operation}
+or evaluating a @{tf.Tensor}. You can pass one or more @{tf.Operation} or
+@{tf.Tensor} objects to @{tf.Session.run}, and TensorFlow will execute the
+operations that are needed to compute the result.
+
+@{tf.Session.run} requires you to specify a list of **fetches**, which determine
+the return values, and may be a @{tf.Operation}, a @{tf.Tensor}, or
+a [tensor-like type](#tensor-like-objects) such as @{tf.Variable}. These fetches
+determine what **subgraph** of the overall @{tf.Graph} must be executed to
+produce the result: this is the subgraph that contains all operations named in
+the fetch list, plus all operations whose outputs are used to compute the value
+of the fetches. For example, the following code fragment shows how different
+arguments to @{tf.Session.run} cause different subgraphs to be executed:
+
+```python
+x = tf.constant([[37.0, -23.0], [1.0, 4.0]])
+w = tf.Variable(tf.random_uniform([2, 2]))
+y = tf.matmul(x, w)
+output = tf.nn.softmax(y)
+init_op = w.initializer
+
+with tf.Session() as sess:
+  # Run the initializer on `w`.
+  sess.run(init_op)
+
+  # Evaluate `output`. `sess.run(output)` will return a NumPy array containing
+  # the result of the computation.
+  print(sess.run(output))
+
+  # Evaluate `y` and `output`. Note that `y` will only be computed once, and its
+  # result used both to return `y_val` and as an input to the `tf.nn.softmax()`
+  # op. Both `y_val` and `output_val` will be NumPy arrays.
+  y_val, output_val = sess.run([y, output])
+```
+
+@{tf.Session.run} also optionally takes a dictionary of **feeds**, which is a
+mapping from @{tf.Tensor} objects (typically @{tf.placeholder} tensors) to
+values (typically Python scalars, lists, or NumPy arrays) that will be
+substituted for those tensors in the execution. For example:
+
+```python
+# Define a placeholder that expects a vector of three floating-point values,
+# and a computation that depends on it.
+x = tf.placeholder(tf.float32, shape=[3])
+y = tf.square(x)
+
+with tf.Session() as sess:
+  # Feeding a value changes the result that is returned when you evaluate `y`.
+  print(sess.run(y, {x: [1.0, 2.0, 3.0]})  # => "[1.0, 4.0, 9.0]"
+  print(sess.run(y, {x: [0.0, 0.0, 5.0]})  # => "[0.0, 0.0, 25.0]"
+
+  # Raises `tf.errors.InvalidArgumentError`, because you must feed a value for
+  # a `tf.placeholder()` when evaluating a tensor that depends on it.
+  sess.run(y)
+
+  # Raises `ValueError`, because the shape of `37.0` does not match the shape
+  # of placeholder `x`.
+  sess.run(y, {x: 37.0})
+```
+
+@{tf.Session.run} also accepts an optional `options` argument that enables you
+to specify options about the call, and an optional `run_metadata` argument that
+enables you to collect metadata about the execution. For example, you can use
+these options together to collect tracing information about the execution:
+
+```
+y = tf.matmul([[37.0, -23.0], [1.0, 4.0]], tf.random_uniform([2, 2]))
+
+with tf.Session() as sess:
+  # Define options for the `sess.run()` call.
+  options = tf.RunOptions()
+  options.output_partition_graphs = True
+  options.trace_level = tf.RunOptions.FULL_TRACE
+
+  # Define a container for the returned metadata.
+  metadata = tf.RunMetadata()
+
+  sess.run(y, options=options, run_metadata=metadata)
+
+  # Print the subgraphs that executed on each device.
+  print(metadata.partition_graphs)
+
+  # Print the timings of each operation that executed.
+  print(metadata.step_stats)
+```
+
+## `GraphDef` and `MetaGraphDef`
+
+TensorFlow uses a dataflow graph as a portable representation for your
+application. A @{tf.Graph} contains two relevant kinds of information:
+
+* **Graph structure.** The nodes and edges of the graph, indicating how
+  individual operations are composed together, but not prescribing how they
+  should be used. The graph structure is like assembly code: inspecting it can
+  convey some useful information, but it does not contain all of the useful
+  context that source code conveys.
+
+* **Graph collections.** TensorFlow provides a general mechanism for storing
+  collections of metadata in a @{tf.Graph}. The @{tf.add_to_collection} function
+  enables you to associate a list of objects with a key (where @{tf.GraphKeys}
+  defines some of the standard keys), and @{tf.get_collection} enables you to
+  look up all objects associated with a key. Many parts of the TensorFlow
+  library use this facility: for example, when you create a @{tf.Variable}, it
+  is added by default to collections representing "global variables" and
+  "trainable variables". When you later come to create a @{tf.train.Saver} or
+  @{tf.train.Optimizer}, the variables in these collections are used as the
+  default arguments.
+
+A @{tf.Graph} can be saved in two forms:
+
+* @{tf.GraphDef}: This is a low-level representation of the graph structure,
+  containing a description of all of its operations (as @{tf.NodeDef} protocol
+  buffers) and the edges between them. The @{tf.GraphDef} representation is
+  primarily used with low-level APIs, such as the `tensorflow::Session` C++
+  API, and it typically requires additional context (such as the names of
+  particular operations) to make use of it. The @{tf.Graph.as_graph_def} method
+  converts a @{tf.Graph} to a @{tf.GraphDef}.
+
+* `tf.train.MetaGraphDef`: This is a higher-level representation of a dataflow
+  graph, which includes a @{tf.GraphDef}, and information that helps to
+  understand the graph (such as the contents of the graph collections). The
+  @{tf.train.export_meta_graph} function converts a @{tf.Graph} to a
+  `tf.train.MetaGraphDef`. The @{tf.train.Saver.save} method also writes a
+  `tf.train.MetaGraphDef` that can be used in conjunction with the saved
+  checkpoint to restore the state of a training process at the point it was
+  saved.
+
+In most cases, we encourage you to use `tf.train.MetaGraphDef` instead of
+@{tf.GraphDef}. There are cases where a @{tf.GraphDef} can be useful---for
+example, when performing low-level graph modifications using functions like
+@{tf.import_graph_def} or
+the
+[Graph Transform](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/graph_transforms/README.md) tool---but
+`tf.train.MetaGraphDef` is a better building block for high-level applications.
+For example the [SavedModel library](TODO) uses `tf.train.MetaGraphDef` to
+package up a @{tf.Graph} and a set of trained model parameters so that it can be
+used for serving.
+
+If you have a `tf.train.MetaGraphDef`, the @{tf.train.import_meta_graph}
+function will load it into the default graph. Calling this function has two
+main features:
+
+1. It will restore the contents of the graph collections from the original
+   graph. APIs such as @{tf.global_variables} and the default arguments to
+   APIs like @{tf.train.Optimizer.minimize} will work the same way as they
+   did in the original graph.
+
+2. The function returns a @{tf.train.Saver}, which can be used to restore the
+   state (trained parameters, etc.) associated with the graph from a checkpoint.
+   The @{tf.train.latest_checkpoint} function can help to find the latest
+   checkpoint from a particular checkpoint directory.
+
+If you have a @{tf.GraphDef}, the @{tf.import_graph_def} function enables you
+to load the graph into an existing Python @{tf.Graph} object. To make use of the
+imported graph, you must know the names of operations or tensors in the
+@{tf.GraphDef}. The @{tf.import_graph_def} function has two main features to
+help you use the imported graph:
+
+1. You can **rebind** tensors in the imported graph to @{tf.Tensor} objects in
+   the default graph by passing the optional `input_map` argument. For example,
+   `input_map` enables you to take import a graph fragment defined in a
+   @{tf.GraphDef}, and statically connect tensors in the graph you are
+   building to @{tf.placeholder} tensors in that fragment.
+
+2. You can **return** @{tf.Tensor} or @{tf.Operation} objects from the imported
+   graph by passing their names in the `return_elements` list.
+
+In addition, you can use @{tf.device} and @{tf.name_scope} to control the
+device placement and name of the imported nodes.
+
+## Visualizing your graph
+
+TensorFlow includes tools that can help you to understand the code in a graph.
+The **graph visualizer** is a component of TensorBoard that renders the
+structure of your graph visually in a browser. The easiest way to create a
+visualization is to pass a @{tf.Graph} when creating the
+@{tf.summary.FileWriter}:
+
+```python
+# Build your graph.
+x = tf.constant([[37.0, -23.0], [1.0, 4.0]])
+w = tf.Variable(tf.random_uniform([2, 2]))
+y = tf.matmul(x, w)
+# ...
+loss = ...
+train_op = tf.train.AdagradOptimizer(0.01).minimize(loss)
+
+with tf.Session() as sess:
+  # `sess.graph` provides access to the graph used in a `tf.Session`.
+  writer = tf.summary.FileWriter("/tmp/log/...", sess.graph)
+
+  # Perform your computation...
+  for i in range(1000):
+    sess.run(train_op)
+    # ...
+
+  writer.close()
+```
+
+Note: If you are using a @{tf.estimator.Estimator}, the graph (and any
+summaries) will be logged automatically to the `model_dir` that you specified
+when creating the estimator.
+
+You can then open the log in `tensorboard`, navigate to the "Graph" tab, and
+see a high-level visualization of your graph's structure. Note that a typical
+TensorFlow graph---especially training graphs with automatically computed
+gradients---has too many nodes to visualize at once. The graph visualizer makes
+use of name scopes to group related operations into "super" nodes. You can
+click on the orange "+" button on any of these super nodes to expand the
+subgraph inside.
+
+![](../images/mnist_deep.png)
+
+For more information about visualizing your TensorFlow application with
+TensorBoard, see the [TensorBoard tutorial](TODO).
+
+## Programming with multiple graphs
+
+Note: When training a model, a common way of organizing your code is to use one
+graph for training your model, and a separate graph for evaluating or performing
+inference with a trained model. In many cases, the inference graph will be
+different from the training graph: for example, techniques like dropout and
+batch normalization use different operations in each case. Furthermore, by
+default utilities like @{tf.train.Saver} use the names of @{tf.Variable} objects
+(which have names based on an underlying @{tf.Operation}) to identify each
+variable in a saved checkpoint. When programming this way, you can either use
+completely separate Python processes to build and execute the graphs, or you can
+use multiple graphs in the same process. This section describes how to use
+multiple graphs in the same process.
+
+As noted above, TensorFlow provides a "default graph" that is implicitly passed
+to all API functions in the same context. For many applications, a single graph
+is sufficient. However, TensorFlow also provides methods for manipulating
+the default graph, which can be useful in more advanced used cases. For example:
+
+* A @{tf.Graph} defines the namespace for @{tf.Operation} objects: each
+  operation in a single graph must have a unique name. TensorFlow will
+  "uniquify" the names of operations by appending `"_1"`, `"_2"`, and so on to
+  their names if the requested name is already taken. Using multiple explicitly
+  created graphs gives you more control over what name is given to each
+  operation.
+
+* The default graph stores information about every @{tf.Operation} and
+  @{tf.Tensor} that was ever added to it. If your program creates a large number
+  of unconnected subgraphs, it may be more efficient to use a different 
+  @{tf.Graph} to build each subgraph, so that unrelated state can be garbage
+  collected.
+
+You can install a different @{tf.Graph} as the default graph, using the
+@{tf.Graph.as_default} context manager:
+
+```python
+g_1 = tf.Graph()
+with g_1.as_default():
+  # Operations created in this scope will be added to `g_1`.
+  c = tf.constant("Node in g_1")
+
+  # Sessions created in this scope will run operations from `g_1`.
+  sess_1 = tf.Session()
+
+g_2 = tf.Graph()
+with g_2.as_default():
+  # Operations created in this scope will be added to `g_2`.
+  d = tf.constant("Node in g_2")
+
+# Alternatively, you can pass a graph when constructing a `tf.Session`:
+# `sess_2` will run operations from `g_2`.
+sess_2 = tf.Session(graph=g_2)
+
+assert c.graph is g_1
+assert sess_1.graph is g_1
+
+assert d.graph is g_2
+assert sess_2.graph is g_2
+```
+
+To inspect the current default graph, call @{tf.get_default_graph}, which
+returns a @{tf.Graph} object:
+
+```python
+# Print all of the operations in the default graph.
+g = tf.get_default_graph()
+print(g.get_operations())
+```
diff --git a/tensorflow/docs_src/programmers_guide/index.md b/tensorflow/docs_src/programmers_guide/index.md
index 5ed7c09a577d4dc8a773369b9097634eca98fec4..52c2aaae344419c06826bf251caa58d054f19fba 100644
--- a/tensorflow/docs_src/programmers_guide/index.md
+++ b/tensorflow/docs_src/programmers_guide/index.md
@@ -4,13 +4,11 @@ The documents in this unit dive into the details of writing TensorFlow
 code.  This section begins with the following guides, each of which
 explain a particular aspect of TensorFlow:
 
-  * @{$variables$Variables: Creation, Initialization, Saving, and Loading},
-    which details the mechanics of TensorFlow Variables.
+  * @{$variables$Variables: Creation, Initialization, Saving, Loading, and
+     Sharing}, which details the mechanics of TensorFlow Variables.
   * @{$dims_types$Tensor Ranks, Shapes, and Types}, which explains Tensor
     rank (the number of dimensions), shape (the size of each dimension),
     and datatypes.
-  * @{$variable_scope$Sharing Variables}, which explains how to share and
-    manage large sets of variables when building complex models.
   * @{$threading_and_queues$Threading and Queues}, which explains TensorFlow's
     rich queuing system.
   * @{$reading_data$Reading Data}, which documents three different mechanisms
@@ -42,14 +40,10 @@ documented in the following guide:
 
   * @{$saved_model_cli$SavedModel CLI (Command-Line Interface)}.
 
-To learn about the TensorFlow versioning scheme, consult the following two
-guides:
+To learn about the TensorFlow versioning scheme consult:
 
-  * @{$version_semantics$TensorFlow Version Semantics}, which explains
-    TensorFlow's versioning nomenclature and compatibility rules.
-  * @{$data_versions$TensorFlow Data Versioning: GraphDefs and Checkpoints},
-    which explains how TensorFlow adds versioning information to computational
-    graphs and checkpoints in order to support compatibility across versions.
+  * @{$version_compat$The TensorFlow Version Compatibility Guide}, which explains
+TensorFlow's versioning nomenclature and compatibility rules.
 
 We conclude this section with a FAQ about TensorFlow programming:
 
diff --git a/tensorflow/docs_src/programmers_guide/leftnav_files b/tensorflow/docs_src/programmers_guide/leftnav_files
index 322e11cbd697ab427bc4857647234e2a9014ae6a..7f0b9b8db9d3b3f3c872527140d94276b4957bac 100644
--- a/tensorflow/docs_src/programmers_guide/leftnav_files
+++ b/tensorflow/docs_src/programmers_guide/leftnav_files
@@ -1,14 +1,15 @@
 index.md
+tensors.md
 variables.md
 dims_types.md
-variable_scope.md
+graphs.md
+datasets.md
 threading_and_queues.md
 reading_data.md
-supervisor.md
+embedding.md
 debugger.md
-tfdbg-tflearn.md
-meta_graph.md
+supervisor.md
 saved_model_cli.md
-version_semantics.md
-data_versions.md
+meta_graph.md
+version_compat.md
 faq.md
diff --git a/tensorflow/docs_src/programmers_guide/reading_data.md b/tensorflow/docs_src/programmers_guide/reading_data.md
index 3c31d3a1a7065ed04b1eeb20960fd7687374bf28..a7d9372053462bfb37e8570c7d57376d70022bdd 100644
--- a/tensorflow/docs_src/programmers_guide/reading_data.md
+++ b/tensorflow/docs_src/programmers_guide/reading_data.md
@@ -476,4 +476,4 @@ This is what is done in
 
 You can have the train and eval in the same graph in the same process, and share
 their trained variables.  See
-@{$variable_scope$the shared variables tutorial}.
+@{$variables$the shared variables tutorial}.
diff --git a/tensorflow/docs_src/programmers_guide/tensors.md b/tensorflow/docs_src/programmers_guide/tensors.md
new file mode 100644
index 0000000000000000000000000000000000000000..ff747f326f621ad811e8d5ba67d0c624111fa318
--- /dev/null
+++ b/tensorflow/docs_src/programmers_guide/tensors.md
@@ -0,0 +1,330 @@
+# Tensors
+
+TensorFlow, as the name indicates, is a framework to define and run computations
+involving tensors. A **tensor** is a generalization of vectors and matrices to
+potentially higher dimensions. Internally, TensorFlow represents tensors as
+n-dimensional arrays of base datatypes.
+
+When writing a TensorFlow program, the main object you manipulate and pass
+around is the `tf.Tensor`. A `tf.Tensor` object represents a partially defined
+computation that will eventually produce a value. TensorFlow programs work by
+first building a graph of `tf.Tensor` objects, detailing how each tensor is
+computed based on the other available tensors and then by running parts of this
+graph to achieve the desired results.
+
+A `tf.Tensor` has the following properties:
+
+ * a data type (`float32`, `int32`, or `string`, for example)
+ * a shape
+
+
+Each element in the Tensor has the same data type, and the data type is always
+known. The shape (that is, the number of dimensions it has and the size of each
+dimension) might be only partially known. Most operations produce tensors of
+fully-known shapes if the shapes of their inputs are also fully known, but in
+some cases it's only possible to find the shape of a tensor at graph execution
+time.
+
+Some types of tensors are special, and these will be covered in other
+units of the Programmer's guide. The main ones are:
+
+  * `tf.Variable`
+  * `tf.Constant`
+  * `tf.Placeholder`
+  * `tf.SparseTensor`
+
+With the exception of `tf.Variable`, the value of a tensor is immutable, which
+means that in the context of a single execution tensors only have a single
+value. However, evaluating the same tensor twice can return different values;
+for example that tensor can be the result of reading data from disk, or
+generating a random number.
+
+## Rank
+
+The **rank** of a `tf.Tensor` object is its number of dimensions. Synonyms for
+rank include **order** or **degree** or **n-dimension**.
+Note that rank in TensorFlow is not the same as matrix rank in mathematics. 
+As the following table shows, each rank in TensorFlow corresponds to a 
+different mathematical entity:
+
+Rank | Math entity
+--- | ---
+0 | Scalar (magnitude only)
+1 | Vector (magnitude and direction)
+2 | Matrix (table of numbers)
+3 | 3-Tensor (cube of numbers)
+n | n-Tensor (you get the idea)
+
+
+### Rank 0 
+
+The following snippet demonstrates creating a few rank 0 variables:
+
+```python
+mammal = tf.Variable("Elephant", tf.string)
+ignition = tf.Variable(451, tf.int16)
+floating = tf.Variable(3.14159265359, tf.float64)
+its_complicated = tf.Variable((12.3, -4.85), tf.complex64)
+```
+
+Note: A string is treated as a single item in TensorFlow, not as a sequence of
+characters. It is possible to have scalar strings, vectors of strings, etc.
+
+### Rank 1
+
+To create a rank 1 `tf.Tensor` object, you can pass a list of items as the
+initial value. For example:
+
+```python
+mystr = tf.Variable(["Hello"], tf.string)
+cool_numbers  = tf.Variable([3.14159, 2.71828], tf.float32)
+first_primes = tf.Variable([2, 3, 5, 7, 11], tf.int32)
+its_very_complicated = tf.Variable([(12.3, -4.85), (7.5, -6.23)], tf.complex64)
+```
+
+
+### Higher ranks
+
+A rank 2 `tf.Tensor` object consists of at least one row and at least
+one column:
+
+```python
+mymat = tf.Variable([[7],[11]], tf.int16)
+myxor = tf.Variable([[False, True],[True, False]], tf.bool)
+linear_squares = tf.Variable([[4], [9], [16], [25]], tf.int32)
+squarish_squares = tf.Variable([ [4, 9], [16, 25] ], tf.int32)
+rank_of_squares = tf.rank(squarish_squares)
+mymatC = tf.Variable([[7],[11]], tf.int32)
+```
+
+Higher-rank Tensors, similarly, consist of an n-dimensional array. For example,
+during image processing, many tensors of rank 4 are used, with dimensions
+corresponding to example-in-batch, image width, image height, and color channel.
+
+``` python
+my_image = tf.zeros([10, 299, 299, 3])  # batch x height x width x color
+```
+
+### Getting a `tf.Tensor` object's rank
+
+To determine the rank of a `tf.Tensor` object, call the `tf.rank` method.
+For example, the following method programmatically determines the rank 
+of the `tf.Tensor` defined in the previous section:
+
+```python
+r = tf.rank(my3d)
+# After the graph runs, r will hold the value 3.
+```
+
+### Referring to `tf.Tensor` slices
+
+Since a `tf.Tensor` is an n-dimensional array of cells, to access a single cell
+in a `tf.Tensor` you need to specify n indices.
+
+For a rank 0 tensor (a scalar), no indices are necessary, since it is already a
+single number.
+
+For a rank 1 tensor (a vector), passing a single index allows you to access a
+number:
+
+```python
+my_scalar = my_vector[2]
+```
+
+Note that the index passed inside the `[]` can itself be a scalar `tf.Tensor`, if
+you want to dynamically choose an element from the vector.
+
+For tensors of rank 2 or higher, the situation is more interesting. For a
+`tf.Tensor` of rank 2, passing two numbers returns a scalar, as expected:
+
+
+```python
+my_scalar = my_matrix[1, 2]
+```
+
+
+Passing a single number, however, returns a subvector of a matrix, as follows:
+
+
+```python
+my_row_vetor = my_matrix[2]
+my_column_vector = my_matrix[:, 3]
+```
+
+The `:` notation is python slicing syntax for "leave this dimension alone". This
+is useful in higher-rank Tensors, as it allows you to access its subvectors,
+submatrices, and even other subtensors.
+
+
+## Shape
+
+The **shape** of a tensor is the number of elements in each dimension.
+TensorFlow automatically infers shapes during graph construction. These inferred
+shapes might have known or unknown rank. If the rank is known, the sizes of each
+dimension might be known or unknown.
+
+The TensorFlow documentation uses three notational conventions to describe
+tensor dimensionality: rank, shape, and dimension number. The following table
+shows how these relate to one another:
+
+Rank | Shape | Dimension number | Example
+--- | --- | --- | ---
+0 | [] | 0-D | A 0-D tensor.  A scalar.
+1 | [D0] | 1-D | A 1-D tensor with shape [5].
+2 | [D0, D1] | 2-D | A 2-D tensor with shape [3, 4].
+3 | [D0, D1, D2] | 3-D | A 3-D tensor with shape [1, 4, 3].
+n | [D0, D1, ... Dn-1] | n-D | A tensor with shape [D0, D1, ... Dn-1].
+
+Shapes can be represented via Python lists / tuples of ints, or with the
+@{tf.TensorShape}.
+
+### Getting a `tf.Tensor` object's shape
+
+There are two ways of accessing the shape of a `tf.Tensor`. While building the
+graph, it is often useful to ask what is already known about a tensor's
+shape. This can be done by reading the `shape` property of a `tf.Tensor` object.
+This method returns a `TensorShape` object, which is a convenient way of
+representing partially-specified shapes (since, when building the graph, not all
+shapes will be fully known).
+
+It is also possible to get a `tf.Tensor` that will represent the fully-defined
+shape of another `tf.Tensor` at runtime. This is done by calling the `tf.shape`
+operation. This way, you can build a graph that manipulates the shapes of
+tensors by building other tensors that depend on the dynamic shape of the input
+`tf.Tensor`.
+
+For example, here is how to make a vector of zeros with the same size as the
+number of columns in a given matrix:
+
+``` python
+zeros = tf.zeros(tf.shape(my_matrix)[1])
+```
+
+### Changing the shape of a `tf.Tensor`
+
+The **number of elements** of a tensor is the product of the sizes of all its
+shapes. The number of elements of a scalar is always `1`. Since there are often
+many different shapes that have the same number of elements, it's often
+convenient to be able to change the shape of a `tf.Tensor`, keeping its elements
+fixed. This can be done with `tf.reshape`.
+
+The following examples demonstrate how to reshape tensors:
+
+```python
+rank_three_tensor = tf.ones([3, 4, 5])
+matrix = tf.reshape(rank_three_tensor, [6, 10])  # Reshape existing content into
+                                                 # a 6x10 matrix
+matrixB = tf.reshape(matrix, [3, -1])  #  Reshape existing content into a 3x20
+                                       # matrix. -1 tells reshape to calculate
+                                       # the size of this dimension.
+matrixAlt = tf.reshape(matrixB, [4, 3, -1])  # Reshape existing content into a
+                                             #4x3x5 tensor
+
+# Note that the number of elements of the reshaped Tensors has to match the
+# original number of elements. Therefore, the following example generates an
+# error because no possible value for the last dimension will match the number
+# of elements.
+yet_another = tf.reshape(matrixAlt, [13, 2, -1])  # ERROR!
+```
+
+## Data types
+
+In addition to dimensionality, Tensors have a data type. Refer to the
+`tf.DataType` page in the programmer's guide for a full list of the data types.
+
+It is not possible to have a `tf.Tensor` with more than one data type. It is
+possible, however, to serialize arbitrary data structures as `string`s and store
+those in `tf.Tensor`s.
+
+It is possible to cast `tf.Tensor`s from one datatype to another using
+`tf.cast`:
+
+``` python
+# Cast a constant integer tensor into floating point.
+float_tensor = tf.cast(tf.constant([1, 2, 3]), dtype=tf.float32)
+```
+
+To inspect a `tf.Tensor`'s data type use the `Tensor.dtype` property.
+
+When creating a `tf.Tensor` from a python object you may optionally specify the
+datatype. If you don't, TensorFlow chooses a datatype that can represent your
+data. TensorFlow converts Python integers to `tf.int32` and python floating
+point numbers to `tf.float32`. Otherwise TensorFlow uses the same rules numpy
+uses when converting to arrays.
+
+## Evaluating Tensors
+
+Once the computation graph has been built, you can run the computation that
+produces a particular `tf.Tensor` and fetch the value assigned to it. This is
+often useful for debugging as well as being required for much of TensorFlow to
+work.
+
+The simplest way to evaluate a Tensor is using the `Tensor.eval` method. For
+example:
+
+```python
+constant = tf.constant([1, 2, 3])
+tensor = constant * constant
+print tensor.eval()
+```
+
+The `eval` method only works when a default `tf.Session` is active (see
+Graphs and Sessions for more information).
+
+`Tensor.eval` returns a numpy array with the same contents as the tensor.
+
+Sometimes it is not possible to evaluate a `tf.Tensor` with no context because
+its value might depend on dynamic information that is not available. For
+example, tensors that depend on `Placeholder`s can't be evaluated without
+providing a value for the `Placeholder`.
+
+``` python
+p = tf.placeholder(tf.float32)
+t = p + 1.0
+t.eval()  # This will fail, since the placeholder did not get a value.
+t.eval(feed_dict={p:2.0})  # This will succeed because we're feeding a value
+                           # to the placeholder.
+```
+
+Note that it is possible to feed any `tf.Tensor`, not just placeholders.
+
+Other model constructs might make evaluating a `tf.Tensor`
+complicated. TensorFlow can't directly evaluate `tf.Tensor`s defined inside
+functions or inside control flow constructs. If a `tf.Tensor` depends on a value
+from a queue, evaluating the `tf.Tensor` will only work once something has been
+enqueued; otherwise, evaluating it will hang. When working with queues, remember
+to call `tf.train.start_queue_runners` before evaluating any `tf.Tensor`s.
+
+## Printing Tensors
+
+For debugging purposes you might want to print the value of a `tf.Tensor`. While
+ @{$debugger$tfdbg} provides advanced debugging support, TensorFlow also has an
+ operation to directly print the value of a `tf.Tensor`.
+
+Note that you rarely want to use the following pattern when printing a
+`tf.Tensor`:
+
+``` python
+t = <<some tensorflow operation>>
+print t  # This will print the symbolic tensor when the graph is being built.
+         # This tensor does not have a value in this context.
+```
+
+This code prints the `tf.Tensor` object (which represents deferred computation)
+and not its value. Instead, TensorFlow provides the `tf.Print` operation, which
+returns its first tensor argument unchanged while printing the set of
+`tf.Tensor`s it is passed as the second argument.
+
+To correctly use `tf.Print` its return value must be used. See the example below
+
+``` python
+t = <<some tensorflow operation>>
+tf.Print(t, [t])  # This does nothing
+t = tf.Print(t, [t])  # Here we are using the value returned by tf.Print
+result = t + 1  # Now when result is evaluated the value of `t` will be printed.
+```
+
+When you evaluate `result` you will evaluate everything `result` depends
+upon. Since `result` depends upon `t`, and evaluating `t` has the side effect of
+printing its input (the old value of `t`), `t` gets printed.
+
diff --git a/tensorflow/docs_src/programmers_guide/threading_and_queues.md b/tensorflow/docs_src/programmers_guide/threading_and_queues.md
index 7d3edb788e0ee36e015fcac5ba2634617594c2b0..3483c7533cb330316bf7f7187a798d8259bb1fa3 100644
--- a/tensorflow/docs_src/programmers_guide/threading_and_queues.md
+++ b/tensorflow/docs_src/programmers_guide/threading_and_queues.md
@@ -1,28 +1,44 @@
 # Threading and Queues
 
-Queues are a powerful mechanism for asynchronous computation using TensorFlow.
-
-Like everything in TensorFlow, a queue is a node in a TensorFlow graph. It's a
-stateful node, like a variable: other nodes can modify its content. In
-particular, nodes can enqueue new items in to the queue, or dequeue existing
-items from the queue.
-
-To get a feel for queues, let's consider a simple example. We will create a
-"first in, first out" queue (`FIFOQueue`) and fill it with zeros.
-Then we'll construct a graph
-that takes an item off the queue, adds one to that item, and puts it back on the
-end of the queue. Slowly, the numbers on the queue increase.
+Note: In versions of TensorFlow before 1.2, we recommended using multi-threaded,
+queue-based input pipelines for performance. Beginning with TensorFlow 1.2,
+however, we recommend using the `tf.contrib.data` module instead. (See
+[Datasets](datasets) for details.) The `tf.contrib.data` module offers an
+easier-to-use interface for constructing efficient input pipelines. Furthermore,
+we've stopped developing the old multi-threaded, queue-based input pipelines.
+We've retained the documentation in this file to help developers who are still
+maintaining older code.
+
+Multithreaded queues are a powerful and widely used mechanism supporting
+asynchronous computation.
+
+Following the [dataflow programming model](graphs.md), TensorFlow's queues are
+implemented using nodes in the computation graph.  A queue is a stateful node,
+like a variable: other nodes can modify its content. In particular, nodes can
+enqueue new items in to the queue, or dequeue existing items from the
+queue. TensorFlow's queues provide a way to coordinate multiple steps of a
+computation: a queue will **block** any step that attempts to dequeue from it
+when it is empty, or enqueue to it when it is full. When that condition no
+longer holds, the queue will unblock the step and allow execution to proceed.
+
+TensorFlow implements several classes of queue. The principal difference between
+these classes is the order that items are removed from the queue.  To get a feel
+for queues, let's consider a simple example. We will create a "first in, first
+out" queue (@{tf.FIFOQueue}) and fill it with zeros.  Then we'll construct a
+graph that takes an item off the queue, adds one to that item, and puts it back
+on the end of the queue. Slowly, the numbers on the queue increase.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/IncremeterFifoQueue.gif">
 </div>
 
 `Enqueue`, `EnqueueMany`, and `Dequeue` are special nodes. They take a pointer
-to the queue instead of a normal value, allowing them to change it. We recommend
-you think of these as being like methods of the queue. In fact, in the Python
-API, they are methods of the queue object (e.g. `q.enqueue(...)`).
+to the queue instead of a normal value, allowing them to mutate its state. We
+recommend that you think of these operations as being like methods of the queue
+in an object-oriented sense. In fact, in the Python API, these operations are
+created by calling methods on a queue object (e.g. `q.enqueue(...)`).
 
-**N.B.** Queue methods (such as `q.enqueue(...)`) *must* run on the same device
+Note: Queue methods (such as `q.enqueue(...)`) *must* run on the same device
 as the queue. Incompatible device placement directives will be ignored when
 creating these operations.
 
@@ -32,13 +48,13 @@ Now that you have a bit of a feel for queues, let's dive into the details...
 
 Queues, such as @{tf.FIFOQueue}
 and @{tf.RandomShuffleQueue},
-are important TensorFlow objects for computing tensors asynchronously in a
-graph.
+are important TensorFlow objects that aid in computing tensors asynchronously
+in a graph.
 
-For example, a typical input architecture is to use a `RandomShuffleQueue` to
+For example, a typical input pipeline uses a `RandomShuffleQueue` to
 prepare inputs for training a model:
 
-* Multiple threads prepare training examples and push them in the queue.
+* Multiple threads prepare training examples and enqueue them in the queue.
 * A training thread executes a training op that dequeues mini-batches from the
   queue
 
@@ -46,7 +62,8 @@ This architecture has many benefits, as highlighted in the
 @{$reading_data$Reading data how to}, which also gives an overview of
 functions that simplify the construction of input pipelines.
 
-The TensorFlow `Session` object is multithreaded, so multiple threads can
+The TensorFlow `Session` object is multithreaded and thread-safe, so multiple
+threads can
 easily use the same session and run ops in parallel.  However, it is not always
 easy to implement a Python program that drives threads as described above.  All
 threads must be able to stop together, exceptions must be caught and
@@ -62,11 +79,12 @@ enqueue tensors in the same queue.
 
 ## Coordinator
 
-The `Coordinator` class helps multiple threads stop together.
+The @{tf.train.Coordinator} class manages background threads in a TensorFlow
+program and helps multiple threads stop together.
 
 Its key methods are:
 
-* @{tf.train.Coordinator.should_stop}: returns True if the threads should stop.
+* @{tf.train.Coordinator.should_stop}: returns `True` if the threads should stop.
 * @{tf.train.Coordinator.request_stop}: requests that threads should stop.
 * @{tf.train.Coordinator.join}: waits until the specified threads have stopped.
 
@@ -79,6 +97,9 @@ Any thread can decide that the computation should stop.  It only has to call
 return `True`.
 
 ```python
+# Using Python's threading library.
+import threading
+
 # Thread body: loop until the coordinator indicates a stop was requested.
 # If some condition becomes true, ask the coordinator to stop.
 def MyLoop(coord):
@@ -105,10 +126,10 @@ also has support to capture and report exceptions.  See the @{tf.train.Coordinat
 
 ## QueueRunner
 
-The `QueueRunner` class creates a number of threads that repeatedly run an
-enqueue op.  These threads can use a coordinator to stop together.  In
-addition, a queue runner runs a *closer thread* that automatically closes the
-queue if an exception is reported to the coordinator.
+The @{tf.train.QueueRunner} class creates a number of threads that repeatedly
+run an enqueue op.  These threads can use a coordinator to stop together.  In
+addition, a queue runner will run a *closer operation* that closes the queue if
+an exception is reported to the coordinator.
 
 You can use a queue runner to implement the architecture described above.
 
diff --git a/tensorflow/docs_src/programmers_guide/variable_scope.md b/tensorflow/docs_src/programmers_guide/variable_scope.md
deleted file mode 100644
index f4d2b3f37b875f589e2de69d8681a09e90f99360..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/programmers_guide/variable_scope.md
+++ /dev/null
@@ -1,373 +0,0 @@
-# Sharing Variables
-
-You can create, initialize, save and load single variables
-in the way described in the @{$variables$Variables HowTo}.
-But when building complex models you often need to share large sets of
-variables and you might want to initialize all of them in one place.
-This tutorial shows how this can be done using `tf.variable_scope()` and
-`tf.get_variable()`.
-
-## The Problem
-
-Imagine you create a simple model for image filters, similar to our
-@{$deep_cnn$Convolutional Neural Networks Tutorial}
-model but with only 2 convolutions (for simplicity of this example). If you use
-just `tf.Variable`, as explained in @{$variables$Variables HowTo},
-your model might look like this.
-
-```python
-def my_image_filter(input_images):
-    conv1_weights = tf.Variable(tf.random_normal([5, 5, 32, 32]),
-        name="conv1_weights")
-    conv1_biases = tf.Variable(tf.zeros([32]), name="conv1_biases")
-    conv1 = tf.nn.conv2d(input_images, conv1_weights,
-        strides=[1, 1, 1, 1], padding='SAME')
-    relu1 = tf.nn.relu(conv1 + conv1_biases)
-
-    conv2_weights = tf.Variable(tf.random_normal([5, 5, 32, 32]),
-        name="conv2_weights")
-    conv2_biases = tf.Variable(tf.zeros([32]), name="conv2_biases")
-    conv2 = tf.nn.conv2d(relu1, conv2_weights,
-        strides=[1, 1, 1, 1], padding='SAME')
-    return tf.nn.relu(conv2 + conv2_biases)
-```
-
-As you can easily imagine, models quickly get much more complicated than
-this one, and even here we already have 4 different variables: `conv1_weights`,
-`conv1_biases`, `conv2_weights`, and `conv2_biases`.
-
-The problem arises when you want to reuse this model. Assume you want to
-apply your image filter to 2 different images, `image1` and `image2`.
-You want both images processed by the same filter with the same parameters.
-You can call `my_image_filter()` twice, but this will create two sets
-of variables, 4 variables in each one, for a total of 8 variables.
-
-```python
-# First call creates one set of 4 variables.
-result1 = my_image_filter(image1)
-# Another set of 4 variables is created in the second call.
-result2 = my_image_filter(image2)
-```
-
-A common way to share variables is to create them in a separate piece of code
-and pass them to functions that use them.   For example by using a dictionary:
-
-```python
-variables_dict = {
-    "conv1_weights": tf.Variable(tf.random_normal([5, 5, 32, 32]),
-        name="conv1_weights")
-    "conv1_biases": tf.Variable(tf.zeros([32]), name="conv1_biases")
-    ... etc. ...
-}
-
-def my_image_filter(input_images, variables_dict):
-    conv1 = tf.nn.conv2d(input_images, variables_dict["conv1_weights"],
-        strides=[1, 1, 1, 1], padding='SAME')
-    relu1 = tf.nn.relu(conv1 + variables_dict["conv1_biases"])
-
-    conv2 = tf.nn.conv2d(relu1, variables_dict["conv2_weights"],
-        strides=[1, 1, 1, 1], padding='SAME')
-    return tf.nn.relu(conv2 + variables_dict["conv2_biases"])
-
-# Both calls to my_image_filter() now use the same variables
-result1 = my_image_filter(image1, variables_dict)
-result2 = my_image_filter(image2, variables_dict)
-```
-
-While convenient, creating variables like above,
-outside of the code, breaks encapsulation:
-
-*  The code that builds the graph must document the names, types,
-   and shapes of variables to create.
-*  When the code changes, the callers may have to create more, or less,
-   or different variables.
-
-One way to address the problem is to use classes to create a model,
-where the classes take care of managing the variables they need.
-For a lighter solution, not involving classes, TensorFlow provides
-a *Variable Scope* mechanism that allows to easily share named variables
-while constructing a graph.
-
-## Variable Scope Example
-
-Variable Scope mechanism in TensorFlow consists of two main functions:
-
-* `tf.get_variable(<name>, <shape>, <initializer>)`:
-  Creates or returns a variable with a given name.
-* `tf.variable_scope(<scope_name>)`:
-  Manages namespaces for names passed to `tf.get_variable()`.
-
-The function `tf.get_variable()` is used to get or create a variable instead
-of a direct call to `tf.Variable`. It uses an *initializer* instead of passing
-the value directly, as in `tf.Variable`. An initializer is a function that
-takes the shape and provides a tensor with that shape. Here are some
-initializers available in TensorFlow:
-
-* `tf.constant_initializer(value)` initializes everything to the provided value,
-* `tf.random_uniform_initializer(a, b)` initializes uniformly from [a, b],
-* `tf.random_normal_initializer(mean, stddev)` initializes from the normal
-  distribution with the given mean and standard deviation.
-
-To see how `tf.get_variable()` solves the problem discussed
-before, let's refactor the code that created one convolution into
-a separate function, named `conv_relu`:
-
-```python
-def conv_relu(input, kernel_shape, bias_shape):
-    # Create variable named "weights".
-    weights = tf.get_variable("weights", kernel_shape,
-        initializer=tf.random_normal_initializer())
-    # Create variable named "biases".
-    biases = tf.get_variable("biases", bias_shape,
-        initializer=tf.constant_initializer(0.0))
-    conv = tf.nn.conv2d(input, weights,
-        strides=[1, 1, 1, 1], padding='SAME')
-    return tf.nn.relu(conv + biases)
-```
-
-This function uses short names `"weights"` and `"biases"`.
-We'd like to use it for both `conv1` and `conv2`, but
-the variables need to have different names.
-This is where `tf.variable_scope()` comes into play:
-it pushes a namespace for variables.
-
-```python
-def my_image_filter(input_images):
-    with tf.variable_scope("conv1"):
-        # Variables created here will be named "conv1/weights", "conv1/biases".
-        relu1 = conv_relu(input_images, [5, 5, 32, 32], [32])
-    with tf.variable_scope("conv2"):
-        # Variables created here will be named "conv2/weights", "conv2/biases".
-        return conv_relu(relu1, [5, 5, 32, 32], [32])
-```
-
-Now, let's see what happens when we call `my_image_filter()` twice.
-
-```
-result1 = my_image_filter(image1)
-result2 = my_image_filter(image2)
-# Raises ValueError(... conv1/weights already exists ...)
-```
-
-As you can see, `tf.get_variable()` checks that already existing variables
-are not shared by accident. If you want to share them, you need to specify
-it by setting `reuse_variables()` as follows.
-
-```
-with tf.variable_scope("image_filters") as scope:
-    result1 = my_image_filter(image1)
-    scope.reuse_variables()
-    result2 = my_image_filter(image2)
-```
-
-This is a good way to share variables, lightweight and safe.
-
-## How Does Variable Scope Work?
-
-### Understanding `tf.get_variable()`
-
-To understand variable scope it is necessary to first
-fully understand how `tf.get_variable()` works.
-Here is how `tf.get_variable` is usually called.
-
-```python
-v = tf.get_variable(name, shape, dtype, initializer)
-```
-
-This call does one of two things depending on the scope it is called in.
-Here are the two options.
-
-* Case 1: the scope is set for creating new variables, as evidenced by
-`tf.get_variable_scope().reuse == False`.
-
-In this case, `v` will be a newly created `tf.Variable` with the provided
-shape and data type. The full name of the created variable will be set to
-the current variable scope name + the provided `name` and a check will be
-performed to ensure that no variable with this full name exists yet.
-If a variable with this full name already exists, the function will
-raise a `ValueError`. If a new variable is created, it will be
-initialized to the value `initializer(shape)`. For example:
-
-```python
-with tf.variable_scope("foo"):
-    v = tf.get_variable("v", [1])
-assert v.name == "foo/v:0"
-```
-
-* Case 2: the scope is set for reusing variables, as evidenced by
-`tf.get_variable_scope().reuse == True`.
-
-In this case, the call will search for an already existing variable with
-name equal to the current variable scope name + the provided `name`.
-If no such variable exists, a `ValueError` will be raised. If the variable
-is found, it will be returned. For example:
-
-```python
-with tf.variable_scope("foo"):
-    v = tf.get_variable("v", [1])
-with tf.variable_scope("foo", reuse=True):
-    v1 = tf.get_variable("v", [1])
-assert v1 is v
-```
-
-### Basics of `tf.variable_scope()`
-
-Knowing how `tf.get_variable()` works makes it easy to understand variable
-scope. The primary function of variable scope is to carry a name that will
-be used as prefix for variable names and a reuse-flag to distinguish the two
-cases described above. Nesting variable scopes appends their names in a way
-analogous to how directories work:
-
-```python
-with tf.variable_scope("foo"):
-    with tf.variable_scope("bar"):
-        v = tf.get_variable("v", [1])
-assert v.name == "foo/bar/v:0"
-```
-
-The current variable scope can be retrieved using `tf.get_variable_scope()`
-and the `reuse` flag of the current variable scope can be set to `True` by
-calling `tf.get_variable_scope().reuse_variables()`:
-
-```python
-with tf.variable_scope("foo"):
-    v = tf.get_variable("v", [1])
-    tf.get_variable_scope().reuse_variables()
-    v1 = tf.get_variable("v", [1])
-assert v1 is v
-```
-
-Note that you *cannot* set the `reuse` flag to `False`. The reason behind
-this is to allow to compose functions that create models. Imagine you write
-a function `my_image_filter(inputs)` as before. Someone calling the function
-in a variable scope with `reuse=True` would expect all inner variables to be
-reused as well. Allowing to force `reuse=False` inside the function would break
-this contract and make it hard to share parameters in this way.
-
-Even though you cannot set `reuse` to `False` explicitly, you can enter
-a reusing variable scope and then exit it, going back to a non-reusing one.
-This can be done using a `reuse=True` parameter when opening a variable scope.
-Note also that, for the same reason as above, the `reuse` parameter is
-inherited. So when you open a reusing variable scope, all sub-scopes will
-be reusing too.
-
-```python
-with tf.variable_scope("root"):
-    # At start, the scope is not reusing.
-    assert tf.get_variable_scope().reuse == False
-    with tf.variable_scope("foo"):
-        # Opened a sub-scope, still not reusing.
-        assert tf.get_variable_scope().reuse == False
-    with tf.variable_scope("foo", reuse=True):
-        # Explicitly opened a reusing scope.
-        assert tf.get_variable_scope().reuse == True
-        with tf.variable_scope("bar"):
-            # Now sub-scope inherits the reuse flag.
-            assert tf.get_variable_scope().reuse == True
-    # Exited the reusing scope, back to a non-reusing one.
-    assert tf.get_variable_scope().reuse == False
-```
-
-### Capturing variable scope
-
-In all examples presented above, we shared parameters only because their
-names agreed, that is, because we opened a reusing variable scope with
-exactly the same string. In more complex cases, it might be useful to pass
-a VariableScope object rather than rely on getting the names right.
-To this end, variable scopes can be captured and used instead of names
-when opening a new variable scope.
-
-```python
-with tf.variable_scope("foo") as foo_scope:
-    v = tf.get_variable("v", [1])
-with tf.variable_scope(foo_scope):
-    w = tf.get_variable("w", [1])
-with tf.variable_scope(foo_scope, reuse=True):
-    v1 = tf.get_variable("v", [1])
-    w1 = tf.get_variable("w", [1])
-assert v1 is v
-assert w1 is w
-```
-
-When opening a variable scope using a previously existing scope
-we jump out of the current variable scope prefix to an entirely
-different one. This is fully independent of where we do it.
-
-```python
-with tf.variable_scope("foo") as foo_scope:
-    assert foo_scope.name == "foo"
-with tf.variable_scope("bar"):
-    with tf.variable_scope("baz") as other_scope:
-        assert other_scope.name == "bar/baz"
-        with tf.variable_scope(foo_scope) as foo_scope2:
-            assert foo_scope2.name == "foo"  # Not changed.
-```
-
-### Initializers in variable scope
-
-Using `tf.get_variable()` allows to write functions that create or reuse
-variables and can be transparently called from outside. But what if we wanted
-to change the initializer of the created variables? Do we need to pass an extra
-argument to every function that creates variables? What about the most common
-case, when we want to set the default initializer for all variables in one
-place, on top of all functions? To help with these cases, variable scope
-can carry a default initializer. It is inherited by sub-scopes and passed
-to each `tf.get_variable()` call. But it will be overridden if another
-initializer is specified explicitly.
-
-```python
-with tf.variable_scope("foo", initializer=tf.constant_initializer(0.4)):
-    v = tf.get_variable("v", [1])
-    assert v.eval() == 0.4  # Default initializer as set above.
-    w = tf.get_variable("w", [1], initializer=tf.constant_initializer(0.3)):
-    assert w.eval() == 0.3  # Specific initializer overrides the default.
-    with tf.variable_scope("bar"):
-        v = tf.get_variable("v", [1])
-        assert v.eval() == 0.4  # Inherited default initializer.
-    with tf.variable_scope("baz", initializer=tf.constant_initializer(0.2)):
-        v = tf.get_variable("v", [1])
-        assert v.eval() == 0.2  # Changed default initializer.
-```
-
-### Names of ops in `tf.variable_scope()`
-
-We discussed how `tf.variable_scope` governs the names of variables.
-But how does it influence the names of other ops in the scope?
-It is natural that ops created inside a variable scope should also
-share that name. For this reason, when we do `with tf.variable_scope("name")`,
-this implicitly opens a `tf.name_scope("name")`. For example:
-
-```python
-with tf.variable_scope("foo"):
-    x = 1.0 + tf.get_variable("v", [1])
-assert x.op.name == "foo/add"
-```
-
-Name scopes can be opened in addition to a variable scope, and then
-they will only affect the names of the ops, but not of variables.
-
-```python
-with tf.variable_scope("foo"):
-    with tf.name_scope("bar"):
-        v = tf.get_variable("v", [1])
-        x = 1.0 + v
-assert v.name == "foo/v:0"
-assert x.op.name == "foo/bar/add"
-```
-
-When opening a variable scope using a captured object instead of a string,
-we do not alter the current name scope for ops.
-
-
-## Examples of Use
-
-Here are pointers to a few files that make use of variable scope. They can all
-be found in the [TensorFlow models repo](https://github.com/tensorflow/models).
-In particular, variable scope is heavily used for recurrent neural networks and
-sequence-to-sequence models.
-
-File | What's in it?
---- | ---
-`tutorials/image/cifar10/cifar10.py` | Model for detecting objects in images.
-`tutorials/rnn/rnn_cell.py` | Cell functions for recurrent neural networks.
-`tutorials/rnn/seq2seq.py` | Functions for building sequence-to-sequence models.
diff --git a/tensorflow/docs_src/programmers_guide/variables.md b/tensorflow/docs_src/programmers_guide/variables.md
index e8d1e519f0b8fd05039b107a5501ea0da7cc29a6..2f8e7eef5ed5973990fae2d06b7efd296abdfb60 100644
--- a/tensorflow/docs_src/programmers_guide/variables.md
+++ b/tensorflow/docs_src/programmers_guide/variables.md
@@ -1,190 +1,283 @@
-# Variables: Creation, Initialization, Saving, and Loading
+# Variables
 
-When you train a model, you use @{$python/state_ops$variables}
-to hold and update parameters.  Variables are in-memory buffers containing
-tensors.  They must be explicitly initialized and can be saved to disk during
-and after training. You can later restore saved values to exercise or analyze
-the model.
+A TensorFlow **variable** is the best way to represent shared, persistent state
+manipulated by your program.
 
-This document references the following TensorFlow classes.  Follow the links to
-their reference manual for a complete description of their API:
+Variables are manipulated via the `tf.Variable` class. A `tf.Variable`
+represents a tensor whose value can be changed by running ops on it. Unlike
+`tf.Tensor` objects, a `tf.Variable` exists outside the context of a single
+`session.run` call.
 
-*  The @{tf.Variable} class.
-*  The @{tf.train.Saver} class.
+Internally, a `tf.Variable` stores a persistent tensor. Specific ops allow you
+to read and modify the values of this tensor. These modifications are visible
+across multiple `tf.Session`s, so multiple workers can see the same values for a
+`tf.Variable`.
 
+## Creating a Variable
 
-## Creation
+The best way to create a variable is to call the `tf.get_variable`
+function. This function requires you to specify the Variable's name. This name
+will be used by other replicas to access the same variable, as well as to name
+this variable's value when checkpointing and exporting models. `tf.get_variable`
+also allows you to reuse a previously created variable of the same name, making it
+easy to define models which reuse layers.
 
-When you create a @{$python/state_ops$Variable} you pass a
-`Tensor` as its initial value to the `Variable()` constructor.  TensorFlow
-provides a collection of ops that produce tensors often used for initialization
-from @{$python/constant_op$constants or random values}.
+To create a variable with `tf.get_variable`, simply provide the name and shape
 
-Note that all these ops require you to specify the shape of the tensors.  That
-shape automatically becomes the shape of the variable.  Variables generally
-have a fixed shape, but TensorFlow provides advanced mechanisms to reshape
-variables.
+``` python
+my_variable = tf.get_variable("my_variable", [1, 2, 3])
+```
 
-```python
-# Create two variables.
-weights = tf.Variable(tf.random_normal([784, 200], stddev=0.35),
-                      name="weights")
-biases = tf.Variable(tf.zeros([200]), name="biases")
+This creates a variable named "my_variable" which is a three-dimensional tensor
+with shape `[1, 2, 3]`. This variable will, by default, have the `dtype`
+`tf.float32` and its initial value will be randomized via
+`tf.glorot_uniform_initializer`.
+
+You may optionally specify the `dtype` and initializer to `tf.get_variable`. For
+example:
+
+``` python
+my_int_variable = tf.get_variable("my_int_variable", [1, 2, 3], dtype=tf.int32, 
+  initializer=tf.zeros_initializer)
 ```
 
-Calling `tf.Variable()` adds several ops to the graph:
+TensorFlow provides many convenient initializers. Alternatively, you may
+initialize a `tf.Variable` to have the value of a `tf.Tensor`. For example:
 
-*  A `variable` op that holds the variable value.
-*  An initializer op that sets the variable to its initial value.  This is
-   actually a `tf.assign` op.
-*  The ops for the initial value, such as the `zeros` op for the `biases`
-   variable in the example are also added to the graph.
+``` python
+other_variable = tf.get_variable("other_variable", dtype=tf.int32, 
+  initializer=tf.constant([23, 42]))
+```
 
-The value returned by `tf.Variable()` value is an instance of the Python class
-`tf.Variable`.
+Note that when the initializer is a `tf.Tensor` you should not specify the
+variable's shape, as the shape of the initializer tensor will be used.
+
+### Variable collections
+
+Because disconnected parts of a TensorFlow program might want to create
+variables, it is sometimes useful to have a single way to access all of
+them. For this reason TensorFlow provides **collections**, which are named lists
+of tensors or other objects, such as `tf.Variable` instances.
+
+By default every `tf.Variable` gets placed in the following two collections:
+ * `tf.GraphKeys.GLOBAL_VARIABLES` --- variables that can be shared across
+multiple devices,
+ * `tf.GraphKeys.TRAINABLE_VARIABLES`--- variables for which TensorFlow will
+   calculate gradients.
+ 
+If you don't want a variable to be trainable, add it to the
+`tf.GraphKeys.LOCAL_VARIABLES` collection instead. For example, the following
+snippet demonstrates how to add a variable named `my_local` to this collection:
+
+``` python
+my_local = tf.get_variable("my_local", shape=(), 
+collections=[tf.GraphKeys.LOCAL_VARIABLES])
+```
 
-### Device placement
+Alternatively, you can specify `trainable=False` as an argument to
+`tf.get_variable`:
 
-A variable can be pinned to a particular device when it is created, using a
-@{tf.device$`with tf.device(...):`} block:
+``` python
+my_non_trainable = tf.get_variable("my_non_trainable", 
+                                   shape=(), 
+                                   trainable=False)
+```
 
-```python
-# Pin a variable to CPU.
-with tf.device("/cpu:0"):
-  v = tf.Variable(...)
 
-# Pin a variable to GPU.
-with tf.device("/gpu:0"):
-  v = tf.Variable(...)
+You can also use your own collections. Any string is a valid collection name,
+and there is no need to explicitly create a collection. To add a variable (or
+any other object) to a collection after creating the variable, call
+`tf.add_to_collection`.  For example, the following code adds an existing
+variable named `my_local` to a collection named `my_collection_name`:
 
-# Pin a variable to a particular parameter server task.
-with tf.device("/job:ps/task:7"):
-  v = tf.Variable(...)
+``` python
+tf.add_to_collection("my_collection_name", my_local)
 ```
 
-**NOTE** Operations that mutate a variable, such as
-@{tf.Variable.assign} and the parameter
-update operations in a
-@{tf.train.Optimizer} *must* run on
-the same device as the variable. Incompatible device placement directives will
-be ignored when creating these operations.
+And to retrieve a list of all the variables (or other objects) you've placed in
+a collection you can use:
 
-Device placement is particularly important when running in a replicated
-setting. See
-@{tf.train.replica_device_setter}
-for details of a device function that can simplify the configuration for devices
-for a replicated model.
+``` python
+tf.get_collection("my_collection_name")
+```
 
-## Initialization
+### Device placement
 
-Variable initializers must be run explicitly before other ops in your model can
-be run.  The easiest way to do that is to add an op that runs all the variable
-initializers, and run that op before using the model.
+Just like any other TensorFlow operation, you can place variables on particular
+devices. For example, the following snippet creates a variable named `v` and
+places it on the second GPU device:
 
-You can alternatively restore variable values from a checkpoint file, see
-below.
+``` python
+with tf.device("/gpu:1"):
+  v = tf.get_variable("v", [1])
+```
 
-Use `tf.global_variables_initializer()` to add an op to run variable initializers.
-Only run that op after you have fully constructed your model and launched it in
-a session.
+It is particularly important for variables to be in the correct device in
+distributed settings. Accidentally putting variables on workers instead of
+parameter servers, for example, can severely slow down training or, in the worst
+case, let each worker blithely forge ahead with its own independent copy of each
+variable. For this reason we provide @{tf.train.replica_device_setter}, which
+can automatically place variables in parameter servers. For example:
+
+``` python
+cluster_spec = {
+    "ps": ["ps0:2222", "ps1:2222"],
+    "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]}
+with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
+  v = tf.get_variable("v", shape=[20, 20])  # this variable is placed 
+                                            # in the parameter server
+                                            # by the replica_device_setter
+```
 
-```python
-# Create two variables.
-weights = tf.Variable(tf.random_normal([784, 200], stddev=0.35),
-                      name="weights")
-biases = tf.Variable(tf.zeros([200]), name="biases")
-...
-# Add an op to initialize the variables.
-init_op = tf.global_variables_initializer()
+## Initializing variables
 
-# Later, when launching the model
-with tf.Session() as sess:
-  # Run the init operation.
-  sess.run(init_op)
-  ...
-  # Use the model
-  ...
+Before you can use a variable, it must be initialized. If you are programming in
+the low-level TensorFlow API (that is, you are explicitly creating your own
+graphs and sessions), you must explicitly initialize the variables.  Most
+high-level frameworks such as `tf.contrib.slim`, `tf.estimator.Estimator` and
+`Keras` automatically initialize variables for you before training a model.
+
+Explicit initialization is otherwise useful because it allows you not to rerun
+potentially expensive initializers when reloading a model from a checkpoint as
+well as allowing determinism when randomly-initialized variables are shared in a
+distributed setting. 
+
+To initialize all trainable variables in one go, before training starts, call
+`tf.global_variables_initializer()`. This function returns a single operation
+responsible for initializing all variables in the
+`tf.GraphKeys.GLOBAL_VARIABLES` collection. Running this operation initializes
+all variables. For example:
+
+``` python
+session.run(tf.global_variables_initializer())
+# Now all variables are initialized.
 ```
 
-### Initialization from another Variable
+If you do need to initialize variables yourself, you can run the variable's
+initializer operation. For example:
 
-You sometimes need to initialize a variable from the initial value of another
-variable.  As the op added by `tf.global_variables_initializer()` initializes all
-variables in parallel you have to be careful when this is needed.
+``` python
+session.run(my_variable.initializer)
+```
 
-To initialize a new variable from the value of another variable use the other
-variable's `initialized_value()` property.  You can use the initialized value
-directly as the initial value for the new variable, or you can use it as any
-other tensor to compute a value for the new variable.
 
+You can also ask which variables have still not been initialized. For example,
+the following code prints the names of all variables which have not yet been
+initialized:
 
-```python
-# Create a variable with a random value.
-weights = tf.Variable(tf.random_normal([784, 200], stddev=0.35),
-                      name="weights")
-# Create another variable with the same value as 'weights'.
-w2 = tf.Variable(weights.initialized_value(), name="w2")
-# Create another variable with twice the value of 'weights'
-w_twice = tf.Variable(weights.initialized_value() * 2.0, name="w_twice")
+``` python
+print(session.run(tf.report_uninitialized_variables()))
+```
+
+
+Note that by default `tf.global_variables_initializer` does not specify the
+order in which variables are initialized. Therefore, if the initial value of a
+variable depends on another variable's value, it's likely that you'll get an
+error. Any time you use the value of a variable in a context in which not all
+variables are initialized (say, if you use a variable's value while initializing
+another variable), it is best to use `variable.initialized_value()` instead of
+`variable`:
+
+``` python
+v = tf.get_variable("v", shape=(), initializer=tf.zeros_initializer())
+w = tf.get_variable("w", initializer=v.initialized_value() + 1)
+```
+
+## Using variables
+
+To use the value of a `tf.Variable` in a TensorFlow graph, simply treat it like
+a normal `tf.Tensor`:
+
+``` python
+v = tf.get_variable("v", shape=(), initializer=tf.zeros_initializer())
+w = v + 1  # w is a tf.Tensor which is computed based on the value of v.
+           # Any time a variable is used in an expression it gets automatically
+           # converted to a tf.Tensor representing its value.
 ```
 
-### Custom Initialization
+To assign a value to a variable, use the methods `assign`, `assign_add`, and
+friends in the `tf.Variable` class. For example, here is how you can call these
+methods:
 
-The convenience function `tf.global_variables_initializer()` adds an op to
-initialize *all variables* in the model.  You can also pass an explicit list of
-variables to initialize to `tf.variables_initializer`.  See the
-@{$python/state_ops$Variables Documentation} for more options,
-including checking if variables are initialized.
+``` python
+v = tf.get_variable("v", shape=(), initializer=tf.zeros_initializer())
+assignment = v.assign_add(1)
+tf.global_variables_initializer().run()
+assignment.run()
+```
+
+Most TensorFlow optimizers have specialized ops that efficiently update the
+values of variables according to some gradient descent-like algorithm. See
+@{tf.train.Optimizer} for an explanation of how to use optimizers.
+
+Because variables are mutable it's sometimes useful to know what version of a
+variable's value is being used at any point in time. To force a re-read of the
+value of a variable after something has happened, you can use
+`tf.Variable.read_value`. For example:
+
+``` python
+v = tf.get_variable("v", shape=(), initializer=tf.zeros_initializer())
+assignment = v.assign_add(1)
+with tf.control_dependencies([assignment]):
+  w = v.read_value()  # w is guaranteed to reflect v's value after the
+                      # assign_add operation.
+```
 
 ## Saving and Restoring
 
 The easiest way to save and restore a model is to use a `tf.train.Saver` object.
 The constructor adds `save` and `restore` ops to the graph for all, or a
-specified list, of the variables in the graph.  The saver object provides
+specified list, of the variables in the graph.  The `Saver` object provides
 methods to run these ops, specifying paths for the checkpoint files to write to
 or read from.
 
-Note that to restore a model checkpoint without a graph one must first import
-the graph from the meta graph file (typical extension is `.meta`). This is
-done with @{tf.train.import_meta_graph}, which in turn returns a `Saver` from
-which one can than perform a `restore`.
+To restore a model checkpoint without a graph, you must first import the graph
+from the `MetaGraph` file (typical extension is `.meta`). Do this by calling
+@{tf.train.import_meta_graph}, which in turn returns a `Saver` from which one
+can than perform a `restore`.
 
 ### Checkpoint Files
 
-Variables are saved in binary files that, roughly, contain a map from variable
+TensorFlow saves variables in binary files that, roughly speaking, map variable
 names to tensor values.
 
 When you create a `Saver` object, you can optionally choose names for the
-variables in the checkpoint files.  By default, it uses the value of the
+variables in the checkpoint files.  By default, `Saver` uses the value of the
 @{tf.Variable.name} property for
 each variable.
 
-To understand what variables are in a checkpoint, you can use the
-[`inspect_checkpoint`](https://www.tensorflow.org/code/tensorflow/python/tools/inspect_checkpoint.py)
-library, and in particular, the `print_tensors_in_checkpoint_file` function.
+To inspect the variables in a checkpoint, you can use
+the
+[`inspect_checkpoint`](https://www.tensorflow.org/code/tensorflow/python/tools/inspect_checkpoint.py) library,
+particularly the `print_tensors_in_checkpoint_file` function.
 
 ### Saving Variables
 
-Create a `Saver` with `tf.train.Saver()` to manage all variables in
-the model.
+Create a `Saver` with `tf.train.Saver()` to manage all variables in the
+model. For example, the following snippet demonstrates how to call the
+`tf.train.Saver.save` method to save variables to a checkpoint file:
 
 ```python
 # Create some variables.
-v1 = tf.Variable(..., name="v1")
-v2 = tf.Variable(..., name="v2")
-...
+v1 = tf.get_variable("v1", shape=[3], initializer = tf.zeros_initializer)
+v2 = tf.get_variable("v2", shape=[5], initializer = tf.zeros_initializer)
+
+inc_v1 = v1.assign(v1+1)
+dec_v2 = v2.assign(v2-1)
+
 # Add an op to initialize the variables.
 init_op = tf.global_variables_initializer()
 
 # Add ops to save and restore all the variables.
 saver = tf.train.Saver()
 
-# Later, launch the model, initialize the variables, do some work, save the
+# Later, launch the model, initialize the variables, do some work, and save the
 # variables to disk.
 with tf.Session() as sess:
   sess.run(init_op)
   # Do some work with the model.
-  ..
+  inc_v1.op.run()
+  dec_v2.op.run()
   # Save the variables to disk.
   save_path = saver.save(sess, "/tmp/model.ckpt")
   print("Model saved in file: %s" % save_path)
@@ -192,14 +285,19 @@ with tf.Session() as sess:
 
 ### Restoring Variables
 
-The same `Saver` object is used to restore variables.  Note that when you
-restore variables from a file you do not have to initialize them beforehand.
+The `tf.train.Saver` object not only saves variables to checkpoint files, it
+also restores variables.  Note that when you restore variables from a file you
+do not have to initialize them beforehand. For example, the following snippet
+demonstrates how to call the `tf.train.Saver.restore` method to restore
+variables from a checkpoint file:
 
 ```python
+tf.reset_default_graph()
+
 # Create some variables.
-v1 = tf.Variable(..., name="v1")
-v2 = tf.Variable(..., name="v2")
-...
+v1 = tf.get_variable("v1", shape=[3])
+v2 = tf.get_variable("v2", shape=[5])
+
 # Add ops to save and restore all the variables.
 saver = tf.train.Saver()
 
@@ -209,50 +307,156 @@ with tf.Session() as sess:
   # Restore variables from disk.
   saver.restore(sess, "/tmp/model.ckpt")
   print("Model restored.")
-  # Do some work with the model
-  ...
+  # Check the values of the variables
+  print("v1 : %s" % v1.eval())
+  print("v2 : %s" % v2.eval())
 ```
 
+
+
 ### Choosing which Variables to Save and Restore
 
-If you do not pass any argument to `tf.train.Saver()` the saver handles all
-variables in the graph.  Each one of them is saved under the name that was
-passed when the variable was created.
+If you do not pass any argument to `tf.train.Saver()`, the saver handles all
+variables in the graph.  Each variable is saved under the name that was passed
+when the variable was created.
 
 It is sometimes useful to explicitly specify names for variables in the
 checkpoint files.  For example, you may have trained a model with a variable
-named `"weights"` whose value you want to restore in a new variable named
+named `"weights"` whose value you want to restore into a variable named
 `"params"`.
 
 It is also sometimes useful to only save or restore a subset of the variables
-used by a model.  For example, you may have trained a neural net with 5 layers,
-and you now want to train a new model with 6 layers, restoring the parameters
-from the 5 layers of the previously trained model into the first 5 layers of
-the new model.
+used by a model.  For example, you may have trained a neural net with five
+layers, and you now want to train a new model with six layers that reuses the
+existing weights of the five trained layers. You can use the saver to restore
+the weights of just the first five layers.
+
+You can easily specify the names and variables to save or load by passing to the
+`tf.train.Saver()` constructor either a list of variables (which will be stored 
+under their own names), or a Python dictionary in which keys are the names to 
+use and values are the variables to manage. 
+
+Continuing from the save/restore examples, above:
+
+```python
+tf.reset_default_graph()
+# Create some variables.
+v1 = tf.get_variable("v1", [3], initializer = tf.zeros_initializer)
+v2 = tf.get_variable("v2", [5], initializer = tf.zeros_initializer)
 
-You can easily specify the names and variables to save by passing to the
-`tf.train.Saver()` constructor a Python dictionary: keys are the
-names to use, values are the variables to manage.
+# Add ops to save and restore only `v2` using the name "v2"
+saver = tf.train.Saver({"v2": v2})
+
+# Use the saver object normally after that.
+with tf.Session() as sess:
+  # Initialize v1 since the saver will not.
+  v1.initializer.run()
+  saver.restore(sess, "/tmp/model.ckpt")
+  
+  print("v1 : %s" % v1.eval())
+  print("v2 : %s" % v2.eval())
+
+```
 
 Notes:
 
-*  You can create as many saver objects as you want if you need to save and
+*  You can create as many `Saver` objects as you want if you need to save and
    restore different subsets of the model variables.  The same variable can be
-   listed in multiple saver objects, its value is only changed when the saver
-   `restore()` method is run.
+   listed in multiple saver objects, its value is only changed when the
+   `Saver.restore()` method is run.
+
+*  If you only restore a subset of the model variables at the start of a
+   session, you have to run an initialize op for the other variables.  See
+   @{tf.variables_initializer} for more information.
+
+
+## Sharing variables
+
+TensorFlow supports two ways of sharing variables:
 
-*  If you only restore a subset of the model variables at the start
-   of a session, you have to run an initialize op for the other variables.  See
-   @{tf.variables_initializer}
-   for more information.
+ * Explicitly passing `tf.Variable` objects around.
+ * Implicitly wrapping `tf.Variable` objects within `tf.variable_scope` objects.
+
+While code which explicitly passes variables around is very clear, it is
+sometimes convenient to write TensorFlow functions that implicitly use
+variables in their implementations. Most of the functional layers from
+`tf.layer` use this approach, as well as all `tf.metrics`, and a few other
+library utilities.
+
+Variable scopes allow you to control variable reuse when calling functions which
+implicitly create and use variables. They also allow you to name your variables
+in a hierarchical and understandable way.
+
+For example, let's say we write a function to create a convolutional / relu
+layer:
 
 ```python
-# Create some variables.
-v1 = tf.Variable(..., name="v1")
-v2 = tf.Variable(..., name="v2")
-...
-# Add ops to save and restore only 'v2' using the name "my_v2"
-saver = tf.train.Saver({"my_v2": v2})
-# Use the saver object normally after that.
-...
+def conv_relu(input, kernel_shape, bias_shape):
+    # Create variable named "weights".
+    weights = tf.get_variable("weights", kernel_shape,
+        initializer=tf.random_normal_initializer())
+    # Create variable named "biases".
+    biases = tf.get_variable("biases", bias_shape,
+        initializer=tf.constant_initializer(0.0))
+    conv = tf.nn.conv2d(input, weights,
+        strides=[1, 1, 1, 1], padding='SAME')
+    return tf.nn.relu(conv + biases)
 ```
+
+This function uses short names `weights` and `biases`, which is good for
+clarity. In a real model, however, we want many such convolutional layers, and
+calling this function repeatedly would not work:
+
+``` python
+input1 = tf.random_normal([1,10,10,32])
+input2 = tf.random_normal([1,20,20,32])
+x = conv_relu(input1, kernel_shape=[5, 5, 1, 32], bias_shape=[32])
+x = conv_relu(x, kernel_shape=[5, 5, 32, 32], bias_shape = [32])  # This fails.
+```
+
+Since the desired behavior is unclear (create new variables or reuse the
+existing ones?) TensorFlow will fail. Calling `conv_relu` in different scopes,
+however, clarifies that we want to create new variables:
+
+```python
+def my_image_filter(input_images):
+    with tf.variable_scope("conv1"):
+        # Variables created here will be named "conv1/weights", "conv1/biases".
+        relu1 = conv_relu(input_images, [5, 5, 1, 32], [32])
+    with tf.variable_scope("conv2"):
+        # Variables created here will be named "conv2/weights", "conv2/biases".
+        return conv_relu(relu1, [5, 5, 32, 32], [32])
+```
+
+If you do want the variables to be shared, you have two options. First, you can
+create a scope with the same name using `reuse=True`:
+
+``` python
+with tf.variable_scope("model"):
+  output1 = my_image_filter(input1)
+with tf.variable_scope("model", reuse=True):
+  output2 = my_image_filter(input2)
+
+```
+
+You can also call `scope.reuse_variables()` to trigger a reuse:
+
+``` python
+with tf.variable_scope("model") as scope:
+  output1 = my_image_filter(input1)
+  scope.reuse_variables()
+  output2 = my_image_filter(input2)
+
+```
+
+Since depending on exact string names of scopes can feel dangerous, it's also
+possible to initialize a variable scope based on another one:
+
+``` python
+with tf.variable_scope("model") as scope:
+  output1 = my_image_filter(input1)
+with tf.variable_scope(scope, reuse=True):
+  output2 = my_image_filter(input2)
+
+```
+
diff --git a/tensorflow/docs_src/programmers_guide/version_compat.md b/tensorflow/docs_src/programmers_guide/version_compat.md
new file mode 100644
index 0000000000000000000000000000000000000000..db6d596acf73df7df837d578a8bba0fb26f127a3
--- /dev/null
+++ b/tensorflow/docs_src/programmers_guide/version_compat.md
@@ -0,0 +1,297 @@
+# TensorFlow Version Compatibility
+
+This document is for users who need backwards compatibility across different
+versions of TensorFlow (either for code or data), and for developers who want
+to modify TensorFlow while preserving compatibility.
+
+## Semantic Versioning 2.0
+
+TensorFlow follows Semantic Versioning 2.0 ([semver](http://semver.org)) for its
+public API. Each release version of TensorFlow has the form `MAJOR.MINOR.PATCH`.
+For example, TensorFlow version 1.2.3 has `MAJOR` version 1, `MINOR` version 2,
+and `PATCH` version 3. Changes to each number have the following meaning:
+
+* **MAJOR**:  Potentially backwards incompatible changes.  Code and data that
+  worked with a previous major release will not necessarily work with the new
+  release. However, in some cases existing TensorFlow graphs and checkpoints
+  may be migratable to the newer release; see
+  [Compatibility of graphs and checkpoints](#compatibility_of_graphs_and_checkpoints)
+  for details on data compatibility.
+
+* **MINOR**: Backwards compatible features, speed improvements, etc.  Code and
+  data that worked with a previous minor release *and* which depends only on the
+  public API will continue to work unchanged.  For details on what is and is
+  not the public API, see [What is covered](#what_is_covered).
+
+* **PATCH**: Backwards compatible bug fixes.
+
+For example, release 1.0.0 introduced backwards *incompatible* changes from
+release 0.12.1.  However, release 1.1.1 was backwards *compatible* with release
+1.0.0.
+
+## What is covered
+
+Only the public APIs of TensorFlow are backwards compatible across minor and
+patch versions.  The public APIs consist of
+
+* All the documented [Python](../api_docs/python) functions and classes in the
+  `tensorflow` module and its submodules, except for
+    * functions and classes in `tf.contrib`
+    * functions and classes whose names start with `_` (as these are private)
+  Note that the code in the `examples/` and `tools/` directories is not
+  reachable through the `tensorflow` Python module and is thus not covered by
+  the compatibility guarantee.
+
+  If a symbol is available through the `tensorflow` Python module or its
+  submodules, but is not documented, then it is **not** considered part of the
+  public API.
+
+* The [C API](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/c/c_api.h).
+
+* The following protocol buffer files:
+    * [`attr_value`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/attr_value.proto)
+    * [`config`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/protobuf/config.proto)
+    * [`event`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/event.proto)
+    * [`graph`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/graph.proto)
+    * [`op_def`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/op_def.proto)
+    * [`reader_base`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/reader_base.proto)
+    * [`summary`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/summary.proto)
+    * [`tensor`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/tensor.proto)
+    * [`tensor_shape`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/tensor_shape.proto)
+    * [`types`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/types.proto)
+
+## What is *not* covered
+
+Some API functions are explicitly marked as "experimental" and can change in
+backward incompatible ways between minor releases. These include:
+
+*   **Experimental APIs**: The @{tf.contrib} module and its submodules in Python
+    and any functions in the C API or fields in protocol buffers that are
+    explicitly commented as being experimental.
+
+*   **Other languages**: TensorFlow APIs in languages other than Python and C,
+    such as:
+
+  - @{$cc/guide$C++} (exposed through header files in
+    [`tensorflow/cc`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/cc)).
+  - [Java](../api_docs/java/reference/org/tensorflow/package-summary),
+  - [Go](https://godoc.org/github.com/tensorflow/tensorflow/tensorflow/go)
+
+*   **Details of composite ops:** Many public functions in Python expand to
+    several primitive ops in the graph, and these details will be part of any
+    graphs saved to disk as `GraphDef`s. These details may change for
+    minor releases. In particular, regressions tests that check for exact
+    matching between graphs are likely to break across minor releases, even
+    though the behavior of the graph should be unchanged and existing
+    checkpoints will still work.
+
+*   **Floating point numerical details:** The specific floating point values
+    computed by ops may change at any time.  Users should rely only on
+    approximate accuracy and numerical stability, not on the specific bits
+    computed. Changes to numerical formulas in minor and patch releases should
+    result in comparable or improved accuracy, with the caveat that in machine
+    learning improved accuracy of specific formulas may result in decreased
+    accuracy for the overall system.
+
+*   **Random numbers:** The specific random numbers computed by the
+    @{$python/constant_op#Random_Tensors$random ops} may change at any time.
+    Users should rely only on approximately correct distributions and
+    statistical strength, not the specific bits computed. However, we will make
+    changes to random bits rarely (or perhaps never) for patch releases.  We
+    will, of course, document all such changes.
+
+*   **Version skew in distributed Tensorflow:** Running two different versions
+    of TensorFlow in a single cluster is unsupported. There are no guarantees
+    about backwards compatibility of the wire protocol.
+
+*   **Bugs:** We reserve the right to make backwards incompatible behavior
+    (though not API) changes if the current implementation is clearly broken,
+    that is, if it contradicts the documentation or if a well-known and
+    well-defined intended behavior is not properly implemented due to a bug.
+    For example, if an optimizer claims to implement a well-known optimization
+    algorithm but does not match that algorithm due to a bug, then we will fix
+    the optimizer. Our fix may break code relying on the wrong behavior for
+    convergence. We will note such changes in the release notes.
+
+*   **Error messages:** We reserve the right to change the text of error
+    messages. In addition, the type of an error may change unless the type is
+    specified in the documentation. For example, a function documented to
+    raise an `InvalidArgument` exception will continue to
+    raise `InvalidArgument`, but the human-readable message contents can change.
+
+## Compatibility of graphs and checkpoints
+
+You'll sometimes need to preserve graphs and checkpoints.
+Graphs describe the data flow of ops to be run during training and
+inference, and checkpoints contain the saved tensor values of variables in a
+graph.
+
+Many TensorFlow users save graphs and trained models to disk for
+later evaluation or additional training, but end up running their saved graphs
+or models on a later release. In compliance with semver, any graph or checkpoint
+written out with one version of TensorFlow can be loaded and evaluated with a
+later version of TensorFlow with the same major release.  However, we will
+endeavor to preserve backwards compatibility even across major releases when
+possible, so that the serialized files are usable over long periods of time.
+
+
+Graphs are serialized via the `GraphDef` protocol buffer.  To facilitate (rare)
+backwards incompatible changes to graphs, each `GraphDef` has a version number
+separate from the TensorFlow version.  For example, `GraphDef` version 17
+deprecated the `inv` op in favor of `reciprocal`.  The semantics are:
+
+* Each version of TensorFlow supports an interval of `GraphDef` versions.  This
+  interval will be constant across patch releases, and will only grow across
+  minor releases.  Dropping support for a `GraphDef` version will only occur
+  for a major release of TensorFlow.
+
+* Newly created graphs are assigned the latest `GraphDef` version number.
+
+* If a given version of TensorFlow supports the `GraphDef` version of a graph,
+  it will load and evaluate with the same behavior as the TensorFlow version
+  used to generate it (except for floating point numerical details and random
+  numbers), regardless of the major version of TensorFlow.  In particular, all
+  checkpoint files will be compatible.
+
+* If the `GraphDef` *upper* bound is increased to X in a (minor) release, there
+  will be at least six months before the *lower* bound is increased to X.  For
+  example (we're using hypothetical version numbers here):
+    * TensorFlow 1.2 might support `GraphDef` versions 4 to 7.
+    * TensorFlow 1.3 could add `GraphDef` version 8 and support versions 4 to 8.
+    * At least six months later, TensorFlow 2.0.0 could drop support for
+      versions 4 to 7, leaving version 8 only.
+
+Finally, when support for a `GraphDef` version is dropped, we will attempt to
+provide tools for automatically converting graphs to a newer supported
+`GraphDef` version.
+
+## Graph and checkpoint compatibility when extending TensorFlow
+
+This section is relevant only when making incompatible changes to the `GraphDef`
+format, such as when adding ops, removing ops, or changing the functionality
+of existing ops.  The previous section should suffice for most users.
+
+### Backward and partial forward compatibility
+
+Our versioning scheme has three requirements:
+
+*   **Backward compatibility** to support loading graphs and checkpoints
+    created with older versions of TensorFlow.
+*   **Forward compatibility** to support scenarios where the producer of a
+    graph or checkpoint is upgraded to a newer version of TensorFlow before
+    the consumer.
+*   Enable evolving TensorFlow in incompatible ways. For example, removing Ops,
+    adding attributes, and removing attributes.
+
+Note that while the `GraphDef` version mechanism is separate from the TensorFlow
+version, backwards incompatible changes to the `GraphDef` format are still
+restricted by Semantic Versioning.  This means functionality can only be removed
+or changed between `MAJOR` versions of TensorFlow (such as `1.7` to `2.0`).
+Additionally, forward compatibility is enforced within Patch releases (`1.x.1`
+to `1.x.2` for example).
+
+To achieve backward and forward compatibility and to know when to enforce changes
+in formats, graphs and checkpoints have metadata that describes when they
+were produced. The sections below detail the TensorFlow implementation and
+guidelines for evolving `GraphDef` versions.
+
+### Independent data version schemes
+
+There are different data versions for graphs and checkpoints. The two data
+formats evolve at different rates from each other and also at different rates
+from TensorFlow. Both versioning systems are defined in
+[`core/public/version.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/public/version.h).
+Whenever a new version is added, a note is added to the header detailing what
+changed and the date.
+
+### Data, producers, and consumers
+
+We distinguish between the following kinds of data version information:
+* **producers**: binaries that produce data.  Producers have a version
+  (`producer`) and a minimum consumer version that they are compatible with
+  (`min_consumer`).
+* **consumers**: binaries that consume data.  Consumers have a version
+  (`consumer`) and a minimum producer version that they are compatible with
+  (`min_producer`).
+
+Each piece of versioned data has a [`VersionDef
+versions`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/versions.proto)
+field which records the `producer` that made the data, the `min_consumer`
+that it is compatible with, and a list of `bad_consumers` versions that are
+disallowed.
+
+By default, when a producer makes some data, the data inherits the producer's
+`producer` and `min_consumer` versions. `bad_consumers` can be set if specific
+consumer versions are known to contain bugs and must be avoided. A consumer can
+accept a piece of data if the following are all true:
+
+*   `consumer` >= data's `min_consumer`
+*   data's `producer` >= consumer's `min_producer`
+*   `consumer` not in data's `bad_consumers`
+
+Since both producers and consumers come from the same TensorFlow code base,
+[`core/public/version.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/public/version.h)
+contains a main data version which is treated as either `producer` or
+`consumer` depending on context and both `min_consumer` and `min_producer`
+(needed by producers and consumers, respectively). Specifically,
+
+*   For `GraphDef` versions, we have `TF_GRAPH_DEF_VERSION`,
+    `TF_GRAPH_DEF_VERSION_MIN_CONSUMER`, and
+    `TF_GRAPH_DEF_VERSION_MIN_PRODUCER`.
+*   For checkpoint versions, we have `TF_CHECKPOINT_VERSION`,
+    `TF_CHECKPOINT_VERSION_MIN_CONSUMER`, and
+    `TF_CHECKPOINT_VERSION_MIN_PRODUCER`.
+
+### Evolving GraphDef versions
+
+This section explains how to use this versioning mechanism to make different
+types of changes to the `GraphDef` format.
+
+#### Add an Op
+
+Add the new Op to both consumers and producers at the same time, and do not
+change any `GraphDef` versions. This type of change is automatically
+backward compatible, and does not impact forward compatibility plan since
+existing producer scripts will not suddenly use the new functionality.
+
+#### Add an Op and switch existing Python wrappers to use it
+
+1.  Implement new consumer functionality and increment the `GraphDef` version.
+2.  If it is possible to make the wrappers use the new functionality only in
+    cases that did not work before, the wrappers can be updated now.
+3.  Change Python wrappers to use the new functionality. Do not increment
+    `min_consumer`, since models that do not use this Op should not break.
+
+#### Remove or restrict an Op's functionality
+
+1.  Fix all producer scripts (not TensorFlow itself) to not use the banned Op or
+    functionality.
+2.  Increment the `GraphDef` version and implement new consumer functionality
+    that bans the removed Op or functionality for GraphDefs at the new version
+    and above. If possible, make TensorFlow stop producing `GraphDefs` with the
+    banned functionality. To do so, add the
+    [`REGISTER_OP(...).Deprecated(deprecated_at_version,
+    message)`](https://github.com/tensorflow/tensorflow/blob/b289bc7a50fc0254970c60aaeba01c33de61a728/tensorflow/core/ops/array_ops.cc#L1009).
+3.  Wait for a major release for backward compatibility purposes.
+4.  Increase `min_producer` to the GraphDef version from (2) and remove the
+    functionality entirely.
+
+#### Change an Op's functionality
+
+1.  Add a new similar Op named `SomethingV2` or similar and go through the
+    process of adding it and switching existing Python wrappers to use it, which
+    may take three weeks if forward compatibility is desired.
+2.  Remove the old Op (Can only take place with a major version change due to
+    backward compatibility).
+3.  Increase `min_consumer` to rule out consumers with the old Op, add back the
+    old Op as an alias for `SomethingV2`, and go through the process to switch
+    existing Python wrappers to use it.
+4.  Go through the process to remove `SomethingV2`.
+
+#### Ban a single unsafe consumer version
+
+1.  Bump the `GraphDef` version and add the bad version to `bad_consumers` for
+    all new GraphDefs. If possible, add to `bad_consumers` only for GraphDefs
+    which contain a certain Op or similar.
+2.  If existing consumers have the bad version, push them out as soon as
+    possible.
diff --git a/tensorflow/docs_src/programmers_guide/version_semantics.md b/tensorflow/docs_src/programmers_guide/version_semantics.md
deleted file mode 100644
index cee3b105de4c2faba801c5ea4e01765391d1173b..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/programmers_guide/version_semantics.md
+++ /dev/null
@@ -1,161 +0,0 @@
-# TensorFlow Version Semantics
-
-## Semantic Versioning 2.0
-
-TensorFlow follows Semantic Versioning 2.0 ([semver](http://semver.org)) for its
-public API. Each release version of TensorFlow has the form `MAJOR.MINOR.PATCH`.
-Changes to the each number have the following meaning:
-
-* **MAJOR**:  Backwards incompatible changes.  Code and data that worked with
-  a previous major release will not necessarily work with a new release.
-  However, in some cases existing TensorFlow data (graphs, checkpoints, and
-  other protobufs) may be migratable to the newer release; see below for details
-  on data compatibility.
-
-* **MINOR**: Backwards compatible features, speed improvements, etc.  Code and
-  data that worked with a previous minor release *and* which depends only the
-  public API will continue to work unchanged.  For details on what is and is
-  not the public API, see below.
-
-* **PATCH**: Backwards compatible bug fixes.
-
-## What is covered
-
-Only the public APIs of TensorFlow are backwards compatible across minor and
-patch versions.  The public APIs consist of
-
-* The documented public [Python](../api_docs/python) API, excluding `tf.contrib`.
-  This includes all public functions and classes (whose names do not start with
-  `_`) in the tensorflow module and its submodules. Note that the code in
-  the `examples/` to `tools/` directories is not reachable through the
-  tensorflow Python module and is thus not covered by the compatibility
-  guarantee.
-
-  If a symbol is available through the tensorflow Python module or its
-  submodules, but is not documented, then it is _not_ considered part of the
-  public API.
-
-* The [C API](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/c/c_api.h).
-
-* The following protocol buffer files:
-  [`attr_value`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/attr_value.proto),
-  [`config`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/protobuf/config.proto),
-  [`event`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/event.proto),
-  [`graph`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/graph.proto),
-  [`op_def`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/op_def.proto),
-  [`reader_base`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/reader_base.proto),
-  [`summary`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/summary.proto),
-  [`tensor`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/tensor.proto),
-  [`tensor_shape`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/tensor_shape.proto),
-  and [`types`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/types.proto).
-
-## What is *not* covered
-
-Some API functions are explicitly marked as "experimental" and can change in
-backward incompatible ways between minor releases. These include:
-
-*   **Experimental APIs**: The @{tf.contrib} module and its submodules in Python
-    and any functions in the C API or fields in protocol buffers that are
-    explicitly commented as being experimental.
-
-*   **Other languages**: TensorFlow APIs in languages other than Python and C,
-    such as:
-
-  - @{$cc/guide$C++} (exposed through header files in
-    [`tensorflow/cc`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/cc)).
-  - [Java](../api_docs/java/reference/org/tensorflow/package-summary), and
-  - [Go](https://godoc.org/github.com/tensorflow/tensorflow/tensorflow/go)
-
-*   **Details of composite ops:** Many public functions in Python expand to
-    several primitive ops in the graph, and these details will be part of any
-    graphs saved to disk as `GraphDef`s. These details are allowed to change for
-    minor releases. In particular, regressions tests that check for exact
-    matching between graphs are likely to break across minor releases, even
-    though the behavior of the graph should be unchanged and existing
-    checkpoints will still work.
-
-*   **Floating point numerical details:** The specific floating point values
-    computed by ops may change at any time: users should rely only on
-    approximate accuracy and numerical stability, not on the specific bits
-    computed. Changes to numerical formulas in minor and patch releases should
-    result in comparable or improved accuracy, with the caveat that in machine
-    learning improved accuracy of specific formulas may result in worse accuracy
-    for the overall system.
-
-*   **Random numbers:** The specific random numbers computed by the
-    @{$python/constant_op#Random_Tensors$random ops} may change at any time:
-    users should rely only on approximately correct distributions and
-    statistical strength, not the specific bits computed. However, we will make
-    changes to random bits rarely and ideally never for patch releases, and all
-    such intended changes will be documented.
-
-*   **Distributed Tensorflow:** Running 2 different versions of TensorFlow in a
-    single cluster is unsupported. There are no guarantees about backwards
-    compatibility of the wire protocol.
-
-*   **Bugs:** We reserve the right to make backwards incompatible behavior
-    (though not API) changes if the current implementation is clearly broken,
-    i.e., if it is contradicting the documentation, or if a well-known and
-    well-defined intended behavior is not properly implemented due to a bug.
-    For example, if an optimizer claims to implement a well-known optimization
-    algorithm but, due to a bug, does not match that algorithm we will fix the
-    optimizer. This may break code relying on the wrong behavior for
-    convergence. We will note such changes in the release notes.
-
-*   **Error messages:** We reserve the right to change the text of error
-    messages. In addition, the type of an error may change unless the type is
-    specified in the documentation. For example, a function that says in some
-    condition it will raise an `InvalidArgument` exception, it will continue to
-    raise `InvalidArgument`, but the human-readable message contents can change.
-
-
-Furthermore, any API methods marked "deprecated" in the 1.0 release can
-be deleted in any subsequent minor release.
-
-## Compatibility for Graphs and Checkpoints
-
-Many users of TensorFlow will be saving graphs and trained models to disk for
-later evaluation or more training, often changing versions of TensorFlow in the
-process.  First, following semver, any graph or checkpoint written out with one
-version of TensorFlow can be loaded and evaluated with a later version of
-TensorFlow with the same major release.  However, we will endeavor to preserve
-backwards compatibility even across major releases when possible, so that the
-serialized files are usable over long periods of time.
-
-There are two main classes of saved TensorFlow data: graphs and checkpoints.
-Graphs describe the data flow graphs of ops to be run during training and
-inference, and checkpoints contain the saved tensor values of variables in a
-graph.
-
-Graphs are serialized via the `GraphDef` protocol buffer.  To facilitate (rare)
-backwards incompatible changes to graphs, each `GraphDef` has an integer version
-separate from the TensorFlow version.  The semantics are:
-
-* Each version of TensorFlow supports an interval of `GraphDef` versions.  This
-  interval with be constant across patch releases, and will only grow across
-  minor releases.  Dropping support for a `GraphDef` version will only occur
-  for a major release of TensorFlow.
-
-* Newly created graphs use the newest `GraphDef` version.
-
-* If a given version of TensorFlow supports the `GraphDef` version of a graph,
-  it will load and evaluate with the same behavior as when it was written out
-  (except for floating point numerical details and random numbers), regardless
-  of the major version of TensorFlow.  In particular, all checkpoint files will
-  be compatible.
-
-* If the `GraphDef` upper bound is increased to X in a (minor) release, there
-  will be at least six months before the lower bound is increased to X.
-
-For example (numbers and versions hypothetical), TensorFlow 1.2 might support
-`GraphDef` versions 4 to 7.  TensorFlow 1.3 could add `GraphDef` version 8 and
-support versions 4 to 8.  At least six months later, TensorFlow 2.0.0 could drop
-support for versions 4 to 7, leaving version 8 only.
-
-Finally, when support for a `GraphDef` version is dropped, we will attempt to
-provide tools for automatically converting graphs to a newer supported
-`GraphDef` version.
-
-For developer-level details about `GraphDef` versioning, including how to evolve
-the versions to account for changes, see
-@{$data_versions$TensorFlow Data Versioning}.
diff --git a/tensorflow/docs_src/tutorials/deep_cnn.md b/tensorflow/docs_src/tutorials/deep_cnn.md
index b0617326ff32ce8d219985d8eb1baa1c0ffc0cc4..a9e9dda12b9f96603b058c9ff9d719e8f4408c91 100644
--- a/tensorflow/docs_src/tutorials/deep_cnn.md
+++ b/tensorflow/docs_src/tutorials/deep_cnn.md
@@ -178,7 +178,7 @@ the network architecture to return normalized predictions using
 @{tf.nn.softmax}.
 
 The `inputs()` and `inference()` functions provide all the components
-necessary to perform evaluation on a model. We now shift our focus towards
+necessary to perform an evaluation of a model. We now shift our focus towards
 building operations for training a model.
 
 > **EXERCISE:** The model architecture in `inference()` differs slightly from
@@ -417,7 +417,7 @@ scope indicating that they should be run on the first GPU.
 All variables are pinned to the CPU and accessed via
 @{tf.get_variable}
 in order to share them in a multi-GPU version.
-See how-to on @{$variable_scope$Sharing Variables}.
+See how-to on @{$variables$Sharing Variables}.
 
 ### Launching and Training the Model on Multiple GPU cards
 
diff --git a/tensorflow/docs_src/tutorials/image_retraining.md b/tensorflow/docs_src/tutorials/image_retraining.md
index 85e6ec76dc4ed966492e404d0c3ab59824f41413..b0e715edcb2b0390d7a1d66626d2e856e0ee2d28 100644
--- a/tensorflow/docs_src/tutorials/image_retraining.md
+++ b/tensorflow/docs_src/tutorials/image_retraining.md
@@ -137,7 +137,7 @@ Once TensorBoard is running, navigate your web browser to `localhost:6006` to vi
 
 The script will log TensorBoard summaries to `/tmp/retrain_logs` by default. You can change the directory with the `--summaries_dir` flag.
 
-The [TensorBoard README](https://www.tensorflow.org/code/tensorflow/tensorboard/README.md) has a lot more information on TensorBoard usage, including tips & tricks, and debugging information.
+The [TensorBoard's GitHub](https://github.com/tensorflow/tensorboard) has a lot more information on TensorBoard usage, including tips & tricks, and debugging information.
 
 ## Using the Retrained Model
 
@@ -170,6 +170,10 @@ above
 [`label_image` script](https://www.tensorflow.org/code/tensorflow/examples/image_retraining/label_image.py)
 is a reasonable starting point.
 
+If you find the default Inception v3 model is too large or slow for your
+application, take a look at the [Other Model Architectures section](/tutorials/image_retraining#other_model_architectures)
+below for options to speed up and slim down your network.
+
 ## Training on Your Own Categories
 
 If you've managed to get the script working on the flower example images, you
@@ -328,3 +332,43 @@ errors in the input data set, such as mislabeled, low-quality, or ambiguous
 images. However, one should generally avoid point-fixing individual errors in
 the test set, since they are likely to merely reflect more general problems in
 the (much larger) training set.
+
+## Other Model Architectures
+
+By default the script uses a pretrained version of the Inception v3 model
+architecture. This is a good place to start because it provides high accuracy
+results, but if you intend to deploy your model on mobile devices or other
+resource-constrained environments you may want to trade off a little accuracy
+for much smaller file sizes or faster speeds. To help with that, the
+[retrain.py script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/image_retraining/retrain.py)
+supports 32 different variations on the [Mobilenet architecture](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html).
+
+These are a little less precise than Inception v3, but can result in far
+smaller file sizes (down to less than a megabyte) and can be many times faster
+to run. To train with one of these models, pass in the `--architecture` flag,
+for example:
+
+```
+python tensorflow/examples/image_retraining/retrain.py \
+    --image_dir ~/flower_photos --architecture mobilenet_0.25_128_quantized
+```
+
+This will create a 941KB model file in `/tmp/output_graph.pb`, with 25% of the
+parameters of the full Mobilenet, taking 128x128 sized input images, and with
+its weights quantized down to eight bits on disk. You can choose '1.0', '0.75',
+'0.50', or '0.25' to control the number of weight parameters, and so the file
+size (and to some extent the speed), '224', '192', '160', or '128' for the input
+image size, with smaller sizes giving faster speeds, and an optional
+'_quantized' at the end to indicate whether the file should contain 8-bit or
+32-bit float weights.
+
+The speed and size advantages come at a loss to accuracy of course, but for many
+purposes this isn't critical. They can also be somewhat offset with improved
+training data. For example, training with distortions allows me to get above 80%
+accuracy on the flower data set even with the 0.25/128/quantized graph above.
+
+If you're going to be using the Mobilenet models in label_image or your own
+programs, you'll need to feed in an image of the specified size converted to a
+float range into the 'input' tensor. Typically 24-bit images are in the range
+[0,255], and you must convert them to the [-1,1] float range expected by the
+model with the formula  `(image - 128.)/128.`.
diff --git a/tensorflow/docs_src/tutorials/kernel_methods.md b/tensorflow/docs_src/tutorials/kernel_methods.md
index fbf1afc4ab4359cd473e74674a38cd85381e21d1..8506b5228e7258e55a120c52e4c014bfc407f1b9 100644
--- a/tensorflow/docs_src/tutorials/kernel_methods.md
+++ b/tensorflow/docs_src/tutorials/kernel_methods.md
@@ -22,7 +22,7 @@ TensorFlow will provide support for sparse features at a later release.
 
 This tutorial uses [tf.contrib.learn](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn)
 (TensorFlow's high-level Machine Learning API) Estimators for our ML models.
-If you are not familiar with this API, [tf.contrib.learn Quickstart](https://www.tensorflow.org/get_started/tflearn)
+If you are not familiar with this API, [tf.estimator Quickstart](https://www.tensorflow.org/get_started/estimator)
 is a good place to start. We will use the MNIST dataset. The tutorial consists
 of the following steps:
 
diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index 0fdfcf5d2a2b7d9f654882d116b3de21e162784d..acf33afe6d3043d980060fc5179134696b2d667e 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -27,9 +27,6 @@ from __future__ import print_function
 import numpy as np
 import tensorflow as tf
 
-from tensorflow.contrib import learn
-from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
-
 tf.logging.set_verbosity(tf.logging.INFO)
 
 # Our application logic will be added here
@@ -122,15 +119,14 @@ Open `cnn_mnist.py` and add the following `cnn_model_fn` function, which
 conforms to the interface expected by TensorFlow's Estimator API (more on this
 later in [Create the Estimator](#create-the-estimator)). `cnn_mnist.py` takes
 MNIST feature data, labels, and
-@{tf.contrib.learn.ModeKeys$model mode} (`TRAIN`, `EVAL`,
-`INFER`) as arguments; configures the CNN; and returns predictions, loss, and a
-training operation:
+@{tf.estimator.ModeKeys$model mode} (`TRAIN`, `EVAL`, `PREDICT`) as arguments;
+configures the CNN; and returns predictions, loss, and a training operation:
 
 ```python
 def cnn_model_fn(features, labels, mode):
   """Model function for CNN."""
   # Input Layer
-  input_layer = tf.reshape(features, [-1, 28, 28, 1])
+  input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])
 
   # Convolutional Layer #1
   conv1 = tf.layers.conv2d(
@@ -156,39 +152,41 @@ def cnn_model_fn(features, labels, mode):
   pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
   dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)
   dropout = tf.layers.dropout(
-      inputs=dense, rate=0.4, training=mode == learn.ModeKeys.TRAIN)
+      inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
 
   # Logits Layer
   logits = tf.layers.dense(inputs=dropout, units=10)
 
-  loss = None
-  train_op = None
+  predictions = {
+      # Generate predictions (for PREDICT and EVAL mode)
+      "classes": tf.argmax(input=logits, axis=1),
+      # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
+      # `logging_hook`.
+      "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
+  }
+
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
 
   # Calculate Loss (for both TRAIN and EVAL modes)
-  if mode != learn.ModeKeys.INFER:
-    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
-    loss = tf.losses.softmax_cross_entropy(
-        onehot_labels=onehot_labels, logits=logits)
+  onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
+  loss = tf.losses.softmax_cross_entropy(
+      onehot_labels=onehot_labels, logits=logits)
 
   # Configure the Training Op (for TRAIN mode)
-  if mode == learn.ModeKeys.TRAIN:
-    train_op = tf.contrib.layers.optimize_loss(
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
+    train_op = optimizer.minimize(
         loss=loss,
-        global_step=tf.contrib.framework.get_global_step(),
-        learning_rate=0.001,
-        optimizer="SGD")
-
-  # Generate Predictions
-  predictions = {
-      "classes": tf.argmax(
-          input=logits, axis=1),
-      "probabilities": tf.nn.softmax(
-          logits, name="softmax_tensor")
-  }
-
-  # Return a ModelFnOps object
-  return model_fn_lib.ModelFnOps(
-      mode=mode, predictions=predictions, loss=loss, train_op=train_op)
+        global_step=tf.train.get_global_step())
+    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
+
+  # Add evaluation metrics (for EVAL mode)
+  eval_metric_ops = {
+      "accuracy": tf.metrics.accuracy(
+          labels=labels, predictions=predictions["classes"])}
+  return tf.estimator.EstimatorSpec(
+      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
 ```
 
 The following sections (with headings corresponding to each code block above)
@@ -222,18 +220,18 @@ To convert our input feature map (`features`) to this shape, we can perform the
 following `reshape` operation:
 
 ```python
-input_layer = tf.reshape(features, [-1, 28, 28, 1])
+input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])
 ```
 
 Note that we've indicated `-1` for batch size, which specifies that this
 dimension should be dynamically computed based on the number of input values in
-`features`, holding the size of all other dimensions constant. This allows us to
-treat `batch_size` as a hyperparameter that we can tune. For example, if we feed
-examples into our model in batches of 5, `features` will contain 3,920 values
-(one value for each pixel in each image), and `input_layer` will have a shape of
-`[5, 28, 28, 1]`. Similarly, if we feed examples in batches of 100, `features`
-will contain 78,400 values, and `input_layer` will have a shape of `[100, 28,
-28, 1]`.
+`features["x"]`, holding the size of all other dimensions constant. This allows
+us to treat `batch_size` as a hyperparameter that we can tune. For example, if
+we feed examples into our model in batches of 5, `features["x"]` will contain
+3,920 values (one value for each pixel in each image), and `input_layer` will
+have a shape of `[5, 28, 28, 1]`. Similarly, if we feed examples in batches of
+100, `features["x"]` will contain 78,400 values, and `input_layer` will have a
+shape of `[100, 28, 28, 1]`.
 
 ### Convolutional Layer #1
 
@@ -386,7 +384,7 @@ to our dense layer, using the `dropout` method in `layers`:
 
 ```python
 dropout = tf.layers.dropout(
-    inputs=dense, rate=0.4, training=mode == learn.ModeKeys.TRAIN)
+    inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
 ```
 
 Again, `inputs` specifies the input tensor, which is the output tensor from our
@@ -415,6 +413,54 @@ logits = tf.layers.dense(inputs=dropout, units=10)
 Our final output tensor of the CNN, `logits`, has shape
 <code>[<em>batch_size</em>, 10]</code>.
 
+### Generate Predictions {#generate_predictions}
+
+The logits layer of our model returns our predictions as raw values in a
+<code>[<em>batch_size</em>, 10]</code>-dimensional tensor. Let's convert these
+raw values into two different formats that our model function can return:
+
+*   The **predicted class** for each example: a digit from 0–9.
+*   The **probabilities** for each possible target class for each example: the
+    probability that the example is a 0, is a 1, is a 2, etc.
+
+For a given example, our predicted class is the element in the corresponding row
+of the logits tensor with the highest raw value. We can find the index of this
+element using the @{tf.argmax}
+function:
+
+```python
+tf.argmax(input=logits, axis=1)
+```
+
+The `input` argument specifies the tensor from which to extract maximum
+values—here `logits`. The `axis` argument specifies the axis of the `input`
+tensor along which to find the greatest value. Here, we want to find the largest
+value along the dimension with index of 1, which corresponds to our predictions
+(recall that our logits tensor has shape <code>[<em>batch_size</em>,
+10]</code>).
+
+We can derive probabilities from our logits layer by applying softmax activation
+using @{tf.nn.softmax}:
+
+```python
+tf.nn.softmax(logits, name="softmax_tensor")
+```
+
+> Note: We use the `name` argument to explicitly name this operation
+> `softmax_tensor`, so we can reference it later. (We'll set up logging for the
+> softmax values in ["Set Up a Logging Hook"](#set-up-a-logging-hook).
+
+We compile our predictions in a dict, and return an `EstimatorSpec` object:
+
+```python
+predictions = {
+    "classes": tf.argmax(input=logits, axis=1),
+    "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
+}
+if mode == tf.estimator.ModeKeys.PREDICT:
+  return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
+```
+
 ### Calculate Loss {#calculating-loss}
 
 For both training and evaluation, we need to define a
@@ -426,14 +472,9 @@ as the loss metric. The following code calculates cross entropy when the model
 runs in either `TRAIN` or `EVAL` mode:
 
 ```python
-loss = None
-train_op = None
-
-# Calculate loss for both TRAIN and EVAL modes
-if mode != learn.ModeKeys.INFER:
-  onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
+loss = tf.losses.softmax_cross_entropy(
+    onehot_labels=onehot_labels, logits=logits)
 ```
 
 Let's take a closer look at what's happening above.
@@ -474,90 +515,42 @@ predictions from our logits layer. `tf.losses.softmax_cross_entropy()` takes
 
 ```python
 loss = tf.losses.softmax_cross_entropy(
-        onehot_labels=onehot_labels, logits=logits)
+    onehot_labels=onehot_labels, logits=logits)
 ```
 
 ### Configure the Training Op
 
 In the previous section, we defined loss for our CNN as the softmax
 cross-entropy of the logits layer and our labels. Let's configure our model to
-optimize this loss value during training, using the
-@{tf.contrib.layers.optimize_loss}
-method in `tf.contrib.layers`. We'll use a learning rate of 0.001 and
+optimize this loss value during training. We'll use a learning rate of 0.001 and
 [stochastic gradient descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent)
 as the optimization algorithm:
 
 ```python
-# Configure the Training Op (for TRAIN mode)
-if mode == learn.ModeKeys.TRAIN:
-    train_op = tf.contrib.layers.optimize_loss(
-        loss=loss,
-        global_step=tf.contrib.framework.get_global_step(),
-        learning_rate=0.001,
-        optimizer="SGD")
+if mode == tf.estimator.ModeKeys.TRAIN:
+  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
+  train_op = optimizer.minimize(
+      loss=loss,
+      global_step=tf.train.get_global_step())
+  return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
 ```
 
 > Note: For a more in-depth look at configuring training ops for Estimator model
 > functions, see @{$estimators#defining-the-training-op-for-the-model$"Defining
 > the training op for the model"} in the @{$estimators$"Creating Estimations in
-> tf.contrib.learn"} tutorial.
-
-### Generate Predictions {#generate_predictions}
+> tf.estimator"} tutorial.
 
-The logits layer of our model returns our predictions as raw values in a
-<code>[<em>batch_size</em>, 10]</code>-dimensional tensor. Let's convert these
-raw values into two different formats that our model function can return:
-
-*   The **predicted class** for each example: a digit from 0–9.
-*   The **probabilities** for each possible target class for each example: the
-    probability that the example is a 0, is a 1, is a 2, etc.
+### Add evaluation metrics
 
-For a given example, our predicted class is the element in the corresponding row
-of the logits tensor with the highest raw value. We can find the index of this
-element using the @{tf.argmax}
-function:
+To add accuracy metric in our model, we define `eval_metric_ops` dict in EVAL
+mode as follows:
 
 ```python
-tf.argmax(input=logits, axis=1)
-```
-
-The `input` argument specifies the tensor from which to extract maximum
-values—here `logits`. The `axis` argument specifies the axis of the `input`
-tensor along which to find the greatest value. Here, we want to find the largest
-value along the dimension with index of 1, which corresponds to our predictions
-(recall that our logits tensor has shape <code>[<em>batch_size</em>,
-10]</code>).
-
-We can derive probabilities from our logits layer by applying softmax activation
-using @{tf.nn.softmax}:
-
-```python
-tf.nn.softmax(logits, name="softmax_tensor")
-```
-
-> Note: We use the `name` argument to explicitly name this operation
-> `softmax_tensor`, so we can reference it later. (We'll set up logging for the
-> softmax values in ["Set Up a Logging Hook"](#set-up-a-logging-hook).
-
-We compile our predictions in a dict as follows:
-
-```python
-predictions = {
-    "classes": tf.argmax(
-        input=logits, axis=1),
-    "probabilities": tf.nn.softmax(
-        logits, name="softmax_tensor")
-}
-```
-
-Finally, now that we've got our `predictions`, `loss`, and `train_op`, we can
-return them, along with our `mode` argument, in a
-@{tf.contrib.learn.ModelFnOps} object:
-
-```python
-# Return a ModelFnOps object
-return model_fn_lib.ModelFnOps(
-    mode=mode, predictions=predictions, loss=loss, train_op=train_op)
+eval_metric_ops = {
+    "accuracy": tf.metrics.accuracy(
+        labels=labels, predictions=predictions["classes"])}
+return tf.estimator.EstimatorSpec(
+    mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
 ```
 
 ## Training and Evaluating the CNN MNIST Classifier {#training_and_evaluating_the_cnn_mnist_classifier}
@@ -573,7 +566,7 @@ First, let's load our training and test data. Add a `main()` function to
 ```python
 def main(unused_argv):
   # Load training and eval data
-  mnist = learn.datasets.load_dataset("mnist")
+  mnist = tf.contrib.learn.datasets.load_dataset("mnist")
   train_data = mnist.train.images # Returns np.array
   train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
   eval_data = mnist.test.images # Returns np.array
@@ -596,19 +589,19 @@ to `main()`:
 
 ```python
 # Create the Estimator
-mnist_classifier = learn.Estimator(
-      model_fn=cnn_model_fn, model_dir="/tmp/mnist_convnet_model")
+mnist_classifier = tf.estimator.Estimator(
+    model_fn=cnn_model_fn, model_dir="/tmp/mnist_convnet_model")
 ```
 
 The `model_fn` argument specifies the model function to use for training,
-evaluation, and inference; we pass it the `cnn_model_fn` we created in
+evaluation, and prediction; we pass it the `cnn_model_fn` we created in
 ["Building the CNN MNIST Classifier."](#building-the-cnn-mnist-classifier) The
 `model_dir` argument specifies the directory where model data (checkpoints) will
 be saved (here, we specify the temp directory `/tmp/mnist_convnet_model`, but
 feel free to change to another directory of your choice).
 
 > Note: For an in-depth walkthrough of the TensorFlow `Estimator` API, see the
-> tutorial @{$estimators$"Creating Estimators in tf.contrib.learn."}
+> tutorial @{$estimators$"Creating Estimators in tf.estimator."}
 
 ### Set Up a Logging Hook {#set_up_a_logging_hook}
 
@@ -643,65 +636,54 @@ should be logged after every 50 steps of training.
 
 ### Train the Model
 
-Now we're ready to train our model, which we can do by calling `fit()` on
-`mnist_classifier`. Add the following to `main()`:
+Now we're ready to train our model, which we can do by creating `train_input_fn`
+ans calling `train()` on `mnist_classifier`. Add the following to `main()`:
 
 ```python
 # Train the model
-mnist_classifier.fit(
-    x=train_data,
+train_input_fn = tf.estimator.inputs.numpy_input_fn(
+    x={"x": train_data},
     y=train_labels,
     batch_size=100,
+    num_epochs=None,
+    shuffle=True)
+mnist_classifier.train(
+    input_fn=train_input_fn,
     steps=20000,
-    monitors=[logging_hook])
+    hooks=[logging_hook])
 ```
 
-In the `fit` call, we pass the training feature data and labels to `x` and `y`,
-respectively. We set a `batch_size` of `100` (which means that the model will
-train on minibatches of 100 examples at each step), and `steps` of `20000`
+In the `numpy_input_fn` call, we pass the training feature data and labels to
+`x` (as a dict) and `y`, respectively. We set a `batch_size` of `100` (which
+means that the model will train on minibatches of 100 examples at each step).
+`num_epochs=None` means that the model will train until the specified number of
+steps is reached. We also set `shuffle=True` to shuffle the training data.
+In the `train` call, we set `steps=20000`
 (which means the model will train for 20,000 steps total). We pass our
-`logging_hook` to the `monitors` argument, so that it will be triggered during
+`logging_hook` to the `hooks` argument, so that it will be triggered during
 training.
 
 ### Evaluate the Model
 
 Once training is complete, we want to evaluate our model to determine its
-accuracy on the MNIST test set. To set up the accuracy metric for our model, we
-need to create a metrics dict with a @{tf.contrib.learn.MetricSpec}
-that calculates accuracy. Add the following to `main()`:
-
-```python
-# Configure the accuracy metric for evaluation
-metrics = {
-    "accuracy":
-        learn.MetricSpec(
-            metric_fn=tf.metrics.accuracy, prediction_key="classes"),
-}
-```
-
-We create our `MetricSpec`s with the following two arguments:
-
-*   `metric_fn`. The function that calculates and returns the value of our
-    metric. Here, we can use the predefined `accuracy` function in the
-    @{tf.metrics} module.
-*   `prediction_key`. The key of the tensor that contains the predictions
-    returned by the model function. Here, because we're building a
-    classification model, the prediction key is `"classes"`, which we specified
-    back in ["Generate Predictions."](#generate_predictions)
-
-Now that we've set up our `metrics` dict, we can evaluate the model. Add the
-following code, which performs evaluation and prints the results:
+accuracy on the MNIST test set. We call the `evaluate` method, which evaluates
+the metrics we specified in `eval_metric_ops` argument in the `model_fn`.
+Add the following to `main()`:
 
 ```python
 # Evaluate the model and print results
-eval_results = mnist_classifier.evaluate(
-    x=eval_data, y=eval_labels, metrics=metrics)
+eval_input_fn = tf.estimator.inputs.numpy_input_fn(
+    x={"x": eval_data},
+    y=eval_labels,
+    num_epochs=1,
+    shuffle=False)
+eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
 print(eval_results)
 ```
 
-We pass our evaluation feature data and labels to `evaluate()` in the `x` and
-`y` arguments, respectively. The `metrics` argument takes the metrics dict we
-just defined.
+To create `eval_input_fn`, we set `num_epochs=1`, so that the model evaluates
+the metrics over one epoch of data and returns the result. We also set
+`shuffle=False` to iterate through the data sequentially.
 
 ### Run the Model
 
@@ -711,7 +693,7 @@ logic; now let's see the results. Run `cnn_mnist.py`.
 > Note: Training CNNs is quite computationally intensive. Estimated completion
 > time of `cnn_mnist.py` will vary depending on your processor, but will likely
 > be upwards of 1 hour on CPU. To train more quickly, you can decrease the
-> number of `steps` passed to `fit()`, but note that this will affect accuracy.
+> number of `steps` passed to `train()`, but note that this will affect accuracy.
 
 As the model trains, you'll see log output like the following:
 
@@ -738,7 +720,7 @@ Here, we've achieved an accuracy of 97.3% on our test data set.
 To learn more about TensorFlow Estimators and CNNs in TensorFlow, see the
 following resources:
 
-*   @{$estimators$Creating Estimators in tf.contrib.learn}. An
+*   @{$estimators$Creating Estimators in tf.estimator}. An
     introduction to the TensorFlow Estimator API, which walks through
     configuring an Estimator, writing a model function, calculating loss, and
     defining a training op.
diff --git a/tensorflow/docs_src/tutorials/linear.md b/tensorflow/docs_src/tutorials/linear.md
index de87c164ae08c2a606108cc72554ea3bfd4b75e3..4201a8021b13290e46d52df86de09f01181b972d 100644
--- a/tensorflow/docs_src/tutorials/linear.md
+++ b/tensorflow/docs_src/tutorials/linear.md
@@ -1,23 +1,22 @@
 # Large-scale Linear Models with TensorFlow
 
-The tf.contrib.learn API provides (among other things) a rich set of tools for working
-with linear models in TensorFlow. This document provides an overview of those
-tools. It explains:
+The tf.estimator API provides (among other things) a rich set of tools for
+working with linear models in TensorFlow. This document provides an overview of
+those tools. It explains:
 
    * what a linear model is.
    * why you might want to use a linear model.
-   * how tf.contrib.learn makes it easy to build linear models in TensorFlow.
-   * how you can use tf.contrib.learn to combine linear models with
+   * how tf.estimator makes it easy to build linear models in TensorFlow.
+   * how you can use tf.estimator to combine linear models with
    deep learning to get the advantages of both.
 
-Read this overview to decide whether the tf.contrib.learn linear model tools might be
-useful to you. Then do the @{$wide$Linear Models tutorial} to
+Read this overview to decide whether the tf.estimator linear model tools might
+be useful to you. Then do the @{$wide$Linear Models tutorial} to
 give it a try. This overview uses code samples from the tutorial, but the
 tutorial walks through the code in greater detail.
 
 To understand this overview it will help to have some familiarity
-with basic machine learning concepts, and also with
-@{$tflearn$tf.contrib.learn}.
+with basic machine learning concepts, and also with @{$estimator$tf.estimator}.
 
 [TOC]
 
@@ -52,21 +51,22 @@ Linear models:
    * provide an excellent starting point for learning about machine learning.
    * are widely used in industry.
 
-## How does tf.contrib.learn help you build linear models?
+## How does tf.estimator help you build linear models?
 
 You can build a linear model from scratch in TensorFlow without the help of a
-special API. But tf.contrib.learn provides some tools that make it easier to build
+special API. But tf.estimator provides some tools that make it easier to build
 effective large-scale linear models.
 
 ### Feature columns and transformations
 
 Much of the work of designing a linear model consists of transforming raw data
-into suitable input features. tf.contrib.learn uses the `FeatureColumn` abstraction to
+into suitable input features. Tensorflow uses the `FeatureColumn` abstraction to
 enable these transformations.
 
 A `FeatureColumn` represents a single feature in your data. A `FeatureColumn`
 may represent a quantity like 'height', or it may represent a category like
-'eye_color' where the value is drawn from a set of discrete possibilities like {'blue', 'brown', 'green'}.
+'eye_color' where the value is drawn from a set of discrete possibilities like
+{'blue', 'brown', 'green'}.
 
 In the case of both *continuous features* like 'height' and *categorical
 features* like 'eye_color', a single value in the data might get transformed
@@ -86,10 +86,10 @@ become [0, 1, 0] and 'green' would become [0, 0, 1]. These vectors are called
 "sparse" because they may be very long, with many zeros, when the set of
 possible values is very large (such as all English words).
 
-While you don't need to use sparse columns to use tf.contrib.learn linear models, one
-of the strengths of linear models is their ability to deal with large sparse
-vectors. Sparse features are a primary use case for the tf.contrib.learn linear model
-tools.
+While you don't need to use categorical columns to use tf.estimator linear
+models, one of the strengths of linear models is their ability to deal with
+large sparse vectors. Sparse features are a primary use case for the
+tf.estimator linear model tools.
 
 ##### Encoding sparse columns
 
@@ -97,19 +97,19 @@ tools.
 automatically, with code like this:
 
 ```python
-eye_color = tf.contrib.layers.sparse_column_with_keys(
-  column_name="eye_color", keys=["blue", "brown", "green"])
+eye_color = tf.feature_column.categorical_column_with_vocabulary_list(
+    "eye_color", vocabulary_list=["blue", "brown", "green"])
 ```
 
 where `eye_color` is the name of a column in your source data.
 
 You can also generate `FeatureColumn`s for categorical features for which you
 don't know all possible values. For this case you would use
-`sparse_column_with_hash_bucket()`, which uses a hash function to assign
+`categorical_column_with_hash_bucket()`, which uses a hash function to assign
 indices to feature values.
 
 ```python
-education = tf.contrib.layers.sparse_column_with_hash_bucket(\
+education = tf.feature_column.categorical_column_with_hash_bucket(
     "education", hash_bucket_size=1000)
 ```
 
@@ -131,12 +131,8 @@ a *feature cross*.
 The `crossed_column()` method makes it easy to set up feature crosses:
 
 ```python
-sport = tf.contrib.layers.sparse_column_with_hash_bucket(\
-    "sport", hash_bucket_size=1000)
-city = tf.contrib.layers.sparse_column_with_hash_bucket(\
-    "city", hash_bucket_size=1000)
-sport_x_city = tf.contrib.layers.crossed_column(
-    [sport, city], hash_bucket_size=int(1e4))
+sport_x_city = tf.feature_column.crossed_column(
+    ["sport", "city"], hash_bucket_size=int(1e4))
 ```
 
 #### Continuous columns
@@ -144,11 +140,11 @@ sport_x_city = tf.contrib.layers.crossed_column(
 You can specify a continuous feature like so:
 
 ```python
-age = tf.contrib.layers.real_valued_column("age")
+age = tf.feature_column.numeric_column("age")
 ```
 
 Although, as a single real number, a continuous feature can often be input
-directly into the model, tf.contrib.learn offers useful transformations for this sort
+directly into the model, Tensorflow offers useful transformations for this sort
 of column as well.
 
 ##### Bucketization
@@ -161,7 +157,7 @@ Bucketization divides the range of possible values into subranges called
 buckets:
 
 ```python
-age_buckets = tf.contrib.layers.bucketized_column(
+age_buckets = tf.feature_column.bucketized_column(
     age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
 ```
 
@@ -177,43 +173,45 @@ the data itself. You provide the data through an input function.
 The input function must return a dictionary of tensors. Each key corresponds to
 the name of a `FeatureColumn`. Each key's value is a tensor containing the
 values of that feature for all data instances. See
-@{$input_fn$Building Input Functions with tf.contrib.learn} for a
+@{$input_fn$Building Input Functions with tf.estimator} for a
 more comprehensive look at input functions, and `input_fn` in the
 [linear models tutorial code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py)
 for an example implementation of an input function.
 
-The input function is passed to the `fit()` and `evaluate()` calls that
+The input function is passed to the `train()` and `evaluate()` calls that
 initiate training and testing, as described in the next section.
 
 ### Linear estimators
 
-tf.contrib.learn's estimator classes provide a unified training and evaluation harness
+Tensorflow estimator classes provide a unified training and evaluation harness
 for regression and classification models. They take care of the details of the
 training and evaluation loops and allow the user to focus on model inputs and
 architecture.
 
 To build a linear estimator, you can use either the
-`tf.contrib.learn.LinearClassifier` estimator or the
-`tf.contrib.learn.LinearRegressor` estimator, for classification and
+`tf.estimator.LinearClassifier` estimator or the
+`tf.estimator.LinearRegressor` estimator, for classification and
 regression respectively.
 
-As with all tf.contrib.learn estimators, to run the estimator you just:
+As with all tensorflow estimators, to run the estimator you just:
 
    1. Instantiate the estimator class. For the two linear estimator classes,
    you pass a list of `FeatureColumn`s to the constructor.
-   2. Call the estimator's `fit()` method to train it.
+   2. Call the estimator's `train()` method to train it.
    3. Call the estimator's `evaluate()` method to see how it does.
 
 For example:
 
 ```python
-e = tf.contrib.learn.LinearClassifier(feature_columns=[
-  native_country, education, occupation, workclass, marital_status,
-  race, age_buckets, education_x_occupation, age_buckets_x_race_x_occupation],
-  model_dir=YOUR_MODEL_DIRECTORY)
-e.fit(input_fn=input_fn_train, steps=200)
+e = tf.estimator.LinearClassifier(
+    feature_columns=[
+        native_country, education, occupation, workclass, marital_status,
+        race, age_buckets, education_x_occupation,
+        age_buckets_x_race_x_occupation],
+    model_dir=YOUR_MODEL_DIRECTORY)
+e.train(input_fn=input_fn_train, steps=200)
 # Evaluate for one step (one pass through the test data).
-results = e.evaluate(input_fn=input_fn_test, steps=1)
+results = e.evaluate(input_fn=input_fn_test)
 
 # Print the stats for the evaluation.
 for key in sorted(results):
@@ -222,14 +220,14 @@ for key in sorted(results):
 
 ### Wide and deep learning
 
-The tf.contrib.learn API also provides an estimator class that lets you jointly train
-a linear model and a deep neural network. This novel approach combines the
+The tf.estimator API also provides an estimator class that lets you jointly
+train a linear model and a deep neural network. This novel approach combines the
 ability of linear models to "memorize" key features with the generalization
-ability of neural nets. Use `tf.contrib.learn.DNNLinearCombinedClassifier` to
+ability of neural nets. Use `tf.estimator.DNNLinearCombinedClassifier` to
 create this sort of "wide and deep" model:
 
 ```python
-e = tf.contrib.learn.DNNLinearCombinedClassifier(
+e = tf.estimator.DNNLinearCombinedClassifier(
     model_dir=YOUR_MODEL_DIR,
     linear_feature_columns=wide_columns,
     dnn_feature_columns=deep_columns,
diff --git a/tensorflow/docs_src/tutorials/recurrent.md b/tensorflow/docs_src/tutorials/recurrent.md
index 708a9620dd7ec2b71905d932dffa2af74cfceb96..346b6be06c9d9ad7721bf9e38e2a1fa060e39f69 100644
--- a/tensorflow/docs_src/tutorials/recurrent.md
+++ b/tensorflow/docs_src/tutorials/recurrent.md
@@ -75,7 +75,9 @@ The basic pseudocode is as follows:
 words_in_dataset = tf.placeholder(tf.float32, [num_batches, batch_size, num_features])
 lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
 # Initial state of the LSTM memory.
-state = tf.zeros([batch_size, lstm.state_size])
+hidden_state = tf.zeros([batch_size, lstm.state_size])
+current_state = tf.zeros([batch_size, lstm.state_size])
+state = hidden_state, current_state
 probabilities = []
 loss = 0.0
 for current_batch_of_words in words_in_dataset:
diff --git a/tensorflow/docs_src/tutorials/seq2seq.md b/tensorflow/docs_src/tutorials/seq2seq.md
index 6ffe3e8b037a8e21b38cded7e3b0d617b4ddb212..84c8a9c9f3a6519667a87676fb899fa0ab62e9e5 100644
--- a/tensorflow/docs_src/tutorials/seq2seq.md
+++ b/tensorflow/docs_src/tutorials/seq2seq.md
@@ -8,7 +8,10 @@ some input and generate a meaningful response? For example, could we train
 a neural network to translate from English to French? It turns out that
 the answer is *yes*.
 
-This tutorial will show you how to build and train such a system end-to-end. Clone the [TensorFlow main repo](https://github.com/tensorflow/tensorflow) and the [TensorFlow models repo](https://github.com/tensorflow/models) from GitHub. You can then start by running the translate program:
+This tutorial will show you how to build and train such a system end-to-end.
+Clone the [TensorFlow main repo](https://github.com/tensorflow/tensorflow) and
+the [TensorFlow models repo](https://github.com/tensorflow/models) from GitHub.
+You can then start by running the translate program:
 
 ```
 cd models/tutorials/rnn/translate
@@ -25,7 +28,7 @@ This tutorial references the following files.
 
 File | What's in it?
 --- | ---
-`tensorflow/tensorflow/python/ops/seq2seq.py` | Library for building sequence-to-sequence models.
+`tensorflow/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py` | Library for building sequence-to-sequence models.
 `models/tutorials/rnn/translate/seq2seq_model.py` | Neural translation sequence-to-sequence model.
 `models/tutorials/rnn/translate/data_utils.py` | Helper functions for preparing translation data.
 `models/tutorials/rnn/translate/translate.py` | Binary that trains and runs the translation model.
@@ -137,7 +140,7 @@ When training models with large output vocabularies, i.e., when
 tensors. Instead, it is better to return smaller output tensors, which will
 later be projected onto a large output tensor using `output_projection`.
 This allows to use our seq2seq models with a sampled softmax loss, as described
-in [Jean et. al., 2014](http://arxiv.org/abs/1412.2007)
+in [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
 ([pdf](http://arxiv.org/pdf/1412.2007.pdf)).
 
 In addition to `basic_rnn_seq2seq` and `embedding_rnn_seq2seq` there are a few
@@ -148,9 +151,9 @@ have similar interfaces, so we will not describe them in detail. We will use
 ## Neural translation model
 
 While the core of the sequence-to-sequence model is constructed by
-the functions in `tensorflow/tensorflow/python/ops/seq2seq.py`, there are still a few tricks
-that are worth mentioning that are used in our translation model in
-`models/tutorials/rnn/translate/seq2seq_model.py`.
+the functions in `tensorflow/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py`,
+there are still a few tricks that are worth mentioning that are used in our
+translation model in `models/tutorials/rnn/translate/seq2seq_model.py`.
 
 ### Sampled softmax and output projection
 
diff --git a/tensorflow/docs_src/tutorials/wide.md b/tensorflow/docs_src/tutorials/wide.md
index b87efedb8005d3ee75479b284b4c4ad809c056d3..fdf43955eafbdeb97b94349a55e0f20052af16ab 100644
--- a/tensorflow/docs_src/tutorials/wide.md
+++ b/tensorflow/docs_src/tutorials/wide.md
@@ -1,12 +1,13 @@
 # TensorFlow Linear Model Tutorial
 
-In this tutorial, we will use the tf.contrib.learn API in TensorFlow to solve a binary
-classification problem: Given census data about a person such as age, gender,
-education and occupation (the features), we will try to predict whether or not
-the person earns more than 50,000 dollars a year (the target label). We will
-train a **logistic regression** model, and given an individual's information our
-model will output a number between 0 and 1, which can be interpreted as the
-probability that the individual has an annual income of over 50,000 dollars.
+In this tutorial, we will use the tf.estimator API in TensorFlow to solve a
+binary classification problem: Given census data about a person such as age,
+gender, education and occupation (the features), we will try to predict whether
+or not the person earns more than 50,000 dollars a year (the target label). We
+will train a **logistic regression** model, and given an individual's
+information our model will output a number between 0 and 1, which can be
+interpreted as the probability that the individual has an annual income of over
+50,000 dollars.
 
 ## Setup
 
@@ -16,7 +17,7 @@ To try the code for this tutorial:
 
 2.  Download [the tutorial code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py).
 
-3.  Install the pandas data analysis library. tf.contrib.learn doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
+3.  Install the pandas data analysis library. tf.estimator doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
 
     a. Get `pip`:
 
@@ -65,12 +66,13 @@ Once the CSV files are downloaded, let's read them into
 
 ```python
 import pandas as pd
-COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
-           "marital_status", "occupation", "relationship", "race", "gender",
-           "capital_gain", "capital_loss", "hours_per_week", "native_country",
-           "income_bracket"]
-df_train = pd.read_csv(train_file, names=COLUMNS, skipinitialspace=True)
-df_test = pd.read_csv(test_file, names=COLUMNS, skipinitialspace=True, skiprows=1)
+CSV_COLUMNS = [
+    "age", "workclass", "fnlwgt", "education", "education_num",
+    "marital_status", "occupation", "relationship", "race", "gender",
+    "capital_gain", "capital_loss", "hours_per_week", "native_country",
+    "income_bracket"]
+df_train = pd.read_csv(train_file.name, names=CSV_COLUMNS, skipinitialspace=True)
+df_test = pd.read_csv(test_file.name, names=CSV_COLUMNS, skipinitialspace=True, skiprows=1)
 ```
 
 Since the task is a binary classification problem, we'll construct a label
@@ -78,9 +80,8 @@ column named "label" whose value is 1 if the income is over 50K, and 0
 otherwise.
 
 ```python
-LABEL_COLUMN = "label"
-df_train[LABEL_COLUMN] = (df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
-df_test[LABEL_COLUMN] = (df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
+train_labels = (df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
+test_labels = (df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
 ```
 
 Next, let's take a look at the dataframe and see which columns we can use to
@@ -95,12 +96,6 @@ and continuous columns:
     a continuous range. For example, the capital gain of a person (e.g. $14,084)
     is a continuous column.
 
-```python
-CATEGORICAL_COLUMNS = ["workclass", "education", "marital_status", "occupation",
-                       "relationship", "race", "gender", "native_country"]
-CONTINUOUS_COLUMNS = ["age", "education_num", "capital_gain", "capital_loss", "hours_per_week"]
-```
-
 Here's a list of columns available in the Census Income dataset:
 
 | Column Name    | Type        | Description                       | {.sortable}
@@ -111,8 +106,8 @@ Here's a list of columns available in the Census Income dataset:
 :                :             : military, private, etc.).         :
 | fnlwgt         | Continuous  | The number of people the census   |
 :                :             : takers believe that observation   :
-:                :             : represents (sample weight). This  :
-:                :             : variable will not be used.        :
+:                :             : represents (sample weight). Final :
+:                :             : weight will not be used.          :
 | education      | Categorical | The highest level of education    |
 :                :             : achieved for that individual.     :
 | education_num  | Continuous  | The highest level of education in |
@@ -136,13 +131,11 @@ Here's a list of columns available in the Census Income dataset:
 
 ## Converting Data into Tensors
 
-When building a tf.contrib.learn model, the input data is specified by means of an Input
-Builder function. This builder function will not be called until it is later
-passed to tf.contrib.learn methods such as `fit` and `evaluate`. The purpose of this
-function is to construct the input data, which is represented in the form of
-@{tf.Tensor}s
-or
-@{tf.SparseTensor}s.
+When building a tf.estimator model, the input data is specified by means of an
+Input Builder function. This builder function will not be called until it is
+later passed to tf.estimator.Estimator methods such as `train` and `evaluate`.
+The purpose of this function is to construct the input data, which is
+represented in the form of @{tf.Tensor}s or @{tf.SparseTensor}s.
 In more detail, the Input Builder function returns the following as a pair:
 
 1.  `feature_cols`: A dict from feature column names to `Tensors` or
@@ -150,52 +143,43 @@ In more detail, the Input Builder function returns the following as a pair:
 2.  `label`: A `Tensor` containing the label column.
 
 The keys of the `feature_cols` will be used to construct columns in the
-next section. Because we want to call the `fit` and `evaluate` methods with
-different data, we define two different input builder functions,
-`train_input_fn` and `test_input_fn` which are identical except that they pass
-different data to `input_fn`. Note that `input_fn` will be called while
+next section. Because we want to call the `train` and `evaluate` methods with
+different data, we define a method that returns an input function based on the
+given data. Note that the returned input function will be called while
 constructing the TensorFlow graph, not while running the graph. What it is
 returning is a representation of the input data as the fundamental unit of
 TensorFlow computations, a `Tensor` (or `SparseTensor`).
 
-Our model represents the input data as *constant* tensors, meaning that the
-tensor represents a constant value, in this case the values of a particular
-column of `df_train` or `df_test`. This is the simplest way to pass data into
-TensorFlow. Another more advanced way to represent input data would be to
-construct an @{$python/io_ops#inputs-and-readers$Inputs And Readers}
-that represents a file or other data source, and iterates through the file as
-TensorFlow runs the graph. Each continuous column in the train or test dataframe
+We use the `tf.estimator.inputs.pandas_input_fn` method to create an input
+function from pandas dataframes.
+Each continuous column in the train or test dataframe
 will be converted into a `Tensor`, which in general is a good format to
 represent dense data. For categorical data, we must represent the data as a
 `SparseTensor`. This data format is good for representing sparse data.
+Another more advanced way to represent input data would be to
+construct an @{$python/io_ops#inputs-and-readers$Inputs And Readers}
+that represents a file or other data source, and iterates through the file as
+TensorFlow runs the graph.
 
 ```python
-import tensorflow as tf
-
-def input_fn(df):
-  # Creates a dictionary mapping from each continuous feature column name (k) to
-  # the values of that column stored in a constant Tensor.
-  continuous_cols = {k: tf.constant(df[k].values)
-                     for k in CONTINUOUS_COLUMNS}
-  # Creates a dictionary mapping from each categorical feature column name (k)
-  # to the values of that column stored in a tf.SparseTensor.
-  categorical_cols = {k: tf.SparseTensor(
-      indices=[[i, 0] for i in range(df[k].size)],
-      values=df[k].values,
-      dense_shape=[df[k].size, 1])
-                      for k in CATEGORICAL_COLUMNS}
-  # Merges the two dictionaries into one.
-  feature_cols = dict(continuous_cols.items() + categorical_cols.items())
-  # Converts the label column into a constant Tensor.
-  label = tf.constant(df[LABEL_COLUMN].values)
-  # Returns the feature columns and the label.
-  return feature_cols, label
-
-def train_input_fn():
-  return input_fn(df_train)
-
-def eval_input_fn():
-  return input_fn(df_test)
+def input_fn(data_file, num_epochs, shuffle):
+  """Input builder function."""
+  df_data = pd.read_csv(
+      tf.gfile.Open(data_file),
+      names=CSV_COLUMNS,
+      skipinitialspace=True,
+      engine="python",
+      skiprows=1)
+  # remove NaN elements
+  df_data = df_data.dropna(how="any", axis=0)
+  labels = df_data["income_bracket"].apply(lambda x: ">50K" in x).astype(int)
+  return tf.estimator.inputs.pandas_input_fn(
+      x=df_data,
+      y=labels,
+      batch_size=100,
+      num_epochs=num_epochs,
+      shuffle=shuffle,
+      num_threads=5)
 ```
 
 ## Selecting and Engineering Features for the Model
@@ -211,37 +195,38 @@ to predict the target label.
 ### Base Categorical Feature Columns
 
 To define a feature column for a categorical feature, we can create a
-`SparseColumn` using the tf.contrib.learn API. If you know the set of all possible
-feature values of a column and there are only a few of them, you can use
-`sparse_column_with_keys`. Each key in the list will get assigned an
-auto-incremental ID starting from 0. For example, for the `gender` column we can
-assign the feature string "Female" to an integer ID of 0 and "Male" to 1 by
-doing:
+`CategoricalColumn` using the tf.feature_column API. If you know the set of all
+possible feature values of a column and there are only a few of them, you can
+use `categorical_column_with_vocabulary_list`. Each key in the list will get
+assigned an auto-incremental ID starting from 0. For example, for the `gender`
+column we can assign the feature string "Female" to an integer ID of 0 and
+"Male" to 1 by doing:
 
 ```python
-gender = tf.contrib.layers.sparse_column_with_keys(
-  column_name="gender", keys=["Female", "Male"])
+gender = tf.feature_column.categorical_column_with_vocabulary_list(
+    "gender", ["Female", "Male"])
 ```
 
 What if we don't know the set of possible values in advance? Not a problem. We
-can use `sparse_column_with_hash_bucket` instead:
+can use `categorical_column_with_hash_bucket` instead:
 
 ```python
-education = tf.contrib.layers.sparse_column_with_hash_bucket("education", hash_bucket_size=1000)
+occupation = tf.feature_column.categorical_column_with_hash_bucket(
+    "occupation", hash_bucket_size=1000)
 ```
 
-What will happen is that each possible value in the feature column `education`
+What will happen is that each possible value in the feature column `occupation`
 will be hashed to an integer ID as we encounter them in training. See an example
 illustration below:
 
 ID  | Feature
 --- | -------------
 ... |
-9   | `"Bachelors"`
+9   | `"Machine-op-inspct"`
 ... |
-103 | `"Doctorate"`
+103 | `"Farming-fishing"`
 ... |
-375 | `"Masters"`
+375 | `"Protective-serv"`
 ... |
 
 No matter which way we choose to define a `SparseColumn`, each feature string
@@ -255,25 +240,43 @@ learned through the model training process we'll go through later.
 We'll do the similar trick to define the other categorical features:
 
 ```python
-race = tf.contrib.layers.sparse_column_with_hash_bucket("race", hash_bucket_size=100)
-marital_status = tf.contrib.layers.sparse_column_with_hash_bucket("marital_status", hash_bucket_size=100)
-relationship = tf.contrib.layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100)
-workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)
-occupation = tf.contrib.layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size=1000)
-native_country = tf.contrib.layers.sparse_column_with_hash_bucket("native_country", hash_bucket_size=1000)
+education = tf.feature_column.categorical_column_with_vocabulary_list(
+    "education", [
+        "Bachelors", "HS-grad", "11th", "Masters", "9th",
+        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
+        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
+        "Preschool", "12th"
+    ])
+marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
+    "marital_status", [
+        "Married-civ-spouse", "Divorced", "Married-spouse-absent",
+        "Never-married", "Separated", "Married-AF-spouse", "Widowed"
+    ])
+relationship = tf.feature_column.categorical_column_with_vocabulary_list(
+    "relationship", [
+        "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
+        "Other-relative"
+    ])
+workclass = tf.feature_column.categorical_column_with_vocabulary_list(
+    "workclass", [
+        "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
+        "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
+    ])
+native_country = tf.feature_column.categorical_column_with_hash_bucket(
+    "native_country", hash_bucket_size=1000)
 ```
 
 ### Base Continuous Feature Columns
 
-Similarly, we can define a `RealValuedColumn` for each continuous feature column
+Similarly, we can define a `NumericColumn` for each continuous feature column
 that we want to use in the model:
 
 ```python
-age = tf.contrib.layers.real_valued_column("age")
-education_num = tf.contrib.layers.real_valued_column("education_num")
-capital_gain = tf.contrib.layers.real_valued_column("capital_gain")
-capital_loss = tf.contrib.layers.real_valued_column("capital_loss")
-hours_per_week = tf.contrib.layers.real_valued_column("hours_per_week")
+age = tf.feature_column.numeric_column("age")
+education_num = tf.feature_column.numeric_column("education_num")
+capital_gain = tf.feature_column.numeric_column("capital_gain")
+capital_loss = tf.feature_column.numeric_column("capital_loss")
+hours_per_week = tf.feature_column.numeric_column("hours_per_week")
 ```
 
 ### Making Continuous Features Categorical through Bucketization
@@ -297,7 +300,8 @@ ID (as a categorical feature) depending on which bucket that value falls into.
 So, we can define a `bucketized_column` over `age` as:
 
 ```python
-age_buckets = tf.contrib.layers.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
+age_buckets = tf.feature_column.bucketized_column(
+    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
 ```
 
 where the `boundaries` is a list of bucket boundaries. In this case, there are
@@ -317,7 +321,8 @@ differences between different feature combinations, we can add **crossed feature
 columns** to the model.
 
 ```python
-education_x_occupation = tf.contrib.layers.crossed_column([education, occupation], hash_bucket_size=int(1e4))
+education_x_occupation = tf.feature_column.crossed_column(
+    ["education", "occupation"], hash_bucket_size=1000)
 ```
 
 We can also create a `CrossedColumn` over more than two columns. Each
@@ -326,8 +331,8 @@ constituent column can be either a base feature column that is categorical
 or even another `CrossColumn`. Here's an example:
 
 ```python
-age_buckets_x_education_x_occupation = tf.contrib.layers.crossed_column(
-  [age_buckets, education, occupation], hash_bucket_size=int(1e6))
+age_buckets_x_education_x_occupation = tf.feature_column.crossed_column(
+    [age_buckets, "education", "occupation"], hash_bucket_size=1000)
 ```
 
 ## Defining The Logistic Regression Model
@@ -337,8 +342,8 @@ ready to put them all together and build a Logistic Regression model. In the
 previous section we've seen several types of base and derived feature columns,
 including:
 
-*   `SparseColumn`
-*   `RealValuedColumn`
+*   `CategoricalColumn`
+*   `NumericColumn`
 *   `BucketizedColumn`
 *   `CrossedColumn`
 
@@ -346,11 +351,22 @@ All of these are subclasses of the abstract `FeatureColumn` class, and can be
 added to the `feature_columns` field of a model:
 
 ```python
+base_columns = [
+    gender, native_country, education, occupation, workclass, relationship,
+    age_buckets,
+]
+crossed_columns = [
+    tf.feature_column.crossed_column(
+        ["education", "occupation"], hash_bucket_size=1000),
+    tf.feature_column.crossed_column(
+        [age_buckets, "education", "occupation"], hash_bucket_size=1000),
+    tf.feature_column.crossed_column(
+        ["native_country", "occupation"], hash_bucket_size=1000)
+]
+
 model_dir = tempfile.mkdtemp()
-m = tf.contrib.learn.LinearClassifier(feature_columns=[
-  gender, native_country, education, occupation, workclass, marital_status, race,
-  age_buckets, education_x_occupation, age_buckets_x_education_x_occupation],
-  model_dir=model_dir)
+m = tf.estimator.LinearClassifier(
+    model_dir=model_dir, feature_columns=base_columns + crossed_columns)
 ```
 
 The model also automatically learns a bias term, which controls the prediction
@@ -361,19 +377,26 @@ in `model_dir`.
 ## Training and Evaluating Our Model
 
 After adding all the features to the model, now let's look at how to actually
-train the model. Training a model is just a one-liner using the tf.contrib.learn API:
+train the model. Training a model is just a one-liner using the tf.estimator
+API:
 
 ```python
-m.fit(input_fn=train_input_fn, steps=200)
+# set num_epochs to None to get infinite stream of data.
+m.train(
+    input_fn=input_fn(train_file_name, num_epochs=None, shuffle=True),
+    steps=train_steps)
 ```
 
 After the model is trained, we can evaluate how good our model is at predicting
 the labels of the holdout data:
 
 ```python
-results = m.evaluate(input_fn=eval_input_fn, steps=1)
+results = m.evaluate(
+    input_fn=input_fn(test_file_name, num_epochs=1, shuffle=False),
+    steps=None)
+print("model directory = %s" % model_dir)
 for key in sorted(results):
-    print("%s: %s" % (key, results[key]))
+  print("%s: %s" % (key, results[key]))
 ```
 
 The first line of the output should be something like `accuracy: 0.83557522`,
@@ -398,14 +421,13 @@ In the Linear Model library, you can add L1 and L2 regularizations to the model
 as:
 
 ```
-m = tf.contrib.learn.LinearClassifier(feature_columns=[
-  gender, native_country, education, occupation, workclass, marital_status, race,
-  age_buckets, education_x_occupation, age_buckets_x_education_x_occupation],
-  optimizer=tf.train.FtrlOptimizer(
-    learning_rate=0.1,
-    l1_regularization_strength=1.0,
-    l2_regularization_strength=1.0),
-  model_dir=model_dir)
+m = tf.estimator.LinearClassifier(
+    model_dir=model_dir, feature_columns=base_columns + crossed_columns,
+    optimizer=tf.train.FtrlOptimizer(
+      learning_rate=0.1,
+      l1_regularization_strength=1.0,
+      l2_regularization_strength=1.0),
+    model_dir=model_dir)
 ```
 
 One important difference between L1 and L2 regularization is that L1
@@ -465,6 +487,7 @@ value would be high.
 
 ## Learn Deeper
 
-If you're interested in learning more, check out our @{$wide_and_deep$Wide & Deep Learning Tutorial} where we'll show you how to combine
-the strengths of linear models and deep neural networks by jointly training them
-using the tf.contrib.learn API.
+If you're interested in learning more, check out our
+@{$wide_and_deep$Wide & Deep Learning Tutorial} where we'll show you how to
+combine the strengths of linear models and deep neural networks by jointly
+training them using the tf.estimator API.
diff --git a/tensorflow/docs_src/tutorials/wide_and_deep.md b/tensorflow/docs_src/tutorials/wide_and_deep.md
index 0978005d6c3f9147597509383f04f39113d057f9..e6344405d58305837c09f99385bf1e74400e0ddd 100644
--- a/tensorflow/docs_src/tutorials/wide_and_deep.md
+++ b/tensorflow/docs_src/tutorials/wide_and_deep.md
@@ -9,13 +9,13 @@ great for training deep neural networks too, and you might be thinking which one
 you should choose—Well, why not both? Would it be possible to combine the
 strengths of both in one model?
 
-In this tutorial, we'll introduce how to use the tf.contrib.learn API to jointly train a
-wide linear model and a deep feed-forward neural network. This approach combines
-the strengths of memorization and generalization. It's useful for generic
-large-scale regression and classification problems with sparse input features
-(e.g., categorical features with a large number of possible feature values). If
-you're interested in learning more about how Wide & Deep Learning works, please
-check out our [research paper](http://arxiv.org/abs/1606.07792).
+In this tutorial, we'll introduce how to use the tf.estimator API to jointly
+train a wide linear model and a deep feed-forward neural network. This approach
+combines the strengths of memorization and generalization. It's useful for
+generic large-scale regression and classification problems with sparse input
+features (e.g., categorical features with a large number of possible feature
+values). If you're interested in learning more about how Wide & Deep Learning
+works, please check out our [research paper](http://arxiv.org/abs/1606.07792).
 
 ![Wide & Deep Spectrum of Models](https://www.tensorflow.org/images/wide_n_deep.svg "Wide & Deep")
 
@@ -23,7 +23,7 @@ The figure above shows a comparison of a wide model (logistic regression with
 sparse features and transformations), a deep model (feed-forward neural network
 with an embedding layer and several hidden layers), and a Wide & Deep model
 (joint training of both). At a high level, there are only 3 steps to configure a
-wide, deep, or Wide & Deep model using the tf.contrib.learn API:
+wide, deep, or Wide & Deep model using the tf.estimator API:
 
 1.  Select features for the wide part: Choose the sparse base columns and
     crossed columns you want to use.
@@ -42,7 +42,7 @@ To try the code for this tutorial:
 
 2.  Download [the tutorial code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py).
 
-3.  Install the pandas data analysis library. tf.contrib.learn doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
+3.  Install the pandas data analysis library. tf.estimator doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
 
     a. Get `pip`:
 
@@ -78,23 +78,47 @@ part and the deep part of the model.
 ```python
 import tensorflow as tf
 
-# Categorical base columns.
-gender = tf.contrib.layers.sparse_column_with_keys(column_name="gender", keys=["Female", "Male"])
-race = tf.contrib.layers.sparse_column_with_keys(column_name="race", keys=[
-  "Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"])
-education = tf.contrib.layers.sparse_column_with_hash_bucket("education", hash_bucket_size=1000)
-relationship = tf.contrib.layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100)
-workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)
-occupation = tf.contrib.layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size=1000)
-native_country = tf.contrib.layers.sparse_column_with_hash_bucket("native_country", hash_bucket_size=1000)
+gender = tf.feature_column.categorical_column_with_vocabulary_list(
+    "gender", ["Female", "Male"])
+education = tf.feature_column.categorical_column_with_vocabulary_list(
+    "education", [
+        "Bachelors", "HS-grad", "11th", "Masters", "9th",
+        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
+        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
+        "Preschool", "12th"
+    ])
+marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
+    "marital_status", [
+        "Married-civ-spouse", "Divorced", "Married-spouse-absent",
+        "Never-married", "Separated", "Married-AF-spouse", "Widowed"
+    ])
+relationship = tf.feature_column.categorical_column_with_vocabulary_list(
+    "relationship", [
+        "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
+        "Other-relative"
+    ])
+workclass = tf.feature_column.categorical_column_with_vocabulary_list(
+    "workclass", [
+        "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
+        "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
+    ])
+
+# To show an example of hashing:
+occupation = tf.feature_column.categorical_column_with_hash_bucket(
+    "occupation", hash_bucket_size=1000)
+native_country = tf.feature_column.categorical_column_with_hash_bucket(
+    "native_country", hash_bucket_size=1000)
 
 # Continuous base columns.
-age = tf.contrib.layers.real_valued_column("age")
-age_buckets = tf.contrib.layers.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
-education_num = tf.contrib.layers.real_valued_column("education_num")
-capital_gain = tf.contrib.layers.real_valued_column("capital_gain")
-capital_loss = tf.contrib.layers.real_valued_column("capital_loss")
-hours_per_week = tf.contrib.layers.real_valued_column("hours_per_week")
+age = tf.feature_column.numeric_column("age")
+education_num = tf.feature_column.numeric_column("education_num")
+capital_gain = tf.feature_column.numeric_column("capital_gain")
+capital_loss = tf.feature_column.numeric_column("capital_loss")
+hours_per_week = tf.feature_column.numeric_column("hours_per_week")
+
+# Transformations.
+age_buckets = tf.feature_column.bucketized_column(
+    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
 ```
 
 ## The Wide Model: Linear Model with Crossed Feature Columns
@@ -103,11 +127,19 @@ The wide model is a linear model with a wide set of sparse and crossed feature
 columns:
 
 ```python
-wide_columns = [
-  gender, native_country, education, occupation, workclass, relationship, age_buckets,
-  tf.contrib.layers.crossed_column([education, occupation], hash_bucket_size=int(1e4)),
-  tf.contrib.layers.crossed_column([native_country, occupation], hash_bucket_size=int(1e4)),
-  tf.contrib.layers.crossed_column([age_buckets, education, occupation], hash_bucket_size=int(1e6))]
+base_columns = [
+    gender, native_country, education, occupation, workclass, relationship,
+    age_buckets,
+]
+
+crossed_columns = [
+    tf.feature_column.crossed_column(
+        ["education", "occupation"], hash_bucket_size=1000),
+    tf.feature_column.crossed_column(
+        [age_buckets, "education", "occupation"], hash_bucket_size=1000),
+    tf.feature_column.crossed_column(
+        ["native_country", "occupation"], hash_bucket_size=1000)
+]
 ```
 
 Wide models with crossed feature columns can memorize sparse interactions
@@ -130,27 +162,42 @@ the TensorFlow tutorial on
 [Vector Representations of Words](https://www.tensorflow.org/versions/r0.9/tutorials/word2vec/index.html),
 or [Word Embedding](https://en.wikipedia.org/wiki/Word_embedding) on Wikipedia.
 
+Another way to represent categorical columns to feed into a neural network is
+via a multi-hot representation. This is often appropriate for categorical
+columns with only a few possible values. E.g. for the gender column, `"Male"`
+can be represented as `[1, 0]` and `"Female"` as `[0, 1]`. This is a fixed
+representation, whereas embeddings are more flexible and calculated at training
+time.
+
 We'll configure the embeddings for the categorical columns using
-`embedding_column`, and concatenate them with the continuous columns:
+`embedding_column`, and concatenate them with the continuous columns.
+We also use `indicator_column` to create multi-hot representation of some
+categorical columns.
 
 ```python
 deep_columns = [
-  tf.contrib.layers.embedding_column(workclass, dimension=8),
-  tf.contrib.layers.embedding_column(education, dimension=8),
-  tf.contrib.layers.embedding_column(gender, dimension=8),
-  tf.contrib.layers.embedding_column(relationship, dimension=8),
-  tf.contrib.layers.embedding_column(native_country, dimension=8),
-  tf.contrib.layers.embedding_column(occupation, dimension=8),
-  age, education_num, capital_gain, capital_loss, hours_per_week]
+    tf.feature_column.indicator_column(workclass),
+    tf.feature_column.indicator_column(education),
+    tf.feature_column.indicator_column(gender),
+    tf.feature_column.indicator_column(relationship),
+    # To show an example of embedding
+    tf.feature_column.embedding_column(native_country, dimension=8),
+    tf.feature_column.embedding_column(occupation, dimension=8),
+    age,
+    education_num,
+    capital_gain,
+    capital_loss,
+    hours_per_week,
+]
 ```
 
 The higher the `dimension` of the embedding is, the more degrees of freedom the
 model will have to learn the representations of the features. For simplicity, we
 set the dimension to 8 for all feature columns here. Empirically, a more
 informed decision for the number of dimensions is to start with a value on the
-order of \\(\log_2(n)\\) or \\(k\sqrt[4]n\\), where \\(n\\) is the number of unique
-features in a feature column and \\(k\\) is a small constant (usually smaller than
-10).
+order of \\(\log_2(n)\\) or \\(k\sqrt[4]n\\), where \\(n\\) is the number of
+unique features in a feature column and \\(k\\) is a small constant (usually
+smaller than 10).
 
 Through dense embeddings, deep models can generalize better and make predictions
 on feature pairs that were previously unseen in the training data. However, it
@@ -176,9 +223,9 @@ handled for you under the hood, so you simply need to create a
 ```python
 import tempfile
 model_dir = tempfile.mkdtemp()
-m = tf.contrib.learn.DNNLinearCombinedClassifier(
+m = tf.estimator.DNNLinearCombinedClassifier(
     model_dir=model_dir,
-    linear_feature_columns=wide_columns,
+    linear_feature_columns=crossed_columns,
     dnn_feature_columns=deep_columns,
     dnn_hidden_units=[100, 50])
 ```
@@ -194,62 +241,73 @@ import pandas as pd
 import urllib
 
 # Define the column names for the data sets.
-COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
-  "marital_status", "occupation", "relationship", "race", "gender",
-  "capital_gain", "capital_loss", "hours_per_week", "native_country", "income_bracket"]
-LABEL_COLUMN = 'label'
-CATEGORICAL_COLUMNS = ["workclass", "education", "marital_status", "occupation",
-                       "relationship", "race", "gender", "native_country"]
-CONTINUOUS_COLUMNS = ["age", "education_num", "capital_gain", "capital_loss",
-                      "hours_per_week"]
-
-# Download the training and test data to temporary files.
-# Alternatively, you can download them yourself and change train_file and
-# test_file to your own paths.
-train_file = tempfile.NamedTemporaryFile()
-test_file = tempfile.NamedTemporaryFile()
-urllib.urlretrieve("http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data", train_file.name)
-urllib.urlretrieve("http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test", test_file.name)
-
-# Read the training and test data sets into Pandas dataframe.
-df_train = pd.read_csv(train_file, names=COLUMNS, skipinitialspace=True)
-df_test = pd.read_csv(test_file, names=COLUMNS, skipinitialspace=True, skiprows=1)
-df_train[LABEL_COLUMN] = (df_train['income_bracket'].apply(lambda x: '>50K' in x)).astype(int)
-df_test[LABEL_COLUMN] = (df_test['income_bracket'].apply(lambda x: '>50K' in x)).astype(int)
-
-def input_fn(df):
-  # Creates a dictionary mapping from each continuous feature column name (k) to
-  # the values of that column stored in a constant Tensor.
-  continuous_cols = {k: tf.constant(df[k].values)
-                     for k in CONTINUOUS_COLUMNS}
-  # Creates a dictionary mapping from each categorical feature column name (k)
-  # to the values of that column stored in a tf.SparseTensor.
-  categorical_cols = {k: tf.SparseTensor(
-      indices=[[i, 0] for i in range(df[k].size)],
-      values=df[k].values,
-      dense_shape=[df[k].size, 1])
-                      for k in CATEGORICAL_COLUMNS}
-  # Merges the two dictionaries into one.
-  feature_cols = dict(continuous_cols.items() + categorical_cols.items())
-  # Converts the label column into a constant Tensor.
-  label = tf.constant(df[LABEL_COLUMN].values)
-  # Returns the feature columns and the label.
-  return feature_cols, label
-
-def train_input_fn():
-  return input_fn(df_train)
-
-def eval_input_fn():
-  return input_fn(df_test)
+CSV_COLUMNS = [
+    "age", "workclass", "fnlwgt", "education", "education_num",
+    "marital_status", "occupation", "relationship", "race", "gender",
+    "capital_gain", "capital_loss", "hours_per_week", "native_country",
+    "income_bracket"
+]
+
+def maybe_download(train_data, test_data):
+  """Maybe downloads training data and returns train and test file names."""
+  if train_data:
+    train_file_name = train_data
+  else:
+    train_file = tempfile.NamedTemporaryFile(delete=False)
+    urllib.request.urlretrieve(
+        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
+        train_file.name)  # pylint: disable=line-too-long
+    train_file_name = train_file.name
+    train_file.close()
+    print("Training data is downloaded to %s" % train_file_name)
+
+  if test_data:
+    test_file_name = test_data
+  else:
+    test_file = tempfile.NamedTemporaryFile(delete=False)
+    urllib.request.urlretrieve(
+        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
+        test_file.name)  # pylint: disable=line-too-long
+    test_file_name = test_file.name
+    test_file.close()
+    print("Test data is downloaded to %s"% test_file_name)
+
+  return train_file_name, test_file_name
+
+def input_fn(data_file, num_epochs, shuffle):
+  """Input builder function."""
+  df_data = pd.read_csv(
+      tf.gfile.Open(data_file),
+      names=CSV_COLUMNS,
+      skipinitialspace=True,
+      engine="python",
+      skiprows=1)
+  # remove NaN elements
+  df_data = df_data.dropna(how="any", axis=0)
+  labels = df_data["income_bracket"].apply(lambda x: ">50K" in x).astype(int)
+  return tf.estimator.inputs.pandas_input_fn(
+      x=df_data,
+      y=labels,
+      batch_size=100,
+      num_epochs=num_epochs,
+      shuffle=shuffle,
+      num_threads=5)
 ```
 
 After reading in the data, you can train and evaluate the model:
 
 ```python
-m.fit(input_fn=train_input_fn, steps=200)
-results = m.evaluate(input_fn=eval_input_fn, steps=1)
+# set num_epochs to None to get infinite stream of data.
+m.train(
+    input_fn=input_fn(train_file_name, num_epochs=None, shuffle=True),
+    steps=train_steps)
+# set steps to None to run evaluation until all data consumed.
+results = m.evaluate(
+    input_fn=input_fn(test_file_name, num_epochs=1, shuffle=False),
+    steps=None)
+print("model directory = %s" % model_dir)
 for key in sorted(results):
-    print("%s: %s" % (key, results[key]))
+  print("%s: %s" % (key, results[key]))
 ```
 
 The first line of the output should be something like `accuracy: 0.84429705`. We
diff --git a/tensorflow/docs_src/tutorials/word2vec.md b/tensorflow/docs_src/tutorials/word2vec.md
index d7a9089949c05aebf033d7024ae2333e9b8a31f6..380763db1ab9279d23917c6954f17a59f844ab97 100644
--- a/tensorflow/docs_src/tutorials/word2vec.md
+++ b/tensorflow/docs_src/tutorials/word2vec.md
@@ -351,7 +351,7 @@ to evaluate embeddings is to directly use them to predict syntactic and semantic
 relationships like `king is to queen as father is to ?`. This is called
 *analogical reasoning* and the task was introduced by
 [Mikolov and colleagues
-](http://msr-waypoint.com/en-us/um/people/gzweig/Pubs/NAACL2013Regularities.pdf).
+](http://www.anthology.aclweb.org/N/N13/N13-1090.pdf).
 Download the dataset for this task from
 [download.tensorflow.org](http://download.tensorflow.org/data/questions-words.txt).
 
diff --git a/tensorflow/examples/adding_an_op/BUILD b/tensorflow/examples/adding_an_op/BUILD
index ffaf9349d2a1045b3d1a16352011880f64ab0c17..f952cab8b5e1332bea27af35d549f44c2026e113 100644
--- a/tensorflow/examples/adding_an_op/BUILD
+++ b/tensorflow/examples/adding_an_op/BUILD
@@ -26,6 +26,7 @@ py_library(
     srcs = ["zero_out_op_1.py"],
     data = [":zero_out_op_kernel_1.so"],
     srcs_version = "PY2AND3",
+    deps = ["//tensorflow:tensorflow_py"],
 )
 
 tf_custom_op_library(
@@ -38,6 +39,7 @@ py_library(
     srcs = ["zero_out_op_2.py"],
     data = [":zero_out_op_kernel_2.so"],
     srcs_version = "PY2AND3",
+    deps = ["//tensorflow:tensorflow_py"],
 )
 
 tf_custom_op_library(
@@ -50,6 +52,7 @@ py_library(
     srcs = ["zero_out_op_3.py"],
     data = [":zero_out_op_kernel_3.so"],
     srcs_version = "PY2AND3",
+    deps = ["//tensorflow:tensorflow_py"],
 )
 
 py_library(
@@ -57,8 +60,6 @@ py_library(
     srcs = ["zero_out_grad_2.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":zero_out_op_2",
-        "//tensorflow:tensorflow_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:sparse_ops",
@@ -73,7 +74,6 @@ py_test(
     deps = [
         ":zero_out_op_1",
         "//tensorflow:tensorflow_py",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -97,7 +97,6 @@ py_test(
     deps = [
         ":zero_out_op_3",
         "//tensorflow:tensorflow_py",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -112,6 +111,7 @@ py_library(
     srcs = ["cuda_op.py"],
     data = [":cuda_op_kernel.so"],
     srcs_version = "PY2AND3",
+    deps = ["//tensorflow:tensorflow_py"],
 )
 
 py_test(
@@ -131,17 +131,13 @@ py_test(
     size = "small",
     srcs = ["fact_test.py"],
     srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
+    deps = ["//tensorflow:tensorflow_py"],
 )
 
 cc_binary(
     name = "attr_examples",
     srcs = ["attr_examples.cc"],
-    deps = [
-        "//tensorflow/core",
-    ],
+    deps = ["//tensorflow/core"],
 )
 
 filegroup(
diff --git a/tensorflow/examples/adding_an_op/zero_out_op_kernel_2.cc b/tensorflow/examples/adding_an_op/zero_out_op_kernel_2.cc
index 04c34c5968530aed0e6b6296b7e061e0e46e7d04..4a04e5c3c9466277be68f0e03b80726b2ec02673 100644
--- a/tensorflow/examples/adding_an_op/zero_out_op_kernel_2.cc
+++ b/tensorflow/examples/adding_an_op/zero_out_op_kernel_2.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/shape_inference.h"
diff --git a/tensorflow/examples/android/README.md b/tensorflow/examples/android/README.md
index 270f654ed729891af29b020a046c6ff88c24cc1e..51f6c4a71c2db1a947833480595b6355cf7db6b6 100644
--- a/tensorflow/examples/android/README.md
+++ b/tensorflow/examples/android/README.md
@@ -124,7 +124,7 @@ it and the Android NDK and SDK must be installed on your system.
 
 ##### Edit WORKSPACE
 
-The Android entries in [`<workspace_root>/WORKSPACE`](../../../WORKSPACE#L19-L32)
+The Android entries in [`<workspace_root>/WORKSPACE`](../../../WORKSPACE#L19-L36)
 must be uncommented with the paths filled in appropriately depending on where
 you installed the NDK and SDK. Otherwise an error such as:
 "The external label '//external:android/sdk' is not bound to anything" will
diff --git a/tensorflow/examples/android/build.gradle b/tensorflow/examples/android/build.gradle
index 001d57f88ab5762c1c1ade563480f333534f10d9..e97faad5167b2425c81d697c3b23e30dedd35416 100644
--- a/tensorflow/examples/android/build.gradle
+++ b/tensorflow/examples/android/build.gradle
@@ -10,7 +10,7 @@
 // setting nativeBuildSystem below to 'makefile'. This will allow building the demo
 // on Windows machines, but note that full equivalence with the Bazel
 // build is not yet guaranteed. See comments below for caveats and tips
-// for speeding up the build, such as as enabling ccache.
+// for speeding up the build, such as enabling ccache.
 // NOTE: Running a make build will cause subsequent Bazel builds to *fail*
 // unless the contrib/makefile/downloads/ and gen/ dirs are deleted afterwards.
 
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
index 27d7e414870229a8f29fed3fc2a72db884027ca6..512580fd3eb46e45ed8ecb2301af6fb1a69035c6 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
@@ -19,22 +19,34 @@ package org.tensorflow.demo;
 import android.Manifest;
 import android.app.Activity;
 import android.app.Fragment;
+import android.content.Context;
 import android.content.pm.PackageManager;
+import android.graphics.Bitmap;
+import android.hardware.Camera;
+import android.hardware.camera2.CameraAccessException;
+import android.hardware.camera2.CameraCharacteristics;
+import android.hardware.camera2.CameraManager;
+import android.hardware.camera2.params.StreamConfigurationMap;
+import android.media.Image;
 import android.media.Image.Plane;
+import android.media.ImageReader;
 import android.media.ImageReader.OnImageAvailableListener;
 import android.os.Build;
 import android.os.Bundle;
 import android.os.Handler;
 import android.os.HandlerThread;
+import android.os.Trace;
 import android.util.Size;
 import android.view.KeyEvent;
 import android.view.WindowManager;
 import android.widget.Toast;
 import java.nio.ByteBuffer;
+
+import org.tensorflow.demo.env.ImageUtils;
 import org.tensorflow.demo.env.Logger;
-import org.tensorflow.demo.R;
 
-public abstract class CameraActivity extends Activity implements OnImageAvailableListener {
+public abstract class CameraActivity extends Activity implements OnImageAvailableListener, Camera.
+        PreviewCallback {
   private static final Logger LOGGER = new Logger();
 
   private static final int PERMISSIONS_REQUEST = 1;
@@ -46,6 +58,20 @@ public abstract class CameraActivity extends Activity implements OnImageAvailabl
 
   private Handler handler;
   private HandlerThread handlerThread;
+  private boolean useCamera2API;
+  protected Bitmap rgbFrameBitmap = null;
+  private int[] rgbBytes = null;
+  protected int previewWidth = 0;
+  protected int previewHeight = 0;
+  protected Bitmap croppedBitmap = null;
+  protected static final boolean SAVE_PREVIEW_BITMAP = false;
+  protected long lastProcessingTimeMs;
+  protected Bitmap cropCopyBitmap;
+  protected ResultsView resultsView;
+  protected boolean computing = false;
+  protected Runnable postInferenceCallback;
+  protected byte[][] yuvBytes=new byte[3][];
+  protected int yRowStride;
 
   @Override
   protected void onCreate(final Bundle savedInstanceState) {
@@ -62,6 +88,93 @@ public abstract class CameraActivity extends Activity implements OnImageAvailabl
     }
   }
 
+  /**
+   * Callback for android.hardware.Camera API
+   */
+  @Override
+  public void onPreviewFrame(final byte[] bytes, final Camera camera) {
+    if (computing) {
+      return;
+    }
+    computing = true;
+    yuvBytes[0] = bytes;
+    try {
+      // Initialize the storage bitmaps once when the resolution is known.
+      if (rgbBytes == null) {
+        Camera.Size previewSize = camera.getParameters().getPreviewSize();
+        previewHeight = previewSize.height;
+        previewWidth = previewSize.width;
+        rgbBytes = new int[previewWidth * previewHeight];
+        onPreviewSizeChosen(new Size(previewSize.width, previewSize.height), 90);
+      }
+      ImageUtils.convertYUV420SPToARGB8888(bytes, rgbBytes, previewWidth, previewHeight, false);
+    } catch (final Exception e) {
+      LOGGER.e(e, "Exception!");
+      return;
+    }
+    postInferenceCallback = new Runnable() {
+      @Override
+      public void run() {
+        camera.addCallbackBuffer(bytes);
+      }
+    };
+    processImageRGBbytes(rgbBytes);
+  }
+
+  /**
+   * Callback for Camera2 API
+   */
+  @Override
+  public void onImageAvailable(final ImageReader reader) {
+    Image image = null;
+    //We need wait until we have some size from onPreviewSizeChosen
+    if (previewWidth == 0 || previewHeight == 0) {
+      return;
+    }
+    rgbBytes = new int[previewWidth * previewHeight];
+    try {
+      image = reader.acquireLatestImage();
+
+      if (image == null) {
+        return;
+      }
+
+      if (computing) {
+        image.close();
+        return;
+      }
+      computing = true;
+      Trace.beginSection("imageAvailable");
+      final Plane[] planes = image.getPlanes();
+      fillBytes(planes, yuvBytes);
+      yRowStride = planes[0].getRowStride();
+      final int uvRowStride = planes[1].getRowStride();
+      final int uvPixelStride = planes[1].getPixelStride();
+      ImageUtils.convertYUV420ToARGB8888(
+          yuvBytes[0],
+          yuvBytes[1],
+          yuvBytes[2],
+          rgbBytes,
+          previewWidth,
+          previewHeight,
+          yRowStride,
+          uvRowStride,
+          uvPixelStride,
+          false);
+      image.close();
+
+    } catch (final Exception e) {
+      if (image != null) {
+        image.close();
+      }
+      LOGGER.e(e, "Exception!");
+      Trace.endSection();
+      return;
+    }
+    processImageRGBbytes(rgbBytes);
+    Trace.endSection();
+  }
+
   @Override
   public synchronized void onStart() {
     LOGGER.d("onStart " + this);
@@ -123,8 +236,8 @@ public abstract class CameraActivity extends Activity implements OnImageAvailabl
     switch (requestCode) {
       case PERMISSIONS_REQUEST: {
         if (grantResults.length > 0
-                && grantResults[0] == PackageManager.PERMISSION_GRANTED
-                && grantResults[1] == PackageManager.PERMISSION_GRANTED) {
+            && grantResults[0] == PackageManager.PERMISSION_GRANTED
+            && grantResults[1] == PackageManager.PERMISSION_GRANTED) {
           setFragment();
         } else {
           requestPermission();
@@ -135,7 +248,8 @@ public abstract class CameraActivity extends Activity implements OnImageAvailabl
 
   private boolean hasPermission() {
     if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
-      return checkSelfPermission(PERMISSION_CAMERA) == PackageManager.PERMISSION_GRANTED && checkSelfPermission(PERMISSION_STORAGE) == PackageManager.PERMISSION_GRANTED;
+      return checkSelfPermission(PERMISSION_CAMERA) == PackageManager.PERMISSION_GRANTED &&
+          checkSelfPermission(PERMISSION_STORAGE) == PackageManager.PERMISSION_GRANTED;
     } else {
       return true;
     }
@@ -143,25 +257,80 @@ public abstract class CameraActivity extends Activity implements OnImageAvailabl
 
   private void requestPermission() {
     if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
-      if (shouldShowRequestPermissionRationale(PERMISSION_CAMERA) || shouldShowRequestPermissionRationale(PERMISSION_STORAGE)) {
-        Toast.makeText(CameraActivity.this, "Camera AND storage permission are required for this demo", Toast.LENGTH_LONG).show();
+      if (shouldShowRequestPermissionRationale(PERMISSION_CAMERA) ||
+          shouldShowRequestPermissionRationale(PERMISSION_STORAGE)) {
+        Toast.makeText(CameraActivity.this,
+            "Camera AND storage permission are required for this demo", Toast.LENGTH_LONG).show();
       }
       requestPermissions(new String[] {PERMISSION_CAMERA, PERMISSION_STORAGE}, PERMISSIONS_REQUEST);
     }
   }
 
+  // Returns true if the device supports the required hardware level, or better.
+  boolean isHardwareLevelSupported(CameraCharacteristics characteristics, int requiredLevel) {
+    int deviceLevel = characteristics.get(CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL);
+    if (deviceLevel == CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_LEGACY) {
+      return requiredLevel == deviceLevel;
+    }
+    // deviceLevel is not LEGACY, can use numerical sort
+    return requiredLevel <= deviceLevel;
+  }
+
+  private String chooseCamera() {
+    final CameraManager manager = (CameraManager) getSystemService(Context.CAMERA_SERVICE);
+    try {
+      for (final String cameraId : manager.getCameraIdList()) {
+        final CameraCharacteristics characteristics = manager.getCameraCharacteristics(cameraId);
+
+        // We don't use a front facing camera in this sample.
+        final Integer facing = characteristics.get(CameraCharacteristics.LENS_FACING);
+        if (facing != null && facing == CameraCharacteristics.LENS_FACING_FRONT) {
+          continue;
+        }
+
+        final StreamConfigurationMap map =
+            characteristics.get(CameraCharacteristics.SCALER_STREAM_CONFIGURATION_MAP);
+
+        if (map == null) {
+          continue;
+        }
+
+        useCamera2API = isHardwareLevelSupported(characteristics,
+            CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_FULL);
+        LOGGER.i("Camera API lv2?: %s", useCamera2API);
+        return cameraId;
+      }
+    } catch (CameraAccessException e) {
+      LOGGER.e(e, "Not allowed to access camera");
+    }
+
+    return null;
+  }
+
   protected void setFragment() {
-    final Fragment fragment =
-        CameraConnectionFragment.newInstance(
-            new CameraConnectionFragment.ConnectionCallback() {
-              @Override
-              public void onPreviewSizeChosen(final Size size, final int rotation) {
-                CameraActivity.this.onPreviewSizeChosen(size, rotation);
-              }
-            },
-            this,
-            getLayoutId(),
-            getDesiredPreviewFrameSize());
+    String cameraId = chooseCamera();
+
+    Fragment fragment;
+    if (useCamera2API) {
+      CameraConnectionFragment camera2Fragment =
+          CameraConnectionFragment.newInstance(
+              new CameraConnectionFragment.ConnectionCallback() {
+                @Override
+                public void onPreviewSizeChosen(final Size size, final int rotation) {
+                  previewHeight = size.getHeight();
+                  previewWidth = size.getWidth();
+                  CameraActivity.this.onPreviewSizeChosen(size, rotation);
+                }
+              },
+              this,
+              getLayoutId(),
+              getDesiredPreviewFrameSize());
+
+      camera2Fragment.setCamera(cameraId);
+      fragment = camera2Fragment;
+    } else {
+      fragment = new LegacyCameraConnectionFragment(this, getLayoutId());
+    }
 
     getFragmentManager()
         .beginTransaction()
@@ -213,6 +382,7 @@ public abstract class CameraActivity extends Activity implements OnImageAvailabl
     return super.onKeyDown(keyCode, event);
   }
 
+  protected abstract void processImageRGBbytes(int[] rgbBytes ) ;
   protected abstract void onPreviewSizeChosen(final Size size, final int rotation);
   protected abstract int getLayoutId();
   protected abstract Size getDesiredPreviewFrameSize();
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java b/tensorflow/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java
index 76bd61d00f2ae16501ff9e234108b240647de4f2..986f2777b2c020a3bb366c5dd95ea3041cbbe34e 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java
@@ -353,58 +353,44 @@ public class CameraConnectionFragment extends Fragment {
     super.onPause();
   }
 
+  public void setCamera(String cameraId) {
+    this.cameraId = cameraId;
+  }
+
   /**
    * Sets up member variables related to camera.
-   *
-   * @param width  The width of available size for camera preview
-   * @param height The height of available size for camera preview
    */
-  private void setUpCameraOutputs(final int width, final int height) {
+  private void setUpCameraOutputs() {
     final Activity activity = getActivity();
     final CameraManager manager = (CameraManager) activity.getSystemService(Context.CAMERA_SERVICE);
     try {
-      for (final String cameraId : manager.getCameraIdList()) {
-        final CameraCharacteristics characteristics = manager.getCameraCharacteristics(cameraId);
-
-        // We don't use a front facing camera in this sample.
-        final Integer facing = characteristics.get(CameraCharacteristics.LENS_FACING);
-        if (facing != null && facing == CameraCharacteristics.LENS_FACING_FRONT) {
-          continue;
-        }
-
-        final StreamConfigurationMap map =
-            characteristics.get(CameraCharacteristics.SCALER_STREAM_CONFIGURATION_MAP);
-
-        if (map == null) {
-          continue;
-        }
-
-        // For still image captures, we use the largest available size.
-        final Size largest =
-            Collections.max(
-                Arrays.asList(map.getOutputSizes(ImageFormat.YUV_420_888)),
-                new CompareSizesByArea());
-
-        sensorOrientation = characteristics.get(CameraCharacteristics.SENSOR_ORIENTATION);
-
-        // Danger, W.R.! Attempting to use too large a preview size could  exceed the camera
-        // bus' bandwidth limitation, resulting in gorgeous previews but the storage of
-        // garbage capture data.
-        previewSize =
-            chooseOptimalSize(
-                map.getOutputSizes(SurfaceTexture.class),
-                inputSize.getWidth(),
-                inputSize.getHeight());
-
-        // We fit the aspect ratio of TextureView to the size of preview we picked.
-        final int orientation = getResources().getConfiguration().orientation;
-        if (orientation == Configuration.ORIENTATION_LANDSCAPE) {
-          textureView.setAspectRatio(previewSize.getWidth(), previewSize.getHeight());
-        } else {
-          textureView.setAspectRatio(previewSize.getHeight(), previewSize.getWidth());
-        }
-
-        CameraConnectionFragment.this.cameraId = cameraId;
+      final CameraCharacteristics characteristics = manager.getCameraCharacteristics(cameraId);
+
+      final StreamConfigurationMap map =
+          characteristics.get(CameraCharacteristics.SCALER_STREAM_CONFIGURATION_MAP);
+
+      // For still image captures, we use the largest available size.
+      final Size largest =
+          Collections.max(
+              Arrays.asList(map.getOutputSizes(ImageFormat.YUV_420_888)),
+              new CompareSizesByArea());
+
+      sensorOrientation = characteristics.get(CameraCharacteristics.SENSOR_ORIENTATION);
+
+      // Danger, W.R.! Attempting to use too large a preview size could  exceed the camera
+      // bus' bandwidth limitation, resulting in gorgeous previews but the storage of
+      // garbage capture data.
+      previewSize =
+          chooseOptimalSize(map.getOutputSizes(SurfaceTexture.class),
+              inputSize.getWidth(),
+              inputSize.getHeight());
+
+      // We fit the aspect ratio of TextureView to the size of preview we picked.
+      final int orientation = getResources().getConfiguration().orientation;
+      if (orientation == Configuration.ORIENTATION_LANDSCAPE) {
+        textureView.setAspectRatio(previewSize.getWidth(), previewSize.getHeight());
+      } else {
+        textureView.setAspectRatio(previewSize.getHeight(), previewSize.getWidth());
       }
     } catch (final CameraAccessException e) {
       LOGGER.e(e, "Exception!");
@@ -425,7 +411,7 @@ public class CameraConnectionFragment extends Fragment {
    * Opens the camera specified by {@link CameraConnectionFragment#cameraId}.
    */
   private void openCamera(final int width, final int height) {
-    setUpCameraOutputs(width, height);
+    setUpCameraOutputs();
     configureTransform(width, height);
     final Activity activity = getActivity();
     final CameraManager manager = (CameraManager) activity.getSystemService(Context.CAMERA_SERVICE);
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
index bc391269255f64cb17bdc3f7ff65f801b0c60e67..f5cebc7670393c6572f990fc0fbad3236df00584 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
@@ -22,22 +22,19 @@ import android.graphics.Canvas;
 import android.graphics.Matrix;
 import android.graphics.Paint;
 import android.graphics.Typeface;
-import android.media.Image;
-import android.media.Image.Plane;
-import android.media.ImageReader;
+
 import android.media.ImageReader.OnImageAvailableListener;
 import android.os.SystemClock;
-import android.os.Trace;
 import android.util.Size;
 import android.util.TypedValue;
 import android.view.Display;
+
 import java.util.List;
 import java.util.Vector;
 import org.tensorflow.demo.OverlayView.DrawCallback;
 import org.tensorflow.demo.env.BorderedText;
 import org.tensorflow.demo.env.ImageUtils;
 import org.tensorflow.demo.env.Logger;
-import org.tensorflow.demo.R;
 
 public class ClassifierActivity extends CameraActivity implements OnImageAvailableListener {
   private static final Logger LOGGER = new Logger();
@@ -64,39 +61,25 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab
   private static final String INPUT_NAME = "input";
   private static final String OUTPUT_NAME = "output";
 
+
   private static final String MODEL_FILE = "file:///android_asset/tensorflow_inception_graph.pb";
   private static final String LABEL_FILE =
       "file:///android_asset/imagenet_comp_graph_label_strings.txt";
 
-  private static final boolean SAVE_PREVIEW_BITMAP = false;
 
   private static final boolean MAINTAIN_ASPECT = true;
 
   private static final Size DESIRED_PREVIEW_SIZE = new Size(640, 480);
 
-  private Classifier classifier;
 
   private Integer sensorOrientation;
-
-  private int previewWidth = 0;
-  private int previewHeight = 0;
-  private byte[][] yuvBytes;
-  private int[] rgbBytes = null;
-  private Bitmap rgbFrameBitmap = null;
-  private Bitmap croppedBitmap = null;
-
-  private Bitmap cropCopyBitmap;
-
-  private boolean computing = false;
-
+  private Classifier classifier;
   private Matrix frameToCropTransform;
   private Matrix cropToFrameTransform;
 
-  private ResultsView resultsView;
 
   private BorderedText borderedText;
 
-  private long lastProcessingTimeMs;
 
   @Override
   protected int getLayoutId() {
@@ -112,9 +95,8 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab
 
   @Override
   public void onPreviewSizeChosen(final Size size, final int rotation) {
-    final float textSizePx =
-        TypedValue.applyDimension(
-            TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics());
+    final float textSizePx = TypedValue.applyDimension(
+        TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics());
     borderedText = new BorderedText(textSizePx);
     borderedText.setTypeface(Typeface.MONOSPACE);
 
@@ -129,7 +111,6 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab
             INPUT_NAME,
             OUTPUT_NAME);
 
-    resultsView = (ResultsView) findViewById(R.id.results);
     previewWidth = size.getWidth();
     previewHeight = size.getHeight();
 
@@ -141,15 +122,13 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab
     sensorOrientation = rotation + screenOrientation;
 
     LOGGER.i("Initializing at size %dx%d", previewWidth, previewHeight);
-    rgbBytes = new int[previewWidth * previewHeight];
     rgbFrameBitmap = Bitmap.createBitmap(previewWidth, previewHeight, Config.ARGB_8888);
     croppedBitmap = Bitmap.createBitmap(INPUT_SIZE, INPUT_SIZE, Config.ARGB_8888);
 
-    frameToCropTransform =
-        ImageUtils.getTransformationMatrix(
-            previewWidth, previewHeight,
-            INPUT_SIZE, INPUT_SIZE,
-            sensorOrientation, MAINTAIN_ASPECT);
+    frameToCropTransform = ImageUtils.getTransformationMatrix(
+        previewWidth, previewHeight,
+        INPUT_SIZE, INPUT_SIZE,
+        sensorOrientation, MAINTAIN_ASPECT);
 
     cropToFrameTransform = new Matrix();
     frameToCropTransform.invert(cropToFrameTransform);
@@ -165,52 +144,7 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab
         });
   }
 
-  @Override
-  public void onImageAvailable(final ImageReader reader) {
-    Image image = null;
-
-    try {
-      image = reader.acquireLatestImage();
-
-      if (image == null) {
-        return;
-      }
-
-      if (computing) {
-        image.close();
-        return;
-      }
-      computing = true;
-
-      Trace.beginSection("imageAvailable");
-
-      final Plane[] planes = image.getPlanes();
-      fillBytes(planes, yuvBytes);
-
-      final int yRowStride = planes[0].getRowStride();
-      final int uvRowStride = planes[1].getRowStride();
-      final int uvPixelStride = planes[1].getPixelStride();
-      ImageUtils.convertYUV420ToARGB8888(
-          yuvBytes[0],
-          yuvBytes[1],
-          yuvBytes[2],
-          previewWidth,
-          previewHeight,
-          yRowStride,
-          uvRowStride,
-          uvPixelStride,
-          rgbBytes);
-
-      image.close();
-    } catch (final Exception e) {
-      if (image != null) {
-        image.close();
-      }
-      LOGGER.e(e, "Exception!");
-      Trace.endSection();
-      return;
-    }
-
+  protected void processImageRGBbytes(int[] rgbBytes ) {
     rgbFrameBitmap.setPixels(rgbBytes, 0, previewWidth, 0, 0, previewWidth, previewHeight);
     final Canvas canvas = new Canvas(croppedBitmap);
     canvas.drawBitmap(rgbFrameBitmap, frameToCropTransform, null);
@@ -219,7 +153,6 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab
     if (SAVE_PREVIEW_BITMAP) {
       ImageUtils.saveBitmap(croppedBitmap);
     }
-
     runInBackground(
         new Runnable() {
           @Override
@@ -227,15 +160,19 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab
             final long startTime = SystemClock.uptimeMillis();
             final List<Classifier.Recognition> results = classifier.recognizeImage(croppedBitmap);
             lastProcessingTimeMs = SystemClock.uptimeMillis() - startTime;
-
+            LOGGER.i("Detect: %s", results);
             cropCopyBitmap = Bitmap.createBitmap(croppedBitmap);
+            if (resultsView==null) {
+              resultsView = (ResultsView) findViewById(R.id.results);
+            }
             resultsView.setResults(results);
             requestRender();
             computing = false;
+            if (postInferenceCallback != null) {
+              postInferenceCallback.run();
+            }
           }
         });
-
-    Trace.endSection();
   }
 
   @Override
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
index 5800f80651bdbd07b3a861299421501cf47b1716..acace0eace6e6e7220e5b28aff067e371f12b782 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
@@ -66,7 +66,7 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable
   // must be manually placed in the assets/ directory by the user.
   // Graphs and models downloaded from http://pjreddie.com/darknet/yolo/ may be converted e.g. via
   // DarkFlow (https://github.com/thtrieu/darkflow). Sample command:
-  // ./flow --model cfg/tiny-yolo-voc.cfg --load bin/tiny-yolo-voc.weights --savepb --verbalise=True
+  // ./flow --model cfg/tiny-yolo-voc.cfg --load bin/tiny-yolo-voc.weights --savepb --verbalise
   private static final String YOLO_MODEL_FILE = "file:///android_asset/graph-tiny-yolo-voc.pb";
   private static final int YOLO_INPUT_SIZE = 416;
   private static final String YOLO_INPUT_NAME = "input";
@@ -126,6 +126,7 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable
 
     tracker = new MultiBoxTracker(this);
 
+
     if (USE_YOLO) {
       detector =
           TensorFlowYoloDetector.create(
@@ -270,15 +271,17 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable
       final int uvRowStride = planes[1].getRowStride();
       final int uvPixelStride = planes[1].getPixelStride();
       ImageUtils.convertYUV420ToARGB8888(
-          yuvBytes[0],
-          yuvBytes[1],
-          yuvBytes[2],
-          previewWidth,
-          previewHeight,
-          yRowStride,
-          uvRowStride,
-          uvPixelStride,
-          rgbBytes);
+              yuvBytes[0],
+              yuvBytes[1],
+              yuvBytes[2],
+              rgbBytes,
+              previewWidth,
+              previewHeight,
+              yRowStride,
+              uvRowStride,
+              uvPixelStride,
+              false);
+
 
       image.close();
     } catch (final Exception e) {
@@ -344,6 +347,8 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable
     Trace.endSection();
   }
 
+  protected  void processImageRGBbytes(int[] rgbBytes ) {}
+
   @Override
   protected int getLayoutId() {
     return R.layout.camera_connection_fragment_tracking;
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java b/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
new file mode 100644
index 0000000000000000000000000000000000000000..8a22aaf4d68dda5b55ad674cdc4e72098cd1094c
--- /dev/null
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
@@ -0,0 +1,205 @@
+package org.tensorflow.demo;
+
+/*
+ * Copyright 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import android.app.Fragment;
+import android.graphics.SurfaceTexture;
+import android.os.Bundle;
+import android.os.Handler;
+import android.os.HandlerThread;
+import android.util.SparseIntArray;
+import android.view.LayoutInflater;
+import android.view.Surface;
+import android.view.TextureView;
+import android.view.View;
+import android.view.ViewGroup;
+
+import java.io.IOException;
+
+import android.hardware.Camera;
+import android.hardware.Camera.CameraInfo;
+
+import org.tensorflow.demo.env.Logger;
+
+public class LegacyCameraConnectionFragment extends Fragment {
+
+  private Camera camera;
+  private static final Logger LOGGER = new Logger();
+  private Camera.PreviewCallback imageListener;
+
+  /**
+   * The layout identifier to inflate for this Fragment.
+   */
+  private int layout;
+
+  public LegacyCameraConnectionFragment(
+      final Camera.PreviewCallback imageListener,
+      final int layout) {
+    this.imageListener = imageListener;
+    this.layout = layout;
+  }
+
+  /**
+   * Conversion from screen rotation to JPEG orientation.
+   */
+  private static final SparseIntArray ORIENTATIONS = new SparseIntArray();
+
+  static {
+    ORIENTATIONS.append(Surface.ROTATION_0, 90);
+    ORIENTATIONS.append(Surface.ROTATION_90, 0);
+    ORIENTATIONS.append(Surface.ROTATION_180, 270);
+    ORIENTATIONS.append(Surface.ROTATION_270, 180);
+  }
+
+  /**
+   * {@link android.view.TextureView.SurfaceTextureListener} handles several lifecycle events on a
+   * {@link TextureView}.
+   */
+  private final TextureView.SurfaceTextureListener surfaceTextureListener =
+      new TextureView.SurfaceTextureListener() {
+        @Override
+        public void onSurfaceTextureAvailable(
+            final SurfaceTexture texture, final int width, final int height) {
+
+          int index = getCameraId();
+          camera = Camera.open(index);
+
+          try {
+            Camera.Parameters parameters = camera.getParameters();
+            parameters.setFocusMode(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE);
+
+            camera.setDisplayOrientation(90);
+            camera.setParameters(parameters);
+            camera.setPreviewTexture(texture);
+          } catch (IOException exception) {
+            camera.release();
+          }
+
+          camera.setPreviewCallbackWithBuffer(imageListener);
+          Camera.Size s = camera.getParameters().getPreviewSize();
+          int bufferSize = s.height * s.width * 3 / 2;
+          camera.addCallbackBuffer(new byte[bufferSize]);
+
+          textureView.setAspectRatio(s.height, s.width);
+
+          camera.startPreview();
+        }
+
+        @Override
+        public void onSurfaceTextureSizeChanged(
+            final SurfaceTexture texture, final int width, final int height) {
+        }
+
+        @Override
+        public boolean onSurfaceTextureDestroyed(final SurfaceTexture texture) {
+          return true;
+        }
+
+        @Override
+        public void onSurfaceTextureUpdated(final SurfaceTexture texture) {
+        }
+      };
+
+  /**
+   * An {@link AutoFitTextureView} for camera preview.
+   */
+  private AutoFitTextureView textureView;
+
+  /**
+   * An additional thread for running tasks that shouldn't block the UI.
+   */
+  private HandlerThread backgroundThread;
+
+  @Override
+  public View onCreateView(
+      final LayoutInflater inflater, final ViewGroup container, final Bundle savedInstanceState) {
+    return inflater.inflate(layout, container, false);
+  }
+
+  @Override
+  public void onViewCreated(final View view, final Bundle savedInstanceState) {
+    textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
+  }
+
+  @Override
+  public void onActivityCreated(final Bundle savedInstanceState) {
+    super.onActivityCreated(savedInstanceState);
+  }
+
+  @Override
+  public void onResume() {
+    super.onResume();
+    startBackgroundThread();
+    // When the screen is turned off and turned back on, the SurfaceTexture is already
+    // available, and "onSurfaceTextureAvailable" will not be called. In that case, we can open
+    // a camera and start preview from here (otherwise, we wait until the surface is ready in
+    // the SurfaceTextureListener).
+
+    if (textureView.isAvailable()) {
+      camera.startPreview();
+    } else {
+      textureView.setSurfaceTextureListener(surfaceTextureListener);
+    }
+  }
+
+  @Override
+  public void onPause() {
+    stopCamera();
+    stopBackgroundThread();
+    super.onPause();
+  }
+
+  /**
+   * Starts a background thread and its {@link Handler}.
+   */
+  private void startBackgroundThread() {
+    backgroundThread = new HandlerThread("CameraBackground");
+    backgroundThread.start();
+  }
+
+  /**
+   * Stops the background thread and its {@link Handler}.
+   */
+  private void stopBackgroundThread() {
+    backgroundThread.quitSafely();
+    try {
+      backgroundThread.join();
+      backgroundThread = null;
+    } catch (final InterruptedException e) {
+      LOGGER.e(e, "Exception!");
+    }
+  }
+
+  protected void stopCamera() {
+    if (camera != null) {
+      camera.stopPreview();
+      camera.setPreviewCallback(null);
+      camera.release();
+      camera = null;
+    }
+  }
+
+  private int getCameraId() {
+    CameraInfo ci = new CameraInfo();
+    for (int i = 0; i < Camera.getNumberOfCameras(); i++) {
+      Camera.getCameraInfo(i, ci);
+      if (ci.facing == CameraInfo.CAMERA_FACING_BACK)
+        return i;
+    }
+    return -1; // No camera found
+  }
+}
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java
index 7afe2bf5412694c94a0e5b6d575e0a73e42dcb72..f0d2dd1ed69a4269238c28cc8cc9e73d560f1e9d 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java
@@ -28,6 +28,7 @@ import android.graphics.Paint;
 import android.graphics.Paint.Style;
 import android.graphics.Rect;
 import android.graphics.Typeface;
+import android.hardware.Camera;
 import android.media.Image;
 import android.media.Image.Plane;
 import android.media.ImageReader;
@@ -58,7 +59,6 @@ import org.tensorflow.demo.OverlayView.DrawCallback;
 import org.tensorflow.demo.env.BorderedText;
 import org.tensorflow.demo.env.ImageUtils;
 import org.tensorflow.demo.env.Logger;
-import org.tensorflow.demo.R;
 
 /**
  * Sample activity that stylizes the camera preview according to "A Learned Representation For
@@ -97,10 +97,6 @@ public class StylizeActivity extends CameraActivity implements OnImageAvailableL
 
   private int previewWidth = 0;
   private int previewHeight = 0;
-  private byte[][] yuvBytes;
-  private int[] rgbBytes = null;
-  private Bitmap rgbFrameBitmap = null;
-  private Bitmap croppedBitmap = null;
 
   private final float[] styleVals = new float[NUM_STYLES];
   private int[] intValues;
@@ -108,18 +104,13 @@ public class StylizeActivity extends CameraActivity implements OnImageAvailableL
 
   private int frameNum = 0;
 
-  private Bitmap cropCopyBitmap;
   private Bitmap textureCopyBitmap;
 
-  private boolean computing = false;
-
   private Matrix frameToCropTransform;
   private Matrix cropToFrameTransform;
 
   private BorderedText borderedText;
 
-  private long lastProcessingTimeMs;
-
   private TensorFlowInferenceInterface inferenceInterface;
 
   private int lastOtherStyle = 1;
@@ -363,9 +354,8 @@ public class StylizeActivity extends CameraActivity implements OnImageAvailableL
 
   @Override
   public void onPreviewSizeChosen(final Size size, final int rotation) {
-    final float textSizePx =
-        TypedValue.applyDimension(
-            TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics());
+    final float textSizePx = TypedValue.applyDimension(
+        TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics());
     borderedText = new BorderedText(textSizePx);
     borderedText.setTypeface(Typeface.MONOSPACE);
 
@@ -393,7 +383,6 @@ public class StylizeActivity extends CameraActivity implements OnImageAvailableL
     grid = (GridView) findViewById(R.id.grid_layout);
     grid.setAdapter(adapter);
     grid.setOnTouchListener(gridTouchAdapter);
-
     setStyle(adapter.items[0], 1.0f);
   }
 
@@ -455,78 +444,42 @@ public class StylizeActivity extends CameraActivity implements OnImageAvailableL
     }
   }
 
-  @Override
-  public void onImageAvailable(final ImageReader reader) {
-    Image image = null;
-
-    try {
-      image = reader.acquireLatestImage();
-
-      if (image == null) {
-        return;
-      }
-
-      if (computing) {
-        image.close();
-        return;
-      }
-
-      if (desiredSize != initializedSize) {
-        LOGGER.i(
-            "Initializing at size preview size %dx%d, stylize size %d",
-            previewWidth, previewHeight, desiredSize);
-        rgbBytes = new int[previewWidth * previewHeight];
-        rgbFrameBitmap = Bitmap.createBitmap(previewWidth, previewHeight, Config.ARGB_8888);
-        croppedBitmap = Bitmap.createBitmap(desiredSize, desiredSize, Config.ARGB_8888);
-
-        frameToCropTransform =
-            ImageUtils.getTransformationMatrix(
-                previewWidth, previewHeight,
-                desiredSize, desiredSize,
-                sensorOrientation, true);
-
-        cropToFrameTransform = new Matrix();
-        frameToCropTransform.invert(cropToFrameTransform);
-
-        yuvBytes = new byte[3][];
-
-        intValues = new int[desiredSize * desiredSize];
-        floatValues = new float[desiredSize * desiredSize * 3];
-        initializedSize = desiredSize;
-      }
-
-      computing = true;
+  private void resetPreviewBuffers() {
+    croppedBitmap = Bitmap.createBitmap(desiredSize, desiredSize, Config.ARGB_8888);
 
-      Trace.beginSection("imageAvailable");
+    frameToCropTransform = ImageUtils.getTransformationMatrix(
+        previewWidth, previewHeight,
+        desiredSize, desiredSize,
+        sensorOrientation, true);
 
-      final Plane[] planes = image.getPlanes();
-      fillBytes(planes, yuvBytes);
-
-      final int yRowStride = planes[0].getRowStride();
-      final int uvRowStride = planes[1].getRowStride();
-      final int uvPixelStride = planes[1].getPixelStride();
-
-      ImageUtils.convertYUV420ToARGB8888(
-          yuvBytes[0],
-          yuvBytes[1],
-          yuvBytes[2],
-          previewWidth,
-          previewHeight,
-          yRowStride,
-          uvRowStride,
-          uvPixelStride,
-          rgbBytes);
+    cropToFrameTransform = new Matrix();
+    frameToCropTransform.invert(cropToFrameTransform);
+    yuvBytes = new byte[3][];
+    intValues = new int[desiredSize * desiredSize];
+    floatValues = new float[desiredSize * desiredSize * 3];
+    initializedSize = desiredSize;
+  }
 
-      image.close();
-    } catch (final Exception e) {
-      if (image != null) {
-        image.close();
-      }
-      LOGGER.e(e, "Exception!");
-      Trace.endSection();
-      return;
+  protected void processImageRGBbytes(int[] rgbBytes ) {
+    if (desiredSize != initializedSize) {
+      LOGGER.i(
+          "Initializing at size preview size %dx%d, stylize size %d",
+          previewWidth, previewHeight, desiredSize);
+      
+      rgbFrameBitmap = Bitmap.createBitmap(previewWidth, previewHeight, Config.ARGB_8888);
+      croppedBitmap = Bitmap.createBitmap(desiredSize, desiredSize, Config.ARGB_8888);
+      frameToCropTransform = ImageUtils.getTransformationMatrix(
+          previewWidth, previewHeight,
+          desiredSize, desiredSize,
+          sensorOrientation, true);
+
+      cropToFrameTransform = new Matrix();
+      frameToCropTransform.invert(cropToFrameTransform);
+      yuvBytes = new byte[3][];
+      intValues = new int[desiredSize * desiredSize];
+      floatValues = new float[desiredSize * desiredSize * 3];
+      initializedSize = desiredSize;
     }
-
     rgbFrameBitmap.setPixels(rgbBytes, 0, previewWidth, 0, 0, previewWidth, previewHeight);
     final Canvas canvas = new Canvas(croppedBitmap);
     canvas.drawBitmap(rgbFrameBitmap, frameToCropTransform, null);
@@ -536,24 +489,24 @@ public class StylizeActivity extends CameraActivity implements OnImageAvailableL
       ImageUtils.saveBitmap(croppedBitmap);
     }
 
-    runInBackground(
-        new Runnable() {
-          @Override
-          public void run() {
-            cropCopyBitmap = Bitmap.createBitmap(croppedBitmap);
-
-            final long startTime = SystemClock.uptimeMillis();
-            stylizeImage(croppedBitmap);
-            lastProcessingTimeMs = SystemClock.uptimeMillis() - startTime;
-
-            textureCopyBitmap = Bitmap.createBitmap(croppedBitmap);
-
-            requestRender();
-            computing = false;
-          }
-        });
-
-    Trace.endSection();
+    runInBackground(new Runnable() {
+      @Override
+      public void run() {
+        cropCopyBitmap = Bitmap.createBitmap(croppedBitmap);
+        final long startTime = SystemClock.uptimeMillis();
+        stylizeImage(croppedBitmap);
+        lastProcessingTimeMs = SystemClock.uptimeMillis() - startTime;
+        textureCopyBitmap = Bitmap.createBitmap(croppedBitmap);
+        requestRender();
+        computing = false;
+        if (postInferenceCallback != null) {
+          postInferenceCallback.run();
+        }
+      }
+    });
+    if (desiredSize != initializedSize) {
+      resetPreviewBuffers();
+    }
   }
 
   private void stylizeImage(final Bitmap bitmap) {
@@ -584,6 +537,7 @@ public class StylizeActivity extends CameraActivity implements OnImageAvailableL
     }
 
     // Copy the input data into TensorFlow.
+    LOGGER.i("Width: %s , Height: %s",bitmap.getWidth(),bitmap.getHeight());
     inferenceInterface.feed(
         INPUT_NODE, floatValues, 1, bitmap.getWidth(), bitmap.getHeight(), 3);
     inferenceInterface.feed(STYLE_NODE, styleVals, NUM_STYLES);
diff --git a/tensorflow/examples/how_tos/reading_data/BUILD b/tensorflow/examples/how_tos/reading_data/BUILD
index 5f87ed002c65044596df21df9ceedba18bb8ee27..4a43585d5395b1df94dd8a8767f92f131cfcaea4 100644
--- a/tensorflow/examples/how_tos/reading_data/BUILD
+++ b/tensorflow/examples/how_tos/reading_data/BUILD
@@ -14,7 +14,6 @@ py_binary(
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/learn/python/learn/datasets",
-        "//tensorflow/examples/tutorials/mnist:input_data",
     ],
 )
 
diff --git a/tensorflow/examples/image_retraining/BUILD b/tensorflow/examples/image_retraining/BUILD
index 394c413b33ef08902281f5207dd1e3b5bbad0367..c8c136ac14c9745d269952bdf32e7360eb83336c 100644
--- a/tensorflow/examples/image_retraining/BUILD
+++ b/tensorflow/examples/image_retraining/BUILD
@@ -18,6 +18,7 @@ py_binary(
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:graph_util",
         "//tensorflow/python:platform",
         "//tensorflow/python:util",
         "//third_party/py/numpy",
@@ -50,10 +51,16 @@ py_test(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":label_image",
         ":retrain",
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:graph_util",
+        "//tensorflow/python:platform",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
     ],
 )
 
diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
index 44a3097d80ee1a5fecb0673a2fae27ca7248460a..2e2e578050bcbd3dae280a91158da6edd5f8bf47 100644
--- a/tensorflow/examples/image_retraining/retrain.py
+++ b/tensorflow/examples/image_retraining/retrain.py
@@ -12,18 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Simple transfer learning with an Inception v3 architecture model.
+r"""Simple transfer learning with Inception v3 or Mobilenet models.
 
 With support for TensorBoard.
 
-This example shows how to take a Inception v3 architecture model trained on
+This example shows how to take a Inception v3 or Mobilenet model trained on
 ImageNet images, and train a new top layer that can recognize other classes of
 images.
 
-The top layer receives as input a 2048-dimensional vector for each image. We
-train a softmax layer on top of this representation. Assuming the softmax layer
-contains N labels, this corresponds to learning N + 2048*N model parameters
-corresponding to the learned biases and weights.
+The top layer receives as input a 2048-dimensional vector (1001-dimensional for
+Mobilenet) for each image. We train a softmax layer on top of this
+representation. Assuming the softmax layer contains N labels, this corresponds
+to learning N + 2048*N (or 1001*N)  model parameters corresponding to the
+learned biases and weights.
 
 Here's an example, which assumes you have a folder containing class-named
 subfolders, each full of images for each label. The example folder flower_photos
@@ -62,6 +63,23 @@ in.
 This produces a new model file that can be loaded and run by any TensorFlow
 program, for example the label_image sample code.
 
+By default this script will use the high accuracy, but comparatively large and
+slow Inception v3 model architecture. It's recommended that you start with this
+to validate that you have gathered good training data, but if you want to deploy
+on resource-limited platforms, you can try the `--architecture` flag with a
+Mobilenet model. For example:
+
+```bash
+python tensorflow/examples/image_retraining/retrain.py \
+    --image_dir ~/flower_photos --architecture mobilenet_1.0_224
+```
+
+There are 32 different Mobilenet models to choose from, with a variety of file
+size and latency options. The first number can be '1.0', '0.75', '0.50', or
+'0.25' to control the size, and the second controls the input image size, either
+'224', '192', '160', or '128', with smaller sizes running faster. See
+https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html
+for more information on Mobilenet.
 
 To use with TensorBoard:
 
@@ -82,7 +100,6 @@ import hashlib
 import os.path
 import random
 import re
-import struct
 import sys
 import tarfile
 
@@ -101,16 +118,6 @@ FLAGS = None
 # we're using for Inception v3. These include things like tensor names and their
 # sizes. If you want to adapt this script to work with another model, you will
 # need to update these to reflect the values in the network you're using.
-# pylint: disable=line-too-long
-DATA_URL = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'
-# pylint: enable=line-too-long
-BOTTLENECK_TENSOR_NAME = 'pool_3/_reshape:0'
-BOTTLENECK_TENSOR_SIZE = 2048
-MODEL_INPUT_WIDTH = 299
-MODEL_INPUT_HEIGHT = 299
-MODEL_INPUT_DEPTH = 3
-JPEG_DATA_TENSOR_NAME = 'DecodeJpeg/contents:0'
-RESIZED_INPUT_TENSOR_NAME = 'ResizeBilinear:0'
 MAX_NUM_IMAGES_PER_CLASS = 2 ** 27 - 1  # ~134M
 
 
@@ -131,7 +138,7 @@ def create_image_lists(image_dir, testing_percentage, validation_percentage):
     into training, testing, and validation sets within each label.
   """
   if not gfile.Exists(image_dir):
-    print("Image directory '" + image_dir + "' not found.")
+    tf.logging.error("Image directory '" + image_dir + "' not found.")
     return None
   result = {}
   sub_dirs = [x[0] for x in gfile.Walk(image_dir)]
@@ -146,18 +153,20 @@ def create_image_lists(image_dir, testing_percentage, validation_percentage):
     dir_name = os.path.basename(sub_dir)
     if dir_name == image_dir:
       continue
-    print("Looking for images in '" + dir_name + "'")
+    tf.logging.info("Looking for images in '" + dir_name + "'")
     for extension in extensions:
       file_glob = os.path.join(image_dir, dir_name, '*.' + extension)
       file_list.extend(gfile.Glob(file_glob))
     if not file_list:
-      print('No files found')
+      tf.logging.warning('No files found')
       continue
     if len(file_list) < 20:
-      print('WARNING: Folder has less than 20 images, which may cause issues.')
+      tf.logging.warning(
+          'WARNING: Folder has less than 20 images, which may cause issues.')
     elif len(file_list) > MAX_NUM_IMAGES_PER_CLASS:
-      print('WARNING: Folder {} has more than {} images. Some images will '
-            'never be selected.'.format(dir_name, MAX_NUM_IMAGES_PER_CLASS))
+      tf.logging.warning(
+          'WARNING: Folder {} has more than {} images. Some images will '
+          'never be selected.'.format(dir_name, MAX_NUM_IMAGES_PER_CLASS))
     label_name = re.sub(r'[^a-z0-9]+', ' ', dir_name.lower())
     training_images = []
     testing_images = []
@@ -230,7 +239,7 @@ def get_image_path(image_lists, label_name, index, image_dir, category):
 
 
 def get_bottleneck_path(image_lists, label_name, index, bottleneck_dir,
-                        category):
+                        category, architecture):
   """"Returns a path to a bottleneck file for a label at the given index.
 
   Args:
@@ -241,35 +250,42 @@ def get_bottleneck_path(image_lists, label_name, index, bottleneck_dir,
     bottleneck_dir: Folder string holding cached files of bottleneck values.
     category: Name string of set to pull images from - training, testing, or
     validation.
+    architecture: The name of the model architecture.
 
   Returns:
     File system path string to an image that meets the requested parameters.
   """
   return get_image_path(image_lists, label_name, index, bottleneck_dir,
-                        category) + '.txt'
+                        category) + '_' + architecture + '.txt'
 
 
-def create_inception_graph():
+def create_model_graph(model_info):
   """"Creates a graph from saved GraphDef file and returns a Graph object.
 
+  Args:
+    model_info: Dictionary containing information about the model architecture.
+
   Returns:
     Graph holding the trained Inception network, and various tensors we'll be
     manipulating.
   """
   with tf.Graph().as_default() as graph:
-    model_filename = os.path.join(
-        FLAGS.model_dir, 'classify_image_graph_def.pb')
-    with gfile.FastGFile(model_filename, 'rb') as f:
+    model_path = os.path.join(FLAGS.model_dir, model_info['model_file_name'])
+    with gfile.FastGFile(model_path, 'rb') as f:
       graph_def = tf.GraphDef()
       graph_def.ParseFromString(f.read())
-      bottleneck_tensor, jpeg_data_tensor, resized_input_tensor = (
-          tf.import_graph_def(graph_def, name='', return_elements=[
-              BOTTLENECK_TENSOR_NAME, JPEG_DATA_TENSOR_NAME,
-              RESIZED_INPUT_TENSOR_NAME]))
-  return graph, bottleneck_tensor, jpeg_data_tensor, resized_input_tensor
+      bottleneck_tensor, resized_input_tensor = (tf.import_graph_def(
+          graph_def,
+          name='',
+          return_elements=[
+              model_info['bottleneck_tensor_name'],
+              model_info['resized_input_tensor_name'],
+          ]))
+  return graph, bottleneck_tensor, resized_input_tensor
 
 
 def run_bottleneck_on_image(sess, image_data, image_data_tensor,
+                            decoded_image_tensor, resized_input_tensor,
                             bottleneck_tensor):
   """Runs inference on an image to extract the 'bottleneck' summary layer.
 
@@ -277,28 +293,36 @@ def run_bottleneck_on_image(sess, image_data, image_data_tensor,
     sess: Current active TensorFlow Session.
     image_data: String of raw JPEG data.
     image_data_tensor: Input data layer in the graph.
+    decoded_image_tensor: Output of initial image resizing and  preprocessing.
+    resized_input_tensor: The input node of the recognition graph.
     bottleneck_tensor: Layer before the final softmax.
 
   Returns:
     Numpy array of bottleneck values.
   """
-  bottleneck_values = sess.run(
-      bottleneck_tensor,
-      {image_data_tensor: image_data})
+  # First decode the JPEG image, resize it, and rescale the pixel values.
+  resized_input_values = sess.run(decoded_image_tensor,
+                                  {image_data_tensor: image_data})
+  # Then run it through the recognition network.
+  bottleneck_values = sess.run(bottleneck_tensor,
+                               {resized_input_tensor: resized_input_values})
   bottleneck_values = np.squeeze(bottleneck_values)
   return bottleneck_values
 
 
-def maybe_download_and_extract():
+def maybe_download_and_extract(data_url):
   """Download and extract model tar file.
 
   If the pretrained model we're using doesn't already exist, this function
   downloads it from the TensorFlow.org website and unpacks it into a directory.
+
+  Args:
+    data_url: Web location of the tar file containing the pretrained model.
   """
   dest_directory = FLAGS.model_dir
   if not os.path.exists(dest_directory):
     os.makedirs(dest_directory)
-  filename = DATA_URL.split('/')[-1]
+  filename = data_url.split('/')[-1]
   filepath = os.path.join(dest_directory, filename)
   if not os.path.exists(filepath):
 
@@ -308,12 +332,11 @@ def maybe_download_and_extract():
                         float(count * block_size) / float(total_size) * 100.0))
       sys.stdout.flush()
 
-    filepath, _ = urllib.request.urlretrieve(DATA_URL,
-                                             filepath,
-                                             _progress)
+    filepath, _ = urllib.request.urlretrieve(data_url, filepath, _progress)
     print()
     statinfo = os.stat(filepath)
-    print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
+    tf.logging.info('Successfully downloaded', filename, statinfo.st_size,
+                    'bytes.')
   tarfile.open(filepath, 'r:gz').extractall(dest_directory)
 
 
@@ -327,43 +350,15 @@ def ensure_dir_exists(dir_name):
     os.makedirs(dir_name)
 
 
-def write_list_of_floats_to_file(list_of_floats, file_path):
-  """Writes a given list of floats to a binary file.
-
-  Args:
-    list_of_floats: List of floats we want to write to a file.
-    file_path: Path to a file where list of floats will be stored.
-
-  """
-
-  s = struct.pack('d' * BOTTLENECK_TENSOR_SIZE, *list_of_floats)
-  with open(file_path, 'wb') as f:
-    f.write(s)
-
-
-def read_list_of_floats_from_file(file_path):
-  """Reads list of floats from a given file.
-
-  Args:
-    file_path: Path to a file where list of floats was stored.
-  Returns:
-    Array of bottleneck values (list of floats).
-
-  """
-
-  with open(file_path, 'rb') as f:
-    s = struct.unpack('d' * BOTTLENECK_TENSOR_SIZE, f.read())
-    return list(s)
-
-
 bottleneck_path_2_bottleneck_values = {}
 
 
 def create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
                            image_dir, category, sess, jpeg_data_tensor,
+                           decoded_image_tensor, resized_input_tensor,
                            bottleneck_tensor):
   """Create a single bottleneck file."""
-  print('Creating bottleneck at ' + bottleneck_path)
+  tf.logging.info('Creating bottleneck at ' + bottleneck_path)
   image_path = get_image_path(image_lists, label_name, index,
                               image_dir, category)
   if not gfile.Exists(image_path):
@@ -371,10 +366,11 @@ def create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
   image_data = gfile.FastGFile(image_path, 'rb').read()
   try:
     bottleneck_values = run_bottleneck_on_image(
-        sess, image_data, jpeg_data_tensor, bottleneck_tensor)
-  except:
-    raise RuntimeError('Error during processing file %s' % image_path)
-
+        sess, image_data, jpeg_data_tensor, decoded_image_tensor,
+        resized_input_tensor, bottleneck_tensor)
+  except Exception as e:
+    raise RuntimeError('Error during processing file %s (%s)' % (image_path,
+                                                                 str(e)))
   bottleneck_string = ','.join(str(x) for x in bottleneck_values)
   with open(bottleneck_path, 'w') as bottleneck_file:
     bottleneck_file.write(bottleneck_string)
@@ -382,7 +378,8 @@ def create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
 
 def get_or_create_bottleneck(sess, image_lists, label_name, index, image_dir,
                              category, bottleneck_dir, jpeg_data_tensor,
-                             bottleneck_tensor):
+                             decoded_image_tensor, resized_input_tensor,
+                             bottleneck_tensor, architecture):
   """Retrieves or calculates bottleneck values for an image.
 
   If a cached version of the bottleneck data exists on-disk, return that,
@@ -400,7 +397,10 @@ def get_or_create_bottleneck(sess, image_lists, label_name, index, image_dir,
     or validation.
     bottleneck_dir: Folder string holding cached files of bottleneck values.
     jpeg_data_tensor: The tensor to feed loaded jpeg data into.
+    decoded_image_tensor: The output of decoding and resizing the image.
+    resized_input_tensor: The input node of the recognition graph.
     bottleneck_tensor: The output tensor for the bottleneck values.
+    architecture: The name of the model architecture.
 
   Returns:
     Numpy array of values produced by the bottleneck layer for the image.
@@ -410,10 +410,11 @@ def get_or_create_bottleneck(sess, image_lists, label_name, index, image_dir,
   sub_dir_path = os.path.join(bottleneck_dir, sub_dir)
   ensure_dir_exists(sub_dir_path)
   bottleneck_path = get_bottleneck_path(image_lists, label_name, index,
-                                        bottleneck_dir, category)
+                                        bottleneck_dir, category, architecture)
   if not os.path.exists(bottleneck_path):
     create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
                            image_dir, category, sess, jpeg_data_tensor,
+                           decoded_image_tensor, resized_input_tensor,
                            bottleneck_tensor)
   with open(bottleneck_path, 'r') as bottleneck_file:
     bottleneck_string = bottleneck_file.read()
@@ -421,11 +422,12 @@ def get_or_create_bottleneck(sess, image_lists, label_name, index, image_dir,
   try:
     bottleneck_values = [float(x) for x in bottleneck_string.split(',')]
   except ValueError:
-    print('Invalid float found, recreating bottleneck')
+    tf.logging.warning('Invalid float found, recreating bottleneck')
     did_hit_error = True
   if did_hit_error:
     create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
                            image_dir, category, sess, jpeg_data_tensor,
+                           decoded_image_tensor, resized_input_tensor,
                            bottleneck_tensor)
     with open(bottleneck_path, 'r') as bottleneck_file:
       bottleneck_string = bottleneck_file.read()
@@ -436,7 +438,8 @@ def get_or_create_bottleneck(sess, image_lists, label_name, index, image_dir,
 
 
 def cache_bottlenecks(sess, image_lists, image_dir, bottleneck_dir,
-                      jpeg_data_tensor, bottleneck_tensor):
+                      jpeg_data_tensor, decoded_image_tensor,
+                      resized_input_tensor, bottleneck_tensor, architecture):
   """Ensures all the training, testing, and validation bottlenecks are cached.
 
   Because we're likely to read the same image multiple times (if there are no
@@ -453,7 +456,10 @@ def cache_bottlenecks(sess, image_lists, image_dir, bottleneck_dir,
     images.
     bottleneck_dir: Folder string holding cached files of bottleneck values.
     jpeg_data_tensor: Input tensor for jpeg data from file.
+    decoded_image_tensor: The output of decoding and resizing the image.
+    resized_input_tensor: The input node of the recognition graph.
     bottleneck_tensor: The penultimate output layer of the graph.
+    architecture: The name of the model architecture.
 
   Returns:
     Nothing.
@@ -464,18 +470,21 @@ def cache_bottlenecks(sess, image_lists, image_dir, bottleneck_dir,
     for category in ['training', 'testing', 'validation']:
       category_list = label_lists[category]
       for index, unused_base_name in enumerate(category_list):
-        get_or_create_bottleneck(sess, image_lists, label_name, index,
-                                 image_dir, category, bottleneck_dir,
-                                 jpeg_data_tensor, bottleneck_tensor)
+        get_or_create_bottleneck(
+            sess, image_lists, label_name, index, image_dir, category,
+            bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
+            resized_input_tensor, bottleneck_tensor, architecture)
 
         how_many_bottlenecks += 1
         if how_many_bottlenecks % 100 == 0:
-          print(str(how_many_bottlenecks) + ' bottleneck files created.')
+          tf.logging.info(
+              str(how_many_bottlenecks) + ' bottleneck files created.')
 
 
 def get_random_cached_bottlenecks(sess, image_lists, how_many, category,
                                   bottleneck_dir, image_dir, jpeg_data_tensor,
-                                  bottleneck_tensor):
+                                  decoded_image_tensor, resized_input_tensor,
+                                  bottleneck_tensor, architecture):
   """Retrieves bottleneck values for cached images.
 
   If no distortions are being applied, this function can retrieve the cached
@@ -493,7 +502,10 @@ def get_random_cached_bottlenecks(sess, image_lists, how_many, category,
     image_dir: Root folder string of the subfolders containing the training
     images.
     jpeg_data_tensor: The layer to feed jpeg image data into.
+    decoded_image_tensor: The output of decoding and resizing the image.
+    resized_input_tensor: The input node of the recognition graph.
     bottleneck_tensor: The bottleneck output layer of the CNN graph.
+    architecture: The name of the model architecture.
 
   Returns:
     List of bottleneck arrays, their corresponding ground truths, and the
@@ -511,10 +523,10 @@ def get_random_cached_bottlenecks(sess, image_lists, how_many, category,
       image_index = random.randrange(MAX_NUM_IMAGES_PER_CLASS + 1)
       image_name = get_image_path(image_lists, label_name, image_index,
                                   image_dir, category)
-      bottleneck = get_or_create_bottleneck(sess, image_lists, label_name,
-                                            image_index, image_dir, category,
-                                            bottleneck_dir, jpeg_data_tensor,
-                                            bottleneck_tensor)
+      bottleneck = get_or_create_bottleneck(
+          sess, image_lists, label_name, image_index, image_dir, category,
+          bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
+          resized_input_tensor, bottleneck_tensor, architecture)
       ground_truth = np.zeros(class_count, dtype=np.float32)
       ground_truth[label_index] = 1.0
       bottlenecks.append(bottleneck)
@@ -527,10 +539,10 @@ def get_random_cached_bottlenecks(sess, image_lists, how_many, category,
           image_lists[label_name][category]):
         image_name = get_image_path(image_lists, label_name, image_index,
                                     image_dir, category)
-        bottleneck = get_or_create_bottleneck(sess, image_lists, label_name,
-                                              image_index, image_dir, category,
-                                              bottleneck_dir, jpeg_data_tensor,
-                                              bottleneck_tensor)
+        bottleneck = get_or_create_bottleneck(
+            sess, image_lists, label_name, image_index, image_dir, category,
+            bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
+            resized_input_tensor, bottleneck_tensor, architecture)
         ground_truth = np.zeros(class_count, dtype=np.float32)
         ground_truth[label_index] = 1.0
         bottlenecks.append(bottleneck)
@@ -583,12 +595,12 @@ def get_random_distorted_bottlenecks(
     # might be optimized in other implementations.
     distorted_image_data = sess.run(distorted_image,
                                     {input_jpeg_tensor: jpeg_data})
-    bottleneck = run_bottleneck_on_image(sess, distorted_image_data,
-                                         resized_input_tensor,
-                                         bottleneck_tensor)
+    bottleneck_values = sess.run(bottleneck_tensor,
+                                 {resized_input_tensor: distorted_image_data})
+    bottleneck_values = np.squeeze(bottleneck_values)
     ground_truth = np.zeros(class_count, dtype=np.float32)
     ground_truth[label_index] = 1.0
-    bottlenecks.append(bottleneck)
+    bottlenecks.append(bottleneck_values)
     ground_truths.append(ground_truth)
   return bottlenecks, ground_truths
 
@@ -612,7 +624,8 @@ def should_distort_images(flip_left_right, random_crop, random_scale,
 
 
 def add_input_distortions(flip_left_right, random_crop, random_scale,
-                          random_brightness):
+                          random_brightness, input_width, input_height,
+                          input_depth, input_mean, input_std):
   """Creates the operations to apply the specified distortions.
 
   During training it can help to improve the results if we run the images
@@ -660,13 +673,18 @@ def add_input_distortions(flip_left_right, random_crop, random_scale,
     random_scale: Integer percentage of how much to vary the scale by.
     random_brightness: Integer range to randomly multiply the pixel values by.
     graph.
+    input_width: Horizontal size of expected input image to model.
+    input_height: Vertical size of expected input image to model.
+    input_depth: How many channels the expected input image should have.
+    input_mean: Pixel value that should be zero in the image for the graph.
+    input_std: How much to divide the pixel values by before recognition.
 
   Returns:
     The jpeg input layer and the distorted result tensor.
   """
 
   jpeg_data = tf.placeholder(tf.string, name='DistortJPGInput')
-  decoded_image = tf.image.decode_jpeg(jpeg_data, channels=MODEL_INPUT_DEPTH)
+  decoded_image = tf.image.decode_jpeg(jpeg_data, channels=input_depth)
   decoded_image_as_float = tf.cast(decoded_image, dtype=tf.float32)
   decoded_image_4d = tf.expand_dims(decoded_image_as_float, 0)
   margin_scale = 1.0 + (random_crop / 100.0)
@@ -676,16 +694,15 @@ def add_input_distortions(flip_left_right, random_crop, random_scale,
                                          minval=1.0,
                                          maxval=resize_scale)
   scale_value = tf.multiply(margin_scale_value, resize_scale_value)
-  precrop_width = tf.multiply(scale_value, MODEL_INPUT_WIDTH)
-  precrop_height = tf.multiply(scale_value, MODEL_INPUT_HEIGHT)
+  precrop_width = tf.multiply(scale_value, input_width)
+  precrop_height = tf.multiply(scale_value, input_height)
   precrop_shape = tf.stack([precrop_height, precrop_width])
   precrop_shape_as_int = tf.cast(precrop_shape, dtype=tf.int32)
   precropped_image = tf.image.resize_bilinear(decoded_image_4d,
                                               precrop_shape_as_int)
   precropped_image_3d = tf.squeeze(precropped_image, squeeze_dims=[0])
   cropped_image = tf.random_crop(precropped_image_3d,
-                                 [MODEL_INPUT_HEIGHT, MODEL_INPUT_WIDTH,
-                                  MODEL_INPUT_DEPTH])
+                                 [input_height, input_width, input_depth])
   if flip_left_right:
     flipped_image = tf.image.random_flip_left_right(cropped_image)
   else:
@@ -696,7 +713,9 @@ def add_input_distortions(flip_left_right, random_crop, random_scale,
                                        minval=brightness_min,
                                        maxval=brightness_max)
   brightened_image = tf.multiply(flipped_image, brightness_value)
-  distort_result = tf.expand_dims(brightened_image, 0, name='DistortResult')
+  offset_image = tf.subtract(brightened_image, input_mean)
+  mul_image = tf.multiply(offset_image, 1.0 / input_std)
+  distort_result = tf.expand_dims(mul_image, 0, name='DistortResult')
   return jpeg_data, distort_result
 
 
@@ -713,7 +732,8 @@ def variable_summaries(var):
     tf.summary.histogram('histogram', var)
 
 
-def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor):
+def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor,
+                           bottleneck_tensor_size):
   """Adds a new softmax and fully-connected layer for training.
 
   We need to retrain the top layer to identify our new classes, so this function
@@ -721,13 +741,14 @@ def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor):
   weights, and then sets up all the gradients for the backward pass.
 
   The set up for the softmax and fully-connected layers is based on:
-  https://tensorflow.org/versions/master/tutorials/mnist/beginners/index.html
+  https://www.tensorflow.org/versions/master/tutorials/mnist/beginners/index.html
 
   Args:
     class_count: Integer of how many categories of things we're trying to
     recognize.
     final_tensor_name: Name string for the new final node that produces results.
     bottleneck_tensor: The output of the main CNN graph.
+    bottleneck_tensor_size: How many entries in the bottleneck vector.
 
   Returns:
     The tensors for the training and cross entropy results, and tensors for the
@@ -735,7 +756,8 @@ def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor):
   """
   with tf.name_scope('input'):
     bottleneck_input = tf.placeholder_with_default(
-        bottleneck_tensor, shape=[None, BOTTLENECK_TENSOR_SIZE],
+        bottleneck_tensor,
+        shape=[None, bottleneck_tensor_size],
         name='BottleneckInputPlaceholder')
 
     ground_truth_input = tf.placeholder(tf.float32,
@@ -747,8 +769,8 @@ def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor):
   layer_name = 'final_training_ops'
   with tf.name_scope(layer_name):
     with tf.name_scope('weights'):
-      initial_value = tf.truncated_normal([BOTTLENECK_TENSOR_SIZE, class_count],
-                                          stddev=0.001)
+      initial_value = tf.truncated_normal(
+          [bottleneck_tensor_size, class_count], stddev=0.001)
 
       layer_weights = tf.Variable(initial_value, name='final_weights')
 
@@ -802,7 +824,7 @@ def add_evaluation_step(result_tensor, ground_truth_tensor):
 
 def save_graph_to_file(sess, graph, graph_file_name):
   output_graph_def = graph_util.convert_variables_to_constants(
-    sess, graph.as_graph_def(), [FLAGS.final_tensor_name])
+      sess, graph.as_graph_def(), [FLAGS.final_tensor_name])
   with gfile.FastGFile(graph_file_name, 'wb') as f:
     f.write(output_graph_def.SerializeToString())
   return
@@ -818,25 +840,160 @@ def prepare_file_system():
   return
 
 
+def create_model_info(architecture):
+  """Given the name of a model architecture, returns information about it.
+
+  There are different base image recognition pretrained models that can be
+  retrained using transfer learning, and this function translates from the name
+  of a model to the attributes that are needed to download and train with it.
+
+  Args:
+    architecture: Name of a model architecture.
+
+  Returns:
+    Dictionary of information about the model, or None if the name isn't
+    recognized
+
+  Raises:
+    ValueError: If architecture name is unknown.
+  """
+  architecture = architecture.lower()
+  if architecture == 'inception_v3':
+    # pylint: disable=line-too-long
+    data_url = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'
+    # pylint: enable=line-too-long
+    bottleneck_tensor_name = 'pool_3/_reshape:0'
+    bottleneck_tensor_size = 2048
+    input_width = 299
+    input_height = 299
+    input_depth = 3
+    resized_input_tensor_name = 'Mul:0'
+    model_file_name = 'classify_image_graph_def.pb'
+    input_mean = 128
+    input_std = 128
+  elif architecture.startswith('mobilenet_'):
+    parts = architecture.split('_')
+    if len(parts) != 3 and len(parts) != 4:
+      tf.logging.error("Couldn't understand architecture name '%s'",
+                       architecture)
+      return None
+    version_string = parts[1]
+    if (version_string != '1.0' and version_string != '0.75' and
+        version_string != '0.50' and version_string != '0.25'):
+      tf.logging.error(
+          """"The Mobilenet version should be '1.0', '0.75', '0.50', or '0.25',
+  but found '%s' for architecture '%s'""",
+          version_string, architecture)
+      return None
+    size_string = parts[2]
+    if (size_string != '224' and size_string != '192' and
+        size_string != '160' and size_string != '128'):
+      tf.logging.error(
+          """The Mobilenet input size should be '224', '192', '160', or '128',
+ but found '%s' for architecture '%s'""",
+          size_string, architecture)
+      return None
+    if len(parts) == 3:
+      is_quantized = False
+    else:
+      if parts[3] != 'quantized':
+        tf.logging.error(
+            "Couldn't understand architecture suffix '%s' for '%s'", parts[3],
+            architecture)
+        return None
+      is_quantized = True
+    data_url = 'http://download.tensorflow.org/models/mobilenet_v1_'
+    data_url += version_string + '_' + size_string + '_frozen.tgz'
+    bottleneck_tensor_name = 'MobilenetV1/Predictions/Reshape:0'
+    bottleneck_tensor_size = 1001
+    input_width = int(size_string)
+    input_height = int(size_string)
+    input_depth = 3
+    resized_input_tensor_name = 'input:0'
+    if is_quantized:
+      model_base_name = 'quantized_graph.pb'
+    else:
+      model_base_name = 'frozen_graph.pb'
+    model_dir_name = 'mobilenet_v1_' + version_string + '_' + size_string
+    model_file_name = os.path.join(model_dir_name, model_base_name)
+    input_mean = 127.5
+    input_std = 127.5
+  else:
+    tf.logging.error("Couldn't understand architecture name '%s'", architecture)
+    raise ValueError('Unknown architecture', architecture)
+
+  return {
+      'data_url': data_url,
+      'bottleneck_tensor_name': bottleneck_tensor_name,
+      'bottleneck_tensor_size': bottleneck_tensor_size,
+      'input_width': input_width,
+      'input_height': input_height,
+      'input_depth': input_depth,
+      'resized_input_tensor_name': resized_input_tensor_name,
+      'model_file_name': model_file_name,
+      'input_mean': input_mean,
+      'input_std': input_std,
+  }
+
+
+def add_jpeg_decoding(input_width, input_height, input_depth, input_mean,
+                      input_std):
+  """Adds operations that perform JPEG decoding and resizing to the graph..
+
+  Args:
+    input_width: Desired width of the image fed into the recognizer graph.
+    input_height: Desired width of the image fed into the recognizer graph.
+    input_depth: Desired channels of the image fed into the recognizer graph.
+    input_mean: Pixel value that should be zero in the image for the graph.
+    input_std: How much to divide the pixel values by before recognition.
+
+  Returns:
+    Tensors for the node to feed JPEG data into, and the output of the
+      preprocessing steps.
+  """
+  jpeg_data = tf.placeholder(tf.string, name='DecodeJPGInput')
+  decoded_image = tf.image.decode_jpeg(jpeg_data, channels=input_depth)
+  decoded_image_as_float = tf.cast(decoded_image, dtype=tf.float32)
+  decoded_image_4d = tf.expand_dims(decoded_image_as_float, 0)
+  resize_shape = tf.stack([input_height, input_width])
+  resize_shape_as_int = tf.cast(resize_shape, dtype=tf.int32)
+  resized_image = tf.image.resize_bilinear(decoded_image_4d,
+                                           resize_shape_as_int)
+  offset_image = tf.subtract(resized_image, input_mean)
+  mul_image = tf.multiply(offset_image, 1.0 / input_std)
+  return jpeg_data, mul_image
+
+
 def main(_):
+  # Needed to make sure the logging output is visible.
+  # See https://github.com/tensorflow/tensorflow/issues/3047
+  tf.logging.set_verbosity(tf.logging.INFO)
+
   # Prepare necessary directories  that can be used during training
   prepare_file_system()
 
+  # Gather information about the model architecture we'll be using.
+  model_info = create_model_info(FLAGS.architecture)
+  if not model_info:
+    tf.logging.error('Did not recognize architecture flag')
+    return -1
+
   # Set up the pre-trained graph.
-  maybe_download_and_extract()
-  graph, bottleneck_tensor, jpeg_data_tensor, resized_image_tensor = (
-      create_inception_graph())
+  maybe_download_and_extract(model_info['data_url'])
+  graph, bottleneck_tensor, resized_image_tensor = (
+      create_model_graph(model_info))
 
   # Look at the folder structure, and create lists of all the images.
   image_lists = create_image_lists(FLAGS.image_dir, FLAGS.testing_percentage,
                                    FLAGS.validation_percentage)
   class_count = len(image_lists.keys())
   if class_count == 0:
-    print('No valid folders of images found at ' + FLAGS.image_dir)
+    tf.logging.error('No valid folders of images found at ' + FLAGS.image_dir)
     return -1
   if class_count == 1:
-    print('Only one valid folder of images found at ' + FLAGS.image_dir +
-          ' - multiple classes are needed for classification.')
+    tf.logging.error('Only one valid folder of images found at ' +
+                     FLAGS.image_dir +
+                     ' - multiple classes are needed for classification.')
     return -1
 
   # See if the command-line flags mean we're applying any distortions.
@@ -845,25 +1002,33 @@ def main(_):
       FLAGS.random_brightness)
 
   with tf.Session(graph=graph) as sess:
+    # Set up the image decoding sub-graph.
+    jpeg_data_tensor, decoded_image_tensor = add_jpeg_decoding(
+        model_info['input_width'], model_info['input_height'],
+        model_info['input_depth'], model_info['input_mean'],
+        model_info['input_std'])
 
     if do_distort_images:
       # We will be applying distortions, so setup the operations we'll need.
       (distorted_jpeg_data_tensor,
        distorted_image_tensor) = add_input_distortions(
-           FLAGS.flip_left_right, FLAGS.random_crop,
-           FLAGS.random_scale, FLAGS.random_brightness)
+           FLAGS.flip_left_right, FLAGS.random_crop, FLAGS.random_scale,
+           FLAGS.random_brightness, model_info['input_width'],
+           model_info['input_height'], model_info['input_depth'],
+           model_info['input_mean'], model_info['input_std'])
     else:
       # We'll make sure we've calculated the 'bottleneck' image summaries and
       # cached them on disk.
       cache_bottlenecks(sess, image_lists, FLAGS.image_dir,
                         FLAGS.bottleneck_dir, jpeg_data_tensor,
-                        bottleneck_tensor)
+                        decoded_image_tensor, resized_image_tensor,
+                        bottleneck_tensor, FLAGS.architecture)
 
     # Add the new layer that we'll be training.
     (train_step, cross_entropy, bottleneck_input, ground_truth_input,
-     final_tensor) = add_final_training_ops(len(image_lists.keys()),
-                                            FLAGS.final_tensor_name,
-                                            bottleneck_tensor)
+     final_tensor) = add_final_training_ops(
+         len(image_lists.keys()), FLAGS.final_tensor_name, bottleneck_tensor,
+         model_info['bottleneck_tensor_size'])
 
     # Create the operations we need to evaluate the accuracy of our new layer.
     evaluation_step, prediction = add_evaluation_step(
@@ -896,10 +1061,10 @@ def main(_):
          train_ground_truth, _) = get_random_cached_bottlenecks(
              sess, image_lists, FLAGS.train_batch_size, 'training',
              FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
-             bottleneck_tensor)
+             decoded_image_tensor, resized_image_tensor, bottleneck_tensor,
+             FLAGS.architecture)
       # Feed the bottlenecks and ground truth into the graph, and run a training
       # step. Capture training summaries for TensorBoard with the `merged` op.
-
       train_summary, _ = sess.run(
           [merged, train_step],
           feed_dict={bottleneck_input: train_bottlenecks,
@@ -913,15 +1078,16 @@ def main(_):
             [evaluation_step, cross_entropy],
             feed_dict={bottleneck_input: train_bottlenecks,
                        ground_truth_input: train_ground_truth})
-        print('%s: Step %d: Train accuracy = %.1f%%' % (datetime.now(), i,
-                                                        train_accuracy * 100))
-        print('%s: Step %d: Cross entropy = %f' % (datetime.now(), i,
-                                                   cross_entropy_value))
+        tf.logging.info('%s: Step %d: Train accuracy = %.1f%%' %
+                        (datetime.now(), i, train_accuracy * 100))
+        tf.logging.info('%s: Step %d: Cross entropy = %f' %
+                        (datetime.now(), i, cross_entropy_value))
         validation_bottlenecks, validation_ground_truth, _ = (
             get_random_cached_bottlenecks(
                 sess, image_lists, FLAGS.validation_batch_size, 'validation',
                 FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
-                bottleneck_tensor))
+                decoded_image_tensor, resized_image_tensor, bottleneck_tensor,
+                FLAGS.architecture))
         # Run a validation step and capture training summaries for TensorBoard
         # with the `merged` op.
         validation_summary, validation_accuracy = sess.run(
@@ -929,38 +1095,43 @@ def main(_):
             feed_dict={bottleneck_input: validation_bottlenecks,
                        ground_truth_input: validation_ground_truth})
         validation_writer.add_summary(validation_summary, i)
-        print('%s: Step %d: Validation accuracy = %.1f%% (N=%d)' %
-              (datetime.now(), i, validation_accuracy * 100,
-               len(validation_bottlenecks)))
+        tf.logging.info('%s: Step %d: Validation accuracy = %.1f%% (N=%d)' %
+                        (datetime.now(), i, validation_accuracy * 100,
+                         len(validation_bottlenecks)))
 
       # Store intermediate results
       intermediate_frequency = FLAGS.intermediate_store_frequency
 
-      if intermediate_frequency > 0 and (i % intermediate_frequency == 0) and i > 0:
-        intermediate_file_name = FLAGS.intermediate_output_graphs_dir + 'intermediate_' + str(i) + '.pb'
-        print('Save intermediate result to : ' + intermediate_file_name)
+      if (intermediate_frequency > 0 and (i % intermediate_frequency == 0)
+          and i > 0):
+        intermediate_file_name = (FLAGS.intermediate_output_graphs_dir +
+                                  'intermediate_' + str(i) + '.pb')
+        tf.logging.info('Save intermediate result to : ' +
+                        intermediate_file_name)
         save_graph_to_file(sess, graph, intermediate_file_name)
-                
+
     # We've completed all our training, so run a final test evaluation on
     # some new images we haven't used before.
     test_bottlenecks, test_ground_truth, test_filenames = (
-        get_random_cached_bottlenecks(sess, image_lists, FLAGS.test_batch_size,
-                                      'testing', FLAGS.bottleneck_dir,
-                                      FLAGS.image_dir, jpeg_data_tensor,
-                                      bottleneck_tensor))
+        get_random_cached_bottlenecks(
+            sess, image_lists, FLAGS.test_batch_size, 'testing',
+            FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
+            decoded_image_tensor, resized_image_tensor, bottleneck_tensor,
+            FLAGS.architecture))
     test_accuracy, predictions = sess.run(
         [evaluation_step, prediction],
         feed_dict={bottleneck_input: test_bottlenecks,
                    ground_truth_input: test_ground_truth})
-    print('Final test accuracy = %.1f%% (N=%d)' % (
-        test_accuracy * 100, len(test_bottlenecks)))
+    tf.logging.info('Final test accuracy = %.1f%% (N=%d)' %
+                    (test_accuracy * 100, len(test_bottlenecks)))
 
     if FLAGS.print_misclassified_test_images:
-      print('=== MISCLASSIFIED TEST IMAGES ===')
+      tf.logging.info('=== MISCLASSIFIED TEST IMAGES ===')
       for i, test_filename in enumerate(test_filenames):
         if predictions[i] != test_ground_truth[i].argmax():
-          print('%70s  %s' % (test_filename,
-                              list(image_lists.keys())[predictions[i]]))
+          tf.logging.info('%70s  %s' %
+                          (test_filename,
+                           list(image_lists.keys())[predictions[i]]))
 
     # Write out the trained graph and labels with the weights stored as
     # constants.
@@ -993,7 +1164,10 @@ if __name__ == '__main__':
       '--intermediate_store_frequency',
       type=int,
       default=0,
-      help='How many steps to store intermediate graph. If "0" then will not store.'
+      help="""\
+         How many steps to store intermediate graph. If "0" then will not
+         store.\
+      """
   )
   parser.add_argument(
       '--output_labels',
@@ -1134,5 +1308,19 @@ if __name__ == '__main__':
       input pixels up or down by.\
       """
   )
+  parser.add_argument(
+      '--architecture',
+      type=str,
+      default='inception_v3',
+      help="""\
+      Which model architecture to use. 'inception_v3' is the most accurate, but
+      also the slowest. For faster or smaller models, chose a MobileNet with the
+      form 'mobilenet_<parameter size>_<input_size>[_quantized]'. For example,
+      'mobilenet_1.0_224' will pick a model that is 17 MB in size and takes 224
+      pixel input images, while 'mobilenet_0.25_128_quantized' will choose a much
+      less accurate, but smaller and faster network that's 920 KB on disk and
+      takes 128x128 images. See https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html
+      for more information on Mobilenet.\
+      """)
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/image_retraining/retrain_test.py b/tensorflow/examples/image_retraining/retrain_test.py
index 8af5cc71149c3ecb5f3f95dadaaacb64514525dc..467c15d0de5520c4f737ea3e9628a6d027388f14 100644
--- a/tensorflow/examples/image_retraining/retrain_test.py
+++ b/tensorflow/examples/image_retraining/retrain_test.py
@@ -48,10 +48,10 @@ class ImageRetrainingTest(test_util.TensorFlowTestCase):
 
   def testGetBottleneckPath(self):
     image_lists = self.dummyImageLists()
-    self.assertEqual('bottleneck_dir/somedir/image_five.jpg.txt',
+    self.assertEqual('bottleneck_dir/somedir/image_five.jpg_imagenet_v3.txt',
                      retrain.get_bottleneck_path(
                          image_lists, 'label_one', 0, 'bottleneck_dir',
-                         'validation'))
+                         'validation', 'imagenet_v3'))
 
   def testShouldDistortImage(self):
     self.assertEqual(False, retrain.should_distort_images(False, 0, 0, 0))
@@ -63,7 +63,7 @@ class ImageRetrainingTest(test_util.TensorFlowTestCase):
   def testAddInputDistortions(self):
     with tf.Graph().as_default():
       with tf.Session() as sess:
-        retrain.add_input_distortions(True, 10, 10, 10)
+        retrain.add_input_distortions(True, 10, 10, 10, 299, 299, 3, 128, 128)
         self.assertIsNotNone(sess.graph.get_tensor_by_name('DistortJPGInput:0'))
         self.assertIsNotNone(sess.graph.get_tensor_by_name('DistortResult:0'))
 
@@ -72,9 +72,9 @@ class ImageRetrainingTest(test_util.TensorFlowTestCase):
     with tf.Graph().as_default():
       with tf.Session() as sess:
         bottleneck = tf.placeholder(
-            tf.float32, [1, retrain.BOTTLENECK_TENSOR_SIZE],
-            name=retrain.BOTTLENECK_TENSOR_NAME.split(':')[0])
-        retrain.add_final_training_ops(5, 'final', bottleneck)
+            tf.float32, [1, 1024],
+            name='bottleneck')
+        retrain.add_final_training_ops(5, 'final', bottleneck, 1024)
         self.assertIsNotNone(sess.graph.get_tensor_by_name('final:0'))
 
   def testAddEvaluationStep(self):
@@ -113,5 +113,22 @@ class ImageRetrainingTest(test_util.TensorFlowTestCase):
       result = label_image.run_graph(image, labels, jpeg, 'final:0', 3)
       self.assertEqual(result, 0)
 
+  def testAddJpegDecoding(self):
+    with tf.Graph().as_default():
+      jpeg_data, mul_image = retrain.add_jpeg_decoding(10, 10, 3, 0, 255)
+      self.assertIsNotNone(jpeg_data)
+      self.assertIsNotNone(mul_image)
+
+  def testCreateModelInfo(self):
+    did_raise_value_error = False
+    try:
+      retrain.create_model_info('no_such_model_name')
+    except ValueError:
+      did_raise_value_error = True
+    self.assertTrue(did_raise_value_error)
+    model_info = retrain.create_model_info('inception_v3')
+    self.assertIsNotNone(model_info)
+    self.assertEqual(299, model_info['input_width'])
+
 if __name__ == '__main__':
   tf.test.main()
diff --git a/tensorflow/examples/ios/README.md b/tensorflow/examples/ios/README.md
index 9832399d721d69a5b121a2cc7a67039306f0dba6..7974b8c879abbfa659b16e623e33a7ec6cb6cc71 100644
--- a/tensorflow/examples/ios/README.md
+++ b/tensorflow/examples/ios/README.md
@@ -20,9 +20,9 @@ mkdir -p ~/graphs
 curl -o ~/graphs/inception5h.zip \
  https://storage.googleapis.com/download.tensorflow.org/models/inception5h.zip \
  && unzip ~/graphs/inception5h.zip -d ~/graphs/inception5h
-cp ~/graphs/inception5h/* tensorflow/contrib/ios_examples/benchmark/data/
-cp ~/graphs/inception5h/* tensorflow/contrib/ios_examples/camera/data/
-cp ~/graphs/inception5h/* tensorflow/contrib/ios_examples/simple/data/
+cp ~/graphs/inception5h/* tensorflow/examples/ios/benchmark/data/
+cp ~/graphs/inception5h/* tensorflow/examples/ios/camera/data/
+cp ~/graphs/inception5h/* tensorflow/examples/ios/simple/data/
 ```
 
  - Change directory to one of the samples, download the TensorFlow-experimental
@@ -30,7 +30,7 @@ cp ~/graphs/inception5h/* tensorflow/contrib/ios_examples/simple/data/
    long time since it is big (~450MB). For example, if you want to run the
    simple example, then:
 ```bash
-cd tensorflow/contrib/ios_examples/simple
+cd tensorflow/ios/simple
 pod install
 open tf_simple_example.xcworkspace # obs, not the .xcodeproj directory
 ```
@@ -51,9 +51,10 @@ open tf_simple_example.xcworkspace # obs, not the .xcodeproj directory
   
  - The TensorFlow-experimental pod is current about ~450MB. The reason it is 
    so big is because we are bundling multiple platforms, and the pod includes
-   all TensorFlow functionality (e.g. operations). This is convenient during
-   development, but see below section on how you can build your own custom
-   TensorFlow library to reduce the size.
+   all TensorFlow functionality (e.g. operations). The final app size after
+   build is substantially smaller though (~25MB). Working with the complete
+   pod is convenient during development, but see below section on how you can
+   build your own custom TensorFlow library to reduce the size.
 
 ### Creating Your own App
 
@@ -65,7 +66,7 @@ target 'YourProjectName'
 ```
 
  - Then you run ```pod install``` to download and install the
- TensorFlow-experimental pod, and finaly perform
+ TensorFlow-experimental pod, and finally perform
  ```open YourProjectName.xcworkspace``` and add your code.
 
  - In your apps "Build Settings", make sure to add $(inherited) to sections
@@ -145,10 +146,10 @@ rundown:
    in your project settings.
 
  - Remove any use of the `-all_load` flag in your project. The protocol buffers
-   libraries (full and lite versions) contain duplicate symbols, and the `-all_load`
-   flag will cause these duplicates to become link errors. If you were using
-   `-all_load` to avoid issues with Objective-C categories in static libraries,
-   you may be able to replace it with the `-ObjC` flag.
+   libraries (full and lite versions) contain duplicate symbols, and the
+   `-all_load` flag will cause these duplicates to become link errors. If you
+   were using `-all_load` to avoid issues with Objective-C categories in static
+   libraries, you may be able to replace it with the `-ObjC` flag.
 
 ### Reducing the binary size
 
@@ -159,7 +160,7 @@ It can be tricky to set up the right configuration in your own app to keep the
 size minimized, so if you do run into this issue we recommend you start by
 looking at the simple example to examine its size. Here's how you do that:
 
- - Open the Xcode project in tensorflow/contrib/ios_examples/simple.
+ - Open the Xcode project in tensorflow/examples/ios/simple.
 
  - Make sure you've followed the steps above to get the data files.
 
@@ -181,7 +182,7 @@ looking at the simple example to examine its size. Here's how you do that:
  - Running this command will show the size of the executable as the
    `tf_simple_example` line.
 
-Right now you'll see a size of around 23 MB, since it's including two
+Right now you'll see a size of around 25 MB, since it's including two
 architectures (armv7 and arm64). As a first step, you should make sure the size
 increase you see in your own app is similar, and if it's larger, look at the
 "Other Linker Flags" used in the Simple Xcode project settings to strip the
diff --git a/tensorflow/examples/ios/benchmark/BenchmarkViewController.mm b/tensorflow/examples/ios/benchmark/BenchmarkViewController.mm
index cab7b36f1775b3b943e22e828cd51c9c78203396..9fc5f6ded24d94db888f4c067278bb36f62bac2f 100644
--- a/tensorflow/examples/ios/benchmark/BenchmarkViewController.mm
+++ b/tensorflow/examples/ios/benchmark/BenchmarkViewController.mm
@@ -22,17 +22,7 @@
 #include <sstream>
 #include <string>
 
-//#include "google/protobuf/io/coded_stream.h"
-//#include "google/protobuf/io/zero_copy_stream_impl.h"
-//#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
-//#include "google/protobuf/message_lite.h"
 #include "tensorflow/core/framework/op_kernel.h"
-//#include "tensorflow/core/framework/tensor.h"
-//#include "tensorflow/core/framework/types.pb.h"
-//#include "tensorflow/core/platform/env.h"
-//#include "tensorflow/core/platform/logging.h"
-//#include "tensorflow/core/platform/mutex.h"
-//#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/util/stat_summarizer.h"
 
diff --git a/tensorflow/examples/label_image/BUILD b/tensorflow/examples/label_image/BUILD
index d677e58ac323e1789f493bfc6aa9a33cf807612d..ddbcf68df8c4709476022cdad2e4f8350ca3c77a 100644
--- a/tensorflow/examples/label_image/BUILD
+++ b/tensorflow/examples/label_image/BUILD
@@ -31,6 +31,9 @@ cc_binary(
             # Jpg, gif, and png related code won't be included
             "//tensorflow/cc:cc_ops",
             "//tensorflow/core:android_tensorflow_lib",
+            # cc:android_tensorflow_image_op is for including jpeg/gif/png
+            # decoder to enable real-image evaluation on Android
+            "//tensorflow/core/kernels:android_tensorflow_image_op",
         ],
         "//conditions:default": [
             "//tensorflow/cc:cc_ops",
diff --git a/tensorflow/examples/label_image/README.md b/tensorflow/examples/label_image/README.md
index 1103caf5864878d2f0c5b1685f0a7ec8a34f5446..a9e44745e5cfa673c19d2c2fb434251b12d7aad6 100644
--- a/tensorflow/examples/label_image/README.md
+++ b/tensorflow/examples/label_image/README.md
@@ -1,4 +1,4 @@
-# TensorFlow C++ Image Recognition Demo
+# TensorFlow C++ and Python Image Recognition Demo
 
 This example shows how you can load a pre-trained TensorFlow network and use it
 to recognize objects in images in C++. For Java see the [Java
@@ -63,4 +63,25 @@ $ bazel-bin/tensorflow/examples/label_image/label_image --image=my_image.png
 ```
 
 For a more detailed look at this code, you can check out the C++ section of the
-[Inception tutorial](https://tensorflow.org/tutorials/image_recognition/).
+[Inception tutorial](https://www.tensorflow.org/tutorials/image_recognition/).
+
+## Python implementation
+
+label_image.py is a python implementation that provides code corresponding
+to the C++ code here. This gives more intuitive mapping between C++ and
+Python than the Python code mentioned in the
+[Inception tutorial](https://www.tensorflow.org/tutorials/image_recognition/).
+and could be easier to add visualization or debug code.
+
+With tensorflow python package installed, you can run it like:
+```bash
+$ python3 tensorflow/examples/label_image/label_image.py
+```
+And get result similar to this:
+```
+military uniform 0.834305
+mortarboard 0.0218694
+academic gown 0.0103581
+pickelhaube 0.00800818
+bulletproof vest 0.0053509
+```
diff --git a/tensorflow/examples/label_image/label_image.py b/tensorflow/examples/label_image/label_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..39d09813375687fc954cab3d55ce997f8684da17
--- /dev/null
+++ b/tensorflow/examples/label_image/label_image.py
@@ -0,0 +1,132 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import sys
+
+import numpy as np
+import tensorflow as tf
+
+def load_graph(model_file):
+  graph = tf.Graph()
+  graph_def = tf.GraphDef()
+
+  with open(model_file, "rb") as f:
+    graph_def.ParseFromString(f.read())
+  with graph.as_default():
+    tf.import_graph_def(graph_def)
+
+  return graph
+
+def read_tensor_from_image_file(file_name, input_height=299, input_width=299,
+				input_mean=0, input_std=255):
+  input_name = "file_reader"
+  output_name = "normalized"
+  file_reader = tf.read_file(file_name, input_name)
+  if file_name.endswith(".png"):
+    image_reader = tf.image.decode_png(file_reader, channels = 3,
+                                       name='png_reader')
+  elif file_name.endswith(".gif"):
+    image_reader = tf.squeeze(tf.image.decode_gif(file_reader,
+                                                  name='gif_reader'))
+  elif file_name.endswith(".bmp"):
+    image_reader = tf.image.decode_bmp(file_reader, name='bmp_reader')
+  else:
+    image_reader = tf.image.decode_jpeg(file_reader, channels = 3,
+                                        name='jpeg_reader')
+  float_caster = tf.cast(image_reader, tf.float32)
+  dims_expander = tf.expand_dims(float_caster, 0);
+  resized = tf.image.resize_bilinear(dims_expander, [input_height, input_width])
+  normalized = tf.divide(tf.subtract(resized, [input_mean]), [input_std])
+  sess = tf.Session()
+  result = sess.run(normalized)
+
+  return result
+
+def load_labels(label_file):
+  label = []
+  proto_as_ascii_lines = tf.gfile.GFile(label_file).readlines()
+  for l in proto_as_ascii_lines:
+    label.append(l.rstrip())
+  return label
+
+if __name__ == "__main__":
+  file_name = "tensorflow/examples/label_image/data/grace_hopper.jpg"
+  model_file = \
+    "tensorflow/examples/label_image/data/inception_v3_2016_08_28_frozen.pb"
+  label_file = "tensorflow/examples/label_image/data/imagenet_slim_labels.txt"
+  input_height = 299
+  input_width = 299
+  input_mean = 0
+  input_std = 255
+  input_layer = "input"
+  output_layer = "InceptionV3/Predictions/Reshape_1"
+
+  parser = argparse.ArgumentParser()
+  parser.add_argument("--image", help="image to be processed")
+  parser.add_argument("--graph", help="graph/model to be executed")
+  parser.add_argument("--labels", help="name of file containing labels")
+  parser.add_argument("--input_height", type=int, help="input height")
+  parser.add_argument("--input_width", type=int, help="input width")
+  parser.add_argument("--input_mean", type=int, help="input mean")
+  parser.add_argument("--input_std", type=int, help="input std")
+  parser.add_argument("--input_layer", help="name of input layer")
+  parser.add_argument("--output_layer", help="name of output layer")
+  args = parser.parse_args()
+
+  if args.graph:
+    model_file = args.graph
+  if args.image:
+    file_name = args.image
+  if args.labels:
+    label_file = args.labels
+  if args.input_height:
+    input_height = args.input_height
+  if args.input_width:
+    input_width = args.input_width
+  if args.input_mean:
+    input_mean = args.input_mean
+  if args.input_std:
+    input_std = args.input_std
+  if args.input_layer:
+    input_layer = args.input_layer
+  if args.output_layer:
+    output_layer = args.output_layer
+
+  graph = load_graph(model_file)
+  t = read_tensor_from_image_file(file_name,
+                                  input_height=input_height,
+                                  input_width=input_width,
+                                  input_mean=input_mean,
+                                  input_std=input_std)
+
+  input_name = "import/" + input_layer
+  output_name = "import/" + output_layer
+  input_operation = graph.get_operation_by_name(input_name);
+  output_operation = graph.get_operation_by_name(output_name);
+
+  with tf.Session(graph=graph) as sess:
+    results = sess.run(output_operation.outputs[0],
+                      {input_operation.outputs[0]: t})
+  results = np.squeeze(results)
+
+  top_k = results.argsort()[-5:][::-1]
+  labels = load_labels(label_file)
+  for i in top_k:
+    print(labels[i], results[i])
diff --git a/tensorflow/examples/label_image/main.cc b/tensorflow/examples/label_image/main.cc
index a98c0817e30662b3848807472a63b50fb8d333fd..63bc39de6c0a420e03adada56cbc8b0f895b6155 100644
--- a/tensorflow/examples/label_image/main.cc
+++ b/tensorflow/examples/label_image/main.cc
@@ -89,7 +89,6 @@ Status ReadLabelsFile(const string& file_name, std::vector<string>* result,
 
 static Status ReadEntireFile(tensorflow::Env* env, const string& filename,
                              Tensor* output) {
-
   tensorflow::uint64 file_size = 0;
   TF_RETURN_IF_ERROR(env->GetFileSize(filename, &file_size));
 
@@ -124,15 +123,15 @@ Status ReadTensorFromImageFile(const string& file_name, const int input_height,
 
   // read file_name into a tensor named input
   Tensor input(tensorflow::DT_STRING, tensorflow::TensorShape());
-  TF_RETURN_IF_ERROR(ReadEntireFile(tensorflow::Env::Default(), file_name,
-                                    &input));
+  TF_RETURN_IF_ERROR(
+      ReadEntireFile(tensorflow::Env::Default(), file_name, &input));
 
   // use a placeholder to read input data
-  auto file_reader = Placeholder(root.WithOpName("input"),
-                                 tensorflow::DataType::DT_STRING);
+  auto file_reader =
+      Placeholder(root.WithOpName("input"), tensorflow::DataType::DT_STRING);
 
   std::vector<std::pair<string, tensorflow::Tensor>> inputs = {
-    {"input", input},
+      {"input", input},
   };
 
   // Now try to figure out what kind of file it is and decode it.
@@ -285,8 +284,8 @@ int main(int argc, char* argv[]) {
       "tensorflow/examples/label_image/data/imagenet_slim_labels.txt";
   int32 input_width = 299;
   int32 input_height = 299;
-  int32 input_mean = 0;
-  int32 input_std = 255;
+  float input_mean = 0;
+  float input_std = 255;
   string input_layer = "input";
   string output_layer = "InceptionV3/Predictions/Reshape_1";
   bool self_test = false;
diff --git a/tensorflow/examples/learn/BUILD b/tensorflow/examples/learn/BUILD
index 1606e1a947b610030271bdb170aa81d5fdf3eca4..23a42a60ba476701b42f846095aadc8acd0e9b2f 100644
--- a/tensorflow/examples/learn/BUILD
+++ b/tensorflow/examples/learn/BUILD
@@ -13,10 +13,7 @@ py_binary(
     name = "boston",
     srcs = ["boston.py"],
     srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/learn",
-    ],
+    deps = ["//tensorflow:tensorflow_py"],
 )
 
 py_binary(
@@ -25,7 +22,6 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/learn",
         "//third_party/py/numpy",
     ],
 )
@@ -34,70 +30,38 @@ py_binary(
     name = "iris",
     srcs = ["iris.py"],
     srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/learn",
-    ],
+    deps = ["//tensorflow:tensorflow_py"],
 )
 
 py_binary(
     name = "iris_custom_decay_dnn",
     srcs = ["iris_custom_decay_dnn.py"],
     srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
+    deps = ["//tensorflow:tensorflow_py"],
 )
 
 py_binary(
     name = "iris_custom_model",
     srcs = ["iris_custom_model.py"],
     srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/contrib/learn",
-    ],
+    deps = ["//tensorflow:tensorflow_py"],
 )
 
 py_binary(
     name = "iris_run_config",
     srcs = ["iris_run_config.py"],
     srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_binary(
-    name = "iris_val_based_early_stopping",
-    srcs = ["iris_val_based_early_stopping.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/learn",
-    ],
-)
-
-py_binary(
-    name = "iris_with_pipeline",
-    srcs = ["iris_with_pipeline.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/learn",
-    ],
+    deps = ["//tensorflow:tensorflow_py"],
 )
 
 py_binary(
     name = "random_forest_mnist",
-    srcs = [
-        "random_forest_mnist.py",
-    ],
+    srcs = ["random_forest_mnist.py"],
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/examples/tutorials/mnist:input_data",
+        "//tensorflow/python:platform",
     ],
 )
 
@@ -105,11 +69,7 @@ py_binary(
     name = "resnet",
     srcs = ["resnet.py"],
     srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/contrib/learn",
-    ],
+    deps = ["//tensorflow:tensorflow_py"],
 )
 
 py_binary(
@@ -118,7 +78,7 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/learn",
+        "//tensorflow/contrib/layers:layers_py",
         "//third_party/py/numpy",
     ],
 )
@@ -129,7 +89,6 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/learn",
         "//third_party/py/numpy",
     ],
 )
@@ -140,7 +99,6 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/learn",
         "//third_party/py/numpy",
     ],
 )
@@ -151,20 +109,15 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/learn",
         "//third_party/py/numpy",
     ],
 )
 
 py_binary(
     name = "wide_n_deep_tutorial",
-    srcs = [
-        "wide_n_deep_tutorial.py",
-    ],
+    srcs = ["wide_n_deep_tutorial.py"],
     srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
+    deps = ["//tensorflow:tensorflow_py"],
 )
 
 py_binary(
@@ -173,9 +126,6 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/contrib/learn",
-        "//tensorflow/examples/tutorials/mnist:input_data",
         "//third_party/py/numpy",
     ],
 )
@@ -184,11 +134,7 @@ py_binary(
     name = "multiple_gpu",
     srcs = ["multiple_gpu.py"],
     srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/contrib/learn",
-    ],
+    deps = ["//tensorflow:tensorflow_py"],
 )
 
 sh_test(
@@ -201,8 +147,6 @@ sh_test(
         ":iris_custom_decay_dnn",
         ":iris_custom_model",
         ":iris_run_config",
-        ":iris_val_based_early_stopping",
-        ":iris_with_pipeline",
         ":random_forest_mnist",
         ":resnet",
         ":text_classification",
diff --git a/tensorflow/examples/learn/README.md b/tensorflow/examples/learn/README.md
index 37157fc29679064512fd1a3a5bcad10ee811078b..70d9db85ee5b48a75c7f6829ce6a6b22ff097535 100644
--- a/tensorflow/examples/learn/README.md
+++ b/tensorflow/examples/learn/README.md
@@ -1,33 +1,34 @@
-# TF Learn Examples
+# Estimator Examples
 
-Learn is a high-level API for TensorFlow that allows you to create,
-train, and use deep learning models easily. See the [Quickstart tutorial](https://www.tensorflow.org/get_started/tflearn)
+TensorFlow Estimators are a high-level API for TensorFlow that allows you to
+create, train, and use deep learning models easily.
+
+See the [Quickstart tutorial](https://www.tensorflow.org/get_started/estimator)
 for an introduction to the API.
 
-To run most of these examples, you need to install the `scikit learn` library (`sudo pip install sklearn`).
-Some examples use the `pandas` library for data processing (`sudo pip install pandas`).
+To run most of these examples, you need to install the `scikit learn` library
+(`pip install -U scikit-learn`). Some examples use the `pandas` library for data
+processing (`pip install -U pandas`).
 
 ## Basics
 
-* [Deep Neural Network Regression with Boston Data](boston.py)
-* [Deep Neural Network Classification with Iris Data](iris.py)
-* [Building a Custom Model](iris_custom_model.py)
-* [Building a Model Using Different GPU Configurations](iris_run_config.py)
+* [Deep Neural Network Regression with Boston Data](https://www.tensorflow.org/code/tensorflow/examples/learn/boston.py)
+* [Deep Neural Network Classification with Iris Data](https://www.tensorflow.org/code/tensorflow/examples/learn/iris.py)
+* [Building a Custom Model](https://www.tensorflow.org/code/tensorflow/examples/learn/iris_custom_model.py)
+* [Building a Model Using Different GPU Configurations](https://www.tensorflow.org/code/tensorflow/examples/learn/iris_run_config.py)
 
 ## Techniques
 
-* [Improving Performance Using Early Stopping with Iris Data](iris_val_based_early_stopping.py)
-* [Using skflow with Pipeline](iris_with_pipeline.py)
-* [Deep Neural Network with Customized Decay Function](iris_custom_decay_dnn.py)
+* [Deep Neural Network with Customized Decay Function](https://www.tensorflow.org/code/tensorflow/examples/learn/iris_custom_decay_dnn.py)
 
 ## Specialized Models
-* [Building a Random Forest Model](random_forest_mnist.py)
-* [Building a Wide & Deep Model](wide_n_deep_tutorial.py)
-* [Building a Residual Network Model](resnet.py)
+* [Building a Random Forest Model](https://www.tensorflow.org/code/tensorflow/examples/learn/random_forest_mnist.py)
+* [Building a Wide & Deep Model](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py)
+* [Building a Residual Network Model](https://www.tensorflow.org/code/tensorflow/examples/learn/resnet.py)
 
 ## Text classification
 
-* [Text Classification Using Recurrent Neural Networks on Words](text_classification.py)
-* [Text Classification Using Convolutional Neural Networks on Words](text_classification_cnn.py)
-* [Text Classification Using Recurrent Neural Networks on Characters](text_classification_character_rnn.py)
-* [Text Classification Using Convolutional Neural Networks on Characters](text_classification_character_cnn.py)
+* [Text Classification Using Recurrent Neural Networks on Words](https://www.tensorflow.org/code/tensorflow/examples/learn/text_classification.py)
+* [Text Classification Using Convolutional Neural Networks on Words](https://www.tensorflow.org/code/tensorflow/examples/learn/text_classification_cnn.py)
+* [Text Classification Using Recurrent Neural Networks on Characters](https://www.tensorflow.org/code/tensorflow/examples/learn/text_classification_character_rnn.py)
+* [Text Classification Using Convolutional Neural Networks on Characters](https://www.tensorflow.org/code/tensorflow/examples/learn/text_classification_character_cnn.py)
diff --git a/tensorflow/examples/learn/boston.py b/tensorflow/examples/learn/boston.py
index 7a7024e001ac5d4a31c4a9471082d80f60476daa..c9ce508dfdb05569f4f212137032a7dd16e86a55 100644
--- a/tensorflow/examples/learn/boston.py
+++ b/tensorflow/examples/learn/boston.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
 from sklearn import datasets
 from sklearn import metrics
 from sklearn import model_selection
@@ -39,22 +40,31 @@ def main(unused_argv):
   x_train = scaler.fit_transform(x_train)
 
   # Build 2 layer fully connected DNN with 10, 10 units respectively.
-  feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
-      x_train)
-  regressor = tf.contrib.learn.DNNRegressor(
+  feature_columns = [
+      tf.feature_column.numeric_column('x', shape=np.array(x_train).shape[1:])]
+  regressor = tf.estimator.DNNRegressor(
       feature_columns=feature_columns, hidden_units=[10, 10])
 
-  # Fit
-  regressor.fit(x_train, y_train, steps=5000, batch_size=1)
-  
-  # Transform
+  # Train.
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={'x': x_train}, y=y_train, batch_size=1, num_epochs=None, shuffle=True)
+  regressor.train(input_fn=train_input_fn, steps=2000)
+
+  # Predict.
   x_transformed = scaler.transform(x_test)
-  
-  # Predict and score
-  y_predicted = list(regressor.predict(x_transformed, as_iterable=True))
-  score = metrics.mean_squared_error(y_predicted, y_test)
+  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={'x': x_transformed}, y=y_test, num_epochs=1, shuffle=False)
+  predictions = regressor.predict(input_fn=test_input_fn)
+  y_predicted = np.array(list(p['predictions'] for p in predictions))
+  y_predicted = y_predicted.reshape(np.array(y_test).shape)
+
+  # Score with sklearn.
+  score_sklearn = metrics.mean_squared_error(y_predicted, y_test)
+  print('MSE (sklearn): {0:f}'.format(score_sklearn))
 
-  print('MSE: {0:f}'.format(score))
+  # Score with tensorflow.
+  scores = regressor.evaluate(input_fn=test_input_fn)
+  print('MSE (tensorflow): {0:f}'.format(scores['average_loss']))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/learn/examples_test.sh b/tensorflow/examples/learn/examples_test.sh
index 6a7bfa49b9d6c0f101b62a64b5cb5ba2bce23e0f..b8763de471c90a3f1d4067606222f7a7ecd2959d 100755
--- a/tensorflow/examples/learn/examples_test.sh
+++ b/tensorflow/examples/learn/examples_test.sh
@@ -49,8 +49,6 @@ test iris
 test iris_custom_decay_dnn
 test iris_custom_model
 test iris_run_config
-test iris_val_based_early_stopping
-test iris_with_pipeline
 test random_forest_mnist
 test resnet
 test text_classification --test_with_fake_data
diff --git a/tensorflow/examples/learn/hdf5_classification.py b/tensorflow/examples/learn/hdf5_classification.py
index db37500246be39c9cb6978aa42888be3518fadb5..3a46bbcf41c68187f493ac18bb7d4725ad91dbfc 100644
--- a/tensorflow/examples/learn/hdf5_classification.py
+++ b/tensorflow/examples/learn/hdf5_classification.py
@@ -11,25 +11,27 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-"""Example of DNNClassifier for Iris plant dataset, h5 format."""
+"""Example of DNNClassifier for Iris plant dataset, hdf5 format."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from sklearn import cross_validation
+from sklearn import datasets
 from sklearn import metrics
+from sklearn import model_selection
 import tensorflow as tf
 import h5py  # pylint: disable=g-bad-import-order
 
-learn = tf.contrib.learn
+
+X_FEATURE = 'x'  # Name of the input feature.
 
 
 def main(unused_argv):
   # Load dataset.
-  iris = learn.datasets.load_dataset('iris')
-  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
+  iris = datasets.load_iris()
+  x_train, x_test, y_train, y_test = model_selection.train_test_split(
       iris.data, iris.target, test_size=0.2, random_state=42)
 
   # Note that we are saving and load iris data as h5 format as a simple
@@ -48,14 +50,31 @@ def main(unused_argv):
   y_test = np.array(h5f['y_test'])
 
   # Build 3 layer DNN with 10, 20, 10 units respectively.
-  feature_columns = learn.infer_real_valued_columns_from_input(x_train)
-  classifier = learn.DNNClassifier(
+  feature_columns = [
+      tf.feature_column.numeric_column(
+          X_FEATURE, shape=np.array(x_train).shape[1:])]
+  classifier = tf.estimator.DNNClassifier(
       feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3)
 
-  # Fit and predict.
-  classifier.fit(x_train, y_train, steps=200)
-  score = metrics.accuracy_score(y_test, classifier.predict(x_test))
-  print('Accuracy: {0:f}'.format(score))
+  # Train.
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: x_train}, y=y_train, num_epochs=None, shuffle=True)
+  classifier.train(input_fn=train_input_fn, steps=200)
+
+  # Predict.
+  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
+  predictions = classifier.predict(input_fn=test_input_fn)
+  y_predicted = np.array(list(p['class_ids'] for p in predictions))
+  y_predicted = y_predicted.reshape(np.array(y_test).shape)
+
+  # Score with sklearn.
+  score = metrics.accuracy_score(y_test, y_predicted)
+  print('Accuracy (sklearn): {0:f}'.format(score))
+
+  # Score with tensorflow.
+  scores = classifier.evaluate(input_fn=test_input_fn)
+  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/learn/iris.py b/tensorflow/examples/learn/iris.py
index ec2aa9b5731dce94c7070d98569d35a7fe689324..33e8d45801409fa112e27f40b1732c43cda72bc2 100644
--- a/tensorflow/examples/learn/iris.py
+++ b/tensorflow/examples/learn/iris.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
 from sklearn import datasets
 from sklearn import metrics
 from sklearn import model_selection
@@ -24,6 +25,9 @@ from sklearn import model_selection
 import tensorflow as tf
 
 
+X_FEATURE = 'x'  # Name of the input feature.
+
+
 def main(unused_argv):
   # Load dataset.
   iris = datasets.load_iris()
@@ -31,16 +35,31 @@ def main(unused_argv):
       iris.data, iris.target, test_size=0.2, random_state=42)
 
   # Build 3 layer DNN with 10, 20, 10 units respectively.
-  feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
-      x_train)
-  classifier = tf.contrib.learn.DNNClassifier(
+  feature_columns = [
+      tf.feature_column.numeric_column(
+          X_FEATURE, shape=np.array(x_train).shape[1:])]
+  classifier = tf.estimator.DNNClassifier(
       feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3)
 
-  # Fit and predict.
-  classifier.fit(x_train, y_train, steps=200)
-  predictions = list(classifier.predict(x_test, as_iterable=True))
-  score = metrics.accuracy_score(y_test, predictions)
-  print('Accuracy: {0:f}'.format(score))
+  # Train.
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: x_train}, y=y_train, num_epochs=None, shuffle=True)
+  classifier.train(input_fn=train_input_fn, steps=200)
+
+  # Predict.
+  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
+  predictions = classifier.predict(input_fn=test_input_fn)
+  y_predicted = np.array(list(p['class_ids'] for p in predictions))
+  y_predicted = y_predicted.reshape(np.array(y_test).shape)
+
+  # Score with sklearn.
+  score = metrics.accuracy_score(y_test, y_predicted)
+  print('Accuracy (sklearn): {0:f}'.format(score))
+
+  # Score with tensorflow.
+  scores = classifier.evaluate(input_fn=test_input_fn)
+  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/learn/iris_custom_decay_dnn.py b/tensorflow/examples/learn/iris_custom_decay_dnn.py
index 31acbd30cd33a5211350d1adea1f7cdbbf80e874..072357e51c418ae1163debe29516c31ccc367386 100644
--- a/tensorflow/examples/learn/iris_custom_decay_dnn.py
+++ b/tensorflow/examples/learn/iris_custom_decay_dnn.py
@@ -17,36 +17,87 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
 from sklearn import datasets
 from sklearn import metrics
-from sklearn.cross_validation import train_test_split
+from sklearn import model_selection
 import tensorflow as tf
 
 
-def optimizer_exp_decay():
-  global_step = tf.contrib.framework.get_or_create_global_step()
-  learning_rate = tf.train.exponential_decay(
-      learning_rate=0.1, global_step=global_step,
-      decay_steps=100, decay_rate=0.001)
-  return tf.train.AdagradOptimizer(learning_rate=learning_rate)
+X_FEATURE = 'x'  # Name of the input feature.
+
+
+def my_model(features, labels, mode):
+  """DNN with three hidden layers."""
+  # Create three fully connected layers respectively of size 10, 20, and 10.
+  net = features[X_FEATURE]
+  for units in [10, 20, 10]:
+    net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
+
+  # Compute logits (1 per class).
+  logits = tf.layers.dense(net, 3, activation=None)
+
+  # Compute predictions.
+  predicted_classes = tf.argmax(logits, 1)
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    predictions = {
+        'class': predicted_classes,
+        'prob': tf.nn.softmax(logits)
+    }
+    return tf.estimator.EstimatorSpec(mode, predictions=predictions)
+
+  # Convert the labels to a one-hot tensor of shape (length of features, 3) and
+  # with a on-value of 1 for each one-hot vector of length 3.
+  onehot_labels = tf.one_hot(labels, 3, 1, 0)
+  # Compute loss.
+  loss = tf.losses.softmax_cross_entropy(
+      onehot_labels=onehot_labels, logits=logits)
+
+  # Create training op with exponentially decaying learning rate.
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    global_step = tf.train.get_global_step()
+    learning_rate = tf.train.exponential_decay(
+        learning_rate=0.1, global_step=global_step,
+        decay_steps=100, decay_rate=0.001)
+    optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate)
+    train_op = optimizer.minimize(loss, global_step=global_step)
+    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
+
+  # Compute evaluation metrics.
+  eval_metric_ops = {
+      'accuracy': tf.metrics.accuracy(
+          labels=labels, predictions=predicted_classes)
+  }
+  return tf.estimator.EstimatorSpec(
+      mode, loss=loss, eval_metric_ops=eval_metric_ops)
 
 
 def main(unused_argv):
   iris = datasets.load_iris()
-  x_train, x_test, y_train, y_test = train_test_split(
+  x_train, x_test, y_train, y_test = model_selection.train_test_split(
       iris.data, iris.target, test_size=0.2, random_state=42)
 
-  feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
-      x_train)
-  classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
-                                              hidden_units=[10, 20, 10],
-                                              n_classes=3,
-                                              optimizer=optimizer_exp_decay)
-
-  classifier.fit(x_train, y_train, steps=800)
-  predictions = list(classifier.predict(x_test, as_iterable=True))
-  score = metrics.accuracy_score(y_test, predictions)
-  print('Accuracy: {0:f}'.format(score))
+  classifier = tf.estimator.Estimator(model_fn=my_model)
+
+  # Train.
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: x_train}, y=y_train, num_epochs=None, shuffle=True)
+  classifier.train(input_fn=train_input_fn, steps=1000)
+
+  # Predict.
+  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
+  predictions = classifier.predict(input_fn=test_input_fn)
+  y_predicted = np.array(list(p['class'] for p in predictions))
+  y_predicted = y_predicted.reshape(np.array(y_test).shape)
+
+  # Score with sklearn.
+  score = metrics.accuracy_score(y_test, y_predicted)
+  print('Accuracy (sklearn): {0:f}'.format(score))
+
+  # Score with tensorflow.
+  scores = classifier.evaluate(input_fn=test_input_fn)
+  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/learn/iris_custom_model.py b/tensorflow/examples/learn/iris_custom_model.py
index fbc50716c932d56ad6208f575c7eec3c9d7c6167..471a99ba76dd8012ba3b1a519d5d07fb378f89e7 100644
--- a/tensorflow/examples/learn/iris_custom_model.py
+++ b/tensorflow/examples/learn/iris_custom_model.py
@@ -16,62 +16,85 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-from sklearn import cross_validation
+import numpy as np
 from sklearn import datasets
 from sklearn import metrics
+from sklearn import model_selection
 import tensorflow as tf
 
-layers = tf.contrib.layers
-learn = tf.contrib.learn
 
+X_FEATURE = 'x'  # Name of the input feature.
 
-def my_model(features, target):
-  """DNN with three hidden layers, and dropout of 0.1 probability."""
-  # Convert the target to a one-hot tensor of shape (length of features, 3) and
-  # with a on-value of 1 for each one-hot vector of length 3.
-  target = tf.one_hot(target, 3, 1, 0)
 
+def my_model(features, labels, mode):
+  """DNN with three hidden layers, and dropout of 0.1 probability."""
   # Create three fully connected layers respectively of size 10, 20, and 10 with
   # each layer having a dropout probability of 0.1.
-  normalizer_fn = layers.dropout
-  normalizer_params = {'keep_prob': 0.9}
-  features = layers.stack(
-      features,
-      layers.fully_connected, [10, 20, 10],
-      normalizer_fn=normalizer_fn,
-      normalizer_params=normalizer_params)
-
-  # Compute logits (1 per class) and compute loss.
-  logits = layers.fully_connected(features, 3, activation_fn=None)
-  loss = tf.losses.softmax_cross_entropy(target, logits)
-
-  # Create a tensor for training op.
-  train_op = tf.contrib.layers.optimize_loss(
-      loss,
-      tf.contrib.framework.get_global_step(),
-      optimizer='Adagrad',
-      learning_rate=0.1)
-
-  return ({
-      'class': tf.argmax(logits, 1),
-      'prob': tf.nn.softmax(logits)
-  }, loss, train_op)
+  net = features[X_FEATURE]
+  for units in [10, 20, 10]:
+    net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
+    net = tf.layers.dropout(net, rate=0.1)
+
+  # Compute logits (1 per class).
+  logits = tf.layers.dense(net, 3, activation=None)
+
+  # Compute predictions.
+  predicted_classes = tf.argmax(logits, 1)
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    predictions = {
+        'class': predicted_classes,
+        'prob': tf.nn.softmax(logits)
+    }
+    return tf.estimator.EstimatorSpec(mode, predictions=predictions)
+
+  # Convert the labels to a one-hot tensor of shape (length of features, 3) and
+  # with a on-value of 1 for each one-hot vector of length 3.
+  onehot_labels = tf.one_hot(labels, 3, 1, 0)
+  # Compute loss.
+  loss = tf.losses.softmax_cross_entropy(
+      onehot_labels=onehot_labels, logits=logits)
+
+  # Create training op.
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
+    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
+    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
+
+  # Compute evaluation metrics.
+  eval_metric_ops = {
+      'accuracy': tf.metrics.accuracy(
+          labels=labels, predictions=predicted_classes)
+  }
+  return tf.estimator.EstimatorSpec(
+      mode, loss=loss, eval_metric_ops=eval_metric_ops)
 
 
 def main(unused_argv):
   iris = datasets.load_iris()
-  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
+  x_train, x_test, y_train, y_test = model_selection.train_test_split(
       iris.data, iris.target, test_size=0.2, random_state=42)
 
-  classifier = learn.Estimator(model_fn=my_model)
-  classifier.fit(x_train, y_train, steps=1000)
+  classifier = tf.estimator.Estimator(model_fn=my_model)
 
-  y_predicted = [
-      p['class'] for p in classifier.predict(
-          x_test, as_iterable=True)
-  ]
+  # Train.
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: x_train}, y=y_train, num_epochs=None, shuffle=True)
+  classifier.train(input_fn=train_input_fn, steps=1000)
+
+  # Predict.
+  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
+  predictions = classifier.predict(input_fn=test_input_fn)
+  y_predicted = np.array(list(p['class'] for p in predictions))
+  y_predicted = y_predicted.reshape(np.array(y_test).shape)
+
+  # Score with sklearn.
   score = metrics.accuracy_score(y_test, y_predicted)
-  print('Accuracy: {0:f}'.format(score))
+  print('Accuracy (sklearn): {0:f}'.format(score))
+
+  # Score with tensorflow.
+  scores = classifier.evaluate(input_fn=test_input_fn)
+  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/learn/iris_run_config.py b/tensorflow/examples/learn/iris_run_config.py
index b7b8b5cd01fd0d6be419740e44ec9aadec7e2e2f..286c824e30f7f85be9751a852d79c60150100d9a 100644
--- a/tensorflow/examples/learn/iris_run_config.py
+++ b/tensorflow/examples/learn/iris_run_config.py
@@ -18,37 +18,53 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from sklearn import cross_validation
+import numpy as np
 from sklearn import datasets
 from sklearn import metrics
+from sklearn import model_selection
 import tensorflow as tf
 
 
+X_FEATURE = 'x'  # Name of the input feature.
+
+
 def main(unused_argv):
   # Load dataset.
   iris = datasets.load_iris()
-  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
+  x_train, x_test, y_train, y_test = model_selection.train_test_split(
       iris.data, iris.target, test_size=0.2, random_state=42)
 
   # You can define you configurations by providing a RunConfig object to
-  # estimator to control session configurations, e.g. num_cores
-  # and gpu_memory_fraction
-  run_config = tf.contrib.learn.estimators.RunConfig(
-      num_cores=3, gpu_memory_fraction=0.6)
+  # estimator to control session configurations, e.g. tf_random_seed.
+  run_config = tf.estimator.RunConfig().replace(tf_random_seed=1)
 
   # Build 3 layer DNN with 10, 20, 10 units respectively.
-  feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
-      x_train)
-  classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
-                                              hidden_units=[10, 20, 10],
-                                              n_classes=3,
-                                              config=run_config)
-
-  # Fit and predict.
-  classifier.fit(x_train, y_train, steps=200)
-  predictions = list(classifier.predict(x_test, as_iterable=True))
-  score = metrics.accuracy_score(y_test, predictions)
-  print('Accuracy: {0:f}'.format(score))
+  feature_columns = [
+      tf.feature_column.numeric_column(
+          X_FEATURE, shape=np.array(x_train).shape[1:])]
+  classifier = tf.estimator.DNNClassifier(
+      feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3,
+      config=run_config)
+
+  # Train.
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: x_train}, y=y_train, num_epochs=None, shuffle=True)
+  classifier.train(input_fn=train_input_fn, steps=200)
+
+  # Predict.
+  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
+  predictions = classifier.predict(input_fn=test_input_fn)
+  y_predicted = np.array(list(p['class_ids'] for p in predictions))
+  y_predicted = y_predicted.reshape(np.array(y_test).shape)
+
+  # Score with sklearn.
+  score = metrics.accuracy_score(y_test, y_predicted)
+  print('Accuracy (sklearn): {0:f}'.format(score))
+
+  # Score with tensorflow.
+  scores = classifier.evaluate(input_fn=test_input_fn)
+  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/learn/iris_val_based_early_stopping.py b/tensorflow/examples/learn/iris_val_based_early_stopping.py
deleted file mode 100644
index 991d1831d7e2ded295c6298e48997508768784b1..0000000000000000000000000000000000000000
--- a/tensorflow/examples/learn/iris_val_based_early_stopping.py
+++ /dev/null
@@ -1,83 +0,0 @@
-#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""Example of DNNClassifier for Iris plant dataset, with early stopping."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import shutil
-
-from sklearn import datasets
-from sklearn import metrics
-from sklearn.cross_validation import train_test_split
-import tensorflow as tf
-
-learn = tf.contrib.learn
-
-
-def clean_folder(folder):
-  """Cleans the given folder if it exists."""
-  try:
-    shutil.rmtree(folder)
-  except OSError:
-    pass
-
-
-def main(unused_argv):
-  iris = datasets.load_iris()
-  x_train, x_test, y_train, y_test = train_test_split(
-      iris.data, iris.target, test_size=0.2, random_state=42)
-
-  x_train, x_val, y_train, y_val = train_test_split(
-      x_train, y_train, test_size=0.2, random_state=42)
-  val_monitor = learn.monitors.ValidationMonitor(
-      x_val, y_val, early_stopping_rounds=200)
-
-  model_dir = '/tmp/iris_model'
-  clean_folder(model_dir)
-
-  # classifier with early stopping on training data
-  classifier1 = learn.DNNClassifier(
-      feature_columns=learn.infer_real_valued_columns_from_input(x_train),
-      hidden_units=[10, 20, 10],
-      n_classes=3,
-      model_dir=model_dir)
-  classifier1.fit(x=x_train, y=y_train, steps=2000)
-  predictions1 = list(classifier1.predict(x_test, as_iterable=True))
-  score1 = metrics.accuracy_score(y_test, predictions1)
-
-  model_dir = '/tmp/iris_model_val'
-  clean_folder(model_dir)
-
-  # classifier with early stopping on validation data, save frequently for
-  # monitor to pick up new checkpoints.
-  classifier2 = learn.DNNClassifier(
-      feature_columns=learn.infer_real_valued_columns_from_input(x_train),
-      hidden_units=[10, 20, 10],
-      n_classes=3,
-      model_dir=model_dir,
-      config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1))
-  classifier2.fit(x=x_train, y=y_train, steps=2000, monitors=[val_monitor])
-  predictions2 = list(classifier2.predict(x_test, as_iterable=True))
-  score2 = metrics.accuracy_score(y_test, predictions2)
-
-  # In many applications, the score is improved by using early stopping
-  print('score1: ', score1)
-  print('score2: ', score2)
-  print('score2 > score1: ', score2 > score1)
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tensorflow/examples/learn/iris_with_pipeline.py b/tensorflow/examples/learn/iris_with_pipeline.py
deleted file mode 100644
index 7ba958d85b2e06797d7164735578d667242a0244..0000000000000000000000000000000000000000
--- a/tensorflow/examples/learn/iris_with_pipeline.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""Example of DNNClassifier for Iris plant dataset, with pipeline."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from sklearn import cross_validation
-from sklearn.datasets import load_iris
-from sklearn.metrics import accuracy_score
-from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import StandardScaler
-import tensorflow as tf
-
-learn = tf.contrib.learn
-
-
-def main(unused_argv):
-  iris = load_iris()
-  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
-      iris.data, iris.target, test_size=0.2, random_state=42)
-
-  # It's useful to scale to ensure Stochastic Gradient Descent
-  # will do the right thing.
-  scaler = StandardScaler()
-
-  # DNN classifier.
-  classifier = learn.DNNClassifier(
-      feature_columns=learn.infer_real_valued_columns_from_input(x_train),
-      hidden_units=[10, 20, 10],
-      n_classes=3)
-
-  pipeline = Pipeline([('scaler', scaler), ('DNNclassifier', classifier)])
-
-  pipeline.fit(x_train, y_train, DNNclassifier__steps=200)
-
-  score = accuracy_score(y_test, list(pipeline.predict(x_test)))
-  print('Accuracy: {0:f}'.format(score))
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tensorflow/examples/learn/mnist.py b/tensorflow/examples/learn/mnist.py
index 15cf4b91ddbb6d92219798906f2e13a5d07d96d8..5344526b52b970721fccdc450e902d42573608dc 100644
--- a/tensorflow/examples/learn/mnist.py
+++ b/tensorflow/examples/learn/mnist.py
@@ -22,89 +22,110 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from sklearn import metrics
 import tensorflow as tf
 
-layers = tf.contrib.layers
-learn = tf.contrib.learn
 
+N_DIGITS = 10  # Number of digits.
+X_FEATURE = 'x'  # Name of the input feature.
 
-def max_pool_2x2(tensor_in):
-  return tf.nn.max_pool(
-      tensor_in, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
 
-
-def conv_model(feature, target, mode):
+def conv_model(features, labels, mode):
   """2-layer convolution model."""
-  # Convert the target to a one-hot tensor of shape (batch_size, 10) and
-  # with a on-value of 1 for each one-hot vector of length 10.
-  target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)
-
   # Reshape feature to 4d tensor with 2nd and 3rd dimensions being
   # image width and height final dimension being the number of color channels.
-  feature = tf.reshape(feature, [-1, 28, 28, 1])
+  feature = tf.reshape(features[X_FEATURE], [-1, 28, 28, 1])
 
   # First conv layer will compute 32 features for each 5x5 patch
   with tf.variable_scope('conv_layer1'):
-    h_conv1 = layers.convolution2d(
-        feature, 32, kernel_size=[5, 5], activation_fn=tf.nn.relu)
-    h_pool1 = max_pool_2x2(h_conv1)
+    h_conv1 = tf.layers.conv2d(
+        feature,
+        filters=32,
+        kernel_size=[5, 5],
+        padding='same',
+        activation=tf.nn.relu)
+    h_pool1 = tf.layers.max_pooling2d(
+        h_conv1, pool_size=2, strides=2, padding='same')
 
   # Second conv layer will compute 64 features for each 5x5 patch.
   with tf.variable_scope('conv_layer2'):
-    h_conv2 = layers.convolution2d(
-        h_pool1, 64, kernel_size=[5, 5], activation_fn=tf.nn.relu)
-    h_pool2 = max_pool_2x2(h_conv2)
+    h_conv2 = tf.layers.conv2d(
+        h_pool1,
+        filters=64,
+        kernel_size=[5, 5],
+        padding='same',
+        activation=tf.nn.relu)
+    h_pool2 = tf.layers.max_pooling2d(
+        h_conv2, pool_size=2, strides=2, padding='same')
     # reshape tensor into a batch of vectors
     h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
 
   # Densely connected layer with 1024 neurons.
-  h_fc1 = layers.dropout(
-      layers.fully_connected(
-          h_pool2_flat, 1024, activation_fn=tf.nn.relu),
-      keep_prob=0.5,
-      is_training=mode == tf.contrib.learn.ModeKeys.TRAIN)
+  h_fc1 = tf.layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu)
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    h_fc1 = tf.layers.dropout(h_fc1, rate=0.5)
 
   # Compute logits (1 per class) and compute loss.
-  logits = layers.fully_connected(h_fc1, 10, activation_fn=None)
-  loss = tf.losses.softmax_cross_entropy(target, logits)
-
-  # Create a tensor for training op.
-  train_op = layers.optimize_loss(
-      loss,
-      tf.contrib.framework.get_global_step(),
-      optimizer='SGD',
-      learning_rate=0.001)
-
-  return tf.argmax(logits, 1), loss, train_op
+  logits = tf.layers.dense(h_fc1, N_DIGITS, activation=None)
+
+  # Compute predictions.
+  predicted_classes = tf.argmax(logits, 1)
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    predictions = {
+        'class': predicted_classes,
+        'prob': tf.nn.softmax(logits)
+    }
+    return tf.estimator.EstimatorSpec(mode, predictions=predictions)
+
+  # Compute loss.
+  onehot_labels = tf.one_hot(tf.cast(labels, tf.int32), N_DIGITS, 1, 0)
+  loss = tf.losses.softmax_cross_entropy(
+      onehot_labels=onehot_labels, logits=logits)
+
+  # Create training op.
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
+    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
+    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
+
+  # Compute evaluation metrics.
+  eval_metric_ops = {
+      'accuracy': tf.metrics.accuracy(
+          labels=labels, predictions=predicted_classes)
+  }
+  return tf.estimator.EstimatorSpec(
+      mode, loss=loss, eval_metric_ops=eval_metric_ops)
 
 
 def main(unused_args):
   ### Download and load MNIST dataset.
-  mnist = learn.datasets.load_dataset('mnist')
+  mnist = tf.contrib.learn.datasets.DATASETS['mnist']('/tmp/mnist')
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: mnist.train.images},
+      y=mnist.train.labels.astype(np.int32),
+      batch_size=100,
+      num_epochs=None,
+      shuffle=True)
+  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: mnist.train.images},
+      y=mnist.train.labels.astype(np.int32),
+      num_epochs=1,
+      shuffle=False)
 
   ### Linear classifier.
-  feature_columns = learn.infer_real_valued_columns_from_input(
-      mnist.train.images)
-  classifier = learn.LinearClassifier(
-      feature_columns=feature_columns, n_classes=10)
-  classifier.fit(mnist.train.images,
-                 mnist.train.labels.astype(np.int32),
-                 batch_size=100,
-                 steps=1000)
-  score = metrics.accuracy_score(mnist.test.labels,
-                                 list(classifier.predict(mnist.test.images)))
-  print('Accuracy: {0:f}'.format(score))
+  feature_columns = [
+      tf.feature_column.numeric_column(
+          X_FEATURE, shape=mnist.train.images.shape[1:])]
+  classifier = tf.estimator.LinearClassifier(
+      feature_columns=feature_columns, n_classes=N_DIGITS)
+  classifier.train(input_fn=train_input_fn, steps=200)
+  scores = classifier.evaluate(input_fn=test_input_fn)
+  print('Accuracy (LinearClassifier): {0:f}'.format(scores['accuracy']))
 
   ### Convolutional network
-  classifier = learn.Estimator(model_fn=conv_model)
-  classifier.fit(mnist.train.images,
-                 mnist.train.labels,
-                 batch_size=100,
-                 steps=20000)
-  score = metrics.accuracy_score(mnist.test.labels,
-                                 list(classifier.predict(mnist.test.images)))
-  print('Accuracy: {0:f}'.format(score))
+  classifier = tf.estimator.Estimator(model_fn=conv_model)
+  classifier.train(input_fn=train_input_fn, steps=200)
+  scores = classifier.evaluate(input_fn=test_input_fn)
+  print('Accuracy (conv_model): {0:f}'.format(scores['accuracy']))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/learn/multiple_gpu.py b/tensorflow/examples/learn/multiple_gpu.py
index df58906b393f27f98c9922c19ddb05fc437ddcd9..c7364d1f7207355f1249d353ff1db3a218f94420 100644
--- a/tensorflow/examples/learn/multiple_gpu.py
+++ b/tensorflow/examples/learn/multiple_gpu.py
@@ -20,75 +20,100 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from sklearn import cross_validation
+import numpy as np
 from sklearn import datasets
 from sklearn import metrics
+from sklearn import model_selection
 import tensorflow as tf
 
-layers = tf.contrib.layers
-learn = tf.contrib.learn
 
+X_FEATURE = 'x'  # Name of the input feature.
 
-def my_model(features, target):
+
+def my_model(features, labels, mode):
   """DNN with three hidden layers, and dropout of 0.1 probability.
 
   Note: If you want to run this example with multiple GPUs, Cuda Toolkit 7.0 and
   CUDNN 6.5 V2 from NVIDIA need to be installed beforehand.
 
   Args:
-    features: `Tensor` of input features.
-    target: `Tensor` of targets.
+    features: Dict of input `Tensor`.
+    labels: Label `Tensor`.
+    mode: One of `ModeKeys`.
 
   Returns:
-    Tuple of predictions, loss and training op.
+    `EstimatorSpec`.
   """
-  # Convert the target to a one-hot tensor of shape (length of features, 3) and
-  # with a on-value of 1 for each one-hot vector of length 3.
-  target = tf.one_hot(target, 3, 1, 0)
-
   # Create three fully connected layers respectively of size 10, 20, and 10 with
   # each layer having a dropout probability of 0.1.
-  normalizer_fn = layers.dropout
-  normalizer_params = {'keep_prob': 0.5}
+  net = features[X_FEATURE]
   with tf.device('/gpu:1'):
-    features = layers.stack(
-        features,
-        layers.fully_connected, [10, 20, 10],
-        normalizer_fn=normalizer_fn,
-        normalizer_params=normalizer_params)
+    for units in [10, 20, 10]:
+      net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
+      net = tf.layers.dropout(net, rate=0.1)
 
   with tf.device('/gpu:2'):
-    # Compute logits (1 per class) and compute loss.
-    logits = layers.fully_connected(features, 3, activation_fn=None)
-    loss = tf.losses.softmax_cross_entropy(target, logits)
-
-    # Create a tensor for training op.
-    train_op = tf.contrib.layers.optimize_loss(
-        loss,
-        tf.contrib.framework.get_global_step(),
-        optimizer='Adagrad',
-        learning_rate=0.1)
-
-  return ({
-      'class': tf.argmax(logits, 1),
-      'prob': tf.nn.softmax(logits)
-  }, loss, train_op)
+    # Compute logits (1 per class).
+    logits = tf.layers.dense(net, 3, activation=None)
+
+    # Compute predictions.
+    predicted_classes = tf.argmax(logits, 1)
+    if mode == tf.estimator.ModeKeys.PREDICT:
+      predictions = {
+          'class': predicted_classes,
+          'prob': tf.nn.softmax(logits)
+      }
+      return tf.estimator.EstimatorSpec(mode, predictions=predictions)
+
+    # Convert the labels to a one-hot tensor of shape (length of features, 3)
+    # and with a on-value of 1 for each one-hot vector of length 3.
+    onehot_labels = tf.one_hot(labels, 3, 1, 0)
+    # Compute loss.
+    loss = tf.losses.softmax_cross_entropy(
+        onehot_labels=onehot_labels, logits=logits)
+
+    # Create training op.
+    if mode == tf.estimator.ModeKeys.TRAIN:
+      optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
+      train_op = optimizer.minimize(
+          loss, global_step=tf.train.get_global_step())
+      return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
+
+    # Compute evaluation metrics.
+    eval_metric_ops = {
+        'accuracy': tf.metrics.accuracy(
+            labels=labels, predictions=predicted_classes)
+    }
+    return tf.estimator.EstimatorSpec(
+        mode, loss=loss, eval_metric_ops=eval_metric_ops)
 
 
 def main(unused_argv):
   iris = datasets.load_iris()
-  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
+  x_train, x_test, y_train, y_test = model_selection.train_test_split(
       iris.data, iris.target, test_size=0.2, random_state=42)
 
-  classifier = learn.Estimator(model_fn=my_model)
-  classifier.fit(x_train, y_train, steps=1000)
+  classifier = tf.estimator.Estimator(model_fn=my_model)
 
-  y_predicted = [
-      p['class'] for p in classifier.predict(
-          x_test, as_iterable=True)
-  ]
+  # Train.
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: x_train}, y=y_train, num_epochs=None, shuffle=True)
+  classifier.train(input_fn=train_input_fn, steps=100)
+
+  # Predict.
+  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
+  predictions = classifier.predict(input_fn=test_input_fn)
+  y_predicted = np.array(list(p['class'] for p in predictions))
+  y_predicted = y_predicted.reshape(np.array(y_test).shape)
+
+  # Score with sklearn.
   score = metrics.accuracy_score(y_test, y_predicted)
-  print('Accuracy: {0:f}'.format(score))
+  print('Accuracy (sklearn): {0:f}'.format(score))
+
+  # Score with tensorflow.
+  scores = classifier.evaluate(input_fn=test_input_fn)
+  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/learn/resnet.py b/tensorflow/examples/learn/resnet.py
index 881905fde8e5ce8ecd0e176ca6d6273bad29a5df..33a09bb6e0a00a18b91242fdafc05d60e382c0ba 100755
--- a/tensorflow/examples/learn/resnet.py
+++ b/tensorflow/examples/learn/resnet.py
@@ -25,31 +25,17 @@ from __future__ import print_function
 
 from collections import namedtuple
 from math import sqrt
-import os
 
+import numpy as np
 import tensorflow as tf
 
-batch_norm = tf.contrib.layers.batch_norm
-convolution2d = tf.contrib.layers.convolution2d
 
+N_DIGITS = 10  # Number of digits.
+X_FEATURE = 'x'  # Name of the input feature.
 
-def res_net(x, y, activation=tf.nn.relu):
-  """Builds a residual network.
 
-  Note that if the input tensor is 2D, it must be square in order to be
-  converted to a 4D tensor.
-
-  Borrowed structure from:
-  github.com/pkmital/tensorflow_tutorials/blob/master/10_residual_network.py
-
-  Args:
-    x: Input of the network
-    y: Output of the network
-    activation: Activation function to apply after each convolution
-
-  Returns:
-    Predictions and loss tensors.
-  """
+def res_net_model(features, labels, mode):
+  """Builds a residual network."""
 
   # Configurations for each bottleneck group.
   BottleneckGroup = namedtuple('BottleneckGroup',
@@ -59,6 +45,7 @@ def res_net(x, y, activation=tf.nn.relu):
       BottleneckGroup(3, 512, 128), BottleneckGroup(3, 1024, 256)
   ]
 
+  x = features[X_FEATURE]
   input_shape = x.get_shape().as_list()
 
   # Reshape the input into the right shape if it's 2D tensor
@@ -68,15 +55,24 @@ def res_net(x, y, activation=tf.nn.relu):
 
   # First convolution expands to 64 channels
   with tf.variable_scope('conv_layer1'):
-    net = convolution2d(
-        x, 64, 7, normalizer_fn=batch_norm, activation_fn=activation)
+    net = tf.layers.conv2d(
+        x,
+        filters=64,
+        kernel_size=7,
+        activation=tf.nn.relu)
+    net = tf.layers.batch_normalization(net)
 
   # Max pool
-  net = tf.nn.max_pool(net, [1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME')
+  net = tf.layers.max_pooling2d(
+      net, pool_size=3, strides=2, padding='same')
 
   # First chain of resnets
   with tf.variable_scope('conv_layer2'):
-    net = convolution2d(net, groups[0].num_filters, 1, padding='VALID')
+    net = tf.layers.conv2d(
+        net,
+        filters=groups[0].num_filters,
+        kernel_size=1,
+        padding='valid')
 
   # Create the bottleneck groups, each of which contains `num_blocks`
   # bottleneck groups.
@@ -86,33 +82,33 @@ def res_net(x, y, activation=tf.nn.relu):
 
       # 1x1 convolution responsible for reducing dimension
       with tf.variable_scope(name + '/conv_in'):
-        conv = convolution2d(
+        conv = tf.layers.conv2d(
             net,
-            group.bottleneck_size,
-            1,
-            padding='VALID',
-            activation_fn=activation,
-            normalizer_fn=batch_norm)
+            filters=group.num_filters,
+            kernel_size=1,
+            padding='valid',
+            activation=tf.nn.relu)
+        conv = tf.layers.batch_normalization(conv)
 
       with tf.variable_scope(name + '/conv_bottleneck'):
-        conv = convolution2d(
+        conv = tf.layers.conv2d(
             conv,
-            group.bottleneck_size,
-            3,
-            padding='SAME',
-            activation_fn=activation,
-            normalizer_fn=batch_norm)
+            filters=group.bottleneck_size,
+            kernel_size=3,
+            padding='same',
+            activation=tf.nn.relu)
+        conv = tf.layers.batch_normalization(conv)
 
       # 1x1 convolution responsible for restoring dimension
       with tf.variable_scope(name + '/conv_out'):
         input_dim = net.get_shape()[-1].value
-        conv = convolution2d(
+        conv = tf.layers.conv2d(
             conv,
-            input_dim,
-            1,
-            padding='VALID',
-            activation_fn=activation,
-            normalizer_fn=batch_norm)
+            filters=input_dim,
+            kernel_size=1,
+            padding='valid',
+            activation=tf.nn.relu)
+        conv = tf.layers.batch_normalization(conv)
 
       # shortcut connections that turn the network into its counterpart
       # residual function (identity shortcut)
@@ -122,13 +118,13 @@ def res_net(x, y, activation=tf.nn.relu):
       # upscale to the next group size
       next_group = groups[group_i + 1]
       with tf.variable_scope('block_%d/conv_upscale' % group_i):
-        net = convolution2d(
+        net = tf.layers.conv2d(
             net,
-            next_group.num_filters,
-            1,
-            activation_fn=None,
-            biases_initializer=None,
-            padding='SAME')
+            filters=next_group.num_filters,
+            kernel_size=1,
+            padding='same',
+            activation=None,
+            bias_initializer=None)
     except IndexError:
       pass
 
@@ -142,48 +138,65 @@ def res_net(x, y, activation=tf.nn.relu):
   net_shape = net.get_shape().as_list()
   net = tf.reshape(net, [-1, net_shape[1] * net_shape[2] * net_shape[3]])
 
-  target = tf.one_hot(y, depth=10, dtype=tf.float32)
-  logits = tf.contrib.layers.fully_connected(net, 10, activation_fn=None)
-  loss = tf.losses.softmax_cross_entropy(target, logits)
-  return tf.nn.softmax(logits), loss
-
-
-def res_net_model(x, y):
-  prediction, loss = res_net(x, y)
-  predicted = tf.argmax(prediction, 1)
-  accuracy = tf.equal(predicted, tf.cast(y, tf.int64))
-  predictions = {'prob': prediction, 'class': predicted, 'accuracy': accuracy}
-  train_op = tf.contrib.layers.optimize_loss(
-      loss,
-      tf.contrib.framework.get_global_step(),
-      optimizer='Adagrad',
-      learning_rate=0.001)
-  return predictions, loss, train_op
-
-
-# Download and load MNIST data.
-mnist = tf.contrib.learn.datasets.load_dataset('mnist')
-
-# Create a new resnet classifier.
-classifier = tf.contrib.learn.Estimator(model_fn=res_net_model)
-
-tf.logging.set_verbosity(tf.logging.INFO)  # Show training logs. (avoid silence)
-
-# Train model and save summaries into logdir.
-classifier.fit(mnist.train.images,
-               mnist.train.labels,
-               batch_size=100,
-               steps=1000)
-
-# Calculate accuracy.
-result = classifier.evaluate(
-    x=mnist.test.images,
-    y=mnist.test.labels,
-    metrics={
-        'accuracy':
-            tf.contrib.learn.MetricSpec(
-                metric_fn=tf.contrib.metrics.streaming_accuracy,
-                prediction_key='accuracy'),
-    })
-score = result['accuracy']
-print('Accuracy: {0:f}'.format(score))
+  # Compute logits (1 per class) and compute loss.
+  logits = tf.layers.dense(net, N_DIGITS, activation=None)
+
+  # Compute predictions.
+  predicted_classes = tf.argmax(logits, 1)
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    predictions = {
+        'class': predicted_classes,
+        'prob': tf.nn.softmax(logits)
+    }
+    return tf.estimator.EstimatorSpec(mode, predictions=predictions)
+
+  # Compute loss.
+  onehot_labels = tf.one_hot(tf.cast(labels, tf.int32), N_DIGITS, 1, 0)
+  loss = tf.losses.softmax_cross_entropy(
+      onehot_labels=onehot_labels, logits=logits)
+
+  # Create training op.
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    optimizer = tf.train.AdagradOptimizer(learning_rate=0.01)
+    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
+    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
+
+  # Compute evaluation metrics.
+  eval_metric_ops = {
+      'accuracy': tf.metrics.accuracy(
+          labels=labels, predictions=predicted_classes)
+  }
+  return tf.estimator.EstimatorSpec(
+      mode, loss=loss, eval_metric_ops=eval_metric_ops)
+
+
+def main(unused_args):
+  # Download and load MNIST data.
+  mnist = tf.contrib.learn.datasets.DATASETS['mnist']('/tmp/mnist')
+
+  # Create a new resnet classifier.
+  classifier = tf.estimator.Estimator(model_fn=res_net_model)
+
+  tf.logging.set_verbosity(tf.logging.INFO)  # Show training logs.
+
+  # Train model and save summaries into logdir.
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: mnist.train.images},
+      y=mnist.train.labels.astype(np.int32),
+      batch_size=100,
+      num_epochs=None,
+      shuffle=True)
+  classifier.train(input_fn=train_input_fn, steps=100)
+
+  # Calculate accuracy.
+  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: mnist.train.images},
+      y=mnist.train.labels.astype(np.int32),
+      num_epochs=1,
+      shuffle=False)
+  scores = classifier.evaluate(input_fn=test_input_fn)
+  print('Accuracy: {0:f}'.format(scores['accuracy']))
+
+
+if __name__ == '__main__':
+  tf.app.run()
diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py
index 7e10014c392b665828aa0ec6c46cbb7b85ef5f44..21d98e9ea2bd335f5551212ef4ba0d81487ee70b 100644
--- a/tensorflow/examples/learn/text_classification.py
+++ b/tensorflow/examples/learn/text_classification.py
@@ -24,43 +24,67 @@ import numpy as np
 import pandas
 from sklearn import metrics
 import tensorflow as tf
-from tensorflow.contrib.layers.python.layers import encoders
-
-learn = tf.contrib.learn
 
 FLAGS = None
 
 MAX_DOCUMENT_LENGTH = 10
 EMBEDDING_SIZE = 50
 n_words = 0
+MAX_LABEL = 15
+WORDS_FEATURE = 'words'  # Name of the input words feature.
+
+
+def estimator_spec_for_softmax_classification(
+    logits, labels, mode):
+  """Returns EstimatorSpec instance for softmax classification."""
+  predicted_classes = tf.argmax(logits, 1)
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    return tf.estimator.EstimatorSpec(
+        mode=mode,
+        predictions={
+            'class': predicted_classes,
+            'prob': tf.nn.softmax(logits)
+        })
+
+  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
+  loss = tf.losses.softmax_cross_entropy(
+      onehot_labels=onehot_labels, logits=logits)
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
+    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
+    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
+
+  eval_metric_ops = {
+      'accuracy': tf.metrics.accuracy(
+          labels=labels, predictions=predicted_classes)
+  }
+  return tf.estimator.EstimatorSpec(
+      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
+
+
+def bag_of_words_model(features, labels, mode):
+  """A bag-of-words model. Note it disregards the word order in the text."""
+  bow_column = tf.feature_column.categorical_column_with_identity(
+      WORDS_FEATURE, num_buckets=n_words)
+  bow_embedding_column = tf.feature_column.embedding_column(
+      bow_column, dimension=EMBEDDING_SIZE)
+  bow = tf.feature_column.input_layer(
+      features,
+      feature_columns=[bow_embedding_column])
+  logits = tf.layers.dense(bow, MAX_LABEL, activation=None)
 
+  return estimator_spec_for_softmax_classification(
+      logits=logits, labels=labels, mode=mode)
 
-def bag_of_words_model(features, target):
-  """A bag-of-words model. Note it disregards the word order in the text."""
-  target = tf.one_hot(target, 15, 1, 0)
-  features = encoders.bow_encoder(
-      features, vocab_size=n_words, embed_dim=EMBEDDING_SIZE)
-  logits = tf.contrib.layers.fully_connected(features, 15, activation_fn=None)
-  loss = tf.contrib.losses.softmax_cross_entropy(logits, target)
-  train_op = tf.contrib.layers.optimize_loss(
-      loss,
-      tf.contrib.framework.get_global_step(),
-      optimizer='Adam',
-      learning_rate=0.01)
-  return ({
-      'class': tf.argmax(logits, 1),
-      'prob': tf.nn.softmax(logits)
-  }, loss, train_op)
-
-
-def rnn_model(features, target):
+
+def rnn_model(features, labels, mode):
   """RNN model to predict from sequence of words to a class."""
   # Convert indexes of words into embeddings.
   # This creates embeddings matrix of [n_words, EMBEDDING_SIZE] and then
   # maps word indexes of the sequence into [batch_size, sequence_length,
   # EMBEDDING_SIZE].
   word_vectors = tf.contrib.layers.embed_sequence(
-      features, vocab_size=n_words, embed_dim=EMBEDDING_SIZE, scope='words')
+      features[WORDS_FEATURE], vocab_size=n_words, embed_dim=EMBEDDING_SIZE)
 
   # Split into list of embedding per word, while removing doc length dim.
   # word_list results to be a list of tensors [batch_size, EMBEDDING_SIZE].
@@ -74,29 +98,17 @@ def rnn_model(features, target):
   _, encoding = tf.contrib.rnn.static_rnn(cell, word_list, dtype=tf.float32)
 
   # Given encoding of RNN, take encoding of last step (e.g hidden size of the
-  # neural network of last step) and pass it as features for logistic
-  # regression over output classes.
-  target = tf.one_hot(target, 15, 1, 0)
-  logits = tf.contrib.layers.fully_connected(encoding, 15, activation_fn=None)
-  loss = tf.contrib.losses.softmax_cross_entropy(logits, target)
-
-  # Create a training op.
-  train_op = tf.contrib.layers.optimize_loss(
-      loss,
-      tf.contrib.framework.get_global_step(),
-      optimizer='Adam',
-      learning_rate=0.01)
-
-  return ({
-      'class': tf.argmax(logits, 1),
-      'prob': tf.nn.softmax(logits)
-  }, loss, train_op)
+  # neural network of last step) and pass it as features for softmax
+  # classification over output classes.
+  logits = tf.layers.dense(encoding, MAX_LABEL, activation=None)
+  return estimator_spec_for_softmax_classification(
+      logits=logits, labels=labels, mode=mode)
 
 
 def main(unused_argv):
   global n_words
   # Prepare training and testing data
-  dbpedia = learn.datasets.load_dataset(
+  dbpedia = tf.contrib.learn.datasets.load_dataset(
       'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data)
   x_train = pandas.DataFrame(dbpedia.train.data)[1]
   y_train = pandas.Series(dbpedia.train.target)
@@ -104,14 +116,15 @@ def main(unused_argv):
   y_test = pandas.Series(dbpedia.test.target)
 
   # Process vocabulary
-  vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
-  
+  vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
+      MAX_DOCUMENT_LENGTH)
+
   x_transform_train = vocab_processor.fit_transform(x_train)
   x_transform_test = vocab_processor.transform(x_test)
-  
+
   x_train = np.array(list(x_transform_train))
   x_test = np.array(list(x_transform_test))
-  
+
   n_words = len(vocab_processor.vocabulary_)
   print('Total words: %d' % n_words)
 
@@ -119,17 +132,41 @@ def main(unused_argv):
   # Switch between rnn_model and bag_of_words_model to test different models.
   model_fn = rnn_model
   if FLAGS.bow_model:
+    # Subtract 1 because VocabularyProcessor outputs a word-id matrix where word
+    # ids start from 1 and 0 means 'no word'. But
+    # categorical_column_with_identity assumes 0-based count and uses -1 for
+    # missing word.
+    x_train -= 1
+    x_test -= 1
     model_fn = bag_of_words_model
-  classifier = learn.Estimator(model_fn=model_fn)
-
-  # Train and predict
-  classifier.fit(x_train, y_train, steps=100)
-  y_predicted = [
-      p['class'] for p in classifier.predict(
-          x_test, as_iterable=True)
-  ]
+  classifier = tf.estimator.Estimator(model_fn=model_fn)
+
+  # Train.
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={WORDS_FEATURE: x_train},
+      y=y_train,
+      batch_size=len(x_train),
+      num_epochs=None,
+      shuffle=True)
+  classifier.train(input_fn=train_input_fn, steps=100)
+
+  # Predict.
+  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={WORDS_FEATURE: x_test},
+      y=y_test,
+      num_epochs=1,
+      shuffle=False)
+  predictions = classifier.predict(input_fn=test_input_fn)
+  y_predicted = np.array(list(p['class'] for p in predictions))
+  y_predicted = y_predicted.reshape(np.array(y_test).shape)
+
+  # Score with sklearn.
   score = metrics.accuracy_score(y_test, y_predicted)
-  print('Accuracy: {0:f}'.format(score))
+  print('Accuracy (sklearn): {0:f}'.format(score))
+
+  # Score with tensorflow.
+  scores = classifier.evaluate(input_fn=test_input_fn)
+  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/learn/text_classification_character_cnn.py b/tensorflow/examples/learn/text_classification_character_cnn.py
index 5ad53acf9f3f49ad3b217c73a642c6d0ca5d657a..5f7c8e73710ff6b9a107e2197fbecc602c074731 100644
--- a/tensorflow/examples/learn/text_classification_character_cnn.py
+++ b/tensorflow/examples/learn/text_classification_character_cnn.py
@@ -11,8 +11,7 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-"""This is an example of using convolutional networks over characters for
-   DBpedia dataset to predict class from description of an entity.
+"""Example of using convolutional networks over characters for DBpedia dataset.
 
 This model is similar to one described in this paper:
    "Character-level Convolutional Networks for Text Classification"
@@ -34,8 +33,6 @@ import pandas
 from sklearn import metrics
 import tensorflow as tf
 
-learn = tf.contrib.learn
-
 FLAGS = None
 
 MAX_DOCUMENT_LENGTH = 100
@@ -44,53 +41,73 @@ FILTER_SHAPE1 = [20, 256]
 FILTER_SHAPE2 = [20, N_FILTERS]
 POOLING_WINDOW = 4
 POOLING_STRIDE = 2
+MAX_LABEL = 15
+CHARS_FEATURE = 'chars'  # Name of the input character feature.
 
 
-def char_cnn_model(features, target):
+def char_cnn_model(features, labels, mode):
   """Character level convolutional neural network model to predict classes."""
-  target = tf.one_hot(target, 15, 1, 0)
-  byte_list = tf.reshape(
-      tf.one_hot(features, 256), [-1, MAX_DOCUMENT_LENGTH, 256, 1])
+  features_onehot = tf.one_hot(features[CHARS_FEATURE], 256)
+  input_layer = tf.reshape(
+      features_onehot, [-1, MAX_DOCUMENT_LENGTH, 256, 1])
   with tf.variable_scope('CNN_Layer1'):
     # Apply Convolution filtering on input sequence.
-    conv1 = tf.contrib.layers.convolution2d(
-        byte_list, N_FILTERS, FILTER_SHAPE1, padding='VALID')
-    # Add a ReLU for non linearity.
-    conv1 = tf.nn.relu(conv1)
+    conv1 = tf.layers.conv2d(
+        input_layer,
+        filters=N_FILTERS,
+        kernel_size=FILTER_SHAPE1,
+        padding='VALID',
+        # Add a ReLU for non linearity.
+        activation=tf.nn.relu)
     # Max pooling across output of Convolution+Relu.
-    pool1 = tf.nn.max_pool(
+    pool1 = tf.layers.max_pooling2d(
         conv1,
-        ksize=[1, POOLING_WINDOW, 1, 1],
-        strides=[1, POOLING_STRIDE, 1, 1],
+        pool_size=POOLING_WINDOW,
+        strides=POOLING_STRIDE,
         padding='SAME')
     # Transpose matrix so that n_filters from convolution becomes width.
     pool1 = tf.transpose(pool1, [0, 1, 3, 2])
   with tf.variable_scope('CNN_Layer2'):
     # Second level of convolution filtering.
-    conv2 = tf.contrib.layers.convolution2d(
-        pool1, N_FILTERS, FILTER_SHAPE2, padding='VALID')
+    conv2 = tf.layers.conv2d(
+        pool1,
+        filters=N_FILTERS,
+        kernel_size=FILTER_SHAPE2,
+        padding='VALID')
     # Max across each filter to get useful features for classification.
     pool2 = tf.squeeze(tf.reduce_max(conv2, 1), squeeze_dims=[1])
 
   # Apply regular WX + B and classification.
-  logits = tf.contrib.layers.fully_connected(pool2, 15, activation_fn=None)
-  loss = tf.losses.softmax_cross_entropy(target, logits)
-
-  train_op = tf.contrib.layers.optimize_loss(
-      loss,
-      tf.contrib.framework.get_global_step(),
-      optimizer='Adam',
-      learning_rate=0.01)
-
-  return ({
-      'class': tf.argmax(logits, 1),
-      'prob': tf.nn.softmax(logits)
-  }, loss, train_op)
+  logits = tf.layers.dense(pool2, MAX_LABEL, activation=None)
+
+  predicted_classes = tf.argmax(logits, 1)
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    return tf.estimator.EstimatorSpec(
+        mode=mode,
+        predictions={
+            'class': predicted_classes,
+            'prob': tf.nn.softmax(logits)
+        })
+
+  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
+  loss = tf.losses.softmax_cross_entropy(
+      onehot_labels=onehot_labels, logits=logits)
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
+    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
+    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
+
+  eval_metric_ops = {
+      'accuracy': tf.metrics.accuracy(
+          labels=labels, predictions=predicted_classes)
+  }
+  return tf.estimator.EstimatorSpec(
+      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
 
 
 def main(unused_argv):
   # Prepare training and testing data
-  dbpedia = learn.datasets.load_dataset(
+  dbpedia = tf.contrib.learn.datasets.load_dataset(
       'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data, size='large')
   x_train = pandas.DataFrame(dbpedia.train.data)[1]
   y_train = pandas.Series(dbpedia.train.target)
@@ -98,21 +115,43 @@ def main(unused_argv):
   y_test = pandas.Series(dbpedia.test.target)
 
   # Process vocabulary
-  char_processor = learn.preprocessing.ByteProcessor(MAX_DOCUMENT_LENGTH)
+  char_processor = tf.contrib.learn.preprocessing.ByteProcessor(
+      MAX_DOCUMENT_LENGTH)
   x_train = np.array(list(char_processor.fit_transform(x_train)))
   x_test = np.array(list(char_processor.transform(x_test)))
 
+  x_train = x_train.reshape([-1, MAX_DOCUMENT_LENGTH, 1, 1])
+  x_test = x_test.reshape([-1, MAX_DOCUMENT_LENGTH, 1, 1])
+
   # Build model
-  classifier = learn.Estimator(model_fn=char_cnn_model)
-
-  # Train and predict
-  classifier.fit(x_train, y_train, steps=100)
-  y_predicted = [
-      p['class'] for p in classifier.predict(
-          x_test, as_iterable=True)
-  ]
+  classifier = tf.estimator.Estimator(model_fn=char_cnn_model)
+
+  # Train.
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={CHARS_FEATURE: x_train},
+      y=y_train,
+      batch_size=len(x_train),
+      num_epochs=None,
+      shuffle=True)
+  classifier.train(input_fn=train_input_fn, steps=100)
+
+  # Predict.
+  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={CHARS_FEATURE: x_test},
+      y=y_test,
+      num_epochs=1,
+      shuffle=False)
+  predictions = classifier.predict(input_fn=test_input_fn)
+  y_predicted = np.array(list(p['class'] for p in predictions))
+  y_predicted = y_predicted.reshape(np.array(y_test).shape)
+
+  # Score with sklearn.
   score = metrics.accuracy_score(y_test, y_predicted)
-  print('Accuracy: {0:f}'.format(score))
+  print('Accuracy (sklearn): {0:f}'.format(score))
+
+  # Score with tensorflow.
+  scores = classifier.evaluate(input_fn=test_input_fn)
+  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/learn/text_classification_character_rnn.py b/tensorflow/examples/learn/text_classification_character_rnn.py
index 1cb2cd2f88c7730c92be770e9f3a17fea63dcef9..1fc9388a1a026013ad14f8d1deeccbed817d1c88 100644
--- a/tensorflow/examples/learn/text_classification_character_rnn.py
+++ b/tensorflow/examples/learn/text_classification_character_rnn.py
@@ -11,7 +11,7 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-"""This is an example of using recurrent neural networks over characters for DBpedia dataset to predict class from description of an entity.
+"""Example of recurrent neural networks over characters for DBpedia dataset.
 
 This model is similar to one described in this paper:
    "Character-level Convolutional Networks for Text Classification"
@@ -33,41 +33,52 @@ import pandas
 from sklearn import metrics
 import tensorflow as tf
 
-learn = tf.contrib.learn
-
 FLAGS = None
 
 MAX_DOCUMENT_LENGTH = 100
 HIDDEN_SIZE = 20
+MAX_LABEL = 15
+CHARS_FEATURE = 'chars'  # Name of the input character feature.
 
 
-def char_rnn_model(features, target):
+def char_rnn_model(features, labels, mode):
   """Character level recurrent neural network model to predict classes."""
-  target = tf.one_hot(target, 15, 1, 0)
-  byte_list = tf.one_hot(features, 256, 1, 0)
-  byte_list = tf.unstack(byte_list, axis=1)
+  byte_vectors = tf.one_hot(features[CHARS_FEATURE], 256, 1., 0.)
+  byte_list = tf.unstack(byte_vectors, axis=1)
 
   cell = tf.contrib.rnn.GRUCell(HIDDEN_SIZE)
   _, encoding = tf.contrib.rnn.static_rnn(cell, byte_list, dtype=tf.float32)
 
-  logits = tf.contrib.layers.fully_connected(encoding, 15, activation_fn=None)
-  loss = tf.contrib.losses.softmax_cross_entropy(logits, target)
-
-  train_op = tf.contrib.layers.optimize_loss(
-      loss,
-      tf.contrib.framework.get_global_step(),
-      optimizer='Adam',
-      learning_rate=0.01)
-
-  return ({
-      'class': tf.argmax(logits, 1),
-      'prob': tf.nn.softmax(logits)
-  }, loss, train_op)
+  logits = tf.layers.dense(encoding, MAX_LABEL, activation=None)
+
+  predicted_classes = tf.argmax(logits, 1)
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    return tf.estimator.EstimatorSpec(
+        mode=mode,
+        predictions={
+            'class': predicted_classes,
+            'prob': tf.nn.softmax(logits)
+        })
+
+  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
+  loss = tf.losses.softmax_cross_entropy(
+      onehot_labels=onehot_labels, logits=logits)
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
+    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
+    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
+
+  eval_metric_ops = {
+      'accuracy': tf.metrics.accuracy(
+          labels=labels, predictions=predicted_classes)
+  }
+  return tf.estimator.EstimatorSpec(
+      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
 
 
 def main(unused_argv):
   # Prepare training and testing data
-  dbpedia = learn.datasets.load_dataset(
+  dbpedia = tf.contrib.learn.datasets.load_dataset(
       'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data)
   x_train = pandas.DataFrame(dbpedia.train.data)[1]
   y_train = pandas.Series(dbpedia.train.target)
@@ -75,21 +86,40 @@ def main(unused_argv):
   y_test = pandas.Series(dbpedia.test.target)
 
   # Process vocabulary
-  char_processor = learn.preprocessing.ByteProcessor(MAX_DOCUMENT_LENGTH)
+  char_processor = tf.contrib.learn.preprocessing.ByteProcessor(
+      MAX_DOCUMENT_LENGTH)
   x_train = np.array(list(char_processor.fit_transform(x_train)))
   x_test = np.array(list(char_processor.transform(x_test)))
 
   # Build model
-  classifier = learn.Estimator(model_fn=char_rnn_model)
-
-  # Train and predict
-  classifier.fit(x_train, y_train, steps=100)
-  y_predicted = [
-      p['class'] for p in classifier.predict(
-          x_test, as_iterable=True)
-  ]
+  classifier = tf.estimator.Estimator(model_fn=char_rnn_model)
+
+  # Train.
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={CHARS_FEATURE: x_train},
+      y=y_train,
+      batch_size=len(x_train),
+      num_epochs=None,
+      shuffle=True)
+  classifier.train(input_fn=train_input_fn, steps=100)
+
+  # Predict.
+  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={CHARS_FEATURE: x_test},
+      y=y_test,
+      num_epochs=1,
+      shuffle=False)
+  predictions = classifier.predict(input_fn=test_input_fn)
+  y_predicted = np.array(list(p['class'] for p in predictions))
+  y_predicted = y_predicted.reshape(np.array(y_test).shape)
+
+  # Score with sklearn.
   score = metrics.accuracy_score(y_test, y_predicted)
-  print('Accuracy: {0:f}'.format(score))
+  print('Accuracy (sklearn): {0:f}'.format(score))
+
+  # Score with tensorflow.
+  scores = classifier.evaluate(input_fn=test_input_fn)
+  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/learn/text_classification_cnn.py b/tensorflow/examples/learn/text_classification_cnn.py
index 468a96b58f799dc12e050d81b8c4d7f19c840962..0ee2405c8bdc35831f29a195791b743161bec80b 100644
--- a/tensorflow/examples/learn/text_classification_cnn.py
+++ b/tensorflow/examples/learn/text_classification_cnn.py
@@ -25,8 +25,6 @@ import pandas
 from sklearn import metrics
 import tensorflow as tf
 
-learn = tf.contrib.learn
-
 FLAGS = None
 
 MAX_DOCUMENT_LENGTH = 100
@@ -38,59 +36,78 @@ FILTER_SHAPE2 = [WINDOW_SIZE, N_FILTERS]
 POOLING_WINDOW = 4
 POOLING_STRIDE = 2
 n_words = 0
+MAX_LABEL = 15
+WORDS_FEATURE = 'words'  # Name of the input words feature.
 
 
-def cnn_model(features, target):
+def cnn_model(features, labels, mode):
   """2 layer ConvNet to predict from sequence of words to a class."""
   # Convert indexes of words into embeddings.
   # This creates embeddings matrix of [n_words, EMBEDDING_SIZE] and then
   # maps word indexes of the sequence into [batch_size, sequence_length,
   # EMBEDDING_SIZE].
-  target = tf.one_hot(target, 15, 1, 0)
   word_vectors = tf.contrib.layers.embed_sequence(
-      features, vocab_size=n_words, embed_dim=EMBEDDING_SIZE, scope='words')
+      features[WORDS_FEATURE], vocab_size=n_words, embed_dim=EMBEDDING_SIZE)
   word_vectors = tf.expand_dims(word_vectors, 3)
   with tf.variable_scope('CNN_Layer1'):
     # Apply Convolution filtering on input sequence.
-    conv1 = tf.contrib.layers.convolution2d(
-        word_vectors, N_FILTERS, FILTER_SHAPE1, padding='VALID')
-    # Add a RELU for non linearity.
-    conv1 = tf.nn.relu(conv1)
+    conv1 = tf.layers.conv2d(
+        word_vectors,
+        filters=N_FILTERS,
+        kernel_size=FILTER_SHAPE1,
+        padding='VALID',
+        # Add a ReLU for non linearity.
+        activation=tf.nn.relu)
     # Max pooling across output of Convolution+Relu.
-    pool1 = tf.nn.max_pool(
+    pool1 = tf.layers.max_pooling2d(
         conv1,
-        ksize=[1, POOLING_WINDOW, 1, 1],
-        strides=[1, POOLING_STRIDE, 1, 1],
+        pool_size=POOLING_WINDOW,
+        strides=POOLING_STRIDE,
         padding='SAME')
     # Transpose matrix so that n_filters from convolution becomes width.
     pool1 = tf.transpose(pool1, [0, 1, 3, 2])
   with tf.variable_scope('CNN_Layer2'):
     # Second level of convolution filtering.
-    conv2 = tf.contrib.layers.convolution2d(
-        pool1, N_FILTERS, FILTER_SHAPE2, padding='VALID')
+    conv2 = tf.layers.conv2d(
+        pool1,
+        filters=N_FILTERS,
+        kernel_size=FILTER_SHAPE2,
+        padding='VALID')
     # Max across each filter to get useful features for classification.
     pool2 = tf.squeeze(tf.reduce_max(conv2, 1), squeeze_dims=[1])
 
   # Apply regular WX + B and classification.
-  logits = tf.contrib.layers.fully_connected(pool2, 15, activation_fn=None)
-  loss = tf.losses.softmax_cross_entropy(target, logits)
-
-  train_op = tf.contrib.layers.optimize_loss(
-      loss,
-      tf.contrib.framework.get_global_step(),
-      optimizer='Adam',
-      learning_rate=0.01)
-
-  return ({
-      'class': tf.argmax(logits, 1),
-      'prob': tf.nn.softmax(logits)
-  }, loss, train_op)
+  logits = tf.layers.dense(pool2, MAX_LABEL, activation=None)
+
+  predicted_classes = tf.argmax(logits, 1)
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    return tf.estimator.EstimatorSpec(
+        mode=mode,
+        predictions={
+            'class': predicted_classes,
+            'prob': tf.nn.softmax(logits)
+        })
+
+  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
+  loss = tf.losses.softmax_cross_entropy(
+      onehot_labels=onehot_labels, logits=logits)
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
+    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
+    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
+
+  eval_metric_ops = {
+      'accuracy': tf.metrics.accuracy(
+          labels=labels, predictions=predicted_classes)
+  }
+  return tf.estimator.EstimatorSpec(
+      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
 
 
 def main(unused_argv):
   global n_words
   # Prepare training and testing data
-  dbpedia = learn.datasets.load_dataset(
+  dbpedia = tf.contrib.learn.datasets.load_dataset(
       'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data)
   x_train = pandas.DataFrame(dbpedia.train.data)[1]
   y_train = pandas.Series(dbpedia.train.target)
@@ -98,20 +115,42 @@ def main(unused_argv):
   y_test = pandas.Series(dbpedia.test.target)
 
   # Process vocabulary
-  vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
+  vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
+      MAX_DOCUMENT_LENGTH)
   x_train = np.array(list(vocab_processor.fit_transform(x_train)))
   x_test = np.array(list(vocab_processor.transform(x_test)))
   n_words = len(vocab_processor.vocabulary_)
   print('Total words: %d' % n_words)
 
   # Build model
-  classifier = learn.SKCompat(learn.Estimator(model_fn=cnn_model))
-
-  # Train and predict
-  classifier.fit(x_train, y_train, steps=100)
-  y_predicted = classifier.predict(x_test)['class']
+  classifier = tf.estimator.Estimator(model_fn=cnn_model)
+
+  # Train.
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={WORDS_FEATURE: x_train},
+      y=y_train,
+      batch_size=len(x_train),
+      num_epochs=None,
+      shuffle=True)
+  classifier.train(input_fn=train_input_fn, steps=100)
+
+  # Predict.
+  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={WORDS_FEATURE: x_test},
+      y=y_test,
+      num_epochs=1,
+      shuffle=False)
+  predictions = classifier.predict(input_fn=test_input_fn)
+  y_predicted = np.array(list(p['class'] for p in predictions))
+  y_predicted = y_predicted.reshape(np.array(y_test).shape)
+
+  # Score with sklearn.
   score = metrics.accuracy_score(y_test, y_predicted)
-  print('Accuracy: {0:f}'.format(score))
+  print('Accuracy (sklearn): {0:f}'.format(score))
+
+  # Score with tensorflow.
+  scores = classifier.evaluate(input_fn=test_input_fn)
+  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/learn/wide_n_deep_tutorial.py b/tensorflow/examples/learn/wide_n_deep_tutorial.py
index a0c6df821a496a2948df28519782d3462dfe17eb..7b9381311c22a1bce179e309b36997ccc3afaac3 100644
--- a/tensorflow/examples/learn/wide_n_deep_tutorial.py
+++ b/tensorflow/examples/learn/wide_n_deep_tutorial.py
@@ -18,24 +18,93 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
+import shutil
 import sys
 import tempfile
 
-from six.moves import urllib
-
 import pandas as pd
+from six.moves import urllib
 import tensorflow as tf
 
 
-COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
-           "marital_status", "occupation", "relationship", "race", "gender",
-           "capital_gain", "capital_loss", "hours_per_week", "native_country",
-           "income_bracket"]
-LABEL_COLUMN = "label"
-CATEGORICAL_COLUMNS = ["workclass", "education", "marital_status", "occupation",
-                       "relationship", "race", "gender", "native_country"]
-CONTINUOUS_COLUMNS = ["age", "education_num", "capital_gain", "capital_loss",
-                      "hours_per_week"]
+CSV_COLUMNS = [
+    "age", "workclass", "fnlwgt", "education", "education_num",
+    "marital_status", "occupation", "relationship", "race", "gender",
+    "capital_gain", "capital_loss", "hours_per_week", "native_country",
+    "income_bracket"
+]
+
+gender = tf.feature_column.categorical_column_with_vocabulary_list(
+    "gender", ["Female", "Male"])
+education = tf.feature_column.categorical_column_with_vocabulary_list(
+    "education", [
+        "Bachelors", "HS-grad", "11th", "Masters", "9th",
+        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
+        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
+        "Preschool", "12th"
+    ])
+marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
+    "marital_status", [
+        "Married-civ-spouse", "Divorced", "Married-spouse-absent",
+        "Never-married", "Separated", "Married-AF-spouse", "Widowed"
+    ])
+relationship = tf.feature_column.categorical_column_with_vocabulary_list(
+    "relationship", [
+        "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
+        "Other-relative"
+    ])
+workclass = tf.feature_column.categorical_column_with_vocabulary_list(
+    "workclass", [
+        "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
+        "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
+    ])
+
+# To show an example of hashing:
+occupation = tf.feature_column.categorical_column_with_hash_bucket(
+    "occupation", hash_bucket_size=1000)
+native_country = tf.feature_column.categorical_column_with_hash_bucket(
+    "native_country", hash_bucket_size=1000)
+
+# Continuous base columns.
+age = tf.feature_column.numeric_column("age")
+education_num = tf.feature_column.numeric_column("education_num")
+capital_gain = tf.feature_column.numeric_column("capital_gain")
+capital_loss = tf.feature_column.numeric_column("capital_loss")
+hours_per_week = tf.feature_column.numeric_column("hours_per_week")
+
+# Transformations.
+age_buckets = tf.feature_column.bucketized_column(
+    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
+
+# Wide columns and deep columns.
+base_columns = [
+    gender, education, marital_status, relationship, workclass, occupation,
+    native_country, age_buckets,
+]
+
+crossed_columns = [
+    tf.feature_column.crossed_column(
+        ["education", "occupation"], hash_bucket_size=1000),
+    tf.feature_column.crossed_column(
+        [age_buckets, "education", "occupation"], hash_bucket_size=1000),
+    tf.feature_column.crossed_column(
+        ["native_country", "occupation"], hash_bucket_size=1000)
+]
+
+deep_columns = [
+    tf.feature_column.indicator_column(workclass),
+    tf.feature_column.indicator_column(education),
+    tf.feature_column.indicator_column(gender),
+    tf.feature_column.indicator_column(relationship),
+    # To show an example of embedding
+    tf.feature_column.embedding_column(native_country, dimension=8),
+    tf.feature_column.embedding_column(occupation, dimension=8),
+    age,
+    education_num,
+    capital_gain,
+    capital_loss,
+    hours_per_week,
+]
 
 
 def maybe_download(train_data, test_data):
@@ -44,7 +113,9 @@ def maybe_download(train_data, test_data):
     train_file_name = train_data
   else:
     train_file = tempfile.NamedTemporaryFile(delete=False)
-    urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", train_file.name)  # pylint: disable=line-too-long
+    urllib.request.urlretrieve(
+        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
+        train_file.name)  # pylint: disable=line-too-long
     train_file_name = train_file.name
     train_file.close()
     print("Training data is downloaded to %s" % train_file_name)
@@ -53,140 +124,75 @@ def maybe_download(train_data, test_data):
     test_file_name = test_data
   else:
     test_file = tempfile.NamedTemporaryFile(delete=False)
-    urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", test_file.name)  # pylint: disable=line-too-long
+    urllib.request.urlretrieve(
+        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
+        test_file.name)  # pylint: disable=line-too-long
     test_file_name = test_file.name
     test_file.close()
-    print("Test data is downloaded to %s" % test_file_name)
+    print("Test data is downloaded to %s"% test_file_name)
 
   return train_file_name, test_file_name
 
 
 def build_estimator(model_dir, model_type):
   """Build an estimator."""
-  # Sparse base columns.
-  gender = tf.contrib.layers.sparse_column_with_keys(column_name="gender",
-                                                     keys=["female", "male"])
-  education = tf.contrib.layers.sparse_column_with_hash_bucket(
-      "education", hash_bucket_size=1000)
-  relationship = tf.contrib.layers.sparse_column_with_hash_bucket(
-      "relationship", hash_bucket_size=100)
-  workclass = tf.contrib.layers.sparse_column_with_hash_bucket(
-      "workclass", hash_bucket_size=100)
-  occupation = tf.contrib.layers.sparse_column_with_hash_bucket(
-      "occupation", hash_bucket_size=1000)
-  native_country = tf.contrib.layers.sparse_column_with_hash_bucket(
-      "native_country", hash_bucket_size=1000)
-
-  # Continuous base columns.
-  age = tf.contrib.layers.real_valued_column("age")
-  education_num = tf.contrib.layers.real_valued_column("education_num")
-  capital_gain = tf.contrib.layers.real_valued_column("capital_gain")
-  capital_loss = tf.contrib.layers.real_valued_column("capital_loss")
-  hours_per_week = tf.contrib.layers.real_valued_column("hours_per_week")
-
-  # Transformations.
-  age_buckets = tf.contrib.layers.bucketized_column(age,
-                                                    boundaries=[
-                                                        18, 25, 30, 35, 40, 45,
-                                                        50, 55, 60, 65
-                                                    ])
-
-  # Wide columns and deep columns.
-  wide_columns = [gender, native_country, education, occupation, workclass,
-                  relationship, age_buckets,
-                  tf.contrib.layers.crossed_column([education, occupation],
-                                                   hash_bucket_size=int(1e4)),
-                  tf.contrib.layers.crossed_column(
-                      [age_buckets, education, occupation],
-                      hash_bucket_size=int(1e6)),
-                  tf.contrib.layers.crossed_column([native_country, occupation],
-                                                   hash_bucket_size=int(1e4))]
-  deep_columns = [
-      tf.contrib.layers.embedding_column(workclass, dimension=8),
-      tf.contrib.layers.embedding_column(education, dimension=8),
-      tf.contrib.layers.embedding_column(gender, dimension=8),
-      tf.contrib.layers.embedding_column(relationship, dimension=8),
-      tf.contrib.layers.embedding_column(native_country,
-                                         dimension=8),
-      tf.contrib.layers.embedding_column(occupation, dimension=8),
-      age,
-      education_num,
-      capital_gain,
-      capital_loss,
-      hours_per_week,
-  ]
-
   if model_type == "wide":
-    m = tf.contrib.learn.LinearClassifier(model_dir=model_dir,
-                                          feature_columns=wide_columns)
+    m = tf.estimator.LinearClassifier(
+        model_dir=model_dir, feature_columns=base_columns + crossed_columns)
   elif model_type == "deep":
-    m = tf.contrib.learn.DNNClassifier(model_dir=model_dir,
-                                       feature_columns=deep_columns,
-                                       hidden_units=[100, 50])
+    m = tf.estimator.DNNClassifier(
+        model_dir=model_dir,
+        feature_columns=deep_columns,
+        hidden_units=[100, 50])
   else:
-    m = tf.contrib.learn.DNNLinearCombinedClassifier(
+    m = tf.estimator.DNNLinearCombinedClassifier(
         model_dir=model_dir,
-        linear_feature_columns=wide_columns,
+        linear_feature_columns=crossed_columns,
         dnn_feature_columns=deep_columns,
-        dnn_hidden_units=[100, 50],
-        fix_global_step_increment_bug=True)
+        dnn_hidden_units=[100, 50])
   return m
 
 
-def input_fn(df):
+def input_fn(data_file, num_epochs, shuffle):
   """Input builder function."""
-  # Creates a dictionary mapping from each continuous feature column name (k) to
-  # the values of that column stored in a constant Tensor.
-  continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}
-  # Creates a dictionary mapping from each categorical feature column name (k)
-  # to the values of that column stored in a tf.SparseTensor.
-  categorical_cols = {
-      k: tf.SparseTensor(
-          indices=[[i, 0] for i in range(df[k].size)],
-          values=df[k].values,
-          dense_shape=[df[k].size, 1])
-      for k in CATEGORICAL_COLUMNS}
-  # Merges the two dictionaries into one.
-  feature_cols = dict(continuous_cols)
-  feature_cols.update(categorical_cols)
-  # Converts the label column into a constant Tensor.
-  label = tf.constant(df[LABEL_COLUMN].values)
-  # Returns the feature columns and the label.
-  return feature_cols, label
+  df_data = pd.read_csv(
+      tf.gfile.Open(data_file),
+      names=CSV_COLUMNS,
+      skipinitialspace=True,
+      engine="python",
+      skiprows=1)
+  # remove NaN elements
+  df_data = df_data.dropna(how="any", axis=0)
+  labels = df_data["income_bracket"].apply(lambda x: ">50K" in x).astype(int)
+  return tf.estimator.inputs.pandas_input_fn(
+      x=df_data,
+      y=labels,
+      batch_size=100,
+      num_epochs=num_epochs,
+      shuffle=shuffle,
+      num_threads=5)
 
 
 def train_and_eval(model_dir, model_type, train_steps, train_data, test_data):
   """Train and evaluate the model."""
   train_file_name, test_file_name = maybe_download(train_data, test_data)
-  df_train = pd.read_csv(
-      tf.gfile.Open(train_file_name),
-      names=COLUMNS,
-      skipinitialspace=True,
-      engine="python")
-  df_test = pd.read_csv(
-      tf.gfile.Open(test_file_name),
-      names=COLUMNS,
-      skipinitialspace=True,
-      skiprows=1,
-      engine="python")
-
-  # remove NaN elements
-  df_train = df_train.dropna(how='any', axis=0)
-  df_test = df_test.dropna(how='any', axis=0)
-
-  df_train[LABEL_COLUMN] = (
-      df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
-  df_test[LABEL_COLUMN] = (
-      df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
-
+  # Specify file path below if want to find the output easily
   model_dir = tempfile.mkdtemp() if not model_dir else model_dir
-  print("model directory = %s" % model_dir)
 
   m = build_estimator(model_dir, model_type)
-  m.fit(input_fn=lambda: input_fn(df_train), steps=train_steps)
-  results = m.evaluate(input_fn=lambda: input_fn(df_test), steps=1)
+  # set num_epochs to None to get infinite stream of data.
+  m.train(
+      input_fn=input_fn(train_file_name, num_epochs=None, shuffle=True),
+      steps=train_steps)
+  # set steps to None to run evaluation until all data consumed.
+  results = m.evaluate(
+      input_fn=input_fn(test_file_name, num_epochs=1, shuffle=False),
+      steps=None)
+  print("model directory = %s" % model_dir)
   for key in sorted(results):
     print("%s: %s" % (key, results[key]))
+  # Manual cleanup
+  shutil.rmtree(model_dir)
 
 
 FLAGS = None
@@ -215,7 +221,7 @@ if __name__ == "__main__":
   parser.add_argument(
       "--train_steps",
       type=int,
-      default=200,
+      default=2000,
       help="Number of training steps."
   )
   parser.add_argument(
diff --git a/tensorflow/examples/multibox_detector/BUILD b/tensorflow/examples/multibox_detector/BUILD
index 5d0c769007a613ad4db9673dc81e9b63703668dc..24c7040ac11a37cf2ed784c542e668721da9d9ba 100644
--- a/tensorflow/examples/multibox_detector/BUILD
+++ b/tensorflow/examples/multibox_detector/BUILD
@@ -21,7 +21,10 @@ cc_binary(
     linkopts = ["-lm"],
     deps = [
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
         "//tensorflow/core:tensorflow",
     ],
 )
diff --git a/tensorflow/examples/saved_model/BUILD b/tensorflow/examples/saved_model/BUILD
index 844e99dcd46cb63271e4a379b8c86dc4b1564fc1..72066eabc3b322a0a1f8ee9ee23e66b84f20e078 100644
--- a/tensorflow/examples/saved_model/BUILD
+++ b/tensorflow/examples/saved_model/BUILD
@@ -32,5 +32,7 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
+        "//tensorflow/python:lib",
+        "//tensorflow/python/saved_model:main_op",
     ],
 )
diff --git a/tensorflow/examples/saved_model/saved_model_half_plus_two.py b/tensorflow/examples/saved_model/saved_model_half_plus_two.py
index 02afa0b0fcbb3ce887271ea5c400a3fcc51c38f3..0d6f1ef655bcaba43c0d68e1e924bcb4b29967af 100644
--- a/tensorflow/examples/saved_model/saved_model_half_plus_two.py
+++ b/tensorflow/examples/saved_model/saved_model_half_plus_two.py
@@ -46,7 +46,6 @@ import sys
 import tensorflow as tf
 
 from tensorflow.python.lib.io import file_io
-from tensorflow.python.saved_model import main_op
 
 FLAGS = None
 
@@ -205,7 +204,8 @@ def _generate_saved_model_for_half_plus_two(export_dir,
           sess, [tf.saved_model.tag_constants.SERVING],
           signature_def_map=signature_def_map,
           assets_collection=tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS),
-          main_op=tf.group(main_op.main_op(), assign_filename_op))
+          main_op=tf.group(tf.saved_model.main_op.main_op(),
+                           assign_filename_op))
     else:
       builder.add_meta_graph_and_variables(
           sess, [tf.saved_model.tag_constants.SERVING],
diff --git a/tensorflow/examples/tutorials/estimators/BUILD b/tensorflow/examples/tutorials/estimators/BUILD
index 0ff606831ce19c4dc192eefa9f0f9a16b0dce2ad..ecbc1a431d9a2173e80434b6f9350c225fc9bfb4 100644
--- a/tensorflow/examples/tutorials/estimators/BUILD
+++ b/tensorflow/examples/tutorials/estimators/BUILD
@@ -16,6 +16,7 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
+        "//tensorflow/contrib/learn",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/examples/tutorials/estimators/abalone.py b/tensorflow/examples/tutorials/estimators/abalone.py
index 3c0ea2e409076671b282253d22f99516bfa99ffc..737b3ee5d6a5a71b3093fcd219c699eda228f903 100644
--- a/tensorflow/examples/tutorials/estimators/abalone.py
+++ b/tensorflow/examples/tutorials/estimators/abalone.py
@@ -25,7 +25,6 @@ from six.moves import urllib
 
 import numpy as np
 import tensorflow as tf
-from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
 
 FLAGS = None
 
@@ -72,41 +71,46 @@ def maybe_download(train_data, test_data, predict_data):
   return train_file_name, test_file_name, predict_file_name
 
 
-def model_fn(features, targets, mode, params):
+def model_fn(features, labels, mode, params):
   """Model function for Estimator."""
 
   # Connect the first hidden layer to input layer
-  # (features) with relu activation
-  first_hidden_layer = tf.contrib.layers.relu(features, 10)
+  # (features["x"]) with relu activation
+  first_hidden_layer = tf.layers.dense(features["x"], 10, activation=tf.nn.relu)
 
   # Connect the second hidden layer to first hidden layer with relu
-  second_hidden_layer = tf.contrib.layers.relu(first_hidden_layer, 10)
+  second_hidden_layer = tf.layers.dense(
+      first_hidden_layer, 10, activation=tf.nn.relu)
 
   # Connect the output layer to second hidden layer (no activation fn)
-  output_layer = tf.contrib.layers.linear(second_hidden_layer, 1)
+  output_layer = tf.layers.dense(second_hidden_layer, 1)
 
   # Reshape output layer to 1-dim Tensor to return predictions
   predictions = tf.reshape(output_layer, [-1])
-  predictions_dict = {"ages": predictions}
+
+  # Provide an estimator spec for `ModeKeys.PREDICT`.
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    return tf.estimator.EstimatorSpec(
+        mode=mode,
+        predictions={"ages": predictions})
 
   # Calculate loss using mean squared error
-  loss = tf.losses.mean_squared_error(targets, predictions)
+  loss = tf.losses.mean_squared_error(labels, predictions)
+
+  optimizer = tf.train.GradientDescentOptimizer(
+      learning_rate=params["learning_rate"])
+  train_op = optimizer.minimize(
+      loss=loss, global_step=tf.train.get_global_step())
 
   # Calculate root mean squared error as additional eval metric
   eval_metric_ops = {
       "rmse": tf.metrics.root_mean_squared_error(
-          tf.cast(targets, tf.float64), predictions)
+          tf.cast(labels, tf.float64), predictions)
   }
 
-  train_op = tf.contrib.layers.optimize_loss(
-      loss=loss,
-      global_step=tf.contrib.framework.get_global_step(),
-      learning_rate=params["learning_rate"],
-      optimizer="SGD")
-
-  return model_fn_lib.ModelFnOps(
+  # Provide an estimator spec for `ModeKeys.EVAL` and `ModeKeys.TRAIN` modes.
+  return tf.estimator.EstimatorSpec(
       mode=mode,
-      predictions=predictions_dict,
       loss=loss,
       train_op=train_op,
       eval_metric_ops=eval_metric_ops)
@@ -133,28 +137,34 @@ def main(unused_argv):
   model_params = {"learning_rate": LEARNING_RATE}
 
   # Instantiate Estimator
-  nn = tf.contrib.learn.Estimator(model_fn=model_fn, params=model_params)
-  
-  def get_train_inputs():
-    x = tf.constant(training_set.data)
-    y = tf.constant(training_set.target)
-    return x, y
-  
-  # Fit
-  nn.fit(input_fn=get_train_inputs, steps=5000)
+  nn = tf.estimator.Estimator(model_fn=model_fn, params=model_params)
+
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={"x": np.array(training_set.data)},
+      y=np.array(training_set.target),
+      num_epochs=None,
+      shuffle=True)
+
+  # Train
+  nn.train(input_fn=train_input_fn, steps=5000)
 
   # Score accuracy
-  def get_test_inputs():
-    x = tf.constant(test_set.data)
-    y = tf.constant(test_set.target)
-    return x, y
-  
-  ev = nn.evaluate(input_fn=get_test_inputs, steps=1)
+  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={"x": np.array(test_set.data)},
+      y=np.array(test_set.target),
+      num_epochs=1,
+      shuffle=False)
+
+  ev = nn.evaluate(input_fn=test_input_fn)
   print("Loss: %s" % ev["loss"])
   print("Root Mean Squared Error: %s" % ev["rmse"])
 
   # Print out predictions
-  predictions = nn.predict(x=prediction_set.data, as_iterable=True)
+  predict_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={"x": prediction_set.data},
+      num_epochs=1,
+      shuffle=False)
+  predictions = nn.predict(input_fn=predict_input_fn)
   for i, p in enumerate(predictions):
     print("Prediction %s: %s" % (i + 1, p["ages"]))
 
diff --git a/tensorflow/examples/tutorials/input_fn/boston.py b/tensorflow/examples/tutorials/input_fn/boston.py
index c7fb7e2316567d5c7489ec91b3c86c0ba1344229..34f350e9acd3d9541fe24c235c6f2cb5c8170c35 100644
--- a/tensorflow/examples/tutorials/input_fn/boston.py
+++ b/tensorflow/examples/tutorials/input_fn/boston.py
@@ -31,10 +31,12 @@ FEATURES = ["crim", "zn", "indus", "nox", "rm",
 LABEL = "medv"
 
 
-def input_fn(data_set):
-  feature_cols = {k: tf.constant(data_set[k].values) for k in FEATURES}
-  labels = tf.constant(data_set[LABEL].values)
-  return feature_cols, labels
+def get_input_fn(data_set, num_epochs=None, shuffle=True):
+  return tf.estimator.inputs.pandas_input_fn(
+      x=pd.DataFrame({k: data_set[k].values for k in FEATURES}),
+      y=pd.Series(data_set[LABEL].values),
+      num_epochs=num_epochs,
+      shuffle=shuffle)
 
 
 def main(unused_argv):
@@ -49,26 +51,28 @@ def main(unused_argv):
                                skiprows=1, names=COLUMNS)
 
   # Feature cols
-  feature_cols = [tf.contrib.layers.real_valued_column(k)
-                  for k in FEATURES]
+  feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURES]
 
   # Build 2 layer fully connected DNN with 10, 10 units respectively.
-  regressor = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols,
-                                            hidden_units=[10, 10],
-                                            model_dir="/tmp/boston_model")
+  regressor = tf.estimator.DNNRegressor(feature_columns=feature_cols,
+                                        hidden_units=[10, 10],
+                                        model_dir="/tmp/boston_model")
 
-  # Fit
-  regressor.fit(input_fn=lambda: input_fn(training_set), steps=5000)
+  # Train
+  regressor.train(input_fn=get_input_fn(training_set), steps=5000)
 
-  # Score accuracy
-  ev = regressor.evaluate(input_fn=lambda: input_fn(test_set), steps=1)
+  # Evaluate loss over one epoch of test_set.
+  ev = regressor.evaluate(
+      input_fn=get_input_fn(test_set, num_epochs=1, shuffle=False))
   loss_score = ev["loss"]
   print("Loss: {0:f}".format(loss_score))
 
-  # Print out predictions
-  y = regressor.predict(input_fn=lambda: input_fn(prediction_set))
-  # .predict() returns an iterator; convert to a list and print predictions
-  predictions = list(itertools.islice(y, 6))
+  # Print out predictions over a slice of prediction_set.
+  y = regressor.predict(
+      input_fn=get_input_fn(prediction_set, num_epochs=1, shuffle=False))
+  # .predict() returns an iterator of dicts; convert to a list and print
+  # predictions
+  predictions = list(p["predictions"] for p in itertools.islice(y, 6))
   print("Predictions: {}".format(str(predictions)))
 
 if __name__ == "__main__":
diff --git a/tensorflow/examples/tutorials/layers/BUILD b/tensorflow/examples/tutorials/layers/BUILD
index ba9e1f60e14baf28cd79136a8fb9395999aef4cd..7621beec48167846875eba0e0997795258f9731c 100644
--- a/tensorflow/examples/tutorials/layers/BUILD
+++ b/tensorflow/examples/tutorials/layers/BUILD
@@ -16,6 +16,8 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
+        "//tensorflow/contrib/learn",
+        "//third_party/py/numpy",
     ],
 )
 
diff --git a/tensorflow/examples/tutorials/layers/cnn_mnist.py b/tensorflow/examples/tutorials/layers/cnn_mnist.py
index aa92b1758a05439440cbdda2e7063024646739b9..2124843fcb21d0c4a28ef9a11aba012a5a116e84 100644
--- a/tensorflow/examples/tutorials/layers/cnn_mnist.py
+++ b/tensorflow/examples/tutorials/layers/cnn_mnist.py
@@ -20,9 +20,6 @@ from __future__ import print_function
 import numpy as np
 import tensorflow as tf
 
-from tensorflow.contrib import learn
-from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
-
 tf.logging.set_verbosity(tf.logging.INFO)
 
 
@@ -31,7 +28,7 @@ def cnn_model_fn(features, labels, mode):
   # Input Layer
   # Reshape X to 4-D tensor: [batch_size, width, height, channels]
   # MNIST images are 28x28 pixels, and have one color channel
-  input_layer = tf.reshape(features, [-1, 28, 28, 1])
+  input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])
 
   # Convolutional Layer #1
   # Computes 32 features using a 5x5 filter with ReLU activation.
@@ -82,53 +79,54 @@ def cnn_model_fn(features, labels, mode):
 
   # Add dropout operation; 0.6 probability that element will be kept
   dropout = tf.layers.dropout(
-      inputs=dense, rate=0.4, training=mode == learn.ModeKeys.TRAIN)
+      inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
 
   # Logits layer
   # Input Tensor Shape: [batch_size, 1024]
   # Output Tensor Shape: [batch_size, 10]
   logits = tf.layers.dense(inputs=dropout, units=10)
 
-  loss = None
-  train_op = None
+  predictions = {
+      # Generate predictions (for PREDICT and EVAL mode)
+      "classes": tf.argmax(input=logits, axis=1),
+      # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
+      # `logging_hook`.
+      "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
+  }
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
 
   # Calculate Loss (for both TRAIN and EVAL modes)
-  if mode != learn.ModeKeys.INFER:
-    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
-    loss = tf.losses.softmax_cross_entropy(
-        onehot_labels=onehot_labels, logits=logits)
+  onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
+  loss = tf.losses.softmax_cross_entropy(
+      onehot_labels=onehot_labels, logits=logits)
 
   # Configure the Training Op (for TRAIN mode)
-  if mode == learn.ModeKeys.TRAIN:
-    train_op = tf.contrib.layers.optimize_loss(
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
+    train_op = optimizer.minimize(
         loss=loss,
-        global_step=tf.contrib.framework.get_global_step(),
-        learning_rate=0.001,
-        optimizer="SGD")
-
-  # Generate Predictions
-  predictions = {
-      "classes": tf.argmax(
-          input=logits, axis=1),
-      "probabilities": tf.nn.softmax(
-          logits, name="softmax_tensor")
-  }
+        global_step=tf.train.get_global_step())
+    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
 
-  # Return a ModelFnOps object
-  return model_fn_lib.ModelFnOps(
-      mode=mode, predictions=predictions, loss=loss, train_op=train_op)
+  # Add evaluation metrics (for EVAL mode)
+  eval_metric_ops = {
+      "accuracy": tf.metrics.accuracy(
+          labels=labels, predictions=predictions["classes"])}
+  return tf.estimator.EstimatorSpec(
+      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
 
 
 def main(unused_argv):
   # Load training and eval data
-  mnist = learn.datasets.load_dataset("mnist")
+  mnist = tf.contrib.learn.datasets.load_dataset("mnist")
   train_data = mnist.train.images  # Returns np.array
   train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
   eval_data = mnist.test.images  # Returns np.array
   eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)
 
   # Create the Estimator
-  mnist_classifier = learn.Estimator(
+  mnist_classifier = tf.estimator.Estimator(
       model_fn=cnn_model_fn, model_dir="/tmp/mnist_convnet_model")
 
   # Set up logging for predictions
@@ -138,23 +136,24 @@ def main(unused_argv):
       tensors=tensors_to_log, every_n_iter=50)
 
   # Train the model
-  mnist_classifier.fit(
-      x=train_data,
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={"x": train_data},
       y=train_labels,
       batch_size=100,
+      num_epochs=None,
+      shuffle=True)
+  mnist_classifier.train(
+      input_fn=train_input_fn,
       steps=20000,
-      monitors=[logging_hook])
-
-  # Configure the accuracy metric for evaluation
-  metrics = {
-      "accuracy":
-          learn.MetricSpec(
-              metric_fn=tf.metrics.accuracy, prediction_key="classes"),
-  }
+      hooks=[logging_hook])
 
   # Evaluate the model and print results
-  eval_results = mnist_classifier.evaluate(
-      x=eval_data, y=eval_labels, metrics=metrics)
+  eval_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={"x": eval_data},
+      y=eval_labels,
+      num_epochs=1,
+      shuffle=False)
+  eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
   print(eval_results)
 
 
diff --git a/tensorflow/examples/tutorials/mnist/fully_connected_feed.py b/tensorflow/examples/tutorials/mnist/fully_connected_feed.py
index be50f4529fcd38421cde1524a08ab2acee96f028..af89c8c77bc01891dfe683904873c96b0aa0fff8 100644
--- a/tensorflow/examples/tutorials/mnist/fully_connected_feed.py
+++ b/tensorflow/examples/tutorials/mnist/fully_connected_feed.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 # pylint: disable=missing-docstring
 import argparse
-import os.path
+import os
 import sys
 import time
 
@@ -257,13 +257,15 @@ if __name__ == '__main__':
   parser.add_argument(
       '--input_data_dir',
       type=str,
-      default='/tmp/tensorflow/mnist/input_data',
+      default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'),
+                           'tensorflow/mnist/input_data'),
       help='Directory to put the input data.'
   )
   parser.add_argument(
       '--log_dir',
       type=str,
-      default='/tmp/tensorflow/mnist/logs/fully_connected_feed',
+      default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'),
+                           'tensorflow/mnist/logs/fully_connected_feed'),
       help='Directory to put the log data.'
   )
   parser.add_argument(
diff --git a/tensorflow/examples/tutorials/mnist/mnist_deep.py b/tensorflow/examples/tutorials/mnist/mnist_deep.py
index 2896eee77d1ca6fe017d2b1bff9706a95c9a5bc4..4b5b50400a717ec2f29bab8a5fe3171b0e993476 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_deep.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_deep.py
@@ -28,6 +28,7 @@ from __future__ import print_function
 
 import argparse
 import sys
+import tempfile
 
 from tensorflow.examples.tutorials.mnist import input_data
 
@@ -52,42 +53,50 @@ def deepnn(x):
   # Reshape to use within a convolutional neural net.
   # Last dimension is for "features" - there is only one here, since images are
   # grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc.
-  x_image = tf.reshape(x, [-1, 28, 28, 1])
+  with tf.name_scope('reshape'):
+    x_image = tf.reshape(x, [-1, 28, 28, 1])
 
   # First convolutional layer - maps one grayscale image to 32 feature maps.
-  W_conv1 = weight_variable([5, 5, 1, 32])
-  b_conv1 = bias_variable([32])
-  h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
+  with tf.name_scope('conv1'):
+    W_conv1 = weight_variable([5, 5, 1, 32])
+    b_conv1 = bias_variable([32])
+    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
 
   # Pooling layer - downsamples by 2X.
-  h_pool1 = max_pool_2x2(h_conv1)
+  with tf.name_scope('pool1'):
+    h_pool1 = max_pool_2x2(h_conv1)
 
   # Second convolutional layer -- maps 32 feature maps to 64.
-  W_conv2 = weight_variable([5, 5, 32, 64])
-  b_conv2 = bias_variable([64])
-  h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
+  with tf.name_scope('conv2'):
+    W_conv2 = weight_variable([5, 5, 32, 64])
+    b_conv2 = bias_variable([64])
+    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
 
   # Second pooling layer.
-  h_pool2 = max_pool_2x2(h_conv2)
+  with tf.name_scope('pool2'):
+    h_pool2 = max_pool_2x2(h_conv2)
 
   # Fully connected layer 1 -- after 2 round of downsampling, our 28x28 image
   # is down to 7x7x64 feature maps -- maps this to 1024 features.
-  W_fc1 = weight_variable([7 * 7 * 64, 1024])
-  b_fc1 = bias_variable([1024])
+  with tf.name_scope('fc1'):
+    W_fc1 = weight_variable([7 * 7 * 64, 1024])
+    b_fc1 = bias_variable([1024])
 
-  h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
-  h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
+    h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
+    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
 
   # Dropout - controls the complexity of the model, prevents co-adaptation of
   # features.
-  keep_prob = tf.placeholder(tf.float32)
-  h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
+  with tf.name_scope('dropout'):
+    keep_prob = tf.placeholder(tf.float32)
+    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
 
   # Map the 1024 features to 10 classes, one for each digit
-  W_fc2 = weight_variable([1024, 10])
-  b_fc2 = bias_variable([10])
+  with tf.name_scope('fc2'):
+    W_fc2 = weight_variable([1024, 10])
+    b_fc2 = bias_variable([10])
 
-  y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
+    y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
   return y_conv, keep_prob
 
 
@@ -127,11 +136,23 @@ def main(_):
   # Build the graph for the deep net
   y_conv, keep_prob = deepnn(x)
 
-  cross_entropy = tf.reduce_mean(
-      tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
-  train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
-  correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
-  accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+  with tf.name_scope('loss'):
+    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_,
+                                                            logits=y_conv)
+  cross_entropy = tf.reduce_mean(cross_entropy)
+
+  with tf.name_scope('adam_optimizer'):
+    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
+
+  with tf.name_scope('accuracy'):
+    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
+    correct_prediction = tf.cast(correct_prediction, tf.float32)
+  accuracy = tf.reduce_mean(correct_prediction)
+
+  graph_location = tempfile.mkdtemp()
+  print('Saving graph to: %s' % graph_location)
+  train_writer = tf.summary.FileWriter(graph_location)
+  train_writer.add_graph(tf.get_default_graph())
 
   with tf.Session() as sess:
     sess.run(tf.global_variables_initializer())
diff --git a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
index dc0d87031584ae0357db08b7bca9bdc1f1f3c08c..c401d09df8ca5132178ab31e3b14b3a5cf98e70d 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
@@ -25,6 +25,7 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
+import os
 import sys
 
 import tensorflow as tf
@@ -200,12 +201,14 @@ if __name__ == '__main__':
   parser.add_argument(
       '--data_dir',
       type=str,
-      default='/tmp/tensorflow/mnist/input_data',
+      default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'),
+                           'tensorflow/mnist/input_data'),
       help='Directory for storing input data')
   parser.add_argument(
       '--log_dir',
       type=str,
-      default='/tmp/tensorflow/mnist/logs/mnist_with_summaries',
+      default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'),
+                           'tensorflow/mnist/logs/mnist_with_summaries'),
       help='Summaries log directory')
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/tutorials/monitors/BUILD b/tensorflow/examples/tutorials/monitors/BUILD
index 6aa0b7ee47c0477b46d99a5197d3a309a3666cbd..4220e8144de1259dc5bd873ddb5810bf95dcafae 100644
--- a/tensorflow/examples/tutorials/monitors/BUILD
+++ b/tensorflow/examples/tutorials/monitors/BUILD
@@ -20,7 +20,6 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/learn",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index aee482fda56f3099179dcecb4922caec709a9b9c..e67442b14b0aeef2c2f40c571c91a39ca0464f91 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -253,7 +253,7 @@ try:
   from sklearn.manifold import TSNE
   import matplotlib.pyplot as plt
 
-  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
+  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
   plot_only = 500
   low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
   labels = [reverse_dictionary[i] for i in xrange(plot_only)]
diff --git a/tensorflow/examples/udacity/5_word2vec.ipynb b/tensorflow/examples/udacity/5_word2vec.ipynb
index 9d4243d7aeae02a7bb8ea8290c13d61b3eaf87c8..18c456cad787b2ed5b39d5791de649874bbe7ae3 100644
--- a/tensorflow/examples/udacity/5_word2vec.ipynb
+++ b/tensorflow/examples/udacity/5_word2vec.ipynb
@@ -806,7 +806,7 @@
       "source": [
         "num_points = 400\n",
         "\n",
-        "tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)\n",
+        "tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')\n",
         "two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])"
       ],
       "outputs": [],
diff --git a/tensorflow/examples/wav_to_spectrogram/BUILD b/tensorflow/examples/wav_to_spectrogram/BUILD
index 5923fa59293430e0df06c562e095f44e5d672ca8..dc29236b081121cd45a962bff49ceabff58172ce 100644
--- a/tensorflow/examples/wav_to_spectrogram/BUILD
+++ b/tensorflow/examples/wav_to_spectrogram/BUILD
@@ -19,7 +19,10 @@ cc_library(
     hdrs = ["wav_to_spectrogram.h"],
     deps = [
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
         "//tensorflow/core:tensorflow",
     ],
 )
@@ -30,6 +33,7 @@ cc_binary(
     deps = [
         ":wav_to_spectrogram_lib",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
     ],
 )
 
diff --git a/tensorflow/g3doc/README.txt b/tensorflow/g3doc/README.txt
index 4f4076db374ff56137f4f04485f658a01f72f639..6eaf1e1bda1e6c43df96195a682961cd28dc177b 100644
--- a/tensorflow/g3doc/README.txt
+++ b/tensorflow/g3doc/README.txt
@@ -5,7 +5,7 @@ go to:
 
 Documentation (on Github, tensorflow.org, and anywhere else we decide to
 serve it from) is now generated from the files in
-third_party/tensorflow/docs_src/ (for tutorials and other guides) and
+tensorflow/docs_src/ (for tutorials and other guides) and
 TensorFlow source code (for the API reference pages). If you see a problem with
 API reference, edit the code comments in the appropriate language. If you see a 
 problem with our other docs, edit the files in docs_src.
@@ -14,9 +14,10 @@ To preview the results of your changes, or generate an offline copy of
 the docs, run:
 
   bazel run -- tensorflow/tools/docs:generate \
-    --src_dir=tensorflow/docs_src/ \
+    --src_dir=/path/to/tensorflow/docs_src/ \
     --output_dir=/tmp/tfdocs/
 
+`src_dir` must be absolute path to documentation source.
 When authoring docs, note that we have some new syntax for references --
 at least for docs coming from Python docstrings or
 tensorflow/docs_src/.  Use:
@@ -26,7 +27,6 @@ tensorflow/docs_src/.  Use:
   syntax still works, since @{tf.MyClass.method} links to the right
   part of the tf.MyClass page.
 
-
 * @{tensorflow::symbol} to make a link to the reference page for a C++
   symbol. (This only works for a few symbols but will work for more soon.)
 
diff --git a/tensorflow/go/README.md b/tensorflow/go/README.md
index a1b4255292b0908fd5f022ce641967ba1b30f75c..9c2fa600176b00e853e8319302b519075ed17b52 100644
--- a/tensorflow/go/README.md
+++ b/tensorflow/go/README.md
@@ -9,35 +9,6 @@ Construct and execute TensorFlow graphs in Go.
 > (`github.com/tensorflow/tensorflow/tensorflow/go`).
 
 ## Quickstart
-1.  Download and extract the TensorFlow C library, preferably into `/usr/local`.
-    GPU-enabled versions require CUDA 8.0 and cuDNN 5.1. For other versions, the
-    TensorFlow C library will have to be built from source (see below).
-
-    -   Linux:
-        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-linux-x86_64-1.1.0.tar.gz),
-        [GPU-enabled](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-linux-x86_64-1.1.0.tar.gz)
-    -   OS X
-        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-darwin-x86_64-1.1.0.tar.gz),
-
-    The following shell snippet downloads and extracts into `/usr/local`:
-
-    ```sh
-    TF_TYPE="cpu" # Set to "gpu" for GPU support
-    curl -L \
-      "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.1.0.tar.gz" |
-    sudo tar -C /usr/local -xz
-    ```
-
-2.  `go get` this package (and run tests):
-
-    ```sh
-    go get github.com/tensorflow/tensorflow/tensorflow/go
-    go test github.com/tensorflow/tensorflow/tensorflow/go
-    ```
-
-3.  Done!
-
-### Installing into locations other than `/usr/local`
 
 Refer to [Installing TensorFlow for Go](https://www.tensorflow.org/install/install_go)
 
diff --git a/tensorflow/go/genop/generate.sh b/tensorflow/go/genop/generate.sh
index 15ef3b95253c439efc5ac0d020537342fd7ac591..01fcfb9058378b49d1315ddbbcc08e6a5de09d7d 100644
--- a/tensorflow/go/genop/generate.sh
+++ b/tensorflow/go/genop/generate.sh
@@ -36,7 +36,7 @@ then
   then
     echo "Protocol buffer compiler protoc not found in PATH or in ${PROTOC}"
     echo "Perhaps build it using:"
-    echo "bazel build --config opt @protobuf//:protoc"
+    echo "bazel build --config opt @protobuf_archive//:protoc"
     exit 1
   fi
   PROTOC=$PATH_PROTOC
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index c72dfdd17e27ae4349ace5b00098db227efdb92d..43e09c498c62b8d3a644a0ec8a34017ce55b53a1 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -100,32 +100,6 @@ func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Out
 	return op.Output(0)
 }
 
-// Subtracts a value from the current value of a variable.
-//
-// Any ReadVariableOp which depends directly or indirectly on this assign is
-// guaranteed to see the incremented value or a subsequent newer one.
-//
-// Outputs the incremented value, which can be used to totally order the
-// increments to this variable.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
-//
-// Returns the created operation.
-func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AssignSubVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Assigns a new value to a variable.
 //
 // Any ReadVariableOp with a control dependency on this op is guaranteed to return
@@ -621,7 +595,8 @@ func Bitcast(scope *Scope, input tf.Output, type_ tf.DataType) (output tf.Output
 // input stride, specifying how far two consecutive patch samples are in the
 // input. Equivalent to extracting patches with
 // `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
-// subsampling them spatially by a factor of `rates`.
+// subsampling them spatially by a factor of `rates`. This is equivalent to
+// `rate` in dilated (a.k.a. Atrous) convolutions.
 //	padding: The type of padding algorithm to use.
 //
 // We specify the size-related attributes as:
@@ -634,7 +609,8 @@ func Bitcast(scope *Scope, input tf.Output, type_ tf.DataType) (output tf.Output
 //
 // Returns 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
 // ksize_cols * depth]` containing image patches with size
-// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension.
+// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
+// `out_rows` and `out_cols` are the dimensions of the output patches.
 func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides []int64, rates []int64, padding string) (patches tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -1154,7 +1130,8 @@ type SqueezeAttr func(optionalAttr)
 // SqueezeSqueezeDims sets the optional squeeze_dims attribute to value.
 //
 // value: If specified, only squeezes the dimensions listed. The dimension
-// index starts at 0. It is an error to squeeze a dimension that is not 1.
+// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
+// be in the range `[-rank(input), rank(input))`.
 // If not specified, defaults to <>
 //
 // REQUIRES: len(value) >= 0
@@ -1292,6 +1269,46 @@ func MirrorPad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (
 	return op.Output(0)
 }
 
+// Pads a tensor.
+//
+// This operation pads `input` according to the `paddings` and `constant_values`
+// you specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is
+// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many padding values to add before the contents of `input` in that dimension,
+// and `paddings[D, 1]` indicates how many padding values to add after the contents
+// of `input` in that dimension. `constant_values` is a scalar tensor of the same
+// type as `input` that indicates the value to use for padding `input`.
+//
+// The padded size of each dimension D of the output is:
+//
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # 'constant_values' is 0
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+func PadV2(scope *Scope, input tf.Output, paddings tf.Output, constant_values tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "PadV2",
+		Input: []tf.Input{
+			input, paddings, constant_values,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Return the reduction indices for computing gradients of s0 op s1 with broadcast.
 //
 // This is typically used by gradient computations for a broadcasting operation.
@@ -1498,48 +1515,6 @@ func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (outp
 	return op.Output(0)
 }
 
-// ShapeNAttr is an optional argument to ShapeN.
-type ShapeNAttr func(optionalAttr)
-
-// ShapeNOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeNOutType(value tf.DataType) ShapeNAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Returns shape of tensors.
-//
-// This operation returns N 1-D integer tensors representing shape of `input[i]s`.
-func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ShapeN",
-		Input: []tf.Input{
-			tf.OutputList(input),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("ShapeN", err)
-		return
-	}
-	return output
-}
-
 // Checks a tensor for NaN and Inf values.
 //
 // When run, reports an `InvalidArgument` error if `tensor` has any values
@@ -1646,30 +1621,25 @@ func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// Return a tensor with the same shape and contents as the input tensor or value.
-func Identity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Identity",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Gather values or slices from `params` according to `indices`.
+// Gather slices from `params` into a Tensor with shape specified by `indices`.
 //
-// `indices` is an integer tensor containing indices into `params`.  The last
-// dimension of `indices` can be at most the rank of `params`:
+// `indices` is an K-dimensional integer tensor, best thought of as a
+// (K-1)-dimensional tensor of indices into `params`, where each element defines a
+// slice of `params`:
+//
+//     output[i_0, ..., i_{K-2}] = params[indices[i0, ..., i_{K-2}]]
+//
+// Whereas in @{tf.gather} `indices` defines slices into the first
+// dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the
+// first `N` dimensions of `params`, where `N = indices.shape[-1]`.
+//
+// The last dimension of `indices` can be at most the rank of
+// `params`:
 //
 //     indices.shape[-1] <= params.rank
 //
 // The last dimension of `indices` corresponds to elements
-// (if `indices.shape[-1] = params.rank`) or slices
+// (if `indices.shape[-1] == params.rank`) or slices
 // (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
 // of `params`.  The output tensor has shape
 //
@@ -2320,6 +2290,24 @@ func DecodeWav(scope *Scope, contents tf.Output, optional ...DecodeWavAttr) (aud
 	return op.Output(0), op.Output(1)
 }
 
+// Elementwise computes the bitwise AND of `x` and `y`.
+//
+// The result will have those bits set, that are set in both `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseAnd",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
 type AllCandidateSamplerAttr func(optionalAttr)
 
@@ -2781,8 +2769,8 @@ func SpaceToDepth(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 //
 // Creates a new tensor by applying sparse `updates` to individual
 // values or slices within a zero tensor of the given `shape` according to
-// indices.  This operator is the inverse of the [tf.gather_nd](#gather_nd)
-// operator which extracts values or slices from a given tensor.
+// indices.  This operator is the inverse of the @{tf.gather_nd} operator which
+// extracts values or slices from a given tensor.
 //
 // **WARNING**: The order in which updates are applied is nondeterministic, so the
 // output will be nondeterministic if `indices` contains duplicates.
@@ -4348,6 +4336,86 @@ func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source
 	return op.Output(0), op.Output(1)
 }
 
+// StackPushV2Attr is an optional argument to StackPushV2.
+type StackPushV2Attr func(optionalAttr)
+
+// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
+//
+// value: Swap `elem` to CPU. Default to false.
+// If not specified, defaults to false
+func StackPushV2SwapMemory(value bool) StackPushV2Attr {
+	return func(m optionalAttr) {
+		m["swap_memory"] = value
+	}
+}
+
+// Push an element onto the stack.
+//
+// Arguments:
+//	handle: The handle to a stack.
+//	elem: The tensor to be pushed onto the stack.
+//
+// Returns The same tensor as the input 'elem'.
+func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StackPushV2",
+		Input: []tf.Input{
+			handle, elem,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StackV2Attr is an optional argument to StackV2.
+type StackV2Attr func(optionalAttr)
+
+// StackV2StackName sets the optional stack_name attribute to value.
+//
+// value: Overrides the name used for the temporary stack resource. Default
+// value is the name of the 'Stack' op (which is guaranteed unique).
+// If not specified, defaults to ""
+func StackV2StackName(value string) StackV2Attr {
+	return func(m optionalAttr) {
+		m["stack_name"] = value
+	}
+}
+
+// A stack that produces elements in first-in last-out order.
+//
+// Arguments:
+//	max_size: The maximum size of the stack if non-negative. If negative, the stack
+// size is unlimited.
+//	elem_type: The type of the elements on the stack.
+//
+// Returns The handle to the stack.
+func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"elem_type": elem_type}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StackV2",
+		Input: []tf.Input{
+			max_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns the batched diagonal part of a batched tensor.
 //
 // This operation returns a tensor with the `diagonal` part
@@ -4398,25 +4466,21 @@ func MatrixDiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
 	return op.Output(0)
 }
 
-// Concatenates tensors along one dimension.
+// Returns true if queue is closed.
 //
-// Arguments:
-//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [-rank(values), rank(values)).
+// This operation returns true if the queue is closed and false if the queue
+// is open.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
+// Arguments:
+//	handle: The handle to a queue.
+func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ConcatV2",
+		Type: "QueueIsClosedV2",
 		Input: []tf.Input{
-			tf.OutputList(values), axis,
+			handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -5296,33 +5360,57 @@ func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged
 	return op.Output(0)
 }
 
-// QueueCloseV2Attr is an optional argument to QueueCloseV2.
-type QueueCloseV2Attr func(optionalAttr)
+// Return a tensor with the same shape and contents as the input tensor or value.
+func Identity(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Identity",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
+// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle.
+type IteratorFromStringHandleAttr func(optionalAttr)
+
+// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
 //
-// value: If true, all pending enqueue requests that are
-// blocked on the given queue will be canceled.
-// If not specified, defaults to false
-func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
+// value: If specified, defines the type of each tuple component in an
+// element produced by the resulting iterator.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["cancel_pending_enqueues"] = value
+		m["output_types"] = value
 	}
 }
 
-// Closes the given queue.
+// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
 //
-// This operation signals that no more elements will be enqueued in the
-// given queue. Subsequent Enqueue(Many) operations will fail.
-// Subsequent Dequeue(Many) operations will continue to succeed if
-// sufficient elements remain in the queue. Subsequent Dequeue(Many)
-// operations that would block will fail immediately.
+// value: If specified, defines the shape of each tuple component in an
+// element produced by the resulting iterator.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr {
+	return func(m optionalAttr) {
+		m["output_shapes"] = value
+	}
+}
+
+// Converts the given string representing a handle to an iterator to a resource.
 //
 // Arguments:
-//	handle: The handle to a queue.
+//	string_handle: A string representation of the given handle.
 //
-// Returns the created operation.
-func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
+// Returns A handle to an iterator resource.
+func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -5331,23 +5419,130 @@ func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueCloseV2",
+		Type: "IteratorFromStringHandle",
 		Input: []tf.Input{
-			handle,
+			string_handle,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Releases any resources used by the given iterator.
-//
-// Returns the created operation.
-func IteratorDispose(scope *Scope, iterator tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// ShapeNAttr is an optional argument to ShapeN.
+type ShapeNAttr func(optionalAttr)
+
+// ShapeNOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeNOutType(value tf.DataType) ShapeNAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
 	}
-	opspec := tf.OpSpec{
+}
+
+// Returns shape of tensors.
+//
+// This operation returns N 1-D integer tensors representing shape of `input[i]s`.
+func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ShapeN",
+		Input: []tf.Input{
+			tf.OutputList(input),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("ShapeN", err)
+		return
+	}
+	return output
+}
+
+// Converts the given `resource_handle` representing an iterator to a string.
+//
+// Arguments:
+//	resource_handle: A handle to an iterator resource.
+//
+// Returns A string representation of the given handle.
+func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IteratorToStringHandle",
+		Input: []tf.Input{
+			resource_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// QueueCloseV2Attr is an optional argument to QueueCloseV2.
+type QueueCloseV2Attr func(optionalAttr)
+
+// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
+//
+// value: If true, all pending enqueue requests that are
+// blocked on the given queue will be canceled.
+// If not specified, defaults to false
+func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
+	return func(m optionalAttr) {
+		m["cancel_pending_enqueues"] = value
+	}
+}
+
+// Closes the given queue.
+//
+// This operation signals that no more elements will be enqueued in the
+// given queue. Subsequent Enqueue(Many) operations will fail.
+// Subsequent Dequeue(Many) operations will continue to succeed if
+// sufficient elements remain in the queue. Subsequent Dequeue(Many)
+// operations that would block will fail immediately.
+//
+// Arguments:
+//	handle: The handle to a queue.
+//
+// Returns the created operation.
+func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueCloseV2",
+		Input: []tf.Input{
+			handle,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Releases any resources used by the given iterator.
+//
+// Returns the created operation.
+func IteratorDispose(scope *Scope, iterator tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
 		Type: "IteratorDispose",
 		Input: []tf.Input{
 			iterator,
@@ -5543,6 +5738,24 @@ func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, out
 	return op.Output(0)
 }
 
+// Identity op for gradient debugging.
+//
+// This op is hidden from public in Python. It is used by TensorFlow Debugger to
+// register gradient tensors for gradient debugging.
+func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DebugGradientIdentity",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Deprecated. Use TensorArrayGradV3
 func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
 	if scope.Err() != nil {
@@ -5646,23 +5859,58 @@ func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtyp
 	return op.Output(0), op.Output(1)
 }
 
-// Creates a dataset that emits the outputs of `input_dataset` `count` times.
+// Concatenates tensors along one dimension.
 //
 // Arguments:
+//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [-rank(values), rank(values)).
 //
-//	count: A scalar representing the number of times that `input_dataset` should
-// be repeated. A value of `-1` indicates that it should be repeated infinitely.
-//
-//
-func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConcatV2",
+		Input: []tf.Input{
+			tf.OutputList(values), axis,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that contains the elements of `input_dataset` ignoring errors.
+func IgnoreErrorsDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RepeatDataset",
+		Type: "IgnoreErrorsDataset",
 		Input: []tf.Input{
-			input_dataset, count,
+			input_dataset,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
+func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ConcatenateDataset",
+		Input: []tf.Input{
+			input_dataset, another_dataset,
 		},
 		Attrs: attrs,
 	}
@@ -6052,104 +6300,248 @@ func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Ou
 	return op.Output(0)
 }
 
-// Draw bounding boxes on a batch of images.
-//
-// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
-// boxes specified by the locations in `boxes`. The coordinates of the each
-// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example, if an image is 100 x 200 pixels and the bounding box is
-// `[0.1, 0.2, 0.5, 0.9]`, the bottom-left and upper-right coordinates of the
-// bounding box will be `(10, 40)` to `(50, 180)`.
-//
-// Parts of the bounding box may fall outside the image.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
-//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
-// boxes.
+// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
+type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
+
+// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
 //
-// Returns 4-D with the same shape as `images`. The batch of input images with
-// bounding boxes drawn on the images.
-func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DrawBoundingBoxes",
-		Input: []tf.Input{
-			images, boxes,
-		},
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Convert one or more images from HSV to RGB.
-//
-// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
-//
-// See `rgb_to_hsv` for a description of the HSV encoding.
+// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
 //
-// Arguments:
-//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
-//
-// Returns `images` converted to RGB.
-func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "HSVToRGB",
-		Input: []tf.Input{
-			images,
-		},
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Decode the first frame of a GIF-encoded image to a uint8 tensor.
-//
-// GIF with frame or transparency compression are not supported
-// convert animated GIF from compressed to uncompressed by:
-//
-//     convert $src.gif -coalesce $dst.gif
-//
-// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
-// `tf.image.decode_image`.
+// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
 //
-// Arguments:
-//	contents: 0-D.  The GIF-encoded image.
-//
-// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
-func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeGif",
-		Input: []tf.Input{
-			contents,
-		},
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// DecodePngAttr is an optional argument to DecodePng.
-type DecodePngAttr func(optionalAttr)
-
-// DecodePngChannels sets the optional channels attribute to value.
+// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodePngChannels(value int64) DecodePngAttr {
-	return func(m optionalAttr) {
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within in this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
+//
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+//
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["use_image_if_no_bounding_boxes"] = value
+	}
+}
+
+// Generate a single randomly distorted bounding box for an image.
+//
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
+//
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
+//
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.image_summary('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
+//
+// Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
+//	min_object_covered: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+//
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SampleDistortedBoundingBoxV2",
+		Input: []tf.Input{
+			image_size, bounding_boxes, min_object_covered,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Draw bounding boxes on a batch of images.
+//
+// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
+// boxes specified by the locations in `boxes`. The coordinates of the each
+// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example, if an image is 100 x 200 pixels and the bounding box is
+// `[0.1, 0.2, 0.5, 0.9]`, the bottom-left and upper-right coordinates of the
+// bounding box will be `(10, 40)` to `(50, 180)`.
+//
+// Parts of the bounding box may fall outside the image.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
+//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
+// boxes.
+//
+// Returns 4-D with the same shape as `images`. The batch of input images with
+// bounding boxes drawn on the images.
+func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DrawBoundingBoxes",
+		Input: []tf.Input{
+			images, boxes,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Convert one or more images from HSV to RGB.
+//
+// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
+//
+// See `rgb_to_hsv` for a description of the HSV encoding.
+//
+// Arguments:
+//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+//
+// Returns `images` converted to RGB.
+func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "HSVToRGB",
+		Input: []tf.Input{
+			images,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Decode the first frame of a GIF-encoded image to a uint8 tensor.
+//
+// GIF with frame or transparency compression are not supported
+// convert animated GIF from compressed to uncompressed by:
+//
+//     convert $src.gif -coalesce $dst.gif
+//
+// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
+// `tf.image.decode_image`.
+//
+// Arguments:
+//	contents: 0-D.  The GIF-encoded image.
+//
+// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
+func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeGif",
+		Input: []tf.Input{
+			contents,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DecodePngAttr is an optional argument to DecodePng.
+type DecodePngAttr func(optionalAttr)
+
+// DecodePngChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodePngChannels(value int64) DecodePngAttr {
+	return func(m optionalAttr) {
 		m["channels"] = value
 	}
 }
@@ -6457,39 +6849,6 @@ func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
 	return op.Output(0)
 }
 
-// Shuffle dimensions of x according to a permutation.
-//
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Transpose",
-		Input: []tf.Input{
-			x, perm,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Reads and outputs the entire contents of the input filename.
-func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReadFile",
-		Input: []tf.Input{
-			filename,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Store the input tensor in the state of the current session.
 //
 // Arguments:
@@ -6558,92 +6917,143 @@ func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
 	return scope.AddOperation(opspec)
 }
 
-// Computes softplus gradients for a softplus operation.
+// Returns up to `num_records` (key, value) pairs produced by a Reader.
+//
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
+// It may return less than `num_records` even before the last batch.
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding softplus operation.
-//	features: The features passed as input to the corresponding softplus operation.
+//	reader_handle: Handle to a `Reader`.
+//	queue_handle: Handle to a `Queue`, with string work items.
+//	num_records: number of records to read from `Reader`.
 //
-// Returns The gradients: `gradients / (1 + exp(-features))`.
-func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Returns A 1-D tensor.A 1-D tensor.
+func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SoftplusGrad",
+		Type: "ReaderReadUpToV2",
 		Input: []tf.Input{
-			gradients, features,
+			reader_handle, queue_handle, num_records,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes softplus: `log(exp(features) + 1)`.
-func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Softplus",
+// Returns the next record (key, value pair) produced by a Reader.
+//
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//	queue_handle: Handle to a Queue, with string work items.
+//
+// Returns A scalar.A scalar.
+func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderReadV2",
 		Input: []tf.Input{
-			features,
+			reader_handle, queue_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// BatchMatMulAttr is an optional argument to BatchMatMul.
-type BatchMatMulAttr func(optionalAttr)
+// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
+type IdentityReaderV2Attr func(optionalAttr)
 
-// BatchMatMulAdjX sets the optional adj_x attribute to value.
+// IdentityReaderV2Container sets the optional container attribute to value.
 //
-// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjX(value bool) BatchMatMulAttr {
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
 	return func(m optionalAttr) {
-		m["adj_x"] = value
+		m["container"] = value
 	}
 }
 
-// BatchMatMulAdjY sets the optional adj_y attribute to value.
+// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
 	return func(m optionalAttr) {
-		m["adj_y"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Multiplies slices of two tensors in batches.
-//
-// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-// viewed as an element of a batch), and arranges the individual results
-// in a single output tensor of the same batch size. Each of the
-// individual slices can optionally be adjointed (to adjoint a matrix
-// means to transpose and conjugate it) before multiplication by setting
-// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
-//
-// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-// and `[..., r_y, c_y]`.
-//
-// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+// A Reader that outputs the queued work as both the key and value.
 //
-//     r_o = c_x if adj_x else r_x
-//     c_o = r_y if adj_y else c_y
+// To use, enqueue strings in a Queue.  ReaderRead will take the front
+// work string and output (work, work).
 //
-// It is computed as:
+// Returns The handle to reference the Reader.
+func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "IdentityReaderV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
+type TFRecordReaderV2Attr func(optionalAttr)
+
+// TFRecordReaderV2Container sets the optional container attribute to value.
 //
-//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Arguments:
-//	x: 2-D or higher with shape `[..., r_x, c_x]`.
-//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
+// If not specified, defaults to ""
+func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
+	}
+}
+
+// A Reader that outputs the records from a TensorFlow Records file.
 //
-// Returns 3-D or higher with shape `[..., r_o, c_o]`
-func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
+// Returns The handle to reference the Reader.
+func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6652,59 +7062,53 @@ func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMul
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BatchMatMul",
-		Input: []tf.Input{
-			x, y,
-		},
+		Type: "TFRecordReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
-type SparseTensorDenseMatMulAttr func(optionalAttr)
+// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
+type TextLineReaderV2Attr func(optionalAttr)
 
-// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
+// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
 //
-// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
-// is transpose(conj(A)).  Otherwise it's transpose(A).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+// value: Number of lines to skip from the beginning of every file.
+// If not specified, defaults to 0
+func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
 	return func(m optionalAttr) {
-		m["adjoint_a"] = value
+		m["skip_header_lines"] = value
 	}
 }
 
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+// TextLineReaderV2Container sets the optional container attribute to value.
 //
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
 	return func(m optionalAttr) {
-		m["adjoint_b"] = value
+		m["container"] = value
 	}
 }
 
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
-//
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
+// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the lines of a file delimited by '\n'.
 //
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+// Returns The handle to reference the Reader.
+func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6713,10 +7117,8 @@ func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
-		},
+		Type: "TextLineReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
@@ -7157,6 +7559,48 @@ func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
+type QuantizeAndDequantizeV3Attr func(optionalAttr)
+
+// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
+	return func(m optionalAttr) {
+		m["signed_input"] = value
+	}
+}
+
+// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// Quantizes then dequantizes a tensor.
+//
+// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
+// tensor, so its value can change during training.
+func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizeAndDequantizeV3",
+		Input: []tf.Input{
+			input, input_min, input_max, num_bits,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AvgPool3DAttr is an optional argument to AvgPool3D.
 type AvgPool3DAttr func(optionalAttr)
 
@@ -7495,67 +7939,84 @@ func RsqrtGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Execute a sub graph on a remote processor transferred by GraphTransferer.
+// Makes its input available to the next iteration.
 //
-// The graph specifications are serialized by protobuf as graph_transfer_info.
-// The implementation / limitations may differ for each platform
-// and each available peripheral.
-func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
+// Arguments:
+//	data: The tensor to be made available to the next iteration.
+//
+// Returns The same tensor as `data`.
+func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
 	opspec := tf.OpSpec{
-		Type: "RemoteFusedGraphExecute",
+		Type: "NextIteration",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			data,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Does nothing. Only useful as a placeholder for control edges.
+//
+// Returns the created operation.
+func NoOp(scope *Scope) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("RemoteFusedGraphExecute", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "NoOp",
 	}
-	return outputs
+	return scope.AddOperation(opspec)
 }
 
-// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
-type Conv3DBackpropFilterV2Attr func(optionalAttr)
-
-// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
+// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
+type DepthwiseConv2dNativeAttr func(optionalAttr)
+
+// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the filter.
+// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
+//
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
+// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
+// a different filter to each input channel (expanding from 1 channel to
+// `channel_multiplier` channels for each), then concatenates the results
+// together. Thus, the output has `in_channels * channel_multiplier` channels.
+//
+// ```
+// for k in 0..in_channels-1
+//   for q in 0..channel_multiplier-1
+//     output[b, i, j, k * channel_multiplier + q] =
+//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
+//                         filter[di, dj, k, q]
+// ```
+//
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 5-D
-// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
-// tensor.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//
+//
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`.
 //	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
+func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7564,9 +8025,9 @@ func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Outpu
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilterV2",
+		Type: "DepthwiseConv2dNative",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
@@ -7574,112 +8035,122 @@ func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Outpu
 	return op.Output(0)
 }
 
-// MapClearAttr is an optional argument to MapClear.
-type MapClearAttr func(optionalAttr)
+// CropAndResizeAttr is an optional argument to CropAndResize.
+type CropAndResizeAttr func(optionalAttr)
 
-// MapClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// CropAndResizeMethod sets the optional method attribute to value.
 //
-// REQUIRES: value >= 0
-func MapClearCapacity(value int64) MapClearAttr {
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeMethod(value string) CropAndResizeAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["method"] = value
 	}
 }
 
-// MapClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
 //
-// REQUIRES: value >= 0
-func MapClearMemoryLimit(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapClearContainer(value string) MapClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapClearSharedName(value string) MapClearAttr {
+// value: Value used for extrapolation, when applicable.
+// If not specified, defaults to 0
+func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["extrapolation_value"] = value
 	}
 }
 
-// Op removes all elements in the underlying container.
+// Extracts crops from the input image tensor and bilinearly resizes them (possibly
 //
-// Returns the created operation.
-func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
+// with aspect ratio change) to a common output size specified by `crop_size`. This
+// is more general than the `crop_to_bounding_box` op which extracts a fixed size
+// slice from the input image and does not allow resizing or aspect ratio change.
+//
+// Returns a tensor with `crops` from the input `image` at positions defined at the
+// bounding box locations in `boxes`. The cropped boxes are all resized (with
+// bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
+// result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`.
+//
+// Arguments:
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
+// cropped image patches are resized to this size. The aspect ratio of the image
+// content is not preserved. Both `crop_height` and `crop_width` need to be
+// positive.
+//
+// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapClear",
-
+		Type: "CropAndResize",
+		Input: []tf.Input{
+			image, boxes, box_ind, crop_size,
+		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// TensorArrayV2Attr is an optional argument to TensorArrayV2.
-type TensorArrayV2Attr func(optionalAttr)
-
-// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
-	}
-}
-
-// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
-// If not specified, defaults to false
-func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["dynamic_size"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
-// If not specified, defaults to true
-func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["clear_after_read"] = value
-	}
-}
+// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
+type MaxPoolGradAttr func(optionalAttr)
 
-// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
-// If not specified, defaults to ""
-func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+// MaxPoolGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
 	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
+		m["data_format"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayV3
-func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+// Computes gradients of the maxpooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV2",
+		Type: "MaxPoolGrad",
 		Input: []tf.Input{
-			size,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -7687,109 +8158,150 @@ func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...
 	return op.Output(0)
 }
 
-// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`.
-//
-// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
-// is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The serialized
-// `SparseTensor` objects going into each row of `serialized_sparse` will have
-// rank `R-1`.
+// Adds `bias` to `value`.
 //
-// The minibatch size `N` is extracted from `sparse_shape[0]`.
+// This is a deprecated version of BiasAdd and will be soon removed.
 //
-// Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) (serialized_sparse tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SerializeManySparse",
-		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Makes its input available to the next iteration.
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
 //
 // Arguments:
-//	data: The tensor to be made available to the next iteration.
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
 //
-// Returns The same tensor as `data`.
-func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NextIteration",
+		Type: "BiasAddV1",
 		Input: []tf.Input{
-			data,
+			value, bias,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Does nothing. Only useful as a placeholder for control edges.
+// EncodeJpegAttr is an optional argument to EncodeJpeg.
+type EncodeJpegAttr func(optionalAttr)
+
+// EncodeJpegFormat sets the optional format attribute to value.
 //
-// Returns the created operation.
-func NoOp(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// value: Per pixel image format.
+// If not specified, defaults to ""
+func EncodeJpegFormat(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["format"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "NoOp",
+}
+
+// EncodeJpegQuality sets the optional quality attribute to value.
+//
+// value: Quality of the compression from 0 to 100 (higher is better and slower).
+// If not specified, defaults to 95
+func EncodeJpegQuality(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["quality"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Computes softsign: `features / (abs(features) + 1)`.
-func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
+// EncodeJpegProgressive sets the optional progressive attribute to value.
+//
+// value: If True, create a JPEG that loads progressively (coarse to fine).
+// If not specified, defaults to false
+func EncodeJpegProgressive(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["progressive"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Softsign",
-		Input: []tf.Input{
-			features,
-		},
+}
+
+// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
+//
+// value: If True, spend CPU/RAM to reduce size with no quality change.
+// If not specified, defaults to false
+func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["optimize_size"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResizeBilinearAttr is an optional argument to ResizeBilinear.
-type ResizeBilinearAttr func(optionalAttr)
+// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+//
+// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
+// If not specified, defaults to true
+func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["chroma_downsampling"] = value
+	}
+}
 
-// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+// value: Unit used to specify `x_density` and `y_density`:
+// pixels per inch (`'in'`) or centimeter (`'cm'`).
+// If not specified, defaults to "in"
+func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["density_unit"] = value
 	}
 }
 
-// Resize `images` to `size` using bilinear interpolation.
+// EncodeJpegXDensity sets the optional x_density attribute to value.
 //
-// Input images can be of different types but output images are always float.
+// value: Horizontal pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["x_density"] = value
+	}
+}
+
+// EncodeJpegYDensity sets the optional y_density attribute to value.
+//
+// value: Vertical pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegYDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["y_density"] = value
+	}
+}
+
+// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+//
+// value: If not empty, embed this XMP metadata in the image header.
+// If not specified, defaults to ""
+func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["xmp_metadata"] = value
+	}
+}
+
+// JPEG-encode an image.
+//
+// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
+//
+// The attr `format` can be used to override the color format of the encoded
+// output.  Values can be:
+//
+// *   `''`: Use a default format based on the number of channels in the image.
+// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
+//     of `image` must be 1.
+// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
+//     of `image` must be 3.
+//
+// If `format` is not specified or is the empty string, a default format is picked
+// in function of the number of channels in `image`:
+//
+// *   1: Output a grayscale image.
+// *   3: Output an RGB image.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	image: 3-D with shape `[height, width, channels]`.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
+// Returns 0-D. JPEG-encoded image.
+func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7798,9 +8310,9 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinear",
+		Type: "EncodeJpeg",
 		Input: []tf.Input{
-			images, size,
+			image,
 		},
 		Attrs: attrs,
 	}
@@ -7808,93 +8320,87 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 	return op.Output(0)
 }
 
-// ProdAttr is an optional argument to Prod.
-type ProdAttr func(optionalAttr)
-
-// ProdKeepDims sets the optional keep_dims attribute to value.
+// Gradients for batch normalization.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func ProdKeepDims(value bool) ProdAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the product of elements across dimensions of a tensor.
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// This op is deprecated. See `tf.nn.batch_normalization`.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this Tensor will be multiplied
+// with the normalized Tensor.
+//	backprop: 4D backprop Tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
 //
-// Returns The reduced tensor.
-func Prod(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...ProdAttr) (output tf.Output) {
+// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
+func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "Prod",
+		Type: "BatchNormWithGlobalNormalizationGrad",
 		Input: []tf.Input{
-			input, reduction_indices,
+			t, m, v, gamma, backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
-type DepthwiseConv2dNativeAttr func(optionalAttr)
+// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
+type Conv2DBackpropInputAttr func(optionalAttr)
 
-// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
+// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
+	}
+}
+
+// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
 //
 // value: Specify the data format of the input and output data. With the
 // default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
+//     [batch, in_height, in_width, in_channels].
 // Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
+//     [batch, in_channels, in_height, in_width].
 // If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
+func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
-//
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
-// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
-// a different filter to each input channel (expanding from 1 channel to
-// `channel_multiplier` channels for each), then concatenates the results
-// together. Thus, the output has `in_channels * channel_multiplier` channels.
-//
-// for k in 0..in_channels-1
-//   for q in 0..channel_multiplier-1
-//     output[b, i, j, k * channel_multiplier + q] =
-//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
-//                         filter[di, dj, k, q]
-//
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+// Computes the gradients of convolution with respect to the input.
 //
 // Arguments:
-//
-//
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`.
+//	input_sizes: An integer vector representing the shape of `input`,
+// where `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
 //	padding: The type of padding algorithm to use.
-func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
+//
+// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
+// w.r.t. the input of the convolution.
+func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7903,9 +8409,9 @@ func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, stri
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNative",
+		Type: "Conv2DBackpropInput",
 		Input: []tf.Input{
-			input, filter,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -7913,63 +8419,60 @@ func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, stri
 	return op.Output(0)
 }
 
-// CropAndResizeAttr is an optional argument to CropAndResize.
-type CropAndResizeAttr func(optionalAttr)
+// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
+type FusedBatchNormAttr func(optionalAttr)
 
-// CropAndResizeMethod sets the optional method attribute to value.
+// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeMethod(value string) CropAndResizeAttr {
-	return func(m optionalAttr) {
-		m["method"] = value
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
 	}
 }
 
-// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
+// FusedBatchNormDataFormat sets the optional data_format attribute to value.
 //
-// value: Value used for extrapolation, when applicable.
-// If not specified, defaults to 0
-func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["extrapolation_value"] = value
+		m["data_format"] = value
 	}
 }
 
-// Extracts crops from the input image tensor and bilinearly resizes them (possibly
+// FusedBatchNormIsTraining sets the optional is_training attribute to value.
 //
-// with aspect ratio change) to a common output size specified by `crop_size`. This
-// is more general than the `crop_to_bounding_box` op which extracts a fixed size
-// slice from the input image and does not allow resizing or aspect ratio change.
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Batch normalization.
 //
-// Returns a tensor with `crops` from the input `image` at positions defined at the
-// bounding box locations in `boxes`. The cropped boxes are all resized (with
-// bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
-// result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`.
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
-// cropped image patches are resized to this size. The aspect ratio of the image
-// content is not preserved. Both `crop_height` and `crop_width` need to be
-// positive.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
 //
-// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be used in the gradient computation.
+func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7978,57 +8481,62 @@ func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResize",
+		Type: "FusedBatchNorm",
 		Input: []tf.Input{
-			image, boxes, box_ind, crop_size,
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
-type MaxPoolGradAttr func(optionalAttr)
+// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
+type RandomStandardNormalAttr func(optionalAttr)
 
-// MaxPoolGradDataFormat sets the optional data_format attribute to value.
+// RandomStandardNormalSeed sets the optional seed attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["seed"] = value
 	}
 }
 
-// Computes gradients of the maxpooling function.
+// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a normal distribution.
+//
+// The generated values will have mean 0 and standard deviation 1.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
+// Returns A tensor of the specified shape filled with random normal values.
+func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGrad",
+		Type: "RandomStandardNormal",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			shape,
 		},
 		Attrs: attrs,
 	}
@@ -8036,161 +8544,180 @@ func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad
 	return op.Output(0)
 }
 
-// Adds `bias` to `value`.
-//
-// This is a deprecated version of BiasAdd and will be soon removed.
-//
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
-//
-// Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+// Computes sigmoid of `x` element-wise.
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
+// Specifically, `y = 1 / (1 + exp(-x))`.
+func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddV1",
+		Type: "Sigmoid",
 		Input: []tf.Input{
-			value, bias,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EncodeJpegAttr is an optional argument to EncodeJpeg.
-type EncodeJpegAttr func(optionalAttr)
+// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
+type ComputeAccidentalHitsAttr func(optionalAttr)
 
-// EncodeJpegFormat sets the optional format attribute to value.
+// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
 //
-// value: Per pixel image format.
-// If not specified, defaults to ""
-func EncodeJpegFormat(value string) EncodeJpegAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
 	return func(m optionalAttr) {
-		m["format"] = value
+		m["seed"] = value
 	}
 }
 
-// EncodeJpegQuality sets the optional quality attribute to value.
+// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
 //
-// value: Quality of the compression from 0 to 100 (higher is better and slower).
-// If not specified, defaults to 95
-func EncodeJpegQuality(value int64) EncodeJpegAttr {
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
 	return func(m optionalAttr) {
-		m["quality"] = value
+		m["seed2"] = value
 	}
 }
 
-// EncodeJpegProgressive sets the optional progressive attribute to value.
+// Computes the ids of the positions in sampled_candidates that match true_labels.
 //
-// value: If True, create a JPEG that loads progressively (coarse to fine).
-// If not specified, defaults to false
-func EncodeJpegProgressive(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["progressive"] = value
+// When doing log-odds NCE, the result of this op should be passed through a
+// SparseToDense op, then added to the logits of the sampled candidates. This has
+// the effect of 'removing' the sampled labels that match the true labels by
+// making the classifier sure that they are sampled labels.
+//
+// Arguments:
+//	true_classes: The true_classes output of UnpackSparseLabels.
+//	sampled_candidates: The sampled_candidates output of CandidateSampler.
+//	num_true: Number of true labels per context.
+//
+// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
+// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
+// is -FLOAT_MAX.
+func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ComputeAccidentalHits",
+		Input: []tf.Input{
+			true_classes, sampled_candidates,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
+// StageClearAttr is an optional argument to StageClear.
+type StageClearAttr func(optionalAttr)
+
+// StageClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If True, spend CPU/RAM to reduce size with no quality change.
-// If not specified, defaults to false
-func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
+// REQUIRES: value >= 0
+func StageClearCapacity(value int64) StageClearAttr {
 	return func(m optionalAttr) {
-		m["optimize_size"] = value
+		m["capacity"] = value
 	}
 }
 
-// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+// StageClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
-// If not specified, defaults to true
-func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
+// REQUIRES: value >= 0
+func StageClearMemoryLimit(value int64) StageClearAttr {
 	return func(m optionalAttr) {
-		m["chroma_downsampling"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
-//
-// value: Unit used to specify `x_density` and `y_density`:
-// pixels per inch (`'in'`) or centimeter (`'cm'`).
-// If not specified, defaults to "in"
-func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+// StageClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageClearContainer(value string) StageClearAttr {
 	return func(m optionalAttr) {
-		m["density_unit"] = value
+		m["container"] = value
 	}
 }
 
-// EncodeJpegXDensity sets the optional x_density attribute to value.
-//
-// value: Horizontal pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+// StageClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StageClearSharedName(value string) StageClearAttr {
 	return func(m optionalAttr) {
-		m["x_density"] = value
+		m["shared_name"] = value
 	}
 }
 
-// EncodeJpegYDensity sets the optional y_density attribute to value.
+// Op removes all elements in the underlying container.
 //
-// value: Vertical pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegYDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["y_density"] = value
+// Returns the created operation.
+func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StageClear",
+
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
+type AvgPoolGradAttr func(optionalAttr)
+
+// AvgPoolGradDataFormat sets the optional data_format attribute to value.
 //
-// value: If not empty, embed this XMP metadata in the image header.
-// If not specified, defaults to ""
-func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["xmp_metadata"] = value
+		m["data_format"] = value
 	}
 }
 
-// JPEG-encode an image.
-//
-// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-//
-// The attr `format` can be used to override the color format of the encoded
-// output.  Values can be:
-//
-// *   `''`: Use a default format based on the number of channels in the image.
-// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
-//     of `image` must be 1.
-// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
-//     of `image` must be 3.
-//
-// If `format` is not specified or is the empty string, a default format is picked
-// in function of the number of channels in `image`:
-//
-// *   1: Output a grayscale image.
-// *   3: Output an RGB image.
+// Computes gradients of the average pooling function.
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
+// the output of `avg_pool`.
+//	ksize: The size of the sliding window for each dimension of the input.
+//	strides: The stride of the sliding window for each dimension of the input.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 0-D. JPEG-encoded image.
-func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
+func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeJpeg",
+		Type: "AvgPoolGrad",
 		Input: []tf.Input{
-			image,
+			orig_input_shape, grad,
 		},
 		Attrs: attrs,
 	}
@@ -8198,159 +8725,156 @@ func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (cont
 	return op.Output(0)
 }
 
-// Gradients for batch normalization.
+// Computes the maximum along segments of a tensor.
 //
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// This op is deprecated. See `tf.nn.batch_normalization`.
+// Computes a tensor such that
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+// </div>
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this Tensor will be multiplied
-// with the normalized Tensor.
-//	backprop: 4D backprop Tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
 //
-// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
-func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalizationGrad",
+		Type: "SegmentMax",
 		Input: []tf.Input{
-			t, m, v, gamma, backprop,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
-type Conv2DBackpropInputAttr func(optionalAttr)
-
-// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
+	return op.Output(0)
 }
 
-// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
+// Saves input tensors slices to disk.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+// This is like `Save` except that tensors can be listed in the saved file as being
+// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
+// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
+// have as many elements as `tensor_names`.
+//
+// Elements of the `shapes_and_slices` input must either be:
+//
+// *  The empty string, in which case the corresponding tensor is
+//    saved normally.
+// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
+//    `dimI` are the dimensions of the larger tensor and `slice-spec`
+//    specifies what part is covered by the tensor to save.
+//
+// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
+// where each `sliceI` is either:
+//
+// *  The string `-` meaning that the slice covers all indices of this dimension
+// *  `start,length` where `start` and `length` are integers.  In that
+//    case the slice covers `length` indices starting at `start`.
+//
+// See also `Save`.
+//
+// Arguments:
+//	filename: Must have a single element. The name of the file to which we write the
+// tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
+// saving the tensors.
+//	data: `N` tensors to save.
+//
+// Returns the created operation.
+func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SaveSlices",
+		Input: []tf.Input{
+			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Computes the gradients of convolution with respect to the input.
+// Returns the rank of a tensor.
 //
-// Arguments:
-//	input_sizes: An integer vector representing the shape of `input`,
-// where `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
+// This operation returns an integer representing the rank of `input`.
 //
-// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
-// w.r.t. the input of the convolution.
-func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
+// For example:
+//
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// # shape of tensor 't' is [2, 2, 3]
+// rank(t) ==> 3
+// ```
+//
+// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
+// of a tensor is the number of indices required to uniquely select each element
+// of the tensor. Rank is also known as "order", "degree", or "ndims."
+func Rank(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropInput",
+		Type: "Rank",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
-type FusedBatchNormAttr func(optionalAttr)
-
-// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
+// DecodeCSVAttr is an optional argument to DecodeCSV.
+type DecodeCSVAttr func(optionalAttr)
 
-// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
 //
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
+// value: char delimiter to separate fields in a record.
+// If not specified, defaults to ","
+func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["field_delim"] = value
 	}
 }
 
-// FusedBatchNormIsTraining sets the optional is_training attribute to value.
+// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
+// value: If false, treats double quotation marks as regular
+// characters inside of the string fields (ignoring RFC 4180, Section 2,
+// Bullet 5).
 // If not specified, defaults to true
-func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
+func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["use_quote_delim"] = value
 	}
 }
 
-// Batch normalization.
+// Convert CSV records to tensors. Each column maps to one tensor.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// RFC 4180 format is expected for the CSV records.
+// (https://tools.ietf.org/html/rfc4180)
+// Note that we allow leading and trailing spaces with int or float field.
 //
 // Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
+//	records: Each string is a record/row in the csv and all records should have
+// the same format.
+//	record_defaults: One tensor per column of the input record, with either a
+// scalar default value for that column or empty if the column is required.
 //
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be used in the gradient computation.
-func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+// Returns Each tensor will have the same shape as records.
+func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8359,62 +8883,71 @@ func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNorm",
+		Type: "DecodeCSV",
 		Input: []tf.Input{
-			x, scale, offset, mean, variance,
+			records, tf.OutputList(record_defaults),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
-type RandomStandardNormalAttr func(optionalAttr)
-
-// RandomStandardNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("DecodeCSV", err)
+		return
 	}
+	return output
 }
 
-// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
+// Convert JSON-encoded Example records to binary protocol buffer strings.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// This op translates a tensor containing Example records, encoded using
+// the [standard JSON
+// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
+// into a tensor containing the same records encoded as binary protocol
+// buffers. The resulting tensor can then be fed to any of the other
+// Example-parsing ops.
+//
+// Arguments:
+//	json_examples: Each string is a JSON object serialized according to the JSON
+// mapping of the Example proto.
+//
+// Returns Each string is a binary Example protocol buffer corresponding
+// to the respective element of `json_examples`.
+func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeJSONExample",
+		Input: []tf.Input{
+			json_examples,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs random values from a normal distribution.
-//
-// The generated values will have mean 0 and standard deviation 1.
+// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	serialized: A scalar string containing a serialized TensorProto proto.
+//	out_type: The type of the serialized tensor.  The provided type must match the
+// type of the serialized tensor and no implicit conversion will take place.
 //
-// Returns A tensor of the specified shape filled with random normal values.
-func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
+// Returns A Tensor of type `out_type`.
+func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "RandomStandardNormal",
+		Type: "ParseTensor",
 		Input: []tf.Input{
-			shape,
+			serialized,
 		},
 		Attrs: attrs,
 	}
@@ -8422,15 +8955,13 @@ func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, opti
 	return op.Output(0)
 }
 
-// Computes sigmoid of `x` element-wise.
-//
-// Specifically, `y = 1 / (1 + exp(-x))`.
-func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes acos of x element-wise.
+func Acos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sigmoid",
+		Type: "Acos",
 		Input: []tf.Input{
 			x,
 		},
@@ -8439,592 +8970,129 @@ func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
-type ComputeAccidentalHitsAttr func(optionalAttr)
-
-// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
+// Copy a tensor setting everything outside a central band in each innermost matrix
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// to zero.
+//
+// The `band` part is computed as follows:
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor with the same shape where
+//
+// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
+//
+// The indicator function
+//
+// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
+//                  (num_upper < 0 || (n-m) <= num_upper)`.
+//
+// For example:
+//
+// ```
+// # if 'input' is [[ 0,  1,  2, 3]
+//                  [-1,  0,  1, 2]
+//                  [-2, -1,  0, 1]
+//                  [-3, -2, -1, 0]],
+//
+// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+//                                        [-1,  0,  1, 2]
+//                                        [ 0, -1,  0, 1]
+//                                        [ 0,  0, -1, 0]],
+//
+// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+//                                       [-1,  0,  1, 0]
+//                                       [-2, -1,  0, 1]
+//                                       [ 0, -2, -1, 0]]
+// ```
+//
+// Useful special cases:
+//
+// ```
+//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
+// ```
+//
+// Arguments:
+//	input: Rank `k` tensor.
+//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
+// lower triangle.
+//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
+// entire upper triangle.
+//
+// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
+func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixBandPart",
+		Input: []tf.Input{
+			input, num_lower, num_upper,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
+// DecodeRawAttr is an optional argument to DecodeRaw.
+type DecodeRawAttr func(optionalAttr)
+
+// DecodeRawLittleEndian sets the optional little_endian attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
+// value: Whether the input `bytes` are in little-endian order.
+// Ignored for `out_type` values that are stored in a single byte like
+// `uint8`.
+// If not specified, defaults to true
+func DecodeRawLittleEndian(value bool) DecodeRawAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["little_endian"] = value
 	}
 }
 
-// Computes the ids of the positions in sampled_candidates that match true_labels.
-//
-// When doing log-odds NCE, the result of this op should be passed through a
-// SparseToDense op, then added to the logits of the sampled candidates. This has
-// the effect of 'removing' the sampled labels that match the true labels by
-// making the classifier sure that they are sampled labels.
+// Reinterpret the bytes of a string as a vector of numbers.
 //
 // Arguments:
-//	true_classes: The true_classes output of UnpackSparseLabels.
-//	sampled_candidates: The sampled_candidates output of CandidateSampler.
-//	num_true: Number of true labels per context.
+//	bytes: All the elements must have the same length.
 //
-// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
-// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
-// is -FLOAT_MAX.
-func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
+//
+// Returns A Tensor with one more dimension than the input `bytes`.  The
+// added dimension will have size equal to the length of the elements
+// of `bytes` divided by the number of bytes to represent `out_type`.
+func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true}
+	attrs := map[string]interface{}{"out_type": out_type}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ComputeAccidentalHits",
+		Type: "DecodeRaw",
 		Input: []tf.Input{
-			true_classes, sampled_candidates,
+			bytes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// StageClearAttr is an optional argument to StageClear.
-type StageClearAttr func(optionalAttr)
+// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
+type QueueDequeueV2Attr func(optionalAttr)
 
-// StageClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// REQUIRES: value >= 0
-func StageClearCapacity(value int64) StageClearAttr {
+// value: If the queue is empty, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// StageClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageClearMemoryLimit(value int64) StageClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// StageClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StageClearContainer(value string) StageClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// StageClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StageClearSharedName(value string) StageClearAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes all elements in the underlying container.
-//
-// Returns the created operation.
-func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StageClear",
-
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
-type AvgPoolGradAttr func(optionalAttr)
-
-// AvgPoolGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the average pooling function.
-//
-// Arguments:
-//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
-// the output of `avg_pool`.
-//	ksize: The size of the sliding window for each dimension of the input.
-//	strides: The stride of the sliding window for each dimension of the input.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
-func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AvgPoolGrad",
-		Input: []tf.Input{
-			orig_input_shape, grad,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the maximum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SegmentMax",
-		Input: []tf.Input{
-			data, segment_ids,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Saves input tensors slices to disk.
-//
-// This is like `Save` except that tensors can be listed in the saved file as being
-// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
-// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
-// have as many elements as `tensor_names`.
-//
-// Elements of the `shapes_and_slices` input must either be:
-//
-// *  The empty string, in which case the corresponding tensor is
-//    saved normally.
-// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
-//    `dimI` are the dimensions of the larger tensor and `slice-spec`
-//    specifies what part is covered by the tensor to save.
-//
-// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
-// where each `sliceI` is either:
-//
-// *  The string `-` meaning that the slice covers all indices of this dimension
-// *  `start,length` where `start` and `length` are integers.  In that
-//    case the slice covers `length` indices starting at `start`.
-//
-// See also `Save`.
-//
-// Arguments:
-//	filename: Must have a single element. The name of the file to which we write the
-// tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
-// saving the tensors.
-//	data: `N` tensors to save.
-//
-// Returns the created operation.
-func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SaveSlices",
-		Input: []tf.Input{
-			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Writes contents to the file at input filename. Creates file if not existing.
-//
-// Arguments:
-//	filename: scalar. The name of the file to which we write the contents.
-//	contents: scalar. The content to be written to the output file.
-//
-// Returns the created operation.
-func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "WriteFile",
-		Input: []tf.Input{
-			filename, contents,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes the Cholesky decomposition of one or more square matrices.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices.
-//
-// The input has to be symmetric and positive definite. Only the lower-triangular
-// part of the input will be used for this operation. The upper-triangular part
-// will not be read.
-//
-// The output is a tensor of the same shape as the input
-// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
-//
-// **Note**: The gradient computation on GPU is faster for large matrices but
-// not for large batch dimensions when the submatrices are small. In this
-// case it might be faster to use the CPU.
-//
-// Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
-func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Cholesky",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the rank of a tensor.
-//
-// This operation returns an integer representing the rank of `input`.
-//
-// For example:
-//
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// # shape of tensor 't' is [2, 2, 3]
-// rank(t) ==> 3
-// ```
-//
-// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
-// of a tensor is the number of indices required to uniquely select each element
-// of the tensor. Rank is also known as "order", "degree", or "ndims."
-func Rank(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Rank",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DecodeCSVAttr is an optional argument to DecodeCSV.
-type DecodeCSVAttr func(optionalAttr)
-
-// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
-//
-// value: char delimiter to separate fields in a record.
-// If not specified, defaults to ","
-func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["field_delim"] = value
-	}
-}
-
-// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
-//
-// value: If false, treats double quotation marks as regular
-// characters inside of the string fields (ignoring RFC 4180, Section 2,
-// Bullet 5).
-// If not specified, defaults to true
-func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["use_quote_delim"] = value
-	}
-}
-
-// Convert CSV records to tensors. Each column maps to one tensor.
-//
-// RFC 4180 format is expected for the CSV records.
-// (https://tools.ietf.org/html/rfc4180)
-// Note that we allow leading and trailing spaces with int or float field.
-//
-// Arguments:
-//	records: Each string is a record/row in the csv and all records should have
-// the same format.
-//	record_defaults: One tensor per column of the input record, with either a
-// scalar default value for that column or empty if the column is required.
-//
-// Returns Each tensor will have the same shape as records.
-func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeCSV",
-		Input: []tf.Input{
-			records, tf.OutputList(record_defaults),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("DecodeCSV", err)
-		return
-	}
-	return output
-}
-
-// Convert JSON-encoded Example records to binary protocol buffer strings.
-//
-// This op translates a tensor containing Example records, encoded using
-// the [standard JSON
-// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
-// into a tensor containing the same records encoded as binary protocol
-// buffers. The resulting tensor can then be fed to any of the other
-// Example-parsing ops.
-//
-// Arguments:
-//	json_examples: Each string is a JSON object serialized according to the JSON
-// mapping of the Example proto.
-//
-// Returns Each string is a binary Example protocol buffer corresponding
-// to the respective element of `json_examples`.
-func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeJSONExample",
-		Input: []tf.Input{
-			json_examples,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
-//
-// Arguments:
-//	serialized: A scalar string containing a serialized TensorProto proto.
-//	out_type: The type of the serialized tensor.  The provided type must match the
-// type of the serialized tensor and no implicit conversion will take place.
-//
-// Returns A Tensor of type `out_type`.
-func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"out_type": out_type}
-	opspec := tf.OpSpec{
-		Type: "ParseTensor",
-		Input: []tf.Input{
-			serialized,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes acos of x element-wise.
-func Acos(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Acos",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Copy a tensor setting everything outside a central band in each innermost matrix
-//
-// to zero.
-//
-// The `band` part is computed as follows:
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor with the same shape where
-//
-// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
-//
-// The indicator function
-//
-// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-//                  (num_upper < 0 || (n-m) <= num_upper)`.
-//
-// For example:
-//
-// ```
-// # if 'input' is [[ 0,  1,  2, 3]
-//                  [-1,  0,  1, 2]
-//                  [-2, -1,  0, 1]
-//                  [-3, -2, -1, 0]],
-//
-// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-//                                        [-1,  0,  1, 2]
-//                                        [ 0, -1,  0, 1]
-//                                        [ 0,  0, -1, 0]],
-//
-// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-//                                       [-1,  0,  1, 0]
-//                                       [-2, -1,  0, 1]
-//                                       [ 0, -2, -1, 0]]
-// ```
-//
-// Useful special cases:
-//
-// ```
-//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
-//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
-//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
-// ```
-//
-// Arguments:
-//	input: Rank `k` tensor.
-//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
-// lower triangle.
-//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
-// entire upper triangle.
-//
-// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
-func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixBandPart",
-		Input: []tf.Input{
-			input, num_lower, num_upper,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DecodeRawAttr is an optional argument to DecodeRaw.
-type DecodeRawAttr func(optionalAttr)
-
-// DecodeRawLittleEndian sets the optional little_endian attribute to value.
-//
-// value: Whether the input `bytes` are in little-endian order.
-// Ignored for `out_type` values that are stored in a single byte like
-// `uint8`.
-// If not specified, defaults to true
-func DecodeRawLittleEndian(value bool) DecodeRawAttr {
-	return func(m optionalAttr) {
-		m["little_endian"] = value
-	}
-}
-
-// Reinterpret the bytes of a string as a vector of numbers.
-//
-// Arguments:
-//	bytes: All the elements must have the same length.
-//
-//
-// Returns A Tensor with one more dimension than the input `bytes`.  The
-// added dimension will have size equal to the length of the elements
-// of `bytes` divided by the number of bytes to represent `out_type`.
-func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"out_type": out_type}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeRaw",
-		Input: []tf.Input{
-			bytes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
-type QueueDequeueV2Attr func(optionalAttr)
-
-// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue is empty, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
-}
-
-// Dequeues a tuple of one or more tensors from the given queue.
+// Dequeues a tuple of one or more tensors from the given queue.
 //
 // This operation has k outputs, where k is the number of components
 // in the tuples stored in the given queue, and output i is the ith
@@ -9514,6 +9582,61 @@ func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional
 	return op.Output(0)
 }
 
+// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
+type ResourceApplyFtrlV2Attr func(optionalAttr)
+
+// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the Ftrl-proximal scheme.
+//
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
+//
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyFtrlV2",
+		Input: []tf.Input{
+			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // SkipgramAttr is an optional argument to Skipgram.
 type SkipgramAttr func(optionalAttr)
 
@@ -9744,6 +9867,27 @@ func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf
 	return op.Output(0)
 }
 
+// Computes fingerprints of the input strings.
+//
+// Arguments:
+//	input: vector of strings to compute fingerprints on.
+//
+// Returns a (N,2) shaped matrix where N is the number of elements in the input
+// vector. Each row contains the low and high parts of the fingerprint.
+func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SdcaFprint",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // SparseMatMulAttr is an optional argument to SparseMatMul.
 type SparseMatMulAttr func(optionalAttr)
 
@@ -10482,288 +10626,296 @@ func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values
 	return op.Output(0)
 }
 
-// DecodeBmpAttr is an optional argument to DecodeBmp.
-type DecodeBmpAttr func(optionalAttr)
-
-// DecodeBmpChannels sets the optional channels attribute to value.
-// If not specified, defaults to 0
-func DecodeBmpChannels(value int64) DecodeBmpAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// Decode the first frame of a BMP-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
+// Constructs a tensor by tiling a given tensor.
 //
-// *   0: Use the number of channels in the BMP-encoded image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
+// This operation creates a new tensor by replicating `input` `multiples` times.
+// The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
+// and the values of `input` are replicated `multiples[i]` times along the 'i'th
+// dimension. For example, tiling `[a b c d]` by `[2]` produces
+// `[a b c d a b c d]`.
 //
 // Arguments:
-//	contents: 0-D.  The BMP-encoded image.
-//
-// Returns 3-D with shape `[height, width, channels]`. RGB order
-func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (image tf.Output) {
+//	input: 1-D or higher.
+//	multiples: 1-D. Length must be the same as the number of dimensions in `input`
+func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DecodeBmp",
+		Type: "Tile",
 		Input: []tf.Input{
-			contents,
+			input, multiples,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softmax activations.
-//
-// For each batch `i` and class `j` we have
+// Returns the element-wise min of two SparseTensors.
 //
-//     softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
 //
 // Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
 //
-// Returns Same shape as `logits`.
-func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Softmax",
+		Type: "SparseSparseMinimum",
 		Input: []tf.Input{
-			logits,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
-type RandomShuffleQueueV2Attr func(optionalAttr)
-
-// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
+// Computes the gradient of the sigmoid of `x` wrt its input.
 //
-// REQUIRES: len(value) >= 0
-func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
+// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
+// `dy` is the corresponding input gradient.
+func SigmoidGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
-//
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+	opspec := tf.OpSpec{
+		Type: "SigmoidGrad",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
+// Delete the stack from its resource container.
 //
-// value: Dequeue will block unless there would be this
-// many elements after the dequeue or the queue is closed. This
-// ensures a minimum level of mixing of elements.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["min_after_dequeue"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
+// Arguments:
+//	handle: The handle to a stack.
 //
-// value: If either seed or seed2 is set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// Returns the created operation.
+func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+	opspec := tf.OpSpec{
+		Type: "StackCloseV2",
+		Input: []tf.Input{
+			handle,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// RandomShuffleQueueV2Container sets the optional container attribute to value.
+// Generate a sharded filename. The filename is printf formatted as
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
+//    %s-%05d-of-%05d, basename, shard, num_shards.
+func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+	opspec := tf.OpSpec{
+		Type: "ShardedFilename",
+		Input: []tf.Input{
+			basename, shard, num_shards,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// A queue that randomizes the order of elements.
+// Subtracts a value from the current value of a variable.
+//
+// Any ReadVariableOp which depends directly or indirectly on this assign is
+// guaranteed to see the incremented value or a subsequent newer one.
+//
+// Outputs the incremented value, which can be used to totally order the
+// increments to this variable.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
 //
-// Returns The handle to the queue.
-func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
+// Returns the created operation.
+func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffleQueueV2",
+		Type: "AssignSubVariableOp",
+		Input: []tf.Input{
+			resource, value,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
 
-		Attrs: attrs,
+// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
+type SparseReduceMaxAttr func(optionalAttr)
+
+// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with scalar values.
+// Computes the max of elements across dimensions of a SparseTensor.
 //
-// The input `tags` and `values` must have the same shape.  The generated summary
-// has a summary value for each tag-value pair in `tags` and `values`.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	tags: Tags for the summary.
-//	values: Same shape as `tags.  Values for the summary.
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
 //
-// Returns Scalar.  Serialized `Summary` protocol buffer.
-func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ScalarSummary",
+		Type: "SparseReduceMax",
 		Input: []tf.Input{
-			tags, values,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Constructs a tensor by tiling a given tensor.
+// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
+type Conv3DBackpropFilterV2Attr func(optionalAttr)
+
+// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
 //
-// This operation creates a new tensor by replicating `input` `multiples` times.
-// The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
-// and the values of `input` are replicated `multiples[i]` times along the 'i'th
-// dimension. For example, tiling `[a b c d]` by `[2]` produces
-// `[a b c d a b c d]`.
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
 // Arguments:
-//	input: 1-D or higher.
-//	multiples: 1-D. Length must be the same as the number of dimensions in `input`
-func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 5-D
+// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
+// tensor.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Tile",
+		Type: "Conv3DBackpropFilterV2",
 		Input: []tf.Input{
-			input, multiples,
+			input, filter_sizes, out_backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the element-wise min of two SparseTensors.
+// Execute a sub graph on a remote processor.
 //
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+// The graph specifications(such as graph itself, input tensors and output names)
+// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
+// as serialized_remote_fused_graph_execute_info.
+// The specifications will be passed to a dedicated registered
+// remote fused graph executor.  The executor will send the graph specifications
+// to a remote processor and execute that graph.  The execution results
+// will be passed to consumer nodes as outputs of this node.
 //
 // Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//	inputs: Arbitrary number of tensors with arbitrary data types
 //
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
+// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+//
+// Returns Arbitrary number of tensors with arbitrary data types
+func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMinimum",
+		Type: "RemoteFusedGraphExecute",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+			tf.OutputList(inputs),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Computes the gradient of the sigmoid of `x` wrt its input.
-//
-// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
-// `dy` is the corresponding input gradient.
-func SigmoidGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "SigmoidGrad",
-		Input: []tf.Input{
-			x, y,
-		},
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("RemoteFusedGraphExecute", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return outputs
 }
 
-// Generate a sharded filename. The filename is printf formatted as
+// Computes numerical negative value element-wise.
 //
-//    %s-%05d-of-%05d, basename, shard, num_shards.
-func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
+// I.e., \\(y = -x\\).
+func Neg(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ShardedFilename",
+		Type: "Neg",
 		Input: []tf.Input{
-			basename, shard, num_shards,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -10838,35 +10990,153 @@ func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_value
 	opspec := tf.OpSpec{
 		Type: "SparseToSparseSetOperation",
 		Input: []tf.Input{
-			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Elementwise computes the bitwise OR of `x` and `y`.
+//
+// The result will have those bits set, that are set in `x`, `y` or both. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseOr",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
+//
+// This Op does not require `a_indices` be sorted in standard lexicographic order.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+//	b: `ndims`-D Tensor.  With shape `a_shape`.
+func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseTensorDenseAdd",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Writes contents to the file at input filename. Creates file and recursively
+//
+// creates directory if not existing.
+//
+// Arguments:
+//	filename: scalar. The name of the file to which we write the contents.
+//	contents: scalar. The content to be written to the output file.
+//
+// Returns the created operation.
+func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "WriteFile",
+		Input: []tf.Input{
+			filename, contents,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes the Cholesky decomposition of one or more square matrices.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices.
+//
+// The input has to be symmetric and positive definite. Only the lower-triangular
+// part of the input will be used for this operation. The upper-triangular part
+// will not be read.
+//
+// The output is a tensor of the same shape as the input
+// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
+//
+// **Note**: The gradient computation on GPU is faster for large matrices but
+// not for large batch dimensions when the submatrices are small. In this
+// case it might be faster to use the CPU.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Cholesky",
+		Input: []tf.Input{
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
+// Slice a `SparseTensor` based on the `start` and `size`.
 //
-// This Op does not require `a_indices` be sorted in standard lexicographic order.
+// For example, if the input is
+//
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
+//
+// Graphically the output tensors are:
+//
+//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-//	b: `ndims`-D Tensor.  With shape `a_shape`.
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+//	start: 1-D. tensor represents the start of the slice.
+//	size: 1-D. tensor represents the size of the slice.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseAdd",
+		Type: "SparseSlice",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			indices, values, shape, start, size,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
 // ListDiffAttr is an optional argument to ListDiff.
@@ -11195,54 +11465,6 @@ func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
-type IdentityReaderV2Attr func(optionalAttr)
-
-// IdentityReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A Reader that outputs the queued work as both the key and value.
-//
-// To use, enqueue strings in a Queue.  ReaderRead will take the front
-// work string and output (work, work).
-//
-// Returns The handle to reference the Reader.
-func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "IdentityReaderV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Performs a padding as a preprocess during a convolution.
 //
 // Similar to FusedResizeAndPadConv2d, this op allows for an optimized
@@ -11283,119 +11505,213 @@ func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf
 	return op.Output(0)
 }
 
-// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
+// Returns immutable tensor from memory region.
 //
-// For an explanation see "Differentiation of the Cholesky algorithm" by
-// Iain Murray http://arxiv.org/abs/1602.07527.
+// The current implementation memmaps the tensor from a file.
 //
 // Arguments:
-//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
-//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
+	opspec := tf.OpSpec{
+		Type: "ImmutableConst",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
 //
-// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
-func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
+// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
+// `N` is the minibatch size and the rows correspond to packed outputs of
+// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
+// must all match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension).
+//
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+//
+// Arguments:
+//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
+// Must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "CholeskyGrad",
+		Type: "DeserializeManySparse",
 		Input: []tf.Input{
-			l, grad,
+			serialized_sparse,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns immutable tensor from memory region.
+// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
+type SparseTensorDenseMatMulAttr func(optionalAttr)
+
+// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
 //
-// The current implementation memmaps the tensor from a file.
+// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
+// is transpose(conj(A)).  Otherwise it's transpose(A).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_a"] = value
+	}
+}
+
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+//
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_b"] = value
+	}
+}
+
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+//
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
+//
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
 //
 // Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
-
+		Type: "SparseTensorDenseMatMul",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
-//
-// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
-// `N` is the minibatch size and the rows correspond to packed outputs of
-// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
-// must all match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-//
-// then the final deserialized `SparseTensor` will be:
+// Encode audio data using the WAV file format.
+//
+// This operation will generate a string suitable to be saved out to create a .wav
+// audio file. It will be encoded in the 16-bit PCM format. It takes in float
+// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
+// that range.
+//
+// `audio` is a 2-D float Tensor of shape `[length, channels]`.
+// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
+//
+// Arguments:
+//	audio: 2-D with shape `[length, channels]`.
+//	sample_rate: Scalar containing the sample frequency.
+//
+// Returns 0-D. WAV-encoded file contents.
+func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "EncodeWav",
+		Input: []tf.Input{
+			audio, sample_rate,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// The gradient operator for the SparseAdd op.
 //
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
+// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
+// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
+// values of A and B.
 //
 // Arguments:
-//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
-// Must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
+// the non-empty values of the sum.
+//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
+//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
+//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
+// `[nnz(sum), ndims]`.
+//
+// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
+// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
+// non-empty values of B.
+func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "DeserializeManySparse",
+		Type: "SparseAddGrad",
 		Input: []tf.Input{
-			serialized_sparse,
+			backprop_val_grad, a_indices, b_indices, sum_indices,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
 // Batch normalization.
@@ -11436,48 +11752,6 @@ func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v
 	return op.Output(0)
 }
 
-// Generate a glob pattern matching all sharded file names.
-func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ShardedFilespec",
-		Input: []tf.Input{
-			basename, num_shards,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Saves the input tensors to disk.
-//
-// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
-// is written to `filename` with name `tensor_names[i]`.
-//
-// See also `SaveSlices`.
-//
-// Arguments:
-//	filename: Must have a single element. The name of the file to which we write
-// the tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	data: `N` tensors to save.
-//
-// Returns the created operation.
-func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Save",
-		Input: []tf.Input{
-			filename, tensor_names, tf.OutputList(data),
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // L2 Loss.
 //
 // Computes half the L2 norm of a tensor without the `sqrt`:
@@ -11665,96 +11939,299 @@ func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["memory_limit"] = value
 	}
-}
-
-// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+}
+
+// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns the (key, value) element with the smallest
+//
+// key from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OrderedMapUnstageNoKey",
+		Input: []tf.Input{
+			indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapUnstageNoKey", err)
+		return
+	}
+	return key, values
+}
+
+// MaxPoolAttr is an optional argument to MaxPool.
+type MaxPoolAttr func(optionalAttr)
+
+// MaxPoolDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolDataFormat(value string) MaxPoolAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs max pooling on the input.
+//
+// Arguments:
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The max pooled output tensor.
+func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPool",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Merges summaries.
+//
+// This op creates a
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// protocol buffer that contains the union of all the values in the input
+// summaries.
+//
+// When the Op is run, it reports an `InvalidArgument` error if multiple values
+// in the summaries to merge use the same tag.
+//
+// Arguments:
+//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
+// buffers.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MergeSummary",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
+//
+// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices, with the same constraints as the single matrix
+// SelfAdjointEig.
+//
+// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
+// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M+1, M]`.
+func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SelfAdjointEig",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
+// QuantizeV2Attr is an optional argument to QuantizeV2.
+type QuantizeV2Attr func(optionalAttr)
+
+// QuantizeV2Mode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func QuantizeV2Mode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["mode"] = value
 	}
 }
 
-// Op removes and returns the (key, value) element with the smallest
+// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
 //
-// key from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.
+//
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
+//
+// ```
+// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+// if T == qint8, out[i] -= (range(T) + 1) / 2.0
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+//
+// *MIN_COMBINED Mode Example*
+//
+// Assume the input is type float and has a possible range of [0.0, 6.0] and the
+// output type is quint8 ([0, 255]). The min_range and max_range values should be
+// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+// value of the input by 255/6 and cast to quint8.
+//
+// If the output type was qint8 ([-128, 127]), the operation will additionally
+// subtract each value by 128 prior to casting, so that the range of values aligns
+// with the range of qint8.
+//
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```
+// number_of_steps = 1 << (# of bits in T)
+// range_adjust = number_of_steps / (number_of_steps - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = number_of_steps / range
+// quantized = round(input * range_scale) - round(range_min * range_scale) +
+//   numeric_limits<T>::min()
+// quantized = max(quantized, numeric_limits<T>::min())
+// quantized = min(quantized, numeric_limits<T>::max())
+// ```
+//
+// The biggest difference between this and MIN_COMBINED is that the minimum range
+// is rounded first, before it's subtracted from the rounded value. With
+// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+// and dequantizing will introduce a larger and larger error.
+//
+// One thing to watch out for is that the operator may choose to adjust the
+// requested minimum and maximum values slightly during the quantization process,
+// so you should always use the output ports as the range for further calculations.
+// For example, if the requested minimum and maximum values are close to equal,
+// they will be separated by a small epsilon value to prevent ill-formed quantized
+// buffers from being created. Otherwise, you can end up with buffers where all the
+// quantized values map to the same float value, which causes problems for
+// operations that have to perform further calculations on them.
+//
+// Arguments:
+//
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+//
+//
+// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
+func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapUnstageNoKey",
+		Type: "QuantizeV2",
 		Input: []tf.Input{
-			indices,
+			input, min_range, max_range,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapUnstageNoKey", err)
-		return
-	}
-	return key, values
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MaxPoolAttr is an optional argument to MaxPool.
-type MaxPoolAttr func(optionalAttr)
+// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
+type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
 
-// MaxPoolDataFormat sets the optional data_format attribute to value.
+// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
 // value: Specify the data format of the input and output data. With the
 // default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
+//     [batch, height, width, channels].
 // Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
+//     [batch, channels, height, width].
 // If not specified, defaults to "NHWC"
-func MaxPoolDataFormat(value string) MaxPoolAttr {
+func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Performs max pooling on the input.
+// Computes the gradients of depthwise convolution with respect to the filter.
 //
 // Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
+//	input: 4-D with shape based on `data_format`.  For example, if
+// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
+// in_width, in_channels]` tensor.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
 //	padding: The type of padding algorithm to use.
 //
-// Returns The max pooled output tensor.
-func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool",
+		Type: "DepthwiseConv2dNativeBackpropFilter",
 		Input: []tf.Input{
-			input,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -11762,125 +12239,151 @@ func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padd
 	return op.Output(0)
 }
 
-// Merges summaries.
-//
-// This op creates a
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// protocol buffer that contains the union of all the values in the input
-// summaries.
-//
-// When the Op is run, it reports an `InvalidArgument` error if multiple values
-// in the summaries to merge use the same tag.
-//
-// Arguments:
-//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
-// buffers.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
+// Generate a glob pattern matching all sharded file names.
+func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MergeSummary",
+		Type: "ShardedFilespec",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			basename, num_shards,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Encode audio data using the WAV file format.
+// Saves the input tensors to disk.
 //
-// This operation will generate a string suitable to be saved out to create a .wav
-// audio file. It will be encoded in the 16-bit PCM format. It takes in float
-// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
-// that range.
+// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
+// is written to `filename` with name `tensor_names[i]`.
 //
-// `audio` is a 2-D float Tensor of shape `[length, channels]`.
-// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
+// See also `SaveSlices`.
 //
 // Arguments:
-//	audio: 2-D with shape `[length, channels]`.
-//	sample_rate: Scalar containing the sample frequency.
+//	filename: Must have a single element. The name of the file to which we write
+// the tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	data: `N` tensors to save.
 //
-// Returns 0-D. WAV-encoded file contents.
-func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
+// Returns the created operation.
+func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeWav",
+		Type: "Save",
 		Input: []tf.Input{
-			audio, sample_rate,
+			filename, tensor_names, tf.OutputList(data),
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Shuffle dimensions of x according to a permutation.
+//
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Transpose",
+		Input: []tf.Input{
+			x, perm,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// The gradient operator for the SparseAdd op.
-//
-// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
-// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
-// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
-// values of A and B.
-//
-// Arguments:
-//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
-// the non-empty values of the sum.
-//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
-//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
-//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
-// `[nnz(sum), ndims]`.
-//
-// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
-// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
-// non-empty values of B.
-func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
+// Reads and outputs the entire contents of the input filename.
+func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseAddGrad",
+		Type: "ReadFile",
 		Input: []tf.Input{
-			backprop_val_grad, a_indices, b_indices, sum_indices,
+			filename,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
+type AddManySparseToTensorsMapAttr func(optionalAttr)
+
+// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
+//
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
+// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// The hash function is deterministic on the content of the string within the
-// process. The hash function is a keyed hash function, where attribute `key`
-// defines the key of the hash function. `key` is an array of 2 elements.
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
 //
-// A strong hash is important when inputs may be malicious, e.g. URLs with
-// additional components. Adversaries could try to make their inputs hash to the
-// same bucket for a denial-of-service attack or to skew the results. A strong
-// hash prevents this by making it difficult, if not infeasible, to compute inputs
-// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
-// time than `tf.string_to_hash_bucket_fast`.
+// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`, where
+//
+// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
+//
+// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
+// having a first `sparse_indices` column taking values between `[0, N)`, where
+// the minibatch size `N == sparse_shape[0]`.
+//
+// The input `SparseTensor` must have rank `R` greater than 1, and the first
+// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The stored
+// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
+// will have rank `R-1`.
+//
+// The `SparseTensor` values can then be read out as part of a minibatch by passing
+// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddManySparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
-//	key: The key for the keyed hash function passed as a list of two uint64
-// elements.
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+// The minibatch size `N == sparse_shape[0]`.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
+// Returns 1-D.  The handles of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.  Shape: `[N]`.
+func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketStrong",
+		Type: "AddManySparseToTensorsMap",
 		Input: []tf.Input{
-			input,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -11888,47 +12391,65 @@ func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64,
 	return op.Output(0)
 }
 
-// Computes numerical negative value element-wise.
+// Creates a dataset that emits the outputs of `input_dataset` `count` times.
 //
-// I.e., \\(y = -x\\).
-func Neg(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//
+//	count: A scalar representing the number of times that `input_dataset` should
+// be repeated. A value of `-1` indicates that it should be repeated infinitely.
+//
+//
+func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Neg",
+		Type: "RepeatDataset",
 		Input: []tf.Input{
-			x,
+			input_dataset, count,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
-type DestroyResourceOpAttr func(optionalAttr)
+// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
+type SparseReduceMaxSparseAttr func(optionalAttr)
 
-// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
+// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
 //
-// value: whether to ignore the error when the resource
-// doesn't exist.
-// If not specified, defaults to true
-func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
 	return func(m optionalAttr) {
-		m["ignore_lookup_error"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Deletes the resource specified by the handle.
+// Computes the max of elements across dimensions of a SparseTensor.
 //
-// All subsequent operations using the resource will result in a NotFound
-// error status.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
+// SparseTensor.
 //
-// Arguments:
-//	resource: handle to the resource to delete.
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-// Returns the created operation.
-func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11937,48 +12458,65 @@ func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyReso
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DestroyResourceOp",
+		Type: "SparseReduceMaxSparse",
 		Input: []tf.Input{
-			resource,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
-//
-// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices, with the same constraints as the single matrix
-// SelfAdjointEig.
+// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
+type ResourceApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
 //
-// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
-// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the proximal adagrad scheme.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
-// Returns Shape is `[..., M+1, M]`.
-func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEig",
+		Type: "ResourceApplyAdagradDA",
 		Input: []tf.Input{
-			input,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
-type FractionalAvgPoolGradAttr func(optionalAttr)
+// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
+type FractionalMaxPoolGradAttr func(optionalAttr)
 
-// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
+// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
 //
 // value: When set to True, it means when pooling, the values at the boundary
 // of adjacent pooling cells are used by both cells. For example:
@@ -11988,33 +12526,28 @@ type FractionalAvgPoolGradAttr func(optionalAttr)
 // `value  20 5  16 3  7`
 //
 // If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
+// The result would be [20, 16] for fractional max pooling.
 // If not specified, defaults to false
-func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
+func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
 	return func(m optionalAttr) {
 		m["overlapping"] = value
 	}
 }
 
-// Computes gradient of the FractionalAvgPool function.
-//
-// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
-// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
-// out_backprop to those indices that form the same pooling cell. Therefore, we
-// just need to know the shape of original input tensor, instead of the whole
-// tensor.
+// Computes gradient of the FractionalMaxPool function.
 //
 // Arguments:
-//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
+//	orig_input: Original input for `fractional_max_pool`
+//	orig_output: Original output for `fractional_max_pool`
 //	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_avg_pool`.
+// w.r.t. the output of `fractional_max_pool`.
 //	row_pooling_sequence: row pooling sequence, form pooling region with
 // col_pooling_sequence.
 //	col_pooling_sequence: column pooling sequence, form pooling region with
 // row_pooling sequence.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
-func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
+func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12023,9 +12556,9 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPoolGrad",
+		Type: "FractionalMaxPoolGrad",
 		Input: []tf.Input{
-			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
 		Attrs: attrs,
 	}
@@ -12342,7 +12875,8 @@ func ReciprocalGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 //
 // Arguments:
 //	tensor: Up to 8-D.
-//	axis: 1-D. The indices of the dimensions to reverse.
+//	axis: 1-D. The indices of the dimensions to reverse. Must be in the range
+// `[-rank(tensor), rank(tensor))`.
 //
 // Returns The same shape as `tensor`.
 func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) {
@@ -13164,61 +13698,15 @@ func TensorSummaryDisplayName(value string) TensorSummaryAttr {
 	}
 }
 
-// Outputs a `Summary` protocol buffer with a tensor.
-//
-// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
-// a tag as well as a serialized SummaryMetadata proto string that contains
-// plugin-specific data. We will keep this op to maintain backwards compatibility.
-//
-// Arguments:
-//	tensor: A tensor to serialize.
-func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorSummary",
-		Input: []tf.Input{
-			tensor,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
-type ResourceApplyAdagradDAAttr func(optionalAttr)
-
-// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the proximal adagrad scheme.
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+// Outputs a `Summary` protocol buffer with a tensor.
 //
-// Returns the created operation.
-func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
+// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
+// a tag as well as a serialized SummaryMetadata proto string that contains
+// plugin-specific data. We will keep this op to maintain backwards compatibility.
+//
+// Arguments:
+//	tensor: A tensor to serialize.
+func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13227,61 +13715,53 @@ func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagradDA",
+		Type: "TensorSummary",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
+			tensor,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
-type FractionalMaxPoolGradAttr func(optionalAttr)
-
-// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
+// Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor.
 //
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
+// This operation folds the padded areas of `input` by `MirrorPad` according to the
+// `paddings` you specify. `paddings` must be the same as `paddings` argument
+// given to the corresponding `MirrorPad` op.
 //
-// `index  0  1  2  3  4`
+// The folded size of each dimension D of the output is:
 //
-// `value  20 5  16 3  7`
+// `input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
-	}
-}
-
-// Computes gradient of the FractionalMaxPool function.
+// For example:
+//
+// ```
+// # 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
+// # 'paddings' is [[0, 1]], [0, 1]].
+// # 'mode' is SYMMETRIC.
+// # rank of 't' is 2.
+// pad(t, paddings) ==> [[ 1,  5]
+//                       [11, 28]]
+// ```
 //
 // Arguments:
-//	orig_input: Original input for `fractional_max_pool`
-//	orig_output: Original output for `fractional_max_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_max_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+//	input: The input tensor to be folded.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	mode: The mode used in the `MirrorPad` op.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
-func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+// Returns The folded tensor.
+func MirrorPadGrad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"mode": mode}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPoolGrad",
+		Type: "MirrorPadGrad",
 		Input: []tf.Input{
-			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			input, paddings,
 		},
 		Attrs: attrs,
 	}
@@ -13289,193 +13769,244 @@ func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Ou
 	return op.Output(0)
 }
 
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
+// Computes the inverse permutation of a tensor.
 //
-// Arguments:
+// This operation computes the inverse of an index permutation. It takes a 1-D
+// integer tensor `x`, which represents the indices of a zero-based array, and
+// swaps each value with its index position. In other words, for an output tensor
+// `y` and an input tensor `x`, this operation computes the following:
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
+// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
 //
+// The values must include 0. There can be no duplicate values or negative values.
 //
-func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// For example:
+//
+// ```
+// # tensor `x` is [3, 4, 0, 2, 1]
+// invert_permutation(x) ==> [2, 4, 3, 0, 1]
+// ```
+//
+// Arguments:
+//	x: 1-D.
+//
+// Returns 1-D.
+func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "BatchDataset",
+		Type: "InvertPermutation",
 		Input: []tf.Input{
-			input_dataset, batch_size,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
-type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
-
-// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// Reverses specific dimensions of a tensor.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the centered RMSProp algorithm.
+// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
+// of `tensor`, this operation reverses each dimension i of `tensor` where
+// `dims[i]` is `True`.
 //
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
+// `tensor` can have up to 8 dimensions. The number of dimensions
+// of `tensor` must equal the number of elements in `dims`. In other words:
 //
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
+// `rank(tensor) = size(dims)`
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+// For example:
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// ```
+// # tensor 't' is [[[[ 0,  1,  2,  3],
+// #                  [ 4,  5,  6,  7],
+// #                  [ 8,  9, 10, 11]],
+// #                 [[12, 13, 14, 15],
+// #                  [16, 17, 18, 19],
+// #                  [20, 21, 22, 23]]]]
+// # tensor 't' shape is [1, 2, 3, 4]
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
+// # 'dims' is [False, False, False, True]
+// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
+//                         [ 7,  6,  5,  4],
+//                         [ 11, 10, 9, 8]],
+//                        [[15, 14, 13, 12],
+//                         [19, 18, 17, 16],
+//                         [23, 22, 21, 20]]]]
 //
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+// # 'dims' is [False, True, False, False]
+// reverse(t, dims) ==> [[[[12, 13, 14, 15],
+//                         [16, 17, 18, 19],
+//                         [20, 21, 22, 23]
+//                        [[ 0,  1,  2,  3],
+//                         [ 4,  5,  6,  7],
+//                         [ 8,  9, 10, 11]]]]
 //
-// Returns the created operation.
-func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
+// # 'dims' is [False, False, True, False]
+// reverse(t, dims) ==> [[[[8, 9, 10, 11],
+//                         [4, 5, 6, 7],
+//                         [0, 1, 2, 3]]
+//                        [[20, 21, 22, 23],
+//                         [16, 17, 18, 19],
+//                         [12, 13, 14, 15]]]]
+// ```
+//
+// Arguments:
+//	tensor: Up to 8-D.
+//	dims: 1-D. The dimensions to reverse.
+//
+// Returns The same shape as `tensor`.
+func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyCenteredRMSProp",
+		Type: "Reverse",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			tensor, dims,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the mean along segments of a tensor.
+// Fills empty rows in the input 2-D `SparseTensor` with a default value.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// The input `SparseTensor` is represented via the tuple of inputs
+// (`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
+// same `dense_shape` but with indices `output_indices` and values
+// `output_values`.
 //
-// Computes a tensor such that
-// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
-// over `j` such that `segment_ids[j] == i` and `N` is the total number of
-// values summed.
+// This op inserts a single entry for every row that doesn't have any values.
+// The index is created as `[row, 0, ..., 0]` and the inserted value
+// is `default_value`.
 //
-// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
+// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
-// </div>
+//     [0, 1]: a
+//     [0, 3]: b
+//     [2, 0]: c
+//     [3, 1]: d
 //
-// Arguments:
+// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+//     [0, 1]: a
+//     [0, 3]: b
+//     [1, 0]: default_value
+//     [2, 0]: c
+//     [3, 1]: d
+//     [4, 0]: default_value
+//
+// The output `SparseTensor` will be in row-major order and will have the
+// same shape as the input.
+//
+// This op also returns an indicator vector shaped `[dense_shape[0]]` such that
+//
+//     empty_row_indicator[i] = True iff row i was an empty row.
+//
+// And a reverse index map vector shaped `[indices.shape[0]]` that is used during
+// backpropagation,
+//
+//     reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
+//
+// Arguments:
+//	indices: 2-D. the indices of the sparse tensor.
+//	values: 1-D. the values of the sparse tensor.
+//	dense_shape: 1-D. the shape of the sparse tensor.
+//	default_value: 0-D. default value to insert into location `[row, 0, ..., 0]`
+//   for rows missing from the input sparse tensor.
+// output indices: 2-D. the indices of the filled sparse tensor.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns 1-D. the values of the filled sparse tensor.1-D. whether the dense row was missing in the
+// input sparse tensor.1-D. a map from the input indices to the output indices.
+func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMean",
+		Type: "SparseFillEmptyRows",
 		Input: []tf.Input{
-			data, segment_ids,
+			indices, values, dense_shape, default_value,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// CumprodAttr is an optional argument to Cumprod.
-type CumprodAttr func(optionalAttr)
+// Conv2DAttr is an optional argument to Conv2D.
+type Conv2DAttr func(optionalAttr)
 
-// CumprodExclusive sets the optional exclusive attribute to value.
-// If not specified, defaults to false
-func CumprodExclusive(value bool) CumprodAttr {
+// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
 	return func(m optionalAttr) {
-		m["exclusive"] = value
+		m["use_cudnn_on_gpu"] = value
 	}
 }
 
-// CumprodReverse sets the optional reverse attribute to value.
-// If not specified, defaults to false
-func CumprodReverse(value bool) CumprodAttr {
+// Conv2DDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func Conv2DDataFormat(value string) Conv2DAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["data_format"] = value
 	}
 }
 
-// Compute the cumulative product of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumprod, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
-// ```
+// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
 //
-// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
-// performed instead:
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`, this op
+// performs the following:
 //
-// ```python
-// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
-// ```
+// 1. Flattens the filter to a 2-D matrix with shape
+//    `[filter_height * filter_width * in_channels, output_channels]`.
+// 2. Extracts image patches from the input tensor to form a *virtual*
+//    tensor of shape `[batch, out_height, out_width,
+//    filter_height * filter_width * in_channels]`.
+// 3. For each patch, right-multiplies the filter matrix and the image patch
+//    vector.
 //
-// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
-// opposite direction:
+// In detail, with the default NHWC format,
 //
-// ```python
-// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
-// ```
+//     output[b, i, j, k] =
+//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
+//                         filter[di, dj, q, k]
 //
-// This is more efficient than using separate `tf.reverse` ops.
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
 //
-// The `reverse` and `exclusive` kwargs can also be combined:
+// Arguments:
+//	input: A 4-D tensor. The dimension order is interpreted according to the value
+// of `data_format`, see below for details.
+//	filter: A 4-D tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`
+//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
+// dimension of `input`. The dimension order is determined by the value of
+//   `data_format`, see below for details.
+//	padding: The type of padding algorithm to use.
 //
-// ```python
-// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
-// ```
-func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
+// Returns A 4-D tensor. The dimension order is determined by the value of
+// `data_format`, see below for details.
+func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumprod",
+		Type: "Conv2D",
 		Input: []tf.Input{
-			x, axis,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
@@ -13483,62 +14014,97 @@ func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr)
 	return op.Output(0)
 }
 
-// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
-type OrderedMapPeekAttr func(optionalAttr)
+// StringJoinAttr is an optional argument to StringJoin.
+type StringJoinAttr func(optionalAttr)
 
-// OrderedMapPeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// StringJoinSeparator sets the optional separator attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
+// value: string, an optional join separator.
+// If not specified, defaults to ""
+func StringJoinSeparator(value string) StringJoinAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["separator"] = value
 	}
 }
 
-// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Joins the strings in the given list of string tensors into one tensor;
 //
-// REQUIRES: value >= 0
-func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// with the given separator (default is an empty separator).
+//
+// Arguments:
+//	inputs: A list of string tensors.  The tensors must all have the same shape,
+// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
+// of non-scalar inputs.
+func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// OrderedMapPeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
-}
-
-// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+	opspec := tf.OpSpec{
+		Type: "StringJoin",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Op peeks at the values at the specified key.  If the
+// Transforms a vector of brain.Example protos (as strings) into typed tensors.
 //
-// underlying container does not contain this key
-// this op will block until it does.   This Op is optimized for
-// performance.
-func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
+// Arguments:
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	names: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) names for the
+// corresponding serialized protos.  These are purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no names are available.
+// If non-empty, this vector must be the same length as "serialized".
+//	sparse_keys: A list of Nsparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples' features associated with dense values.
+//	dense_defaults: A list of Ndense Tensors (some may be empty).
+// dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	sparse_types: A list of Nsparse types; the data types of data in each Feature
+// given in sparse_keys.
+// Currently the ParseExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
+// given in dense_keys.
+// The number of elements in the Feature corresponding to dense_key[j]
+// must always equal dense_shapes[j].NumEntries().
+// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
+// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
+// The dense outputs are just the inputs row-stacked by batch.
+// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
+// the shape of the output Tensor dense_values[j] will be
+// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
+// of elements of length D1 * .... * DN, across all minibatch entries
+// in the input.  Any minibatch entry with less than M blocks of elements of
+// length D1 * ... * DN will be padded with the corresponding default_value
+// scalar element along the second dimension.
+func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapPeek",
+		Type: "ParseExample",
 		Input: []tf.Input{
-			key, indices,
+			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
 		},
 		Attrs: attrs,
 	}
@@ -13548,363 +14114,366 @@ func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.
 	}
 	var idx int
 	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapPeek", err)
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseExample", err)
 		return
 	}
-	return values
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
-// Adds two `SparseTensor` objects to produce another `SparseTensor`.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in standard
-// lexicographic order.  If this is not the case, before this step run
-// `SparseReorder` to restore index ordering.
-//
-// By default, if two values sum to zero at some index, the output `SparseTensor`
-// would still include that particular location in its index, storing a zero in the
-// corresponding value slot.  To override this, callers can specify `thresh`,
-// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
-// corresponding value and index would then not be included.  In particular,
-// `thresh == 0` (default) means everything is kept and actual thresholding happens
-// only for a positive value.
-//
-// In the following shapes, `nnz` is the count after taking `thresh` into account.
+// Compute the pairwise cross product.
 //
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
-//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
-//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
-//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
-// pair takes space.
-func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
+// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
+// or any shape where the innermost dimension is 3. In the latter case, each pair
+// of corresponding 3-element vectors is cross-multiplied independently.
+//
+// Arguments:
+//	a: A tensor containing 3-element vectors.
+//	b: Another tensor, of same type and shape as `a`.
+//
+// Returns Pairwise cross product of the vectors in `a` and `b`.
+func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseAdd",
+		Type: "Cross",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
+			a, b,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor.
-//
-// This operation folds the padded areas of `input` by `MirrorPad` according to the
-// `paddings` you specify. `paddings` must be the same as `paddings` argument
-// given to the corresponding `MirrorPad` op.
-//
-// The folded size of each dimension D of the output is:
+// Inverse 2D real-valued fast Fourier transform.
 //
-// `input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`
+// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 2 dimensions of `input`.
 //
-// For example:
+// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
-// ```
-// # 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
-// # 'paddings' is [[0, 1]], [0, 1]].
-// # 'mode' is SYMMETRIC.
-// # rank of 't' is 2.
-// pad(t, paddings) ==> [[ 1,  5]
-//                       [11, 28]]
-// ```
+// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	input: The input tensor to be folded.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	mode: The mode used in the `MirrorPad` op.
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
 //
-// Returns The folded tensor.
-func MirrorPadGrad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
+// Returns A float32 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft2
+// @end_compatibility
+func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode}
 	opspec := tf.OpSpec{
-		Type: "MirrorPadGrad",
+		Type: "IRFFT2D",
 		Input: []tf.Input{
-			input, paddings,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the inverse permutation of a tensor.
-//
-// This operation computes the inverse of an index permutation. It takes a 1-D
-// integer tensor `x`, which represents the indices of a zero-based array, and
-// swaps each value with its index position. In other words, for an output tensor
-// `y` and an input tensor `x`, this operation computes the following:
-//
-// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
-//
-// The values must include 0. There can be no duplicate values or negative values.
-//
-// For example:
-//
-// ```
-// # tensor `x` is [3, 4, 0, 2, 1]
-// invert_permutation(x) ==> [2, 4, 3, 0, 1]
-// ```
+// Returns element-wise remainder of division. This emulates C semantics in that
 //
-// Arguments:
-//	x: 1-D.
+// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+// y + truncate_mod(x, y) = x`.
 //
-// Returns 1-D.
-func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "InvertPermutation",
+		Type: "TruncateMod",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reverses specific dimensions of a tensor.
-//
-// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
-// of `tensor`, this operation reverses each dimension i of `tensor` where
-// `dims[i]` is `True`.
-//
-// `tensor` can have up to 8 dimensions. The number of dimensions
-// of `tensor` must equal the number of elements in `dims`. In other words:
-//
-// `rank(tensor) = size(dims)`
-//
-// For example:
-//
-// ```
-// # tensor 't' is [[[[ 0,  1,  2,  3],
-// #                  [ 4,  5,  6,  7],
-// #                  [ 8,  9, 10, 11]],
-// #                 [[12, 13, 14, 15],
-// #                  [16, 17, 18, 19],
-// #                  [20, 21, 22, 23]]]]
-// # tensor 't' shape is [1, 2, 3, 4]
-//
-// # 'dims' is [False, False, False, True]
-// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-//                         [ 7,  6,  5,  4],
-//                         [ 11, 10, 9, 8]],
-//                        [[15, 14, 13, 12],
-//                         [19, 18, 17, 16],
-//                         [23, 22, 21, 20]]]]
+// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
+type ResourceApplyAdagradAttr func(optionalAttr)
+
+// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// # 'dims' is [False, True, False, False]
-// reverse(t, dims) ==> [[[[12, 13, 14, 15],
-//                         [16, 17, 18, 19],
-//                         [20, 21, 22, 23]
-//                        [[ 0,  1,  2,  3],
-//                         [ 4,  5,  6,  7],
-//                         [ 8,  9, 10, 11]]]]
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the adagrad scheme.
 //
-// # 'dims' is [False, False, True, False]
-// reverse(t, dims) ==> [[[[8, 9, 10, 11],
-//                         [4, 5, 6, 7],
-//                         [0, 1, 2, 3]]
-//                        [[20, 21, 22, 23],
-//                         [16, 17, 18, 19],
-//                         [12, 13, 14, 15]]]]
-// ```
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
-//	tensor: Up to 8-D.
-//	dims: 1-D. The dimensions to reverse.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns The same shape as `tensor`.
-func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Reverse",
+		Type: "ResourceApplyAdagrad",
 		Input: []tf.Input{
-			tensor, dims,
+			var_, accum, lr, grad,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Fills empty rows in the input 2-D `SparseTensor` with a default value.
-//
-// The input `SparseTensor` is represented via the tuple of inputs
-// (`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
-// same `dense_shape` but with indices `output_indices` and values
-// `output_values`.
-//
-// This op inserts a single entry for every row that doesn't have any values.
-// The index is created as `[row, 0, ..., 0]` and the inserted value
-// is `default_value`.
-//
-// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
-//
-//     [0, 1]: a
-//     [0, 3]: b
-//     [2, 0]: c
-//     [3, 1]: d
+// SparseReduceSumAttr is an optional argument to SparseReduceSum.
+type SparseReduceSumAttr func(optionalAttr)
+
+// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
 //
-// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the sum of elements across dimensions of a SparseTensor.
 //
-//     [0, 1]: a
-//     [0, 3]: b
-//     [1, 0]: default_value
-//     [2, 0]: c
-//     [3, 1]: d
-//     [4, 0]: default_value
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
 //
-// The output `SparseTensor` will be in row-major order and will have the
-// same shape as the input.
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-// This op also returns an indicator vector shaped `[dense_shape[0]]` such that
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
-//     empty_row_indicator[i] = True iff row i was an empty row.
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
 //
-// And a reverse index map vector shaped `[indices.shape[0]]` that is used during
-// backpropagation,
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseReduceSum",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape, reduction_axes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
+type MaxPool3DGradAttr func(optionalAttr)
+
+// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
 //
-//     reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of max pooling function.
 //
 // Arguments:
-//	indices: 2-D. the indices of the sparse tensor.
-//	values: 1-D. the values of the sparse tensor.
-//	dense_shape: 1-D. the shape of the sparse tensor.
-//	default_value: 0-D. default value to insert into location `[row, 0, ..., 0]`
-//   for rows missing from the input sparse tensor.
-// output indices: 2-D. the indices of the filled sparse tensor.
-//
-// Returns 1-D. the values of the filled sparse tensor.1-D. whether the dense row was missing in the
-// input sparse tensor.1-D. a map from the input indices to the output indices.
-func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) {
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRows",
+		Type: "MaxPool3DGrad",
 		Input: []tf.Input{
-			indices, values, dense_shape, default_value,
+			orig_input, orig_output, grad,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0)
 }
 
-// Conv2DAttr is an optional argument to Conv2D.
-type Conv2DAttr func(optionalAttr)
+// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
+type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
 
-// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
+// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
+		m["seed"] = value
 	}
 }
 
-// Conv2DDataFormat sets the optional data_format attribute to value.
+// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func Conv2DDataFormat(value string) Conv2DAttr {
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["seed2"] = value
 	}
 }
 
-// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
-//
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`, this op
-// performs the following:
-//
-// 1. Flattens the filter to a 2-D matrix with shape
-//    `[filter_height * filter_width * in_channels, output_channels]`.
-// 2. Extracts image patches from the input tensor to form a *virtual*
-//    tensor of shape `[batch, out_height, out_width,
-//    filter_height * filter_width * in_channels]`.
-// 3. For each patch, right-multiplies the filter matrix and the image patch
-//    vector.
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-// In detail, with the default NHWC format,
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-//     output[b, i, j, k] =
-//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
-//                         filter[di, dj, q, k]
+// For each batch, this op picks a single set of sampled candidate labels.
 //
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	input: A 4-D tensor. The dimension order is interpreted according to the value
-// of `data_format`, see below for details.
-//	filter: A 4-D tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`
-//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
-// dimension of `input`. The dimension order is determined by the value of
-//   `data_format`, see below for details.
-//	padding: The type of padding algorithm to use.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
-// Returns A 4-D tensor. The dimension order is determined by the value of
-// `data_format`, see below for details.
-func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2D",
+		Type: "ThreadUnsafeUnigramCandidateSampler",
 		Input: []tf.Input{
-			input, filter,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// StringJoinAttr is an optional argument to StringJoin.
-type StringJoinAttr func(optionalAttr)
+// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
+type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
 
-// StringJoinSeparator sets the optional separator attribute to value.
+// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: string, an optional join separator.
-// If not specified, defaults to ""
-func StringJoinSeparator(value string) StringJoinAttr {
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
 	return func(m optionalAttr) {
-		m["separator"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Joins the strings in the given list of string tensors into one tensor;
+// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
 //
-// with the given separator (default is an empty separator).
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// prox_v = var
+// prox_v -= lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
-//	inputs: A list of string tensors.  The tensors must all have the same shape,
-// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
-// of non-scalar inputs.
-func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13913,211 +14482,215 @@ func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (o
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringJoin",
+		Type: "ResourceSparseApplyProximalAdagrad",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			var_, accum, lr, l1, l2, grad, indices,
 		},
 		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
+}
+
+// Store the input tensor in the state of the current session.
+//
+// Arguments:
+//	value: The tensor to be stored.
+//
+// Returns The handle for the tensor stored in the session state, represented
+// as a string.
+func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandle",
+		Input: []tf.Input{
+			value,
+		},
+	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Transforms a vector of brain.Example protos (as strings) into typed tensors.
+// Decode web-safe base64-encoded strings.
+//
+// Input may or may not have padding at the end. See EncodeBase64 for padding.
+// Web-safe means that input must use - and _ instead of + and /.
 //
 // Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	names: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) names for the
-// corresponding serialized protos.  These are purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no names are available.
-// If non-empty, this vector must be the same length as "serialized".
-//	sparse_keys: A list of Nsparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples' features associated with dense values.
-//	dense_defaults: A list of Ndense Tensors (some may be empty).
-// dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	sparse_types: A list of Nsparse types; the data types of data in each Feature
-// given in sparse_keys.
-// Currently the ParseExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
-// given in dense_keys.
-// The number of elements in the Feature corresponding to dense_key[j]
-// must always equal dense_shapes[j].NumEntries().
-// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
-// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
-// The dense outputs are just the inputs row-stacked by batch.
-// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
-// the shape of the output Tensor dense_values[j] will be
-// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
-// of elements of length D1 * .... * DN, across all minibatch entries
-// in the input.  Any minibatch entry with less than M blocks of elements of
-// length D1 * ... * DN will be padded with the corresponding default_value
-// scalar element along the second dimension.
-func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+//	input: Base64 strings to decode.
+//
+// Returns Decoded strings.
+func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "ParseExample",
+		Type: "DecodeBase64",
 		Input: []tf.Input{
-			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes hyperbolic tangent of `x` element-wise.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "Tanh",
+		Input: []tf.Input{
+			x,
+		},
 	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Compute the pairwise cross product.
+// Component-wise multiplies a SparseTensor by a dense Tensor.
 //
-// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
-// or any shape where the innermost dimension is 3. In the latter case, each pair
-// of corresponding 3-element vectors is cross-multiplied independently.
+// The output locations corresponding to the implicitly zero elements in the sparse
+// tensor will be zero (i.e., will not take up storage space), regardless of the
+// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
+//
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
 // Arguments:
-//	a: A tensor containing 3-element vectors.
-//	b: Another tensor, of same type and shape as `a`.
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-// Returns Pairwise cross product of the vectors in `a` and `b`.
-func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cross",
+		Type: "SparseDenseCwiseMul",
 		Input: []tf.Input{
-			a, b,
+			sp_indices, sp_values, sp_shape, dense,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse 2D real-valued fast Fourier transform.
-//
-// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 2 dimensions of `input`.
-//
-// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
-//
-// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 2D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft2
-// @end_compatibility
-func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Creates a dataset that emits `components` as a tuple of tensors once.
+func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "IRFFT2D",
+		Type: "TensorDataset",
 		Input: []tf.Input{
-			input, fft_length,
+			tf.OutputList(components),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
+// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
+type NonMaxSuppressionAttr func(optionalAttr)
+
+// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
 //
-// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
-// y + truncate_mod(x, y) = x`.
+// value: A float representing the threshold for deciding whether boxes
+// overlap too much with respect to IOU.
+// If not specified, defaults to 0.5
+func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
+	return func(m optionalAttr) {
+		m["iou_threshold"] = value
+	}
+}
+
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
+//
+// Arguments:
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TruncateMod",
+		Type: "NonMaxSuppression",
 		Input: []tf.Input{
-			x, y,
+			boxes, scores, max_output_size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
-type ResourceApplyAdagradAttr func(optionalAttr)
+// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
+type ResourceApplyAdadeltaAttr func(optionalAttr)
 
-// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If True, updating of the var, accum and update_accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
+func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the adagrad scheme.
+// Update '*var' according to the adadelta scheme.
 //
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// accum = rho() * accum + (1 - rho()) * grad.square();
+// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
+// update_accum = rho() * update_accum + (1 - rho()) * update.square();
+// var -= update;
 //
 // Arguments:
 //	var_: Should be from a Variable().
 //	accum: Should be from a Variable().
+//	accum_update: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
 //	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
+func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14126,223 +14699,197 @@ func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.O
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagrad",
+		Type: "ResourceApplyAdadelta",
 		Input: []tf.Input{
-			var_, accum, lr, grad,
+			var_, accum, accum_update, lr, rho, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// SparseReduceSumAttr is an optional argument to SparseReduceSum.
-type SparseReduceSumAttr func(optionalAttr)
+// StageSizeAttr is an optional argument to StageSize.
+type StageSizeAttr func(optionalAttr)
 
-// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
+// StageSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+// REQUIRES: value >= 0
+func StageSizeCapacity(value int64) StageSizeAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["capacity"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
+// REQUIRES: value >= 0
+func StageSizeMemoryLimit(value int64) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// StageSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageSizeContainer(value string) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StageSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StageSizeSharedName(value string) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of elements in the underlying container.
+func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseReduceSum",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
-		},
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StageSize",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
-type MaxPool3DGradAttr func(optionalAttr)
+// ApplyDelayCompensatedGradientDescentAttr is an optional argument to ApplyDelayCompensatedGradientDescent.
+type ApplyDelayCompensatedGradientDescentAttr func(optionalAttr)
 
-// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+// ApplyDelayCompensatedGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
+// value: If `True`, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ApplyDelayCompensatedGradientDescentUseLocking(value bool) ApplyDelayCompensatedGradientDescentAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes gradients of max pooling function.
+// var -= alpha * (delta + lambda * delta * (var - shadow))
+//
+// Update '*shadow' by changing it to the new value of 'var'
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	delta: The change.
+//	lambda: The variance parameter.
+//	shadow: Same as "var".
+//
+// Returns the created operation.
+func ApplyDelayCompensatedGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, lambda tf.Output, shadow tf.Output, optional ...ApplyDelayCompensatedGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGrad",
+		Type: "ApplyDelayCompensatedGradientDescent",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			var_, alpha, delta, lambda, shadow,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
-type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
-
-// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// SizeAttr is an optional argument to Size.
+type SizeAttr func(optionalAttr)
 
-// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+// SizeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func SizeOutType(value tf.DataType) SizeAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["out_type"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
+// Returns the size of a tensor.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// This operation returns an integer representing the number of elements in
+// `input`.
 //
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+// For example:
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// ```
+// # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
+// size(t) ==> 12
+// ```
+func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ThreadUnsafeUnigramCandidateSampler",
+		Type: "Size",
 		Input: []tf.Input{
-			true_classes,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
-type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
+type ResourceApplyRMSPropAttr func(optionalAttr)
 
-// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
+// Update '*var' according to the RMSProp algorithm.
 //
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// prox_v = var
-// prox_v -= lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
+func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14351,145 +14898,145 @@ func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.O
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalAdagrad",
+		Type: "ResourceApplyRMSProp",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad, indices,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Store the input tensor in the state of the current session.
+// Elementwise computes the bitwise XOR of `x` and `y`.
 //
-// Arguments:
-//	value: The tensor to be stored.
-//
-// Returns The handle for the tensor stored in the session state, represented
-// as a string.
-func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+// The result will have those bits set, that are different in `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionHandle",
+		Type: "BitwiseXor",
 		Input: []tf.Input{
-			value,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Decode web-safe base64-encoded strings.
+// Compute the polygamma function \\(\psi^{(n)}(x)\\).
 //
-// Input may or may not have padding at the end. See EncodeBase64 for padding.
-// Web-safe means that input must use - and _ instead of + and /.
+// The polygamma function is defined as:
 //
-// Arguments:
-//	input: Base64 strings to decode.
 //
-// Returns Decoded strings.
-func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeBase64",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes hyperbolic sine of x element-wise.
-func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sinh",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes hyperbolic cosine of x element-wise.
-func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
+// \\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+//
+// where \\(\psi(x)\\) is the digamma function.
+func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cosh",
+		Type: "Polygamma",
 		Input: []tf.Input{
-			x,
+			a, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
+//
+// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
+// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
+// input channel is processed independently of the others with its own structuring
+// function. The `output` tensor has shape
+// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
+// tensor depend on the `padding` algorithm. We currently only support the default
+// "NHWC" `data_format`.
+//
+// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
+// (for consistency with `conv2d`, we use unmirrored filters):
+//
+//     output[b, y, x, c] =
+//        max_{dy, dx} input[b,
+//                           strides[1] * y + rates[1] * dy,
+//                           strides[2] * x + rates[2] * dx,
+//                           c] +
+//                     filter[dy, dx, c]
+//
+// Max-pooling is a special case when the filter has size equal to the pooling
+// kernel size and contains all zeros.
+//
+// Note on duality: The dilation of `input` by the `filter` is equal to the
+// negation of the erosion of `-input` by the reflected `filter`.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: The input stride for atrous morphological dilation. Must be:
+// `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
+func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Tanh",
+		Type: "Dilation2D",
 		Input: []tf.Input{
-			x,
+			input, filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
-type AvgPool3DGradAttr func(optionalAttr)
+// EncodeBase64Attr is an optional argument to EncodeBase64.
+type EncodeBase64Attr func(optionalAttr)
 
-// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
+// EncodeBase64Pad sets the optional pad attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
+// value: Bool whether padding is applied at the ends.
+// If not specified, defaults to false
+func EncodeBase64Pad(value bool) EncodeBase64Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["pad"] = value
 	}
 }
 
-// Computes gradients of average pooling function.
+// Encode strings into web-safe base64 format.
+//
+// Refer to the following article for more information on base64 format:
+// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
+// end so that the encoded has length multiple of 4. See Padding section of the
+// link above.
+//
+// Web-safe means that the encoder uses - and _ instead of + and /.
 //
 // Arguments:
-//	orig_input_shape: The original input dimensions.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	input: Strings to be encoded.
 //
-// Returns The backprop for input.
-func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
+// Returns Input strings encoded in base64.
+func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool3DGrad",
+		Type: "EncodeBase64",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -14497,45 +15044,44 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi
 	return op.Output(0)
 }
 
-// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
-type TextLineReaderV2Attr func(optionalAttr)
-
-// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
-//
-// value: Number of lines to skip from the beginning of every file.
-// If not specified, defaults to 0
-func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["skip_header_lines"] = value
+// Output a fact about factorials.
+func Fact(scope *Scope) (fact tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// TextLineReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	opspec := tf.OpSpec{
+		Type: "Fact",
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
+// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
+type StatelessRandomUniformAttr func(optionalAttr)
+
+// StatelessRandomUniformDtype sets the optional dtype attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["dtype"] = value
 	}
 }
 
-// A Reader that outputs the lines of a file delimited by '\n'.
+// Outputs deterministic pseudorandom random values from a uniform distribution.
 //
-// Returns The handle to reference the Reader.
-func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns Random values with specified shape.
+func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14544,149 +15090,59 @@ func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_ha
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TextLineReaderV2",
-
+		Type: "StatelessRandomUniform",
+		Input: []tf.Input{
+			shape, seed,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
-type SampleDistortedBoundingBoxAttr func(optionalAttr)
-
-// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
-//
-// value: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
-// If not specified, defaults to 0.1
-func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["min_object_covered"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
-//
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
-	}
-}
+// PrintAttr is an optional argument to Print.
+type PrintAttr func(optionalAttr)
 
-// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+// PrintMessage sets the optional message attribute to value.
 //
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within in this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+// value: A string, prefix of the error message.
+// If not specified, defaults to ""
+func PrintMessage(value string) PrintAttr {
 	return func(m optionalAttr) {
-		m["area_range"] = value
+		m["message"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+// PrintFirstN sets the optional first_n attribute to value.
 //
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+// value: Only log `first_n` number of times. -1 disables logging.
+// If not specified, defaults to -1
+func PrintFirstN(value int64) PrintAttr {
 	return func(m optionalAttr) {
-		m["max_attempts"] = value
+		m["first_n"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+// PrintSummarize sets the optional summarize attribute to value.
 //
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
+// value: Only print this many entries of each tensor.
+// If not specified, defaults to 3
+func PrintSummarize(value int64) PrintAttr {
 	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
+		m["summarize"] = value
 	}
 }
 
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
-//
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
-//
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.image_summary('images_with_box', image_with_box)
-//
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
-//
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// Prints a list of tensors.
 //
-// Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
+// Passes `input` through to `output` and prints `data` when evaluating.
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// Arguments:
+//	input: The tensor passed to `output`
+//	data: A list of tensors to print out when op is evaluated.
+//
+// Returns = The unmodified `input` tensor
+func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14695,75 +15151,43 @@ func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_box
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBox",
+		Type: "Print",
 		Input: []tf.Input{
-			image_size, bounding_boxes,
+			input, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns the truth value of (x > y) element-wise.
-//
-// *NOTE*: `Greater` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Greater",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
-type ResourceSparseApplyRMSPropAttr func(optionalAttr)
+// ResizeAreaAttr is an optional argument to ResizeArea.
+type ResizeAreaAttr func(optionalAttr)
 
-// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
 // If not specified, defaults to false
-func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
+func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// Resize `images` to `size` using area interpolation.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// Input images can be of different types but output images are always float.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns the created operation.
-func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14772,153 +15196,168 @@ func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyRMSProp",
+		Type: "ResizeArea",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			images, size,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QuantizeV2Attr is an optional argument to QuantizeV2.
-type QuantizeV2Attr func(optionalAttr)
+// RealAttr is an optional argument to Real.
+type RealAttr func(optionalAttr)
 
-// QuantizeV2Mode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func QuantizeV2Mode(value string) QuantizeV2Attr {
+// RealTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func RealTout(value tf.DataType) RealAttr {
 	return func(m optionalAttr) {
-		m["mode"] = value
+		m["Tout"] = value
 	}
 }
 
-// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-// if T == qint8, out[i] -= (range(T) + 1) / 2.0
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// Assume the input is type float and has a possible range of [0.0, 6.0] and the
-// output type is quint8 ([0, 255]). The min_range and max_range values should be
-// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-// value of the input by 255/6 and cast to quint8.
+// Returns the real part of a complex number.
 //
-// If the output type was qint8 ([-128, 127]), the operation will additionally
-// subtract each value by 128 prior to casting, so that the range of values aligns
-// with the range of qint8.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the real part of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
+//  part returned by this operation and *b* is the imaginary part.
 //
-// If the mode is 'MIN_FIRST', then this approach is used:
+// For example:
 //
 // ```
-// number_of_steps = 1 << (# of bits in T)
-// range_adjust = number_of_steps / (number_of_steps - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = number_of_steps / range
-// quantized = round(input * range_scale) - round(range_min * range_scale) +
-//   numeric_limits<T>::min()
-// quantized = max(quantized, numeric_limits<T>::min())
-// quantized = min(quantized, numeric_limits<T>::max())
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.real(input) ==> [-2.25, 3.25]
 // ```
-//
-// The biggest difference between this and MIN_COMBINED is that the minimum range
-// is rounded first, before it's subtracted from the rounded value. With
-// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-// and dequantizing will introduce a larger and larger error.
-//
-// One thing to watch out for is that the operator may choose to adjust the
-// requested minimum and maximum values slightly during the quantization process,
-// so you should always use the output ports as the range for further calculations.
-// For example, if the requested minimum and maximum values are close to equal,
-// they will be separated by a small epsilon value to prevent ill-formed quantized
-// buffers from being created. Otherwise, you can end up with buffers where all the
-// quantized values map to the same float value, which causes problems for
-// operations that have to perform further calculations on them.
-//
-// Arguments:
-//
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-//
-//
-// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
-func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeV2",
+		Type: "Real",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
-type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
+// Creates a dataset that zips together `input_datasets`.
+func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ZipDataset",
+		Input: []tf.Input{
+			tf.OutputList(input_datasets),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
+// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
+type MutableDenseHashTableV2Attr func(optionalAttr)
+
+// MutableDenseHashTableV2Container sets the optional container attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["container"] = value
+	}
+}
+
+// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
+//
+// value: The shape of each value.
+// If not specified, defaults to <>
+func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
+	}
+}
+
+// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
+//
+// value: The initial number of hash table buckets. Must be a power
+// to 2.
+// If not specified, defaults to 131072
+func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["initial_num_buckets"] = value
+	}
+}
+
+// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
+//
+// value: The maximum ratio between number of entries and number of
+// buckets before growing the table. Must be between 0 and 1.
+// If not specified, defaults to 0.8
+func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["max_load_factor"] = value
 	}
 }
 
-// Computes the gradients of depthwise convolution with respect to the filter.
+// Creates an empty hash table that uses tensors as the backing store.
+//
+// It uses "open addressing" with quadratic reprobing to resolve
+// collisions.
+//
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	input: 4-D with shape based on `data_format`.  For example, if
-// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
-// in_width, in_channels]` tensor.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+//	empty_key: The key used to represent empty key buckets internally. Must not
+// be used in insert or lookup operations.
+//	value_dtype: Type of the table values.
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
+// Returns Handle to a table.
+func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropFilter",
+		Type: "MutableDenseHashTableV2",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			empty_key,
 		},
 		Attrs: attrs,
 	}
@@ -14926,97 +15365,66 @@ func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_s
 	return op.Output(0)
 }
 
-// Component-wise multiplies a SparseTensor by a dense Tensor.
-//
-// The output locations corresponding to the implicitly zero elements in the sparse
-// tensor will be zero (i.e., will not take up storage space), regardless of the
-// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
-//
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
-//
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+// LRNAttr is an optional argument to LRN.
+type LRNAttr func(optionalAttr)
+
+// LRNDepthRadius sets the optional depth_radius attribute to value.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseMul",
-		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
-		},
+// value: 0-D.  Half-width of the 1-D normalization window.
+// If not specified, defaults to 5
+func LRNDepthRadius(value int64) LRNAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that emits `components` as a tuple of tensors once.
-func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "TensorDataset",
-		Input: []tf.Input{
-			tf.OutputList(components),
-		},
-		Attrs: attrs,
+// LRNBias sets the optional bias attribute to value.
+//
+// value: An offset (usually positive to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNBias(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
-type NonMaxSuppressionAttr func(optionalAttr)
+// LRNAlpha sets the optional alpha attribute to value.
+//
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNAlpha(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
 
-// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
+// LRNBeta sets the optional beta attribute to value.
 //
-// value: A float representing the threshold for deciding whether boxes
-// overlap too much with respect to IOU.
+// value: An exponent.
 // If not specified, defaults to 0.5
-func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
+func LRNBeta(value float32) LRNAttr {
 	return func(m optionalAttr) {
-		m["iou_threshold"] = value
+		m["beta"] = value
 	}
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// Local Response Normalization.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
+// dimension), and each vector is normalized independently.  Within a given vector,
+// each component is divided by the weighted, squared sum of inputs within
+// `depth_radius`.  In detail,
 //
-// Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
+//     sqr_sum[a, b, c, d] =
+//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
+//     output = input / (bias + alpha * sqr_sum) ** beta
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
+// For details, see [Krizhevsky et al., ImageNet classification with deep
+// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+//
+// Arguments:
+//	input: 4-D.
+func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15025,9 +15433,9 @@ func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppression",
+		Type: "LRN",
 		Input: []tf.Input{
-			boxes, scores, max_output_size,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -15035,38 +15443,36 @@ func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_outp
 	return op.Output(0)
 }
 
-// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
-type ResourceApplyAdadeltaAttr func(optionalAttr)
+// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
+type ResourceSparseApplyAdagradAttr func(optionalAttr)
 
-// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, updating of the var, accum and update_accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
+func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the adadelta scheme.
+// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
 //
-// accum = rho() * accum + (1 - rho()) * grad.square();
-// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-// update_accum = rho() * update_accum + (1 - rho()) * update.square();
-// var -= update;
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
 //	var_: Should be from a Variable().
 //	accum: Should be from a Variable().
-//	accum_update: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
+//	lr: Learning rate. Must be a scalar.
 //	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
+func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15075,99 +15481,67 @@ func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdadelta",
+		Type: "ResourceSparseApplyAdagrad",
 		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad,
+			var_, accum, lr, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// StageSizeAttr is an optional argument to StageSize.
-type StageSizeAttr func(optionalAttr)
+// LRNGradAttr is an optional argument to LRNGrad.
+type LRNGradAttr func(optionalAttr)
 
-// StageSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// LRNGradDepthRadius sets the optional depth_radius attribute to value.
 //
-// REQUIRES: value >= 0
-func StageSizeCapacity(value int64) StageSizeAttr {
+// value: A depth radius.
+// If not specified, defaults to 5
+func LRNGradDepthRadius(value int64) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["depth_radius"] = value
 	}
 }
 
-// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// LRNGradBias sets the optional bias attribute to value.
 //
-// REQUIRES: value >= 0
-func StageSizeMemoryLimit(value int64) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// StageSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StageSizeContainer(value string) StageSizeAttr {
+// value: An offset (usually > 0 to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNGradBias(value float32) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["bias"] = value
 	}
 }
 
-// StageSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StageSizeSharedName(value string) StageSizeAttr {
+// LRNGradAlpha sets the optional alpha attribute to value.
+//
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNGradAlpha(value float32) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op returns the number of elements in the underlying container.
-func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StageSize",
-
-		Attrs: attrs,
+		m["alpha"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ApplyDelayCompensatedGradientDescentAttr is an optional argument to ApplyDelayCompensatedGradientDescent.
-type ApplyDelayCompensatedGradientDescentAttr func(optionalAttr)
-
-// ApplyDelayCompensatedGradientDescentUseLocking sets the optional use_locking attribute to value.
+// LRNGradBeta sets the optional beta attribute to value.
 //
-// value: If `True`, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ApplyDelayCompensatedGradientDescentUseLocking(value bool) ApplyDelayCompensatedGradientDescentAttr {
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNGradBeta(value float32) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["beta"] = value
 	}
 }
 
-// var -= alpha * (delta + lambda * delta * (var - shadow))
-//
-// Update '*shadow' by changing it to the new value of 'var'
+// Gradients for Local Response Normalization.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	delta: The change.
-//	lambda: The variance parameter.
-//	shadow: Same as "var".
+//	input_grads: 4-D with shape `[batch, height, width, channels]`.
+//	input_image: 4-D with shape `[batch, height, width, channels]`.
+//	output_image: 4-D with shape `[batch, height, width, channels]`.
 //
-// Returns the created operation.
-func ApplyDelayCompensatedGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, lambda tf.Output, shadow tf.Output, optional ...ApplyDelayCompensatedGradientDescentAttr) (o *tf.Operation) {
+// Returns The gradients for LRN.
+func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15176,38 +15550,36 @@ func ApplyDelayCompensatedGradientDescent(scope *Scope, var_ tf.Output, alpha tf
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ApplyDelayCompensatedGradientDescent",
+		Type: "LRNGrad",
 		Input: []tf.Input{
-			var_, alpha, delta, lambda, shadow,
+			input_grads, input_image, output_image,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SizeAttr is an optional argument to Size.
-type SizeAttr func(optionalAttr)
+// StringToNumberAttr is an optional argument to StringToNumber.
+type StringToNumberAttr func(optionalAttr)
 
-// SizeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func SizeOutType(value tf.DataType) SizeAttr {
+// StringToNumberOutType sets the optional out_type attribute to value.
+//
+// value: The numeric type to interpret each string in `string_tensor` as.
+// If not specified, defaults to DT_FLOAT
+func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
 	return func(m optionalAttr) {
 		m["out_type"] = value
 	}
 }
 
-// Returns the size of a tensor.
-//
-// This operation returns an integer representing the number of elements in
-// `input`.
+// Converts each string in the input Tensor to the specified numeric type.
 //
-// For example:
+// (Note that int32 overflow results in an error while float overflow
+// results in a rounded value.)
 //
-// ```
-// # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
-// size(t) ==> 12
-// ```
-func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15216,9 +15588,9 @@ func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Size",
+		Type: "StringToNumber",
 		Input: []tf.Input{
-			input,
+			string_tensor,
 		},
 		Attrs: attrs,
 	}
@@ -15226,46 +15598,156 @@ func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output
 	return op.Output(0)
 }
 
-// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
-type ResourceApplyRMSPropAttr func(optionalAttr)
+// Returns the truth value of NOT x element-wise.
+func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogicalNot",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
+type SampleDistortedBoundingBoxAttr func(optionalAttr)
+
+// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
 //
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+//
+// value: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+// If not specified, defaults to 0.1
+func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["min_object_covered"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+//
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+//
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within in this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+//
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+//
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
 // If not specified, defaults to false
-func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
+func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["use_image_if_no_bounding_boxes"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
+// Generate a single randomly distorted bounding box for an image.
 //
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
+// For example,
 //
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
 //
-// Returns the created operation.
-func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.image_summary('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
+//
+// Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
+//
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15274,61 +15756,75 @@ func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyRMSProp",
+		Type: "SampleDistortedBoundingBox",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad,
+			image_size, bounding_boxes,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
-type ResourceApplyAdamAttr func(optionalAttr)
+// Returns the truth value of (x > y) element-wise.
+//
+// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Greater",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
+type ResourceSparseApplyRMSPropAttr func(optionalAttr)
 
-// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, m, and v tensors will be protected
+// value: If `True`, updating of the var, ms, and mom tensors is protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
+func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
+// Update '*var' according to the RMSProp algorithm.
 //
-// value: If `True`, uses the nesterov update.
-// If not specified, defaults to false
-func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update '*var' according to the Adam algorithm.
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
 //
-// lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
-// v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
-// variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	beta2_power: Must be a scalar.
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
 //	epsilon: Ridge term. Must be a scalar.
 //	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
 //
 // Returns the created operation.
-func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15337,92 +15833,56 @@ func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, b
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdam",
+		Type: "ResourceSparseApplyRMSProp",
 		Input: []tf.Input{
-			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// 3D fast Fourier transform.
-//
-// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
-// dimensions of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their 3D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.fftn with 3 dimensions.
-// @end_compatibility
-func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FFT3D",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Output a fact about factorials.
-func Fact(scope *Scope) (fact tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Fact",
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
-type StatelessRandomUniformAttr func(optionalAttr)
+// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
+type AvgPool3DGradAttr func(optionalAttr)
 
-// StatelessRandomUniformDtype sets the optional dtype attribute to value.
+// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["data_format"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom random values from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
-//
-// The outputs are a deterministic function of `shape` and `seed`.
+// Computes gradients of average pooling function.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	orig_input_shape: The original input dimensions.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns Random values with specified shape.
-func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
+// Returns The backprop for input.
+func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniform",
+		Type: "AvgPool3DGrad",
 		Input: []tf.Input{
-			shape, seed,
+			orig_input_shape, grad,
 		},
 		Attrs: attrs,
 	}
@@ -15430,105 +15890,134 @@ func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optio
 	return op.Output(0)
 }
 
-// PrintAttr is an optional argument to Print.
-type PrintAttr func(optionalAttr)
+// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
+type OrderedMapPeekAttr func(optionalAttr)
 
-// PrintMessage sets the optional message attribute to value.
+// OrderedMapPeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: A string, prefix of the error message.
-// If not specified, defaults to ""
-func PrintMessage(value string) PrintAttr {
+// REQUIRES: value >= 0
+func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
 	return func(m optionalAttr) {
-		m["message"] = value
+		m["capacity"] = value
 	}
 }
 
-// PrintFirstN sets the optional first_n attribute to value.
+// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// value: Only log `first_n` number of times. -1 disables logging.
-// If not specified, defaults to -1
-func PrintFirstN(value int64) PrintAttr {
+// REQUIRES: value >= 0
+func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
 	return func(m optionalAttr) {
-		m["first_n"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// PrintSummarize sets the optional summarize attribute to value.
-//
-// value: Only print this many entries of each tensor.
-// If not specified, defaults to 3
-func PrintSummarize(value int64) PrintAttr {
+// OrderedMapPeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
 	return func(m optionalAttr) {
-		m["summarize"] = value
+		m["container"] = value
 	}
 }
 
-// Prints a list of tensors.
-//
-// Passes `input` through to `output` and prints `data` when evaluating.
-//
-// Arguments:
-//	input: The tensor passed to `output`
-//	data: A list of tensors to print out when op is evaluated.
+// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op peeks at the values at the specified key.  If the
 //
-// Returns = The unmodified `input` tensor
-func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
+// underlying container does not contain this key
+// this op will block until it does.   This Op is optimized for
+// performance.
+func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Print",
+		Type: "OrderedMapPeek",
 		Input: []tf.Input{
-			input, tf.OutputList(data),
+			key, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapPeek", err)
+		return
+	}
+	return values
 }
 
-// ResizeAreaAttr is an optional argument to ResizeArea.
-type ResizeAreaAttr func(optionalAttr)
-
-// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
+// Adds two `SparseTensor` objects to produce another `SparseTensor`.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
+// The input `SparseTensor` objects' indices are assumed ordered in standard
+// lexicographic order.  If this is not the case, before this step run
+// `SparseReorder` to restore index ordering.
+//
+// By default, if two values sum to zero at some index, the output `SparseTensor`
+// would still include that particular location in its index, storing a zero in the
+// corresponding value slot.  To override this, callers can specify `thresh`,
+// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
+// corresponding value and index would then not be included.  In particular,
+// `thresh == 0` (default) means everything is kept and actual thresholding happens
+// only for a positive value.
+//
+// In the following shapes, `nnz` is the count after taking `thresh` into account.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
+//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
+//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
+//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
+// pair takes space.
+func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseAdd",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
-
-// Resize `images` to `size` using area interpolation.
-//
-// Input images can be of different types but output images are always float.
+
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//
+//
+func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ResizeArea",
+		Type: "BatchDataset",
 		Input: []tf.Input{
-			images, size,
+			input_dataset, batch_size,
 		},
 		Attrs: attrs,
 	}
@@ -15536,31 +16025,54 @@ func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...Resi
 	return op.Output(0)
 }
 
-// RealAttr is an optional argument to Real.
-type RealAttr func(optionalAttr)
+// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
+type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
 
-// RealTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func RealTout(value tf.DataType) RealAttr {
+// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Returns the real part of a complex number.
+// Update '*var' according to the centered RMSProp algorithm.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the real part of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
-//  part returned by this operation and *b* is the imaginary part.
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
 //
-// For example:
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.real(input) ==> [-2.25, 3.25]
-// ```
-func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15569,150 +16081,241 @@ func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Real",
+		Type: "ResourceSparseApplyCenteredRMSProp",
 		Input: []tf.Input{
-			input,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns the next record (key, value pair) produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//	queue_handle: Handle to a Queue, with string work items.
+// Flips all bits elementwise.
 //
-// Returns A scalar.A scalar.
-func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
+// The result will have exactly those bits set, that are not set in `x`. The
+// computation is performed on the underlying representation of x.
+func Invert(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadV2",
+		Type: "Invert",
 		Input: []tf.Input{
-			reader_handle, queue_handle,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Creates a dataset that zips together `input_datasets`.
-func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Computes the mean along segments of a tensor.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
+// over `j` such that `segment_ids[j] == i` and `N` is the total number of
+// values summed.
+//
+// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ZipDataset",
+		Type: "SegmentMean",
 		Input: []tf.Input{
-			tf.OutputList(input_datasets),
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
-type MutableDenseHashTableV2Attr func(optionalAttr)
+// CumprodAttr is an optional argument to Cumprod.
+type CumprodAttr func(optionalAttr)
 
-// MutableDenseHashTableV2Container sets the optional container attribute to value.
+// CumprodExclusive sets the optional exclusive attribute to value.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
+// value: If `True`, perform exclusive cumprod.
+// If not specified, defaults to false
+func CumprodExclusive(value bool) CumprodAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["exclusive"] = value
 	}
 }
 
-// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
+// CumprodReverse sets the optional reverse attribute to value.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumprodReverse(value bool) CumprodAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["reverse"] = value
 	}
 }
 
-// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+// Compute the cumulative product of the tensor `x` along `axis`.
+//
+// By default, this op performs an inclusive cumprod, which means that the first
+// element of the input is identical to the first element of the output:
+//
+// ```python
+// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
+// ```
+//
+// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+// performed instead:
+//
+// ```python
+// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
+// ```
+//
+// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Cumprod",
+		Input: []tf.Input{
+			x, axis,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
+// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
+type DestroyResourceOpAttr func(optionalAttr)
+
+// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
 //
-// value: The shape of each value.
-// If not specified, defaults to <>
-func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
+// value: whether to ignore the error when the resource
+// doesn't exist.
+// If not specified, defaults to true
+func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
 	return func(m optionalAttr) {
-		m["value_shape"] = value
+		m["ignore_lookup_error"] = value
 	}
 }
 
-// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
+// Deletes the resource specified by the handle.
 //
-// value: The initial number of hash table buckets. Must be a power
-// to 2.
-// If not specified, defaults to 131072
-func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["initial_num_buckets"] = value
+// All subsequent operations using the resource will result in a NotFound
+// error status.
+//
+// Arguments:
+//	resource: handle to the resource to delete.
+//
+// Returns the created operation.
+func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DestroyResourceOp",
+		Input: []tf.Input{
+			resource,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
+// Computes gradients for the scaled exponential linear (Selu) operation.
 //
-// value: The maximum ratio between number of entries and number of
-// buckets before growing the table. Must be between 0 and 1.
-// If not specified, defaults to 0.8
-func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["max_load_factor"] = value
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Selu operation.
+//	outputs: The outputs of the corresponding Selu operation.
+//
+// Returns The gradients: `gradients * (outputs + scale * alpha)` if outputs < 0,
+// `scale * gradients` otherwise.
+func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SeluGrad",
+		Input: []tf.Input{
+			gradients, outputs,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates an empty hash table that uses tensors as the backing store.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// It uses "open addressing" with quadratic reprobing to resolve
-// collisions.
+// The hash function is deterministic on the content of the string within the
+// process. The hash function is a keyed hash function, where attribute `key`
+// defines the key of the hash function. `key` is an array of 2 elements.
 //
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// A strong hash is important when inputs may be malicious, e.g. URLs with
+// additional components. Adversaries could try to make their inputs hash to the
+// same bucket for a denial-of-service attack or to skew the results. A strong
+// hash prevents this by making it difficult, if not infeasible, to compute inputs
+// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
+// time than `tf.string_to_hash_bucket_fast`.
 //
 // Arguments:
-//	empty_key: The key used to represent empty key buckets internally. Must not
-// be used in insert or lookup operations.
-//	value_dtype: Type of the table values.
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//	key: The key for the keyed hash function passed as a list of two uint64
+// elements.
 //
-// Returns Handle to a table.
-func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
 	opspec := tf.OpSpec{
-		Type: "MutableDenseHashTableV2",
+		Type: "StringToHashBucketStrong",
 		Input: []tf.Input{
-			empty_key,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -15720,114 +16323,116 @@ func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.D
 	return op.Output(0)
 }
 
-// LRNAttr is an optional argument to LRN.
-type LRNAttr func(optionalAttr)
+// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
+type FixedLengthRecordReaderV2Attr func(optionalAttr)
 
-// LRNDepthRadius sets the optional depth_radius attribute to value.
+// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
 //
-// value: 0-D.  Half-width of the 1-D normalization window.
-// If not specified, defaults to 5
-func LRNDepthRadius(value int64) LRNAttr {
+// value: Number of bytes in the header, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["depth_radius"] = value
+		m["header_bytes"] = value
 	}
 }
 
-// LRNBias sets the optional bias attribute to value.
+// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
 //
-// value: An offset (usually positive to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNBias(value float32) LRNAttr {
+// value: Number of bytes in the footer, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["bias"] = value
+		m["footer_bytes"] = value
 	}
 }
 
-// LRNAlpha sets the optional alpha attribute to value.
+// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
 //
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNAlpha(value float32) LRNAttr {
+// value: Number of bytes to hop before each read. Default of 0 means using
+// record_bytes.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["alpha"] = value
+		m["hop_bytes"] = value
 	}
 }
 
-// LRNBeta sets the optional beta attribute to value.
+// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNBeta(value float32) LRNAttr {
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["beta"] = value
+		m["container"] = value
 	}
 }
 
-// Local Response Normalization.
-//
-// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-// dimension), and each vector is normalized independently.  Within a given vector,
-// each component is divided by the weighted, squared sum of inputs within
-// `depth_radius`.  In detail,
+// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
 //
-//     sqr_sum[a, b, c, d] =
-//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-//     output = input / (bias + alpha * sqr_sum) ** beta
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
 //
-// For details, see [Krizhevsky et al., ImageNet classification with deep
-// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+// value: The type of encoding for the file. Currently ZLIB and GZIP
+// are supported. Defaults to none.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["encoding"] = value
+	}
+}
+
+// A Reader that outputs fixed-length records from a file.
 //
 // Arguments:
-//	input: 4-D.
-func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+//	record_bytes: Number of bytes in the record.
+//
+// Returns The handle to reference the Reader.
+func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"record_bytes": record_bytes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRN",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "FixedLengthRecordReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
-type ResourceSparseApplyAdagradAttr func(optionalAttr)
+// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
+type QuantizedRelu6Attr func(optionalAttr)
 
-// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
+// QuantizedRelu6OutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["out_type"] = value
 	}
 }
 
-// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
-//
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
 //
-// Returns the created operation.
-func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15836,67 +16441,77 @@ func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, l
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagrad",
+		Type: "QuantizedRelu6",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices,
+			features, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// LRNGradAttr is an optional argument to LRNGrad.
-type LRNGradAttr func(optionalAttr)
+// CumsumAttr is an optional argument to Cumsum.
+type CumsumAttr func(optionalAttr)
 
-// LRNGradDepthRadius sets the optional depth_radius attribute to value.
+// CumsumExclusive sets the optional exclusive attribute to value.
 //
-// value: A depth radius.
-// If not specified, defaults to 5
-func LRNGradDepthRadius(value int64) LRNGradAttr {
+// value: If `True`, perform exclusive cumsum.
+// If not specified, defaults to false
+func CumsumExclusive(value bool) CumsumAttr {
 	return func(m optionalAttr) {
-		m["depth_radius"] = value
+		m["exclusive"] = value
 	}
 }
 
-// LRNGradBias sets the optional bias attribute to value.
+// CumsumReverse sets the optional reverse attribute to value.
 //
-// value: An offset (usually > 0 to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNGradBias(value float32) LRNGradAttr {
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumsumReverse(value bool) CumsumAttr {
 	return func(m optionalAttr) {
-		m["bias"] = value
+		m["reverse"] = value
 	}
 }
 
-// LRNGradAlpha sets the optional alpha attribute to value.
+// Compute the cumulative sum of the tensor `x` along `axis`.
 //
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNGradAlpha(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
-
-// LRNGradBeta sets the optional beta attribute to value.
+// By default, this op performs an inclusive cumsum, which means that the first
+// element of the input is identical to the first element of the output:
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNGradBeta(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
-	}
-}
-
-// Gradients for Local Response Normalization.
+// ```python
+// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
+// ```
 //
-// Arguments:
-//	input_grads: 4-D with shape `[batch, height, width, channels]`.
-//	input_image: 4-D with shape `[batch, height, width, channels]`.
-//	output_image: 4-D with shape `[batch, height, width, channels]`.
+// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+// performed instead:
 //
-// Returns The gradients for LRN.
-func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
+// ```python
+// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+// ```
+//
+// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15905,9 +16520,9 @@ func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRNGrad",
+		Type: "Cumsum",
 		Input: []tf.Input{
-			input_grads, input_image, output_image,
+			x, axis,
 		},
 		Attrs: attrs,
 	}
@@ -15915,235 +16530,314 @@ func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_
 	return op.Output(0)
 }
 
-// StringToNumberAttr is an optional argument to StringToNumber.
-type StringToNumberAttr func(optionalAttr)
-
-// StringToNumberOutType sets the optional out_type attribute to value.
+// Pads a tensor with zeros.
 //
-// value: The numeric type to interpret each string in `string_tensor` as.
-// If not specified, defaults to DT_FLOAT
-func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Converts each string in the input Tensor to the specified numeric type.
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
 //
-// (Note that int32 overflow results in an error while float overflow
-// results in a rounded value.)
+// The padded size of each dimension D of the output is:
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StringToNumber",
+		Type: "Pad",
 		Input: []tf.Input{
-			string_tensor,
+			input, paddings,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of NOT x element-wise.
-func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the number of elements in the given queue.
+//
+// Arguments:
+//	handle: The handle to a queue.
+//
+// Returns The number of elements in the given queue.
+func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalNot",
+		Type: "QueueSizeV2",
 		Input: []tf.Input{
-			x,
+			handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that contains `count` elements from the `input_dataset`.
+// Outputs a `Summary` protocol buffer with a histogram.
 //
-// Arguments:
+// The generated
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// has one summary value containing a histogram for `values`.
 //
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be taken. A value of `-1` indicates that all of `input_dataset`
-// is taken.
+// This op reports an `InvalidArgument` error if any value is not finite.
 //
+// Arguments:
+//	tag: Scalar.  Tag to use for the `Summary.Value`.
+//	values: Any shape. Values to use to build the histogram.
 //
-func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TakeDataset",
+		Type: "HistogramSummary",
 		Input: []tf.Input{
-			input_dataset, count,
+			tag, values,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for the exponential linear (Elu) operation.
+// AsStringAttr is an optional argument to AsString.
+type AsStringAttr func(optionalAttr)
+
+// AsStringPrecision sets the optional precision attribute to value.
 //
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Elu operation.
-//	outputs: The outputs of the corresponding Elu operation.
+// value: The post-decimal precision to use for floating point numbers.
+// Only used if precision > -1.
+// If not specified, defaults to -1
+func AsStringPrecision(value int64) AsStringAttr {
+	return func(m optionalAttr) {
+		m["precision"] = value
+	}
+}
+
+// AsStringScientific sets the optional scientific attribute to value.
 //
-// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
-// `gradients` otherwise.
-func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+// value: Use scientific notation for floating point numbers.
+// If not specified, defaults to false
+func AsStringScientific(value bool) AsStringAttr {
+	return func(m optionalAttr) {
+		m["scientific"] = value
+	}
+}
+
+// AsStringShortest sets the optional shortest attribute to value.
+//
+// value: Use shortest representation (either scientific or standard) for
+// floating point numbers.
+// If not specified, defaults to false
+func AsStringShortest(value bool) AsStringAttr {
+	return func(m optionalAttr) {
+		m["shortest"] = value
+	}
+}
+
+// AsStringWidth sets the optional width attribute to value.
+//
+// value: Pad pre-decimal numbers to this width.
+// Applies to both floating point and integer numbers.
+// Only used if width > -1.
+// If not specified, defaults to -1
+func AsStringWidth(value int64) AsStringAttr {
+	return func(m optionalAttr) {
+		m["width"] = value
+	}
+}
+
+// AsStringFill sets the optional fill attribute to value.
+//
+// value: The value to pad if width > -1.  If empty, pads with spaces.
+// Another typical value is '0'.  String cannot be longer than 1 character.
+// If not specified, defaults to ""
+func AsStringFill(value string) AsStringAttr {
+	return func(m optionalAttr) {
+		m["fill"] = value
+	}
+}
+
+// Converts each entry in the given tensor to strings.  Supports many numeric
+//
+// types and boolean.
+func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "EluGrad",
+		Type: "AsString",
 		Input: []tf.Input{
-			gradients, outputs,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
+// Split elements of `input` based on `delimiter` into a `SparseTensor`.
 //
-// The hash function is deterministic on the content of the string within the
-// process.
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `input` based on `delimiter` and return a `SparseTensor`
+// containing the splitted tokens. Empty tokens are ignored.
 //
-// Note that the hash function may change from time to time.
-// This functionality will be deprecated and it's recommended to use
-// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
+//  empty string, each element of `input` is split into individual single-byte
+//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
+//  every character of `delimiter` is a potential split point.
 //
-// Arguments:
+// For example:
+//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
+//   will be
 //
-//	num_buckets: The number of buckets.
+//   indices = [0, 0;
+//              0, 1;
+//              1, 0;
+//              1, 1;
+//              1, 2]
+//   shape = [2, 3]
+//   values = ['hello', 'world', 'a', 'b', 'c']
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
+// Arguments:
+//	input: 1-D. Strings to split.
+//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
+//
+// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
+// tensor, where the first value is N and the second value is the maximum number
+// of tokens in a single input entry.
+func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output) (indices tf.Output, values tf.Output, shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucket",
+		Type: "StringSplit",
 		Input: []tf.Input{
-			string_tensor,
+			input, delimiter,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
-type FixedLengthRecordReaderV2Attr func(optionalAttr)
-
-// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
+// Inverse 3D real-valued fast Fourier transform.
 //
-// value: Number of bytes in the header, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["header_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
+// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 3 dimensions of `input`.
 //
-// value: Number of bytes in the footer, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["footer_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
+// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
-// value: Number of bytes to hop before each read. Default of 0 means using
-// record_bytes.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["hop_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
+// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// Returns A float32 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 3D real Fourier transform.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// @compatibility(numpy)
+// Equivalent to np.irfftn with 3 dimensions.
+// @end_compatibility
+func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IRFFT3D",
+		Input: []tf.Input{
+			input, fft_length,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// A Reader that outputs fixed-length records from a file.
-//
-// Arguments:
-//	record_bytes: Number of bytes in the record.
+// Returns the truth value of (x != y) element-wise.
 //
-// Returns The handle to reference the Reader.
-func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
+// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"record_bytes": record_bytes}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordReaderV2",
-
-		Attrs: attrs,
+		Type: "NotEqual",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
-type QuantizedRelu6Attr func(optionalAttr)
+// GatherAttr is an optional argument to Gather.
+type GatherAttr func(optionalAttr)
 
-// QuantizedRelu6OutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
+// GatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func GatherValidateIndices(value bool) GatherAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
+// Gather slices from `params` according to `indices`.
 //
-// Arguments:
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
+//
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
+//
+// If `indices` is a permutation and `len(indices) == params.shape[0]` then
+// this operation will permute `params` accordingly.
+//
+// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
+// `indices` are always validated to be within range. If assigned to GPU,
+// out-of-bound indices result in safe but unspecified behavior, which may include
+// raising an error.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+// </div>
+func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16152,231 +16846,258 @@ func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, ma
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu6",
+		Type: "Gather",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			params, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// CumsumAttr is an optional argument to Cumsum.
-type CumsumAttr func(optionalAttr)
-
-// CumsumExclusive sets the optional exclusive attribute to value.
-// If not specified, defaults to false
-func CumsumExclusive(value bool) CumsumAttr {
-	return func(m optionalAttr) {
-		m["exclusive"] = value
+// Produce a string tensor that encodes the state of a Reader.
+//
+// Not all Readers support being serialized, so this can produce an
+// Unimplemented error.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// CumsumReverse sets the optional reverse attribute to value.
-// If not specified, defaults to false
-func CumsumReverse(value bool) CumsumAttr {
-	return func(m optionalAttr) {
-		m["reverse"] = value
+	opspec := tf.OpSpec{
+		Type: "ReaderSerializeStateV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Compute the cumulative sum of the tensor `x` along `axis`.
+// Return substrings from `Tensor` of strings.
 //
-// By default, this op performs an inclusive cumsum, which means that the first
-// element of the input is identical to the first element of the output:
+// For each string in the input `Tensor`, creates a substring starting at index
+// `pos` with a total length of `len`.
+//
+// If `len` defines a substring that would extend beyond the length of the input
+// string, then as many characters as possible are used.
+//
+// If `pos` is negative or specifies a character index larger than any of the input
+// strings, then an `InvalidArgumentError` is thrown.
+//
+// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
+// Op creation.
+//
+// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
+// broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+//
+// ---
+//
+// Examples
+//
+// Using scalar `pos` and `len`:
 //
 // ```python
-// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
+// input = [b'Hello', b'World']
+// position = 1
+// length = 3
+//
+// output = [b'ell', b'orl']
 // ```
 //
-// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
-// performed instead:
+// Using `pos` and `len` with same shape as `input`:
 //
 // ```python
-// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen']]
+// position = [[1, 2, 3],
+//             [1, 2, 3],
+//             [1, 2, 3]]
+// length =   [[2, 3, 4],
+//             [4, 3, 2],
+//             [5, 5, 5]]
+//
+// output = [[b'en', b'eve', b'lve'],
+//           [b'hirt', b'urt', b'te'],
+//           [b'ixtee', b'vente', b'hteen']]
 // ```
 //
-// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
-// opposite direction:
+// Broadcasting `pos` and `len` onto `input`:
 //
-// ```python
-// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+// ```
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen'],
+//          [b'nineteen', b'twenty', b'twentyone']]
+// position = [1, 2, 3]
+// length =   [1, 2, 3]
+//
+// output = [[b'e', b'ev', b'lve'],
+//           [b'h', b'ur', b'tee'],
+//           [b'i', b've', b'hte'],
+//           [b'i', b'en', b'nty']]
 // ```
 //
-// This is more efficient than using separate `tf.reverse` ops.
+// Broadcasting `input` onto `pos` and `len`:
 //
-// The `reverse` and `exclusive` kwargs can also be combined:
+// ```
+// input = b'thirteen'
+// position = [1, 5, 7]
+// length =   [3, 2, 1]
 //
-// ```python
-// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+// output = [b'hir', b'ee', b'n"]
 // ```
-func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
+//
+// Arguments:
+//	input: Tensor of strings
+//	pos: Scalar defining the position of first character in each substring
+//	len: Scalar defining the number of characters to include in each substring
+//
+// Returns Tensor of substrings
+func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Cumsum",
+		Type: "Substr",
 		Input: []tf.Input{
-			x, axis,
+			input, pos, len,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pads a tensor with zeros.
+// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
+type StatelessRandomNormalAttr func(optionalAttr)
+
+// StatelessRandomNormalDtype sets the optional dtype attribute to value.
 //
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs deterministic pseudorandom values from a normal distribution.
 //
-// The padded size of each dimension D of the output is:
+// The generated values will have mean 0 and standard deviation 1.
 //
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+// The outputs are a deterministic function of `shape` and `seed`.
 //
-// For example:
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+// Returns Random values with specified shape.
+func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Pad",
+		Type: "StatelessRandomNormal",
 		Input: []tf.Input{
-			input, paddings,
+			shape, seed,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the number of elements in the given queue.
+// Inverse fast Fourier transform.
+//
+// Computes the inverse 1-dimensional discrete Fourier transform over the
+// inner-most dimension of `input`.
 //
 // Arguments:
-//	handle: The handle to a queue.
+//	input: A complex64 tensor.
 //
-// Returns The number of elements in the given queue.
-func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft
+// @end_compatibility
+func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueSizeV2",
+		Type: "IFFT",
 		Input: []tf.Input{
-			handle,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with a histogram.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-// The generated
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// has one summary value containing a histogram for `values`.
+// N is the size of the segment being reduced.
 //
-// This op reports an `InvalidArgument` error if any value is not finite.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
 // Arguments:
-//	tag: Scalar.  Tag to use for the `Summary.Value`.
-//	values: Any shape. Values to use to build the histogram.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "HistogramSummary",
+		Type: "SparseSegmentSqrtN",
 		Input: []tf.Input{
-			tag, values,
+			data, indices, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AsStringAttr is an optional argument to AsString.
-type AsStringAttr func(optionalAttr)
-
-// AsStringPrecision sets the optional precision attribute to value.
-//
-// value: The post-decimal precision to use for floating point numbers.
-// Only used if precision > -1.
-// If not specified, defaults to -1
-func AsStringPrecision(value int64) AsStringAttr {
-	return func(m optionalAttr) {
-		m["precision"] = value
-	}
-}
-
-// AsStringScientific sets the optional scientific attribute to value.
-//
-// value: Use scientific notation for floating point numbers.
-// If not specified, defaults to false
-func AsStringScientific(value bool) AsStringAttr {
-	return func(m optionalAttr) {
-		m["scientific"] = value
-	}
-}
+// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
+type ResizeBilinearGradAttr func(optionalAttr)
 
-// AsStringShortest sets the optional shortest attribute to value.
+// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: Use shortest representation (either scientific or standard) for
-// floating point numbers.
+// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of grads and original_image. If false, rescale by
+// orig_height / height. Treat similarly the width dimension.
 // If not specified, defaults to false
-func AsStringShortest(value bool) AsStringAttr {
-	return func(m optionalAttr) {
-		m["shortest"] = value
-	}
-}
-
-// AsStringWidth sets the optional width attribute to value.
-//
-// value: Pad pre-decimal numbers to this width.
-// Applies to both floating point and integer numbers.
-// Only used if width > -1.
-// If not specified, defaults to -1
-func AsStringWidth(value int64) AsStringAttr {
+func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
 	return func(m optionalAttr) {
-		m["width"] = value
+		m["align_corners"] = value
 	}
 }
 
-// AsStringFill sets the optional fill attribute to value.
+// Computes the gradient of bilinear interpolation.
 //
-// value: The value to pad if width > -1.  If empty, pads with spaces.
-// Another typical value is '0'.  String cannot be longer than 1 character.
-// If not specified, defaults to ""
-func AsStringFill(value string) AsStringAttr {
-	return func(m optionalAttr) {
-		m["fill"] = value
-	}
-}
-
-// Converts each entry in the given tensor to strings.  Supports many numeric
+// Arguments:
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
 //
-// types and boolean.
-func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16385,9 +17106,9 @@ func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AsString",
+		Type: "ResizeBilinearGrad",
 		Input: []tf.Input{
-			input,
+			grads, original_image,
 		},
 		Attrs: attrs,
 	}
@@ -16395,56 +17116,74 @@ func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output t
 	return op.Output(0)
 }
 
-// GatherAttr is an optional argument to Gather.
-type GatherAttr func(optionalAttr)
-
-// GatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func GatherValidateIndices(value bool) GatherAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
+// Computes the number of elements in the given table.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//
+// Returns Scalar that contains number of elements in the table.
+func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LookupTableSizeV2",
+		Input: []tf.Input{
+			table_handle,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Gather slices from `params` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
+// Component-wise divides a SparseTensor by a dense Tensor.
 //
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseDenseCwiseDiv",
+		Input: []tf.Input{
+			sp_indices, sp_values, sp_shape, dense,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Reads the value of a variable.
 //
-// If `indices` is a permutation and `len(indices) == params.shape[0]` then
-// this operation will permute `params` accordingly.
+// The tensor returned by this operation is immutable.
 //
-// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
-// `indices` are always validated to be within range. If assigned to GPU,
-// out-of-bound indices result in safe but unspecified behavior, which may include
-// raising an error.
+// The value returned by this operation is guaranteed to be influenced by all the
+// writes on which this operation depends directly or indirectly, and to not be
+// influenced by any of the writes which depend directly or indirectly on this
+// operation.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-// </div>
-func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) {
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	dtype: the dtype of the value.
+func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "Gather",
+		Type: "ReadVariableOp",
 		Input: []tf.Input{
-			params, indices,
+			resource,
 		},
 		Attrs: attrs,
 	}
@@ -16452,128 +17191,154 @@ func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...Gathe
 	return op.Output(0)
 }
 
-// Computes softsign gradients for a softsign operation.
+// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
+type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
+
+// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+//
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
 //
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding softsign operation.
-//	features: The features passed as input to the corresponding softsign operation.
+//	lr_power: Scaling factor. Must be a scalar.
 //
-// Returns The gradients: `gradients / (1 + abs(-features)) ** 2`.
-func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SoftsignGrad",
+		Type: "ResourceSparseApplyFtrlV2",
 		Input: []tf.Input{
-			gradients, features,
+			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Compute the polygamma function \\(\psi^{(n)}(x)\\).
-//
-// The polygamma function is defined as:
+// Restore a reader to a previously saved state.
 //
+// Not all Readers support being restored, so this can produce an
+// Unimplemented error.
 //
-// \\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//	state: Result of a ReaderSerializeState of a Reader with type
+// matching reader_handle.
 //
-// where \\(\psi(x)\\) is the digamma function.
-func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Returns the created operation.
+func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Polygamma",
+		Type: "ReaderRestoreStateV2",
 		Input: []tf.Input{
-			a, x,
+			reader_handle, state,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
-//
-// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
-// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
-// input channel is processed independently of the others with its own structuring
-// function. The `output` tensor has shape
-// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
-// tensor depend on the `padding` algorithm. We currently only support the default
-// "NHWC" `data_format`.
-//
-// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
-// (for consistency with `conv2d`, we use unmirrored filters):
-//
-//     output[b, y, x, c] =
-//        max_{dy, dx} input[b,
-//                           strides[1] * y + rates[1] * dy,
-//                           strides[2] * x + rates[2] * dx,
-//                           c] +
-//                     filter[dy, dx, c]
-//
-// Max-pooling is a special case when the filter has size equal to the pooling
-// kernel size and contains all zeros.
-//
-// Note on duality: The dilation of `input` by the `filter` is equal to the
-// negation of the erosion of `-input` by the reflected `filter`.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: The input stride for atrous morphological dilation. Must be:
-// `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+// Computes the absolute value of a tensor.
 //
-// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
-func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
+// Given a tensor `x`, this operation returns a tensor containing the absolute
+// value of each element in `x`. For example, if x is an input element and y is
+// an output element, this operation computes \\(y = |x|\\).
+func Abs(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Dilation2D",
+		Type: "Abs",
 		Input: []tf.Input{
-			input, filter,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EncodeBase64Attr is an optional argument to EncodeBase64.
-type EncodeBase64Attr func(optionalAttr)
+// RandomPoissonAttr is an optional argument to RandomPoisson.
+type RandomPoissonAttr func(optionalAttr)
 
-// EncodeBase64Pad sets the optional pad attribute to value.
+// RandomPoissonSeed sets the optional seed attribute to value.
 //
-// value: Bool whether padding is applied at the ends.
-// If not specified, defaults to false
-func EncodeBase64Pad(value bool) EncodeBase64Attr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomPoissonSeed(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
-		m["pad"] = value
+		m["seed"] = value
 	}
 }
 
-// Encode strings into web-safe base64 format.
+// RandomPoissonSeed2 sets the optional seed2 attribute to value.
 //
-// Refer to the following article for more information on base64 format:
-// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
-// end so that the encoded has length multiple of 4. See Padding section of the
-// link above.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomPoissonSeed2(value int64) RandomPoissonAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from the Poisson distribution(s) described by rate.
 //
-// Web-safe means that the encoder uses - and _ instead of + and /.
+// This op uses two algorithms, depending on rate. If rate >= 10, then
+// the algorithm by Hormann is used to acquire samples via
+// transformation-rejection.
+// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+//
+// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
+// random variables.
+// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
+// Programming, Volume 2. Addison Wesley
 //
 // Arguments:
-//	input: Strings to be encoded.
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in rate.
+//	rate: A tensor in which each scalar is a "rate" parameter describing the
+// associated poisson distribution.
 //
-// Returns Input strings encoded in base64.
-func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
+// Returns A tensor with shape `shape + shape(rate)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `rate[i0, i1, ...iN]`. The dtype of the output matches the dtype of
+// rate.
+func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16582,9 +17347,9 @@ func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeBase64",
+		Type: "RandomPoisson",
 		Input: []tf.Input{
-			input,
+			shape, rate,
 		},
 		Attrs: attrs,
 	}
@@ -16592,148 +17357,153 @@ func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (
 	return op.Output(0)
 }
 
-// Produce a string tensor that encodes the state of a Reader.
+// Applies softmax to a batched N-D `SparseTensor`.
 //
-// Not all Readers support being serialized, so this can produce an
-// Unimplemented error.
+// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
+// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
+//
+// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
+// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
+// zero elements do not participate*.  Specifically, the algorithm is equivalent
+// to the following:
+//
+//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
+//       with shape `[B, C]`, along the size-C dimension;
+//   (2) Masks out the original implicitly-zero locations;
+//   (3) Renormalizes the remaining elements.
+//
+// Hence, the `SparseTensor` result has exactly the same non-zero indices and
+// shape.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
+// SparseTensor, in canonical ordering.
+//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//
+// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
+func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderSerializeStateV2",
+		Type: "SparseSoftmax",
 		Input: []tf.Input{
-			reader_handle,
+			sp_indices, sp_values, sp_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Return substrings from `Tensor` of strings.
-//
-// For each string in the input `Tensor`, creates a substring starting at index
-// `pos` with a total length of `len`.
-//
-// If `len` defines a substring that would extend beyond the length of the input
-// string, then as many characters as possible are used.
-//
-// If `pos` is negative or specifies a character index larger than any of the input
-// strings, then an `InvalidArgumentError` is thrown.
-//
-// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
-// Op creation.
-//
-// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
-// broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-//
-// ---
-//
-// Examples
-//
-// Using scalar `pos` and `len`:
-//
-// ```python
-// input = [b'Hello', b'World']
-// position = 1
-// length = 3
-//
-// output = [b'ell', b'orl']
-// ```
-//
-// Using `pos` and `len` with same shape as `input`:
-//
-// ```python
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen']]
-// position = [[1, 2, 3],
-//             [1, 2, 3],
-//             [1, 2, 3]]
-// length =   [[2, 3, 4],
-//             [4, 3, 2],
-//             [5, 5, 5]]
-//
-// output = [[b'en', b'eve', b'lve'],
-//           [b'hirt', b'urt', b'te'],
-//           [b'ixtee', b'vente', b'hteen']]
-// ```
-//
-// Broadcasting `pos` and `len` onto `input`:
-//
-// ```
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen'],
-//          [b'nineteen', b'twenty', b'twentyone']]
-// position = [1, 2, 3]
-// length =   [1, 2, 3]
+// Computes gradients for SparseSegmentMean.
 //
-// output = [[b'e', b'ev', b'lve'],
-//           [b'h', b'ur', b'tee'],
-//           [b'i', b've', b'hte'],
-//           [b'i', b'en', b'nty']]
-// ```
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
-// Broadcasting `input` onto `pos` and `len`:
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentMean op.
+//	indices: indices passed to the corresponding SparseSegmentMean op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
+func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentMeanGrad",
+		Input: []tf.Input{
+			grad, indices, segment_ids, output_dim0,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Converts one or more images from RGB to HSV.
 //
-// ```
-// input = b'thirteen'
-// position = [1, 5, 7]
-// length =   [3, 2, 1]
+// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
 //
-// output = [b'hir', b'ee', b'n"]
-// ```
+// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
+// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
+// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
 //
 // Arguments:
-//	input: Tensor of strings
-//	pos: Scalar defining the position of first character in each substring
-//	len: Scalar defining the number of characters to include in each substring
+//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
 //
-// Returns Tensor of substrings
-func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output) (output tf.Output) {
+// Returns `images` converted to HSV.
+func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Substr",
+		Type: "RGBToHSV",
 		Input: []tf.Input{
-			input, pos, len,
+			images,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
-type StatelessRandomNormalAttr func(optionalAttr)
+// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
+type MatrixSolveLsAttr func(optionalAttr)
 
-// StatelessRandomNormalDtype sets the optional dtype attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
+// MatrixSolveLsFast sets the optional fast attribute to value.
+// If not specified, defaults to true
+func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["fast"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a normal distribution.
+// Solves one or more linear least-squares problems.
 //
-// The generated values will have mean 0 and standard deviation 1.
+// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Rhs is a tensor of shape `[..., M, K]`.
+// The output is a tensor shape `[..., N, K]` where each output matrix solves
+// each of the equations matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]
+// in the least squares sense.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// matrix and right-hand sides in the batch:
+//
+// `matrix`=\\(A \in \Re^{m \times n}\\),
+// `rhs`=\\(B  \in \Re^{m \times k}\\),
+// `output`=\\(X  \in \Re^{n \times k}\\),
+// `l2_regularizer`=\\(\lambda\\).
+//
+// If `fast` is `True`, then the solution is computed by solving the normal
+// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
+// \\(X = (A^T A + \lambda I)^{-1} A^T B\\), which solves the least-squares
+// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
+// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
+// \\(X = A^T (A A^T + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
+// minimum-norm solution to the under-determined linear system, i.e.
+// \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||Z||_F^2 \\), subject to
+// \\(A Z = B\\). Notice that the fast path is only numerically stable when
+// \\(A\\) is numerically full rank and has a condition number
+// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
+// sufficiently large.
+//
+// If `fast` is `False` an algorithm based on the numerically robust complete
+// orthogonal decomposition is used. This computes the minimum-norm
+// least-squares solution, even when \\(A\\) is rank deficient. This path is
+// typically 6-7 times slower than the fast path. If `fast` is `False` then
+// `l2_regularizer` is ignored.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	matrix: Shape is `[..., M, N]`.
+//	rhs: Shape is `[..., M, K]`.
+//	l2_regularizer: Scalar tensor.
 //
-// Returns Random values with specified shape.
-func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.linalg.lstsq
+// @end_compatibility
+//
+// Returns Shape is `[..., N, K]`.
+func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16742,9 +17512,9 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomNormal",
+		Type: "MatrixSolveLs",
 		Input: []tf.Input{
-			shape, seed,
+			matrix, rhs, l2_regularizer,
 		},
 		Attrs: attrs,
 	}
@@ -16752,88 +17522,137 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 	return op.Output(0)
 }
 
-// Inverse fast Fourier transform.
+// Applies sparse addition to `input` using individual values or slices
 //
-// Computes the inverse 1-dimensional discrete Fourier transform over the
-// inner-most dimension of `input`.
+// from `updates` according to indices `indices`.  The updates are non-aliasing:
+// `input` is only modified in-place if no other operations will use it.
+// Otherwise, a copy of `input` is made.  This operation has a gradient with
+// respect to both `input` and `updates`.
 //
-// Arguments:
-//	input: A complex64 tensor.
+// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+// `indices` must be integer tensor, containing indices into `input`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft
-// @end_compatibility
-func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or `(P-K)`-dimensional slices
+// (if `K < P`) along the `K`th dimension of `input`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].
+// ```
+//
+// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
+// elements. In Python, that addition would look like this:
+//
+//     input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(output))
+//
+// The resulting value `output` would look like this:
+//
+//     [1, 13, 3, 14, 14, 6, 7, 20]
+//
+// See @{tf.scatter_nd} for more details about how to make updates to slices.
+//
+// Arguments:
+//	input: A Tensor.
+//	indices: A Tensor. Must be one of the following types: `int32`, `int64`.
+// A tensor of indices into `input`.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated values
+// to add to `input`.
+//
+// Returns A `Tensor` with the same shape as `input`, containing values of `input`
+// updated with `updates`.
+func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IFFT",
+		Type: "ScatterNdNonAliasingAdd",
 		Input: []tf.Input{
-			input,
+			input, indices, updates,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
-//
-// N is the size of the segment being reduced.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// QuantizedReluXAttr is an optional argument to QuantizedReluX.
+type QuantizedReluXAttr func(optionalAttr)
+
+// QuantizedReluXOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
 //
 // Arguments:
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtN",
+		Type: "QuantizedReluX",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			features, max_value, min_features, max_features,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
-type ResizeBilinearGradAttr func(optionalAttr)
+// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
+type MergeV2CheckpointsAttr func(optionalAttr)
 
-// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
+// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
+// value: see above.
+// If not specified, defaults to true
+func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["delete_old_dirs"] = value
 	}
 }
 
-// Computes the gradient of bilinear interpolation.
+// V2 format specific: merges the metadata files of sharded checkpoints.  The
+//
+// result is one logical checkpoint, with one physical metadata file and renamed
+// data files.
+//
+// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+//
+// If delete_old_dirs is true, attempts to delete recursively the dirname of each
+// path in the input checkpoint_prefixes.  This is useful when those paths are non
+// user-facing temporary locations.
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
+//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
+// as one of the checkpoint_prefixes.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
+// Returns the created operation.
+func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16842,182 +17661,198 @@ func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinearGrad",
+		Type: "MergeV2Checkpoints",
 		Input: []tf.Input{
-			grads, original_image,
+			checkpoint_prefixes, destination_prefix,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the number of elements in the given table.
-//
-// Arguments:
-//	table_handle: Handle to the table.
+// UnpackAttr is an optional argument to Unpack.
+type UnpackAttr func(optionalAttr)
+
+// UnpackAxis sets the optional axis attribute to value.
 //
-// Returns Scalar that contains number of elements in the table.
-func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LookupTableSizeV2",
-		Input: []tf.Input{
-			table_handle,
-		},
+// value: Dimension along which to unpack.  Negative values wrap around, so the
+// valid range is `[-R, R)`.
+// If not specified, defaults to 0
+func UnpackAxis(value int64) UnpackAttr {
+	return func(m optionalAttr) {
+		m["axis"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Component-wise divides a SparseTensor by a dense Tensor.
+// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
 //
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
+// For example, given a tensor of shape `(A, B, C, D)`;
+//
+// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
+//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
+//   dimension unpacked along is gone, unlike `split`).
+//
+// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
+//   and each tensor in `output` will have shape `(A, C, D)`.
+// Etc.
+//
+// This is the opposite of `pack`.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	value: 1-D or higher, with `axis` dimension size equal to `num`.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+//
+// Returns The list of tensors unpacked from `value`.
+func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num": num}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseDiv",
+		Type: "Unpack",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("Unpack", err)
+		return
+	}
+	return output
 }
 
-// Reads the value of a variable.
+// Split a `SparseTensor` into `num_split` tensors along one dimension.
 //
-// The tensor returned by this operation is immutable.
+// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
+// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
+// For example, if `split_dim = 1` and `num_split = 2` and the input is
 //
-// The value returned by this operation is guaranteed to be influenced by all the
-// writes on which this operation depends directly or indirectly, and to not be
-// influenced by any of the writes which depend directly or indirectly on this
-// operation.
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
+//
+// Graphically the output tensors are:
+//
+//     output_tensor[0] = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     output_tensor[1] = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
 //
 // Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	dtype: the dtype of the value.
-func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
+//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
+// `[0, rank(shape))`.
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//	num_split: The number of ways to split.
+//
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "ReadVariableOp",
+		Type: "SparseSplit",
 		Input: []tf.Input{
-			resource,
+			split_dim, indices, values, shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Restore a reader to a previously saved state.
-//
-// Not all Readers support being restored, so this can produce an
-// Unimplemented error.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//	state: Result of a ReaderSerializeState of a Reader with type
-// matching reader_handle.
-//
-// Returns the created operation.
-func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "ReaderRestoreStateV2",
-		Input: []tf.Input{
-			reader_handle, state,
-		},
+	var idx int
+	var err error
+	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes the absolute value of a tensor.
-//
-// Given a tensor `x`, this operation returns a tensor containing the absolute
-// value of each element in `x`. For example, if x is an input element and y is
-// an output element, this operation computes \\(y = |x|\\).
-func Abs(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
+	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Abs",
-		Input: []tf.Input{
-			x,
-		},
+	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return output_indices, output_values, output_shape
 }
 
-// RandomPoissonAttr is an optional argument to RandomPoisson.
-type RandomPoissonAttr func(optionalAttr)
+// ReduceJoinAttr is an optional argument to ReduceJoin.
+type ReduceJoinAttr func(optionalAttr)
 
-// RandomPoissonSeed sets the optional seed attribute to value.
+// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomPoissonSeed(value int64) RandomPoissonAttr {
+// value: If `True`, retain reduced dimensions with length `1`.
+// If not specified, defaults to false
+func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// RandomPoissonSeed2 sets the optional seed2 attribute to value.
+// ReduceJoinSeparator sets the optional separator attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomPoissonSeed2(value int64) RandomPoissonAttr {
+// value: The separator to use when joining.
+// If not specified, defaults to ""
+func ReduceJoinSeparator(value string) ReduceJoinAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["separator"] = value
 	}
 }
 
-// Outputs random values from the Poisson distribution(s) described by rate.
+// Joins a string Tensor across the given dimensions.
 //
-// This op uses two algorithms, depending on rate. If rate >= 10, then
-// the algorithm by Hormann is used to acquire samples via
-// transformation-rejection.
-// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+// Computes the string join across dimensions in the given string Tensor of shape
+// `[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input
+// strings with the given separator (default: empty string).  Negative indices are
+// counted backwards from the end, with `-1` being equivalent to `n - 1`.
 //
-// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-// random variables.
-// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-// Programming, Volume 2. Addison Wesley
+// For example:
+//
+// ```python
+// # tensor `a` is [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
+// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
+// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
+// tf.reduce_join(a, [0, 1]) ==> ["acbd"]
+// tf.reduce_join(a, [1, 0]) ==> ["abcd"]
+// tf.reduce_join(a, []) ==> ["abcd"]
+// ```
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in rate.
-//	rate: A tensor in which each scalar is a "rate" parameter describing the
-// associated poisson distribution.
+//	inputs: The input to be joined.  All reduced indices must have non-zero size.
+//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
+// order specified.  Omitting `reduction_indices` is equivalent to passing
+// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
 //
-// Returns A tensor with shape `shape + shape(rate)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `rate[i0, i1, ...iN]`. The dtype of the output matches the dtype of
-// rate.
-func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
+// Returns Has shape equal to that of the input with reduced dimensions removed or
+// set to `1` depending on `keep_dims`.
+func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17026,9 +17861,9 @@ func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...Ra
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoisson",
+		Type: "ReduceJoin",
 		Input: []tf.Input{
-			shape, rate,
+			inputs, reduction_indices,
 		},
 		Attrs: attrs,
 	}
@@ -17036,153 +17871,121 @@ func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...Ra
 	return op.Output(0)
 }
 
-// Applies softmax to a batched N-D `SparseTensor`.
-//
-// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
-// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
-//
-// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
-// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
-// zero elements do not participate*.  Specifically, the algorithm is equivalent
-// to the following:
-//
-//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
-//       with shape `[B, C]`, along the size-C dimension;
-//   (2) Masks out the original implicitly-zero locations;
-//   (3) Renormalizes the remaining elements.
-//
-// Hence, the `SparseTensor` result has exactly the same non-zero indices and
-// shape.
-//
-// Arguments:
-//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
-// SparseTensor, in canonical ordering.
-//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
+// SvdAttr is an optional argument to Svd.
+type SvdAttr func(optionalAttr)
+
+// SvdComputeUv sets the optional compute_uv attribute to value.
 //
-// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
-func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If true, left and right singular vectors will be
+// computed and returned in `u` and `v`, respectively.
+// If false, `u` and `v` are not set and should never referenced.
+// If not specified, defaults to true
+func SvdComputeUv(value bool) SvdAttr {
+	return func(m optionalAttr) {
+		m["compute_uv"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SparseSoftmax",
-		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape,
-		},
+}
+
+// SvdFullMatrices sets the optional full_matrices attribute to value.
+//
+// value: If true, compute full-sized `u` and `v`. If false
+// (the default), compute only the leading `P` singular vectors.
+// Ignored if `compute_uv` is `False`.
+// If not specified, defaults to false
+func SvdFullMatrices(value bool) SvdAttr {
+	return func(m optionalAttr) {
+		m["full_matrices"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentMean.
+// Computes the singular value decompositions of one or more matrices.
 //
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
+// Computes the SVD of each inner matrix in `input` such that
+// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
+//
+// ```python
+// # a is a tensor containing a batch of matrices.
+// # s is a tensor of singular values for each matrix.
+// # u is the tensor containing of left singular vectors for each matrix.
+// # v is the tensor containing of right singular vectors for each matrix.
+// s, u, v = svd(a)
+// s, _, _ = svd(a, compute_uv=False)
+// ```
 //
 // Arguments:
-//	grad: gradient propagated to the SparseSegmentMean op.
-//	indices: indices passed to the corresponding SparseSegmentMean op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//
+// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
+// Undefined if `compute_uv` is false.
+func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanGrad",
+		Type: "Svd",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Converts one or more images from RGB to HSV.
-//
-// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
-//
-// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
-// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
-// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
+// Computes element-wise population count (a.k.a. popcount, bitsum, bitcount).
 //
-// Arguments:
-//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
+// For each entry in `x`, calculates the number of `1` (on) bits in the binary
+// representation of that entry.
 //
-// Returns `images` converted to HSV.
-func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
+// **NOTE**: It is more efficient to first `tf.bitcast` your tensors into
+// `int32` or `int64` and perform the bitcount on the result, than to feed in
+// 8- or 16-bit inputs and then aggregate the resulting counts.
+func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RGBToHSV",
+		Type: "PopulationCount",
 		Input: []tf.Input{
-			images,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
-type MatrixSolveLsAttr func(optionalAttr)
+// AssertAttr is an optional argument to Assert.
+type AssertAttr func(optionalAttr)
 
-// MatrixSolveLsFast sets the optional fast attribute to value.
-// If not specified, defaults to true
-func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
+// AssertSummarize sets the optional summarize attribute to value.
+//
+// value: Print this many entries of each tensor.
+// If not specified, defaults to 3
+func AssertSummarize(value int64) AssertAttr {
 	return func(m optionalAttr) {
-		m["fast"] = value
+		m["summarize"] = value
 	}
 }
 
-// Solves one or more linear least-squares problems.
-//
-// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Rhs is a tensor of shape `[..., M, K]`.
-// The output is a tensor shape `[..., N, K]` where each output matrix solves
-// each of the equations matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]
-// in the least squares sense.
-//
-// matrix and right-hand sides in the batch:
-//
-// `matrix`=\\(A \in \Re^{m \times n}\\),
-// `rhs`=\\(B  \in \Re^{m \times k}\\),
-// `output`=\\(X  \in \Re^{n \times k}\\),
-// `l2_regularizer`=\\(\lambda\\).
-//
-// If `fast` is `True`, then the solution is computed by solving the normal
-// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
-// \\(X = (A^T A + \lambda I)^{-1} A^T B\\), which solves the least-squares
-// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
-// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
-// \\(X = A^T (A A^T + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
-// minimum-norm solution to the under-determined linear system, i.e.
-// \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||Z||_F^2 \\), subject to
-// \\(A Z = B\\). Notice that the fast path is only numerically stable when
-// \\(A\\) is numerically full rank and has a condition number
-// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
-// sufficiently large.
+// Asserts that the given condition is true.
 //
-// If `fast` is `False` an algorithm based on the numerically robust complete
-// orthogonal decomposition is used. This computes the minimum-norm
-// least-squares solution, even when \\(A\\) is rank deficient. This path is
-// typically 6-7 times slower than the fast path. If `fast` is `False` then
-// `l2_regularizer` is ignored.
+// If `condition` evaluates to false, print the list of tensors in `data`.
+// `summarize` determines how many entries of the tensors to print.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, N]`.
-//	rhs: Shape is `[..., M, K]`.
-//	l2_regularizer: Scalar tensor.
-//
-// @compatibility(numpy)
-// Equivalent to np.linalg.lstsq
-// @end_compatibility
+//	condition: The condition to evaluate.
+//	data: The tensors to print out when condition is false.
 //
-// Returns Shape is `[..., N, K]`.
-func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
+// Returns the created operation.
+func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17191,86 +17994,104 @@ func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolveLs",
+		Type: "Assert",
 		Input: []tf.Input{
-			matrix, rhs, l2_regularizer,
+			condition, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// QuantizedReluXAttr is an optional argument to QuantizedReluX.
-type QuantizedReluXAttr func(optionalAttr)
+// RandomUniformAttr is an optional argument to RandomUniform.
+type RandomUniformAttr func(optionalAttr)
 
-// QuantizedReluXOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
+// RandomUniformSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformSeed(value int64) RandomUniformAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["seed"] = value
 	}
 }
 
-// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
+// RandomUniformSeed2 sets the optional seed2 attribute to value.
 //
-// Arguments:
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformSeed2(value int64) RandomUniformAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a uniform distribution.
 //
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Returns A tensor of the specified shape filled with uniform random values.
+func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReluX",
+		Type: "RandomUniform",
 		Input: []tf.Input{
-			features, max_value, min_features, max_features,
+			shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
-type MergeV2CheckpointsAttr func(optionalAttr)
+// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
+type ResourceApplyFtrlAttr func(optionalAttr)
 
-// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
+// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
 //
-// value: see above.
-// If not specified, defaults to true
-func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
 	return func(m optionalAttr) {
-		m["delete_old_dirs"] = value
+		m["use_locking"] = value
 	}
 }
 
-// V2 format specific: merges the metadata files of sharded checkpoints.  The
-//
-// result is one logical checkpoint, with one physical metadata file and renamed
-// data files.
-//
-// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+// Update '*var' according to the Ftrl-proximal scheme.
 //
-// If delete_old_dirs is true, attempts to delete recursively the dirname of each
-// path in the input checkpoint_prefixes.  This is useful when those paths are non
-// user-facing temporary locations.
+// accum_new = accum + grad * grad
+// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
-//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
-//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
-// as one of the checkpoint_prefixes.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 regulariation. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
 //
 // Returns the created operation.
-func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
+func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17279,267 +18100,226 @@ func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MergeV2Checkpoints",
+		Type: "ResourceApplyFtrl",
 		Input: []tf.Input{
-			checkpoint_prefixes, destination_prefix,
+			var_, accum, linear, grad, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// UnpackAttr is an optional argument to Unpack.
-type UnpackAttr func(optionalAttr)
+// AnyAttr is an optional argument to Any.
+type AnyAttr func(optionalAttr)
 
-// UnpackAxis sets the optional axis attribute to value.
+// AnyKeepDims sets the optional keep_dims attribute to value.
 //
-// value: Dimension along which to unpack.  Negative values wrap around, so the
-// valid range is `[-R, R)`.
-// If not specified, defaults to 0
-func UnpackAxis(value int64) UnpackAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AnyKeepDims(value bool) AnyAttr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
-//
-// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
-// For example, given a tensor of shape `(A, B, C, D)`;
-//
-// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
-//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
-//   dimension unpacked along is gone, unlike `split`).
-//
-// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
-//   and each tensor in `output` will have shape `(A, C, D)`.
-// Etc.
+// Computes the "logical or" of elements across dimensions of a tensor.
 //
-// This is the opposite of `pack`.
+// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	value: 1-D or higher, with `axis` dimension size equal to `num`.
-//
+//	input: The tensor to reduce.
+//	reduction_indices: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns The list of tensors unpacked from `value`.
-func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
+// Returns The reduced tensor.
+func Any(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...AnyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num": num}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Unpack",
+		Type: "Any",
 		Input: []tf.Input{
-			value,
+			input, reduction_indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
+//
+// The Hurwitz zeta function is defined as:
+//
+//
+// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("Unpack", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "Zeta",
+		Input: []tf.Input{
+			x, q,
+		},
 	}
-	return output
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Split a `SparseTensor` into `num_split` tensors along one dimension.
-//
-// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
-// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
-// For example, if `split_dim = 1` and `num_split = 2` and the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
+// Inverse real-valued fast Fourier transform.
 //
-// Graphically the output tensors are:
+// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most dimension of `input`.
 //
-//     output_tensor[0] = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
+// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
+// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
+// `fft_length` is not provided, it is computed from the size of the inner-most
+// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
+// compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
-//     output_tensor[1] = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
+// than the corresponding dimension of `input`, the dimension is cropped. If it is
+// larger, the dimension is padded with zeros.
 //
 // Arguments:
-//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
-// `[0, rank(shape))`.
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
-//	num_split: The number of ways to split.
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
+// Returns A float32 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length` samples of its inverse
+//   1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft
+// @end_compatibility
+func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "SparseSplit",
+		Type: "IRFFT",
 		Input: []tf.Input{
-			split_dim, indices, values, shape,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	return output_indices, output_values, output_shape
+	return op.Output(0)
 }
 
-// ReduceJoinAttr is an optional argument to ReduceJoin.
-type ReduceJoinAttr func(optionalAttr)
-
-// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
+// Creates a dataset with a range of values. Corresponds to python's xrange.
 //
-// value: If `True`, retain reduced dimensions with length `1`.
-// If not specified, defaults to false
-func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// ReduceJoinSeparator sets the optional separator attribute to value.
+// Arguments:
+//	start: corresponds to start in python's xrange().
+//	stop: corresponds to stop in python's xrange().
+//	step: corresponds to step in python's xrange().
 //
-// value: The separator to use when joining.
-// If not specified, defaults to ""
-func ReduceJoinSeparator(value string) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["separator"] = value
+//
+func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "RangeDataset",
+		Input: []tf.Input{
+			start, stop, step,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Joins a string Tensor across the given dimensions.
-//
-// Computes the string join across dimensions in the given string Tensor of shape
-// `[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input
-// strings with the given separator (default: empty string).  Negative indices are
-// counted backwards from the end, with `-1` being equivalent to `n - 1`.
-//
-// For example:
+// Saves tensors in V2 checkpoint format.
 //
-// ```python
-// # tensor `a` is [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
-// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
-// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
-// tf.reduce_join(a, [0, 1]) ==> ["acbd"]
-// tf.reduce_join(a, [1, 0]) ==> ["abcd"]
-// tf.reduce_join(a, []) ==> ["abcd"]
-// ```
+// By default, saves the named tensors in full.  If the caller wishes to save
+// specific slices of full tensors, "shape_and_slices" should be non-empty strings
+// and correspondingly well-formed.
 //
 // Arguments:
-//	inputs: The input to be joined.  All reduced indices must have non-zero size.
-//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
-// order specified.  Omitting `reduction_indices` is equivalent to passing
-// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
+//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
+// write the tensors.
+//	tensor_names: shape {N}. The names of the tensors to be saved.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
+// Empty strings indicate that they are non-partitioned tensors.
+//	tensors: `N` tensors to save.
 //
-// Returns Has shape equal to that of the input with reduced dimensions removed or
-// set to `1` depending on `keep_dims`.
-func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
+// Returns the created operation.
+func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ReduceJoin",
+		Type: "SaveV2",
 		Input: []tf.Input{
-			inputs, reduction_indices,
+			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// SvdAttr is an optional argument to Svd.
-type SvdAttr func(optionalAttr)
+// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
+type MatrixTriangularSolveAttr func(optionalAttr)
 
-// SvdComputeUv sets the optional compute_uv attribute to value.
+// MatrixTriangularSolveLower sets the optional lower attribute to value.
 //
-// value: If true, left and right singular vectors will be
-// computed and returned in `u` and `v`, respectively.
-// If false, `u` and `v` are not set and should never referenced.
+// value: Boolean indicating whether the innermost matrices in `matrix` are
+// lower or upper triangular.
 // If not specified, defaults to true
-func SvdComputeUv(value bool) SvdAttr {
+func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
 	return func(m optionalAttr) {
-		m["compute_uv"] = value
+		m["lower"] = value
 	}
 }
 
-// SvdFullMatrices sets the optional full_matrices attribute to value.
+// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
 //
-// value: If true, compute full-sized `u` and `v`. If false
-// (the default), compute only the leading `P` singular vectors.
-// Ignored if `compute_uv` is `False`.
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+//          adjoint.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.triangular_solve
+// @end_compatibility
 // If not specified, defaults to false
-func SvdFullMatrices(value bool) SvdAttr {
+func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
 	return func(m optionalAttr) {
-		m["full_matrices"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Computes the singular value decompositions of one or more matrices.
-//
-// Computes the SVD of each inner matrix in `input` such that
-// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
+// Solves systems of linear equations with upper or lower triangular matrices by
 //
-// ```python
-// # a is a tensor containing a batch of matrices.
-// # s is a tensor of singular values for each matrix.
-// # u is the tensor containing of left singular vectors for each matrix.
-// # v is the tensor containing of right singular vectors for each matrix.
-// s, u, v = svd(a)
-// s, _, _ = svd(a, compute_uv=False)
-// ```
+// backsubstitution.
 //
-// Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
+// square matrices. If `lower` is `True` then the strictly upper triangular part
+// of each inner-most matrix is assumed to be zero and not accessed.
+// If `lower` is False then the strictly lower triangular part of each inner-most
+// matrix is assumed to be zero and not accessed.
+// `rhs` is a tensor of shape `[..., M, K]`.
 //
-// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
-// Undefined if `compute_uv` is false.
-func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
+// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
+// `True` then the innermost matrices in output` satisfy matrix equations
+// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `False` then the strictly then the  innermost matrices in
+// `output` satisfy matrix equations
+// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+//
+// Arguments:
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
+//
+// Returns Shape is `[..., M, K]`.
+func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17548,285 +18328,237 @@ func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Svd",
+		Type: "MatrixTriangularSolve",
 		Input: []tf.Input{
-			input,
+			matrix, rhs,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// AssertAttr is an optional argument to Assert.
-type AssertAttr func(optionalAttr)
-
-// AssertSummarize sets the optional summarize attribute to value.
+// Adds a value to the current value of a variable.
 //
-// value: Print this many entries of each tensor.
-// If not specified, defaults to 3
-func AssertSummarize(value int64) AssertAttr {
-	return func(m optionalAttr) {
-		m["summarize"] = value
-	}
-}
-
-// Asserts that the given condition is true.
+// Any ReadVariableOp which depends directly or indirectly on this assign is
+// guaranteed to see the incremented value or a subsequent newer one.
 //
-// If `condition` evaluates to false, print the list of tensors in `data`.
-// `summarize` determines how many entries of the tensors to print.
+// Outputs the incremented value, which can be used to totally order the
+// increments to this variable.
 //
 // Arguments:
-//	condition: The condition to evaluate.
-//	data: The tensors to print out when condition is false.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
 //
 // Returns the created operation.
-func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
+func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Assert",
+		Type: "AssignAddVariableOp",
 		Input: []tf.Input{
-			condition, tf.OutputList(data),
+			resource, value,
 		},
-		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// RandomUniformAttr is an optional argument to RandomUniform.
-type RandomUniformAttr func(optionalAttr)
-
-// RandomUniformSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformSeed(value int64) RandomUniformAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// Computes inverse hyperbolic sine of x element-wise.
+func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// RandomUniformSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformSeed2(value int64) RandomUniformAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+	opspec := tf.OpSpec{
+		Type: "Asinh",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs random values from a uniform distribution.
+// Real-valued fast Fourier transform.
 //
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most dimension of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
+// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
+// followed by the `fft_length / 2` positive-frequency terms.
+//
+// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-// Returns A tensor of the specified shape filled with uniform random values.
-func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
+// Returns A complex64 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
+//   frequency components of its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft
+// @end_compatibility
+func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniform",
+		Type: "RFFT",
 		Input: []tf.Input{
-			shape,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
-type ResourceApplyFtrlAttr func(optionalAttr)
+// OrderedMapStageAttr is an optional argument to OrderedMapStage.
+type OrderedMapStageAttr func(optionalAttr)
 
-// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// OrderedMapStageCapacity sets the optional capacity attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["capacity"] = value
 	}
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
-//
-// accum_new = accum + grad * grad
-// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 regulariation. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Returns the created operation.
-func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrl",
-		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, lr_power,
-		},
-		Attrs: attrs,
+// REQUIRES: value >= 0
+func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// AnyAttr is an optional argument to Any.
-type AnyAttr func(optionalAttr)
+// OrderedMapStageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func OrderedMapStageContainer(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
 
-// AnyKeepDims sets the optional keep_dims attribute to value.
+// OrderedMapStageSharedName sets the optional shared_name attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AnyKeepDims(value bool) AnyAttr {
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Computes the "logical or" of elements across dimensions of a tensor.
+// Stage (key, values) in the underlying container which behaves like a ordered
 //
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// associative container.   Elements are ordered by key.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
+//	key: int64
 //
-// Returns The reduced tensor.
-func Any(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...AnyAttr) (output tf.Output) {
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
+//
+//
+// Returns the created operation.
+func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Any",
+		Type: "OrderedMapStage",
 		Input: []tf.Input{
-			input, reduction_indices,
+			key, indices, tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
-//
-// The Hurwitz zeta function is defined as:
-//
+// Computes the gradient for the tanh of `x` wrt its input.
 //
-// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
-func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
+// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
+// is the corresponding input gradient.
+func TanhGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Zeta",
+		Type: "TanhGrad",
 		Input: []tf.Input{
-			x, q,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse real-valued fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most dimension of `input`.
-//
-// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
-// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
-// `fft_length` is not provided, it is computed from the size of the inner-most
-// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
-// compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
-//
-// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
-// than the corresponding dimension of `input`, the dimension is cropped. If it is
-// larger, the dimension is padded with zeros.
+// Outputs all keys and values in the table.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//	table_handle: Handle to the table.
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length` samples of its inverse
-//   1D Fourier transform.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft
-// @end_compatibility
-func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+//
+// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
+func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
 	opspec := tf.OpSpec{
-		Type: "IRFFT",
+		Type: "LookupTableExportV2",
 		Input: []tf.Input{
-			input, fft_length,
+			table_handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Creates a dataset with a range of values. Corresponds to python's xrange.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// Arguments:
-//	start: corresponds to start in python's xrange().
-//	stop: corresponds to stop in python's xrange().
-//	step: corresponds to step in python's xrange().
+// The hash function is deterministic on the content of the string within the
+// process and will never change. However, it is not suitable for cryptography.
+// This function may be used when CPU time is scarce and inputs are trusted or
+// unimportant. There is a risk of adversaries constructing inputs that all hash
+// to the same bucket. To prevent this problem, use a strong hash function with
+// `tf.string_to_hash_bucket_strong`.
 //
+// Arguments:
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
 //
-func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "RangeDataset",
+		Type: "StringToHashBucketFast",
 		Input: []tf.Input{
-			start, stop, step,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -17834,98 +18566,110 @@ func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output,
 	return op.Output(0)
 }
 
-// Saves tensors in V2 checkpoint format.
+// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
+type TensorArrayGatherV3Attr func(optionalAttr)
+
+// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
 //
-// By default, saves the named tensors in full.  If the caller wishes to save
-// specific slices of full tensors, "shape_and_slices" should be non-empty strings
-// and correspondingly well-formed.
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// Gather specific elements from the TensorArray into output `value`.
+//
+// All elements selected by `indices` must have the same shape.
 //
 // Arguments:
-//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
-// write the tensors.
-//	tensor_names: shape {N}. The names of the tensors to be saved.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
-// Empty strings indicate that they are non-partitioned tensors.
-//	tensors: `N` tensors to save.
+//	handle: The handle to a TensorArray.
+//	indices: The locations in the TensorArray from which to read tensor elements.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
 //
-// Returns the created operation.
-func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
+// Returns All of the elements in the TensorArray, concatenated along a new
+// axis (the new dimension 0).
+func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SaveV2",
+		Type: "TensorArrayGatherV3",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
+			handle, indices, flow_in,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
-type MatrixTriangularSolveAttr func(optionalAttr)
-
-// MatrixTriangularSolveLower sets the optional lower attribute to value.
+// Deprecated. Disallowed in GraphDef version >= 2.
 //
-// value: Boolean indicating whether the innermost matrices in `matrix` are
-// lower or upper triangular.
-// If not specified, defaults to true
-func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
-	return func(m optionalAttr) {
-		m["lower"] = value
+// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
+func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AdjustContrast",
+		Input: []tf.Input{
+			images, contrast_factor, min_value, max_value,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
-//
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-//          adjoint.
+// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
+type MaxPoolGradGradAttr func(optionalAttr)
+
+// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.triangular_solve
-// @end_compatibility
-// If not specified, defaults to false
-func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["data_format"] = value
 	}
 }
 
-// Solves systems of linear equations with upper or lower triangular matrices by
-//
-// backsubstitution.
-//
-// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-// square matrices. If `lower` is `True` then the strictly upper triangular part
-// of each inner-most matrix is assumed to be zero and not accessed.
-// If `lower` is False then the strictly lower triangular part of each inner-most
-// matrix is assumed to be zero and not accessed.
-// `rhs` is a tensor of shape `[..., M, K]`.
-//
-// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
-// `True` then the innermost matrices in output` satisfy matrix equations
-// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `False` then the strictly then the  innermost matrices in
-// `output` satisfy matrix equations
-// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixTriangularSolve",
+		Type: "MaxPoolGradGrad",
 		Input: []tf.Input{
-			matrix, rhs,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -17933,259 +18677,308 @@ func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, option
 	return op.Output(0)
 }
 
-// Adds a value to the current value of a variable.
+// 3D real-valued fast Fourier transform.
 //
-// Any ReadVariableOp which depends directly or indirectly on this assign is
-// guaranteed to see the incremented value or a subsequent newer one.
+// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 3 dimensions of `input`.
 //
-// Outputs the incremented value, which can be used to totally order the
-// increments to this variable.
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
-// Returns the created operation.
-func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the their 3D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfftn with 3 dimensions.
+// @end_compatibility
+func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AssignAddVariableOp",
+		Type: "RFFT3D",
 		Input: []tf.Input{
-			resource, value,
+			input, fft_length,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Real-valued fast Fourier transform.
+// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
+type UniqueWithCountsAttr func(optionalAttr)
+
+// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements in a 1-D tensor.
 //
-// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most dimension of `input`.
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. Finally, it returns a third tensor `count` that
+// contains the count of each element of `y` in `x`. In other words:
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-// followed by the `fft_length / 2` positive-frequency terms.
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
 //
-// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// For example:
+//
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx, count = unique_with_counts(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// count ==> [2, 1, 3, 1, 2]
+// ```
+//
+// Arguments:
+//	x: 1-D.
+//
+// Returns 1-D.1-D.1-D.
+func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UniqueWithCounts",
+		Input: []tf.Input{
+			x,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Creates a dataset that skips `count` elements from the `input_dataset`.
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-//   frequency components of its 1D Fourier transform.
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be skipped.  If count is -1, skips everything.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft
-// @end_compatibility
-func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+//
+func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RFFT",
+		Type: "SkipDataset",
 		Input: []tf.Input{
-			input, fft_length,
+			input_dataset, count,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// OrderedMapStageAttr is an optional argument to OrderedMapStage.
-type OrderedMapStageAttr func(optionalAttr)
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
 
-// OrderedMapStageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["Tout"] = value
 	}
 }
 
-// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Converts two real numbers to a complex number.
 //
-// REQUIRES: value >= 0
-func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapStageContainer sets the optional container attribute to value.
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
 //
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func OrderedMapStageContainer(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// The input tensors `real` and `imag` must have the same shape.
+//
+// For example:
+//
+// ```
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// ```
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Complex",
+		Input: []tf.Input{
+			real, imag,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// OrderedMapStageSharedName sets the optional shared_name attribute to value.
-//
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
+
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["Tout"] = value
 	}
 }
 
-// Stage (key, values) in the underlying container which behaves like a ordered
-//
-// associative container.   Elements are ordered by key.
-//
-// Arguments:
-//	key: int64
+// Returns the imaginary part of a complex number.
 //
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
 //
+// For example:
 //
-// Returns the created operation.
-func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
+// ```
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapStage",
+		Type: "Imag",
 		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
+			input,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the gradient for the tanh of `x` wrt its input.
+// Creates a dataset that emits the lines of one or more text files.
 //
-// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
-// is the corresponding input gradient.
-func TanhGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TanhGrad",
+		Type: "TextLineDataset",
 		Input: []tf.Input{
-			x, y,
+			filenames, compression_type,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Outputs all keys and values in the table.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//
+// Returns the number of records this Reader has produced.
 //
+// This is the same as the number of ReaderRead executions that have
+// succeeded.
 //
-// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
-func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
 	opspec := tf.OpSpec{
-		Type: "LookupTableExportV2",
+		Type: "ReaderNumRecordsProducedV2",
 		Input: []tf.Input{
-			table_handle,
+			reader_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
-type AddManySparseToTensorsMapAttr func(optionalAttr)
-
-// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
+// Computes exponential of x - 1 element-wise.
 //
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// I.e., \\(y = (\exp x) - 1\\).
+func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Expm1",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+// Returns x - y element-wise.
 //
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// *NOTE*: `Sub` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sub",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
-//
-// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`, where
-//
-// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
-//
-// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
-// having a first `sparse_indices` column taking values between `[0, N)`, where
-// the minibatch size `N == sparse_shape[0]`.
+// Creates a dataset that contains `count` elements from the `input_dataset`.
 //
-// The input `SparseTensor` must have rank `R` greater than 1, and the first
-// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The stored
-// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
-// will have rank `R-1`.
+// Arguments:
 //
-// The `SparseTensor` values can then be read out as part of a minibatch by passing
-// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddManySparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be taken. A value of `-1` indicates that all of `input_dataset`
+// is taken.
 //
-// Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-// The minibatch size `N == sparse_shape[0]`.
 //
-// Returns 1-D.  The handles of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.  Shape: `[N]`.
-func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
+func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "AddManySparseToTensorsMap",
+		Type: "TakeDataset",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
@@ -18196,26 +18989,26 @@ func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_va
 // Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
 // The hash function is deterministic on the content of the string within the
-// process and will never change. However, it is not suitable for cryptography.
-// This function may be used when CPU time is scarce and inputs are trusted or
-// unimportant. There is a risk of adversaries constructing inputs that all hash
-// to the same bucket. To prevent this problem, use a strong hash function with
-// `tf.string_to_hash_bucket_strong`.
+// process.
+//
+// Note that the hash function may change from time to time.
+// This functionality will be deprecated and it's recommended to use
+// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
 //
 // Arguments:
-//	input: The strings to assign a hash bucket.
+//
 //	num_buckets: The number of buckets.
 //
 // Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
+func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketFast",
+		Type: "StringToHashBucket",
 		Input: []tf.Input{
-			input,
+			string_tensor,
 		},
 		Attrs: attrs,
 	}
@@ -18223,45 +19016,79 @@ func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (o
 	return op.Output(0)
 }
 
-// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
-type TensorArrayGatherV3Attr func(optionalAttr)
+// Computes gradients for the exponential linear (Elu) operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Elu operation.
+//	outputs: The outputs of the corresponding Elu operation.
+//
+// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
+// `gradients` otherwise.
+func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "EluGrad",
+		Input: []tf.Input{
+			gradients, outputs,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
+// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
+type FractionalAvgPoolGradAttr func(optionalAttr)
+
+// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["overlapping"] = value
 	}
 }
 
-// Gather specific elements from the TensorArray into output `value`.
+// Computes gradient of the FractionalAvgPool function.
 //
-// All elements selected by `indices` must have the same shape.
+// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
+// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
+// out_backprop to those indices that form the same pooling cell. Therefore, we
+// just need to know the shape of original input tensor, instead of the whole
+// tensor.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations in the TensorArray from which to read tensor elements.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_avg_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
 //
-// Returns All of the elements in the TensorArray, concatenated along a new
-// axis (the new dimension 0).
-func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
+func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV3",
+		Type: "FractionalAvgPoolGrad",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
 		Attrs: attrs,
 	}
@@ -18269,64 +19096,82 @@ func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow
 	return op.Output(0)
 }
 
-// Deprecated. Disallowed in GraphDef version >= 2.
-//
-// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
-func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
+// Computes softplus: `log(exp(features) + 1)`.
+func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustContrast",
+		Type: "Softplus",
 		Input: []tf.Input{
-			images, contrast_factor, min_value, max_value,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
-type MaxPoolGradGradAttr func(optionalAttr)
+// BatchMatMulAttr is an optional argument to BatchMatMul.
+type BatchMatMulAttr func(optionalAttr)
 
-// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
+// BatchMatMulAdjX sets the optional adj_x attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
+// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjX(value bool) BatchMatMulAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["adj_x"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// BatchMatMulAdjY sets the optional adj_y attribute to value.
+//
+// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_y"] = value
+	}
+}
+
+// Multiplies slices of two tensors in batches.
+//
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size. Each of the
+// individual slices can optionally be adjointed (to adjoint a matrix
+// means to transpose and conjugate it) before multiplication by setting
+// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
+//
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
+//
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+//
+//     r_o = c_x if adj_x else r_x
+//     c_o = r_y if adj_y else c_y
+//
+// It is computed as:
+//
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	x: 2-D or higher with shape `[..., r_x, c_x]`.
+//	y: 2-D or higher with shape `[..., r_y, c_y]`.
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
+// Returns 3-D or higher with shape `[..., r_o, c_o]`
+func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGrad",
+		Type: "BatchMatMul",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			x, y,
 		},
 		Attrs: attrs,
 	}
@@ -18334,82 +19179,114 @@ func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output,
 	return op.Output(0)
 }
 
-// 3D real-valued fast Fourier transform.
+// Computes softplus gradients for a softplus operation.
 //
-// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 3 dimensions of `input`.
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding softplus operation.
+//	features: The features passed as input to the corresponding softplus operation.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// Returns The gradients: `gradients / (1 + exp(-features))`.
+func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SoftplusGrad",
+		Input: []tf.Input{
+			gradients, features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes softsign: `features / (abs(features) + 1)`.
+func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Softsign",
+		Input: []tf.Input{
+			features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResizeBilinearAttr is an optional argument to ResizeBilinear.
+type ResizeBilinearAttr func(optionalAttr)
+
+// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using bilinear interpolation.
 //
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+// Input images can be of different types but output images are always float.
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the their 3D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfftn with 3 dimensions.
-// @end_compatibility
-func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RFFT3D",
+		Type: "ResizeBilinear",
 		Input: []tf.Input{
-			input, fft_length,
+			images, size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
-type UniqueWithCountsAttr func(optionalAttr)
+// ProdAttr is an optional argument to Prod.
+type ProdAttr func(optionalAttr)
 
-// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
+// ProdKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func ProdKeepDims(value bool) ProdAttr {
 	return func(m optionalAttr) {
-		m["out_idx"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. Finally, it returns a third tensor `count` that
-// contains the count of each element of `y` in `x`. In other words:
-//
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-//
-// For example:
+// Computes the product of elements across dimensions of a tensor.
 //
-// ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx, count = unique_with_counts(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// count ==> [2, 1, 3, 1, 2]
-// ```
+// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	x: 1-D.
+//	input: The tensor to reduce.
+//	reduction_indices: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns 1-D.1-D.1-D.
-func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
+// Returns The reduced tensor.
+func Prod(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...ProdAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18418,68 +19295,64 @@ func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAtt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UniqueWithCounts",
+		Type: "Prod",
 		Input: []tf.Input{
-			x,
+			input, reduction_indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that skips `count` elements from the `input_dataset`.
+// Computes softsign gradients for a softsign operation.
 //
 // Arguments:
+//	gradients: The backpropagated gradients to the corresponding softsign operation.
+//	features: The features passed as input to the corresponding softsign operation.
 //
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be skipped.  If count is -1, skips everything.
-//
-//
-func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns The gradients: `gradients / (1 + abs(-features)) ** 2`.
+func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SkipDataset",
+		Type: "SoftsignGrad",
 		Input: []tf.Input{
-			input_dataset, count,
+			gradients, features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
+// DecodeBmpAttr is an optional argument to DecodeBmp.
+type DecodeBmpAttr func(optionalAttr)
 
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
+// DecodeBmpChannels sets the optional channels attribute to value.
+// If not specified, defaults to 0
+func DecodeBmpChannels(value int64) DecodeBmpAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["channels"] = value
 	}
 }
 
-// Converts two real numbers to a complex number.
+// Decode the first frame of a BMP-encoded image to a uint8 tensor.
 //
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-// The input tensors `real` and `imag` must have the same shape.
+// Accepted values are:
 //
-// For example:
+// *   0: Use the number of channels in the BMP-encoded image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
 //
-// ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-// ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+// Arguments:
+//	contents: 0-D.  The BMP-encoded image.
+//
+// Returns 3-D with shape `[height, width, channels]`. RGB order
+func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18488,9 +19361,9 @@ func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Complex",
+		Type: "DecodeBmp",
 		Input: []tf.Input{
-			real, imag,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -18498,241 +19371,262 @@ func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAt
 	return op.Output(0)
 }
 
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
-
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Returns the imaginary part of a complex number.
+// Computes softmax activations.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
+// For each batch `i` and class `j` we have
 //
-// For example:
+//     softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
-// ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+// Arguments:
+//	logits: 2-D with shape `[batch_size, num_classes]`.
+//
+// Returns Same shape as `logits`.
+func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Imag",
+		Type: "Softmax",
 		Input: []tf.Input{
-			input,
+			logits,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes fingerprints of the input strings.
+// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
+type RandomShuffleQueueV2Attr func(optionalAttr)
+
+// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
 //
-// Arguments:
-//	input: vector of strings to compute fingerprints on.
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
 //
-// Returns a (N,2) shaped matrix where N is the number of elements in the input
-// vector. Each row contains the low and high parts of the fingerprint.
-func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// REQUIRES: len(value) >= 0
+func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shapes"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SdcaFprint",
-		Input: []tf.Input{
-			input,
-		},
+}
+
+// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
+//
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that emits the lines of one or more text files.
+// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
+//
+// value: Dequeue will block unless there would be this
+// many elements after the dequeue or the queue is closed. This
+// ensures a minimum level of mixing of elements.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["min_after_dequeue"] = value
+	}
+}
+
+// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 is set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// RandomShuffleQueueV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that randomizes the order of elements.
 //
 // Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-func TextLineDataset(scope *Scope, filenames tf.Output) (handle tf.Output) {
+//	component_types: The type of each component in a value.
+//
+// Returns The handle to the queue.
+func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TextLineDataset",
-		Input: []tf.Input{
-			filenames,
-		},
+		Type: "RandomShuffleQueueV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the number of records this Reader has produced.
+// Outputs a `Summary` protocol buffer with scalar values.
 //
-// This is the same as the number of ReaderRead executions that have
-// succeeded.
+// The input `tags` and `values` must have the same shape.  The generated summary
+// has a summary value for each tag-value pair in `tags` and `values`.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+//	tags: Tags for the summary.
+//	values: Same shape as `tags.  Values for the summary.
+//
+// Returns Scalar.  Serialized `Summary` protocol buffer.
+func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderNumRecordsProducedV2",
+		Type: "ScalarSummary",
 		Input: []tf.Input{
-			reader_handle,
+			tags, values,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential of x - 1 element-wise.
+// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
+type ResourceApplyAdamAttr func(optionalAttr)
+
+// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
 //
-// I.e., \\(y = (\exp x) - 1\\).
-func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Expm1",
-		Input: []tf.Input{
-			x,
-		},
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns x - y element-wise.
+// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
 //
-// *NOTE*: `Sub` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sub",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: If `True`, uses the nesterov update.
+// If not specified, defaults to false
+func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Split elements of `input` based on `delimiter` into a `SparseTensor`.
-//
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `input` based on `delimiter` and return a `SparseTensor`
-// containing the splitted tokens. Empty tokens are ignored.
-//
-// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
-//  empty string, each element of `input` is split into individual single-byte
-//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
-//  every character of `delimiter` is a potential split point.
-//
-// For example:
-//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
-//   will be
+// Update '*var' according to the Adam algorithm.
 //
-//   indices = [0, 0;
-//              0, 1;
-//              1, 0;
-//              1, 1;
-//              1, 2]
-//   shape = [2, 3]
-//   values = ['hello', 'world', 'a', 'b', 'c']
+// lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
+// v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
+// variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
 //
 // Arguments:
-//	input: 1-D. Strings to split.
-//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
-// tensor, where the first value is N and the second value is the maximum number
-// of tokens in a single input entry.
-func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output) (indices tf.Output, values tf.Output, shape tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StringSplit",
+		Type: "ResourceApplyAdam",
 		Input: []tf.Input{
-			input, delimiter,
+			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Inverse 3D real-valued fast Fourier transform.
-//
-// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 3 dimensions of `input`.
-//
-// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// 3D fast Fourier transform.
 //
-// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
+// dimensions of `input`.
 //
 // Arguments:
 //	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 3D real Fourier transform.
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their 3D Fourier transform.
 //
 // @compatibility(numpy)
-// Equivalent to np.irfftn with 3 dimensions.
+// Equivalent to np.fft.fftn with 3 dimensions.
 // @end_compatibility
-func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT3D",
+		Type: "FFT3D",
 		Input: []tf.Input{
-			input, fft_length,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x != y) element-wise.
+// Returns the truth value of (x <= y) element-wise.
 //
-// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
+// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NotEqual",
+		Type: "LessEqual",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -18741,6 +19635,30 @@ func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// Computes log softmax activations.
+//
+// For each batch `i` and class `j` we have
+//
+//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
+//
+// Arguments:
+//	logits: 2-D with shape `[batch_size, num_classes]`.
+//
+// Returns Same shape as `logits`.
+func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogSoftmax",
+		Input: []tf.Input{
+			logits,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Given a quantized tensor described by (input, input_min, input_max), outputs a
 //
 // range that covers the actual values present in that tensor.  This op is
@@ -19273,6 +20191,55 @@ func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input t
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
+
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes a 2D convolution given quantized 4D input and filter tensors.
+//
+// The inputs are quantized tensors where the lowest value represents the real
+// number of the associated minimum, and the highest represents the maximum.
+// This means that you can only interpret the quantized output in the same way, by
+// taking the returned minimum and maximum values into account.
+//
+// Arguments:
+//
+//	filter: filter's input_depth dimension must match input's depth dimensions.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_filter: The float value that the lowest quantized filter value represents.
+//	max_filter: The float value that the highest quantized filter value represents.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedConv2D",
+		Input: []tf.Input{
+			input, filter, min_input, max_input, min_filter, max_filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // Quantized Batch normalization.
 //
 // This op is deprecated and will be removed in the future. Prefer
@@ -19361,7 +20328,8 @@ func MaxKeepDims(value bool) MaxAttr {
 //
 // Arguments:
 //	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
+//	reduction_indices: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
 // Returns The reduced tensor.
 func Max(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MaxAttr) (output tf.Output) {
@@ -19577,16 +20545,34 @@ func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
+// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
+//
+// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
+// ](http://arxiv.org/abs/1511.07289)
+func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Elu",
+		Input: []tf.Input{
+			features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes scaled exponential linear: `1.758099 * (exp(features) - 1)` if < 0,
+// `1.050701 * features` otherwise.
 //
-// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
-// ](http://arxiv.org/abs/1511.07289)
-func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
+// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Elu",
+		Type: "Selu",
 		Input: []tf.Input{
 			features,
 		},
@@ -19686,7 +20672,8 @@ func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
 // Arguments:
 //
 //	dim: 0-D (scalar). Specifies the dimension index at which to
-// expand the shape of `input`.
+// expand the shape of `input`. Must be in the range
+// `[-rank(input) - 1, rank(input)]`.
 //
 // Returns Contains the same data as `input`, but its shape has an additional
 // dimension of size 1 added.
@@ -19726,7 +20713,8 @@ func AllKeepDims(value bool) AllAttr {
 //
 // Arguments:
 //	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
+//	reduction_indices: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
 // Returns The reduced tensor.
 func All(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...AllAttr) (output tf.Output) {
@@ -19955,108 +20943,354 @@ func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (out
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LinSpace",
+		Type: "LinSpace",
+		Input: []tf.Input{
+			start, stop, num,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes natural logarithm of x element-wise.
+//
+// I.e., \\(y = \log_e x\\).
+func Log(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Log",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResizeBicubicAttr is an optional argument to ResizeBicubic.
+type ResizeBicubicAttr func(optionalAttr)
+
+// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using bicubic interpolation.
+//
+// Input images can be of different types but output images are always float.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResizeBicubic",
+		Input: []tf.Input{
+			images, size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes rectified linear 6 gradients for a Relu6 operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
+//	features: The features passed as input to the corresponding Relu6 operation.
+//
+// Returns The gradients:
+// `gradients * (features > 0) * (features < 6)`.
+func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Relu6Grad",
+		Input: []tf.Input{
+			gradients, features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes natural logarithm of (1 + x) element-wise.
+//
+// I.e., \\(y = \log_e (1 + x)\\).
+func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Log1p",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that emits each dim-0 slice of `components` once.
+func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "TensorSliceDataset",
+		Input: []tf.Input{
+			tf.OutputList(components),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes tan of x element-wise.
+func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Tan",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes hyperbolic cosine of x element-wise.
+func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Cosh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MapClearAttr is an optional argument to MapClear.
+type MapClearAttr func(optionalAttr)
+
+// MapClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapClearCapacity(value int64) MapClearAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapClearMemoryLimit(value int64) MapClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapClearContainer(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapClearSharedName(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
+//
+// Returns the created operation.
+func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapClear",
+
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// TensorArrayV2Attr is an optional argument to TensorArrayV2.
+type TensorArrayV2Attr func(optionalAttr)
+
+// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
+// If not specified, defaults to false
+func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["dynamic_size"] = value
+	}
+}
+
+// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
+// If not specified, defaults to true
+func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
+	}
+}
+
+// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
+// If not specified, defaults to ""
+func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["tensor_array_name"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayV3
+func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayV2",
 		Input: []tf.Input{
-			start, stop, num,
+			size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes natural logarithm of x element-wise.
+// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`.
 //
-// I.e., \\(y = \log_e x\\).
-func Log(scope *Scope, x tf.Output) (y tf.Output) {
+// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+// is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The serialized
+// `SparseTensor` objects going into each row of `serialized_sparse` will have
+// rank `R-1`.
+//
+// The minibatch size `N` is extracted from `sparse_shape[0]`.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Log",
+		Type: "SerializeManySparse",
 		Input: []tf.Input{
-			x,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBicubicAttr is an optional argument to ResizeBicubic.
-type ResizeBicubicAttr func(optionalAttr)
-
-// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Resize `images` to `size` using bicubic interpolation.
-//
-// Input images can be of different types but output images are always float.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+// Computes inverse hyperbolic cosine of x element-wise.
+func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubic",
+		Type: "Acosh",
 		Input: []tf.Input{
-			images, size,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes rectified linear 6 gradients for a Relu6 operation.
+// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
+//
+// For an explanation see "Differentiation of the Cholesky algorithm" by
+// Iain Murray http://arxiv.org/abs/1602.07527.
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
-//	features: The features passed as input to the corresponding Relu6 operation.
+//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
 //
-// Returns The gradients:
-// `gradients * (features > 0) * (features < 6)`.
-func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
+func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu6Grad",
+		Type: "CholeskyGrad",
 		Input: []tf.Input{
-			gradients, features,
+			l, grad,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes natural logarithm of (1 + x) element-wise.
-//
-// I.e., \\(y = \log_e (1 + x)\\).
-func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes inverse hyperbolic tangent of x element-wise.
+func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Log1p",
+		Type: "Atanh",
 		Input: []tf.Input{
 			x,
 		},
@@ -20130,6 +21364,53 @@ func Erf(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Gather slices from `params` axis `axis` according to `indices`.
+//
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `params.shape[:axis] + indices.shape +
+// params.shape[axis + 1:]` where:
+//
+// ```python
+//     # Scalar indices (output is rank(params) - 1).
+//     output[a_0, ..., a_n, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices, b_0, ..., b_n]
+//
+//     # Vector indices (output is rank(params)).
+//     output[a_0, ..., a_n, i, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
+//
+//     # Higher rank indices (output is rank(params) + rank(indices) - 1).
+//     output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
+//       params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
+// ```
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+// </div>
+//
+// Arguments:
+//	params: The tensor from which to gather values. Must be at least rank
+// `axis + 1`.
+//	indices: Index tensor. Must be in range `[0, params.shape[axis])`.
+//	axis: The axis in `params` to gather `indices` from. Defaults to the first
+// dimension. Supports negative indexes.
+//
+// Returns Values from `params` gathered from indices given by `indices`, with
+// shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
+func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GatherV2",
+		Input: []tf.Input{
+			params, indices, axis,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the complementary error function of `x` element-wise.
 func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
@@ -20239,38 +21520,6 @@ func Cos(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Creates a dataset that emits each dim-0 slice of `components` once.
-func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "TensorSliceDataset",
-		Input: []tf.Input{
-			tf.OutputList(components),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes tan of x element-wise.
-func Tan(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Tan",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // BatchToSpace for 4-D tensors of type T.
 //
 // This is a legacy version of the more general BatchToSpaceND.
@@ -20490,23 +21739,40 @@ func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// ArgMinAttr is an optional argument to ArgMin.
+type ArgMinAttr func(optionalAttr)
+
+// ArgMinOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMinOutputType(value tf.DataType) ArgMinAttr {
+	return func(m optionalAttr) {
+		m["output_type"] = value
+	}
+}
+
 // Returns the index with the smallest value across dimensions of a tensor.
 //
 // Note that in case of ties the identity of the return value is not guaranteed.
 //
 // Arguments:
 //
-//	dimension: int32, 0 <= dimension < rank(input).  Describes which dimension
-// of the input Tensor to reduce across. For vectors, use dimension = 0.
-func ArgMin(scope *Scope, input tf.Output, dimension tf.Output) (output tf.Output) {
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "ArgMin",
 		Input: []tf.Input{
 			input, dimension,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -20888,7 +22154,7 @@ func Exp(scope *Scope, x tf.Output) (y tf.Output) {
 //  `output[i] = numeric_limits<T>::min()`.
 //
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
 // </div>
 //
 // Arguments:
@@ -21387,48 +22653,6 @@ func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output)
 	return op.Output(0)
 }
 
-// Computes log softmax activations.
-//
-// For each batch `i` and class `j` we have
-//
-//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
-//
-// Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
-//
-// Returns Same shape as `logits`.
-func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LogSoftmax",
-		Input: []tf.Input{
-			logits,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the truth value of (x <= y) element-wise.
-//
-// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LessEqual",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Returns the truth value of x OR y element-wise.
 //
 // *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
@@ -21583,7 +22807,8 @@ func MeanKeepDims(value bool) MeanAttr {
 //
 // Arguments:
 //	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
+//	reduction_indices: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
 // Returns The reduced tensor.
 func Mean(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MeanAttr) (output tf.Output) {
@@ -21723,7 +22948,8 @@ func MinKeepDims(value bool) MinAttr {
 //
 // Arguments:
 //	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
+//	reduction_indices: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
 // Returns The reduced tensor.
 func Min(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MinAttr) (output tf.Output) {
@@ -21764,23 +22990,40 @@ func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// ArgMaxAttr is an optional argument to ArgMax.
+type ArgMaxAttr func(optionalAttr)
+
+// ArgMaxOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
+	return func(m optionalAttr) {
+		m["output_type"] = value
+	}
+}
+
 // Returns the index with the largest value across dimensions of a tensor.
 //
 // Note that in case of ties the identity of the return value is not guaranteed.
 //
 // Arguments:
 //
-//	dimension: int32, 0 <= dimension < rank(input).  Describes which dimension
-// of the input Tensor to reduce across. For vectors, use dimension = 0.
-func ArgMax(scope *Scope, input tf.Output, dimension tf.Output) (output tf.Output) {
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "ArgMax",
 		Input: []tf.Input{
 			input, dimension,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -21876,31 +23119,69 @@ func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
 // *  If `max_images` is greater than 1, the summary value tags are
 //    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
 //
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
+// The `bad_color` argument is the color to use in the generated images for
+// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
+// Each element must be in the range `[0, 255]` (It represents the value of a
+// pixel in the output image).  Non-finite values in the input tensor are
+// replaced by this tensor in the output image.  The default value is the color
+// red.
+//
+// Arguments:
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
+// `channels` is 1, 3, or 4.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ImageSummary",
+		Input: []tf.Input{
+			tag, tensor,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Bucketizes 'input' based on 'boundaries'.
+//
+// For example, if the inputs are
+//     boundaries = [0, 10, 100]
+//     input = [[-5, 10000]
+//              [150,   10]
+//              [5,    100]]
+//
+// then the output will be
+//     output = [[0, 3]
+//               [3, 2]
+//               [1, 3]]
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
+//	input: Any shape of Tensor contains with int or float type.
+//	boundaries: A sorted list of floats gives the boundary of the buckets.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
+// Returns Same shape with 'input', each value of input replaced with bucket index.
+//
+// @compatibility(numpy)
+// Equivalent to np.digitize.
+// @end_compatibility
+func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"boundaries": boundaries}
 	opspec := tf.OpSpec{
-		Type: "ImageSummary",
+		Type: "Bucketize",
 		Input: []tf.Input{
-			tag, tensor,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -21950,44 +23231,6 @@ func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output,
 	return op.Output(0), op.Output(1)
 }
 
-// Bucketizes 'input' based on 'boundaries'.
-//
-// For example, if the inputs are
-//     boundaries = [0, 10, 100]
-//     input = [[-5, 10000]
-//              [150,   10]
-//              [5,    100]]
-//
-// then the output will be
-//     output = [[0, 3]
-//               [3, 2]
-//               [1, 3]]
-//
-// Arguments:
-//	input: Any shape of Tensor contains with int or float type.
-//	boundaries: A sorted list of floats gives the boundary of the buckets.
-//
-// Returns Same shape with 'input', each value of input replaced with bucket index.
-//
-// @compatibility(numpy)
-// Equivalent to np.digitize.
-// @end_compatibility
-func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"boundaries": boundaries}
-	opspec := tf.OpSpec{
-		Type: "Bucketize",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Computes the product along segments of a tensor.
 //
 // Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
@@ -22065,6 +23308,21 @@ func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num
 	return op.Output(0)
 }
 
+// Computes hyperbolic sine of x element-wise.
+func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sinh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the sum along sparse segments of a tensor.
 //
 // Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
@@ -22723,6 +23981,51 @@ func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Outp
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
+//
+// Each comparison returns a boolean `true` (if `input_value > threshold`)
+// or and `false` otherwise.
+//
+// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
+// algorithms that use hashing approximations of cosine and `L2` distances;
+// codes can be generated from an input via:
+//
+// ```python
+// codebook_size = 50
+// codebook_bits = codebook_size * 32
+// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
+//                            dtype=x.dtype,
+//                            initializer=tf.orthogonal_initializer())
+// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
+// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
+// # now codes has shape x.shape[:-1] + [codebook_size]
+// ```
+//
+// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
+// by 8.
+//
+// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
+// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
+//
+// Arguments:
+//	input: Values to compare against `threshold` and bitpack.
+//	threshold: Threshold to compare against.
+//
+// Returns The bitpacked comparisons.
+func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "CompareAndBitpack",
+		Input: []tf.Input{
+			input, threshold,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
 //
 // Arguments:
@@ -23412,59 +24715,6 @@ func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr)
 	return op.Output(0)
 }
 
-// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
-type TFRecordReaderV2Attr func(optionalAttr)
-
-// TFRecordReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
-// If not specified, defaults to ""
-func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["compression_type"] = value
-	}
-}
-
-// A Reader that outputs the records from a TensorFlow Records file.
-//
-// Returns The handle to reference the Reader.
-func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TFRecordReaderV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Adjust the saturation of one or more images.
 //
 // `images` is a tensor of at least 3 dimensions.  The last dimension is
@@ -23612,7 +24862,8 @@ func SumKeepDims(value bool) SumAttr {
 //
 // Arguments:
 //	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
+//	reduction_indices: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
 // Returns The reduced tensor.
 func Sum(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...SumAttr) (output tf.Output) {
@@ -23864,6 +25115,29 @@ func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padd
 	return op.Output(0)
 }
 
+// Pop the element at the top of the stack.
+//
+// Arguments:
+//	handle: The handle to a stack.
+//	elem_type: The type of the elem that is popped.
+//
+// Returns The tensor that is popped from the top of the stack.
+func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"elem_type": elem_type}
+	opspec := tf.OpSpec{
+		Type: "StackPopV2",
+		Input: []tf.Input{
+			handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the mean along sparse segments of a tensor.
 //
 // Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
@@ -23940,79 +25214,3 @@ func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
-
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
-
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes a 2D convolution given quantized 4D input and filter tensors.
-//
-// The inputs are quantized tensors where the lowest value represents the real
-// number of the associated minimum, and the highest represents the maximum.
-// This means that you can only interpret the quantized output in the same way, by
-// taking the returned minimum and maximum values into account.
-//
-// Arguments:
-//
-//	filter: filter's input_depth dimension must match input's depth dimensions.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_filter: The float value that the lowest quantized filter value represents.
-//	max_filter: The float value that the highest quantized filter value represents.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedConv2D",
-		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns up to `num_records` (key, value) pairs produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-// It may return less than `num_records` even before the last batch.
-//
-// Arguments:
-//	reader_handle: Handle to a `Reader`.
-//	queue_handle: Handle to a `Queue`, with string work items.
-//	num_records: number of records to read from `Reader`.
-//
-// Returns A 1-D tensor.A 1-D tensor.
-func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderReadUpToV2",
-		Input: []tf.Input{
-			reader_handle, queue_handle, num_records,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
diff --git a/tensorflow/go/shape.go b/tensorflow/go/shape.go
index 114ab5decb6553fe5431f6daae9be09b4aeeb307..8d000cb9debc82154932f831bd84f7bec9fd170a 100644
--- a/tensorflow/go/shape.go
+++ b/tensorflow/go/shape.go
@@ -59,7 +59,7 @@ func (s Shape) NumDimensions() int {
 //
 // REQUIRES: 0 <= dim < s.NumDimensions()
 func (s Shape) Size(dim int) int64 {
-	if dim < 0 || dim > s.NumDimensions() {
+	if dim < 0 || dim >= s.NumDimensions() {
 		return -1
 	}
 	return s.dims[dim]
diff --git a/tensorflow/go/tensor.go b/tensorflow/go/tensor.go
index 4a60c736b53ffefe6af1e32e72c93c01dcc0df0d..8b8909a6f811f82db0b4c00e3088df7485a4bda1 100644
--- a/tensorflow/go/tensor.go
+++ b/tensorflow/go/tensor.go
@@ -227,6 +227,7 @@ var types = []struct {
 	{reflect.TypeOf(uint16(0)), C.TF_UINT16},
 	{reflect.TypeOf(complex(float64(0), float64(0))), C.TF_COMPLEX128},
 	// TODO(apassos): support DT_RESOURCE representation in go.
+	// TODO(keveman): support DT_VARIANT representation in go.
 }
 
 // shapeAndDataTypeOf returns the data type and shape of the Tensor
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 90372660cd2b4f053e6d82e378a59b7dc571b7fa..64b3767735781e6c15bab251e8dbc730f397b160 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -22,7 +22,10 @@ java_library(
 # .aar. At some point, might make sense for a .aar rule here instead.
 filegroup(
     name = "java_sources",
-    srcs = glob(["src/main/java/org/tensorflow/*.java"]),
+    srcs = glob([
+        "src/main/java/org/tensorflow/*.java",
+        "src/main/java/org/tensorflow/types/*.java",
+    ]),
     visibility = [
         "//tensorflow/contrib/android:__pkg__",
         "//tensorflow/java:__pkg__",
@@ -31,7 +34,7 @@ filegroup(
 
 filegroup(
     name = "java_op_sources",
-    srcs = glob(["src/main/java/org/tensorflow/op/*.java"]),
+    srcs = glob(["src/main/java/org/tensorflow/op/**/*.java"]),
     visibility = [
         "//tensorflow/java:__pkg__",
     ],
@@ -162,6 +165,45 @@ java_test(
     ],
 )
 
+java_test(
+    name = "PrimitiveOpTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/op/PrimitiveOpTest.java"],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.op.PrimitiveOpTest",
+    deps = [
+        ":tensorflow",
+        ":testutil",
+        "@junit",
+    ],
+)
+
+java_test(
+    name = "OperandsTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/op/OperandsTest.java"],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.op.OperandsTest",
+    deps = [
+        ":tensorflow",
+        ":testutil",
+        "@junit",
+    ],
+)
+
+java_test(
+    name = "ConstantTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/op/core/ConstantTest.java"],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.op.core.ConstantTest",
+    deps = [
+        ":tensorflow",
+        ":testutil",
+        "@junit",
+    ],
+)
+
 filegroup(
     name = "libtensorflow_jni",
     srcs = select({
diff --git a/tensorflow/java/build_defs.bzl b/tensorflow/java/build_defs.bzl
index 750d76301e5435214bd27f2bd8fe8e672c01469c..5bd5b9a388ff334fe78d5f148ca0fc8176378bb2 100644
--- a/tensorflow/java/build_defs.bzl
+++ b/tensorflow/java/build_defs.bzl
@@ -1,5 +1,14 @@
 # -*- Python -*-
 
+# Pin to Java 1.7 to ensure broader compatibility for the Java bindings on
+# Android. Note also that the android_library bazel rule currently enforces
+# java 7
+# https://github.com/bazelbuild/bazel/blob/6c1106b1a721516d3b3db54d2e1c31b44a76fbb1/src/main/java/com/google/devtools/build/lib/bazel/rules/android/BazelAndroidSemantics.java#L73
+
+JAVA_VERSION_OPTS = [
+    "-source 7 -target 7",
+]
+
 # A more robust set of lint and errorprone checks when building
 # Java source to improve code consistency.
 
@@ -151,4 +160,4 @@ EP_DISABLED_CHECKS = [
 
 EP_OPTS = EP_ENABLED_WARNINGS + EP_DISABLED_CHECKS
 
-JAVACOPTS = XLINT_OPTS + EP_OPTS
+JAVACOPTS = JAVA_VERSION_OPTS + XLINT_OPTS + EP_OPTS
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index 91140478186820700d16ed53e4a34725874f59c8..a955f14275b94bbdd98881f321eef68124fdb9d3 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.2.0-rc2</version>
+    <version>1.3.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
@@ -19,8 +19,8 @@
         <artifactId>maven-compiler-plugin</artifactId>
         <version>3.6.1</version>
         <configuration>
-          <source>1.8</source>
-          <target>1.8</target>
+          <source>1.7</source>
+          <target>1.7</target>
         </configuration>
       </plugin>
       <plugin>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index 89b7c6528f7cd16646e88f2a82bc88019753359d..8f1062ff3bb2e39f5385859acdc52e357120e5a4 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.2.0-rc2</version>
+    <version>1.3.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index e8d8fe63781e71744dfc5557890f9f09c0eb93b0..6c378f07de690230ed5ee64e99d58de46379d94e 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.2.0-rc2</version>
+  <version>1.3.0-rc0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 6405daddef566ae10db52f7b923e7fa75a28665d..72984b740e12c4dd414e7e448b52159ee521534e 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.2.0-rc2</version>
+    <version>1.3.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
@@ -16,7 +16,7 @@
     <dependency>
       <groupId>com.google.protobuf</groupId>
       <artifactId>protobuf-java</artifactId>
-      <version>3.2.0</version>
+      <version>3.3.1</version>
     </dependency>
   </dependencies>
 
@@ -27,8 +27,8 @@
         <artifactId>maven-compiler-plugin</artifactId>
         <version>3.6.1</version>
         <configuration>
-          <source>1.8</source>
-          <target>1.8</target>
+          <source>1.7</source>
+          <target>1.7</target>
         </configuration>
       </plugin>
       <plugin>
diff --git a/tensorflow/java/maven/release.sh b/tensorflow/java/maven/release.sh
index 0bb4efbcc097b9a1fa0b0c5a70ba73ddcdee76c5..b95a4d4674e7dba785a1f4d3bc90b01e6c03d94d 100755
--- a/tensorflow/java/maven/release.sh
+++ b/tensorflow/java/maven/release.sh
@@ -17,6 +17,7 @@
 # Script to upload release artifacts for the TensorFlow Java library to
 # Maven Central. See README.md for an explanation.
 
+cd $(dirname "$0")
 TF_VERSION="$1"
 SETTINGS_XML="$2"
 shift
diff --git a/tensorflow/java/maven/run_inside_container.sh b/tensorflow/java/maven/run_inside_container.sh
index b5e2bfc3a6fa7d09510108c77fc921d08d628adf..302ed96c1280efbbc41068c9d21cc86e5bed3889 100644
--- a/tensorflow/java/maven/run_inside_container.sh
+++ b/tensorflow/java/maven/run_inside_container.sh
@@ -23,7 +23,7 @@ IS_SNAPSHOT="false"
 if [[ "${TF_VERSION}" == *"-SNAPSHOT" ]]; then
   IS_SNAPSHOT="true"
 fi
-PROTOC_RELEASE_URL="https://github.com/google/protobuf/releases/download/v3.2.0/protoc-3.2.0-linux-x86_64.zip"
+PROTOC_RELEASE_URL="https://github.com/google/protobuf/releases/download/v3.3.0/protoc-3.3.0-linux-x86_64.zip"
 
 set -ex
 
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index 31fb01510998d6cc97aa346126be12ab75eaec86..89c67d6b60b25ea7c3cbd2cbc83083db1f69b5b0 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.2.0-rc2</version>
+    <version>1.3.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
diff --git a/tensorflow/java/src/gen/perl/tftypes-runall.pl b/tensorflow/java/src/gen/perl/tftypes-runall.pl
new file mode 100644
index 0000000000000000000000000000000000000000..258c1ff8366fb75a59515e2f55f1f8648c36e0ff
--- /dev/null
+++ b/tensorflow/java/src/gen/perl/tftypes-runall.pl
@@ -0,0 +1,40 @@
+#!/usr/bin/perl
+#
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+my $script = $0;
+my $dir = `dirname $script`;
+chomp $dir;
+my $gen = "$dir/..";
+my $tfjavasrc = "$gen/..";
+my $rsrc = "$gen/resources";
+my $root = "$tfjavasrc/main/java";
+my $pkg = "$root/org/tensorflow";
+
+sub locchk {
+    (my $f) = @_;
+    if (! -r $f) {
+        print STDERR "Script tftypes-runall seems to be located in the wrong place (could not find $f)\n";
+        exit 1;
+    }
+}
+&locchk("$gen");
+&locchk("$tfjavasrc/gen");
+&locchk("$dir/tftypes.pl");
+&locchk("$rsrc/tftypes.csv");
+
+system("perl $dir/tftypes.pl -t $rsrc/tftypes.csv $pkg/types");
+# system("perl $dir/tftypes.pl -c $rsrc/tftypes.csv $rsrc/Tensors.java.tmpl > $pkg/op/Tensors.java");
diff --git a/tensorflow/java/src/gen/perl/tftypes.pl b/tensorflow/java/src/gen/perl/tftypes.pl
new file mode 100644
index 0000000000000000000000000000000000000000..86867335cb50d66d89d29dc601e632edec1add34
--- /dev/null
+++ b/tensorflow/java/src/gen/perl/tftypes.pl
@@ -0,0 +1,157 @@
+#!/usr/bin/perl
+#
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+use strict;
+
+my $copyright =
+'/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+';
+
+my $count;
+
+my $option = '-t', my $template;
+
+sub usage {
+    print "Usage: tftypes [-ctdT] <type desc file> <tmpl file>\n\n"
+         ."This script generates parts of various .java files that depend on which"
+         ."TensorFlow types are supported by the Java API and how much. For each"
+         ."such .java file, there is a .tmpl file in the same source directory in"
+         ."which the strings \@TYPEINFO\@ and \@IMPORTS\@ are replaced with"
+         ."appropriate Java code. Output code is sent to standard output.\n\n";
+
+    print "Modulo putting in the correct directory names, it can be invoked as follows:\n";
+    print "tftypes -c tftypes.csv Tensors.java.tmpl > Tensors.java\n";
+    print "tftypes -t tftypes.csv <dir>                                   [outputs files to dir]\n";
+}
+
+if ($ARGV[0] =~ m/^-/) {
+    $option = shift;
+}
+my $typedesc = shift;
+my $tmpl = shift;
+
+my $dirname;
+
+if ($option eq '-t') {
+    $dirname = $tmpl;
+}
+
+open (TMPL, "<$tmpl") || die "Cannot open $tmpl for reading\n";
+
+my $text = do { local $/; <TMPL> };
+
+my %jtypecount;
+
+my $typeinfo, my $imports;
+
+open (TYPEDESC, $typedesc);
+
+my @info = ([]);
+
+while (<TYPEDESC>) {
+    chomp;
+    my $line = $_;
+    if ($line =~ m/^TF type/) { next }
+    $line =~ s/\r$//;
+    (my $name, my $jtype, my $creat, my $default, my $desc) =
+        split /,/, $line, 5;
+    $desc =~ s/^ *//g;
+    $desc =~ s/ *$//g;
+    $jtypecount{$jtype}++;
+    if ($jtypecount{$jtype} > 1) {
+# currently allowing Java types to stand for more than one TF type, but
+# may want to revisit this.
+#       print STDERR "Ambiguous Java type for $name : $jtype\n";
+#       exit 1
+    }
+
+    push @info, [$name, $jtype, $creat, $default, $desc];
+}
+
+for (my $i = 1; $i <= $#info; $i++) {
+    (my $name, my $jtype, my $creat, my $default, my $desc) =
+        @{$info[$i]};
+    my $tfname = "TF".$name;
+    my $ucname = uc $name;
+
+    if ($option eq '-t') {
+        if ($jtype eq '') { next }
+        # Generate class declarations
+        # print STDERR "Creating $dirname/$tfname.java\n";
+        open (CLASSFILE, ">$dirname/$tfname.java") || die "Can't open $tfname.java";
+        print CLASSFILE $copyright;
+        print CLASSFILE "// GENERATED FILE. To update, edit tftypes.pl instead.\n\n";
+
+        my $fulldesc = $desc;
+        if (substr($desc, 0, 1) =~ m/^[aeoiu8]$/i) {
+            $fulldesc = "an $desc"
+        } else {
+            $fulldesc = "a $desc"
+        }
+        print CLASSFILE  "package org.tensorflow.types;\n\n"
+                        ."import org.tensorflow.DataType;\n\n";
+        print CLASSFILE  "/** Represents $fulldesc. */\n"
+                        ."public class $tfname implements TFType {\n"
+                        ."  private $tfname() {}\n"
+                        ."  static {\n"
+                        ."    Types.typeCodes.put($tfname.class, DataType.$ucname);\n"
+                        ."  }\n";
+        if ($default ne '') {
+            print CLASSFILE
+                         "  static {\n"
+                        ."    Types.scalars.put($tfname.class, $default);\n"
+                        ."  }\n";
+        }
+        print CLASSFILE  "}\n";
+        close(CLASSFILE);
+    } elsif ($option eq '-c') {
+      # Generate creator declarations for Tensors.java
+      if ($jtype ne '' && $creat eq 'y') {
+        for (my $brackets = ''; length $brackets <= 12; $brackets .= '[]') {
+            $typeinfo .=
+                "  public static Tensor<$tfname> create($jtype$brackets data) {\n"
+               ."    return Tensor.create(data, $tfname.class);\n"
+               ."  }\n";
+        }
+      }
+      if ($text =~ m/\b$tfname\b/ || $creat eq 'y') {
+            $imports .= "import org.tensorflow.types.$tfname;\n";
+      }
+    }
+}
+
+if ($option ne '-t') {
+  print "// GENERATED FILE. Edits to this file will be lost -- edit $tmpl instead.\n";
+
+  $text =~ s/\@TYPEINFO\@/$typeinfo/;
+  $text =~ s/\@IMPORTS\@/$imports/;
+
+  print $text;
+}
diff --git a/tensorflow/java/src/gen/resources/tftypes.csv b/tensorflow/java/src/gen/resources/tftypes.csv
new file mode 100644
index 0000000000000000000000000000000000000000..88acaafd3cdc7ae55f7d9c738fb453ae96f4fd62
--- /dev/null
+++ b/tensorflow/java/src/gen/resources/tftypes.csv
@@ -0,0 +1,21 @@
+TF type,Java type,Creator?,Zero value,Description
+Float,float,y,0f,32-bit single precision floating point number
+Double,double,y,0.0,64-bit double precision floating point number
+Int32,int,y,0,32-bit signed integer
+UInt8,byte,n,(byte)0,8-bit unsigned integer
+Int16,,n,(short)0,16-bit signed integer
+Int8,,n,(byte)0,8-bit signed integer
+String,byte,n,,arbitrary sequence of bytes
+Complex64,,n,,single-precision complex number
+Int64,long,y,0L,64-bit signed integer
+Bool,boolean,y,false,boolean
+QInt8,,n,,quantized int8
+QUInt8,,n,,quantized uint8
+QInt32,,n,,quantized int32
+BFloat16,,n,,float32 truncated to 16 bits. Only for cast ops.
+QInt16,,n,,quantized int16
+QUInt16,,n,,quantized uint16
+UInt16,,n,,16-bit unsigned integer
+Complex128,,n,,double-precision complex number
+Half,,n,,
+Resource,,n,,
diff --git a/tensorflow/java/src/main/java/org/tensorflow/DataType.java b/tensorflow/java/src/main/java/org/tensorflow/DataType.java
index cafa3ffc7d41d5443bd462799b7d4e6d5ae8a928..e67e266ff7da07626371142afd9e3b91a7a0eaba 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/DataType.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/DataType.java
@@ -53,12 +53,14 @@ public enum DataType {
   int c() {
     return value;
   }
+  
+  // Cached to avoid copying it
+  final private static DataType[] values = values();
 
   static DataType fromC(int c) {
-    for (DataType t : DataType.values()) {
-      if (t.c() == c) {
+    for (DataType t : values) {
+      if (t.value == c)
         return t;
-      }
     }
     throw new IllegalArgumentException(
         "DataType " + c + " is not recognized in Java (version " + TensorFlow.version() + ")");
diff --git a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java
index d817239919dac8dd8b3a22158b6b3d99650148d2..ca061770175b69e5ac13402aa70c6a997c11ab66 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java
@@ -49,7 +49,7 @@ final class NativeLibrary {
       // Either:
       // (1) The native library has already been statically loaded, OR
       // (2) The required native code has been statically linked (through a custom launcher), OR
-      // (3) The native code is part of another library (such as an an application-level libraryh)
+      // (3) The native code is part of another library (such as an application-level library)
       // that has already been loaded. For example, tensorflow/examples/android and
       // tensorflow/contrib/android include the required native code in differently named libraries.
       //
@@ -64,9 +64,11 @@ final class NativeLibrary {
     if (resource == null) {
       throw new UnsatisfiedLinkError(
           String.format(
-              "Cannot find TensorFlow native library for OS: %s, architecture: %s. "
-                  + "See https://github.com/tensorflow/tensorflow/tree/master/java/README.md "
-                  + "for possible solutions (such as building the library from source).",
+              "Cannot find TensorFlow native library for OS: %s, architecture: %s. See "
+                  + "https://github.com/tensorflow/tensorflow/tree/master/tensorflow/java/README.md"
+                  + " for possible solutions (such as building the library from source). Additional"
+                  + " information on attempts to find the native library can be obtained by adding"
+                  + " org.tensorflow.NativeLibrary.DEBUG=1 to the system properties of the JVM.",
               os(), architecture()));
     }
     try {
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Operand.java b/tensorflow/java/src/main/java/org/tensorflow/Operand.java
new file mode 100644
index 0000000000000000000000000000000000000000..695c4c1060b4f53ca2c9804f17a840d859d51309
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/Operand.java
@@ -0,0 +1,48 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow;
+
+/**
+ * Interface implemented by operands of a TensorFlow operation.
+ *
+ * <p>Example usage:
+ *
+ * <pre>{@code
+ * // The "decodeJpeg" operation can be used as an operand to the "cast" operation
+ * Operand decodeJpeg = ops.image().decodeJpeg(...);
+ * ops.math().cast(decodeJpeg, DataType.FLOAT);
+ *
+ * // The output "y" of the "unique" operation can be used as an operand to the "cast" operation
+ * Output y = ops.array().unique(...).y();
+ * ops.math().cast(y, DataType.FLOAT);
+ *
+ * // The "split" operation can be used as operand list to the "concat" operation
+ * Iterable<? extends Operand> split = ops.array().split(...);
+ * ops.array().concat(0, split);
+ * }</pre>
+ */
+public interface Operand {
+
+  /**
+   * Returns the symbolic handle of a tensor.
+   *
+   * <p>Inputs to TensorFlow operations are outputs of another TensorFlow operation. This method is
+   * used to obtain a symbolic handle that represents the computation of the input.
+   *
+   * @see OperationBuilder#addInput(Output)
+   */
+  Output asOutput();
+}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Operation.java b/tensorflow/java/src/main/java/org/tensorflow/Operation.java
index 5b89ce5e48c78ec8c965db2022dc3f527a22b39e..ec26309fba17ee187122124a120e771bee239378 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Operation.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Operation.java
@@ -73,15 +73,14 @@ public final class Operation {
   /**
    * Returns the size of the list of Tensors produced by this operation.
    *
-   * <p>An Operation has multiple named outputs, each of which produces either
-   * a single tensor or a list of tensors. This method returns the size of
-   * the list of tensors for a specific named output of the operation.
+   * <p>An Operation has multiple named outputs, each of which produces either a single tensor or a
+   * list of tensors. This method returns the size of the list of tensors for a specific named
+   * output of the operation.
    *
-   * @param name identifier of the list of tensors (of which there may
-   *        be many) produced by this operation.
+   * @param name identifier of the list of tensors (of which there may be many) produced by this
+   *     operation.
    * @return the size of the list of Tensors produced by this named output.
-   * @throws IllegalArgumentException if this operation has no output
-   *         with the provided name.
+   * @throws IllegalArgumentException if this operation has no output with the provided name.
    */
   public int outputListLength(final String name) {
     Graph.Reference r = graph.ref();
@@ -92,11 +91,81 @@ public final class Operation {
     }
   }
 
+  /**
+   * Returns symbolic handles to a list of tensors produced by this operation.
+   *
+   * @param idx index of the first tensor of the list
+   * @param length number of tensors in the list
+   * @return array of {@code Output}
+   */
+  public Output[] outputList(int idx, int length) {
+    Output[] outputs = new Output[length];
+    for (int i = 0; i < length; ++i) {
+      outputs[i] = output(idx + i);
+    }
+    return outputs;
+  }
+
   /** Returns a symbolic handle to one of the tensors produced by this operation. */
   public Output output(int idx) {
     return new Output(this, idx);
   }
 
+  @Override
+  public int hashCode() {
+    return Long.valueOf(unsafeNativeHandle).hashCode();
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (o == this) {
+      return true;
+    }
+    if (!(o instanceof Operation)) {
+      return false;
+    }
+    Operation that = (Operation) o;
+    if (graph != that.graph) {
+      return false;
+    }
+
+    // The graph object is known to be identical here, so this one
+    // reference is sufficient to validate the use of native pointers
+    // in both objects.
+    Graph.Reference r = graph.ref();
+    try {
+      return unsafeNativeHandle == that.unsafeNativeHandle;
+    } finally {
+      r.close();
+    }
+  }
+
+  @Override
+  public String toString() {
+    return String.format("<%s '%s'>", type(), name());
+  }
+
+  /**
+   * Returns the size of the given inputs list of Tensors for this operation.
+   *
+   * <p>An Operation has multiple named inputs, each of which contains either a single tensor or a
+   * list of tensors. This method returns the size of the list of tensors for a specific named input
+   * of the operation.
+   *
+   * @param name identifier of the list of tensors (of which there may be many) inputs to this
+   *     operation.
+   * @return the size of the list of Tensors produced by this named input.
+   * @throws IllegalArgumentException if this operation has no input with the provided name.
+   */
+  public int inputListLength(final String name) {
+    Graph.Reference r = graph.ref();
+    try {
+      return inputListLength(unsafeNativeHandle, name);
+    } finally {
+      r.close();
+    }
+  }
+
   long getUnsafeNativeHandle() {
     return unsafeNativeHandle;
   }
@@ -122,6 +191,7 @@ public final class Operation {
   }
 
   private final long unsafeNativeHandle;
+
   private final Graph graph;
 
   private static native String name(long handle);
@@ -132,6 +202,8 @@ public final class Operation {
 
   private static native int outputListLength(long handle, String name);
 
+  private static native int inputListLength(long handle, String name);
+
   private static native long[] shape(long graphHandle, long opHandle, int output);
 
   private static native int dtype(long graphHandle, long opHandle, int output);
diff --git a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
index 38ffa2a8e1932390780e1e762b7be2e7e7b27e8b..15077ce4395f78a9cf6cf19a7442c715657fb727 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
@@ -28,7 +28,7 @@ import java.nio.charset.Charset;
  * <pre>{@code
  * // g is a Graph instance.
  * try (Tensor c1 = Tensor.create(3.0f)) {
- *   g.opBuilder("Constant", "MyConst")
+ *   g.opBuilder("Const", "MyConst")
  *       .setAttr("dtype", c1.dataType())
  *       .setAttr("value", c1)
  *       .build();
@@ -63,6 +63,16 @@ public final class OperationBuilder {
     }
   }
 
+
+  /**
+   * Returns the builder to create an operation.
+   *
+   * <p>Inputs to TensorFlow operations are outputs of another TensorFlow operation. This method is
+   * used to add a input to a {@link OperationBuilder}.
+   *
+   * @param input {@link Output} supposed to be the input of the OperationBuilder.
+   * @return the OperationBuilder instance for chaining.
+   */
   public OperationBuilder addInput(Output input) {
     Graph.Reference r = graph.ref();
     try {
@@ -256,6 +266,21 @@ public final class OperationBuilder {
     return this;
   }
 
+  public OperationBuilder setAttr(String name,  String[] value) {
+    Charset utf8 = Charset.forName("UTF-8");
+    Object[] objects = new Object[value.length];
+    for (int i = 0; i < value.length; ++i) {
+      objects[i] = value[i].getBytes(utf8);
+    }
+    Graph.Reference r = graph.ref();
+    try {
+      setAttrStringList(unsafeNativeHandle, name, objects);
+    } finally {
+      r.close();
+    }
+    return this;
+  }
+
   private long unsafeNativeHandle;
   private Graph graph;
 
@@ -273,10 +298,7 @@ public final class OperationBuilder {
 
   // The names of all the setAttr* family functions below correspond to the C library types, not the
   // Java library types. Roughly, setAttrFoo calls the TensorFlow C library function: TF_SetAttrFoo.
-  //
   // TODO(ashankar):
-  // - setAttrStringList: Which would take in an array of byte[] (java Strings will need to be UTF-8
-  //   encoded?)
   // - setAttrShapeList: Which would take in a long[][]
 
   private static native void setAttrString(long handle, String name, byte[] value);
@@ -302,4 +324,7 @@ public final class OperationBuilder {
   private static native void setAttrTensorList(long handle, String name, long[] tensorHandle);
 
   private static native void setAttrShape(long handle, String name, long[] shape, int numDims);
+
+  private static native void setAttrStringList(long handle, String name, Object[] value);
+
 }
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Output.java b/tensorflow/java/src/main/java/org/tensorflow/Output.java
index ab128c2b30a956cf45d366decba876e4abdd70b3..8dff50fafbabd6ec0beb4edaaa03f2dc6a9d7082 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Output.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Output.java
@@ -15,13 +15,18 @@ limitations under the License.
 
 package org.tensorflow;
 
+import java.util.Objects;
+
 /**
  * A symbolic handle to a tensor produced by an {@link Operation}.
  *
  * <p>An Output is a symbolic handle to a tensor. The value of the Tensor is computed by executing
  * the {@link Operation} in a {@link Session}.
+ *
+ * <p>By implementing the {@link Operand} interface, instances of this class also act as operands to
+ * {@link org.tensorflow.op.Op Op} instances.
  */
-public final class Output {
+public final class Output implements Operand {
 
   /** Handle to the idx-th output of the Operation {@code op}. */
   public Output(Operation op, int idx) {
@@ -49,6 +54,35 @@ public final class Output {
     return operation.dtype(index);
   }
 
+  @Override
+  public Output asOutput() {
+    return this;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(operation, index);
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (o == this) {
+      return true;
+    }
+    if (o instanceof Output) {
+      Output that = (Output) o;
+      return index == that.index && operation.equals(that.operation);
+    }
+    return false;
+  }
+
+  @Override
+  public String toString() {
+    return String.format(
+        "<%s '%s:%d' shape=%s dtype=%s>",
+        operation.type(), operation.name(), index, shape().toString(), dataType());
+  }
+
   private final Operation operation;
   private final int index;
 }
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Session.java b/tensorflow/java/src/main/java/org/tensorflow/Session.java
index f73cded4e3ee53649f2d7aafbe5a4014e6c73b82..83a300a5605d4c4d9fec9d46f0a04f9c99524d28 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Session.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Session.java
@@ -113,7 +113,7 @@ public final class Session implements AutoCloseable {
    *
    * <p>A Runner runs the necessary graph fragments to execute every {@link Operation} required to
    * evaluate the {@link Tensor}s to fetch. The {@link #feed(String,int,Tensor)} call allows callers
-   * to override the value of {@link Tensor}s in the graph by substituing the provided {@link
+   * to override the value of {@link Tensor}s in the graph by substituting the provided {@link
    * Tensor}s for the outputs of the operations provided to {@link #feed(String,int,Tensor)}.
    */
   public final class Runner {
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Shape.java b/tensorflow/java/src/main/java/org/tensorflow/Shape.java
index 90d6cf7b85436f9645f279326e5b60a77c4b77f7..9aa92be111c09bfb687822c20264afe07266e356 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Shape.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Shape.java
@@ -77,7 +77,7 @@ public final class Shape {
     return shape[i];
   }
 
-  /** Succint description of the shape meant for debugging. */
+  /** Succinct description of the shape meant for debugging. */
   @Override
   public String toString() {
     if (shape == null) {
diff --git a/tensorflow/java/src/main/java/org/tensorflow/op/Op.java b/tensorflow/java/src/main/java/org/tensorflow/op/Op.java
new file mode 100644
index 0000000000000000000000000000000000000000..aa6db404571f697cee0144457756dd9b71a08756
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/op/Op.java
@@ -0,0 +1,35 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.op;
+
+/**
+ * A marker interface for all operation wrappers.
+ *
+ * <p>Operation wrappers provide strongly typed interfaces for building operations and linking them
+ * into a graph without the use of literals and indexes required by the core classes.
+ *
+ * <p>This interface allows keeping references to any operation wrapper using a common type.
+ *
+ * <pre>{@code
+ * // All values returned by an Ops call can be referred as a Op
+ * Op split = ops.array().split(...);
+ * Op shape = ops.array().shape(...);
+ *
+ * // All operations could be added to an Op collection
+ * Collection<Op> allOps = Arrays.asList(split, shape);
+ * }
+ */
+public interface Op {}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/op/Operands.java b/tensorflow/java/src/main/java/org/tensorflow/op/Operands.java
new file mode 100644
index 0000000000000000000000000000000000000000..5971103d6d19bc9c0755367618f093045bbbfb21
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/op/Operands.java
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.op;
+
+import java.util.ArrayList;
+import java.util.List;
+import org.tensorflow.Operand;
+import org.tensorflow.OperationBuilder;
+import org.tensorflow.Output;
+
+/** Utilities for manipulating operand related types and lists. */
+public final class Operands {
+
+  /**
+   * Converts a list of {@link Operand} into an array of {@link Output}.
+   *
+   * <p>Operation wrappers need to convert back a list of inputs into an array of outputs in order
+   * to build an operation, see {@link OperationBuilder#addInputList(Output[])}.
+   *
+   * @param inputs an iteration of input operands
+   * @return an array of outputs
+   */
+  public static Output[] asOutputs(Iterable<? extends Operand> inputs) {
+    List<Output> outputList = new ArrayList<>();
+    for (Operand input : inputs) {
+      outputList.add(input.asOutput());
+    }
+    return outputList.toArray(new Output[outputList.size()]);
+  }
+
+  // Disabled constructor
+  private Operands() {}
+}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/op/PrimitiveOp.java b/tensorflow/java/src/main/java/org/tensorflow/op/PrimitiveOp.java
new file mode 100644
index 0000000000000000000000000000000000000000..8e56f970416ef35737d6763fcc6bb46bc7a157c5
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/op/PrimitiveOp.java
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.op;
+
+import org.tensorflow.Operation;
+
+/**
+ * A base class for {@link Op} implementations that are backed by a single {@link Operation}.
+ *
+ * <p>Each operation registered in the TensorFlow core is a primitive and is provided as a {@code
+ * PrimitiveOp}. Custom operations working with only one primitive may also derive from this class.
+ */
+public abstract class PrimitiveOp implements Op {
+
+  @Override
+  public final int hashCode() {
+    return operation.hashCode();
+  }
+
+  @Override
+  public final boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    // Note: we consider that all objects wrapping the same operation are equal, no matter their
+    // implementation
+    if (!(obj instanceof PrimitiveOp)) {
+      return false;
+    }
+    return operation.equals(((PrimitiveOp) obj).operation);
+  }
+
+  @Override
+  public final String toString() {
+    return String.format("<%s '%s'>", operation.type(), operation.name());
+  }
+
+  /**
+   * Underlying operation. It is deliberately not exposed by a getter method to avoid any name
+   * conflict with generated methods of the subclasses.
+   */
+  protected final Operation operation;
+
+  /**
+   * Constructor.
+   *
+   * @param operation the underlying operation
+   */
+  protected PrimitiveOp(Operation operation) {
+    this.operation = operation;
+  }
+}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/op/annotation/Operator.java b/tensorflow/java/src/main/java/org/tensorflow/op/annotation/Operator.java
new file mode 100644
index 0000000000000000000000000000000000000000..59476fb43d4c08dbc213aebf4a84d05ad27ce75e
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/op/annotation/Operator.java
@@ -0,0 +1,112 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.op.annotation;
+
+import java.lang.annotation.Documented;
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+/**
+ * Annotation used by classes to make TensorFlow operations conveniently accessible via {@code
+ * org.tensorflow.op.Ops}.
+ *
+ * <p>An annotation processor (TODO: not yet implemented) builds the {@code Ops} class by
+ * aggregating all classes annotated as {@code @Operator}s. Each annotated class <b>must</b> have at
+ * least one public static factory method named {@code create} that accepts a {@link
+ * org.tensorflow.op.Scope} as its first argument. The processor then adds a convenience method in
+ * the {@code Ops} class. For example:
+ *
+ * <pre>{@code
+ * @Operator
+ * public final class MyOp implements Op {
+ *   public static MyOp create(Scope scope, Operand operand) {
+ *     ...
+ *   }
+ * }
+ * }</pre>
+ *
+ * <p>results in a method in the {@code Ops} class
+ *
+ * <pre>{@code
+ * import org.tensorflow.op.Ops;
+ * ...
+ * Ops ops = new Ops(graph);
+ * ...
+ * ops.myOp(operand);
+ * // and has exactly the same effect as calling
+ * // MyOp.create(ops.getScope(), operand);
+ * }</pre>
+ */
+@Documented
+@Target(ElementType.TYPE)
+@Retention(RetentionPolicy.CLASS)
+public @interface Operator {
+  /**
+   * Specify an optional group within the {@code Ops} class.
+   *
+   * <p>By default, an annotation processor will create convenience methods directly in the {@code
+   * Ops} class. An annotated operator may optionally choose to place the method within a group. For
+   * example:
+   *
+   * <pre>{@code
+   * @Operator(group="math")
+   * public final class Add extends PrimitiveOp implements Operand {
+   *   ...
+   * }
+   * }</pre>
+   *
+   * <p>results in the {@code add} method placed within a {@code math} group within the {@code Ops}
+   * class.
+   *
+   * <pre>{@code
+   * ops.math().add(...);
+   * }</pre>
+   *
+   * <p>The group name must be a <a
+   * href="https://docs.oracle.com/javase/specs/jls/se7/html/jls-3.html#jls-3.8">valid Java
+   * identifier</a>.
+   */
+  String group() default "";
+
+  /**
+   * Name for the wrapper method used in the {@code Ops} class.
+   *
+   * <p>By default, a processor derives the method name in the {@code Ops} class from the class name
+   * of the operator. This attribute allow you to provide a different name instead. For example:
+   *
+   * <pre>{@code
+   * @Operator(name="myOperation")
+   * public final class MyRealOperation implements Operand {
+   *   public static MyRealOperation create(...)
+   * }
+   * }</pre>
+   *
+   * <p>results in this method added to the {@code Ops} class
+   *
+   * <pre>{@code
+   * ops.myOperation(...);
+   * // and is the same as calling
+   * // MyRealOperation.create(...)
+   * }</pre>
+   *
+   * <p>The name must be a <a
+   * href="https://docs.oracle.com/javase/specs/jls/se7/html/jls-3.html#jls-3.8">valid Java
+   * identifier</a>.
+   */
+  String name() default "";
+}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/op/core/Constant.java b/tensorflow/java/src/main/java/org/tensorflow/op/core/Constant.java
new file mode 100644
index 0000000000000000000000000000000000000000..cd7931d3bb72a357aac3ceaab6f2acb3ba670ed7
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/op/core/Constant.java
@@ -0,0 +1,173 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.op.core;
+
+import java.nio.ByteBuffer;
+import java.nio.DoubleBuffer;
+import java.nio.FloatBuffer;
+import java.nio.IntBuffer;
+import java.nio.LongBuffer;
+import org.tensorflow.DataType;
+import org.tensorflow.Operand;
+import org.tensorflow.Operation;
+import org.tensorflow.Output;
+import org.tensorflow.Tensor;
+import org.tensorflow.op.PrimitiveOp;
+import org.tensorflow.op.Scope;
+import org.tensorflow.op.annotation.Operator;
+
+/** An operator producing a constant value. */
+@Operator
+public final class Constant extends PrimitiveOp implements Operand {
+  /**
+   * Create a constant from a Java object.
+   *
+   * <p>The argument {@code object} is first converted into a Tensor using {@link
+   * org.tensorflow.Tensor#create(Object)}, so only Objects supported by this method must be
+   * provided. For example:
+   *
+   * <pre>{@code
+   * Constant.create(scope, 7); // returns a constant scalar tensor 7
+   * }</pre>
+   *
+   * @param scope is a scope used to add the underlying operation.
+   * @param object a Java object representing the constant.
+   * @see org.tensorflow.Tensor#create(Object) Tensor.create
+   */
+  public static Constant create(Scope scope, Object object) {
+    try (Tensor value = Tensor.create(object)) {
+      return createWithTensor(scope, value);
+    }
+  }
+
+  /**
+   * Create a {@link DataType#INT32} constant with data from the given buffer.
+   *
+   * <p>Creates a constant with the given shape by copying elements from the buffer (starting from
+   * its current position) into the tensor. For example, if {@code shape = {2,3} } (which represents
+   * a 2x3 matrix) then the buffer must have 6 elements remaining, which will be consumed by this
+   * method.
+   *
+   * @param scope is a scope used to add the underlying operation.
+   * @param shape the tensor shape.
+   * @param data a buffer containing the tensor data.
+   * @throws IllegalArgumentException If the tensor shape is not compatible with the buffer
+   */
+  public static Constant create(Scope scope, long[] shape, IntBuffer data) {
+    try (Tensor value = Tensor.create(shape, data)) {
+      return createWithTensor(scope, value);
+    }
+  }
+
+  /**
+   * Create a {@link DataType#FLOAT} constant with data from the given buffer.
+   *
+   * <p>Creates a constant with the given shape by copying elements from the buffer (starting from
+   * its current position) into the tensor. For example, if {@code shape = {2,3} } (which represents
+   * a 2x3 matrix) then the buffer must have 6 elements remaining, which will be consumed by this
+   * method.
+   *
+   * @param scope is a scope used to add the underlying operation.
+   * @param shape the tensor shape.
+   * @param data a buffer containing the tensor data.
+   * @throws IllegalArgumentException If the tensor shape is not compatible with the buffer
+   */
+  public static Constant create(Scope scope, long[] shape, FloatBuffer data) {
+    try (Tensor value = Tensor.create(shape, data)) {
+      return createWithTensor(scope, value);
+    }
+  }
+
+  /**
+   * Create a {@link DataType#DOUBLE} constant with data from the given buffer.
+   *
+   * <p>Creates a constant with the given shape by copying elements from the buffer (starting from
+   * its current position) into the tensor. For example, if {@code shape = {2,3} } (which represents
+   * a 2x3 matrix) then the buffer must have 6 elements remaining, which will be consumed by this
+   * method.
+   *
+   * @param scope is a scope used to add the underlying operation.
+   * @param shape the tensor shape.
+   * @param data a buffer containing the tensor data.
+   * @throws IllegalArgumentException If the tensor shape is not compatible with the buffer
+   */
+  public static Constant create(Scope scope, long[] shape, DoubleBuffer data) {
+    try (Tensor value = Tensor.create(shape, data)) {
+      return createWithTensor(scope, value);
+    }
+  }
+
+  /**
+   * Create a {@link DataType#INT64} constant with data from the given buffer.
+   *
+   * <p>Creates a constant with the given shape by copying elements from the buffer (starting from
+   * its current position) into the tensor. For example, if {@code shape = {2,3} } (which represents
+   * a 2x3 matrix) then the buffer must have 6 elements remaining, which will be consumed by this
+   * method.
+   *
+   * @param scope is a scope used to add the underlying operation.
+   * @param shape the tensor shape.
+   * @param data a buffer containing the tensor data.
+   * @throws IllegalArgumentException If the tensor shape is not compatible with the buffer
+   */
+  public static Constant create(Scope scope, long[] shape, LongBuffer data) {
+    try (Tensor value = Tensor.create(shape, data)) {
+      return createWithTensor(scope, value);
+    }
+  }
+
+  /**
+   * Create a constant with data from the given buffer.
+   *
+   * <p>Creates a Constant with the provided shape of any type where the constant data has been
+   * encoded into {@code data} as per the specification of the TensorFlow <a
+   * href="https://www.tensorflow.org/code/tensorflow/c/c_api.h">C API</a>.
+   *
+   * @param scope is a scope used to add the underlying operation.
+   * @param dataType the tensor datatype.
+   * @param shape the tensor shape.
+   * @param data a buffer containing the tensor data.
+   * @throws IllegalArgumentException If the tensor datatype or shape is not compatible with the
+   *     buffer
+   */
+  public static Constant create(Scope scope, DataType dataType, long[] shape, ByteBuffer data) {
+    try (Tensor value = Tensor.create(dataType, shape, data)) {
+      return createWithTensor(scope, value);
+    }
+  }
+
+  private static Constant createWithTensor(Scope scope, Tensor value) {
+    return new Constant(
+        scope
+            .graph()
+            .opBuilder("Const", scope.makeOpName("Const"))
+            .setAttr("value", value)
+            .setAttr("dtype", value.dataType())
+            .build());
+  }
+
+  @Override
+  public Output asOutput() {
+    return output;
+  }
+
+  private Constant(Operation operation) {
+    super(operation);
+    output = operation.output(0);
+  }
+
+  private final Output output;
+}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFBool.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFBool.java
new file mode 100644
index 0000000000000000000000000000000000000000..ab34f6aa125eded4f7acafea1439559d084c9780
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/types/TFBool.java
@@ -0,0 +1,30 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// GENERATED FILE. To update, edit tftypes.pl instead.
+
+package org.tensorflow.types;
+
+import org.tensorflow.DataType;
+
+/** Represents a boolean. */
+public class TFBool implements TFType {
+  private TFBool() {}
+  static {
+    Types.typeCodes.put(TFBool.class, DataType.BOOL);
+  }
+  static {
+    Types.scalars.put(TFBool.class, false);
+  }
+}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFDouble.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFDouble.java
new file mode 100644
index 0000000000000000000000000000000000000000..49e5d9f2f3a6627201dd9af67b5698f095a9c0f0
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/types/TFDouble.java
@@ -0,0 +1,30 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// GENERATED FILE. To update, edit tftypes.pl instead.
+
+package org.tensorflow.types;
+
+import org.tensorflow.DataType;
+
+/** Represents a 64-bit double precision floating point number. */
+public class TFDouble implements TFType {
+  private TFDouble() {}
+  static {
+    Types.typeCodes.put(TFDouble.class, DataType.DOUBLE);
+  }
+  static {
+    Types.scalars.put(TFDouble.class, 0.0);
+  }
+}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFFloat.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFFloat.java
new file mode 100644
index 0000000000000000000000000000000000000000..8426ee41f01efa71cac4dc2dd8aabcafe500e1cc
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/types/TFFloat.java
@@ -0,0 +1,30 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// GENERATED FILE. To update, edit tftypes.pl instead.
+
+package org.tensorflow.types;
+
+import org.tensorflow.DataType;
+
+/** Represents a 32-bit single precision floating point number. */
+public class TFFloat implements TFType {
+  private TFFloat() {}
+  static {
+    Types.typeCodes.put(TFFloat.class, DataType.FLOAT);
+  }
+  static {
+    Types.scalars.put(TFFloat.class, 0f);
+  }
+}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFInt32.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFInt32.java
new file mode 100644
index 0000000000000000000000000000000000000000..3947b6ad095b5a4e8bc8b55561961fc91bc73966
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/types/TFInt32.java
@@ -0,0 +1,30 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// GENERATED FILE. To update, edit tftypes.pl instead.
+
+package org.tensorflow.types;
+
+import org.tensorflow.DataType;
+
+/** Represents a 32-bit signed integer. */
+public class TFInt32 implements TFType {
+  private TFInt32() {}
+  static {
+    Types.typeCodes.put(TFInt32.class, DataType.INT32);
+  }
+  static {
+    Types.scalars.put(TFInt32.class, 0);
+  }
+}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFInt64.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFInt64.java
new file mode 100644
index 0000000000000000000000000000000000000000..ccdded86939c0bbe1265145d76b8470a9099fb94
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/types/TFInt64.java
@@ -0,0 +1,30 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// GENERATED FILE. To update, edit tftypes.pl instead.
+
+package org.tensorflow.types;
+
+import org.tensorflow.DataType;
+
+/** Represents a 64-bit signed integer. */
+public class TFInt64 implements TFType {
+  private TFInt64() {}
+  static {
+    Types.typeCodes.put(TFInt64.class, DataType.INT64);
+  }
+  static {
+    Types.scalars.put(TFInt64.class, 0L);
+  }
+}
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/style.html b/tensorflow/java/src/main/java/org/tensorflow/types/TFString.java
similarity index 54%
rename from tensorflow/tensorboard/components/tf_tensorboard/style.html
rename to tensorflow/java/src/main/java/org/tensorflow/types/TFString.java
index 575e89e39828dda56067aa91e1145d45b7e87a18..e7327e8c57fd41e1c4441f7f79eb44614afdc325 100644
--- a/tensorflow/tensorboard/components/tf_tensorboard/style.html
+++ b/tensorflow/java/src/main/java/org/tensorflow/types/TFString.java
@@ -1,6 +1,4 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,16 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
--->
+==============================================================================*/
+// GENERATED FILE. To update, edit tftypes.pl instead.
 
-<link rel="import" href="../font-roboto/roboto.html">
+package org.tensorflow.types;
 
-<style>
-  html,
-  body {
-    margin: 0;
-    padding: 0;
-    height: 100%;
-    font-family: "RobotoDraft", "Roboto", sans-serif;
+import org.tensorflow.DataType;
+
+/** Represents an arbitrary sequence of bytes. */
+public class TFString implements TFType {
+  private TFString() {}
+  static {
+    Types.typeCodes.put(TFString.class, DataType.STRING);
   }
-</style>
+}
diff --git a/tensorflow/tensorboard/components/tf_storage/tf-storage.html b/tensorflow/java/src/main/java/org/tensorflow/types/TFType.java
similarity index 65%
rename from tensorflow/tensorboard/components/tf_storage/tf-storage.html
rename to tensorflow/java/src/main/java/org/tensorflow/types/TFType.java
index ff3f7b0ad4aa79be64061e991e99f808ae79a65f..562953ac9dc0abf8cac172338025bac9e1dae81c 100644
--- a/tensorflow/tensorboard/components/tf_storage/tf-storage.html
+++ b/tensorflow/java/src/main/java/org/tensorflow/types/TFType.java
@@ -1,6 +1,4 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,9 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
--->
+==============================================================================*/
+package org.tensorflow.types;
 
-<link rel="import" href="../tf-globals/tf-globals.html">
-<link rel="import" href="../tf-imports/lodash.html">
-
-<script src="storage.js"></script>
+/**
+ * A marker interface for classes representing TensorFlow types.
+ */
+public interface TFType {}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/TFUInt8.java b/tensorflow/java/src/main/java/org/tensorflow/types/TFUInt8.java
new file mode 100644
index 0000000000000000000000000000000000000000..d7305ca5a80311f520e446353d31d376c210d6a3
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/types/TFUInt8.java
@@ -0,0 +1,30 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// GENERATED FILE. To update, edit tftypes.pl instead.
+
+package org.tensorflow.types;
+
+import org.tensorflow.DataType;
+
+/** Represents an 8-bit unsigned integer. */
+public class TFUInt8 implements TFType {
+  private TFUInt8() {}
+  static {
+    Types.typeCodes.put(TFUInt8.class, DataType.UINT8);
+  }
+  static {
+    Types.scalars.put(TFUInt8.class, (byte)0);
+  }
+}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/Types.java b/tensorflow/java/src/main/java/org/tensorflow/types/Types.java
new file mode 100644
index 0000000000000000000000000000000000000000..976cd9fd347fb48020f186ece8d23dae89626b1d
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/types/Types.java
@@ -0,0 +1,52 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.types;
+
+import java.util.HashMap;
+import java.util.Map;
+import org.tensorflow.DataType;
+
+/**
+ * Utility class for managing the representation of TensorFlow types as Java
+ * types. For each TensorFlow type (e.g., int32), there is a corresponding Java
+ * type (e.g., TFInt32) that represents it at compile time and a corresponding
+ * class object (e.g., TFInt32.class) that represents it at run time. There is
+ * also an enumeration value in DataType that can be used to represent the
+ * type, though that should rarely be required.
+ */
+public class Types {
+
+  private Types() {} // not instantiable
+
+  static final Map<Class<?>, DataType> typeCodes = new HashMap<>();
+
+  /** Returns the DataType value corresponding to a TensorFlow type class. */
+  public static DataType dataType(Class<? extends TFType> c) {
+    DataType dtype = typeCodes.get(c);
+    if (dtype == null) {
+      throw new IllegalArgumentException("" + c + " is not a TensorFlow type.");
+    }
+    return dtype;
+  }
+
+  static final Map<Class<?>, Object> scalars = new HashMap<>();
+
+  /** Returns the zero value of type described by {@code c}, or null if
+   *  the type (e.g., string) is not numeric and therefore has no zero value.
+   */
+  public static Object zeroValue(Class<? extends TFType> c) {
+    return scalars.get(c);
+  }
+}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/package-info.java b/tensorflow/java/src/main/java/org/tensorflow/types/package-info.java
new file mode 100644
index 0000000000000000000000000000000000000000..f1410a760e157167ad601d9966a53f98dcb93e58
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/types/package-info.java
@@ -0,0 +1,27 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/**
+ * Defines classes that represent TensorFlow data types. For each possible data type
+ * that can be used in a tensor, there is a corresponding class in this package that
+ * is used to represent it. For example, the TensorFlow int32 type is represented by
+ * the type TFInt32 and by the class object TFInt32.class. The former is used to
+ * support compile-time checking of tensor data types and the latter is used for
+ * run-time checking of data types. All such classes implement the TFType interface.
+ * TensorFlow data types are also separately represented by the DataType enum, with
+ * one enum value per data type. The enum representation should rarely be needed, but
+ * the Types class can be used to obtain it from the class object representation.
+ */
+package org.tensorflow.types;
diff --git a/tensorflow/java/src/main/native/operation_builder_jni.cc b/tensorflow/java/src/main/native/operation_builder_jni.cc
index a7696182c7fd35a3d27ea4d62b6242c28d93f8d7..37f01a943a24c4164d3abd1cf2e5ed73b1d05162 100644
--- a/tensorflow/java/src/main/native/operation_builder_jni.cc
+++ b/tensorflow/java/src/main/native/operation_builder_jni.cc
@@ -257,3 +257,30 @@ JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrShape(
   TF_SetAttrShape(d, cname, cvalue.get(), static_cast<int>(num_dims));
   env->ReleaseStringUTFChars(name, cname);
 }
+
+JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrStringList(
+        JNIEnv* env, jclass object, jlong handle, jstring name, jobjectArray values) {
+  TF_OperationDescription* d = requireHandle(env, handle);
+  if (d == nullptr) return;
+  const char* cname = env->GetStringUTFChars(name, nullptr);
+  int num_values = env->GetArrayLength(values);
+  static_assert(sizeof(jbyte) == 1,
+                "Require Java byte to be represented as a single byte");
+  std::unique_ptr<jbyteArray[]> jarrays(new jbyteArray[num_values]);
+  std::unique_ptr<jbyte*[]> jvalues(new jbyte*[num_values]);
+  std::unique_ptr<void*[]> cvalues(new void*[num_values]);
+  std::unique_ptr<size_t[]> lengths(new size_t[num_values]);
+
+  for (int i = 0; i < num_values; ++i) {
+    jbyteArray v = static_cast<jbyteArray>(env->GetObjectArrayElement(values, i));
+    jarrays[i] = v;
+    jvalues[i] = env->GetByteArrayElements(v, nullptr);
+    cvalues[i] = jvalues[i];
+    lengths[i] = static_cast<size_t>(env->GetArrayLength(v));
+  }
+  TF_SetAttrStringList(d, cname, cvalues.get(), lengths.get(), num_values);
+  for (int i = 0; i < num_values; ++i) {
+    env->ReleaseByteArrayElements(jarrays[i], jvalues[i], JNI_ABORT);
+  }
+  env->ReleaseStringUTFChars(name, cname);
+}
diff --git a/tensorflow/java/src/main/native/operation_builder_jni.h b/tensorflow/java/src/main/native/operation_builder_jni.h
index 9b64c328203ad406953dea0e9cddcf6f468c043d..2e72bd68da5ad5915ba8268971a2f96961a45972 100644
--- a/tensorflow/java/src/main/native/operation_builder_jni.h
+++ b/tensorflow/java/src/main/native/operation_builder_jni.h
@@ -169,6 +169,14 @@ JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrTensorList(
 JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrShape(
     JNIEnv *, jclass, jlong, jstring, jlongArray, jint);
 
+/*
+ * Class:     org_tensorflow_OperationBuilder
+ * Method:    setAttrStringList
+ * Signature: (JLjava/lang/String;[L)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrStringList(
+    JNIEnv *, jclass, jlong, jstring, jobjectArray);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/java/src/main/native/operation_jni.cc b/tensorflow/java/src/main/native/operation_jni.cc
index b3d5fc4ec374fe6e5214799581878d94315a7ea7..ccc44d91c0093963124d7246c7b9e0a7cff447e0 100644
--- a/tensorflow/java/src/main/native/operation_jni.cc
+++ b/tensorflow/java/src/main/native/operation_jni.cc
@@ -156,3 +156,21 @@ JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_dtype(JNIEnv* env,
 
   return static_cast<jint>(TF_OperationOutputType(TF_Output{op, output_index}));
 }
+
+JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_inputListLength(JNIEnv* env,
+                                                                      jclass clazz,
+                                                                      jlong handle,
+                                                                      jstring name) {
+  TF_Operation* op = requireHandle(env, handle);
+  if (op == nullptr) return 0;
+
+  TF_Status* status = TF_NewStatus();
+
+  const char* cname = env->GetStringUTFChars(name, nullptr);
+  int result = TF_OperationInputListLength(op, cname, status);
+  env->ReleaseStringUTFChars(name, cname);
+
+  throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+  return result;
+}
diff --git a/tensorflow/java/src/main/native/operation_jni.h b/tensorflow/java/src/main/native/operation_jni.h
index b5d156f7c2749f7fbba3145f79e269f12e53a055..6f379256d21f590efef28dcbe54f55cc08c59b8f 100644
--- a/tensorflow/java/src/main/native/operation_jni.h
+++ b/tensorflow/java/src/main/native/operation_jni.h
@@ -73,6 +73,17 @@ JNIEXPORT jlongArray JNICALL Java_org_tensorflow_Operation_shape(JNIEnv *,
 JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_dtype(JNIEnv *, jclass,
                                                            jlong, jlong, jint);
 
+
+/*
+ * Class:     org_tensorflow_Operation
+ * Method:    inputListLength
+ * Signature: (JLjava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_inputListLength(JNIEnv *,
+                                                                      jclass,
+                                                                      jlong,
+                                                                      jstring);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java b/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java
index 74fdcf484e91efde27d16f2bf789310c96374e7f..aade375db860a7f7ef3b7031d5fbbef193b38f11 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java
@@ -16,8 +16,15 @@ limitations under the License.
 package org.tensorflow;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
@@ -45,6 +52,91 @@ public class OperationTest {
     }
   }
 
+  @Test
+  public void operationEquality() {
+    Operation op1;
+    try (Graph g = new Graph()) {
+      op1 = TestUtil.constant(g, "op1", 1).op();
+      Operation op2 = TestUtil.constant(g, "op2", 2).op();
+      Operation op3 = new Operation(g, op1.getUnsafeNativeHandle());
+      Operation op4 = g.operation("op1");
+      assertEquals(op1, op1);
+      assertNotEquals(op1, op2);
+      assertEquals(op1, op3);
+      assertEquals(op1.hashCode(), op3.hashCode());
+      assertEquals(op1, op4);
+      assertEquals(op1.hashCode(), op4.hashCode());
+      assertEquals(op3, op4);
+      assertNotEquals(op2, op3);
+      assertNotEquals(op2, op4);
+    }
+    try (Graph g = new Graph()) {
+      Operation newOp1 = TestUtil.constant(g, "op1", 1).op();
+      assertNotEquals(op1, newOp1);
+    }
+  }
+
+  @Test
+  public void operationCollection() {
+    try (Graph g = new Graph()) {
+      Operation op1 = TestUtil.constant(g, "op1", 1).op();
+      Operation op2 = TestUtil.constant(g, "op2", 2).op();
+      Operation op3 = new Operation(g, op1.getUnsafeNativeHandle());
+      Operation op4 = g.operation("op1");
+      Set<Operation> ops = new HashSet<>();
+      ops.addAll(Arrays.asList(op1, op2, op3, op4));
+      assertEquals(2, ops.size());
+      assertTrue(ops.contains(op1));
+      assertTrue(ops.contains(op2));
+      assertTrue(ops.contains(op3));
+      assertTrue(ops.contains(op4));
+    }
+  }
+
+  @Test
+  public void operationToString() {
+    try (Graph g = new Graph()) {
+      Operation op = TestUtil.constant(g, "c", new int[] {1}).op();
+      assertNotNull(op.toString());
+    }
+  }
+
+  @Test
+  public void outputEquality() {
+    try (Graph g = new Graph()) {
+      Output output = TestUtil.constant(g, "c", 1);
+      Output output1 = output.op().output(0);
+      Output output2 = g.operation("c").output(0);
+      assertEquals(output, output1);
+      assertEquals(output.hashCode(), output1.hashCode());
+      assertEquals(output, output2);
+      assertEquals(output.hashCode(), output2.hashCode());
+    }
+  }
+
+  @Test
+  public void outputCollection() {
+    try (Graph g = new Graph()) {
+      Output output = TestUtil.constant(g, "c", 1);
+      Output output1 = output.op().output(0);
+      Output output2 = g.operation("c").output(0);
+      Set<Output> ops = new HashSet<>();
+      ops.addAll(Arrays.asList(output, output1, output2));
+      assertEquals(1, ops.size());
+      assertTrue(ops.contains(output));
+      assertTrue(ops.contains(output1));
+      assertTrue(ops.contains(output2));
+    }
+  }
+
+  @Test
+  public void outputToString() {
+    try (Graph g = new Graph()) {
+      Output output = TestUtil.constant(g, "c", new int[] {1});
+      assertNotNull(output.toString());
+    }
+  }
+
   @Test
   public void outputListLength() {
     assertEquals(1, split(new int[] {0, 1}, 1));
@@ -52,6 +144,29 @@ public class OperationTest {
     assertEquals(3, split(new int[] {0, 1, 2}, 3));
   }
 
+  @Test
+  public void inputListLength() {
+    assertEquals(1, splitWithInputList(new int[] {0, 1}, 1, "split_dim"));
+    try {
+      splitWithInputList(new int[] {0, 1}, 2, "inputs");
+    } catch (IllegalArgumentException iae) {
+      // expected
+    }
+  }
+
+  @Test
+  public void outputList() {
+    try (Graph g = new Graph()) {
+      Operation split = TestUtil.split(g, "split", new int[] {0, 1, 2}, 3);
+      Output[] outputs = split.outputList(1, 2);
+      assertNotNull(outputs);
+      assertEquals(2, outputs.length);
+      for (int i = 0; i < outputs.length; ++i) {
+        assertEquals(i + 1, outputs[i].index());
+      }
+    }
+  }
+
   private static int split(int[] values, int num_split) {
     try (Graph g = new Graph()) {
       return g.opBuilder("Split", "Split")
@@ -62,4 +177,15 @@ public class OperationTest {
           .outputListLength("output");
     }
   }
+
+  private static int splitWithInputList(int[] values, int num_split, String name) {
+    try (Graph g = new Graph()) {
+      return g.opBuilder("Split", "Split")
+          .addInput(TestUtil.constant(g, "split_dim", 0))
+          .addInput(TestUtil.constant(g, "values", values))
+          .setAttr("num_split", num_split)
+          .build()
+          .inputListLength(name);
+    }
+  }
 }
diff --git a/tensorflow/java/src/test/java/org/tensorflow/TestUtil.java b/tensorflow/java/src/test/java/org/tensorflow/TestUtil.java
index 6a3a16c2e17951e2b5ad8823bab3e0183bb7ae55..e3415a696df3e3df82c6009b0866053cf0a2a0b1 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/TestUtil.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/TestUtil.java
@@ -48,6 +48,14 @@ public class TestUtil {
         .output(0);
   }
 
+  public static Operation split(Graph g, String name, int[] values, int numSplit) {
+    return g.opBuilder("Split", name)
+        .addInput(constant(g, "split_dim", 0))
+        .addInput(constant(g, "values", values))
+        .setAttr("num_split", numSplit)
+        .build();
+  }
+
   public static void transpose_A_times_X(Graph g, int[][] a) {
     matmul(g, "Y", constant(g, "A", a), placeholder(g, "X", DataType.INT32), true, false);
   }
diff --git a/tensorflow/java/src/test/java/org/tensorflow/op/OperandsTest.java b/tensorflow/java/src/test/java/org/tensorflow/op/OperandsTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..4fdd150acc5bc7260c2de4ca7c9030ab8d8b2a6a
--- /dev/null
+++ b/tensorflow/java/src/test/java/org/tensorflow/op/OperandsTest.java
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.op;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertSame;
+
+import java.util.Arrays;
+import java.util.List;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.tensorflow.Graph;
+import org.tensorflow.Operation;
+import org.tensorflow.Output;
+import org.tensorflow.TestUtil;
+
+/** Unit tests for {@link org.tensorflow.op.Operands}. */
+@RunWith(JUnit4.class)
+public class OperandsTest {
+
+  @Test
+  public void createOutputArrayFromOperandList() {
+    try (Graph g = new Graph()) {
+      Operation split = TestUtil.split(g, "split", new int[] {0, 1, 2}, 3);
+      List<Output> list = Arrays.asList(split.output(0), split.output(2));
+      Output[] array = Operands.asOutputs(list);
+      assertEquals(list.size(), array.length);
+      assertSame(array[0], list.get(0));
+      assertSame(array[1], list.get(1));
+    }
+  }
+}
diff --git a/tensorflow/java/src/test/java/org/tensorflow/op/PrimitiveOpTest.java b/tensorflow/java/src/test/java/org/tensorflow/op/PrimitiveOpTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..b24bf5a476187fed7c186f99b939960546130490
--- /dev/null
+++ b/tensorflow/java/src/test/java/org/tensorflow/op/PrimitiveOpTest.java
@@ -0,0 +1,60 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.op;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.util.HashSet;
+import java.util.Set;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.tensorflow.Graph;
+import org.tensorflow.Output;
+import org.tensorflow.TestUtil;
+
+/** Unit tests for {@link org.tensorflow.op.PrimitiveOp} */
+@RunWith(JUnit4.class)
+public class PrimitiveOpTest {
+
+  @Test
+  public void equalsHashcode() {
+    try (Graph g = new Graph()) {
+      Output array = TestUtil.constant(g, "array", new int[2]);
+
+      PrimitiveOp test1 =
+          new PrimitiveOp(g.opBuilder("Shape", "shape1").addInput(array).build()) {};
+      PrimitiveOp test2 =
+          new PrimitiveOp(g.opBuilder("Shape", "shape2").addInput(array).build()) {};
+      PrimitiveOp test3 = new PrimitiveOp(test1.operation) {};
+
+      // equals() tests
+      assertNotEquals(test1, test2);
+      assertEquals(test1, test3);
+      assertEquals(test3, test1);
+      assertNotEquals(test2, test3);
+
+      // hashcode() tests
+      Set<PrimitiveOp> ops = new HashSet<>();
+      assertTrue(ops.add(test1));
+      assertTrue(ops.add(test2));
+      assertFalse(ops.add(test3));
+    }
+  }
+}
diff --git a/tensorflow/java/src/test/java/org/tensorflow/op/core/ConstantTest.java b/tensorflow/java/src/test/java/org/tensorflow/op/core/ConstantTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..ec237924855f36b87507c74616f0678f5a862bc4
--- /dev/null
+++ b/tensorflow/java/src/test/java/org/tensorflow/op/core/ConstantTest.java
@@ -0,0 +1,131 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.op.core;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.DoubleBuffer;
+import java.nio.FloatBuffer;
+import java.nio.IntBuffer;
+import java.nio.LongBuffer;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.tensorflow.DataType;
+import org.tensorflow.Graph;
+import org.tensorflow.Session;
+import org.tensorflow.Tensor;
+import org.tensorflow.op.Scope;
+
+@RunWith(JUnit4.class)
+public class ConstantTest {
+  private static final float EPSILON = 1e-7f;
+
+  @Test
+  public void createIntBuffer() {
+    int[] ints = {1, 2, 3, 4};
+    long[] shape = {4};
+
+    try (Graph g = new Graph();
+        Session sess = new Session(g)) {
+      Scope scope = new Scope(g);
+      Constant op = Constant.create(scope, shape, IntBuffer.wrap(ints));
+      Tensor result = sess.runner().fetch(op.asOutput()).run().get(0);
+      int[] actual = new int[ints.length];
+      assertArrayEquals(ints, result.copyTo(actual));
+    }
+  }
+
+  @Test
+  public void createFloatBuffer() {
+    float[] floats = {1, 2, 3, 4};
+    long[] shape = {4};
+
+    try (Graph g = new Graph();
+        Session sess = new Session(g)) {
+      Scope scope = new Scope(g);
+      Constant op = Constant.create(scope, shape, FloatBuffer.wrap(floats));
+      Tensor result = sess.runner().fetch(op.asOutput()).run().get(0);
+      float[] actual = new float[floats.length];
+      assertArrayEquals(floats, result.copyTo(actual), EPSILON);
+    }
+  }
+
+  @Test
+  public void createDoubleBuffer() {
+    double[] doubles = {1, 2, 3, 4};
+    long[] shape = {4};
+
+    try (Graph g = new Graph();
+        Session sess = new Session(g)) {
+      Scope scope = new Scope(g);
+      Constant op = Constant.create(scope, shape, DoubleBuffer.wrap(doubles));
+      Tensor result = sess.runner().fetch(op.asOutput()).run().get(0);
+      double[] actual = new double[doubles.length];
+      assertArrayEquals(doubles, result.copyTo(actual), EPSILON);
+    }
+  }
+
+  @Test
+  public void createLongBuffer() {
+    long[] longs = {1, 2, 3, 4};
+    long[] shape = {4};
+
+    try (Graph g = new Graph();
+        Session sess = new Session(g)) {
+      Scope scope = new Scope(g);
+      Constant op = Constant.create(scope, shape, LongBuffer.wrap(longs));
+      Tensor result = sess.runner().fetch(op.asOutput()).run().get(0);
+      long[] actual = new long[longs.length];
+      assertArrayEquals(longs, result.copyTo(actual));
+    }
+  }
+
+  @Test
+  public void createStringBuffer() throws IOException {
+
+    byte[] data = {(byte) 1, (byte) 2, (byte) 3, (byte) 4};
+    long[] shape = {};
+
+    // byte arrays (DataType.STRING in Tensorflow) are encoded as an offset in the data buffer,
+    // followed by a varint encoded size, followed by the data.
+    ByteArrayOutputStream baout = new ByteArrayOutputStream();
+    DataOutputStream out = new DataOutputStream(baout);
+    // Offset in array.
+    out.writeLong(0L);
+    // Varint encoded length of buffer.
+    // For any number < 0x80, the varint encoding is simply the number itself.
+    // https://developers.google.com/protocol-buffers/docs/encoding#varints
+    assertTrue(data.length < 0x80);
+    out.write(data.length);
+    out.write(data);
+    out.close();
+    byte[] content = baout.toByteArray();
+
+    try (Graph g = new Graph();
+        Session sess = new Session(g)) {
+      Scope scope = new Scope(g);
+      Constant op = Constant.create(scope, DataType.STRING, shape, ByteBuffer.wrap(content));
+      Tensor result = sess.runner().fetch(op.asOutput()).run().get(0);
+      assertArrayEquals(data, result.bytesValue());
+    }
+  }
+}
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 88d5980835c63156bd4a6736595371893455feb8..adbe53a3e38043ba2c67f78dd404290a3f741809 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -41,12 +41,12 @@ py_library(
         "//tensorflow/contrib/learn/python/learn/datasets:__pkg__",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python/debug:__pkg__",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python/tools:__pkg__",  # TODO(b/34059704): remove when fixed
-        "//tensorflow/tensorboard/scripts:__pkg__",  # TODO(b/34059704): remove when fixed
         "//tensorflow/tools/quantization:__pkg__",  # TODO(b/34059704): remove when fixed
     ],
     deps = [
         ":tf_optimizer",
         ":array_ops",
+        ":bitwise_ops",
         ":check_ops",
         ":client",
         ":client_testlib",
@@ -87,6 +87,7 @@ py_library(
         "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/ops/losses",
         "//tensorflow/python/ops/distributions",
+        "//tensorflow/python/profiler",
         "//tensorflow/python/saved_model",
     ] + if_not_windows([
         "//tensorflow/contrib:contrib_py",
@@ -175,6 +176,19 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "model_analyzer_lib",
+    srcs = ["grappler/model_analyzer.cc"],
+    hdrs = ["grappler/model_analyzer.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/costs:utils",
+    ],
+)
+
 cc_library(
     name = "numpy_lib",
     srcs = ["lib/core/numpy.cc"],
@@ -219,6 +233,7 @@ cc_library(
         ":numpy_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:script_ops_op_lib",
         "//third_party/py/numpy:headers",
         "//util/python:python_headers",
@@ -252,7 +267,7 @@ cc_library(
 cc_binary(
     name = "framework/test_file_system.so",
     srcs = ["framework/test_file_system.cc"],
-    copts = ["-Wno-sign-compare"],
+    copts = if_not_windows(["-Wno-sign-compare"]),
     linkopts = select({
         "//conditions:default": [
             "-lm",
@@ -262,7 +277,7 @@ cc_binary(
     linkshared = 1,
     deps = [
         "//tensorflow/core:framework_headers_lib",
-        "@protobuf//:protobuf_headers",
+        "@protobuf_archive//:protobuf_headers",
     ],
 )
 
@@ -416,6 +431,15 @@ py_library(
     ],
 )
 
+py_library(
+    name = "c_api_util",
+    srcs = ["framework/c_api_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pywrap_tensorflow",
+    ],
+)
+
 py_library(
     name = "common_shapes",
     srcs = ["framework/common_shapes.py"],
@@ -465,7 +489,10 @@ py_library(
         "framework/errors_impl.py",
     ],
     srcs_version = "PY2AND3",
-    deps = [":util"],
+    deps = [
+        ":c_api_util",
+        ":util",
+    ],
 )
 
 py_library(
@@ -528,6 +555,7 @@ py_library(
     srcs = ["framework/ops.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":c_api_util",
         ":device",
         ":dtypes",
         ":op_def_registry",
@@ -857,7 +885,6 @@ cc_library(
 tf_gen_op_wrapper_py(
     name = "test_ops_2",
     out = "framework/test_ops_2.py",
-    require_shape_functions = True,
     deps = [":test_ops_2_kernels"],
 )
 
@@ -1047,6 +1074,16 @@ py_test(
 
 tf_gen_op_wrapper_private_py(
     name = "array_ops_gen",
+    visibility = [
+        "//learning/brain/python/ops:__pkg__",
+        "//tensorflow/compiler/tests:__pkg__",
+        "//tensorflow/contrib/quantization:__pkg__",
+        "//tensorflow/python/kernel_tests:__pkg__",
+    ],
+)
+
+tf_gen_op_wrapper_private_py(
+    name = "bitwise_ops_gen",
     require_shape_functions = True,
     visibility = [
         "//learning/brain/python/ops:__pkg__",
@@ -1058,13 +1095,11 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "candidate_sampling_ops_gen",
-    require_shape_functions = True,
     visibility = ["//learning/brain/python/ops:__pkg__"],
 )
 
 tf_gen_op_wrapper_private_py(
     name = "control_flow_ops_gen",
-    require_shape_functions = True,
     visibility = ["//learning/brain/python/ops:__pkg__"],
     deps = [
         "//tensorflow/core:control_flow_ops_op_lib",
@@ -1074,12 +1109,10 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "ctc_ops_gen",
-    require_shape_functions = True,
 )
 
 tf_gen_op_wrapper_private_py(
     name = "data_flow_ops_gen",
-    require_shape_functions = True,
     visibility = [
         "//learning/brain/python/ops:__pkg__",
         "//tensorflow/python/kernel_tests:__pkg__",
@@ -1088,7 +1121,6 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "dataset_ops_gen",
-    require_shape_functions = True,
     visibility = [
         "//learning/brain/python/ops:__pkg__",
         "//tensorflow:__subpackages__",
@@ -1098,13 +1130,11 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "image_ops_gen",
-    require_shape_functions = True,
     visibility = ["//learning/brain/python/ops:__pkg__"],
 )
 
 tf_gen_op_wrapper_private_py(
     name = "io_ops_gen",
-    require_shape_functions = True,
     visibility = [
         "//learning/brain/python/ops:__pkg__",
         "//tensorflow/python/kernel_tests:__pkg__",
@@ -1113,13 +1143,11 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "linalg_ops_gen",
-    require_shape_functions = True,
     visibility = ["//learning/brain/python/ops:__pkg__"],
 )
 
 tf_gen_op_wrapper_private_py(
     name = "logging_ops_gen",
-    require_shape_functions = True,
     visibility = [
         "//learning/brain/python/ops:__pkg__",
         "//tensorflow/python/kernel_tests:__pkg__",
@@ -1128,7 +1156,6 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "lookup_ops_gen",
-    require_shape_functions = True,
     visibility = [
         "//learning/brain/python/ops:__pkg__",
         "//tensorflow/contrib/lookup:__pkg__",
@@ -1138,7 +1165,6 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "math_ops_gen",
-    require_shape_functions = True,
     visibility = [
         "//learning/brain/google/python/ops:__pkg__",
         "//learning/brain/python/ops:__pkg__",
@@ -1150,7 +1176,6 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "nn_ops_gen",
-    require_shape_functions = True,
     visibility = [
         "//learning/brain/python/ops:__pkg__",
         "//tensorflow/compiler/tests:__pkg__",
@@ -1162,7 +1187,6 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "parsing_ops_gen",
-    require_shape_functions = True,
     visibility = [
         "//learning/brain/python/ops:__pkg__",
     ],
@@ -1170,34 +1194,28 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "random_ops_gen",
-    require_shape_functions = True,
     visibility = ["//learning/brain/python/ops:__pkg__"],
 )
 
 tf_gen_op_wrapper_private_py(
     name = "resource_variable_ops_gen",
-    require_shape_functions = True,
 )
 
 tf_gen_op_wrapper_private_py(
     name = "script_ops_gen",
-    require_shape_functions = True,
 )
 
 tf_gen_op_wrapper_private_py(
     name = "sdca_ops_gen",
-    require_shape_functions = True,
     visibility = ["//tensorflow/contrib/linear_optimizer:__pkg__"],
 )
 
 tf_gen_op_wrapper_private_py(
     name = "set_ops_gen",
-    require_shape_functions = True,
 )
 
 tf_gen_op_wrapper_private_py(
     name = "state_ops_gen",
-    require_shape_functions = True,
     visibility = [
         "//learning/brain/python/ops:__pkg__",
         "//tensorflow/contrib/framework:__pkg__",
@@ -1207,27 +1225,24 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "sparse_ops_gen",
-    require_shape_functions = True,
 )
 
 tf_gen_op_wrapper_private_py(
     name = "spectral_ops_gen",
-    require_shape_functions = True,
 )
 
 tf_gen_op_wrapper_private_py(
     name = "string_ops_gen",
-    require_shape_functions = True,
 )
 
 tf_gen_op_wrapper_private_py(
     name = "user_ops_gen",
+    require_shape_functions = False,
 )
 
 tf_gen_op_wrapper_private_py(
     name = "training_ops_gen",
     out = "training/gen_training_ops.py",
-    require_shape_functions = True,
 )
 
 py_library(
@@ -1264,6 +1279,17 @@ py_library(
     ],
 )
 
+py_library(
+    name = "bitwise_ops",
+    srcs = ["ops/bitwise_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":bitwise_ops_gen",
+        ":framework",
+        ":util",
+    ],
+)
+
 py_library(
     name = "sets",
     srcs = [
@@ -1424,6 +1450,7 @@ py_library(
     deps = [
         ":array_grad",
         ":array_ops",
+        ":bitwise_ops",
         ":control_flow_grad",
         ":control_flow_ops",
         ":framework",
@@ -1876,6 +1903,16 @@ py_library(
     ],
 )
 
+py_library(
+    name = "spectral_ops_test_util",
+    srcs = ["ops/spectral_ops_test_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":framework_ops",
+    ],
+)
+
 py_library(
     name = "confusion_matrix",
     srcs = ["ops/confusion_matrix.py"],
@@ -1989,6 +2026,7 @@ py_library(
         ":util",
         ":variable_scope",
         ":variables",
+        "//tensorflow/python/ops/distributions",
     ],
 )
 
@@ -2028,6 +2066,7 @@ py_library(
     srcs = ["ops/summary_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":framework",
         ":framework_for_generated_wrappers",
         ":logging_ops_gen",
         ":summary_op_util",
@@ -2128,6 +2167,19 @@ py_library(
     ],
 )
 
+cuda_py_test(
+    name = "bitwise_ops_test",
+    size = "small",
+    srcs = ["ops/bitwise_ops_test.py"],
+    additional_deps = [
+        ":bitwise_ops",
+        ":constant_op",
+        ":dtypes",
+        ":framework_test_lib",
+    ],
+    tags = ["no_windows"],
+)
+
 cuda_py_test(
     name = "control_flow_ops_test",
     size = "small",
@@ -2328,6 +2380,7 @@ cuda_py_test(
         ":variables",
         "//third_party/py/numpy",
     ],
+    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -2437,7 +2490,6 @@ py_library(
     srcs = [
         "client/client_lib.py",
         "client/device_lib.py",
-        "client/session.py",
         "client/timeline.py",
     ],
     srcs_version = "PY2AND3",
@@ -2446,6 +2498,7 @@ py_library(
         ":framework",
         ":framework_for_generated_wrappers",
         ":platform",
+        ":session",
         ":session_ops",
         ":util",
         "//third_party/py/numpy",
@@ -2465,7 +2518,8 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//third_party/py/numpy",
-        "@protobuf//:protobuf_python",
+        "@org_python_pypi_backports_weakref",
+        "@protobuf_archive//:protobuf_python",
         "@six_archive//:six",
     ],
 )
@@ -2638,7 +2692,7 @@ cc_library(
     name = "cpp_shape_inference",
     srcs = ["framework/cpp_shape_inference.cc"],
     hdrs = ["framework/cpp_shape_inference.h"],
-    copts = ["-Wno-sign-compare"],
+    copts = if_not_windows(["-Wno-sign-compare"]),
     visibility = ["//visibility:public"],
     deps = [
         ":cpp_shape_inference_proto_cc",
@@ -2706,6 +2760,7 @@ tf_py_wrap_cc(
         "framework/cpp_shape_inference.i",
         "framework/python_op_gen.i",
         "grappler/cost_analyzer.i",
+        "grappler/model_analyzer.i",
         "grappler/tf_optimizer.i",
         "lib/core/py_func.i",
         "lib/core/strings.i",
@@ -2719,10 +2774,12 @@ tf_py_wrap_cc(
         "util/port.i",
         "util/py_checkpoint_reader.i",
         "util/stat_summarizer.i",
+        "util/tfprof.i",
         "util/transform_graph.i",
     ],
     deps = [
         ":cost_analyzer_lib",
+        ":model_analyzer_lib",
         ":cpp_shape_inference",
         ":kernel_registry",
         ":numpy_lib",
@@ -2733,6 +2790,7 @@ tf_py_wrap_cc(
         ":tf_session_helper",
         "//tensorflow/c:c_api",
         "//tensorflow/c:checkpoint_reader",
+        "//tensorflow/c:python_api",
         "//tensorflow/c:tf_status_helper",
         "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
         "//tensorflow/core/distributed_runtime/rpc:grpc_session",
@@ -2744,8 +2802,8 @@ tf_py_wrap_cc(
         "//tensorflow/core:reader_base",
         "//tensorflow/core/debug",
         "//tensorflow/core/distributed_runtime:server_lib",
+        "//tensorflow/core/profiler/internal:print_model_analysis",
         "//tensorflow/tools/graph_transforms:transform_graph_lib",
-        "//tensorflow/tools/tfprof/internal:print_model_analysis",
         "//util/python:python_headers",
     ] + (tf_additional_lib_deps() +
          tf_additional_plugin_deps() +
@@ -2774,6 +2832,7 @@ py_library(
     srcs = ["client/session.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":c_api_util",
         ":errors",
         ":framework",
         ":framework_for_generated_wrappers",
@@ -2928,6 +2987,7 @@ cuda_py_test(
         ":variables",
         "//third_party/py/numpy",
     ],
+    tags = ["oss_serial"],
 )
 
 tf_py_test(
@@ -2942,6 +3002,10 @@ tf_py_test(
         ":training",
         ":variables",
     ],
+    tags = [
+        "notsan",  # data race due to b/62910646
+        "oss_serial",
+    ],
 )
 
 py_library(
@@ -3003,11 +3067,13 @@ py_test(
     srcs = ["client/session_clusterspec_prop_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "no_gpu",
         "no_pip_gpu",
     ],
     deps = [
         ":array_ops",
         ":client",
+        ":client_testlib",
         ":framework",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
@@ -3027,7 +3093,9 @@ py_test(
     srcs = ["client/session_list_devices_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "no_gpu",
         "no_pip_gpu",
+        "notsan",  # data race due to b/62910646
     ],
     deps = [
         ":client",
@@ -3148,7 +3216,6 @@ cuda_py_tests(
         "training/adagrad_da_test.py",
         "training/adagrad_test.py",
         "training/basic_loops_test.py",
-        "training/checkpoint_utils_test.py",
         "training/coordinator_test.py",
         "training/device_setter_test.py",
         "training/ftrl_test.py",
@@ -3352,6 +3419,27 @@ py_test(
     ],
 )
 
+py_test(
+    name = "checkpoint_utils_test",
+    size = "small",
+    srcs = ["training/checkpoint_utils_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_windows"],
+    deps = [
+        ":client",
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        ":io_ops",
+        ":partitioned_variables",
+        ":platform",
+        ":pywrap_tensorflow",
+        ":state_ops",
+        ":training",
+        ":variable_scope",
+        ":variables",
+    ],
+)
+
 py_test(
     name = "monitored_session_test",
     size = "small",
@@ -3749,6 +3837,49 @@ cuda_py_test(
     main = "ops/transpose_benchmark.py",
 )
 
+cuda_py_test(
+    name = "matmul_benchmark",
+    size = "medium",
+    srcs = ["ops/matmul_benchmark.py"],
+    additional_deps = [
+        ":math_ops",
+        ":random_ops",
+        ":client",
+        ":client_testlib",
+        ":control_flow_ops",
+        ":framework_for_generated_wrappers",
+        ":framework_test_lib",
+        ":platform",
+        ":platform_benchmark",
+        ":variables",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    main = "ops/matmul_benchmark.py",
+)
+
+cuda_py_test(
+    name = "matmul_benchmark_test",
+    size = "medium",
+    srcs = ["ops/matmul_benchmark_test.py"],
+    additional_deps = [
+        ":math_ops",
+        ":random_ops",
+        ":client",
+        ":client_testlib",
+        ":control_flow_ops",
+        ":framework_for_generated_wrappers",
+        ":platform",
+        ":platform_benchmark",
+        ":matmul_benchmark",
+        ":variables",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    main = "ops/matmul_benchmark_test.py",
+    tags = ["no_pip"],
+)
+
 cuda_py_test(
     name = "session_benchmark",
     srcs = ["client/session_benchmark.py"],
@@ -3801,7 +3932,13 @@ py_test(
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
+        ":nn",
+        ":random_seed",
+        ":session",
         ":tf_optimizer",
+        ":training",
+        ":variable_scope",
+        ":variables",
         "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
     ],
@@ -3833,6 +3970,20 @@ py_library(
     deps = [":pywrap_tensorflow_internal"],
 )
 
+py_binary(
+    name = "cost_analyzer_tool",
+    srcs = [
+        "grappler/cost_analyzer_tool.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cost_analyzer",
+        ":framework_for_generated_wrappers",
+        ":tf_optimizer",
+        "//tensorflow/core:protos_all_py",
+    ],
+)
+
 py_test(
     name = "cost_analyzer_test",
     size = "small",
@@ -3855,3 +4006,30 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
+
+py_library(
+    name = "model_analyzer",
+    srcs = [
+        "grappler/model_analyzer.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [":pywrap_tensorflow_internal"],
+)
+
+py_test(
+    name = "model_analyzer_test",
+    size = "small",
+    srcs = ["grappler/model_analyzer_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":array_ops",
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        ":math_ops",
+        ":model_analyzer",
+        ":state_ops",
+        "//tensorflow/core:protos_all_py",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 9f4e92204b40f3544d51788fb64ace8f3fe9e79c..2fd8fc86880034bc6266d6e55a0e199204fa267d 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -57,7 +57,6 @@ from tensorflow.core.protobuf.meta_graph_pb2 import TensorInfo
 from tensorflow.core.protobuf.meta_graph_pb2 import MetaGraphDef
 from tensorflow.core.protobuf.config_pb2 import *
 from tensorflow.core.protobuf.tensorflow_server_pb2 import *
-from tensorflow.core.protobuf.rewriter_config_pb2 import *
 from tensorflow.core.util.event_pb2 import *
 
 # Framework
@@ -78,12 +77,15 @@ from tensorflow.python.ops.standard_ops import *
 from tensorflow.python.estimator import estimator_lib as estimator
 from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.layers import layers
+from tensorflow.python.ops import bitwise_ops as bitwise
 from tensorflow.python.ops import image_ops as image
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import sets
 from tensorflow.python.ops import spectral_ops as spectral
+from tensorflow.python.ops.distributions import distributions
 from tensorflow.python.ops.losses import losses
+from tensorflow.python.profiler import profiler
 from tensorflow.python.user_ops import user_ops
 from tensorflow.python.util import compat
 from tensorflow.python.saved_model import saved_model
@@ -132,7 +134,6 @@ from tensorflow.python.ops import tensor_array_ops
 # documentation, or remove.
 _allowed_symbols = [
     'AttrValue',
-    'AutoParallelOptions',
     'ConfigProto',
     'ClusterDef',
     'DeviceSpec',
@@ -149,11 +150,11 @@ _allowed_symbols = [
     'NameAttrList',
     'NodeDef',
     'OptimizerOptions',
-    'RewriterConfig',
     'RunOptions',
     'RunMetadata',
     'SessionLog',
     'Summary',
+    'SummaryMetadata',
     'TensorInfo',  # Used for tf.saved_model functionality.
 ]
 
@@ -207,12 +208,15 @@ _allowed_symbols.extend([
     'uint16',
     'uint8',
     'resource',
+    'variant',
 ])
 
 # Export modules and constants.
 _allowed_symbols.extend([
     'app',
+    'bitwise',
     'compat',
+    'distributions',
     'errors',
     'estimator',
     'feature_column',
@@ -236,6 +240,7 @@ _allowed_symbols.extend([
     'train',
     'user_ops',
     'layers',
+    'profiler',
 ])
 
 # Variables framework.versions:
@@ -249,10 +254,11 @@ _allowed_symbols.extend([
 # referenced in the whitelist.
 remove_undocumented(__name__, _allowed_symbols, [
     framework_lib, array_ops, check_ops, client_lib, compat, constant_op,
-    control_flow_ops, confusion_matrix_m, functional_ops, histogram_ops, io_ops,
+    control_flow_ops, confusion_matrix_m, distributions,
+    functional_ops, histogram_ops, io_ops,
     losses, math_ops, metrics, nn, resource_loader, sets, script_ops,
     session_ops, sparse_ops, state_ops, string_ops, summary, tensor_array_ops,
-    train, layers
+    train, layers, profiler
 ])
 
 # Special dunders that we choose to export:
diff --git a/tensorflow/python/build_defs.bzl b/tensorflow/python/build_defs.bzl
index 1efac5738cf05b0a5b886819c64f140e2ec0e2c1..2d8625933f9ea4ab3bedf8d3157430d821f3e584 100644
--- a/tensorflow/python/build_defs.bzl
+++ b/tensorflow/python/build_defs.bzl
@@ -13,13 +13,13 @@ load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 # hard code the ops/ directory.
 
 def tf_gen_op_wrapper_private_py(name, out=None, deps=[],
-                                 require_shape_functions=False,
+                                 require_shape_functions=True,
                                  visibility=[]):
   if not name.endswith("_gen"):
     fail("name must end in _gen")
   if not visibility:
     visibility = ["//visibility:private"]
-  bare_op_name = name[:-4] # Strip of the _gen
+  bare_op_name = name[:-4] # Strip off the _gen
   tf_gen_op_wrapper_py(name=bare_op_name,
     out=out,
     hidden_file="ops/hidden_ops.txt",
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index ad2ee13db5829eceb9880805c4882c926fd917ba..7d698c2972fa6206985c29548eb8e8a1cf0434e4 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import re
 import threading
 
@@ -26,6 +27,7 @@ import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow as tf_session
+from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -61,6 +63,7 @@ class SessionInterface(object):
     """Continues the execution with additional feeds and fetches."""
     raise NotImplementedError('partial_run')
 
+
 def _get_indexed_slices_value_from_fetches(fetched_vals):
   return ops.IndexedSlicesValue(fetched_vals[0], fetched_vals[1],
                                 fetched_vals[2]
@@ -689,24 +692,17 @@ class BaseSession(SessionInterface):
     except Exception:  # pylint: disable=broad-except
       pass
     if self._session is not None:
-      # We create `status` outside the `try` block because at shutdown
-      # `tf_session` may have been garbage collected, and the creation of a
-      # status object may fail. In that case, we prefer to ignore the failure
-      # and silently leak the session object, since the program is about to
-      # terminate.
-      status = None
       try:
-        status = tf_session.TF_NewStatus()
+        status = c_api_util.ScopedTFStatus()
         if self._created_with_new_api:
           tf_session.TF_DeleteSession(self._session, status)
         else:
           tf_session.TF_DeleteDeprecatedSession(self._session, status)
       except AttributeError:
-        # 'NoneType' object has no attribute 'TF_NewStatus'
+        # At shutdown, `c_api_util` or `tf_session` may have been garbage
+        # collected, causing the above method calls to fail. In this case,
+        # silently leak since the program is about to terminate anyway.
         pass
-      finally:
-        if status is not None:
-          tf_session.TF_DeleteStatus(status)
       self._session = None
 
   @property
@@ -884,12 +880,9 @@ class BaseSession(SessionInterface):
       ValueError: If `fetches` or `feed_dict` keys are invalid or refer to a
         `Tensor` that doesn't exist.
     """
-    run_metadata_ptr = tf_session.TF_NewBuffer()
-    if options:
-      options_ptr = tf_session.TF_NewBufferFromString(
-          compat.as_bytes(options.SerializeToString()))
-    else:
-      options_ptr = None
+    options_ptr = tf_session.TF_NewBufferFromString(
+        compat.as_bytes(options.SerializeToString())) if options else None
+    run_metadata_ptr = tf_session.TF_NewBuffer() if run_metadata else None
 
     try:
       result = self._run(None, fetches, feed_dict, options_ptr,
@@ -898,7 +891,8 @@ class BaseSession(SessionInterface):
         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
         run_metadata.ParseFromString(compat.as_bytes(proto_data))
     finally:
-      tf_session.TF_DeleteBuffer(run_metadata_ptr)
+      if run_metadata_ptr:
+        tf_session.TF_DeleteBuffer(run_metadata_ptr)
       if options:
         tf_session.TF_DeleteBuffer(options_ptr)
     return result
@@ -971,7 +965,6 @@ class BaseSession(SessionInterface):
       TypeError: If `fetches` or `feed_dict` keys are of an inappropriate type.
       tf.errors.OpError: Or one of its subclasses if a TensorFlow error happens.
     """
-    assert not self._created_with_new_api, 'Partial runs don\'t work with C API'
 
     def _feed_fn(feed):
       for tensor_type, _, _, feed_fn in _REGISTERED_EXPANSIONS:
@@ -999,7 +992,12 @@ class BaseSession(SessionInterface):
         try:
           subfeed_t = self.graph.as_graph_element(subfeed, allow_tensor=True,
                                                   allow_operation=False)
-          feed_list.append(compat.as_bytes(subfeed_t.name))
+          if self._created_with_new_api:
+            # pylint: disable=protected-access
+            feed_list.append(subfeed_t._as_tf_output())
+            # pylint: enable=protected-access
+          else:
+            feed_list.append(compat.as_bytes(subfeed_t.name))
         except Exception as e:
           e.message = ('Cannot interpret feed_list key as Tensor: '
                        + e.message)
@@ -1014,12 +1012,24 @@ class BaseSession(SessionInterface):
     def _setup_fn(session, feed_list, fetch_list, target_list):
       self._extend_graph()
       with errors.raise_exception_on_not_ok_status() as status:
-        return tf_session.TF_PRunSetup(session, feed_list, fetch_list,
-                                       target_list, status)
+        if self._created_with_new_api:
+          return tf_session.TF_SessionPRunSetup_wrapper(
+              session, feed_list, fetch_list, target_list, status)
+        else:
+          return tf_session.TF_PRunSetup(session, feed_list, fetch_list,
+                                         target_list, status)
 
-    return self._do_call(_setup_fn, self._session, feed_list,
-                         _name_list(fetch_handler.fetches()),
-                         _name_list(fetch_handler.targets()))
+    if self._created_with_new_api:
+      # pylint: disable=protected-access
+      final_fetches = [t._as_tf_output() for t in fetch_handler.fetches()]
+      final_targets = [op._c_op for op in fetch_handler.targets()]
+      # pylint: enable=protected-access
+    else:
+      final_fetches = _name_list(fetch_handler.fetches())
+      final_targets = _name_list(fetch_handler.targets())
+
+    return self._do_call(_setup_fn, self._session, feed_list, final_fetches,
+                         final_targets)
 
   def _run(self, handle, fetches, feed_dict, options, run_metadata):
     """Perform either run or partial_run, depending the presence of `handle`."""
@@ -1110,7 +1120,10 @@ class BaseSession(SessionInterface):
       results = []
     return fetch_handler.build_results(self, results)
 
-  def make_callable(self, fetches, feed_list=None):
+  def make_callable(self,
+                    fetches,
+                    feed_list=None,
+                    accept_options=False):
     """Returns a Python callable that runs a particular step.
 
     The returned callable will take `len(feed_list)` arguments whose types
@@ -1130,6 +1143,12 @@ class BaseSession(SessionInterface):
         for details of the allowable fetch types.
       feed_list: (Optional.) A list of `feed_dict` keys. See
         @{tf.Session.run} for details of the allowable feed key types.
+      accept_options: (Optional.) Iff `True`, the returned `Callable` will be
+        able to accept @{tf.RunOptions} and @{tf.RunMetadata} as optional
+        keyword arguments `options` and `run_metadata`, respectively, with
+        the same syntax and semantics as @{tf.Session.run}, which is useful
+        for certain use cases (profiling and debugging) but will result in
+        measurable slowdown of the `Callable`'s performance. Default: `False`.
 
     Returns:
       A function that when called will execute the step defined by
@@ -1149,10 +1168,10 @@ class BaseSession(SessionInterface):
       # TODO(mrry): Refactor the feed handling logic from
       # `Session._run()` so that we can convert the feeds to a list of
       # strings here.
-      def _generic_run(*feed_args):
+      def _generic_run(*feed_args, **kwargs):
         feed_dict = {feed: feed_val
                      for feed, feed_val in zip(feed_list, feed_args)}
-        return self.run(fetches, feed_dict=feed_dict)
+        return self.run(fetches, feed_dict=feed_dict, **kwargs)
       return _generic_run
 
     # Ensure any changes to the graph are reflected in the runtime.
@@ -1166,7 +1185,40 @@ class BaseSession(SessionInterface):
     fetch_list_as_strings = _name_list(fetch_handler.fetches())
     target_list_as_strings = _name_list(fetch_handler.targets())
 
-    if isinstance(fetches, ops.Operation):
+    def _callable_template_with_options_and_metadata(
+        fetch_list_as_strings,
+        target_list_as_strings,
+        fetch_handler,
+        options=None,
+        run_metadata=None):
+      """Template callable that accepts RunOptions and RunMetadata."""
+      options_ptr = tf_session.TF_NewBufferFromString(
+          compat.as_bytes(options.SerializeToString())) if options else None
+      run_metadata_ptr = tf_session.TF_NewBuffer() if run_metadata else None
+      try:
+        with errors.raise_exception_on_not_ok_status() as status:
+          results = tf_session.TF_Run(
+              self._session, options_ptr, {}, fetch_list_as_strings,
+              target_list_as_strings, status, run_metadata_ptr)
+          if fetch_handler:
+            results = fetch_handler.build_results(self, results)
+          else:
+            results = results[0] if results else None
+        if run_metadata:
+          proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
+          run_metadata.ParseFromString(compat.as_bytes(proto_data))
+      finally:
+        if run_metadata_ptr:
+          tf_session.TF_DeleteBuffer(run_metadata_ptr)
+        if options:
+          tf_session.TF_DeleteBuffer(options_ptr)
+      return results
+
+    if accept_options:
+      return functools.partial(
+          _callable_template_with_options_and_metadata, fetch_list_as_strings,
+          target_list_as_strings, fetch_handler)
+    elif isinstance(fetches, ops.Operation):
       # Special case for fetching a single operation, because the
       # function will have no return value.
       assert not fetch_list_as_strings
@@ -1248,13 +1300,15 @@ class BaseSession(SessionInterface):
                                    status, run_metadata)
 
     def _prun_fn(session, handle, feed_dict, fetch_list):
-      assert not self._created_with_new_api, ('Partial runs don\'t work with '
-                                              'C API')
       if target_list:
         raise RuntimeError('partial_run() requires empty target_list.')
       with errors.raise_exception_on_not_ok_status() as status:
-        return tf_session.TF_PRun(session, handle, feed_dict, fetch_list,
-                                  status)
+        if self._created_with_new_api:
+          return tf_session.TF_SessionPRun_wrapper(session, handle, feed_dict,
+                                                   fetch_list, status)
+        else:
+          return tf_session.TF_PRun(session, handle, feed_dict, fetch_list,
+                                    status)
 
     if handle is None:
       return self._do_call(_run_fn, self._session, feeds, fetches, targets,
diff --git a/tensorflow/python/client/session_clusterspec_prop_test.py b/tensorflow/python/client/session_clusterspec_prop_test.py
index f982c6ae3737bc6c5edb245c3d6c8ec156debd66..1ce2b7d7c3a3942c13e4dd15d314f8501753c4b5 100644
--- a/tensorflow/python/client/session_clusterspec_prop_test.py
+++ b/tensorflow/python/client/session_clusterspec_prop_test.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Tests for tensorflow.python.client.session.Session's ClusterSpec Propagation.
 
 These tests exercise the ClusterSpec Propagation capabilities of distributed
@@ -39,6 +38,7 @@ from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unuse
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
+from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
 ops._USE_C_API = True
@@ -50,7 +50,6 @@ ops.RegisterShape('ConstructionFails')(common_shapes.unknown_shape)
 
 class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testClusterSpecPropagationSimple(self):
     server1 = server_lib.Server.create_local_server()
     server2 = server_lib.Server.create_local_server()
@@ -66,7 +65,6 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
     output = sess.run(const)
     self.assertEqual(17, output)
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testClusterSpecPropagationWorker2Placement(self):
     server1 = server_lib.Server.create_local_server()
     server2 = server_lib.Server.create_local_server()
@@ -94,7 +92,6 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
                          dev_stats.device and 'Const' == node_stats.node_name
                      ]))
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testClusterSpecPropagationWorker1Placement(self):
     server1 = server_lib.Server.create_local_server()
     server2 = server_lib.Server.create_local_server()
@@ -111,7 +108,137 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
     output = sess.run(const)
     self.assertEqual(17, output)
 
+  def testCanonicalDeviceNames(self):
+    server1 = server_lib.Server.create_local_server()
+    server2 = server_lib.Server.create_local_server()
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = 'worker'
+    job.tasks[0] = server1.target[len('grpc://'):]
+    job.tasks[1] = server2.target[len('grpc://'):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    with ops.Graph().as_default() as g, ops.device(
+        '/job:worker/task:1/device:CPU:0'):
+      const = constant_op.constant(17)
+    sess = session.Session(server1.target, config=config, graph=g)
+    run_options = config_pb2.RunOptions(
+        trace_level=config_pb2.RunOptions.FULL_TRACE)
+    run_metadata = config_pb2.RunMetadata()
+    output = sess.run(const, options=run_options, run_metadata=run_metadata)
+    self.assertEqual(17, output)
+    self.assertEqual(1,
+                     len([
+                         node_stats
+                         for dev_stats in run_metadata.step_stats.dev_stats
+                         for node_stats in dev_stats.node_stats
+                         if '/job:worker/replica:0/task:1/device:CPU:0' ==
+                         dev_stats.device and 'Const' == node_stats.node_name
+                     ]))
+
+  def testFullDeviceNames(self):
+    server1 = server_lib.Server.create_local_server()
+    server2 = server_lib.Server.create_local_server()
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = 'renamed_worker'
+    job.tasks[0] = server1.target[len('grpc://'):]
+    job.tasks[1] = server2.target[len('grpc://'):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    with ops.Graph().as_default() as g, ops.device(
+        '/job:renamed_worker/replica:0/task:1/device:CPU:0'):
+      const = constant_op.constant(17)
+    sess = session.Session(server1.target, config=config, graph=g)
+    run_options = config_pb2.RunOptions(
+        trace_level=config_pb2.RunOptions.FULL_TRACE)
+    run_metadata = config_pb2.RunMetadata()
+    output = sess.run(const, options=run_options, run_metadata=run_metadata)
+    self.assertEqual(17, output)
+    self.assertEqual(1,
+                     len([
+                         node_stats
+                         for dev_stats in run_metadata.step_stats.dev_stats
+                         for node_stats in dev_stats.node_stats
+                         if '/job:renamed_worker/replica:0/task:1/device:CPU:0'
+                         == dev_stats.device and 'Const' == node_stats.node_name
+                     ]))
+
+  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
+  def testMultipleLocalDevices(self):
+    # Note: CPU->CPU transfers have a fast-path in
+    # BaseRemoteRendezvous::SameWorkerRecvDone that means the test doesn't
+    # actually capture the motivating bug unless run on a GPU machine.
+    #
+    # Example error message (before bugfix -- linebreaks added because  lint):
+    #
+    # W0718 17:14:41.521534  190121 device_mgr.cc:107] Unknown device:
+    #     /job:worker/replica:0/task:0/device:CPU:0 all devices:
+    #     /job:local/replica:0/task:0/gpu:0,
+    #     /job:local/replica:0/task:0/device:GPU:0,
+    #     /job:local/replica:0/task:0/cpu:1, CPU:0, GPU:0,
+    #     /job:local/replica:0/task:0/device:CPU:1,
+    #     /job:local/replica:0/task:0/device:CPU:0, CPU:1,
+    #     /job:local/replica:0/task:0/cpu:0
+    server_config = config_pb2.ConfigProto(device_count={'CPU': 2})
+    server1 = server_lib.Server.create_local_server(config=server_config)
+    server2 = server_lib.Server.create_local_server(config=server_config)
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = 'worker'
+    job.tasks[0] = server1.target[len('grpc://'):]
+    job.tasks[1] = server2.target[len('grpc://'):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    with ops.Graph().as_default() as g:
+      with ops.device('/job:worker/task:1/cpu:1'):
+        input1 = constant_op.constant(17, dtypes.float32)
+      with ops.device('/job:worker/task:0/cpu:1'):
+        input2 = constant_op.constant(3, dtypes.float32)
+      with ops.device('/job:worker/task:1/cpu:0'):
+        sum1 = input1 + input2
+
+      if test.is_gpu_available():
+        device_str = '/job:worker/task:0/gpu:0'
+      else:
+        device_str = '/job:worker/task:0/cpu:1'
+      with ops.device(device_str):
+        sum2 = input2 + input1
+
+      with ops.device('/job:worker/task:0/cpu:0'):
+        sum3 = sum1 + sum2
+    sess = session.Session(server1.target, config=config, graph=g)
+    output = sess.run(sum3)
+    self.assertEqual(40, output)
+
   @test_util.disable_c_api  # Operation._set_device doesn't work with C API
+  def testLegacyDeviceNames(self):
+    server1 = server_lib.Server.create_local_server()
+    server2 = server_lib.Server.create_local_server()
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = 'worker'
+    job.tasks[0] = server1.target[len('grpc://'):]
+    job.tasks[1] = server2.target[len('grpc://'):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    with ops.Graph().as_default() as g, ops.device('/job:worker/task:1/cpu:0'):
+      const = constant_op.constant(17)
+    sess = session.Session(server1.target, config=config, graph=g)
+    run_options = config_pb2.RunOptions(
+        trace_level=config_pb2.RunOptions.FULL_TRACE)
+    run_metadata = config_pb2.RunMetadata()
+    output = sess.run(const, options=run_options, run_metadata=run_metadata)
+    self.assertEqual(17, output)
+    self.assertEqual(1,
+                     len([
+                         node_stats
+                         for dev_stats in run_metadata.step_stats.dev_stats
+                         for node_stats in dev_stats.node_stats
+                         if '/job:worker/replica:0/task:1/device:CPU:0' ==
+                         dev_stats.device and 'Const' == node_stats.node_name
+                     ]))
+
   def testClusterSpecPropagationThreeServers2Graphs(self):
     """Boots 3 servers, creates 2 sessions, ensures appropriate operations.
 
@@ -173,7 +300,6 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(expected_ones, sess2.run(var2))
     self.assertAllEqual(expected_ones + expected_ones, sess1.run(var1))
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testClusterSpecPropagationThreeServers(self):
     """Boots 3 servers, creates 2 sessions, ensures appropriate operations.
 
@@ -228,7 +354,6 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(expected_ones, sess2.run(var))
     self.assertAllEqual(expected_ones + expected_ones, sess1.run(var))
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testClusterSpecPropagationThreeServersOneCluster(self):
     """Boots 3 servers, ensures appropriate communication across workers.
 
diff --git a/tensorflow/python/client/session_partial_run_test.py b/tensorflow/python/client/session_partial_run_test.py
index 9e0eca2089e766385847f086f8273a7e45c87dc7..33b90e6156f41935f53894032117178822249edb 100644
--- a/tensorflow/python/client/session_partial_run_test.py
+++ b/tensorflow/python/client/session_partial_run_test.py
@@ -33,25 +33,15 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import googletest
 from tensorflow.python.training import server_lib
 
-ops._USE_C_API = True
 
 # NOTE(mrry): Dummy shape registration for ops used in the tests, since they
 # don't have C++ op registrations on which to attach C++ shape fns.
 ops.RegisterShape('ConstructionFails')(common_shapes.unknown_shape)
 
 
-class PartialRunTest(test_util.TensorFlowTestCase):
+class PartialRunTestMethods(object):
 
-  def setUp(self):
-    # Partial runs don't work with C API
-    ops._USE_C_API = False
-    super(PartialRunTest, self).setUp()
-
-  def tearDown(self):
-    ops._USE_C_API = True
-    super(PartialRunTest, self).tearDown()
-
-  def runTestPartialRun(self, sess):
+  def RunTestPartialRun(self, sess):
     a = array_ops.placeholder(dtypes.float32, shape=[])
     b = array_ops.placeholder(dtypes.float32, shape=[])
     c = array_ops.placeholder(dtypes.float32, shape=[])
@@ -73,7 +63,7 @@ class PartialRunTest(test_util.TensorFlowTestCase):
     res = sess.partial_run(h2, r2, feed_dict={c: temp})
     self.assertEqual(162, res)
 
-  def runTestPartialRunIncomplete(self, sess):
+  def RunTestPartialRunIncomplete(self, sess):
     a = array_ops.placeholder(dtypes.float32, shape=[])
     b = array_ops.placeholder(dtypes.float32, shape=[])
     c = array_ops.placeholder(dtypes.float32, shape=[])
@@ -84,7 +74,7 @@ class PartialRunTest(test_util.TensorFlowTestCase):
     res = sess.partial_run(h, r1, feed_dict={a: 1, b: 2})
     self.assertEqual(3, res)
 
-  def runTestConcurrentPartialRun(self, sess):
+  def RunTestConcurrentPartialRun(self, sess):
     a = array_ops.placeholder(dtypes.float32, shape=[])
     b = array_ops.placeholder(dtypes.float32, shape=[])
     c = array_ops.placeholder(dtypes.float32, shape=[])
@@ -101,7 +91,7 @@ class PartialRunTest(test_util.TensorFlowTestCase):
     res = sess.partial_run(h2, r2, feed_dict={c: 7})
     self.assertEqual(462, res)
 
-  def runTestManyPartialRun(self, sess):
+  def RunTestManyPartialRun(self, sess):
     steps = 200
     inputs = []
     outputs = []
@@ -123,7 +113,7 @@ class PartialRunTest(test_util.TensorFlowTestCase):
     self.assertEqual(steps, len(res))
     self.assertEqual(2.0, res[-1])
 
-  def runTestRunAndPartialRun(self, sess):
+  def RunTestRunAndPartialRun(self, sess):
     a = constant_op.constant(2.0, dtypes.float32)
     b = a * 2
     c = b * 3
@@ -132,7 +122,7 @@ class PartialRunTest(test_util.TensorFlowTestCase):
     r2 = sess.partial_run(h, [b, c])
     self.assertEqual(r1, r2)
 
-  def runTestPartialRunMissingPlaceholderFeedException(self, sess):
+  def RunTestPartialRunMissingPlaceholderFeedException(self, sess):
     x = array_ops.placeholder(dtypes.float32, shape=())
     fetches = [x * 2, x * 3]
     handle = sess.partial_run_setup(fetches=fetches, feeds=[])
@@ -140,7 +130,7 @@ class PartialRunTest(test_util.TensorFlowTestCase):
                                  'You must feed a value for placeholder'):
       sess.partial_run(handle, fetches[0])
 
-  def runTestPartialRunUnspecifiedFeed(self, sess):
+  def RunTestPartialRunUnspecifiedFeed(self, sess):
     a = array_ops.placeholder(dtypes.float32, shape=[])
     b = array_ops.placeholder(dtypes.float32, shape=[])
     c = array_ops.placeholder(dtypes.float32, shape=[])
@@ -151,7 +141,7 @@ class PartialRunTest(test_util.TensorFlowTestCase):
                                  'was not specified in partial_run_setup.$'):
       sess.partial_run(h, r1, feed_dict={a: 1, b: 2, c: 3})
 
-  def runTestPartialRunUnspecifiedFetch(self, sess):
+  def RunTestPartialRunUnspecifiedFetch(self, sess):
     a = array_ops.placeholder(dtypes.float32, shape=[])
     b = array_ops.placeholder(dtypes.float32, shape=[])
     c = array_ops.placeholder(dtypes.float32, shape=[])
@@ -163,7 +153,7 @@ class PartialRunTest(test_util.TensorFlowTestCase):
                                  'was not specified in partial_run_setup.$'):
       sess.partial_run(h, r2, feed_dict={a: 1, c: 3})
 
-  def runTestPartialRunAlreadyFed(self, sess):
+  def RunTestPartialRunAlreadyFed(self, sess):
     a = array_ops.placeholder(dtypes.float32, shape=[])
     b = array_ops.placeholder(dtypes.float32, shape=[])
     c = array_ops.placeholder(dtypes.float32, shape=[])
@@ -176,7 +166,7 @@ class PartialRunTest(test_util.TensorFlowTestCase):
                                  'has already been fed.$'):
       sess.partial_run(h, r2, feed_dict={a: 1, c: 3})
 
-  def runTestPartialRunAlreadyFetched(self, sess):
+  def RunTestPartialRunAlreadyFetched(self, sess):
     a = array_ops.placeholder(dtypes.float32, shape=[])
     b = array_ops.placeholder(dtypes.float32, shape=[])
     c = array_ops.placeholder(dtypes.float32, shape=[])
@@ -189,7 +179,7 @@ class PartialRunTest(test_util.TensorFlowTestCase):
                                  'has already been fetched.$'):
       sess.partial_run(h, r1, feed_dict={c: 3})
 
-  def runTestPartialRunEmptyFetches(self, sess):
+  def RunTestPartialRunEmptyFetches(self, sess):
     a = array_ops.placeholder(dtypes.float32)
     b = a * 2.0
 
@@ -207,82 +197,109 @@ class PartialRunTest(test_util.TensorFlowTestCase):
       sess.partial_run_setup(fetches=[], feeds=[x])
 
   def testPartialRunDirect(self):
-    self.runTestPartialRun(session.Session())
+    self.RunTestPartialRun(session.Session())
 
   def testPartialRunIncompleteDirect(self):
-    self.runTestPartialRunIncomplete(session.Session())
+    self.RunTestPartialRunIncomplete(session.Session())
 
   def testConcurrentPartialRunDirect(self):
-    self.runTestConcurrentPartialRun(session.Session())
+    self.RunTestConcurrentPartialRun(session.Session())
 
   def testManyPartialRunDirect(self):
-    self.runTestManyPartialRun(session.Session())
+    self.RunTestManyPartialRun(session.Session())
 
   def testRunAndPartialRunDirect(self):
-    self.runTestRunAndPartialRun(session.Session())
+    self.RunTestRunAndPartialRun(session.Session())
 
   def testPartialRunMissingPlaceholderFeedExceptionDirect(self):
-    self.runTestPartialRunMissingPlaceholderFeedException(session.Session())
+    self.RunTestPartialRunMissingPlaceholderFeedException(session.Session())
 
   def testPartialRunUnspecifiedFeedDirect(self):
-    self.runTestPartialRunUnspecifiedFeed(session.Session())
+    self.RunTestPartialRunUnspecifiedFeed(session.Session())
 
   def testPartialRunUnspecifiedFetchDirect(self):
-    self.runTestPartialRunUnspecifiedFetch(session.Session())
+    self.RunTestPartialRunUnspecifiedFetch(session.Session())
 
   def testPartialRunAlreadyFedDirect(self):
-    self.runTestPartialRunAlreadyFed(session.Session())
+    self.RunTestPartialRunAlreadyFed(session.Session())
 
   def testPartialRunAlreadyFetchedDirect(self):
-    self.runTestPartialRunAlreadyFetched(session.Session())
+    self.RunTestPartialRunAlreadyFetched(session.Session())
 
   def testPartialRunEmptyFetchesDirect(self):
-    self.runTestPartialRunEmptyFetches(session.Session())
+    self.RunTestPartialRunEmptyFetches(session.Session())
 
   def testPartialRunDist(self):
     server = server_lib.Server.create_local_server()
-    self.runTestPartialRun(session.Session(server.target))
+    self.RunTestPartialRun(session.Session(server.target))
 
   def testPartialRunIncompleteDist(self):
     server = server_lib.Server.create_local_server()
-    self.runTestPartialRunIncomplete(session.Session(server.target))
+    self.RunTestPartialRunIncomplete(session.Session(server.target))
 
   def testConcurrentPartialRunDist(self):
     server = server_lib.Server.create_local_server()
-    self.runTestConcurrentPartialRun(session.Session(server.target))
+    self.RunTestConcurrentPartialRun(session.Session(server.target))
 
   def testManyPartialRunDist(self):
     server = server_lib.Server.create_local_server()
-    self.runTestManyPartialRun(session.Session(server.target))
+    self.RunTestManyPartialRun(session.Session(server.target))
 
   def testRunAndPartialRunDist(self):
     server = server_lib.Server.create_local_server()
-    self.runTestRunAndPartialRun(session.Session(server.target))
+    self.RunTestRunAndPartialRun(session.Session(server.target))
 
   def testPartialRunMissingPlaceholderFeedExceptionDist(self):
     server = server_lib.Server.create_local_server()
-    self.runTestPartialRunMissingPlaceholderFeedException(
+    self.RunTestPartialRunMissingPlaceholderFeedException(
         session.Session(server.target))
 
   def testPartialRunUnspecifiedFeedDist(self):
     server = server_lib.Server.create_local_server()
-    self.runTestPartialRunUnspecifiedFeed(session.Session(server.target))
+    self.RunTestPartialRunUnspecifiedFeed(session.Session(server.target))
 
   def testPartialRunUnspecifiedFetchDist(self):
     server = server_lib.Server.create_local_server()
-    self.runTestPartialRunUnspecifiedFetch(session.Session(server.target))
+    self.RunTestPartialRunUnspecifiedFetch(session.Session(server.target))
 
   def testPartialRunAlreadyFedDist(self):
     server = server_lib.Server.create_local_server()
-    self.runTestPartialRunAlreadyFed(session.Session(server.target))
+    self.RunTestPartialRunAlreadyFed(session.Session(server.target))
 
   def testPartialRunAlreadyFetchedDist(self):
     server = server_lib.Server.create_local_server()
-    self.runTestPartialRunAlreadyFetched(session.Session(server.target))
+    self.RunTestPartialRunAlreadyFetched(session.Session(server.target))
 
   def testPartialRunEmptyFetchesDist(self):
     server = server_lib.Server.create_local_server()
-    self.runTestPartialRunEmptyFetches(session.Session(server.target))
+    self.RunTestPartialRunEmptyFetches(session.Session(server.target))
+
+
+class PartialRunTest(PartialRunTestMethods, test_util.TensorFlowTestCase):
+  """Test case that invokes test methods with _USE_C_API=False."""
+
+  def setUp(self):
+    self.prev_use_c_api = ops._USE_C_API
+    ops._USE_C_API = False
+    super(PartialRunTest, self).setUp()
+
+  def tearDown(self):
+    ops._USE_C_API = self.prev_use_c_api
+    super(PartialRunTest, self).tearDown()
+
+
+class PartialRunWithCApiTest(PartialRunTestMethods,
+                             test_util.TensorFlowTestCase):
+  """Test case that invokes test methods with _USE_C_API=True."""
+
+  def setUp(self):
+    self.prev_use_c_api = ops._USE_C_API
+    ops._USE_C_API = True
+    super(PartialRunWithCApiTest, self).setUp()
+
+  def tearDown(self):
+    ops._USE_C_API = self.prev_use_c_api
+    super(PartialRunWithCApiTest, self).tearDown()
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index a1f98059cd88c1acc13f78623ad1312d20db9d70..0cec75cf996c63d07bb30fc45542730538ee3eeb 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -36,6 +36,7 @@ from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
@@ -63,7 +64,6 @@ ops.RegisterShape('ConstructionFails')(common_shapes.unknown_shape)
 
 class SessionTest(test_util.TensorFlowTestCase):
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testUseExistingGraph(self):
     with ops.Graph().as_default() as g, ops.device('/cpu:0'):
       a = constant_op.constant(6.0, shape=[1, 1])
@@ -73,7 +73,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       result = c.eval()
       self.assertAllEqual(result, [[42.0]])
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testUseDefaultGraph(self):
     with ops.Graph().as_default(), ops.device('/cpu:0'):
       a = constant_op.constant(6.0, shape=[1, 1])
@@ -128,6 +127,17 @@ class SessionTest(test_util.TensorFlowTestCase):
       results = s.run([inp])
       self.assertAllEqual([20.0], results)
 
+    pool = config.session_inter_op_thread_pool.add()
+    pool.num_threads = 1
+    pool.global_name = 't1'
+    run_options = config_pb2.RunOptions()
+    run_options.inter_op_thread_pool = (
+        len(config.session_inter_op_thread_pool) - 1)
+    with session.Session(config=config) as s:
+      inp = constant_op.constant(30.0, name='W2')
+      results = s.run([inp], options=run_options)
+      self.assertAllEqual([30.0], results)
+
   def testErrorsReported(self):
     with session.Session() as s:
       constant_op.constant(10.0, name='W1')
@@ -140,7 +150,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       with self.assertRaisesOpError(lambda e: e.op == a.op):
         a.eval()
 
-  @test_util.disable_c_api  # Partial runs don't work with C API
   def testErrorCodeWithNoNodeDef(self):
     with session.Session() as s:
       a = array_ops.placeholder(dtypes.float32, shape=[])
@@ -868,7 +877,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       v_val = v.eval()
       self.assertAllEqual([[6.0, 6.0, 6.0]], v_val)
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testExtendWithGroupBy(self):
     with session.Session() as s:
       a = constant_op.constant(1.0, shape=[1, 2])
@@ -1080,7 +1088,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(RuntimeError, 'The Session graph is empty.'):
         sess.run({})
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testNotEntered(self):
     # pylint: disable=protected-access
     self.assertEqual(ops._default_session_stack.get_default(), None)
@@ -1096,7 +1103,6 @@ class SessionTest(test_util.TensorFlowTestCase):
           ValueError, lambda e: 'No default session is registered.' in str(e)):
         c_2.eval()
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testInteractive(self):
     with ops.device('/cpu:0'):
       sess = session.InteractiveSession()
@@ -1109,7 +1115,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[24.0]], e.eval())
       sess.close()
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testInteractivePlacePrunedGraph(self):
     sess = session.InteractiveSession()
 
@@ -1131,7 +1136,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       a.eval()
     sess.close()
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testDefaultSessionPlacePrunedGraph(self):
     sess = session.Session()
 
@@ -1153,7 +1157,6 @@ class SessionTest(test_util.TensorFlowTestCase):
 
     sess.close()
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testSharedGraph(self):
     with ops.Graph().as_default() as g, ops.device('/cpu:0'):
       a = constant_op.constant(1.0, shape=[1, 2])
@@ -1220,6 +1223,50 @@ class SessionTest(test_util.TensorFlowTestCase):
           self.assertAllEqual(np_array, out_v)
           self.assertAllEqual(np_array, feed_v)
 
+  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
+  def testMakeCallableOnTensorWithRunOptions(self):
+    with session.Session() as sess:
+      a = constant_op.constant(42.0)
+      tensor_runner = sess.make_callable(a, accept_options=True)
+      run_options = config_pb2.RunOptions(
+          trace_level=config_pb2.RunOptions.FULL_TRACE)
+      run_metadata = config_pb2.RunMetadata()
+      self.assertEqual(0, len(run_metadata.step_stats.dev_stats))
+      res = tensor_runner(options=run_options, run_metadata=run_metadata)
+      self.assertEqual(42.0, res)
+      self.assertGreater(len(run_metadata.step_stats.dev_stats), 0)
+
+  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
+  def testMakeCallableOnOperationWithRunOptions(self):
+    with session.Session() as sess:
+      a = variables.Variable(42.0)
+      b = state_ops.assign_add(a, 1.0)
+      sess.run(a.initializer)
+      tensor_runner = sess.make_callable(b.op, accept_options=True)
+      run_options = config_pb2.RunOptions(
+          trace_level=config_pb2.RunOptions.FULL_TRACE)
+      run_metadata = config_pb2.RunMetadata()
+      self.assertEqual(0, len(run_metadata.step_stats.dev_stats))
+      tensor_runner(options=run_options, run_metadata=run_metadata)
+      self.assertEqual(43.0, sess.run(a))
+      self.assertGreater(len(run_metadata.step_stats.dev_stats), 0)
+
+  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
+  def testMakeCallableWithFeedListAndRunOptions(self):
+    with session.Session() as sess:
+      ph = array_ops.placeholder(dtypes.float32)
+      a = math_ops.add(ph, 1.0)
+      tensor_runner = sess.make_callable(
+          a, feed_list=[ph.name], accept_options=True)
+      run_options = config_pb2.RunOptions(
+          trace_level=config_pb2.RunOptions.FULL_TRACE)
+      run_metadata = config_pb2.RunMetadata()
+      self.assertEqual(0, len(run_metadata.step_stats.dev_stats))
+      self.assertAllClose(
+          42.0,
+          tensor_runner(41.0, options=run_options, run_metadata=run_metadata))
+      self.assertGreater(len(run_metadata.step_stats.dev_stats), 0)
+
   def testFeedError(self):
     with session.Session() as sess:
       feed_t = array_ops.placeholder(dtype=dtypes.float32)
@@ -1368,7 +1415,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(TypeError, 'Cannot interpret feed_dict'):
         sess.run(a, feed_dict={'a': [2.0]})
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testPerStepTrace(self):
     run_options = config_pb2.RunOptions(
         trace_level=config_pb2.RunOptions.FULL_TRACE)
@@ -1389,7 +1435,6 @@ class SessionTest(test_util.TensorFlowTestCase):
         self.assertTrue(run_metadata.HasField('step_stats'))
         self.assertEquals(len(run_metadata.step_stats.dev_stats), 1)
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testRunOptionsRunMetadata(self):
     run_options = config_pb2.RunOptions(
         trace_level=config_pb2.RunOptions.FULL_TRACE)
@@ -1425,7 +1470,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(ValueError, 'may not be fed'):
         sess.run(reshaped_tensor, feed_dict={new_shape: [3, 7]})
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testInferShapesFalse(self):
     with ops.Graph().as_default(), ops.device('/cpu:0'):
       a = constant_op.constant([[1, 2]])
@@ -1434,7 +1478,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       # Avoid lint error regarding 'unused' var a.
       self.assertTrue(a == a)
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testInferShapesTrue(self):
     config = config_pb2.ConfigProto(
         graph_options=config_pb2.GraphOptions(infer_shapes=True))
@@ -1445,7 +1488,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       # Avoid lint error regarding 'unused' var a.
       self.assertTrue(a == a)
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testBuildCostModel(self):
     run_options = config_pb2.RunOptions()
     config = config_pb2.ConfigProto(
@@ -1466,6 +1508,22 @@ class SessionTest(test_util.TensorFlowTestCase):
         else:
           self.assertFalse(run_metadata.HasField('cost_graph'))
 
+  def runTestOutputPartitionGraphs(self, sess):
+    run_options = config_pb2.RunOptions(output_partition_graphs=True)
+    a = constant_op.constant(1)
+    run_metadata = config_pb2.RunMetadata()
+    sess.run(a, options=run_options, run_metadata=run_metadata)
+    self.assertGreater(len(run_metadata.partition_graphs), 0)
+    sess.run(a, run_metadata=run_metadata)
+    self.assertEqual(len(run_metadata.partition_graphs), 0)
+
+  def testOutputPartitionGraphsDirect(self):
+    self.runTestOutputPartitionGraphs(session.Session())
+
+  def testOutputPartitionGraphsDistributed(self):
+    server = server_lib.Server.create_local_server()
+    self.runTestOutputPartitionGraphs(session.Session(server.target))
+
   def testNonInteractiveSessionNesting(self):
     sess1 = session.Session()
     sess1_controller = sess1.as_default()
@@ -1525,7 +1583,7 @@ class SessionTest(test_util.TensorFlowTestCase):
         sess.run(enqueue_op)
       self.assertEqual(sess.run(q.size()), num_epochs * 2)
 
-  @test_util.disable_c_api  # Partial runs don't work with C API
+  @test_util.disable_c_api  # set_device does not work with C API
   def testRegisterFetchAndFeedConversionFunctions(self):
     class SquaredTensor(object):
       def __init__(self, tensor):
@@ -1698,6 +1756,28 @@ class SessionTest(test_util.TensorFlowTestCase):
     str_repr = '%s' % attrs
     self.assertTrue(str_repr.startswith('_DeviceAttributes'), str_repr)
 
+  def runTestAddFunctionToSession(self, target=''):
+    """Add a function to a session after the graph has already been run."""
+    @function.Defun(dtypes.float32)
+    def foo(x):
+      return x + 1
+
+    x = constant_op.constant(1.0)
+    with session.Session(target=target) as sess:
+      sess.run(x)
+      f = foo(x)
+      result = sess.run(f)
+      self.assertEqual(result, 2.0)
+
+  @test_util.disable_c_api  # functions don't work with C API
+  def testAddFunctionToSession(self):
+    self.runTestAddFunctionToSession()
+
+  @test_util.disable_c_api  # functions don't work with C API
+  def testAddFunctionToGrpcSession(self):
+    server = server_lib.Server.create_local_server()
+    self.runTestAddFunctionToSession(server.target)
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index 4a62a96935ef228df6304375df1a295347cdc143..40bebaa0f9eb861259de983a2a831cd22172e19b 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -17,6 +17,7 @@ limitations under the License.
 
 %{
 
+#include "tensorflow/c/python_api.h"
 #include "tensorflow/python/client/tf_session_helper.h"
 #include "tensorflow/core/framework/session_state.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -63,6 +64,37 @@ tensorflow::ImportNumpy();
 // Constants used by TensorHandle (get_session_handle).
 %constant const char* TENSOR_HANDLE_KEY = tensorflow::SessionState::kTensorHandleResourceTypeName;
 
+// Convert TF_OperationName output to unicode python string
+%typemap(out) const char* TF_OperationName {
+  $result = PyUnicode_FromString($1);
+}
+
+// Convert TF_OperationOpType output to unicode python string
+%typemap(out) const char* TF_OperationOpType {
+  $result = PyUnicode_FromString($1);
+}
+
+// We use TF_OperationGetControlInputs_wrapper instead of
+// TF_OperationGetControlInputs
+%ignore TF_OperationGetControlInputs;
+%unignore TF_OperationGetControlInputs_wrapper;
+// See comment for "%noexception TF_SessionRun_wrapper;"
+%noexception TF_OperationGetControlInputs_wrapper;
+
+// Build a Python list of TF_Operation* and return it.
+%typemap(out) std::vector<TF_Operation*> tensorflow::TF_OperationGetControlInputs_wrapper {
+  $result = PyList_New($1.size());
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+
+  for (size_t i = 0; i < $1.size(); ++i) {
+    PyList_SET_ITEM($result, i, SWIG_NewPointerObj(
+                            $1[i], SWIGTYPE_p_TF_Operation, 0));
+  }
+}
+
+
 ////////////////////////////////////////////////////////////////////////////////
 // BEGIN TYPEMAPS FOR tensorflow::TF_Run_wrapper()
 ////////////////////////////////////////////////////////////////////////////////
@@ -112,6 +144,8 @@ tensorflow::ImportNumpy();
     tensorflow::PyObjectVector temp) {
   $1 = &temp;
 }
+// TODO(iga): move this and the corresponding typemap(argout) to
+// tf_sessionrun_wrapper.i once we get rid of this code for DeprecatedSession.
 %typemap(in, numinputs=0) char** out_handle (
     char* temp) {
   $1 = &temp;
@@ -142,7 +176,7 @@ tensorflow::ImportNumpy();
 %#else
   $result = PyUnicode_FromStringAndSize(
 %#endif
-    *$1, strlen(*$1));
+    *$1, *$1 == nullptr ? 0 : strlen(*$1));
   delete[] *$1;
 }
 
@@ -150,6 +184,34 @@ tensorflow::ImportNumpy();
 // END TYPEMAPS FOR tensorflow::TF_Run_wrapper()
 ////////////////////////////////////////////////////////////////////////////////
 
+// Typemap for TF_Status* inputs that automatically unwraps a ScopedTFStatus.
+// This can also handle a wrapped TF_Status* input.
+%typemap(in) (TF_Status*) {
+  PyObject* wrapped_tf_status;
+  if (strcmp(Py_TYPE($input)->tp_name, "ScopedTFStatus") == 0) {
+    DCHECK(PyObject_HasAttrString($input, "status"))
+        << "ScopedTFStatus.status not found! Do you need to modify "
+           "tf_session.i?";
+    wrapped_tf_status = PyObject_GetAttrString($input, "status");
+  } else {
+    // Assume wrapped TF_Status*
+    wrapped_tf_status = $input;
+  }
+  DCHECK_EQ(strcmp(Py_TYPE(wrapped_tf_status)->tp_name, "SwigPyObject"), 0)
+      << Py_TYPE(wrapped_tf_status)->tp_name;
+
+  // The following is the default SWIG code generated for TF_Status*
+  void* tf_status = nullptr;
+  int r = SWIG_ConvertPtr(wrapped_tf_status, &tf_status,
+                          $descriptor(TF_Status*), 0 | 0);
+  if (!SWIG_IsOK(r)) {
+    SWIG_exception_fail(
+        SWIG_ArgError(r),
+        "in method '_TF_DeleteStatus', argument 1 of type 'TF_Status *'");
+  }
+  $1 = reinterpret_cast<TF_Status*>(tf_status);
+}
+
 // Typemap for functions that return a TF_Buffer struct. This typemap creates a
 // Python string from the TF_Buffer and returns it. The TF_Buffer.data string
 // is not expected to be NULL-terminated, and TF_Buffer.length does not count
@@ -163,29 +225,37 @@ tensorflow::ImportNumpy();
 // Helper function to convert a Python list of Tensors to a C++ vector of
 // TF_Outputs.
 //
-// Caller should have already checked that `py_tensor_list` is a list (this
-// isn't done in this function to allow for function-specific error messages)
-void PyTensorListToVector(PyObject* py_tensor_list,
-                          std::vector<TF_Output>* vec) {
+// Returns true if successful. Otherwise, returns false and sets error_msg.
+bool PyTensorListToVector(PyObject* py_tensor_list,
+                          std::vector<TF_Output>* vec,
+                          string* error_msg) {
+  if (!PyList_Check(py_tensor_list)) {
+    *error_msg = "expected Python list.";
+    return false;
+  }
   size_t size = PyList_Size(py_tensor_list);
   for (int i = 0; i < size; ++i) {
     PyObject* item = PyList_GetItem(py_tensor_list, i);
     TF_Output* input_ptr;
-    SWIG_ConvertPtr(item, reinterpret_cast<void**>(&input_ptr),
-                    SWIGTYPE_p_TF_Output, 0);
+    if (!SWIG_IsOK(SWIG_ConvertPtr(item, reinterpret_cast<void**>(&input_ptr),
+                                   SWIGTYPE_p_TF_Output, 0))) {
+      *error_msg = "expected Python list of wrapped TF_Output objects. "
+          "Found python list of something else.";
+      return false;
+    }
     vec->push_back(*input_ptr);
   }
+  return true;
 }
 %}
 
 // Converts input Python list of wrapped TF_Outputs into a single array
 %typemap(in) (const TF_Output* inputs, int num_inputs)
     (std::vector<TF_Output> inputs) {
-  if (!PyList_Check($input)) {
-    SWIG_exception_fail(
-        SWIG_TypeError, "$symname: expected Python list of wrapped TF_Outputs");
+  string error_msg;
+  if (!PyTensorListToVector($input, &inputs, &error_msg)) {
+    SWIG_exception_fail(SWIG_TypeError, ("$symname: " + error_msg).c_str());
   }
-  PyTensorListToVector($input, &inputs);
   $1 = inputs.data();
   $2 = inputs.size();
 }
@@ -211,12 +281,25 @@ void PyTensorListToVector(PyObject* py_tensor_list,
 // PyArray_Return, maybe others).
 %noexception TF_SessionRun_wrapper;
 
+// We use TF_SessionPRunSetup_wrapper instead of TF_SessionPRunSetup
+%ignore TF_SessionPRunSetup;
+%unignore TF_SessionPRunSetup_wrapper;
+// See comment for "%noexception TF_SessionRun_wrapper;"
+%noexception TF_SessionPRunSetup_wrapper;
+
+// We use TF_SessionPRun_wrapper instead of TF_SessionPRun
+%ignore TF_SessionPRun;
+%unignore TF_SessionPRun_wrapper;
+// See comment for "%noexception TF_SessionRun_wrapper;"
+%noexception TF_SessionPRun_wrapper;
 
 %rename("_TF_SetTarget") TF_SetTarget;
 %rename("_TF_SetConfig") TF_SetConfig;
 %rename("_TF_NewSessionOptions") TF_NewSessionOptions;
 
 %include "tensorflow/c/c_api.h"
+%include "tensorflow/c/python_api.h"
+
 
 %ignoreall
 %insert("python") %{
diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc
index 86088d0ab491f9298ca628751126ecb6df9374d3..5c04a8c2a5e26213d4d2c892232eadf8c0d7d2b1 100644
--- a/tensorflow/python/client/tf_session_helper.cc
+++ b/tensorflow/python/client/tf_session_helper.cc
@@ -594,14 +594,6 @@ void TF_PRunSetup_wrapper(TF_DeprecatedSession* session,
       const_cast<const char**>(output_names.data()), output_names.size(),
       const_cast<const char**>(target_nodes.data()), target_nodes.size(),
       out_handle, out_status);
-  // TF_PRunSetup leaves out_handle undefined if it fails, but SWIG will call
-  // free(out_handle) on the returned handle regardless. Thus, must make sure it
-  // is valid.
-  if (TF_GetCode(out_status) != TF_OK) {
-    char* tmp = new char[1];
-    tmp[0] = '\0';
-    *out_handle = tmp;
-  }
   Py_END_ALLOW_THREADS;
 }
 
@@ -623,7 +615,7 @@ void TF_Reset_wrapper(const TF_SessionOptions* opt,
            out_status);
 }
 
-void TF_SessionRun_wrapper_helper(TF_Session* session,
+void TF_SessionRun_wrapper_helper(TF_Session* session, const char* handle,
                                   const TF_Buffer* run_options,
                                   const std::vector<TF_Output>& inputs,
                                   const std::vector<PyObject*>& input_ndarrays,
@@ -678,10 +670,16 @@ void TF_SessionRun_wrapper_helper(TF_Session* session,
 
   // Call TF_SessionRun() (and release GIL during execution)
   Py_BEGIN_ALLOW_THREADS;
-  TF_SessionRun(session, run_options, inputs.data(), input_vals.data(),
-                inputs.size(), outputs.data(), output_vals.data(),
-                outputs.size(), targets.data(), targets.size(), run_metadata,
-                out_status);
+  if (handle == nullptr) {
+    TF_SessionRun(session, run_options, inputs.data(), input_vals.data(),
+                  inputs.size(), outputs.data(), output_vals.data(),
+                  outputs.size(), targets.data(), targets.size(), run_metadata,
+                  out_status);
+  } else {
+    TF_SessionPRun(session, handle, inputs.data(), input_vals.data(),
+                   inputs.size(), outputs.data(), output_vals.data(),
+                   outputs.size(), targets.data(), targets.size(), out_status);
+  }
   Py_END_ALLOW_THREADS;
 
   // Create scoped containers for output tensors
@@ -692,7 +690,7 @@ void TF_SessionRun_wrapper_helper(TF_Session* session,
 
   // Convert outputs to ndarrays (in scoped containers)
   std::vector<Safe_PyObjectPtr> py_outputs_safe;
-  for (int i = 0; i < outputs.size(); ++i) {
+  for (size_t i = 0; i < outputs.size(); ++i) {
     PyObject* py_array;
     s = TFTensorToPyArray(std::move(output_vals_safe[i]), &py_array);
     if (!s.ok()) {
@@ -704,7 +702,7 @@ void TF_SessionRun_wrapper_helper(TF_Session* session,
 
   // If we reach this point, we have successfully built a list of objects so we
   // can release them from the safe container into the return vector.
-  for (int i = 0; i < outputs.size(); ++i) {
+  for (size_t i = 0; i < outputs.size(); ++i) {
     py_outputs->push_back(py_outputs_safe[i].release());
   }
 }
@@ -716,9 +714,9 @@ void TF_SessionRun_wrapper(TF_Session* session, const TF_Buffer* run_options,
                            const std::vector<TF_Operation*>& targets,
                            TF_Buffer* run_metadata, TF_Status* out_status,
                            std::vector<PyObject*>* py_outputs) {
-  TF_SessionRun_wrapper_helper(session, run_options, inputs, input_ndarrays,
-                               outputs, targets, run_metadata, out_status,
-                               py_outputs);
+  TF_SessionRun_wrapper_helper(session, nullptr, run_options, inputs,
+                               input_ndarrays, outputs, targets, run_metadata,
+                               out_status, py_outputs);
   // Release any unused ndarray references (see memory management comment in
   // TF_SessionRun_wrapper_helper)
   ClearDecrefCache();
@@ -737,4 +735,43 @@ string EqualGraphDefWrapper(const string& actual, const string& expected) {
   return EqualGraphDef(actual_def, expected_def, &diff) ? "" : diff;
 }
 
+void TF_SessionPRunSetup_wrapper(TF_Session* session,
+                                 const std::vector<TF_Output>& inputs,
+                                 const std::vector<TF_Output>& outputs,
+                                 const std::vector<TF_Operation*>& targets,
+                                 const char** out_handle,
+                                 TF_Status* out_status) {
+  // Call TF_SessionPRunSetup() (and release GIL during execution)
+  Py_BEGIN_ALLOW_THREADS;
+  TF_SessionPRunSetup(session, inputs.data(), inputs.size(), outputs.data(),
+                      outputs.size(), targets.data(), targets.size(),
+                      out_handle, out_status);
+  Py_END_ALLOW_THREADS;
+}
+
+void TF_SessionPRun_wrapper(TF_Session* session, const char* handle,
+                            const std::vector<TF_Output>& inputs,
+                            const std::vector<PyObject*>& input_ndarrays,
+                            const std::vector<TF_Output>& outputs,
+                            TF_Status* out_status,
+                            std::vector<PyObject*>* py_outputs) {
+  const std::vector<TF_Operation*> targets;
+  TF_SessionRun_wrapper_helper(session, handle,
+                               nullptr,  // run_options
+                               inputs, input_ndarrays, outputs, targets,
+                               nullptr,  // run_metadata
+                               out_status, py_outputs);
+  // Release any unused ndarray references (see memory management comment in
+  // TF_SessionRun_wrapper_helper)
+  ClearDecrefCache();
+}
+
+std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper(
+    TF_Operation* oper) {
+  std::vector<TF_Operation*> control_inputs(TF_OperationNumControlInputs(oper));
+  TF_OperationGetControlInputs(oper, control_inputs.data(),
+                               control_inputs.size());
+  return control_inputs;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index 727e8ade52f93027ddee294369de2e67c2cd55bf..f1f70a9a1d2e9ab8de7d4a31c7b87f970c5757eb 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -81,8 +81,6 @@ void TF_Run_wrapper(TF_DeprecatedSession* session, const TF_Buffer* run_options,
 //
 // On failure, out_status contains a tensorflow::Status with an error
 // message.
-//
-// NOTE: This is EXPERIMENTAL and subject to change.
 void TF_PRunSetup_wrapper(TF_DeprecatedSession* session,
                           const NameVector& input_names,
                           const NameVector& output_names,
@@ -101,8 +99,6 @@ void TF_PRunSetup_wrapper(TF_DeprecatedSession* session,
 //
 // On failure,  out_status contains a tensorflow::Status with an error
 // message.
-//
-// NOTE: This is EXPERIMENTAL and subject to change.
 void TF_PRun_wrapper(TF_DeprecatedSession* session, const char* handle,
                      PyObject* feed_dict, const NameVector& output_names,
                      TF_Status* out_status, PyObjectVector* out_values);
@@ -128,6 +124,45 @@ void TF_SessionRun_wrapper(TF_Session* session, const TF_Buffer* run_options,
                            const std::vector<TF_Operation*>& targets,
                            TF_Buffer* run_metadata, TF_Status* out_status,
                            std::vector<PyObject*>* py_outputs);
+
+// Set up the graph with the intended feeds (inputs) and fetches (output) for
+// a sequence of partial run calls.
+//
+// On success, returns a handle that can be used for subsequent PRun calls. The
+// handle is owned by the caller and should be deleted with TF_DeletePRunHandle
+// when it is no longer needed.
+//
+// On failure, out_status contains a tensorflow::Status with an error
+// message.
+void TF_SessionPRunSetup_wrapper(TF_Session* session,
+                                 const std::vector<TF_Output>& inputs,
+                                 const std::vector<TF_Output>& outputs,
+                                 const std::vector<TF_Operation*>& targets,
+                                 const char** out_handle,
+                                 TF_Status* out_status);
+
+// Continue to run the graph with additional feeds and fetches. The
+// execution state is uniquely identified by the handle.
+//
+// On success, `py_outputs` is populated with a numpy ndarray for each output
+// (the caller must decref these ndarrays, although this will likely be handled
+// by the Python gc). `session`, `handle`, `out_status`, and `py_outputs` must
+// be non-null. `py_outputs` should be empty.
+//
+// On failure, out_status contains a tensorflow::Status with an error
+// message.
+void TF_SessionPRun_wrapper(TF_Session* session, const char* handle,
+                            const std::vector<TF_Output>& inputs,
+                            const std::vector<PyObject*>& input_ndarrays,
+                            const std::vector<TF_Output>& outputs,
+                            TF_Status* out_status,
+                            std::vector<PyObject*>* py_outputs);
+
+// Retrieves control inputs of this operation.
+// control_inputs should be empty.
+std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper(
+    TF_Operation* oper);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_PYTHON_CLIENT_TF_SESSION_HELPER_H_
diff --git a/tensorflow/python/client/tf_sessionrun_wrapper.i b/tensorflow/python/client/tf_sessionrun_wrapper.i
index 289792fef26a0289378e6c0b5fc0e3502987f43e..473bc3ccc532a7900a3ce6d3c82d0881fd07e420 100644
--- a/tensorflow/python/client/tf_sessionrun_wrapper.i
+++ b/tensorflow/python/client/tf_sessionrun_wrapper.i
@@ -73,13 +73,17 @@ tensorflow::ImportNumpy();
 // $input is a Python list of wrapped TF_Outputs
 %typemap(in) (const std::vector<TF_Output>& outputs)
     (std::vector<TF_Output> outputs) {
-  if (!PyList_Check($input)) {
-    SWIG_exception_fail(SWIG_TypeError, "$symname: expected list");
+  string error_msg;
+  if (!PyTensorListToVector($input, &outputs, &error_msg)) {
+    SWIG_exception_fail(SWIG_TypeError, ("$symname: " + error_msg).c_str());
   }
-  PyTensorListToVector($input, &outputs);
   $1 = &outputs;
 }
 
+// Apply the typemap above to inputs as well
+%typemap(in) (const std::vector<TF_Output>& inputs) =
+             (const std::vector<TF_Output>& outputs);
+
 // Create temporary py_outputs_vec variable to store return value
 %typemap(in, numinputs=0) (std::vector<PyObject*>* py_outputs)
     (std::vector<PyObject*> py_outputs_vec) {
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 39446b6ca27c2a5d375612278195edc6b46b5289..ea2ab578b3293209578485a47941da8d6fc00fa6 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -1,5 +1,12 @@
 # Description:
 #   Python Client Code of the TensorFlow Debugger (tfdbg).
+#
+# Public target(s):
+#
+# ":debug_py": Public Python methods and classes of tfdbg.
+#   For API documentation, see https://www.tensorflow.org/api_docs/python/tfdbg
+#   For a user interface walkthrough, see https://www.tensorflow.org/programmers_guide/debugger
+# ":grpc_debug_server": Server interface for grpc:// debug URLs.
 
 package(
     default_visibility = ["//tensorflow:internal"],
@@ -24,7 +31,9 @@ py_library(
     visibility = ["//visibility:public"],
     deps = [
         ":debug_data",
+        ":debug_gradients",
         ":debug_utils",
+        ":grpc_debug_server",
         ":hooks",
         ":local_cli_wrapper",
     ],
@@ -35,6 +44,7 @@ py_library(
     name = "debug_pip",
     deps = [
         ":debug_py",
+        ":grpc_debug_test_server",
         ":offline_analyzer",
         ":session_debug_testlib",
     ] + if_not_windows([
@@ -53,6 +63,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "debug_gradients",
+    srcs = ["lib/debug_gradients.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":debug_data",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform",
+        "@six_archive//:six",
+    ],
+)
+
 py_library(
     name = "debug_utils",
     srcs = ["lib/debug_utils.py"],
@@ -266,12 +289,15 @@ py_library(
     deps = [
         ":analyzer_cli",
         ":cli_shared",
+        ":command_parser",
         ":debug_data",
         ":debugger_cli_common",
         ":framework",
         ":profile_analyzer_cli",
         ":stepper_cli",
+        ":tensor_format",
         ":ui_factory",
+        "@six_archive//:six",
     ],
 )
 
@@ -367,12 +393,33 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":debug_data",
+        "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "//third_party/py/numpy",
     ],
 )
 
+cuda_py_test(
+    name = "debug_gradients_test",
+    size = "small",
+    srcs = [
+        "lib/debug_gradients_test.py",
+    ],
+    additional_deps = [
+        ":debug_data",
+        ":debug_gradients",
+        ":debug_utils",
+        "//tensorflow/python:client",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+    ],
+)
+
 py_test(
     name = "debug_utils_test",
     size = "small",
@@ -529,6 +576,41 @@ py_library(
     ],
 )
 
+py_library(
+    name = "debug_service_pb2_grpc",
+    srcs = ["lib/debug_service_pb2_grpc.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core/debug:debug_service_proto_py",
+    ],
+)
+
+py_library(
+    name = "grpc_debug_server",
+    srcs = ["lib/grpc_debug_server.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":debug_data",
+        ":debug_service_pb2_grpc",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "grpc_debug_test_server",
+    srcs = ["lib/grpc_debug_test_server.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":debug_data",
+        ":debug_utils",
+        ":grpc_debug_server",
+        "//tensorflow/python:client",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:variables",
+    ],
+)
+
 cuda_py_test(
     name = "session_debug_file_test",
     size = "small",
@@ -695,6 +777,62 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "session_debug_grpc_test",
+    size = "medium",
+    srcs = ["lib/session_debug_grpc_test.py"],
+    additional_deps = [
+        ":debug_data",
+        ":debug_utils",
+        ":dumping_wrapper",
+        ":grpc_debug_test_server",
+        ":grpc_wrapper",
+        ":hooks",
+        ":session_debug_testlib",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:variables",
+    ],
+    tags = [
+        "no_windows",
+        "nomac",  # TODO(cais): Install of futures and grpcio on all macs.
+        "notsan",
+        "oss_serial",
+    ],
+)
+
+# TODO(cais): Run the test in OSS, perhaps through a sh_test.
+cuda_py_test(
+    name = "dist_session_debug_grpc_test",
+    size = "medium",
+    srcs = ["lib/dist_session_debug_grpc_test.py"],
+    additional_deps = [
+        ":debug_data",
+        ":debug_utils",
+        ":dumping_wrapper",
+        ":grpc_debug_test_server",
+        ":grpc_wrapper",
+        ":hooks",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:variables",
+    ],
+    data = ["//tensorflow/tools/dist_test/server:grpc_tensorflow_server"],
+    tags = [
+        "no_oss",  # Incompatible with bazel_pip.
+        "no_windows",
+        "nomac",  # TODO(cais): Install of futures and grpcio on all macs.
+        "notsan",
+    ],
+)
+
 py_test(
     name = "dumping_wrapper_test",
     size = "small",
diff --git a/tensorflow/python/debug/README.md b/tensorflow/python/debug/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b26411cd1538250b61364b6c7257fd03d5b6278b
--- /dev/null
+++ b/tensorflow/python/debug/README.md
@@ -0,0 +1,49 @@
+# TensorFlow Debugger (TFDBG)
+
+[TOC]
+
+TensorFlow Debugger (TFDBG) is a specialized debugger for TensorFlow's computation
+graphs. It provides access to internal graph structures and tensor values at
+TensorFlow runtime.
+
+<!-- TODO(cais): Add release notes starting from 1.3. -->
+
+## Why TFDBG?
+
+In TensorFlow's current
+[computation-graph framework](https://www.tensorflow.org/get_started/get_started#the_computational_graph),
+almost all actual computation after graph construction happens in a single
+Python function, namely
+[tf.Session.run](https://www.tensorflow.org/api_docs/python/tf/Session#run).
+Basic Python debugging tools such as [pdb](https://docs.python.org/2/library/pdb.html)
+cannot be used to debug `Session.run`, due to the fact that TensorFlow's graph
+execution happens in the underlying C++ layer. C++ debugging tools such as
+[gdb](https://www.gnu.org/software/gdb/) are not ideal either, because of their
+inability to recognize and organize the stack frames and variables in a way
+relevant to TensorFlow's operations, tensors and other graph constructs.
+
+TFDBG addresses these limitations. Among the features provided by TFDBG, the
+following ones are designed to facilitate runtime debugging of TensorFlow
+models:
+
+* Easy access through session wrappers
+* Easy integration with common high-level APIs, such as
+  [tf-learn](https://www.tensorflow.org/get_started/tflearn) and
+  [Keras](https://keras.io/)
+* Inspection of runtime tensor values and node connections
+* Conditional breaking after runs that generate tensors satisfying given
+  predicates, which makes common debugging tasks such as tracing the origin
+  of infinities and [NaNs](https://en.wikipedia.org/wiki/NaN) easier
+* Association of nodes and tensors in graphs with Python source lines
+* Profiling of models at the level of graph nodes and Python source lines.
+(Omitted internal-only feature)
+
+## How to use TFDBG?
+
+* For a walkthrough of TFDBG command-line interface, see https://www.tensorflow.org/programmers_guide/debugger.
+* For programmatic use of the API of TFDBG, see https://www.tensorflow.org/api_docs/python/tfdbg.
+
+## Related Publications
+
+* Cai, S., Breck E., Nielsen E., Salib M., Sculley D. (2016) TensorFlow Debugger:
+  Debugging Dataflow Graphs for Machine Learning. https://research.google.com/pubs/pub45789.html
diff --git a/tensorflow/python/debug/__init__.py b/tensorflow/python/debug/__init__.py
index 750d21f80d318a16c571aa7438e25115b6912bfc..e20849bc1c8100575a863fc47c4aea1cac6e4eb1 100644
--- a/tensorflow/python/debug/__init__.py
+++ b/tensorflow/python/debug/__init__.py
@@ -21,6 +21,7 @@ See the @{$python/tfdbg} guide.
 @@watch_graph_with_blacklists
 @@DebugTensorDatum
 @@DebugDumpDir
+@@load_tensor_from_event
 @@load_tensor_from_event_file
 @@has_inf_or_nan
 @@DumpingDebugHook
@@ -30,6 +31,9 @@ See the @{$python/tfdbg} guide.
 @@LocalCLIDebugHook
 @@LocalCLIDebugWrapperSession
 @@WatchOptions
+
+@@GradientsDebugger
+@@clear_gradient_debuggers
 """
 
 from __future__ import absolute_import
@@ -40,8 +44,11 @@ from __future__ import print_function
 from tensorflow.python.debug.lib.debug_data import DebugDumpDir
 from tensorflow.python.debug.lib.debug_data import DebugTensorDatum
 from tensorflow.python.debug.lib.debug_data import has_inf_or_nan
+from tensorflow.python.debug.lib.debug_data import load_tensor_from_event
 from tensorflow.python.debug.lib.debug_data import load_tensor_from_event_file
 
+from tensorflow.python.debug.lib.debug_gradients import GradientsDebugger
+
 from tensorflow.python.debug.lib.debug_utils import add_debug_tensor_watch
 from tensorflow.python.debug.lib.debug_utils import watch_graph
 from tensorflow.python.debug.lib.debug_utils import watch_graph_with_blacklists
diff --git a/tensorflow/python/debug/cli/analyzer_cli.py b/tensorflow/python/debug/cli/analyzer_cli.py
index da27f4cebeaf6e7d0a1db2b74245b45279066a3d..2bb8a7f6ce210c737fe1c0b74dc3568f279c3f59 100644
--- a/tensorflow/python/debug/cli/analyzer_cli.py
+++ b/tensorflow/python/debug/cli/analyzer_cli.py
@@ -279,39 +279,9 @@ class DebugAnalyzer(object):
     self._arg_parsers["list_outputs"] = ap
 
     # Parser for print_tensor.
-    ap = argparse.ArgumentParser(
-        description="Print the value of a dumped tensor.",
-        usage=argparse.SUPPRESS)
-    ap.add_argument(
-        "tensor_name",
-        type=str,
-        help="Name of the tensor, followed by any slicing indices, "
-        "e.g., hidden1/Wx_plus_b/MatMul:0, "
-        "hidden1/Wx_plus_b/MatMul:0[1, :]")
-    ap.add_argument(
-        "-n",
-        "--number",
-        dest="number",
-        type=int,
-        default=-1,
-        help="0-based dump number for the specified tensor. "
-        "Required for tensor with multiple dumps.")
-    ap.add_argument(
-        "-r",
-        "--ranges",
-        dest="ranges",
-        type=str,
-        default="",
-        help="Numerical ranges to highlight tensor elements in. "
-        "Examples: -r 0,1e-8, -r [-0.1,0.1], "
-        "-r \"[[-inf, -0.1], [0.1, inf]]\"")
-    ap.add_argument(
-        "-a",
-        "--all",
-        dest="print_all",
-        action="store_true",
-        help="Print the tensor in its entirety, i.e., do not use ellipses.")
-    self._arg_parsers["print_tensor"] = ap
+    self._arg_parsers["print_tensor"] = (
+        command_parser.get_print_tensor_argparser(
+            "Print the value of a dumped tensor."))
 
     # Parser for print_source.
     ap = argparse.ArgumentParser(
@@ -837,10 +807,8 @@ class DebugAnalyzer(object):
 
     parsed = self._arg_parsers["print_tensor"].parse_args(args)
 
-    if screen_info and "cols" in screen_info:
-      np_printoptions = {"linewidth": screen_info["cols"]}
-    else:
-      np_printoptions = {}
+    np_printoptions = cli_shared.numpy_printoptions_from_screen_info(
+        screen_info)
 
     # Determine if any range-highlighting is required.
     highlight_options = cli_shared.parse_ranges_highlight(parsed.ranges)
@@ -910,7 +878,8 @@ class DebugAnalyzer(object):
             np_printoptions,
             print_all=parsed.print_all,
             tensor_slicing=tensor_slicing,
-            highlight_options=highlight_options)
+            highlight_options=highlight_options,
+            include_numeric_summary=parsed.numeric_summary)
       else:
         output = cli_shared.error(
             "Invalid number (%d) for tensor %s, which generated one dump." %
@@ -999,11 +968,10 @@ class DebugAnalyzer(object):
 
   def _reconstruct_print_source_command(self,
                                         parsed,
-                                        line_begin_decrease=0,
+                                        line_begin,
                                         max_elements_per_line_increase=0):
     return "ps %s %s -b %d -m %d" % (
-        parsed.source_file_path, "-t" if parsed.tensors else "",
-        max(parsed.line_begin - line_begin_decrease, 1),
+        parsed.source_file_path, "-t" if parsed.tensors else "", line_begin,
         parsed.max_elements_per_line + max_elements_per_line_increase)
 
   def print_source(self, args, screen_info=None):
@@ -1015,38 +983,26 @@ class DebugAnalyzer(object):
     source_annotation = source_utils.annotate_source(
         self._debug_dump,
         parsed.source_file_path,
-        do_dumped_tensors=parsed.tensors,
-        min_line=parsed.line_begin)
+        do_dumped_tensors=parsed.tensors)
 
     source_lines, line_num_width = source_utils.load_source(
         parsed.source_file_path)
 
     labeled_source_lines = []
-    if parsed.line_begin > 1:
-      omitted_info_line = RL(
-          "(... Omitted %d source lines ...) " % (parsed.line_begin - 1),
-          "bold")
-      omitted_info_line += RL(
-          "+5",
-          debugger_cli_common.MenuItem(
-              None,
-              self._reconstruct_print_source_command(
-                  parsed, line_begin_decrease=5)))
-      labeled_source_lines.append(omitted_info_line)
-
-    for i, line in enumerate(source_lines[parsed.line_begin - 1:]):
-      annotated_line = RL("L%d" % (i + parsed.line_begin),
-                          cli_shared.COLOR_YELLOW)
+    actual_initial_scroll_target = 0
+    for i, line in enumerate(source_lines):
+      annotated_line = RL("L%d" % (i + 1), cli_shared.COLOR_YELLOW)
       annotated_line += " " * (line_num_width - len(annotated_line))
       annotated_line += line
       labeled_source_lines.append(annotated_line)
 
-      if i + parsed.line_begin in source_annotation:
-        sorted_elements = sorted(source_annotation[i + parsed.line_begin])
+      if i + 1 == parsed.line_begin:
+        actual_initial_scroll_target = len(labeled_source_lines) - 1
+
+      if i + 1 in source_annotation:
+        sorted_elements = sorted(source_annotation[i + 1])
         for k, element in enumerate(sorted_elements):
           if k >= parsed.max_elements_per_line:
-            # TODO(cais): Replace this accordion pattern with the easier-to-use
-            # INIT_SCROLL_POS_KEY.
             omitted_info_line = RL("    (... Omitted %d of %d %s ...) " % (
                 len(sorted_elements) - parsed.max_elements_per_line,
                 len(sorted_elements),
@@ -1056,7 +1012,7 @@ class DebugAnalyzer(object):
                 debugger_cli_common.MenuItem(
                     None,
                     self._reconstruct_print_source_command(
-                        parsed, max_elements_per_line_increase=5)))
+                        parsed, i + 1, max_elements_per_line_increase=5)))
             labeled_source_lines.append(omitted_info_line)
             break
 
@@ -1071,7 +1027,9 @@ class DebugAnalyzer(object):
           labeled_source_lines.append(label)
 
     output = debugger_cli_common.rich_text_lines_from_rich_line_list(
-        labeled_source_lines)
+        labeled_source_lines,
+        annotations={debugger_cli_common.INIT_SCROLL_POS_KEY:
+                     actual_initial_scroll_target})
     _add_main_menu(output, node_name=None)
     return output
 
@@ -1312,7 +1270,7 @@ class DebugAnalyzer(object):
     all_inputs = copy.copy(tracker(node_name, is_control=False))
     is_ctrl = [False] * len(all_inputs)
     if include_control:
-      # Sort control inputs or recipients in in alphabetical order of the node
+      # Sort control inputs or recipients in alphabetical order of the node
       # names.
       ctrl_inputs = sorted(tracker(node_name, is_control=True))
       all_inputs.extend(ctrl_inputs)
diff --git a/tensorflow/python/debug/cli/analyzer_cli_test.py b/tensorflow/python/debug/cli/analyzer_cli_test.py
index ce224fff208f8edd053bd48609f27d1e3d8a1bef..9e1f459915a7f61c6c1099a2db08fa1e70a0046a 100644
--- a/tensorflow/python/debug/cli/analyzer_cli_test.py
+++ b/tensorflow/python/debug/cli/analyzer_cli_test.py
@@ -25,6 +25,7 @@ import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.cli import analyzer_cli
 from tensorflow.python.debug.cli import cli_shared
@@ -43,6 +44,13 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import tf_inspect
 
 
+def no_rewrite_session_config():
+  rewriter_config = rewriter_config_pb2.RewriterConfig(
+      disable_model_pruning=True)
+  graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+  return config_pb2.ConfigProto(graph_options=graph_options)
+
+
 def line_number_above():
   return tf_inspect.stack()[1][2] - 1
 
@@ -506,7 +514,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
     cls._curr_file_path = os.path.abspath(
         tf_inspect.getfile(tf_inspect.currentframe()))
 
-    cls._sess = session.Session()
+    cls._sess = session.Session(config=no_rewrite_session_config())
     with cls._sess as sess:
       u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]])
       v_init_val = np.array([[2.0], [-1.0]])
@@ -993,6 +1001,34 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         list_inputs_node_name=node_name,
         list_outputs_node_name=node_name)
 
+  def testPrintTensorHighlightingRangesAndIncludingNumericSummary(self):
+    node_name = "simple_mul_add/matmul"
+    tensor_name = node_name + ":0"
+    out = self._registry.dispatch_command(
+        "print_tensor", [tensor_name, "--ranges", "[-inf, 0.0]", "-s"],
+        screen_info={"cols": 80})
+
+    self.assertEqual([
+        "Tensor \"%s:DebugIdentity\": " % tensor_name +
+        "Highlighted([-inf, 0.0]): 1 of 2 element(s) (50.00%)",
+        "  dtype: float64",
+        "  shape: (2, 1)",
+        "",
+        "Numeric summary:",
+        "| - + | total |",
+        "| 1 1 |     2 |",
+        "|  min  max mean  std |",
+        "| -2.0  7.0  2.5  4.5 |",
+        "",
+        "array([[ 7.],",
+        "       [-2.]])",
+    ], out.lines)
+
+    self.assertIn("tensor_metadata", out.annotations)
+    self.assertIn(10, out.annotations)
+    self.assertIn(11, out.annotations)
+    self.assertEqual([(8, 11, "bold")], out.font_attr_segs[11])
+
   def testPrintTensorWithSlicing(self):
     node_name = "simple_mul_add/matmul"
     tensor_name = node_name + ":0"
@@ -1241,16 +1277,8 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         [self._curr_file_path, "-b", "3"],
         screen_info={"cols": 80})
 
-    self.assertIn("Omitted 2 source lines", out.lines[0])
-    self.assertTrue(out.lines[0].endswith("+5"))
-    expand_lines_command = out.font_attr_segs[0][-1][2].content
-    self.assertStartsWith(expand_lines_command,
-                          "ps %s " % self._curr_file_path)
-    self.assertIn("-b 1", expand_lines_command)
-
-    self.assertIsNone(self._findSourceLine(out, 1))
-    self.assertIsNone(self._findSourceLine(out, 2))
-    self.assertIsNotNone(self._findSourceLine(out, 3))
+    self.assertEqual(
+        2, out.annotations[debugger_cli_common.INIT_SCROLL_POS_KEY])
 
     index = self._findSourceLine(out, self._u_line_number)
     self.assertEqual(
@@ -1390,7 +1418,7 @@ class AnalyzerCLIPrintLargeTensorTest(test_util.TensorFlowTestCase):
   def setUpClass(cls):
     cls._dump_root = tempfile.mkdtemp()
 
-    with session.Session() as sess:
+    with session.Session(config=no_rewrite_session_config()) as sess:
       # 2400 elements should exceed the default threshold (2000).
       x = constant_op.constant(np.zeros([300, 8]), name="large_tensors/x")
 
@@ -1467,7 +1495,7 @@ class AnalyzerCLIControlDepTest(test_util.TensorFlowTestCase):
     else:
       cls._main_device = "/job:localhost/replica:0/task:0/cpu:0"
 
-    with session.Session() as sess:
+    with session.Session(config=no_rewrite_session_config()) as sess:
       x_init_val = np.array([5.0, 3.0])
       x_init = constant_op.constant(x_init_val, shape=[2])
       x = variables.Variable(x_init, name="control_deps/x")
@@ -1807,7 +1835,7 @@ class AnalyzerCLIWhileLoopTest(test_util.TensorFlowTestCase):
   def setUpClass(cls):
     cls._dump_root = tempfile.mkdtemp()
 
-    with session.Session() as sess:
+    with session.Session(config=no_rewrite_session_config()) as sess:
       loop_var = constant_op.constant(0, name="while_loop_test/loop_var")
       cond = lambda loop_var: math_ops.less(loop_var, 10)
       body = lambda loop_var: math_ops.add(loop_var, 1)
diff --git a/tensorflow/python/debug/cli/cli_shared.py b/tensorflow/python/debug/cli/cli_shared.py
index 9164e18bcf582d55502b0f9530dfa51ea416b00e..34d942fdf182691c26cfa4570917a97fcdd4a978 100644
--- a/tensorflow/python/debug/cli/cli_shared.py
+++ b/tensorflow/python/debug/cli/cli_shared.py
@@ -138,12 +138,20 @@ def parse_ranges_highlight(ranges_string):
     return None
 
 
+def numpy_printoptions_from_screen_info(screen_info):
+  if screen_info and "cols" in screen_info:
+    return {"linewidth": screen_info["cols"]}
+  else:
+    return {}
+
+
 def format_tensor(tensor,
                   tensor_name,
                   np_printoptions,
                   print_all=False,
                   tensor_slicing=None,
-                  highlight_options=None):
+                  highlight_options=None,
+                  include_numeric_summary=False):
   """Generate formatted str to represent a tensor or its slices.
 
   Args:
@@ -161,6 +169,8 @@ def format_tensor(tensor,
     highlight_options: (tensor_format.HighlightOptions) options to highlight
       elements of the tensor. See the doc of tensor_format.format_tensor()
       for more details.
+    include_numeric_summary: Whether a text summary of the numeric values (if
+      applicable) will be included.
 
   Returns:
     (str) Formatted str representing the (potentially sliced) tensor.
@@ -183,6 +193,7 @@ def format_tensor(tensor,
       value,
       sliced_name,
       include_metadata=True,
+      include_numeric_summary=include_numeric_summary,
       np_printoptions=np_printoptions,
       highlight_options=highlight_options)
 
@@ -286,10 +297,14 @@ def get_tfdbg_logo():
   return debugger_cli_common.RichTextLines(lines)
 
 
+_HORIZONTAL_BAR = "======================================"
+
+
 def get_run_start_intro(run_call_count,
                         fetches,
                         feed_dict,
-                        tensor_filters):
+                        tensor_filters,
+                        is_callable_runner=False):
   """Generate formatted intro for run-start UI.
 
   Args:
@@ -300,6 +315,8 @@ def get_run_start_intro(run_call_count,
       for more details.
     tensor_filters: (dict) A dict from tensor-filter name to tensor-filter
       callable.
+    is_callable_runner: (bool) whether a runner returned by
+        Session.make_callable is being run.
 
   Returns:
     (RichTextLines) Formatted intro message about the `Session.run()` call.
@@ -308,29 +325,35 @@ def get_run_start_intro(run_call_count,
   fetch_lines = _get_fetch_names(fetches)
 
   if not feed_dict:
-    feed_dict_lines = ["(Empty)"]
+    feed_dict_lines = [debugger_cli_common.RichLine("  (Empty)")]
   else:
     feed_dict_lines = []
     for feed_key in feed_dict:
-      if isinstance(feed_key, six.string_types):
-        feed_dict_lines.append(feed_key)
-      else:
-        feed_dict_lines.append(feed_key.name)
-
-  intro_lines = [
-      "======================================",
-      "Session.run() call #%d:" % run_call_count,
-      "", "Fetch(es):"
-  ]
-  intro_lines.extend(["  " + line for line in fetch_lines])
-  intro_lines.extend(["", "Feed dict(s):"])
-  intro_lines.extend(["  " + line for line in feed_dict_lines])
-  intro_lines.extend([
-      "======================================", "",
-      "Select one of the following commands to proceed ---->"
-  ])
-
-  out = debugger_cli_common.RichTextLines(intro_lines)
+      feed_key_name = (feed_key if isinstance(feed_key, six.string_types)
+                       else feed_key.name)
+      feed_dict_line = debugger_cli_common.RichLine("  ")
+      feed_dict_line += debugger_cli_common.RichLine(
+          feed_key_name,
+          debugger_cli_common.MenuItem(None, "pf %s" % feed_key_name))
+      feed_dict_lines.append(feed_dict_line)
+  feed_dict_lines = debugger_cli_common.rich_text_lines_from_rich_line_list(
+      feed_dict_lines)
+
+  out = debugger_cli_common.RichTextLines(_HORIZONTAL_BAR)
+  if is_callable_runner:
+    out.append("Running a runner returned by Session.make_callabe()")
+  else:
+    out.append("Session.run() call #%d:" % run_call_count)
+    out.append("")
+    out.append("Fetch(es):")
+    out.extend(debugger_cli_common.RichTextLines(
+        ["  " + line for line in fetch_lines]))
+    out.append("")
+    out.append("Feed dict:")
+    out.extend(feed_dict_lines)
+  out.append(_HORIZONTAL_BAR)
+  out.append("")
+  out.append("Select one of the following commands to proceed ---->")
 
   out.extend(
       _recommend_command(
@@ -392,7 +415,10 @@ def get_run_start_intro(run_call_count,
   return out
 
 
-def get_run_short_description(run_call_count, fetches, feed_dict):
+def get_run_short_description(run_call_count,
+                              fetches,
+                              feed_dict,
+                              is_callable_runner=False):
   """Get a short description of the run() call.
 
   Args:
@@ -401,11 +427,15 @@ def get_run_short_description(run_call_count, fetches, feed_dict):
       for more details.
     feed_dict: Feeds to the `Session.run()` call. See doc of `Session.run()`
       for more details.
+    is_callable_runner: (bool) whether a runner returned by
+        Session.make_callable is being run.
 
   Returns:
     (str) A short description of the run() call, including information about
       the fetche(s) and feed(s).
   """
+  if is_callable_runner:
+    return "runner from make_callable()"
 
   description = "run #%d: " % run_call_count
 
diff --git a/tensorflow/python/debug/cli/command_parser.py b/tensorflow/python/debug/cli/command_parser.py
index 143c1045199dc1e4f471d187219992557f2483c8..d78dc1649af8345f67679032a2052ac515044db8 100644
--- a/tensorflow/python/debug/cli/command_parser.py
+++ b/tensorflow/python/debug/cli/command_parser.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import argparse
 import ast
 import re
 import sys
@@ -487,3 +488,55 @@ def evaluate_tensor_slice(tensor, tensor_slicing):
     raise ValueError("Invalid tensor-slicing string.")
 
   return tensor[_parse_slices(tensor_slicing)]
+
+
+def get_print_tensor_argparser(description):
+  """Get an ArgumentParser for a command that prints tensor values.
+
+  Examples of such commands include print_tensor and print_feed.
+
+  Args:
+    description: Description of the ArgumentParser.
+
+  Returns:
+    An instance of argparse.ArgumentParser.
+  """
+
+  ap = argparse.ArgumentParser(
+      description=description, usage=argparse.SUPPRESS)
+  ap.add_argument(
+      "tensor_name",
+      type=str,
+      help="Name of the tensor, followed by any slicing indices, "
+      "e.g., hidden1/Wx_plus_b/MatMul:0, "
+      "hidden1/Wx_plus_b/MatMul:0[1, :]")
+  ap.add_argument(
+      "-n",
+      "--number",
+      dest="number",
+      type=int,
+      default=-1,
+      help="0-based dump number for the specified tensor. "
+      "Required for tensor with multiple dumps.")
+  ap.add_argument(
+      "-r",
+      "--ranges",
+      dest="ranges",
+      type=str,
+      default="",
+      help="Numerical ranges to highlight tensor elements in. "
+      "Examples: -r 0,1e-8, -r [-0.1,0.1], "
+      "-r \"[[-inf, -0.1], [0.1, inf]]\"")
+  ap.add_argument(
+      "-a",
+      "--all",
+      dest="print_all",
+      action="store_true",
+      help="Print the tensor in its entirety, i.e., do not use ellipses.")
+  ap.add_argument(
+      "-s",
+      "--numeric_summary",
+      action="store_true",
+      help="Include summary for non-empty tensors of numeric (int*, float*, "
+      "complex*) and Boolean types.")
+  return ap
diff --git a/tensorflow/python/debug/cli/curses_ui.py b/tensorflow/python/debug/cli/curses_ui.py
index 6a571c097ee699bec87353114d6c68ce3308283b..498e346393b061adc5655bf6a6d613c9862efbf4 100644
--- a/tensorflow/python/debug/cli/curses_ui.py
+++ b/tensorflow/python/debug/cli/curses_ui.py
@@ -513,6 +513,18 @@ class CursesUI(base_ui.BaseUI):
   def get_help(self):
     return self._command_handler_registry.get_help()
 
+  def _addstr(self, *args):
+    try:
+      self._stdscr.addstr(*args)
+    except curses.error:
+      pass
+
+  def _refresh_pad(self, pad, *args):
+    try:
+      pad.refresh(*args)
+    except curses.error:
+      pass
+
   def _screen_create_command_textbox(self, existing_command=None):
     """Create command textbox on screen.
 
@@ -522,8 +534,8 @@ class CursesUI(base_ui.BaseUI):
     """
 
     # Display the tfdbg prompt.
-    self._stdscr.addstr(self._max_y - self._command_textbox_height, 0,
-                        self.CLI_PROMPT, curses.A_BOLD)
+    self._addstr(self._max_y - self._command_textbox_height, 0,
+                 self.CLI_PROMPT, curses.A_BOLD)
     self._stdscr.refresh()
 
     self._command_window.clear()
@@ -948,7 +960,7 @@ class CursesUI(base_ui.BaseUI):
     color_pair = (self._default_color_pair if color is None else
                   self._color_pairs[color])
 
-    self._stdscr.addstr(row, 0, line, color_pair | attr)
+    self._addstr(row, 0, line, color_pair | attr)
     self._screen_refresh()
 
   def _screen_new_output_pad(self, rows, cols):
@@ -1235,10 +1247,9 @@ class CursesUI(base_ui.BaseUI):
   def _screen_scroll_output_pad(self, pad, viewport_top, viewport_left,
                                 screen_location_top, screen_location_left,
                                 screen_location_bottom, screen_location_right):
-    pad.refresh(viewport_top, viewport_left, screen_location_top,
-                screen_location_left, screen_location_bottom,
-                screen_location_right)
-
+    self._refresh_pad(pad, viewport_top, viewport_left, screen_location_top,
+                      screen_location_left, screen_location_bottom,
+                      screen_location_right)
     self._scroll_bar = ScrollBar(
         self._max_x - 2,
         3,
@@ -1249,9 +1260,9 @@ class CursesUI(base_ui.BaseUI):
 
     (scroll_pad, _, _) = self._display_lines(
         self._scroll_bar.layout(), self._output_num_rows - 1)
-    scroll_pad.refresh(
-        0, 0, self._output_top_row + 1, self._max_x - 2,
-        self._output_num_rows + 1, self._max_x - 1)
+    self._refresh_pad(scroll_pad, 0, 0, self._output_top_row + 1,
+                      self._max_x - 2, self._output_num_rows + 1,
+                      self._max_x - 1)
 
   def _scroll_output(self, direction, line_index=None):
     """Scroll the output pad.
@@ -1332,15 +1343,14 @@ class CursesUI(base_ui.BaseUI):
 
   def _screen_render_nav_bar(self):
     if self._nav_bar_pad:
-      self._nav_bar_pad.refresh(0, 0, self._nav_bar_row, 0,
-                                self._output_pad_screen_location.top,
-                                self._max_x)
+      self._refresh_pad(self._nav_bar_pad, 0, 0, self._nav_bar_row, 0,
+                        self._output_pad_screen_location.top, self._max_x)
 
   def _screen_render_menu_pad(self):
     if self._main_menu_pad:
-      self._main_menu_pad.refresh(0, 0, self._output_pad_screen_location.top, 0,
-                                  self._output_pad_screen_location.top,
-                                  self._max_x)
+      self._refresh_pad(
+          self._main_menu_pad, 0, 0, self._output_pad_screen_location.top, 0,
+          self._output_pad_screen_location.top, self._max_x)
 
   def _compile_ui_status_summary(self):
     """Compile status summary about this Curses UI instance.
diff --git a/tensorflow/python/debug/cli/stepper_cli_test.py b/tensorflow/python/debug/cli/stepper_cli_test.py
index 06e1228b951947673cd0e29b8dbe0eb4c40977d4..ee8cabca0d12e76ed2be40d36377d770c6eb582b 100644
--- a/tensorflow/python/debug/cli/stepper_cli_test.py
+++ b/tensorflow/python/debug/cli/stepper_cli_test.py
@@ -22,6 +22,8 @@ import re
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.cli import stepper_cli
 from tensorflow.python.debug.lib import stepper
@@ -143,7 +145,11 @@ class NodeStepperSimpleGraphTest(test_util.TensorFlowTestCase):
     self.opt = gradient_descent.GradientDescentOptimizer(0.1).minimize(
         self.e, name="opt")
 
-    self.sess = session.Session()
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        disable_model_pruning=True)
+    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+    config = config_pb2.ConfigProto(graph_options=graph_options)
+    self.sess = session.Session(config=config)
 
     self.sess.run(self.a.initializer)
     self.sess.run(self.b.initializer)
diff --git a/tensorflow/python/debug/cli/tensor_format.py b/tensorflow/python/debug/cli/tensor_format.py
index bb7ac314303269481de802460383dfd94bba1355..7a5597db12b0115e952ad33928b825e38e26e20a 100644
--- a/tensorflow/python/debug/cli/tensor_format.py
+++ b/tensorflow/python/debug/cli/tensor_format.py
@@ -72,6 +72,7 @@ class HighlightOptions(object):
 def format_tensor(tensor,
                   tensor_label,
                   include_metadata=False,
+                  include_numeric_summary=False,
                   np_printoptions=None,
                   highlight_options=None):
   """Generate a RichTextLines object showing a tensor in formatted style.
@@ -83,6 +84,8 @@ def format_tensor(tensor,
       suppress the tensor name line in the return value.
     include_metadata: Whether metadata such as dtype and shape are to be
       included in the formatted text.
+    include_numeric_summary: Whether a text summary of the numeric values (if
+      applicable) will be included.
     np_printoptions: A dictionary of keyword arguments that are passed to a
       call of np.set_printoptions() to set the text format for display numpy
       ndarrays.
@@ -132,25 +135,31 @@ def format_tensor(tensor,
 
   if lines:
     lines.append("")
-  hlines = len(lines)
+  formatted = debugger_cli_common.RichTextLines(
+      lines, font_attr_segs=font_attr_segs)
+
+  if include_numeric_summary:
+    formatted.append("Numeric summary:")
+    formatted.extend(numeric_summary(tensor))
+    formatted.append("")
 
   # Apply custom string formatting options for numpy ndarray.
   if np_printoptions is not None:
     np.set_printoptions(**np_printoptions)
 
   array_lines = repr(tensor).split("\n")
-  lines.extend(array_lines)
-
   if tensor.dtype.type is not np.string_:
     # Parse array lines to get beginning indices for each line.
 
     # TODO(cais): Currently, we do not annotate string-type tensors due to
     #   difficulty in escaping sequences. Address this issue.
     annotations = _annotate_ndarray_lines(
-        array_lines, tensor, np_printoptions=np_printoptions, offset=hlines)
-
-  formatted = debugger_cli_common.RichTextLines(
-      lines, font_attr_segs=font_attr_segs, annotations=annotations)
+        array_lines, tensor, np_printoptions=np_printoptions)
+  else:
+    annotations = None
+  formatted_array = debugger_cli_common.RichTextLines(
+      array_lines, annotations=annotations)
+  formatted.extend(formatted_array)
 
   # Perform optional highlighting.
   if highlight_options is not None:
@@ -464,3 +473,91 @@ def _locate_elements_in_line(line, indices_list, ref_indices):
     offset_counter += 1
 
   return start_columns, end_columns
+
+
+def _pad_string_to_length(string, length):
+  return " " * (length - len(string)) + string
+
+
+def numeric_summary(tensor):
+  """Get a text summmary of a numeric tensor.
+
+  This summary is only available for numeric (int*, float*, complex*) and
+  Boolean tensors.
+
+  Args:
+    tensor: (`numpy.ndarray`) the tensor value object to be summarized.
+
+  Returns:
+    The summary text as a `RichTextLines` object. If the type of `tensor` is not
+    numeric or Boolean, a single-line `RichTextLines` object containing a
+    warning message will reflect that.
+  """
+
+  def _counts_summary(counts, skip_zeros=True, total_count=None):
+    """Format values as a two-row table."""
+    if skip_zeros:
+      counts = [(count_key, count_val) for count_key, count_val in counts
+                if count_val]
+    max_common_len = 0
+    for count_key, count_val in counts:
+      count_val_str = str(count_val)
+      common_len = max(len(count_key) + 1, len(count_val_str) + 1)
+      max_common_len = max(common_len, max_common_len)
+
+    key_line = debugger_cli_common.RichLine("|")
+    val_line = debugger_cli_common.RichLine("|")
+    for count_key, count_val in counts:
+      count_val_str = str(count_val)
+      key_line += _pad_string_to_length(count_key, max_common_len)
+      val_line += _pad_string_to_length(count_val_str, max_common_len)
+    key_line += " |"
+    val_line += " |"
+
+    if total_count is not None:
+      total_key_str = "total"
+      total_val_str = str(total_count)
+      max_common_len = max(len(total_key_str) + 1, len(total_val_str))
+      total_key_str = _pad_string_to_length(total_key_str, max_common_len)
+      total_val_str = _pad_string_to_length(total_val_str, max_common_len)
+      key_line += total_key_str + " |"
+      val_line += total_val_str + " |"
+
+    return debugger_cli_common.rich_text_lines_from_rich_line_list(
+        [key_line, val_line])
+
+  if not isinstance(tensor, np.ndarray) or not np.size(tensor):
+    return debugger_cli_common.RichTextLines([
+        "No numeric summary available due to empty tensor."])
+  elif (np.issubdtype(tensor.dtype, np.float) or
+        np.issubdtype(tensor.dtype, np.complex) or
+        np.issubdtype(tensor.dtype, np.integer)):
+    counts = [
+        ("nan", np.sum(np.isnan(tensor))),
+        ("-inf", np.sum(np.isneginf(tensor))),
+        ("-", np.sum(np.logical_and(
+            tensor < 0.0, np.logical_not(np.isneginf(tensor))))),
+        ("0", np.sum(tensor == 0.0)),
+        ("+", np.sum(np.logical_and(
+            tensor > 0.0, np.logical_not(np.isposinf(tensor))))),
+        ("+inf", np.sum(np.isposinf(tensor)))]
+    output = _counts_summary(counts, total_count=np.size(tensor))
+
+    valid_array = tensor[
+        np.logical_not(np.logical_or(np.isinf(tensor), np.isnan(tensor)))]
+    if np.size(valid_array):
+      stats = [
+          ("min", np.min(valid_array)),
+          ("max", np.max(valid_array)),
+          ("mean", np.mean(valid_array)),
+          ("std", np.std(valid_array))]
+      output.extend(_counts_summary(stats, skip_zeros=False))
+    return output
+  elif tensor.dtype == np.bool:
+    counts = [
+        ("False", np.sum(tensor == 0)),
+        ("True", np.sum(tensor > 0)),]
+    return _counts_summary(counts, total_count=np.size(tensor))
+  else:
+    return debugger_cli_common.RichTextLines([
+        "No numeric summary available due to tensor dtype: %s." % tensor.dtype])
diff --git a/tensorflow/python/debug/cli/tensor_format_test.py b/tensorflow/python/debug/cli/tensor_format_test.py
index ec80bb998ef59de9dc0f6f3a4bcc29b910acf5f7..d3beb5f7bc8538a36437d1a322904cd141210985 100644
--- a/tensorflow/python/debug/cli/tensor_format_test.py
+++ b/tensorflow/python/debug/cli/tensor_format_test.py
@@ -665,6 +665,73 @@ class RichTextLinesTest(test_util.TensorFlowTestCase):
         ValueError, "Dimensions mismatch"):
       tensor_format.locate_tensor_element(out, [0])
 
+  def testLocateTensorElement2DNoEllipsisWithNumericSummary(self):
+    a = np.linspace(0.0, 1.0 - 1.0 / 16.0, 16).reshape([4, 4])
+
+    out = tensor_format.format_tensor(a, "a", include_numeric_summary=True)
+
+    self.assertEqual([
+        "Tensor \"a\":",
+        "",
+        "Numeric summary:",
+        "|  0  + | total |",
+        "|  1 15 |    16 |",
+        "|           min           max          mean           std |",
+        "|           0.0        0.9375       0.46875 0.28811076429 |",
+        "",
+        "array([[ 0.    ,  0.0625,  0.125 ,  0.1875],",
+        "       [ 0.25  ,  0.3125,  0.375 ,  0.4375],",
+        "       [ 0.5   ,  0.5625,  0.625 ,  0.6875],",
+        "       [ 0.75  ,  0.8125,  0.875 ,  0.9375]])",
+    ], out.lines)
+
+    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
+        out, [0, 0])
+    self.assertFalse(is_omitted)
+    self.assertEqual(8, row)
+    self.assertEqual(9, start_col)
+    self.assertEqual(11, end_col)
+
+    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
+        out, [0, 3])
+    self.assertFalse(is_omitted)
+    self.assertEqual(8, row)
+    self.assertEqual(36, start_col)
+    self.assertEqual(42, end_col)
+
+    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
+        out, [1, 0])
+    self.assertFalse(is_omitted)
+    self.assertEqual(9, row)
+    self.assertEqual(9, start_col)
+    self.assertEqual(13, end_col)
+
+    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
+        out, [1, 3])
+    self.assertFalse(is_omitted)
+    self.assertEqual(9, row)
+    self.assertEqual(36, start_col)
+    self.assertEqual(42, end_col)
+
+    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
+        out, [3, 3])
+    self.assertFalse(is_omitted)
+    self.assertEqual(11, row)
+    self.assertEqual(36, start_col)
+    self.assertEqual(42, end_col)
+
+    with self.assertRaisesRegexp(
+        ValueError, "Indices exceed tensor dimensions"):
+      tensor_format.locate_tensor_element(out, [1, 4])
+
+    with self.assertRaisesRegexp(
+        ValueError, "Indices contain negative"):
+      tensor_format.locate_tensor_element(out, [-1, 2])
+
+    with self.assertRaisesRegexp(
+        ValueError, "Dimensions mismatch"):
+      tensor_format.locate_tensor_element(out, [0])
+
   def testLocateTensorElement3DWithEllipses(self):
     a = np.zeros([11, 11, 11])
 
@@ -858,5 +925,97 @@ class RichTextLinesTest(test_util.TensorFlowTestCase):
       tensor_format.locate_tensor_element(out, [0])
 
 
+class NumericSummaryTest(test_util.TensorFlowTestCase):
+
+  def testNumericSummaryOnFloatFullHouse(self):
+    x = np.array([np.nan, np.nan, -np.inf, np.inf, np.inf, np.inf, -2, -3, -4,
+                  0, 1, 2, 2, 2, 2, 0, 0, 0, np.inf, np.inf, np.inf])
+    out = tensor_format.numeric_summary(x)
+    self.assertEqual(
+        "|  nan -inf    -    0    + +inf | total |", out.lines[0])
+    self.assertEqual(
+        "|    2    1    3    4    5    6 |    21 |", out.lines[1])
+    self.assertEqual(
+        "|           min           max          mean           std |",
+        out.lines[2])
+    self.assertEqual(
+        "|          -4.0           2.0           0.0 1.95789002075 |",
+        out.lines[3])
+
+  def testNumericSummaryOnFloatMissingCategories(self):
+    x = np.array([np.nan, np.nan])
+    out = tensor_format.numeric_summary(x)
+    self.assertEqual(2, len(out.lines))
+    self.assertEqual("| nan | total |", out.lines[0])
+    self.assertEqual("|   2 |     2 |", out.lines[1])
+
+    x = np.array([-np.inf, np.inf, 0, 0, np.inf, np.inf])
+    out = tensor_format.numeric_summary(x)
+    self.assertEqual("| -inf    0 +inf | total |", out.lines[0])
+    self.assertEqual("|    1    2    3 |     6 |", out.lines[1])
+    self.assertEqual("|  min  max mean  std |", out.lines[2])
+    self.assertEqual("|  0.0  0.0  0.0  0.0 |", out.lines[3])
+
+    x = np.array([-120, 120, 130])
+    out = tensor_format.numeric_summary(x)
+    self.assertEqual("| - + | total |", out.lines[0])
+    self.assertEqual("| 1 2 |     3 |", out.lines[1])
+    self.assertEqual(
+        "|           min           max          mean           std |",
+        out.lines[2])
+    self.assertEqual(
+        "|          -120           130 43.3333333333 115.566238822 |",
+        out.lines[3])
+
+  def testNumericSummaryOnEmptyFloat(self):
+    x = np.array([], dtype=np.float32)
+    out = tensor_format.numeric_summary(x)
+    self.assertEqual(["No numeric summary available due to empty tensor."],
+                     out.lines)
+
+  def testNumericSummaryOnInt(self):
+    x = np.array([-3] * 50 + [3] * 200 + [0], dtype=np.int32)
+    out = tensor_format.numeric_summary(x)
+    self.assertEqual("|   -   0   + | total |", out.lines[0])
+    self.assertEqual("|  50   1 200 |   251 |", out.lines[1])
+    self.assertEqual(
+        "|           min           max          mean           std |",
+        out.lines[2])
+    self.assertEqual(
+        "|            -3             3 1.79282868526 2.39789673081 |",
+        out.lines[3])
+
+  def testNumericSummaryOnBool(self):
+    x = np.array([False, True, True, False], dtype=np.bool)
+    out = tensor_format.numeric_summary(x)
+    self.assertEqual(2, len(out.lines))
+    self.assertEqual("| False  True | total |", out.lines[0])
+    self.assertEqual("|     2     2 |     4 |", out.lines[1])
+
+    x = np.array([True] * 10, dtype=np.bool)
+    out = tensor_format.numeric_summary(x)
+    self.assertEqual(2, len(out.lines))
+    self.assertEqual("| True | total |", out.lines[0])
+    self.assertEqual("|   10 |    10 |", out.lines[1])
+
+    x = np.array([False] * 10, dtype=np.bool)
+    out = tensor_format.numeric_summary(x)
+    self.assertEqual(2, len(out.lines))
+    self.assertEqual("| False | total |", out.lines[0])
+    self.assertEqual("|    10 |    10 |", out.lines[1])
+
+    x = np.array([], dtype=np.bool)
+    out = tensor_format.numeric_summary(x)
+    self.assertEqual(["No numeric summary available due to empty tensor."],
+                     out.lines)
+
+  def testNumericSummaryOnStrTensor(self):
+    x = np.array(["spam", "egg"], dtype=np.object)
+    out = tensor_format.numeric_summary(x)
+    self.assertEqual(
+        ["No numeric summary available due to tensor dtype: object."],
+        out.lines)
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/debug/examples/debug_fibonacci.py b/tensorflow/python/debug/examples/debug_fibonacci.py
index 86689c7d44bcea8851d1d0702e736ddb74215719..704dbda357d1208d0663da41eb7aef4b299dedb8 100644
--- a/tensorflow/python/debug/examples/debug_fibonacci.py
+++ b/tensorflow/python/debug/examples/debug_fibonacci.py
@@ -53,8 +53,8 @@ def main(_):
     sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
     sess.add_tensor_filter("has_negative", has_negative)
 
-  print("Fibonacci number at position %d: %d" %
-        (FLAGS.length, int(sess.run(n1))))
+  print("Fibonacci number at position %d:\n%s" %
+        (FLAGS.length, sess.run(n1)))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/debug/examples/debug_mnist.py b/tensorflow/python/debug/examples/debug_mnist.py
index 73d398c086c497217f609bc696d57344a4e4c9bc..96fca9f6f3dfb9194f99f057d1e7f3479031f41d 100644
--- a/tensorflow/python/debug/examples/debug_mnist.py
+++ b/tensorflow/python/debug/examples/debug_mnist.py
@@ -90,7 +90,8 @@ def main(_):
       return activations
 
   hidden = nn_layer(x, IMAGE_SIZE**2, HIDDEN_SIZE, "hidden")
-  y = nn_layer(hidden, HIDDEN_SIZE, NUM_LABELS, "softmax", act=tf.nn.softmax)
+  logits = nn_layer(hidden, HIDDEN_SIZE, NUM_LABELS, "output", tf.identity)
+  y = tf.nn.softmax(logits)
 
   with tf.name_scope("cross_entropy"):
     # The following line is the culprit of the bad numerical values that appear
@@ -99,12 +100,13 @@ def main(_):
     # call. A multiplication of the inf values with zeros leads to nans,
     # which is first in "cross_entropy/mul:0".
     #
-    # You can use clipping to fix this issue, e.g.,
-    #   diff = y_ * tf.log(tf.clip_by_value(y, 1e-8, 1.0))
+    # You can use the built-in, numerically-stable implementation to fix this
+    # issue:
+    #   diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits)
 
-    diff = y_ * tf.log(y)
+    diff = -(y_ * tf.log(y))
     with tf.name_scope("total"):
-      cross_entropy = -tf.reduce_mean(diff)
+      cross_entropy = tf.reduce_mean(diff)
 
   with tf.name_scope("train"):
     train_step = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(
diff --git a/tensorflow/python/debug/examples/examples_test.sh b/tensorflow/python/debug/examples/examples_test.sh
index 0b5401a7f29a3e979de157b47f952047cecd6a4e..25916f1903cd41c7f714fd0eb7bad0329dde8ceb 100755
--- a/tensorflow/python/debug/examples/examples_test.sh
+++ b/tensorflow/python/debug/examples/examples_test.sh
@@ -56,7 +56,7 @@ fi
 
 # Override the default ui_type=curses to allow the test to pass in a tty-less
 # test environment.
-cat << EOF | ${DEBUG_FIBONACCI_BIN} --ui_type=readline
+cat << EOF | ${DEBUG_FIBONACCI_BIN} --tensor_size=2 --ui_type=readline
 run
 exit
 EOF
diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py
index 0cdf1891272d6ba469b87702503b212230402b31..3335657a61d408f92f6b19d22edcf422b83b0ed9 100644
--- a/tensorflow/python/debug/lib/debug_data.py
+++ b/tensorflow/python/debug/lib/debug_data.py
@@ -22,6 +22,7 @@ import collections
 import glob
 import json
 import os
+import platform
 
 import numpy as np
 import six
@@ -33,6 +34,7 @@ from tensorflow.core.util import event_pb2
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.platform import gfile
+from tensorflow.python.util import compat
 
 
 # TODO(cais): Tie these string constants in with C++?
@@ -40,11 +42,19 @@ METADATA_FILE_PREFIX = "_tfdbg_"
 CORE_METADATA_TAG = "core_metadata_"
 GRAPH_FILE_TAG = "graph_"
 DEVICE_TAG = "device_"
+HASH_TAG = "hash"
 
 FETCHES_INFO_FILE_TAG = "fetches_info_"
 FEED_KEYS_INFO_FILE_TAG = "feed_keys_info_"
 
 
+def _glob(glob_pattern):
+  if platform.system() == "Windows":
+    return glob.glob(glob_pattern)
+  else:
+    return gfile.Glob(glob_pattern)
+
+
 class InconvertibleTensorProto(object):
   """Represents a TensorProto that cannot be converted to np.ndarray."""
 
@@ -108,8 +118,13 @@ def load_tensor_from_event(event):
   """
 
   tensor_proto = event.summary.value[0].tensor
-  if tensor_proto.tensor_content or tensor_proto.string_val:
-    # Initialized tensor.
+  shape = tensor_util.TensorShapeProtoToList(tensor_proto.tensor_shape)
+  num_elements = 1
+  for shape_dim in shape:
+    num_elements *= shape_dim
+
+  if tensor_proto.tensor_content or tensor_proto.string_val or not num_elements:
+    # Initialized tensor or empty tensor.
     if tensor_proto.dtype == types_pb2.DT_RESOURCE:
       tensor_value = InconvertibleTensorProto(tensor_proto)
     else:
@@ -348,7 +363,7 @@ def extract_core_metadata_from_event_proto(event):
 
 def device_name_to_device_path(device_name):
   """Convert device name to device path."""
-  device_name_items = device_name.split("/")
+  device_name_items = compat.as_text(device_name).split("/")
   device_name_items = [item.replace(":", "_") for item in device_name_items]
   return METADATA_FILE_PREFIX + DEVICE_TAG + ",".join(device_name_items)
 
@@ -679,7 +694,7 @@ class DebugDumpDir(object):
 
   def _load_all_device_dumps(self, partition_graphs, validate):
     """Load the dump data for all devices."""
-    device_dirs = glob.glob(os.path.join(
+    device_dirs = _glob(os.path.join(
         self._dump_root, METADATA_FILE_PREFIX + DEVICE_TAG + "*"))
 
     self._device_names = []
@@ -762,7 +777,7 @@ class DebugDumpDir(object):
     self._t0 = min(t0s) if t0s else None
 
   def _load_core_metadata(self):
-    core_metadata_files = glob.glob(os.path.join(
+    core_metadata_files = _glob(os.path.join(
         self._dump_root, METADATA_FILE_PREFIX + CORE_METADATA_TAG + "*"))
     for core_metadata_file in core_metadata_files:
       with gfile.Open(core_metadata_file, "rb") as f:
@@ -772,7 +787,7 @@ class DebugDumpDir(object):
             extract_core_metadata_from_event_proto(event))
 
   def _load_fetches_info(self):
-    fetches_info_files = glob.glob(os.path.join(
+    fetches_info_files = _glob(os.path.join(
         self._dump_root, METADATA_FILE_PREFIX + FETCHES_INFO_FILE_TAG + "*"))
     self._run_fetches_info = []
     for fetches_info_file in fetches_info_files:
@@ -780,7 +795,7 @@ class DebugDumpDir(object):
           _load_log_message_from_event_file(fetches_info_file))
 
   def _load_feeds_info(self):
-    feeds_info_files = glob.glob(os.path.join(
+    feeds_info_files = _glob(os.path.join(
         self._dump_root, METADATA_FILE_PREFIX + FEED_KEYS_INFO_FILE_TAG + "*"))
     self._run_feed_keys_info = []
     for feeds_info_file in feeds_info_files:
diff --git a/tensorflow/python/debug/lib/debug_data_test.py b/tensorflow/python/debug/lib/debug_data_test.py
index 70dc8c11500ac6998b35d924dd7aa98d889e2a13..eff70b662bd49df693e35d3d8b8eec27b3290131 100644
--- a/tensorflow/python/debug/lib/debug_data_test.py
+++ b/tensorflow/python/debug/lib/debug_data_test.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import platform
 import shutil
 import tempfile
 
@@ -27,7 +28,9 @@ from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import tensor_pb2
 from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.framework import test_util
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import googletest
+from tensorflow.python.platform import test
 
 
 class DeviceNamePathConversionTest(test_util.TensorFlowTestCase):
@@ -339,6 +342,38 @@ class DebugDumpDirTest(test_util.TensorFlowTestCase):
     self.assertIsNone(dump_dir.t0)
     self.assertEqual([], dump_dir.dumped_tensor_data)
 
+  def testDebugDumpDir_usesGfileGlob(self):
+    if platform.system() == "Windows":
+      self.skipTest("gfile.Glob is not used on Windows.")
+
+    self._makeDataDirWithMultipleDevicesAndDuplicateNodeNames()
+
+    def fake_gfile_glob(glob_pattern):
+      del glob_pattern
+      return []
+
+    with test.mock.patch.object(
+        gfile, "Glob", side_effect=fake_gfile_glob, autospec=True) as fake:
+      debug_data.DebugDumpDir(self._dump_root)
+      expected_calls = [
+          test.mock.call(os.path.join(
+              self._dump_root,
+              (debug_data.METADATA_FILE_PREFIX +
+               debug_data.CORE_METADATA_TAG + "*"))),
+          test.mock.call(os.path.join(
+              self._dump_root,
+              (debug_data.METADATA_FILE_PREFIX +
+               debug_data.FETCHES_INFO_FILE_TAG + "*"))),
+          test.mock.call(os.path.join(
+              self._dump_root,
+              (debug_data.METADATA_FILE_PREFIX +
+               debug_data.FEED_KEYS_INFO_FILE_TAG + "*"))),
+          test.mock.call(os.path.join(
+              self._dump_root,
+              (debug_data.METADATA_FILE_PREFIX +
+               debug_data.DEVICE_TAG + "*")))]
+      fake.assert_has_calls(expected_calls, any_order=True)
+
 
 class GetNodeNameAndOutputSlotTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/debug/lib/debug_gradients.py b/tensorflow/python/debug/lib/debug_gradients.py
new file mode 100644
index 0000000000000000000000000000000000000000..8689a68875b4781e45ba22b5ea0bf69621709e39
--- /dev/null
+++ b/tensorflow/python/debug/lib/debug_gradients.py
@@ -0,0 +1,416 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorFlow Debugger: Tools for debugging gradients."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+import uuid
+
+import six
+
+from tensorflow.python.debug.lib import debug_data
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import variables
+
+_GRADIENT_DEBUG_TAG = "gradient_debug_"
+
+_gradient_debuggers = {}
+
+
+def _tensor_to_grad_debug_op_name(tensor, grad_debugger_uuid):
+  op_name, slot = debug_data.parse_node_or_tensor_name(tensor.name)
+  return "%s_%d/%s%s" % (op_name, slot, _GRADIENT_DEBUG_TAG, grad_debugger_uuid)
+
+
+def _parse_grad_debug_op_name(op_name):
+  """Parse the name of a debug gradient op.
+
+  Args:
+    op_name: the name of the debug gradient op.
+
+  Returns:
+    1) The UUID of the GradientsDebugger that created the debug gradient op.
+    2) Name of the original tensor whose gradient is debugged by the debug
+       gradient op.
+  """
+  name_items = op_name.split("/")
+  assert len(name_items) > 1
+  assert name_items[-1].startswith(_GRADIENT_DEBUG_TAG)
+
+  grad_debugger_uuid = name_items[-1][len(_GRADIENT_DEBUG_TAG):]
+  if "_" in grad_debugger_uuid:
+    grad_debugger_uuid = grad_debugger_uuid[:grad_debugger_uuid.index("_")]
+  orig_tensor_slot = int(name_items[-2][name_items[-2].rfind("_") + 1:])
+  orig_base_op_name = name_items[-2][:name_items[-2].rfind("_")]
+  orig_tensor_name = ("/".join(name_items[:-2] + [orig_base_op_name]) +
+                      ":%d" % orig_tensor_slot)
+
+  return grad_debugger_uuid, orig_tensor_name
+
+
+class GradientsDebugger(object):
+  """Gradients Debugger.
+
+  Allows retrieval of gradient tensors created by TensorFlow's automatic
+  differentiation algorithm, i.e., @{tf.gradients} and optimizer classes that
+  use it.
+  """
+  # TODO(cais): Add examples code in the doc string?
+
+  def __init__(self, y_tensor=None):
+    """Constructor of GradientsDebugger.
+
+    Args:
+      y_tensor: optional: the `tf.Tensor` to be differentiated, i.e., the tensor
+        on the numerator of the differentiation.
+    """
+
+    self._uuid = uuid.uuid4().hex
+    _gradient_debuggers[self._uuid] = self
+
+    # A dict mapping x-tensor names to gradient tensor. x-tensor refers to the
+    # independent tf.Tensor, i.e., the tensor on the denominator of the
+    # differentiation.
+    self._gradient_tensors = {}
+    self._y_tensor = y_tensor
+
+    self._graph = None
+    if y_tensor:
+      self._graph = y_tensor.graph
+
+    self._is_active_context = False
+
+  @property
+  def y_tensor(self):
+    return self._y_tensor
+
+  @property
+  def graph(self):
+    return self._graph
+
+  def __enter__(self):
+    self._is_active_context = True
+
+  def __exit__(self, unused_type, unused_value, unused_traceback):
+    self._is_active_context = False
+
+  def identify_gradient(self, input_tensor):
+    """Create a debug identity tensor that registers and forwards gradients.
+
+    The side effect of this method is that when gradient tensor(s) are created
+    with respect to the any paths that include the `input_tensor`, the gradient
+    tensor(s) with repsect to `input_tensor` will be registered with this
+    this `GradientsDebugger` instance and can later be retrieved, with the
+    methods `gradient_tensor` and `gradient_tensors`.
+
+    Example:
+
+    ```python
+    x = tf.Variable(1.0)
+    y = tf.add(x, x)
+
+    grad_debugger = tf_debug.GradientsDebugger()
+    debug_y = grad_debugger.identify_gradient(y)
+    z = tf.square(debug_y)
+
+    # Create a train op under the grad_debugger context.
+    with grad_debugger:
+      train_op = tf.train.GradientDescentOptimizer(z)
+
+    # Now we can reflect through grad_debugger to get the gradient tensor
+    # with respect to y.
+    y_grad = grad_debugger.gradient_tensor(y)
+    ```
+
+    Args:
+      input_tensor: the input `tf.Tensor` object whose related gradient tensors
+        are to be reigstered with this `GradientsDebugger` instance when they
+        are created, e.g., during @{tf.gradients} calls or the construction
+        of optimization (training) op that uses @{tf.gradients}.
+
+    Returns:
+      A forwarded identity of `input_tensor`, as a `tf.Tensor`.
+
+    Raises:
+      ValueError: If an op with name that duplicates the gradient-debugging op
+        already exists in the graph (highly unlikely).
+    """
+    # TODO(cais): Allow overriding gradient.
+    # TODO(cais): Implement value_stack.
+    grad_debug_op_name = _tensor_to_grad_debug_op_name(input_tensor, self._uuid)
+    # pylint: disable=protected-access
+    debug_grad_identity = gen_array_ops._debug_gradient_identity(
+        input_tensor, name=grad_debug_op_name)
+    # pylint: enable=protected-access
+    if debug_grad_identity.op.name != grad_debug_op_name:
+      raise ValueError(
+          "The graph already contains an op named %s" % grad_debug_op_name)
+    return debug_grad_identity
+
+  def watch_gradients_by_tensors(self, graph, tensors):
+    """Watch gradient tensors by x-tensor(s).
+
+    The side effect of this method is that when gradient tensor(s) are created
+    with respect to the any paths that include the `x_tensor`s, the gradient
+    tensor(s) with repsect to the tensor will be registered with this
+    this `GradientsDebugger` instance and can later be retrieved, with the
+    methods `gradient_tensor` and `gradient_tensors`.
+
+    Unlike the method `identify_gradient`, this method is used to retrieve
+    gradient tensors after the construction of the forward subgraph has
+    completed (but before the construction of the backward subgraph).
+
+    This method is the same as `watch_gradients_by_x_tensor_names` except that
+    the tensors are specified by the Python `tf.Tensor` or `tf.Variable`
+    objects, instead by name patterns.
+
+    Example:
+
+    ```python
+    x = tf.Variable(1.0)
+    y = tf.add(x, x, name="y")
+    z = tf.square(debug_y)
+
+    # Create a train op under the grad_debugger context.
+    grad_debugger = tf_debug.GradientsDebugger()
+    with grad_debugger.watch_gradients_by_tensors(y):
+      train_op = tf.train.GradientDescentOptimizer(z)
+
+    # Now we can reflect through grad_debugger to get the gradient tensor
+    # with respect to y.
+    y_grad = grad_debugger.gradient_tensor(y)
+    # or
+    y_grad = grad_debugger.gradient_tensor("y:0")
+    ```
+
+    Args:
+      graph: the `tf.Graph` to watch the gradients on.
+      tensors: a `tf.Tensor` or `tf.Variable` object, or a list of such objects.
+
+    Returns:
+      The GradientsDebugger instance itself.
+    """
+
+    if not isinstance(tensors, list):
+      tensors = [tensors]
+
+    tensor_name_regex = []
+    for tensor in tensors:
+      tensor_name_regex.append(re.escape(tensor.name) + "$")
+    tensor_name_regex = "(" + "|".join(tensor_name_regex) + ")"
+    return self.watch_gradients_by_tensor_names(graph, tensor_name_regex)
+
+  def watch_gradients_by_tensor_names(self, graph, tensor_name_regex):
+    """Watch gradient tensors by name(s) of the x-tensor(s).
+
+    The side effect of this method is that when gradient tensor(s) are created
+    with respect to the x-tensors, the gradient tensor(s) will be registered
+    with this `GradientsDebugger` instance and can later be retrieved.
+
+    Unlike the `identify_gradient` method, this method is used after the
+    construction of the forward graph has completed. Unlike the
+    `watch_gradients_by_tensor` method, this method does not use handles to the
+    tensors of interest; it uses their names.
+
+    This method is the same as `watch_gradients_by_tensors` except that the
+    x-tensors are specified by name patterns, instead of `tf.Tensor` or
+    `tf.Variable` objects.
+
+    Example:
+
+    ```python
+    x = tf.Variable(1.0, name="x")
+    y = tf.add(x, x, name="y")
+    z = tf.square(debug_y)
+
+    # Create a train op under the grad_debugger context.
+    grad_debugger = tf_debug.GradientsDebugger()
+    with grad_debugger.watch_gradients_by_tensor_names(r"(x|y):0$"):
+      train_op = tf.train.GradientDescentOptimizer(z)
+
+    # Now we can reflect through grad_debugger to get the gradient tensor
+    # with respect to x and y.
+    x_grad = grad_debugger.gradient_tensor("x:0")
+    y_grad = grad_debugger.gradient_tensor("y:0")
+    ```
+
+    Args:
+      graph: the `tf.Graph` to watch the gradients on.
+      tensor_name_regex: the regular-expression pattern of the name(s) of the
+        x-tensor(s) to watch. x-tensor refers to the tensors on the denominator
+        of the differentiation.
+
+    Returns:
+      The GradientsDebugger instance itself.
+    """
+    tensor_name_pattern = re.compile(tensor_name_regex)
+
+    # pylint: disable=protected-access
+    with graph.as_default():
+      for op in graph.get_operations():
+        for output in op.outputs:
+          if tensor_name_pattern.match(output.name):
+            debug_op = self.identify_gradient(output)
+
+            for consumer in output.consumers():
+              if consumer == debug_op.op:
+                continue
+
+              # Locate the slot index of the original input.
+              input_slots = []
+              for i, consumer_input in enumerate(consumer._inputs):
+                if consumer_input == output:
+                  input_slots.append(i)
+
+              for slot in input_slots:
+                consumer._inputs[slot] = debug_op
+                debug_op._consumers.append(consumer)
+
+            del output._consumers[:]
+            output._consumers.append(debug_op.op)
+    # pylint: enable=protected-access
+
+    return self
+
+  def _check_same_graph(self, tensor):
+    if self._graph is None:
+      self._graph = tensor.graph
+    elif self._graph != tensor.graph:
+      raise ValueError(
+          "The graph of the value (%s) is not the same as the graph %s" %
+          (tensor.graph, self._graph))
+
+  def register_gradient_tensor(self,
+                               x_tensor_name,
+                               gradient_tensor):
+    """Register the gradient tensor for an x-tensor.
+
+    Args:
+      x_tensor_name: (`str`) the name of the independent `tf.Tensor`, i.e.,
+        the tensor on the denominator of the differentiation.
+      gradient_tensor: the gradient `tf.Tensor`.
+    """
+    if len(_gradient_debuggers) == 1 or self._is_active_context:
+      self._check_same_graph(gradient_tensor)
+      self._gradient_tensors[x_tensor_name] = gradient_tensor
+
+  def gradient_tensor(self, x_tensor):
+    """Get the gradient tensor of an x-tensor.
+
+    Args:
+      x_tensor: (`tf.Tensor`, `tf.Variable` or `str`) The x-tensor object or its
+        name. x-tensor refers to the independent `tf.Tensor`, i.e., the tensor
+        on the denominator of the differentiation.
+
+    Returns:
+      If found, the gradient tensor.
+
+    Raises:
+      TypeError: If `x_tensor` is not a `tf.Tensor`, `tf.Variable` or `str`.
+      LookupError: If the `x_tensor` has not been registered with a gradient
+        tensor.
+    """
+    x_tensor_name = self._get_tensor_name(x_tensor)
+    if x_tensor_name not in self._gradient_tensors:
+      raise LookupError(
+          "This GradientsDebugger has not received any gradient tensor for "
+          "x-tensor %s" % x_tensor_name)
+    return self._gradient_tensors[x_tensor_name]
+
+  def gradient_tensors(self):
+    """Get the gradient tensors that this object is aware of.
+
+    Returns:
+      A dict mapping x-tensor names to gradient tensor objects. x-tensor refers
+      to the tensors on the denominator of the differentation.
+    """
+    return self._gradient_tensors
+
+  def _get_tensor_name(self, tensor):
+    if isinstance(tensor, (ops.Tensor, variables.Variable)):
+      return tensor.name
+    elif  isinstance(tensor, six.string_types):
+      return tensor
+    else:
+      raise TypeError(
+          "x_tensor must be a str or tf.Tensor or tf.Variable, "
+          "but instead has type %s" % type(tensor))
+
+
+def clear_gradient_debuggers():
+  """Clear all globally registered gradient debuggers."""
+  _gradient_debuggers.clear()
+
+
+@ops.RegisterGradient("DebugGradientIdentity")
+def _identify_gradient_grad(op, dy):
+  """Gradient function for the DebugIdentity op."""
+  # TODO(cais): Allow overriding gradient.
+  grad_debugger_uuid, orig_tensor_name = _parse_grad_debug_op_name(op.name)
+  grad_debugger = _gradient_debuggers[grad_debugger_uuid]
+  grad_debugger.register_gradient_tensor(orig_tensor_name, dy)
+  return dy
+
+
+def gradient_values_from_dump(grad_debugger, x_tensor, dump):
+  """Find gradient values from a `DebugDumpDir` object.
+
+  Args:
+    grad_debugger: the `tf_debug.GradientsDebugger` instance to be used.
+    x_tensor: (`tf.Tensor`, `tf.Variable` or `str`) The x-tensor object or its
+      name. x-tensor refers to the independent `tf.Tensor`, i.e., the tensor
+      on the denominator of the differentiation.
+    dump: A `tfdbg.DebugDumpDir` object.
+
+  Returns:
+    If this `GradientsDebugger` instance has the gradient tensor of `x_tensor`
+      registered: a list of `numpy.ndarray` representing the value of the
+      gradient tensor from `dump`. The list could be empty, if the gradient
+      tensor is not executed in the `tf.Session.run()` call that generated
+      the `dump`. The list could also contain multiple values of the gradient
+      tensor, e.g., if gradient tensor is computed repeatedly in a
+      `tf.while_loop` during the run that generated the `dump`.
+
+  Raises:
+    LookupError: If this `GradientsDebugger` instance does not have the
+      gradient tensor of `x_tensor` registered.
+    ValueError: If this `GradientsDebugger` has a `tf.Graph` object that
+      does not match the `tf.Graph` object of the `dump`.
+    TypeError: If `x_tensor` is not a `tf.Tensor`, `tf.Variable` or `str`.
+  """
+  # TODO(cais): Use this method in LocalCLIDebugWrapperSession to present the
+  # gradient tensors to the TFDBG CLI.
+
+  # If possible, verify that the Python graph of the dump and that of this
+  # GradientsDebugger match.
+  if (dump.python_graph and grad_debugger.graph and
+      dump.python_graph != grad_debugger.graph):
+    raise ValueError(
+        "This GradientsDebugger instance has a graph (%s) that differs from "
+        "the graph of the DebugDumpDir object (%s)." %
+        (grad_debugger.graph, dump.python_graph))
+
+  gradient_tensor = grad_debugger.gradient_tensor(x_tensor)
+  node_name, output_slot = debug_data.parse_node_or_tensor_name(
+      gradient_tensor.name)
+
+  try:
+    return dump.get_tensors(node_name, output_slot, "DebugIdentity")
+  except debug_data.WatchKeyDoesNotExistInDebugDumpDirError:
+    return []
diff --git a/tensorflow/python/debug/lib/debug_gradients_test.py b/tensorflow/python/debug/lib/debug_gradients_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..966578320e22caba28344248cbc0562fdc3dfee2
--- /dev/null
+++ b/tensorflow/python/debug/lib/debug_gradients_test.py
@@ -0,0 +1,378 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for debug_gradients module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import shutil
+import tempfile
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.debug.lib import debug_data
+from tensorflow.python.debug.lib import debug_gradients
+from tensorflow.python.debug.lib import debug_utils
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+from tensorflow.python.training import gradient_descent
+
+
+class IdentifyGradientTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self.sess = session.Session()
+    with self.sess:
+      self.u = variables.Variable(2.0, name="u")
+      self.v = variables.Variable(3.0, name="v")
+      self.w = math_ops.multiply(self.u.value(), self.v.value(), name="w")
+
+  def tearDown(self):
+    ops.reset_default_graph()
+    debug_gradients.clear_gradient_debuggers()
+
+  def testIdentifyGradientGivesCorrectTensorObjectWithoutContextManager(self):
+    grad_debugger = debug_gradients.GradientsDebugger()
+    id_grad_w = grad_debugger.identify_gradient(self.w)
+    y = math_ops.add(id_grad_w, -1.0, name="y")
+
+    grads = gradients_impl.gradients(y, [self.u, self.v])
+    self.assertEqual(2, len(grads))
+    u_grad = grads[0]
+    v_grad = grads[1]
+
+    self.sess.run(variables.global_variables_initializer())
+    self.assertAllClose(5.0, self.sess.run(y))
+    self.assertAllClose(3.0, self.sess.run(u_grad))
+    self.assertAllClose(2.0, self.sess.run(v_grad))
+
+    # Fetch the gradient tensor with the x-tensor object.
+    w_grad = grad_debugger.gradient_tensor(self.w)
+    self.assertIsInstance(w_grad, ops.Tensor)
+    self.assertAllClose(1.0, self.sess.run(w_grad))
+
+    # Fetch the gradient tensor with the x-tensor's name.
+    w_grad = grad_debugger.gradient_tensor(self.w.name)
+    self.assertIsInstance(w_grad, ops.Tensor)
+    self.assertAllClose(1.0, self.sess.run(w_grad))
+
+    # Fetch the gradient tensor with the x-tensor name.
+    w_grad = grad_debugger.gradient_tensor(self.w.name)
+    self.assertIsInstance(w_grad, ops.Tensor)
+    self.assertAllClose(1.0, self.sess.run(w_grad))
+
+  def testIdentifyGradientGivesCorrectTensorObjectWithTfGradients(self):
+    grad_debugger = debug_gradients.GradientsDebugger()
+    id_grad_w = grad_debugger.identify_gradient(self.w)
+    y = math_ops.add(id_grad_w, -1.0, name="y")
+
+    with grad_debugger:
+      grads = gradients_impl.gradients(y, [self.u, self.v])
+    self.assertEqual(2, len(grads))
+    u_grad = grads[0]
+    v_grad = grads[1]
+
+    self.sess.run(variables.global_variables_initializer())
+    self.assertAllClose(5.0, self.sess.run(y))
+    self.assertAllClose(3.0, self.sess.run(u_grad))
+    self.assertAllClose(2.0, self.sess.run(v_grad))
+
+    # Fetch the gradient tensor with the x-tensor object.
+    w_grad = grad_debugger.gradient_tensor(self.w)
+    self.assertIsInstance(w_grad, ops.Tensor)
+    self.assertAllClose(1.0, self.sess.run(w_grad))
+
+    # Fetch the gradient tensor with the x-tensor's name.
+    w_grad = grad_debugger.gradient_tensor(self.w.name)
+    self.assertIsInstance(w_grad, ops.Tensor)
+    self.assertAllClose(1.0, self.sess.run(w_grad))
+
+    # Fetch the gradient tensor with the x-tensor name.
+    w_grad = grad_debugger.gradient_tensor(self.w.name)
+    self.assertIsInstance(w_grad, ops.Tensor)
+    self.assertAllClose(1.0, self.sess.run(w_grad))
+
+  def testCallingIdentifyGradientTwiceWithTheSameGradientsDebuggerErrors(self):
+    grad_debugger = debug_gradients.GradientsDebugger()
+    grad_debugger.identify_gradient(self.w)
+    with self.assertRaisesRegexp(
+        ValueError, "The graph already contains an op named .*"):
+      grad_debugger.identify_gradient(self.w)
+
+  def testIdentifyGradientWorksOnMultipleLosses(self):
+    grad_debugger_1 = debug_gradients.GradientsDebugger()
+    grad_debugger_2 = debug_gradients.GradientsDebugger()
+
+    y = math_ops.add(self.w, -1.0, name="y")
+    debug_y = grad_debugger_1.identify_gradient(y)
+    z1 = math_ops.square(debug_y, name="z1")
+
+    debug_y = grad_debugger_2.identify_gradient(y)
+    z2 = math_ops.sqrt(debug_y, name="z2")
+
+    with grad_debugger_1:
+      gradient_descent.GradientDescentOptimizer(0.1).minimize(z1)
+    with grad_debugger_2:
+      gradient_descent.GradientDescentOptimizer(0.1).minimize(z2)
+
+    dz1_dy = grad_debugger_1.gradient_tensor(y)
+    dz2_dy = grad_debugger_2.gradient_tensor(y)
+    self.assertIsInstance(dz1_dy, ops.Tensor)
+    self.assertIsInstance(dz2_dy, ops.Tensor)
+    self.assertIsNot(dz1_dy, dz2_dy)
+
+    self.sess.run(variables.global_variables_initializer())
+    self.assertAllClose(5.0 ** 2, self.sess.run(z1))
+    self.assertAllClose(5.0 ** 0.5, self.sess.run(z2))
+    self.assertAllClose(2.0 * 5.0, self.sess.run(dz1_dy))
+    self.assertAllClose(0.5 * (5.0 ** -0.5), self.sess.run(dz2_dy))
+
+  def testIdentifyGradientRaisesLookupErrorForUnknownXTensor(self):
+    grad_debugger_1 = debug_gradients.GradientsDebugger()
+    grad_debugger_2 = debug_gradients.GradientsDebugger()
+    id_grad_w = grad_debugger_1.identify_gradient(self.w)
+    y = math_ops.add(id_grad_w, -1.0, name="y")
+
+    # There are >1 gradient debuggers registered, and grad_debugger is not used
+    # as a context manager here, so the gradient w.r.t. self.w will not be
+    # registered.
+    gradients_impl.gradients(y, [self.u, self.v])
+
+    with self.assertRaisesRegexp(
+        LookupError,
+        r"This GradientsDebugger has not received any gradient tensor for "):
+      grad_debugger_1.gradient_tensor(self.w)
+    with self.assertRaisesRegexp(
+        LookupError,
+        r"This GradientsDebugger has not received any gradient tensor for "):
+      grad_debugger_2.gradient_tensor(self.w)
+
+  def testIdentifyGradientRaisesTypeErrorForNonTensorOrTensorNameInput(self):
+    grad_debugger = debug_gradients.GradientsDebugger()
+    with self.assertRaisesRegexp(
+        TypeError,
+        r"x_tensor must be a str or tf\.Tensor or tf\.Variable, but instead "
+        r"has type .*Operation.*"):
+      grad_debugger.gradient_tensor(variables.global_variables_initializer())
+
+  def testIdentifyGradientTensorWorksWithGradientDescentOptimizer(self):
+    grad_debugger = debug_gradients.GradientsDebugger()
+    id_grad_w = grad_debugger.identify_gradient(self.w)
+    y = math_ops.add(id_grad_w, -1.0, name="y")
+
+    with grad_debugger:
+      gradient_descent.GradientDescentOptimizer(0.1).minimize(y)
+
+    self.sess.run(variables.global_variables_initializer())
+
+    # Fetch the gradient tensor with the x-tensor object.
+    w_grad = grad_debugger.gradient_tensor(self.w)
+    self.assertIsInstance(w_grad, ops.Tensor)
+    self.assertAllClose(1.0, self.sess.run(w_grad))
+
+  def testWatchGradientsByXTensorNamesWorks(self):
+    y = math_ops.add(self.w, -1.0, name="y")
+
+    # The constructrion of the forward graph has completed.
+    # But we can still get the gradient tensors by using
+    # watch_gradients_by_tensor_names().
+    grad_debugger = debug_gradients.GradientsDebugger()
+    with grad_debugger.watch_gradients_by_tensor_names(self.sess.graph, "w:0$"):
+      grads = gradients_impl.gradients(y, [self.u, self.v])
+    self.assertEqual(2, len(grads))
+    u_grad = grads[0]
+    v_grad = grads[1]
+
+    self.sess.run(variables.global_variables_initializer())
+    self.assertAllClose(5.0, self.sess.run(y))
+    self.assertAllClose(3.0, self.sess.run(u_grad))
+    self.assertAllClose(2.0, self.sess.run(v_grad))
+
+    w_grad = grad_debugger.gradient_tensor(self.w)
+    self.assertIsInstance(w_grad, ops.Tensor)
+    self.assertAllClose(1.0, self.sess.run(w_grad))
+
+    w_grad = grad_debugger.gradient_tensor("w:0")
+    self.assertIsInstance(w_grad, ops.Tensor)
+    self.assertAllClose(1.0, self.sess.run(w_grad))
+
+  def testWatchGradientsByXTensorNamesWorksWithoutContextManager(self):
+    y = math_ops.add(self.w, -1.0, name="y")
+
+    # The constructrion of the forward graph has completed.
+    # But we can still get the gradient tensors by using
+    # watch_gradients_by_tensor_names().
+    grad_debugger = debug_gradients.GradientsDebugger()
+    grad_debugger.watch_gradients_by_tensor_names(self.sess.graph, "w:0$")
+    grads = gradients_impl.gradients(y, [self.u, self.v])
+    self.assertEqual(2, len(grads))
+    u_grad = grads[0]
+    v_grad = grads[1]
+
+    self.sess.run(variables.global_variables_initializer())
+    self.assertAllClose(5.0, self.sess.run(y))
+    self.assertAllClose(3.0, self.sess.run(u_grad))
+    self.assertAllClose(2.0, self.sess.run(v_grad))
+
+    w_grad = grad_debugger.gradient_tensor(self.w)
+    self.assertIsInstance(w_grad, ops.Tensor)
+    self.assertAllClose(1.0, self.sess.run(w_grad))
+
+    w_grad = grad_debugger.gradient_tensor("w:0")
+    self.assertIsInstance(w_grad, ops.Tensor)
+    self.assertAllClose(1.0, self.sess.run(w_grad))
+
+  def testWatchGradientsWorksOnRefTensor(self):
+    y = math_ops.add(self.w, -1.0, name="y")
+
+    grad_debugger = debug_gradients.GradientsDebugger()
+    with grad_debugger.watch_gradients_by_tensor_names(self.sess.graph, "u:0$"):
+      grads = gradients_impl.gradients(y, [self.u, self.v])
+    self.assertEqual(2, len(grads))
+    u_grad = grads[0]
+    v_grad = grads[1]
+
+    self.assertIs(u_grad, grad_debugger.gradient_tensor("u:0"))
+
+    self.sess.run(variables.global_variables_initializer())
+    self.assertAllClose(3.0, self.sess.run(u_grad))
+    self.assertAllClose(2.0, self.sess.run(v_grad))
+    self.assertAllClose(
+        3.0, self.sess.run(grad_debugger.gradient_tensor("u:0")))
+
+  def testWatchGradientsWorksOnMultipleTensors(self):
+    y = math_ops.add(self.w, -1.0, name="y")
+
+    grad_debugger = debug_gradients.GradientsDebugger()
+    with grad_debugger.watch_gradients_by_tensor_names(self.sess.graph,
+                                                       "(u|w):0$"):
+      grads = gradients_impl.gradients(y, [self.u, self.v])
+    self.assertEqual(2, len(grads))
+    u_grad = grads[0]
+
+    self.assertEqual(2, len(grad_debugger.gradient_tensors()))
+    self.assertIs(u_grad, grad_debugger.gradient_tensor("u:0"))
+    self.assertIsInstance(grad_debugger.gradient_tensor("w:0"), ops.Tensor)
+
+    self.sess.run(variables.global_variables_initializer())
+    self.assertAllClose(
+        1.0, self.sess.run(grad_debugger.gradient_tensor("w:0")))
+    self.assertAllClose(
+        3.0, self.sess.run(grad_debugger.gradient_tensor("u:0")))
+
+  def testWatchGradientsByXTensorsWorks(self):
+    y = math_ops.add(self.w, -1.0, name="foo/y")
+    z = math_ops.square(y, name="foo/z")
+
+    # The constructrion of the forward graph has completed.
+    # But we can still get the gradient tensors by using
+    # watch_gradients_by_x_tensors().
+    grad_debugger = debug_gradients.GradientsDebugger()
+    with grad_debugger.watch_gradients_by_tensors(
+        self.sess.graph, [self.w, self.u, y]):
+      gradient_descent.GradientDescentOptimizer(0.1).minimize(z)
+
+    self.assertEqual(3, len(grad_debugger.gradient_tensors()))
+    u_grad = grad_debugger.gradient_tensor(self.u)
+    w_grad = grad_debugger.gradient_tensor(self.w)
+    y_grad = grad_debugger.gradient_tensor(y)
+
+    self.sess.run(variables.global_variables_initializer())
+    self.assertAllClose(10.0, self.sess.run(y_grad))
+    self.assertAllClose(10.0, self.sess.run(w_grad))
+    self.assertAllClose(30.0, self.sess.run(u_grad))
+
+  def testWatchGradientsByTensorCanWorkOnMultipleLosses(self):
+    y = math_ops.add(self.w, -1.0, name="y")
+    z1 = math_ops.square(y, name="z1")
+    z2 = math_ops.sqrt(y, name="z2")
+
+    grad_debugger_1 = debug_gradients.GradientsDebugger()
+    with grad_debugger_1.watch_gradients_by_tensors(self.sess.graph, y):
+      gradient_descent.GradientDescentOptimizer(0.1).minimize(z1)
+
+    grad_debugger_2 = debug_gradients.GradientsDebugger()
+    with grad_debugger_2.watch_gradients_by_tensors(self.sess.graph, y):
+      gradient_descent.GradientDescentOptimizer(0.1).minimize(z2)
+
+    dz1_dy = grad_debugger_1.gradient_tensor(y)
+    dz2_dy = grad_debugger_2.gradient_tensor(y)
+    self.assertIsInstance(dz1_dy, ops.Tensor)
+    self.assertIsInstance(dz2_dy, ops.Tensor)
+    self.assertIsNot(dz1_dy, dz2_dy)
+
+    self.sess.run(variables.global_variables_initializer())
+    self.assertAllClose(5.0 ** 2, self.sess.run(z1))
+    self.assertAllClose(5.0 ** 0.5, self.sess.run(z2))
+    self.assertAllClose(2.0 * 5.0, self.sess.run(dz1_dy))
+    self.assertAllClose(0.5 * (5.0 ** -0.5), self.sess.run(dz2_dy))
+
+  def testGradientsValuesFromDumpWorks(self):
+    y = math_ops.add(self.w, -1.0, name="y")
+    z = math_ops.square(y, name="z")
+
+    grad_debugger = debug_gradients.GradientsDebugger()
+    with grad_debugger.watch_gradients_by_tensors(
+        self.sess.graph, [self.w, self.u, y]):
+      train_op = gradient_descent.GradientDescentOptimizer(0.1).minimize(z)
+
+    self.sess.run(variables.global_variables_initializer())
+
+    run_options = config_pb2.RunOptions(output_partition_graphs=True)
+    dump_dir = tempfile.mkdtemp()
+    debug_url = "file://" + dump_dir
+    debug_utils.watch_graph(
+        run_options,
+        self.sess.graph,
+        debug_urls=debug_url)
+    run_metadata = config_pb2.RunMetadata()
+    self.sess.run(train_op, options=run_options, run_metadata=run_metadata)
+
+    dump = debug_data.DebugDumpDir(
+        dump_dir, partition_graphs=run_metadata.partition_graphs)
+    dump.set_python_graph(self.sess.graph)
+
+    y_grad_values = debug_gradients.gradient_values_from_dump(
+        grad_debugger, y, dump)
+    self.assertEqual(1, len(y_grad_values))
+    self.assertAllClose(10.0, y_grad_values[0])
+
+    w_grad_values = debug_gradients.gradient_values_from_dump(
+        grad_debugger, self.w, dump)
+    self.assertEqual(1, len(w_grad_values))
+    self.assertAllClose(10.0, w_grad_values[0])
+
+    u_grad_values = debug_gradients.gradient_values_from_dump(
+        grad_debugger, self.u, dump)
+    self.assertEqual(1, len(u_grad_values))
+    self.assertAllClose(30.0, u_grad_values[0])
+
+    with self.assertRaisesRegexp(
+        LookupError,
+        r"This GradientsDebugger has not received any gradient tensor for "
+        r"x-tensor v:0"):
+      debug_gradients.gradient_values_from_dump(grad_debugger, self.v, dump)
+
+    # Cleanup.
+    shutil.rmtree(dump_dir)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/debug/lib/debug_service_pb2_grpc.py b/tensorflow/python/debug/lib/debug_service_pb2_grpc.py
new file mode 100755
index 0000000000000000000000000000000000000000..98adc3284b94afc8190f7ee4240d7c5fbf37b4b5
--- /dev/null
+++ b/tensorflow/python/debug/lib/debug_service_pb2_grpc.py
@@ -0,0 +1,76 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+#
+# Do not use pylint on generated code.
+# pylint: disable=missing-docstring,g-short-docstring-punctuation,g-no-space-after-docstring-summary,invalid-name,line-too-long,unused-argument,g-doc-args
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import grpc
+
+from tensorflow.core.debug import debug_service_pb2 as tensorflow_dot_core_dot_debug_dot_debug__service__pb2
+from tensorflow.core.util import event_pb2 as tensorflow_dot_core_dot_util_dot_event__pb2
+
+
+class EventListenerStub(object):
+  """EventListener: Receives Event protos, e.g., from debugged TensorFlow
+  runtime(s).
+  """
+
+  def __init__(self, channel):
+    """Constructor.
+
+    Args:
+      channel: A grpc.Channel.
+    """
+    self.SendEvents = channel.stream_stream(
+        '/tensorflow.EventListener/SendEvents',
+        request_serializer=tensorflow_dot_core_dot_util_dot_event__pb2.Event.SerializeToString,
+        response_deserializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.FromString,
+        )
+
+
+class EventListenerServicer(object):
+  """EventListener: Receives Event protos, e.g., from debugged TensorFlow
+  runtime(s).
+  """
+
+  def SendEvents(self, request_iterator, context):
+    """Client(s) can use this RPC method to send the EventListener Event protos.
+    The Event protos can hold information such as:
+    1) intermediate tensors from a debugged graph being executed, which can
+    be sent from DebugIdentity ops configured with grpc URLs.
+    2) GraphDefs of partition graphs, which can be sent from special debug
+    ops that get executed immediately after the beginning of the graph
+    execution.
+    """
+    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+    context.set_details('Method not implemented!')
+    raise NotImplementedError('Method not implemented!')
+
+
+def add_EventListenerServicer_to_server(servicer, server):
+  rpc_method_handlers = {
+      'SendEvents': grpc.stream_stream_rpc_method_handler(
+          servicer.SendEvents,
+          request_deserializer=tensorflow_dot_core_dot_util_dot_event__pb2.Event.FromString,
+          response_serializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.SerializeToString,
+      ),
+  }
+  generic_handler = grpc.method_handlers_generic_handler(
+      'tensorflow.EventListener', rpc_method_handlers)
+  server.add_generic_rpc_handlers((generic_handler,))
diff --git a/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py b/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e1fbf39d414224d97675581092c1ad8dfdc1f1e
--- /dev/null
+++ b/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
@@ -0,0 +1,230 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for debugger functionalities in tf.Session with grpc:// URLs.
+
+This test focus on grpc:// debugging of distributed (gRPC) sessions.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import subprocess
+import sys
+import time
+
+import portpicker
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.debug.lib import debug_utils
+from tensorflow.python.debug.lib import grpc_debug_test_server
+from tensorflow.python.debug.wrappers import framework
+from tensorflow.python.debug.wrappers import grpc_wrapper
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+class DistributedSessionDebugTest(test_util.TensorFlowTestCase):
+  """Test the debugging of distributed sessions."""
+
+  PER_PROC_GPU_MEMORY_FRACTION = 0.1
+  POLLING_INTERVAL_SEC = 0.025
+
+  @classmethod
+  def setUpClass(cls):
+    gpu_memory_fraction_opt = (
+        "--gpu_memory_fraction=%f" % cls.PER_PROC_GPU_MEMORY_FRACTION)
+
+    worker_port = portpicker.pick_unused_port()
+    cluster_spec = "worker|localhost:%d" % worker_port
+    tf_logging.info("cluster_spec: %s", cluster_spec)
+
+    server_bin = test.test_src_dir_path(
+        "tools/dist_test/server/grpc_tensorflow_server")
+
+    cls.server_target = "grpc://localhost:%d" % worker_port
+
+    cls.server_procs = {}
+    cls.server_procs["worker"] = subprocess.Popen(
+        [
+            server_bin,
+            "--cluster_spec=%s" % cluster_spec,
+            "--job_name=worker",
+            "--task_id=0",
+            gpu_memory_fraction_opt,
+        ],
+        stdout=sys.stdout,
+        stderr=sys.stderr)
+
+    # Start debug server in-process, on separate thread.
+    (cls.debug_server_port, cls.debug_server_url, _, cls.debug_server_thread,
+     cls.debug_server
+    ) = grpc_debug_test_server.start_server_on_separate_thread(
+        dump_to_filesystem=False)
+    tf_logging.info("debug server url: %s", cls.debug_server_url)
+
+    cls.session_config = config_pb2.ConfigProto(
+        gpu_options=config_pb2.GPUOptions(
+            per_process_gpu_memory_fraction=cls.PER_PROC_GPU_MEMORY_FRACTION))
+
+  @classmethod
+  def tearDownClass(cls):
+    for key in cls.server_procs:
+      cls.server_procs[key].terminate()
+    cls.debug_server.stop_server().wait()
+    cls.debug_server_thread.join()
+
+  def setUp(self):
+    pass
+
+  def tearDown(self):
+    self.debug_server.clear_data()
+
+  def _pollingAssertDebugTensorValuesAllClose(self, expected_values,
+                                              debug_tensor_name):
+    """Poll debug_server till tensor appears and matches expected values."""
+    while (debug_tensor_name not in self.debug_server.debug_tensor_values or
+           len(self.debug_server.debug_tensor_values) < len(expected_values)):
+      time.sleep(self.POLLING_INTERVAL_SEC)
+    self.assertAllClose(
+        expected_values,
+        self.debug_server.debug_tensor_values[debug_tensor_name])
+
+  def _createGraph(self):
+    """Create graph for testing.
+
+    Returns:
+      Python Graph object.
+    """
+    with ops.Graph().as_default() as graph:
+      with ops.device("/job:worker/task:0/cpu:0"):
+        self.a = variables.Variable(10.0, name="a")
+        self.b = variables.Variable(100.0, name="b")
+        self.inc_a = state_ops.assign_add(self.a, 2.0, name="inc_a")
+        self.dec_b = state_ops.assign_add(self.b, -5.0, name="dec_b")
+        self.p = math_ops.multiply(self.inc_a, self.dec_b, name="p")
+        self.q = math_ops.negative(self.p, name="q")
+    return graph
+
+  def testDistributedRunWithGatedGrpcCommunicatesWithDebugServerCorrectly(self):
+    graph = self._createGraph()
+    with session.Session(
+        config=self.session_config, graph=graph,
+        target=self.server_target) as sess:
+      sess.run(self.a.initializer)
+      sess.run(self.b.initializer)
+
+      run_options = config_pb2.RunOptions()
+      debug_utils.watch_graph(
+          run_options,
+          sess.graph,
+          node_name_regex_whitelist=r"a",
+          debug_ops=["DebugIdentity"],
+          debug_urls=[self.debug_server_url])
+
+      # Test gated_grpc for an op located on the worker, i.e., on the same
+      # host as where MasterSession is.
+      # TODO(cais): gRPC gating of debug ops does not work on partition graphs
+      # not located on MasterSession hosts (e.g., parameter servers) yet. Make
+      # it work.
+      debug_utils.watch_graph(
+          run_options,
+          sess.graph,
+          node_name_regex_whitelist=r"p",
+          debug_ops=["DebugIdentity(gated_grpc=True)"],
+          debug_urls=[self.debug_server_url])
+
+      for i in xrange(4):
+        # N.B.: These requests will be fulfilled not in this debugged
+        # Session.run() invocation, but in the next one.
+        if i % 2 == 0:
+          self.debug_server.request_watch("p", 0, "DebugIdentity")
+        else:
+          self.debug_server.request_unwatch("p", 0, "DebugIdentity")
+
+        expected_p = (10.0 + 2.0 * (i + 1)) * (100.0 - 5.0 * (i + 1))
+        self.assertAllClose(-expected_p, sess.run(self.q, options=run_options))
+
+        self.assertEqual(1, len(self.debug_server.core_metadata_json_strings))
+        core_metadata = json.loads(
+            self.debug_server.core_metadata_json_strings[0])
+        self.assertEqual([], core_metadata["input_names"])
+        self.assertEqual(["q:0"], core_metadata["output_names"])
+        self.assertEqual(i, core_metadata["executor_step_index"])
+
+        if i == 0:
+          self.assertEqual(1, len(self.debug_server.partition_graph_defs))
+
+        # Tensor "a" is from a PS. It may take longer to arrive due to the fact
+        # that the stream connection between the PS and the debug server is
+        # persistent and not torn down at the end of each Session.run()
+        self._pollingAssertDebugTensorValuesAllClose([10.0 + 2.0 * i],
+                                                     "a:0:DebugIdentity")
+
+        # Due to the gRPC gating of the debug op for "p", the debug tensor
+        # should be available on odd-indexed runs.
+        if i % 2 == 0:
+          self.assertNotIn("p:0:DebugIdentity",
+                           self.debug_server.debug_tensor_values)
+        else:
+          self.assertAllClose(
+              [expected_p],
+              self.debug_server.debug_tensor_values["p:0:DebugIdentity"])
+
+        self.assertNotIn("b:0:DebugIdentity",
+                         self.debug_server.debug_tensor_values)
+        self.debug_server.clear_data()
+
+  def testDistributedRunWithGrpcDebugWrapperWorks(self):
+    graph = self._createGraph()
+    with session.Session(
+        config=self.session_config, graph=graph,
+        target=self.server_target) as sess:
+      sess.run(self.a.initializer)
+      sess.run(self.b.initializer)
+
+      def watch_fn(feeds, fetch_keys):
+        del feeds, fetch_keys
+        return framework.WatchOptions(
+            debug_ops=["DebugIdentity"],
+            node_name_regex_whitelist=r"p")
+      sess = grpc_wrapper.GrpcDebugWrapperSession(
+          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
+
+      for i in xrange(4):
+        expected_p = (10.0 + 2.0 * (i + 1)) * (100.0 - 5.0 * (i + 1))
+        self.assertAllClose(-expected_p, sess.run(self.q))
+
+        if i == 0:
+          self.assertEqual(1, len(self.debug_server.partition_graph_defs))
+
+        self.assertAllClose(
+            [expected_p],
+            self.debug_server.debug_tensor_values["p:0:DebugIdentity"])
+        self.assertNotIn("b:0:DebugIdentity",
+                         self.debug_server.debug_tensor_values)
+        self.debug_server.clear_data()
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/debug/lib/grpc_debug_server.py b/tensorflow/python/debug/lib/grpc_debug_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..181b437695900e09ade0b0cac6777d2c2c42cf80
--- /dev/null
+++ b/tensorflow/python/debug/lib/grpc_debug_server.py
@@ -0,0 +1,395 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""gRPC debug server in Python."""
+# pylint: disable=g-bad-import-order
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import json
+import threading
+import time
+
+from concurrent import futures
+import grpc
+from six.moves import queue
+
+from tensorflow.core.debug import debug_service_pb2
+from tensorflow.core.framework import graph_pb2
+from tensorflow.python.debug.lib import debug_data
+from tensorflow.python.debug.lib import debug_service_pb2_grpc
+
+DebugWatch = collections.namedtuple("DebugWatch",
+                                    ["node_name", "output_slot", "debug_op"])
+
+
+def _watch_key_event_reply(to_enable, node_name, output_slot, debug_op):
+  """Make EventReply proto to represent a request to watch/unwatch a debug op.
+
+  Args:
+    to_enable: (`bool`) whether the request is to enable the watch key.
+    node_name: (`str`) name of the node.
+    output_slot: (`int`) output slot of the tensor.
+    debug_op: (`str`) the debug op attached to node_name:output_slot tensor to
+      watch or unwatch.
+
+  Returns:
+    An EventReply proto.
+  """
+  event_reply = debug_service_pb2.EventReply()
+  state_change = event_reply.debug_op_state_changes.add()
+  state_change.change = (
+      debug_service_pb2.EventReply.DebugOpStateChange.ENABLE
+      if to_enable else debug_service_pb2.EventReply.DebugOpStateChange.DISABLE)
+  state_change.node_name = node_name
+  state_change.output_slot = output_slot
+  state_change.debug_op = debug_op
+  return event_reply
+
+
+class EventListenerBaseStreamHandler(object):
+  """Per-stream handler of EventListener gRPC streams."""
+
+  def __init__(self):
+    """Constructor of EventListenerStreamHandler."""
+    raise NotImplementedError(
+        "__init__() is not implemented in the base stream handler class")
+
+  def on_core_metadata_event(self, event):
+    """Callback for core metadata.
+
+    Args:
+      event: The Event proto that carries a JSON string in its
+        `log_message.message` field.
+    """
+    raise NotImplementedError(
+        "on_core_metadata_event() is not implemented in the base servicer "
+        "class")
+
+  def on_graph_def(self, graph_def, device_name, wall_time):
+    """Callback for Event proto received through the gRPC stream.
+
+    This Event proto carries a GraphDef, encoded as bytes, in its graph_def
+    field.
+
+    Args:
+      graph_def: A GraphDef object.
+      device_name: Name of the device on which the graph was created.
+      wall_time: An epoch timestamp (in microseconds) for the graph.
+    """
+    raise NotImplementedError(
+        "on_graph_def() is not implemented in the base servicer class")
+
+  def on_value_event(self, event):
+    """Callback for Event proto received through the gRPC stream.
+
+    This Event proto carries a Tensor in its summary.value[0] field.
+
+    Args:
+      event: The Event proto from the stream to be processed.
+    """
+    raise NotImplementedError(
+        "on_value_event() is not implemented in the base servicer class")
+
+
+class EventListenerBaseServicer(debug_service_pb2_grpc.EventListenerServicer):
+  """Base Python class for gRPC debug server."""
+
+  def __init__(self, server_port, stream_handler_class):
+    """Constructor.
+
+    Args:
+      server_port: (int) Port number to bind to.
+      stream_handler_class: A class of the base class
+        `EventListenerBaseStreamHandler` that will be used to constructor
+        stream handler objects during `SendEvents` calls.
+    """
+
+    self._server_port = server_port
+    self._stream_handler_class = stream_handler_class
+
+    self._server_lock = threading.Lock()
+    self._server_started = False
+    self._stop_requested = False
+
+    self._event_reply_queue = queue.Queue()
+    self._gated_grpc_debug_watches = set()
+
+  def SendEvents(self, request_iterator, context):
+    """Implementation of the SendEvents service method.
+
+    This method receives streams of Event protos from the client, and processes
+    them in ways specified in the on_event() callback. The stream is
+    bi-directional, but currently only the client-to-server stream (i.e., the
+    stream from the debug ops to the server) is used.
+
+    Args:
+      request_iterator: The incoming stream of Event protos.
+      context: Server context.
+
+    Raises:
+      ValueError: If there are more than one core metadata events.
+
+    Yields:
+      An empty stream of responses.
+    """
+    core_metadata_count = 0
+
+    # A map from GraphDef hash to a list of received chunks.
+    graph_def_chunks = {}
+    tensor_chunks = {}
+
+    stream_handler = None
+    for event in request_iterator:
+      if not stream_handler:
+        stream_handler = self._stream_handler_class()
+
+      if event.graph_def:
+        maybe_graph_def, maybe_device_name, maybe_wall_time = (
+            self._process_encoded_graph_def_in_chunks(event, graph_def_chunks))
+        if maybe_graph_def:
+          stream_handler.on_graph_def(
+              maybe_graph_def, maybe_device_name, maybe_wall_time)
+      elif event.log_message.message:
+        core_metadata_count += 1
+        if core_metadata_count > 1:
+          raise ValueError(
+              "Expected one core metadata event; received multiple")
+        stream_handler.on_core_metadata_event(event)
+      elif event.summary and event.summary.value:
+        maybe_tensor_event = self._process_tensor_event_in_chunks(
+            event, tensor_chunks)
+        if maybe_tensor_event:
+          stream_handler.on_value_event(maybe_tensor_event)
+
+    # The server writes EventReply messages, if any.
+    while not self._event_reply_queue.empty():
+      yield self._event_reply_queue.get()
+
+  def _process_tensor_event_in_chunks(self, event, tensor_chunks):
+    """Possibly reassemble event chunks.
+
+    Due to gRPC's message size limit, a large tensor can be encapsulated in
+    multiple Event proto chunks to be sent through the debugger stream. This
+    method keeps track of the chunks that have arrived, reassemble all chunks
+    corresponding to a tensor when they have arrived and return the reassembled
+    Event proto.
+
+    Args:
+      event: The single Event proto that has arrived.
+      tensor_chunks: A dict used to keep track of the Event protos that have
+        arrived but haven't been reassembled.
+
+    Returns:
+      If all Event protos corresponding to a tensor have arrived, returns the
+      reassembled Event proto. Otherwise, return None.
+    """
+
+    value = event.summary.value[0]
+    debugger_plugin_metadata = json.loads(
+        value.metadata.plugin_data[0].content)
+    device_name = debugger_plugin_metadata["device"]
+    num_chunks = debugger_plugin_metadata["numChunks"]
+    chunk_index = debugger_plugin_metadata["chunkIndex"]
+
+    if num_chunks <= 1:
+      return event
+
+    debug_node_name = value.node_name
+    timestamp = int(event.wall_time)
+    tensor_key = "%s_%s_%d" % (device_name, debug_node_name, timestamp)
+
+    if tensor_key not in tensor_chunks:
+      tensor_chunks[tensor_key] = [None] * num_chunks
+
+    chunks = tensor_chunks[tensor_key]
+    if value.tensor.tensor_content:
+      chunks[chunk_index] = value.tensor
+    elif value.tensor.string_val:
+      chunks[chunk_index] = event
+
+    if None not in chunks:
+      if value.tensor.tensor_content:
+        event.summary.value[0].tensor.tensor_content = b"".join(
+            chunk.tensor_content for chunk in chunks)
+        del tensor_chunks[tensor_key]
+        return event
+      elif value.tensor.string_val:
+        merged_event = chunks[0]
+        for chunk in chunks[1:]:
+          merged_event.summary.value[0].tensor.string_val.extend(
+              list(chunk.summary.value[0].tensor.string_val))
+        return merged_event
+
+  def _process_encoded_graph_def_in_chunks(self,
+                                           event,
+                                           graph_def_chunks):
+    """Process an Event proto containing a chunk of encoded GraphDef.
+
+    Args:
+      event: the Event proto containing the chunk of encoded GraphDef.
+      graph_def_chunks: A dict mapping keys for GraphDefs (i.e.,
+      "<graph_def_hash>,<device_name>,<wall_time>") to a list of chunks of
+      encoded GraphDefs.
+
+    Returns:
+      If all chunks of the GraphDef have arrived,
+        return decoded GraphDef proto, device name, wall_time.
+      Otherwise,
+        return None, None, None.
+    """
+    graph_def = graph_pb2.GraphDef()
+    index_bar_0 = event.graph_def.find(b"|")
+    index_bar_1 = event.graph_def.find(b"|", index_bar_0 + 1)
+    index_bar_2 = event.graph_def.find(b"|", index_bar_1 + 1)
+    graph_def_hash_device_timestamp = event.graph_def[:index_bar_0]
+    chunk_index = int(event.graph_def[index_bar_0 + 1 : index_bar_1])
+    num_chunks = int(event.graph_def[index_bar_1 + 1 : index_bar_2])
+    if graph_def_hash_device_timestamp not in graph_def_chunks:
+      graph_def_chunks[graph_def_hash_device_timestamp] = [None] * num_chunks
+    graph_def_chunks[graph_def_hash_device_timestamp][
+        chunk_index] = event.graph_def[index_bar_2 + 1:]
+    if all(graph_def_chunks[graph_def_hash_device_timestamp]):
+      device_name = graph_def_hash_device_timestamp.split(b",")[1]
+      wall_time = int(graph_def_hash_device_timestamp.split(b",")[2])
+      graph_def.ParseFromString(
+          b"".join(graph_def_chunks[graph_def_hash_device_timestamp]))
+      del graph_def_chunks[graph_def_hash_device_timestamp]
+      self._process_graph_def(graph_def)
+      return graph_def, device_name, wall_time
+    else:
+      return None, None, None
+
+  def _process_graph_def(self, graph_def):
+    for node_def in graph_def.node:
+      if (debug_data.is_debug_node(node_def.name) and
+          node_def.attr["gated_grpc"].b):
+        node_name, output_slot, _, debug_op = (
+            debug_data.parse_debug_node_name(node_def.name))
+        self._gated_grpc_debug_watches.add(
+            DebugWatch(node_name, output_slot, debug_op))
+
+  def run_server(self):
+    """Start running the server.
+
+    Blocks until `stop_server` is invoked.
+
+    Raises:
+      ValueError: If server stop has already been requested, or if the server
+        has already started running.
+    """
+    self._server_lock.acquire()
+    try:
+      if self._stop_requested:
+        raise ValueError("Server has already stopped")
+      if self._server_started:
+        raise ValueError("Server has already started running")
+
+      self.server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+      debug_service_pb2_grpc.add_EventListenerServicer_to_server(self,
+                                                                 self.server)
+      self.server.add_insecure_port("[::]:%d" % self._server_port)
+      self.server.start()
+      self._server_started = True
+    finally:
+      self._server_lock.release()
+
+    while not self._stop_requested:
+      time.sleep(1.0)
+
+  def stop_server(self, grace=1.0):
+    """Request server stopping.
+
+    Once stopped, server cannot be stopped or started again. This method is
+    non-blocking. Call `wait()` on the returned event to block until the server
+    has completely stopped.
+
+    Args:
+      grace: Grace period in seconds to be used when calling `server.stop()`.
+
+    Raises:
+      ValueError: If server stop has already been requested, or if the server
+        has not started running yet.
+
+    Returns:
+      A threading.Event that will be set when the server has completely stopped.
+    """
+    self._server_lock.acquire()
+    try:
+      if not self._server_started:
+        raise ValueError("Server has not started running")
+      if self._stop_requested:
+        raise ValueError("Server has already stopped")
+
+      self._stop_requested = True
+      return self.server.stop(grace=grace)
+    finally:
+      self._server_lock.release()
+
+  def request_watch(self, node_name, output_slot, debug_op):
+    """Request enabling a debug tensor watch.
+
+    This will let the server send a EventReply to the client side
+    (i.e., the debugged TensorFlow runtime process) to request adding a watch
+    key (i.e., <node_name>:<output_slot>:<debug_op>) to the list of enabled
+    watch keys. The list applies only to debug ops with the attribute
+    gated_grpc=True.
+
+    The request will take effect on the next debugged `Session.run()` call.
+
+    To disable the watch, use `request_unwatch()`.
+
+    Args:
+      node_name: (`str`) name of the node that the to-be-watched tensor belongs
+        to, e.g., "hidden/Weights".
+      output_slot: (`int`) output slot index of the tensor to watch.
+      debug_op: (`str`) name of the debug op to enable. This should not include
+        any attribute substrings.
+    """
+    self._event_reply_queue.put(
+        _watch_key_event_reply(True, node_name, output_slot, debug_op))
+
+  def request_unwatch(self, node_name, output_slot, debug_op):
+    """Request disabling a debug tensor watch.
+
+    The request will take effect on the next debugged `Session.run()` call.
+
+    This is the opposite of `request_watch()`.
+
+    Args:
+      node_name: (`str`) name of the node that the to-be-watched tensor belongs
+        to, e.g., "hidden/Weights".
+      output_slot: (`int`) output slot index of the tensor to watch.
+      debug_op: (`str`) name of the debug op to enable. This should not include
+        any attribute substrings.
+    """
+    self._event_reply_queue.put(
+        _watch_key_event_reply(False, node_name, output_slot, debug_op))
+
+  def gated_grpc_debug_watches(self):
+    """Get the list of debug watches with attribute gated_grpc=True.
+
+    Since the server receives `GraphDef` from the debugged runtime, it can only
+    return such debug watches that it has received so far.
+
+    Returns:
+      A `list` of `DebugWatch` `namedtuples` representing the debug watches with
+      gated_grpc=True. Each `namedtuple` element has the attributes:
+        `node_name` as a `str`,
+        `output_slot` as an `int`,
+        `debug_op` as a `str`.
+    """
+    return list(self._gated_grpc_debug_watches)
diff --git a/tensorflow/python/debug/lib/grpc_debug_test_server.py b/tensorflow/python/debug/lib/grpc_debug_test_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..32751e0f29f64c42ebccdb8f1dd9a28b6440bad5
--- /dev/null
+++ b/tensorflow/python/debug/lib/grpc_debug_test_server.py
@@ -0,0 +1,336 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""GRPC debug server for testing."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import errno
+import functools
+import hashlib
+import json
+import os
+import re
+import shutil
+import tempfile
+import threading
+import time
+
+import portpicker
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.util import event_pb2
+from tensorflow.python.client import session
+from tensorflow.python.debug.lib import debug_data
+from tensorflow.python.debug.lib import debug_utils
+from tensorflow.python.debug.lib import grpc_debug_server
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import variables
+
+
+def _get_dump_file_path(dump_root, device_name, debug_node_name):
+  """Get the file path of the dump file for a debug node.
+
+  Args:
+    dump_root: (str) Root dump directory.
+    device_name: (str) Name of the device that the debug node resides on.
+    debug_node_name: (str) Name of the debug node, e.g.,
+      cross_entropy/Log:0:DebugIdentity.
+
+  Returns:
+    (str) Full path of the dump file.
+  """
+
+  dump_root = os.path.join(
+      dump_root, debug_data.device_name_to_device_path(device_name))
+  if "/" in debug_node_name:
+    dump_dir = os.path.join(dump_root, os.path.dirname(debug_node_name))
+    dump_file_name = re.sub(":", "_", os.path.basename(debug_node_name))
+  else:
+    dump_dir = dump_root
+    dump_file_name = re.sub(":", "_", debug_node_name)
+
+  now_microsec = int(round(time.time() * 1000 * 1000))
+  dump_file_name += "_%d" % now_microsec
+
+  return os.path.join(dump_dir, dump_file_name)
+
+
+class EventListenerTestStreamHandler(
+    grpc_debug_server.EventListenerBaseStreamHandler):
+  """Implementation of EventListenerBaseStreamHandler that dumps to file."""
+
+  def __init__(self, dump_dir, event_listener_servicer):
+    self._dump_dir = dump_dir
+    self._event_listener_servicer = event_listener_servicer
+    if self._dump_dir:
+      self._try_makedirs(self._dump_dir)
+
+    self._grpc_path = None
+    self._cached_graph_defs = []
+    self._cached_graph_def_device_names = []
+    self._cached_graph_def_wall_times = []
+
+  def on_core_metadata_event(self, event):
+    core_metadata = json.loads(event.log_message.message)
+
+    if not self._grpc_path:
+      grpc_path = core_metadata["grpc_path"]
+      if grpc_path:
+        if grpc_path.startswith("/"):
+          grpc_path = grpc_path[1:]
+      if self._dump_dir:
+        self._dump_dir = os.path.join(self._dump_dir, grpc_path)
+
+        # Write cached graph defs to filesystem.
+        for graph_def, device_name, wall_time in zip(
+            self._cached_graph_defs,
+            self._cached_graph_def_device_names,
+            self._cached_graph_def_wall_times):
+          self._write_graph_def(graph_def, device_name, wall_time)
+
+    if self._dump_dir:
+      self._write_core_metadata_event(event)
+    else:
+      self._event_listener_servicer.core_metadata_json_strings.append(
+          event.log_message.message)
+
+  def on_graph_def(self, graph_def, device_name, wall_time):
+    """Implementation of the tensor value-carrying Event proto callback.
+
+    Args:
+      graph_def: A GraphDef object.
+      device_name: Name of the device on which the graph was created.
+      wall_time: An epoch timestamp (in microseconds) for the graph.
+    """
+    if self._dump_dir:
+      if self._grpc_path:
+        self._write_graph_def(graph_def, device_name, wall_time)
+      else:
+        self._cached_graph_defs.append(graph_def)
+        self._cached_graph_def_device_names.append(device_name)
+        self._cached_graph_def_wall_times.append(wall_time)
+    else:
+      self._event_listener_servicer.partition_graph_defs.append(graph_def)
+
+  def on_value_event(self, event):
+    """Implementation of the tensor value-carrying Event proto callback.
+
+    Writes the Event proto to the file system for testing. The path written to
+    follows the same pattern as the file:// debug URLs of tfdbg, i.e., the
+    name scope of the op becomes the directory structure under the dump root
+    directory.
+
+    Args:
+      event: The Event proto carrying a tensor value.
+    """
+    if self._dump_dir:
+      self._write_value_event(event)
+    else:
+      value = event.summary.value[0]
+      self._event_listener_servicer.debug_tensor_values[value.node_name].append(
+          debug_data.load_tensor_from_event(event))
+
+  def _try_makedirs(self, dir_path):
+    if not os.path.isdir(dir_path):
+      try:
+        os.makedirs(dir_path)
+      except OSError as error:
+        if error.errno != errno.EEXIST:
+          raise
+
+  def _write_core_metadata_event(self, event):
+    core_metadata_path = os.path.join(
+        self._dump_dir,
+        debug_data.METADATA_FILE_PREFIX + debug_data.CORE_METADATA_TAG +
+        "_%d" % event.wall_time)
+    self._try_makedirs(self._dump_dir)
+    with open(core_metadata_path, "wb") as f:
+      f.write(event.SerializeToString())
+
+  def _write_graph_def(self, graph_def, device_name, wall_time):
+    encoded_graph_def = graph_def.SerializeToString()
+    graph_hash = int(hashlib.md5(encoded_graph_def).hexdigest(), 16)
+    event = event_pb2.Event(graph_def=encoded_graph_def, wall_time=wall_time)
+    graph_file_path = os.path.join(
+        self._dump_dir,
+        debug_data.device_name_to_device_path(device_name),
+        debug_data.METADATA_FILE_PREFIX + debug_data.GRAPH_FILE_TAG +
+        debug_data.HASH_TAG + "%d_%d" % (graph_hash, wall_time))
+    self._try_makedirs(os.path.dirname(graph_file_path))
+    with open(graph_file_path, "wb") as f:
+      f.write(event.SerializeToString())
+
+  def _write_value_event(self, event):
+    value = event.summary.value[0]
+
+    # Obtain the device name from the metadata.
+    summary_metadata = event.summary.value[0].metadata
+    if not summary_metadata.plugin_data:
+      raise ValueError("The value lacks plugin data.")
+    try:
+      content = json.loads(summary_metadata.plugin_data[0].content)
+    except ValueError as err:
+      raise ValueError("Could not parse content into JSON: %r, %r" % (content,
+                                                                      err))
+    device_name = content["device"]
+
+    dump_full_path = _get_dump_file_path(
+        self._dump_dir, device_name, value.node_name)
+    self._try_makedirs(os.path.dirname(dump_full_path))
+    with open(dump_full_path, "wb") as f:
+      f.write(event.SerializeToString())
+
+
+class EventListenerTestServicer(grpc_debug_server.EventListenerBaseServicer):
+  """An implementation of EventListenerBaseServicer for testing."""
+
+  def __init__(self, server_port, dump_dir):
+    """Constructor of EventListenerTestServicer.
+
+    Args:
+      server_port: (int) The server port number.
+      dump_dir: (str) The root directory to which the data files will be
+        dumped. If empty or None, the received debug data will not be dumped
+        to the file system: they will be stored in memory instead.
+    """
+    self.core_metadata_json_strings = []
+    self.partition_graph_defs = []
+    self.debug_tensor_values = collections.defaultdict(list)
+
+    grpc_debug_server.EventListenerBaseServicer.__init__(
+        self, server_port,
+        functools.partial(EventListenerTestStreamHandler, dump_dir, self))
+
+  def clear_data(self):
+    self.core_metadata_json_strings = []
+    self.partition_graph_defs = []
+    self.debug_tensor_values = collections.defaultdict(list)
+
+
+def start_server_on_separate_thread(dump_to_filesystem=True,
+                                    server_start_delay_sec=0.0,
+                                    poll_server=False):
+  """Create a test gRPC debug server and run on a separate thread.
+
+  Args:
+    dump_to_filesystem: (bool) whether the debug server will dump debug data
+      to the filesystem.
+    server_start_delay_sec: (float) amount of time (in sec) to delay the server
+      start up for.
+    poll_server: (bool) whether the server will be polled till success on
+      startup.
+
+  Returns:
+    server_port: (int) Port on which the server runs.
+    debug_server_url: (str) grpc:// URL to the server.
+    server_dump_dir: (str) The debug server's dump directory.
+    server_thread: The server Thread object.
+    server: The `EventListenerTestServicer` object.
+
+  Raises:
+    ValueError: If polling the server process for ready state is not successful
+      within maximum polling count.
+  """
+  server_port = portpicker.pick_unused_port()
+  debug_server_url = "grpc://localhost:%d" % server_port
+
+  server_dump_dir = tempfile.mkdtemp() if dump_to_filesystem else None
+  server = EventListenerTestServicer(server_port=server_port,
+                                     dump_dir=server_dump_dir)
+
+  def delay_then_run_server():
+    time.sleep(server_start_delay_sec)
+    server.run_server()
+  server_thread = threading.Thread(target=delay_then_run_server)
+  server_thread.start()
+
+  if poll_server:
+    if not _poll_server_till_success(
+        50,
+        0.2,
+        debug_server_url,
+        server_dump_dir,
+        server,
+        gpu_memory_fraction=0.1):
+      raise ValueError(
+          "Failed to start test gRPC debug server at port %d" % server_port)
+    server.clear_data()
+  return server_port, debug_server_url, server_dump_dir, server_thread, server
+
+
+def _poll_server_till_success(max_attempts,
+                              sleep_per_poll_sec,
+                              debug_server_url,
+                              dump_dir,
+                              server,
+                              gpu_memory_fraction=1.0):
+  """Poll server until success or exceeding max polling count.
+
+  Args:
+    max_attempts: (int) How many times to poll at maximum
+    sleep_per_poll_sec: (float) How many seconds to sleep for after each
+      unsuccessful poll.
+    debug_server_url: (str) gRPC URL to the debug server.
+    dump_dir: (str) Dump directory to look for files in. If None, will directly
+      check data from the server object.
+    server: The server object.
+    gpu_memory_fraction: (float) Fraction of GPU memory to be
+      allocated for the Session used in server polling.
+
+  Returns:
+    (bool) Whether the polling succeeded within max_polls attempts.
+  """
+  poll_count = 0
+
+  config = config_pb2.ConfigProto(gpu_options=config_pb2.GPUOptions(
+      per_process_gpu_memory_fraction=gpu_memory_fraction))
+  with session.Session(config=config) as sess:
+    for poll_count in range(max_attempts):
+      server.clear_data()
+      print("Polling: poll_count = %d" % poll_count)
+
+      x_init_name = "x_init_%d" % poll_count
+      x_init = constant_op.constant([42.0], shape=[1], name=x_init_name)
+      x = variables.Variable(x_init, name=x_init_name)
+
+      run_options = config_pb2.RunOptions()
+      debug_utils.add_debug_tensor_watch(
+          run_options, x_init_name, 0, debug_urls=[debug_server_url])
+      try:
+        sess.run(x.initializer, options=run_options)
+      except errors.FailedPreconditionError:
+        pass
+
+      if dump_dir:
+        if os.path.isdir(
+            dump_dir) and debug_data.DebugDumpDir(dump_dir).size > 0:
+          shutil.rmtree(dump_dir)
+          print("Poll succeeded.")
+          return True
+        else:
+          print("Poll failed. Sleeping for %f s" % sleep_per_poll_sec)
+          time.sleep(sleep_per_poll_sec)
+      else:
+        if server.debug_tensor_values:
+          print("Poll succeeded.")
+          return True
+        else:
+          print("Poll failed. Sleeping for %f s" % sleep_per_poll_sec)
+          time.sleep(sleep_per_poll_sec)
+
+    return False
diff --git a/tensorflow/python/debug/lib/session_debug_file_test.py b/tensorflow/python/debug/lib/session_debug_file_test.py
index bb3e4ead35f6db96cc176505b73bf96239ff5449..48f31771db8b6309883f3b9eac51ca51611d173f 100644
--- a/tensorflow/python/debug/lib/session_debug_file_test.py
+++ b/tensorflow/python/debug/lib/session_debug_file_test.py
@@ -22,6 +22,7 @@ import shutil
 import tempfile
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.debug.lib import debug_utils
@@ -35,6 +36,12 @@ from tensorflow.python.platform import googletest
 
 class SessionDebugTest(session_debug_testlib.SessionDebugTestBase):
 
+  def _no_rewrite_session_config(self):
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        disable_model_pruning=True)
+    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+    return config_pb2.ConfigProto(graph_options=graph_options)
+
   def _debug_urls(self, run_number=None):
     return ["file://%s" % self._debug_dump_dir(run_number=run_number)]
 
@@ -47,7 +54,7 @@ class SessionDebugTest(session_debug_testlib.SessionDebugTestBase):
   def testAllowsDifferentWatchesOnDifferentRuns(self):
     """Test watching different tensors on different runs of the same graph."""
 
-    with session.Session() as sess:
+    with session.Session(config=self._no_rewrite_session_config()) as sess:
       u_init_val = [[5.0, 3.0], [-1.0, 0.0]]
       v_init_val = [[2.0], [-1.0]]
 
diff --git a/tensorflow/python/debug/lib/session_debug_grpc_test.py b/tensorflow/python/debug/lib/session_debug_grpc_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f97b4debd315bac6253c5cf30a4a79634a6dfb3e
--- /dev/null
+++ b/tensorflow/python/debug/lib/session_debug_grpc_test.py
@@ -0,0 +1,622 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for debugger functionalities in tf.Session with grpc:// URLs.
+
+This test file focuses on the grpc:// debugging of local (non-distributed)
+tf.Sessions.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.debug.lib import debug_data
+from tensorflow.python.debug.lib import debug_utils
+from tensorflow.python.debug.lib import grpc_debug_test_server
+from tensorflow.python.debug.lib import session_debug_testlib
+from tensorflow.python.debug.wrappers import framework
+from tensorflow.python.debug.wrappers import grpc_wrapper
+from tensorflow.python.debug.wrappers import hooks
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+from tensorflow.python.training import monitored_session
+
+
+def no_rewrite_session_config():
+  rewriter_config = rewriter_config_pb2.RewriterConfig(
+      disable_model_pruning=True)
+  graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+  return config_pb2.ConfigProto(graph_options=graph_options)
+
+
+class GrpcDebugServerTest(test_util.TensorFlowTestCase):
+
+  def testRepeatedRunServerRaisesException(self):
+    (_, _, _, server_thread,
+     server) = grpc_debug_test_server.start_server_on_separate_thread(
+         poll_server=True)
+    # The server is started asynchronously. It needs to be polled till its state
+    # has become started.
+
+    with self.assertRaisesRegexp(
+        ValueError, "Server has already started running"):
+      server.run_server()
+
+    server.stop_server().wait()
+    server_thread.join()
+
+  def testRepeatedStopServerRaisesException(self):
+    (_, _, _, server_thread,
+     server) = grpc_debug_test_server.start_server_on_separate_thread(
+         poll_server=True)
+    server.stop_server().wait()
+    server_thread.join()
+
+    with self.assertRaisesRegexp(ValueError, "Server has already stopped"):
+      server.stop_server().wait()
+
+  def testRunServerAfterStopRaisesException(self):
+    (_, _, _, server_thread,
+     server) = grpc_debug_test_server.start_server_on_separate_thread(
+         poll_server=True)
+    server.stop_server().wait()
+    server_thread.join()
+
+    with self.assertRaisesRegexp(ValueError, "Server has already stopped"):
+      server.run_server()
+
+
+class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
+
+  @classmethod
+  def setUpClass(cls):
+    session_debug_testlib.SessionDebugTestBase.setUpClass()
+    (cls._server_port, cls._debug_server_url, cls._server_dump_dir,
+     cls._server_thread,
+     cls._server) = grpc_debug_test_server.start_server_on_separate_thread()
+
+  @classmethod
+  def tearDownClass(cls):
+    # Stop the test server and join the thread.
+    cls._server.stop_server().wait()
+    cls._server_thread.join()
+
+    session_debug_testlib.SessionDebugTestBase.tearDownClass()
+
+  def setUp(self):
+    # Override the dump root as the test server's dump directory.
+    self._dump_root = self._server_dump_dir
+
+  def tearDown(self):
+    if os.path.isdir(self._server_dump_dir):
+      shutil.rmtree(self._server_dump_dir)
+    session_debug_testlib.SessionDebugTestBase.tearDown(self)
+
+  def _debug_urls(self, run_number=None):
+    return ["grpc://localhost:%d" % self._server_port]
+
+  def _debug_dump_dir(self, run_number=None):
+    if run_number is None:
+      return self._dump_root
+    else:
+      return os.path.join(self._dump_root, "run_%d" % run_number)
+
+  def testConstructGrpcDebugWrapperSessionWithInvalidTypeRaisesException(self):
+    sess = session.Session(config=no_rewrite_session_config())
+    with self.assertRaisesRegexp(
+        TypeError, "Expected type str or list in grpc_debug_server_addresses"):
+      grpc_wrapper.GrpcDebugWrapperSession(sess, 1337)
+
+  def testConstructGrpcDebugWrapperSessionWithInvalidTypeRaisesException2(self):
+    sess = session.Session(config=no_rewrite_session_config())
+    with self.assertRaisesRegexp(
+        TypeError, "Expected type str in list grpc_debug_server_addresses"):
+      grpc_wrapper.GrpcDebugWrapperSession(sess, ["localhost:1337", 1338])
+
+  def testUseInvalidWatchFnTypeWithGrpcDebugWrapperSessionRaisesException(self):
+    sess = session.Session(config=no_rewrite_session_config())
+    with self.assertRaises(TypeError):
+      grpc_wrapper.GrpcDebugWrapperSession(
+          sess, "localhost:%d" % self._server_port, watch_fn="foo")
+
+  def testGrpcDebugWrapperSessionWithoutWatchFnWorks(self):
+    u = variables.Variable(2.1, name="u")
+    v = variables.Variable(20.0, name="v")
+    w = math_ops.multiply(u, v, name="w")
+
+    sess = session.Session(config=no_rewrite_session_config())
+    sess.run(u.initializer)
+    sess.run(v.initializer)
+
+    sess = grpc_wrapper.GrpcDebugWrapperSession(
+        sess, "localhost:%d" % self._server_port)
+    w_result = sess.run(w)
+    self.assertAllClose(42.0, w_result)
+
+    dump = debug_data.DebugDumpDir(self._dump_root)
+    self.assertEqual(5, dump.size)
+    self.assertAllClose([2.1], dump.get_tensors("u", 0, "DebugIdentity"))
+    self.assertAllClose([2.1], dump.get_tensors("u/read", 0, "DebugIdentity"))
+    self.assertAllClose([20.0], dump.get_tensors("v", 0, "DebugIdentity"))
+    self.assertAllClose([20.0], dump.get_tensors("v/read", 0, "DebugIdentity"))
+    self.assertAllClose([42.0], dump.get_tensors("w", 0, "DebugIdentity"))
+
+  def testGrpcDebugWrapperSessionWithWatchFnWorks(self):
+    def watch_fn(feeds, fetch_keys):
+      del feeds, fetch_keys
+      return ["DebugIdentity", "DebugNumericSummary"], r".*/read", None
+
+    u = variables.Variable(2.1, name="u")
+    v = variables.Variable(20.0, name="v")
+    w = math_ops.multiply(u, v, name="w")
+
+    sess = session.Session(config=no_rewrite_session_config())
+    sess.run(u.initializer)
+    sess.run(v.initializer)
+
+    sess = grpc_wrapper.GrpcDebugWrapperSession(
+        sess, "localhost:%d" % self._server_port, watch_fn=watch_fn)
+    w_result = sess.run(w)
+    self.assertAllClose(42.0, w_result)
+
+    dump = debug_data.DebugDumpDir(self._dump_root)
+    self.assertEqual(4, dump.size)
+    self.assertAllClose([2.1], dump.get_tensors("u/read", 0, "DebugIdentity"))
+    self.assertEqual(
+        14, len(dump.get_tensors("u/read", 0, "DebugNumericSummary")[0]))
+    self.assertAllClose([20.0], dump.get_tensors("v/read", 0, "DebugIdentity"))
+    self.assertEqual(
+        14, len(dump.get_tensors("v/read", 0, "DebugNumericSummary")[0]))
+
+  def testGrpcDebugHookWithStatelessWatchFnWorks(self):
+    # Perform some set up. Specifically, construct a simple TensorFlow graph and
+    # create a watch function for certain ops.
+    def watch_fn(feeds, fetch_keys):
+      del feeds, fetch_keys
+      return framework.WatchOptions(
+          debug_ops=["DebugIdentity", "DebugNumericSummary"],
+          node_name_regex_whitelist=r".*/read",
+          op_type_regex_whitelist=None,
+          tolerate_debug_op_creation_failures=True)
+
+    u = variables.Variable(2.1, name="u")
+    v = variables.Variable(20.0, name="v")
+    w = math_ops.multiply(u, v, name="w")
+
+    sess = session.Session(config=no_rewrite_session_config())
+    sess.run(u.initializer)
+    sess.run(v.initializer)
+
+    # Create a hook. One could use this hook with say a tflearn Estimator.
+    # However, we use a HookedSession in this test to avoid depending on the
+    # internal implementation of Estimators.
+    grpc_debug_hook = hooks.GrpcDebugHook(
+        ["localhost:%d" % self._server_port], watch_fn=watch_fn)
+    sess = monitored_session._HookedSession(sess, [grpc_debug_hook])
+
+    # Run the hooked session. This should stream tensor data to the GRPC
+    # endpoints.
+    w_result = sess.run(w)
+
+    # Verify that the hook monitored the correct tensors.
+    self.assertAllClose(42.0, w_result)
+    dump = debug_data.DebugDumpDir(self._dump_root)
+    self.assertEqual(4, dump.size)
+    self.assertAllClose([2.1], dump.get_tensors("u/read", 0, "DebugIdentity"))
+    self.assertEqual(
+        14, len(dump.get_tensors("u/read", 0, "DebugNumericSummary")[0]))
+    self.assertAllClose([20.0], dump.get_tensors("v/read", 0, "DebugIdentity"))
+    self.assertEqual(
+        14, len(dump.get_tensors("v/read", 0, "DebugNumericSummary")[0]))
+
+  def testConstructGrpcDebugHookWithGrpcInUrlRaisesValueError(self):
+    """Tests that the hook raises an error if the URL starts with grpc://."""
+    with self.assertRaises(ValueError):
+      hooks.GrpcDebugHook(["grpc://foo:42"])
+
+
+class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    (cls.debug_server_port, cls.debug_server_url, _, cls.debug_server_thread,
+     cls.debug_server
+    ) = grpc_debug_test_server.start_server_on_separate_thread(
+        dump_to_filesystem=False)
+    tf_logging.info("debug server url: %s", cls.debug_server_url)
+
+  @classmethod
+  def tearDownClass(cls):
+    cls.debug_server.stop_server().wait()
+    cls.debug_server_thread.join()
+
+  def tearDown(self):
+    ops.reset_default_graph()
+    self.debug_server.clear_data()
+
+  def testSendingLargeGraphDefsWorks(self):
+    with self.test_session(
+        use_gpu=True, config=no_rewrite_session_config()) as sess:
+      u = variables.Variable(42.0, name="original_u")
+      for _ in xrange(50 * 1000):
+        u = array_ops.identity(u)
+      sess.run(variables.global_variables_initializer())
+
+      def watch_fn(fetches, feeds):
+        del fetches, feeds
+        return framework.WatchOptions(
+            debug_ops=["DebugIdentity"],
+            node_name_regex_whitelist=r"original_u")
+      sess = grpc_wrapper.GrpcDebugWrapperSession(
+          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
+      self.assertAllClose(42.0, sess.run(u))
+
+      self.assertAllClose(
+          [42.0],
+          self.debug_server.debug_tensor_values["original_u:0:DebugIdentity"])
+      self.assertEqual(2 if test.is_gpu_available() else 1,
+                       len(self.debug_server.partition_graph_defs))
+      max_graph_def_size = max([
+          len(graph_def.SerializeToString())
+          for graph_def in self.debug_server.partition_graph_defs])
+      self.assertGreater(max_graph_def_size, 4 * 1024 * 1024)
+
+  def testSendingLargeFloatTensorWorks(self):
+    with self.test_session(
+        use_gpu=True, config=no_rewrite_session_config()) as sess:
+      u_init_val_array = list(xrange(1200 * 1024))
+      # Size: 4 * 1200 * 1024 = 4800k > 4M
+
+      u_init = constant_op.constant(
+          u_init_val_array, dtype=dtypes.float32, name="u_init")
+      u = variables.Variable(u_init, name="u")
+
+      def watch_fn(fetches, feeds):
+        del fetches, feeds  # Unused by this watch_fn.
+        return framework.WatchOptions(
+            debug_ops=["DebugIdentity"],
+            node_name_regex_whitelist=r"u_init")
+      sess = grpc_wrapper.GrpcDebugWrapperSession(
+          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
+      sess.run(u.initializer)
+
+      self.assertAllEqual(
+          u_init_val_array,
+          self.debug_server.debug_tensor_values["u_init:0:DebugIdentity"][0])
+
+  def testSendingStringTensorWithAlmostTooLargeStringsWorks(self):
+    with self.test_session(
+        use_gpu=True, config=no_rewrite_session_config()) as sess:
+      u_init_val = [
+          b"", b"spam", b"A" * 2500 * 1024, b"B" * 2500 * 1024, b"egg", b""]
+      u_init = constant_op.constant(
+          u_init_val, dtype=dtypes.string, name="u_init")
+      u = variables.Variable(u_init, name="u")
+
+      def watch_fn(fetches, feeds):
+        del fetches, feeds
+        return framework.WatchOptions(
+            debug_ops=["DebugIdentity"],
+            node_name_regex_whitelist=r"u_init")
+      sess = grpc_wrapper.GrpcDebugWrapperSession(
+          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
+      sess.run(u.initializer)
+
+      self.assertAllEqual(
+          u_init_val,
+          self.debug_server.debug_tensor_values["u_init:0:DebugIdentity"][0])
+
+  def testSendingLargeStringTensorWorks(self):
+    with self.test_session(
+        use_gpu=True, config=no_rewrite_session_config()) as sess:
+      strs_total_size_threshold = 5000 * 1024
+      cum_size = 0
+      u_init_val_array = []
+      while cum_size < strs_total_size_threshold:
+        strlen = np.random.randint(200)
+        u_init_val_array.append(b"A" * strlen)
+        cum_size += strlen
+
+      u_init = constant_op.constant(
+          u_init_val_array, dtype=dtypes.string, name="u_init")
+      u = variables.Variable(u_init, name="u")
+
+      def watch_fn(fetches, feeds):
+        del fetches, feeds
+        return framework.WatchOptions(
+            debug_ops=["DebugIdentity"],
+            node_name_regex_whitelist=r"u_init")
+      sess = grpc_wrapper.GrpcDebugWrapperSession(
+          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
+      sess.run(u.initializer)
+
+      self.assertAllEqual(
+          u_init_val_array,
+          self.debug_server.debug_tensor_values["u_init:0:DebugIdentity"][0])
+
+  def testSendingEmptyFloatTensorWorks(self):
+    with self.test_session(
+        use_gpu=True, config=no_rewrite_session_config()) as sess:
+      u_init = constant_op.constant(
+          [], dtype=dtypes.float32, shape=[0], name="u_init")
+      u = variables.Variable(u_init, name="u")
+
+      def watch_fn(fetches, feeds):
+        del fetches, feeds
+        return framework.WatchOptions(
+            debug_ops=["DebugIdentity"],
+            node_name_regex_whitelist=r"u_init")
+      sess = grpc_wrapper.GrpcDebugWrapperSession(
+          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
+      sess.run(u.initializer)
+
+      u_init_value = self.debug_server.debug_tensor_values[
+          "u_init:0:DebugIdentity"][0]
+      self.assertEqual(np.float32, u_init_value.dtype)
+      self.assertEqual(0, len(u_init_value))
+
+  def testSendingEmptyStringTensorWorks(self):
+    with self.test_session(
+        use_gpu=True, config=no_rewrite_session_config()) as sess:
+      u_init = constant_op.constant(
+          [], dtype=dtypes.string, shape=[0], name="u_init")
+      u = variables.Variable(u_init, name="u")
+
+      def watch_fn(fetches, feeds):
+        del fetches, feeds
+        return framework.WatchOptions(
+            debug_ops=["DebugIdentity"],
+            node_name_regex_whitelist=r"u_init")
+      sess = grpc_wrapper.GrpcDebugWrapperSession(
+          sess, "localhost:%d" % self.debug_server_port, watch_fn=watch_fn)
+      sess.run(u.initializer)
+
+      u_init_value = self.debug_server.debug_tensor_values[
+          "u_init:0:DebugIdentity"][0]
+      self.assertEqual(np.object, u_init_value.dtype)
+      self.assertEqual(0, len(u_init_value))
+
+
+class SessionDebugConcurrentTest(
+    session_debug_testlib.DebugConcurrentRunCallsTest):
+
+  @classmethod
+  def setUpClass(cls):
+    session_debug_testlib.SessionDebugTestBase.setUpClass()
+    (cls._server_port, cls._debug_server_url, cls._server_dump_dir,
+     cls._server_thread,
+     cls._server) = grpc_debug_test_server.start_server_on_separate_thread()
+
+  @classmethod
+  def tearDownClass(cls):
+    # Stop the test server and join the thread.
+    cls._server.stop_server().wait()
+    cls._server_thread.join()
+    session_debug_testlib.SessionDebugTestBase.tearDownClass()
+
+  def setUp(self):
+    self._num_concurrent_runs = 3
+    self._dump_roots = []
+    for i in range(self._num_concurrent_runs):
+      self._dump_roots.append(
+          os.path.join(self._server_dump_dir, "thread%d" % i))
+
+  def tearDown(self):
+    ops.reset_default_graph()
+    if os.path.isdir(self._server_dump_dir):
+      shutil.rmtree(self._server_dump_dir)
+
+  def _get_concurrent_debug_urls(self):
+    urls = []
+    for i in range(self._num_concurrent_runs):
+      urls.append(self._debug_server_url + "/thread%d" % i)
+    return urls
+
+
+class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
+  """Test server gating of debug ops."""
+
+  @classmethod
+  def setUpClass(cls):
+    (cls._server_port_1, cls._debug_server_url_1, _, cls._server_thread_1,
+     cls._server_1) = grpc_debug_test_server.start_server_on_separate_thread(
+         dump_to_filesystem=False)
+    (cls._server_port_2, cls._debug_server_url_2, _, cls._server_thread_2,
+     cls._server_2) = grpc_debug_test_server.start_server_on_separate_thread(
+         dump_to_filesystem=False)
+
+  @classmethod
+  def tearDownClass(cls):
+    cls._server_1.stop_server().wait()
+    cls._server_thread_1.join()
+    cls._server_2.stop_server().wait()
+    cls._server_thread_2.join()
+
+  def tearDown(self):
+    ops.reset_default_graph()
+    self._server_1.clear_data()
+    self._server_2.clear_data()
+
+  def testToggleEnableTwoDebugWatchesNoCrosstalkBetweenDebugNodes(self):
+    with session.Session(config=no_rewrite_session_config()) as sess:
+      v = variables.Variable(50.0, name="v")
+      delta = constant_op.constant(5.0, name="delta")
+      inc_v = state_ops.assign_add(v, delta, name="inc_v")
+
+      sess.run(v.initializer)
+
+      run_metadata = config_pb2.RunMetadata()
+      run_options = config_pb2.RunOptions(output_partition_graphs=True)
+      debug_utils.watch_graph(
+          run_options,
+          sess.graph,
+          debug_ops=["DebugIdentity(gated_grpc=true)",
+                     "DebugNumericSummary(gated_grpc=true)"],
+          debug_urls=[self._debug_server_url_1])
+
+      for i in xrange(4):
+        self._server_1.clear_data()
+
+        # N.B.: These requests will be fulfilled not in this debugged
+        # Session.run() invocation, but in the next one.
+        if i % 2 == 0:
+          self._server_1.request_watch("delta", 0, "DebugIdentity")
+          self._server_1.request_unwatch("delta", 0, "DebugNumericSummary")
+        else:
+          self._server_1.request_unwatch("delta", 0, "DebugIdentity")
+          self._server_1.request_watch("delta", 0, "DebugNumericSummary")
+
+        sess.run(inc_v, options=run_options, run_metadata=run_metadata)
+
+        if i == 0:
+          self.assertEqual(0, len(self._server_1.debug_tensor_values))
+        else:
+          self.assertEqual(1, len(self._server_1.debug_tensor_values))
+          if i % 2 == 1:
+            self.assertAllClose(
+                [5.0],
+                self._server_1.debug_tensor_values["delta:0:DebugIdentity"])
+          else:
+            self.assertAllClose(
+                [[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 5.0, 5.0, 5.0,
+                  0.0, 1.0, 0.0]],
+                self._server_1.debug_tensor_values[
+                    "delta:0:DebugNumericSummary"])
+
+  def testToggleEnableTwoDebugWatchesNoCrosstalkBetweenServers(self):
+    with session.Session(config=no_rewrite_session_config()) as sess:
+      v = variables.Variable(50.0, name="v")
+      delta = constant_op.constant(5.0, name="delta")
+      inc_v = state_ops.assign_add(v, delta, name="inc_v")
+
+      sess.run(v.initializer)
+
+      run_metadata = config_pb2.RunMetadata()
+      run_options = config_pb2.RunOptions(output_partition_graphs=True)
+      debug_utils.watch_graph(
+          run_options,
+          sess.graph,
+          debug_ops=["DebugIdentity(gated_grpc=true)"],
+          debug_urls=[self._debug_server_url_1, self._debug_server_url_2])
+
+      for i in xrange(4):
+        self._server_1.clear_data()
+        self._server_2.clear_data()
+
+        # N.B.: These requests will be fulfilled not in this debugged
+        # Session.run() invocation, but in the next one.
+        if i % 2 == 0:
+          self._server_1.request_watch("delta", 0, "DebugIdentity")
+          self._server_2.request_watch("v", 0, "DebugIdentity")
+        else:
+          self._server_1.request_unwatch("delta", 0, "DebugIdentity")
+          self._server_2.request_unwatch("v", 0, "DebugIdentity")
+
+        sess.run(inc_v, options=run_options, run_metadata=run_metadata)
+
+        if i % 2 == 0:
+          self.assertEqual(0, len(self._server_1.debug_tensor_values))
+          self.assertEqual(0, len(self._server_2.debug_tensor_values))
+        else:
+          self.assertEqual(1, len(self._server_1.debug_tensor_values))
+          self.assertEqual(1, len(self._server_2.debug_tensor_values))
+          self.assertAllClose(
+              [5.0],
+              self._server_1.debug_tensor_values["delta:0:DebugIdentity"])
+          self.assertAllClose(
+              [50 + 5.0 * i],
+              self._server_2.debug_tensor_values["v:0:DebugIdentity"])
+
+  def testGetGrpcDebugWatchesReturnsCorrectAnswer(self):
+    with session.Session() as sess:
+      v = variables.Variable(50.0, name="v")
+      delta = constant_op.constant(5.0, name="delta")
+      inc_v = state_ops.assign_add(v, delta, name="inc_v")
+
+      sess.run(v.initializer)
+
+      # Before any debugged runs, the server should be aware of no debug
+      # watches.
+      self.assertEqual([], self._server_1.gated_grpc_debug_watches())
+
+      run_metadata = config_pb2.RunMetadata()
+      run_options = config_pb2.RunOptions(output_partition_graphs=True)
+      debug_utils.add_debug_tensor_watch(
+          run_options, "delta", output_slot=0,
+          debug_ops=["DebugNumericSummary(gated_grpc=true)"],
+          debug_urls=[self._debug_server_url_1])
+      debug_utils.add_debug_tensor_watch(
+          run_options, "v", output_slot=0,
+          debug_ops=["DebugIdentity"],
+          debug_urls=[self._debug_server_url_1])
+      sess.run(inc_v, options=run_options, run_metadata=run_metadata)
+
+      # After the first run, the server should have noted the debug watches
+      # for which gated_grpc == True, but not the ones with gated_grpc == False.
+      self.assertEqual(1, len(self._server_1.gated_grpc_debug_watches()))
+      debug_watch = self._server_1.gated_grpc_debug_watches()[0]
+      self.assertEqual("delta", debug_watch.node_name)
+      self.assertEqual(0, debug_watch.output_slot)
+      self.assertEqual("DebugNumericSummary", debug_watch.debug_op)
+
+
+class DelayedDebugServerTest(test_util.TensorFlowTestCase):
+
+  def testDebuggedSessionRunWorksWithDelayedDebugServerStartup(self):
+    """Test debugged Session.run() tolerates delayed debug server startup."""
+    ops.reset_default_graph()
+
+    # Start a debug server asynchronously, with a certain amount of delay.
+    (debug_server_port, _, _, server_thread,
+     debug_server) = grpc_debug_test_server.start_server_on_separate_thread(
+         server_start_delay_sec=2.0, dump_to_filesystem=False)
+
+    with self.test_session() as sess:
+      a_init = constant_op.constant(42.0, name="a_init")
+      a = variables.Variable(a_init, name="a")
+
+      def watch_fn(fetches, feeds):
+        del fetches, feeds
+        return framework.WatchOptions(debug_ops=["DebugIdentity"])
+
+      sess = grpc_wrapper.GrpcDebugWrapperSession(
+          sess, "localhost:%d" % debug_server_port, watch_fn=watch_fn)
+      sess.run(a.initializer)
+      self.assertAllClose(
+          [42.0], debug_server.debug_tensor_values["a_init:0:DebugIdentity"])
+
+    debug_server.stop_server().wait()
+    server_thread.join()
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py
index 67f5e9d29e6e8ee1bb4b8b63161927997d41cd7b..1b4053ff3b2927bce913ea6361d36b140859a7ea 100644
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@@ -29,6 +29,7 @@ import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.core.util import event_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.lib import debug_data
@@ -53,6 +54,13 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
 
 
+def no_rewrite_session_config():
+  rewriter_config = rewriter_config_pb2.RewriterConfig(
+      disable_model_pruning=True)
+  graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+  return config_pb2.ConfigProto(graph_options=graph_options)
+
+
 class _RNNCellForTest(rnn_cell_impl.RNNCell):
   """RNN cell for testing."""
 
@@ -160,7 +168,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
         validate=validate)
 
   def _generate_dump_from_simple_addition_graph(self):
-    with session.Session() as sess:
+    with session.Session(config=no_rewrite_session_config()) as sess:
       u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]])
       v_init_val = np.array([[2.0], [-1.0]])
 
@@ -304,7 +312,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       results.dump.node_op_type("foo_bar")
 
   def testDumpStringTensorsWorks(self):
-    with session.Session() as sess:
+    with session.Session(config=no_rewrite_session_config()) as sess:
       str1_init_val = np.array(b"abc")
       str2_init_val = np.array(b"def")
 
@@ -385,9 +393,9 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
 
       # Add debug tensor watch for u.
       debug_utils.add_debug_tensor_watch(
-          run_options, "%s" % u_name, 0, debug_urls=debug_urls)
+          run_options, u_name, 0, debug_urls=debug_urls)
       debug_utils.add_debug_tensor_watch(
-          run_options, "%s" % s_name, 0, debug_urls=debug_urls)
+          run_options, s_name, 0, debug_urls=debug_urls)
 
       run_metadata = config_pb2.RunMetadata()
 
@@ -419,7 +427,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       self.assertEqual(s_init_val, sess.run(s))
 
   def testDebugWhileLoopGeneratesMultipleDumps(self):
-    with session.Session() as sess:
+    with session.Session(config=no_rewrite_session_config()) as sess:
       num_iter = 10
 
       # "u" is the Variable being updated in the loop.
@@ -659,7 +667,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       self.assertEqual(x_name, first_bad_datum[0].node_name)
 
   def _session_run_for_graph_structure_lookup(self):
-    with session.Session() as sess:
+    with session.Session(config=no_rewrite_session_config()) as sess:
       u_name = "testDumpGraphStructureLookup/u"
       v_name = "testDumpGraphStructureLookup/v"
       w_name = "testDumpGraphStructureLookup/w"
@@ -798,7 +806,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
     self.assertTrue(dump.loaded_partition_graphs())
 
   def testGraphPathFindingOnControlEdgesWorks(self):
-    with session.Session() as sess:
+    with session.Session(config=no_rewrite_session_config()) as sess:
       v1 = variables.Variable(1.0, name="v1")
       v2 = variables.Variable(2.0, name="v2")
       v3 = variables.Variable(3.0, name="v3")
@@ -814,7 +822,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       self.assertIsNone(dump.find_some_path("v1", "c", include_control=False))
 
   def testGraphPathFindingReverseRefEdgeWorks(self):
-    with session.Session() as sess:
+    with session.Session(config=no_rewrite_session_config()) as sess:
       v = variables.Variable(10.0, name="v")
       delta = variables.Variable(1.0, name="delta")
       inc_v = state_ops.assign_add(v, delta, name="inc_v")
@@ -1164,7 +1172,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       self.assertAllClose(np.array([[-3.0, 0.0]]), x_dumps[0].get_tensor())
 
   def testDebugNumericSummaryOnInitializedTensorGivesCorrectResult(self):
-    with session.Session() as sess:
+    with session.Session(config=no_rewrite_session_config()) as sess:
       a = variables.Variable(
           [
               np.nan, np.nan, 0.0, 0.0, 0.0, -1.0, -3.0, 3.0, 7.0, -np.inf,
@@ -1252,7 +1260,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       self.assertIn("m:0:DebugNumericSummary", dump.debug_watch_keys("m"))
 
   def testDebugNumericSummaryInvalidAttributesStringAreCaught(self):
-    with session.Session() as sess:
+    with session.Session(config=no_rewrite_session_config()) as sess:
       a = variables.Variable(10.0, name="a")
       b = variables.Variable(0.0, name="b")
       c = variables.Variable(0.0, name="c")
@@ -1300,7 +1308,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
         sess.run(y, options=run_options, run_metadata=run_metadata)
 
   def testDebugNumericSummaryMuteOnHealthyMutesOnlyHealthyTensorDumps(self):
-    with session.Session() as sess:
+    with session.Session(config=no_rewrite_session_config()) as sess:
       a = variables.Variable(10.0, name="a")
       b = variables.Variable(0.0, name="b")
       c = variables.Variable(0.0, name="c")
diff --git a/tensorflow/python/debug/lib/stepper_test.py b/tensorflow/python/debug/lib/stepper_test.py
index 78e7b3b5ebaf9f33a808b754775e420750706c15..686fb45238ecc6e19022d82e1c6cdb517f15d9ba 100644
--- a/tensorflow/python/debug/lib/stepper_test.py
+++ b/tensorflow/python/debug/lib/stepper_test.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.lib.stepper import NodeStepper
 from tensorflow.python.framework import constant_op
@@ -52,7 +54,11 @@ class StepperTest(test_util.TensorFlowTestCase):
 
     self.z = math_ops.multiply(self.x, self.y, name="z")  # Should be -4.0.
 
-    self.sess = session.Session()
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        disable_model_pruning=True)
+    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+    config = config_pb2.ConfigProto(graph_options=graph_options)
+    self.sess = session.Session(config=config)
     self.sess.run(variables.global_variables_initializer())
 
   def tearDown(self):
@@ -581,7 +587,11 @@ class StepperAssignAddTest(test_util.TensorFlowTestCase):
                                        1.0,
                                        name="v_add_plus_one")
 
-    self.sess = session.Session()
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        disable_model_pruning=True)
+    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+    config = config_pb2.ConfigProto(graph_options=graph_options)
+    self.sess = session.Session(config=config)
     self.sess.run(self.v.initializer)
 
   def tearDown(self):
@@ -708,7 +718,11 @@ class StepperBackwardRunTest(test_util.TensorFlowTestCase):
     gradient_descent.GradientDescentOptimizer(0.01).minimize(
         self.f, name="optim")
 
-    self.sess = session.Session()
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        disable_model_pruning=True)
+    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+    config = config_pb2.ConfigProto(graph_options=graph_options)
+    self.sess = session.Session(config=config)
     self.sess.run(variables.global_variables_initializer())
 
   def tearDown(self):
diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper.py b/tensorflow/python/debug/wrappers/dumping_wrapper.py
index 63229a85398ef92469bb35e3ee4010c1de7e0ee2..7382cd5fa2c329a04f6bd5c4c35f97b96a65bc32 100644
--- a/tensorflow/python/debug/wrappers/dumping_wrapper.py
+++ b/tensorflow/python/debug/wrappers/dumping_wrapper.py
@@ -77,6 +77,8 @@ class DumpingDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
         raise ValueError(
             "session_root path points to a non-empty directory: %s" %
             session_root)
+    else:
+      gfile.MakeDirs(session_root)
     self._session_root = session_root
 
     self._run_counter = 0
diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
index 5474b0e27f9456ee949f6791d9c4945e57483718..d987ba84b55d6b35e90c5b137714f3eab3ce674c 100644
--- a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
@@ -88,6 +88,14 @@ class DumpingDebugWrapperSessionTest(test_util.TensorFlowTestCase):
       dumping_wrapper.DumpingDebugWrapperSession(
           session.Session(), session_root=file_path, log_usage=False)
 
+  def testConstructWrapperWithNonexistentSessionRootCreatesDirectory(self):
+    new_dir_path = os.path.join(tempfile.mkdtemp(), "new_dir")
+    dumping_wrapper.DumpingDebugWrapperSession(
+        session.Session(), session_root=new_dir_path, log_usage=False)
+    self.assertTrue(gfile.IsDirectory(new_dir_path))
+    # Cleanup.
+    gfile.DeleteRecursively(new_dir_path)
+
   def testDumpingOnASingleRunWorks(self):
     sess = dumping_wrapper.DumpingDebugWrapperSession(
         self.sess, session_root=self.session_root, log_usage=False)
diff --git a/tensorflow/python/debug/wrappers/framework.py b/tensorflow/python/debug/wrappers/framework.py
index 2c239038e44e617ed31b266bb6b6dcef6e3d598d..622950e51d15a5776a3d77448d418d73ff59f446 100644
--- a/tensorflow/python/debug/wrappers/framework.py
+++ b/tensorflow/python/debug/wrappers/framework.py
@@ -193,7 +193,7 @@ class OnRunStartRequest(object):
   """
 
   def __init__(self, fetches, feed_dict, run_options, run_metadata,
-               run_call_count):
+               run_call_count, is_callable_runner=False):
     """Constructor of `OnRunStartRequest`.
 
     Args:
@@ -205,12 +205,15 @@ class OnRunStartRequest(object):
         run() method of a non-wrapped TensorFlow session.
       run_call_count: 1-based count of how many run calls (including this one)
         has been invoked.
+      is_callable_runner: (bool) whether a runner returned by
+        Session.make_callable is being run.
     """
     self.fetches = fetches
     self.feed_dict = feed_dict
     self.run_options = run_options
     self.run_metadata = run_metadata
     self.run_call_count = run_call_count
+    self.is_callable_runner = is_callable_runner
 
 
 class OnRunStartAction(object):
@@ -394,7 +397,13 @@ class BaseDebugWrapperSession(session.SessionInterface):
   def as_default(self):
     return ops.default_session(self)
 
-  def run(self, fetches, feed_dict=None, options=None, run_metadata=None):
+  def run(self,
+          fetches,
+          feed_dict=None,
+          options=None,
+          run_metadata=None,
+          callable_runner=None,
+          callable_runner_args=None):
     """Wrapper around Session.run() that inserts tensor watch options.
 
     Args:
@@ -402,25 +411,39 @@ class BaseDebugWrapperSession(session.SessionInterface):
       feed_dict: Same as the `feed_dict` arg to regular `Session.run()`.
       options: Same as the `options` arg to regular `Session.run()`.
       run_metadata: Same as the `run_metadata` arg to regular `Session.run()`.
+      callable_runner: A `callable` returned by `Session.make_callable()`.
+        If not `None`, `fetches` and `feed_dict` must both be `None`.
+      callable_runner_args: An optional list of arguments to `callable_runner`.
 
     Returns:
       Simply forwards the output of the wrapped `Session.run()` call.
 
     Raises:
-      ValueError: On invalid `OnRunStartAction` value.
+      ValueError: On invalid `OnRunStartAction` value. Or if `callable_runner`
+        is not `None` and either or both of `fetches` and `feed_dict` is `None`.
     """
+    if not callable_runner:
+      self._run_call_count += 1
+    else:
+      if fetches or feed_dict:
+        raise ValueError(
+            "callable_runner and fetches/feed_dict are mutually exclusive, but "
+            "are used simultaneously.")
 
-    self._run_call_count += 1
     if self._is_disabled_thread():
-      return self._sess.run(fetches,
-                            feed_dict=feed_dict,
-                            options=options,
-                            run_metadata=run_metadata)
+      if callable_runner:
+        return callable_runner(*callable_runner_args)
+      else:
+        return self._sess.run(fetches,
+                              feed_dict=feed_dict,
+                              options=options,
+                              run_metadata=run_metadata)
 
     # Invoke on-run-start callback and obtain response.
     run_start_resp = self.on_run_start(
         OnRunStartRequest(fetches, feed_dict, options, run_metadata,
-                          self._run_call_count))
+                          self._run_call_count,
+                          is_callable_runner=bool(callable_runner)))
     _check_type(run_start_resp, OnRunStartResponse)
 
     if run_start_resp.action == OnRunStartAction.DEBUG_RUN:
@@ -443,10 +466,15 @@ class BaseDebugWrapperSession(session.SessionInterface):
       # runtime errors.
       tf_error = None
       try:
-        retvals = self._sess.run(fetches,
-                                 feed_dict=feed_dict,
-                                 options=decorated_run_options,
-                                 run_metadata=run_metadata)
+        if callable_runner:
+          retvals = callable_runner(*callable_runner_args,
+                                    options=decorated_run_options,
+                                    run_metadata=run_metadata)
+        else:
+          retvals = self._sess.run(fetches,
+                                   feed_dict=feed_dict,
+                                   options=decorated_run_options,
+                                   run_metadata=run_metadata)
       except errors.OpError as op_error:
         tf_error = op_error
         retvals = op_error
@@ -461,17 +489,26 @@ class BaseDebugWrapperSession(session.SessionInterface):
       decorated_run_options = options or config_pb2.RunOptions()
       run_metadata = run_metadata or config_pb2.RunMetadata()
       self._decorate_run_options_for_profile(decorated_run_options)
-      retvals = self._sess.run(fetches,
-                               feed_dict=feed_dict,
-                               options=decorated_run_options,
-                               run_metadata=run_metadata)
+      if callable_runner:
+        retvals = callable_runner(*callable_runner_args,
+                                  options=decorated_run_options,
+                                  run_metadata=run_metadata)
+      else:
+        retvals = self._sess.run(fetches,
+                                 feed_dict=feed_dict,
+                                 options=decorated_run_options,
+                                 run_metadata=run_metadata)
       run_end_req = OnRunEndRequest(
           run_start_resp.action,
           run_metadata=run_metadata,
           client_graph_def=self._sess.graph.as_graph_def())
-
     elif (run_start_resp.action == OnRunStartAction.NON_DEBUG_RUN or
           run_start_resp.action == OnRunStartAction.INVOKE_STEPPER):
+      if callable_runner:
+        raise NotImplementedError(
+            "Stepper mode is not implemented for callables created by "
+            "Session.make_callable().")
+
       if run_start_resp.action == OnRunStartAction.INVOKE_STEPPER:
         with stepper.NodeStepper(
             self._sess, fetches, feed_dict) as node_stepper:
@@ -512,6 +549,28 @@ class BaseDebugWrapperSession(session.SessionInterface):
     raise NotImplementedError(
         "partial_run is not implemented for debug-wrapper sessions.")
 
+  def list_devices(self, *args, **kwargs):
+    return self._sess.list_devices(*args, **kwargs)
+
+  def reset(self, *args, **kwargs):
+    return self._sess.reset(*args, **kwargs)
+
+  def make_callable(self,
+                    fetches,
+                    feed_list=None,
+                    accept_options=False):
+    runner = self._sess.make_callable(
+        fetches, feed_list=feed_list, accept_options=True)
+    def wrapped_runner(*runner_args, **kwargs):
+      return self.run(None,
+                      feed_dict=None,
+                      options=kwargs.get("options", None),
+                      run_metadata=kwargs.get("run_metadata", None),
+                      callable_runner=runner,
+                      callable_runner_args=runner_args)
+
+    return wrapped_runner
+
   def _decorate_run_options_for_debug(
       self,
       run_options,
@@ -618,6 +677,9 @@ class BaseDebugWrapperSession(session.SessionInterface):
   def __exit__(self, exec_type, exec_value, exec_tb):
     self._sess.__exit__(exec_type, exec_value, exec_tb)
 
+  def __del__(self):
+    self._sess.__del__()
+
   def close(self):
     self._sess.close()
 
diff --git a/tensorflow/python/debug/wrappers/framework_test.py b/tensorflow/python/debug/wrappers/framework_test.py
index 2b2289d6a81be836041dfc58ff2072af0c1cb95a..3991708b578ab7afdef44f73182301e1e4c5fb09 100644
--- a/tensorflow/python/debug/wrappers/framework_test.py
+++ b/tensorflow/python/debug/wrappers/framework_test.py
@@ -24,6 +24,8 @@ import threading
 
 import numpy as np
 
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.debug.wrappers import framework
@@ -38,6 +40,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
+from tensorflow.python.util import tf_inspect
 
 
 class TestDebugWrapperSession(framework.BaseDebugWrapperSession):
@@ -139,6 +142,12 @@ class TestDebugWrapperSessionBadAction(framework.BaseDebugWrapperSession):
 
 class DebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
+  def _no_rewrite_session_config(self):
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        disable_model_pruning=True)
+    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+    return config_pb2.ConfigProto(graph_options=graph_options)
+
   def setUp(self):
     self._observer = {
         "sess_init_count": 0,
@@ -153,7 +162,7 @@ class DebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
     self._dump_root = tempfile.mkdtemp()
 
-    self._sess = session.Session()
+    self._sess = session.Session(config=self._no_rewrite_session_config())
 
     self._a_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]])
     self._b_init_val = np.array([[2.0], [-1.0]])
@@ -387,5 +396,28 @@ class DebugWrapperSessionTest(test_util.TensorFlowTestCase):
         [datum.node_name for datum in dump.dumped_tensor_data])
 
 
+def _is_public_method_name(method_name):
+  return (method_name.startswith("__") and method_name.endswith("__")
+          or not method_name.startswith("_"))
+
+
+class SessionWrapperPublicMethodParityTest(test_util.TensorFlowTestCase):
+
+  def testWrapperHasAllPublicMethodsOfSession(self):
+    session_public_methods = [
+        method_tuple[0] for method_tuple in
+        tf_inspect.getmembers(session.Session, predicate=tf_inspect.ismethod)
+        if _is_public_method_name(method_tuple[0])]
+    wrapper_public_methods = [
+        method_tuple[0] for method_tuple in
+        tf_inspect.getmembers(
+            framework.BaseDebugWrapperSession, predicate=tf_inspect.ismethod)
+        if _is_public_method_name(method_tuple[0])]
+    missing_public_methods = [
+        method for method in session_public_methods
+        if method not in wrapper_public_methods]
+    self.assertFalse(missing_public_methods)
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper.py b/tensorflow/python/debug/wrappers/local_cli_wrapper.py
index fe822df6ce3ca21a26825abc8385f8c120f55b0c..3dddc43d82ebd7680f1a030ad26faa454045d719 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper.py
@@ -23,9 +23,12 @@ import shutil
 import sys
 import tempfile
 
+import six
+
 # Google-internal import(s).
 from tensorflow.python.debug.cli import analyzer_cli
 from tensorflow.python.debug.cli import cli_shared
+from tensorflow.python.debug.cli import command_parser
 from tensorflow.python.debug.cli import debugger_cli_common
 from tensorflow.python.debug.cli import profile_analyzer_cli
 from tensorflow.python.debug.cli import stepper_cli
@@ -181,6 +184,9 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
         usage=argparse.SUPPRESS)
     self._argparsers["run_info"] = ap
 
+    self._argparsers["print_feed"] = command_parser.get_print_tensor_argparser(
+        "Print the value of a feed in feed_dict.")
+
   def add_tensor_filter(self, filter_name, tensor_filter):
     """Add a tensor filter.
 
@@ -218,8 +224,9 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
       An instance of `OnRunStartResponse`.
     """
     self._is_run_start = True
-    self._update_run_calls_state(request.run_call_count, request.fetches,
-                                 request.feed_dict)
+    self._update_run_calls_state(
+        request.run_call_count, request.fetches, request.feed_dict,
+        is_callable_runner=request.is_callable_runner)
 
     if self._active_tensor_filter:
       # If we are running till a filter passes, we just need to keep running
@@ -439,6 +446,44 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
 
     return output
 
+  def _print_feed_handler(self, args, screen_info=None):
+    np_printoptions = cli_shared.numpy_printoptions_from_screen_info(
+        screen_info)
+
+    if not self._feed_dict:
+      return cli_shared.error(
+          "The feed_dict of the current run is None or empty.")
+
+    parsed = self._argparsers["print_feed"].parse_args(args)
+    tensor_name, tensor_slicing = (
+        command_parser.parse_tensor_name_with_slicing(parsed.tensor_name))
+
+    feed_key = None
+    feed_value = None
+    for key in self._feed_dict:
+      if isinstance(key, six.string_types):
+        if key == tensor_name:
+          feed_key = key
+      elif key.name == tensor_name:
+        feed_key = key.name
+      if feed_key is not None:
+        feed_value = self._feed_dict[key]
+        break
+
+    if feed_key is None:
+      return cli_shared.error(
+          "The feed_dict of the current run does not contain the key %s" %
+          tensor_name)
+    else:
+      return cli_shared.format_tensor(
+          feed_value,
+          feed_key + " (feed)",
+          np_printoptions,
+          print_all=parsed.print_all,
+          tensor_slicing=tensor_slicing,
+          highlight_options=cli_shared.parse_ranges_highlight(parsed.ranges),
+          include_numeric_summary=parsed.numeric_summary)
+
   def _run_handler(self, args, screen_info=None):
     """Command handler for "run" command during on-run-start."""
 
@@ -503,11 +548,21 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
         self._run_info_handler,
         self._argparsers["run_info"].format_help(),
         prefix_aliases=["ri"])
+    curses_cli.register_command_handler(
+        "print_feed",
+        self._print_feed_handler,
+        self._argparsers["print_feed"].format_help(),
+        prefix_aliases=["pf"])
 
     if self._tensor_filters:
       # Register tab completion for the filter names.
       curses_cli.register_tab_comp_context(["run", "r"],
                                            list(self._tensor_filters.keys()))
+    if self._feed_dict:
+      # Register tab completion for feed_dict keys.
+      feed_keys = [(key if isinstance(key, six.string_types) else key.name)
+                   for key in self._feed_dict.keys()]
+      curses_cli.register_tab_comp_context(["print_feed", "pf"], feed_keys)
 
   def _on_run_start_step_handler(self, args, screen_info=None):
     """Command handler for "invoke_stepper" command during on-run-start."""
@@ -532,7 +587,11 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
 
     return ["file://" + self._dump_root]
 
-  def _update_run_calls_state(self, run_call_count, fetches, feed_dict):
+  def _update_run_calls_state(self,
+                              run_call_count,
+                              fetches,
+                              feed_dict,
+                              is_callable_runner=False):
     """Update the internal state with regard to run() call history.
 
     Args:
@@ -542,18 +601,25 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
         call.
       feed_dict: None of a dict. This is the feed_dict argument to the run()
         call.
+      is_callable_runner: (bool) whether a runner returned by
+        Session.make_callable is being run.
     """
 
     self._run_call_count = run_call_count
-    self._run_description = cli_shared.get_run_short_description(run_call_count,
-                                                                 fetches,
-                                                                 feed_dict)
+    self._feed_dict = feed_dict
+    self._run_description = cli_shared.get_run_short_description(
+        run_call_count,
+        fetches,
+        feed_dict,
+        is_callable_runner=is_callable_runner)
     self._run_through_times -= 1
 
-    self._run_info = cli_shared.get_run_start_intro(run_call_count,
-                                                    fetches,
-                                                    feed_dict,
-                                                    self._tensor_filters)
+    self._run_info = cli_shared.get_run_start_intro(
+        run_call_count,
+        fetches,
+        feed_dict,
+        self._tensor_filters,
+        is_callable_runner=is_callable_runner)
 
   def invoke_node_stepper(self,
                           node_stepper,
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
index f8e32eca25e7f97060527a8c408e16b1e556bbbb..cde161469a38a9930a9bef8127914f0154ac4287 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
@@ -21,6 +21,7 @@ import os
 import shutil
 import tempfile
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.cli import cli_shared
 from tensorflow.python.debug.cli import debugger_cli_common
@@ -49,14 +50,16 @@ class LocalCLIDebuggerWrapperSessionForTest(
   """
 
   def __init__(self,
-               command_args_sequence,
+               command_sequence,
                sess,
                dump_root=None):
     """Constructor of the for-test subclass.
 
     Args:
-      command_args_sequence: (list of list of str) A list of arguments for the
-        "run" command.
+      command_sequence: (list of list of str) A list of command arguments,
+        including the command prefix, each element of the list is such as:
+        ["run", "-n"],
+        ["print_feed", "input:0"].
       sess: See the doc string of LocalCLIDebugWrapperSession.__init__.
       dump_root: See the doc string of LocalCLIDebugWrapperSession.__init__.
     """
@@ -64,8 +67,8 @@ class LocalCLIDebuggerWrapperSessionForTest(
     local_cli_wrapper.LocalCLIDebugWrapperSession.__init__(
         self, sess, dump_root=dump_root, log_usage=False)
 
-    self._command_args_sequence = command_args_sequence
-    self._response_pointer = 0
+    self._command_sequence = command_sequence
+    self._command_pointer = 0
 
     # Observer variables.
     self.observers = {
@@ -73,6 +76,7 @@ class LocalCLIDebuggerWrapperSessionForTest(
         "tf_errors": [],
         "run_start_cli_run_numbers": [],
         "run_end_cli_run_numbers": [],
+        "print_feed_responses": [],
         "profiler_py_graphs": [],
         "profiler_run_metadata": [],
     }
@@ -94,15 +98,20 @@ class LocalCLIDebuggerWrapperSessionForTest(
     else:
       self.observers["run_end_cli_run_numbers"].append(self._run_call_count)
 
-    command_args = self._command_args_sequence[self._response_pointer]
-    self._response_pointer += 1
+    while True:
+      command = self._command_sequence[self._command_pointer]
+      self._command_pointer += 1
 
-    try:
-      self._run_handler(command_args)
-    except debugger_cli_common.CommandLineExit as e:
-      response = e.exit_token
-
-    return response
+      try:
+        if command[0] == "run":
+          self._run_handler(command[1:])
+        elif command[0] == "print_feed":
+          self.observers["print_feed_responses"].append(
+              self._print_feed_handler(command[1:]))
+        else:
+          raise ValueError("Unrecognized command prefix: %s" % command[0])
+      except debugger_cli_common.CommandLineExit as e:
+        return e.exit_token
 
 
 class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
@@ -168,9 +177,8 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
           session.Session(), dump_root=file_path, log_usage=False)
 
   def testRunsUnderDebugMode(self):
-    # Test command sequence: run; run; run;
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
-        [[], [], []], self.sess, dump_root=self._tmp_dir)
+        [["run"], ["run"], ["run"]], self.sess, dump_root=self._tmp_dir)
 
     # run under debug mode twice.
     wrapped_sess.run(self.inc_v)
@@ -192,9 +200,8 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual([None, None], wrapped_sess.observers["tf_errors"])
 
   def testRunsWithEmptyStringDumpRootWorks(self):
-    # Test command sequence: run, run
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
-        [[], []], self.sess, dump_root="")
+        [["run"], ["run"]], self.sess, dump_root="")
 
     # run under debug mode.
     wrapped_sess.run(self.inc_v)
@@ -203,7 +210,7 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
   def testRunInfoOutputAtRunEndIsCorrect(self):
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
-        [[], [], []], self.sess, dump_root=self._tmp_dir)
+        [["run"], ["run"], ["run"]], self.sess, dump_root=self._tmp_dir)
 
     wrapped_sess.run(self.inc_v)
     run_info_output = wrapped_sess._run_info_handler([])
@@ -226,9 +233,9 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self.assertIn("list_tensors", menu.captions())
 
   def testRunsUnderNonDebugMode(self):
-    # Test command sequence: run -n; run -n; run -n;
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
-        [["-n"], ["-n"], ["-n"]], self.sess, dump_root=self._tmp_dir)
+        [["run", "-n"], ["run", "-n"], ["run", "-n"]],
+        self.sess, dump_root=self._tmp_dir)
 
     # run three times.
     wrapped_sess.run(self.inc_v)
@@ -242,10 +249,10 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual([], wrapped_sess.observers["run_end_cli_run_numbers"])
 
   def testRunsUnderNonDebugThenDebugMode(self):
-    # Test command sequence: run -n; run -n; run; run;
     # Do two NON_DEBUG_RUNs, followed by DEBUG_RUNs.
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
-        [["-n"], ["-n"], [], []], self.sess, dump_root=self._tmp_dir)
+        [["run", "-n"], ["run", "-n"], ["run"], ["run"]],
+        self.sess, dump_root=self._tmp_dir)
 
     # run three times.
     wrapped_sess.run(self.inc_v)
@@ -264,9 +271,9 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual([None], wrapped_sess.observers["tf_errors"])
 
   def testRunMultipleTimesWithinLimit(self):
-    # Test command sequence: run -t 3; run;
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
-        [["-t", "3"], []], self.sess, dump_root=self._tmp_dir)
+        [["run", "-t", "3"], ["run"]],
+        self.sess, dump_root=self._tmp_dir)
 
     # run three times.
     wrapped_sess.run(self.inc_v)
@@ -281,9 +288,8 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual([None], wrapped_sess.observers["tf_errors"])
 
   def testRunMultipleTimesOverLimit(self):
-    # Test command sequence: run -t 3;
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
-        [["-t", "3"]], self.sess, dump_root=self._tmp_dir)
+        [["run", "-t", "3"]], self.sess, dump_root=self._tmp_dir)
 
     # run twice, which is less than the number of times specified by the
     # command.
@@ -298,9 +304,9 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual([], wrapped_sess.observers["tf_errors"])
 
   def testRunMixingDebugModeAndMultpleTimes(self):
-    # Test command sequence: run -n; run -t 2; run; run;
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
-        [["-n"], ["-t", "2"], [], []], self.sess, dump_root=self._tmp_dir)
+        [["run", "-n"], ["run", "-t", "2"], ["run"], ["run"]],
+        self.sess, dump_root=self._tmp_dir)
 
     # run four times.
     wrapped_sess.run(self.inc_v)
@@ -316,9 +322,56 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual(2, len(wrapped_sess.observers["debug_dumps"]))
     self.assertEqual([None, None], wrapped_sess.observers["tf_errors"])
 
+  def testDebuggingMakeCallableTensorRunnerWorks(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["run"], ["run"]], self.sess, dump_root=self._tmp_dir)
+    v = variables.Variable(42)
+    tensor_runner = wrapped_sess.make_callable(v)
+    self.sess.run(v.initializer)
+
+    self.assertAllClose(42, tensor_runner())
+    self.assertEqual(1, len(wrapped_sess.observers["debug_dumps"]))
+
+  def testDebuggingMakeCallableTensorRunnerWithCustomRunOptionsWorks(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["run"], ["run"]], self.sess, dump_root=self._tmp_dir)
+    a = constant_op.constant(42)
+    tensor_runner = wrapped_sess.make_callable(a)
+
+    run_options = config_pb2.RunOptions(
+        trace_level=config_pb2.RunOptions.FULL_TRACE)
+    run_metadata = config_pb2.RunMetadata()
+    self.assertAllClose(
+        42, tensor_runner(options=run_options, run_metadata=run_metadata))
+    self.assertEqual(1, len(wrapped_sess.observers["debug_dumps"]))
+    self.assertGreater(len(run_metadata.step_stats.dev_stats), 0)
+
+  def testDebuggingMakeCallableOperationRunnerWorks(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["run"], ["run"]], self.sess, dump_root=self._tmp_dir)
+    v = variables.Variable(10.0)
+    inc_v = state_ops.assign_add(v, 1.0)
+    op_runner = wrapped_sess.make_callable(inc_v.op)
+    self.sess.run(v.initializer)
+
+    op_runner()
+    self.assertEqual(1, len(wrapped_sess.observers["debug_dumps"]))
+    self.assertEqual(11.0, self.sess.run(v))
+
+  def testDebuggingMakeCallableRunnerWithFeedListWorks(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["run"], ["run"]], self.sess, dump_root=self._tmp_dir)
+    ph1 = array_ops.placeholder(dtypes.float32)
+    ph2 = array_ops.placeholder(dtypes.float32)
+    a = math_ops.add(ph1, ph2)
+    tensor_runner = wrapped_sess.make_callable(a, feed_list=[ph1, ph2])
+
+    self.assertAllClose(42.0, tensor_runner(41.0, 1.0))
+    self.assertEqual(1, len(wrapped_sess.observers["debug_dumps"]))
+
   def testRuntimeErrorShouldBeCaught(self):
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
-        [[], []], self.sess, dump_root=self._tmp_dir)
+        [["run"], ["run"]], self.sess, dump_root=self._tmp_dir)
 
     # Do a run that should lead to an TensorFlow runtime error.
     wrapped_sess.run(self.y, feed_dict={self.ph: [[0.0], [1.0], [2.0]]})
@@ -338,15 +391,15 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
       w = variables.Variable([1.0] * 10, name="w")
 
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
-        [[]], self.sess, dump_root=self._tmp_dir)
+        [["run"]], self.sess, dump_root=self._tmp_dir)
     with self.assertRaisesRegexp(errors.OpError, r".*[Dd]evice.*1337.*"):
       wrapped_sess.run(w)
 
   def testRunTillFilterPassesShouldLaunchCLIAtCorrectRun(self):
-    # Test command sequence:
-    #   run -f greater_than_twelve; run -f greater_than_twelve; run;
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
-        [["-f", "v_greater_than_twelve"], ["-f", "v_greater_than_twelve"], []],
+        [["run", "-f", "v_greater_than_twelve"],
+         ["run", "-f", "v_greater_than_twelve"],
+         ["run"]],
         self.sess,
         dump_root=self._tmp_dir)
 
@@ -375,12 +428,10 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual([None, None], wrapped_sess.observers["tf_errors"])
 
   def testRunsUnderDebugModeWithWatchFnFilteringNodeNames(self):
-    # Test command sequence:
-    #   run --node_name_filter inc.*
-    #   run --node_name_filter delta
-    #   run
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
-        [["--node_name_filter", "inc.*"], ["--node_name_filter", "delta"], []],
+        [["run", "--node_name_filter", "inc.*"],
+         ["run", "--node_name_filter", "delta"],
+         ["run"]],
         self.sess, dump_root=self._tmp_dir)
 
     # run under debug mode twice.
@@ -402,14 +453,10 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual("delta", dumps.dumped_tensor_data[0].node_name)
 
   def testRunsUnderDebugModeWithWatchFnFilteringOpTypes(self):
-    # Test command sequence:
-    #   run --node_name_filter delta
-    #   run --op_type_filter AssignAdd
-    #   run
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
-        [["--node_name_filter", "delta"],
-         ["--op_type_filter", "AssignAdd"],
-         []],
+        [["run", "--node_name_filter", "delta"],
+         ["run", "--op_type_filter", "AssignAdd"],
+         ["run"]],
         self.sess, dump_root=self._tmp_dir)
 
     # run under debug mode twice.
@@ -431,13 +478,10 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual("inc_v", dumps.dumped_tensor_data[0].node_name)
 
   def testRunsUnderDebugModeWithWatchFnFilteringTensorDTypes(self):
-    # Test command sequence:
-    #   run --op_type_filter Variable.*
-    #   run --dtype_filter int32
-    #   run
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
-        [["--op_type_filter", "Variable.*"],
-         ["--tensor_dtype_filter", "int32"], []],
+        [["run", "--op_type_filter", "Variable.*"],
+         ["run", "--tensor_dtype_filter", "int32"],
+         ["run"]],
         self.sess, dump_root=self._tmp_dir)
 
     # run under debug mode twice.
@@ -459,11 +503,9 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
         [dumps.dumped_tensor_data[i].node_name for i in [0, 1]])
 
   def testRunsUnderDebugModeWithWatchFnFilteringOpTypesAndTensorDTypes(self):
-    # Test command sequence:
-    #   run --op_type_filter Cast --dtype_filter int32
-    #   run
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
-        [["--op_type_filter", "Cast", "--tensor_dtype_filter", "int32"], []],
+        [["run", "--op_type_filter", "Cast", "--tensor_dtype_filter", "int32"],
+         ["run"]],
         self.sess, dump_root=self._tmp_dir)
 
     # run under debug mode twice.
@@ -476,9 +518,59 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual(1, dumps.size)
     self.assertEqual("w_int_inner", dumps.dumped_tensor_data[0].node_name)
 
+  def testPrintFeedPrintsFeedValueForTensorFeedKey(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["print_feed", "ph:0"], ["run"], ["run"]], self.sess)
+
+    self.assertAllClose(
+        [[5.0], [-1.0]],
+        wrapped_sess.run(self.y, feed_dict={self.ph: [[0.0, 1.0, 2.0]]}))
+    print_feed_responses = wrapped_sess.observers["print_feed_responses"]
+    self.assertEqual(1, len(print_feed_responses))
+    self.assertEqual(
+        ["Tensor \"ph:0 (feed)\":", "", "[[0.0, 1.0, 2.0]]"],
+        print_feed_responses[0].lines)
+
+  def testPrintFeedPrintsFeedValueForTensorNameFeedKey(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["print_feed", "ph:0"], ["run"], ["run"]], self.sess)
+
+    self.assertAllClose(
+        [[5.0], [-1.0]],
+        wrapped_sess.run(self.y, feed_dict={"ph:0": [[0.0, 1.0, 2.0]]}))
+    print_feed_responses = wrapped_sess.observers["print_feed_responses"]
+    self.assertEqual(1, len(print_feed_responses))
+    self.assertEqual(
+        ["Tensor \"ph:0 (feed)\":", "", "[[0.0, 1.0, 2.0]]"],
+        print_feed_responses[0].lines)
+
+  def testPrintFeedPrintsErrorForInvalidFeedKey(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["print_feed", "spam"], ["run"], ["run"]], self.sess)
+
+    self.assertAllClose(
+        [[5.0], [-1.0]],
+        wrapped_sess.run(self.y, feed_dict={"ph:0": [[0.0, 1.0, 2.0]]}))
+    print_feed_responses = wrapped_sess.observers["print_feed_responses"]
+    self.assertEqual(1, len(print_feed_responses))
+    self.assertEqual(
+        ["ERROR: The feed_dict of the current run does not contain the key "
+         "spam"], print_feed_responses[0].lines)
+
+  def testPrintFeedPrintsErrorWhenFeedDictIsNone(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["print_feed", "spam"], ["run"], ["run"]], self.sess)
+
+    wrapped_sess.run(self.w_int)
+    print_feed_responses = wrapped_sess.observers["print_feed_responses"]
+    self.assertEqual(1, len(print_feed_responses))
+    self.assertEqual(
+        ["ERROR: The feed_dict of the current run is None or empty."],
+        print_feed_responses[0].lines)
+
   def testRunUnderProfilerModeWorks(self):
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
-        [["-p"], []], self.sess)
+        [["run", "-p"], ["run"]], self.sess)
 
     wrapped_sess.run(self.w_int)
 
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 0cfd02466e98e18b0a60789c57dacb186bed7d9a..10f1bef0608beb74d6c4300377fec25ed951f63e 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -26,6 +26,7 @@ py_library(
         ":model_fn",
         ":parsing_utils",
         ":run_config",
+        ":util",
         "//tensorflow/python:util",
     ],
 )
@@ -212,6 +213,27 @@ py_test(
     ],
 )
 
+py_library(
+    name = "util",
+    srcs = [
+        "util.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:util",
+    ],
+)
+
+py_test(
+    name = "util_test",
+    srcs = ["util_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":util",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_library(
     name = "estimator",
     srcs = [
@@ -222,6 +244,7 @@ py_library(
         ":export",
         ":model_fn",
         ":run_config",
+        ":util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
         "//tensorflow/python:control_flow_ops",
@@ -373,7 +396,6 @@ py_library(
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:logging_ops",
         "//tensorflow/python:lookup_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
@@ -381,7 +403,7 @@ py_library(
         "//tensorflow/python:platform",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
-        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:summary",
         "//tensorflow/python:weights_broadcast_ops",
         "//tensorflow/python/feature_column",
         "//tensorflow/python/ops/losses",
@@ -394,7 +416,9 @@ py_test(
     size = "small",
     srcs = ["canned/head_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
+        ":dnn_testing_utils",
         ":head",
         ":metric_keys",
         ":model_fn",
@@ -476,7 +500,10 @@ py_test(
     size = "medium",
     srcs = ["canned/linear_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "noasan",  # times out b/63680444
+    ],
     deps = [
         ":estimator",
         ":export_export",
diff --git a/tensorflow/python/estimator/canned/dnn.py b/tensorflow/python/estimator/canned/dnn.py
index 013a43a8b086e6416fe5d9ca67bab81c5c420054..02ceab38b8d4ce3bea47f30aa9103166e49f634e 100644
--- a/tensorflow/python/estimator/canned/dnn.py
+++ b/tensorflow/python/estimator/canned/dnn.py
@@ -50,7 +50,7 @@ def _dnn_model_fn(
   """Deep Neural Net model_fn.
 
   Args:
-    features: Dict of `Tensor` (depends on data passed to `train`).
+    features: dict of `Tensor`.
     labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of
       dtype `int32` or `int64` in the range `[0, n_classes)`.
     mode: Defines whether this is training, evaluation or prediction.
@@ -72,7 +72,13 @@ def _dnn_model_fn(
     predictions: A dict of `Tensor` objects.
     loss: A scalar containing the loss of the step.
     train_op: The op for training.
+
+  Raises:
+    ValueError: If features has the wrong type.
   """
+  if not isinstance(features, dict):
+    raise ValueError('features should be a dictionary of `Tensor`s. '
+                     'Given type: {}'.format(type(features)))
   optimizer = optimizers.get_optimizer_instance(
       optimizer, learning_rate=_LEARNING_RATE)
   num_ps_replicas = config.num_ps_replicas if config else 0
@@ -188,6 +194,8 @@ class DNNClassifier(estimator.Estimator):
       name. Both features' `value` must be a `SparseTensor`.
     - if `column` is a `_DenseColumn`, a feature with `key=column.name`
       whose `value` is a `Tensor`.
+
+  Loss is calculated by using softmax cross entropy.
   """
 
   def __init__(self,
@@ -319,6 +327,8 @@ class DNNRegressor(estimator.Estimator):
       name. Both features' `value` must be a `SparseTensor`.
     - if `column` is a `_DenseColumn`, a feature with `key=column.name`
       whose `value` is a `Tensor`.
+
+  Loss is calculated by using mean squared error.
   """
 
   def __init__(self,
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined.py b/tensorflow/python/estimator/canned/dnn_linear_combined.py
index 935f6564eb567924eb1b534286e23377c17284f8..cd71c379401592d4dc5301ef3eb1a9585f88bad1 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined.py
@@ -85,7 +85,7 @@ def _dnn_linear_combined_model_fn(
   """Deep Neural Net and Linear combined model_fn.
 
   Args:
-    features: `Tensor` or dict of `Tensor` (depends on data passed to `fit`).
+    features: dict of `Tensor`.
     labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of dtype
       `int32` or `int64` in the range `[0, n_classes)`.
     mode: Defines whether this is training, evaluation or prediction.
@@ -114,8 +114,12 @@ def _dnn_linear_combined_model_fn(
 
   Raises:
     ValueError: If both `linear_feature_columns` and `dnn_features_columns`
-      are empty at the same time, or `input_layer_partitioner` is missing.
+      are empty at the same time, or `input_layer_partitioner` is missing,
+      or features has the wrong type.
   """
+  if not isinstance(features, dict):
+    raise ValueError('features should be a dictionary of `Tensor`s. '
+                     'Given type: {}'.format(type(features)))
   if not linear_feature_columns and not dnn_feature_columns:
     raise ValueError(
         'Either linear_feature_columns or dnn_feature_columns must be defined.')
@@ -295,6 +299,7 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
     - if `column` is a `_DenseColumn`, a feature with `key=column.name`
       whose `value` is a `Tensor`.
 
+  Loss is calculated by using softmax cross entropy.
   """
 
   def __init__(self,
@@ -453,6 +458,7 @@ class DNNLinearCombinedRegressor(estimator.Estimator):
     - if `column` is a `_DenseColumn`, a feature with `key=column.name`
       whose `value` is a `Tensor`.
 
+  Loss is calculated by using mean squared error.
   """
 
   def __init__(self,
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined_test.py b/tensorflow/python/estimator/canned/dnn_linear_combined_test.py
index 486b302ab5c57fefe1beffb5864a688b91677a61..2151df8423774f0e6f9e51a114efe66472204962 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined_test.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined_test.py
@@ -664,7 +664,8 @@ class DNNLinearCombinedTests(test.TestCase):
       self.assertTrue(
           all([name.startswith(var_name_prefix) for name in var_names]))
       # var is used to check this op called by training.
-      var = variables_lib.Variable(0., name=(var_name_prefix + '_called'))
+      with ops.name_scope(''):
+        var = variables_lib.Variable(0., name=(var_name_prefix + '_called'))
       with ops.control_dependencies([var.assign(100.)]):
         return real_optimizer.minimize(loss, global_step, var_list)
 
@@ -695,11 +696,11 @@ class DNNLinearCombinedTests(test.TestCase):
     # verifies train_op fires linear minimize op
     self.assertEqual(100.,
                      checkpoint_utils.load_variable(
-                         self._model_dir, 'binary_logistic_head/linear_called'))
+                         self._model_dir, 'linear_called'))
     # verifies train_op fires dnn minimize op
     self.assertEqual(100.,
                      checkpoint_utils.load_variable(
-                         self._model_dir, 'binary_logistic_head/dnn_called'))
+                         self._model_dir, 'dnn_called'))
 
   def test_dnn_and_linear_logits_are_added(self):
     with ops.Graph().as_default():
diff --git a/tensorflow/python/estimator/canned/dnn_testing_utils.py b/tensorflow/python/estimator/canned/dnn_testing_utils.py
index 269da5246d5dd0a29225f84bb92c7300f9121a19..0be447604cb138b67f81034c10e7f48e2d438dbb 100644
--- a/tensorflow/python/estimator/canned/dnn_testing_utils.py
+++ b/tensorflow/python/estimator/canned/dnn_testing_utils.py
@@ -436,6 +436,33 @@ class BaseDNNModelFnTest(object):
           else:
             self.fail('Invalid mode: {}'.format(mode))
 
+  def test_features_tensor_raises_value_error(self):
+    """Tests that passing a Tensor for features raises a ValueError."""
+    hidden_units = (2, 2)
+    logits_dimension = 3
+    inputs = ([[10.]], [[8.]])
+    expected_logits = [[0, 0, 0]]
+
+    with ops.Graph().as_default():
+      training_util.create_global_step()
+      head = mock_head(
+          self,
+          hidden_units=hidden_units,
+          logits_dimension=logits_dimension,
+          expected_logits=expected_logits)
+      with self.assertRaisesRegexp(ValueError, 'features should be a dict'):
+        self._dnn_model_fn(
+            features=constant_op.constant(inputs),
+            labels=constant_op.constant([[1]]),
+            mode=model_fn.ModeKeys.TRAIN,
+            head=head,
+            hidden_units=hidden_units,
+            feature_columns=[
+                feature_column.numeric_column(
+                    'age', shape=np.array(inputs).shape[1:])
+            ],
+            optimizer=mock_optimizer(self, hidden_units))
+
 
 class BaseDNNClassifierEvaluateTest(object):
 
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index a1c1f1be0ba74494f9003a28bf32c8a95f66e084..bc868a493ffe791a28b87bd6a73ca5965a322188 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -32,17 +32,16 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import string_ops
-from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.summary import summary
 
 _DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
 
@@ -367,10 +366,7 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
   def create_estimator_spec(
       self, features, mode, logits, labels=None, train_op_fn=None):
     """See `Head`."""
-    with variable_scope.variable_scope(
-        None,
-        default_name='multi_class_head',
-        values=(tuple(six.itervalues(features)) + (labels, logits))):
+    with ops.name_scope('head'):
       logits = _check_logits(logits, self.logits_dimension)
 
       # Predict.
@@ -441,17 +437,18 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
       # Train.
       if train_op_fn is None:
         raise ValueError('train_op_fn can not be None.')
-      logging_ops.scalar_summary(metric_keys.MetricKeys.LOSS, training_loss)
-      logging_ops.scalar_summary(
-          metric_keys.MetricKeys.LOSS_MEAN,
-          losses.compute_weighted_loss(
-              unweighted_loss, weights=weights,
-              reduction=losses.Reduction.MEAN))
-      return model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.TRAIN,
-          predictions=predictions,
-          loss=training_loss,
-          train_op=train_op_fn(training_loss))
+    with ops.name_scope(''):
+      summary.scalar(metric_keys.MetricKeys.LOSS, training_loss)
+      summary.scalar(metric_keys.MetricKeys.LOSS_MEAN,
+                     losses.compute_weighted_loss(
+                         unweighted_loss,
+                         weights=weights,
+                         reduction=losses.Reduction.MEAN))
+    return model_fn.EstimatorSpec(
+        mode=model_fn.ModeKeys.TRAIN,
+        predictions=predictions,
+        loss=training_loss,
+        train_op=train_op_fn(training_loss))
 
 
 def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
@@ -579,32 +576,31 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
   def create_estimator_spec(
       self, features, mode, logits, labels=None, train_op_fn=None):
     """See `Head`."""
-    with variable_scope.variable_scope(
-        None, default_name='binary_logistic_head',
-        values=(tuple(six.itervalues(features)) + (labels, logits))):
-
-      # Predict.
-      pred_keys = prediction_keys.PredictionKeys
-      logits = _check_logits(logits, self.logits_dimension)
-      logistic = math_ops.sigmoid(logits, name=pred_keys.LOGISTIC)
-      two_class_logits = array_ops.concat(
-          (array_ops.zeros_like(logits), logits), 1, name='two_class_logits')
-      scores = nn.softmax(two_class_logits, name=pred_keys.PROBABILITIES)
-      class_ids = array_ops.reshape(
-          math_ops.argmax(two_class_logits, axis=1), (-1, 1), name='classes')
-      if self._label_vocabulary:
-        table = lookup_ops.index_to_string_table_from_tensor(
-            vocabulary_list=self._label_vocabulary, name='class_string_lookup')
-        classes = table.lookup(class_ids)
-      else:
-        classes = string_ops.as_string(class_ids, name='str_classes')
-      predictions = {
-          pred_keys.LOGITS: logits,
-          pred_keys.LOGISTIC: logistic,
-          pred_keys.PROBABILITIES: scores,
-          pred_keys.CLASS_IDS: class_ids,
-          pred_keys.CLASSES: classes,
-      }
+    # Predict.
+    with ops.name_scope('head'):
+      with ops.name_scope(None, 'predictions', (logits,)):
+        pred_keys = prediction_keys.PredictionKeys
+        logits = _check_logits(logits, self.logits_dimension)
+        logistic = math_ops.sigmoid(logits, name=pred_keys.LOGISTIC)
+        two_class_logits = array_ops.concat(
+            (array_ops.zeros_like(logits), logits), 1, name='two_class_logits')
+        scores = nn.softmax(two_class_logits, name=pred_keys.PROBABILITIES)
+        class_ids = array_ops.reshape(
+            math_ops.argmax(two_class_logits, axis=1), (-1, 1), name='classes')
+        if self._label_vocabulary:
+          table = lookup_ops.index_to_string_table_from_tensor(
+              vocabulary_list=self._label_vocabulary,
+              name='class_string_lookup')
+          classes = table.lookup(class_ids)
+        else:
+          classes = string_ops.as_string(class_ids, name='str_classes')
+        predictions = {
+            pred_keys.LOGITS: logits,
+            pred_keys.LOGISTIC: logistic,
+            pred_keys.PROBABILITIES: scores,
+            pred_keys.CLASS_IDS: class_ids,
+            pred_keys.CLASSES: classes,
+        }
       if mode == model_fn.ModeKeys.PREDICT:
         batch_size = array_ops.shape(logistic)[0]
         export_class_list = self._label_vocabulary
@@ -657,17 +653,18 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
       # Train.
       if train_op_fn is None:
         raise ValueError('train_op_fn can not be None.')
-      logging_ops.scalar_summary(metric_keys.MetricKeys.LOSS, training_loss)
-      logging_ops.scalar_summary(
-          metric_keys.MetricKeys.LOSS_MEAN,
-          losses.compute_weighted_loss(
-              unweighted_loss, weights=weights,
-              reduction=losses.Reduction.MEAN))
-      return model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.TRAIN,
-          predictions=predictions,
-          loss=training_loss,
-          train_op=train_op_fn(training_loss))
+    with ops.name_scope(''):
+      summary.scalar(metric_keys.MetricKeys.LOSS, training_loss)
+      summary.scalar(metric_keys.MetricKeys.LOSS_MEAN,
+                     losses.compute_weighted_loss(
+                         unweighted_loss,
+                         weights=weights,
+                         reduction=losses.Reduction.MEAN))
+    return model_fn.EstimatorSpec(
+        mode=model_fn.ModeKeys.TRAIN,
+        predictions=predictions,
+        loss=training_loss,
+        train_op=train_op_fn(training_loss))
 
 
 def _regression_head_with_mean_squared_error_loss(weight_column=None,
@@ -707,12 +704,8 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
   def create_estimator_spec(
       self, features, mode, logits, labels=None, train_op_fn=None):
     """See `Head`."""
-    with variable_scope.variable_scope(
-        None,
-        default_name='regression_head',
-        values=(tuple(six.itervalues(features)) + (labels, logits))):
-
-      # Predict.
+    # Predict.
+    with ops.name_scope('head'):
       logits = _check_logits(logits, self._logits_dimension)
       predictions = {prediction_keys.PredictionKeys.PREDICTIONS: logits}
       if mode == model_fn.ModeKeys.PREDICT:
@@ -744,43 +737,46 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
       # Train.
       if train_op_fn is None:
         raise ValueError('train_op_fn can not be None.')
-      logging_ops.scalar_summary(metric_keys.MetricKeys.LOSS, training_loss)
-      logging_ops.scalar_summary(
-          metric_keys.MetricKeys.LOSS_MEAN,
-          losses.compute_weighted_loss(
-              unweighted_loss, weights=weights,
-              reduction=losses.Reduction.MEAN))
-      return model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.TRAIN,
-          predictions=predictions,
-          loss=training_loss,
-          train_op=train_op_fn(training_loss))
+    with ops.name_scope(''):
+      summary.scalar(metric_keys.MetricKeys.LOSS, training_loss)
+      summary.scalar(metric_keys.MetricKeys.LOSS_MEAN,
+                     losses.compute_weighted_loss(
+                         unweighted_loss,
+                         weights=weights,
+                         reduction=losses.Reduction.MEAN))
+    return model_fn.EstimatorSpec(
+        mode=model_fn.ModeKeys.TRAIN,
+        predictions=predictions,
+        loss=training_loss,
+        train_op=train_op_fn(training_loss))
 
 
 def _assert_range(labels, n_classes):
-  assert_less = check_ops.assert_less(
-      labels,
-      ops.convert_to_tensor(n_classes, dtype=labels.dtype),
-      message='Label IDs must < n_classes')
-  assert_greater = check_ops.assert_non_negative(
-      labels, message='Label IDs must >= 0')
-  with ops.control_dependencies((assert_less, assert_greater)):
-    return array_ops.identity(labels)
+  with ops.name_scope(None, 'assert_range', (labels,)):
+    assert_less = check_ops.assert_less(
+        labels,
+        ops.convert_to_tensor(n_classes, dtype=labels.dtype),
+        message='Label IDs must < n_classes')
+    assert_greater = check_ops.assert_non_negative(
+        labels, message='Label IDs must >= 0')
+    with ops.control_dependencies((assert_less, assert_greater)):
+      return array_ops.identity(labels)
 
 
 def _weights(features, weight_column):
   """Fetches weights from features."""
-  if weight_column is None:
-    return 1.
-  if isinstance(weight_column, six.string_types):
-    weight_column = feature_column_lib.numeric_column(key=weight_column)
-  if not isinstance(weight_column, feature_column_lib._NumericColumn):  # pylint: disable=protected-access
-    raise TypeError('Weight column must be either a string or _NumericColumn. '
-                    'Given type: {}.'.format(type(weight_column)))
-  weights = weight_column._get_dense_tensor(  # pylint: disable=protected-access
-      feature_column_lib._LazyBuilder(features))  # pylint: disable=protected-access
-  if not (weights.dtype.is_floating or weights.dtype.is_integer):
-    raise ValueError('Weight column should be castable to float. '
-                     'Given dtype: {}'.format(weights.dtype))
-  weights = _maybe_expand_dim(math_ops.to_float(weights, name='weights'))
-  return weights
+  with ops.name_scope(None, 'weights', values=features.values()):
+    if weight_column is None:
+      return 1.
+    if isinstance(weight_column, six.string_types):
+      weight_column = feature_column_lib.numeric_column(key=weight_column)
+    if not isinstance(weight_column, feature_column_lib._NumericColumn):  # pylint: disable=protected-access
+      raise TypeError('Weight column must be either a string or _NumericColumn.'
+                      ' Given type: {}.'.format(type(weight_column)))
+    weights = weight_column._get_dense_tensor(  # pylint: disable=protected-access
+        feature_column_lib._LazyBuilder(features))  # pylint: disable=protected-access
+    if not (weights.dtype.is_floating or weights.dtype.is_integer):
+      raise ValueError('Weight column should be castable to float. '
+                       'Given dtype: {}'.format(weights.dtype))
+    weights = _maybe_expand_dim(math_ops.to_float(weights, name='weights'))
+    return weights
diff --git a/tensorflow/python/estimator/canned/head_test.py b/tensorflow/python/estimator/canned/head_test.py
index c6ea54f08ea3ef8c00ec8d056ad0827d0b784d9e..5110582dd17100dd1c92f627c91ace1734c266fa 100644
--- a/tensorflow/python/estimator/canned/head_test.py
+++ b/tensorflow/python/estimator/canned/head_test.py
@@ -23,6 +23,7 @@ import six
 
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator.canned import dnn_testing_utils
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.estimator.canned import metric_keys
 from tensorflow.python.estimator.canned import prediction_keys
@@ -83,26 +84,11 @@ def _sigmoid(logits):
   return 1 / (1 + np.exp(-logits))
 
 
-# TODO(roumposg): Reuse the code from dnn_testing_utils.
-def _assert_close(expected, actual, rtol=1e-04, message='',
-                  name='assert_close'):
-  with ops.name_scope(name, 'assert_close', (expected, actual, rtol)) as scope:
-    expected = ops.convert_to_tensor(expected, name='expected')
-    actual = ops.convert_to_tensor(actual, name='actual')
-    rdiff = math_ops.abs((expected - actual) / expected, 'diff')
-    rtol = ops.convert_to_tensor(rtol, name='rtol')
-    return check_ops.assert_less(
-        rdiff,
-        rtol,
-        data=(message, 'Condition expected =~ actual did not hold element-wise:'
-              'expected = ', expected, 'actual = ', actual, 'rdiff = ', rdiff,
-              'rtol = ', rtol,),
-        summarize=expected.get_shape().num_elements(),
-        name=scope)
-
-
 class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
 
+  def setUp(self):
+    ops.reset_default_graph()
+
   def test_n_classes_is_none(self):
     with self.assertRaisesRegexp(ValueError, 'n_classes must be > 2'):
       head_lib._multi_class_head_with_softmax_cross_entropy_loss(
@@ -690,6 +676,9 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
 # TODO(ptucker): Add thresholds tests.
 class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
 
+  def setUp(self):
+    ops.reset_default_graph()
+
   def test_threshold_too_small(self):
     with self.assertRaisesRegexp(ValueError, r'thresholds not in \(0, 1\)'):
       head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
@@ -1026,7 +1015,7 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     #      = 1.2484322
     expected_loss = 1.2484322
     def _train_op_fn(loss):
-      with ops.control_dependencies((_assert_close(
+      with ops.control_dependencies((dnn_testing_utils.assert_close(
           math_ops.to_float(expected_loss), math_ops.to_float(loss)),)):
         return constant_op.constant(expected_train_result)
     spec = head.create_estimator_spec(
@@ -1258,6 +1247,9 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
 
 class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
 
+  def setUp(self):
+    ops.reset_default_graph()
+
   def test_invalid_label_dimension(self):
     with self.assertRaisesRegexp(ValueError, r'Invalid label_dimension'):
       head_lib._regression_head_with_mean_squared_error_loss(label_dimension=-1)
diff --git a/tensorflow/python/estimator/canned/linear.py b/tensorflow/python/estimator/canned/linear.py
index fd929b260bd0201714b5cdc01311fb6e3f64f199..19ae5e79a5747d3554454cbe542be8cee98e8cf9 100644
--- a/tensorflow/python/estimator/canned/linear.py
+++ b/tensorflow/python/estimator/canned/linear.py
@@ -42,38 +42,38 @@ def _get_default_optimizer(feature_columns):
   return ftrl.FtrlOptimizer(learning_rate=learning_rate)
 
 
-# TODO(b/36813849): Revisit passing params vs named arguments.
-def _linear_model_fn(features, labels, mode, params, config):
+def _linear_model_fn(features, labels, mode, head, feature_columns, optimizer,
+                     partitioner, config):
   """A model_fn for linear models that use a gradient-based optimizer.
 
   Args:
-    features: Dict of `Tensor`.
+    features: dict of `Tensor`.
     labels: `Tensor` of shape `[batch_size, logits_dimension]`.
     mode: Defines whether this is training, evaluation or prediction.
       See `ModeKeys`.
-    params: A dict of hyperparameters.
-      The following hyperparameters are expected:
-      * head: A `Head` instance.
-      * feature_columns: An iterable containing all the feature columns used by
-          the model.
-      * optimizer: string, `Optimizer` object, or callable that defines the
-          optimizer to use for training. If `None`, will use a FTRL optimizer.
+    head: A `Head` instance.
+    feature_columns: An iterable containing all the feature columns used by
+      the model.
+    optimizer: string, `Optimizer` object, or callable that defines the
+      optimizer to use for training. If `None`, will use a FTRL optimizer.
+    partitioner: Partitioner for variables.
     config: `RunConfig` object to configure the runtime settings.
 
   Returns:
     An `EstimatorSpec` instance.
 
   Raises:
-    ValueError: If mode or params are invalid.
+    ValueError: mode or params are invalid, or features has the wrong type.
   """
-  head = params['head']
-  feature_columns = tuple(params['feature_columns'])
+  if not isinstance(features, dict):
+    raise ValueError('features should be a dictionary of `Tensor`s. '
+                     'Given type: {}'.format(type(features)))
   optimizer = optimizers.get_optimizer_instance(
-      params.get('optimizer') or _get_default_optimizer(feature_columns),
+      optimizer or _get_default_optimizer(feature_columns),
       learning_rate=_LEARNING_RATE)
   num_ps_replicas = config.num_ps_replicas if config else 0
 
-  partitioner = params.get('partitioner') or (
+  partitioner = partitioner or (
       partitioned_variables.min_max_variable_partitioner(
           max_partitions=num_ps_replicas,
           min_slice_size=64 << 20))
@@ -151,6 +151,8 @@ class LinearClassifier(estimator.Estimator):
       Both features' `value` must be a `SparseTensor`.
     - if `column` is a `RealValuedColumn`, a feature with `key=column.name`
       whose `value` is a `Tensor`.
+
+  Loss is calculated by using softmax cross entropy.
   """
 
   def __init__(self,
@@ -208,16 +210,20 @@ class LinearClassifier(estimator.Estimator):
       head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
           n_classes, weight_column=weight_column,
           label_vocabulary=label_vocabulary)
+    def _model_fn(features, labels, mode, config):
+      return _linear_model_fn(
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          feature_columns=tuple(feature_columns or []),
+          optimizer=optimizer,
+          partitioner=partitioner,
+          config=config)
     super(LinearClassifier, self).__init__(
-        model_fn=_linear_model_fn,
+        model_fn=_model_fn,
         model_dir=model_dir,
-        config=config,
-        params={
-            'head': head,
-            'feature_columns': feature_columns,
-            'optimizer': optimizer,
-            'partitioner': partitioner,
-        })
+        config=config)
 
 
 class LinearRegressor(estimator.Estimator):
@@ -260,6 +266,8 @@ class LinearRegressor(estimator.Estimator):
          key=weight column name, value=a `SparseTensor`}
     - if isinstance(column, `RealValuedColumn`):
         key=column.name, value=a `Tensor`
+
+  Loss is calculated by using mean squared error.
   """
 
   def __init__(self,
@@ -294,21 +302,19 @@ class LinearRegressor(estimator.Estimator):
       config: `RunConfig` object to configure the runtime settings.
       partitioner: Optional. Partitioner for input layer.
     """
+    head = head_lib._regression_head_with_mean_squared_error_loss(  # pylint: disable=protected-access
+        label_dimension=label_dimension, weight_column=weight_column)
+    def _model_fn(features, labels, mode, config):
+      return _linear_model_fn(
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          feature_columns=tuple(feature_columns or []),
+          optimizer=optimizer,
+          partitioner=partitioner,
+          config=config)
     super(LinearRegressor, self).__init__(
-        model_fn=_linear_model_fn,
+        model_fn=_model_fn,
         model_dir=model_dir,
-        config=config,
-        params={
-            # pylint: disable=protected-access
-            'head':
-                head_lib._regression_head_with_mean_squared_error_loss(
-                    label_dimension=label_dimension,
-                    weight_column=weight_column),
-            # pylint: enable=protected-access
-            'feature_columns':
-                feature_columns,
-            'optimizer':
-                optimizer,
-            'partitioner':
-                partitioner,
-        })
+        config=config)
diff --git a/tensorflow/python/estimator/canned/linear_testing_utils.py b/tensorflow/python/estimator/canned/linear_testing_utils.py
index 965ac8cbdd32d6a80f48d717d62661bc64b55248..c9bde91f9b36cdcfd143fdf585033a3a492caca4 100644
--- a/tensorflow/python/estimator/canned/linear_testing_utils.py
+++ b/tensorflow/python/estimator/canned/linear_testing_utils.py
@@ -124,7 +124,7 @@ def sigmoid(x):
 
 
 class CheckPartitionerVarHook(session_run_hook.SessionRunHook):
-  """A `SessionRunHook` to check a paritioned variable."""
+  """A `SessionRunHook` to check a partitioned variable."""
 
   def __init__(self, test_case, var_name, var_dim, partitions):
     self._test_case = test_case
diff --git a/tensorflow/python/estimator/canned/metric_keys.py b/tensorflow/python/estimator/canned/metric_keys.py
index 91e3bf1d83a060e4c852525193acc5712c7166dd..7dc4bfe5ffb5f762b56f4fc91b8a75ee4ba1796e 100644
--- a/tensorflow/python/estimator/canned/metric_keys.py
+++ b/tensorflow/python/estimator/canned/metric_keys.py
@@ -21,12 +21,10 @@ from __future__ import print_function
 from tensorflow.python.estimator import model_fn
 
 
-# TODO(pucker): Merge with model_fn.MetricKeys once we've worked out out naming
-# conventions.
 class MetricKeys(object):
   """Metric key strings."""
-  LOSS = model_fn.MetricKeys.LOSS
-  LOSS_MEAN = model_fn.MetricKeys.AVERAGE_LOSS
+  LOSS = model_fn.LOSS_METRIC_KEY
+  LOSS_MEAN = model_fn.AVERAGE_LOSS_METRIC_KEY
 
   ACCURACY = 'accuracy'
   # This is the best the model could do by always predicting one class.
diff --git a/tensorflow/python/estimator/canned/parsing_utils.py b/tensorflow/python/estimator/canned/parsing_utils.py
index af584965bb9d88364368800d16a2b4e3c1c10690..f153272947ca427b25b00e6df4741d7ada5790df 100644
--- a/tensorflow/python/estimator/canned/parsing_utils.py
+++ b/tensorflow/python/estimator/canned/parsing_utils.py
@@ -38,9 +38,9 @@ def classifier_parse_example_spec(feature_columns,
   * Users need to combine parsing spec of features with labels and weights
     (if any) since they are all parsed from same tf.Example instance. This
     utility combines these specs.
-  * It is difficult to map expected label by a classifier such as DNNClassifier
-    to corresponding tf.parse_example spec. This utility encodes it by getting
-    related information from users (key, dtype).
+  * It is difficult to map expected label by a classifier such as
+    `DNNClassifier` to corresponding tf.parse_example spec. This utility encodes
+    it by getting related information from users (key, dtype).
 
   Example output of parsing spec:
 
@@ -82,7 +82,7 @@ def classifier_parse_example_spec(feature_columns,
 
 
   # Input builders
-  def input_fn_train():  # Returns a dictionary which also contains labels.
+  def input_fn_train():  # Returns a tuple of features and labels.
     features = tf.contrib.learn.read_keyed_batch_features(
         file_pattern=train_files,
         batch_size=batch_size,
@@ -115,8 +115,13 @@ def classifier_parse_example_spec(feature_columns,
       key 'clicked' it should count as negative example by setting
       `label_deafault=0`. Type of this value should be compatible with
       `label_dtype`.
-    weight_column: (Optional) A `tf.feature_column.numeric_column` represents
-      the weight used given classifier.
+    weight_column: A string or a `_NumericColumn` created by
+      `tf.feature_column.numeric_column` defining feature column representing
+      weights. It is used to down weight or boost examples during training. It
+      will be multiplied by the loss of the example. If it is a string, it is
+      used as a key to fetch weight tensor from the `features`. If it is a
+      `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+      then weight_column.normalizer_fn is applied on it to get weight tensor.
 
   Returns:
     A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature`
@@ -129,7 +134,6 @@ def classifier_parse_example_spec(feature_columns,
       instance.
     ValueError: If `weight_column` is not a `_NumericColumn` instance.
     ValueError: if label_key is None.
-    ValueError: if label_dtype is neither an integer nor string
   """
   parsing_spec = fc.make_parse_example_spec(feature_columns)
   if label_key in parsing_spec:
@@ -158,3 +162,138 @@ def classifier_parse_example_spec(feature_columns,
 
   parsing_spec.update(weight_column._parse_example_spec)  # pylint: disable=protected-access
   return parsing_spec
+
+
+def regressor_parse_example_spec(feature_columns,
+                                 label_key,
+                                 label_dtype=dtypes.float32,
+                                 label_default=None,
+                                 label_dimension=1,
+                                 weight_column=None):
+  """Generates parsing spec for tf.parse_example to be used with regressors.
+
+  If users keep data in tf.Example format, they need to call tf.parse_example
+  with a proper feature spec. There are two main things that this utility helps:
+
+  * Users need to combine parsing spec of features with labels and weights
+    (if any) since they are all parsed from same tf.Example instance. This
+    utility combines these specs.
+  * It is difficult to map expected label by a regressor such as `DNNRegressor`
+    to corresponding tf.parse_example spec. This utility encodes it by getting
+    related information from users (key, dtype).
+
+  Example output of parsing spec:
+
+  ```python
+  # Define features and transformations
+  feature_b = tf.feature_column.numeric_column(...)
+  feature_c_bucketized = tf.feature_column.bucketized_column(
+    tf.feature_column.numeric_column("feature_c"), ...)
+  feature_a_x_feature_c = tf.feature_column.crossed_column(
+      columns=["feature_a", feature_c_bucketized], ...)
+
+  feature_columns = [feature_b, feature_c_bucketized, feature_a_x_feature_c]
+  parsing_spec = tf.estimator.regressor_parse_example_spec(
+      feature_columns, label_key='my-label')
+
+  # For the above example, regressor_parse_example_spec would return the dict:
+  assert parsing_spec == {
+    "feature_a": parsing_ops.VarLenFeature(tf.string),
+    "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
+    "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
+    "my-label" : parsing_ops.FixedLenFeature([1], dtype=tf.float32)
+  }
+  ```
+
+  Example usage with a regressor:
+
+  ```python
+  feature_columns = # define features via tf.feature_column
+  estimator = DNNRegressor(
+      hidden_units=[256, 64, 16],
+      feature_columns=feature_columns,
+      weight_column='example-weight',
+      label_dimension=3)
+  # This label configuration tells the regressor the following:
+  # * weights are retrieved with key 'example-weight'
+  # * label is a 3 dimension tensor with float32 dtype.
+
+
+  # Input builders
+  def input_fn_train():  # Returns a tuple of features and labels.
+    features = tf.contrib.learn.read_keyed_batch_features(
+        file_pattern=train_files,
+        batch_size=batch_size,
+        # creates parsing configuration for tf.parse_example
+        features=tf.estimator.classifier_parse_example_spec(
+            feature_columns,
+            label_key='my-label',
+            label_dimension=3,
+            weight_column='example-weight'),
+        reader=tf.RecordIOReader)
+     labels = features.pop('my-label')
+     return features, labels
+
+  estimator.train(input_fn=input_fn_train)
+  ```
+
+  Args:
+    feature_columns: An iterable containing all feature columns. All items
+      should be instances of classes derived from `_FeatureColumn`.
+    label_key: A string identifying the label. It means tf.Example stores labels
+      with this key.
+    label_dtype: A `tf.dtype` identifies the type of labels. By default it is
+      `tf.float32`.
+    label_default: used as label if label_key does not exist in given
+      tf.Example. By default default_value is none, which means
+      `tf.parse_example` will error out if there is any missing label.
+    label_dimension: Number of regression targets per example. This is the
+      size of the last dimension of the labels and logits `Tensor` objects
+      (typically, these have shape `[batch_size, label_dimension]`).
+    weight_column: A string or a `_NumericColumn` created by
+      `tf.feature_column.numeric_column` defining feature column representing
+      weights. It is used to down weight or boost examples during training. It
+      will be multiplied by the loss of the example. If it is a string, it is
+      used as a key to fetch weight tensor from the `features`. If it is a
+      `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+      then weight_column.normalizer_fn is applied on it to get weight tensor.
+
+  Returns:
+    A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature`
+    value.
+
+  Raises:
+    ValueError: If label is used in `feature_columns`.
+    ValueError: If weight_column is used in `feature_columns`.
+    ValueError: If any of the given `feature_columns` is not a `_FeatureColumn`
+      instance.
+    ValueError: If `weight_column` is not a `_NumericColumn` instance.
+    ValueError: if label_key is None.
+  """
+  parsing_spec = fc.make_parse_example_spec(feature_columns)
+  if label_key in parsing_spec:
+    raise ValueError('label should not be used as feature. '
+                     'label_key: {}, features: {}'.format(
+                         label_key, parsing_spec.keys()))
+  parsing_spec[label_key] = parsing_ops.FixedLenFeature(
+      (label_dimension,), label_dtype, label_default)
+
+  if weight_column is None:
+    return parsing_spec
+
+  if isinstance(weight_column, six.string_types):
+    weight_column = fc.numeric_column(weight_column)
+
+  if not isinstance(weight_column, fc._NumericColumn):  # pylint: disable=protected-access
+    raise ValueError('weight_column should be an instance of '
+                     'tf.feature_column.numeric_column. '
+                     'Given type: {} value: {}'.format(
+                         type(weight_column), weight_column))
+
+  if weight_column.key in parsing_spec:
+    raise ValueError('weight_column should not be used as feature. '
+                     'weight_column: {}, features: {}'.format(
+                         weight_column.key, parsing_spec.keys()))
+
+  parsing_spec.update(weight_column._parse_example_spec)  # pylint: disable=protected-access
+  return parsing_spec
diff --git a/tensorflow/python/estimator/canned/parsing_utils_test.py b/tensorflow/python/estimator/canned/parsing_utils_test.py
index c83823e75083de4710cdd353d139049e36e2cde6..366bb104ca574e0ac2f3b80eee5dffa91b010fd1 100644
--- a/tensorflow/python/estimator/canned/parsing_utils_test.py
+++ b/tensorflow/python/estimator/canned/parsing_utils_test.py
@@ -111,5 +111,101 @@ class ClassifierParseExampleSpec(test.TestCase):
           weight_column=not_a_numeric_column)
 
 
+class RegressorParseExampleSpec(test.TestCase):
+  """Tests tf.estimator.classifier_parse_example_spec."""
+
+  def test_defaults(self):
+    parsing_spec = parsing_utils.regressor_parse_example_spec(
+        feature_columns=[fc.numeric_column('a')], label_key='b')
+    expected_spec = {
+        'a': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
+        'b': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
+    }
+    self.assertDictEqual(expected_spec, parsing_spec)
+
+  def test_int64(self):
+    parsing_spec = parsing_utils.regressor_parse_example_spec(
+        feature_columns=[fc.numeric_column('a')],
+        label_key='b',
+        label_dtype=dtypes.int64)
+    expected_spec = {
+        'a': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
+        'b': parsing_ops.FixedLenFeature((1,), dtype=dtypes.int64),
+    }
+    self.assertDictEqual(expected_spec, parsing_spec)
+
+  def test_label_default_value(self):
+    parsing_spec = parsing_utils.regressor_parse_example_spec(
+        feature_columns=[fc.numeric_column('a')],
+        label_key='b',
+        label_default=0.)
+    expected_spec = {
+        'a':
+            parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
+        'b':
+            parsing_ops.FixedLenFeature(
+                (1,), dtype=dtypes.float32, default_value=0.),
+    }
+    self.assertDictEqual(expected_spec, parsing_spec)
+
+  def test_label_dimension(self):
+    parsing_spec = parsing_utils.regressor_parse_example_spec(
+        feature_columns=[fc.numeric_column('a')],
+        label_key='b',
+        label_dimension=3)
+    expected_spec = {
+        'a': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
+        'b': parsing_ops.FixedLenFeature((3,), dtype=dtypes.float32),
+    }
+    self.assertDictEqual(expected_spec, parsing_spec)
+
+  def test_weight_column_as_string(self):
+    parsing_spec = parsing_utils.regressor_parse_example_spec(
+        feature_columns=[fc.numeric_column('a')],
+        label_key='b',
+        weight_column='c')
+    expected_spec = {
+        'a': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
+        'b': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
+        'c': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
+    }
+    self.assertDictEqual(expected_spec, parsing_spec)
+
+  def test_weight_column_as_numeric_column(self):
+    parsing_spec = parsing_utils.regressor_parse_example_spec(
+        feature_columns=[fc.numeric_column('a')],
+        label_key='b',
+        weight_column=fc.numeric_column('c'))
+    expected_spec = {
+        'a': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
+        'b': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
+        'c': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
+    }
+    self.assertDictEqual(expected_spec, parsing_spec)
+
+  def test_label_key_should_not_be_used_as_feature(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'label should not be used as feature'):
+      parsing_utils.regressor_parse_example_spec(
+          feature_columns=[fc.numeric_column('a')], label_key='a')
+
+  def test_weight_column_should_not_be_used_as_feature(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'weight_column should not be used as feature'):
+      parsing_utils.regressor_parse_example_spec(
+          feature_columns=[fc.numeric_column('a')],
+          label_key='b',
+          weight_column=fc.numeric_column('a'))
+
+  def test_weight_column_should_be_a_numeric_column(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'tf.feature_column.numeric_column'):
+      not_a_numeric_column = 3
+      parsing_utils.regressor_parse_example_spec(
+          feature_columns=[fc.numeric_column('a')],
+          label_key='b',
+          weight_column=not_a_numeric_column)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 293aa752531a29799151ff1aba67ebdf7ab06aa8..a1403367d3e82733d069b0adc4844652c800690f 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -31,6 +31,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session as tf_session
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import run_config
+from tensorflow.python.estimator import util
 from tensorflow.python.estimator.export.export import build_all_signature_defs
 from tensorflow.python.estimator.export.export import get_timestamped_export_dir
 from tensorflow.python.framework import ops
@@ -47,12 +48,11 @@ from tensorflow.python.training import monitored_session
 from tensorflow.python.training import saver
 from tensorflow.python.training import training
 from tensorflow.python.util import compat
-from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 
 
 _VALID_MODEL_FN_ARGS = set(
-    ['features', 'labels', 'mode', 'params', 'config'])
+    ['features', 'labels', 'mode', 'params', 'self', 'config'])
 
 
 class Estimator(object):
@@ -95,10 +95,10 @@ class Estimator(object):
         * Args:
 
           * `features`: This is the first item returned from the `input_fn`
-                 passed to `train`, 'evaluate`, and `predict`. This should be a
+                 passed to `train`, `evaluate`, and `predict`. This should be a
                  single `Tensor` or `dict` of same.
           * `labels`: This is the second item returned from the `input_fn`
-                 passed to `train`, 'evaluate`, and `predict`. This should be a
+                 passed to `train`, `evaluate`, and `predict`. This should be a
                  single `Tensor` or `dict` of same (for multi-head models). If
                  mode is `ModeKeys.PREDICT`, `labels=None` will be passed. If
                  the `model_fn`'s signature does not accept `mode`, the
@@ -172,7 +172,7 @@ class Estimator(object):
       raise ValueError('model_fn must be provided to Estimator.')
     _verify_model_fn_args(model_fn, params)
     self._model_fn = model_fn
-    self._params = params or {}
+    self._params = copy.deepcopy(params or {})
 
   @property
   def model_dir(self):
@@ -334,7 +334,8 @@ class Estimator(object):
     with ops.Graph().as_default() as g:
       random_seed.set_random_seed(self._config.tf_random_seed)
       self._create_and_assert_global_step(g)
-      features = self._get_features_from_input_fn(input_fn)
+      features = self._get_features_from_input_fn(
+          input_fn, model_fn_lib.ModeKeys.PREDICT)
       estimator_spec = self._call_model_fn(features, None,
                                            model_fn_lib.ModeKeys.PREDICT)
       predictions = self._extract_keys(estimator_spec.predictions, predict_keys)
@@ -357,7 +358,7 @@ class Estimator(object):
               }
 
   def _assert_members_are_not_overridden(self):
-    allowed_overrides = set(['_create_global_step'])
+    allowed_overrides = set(['_call_input_fn', '_create_global_step'])
     estimator_members = set([m for m in Estimator.__dict__.keys()
                              if not m.startswith('__')])
     subclass_members = set(self.__class__.__dict__.keys())
@@ -484,8 +485,8 @@ class Estimator(object):
 
       return export_dir
 
-  def _get_features_from_input_fn(self, input_fn):
-    result = input_fn()
+  def _get_features_from_input_fn(self, input_fn, mode):
+    result = self._call_input_fn(input_fn, mode)
     if not ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS):
       logging.warning('Input graph does not contain a QueueRunner. '
                       'That means predict yields forever. '
@@ -494,6 +495,15 @@ class Estimator(object):
       return result[0]
     return result
 
+  def _get_features_and_labels_from_input_fn(self, input_fn, mode):
+    result = self._call_input_fn(input_fn, mode)
+    if isinstance(result, (list, tuple)):
+      if len(result) != 2:
+        raise ValueError(
+            'input_fn should return (feautures, labels) as a len 2 tuple.')
+      return result
+    return result, None
+
   def _extract_batch_length(self, preds_evaluated):
     """Extracts batch length of predictions."""
     batch_length = None
@@ -549,6 +559,31 @@ class Estimator(object):
     assert step.dtype.is_integer
     return step
 
+  def _call_input_fn(self, input_fn, mode):
+    """Calls the input function.
+
+    Args:
+      input_fn: The input function.
+      mode: ModeKeys
+
+    Returns:
+      Either features or (features, labels) where features and labels are:
+        features - `Tensor` or dictionary of string feature name to `Tensor`.
+        labels - `Tensor` or dictionary of `Tensor` with labels.
+
+    Raises:
+      ValueError: if input_fn takes invalid arguments.
+    """
+    del mode  # unused
+    input_fn_args = util.fn_args(input_fn)
+    kwargs = {}
+    if 'params' in input_fn_args:
+      kwargs['params'] = self.params
+    if 'config' in input_fn_args:
+      kwargs['config'] = self.config
+    with ops.device('/cpu:0'):
+      return input_fn(**kwargs)
+
   def _call_model_fn(self, features, labels, mode):
     """Calls model function.
 
@@ -563,16 +598,21 @@ class Estimator(object):
     Raises:
       ValueError: if model_fn returns invalid objects.
     """
-    model_fn_args = _model_fn_args(self._model_fn)
+    model_fn_args = util.fn_args(self._model_fn)
     kwargs = {}
+    if 'labels' in model_fn_args:
+      kwargs['labels'] = labels
+    else:
+      if labels is not None:
+        raise ValueError(
+            'model_fn does not take labels, but input_fn returns labels.')
     if 'mode' in model_fn_args:
       kwargs['mode'] = mode
     if 'params' in model_fn_args:
       kwargs['params'] = self.params
     if 'config' in model_fn_args:
       kwargs['config'] = self.config
-    model_fn_results = self._model_fn(
-        features=features, labels=labels, **kwargs)
+    model_fn_results = self._model_fn(features=features, **kwargs)
 
     if not isinstance(model_fn_results, model_fn_lib.EstimatorSpec):
       raise ValueError('model_fn should return an EstimatorSpec.')
@@ -584,8 +624,8 @@ class Estimator(object):
     with ops.Graph().as_default() as g, g.device(self._device_fn):
       random_seed.set_random_seed(self._config.tf_random_seed)
       global_step_tensor = self._create_and_assert_global_step(g)
-      with ops.device('/cpu:0'):
-        features, labels = input_fn()
+      features, labels = self._get_features_and_labels_from_input_fn(
+          input_fn, model_fn_lib.ModeKeys.TRAIN)
       estimator_spec = self._call_model_fn(features, labels,
                                            model_fn_lib.ModeKeys.TRAIN)
       ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
@@ -603,12 +643,15 @@ class Estimator(object):
 
       if not (estimator_spec.scaffold.saver or
               ops.get_collection(ops.GraphKeys.SAVERS)):
-        ops.add_to_collection(ops.GraphKeys.SAVERS,
-                              training.Saver(
-                                  sharded=True,
-                                  max_to_keep=self._config.keep_checkpoint_max,
-                                  defer_build=True,
-                                  save_relative_paths=True))
+        ops.add_to_collection(
+            ops.GraphKeys.SAVERS,
+            training.Saver(
+                sharded=True,
+                max_to_keep=self._config.keep_checkpoint_max,
+                keep_checkpoint_every_n_hours=(
+                    self._config.keep_checkpoint_every_n_hours),
+                defer_build=True,
+                save_relative_paths=True))
 
       chief_hooks = []
       if (self._config.save_checkpoints_secs or
@@ -636,7 +679,8 @@ class Estimator(object):
               tuple(chief_hooks) + tuple(estimator_spec.training_chief_hooks)),
           save_checkpoint_secs=0,  # Saving is handled by a hook.
           save_summaries_steps=self._config.save_summary_steps,
-          config=self._session_config) as mon_sess:
+          config=self._session_config,
+          log_step_count_steps=self._config.log_step_count_steps) as mon_sess:
         loss = None
         while not mon_sess.should_stop():
           _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
@@ -663,17 +707,18 @@ class Estimator(object):
     with ops.Graph().as_default() as g:
       random_seed.set_random_seed(self._config.tf_random_seed)
       global_step_tensor = self._create_and_assert_global_step(g)
-      features, labels = input_fn()
+      features, labels = self._get_features_and_labels_from_input_fn(
+          input_fn, model_fn_lib.ModeKeys.EVAL)
       estimator_spec = self._call_model_fn(
           features, labels, model_fn_lib.ModeKeys.EVAL)
 
-      if model_fn_lib.MetricKeys.LOSS in estimator_spec.eval_metric_ops:
+      if model_fn_lib.LOSS_METRIC_KEY in estimator_spec.eval_metric_ops:
         raise ValueError(
             'Metric with name "%s" is not allowed, because Estimator ' % (
-                model_fn_lib.MetricKeys.LOSS) +
+                model_fn_lib.LOSS_METRIC_KEY) +
             'already defines a default metric with the same name.')
       estimator_spec.eval_metric_ops[
-          model_fn_lib.MetricKeys.LOSS] = metrics_lib.mean(estimator_spec.loss)
+          model_fn_lib.LOSS_METRIC_KEY] = metrics_lib.mean(estimator_spec.loss)
 
       update_op, eval_dict = _extract_metric_update_ops(
           estimator_spec.eval_metric_ops)
@@ -684,13 +729,16 @@ class Estimator(object):
             'already defines a default metric with the same name.')
       eval_dict[ops.GraphKeys.GLOBAL_STEP] = global_step_tensor
 
+      all_hooks = list(hooks or [])
+      all_hooks.extend(list(estimator_spec.evaluation_hooks or []))
+
       eval_results = evaluation._evaluate_once(  # pylint: disable=protected-access
           checkpoint_path=checkpoint_path,
           master=self._config.evaluation_master,
           scaffold=estimator_spec.scaffold,
           eval_ops=update_op,
           final_ops=eval_dict,
-          hooks=hooks,
+          hooks=all_hooks,
           config=self._session_config)
 
       _write_dict_to_summary(
@@ -746,36 +794,11 @@ def _get_replica_device_setter(config):
     return None
 
 
-def _model_fn_args(fn):
-  """Get argument names for function-like object.
-
-  Args:
-    fn: Function, or function-like object (e.g., result of `functools.partial`).
-
-  Returns:
-    `tuple` of string argument names.
-
-  Raises:
-    ValueError: if partial function has positionally bound arguments
-  """
-  _, fn = tf_decorator.unwrap(fn)
-  if hasattr(fn, 'func') and hasattr(fn, 'keywords') and hasattr(fn, 'args'):
-    # Handle functools.partial and similar objects.
-    return tuple([
-        arg for arg in tf_inspect.getargspec(fn.func).args[len(fn.args):]
-        if arg not in set(fn.keywords.keys())
-    ])
-  # Handle function.
-  return tuple(tf_inspect.getargspec(fn).args)
-
-
 def _verify_model_fn_args(model_fn, params):
   """Verifies model fn arguments."""
-  args = set(_model_fn_args(model_fn))
+  args = set(util.fn_args(model_fn))
   if 'features' not in args:
     raise ValueError('model_fn (%s) must include features argument.' % model_fn)
-  if 'labels' not in args:
-    raise ValueError('model_fn (%s) must include labels argument.' % model_fn)
   if params is not None and 'params' not in args:
     raise ValueError('model_fn (%s) does not include params argument, '
                      'but params (%s) is passed to Estimator.' % (model_fn,
@@ -862,7 +885,8 @@ def _write_dict_to_summary(output_dir,
       value.simple_value = int(dictionary[key])
     else:
       logging.warn(
-          'Skipping summary for %s, must be a float, np.float32, np.int64, np.int32 or int.',
+          'Skipping summary for %s, must be a float, np.float32, np.int64, '
+          'np.int32 or int.',
           key)
   summary_writer.add_summary(summary_proto, current_global_step)
   summary_writer.flush()
diff --git a/tensorflow/python/estimator/estimator_lib.py b/tensorflow/python/estimator/estimator_lib.py
index 52cedad012513bca23a0aecf780935ffa2f53eb5..8e7d966564f1dcdeea26455edcab571b2f0a4bdb 100644
--- a/tensorflow/python/estimator/estimator_lib.py
+++ b/tensorflow/python/estimator/estimator_lib.py
@@ -18,7 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=unused-import,line-too-long
+# pylint: disable=unused-import,line-too-long,wildcard-import
+from tensorflow.python.estimator.canned.dnn import DNNClassifier
+from tensorflow.python.estimator.canned.dnn import DNNRegressor
+from tensorflow.python.estimator.canned.dnn_linear_combined import DNNLinearCombinedClassifier
+from tensorflow.python.estimator.canned.dnn_linear_combined import DNNLinearCombinedRegressor
+from tensorflow.python.estimator.canned.linear import LinearClassifier
+from tensorflow.python.estimator.canned.linear import LinearRegressor
+from tensorflow.python.estimator.canned.parsing_utils import classifier_parse_example_spec
+from tensorflow.python.estimator.canned.parsing_utils import regressor_parse_example_spec
 from tensorflow.python.estimator.estimator import Estimator
 from tensorflow.python.estimator.export import export_lib as export
 from tensorflow.python.estimator.inputs import inputs
@@ -27,9 +35,17 @@ from tensorflow.python.estimator.model_fn import ModeKeys
 from tensorflow.python.estimator.run_config import RunConfig
 
 from tensorflow.python.util.all_util import remove_undocumented
-# pylint: enable=unused-import,line-too-long
+# pylint: enable=unused-import,line-too-long,wildcard-import
 
 _allowed_symbols = [
+    'DNNClassifier',
+    'DNNRegressor',
+    'DNNLinearCombinedClassifier',
+    'DNNLinearCombinedRegressor',
+    'LinearClassifier',
+    'LinearRegressor',
+    'classifier_parse_example_spec',
+    'regressor_parse_example_spec',
     'inputs',
     'export',
     'Estimator',
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index b86afece431dcda71ca3b245563d78fcd8eef01d..caf232b0db97e61d3f5c07a7304a46b5a206be67 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -120,6 +120,9 @@ class EstimatorInheritanceConstraintTest(test.TestCase):
       def __init__(self):
         super(_Estimator, self).__init__(model_fn=dummy_model_fn)
 
+      def _call_input_fn(self, input_fn, mode):
+        return input_fn()
+
       def _create_global_step(self, graph):
         pass
 
@@ -235,13 +238,12 @@ class EstimatorConstructorTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'features'):
       estimator.Estimator(model_fn=model_fn)
 
-  def test_model_fn_args_must_include_labels(self):
+  def test_model_fn_args_labels_is_optional(self):
 
-    def model_fn(features, y):
-      _, _ = features, y
+    def model_fn(features):
+      _ = features
 
-    with self.assertRaisesRegexp(ValueError, 'labels'):
-      estimator.Estimator(model_fn=model_fn)
+    estimator.Estimator(model_fn=model_fn)
 
   def test_if_params_provided_then_model_fn_should_accept_it(self):
 
@@ -252,6 +254,17 @@ class EstimatorConstructorTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'params'):
       estimator.Estimator(model_fn=model_fn, params={'hidden_layers': 4})
 
+  def test_internal_params_is_a_deepcopy(self):
+
+    def model_fn(features, labels, params):
+      _, _, _ = features, labels, params
+
+    params = {'hidden_layers': 4}
+    est = estimator.Estimator(model_fn=model_fn, params=params)
+
+    params['hidden_layers'] = 5
+    self.assertEqual(4, est.params['hidden_layers'])
+
   def test_not_known_model_fn_args(self):
 
     def model_fn(features, labels, something):
@@ -295,16 +308,21 @@ def model_fn_global_step_incrementer(features, labels, mode):
       train_op=state_ops.assign_add(global_step, 1))
 
 
-def _estimator_spec(
-    expected_features, expected_labels, actual_features, actual_labels, mode):
-  assert_ops = tuple([
+def assert_features_op(expected_features, actual_features):
+  return [
       check_ops.assert_equal(
           expected_features[k], actual_features[k], name='assert_%s' % k)
       for k in expected_features
-  ] + [
-      check_ops.assert_equal(
-          expected_labels, actual_labels, name='assert_labels')
-  ])
+  ]
+
+
+def _estimator_spec(
+    expected_features, expected_labels, actual_features, actual_labels, mode):
+  assert_ops = tuple(
+      assert_features_op(expected_features, actual_features) + [
+          check_ops.assert_equal(
+              expected_labels, actual_labels, name='assert_labels')
+      ])
   global_step = training.get_global_step()
   with ops.control_dependencies(assert_ops):
     return model_fn_lib.EstimatorSpec(
@@ -325,29 +343,131 @@ def _make_input_fn(features, labels):
 
 class EstimatorTrainTest(test.TestCase):
 
-  def test_minimal_model_fn_args(self):
+  def test_callable_model_fn(self):
     expected_features = {'x': 42., 'y': 43.}
     expected_labels = 44.
 
-    # TODO(ptucker): We have to roll our own mock since Estimator._get_arguments
-    # doesn't work with mock fns.
     model_fn_call_count = [0]
 
-    def _model_fn(features, labels):
-      model_fn_call_count[0] += 1
-      self.assertItemsEqual(expected_features.keys(), features.keys())
-      return _estimator_spec(
-          expected_features, expected_labels, features, labels,
-          model_fn_lib.ModeKeys.TRAIN)
+    test_self = self
+
+    class ModelFn(object):
+
+      def __call__(self, features, labels):
+        model_fn_call_count[0] += 1
+        test_self.assertItemsEqual(expected_features.keys(), features.keys())
+        return _estimator_spec(
+            expected_features, expected_labels, features, labels,
+            model_fn_lib.ModeKeys.TRAIN)
 
     with self.assertRaisesRegexp(ValueError, 'does not include params'):
-      estimator.Estimator(model_fn=_model_fn, params={'a': 'b'})
-    est = estimator.Estimator(model_fn=_model_fn, config=run_config.RunConfig())
+      estimator.Estimator(model_fn=ModelFn(), params={'a': 'b'})
+    est = estimator.Estimator(model_fn=ModelFn(), config=run_config.RunConfig())
     self.assertEqual(0, model_fn_call_count[0])
     est.train(
         input_fn=_make_input_fn(expected_features, expected_labels), steps=1)
     self.assertEqual(1, model_fn_call_count[0])
 
+  def test_callable_input_fn(self):
+    expected_params = {'batch_size': 10}
+    expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
+    input_fn_call_count = [0]
+
+    def _model_fn(features, labels, mode, params, config):
+      del params, config
+      return model_fn_global_step_incrementer(features, labels, mode)
+
+    test_self = self
+
+    class InputFn(object):
+
+      def __call__(self, params, config):
+        input_fn_call_count[0] += 1
+        test_self.assertEqual(expected_params, params)
+        test_self.assertEqual(4321, config.tf_random_seed)
+        return dummy_input_fn()
+
+    est = estimator.Estimator(model_fn=_model_fn,
+                              params=expected_params,
+                              config=expected_config)
+    self.assertEqual(0, input_fn_call_count[0])
+    est.train(InputFn(), steps=1)
+    self.assertEqual(1, input_fn_call_count[0])
+
+  def test_input_fn_args(self):
+    expected_params = {'batch_size': 10}
+    expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
+    input_fn_call_count = [0]
+
+    def _model_fn(features, labels, mode, params, config):
+      del params, config
+      return model_fn_global_step_incrementer(features, labels, mode)
+
+    def _input_fn(params, config):
+      input_fn_call_count[0] += 1
+      self.assertEqual(expected_params, params)
+      self.assertEqual(4321, config.tf_random_seed)
+      return dummy_input_fn()
+
+    est = estimator.Estimator(model_fn=_model_fn,
+                              params=expected_params,
+                              config=expected_config)
+    self.assertEqual(0, input_fn_call_count[0])
+    est.train(_input_fn, steps=1)
+    self.assertEqual(1, input_fn_call_count[0])
+
+  def test_minimal_model_fn_args(self):
+    expected_features = {'x': 4, 'y': 5}
+
+    def _input_fn():
+      return expected_features
+
+    model_fn_call_count = [0]
+    def _model_fn(features):
+      model_fn_call_count[0] += 1
+      self.assertItemsEqual(expected_features.keys(), features.keys())
+      with ops.control_dependencies(
+          assert_features_op(expected_features, features)):
+        return model_fn_lib.EstimatorSpec(
+            mode=None,
+            predictions=constant_op.constant(0.),
+            loss=constant_op.constant(0.),
+            train_op=state_ops.assign_add(training.get_global_step(), 1))
+
+    est = estimator.Estimator(model_fn=_model_fn)
+    self.assertEqual(0, model_fn_call_count[0])
+    est.train(input_fn=_input_fn, steps=1)
+    self.assertEqual(1, model_fn_call_count[0])
+
+  def test_labels_should_be_none_if_model_fn_does_not_use_labels(self):
+
+    def _input_fn_with_labels():
+      return {'x': 4, 'y': 5}, [4]
+
+    def _model_fn(features):
+      _ = features
+      return model_fn_lib.EstimatorSpec(
+          mode=None,
+          predictions=constant_op.constant(0.),
+          loss=constant_op.constant(0.),
+          train_op=state_ops.assign_add(training.get_global_step(), 1))
+
+    est = estimator.Estimator(model_fn=_model_fn)
+    with self.assertRaisesRegexp(ValueError, 'model_fn does not take labels'):
+      est.train(input_fn=_input_fn_with_labels, steps=1)
+
+  def test_input_fn_len_should_be_2_if_tuple_or_list(self):
+
+    def _input_fn():
+      return 4, 5, 6
+
+    def _model_fn(features):
+      _ = features
+
+    est = estimator.Estimator(model_fn=_model_fn)
+    with self.assertRaisesRegexp(ValueError, 'len 2 tuple'):
+      est.train(input_fn=_input_fn, steps=1)
+
   def test_all_model_fn_args(self):
     expected_features = {'x': 42., 'y': 43.}
     expected_labels = 44.
@@ -665,6 +785,29 @@ class _StepCounterHook(session_run_hook.SessionRunHook):
 
 class EstimatorEvaluateTest(test.TestCase):
 
+  def test_input_fn_args(self):
+    expected_params = {'batch_size': 10}
+    expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
+    input_fn_call_count = [0]
+
+    def _model_fn(features, labels, mode, params, config):
+      del params, config
+      return model_fn_global_step_incrementer(features, labels, mode)
+
+    def _input_fn(params, config):
+      input_fn_call_count[0] += 1
+      self.assertEqual(expected_params, params)
+      self.assertEqual(4321, config.tf_random_seed)
+      return dummy_input_fn()
+
+    est = estimator.Estimator(model_fn=_model_fn,
+                              params=expected_params,
+                              config=expected_config)
+    est.train(dummy_input_fn, steps=1)
+    self.assertEqual(0, input_fn_call_count[0])
+    est.evaluate(_input_fn, steps=1)
+    self.assertEqual(1, input_fn_call_count[0])
+
   def test_model_fn_must_return_estimator_spec(self):
     def _model_fn(features, labels, mode):
       _, _ = features, labels
@@ -770,9 +913,9 @@ class EstimatorEvaluateTest(test.TestCase):
     est = estimator.Estimator(model_fn=_model_fn_with_incremental_loss)
     est.train(dummy_input_fn, steps=1)
     scores = est.evaluate(dummy_input_fn, steps=5)
-    self.assertIn(model_fn_lib.MetricKeys.LOSS, scores)
+    self.assertIn(model_fn_lib.LOSS_METRIC_KEY, scores)
     # Average loss will be (2 + 4 + 6 + 8 + 10)/5=6
-    self.assertAlmostEqual(6., scores[model_fn_lib.MetricKeys.LOSS])
+    self.assertAlmostEqual(6., scores[model_fn_lib.LOSS_METRIC_KEY])
 
   def test_hooks_should_be_session_run_hook(self):
     est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
@@ -863,9 +1006,54 @@ class EstimatorEvaluateTest(test.TestCase):
     est.train(dummy_input_fn, steps=1)
     est.evaluate(dummy_input_fn, steps=1)
 
+  def test_evaluation_hooks_are_used(self):
+    hook = test.mock.MagicMock(
+        wraps=training.SessionRunHook(), spec=training.SessionRunHook)
+
+    def _model_fn_hooks(features, labels, mode):
+      _, _ = features, labels
+      return model_fn_lib.EstimatorSpec(
+          mode=mode,
+          loss=constant_op.constant(0.),
+          train_op=state_ops.assign_add(training.get_global_step(), 1),
+          evaluation_hooks=[hook])
+
+    est = estimator.Estimator(model_fn=_model_fn_hooks)
+    est.train(dummy_input_fn, steps=1)
+    self.assertFalse(hook.begin.called)
+    est.evaluate(dummy_input_fn, steps=1)
+    self.assertTrue(hook.begin.called)
+
 
 class EstimatorPredictTest(test.TestCase):
 
+  def test_input_fn_args(self):
+    expected_params = {'batch_size': 10}
+    expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
+    input_fn_call_count = [0]
+
+    def _model_fn(features, labels, mode, params, config):
+      del features, labels, params, config
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          loss=constant_op.constant(0.),
+          train_op=state_ops.assign_add(training.get_global_step(), 1),
+          predictions=constant_op.constant([[10.]]))
+
+    def _input_fn(params, config):
+      input_fn_call_count[0] += 1
+      self.assertEqual(expected_params, params)
+      self.assertEqual(4321, config.tf_random_seed)
+      return dummy_input_fn()
+
+    est = estimator.Estimator(model_fn=_model_fn,
+                              params=expected_params,
+                              config=expected_config)
+    est.train(dummy_input_fn, steps=1)
+    self.assertEqual(0, input_fn_call_count[0])
+    next(est.predict(_input_fn))
+    self.assertEqual(1, input_fn_call_count[0])
+
   def test_no_trained_model_in_model_dir(self):
     est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
     with self.assertRaisesRegexp(ValueError,
diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py
index b31c5492d8648ce1bc4fc9cec36ef6a2f385d19d..c9f37f06e834e0d8be756097130d4cd5136ba9cf 100644
--- a/tensorflow/python/estimator/inputs/numpy_io.py
+++ b/tensorflow/python/estimator/inputs/numpy_io.py
@@ -55,6 +55,7 @@ def numpy_input_fn(x,
   of numpy arrays. The dict `features` has the same keys as the `x`.
 
   Example:
+
   ```python
   age = np.arange(4) * 1.0
   height = np.arange(32, 36)
diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
index a6f5157680f5733a930b3d3e1fd8c2b63af690be..048032560419c2a425c7dacf675cbed8ded2a215 100644
--- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py
+++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
@@ -258,7 +258,7 @@ class _GeneratorFeedFn(object):
         data_row = next(self._iterator)
       for index, key in enumerate(self._keys):
         if key not in data_row.keys():
-          raise KeyError("key mismatch between dicts emitted by GenFun"
+          raise KeyError("key mismatch between dicts emitted by GenFun "
                          "Expected {} keys; got {}".format(
                              self._keys, data_row.keys()))
         list_dict.setdefault(self._col_placeholders[index],
diff --git a/tensorflow/python/estimator/model_fn.py b/tensorflow/python/estimator/model_fn.py
index 1aa2623962059aa54b5e1e84482681614a56b043..1a023c971f66b8eca04c5b3c7d85c21a7363cf6a 100644
--- a/tensorflow/python/estimator/model_fn.py
+++ b/tensorflow/python/estimator/model_fn.py
@@ -48,17 +48,15 @@ class ModeKeys(object):
   PREDICT = 'infer'
 
 
-class MetricKeys(object):
-  """Metric key strings."""
-  LOSS = 'loss'
-  AVERAGE_LOSS = 'average_loss'
+LOSS_METRIC_KEY = 'loss'
+AVERAGE_LOSS_METRIC_KEY = 'average_loss'
 
 
 class EstimatorSpec(
     collections.namedtuple('EstimatorSpec', [
         'predictions', 'loss', 'train_op', 'eval_metric_ops',
         'export_outputs', 'training_chief_hooks', 'training_hooks',
-        'scaffold'
+        'scaffold', 'evaluation_hooks'
     ])):
   """Ops and objects returned from a `model_fn` and passed to `Estimator`.
 
@@ -74,12 +72,13 @@ class EstimatorSpec(
               export_outputs=None,
               training_chief_hooks=None,
               training_hooks=None,
-              scaffold=None):
+              scaffold=None,
+              evaluation_hooks=None):
     """Creates a validated `EstimatorSpec` instance.
 
     Depending on the value of `mode`, different arguments are required. Namely
     * For `mode == ModeKeys.TRAIN`: required fields are `loss` and `train_op`.
-    * For `mode == ModeKeys.EVAL`: required field is`loss`.
+    * For `mode == ModeKeys.EVAL`: required field is `loss`.
     * For `mode == ModeKeys.PREDICT`: required fields are `predictions`.
 
     model_fn can populate all arguments independent of mode. In this case, some
@@ -145,10 +144,12 @@ class EstimatorSpec(
         signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY.
       training_chief_hooks: Iterable of `tf.train.SessionRunHook` objects to
         run on the chief worker during training.
-      training_hooks: Iterable of `tf.train.SessionRunHook` objects that to run
+      training_hooks: Iterable of `tf.train.SessionRunHook` objects to run
         on all workers during training.
       scaffold: A `tf.train.Scaffold` object that can be used to set
         initialization, saver, and more to be used in training.
+      evaluation_hooks: Iterable of `tf.train.SessionRunHook` objects to
+        run during evaluation.
 
     Returns:
       A validated `EstimatorSpec` object.
@@ -276,7 +277,8 @@ class EstimatorSpec(
     # Validate hooks.
     training_chief_hooks = tuple(training_chief_hooks or [])
     training_hooks = tuple(training_hooks or [])
-    for hook in training_hooks + training_chief_hooks:
+    evaluation_hooks = tuple(evaluation_hooks or [])
+    for hook in training_hooks + training_chief_hooks + evaluation_hooks:
       if not isinstance(hook, session_run_hook.SessionRunHook):
         raise TypeError(
             'All hooks must be SessionRunHook instances, given: {}'.format(
@@ -297,7 +299,8 @@ class EstimatorSpec(
         export_outputs=export_outputs,
         training_chief_hooks=training_chief_hooks,
         training_hooks=training_hooks,
-        scaffold=scaffold)
+        scaffold=scaffold,
+        evaluation_hooks=evaluation_hooks)
 
 
 def _check_is_tensor_or_operation(x, name):
diff --git a/tensorflow/python/estimator/model_fn_test.py b/tensorflow/python/estimator/model_fn_test.py
index 96c38a987b36c53d22a5864ad80017bd5eb95371..c41df413539f68d38283555380748ca550a56d86 100644
--- a/tensorflow/python/estimator/model_fn_test.py
+++ b/tensorflow/python/estimator/model_fn_test.py
@@ -71,7 +71,8 @@ class EstimatorSpecTrainTest(test.TestCase):
           },
           training_chief_hooks=[_FakeHook()],
           training_hooks=[_FakeHook()],
-          scaffold=monitored_session.Scaffold())
+          scaffold=monitored_session.Scaffold(),
+          evaluation_hooks=[_FakeHook()])
 
   def testLossNumber(self):
     """Tests that error is raised when loss is a number (not Tensor)."""
@@ -221,7 +222,17 @@ class EstimatorSpecEvalTest(test.TestCase):
           },
           training_chief_hooks=[_FakeHook()],
           training_hooks=[_FakeHook()],
-          scaffold=monitored_session.Scaffold())
+          scaffold=monitored_session.Scaffold(),
+          evaluation_hooks=[_FakeHook()])
+
+  def testEvaluationHookInvalid(self):
+    with ops.Graph().as_default(), self.test_session():
+      with self.assertRaisesRegexp(
+          TypeError, 'All hooks must be SessionRunHook instances'):
+        model_fn.EstimatorSpec(
+            mode=model_fn.ModeKeys.EVAL,
+            loss=constant_op.constant(1.),
+            evaluation_hooks=[_InvalidHook()])
 
   def testTupleMetric(self):
     """Tests that no errors are raised when a metric is tuple-valued."""
@@ -427,7 +438,8 @@ class EstimatorSpecInferTest(test.TestCase):
           },
           training_chief_hooks=[_FakeHook()],
           training_hooks=[_FakeHook()],
-          scaffold=monitored_session.Scaffold())
+          scaffold=monitored_session.Scaffold(),
+          evaluation_hooks=[_FakeHook()])
 
   def testPredictionsMissing(self):
     with ops.Graph().as_default(), self.test_session():
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index 30ba18d07dbb804297fbe6d668abf91756867086..ac521cc7913b853c4e56c382cf2626c5d80590e1 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -35,6 +35,7 @@ _DEFAULT_REPLACEABLE_LIST = [
     'session_config',
     'keep_checkpoint_max',
     'keep_checkpoint_every_n_hours',
+    'log_step_count_steps'
 ]
 
 _SAVE_CKPT_ERR = (
@@ -89,6 +90,8 @@ def _validate_properties(run_config):
             message='keep_checkpoint_max should be >= 0')
   _validate('keep_checkpoint_every_n_hours', lambda keep_hours: keep_hours > 0,
             message='keep_checkpoint_every_n_hours should be > 0')
+  _validate('log_step_count_steps', lambda num_steps: num_steps > 0,
+            message='log_step_count_steps should be > 0')
 
   _validate('tf_random_seed', lambda seed: isinstance(seed, six.integer_types),
             message='tf_random_seed must be integer.')
@@ -112,6 +115,7 @@ class RunConfig(object):
     self._session_config = None
     self._keep_checkpoint_max = 5
     self._keep_checkpoint_every_n_hours = 10000
+    self._log_step_count_steps = 100
     _validate_properties(self)
 
   @property
@@ -174,6 +178,10 @@ class RunConfig(object):
   def keep_checkpoint_every_n_hours(self):
     return self._keep_checkpoint_every_n_hours
 
+  @property
+  def log_step_count_steps(self):
+    return self._log_step_count_steps
+
   @property
   def model_dir(self):
     return self._model_dir
@@ -190,6 +198,7 @@ class RunConfig(object):
       - `session_config`,
       - `keep_checkpoint_max`,
       - `keep_checkpoint_every_n_hours`,
+      - `log_step_count_steps`,
 
     In addition, either `save_checkpoints_steps` or `save_checkpoints_secs`
     can be set (should not be both).
diff --git a/tensorflow/python/estimator/util.py b/tensorflow/python/estimator/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..de35e66bdfb46dbfdc0be3b4316d62a3a136142a
--- /dev/null
+++ b/tensorflow/python/estimator/util.py
@@ -0,0 +1,57 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Utility to retrieve function args.."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+
+
+def fn_args(fn):
+  """Get argument names for function-like object.
+
+  Args:
+    fn: Function, or function-like object (e.g., result of `functools.partial`).
+
+  Returns:
+    `tuple` of string argument names.
+
+  Raises:
+    ValueError: if partial function has positionally bound arguments
+  """
+  _, fn = tf_decorator.unwrap(fn)
+
+  # Handle callables.
+  if hasattr(fn, '__call__') and tf_inspect.ismethod(fn.__call__):
+    return tuple(tf_inspect.getargspec(fn.__call__).args)
+
+  # Handle functools.partial and similar objects.
+  if hasattr(fn, 'func') and hasattr(fn, 'keywords') and hasattr(fn, 'args'):
+    # Handle nested partial.
+    original_args = fn_args(fn.func)
+    if not original_args:
+      return tuple()
+
+    return tuple([
+        arg for arg in original_args[len(fn.args):]
+        if arg not in set((fn.keywords or {}).keys())
+    ])
+
+  # Handle function.
+  return tuple(tf_inspect.getargspec(fn).args)
diff --git a/tensorflow/python/estimator/util_test.py b/tensorflow/python/estimator/util_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f8122c407bfc707d1d411ca6ed31b6ad72ee6a2
--- /dev/null
+++ b/tensorflow/python/estimator/util_test.py
@@ -0,0 +1,119 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Estimator related util."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+from tensorflow.python.estimator import util
+from tensorflow.python.platform import test
+
+
+class FnArgsTest(test.TestCase):
+
+  def test_simple_function(self):
+    def fn(a, b):
+      return a + b
+    self.assertEqual(('a', 'b'), util.fn_args(fn))
+
+  def test_callable(self):
+
+    class Foo(object):
+
+      def __call__(self, a, b):
+        return a + b
+
+    self.assertEqual(('self', 'a', 'b'), util.fn_args(Foo()))
+
+  def test_partial_function(self):
+    expected_test_arg = 123
+
+    def fn(a, test_arg):
+      if test_arg != expected_test_arg:
+        return ValueError('partial fn does not work correctly')
+      return a
+
+    wrapped_fn = functools.partial(fn, test_arg=123)
+
+    self.assertEqual(('a',), util.fn_args(wrapped_fn))
+
+  def test_partial_function_with_positional_args(self):
+    expected_test_arg = 123
+
+    def fn(test_arg, a):
+      if test_arg != expected_test_arg:
+        return ValueError('partial fn does not work correctly')
+      return a
+
+    wrapped_fn = functools.partial(fn, 123)
+
+    self.assertEqual(('a',), util.fn_args(wrapped_fn))
+
+    self.assertEqual(3, wrapped_fn(3))
+    self.assertEqual(3, wrapped_fn(a=3))
+
+  def test_double_partial(self):
+    expected_test_arg1 = 123
+    expected_test_arg2 = 456
+
+    def fn(a, test_arg1, test_arg2):
+      if test_arg1 != expected_test_arg1 or test_arg2 != expected_test_arg2:
+        return ValueError('partial does not work correctly')
+      return a
+
+    wrapped_fn = functools.partial(fn, test_arg2=456)
+    double_wrapped_fn = functools.partial(wrapped_fn, test_arg1=123)
+
+    self.assertEqual(('a',), util.fn_args(double_wrapped_fn))
+
+  def test_double_partial_with_positional_args_in_outer_layer(self):
+    expected_test_arg1 = 123
+    expected_test_arg2 = 456
+
+    def fn(test_arg1, a, test_arg2):
+      if test_arg1 != expected_test_arg1 or test_arg2 != expected_test_arg2:
+        return ValueError('partial fn does not work correctly')
+      return a
+
+    wrapped_fn = functools.partial(fn, test_arg2=456)
+    double_wrapped_fn = functools.partial(wrapped_fn, 123)
+
+    self.assertEqual(('a',), util.fn_args(double_wrapped_fn))
+
+    self.assertEqual(3, double_wrapped_fn(3))
+    self.assertEqual(3, double_wrapped_fn(a=3))
+
+  def test_double_partial_with_positional_args_in_both_layers(self):
+    expected_test_arg1 = 123
+    expected_test_arg2 = 456
+
+    def fn(test_arg1, test_arg2, a):
+      if test_arg1 != expected_test_arg1 or test_arg2 != expected_test_arg2:
+        return ValueError('partial fn does not work correctly')
+      return a
+
+    wrapped_fn = functools.partial(fn, 123)  # binds to test_arg1
+    double_wrapped_fn = functools.partial(wrapped_fn, 456)  # binds to test_arg2
+
+    self.assertEqual(('a',), util.fn_args(double_wrapped_fn))
+
+    self.assertEqual(3, double_wrapped_fn(3))
+    self.assertEqual(3, double_wrapped_fn(a=3))
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 99aedc5d4b8aea11087b8a71b08dc17de700c58e..37da89c484821fea9c9a0f1c400b44840951b25b 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -400,11 +400,14 @@ def make_parse_example_spec(feature_columns):
   ```
 
   For the above example, make_parse_example_spec would return the dict:
+
+  ```python
   {
-    "feature_a": parsing_ops.VarLenFeature(tf.string),
-    "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
-    "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
+      "feature_a": parsing_ops.VarLenFeature(tf.string),
+      "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
+      "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
   }
+  ```
 
   Args:
     feature_columns: An iterable containing all feature columns. All items
@@ -594,15 +597,21 @@ def bucketized_column(source_column, boundaries):
   `[1., 2.)`, and `[2., +inf)`.
 
   For example, if the inputs are
-    `boundaries` = [0, 10, 100]
-    input tensor = [[-5, 10000]
-                    [150,   10]
-                    [5,    100]]
+
+  ```python
+  boundaries = [0, 10, 100]
+  input tensor = [[-5, 10000]
+                  [150,   10]
+                  [5,    100]]
+  ```
 
   then the output will be
-    output = [[0, 3]
-              [3, 2]
-              [1, 3]]
+
+  ```python
+  output = [[0, 3]
+            [3, 2]
+            [1, 3]]
+  ```
 
   Example:
 
@@ -621,6 +630,7 @@ def bucketized_column(source_column, boundaries):
 
   `bucketized_column` can also be crossed with another categorical column using
   `crossed_column`:
+
   ```python
   price = numeric_column('price')
   # bucketized_column converts numerical feature to a categorical one.
@@ -745,6 +755,7 @@ def categorical_column_with_vocabulary_file(
   abbreviation. All inputs with values in that file are assigned an ID 0-49,
   corresponding to its line number. All other values are hashed and assigned an
   ID 50-54.
+
   ```python
   states = categorical_column_with_vocabulary_file(
       key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
@@ -759,6 +770,7 @@ def categorical_column_with_vocabulary_file(
   other 50 each have a 2-character U.S. state abbreviation. Both a literal 'XX'
   in input, and other values missing from the file, will be assigned ID 0. All
   others are assigned the corresponding line number 1-50.
+
   ```python
   states = categorical_column_with_vocabulary_file(
       key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
@@ -769,6 +781,7 @@ def categorical_column_with_vocabulary_file(
   ```
 
   And to make an embedding with either:
+
   ```python
   columns = [embedding_column(states, 3),...]
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
@@ -799,7 +812,8 @@ def categorical_column_with_vocabulary_file(
   Raises:
     ValueError: `vocabulary_file` is missing.
     ValueError: `vocabulary_size` is missing or < 1.
-    ValueError: `num_oov_buckets` is not a non-negative integer.
+    ValueError: `num_oov_buckets` is a negative integer.
+    ValueError: `num_oov_buckets` and `default_value` are both specified.
     ValueError: `dtype` is neither string nor integer.
   """
   if not vocabulary_file:
@@ -826,27 +840,40 @@ def categorical_column_with_vocabulary_file(
 
 
 def categorical_column_with_vocabulary_list(
-    key, vocabulary_list, dtype=None, default_value=-1):
+    key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
   """A `_CategoricalColumn` with in-memory vocabulary.
 
-  Logic for feature f is:
-  id = vocabulary_list.index_of(f) if f in vocabulary_list else default_value
-
   Use this when your inputs are in string or integer format, and you have an
   in-memory vocabulary mapping each value to an integer ID. By default,
-  out-of-vocabulary values are ignored. Use `default_value` to specify how to
-  include out-of-vocabulary values.
+  out-of-vocabulary values are ignored. Use either (but not both) of
+  `num_oov_buckets` and `default_value` to specify how to include
+  out-of-vocabulary values.
 
   For input dictionary `features`, `features[key]` is either `Tensor` or
   `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
   and `''` for string. Note that these values are independent of the
   `default_value` argument.
 
-  In the following examples, each input in `vocabulary_list` is assigned an ID
-  0-4 corresponding to its index (e.g., input 'B' produces output 2). All other
+  Example with `num_oov_buckets`:
+  In the following example, each input in `vocabulary_list` is assigned an ID
+  0-3 corresponding to its index (e.g., input 'B' produces output 2). All other
+  inputs are hashed and assigned an ID 4-5.
+
+  ```python
+  colors = categorical_column_with_vocabulary_list(
+      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
+      num_oov_buckets=2)
+  columns = [colors, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction, _, _ = linear_model(features, columns)
+  ```
+
+  Example with `default_value`:
+  In the following example, each input in `vocabulary_list` is assigned an ID
+  0-4 corresponding to its index (e.g., input 'B' produces output 3). All other
   inputs are assigned `default_value` 0.
 
-  Linear model:
+
   ```python
   colors = categorical_column_with_vocabulary_list(
       key='colors', vocabulary_list=('X', 'R', 'G', 'B', 'Y'), default_value=0)
@@ -855,7 +882,8 @@ def categorical_column_with_vocabulary_list(
   linear_prediction, _, _ = linear_model(features, columns)
   ```
 
-  Embedding for a DNN model:
+  And to make an embedding with either:
+
   ```python
   columns = [embedding_column(colors, 3),...]
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
@@ -871,13 +899,22 @@ def categorical_column_with_vocabulary_list(
       Must be castable to `dtype`.
     dtype: The type of features. Only string and integer types are supported.
       If `None`, it will be inferred from `vocabulary_list`.
-    default_value: The value to use for values not in `vocabulary_list`.
+    default_value: The integer ID value to return for out-of-vocabulary feature
+      values, defaults to `-1`. This can not be specified with a positive
+      `num_oov_buckets`.
+    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
+      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
+      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a
+      hash of the input value. A positive `num_oov_buckets` can not be specified
+      with `default_value`.
 
   Returns:
     A `_CategoricalColumn` with in-memory vocabulary.
 
   Raises:
     ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
+    ValueError: `num_oov_buckets` is a negative integer.
+    ValueError: `num_oov_buckets` and `default_value` are both specified.
     ValueError: if `dtype` is not integer or string.
   """
   if (vocabulary_list is None) or (len(vocabulary_list) < 1):
@@ -889,6 +926,14 @@ def categorical_column_with_vocabulary_list(
         'Duplicate keys in vocabulary_list {}, column_name: {}'.format(
             vocabulary_list, key))
   vocabulary_dtype = dtypes.as_dtype(np.array(vocabulary_list).dtype)
+  if num_oov_buckets:
+    if default_value != -1:
+      raise ValueError(
+          'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
+              key))
+    if num_oov_buckets < 0:
+      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
+          num_oov_buckets, key))
   _assert_string_or_int(
       vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
   if dtype is None:
@@ -901,7 +946,7 @@ def categorical_column_with_vocabulary_list(
 
   return _VocabularyListCategoricalColumn(
       key=key, vocabulary_list=tuple(vocabulary_list), dtype=dtype,
-      default_value=default_value)
+      default_value=default_value, num_oov_buckets=num_oov_buckets)
 
 
 def categorical_column_with_identity(key, num_buckets, default_value=None):
@@ -926,6 +971,7 @@ def categorical_column_with_identity(key, num_buckets, default_value=None):
   literal 0 in inputs will result in the same default ID.
 
   Linear model:
+
   ```python
   video_id = categorical_column_with_identity(
       key='video_id', num_buckets=1000000, default_value=0)
@@ -935,6 +981,7 @@ def categorical_column_with_identity(key, num_buckets, default_value=None):
   ```
 
   Embedding for a DNN model:
+
   ```python
   columns = [embedding_column(video_id, 9),...]
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
@@ -976,8 +1023,8 @@ def indicator_column(categorical_column):
   `embedding_column` if the inputs are sparse.
 
   ```python
-  name = indicator_column(categorical_column_with_vocabulary_list('name',
-      ['bob', 'george', 'wanda'])
+  name = indicator_column(categorical_column_with_vocabulary_list(
+      'name', ['bob', 'george', 'wanda'])
   columns = [name, ...]
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
   dense_tensor = input_layer(features, columns)
@@ -1009,6 +1056,8 @@ def weighted_categorical_column(
   Example:
 
   Input `tf.Example` objects:
+
+  ```proto
   [
     features {
       feature {
@@ -1031,6 +1080,7 @@ def weighted_categorical_column(
       }
     }
   ]
+  ```
 
   ```python
   categorical_column = categorical_column_with_hash_bucket(
@@ -1076,22 +1126,41 @@ def crossed_column(keys, hash_bucket_size, hash_key=None):
     Hash(cartesian product of features) % `hash_bucket_size`
 
   For example, if the input features are:
-  * SparseTensor referred by first key: shape = [2, 2]
-      [0, 0]: "a"
-      [1, 0]: "b"
-      [1, 1]: "c"
 
-  * SparseTensor referred by second key: shape = [2, 1]
-      [0, 0]: "d"
-      [1, 0]: "e"
+  * SparseTensor referred by first key:
+
+    ```python
+    shape = [2, 2]
+    {
+        [0, 0]: "a"
+        [1, 0]: "b"
+        [1, 1]: "c"
+    }
+    ```
+
+  * SparseTensor referred by second key:
+
+    ```python
+    shape = [2, 1]
+    {
+        [0, 0]: "d"
+        [1, 0]: "e"
+    }
+    ```
 
   then crossed feature will look like:
-      shape = [2, 2]
+
+  ```python
+   shape = [2, 2]
+  {
       [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size
       [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size
       [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size
+  }
+  ```
 
   Here is an example to create a linear model with crosses of string features:
+
   ```python
   keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50K)
   columns = [keywords_x_doc_terms, ...]
@@ -1100,6 +1169,7 @@ def crossed_column(keys, hash_bucket_size, hash_key=None):
   ```
 
   You could also use vocabulary lookup before crossing:
+
   ```python
   keywords = categorical_column_with_vocabulary_file(
       'keywords', '/path/to/vocabulary/file', vocabulary_size=1K)
@@ -1111,6 +1181,7 @@ def crossed_column(keys, hash_bucket_size, hash_key=None):
 
   If an input feature is of numeric type, you can use
   `categorical_column_with_identity`, or `bucketized_column`, as in the example:
+
   ```python
   # vertical_id is an integer categorical feature.
   vertical_id = categorical_column_with_identity('vertical_id', 10K)
@@ -1125,6 +1196,7 @@ def crossed_column(keys, hash_bucket_size, hash_key=None):
 
   To use crossed column in DNN model, you need to add it in an embedding column
   as in this example:
+
   ```python
   vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50K)
   vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10)
@@ -1990,7 +2062,7 @@ class _VocabularyFileCategoricalColumn(
 class _VocabularyListCategoricalColumn(
     _CategoricalColumn,
     collections.namedtuple('_VocabularyListCategoricalColumn', (
-        'key', 'vocabulary_list', 'dtype', 'default_value'
+        'key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets'
     ))):
   """See `categorical_column_with_vocabulary_list`."""
 
@@ -2024,13 +2096,14 @@ class _VocabularyListCategoricalColumn(
     return lookup_ops.index_table_from_tensor(
         vocabulary_list=tuple(self.vocabulary_list),
         default_value=self.default_value,
+        num_oov_buckets=self.num_oov_buckets,
         dtype=key_dtype,
         name='{}_lookup'.format(self.key)).lookup(input_tensor)
 
   @property
   def _num_buckets(self):
     """Returns number of buckets in this sparse feature."""
-    return len(self.vocabulary_list)
+    return len(self.vocabulary_list) + self.num_oov_buckets
 
   def _get_sparse_tensors(
       self, inputs, weight_collections=None, trainable=None):
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index cfa2a0f7d479ece9fe8b7298514b16a487d96502..b14ec73ba261c185e3da11cdd30ab0a4642ea1a0 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -2535,6 +2535,21 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
       fc.categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=(12, 24, 12))
 
+  def test_invalid_num_oov_buckets(self):
+    with self.assertRaisesRegexp(ValueError, 'Invalid num_oov_buckets'):
+      fc.categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=(12, 24, 36),
+          num_oov_buckets=-1)
+
+  def test_invalid_buckets_and_default_value(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'both num_oov_buckets and default_value'):
+      fc.categorical_column_with_vocabulary_list(
+          key='aaa',
+          vocabulary_list=(12, 24, 36),
+          num_oov_buckets=100,
+          default_value=2)
+
   def test_invalid_input_dtype_int32(self):
     column = fc.categorical_column_with_vocabulary_list(
         key='aaa',
@@ -2693,6 +2708,26 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  def test_get_sparse_tensors_with_oov_buckets(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'),
+        num_oov_buckets=100)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1), (1, 2)),
+        values=('marlo', 'skywalker', 'omar', 'heisenberg'),
+        dense_shape=(2, 3))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, 33, 0, 62), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
   def test_get_sparse_tensors_int32(self):
     column = fc.categorical_column_with_vocabulary_list(
         key='aaa',
@@ -2736,11 +2771,33 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=(3, 3)),
           id_weight_pair.id_tensor.eval())
 
+  def test_get_sparse_tensors_int32_with_oov_buckets(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=np.array((30, 35, 11, 23, 22), dtype=np.int32),
+        dtype=dtypes.int32,
+        num_oov_buckets=100)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1), (2, 2)),
+        values=(11, 100, 30, 22),
+        dense_shape=(3, 3))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, 60, 0, 4), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
   def test_linear_model(self):
     wire_column = fc.categorical_column_with_vocabulary_list(
         key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
-    self.assertEqual(3, wire_column._num_buckets)
+        vocabulary_list=('omar', 'stringer', 'marlo'),
+        num_oov_buckets=1)
+    self.assertEqual(4, wire_column._num_buckets)
     with ops.Graph().as_default():
       predictions = fc.linear_model({
           wire_column.name: sparse_tensor.SparseTensorValue(
@@ -2752,12 +2809,12 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
         self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
         self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        wire_var.assign(((1.,), (2.,), (3.,))).eval()
+        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 2: wire_var[2] = 3
-        # 'skywalker' -> None, 'omar' -> 0: wire_var[0] = 1
-        self.assertAllClose(((3.,), (1.,)), predictions.eval())
+        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+        self.assertAllClose(((3.,), (5.,)), predictions.eval())
 
 
 class IdentityCategoricalColumnTest(test.TestCase):
diff --git a/tensorflow/python/framework/c_api_util.py b/tensorflow/python/framework/c_api_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..379ba19def68410d7f7c92cbf8e65609e404d8f4
--- /dev/null
+++ b/tensorflow/python/framework/c_api_util.py
@@ -0,0 +1,48 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Utilities for using the TensorFlow C API."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import pywrap_tensorflow as c_api
+
+
+class ScopedTFStatus(object):
+  """Wrapper around TF_Status that handles deletion."""
+
+  def __init__(self):
+    self.status = c_api.TF_NewStatus()
+
+  def __del__(self):
+    # Note: when we're destructing the global context (i.e when the process is
+    # terminating) we can have already deleted other modules.
+    if c_api.TF_DeleteStatus is not None:
+      c_api.TF_DeleteStatus(self.status)
+
+
+class ScopedTFGraph(object):
+  """Wrapper around TF_Graph that handles deletion."""
+
+  def __init__(self):
+    self.graph = c_api.TF_NewGraph()
+
+  def __del__(self):
+    # Note: when we're destructing the global context (i.e when the process is
+    # terminating) we can have already deleted other modules.
+    if c_api.TF_DeleteGraph is not None:
+      c_api.TF_DeleteGraph(self.graph)
diff --git a/tensorflow/python/framework/common_shapes.py b/tensorflow/python/framework/common_shapes.py
index 79bf0879d70a6cf41d6dadcc809618f720e26853..52a0b9baa6b67ae3456dff264a1789ae0fb2638d 100644
--- a/tensorflow/python/framework/common_shapes.py
+++ b/tensorflow/python/framework/common_shapes.py
@@ -617,7 +617,7 @@ def call_cpp_shape_fn(op, require_shape_fn=True):
 
 def _call_cpp_shape_fn_impl(
     op, input_tensors_needed, input_tensors_as_shapes_needed, require_shape_fn):
-  """Core implementaton of call_cpp_shape_fn."""
+  """Core implementation of call_cpp_shape_fn."""
   graph_def_version = op.graph.graph_def_versions.producer
   node_def_str = op.node_def.SerializeToString()
 
diff --git a/tensorflow/python/framework/cpp_shape_inference.cc b/tensorflow/python/framework/cpp_shape_inference.cc
index d5e58c174bb7607cb59a246a226f8852a09588f0..34f68b4fae2a61fed75bf0d4deee8a01735a8d55 100644
--- a/tensorflow/python/framework/cpp_shape_inference.cc
+++ b/tensorflow/python/framework/cpp_shape_inference.cc
@@ -15,8 +15,10 @@ limitations under the License.
 
 #include "tensorflow/python/framework/cpp_shape_inference.h"
 
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/python/framework/cpp_shape_inference.pb.h"
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index 3e6c04982b4b1c1ca219cfd1bc1a1954e2b520a1..43535a593e0bd6742a355c826bce78f04c46e131 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -48,6 +48,7 @@ class DType(object):
   * `tf.quint16`: Quantized 16-bit unsigned integer.
   * `tf.qint32`: Quantized 32-bit signed integer.
   * `tf.resource`: Handle to a mutable resource.
+  * `tf.variant`: Values of arbitrary types.
 
   In addition, variants of these types with the `_ref` suffix are
   defined for reference-typed tensors.
@@ -113,8 +114,11 @@ class DType(object):
 
   @property
   def is_numpy_compatible(self):
-    return (self._type_enum != types_pb2.DT_RESOURCE and
-            self._type_enum != types_pb2.DT_RESOURCE_REF)
+    numpy_incompatible = [types_pb2.DT_VARIANT,
+                          types_pb2.DT_VARIANT_REF,
+                          types_pb2.DT_RESOURCE,
+                          types_pb2.DT_RESOURCE_REF]
+    return self._type_enum not in numpy_incompatible
 
   @property
   def as_numpy_dtype(self):
@@ -135,13 +139,13 @@ class DType(object):
   def is_integer(self):
     """Returns whether this is a (non-quantized) integer type."""
     return (self.is_numpy_compatible and not self.is_quantized and
-            issubclass(self.as_numpy_dtype, np.integer))
+            np.issubdtype(self.as_numpy_dtype, np.integer))
 
   @property
   def is_floating(self):
     """Returns whether this is a (non-quantized, real) floating point type."""
-    return self.is_numpy_compatible and issubclass(self.as_numpy_dtype,
-                                                   np.floating)
+    return self.is_numpy_compatible and np.issubdtype(self.as_numpy_dtype,
+                                                      np.floating)
 
   @property
   def is_complex(self):
@@ -284,7 +288,8 @@ class DType(object):
 
   @property
   def size(self):
-    if self._type_enum == types_pb2.DT_RESOURCE:
+    if (self._type_enum == types_pb2.DT_VARIANT or
+        self._type_enum == types_pb2.DT_RESOURCE):
       return 1
     return np.dtype(self.as_numpy_dtype).itemsize
 
@@ -304,6 +309,7 @@ dtype_range = {np.bool_: (False, True),
 
 # Define standard wrappers for the types_pb2.DataType enum.
 resource = DType(types_pb2.DT_RESOURCE)
+variant = DType(types_pb2.DT_VARIANT)
 float16 = DType(types_pb2.DT_HALF)
 half = float16
 float32 = DType(types_pb2.DT_FLOAT)
@@ -325,6 +331,7 @@ qint16 = DType(types_pb2.DT_QINT16)
 quint16 = DType(types_pb2.DT_QUINT16)
 qint32 = DType(types_pb2.DT_QINT32)
 resource_ref = DType(types_pb2.DT_RESOURCE_REF)
+variant_ref = DType(types_pb2.DT_VARIANT_REF)
 bfloat16 = DType(types_pb2.DT_BFLOAT16)
 float16_ref = DType(types_pb2.DT_HALF_REF)
 half_ref = float16_ref
@@ -372,6 +379,7 @@ _INTERN_TABLE = {
     types_pb2.DT_QINT32: qint32,
     types_pb2.DT_BFLOAT16: bfloat16,
     types_pb2.DT_RESOURCE: resource,
+    types_pb2.DT_VARIANT: variant,
     types_pb2.DT_HALF_REF: float16_ref,
     types_pb2.DT_FLOAT_REF: float32_ref,
     types_pb2.DT_DOUBLE_REF: float64_ref,
@@ -392,6 +400,7 @@ _INTERN_TABLE = {
     types_pb2.DT_QINT32_REF: qint32_ref,
     types_pb2.DT_BFLOAT16_REF: bfloat16_ref,
     types_pb2.DT_RESOURCE_REF: resource_ref,
+    types_pb2.DT_VARIANT_REF: variant_ref,
 }
 
 
@@ -417,6 +426,7 @@ _TYPE_TO_STRING = {
     types_pb2.DT_QINT32: "qint32",
     types_pb2.DT_BFLOAT16: "bfloat16",
     types_pb2.DT_RESOURCE: "resource",
+    types_pb2.DT_VARIANT: "variant",
     types_pb2.DT_HALF_REF: "float16_ref",
     types_pb2.DT_FLOAT_REF: "float32_ref",
     types_pb2.DT_DOUBLE_REF: "float64_ref",
@@ -437,6 +447,7 @@ _TYPE_TO_STRING = {
     types_pb2.DT_QINT32_REF: "qint32_ref",
     types_pb2.DT_BFLOAT16_REF: "bfloat16_ref",
     types_pb2.DT_RESOURCE_REF: "resource_ref",
+    types_pb2.DT_VARIANT_REF: "variant_ref",
 }
 _STRING_TO_TF = {value: _INTERN_TABLE[key]
                  for key, value in _TYPE_TO_STRING.items()}
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index 5bb60763b6e30d23c622b1a281f62e3577c77692..1e84f1b656d8d536148c84eab4a9f5cb89c60409 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -27,9 +27,12 @@ from tensorflow.python.platform import googletest
 
 
 def _is_numeric_dtype_enum(datatype_enum):
-  return (datatype_enum != types_pb2.DT_INVALID and
-          datatype_enum != types_pb2.DT_RESOURCE and
-          datatype_enum != types_pb2.DT_RESOURCE_REF)
+  non_numeric_dtypes = [types_pb2.DT_VARIANT,
+                        types_pb2.DT_VARIANT_REF,
+                        types_pb2.DT_INVALID,
+                        types_pb2.DT_RESOURCE,
+                        types_pb2.DT_RESOURCE_REF]
+  return datatype_enum not in non_numeric_dtypes
 
 
 class TypesTest(test_util.TensorFlowTestCase):
@@ -154,6 +157,11 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertEqual(dtypes.as_dtype("string").is_integer, False)
     self.assertEqual(dtypes.as_dtype("bool").is_integer, False)
     self.assertEqual(dtypes.as_dtype("bfloat16").is_integer, False)
+    self.assertEqual(dtypes.as_dtype("qint8").is_integer, False)
+    self.assertEqual(dtypes.as_dtype("qint16").is_integer, False)
+    self.assertEqual(dtypes.as_dtype("qint32").is_integer, False)
+    self.assertEqual(dtypes.as_dtype("quint8").is_integer, False)
+    self.assertEqual(dtypes.as_dtype("quint16").is_integer, False)
 
   def testIsFloating(self):
     self.assertEqual(dtypes.as_dtype("int8").is_floating, False)
@@ -169,6 +177,11 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertEqual(dtypes.as_dtype("string").is_floating, False)
     self.assertEqual(dtypes.as_dtype("bool").is_floating, False)
     self.assertEqual(dtypes.as_dtype("bfloat16").is_integer, False)
+    self.assertEqual(dtypes.as_dtype("qint8").is_floating, False)
+    self.assertEqual(dtypes.as_dtype("qint16").is_floating, False)
+    self.assertEqual(dtypes.as_dtype("qint32").is_floating, False)
+    self.assertEqual(dtypes.as_dtype("quint8").is_floating, False)
+    self.assertEqual(dtypes.as_dtype("quint16").is_floating, False)
 
   def testIsComplex(self):
     self.assertEqual(dtypes.as_dtype("int8").is_complex, False)
@@ -183,7 +196,12 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertEqual(dtypes.as_dtype("float64").is_complex, False)
     self.assertEqual(dtypes.as_dtype("string").is_complex, False)
     self.assertEqual(dtypes.as_dtype("bool").is_complex, False)
-    self.assertEqual(dtypes.as_dtype("bfloat16").is_integer, False)
+    self.assertEqual(dtypes.as_dtype("bfloat16").is_complex, False)
+    self.assertEqual(dtypes.as_dtype("qint8").is_complex, False)
+    self.assertEqual(dtypes.as_dtype("qint16").is_complex, False)
+    self.assertEqual(dtypes.as_dtype("qint32").is_complex, False)
+    self.assertEqual(dtypes.as_dtype("quint8").is_complex, False)
+    self.assertEqual(dtypes.as_dtype("quint16").is_complex, False)
 
   def testIsUnsigned(self):
     self.assertEqual(dtypes.as_dtype("int8").is_unsigned, False)
@@ -198,7 +216,12 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertEqual(dtypes.as_dtype("string").is_unsigned, False)
     self.assertEqual(dtypes.as_dtype("complex64").is_unsigned, False)
     self.assertEqual(dtypes.as_dtype("complex128").is_unsigned, False)
-    self.assertEqual(dtypes.as_dtype("bfloat16").is_integer, False)
+    self.assertEqual(dtypes.as_dtype("bfloat16").is_unsigned, False)
+    self.assertEqual(dtypes.as_dtype("qint8").is_unsigned, False)
+    self.assertEqual(dtypes.as_dtype("qint16").is_unsigned, False)
+    self.assertEqual(dtypes.as_dtype("qint32").is_unsigned, False)
+    self.assertEqual(dtypes.as_dtype("quint8").is_unsigned, False)
+    self.assertEqual(dtypes.as_dtype("quint16").is_unsigned, False)
 
   def testMinMax(self):
     # make sure min/max evaluates for all data types that have min/max
diff --git a/tensorflow/python/framework/errors_impl.py b/tensorflow/python/framework/errors_impl.py
index 32c96ec9471a973dab734fab8dfdb95fa7fa1b7c..9b1f0a0cfb2e5267a281600aba925a5e66ba0111 100644
--- a/tensorflow/python/framework/errors_impl.py
+++ b/tensorflow/python/framework/errors_impl.py
@@ -23,7 +23,8 @@ import traceback
 import warnings
 
 from tensorflow.core.lib.core import error_codes_pb2
-from tensorflow.python import pywrap_tensorflow
+from tensorflow.python import pywrap_tensorflow as c_api
+from tensorflow.python.framework import c_api_util
 from tensorflow.python.util import compat
 
 
@@ -456,13 +457,10 @@ def _make_specific_exception(node_def, op, message, error_code):
 
 @contextlib.contextmanager
 def raise_exception_on_not_ok_status():
-  status = pywrap_tensorflow.TF_NewStatus()
-  try:
-    yield status
-    if pywrap_tensorflow.TF_GetCode(status) != 0:
-      raise _make_specific_exception(
-          None, None,
-          compat.as_text(pywrap_tensorflow.TF_Message(status)),
-          pywrap_tensorflow.TF_GetCode(status))
-  finally:
-    pywrap_tensorflow.TF_DeleteStatus(status)
+  status = c_api_util.ScopedTFStatus()
+  yield status.status
+  if c_api.TF_GetCode(status) != 0:
+    raise _make_specific_exception(
+        None, None,
+        compat.as_text(c_api.TF_Message(status)),
+        c_api.TF_GetCode(status))
diff --git a/tensorflow/python/framework/framework_lib.py b/tensorflow/python/framework/framework_lib.py
index 80f936e8ab4e4605b7a1fa06029a1d75f4a6d64e..56ac56ba42cce80467d473562d12c6228d1e8b38 100644
--- a/tensorflow/python/framework/framework_lib.py
+++ b/tensorflow/python/framework/framework_lib.py
@@ -32,6 +32,7 @@
 @@device
 @@container
 @@name_scope
+@@colocate_with
 @@control_dependencies
 @@convert_to_tensor
 @@convert_to_tensor_or_indexed_slices
@@ -85,6 +86,7 @@ from tensorflow.python.framework.ops import device
 from tensorflow.python.framework.ops import container
 from tensorflow.python.framework.ops import name_scope
 from tensorflow.python.framework.ops import op_scope
+from tensorflow.python.framework.ops import colocate_with
 from tensorflow.python.framework.ops import control_dependencies
 from tensorflow.python.framework.ops import get_default_graph
 from tensorflow.python.framework.ops import reset_default_graph
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index dbd406ebd5ee06502c9d3018e9b3b90929a01a7f..ff47c0dbf86e5e2e08fb28e02436a2bfc1053820 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -581,10 +581,10 @@ class _OverloadedFunction(object):
 
 
 class _FuncGraph(ops.Graph):
-  """A helper for construction a function.
+  """A helper for constructing a function.
 
   _FuncGraph overrides ops.Graph's create_op() so that we can keep
-  track of every inputs into every op created inside the function.  If
+  track of all inputs into every op created inside the function.  If
   any input is from other graphs, we keep track of it in self.capture
   and substitue the input with a place holder.
 
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index c4e841b81f5302d1e947a4c29f4fda0e577754c3..abfd0b76606bcbff6f6b4d323915707933880a70 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -325,6 +325,25 @@ class FunctionTest(test.TestCase):
                                    "assertion"):
         _ = MyFn(100.0).eval()
 
+  def testWhileLoopCallsFunc(self):
+    with self.test_session(use_gpu=True) as sess:
+
+      @function.Defun(dtypes.float32)
+      def Times2(x):
+        constant_two = constant_op.constant(2, dtypes.int32)
+        two_on_gpu = math_ops.cast(constant_two, dtypes.float32)
+        return x * two_on_gpu
+
+      def Body(x):
+        x2 = Times2(x)
+        x2.set_shape([])
+        return x2
+
+      loop = control_flow_ops.while_loop(lambda x: x < 1e5, Body, [1.0])
+
+      ans = sess.run(loop)
+      self.assertAllClose(ans, 131072.)
+
   def testControlFlowStrictness(self):
     """Inlined functions must not execute in a untaken control flow branch."""
 
@@ -588,8 +607,8 @@ class FunctionTest(test.TestCase):
       self.assertAllClose(vals[2], vals[3])
 
   def testDeclare(self):
-    foo = function.Declare("Foo", [("x", dtypes.float32)],
-                           [("y", dtypes.float32)])
+    foo = function.Declare("Foo", [("x", dtypes.float32)], [("y",
+                                                             dtypes.float32)])
 
     @function.Defun(dtypes.float32, func_name="Foo", out_names=["y"])
     def FooImpl(x):
@@ -607,8 +626,8 @@ class FunctionTest(test.TestCase):
       self.assertAllClose(expected, y.eval(feed_dict={x: rand}))
 
   def testDeclareUsedInDefun(self):
-    foo = function.Declare("Foo", [("x", dtypes.float32)],
-                           [("y", dtypes.float32)])
+    foo = function.Declare("Foo", [("x", dtypes.float32)], [("y",
+                                                             dtypes.float32)])
 
     @function.Defun()
     def Bar(x):
@@ -630,8 +649,8 @@ class FunctionTest(test.TestCase):
       self.assertAllClose(expected, y.eval(feed_dict={x: rand}))
 
   def testDeclareTypeMistake(self):
-    foo = function.Declare("Foo", [("x", dtypes.float32)],
-                           [("y", dtypes.float32)])
+    foo = function.Declare("Foo", [("x", dtypes.float32)], [("y",
+                                                             dtypes.float32)])
 
     @function.Defun(dtypes.float32, func_name="Foo", out_names=["y"])
     def Foo(x):
@@ -749,8 +768,9 @@ class FunctionTest(test.TestCase):
       self.assertAllEqual(v1, 20.)
 
   def testShapeFunction(self):
-    @function.Defun(dtypes.float32,
-                    shape_func=lambda op: [op.inputs[0].get_shape()])
+
+    @function.Defun(
+        dtypes.float32, shape_func=lambda op: [op.inputs[0].get_shape()])
     def Foo(x):
       return x + 1.0
 
@@ -767,11 +787,12 @@ class FunctionTest(test.TestCase):
       self.assertAllEqual(y.get_shape().as_list(), [1, 1, 2, 3])
 
   def testVariableReuse(self):
+
     def LinearWithReuse(input_tensor, reuse=None):
       size = input_tensor.shape.dims[1]
       with variable_scope.variable_scope("linear", reuse=reuse):
-        w = variable_scope.get_variable("w", shape=[size, size],
-                                        dtype=input_tensor.dtype)
+        w = variable_scope.get_variable(
+            "w", shape=[size, size], dtype=input_tensor.dtype)
       return math_ops.matmul(input_tensor, w)
 
     @function.Defun(dtypes.float32)
@@ -789,15 +810,19 @@ class FunctionTest(test.TestCase):
 
     with session.Session() as sess:
       sess.run(variables.global_variables_initializer())
-      output_val = sess.run(output_op,
-                            feed_dict={input_op: np.random.rand(32, 100)})
+      output_val = sess.run(
+          output_op, feed_dict={input_op: np.random.rand(32, 100)})
       self.assertEqual(output_val.shape, (32, 100))
 
   def testFunctionCallInDifferentVariableScopes(self):
+
     @function.Defun(dtypes.float32)
     def Foo(inputs):
-      var = variable_scope.get_variable("var", shape=[10], dtype=dtypes.float32,
-                                        initializer=init_ops.ones_initializer())
+      var = variable_scope.get_variable(
+          "var",
+          shape=[10],
+          dtype=dtypes.float32,
+          initializer=init_ops.ones_initializer())
       return inputs + var
 
     input_op = array_ops.placeholder(shape=[10], dtype=dtypes.float32)
@@ -813,8 +838,8 @@ class FunctionTest(test.TestCase):
 
     with session.Session() as sess:
       sess.run(variables.global_variables_initializer())
-      out1, out2 = sess.run([out1_op, out2_op],
-                            feed_dict={input_op: np.linspace(1, 10, 10)})
+      out1, out2 = sess.run(
+          [out1_op, out2_op], feed_dict={input_op: np.linspace(1, 10, 10)})
       self.assertAllEqual(out1, np.linspace(2, 11, 10))
       self.assertAllEqual(out2, np.linspace(2, 11, 10))
 
@@ -852,12 +877,15 @@ class FunctionsFromProtos(test.TestCase):
     self.assertEqual(func.captured_inputs, new_func.captured_inputs)
 
   def testBasic(self):
+
     @function.Defun(dtypes.float32, dtypes.float32)
     def Foo(x, y):
       return x + y
+
     self.expectFunctionsEqual(Foo)
 
   def testGradFunc(self):
+
     @function.Defun(dtypes.float32, dtypes.float32)
     def G(x, dy):
       return x * dy
@@ -865,10 +893,12 @@ class FunctionsFromProtos(test.TestCase):
     @function.Defun(dtypes.float32, grad_func=G)
     def F(x):
       return math_ops.exp(x) - math_ops.exp(-x)
+
     self.expectFunctionsEqual(F, grad_func=G)
 
   def testCapturedInputs(self):
     c = constant_op.constant(10, dtypes.int64)
+
     @function.Defun(dtypes.int64)
     def Foo(x):
       return x + c
@@ -885,6 +915,7 @@ class FunctionsFromProtos(test.TestCase):
     self.assertEqual(len(new_func.captured_inputs), 0)
 
   def testNestedFunctions(self):
+
     @function.Defun(dtypes.float32)
     def Outer(x):
 
@@ -958,6 +989,7 @@ class FunctionsFromProtos(test.TestCase):
     self.assertEqual(len(function._from_library(library)), 0)
 
   def testFromLibraryMissingFuncDef(self):
+
     @function.Defun(dtypes.float32, dtypes.float32)
     def G1(x, dy):
       return x * dy
@@ -989,6 +1021,7 @@ class FunctionsFromProtos(test.TestCase):
       function._from_library(library)
 
   def testFromLibraryCyclicGradFuncs(self):
+
     @function.Defun(dtypes.float32)
     def F1(x):
       return math_ops.exp(x) - math_ops.exp(-x)
@@ -1242,10 +1275,11 @@ class FunctionInlineControlTest(test.TestCase):
       inp = np.random.uniform(-1, 1, [16, 1]).astype(np.float32)
       run_metadata = config_pb2.RunMetadata()
       with session.Session(graph=g, config=cfg) as sess:
-        ans = sess.run([y, dx], {x: inp},
-                       run_metadata=run_metadata,
-                       options=config_pb2.RunOptions(
-                           trace_level=config_pb2.RunOptions.FULL_TRACE))
+        ans = sess.run(
+            [y, dx], {x: inp},
+            run_metadata=run_metadata,
+            options=config_pb2.RunOptions(
+                trace_level=config_pb2.RunOptions.FULL_TRACE))
         print(ans[0], np.sum(ans[1]))
         self.assertAllClose(ans[0], 255.971, rtol=1e-3)
         self.assertAllClose(np.sum(ans[1]), 13.0408, rtol=1e-3)
@@ -1275,8 +1309,7 @@ class ModuleFunctionTest(test.TestCase):
   def testBasic(self):
     with ops.Graph().as_default():
       a, b, c, d, e = [
-          constant_op.constant(
-              [[_]], dtype=dtypes.float32) for _ in range(5)
+          constant_op.constant([[_]], dtype=dtypes.float32) for _ in range(5)
       ]
       y = Linear(a, b, c)
       z = Linear2(a, b, c, d, e)
@@ -1295,7 +1328,8 @@ class VariableHoistingTest(test.TestCase):
           initializer=init_ops.random_uniform_initializer(seed=312),
           use_resource=use_resource)
       b = variable_scope.get_variable(
-          "b", (64), initializer=init_ops.zeros_initializer(),
+          "b", (64),
+          initializer=init_ops.zeros_initializer(),
           use_resource=use_resource),
       return math_ops.sigmoid(math_ops.matmul(x, w) + b)
 
@@ -1354,5 +1388,6 @@ class VariableHoistingTest(test.TestCase):
     self._testSimpleModel(True, use_resource=True)
     self._testSimpleModel(False, use_resource=True)
 
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/framework/graph_io.py b/tensorflow/python/framework/graph_io.py
index f909bcd62d2912707d869891c7d83c0525537852..a0ea4ad48eb84b22f42ea840513ebefbf6b4abbe 100644
--- a/tensorflow/python/framework/graph_io.py
+++ b/tensorflow/python/framework/graph_io.py
@@ -29,7 +29,7 @@ from tensorflow.python.lib.io import file_io
 def write_graph(graph_or_graph_def, logdir, name, as_text=True):
   """Writes a graph proto to a file.
 
-  The graph is written as a binary proto unless `as_text` is `True`.
+  The graph is written as a text proto unless `as_text` is `False`.
 
   ```python
   v = tf.Variable(0, name='my_variable')
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 025e2136206e3206b5d1a822b8e3e51cc9637d23..eec7c4a463d9f707e489620038fdff6a801256e2 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import contextlib
 import copy
 
@@ -311,9 +312,10 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
           compute_shapes=False, compute_device=False,
           op_def=op_def)
 
-    # Maps from a node to the op it is colocated with, if colocation
+    # Maps from a node to the ops it is colocated with, if colocation
     # is specified in the attributes.
-    colocation_pairs = {}
+    colocation_pairs = collections.defaultdict(list)
+
     # 2. Add inputs to the operations.
     for node in graph_def.node:
       op = name_to_op[node.name]
@@ -339,7 +341,7 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
                   'loc:@' + original_op.name))
               if op_to_bind_to != node.name:
                 # Keep track of this mapping for a later phase.
-                colocation_pairs[op] = original_op
+                colocation_pairs[op].append(original_op)
                 # Don't apply this op's device function,
                 # the colocation constraint will ensure
                 # the proper device gets assigned at runtime.
@@ -474,21 +476,40 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
     # The following loop populates the device field of ops that are
     # colocated with another op.  This is implied by the colocation
     # attribute, but we propagate the device field for completeness.
-    for op, coloc_op in colocation_pairs.items():
-      # If the colocation op has no device, even after a device
-      # application, there's nothing to do here.
-      if not coloc_op.device:
-        continue
-      coloc_device = pydev.DeviceSpec.from_string(coloc_op.device)
-      op._set_device(coloc_device)  # pylint: disable=protected-access
-
-    # Treat unused input mappings as an error, because they are likely to be
-    # due to a typo.
-    unused_input_keys = frozenset(input_map.keys()).difference(used_input_keys)
-    if unused_input_keys:
+    for op, coloc_op_list in colocation_pairs.items():
+      coloc_device = None
+      # Find any device in the list of colocated ops that have a
+      # device, if it exists.  We assume that if multiple ops
+      # have devices, they refer to the same device.  Otherwise, a
+      # runtime error will occur since the colocation property
+      # cannot be guaranteed.
+      #
+      # One possible improvement is to try to check for compatibility
+      # of all devices in this list at import time here, which would
+      # require implementing a compatibility function for device specs
+      # in python.
+      for coloc_op in coloc_op_list:
+        if coloc_op.device:
+          coloc_device = pydev.DeviceSpec.from_string(coloc_op.device)
+          break
+      if coloc_device:
+        op._set_device(coloc_device)  # pylint: disable=protected-access
+
+    # Treat input mappings that don't appear in the graph as an error,
+    # because they are likely to be due to a typo.
+    def _IsImportedNodeOutput(tensor_name):
+      operation_name, output_index = _ParseTensorName(tensor_name)
+      try:
+        return output_index < len(name_to_op[operation_name].outputs)
+      except KeyError:
+        return False
+    absent_input_keys = [
+        k for k in frozenset(input_map.keys()).difference(used_input_keys)
+        if not _IsImportedNodeOutput(k)]
+    if absent_input_keys:
       raise ValueError(
           'Attempted to map inputs that were not found in graph_def: [%s]'
-          % ', '.join(unused_input_keys))
+          % ', '.join(absent_input_keys))
 
     if return_elements is None:
       return None
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index 7fdbcfd8561ef04c3ec8ae544b8df23bd371f1ff..cfba6af5232a85c0219a6a4ff45d9b843302bc64 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -548,6 +548,24 @@ class ImportGraphDefTest(test.TestCase):
             input_map={"B:0": constant_op.constant(5.0)})
       self.assertTrue("not found in graph_def: [B:0]" in str(e.exception))
 
+  def testInputMapUnusedAsInput(self):
+    with ops.Graph().as_default():
+      # Mapping an unused node output should succeed.
+      importer.import_graph_def(
+          self._MakeGraphDef("""
+          node { name: 'A' op: 'Oi' }
+          """),
+          input_map={"A:0": constant_op.constant(5.0)})
+
+      # Mapping a non-existent output of an existing node should fail.
+      with self.assertRaises(ValueError) as e:
+        importer.import_graph_def(
+            self._MakeGraphDef("""
+            node { name: 'A' op: 'Oi' }
+            """),
+            input_map={"A:2": constant_op.constant(5.0)})
+      self.assertTrue("not found in graph_def: [A:2]" in str(e.exception))
+
   def testInputMapTypeMismatch(self):
     with ops.Graph().as_default():
       with self.assertRaises(ValueError) as e:
@@ -682,6 +700,42 @@ class ImportGraphDefTest(test.TestCase):
                   key: '_class' value { list { s: 'loc:@imported_graph/A' } }
           } }""", b.graph.as_graph_def())
 
+  def testMultipleColocationWithDeviceFn(self):
+    original_graph_def = self._MakeGraphDef("""
+          node { name: 'A' op: 'None'}
+          node { name: 'B' op: 'None'}
+          node { name: 'C' op: 'None'  attr {
+            key: '_class'
+            value { list { s: 'loc:@A' s: 'loc:@B' } }
+          } }""")
+
+    # A device function that places "B" on a device, and "A" is empty.
+    #
+    # B and C should contain "/device:B".  A will not right now.  But
+    # because of the colocation property, at runtime it would be
+    # placed with B and C.
+    def CustomDeviceFn(op):
+      if "B" in op.name:
+        return "/device:B:0"
+      return ""
+
+    with ops.Graph().as_default():
+      with ops.device(CustomDeviceFn):
+        c, = importer.import_graph_def(
+            original_graph_def, return_elements=["C"], name="imported_graph")
+
+      self.assertProtoEqualsVersion("""
+          node { name: 'imported_graph/A' op: 'None' }
+          node { name: 'imported_graph/B' op: 'None' device: "/device:B:0" }
+          node { name: 'imported_graph/C' op: 'None' device: "/device:B:0"
+                 attr {
+                   key: '_class' value {
+                     list { s: 'loc:@imported_graph/A'
+                            s: 'loc:@imported_graph/B' }
+                   }
+                 }
+               }""", c.graph.as_graph_def())
+
   def testNamePrefixColocationAttrsMultipleImport(self):
     original_graph_def = self._MakeGraphDef("""
           node { name: 'A' op: 'None' }
diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py
index 44ad6a7c3bfb032d81ea5adc176003014efd6339..909e6d4c7be76743211d4c9045706fce62d4910e 100644
--- a/tensorflow/python/framework/load_library.py
+++ b/tensorflow/python/framework/load_library.py
@@ -52,19 +52,8 @@ def load_op_library(library_filename):
   Raises:
     RuntimeError: when unable to load the library or get the python wrappers.
   """
-  status = py_tf.TF_NewStatus()
-
-  lib_handle = py_tf.TF_LoadLibrary(library_filename, status)
-  try:
-    error_code = py_tf.TF_GetCode(status)
-    if error_code != 0:
-      error_msg = compat.as_text(py_tf.TF_Message(status))
-      # pylint: disable=protected-access
-      raise errors_impl._make_specific_exception(
-          None, None, error_msg, error_code)
-      # pylint: enable=protected-access
-  finally:
-    py_tf.TF_DeleteStatus(status)
+  with errors_impl.raise_exception_on_not_ok_status() as status:
+    lib_handle = py_tf.TF_LoadLibrary(library_filename, status)
 
   op_list_str = py_tf.TF_GetOpList(lib_handle)
   op_list = op_def_pb2.OpList()
@@ -107,15 +96,5 @@ def load_file_system_library(library_filename):
   Raises:
     RuntimeError: when unable to load the library.
   """
-  status = py_tf.TF_NewStatus()
-  lib_handle = py_tf.TF_LoadLibrary(library_filename, status)
-  try:
-    error_code = py_tf.TF_GetCode(status)
-    if error_code != 0:
-      error_msg = compat.as_text(py_tf.TF_Message(status))
-      # pylint: disable=protected-access
-      raise errors_impl._make_specific_exception(
-          None, None, error_msg, error_code)
-      # pylint: enable=protected-access
-  finally:
-    py_tf.TF_DeleteStatus(status)
+  with errors_impl.raise_exception_on_not_ok_status() as status:
+    lib_handle = py_tf.TF_LoadLibrary(library_filename, status)
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 46417c2324669fc65be1891c8c65d5d7f2ba200e..e70716d3168df13ae9d5f1ef5d46c7adc5238cbf 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -27,12 +27,14 @@ import sys
 import threading
 
 import six
+from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.framework import function_pb2
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
 from tensorflow.core.framework import versions_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
+from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -1050,7 +1052,7 @@ def _device_string(dev_spec):
     return dev_spec
 
 
-def _NodeDef(op_type, name, device=None, attrs=None):
+def _NodeDef(op_type, name, device=None, attrs=None):  # pylint: disable=redefined-outer-name
   """Create a NodeDef proto.
 
   Args:
@@ -1169,7 +1171,7 @@ class Operation(object):
       a._add_consumer(self)  # pylint: disable=protected-access
     if output_types is None:
       output_types = []
-    self._output_types = output_types
+    self._output_types_val = output_types
     self._outputs = [Tensor(self, i, output_type)
                      for i, output_type in enumerate(output_types)]
     if input_types is None:
@@ -1182,7 +1184,7 @@ class Operation(object):
                             self.node_def.name,
                             [i.dtype for i in self._inputs],
                             input_types))
-    self._input_types = input_types
+    self._input_types_val = input_types
 
     # Build the list of control inputs.
     self._control_inputs = []
@@ -1202,7 +1204,7 @@ class Operation(object):
     self._op_def = op_def
     self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access
     # Add this op to the current control flow context:
-    self._control_flow_context = g._get_control_flow_context()
+    self._control_flow_context = g._get_control_flow_context()  # pylint: disable=protected-access
     if self._control_flow_context is not None:
       self._control_flow_context.AddOp(self)
     # NOTE(keveman): Control flow context's AddOp could be creating new ops and
@@ -1213,10 +1215,7 @@ class Operation(object):
     self._id_value = self._graph._next_id()  # pylint: disable=protected-access
     self._recompute_node_def()
 
-    if _USE_C_API:
-      assert self._graph._c_graph, (  # pylint: disable=protected-access
-          "_USE_C_API set to False when creating Graph, you may need to "
-          "manually set 'ops._USE_C_API = True' before creating the Graph")
+    if self._graph._c_graph:  # pylint: disable=protected-access
       if self._op_def:
         # TODO(skyewm): op_def_library.apply_op() flattens the incoming
         # inputs. Refactor so we don't have to do this here.
@@ -1350,7 +1349,13 @@ class Operation(object):
   @property
   def name(self):
     """The full name of this operation."""
-    return self._node_def.name
+    if self._graph._c_graph:  # pylint: disable=protected-access
+      # TODO(iga): Remove this assert after converting to C API by default.
+      # Just being a bit paranoid here.
+      assert self._node_def.name == c_api.TF_OperationName(self._c_op)
+      return c_api.TF_OperationName(self._c_op)
+    else:
+      return self._node_def.name
 
   @property
   def _id(self):
@@ -1366,15 +1371,66 @@ class Operation(object):
       assigned, or an empty string if it has not been assigned to a
       device.
     """
-    return self._node_def.device
+    if self._graph._c_graph:  # pylint: disable=protected-access
+      # TODO(iga): Remove this assert after converting to C API by default.
+      # Just being a bit paranoid here
+      assert self._node_def.device == c_api.TF_OperationDevice(self._c_op)
+      return c_api.TF_OperationDevice(self._c_op)
+    else:
+      return self._node_def.device
+
+  @property
+  def _output_types(self):
+    """List this operation's output types.
+
+    Returns:
+      List of the types of the Tensors computed by this operation.
+      Each element in the list is an integer whose value is one of
+      the TF_DataType enums defined in c_api.h
+      The length of this list indicates the number of output endpoints
+      of the operation.
+    """
+    if self._graph._c_graph:  # pylint: disable=protected-access
+      num_outputs = c_api.TF_OperationNumOutputs(self._c_op)
+      output_types = [c_api.TF_OperationOutputType(self._tf_output(i)) for
+                      i in xrange(num_outputs)]
+      # TODO(iga): Remove this assert after converting to C API by default.
+      # Just being a bit paranoid here.
+      assert self._output_types_val == output_types
+      # In all the tests we have output_types that are passed into
+      # Operation.__init__ are a list of ints (which is illegal according
+      # to the docstring), but input_types are instances of DType.
+      # This extra assert is to catch if we ever use DType for output_types.
+      if output_types:
+        assert isinstance(output_types[0], int)
+      return output_types
+    else:
+      return self._output_types_val
 
-  def _set_device(self, device):
+  def _tf_output(self, output_idx):
+    """Create and return a new TF_Output for output_idx'th output of this op."""
+    tf_output = c_api.TF_Output()
+    tf_output.oper = self._c_op
+    tf_output.index = output_idx
+    return tf_output
+
+  def _tf_input(self, input_idx):
+    """Create and return a new TF_Input for input_idx'th input of this op."""
+    tf_input = c_api.TF_Input()
+    tf_input.oper = self._c_op
+    tf_input.index = input_idx
+    return tf_input
+
+  def _set_device(self, device):  # pylint: disable=redefined-outer-name
     """Set the device of this operation.
 
     Args:
       device: string or device..  The device to set.
     """
-    assert not _USE_C_API, "Operation._set_device doesn't work with C API"
+    if _USE_C_API:
+      c_api.SetRequestedDevice(
+          self._graph._c_graph, self._c_op, _device_string(device))  # pylint: disable=protected-access
+    # TODO(nolivia): remove this line when switch to C api
     self._node_def.device = _device_string(device)
 
   def _add_input(self, tensor, dtype=None):
@@ -1390,6 +1446,8 @@ class Operation(object):
         or if input tensor type is not convertible to dtype.
       ValueError: if the Tensor is from a different graph.
     """
+    assert not self._graph._c_graph, (  # pylint: disable=protected-access
+        "Operation._add_input doesn't work with C API")
     if not isinstance(tensor, Tensor):
       raise TypeError("tensor must be a Tensor: %s" % tensor)
     _assert_same_graph(self, tensor)
@@ -1402,7 +1460,7 @@ class Operation(object):
             "Cannot convert a tensor of type %s to an input of type %s"
             % (tensor.dtype.name, dtype.name))
     self._inputs.append(tensor)
-    self._input_types.append(dtype)
+    self._input_types_val.append(dtype)
     tensor._add_consumer(self)  # pylint: disable=protected-access
     self._recompute_node_def()
 
@@ -1422,6 +1480,8 @@ class Operation(object):
         or if input tensor type is not convertible to dtype.
       ValueError: if the Tensor is from a different graph.
     """
+    assert not self._graph._c_graph, (  # pylint: disable=protected-access
+        "Operation._update_input doesn't work with C API")
     if not isinstance(tensor, Tensor):
       raise TypeError("tensor must be a Tensor: %s" % tensor)
     _assert_same_graph(self, tensor)
@@ -1436,7 +1496,7 @@ class Operation(object):
 
     self._inputs[index].consumers().remove(self)
     self._inputs[index] = tensor
-    self._input_types[index] = dtype
+    self._input_types_val[index] = dtype
     tensor._add_consumer(self)  # pylint: disable=protected-access
     self._recompute_node_def()
 
@@ -1450,6 +1510,8 @@ class Operation(object):
       TypeError: if ops is not a list of Operations.
       ValueError: if any op in ops is from a different graph.
     """
+    assert not self._graph._c_graph, (  # pylint: disable=protected-access
+        "Operation._add_control_inputs doesn't work with C API")
     if ops:
       for op in ops:
         if not isinstance(op, Operation):
@@ -1468,12 +1530,17 @@ class Operation(object):
       TypeError: if op is not an Operation.
       ValueError: if op is from a different graph.
     """
-    self._add_control_inputs([op])
+    if _USE_C_API:
+      c_api.AddControlInput(self._graph._c_graph, self._c_op, op._c_op)  # pylint: disable=protected-access
+    else:
+      self._add_control_inputs([op])
 
   # Methods below are used when building the NodeDef and Graph proto.
   def _recompute_node_def(self):
     del self._node_def.input[:]
+    # pylint: disable=protected-access
     self._node_def.input.extend([t._as_node_def_input() for t in self._inputs])
+    # pylint: enable=protected-access
     if self._control_inputs:
       self._node_def.input.extend(["^%s" % op.name for op in
                                    self._control_inputs])
@@ -1521,6 +1588,20 @@ class Operation(object):
   def _input_dtypes(self):
     return self._input_types
 
+  @property
+  def _input_types(self):
+    if self._graph._c_graph:  # pylint: disable=protected-access
+      num_inputs = c_api.TF_OperationNumInputs(self._c_op)
+      input_types = [dtypes.as_dtype(
+          c_api.TF_OperationInputType(self._tf_input(i)))
+                     for i in xrange(num_inputs)]
+      # TODO(iga): Remove this assert after converting to C API by default.
+      # Just being a bit paranoid here.
+      assert self._input_types_val == input_types
+      return input_types
+    else:
+      return self._input_types_val
+
   @property
   def control_inputs(self):
     """The `Operation` objects on which this op has a control dependency.
@@ -1535,12 +1616,31 @@ class Operation(object):
       A list of `Operation` objects.
 
     """
-    return self._control_inputs
+    if self._graph._c_graph:  # pylint: disable=protected-access
+      control_c_ops = c_api.TF_OperationGetControlInputs_wrapper(self._c_op)
+      # pylint: disable=protected-access
+      return [self.graph._get_operation_by_name_unsafe(
+          c_api.TF_OperationName(c_op)) for c_op in control_c_ops]
+      # pylint: enable=protected-access
+    else:
+      return self._control_inputs
 
   @property
   def type(self):
     """The type of the op (e.g. `"MatMul"`)."""
-    return self._node_def.op
+    if self._graph._c_graph:  # pylint: disable=protected-access
+      op_type = c_api.TF_OperationOpType(self._c_op)
+      # TODO(iga): Remove these asserts after converting to C API by default.
+      # Just being a bit paranoid here.
+      # pylint: disable=unidiomatic-typecheck
+      assert type(op_type) == type(self._node_def.op), (
+          "Expected same types %s vs %s" % (type(op_type),
+                                            type(self._node_def.op)))
+      # pylint: enable=unidiomatic-typecheck
+      assert op_type == self._node_def.op
+      return op_type
+    else:
+      return self._node_def.op
 
   @property
   def graph(self):
@@ -1979,18 +2079,6 @@ def _name_from_scope_name(name):
   return name[:-1] if name[-1] == "/" else name
 
 
-class _ScopedTF_Graph(object):
-
-  def __init__(self):
-    self.graph = c_api.TF_NewGraph()
-
-  def __del__(self):
-    # Note: when we're destructing the global context (i.e when the process is
-    # terminating) we can have already deleted other modules.
-    if c_api.TF_DeleteGraph is not None:
-      c_api.TF_DeleteGraph(self.graph)
-
-
 class Graph(object):
   """A TensorFlow computation, represented as a dataflow graph.
 
@@ -2107,7 +2195,7 @@ class Graph(object):
     # TODO(skyewm): fold as much of the above as possible into the C
     # implementation
     if _USE_C_API:
-      self._scoped_c_graph = _ScopedTF_Graph()
+      self._scoped_c_graph = c_api_util.ScopedTFGraph()
     else:
       self._scoped_c_graph = None
 
@@ -2228,6 +2316,9 @@ class Graph(object):
 
     Note that this is unrelated to the
     @{tf.Graph.graph_def_versions}.
+
+    Returns:
+       An integer version that increases as ops are added to the graph.
     """
     if self._finalized:
       return self._version
@@ -2436,7 +2527,7 @@ class Graph(object):
     return self._building_function
 
   # Helper functions to create operations.
-  def create_op(self, op_type, inputs, dtypes,
+  def create_op(self, op_type, inputs, dtypes,  # pylint: disable=redefined-outer-name
                 input_types=None, name=None, attrs=None, op_def=None,
                 compute_shapes=True, compute_device=True):
     """Creates an `Operation` in this graph.
@@ -2542,14 +2633,16 @@ class Graph(object):
           # Make this device match the device of the colocated op, to
           # provide consistency between the device and the colocation
           # property.
-          if ret.device and ret.device != colocation_op.device:
+          if (ret.device and
+              pydev.canonical_name(ret.device) !=
+              pydev.canonical_name(colocation_op.device)):
             logging.warning("Tried to colocate %s with an op %s that had "
                             "a different device: %s vs %s. "
                             "Ignoring colocation property.",
                             name, colocation_op.name,
                             ret.device, colocation_op.device)
           else:
-            ret._set_device(colocation_op.device)
+            ret._set_device(colocation_op.device)  # pylint: disable=protected-access
 
       all_colocation_groups = sorted(set(all_colocation_groups))
       ret.node_def.attr["_class"].CopyFrom(attr_value_pb2.AttrValue(
@@ -2733,6 +2826,29 @@ class Graph(object):
                       % type(name).__name__)
     return self.as_graph_element(name, allow_tensor=False, allow_operation=True)
 
+  def _get_operation_by_name_unsafe(self, name):
+    """Returns the `Operation` with the given `name`.
+
+    This is a internal unsafe version of get_operation_by_name. It skips many
+    checks and does not have user friedly error messages but runs considerably
+    faster. This method may be called concurrently from multiple threads.
+
+    Args:
+      name: The name of the `Operation` to return.
+
+    Returns:
+      The `Operation` with the given `name`.
+
+    Raises:
+      KeyError: If `name` does not correspond to an operation in this graph.
+    """
+
+    if self._finalized:
+      return self._nodes_by_name[name]
+
+    with self._lock:
+      return self._nodes_by_name[name]
+
   def get_tensor_by_name(self, name):
     """Returns the `Tensor` with the given `name`.
 
@@ -3250,7 +3366,7 @@ class Graph(object):
       device_name_or_function: The device name or function to use in
         the context.
 
-    Returns:
+    Yields:
       A context manager that specifies the default device to use for newly
       created ops.
 
@@ -3277,7 +3393,7 @@ class Graph(object):
     for device_function in reversed(self._device_function_stack):
       if device_function is None:
         break
-      op._set_device(device_function(op))
+      op._set_device(device_function(op))  # pylint: disable=protected-access
 
   # pylint: disable=g-doc-return-or-yield
   @tf_contextlib.contextmanager
@@ -3825,6 +3941,9 @@ class _DefaultStack(threading.local):
   def reset(self):
     self.stack = []
 
+  def is_cleared(self):
+    return not self.stack
+
   @property
   def enforce_nesting(self):
     return self._enforce_nesting
@@ -4030,7 +4149,13 @@ def reset_default_graph():
   a `tf.Session` or `tf.InteractiveSession` is active will result in undefined
   behavior. Using any previously created `tf.Operation` or `tf.Tensor` objects
   after calling this function will result in undefined behavior.
+  Raises:
+    AssertionError: If this function is called within a nested graph.
   """
+  if not _default_graph_stack.is_cleared():
+    raise AssertionError("Do not use tf.reset_default_graph() to clear "
+                         "nested graphs. If you need a cleared graph, "
+                         "exit the nesting and create a new graph.")
   _default_graph_stack.reset()
 
 
@@ -4440,6 +4565,7 @@ def prepend_name_scope(name, import_scope):
 
 
 # pylint: disable=g-doc-return-or-yield
+# pylint: disable=not-context-manager
 @tf_contextlib.contextmanager
 def op_scope(values, name, default_name=None):
   """DEPRECATED. Same as name_scope above, just different argument order."""
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 00891d2d2199d4e0241726a8f5491274965acd07..8fb330100d54d2a2bfa1a8b6d6dcb4086e5891ba 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -34,11 +34,11 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_ops
-from tensorflow.python.framework import test_ops_2
 from tensorflow.python.framework import test_util
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resources
 from tensorflow.python.ops import variable_scope
@@ -384,6 +384,15 @@ class OperationTest(test_util.TensorFlowTestCase):
       self.assertIsInstance(x, dtypes.DType)
     self.assertEqual([dtypes.string, dtypes.double], l)
 
+  # TODO(skyewm): test adding cycles, other error cases
+  @test_util.enable_c_api
+  def testAddControlInput(self):
+    with ops.Graph().as_default():
+      x = constant_op.constant(1).op
+      y = constant_op.constant(2).op
+    y._add_control_input(x)  # pylint: disable=protected-access
+    self.assertEqual(y.control_inputs, [x])
+
 
 class CreateOpTest(test_util.TensorFlowTestCase):
 
@@ -1062,29 +1071,26 @@ class ComparisonTest(test_util.TensorFlowTestCase):
 
 class ControlDependenciesTest(test_util.TensorFlowTestCase):
 
+  @test_util.enable_c_api
   def testBasic(self):
-    ops._USE_C_API = True
-    try:
-      g = ops.Graph()
-      with g.as_default():
-        # Creating unregistered ops with _apply_op() doesn't work with the C API
-        # TODO(skyewm): address this more consistently. Possible solutions are
-        # to use registered ops in all tests, create a way to register ops in
-        # Python tests, or conditionally disable the op registration check in
-        # the C API.
-        a = constant_op.constant(1.0)
-        b = constant_op.constant(1.0)
-        with g.control_dependencies([a]):
-          c = constant_op.constant(1.0)
-          d = array_ops.identity(b)
-          e = array_ops.identity(c)
-
-      self.assertEqual(c.op.control_inputs, [a.op])
-      self.assertEqual(d.op.control_inputs, [a.op])
-      # e should be dominated by c.
-      self.assertEqual(e.op.control_inputs, [])
-    finally:
-      ops._USE_C_API = False
+    g = ops.Graph()
+    with g.as_default():
+      # Creating unregistered ops with _apply_op() doesn't work with the C API
+      # TODO(skyewm): address this more consistently. Possible solutions are
+      # to use registered ops in all tests, create a way to register ops in
+      # Python tests, or conditionally disable the op registration check in
+      # the C API.
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(1.0)
+      with g.control_dependencies([a]):
+        c = constant_op.constant(1.0)
+        d = array_ops.identity(b)
+        e = array_ops.identity(c)
+
+    self.assertEqual(c.op.control_inputs, [a.op])
+    self.assertEqual(d.op.control_inputs, [a.op])
+    # e should be dominated by c.
+    self.assertEqual(e.op.control_inputs, [])
 
   def testBasicWithConversion(self):
     g = ops.Graph()
@@ -1314,6 +1320,12 @@ class GraphTest(test_util.TensorFlowTestCase):
   def _AssertDefault(self, expected):
     self.assertIs(expected, ops.get_default_graph())
 
+  def testResetDefaultGraphNesting(self):
+    g0 = ops.Graph()
+    with self.assertRaises(AssertionError):
+      with g0.as_default():
+        ops.reset_default_graph()
+
   def testGraphContextManager(self):
     g0 = ops.Graph()
     with g0.as_default() as g1:
@@ -1550,6 +1562,21 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     self.assertEqual([b"loc:@a"], b.op.colocation_groups())
     self.assertEqual(a.op.device, b.op.device)
 
+  def testColocationCanonicalization(self):
+    with ops.device("/gpu:0"):
+      _ = constant_op.constant(2.0)
+    with ops.device(lambda op: "/gpu:0"):
+      b = constant_op.constant(3.0)
+    with ops.get_default_graph().colocate_with(b):
+      with ops.device("/gpu:0"):
+        c = constant_op.constant(4.0)
+
+    # A's device will be /gpu:0
+    # B's device will be /device:GPU:0
+    # C's device will be /device:GPU:0 because it
+    # inherits B's device name, after canonicalizing the names.
+    self.assertEqual(b.op.device, c.op.device)
+
   def testLocationOverrides(self):
     with ops.device("/cpu:0"):
       with ops.device("/gpu:0"):
@@ -1772,5 +1799,97 @@ class TracebackTest(test_util.TensorFlowTestCase):
           self.assertEquals(frame, frame_with_start_line[:-1])
 
 
+class OutputTypesTest(test_util.TensorFlowTestCase):
+  """Tests Operation._output_types property.
+
+  This test should not exist as _output_types is a private property.
+  This property is used by util.copy_elements and its tests would normally
+  cover Operation._output_types. However, we can't yet run these tests in C
+  API mode because their use _set_device method. This test will be deleted
+  once we port _set_device and run the copy tests with C API on.
+  """
+  # TODO(iga): Remove this test
+
+  def setUp(self):
+    self.prev_use_c_api = ops._USE_C_API  # pylint: disable=protected-access
+    ops._USE_C_API = True  # pylint: disable=protected-access
+
+  def tearDown(self):
+    ops._USE_C_API = self.prev_use_c_api  # pylint: disable=protected-access
+
+  def testOneOutput(self):
+    g = ops.Graph()
+    with g.as_default():
+      # Using a constant because creating unregistered ops
+      # doesn't work with the C API.
+      op = constant_op.constant(12, dtype=dtypes.uint16).op
+      # pylint: disable=protected-access
+      self.assertEqual([types_pb2.DT_UINT16], op._output_types)
+      # pylint: enable=protected-access
+
+  def testTwoDifferentOutputs(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = constant_op.constant([1, 1, 2, 4, 4, 4, 7, 8, 8],
+                               dtype=dtypes.double)
+      y, _ = gen_array_ops.unique(x)
+      self.assertEqual([types_pb2.DT_DOUBLE, types_pb2.DT_INT32],
+                       y.op._output_types)  # pylint: disable=protected-access
+
+  def testThreeOutputs(self):
+    g = ops.Graph()
+    with g.as_default():
+      # Using a split operationt because creating unregistered ops
+      # doesn't work with the C API.
+      a = constant_op.constant("abc", dtype=dtypes.string, shape=[5, 30])
+      split0, _, _ = array_ops.split(a, [4, 15, 11], 1)
+      # pylint: disable=protected-access
+      self.assertEqual([types_pb2.DT_STRING] * 3, split0.op._output_types)
+      # pylint: enable=protected-access
+
+
+class InputTypesTest(test_util.TensorFlowTestCase):
+  """Tests Operation._input_dtypes and Operation._input_types properties.
+
+  This test should not exist as _input_types is a private property.
+  This property is used by many tests that would normally cover its
+  behavior. However, we can't yet run these tests in C
+  API mode because they use _set_device method. This test will be deleted
+  once we port _set_device.
+  """
+  # TODO(iga): Remove this test
+
+  def setUp(self):
+    self.prev_use_c_api = ops._USE_C_API  # pylint: disable=protected-access
+    ops._USE_C_API = True  # pylint: disable=protected-access
+
+  def tearDown(self):
+    ops._USE_C_API = self.prev_use_c_api  # pylint: disable=protected-access
+
+  def testZeroInputs(self):
+    g = ops.Graph()
+    with g.as_default():
+      # Using a constant because creating unregistered ops
+      # doesn't work with the C API.
+      op = constant_op.constant(12, dtype=dtypes.uint16).op
+      # pylint: disable=protected-access
+      self.assertEqual([], op._input_types)
+      self.assertEqual([], op._input_dtypes)
+      # pylint: enable=protected-access
+
+  def testTwoInputs(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = constant_op.constant(1.0, dtype=dtypes.double)
+      y = constant_op.constant(2.0, dtype=dtypes.double)
+      z = math_ops.multiply(x, y)
+      # pylint: disable=protected-access
+      self.assertTrue(isinstance(z.op._input_types[0], dtypes.DType))
+      self.assertTrue(isinstance(z.op._input_types[1], dtypes.DType))
+      self.assertEqual([dtypes.double, dtypes.double], z.op._input_types)
+      self.assertEqual([dtypes.double, dtypes.double], z.op._input_dtypes)
+      # pylint: enable=protected-access
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index 142bd53573828d7158277bc4c442c0f7c54387fd..090436aebf7369bc67ad3be70ec52cf29da3529c 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -682,8 +682,13 @@ void GenPythonOp::AddDocStringOutputs() {
 }
 
 void GenPythonOp::AddBody(const string& prefix) {
+  AddBodyNoReturn(prefix);
+  strings::StrAppend(&result_, prefix, "return _result\n");
+}
+
+void GenPythonOp::AddBodyNoReturn(const string& prefix) {
   string return_prefix =
-      strings::StrCat(prefix, "result = _op_def_lib.apply_op(");
+      strings::StrCat(prefix, "_result = _op_def_lib.apply_op(");
   string return_args = strings::StrCat("\"", op_def_.name(), "\", ");
   for (size_t i = 0; i < param_names_.size(); ++i) {
     strings::StrAppend(&return_args, param_names_[i], "=", param_names_[i],
@@ -695,11 +700,9 @@ void GenPythonOp::AddBody(const string& prefix) {
                      // Wrap the arguments, and indent to the (.
                      WordWrap(return_prefix, return_args, kRightMargin), "\n");
 
-  if (num_outs_ <= 1) {
-    strings::StrAppend(&result_, prefix, "return result\n");
-  } else {
-    strings::StrAppend(&result_, prefix, "return _", op_def_.name(),
-                       "Output._make(result)\n");
+  if (num_outs_ > 1) {
+    strings::StrAppend(&result_, prefix, "_result = _", op_def_.name(),
+                       "Output._make(_result)\n");
   }
 }
 
diff --git a/tensorflow/python/framework/python_op_gen_internal.h b/tensorflow/python/framework/python_op_gen_internal.h
index 44b1aed71f1102ce5659b250fff0a101f6e7e14a..d588f362d82edbab0aeae02f058adc1db0df2ea5 100644
--- a/tensorflow/python/framework/python_op_gen_internal.h
+++ b/tensorflow/python/framework/python_op_gen_internal.h
@@ -59,6 +59,7 @@ class GenPythonOp {
   void AddOutputGlobals();
   void AddDocStringOutputs();
   void AddBody(const string& prefix);
+  void AddBodyNoReturn(const string& prefix);
 
   // From constructor arguments
   const OpDef& op_def_;
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index 3aedbfef0d592885f2ef5a6d48be668fe0ee0abf..66c05335b4f4321730f8a5b24c4f4378e4ef06e2 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -365,7 +365,7 @@ class Dimension(object):
 def as_dimension(value):
   """Converts the given value to a Dimension.
 
-  A Dimenson input will be returned unmodified.
+  A Dimension input will be returned unmodified.
   An input of `None` will be converted to an unknown Dimension.
   An integer input will be converted to a Dimension with that value.
 
@@ -736,6 +736,36 @@ class TensorShape(object):
     if not self.is_compatible_with(other):
       raise ValueError("Shapes %s and %s are incompatible" % (self, other))
 
+  def most_specific_compatible_shape(self, other):
+    """Returns the most specific TensorShape compatible with `self` and `other`.
+
+    * TensorShape([None, 1]) is the most specific TensorShape compatible with
+      both TensorShape([2, 1]) and TensorShape([5, 1]). Note that
+      TensorShape(None) is also compatible with above mentioned TensorShapes.
+
+    * TensorShape([1, 2, 3]) is the most specific TensorShape compatible with
+      both TensorShape([1, 2, 3]) and TensorShape([1, 2, 3]). There are more
+      less specific TensorShapes compatible with above mentioned TensorShapes,
+      e.g. TensorShape([1, 2, None]), TensorShape(None).
+
+    Args:
+      other: Another `TensorShape`.
+
+    Returns:
+      A `TensorShape` which is the most specific compatible shape of `self`
+      and `other`.
+    """
+
+    other = as_shape(other)
+    if self._dims is None or other.dims is None or self.ndims != other.ndims:
+      return unknown_shape()
+
+    dims = [(Dimension(None))] * self.ndims
+    for i, (d1, d2) in enumerate(zip(self._dims, other.dims)):
+      if d1 is not None and d2 is not None and d1 == d2:
+        dims[i] = d1
+    return TensorShape(dims)
+
   def is_fully_defined(self):
     """Returns True iff `self` is fully defined in every dimension."""
     return (self._dims is not None and all(dim.value is not None
diff --git a/tensorflow/python/framework/tensor_shape_test.py b/tensorflow/python/framework/tensor_shape_test.py
index 0ae8d6b8217721e05d6a0f198e99f7334e488393..fffd86c7a6241b8be92ad33852da244ab9b5284d 100644
--- a/tensorflow/python/framework/tensor_shape_test.py
+++ b/tensorflow/python/framework/tensor_shape_test.py
@@ -275,6 +275,26 @@ class ShapeTest(test_util.TensorFlowTestCase):
         tensor_shape.TensorShape([1, 2]).concatenate(
             tensor_shape.Dimension(3)))
 
+  def _testMostSpecificCompatibleShapeHelper(self, x, y, expected):
+    mcs = tensor_shape.TensorShape(x).most_specific_compatible_shape(
+        tensor_shape.TensorShape(y))
+    mcs_dims = mcs.dims
+    if expected is None or mcs_dims is None:
+      self.assertIs(expected, mcs_dims)
+    else:
+      self.assertEqual(expected, mcs.as_list())
+
+  def testMostSpecificCompatibleShape(self):
+    self._testMostSpecificCompatibleShapeHelper([1, 2], None, None)
+    self._testMostSpecificCompatibleShapeHelper(None, [1, 2], None)
+    self._testMostSpecificCompatibleShapeHelper([1, 2], [1, 2, 3, 4], None)
+    self._testMostSpecificCompatibleShapeHelper([1, 2, 3, 4], [1, 2], None)
+    self._testMostSpecificCompatibleShapeHelper([1, 2], [1, 2], [1, 2])
+    self._testMostSpecificCompatibleShapeHelper([None, 2, 3], [1, 1, 3],
+                                                [None, None, 3])
+    self._testMostSpecificCompatibleShapeHelper([1, 1, 3], [None, 2, 3],
+                                                [None, None, 3])
+
   def testHelpers(self):
     tensor_shape.TensorShape([]).assert_is_compatible_with(
         tensor_shape.scalar())
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 10811100010614a8b0b18cd4f24ddd5dcc5bb542..323802e57fe12686188e798ace5ff475a06d81c2 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -770,13 +770,46 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
       # and concatenate it with `ret`.
       ret = ret.concatenate(constant_value_as_shape(concat_input))
     return ret
-  else:
-    ret = tensor_shape.unknown_shape(shape[0].value)
-    value = constant_value(tensor)
-    if value is not None:
-      ret = ret.merge_with(tensor_shape.TensorShape(
-          [d if d != -1 else None for d in value]))
-    return ret
+  elif tensor.op.type == "StridedSlice":
+    try:
+      begin = constant_value(tensor.op.inputs[1])
+      end = constant_value(tensor.op.inputs[2])
+      strides = constant_value(tensor.op.inputs[3])
+      if begin is not None and end is not None and strides is not None:
+        begin = begin[0]
+        end = end[0]
+        strides = strides[0]
+        begin_mask = tensor.op.get_attr("begin_mask")
+        if begin_mask == 1:
+          begin = None
+        end_mask = tensor.op.get_attr("end_mask")
+        if end_mask == 1:
+          end = None
+
+        ellipsis_mask = tensor.op.get_attr("ellipsis_mask")
+        new_axis_mask = tensor.op.get_attr("new_axis_mask")
+        shrink_axis_mask = tensor.op.get_attr("shrink_axis_mask")
+        valid_attributes = (not ellipsis_mask and not new_axis_mask and
+                            not shrink_axis_mask and
+                            (not begin_mask or (begin_mask == 1)) and
+                            (not end_mask or (end_mask == 1)))
+        if valid_attributes:  # additional inputs not supported
+          prev = constant_value_as_shape(tensor.op.inputs[0])
+          prev = prev[begin:end:strides]
+          ret = tensor_shape.TensorShape(prev)
+          return ret
+
+    except ValueError:  # Could come from get_attr or slicing prev.
+      pass
+    except TypeError:  # Could come from slicing prev.
+      pass
+
+  ret = tensor_shape.unknown_shape(shape[0].value)
+  value = constant_value(tensor)
+  if value is not None:
+    ret = ret.merge_with(tensor_shape.TensorShape(
+        [d if d >= 0 else None for d in value]))
+  return ret
 
 
 def is_tensor(x):  # pylint: disable=invalid-name
diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py
index 8949702b8752796c0de796b37959e355c7fe12ef..b0a117a21e09bfcf43612ee758f2e6f7fedd6b87 100644
--- a/tensorflow/python/framework/tensor_util_test.py
+++ b/tensorflow/python/framework/tensor_util_test.py
@@ -832,6 +832,83 @@ class ConstantValueAsShapeTest(test.TestCase):
     c_val = tensor_util.constant_value_as_shape(tf_val)
     self.assertEqual([16, 37, None, 48], c_val.as_list())
 
+  def testSlice(self):
+    tf_val = array_ops.placeholder(dtypes.int32, shape=(4,))[0:2]
+    c_val = tensor_util.constant_value_as_shape(tf_val)
+    self.assertEqual([None, None], c_val.as_list())
+
+    # begin:end
+    tf_val = constant_op.constant([10, 20, 30])[1:3]
+    c_val = tensor_util.constant_value_as_shape(tf_val)
+    self.assertEqual([20, 30], c_val.as_list())
+
+    # begin:end:stride
+    tf_val = array_ops.strided_slice(
+        constant_op.constant([10, 20, 30]), [1], [3], strides=[2])
+    c_val = tensor_util.constant_value_as_shape(tf_val)
+    self.assertEqual([20], c_val.as_list())
+
+    # [1, 2, 16, 37, None, 48]
+    tf_val_orig = array_ops.concat(
+        [[1, 2, 16, 37], array_ops.placeholder(
+            dtypes.int32, shape=(1,)), [48]], 0)
+
+    # begin: no end
+    tf_val = tf_val_orig[2:]
+    c_val = tensor_util.constant_value_as_shape(tf_val)
+    self.assertEqual([16, 37, None, 48], c_val.as_list())
+
+    # begin::negative slice
+    tf_val = tf_val_orig[2::-1]
+    c_val = tensor_util.constant_value_as_shape(tf_val)
+    self.assertEqual([16, 2, 1], c_val.as_list())
+
+    # :end:negative slice
+    tf_val = tf_val_orig[:1:-2]
+    c_val = tensor_util.constant_value_as_shape(tf_val)
+    self.assertEqual([48, 37], c_val.as_list())
+
+    # begin:end:negative slice
+    tf_val = tf_val_orig[3:1:-1]
+    c_val = tensor_util.constant_value_as_shape(tf_val)
+    self.assertEqual([37, 16], c_val.as_list())
+
+    # begin:negative end:slice
+    tf_val = tf_val_orig[1:-3:1]
+    c_val = tensor_util.constant_value_as_shape(tf_val)
+    self.assertEqual([2, 16], c_val.as_list())
+
+    # negative begin::slice
+    tf_val = tf_val_orig[-3::1]
+    c_val = tensor_util.constant_value_as_shape(tf_val)
+    self.assertEqual([37, None, 48], c_val.as_list())
+
+    # negative begin::negative slice
+    tf_val = tf_val_orig[-3::-1]
+    c_val = tensor_util.constant_value_as_shape(tf_val)
+    self.assertEqual([37, 16, 2, 1], c_val.as_list())
+
+    # negative begin:negative end:negative slice
+    tf_val = tf_val_orig[-3:-5:-1]
+    c_val = tensor_util.constant_value_as_shape(tf_val)
+    self.assertEqual([37, 16], c_val.as_list())
+
+    # Do not support shape inference for additional arguments
+    tf_val = constant_op.constant([10, 20, 30])[...]
+    c_val = tensor_util.constant_value_as_shape(tf_val)
+    self.assertEqual([None, None, None], c_val.as_list())
+
+    # Do not support shape inference for tensor slices.
+    tf_val = constant_op.constant([10, 20, 30])[
+        array_ops.placeholder(dtypes.int32, shape=()):]
+    c_val = tensor_util.constant_value_as_shape(tf_val)
+    self.assertEqual(tensor_shape.unknown_shape(), c_val)
+
+    # Do not support shape inference for higher rank
+    with self.assertRaises(ValueError):
+      tf_val = constant_op.constant([[10], [20], [30]])[:, 0:]
+      c_val = tensor_util.constant_value_as_shape(tf_val)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/framework/test_ops.cc b/tensorflow/python/framework/test_ops.cc
index 275f9bec25a52b85c734d9d466771f9051af8a86..d22b5b3e25bb61782d01d0bfbc15f55b737a5a99 100644
--- a/tensorflow/python/framework/test_ops.cc
+++ b/tensorflow/python/framework/test_ops.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/resource_handle.pb.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/public/version.h"
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index dd2fd7b2abe413d3d74e114684f937d9c144da00..96a1cd1410fa1e8d2f21eeac04d52cd9257c49dd 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -227,6 +227,19 @@ def NCHWToNHWC(input_tensor):
     return [input_tensor[a] for a in new_axes[ndims]]
 
 
+# TODO(skyewm): remove this eventually
+# pylint: disable=protected-access
+def _use_c_api_wrapper(fn, use_c_api, *args, **kwargs):
+  prev_value = ops._USE_C_API
+  ops._USE_C_API = use_c_api
+  try:
+    with ops.Graph().as_default():
+      fn(*args, **kwargs)
+  finally:
+    ops._USE_C_API = prev_value
+# pylint: disable=protected-access
+
+
 # TODO(skyewm): remove this eventually
 def disable_c_api(fn):
   """Decorator for disabling the C API on a test.
@@ -240,16 +253,23 @@ def disable_c_api(fn):
   Returns:
     The wrapped function
   """
-  # pylint: disable=protected-access
-  def disable_c_api_wrapper(*args, **kwargs):
-    prev_value = ops._USE_C_API
-    ops._USE_C_API = False
-    try:
-      fn(*args, **kwargs)
-    finally:
-      ops._USE_C_API = prev_value
-  # pylint: disable=protected-access
-  return disable_c_api_wrapper
+  return lambda *args, **kwargs: _use_c_api_wrapper(fn, False, *args, **kwargs)
+
+
+# TODO(skyewm): remove this eventually
+def enable_c_api(fn):
+  """Decorator for enabling the C API on a test.
+
+  Note this enables the C API after running the test class's setup/teardown
+  methods.
+
+  Args:
+    fn: the function to be wrapped
+
+  Returns:
+    The wrapped function
+  """
+  return lambda *args, **kwargs: _use_c_api_wrapper(fn, True, *args, **kwargs)
 
 
 class TensorFlowTestCase(googletest.TestCase):
@@ -266,6 +286,13 @@ class TensorFlowTestCase(googletest.TestCase):
     self._ClearCachedSession()
     random.seed(random_seed.DEFAULT_GRAPH_SEED)
     np.random.seed(random_seed.DEFAULT_GRAPH_SEED)
+    # Note: The following line is necessary because some test methods may error
+    # out from within nested graph contexts (e.g., via assertRaises and
+    # assertRaisesRegexp), which may leave ops._default_graph_stack non-empty
+    # under certain versions of Python. That would cause
+    # ops.reset_default_graph() to throw an exception if the stack were not
+    # cleared first.
+    ops._default_graph_stack.reset()  # pylint: disable=protected-access
     ops.reset_default_graph()
     ops.get_default_graph().seed = random_seed.DEFAULT_GRAPH_SEED
 
@@ -382,16 +409,17 @@ class TensorFlowTestCase(googletest.TestCase):
     `force_gpu and `use_gpu` are False, all ops are pinned to the CPU.
 
     Example:
-
-      class MyOperatorTest(test_util.TensorFlowTestCase):
-        def testMyOperator(self):
-          with self.test_session(use_gpu=True):
-            valid_input = [1.0, 2.0, 3.0, 4.0, 5.0]
-            result = MyOperator(valid_input).eval()
-            self.assertEqual(result, [1.0, 2.0, 3.0, 5.0, 8.0]
-            invalid_input = [-1.0, 2.0, 7.0]
-            with self.assertRaisesOpError("negative input not supported"):
-              MyOperator(invalid_input).eval()
+    ```python
+    class MyOperatorTest(test_util.TensorFlowTestCase):
+      def testMyOperator(self):
+        with self.test_session(use_gpu=True):
+          valid_input = [1.0, 2.0, 3.0, 4.0, 5.0]
+          result = MyOperator(valid_input).eval()
+          self.assertEqual(result, [1.0, 2.0, 3.0, 5.0, 8.0]
+          invalid_input = [-1.0, 2.0, 7.0]
+          with self.assertRaisesOpError("negative input not supported"):
+            MyOperator(invalid_input).eval()
+    ```
 
     Args:
       graph: Optional graph to use during the returned session.
@@ -626,7 +654,7 @@ class TensorFlowTestCase(googletest.TestCase):
       print("not close dif = ", np.abs(x - y))
       print("not close tol = ", atol + rtol * np.abs(y))
       print("dtype = %s, shape = %s" % (a.dtype, a.shape))
-      np.testing.assert_allclose(a, b, rtol=rtol, atol=atol, err_msg=msg)
+      np.testing.assert_allclose(b, a, rtol=rtol, atol=atol, err_msg=msg)
 
   def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6):
     """Asserts that two numpy arrays, or dicts of same, have near values.
@@ -634,10 +662,10 @@ class TensorFlowTestCase(googletest.TestCase):
     This does not support nested dicts.
 
     Args:
-      a: A numpy ndarray (or anything can be converted to one), or dict of same.
-        Must be a dict iff `b` is a dict.
-      b: A numpy ndarray (or anything can be converted to one), or dict of same.
-        Must be a dict iff `a` is a dict.
+      a: The expected numpy ndarray (or anything can be converted to one), or
+        dict of same. Must be a dict iff `b` is a dict.
+      b: The actual numpy ndarray (or anything can be converted to one), or
+        dict of same. Must be a dict iff `a` is a dict.
       rtol: relative tolerance.
       atol: absolute tolerance.
 
@@ -673,8 +701,8 @@ class TensorFlowTestCase(googletest.TestCase):
     one of the arguments is of type float16.
 
     Args:
-      a: a numpy ndarray or anything can be converted to one.
-      b: a numpy ndarray or anything can be converted to one.
+      a: the expected numpy ndarray or anything can be converted to one.
+      b: the actual numpy ndarray or anything can be converted to one.
       rtol: relative tolerance.
       atol: absolute tolerance.
       float_rtol: relative tolerance for float32.
@@ -698,8 +726,8 @@ class TensorFlowTestCase(googletest.TestCase):
     """Asserts that two numpy arrays have the same values.
 
     Args:
-      a: a numpy ndarray or anything can be converted to one.
-      b: a numpy ndarray or anything can be converted to one.
+      a: the expected numpy ndarray or anything can be converted to one.
+      b: the actual numpy ndarray or anything can be converted to one.
     """
     a = self._GetNdArray(a)
     b = self._GetNdArray(b)
@@ -721,7 +749,7 @@ class TensorFlowTestCase(googletest.TestCase):
         x, y = a, b
       print("not equal lhs = ", x)
       print("not equal rhs = ", y)
-      np.testing.assert_array_equal(a, b)
+      np.testing.assert_array_equal(b, a)
 
   # pylint: disable=g-doc-return-or-yield
   @contextlib.contextmanager
@@ -812,7 +840,8 @@ class TensorFlowTestCase(googletest.TestCase):
     # pylint: enable=invalid-name
 
 
-def create_local_cluster(num_workers, num_ps, protocol="grpc"):
+def create_local_cluster(num_workers, num_ps, protocol="grpc",
+                         worker_config=None, ps_config=None):
   """Create and start local servers and return the associated `Server` objects.
 
   Example:
@@ -838,6 +867,9 @@ def create_local_cluster(num_workers, num_ps, protocol="grpc"):
     num_ps: Number of PS servers to start.
     protocol: Communication protocol.  Allowed values are documented in
       the documentation of `tf.train.Server`.
+    worker_config: (optional) ConfigProto to initialize workers. Can be used
+      to instantiate multiple devices etc.
+    ps_config: (optional) ConfigProto to initialize PS servers.
 
   Returns:
     A tuple `(worker_servers, ps_servers)`.  `worker_servers` is a list
@@ -859,12 +891,14 @@ def create_local_cluster(num_workers, num_ps, protocol="grpc"):
 
   workers = [
       server_lib.Server(
-          cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
+          cs, job_name="worker", protocol=protocol, task_index=ix,
+          config=worker_config, start=True)
       for ix in range(num_workers)
   ]
   ps_servers = [
       server_lib.Server(
-          cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
+          cs, job_name="ps", protocol=protocol, task_index=ix,
+          config=ps_config, start=True)
       for ix in range(num_ps)
   ]
 
diff --git a/tensorflow/python/grappler/cost_analyzer.cc b/tensorflow/python/grappler/cost_analyzer.cc
index 29976b79495ab22128416e540323818033d8165b..88bf900dca6d97773959eb309a4a3c5931fdcb88 100644
--- a/tensorflow/python/grappler/cost_analyzer.cc
+++ b/tensorflow/python/grappler/cost_analyzer.cc
@@ -30,11 +30,11 @@ CostAnalyzer::CostAnalyzer(const GrapplerItem& item, Cluster* cluster,
       analytical_estimator_(cluster, false),
       suffix_(suffix) {}
 
-Status CostAnalyzer::GenerateReport(std::ostream& os) {
+Status CostAnalyzer::GenerateReport(std::ostream& os, bool per_node_report) {
   GatherCosts();
   PreprocessCosts();
   AnalyzeCosts();
-  PrintAnalysis(os);
+  PrintAnalysis(os, per_node_report);
   return Status::OK();
 }
 
@@ -158,7 +158,7 @@ void CostAnalyzer::AnalyzeCosts() {
   }
 }
 
-void CostAnalyzer::PrintAnalysis(std::ostream& os) const {
+void CostAnalyzer::PrintAnalysis(std::ostream& os, bool per_node_report) const {
   os << std::endl;
   os << std::left << std::setw(50)
      << "Total time measured in ns (serialized): " << std::right
@@ -225,6 +225,11 @@ void CostAnalyzer::PrintAnalysis(std::ostream& os) const {
     os << std::endl;
   }
   os << std::endl;
+
+  if (per_node_report) {
+    os << "Below is the per-node report:" << std::endl;
+    os << op_perf_.DebugString();
+  }
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/python/grappler/cost_analyzer.h b/tensorflow/python/grappler/cost_analyzer.h
index 3700bf5fb37cce6a99803c6c177882a7534a7645..0e860e0fee9923510292d3cf1a8069435787476f 100644
--- a/tensorflow/python/grappler/cost_analyzer.h
+++ b/tensorflow/python/grappler/cost_analyzer.h
@@ -50,7 +50,7 @@ class CostAnalyzer {
  public:
   explicit CostAnalyzer(const GrapplerItem& item, Cluster* cluster,
                         const string& suffix);
-  Status GenerateReport(std::ostream& os);
+  Status GenerateReport(std::ostream& os, bool per_node_report);
 
  private:
   void PredictCosts(CostEstimator* cost_estimator, CostGraphDef* cost_graph,
@@ -59,7 +59,7 @@ class CostAnalyzer {
   void PreprocessCosts();
   void AnalyzeCosts();
   void SortOpsByTime(std::map<string, OpPerfSummary> ops);
-  void PrintAnalysis(std::ostream& os) const;
+  void PrintAnalysis(std::ostream& os, bool per_node_report) const;
 
   const GrapplerItem* item_;
   MeasuringCostEstimator measure_estimator_;
diff --git a/tensorflow/python/grappler/cost_analyzer.i b/tensorflow/python/grappler/cost_analyzer.i
index a51d8673c996a302626703cf0373cad2d9fbec40..1f024e439d8d4d819e5f603ae3b8a843063baeeb 100644
--- a/tensorflow/python/grappler/cost_analyzer.i
+++ b/tensorflow/python/grappler/cost_analyzer.i
@@ -42,27 +42,36 @@ limitations under the License.
 %}
 
 %{
-string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph) {
+string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph, bool
+per_node_report) {
   tensorflow::grappler::ItemConfig cfg;
+  cfg.apply_optimizations = false;
   std::unique_ptr<tensorflow::grappler::GrapplerItem> item =
       tensorflow::grappler::GrapplerItemFromMetaGraphDef("metagraph", metagraph, cfg);
-
+  if (!item) {
+    return "Error: failed to preprocess metagraph: check your log file for errors";
+  }
+  
   // TODO(bsteiner): we should wrap the tf session instead to properly handle the case of a
   // distributed setup.
   const int timeout_s = 3600;
   int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
   int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
   tensorflow::grappler::SingleMachine cluster(timeout_s, num_cpu_cores, num_gpus);
+  cluster.SetNumWarmupSteps(10);
+  cluster.AllowSoftPlacement(true);
+  cluster.DisableDetailedStats(false);
   TF_CHECK_OK(cluster.Provision());
 
   string suffix;
   tensorflow::grappler::CostAnalyzer analyzer(*item, &cluster, suffix);
 
   std::stringstream os;
-  analyzer.GenerateReport(os);
+  analyzer.GenerateReport(os, per_node_report);
   return os.str();
 }
 
 %}
 
-string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph);
+string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph, bool
+per_node_report);
diff --git a/tensorflow/python/grappler/cost_analyzer.py b/tensorflow/python/grappler/cost_analyzer.py
index d16614c7c75708e96a0b88489114367754d6da55..75c21e572719128cfd5f9a36191b5765386c43dc 100644
--- a/tensorflow/python/grappler/cost_analyzer.py
+++ b/tensorflow/python/grappler/cost_analyzer.py
@@ -22,8 +22,19 @@ from tensorflow.python import pywrap_tensorflow as tf_wrap
 from tensorflow.python.framework import errors
 
 
-def GenerateCostReport(metagraph):
-  """Analyze the cost of each TensorFlow operation in the provided metagraph."""
+def GenerateCostReport(metagraph, per_node_report=False):
+  """Analyze the cost of each TensorFlow op and node in the provided metagraph.
+
+  Args:
+    metagraph: An TensorFlow MetaGraphDef.
+    per_node_report: by default the report contains stats aggregated on a per op
+      type basis, setting per_node_report to True adds results for each
+      individual node to the report.
+
+  Returns:
+    A string of cost report.
+  """
   with errors.raise_exception_on_not_ok_status():
-    ret_from_swig = tf_wrap.GenerateCostReport(metagraph.SerializeToString())
+    ret_from_swig = tf_wrap.GenerateCostReport(metagraph.SerializeToString(),
+                                               per_node_report)
   return ret_from_swig
diff --git a/tensorflow/python/grappler/cost_analyzer_tool.py b/tensorflow/python/grappler/cost_analyzer_tool.py
new file mode 100644
index 0000000000000000000000000000000000000000..146bb4311cb5a44d5739821db19f33a41e6e9ce2
--- /dev/null
+++ b/tensorflow/python/grappler/cost_analyzer_tool.py
@@ -0,0 +1,69 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""A tool for cost analysis."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import sys
+
+from google.protobuf import text_format
+
+from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.grappler import cost_analyzer
+from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.platform import app
+from tensorflow.python.platform import gfile
+
+
+def main(_):
+  with gfile.GFile(FLAGS.input) as input_file:
+    metagraph = meta_graph_pb2.MetaGraphDef()
+    metagraph.ParseFromString(input_file.read())
+
+  if FLAGS.rewriter_config is not None:
+    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    text_format.Merge(FLAGS.rewriter_config, rewriter_config)
+    optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, metagraph)
+    metagraph.graph_def.CopyFrom(optimized_graph)
+
+  report = cost_analyzer.GenerateCostReport(metagraph, FLAGS.per_node_report)
+  print(report)
+
+
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      "--input", type=str, default=None, help="Input .meta file path.")
+  parser.add_argument(
+      "--rewriter_config",
+      type=str,
+      default=None,
+      help="Configuration for the grappler optimizers, described as a "
+      "RewriterConfig protocol buffer. Usage example 1: "
+      "--rewriter_config='optimize_tensor_layout: true "
+      "disable_model_pruning: true'. Usage example 2: "
+      "--rewriter_config='optimizers: \"constfold\" optimizers: \"layout\"'")
+  parser.add_argument(
+      "--per_node_report",
+      action="store_true",
+      help="Generate per-node report. By default the report contains stats "
+      "aggregated on a per op type basis, per_node_report adds results "
+      "for each individual node to the report.")
+  FLAGS, unparsed = parser.parse_known_args()
+  app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/grappler/memory_optimizer_test.py b/tensorflow/python/grappler/memory_optimizer_test.py
index 581f17c2ca21d2d1634bdbc695156f66dd1d4b35..4db8fa724519b36aab8cae391ae8edef457c154b 100644
--- a/tensorflow/python/grappler/memory_optimizer_test.py
+++ b/tensorflow/python/grappler/memory_optimizer_test.py
@@ -18,16 +18,23 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import training as train
 
 
-class MemoryOptimizerTest(test.TestCase):
+class MemoryOptimizerSwapTest(test.TestCase):
   """Tests the Grappler memory optimizer."""
 
   def testNoSwapping(self):
@@ -85,5 +92,51 @@ class MemoryOptimizerTest(test.TestCase):
         self.assertEqual('c', node.input[1])
 
 
+class MemoryOptimizerRecomputeTest(test.TestCase):
+
+  def _RunGraphWithConfig(self, config, batch_size=14, image_dim=12):
+    """Run a simple layered graph with conv, an intermediate op, and a ReLU."""
+    graph = ops.Graph()
+    with graph.as_default():
+      random_seed.set_random_seed(1)
+      current_activation = variable_scope.get_variable(
+          name='start', shape=[batch_size, image_dim, image_dim, 5])
+      conv_filter = variable_scope.get_variable(
+          name='filter', shape=[5, 5, 5, 5])
+      for layer_number in range(10):
+        with variable_scope.variable_scope('layer_{}'.format(layer_number)):
+          after_conv = nn.conv2d(current_activation, conv_filter, [1, 1, 1, 1],
+                                 'SAME')
+          current_activation = 2. * after_conv
+          current_activation = nn.relu(current_activation)
+      loss = math_ops.reduce_mean(current_activation)
+      optimizer = train.AdamOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
+      init_op = variables.global_variables_initializer()
+      with session.Session(config=config, graph=graph) as sess:
+        sess.run(init_op)
+        sess.run(train_op)
+        sess.run(train_op)
+        return sess.run(loss)
+
+  def _GetMemoryOptimizerConfig(self):
+    rewrite_options = rewriter_config_pb2.RewriterConfig(
+        memory_optimization=rewriter_config_pb2.RewriterConfig.HEURISTICS)
+    graph_options = config_pb2.GraphOptions(rewrite_options=rewrite_options)
+    return config_pb2.ConfigProto(graph_options=graph_options)
+
+  def testRecomputationRewritingNoErrors(self):
+    """Tests that there are no errors when we request a memory optimizer pass.
+
+    Does not test that the memory optimizer actually runs. See
+    core/grappler/optimizers/memory_optimizer_test.cc for a functional test of
+    the graph rewriting.
+    """
+    original_loss = self._RunGraphWithConfig(config_pb2.ConfigProto())
+    memory_optimized_loss = self._RunGraphWithConfig(
+        config=self._GetMemoryOptimizerConfig())
+    self.assertAllClose(original_loss, memory_optimized_loss, rtol=1e-4)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/grappler/model_analyzer.cc b/tensorflow/python/grappler/model_analyzer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4ec7620bce9462018c8b49ecb5116aa3f77f8271
--- /dev/null
+++ b/tensorflow/python/grappler/model_analyzer.cc
@@ -0,0 +1,76 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/python/grappler/model_analyzer.h"
+
+#include <iomanip>
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+
+namespace tensorflow {
+namespace grappler {
+
+ModelAnalyzer::ModelAnalyzer(const GrapplerItem& item) : item_(item) {}
+
+Status ModelAnalyzer::GenerateReport(std::ostream& os) {
+  GraphProperties properties(item_);
+  TF_RETURN_IF_ERROR(properties.InferStatically());
+
+  for (const auto& node : item_.MainOpsFanin()) {
+    PrintNodeInfo(node, properties, os);
+  }
+  for (const auto& node : item_.EnqueueOpsFanin()) {
+    PrintNodeInfo(node, properties, os);
+  }
+
+  return Status::OK();
+}
+
+void ModelAnalyzer::PrintNodeInfo(const NodeDef* node,
+                                  const GraphProperties& properties,
+                                  std::ostream& os) const {
+  os << node->name() << " [" << node->op() << "]" << std::endl;
+  if (properties.HasOutputProperties(node->name())) {
+    std::vector<OpInfo::TensorProperties> props =
+        properties.GetOutputProperties(node->name());
+    for (int i = 0; i < props.size(); ++i) {
+      const OpInfo::TensorProperties& prop = props[i];
+      os << "\t"
+         << "output " << i << " (" << DataTypeString(prop.dtype())
+         << ") has shape ";
+      if (prop.shape().unknown_rank()) {
+        os << "?";
+      } else {
+        os << "[";
+        for (int i = 0; i < prop.shape().dim_size(); ++i) {
+          if (i > 0) {
+            os << ", ";
+          }
+          if (prop.shape().dim(i).size() < 0) {
+            os << "?";
+          } else {
+            os << prop.shape().dim(i).size();
+          }
+        }
+        os << "]";
+      }
+      os << std::endl;
+    }
+  }
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/python/grappler/model_analyzer.h b/tensorflow/python/grappler/model_analyzer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a14034103ca70e59ac24d88318edc198e7d1c5f4
--- /dev/null
+++ b/tensorflow/python/grappler/model_analyzer.h
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_MODEL_ANALYZER_H_
+#define TENSORFLOW_CORE_GRAPPLER_COSTS_MODEL_ANALYZER_H_
+
+#include <iostream>
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+namespace grappler {
+struct GrapplerItem;
+class GraphProperties;
+
+// Generate a report detailing how much information is known statically for most
+// operations in the model, including output data types and output shapes.
+class ModelAnalyzer {
+ public:
+  explicit ModelAnalyzer(const GrapplerItem& item);
+  Status GenerateReport(std::ostream& os);
+
+ private:
+  void PrintNodeInfo(const NodeDef* node, const GraphProperties& properties,
+                     std::ostream& os) const;
+
+  const GrapplerItem& item_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_MODEL_ANALYZER_H_
diff --git a/tensorflow/python/grappler/model_analyzer.i b/tensorflow/python/grappler/model_analyzer.i
new file mode 100644
index 0000000000000000000000000000000000000000..d74bd37c6372733d25d2b5766a302aa1701dac17
--- /dev/null
+++ b/tensorflow/python/grappler/model_analyzer.i
@@ -0,0 +1,62 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+%include "tensorflow/python/lib/core/strings.i"
+%include "tensorflow/python/platform/base.i"
+
+%typemap(in) const tensorflow::MetaGraphDef& (tensorflow::MetaGraphDef temp) {
+  char* c_string;
+  Py_ssize_t py_size;
+  if (PyBytes_AsStringAndSize($input, &c_string, &py_size) == -1) {
+    // Python has raised an error (likely TypeError or UnicodeEncodeError).
+    SWIG_fail;
+  }
+
+  if (!temp.ParseFromString(string(c_string, py_size))) {
+    PyErr_SetString(
+        PyExc_TypeError,
+        "The MetaGraphDef could not be parsed as a valid protocol buffer");
+    SWIG_fail;
+  }
+  $1 = &temp;
+}
+
+%{
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/grappler/grappler_item_builder.h"
+#include "tensorflow/python/grappler/model_analyzer.h"
+%}
+
+%{
+string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph) {
+  tensorflow::grappler::ItemConfig cfg;
+  cfg.apply_optimizations = false;
+  std::unique_ptr<tensorflow::grappler::GrapplerItem> item =
+      tensorflow::grappler::GrapplerItemFromMetaGraphDef("metagraph", metagraph, cfg);
+  if (!item) {
+    return "Error: failed to preprocess metagraph: check your log file for errors";
+  }
+  
+  string suffix;
+  tensorflow::grappler::ModelAnalyzer analyzer(*item);
+
+  std::stringstream os;
+  analyzer.GenerateReport(os);
+  return os.str();
+}
+
+%}
+
+string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph);
diff --git a/tensorflow/python/grappler/model_analyzer.py b/tensorflow/python/grappler/model_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c852d71ad8b047f5437ca62c49a5500bc29cec60
--- /dev/null
+++ b/tensorflow/python/grappler/model_analyzer.py
@@ -0,0 +1,37 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Provides a proper python API for the symbols exported through swig."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import pywrap_tensorflow as tf_wrap
+from tensorflow.python.framework import errors
+
+
+def GenerateModelReport(metagraph):
+  """Report what's known statically about each node in the provided metagraph.
+
+  Args:
+    metagraph: A TensorFlow MetaGraphDef.
+
+  Returns:
+    A string containing the report.
+  """
+  with errors.raise_exception_on_not_ok_status():
+    ret_from_swig = tf_wrap.GenerateModelReport(metagraph.SerializeToString())
+
+  return ret_from_swig
diff --git a/tensorflow/python/grappler/model_analyzer_test.py b/tensorflow/python/grappler/model_analyzer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b59d1650f4b5e4c7239c2275213e9a26c3aafafe
--- /dev/null
+++ b/tensorflow/python/grappler/model_analyzer_test.py
@@ -0,0 +1,54 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the cost analyzer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import meta_graph
+from tensorflow.python.framework import ops
+from tensorflow.python.grappler import model_analyzer
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class PyWrapOptimizeGraphTest(test.TestCase):
+
+  def testBasic(self):
+    """Make sure arguments can be passed correctly."""
+    a = constant_op.constant([10, 11], name="a")
+    b = constant_op.constant([10], name="b")
+    c = math_ops.add(a, b, name="c")
+    d = math_ops.add_n([a, c], name="d")
+    train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+    train_op.append(d)
+    mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
+
+    report = model_analyzer.GenerateModelReport(mg)
+
+    # Check the report headers
+    self.assertTrue(b"a [Const]" in report)
+    self.assertTrue(b"a [Const]" in report)
+    self.assertTrue(b"c [Add]" in report)
+    self.assertTrue(b"d [AddN]" in report)
+
+    # Also print the report to make it easier to debug
+    print("{}".format(report))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index c89118d4bf7c45fb4bf8fa67ab31e49307f0a962..896d466c2599fd5513f5c13e55fb3fd99c7fed62 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -47,7 +47,7 @@ tf_py_test(
 
 tf_py_test(
     name = "barrier_ops_test",
-    size = "small",
+    size = "medium",  # NOTE(ebrevdo): This test is NOT small.
     srcs = ["barrier_ops_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -56,6 +56,7 @@ tf_py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    shard_count = 20,
 )
 
 tf_py_test(
@@ -71,6 +72,7 @@ tf_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:string_ops",
     ],
+    tags = ["nomac"],  # b/35468214
 )
 
 tf_py_test(
@@ -727,6 +729,18 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "sparse_slice_op_test",
+    size = "small",
+    srcs = ["sparse_slice_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:sparse_ops",
+    ],
+)
+
 tf_py_test(
     name = "sparse_to_dense_op_py_test",
     size = "small",
@@ -835,7 +849,7 @@ tf_py_test(
     ],
 )
 
-tf_py_test(
+cuda_py_test(
     name = "topk_op_test",
     size = "small",
     srcs = ["topk_op_test.py"],
@@ -900,7 +914,7 @@ tf_py_test(
     ],
 )
 
-tf_py_test(
+cuda_py_test(
     name = "where_op_test",
     size = "small",
     srcs = ["where_op_test.py"],
@@ -1343,7 +1357,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "gather_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["gather_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1657,6 +1671,18 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "compare_and_bitpack_op_test",
+    size = "small",
+    srcs = ["compare_and_bitpack_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+)
+
 cuda_py_test(
     name = "scalar_test",
     size = "small",
@@ -1702,7 +1728,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "shape_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["shape_ops_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1971,7 +1997,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "transpose_op_test",
-    size = "large",
+    size = "medium",
     srcs = ["transpose_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -2050,6 +2076,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    tags = ["noasan"],  # times out b/63680444
 )
 
 cuda_py_test(
@@ -2183,6 +2210,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:spectral_ops",
+        "//tensorflow/python:spectral_ops_test_util",
     ],
     shard_count = 3,
 )
@@ -2479,6 +2507,7 @@ cuda_py_test(
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:partitioned_variables",
         "//tensorflow/python:platform",
diff --git a/tensorflow/python/kernel_tests/argmax_op_test.py b/tensorflow/python/kernel_tests/argmax_op_test.py
index a5352561aa734382757a750f1b71fd0a81741e88..ce0676990221fb441b99043083647f9d65722db8 100644
--- a/tensorflow/python/kernel_tests/argmax_op_test.py
+++ b/tensorflow/python/kernel_tests/argmax_op_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -28,14 +29,16 @@ class ArgMaxTest(test.TestCase):
   def _testArg(self,
                method,
                x,
-               dimension,
+               axis,
                expected_values,
                use_gpu=False,
                expected_err_re=None):
     with self.test_session(use_gpu=use_gpu):
-      ans = method(x, dimension=dimension)
+      ans = method(x, axis=axis)
       if expected_err_re is None:
         tf_ans = ans.eval()
+        # Defaults to int64 output.
+        self.assertEqual(np.int64, tf_ans.dtype)
         self.assertAllEqual(tf_ans, expected_values)
         self.assertShapeEqual(expected_values, ans)
       else:
@@ -45,32 +48,48 @@ class ArgMaxTest(test.TestCase):
   def _testBothArg(self,
                    method,
                    x,
-                   dimension,
+                   axis,
                    expected_values,
                    expected_err_re=None):
-    self._testArg(method, x, dimension, expected_values, True, expected_err_re)
-    self._testArg(method, x, dimension, expected_values, False, expected_err_re)
+    self._testArg(method, x, axis, expected_values, True, expected_err_re)
+    self._testArg(method, x, axis, expected_values, False, expected_err_re)
 
   def _testBasic(self, dtype):
     x = np.asarray(100 * np.random.randn(200), dtype=dtype)
 
-    # Check that argmin and argmax match numpy along the primary
-    # dimension
+    # Check that argmin and argmax match numpy along the primary axis
     self._testBothArg(math_ops.argmax, x, 0, x.argmax())
     self._testBothArg(math_ops.argmin, x, 0, x.argmin())
 
   def _testDim(self, dtype):
     x = np.asarray(100 * np.random.randn(3, 2, 4, 5, 6), dtype=dtype)
 
-    # Check that argmin and argmax match numpy along all dimensions
-    for dim in range(-5, 5):
-      self._testBothArg(math_ops.argmax, x, dim, x.argmax(dim))
-      self._testBothArg(math_ops.argmin, x, dim, x.argmin(dim))
+    # Check that argmin and argmax match numpy along all axes
+    for axis in range(-5, 5):
+      self._testBothArg(math_ops.argmax, x, axis, x.argmax(axis))
+      self._testBothArg(math_ops.argmin, x, axis, x.argmin(axis))
 
   def testFloat(self):
     self._testBasic(np.float32)
     self._testDim(np.float32)
 
+  def testFloatInt32Output(self):
+    x = np.asarray(100 * np.random.randn(200), dtype=np.float32)
+    expected_values = x.argmax()
+    with self.test_session(use_gpu=True):
+      ans = math_ops.argmax(x, axis=0, output_type=dtypes.int32)
+      tf_ans = ans.eval()
+      self.assertEqual(np.int32, tf_ans.dtype)
+      # The values are equal when comparing int32 to int64 because
+      # the values don't have a range that exceeds 32-bit integers.
+      self.assertAllEqual(tf_ans, expected_values)
+    expected_values = x.argmin()
+    with self.test_session(use_gpu=True):
+      ans = math_ops.argmin(x, axis=0, output_type=dtypes.int32)
+      tf_ans = ans.eval()
+      self.assertEqual(np.int32, tf_ans.dtype)
+      self.assertAllEqual(tf_ans, expected_values)
+
   def testDouble(self):
     self._testBasic(np.float64)
     self._testDim(np.float64)
diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py
index b5b17ff80abcfe64187a3459bd84b21a36c55828..155aad8bd9a642aeefabcfe121fb81641a00e31f 100644
--- a/tensorflow/python/kernel_tests/basic_gpu_test.py
+++ b/tensorflow/python/kernel_tests/basic_gpu_test.py
@@ -107,9 +107,12 @@ class MathBuiltinUnaryTest(test.TestCase):
 
   def _testDtype(self, dtype, use_gpu):
     data = (np.arange(-3, 3) / 4.).reshape([1, 3, 2]).astype(dtype)
+    data_gt_1 = data + 2 # for x > 1
     self._compare(data, np.abs, math_ops.abs, use_gpu)
     self._compare(data, np.arccos, math_ops.acos, use_gpu)
     self._compare(data, np.arcsin, math_ops.asin, use_gpu)
+    self._compare(data, np.arcsinh, math_ops.asinh, use_gpu)
+    self._compare(data_gt_1, np.arccosh, math_ops.acosh, use_gpu)
     self._compare(data, np.arctan, math_ops.atan, use_gpu)
     self._compare(data, np.ceil, math_ops.ceil, use_gpu)
     self._compare(data, np.cos, math_ops.cos, use_gpu)
@@ -126,6 +129,7 @@ class MathBuiltinUnaryTest(test.TestCase):
     self._compare(data, np.square, math_ops.square, use_gpu)
     self._compare(data, np.tan, math_ops.tan, use_gpu)
     self._compare(data, np.tanh, math_ops.tanh, use_gpu)
+    self._compare(data, np.arctanh, math_ops.atanh, use_gpu)
 
   def testTypes(self):
     for dtype in [np.float32]:
diff --git a/tensorflow/python/kernel_tests/bitcast_op_test.py b/tensorflow/python/kernel_tests/bitcast_op_test.py
index 077a4b8e2775fb345afb57821d2a68558f7027ba..a535468b058d289d5cc6611ff542d89615793834 100644
--- a/tensorflow/python/kernel_tests/bitcast_op_test.py
+++ b/tensorflow/python/kernel_tests/bitcast_op_test.py
@@ -76,6 +76,12 @@ class BitcastTest(test.TestCase):
     datatype = dtypes.int8
     array_ops.bitcast(x, datatype, None)
 
+  def testQuantizeType(self):
+    shape = [3, 4]
+    x = np.zeros(shape, np.uint16)
+    datatype = dtypes.quint16
+    self._testBitcast(x, datatype, shape)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/cast_op_test.py b/tensorflow/python/kernel_tests/cast_op_test.py
index 17771e0572302df1531040f4992a4e1a082a78b0..c785f2358d5e659c71acf02457e2146616a9e880 100644
--- a/tensorflow/python/kernel_tests/cast_op_test.py
+++ b/tensorflow/python/kernel_tests/cast_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 import sys
+import platform
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -146,9 +147,16 @@ class CastOpTest(test.TestCase):
     if sys.byteorder == "big":  
       self._compare(np.inf, np.int32, i4.max, False)  
       self._compare(np.inf, np.int64, i8.max, False)  
-    else:  
-      self._compare(np.inf, np.int32, i4.min, False)  
-      self._compare(np.inf, np.int64, i8.min, False)  
+    else:
+      # np.float64("np.inf").astype(np.int32) is negative on x86 but positive on ppc64le
+      # Numpy link to relevant discussion - https://github.com/numpy/numpy/issues/9040
+      # Tensorflow link to relevant discussion - https://github.com/tensorflow/tensorflow/issues/9360
+      if platform.machine() == "ppc64le":
+        self._compare(-np.inf, np.int32, i4.min, False)
+        self._compare(-np.inf, np.int64, i8.min, False)
+      else:
+        self._compare(np.inf, np.int32, i4.min, False)
+        self._compare(np.inf, np.int64, i8.min, False)  
     self._compare(-np.inf, np.float32, -np.inf, False)
     self._compare(-np.inf, np.float64, -np.inf, False)
     self._compare(-np.inf, np.int32, i4.min, False)
diff --git a/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py b/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..56ddd6e42826e4055ee163e154489bfa7a92dbfa
--- /dev/null
+++ b/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py
@@ -0,0 +1,83 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.compare_and_bitpack_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class CompareAndBitpackTest(test.TestCase):
+
+  def _testCompareAndBitpack(self,
+                             x, threshold,
+                             truth,
+                             expected_err_re=None):
+    with self.test_session(use_gpu=True):
+      ans = math_ops.compare_and_bitpack(x, threshold)
+      if expected_err_re is None:
+        tf_ans = ans.eval()
+        self.assertShapeEqual(truth, ans)
+        self.assertAllEqual(tf_ans, truth)
+      else:
+        with self.assertRaisesOpError(expected_err_re):
+          ans.eval()
+
+  def _testBasic(self, dtype):
+    rows = 371
+    cols = 294
+    x = np.random.randn(rows, cols * 8)
+    if dtype == np.bool:
+      x = x > 0
+    else:
+      x = x.astype(dtype)
+    threshold = dtype(0)
+    # np.packbits flattens the tensor, so we reshape it back to the
+    # expected dimensions.
+    truth = np.packbits(x > threshold).reshape(rows, cols)
+    self._testCompareAndBitpack(x, threshold, truth)
+
+  def testBasicFloat32(self):
+    self._testBasic(np.float32)
+
+  def testBasicFloat64(self):
+    self._testBasic(np.float64)
+
+  def testBasicFloat16(self):
+    self._testBasic(np.float16)
+
+  def testBasicBool(self):
+    self._testBasic(np.bool)
+
+  def testBasicInt8(self):
+    self._testBasic(np.int8)
+
+  def testBasicInt16(self):
+    self._testBasic(np.int16)
+
+  def testBasicInt32(self):
+    self._testBasic(np.int32)
+
+  def testBasicInt64(self):
+    self._testBasic(np.int64)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/concat_op_test.py b/tensorflow/python/kernel_tests/concat_op_test.py
index 0bb5b551555ae9234afbd79ba69668c1d4f8d1ee..aba4224dc624f9195d245bda35b5591686e66b92 100644
--- a/tensorflow/python/kernel_tests/concat_op_test.py
+++ b/tensorflow/python/kernel_tests/concat_op_test.py
@@ -141,6 +141,7 @@ class ConcatOpTest(test.TestCase):
     self._testRandom(dtypes.float32)
     self._testRandom(dtypes.int16)
     self._testRandom(dtypes.int32)
+    self._testRandom(dtypes.int64)
     self._testRandom(dtypes.bfloat16)
     self._testRandom(dtypes.complex64)
     self._testRandom(dtypes.complex128)
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 40c6a9e614dfd1897732c7b3808cb73a8de9c84d..0cc6745de333f5fb7f8a2f7d3119eca00fae63eb 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -684,6 +684,16 @@ class PlaceholderTest(test.TestCase):
           ValueError, lambda e: "Cannot feed value of shape" in str(e)):
         p_identity.eval(feed_dict={p: feed_array[:5, :2]})
 
+  def testPartialShapeWhenNotFed(self):
+    with self.test_session():
+      p = array_ops.placeholder(dtypes_lib.float32, shape=[None, 3], name="p")
+      p_identity = array_ops.identity(p)
+
+      # Should trigger an operator error, not a shape error.
+      with self.assertRaisesOpError(
+          "must feed a value for placeholder tensor 'p' with dtype float"):
+        p_identity.eval()
+
   def testControlDependency(self):
     with self.test_session():
       p = array_ops.placeholder(dtypes_lib.int32, shape=[], name="p")
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index b47139e6b8b78875b2f545b9d0a662653c9c55de..5a0fb1879aa58092ddb156217981945ae424a19c 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -184,6 +184,7 @@ class UnaryOpTest(test.TestCase):
 
   def testFloatBasic(self):
     x = np.arange(-3, 3).reshape(1, 3, 2).astype(np.float32)
+    w = x - x.min() + 1.01 # all greater than 1
     y = (x + .5).astype(np.float32)  # no zero
     z = (x + 15.5).astype(np.float32)  # all positive
     k = np.arange(-0.90, 0.90, 0.25).astype(np.float32)  # between -1 and 1
@@ -203,6 +204,10 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(x, np.sinh, math_ops.sinh)
     self._compareBoth(x, np.cosh, math_ops.cosh)
     self._compareBoth(x, np.tanh, math_ops.tanh)
+    # b/63457572: failing under CUDA.
+    # self._compareBoth(x, np.arcsinh, math_ops.asinh)
+    # self._compareBoth(w, np.arccosh, math_ops.acosh)
+    # self._compareBoth(k, np.arctanh, math_ops.atanh)
     self._compareBoth(x, self._sigmoid, math_ops.sigmoid)
     self._compareBoth(x, self._log_sigmoid, math_ops.log_sigmoid)
     self._compareBoth(y, np.sign, math_ops.sign)
@@ -248,6 +253,8 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(x, np.log, math_ops.log)
     self._compareBoth(x, np.log1p, math_ops.log1p)
     self._compareBoth(x, np.sinh, math_ops.sinh)
+    # b/63457572.
+    # self._compareBoth(x, np.arcsinh, math_ops.asinh)
     self._compareBoth(x, np.cosh, math_ops.cosh)
     self._compareBoth(x, np.tanh, math_ops.tanh)
     self._compareBoth(x, self._sigmoid, math_ops.sigmoid)
@@ -273,6 +280,7 @@ class UnaryOpTest(test.TestCase):
 
   def testDoubleBasic(self):
     x = np.arange(-3, 3).reshape(1, 3, 2).astype(np.float64)
+    w = x - x.min() + 1.01 # all greater than 1
     y = (x + .5).astype(np.float64)  # no zero
     z = (x + 15.5).astype(np.float64)  # all positive
     k = np.arange(-0.90, 0.90, 0.35).reshape(1, 3, 2).astype(
@@ -292,6 +300,10 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(x, np.sinh, math_ops.sinh)
     self._compareBoth(x, np.cosh, math_ops.cosh)
     self._compareBoth(x, np.tanh, math_ops.tanh)
+    # b/63457572: failing under CUDA.
+    # self._compareBoth(x, np.arcsinh, math_ops.asinh)
+    # self._compareBoth(w, np.arccosh, math_ops.acosh)
+    # self._compareBoth(k, np.arctanh, math_ops.atanh)
     self._compareBoth(x, self._sigmoid, math_ops.sigmoid)
     self._compareBoth(y, np.sign, math_ops.sign)
     self._compareBoth(x, np.sin, math_ops.sin)
@@ -398,6 +410,10 @@ class UnaryOpTest(test.TestCase):
     self._compareCpu(x, np.sinh, math_ops.sinh)
     self._compareCpu(x, np.cosh, math_ops.cosh)
     self._compareCpu(x, np.tanh, math_ops.tanh)
+    # b/63457572: failing under CUDA.
+    # self._compareCpu(x, np.arcsinh, math_ops.asinh)
+    # self._compareCpu(x, np.arccosh, math_ops.acosh)
+    # self._compareCpu(x, np.arctanh, math_ops.atanh)
     self._compareCpu(x, self._sigmoid, math_ops.sigmoid)
     self._compareCpu(x, np.sin, math_ops.sin)
     self._compareCpu(x, np.cos, math_ops.cos)
@@ -434,6 +450,10 @@ class UnaryOpTest(test.TestCase):
     self._compareCpu(x, np.sinh, math_ops.sinh)
     self._compareCpu(x, np.cosh, math_ops.cosh)
     self._compareCpu(x, np.tanh, math_ops.tanh)
+    # b/63457572, failing under CUDA.
+    # self._compareCpu(x, np.arcsinh, math_ops.asinh)
+    # self._compareCpu(x, np.arccosh, math_ops.acosh)
+    # self._compareCpu(x, np.arctanh, math_ops.atanh)
     self._compareCpu(x, self._sigmoid, math_ops.sigmoid)
     self._compareCpu(x, np.sin, math_ops.sin)
     self._compareCpu(x, np.cos, math_ops.cos)
diff --git a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
index 8ba9d0efff7dec5a9747ff4be17d489588da9a12..3298092fbeac34e542dbab7ed204e293a6774229 100644
--- a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
@@ -37,18 +37,21 @@ def ConfigsToTest():
     Tuple (input_size, filter_size, out_size, stride, padding), the depthwise
     convolution parameters.
   """
-  input_sizes = [[4, 5, 5, 48], [4, 8, 8, 84], [4, 17, 17, 48], [4, 35, 35, 2],
-                 [4, 147, 147, 2], [3, 299, 299, 3], [5, 183, 183, 1]]
-  filter_sizes = [[1, 1, 48, 2], [1, 3, 84, 1], [3, 1, 48, 4], [5, 5, 2, 1],
-                  [3, 3, 2, 8], [2, 2, 3, 8], [5, 5, 1, 2]]
-  out_sizes = [[4, 5, 5, 96], [4, 8, 8, 84], [4, 17, 17, 192], [4, 35, 35, 2],
-               [4, 49, 49, 16], [3, 150, 150, 24], [5, 92, 92, 2]]
-  strides = [1, 1, 1, 1, 3, 2, 2]
+  input_sizes = [[4, 5, 5, 48], [4, 8, 8, 84], [4, 17, 17, 48], [4, 9, 27, 8],
+                 [4, 31, 31, 7], [4, 35, 35, 2], [4, 147, 147, 2],
+                 [3, 299, 299, 3], [5, 183, 183, 1]]
+  filter_sizes = [[1, 1, 48, 2], [1, 3, 84, 1], [3, 1, 48, 4], [3, 3, 8, 1],
+                  [3, 3, 7, 1], [5, 5, 2, 1], [3, 3, 2, 8], [2, 2, 3,
+                                                             8], [5, 5, 1, 2]]
+  out_sizes = [[4, 5, 5, 96], [4, 8, 8, 84], [4, 17, 17, 192], [4, 9, 27, 8],
+               [4, 31, 31, 7], [4, 35, 35, 2], [4, 49, 49, 16],
+               [3, 150, 150, 24], [5, 92, 92, 2]]
+  strides = [1, 1, 1, 1, 1, 1, 3, 2, 2]
   # pylint: disable=invalid-name
   VALID = "VALID"
   SAME = "SAME"
   # pylint: enable=invalid-name
-  paddings = [SAME, SAME, SAME, SAME, VALID, SAME, SAME, SAME]
+  paddings = [SAME, SAME, SAME, SAME, SAME, SAME, VALID, SAME, SAME, SAME]
   for i, f, o, s, p in zip(input_sizes, filter_sizes, out_sizes, strides,
                            paddings):
     yield i, f, o, s, p
diff --git a/tensorflow/python/kernel_tests/determinant_op_test.py b/tensorflow/python/kernel_tests/determinant_op_test.py
index 2d05ab6139015f52c10d99439d82fe44b1520f60..089ec0de7957d42aa91c745399eff099fcc85076 100644
--- a/tensorflow/python/kernel_tests/determinant_op_test.py
+++ b/tensorflow/python/kernel_tests/determinant_op_test.py
@@ -66,6 +66,41 @@ class DeterminantOpTest(test.TestCase):
     # A multidimensional batch of 2x2 matrices
     self._compareDeterminant(np.random.rand(3, 4, 5, 2, 2).astype(np.float64))
 
+  def testBasicComplex64(self):
+    # 2x2 matrices
+    self._compareDeterminant(
+        np.array([[2., 3.], [3., 4.]]).astype(np.complex64))
+    self._compareDeterminant(
+        np.array([[0., 0.], [0., 0.]]).astype(np.complex64))
+    self._compareDeterminant(
+        np.array([[1. + 1.j, 1. - 1.j], [-1. + 1.j, -1. - 1.j]]).astype(
+            np.complex64))
+    # 5x5 matrices (Eigen forces LU decomposition)
+    self._compareDeterminant(
+        np.array([[2., 3., 4., 5., 6.], [3., 4., 9., 2., 0.], [
+            2., 5., 8., 3., 8.
+        ], [1., 6., 7., 4., 7.], [2., 3., 4., 5., 6.]]).astype(np.complex64))
+    # A multidimensional batch of 2x2 matrices
+    self._compareDeterminant(np.random.rand(3, 4, 5, 2, 2).astype(np.complex64))
+
+  def testBasicComplex128(self):
+    # 2x2 matrices
+    self._compareDeterminant(
+        np.array([[2., 3.], [3., 4.]]).astype(np.complex128))
+    self._compareDeterminant(
+        np.array([[0., 0.], [0., 0.]]).astype(np.complex128))
+    self._compareDeterminant(
+        np.array([[1. + 1.j, 1. - 1.j], [-1. + 1.j, -1. - 1.j]]).astype(
+            np.complex128))
+    # 5x5 matrices (Eigen forces LU decomposition)
+    self._compareDeterminant(
+        np.array([[2., 3., 4., 5., 6.], [3., 4., 9., 2., 0.], [
+            2., 5., 8., 3., 8.
+        ], [1., 6., 7., 4., 7.], [2., 3., 4., 5., 6.]]).astype(np.complex128))
+    # A multidimensional batch of 2x2 matrices
+    self._compareDeterminant(
+        np.random.rand(3, 4, 5, 2, 2).astype(np.complex128))
+
   def testOverflow(self):
     max_double = np.finfo("d").max
     huge_matrix = np.array([[max_double, 0.0], [0.0, max_double]])
diff --git a/tensorflow/python/kernel_tests/distributions/BUILD b/tensorflow/python/kernel_tests/distributions/BUILD
index 50a079520048f8b2fc1ae0769b26507bb452d8b1..f124065e0d615513949616b7d2239a5c0815b4c0 100644
--- a/tensorflow/python/kernel_tests/distributions/BUILD
+++ b/tensorflow/python/kernel_tests/distributions/BUILD
@@ -29,6 +29,24 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "util_test",
+    size = "small",
+    srcs = ["util_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "kullback_leibler_test",
     size = "small",
diff --git a/tensorflow/python/kernel_tests/distributions/bernoulli_test.py b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
index ef93c4dab088c1e8bcb8ba1673d964eabb79835d..a269d722737866fa5e6ae9feee919be0db71bcf1 100644
--- a/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
+++ b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
@@ -172,7 +172,7 @@ class BernoulliTest(test.TestCase):
       dist = bernoulli.Bernoulli(probs=p, validate_args=True)
       with self.assertRaisesOpError("must be non-negative."):
         dist.prob([1, 1, -1]).eval()
-      with self.assertRaisesOpError("is not less than or equal to 1."):
+      with self.assertRaisesOpError("Elements cannot exceed 1."):
         dist.prob([2, 0, 1]).eval()
 
   def testPmfWithP(self):
diff --git a/tensorflow/python/kernel_tests/distributions/categorical_test.py b/tensorflow/python/kernel_tests/distributions/categorical_test.py
index 33db933e82a3fdc794c34aa3a93de82fdd89e3be..019c1bc353a9891da6967a7ce9114b58226a980a 100644
--- a/tensorflow/python/kernel_tests/distributions/categorical_test.py
+++ b/tensorflow/python/kernel_tests/distributions/categorical_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import categorical
 from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import normal
 from tensorflow.python.platform import test
 
 
@@ -183,11 +184,105 @@ class CategoricalTest(test.TestCase):
     with self.test_session():
       self.assertAlmostEqual(cdf_op.eval(), expected_cdf)
 
+  def testCDFBroadcasting(self):
+    # shape: [batch=2, n_bins=3]
+    histograms = [[0.2, 0.1, 0.7],
+                  [0.3, 0.45, 0.25]]
+
+    # shape: [batch=3, batch=2]
+    devent = [
+        [0, 0],
+        [1, 1],
+        [2, 2]
+    ]
+    dist = categorical.Categorical(probs=histograms)
+
+    # We test that the probabilities are correctly broadcasted over the
+    # additional leading batch dimension of size 3.
+    expected_cdf_result = np.zeros((3, 2))
+    expected_cdf_result[0, 0] = 0
+    expected_cdf_result[0, 1] = 0
+    expected_cdf_result[1, 0] = 0.2
+    expected_cdf_result[1, 1] = 0.3
+    expected_cdf_result[2, 0] = 0.3
+    expected_cdf_result[2, 1] = 0.75
+
+    with self.test_session():
+      self.assertAllClose(dist.cdf(devent).eval(), expected_cdf_result)
+
+  def testBroadcastWithBatchParamsAndBiggerEvent(self):
+    ## The parameters have a single batch dimension, and the event has two.
+
+    # param shape is [3 x 4], where 4 is the number of bins (non-batch dim).
+    cat_params_py = [
+        [0.2, 0.15, 0.35, 0.3],
+        [0.1, 0.05, 0.68, 0.17],
+        [0.1, 0.05, 0.68, 0.17]
+    ]
+
+    # event shape = [5, 3], both are "batch" dimensions.
+    disc_event_py = [
+        [0, 1, 2],
+        [1, 2, 3],
+        [0, 0, 0],
+        [1, 1, 1],
+        [2, 1, 0]
+    ]
+
+    # shape is [3]
+    normal_params_py = [
+        -10.0,
+        120.0,
+        50.0
+    ]
+
+    # shape is [5, 3]
+    real_event_py = [
+        [-1.0, 0.0, 1.0],
+        [100.0, 101, -50],
+        [90, 90, 90],
+        [-4, -400, 20.0],
+        [0.0, 0.0, 0.0]
+    ]
+
+    cat_params_tf = array_ops.constant(cat_params_py)
+    disc_event_tf = array_ops.constant(disc_event_py)
+    cat = categorical.Categorical(probs=cat_params_tf)
+
+    normal_params_tf = array_ops.constant(normal_params_py)
+    real_event_tf = array_ops.constant(real_event_py)
+    norm = normal.Normal(loc=normal_params_tf, scale=1.0)
+
+    # Check that normal and categorical have the same broadcasting behaviour.
+    to_run = {
+        "cat_prob": cat.prob(disc_event_tf),
+        "cat_log_prob": cat.log_prob(disc_event_tf),
+        "cat_cdf": cat.cdf(disc_event_tf),
+        "cat_log_cdf": cat.log_cdf(disc_event_tf),
+        "norm_prob": norm.prob(real_event_tf),
+        "norm_log_prob": norm.log_prob(real_event_tf),
+        "norm_cdf": norm.cdf(real_event_tf),
+        "norm_log_cdf": norm.log_cdf(real_event_tf),
+    }
+
+    with self.test_session() as sess:
+      run_result = sess.run(to_run)
+
+    self.assertAllEqual(run_result["cat_prob"].shape,
+                        run_result["norm_prob"].shape)
+    self.assertAllEqual(run_result["cat_log_prob"].shape,
+                        run_result["norm_log_prob"].shape)
+    self.assertAllEqual(run_result["cat_cdf"].shape,
+                        run_result["norm_cdf"].shape)
+    self.assertAllEqual(run_result["cat_log_cdf"].shape,
+                        run_result["norm_log_cdf"].shape)
+
   def testLogPMF(self):
     logits = np.log([[0.2, 0.8], [0.6, 0.4]]) - 50.
     dist = categorical.Categorical(logits)
     with self.test_session():
       self.assertAllClose(dist.log_prob([0, 1]).eval(), np.log([0.2, 0.4]))
+      self.assertAllClose(dist.log_prob([0.0, 1.0]).eval(), np.log([0.2, 0.4]))
 
   def testEntropyNoBatch(self):
     logits = np.log([0.2, 0.8]) - 50.
@@ -263,6 +358,7 @@ class CategoricalTest(test.TestCase):
 
   def testLogPMFBroadcasting(self):
     with self.test_session():
+      # 1 x 2 x 2
       histograms = [[[0.2, 0.8], [0.4, 0.6]]]
       dist = categorical.Categorical(math_ops.log(histograms) - 50.)
 
diff --git a/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py b/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
index 2f8f85866dfd3d40894081fdea8a3d23fe14ca9c..7922fb0606c6f4b475b25da716d5f9a169e213b5 100644
--- a/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
+++ b/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
@@ -74,10 +74,10 @@ class DirichletMultinomialTest(test.TestCase):
       dist = ds.DirichletMultinomial(n, alpha, validate_args=True)
       dist.prob([2., 3, 0]).eval()
       dist.prob([3., 0, 2]).eval()
-      with self.assertRaisesOpError("counts must be non-negative"):
+      with self.assertRaisesOpError("must be non-negative"):
         dist.prob([-1., 4, 2]).eval()
       with self.assertRaisesOpError(
-          "counts last-dimension must sum to `self.total_count`"):
+          "last-dimension must sum to `self.total_count`"):
         dist.prob([3., 3, 0]).eval()
 
   def testPmfNonIntegerCounts(self):
@@ -91,7 +91,7 @@ class DirichletMultinomialTest(test.TestCase):
       # Both equality and integer checking fail.
       placeholder = array_ops.placeholder(dtypes.float32)
       with self.assertRaisesOpError(
-          "counts cannot contain fractional components"):
+          "cannot contain fractional components"):
         dist.prob(placeholder).eval(feed_dict={placeholder: [1.0, 2.5, 1.5]})
       dist = ds.DirichletMultinomial(n, alpha, validate_args=False)
       dist.prob([1., 2., 3.]).eval()
diff --git a/tensorflow/python/kernel_tests/distributions/util_test.py b/tensorflow/python/kernel_tests/distributions/util_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..07b1ba71bb5a3420618facc173132e8cd5d88a7b
--- /dev/null
+++ b/tensorflow/python/kernel_tests/distributions/util_test.py
@@ -0,0 +1,719 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for utility functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import importlib
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import util as distribution_util
+import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+special = try_import("scipy.special")
+
+
+def _logit(x):
+  x = np.asarray(x)
+  return np.log(x) - np.log1p(-x)
+
+
+class AssertCloseTest(test.TestCase):
+
+  def testAssertCloseIntegerDtype(self):
+    x = array_ops.placeholder(dtypes.int32)
+    y = x
+    z = array_ops.placeholder(dtypes.int32)
+    feed_dict = {x: [1, 5, 10, 15, 20], z: [2, 5, 10, 15, 20]}
+    with self.test_session():
+      with ops.control_dependencies([distribution_util.assert_close(x, y)]):
+        array_ops.identity(x).eval(feed_dict=feed_dict)
+
+      with ops.control_dependencies([distribution_util.assert_close(y, x)]):
+        array_ops.identity(x).eval(feed_dict=feed_dict)
+
+      with self.assertRaisesOpError("Condition x ~= y"):
+        with ops.control_dependencies([distribution_util.assert_close(x, z)]):
+          array_ops.identity(x).eval(feed_dict=feed_dict)
+
+      with self.assertRaisesOpError("Condition x ~= y"):
+        with ops.control_dependencies([distribution_util.assert_close(y, z)]):
+          array_ops.identity(y).eval(feed_dict=feed_dict)
+
+  def testAssertCloseNonIntegerDtype(self):
+    x = array_ops.placeholder(dtypes.float32)
+    y = x + 1e-8
+    z = array_ops.placeholder(dtypes.float32)
+    feed_dict = {x: [1., 5, 10, 15, 20], z: [2., 5, 10, 15, 20]}
+    with self.test_session():
+      with ops.control_dependencies([distribution_util.assert_close(x, y)]):
+        array_ops.identity(x).eval(feed_dict=feed_dict)
+
+      with ops.control_dependencies([distribution_util.assert_close(y, x)]):
+        array_ops.identity(x).eval(feed_dict=feed_dict)
+
+      with self.assertRaisesOpError("Condition x ~= y"):
+        with ops.control_dependencies([distribution_util.assert_close(x, z)]):
+          array_ops.identity(x).eval(feed_dict=feed_dict)
+
+      with self.assertRaisesOpError("Condition x ~= y"):
+        with ops.control_dependencies([distribution_util.assert_close(y, z)]):
+          array_ops.identity(y).eval(feed_dict=feed_dict)
+
+  def testAssertCloseEpsilon(self):
+    x = [0., 5, 10, 15, 20]
+    # x != y
+    y = [0.1, 5, 10, 15, 20]
+    # x = z
+    z = [1e-8, 5, 10, 15, 20]
+    with self.test_session():
+      with ops.control_dependencies([distribution_util.assert_close(x, z)]):
+        array_ops.identity(x).eval()
+
+      with self.assertRaisesOpError("Condition x ~= y"):
+        with ops.control_dependencies([distribution_util.assert_close(x, y)]):
+          array_ops.identity(x).eval()
+
+      with self.assertRaisesOpError("Condition x ~= y"):
+        with ops.control_dependencies([distribution_util.assert_close(y, z)]):
+          array_ops.identity(y).eval()
+
+  def testAssertIntegerForm(self):
+    # This should only be detected as an integer.
+    x = array_ops.placeholder(dtypes.float32)
+    y = array_ops.placeholder(dtypes.float32)
+    # First component isn't less than float32.eps = 1e-7
+    z = array_ops.placeholder(dtypes.float32)
+    # This shouldn"t be detected as an integer.
+    w = array_ops.placeholder(dtypes.float32)
+    feed_dict = {x: [1., 5, 10, 15, 20], y: [1.1, 5, 10, 15, 20],
+                 z: [1.0001, 5, 10, 15, 20], w: [1e-8, 5, 10, 15, 20]}
+    with self.test_session():
+      with ops.control_dependencies([distribution_util.assert_integer_form(x)]):
+        array_ops.identity(x).eval(feed_dict=feed_dict)
+
+      with self.assertRaisesOpError("has non-integer components"):
+        with ops.control_dependencies(
+            [distribution_util.assert_integer_form(y)]):
+          array_ops.identity(y).eval(feed_dict=feed_dict)
+
+      with self.assertRaisesOpError("has non-integer components"):
+        with ops.control_dependencies(
+            [distribution_util.assert_integer_form(z)]):
+          array_ops.identity(z).eval(feed_dict=feed_dict)
+
+      with self.assertRaisesOpError("has non-integer components"):
+        with ops.control_dependencies(
+            [distribution_util.assert_integer_form(w)]):
+          array_ops.identity(w).eval(feed_dict=feed_dict)
+
+
+class GetLogitsAndProbsTest(test.TestCase):
+
+  def testImproperArguments(self):
+    with self.test_session():
+      with self.assertRaises(ValueError):
+        distribution_util.get_logits_and_probs(logits=None, probs=None)
+
+      with self.assertRaises(ValueError):
+        distribution_util.get_logits_and_probs(logits=[0.1], probs=[0.1])
+
+  def testLogits(self):
+    p = np.array([0.01, 0.2, 0.5, 0.7, .99], dtype=np.float32)
+    logits = _logit(p)
+
+    with self.test_session():
+      new_logits, new_p = distribution_util.get_logits_and_probs(
+          logits=logits, validate_args=True)
+
+      self.assertAllClose(p, new_p.eval(), rtol=1e-5, atol=0.)
+      self.assertAllClose(logits, new_logits.eval(), rtol=1e-5, atol=0.)
+
+  def testLogitsMultidimensional(self):
+    p = np.array([0.2, 0.3, 0.5], dtype=np.float32)
+    logits = np.log(p)
+
+    with self.test_session():
+      new_logits, new_p = distribution_util.get_logits_and_probs(
+          logits=logits, multidimensional=True, validate_args=True)
+
+      self.assertAllClose(new_p.eval(), p)
+      self.assertAllClose(new_logits.eval(), logits)
+
+  def testProbability(self):
+    p = np.array([0.01, 0.2, 0.5, 0.7, .99], dtype=np.float32)
+
+    with self.test_session():
+      new_logits, new_p = distribution_util.get_logits_and_probs(
+          probs=p, validate_args=True)
+
+      self.assertAllClose(_logit(p), new_logits.eval())
+      self.assertAllClose(p, new_p.eval())
+
+  def testProbabilityMultidimensional(self):
+    p = np.array([[0.3, 0.4, 0.3], [0.1, 0.5, 0.4]], dtype=np.float32)
+
+    with self.test_session():
+      new_logits, new_p = distribution_util.get_logits_and_probs(
+          probs=p, multidimensional=True, validate_args=True)
+
+      self.assertAllClose(np.log(p), new_logits.eval())
+      self.assertAllClose(p, new_p.eval())
+
+  def testProbabilityValidateArgs(self):
+    p = [0.01, 0.2, 0.5, 0.7, .99]
+    # Component less than 0.
+    p2 = [-1, 0.2, 0.5, 0.3, .2]
+    # Component greater than 1.
+    p3 = [2, 0.2, 0.5, 0.3, .2]
+
+    with self.test_session():
+      _, prob = distribution_util.get_logits_and_probs(
+          probs=p, validate_args=True)
+      prob.eval()
+
+      with self.assertRaisesOpError("Condition x >= 0"):
+        _, prob = distribution_util.get_logits_and_probs(
+            probs=p2, validate_args=True)
+        prob.eval()
+
+      _, prob = distribution_util.get_logits_and_probs(
+          probs=p2, validate_args=False)
+      prob.eval()
+
+      with self.assertRaisesOpError("probs has components greater than 1"):
+        _, prob = distribution_util.get_logits_and_probs(
+            probs=p3, validate_args=True)
+        prob.eval()
+
+      _, prob = distribution_util.get_logits_and_probs(
+          probs=p3, validate_args=False)
+      prob.eval()
+
+  def testProbabilityValidateArgsMultidimensional(self):
+    p = np.array([[0.3, 0.4, 0.3], [0.1, 0.5, 0.4]], dtype=np.float32)
+    # Component less than 0. Still sums to 1.
+    p2 = np.array([[-.3, 0.4, 0.9], [0.1, 0.5, 0.4]], dtype=np.float32)
+    # Component greater than 1. Does not sum to 1.
+    p3 = np.array([[1.3, 0.0, 0.0], [0.1, 0.5, 0.4]], dtype=np.float32)
+    # Does not sum to 1.
+    p4 = np.array([[1.1, 0.3, 0.4], [0.1, 0.5, 0.4]], dtype=np.float32)
+
+    with self.test_session():
+      _, prob = distribution_util.get_logits_and_probs(
+          probs=p, multidimensional=True)
+      prob.eval()
+
+      with self.assertRaisesOpError("Condition x >= 0"):
+        _, prob = distribution_util.get_logits_and_probs(
+            probs=p2, multidimensional=True, validate_args=True)
+        prob.eval()
+
+      _, prob = distribution_util.get_logits_and_probs(
+          probs=p2, multidimensional=True, validate_args=False)
+      prob.eval()
+
+      with self.assertRaisesOpError(
+          "(probs has components greater than 1|probs does not sum to 1)"):
+        _, prob = distribution_util.get_logits_and_probs(
+            probs=p3, multidimensional=True, validate_args=True)
+        prob.eval()
+
+      _, prob = distribution_util.get_logits_and_probs(
+          probs=p3, multidimensional=True, validate_args=False)
+      prob.eval()
+
+      with self.assertRaisesOpError("probs does not sum to 1"):
+        _, prob = distribution_util.get_logits_and_probs(
+            probs=p4, multidimensional=True, validate_args=True)
+        prob.eval()
+
+      _, prob = distribution_util.get_logits_and_probs(
+          probs=p4, multidimensional=True, validate_args=False)
+      prob.eval()
+
+  def testProbsMultidimShape(self):
+    with self.test_session():
+      with self.assertRaises(ValueError):
+        p = array_ops.ones([int(2**11+1)], dtype=np.float16)
+        distribution_util.get_logits_and_probs(
+            probs=p, multidimensional=True, validate_args=True)
+
+      with self.assertRaisesOpError(
+          "Number of classes exceeds `dtype` precision"):
+        p = array_ops.placeholder(dtype=dtypes.float16)
+        _, prob = distribution_util.get_logits_and_probs(
+            probs=p, multidimensional=True, validate_args=True)
+        prob.eval(feed_dict={p: np.ones([int(2**11+1)])})
+
+  def testLogitsMultidimShape(self):
+    with self.test_session():
+      with self.assertRaises(ValueError):
+        l = array_ops.ones([int(2**11+1)], dtype=np.float16)
+        distribution_util.get_logits_and_probs(
+            logits=l, multidimensional=True, validate_args=True)
+
+      with self.assertRaisesOpError(
+          "Number of classes exceeds `dtype` precision"):
+        l = array_ops.placeholder(dtype=dtypes.float16)
+        logit, _ = distribution_util.get_logits_and_probs(
+            logits=l, multidimensional=True, validate_args=True)
+        logit.eval(feed_dict={l: np.ones([int(2**11+1)])})
+
+
+class EmbedCheckCategoricalEventShapeTest(test.TestCase):
+
+  def testTooSmall(self):
+    with self.test_session():
+      with self.assertRaises(ValueError):
+        param = array_ops.ones([1], dtype=np.float16)
+        checked_param = distribution_util.embed_check_categorical_event_shape(
+            param)
+
+      with self.assertRaisesOpError(
+          "must have at least 2 events"):
+        param = array_ops.placeholder(dtype=dtypes.float16)
+        checked_param = distribution_util.embed_check_categorical_event_shape(
+            param)
+        checked_param.eval(feed_dict={param: np.ones([1])})
+
+  def testTooLarge(self):
+    with self.test_session():
+      with self.assertRaises(ValueError):
+        param = array_ops.ones([int(2**11+1)], dtype=dtypes.float16)
+        checked_param = distribution_util.embed_check_categorical_event_shape(
+            param)
+
+      with self.assertRaisesOpError(
+          "Number of classes exceeds `dtype` precision"):
+        param = array_ops.placeholder(dtype=dtypes.float16)
+        checked_param = distribution_util.embed_check_categorical_event_shape(
+            param)
+        checked_param.eval(feed_dict={param: np.ones([int(2**11+1)])})
+
+  def testUnsupportedDtype(self):
+    with self.test_session():
+      with self.assertRaises(TypeError):
+        param = array_ops.ones([int(2**11+1)], dtype=dtypes.qint16)
+        distribution_util.embed_check_categorical_event_shape(param)
+
+
+class EmbedCheckIntegerCastingClosedTest(test.TestCase):
+
+  def testCorrectlyAssertsNonnegative(self):
+    with self.test_session():
+      with self.assertRaisesOpError("Elements must be non-negative"):
+        x = array_ops.placeholder(dtype=dtypes.float16)
+        x_checked = distribution_util.embed_check_integer_casting_closed(
+            x, target_dtype=dtypes.int16)
+        x_checked.eval(feed_dict={x: np.array([1, -1], dtype=np.float16)})
+
+  def testCorrectlyAssersIntegerForm(self):
+    with self.test_session():
+      with self.assertRaisesOpError("Elements must be int16-equivalent."):
+        x = array_ops.placeholder(dtype=dtypes.float16)
+        x_checked = distribution_util.embed_check_integer_casting_closed(
+            x, target_dtype=dtypes.int16)
+        x_checked.eval(feed_dict={x: np.array([1, 1.5], dtype=np.float16)})
+
+  def testCorrectlyAssertsLargestPossibleInteger(self):
+    with self.test_session():
+      with self.assertRaisesOpError("Elements cannot exceed 32767."):
+        x = array_ops.placeholder(dtype=dtypes.int32)
+        x_checked = distribution_util.embed_check_integer_casting_closed(
+            x, target_dtype=dtypes.int16)
+        x_checked.eval(feed_dict={x: np.array([1, 2**15], dtype=np.int32)})
+
+  def testCorrectlyAssertsSmallestPossibleInteger(self):
+    with self.test_session():
+      with self.assertRaisesOpError("Elements cannot be smaller than 0."):
+        x = array_ops.placeholder(dtype=dtypes.int32)
+        x_checked = distribution_util.embed_check_integer_casting_closed(
+            x, target_dtype=dtypes.uint16, assert_nonnegative=False)
+        x_checked.eval(feed_dict={x: np.array([1, -1], dtype=np.int32)})
+
+
+class LogCombinationsTest(test.TestCase):
+
+  def testLogCombinationsBinomial(self):
+    n = [2, 5, 12, 15]
+    k = [1, 2, 4, 11]
+
+    if not special:
+      return
+
+    log_combs = np.log(special.binom(n, k))
+
+    with self.test_session():
+      n = np.array(n, dtype=np.float32)
+      counts = [[1., 1], [2., 3], [4., 8], [11, 4]]
+      log_binom = distribution_util.log_combinations(n, counts)
+      self.assertEqual([4], log_binom.get_shape())
+      self.assertAllClose(log_combs, log_binom.eval())
+
+  def testLogCombinationsShape(self):
+    # Shape [2, 2]
+    n = [[2, 5], [12, 15]]
+
+    with self.test_session():
+      n = np.array(n, dtype=np.float32)
+      # Shape [2, 2, 4]
+      counts = [[[1., 1, 0, 0], [2., 2, 1, 0]], [[4., 4, 1, 3], [10, 1, 1, 4]]]
+      log_binom = distribution_util.log_combinations(n, counts)
+      self.assertEqual([2, 2], log_binom.get_shape())
+
+
+class DynamicShapeTest(test.TestCase):
+
+  def testSameDynamicShape(self):
+    with self.test_session():
+      scalar = constant_op.constant(2.0)
+      scalar1 = array_ops.placeholder(dtype=dtypes.float32)
+
+      vector = [0.3, 0.4, 0.5]
+      vector1 = array_ops.placeholder(dtype=dtypes.float32, shape=[None])
+      vector2 = array_ops.placeholder(dtype=dtypes.float32, shape=[None])
+
+      multidimensional = [[0.3, 0.4], [0.2, 0.6]]
+      multidimensional1 = array_ops.placeholder(
+          dtype=dtypes.float32, shape=[None, None])
+      multidimensional2 = array_ops.placeholder(
+          dtype=dtypes.float32, shape=[None, None])
+
+      # Scalar
+      self.assertTrue(
+          distribution_util.same_dynamic_shape(scalar, scalar1).eval({
+              scalar1: 2.0
+          }))
+
+      # Vector
+
+      self.assertTrue(
+          distribution_util.same_dynamic_shape(vector, vector1).eval({
+              vector1: [2.0, 3.0, 4.0]
+          }))
+      self.assertTrue(
+          distribution_util.same_dynamic_shape(vector1, vector2).eval({
+              vector1: [2.0, 3.0, 4.0],
+              vector2: [2.0, 3.5, 6.0]
+          }))
+
+      # Multidimensional
+      self.assertTrue(
+          distribution_util.same_dynamic_shape(
+              multidimensional, multidimensional1).eval({
+                  multidimensional1: [[2.0, 3.0], [3.0, 4.0]]
+              }))
+      self.assertTrue(
+          distribution_util.same_dynamic_shape(
+              multidimensional1, multidimensional2).eval({
+                  multidimensional1: [[2.0, 3.0], [3.0, 4.0]],
+                  multidimensional2: [[1.0, 3.5], [6.3, 2.3]]
+              }))
+
+      # Scalar, X
+      self.assertFalse(
+          distribution_util.same_dynamic_shape(scalar, vector1).eval({
+              vector1: [2.0, 3.0, 4.0]
+          }))
+      self.assertFalse(
+          distribution_util.same_dynamic_shape(scalar1, vector1).eval({
+              scalar1: 2.0,
+              vector1: [2.0, 3.0, 4.0]
+          }))
+      self.assertFalse(
+          distribution_util.same_dynamic_shape(scalar, multidimensional1).eval({
+              multidimensional1: [[2.0, 3.0], [3.0, 4.0]]
+          }))
+      self.assertFalse(
+          distribution_util.same_dynamic_shape(scalar1, multidimensional1).eval(
+              {
+                  scalar1: 2.0,
+                  multidimensional1: [[2.0, 3.0], [3.0, 4.0]]
+              }))
+
+      # Vector, X
+      self.assertFalse(
+          distribution_util.same_dynamic_shape(vector, vector1).eval({
+              vector1: [2.0, 3.0]
+          }))
+      self.assertFalse(
+          distribution_util.same_dynamic_shape(vector1, vector2).eval({
+              vector1: [2.0, 3.0, 4.0],
+              vector2: [6.0]
+          }))
+      self.assertFalse(
+          distribution_util.same_dynamic_shape(vector, multidimensional1).eval({
+              multidimensional1: [[2.0, 3.0], [3.0, 4.0]]
+          }))
+      self.assertFalse(
+          distribution_util.same_dynamic_shape(vector1, multidimensional1).eval(
+              {
+                  vector1: [2.0, 3.0, 4.0],
+                  multidimensional1: [[2.0, 3.0], [3.0, 4.0]]
+              }))
+
+      # Multidimensional, X
+      self.assertFalse(
+          distribution_util.same_dynamic_shape(
+              multidimensional, multidimensional1).eval({
+                  multidimensional1: [[1.0, 3.5, 5.0], [6.3, 2.3, 7.1]]
+              }))
+      self.assertFalse(
+          distribution_util.same_dynamic_shape(
+              multidimensional1, multidimensional2).eval({
+                  multidimensional1: [[2.0, 3.0], [3.0, 4.0]],
+                  multidimensional2: [[1.0, 3.5, 5.0], [6.3, 2.3, 7.1]]
+              }))
+
+
+class RotateTransposeTest(test.TestCase):
+
+  def _np_rotate_transpose(self, x, shift):
+    if not isinstance(x, np.ndarray):
+      x = np.array(x)
+    return np.transpose(x, np.roll(np.arange(len(x.shape)), shift))
+
+  def testRollStatic(self):
+    with self.test_session():
+      with self.assertRaisesRegexp(ValueError, "None values not supported."):
+        distribution_util.rotate_transpose(None, 1)
+      for x in (np.ones(1), np.ones((2, 1)), np.ones((3, 2, 1))):
+        for shift in np.arange(-5, 5):
+          y = distribution_util.rotate_transpose(x, shift)
+          self.assertAllEqual(self._np_rotate_transpose(x, shift), y.eval())
+          self.assertAllEqual(np.roll(x.shape, shift), y.get_shape().as_list())
+
+  def testRollDynamic(self):
+    with self.test_session() as sess:
+      x = array_ops.placeholder(dtypes.float32)
+      shift = array_ops.placeholder(dtypes.int32)
+      for x_value in (np.ones(
+          1, dtype=x.dtype.as_numpy_dtype()), np.ones(
+              (2, 1), dtype=x.dtype.as_numpy_dtype()), np.ones(
+                  (3, 2, 1), dtype=x.dtype.as_numpy_dtype())):
+        for shift_value in np.arange(-5, 5):
+          self.assertAllEqual(
+              self._np_rotate_transpose(x_value, shift_value),
+              sess.run(distribution_util.rotate_transpose(x, shift),
+                       feed_dict={x: x_value,
+                                  shift: shift_value}))
+
+
+class PickVectorTest(test.TestCase):
+
+  def testCorrectlyPicksVector(self):
+    with self.test_session():
+      x = np.arange(10, 12)
+      y = np.arange(15, 18)
+      self.assertAllEqual(x,
+                          distribution_util.pick_vector(
+                              math_ops.less(0, 5), x, y).eval())
+      self.assertAllEqual(y,
+                          distribution_util.pick_vector(
+                              math_ops.less(5, 0), x, y).eval())
+      self.assertAllEqual(x,
+                          distribution_util.pick_vector(
+                              constant_op.constant(True), x, y))  # No eval.
+      self.assertAllEqual(y,
+                          distribution_util.pick_vector(
+                              constant_op.constant(False), x, y))  # No eval.
+
+
+class FillLowerTriangularTest(test.TestCase):
+
+  def setUp(self):
+    self._rng = np.random.RandomState(42)
+
+  def _fill_lower_triangular(self, x):
+    """Numpy implementation of `fill_lower_triangular`."""
+    x = np.asarray(x)
+    d = x.shape[-1]
+    # d = n(n+1)/2 implies n is:
+    n = int(0.5 * (np.sqrt(1. + 8. * d) - 1.))
+    ids = np.tril_indices(n)
+    y = np.zeros(list(x.shape[:-1]) + [n, n], dtype=x.dtype)
+    y[..., ids[0], ids[1]] = x
+    return y
+
+  def testCorrectlyMakes1x1LowerTril(self):
+    with self.test_session():
+      x = ops.convert_to_tensor(self._rng.randn(3, 1))
+      expected = self._fill_lower_triangular(tensor_util.constant_value(x))
+      actual = distribution_util.fill_lower_triangular(x, validate_args=True)
+      self.assertAllEqual(expected.shape, actual.get_shape())
+      self.assertAllEqual(expected, actual.eval())
+
+  def testCorrectlyMakesNoBatchLowerTril(self):
+    with self.test_session():
+      x = ops.convert_to_tensor(self._rng.randn(10))
+      expected = self._fill_lower_triangular(tensor_util.constant_value(x))
+      actual = distribution_util.fill_lower_triangular(x, validate_args=True)
+      self.assertAllEqual(expected.shape, actual.get_shape())
+      self.assertAllEqual(expected, actual.eval())
+      g = gradients_impl.gradients(
+          distribution_util.fill_lower_triangular(x), x)
+      self.assertAllEqual(np.tri(4).reshape(-1), g[0].values.eval())
+
+  def testCorrectlyMakesBatchLowerTril(self):
+    with self.test_session():
+      x = ops.convert_to_tensor(self._rng.randn(2, 2, 6))
+      expected = self._fill_lower_triangular(tensor_util.constant_value(x))
+      actual = distribution_util.fill_lower_triangular(x, validate_args=True)
+      self.assertAllEqual(expected.shape, actual.get_shape())
+      self.assertAllEqual(expected, actual.eval())
+      self.assertAllEqual(
+          np.ones((2, 2, 6)),
+          gradients_impl.gradients(
+              distribution_util.fill_lower_triangular(x), x)[0].eval())
+
+
+class GenNewSeedTest(test.TestCase):
+
+  def testOnlyNoneReturnsNone(self):
+    self.assertFalse(distribution_util.gen_new_seed(0, "salt") is None)
+    self.assertTrue(distribution_util.gen_new_seed(None, "salt") is None)
+
+
+# TODO(jvdillon): Merge this test back into:
+# tensorflow/python/kernel_tests/softplus_op_test.py
+# once TF core is accepting new ops.
+class SoftplusTest(test.TestCase):
+
+  def _npSoftplus(self, np_features):
+    np_features = np.asarray(np_features)
+    zero = np.asarray(0).astype(np_features.dtype)
+    return np.logaddexp(zero, np_features)
+
+  def _testSoftplus(self, np_features, use_gpu=False):
+    np_features = np.asarray(np_features)
+    np_softplus = self._npSoftplus(np_features)
+    with self.test_session(use_gpu=use_gpu) as sess:
+      softplus = nn_ops.softplus(np_features)
+      softplus_inverse = distribution_util.softplus_inverse(softplus)
+      [tf_softplus, tf_softplus_inverse] = sess.run([
+          softplus, softplus_inverse])
+    self.assertAllCloseAccordingToType(np_softplus, tf_softplus)
+    rtol = {"float16": 0.07, "float32": 0.003, "float64": 0.002}.get(
+        str(np_features.dtype), 1e-6)
+    # This will test that we correctly computed the inverse by verifying we
+    # recovered the original input.
+    self.assertAllCloseAccordingToType(
+        np_features, tf_softplus_inverse,
+        atol=0., rtol=rtol)
+    self.assertAllEqual(np.ones_like(tf_softplus).astype(np.bool),
+                        tf_softplus > 0)
+
+    self.assertShapeEqual(np_softplus, softplus)
+    self.assertShapeEqual(np_softplus, softplus_inverse)
+
+    self.assertAllEqual(np.ones_like(tf_softplus).astype(np.bool),
+                        np.isfinite(tf_softplus))
+    self.assertAllEqual(np.ones_like(tf_softplus_inverse).astype(np.bool),
+                        np.isfinite(tf_softplus_inverse))
+
+  def testNumbers(self):
+    for t in [np.float16, np.float32, np.float64]:
+      lower = {np.float16: -15, np.float32: -50, np.float64: -50}.get(t, -100)
+      upper = {np.float16: 50, np.float32: 50, np.float64: 50}.get(t, 100)
+      self._testSoftplus(
+          np.array(np.linspace(lower, upper, int(1e3)).astype(t)).reshape(
+              [2, -1]),
+          use_gpu=False)
+      self._testSoftplus(
+          np.array(np.linspace(lower, upper, int(1e3)).astype(t)).reshape(
+              [2, -1]),
+          use_gpu=True)
+      log_eps = np.log(np.finfo(t).eps)
+      one = t(1)
+      ten = t(10)
+      self._testSoftplus(
+          [
+              log_eps, log_eps - one, log_eps + one, log_eps - ten,
+              log_eps + ten, -log_eps, -log_eps - one, -log_eps + one,
+              -log_eps - ten, -log_eps + ten
+          ],
+          use_gpu=False)
+      self._testSoftplus(
+          [
+              log_eps, log_eps - one, log_eps + one, log_eps - ten,
+              log_eps + ten - log_eps, -log_eps - one, -log_eps + one,
+              -log_eps - ten, -log_eps + ten
+          ],
+          use_gpu=True)
+
+  def testGradient(self):
+    with self.test_session():
+      x = constant_op.constant(
+          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+          shape=[2, 5],
+          name="x")
+      y = nn_ops.softplus(x, name="softplus")
+      x_init = np.asarray(
+          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+          dtype=np.float32,
+          order="F")
+      err = gradient_checker.compute_gradient_error(
+          x, [2, 5], y, [2, 5], x_init_value=x_init)
+    tf_logging.vlog(2, "softplus (float) gradient err = ", err)
+    self.assertLess(err, 1e-4)
+
+  def testInverseSoftplusGradientNeverNan(self):
+    with self.test_session():
+      # Note that this range contains both zero and inf.
+      x = constant_op.constant(np.logspace(-8, 6).astype(np.float16))
+      y = distribution_util.softplus_inverse(x)
+      grads = gradients_impl.gradients(y, x)[0].eval()
+      # Equivalent to `assertAllFalse` (if it existed).
+      self.assertAllEqual(np.zeros_like(grads).astype(np.bool), np.isnan(grads))
+
+  def testInverseSoftplusGradientFinite(self):
+    with self.test_session():
+      # This range of x is all finite, and so is 1 / x.  So the
+      # gradient and its approximations should be finite as well.
+      x = constant_op.constant(np.logspace(-4.8, 4.5).astype(np.float16))
+      y = distribution_util.softplus_inverse(x)
+      grads = gradients_impl.gradients(y, x)[0].eval()
+      # Equivalent to `assertAllTrue` (if it existed).
+      self.assertAllEqual(
+          np.ones_like(grads).astype(np.bool), np.isfinite(grads))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/embedding_ops_test.py b/tensorflow/python/kernel_tests/embedding_ops_test.py
index 057da9d7afaf949428cff30410f41d00e6027dfa..a1f44bafcc9830b9253cfef84e1881ee3622bfb4 100644
--- a/tensorflow/python/kernel_tests/embedding_ops_test.py
+++ b/tensorflow/python/kernel_tests/embedding_ops_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import state_ops
@@ -77,11 +78,11 @@ class ScatterAddSubTest(test.TestCase):
     # Compute the expected 'p' using numpy operations.
     for i, ind in enumerate(indices):
       if scatter_op == state_ops.scatter_add:
-        p_init.reshape(shape[0], -1)[ind, :] += (
-            vals_init.reshape(vals_shape[0], -1)[i, :])
+        p_init.reshape(shape[0], -1)[ind, :] += (vals_init.reshape(
+            vals_shape[0], -1)[i, :])
       else:
-        p_init.reshape(shape[0], -1)[ind, :] -= (
-            vals_init.reshape(vals_shape[0], -1)[i, :])
+        p_init.reshape(shape[0], -1)[ind, :] -= (vals_init.reshape(
+            vals_shape[0], -1)[i, :])
     self.assertTrue(all((p_init == result).ravel()))
 
   def testNoRepetitions(self):
@@ -111,8 +112,7 @@ class ScatterAddSubTest(test.TestCase):
   def testWrongShape(self):
     # Indices and values mismatch.
     var = variables.Variable(
-        array_ops.zeros(
-            shape=[1024, 64, 64], dtype=dtypes.float32))
+        array_ops.zeros(shape=[1024, 64, 64], dtype=dtypes.float32))
     indices = array_ops.placeholder(dtypes.int32, shape=[32])
     values = array_ops.placeholder(dtypes.float32, shape=[33, 64, 64])
     with self.assertRaises(ValueError):
@@ -208,8 +208,8 @@ def _EmbeddingResult(params,
         else:
           partition = extras + (i - threshold) // ids_per_partition
           offset = (i - threshold) % ids_per_partition
-        val = np.copy(params[_PName(partition) + ":0"][
-            offset, :]) * weight_value
+        val = np.copy(
+            params[_PName(partition) + ":0"][offset, :]) * weight_value
       else:
         assert False
       if value_aggregation is None:
@@ -274,8 +274,7 @@ class EmbeddingLookupTest(test.TestCase):
           [embeddings], ids, max_norm=2.0)
 
       norms = math_ops.sqrt(
-          math_ops.reduce_sum(
-              embeddings * embeddings, axis=1))
+          math_ops.reduce_sum(embeddings * embeddings, axis=1))
       normalized = embeddings / array_ops.stack([norms, norms], axis=1)
       self.assertAllEqual(embedding.eval(), 2 * normalized.eval())
 
@@ -510,8 +509,7 @@ class EmbeddingLookupTest(test.TestCase):
   def testConstructionNonSharded(self):
     with ops.Graph().as_default():
       p = variables.Variable(
-          array_ops.zeros(
-              shape=[100, 100], dtype=dtypes.float32))
+          array_ops.zeros(shape=[100, 100], dtype=dtypes.float32))
       ids = constant_op.constant([0, 1, 1, 7], dtype=dtypes.int32)
       embedding_ops.embedding_lookup([p], ids)
 
@@ -521,8 +519,7 @@ class EmbeddingLookupTest(test.TestCase):
       for _ in range(2):
         p += [
             variables.Variable(
-                array_ops.zeros(
-                    shape=[100, 100], dtype=dtypes.float32))
+                array_ops.zeros(shape=[100, 100], dtype=dtypes.float32))
         ]
         ids = constant_op.constant([0, 1, 1, 17], dtype=dtypes.int32)
       embedding_ops.embedding_lookup(p, ids)
@@ -550,19 +547,22 @@ class EmbeddingLookupTest(test.TestCase):
   def testHigherRankMaxNorm(self):
     np.random.seed(8)
     with self.test_session():
-      for params_shape in (12,), (6, 3):
+      for params_shape in (12,), (6, 3), (6, 2, 3):
+        # Test embedding rank 0, 1, 2.
+        # Note: the first dimension must be a common multiple of procs below.
         params = 2 * np.ones(params_shape)
         params_norm = params / np.sqrt(
-            np.sum(params*params, tuple(range(params.ndim)[1:]), keepdims=True))
+            np.sum(
+                params * params, tuple(range(params.ndim)[1:]), keepdims=True))
         for ids_shape in (), (3), (4, 3), (2, 3, 4):
           ids = np.random.randint(
-              params.shape[0], size=np.prod(ids_shape, dtype=np.int64)).reshape(
-                  ids_shape)
+              params.shape[0], size=np.prod(ids_shape,
+                                            dtype=np.int64)).reshape(ids_shape)
           # Compare nonsharded to gather
           simple = embedding_ops.embedding_lookup(
               params, ids, max_norm=1.0).eval()
           self.assertAllEqual(simple, array_ops.gather(params_norm, ids).eval())
-          # Run a few random sharded versions
+          # Run a few different sharded versions.
           for procs in 1, 2, 3:
             stride = procs * math_ops.range(params.shape[0] // procs)
             split_params = [
@@ -572,6 +572,42 @@ class EmbeddingLookupTest(test.TestCase):
                 split_params, ids, max_norm=1.0).eval()
             self.assertAllEqual(simple, sharded)
 
+  def testTransform(self):
+    # This tests all combinations of:
+    #   - ids rank 0, 1, >1
+    #   - params sharded/unsharded
+    # It always applies max_norm.
+    np.random.seed(8)
+    l2_norm = 2.
+    with self.test_session():
+      # Param values are in [l2_norm, l2_norm+1) so it will always clip.
+      params = np.random.rand(6, 3) + l2_norm
+      params_norm = l2_norm * params / np.sqrt(
+          np.sum(params * params, axis=1, keepdims=True))
+      # Compute the norm of each embedding. This will change the embedding
+      # rank to 0.
+      params_norm = np.linalg.norm(params_norm, axis=1)
+      transform = lambda x: linalg_ops.norm(x, axis=1)
+      for ids_shape in (), (3), (4, 3), (2, 3, 4):
+        # Test ids rank 0, 1, 2, 3.
+        ids = np.random.randint(
+            params.shape[0], size=np.prod(ids_shape,
+                                          dtype=np.int64)).reshape(ids_shape)
+        # Compare nonsharded to gather.
+        simple = embedding_ops._embedding_lookup_and_transform(
+            params, ids, max_norm=l2_norm, transform_fn=transform).eval()
+        self.assertAllClose(simple, array_ops.gather(params_norm, ids).eval())
+        # Run a few different sharded versions.
+        for procs in 1, 2, 3:
+          stride = procs * math_ops.range(params.shape[0] // procs)
+          split_params = [
+              array_ops.gather(params, stride + p) for p in xrange(procs)
+          ]
+          sharded = embedding_ops._embedding_lookup_and_transform(
+              split_params, ids, max_norm=l2_norm,
+              transform_fn=transform).eval()
+          self.assertAllEqual(simple, sharded)
+
 
 class EmbeddingLookupSparseTest(test.TestCase):
 
@@ -625,8 +661,8 @@ class EmbeddingLookupSparseTest(test.TestCase):
         np.ones(np.sum(vals_per_batch_entry)), vals_per_batch_entry)
 
     for num_shards, combiner, dtype, ignore_weights in itertools.product(
-        [1, 5], ["sum", "mean", "sqrtn"], [dtypes.float32, dtypes.float64],
-        [True, False]):
+        [1, 5], ["sum", "mean", "sqrtn"], [dtypes.float32,
+                                           dtypes.float64], [True, False]):
 
       with self.test_session():
         p, params, feed_dict = _EmbeddingParams(
@@ -647,8 +683,8 @@ class EmbeddingLookupSparseTest(test.TestCase):
             grouped_ids,
             num_shards,
             vocab_size,
-            weight_vals=grouped_ignored_weights if ignore_weights else
-            grouped_weights)
+            weight_vals=grouped_ignored_weights
+            if ignore_weights else grouped_weights)
         if combiner == "mean":
           np_embedding_sum /= np.reshape(np_weight_sum, (batch_size, 1, 1))
         if combiner == "sqrtn":
@@ -660,12 +696,12 @@ class EmbeddingLookupSparseTest(test.TestCase):
     vocab_size = 12
     batch_size = 4
     param_shape = [2, 3]
-    sp_ids, sp_weights, _, _, _ = (
-        self._RandomIdsAndWeights(batch_size, vocab_size))
+    sp_ids, sp_weights, _, _, _ = (self._RandomIdsAndWeights(
+        batch_size, vocab_size))
 
     for num_shards, combiner, dtype, ignore_weights in itertools.product(
-        [1, 3], ["sum", "mean", "sqrtn"], [dtypes.float32, dtypes.float64],
-        [True, False]):
+        [1, 3], ["sum", "mean", "sqrtn"], [dtypes.float32,
+                                           dtypes.float64], [True, False]):
       with self.test_session():
         x, params, _ = _EmbeddingParams(
             num_shards, vocab_size, shape=param_shape, dtype=dtype)
@@ -705,10 +741,12 @@ class DynamicStitchOpTest(test.TestCase):
   def testCint32Cpu(self):
     with self.test_session(use_gpu=False):
       indices = [
-          ops.convert_to_tensor([0, 1, 2]), ops.convert_to_tensor([2, 3])
+          ops.convert_to_tensor([0, 1, 2]),
+          ops.convert_to_tensor([2, 3])
       ]
       values = [
-          ops.convert_to_tensor([12, 23, 34]), ops.convert_to_tensor([1, 2])
+          ops.convert_to_tensor([12, 23, 34]),
+          ops.convert_to_tensor([1, 2])
       ]
       self.assertAllEqual(
           data_flow_ops.dynamic_stitch(indices, values).eval(), [12, 23, 1, 2])
@@ -716,10 +754,12 @@ class DynamicStitchOpTest(test.TestCase):
   def testCint32Gpu(self):
     with self.test_session(use_gpu=True):
       indices = [
-          ops.convert_to_tensor([0, 1, 2]), ops.convert_to_tensor([2, 3])
+          ops.convert_to_tensor([0, 1, 2]),
+          ops.convert_to_tensor([2, 3])
       ]
       values = [
-          ops.convert_to_tensor([12, 23, 34]), ops.convert_to_tensor([1, 2])
+          ops.convert_to_tensor([12, 23, 34]),
+          ops.convert_to_tensor([1, 2])
       ]
       self.assertAllEqual(
           data_flow_ops.dynamic_stitch(indices, values).eval(), [12, 23, 1, 2])
@@ -727,10 +767,12 @@ class DynamicStitchOpTest(test.TestCase):
   def testInt32Cpu(self):
     with self.test_session(use_gpu=False):
       indices = [
-          ops.convert_to_tensor([0, 1, 2]), ops.convert_to_tensor([2, 3])
+          ops.convert_to_tensor([0, 1, 2]),
+          ops.convert_to_tensor([2, 3])
       ]
       values = [
-          ops.convert_to_tensor([12, 23, 34]), ops.convert_to_tensor([1, 2])
+          ops.convert_to_tensor([12, 23, 34]),
+          ops.convert_to_tensor([1, 2])
       ]
       self.assertAllEqual(
           data_flow_ops.dynamic_stitch(indices, values).eval(), [12, 23, 1, 2])
@@ -738,10 +780,12 @@ class DynamicStitchOpTest(test.TestCase):
   def testInt32Gpu(self):
     with self.test_session(use_gpu=True):
       indices = [
-          ops.convert_to_tensor([0, 1, 2]), ops.convert_to_tensor([2, 3])
+          ops.convert_to_tensor([0, 1, 2]),
+          ops.convert_to_tensor([2, 3])
       ]
       values = [
-          ops.convert_to_tensor([12, 23, 34]), ops.convert_to_tensor([1, 2])
+          ops.convert_to_tensor([12, 23, 34]),
+          ops.convert_to_tensor([1, 2])
       ]
       self.assertAllEqual(
           data_flow_ops.dynamic_stitch(indices, values).eval(), [12, 23, 1, 2])
@@ -749,10 +793,12 @@ class DynamicStitchOpTest(test.TestCase):
   def testSumGradArgs(self):
     with self.test_session(use_gpu=False):
       indices = [
-          ops.convert_to_tensor([0, 1, 2, 3]), ops.convert_to_tensor([2, 3])
+          ops.convert_to_tensor([0, 1, 2, 3]),
+          ops.convert_to_tensor([2, 3])
       ]
       values = [
-          ops.convert_to_tensor([2, 3, 5, 7]), ops.convert_to_tensor([1, 1])
+          ops.convert_to_tensor([2, 3, 5, 7]),
+          ops.convert_to_tensor([1, 1])
       ]
       self.assertAllEqual(
           data_flow_ops.dynamic_stitch(indices, values).eval(), [2, 3, 1, 1])
diff --git a/tensorflow/python/kernel_tests/fft_ops_test.py b/tensorflow/python/kernel_tests/fft_ops_test.py
index 6c575aea1287d224660dccfc0089878d2cb28aa3..b9e2aa1f3a4ebb01459786ee950a5a44371d6849 100644
--- a/tensorflow/python/kernel_tests/fft_ops_test.py
+++ b/tensorflow/python/kernel_tests/fft_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -29,6 +30,7 @@ from tensorflow.python.ops import gen_spectral_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import spectral_ops
+from tensorflow.python.ops import spectral_ops_test_util
 from tensorflow.python.platform import test
 
 VALID_FFT_RANKS = (1, 2, 3)
@@ -36,36 +38,11 @@ VALID_FFT_RANKS = (1, 2, 3)
 
 class BaseFFTOpsTest(test.TestCase):
 
-  def _use_eigen_kernels(self):
-    use_eigen_kernels = False  # Eigen kernels are default
-    if test.is_gpu_available(cuda_only=True):
-      use_eigen_kernels = False
-    return use_eigen_kernels
-
-  def _fft_kernel_label_map(self):
-    """Returns a generator overriding kernel selection.
-
-    This is used to force testing of the eigen kernels, even
-    when they are not the default registered kernels.
-
-    Returns:
-      A generator in which to wrap every test.
-    """
-    if self._use_eigen_kernels():
-      d = dict([(op, "eigen")
-                for op in [
-                    "FFT", "FFT2D", "FFT3D", "IFFT", "IFFT2D", "IFFT3D",
-                    "IRFFT", "IRFFT2D", "IRFFT3D", "RFFT", "RFFT2D", "RFFT3D"
-                ]])
-      return ops.get_default_graph()._kernel_label_map(d)
-    else:
-      return ops.get_default_graph()._kernel_label_map({})
-
-  def _Compare(self, x, rank, fft_length=None, use_placeholder=False):
-    self._CompareForward(x, rank, fft_length, use_placeholder)
-    self._CompareBackward(x, rank, fft_length, use_placeholder)
+  def _compare(self, x, rank, fft_length=None, use_placeholder=False):
+    self._compareForward(x, rank, fft_length, use_placeholder)
+    self._compareBackward(x, rank, fft_length, use_placeholder)
 
-  def _CompareForward(self, x, rank, fft_length=None, use_placeholder=False):
+  def _compareForward(self, x, rank, fft_length=None, use_placeholder=False):
     x_np = self._npFFT(x, rank, fft_length)
     if use_placeholder:
       x_ph = array_ops.placeholder(dtype=dtypes.as_dtype(x.dtype))
@@ -75,7 +52,7 @@ class BaseFFTOpsTest(test.TestCase):
 
     self.assertAllClose(x_np, x_tf, rtol=1e-4, atol=1e-4)
 
-  def _CompareBackward(self, x, rank, fft_length=None, use_placeholder=False):
+  def _compareBackward(self, x, rank, fft_length=None, use_placeholder=False):
     x_np = self._npIFFT(x, rank, fft_length)
     if use_placeholder:
       x_ph = array_ops.placeholder(dtype=dtypes.as_dtype(x.dtype))
@@ -85,6 +62,12 @@ class BaseFFTOpsTest(test.TestCase):
 
     self.assertAllClose(x_np, x_tf, rtol=1e-4, atol=1e-4)
 
+  def _checkMemoryFail(self, x, rank):
+    config = config_pb2.ConfigProto()
+    config.gpu_options.per_process_gpu_memory_fraction = 1e-2
+    with self.test_session(config=config, force_gpu=True):
+      self._tfFFT(x, rank, fft_length=None)
+
   def _checkGradComplex(self, func, x, y, result_is_complex=True):
     with self.test_session(use_gpu=True):
       inx = ops.convert_to_tensor(x)
@@ -171,7 +154,7 @@ class FFTOpsTest(BaseFFTOpsTest):
       raise ValueError("invalid rank")
 
   def testEmpty(self):
-    with self._fft_kernel_label_map():
+    with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
         for dims in xrange(rank, rank + 3):
           x = np.zeros((0,) * dims).astype(np.complex64)
@@ -179,25 +162,43 @@ class FFTOpsTest(BaseFFTOpsTest):
           self.assertEqual(x.shape, self._tfIFFT(x, rank).shape)
 
   def testBasic(self):
-    with self._fft_kernel_label_map():
+    with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
         for dims in xrange(rank, rank + 3):
-          self._Compare(
+          self._compare(
               np.mod(np.arange(np.power(4, dims)), 10).reshape(
                   (4,) * dims).astype(np.complex64), rank)
 
+  def testLargeBatch(self):
+    if test.is_gpu_available(cuda_only=True):
+      rank = 1
+      for dims in xrange(rank, rank + 3):
+        self._compare(
+            np.mod(np.arange(np.power(128, dims)), 10).reshape(
+                (128,) * dims).astype(np.complex64), rank)
+
+  # TODO(yangzihao): Disable before we can figure out a way to
+  # properly test memory fail for large batch fft.
+  # def testLargeBatchMemoryFail(self):
+  #   if test.is_gpu_available(cuda_only=True):
+  #     rank = 1
+  #     for dims in xrange(rank, rank + 3):
+  #       self._checkMemoryFail(
+  #           np.mod(np.arange(np.power(128, dims)), 64).reshape(
+  #               (128,) * dims).astype(np.complex64), rank)
+
   def testBasicPlaceholder(self):
-    with self._fft_kernel_label_map():
+    with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
         for dims in xrange(rank, rank + 3):
-          self._Compare(
+          self._compare(
               np.mod(np.arange(np.power(4, dims)), 10).reshape(
                   (4,) * dims).astype(np.complex64),
               rank,
               use_placeholder=True)
 
   def testRandom(self):
-    with self._fft_kernel_label_map():
+    with spectral_ops_test_util.fft_kernel_label_map():
       np.random.seed(12345)
 
       def gen(shape):
@@ -208,7 +209,7 @@ class FFTOpsTest(BaseFFTOpsTest):
 
       for rank in VALID_FFT_RANKS:
         for dims in xrange(rank, rank + 3):
-          self._Compare(gen((4,) * dims), rank)
+          self._compare(gen((4,) * dims), rank)
 
   def testError(self):
     for rank in VALID_FFT_RANKS:
@@ -222,7 +223,7 @@ class FFTOpsTest(BaseFFTOpsTest):
           self._tfIFFT(x, rank)
 
   def testGrad_Simple(self):
-    with self._fft_kernel_label_map():
+    with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
         for dims in xrange(rank, rank + 2):
           re = np.ones(shape=(4,) * dims, dtype=np.float32) / 10.0
@@ -231,7 +232,7 @@ class FFTOpsTest(BaseFFTOpsTest):
           self._checkGradComplex(self._tfIFFTForRank(rank), re, im)
 
   def testGrad_Random(self):
-    with self._fft_kernel_label_map():
+    with spectral_ops_test_util.fft_kernel_label_map():
       np.random.seed(54321)
       for rank in VALID_FFT_RANKS:
         for dims in xrange(rank, rank + 2):
@@ -243,8 +244,8 @@ class FFTOpsTest(BaseFFTOpsTest):
 
 class RFFTOpsTest(BaseFFTOpsTest):
 
-  def _CompareBackward(self, x, rank, fft_length=None, use_placeholder=False):
-    super(RFFTOpsTest, self)._CompareBackward(x, rank, fft_length,
+  def _compareBackward(self, x, rank, fft_length=None, use_placeholder=False):
+    super(RFFTOpsTest, self)._compareBackward(x, rank, fft_length,
                                               use_placeholder)
 
   def _tfFFT(self, x, rank, fft_length=None, feed_dict=None):
@@ -296,7 +297,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
       raise ValueError("invalid rank")
 
   def testEmpty(self):
-    with self._fft_kernel_label_map():
+    with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
         for dims in xrange(rank, rank + 3):
           x = np.zeros((0,) * dims).astype(np.float32)
@@ -305,41 +306,54 @@ class RFFTOpsTest(BaseFFTOpsTest):
           self.assertEqual(x.shape, self._tfIFFT(x, rank).shape)
 
   def testBasic(self):
-    with self._fft_kernel_label_map():
+    with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
         for dims in xrange(rank, rank + 3):
           for size in (5, 6):
             inner_dim = size // 2 + 1
             r2c = np.mod(np.arange(np.power(size, dims)), 10).reshape(
                 (size,) * dims)
-            self._CompareForward(r2c.astype(np.float32), rank, (size,) * rank)
+            self._compareForward(r2c.astype(np.float32), rank, (size,) * rank)
             c2r = np.mod(np.arange(np.power(size, dims - 1) * inner_dim),
                          10).reshape((size,) * (dims - 1) + (inner_dim,))
-            self._CompareBackward(
+            self._compareBackward(
                 c2r.astype(np.complex64), rank, (size,) * rank)
 
+  def testLargeBatch(self):
+    if test.is_gpu_available(cuda_only=True):
+      rank = 1
+      for dims in xrange(rank, rank + 3):
+        for size in (64, 128):
+          inner_dim = size // 2 + 1
+          r2c = np.mod(np.arange(np.power(size, dims)), 10).reshape(
+              (size,) * dims)
+          self._compareForward(r2c.astype(np.float32), rank, (size,) * rank)
+          c2r = np.mod(np.arange(np.power(size, dims - 1) * inner_dim),
+                       10).reshape((size,) * (dims - 1) + (inner_dim,))
+          self._compareBackward(c2r.astype(np.complex64), rank, (size,) * rank)
+
   def testBasicPlaceholder(self):
-    with self._fft_kernel_label_map():
+    with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
         for dims in xrange(rank, rank + 3):
           for size in (5, 6):
             inner_dim = size // 2 + 1
             r2c = np.mod(np.arange(np.power(size, dims)), 10).reshape(
                 (size,) * dims)
-            self._CompareForward(
+            self._compareForward(
                 r2c.astype(np.float32),
                 rank, (size,) * rank,
                 use_placeholder=True)
             c2r = np.mod(np.arange(np.power(size, dims - 1) * inner_dim),
                          10).reshape((size,) * (dims - 1) + (inner_dim,))
-            self._CompareBackward(
+            self._compareBackward(
                 c2r.astype(np.complex64),
                 rank, (size,) * rank,
                 use_placeholder=True)
 
   def testFftLength(self):
     if test.is_gpu_available(cuda_only=True):
-      with self._fft_kernel_label_map():
+      with spectral_ops_test_util.fft_kernel_label_map():
         for rank in VALID_FFT_RANKS:
           for dims in xrange(rank, rank + 3):
             for size in (5, 6):
@@ -348,31 +362,39 @@ class RFFTOpsTest(BaseFFTOpsTest):
                   (size,) * dims)
               c2r = np.mod(np.arange(np.power(size, dims - 1) * inner_dim),
                            10).reshape((size,) * (dims - 1) + (inner_dim,))
-
               # Test truncation (FFT size < dimensions).
               fft_length = (size - 2,) * rank
-              self._CompareForward(r2c.astype(np.float32), rank, fft_length)
-              self._CompareBackward(c2r.astype(np.complex64), rank, fft_length)
-
+              self._compareForward(r2c.astype(np.float32), rank, fft_length)
+              self._compareBackward(c2r.astype(np.complex64), rank, fft_length)
               # Confirm it works with unknown shapes as well.
-              self._CompareForward(r2c.astype(np.float32), rank, fft_length,
-                                   use_placeholder=True)
-              self._CompareBackward(c2r.astype(np.complex64), rank, fft_length,
-                                    use_placeholder=True)
-
+              self._compareForward(
+                  r2c.astype(np.float32),
+                  rank,
+                  fft_length,
+                  use_placeholder=True)
+              self._compareBackward(
+                  c2r.astype(np.complex64),
+                  rank,
+                  fft_length,
+                  use_placeholder=True)
               # Test padding (FFT size > dimensions).
               fft_length = (size + 2,) * rank
-              self._CompareForward(r2c.astype(np.float32), rank, fft_length)
-              self._CompareBackward(c2r.astype(np.complex64), rank, fft_length)
-
+              self._compareForward(r2c.astype(np.float32), rank, fft_length)
+              self._compareBackward(c2r.astype(np.complex64), rank, fft_length)
               # Confirm it works with unknown shapes as well.
-              self._CompareForward(r2c.astype(np.float32), rank, fft_length,
-                                   use_placeholder=True)
-              self._CompareBackward(c2r.astype(np.complex64), rank, fft_length,
-                                    use_placeholder=True)
+              self._compareForward(
+                  r2c.astype(np.float32),
+                  rank,
+                  fft_length,
+                  use_placeholder=True)
+              self._compareBackward(
+                  c2r.astype(np.complex64),
+                  rank,
+                  fft_length,
+                  use_placeholder=True)
 
   def testRandom(self):
-    with self._fft_kernel_label_map():
+    with spectral_ops_test_util.fft_kernel_label_map():
       np.random.seed(12345)
 
       def gen_real(shape):
@@ -392,13 +414,13 @@ class RFFTOpsTest(BaseFFTOpsTest):
         for dims in xrange(rank, rank + 3):
           for size in (5, 6):
             inner_dim = size // 2 + 1
-            self._CompareForward(gen_real((size,) * dims), rank, (size,) * rank)
+            self._compareForward(gen_real((size,) * dims), rank, (size,) * rank)
             complex_dims = (size,) * (dims - 1) + (inner_dim,)
-            self._CompareBackward(
+            self._compareBackward(
                 gen_complex(complex_dims), rank, (size,) * rank)
 
   def testError(self):
-    with self._fft_kernel_label_map():
+    with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
         for dims in xrange(0, rank):
           x = np.zeros((1,) * dims).astype(np.complex64)
@@ -454,7 +476,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
             irfft_fn(x, fft_length).eval()
 
   def testGrad_Simple(self):
-    with self._fft_kernel_label_map():
+    with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
         # rfft3d/irfft3d do not have gradients yet.
         if rank == 3:
@@ -468,7 +490,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
                 self._tfIFFTForRank(rank), re, im, result_is_complex=False)
 
   def testGrad_Random(self):
-    with self._fft_kernel_label_map():
+    with spectral_ops_test_util.fft_kernel_label_map():
       np.random.seed(54321)
       for rank in VALID_FFT_RANKS:
         # rfft3d/irfft3d do not have gradients yet.
diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py
index b3ce234d4e88d0eccd471c528bfc53d33ab93d2f..9a946925693370912613f4dde33bbbda176060e4 100644
--- a/tensorflow/python/kernel_tests/gather_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_op_test.py
@@ -44,70 +44,110 @@ class GatherTest(test.TestCase):
     with self.test_session(use_gpu=True):
       data = np.array([0, 1, 2, 3, 7, 5])
       for dtype in _TEST_TYPES:
-        params_np = self._buildParams(data, dtype)
-        params = constant_op.constant(params_np)
-        indices = constant_op.constant(4)
-        gather_t = array_ops.gather(params, indices)
-        gather_val = gather_t.eval()
-        self.assertAllEqual(params_np[4], gather_val)
-        self.assertEqual([], gather_t.get_shape())
+        for indices in 4, [1, 2, 2, 4, 5]:
+          params_np = self._buildParams(data, dtype)
+          params = constant_op.constant(params_np)
+          indices_tf = constant_op.constant(indices)
+          gather_t = array_ops.gather(params, indices_tf)
+          gather_val = gather_t.eval()
+          np_val = params_np[indices]
+          self.assertAllEqual(np_val, gather_val)
+          self.assertEqual(np_val.shape, gather_t.get_shape())
 
   def testScalar2D(self):
     with self.test_session(use_gpu=True):
       data = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8],
                        [9, 10, 11], [12, 13, 14]])
       for dtype in _TEST_TYPES:
-        params_np = self._buildParams(data, dtype)
-        params = constant_op.constant(params_np)
-        indices = constant_op.constant(2)
-        gather_t = array_ops.gather(params, indices)
-        gather_val = gather_t.eval()
-        self.assertAllEqual(params_np[2], gather_val)
-        self.assertEqual([3], gather_t.get_shape())
+        for axis in range(data.ndim):
+          params_np = self._buildParams(data, dtype)
+          params = constant_op.constant(params_np)
+          indices = constant_op.constant(2)
+          gather_t = array_ops.gather(params, indices, axis=axis)
+          gather_val = gather_t.eval()
+          self.assertAllEqual(np.take(params_np, 2, axis=axis), gather_val)
+          expected_shape = data.shape[:axis] + data.shape[axis + 1:]
+          self.assertEqual(expected_shape, gather_t.get_shape())
 
   def testSimpleTwoD32(self):
     with self.test_session(use_gpu=True):
       data = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8],
                        [9, 10, 11], [12, 13, 14]])
       for dtype in _TEST_TYPES:
-        params_np = self._buildParams(data, dtype)
-        params = constant_op.constant(params_np)
-        indices = constant_op.constant([0, 4, 0, 2])
-        gather_t = array_ops.gather(params, indices)
-        gather_val = gather_t.eval()
-        self.assertAllEqual(params_np[[0, 4, 0, 2]], gather_val)
-        self.assertEqual([4, 3], gather_t.get_shape())
+        for axis in range(data.ndim):
+          params_np = self._buildParams(data, dtype)
+          params = constant_op.constant(params_np)
+          # The indices must be in bounds for any axis.
+          indices = constant_op.constant([0, 1, 0, 2])
+          gather_t = array_ops.gather(params, indices, axis=axis)
+          gather_val = gather_t.eval()
+          self.assertAllEqual(np.take(params_np, [0, 1, 0, 2], axis=axis),
+                              gather_val)
+          expected_shape = data.shape[:axis] + (4,) + data.shape[axis + 1:]
+          self.assertEqual(expected_shape, gather_t.get_shape())
 
   def testHigherRank(self):
-    np.random.seed(1)
-    # We check that scalar and empty shapes work as well
-    for shape in (7, 0), (4, 3, 2):
-      for indices_shape in (), (0,), (3, 0), (3, 5):
-        for dtype in _TEST_TYPES:
+    # We check that scalar and empty indices shapes work as well
+    shape = (2, 1, 3, 2)
+    for indices_shape in (), (0,), (2, 0), (2, 3):
+      for dtype in _TEST_TYPES:
+        for axis in range(len(shape)):
           params = self._buildParams(np.random.randn(*shape), dtype)
-          indices = np.random.randint(shape[0], size=indices_shape)
-          with self.test_session(use_gpu=True):
+          indices = np.random.randint(shape[axis], size=indices_shape)
+          with self.test_session(use_gpu=True) as sess:
             tf_params = constant_op.constant(params)
             tf_indices = constant_op.constant(indices)
-            gather = array_ops.gather(tf_params, tf_indices)
-            self.assertAllEqual(params[indices], gather.eval())
-            self.assertEqual(indices.shape + params.shape[1:],
-                             gather.get_shape())
+            # Check that both positive and negative indices for axis work.
+            tf_axis = constant_op.constant(axis)
+            tf_negative_axis = constant_op.constant(-len(shape) + axis)
+            gather = array_ops.gather(tf_params, tf_indices, axis=tf_axis)
+            gather_negative_axis = array_ops.gather(
+                tf_params, tf_indices, axis=tf_negative_axis)
+            gather_value, gather_negative_axis_value = sess.run(
+                [gather, gather_negative_axis])
+            gather_np = np.take(params, indices, axis)
+            self.assertAllEqual(gather_np, gather_value)
+            self.assertAllEqual(gather_np, gather_negative_axis_value)
+            expected_shape = (params.shape[:axis] + indices.shape +
+                              params.shape[axis + 1:])
+            self.assertEqual(expected_shape, gather.shape)
+            self.assertEqual(expected_shape, gather_negative_axis.shape)
+
             # Test gradients
-            gather_grad = np.random.randn(*gather.get_shape().as_list()).astype(
-                dtype.as_numpy_dtype)
+            gather_grad = np.random.randn(
+                *gather.get_shape().as_list()).astype(dtype.as_numpy_dtype)
             if dtype.is_complex:
               gather_grad -= 1j * gather_grad
-            params_grad, indices_grad = gradients_impl.gradients(
-                gather, [tf_params, tf_indices], gather_grad)
+            params_grad, indices_grad, axis_grad = gradients_impl.gradients(
+                gather, [tf_params, tf_indices, tf_axis], gather_grad)
             self.assertEqual(indices_grad, None)
-            self.assertEqual(type(params_grad), ops.IndexedSlices)
-            params_grad = ops.convert_to_tensor(params_grad)
+            self.assertEqual(axis_grad, None)
+            # For axis 0, we are able to create an efficient IndexedSlices for
+            # the gradient.
+            if axis == 0:
+              self.assertEqual(type(params_grad), ops.IndexedSlices)
+              params_grad = ops.convert_to_tensor(params_grad)
             correct_params_grad = np.zeros(shape).astype(dtype.as_numpy_dtype)
-            for i, g in zip(indices.flat,
-                            gather_grad.reshape((indices.size,) + shape[1:])):
-              correct_params_grad[i] += g
-            self.assertAllClose(correct_params_grad, params_grad.eval())
+            outer_dims = axis
+            inner_dims = len(shape) - axis - 1
+            gather_grad = gather_grad.reshape(
+                shape[:axis] + (indices.size,) + shape[axis + 1:])
+            for source_index, dest_index in enumerate(indices.flat):
+              dest_slice = ((slice(None),) * outer_dims + (dest_index,) +
+                            (slice(None),) * inner_dims)
+              source_slice = ((slice(None),) * outer_dims + (source_index,) +
+                              (slice(None),) * inner_dims)
+              correct_params_grad[dest_slice] += gather_grad[source_slice]
+            self.assertAllClose(correct_params_grad, params_grad.eval(),
+                                atol=2e-6, rtol=2e-6)
+
+  def testString(self):
+    params = np.array([[b"asdf", b"zxcv"], [b"qwer", b"uiop"]])
+    with self.test_session():
+      self.assertAllEqual([b"qwer", b"uiop"],
+                          array_ops.gather(params, 1, axis=0).eval())
+      self.assertAllEqual([b"asdf", b"qwer"],
+                          array_ops.gather(params, 0, axis=1).eval())
 
   def testUnknownIndices(self):
     params = constant_op.constant([[0, 1, 2]])
@@ -115,22 +155,62 @@ class GatherTest(test.TestCase):
     gather_t = array_ops.gather(params, indices)
     self.assertEqual(None, gather_t.get_shape())
 
+  def testUnknownAxis(self):
+    params = constant_op.constant([[0, 1, 2]])
+    indices = constant_op.constant([[0, 0], [0, 0]])
+    axis = array_ops.placeholder(dtypes.int32)
+    gather_t = array_ops.gather(params, indices, axis=axis)
+    # Rank 2 params with rank 2 indices results in a rank 3 shape.
+    self.assertEqual([None, None, None], gather_t.shape.as_list())
+
+    # If indices is also unknown the result rank is unknown.
+    indices = array_ops.placeholder(dtypes.int32)
+    gather_t = array_ops.gather(params, indices, axis=axis)
+    self.assertEqual(None, gather_t.shape)
+
   def testBadIndices(self):
     with self.test_session(use_gpu=True):
-      params = [0, 1, 2]
-      indices = [[7]]
-      gather = array_ops.gather(params, indices)
+      params = [[0, 1, 2], [3, 4, 5]]
+      with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"):
+        array_ops.gather(params, [[7]], axis=0).eval()
       with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 3\)"):
-        gather.eval()
+        array_ops.gather(params, [[7]], axis=1).eval()
+
+  def testBadAxis(self):
+    with self.test_session(use_gpu=True):
+      params = [0, 1, 2]
+      params_ph = array_ops.placeholder(dtypes.int32)
+      indices = 0
+      for bad_axis in (1, 2, -2):
+        # Shape inference can validate axis for known params rank.
+        with self.assertRaisesWithPredicateMatch(
+            ValueError, "Shape must be at least rank . but is rank 1"):
+          array_ops.gather(params, indices, axis=bad_axis)
+        # If params rank is unknown, an op error occurs.
+        with self.assertRaisesOpError(
+            r"Expected axis in the range \[-1, 1\), but got %s" % bad_axis):
+          array_ops.gather(params_ph, indices, axis=bad_axis).eval(
+              feed_dict={params_ph: params})
 
   def testEmptySlices(self):
     with self.test_session(use_gpu=True):
       for dtype in _TEST_TYPES:
         for itype in np.int32, np.int64:
-          params = np.zeros((7, 0), dtype=dtype.as_numpy_dtype)
+          # Leading axis gather.
+          params = np.zeros((7, 0, 0), dtype=dtype.as_numpy_dtype)
           indices = np.array([3, 4], dtype=itype)
-          gather = array_ops.gather(params, indices)
-          self.assertAllEqual(gather.eval(), np.zeros((2, 0)))
+          gather = array_ops.gather(params, indices, axis=0)
+          self.assertAllEqual(gather.eval(), np.zeros((2, 0, 0)))
+
+          # Middle axis gather.
+          params = np.zeros((0, 7, 0), dtype=dtype.as_numpy_dtype)
+          gather = array_ops.gather(params, indices, axis=1)
+          self.assertAllEqual(gather.eval(), np.zeros((0, 2, 0)))
+
+          # Trailing axis gather.
+          params = np.zeros((0, 0, 7), dtype=dtype.as_numpy_dtype)
+          gather = array_ops.gather(params, indices, axis=2)
+          self.assertAllEqual(gather.eval(), np.zeros((0, 0, 2)))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/in_topk_op_test.py b/tensorflow/python/kernel_tests/in_topk_op_test.py
index 4a4686d1b99e84629db5eb861276ca629080c4b9..37e9a8e3d1be62e5058c2b148b995a26712d6323 100644
--- a/tensorflow/python/kernel_tests/in_topk_op_test.py
+++ b/tensorflow/python/kernel_tests/in_topk_op_test.py
@@ -20,7 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
 
@@ -69,6 +71,18 @@ class InTopKTest(test.TestCase):
                                    "target.*out of range"):
         nn_ops.in_top_k(predictions, target, 2).eval()
 
+  def testTensorK(self):
+    predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
+    target = [0, 2]
+    k = constant_op.constant(3)
+    np_ans = np.array([False, True])
+    with self.test_session():
+      # TODO (yongtang): The test will be switch to nn_ops.in_top
+      # once nn_ops.in_top points to _in_top_kv2 later
+      precision = gen_nn_ops._in_top_kv2(predictions, target, k)
+      out = precision.eval()
+      self.assertAllClose(np_ans, out)
+      self.assertShapeEqual(np_ans, precision)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/io_ops_test.py b/tensorflow/python/kernel_tests/io_ops_test.py
index 472487ccfb15e340949278f3df11226de451bd08..f91875c6f0c1a7bfa388ec1b1a58f06b65889c3e 100644
--- a/tensorflow/python/kernel_tests/io_ops_test.py
+++ b/tensorflow/python/kernel_tests/io_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import shutil
 import tempfile
 
 from tensorflow.python.ops import io_ops
@@ -58,6 +59,20 @@ class IoOpsTest(test.TestCase):
         self.assertEqual(file_contents, contents)
       os.remove(temp.name)
 
+  def testWriteFileCreateDir(self):
+    cases = ['', 'Some contents']
+    for contents in cases:
+      contents = compat.as_bytes(contents)
+      subdir = os.path.join(self.get_temp_dir(), 'subdir1')
+      filepath = os.path.join(subdir, 'subdir2', 'filename')
+      with self.test_session() as sess:
+        w = io_ops.write_file(filepath, contents)
+        sess.run(w)
+        with open(filepath, 'rb') as f:
+          file_contents = f.read()
+        self.assertEqual(file_contents, contents)
+      shutil.rmtree(subdir)
+
   def _subset(self, files, indices):
     return set(
         compat.as_bytes(files[i].name) for i in range(len(files))
diff --git a/tensorflow/python/kernel_tests/map_stage_op_test.py b/tensorflow/python/kernel_tests/map_stage_op_test.py
index 4ceb24862ff5c4ae80a114a891ca851e2483056a..8b669450590f1fce0f14a9e5d64e1055dbe23f4e 100644
--- a/tensorflow/python/kernel_tests/map_stage_op_test.py
+++ b/tensorflow/python/kernel_tests/map_stage_op_test.py
@@ -46,7 +46,7 @@ class MapStageTest(test.TestCase):
     with self.test_session(use_gpu=True, graph=G) as sess:
       sess.run(stage, feed_dict={x: -1, pi: 0})
       for i in range(10):
-        _, yval = sess.run([stage, y], feed_dict={x: i, pi: i+1, gi:i})
+        _, yval = sess.run([stage, y], feed_dict={x: i, pi: i + 1, gi: i})
         self.assertAllClose(4 * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
 
   def testMultiple(self):
@@ -67,7 +67,7 @@ class MapStageTest(test.TestCase):
     with self.test_session(use_gpu=True, graph=G) as sess:
       sess.run(stage, feed_dict={x: -1, pi: 0})
       for i in range(10):
-        _, yval = sess.run([stage, y], feed_dict={x: i, pi: i+1, gi:i})
+        _, yval = sess.run([stage, y], feed_dict={x: i, pi: i + 1, gi: i})
         self.assertAllClose(
             4 * (i - 1) * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
 
@@ -94,7 +94,7 @@ class MapStageTest(test.TestCase):
     with self.test_session(use_gpu=True, graph=G) as sess:
       sess.run(stage, feed_dict={x: -1, pi: 0})
       for i in range(10):
-        _, yval = sess.run([stage, y], feed_dict={x: i, pi: i+1, gi:i})
+        _, yval = sess.run([stage, y], feed_dict={x: i, pi: i + 1, gi: i})
         self.assertAllClose(
             4 * (i - 1) * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
 
@@ -108,15 +108,15 @@ class MapStageTest(test.TestCase):
       with ops.device(gpu_dev):
         stager = data_flow_ops.MapStagingArea([dtypes.float32])
         y = stager.put(1, [v], [0])
-        self.assertEqual(y.device, '/device:GPU:0' if gpu_dev
-                                                   else gpu_dev)
+        expected_name = gpu_dev if 'gpu' not in gpu_dev else '/device:GPU:0'
+        self.assertEqual(y.device, expected_name)
       with ops.device('/cpu:0'):
         _, x = stager.get(1)
-        y = stager.peek(1)
+        y = stager.peek(1)[0]
         _, z = stager.get()
-        self.assertEqual(x.device, '/device:CPU:0')
+        self.assertEqual(x[0].device, '/device:CPU:0')
         self.assertEqual(y.device, '/device:CPU:0')
-        self.assertEqual(z.device, '/device:CPU:0')
+        self.assertEqual(z[0].device, '/device:CPU:0')
 
     G.finalize()
 
@@ -139,10 +139,10 @@ class MapStageTest(test.TestCase):
 
     with self.test_session(use_gpu=True, graph=G) as sess:
       for i in range(n):
-        sess.run(stage, feed_dict={x:i, pi:i})
+        sess.run(stage, feed_dict={x: i, pi: i})
 
       for i in range(n):
-        self.assertTrue(sess.run(peek, feed_dict={gi: i}) == i)
+        self.assertTrue(sess.run(peek, feed_dict={gi: i})[0] == i)
 
       self.assertTrue(sess.run(size) == 10)
 
@@ -372,16 +372,26 @@ class MapStageTest(test.TestCase):
       # 1 complete and 1 incomplete entry
       self.assertTrue(sess.run([size, isize]) == [1, 1])
       # We can now obtain tuple associated with key 0
-      self.assertTrue(sess.run([key, ret], feed_dict={gi:0})
-                              == [0, { 'x':1, 'f':2, 'v':1}])
+      self.assertTrue(
+          sess.run([key, ret],
+                   feed_dict={gi: 0}) == [0, {
+                       'x': 1,
+                       'f': 2,
+                       'v': 1
+                   }])
 
       # 0 complete and 1 incomplete entry
       self.assertTrue(sess.run([size, isize]) == [0, 1])
       # Now complete key 1 with tuple entry v
       sess.run(stage_v, feed_dict={pi: 1, v: 3})
       # We can now obtain tuple associated with key 1
-      self.assertTrue(sess.run([key, ret], feed_dict={gi:1})
-                              == [1, { 'x':1, 'f':2, 'v':3}])
+      self.assertTrue(
+          sess.run([key, ret],
+                   feed_dict={gi: 1}) == [1, {
+                       'x': 1,
+                       'f': 2,
+                       'v': 3
+                   }])
 
   def testPartialIndexInsert(self):
     with ops.Graph().as_default() as G:
@@ -417,16 +427,14 @@ class MapStageTest(test.TestCase):
       # 1 complete and 1 incomplete entry
       self.assertTrue(sess.run([size, isize]) == [1, 1])
       # We can now obtain tuple associated with key 0
-      self.assertTrue(sess.run([key, ret], feed_dict={gi:0})
-                              == [0, [1, 1, 2]])
+      self.assertTrue(sess.run([key, ret], feed_dict={gi: 0}) == [0, [1, 1, 2]])
 
       # 0 complete and 1 incomplete entry
       self.assertTrue(sess.run([size, isize]) == [0, 1])
       # Now complete key 1 with tuple entry v
       sess.run(stage_v, feed_dict={pi: 1, v: 3})
       # We can now obtain tuple associated with key 1
-      self.assertTrue(sess.run([key, ret], feed_dict={gi:1})
-                              == [1, [1,3, 2]])
+      self.assertTrue(sess.run([key, ret], feed_dict={gi: 1}) == [1, [1, 3, 2]])
 
   def testPartialDictGetsAndPeeks(self):
     with ops.Graph().as_default() as G:
@@ -471,23 +479,25 @@ class MapStageTest(test.TestCase):
       self.assertTrue(sess.run([size, isize]) == [1, 1])
 
       # We can now peek at 'x' and 'f' values associated with key 0
-      self.assertTrue(sess.run(peek_xf, feed_dict={pei:0})
-                              == { 'x':1, 'f':2})
+      self.assertTrue(sess.run(peek_xf, feed_dict={pei: 0}) == {'x': 1, 'f': 2})
       # Peek at 'v' value associated with key 0
-      self.assertTrue(sess.run(peek_v, feed_dict={pei:0})
-                              == { 'v':1})
+      self.assertTrue(sess.run(peek_v, feed_dict={pei: 0}) == {'v': 1})
       # 1 complete and 1 incomplete entry
       self.assertTrue(sess.run([size, isize]) == [1, 1])
 
       # We can now obtain 'x' and 'f' values associated with key 0
-      self.assertTrue(sess.run([key_xf, get_xf], feed_dict={gi:0})
-                              == [0, { 'x':1, 'f':2}])
+      self.assertTrue(
+          sess.run([key_xf, get_xf],
+                   feed_dict={gi: 0}) == [0, {
+                       'x': 1,
+                       'f': 2
+                   }])
       # Still have 1 complete and 1 incomplete entry
       self.assertTrue(sess.run([size, isize]) == [1, 1])
 
       # We can no longer get 'x' and 'f' from key 0
       with self.assertRaises(errors.InvalidArgumentError) as cm:
-        sess.run([key_xf, get_xf], feed_dict={gi:0})
+        sess.run([key_xf, get_xf], feed_dict={gi: 0})
 
       exc_str = ("Tensor at index '0' for key '0' "
                 "has already been removed.")
@@ -495,8 +505,10 @@ class MapStageTest(test.TestCase):
       self.assertTrue(exc_str in cm.exception.message)
 
       # Obtain 'v' value associated with key 0
-      self.assertTrue(sess.run([key_v, get_v], feed_dict={gi:0})
-                              == [0, { 'v':1}])
+      self.assertTrue(
+          sess.run([key_v, get_v], feed_dict={gi: 0}) == [0, {
+              'v': 1
+          }])
       # 0 complete and 1 incomplete entry
       self.assertTrue(sess.run([size, isize]) == [0, 1])
 
@@ -506,13 +518,14 @@ class MapStageTest(test.TestCase):
       self.assertTrue(sess.run([size, isize]) == [1, 0])
 
       # Pop without key to obtain 'x' and 'f' values associated with key 1
-      self.assertTrue(sess.run([pop_key_xf, pop_xf])
-                              == [1, { 'x':1, 'f':2}])
+      self.assertTrue(sess.run([pop_key_xf, pop_xf]) == [1, {'x': 1, 'f': 2}])
       # still 1 complete and 1 incomplete entry
       self.assertTrue(sess.run([size, isize]) == [1, 0])
       # We can now obtain 'x' and 'f' values associated with key 1
-      self.assertTrue(sess.run([pop_key_v, pop_v], feed_dict={pi:1})
-                              == [1, { 'v': 1 }])
+      self.assertTrue(
+          sess.run([pop_key_v, pop_v], feed_dict={pi: 1}) == [1, {
+              'v': 1
+          }])
       # Nothing is left
       self.assertTrue(sess.run([size, isize]) == [0, 0])
 
diff --git a/tensorflow/python/kernel_tests/matmul_op_test.py b/tensorflow/python/kernel_tests/matmul_op_test.py
index 042f4623574b518130348eefba32c9434e875d61..b167278984cf45ffec2b1ca74e2bbb8c50e44161 100644
--- a/tensorflow/python/kernel_tests/matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/matmul_op_test.py
@@ -31,6 +31,9 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test as test_lib
 
+# TODO(yangzihao): Currently matmul autotuning is disabled by default. Use
+# os.environ["TF_MATMUL_AUTOTUNE_ENABLE"] = "1" to enable it.
+
 
 def _AddTest(test, op_name, testcase_name, fn):
   test_name = "_".join(["test", op_name, testcase_name])
diff --git a/tensorflow/python/kernel_tests/pad_op_test.py b/tensorflow/python/kernel_tests/pad_op_test.py
index c709be0b5b84edfdea0bd20895040d4cbed4636e..b774c69ceb42198c6716dea7c4dfd0fdfbf3606e 100644
--- a/tensorflow/python/kernel_tests/pad_op_test.py
+++ b/tensorflow/python/kernel_tests/pad_op_test.py
@@ -30,8 +30,12 @@ from tensorflow.python.platform import test
 
 class PadOpTest(test.TestCase):
 
-  def _npPad(self, inp, paddings, mode):
-    return np.pad(inp, paddings, mode=mode.lower())
+  def _npPad(self, inp, paddings, mode, constant_values=0):
+    mode = mode.lower()
+    if mode == "constant":
+      return np.pad(inp, paddings, mode=mode, constant_values=constant_values)
+    else:
+      return np.pad(inp, paddings, mode=mode)
 
   def testNpPad(self):
     self.assertAllEqual(
@@ -46,6 +50,18 @@ class PadOpTest(test.TestCase):
             [[1, 2], [1, 3]],
             mode="constant"))
 
+    self.assertAllEqual(
+        np.array([[1, 1, 1, 1, 1, 1],
+                  [1, 3, 3, 1, 1, 1],
+                  [1, 4, 4, 1, 1, 1],
+                  [1, 5, 5, 1, 1, 1],
+                  [1, 1, 1, 1, 1, 1],
+                  [1, 1, 1, 1, 1, 1]]),
+        self._npPad(
+            np.array([[3, 3], [4, 4], [5, 5]]),
+            [[1, 2], [1, 3]],
+            mode="constant", constant_values=1))
+
     self.assertAllEqual(
         np.array([[4, 3, 4, 9, 4, 3],
                   [1, 0, 1, 2, 1, 0],
@@ -66,35 +82,39 @@ class PadOpTest(test.TestCase):
             [[1, 1], [1, 2]],
             mode="symmetric"))
 
-  def _testPad(self, np_inputs, paddings, mode):
-    np_val = self._npPad(np_inputs, paddings, mode=mode)
+  def _testPad(self, np_inputs, paddings, mode, constant_values):
+    np_val = self._npPad(np_inputs, paddings, mode=mode,
+                         constant_values=constant_values)
     with self.test_session(use_gpu=True):
-      tf_val = array_ops.pad(np_inputs, paddings, mode=mode)
+      tf_val = array_ops.pad(np_inputs, paddings, mode=mode,
+                             constant_values=constant_values)
       out = tf_val.eval()
     self.assertAllEqual(np_val, out)
     self.assertShapeEqual(np_val, tf_val)
 
-  def _testGradient(self, x, a, mode):
+  def _testGradient(self, x, a, mode, constant_values):
     with self.test_session(use_gpu=True):
       inx = ops.convert_to_tensor(x)
       xs = list(x.shape)
       ina = ops.convert_to_tensor(a)
-      y = array_ops.pad(inx, ina, mode=mode)
+      y = array_ops.pad(inx, ina, mode=mode, constant_values=constant_values)
       # Expected y's shape to be:
       ys = list(np.array(x.shape) + np.sum(np.array(a), axis=1))
       jacob_t, jacob_n = gradient_checker.compute_gradient(
           inx, xs, y, ys, x_init_value=x)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-5, atol=1e-5)
 
-  def _testAll(self, np_inputs, paddings):
+  def _testAll(self, np_inputs, paddings, constant_values):
     for mode in ("CONSTANT", "REFLECT", "SYMMETRIC", "reflect", "symmetric",
                  "constant"):
       # Zero-sized input is not allowed for REFLECT mode, but we still want
       # zero-sized input test cases for the other modes.
       if np_inputs.size or mode.upper() != "REFLECT":
-        self._testPad(np_inputs, paddings, mode=mode)
+        self._testPad(np_inputs, paddings, mode=mode,
+                      constant_values=constant_values)
         if np_inputs.dtype == np.float32:
-          self._testGradient(np_inputs, paddings, mode=mode)
+          self._testGradient(np_inputs, paddings, mode=mode,
+                             constant_values=constant_values)
 
   def testInputDims(self):
     with self.test_session(use_gpu=True):
@@ -179,23 +199,25 @@ class PadOpTest(test.TestCase):
     for t in [np.int32, np.int64]:
       self._testAll(
           np.random.randint(-100, 100, (4, 4, 3)).astype(t),
-          [[1, 0], [2, 3], [0, 2]])
+          [[1, 0], [2, 3], [0, 2]], 0)
       self._testAll(
           np.random.randint(-100, 100, (4, 2, 1, 3)).astype(t),
-          [[0, 0], [0, 0], [0, 0], [0, 0]])
+          [[0, 0], [0, 0], [0, 0], [0, 0]], -1234)
 
   def testFloatTypes(self):
     for t in [np.float32, np.float64]:
-      self._testAll(np.random.rand(2, 5).astype(t), [[1, 0], [2, 0]])
-      self._testAll(np.random.rand(2, 3, 4).astype(t), [[0, 0], [0, 0], [0, 0]])
-      self._testAll(np.random.rand(0, 3, 4).astype(t), [[0, 0], [2, 1], [2, 3]])
+      self._testAll(np.random.rand(2, 5).astype(t), [[1, 0], [2, 0]], 0.0)
+      self._testAll(np.random.rand(2, 3, 4).astype(t),
+                    [[0, 0], [0, 0], [0, 0]], -1234.0)
+      self._testAll(np.random.rand(0, 3, 4).astype(t),
+                    [[0, 0], [2, 1], [2, 3]], 0.0)
 
   def testComplexTypes(self):
     for t in [np.complex64, np.complex128]:
       x = np.random.rand(2, 5).astype(t)
-      self._testAll(x + 1j * x, [[1, 0], [2, 0]])
+      self._testAll(x + 1j * x, [[1, 0], [2, 0]], 1234.0 - 1234.0j)
       x = np.random.rand(3, 2, 1, 1).astype(t)
-      self._testAll(x + 1j * x, [[0, 0], [0, 0], [0, 0], [0, 0]])
+      self._testAll(x + 1j * x, [[0, 0], [0, 0], [0, 0], [0, 0]], 0 + 0j)
 
   def testShapeFunctionEdgeCases(self):
     # Unknown paddings shape.
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index 1b6c8bef9864e754dce6e5224e5015cd44b8b3ab..f5fb7e4e03e640e10443872f6b571a16bb76e77f 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -521,7 +521,7 @@ class PoolingTest(test.TestCase):
               padding="SAME").eval()
 
   # The following are tests that verify that the CPU and GPU implementations
-  # produce the same resuts.
+  # produce the same results.
   def _CompareMaxPoolingFwd(self, input_shape, ksize, strides, padding):
     for dtype in np.float64, np.float32, np.float16:
       tensor_input = np.random.rand(*input_shape).astype(dtype)
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index e098cf3ff9ca88bfee7746b2916e8dd947f664f2..43c0fe7837d7aae080146eb7484d646d2afd1ba8 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -113,8 +113,8 @@ class PyOpTest(test.TestCase):
     # returns a tuple, Tout and inp a tuple
     with self.test_session():
       x = constant_op.constant(0.0, dtypes.float64)
-      y, z = script_ops.py_func(tuple_func, (x,),
-                                (dtypes.float64, dtypes.float64))
+      y, z = script_ops.py_func(tuple_func, (x,), (dtypes.float64,
+                                                   dtypes.float64))
       self.assertAllClose(y.eval(), 0.0)
       self.assertAllClose(z.eval(), 1.0)
 
@@ -194,6 +194,21 @@ class PyOpTest(test.TestCase):
                                    "Unsupported object type"):
         z.eval()
 
+  def testReturnInput(self):
+    with self.test_session():
+
+      def ident(x):
+        return x[0]
+
+      p = array_ops.placeholder(dtypes.float32)
+
+      # Create a numpy array aliasing a tensor and a tensor aliasing this array
+      z, = script_ops.py_func(ident, [p], [dtypes.float32])
+      z += 0.0  # Makes sure we release the tensor aliasing the numpy array x[0]
+                # above instead of using its memory as the return value of
+                # session.run
+      self.assertEqual(0.0, z.eval(feed_dict={p: [0.0]}))
+
   def testStateful(self):
     # Not using self.test_session(), which disables optimization.
     with session_lib.Session() as sess:
@@ -225,7 +240,8 @@ class PyOpTest(test.TestCase):
   def testCOrder(self):
     with self.test_session():
       val = [[1, 2], [3, 4]]
-      x, = script_ops.py_func(lambda: np.array(val, order="F"), [], [dtypes.int64])
+      x, = script_ops.py_func(lambda: np.array(val, order="F"), [],
+                              [dtypes.int64])
       self.assertAllEqual(val, x.eval())
 
   def testParallel(self):
diff --git a/tensorflow/python/kernel_tests/random_poisson_test.py b/tensorflow/python/kernel_tests/random_poisson_test.py
index 01281b7bd0350dc23f1a7963eadc8267f7b3228c..107c9bbe14f6b41d1548fd43a75aa3279101d5ab 100644
--- a/tensorflow/python/kernel_tests/random_poisson_test.py
+++ b/tensorflow/python/kernel_tests/random_poisson_test.py
@@ -131,8 +131,14 @@ class RandomPoissonTest(test.TestCase):
         # be at least 1 if they are different.
         self.assertGreaterEqual(np.linalg.norm(diff.eval()), 1)
 
+  def testZeroShape(self):
+    with self.test_session():
+      rnd = random_ops.random_poisson([], [], seed=12345)
+      self.assertEqual([0], rnd.get_shape().as_list())
+      self.assertAllClose(np.array([], dtype=np.float32), rnd.eval())
+
   def testShape(self):
-    # Fully known shape.
+    # Fully known shape
     rnd = random_ops.random_poisson(2.0, [150], seed=12345)
     self.assertEqual([150], rnd.get_shape().as_list())
     rnd = random_ops.random_poisson(
diff --git a/tensorflow/python/kernel_tests/reader_ops_test.py b/tensorflow/python/kernel_tests/reader_ops_test.py
index 12932219fc312eece9d11e5fdb7f5e0d0c59b9b6..5630259b7b7c7d4607854ee4fb8a04c404e70a17 100644
--- a/tensorflow/python/kernel_tests/reader_ops_test.py
+++ b/tensorflow/python/kernel_tests/reader_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import collections
 import gzip
 import os
+import shutil
 import threading
 import zlib
 
@@ -36,6 +37,8 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
+prefix_path = "tensorflow/core/lib"
+
 # pylint: disable=invalid-name
 TFRecordCompressionType = tf_record.TFRecordCompressionType
 # pylint: enable=invalid-name
@@ -347,13 +350,11 @@ class FixedLengthRecordReaderTest(test.TestCase):
   def setUp(self):
     super(FixedLengthRecordReaderTest, self).setUp()
     self._num_files = 2
-    self._num_records = 7
     self._header_bytes = 5
     self._record_bytes = 3
     self._footer_bytes = 2
 
     self._hop_bytes = 2
-    self._num_overlapped_records = 3
 
   def _Record(self, f, r):
     return compat.as_bytes(str(f * 2 + r) * self._record_bytes)
@@ -366,19 +367,24 @@ class FixedLengthRecordReaderTest(test.TestCase):
     ])
     return compat.as_bytes(record_str)
 
-  def _CreateFiles(self):
+  # gap_bytes=hop_bytes-record_bytes
+  def _CreateFiles(self, num_records, gap_bytes):
     filenames = []
     for i in range(self._num_files):
       fn = os.path.join(self.get_temp_dir(), "fixed_length_record.%d.txt" % i)
       filenames.append(fn)
       with open(fn, "wb") as f:
         f.write(b"H" * self._header_bytes)
-        for j in range(self._num_records):
+        if num_records > 0:
+          f.write(self._Record(i, 0))
+        for j in range(1, num_records):
+          if gap_bytes > 0:
+            f.write(b"G" * gap_bytes)
           f.write(self._Record(i, j))
         f.write(b"F" * self._footer_bytes)
     return filenames
 
-  def _CreateOverlappedRecordFiles(self):
+  def _CreateOverlappedRecordFiles(self, num_overlapped_records):
     filenames = []
     for i in range(self._num_files):
       fn = os.path.join(self.get_temp_dir(),
@@ -386,23 +392,104 @@ class FixedLengthRecordReaderTest(test.TestCase):
       filenames.append(fn)
       with open(fn, "wb") as f:
         f.write(b"H" * self._header_bytes)
-        all_records_str = "".join([
-            str(i)[0]
-            for i in range(self._record_bytes + self._hop_bytes *
-                           (self._num_overlapped_records - 1))
-        ])
-        f.write(compat.as_bytes(all_records_str))
+        if num_overlapped_records > 0:
+          all_records_str = "".join([
+              str(i)[0]
+              for i in range(self._record_bytes + self._hop_bytes *
+                             (num_overlapped_records - 1))
+          ])
+          f.write(compat.as_bytes(all_records_str))
         f.write(b"F" * self._footer_bytes)
     return filenames
 
-  def testOneEpoch(self):
-    files = self._CreateFiles()
+  # gap_bytes=hop_bytes-record_bytes
+  def _CreateGzipFiles(self, num_records, gap_bytes):
+    filenames = []
+    for i in range(self._num_files):
+      fn = os.path.join(self.get_temp_dir(), "fixed_length_record.%d.txt" % i)
+      filenames.append(fn)
+      with gzip.GzipFile(fn, "wb") as f:
+        f.write(b"H" * self._header_bytes)
+        if num_records > 0:
+          f.write(self._Record(i, 0))
+        for j in range(1, num_records):
+          if gap_bytes > 0:
+            f.write(b"G" * gap_bytes)
+          f.write(self._Record(i, j))
+        f.write(b"F" * self._footer_bytes)
+    return filenames
+
+  # gap_bytes=hop_bytes-record_bytes
+  def _CreateZlibFiles(self, num_records, gap_bytes):
+    filenames = []
+    for i in range(self._num_files):
+      fn = os.path.join(self.get_temp_dir(), "fixed_length_record.%d.txt" % i)
+      filenames.append(fn)
+      with open(fn+".tmp", "wb") as f:
+        f.write(b"H" * self._header_bytes)
+        if num_records > 0:
+          f.write(self._Record(i, 0))
+        for j in range(1, num_records):
+          if gap_bytes > 0:
+            f.write(b"G" * gap_bytes)
+          f.write(self._Record(i, j))
+        f.write(b"F" * self._footer_bytes)
+      with open(fn+".tmp", "rb") as f:
+        cdata = zlib.compress(f.read())
+        with open(fn, "wb") as zf:
+          zf.write(cdata)
+    return filenames
+
+  def _CreateGzipOverlappedRecordFiles(self, num_overlapped_records):
+    filenames = []
+    for i in range(self._num_files):
+      fn = os.path.join(self.get_temp_dir(),
+                        "fixed_length_overlapped_record.%d.txt" % i)
+      filenames.append(fn)
+      with gzip.GzipFile(fn, "wb") as f:
+        f.write(b"H" * self._header_bytes)
+        if num_overlapped_records > 0:
+          all_records_str = "".join([
+              str(i)[0]
+              for i in range(self._record_bytes + self._hop_bytes *
+                           (num_overlapped_records - 1))
+          ])
+          f.write(compat.as_bytes(all_records_str))
+        f.write(b"F" * self._footer_bytes)
+    return filenames
+
+  def _CreateZlibOverlappedRecordFiles(self, num_overlapped_records):
+    filenames = []
+    for i in range(self._num_files):
+      fn = os.path.join(self.get_temp_dir(),
+                        "fixed_length_overlapped_record.%d.txt" % i)
+      filenames.append(fn)
+      with open(fn+".tmp", "wb") as f:
+        f.write(b"H" * self._header_bytes)
+        if num_overlapped_records > 0:
+          all_records_str = "".join([
+              str(i)[0]
+              for i in range(self._record_bytes + self._hop_bytes *
+                             (num_overlapped_records - 1))
+          ])
+          f.write(compat.as_bytes(all_records_str))
+        f.write(b"F" * self._footer_bytes)
+      with open(fn+".tmp", "rb") as f:
+        cdata = zlib.compress(f.read())
+        with open(fn, "wb") as zf:
+          zf.write(cdata)
+    return filenames
+
+  # gap_bytes=hop_bytes-record_bytes
+  def _TestOneEpoch(self, files, num_records, gap_bytes, encoding=None):
+    hop_bytes = 0 if gap_bytes == 0 else self._record_bytes + gap_bytes
     with self.test_session() as sess:
       reader = io_ops.FixedLengthRecordReader(
           header_bytes=self._header_bytes,
           record_bytes=self._record_bytes,
           footer_bytes=self._footer_bytes,
-          hop_bytes=0,
+          hop_bytes=hop_bytes,
+          encoding=encoding,
           name="test_reader")
       queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
       key, value = reader.read(queue)
@@ -410,7 +497,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
       queue.enqueue_many([files]).run()
       queue.close().run()
       for i in range(self._num_files):
-        for j in range(self._num_records):
+        for j in range(num_records):
           k, v = sess.run([key, value])
           self.assertAllEqual("%s:%d" % (files[i], j), compat.as_text(k))
           self.assertAllEqual(self._Record(i, j), v)
@@ -419,14 +506,14 @@ class FixedLengthRecordReaderTest(test.TestCase):
                                     "\\(requested 1, current size 0\\)"):
         k, v = sess.run([key, value])
 
-  def testOneEpochWithHopBytes(self):
-    files = self._CreateOverlappedRecordFiles()
+  def _TestOneEpochWithHopBytes(self, files, num_overlapped_records, encoding=None):
     with self.test_session() as sess:
       reader = io_ops.FixedLengthRecordReader(
           header_bytes=self._header_bytes,
           record_bytes=self._record_bytes,
           footer_bytes=self._footer_bytes,
           hop_bytes=self._hop_bytes,
+          encoding=encoding,
           name="test_reader")
       queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
       key, value = reader.read(queue)
@@ -434,7 +521,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
       queue.enqueue_many([files]).run()
       queue.close().run()
       for i in range(self._num_files):
-        for j in range(self._num_overlapped_records):
+        for j in range(num_overlapped_records):
           k, v = sess.run([key, value])
           print(v)
           self.assertAllEqual("%s:%d" % (files[i], j), compat.as_text(k))
@@ -444,6 +531,45 @@ class FixedLengthRecordReaderTest(test.TestCase):
                                     "\\(requested 1, current size 0\\)"):
         k, v = sess.run([key, value])
 
+  def testOneEpoch(self):
+    for num_records in [0, 7]:
+      # gap_bytes=0: hop_bytes=0
+      # gap_bytes=1: hop_bytes=record_bytes+1
+      for gap_bytes in [0, 1]:
+        files = self._CreateFiles(num_records, gap_bytes)
+        self._TestOneEpoch(files, num_records, gap_bytes)
+
+  def testGzipOneEpoch(self):
+    for num_records in [0, 7]:
+      # gap_bytes=0: hop_bytes=0
+      # gap_bytes=1: hop_bytes=record_bytes+1
+      for gap_bytes in [0, 1]:
+        files = self._CreateGzipFiles(num_records, gap_bytes)
+        self._TestOneEpoch(files, num_records, gap_bytes, encoding="GZIP")
+
+  def testZlibOneEpoch(self):
+    for num_records in [0, 7]:
+      # gap_bytes=0: hop_bytes=0
+      # gap_bytes=1: hop_bytes=record_bytes+1
+      for gap_bytes in [0, 1]:
+        files = self._CreateZlibFiles(num_records, gap_bytes)
+        self._TestOneEpoch(files, num_records, gap_bytes, encoding="ZLIB")
+
+  def testOneEpochWithHopBytes(self):
+    for num_overlapped_records in [0, 2]:
+      files = self._CreateOverlappedRecordFiles(num_overlapped_records)
+      self._TestOneEpochWithHopBytes(files, num_overlapped_records)
+
+  def testGzipOneEpochWithHopBytes(self):
+    for num_overlapped_records in [0, 2]:
+      files = self._CreateGzipOverlappedRecordFiles(num_overlapped_records, )
+      self._TestOneEpochWithHopBytes(files, num_overlapped_records, encoding="GZIP")
+
+  def testZlibOneEpochWithHopBytes(self):
+    for num_overlapped_records in [0, 2]:
+      files = self._CreateZlibOverlappedRecordFiles(num_overlapped_records)
+      self._TestOneEpochWithHopBytes(files, num_overlapped_records, encoding="ZLIB")
+
 
 class TFRecordReaderTest(test.TestCase):
 
@@ -858,48 +984,50 @@ class AsyncReaderTest(test.TestCase):
     output.append(sess.run(args))
 
 
-# TODO(jhseu): Restore after fixing.
-#class LMDBReaderTest(test.TestCase):
-#
-#  def setUp(self):
-#    super(LMDBReaderTest, self).setUp()
-#
-#  def testReadFromFile(self):
-#    with self.test_session() as sess:
-#      reader = io_ops.LMDBReader(name="test_read_from_file")
-#      path = os.path.join("tensorflow", "core", "lib", "lmdb", "testdata",
-#                          "data.mdb")
-#      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-#      key, value = reader.read(queue)
-#
-#      queue.enqueue([path]).run()
-#      queue.close().run()
-#      for i in range(10):
-#        k, v = sess.run([key, value])
-#        self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
-#        self.assertAllEqual(compat.as_bytes(v), compat.as_bytes(str(chr(ord('a') + i))))
-#
-#      with self.assertRaisesOpError("is closed and has insufficient elements "
-#                                    "\\(requested 1, current size 0\\)"):
-#        k, v = sess.run([key, value])
-#
-#  def testReadFromFolder(self):
-#    with self.test_session() as sess:
-#      reader = io_ops.LMDBReader(name="test_read_from_folder")
-#      path = os.path.join("tensorflow", "core", "lib", "lmdb", "testdata")
-#      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-#      key, value = reader.read(queue)
-#
-#      queue.enqueue([path]).run()
-#      queue.close().run()
-#      for i in range(10):
-#        k, v = sess.run([key, value])
-#        self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
-#        self.assertAllEqual(compat.as_bytes(v), compat.as_bytes(str(chr(ord('a') + i))))
-#
-#      with self.assertRaisesOpError("is closed and has insufficient elements "
-#                                    "\\(requested 1, current size 0\\)"):
-#        k, v = sess.run([key, value])
+class LMDBReaderTest(test.TestCase):
+
+  def setUp(self):
+    super(LMDBReaderTest, self).setUp()
+    # Copy database out because we need the path to be writable to use locks.
+    path = os.path.join(prefix_path, "lmdb", "testdata", "data.mdb")
+    self.db_path = os.path.join(self.get_temp_dir(), "data.mdb")
+    shutil.copy(path, self.db_path)
+
+  def testReadFromFile(self):
+    with self.test_session() as sess:
+      reader = io_ops.LMDBReader(name="test_read_from_file")
+      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+      key, value = reader.read(queue)
+
+      queue.enqueue([self.db_path]).run()
+      queue.close().run()
+      for i in range(10):
+        k, v = sess.run([key, value])
+        self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
+        self.assertAllEqual(
+            compat.as_bytes(v), compat.as_bytes(str(chr(ord("a") + i))))
+
+      with self.assertRaisesOpError("is closed and has insufficient elements "
+                                    "\\(requested 1, current size 0\\)"):
+        k, v = sess.run([key, value])
+
+  def testReadFromFolder(self):
+    with self.test_session() as sess:
+      reader = io_ops.LMDBReader(name="test_read_from_folder")
+      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+      key, value = reader.read(queue)
+
+      queue.enqueue([self.db_path]).run()
+      queue.close().run()
+      for i in range(10):
+        k, v = sess.run([key, value])
+        self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
+        self.assertAllEqual(
+            compat.as_bytes(v), compat.as_bytes(str(chr(ord("a") + i))))
+
+      with self.assertRaisesOpError("is closed and has insufficient elements "
+                                    "\\(requested 1, current size 0\\)"):
+        k, v = sess.run([key, value])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/record_input_test.py b/tensorflow/python/kernel_tests/record_input_test.py
index 8fec2affa5960af8fd1e707de85ffd8c51604b39..1ec48ac361b81e66fd77e8a4506bebf910ea0e8a 100644
--- a/tensorflow/python/kernel_tests/record_input_test.py
+++ b/tensorflow/python/kernel_tests/record_input_test.py
@@ -20,8 +20,10 @@ from __future__ import print_function
 
 import os
 
+from tensorflow.python.framework.errors_impl import NotFoundError
 from tensorflow.python.lib.io import tf_record
 from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -96,6 +98,42 @@ class RecordInputOpTest(test.TestCase):
         for _ in range(50):
           sess.run(yield_op)
 
+  def testEmptyGlob(self):
+    with self.test_session() as sess:
+      record_input = data_flow_ops.RecordInput(file_pattern="foo")
+      yield_op = record_input.get_yield_op()
+      sess.run(variables.global_variables_initializer())
+      with self.assertRaises(NotFoundError):
+        sess.run(yield_op)
+
+  def testBufferTooSmall(self):
+    files = 10
+    records_per_file = 10
+    batches = 2
+    with self.test_session() as sess:
+      self.generateTestData("basic", files, records_per_file)
+
+      records = data_flow_ops.RecordInput(
+          file_pattern=os.path.join(self.get_temp_dir(), "basic.*"),
+          parallelism=2,
+          buffer_size=2000,
+          batch_size=1,
+          shift_ratio=0.33,
+          seed=10,
+          name="record_input",
+          batches=batches)
+
+      yield_op = records.get_yield_op()
+
+      # cycle over 3 epochs and make sure we never duplicate
+      for _ in range(3):
+        epoch_set = set()
+        for _ in range(int(files * records_per_file / batches)):
+          op_list = sess.run(yield_op)
+          self.assertTrue(len(op_list) is batches)
+          for r in op_list:
+            self.assertTrue(r[0] not in epoch_set)
+            epoch_set.add(r[0])
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index 229f27e9cae9192c47e238c50cfe1b29ee63c9b9..8cd1f52d80039deccfe4623b8bae9bb1482b8392 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
@@ -32,6 +33,12 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
 
 
+def _elu_grad_grad(activation):
+  if activation < 0:
+    return np.exp(activation)
+  return 0
+
+
 class ReluTest(test.TestCase):
 
   def _npRelu(self, np_features):
@@ -266,6 +273,17 @@ class EluTest(test.TestCase):
     print("elu (float64) gradient err = ", err)
     self.assertLess(err, 1e-6)
 
+  def testGradGrad(self):
+    with self.test_session():
+      x = array_ops.placeholder(dtype=dtypes.float32)
+      elu = nn_ops.elu(x)
+      g, = gradients_impl.gradients(elu, x)
+      gg, = gradients_impl.gradients(g, x)
+
+      for x_val in [-1, -0.5, 0.5, 1]:
+        err = np.abs(gg.eval(feed_dict={x: x_val}) - _elu_grad_grad(x_val))
+        self.assertLess(err, 1e-4)
+
   def testGradGradFloat32(self):
     with self.test_session():
       x = constant_op.constant(
@@ -302,6 +320,97 @@ class EluTest(test.TestCase):
     self.assertLess(err, 1e-6)
 
 
+class SeluTest(test.TestCase):
+
+  def _npSelu(self, np_features):
+    scale = 1.0507009873554804934193349852946
+    scale_alpha = 1.7580993408473768599402175208123
+    return np.where(np_features < 0, scale_alpha * (np.exp(np_features) - 1),
+                    scale * np_features)
+
+  def testNpSelu(self):
+    self.assertAllClose(
+        np.array([[-1.0433095, 0.73549069, -0.6917582, 0.3152103 , -0.16730527],
+                 [0.1050701 , -0.45566732, 0.5253505, -0.88505305, 0.9456309]]),
+        self._npSelu(
+            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7, 0.9]
+                     ])))
+
+  def _testSelu(self, np_features, use_gpu=False):
+    np_selu = self._npSelu(np_features)
+    with self.test_session(use_gpu=use_gpu):
+      selu = nn_ops.selu(np_features)
+      tf_selu = selu.eval()
+    self.assertAllClose(np_selu, tf_selu)
+    self.assertShapeEqual(np_selu, selu)
+
+  def testNumbers(self):
+    for t in [np.float16, np.float32, np.float64]:
+      self._testSelu(
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
+          use_gpu=False)
+      self._testSelu(
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
+          use_gpu=True)
+
+  def testGradientFloat32(self):
+    with self.test_session():
+      x_val = [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]]
+      x = constant_op.constant(x_val, name="x")
+      y = nn_ops.selu(x, name="selu")
+      x_init = np.asarray(x_val, dtype=np.float32, order="F")
+      err = gradient_checker.compute_gradient_error(
+          x, [2, 5], y, [2, 5], x_init_value=x_init)
+    print("selu (float32) gradient err = ", err)
+    self.assertLess(err, 1e-4)
+
+  def testGradientFloat64(self):
+    with self.test_session():
+      x_val = [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]]
+      x = constant_op.constant(x_val, dtype=dtypes.float64, name="x")
+      y = nn_ops.selu(x, name="selu")
+      x_init = np.asarray(x_val, dtype=np.float64, order="F")
+      err = gradient_checker.compute_gradient_error(
+          x, [2, 5], y, [2, 5], x_init_value=x_init)
+    print("selu (float64) gradient err = ", err)
+    self.assertLess(err, 1e-6)
+
+  def testGradGradFloat32(self):
+    with self.test_session():
+      x = constant_op.constant(
+          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+          shape=[2, 5],
+          name="x")
+      y = nn_ops.selu(x, name="selu")
+      z = gradients_impl.gradients(y, x)
+      x_init = np.asarray(
+          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+          dtype=np.float32,
+          order="F")
+      err = gradient_checker.compute_gradient_error(
+          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+    print("selu (float32) gradient of gradient err = ", err)
+    self.assertLess(err, 1e-4)
+
+  def testGradGradFloat64(self):
+    with self.test_session():
+      x = constant_op.constant(
+          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+          shape=[2, 5],
+          dtype=dtypes.float64,
+          name="x")
+      y = nn_ops.selu(x, name="selu")
+      z = gradients_impl.gradients(y, x)
+      x_init = np.asarray(
+          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+          dtype=np.float64,
+          order="F")
+      err = gradient_checker.compute_gradient_error(
+          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+    print("selu (float64) gradient of gradient err = ", err)
+    self.assertLess(err, 1e-6)
+
+
 class CreluTest(test.TestCase):
 
   def testCreluShape(self):
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 3d10f4d0e93109d4acb9ab45e66bb1b7dfc9ce82..4dec0e8380b75101ffe45aec9f306202e05b6c4c 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -93,7 +93,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       self.assertEqual(read.eval(), 2)
 
   def testScatterAdd(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       handle = resource_variable_ops.var_handle_op(
           dtype=dtypes.int32, shape=[1, 1])
       resource_variable_ops.assign_variable_op(
@@ -163,6 +163,16 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       v.load(2.0)
       self.assertEqual(2.0, v.value().eval())
 
+  def testSparseRead(self):
+    with self.test_session():
+      init_value = np.reshape(np.arange(np.power(4, 3)), (4, 4, 4))
+      v = resource_variable_ops.ResourceVariable(
+          constant_op.constant(init_value, dtype=dtypes.int32))
+      variables.global_variables_initializer().run()
+
+      value = v.sparse_read([0, 3, 1, 2]).eval()
+      self.assertAllEqual(init_value[[0, 3, 1, 2], ...], value)
+
   def testToFromProto(self):
     with self.test_session():
       v = resource_variable_ops.ResourceVariable(1.0)
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 8519d19fe19afba3923e527305e6302f5a6fff35..ebc568621277c7d395b52f0c7b78c1389c88b103 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -87,7 +87,7 @@ def _NumpyDiv(ref, indices, updates):
   return _NumpyScatterNd(ref, indices, updates, lambda p, u: p / u)
 
 
-class ScatterNdTest(test.TestCase):
+class StatefulScatterNdTest(test.TestCase):
 
   def _VariableRankTest(self,
                         np_scatter,
@@ -261,10 +261,6 @@ class ScatterNdTest(test.TestCase):
     indices = array_ops.zeros([2, 2, 2], dtypes.int32)
     updates = array_ops.zeros([2, 2, 2], dtypes.int32)
     shape = np.array([2, 2, 2])
-    self.assertAllEqual(
-        array_ops.scatter_nd(indices, updates, shape).get_shape().as_list(),
-        shape)
-
     ref = variables.Variable(array_ops.zeros(shape, dtypes.int32))
     self.assertAllEqual(
         state_ops.scatter_nd_update(ref, indices,
@@ -274,37 +270,120 @@ class ScatterNdTest(test.TestCase):
     indices = array_ops.zeros([1, 1, 2], dtypes.int32)
     updates = array_ops.zeros([1, 1], dtypes.int32)
     shape = np.array([2, 2])
-    scatter = array_ops.scatter_nd(indices, updates, shape)
-    self.assertAllEqual(scatter.get_shape().as_list(), shape)
-    expected_result = np.zeros([2, 2], dtype=np.int32)
-    with self.test_session():
-      self.assertAllEqual(expected_result, scatter.eval())
-
     ref = variables.Variable(array_ops.zeros(shape, dtypes.int32))
     scatter_update = state_ops.scatter_nd_update(ref, indices, updates)
     self.assertAllEqual(scatter_update.get_shape().as_list(), shape)
 
+    expected_result = np.zeros([2, 2], dtype=np.int32)
     with self.test_session():
       ref.initializer.run()
       self.assertAllEqual(expected_result, scatter_update.eval())
 
+  def testRank3InvalidShape1(self):
+    indices = array_ops.zeros([3, 2, 2], dtypes.int32)
+    updates = array_ops.zeros([2, 2, 2], dtypes.int32)
+    shape = np.array([2, 2, 2])
+    ref = variables.Variable(array_ops.zeros(shape, dtypes.int32))
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "The outer \\d+ dimensions of indices\\.shape="):
+      state_ops.scatter_nd_update(ref, indices, updates)
+
+  def testRank3InvalidShape2(self):
+    indices = array_ops.zeros([2, 2, 1], dtypes.int32)
+    updates = array_ops.zeros([2, 2], dtypes.int32)
+    shape = np.array([2, 2, 2])
+    ref = variables.Variable(array_ops.zeros(shape, dtypes.int32))
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "The inner \\d+ dimensions of input\\.shape="):
+      state_ops.scatter_nd_update(ref, indices, updates)
+
+  def testConcurrentUpdates(self):
+    num_updates = 10000
+    update_values = np.random.rand(num_updates)
+    ref = variables.Variable(np.zeros([2, 2]), dtype=dtypes.float64)
+    indices = constant_op.constant([[0, 1]] * num_updates, dtype=dtypes.int32)
+    updates = constant_op.constant(update_values, dtype=dtypes.float64)
+
+    expected_result = np.zeros([2, 2], dtype=np.float64)
+    expected_result[0, 1] = np.sum(update_values)
+
+    scatter = state_ops.scatter_nd_add(ref, indices, updates)
+    init = variables.global_variables_initializer()
+
+    with session.Session() as sess:
+      sess.run(init)
+      result = sess.run(scatter)
+      assert np.allclose(result, expected_result)
+
+  # TODO(fpmc): Re-enable this test when gpu_pip test actually runs on a GPU.
+  def _disabledTestScatterOutOfRangeGpu(self):
+    if not test.IsBuiltWithCuda():
+      return
+    # TODO(simister): Re-enable once binary size increase due to
+    # scatter_nd ops is under control.
+    # tf.scatter_nd_mul, tf.scatter_nd_div,
+    for op in (state_ops.scatter_nd_add, state_ops.scatter_nd_sub,
+               state_ops.scatter_nd_update):
+      params = np.array([1, 2, 3, 4, 5, 6]).astype(np.float32)
+      updates = np.array([-3, -4, -5]).astype(np.float32)
+      # With GPU, the code ignores indices that are out of range.
+      # We don't test the implementation; just test there's no failures.
+      with self.test_session(force_gpu=True):
+        ref = variables.Variable(params)
+        ref.initializer.run()
+
+        # Indices all in range, no problem.
+        indices = np.array([2, 0, 5])
+        op(ref, indices, updates).eval()
+
+        # Indicies out of range should not fail.
+        indices = np.array([-1, 0, 5])
+        op(ref, indices, updates).eval()
+        indices = np.array([2, 0, 6])
+        op(ref, indices, updates).eval()
+
+
+class ScatterNdTest(test.TestCase):
+  non_aliasing_add_test = False
+
+  def scatter_nd(self, indices, updates, shape, input_=None):
+    del input_  # input_ is not used in scatter_nd
+    return array_ops.scatter_nd(indices, updates, shape)
+
+  def testRank3ValidShape(self):
+    indices = array_ops.zeros([2, 2, 2], dtypes.int32)
+    updates = array_ops.zeros([2, 2, 2], dtypes.int32)
+    shape = np.array([2, 2, 2])
+    self.assertAllEqual(
+        self.scatter_nd(indices, updates, shape).get_shape().as_list(), shape)
+
+  def testExtraIndicesDimensions(self):
+    indices = array_ops.zeros([1, 1, 2], dtypes.int32)
+    updates = array_ops.zeros([1, 1], dtypes.int32)
+    shape = np.array([2, 2])
+    scatter = self.scatter_nd(indices, updates, shape)
+    self.assertAllEqual(scatter.get_shape().as_list(), shape)
+    expected_result = np.zeros([2, 2], dtype=np.int32)
+    with self.test_session():
+      self.assertAllEqual(expected_result, scatter.eval())
+
   def testUndefinedIndicesShape(self):
     indices = array_ops.placeholder(dtypes.int32, shape=None)
     updates = array_ops.placeholder(dtypes.int32, shape=[2, 2, 2])
     shape = constant_op.constant([2, 2, 2], dtypes.int32)
-    array_ops.scatter_nd(indices, updates, shape)
+    self.scatter_nd(indices, updates, shape)
 
   def testUndefinedUpdatesShape(self):
     indices = array_ops.placeholder(dtypes.int32, shape=[2, 2, 2])
     updates = array_ops.placeholder(dtypes.int32, shape=None)
     shape = constant_op.constant([2, 2, 2], dtypes.int32)
-    array_ops.scatter_nd(indices, updates, shape)
+    self.scatter_nd(indices, updates, shape)
 
   def testUndefinedOutputShape(self):
     indices = array_ops.placeholder(dtypes.int32, shape=[2, 2, 2])
     updates = array_ops.placeholder(dtypes.int32, shape=[2, 2, 2])
     shape = array_ops.placeholder(dtypes.int32, shape=[None])
-    array_ops.scatter_nd(indices, updates, shape)
+    self.scatter_nd(indices, updates, shape)
 
   def testEmptyOutputShape1(self):
     indices = array_ops.zeros([2, 2, 2], dtypes.int32)
@@ -313,7 +392,7 @@ class ScatterNdTest(test.TestCase):
 
     with self.assertRaisesWithPredicateMatch(
         ValueError, "Indices and updates specified for empty output shape"):
-      array_ops.scatter_nd(indices, updates, shape)
+      self.scatter_nd(indices, updates, shape)
 
   def testEmptyOutputShape2(self):
     indices = array_ops.placeholder(dtypes.int32, shape=None)
@@ -321,18 +400,18 @@ class ScatterNdTest(test.TestCase):
     shape = constant_op.constant([0, 3, 2], dtypes.int32)
 
     with self.test_session():
-      array_ops.scatter_nd(indices, updates, shape).eval(feed_dict={
-          indices: np.zeros(
-              [2, 2, 2], dtype=np.int32),
-          updates: np.zeros(
-              [2, 2, 2], dtype=np.int32)
-      })
+      with self.assertRaisesOpError(
+          "Indices and updates specified for empty output"):
+        self.scatter_nd(indices, updates, shape).eval(feed_dict={
+            indices: np.zeros([2, 2, 2], dtype=np.int32),
+            updates: np.zeros([2, 2, 2], dtype=np.int32)
+        })
 
   def testEmptyOutputShape3(self):
     indices = array_ops.zeros([0], dtypes.int32)
     updates = array_ops.zeros([0], dtypes.int32)
     shape = constant_op.constant([0], dtypes.int32)
-    scatter = array_ops.scatter_nd(indices, updates, shape)
+    scatter = self.scatter_nd(indices, updates, shape)
 
     with self.test_session():
       self.assertEqual(scatter.eval().size, 0)
@@ -343,49 +422,49 @@ class ScatterNdTest(test.TestCase):
     shape = np.array([2, 2, 2])
     with self.assertRaisesWithPredicateMatch(
         ValueError, "The outer \\d+ dimensions of indices\\.shape="):
-      array_ops.scatter_nd(indices, updates, shape)
-
-    ref = variables.Variable(array_ops.zeros(shape, dtypes.int32))
-    with self.assertRaisesWithPredicateMatch(
-        ValueError, "The outer \\d+ dimensions of indices\\.shape="):
-      state_ops.scatter_nd_update(ref, indices, updates)
+      self.scatter_nd(indices, updates, shape)
 
   def testRank3InvalidShape2(self):
     indices = array_ops.zeros([2, 2, 1], dtypes.int32)
     updates = array_ops.zeros([2, 2], dtypes.int32)
     shape = np.array([2, 2, 2])
     with self.assertRaisesWithPredicateMatch(
-        ValueError, "The inner \\d+ dimensions of output\\.shape="):
-      array_ops.scatter_nd(indices, updates, shape)
-
-    ref = variables.Variable(array_ops.zeros(shape, dtypes.int32))
-    with self.assertRaisesWithPredicateMatch(
-        ValueError, "The inner \\d+ dimensions of ref\\.shape="):
-      state_ops.scatter_nd_update(ref, indices, updates)
+        ValueError, "The inner \\d+ dimensions of (input|output)\\.shape="):
+      self.scatter_nd(indices, updates, shape)
 
   def testGradientsRank2ElementUpdate(self):
     indices = constant_op.constant([[0, 0], [1, 1]], dtype=dtypes.int32)
     updates = constant_op.constant([1, 4], dtype=dtypes.float64)
     shape = constant_op.constant([2, 2], dtype=dtypes.int32)
-    outputs = array_ops.scatter_nd(indices, updates, shape)
+    input_ = array_ops.zeros(shape, dtype=dtypes.float64)
+    outputs = self.scatter_nd(indices, updates, shape, input_)
 
     grad_vals = constant_op.constant([[1, 2], [3, 4]], dtype=dtypes.float64)
-    grads = gradients_impl.gradients([outputs], [updates], [grad_vals])[0]
-    expected_grads = np.array([1, 4], dtype=np.float64)
+    updates_grad, input_grad = gradients_impl.gradients(
+        [outputs], [updates, input_], [grad_vals])
+    expected_updates_grad = np.array([1, 4], dtype=np.float64)
+    expected_input_grad = np.array([[1, 2], [3, 4]], dtype=np.float64)
     with self.test_session():
-      self.assertAllEqual(expected_grads, grads.eval())
+      self.assertAllEqual(expected_updates_grad, updates_grad.eval())
+      if self.non_aliasing_add_test:
+        self.assertAllEqual(expected_input_grad, input_grad.eval())
 
   def testGradientsRank2SliceUpdate(self):
     indices = constant_op.constant([[1], [0]], dtype=dtypes.int32)
     updates = constant_op.constant([[3, 4], [1, 2]], dtype=dtypes.float64)
     shape = constant_op.constant([2, 2], dtype=dtypes.int32)
-    outputs = array_ops.scatter_nd(indices, updates, shape)
+    input_ = array_ops.zeros(shape, dtype=dtypes.float64)
+    outputs = self.scatter_nd(indices, updates, shape, input_)
 
     grad_vals = constant_op.constant([[3, 4], [1, 2]], dtype=dtypes.float64)
-    grads = gradients_impl.gradients([outputs], [updates], [grad_vals])[0]
-    expected_grads = np.array([[1, 2], [3, 4]], dtype=np.float64)
+    updates_grad, input_grad = gradients_impl.gradients(
+        [outputs], [updates, input_], [grad_vals])
+    expected_updates_grad = np.array([[1, 2], [3, 4]], dtype=np.float64)
+    expected_input_grad = np.array([[3, 4], [1, 2]], dtype=np.float64)
     with self.test_session():
-      self.assertAllEqual(expected_grads, grads.eval())
+      self.assertAllEqual(expected_updates_grad, updates_grad.eval())
+      if self.non_aliasing_add_test:
+        self.assertAllEqual(expected_input_grad, input_grad.eval())
 
   def testGradientsRank3SliceUpdate(self):
     indices = constant_op.constant(
@@ -393,67 +472,28 @@ class ScatterNdTest(test.TestCase):
     updates = constant_op.constant(
         [[[5, 7], [2, 4]], [[1, 3], [6, 8]]], dtype=dtypes.float64)
     shape = constant_op.constant([2, 2, 2], dtype=dtypes.int32)
-    outputs = array_ops.scatter_nd(indices, updates, shape)
+    input_ = array_ops.zeros(shape, dtype=dtypes.float64)
+    outputs = self.scatter_nd(indices, updates, shape, input_)
 
     grad_vals = constant_op.constant(
         [[[1, 2], [3, 4]], [[5, 6], [7, 8]]], dtype=dtypes.float64)
-    grads = gradients_impl.gradients([outputs], [updates], [grad_vals])[0]
-    expected_grads = np.array(
+    updates_grad, input_grad = gradients_impl.gradients(
+        [outputs], [updates, input_], [grad_vals])
+    expected_updates_grad = np.array(
         [[[3, 4], [5, 6]], [[1, 2], [7, 8]]], dtype=np.float64)
+    expected_input_grad = np.array(
+        [[[1, 2], [3, 4]], [[5, 6], [7, 8]]], dtype=np.float64)
     with self.test_session():
-      self.assertAllEqual(expected_grads, grads.eval())
-
-  def testConcurrentUpdates(self):
-    num_updates = 10000
-    update_values = np.random.rand(num_updates)
-    ref = variables.Variable(np.zeros([2, 2]), dtype=dtypes.float64)
-    indices = constant_op.constant([[0, 1]] * num_updates, dtype=dtypes.int32)
-    updates = constant_op.constant(update_values, dtype=dtypes.float64)
-
-    expected_result = np.zeros([2, 2], dtype=np.float64)
-    expected_result[0, 1] = np.sum(update_values)
-
-    scatter = state_ops.scatter_nd_add(ref, indices, updates)
-    init = variables.global_variables_initializer()
-
-    with session.Session() as sess:
-      sess.run(init)
-      result = sess.run(scatter)
-      assert np.allclose(result, expected_result)
-
-  # TODO(fpmc): Re-enable this test when gpu_pip test actually runs on a GPU.
-  def _disabledTestScatterOutOfRangeGpu(self):
-    if not test.IsBuiltWithCuda():
-      return
-    # TODO(simister): Re-enable once binary size increase due to
-    # scatter_nd ops is under control.
-    # tf.scatter_nd_mul, tf.scatter_nd_div,
-    for op in (state_ops.scatter_nd_add, state_ops.scatter_nd_sub,
-               state_ops.scatter_nd_update):
-      params = np.array([1, 2, 3, 4, 5, 6]).astype(np.float32)
-      updates = np.array([-3, -4, -5]).astype(np.float32)
-      # With GPU, the code ignores indices that are out of range.
-      # We don't test the implementation; just test there's no failures.
-      with self.test_session(force_gpu=True):
-        ref = variables.Variable(params)
-        ref.initializer.run()
-
-        # Indices all in range, no problem.
-        indices = np.array([2, 0, 5])
-        op(ref, indices, updates).eval()
-
-        # Indicies out of range should not fail.
-        indices = np.array([-1, 0, 5])
-        op(ref, indices, updates).eval()
-        indices = np.array([2, 0, 6])
-        op(ref, indices, updates).eval()
+      self.assertAllEqual(expected_updates_grad, updates_grad.eval())
+      if self.non_aliasing_add_test:
+        self.assertAllEqual(expected_input_grad, input_grad.eval())
 
   def testScatterNdRepatedIndicesAdd(self):
     indices = array_ops.zeros([100000, 1], dtypes.int32)
     values = np.random.randn(100000)
     shape = [1]
     with self.test_session():
-      val = array_ops.scatter_nd(indices, values, shape).eval()
+      val = self.scatter_nd(indices, values, shape).eval()
     self.assertAllClose([np.sum(values)], val)
 
   def testSmokeScatterNdBatch2DSliceDim2(self):
@@ -461,28 +501,37 @@ class ScatterNdTest(test.TestCase):
       indices = array_ops.zeros([3, 5, 2], dtype=dtypes.int32)
       values = array_ops.zeros([3, 5, 7])
       shape = [4, 6, 7]
-      array_ops.scatter_nd(indices, values, shape).eval()
+      self.scatter_nd(indices, values, shape).eval()
 
   def testSmokeScatterNdBatch1DSliceDim2(self):
     with self.test_session():
       indices = array_ops.zeros([0, 2], dtype=dtypes.int32)
       values = array_ops.zeros([0, 7])
       shape = [4, 6, 7]
-      array_ops.scatter_nd(indices, values, shape).eval()
+      self.scatter_nd(indices, values, shape).eval()
 
   def testSmokeScatterNdBatch1DSliceDim3ShapeRank7(self):
     with self.test_session():
       indices = array_ops.zeros([1, 3], dtype=dtypes.int32)
       values = array_ops.zeros([1, 6, 7, 8, 9])
       shape = [3, 4, 5, 6, 7, 8, 9]
-      array_ops.scatter_nd(indices, values, shape).eval()
+      self.scatter_nd(indices, values, shape).eval()
 
   def testSmokeScatterNdBatch2DSliceDim3ShapeRank7(self):
     with self.test_session():
       indices = array_ops.zeros([1, 2, 3], dtype=dtypes.int32)
       values = array_ops.zeros([1, 2, 6, 7, 8, 9])
       shape = [3, 4, 5, 6, 7, 8, 9]
-      array_ops.scatter_nd(indices, values, shape).eval()
+      self.scatter_nd(indices, values, shape).eval()
+
+
+class ScatterNdNonAliasingAddTest(ScatterNdTest):
+  non_aliasing_add_test = True
+
+  def scatter_nd(self, indices, updates, shape, input_=None):
+    input_ = (input_ if input_ is not None else array_ops.zeros(
+        shape, dtype=updates.dtype))
+    return array_ops.scatter_nd_non_aliasing_add(input_, indices, updates)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/shape_ops_test.py
index 97d61d52af5ccbf51ceb3ab6934ebe14c1165063..52cf904528b27dee20679f044d92c84b49bef53b 100644
--- a/tensorflow/python/kernel_tests/shape_ops_test.py
+++ b/tensorflow/python/kernel_tests/shape_ops_test.py
@@ -504,16 +504,16 @@ class TileTest(test.TestCase):
       with self.assertRaises(ValueError):
         array_ops.tile(a, [[2, 3], [3, 4]]).eval()
 
-  def _RunAndVerifyResult(self, use_gpu):
+  def _RunAndVerifyResult(self, rank, use_gpu):
     with self.test_session(use_gpu=use_gpu):
-      # Random dims of rank 5
-      input_shape = np.random.randint(1, 4, size=5)
+      # Random dims of given rank
+      input_shape = np.random.randint(1, 4, size=rank)
       inp = np.random.rand(*input_shape).astype("f")
       a = constant_op.constant(
           [float(x) for x in inp.ravel(order="C")],
           shape=input_shape,
           dtype=dtypes.float32)
-      multiples = np.random.randint(1, 4, size=5).astype(np.int32)
+      multiples = np.random.randint(1, 4, size=rank).astype(np.int32)
       tiled = array_ops.tile(a, multiples)
       result = tiled.eval()
     self.assertTrue((np.array(multiples) * np.array(inp.shape) == np.array(
@@ -522,10 +522,16 @@ class TileTest(test.TestCase):
     self.assertShapeEqual(result, tiled)
 
   def testRandom(self):
+    # test low rank, like 5
     for _ in range(5):
-      self._RunAndVerifyResult(use_gpu=False)
+      self._RunAndVerifyResult(5, use_gpu=False)
     for _ in range(5):
-      self._RunAndVerifyResult(use_gpu=True)
+      self._RunAndVerifyResult(5, use_gpu=True)
+    # test high rank, like 10
+    for _ in range(5):
+      self._RunAndVerifyResult(10, use_gpu=False)
+    for _ in range(5):
+      self._RunAndVerifyResult(10, use_gpu=True)
 
   def testGradientSimpleReduction(self):
     with self.test_session():
diff --git a/tensorflow/python/kernel_tests/slice_op_test.py b/tensorflow/python/kernel_tests/slice_op_test.py
index c11f78b77e92245db521cec63070ab882ad6ac94..f6997e9c61e9faff112b539b3316cf51ec4b1ace 100644
--- a/tensorflow/python/kernel_tests/slice_op_test.py
+++ b/tensorflow/python/kernel_tests/slice_op_test.py
@@ -49,6 +49,28 @@ class SliceTest(test.TestCase):
         slice_val = slice_t.eval()
       self.assertAllEqual(slice_val, inp[2, k:k])
 
+  def testInt64Slicing(self):
+    with self.test_session(use_gpu=True):
+      a = constant_op.constant([0, 1, 2], dtype=dtypes.int64)
+
+      # Slice using int64 Tensor.
+      i = constant_op.constant(1, dtype=dtypes.int64)
+      slice_t = a[i]
+      slice_val = slice_t.eval()
+      self.assertAllEqual(1, slice_val)
+      slice_t = a[i:i+1]
+      slice_val = slice_t.eval()
+      self.assertAllEqual([1], slice_val)
+
+      # Slice using int64 integer.
+      i = np.asarray(1).astype(np.int64)
+      slice_t = a[i]
+      slice_val = slice_t.eval()
+      self.assertAllEqual(1, slice_val)
+      slice_t = a[i:i+1]
+      slice_val = slice_t.eval()
+      self.assertAllEqual([1], slice_val)
+
   def testSelectAll(self):
     for _ in range(10):
       with self.test_session(use_gpu=True):
diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py
index aff9753eb895ca82a1a75183cff8b88cee91fe98..c70f152af8e7d0ce1734a1b093af46becdc0ca94 100644
--- a/tensorflow/python/kernel_tests/sparse_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_ops_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-import unittest
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -545,21 +544,24 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(empty_row_indicator_out, np.zeros(2).astype(np.bool))
 
 
-class SparseReduceSumTest(test_util.TensorFlowTestCase):
+class SparseReduceTest(test_util.TensorFlowTestCase):
 
-  # [[1, ?, 1]
-  #  [?, 1, ?]]
-  # where ? is implictly-zero.
+  # [[1, ?, 2]
+  #  [?, 3, ?]]
+  # where ? is implicitly-zero.
   ind = np.array([[0, 0], [0, 2], [1, 1]]).astype(np.int64)
   vals = np.array([1, 1, 1]).astype(np.int32)
   dense_shape = np.array([2, 3]).astype(np.int64)
 
-  def _compare(self, sp_t, reduction_axes, ndims, keep_dims):
+  def _compare(self, sp_t, reduction_axes, ndims, keep_dims, do_sum):
     densified = sparse_ops.sparse_tensor_to_dense(sp_t).eval()
 
     np_ans = densified
     if reduction_axes is None:
-      np_ans = np.sum(np_ans, keepdims=keep_dims)
+      if do_sum:
+        np_ans = np.sum(np_ans, keepdims=keep_dims)
+      else:
+        np_ans = np.max(np_ans, keepdims=keep_dims)
     else:
       if not isinstance(reduction_axes, list):  # Single scalar.
         reduction_axes = [reduction_axes]
@@ -569,15 +571,28 @@ class SparseReduceSumTest(test_util.TensorFlowTestCase):
       # Loop below depends on sorted.
       reduction_axes.sort()
       for ra in reduction_axes.ravel()[::-1]:
-        np_ans = np.sum(np_ans, axis=ra, keepdims=keep_dims)
+        if do_sum:
+          np_ans = np.sum(np_ans, axis=ra, keepdims=keep_dims)
+        else:
+          np_ans = np.max(np_ans, axis=ra, keepdims=keep_dims)
 
     with self.test_session():
-      tf_dense_ans = sparse_ops.sparse_reduce_sum(sp_t, reduction_axes,
-                                                  keep_dims)
+      if do_sum:
+        tf_dense_ans = sparse_ops.sparse_reduce_sum(sp_t, reduction_axes,
+                                                    keep_dims)
+      else:
+        tf_dense_ans = sparse_ops.sparse_reduce_max(sp_t, reduction_axes,
+                                                    keep_dims)
       out_dense = tf_dense_ans.eval()
 
-      tf_sparse_ans = sparse_ops.sparse_reduce_sum_sparse(sp_t, reduction_axes,
-                                                          keep_dims)
+      if do_sum:
+        tf_sparse_ans = sparse_ops.sparse_reduce_sum_sparse(sp_t,
+                                                            reduction_axes,
+                                                            keep_dims)
+      else:
+        tf_sparse_ans = sparse_ops.sparse_reduce_max_sparse(sp_t,
+                                                            reduction_axes,
+                                                            keep_dims)
       # Convert to dense for comparison purposes.
       out_sparse = sparse_ops.sparse_tensor_to_dense(tf_sparse_ans).eval()
 
@@ -585,10 +600,11 @@ class SparseReduceSumTest(test_util.TensorFlowTestCase):
     self.assertAllClose(np_ans, out_sparse)
 
   def _compare_all(self, sp_t, reduction_axes, ndims):
-    self._compare(sp_t, reduction_axes, ndims, False)
-    self._compare(sp_t, reduction_axes, ndims, True)
+    self._compare(sp_t, reduction_axes, ndims, False, False)
+    self._compare(sp_t, reduction_axes, ndims, False, True)
+    self._compare(sp_t, reduction_axes, ndims, True, False)
+    self._compare(sp_t, reduction_axes, ndims, True, True)
 
-  @unittest.skipIf(np.__version__ == "1.13.0", "numpy 1.13 bug")
   def testSimpleAndRandomInputs(self):
     if np.__version__ == "1.13.0":
       self.skipTest("numpy 1.13.0 bug")
@@ -623,8 +639,11 @@ class SparseReduceSumTest(test_util.TensorFlowTestCase):
         sparse_ops.sparse_reduce_sum(sp_t, -3).eval()
       with self.assertRaisesOpError("Invalid reduction dimension 2"):
         sparse_ops.sparse_reduce_sum(sp_t, 2).eval()
+      with self.assertRaisesOpError("Invalid reduction dimension -3"):
+        sparse_ops.sparse_reduce_max(sp_t, -3).eval()
+      with self.assertRaisesOpError("Invalid reduction dimension 2"):
+        sparse_ops.sparse_reduce_max(sp_t, 2).eval()
 
-  @unittest.skipIf(np.__version__ == "1.13.0", "numpy 1.13 bug")
   def testGradient(self):
     if np.__version__ == "1.13.0":
       self.skipTest("numpy 1.13.0 bug")
@@ -887,5 +906,20 @@ class SparseTransposeTest(test.TestCase):
           self.assertAllEqual(dn_trans, expected_trans)
 
 
+class SparsePlaceholderTest(test.TestCase):
+
+  def testPlaceholder(self):
+    foo = array_ops.sparse_placeholder(dtypes.float32, shape=(10, 47))
+    self.assertAllEqual([10, 47], foo.get_shape())
+
+  def testPartialShapePlaceholder(self):
+    foo = array_ops.sparse_placeholder(dtypes.float32, shape=(None, 47))
+    self.assertAllEqual([None, None], foo.get_shape().as_list())
+
+  def testNoShapePlaceholder(self):
+    foo = array_ops.sparse_placeholder(dtypes.float32, shape=None)
+    self.assertAllEqual(None, foo.get_shape())
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/kernel_tests/sparse_slice_op_test.py b/tensorflow/python/kernel_tests/sparse_slice_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..762e400447c7e6e89ca4c0b480662aa91e287c26
--- /dev/null
+++ b/tensorflow/python/kernel_tests/sparse_slice_op_test.py
@@ -0,0 +1,251 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for SparseReorder."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.platform import test
+
+
+class SparseSliceOpTest(test.TestCase):
+
+  def _SparseTensor_4x6(self):
+    # [0 |  |2 |  |4 |5 ]
+    # [  |11|  |13|14|  ]
+    # [20|  |  |23|  |25]
+    # [30|  |32|33|  |35]
+    ind = np.array([[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1, 4],
+                    [2, 0], [2, 3], [2, 5], [3, 0], [3, 2], [3, 3],
+                    [3, 5]]).astype(np.int64)
+    val = np.array(
+        [0, 2, 4, 5, 11, 13, 14, 20, 23, 25, 30, 32, 33, 35]).astype(np.int64)
+    shape = np.array([4, 6]).astype(np.int64)
+    return sparse_tensor.SparseTensor(ind, val, shape)
+
+  def _SparseTensor_5x7(self):
+    # [0 |  |2 |  |4 |5 |  ]
+    # [  |11|  |13|14|  |16]
+    # [20|  |  |23|  |25|  ]
+    # [30|  |32|33|  |35|  ]
+    # [  |41|  |  |44|  |46]
+    ind = np.array([[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1, 4],
+                    [1, 6], [2, 0], [2, 3], [2, 5], [3, 0], [3, 2], [3, 3],
+                    [3, 5], [4, 1], [4, 4], [4, 6]]).astype(np.int64)
+    val = np.array(
+        [0, 2, 4, 5, 11, 13, 14, 16, 20, 23, 25, 30, 32, 33, 35, 41, 44,
+         46]).astype(np.int64)
+    shape = np.array([5, 7]).astype(np.int64)
+    return sparse_tensor.SparseTensor(ind, val, shape)
+
+  def _SparseTensorValue_3x4x2(self):
+    #  slice(:,:, 0)
+    #  ['a0'|    |'b0'|    ]
+    #  [    |'c0'|    |'d0']
+    #  [    |    |'e0'|    ]
+    #  slice(:,:, 1)
+    #  ['a1'|    |'b1'|    ]
+    #  [    |'c1'|    |'d1']
+    #  [    |    |'e1'|    ]
+    ind = np.array([[0, 0, 0], [0, 0, 1], [0, 2, 0], [0, 2, 1], [1, 1, 0],
+                    [1, 1, 1], [1, 3, 0], [1, 3, 1], [2, 2, 0],
+                    [2, 2, 1]]).astype(np.int64)
+    val = np.array(['a0', 'a1', 'b0', 'b1', 'c0', 'c1', 'd0', 'd1', 'e0', 'e1'])
+    shape = np.array([3, 4, 2]).astype(np.int64)
+    return sparse_tensor.SparseTensorValue(ind, val, shape)
+
+  def _SparseTensor_3x4x2(self):
+    return sparse_tensor.SparseTensor.from_value(self._SparseTensorValue_3x4x2(
+    ))
+
+  def testSliceMatrixRows(self):
+    with self.test_session(use_gpu=False):
+      sp_input=self._SparseTensor_4x6()
+      sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [2, 6])
+      sp_tensor1 = sparse_ops.sparse_slice(sp_input, [2, 0], [3, 7])
+      self.assertAllEqual(sp_tensor0.indices.eval(), [[0, 0], [0, 2], [0, 4],
+                                                      [0, 5], [1, 1], [1, 3],
+                                                      [1, 4]])
+      self.assertAllEqual(sp_tensor0.values.eval(), [0, 2, 4, 5, 11, 13, 14])
+      self.assertAllEqual(sp_tensor0.dense_shape.eval(), [2, 6])
+      self.assertAllEqual(sp_tensor1.indices.eval(), [[0, 0], [0, 3], [0, 5],
+                                                      [1, 0], [1, 2], [1, 3],
+                                                      [1, 5]])
+      self.assertAllEqual(sp_tensor1.values.eval(),
+                          [20, 23, 25, 30, 32, 33, 35])
+      self.assertAllEqual(sp_tensor1.dense_shape.eval(), [2, 6])
+
+  def testSliceMatrixUnevenCols(self):
+    with self.test_session(use_gpu=False):
+      sp_input=self._SparseTensor_5x7()
+      sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [5, 3])
+      sp_tensor1 = sparse_ops.sparse_slice(sp_input, [0, 3], [5, 2])
+      sp_tensor2 = sparse_ops.sparse_slice(sp_input, [0, 5], [5, 2])
+
+      self.assertAllEqual(sp_tensor0.indices.eval(),
+                          [[0, 0], [0, 2], [1, 1], [2, 0], [3, 0], [3, 2],
+                           [4, 1]])
+      self.assertAllEqual(sp_tensor0.values.eval(),
+                          [0, 2, 11, 20, 30, 32, 41])
+      self.assertAllEqual(sp_tensor0.dense_shape.eval(), [5, 3])
+      self.assertAllEqual(sp_tensor1.indices.eval(),
+                          [[0, 1], [1, 0], [1, 1], [2, 0], [3, 0], [4, 1]])
+      self.assertAllEqual(sp_tensor1.values.eval(),
+                          [4, 13, 14, 23, 33, 44])
+      self.assertAllEqual(sp_tensor1.dense_shape.eval(), [5, 2])
+      self.assertAllEqual(sp_tensor2.indices.eval(),
+                          [[0, 0], [1, 1], [2, 0], [3, 0], [4, 1]])
+      self.assertAllEqual(sp_tensor2.values.eval(), [5, 16, 25, 35, 46])
+      self.assertAllEqual(sp_tensor2.dense_shape.eval(), [5, 2])
+
+      sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [5, 2])
+      sp_tensor1 = sparse_ops.sparse_slice(sp_input, [0, 2], [5, 2])
+      sp_tensor2 = sparse_ops.sparse_slice(sp_input, [0, 4], [5, 2])
+      sp_tensor3 = sparse_ops.sparse_slice(sp_input, [0, 6], [5, 2])
+      self.assertAllEqual(sp_tensor0.indices.eval(),
+                          [[0, 0], [1, 1], [2, 0], [3, 0], [4, 1]])
+      self.assertAllEqual(sp_tensor0.values.eval(), [0, 11, 20, 30, 41])
+      self.assertAllEqual(sp_tensor0.dense_shape.eval(), [5, 2])
+      self.assertAllEqual(sp_tensor1.indices.eval(),
+                          [[0, 0], [1, 1], [2, 1], [3, 0], [3, 1]])
+      self.assertAllEqual(sp_tensor1.values.eval(), [2, 13, 23, 32, 33])
+      self.assertAllEqual(sp_tensor1.dense_shape.eval(), [5, 2])
+      self.assertAllEqual(sp_tensor2.indices.eval(),
+                          [[0, 0], [0, 1], [1, 0], [2, 1], [3, 1], [4, 0]])
+      self.assertAllEqual(sp_tensor2.values.eval(), [4, 5, 14, 25, 35, 44])
+      self.assertAllEqual(sp_tensor2.dense_shape.eval(), [5, 2])
+      self.assertAllEqual(sp_tensor3.indices.eval(), [[1, 0], [4, 0]])
+      self.assertAllEqual(sp_tensor3.values.eval(), [16, 46])
+      self.assertAllEqual(sp_tensor3.dense_shape.eval(), [5, 1])
+
+  def testSliceMatrixUnevenRows(self):
+    with self.test_session(use_gpu=False):
+      sp_input=self._SparseTensor_5x7()
+      sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [3, 7])
+      sp_tensor1 = sparse_ops.sparse_slice(sp_input, [3, 0], [3, 7])
+      self.assertAllEqual(sp_tensor0.indices.eval(),
+                          [[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3],
+                           [1, 4], [1, 6], [2, 0], [2, 3], [2, 5]])
+      self.assertAllEqual(sp_tensor0.values.eval(),
+                          [0, 2, 4, 5, 11, 13, 14, 16, 20, 23, 25])
+      self.assertAllEqual(sp_tensor0.dense_shape.eval(), [3, 7])
+      self.assertAllEqual(sp_tensor1.indices.eval(),
+                          [[0, 0], [0, 2], [0, 3], [0, 5], [1, 1], [1, 4],
+                           [1, 6]])
+      self.assertAllEqual(sp_tensor1.values.eval(),
+                          [30, 32, 33, 35, 41, 44, 46])
+      self.assertAllEqual(sp_tensor1.dense_shape.eval(), [2, 7])
+
+      sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [2, 7])
+      sp_tensor1 = sparse_ops.sparse_slice(sp_input, [2, 0], [2, 7])
+      sp_tensor2 = sparse_ops.sparse_slice(sp_input, [4, 0], [2, 7])
+      self.assertAllEqual(sp_tensor0.indices.eval(),
+                          [[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3],
+                           [1, 4], [1, 6]])
+      self.assertAllEqual(sp_tensor0.values.eval(),
+                          [0, 2, 4, 5, 11, 13, 14, 16])
+      self.assertAllEqual(sp_tensor0.dense_shape.eval(), [2, 7])
+
+      self.assertAllEqual(sp_tensor1.values.eval(),
+                          [20, 23, 25, 30, 32, 33, 35])
+      self.assertAllEqual(sp_tensor1.dense_shape.eval(), [2, 7])
+      self.assertAllEqual(sp_tensor2.indices.eval(), [[0, 1], [0, 4],
+                                                           [0, 6]])
+      self.assertAllEqual(sp_tensor2.values.eval(), [41, 44, 46])
+      self.assertAllEqual(sp_tensor2.dense_shape.eval(), [1, 7])
+    return
+
+  def testSliceAllRows(self):
+    with self.test_session(use_gpu=False):
+      sp_input=self._SparseTensor_4x6()
+      sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [1, 6])
+      sp_tensor1 = sparse_ops.sparse_slice(sp_input, [1, 0], [1, 6])
+      sp_tensor2 = sparse_ops.sparse_slice(sp_input, [2, 0], [1, 7])
+      sp_tensor3 = sparse_ops.sparse_slice(sp_input, [3, 0], [2, 7])
+      self.assertAllEqual(sp_tensor0.indices.eval(), [[0, 0], [0, 2], [0, 4],
+                                                         [0, 5]])
+      self.assertAllEqual(sp_tensor0.values.eval(), [0, 2, 4, 5])
+      self.assertAllEqual(sp_tensor0.dense_shape.eval(), [1, 6])
+      self.assertAllEqual(sp_tensor1.indices.eval(), [[0, 1], [0, 3], [0,
+                                                                          4]])
+      self.assertAllEqual(sp_tensor1.values.eval(), [11, 13, 14])
+      self.assertAllEqual(sp_tensor1.dense_shape.eval(), [1, 6])
+      self.assertAllEqual(sp_tensor2.indices.eval(), [[0, 0], [0, 3], [0,
+                                                                          5]])
+      self.assertAllEqual(sp_tensor2.values.eval(), [20, 23, 25])
+      self.assertAllEqual(sp_tensor2.dense_shape.eval(), [1, 6])
+      self.assertAllEqual(sp_tensor3.indices.eval(), [[0, 0], [0, 2], [0, 3],
+                                                         [0, 5]])
+      self.assertAllEqual(sp_tensor3.values.eval(), [30, 32, 33, 35])
+      self.assertAllEqual(sp_tensor3.dense_shape.eval(), [1, 6])
+
+  def testSliceColumns(self):
+    with self.test_session(use_gpu=False):
+      sp_input=self._SparseTensor_4x6()
+      sparse_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [4, 2])
+      sparse_tensor1 = sparse_ops.sparse_slice(sp_input, [0, 2], [5, 2])
+      sparse_tensor2 = sparse_ops.sparse_slice(sp_input, [0, 4], [5, 3])
+
+      self.assertAllEqual(sparse_tensor0.indices.eval(), [[0, 0], [1, 1],
+                                                             [2, 0], [3, 0]])
+      self.assertAllEqual(sparse_tensor0.values.eval(), [0, 11, 20, 30])
+      self.assertAllEqual(sparse_tensor0.dense_shape.eval(), [4, 2])
+      self.assertAllEqual(sparse_tensor1.indices.eval(),
+                          [[0, 0], [1, 1], [2, 1], [3, 0], [3, 1]])
+      self.assertAllEqual(sparse_tensor1.values.eval(), [2, 13, 23, 32, 33])
+      self.assertAllEqual(sparse_tensor1.dense_shape.eval(), [4, 2])
+      self.assertAllEqual(sparse_tensor2.indices.eval(),
+                          [[0, 0], [0, 1], [1, 0], [2, 1], [3, 1]])
+      self.assertAllEqual(sparse_tensor2.values.eval(), [4, 5, 14, 25, 35])
+      self.assertAllEqual(sparse_tensor2.dense_shape.eval(), [4, 2])
+
+  def testSliceAllColumns(self):
+    with self.test_session(use_gpu=False):
+      sp_input=self._SparseTensor_4x6()
+      sparse_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [4, 1])
+      sparse_tensor1 = sparse_ops.sparse_slice(sp_input, [0, 1], [4, 1])
+      sparse_tensor2 = sparse_ops.sparse_slice(sp_input, [0, 2], [4, 1])
+      sparse_tensor3 = sparse_ops.sparse_slice(sp_input, [0, 3], [4, 1])
+      sparse_tensor4 = sparse_ops.sparse_slice(sp_input, [0, 4], [5, 1])
+      sparse_tensor5 = sparse_ops.sparse_slice(sp_input, [0, 5], [6, 3])
+      self.assertAllEqual(sparse_tensor0.indices.eval(), [[0, 0], [2, 0],
+                                                             [3, 0]])
+      self.assertAllEqual(sparse_tensor0.values.eval(), [0, 20, 30])
+      self.assertAllEqual(sparse_tensor0.dense_shape.eval(), [4, 1])
+      self.assertAllEqual(sparse_tensor1.indices.eval(), [[1, 0]])
+      self.assertAllEqual(sparse_tensor1.values.eval(), [11])
+      self.assertAllEqual(sparse_tensor1.dense_shape.eval(), [4, 1])
+      self.assertAllEqual(sparse_tensor2.indices.eval(), [[0, 0], [3, 0]])
+      self.assertAllEqual(sparse_tensor2.values.eval(), [2, 32])
+      self.assertAllEqual(sparse_tensor2.dense_shape.eval(), [4, 1])
+      self.assertAllEqual(sparse_tensor3.indices.eval(), [[1, 0], [2, 0],
+                                                             [3, 0]])
+      self.assertAllEqual(sparse_tensor3.dense_shape.eval(), [4, 1])
+      self.assertAllEqual(sparse_tensor3.values.eval(), [13, 23, 33])
+      self.assertAllEqual(sparse_tensor4.indices.eval(), [[0, 0], [1, 0]])
+      self.assertAllEqual(sparse_tensor4.values.eval(), [4, 14])
+      self.assertAllEqual(sparse_tensor4.dense_shape.eval(), [4, 1])
+      self.assertAllEqual(sparse_tensor5.indices.eval(), [[0, 0], [2, 0],
+                                                             [3, 0]])
+      self.assertAllEqual(sparse_tensor5.values.eval(), [5, 25, 35])
+      self.assertAllEqual(sparse_tensor5.dense_shape.eval(), [4, 1])
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/kernel_tests/stack_op_test.py b/tensorflow/python/kernel_tests/stack_op_test.py
index afc0c38cacbe3fd9135413390fd6c7f9f6d614be..95ea3a9047367fab956144a7264d989e98f781e9 100644
--- a/tensorflow/python/kernel_tests/stack_op_test.py
+++ b/tensorflow/python/kernel_tests/stack_op_test.py
@@ -45,45 +45,61 @@ class StackOpTest(test.TestCase):
     np.random.seed(7)
     with self.test_session(use_gpu=True):
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
-        data = np.random.randn(*shape)
-        # Convert [data[0], data[1], ...] separately to tensorflow
-        # TODO(irving): Remove list() once we handle maps correctly
+        for dtype in [np.float32, np.int32, np.int64]:
+          data = np.random.randn(*shape).astype(dtype)
+          # Convert [data[0], data[1], ...] separately to tensorflow
+          # TODO(irving): Remove list() once we handle maps correctly
+          xs = list(map(constant_op.constant, data))
+          # Pack back into a single tensorflow tensor
+          c = array_ops.stack(xs)
+          self.assertAllEqual(c.eval(), data)
+
+  def testSimpleParallel(self):
+    np.random.seed(7)
+    with self.test_session(use_gpu=True):
+      for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
+        data = np.random.randn(*shape).astype(np.float32)
         xs = list(map(constant_op.constant, data))
-        # Pack back into a single tensorflow tensor
-        c = array_ops.stack(xs)
-        self.assertAllEqual(c.eval(), data)
-
         c = array_ops.parallel_stack(xs)
         self.assertAllEqual(c.eval(), data)
 
   def testConst(self):
+    np.random.seed(7)
+    with self.test_session(use_gpu=True):
+      for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
+        for dtype in [np.float32, np.int32, np.int64]:
+          data = np.random.randn(*shape).astype(dtype)
+          # Pack back into a single tensorflow tensor directly using np array
+          c = array_ops.stack(data)
+          # This is implemented via a Const:
+          self.assertEqual(c.op.type, "Const")
+          self.assertAllEqual(c.eval(), data)
+
+          # Python lists also work for 1-D case:
+          if len(shape) == 1:
+            data_list = list(data)
+            cl = array_ops.stack(data_list)
+            self.assertEqual(cl.op.type, "Const")
+            self.assertAllEqual(cl.eval(), data)
+
+        # Verify that shape induction works with shapes produced via const stack
+        a = constant_op.constant([1, 2, 3, 4, 5, 6])
+        b = array_ops.reshape(a, array_ops.stack([2, 3]))
+        self.assertAllEqual(b.get_shape(), [2, 3])
+
+  def testConstParallel(self):
     np.random.seed(7)
     with self.test_session(use_gpu=True):
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
         data = np.random.randn(*shape).astype(np.float32)
-        # Pack back into a single tensorflow tensor directly using np array
-        c = array_ops.stack(data)
-        # This is implemented via a Const:
-        self.assertEqual(c.op.type, "Const")
-        self.assertAllEqual(c.eval(), data)
-
-        c = array_ops.parallel_stack(data)
-        self.assertAllEqual(c.eval(), data)
-
-        # Python lists also work for 1-D case:
         if len(shape) == 1:
           data_list = list(data)
-          cl = array_ops.stack(data_list)
-          self.assertEqual(cl.op.type, "Const")
-          self.assertAllEqual(cl.eval(), data)
-
           cl = array_ops.parallel_stack(data_list)
           self.assertAllEqual(cl.eval(), data)
 
-      # Verify that shape induction works with shapes produced via const stack
-      a = constant_op.constant([1, 2, 3, 4, 5, 6])
-      b = array_ops.reshape(a, array_ops.stack([2, 3]))
-      self.assertAllEqual(b.get_shape(), [2, 3])
+        data = np.random.randn(*shape).astype(np.float32)
+        c = array_ops.parallel_stack(data)
+        self.assertAllEqual(c.eval(), data)
 
   def testGradientsAxis0(self):
     np.random.seed(7)
diff --git a/tensorflow/python/kernel_tests/stack_ops_test.py b/tensorflow/python/kernel_tests/stack_ops_test.py
index 441256df2636cb513a3b2509b080fbbb8c5a8105..aa409336f5c50178e4d0ca946190119fb0e4188e 100644
--- a/tensorflow/python/kernel_tests/stack_ops_test.py
+++ b/tensorflow/python/kernel_tests/stack_ops_test.py
@@ -22,7 +22,6 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import control_flow_ops
@@ -35,10 +34,11 @@ class StackOpTest(test.TestCase):
 
   def _testStackPushPop(self, use_gpu):
     with self.test_session(use_gpu=use_gpu):
-      h = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
-      c = gen_data_flow_ops._stack_push(h, [[4.0, 5.0]])
+      h = gen_data_flow_ops._stack_v2(
+          -1, elem_type=dtypes.float32, stack_name="foo")
+      c = gen_data_flow_ops._stack_push_v2(h, [[4.0, 5.0]])
       with ops.control_dependencies([c]):
-        c1 = gen_data_flow_ops._stack_pop(h, dtypes.float32)
+        c1 = gen_data_flow_ops._stack_pop_v2(h, dtypes.float32)
       self.assertAllClose([[4.0, 5.0]], c1.eval())
 
   def testStackPushPop(self):
@@ -49,10 +49,11 @@ class StackOpTest(test.TestCase):
     with self.test_session(use_gpu=use_gpu):
       a = np.arange(2000)
       x = constant_op.constant(a, dtype=dtypes.float32)
-      h = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
-      c = gen_data_flow_ops._stack_push(h, x, swap_memory=True)
+      h = gen_data_flow_ops._stack_v2(
+          -1, elem_type=dtypes.float32, stack_name="foo")
+      c = gen_data_flow_ops._stack_push_v2(h, x, swap_memory=True)
       with ops.control_dependencies([c]):
-        c1 = gen_data_flow_ops._stack_pop(h, dtypes.float32)
+        c1 = gen_data_flow_ops._stack_pop_v2(h, dtypes.float32)
       self.assertAllClose(a, c1.eval())
 
   def testStackPushPopSwap(self):
@@ -62,7 +63,8 @@ class StackOpTest(test.TestCase):
   def _testStackWhileSwap(self, use_gpu):
     with self.test_session(use_gpu=use_gpu):
       n = constant_op.constant(0)
-      h = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
+      h = gen_data_flow_ops._stack_v2(
+          -1, elem_type=dtypes.float32, stack_name="foo")
 
       def c(x):
         return math_ops.less(x, 10)
@@ -70,7 +72,7 @@ class StackOpTest(test.TestCase):
       def b(x):
         with ops.control_dependencies([x]):
           a = constant_op.constant(np.ones(2000), dtype=dtypes.float32)
-          v = gen_data_flow_ops._stack_push(h, a, swap_memory=True)
+          v = gen_data_flow_ops._stack_push_v2(h, a, swap_memory=True)
         with ops.control_dependencies([v]):
           return math_ops.add(x, 1)
 
@@ -79,14 +81,15 @@ class StackOpTest(test.TestCase):
       v = constant_op.constant(np.zeros(2000), dtype=dtypes.float32)
 
       def c1(x, y):
+        del y
         return math_ops.greater(x, 0)
 
       def b1(x, y):
         nx = math_ops.subtract(x, 1)
-        ny = y + gen_data_flow_ops._stack_pop(h, dtypes.float32)
+        ny = y + gen_data_flow_ops._stack_pop_v2(h, dtypes.float32)
         return [nx, ny]
 
-      rx, ry = control_flow_ops.while_loop(
+      _, ry = control_flow_ops.while_loop(
           c1, b1, [r, v], [r.get_shape(), tensor_shape.unknown_shape()])
       self.assertAllClose(np.ones(2000) * 10.0, ry.eval())
 
@@ -94,6 +97,102 @@ class StackOpTest(test.TestCase):
     self._testStackWhileSwap(use_gpu=False)
     self._testStackWhileSwap(use_gpu=True)
 
+  def _testMultiStack(self, use_gpu):
+    with self.test_session(use_gpu=use_gpu):
+      h1 = gen_data_flow_ops._stack_v2(
+          -1, elem_type=dtypes.float32, stack_name="foo")
+      c1 = gen_data_flow_ops._stack_push_v2(h1, 4.0)
+      with ops.control_dependencies([c1]):
+        c1 = gen_data_flow_ops._stack_pop_v2(h1, dtypes.float32)
+      h2 = gen_data_flow_ops._stack_v2(
+          -1, elem_type=dtypes.float32, stack_name="bar")
+      c2 = gen_data_flow_ops._stack_push_v2(h2, 5.0)
+      with ops.control_dependencies([c2]):
+        c2 = gen_data_flow_ops._stack_pop_v2(h2, dtypes.float32)
+      r = c1 + c2
+      self.assertAllClose(9.0, r.eval())
+
+  def testMultiStack(self):
+    self._testMultiStack(use_gpu=False)
+    self._testMultiStack(use_gpu=True)
+
+  def _testSameNameStacks(self, use_gpu):
+    """Different stacks with the same name do not interfere."""
+    with self.test_session(use_gpu=use_gpu) as sess:
+      h1 = gen_data_flow_ops._stack_v2(
+          -1, elem_type=dtypes.float32, stack_name="foo")
+      h2 = gen_data_flow_ops._stack_v2(
+          -1, elem_type=dtypes.float32, stack_name="foo")
+
+      c1 = gen_data_flow_ops._stack_push_v2(h1, 4.0)
+      with ops.control_dependencies([c1]):
+        c2 = gen_data_flow_ops._stack_push_v2(h2, 5.0)
+      with ops.control_dependencies([c2]):
+        pop1 = gen_data_flow_ops._stack_pop_v2(h1, dtypes.float32)
+        pop2 = gen_data_flow_ops._stack_pop_v2(h2, dtypes.float32)
+
+      out1, out2 = sess.run([pop1, pop2])
+      self.assertAllClose(out1, 4.0)
+      self.assertAllClose(out2, 5.0)
+
+  def testSameNameStacks(self):
+    self._testSameNameStacks(use_gpu=False)
+    self._testSameNameStacks(use_gpu=True)
+
+  def _testCloseStack(self, use_gpu):
+    with self.test_session(use_gpu=use_gpu) as sess:
+      h = gen_data_flow_ops._stack_v2(
+          -1, elem_type=dtypes.float32, stack_name="foo")
+      c1 = gen_data_flow_ops._stack_close_v2(h)
+      sess.run(c1)
+
+  def testCloseStack(self):
+    self._testCloseStack(use_gpu=False)
+    self._testCloseStack(use_gpu=True)
+
+  def _testPushCloseStack(self, use_gpu):
+    with self.test_session(use_gpu=use_gpu) as sess:
+      h = gen_data_flow_ops._stack_v2(
+          -1, elem_type=dtypes.float32, stack_name="foo")
+      c = gen_data_flow_ops._stack_push_v2(h, [[4.0, 5.0]])
+      with ops.control_dependencies([c]):
+        c1 = gen_data_flow_ops._stack_close_v2(h)
+      sess.run(c1)
+
+  def testPushCloseStack(self):
+    self._testPushCloseStack(use_gpu=False)
+    self._testPushCloseStack(use_gpu=True)
+
+
+class StackOpRefTest(test.TestCase):
+  """Tests for deprecated non-resource variant of stack ops."""
+
+  def _testStackPushPop(self, use_gpu):
+    with self.test_session(use_gpu=use_gpu):
+      h = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
+      c = gen_data_flow_ops._stack_push(h, [[4.0, 5.0]])
+      with ops.control_dependencies([c]):
+        c1 = gen_data_flow_ops._stack_pop(h, dtypes.float32)
+      self.assertAllClose([[4.0, 5.0]], c1.eval())
+
+  def testStackPushPop(self):
+    self._testStackPushPop(use_gpu=False)
+    self._testStackPushPop(use_gpu=True)
+
+  def _testStackPushPopSwap(self, use_gpu):
+    with self.test_session(use_gpu=use_gpu):
+      a = np.arange(2000)
+      x = constant_op.constant(a, dtype=dtypes.float32)
+      h = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
+      c = gen_data_flow_ops._stack_push(h, x, swap_memory=True)
+      with ops.control_dependencies([c]):
+        c1 = gen_data_flow_ops._stack_pop(h, dtypes.float32)
+      self.assertAllClose(a, c1.eval())
+
+  def testStackPushPopSwap(self):
+    self._testStackPushPopSwap(use_gpu=False)
+    self._testStackPushPopSwap(use_gpu=True)
+
   def _testMultiStack(self, use_gpu):
     with self.test_session(use_gpu=use_gpu):
       h1 = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
@@ -107,6 +206,42 @@ class StackOpTest(test.TestCase):
       r = c1 + c2
       self.assertAllClose(9.0, r.eval())
 
+  def _testStackWhileSwap(self, use_gpu):
+    with self.test_session(use_gpu=use_gpu):
+      n = constant_op.constant(0)
+      h = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
+
+      def c(x):
+        return math_ops.less(x, 10)
+
+      def b(x):
+        with ops.control_dependencies([x]):
+          a = constant_op.constant(np.ones(2000), dtype=dtypes.float32)
+          v = gen_data_flow_ops._stack_push(h, a, swap_memory=True)
+        with ops.control_dependencies([v]):
+          return math_ops.add(x, 1)
+
+      r = control_flow_ops.while_loop(c, b, [n])
+
+      v = constant_op.constant(np.zeros(2000), dtype=dtypes.float32)
+
+      def c1(x, y):
+        del y
+        return math_ops.greater(x, 0)
+
+      def b1(x, y):
+        nx = math_ops.subtract(x, 1)
+        ny = y + gen_data_flow_ops._stack_pop(h, dtypes.float32)
+        return [nx, ny]
+
+      _, ry = control_flow_ops.while_loop(
+          c1, b1, [r, v], [r.get_shape(), tensor_shape.unknown_shape()])
+      self.assertAllClose(np.ones(2000) * 10.0, ry.eval())
+
+  def testStackWhileSwap(self):
+    self._testStackWhileSwap(use_gpu=False)
+    self._testStackWhileSwap(use_gpu=True)
+
   def testMultiStack(self):
     self._testMultiStack(use_gpu=False)
     self._testMultiStack(use_gpu=True)
@@ -117,7 +252,7 @@ class StackOpTest(test.TestCase):
       c1 = gen_data_flow_ops._stack_push(h1, 4.0)
       h2 = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
       c2 = gen_data_flow_ops._stack_push(h2, 5.0)
-      r = c1 + c2
+      _ = c1 + c2
       self.assertNotEqual(h1.eval()[1], h2.eval()[1])
 
   def testSameNameStacks(self):
diff --git a/tensorflow/python/kernel_tests/stage_op_test.py b/tensorflow/python/kernel_tests/stage_op_test.py
index 1a6a869e3d0e97006213a177fd0657f170695ecd..64b3388c5c0fd16436fa77ac5d8d0e8f9a859c32 100644
--- a/tensorflow/python/kernel_tests/stage_op_test.py
+++ b/tensorflow/python/kernel_tests/stage_op_test.py
@@ -102,10 +102,10 @@ class StageTest(test.TestCase):
       with ops.device(gpu_dev):
         stager = data_flow_ops.StagingArea([dtypes.float32])
         y = stager.put([v])
-        self.assertEqual(y.device, '/device:GPU:0' if gpu_dev
-                                                   else gpu_dev)
+        expected_name = gpu_dev if 'gpu' not in gpu_dev else '/device:GPU:0'
+        self.assertEqual(y.device, expected_name)
       with ops.device('/cpu:0'):
-        x = stager.get()
+        x = stager.get()[0]
         self.assertEqual(x.device, '/device:CPU:0')
 
     G.finalize()
@@ -125,10 +125,10 @@ class StageTest(test.TestCase):
 
     with self.test_session(use_gpu=True, graph=G) as sess:
       for i in range(10):
-        sess.run(stage, feed_dict={x:i})
+        sess.run(stage, feed_dict={x: i})
 
       for i in range(10):
-        self.assertTrue(sess.run(peek, feed_dict={p:i}) == i)
+        self.assertTrue(sess.run(peek, feed_dict={p: i}) == [i])
 
   def testSizeAndClear(self):
     with ops.Graph().as_default() as G:
@@ -210,7 +210,7 @@ class StageTest(test.TestCase):
 
       # Clear the staging area completely
       for i in range(n):
-        self.assertTrue(sess.run(ret) == i)
+        self.assertTrue(sess.run(ret) == [i])
 
       # It should now be empty
       self.assertTrue(sess.run(size) == 0)
@@ -273,7 +273,7 @@ class StageTest(test.TestCase):
 
       # Clear the staging area completely
       for i in range(n):
-        self.assertTrue(np.all(sess.run(ret) == i))
+        self.assertTrue(np.all(sess.run(ret)[0] == i))
 
       self.assertTrue(sess.run(size) == 0)
 
diff --git a/tensorflow/python/kernel_tests/summary_tensor_op_test.py b/tensorflow/python/kernel_tests/summary_tensor_op_test.py
index ee3f5aa250e35c2ce5f84495da9680ce428b4a02..35846378655744f207a4f9e5d279ca1578a34a5a 100644
--- a/tensorflow/python/kernel_tests/summary_tensor_op_test.py
+++ b/tensorflow/python/kernel_tests/summary_tensor_op_test.py
@@ -41,7 +41,7 @@ class SummaryOpsTest(test.TestCase):
   def _AssertNumpyEq(self, actual, expected):
     self.assertTrue(np.array_equal(actual, expected))
 
-  def testNodeNames(self):
+  def testTags(self):
     with self.test_session() as sess:
       c = constant_op.constant(1)
       s1 = summary_ops.tensor_summary("s1", c)
@@ -53,16 +53,16 @@ class SummaryOpsTest(test.TestCase):
       summ1, summ2, summ3, summ4 = sess.run([s1, s2, s3, s4])
 
     v1 = self._SummarySingleValue(summ1)
-    self.assertEqual(v1.node_name, "s1")
+    self.assertEqual(v1.tag, "s1")
 
     v2 = self._SummarySingleValue(summ2)
-    self.assertEqual(v2.node_name, "foo/s2")
+    self.assertEqual(v2.tag, "foo/s2")
 
     v3 = self._SummarySingleValue(summ3)
-    self.assertEqual(v3.node_name, "foo/zod/s3")
+    self.assertEqual(v3.tag, "foo/zod/s3")
 
     v4 = self._SummarySingleValue(summ4)
-    self.assertEqual(v4.node_name, "foo/zod/TensorSummary")
+    self.assertEqual(v4.tag, "foo/zod/TensorSummary")
 
   def testScalarSummary(self):
     with self.test_session() as sess:
@@ -115,6 +115,56 @@ class SummaryOpsTest(test.TestCase):
     n = tensor_util.MakeNdarray(value.tensor)
     self._AssertNumpyEq(n, bools)
 
+  def testSummaryDescriptionAndDisplayName(self):
+    with self.test_session() as sess:
+
+      def get_description(summary_op):
+        summ_str = sess.run(summary_op)
+        summ = summary_pb2.Summary()
+        summ.ParseFromString(summ_str)
+        return summ.value[0].metadata
+
+      const = constant_op.constant(1)
+      # Default case; no description or display name
+      simple_summary = summary_ops.tensor_summary("simple", const)
+
+      descr = get_description(simple_summary)
+      self.assertEqual(descr.display_name, "")
+      self.assertEqual(descr.summary_description, "")
+
+      # Values are provided via function args
+      with_values = summary_ops.tensor_summary(
+          "simple",
+          const,
+          display_name="my name",
+          summary_description="my description")
+
+      descr = get_description(with_values)
+      self.assertEqual(descr.display_name, "my name")
+      self.assertEqual(descr.summary_description, "my description")
+
+      # Values are provided via the SummaryMetadata arg
+      metadata = summary_pb2.SummaryMetadata()
+      metadata.display_name = "my name"
+      metadata.summary_description = "my description"
+
+      with_metadata = summary_ops.tensor_summary(
+          "simple", const, summary_metadata=metadata)
+      descr = get_description(with_metadata)
+      self.assertEqual(descr.display_name, "my name")
+      self.assertEqual(descr.summary_description, "my description")
+
+      # If both SummmaryMetadata and explicit args are provided, the args win
+      overwrite = summary_ops.tensor_summary(
+          "simple",
+          const,
+          summary_metadata=metadata,
+          display_name="overwritten",
+          summary_description="overwritten")
+      descr = get_description(overwrite)
+      self.assertEqual(descr.display_name, "overwritten")
+      self.assertEqual(descr.summary_description, "overwritten")
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/topk_op_test.py b/tensorflow/python/kernel_tests/topk_op_test.py
index b3f737c88410f970510127edb551fab4e07ad3cb..034b8be4dd2614d4b01f09be24972ce2fc085610 100644
--- a/tensorflow/python/kernel_tests/topk_op_test.py
+++ b/tensorflow/python/kernel_tests/topk_op_test.py
@@ -18,13 +18,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import itertools
+import sys
+
 import numpy as np
 
+from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import resource_variable_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
@@ -36,25 +43,103 @@ class TopKTest(test.TestCase):
                     k,
                     expected_values,
                     expected_indices,
-                    sorted=True):
-    np_values = np.array(expected_values)
-    np_indices = np.array(expected_indices)
-    with self.test_session():
+                    sorted=True):  # pylint: disable=redefined-builtin
+    np_expected_values = np.array(expected_values)
+    np_expected_indices = np.array(expected_indices)
+    with self.test_session(use_gpu=True) as sess:
       values_op, indices_op = nn_ops.top_k(inputs, k, sorted=sorted)
-      values = values_op.eval()
-      indices = indices_op.eval()
-      self.assertShapeEqual(np_values, values_op)
-      self.assertShapeEqual(np_indices, indices_op)
-      self.assertAllEqual(np_indices, indices)
-      self.assertAllClose(np_values, values)
+      values, indices = sess.run([values_op, indices_op])
+
+      self.assertShapeEqual(np_expected_values, values_op)
+      self.assertShapeEqual(np_expected_indices, indices_op)
+
+      if sorted:
+        self.assertAllClose(np_expected_values, values)
+        # Do some special casing of equality of indices: if indices
+        # are not the same, but values are floating type, ensure that
+        # the values are within epsilon of each other.
+        if not np.issubdtype(np_expected_values.dtype, np.float):
+          # Values are not floating point type; check indices exactly
+          self.assertAllEqual(np_expected_indices, indices)
+        else:
+          # Values are floating point; indices may be swapped for
+          # values near each other.
+          indices_not_equal = np_expected_indices != indices
+          if np.any(indices_not_equal):
+            values_unsure = values[indices_not_equal]
+            expected_values_unsure = expected_values[indices_not_equal]
+            self.assertAllClose(expected_values_unsure, values_unsure)
+      else:
+        np_inputs = np.array(inputs)
+
+        # Check that the indices are valid.
+        for result_index, src_index in np.ndenumerate(indices):
+          value = values[result_index]
+          expected_value = np_inputs[result_index[0], src_index]
+          np.testing.utils.assert_almost_equal(value, expected_value)
+
+        # Check that if two elements are equal, the lower-index element appears
+        # first.
+        shape = values.shape
+        for batch_index in range(shape[0]):
+          for index in range(shape[1] - 1):
+            if np.isclose(values[batch_index, index],
+                          values[batch_index, index + 1]):
+              self.assertLess(indices[batch_index, index],
+                              indices[batch_index, index + 1])
+
+        # Now check the results, ignoring order.
+        self.assertAllEqual(np.sort(np_expected_indices), np.sort(indices))
+        self.assertAllClose(np.sort(np_expected_values), np.sort(values))
 
   def testTop1(self):
     inputs = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.3, 0.3, 0.2]]
     self._validateTopK(inputs, 1, [[0.4], [0.3]], [[3], [1]])
 
   def testTop2(self):
-    inputs = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.3, 0.3, 0.2]]
-    self._validateTopK(inputs, 2, [[0.4, 0.3], [0.3, 0.3]], [[3, 1], [2, 1]])
+    inputs = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.3, 0.4, 0.2]]
+    self._validateTopK(inputs, 2, [[0.4, 0.3], [0.4, 0.3]], [[3, 1], [2, 1]])
+
+  def _testLargeSort(self, dtype):
+    b = 10
+    n = 5000
+    inputs = np.random.permutation(
+        np.linspace(0, 100, b * n, dtype=dtype)).reshape(b, n)
+    indices = np.argsort(-inputs, axis=1)
+    values = -np.sort(-inputs, axis=1)
+    self._validateTopK(inputs, n, values, indices)
+
+  def testLargeSort(self):
+    self._testLargeSort(np.float32)
+    self._testLargeSort(np.float16)
+
+  def _testLargeTopK(self, dtype):
+    b = 10
+    n = 5000
+    k = n - 1
+    inputs = np.random.permutation(
+        np.linspace(0, 100, b * n, dtype=dtype)).reshape(b, n)
+    indices = np.argsort(-inputs, axis=1)[:, :k]
+    values = -np.sort(-inputs, axis=1)[:, :k]
+    self._validateTopK(inputs, k, values, indices)
+
+  def testLargeTopK(self):
+    self._testLargeTopK(np.float32)
+    self._testLargeTopK(np.float16)
+
+  def _testMediumTopK(self, dtype):
+    b = 5
+    n = 500
+    k = 50
+    inputs = np.random.permutation(
+        np.linspace(0, 100, b * n, dtype=dtype)).reshape(b, n)
+    indices = np.argsort(-inputs, axis=1)[:, :k]
+    values = -np.sort(-inputs, axis=1)[:, :k]
+    self._validateTopK(inputs, k, values, indices)
+
+  def testMediumTopK(self):
+    self._testMediumTopK(np.float32)
+    self._testMediumTopK(np.float16)
 
   def testTopAll(self):
     inputs = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.3, 0.3, 0.2]]
@@ -79,7 +164,7 @@ class TopKTest(test.TestCase):
 
   def testKNegative(self):
     inputs = [[0.1, 0.2], [0.3, 0.4]]
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       k = array_ops.placeholder(dtypes.int32)
       values, _ = nn_ops.top_k(inputs, k)
       with self.assertRaisesOpError("Need k >= 0, got -7"):
@@ -92,7 +177,7 @@ class TopKTest(test.TestCase):
       nn_ops.top_k(inputs, 4)
 
   def testTopKGradients(self):
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       inputs = array_ops.placeholder(dtypes.int32, shape=[2, 5])
       values, _ = nn_ops.top_k(inputs, 3)
       grad = sess.run(
@@ -102,5 +187,33 @@ class TopKTest(test.TestCase):
     self.assertEqual(grad.tolist(), [[0, 0, 1, 3, 2], [0, 4, 0, 5, 6]])
 
 
+class TopKBenchmark(test.Benchmark):
+
+  def benchmarkTopK(self):
+    for (m, n, p, use_gpu) in itertools.product(
+        [128],
+        [10, 100, 1000, 10000, 100000],
+        [0.001, 0.01, 0.5, 0.99, 1.0],
+        [False, True]):
+      k = int(p * n)
+      if k == 0:
+        continue
+      name = "m_%d_n_%d_k_%g_use_gpu_%s" % (m, n, k, use_gpu)
+      device = "/%s:0" % ("gpu" if use_gpu else "cpu")
+      with ops.Graph().as_default():
+        with ops.device(device):
+          x = random_ops.random_uniform((m, n))
+          v = resource_variable_ops.ResourceVariable(x)
+          op = nn_ops.top_k(v, k)
+        with session.Session() as sess:
+          v.initializer.run()
+          r = self.run_op_benchmark(sess, op, min_iters=100, name=name)
+          gb_processed_input = m * n / 1.0e9
+          throughput = gb_processed_input / r["wall_time"]
+          print("Benchmark: %s \t wall_time: %0.03g s \t "
+                "Throughput: %0.03g GB/s" % (name, r["wall_time"], throughput))
+          sys.stdout.flush()
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/where_op_test.py b/tensorflow/python/kernel_tests/where_op_test.py
index b47159ae7f0ba7498b6ec70effbea418043ed657..3e1fa0a287b1c61ff8f2e19f8f9186ead6939481 100644
--- a/tensorflow/python/kernel_tests/where_op_test.py
+++ b/tensorflow/python/kernel_tests/where_op_test.py
@@ -18,17 +18,25 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import itertools
+import sys
+
 import numpy as np
 
+from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 
 
 class WhereOpTest(test.TestCase):
 
   def _testWhere(self, x, truth, expected_err_re=None):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       ans = array_ops.where(x)
       self.assertEqual([None, x.ndim], ans.get_shape().as_list())
       if expected_err_re is None:
@@ -39,12 +47,30 @@ class WhereOpTest(test.TestCase):
           ans.eval()
 
   def testWrongNumbers(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       with self.assertRaises(ValueError):
         array_ops.where([False, True], [1, 2], None)
       with self.assertRaises(ValueError):
         array_ops.where([False, True], None, [1, 2])
 
+  def testBasicVec(self):
+    x = np.asarray([True, False])
+    truth = np.asarray([[0]], dtype=np.int64)
+    self._testWhere(x, truth)
+
+    x = np.asarray([False, True, False])
+    truth = np.asarray([[1]], dtype=np.int64)
+    self._testWhere(x, truth)
+
+    x = np.asarray([False, False, True, False, True])
+    truth = np.asarray([[2], [4]], dtype=np.int64)
+    self._testWhere(x, truth)
+
+  def testRandomVec(self):
+    x = np.random.rand(1000000) > 0.5
+    truth = np.vstack([np.where(x)[0].astype(np.int64)]).T
+    self._testWhere(x, truth)
+
   def testBasicMat(self):
     x = np.asarray([[True, False], [True, False]])
 
@@ -67,10 +93,37 @@ class WhereOpTest(test.TestCase):
   def testThreeArgument(self):
     x = np.array([[-2, 3, -1], [1, -3, -3]])
     np_val = np.where(x > 0, x * x, -x)
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       tf_val = array_ops.where(constant_op.constant(x) > 0, x * x, -x).eval()
     self.assertAllEqual(tf_val, np_val)
 
 
+class WhereBenchmark(test.Benchmark):
+
+  def benchmarkWhere(self):
+    for (m, n, p, use_gpu) in itertools.product(
+        [10],
+        [10, 100, 1000, 10000, 100000, 1000000],
+        [0.01, 0.5, 0.99],
+        [False, True]):
+      name = "m_%d_n_%d_p_%g_use_gpu_%s" % (m, n, p, use_gpu)
+      device = "/%s:0" % ("gpu" if use_gpu else "cpu")
+      with ops.Graph().as_default():
+        with ops.device(device):
+          x = random_ops.random_uniform((m, n), dtype=dtypes.float32) <= p
+          v = resource_variable_ops.ResourceVariable(x)
+          op = array_ops.where(v)
+        with session.Session() as sess:
+          v.initializer.run()
+          r = self.run_op_benchmark(sess, op, min_iters=100, name=name)
+          gb_processed_input = m * n / 1.0e9
+          # approximate size of output: m*n*p int64s for each axis.
+          gb_processed_output = 2 * 8 * m * n * p / 1.0e9
+          gb_processed = gb_processed_input + gb_processed_output
+          throughput = gb_processed / r["wall_time"]
+          print("Benchmark: %s \t wall_time: %0.03g s \t "
+                "Throughput: %0.03g GB/s" % (name, r["wall_time"], throughput))
+          sys.stdout.flush()
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index a37308f702b7e230b14bcd0ca30f2f56ac8d97f5..9e5772b891c8379fd73f1dfe17df4276ba1bbfec 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -27,6 +27,7 @@ import collections
 import copy
 import functools
 import re
+import weakref
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import numpy as np
@@ -40,6 +41,14 @@ from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 
 
+def _is_tensor_or_tensor_list(v):
+  v = nest.flatten(v)
+  if v and isinstance(v[0], ops.Tensor):
+    return True
+  else:
+    return False
+
+
 class Layer(object):
   """Base layer class.
 
@@ -472,6 +481,8 @@ class Layer(object):
         setattr(result, k, v)
       elif k in shallow_copy:
         setattr(result, k, copy.copy(v))
+      elif _is_tensor_or_tensor_list(v):
+        setattr(result, k, v)
       else:
         setattr(result, k, copy.deepcopy(v, memo))
     return result
@@ -526,7 +537,7 @@ class Layer(object):
         if x.get_shape().ndims is None:
           raise ValueError('Input ' + str(input_index) + ' of layer ' +
                            self.name + ' is incompatible with the layer: '
-                           'its rank is undefined, by the layer requires a '
+                           'its rank is undefined, but the layer requires a '
                            'defined rank.')
 
       # Check ndim.
@@ -671,8 +682,7 @@ def _object_list_uid(object_list):
 # A global dictionary mapping graph objects to an index of counters used
 # for various layer names in each graph.
 # Allows to give unique autogenerated names to layers, in a graph-specific way.
-PER_GRAPH_LAYER_NAME_UIDS = collections.defaultdict(
-    lambda: collections.defaultdict(int))
+PER_GRAPH_LAYER_NAME_UIDS = weakref.WeakKeyDictionary()
 
 
 def _unique_layer_name(name):
@@ -694,6 +704,8 @@ def _unique_layer_name(name):
   ```
   """
   graph = ops.get_default_graph()
+  if graph not in PER_GRAPH_LAYER_NAME_UIDS:
+    PER_GRAPH_LAYER_NAME_UIDS[graph] = collections.defaultdict(int)
   layer_name_uids = PER_GRAPH_LAYER_NAME_UIDS[graph]
   layer_name_uids[name] += 1
   return name + '_' + str(layer_name_uids[name])
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index 81fbe5fbf70e95637fa5687bd5a146b1d1bac4b4..5f93f58f8c49dfafac1073deef3772706d842e84 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -192,6 +192,7 @@ class BaseLayerTest(test.TestCase):
         return math_ops.square(inputs)
 
     layer = MyLayer(name='my_layer')
+    layer._private_tensor = random_ops.random_uniform(())
     inputs = random_ops.random_uniform((5,), seed=1)
     outputs = layer.apply(inputs)
     self.assertEqual(layer.built, True)
@@ -201,6 +202,7 @@ class BaseLayerTest(test.TestCase):
     self.assertEqual(layer_copy.name, layer.name)
     self.assertEqual(layer_copy._scope.name, layer._scope.name)
     self.assertEqual(layer_copy._graph, layer._graph)
+    self.assertEqual(layer_copy._private_tensor, layer._private_tensor)
 
   def testScopeNaming(self):
 
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index fdf1b134b9cd4b659d519725ed3d71f3f67245be..3594ced2595befc295b6da9b2fe09f4e1bb5a4b8 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -388,7 +388,7 @@ class Conv2D(_Conv):
     filters: Integer, the dimensionality of the output space (i.e. the number
       of filters in the convolution).
     kernel_size: An integer or tuple/list of 2 integers, specifying the
-      width and height of the 2D convolution window.
+      height and width of the 2D convolution window.
       Can be a single integer to specify the same value for
       all spatial dimensions.
     strides: An integer or tuple/list of 2 integers,
@@ -489,7 +489,7 @@ def conv2d(inputs,
     filters: Integer, the dimensionality of the output space (i.e. the number
       of filters in the convolution).
     kernel_size: An integer or tuple/list of 2 integers, specifying the
-      width and height of the 2D convolution window.
+      height and width of the 2D convolution window.
       Can be a single integer to specify the same value for
       all spatial dimensions.
     strides: An integer or tuple/list of 2 integers,
@@ -741,7 +741,7 @@ class SeparableConv2D(Conv2D):
     filters: Integer, the dimensionality of the output space (i.e. the number
       of filters in the convolution).
     kernel_size: A tuple or list of 2 integers specifying the spatial
-      dimensions of of the filters. Can be a single integer to specify the same
+      dimensions of the filters. Can be a single integer to specify the same
       value for all spatial dimensions.
     strides: A tuple or list of 2 positive integers specifying the strides
       of the convolution. Can be a single integer to specify the same value for
@@ -950,7 +950,7 @@ def separable_conv2d(inputs,
     filters: Integer, the dimensionality of the output space (i.e. the number
       of filters in the convolution).
     kernel_size: A tuple or list of 2 integers specifying the spatial
-      dimensions of of the filters. Can be a single integer to specify the same
+      dimensions of the filters. Can be a single integer to specify the same
       value for all spatial dimensions.
     strides: A tuple or list of 2 positive integers specifying the strides
       of the convolution. Can be a single integer to specify the same value for
@@ -1033,7 +1033,7 @@ class Conv2DTranspose(Conv2D):
     filters: Integer, the dimensionality of the output space (i.e. the number
       of filters in the convolution).
     kernel_size: A tuple or list of 2 positive integers specifying the spatial
-      dimensions of of the filters. Can be a single integer to specify the same
+      dimensions of the filters. Can be a single integer to specify the same
       value for all spatial dimensions.
     strides: A tuple or list of 2 positive integers specifying the strides
       of the convolution. Can be a single integer to specify the same value for
@@ -1233,7 +1233,7 @@ def conv2d_transpose(inputs,
     filters: Integer, the dimensionality of the output space (i.e. the number
       of filters in the convolution).
     kernel_size: A tuple or list of 2 positive integers specifying the spatial
-      dimensions of of the filters. Can be a single integer to specify the same
+      dimensions of the filters. Can be a single integer to specify the same
       value for all spatial dimensions.
     strides: A tuple or list of 2 positive integers specifying the strides
       of the convolution. Can be a single integer to specify the same value for
@@ -1350,6 +1350,7 @@ class Conv3DTranspose(Conv3D):
         trainable=trainable,
         name=name,
         **kwargs)
+    self.input_spec = base.InputSpec(ndim=5)
 
   def build(self, input_shape):
     if len(input_shape) != 5:
@@ -1391,6 +1392,9 @@ class Conv3DTranspose(Conv3D):
     else:
       c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
 
+    self.input_spec = base.InputSpec(ndim=5,
+                                     axes={c_axis: inputs_shape[c_axis]})
+
     depth = inputs_shape[d_axis]
     height = inputs_shape[h_axis]
     width = inputs_shape[w_axis]
@@ -1468,6 +1472,26 @@ class Conv3DTranspose(Conv3D):
       return self.activation(outputs)
     return outputs
 
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    output_shape = list(input_shape)
+    if self.data_format == 'channels_first':
+      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
+    else:
+      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
+
+    kernel_d, kernel_h, kernel_w = self.kernel_size
+    stride_d, stride_h, stride_w = self.strides
+
+    output_shape[c_axis] = self.filters
+    output_shape[d_axis] = utils.deconv_output_length(
+        output_shape[d_axis], stride_d, kernel_d, self.padding)
+    output_shape[h_axis] = utils.deconv_output_length(
+        output_shape[h_axis], stride_h, kernel_h, self.padding)
+    output_shape[w_axis] = utils.deconv_output_length(
+        output_shape[w_axis], stride_w, kernel_w, self.padding)
+    return tensor_shape.TensorShape(output_shape)
+
 
 def conv3d_transpose(inputs,
                      filters,
@@ -1492,7 +1516,7 @@ def conv3d_transpose(inputs,
     filters: Integer, the dimensionality of the output space (i.e. the number
       of filters in the convolution).
     kernel_size: A tuple or list of 3 positive integers specifying the spatial
-      dimensions of of the filters. Can be a single integer to specify the same
+      dimensions of the filters. Can be a single integer to specify the same
       value for all spatial dimensions.
     strides: A tuple or list of 3 positive integers specifying the strides
       of the convolution. Can be a single integer to specify the same value for
@@ -1501,8 +1525,9 @@ def conv3d_transpose(inputs,
     data_format: A string, one of `channels_last` (default) or `channels_first`.
       The ordering of the dimensions in the inputs.
       `channels_last` corresponds to inputs with shape
-      `(batch, height, width, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, height, width)`.
+      `(batch, depth, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, depth, height, width)`.
     activation: Activation function. Set it to None to maintain a
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index 407bc06dfebc61d64b56493fa26e1b7380f91be0..61dffe29500925e081fc32400ce2bf563bd78a65 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -57,6 +57,8 @@ class Dense(base.Layer):
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: Initializer function for the weight matrix.
+      If `None` (default), weights are initialized using the default
+      initializer used by `tf.get_variable`.
     bias_initializer: Initializer function for the bias.
     kernel_regularizer: Regularizer function for the weight matrix.
     bias_regularizer: Regularizer function for the bias.
@@ -186,6 +188,8 @@ def dense(
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: Initializer function for the weight matrix.
+      If `None` (default), weights are initialized using the default
+      initializer used by `tf.get_variable`.
     bias_initializer: Initializer function for the bias.
     kernel_regularizer: Regularizer function for the weight matrix.
     bias_regularizer: Regularizer function for the bias.
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index 780d1c2b8e02894f87423bf32a3d53baf3ac2f14..ad0f202f95928a2b250dbcea955b3f868b31fed9 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -123,6 +123,10 @@ class BatchNormalization(base.Layer):
     if self.fused and renorm:
       raise ValueError(
           'Batch renorm is currently not supported with fused batch norm.')
+    if self.fused and (beta_regularizer is not None or
+                       gamma_regularizer is not None):
+      raise ValueError('Regularizers are not currently '
+                       'supported for fused batch norm.')
     if renorm:
       renorm_clipping = renorm_clipping or {}
       keys = ['rmax', 'rmin', 'dmax']
@@ -153,7 +157,12 @@ class BatchNormalization(base.Layer):
                        ' is out of range for input with rank ' + str(ndim))
 
     if self.fused is None:
-      self.fused = not self.renorm and ndim == 4 and axis in [1, 3]
+      # Currently fused batch norm doesn't support renorm and beta/gamma
+      # regularizer; and only supports an input tensor of rank 4 and a channel
+      # dimension on axis 1 and 3.
+      self.fused = not self.renorm and ndim == 4 and axis in [
+          1, 3
+      ] and self.beta_regularizer is None and self.gamma_regularizer is None
 
     if self.fused:
       if axis == 1:
diff --git a/tensorflow/python/layers/normalization_test.py b/tensorflow/python/layers/normalization_test.py
index fa6c9c4a5db675131f10c9992d1500c446340120..64bebb1021c165d472fd80f6f4c466be192e6946 100644
--- a/tensorflow/python/layers/normalization_test.py
+++ b/tensorflow/python/layers/normalization_test.py
@@ -143,44 +143,45 @@ class BNTest(test.TestCase):
       self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
   def test4DInputAxis1(self):
-    epsilon = 1e-3
-    bn = normalization_layers.BatchNormalization(
-        axis=1, epsilon=epsilon, momentum=0.9)
-    inputs = variables.Variable(
-        np.random.random((5, 4, 3, 6)) + 100, dtype=dtypes.float32)
-    training = array_ops.placeholder(dtype='bool')
-    outputs = bn.apply(inputs, training=training)
+    if test.is_gpu_available(cuda_only=True):
+      epsilon = 1e-3
+      bn = normalization_layers.BatchNormalization(
+          axis=1, epsilon=epsilon, momentum=0.9)
+      inputs = variables.Variable(
+          np.random.random((5, 4, 3, 6)) + 100, dtype=dtypes.float32)
+      training = array_ops.placeholder(dtype='bool')
+      outputs = bn.apply(inputs, training=training)
 
-    with self.test_session() as sess:
-      # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
-      np_gamma = np.reshape(np_gamma, (1, 4, 1, 1))
-      np_beta = np.reshape(np_beta, (1, 4, 1, 1))
-      for _ in range(100):
-        np_output, _, _ = sess.run([outputs] + bn.updates,
-                                   feed_dict={training: True})
-        # Verify that the axis is normalized during training.
-        normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta
-        self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
-        self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
+      with self.test_session(use_gpu=True) as sess:
+        # Test training with placeholder learning phase.
+        sess.run(variables.global_variables_initializer())
+        np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+        np_gamma = np.reshape(np_gamma, (1, 4, 1, 1))
+        np_beta = np.reshape(np_beta, (1, 4, 1, 1))
+        for _ in range(100):
+          np_output, _, _ = sess.run(
+              [outputs] + bn.updates, feed_dict={training: True})
+          # Verify that the axis is normalized during training.
+          normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta
+          self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
+          self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
-      # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
-      mean = np.mean(np_inputs, axis=(0, 2, 3))
-      std = np.std(np_inputs, axis=(0, 2, 3))
-      variance = np.square(std)
-      self.assertAllClose(mean, moving_mean, atol=1e-2)
-      self.assertAllClose(variance, moving_var, atol=1e-2)
+        # Verify that the statistics are updated during training.
+        moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
+        np_inputs = sess.run(inputs)
+        mean = np.mean(np_inputs, axis=(0, 2, 3))
+        std = np.std(np_inputs, axis=(0, 2, 3))
+        variance = np.square(std)
+        self.assertAllClose(mean, moving_mean, atol=1e-2)
+        self.assertAllClose(variance, moving_var, atol=1e-2)
 
-      # Test inference with placeholder learning phase.
-      np_output = sess.run(outputs, feed_dict={training: False})
+        # Test inference with placeholder learning phase.
+        np_output = sess.run(outputs, feed_dict={training: False})
 
-      # Verify that the axis is normalized during inference.
-      normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta
-      self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
-      self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
+        # Verify that the axis is normalized during inference.
+        normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta
+        self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
+        self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
   def test4DInputAxis2(self):
     epsilon = 1e-3
diff --git a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
index 8934d39e47381def48ecb7445a5a32ae58458c6b..f468e0b70eaac395b594ad4ab6f7b4df1a3420a3 100644
--- a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
@@ -49,11 +49,14 @@ void DelayedNumpyDecref(void* data, size_t len, void* obj) {
 // Actually dereferences cached numpy arrays. REQUIRES being called while
 // holding the GIL.
 void ClearDecrefCache() {
-  mutex_lock ml(*DelayedDecrefLock());
-  for (void* obj : *DecrefCache()) {
+  std::vector<void*> cache_copy;
+  {
+    mutex_lock ml(*DelayedDecrefLock());
+    cache_copy.swap(*DecrefCache());
+  }
+  for (void* obj : cache_copy) {
     Py_DECREF(reinterpret_cast<PyObject*>(obj));
   }
-  DecrefCache()->clear();
 }
 
 // Structure which keeps a reference to a Tensor alive while numpy has a pointer
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index c48296eccb0fcad0626ffa8a57913167c80e6fc2..a1618d5349cd1b0fbd97bdaa3ddd86737650073f 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <array>
 
 #include "numpy/arrayobject.h"
+#include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
diff --git a/tensorflow/python/lib/core/strings.i b/tensorflow/python/lib/core/strings.i
index b74eb91cd55259923803448b951221aebd77b61d..938c13e30eb7b00a8225c8e95c7d53f2dd8398c3 100644
--- a/tensorflow/python/lib/core/strings.i
+++ b/tensorflow/python/lib/core/strings.i
@@ -80,7 +80,7 @@ bool _BytesToStringPiece(PyObject* obj, tensorflow::StringPiece* result) {
   }
 }
 
-// Converts a C++ string vector to a a list of Python bytes objects.
+// Converts a C++ string vector to a list of Python bytes objects.
 %typemap(out) std::vector<string> {
   const int size = $1.size();
   auto temp_string_list = tensorflow::make_safe(PyList_New(size));
diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py
index c212d2071f216881b58c6a2a37626eaebd3be4ca..c4ac97a7584b2b6a01e107f4ca73348110172004 100644
--- a/tensorflow/python/lib/io/file_io.py
+++ b/tensorflow/python/lib/io/file_io.py
@@ -27,6 +27,7 @@ import uuid
 import six
 
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
@@ -441,11 +442,8 @@ def is_directory(dirname):
   Returns:
     True, if the path is a directory; False otherwise
   """
-  try:
-    status = pywrap_tensorflow.TF_NewStatus()
-    return pywrap_tensorflow.IsDirectory(compat.as_bytes(dirname), status)
-  finally:
-    pywrap_tensorflow.TF_DeleteStatus(status)
+  status = c_api_util.ScopedTFStatus()
+  return pywrap_tensorflow.IsDirectory(compat.as_bytes(dirname), status)
 
 
 def list_directory(dirname):
diff --git a/tensorflow/python/lib/io/py_record_writer.cc b/tensorflow/python/lib/io/py_record_writer.cc
index df35c43c3d4abdd6a3e0bf7c1fb07bedf556afb7..ba749da47a57305a8d414a946c1290f4982cc759 100644
--- a/tensorflow/python/lib/io/py_record_writer.cc
+++ b/tensorflow/python/lib/io/py_record_writer.cc
@@ -55,6 +55,14 @@ bool PyRecordWriter::WriteRecord(tensorflow::StringPiece record) {
   return s.ok();
 }
 
+void PyRecordWriter::Flush(TF_Status* out_status) {
+  Status s = writer_->Flush();
+  if (!s.ok()) {
+    Set_TF_Status_from_Status(out_status, s);
+    return;
+  }
+}
+
 void PyRecordWriter::Close(TF_Status* out_status) {
   Status s = writer_->Close();
   if (!s.ok()) {
diff --git a/tensorflow/python/lib/io/py_record_writer.h b/tensorflow/python/lib/io/py_record_writer.h
index 8c53420ce687ab5878d1ffd7eae6579a48f112c5..9d66c031d456aa5b31ca848d5920887f2d71375b 100644
--- a/tensorflow/python/lib/io/py_record_writer.h
+++ b/tensorflow/python/lib/io/py_record_writer.h
@@ -44,6 +44,7 @@ class PyRecordWriter {
   ~PyRecordWriter();
 
   bool WriteRecord(tensorflow::StringPiece record);
+  void Flush(TF_Status* out_status);
   void Close(TF_Status* out_status);
 
  private:
diff --git a/tensorflow/python/lib/io/py_record_writer.i b/tensorflow/python/lib/io/py_record_writer.i
index 9e61c9893a7ba95f16af97a670f5de1de5315a2c..3181c9afce31b64a131b897a0e170a065fdf5069 100644
--- a/tensorflow/python/lib/io/py_record_writer.i
+++ b/tensorflow/python/lib/io/py_record_writer.i
@@ -45,6 +45,7 @@ limitations under the License.
 %unignore tensorflow::io::PyRecordWriter;
 %unignore tensorflow::io::PyRecordWriter::~PyRecordWriter;
 %unignore tensorflow::io::PyRecordWriter::WriteRecord;
+%unignore tensorflow::io::PyRecordWriter::Flush;
 %unignore tensorflow::io::PyRecordWriter::Close;
 %unignore tensorflow::io::PyRecordWriter::New;
 
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py
index 3d0cdc2153c211c6f6e804dfefda826c1dec2730..df190100689bd864de78f5a2cf52b1ade081a789 100644
--- a/tensorflow/python/lib/io/tf_record.py
+++ b/tensorflow/python/lib/io/tf_record.py
@@ -121,6 +121,11 @@ class TFRecordWriter(object):
     """
     self._writer.WriteRecord(record)
 
+  def flush(self):
+    """Flush the file."""
+    with errors.raise_exception_on_not_ok_status() as status:
+      self._writer.Flush(status)
+
   def close(self):
     """Close the file."""
     with errors.raise_exception_on_not_ok_status() as status:
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 5c6d309e6c766b135089b3659b90687505114168..2f6e81f322c4725196a6cd4ae6f811b80b9cb3cc 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -55,7 +55,7 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
     dim_index: An interger index of concat_dim or axis parameter in op.inputs.
 
   Returns:
-    Tensors represending the partial gradients with respect to each input
+    Tensors representing the partial gradients with respect to each input
     of the op.
 
   Raises:
@@ -373,6 +373,67 @@ def _GatherGrad(op, grad):
   return [ops.IndexedSlices(values, indices, params_shape), None]
 
 
+@ops.RegisterGradient("GatherV2")
+def _GatherV2Grad(op, grad):
+  """Gradient for GatherV2 op."""
+  # params can be large, so colocate the shape calculation with it.
+  #
+  # params can be very large for sparse model, array_ops.shape raises
+  # exception on the Windows platform when any dimension is larger than
+  # int32. params_shape is not used in optimizer apply_sparse gradients,
+  # so it's fine to convert it back to int32 regardless of truncation.
+  params = op.inputs[0]
+  with ops.colocate_with(params):
+    params_shape = array_ops.shape(params, out_type=ops.dtypes.int64)
+    params_shape = math_ops.to_int32(params_shape)
+
+  indices = op.inputs[1]
+  indices_size = array_ops.expand_dims(array_ops.size(indices), 0)
+  axis = op.inputs[2]
+  axis_static = tensor_util.constant_value(axis)
+
+  # For axis 0 gathers, build an appropriately shaped IndexedSlices.
+  if axis_static == 0:
+    values_shape = array_ops.concat([indices_size, params_shape[1:]], 0)
+    values = array_ops.reshape(grad, values_shape)
+    indices = array_ops.reshape(indices, indices_size)
+    return [ops.IndexedSlices(values, indices, params_shape), None, None]
+
+  outer_shape = params_shape[:axis]
+  outer_dims = array_ops.size(outer_shape)
+  inner_shape = params_shape[axis:][1:]
+  inner_dims = array_ops.size(inner_shape)
+
+  outer_axes_indices = math_ops.range(outer_dims)
+  inner_axes_indices = math_ops.range(outer_dims + 1,
+                                      outer_dims + 1 + inner_dims)
+
+  values_shape = array_ops.concat([outer_shape, indices_size, inner_shape], 0)
+  values = array_ops.reshape(grad, values_shape)
+  indices = array_ops.reshape(indices, indices_size)
+
+  # We need to sum up every slice `values[..., i, ....]` corresponding to
+  # `params[..., indices[i], ...]`. Since `unsorted_segment_sum` does not
+  # support an axis parameter, we transpose the gather dimension to the front,
+  # then use `unsorted_segment_sum` to build a
+  # [gather_axis, outer_axes, inner_axes] tensor with all the gradients
+  # affecting each index in `gather_axis` summed up.
+  transpose_dims = array_ops.concat(
+      [[outer_dims], outer_axes_indices, inner_axes_indices], 0)
+  values_transpose = array_ops.transpose(values, transpose_dims)
+  num_segments = params_shape[axis]
+
+  params_grad = math_ops.unsorted_segment_sum(
+      values_transpose, indices, num_segments)
+
+  # Inverts the above transpose by moving dimension 0 back to its original
+  # position.
+  invert_transpose_dims = array_ops.concat(
+      [outer_axes_indices + 1, [0], inner_axes_indices], 0)
+  params_grad = array_ops.transpose(params_grad, invert_transpose_dims)
+  return [params_grad, None, None]
+
+
 @ops.RegisterGradient("GatherNd")
 def _GatherNdGrad(op, grad):
   ref = op.inputs[0]
@@ -470,7 +531,6 @@ def _TileGrad(op, grad):
 ops.NotDifferentiable("BroadcastGradientArgs")
 
 
-@ops.RegisterGradient("Pad")
 def _PadGrad(op, grad):
   """Gradient for Pad."""
   # Pad introduces values around the original tensor, so the gradient function
@@ -483,7 +543,14 @@ def _PadGrad(op, grad):
   # Make it a 1-D tensor.
   begin = array_ops.reshape(pad_before, [-1])
   sizes = array_ops.shape(x)
-  return array_ops.slice(grad, begin, sizes), None
+  x_grad = array_ops.slice(grad, begin, sizes)
+  if len(op.inputs) == 3:
+    return x_grad, None, None
+  else:
+    return x_grad, None
+
+ops.RegisterGradient("Pad")(_PadGrad)
+ops.RegisterGradient("PadV2")(_PadGrad)
 
 
 # ReverseSequence is just a permutation.  The gradient permutes back.
@@ -586,6 +653,12 @@ def _QuantizeAndDequantizeV2Grad(_, grad):
   return [grad, None, None]
 
 
+@ops.RegisterGradient("QuantizeAndDequantizeV3")
+def _QuantizeAndDequantizeV3Grad(_, grad):
+  # Only propagate the gradient for the unquantized input.
+  return [grad, None, None, None]
+
+
 @ops.RegisterGradient("ExtractImagePatches")
 def _ExtractImagePatchesGrad(op, grad):
 
@@ -670,3 +743,10 @@ def _ScatterNdGrad(op, grad):
   indices = op.inputs[0]
   updates_grad = array_ops.gather_nd(grad, indices)
   return [None, updates_grad, None]
+
+
+@ops.RegisterGradient("ScatterNdNonAliasingAdd")
+def _ScatterNdNonAliasingAddGrad(op, grad):
+  indices = op.inputs[1]
+  updates_grad = array_ops.gather_nd(grad, indices)
+  return [grad, None, updates_grad]
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 86a59ff9e30c1886b7ed616e9af1fc1c3248a471..f64c89ac5d2c5e9752958bd14bf62fdd6152bc07 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -150,7 +150,8 @@ def expand_dims(input, axis=None, name=None, dim=None):
   Args:
     input: A `Tensor`.
     axis: 0-D (scalar). Specifies the dimension index at which to
-      expand the shape of `input`.
+      expand the shape of `input`. Must be in the range
+      `[-rank(input) - 1, rank(input)]`.
     name: The name of the output `Tensor`.
     dim: 0-D (scalar). Equivalent to `axis`, to be deprecated.
 
@@ -329,7 +330,7 @@ def rank(input, name=None):
   # pylint: disable=redefined-builtin
   """Returns the rank of a tensor.
 
-  This operation returns an integer representing the rank of `input`.
+  Returns a 0-D `int32` `Tensor` representing the rank of `input`.
 
   For example:
 
@@ -381,6 +382,13 @@ def rank_internal(input, name=None, optimize=True):
       return gen_array_ops.rank(input, name=name)
 
 
+def _one_like_dtype(other):
+  if isinstance(other, ops.Tensor):
+    return constant(1, other.dtype)
+  else:
+    return np.ones_like(other).dtype.type(1)
+
+
 def _SliceHelper(tensor, slice_spec, var=None):
   """Overload for Tensor.__getitem__.
 
@@ -444,7 +452,6 @@ def _SliceHelper(tensor, slice_spec, var=None):
   ellipsis_mask = 0
   for s in slice_spec:
     if isinstance(s, _baseslice):
-      strides.append(s.step if s.step is not None else 1)
       # python doesn't always use None when constructing ranges
       # for example a[:] gives slice(None,sys.maxsize,None)
       # whereas a[::1] gives slice(None,None,None)
@@ -458,6 +465,11 @@ def _SliceHelper(tensor, slice_spec, var=None):
       else:
         end.append(0)
         end_mask |= (1 << index)
+      if s.step is not None:
+        strides.append(s.step)
+      else:
+        # Use a 1 of the same dtype as begin.
+        strides.append(_one_like_dtype(begin[-1]))
     elif s is Ellipsis:
       begin.append(0)
       end.append(0)
@@ -471,10 +483,7 @@ def _SliceHelper(tensor, slice_spec, var=None):
     else:
       begin.append(s)
       end.append(s + 1)
-      if isinstance(s, ops.Tensor):
-        strides.append(constant(1, s.dtype))
-      else:
-        strides.append(np.ones_like(s).dtype.type(1))
+      strides.append(_one_like_dtype(s))
       shrink_axis_mask |= (1 << index)
     index += 1
 
@@ -514,6 +523,10 @@ def slice(input_, begin, size, name=None):
   words, `begin[i]` is the offset into the 'i'th dimension of `input` that you
   want to slice from.
 
+  Note that @{tf.Tensor.__getitem__} is typically a more pythonic way to
+  perform slices, as it allows you to write `foo[3:7, :-2]` instead of
+  `tf.slice([3, 0], [4, foo.get_shape()[1]-2])`.
+
   `begin` is zero-based; `size` is one-based. If `size[i]` is -1,
   all remaining elements in dimension i are included in the
   slice. In other words, this is equivalent to setting:
@@ -709,10 +722,10 @@ def _SliceHelperVar(var, slice_spec):
   A = tf.Variable([[1,2,3], [4,5,6], [7,8,9]], dtype=tf.float32)
   with tf.Session() as sess:
     sess.run(tf.global_variables_initializer())
-    print sess.run(A[:2, :2]) # => [[1,2], [4,5]]
+    print(sess.run(A[:2, :2]))  # => [[1,2], [4,5]]
 
     op = A[:2,:2].assign(22. * tf.ones((2, 2)))
-    print sess.run(op) # => [[22, 22, 3], [22, 22, 6], [7,8,9]]
+    print(sess.run(op))  # => [[22, 22, 3], [22, 22, 6], [7,8,9]]
   ```
 
   Note that assignments currently do not support NumPy broadcasting
@@ -760,10 +773,10 @@ def parallel_stack(values, name="parallel_stack"):
   The difference between `stack` and `parallel_stack` is that `stack` requires
   all the inputs be computed before the operation will begin but doesn't require
   that the input shapes be known during graph construction.
-  
+
   `parallel_stack` will copy pieces of the input into the output as they become
   available, in some situations this can provide a performance benefit.
-  
+
   Unlike `stack`, `parallel_stack` does NOT support backpropagation.
 
   This is the opposite of unstack.  The numpy equivalent is
@@ -818,7 +831,7 @@ def stack(values, axis=0, name="stack"):
   Args:
     values: A list of `Tensor` objects with the same shape and type.
     axis: An `int`. The axis to stack along. Defaults to the first dimension.
-      Supports negative indexes.
+      Negative values wrap around, so the valid range is `[-(R+1), R+1)`.
     name: A name for this operation (optional).
 
   Returns:
@@ -958,7 +971,7 @@ def unstack(value, num=None, axis=0, name="unstack"):
     num: An `int`. The length of the dimension `axis`. Automatically inferred
       if `None` (the default).
     axis: An `int`. The axis to unstack along. Defaults to the first
-      dimension. Supports negative indexes.
+      dimension. Negative values wrap around, so the valid range is `[-R, R)`.
     name: A name for the operation (optional).
 
   Returns:
@@ -1029,7 +1042,8 @@ def concat(values, axis, name="concat"):
 
   Args:
     values: A list of `Tensor` objects or a single `Tensor`.
-    axis: 0-D `int32` `Tensor`.  Dimension along which to concatenate.
+    axis: 0-D `int32` `Tensor`.  Dimension along which to concatenate. Must be
+      in the range `[-rank(values), rank(values))`.
     name: A name for the operation (optional).
 
   Returns:
@@ -1369,7 +1383,7 @@ def zeros(shape, dtype=dtypes.float32, name=None):
   ```
 
   Args:
-    shape: Either a list of integers, or a 1-D `Tensor` of type `int32`.
+    shape: A list of integers, a tuple of integers, or a 1-D `Tensor` of type `int32`.
     dtype: The type of an element in the resulting `Tensor`.
     name: A name for the operation (optional).
 
@@ -1483,7 +1497,7 @@ def ones(shape, dtype=dtypes.float32, name=None):
   ```
 
   Args:
-    shape: Either a list of integers, or a 1-D `Tensor` of type `int32`.
+    shape: A list of integers, a tuple of integers, or a 1-D `Tensor` of type `int32`.
     dtype: The type of an element in the resulting `Tensor`.
     name: A name for the operation (optional).
 
@@ -1538,13 +1552,12 @@ def placeholder(dtype, shape=None, name=None):
 
 # pylint: disable=redefined-outer-name
 def _normalize_sparse_shape(shape, name):
-  """Takes numpy array or Tensor or None and returns either None or Tensor."""
-  if shape is None: return None
-  if not isinstance(shape, ops.Tensor):
-    for el in shape:
-      if el is None:
-        return None
-  return ops.convert_to_tensor(shape, name=name)
+  """Returns a tuple of (Tensor or None, rank or None)."""
+  if shape is None: return (None, None)
+  rank = shape.get_shape()[0] if isinstance(shape, ops.Tensor) else len(shape)
+  if not isinstance(shape, ops.Tensor) and None in shape:
+    return (None, rank)
+  return (ops.convert_to_tensor(shape, dtype=dtypes.int64, name=name), rank)
 
 
 def sparse_placeholder(dtype, shape=None, name=None):
@@ -1587,9 +1600,9 @@ def sparse_placeholder(dtype, shape=None, name=None):
     evaluated directly.
   """
   shape_name = (name + "/shape") if name is not None else None
-  shape = _normalize_sparse_shape(shape, shape_name)
+  shape, rank = _normalize_sparse_shape(shape, shape_name)
   if shape is None:
-    shape = placeholder(dtypes.int64, shape=[None], name=shape_name)
+    shape = placeholder(dtypes.int64, shape=[rank], name=shape_name)
   return sparse_tensor.SparseTensor(
       values=placeholder(
           dtype, shape=[None],
@@ -1601,7 +1614,7 @@ def sparse_placeholder(dtype, shape=None, name=None):
 # pylint: enable=redefined-outer-name
 
 
-def pad(tensor, paddings, mode="CONSTANT", name=None):  # pylint: disable=invalid-name
+def pad(tensor, paddings, mode="CONSTANT", name=None, constant_values=0):  # pylint: disable=invalid-name
   """Pads a tensor.
 
   This operation pads a `tensor` according to the `paddings` you specify.
@@ -1623,6 +1636,7 @@ def pad(tensor, paddings, mode="CONSTANT", name=None):  # pylint: disable=invali
   ```python
   # 't' is [[1, 2, 3], [4, 5, 6]].
   # 'paddings' is [[1, 1,], [2, 2]].
+  # 'constant_values' is 0.
   # rank of 't' is 2.
   pad(t, paddings, "CONSTANT") ==> [[0, 0, 0, 0, 0, 0, 0],
                                     [0, 0, 1, 2, 3, 0, 0],
@@ -1645,6 +1659,8 @@ def pad(tensor, paddings, mode="CONSTANT", name=None):  # pylint: disable=invali
     paddings: A `Tensor` of type `int32`.
     mode: One of "CONSTANT", "REFLECT", or "SYMMETRIC" (case-insensitive)
     name: A name for the operation (optional).
+    constant_values: In "CONSTANT" mode, the scalar pad value to use. Must be
+      same type as `tensor`.
 
   Returns:
     A `Tensor`. Has the same type as `tensor`.
@@ -1657,7 +1673,12 @@ def pad(tensor, paddings, mode="CONSTANT", name=None):  # pylint: disable=invali
   # NumPy uses all lower-case modes.
   mode = mode.upper()
   if mode == "CONSTANT":
-    return gen_array_ops._pad(tensor, paddings, name=name)
+    # TODO(rjryan): Once the forward compatibility period (3 weeks) have passed
+    # remove the "Pad" fallback here.
+    if constant_values != 0:
+      return gen_array_ops._pad_v2(tensor, paddings, constant_values, name=name)
+    else:
+      return gen_array_ops._pad(tensor, paddings, name=name)
   if mode == "REFLECT":
     return gen_array_ops._mirror_pad(tensor,
                                      paddings,
@@ -2281,6 +2302,7 @@ def squeeze(input, axis=None, name=None, squeeze_dims=None):
     axis: An optional list of `ints`. Defaults to `[]`.
       If specified, only squeezes the dimensions listed. The dimension
       index starts at 0. It is an error to squeeze a dimension that is not 1.
+      Must be in the range `[-rank(input), rank(input))`.
     name: A name for the operation (optional).
     squeeze_dims: Deprecated keyword argument that is now axis.
 
@@ -2380,3 +2402,14 @@ reverse_sequence.__doc__ = deprecation.rewrite_argument_docstring(
     deprecation.rewrite_argument_docstring(
         gen_array_ops.reverse_sequence.__doc__, "batch_dim", "batch_axis"),
     "seq_dim", "seq_axis")
+
+
+def gather(params, indices, validate_indices=None, name=None, axis=0):
+  # TODO(rjryan): Remove "Gather" creation in favor of GatherV2 once the forward
+  # compatibility 3 week period has passed.
+  if axis == 0:
+    return gen_array_ops.gather(params, indices,
+                                validate_indices=validate_indices, name=name)
+  else:
+    return gen_array_ops.gather_v2(params, indices, axis, name=name)
+gather.__doc__ = gen_array_ops.gather_v2.__doc__
diff --git a/tensorflow/python/ops/bitwise_ops.py b/tensorflow/python/ops/bitwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..44daf1353706f9c91679a7cd3d8a6ea77d17b879
--- /dev/null
+++ b/tensorflow/python/ops/bitwise_ops.py
@@ -0,0 +1,41 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Operations for manipulating the binary representations of integers.
+
+@@bitwise_and
+@@bitwise_or
+@@bitwise_xor
+@@invert
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import
+from tensorflow.python.ops.gen_bitwise_ops import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+ops.NotDifferentiable("BitwiseAnd")
+ops.NotDifferentiable("BitwiseOr")
+ops.NotDifferentiable("BitwiseXor")
+ops.NotDifferentiable("Invert")
+ops.NotDifferentiable("PopulationCount")
+
+remove_undocumented(__name__)
diff --git a/tensorflow/python/ops/bitwise_ops_test.py b/tensorflow/python/ops/bitwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d08c8f82dcb01e9d4c386b6a5033bf9628014ec
--- /dev/null
+++ b/tensorflow/python/ops/bitwise_ops_test.py
@@ -0,0 +1,97 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for bitwise operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import six
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import bitwise_ops
+from tensorflow.python.ops import gen_bitwise_ops
+from tensorflow.python.platform import googletest
+
+
+class BitwiseOpTest(test_util.TensorFlowTestCase):
+
+  def __init__(self, method_name="runTest"):
+    super(BitwiseOpTest, self).__init__(method_name)
+
+  def testBinaryOps(self):
+    dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+                  dtypes.uint8, dtypes.uint16]
+
+    with self.test_session(use_gpu=True) as sess:
+      for dtype in dtype_list:
+        lhs = constant_op.constant([0, 5, 3, 14], dtype=dtype)
+        rhs = constant_op.constant([5, 0, 7, 11], dtype=dtype)
+        and_result, or_result, xor_result = sess.run(
+            [bitwise_ops.bitwise_and(lhs, rhs),
+             bitwise_ops.bitwise_or(lhs, rhs),
+             bitwise_ops.bitwise_xor(lhs, rhs)])
+        self.assertAllEqual(and_result, [0, 0, 3, 10])
+        self.assertAllEqual(or_result, [5, 5, 7, 15])
+        self.assertAllEqual(xor_result, [5, 5, 4, 5])
+
+  def testPopulationCountOp(self):
+    dtype_list = [dtypes.int8, dtypes.int16,
+                  dtypes.int32, dtypes.int64,
+                  dtypes.uint8, dtypes.uint16]
+    raw_inputs = [0, 1, -1, 3, -3, 5, -5, 14, -14,
+                  127, 128, 255, 256, 65535, 65536,
+                  2**31 - 1, 2**31, 2**32 - 1, 2**32, -2**32 + 1, -2**32,
+                  -2**63 + 1, 2**63 - 1]
+    def count_bits(x):
+      return sum([bin(z).count("1") for z in six.iterbytes(x.tobytes())])
+    for dtype in dtype_list:
+      with self.test_session(use_gpu=True) as sess:
+        print("PopulationCount test: ", dtype)
+        inputs = np.array(raw_inputs, dtype=dtype.as_numpy_dtype)
+        truth = [count_bits(x) for x in inputs]
+        input_tensor = constant_op.constant(inputs, dtype=dtype)
+        popcnt_result = sess.run(gen_bitwise_ops.population_count(input_tensor))
+        self.assertAllEqual(truth, popcnt_result)
+
+  def testInvertOp(self):
+    dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+                  dtypes.uint8, dtypes.uint16]
+    inputs = [0, 5, 3, 14]
+    with self.test_session(use_gpu=True) as sess:
+      for dtype in dtype_list:
+        # Because of issues with negative numbers, let's test this indirectly.
+        # 1. invert(a) and a = 0
+        # 2. invert(a) or a = invert(0)
+        input_tensor = constant_op.constant(inputs, dtype=dtype)
+        not_a_and_a, not_a_or_a, not_0 = sess.run(
+            [bitwise_ops.bitwise_and(
+                input_tensor, bitwise_ops.invert(input_tensor)),
+             bitwise_ops.bitwise_or(
+                 input_tensor, bitwise_ops.invert(input_tensor)),
+             bitwise_ops.invert(constant_op.constant(0, dtype=dtype))])
+        self.assertAllEqual(not_a_and_a, [0, 0, 0, 0])
+        self.assertAllEqual(not_a_or_a, [not_0] * 4)
+        # For unsigned dtypes let's also check the result directly.
+        if dtype.is_unsigned:
+          inverted = sess.run(bitwise_ops.invert(input_tensor))
+          expected = [dtype.max - x for x in inputs]
+          self.assertAllEqual(inverted, expected)
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 478e0a9472b053ebceac50cb0a2dff11f3c257ce..4ba812eaf5d03eaf79eeb799ed06a0c532278226 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -61,6 +61,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_control_flow_ops
@@ -983,9 +984,16 @@ class GradLoopState(object):
             # the right control flow context.
             real_value = self._grad_context.AddValue(cur_value)
             break
+        elif constant_op.is_constant(cur_value):
+          # If the value to be forwarded is a constant, clone the constant in
+          # the gradient loop rather than using a stack.
+          # TODO(phawkins): consider hoisting the constant out of the loop
+          # instead.
+          real_value = constant_op.constant(
+              tensor_util.constant_value(cur_value), dtype=cur_value.dtype)
+          break
         else:
           # Record the history of this value in forward_ctxt.
-          # TODO(yuanbyu): Avoid recording constants.
           self._grad_context.Exit()
           history_value = cur_grad_state.AddForwardAccumulator(cur_value)
           self._grad_context.Enter()
@@ -1363,18 +1371,22 @@ class ControlFlowContext(object):
       import_scope: Optional `string`. Name scope to add.
     """
     assert isinstance(values_def, control_flow_pb2.ValuesDef)
-    self._values = set(values_def.values)
+    self._values = set(
+        ops.prepend_name_scope(value, import_scope)
+        for value in values_def.values)
     g = ops.get_default_graph()
     self._external_values = {}
     for k, v in values_def.external_values.items():
+      k = ops.prepend_name_scope(k, import_scope)
       self._external_values[k] = g.as_graph_element(
           ops.prepend_name_scope(v, import_scope))
-    op_names = set([op.split(":")[0]
-                    for op in self._values - set(self._external_values)])
+    op_names = set([
+        op.split(":")[0]
+        for op in self._values - set(self._external_values.keys())
+    ])
     for op in op_names:
       # pylint: disable=protected-access
-      g.as_graph_element(ops.prepend_name_scope(
-          op, import_scope))._set_control_flow_context(self)
+      g.as_graph_element(op)._set_control_flow_context(self)
       # pylint: enable=protected-access
 
   @property
@@ -1404,6 +1416,7 @@ class ControlFlowContext(object):
         [ops.strip_name_scope(v, export_scope)
          for v in sorted(self._values)])
     for k, v in self._external_values.items():
+      k = ops.strip_name_scope(k, export_scope)
       values_def.external_values[k] = ops.strip_name_scope(
           v.name, export_scope)
     return values_def
@@ -1798,7 +1811,7 @@ def cond(pred, true_fn=None, false_fn=None, strict=False, name=None,
   if not callable(false_fn):
     raise TypeError("false_fn must be callable.")
 
-  with ops.name_scope(name, "cond", [pred]) as name:
+  with ops.name_scope(name, "cond", [pred]):
     # Add the Switch to the graph.
     if isinstance(pred, bool):
       raise TypeError("pred must not be a Python bool")
@@ -2754,7 +2767,7 @@ def while_loop(cond, body, loop_vars, shape_invariants=None,
   ```
 
   """
-  with ops.name_scope(name, "while", loop_vars) as name:
+  with ops.name_scope(name, "while", loop_vars):
     if not loop_vars:
       raise ValueError("No loop variables provided")
     if not callable(cond):
@@ -2767,7 +2780,7 @@ def while_loop(cond, body, loop_vars, shape_invariants=None,
     if shape_invariants is not None:
       nest.assert_same_structure(loop_vars, shape_invariants)
 
-    context = WhileContext(parallel_iterations, back_prop, swap_memory, name)
+    context = WhileContext(parallel_iterations, back_prop, swap_memory)
     ops.add_to_collection(ops.GraphKeys.WHILE_CONTEXT, context)
     result = context.BuildLoop(cond, body, loop_vars, shape_invariants)
     return result
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index 4e95783e5a81f01499bb3d164683d34de258b9b9..c23957443fd6f4df0eb57af240c98ad0d1318a05 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -43,7 +43,6 @@ import tensorflow.python.ops.tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import googletest
 from tensorflow.python.training import momentum
 from tensorflow.python.util import nest
-from tensorflow.python.util.protobuf import compare
 
 
 TestTuple = collections.namedtuple("TestTuple", "a b")
@@ -399,7 +398,7 @@ class ContextTest(TensorFlowTestCase):
       for op in sess.graph.get_operations():
         c = op._get_control_flow_context()
         if c:
-          compare.ProtoEq(
+          self.assertProtoEquals(
               c.to_proto(),
               control_flow_ops.CondContext.from_proto(c.to_proto()).to_proto())
 
@@ -412,10 +411,35 @@ class ContextTest(TensorFlowTestCase):
       for op in sess.graph.get_operations():
         c = op._get_control_flow_context()
         if c:
-          compare.ProtoEq(
+          self.assertProtoEquals(
               c.to_proto(),
               control_flow_ops.WhileContext.from_proto(c.to_proto()).to_proto())
 
+  def testControlContextImportScope(self):
+    with self.test_session():
+      constant_op.constant(0, name="a")
+      constant_op.constant(2, name="test_scope/a")
+      b1 = constant_op.constant(1, name="b")
+      b2 = constant_op.constant(3, name="test_scope/b")
+
+      c = control_flow_ops.ControlFlowContext()
+      c._values = ["a", "b"]
+      c._external_values = {"a": b1}
+
+      c_with_scope = control_flow_ops.ControlFlowContext._from_proto(
+          c._to_proto(), import_scope="test_scope")
+
+      # _values and _external_values should be have scope prepended.
+      self.assertEquals(
+          c_with_scope._values, set(["test_scope/a", "test_scope/b"]))
+      self.assertEquals(
+          c_with_scope._external_values, {"test_scope/a": b2})
+
+      # Calling _to_proto() with export_scope should remove "test_scope".
+      self.assertProtoEquals(
+          c._to_proto(),
+          c_with_scope._to_proto(export_scope="test_scope"))
+
 
 def _GetNestedShape(nested):
   def _GetShape(tensor):
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index 829aa99284647aba22c1598e8adec0de7bd0864f..fbfbb50d8cb0cd267e71f89c367a0a1b0008797f 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -248,7 +248,7 @@ class QueueBase(object):
     if isinstance(vals, dict):
       if not self._names:
         raise ValueError("Queue must have names to enqueue a dictionary")
-      if sorted(self._names) != sorted(vals.keys()):
+      if sorted(self._names, key=str) != sorted(vals.keys(), key=str):
         raise ValueError("Keys in dictionary to enqueue do not match "
                          "names of Queue.  Dictionary: (%s), Queue: (%s)" %
                          (sorted(vals.keys()), sorted(self._names)))
@@ -512,8 +512,9 @@ class QueueBase(object):
     the given queue. Subsequent `enqueue` and `enqueue_many`
     operations will fail. Subsequent `dequeue` and `dequeue_many`
     operations will continue to succeed if sufficient elements remain
-    in the queue. Subsequent `dequeue` and `dequeue_many` operations
-    that would block will fail immediately.
+    in the queue. Subsequently dequeue and dequeue_many operations
+    that would otherwise block waiting for more elements (if close
+    hadn't been called) will now fail immediately.
 
     If `cancel_pending_enqueues` is `True`, all pending requests will also
     be canceled.
@@ -537,6 +538,25 @@ class QueueBase(object):
           self._queue_ref, cancel_pending_enqueues=cancel_pending_enqueues,
           name=name)
 
+  def is_closed(self, name=None):
+    """ Returns true if queue is closed.
+
+    This operation returns true if the queue is closed and false if the queue
+    is open.
+
+    Args:
+      name: A name for the operation (optional).
+
+    Returns:
+      True if the queue is closed and false if the queue is open.
+    """
+    if name is None:
+      name = "%s_Is_Closed" % self._name
+    if self._queue_ref.dtype == _dtypes.resource:
+      return gen_data_flow_ops.queue_is_closed_v2(self._queue_ref,name=name)
+    else:
+      return gen_data_flow_ops.queue_is_closed_(self._queue_ref,name=name)
+
   def size(self, name=None):
     """Compute the number of elements in this queue.
 
@@ -1532,10 +1552,7 @@ class BaseStagingArea(object):
       # The returned values in `tensors` are in the same order as
       # the names in `self._names`.
       return {self._names[i]: t for t, i in zip(tensors, indices)}
-    elif len(tensors) == 1:
-      return tensors[0]
-    else:
-      return tensors
+    return tensors
 
   def _scope_vals(self, vals):
     """Return a list of values to pass to `name_scope()`.
diff --git a/tensorflow/python/ops/distributions/__init__.py b/tensorflow/python/ops/distributions/__init__.py
index 563b189990cfed5d6418c7cfca6c0fdf4226995f..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644
--- a/tensorflow/python/ops/distributions/__init__.py
+++ b/tensorflow/python/ops/distributions/__init__.py
@@ -1,18 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Core module for TensorFlow distribution objects and helpers."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
diff --git a/tensorflow/python/ops/distributions/bernoulli.py b/tensorflow/python/ops/distributions/bernoulli.py
index 3281b57e83e374ddae9ac9cb1d4ef0154c12f836..2b981e7b19324792a0143c15004919781e318e8c 100644
--- a/tensorflow/python/ops/distributions/bernoulli.py
+++ b/tensorflow/python/ops/distributions/bernoulli.py
@@ -22,7 +22,6 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
@@ -121,7 +120,10 @@ class Bernoulli(distribution.Distribution):
     return math_ops.cast(sample, self.dtype)
 
   def _log_prob(self, event):
-    event = self._maybe_assert_valid_sample(event)
+    if self.validate_args:
+      event = distribution_util.embed_check_integer_casting_closed(
+          event, target_dtype=dtypes.bool)
+
     # TODO(jaana): The current sigmoid_cross_entropy_with_logits has
     # inconsistent  behavior for logits = inf/-inf.
     event = math_ops.cast(event, self.logits.dtype)
@@ -162,17 +164,6 @@ class Bernoulli(distribution.Distribution):
     """Returns `1` if `prob > 0.5` and `0` otherwise."""
     return math_ops.cast(self.probs > 0.5, self.dtype)
 
-  def _maybe_assert_valid_sample(self, event, check_integer=True):
-    if not self.validate_args:
-      return event
-    event = distribution_util.embed_check_nonnegative_discrete(
-        event, check_integer=check_integer)
-    return control_flow_ops.with_dependencies([
-        check_ops.assert_less_equal(
-            event, array_ops.ones_like(event),
-            message="event is not less than or equal to 1."),
-    ], event)
-
 
 class BernoulliWithSigmoidProbs(Bernoulli):
   """Bernoulli with `probs = nn.sigmoid(logits)`."""
diff --git a/tensorflow/python/ops/distributions/bijector.py b/tensorflow/python/ops/distributions/bijector.py
index 70e9fdadd20e42b5618a23f4b03aa24decd267ba..84bd0a20da38d15c5bd23d7e0b906063702b46de 100644
--- a/tensorflow/python/ops/distributions/bijector.py
+++ b/tensorflow/python/ops/distributions/bijector.py
@@ -19,9 +19,10 @@ from __future__ import division
 from __future__ import print_function
 
 # go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.python.ops.distributions.bijector_impl import *
-# pylint: enable=wildcard-import
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.ops.distributions.bijector_impl import Bijector
+
+# pylint: enable=wildcard-import,unused-import
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = ["Bijector"]
diff --git a/tensorflow/python/ops/distributions/bijector_impl.py b/tensorflow/python/ops/distributions/bijector_impl.py
index 7be7c27ae9a59118db45e6aa0ae649c0702d972d..49a8938f010cda282bd589b1b6dbb4b4b4340142 100644
--- a/tensorflow/python/ops/distributions/bijector_impl.py
+++ b/tensorflow/python/ops/distributions/bijector_impl.py
@@ -356,6 +356,9 @@ class Bijector(object):
       dtype: `tf.dtype` supported by this `Bijector`. `None` means dtype is not
         enforced.
       name: The name to give Ops created by the initializer.
+
+    Raises:
+      ValueError:  If a member of `graph_parents` is not a `Tensor`.
     """
     self._event_ndims = (
         ops.convert_to_tensor(event_ndims, dtype=dtypes.int32)
@@ -379,6 +382,10 @@ class Bijector(object):
         return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
       self._name = camel_to_snake(type(self).__name__.lstrip("_"))
 
+    for i, t in enumerate(self._graph_parents):
+      if t is None or not tensor_util.is_tensor(t):
+        raise ValueError("Graph parent item %d is not a Tensor; %s." % (i, t))
+
   @property
   def event_ndims(self):
     """Returns then number of event dimensions this bijector operates on."""
diff --git a/tensorflow/python/ops/distributions/bijectors.py b/tensorflow/python/ops/distributions/bijectors.py
new file mode 100644
index 0000000000000000000000000000000000000000..69c3a5d4c0ba86586ccb6e55e71d898b1bf7c035
--- /dev/null
+++ b/tensorflow/python/ops/distributions/bijectors.py
@@ -0,0 +1,31 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Core module for TensorFlow distribution bijectors."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.ops.distributions.bijector import Bijector
+from tensorflow.python.ops.distributions.identity_bijector import Identity
+
+# pylint: enable=wildcard-import,unused-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = ["Bijector", "Identity"]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/distributions/categorical.py b/tensorflow/python/ops/distributions/categorical.py
index bad7e6e42f008849a60759e69f3902a3a713f293..84ca6db4c4b4deea86fb0a0f626eda42f0283d1f 100644
--- a/tensorflow/python/ops/distributions/categorical.py
+++ b/tensorflow/python/ops/distributions/categorical.py
@@ -31,31 +31,104 @@ from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import util as distribution_util
 
 
+def _broadcast_cat_event_and_params(event, params, base_dtype=dtypes.int32):
+  """Broadcasts the event or distribution parameters."""
+  if event.shape.ndims is None:
+    raise NotImplementedError(
+        "Cannot broadcast with an event tensor of unknown rank.")
+
+  if event.dtype.is_integer:
+    pass
+  elif event.dtype.is_floating:
+    # When `validate_args=True` we've already ensured int/float casting
+    # is closed.
+    event = math_ops.cast(event, dtype=dtypes.int32)
+  else:
+    raise TypeError("`value` should have integer `dtype` or "
+                    "`self.dtype` ({})".format(base_dtype))
+
+  if params.get_shape()[:-1] == event.get_shape():
+    params = params
+  else:
+    params *= array_ops.ones_like(
+        array_ops.expand_dims(event, -1), dtype=params.dtype)
+    params_shape = array_ops.shape(params)[:-1]
+    event *= array_ops.ones(params_shape, dtype=event.dtype)
+    event.set_shape(tensor_shape.TensorShape(params.get_shape()[:-1]))
+  return event, params
+
+
 class Categorical(distribution.Distribution):
   """Categorical distribution.
 
-  The categorical distribution is parameterized by the log-probabilities
-  of a set of classes.
+  The Categorical distribution is parameterized by either probabilities or
+  log-probabilities of a set of `K` classes. It is defined over the integers
+  `{0, 1, ..., K}`.
+
+  The Categorical distribution is closely related to the `OneHotCategorical` and
+  `Multinomial` distributions.  The Categorical distribution can be intuited as
+  generating samples according to `argmax{ OneHotCategorical(probs) }` itself
+  being identical to `argmax{ Multinomial(probs, total_count=1) }.
+
+  #### Mathematical Details
+
+  The probability mass function (pmf) is,
+
+  ```none
+  pmf(k; pi) = prod_j pi_j**[k == j]
+  ```
+
+  #### Pitfalls
+
+  The number of classes, `K`, must not exceed:
+  - the largest integer representable by `self.dtype`, i.e.,
+    `2**(mantissa_bits+1)` (IEE754),
+  - the maximum `Tensor` index, i.e., `2**31-1`.
+
+  In other words,
+
+  ```python
+  K <= min(2**31-1, {
+    tf.float16: 2**11,
+    tf.float32: 2**24,
+    tf.float64: 2**53 }[param.dtype])
+  ```
+
+  Note: This condition is validated only when `self.validate_args = True`.
 
   #### Examples
 
-  Creates a 3-class distribution, with the 2nd class, the most likely to be
-  drawn from.
+  Creates a 3-class distribution with the 2nd class being most likely.
 
   ```python
-  p = [0.1, 0.5, 0.4]
-  dist = Categorical(probs=p)
+  dist = Categorical(probs=[0.1, 0.5, 0.4])
+  n = 1e4
+  empirical_prob = tf.cast(
+      tf.histogram_fixed_width(
+        dist.sample(int(n)),
+        [0., 2],
+        nbins=3),
+      dtype=tf.float32) / n
+  # ==> array([ 0.1005,  0.5037,  0.3958], dtype=float32)
   ```
 
-  Creates a 3-class distribution, with the 2nd class the most likely to be
-  drawn from, using logits.
+  Creates a 3-class distribution with the 2nd class being most likely.
+  Parameterized by [logits](https://en.wikipedia.org/wiki/Logit) rather than
+  probabilities.
 
   ```python
-  logits = [-50, 400, 40]
-  dist = Categorical(logits=logits)
+  dist = Categorical(logits=np.log([0.1, 0.5, 0.4])
+  n = 1e4
+  empirical_prob = tf.cast(
+      tf.histogram_fixed_width(
+        dist.sample(int(n)),
+        [0., 2],
+        nbins=3),
+      dtype=tf.float32) / n
+  # ==> array([0.1045,  0.5047, 0.3908], dtype=float32)
   ```
 
-  Creates a 3-class distribution, with the 3rd class is most likely to be drawn.
+  Creates a 3-class distribution with the 3rd class being most likely.
   The distribution functions can be evaluated on counts.
 
   ```python
@@ -116,6 +189,10 @@ class Categorical(distribution.Distribution):
           multidimensional=True,
           name=name)
 
+      if validate_args:
+        self._logits = distribution_util.embed_check_categorical_event_shape(
+            self._logits)
+
       logits_shape_static = self._logits.get_shape().with_rank_at_least(1)
       if logits_shape_static.ndims is not None:
         self._batch_rank = ops.convert_to_tensor(
@@ -186,45 +263,45 @@ class Categorical(distribution.Distribution):
       logits_2d = self.logits
     else:
       logits_2d = array_ops.reshape(self.logits, [-1, self.event_size])
-    samples = random_ops.multinomial(logits_2d, n, seed=seed)
-    samples = math_ops.cast(samples, self.dtype)
-    ret = array_ops.reshape(
-        array_ops.transpose(samples),
+    draws = random_ops.multinomial(logits_2d, n, seed=seed)
+    draws = array_ops.reshape(
+        array_ops.transpose(draws),
         array_ops.concat([[n], self.batch_shape_tensor()], 0))
-    return ret
+    return math_ops.cast(draws, self.dtype)
 
   def _cdf(self, k):
     k = ops.convert_to_tensor(k, name="k")
-
-    # If there are multiple batch dimension, flatten them into one.
-    batch_flattened_probs = array_ops.reshape(self._probs,
-                                              [-1, self._event_size])
-    batch_flattened_k = array_ops.reshape(k, (-1,))
-
-    # Form a tensor to sum over.
-    mask_tensor = array_ops.sequence_mask(batch_flattened_k, self._event_size)
-    to_sum_over = array_ops.where(mask_tensor,
-                                  batch_flattened_probs,
-                                  array_ops.zeros_like(batch_flattened_probs))
-    batch_flat_cdf = math_ops.reduce_sum(to_sum_over, axis=-1)
-    return array_ops.reshape(batch_flat_cdf, self._batch_shape())
+    if self.validate_args:
+      k = distribution_util.embed_check_integer_casting_closed(
+          k, target_dtype=dtypes.int32)
+
+    k, probs = _broadcast_cat_event_and_params(
+        k, self.probs, base_dtype=self.dtype.base_dtype)
+
+    # batch-flatten everything in order to use `sequence_mask()`.
+    batch_flattened_probs = array_ops.reshape(probs,
+                                              (-1, self._event_size))
+    batch_flattened_k = array_ops.reshape(k, [-1])
+
+    to_sum_over = array_ops.where(
+        array_ops.sequence_mask(batch_flattened_k, self._event_size),
+        batch_flattened_probs,
+        array_ops.zeros_like(batch_flattened_probs))
+    batch_flattened_cdf = math_ops.reduce_sum(to_sum_over, axis=-1)
+    # Reshape back to the shape of the argument.
+    return array_ops.reshape(batch_flattened_cdf, array_ops.shape(k))
 
   def _log_prob(self, k):
     k = ops.convert_to_tensor(k, name="k")
-    if self.logits.get_shape()[:-1] == k.get_shape():
-      logits = self.logits
-    else:
-      logits = self.logits * array_ops.ones_like(
-          array_ops.expand_dims(k, -1), dtype=self.logits.dtype)
-      logits_shape = array_ops.shape(logits)[:-1]
-      k *= array_ops.ones(logits_shape, dtype=k.dtype)
-      k.set_shape(tensor_shape.TensorShape(logits.get_shape()[:-1]))
+    if self.validate_args:
+      k = distribution_util.embed_check_integer_casting_closed(
+          k, target_dtype=dtypes.int32)
+    k, logits = _broadcast_cat_event_and_params(
+        k, self.logits, base_dtype=self.dtype.base_dtype)
+
     return -nn_ops.sparse_softmax_cross_entropy_with_logits(labels=k,
                                                             logits=logits)
 
-  def _prob(self, k):
-    return math_ops.exp(self._log_prob(k))
-
   def _entropy(self):
     return -math_ops.reduce_sum(
         nn_ops.log_softmax(self.logits) * self.probs, axis=-1)
diff --git a/tensorflow/python/ops/distributions/dirichlet_multinomial.py b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
index 662a7655584b8dc6aeed5251f98dd17fb24f3606..d792e9fe52dee4325d0956dbb74c8b408d5a1e8c 100644
--- a/tensorflow/python/ops/distributions/dirichlet_multinomial.py
+++ b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
@@ -36,7 +36,7 @@ __all__ = [
 
 
 _dirichlet_multinomial_sample_note = """For each batch of counts,
-`value = [n_0, ..., n_{k-1}]`, `P[value]` is the probability that after
+`value = [n_0, ..., n_{K-1}]`, `P[value]` is the probability that after
 sampling `self.total_count` draws from this Dirichlet-Multinomial distribution,
 the number of draws falling in class `j` is `n_j`. Since this definition is
 [exchangeable](https://en.wikipedia.org/wiki/Exchangeable_random_variables);
@@ -53,16 +53,16 @@ class DirichletMultinomial(distribution.Distribution):
   """Dirichlet-Multinomial compound distribution.
 
   The Dirichlet-Multinomial distribution is parameterized by a (batch of)
-  length-`k` `concentration` vectors (`k > 1`) and a `total_count` number of
+  length-`K` `concentration` vectors (`K > 1`) and a `total_count` number of
   trials, i.e., the number of trials per draw from the DirichletMultinomial. It
-  is defined over a (batch of) length-`k` vector `counts` such that
+  is defined over a (batch of) length-`K` vector `counts` such that
   `tf.reduce_sum(counts, -1) = total_count`. The Dirichlet-Multinomial is
-  identically the Beta-Binomial distribution when `k = 2`.
+  identically the Beta-Binomial distribution when `K = 2`.
 
   #### Mathematical Details
 
-  The Dirichlet-Multinomial is a distribution over `k`-class counts, i.e., a
-  length-`k` vector of non-negative integer `counts = n = [n_0, ..., n_{k-1}]`.
+  The Dirichlet-Multinomial is a distribution over `K`-class counts, i.e., a
+  length-`K` vector of non-negative integer `counts = n = [n_0, ..., n_{K-1}]`.
 
   The probability mass function (pmf) is,
 
@@ -73,7 +73,7 @@ class DirichletMultinomial(distribution.Distribution):
 
   where:
 
-  * `concentration = alpha = [alpha_0, ..., alpha_{k-1}]`, `alpha_j > 0`,
+  * `concentration = alpha = [alpha_0, ..., alpha_{K-1}]`, `alpha_j > 0`,
   * `total_count = N`, `N` a positive integer,
   * `N!` is `N` factorial, and,
   * `Beta(x) = prod_j Gamma(x_j) / Gamma(sum_j x_j)` is the
@@ -88,19 +88,37 @@ class DirichletMultinomial(distribution.Distribution):
   samples are generated as follows.
 
     1. Choose class probabilities:
-       `probs = [p_0,...,p_{k-1}] ~ Dir(concentration)`
+       `probs = [p_0,...,p_{K-1}] ~ Dir(concentration)`
     2. Draw integers:
-       `counts = [n_0,...,n_{k-1}] ~ Multinomial(total_count, probs)`
+       `counts = [n_0,...,n_{K-1}] ~ Multinomial(total_count, probs)`
 
   The last `concentration` dimension parametrizes a single Dirichlet-Multinomial
   distribution. When calling distribution functions (e.g., `dist.prob(counts)`),
   `concentration`, `total_count` and `counts` are broadcast to the same shape.
-  The last dimension of of `counts` corresponds single Dirichlet-Multinomial
+  The last dimension of `counts` corresponds single Dirichlet-Multinomial
   distributions.
 
   Distribution parameters are automatically broadcast in all functions; see
   examples for details.
 
+  #### Pitfalls
+
+  The number of classes, `K`, must not exceed:
+  - the largest integer representable by `self.dtype`, i.e.,
+    `2**(mantissa_bits+1)` (IEE754),
+  - the maximum `Tensor` index, i.e., `2**31-1`.
+
+  In other words,
+
+  ```python
+  K <= min(2**31-1, {
+    tf.float16: 2**11,
+    tf.float32: 2**24,
+    tf.float64: 2**53 }[param.dtype])
+  ```
+
+  Note: This condition is validated only when `self.validate_args = True`.
+
   #### Examples
 
   ```python
@@ -157,8 +175,8 @@ class DirichletMultinomial(distribution.Distribution):
         Dirichlet multinomial distributions. Its components should be equal to
         integer values.
       concentration: Positive floating point tensor, whose dtype is the
-        same as `n` with shape broadcastable to `[N1,..., Nm, k]` `m >= 0`.
-        Defines this as a batch of `N1 x ... x Nm` different `k` class Dirichlet
+        same as `n` with shape broadcastable to `[N1,..., Nm, K]` `m >= 0`.
+        Defines this as a batch of `N1 x ... x Nm` different `K` class Dirichlet
         multinomial distributions.
       validate_args: Python `bool`, default `False`. When `True` distribution
         parameters are checked for validity despite possibly degrading runtime
@@ -180,9 +198,11 @@ class DirichletMultinomial(distribution.Distribution):
       #   created automatically by prepending). This forces enough explicitness.
       # * All calls involving `counts` eventually require a broadcast between
       #  `counts` and concentration.
-      self._total_count = self._maybe_assert_valid_total_count(
-          ops.convert_to_tensor(total_count, name="total_count"),
-          validate_args)
+      self._total_count = ops.convert_to_tensor(total_count, name="total_count")
+      if validate_args:
+        self._total_count = (
+            distribution_util.embed_check_nonnegative_integer_form(
+                self._total_count))
       self._concentration = self._maybe_assert_valid_concentration(
           ops.convert_to_tensor(concentration,
                                 name="concentration"),
@@ -242,7 +262,8 @@ class DirichletMultinomial(distribution.Distribution):
         seed=distribution_util.gen_new_seed(seed, salt="dirichlet_multinomial"))
     x = math_ops.reduce_sum(array_ops.one_hot(draws, depth=k), -2)
     final_shape = array_ops.concat([[n], self.batch_shape_tensor(), [k]], 0)
-    return array_ops.reshape(x, final_shape)
+    x = array_ops.reshape(x, final_shape)
+    return math_ops.cast(x, self.dtype)
 
   @distribution_util.AppendDocstring(_dirichlet_multinomial_sample_note)
   def _log_prob(self, counts):
@@ -302,42 +323,21 @@ class DirichletMultinomial(distribution.Distribution):
     """Checks the validity of the concentration parameter."""
     if not validate_args:
       return concentration
+    concentration = distribution_util.embed_check_categorical_event_shape(
+        concentration)
     return control_flow_ops.with_dependencies([
         check_ops.assert_positive(
             concentration,
             message="Concentration parameter must be positive."),
-        check_ops.assert_rank_at_least(
-            concentration, 1,
-            message="Concentration parameter must have >=1 dimensions."),
-        check_ops.assert_less(
-            1, array_ops.shape(concentration)[-1],
-            message="Concentration parameter must have event_size >= 2."),
     ], concentration)
 
-  def _maybe_assert_valid_total_count(self, total_count, validate_args):
-    if not validate_args:
-      return total_count
-    return control_flow_ops.with_dependencies([
-        check_ops.assert_non_negative(
-            total_count,
-            message="total_count must be non-negative."),
-        distribution_util.assert_integer_form(
-            total_count,
-            message="total_count cannot contain fractional values."),
-    ], total_count)
-
   def _maybe_assert_valid_sample(self, counts):
     """Check counts for proper shape, values, then return tensor version."""
     if not self.validate_args:
       return counts
+    counts = distribution_util.embed_check_nonnegative_integer_form(counts)
     return control_flow_ops.with_dependencies([
-        check_ops.assert_non_negative(
-            counts,
-            message="counts must be non-negative."),
         check_ops.assert_equal(
             self.total_count, math_ops.reduce_sum(counts, -1),
             message="counts last-dimension must sum to `self.total_count`"),
-        distribution_util.assert_integer_form(
-            counts,
-            message="counts cannot contain fractional components."),
     ], counts)
diff --git a/tensorflow/python/ops/distributions/distribution.py b/tensorflow/python/ops/distributions/distribution.py
index a0be433a616103fc9525c157494629044704ec02..22687a093ae72edff1d53131cab49fa004aa3be0 100644
--- a/tensorflow/python/ops/distributions/distribution.py
+++ b/tensorflow/python/ops/distributions/distribution.py
@@ -35,6 +35,13 @@ from tensorflow.python.ops.distributions import util
 from tensorflow.python.util import tf_inspect
 
 
+__all__ = [
+    "ReparameterizationType",
+    "FULLY_REPARAMETERIZED",
+    "NOT_REPARAMETERIZED",
+    "Distribution",
+]
+
 _DISTRIBUTION_PUBLIC_METHOD_WRAPPERS = [
     "batch_shape_tensor", "batch_shape", "event_shape_tensor", "event_shape",
     "sample", "log_prob", "prob", "log_cdf", "cdf", "log_survival_function",
@@ -176,7 +183,7 @@ class _DistributionMeta(abc.ABCMeta):
 class ReparameterizationType(object):
   """Instances of this class represent how sampling is reparameterized.
 
-  Two static instances exist in the distritributions library, signifying
+  Two static instances exist in the distributions library, signifying
   one of two possible properties for samples from a distribution:
 
   `FULLY_REPARAMETERIZED`: Samples from the distribution are fully
diff --git a/tensorflow/python/ops/distributions/distributions.py b/tensorflow/python/ops/distributions/distributions.py
new file mode 100644
index 0000000000000000000000000000000000000000..9df7d148a583e533475276e090bcb02cb705290f
--- /dev/null
+++ b/tensorflow/python/ops/distributions/distributions.py
@@ -0,0 +1,65 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Core module for TensorFlow distribution objects and helpers."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.ops.distributions import bijectors
+from tensorflow.python.ops.distributions.bernoulli import Bernoulli
+from tensorflow.python.ops.distributions.beta import Beta
+from tensorflow.python.ops.distributions.categorical import Categorical
+from tensorflow.python.ops.distributions.dirichlet import Dirichlet
+from tensorflow.python.ops.distributions.dirichlet_multinomial import DirichletMultinomial
+from tensorflow.python.ops.distributions.distribution import *
+from tensorflow.python.ops.distributions.exponential import Exponential
+from tensorflow.python.ops.distributions.gamma import Gamma
+from tensorflow.python.ops.distributions.kullback_leibler import *
+from tensorflow.python.ops.distributions.laplace import Laplace
+from tensorflow.python.ops.distributions.multinomial import Multinomial
+from tensorflow.python.ops.distributions.normal import Normal
+from tensorflow.python.ops.distributions.student_t import StudentT
+from tensorflow.python.ops.distributions.uniform import Uniform
+# pylint: enable=wildcard-import,unused-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+
+_allowed_symbols = [
+    "bijectors",
+    "Bernoulli",
+    "Beta",
+    "Categorical",
+    "DirichletMultinomial",
+    "Dirichlet",
+    "Distribution",
+    "ReparameterizationType",
+    "FULLY_REPARAMETERIZED",
+    "NOT_REPARAMETERIZED",
+    "Exponential",
+    "Gamma",
+    "RegisterKL",
+    "kl_divergence",
+    "Laplace",
+    "Multinomial",
+    "Normal",
+    "StudentT",
+    "Uniform",
+]
+
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/distributions/kullback_leibler.py b/tensorflow/python/ops/distributions/kullback_leibler.py
index 9770d82bd8398a9f6d88c4360b77a7a691e72e5a..a6ab581cc22ce8e9a278bb8e0c7e6afc2dcc30eb 100644
--- a/tensorflow/python/ops/distributions/kullback_leibler.py
+++ b/tensorflow/python/ops/distributions/kullback_leibler.py
@@ -28,6 +28,12 @@ from tensorflow.python.util import tf_inspect
 _DIVERGENCES = {}
 
 
+__all__ = [
+    "RegisterKL",
+    "kl_divergence",
+]
+
+
 def _registered_kl(type_a, type_b):
   """Get the KL function registered for classes a and b."""
   hierarchy_a = tf_inspect.getmro(type_a)
diff --git a/tensorflow/python/ops/distributions/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py
index a5bea7b4bad0e644cb7776446195f2734750ce7e..9b15d4c76eceac0b4b00059863f355ce417bbcd8 100644
--- a/tensorflow/python/ops/distributions/multinomial.py
+++ b/tensorflow/python/ops/distributions/multinomial.py
@@ -52,17 +52,17 @@ class Multinomial(distribution.Distribution):
   """Multinomial distribution.
 
   This Multinomial distribution is parameterized by `probs`, a (batch of)
-  length-`k` `prob` (probability) vectors (`k > 1`) such that
+  length-`K` `prob` (probability) vectors (`K > 1`) such that
   `tf.reduce_sum(probs, -1) = 1`, and a `total_count` number of trials, i.e.,
   the number of trials per draw from the Multinomial. It is defined over a
-  (batch of) length-`k` vector `counts` such that
+  (batch of) length-`K` vector `counts` such that
   `tf.reduce_sum(counts, -1) = total_count`. The Multinomial is identically the
-  Binomial distribution when `k = 2`.
+  Binomial distribution when `K = 2`.
 
   #### Mathematical Details
 
-  The Multinomial is a distribution over `k`-class counts, i.e., a length-`k`
-  vector of non-negative integer `counts = n = [n_0, ..., n_{k-1}]`.
+  The Multinomial is a distribution over `K`-class counts, i.e., a length-`K`
+  vector of non-negative integer `counts = n = [n_0, ..., n_{K-1}]`.
 
   The probability mass function (pmf) is,
 
@@ -72,7 +72,7 @@ class Multinomial(distribution.Distribution):
   ```
 
   where:
-  * `probs = pi = [pi_0, ..., pi_{k-1}]`, `pi_j > 0`, `sum_j pi_j = 1`,
+  * `probs = pi = [pi_0, ..., pi_{K-1}]`, `pi_j > 0`, `sum_j pi_j = 1`,
   * `total_count = N`, `N` a positive integer,
   * `Z` is the normalization constant, and,
   * `N!` denotes `N` factorial.
@@ -80,6 +80,24 @@ class Multinomial(distribution.Distribution):
   Distribution parameters are automatically broadcast in all functions; see
   examples for details.
 
+  #### Pitfalls
+
+  The number of classes, `K`, must not exceed:
+  - the largest integer representable by `self.dtype`, i.e.,
+    `2**(mantissa_bits+1)` (IEE754),
+  - the maximum `Tensor` index, i.e., `2**31-1`.
+
+  In other words,
+
+  ```python
+  K <= min(2**31-1, {
+    tf.float16: 2**11,
+    tf.float32: 2**24,
+    tf.float64: 2**53 }[param.dtype])
+  ```
+
+  Note: This condition is validated only when `self.validate_args = True`.
+
   #### Examples
 
   Create a 3-class distribution, with the 3rd class is most likely to be drawn,
@@ -138,14 +156,14 @@ class Multinomial(distribution.Distribution):
         to `[N1,..., Nm]` with `m >= 0`. Defines this as a batch of
         `N1 x ... x Nm` different Multinomial distributions. Its components
         should be equal to integer values.
-      logits: Floating point tensor representing the log-odds of a
-        positive event with shape broadcastable to `[N1,..., Nm, k], m >= 0`,
-        and the same dtype as `total_count`. Defines this as a batch of
-        `N1 x ... x Nm` different `k` class Multinomial distributions. Only one
-        of `logits` or `probs` should be passed in.
+      logits: Floating point tensor representing unnormalized log-probabilities
+        of a positive event with shape broadcastable to
+        `[N1,..., Nm, K]` `m >= 0`, and the same dtype as `total_count`. Defines
+        this as a batch of `N1 x ... x Nm` different `K` class Multinomial
+        distributions. Only one of `logits` or `probs` should be passed in.
       probs: Positive floating point tensor with shape broadcastable to
-        `[N1,..., Nm, k]` `m >= 0` and same dtype as `total_count`. Defines
-        this as a batch of `N1 x ... x Nm` different `k` class Multinomial
+        `[N1,..., Nm, K]` `m >= 0` and same dtype as `total_count`. Defines
+        this as a batch of `N1 x ... x Nm` different `K` class Multinomial
         distributions. `probs`'s components in the last portion of its shape
         should sum to `1`. Only one of `logits` or `probs` should be passed in.
       validate_args: Python `bool`, default `False`. When `True` distribution
@@ -160,9 +178,11 @@ class Multinomial(distribution.Distribution):
     """
     parameters = locals()
     with ops.name_scope(name, values=[total_count, logits, probs]):
-      self._total_count = self._maybe_assert_valid_total_count(
-          ops.convert_to_tensor(total_count, name="total_count"),
-          validate_args)
+      self._total_count = ops.convert_to_tensor(total_count, name="total_count")
+      if validate_args:
+        self._total_count = (
+            distribution_util.embed_check_nonnegative_integer_form(
+                self._total_count))
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits=logits,
           probs=probs,
@@ -193,7 +213,7 @@ class Multinomial(distribution.Distribution):
 
   @property
   def probs(self):
-    """Probability of of drawing a `1` in that coordinate."""
+    """Probability of drawing a `1` in that coordinate."""
     return self._probs
 
   def _batch_shape_tensor(self):
@@ -222,25 +242,22 @@ class Multinomial(distribution.Distribution):
     k = self.event_shape_tensor()[0]
     # Flatten batch dims so logits has shape [B, k],
     # where B = reduce_prod(self.batch_shape_tensor()).
-    draws = random_ops.multinomial(
+    x = random_ops.multinomial(
         logits=array_ops.reshape(self.logits, [-1, k]),
         num_samples=n * n_draws,
         seed=seed)
-    draws = array_ops.reshape(draws, shape=[-1, n, n_draws])
-    x = math_ops.reduce_sum(array_ops.one_hot(draws, depth=k),
+    x = array_ops.reshape(x, shape=[-1, n, n_draws])
+    x = math_ops.reduce_sum(array_ops.one_hot(x, depth=k),
                             axis=-2)  # shape: [B, n, k]
     x = array_ops.transpose(x, perm=[1, 0, 2])
     final_shape = array_ops.concat([[n], self.batch_shape_tensor(), [k]], 0)
-    return array_ops.reshape(x, final_shape)
+    x = array_ops.reshape(x, final_shape)
+    return math_ops.cast(x, self.dtype)
 
   @distribution_util.AppendDocstring(_multinomial_sample_note)
   def _log_prob(self, counts):
     return self._log_unnormalized_prob(counts) - self._log_normalization(counts)
 
-  @distribution_util.AppendDocstring(_multinomial_sample_note)
-  def _prob(self, counts):
-    return math_ops.exp(self._log_prob(counts))
-
   def _log_unnormalized_prob(self, counts):
     counts = self._maybe_assert_valid_sample(counts)
     return math_ops.reduce_sum(counts * math_ops.log(self.probs), -1)
@@ -265,25 +282,11 @@ class Multinomial(distribution.Distribution):
         self.total_count)[..., array_ops.newaxis]
     return self._mean_val - self._mean_val * p
 
-  def _maybe_assert_valid_total_count(self, total_count, validate_args):
-    if not validate_args:
-      return total_count
-    return control_flow_ops.with_dependencies([
-        check_ops.assert_non_negative(
-            total_count,
-            message="total_count must be non-negative."),
-        distribution_util.assert_integer_form(
-            total_count,
-            message="total_count cannot contain fractional values."),
-    ], total_count)
-
   def _maybe_assert_valid_sample(self, counts):
     """Check counts for proper shape, values, then return tensor version."""
     if not self.validate_args:
       return counts
-
-    counts = distribution_util.embed_check_nonnegative_discrete(
-        counts, check_integer=True)
+    counts = distribution_util.embed_check_nonnegative_integer_form(counts)
     return control_flow_ops.with_dependencies([
         check_ops.assert_equal(
             self.total_count, math_ops.reduce_sum(counts, -1),
diff --git a/tensorflow/python/ops/distributions/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
index 1be3819569cc1fca599b8967667cb93253edb8f8..d72e07a867588ff4960e15114cf2b359ae71dbd7 100644
--- a/tensorflow/python/ops/distributions/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -24,7 +24,6 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -347,9 +346,10 @@ class TransformedDistribution(distribution_lib.Distribution):
     # Since the `bijector` may change the `event_shape`, we then forward what we
     # know to the bijector. This allows the `bijector` to have final say in the
     # `event_shape`.
-    static_override = tensor_util.constant_value(self._override_event_shape)
+    static_override = tensor_util.constant_value_as_shape(
+        self._override_event_shape)
     return self.bijector.forward_event_shape(
-        tensor_shape.TensorShape(static_override)
+        static_override
         if self._is_maybe_event_override
         else self.distribution.event_shape)
 
@@ -369,8 +369,9 @@ class TransformedDistribution(distribution_lib.Distribution):
     # the `bijector` doesn't get to alter the `batch_shape`. Recall that
     # `batch_shape` is a property of a distribution while `event_shape` is
     # shared between both the `distribution` instance and the `bijector`.
-    static_override = tensor_util.constant_value(self._override_batch_shape)
-    return (tensor_shape.TensorShape(static_override)
+    static_override = tensor_util.constant_value_as_shape(
+        self._override_batch_shape)
+    return (static_override
             if self._is_maybe_batch_override
             else self.distribution.batch_shape)
 
diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
index 05c6f4da5700382dbd5b4b5dae497696feff836d..63fb87e93c583e8902880cf9feb9ccba3528609b 100644
--- a/tensorflow/python/ops/distributions/util.py
+++ b/tensorflow/python/ops/distributions/util.py
@@ -37,7 +37,7 @@ from tensorflow.python.ops import nn
 
 def assert_close(
     x, y, data=None, summarize=None, message=None, name="assert_close"):
-  """Assert that that x and y are within machine epsilon of each other.
+  """Assert that x and y are within machine epsilon of each other.
 
   Args:
     x: Floating-point `Tensor`
@@ -74,7 +74,8 @@ def assert_close(
 
 
 def assert_integer_form(
-    x, data=None, summarize=None, message=None, name="assert_integer_form"):
+    x, data=None, summarize=None, message=None,
+    int_dtype=None, name="assert_integer_form"):
   """Assert that x has integer components (or floats equal to integers).
 
   Args:
@@ -83,18 +84,30 @@ def assert_integer_form(
       error message and first few entries of `x` and `y`.
     summarize: Print this many entries of each tensor.
     message: A string to prefix to the default message.
+    int_dtype: A `tf.dtype` used to cast the float to. The default (`None`)
+      implies the smallest possible signed int will be used for casting.
     name: A name for this operation (optional).
 
   Returns:
-    Op raising `InvalidArgumentError` if round(x) != x.
+    Op raising `InvalidArgumentError` if `cast(x, int_dtype) != x`.
   """
-
-  message = message or "x has non-integer components"
-  x = ops.convert_to_tensor(x, name="x")
-  casted_x = math_ops.to_int64(x)
-  return check_ops.assert_equal(
-      x, math_ops.cast(math_ops.round(casted_x), x.dtype),
-      data=data, summarize=summarize, message=message, name=name)
+  with ops.name_scope(name, values=[x, data]):
+    x = ops.convert_to_tensor(x, name="x")
+    if x.dtype.is_integer:
+      return control_flow_ops.no_op()
+    message = message or "{} has non-integer components".format(x.op.name)
+    if int_dtype is None:
+      try:
+        int_dtype = {
+            dtypes.float16: dtypes.int16,
+            dtypes.float32: dtypes.int32,
+            dtypes.float64: dtypes.int64,
+        }[x.dtype.base_dtype]
+      except KeyError:
+        raise TypeError("Unrecognized type {}".format(x.dtype.name))
+    return check_ops.assert_equal(
+        x, math_ops.cast(math_ops.cast(x, int_dtype), x.dtype),
+        data=data, summarize=summarize, message=message, name=name)
 
 
 def assert_symmetric(matrix):
@@ -103,14 +116,22 @@ def assert_symmetric(matrix):
       [check_ops.assert_equal(matrix, matrix_t)], matrix)
 
 
-def embed_check_nonnegative_discrete(x, check_integer=True):
+def embed_check_nonnegative_integer_form(
+    x, name="embed_check_nonnegative_integer_form"):
   """Assert x is a non-negative tensor, and optionally of integers."""
-  assertions = [check_ops.assert_non_negative(
-      x, message="x must be non-negative.")]
-  if check_integer:
-    assertions += [assert_integer_form(
-        x, message="x cannot contain fractional components.")]
-  return control_flow_ops.with_dependencies(assertions, x)
+  with ops.name_scope(name, values=[x]):
+    x = ops.convert_to_tensor(x, name="x")
+    assertions = [
+        check_ops.assert_non_negative(
+            x, message="'{}' must be non-negative.".format(x.op.name)),
+    ]
+    if not x.dtype.is_integer:
+      assertions += [
+          assert_integer_form(
+              x, message="'{}' cannot contain fractional components.".format(
+                  x.op.name)),
+      ]
+    return control_flow_ops.with_dependencies(assertions, x)
 
 
 def same_dynamic_shape(a, b):
@@ -175,16 +196,26 @@ def get_logits_and_probs(logits=None,
 
     if probs is None:
       logits = ops.convert_to_tensor(logits, name="logits")
+      if not logits.dtype.is_floating:
+        raise TypeError("logits must having floating type.")
+      # We can early return since we constructed probs and therefore know
+      # they're valid.
       if multidimensional:
+        if validate_args:
+          logits = embed_check_categorical_event_shape(logits)
         return logits, nn.softmax(logits, name="probs")
       return logits, math_ops.sigmoid(logits, name="probs")
 
     probs = ops.convert_to_tensor(probs, name="probs")
+    if not probs.dtype.is_floating:
+      raise TypeError("probs must having floating type.")
+
     if validate_args:
       with ops.name_scope("validate_probs"):
         one = constant_op.constant(1., probs.dtype)
         dependencies = [check_ops.assert_non_negative(probs)]
         if multidimensional:
+          probs = embed_check_categorical_event_shape(probs)
           dependencies += [assert_close(math_ops.reduce_sum(probs, -1), one,
                                         message="probs does not sum to 1.")]
         else:
@@ -205,6 +236,247 @@ def get_logits_and_probs(logits=None,
       return math_ops.log(probs) - math_ops.log1p(-1. * probs), probs
 
 
+def _is_known_unsigned_by_dtype(dt):
+  """Helper returning True if dtype is known to be unsigned."""
+  return {
+      dtypes.bool: True,
+      dtypes.uint8: True,
+      dtypes.uint16: True,
+  }.get(dt.base_dtype, False)
+
+
+def _is_known_signed_by_dtype(dt):
+  """Helper returning True if dtype is known to be signed."""
+  return {
+      dtypes.float16: True,
+      dtypes.float32: True,
+      dtypes.float64: True,
+      dtypes.int8: True,
+      dtypes.int16: True,
+      dtypes.int32: True,
+      dtypes.int64: True,
+  }.get(dt.base_dtype, False)
+
+
+def _is_known_dtype(dt):
+  """Helper returning True if dtype is known."""
+  return _is_known_unsigned_by_dtype(dt) or _is_known_signed_by_dtype(dt)
+
+
+def _largest_integer_by_dtype(dt):
+  """Helper returning the largest integer exactly representable by dtype."""
+  if not _is_known_dtype(dt):
+    raise TypeError("Unrecognized dtype: {}".format(dt.name))
+  if dt.is_floating:
+    return int(2**(np.finfo(dt.as_numpy_dtype).nmant + 1))
+  if dt.is_integer:
+    return np.iinfo(dt.as_numpy_dtype).max
+  if dt.base_dtype == dtypes.bool:
+    return int(1)
+  # We actually can't land here but keep the case for completeness.
+  raise TypeError("Unrecognized dtype: {}".format(dt.name))
+
+
+def _smallest_integer_by_dtype(dt):
+  """Helper returning the smallest integer exactly representable by dtype."""
+  if not _is_known_dtype(dt):
+    raise TypeError("Unrecognized dtype: {}".format(dt.name))
+  if _is_known_unsigned_by_dtype(dt):
+    return 0
+  return -1 * _largest_integer_by_dtype(dt)
+
+
+def _is_integer_like_by_dtype(dt):
+  """Helper returning True if dtype.is_interger or is `bool`."""
+  if not _is_known_dtype(dt):
+    raise TypeError("Unrecognized dtype: {}".format(dt.name))
+  return dt.is_integer or dt.base_dtype == dtypes.bool
+
+
+def embed_check_categorical_event_shape(
+    categorical_param,
+    name="embed_check_categorical_event_shape"):
+  """Embeds checks that categorical distributions don't have too many classes.
+
+  A categorical-type distribution is one which, e.g., returns the class label
+  rather than a one-hot encoding.  E.g., `Categorical(probs)`.
+
+  Since distributions output samples in the same dtype as the parameters, we
+  must ensure that casting doesn't lose precision. That is, the
+  `parameter.dtype` implies a maximum number of classes. However, since shape is
+  `int32` and categorical variables are presumed to be indexes into a `Tensor`,
+  we must also ensure that the number of classes is no larger than the largest
+  possible `int32` index, i.e., `2**31-1`.
+
+  In other words the number of classes, `K`, must satisfy the following
+  condition:
+
+  ```python
+  K <= min(
+      int(2**31 - 1),  # Largest float as an index.
+      {
+          dtypes.float16: int(2**11),   # Largest int as a float16.
+          dtypes.float32: int(2**24),
+          dtypes.float64: int(2**53),
+      }.get(categorical_param.dtype.base_dtype, 0))
+  ```
+
+  Args:
+    categorical_param: Floating-point `Tensor` representing parameters of
+      distribution over categories. The rightmost shape is presumed to be the
+      number of categories.
+    name: A name for this operation (optional).
+
+  Returns:
+    categorical_param: Input `Tensor` with appropriate assertions embedded.
+
+  Raises:
+    TypeError: if `categorical_param` has an unknown `dtype`.
+    ValueError: if we can statically identify `categorical_param` as being too
+      large (for being closed under int32/float casting).
+  """
+  with ops.name_scope(name, values=[categorical_param]):
+    x = ops.convert_to_tensor(categorical_param, name="categorical_param")
+    # The size must not exceed both of:
+    # - The largest possible int32 (since categorical values are presumed to be
+    #   indexes into a Tensor).
+    # - The largest possible integer exactly representable under the given
+    #   floating-point dtype (since we need to cast to/from).
+    #
+    # The chosen floating-point thresholds are 2**(1 + mantissa_bits).
+    # For more details, see:
+    # https://en.wikipedia.org/wiki/Floating-point_arithmetic#Internal_representation
+    x_dtype = x.dtype.base_dtype
+    max_event_size = (_largest_integer_by_dtype(x_dtype)
+                      if x_dtype.is_floating else 0)
+    if max_event_size is 0:
+      raise TypeError("Unable to validate size of unrecognized dtype "
+                      "({}).".format(x_dtype.name))
+    try:
+      x_shape_static = x.get_shape().with_rank_at_least(1)
+    except ValueError:
+      raise ValueError("A categorical-distribution parameter must have "
+                       "at least 1 dimension.")
+    if x_shape_static[-1].value is not None:
+      event_size = x_shape_static[-1].value
+      if event_size < 2:
+        raise ValueError("A categorical-distribution parameter must have at "
+                         "least 2 events.")
+      if event_size > max_event_size:
+        raise ValueError(
+            "Number of classes exceeds `dtype` precision, i.e., "
+            "{} implies shape ({}) cannot exceed {}.".format(
+                x_dtype.name, event_size, max_event_size))
+      return x
+    else:
+      event_size = array_ops.shape(x, name="x_shape")[-1]
+      return control_flow_ops.with_dependencies([
+          check_ops.assert_rank_at_least(
+              x, 1, message=("A categorical-distribution parameter must have "
+                             "at least 1 dimension.")),
+          check_ops.assert_greater_equal(
+              array_ops.shape(x)[-1], 2,
+              message=("A categorical-distribution parameter must have at "
+                       "least 2 events.")),
+          check_ops.assert_less_equal(
+              event_size, max_event_size,
+              message="Number of classes exceeds `dtype` precision, "
+                      "i.e., {} dtype cannot exceed {} shape.".format(
+                          x_dtype.name, max_event_size)),
+      ], x)
+
+
+def embed_check_integer_casting_closed(
+    x,
+    target_dtype,
+    assert_nonnegative=True,
+    name="embed_check_casting_closed"):
+  """Ensures integers remain unaffected despite casting to/from int/float types.
+
+  Example integer-types: `uint8`, `int32`, `bool`.
+  Example floating-types: `float32`, `float64`.
+
+  The largest possible integer representable by an IEEE754 floating-point is
+  `2**(1 + mantissa_bits)` yet the largest possible integer as an int-type is
+  `2**(bits - 1) - 1`. This function ensures that a `Tensor` purporting to have
+  integer-form values can be cast to some other type without loss of precision.
+
+  The smallest representable integer is the negative of the largest
+  representable integer, except for types: `uint8`, `uint16`, `bool`. For these
+  types, the smallest representable integer is `0`.
+
+  Args:
+    x: `Tensor` representing integer-form values.
+    target_dtype: TF `dtype` under which `x` should have identical values.
+    assert_nonnegative: `bool` indicating `x` should contain nonnegative values.
+    name: A name for this operation (optional).
+
+  Returns:
+    x: Input `Tensor` with appropriate assertions embedded.
+
+  Raises:
+    TypeError: if `x` is neither integer- nor floating-type.
+    TypeError: if `target_dtype` is neither integer- nor floating-type.
+    TypeError: if neither `x` nor `target_dtype` are integer-type.
+  """
+
+  with ops.name_scope(name, values=[x]):
+    x = ops.convert_to_tensor(x, name="x")
+    if (not _is_integer_like_by_dtype(x.dtype)
+        and not x.dtype.is_floating):
+      raise TypeError("{}.dtype must be floating- or "
+                      "integer-type.".format(x.dtype.name))
+    if (not _is_integer_like_by_dtype(target_dtype)
+        and not target_dtype.is_floating):
+      raise TypeError("target_dtype ({}) must be floating- or "
+                      "integer-type.".format(target_dtype.name))
+    if (not _is_integer_like_by_dtype(x.dtype)
+        and not _is_integer_like_by_dtype(target_dtype)):
+      raise TypeError("At least one of {}.dtype ({}) and target_dtype ({}) "
+                      "must be integer-type.".format(
+                          x.op.name, x.dtype.name, target_dtype.name))
+
+    assertions = []
+    if assert_nonnegative:
+      assertions += [
+          check_ops.assert_non_negative(
+              x, message="Elements must be non-negative."),
+      ]
+
+    if x.dtype.is_floating:
+      # Being here means _is_integer_like_by_dtype(target_dtype) = True.
+      # Since this check implies the magnitude check below, we need only it.
+      assertions += [
+          assert_integer_form(
+              x, int_dtype=target_dtype,
+              message="Elements must be {}-equivalent.".format(
+                  target_dtype.name)),
+      ]
+    else:
+      if (_largest_integer_by_dtype(x.dtype)
+          > _largest_integer_by_dtype(target_dtype)):
+        # Cast may lose integer precision.
+        assertions += [
+            check_ops.assert_less_equal(
+                x, _largest_integer_by_dtype(target_dtype),
+                message=("Elements cannot exceed {}.".format(
+                    _largest_integer_by_dtype(target_dtype)))),
+        ]
+      if (not assert_nonnegative and
+          (_smallest_integer_by_dtype(x.dtype)
+           < _smallest_integer_by_dtype(target_dtype))):
+        assertions += [
+            check_ops.assert_greater_equal(
+                x, _smallest_integer_by_dtype(target_dtype),
+                message=("Elements cannot be smaller than {}.".format(
+                    _smallest_integer_by_dtype(target_dtype)))),
+        ]
+
+    if not assertions:
+      return x
+    return control_flow_ops.with_dependencies(assertions, x)
+
+
 def log_combinations(n, counts, name="log_combinations"):
   """Multinomial coefficient.
 
@@ -253,8 +525,8 @@ def matrix_diag_transform(matrix, transform=None, name=None):
   # valid Cholesky factor.
   chol = matrix_diag_transform(matrix, transform=tf.nn.softplus)
 
-  # OperatorPDCholesky ignores the upper triangle.
-  operator = OperatorPDCholesky(chol)
+  # LinearOperatorTriL ignores the upper triangle.
+  operator = LinearOperatorTriL(chol)
   ```
 
   Example of heteroskedastic 2-D linear regression.
@@ -456,6 +728,8 @@ def fill_lower_triangular(x, validate_args=False, name="fill_lower_triangular"):
   e.g., `tril = tf.matrix_band_part(full, -1, 0)`, rather than directly
   construct a lower triangular.
 
+  Warning: This Op is intended for convenience, not efficiency.
+
   Example:
 
   ```python
@@ -563,6 +837,74 @@ def fill_lower_triangular(x, validate_args=False, name="fill_lower_triangular"):
     return y
 
 
+def tridiag(below=None, diag=None, above=None, name=None):
+  """Creates a matrix with values set above, below, and on the diagonal.
+
+  Example:
+
+  ```python
+  tridiag(below=[1., 2., 3.],
+          diag=[4., 5., 6., 7.],
+          above=[8., 9., 10.])
+  # ==> array([[  4.,   8.,   0.,   0.],
+  #            [  1.,   5.,   9.,   0.],
+  #            [  0.,   2.,   6.,  10.],
+  #            [  0.,   0.,   3.,   7.]], dtype=float32)
+  ```
+
+  Warning: This Op is intended for convenience, not efficiency.
+
+  Args:
+    below: `Tensor` of shape `[B1, ..., Bb, d-1]` corresponding to the below
+      diagonal part. `None` is logically equivalent to `below = 0`.
+    diag: `Tensor` of shape `[B1, ..., Bb, d]` corresponding to the diagonal
+      part.  `None` is logically equivalent to `diag = 0`.
+    above: `Tensor` of shape `[B1, ..., Bb, d-1]` corresponding to the above
+      diagonal part.  `None` is logically equivalent to `above = 0`.
+    name: Python `str`. The name to give this op.
+
+  Returns:
+    tridiag: `Tensor` with values set above, below and on the diagonal.
+
+  Raises:
+    ValueError: if all inputs are `None`.
+  """
+
+  def _pad(x):
+    """Prepends and appends a zero to every vector in a batch of vectors."""
+    shape = array_ops.concat([array_ops.shape(x)[:-1], [1]], axis=0)
+    z = array_ops.zeros(shape, dtype=x.dtype)
+    return array_ops.concat([z, x, z], axis=-1)
+
+  def _add(*x):
+    """Adds list of Tensors, ignoring `None`."""
+    s = None
+    for y in x:
+      if y is None:
+        continue
+      elif s is None:
+        s = y
+      else:
+        s += y
+    if s is None:
+      raise ValueError("Must specify at least one of `below`, `diag`, `above`.")
+    return s
+
+  with ops.name_scope(name, "tridiag", [below, diag, above]):
+    if below is not None:
+      below = ops.convert_to_tensor(below, name="below")
+      below = array_ops.matrix_diag(_pad(below))[..., :-1, 1:]
+    if diag is not None:
+      diag = ops.convert_to_tensor(diag, name="diag")
+      diag = array_ops.matrix_diag(diag)
+    if above is not None:
+      above = ops.convert_to_tensor(above, name="above")
+      above = array_ops.matrix_diag(_pad(above))[..., 1:, :-1]
+    # TODO(jvdillon): Consider using scatter_nd instead of creating three full
+    # matrices.
+    return _add(below, diag, above)
+
+
 # TODO(jvdillon): Merge this test back into:
 # tensorflow/python/ops/softplus_op_test.py
 # once TF core is accepting new ops.
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 4c94f9e9b535c11fd2ca445bfa4ac2c1867f5e0e..f34d7fd389b259343e5e599554229ce98187f80d 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Operations for embeddings."""
 from __future__ import absolute_import
 from __future__ import division
@@ -35,65 +34,73 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 
 
-def _do_gather(params, ids, name=None):
-  """Deals with doing gather differently for resource variables."""
-  if isinstance(params, resource_variable_ops.ResourceVariable):
-    return params.sparse_read(ids, name=name)
-  return array_ops.gather(params, ids, name=name)
-
-
-def embedding_lookup(params, ids, partition_strategy="mod", name=None,
-                     validate_indices=True,  # pylint: disable=unused-argument
-                     max_norm=None):
-  """Looks up `ids` in a list of embedding tensors.
-
-  This function is used to perform parallel lookups on the list of
-  tensors in `params`.  It is a generalization of
-  @{tf.gather}, where `params` is
-  interpreted as a partitioning of a large embedding tensor.  `params` may be
-  a `PartitionedVariable` as returned by using `tf.get_variable()` with a
-  partitioner.
-
-  If `len(params) > 1`, each element `id` of `ids` is partitioned between
-  the elements of `params` according to the `partition_strategy`.
-  In all strategies, if the id space does not evenly divide the number of
-  partitions, each of the first `(max_id + 1) % len(params)` partitions will
-  be assigned one more id.
-
-  If `partition_strategy` is `"mod"`, we assign each id to partition
-  `p = id % len(params)`. For instance,
-  13 ids are split across 5 partitions as:
-  `[[0, 5, 10], [1, 6, 11], [2, 7, 12], [3, 8], [4, 9]]`
-
-  If `partition_strategy` is `"div"`, we assign ids to partitions in a
-  contiguous manner. In this case, 13 ids are split across 5 partitions as:
-  `[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]]`
+def _gather_and_clip(params, ids, max_norm, name=None):
+  """Helper function for _embedding_lookup_and_transform.
 
-  The results of the lookup are concatenated into a dense
-  tensor. The returned tensor has shape `shape(ids) + shape(params)[1:]`.
+  This function gathers embeddings from a single tensor. The gather deals with
+  resource variables specially. The embeddings are clipped to an l2-norm of
+  max_norm if provided.
 
   Args:
-    params: A single tensor representing the complete embedding tensor,
-      or a list of P tensors all of same shape except for the first dimension,
-      representing sharded embedding tensors.  Alternatively, a
-      `PartitionedVariable`, created by partitioning along dimension 0. Each
-      element must be appropriately sized for the given `partition_strategy`.
-    ids: A `Tensor` with type `int32` or `int64` containing the ids to be looked
-      up in `params`.
-    partition_strategy: A string specifying the partitioning strategy, relevant
-      if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
-      is `"mod"`.
+    params: A `Tensor` of embeddings.
+    ids: A `Tensor` indexing the embeddings to be retrieved from `params`.
+    max_norm: If provided, embedding values are l2-normalized to the value of
+      max_norm.
     name: A name for the operation (optional).
-    validate_indices: DEPRECATED. If this operation is assigned to CPU, values
-      in `indices` are always validated to be within range.  If assigned to GPU,
-      out-of-bound indices result in safe but unspecified behavior, which may
-      include raising an error.
-    max_norm: If not None, embedding values are l2-normalized to the value of
-     max_norm.
 
   Returns:
-    A `Tensor` with the same type as the tensors in `params`.
+    A `Tensor` with the same type as `params`.
+  """
+  if isinstance(params, resource_variable_ops.ResourceVariable):
+    embs = params.sparse_read(ids, name=name)
+  else:
+    embs = array_ops.gather(params, ids, name=name)
+  if max_norm is None:
+    return embs
+  static = True
+  ids_rank = ops.convert_to_tensor(ids).get_shape().ndims
+  if ids_rank is None:
+    ids_rank = array_ops.rank(ids)
+    static = False
+  embs_rank = embs.get_shape().ndims
+  if embs_rank is None:
+    embs_rank = array_ops.rank(embs)
+    static = False
+  return clip_ops.clip_by_norm(
+      embs,
+      max_norm,
+      axes=list(range(ids_rank, embs_rank))
+      if static else math_ops.range(ids_rank, embs_rank))
+
+
+def _embedding_lookup_and_transform(params,
+                                    ids,
+                                    partition_strategy="mod",
+                                    name=None,
+                                    max_norm=None,
+                                    transform_fn=None):
+  """Helper function for embedding_lookup and _compute_sampled_logits.
+
+  This function is a generalization of embedding_lookup that optionally
+  applies a caller-specified transformation to each embedding. This is
+  done through the `transform_fn` argument. If provided, the function is
+  applied to each partitioned tensor of retrieved embeddings, colocated
+  with the embeddings. This function will be called with a single `Tensor`
+  argument of the same type as the `params` tensor and should return a
+  `Tensor`. The shape of the argument will be the same as `params` except
+  for the size of the first dimension. The first dimension of the result's
+  shape must be the same size as the argument's.
 
+  Args:
+    params: See embedding_lookup.
+    ids: See embedding_lookup.
+    partition_strategy: See embedding_lookup.
+    name: See embedding_lookup.
+    max_norm: See embedding_lookup.
+    transform_fn: An optional function to apply to each retrieved embedding.
+
+  Returns:
+    See embedding_lookup for details.
   Raises:
     ValueError: If `params` is empty.
   """
@@ -104,35 +111,25 @@ def embedding_lookup(params, ids, partition_strategy="mod", name=None,
   if not isinstance(params, list):
     params = [params]
 
-  def maybe_normalize(x):
-    """Normalizes the embeddings in x if max_norm is not None."""
-    if max_norm is None:
-      return x
-    static = True
-    ids_rank = ops.convert_to_tensor(ids).get_shape().ndims
-    if ids_rank is None:
-      ids_rank = array_ops.rank(ids)
-      static = False
-    x_rank = x.get_shape().ndims
-    if x_rank is None:
-      x_rank = array_ops.rank(x)
-      static = False
-    return clip_ops.clip_by_norm(
-        x, max_norm,
-        axes=list(range(ids_rank, x_rank)) if static
-        else math_ops.range(ids_rank, x_rank))
-
   with ops.name_scope(name, "embedding_lookup", params + [ids]) as name:
     np = len(params)  # Number of partitions
     # Preserve the resource variable status to avoid accidental dense reads.
-    if not any(isinstance(p, resource_variable_ops.ResourceVariable)
-               for p in params):
+    if not any(
+        isinstance(p, resource_variable_ops.ResourceVariable) for p in params):
       params = ops.convert_n_to_tensor_or_indexed_slices(params, name="params")
-    if np == 1:
+    ids = ops.convert_to_tensor(ids, name="ids")
+    if np == 1 and (transform_fn is None or ids.get_shape().ndims == 1):
       with ops.colocate_with(params[0]):
-        return maybe_normalize(_do_gather(params[0], ids, name=name))
+        result = _gather_and_clip(params[0], ids, max_norm, name=name)
+        if transform_fn is not None:
+          result = transform_fn(result)
+        return result
     else:
-      ids = ops.convert_to_tensor(ids, name="ids")
+      # Flatten the ids. There are two cases where we need to do this.
+      # - There is more than one params tensor.
+      # - There is a transform_fn and ids is not statically known to be 1-D.
+      #   We must flatten in this case because transform_fn expects a flat
+      #   a flat tensor of embeddings.
       flat_ids = array_ops.reshape(ids, [-1])
       original_indices = math_ops.range(array_ops.size(flat_ids))
 
@@ -167,13 +164,12 @@ def embedding_lookup(params, ids, partition_strategy="mod", name=None,
             (flat_ids - extras) // ids_per_partition)
 
         # Emulate a conditional using a boolean indicator tensor
-        is_in_first_extras_partitions = math_ops.cast(
-            p_assignments < extras, flat_ids.dtype)
-        new_ids = (
-            is_in_first_extras_partitions * (
-                flat_ids % (ids_per_partition + 1)) +
-            (1 - is_in_first_extras_partitions) * (
-                (flat_ids - extras) % ids_per_partition))
+        is_in_first_extras_partitions = math_ops.cast(p_assignments < extras,
+                                                      flat_ids.dtype)
+        new_ids = (is_in_first_extras_partitions * (flat_ids %
+                                                    (ids_per_partition + 1)) +
+                   (1 - is_in_first_extras_partitions) *
+                   ((flat_ids - extras) % ids_per_partition))
       else:
         raise ValueError("Unrecognized partition strategy: " +
                          partition_strategy)
@@ -190,36 +186,117 @@ def embedding_lookup(params, ids, partition_strategy="mod", name=None,
       partitioned_result = []
       for p in xrange(np):
         with ops.colocate_with(params[p]):
-          partitioned_result.append(_do_gather(params[p], gather_ids[p]))
+          result = _gather_and_clip(params[p], gather_ids[p], max_norm)
+          if transform_fn is not None:
+            result = transform_fn(result)
+          partitioned_result.append(result)
       # Stitch these back together
-      ret = data_flow_ops.dynamic_stitch(pindices, partitioned_result,
-                                         name=name)
-      # Reshape to reverse the flattening of ids.
-      element_shape = params[0].get_shape()[1:]
-      for p in params[1:]:
-        element_shape = element_shape.merge_with(p.get_shape()[1:])
-      if element_shape.is_fully_defined():
-        ret = array_ops.reshape(ret,
-                                array_ops.concat(
-                                    [array_ops.shape(ids), element_shape], 0))
+      ret = data_flow_ops.dynamic_stitch(
+          pindices, partitioned_result, name=name)
+
+      # Determine the static element shape.
+      if transform_fn is None:
+        element_shape_s = params[0].get_shape()[1:]
+        for p in params[1:]:
+          element_shape_s = element_shape_s.merge_with(p.get_shape()[1:])
       else:
+        element_shape_s = ret.get_shape()[1:]
+
+      # Compute the dynamic element shape.
+      if element_shape_s.is_fully_defined():
+        element_shape_d = element_shape_s
+      elif transform_fn is None:
         # It's important that we compute params[0].shape on the right device
         # to avoid data motion.
         with ops.colocate_with(params[0]):
           params_shape = array_ops.shape(params[0])
-        ret = array_ops.reshape(ret,
-                                array_ops.concat([
-                                    array_ops.shape(ids),
-                                    array_ops.slice(params_shape, [1], [-1])
-                                ], 0))
-      # output shape = ids.shape + params[*].shape[1:]
+        element_shape_d = params_shape[1:]
+      else:
+        element_shape_d = array_ops.shape(ret)[1:]
+
+      # Reshape to reverse the flattening of ids.
+      ret = array_ops.reshape(ret,
+                              array_ops.concat(
+                                  [array_ops.shape(ids), element_shape_d], 0))
+
       # Normally the reshape is sufficient, but setting shape explicitly
-      # teaches shape inference that params[1:].get_shape() matters.
-      ret.set_shape(ids.get_shape().concatenate(element_shape))
-      return maybe_normalize(ret)
+      # teaches shape inference that params[1:].get_shape() matters
+      # (in the case that transform_fn is None).
+      ret.set_shape(ids.get_shape().concatenate(element_shape_s))
+      return ret
+
+
+def embedding_lookup(
+    params,
+    ids,
+    partition_strategy="mod",
+    name=None,
+    validate_indices=True,  # pylint: disable=unused-argument
+    max_norm=None):
+  """Looks up `ids` in a list of embedding tensors.
+
+  This function is used to perform parallel lookups on the list of
+  tensors in `params`.  It is a generalization of
+  @{tf.gather}, where `params` is
+  interpreted as a partitioning of a large embedding tensor.  `params` may be
+  a `PartitionedVariable` as returned by using `tf.get_variable()` with a
+  partitioner.
+
+  If `len(params) > 1`, each element `id` of `ids` is partitioned between
+  the elements of `params` according to the `partition_strategy`.
+  In all strategies, if the id space does not evenly divide the number of
+  partitions, each of the first `(max_id + 1) % len(params)` partitions will
+  be assigned one more id.
+
+  If `partition_strategy` is `"mod"`, we assign each id to partition
+  `p = id % len(params)`. For instance,
+  13 ids are split across 5 partitions as:
+  `[[0, 5, 10], [1, 6, 11], [2, 7, 12], [3, 8], [4, 9]]`
+
+  If `partition_strategy` is `"div"`, we assign ids to partitions in a
+  contiguous manner. In this case, 13 ids are split across 5 partitions as:
+  `[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]]`
+
+  The results of the lookup are concatenated into a dense
+  tensor. The returned tensor has shape `shape(ids) + shape(params)[1:]`.
+
+  Args:
+    params: A single tensor representing the complete embedding tensor,
+      or a list of P tensors all of same shape except for the first dimension,
+      representing sharded embedding tensors.  Alternatively, a
+      `PartitionedVariable`, created by partitioning along dimension 0. Each
+      element must be appropriately sized for the given `partition_strategy`.
+    ids: A `Tensor` with type `int32` or `int64` containing the ids to be looked
+      up in `params`.
+    partition_strategy: A string specifying the partitioning strategy, relevant
+      if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
+      is `"mod"`.
+    name: A name for the operation (optional).
+    validate_indices: DEPRECATED. If this operation is assigned to CPU, values
+      in `indices` are always validated to be within range.  If assigned to GPU,
+      out-of-bound indices result in safe but unspecified behavior, which may
+      include raising an error.
+    max_norm: If provided, embedding values are l2-normalized to the value of
+      max_norm.
 
+  Returns:
+    A `Tensor` with the same type as the tensors in `params`.
 
-def embedding_lookup_sparse(params, sp_ids, sp_weights,
+  Raises:
+    ValueError: If `params` is empty.
+  """
+  return _embedding_lookup_and_transform(
+      params=params,
+      ids=ids,
+      partition_strategy=partition_strategy,
+      name=name,
+      max_norm=max_norm,
+      transform_fn=None)
+
+
+def embedding_lookup_sparse(params,
+                            sp_ids,
+                            sp_weights,
                             partition_strategy="mod",
                             name=None,
                             combiner=None,
@@ -254,7 +331,7 @@ def embedding_lookup_sparse(params, sp_ids, sp_weights,
       "mean" is the weighted sum divided by the total weight.
       "sqrtn" is the weighted sum divided by the square root of the sum of the
       squares of the weights.
-    max_norm: If not None, each embedding is normalized to have l2 norm equal
+    max_norm: If provided, each embedding is normalized to have l2 norm equal
       to max_norm before combining.
 
   Returns:
@@ -349,8 +426,9 @@ def embedding_lookup_sparse(params, sp_ids, sp_weights,
       # Set the weight shape, since after reshaping to bcast_weights_shape,
       # the shape becomes None.
       if embeddings.get_shape().ndims is not None:
-        weights.set_shape(orig_weights_shape.concatenate(
-            [1 for _ in range(embeddings.get_shape().ndims - 1)]))
+        weights.set_shape(
+            orig_weights_shape.concatenate(
+                [1 for _ in range(embeddings.get_shape().ndims - 1)]))
 
       embeddings *= weights
 
@@ -371,14 +449,14 @@ def embedding_lookup_sparse(params, sp_ids, sp_weights,
     else:
       assert idx is not None
       if combiner == "sum":
-        embeddings = math_ops.sparse_segment_sum(embeddings, idx, segment_ids,
-                                                 name=name)
+        embeddings = math_ops.sparse_segment_sum(
+            embeddings, idx, segment_ids, name=name)
       elif combiner == "mean":
-        embeddings = math_ops.sparse_segment_mean(embeddings, idx, segment_ids,
-                                                  name=name)
+        embeddings = math_ops.sparse_segment_mean(
+            embeddings, idx, segment_ids, name=name)
       elif combiner == "sqrtn":
-        embeddings = math_ops.sparse_segment_sqrt_n(embeddings, idx,
-                                                    segment_ids, name=name)
+        embeddings = math_ops.sparse_segment_sqrt_n(
+            embeddings, idx, segment_ids, name=name)
       else:
         assert False, "Unrecognized combiner"
 
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index bd8a5c86acc7501b30489c78b714293ee14763c1..e073dbc640cdc2e9dd8e0b1b7840cfa0ce6b3e9d 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -34,6 +34,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
@@ -45,6 +46,7 @@ from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import spectral_grad  # pylint: disable=unused-import
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.platform import tf_logging as logging
 
 
@@ -915,13 +917,11 @@ def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False,
     aggregation_method: See `gradients()` documentation for details.
 
   Returns:
-    A list of Hessian matrices of `sum(y)` for each `x` in `xs`.
+    A list of Hessian matrices of `sum(ys)` for each `x` in `xs`.
 
   Raises:
     LookupError: if one of the operations between `xs` and `ys` does not
       have a registered gradient function.
-    ValueError: if the arguments are invalid or not supported. Currently,
-      this function only supports one-dimensional `x` in `xs`.
   """
   xs = _AsList(xs)
   kwargs = {
@@ -929,28 +929,30 @@ def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False,
       'gate_gradients': gate_gradients,
       'aggregation_method': aggregation_method
     }
-  # Compute a hessian matrix for each x in xs
+  # Compute first-order derivatives and iterate for each x in xs.
   hessians = []
-  for i, x in enumerate(xs):
-    # Check dimensions
-    ndims = x.get_shape().ndims
-    if ndims is None:
-      raise ValueError('Cannot compute Hessian because the dimensionality of '
-                       'element number %d of `xs` cannot be determined' % i)
-    elif ndims != 1:
-      raise ValueError('Computing hessians is currently only supported for '
-                       'one-dimensional tensors. Element number %d of `xs` has '
-                       '%d dimensions.' % (i, ndims))
-    with ops.name_scope(name + '_first_derivative'):
-      # Compute the partial derivatives of the input with respect to all
-      # elements of `x`
-      _gradients = gradients(ys, x, **kwargs)[0]
-      # Unpack the gradients into a list so we can take derivatives with
-      # respect to each element
-      _gradients = array_ops.unstack(_gradients)
-    with ops.name_scope(name + '_second_derivative'):
-      # Compute the partial derivatives with respect to each element of the list
-      _hess = [gradients(_gradient, x, **kwargs)[0] for _gradient in _gradients]
-      # Pack the list into a matrix and add to the list of hessians
-      hessians.append(array_ops.stack(_hess, name=name))
+  _gradients = gradients(ys, xs, **kwargs)
+  for i, _gradient, x in zip(range(len(xs)), _gradients, xs):
+    # Ensure that x is a vector.
+    check_rank = check_ops.assert_rank(
+      x, 1, message='Cannot compute Hessian because element %d of `xs` does '
+      'not have rank one.' % i
+    )
+    with ops.control_dependencies([check_rank]):
+      # Declare an iterator and tensor array loop variables for the gradients.
+      n = array_ops.size(x)
+      loop_vars = [
+        array_ops.constant(0, dtypes.int32),
+        tensor_array_ops.TensorArray(x.dtype, n)
+      ]
+      # Iterate over all elements of the gradient and compute second order
+      # derivatives.
+      _, hessian = control_flow_ops.while_loop(
+          lambda j, _: j < n,
+          lambda j, result: (j + 1,
+                             result.write(j, gradients(_gradient[j], x)[0])),
+          loop_vars
+      )
+
+      hessians.append(hessian.stack())
   return hessians
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
index 9aef6bffdea0b169cf09eeda0362ecbb84d26b9b..519731643268e9b19f0aa85cb165476e87f5393a 100644
--- a/tensorflow/python/ops/hidden_ops.txt
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -6,6 +6,7 @@ ConcatOffset
 Concat
 ConcatV2
 Const
+DebugGradientIdentity
 EditDistance
 ExpandDims
 ListDiff
@@ -14,6 +15,7 @@ MirrorPadGrad
 OneHot
 Pack
 Pad
+PadV2
 ParallelConcat
 Placeholder
 RefIdentity
@@ -111,6 +113,10 @@ Stack
 StackClose
 StackPop
 StackPush
+StackV2
+StackCloseV2
+StackPopV2
+StackPushV2
 TensorArray
 TensorArrayClose
 TensorArrayCloseV2
@@ -157,6 +163,8 @@ ResizeBilinearGrad
 ResizeNearestNeighborGrad
 AdjustContrastv2
 ScaleImageGrad
+SampleDistortedBoundingBox
+SampleDistortedBoundingBoxV2
 
 # io_ops
 FixedLengthRecordReader
@@ -282,6 +290,7 @@ MaxPool3DGradGrad
 ReluGrad
 Relu6Grad
 EluGrad
+SeluGrad
 SoftplusGrad
 SoftsignGrad
 TopK
@@ -295,6 +304,8 @@ Softmax
 LogSoftmax
 FractionalAvgPoolGrad
 FractionalMaxPoolGrad
+InTopK
+InTopKV2
 
 # parsing_ops
 ParseExample
diff --git a/tensorflow/python/ops/image_ops.py b/tensorflow/python/ops/image_ops.py
index 51d0276140200ecfbe97379b292aca6efa447719..75c67dcb3c2ad34b53a86970562fe770f7fd4e69 100644
--- a/tensorflow/python/ops/image_ops.py
+++ b/tensorflow/python/ops/image_ops.py
@@ -60,7 +60,6 @@ See the @{$python/image} guide.
 @@per_image_standardization
 @@draw_bounding_boxes
 @@non_max_suppression
-@@non_max_suppression_v2
 @@sample_distorted_bounding_box
 @@total_variation
 """
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 65a1399c5b8b115fa2277fad84c384e0b5c74755..e39cabde31ee564a77738225ac54b735a65ed682 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -47,6 +47,7 @@ ops.NotDifferentiable('RGBToHSV')
 ops.NotDifferentiable('HSVToRGB')
 ops.NotDifferentiable('DrawBoundingBoxes')
 ops.NotDifferentiable('SampleDistortedBoundingBox')
+ops.NotDifferentiable('SampleDistortedBoundingBoxV2')
 # TODO(bsteiner): Implement the gradient function for extract_glimpse
 # TODO(b/31222613): This op may be differentiable, and there may be
 # latent bugs here.
@@ -1472,3 +1473,103 @@ def total_variation(images, name=None):
                math_ops.reduce_sum(math_ops.abs(pixel_dif2), axis=sum_axis))
 
   return tot_var
+
+
+def sample_distorted_bounding_box(image_size, bounding_boxes, seed=None,
+                                  seed2=None, min_object_covered=None,
+                                  aspect_ratio_range=None, area_range=None,
+                                  max_attempts=None,
+                                  use_image_if_no_bounding_boxes=None,
+                                  name=None):
+  """Generate a single randomly distorted bounding box for an image.
+
+  Bounding box annotations are often supplied in addition to ground-truth labels
+  in image recognition or object localization tasks. A common technique for
+  training such a system is to randomly distort an image while preserving
+  its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+  localization of an object, i.e. bounding box, given an `image_size`,
+  `bounding_boxes` and a series of constraints.
+
+  The output of this Op is a single bounding box that may be used to crop the
+  original image. The output is returned as 3 tensors: `begin`, `size` and
+  `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+  image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+  what the bounding box looks like.
+
+  Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+  bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+  height of the underlying image.
+
+  For example,
+
+  ```python
+      # Generate a single distorted bounding box.
+      begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+          tf.shape(image),
+          bounding_boxes=bounding_boxes)
+
+      # Draw the bounding box in an image summary.
+      image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+                                                    bbox_for_draw)
+      tf.image_summary('images_with_box', image_with_box)
+
+      # Employ the bounding box to distort the image.
+      distorted_image = tf.slice(image, begin, size)
+  ```
+
+  Note that if no bounding box information is available, setting
+  `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+  bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+  false and no bounding boxes are supplied, an error is raised.
+
+  Args:
+    image_size: A `Tensor`. Must be one of the following types: `uint8`, `int8`, `int16`, `int32`, `int64`.
+      1-D, containing `[height, width, channels]`.
+    bounding_boxes: A `Tensor` of type `float32`.
+      3-D with shape `[batch, N, 4]` describing the N bounding boxes
+      associated with the image.
+    seed: An optional `int`. Defaults to `0`.
+      If either `seed` or `seed2` are set to non-zero, the random number
+      generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+      seed.
+    seed2: An optional `int`. Defaults to `0`.
+      A second seed to avoid seed collision.
+    min_object_covered: An optional `float`. Defaults to `0.1`.
+      The cropped area of the image must contain at least this
+      fraction of any bounding box supplied. The value of this parameter should be
+      non-negative. In the case of 0, the cropped area does not need to overlap
+      any of the bounding boxes supplied.
+    aspect_ratio_range: An optional list of `floats`. Defaults to `[0.75, 1.33]`.
+      The cropped area of the image must have an aspect ratio =
+      width / height within this range.
+    area_range: An optional list of `floats`. Defaults to `[0.05, 1]`.
+      The cropped area of the image must contain a fraction of the
+      supplied image within in this range.
+    max_attempts: An optional `int`. Defaults to `100`.
+      Number of attempts at generating a cropped region of the image
+      of the specified constraints. After `max_attempts` failures, return the entire
+      image.
+    use_image_if_no_bounding_boxes: An optional `bool`. Defaults to `False`.
+      Controls behavior if no bounding boxes supplied.
+      If true, assume an implicit bounding box covering the whole input. If false,
+      raise an error.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tuple of `Tensor` objects (begin, size, bboxes).
+
+    begin: A `Tensor`. Has the same type as `image_size`. 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+      `tf.slice`.
+    size: A `Tensor`. Has the same type as `image_size`. 1-D, containing `[target_height, target_width, -1]`. Provide as input to
+      `tf.slice`.
+    bboxes: A `Tensor` of type `float32`. 3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+      Provide as input to `tf.image.draw_bounding_boxes`.
+  """
+  with ops.name_scope(name, 'sample_distorted_bounding_box'):
+    return gen_image_ops._sample_distorted_bounding_box_v2(image_size,
+                bounding_boxes, seed=seed,
+                seed2=seed2, min_object_covered=min_object_covered,
+                aspect_ratio_range=aspect_ratio_range, area_range=area_range,
+                max_attempts=max_attempts,
+                use_image_if_no_bounding_boxes=use_image_if_no_bounding_boxes,
+                name=name)
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 5588d18ef1d75e38bd7a91f02c50c508f22044ad..a7a2de87f1fafa9183d4bdb9a577db67e5d99de5 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -1501,6 +1501,7 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
           image_size_np, shape=image_size_np.shape)
       bounding_box_tf = constant_op.constant(
           bounding_box_np, dtype=dtypes.float32, shape=bounding_box_np.shape)
+
       begin, size, _ = image_ops.sample_distorted_bounding_box(
           image_size=image_size_tf,
           bounding_boxes=bounding_box_tf,
@@ -1520,6 +1521,27 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
         area_ratios.append(area / original_area)
         fraction_object_covered.append(float(np.sum(y_tf)) / bounding_box_area)
 
+      # min_object_covered as tensor
+      min_object_covered_placeholder = array_ops.placeholder(dtypes.float32)
+      begin, size, _ = image_ops.sample_distorted_bounding_box(
+          image_size=image_size_tf,
+          bounding_boxes=bounding_box_tf,
+          min_object_covered=min_object_covered_placeholder,
+          aspect_ratio_range=aspect_ratio_range,
+          area_range=area_range)
+      y = array_ops.strided_slice(image_tf, begin, begin + size)
+
+      for _ in xrange(num_iter):
+        y_tf = y.eval(feed_dict={min_object_covered_placeholder: min_object_covered})
+        crop_height = y_tf.shape[0]
+        crop_width = y_tf.shape[1]
+        aspect_ratio = float(crop_width) / float(crop_height)
+        area = float(crop_width * crop_height)
+
+        aspect_ratios.append(aspect_ratio)
+        area_ratios.append(area / original_area)
+        fraction_object_covered.append(float(np.sum(y_tf)) / bounding_box_area)
+
     # Ensure that each entry is observed within 3 standard deviations.
     # num_bins = 10
     # aspect_ratio_hist, _ = np.histogram(aspect_ratios,
@@ -1617,6 +1639,18 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([3], end.get_shape().as_list())
       self.assertAllEqual([1, 1, 4], bbox_for_drawing.get_shape().as_list())
 
+      begin, end, bbox_for_drawing = image_ops.sample_distorted_bounding_box(
+          image_size=image_size,
+          bounding_boxes=bounding_box,
+          min_object_covered=array_ops.placeholder(dtypes.float32),
+          aspect_ratio_range=(0.75, 1.33),
+          area_range=(0.05, 1.0))
+
+      # Test that the shapes are correct.
+      self.assertAllEqual([3], begin.get_shape().as_list())
+      self.assertAllEqual([3], end.get_shape().as_list())
+      self.assertAllEqual([1, 1, 4], bbox_for_drawing.get_shape().as_list())
+
 
 class ResizeImagesTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 1e2f999995756ad4b4c432ddfc31c39254818622..42b4f952bbcda54b35ad71b531f8c9cb09137bf8 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -41,7 +41,6 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import math_ops
 
 
 class Initializer(object):
diff --git a/tensorflow/python/ops/io_ops.py b/tensorflow/python/ops/io_ops.py
index 0b1a77969a00d8ae392a25bf432645619f7a24c0..975494e45f0ec074d23e6873ededf7ad4c42cb90 100644
--- a/tensorflow/python/ops/io_ops.py
+++ b/tensorflow/python/ops/io_ops.py
@@ -26,6 +26,7 @@ See the @{$python/io_ops} guide.
 @@WholeFileReader
 @@IdentityReader
 @@TFRecordReader
+@@LMDBReader
 @@FixedLengthRecordReader
 @@decode_csv
 @@decode_raw
@@ -396,7 +397,8 @@ class FixedLengthRecordReader(ReaderBase):
                header_bytes=None,
                footer_bytes=None,
                hop_bytes=None,
-               name=None):
+               name=None,
+               encoding=None):
     """Create a FixedLengthRecordReader.
 
     Args:
@@ -405,12 +407,14 @@ class FixedLengthRecordReader(ReaderBase):
       footer_bytes: An optional int. Defaults to 0.
       hop_bytes: An optional int. Defaults to 0.
       name: A name for the operation (optional).
+      encoding: The type of encoding for the file. Defaults to none.
     """
     rr = gen_io_ops._fixed_length_record_reader_v2(
         record_bytes=record_bytes,
         header_bytes=header_bytes,
         footer_bytes=footer_bytes,
         hop_bytes=hop_bytes,
+        encoding=encoding,
         name=name)
     super(FixedLengthRecordReader, self).__init__(rr)
 
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index 43af95e2784a1372ed4495a00382daf8a27e962f..e0cf8bc5e1485100d867bfb1e6be01c55f526816 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -240,7 +240,7 @@ class HashTable(InitializableLookupTableBase):
       tf.contrib.lookup.KeyValueTensorInitializer(keys, values), -1)
   out = table.lookup(input_tensor).
   table.init.run()
-  print out.eval()
+  print(out.eval())
   ```
   """
 
@@ -691,8 +691,8 @@ class IdTableWithHashBuckets(LookupInterface):
   - emerson -> 0
   - lake -> 1
   - palmer -> 2
-  - <other term> -> bucket id between 3 and 3 + num_oov_buckets, calculated by:
-    hash(<term>) % num_oov_buckets + vocab_size
+  - <other term> -> bucket id between 3 and 3 + num_oov_buckets - 1, calculated
+    by: hash(<term>) % num_oov_buckets + vocab_size
 
   If input_tensor is ["emerson", "lake", "palmer", "king", "crimson"],
   the lookup result is [0, 1, 2, 4, 7]
@@ -709,7 +709,7 @@ class IdTableWithHashBuckets(LookupInterface):
       num_oov_buckets)
   out = table.lookup(input_tensor).
   table.init.run()
-  print out.eval()
+  print(out.eval())
   ```
 
   The hash function used for generating out-of-vocabulary buckets ID is handled
@@ -870,7 +870,8 @@ def index_table_from_file(vocabulary_file=None,
   Any lookup of an out-of-vocabulary token will return a bucket ID based on its
   hash if `num_oov_buckets` is greater than zero. Otherwise it is assigned the
   `default_value`.
-  The bucket ID range is `[vocabulary size, vocabulary size + num_oov_buckets]`.
+  The bucket ID range is
+  `[vocabulary size, vocabulary size + num_oov_buckets - 1]`.
 
   The underlying table must be initialized by calling
   `tf.tables_initializer.run()` or `table.init.run()` once.
@@ -977,7 +978,7 @@ def index_table_from_tensor(vocabulary_list,
   Any lookup of an out-of-vocabulary token will return a bucket ID based on its
   hash if `num_oov_buckets` is greater than zero. Otherwise it is assigned the
   `default_value`.
-  The bucket ID range is `[mapping size, mapping size + num_oov_buckets]`.
+  The bucket ID range is `[mapping size, mapping size + num_oov_buckets - 1]`.
 
   The underlying table must be initialized by calling
   `tf.tables_initializer.run()` or `table.init.run()` once.
@@ -988,9 +989,9 @@ def index_table_from_tensor(vocabulary_list,
   Sample Usages:
 
   ```python
-  vocabulary_list = t.constant(["emerson", "lake", "palmer")
+  vocabulary_list = tf.constant(["emerson", "lake", "palmer"])
   table = tf.contrib.lookup.index_table_from_tensor(
-      vocabulary_list=vocabulary_list, num_oov_buckets=1, default_value=-1)
+      mapping=vocabulary_list, num_oov_buckets=1, default_value=-1)
   features = tf.constant(["emerson", "lake", "and", "palmer"])
   ids = table.lookup(features)
   ...
@@ -1160,7 +1161,7 @@ def index_to_string_table_from_tensor(vocabulary_list,
   Sample Usages:
 
   ```python
-  vocabulary_list = t.constant(["emerson", "lake", "palmer")
+  vocabulary_list = tf.constant(["emerson", "lake", "palmer"])
   indices = tf.constant([1, 5], tf.int64)
   table = tf.contrib.lookup.index_to_string_table_from_tensor(
       vocabulary_list, default_value="UNKNOWN")
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index c913d7a37934598d3fad581a07e58184e61ebe6d..438e2712757db4fbb9f4790eb451e9c7be7062fb 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -531,7 +531,7 @@ def mean_squared_error(
     predictions = math_ops.to_float(predictions)
     labels = math_ops.to_float(labels)
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
-    losses = math_ops.square(math_ops.subtract(predictions, labels))
+    losses = math_ops.squared_difference(predictions, labels)
     return compute_weighted_loss(
         losses, weights, scope, loss_collection, reduction=reduction)
 
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index a0f505e47b222fbd9c0e4b4b62f43be81efbf141..b8266a527d2b4a1febf35defc287664d442b9865 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -122,8 +122,10 @@ def _ProdGrad(op, grad):
   # so we need to cast here.  We put all the shape-related ops on CPU to avoid
   # copying back and forth, and since listdiff is CPU only.
   with ops.device("/cpu:0"):
+    rank = array_ops.rank(op.inputs[0])
+    reduction_indices = (reduction_indices + rank) % rank
     reduced = math_ops.cast(reduction_indices, dtypes.int32)
-    idx = math_ops.range(0, array_ops.rank(op.inputs[0]))
+    idx = math_ops.range(0, rank)
     other, _ = array_ops.setdiff1d(idx, reduced)
     perm = array_ops.concat([reduced, other], 0)
     reduced_num = math_ops.reduce_prod(array_ops.gather(input_shape, reduced))
@@ -397,6 +399,36 @@ def _TanhGrad(op, grad):
     return gen_math_ops._tanh_grad(y, grad)
 
 
+@ops.RegisterGradient("Asinh")
+def _AsinhGrad(op, grad):
+  """Returns grad * 1/cosh(y)."""
+  y = op.outputs[0]
+  with ops.control_dependencies([grad.op]):
+    y = math_ops.conj(y)
+    return grad / math_ops.cosh(y)
+
+
+@ops.RegisterGradient("Acosh")
+def _AcoshGrad(op, grad):
+  """Returns grad * 1/sinh(y)."""
+  y = op.outputs[0]
+  with ops.control_dependencies([grad.op]):
+    y = math_ops.conj(y)
+    return grad / math_ops.sinh(y)
+
+
+@ops.RegisterGradient("Atanh")
+def _AtanhGrad(op, grad):
+  """Returns grad * 1/ (1 - x^2)."""
+  x = op.inputs[0]
+  with ops.control_dependencies([grad.op]):
+    x = math_ops.conj(x)
+    x2 = math_ops.square(x)
+    one = constant_op.constant(1, dtype=grad.dtype)
+    inv = math_ops.reciprocal(math_ops.subtract(one, x2))
+    return grad * inv
+
+
 @ops.RegisterGradient("TanhGrad")
 def _TanhGradGrad(op, grad):
   with ops.control_dependencies([grad.op]):
diff --git a/tensorflow/python/ops/math_grad_test.py b/tensorflow/python/ops/math_grad_test.py
index 1fa15957b0911a68c90aa0231e694d26457a3c2d..da3e0d72949d26ff7ce3c333cda3aeacc21c4208 100644
--- a/tensorflow/python/ops/math_grad_test.py
+++ b/tensorflow/python/ops/math_grad_test.py
@@ -113,6 +113,29 @@ class MinOrMaxGradientTest(test.TestCase):
       self.assertLess(error, 1e-4)
 
 
+class ProdGradientTest(test.TestCase):
+
+  def testProdGradient(self):
+    inputs = constant_op.constant([[1., 2.], [3., 4.]],
+                                  dtype=dtypes.float32)
+    outputs = math_ops.reduce_prod(inputs)
+    with self.test_session():
+      error = gradient_checker.compute_gradient_error(
+          inputs, inputs.get_shape().as_list(),
+          outputs, outputs.get_shape().as_list())
+      self.assertLess(error, 1e-4)
+
+  def testProdGradientForNegativeAxis(self):
+    inputs = constant_op.constant([[1., 2.], [3., 4.]],
+                                  dtype=dtypes.float32)
+    outputs = math_ops.reduce_prod(inputs, -1)
+    with self.test_session():
+      error = gradient_checker.compute_gradient_error(
+          inputs, inputs.get_shape().as_list(),
+          outputs, outputs.get_shape().as_list())
+      self.assertLess(error, 1e-4)
+
+
 class SegmentMinOrMaxGradientTest(test.TestCase):
 
   def testSegmentMinGradient(self):
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 16aa5d82476ba4e12fd10c1b2aa9c9f9d59cff59..3e91ec068415bb5598400057671392936be7e54d 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -47,6 +47,9 @@ See the @{$python/math_ops} guide.
 @@log1p
 @@sinh
 @@cosh
+@@asinh
+@@acosh
+@@atanh
 @@ceil
 @@floor
 @@maximum
@@ -165,42 +168,58 @@ from tensorflow.python.ops.gen_math_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated
+from tensorflow.python.util.deprecation import deprecated_args
 
 # Aliases for some automatically-generated names.
 linspace = gen_math_ops.lin_space
 
+arg_max = deprecated(None, "Use `argmax` instead")(arg_max)  # pylint: disable=used-before-assignment
+arg_min = deprecated(None, "Use `argmin` instead")(arg_min)  # pylint: disable=used-before-assignment
+
+
+def _set_doc(doc):
+  def _decorator(func):
+    func.__doc__ = doc
+    return func
+  return _decorator
+
 
 # pylint: disable=redefined-builtin
-# TODO(aselle): deprecate arg_max
-def argmax(input, axis=None, name=None, dimension=None):
+@deprecated_args(None, "Use the `axis` argument instead", "dimension")
+@_set_doc(gen_math_ops.arg_max.__doc__
+          .replace("dimensions", "axes")
+          .replace("dimension", "axis"))
+def argmax(input,
+           axis=None,
+           name=None,
+           dimension=None,
+           output_type=dtypes.int64):
   if dimension is not None:
     if axis is not None:
       raise ValueError("Cannot specify both 'axis' and 'dimension'")
     axis = dimension
   elif axis is None:
     axis = 0
-  return gen_math_ops.arg_max(input, axis, name)
-
-
-argmax.__doc__ = (gen_math_ops.arg_max.__doc__.replace("dimensions",
-                                                       "axes").replace(
-                                                           "dimension", "axis"))
-
-
-# TODO(aselle:deprecate arg_min)
-def argmin(input, axis=None, name=None, dimension=None):
+  return gen_math_ops.arg_max(input, axis, name=name, output_type=output_type)
+
+
+@deprecated_args(None, "Use the `axis` argument instead", "dimension")
+@_set_doc(gen_math_ops.arg_min.__doc__
+          .replace("dimensions", "axes")
+          .replace("dimension", "axis"))
+def argmin(input,
+           axis=None,
+           name=None,
+           dimension=None,
+           output_type=dtypes.int64):
   if dimension is not None:
     if axis is not None:
       raise ValueError("Cannot specify both 'axis' and 'dimension'")
     axis = dimension
   elif axis is None:
     axis = 0
-  return gen_math_ops.arg_min(input, axis, name)
-
+  return gen_math_ops.arg_min(input, axis, name=name, output_type=output_type)
 
-argmin.__doc__ = (gen_math_ops.arg_min.__doc__.replace("dimensions",
-                                                       "axes").replace(
-                                                           "dimension", "axis"))
 
 # pylint: enable=redefined-builtin
 
@@ -1240,7 +1259,8 @@ def reduce_sum(input_tensor,
   Args:
     input_tensor: The tensor to reduce. Should have numeric type.
     axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions.
+      reduces all dimensions. Must be in the range
+      `[-rank(input_tensor), rank(input_tensor))`.
     keep_dims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1294,7 +1314,8 @@ def count_nonzero(input_tensor,
   Args:
     input_tensor: The tensor to reduce. Should be of numeric type, or `bool`.
     axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions.
+      reduces all dimensions. Must be in the range
+      `[-rank(input_tensor), rank(input_tensor))`.
     keep_dims: If true, retains reduced dimensions with length 1.
     dtype: The output dtype; defaults to `tf.int64`.
     name: A name for the operation (optional).
@@ -1344,7 +1365,8 @@ def reduce_mean(input_tensor,
   Args:
     input_tensor: The tensor to reduce. Should have numeric type.
     axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions.
+      reduces all dimensions. Must be in the range
+      `[-rank(input_tensor), rank(input_tensor))`.
     keep_dims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1381,7 +1403,8 @@ def reduce_prod(input_tensor,
   Args:
     input_tensor: The tensor to reduce. Should have numeric type.
     axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions.
+      reduces all dimensions. Must be in the range
+      `[-rank(input_tensor), rank(input_tensor))`.
     keep_dims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1418,7 +1441,8 @@ def reduce_min(input_tensor,
   Args:
     input_tensor: The tensor to reduce. Should have numeric type.
     axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions.
+      reduces all dimensions. Must be in the range
+      `[-rank(input_tensor), rank(input_tensor))`.
     keep_dims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1455,7 +1479,8 @@ def reduce_max(input_tensor,
   Args:
     input_tensor: The tensor to reduce. Should have numeric type.
     axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions.
+      reduces all dimensions. Must be in the range
+      `[-rank(input_tensor), rank(input_tensor))`.
     keep_dims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1502,7 +1527,8 @@ def reduce_all(input_tensor,
   Args:
     input_tensor: The boolean tensor to reduce.
     axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions.
+      reduces all dimensions. Must be in the range
+      `[-rank(input_tensor), rank(input_tensor))`.
     keep_dims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1549,7 +1575,8 @@ def reduce_any(input_tensor,
   Args:
     input_tensor: The boolean tensor to reduce.
     axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions.
+      reduces all dimensions. Must be in the range
+      `[-rank(input_tensor), rank(input_tensor))`.
     keep_dims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1602,7 +1629,8 @@ def reduce_logsumexp(input_tensor,
   Args:
     input_tensor: The tensor to reduce. Should have numeric type.
     axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions.
+      reduces all dimensions. Must be in the range
+      `[-rank(input_tensor), rank(input_tensor))`.
     keep_dims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1683,8 +1711,8 @@ def matmul(a,
            name=None):
   """Multiplies matrix `a` by matrix `b`, producing `a` * `b`.
 
-  The inputs must, following any transpositions, be tensors of rank >= 2 
-  where the inner 2 dimensions specify valid matrix multiplication arguments, 
+  The inputs must, following any transpositions, be tensors of rank >= 2
+  where the inner 2 dimensions specify valid matrix multiplication arguments,
   and any further outer dimensions match.
 
   Both matrices must be of the same type. The supported types are:
@@ -2152,7 +2180,8 @@ def cumsum(x, axis=0, exclusive=False, reverse=False, name=None):
     x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
        `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
        `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-    axis: A `Tensor` of type `int32` (default: 0).
+    axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+      `[-rank(x), rank(x))`.
     exclusive: If `True`, perform exclusive cumsum.
     reverse: A `bool` (default: False).
     name: A name for the operation (optional).
@@ -2202,7 +2231,8 @@ def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
     x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
        `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
        `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-    axis: A `Tensor` of type `int32` (default: 0).
+    axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+      `[-rank(x), rank(x))`.
     exclusive: If `True`, perform exclusive cumprod.
     reverse: A `bool` (default: False).
     name: A name for the operation (optional).
@@ -2308,8 +2338,8 @@ def tensordot(a, b, axes, name=None):
   Example 2: When `a` and `b` are matrices (order 2), the case
   `axes = [[1], [0]]` is equivalent to matrix multiplication.
 
-  Example 3: Suppose that \\(a_ijk\\) and \\(b_lmn\\) represent two
-  tensors of order 3. Then, `contract(a, b, [0], [2])` is the order 4 tensor
+  Example 3: Suppose that \\(a_{ijk}\\) and \\(b_{lmn}\\) represent two
+  tensors of order 3. Then, `contract(a, b, [[0], [2]])` is the order 4 tensor
   \\(c_{jklm}\\) whose entry
   corresponding to the indices \\((j,k,l,m)\\) is given by:
 
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index 96836037856c65be67b9d7c0b5218dd996b7a106..617d2305bd87df5eda7374b6fa8756ef6fd5553a 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -224,7 +224,6 @@ class ApproximateEqualTest(test_util.TensorFlowTestCase):
 
 class ScalarMulTest(test_util.TensorFlowTestCase):
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testAcceptsRefs(self):
     var = variables.Variable(10)
     result = math_ops.scalar_mul(3, var)
@@ -327,7 +326,6 @@ class DivAndModTest(test_util.TensorFlowTestCase):
     divs = np.arange(-3, 0, .25).reshape(1, 12)
     return nums, divs
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testFloorModInt(self):
     nums, divs = self.intTestData()
     with self.test_session():
@@ -337,7 +335,6 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       np_result = nums % divs
       self.assertAllEqual(tf_result, np_result)
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testFloorModFloat(self):
     nums, divs = self.floatTestData()
     with self.test_session():
@@ -349,7 +346,6 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       #               % array_ops.constant(divs)).eval()
       # self.assertAllEqual(tf2_result, tf_result)
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testTruncateModInt(self):
     nums, divs = self.intTestData()
     with self.test_session():
@@ -357,7 +353,6 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       np_result = np.fmod(nums, divs)
       self.assertAllEqual(tf_result, np_result)
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testTruncateModFloat(self):
     nums, divs = self.floatTestData()
     with self.test_session():
@@ -365,7 +360,6 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       np_result = np.fmod(nums, divs)
       self.assertAllEqual(tf_result, np_result)
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testDivideInt(self):
     nums, divs = self.intTestData()
     with self.test_session():
@@ -377,14 +371,12 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       #               // array_ops.constant(divs)).eval()
       # self.assertAllEqual(tf2_result, tf_result)
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testDivideName(self):
     with self.test_session():
       op = math_ops.divide(
           array_ops.constant(3), array_ops.constant(4), name="my_cool_divide")
       self.assertEqual(op.name, "my_cool_divide:0")
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testRealDiv(self):
     nums, divs = self.floatTestData()
     with self.test_session():
@@ -392,14 +384,12 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       np_result = np.divide(nums, divs)
       self.assertAllEqual(tf_result, np_result)
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testComplexDiv(self):
     foo = array_ops.constant([1. + 3.j])
     with self.test_session():
       _ = math_ops.divide(foo, 1.).eval()
       _ = math_ops.div(foo, 2.).eval()
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testFloorDivGrad(self):
     with self.test_session():
       a = variables.Variable(2.)
@@ -414,7 +404,6 @@ class DivAndModTest(test_util.TensorFlowTestCase):
         self.assertAllEqual([None if x is None else x.eval()
                              for x in c_grad], [None, None])
 
-  @test_util.disable_c_api  # Operation._set_device doesn't work with C API
   def testConsistent(self):
     nums, divs = self.intTestData()
     with self.test_session():
diff --git a/tensorflow/python/ops/matmul_benchmark.py b/tensorflow/python/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..55c575162ab24727a1d50ec6668321aade6cfef5
--- /dev/null
+++ b/tensorflow/python/ops/matmul_benchmark.py
@@ -0,0 +1,143 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmark for Matmul operator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def build_graph(device, n, m, k, transpose_a, transpose_b, dtype):
+  """Build a graph containing a sequence of matmul operations.
+
+  Args:
+    device: String, the device to run on.
+    n: tensor A's first dimension size.
+    m: tensor A's second dimension size.
+    k: tensor B's second dimension size.
+    transpose_a: boolean value to show if tensor A is transposed.
+    transpose_b: boolean value to show if tensor B is transposed.
+    dtype: numpy data type of the input tensor.
+
+  Returns:
+    A matmul operation to run()
+  """
+  with ops.device('/%s:0' % device):
+    if not transpose_a:
+      x = variables.Variable(random_ops.random_uniform([n, m], dtype=dtype))
+    else:
+      x = variables.Variable(random_ops.random_uniform([m, n], dtype=dtype))
+    if not transpose_b:
+      y = variables.Variable(random_ops.random_uniform([m, k], dtype=dtype))
+    else:
+      y = variables.Variable(random_ops.random_uniform([k, m], dtype=dtype))
+
+    z = math_ops.matmul(x, y, transpose_a=transpose_a, transpose_b=transpose_b)
+    return control_flow_ops.group(z)
+
+
+class MatmulBenchmark(test.Benchmark):
+  """Benchmark matmul!"""
+
+  def run_graph(self, device, n, m, k, transpose_a, transpose_b, num_iters,
+                dtype):
+    """Run the graph and print its execution time.
+
+    Args:
+      device: String, the device to run on.
+      n: tensor A's first dimension size.
+      m: tensor A's second dimension size.
+      k: tensor B's second dimension size.
+      transpose_a: boolean value to show if tensor A is transposed.
+      transpose_b: boolean value to show if tensor B is transposed.
+      num_iters: number of iterations to run the benchmark.
+      dtype: numpy data type of the input tensor.
+
+    Returns:
+      The duration of the run in seconds.
+    """
+    graph = ops.Graph()
+    with graph.as_default():
+      output = build_graph(device, n, m, k, transpose_a, transpose_b, dtype)
+      with session_lib.Session(graph=graph) as session:
+        variables.global_variables_initializer().run()
+        for _ in range(500):
+          session.run(output)
+        start_time = time.time()
+        for _ in range(num_iters):
+          session.run(output)
+        duration = (time.time() - start_time)
+        num_items = n * m * k * 2
+        throughput = num_items * num_iters / duration / 1e9
+        print('%s %s input_info:%s %d %.4fsec, %.4fGitems/s.' %
+              (device, str(dtype), str(n) + 'x' + str(m) + 'x' + str(k) + ',ta:'
+               + str(transpose_a) + '.tb:' + str(transpose_b), num_iters,
+               duration, throughput))
+
+    name_template = ('matmul_{device}_{dtype}_input_info_{inputinfo}')
+
+    self.report_benchmark(
+        name=name_template.format(
+            device=device,
+            dtype=str(dtype).replace(' ', ''),
+            inputinfo=str(n) + 'x' + str(m) + 'x' + str(k) + ',ta:' +
+            str(transpose_a) + '.tb:' + str(transpose_b)).replace(' ', ''),
+        iters=num_iters,
+        wall_time=duration)
+    return duration
+
+  def run_test_gpu(self, n, m, k, transpose_a, transpose_b, dtype, num_iters):
+    self.run_graph('gpu', n, m, k, transpose_a, transpose_b, num_iters, dtype)
+
+  def test_round(self, num_iters):
+    dtypes = [np.float32, np.float64]
+    for dtype in dtypes:
+      for n, m, (transpose_a, transpose_b) in itertools.product(
+          [512, 1024], [1, 8, 16, 128], [(False, False), (True, False),
+                                         (False, True)]):
+        k = n
+        self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters)
+
+      for n, m, k, (transpose_a, transpose_b) in itertools.product(
+          [200], [1, 8, 20], [10000], [(False, False), (True, False), (False,
+                                                                       True)]):
+        self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters)
+
+      for (n, m, k), (transpose_a, transpose_b) in itertools.product(
+          [(200, 20, 20000), (1, 10000, 200)], [(False, False), (True, False),
+                                                (False, True)]):
+        self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters)
+
+  def benchmark_matmul(self):
+    num_iters = 200
+    for _ in range(10):
+      self.test_round(num_iters)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/ops/matmul_benchmark_test.py b/tensorflow/python/ops/matmul_benchmark_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7914dba787aa670a079c21d2da3f242aa13f4a3
--- /dev/null
+++ b/tensorflow/python/ops/matmul_benchmark_test.py
@@ -0,0 +1,122 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for matmul_benchmark.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+import numpy as np
+
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.framework import node_def_pb2
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import matmul_benchmark
+from tensorflow.python.platform import test as googletest
+from tensorflow.python.platform import tf_logging
+
+
+def BuildGraphTest(n, m, k, transpose_a, transpose_b, dtype):
+
+  def Test(self):
+    if not googletest.is_gpu_available():
+      tf_logging.info("Skipping BuildGraphTest %s", (n, m, k, transpose_a,
+                                                     transpose_b))
+      return
+    tf_logging.info("Testing BuildGraphTest %s", (n, m, k, transpose_a,
+                                                  transpose_b))
+    self._VerifyBuildGraph(n, m, k, transpose_a, transpose_b, dtype)
+
+  return Test
+
+
+def RunGraphTest(n, m, k, transpose_a, transpose_b, dtype):
+
+  def Test(self):
+    if not googletest.is_gpu_available():
+      tf_logging.info("Skipping RunGraphTest %s", (n, m, k, transpose_a,
+                                                   transpose_b))
+      return
+    tf_logging.info("Testing RunGraphTest %s", (n, m, k, transpose_a,
+                                                transpose_b))
+    self._VerifyRunGraph(n, m, k, transpose_a, transpose_b, dtype)
+
+  return Test
+
+
+class MatmulBenchmarkTest(googletest.TestCase):
+
+  def _StripNode(self, nd):
+    snode = node_def_pb2.NodeDef(name=nd.name, op=nd.op, input=nd.input)
+    if nd.device:
+      snode.device = nd.device
+    return snode
+
+  def _StripGraph(self, gd):
+    return graph_pb2.GraphDef(node=[self._StripNode(nd) for nd in gd.node])
+
+  def _VerifyBuildGraph(self, n, m, k, transpose_a, transpose_b, dtype):
+    graph = ops.Graph()
+    with graph.as_default():
+      matmul_benchmark.build_graph("gpu", n, m, k, transpose_a, transpose_b,
+                                   dtype)
+      gd = graph.as_graph_def()
+      self.assertProtoEquals("""
+      node { name: "random_uniform/shape" op: "Const" device: "/device:GPU:0" }
+      node { name: "random_uniform/min" op: "Const" device: "/device:GPU:0" }
+      node { name: "random_uniform/max" op: "Const" device: "/device:GPU:0" }
+      node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: "/device:GPU:0" }
+      node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: "/device:GPU:0" }
+      node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: "/device:GPU:0" }
+      node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: "/device:GPU:0" }
+      node { name: "Variable" op: "VariableV2" device: "/device:GPU:0" }
+      node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: "/device:GPU:0" }
+      node { name: "Variable/read" op: "Identity" input: "Variable" device: "/device:GPU:0" }
+      node { name: "random_uniform_1/shape" op: "Const" device: "/device:GPU:0" }
+      node { name: "random_uniform_1/min" op: "Const" device: "/device:GPU:0" }
+      node { name: "random_uniform_1/max" op: "Const" device: "/device:GPU:0" }
+      node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: "/device:GPU:0" }
+      node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: "/device:GPU:0" }
+      node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: "/device:GPU:0" }
+      node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: "/device:GPU:0" }
+      node { name: "Variable_1" op: "VariableV2" device: "/device:GPU:0" }
+      node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: "/device:GPU:0" }
+      node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: "/device:GPU:0" }
+      node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: "/device:GPU:0" }
+      node { name: "group_deps" op: "NoOp" input: "^MatMul" device: "/device:GPU:0" }
+                             """, self._StripGraph(gd))
+
+  def _VerifyRunGraph(self, n, m, k, transpose_a, transpose_b, dtype):
+    benchmark_instance = matmul_benchmark.MatmulBenchmark()
+    duration = benchmark_instance.run_graph("gpu", n, m, k, transpose_a,
+                                            transpose_b, 1, dtype)
+    self.assertTrue(duration > 1e-6)
+
+
+if __name__ == "__main__":
+  dtypes = [np.float32, np.float64]
+  index = 0
+  for _dtype in dtypes:
+    for _n, _m, (_transpose_a, _transpose_b) in itertools.product(
+        [512, 1024], [1, 8, 16, 128], [(False, False), (True, False), (False,
+                                                                       True)]):
+      _k = _n
+      setattr(MatmulBenchmarkTest, "testBuildGraph_" + str(index),
+              BuildGraphTest(_n, _m, _k, _transpose_a, _transpose_b, _dtype))
+      setattr(MatmulBenchmarkTest, "testRunGraph_" + str(index),
+              RunGraphTest(_n, _m, _k, _transpose_a, _transpose_b, _dtype))
+      index += 1
+  googletest.main()
diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py
index d05cba2e930b1a9fb886756b41aac7848c01ae69..60e9695dcb9ede5cd0b3549b6692dcc5fdcbc3b4 100644
--- a/tensorflow/python/ops/nn.py
+++ b/tensorflow/python/ops/nn.py
@@ -22,6 +22,7 @@ See the @{$python/nn} guide.
 @@relu6
 @@crelu
 @@elu
+@@selu
 @@softplus
 @@softsign
 @@dropout
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 028d82aa4da50a506d80cd24b9fd8e0c7fa584e1..50673ed4276f3ccb324d598817e040471ae4e542 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -327,10 +327,21 @@ def _ReluGrad(op, grad):
 
 @ops.RegisterGradient("EluGrad")
 def _EluGradGrad(op, grad):
+  elu_x = op.inputs[1]
+  return (gen_nn_ops._elu_grad(grad, op.outputs[0]),
+          array_ops.where(elu_x < 0,
+                          grad * op.inputs[0],
+                          array_ops.zeros(shape=array_ops.shape(elu_x),
+                                          dtype=elu_x.dtype)))
+
+
+@ops.RegisterGradient("SeluGrad")
+def _SeluGradGrad(op, grad):
   x = op.inputs[1]
+  scale_alpha = 1.7580993408473768599402175208123
   return (gen_nn_ops._elu_grad(grad, op.outputs[0]),
           array_ops.where(
-              x < 0., gen_nn_ops._elu_grad(grad, op.outputs[0] + 1),
+              x < 0., gen_nn_ops._elu_grad(grad, op.outputs[0] + scale_alpha),
               array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype)))
 
 
@@ -344,6 +355,11 @@ def _EluGrad(op, grad):
   return gen_nn_ops._elu_grad(grad, op.outputs[0])
 
 
+@ops.RegisterGradient("Selu")
+def _SeluGrad(op, grad):
+  return gen_nn_ops._selu_grad(grad, op.outputs[0])
+
+
 @ops.RegisterGradient("Softplus")
 def _SoftplusGrad(op, grad):
   return gen_nn_ops._softplus_grad(grad, op.inputs[0])
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 254a8432d3f3fdb82bd7a18c6c0dddff401de573..98ede2031bc65bc5f2e90bc23a0926e7f6490a77 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -97,9 +97,11 @@ def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None):
     return result
 
 
-def sigmoid_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid-name
-                                      labels=None, logits=None,
-                                      name=None):
+def sigmoid_cross_entropy_with_logits(  # pylint: disable=invalid-name
+    _sentinel=None,
+    labels=None,
+    logits=None,
+    name=None):
   """Computes sigmoid cross entropy given `logits`.
 
   Measures the probability error in discrete classification tasks in which each
@@ -143,8 +145,8 @@ def sigmoid_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid
     ValueError: If `logits` and `labels` do not have the same shape.
   """
   # pylint: disable=protected-access
-  nn_ops._ensure_xent_args("sigmoid_cross_entropy_with_logits",
-                           _sentinel, labels, logits)
+  nn_ops._ensure_xent_args("sigmoid_cross_entropy_with_logits", _sentinel,
+                           labels, logits)
   # pylint: enable=protected-access
 
   with ops.name_scope(name, "logistic_loss", [logits, labels]) as name:
@@ -153,8 +155,8 @@ def sigmoid_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid
     try:
       labels.get_shape().merge_with(logits.get_shape())
     except ValueError:
-      raise ValueError("logits and labels must have the same shape (%s vs %s)"
-                       % (logits.get_shape(), labels.get_shape()))
+      raise ValueError("logits and labels must have the same shape (%s vs %s)" %
+                       (logits.get_shape(), labels.get_shape()))
 
     # The logistic loss formula from above is
     #   x - x * z + log(1 + exp(-x))
@@ -168,9 +170,10 @@ def sigmoid_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid
     cond = (logits >= zeros)
     relu_logits = array_ops.where(cond, logits, zeros)
     neg_abs_logits = array_ops.where(cond, -logits, logits)
-    return math_ops.add(relu_logits - logits * labels,
-                        math_ops.log1p(math_ops.exp(neg_abs_logits)),
-                        name=name)
+    return math_ops.add(
+        relu_logits - logits * labels,
+        math_ops.log1p(math_ops.exp(neg_abs_logits)),
+        name=name)
 
 
 def weighted_cross_entropy_with_logits(targets, logits, pos_weight, name=None):
@@ -226,8 +229,9 @@ def weighted_cross_entropy_with_logits(targets, logits, pos_weight, name=None):
     try:
       targets.get_shape().merge_with(logits.get_shape())
     except ValueError:
-      raise ValueError("logits and targets must have the same shape (%s vs %s)"
-                       % (logits.get_shape(), targets.get_shape()))
+      raise ValueError(
+          "logits and targets must have the same shape (%s vs %s)" %
+          (logits.get_shape(), targets.get_shape()))
 
     # The logistic loss formula from above is
     #   (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))
@@ -569,21 +573,23 @@ def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
     else:  # no shift.
       shifted_mean = math_ops.multiply(mean_ss, divisor, name="mean")
       mean = shifted_mean
-    variance = math_ops.subtract(math_ops.multiply(variance_ss, divisor),
-                                 math_ops.square(shifted_mean),
-                                 name="variance")
+    variance = math_ops.subtract(
+        math_ops.multiply(variance_ss, divisor),
+        math_ops.square(shifted_mean),
+        name="variance")
   return (mean, variance)
 
 
-def moments(x, axes, shift=None, name=None, keep_dims=False):
+def moments(x, axes,
+            shift=None,  # pylint: disable=unused-argument
+            name=None, keep_dims=False):
   """Calculate the mean and variance of `x`.
 
   The mean and variance are calculated by aggregating the contents of `x`
   across `axes`.  If `x` is 1-D and `axes = [0]` this is just the mean
   and variance of a vector.
 
-  Note: for numerical stability, when shift=None, the true mean
-  would be computed and used as shift.
+  Note: shift is currently not used, the true mean is computed and used.
 
   When using these moments for batch normalization (see
   `tf.nn.batch_normalization`):
@@ -596,35 +602,26 @@ def moments(x, axes, shift=None, name=None, keep_dims=False):
     x: A `Tensor`.
     axes: Array of ints.  Axes along which to compute mean and
       variance.
-    shift: A `Tensor` containing the value by which to shift the data for
-      numerical stability, or `None` in which case the true mean of the data is
-      used as shift. A shift close to the true mean provides the most
-      numerically stable results.
+    shift: Not used in the current implementation
     name: Name used to scope the operations that compute the moments.
     keep_dims: produce moments with the same dimensionality as the input.
 
   Returns:
     Two `Tensor` objects: `mean` and `variance`.
   """
-  with ops.name_scope(name, "moments", [x, axes, shift]):
+  with ops.name_scope(name, "moments", [x, axes]):
     # The dynamic range of fp16 is too limited to support the collection of
     # sufficient statistics. As a workaround we simply perform the operations
     # on 32-bit floats before converting the mean and variance back to fp16
     y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x
-    if shift is None:
-      # Compute true mean while keeping the dims for proper broadcasting.
-      shift = array_ops.stop_gradient(
-          math_ops.reduce_mean(y, axes, keep_dims=True))
-    else:
-      shift = math_ops.cast(shift, y.dtype)
-    shifted_mean = math_ops.reduce_mean(
-        math_ops.subtract(y, shift), axes, keep_dims=True, name="shifted_mean")
-    variance = math_ops.subtract(
-        math_ops.reduce_mean(
-            math_ops.squared_difference(y, shift), axes, keep_dims=True),
-        math_ops.square(shifted_mean),
+    # Compute true mean while keeping the dims for proper broadcasting.
+    mean = math_ops.reduce_mean(y, axes, keep_dims=True, name="mean")
+    # sample variance, not unbiased variance
+    variance = math_ops.reduce_mean(
+        math_ops.squared_difference(y, array_ops.stop_gradient(mean)),
+        axes,
+        keep_dims=True,
         name="variance")
-    mean = math_ops.add(shifted_mean, shift, name="mean")
     if not keep_dims:
       mean = array_ops.squeeze(mean, axes)
       variance = array_ops.squeeze(variance, axes)
@@ -965,16 +962,32 @@ def _compute_sampled_logits(weights,
     # sampled is a [num_sampled] int tensor
     all_ids = array_ops.concat([labels_flat, sampled], 0)
 
+    # Retrieve the true weights and the logits of the sampled weights.
+
     # weights shape is [num_classes, dim]
     all_w = embedding_ops.embedding_lookup(
         weights, all_ids, partition_strategy=partition_strategy)
+
+    # true_w shape is [batch_size * num_true, dim]
+    true_w = array_ops.slice(all_w, [0, 0],
+                             array_ops.stack(
+                                 [array_ops.shape(labels_flat)[0], -1]))
+
+    sampled_w = array_ops.slice(
+        all_w, array_ops.stack([array_ops.shape(labels_flat)[0], 0]), [-1, -1])
+    # inputs has shape [batch_size, dim]
+    # sampled_w has shape [num_sampled, dim]
+    # Apply X*W', which yields [batch_size, num_sampled]
+    sampled_logits = math_ops.matmul(inputs, sampled_w, transpose_b=True)
+
+    # Retrieve the true and sampled biases, compute the true logits, and
+    # add the biases to the true and sampled logits.
     all_b = embedding_ops.embedding_lookup(
         biases, all_ids, partition_strategy=partition_strategy)
-    # true_w shape is [batch_size * num_true, dim]
     # true_b is a [batch_size * num_true] tensor
-    true_w = array_ops.slice(
-        all_w, [0, 0], array_ops.stack([array_ops.shape(labels_flat)[0], -1]))
+    # sampled_b is a [num_sampled] float tensor
     true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat))
+    sampled_b = array_ops.slice(all_b, array_ops.shape(labels_flat), [-1])
 
     # inputs shape is [batch_size, dim]
     # true_w shape is [batch_size * num_true, dim]
@@ -991,20 +1004,7 @@ def _compute_sampled_logits(weights,
     true_logits = array_ops.reshape(_sum_rows(dots_as_matrix), [-1, num_true])
     true_b = array_ops.reshape(true_b, [-1, num_true])
     true_logits += true_b
-
-    # Lookup weights and biases for sampled labels.
-    #   sampled_w shape is [num_sampled, dim]
-    #   sampled_b is a [num_sampled] float tensor
-    sampled_w = array_ops.slice(
-        all_w, array_ops.stack([array_ops.shape(labels_flat)[0], 0]), [-1, -1])
-    sampled_b = array_ops.slice(all_b, array_ops.shape(labels_flat), [-1])
-
-    # inputs has shape [batch_size, dim]
-    # sampled_w has shape [num_sampled, dim]
-    # sampled_b has shape [num_sampled]
-    # Apply X*W'+B, which yields [batch_size, num_sampled]
-    sampled_logits = math_ops.matmul(
-        inputs, sampled_w, transpose_b=True) + sampled_b
+    sampled_logits += sampled_b
 
     if remove_accidental_hits:
       acc_hits = candidate_sampling_ops.compute_accidental_hits(
@@ -1019,8 +1019,8 @@ def _compute_sampled_logits(weights,
                                         "sparse_indices")
       # Create sampled_logits_shape = [batch_size, num_sampled]
       sampled_logits_shape = array_ops.concat(
-          [array_ops.shape(labels)[:1], array_ops.expand_dims(num_sampled, 0)],
-          0)
+          [array_ops.shape(labels)[:1],
+           array_ops.expand_dims(num_sampled, 0)], 0)
       if sampled_logits.dtype != acc_weights.dtype:
         acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype)
       sampled_logits += sparse_ops.sparse_to_dense(
@@ -1245,7 +1245,7 @@ def sampled_softmax_loss(weights,
       remove_accidental_hits=remove_accidental_hits,
       partition_strategy=partition_strategy,
       name=name)
-  sampled_losses = nn_ops.softmax_cross_entropy_with_logits(labels=labels,
-                                                            logits=logits)
+  sampled_losses = nn_ops.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
   # sampled_losses is a [batch_size] tensor.
   return sampled_losses
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index e4eaeff67ad6b90d6c9fb129b0f7ad8f732d77b0..1ce7ea179f14cd1019ef859b6ce2d68053013059 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -544,7 +544,9 @@ def convolution(input, filter,  # pylint: disable=redefined-builtin
                          x[N-1]*strides[N-1] + dilation_rate[N-1]*z[N-1],
                          q]
   ```
-  where `padded_input` is obtained by zero padding the input using an effective
+  where b is the index into the batch, k is the output channel number, q is the
+  input channel number, and z is the N-D spatial offset within the filter. Here,
+  `padded_input` is obtained by zero padding the input using an effective
   spatial filter shape of `(spatial_filter_shape-1) * dilation_rate + 1` and
   output striding `strides` as described in the
   @{tf.nn.convolution$comment here}.
@@ -2083,3 +2085,36 @@ def erosion2d(value, kernel, strides, rates, padding, name=None):
                               rates=rates,
                               padding=padding,
                               name=name))
+
+def in_top_k(predictions, targets, k, name=None):
+  r"""Says whether the targets are in the top `K` predictions.
+
+  This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+  prediction for the target class is among the top `k` predictions among
+  all predictions for example `i`. Note that the behavior of `InTopK` differs
+  from the `TopK` op in its handling of ties; if multiple classes have the
+  same prediction value and straddle the top-`k` boundary, all of those
+  classes are considered to be in the top `k`.
+
+  More formally, let
+
+    \\(predictions_i\\) be the predictions for all classes for example `i`,
+    \\(targets_i\\) be the target class for example `i`,
+    \\(out_i\\) be the output for example `i`,
+
+  $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+
+  Args:
+    predictions: A `Tensor` of type `float32`.
+      A `batch_size` x `classes` tensor.
+    targets: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+      A `batch_size` vector of class ids.
+    k: An `int`. Number of top elements to look at for computing precision.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `bool`. Computed Precision at `k` as a `bool Tensor`.
+  """
+  with ops.name_scope(name, 'in_top_k'):
+    # TODO (yongtang): Need to switch to v2 after 3 weeks.
+    return gen_nn_ops._in_top_kv2(predictions, targets, k, name=name)
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index 5cf8c3291cdea496a8fd902d19e9d7faa4a974d8..cc8c6239470c428245f158e65673d076275844a4 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -136,9 +136,8 @@ class LogPoissonLossTest(test_lib.TestCase):
           z_np, x_tf, compute_full_loss=True)
       err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf,
                                                     x_shape)
-      err_stirling = gradient_checker.compute_gradient_error(x_tf, x_shape,
-                                                             y_tf_stirling,
-                                                             x_shape)
+      err_stirling = gradient_checker.compute_gradient_error(
+          x_tf, x_shape, y_tf_stirling, x_shape)
     eps = 1e-6
     self.assertLess(err, eps)
     self.assertLess(err_stirling, eps)
@@ -404,273 +403,281 @@ class DropoutTest(test_lib.TestCase):
 class ComputeSampledLogitsTest(test_lib.TestCase):
 
   def setUp(self):
-    self._num_classes = 5
-    self._dim = 10
-    self._batch_size = 3
-    self._num_shards = 3
+    self._eps = 1e-3
+
+  def _GenerateTestData(self, num_classes, dim, batch_size, num_true, labels,
+                        sampled, subtract_log_q):
+    """Randomly generates input/output data for a single test case.
+
+    This function returns numpy constants for use in a test case.
+
+    Args:
+      num_classes: An int. The number of embedding classes in the test case.
+      dim: An int. The dimension of the embedding.
+      batch_size: An int. The batch size.
+      num_true: An int. The number of target classes per training example.
+      labels: A list of batch_size * num_true ints. The target classes.
+      sampled: A list of indices in [0, num_classes).
+      subtract_log_q: A bool corresponding to the parameter in
+          _compute_sampled_logits().
+
+    Returns:
+      weights: Embedding weights to use as test input. It is a numpy array
+          of shape [num_classes, dim]
+      biases: Embedding biases to use as test input. It is a numpy array
+          of shape [num_classes].
+      hidden_acts: Forward activations of the network to use as test input.
+          It is a numpy array of shape [batch_size, dim].
+      sampled_vals: A tuple based on `sampled` to use as test input in the
+          format returned by a *_candidate_sampler function.
+      exp_logits: The output logits expected from _compute_sampled_logits().
+          It is a numpy array of shape [batch_size, num_true + len(sampled)].
+      exp_labels: The output labels expected from _compute_sampled_logits().
+          It is a numpy array of shape [batch_size, num_true + len(sampled)].
+    """
+    weights = np.random.randn(num_classes, dim).astype(np.float32)
+    biases = np.random.randn(num_classes).astype(np.float32)
+    hidden_acts = np.random.randn(batch_size, dim).astype(np.float32)
+
+    true_exp = np.full([batch_size, 1], fill_value=0.5, dtype=np.float32)
+    sampled_exp = np.full([len(sampled)], fill_value=0.5, dtype=np.float32)
+    sampled_vals = (sampled, true_exp, sampled_exp)
 
-  def _GenerateTestInputs(self):
-    np.random.seed(0)
-    weights = np.random.randn(self._num_classes, self._dim).astype(np.float32)
-    biases = np.random.randn(self._num_classes).astype(np.float32)
-    hidden_acts = np.random.randn(self._batch_size,
-                                  self._dim).astype(np.float32)
+    sampled_w, sampled_b = weights[sampled], biases[sampled]
+    true_w, true_b = weights[labels], biases[labels]
+
+    true_logits = np.sum(
+        hidden_acts.reshape((batch_size, 1, dim)) * true_w.reshape(
+            (batch_size, num_true, dim)),
+        axis=2)
+    true_b = true_b.reshape((batch_size, num_true))
+    true_logits += true_b
+    sampled_logits = np.dot(hidden_acts, sampled_w.T) + sampled_b
+
+    if subtract_log_q:
+      true_logits -= np.log(true_exp)
+      sampled_logits -= np.log(sampled_exp[np.newaxis, :])
 
+    exp_logits = np.concatenate([true_logits, sampled_logits], axis=1)
+    exp_labels = np.hstack((np.ones_like(true_logits) / num_true,
+                            np.zeros_like(sampled_logits)))
+
+    return weights, biases, hidden_acts, sampled_vals, exp_logits, exp_labels
+
+  def _ShardTestEmbeddings(self, weights, biases, num_shards):
+    """Shards the weights and biases returned by _GenerateTestData.
+
+    Args:
+      weights: The weights returned by _GenerateTestData.
+      biases: The biases returned by _GenerateTestData.
+      num_shards: The number of shards to create.
+
+    Returns:
+      sharded_weights: A list of size `num_shards` containing all the weights.
+      sharded_biases: A list of size `num_shards` containing all the biases.
+    """
     with ops.Graph().as_default() as g:
       sharded_weights = variable_scope.get_variable(
           "w",
-          partitioner=partitioned_variables.fixed_size_partitioner(
-              self._num_shards),
+          partitioner=partitioned_variables.fixed_size_partitioner(num_shards),
           initializer=constant_op.constant(weights))
       sharded_biases = variable_scope.get_variable(
           "b",
-          partitioner=partitioned_variables.fixed_size_partitioner(
-              self._num_shards),
+          partitioner=partitioned_variables.fixed_size_partitioner(num_shards),
           initializer=constant_op.constant(biases))
       with self.test_session(graph=g) as sess:
         variables.global_variables_initializer().run()
+        return sess.run([list(sharded_weights), list(sharded_biases)])
 
-        sharded_weights_v, sharded_biases_v = sess.run(
-            [list(sharded_weights), list(sharded_biases)])
-
-    return weights, biases, hidden_acts, sharded_weights_v, sharded_biases_v
-
-  def _ComputeSampledLogitsNP(self,
-                              true_w,
-                              true_b,
-                              sampled_w,
-                              sampled_b,
-                              hidden_acts,
-                              num_true=1,
-                              true_expected=None,
-                              sampled_expected=None):
-
-    batch_size, dim = hidden_acts.shape
-    true_logits = np.sum(hidden_acts.reshape(
-        (batch_size, 1, dim)) * true_w.reshape((batch_size, num_true, dim)),
-                         axis=2)
-    true_b = true_b.reshape((batch_size, num_true))
-    true_logits += true_b
-    sampled_logits = np.dot(hidden_acts, sampled_w.T) + sampled_b
-
-    if true_expected is not None:
-      true_logits -= np.log(true_expected)
-    if sampled_expected is not None:
-      sampled_logits -= np.log(sampled_expected[np.newaxis, :])
-
-    out_logits = np.concatenate([true_logits, sampled_logits], axis=1)
-    out_labels = np.hstack((np.ones_like(true_logits) / num_true,
-                            np.zeros_like(sampled_logits)))
-
-    return out_logits, out_labels
-
-  def _ComputeSampledLogitsTF(self,
-                              weights,
-                              biases,
-                              hidden_acts,
-                              labels,
-                              num_sampled,
-                              num_classes,
-                              num_true,
-                              sampled_vals,
-                              subtract_log_q,
-                              remove_accidental_hits,
-                              name="sampled_loss_TF"):
-    # Should be called from within a `with test_session():` block
-    if isinstance(weights, list):
-      weights_tf = [constant_op.constant(shard) for shard in weights]
-    else:
-      weights_tf = constant_op.constant(weights)
-    if isinstance(biases, list):
-      biases_tf = [constant_op.constant(shard) for shard in biases]
-    else:
-      biases_tf = constant_op.constant(biases)
-    hidden_acts_tf = constant_op.constant(
-        hidden_acts, shape=(self._batch_size, self._dim))
-    labels_tf = constant_op.constant(
-        labels, dtype=dtypes.int64, shape=(self._batch_size, num_true))
-
-    pred_logits_tf, pred_labels_tf = _compute_sampled_logits(
-        weights_tf,
-        biases_tf,
-        labels_tf,
-        hidden_acts_tf,
-        num_sampled,
-        num_classes,
-        num_true,
-        sampled_vals,
-        subtract_log_q=subtract_log_q,
-        remove_accidental_hits=remove_accidental_hits,
-        name=name,
-        partition_strategy="div")
-    return pred_logits_tf, pred_labels_tf
-
-  def testComputeSampledLogitsShapes(self):
-    # We just check that the shapes of the returned values are correct.
-    weights, biases, hidden_acts, _, _ = self._GenerateTestInputs()
-    sampled = [1, 0, 2, 3]
-    num_sampled = len(sampled)
-    true_exp = sampled_exp = [1., 1., 1., 1.]
-    test_sampled_vals = (sampled, true_exp, sampled_exp)
-    sampled_w, sampled_b = weights[sampled], biases[sampled]
-
+  def testShapes(self):
+    np.random.seed(0)
+    num_classes = 5
+    batch_size = 3
     with self.test_session() as sess:
-      for num_true_test in range(1, 5):
+      for num_true in range(1, 5):
         labels = np.random.randint(
-            low=0,
-            high=self._num_classes,
-            size=self._batch_size * num_true_test)
-        true_w, true_b = weights[labels], biases[labels]
-
-        logits_np, labels_np = self._ComputeSampledLogitsNP(
-            true_w,
-            true_b,
-            sampled_w,
-            sampled_b,
-            hidden_acts,
-            num_true=num_true_test)
-
-        logits_tf, labels_tf = self._ComputeSampledLogitsTF(
-            weights,
-            biases,
-            hidden_acts,
-            labels,
-            num_sampled,
-            self._num_classes,
-            num_true=num_true_test,
-            sampled_vals=test_sampled_vals,
-            remove_accidental_hits=True,
-            subtract_log_q=False)
-
-      logits_tf_val, labels_tf_val = sess.run([logits_tf, labels_tf])
-      self.assertEqual(logits_np.shape, logits_tf_val.shape)
-      self.assertEqual(labels_np.shape, labels_tf_val.shape)
-
-  def testComputeSampledLogitsValues(self):
-    # Here we check the actual numerics.
-    weights, biases, hidden_acts, sharded_weights, sharded_biases = (
-        self._GenerateTestInputs())
-    eps = 1e-3
-    sampled = [1, 0, 2, 3]
-    num_sampled = len(sampled)
-    true_exp = np.empty([self._batch_size, 1], dtype=np.float32)
-    true_exp.fill(0.5)
-    sampled_exp = np.empty([num_sampled], dtype=np.float32)
-    sampled_exp.fill(0.5)
-    sampled_w, sampled_b = weights[sampled], biases[sampled]
-    test_sampled_vals = (sampled, true_exp, sampled_exp)
-
+            low=0, high=num_classes, size=batch_size * num_true)
+        (weights, biases, hidden_acts, sampled_vals, exp_logits,
+         exp_labels) = self._GenerateTestData(
+             num_classes=num_classes,
+             dim=10,
+             batch_size=batch_size,
+             num_true=num_true,
+             labels=labels,
+             sampled=[1, 0, 2, 3],
+             subtract_log_q=False)
+        logits_tensor, labels_tensor = _compute_sampled_logits(
+            weights=constant_op.constant(weights),
+            biases=constant_op.constant(biases),
+            labels=constant_op.constant(
+                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+            inputs=constant_op.constant(hidden_acts),
+            num_sampled=4,
+            num_classes=num_classes,
+            num_true=num_true,
+            sampled_values=sampled_vals,
+            subtract_log_q=False,
+            remove_accidental_hits=False,
+            partition_strategy="div",
+            name="sampled_logits_basic_num_true_%d" % num_true)
+        got_logits, got_labels = sess.run([logits_tensor, labels_tensor])
+        self.assertEqual(exp_logits.shape, got_logits.shape, self._eps)
+        self.assertEqual(exp_labels.shape, got_labels.shape, self._eps)
+
+  def testBasic(self):
+    """Without accidental hit removal or subtract_log_q."""
+    np.random.seed(0)
+    num_classes = 5
+    batch_size = 3
     with self.test_session() as sess:
-      for num_true_test in range(1, 5):
-        # Generate test data for this run
+      for num_true in range(1, 5):
         labels = np.random.randint(
-            low=0,
-            high=self._num_classes,
-            size=self._batch_size * num_true_test)
-        true_w, true_b = weights[labels], biases[labels]
-
-        # Test 1: Without accidental hit removal or subtract_log_q
-        logits_np, labels_np = self._ComputeSampledLogitsNP(
-            true_w,
-            true_b,
-            sampled_w,
-            sampled_b,
-            hidden_acts,
-            num_true=num_true_test)
-        logits_tf, labels_tf = self._ComputeSampledLogitsTF(
-            weights,
-            biases,
-            hidden_acts,
-            labels,
-            num_sampled,
-            self._num_classes,
-            num_true=num_true_test,
-            sampled_vals=test_sampled_vals,
+            low=0, high=num_classes, size=batch_size * num_true)
+        (weights, biases, hidden_acts, sampled_vals, exp_logits,
+         exp_labels) = self._GenerateTestData(
+             num_classes=num_classes,
+             dim=10,
+             batch_size=batch_size,
+             num_true=num_true,
+             labels=labels,
+             sampled=[1, 0, 2, 3],
+             subtract_log_q=False)
+        logits_tensor, labels_tensor = _compute_sampled_logits(
+            weights=constant_op.constant(weights),
+            biases=constant_op.constant(biases),
+            labels=constant_op.constant(
+                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+            inputs=constant_op.constant(hidden_acts),
+            num_sampled=4,
+            num_classes=num_classes,
+            num_true=num_true,
+            sampled_values=sampled_vals,
             subtract_log_q=False,
             remove_accidental_hits=False,
-            name="sampled_loss_test1_num_true%d" % num_true_test)
-
-        logits_tf_val, labels_tf_val = sess.run([logits_tf, labels_tf])
-        self.assertAllClose(logits_np, logits_tf_val, eps)
-        self.assertAllClose(labels_np, labels_tf_val, eps)
-
-        # Test 2: With accidental hit removal, no subtract_log_q
-        logits_tf, labels_tf = self._ComputeSampledLogitsTF(
-            weights,
-            biases,
-            hidden_acts,
-            labels,
-            num_sampled,
-            self._num_classes,
-            num_true=num_true_test,
-            sampled_vals=test_sampled_vals,
+            partition_strategy="div",
+            name="sampled_logits_basic_num_true_%d" % num_true)
+        got_logits, got_labels = sess.run([logits_tensor, labels_tensor])
+        self.assertAllClose(exp_logits, got_logits, self._eps)
+        self.assertAllClose(exp_labels, got_labels, self._eps)
+
+  def testAccidentalHitRemoval(self):
+    """With accidental hit removal, no subtract_log_q."""
+    np.random.seed(0)
+    num_classes = 5
+    batch_size = 3
+    sampled = [1, 0, 2, 3]
+    with self.test_session():
+      for num_true in range(1, 5):
+        labels = np.random.randint(
+            low=0, high=num_classes, size=batch_size * num_true)
+        (weights, biases, hidden_acts, sampled_vals, _,
+         _) = self._GenerateTestData(
+             num_classes=num_classes,
+             dim=10,
+             batch_size=batch_size,
+             num_true=num_true,
+             labels=labels,
+             sampled=sampled,
+             subtract_log_q=False)
+        logits_tensor, _ = _compute_sampled_logits(
+            weights=constant_op.constant(weights),
+            biases=constant_op.constant(biases),
+            labels=constant_op.constant(
+                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+            inputs=constant_op.constant(hidden_acts),
+            num_sampled=len(sampled),
+            num_classes=num_classes,
+            num_true=num_true,
+            sampled_values=sampled_vals,
             subtract_log_q=False,
             remove_accidental_hits=True,
-            name="sampled_loss_test2_num_true%d" % num_true_test)
-
+            partition_strategy="div",
+            name="sampled_logits_accidental_hit_removal_num_true_%d" % num_true)
         # Test that the exponentiated logits of accidental hits are near 0.
         # First we need to find the hits in this random test run:
-        labels_reshape = labels.reshape((self._batch_size, num_true_test))
-        logits_tf_np = logits_tf.eval()
-        for row in xrange(self._batch_size):
+        labels_reshape = labels.reshape((batch_size, num_true))
+        got_logits = logits_tensor.eval()
+        for row in xrange(batch_size):
           row_labels = labels_reshape[row, :]
-          for col in xrange(num_sampled):
+          for col in xrange(len(sampled)):
             if sampled[col] in row_labels:
               # We need to add the num_true_test offset into logits_*
               self.assertNear(
-                  np.exp(logits_tf_np[row, col + num_true_test]), 0., eps)
-
-        # Test 3: With subtract_log_q, no accidental hit removal
-        logits_np, labels_np = self._ComputeSampledLogitsNP(
-            true_w,
-            true_b,
-            sampled_w,
-            sampled_b,
-            hidden_acts,
-            num_true=num_true_test,
-            true_expected=true_exp,
-            sampled_expected=sampled_exp)
-        logits_tf, labels_tf = self._ComputeSampledLogitsTF(
-            weights,
-            biases,
-            hidden_acts,
-            labels,
-            num_sampled,
-            self._num_classes,
-            num_true=num_true_test,
-            sampled_vals=test_sampled_vals,
+                  np.exp(got_logits[row, col + num_true]), 0., self._eps)
+
+  def testSubtractLogQ(self):
+    """With subtract_log_q, no accidental hit removal."""
+    np.random.seed(0)
+    num_classes = 5
+    batch_size = 3
+    with self.test_session() as sess:
+      for num_true in range(1, 5):
+        labels = np.random.randint(
+            low=0, high=num_classes, size=batch_size * num_true)
+        (weights, biases, hidden_acts, sampled_vals, exp_logits,
+         exp_labels) = self._GenerateTestData(
+             num_classes=num_classes,
+             dim=10,
+             batch_size=batch_size,
+             num_true=num_true,
+             labels=labels,
+             sampled=[1, 0, 2, 3],
+             subtract_log_q=True)
+        logits_tensor, labels_tensor = _compute_sampled_logits(
+            weights=constant_op.constant(weights),
+            biases=constant_op.constant(biases),
+            labels=constant_op.constant(
+                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+            inputs=constant_op.constant(hidden_acts),
+            num_sampled=4,
+            num_classes=num_classes,
+            num_true=num_true,
+            sampled_values=sampled_vals,
             subtract_log_q=True,
             remove_accidental_hits=False,
-            name="sampled_loss_test3_num_true%d" % num_true_test)
-
-        logits_tf_val, labels_tf_val = sess.run([logits_tf, labels_tf])
-        self.assertAllClose(logits_np, logits_tf_val, eps)
-        self.assertAllClose(labels_np, labels_tf_val, eps)
-
-        # Test 4: Test 1, with sharded weights and sharded biases.
-        logits_np, labels_np = self._ComputeSampledLogitsNP(
-            true_w,
-            true_b,
-            sampled_w,
-            sampled_b,
-            hidden_acts,
-            num_true=num_true_test)
-        logits_tf, labels_tf = self._ComputeSampledLogitsTF(
-            sharded_weights,
-            sharded_biases,
-            hidden_acts,
-            labels,
-            num_sampled,
-            self._num_classes,
-            num_true=num_true_test,
-            sampled_vals=test_sampled_vals,
+            partition_strategy="div",
+            name="sampled_logits_subtract_log_q_num_true_%d" % num_true)
+        got_logits, got_labels = sess.run([logits_tensor, labels_tensor])
+        self.assertAllClose(exp_logits, got_logits, self._eps)
+        self.assertAllClose(exp_labels, got_labels, self._eps)
+
+  def testSharded(self):
+    """With sharded weights and sharded biases."""
+    np.random.seed(0)
+    num_classes = 5
+    batch_size = 3
+    with self.test_session() as sess:
+      for num_true in range(1, 5):
+        labels = np.random.randint(
+            low=0, high=num_classes, size=batch_size * num_true)
+        (weights, biases, hidden_acts, sampled_vals, exp_logits,
+         exp_labels) = self._GenerateTestData(
+             num_classes=num_classes,
+             dim=10,
+             batch_size=batch_size,
+             num_true=num_true,
+             labels=labels,
+             sampled=[1, 0, 2, 3],
+             subtract_log_q=False)
+        weight_shards, bias_shards = self._ShardTestEmbeddings(
+            weights, biases, num_shards=3)
+        logits_tensor, labels_tensor = _compute_sampled_logits(
+            weights=[constant_op.constant(shard) for shard in weight_shards],
+            biases=[constant_op.constant(shard) for shard in bias_shards],
+            labels=constant_op.constant(
+                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+            inputs=constant_op.constant(hidden_acts),
+            num_sampled=4,
+            num_classes=num_classes,
+            num_true=num_true,
+            sampled_values=sampled_vals,
             subtract_log_q=False,
             remove_accidental_hits=False,
-            name="sampled_loss_test1_num_true%d" % num_true_test)
-
-        logits_tf_val, labels_tf_val = sess.run([logits_tf, labels_tf])
-        self.assertAllClose(logits_np, logits_tf_val, eps)
-        self.assertAllClose(labels_np, labels_tf_val, eps)
+            partition_strategy="div",
+            name="sampled_logits_sharded_num_true_%d" % num_true)
+        got_logits, got_labels = sess.run([logits_tensor, labels_tensor])
+        self.assertAllClose(exp_logits, got_logits, self._eps)
+        self.assertAllClose(exp_labels, got_labels, self._eps)
 
   def testNCELoss(self):
     # A simple test to verify the numerics.
@@ -683,62 +690,51 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
       pred = np.minimum(np.maximum(pred, eps), 1 - eps)
       return -targets * np.log(pred) - (1. - targets) * np.log(1. - pred)
 
-    weights, biases, hidden_acts, sharded_weights, sharded_biases = (
-        self._GenerateTestInputs())
+    np.random.seed(0)
+    num_classes = 5
+    batch_size = 3
     labels = [0, 1, 2]
-    true_w, true_b = weights[labels], biases[labels]
-    sampled = [1, 0, 2, 3]
-    num_sampled = len(sampled)
-    true_exp = np.empty([self._batch_size, 1], dtype=np.float32)
-    true_exp.fill(0.5)
-    sampled_exp = np.empty([num_sampled], dtype=np.float32)
-    sampled_exp.fill(0.5)
-    sampled_w, sampled_b = weights[sampled], biases[sampled]
-    test_sampled_vals = (sampled, true_exp, sampled_exp)
+    (weights, biases, hidden_acts, sampled_vals, exp_logits,
+     exp_labels) = self._GenerateTestData(
+         num_classes=num_classes,
+         dim=10,
+         batch_size=batch_size,
+         num_true=1,
+         labels=labels,
+         sampled=[1, 0, 2, 3],
+         subtract_log_q=True)
+    exp_nce_loss = np.sum(
+        _SigmoidCrossEntropyWithLogits(exp_logits, exp_labels), 1)
 
     with self.test_session():
-      logits_np, labels_np = self._ComputeSampledLogitsNP(
-          true_w,
-          true_b,
-          sampled_w,
-          sampled_b,
-          hidden_acts,
-          true_expected=true_exp,
-          sampled_expected=sampled_exp)
-      nce_loss_np = np.sum(
-          _SigmoidCrossEntropyWithLogits(logits_np, labels_np), 1)
-
-      labels_tf = constant_op.constant(labels, shape=(self._batch_size, 1))
-      weights_tf = constant_op.constant(weights)
-      biases_tf = constant_op.constant(biases)
-      inputs_tf = constant_op.constant(hidden_acts)
-
-      nce_loss_tf = nn_impl.nce_loss(
-          weights_tf,
-          biases_tf,
-          labels_tf,
-          inputs_tf,
-          num_sampled=num_sampled,
-          num_classes=self._num_classes,
+      got_nce_loss = nn_impl.nce_loss(
+          weights=constant_op.constant(weights),
+          biases=constant_op.constant(biases),
+          labels=constant_op.constant(labels, shape=(batch_size, 1)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=4,
+          num_classes=num_classes,
           num_true=1,
-          sampled_values=test_sampled_vals,
+          sampled_values=sampled_vals,
           partition_strategy="div")
 
-      self.assertAllClose(nce_loss_np, nce_loss_tf.eval(), 1e-4)
+      self.assertAllClose(exp_nce_loss, got_nce_loss.eval(), 1e-4)
 
       # Test with sharded weights and sharded biases.
-      nce_loss_tf = nn_impl.nce_loss(
-          sharded_weights,
-          sharded_biases,
-          labels_tf,
-          inputs_tf,
-          num_sampled=num_sampled,
-          num_classes=self._num_classes,
+      weight_shards, bias_shards = self._ShardTestEmbeddings(
+          weights, biases, num_shards=3)
+      got_nce_loss = nn_impl.nce_loss(
+          weights=[constant_op.constant(shard) for shard in weight_shards],
+          biases=[constant_op.constant(shard) for shard in bias_shards],
+          labels=constant_op.constant(labels, shape=(batch_size, 1)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=4,
+          num_classes=num_classes,
           num_true=1,
-          sampled_values=test_sampled_vals,
+          sampled_values=sampled_vals,
           partition_strategy="div")
 
-      self.assertAllClose(nce_loss_np, nce_loss_tf.eval(), 1e-4)
+      self.assertAllClose(exp_nce_loss, got_nce_loss.eval(), 1e-4)
 
   def testSampledSoftmaxLoss(self):
     # A simple test to verify the numerics.
@@ -751,64 +747,55 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
       pred = stable_exp_logits / np.sum(stable_exp_logits, 1, keepdims=True)
       return -np.sum(targets * np.log(pred + 1.0e-20), axis=1)
 
-    weights, biases, hidden_acts, sharded_weights, sharded_biases = (
-        self._GenerateTestInputs())
+    np.random.seed(0)
+    num_classes = 5
+    batch_size = 3
     labels = [0, 1, 2]
-    true_w, true_b = weights[labels], biases[labels]
-    sampled = [1, 0, 2, 3]
-    num_sampled = len(sampled)
-    true_exp = np.full([self._batch_size, 1], fill_value=0.5, dtype=np.float32)
-    sampled_exp = np.full([num_sampled], fill_value=0.5, dtype=np.float32)
-    sampled_w, sampled_b = weights[sampled], biases[sampled]
-    test_sampled_vals = (sampled, true_exp, sampled_exp)
+    (weights, biases, hidden_acts, sampled_vals, exp_logits,
+     exp_labels) = self._GenerateTestData(
+         num_classes=num_classes,
+         dim=10,
+         batch_size=batch_size,
+         num_true=1,
+         labels=labels,
+         sampled=[1, 0, 2, 3],
+         subtract_log_q=True)
+    exp_sampled_softmax_loss = _SoftmaxCrossEntropyWithLogits(
+        exp_logits, exp_labels)
 
     with self.test_session():
-      logits_np, labels_np = self._ComputeSampledLogitsNP(
-          true_w,
-          true_b,
-          sampled_w,
-          sampled_b,
-          hidden_acts,
-          true_expected=true_exp,
-          sampled_expected=sampled_exp)
-      sampled_softmax_loss_np = _SoftmaxCrossEntropyWithLogits(logits_np,
-                                                               labels_np)
-
-      labels_tf = constant_op.constant(labels, shape=(self._batch_size, 1))
-      weights_tf = constant_op.constant(weights)
-      biases_tf = constant_op.constant(biases)
-      inputs_tf = constant_op.constant(hidden_acts)
-
-      sampled_softmax_loss_tf = nn_impl.sampled_softmax_loss(
-          weights=weights_tf,
-          biases=biases_tf,
-          labels=labels_tf,
-          inputs=inputs_tf,
-          num_sampled=num_sampled,
-          num_classes=self._num_classes,
+      got_sampled_softmax_loss = nn_impl.sampled_softmax_loss(
+          weights=constant_op.constant(weights),
+          biases=constant_op.constant(biases),
+          labels=constant_op.constant(labels, shape=(batch_size, 1)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=4,
+          num_classes=num_classes,
           num_true=1,
-          sampled_values=test_sampled_vals,
+          sampled_values=sampled_vals,
           remove_accidental_hits=False,
           partition_strategy="div")
 
-      self.assertAllClose(sampled_softmax_loss_np,
-                          sampled_softmax_loss_tf.eval(), 1e-4)
+      self.assertAllClose(exp_sampled_softmax_loss,
+                          got_sampled_softmax_loss.eval(), 1e-4)
 
       # Test with sharded weights and sharded biases.
-      sampled_softmax_loss_tf = nn_impl.sampled_softmax_loss(
-          weights=sharded_weights,
-          biases=sharded_biases,
-          labels=labels_tf,
-          inputs=inputs_tf,
-          num_sampled=num_sampled,
-          num_classes=self._num_classes,
+      weight_shards, bias_shards = self._ShardTestEmbeddings(
+          weights, biases, num_shards=3)
+      got_sampled_softmax_loss = nn_impl.sampled_softmax_loss(
+          weights=[constant_op.constant(shard) for shard in weight_shards],
+          biases=[constant_op.constant(shard) for shard in bias_shards],
+          labels=constant_op.constant(labels, shape=(batch_size, 1)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=4,
+          num_classes=num_classes,
           num_true=1,
-          sampled_values=test_sampled_vals,
+          sampled_values=sampled_vals,
           remove_accidental_hits=False,
           partition_strategy="div")
 
-      self.assertAllClose(sampled_softmax_loss_np,
-                          sampled_softmax_loss_tf.eval(), 1e-4)
+      self.assertAllClose(exp_sampled_softmax_loss,
+                          got_sampled_softmax_loss.eval(), 1e-4)
 
 
 class CReluTest(test_lib.TestCase):
@@ -843,24 +830,32 @@ class ReluTest(test_lib.TestCase):
 
 class MomentsTest(test_lib.TestCase):
 
-  def doOutputTest(self, input_shape, moments_axes, tol=1e-4):
+  def doOutputTest(self, input_shape, moments_axes, tol=1e-4,
+                   check_gradients=False):
     for mu in [0.0, 1.0, 1e3]:
       for sigma in [1.0, 0.1]:
         for keep_dims in [True, False]:
           input_values = np.random.rand(*input_shape) * sigma + mu
-          expected_mean = np.mean(input_values, axis=moments_axes,
-                                  keepdims=keep_dims)
-          expected_var = np.var(input_values, axis=moments_axes,
-                                keepdims=keep_dims)
+          expected_mean = np.mean(
+              input_values, axis=moments_axes, keepdims=keep_dims)
+          expected_var = np.var(
+              input_values, axis=moments_axes, keepdims=keep_dims)
           with ops.Graph().as_default() as g:
             with self.test_session(graph=g) as sess:
-              inputs = constant_op.constant(input_values,
-                                            shape=input_shape,
-                                            dtype=dtypes.float32)
-              mean, variance = nn_impl.moments(inputs,
-                                               moments_axes,
-                                               keep_dims=keep_dims)
-
+              inputs = constant_op.constant(
+                  input_values, shape=input_shape, dtype=dtypes.float32)
+              mean, variance = nn_impl.moments(
+                  inputs, moments_axes, keep_dims=keep_dims)
+
+              if check_gradients:
+                err = gradient_checker.compute_gradient_error(
+                    inputs, input_shape, mean, mean.shape.as_list())
+                self.assertLess(err, 1e-3)
+                err = gradient_checker.compute_gradient_error(
+                    inputs, input_shape, variance, variance.shape.as_list())
+                self.assertLess(err, 1e-3)
+
+              # Evaluate.
               [mean, variance] = sess.run([mean, variance])
               # Make sure that there are no NaNs
               self.assertFalse(np.isnan(mean).any())
@@ -868,6 +863,12 @@ class MomentsTest(test_lib.TestCase):
               self.assertAllClose(mean, expected_mean, rtol=tol, atol=tol)
               self.assertAllClose(variance, expected_var, rtol=tol, atol=tol)
 
+  def testOutputAndGradient2DInput0(self):
+    self.doOutputTest((10, 10), (0,), check_gradients=True)
+
+  def testOutputAndGradient2DInput01(self):
+    self.doOutputTest((10, 10), (0, 1), check_gradients=True)
+
   def testOutput2DInput0(self):
     self.doOutputTest((10, 300), (0,))
 
@@ -892,27 +893,6 @@ class MomentsTest(test_lib.TestCase):
   def testOutput4DInput123(self):
     self.doOutputTest((10, 10, 10, 30), (1, 2, 3))
 
-  def testUnstableOutputShiftNone(self):
-    input_shape = (10, 300)
-    moments_axes = (0, 1)
-    mu, sigma = 1e3, 0.1
-    tol = 1e-3
-    input_values = np.random.rand(*input_shape) * sigma + mu
-    expected_mean = np.mean(input_values, axis=moments_axes)
-    expected_var = np.var(input_values, axis=moments_axes)
-
-    with self.test_session() as sess:
-      inputs = constant_op.constant(input_values, shape=input_shape,
-                                    dtype=dtypes.float32)
-      mean, variance = nn_impl.moments(inputs, moments_axes, shift=0.0)
-
-      [mean, variance] = sess.run([mean, variance])
-      # Make sure that there are no NaNs
-      self.assertFalse(np.isnan(mean).any())
-      self.assertFalse(np.isnan(variance).any())
-      self.assertAllClose(mean, expected_mean, rtol=tol, atol=tol)
-      # The variance is unstable
-      self.assertGreater(np.abs(variance - expected_var), 0.1)
 
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index c2f9961731630173127ed5367789a31550ce1c0d..0071e7d868f67ad194b6b2d4849e117886d31642 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -120,8 +120,10 @@ class SparseFeature(
       `value_key` are already sorted by their index position. If so skip
       sorting. False by default (optional).
   """
-  pass
-SparseFeature.__new__.__defaults__ = (False,)
+
+  def __new__(cls, index_key, value_key, dtype, size, already_sorted=False):
+    return super(SparseFeature, cls).__new__(
+        cls, index_key, value_key, dtype, size, already_sorted)
 
 
 class FixedLenFeature(collections.namedtuple(
@@ -137,8 +139,10 @@ class FixedLenFeature(collections.namedtuple(
     default_value: Value to be used if an example is missing this feature. It
         must be compatible with `dtype` and of the specified `shape`.
   """
-  pass
-FixedLenFeature.__new__.__defaults__ = (None,)
+
+  def __new__(cls, shape, dtype, default_value=None):
+    return super(FixedLenFeature, cls).__new__(
+        cls, shape, dtype, default_value)
 
 
 class FixedLenSequenceFeature(collections.namedtuple(
@@ -168,8 +172,10 @@ class FixedLenSequenceFeature(collections.namedtuple(
       `SequenceExample`. Defaults to "" for dtype string and 0 otherwise
       (optional).
   """
-  pass
-FixedLenSequenceFeature.__new__.__defaults__ = (False, None)
+
+  def __new__(cls, shape, dtype, allow_missing=False, default_value=None):
+    return super(FixedLenSequenceFeature, cls).__new__(
+        cls, shape, dtype, allow_missing, default_value)
 
 
 def _features_to_raw_params(features, types):
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py
index 15613289a0b0a96b4631c60afa9d196da408480d..1e0bb925d415f6ae6bacb1496f87fa6b84ca13c2 100644
--- a/tensorflow/python/ops/random_ops.py
+++ b/tensorflow/python/ops/random_ops.py
@@ -206,7 +206,8 @@ def random_uniform(shape,
     maxval: A 0-D Tensor or Python value of type `dtype`. The upper bound on
       the range of random values to generate.  Defaults to 1 if `dtype` is
       floating point.
-    dtype: The type of the output: `float32`, `float64`, `int32`, or `int64`.
+    dtype: The type of the output: 'float16`, `float32`, `float64`, `int32`,
+      or `int64`.
     seed: A Python integer. Used to create a random seed for the distribution.
       See @{tf.set_random_seed}
       for behavior.
@@ -219,6 +220,9 @@ def random_uniform(shape,
     ValueError: If `dtype` is integral and `maxval` is not specified.
   """
   dtype = dtypes.as_dtype(dtype)
+  if dtype not in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
+                   dtypes.int64):
+    raise ValueError("Invalid dtype %r" % dtype)
   if maxval is None:
     if dtype.is_integer:
       raise ValueError("Must specify maxval for integer dtype %r" % dtype)
@@ -325,7 +329,7 @@ def multinomial(logits, num_samples, seed=None, name=None):
 
   Args:
     logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice
-      `[i, :]` represents the log-odds for all classes.
+      `[i, :]` represents the unnormalized log-probabilities for all classes.
     num_samples: 0-D.  Number of independent samples to draw for each row slice.
     seed: A Python integer. Used to create a random seed for the distribution.
       See
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 465b73bda5eba7aad22b530a14ccaaac18fe1505..38c4dacef035f850245ca3ad167ea1da6a165e53 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -204,7 +204,9 @@ class ResourceVariable(variables.Variable):
         if initial_value is not None:
           with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
             self._initializer_op = gen_resource_variable_ops.assign_variable_op(
-                self._handle, self._initial_value, name=n)
+                self._handle,
+                self._build_initializer_expr(self._initial_value),
+                name=n)
         with ops.name_scope("Read"), ops.colocate_with(self._handle):
           # Manually assign reads to the handle's device to avoid log messages.
           with ops.device(self._handle.device):
diff --git a/tensorflow/python/ops/resources.py b/tensorflow/python/ops/resources.py
index 57ba0084e846a612ba3deedb600f53c123545571..db6740643cffd9ca852d75653c837a39a1731d42 100644
--- a/tensorflow/python/ops/resources.py
+++ b/tensorflow/python/ops/resources.py
@@ -85,18 +85,20 @@ def report_uninitialized_resources(resource_list=None,
   if resource_list is None:
     resource_list = shared_resources() + local_resources()
   with ops.name_scope(name):
-    if not resource_list:
-      # Return an empty tensor so we only need to check for returned tensor
-      # size being 0 as an indication of model ready.
-      return array_ops.constant([], dtype=dtypes.string)
-    # Get a 1-D boolean tensor listing whether each resource is initialized.
-    variables_mask = math_ops.logical_not(
-        array_ops.stack([r.is_initialized for r in resource_list]))
-    # Get a 1-D string tensor containing all the resource names.
-    variable_names_tensor = array_ops.constant(
-        [s.handle.name for s in resource_list])
-    # Return a 1-D tensor containing all the names of uninitialized resources.
-    return array_ops.boolean_mask(variable_names_tensor, variables_mask)
+    # Run all operations on CPU
+    with ops.device("/cpu:0"):
+      if not resource_list:
+        # Return an empty tensor so we only need to check for returned tensor
+        # size being 0 as an indication of model ready.
+        return array_ops.constant([], dtype=dtypes.string)
+      # Get a 1-D boolean tensor listing whether each resource is initialized.
+      variables_mask = math_ops.logical_not(
+          array_ops.stack([r.is_initialized for r in resource_list]))
+      # Get a 1-D string tensor containing all the resource names.
+      variable_names_tensor = array_ops.constant(
+          [s.handle.name for s in resource_list])
+      # Return a 1-D tensor containing all the names of uninitialized resources.
+      return array_ops.boolean_mask(variable_names_tensor, variables_mask)
 
 
 @tf_should_use.should_use_result
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index cc6528d1f59f01e23e91756e2f0bd3f18ce1a45d..304b6ae665f3674e7a93d568cea90afc4ce21375 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -345,6 +345,8 @@ class BasicLSTMCell(RNNCell):
     Args:
       num_units: int, The number of units in the LSTM cell.
       forget_bias: float, The bias added to forget gates (see above).
+        Must set to `0.0` manually when restoring from CudnnLSTM-trained
+        checkpoints.
       state_is_tuple: If True, accepted and returned states are 2-tuples of
         the `c_state` and `m_state`.  If False, they are concatenated
         along the column axis.  The latter behavior will soon be deprecated.
@@ -352,6 +354,9 @@ class BasicLSTMCell(RNNCell):
       reuse: (optional) Python boolean describing whether to reuse variables
         in an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
+
+      When restoring from CudnnLSTM-trained checkpoints, must use
+      CudnnCompatibleLSTMCell instead.
     """
     super(BasicLSTMCell, self).__init__(_reuse=reuse)
     if not state_is_tuple:
@@ -372,7 +377,20 @@ class BasicLSTMCell(RNNCell):
     return self._num_units
 
   def call(self, inputs, state):
-    """Long short-term memory cell (LSTM)."""
+    """Long short-term memory cell (LSTM).
+
+    Args:
+      inputs: `2-D` tensor with shape `[batch_size x input_size]`.
+      state: An `LSTMStateTuple` of state tensors, each shaped
+        `[batch_size x self.state_size]`, if `state_is_tuple` has been set to
+        `True`.  Otherwise, a `Tensor` shaped
+        `[batch_size x 2 * self.state_size]`.
+
+    Returns:
+      A pair containing the new hidden state, and the new state (either a
+        `LSTMStateTuple` or a concatenated state, depending on
+        `state_is_tuple`).
+    """
     sigmoid = math_ops.sigmoid
     # Parameters of gates are concatenated into one multiply for efficiency.
     if self._state_is_tuple:
@@ -401,7 +419,7 @@ class LSTMCell(RNNCell):
 
   The default non-peephole implementation is based on:
 
-    http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
+    http://www.bioinf.jku.at/publications/older/2604.pdf
 
   S. Hochreiter and J. Schmidhuber.
   "Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997.
@@ -444,7 +462,8 @@ class LSTMCell(RNNCell):
         Use a variable_scope partitioner instead.
       forget_bias: Biases of the forget gate are initialized by default to 1
         in order to reduce the scale of forgetting at the beginning of
-        the training.
+        the training. Must set it manually to `0.0` when restoring from
+        CudnnLSTM trained checkpoints.
       state_is_tuple: If True, accepted and returned states are 2-tuples of
         the `c_state` and `m_state`.  If False, they are concatenated
         along the column axis.  This latter behavior will soon be deprecated.
@@ -452,6 +471,9 @@ class LSTMCell(RNNCell):
       reuse: (optional) Python boolean describing whether to reuse variables
         in an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
+
+      When restoring from CudnnLSTM-trained checkpoints, must use
+      CudnnCompatibleLSTMCell instead.
     """
     super(LSTMCell, self).__init__(_reuse=reuse)
     if not state_is_tuple:
@@ -764,13 +786,18 @@ class DropoutWrapper(RNNCell):
 class ResidualWrapper(RNNCell):
   """RNNCell wrapper that ensures cell inputs are added to the outputs."""
 
-  def __init__(self, cell):
+  def __init__(self, cell, residual_fn=None):
     """Constructs a `ResidualWrapper` for `cell`.
 
     Args:
       cell: An instance of `RNNCell`.
+      residual_fn: (Optional) The function to map raw cell inputs and raw cell
+        outputs to the actual cell outputs of the residual network.
+        Defaults to calling nest.map_structure on (lambda i, o: i + o), inputs
+        and outputs.
     """
     self._cell = cell
+    self._residual_fn = residual_fn
 
   @property
   def state_size(self):
@@ -785,7 +812,7 @@ class ResidualWrapper(RNNCell):
       return self._cell.zero_state(batch_size, dtype)
 
   def __call__(self, inputs, state, scope=None):
-    """Run the cell and add its inputs to its outputs.
+    """Run the cell and then apply the residual_fn on its inputs to its outputs.
 
     Args:
       inputs: cell inputs.
@@ -800,13 +827,14 @@ class ResidualWrapper(RNNCell):
       ValueError: If cell inputs and outputs have different structure (value).
     """
     outputs, new_state = self._cell(inputs, state, scope=scope)
-    nest.assert_same_structure(inputs, outputs)
     # Ensure shapes match
     def assert_shape_match(inp, out):
       inp.get_shape().assert_is_compatible_with(out.get_shape())
-    nest.map_structure(assert_shape_match, inputs, outputs)
-    res_outputs = nest.map_structure(
-        lambda inp, out: inp + out, inputs, outputs)
+    def default_residual_fn(inputs, outputs):
+      nest.assert_same_structure(inputs, outputs)
+      nest.map_structure(assert_shape_match, inputs, outputs)
+      return nest.map_structure(lambda inp, out: inp + out, inputs, outputs)
+    res_outputs = (self._residual_fn or default_residual_fn)(inputs, outputs)
     return (res_outputs, new_state)
 
 
diff --git a/tensorflow/python/ops/session_ops.py b/tensorflow/python/ops/session_ops.py
index e74c52b8cf90d5620739a6b6b0c94510cbefa22f..dc4d913c938a89f23297c02c2d18b286fd3bb9e8 100644
--- a/tensorflow/python/ops/session_ops.py
+++ b/tensorflow/python/ops/session_ops.py
@@ -74,7 +74,7 @@ class TensorHandle(object):
   def _get_resource_handle(self):
     """The ResourceHandle representation of this handle."""
     if not self._resource_handle:
-      self._resource_handle = resource_handle_pb2.ResourceHandle()
+      self._resource_handle = resource_handle_pb2.ResourceHandleProto()
       self._resource_handle.device = self._handle.split(";")[-1]
       self._resource_handle.container = (
           pywrap_tensorflow_internal.TENSOR_HANDLE_KEY)
diff --git a/tensorflow/python/ops/sets_impl.py b/tensorflow/python/ops/sets_impl.py
index f3b52636d4ff3658273d450be521bf16d8aeddf3..e623c4f2ee309f852993dc065a1c87e37b00e102 100644
--- a/tensorflow/python/ops/sets_impl.py
+++ b/tensorflow/python/ops/sets_impl.py
@@ -99,7 +99,7 @@ def _set_operation(a, b, set_operation, validate_indices=True):
     b: `Tensor` or `SparseTensor` of the same type as `a`. Must be
         `SparseTensor` if `a` is `SparseTensor`. If sparse, indices must be
         sorted in row-major order.
-    set_operation: String indicating set operaiton. See
+    set_operation: String indicating set operation. See
         SetOperationOp::SetOperationFromContext for valid values.
     validate_indices: Whether to validate the order and range of sparse indices
        in `a` and `b`.
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index b52610661fd4662f1905685fb06ec4533ea2b75a..db33541218d675266420268e2001d89dd4aee083 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -25,11 +25,14 @@
 @@sparse_concat
 @@sparse_reorder
 @@sparse_reshape
+@@sparse_slice
 @@sparse_split
 @@sparse_retain
 @@sparse_reset_shape
 @@sparse_fill_empty_rows
 @@sparse_transpose
+@@sparse_reduce_max
+@@sparse_reduce_max_sparse
 @@sparse_reduce_sum
 @@sparse_reduce_sum_sparse
 @@sparse_add
@@ -655,6 +658,50 @@ def sparse_split(keyword_required=KeywordRequired(),
   return sparse_tensors
 
 
+def sparse_slice(sp_input, start, size, name=None):
+  """Slice a `SparseTensor` based on the `start` and `size.
+
+  For example, if the input is
+
+      input_tensor = shape = [2, 7]
+      [    a   d e  ]
+      [b c          ]
+
+  Graphically the output tensors are:
+
+      sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
+      [    a  ]
+      [b c    ]
+
+      sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
+      [ d e  ]
+      [      ]
+
+  Args:
+    sp_input: The `SparseTensor` to split.
+    start: 1-D. tensor represents the start of the slice.
+    size: 1-D. tensor represents the size of the slice.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `SparseTensor` objects resulting from splicing.
+
+  Raises:
+    TypeError: If `sp_input` is not a `SparseTensor`.
+  """
+  sp_input = _convert_to_sparse_tensor(sp_input)
+  start = ops.convert_to_tensor(start, dtypes.int64)
+  size = ops.convert_to_tensor(size, dtypes.int64)
+
+  with ops.name_scope(name, "SparseSlice", [sp_input]) as name:
+    output_indices, output_values, output_shape = gen_sparse_ops.sparse_slice(
+        sp_input.indices, sp_input.values, sp_input.dense_shape, start, size, name=name)
+
+    return sparse_tensor.SparseTensor(
+        output_indices,
+        output_values,
+        output_shape)
+
 def sparse_to_dense(sparse_indices,
                     output_shape,
                     sparse_values,
@@ -710,6 +757,90 @@ def sparse_to_dense(sparse_indices,
       name=name)
 
 
+def sparse_reduce_max(sp_input, axis=None, keep_dims=False,
+                      reduction_axes=None):
+  """Computes the max of elements across dimensions of a SparseTensor.
+
+  This Op takes a SparseTensor and is the sparse counterpart to
+  `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+  instead of a sparse one.
+
+  Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+  `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+  `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+  with length 1.
+
+  If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+  with a single element is returned.  Additionally, the axes can be negative,
+  similar to the indexing rules in Python.
+
+  For example:
+
+  ```python
+  # 'x' represents [[1, ?, 2]
+  #                 [?, 3, ?]]
+  # where ? is implicitly-zero.
+  tf.sparse_reduce_max(x) ==> 3
+  tf.sparse_reduce_max(x, 0) ==> [1, 3, 2]
+  tf.sparse_reduce_max(x, 1) ==> [2, 3]  # Can also use -1 as the axis.
+  tf.sparse_reduce_max(x, 1, keep_dims=True) ==> [[2], [3]]
+  tf.sparse_reduce_max(x, [0, 1]) ==> 3
+  ```
+
+  Args:
+    sp_input: The SparseTensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce; list or scalar. If `None` (the
+      default), reduces all dimensions.
+    keep_dims: If true, retain reduced dimensions with length 1.
+    reduction_axes: Deprecated name of axis.
+
+  Returns:
+    The reduced Tensor.
+  """
+  return gen_sparse_ops.sparse_reduce_max(
+      sp_input.indices, sp_input.values,
+      sp_input.dense_shape,
+      math_ops._ReductionDims(sp_input, axis, reduction_axes),
+      keep_dims)
+
+
+def sparse_reduce_max_sparse(sp_input, axis=None, keep_dims=False,
+                             reduction_axes=None):
+  """Computes the max of elements across dimensions of a SparseTensor.
+
+  This Op takes a SparseTensor and is the sparse counterpart to
+  `tf.reduce_max()`.  In contrast to SparseReduceSum, this Op returns a
+  SparseTensor.
+
+  Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+  `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+  `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+  with length 1.
+
+  If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+  with a single element is returned.  Additionally, the axes can be negative,
+  which are interpreted according to the indexing rules in Python.
+
+  Args:
+    sp_input: The SparseTensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce; list or scalar. If `None` (the
+      default), reduces all dimensions.
+    keep_dims: If true, retain reduced dimensions with length 1.
+    reduction_axes: Deprecated name of axis
+
+  Returns:
+    The reduced SparseTensor.
+  """
+  output_ind, output_val, output_shape = (
+      gen_sparse_ops.sparse_reduce_max_sparse(
+          sp_input.indices, sp_input.values,
+          sp_input.dense_shape, math_ops._ReductionDims(sp_input, axis,
+                                                        reduction_axes),
+          keep_dims))
+
+  return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
+
+
 def sparse_reduce_sum(sp_input, axis=None, keep_dims=False,
                       reduction_axes=None):
   """Computes the sum of elements across dimensions of a SparseTensor.
@@ -979,7 +1110,7 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None,
   Args:
     sp_ids: A single `SparseTensor` with `values` property of type `int32`
       or `int64` or a Python list of such `SparseTensor`s or a list thereof.
-    sp_values: A`SparseTensor` of any type.
+    sp_values: A `SparseTensor` of any type.
     vocab_size: A scalar `int64` Tensor (or Python int) containing the new size
       of the last dimension, `all(0 <= sp_ids.values < vocab_size)`.
       Or a list thereof with `all(0 <= sp_ids[i].values < vocab_size[i])` for
@@ -1189,10 +1320,6 @@ def sparse_reset_shape(sp_input, new_shape=None):
   return sparse_tensor.SparseTensor(in_indices, in_values, output_shape_tensor)
 
 
-# TODO(b/37517434): Delete this variable on 20170610.
-_SPARSE_FILL_EMPTY_ROWS_FAST_PATH = False
-
-
 def sparse_fill_empty_rows(sp_input, default_value, name=None):
   """Fills empty rows in the input 2-D `SparseTensor` with a default value.
 
@@ -1241,52 +1368,19 @@ def sparse_fill_empty_rows(sp_input, default_value, name=None):
     TypeError: If `sp_input` is not a `SparseTensor`.
   """
   sp_input = _convert_to_sparse_tensor(sp_input)
-
-  # TODO(b/37517434): Delete the slow path and only use the fast path
-  # on 20170610.
   with ops.name_scope(name, "SparseFillEmptyRows", [sp_input]):
     default_value = ops.convert_to_tensor(
         default_value, dtype=sp_input.values.dtype)
-    if _SPARSE_FILL_EMPTY_ROWS_FAST_PATH:
-      (output_indices, output_values, empty_row_indicator,
-       unused_reverse_index_map) = gen_sparse_ops._sparse_fill_empty_rows(
-           indices=sp_input.indices,
-           values=sp_input.values,
-           dense_shape=sp_input.dense_shape,
-           default_value=default_value)
-      return (sparse_tensor.SparseTensor(
-          indices=output_indices,
-          values=output_values,
-          dense_shape=sp_input.dense_shape), empty_row_indicator)
-    else:
-      num_rows = math_ops.cast(sp_input.dense_shape[0], dtypes.int32)
-      all_row_indices = math_ops.cast(math_ops.range(num_rows), dtypes.int64)
-      empty_row_indices, _ = array_ops.setdiff1d(all_row_indices,
-                                                 sp_input.indices[:, 0])
-      empty_row_indicator = sparse_to_dense(
-          empty_row_indices,
-          array_ops.expand_dims(sp_input.dense_shape[0], -1), True,
-          False)
-
-      empty_row_indices_as_column = array_ops.reshape(
-          empty_row_indices, [-1, 1])
-      additional_indices = array_ops.concat([
-          empty_row_indices_as_column,
-          array_ops.zeros_like(empty_row_indices_as_column)
-      ], 1)
-      additional_values = array_ops.fill(
-          array_ops.shape(empty_row_indices), default_value)
-
-      all_indices_unordered = array_ops.concat(
-          [sp_input.indices, additional_indices], 0)
-      all_values_unordered = array_ops.concat(
-          [sp_input.values, additional_values], 0)
-      sp_unordered_output = sparse_tensor.SparseTensor(
-          all_indices_unordered,
-          all_values_unordered, sp_input.dense_shape)
-      sp_ordered_output = sparse_reorder(sp_unordered_output)
-
-      return sp_ordered_output, empty_row_indicator
+    (output_indices, output_values, empty_row_indicator,
+     unused_reverse_index_map) = gen_sparse_ops._sparse_fill_empty_rows(
+         indices=sp_input.indices,
+         values=sp_input.values,
+         dense_shape=sp_input.dense_shape,
+         default_value=default_value)
+    return (sparse_tensor.SparseTensor(indices=output_indices,
+                                       values=output_values,
+                                       dense_shape=sp_input.dense_shape),
+            empty_row_indicator)
 
 
 def serialize_sparse(sp_input, name=None):
diff --git a/tensorflow/python/ops/spectral_ops_test_util.py b/tensorflow/python/ops/spectral_ops_test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f2e730edc8f57582a2f2075fdc2d8614e6b9582
--- /dev/null
+++ b/tensorflow/python/ops/spectral_ops_test_util.py
@@ -0,0 +1,50 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for writing test involving spectral_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+def _use_eigen_kernels():
+  use_eigen_kernels = False  # Eigen kernels are default
+  if test.is_gpu_available(cuda_only=True):
+    use_eigen_kernels = False
+  return use_eigen_kernels
+
+
+def fft_kernel_label_map():
+  """Returns a generator overriding kernel selection.
+
+  This is used to force testing of the eigen kernels, even
+  when they are not the default registered kernels.
+
+  Returns:
+    A generator in which to wrap every test.
+  """
+  if _use_eigen_kernels():
+    d = dict([(op, "eigen")
+              for op in [
+                  "FFT", "FFT2D", "FFT3D", "IFFT", "IFFT2D", "IFFT3D",
+                  "IRFFT", "IRFFT2D", "IRFFT3D", "RFFT", "RFFT2D", "RFFT3D"
+              ]])
+    return ops.get_default_graph()._kernel_label_map(d)  # pylint: disable=protected-access
+  else:
+    return ops.get_default_graph()._kernel_label_map({})  # pylint: disable=protected-access
+
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index a6b14f6f6f35a497f20908868a9ef5f2dfeef48e..30bf4e4ef1b96ea68e9020621f37551ac619a3c2 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -145,7 +145,7 @@ _allowed_symbols_math_ops = [
     "sub",  # use tf.subtract instead.
 
     # These are documented in nn.
-    # We are are not importing nn because it would create a circular dependency.
+    # We are not importing nn because it would create a circular dependency.
     "sigmoid",
     "log_sigmoid",
     "tanh",
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index dbc637975d6d70065919d86825a7aa72e86103c0..d84949814afb7b5c8b608369dfecf8e23ab84942 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -46,10 +46,13 @@
 @@random_normal_initializer
 @@truncated_normal_initializer
 @@random_uniform_initializer
-@@uniform_unit_scaling_initializer
+@@glorot_uniform_initializer
+@@glorot_normal_initializer
 @@zeros_initializer
 @@ones_initializer
 @@orthogonal_initializer
+@@variance_scaling_initializer
+@@uniform_unit_scaling_initializer
 @@fixed_size_partitioner
 @@variable_axis_size_partitioner
 @@min_max_variable_partitioner
diff --git a/tensorflow/python/ops/summary_op_util.py b/tensorflow/python/ops/summary_op_util.py
index a3f66169029573102e296965998156c2d5970e6b..06ea63704d904642cb27939c805e10a348eb609d 100644
--- a/tensorflow/python/ops/summary_op_util.py
+++ b/tensorflow/python/ops/summary_op_util.py
@@ -78,7 +78,7 @@ def summary_scope(name, family=None, default_name=None, values=None):
   If `family` is set, then the tag name will be '<family>/<scope_name>', where
   `scope_name` is `<outer_scope>/<family>/<name>`. This ensures that `family`
   is always the prefix of the tag (and unmodified), while ensuring the scope
-  respects the outer scope from this this summary was created.
+  respects the outer scope from this summary was created.
 
   Args:
     name: A name for the generated summary node.
diff --git a/tensorflow/python/ops/summary_ops.py b/tensorflow/python/ops/summary_ops.py
index 4ad0862dcc7f15d217b60879b89ee3ce1425ea8a..2cf2eda16e69bcfab766c7adaa4b5d8b40d99723 100644
--- a/tensorflow/python/ops/summary_ops.py
+++ b/tensorflow/python/ops/summary_ops.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from google.protobuf import json_format
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_logging_ops
@@ -29,110 +28,56 @@ from tensorflow.python.ops.gen_logging_ops import *
 # pylint: enable=wildcard-import
 
 
-# TODO(dandelion): As currently implemented, this op has several problems.
-# The 'summary_description' field is passed but not used by the kernel.
-# The 'name' field is used to creat a scope and passed down via name=scope,
-# but gen_logging_ops._tensor_summary ignores this parameter and uses the
-# kernel's op name as the name. This is ok because scope and the op name
-# are identical, but it's probably worthwhile to fix.
-# Finally, because of the complications above, this currently does not
-# support the family= attribute added to other summaries in cl/156791589.
-def tensor_summary(  # pylint: disable=invalid-name
-    name,
-    tensor,
-    summary_description=None,
-    collections=None):
-  # pylint: disable=line-too-long
+def tensor_summary(name,
+                   tensor,
+                   summary_description=None,
+                   collections=None,
+                   summary_metadata=None,
+                   family=None,
+                   display_name=None):
   """Outputs a `Summary` protocol buffer with a serialized tensor.proto.
 
-  The generated
-  [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-  has one summary value containing the input tensor.
-
   Args:
-    name: A name for the generated node. Will also serve as the series name in
-      TensorBoard.
+    name: A name for the generated node. If display_name is not set, it will
+      also serve as the tag name in TensorBoard. (In that case, the tag
+      name will inherit tf name scopes.)
     tensor: A tensor of any type and shape to serialize.
-    summary_description: Optional summary_pb2.SummaryDescription()
-    collections: Optional list of graph collections keys. The new summary op is
-      added to these collections. Defaults to `[GraphKeys.SUMMARIES]`.
-
-  Returns:
-    A scalar `Tensor` of type `string`. The serialized `Summary` protocol
-    buffer.
-  """
-  # pylint: enable=line-too-long
-
-  if summary_description is None:
-    summary_description = summary_pb2.SummaryDescription()
-
-  description = json_format.MessageToJson(summary_description)
-  with ops.name_scope(name, None, [tensor]) as scope:
-    val = gen_logging_ops._tensor_summary(
-        tensor=tensor,
-        description=description,
-        name=scope)
-    summary_op_util.collect(val, collections, [ops.GraphKeys.SUMMARIES])
-  return val
-
-ops.NotDifferentiable("TensorSummary")
-
-
-def _tensor_summary_v2(  # pylint: disable=invalid-name
-    name,
-    tensor,
-    summary_description=None,
-    collections=None,
-    summary_metadata=None,
-    family=None):
-  # pylint: disable=line-too-long
-  """Outputs a `Summary` protocol buffer with a serialized tensor.proto.
-
-  NOTE(chizeng): This method is temporary. It should never make it into
-  TensorFlow 1.3, and nothing should depend on it. This method should be deleted
-  before August 2017 (ideally, earlier). This method exists to unblock the
-  TensorBoard plugin refactoring effort. We will later modify the tensor_summary
-  method to directly make use of the TensorSummaryV2 op. There must be a 3-week
-  difference between adding a new op (C++) and changing a python interface to
-  use it.
-
-  The generated
-  [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-  has one summary value containing the input tensor.
-
-  Args:
-    name: A name for the generated node. Will also serve as the series name in
-      TensorBoard.
-    tensor: A tensor of any type and shape to serialize.
-    summary_description: This is currently un-used but must be kept for
-      backwards compatibility.
+    summary_description: A long description of the summary sequence. Markdown
+      is supported.
     collections: Optional list of graph collections keys. The new summary op is
       added to these collections. Defaults to `[GraphKeys.SUMMARIES]`.
     summary_metadata: Optional SummaryMetadata proto (which describes which
       plugins may use the summary value).
-    family: Optional; if provided, used as the prefix of the summary tag name,
-      which controls the tab name used for display on Tensorboard.
+    family: Optional; if provided, used as the prefix of the summary tag,
+      which controls the name used for display on TensorBoard when
+      display_name is not set.
+    display_name: A string used to name this data in TensorBoard. If this is
+      not set, then the node name will be used instead.
 
   Returns:
     A scalar `Tensor` of type `string`. The serialized `Summary` protocol
     buffer.
   """
-  # pylint: enable=line-too-long
 
-  # The summary description is unused now.
-  del summary_description
+  if summary_metadata is None:
+    summary_metadata = summary_pb2.SummaryMetadata()
+
+  if summary_description is not None:
+    summary_metadata.summary_description = summary_description
 
-  serialized_summary_metadata = ""
-  if summary_metadata:
-    serialized_summary_metadata = summary_metadata.SerializeToString()
+  if display_name is not None:
+    summary_metadata.display_name = display_name
+
+  serialized_summary_metadata = summary_metadata.SerializeToString()
 
   with summary_op_util.summary_scope(
       name, family, values=[tensor]) as (tag, scope):
     val = gen_logging_ops._tensor_summary_v2(
         tensor=tensor,
         tag=tag,
-        description="",
         name=scope,
         serialized_summary_metadata=serialized_summary_metadata)
     summary_op_util.collect(val, collections, [ops.GraphKeys.SUMMARIES])
   return val
+
+ops.NotDifferentiable("TensorSummary")
diff --git a/tensorflow/python/ops/transpose_benchmark.py b/tensorflow/python/ops/transpose_benchmark.py
index 6bd3fe5e5a0306ced6d7113f0678f77abe1829c3..6047f4ae9325ab75893e535a200c946555008417 100644
--- a/tensorflow/python/ops/transpose_benchmark.py
+++ b/tensorflow/python/ops/transpose_benchmark.py
@@ -83,8 +83,8 @@ class TransposeBenchmark(test.Benchmark):
         start_time = time.time()
         session.run(outputs)
         duration = (time.time() - start_time) / num_iters
-        throughput = np.prod(np.array(
-            input_shape)) * datatype().itemsize * 2 / duration / 1e9
+        throughput = np.prod(
+            np.array(input_shape)) * datatype().itemsize * 2 / duration / 1e9
         print("%s %s inputshape:%s perm:%s %d %.6fsec, %.4fGB/s." %
               (device, str(datatype), str(input_shape).replace(" ", ""),
                str(perm).replace(" ", ""), num_iters, duration, throughput))
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index aceffd373af18ee55d4c3f3ecb44c5307a99fc0c..7077f679adae631cfe88fc698e330cf21c575a20 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -300,7 +300,8 @@ class _VariableStore(object):
                      initializer=None, regularizer=None, reuse=None,
                      trainable=True, collections=None, caching_device=None,
                      partitioner=None, validate_shape=True, use_resource=None):
-      is_scalar = shape is not None and not shape
+      is_scalar = (shape is not None and isinstance(shape, collections_lib.Sequence)
+                   and len(shape) == 0)
       # Partitioned variable case
       if partitioner is not None and not is_scalar:
         if not callable(partitioner):
@@ -1068,7 +1069,7 @@ get_variable_or_local_docstring = (
 
 %sThis function prefixes the name with the current variable scope
 and performs reuse checks. See the
-@{$variable_scope$Variable Scope How To}
+@{$variables$Variable Scope How To}
 for an extensive description of how reusing works. Here is a basic example:
 
 ```python
@@ -1444,7 +1445,7 @@ def variable_scope(name_or_scope,
 
   Variable scope allows to create new variables and to share already created
   ones while providing checks to not create or share by accident. For details,
-  see the @{$variable_scope$Variable Scope How To},
+  see the @{$variables$Variable Scope How To},
   here we present only a few basic examples.
 
   Simple example of how to create a new variable:
@@ -1501,6 +1502,11 @@ def variable_scope(name_or_scope,
   A note about name scoping: Setting `reuse` does not impact the naming of other
   ops such as mult. See related discussion on [github#6189](https://github.com/tensorflow/tensorflow/issues/6189)
 
+  Note that up to and including version 1.0, it was allowed (though
+  explicitly discouraged) to pass False to the reuse argument, yielding
+  undocumented behaviour slightly different from None. Starting at 1.1.0
+  passing None and False as reuse has exactly the same effect.
+
   Args:
     name_or_scope: `string` or `VariableScope`: the scope to open.
     default_name: The default name to use if the `name_or_scope` argument is
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 93d2838f6a2201a57605ab3a36ff0e01cf14dce4..5381e3abbe763e260004d19323f3173b9922c8a8 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -311,9 +311,12 @@ class Variable(object):
             raise ValueError("initial_value must have a shape specified: %s" %
                              self._initial_value)
 
-        # Assigns initial value.
+        # If 'initial_value' makes use of other variables, make sure we don't
+        # have an issue if these other variables aren't initialized first by
+        # using their initialized_value() method.
         self._initializer_op = state_ops.assign(
-            self._variable, self._initial_value,
+            self._variable,
+            self._build_initializer_expr(self._initial_value),
             validate_shape=validate_shape).op
 
         # TODO(vrv): Change this class to not take caching_device, but
@@ -708,6 +711,89 @@ class Variable(object):
 
     setattr(Variable, operator, _run_op)
 
+  def _build_initializer_expr(self, initial_value):
+    """Build an expression suitable to initialize a variable.
+
+    Replace references to variables in initial_value with references to the
+    variable initial values instead.
+
+    Args:
+      initial_value: original expression
+    Returns:
+      A tensorflow expression suitable to initialize a variable.
+    """
+    if isinstance(initial_value, Variable):
+      return initial_value.initialized_value()
+    elif isinstance(initial_value, ops.Tensor):
+      new_op = self._build_initializer_expr(initial_value.op)
+      if new_op != initial_value.op:
+        if isinstance(new_op, ops.Tensor):
+          return new_op
+        else:
+          return ops.Tensor(new_op, initial_value.value_index,
+                            initial_value.dtype)
+      else:
+        return initial_value
+    elif isinstance(initial_value, ops.Operation):
+      if initial_value.node_def.op in [
+          "IsVariableInitialized", "VarIsInitializedOp", "ReadVariableOp"
+      ]:
+        return initial_value
+      if initial_value.node_def.op in ["Variable", "VariableV2", "VarHandleOp"]:
+        return self._find_initialized_value_for_variable(initial_value)
+      modified = False
+      new_inputs = []
+      for tensor in initial_value.inputs:
+        new_tensor = self._build_initializer_expr(tensor)
+        new_inputs.append(new_tensor)
+        if new_tensor != tensor:
+          modified = True
+
+      if modified:
+        new_name = initial_value.node_def.name + "_" + self.name
+        new_name = new_name.replace(":", "_")
+        new_op = initial_value.node_def.op
+        new_op = new_op.replace("RefSwitch", "Switch")
+        new_value = self.graph.create_op(
+            new_op,
+            new_inputs,
+            # pylint: disable=protected-access
+            initial_value._output_types,
+            # pylint: enable=protected-access
+            name=new_name,
+            attrs=initial_value.node_def.attr)
+        return new_value
+      else:
+        return initial_value
+    else:
+      return initial_value
+
+  def _find_initialized_value_for_variable(self, variable_op):
+    """Find the initial value for a variable op.
+
+    To do so, lookup the variable op in the variables collection.
+
+    Args:
+      variable_op: a TensorFlow variable Operation
+    Returns:
+      The initial value for the variable.
+    """
+    try:
+      var_names = [variable_op.node_def.name, variable_op.node_def.name + ":0"]
+      global_vars = self.graph.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+      for var in global_vars:
+        if var.name in var_names:
+          return var.initialized_value()
+      local_vars = self.graph.get_collection(ops.GraphKeys.LOCAL_VARIABLES)
+      for var in local_vars:
+        if var.name == var_names:
+          return var.initialized_value()
+    except AttributeError:
+      # Return the variable itself when an incomplete user defined variable type
+      # was put in the collection.
+      return variable_op
+    return variable_op
+
   # NOTE(mrry): This enables the Variable's overloaded "right" binary
   # operators to run when the left operand is an ndarray, because it
   # accords the Variable class higher priority than an ndarray, or a
@@ -1310,19 +1396,23 @@ def report_uninitialized_variables(var_list=None,
         if op.type in ["Variable", "VariableV2", "AutoReloadVariable"]:
           var_list.append(op.outputs[0])
   with ops.name_scope(name):
-    if not var_list:
-      # Return an empty tensor so we only need to check for returned tensor
-      # size being 0 as an indication of model ready.
-      return array_ops.constant([], dtype=dtypes.string)
-    else:
-      # Get a 1-D boolean tensor listing whether each variable is initialized.
-      variables_mask = math_ops.logical_not(
-          array_ops.stack(
-              [state_ops.is_variable_initialized(v) for v in var_list]))
-      # Get a 1-D string tensor containing all the variable names.
-      variable_names_tensor = array_ops.constant([s.op.name for s in var_list])
-      # Return a 1-D tensor containing all the names of uninitialized variables.
-      return array_ops.boolean_mask(variable_names_tensor, variables_mask)
+    # Run all operations on CPU
+    with ops.device("/cpu:0"):
+      if not var_list:
+        # Return an empty tensor so we only need to check for returned tensor
+        # size being 0 as an indication of model ready.
+        return array_ops.constant([], dtype=dtypes.string)
+      else:
+        # Get a 1-D boolean tensor listing whether each variable is initialized.
+        variables_mask = math_ops.logical_not(
+            array_ops.stack(
+                [state_ops.is_variable_initialized(v) for v in var_list]))
+        # Get a 1-D string tensor containing all the variable names.
+        variable_names_tensor = array_ops.constant(
+            [s.op.name for s in var_list])
+        # Return a 1-D tensor containing all the names of
+        # uninitialized variables.
+        return array_ops.boolean_mask(variable_names_tensor, variables_mask)
 
 # pylint: disable=protected-access
 Variable._OverloadAllOperators()
diff --git a/tensorflow/python/platform/flags.py b/tensorflow/python/platform/flags.py
index f70dbf18d4e9125b9c469885dcf5b2a766c2f9e4..138a0ced97bc03c491b0282fc56f25a575093684 100644
--- a/tensorflow/python/platform/flags.py
+++ b/tensorflow/python/platform/flags.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import argparse as _argparse
 
+from tensorflow.python.platform import tf_logging as _logging
 from tensorflow.python.util.all_util import remove_undocumented
 
 _global_parser = _argparse.ArgumentParser()
@@ -34,17 +35,24 @@ class _FlagValues(object):
   def __init__(self):
     self.__dict__['__flags'] = {}
     self.__dict__['__parsed'] = False
+    self.__dict__['__required_flags'] = set()
 
   def _parse_flags(self, args=None):
     result, unparsed = _global_parser.parse_known_args(args=args)
     for flag_name, val in vars(result).items():
       self.__dict__['__flags'][flag_name] = val
     self.__dict__['__parsed'] = True
+    self._assert_all_required()
     return unparsed
 
   def __getattr__(self, name):
     """Retrieves the 'value' attribute of the flag --name."""
-    if not self.__dict__['__parsed']:
+    try:
+      parsed = self.__dict__['__parsed']
+    except KeyError:
+      # May happen during pickle.load or copy.copy
+      raise AttributeError(name)
+    if not parsed:
       self._parse_flags()
     if name not in self.__dict__['__flags']:
       raise AttributeError(name)
@@ -55,6 +63,19 @@ class _FlagValues(object):
     if not self.__dict__['__parsed']:
       self._parse_flags()
     self.__dict__['__flags'][name] = value
+    self._assert_required(name)
+
+  def _add_required_flag(self, item):
+    self.__dict__['__required_flags'].add(item)
+
+  def _assert_required(self, flag_name):
+    if (flag_name not in self.__dict__['__flags'] or
+        self.__dict__['__flags'][flag_name] is None):
+      raise AttributeError('Flag --%s must be specified.' % flag_name)
+
+  def _assert_all_required(self):
+    for flag_name in self.__dict__['__required_flags']:
+      self._assert_required(flag_name)
 
 
 def _define_helper(flag_name, default_value, docstring, flagtype):
@@ -131,6 +152,51 @@ def DEFINE_float(flag_name, default_value, docstring):
   """
   _define_helper(flag_name, default_value, docstring, float)
 
+
+def mark_flag_as_required(flag_name):
+  """Ensures that flag is not None during program execution.
+  
+  It is recommended to call this method like this:
+  
+    if __name__ == '__main__':
+      tf.flags.mark_flag_as_required('your_flag_name')
+      tf.app.run()
+  
+  Args:
+    flag_name: string, name of the flag to mark as required.
+ 
+  Raises:
+    AttributeError: if flag_name is not registered as a valid flag name.
+      NOTE: The exception raised will change in the future. 
+  """
+  if _global_parser.get_default(flag_name) is not None:
+    _logging.warn(
+        'Flag %s has a non-None default value; therefore, '
+        'mark_flag_as_required will pass even if flag is not specified in the '
+        'command line!' % flag_name)
+  FLAGS._add_required_flag(flag_name)
+
+
+def mark_flags_as_required(flag_names):
+  """Ensures that flags are not None during program execution.
+  
+  Recommended usage:
+  
+    if __name__ == '__main__':
+      tf.flags.mark_flags_as_required(['flag1', 'flag2', 'flag3'])
+      tf.app.run()
+  
+  Args:
+    flag_names: a list/tuple of flag names to mark as required.
+
+  Raises:
+    AttributeError: If any of flag name has not already been defined as a flag.
+      NOTE: The exception raised will change in the future.
+  """
+  for flag_name in flag_names:
+    mark_flag_as_required(flag_name)
+
+
 _allowed_symbols = [
     # We rely on gflags documentation.
     'DEFINE_bool',
@@ -139,5 +205,7 @@ _allowed_symbols = [
     'DEFINE_integer',
     'DEFINE_string',
     'FLAGS',
+    'mark_flag_as_required',
+    'mark_flags_as_required',
 ]
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/platform/flags_test.py b/tensorflow/python/platform/flags_test.py
index 0dbaafd1fab3f5020e32916bfc2960976e188ded..c6bdd94a76397756dfa7d356d57362b400f80a69 100644
--- a/tensorflow/python/platform/flags_test.py
+++ b/tensorflow/python/platform/flags_test.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import copy
 import sys
 import unittest
 
@@ -34,6 +35,8 @@ flags.DEFINE_boolean("bool_a", False, "HelpString")
 flags.DEFINE_boolean("bool_c", False, "HelpString")
 flags.DEFINE_boolean("bool_d", True, "HelpString")
 flags.DEFINE_bool("bool_e", True, "HelpString")
+flags.DEFINE_string("string_foo_required", "default_val", "HelpString")
+flags.DEFINE_string("none_string_foo_required", None, "HelpString")
 
 FLAGS = flags.FLAGS
 
@@ -79,6 +82,22 @@ class FlagsTest(unittest.TestCase):
     FLAGS.float_foo = -1.0
     self.assertEqual(-1.0, FLAGS.float_foo)
 
+  def test_copy(self):
+    copied = copy.copy(FLAGS)
+    self.assertEqual(copied.__dict__, FLAGS.__dict__)
+
+  def testStringRequired(self):
+    res = FLAGS.string_foo_required
+    self.assertEqual(res, "default_val")
+    FLAGS.string_foo_required = "bar"
+    self.assertEqual("bar", FLAGS.string_foo_required)
+
+  def testNoneStringRequired(self):
+    res = FLAGS.none_string_foo_required
+    self.assertEqual(res, "default_val")
+    FLAGS.none_string_foo_required = "bar"
+    self.assertEqual("bar", FLAGS.none_string_foo_required)
+
 
 def main(_):
   # unittest.main() tries to interpret the unknown flags, so use the
@@ -92,7 +111,9 @@ if __name__ == "__main__":
   # Test command lines
   sys.argv.extend([
       "--bool_a", "--nobool_negation", "--bool_c=True", "--bool_d=False",
-      "and_argument"
+      "and_argument",
+      "--none_string_foo_required=default_val"
   ])
-
+  flags.mark_flag_as_required('string_foo_required')
+  flags.mark_flags_as_required(['none_string_foo_required'])
   app.run()
diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py
index a307347f606379a0cff08def274d8bf1bf7a2cc6..fdaae04fb6c6088b580bbed6c4c27f036e9ac375 100644
--- a/tensorflow/python/platform/test.py
+++ b/tensorflow/python/platform/test.py
@@ -17,6 +17,9 @@
 
 See the @{$python/test} guide.
 
+Note: `tf.test.mock` is an alias to the python `mock` or `unittest.mock`
+depending on the python version.
+
 @@main
 @@TestCase
 @@test_src_dir_path
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD b/tensorflow/python/profiler/BUILD
similarity index 77%
rename from tensorflow/contrib/tfprof/python/tools/tfprof/BUILD
rename to tensorflow/python/profiler/BUILD
index 8040c791ee4dd451b9b06926270380296bd2dd6c..c32cddbd6d7a363f2cc8e8566f05cd68ee90c444 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD
+++ b/tensorflow/python/profiler/BUILD
@@ -2,50 +2,71 @@ package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
 
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
+py_library(
+    name = "profiler",
+    srcs = ["profiler.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model_analyzer",
+        ":tfprof_logger",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "option_builder",
+    srcs = ["option_builder.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tfprof_logger",
+    ],
+)
+
 py_library(
     name = "model_analyzer",
     srcs = ["model_analyzer.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":option_builder",
         ":tfprof_logger",
-        "//tensorflow/contrib/tfprof/python/tools/tfprof/internal:pywrap_tensorflow_print_model_analysis_lib",
-        "//tensorflow/tools/tfprof:protos_all_py",
+        "//tensorflow/core/profiler:protos_all_py",
+        "//tensorflow/python:pywrap_tensorflow",
     ],
 )
 
-py_test(
+cuda_py_test(
     name = "model_analyzer_test",
     srcs = ["model_analyzer_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
+    additional_deps = [
         ":model_analyzer",
-        "//tensorflow/contrib/tfprof/python/tools/tfprof/internal:model_analyzer_testlib",
+        "//tensorflow/python/profiler/internal:model_analyzer_testlib",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:platform",
         "//tensorflow/python:variables",
     ],
+    tags = ["no_pip"],
 )
 
-py_test(
+cuda_py_test(
     name = "profiler_test",
     srcs = ["profiler_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
+    additional_deps = [
         ":model_analyzer",
-        "//tensorflow/contrib/tfprof/python/tools/tfprof/internal:model_analyzer_testlib",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:platform",
+        "//tensorflow/python/profiler/internal:model_analyzer_testlib",
         "//tensorflow/python:variables",
     ],
+    tags = ["no_pip"],
 )
 
 py_library(
@@ -53,9 +74,9 @@ py_library(
     srcs = ["tfprof_logger.py"],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/core/profiler:protos_all_py",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:platform",
-        "//tensorflow/tools/tfprof:protos_all_py",
         "@six_archive//:six",
     ],
 )
@@ -66,14 +87,13 @@ tf_py_test(
     srcs = ["tfprof_logger_test.py"],
     additional_deps = [
         ":tfprof_logger",
-        "//tensorflow/contrib/copy_graph:copy_graph_py",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/core/profiler:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
-        "//tensorflow/tools/tfprof:protos_all_py",
     ],
 )
 
diff --git a/tensorflow/tensorboard/backend/__init__.py b/tensorflow/python/profiler/__init__.py
similarity index 100%
rename from tensorflow/tensorboard/backend/__init__.py
rename to tensorflow/python/profiler/__init__.py
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/internal/BUILD b/tensorflow/python/profiler/internal/BUILD
similarity index 67%
rename from tensorflow/contrib/tfprof/python/tools/tfprof/internal/BUILD
rename to tensorflow/python/profiler/internal/BUILD
index 3fa5b7867d4076196245397a0ade8e9b72cf0cce..5a6e53121c55f33e7d15f6a30f6b1c08c7a5b811 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/internal/BUILD
+++ b/tensorflow/python/profiler/internal/BUILD
@@ -1,7 +1,8 @@
-package(default_visibility = ["//tensorflow/contrib/tfprof/python/tools/tfprof:__subpackages__"])
+package(default_visibility = ["//tensorflow/python/profiler:__subpackages__"])
 
 licenses(["notice"])  # Apache 2.0
 
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "tf_py_test")
 load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
@@ -11,33 +12,20 @@ py_library(
     srcs = ["model_analyzer_testlib.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/rnn:rnn_py",
-        "//tensorflow/contrib/tfprof/python/tools/tfprof:model_analyzer",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:rnn",
+        "//tensorflow/python:tensor_array_grad",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
-    ],
-)
-
-tf_py_wrap_cc(
-    name = "pywrap_tensorflow_print_model_analysis_lib",
-    srcs = ["pywrap_tensorflow_print_model_analysis.i"],
-    swig_includes = [
-        "//tensorflow/python:lib/core/strings.i",
-        "//tensorflow/python:platform/base.i",
-    ],
-    deps = [
-        "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/tools/tfprof/internal:print_model_analysis_hdr",
-        "//util/python:python_headers",
+        "//tensorflow/python/profiler:model_analyzer",
     ],
 )
 
@@ -46,7 +34,7 @@ py_test(
     srcs = ["print_model_analysis_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":pywrap_tensorflow_print_model_analysis_lib",
+        "//tensorflow/core/profiler:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
@@ -55,8 +43,25 @@ py_test(
         "//tensorflow/python:init_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/tools/tfprof:protos_all_py",
+    ],
+)
+
+cuda_py_test(
+    name = "run_metadata_test",
+    srcs = ["run_metadata_test.py"],
+    additional_deps = [
+        ":model_analyzer_testlib",
+        "//tensorflow/core/profiler:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/profiler:model_analyzer",
+        "//tensorflow/python:random_ops",
+    ],
+    tags = [
+        "no_pip",
     ],
 )
 
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/internal/model_analyzer_testlib.py b/tensorflow/python/profiler/internal/model_analyzer_testlib.py
similarity index 100%
rename from tensorflow/contrib/tfprof/python/tools/tfprof/internal/model_analyzer_testlib.py
rename to tensorflow/python/profiler/internal/model_analyzer_testlib.py
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/internal/print_model_analysis_test.py b/tensorflow/python/profiler/internal/print_model_analysis_test.py
similarity index 74%
rename from tensorflow/contrib/tfprof/python/tools/tfprof/internal/print_model_analysis_test.py
rename to tensorflow/python/profiler/internal/print_model_analysis_test.py
index 76e7d627ceab49b1b08d015bd626d0d2ed2158e6..797c430e99b21a73a2260d45d6c9f25e26122806 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/internal/print_model_analysis_test.py
+++ b/tensorflow/python/profiler/internal/print_model_analysis_test.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 from google.protobuf import text_format
 
+from tensorflow.core.profiler import tfprof_options_pb2
+from tensorflow.core.profiler import tfprof_output_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -28,12 +30,10 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
-from tensorflow.tools.tfprof import tfprof_options_pb2
-from tensorflow.tools.tfprof import tfprof_output_pb2
 
 # pylint: disable=g-bad-import-order
 # XXX: this depends on pywrap_tensorflow and must come later
-from tensorflow.contrib.tfprof.python.tools.tfprof.internal import pywrap_tensorflow_print_model_analysis_lib as print_mdl
+from tensorflow.python import pywrap_tensorflow as print_mdl
 
 # pylint: disable=bad-whitespace
 # pylint: disable=bad-continuation
@@ -77,6 +77,7 @@ class PrintModelAnalysisTest(test.TestCase):
     opts.min_params = TEST_OPTIONS['min_params']
     opts.min_float_ops = TEST_OPTIONS['min_float_ops']
     opts.order_by = TEST_OPTIONS['order_by']
+    opts.step = -1
     for p in TEST_OPTIONS['account_type_regexes']:
       opts.account_type_regexes.append(p)
     for p in TEST_OPTIONS['start_name_regexes']:
@@ -94,7 +95,7 @@ class PrintModelAnalysisTest(test.TestCase):
 
     with session.Session() as sess, ops.device('/cpu:0'):
       _ = self._BuildSmallModel()
-      tfprof_pb = tfprof_output_pb2.TFGraphNodeProto()
+      tfprof_pb = tfprof_output_pb2.GraphNodeProto()
       tfprof_pb.ParseFromString(
           print_mdl.PrintModelAnalysis(
               sess.graph.as_graph_def(add_shapes=True).SerializeToString(),
@@ -103,7 +104,7 @@ class PrintModelAnalysisTest(test.TestCase):
               b'scope',
               opts.SerializeToString()))
 
-      expected_pb = tfprof_output_pb2.TFGraphNodeProto()
+      expected_pb = tfprof_output_pb2.GraphNodeProto()
       text_format.Merge(r"""name: "_TFProfRoot"
           exec_micros: 0
           requested_bytes: 0
@@ -153,6 +154,13 @@ class PrintModelAnalysisTest(test.TestCase):
                 }
               }
             }
+            accelerator_exec_micros: 0
+            cpu_exec_micros: 0
+            total_accelerator_exec_micros: 0
+            total_cpu_exec_micros: 0
+            run_count: 0
+            total_run_count: 0
+            total_definition_count: 1
           }
           children {
             name: "DW"
@@ -205,6 +213,13 @@ class PrintModelAnalysisTest(test.TestCase):
                   }
                 }
               }
+              accelerator_exec_micros: 0
+              cpu_exec_micros: 0
+              total_accelerator_exec_micros: 0
+              total_cpu_exec_micros: 0
+              run_count: 0
+              total_run_count: 0
+              total_definition_count: 1
             }
             children {
               name: "DW/Initializer"
@@ -237,6 +252,13 @@ class PrintModelAnalysisTest(test.TestCase):
                       }
                     }
                   }
+                  accelerator_exec_micros: 0
+                  cpu_exec_micros: 0
+                  total_accelerator_exec_micros: 0
+                  total_cpu_exec_micros: 0
+                  run_count: 0
+                  total_run_count: 0
+                  total_definition_count: 1
                 }
                 children {
                   name: "DW/Initializer/random_normal/mean"
@@ -247,6 +269,13 @@ class PrintModelAnalysisTest(test.TestCase):
                   total_parameters: 0
                   float_ops: 0
                   total_float_ops: 0
+                  accelerator_exec_micros: 0
+                  cpu_exec_micros: 0
+                  total_accelerator_exec_micros: 0
+                  total_cpu_exec_micros: 0
+                  run_count: 0
+                  total_run_count: 0
+                  total_definition_count: 1
                 }
                 children {
                   name: "DW/Initializer/random_normal/mul"
@@ -282,6 +311,13 @@ class PrintModelAnalysisTest(test.TestCase):
                       }
                     }
                   }
+                  accelerator_exec_micros: 0
+                  cpu_exec_micros: 0
+                  total_accelerator_exec_micros: 0
+                  total_cpu_exec_micros: 0
+                  run_count: 0
+                  total_run_count: 0
+                  total_definition_count: 1
                 }
                 children {
                   name: "DW/Initializer/random_normal/shape"
@@ -292,6 +328,13 @@ class PrintModelAnalysisTest(test.TestCase):
                   total_parameters: 0
                   float_ops: 0
                   total_float_ops: 0
+                  accelerator_exec_micros: 0
+                  cpu_exec_micros: 0
+                  total_accelerator_exec_micros: 0
+                  total_cpu_exec_micros: 0
+                  run_count: 0
+                  total_run_count: 0
+                  total_definition_count: 1
                 }
                 children {
                   name: "DW/Initializer/random_normal/stddev"
@@ -302,6 +345,13 @@ class PrintModelAnalysisTest(test.TestCase):
                   total_parameters: 0
                   float_ops: 0
                   total_float_ops: 0
+                  accelerator_exec_micros: 0
+                  cpu_exec_micros: 0
+                  total_accelerator_exec_micros: 0
+                  total_cpu_exec_micros: 0
+                  run_count: 0
+                  total_run_count: 0
+                  total_definition_count: 1
                 }
                 float_ops: 0
                 total_float_ops: 0
@@ -330,9 +380,23 @@ class PrintModelAnalysisTest(test.TestCase):
                     }
                   }
                 }
+                accelerator_exec_micros: 0
+                cpu_exec_micros: 0
+                total_accelerator_exec_micros: 0
+                total_cpu_exec_micros: 0
+                run_count: 0
+                total_run_count: 0
+                total_definition_count: 6
               }
               float_ops: 0
               total_float_ops: 0
+              accelerator_exec_micros: 0
+              cpu_exec_micros: 0
+              total_accelerator_exec_micros: 0
+              total_cpu_exec_micros: 0
+              run_count: 0
+              total_run_count: 0
+              total_definition_count: 7
             }
             children {
               name: "DW/read"
@@ -360,9 +424,23 @@ class PrintModelAnalysisTest(test.TestCase):
                   }
                 }
               }
+              accelerator_exec_micros: 0
+              cpu_exec_micros: 0
+              total_accelerator_exec_micros: 0
+              total_cpu_exec_micros: 0
+              run_count: 0
+              total_run_count: 0
+              total_definition_count: 1
             }
             float_ops: 0
             total_float_ops: 0
+            accelerator_exec_micros: 0
+            cpu_exec_micros: 0
+            total_accelerator_exec_micros: 0
+            total_cpu_exec_micros: 0
+            run_count: 0
+            total_run_count: 0
+            total_definition_count: 10
           }
           children {
             name: "zeros"
@@ -373,9 +451,23 @@ class PrintModelAnalysisTest(test.TestCase):
             total_parameters: 0
             float_ops: 0
             total_float_ops: 0
+            accelerator_exec_micros: 0
+            cpu_exec_micros: 0
+            total_accelerator_exec_micros: 0
+            total_cpu_exec_micros: 0
+            run_count: 0
+            total_run_count: 0
+            total_definition_count: 1
           }
           float_ops: 0
-          total_float_ops: 0""", expected_pb)
+          total_float_ops: 0
+          accelerator_exec_micros: 0
+          cpu_exec_micros: 0
+          total_accelerator_exec_micros: 0
+          total_cpu_exec_micros: 0
+          run_count: 0
+          total_run_count: 0
+          total_definition_count: 13""", expected_pb)
       self.assertEqual(expected_pb, tfprof_pb)
 
 
diff --git a/tensorflow/python/profiler/internal/run_metadata_test.py b/tensorflow/python/profiler/internal/run_metadata_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..62b2314aea0de7eb68e0d8e66d72621da2828754
--- /dev/null
+++ b/tensorflow/python/profiler/internal/run_metadata_test.py
@@ -0,0 +1,195 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""test the RunMetadata proto."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import defaultdict
+
+import six
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.profiler import option_builder
+
+# pylint: disable=g-bad-import-order
+# XXX: this depends on pywrap_tensorflow and must come later
+from tensorflow.python.profiler import model_analyzer
+from tensorflow.python.profiler.internal import model_analyzer_testlib as lib
+
+SIZE = 1300
+builder = option_builder.ProfileOptionBuilder
+
+
+def _extract_node(run_meta, node_names):
+  if not isinstance(node_names, list):
+    node_names = [node_names]
+  ret = defaultdict(list)
+  for dev_stat in run_meta.step_stats.dev_stats:
+    dev = dev_stat.device
+    for node_stat in dev_stat.node_stats:
+      if node_stat.node_name in node_names:
+        ret[dev].append(node_stat)
+  return ret
+
+
+def _run_model():
+  x = random_ops.random_normal(shape=[1, SIZE])
+  w = random_ops.random_normal(shape=[SIZE, 2 * SIZE])
+  y = math_ops.matmul(x, w)
+
+  with session.Session() as sess:
+    run_metadata = config_pb2.RunMetadata()
+    opts = builder.time_and_memory()
+    opts['min_micros'] = 0
+    opts['min_bytes'] = 0
+    _ = sess.run(y,
+                 options=config_pb2.RunOptions(
+                     trace_level=config_pb2.RunOptions.FULL_TRACE),
+                 run_metadata=run_metadata)
+    tfprof_node = model_analyzer.profile(
+        sess.graph,
+        run_meta=run_metadata,
+        options=opts)
+
+    return tfprof_node, run_metadata
+
+
+def _run_loop_model():
+  with session.Session() as sess:
+    x = lib.BuildFullModel()
+
+    sess.run(variables.global_variables_initializer())
+    run_meta = config_pb2.RunMetadata()
+    _ = sess.run(x,
+                 options=config_pb2.RunOptions(
+                     trace_level=config_pb2.RunOptions.FULL_TRACE),
+                 run_metadata=run_meta)
+
+    tfprof_node = model_analyzer.profile(
+        sess.graph, run_meta,
+        options=builder.time_and_memory())
+    return tfprof_node, run_meta
+
+
+class RunMetadataTest(test.TestCase):
+
+  def testGPU(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+
+    ops.reset_default_graph()
+    with ops.device('/gpu:0'):
+      tfprof_node, run_meta = _run_model()
+      self.assertEqual(tfprof_node.children[0].name, 'MatMul')
+      self.assertGreater(tfprof_node.children[0].exec_micros, 10)
+
+    ret = _extract_node(run_meta, ['MatMul', 'MatMul:MatMul'])
+    self.assertEqual(len(ret), 3)
+    self.assertTrue('/job:localhost/replica:0/task:0/gpu:0' in ret)
+    del ret['/job:localhost/replica:0/task:0/gpu:0']
+
+    has_all_stream = False
+    for k, _ in six.iteritems(ret):
+      self.assertTrue('gpu:0/stream' in k)
+      if 'gpu:0/stream:all' in k:
+        has_all_stream = True
+    self.assertTrue(has_all_stream)
+
+  def testCPU(self):
+    ops.reset_default_graph()
+    with ops.device('/cpu:0'):
+      tfprof_node, run_meta = _run_model()
+      self.assertEqual(tfprof_node.children[0].name, 'MatMul')
+      self.assertGreater(tfprof_node.children[0].exec_micros, 0)
+
+    ret = _extract_node(run_meta, 'MatMul')
+    self.assertEqual(len(ret), 1)
+    self.assertTrue('/job:localhost/replica:0/task:0/cpu:0' in ret)
+
+    ret = _extract_node(run_meta, 'MatMul:MatMul')
+    self.assertEqual(len(ret), 0)
+
+  def testLoopCPU(self):
+    ops.reset_default_graph()
+    with ops.device('/cpu:0'):
+      tfprof_node, run_meta = _run_loop_model()
+      # The while-loop caused a node to appear 4 times in scheduling.
+      ret = _extract_node(run_meta,
+                          'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul')
+      self.assertEqual(len(ret['/job:localhost/replica:0/task:0/cpu:0']), 4)
+
+      total_cpu_execs = 0
+      for node in ret['/job:localhost/replica:0/task:0/cpu:0']:
+        total_cpu_execs += node.op_end_rel_micros
+
+      mm_node = lib.SearchTFProfNode(
+          tfprof_node,
+          'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul')
+
+      self.assertEqual(mm_node.run_count, 4)
+      self.assertEqual(mm_node.cpu_exec_micros, total_cpu_execs)
+      self.assertEqual(mm_node.exec_micros, total_cpu_execs)
+
+  # pylint: disable=pointless-string-statement
+  """
+  TODO(xpan): This test is flaky because RunMetadata returned from TensorFlow
+  is random. Still being investigated.
+  def testLoopGPU(self):
+    if not test.is_gpu_available():
+      return
+
+    ops.reset_default_graph()
+    with ops.device('/gpu:0'):
+      tfprof_node, run_meta = _run_loop_model()
+      # The while-loop caused a node to appear 4 times in scheduling.
+      ret = _extract_node(run_meta,
+                          'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul')
+      self.assertEqual(len(ret['/job:localhost/replica:0/task:0/gpu:0']), 4)
+
+      total_cpu_execs = 0
+      for node in ret['/job:localhost/replica:0/task:0/gpu:0']:
+        total_cpu_execs += node.op_end_rel_micros
+
+      ret = _extract_node(
+          run_meta,
+          'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul:MatMul')
+      self.assertGreaterEqual(len(ret['/gpu:0/stream:all']), 4)
+
+      total_accelerator_execs = 0
+      for node in ret['/gpu:0/stream:all']:
+        total_accelerator_execs += node.op_end_rel_micros
+
+      mm_node = lib.SearchTFProfNode(
+          tfprof_node,
+          'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul')
+
+      self.assertEqual(mm_node.run_count, 4)
+      self.assertEqual(mm_node.accelerator_exec_micros, total_accelerator_execs)
+      self.assertEqual(mm_node.cpu_exec_micros, total_cpu_execs)
+      self.assertEqual(mm_node.exec_micros,
+                       total_cpu_execs + total_accelerator_execs)
+  """
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/profiler/model_analyzer.py b/tensorflow/python/profiler/model_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..2268472fdda068ceb4dfd3bd792b8b8f6fb705b6
--- /dev/null
+++ b/tensorflow/python/profiler/model_analyzer.py
@@ -0,0 +1,358 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model Analyzer.
+
+Analyze model, including shape, params, time, memory, structure, etc.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.core.profiler import tfprof_options_pb2
+from tensorflow.core.profiler import tfprof_output_pb2
+from tensorflow.python import pywrap_tensorflow as print_mdl
+from tensorflow.python.framework import errors
+from tensorflow.python.profiler import option_builder
+from tensorflow.python.profiler import tfprof_logger
+
+_DEFAULT_PROFILE_OPTIONS = 0
+_DEFAULT_ADVISE_OPTIONS = 0
+
+# The following options are for 'advise' cmd.
+# Show all advice.
+ALL_ADVICE = {
+    'ExpensiveOperationChecker': {},
+    'AcceleratorUtilizationChecker': {},
+    'JobChecker': {},  # Only available internally.
+    'OperationChecker': {},
+}
+
+
+def _build_options(options):
+  """Build tfprof.OptionsProto.
+
+  Args:
+    options: A dictionary of options.
+  Returns:
+    tfprof.OptionsProto.
+  """
+  opts = tfprof_options_pb2.OptionsProto()
+  opts.max_depth = options.get('max_depth', 10)
+  opts.min_bytes = options.get('min_bytes', 0)
+  opts.min_micros = options.get('min_micros', 0)
+  opts.min_params = options.get('min_params', 0)
+  opts.min_float_ops = options.get('min_float_ops', 0)
+  opts.min_occurrence = options.get('min_occurrence', 0)
+
+  opts.step = options.get('step', -1)
+
+  opts.order_by = options.get('order_by', 'name')
+
+  for p in options.get('account_type_regexes', []):
+    opts.account_type_regexes.append(p)
+  for p in options.get('start_name_regexes', []):
+    opts.start_name_regexes.append(p)
+  for p in options.get('trim_name_regexes', []):
+    opts.trim_name_regexes.append(p)
+  for p in options.get('show_name_regexes', []):
+    opts.show_name_regexes.append(p)
+  for p in options.get('hide_name_regexes', []):
+    opts.hide_name_regexes.append(p)
+  opts.account_displayed_op_only = options.get('account_displayed_op_only',
+                                               False)
+
+  for p in options.get('select', []):
+    opts.select.append(p)
+
+  opts.output = options.get('output', 'stdout')
+  opts.dump_to_file = options.get('dump_to_file', '')
+
+  return opts
+
+
+def _build_advisor_options(options):
+  """Build tfprof.AdvisorOptionsProto.
+
+  Args:
+    options: A dictionary of options. See ALL_ADVICE example.
+  Returns:
+    tfprof.AdvisorOptionsProto.
+  """
+  opts = tfprof_options_pb2.AdvisorOptionsProto()
+  if options is None:
+    return opts
+  for checker, checker_opts in six.iteritems(options):
+    checker_ops_pb = tfprof_options_pb2.AdvisorOptionsProto.CheckerOption()
+    for k, v in six.iteritems(checker_opts):
+      checker_ops_pb[k] = v
+    opts.checkers[checker].MergeFrom(checker_ops_pb)
+  return opts
+
+
+class Profiler(object):
+  """TensorFlow multi-step profiler.
+
+  https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/README.md
+
+  ```python
+  Typical use case:
+    # Currently we are only allowed to create 1 profiler per process.
+    profiler = Profile(sess.graph)
+
+    for i in xrange(total_steps):
+      if i % 10000 == 0:
+        run_meta = tf.RunMetadata()
+        _ = sess.run(...,
+                     options=tf.RunOptions(
+                         trace_level=tf.RunOptions.FULL_TRACE),
+                     run_metadata=run_meta)
+        profiler.add_step(i, run_meta)
+
+        # Profile the parameters of your model.
+        profiler.profile_name_scope(options=(option_builder.ProfileOptionBuilder
+            .trainable_variables_parameter()))
+
+        # Or profile the timing of your model operations.
+        opts = option_builder.ProfileOptionBuilder.time_and_memory()
+        profiler.profile_operations(options=opts)
+
+        # Or you can generate a timeline:
+        opts = (option_builder.ProfileOptionBuilder(
+                option_builder.ProfileOptionBuilder.time_and_memory())
+                .with_step(i)
+                .with_timeline_output(filename).build())
+        profiler.profile_graph(options=opts)
+      else:
+        _ = sess.run(...)
+    # Auto detect problems and generate advice.
+    profiler.advise()
+  ```
+  """
+
+  def __init__(self, graph, op_log=None):
+    """Constructor.
+
+    Args:
+      graph: tf.Graph.
+      op_log: optional. tensorflow::tfprof::OpLogProto proto. Used to define
+          extra op types.
+    """
+    self._graph = graph
+    # pylint: disable=protected-access
+    op_log = tfprof_logger._merge_default_with_oplog(
+        self._graph, op_log=op_log)
+    # pylint: enable=protected-access
+
+    print_mdl.NewProfiler(
+        self._graph.as_graph_def(add_shapes=True).SerializeToString(),
+        op_log.SerializeToString())
+
+  def __del__(self):
+    print_mdl.DeleteProfiler()
+
+  def add_step(self, step, run_meta):
+    """Add statistics of a step.
+
+    Args:
+      step: A step uint64 used to identify the RunMetadata. Must be different
+         across different AddStep() calls.
+      run_meta: RunMetadata proto that contains statistics of a session run.
+    """
+    # pylint: disable=protected-access
+    op_log = tfprof_logger._merge_default_with_oplog(
+        self._graph, run_meta=run_meta, add_trace=False,
+        add_trainable_var=False)
+    # pylint: enable=protected-access
+    print_mdl.AddStep(
+        step, run_meta.SerializeToString(), op_log.SerializeToString())
+
+  def profile_python(self, options):
+    """Profile the statistics of the Python codes.
+
+      By default, it shows the call stack from root. To avoid
+      redundant output, you may use options to filter as below
+        options['show_name_regexes'] = ['.*my_code.py.*']
+
+    Args:
+      options: A dict of options. See core/profiler/g3doc/options.md.
+    Returns:
+      a MultiGraphNodeProto that records the results.
+    """
+    opts = _build_options(options)
+    tfprof_node = tfprof_output_pb2.MultiGraphNodeProto()
+    tfprof_node.ParseFromString(
+        print_mdl.Profile('code'.encode('utf-8'), opts.SerializeToString()))
+    return tfprof_node
+
+  def profile_operations(self, options):
+    """Profile the statistics of the Operation types (e.g. MatMul, Conv2D).
+
+    Args:
+      options: A dict of options. See core/profiler/g3doc/options.md.
+    Returns:
+      a MultiGraphNodeProto that records the results.
+    """
+    opts = _build_options(options)
+    tfprof_node = tfprof_output_pb2.MultiGraphNodeProto()
+    tfprof_node.ParseFromString(
+        print_mdl.Profile('op'.encode('utf-8'), opts.SerializeToString()))
+    return tfprof_node
+
+  def profile_name_scope(self, options):
+    """Profile the statistics of graph nodes, organized by name scope.
+
+    Args:
+      options: A dict of options. See core/profiler/g3doc/options.md.
+    Returns:
+      a GraphNodeProto that records the results.
+    """
+    opts = _build_options(options)
+    tfprof_node = tfprof_output_pb2.GraphNodeProto()
+    tfprof_node.ParseFromString(
+        print_mdl.Profile('scope'.encode('utf-8'), opts.SerializeToString()))
+    return tfprof_node
+
+  def profile_graph(self, options):
+    """Profile the statistics of graph nodes, organized by dataflow graph.
+
+    Args:
+      options: A dict of options. See core/profiler/g3doc/options.md.
+    Returns:
+      a GraphNodeProto that records the results.
+    """
+    opts = _build_options(options)
+    tfprof_node = tfprof_output_pb2.GraphNodeProto()
+    tfprof_node.ParseFromString(
+        print_mdl.Profile('graph'.encode('utf-8'), opts.SerializeToString()))
+    return tfprof_node
+
+  def advise(self, options):
+    """Automatically detect problems and generate reports.
+
+    Args:
+      options: A dict of options. See ALL_ADVICE example above.
+    Returns:
+      A Advise proto that conains the reports from all checkers.
+    """
+    advise_pb = tfprof_output_pb2.AdviceProto()
+    opts = _build_advisor_options(options)
+    advise_pb.ParseFromString(
+        print_mdl.Profile('advise'.encode('utf-8'), opts.SerializeToString()))
+    return advise_pb
+
+
+def profile(graph,
+            run_meta=None,
+            op_log=None,
+            cmd='scope',
+            options=_DEFAULT_PROFILE_OPTIONS):
+  """Profile model.
+
+    Tutorials and examples can be found in:
+    https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/README.md
+
+  Args:
+    graph: required tf.Graph.
+    run_meta: optional tensorflow.RunMetadata proto. It is necessary to
+        to support run time information profiling, such as time and memory.
+    op_log: tensorflow.tfprof.OpLogProto proto. User can assign "types" to
+        graph nodes with op_log. "types" allow user to flexibly group and
+        account profiles using options['accounted_type_regexes'].
+    cmd: string. Either 'op', 'scope', 'graph' or 'code'.
+        'op' view organizes profile using operation type. (e.g. MatMul)
+        'scope' view organizes profile using graph node name scope.
+        'graph' view organizes profile using graph node inputs/outputs.
+        'code' view organizes profile using Python call stack.
+    options: A dict of options. See core/profiler/g3doc/options.md.
+  Returns:
+    If cmd is 'scope' or 'graph', returns GraphNodeProto proto.
+    If cmd is 'op' or 'code', returns MultiGraphNodeProto proto.
+    Side effect: stdout/file/timeline.json depending on options['output']
+  """
+  if options == _DEFAULT_PROFILE_OPTIONS:
+    options = (option_builder.ProfileOptionBuilder
+               .trainable_variables_parameter())
+
+  # pylint: disable=protected-access
+  op_log = tfprof_logger._merge_default_with_oplog(
+      graph, op_log, run_meta, add_trace=cmd == 'code')
+  # pylint: enable=protected-access
+
+  opts = _build_options(options)
+
+  run_meta_str = run_meta.SerializeToString() if run_meta else b''
+
+  if cmd == 'code' or cmd == 'op':
+    tfprof_node = tfprof_output_pb2.MultiGraphNodeProto()
+    tfprof_node.ParseFromString(
+        print_mdl.PrintModelAnalysis(
+            graph.as_graph_def(add_shapes=True).SerializeToString(),
+            run_meta_str,
+            op_log.SerializeToString(),
+            cmd.encode('utf-8'),
+            opts.SerializeToString()))
+  elif cmd == 'graph' or cmd == 'scope':
+    tfprof_node = tfprof_output_pb2.GraphNodeProto()
+    tfprof_node.ParseFromString(
+        print_mdl.PrintModelAnalysis(
+            graph.as_graph_def(add_shapes=True).SerializeToString(),
+            run_meta_str,
+            op_log.SerializeToString(),
+            cmd.encode('utf-8'),
+            opts.SerializeToString()))
+  else:
+    raise errors.InvalidArgumentError(
+        None, None, 'unknown cmd: %s\n' % cmd)
+
+  return tfprof_node
+
+
+def advise(graph, run_meta=None, options=_DEFAULT_ADVISE_OPTIONS):
+  """Auto profile and advise.
+
+    Builds profiles and automatically check anomalies of various
+    aspects. For more details:
+    https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/README.md
+
+  Args:
+    graph: required tf.Graph.
+    run_meta: optional tensorflow.RunMetadata proto. It is necessary to
+        to support run time information profiling, such as time and memory.
+    options: see ALL_ADVICE example above. Default checks everything.
+  Returns:
+    Returns AdviceProto proto
+  """
+  if options == _DEFAULT_ADVISE_OPTIONS:
+    options = ALL_ADVICE.copy()
+
+  # pylint: disable=protected-access
+  op_log = tfprof_logger._merge_default_with_oplog(
+      graph, None, run_meta, add_trace=True)
+  # pylint: enable=protected-access
+
+  run_meta_str = run_meta.SerializeToString() if run_meta else b''
+
+  opts = _build_advisor_options(options)
+  ret = tfprof_output_pb2.AdviceProto()
+  ret.ParseFromString(
+      print_mdl.PrintModelAnalysis(
+          graph.as_graph_def(add_shapes=True).SerializeToString(),
+          run_meta_str,
+          op_log.SerializeToString(),
+          'advise'.encode('utf-8'),
+          opts.SerializeToString()))
+  return ret
diff --git a/tensorflow/python/profiler/model_analyzer_test.py b/tensorflow/python/profiler/model_analyzer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..43da587b2c7cc3f2d824727c9607bf0f3364754a
--- /dev/null
+++ b/tensorflow/python/profiler/model_analyzer_test.py
@@ -0,0 +1,317 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.profiler import model_analyzer
+from tensorflow.python.profiler import option_builder
+from tensorflow.python.profiler.internal import model_analyzer_testlib as lib
+
+builder = option_builder.ProfileOptionBuilder
+
+
+class PrintModelAnalysisTest(test.TestCase):
+
+  def testDumpToFile(self):
+    ops.reset_default_graph()
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+    opts = builder(builder.trainable_variables_parameter()
+                  ).with_file_output(outfile).build()
+
+    with session.Session() as sess:
+      _ = lib.BuildSmallModel()
+      model_analyzer.profile(sess.graph, options=opts)
+
+      with gfile.Open(outfile, 'r') as f:
+        self.assertEqual(u'node name | # parameters\n'
+                         '_TFProfRoot (--/451 params)\n'
+                         '  DW (3x3x3x6, 162/162 params)\n'
+                         '  DW2 (2x2x6x12, 288/288 params)\n'
+                         '  ScalarW (1, 1/1 params)\n',
+                         f.read())
+
+  def testSelectEverything(self):
+    ops.reset_default_graph()
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+    opts = (builder(builder.trainable_variables_parameter())
+            .with_file_output(outfile)
+            .with_accounted_types(['.*'])
+            .select(['params', 'float_ops', 'occurrence', 'device', 'op_types',
+                     'input_shapes']).build())
+
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        disable_model_pruning=True)
+    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+    config = config_pb2.ConfigProto(graph_options=graph_options)
+    with session.Session(config=config) as sess, ops.device('/cpu:0'):
+      x = lib.BuildSmallModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(x,
+                   options=config_pb2.RunOptions(
+                       trace_level=config_pb2.RunOptions.FULL_TRACE),
+                   run_metadata=run_meta)
+
+      model_analyzer.profile(
+          sess.graph, run_meta, options=opts)
+
+      with gfile.Open(outfile, 'r') as f:
+        # pylint: disable=line-too-long
+        self.assertEqual(
+            'node name | # parameters | # float_ops | assigned devices | op types | op count (run|defined) | input shapes\n_TFProfRoot (--/451 params, --/10.44k flops, _kTFScopeParent, --/7|--/35, )\n  Conv2D (0/0 params, 5.83k/5.83k flops, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D, 1/1|1/1, 0:2x6x6x3|1:3x3x3x6)\n  Conv2D_1 (0/0 params, 4.61k/4.61k flops, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D, 1/1|1/1, 0:2x3x3x6|1:2x2x6x12)\n  DW (3x3x3x6, 162/162 params, 0/0 flops, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|VariableV2|_trainable_variables, 1/2|1/10, )\n    DW/Assign (0/0 params, 0/0 flops, Assign, 0/0|1/1, 0:3x3x3x6|1:3x3x3x6)\n    DW/Initializer (0/0 params, 0/0 flops, _kTFScopeParent, 0/0|1/7, )\n      DW/Initializer/random_normal (0/0 params, 0/0 flops, Add, 0/0|1/6, 0:3x3x3x6|1:1)\n        DW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, RandomStandardNormal, 0/0|1/1, 0:4)\n        DW/Initializer/random_normal/mean (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n        DW/Initializer/random_normal/mul (0/0 params, 0/0 flops, Mul, 0/0|1/1, 0:3x3x3x6|1:1)\n        DW/Initializer/random_normal/shape (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n        DW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n    DW/read (0/0 params, 0/0 flops, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity, 1/1|1/1, 0:3x3x3x6)\n  DW2 (2x2x6x12, 288/288 params, 0/0 flops, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|VariableV2|_trainable_variables, 1/2|1/10, )\n    DW2/Assign (0/0 params, 0/0 flops, Assign, 0/0|1/1, 0:2x2x6x12|1:2x2x6x12)\n    DW2/Initializer (0/0 params, 0/0 flops, _kTFScopeParent, 0/0|1/7, )\n      DW2/Initializer/random_normal (0/0 params, 0/0 flops, Add, 0/0|1/6, 0:2x2x6x12|1:1)\n        DW2/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, RandomStandardNormal, 0/0|1/1, 0:4)\n        DW2/Initializer/random_normal/mean (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n        DW2/Initializer/random_normal/mul (0/0 params, 0/0 flops, Mul, 0/0|1/1, 0:2x2x6x12|1:1)\n        DW2/Initializer/random_normal/shape (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n        DW2/Initializer/random_normal/stddev (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n    DW2/read (0/0 params, 0/0 flops, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity, 1/1|1/1, 0:2x2x6x12)\n  ScalarW (1, 1/1 params, 0/0 flops, VariableV2|_trainable_variables, 0/0|1/10, )\n    ScalarW/Assign (0/0 params, 0/0 flops, Assign, 0/0|1/1, 0:1|1:1)\n    ScalarW/Initializer (0/0 params, 0/0 flops, _kTFScopeParent, 0/0|1/7, )\n      ScalarW/Initializer/random_normal (0/0 params, 0/0 flops, Add, 0/0|1/6, 0:1|1:1)\n        ScalarW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, RandomStandardNormal, 0/0|1/1, 0:0)\n        ScalarW/Initializer/random_normal/mean (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n        ScalarW/Initializer/random_normal/mul (0/0 params, 0/0 flops, Mul, 0/0|1/1, 0:1|1:1)\n        ScalarW/Initializer/random_normal/shape (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n        ScalarW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n    ScalarW/read (0/0 params, 0/0 flops, Identity, 0/0|1/1, 0:1)\n  init (0/0 params, 0/0 flops, NoOp, 0/0|1/1, 0:1|1:3x3x3x6|2:2x2x6x12)\n  zeros (0/0 params, 0/0 flops, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Const, 1/1|1/1, )\n',
+            f.read())
+        # pylint: enable=line-too-long
+
+  def testSimpleCodeView(self):
+    ops.reset_default_graph()
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+    # TODO(xpan): Test 'micros'. Since the execution time changes each run,
+    # it's a bit difficult to test it now.
+    opts = (builder(builder.trainable_variables_parameter())
+            .with_file_output(outfile)
+            .with_accounted_types(['.*'])
+            .with_node_names(show_name_regexes=['.*model_analyzer_testlib.*'])
+            .account_displayed_op_only(False)
+            .select(['bytes', 'params', 'float_ops', 'num_hidden_ops', 'device',
+                     'input_shapes']).build())
+
+    with session.Session() as sess:
+      x = lib.BuildSmallModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(x,
+                   options=config_pb2.RunOptions(
+                       trace_level=config_pb2.RunOptions.FULL_TRACE),
+                   run_metadata=run_meta)
+
+      model_analyzer.profile(
+          sess.graph, run_meta, cmd='code', options=opts)
+
+      with gfile.Open(outfile, 'r') as f:
+        # pylint: disable=line-too-long
+        self.assertEqual(
+            'node name | output bytes | # parameters | # float_ops | assigned devices | input',
+            f.read()[0:80])
+        # pylint: enable=line-too-long
+
+  def testComplexCodeView(self):
+    ops.reset_default_graph()
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+    opts = (builder(builder.trainable_variables_parameter())
+            .with_file_output(outfile)
+            .with_accounted_types(['.*'])
+            .with_node_names(show_name_regexes=
+                             ['.*model_analyzer_testlib.py.*'])
+            .account_displayed_op_only(False)
+            .select(['params', 'float_ops']).build())
+
+    with session.Session() as sess:
+      x = lib.BuildFullModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(x,
+                   options=config_pb2.RunOptions(
+                       trace_level=config_pb2.RunOptions.FULL_TRACE),
+                   run_metadata=run_meta)
+
+      tfprof_node = model_analyzer.profile(
+          sess.graph, run_meta, cmd='code', options=opts)
+
+      # pylint: disable=line-too-long
+      with gfile.Open(outfile, 'r') as f:
+        lines = f.read().split('\n')
+        result = '\n'.join([l[:min(len(l), 80)] for l in lines])
+        self.assertEqual('node name | # parameters | # float_ops\n_TFProfRoot (--/2.84k params, --/91.04k flops)\n  model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_... (0/1.80k para\n    model_analyzer_testlib.py:35:BuildSmallModel:image = array_ops... (0/0 param\n    model_analyzer_testlib.py:39:BuildSmallModel:initializer=init_... (0/4 param\n    model_analyzer_testlib.py:43:BuildSmallModel:initializer=init_... (0/648 par\n    model_analyzer_testlib.py:44:BuildSmallModel:x = nn_ops.conv2d... (0/0 param\n    model_analyzer_testlib.py:48:BuildSmallModel:initializer=init_... (0/1.15k p\n    model_analyzer_testlib.py:49:BuildSmallModel:x = nn_ops.conv2d... (0/0 param\n  model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c... (0/1.04k para\n  model_analyzer_testlib.py:64:BuildFullModel:target = array_op... (0/0 params, \n  model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_... (0/0 params, \n  model_analyzer_testlib.py:67:BuildFullModel:return sgd_op.min... (0/0 params, \n',
+                         result)
+
+      self.assertLess(0, tfprof_node.total_exec_micros)
+      self.assertEqual(2844, tfprof_node.total_parameters)
+      self.assertEqual(91040, tfprof_node.total_float_ops)
+      self.assertEqual(5, len(tfprof_node.children))
+      self.assertEqual('_TFProfRoot', tfprof_node.name)
+      self.assertEqual(
+          'model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_...',
+          tfprof_node.children[0].name)
+      self.assertEqual(
+          'model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c...',
+          tfprof_node.children[1].name)
+      self.assertEqual(
+          'model_analyzer_testlib.py:64:BuildFullModel:target = array_op...',
+          tfprof_node.children[2].name)
+      self.assertEqual(
+          'model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_...',
+          tfprof_node.children[3].name)
+      self.assertEqual(
+          'model_analyzer_testlib.py:67:BuildFullModel:return sgd_op.min...',
+          tfprof_node.children[4].name)
+      # pylint: enable=line-too-long
+
+  def testCodeViewLeafGraphNode(self):
+    ops.reset_default_graph()
+    opts = (builder(builder.trainable_variables_parameter())
+            .with_empty_output()
+            .with_accounted_types(['.*'])
+            .account_displayed_op_only(False)
+            .select(['bytes', 'params', 'float_ops', 'device']).build())
+
+    with session.Session() as sess:
+      x = lib.BuildSmallModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(x,
+                   options=config_pb2.RunOptions(
+                       trace_level=config_pb2.RunOptions.FULL_TRACE),
+                   run_metadata=run_meta)
+
+      tfprof_node = model_analyzer.profile(
+          sess.graph, run_meta, cmd='code', options=opts)
+
+      leaf = tfprof_node
+      while leaf.children:
+        self.assertEqual(0, len(leaf.graph_nodes))
+        leaf = leaf.children[0]
+      self.assertEqual(1, len(leaf.graph_nodes))
+
+  def testTimeline(self):
+    ops.reset_default_graph()
+    opts = builder.trainable_variables_parameter()
+    outfile = os.path.join(test.get_temp_dir(), 'timeline')
+    opts = (builder(builder.trainable_variables_parameter())
+            .with_max_depth(100000)
+            .with_step(0)
+            .with_timeline_output(outfile)
+            .with_accounted_types(['.*']).build())
+
+    with session.Session() as sess:
+      x = lib.BuildFullModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(
+          x,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE),
+          run_metadata=run_meta)
+
+      _ = model_analyzer.profile(
+          sess.graph, run_meta, cmd='graph', options=opts)
+
+      with gfile.Open(outfile, 'r') as f:
+        # Test that a json file is created.
+        # TODO(xpan): tfprof Timeline isn't quite correct on Windows.
+        # Investigate why.
+        if os.name != 'nt':
+          self.assertLess(1000, len(f.read()))
+        else:
+          self.assertLess(1, len(f.read()))
+
+  def testOpView(self):
+    ops.reset_default_graph()
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+
+    opts = (builder(builder.trainable_variables_parameter())
+            .with_file_output(outfile)
+            .with_accounted_types(['.*'])
+            .with_min_occurrence(10)
+            .order_by('occurrence')
+            .select(['params', 'micros', 'occurrence', 'input_shapes']).build())
+
+    with session.Session() as sess:
+      x = lib.BuildFullModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(x,
+                   options=config_pb2.RunOptions(
+                       trace_level=config_pb2.RunOptions.FULL_TRACE),
+                   run_metadata=run_meta)
+
+      tfprof_node = model_analyzer.profile(
+          sess.graph, run_meta, cmd='op', options=opts)
+
+      with gfile.Open(outfile, 'r') as f:
+        # pylint: disable=line-too-long
+        self.assertEqual(
+            'nodename|totalexecutiontime|acceleratorexecutiontime|cpuexecutiontime|#parameters|opoccurrence(run|defined)|inputshapes\n',
+            f.read().replace('\t', '').replace(' ', '')[0:120])
+        # pylint: enable=line-too-long
+
+      total_children = 0
+      last_occurrence = 1e32
+      input_shapes = 0
+      last_total_micros = tfprof_node.total_exec_micros
+      last_micros = tfprof_node.exec_micros
+      while tfprof_node.children:
+        for gnode in tfprof_node.graph_nodes:
+          input_shapes += len(gnode.input_shapes)
+        self.assertEqual(len(tfprof_node.children), 1)
+        tfprof_node = tfprof_node.children[0]
+
+        self.assertEqual(
+            last_total_micros, tfprof_node.total_exec_micros + last_micros)
+        last_total_micros = tfprof_node.total_exec_micros
+        last_micros = tfprof_node.exec_micros
+
+        total_children += 1
+        self.assertLessEqual(len(tfprof_node.graph_nodes), last_occurrence)
+        last_occurrence = len(tfprof_node.graph_nodes)
+
+      self.assertEqual(total_children, 15)
+      self.assertGreater(input_shapes, 0)
+
+  def testAdvisor(self):
+    ops.reset_default_graph()
+
+    with session.Session() as sess:
+      x = lib.BuildFullModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(
+          x,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE),
+          run_metadata=run_meta)
+
+      advice_pb = model_analyzer.advise(sess.graph, run_meta)
+      self.assertTrue('AcceleratorUtilizationChecker' in advice_pb.checkers)
+      self.assertTrue('ExpensiveOperationChecker' in advice_pb.checkers)
+      self.assertTrue('OperationChecker' in advice_pb.checkers)
+
+      checker = advice_pb.checkers['AcceleratorUtilizationChecker']
+      if test.is_gpu_available():
+        self.assertGreater(len(checker.reports), 0)
+      else:
+        self.assertEqual(len(checker.reports), 0)
+      checker = advice_pb.checkers['ExpensiveOperationChecker']
+      self.assertGreater(len(checker.reports), 0)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/profiler/option_builder.py b/tensorflow/python/profiler/option_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..87f82709bedb47928546a8af568a0df22bda1052
--- /dev/null
+++ b/tensorflow/python/profiler/option_builder.py
@@ -0,0 +1,400 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for building profiler options."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+from tensorflow.python.profiler import tfprof_logger
+
+
+class ProfileOptionBuilder(object):
+  # pylint: disable=line-too-long
+  """Option Builder for Profiling API.
+
+  For tutorial on the options, see
+  https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/g3doc/options.md
+
+  ```python
+  # Users can use pre-built options:
+  opts = (
+      tf.profiler.ProfileOptionBuilder.trainable_variables_parameter())
+
+  # Or, build your own options:
+  opts = (tf.profiler.ProfileOptionBuilder()
+      .with_max_depth(10)
+      .with_min_micros(1000)
+      .select(['accelerator_micros'])
+      .with_stdout_output()
+      .build()
+
+  # Or customize the pre-built options:
+  opts = (tf.profiler.ProfileOptionBuilder(
+      tf.profiler.ProfileOptionBuilder.time_and_memory())
+      .with_displaying_options(show_name_regexes=['.*rnn.*'])
+      .build())
+
+  # Finally, profiling with the options:
+  _ = tf.profiler.profile(tf.get_default_graph(),
+                          run_meta=run_meta,
+                          cmd='scope',
+                          options=opts)
+  ```
+  """
+  # pylint: enable=line-too-long
+
+  def __init__(self, options=None):
+    """Constructor.
+
+    Args:
+      options: Optional initial option dict to start with.
+    """
+    if options is not None:
+      self._options = copy.deepcopy(options)
+    else:
+      self._options = {'max_depth': 100,
+                       'min_bytes': 0,
+                       'min_micros': 0,
+                       'min_params': 0,
+                       'min_float_ops': 0,
+                       'min_occurrence': 0,
+                       'order_by': 'name',
+                       'account_type_regexes': ['.*'],
+                       'start_name_regexes': ['.*'],
+                       'trim_name_regexes': [],
+                       'show_name_regexes': ['.*'],
+                       'hide_name_regexes': [],
+                       'account_displayed_op_only': False,
+                       'select': ['micros'],
+                       'step': -1,
+                       'output': 'stdout'}
+
+  @staticmethod
+  def trainable_variables_parameter():
+    """Options used to profile trainable variable parameters.
+
+    Normally used together with 'scope' view.
+
+    Returns:
+      A dict of profiling options.
+    """
+    return {'max_depth': 10000,
+            'min_bytes': 0,
+            'min_micros': 0,
+            'min_params': 0,
+            'min_float_ops': 0,
+            'min_occurrence': 0,
+            'order_by': 'name',
+            'account_type_regexes': [tfprof_logger.TRAINABLE_VARIABLES],
+            'start_name_regexes': ['.*'],
+            'trim_name_regexes': [],
+            'show_name_regexes': ['.*'],
+            'hide_name_regexes': [],
+            'account_displayed_op_only': True,
+            'select': ['params'],
+            'step': -1,
+            'output': 'stdout'}
+
+  @staticmethod
+  def float_operation():
+    # pylint: disable=line-too-long
+    """Options used to profile float operations.
+
+    Please see https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/g3doc/profile_model_architecture.md
+    on the caveats of calculating float operations.
+
+    Returns:
+      A dict of profiling options.
+    """
+    # pylint: enable=line-too-long
+    return {'max_depth': 10000,
+            'min_bytes': 0,
+            'min_micros': 0,
+            'min_params': 0,
+            'min_float_ops': 1,
+            'min_occurrence': 0,
+            'order_by': 'float_ops',
+            'account_type_regexes': ['.*'],
+            'start_name_regexes': ['.*'],
+            'trim_name_regexes': [],
+            'show_name_regexes': ['.*'],
+            'hide_name_regexes': [],
+            'account_displayed_op_only': True,
+            'select': ['float_ops'],
+            'step': -1,
+            'output': 'stdout'}
+
+  @staticmethod
+  def time_and_memory(min_micros=1, min_bytes=1):
+    """Show operation time and memory consumptions.
+
+    Args:
+      min_micros: Only show profiler nodes with more execution time than this.
+      min_bytes: Only show profiler nodes consuming more memory than this.
+    Returns:
+      A dict of profiling options.
+    """
+    return {'max_depth': 10000,
+            'min_bytes': min_bytes,
+            'min_micros': min_micros,
+            'min_params': 0,
+            'min_float_ops': 0,
+            'min_occurrence': 0,
+            'order_by': 'name',
+            'account_type_regexes': ['.*'],
+            'start_name_regexes': ['.*'],
+            'trim_name_regexes': [],
+            'show_name_regexes': ['.*'],
+            'hide_name_regexes': [],
+            'account_displayed_op_only': True,
+            'select': ['micros', 'bytes'],
+            'step': -1,
+            'output': 'stdout'}
+
+  def build(self):
+    """Build a profiling option.
+
+    Returns:
+      A dict of profiling options.
+    """
+    return copy.deepcopy(self._options)
+
+  def with_max_depth(self, max_depth):
+    """Set the maximum depth of display.
+
+    The depth depends on profiling view. For 'scope' view, it's the
+    depth of name scope hierarchy (tree), for 'op' view, it's the number
+    of operation types (list), etc.
+
+    Args:
+      max_depth: Maximum depth of the data structure to display.
+    Returns:
+      self
+    """
+    self._options['max_depth'] = max_depth
+    return self
+
+  def with_min_memory(self, min_bytes):
+    """Only show profiler nodes consuming no less than 'min_bytes'.
+
+    Args:
+      min_bytes: Only show profiler nodes with memory consumption
+          no less than this.
+    Returns:
+      self
+    """
+    self._options['min_bytes'] = min_bytes
+    return self
+
+  def with_min_execution_time(self, min_micros):
+    """Only show profiler nodes consuming no less than 'min_micros'.
+
+    Args:
+      min_micros: Only show profiler nodes with execution time
+          no less than this.
+    Returns:
+      self
+    """
+    self._options['min_micros'] = min_micros
+    return self
+
+  def with_min_parameters(self, min_params):
+    """Only show profiler nodes holding no less than 'min_params' parameters.
+
+    'Parameters' normally refers the weights of in TensorFlow variables.
+    It reflects the 'capacity' of models.
+
+    Args:
+      min_params: Only show profiler nodes holding number parameters
+          no less than this.
+    Returns:
+      self
+    """
+    self._options['min_params'] = min_params
+    return self
+
+  def with_min_occurrence(self, min_occurrence):
+    # pylint: disable=line-too-long
+    """Only show profiler nodes including no less than 'min_occurrence' graph nodes.
+
+    A "node" means a profiler output node, which can be a python line
+    (code view), an operation type (op view), or a graph node
+    (graph/scope view). A python line includes all graph nodes created by that
+    line, while an operation type includes all graph nodes of that type.
+
+    Args:
+      min_occurrence: Only show nodes including no less than this.
+    Returns:
+      self
+    """
+    # pylint: enable=line-too-long
+    self._options['min_occurrence'] = min_occurrence
+    return self
+
+  def with_min_float_operations(self, min_float_ops):
+    # pylint: disable=line-too-long
+    """Only show profiler nodes consuming no less than 'min_float_ops'.
+
+    Please see https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profilerg3doc/profile_model_architecture.md
+    on the caveats of calculating float operations.
+
+    Args:
+      min_float_ops: Only show profiler nodes with float operations
+          no less than this.
+    Returns:
+      self
+    """
+    # pylint: enable=line-too-long
+    self._options['min_float_ops'] = min_float_ops
+    return self
+
+  def with_accounted_types(self, account_type_regexes):
+    """Selectively counting statistics based on node types.
+
+    Here, 'types' means the profiler nodes' properties. Profiler by default
+    consider device name (e.g. /job:xx/.../gpu:0) and operation type
+    (e.g. MatMul) as profiler nodes' properties. User can also associate
+    customized 'types' to profiler nodes through OpLogProto proto.
+
+    For example, user can select profiler nodes placed on gpu:0 with:
+    `account_type_regexes=['.*gpu:0.*']`
+
+    If none of a node's properties match the specified regexes, the node is
+    not displayed nor accounted.
+
+    Args:
+      account_type_regexes: A list of regexes specifying the types.
+    Returns:
+      self.
+    """
+    self._options['account_type_regexes'] = copy.copy(account_type_regexes)
+    return self
+
+  def with_node_names(self,
+                      start_name_regexes=None,
+                      show_name_regexes=None,
+                      hide_name_regexes=None,
+                      trim_name_regexes=None):
+    """Regular expressions used to select profiler nodes to display.
+
+    After 'with_accounted_types' is evaluated, 'with_node_names' are
+    evaluated as follows:
+
+      For a profile data structure, profiler first finds the profiler
+      nodes matching 'start_name_regexes', and starts displaying profiler
+      nodes from there. Then, if a node matches 'show_name_regexes' and
+      doesn't match 'hide_name_regexes', it's displayed. If a node matches
+      'trim_name_regexes', profiler stops further searching that branch.
+
+    Args:
+      start_name_regexes: list of node name regexes to start displaying.
+      show_name_regexes: list of node names regexes to display.
+      hide_name_regexes: list of node_names regexes that should be hidden.
+      trim_name_regexes: list of node name regexes from where to stop.
+    Returns:
+      self
+    """
+    if start_name_regexes is not None:
+      self._options['start_name_regexes'] = copy.copy(start_name_regexes)
+    if show_name_regexes is not None:
+      self._options['show_name_regexes'] = copy.copy(show_name_regexes)
+    if hide_name_regexes is not None:
+      self._options['hide_name_regexes'] = copy.copy(hide_name_regexes)
+    if trim_name_regexes is not None:
+      self._options['trim_name_regexes'] = copy.copy(trim_name_regexes)
+    return self
+
+  def account_displayed_op_only(self, is_true):
+    """Whether only account the statistics of displayed profiler nodes.
+
+    Args:
+      is_true: If true, only account statistics of nodes eventually
+          displayed by the outputs.
+          Otherwise, a node's statistics are accounted by its parents
+          as long as it's types match 'account_type_regexes', even if
+          it is hidden from the output, say, by hide_name_regexes.
+    Returns:
+      self
+    """
+    self._options['account_displayed_op_only'] = is_true
+    return self
+
+  def with_empty_output(self):
+    """Do not generate side-effect outputs."""
+    self._options['output'] = 'none'
+    return self
+
+  def with_stdout_output(self):
+    """Print the result to stdout."""
+    self._options['output'] = 'stdout'
+    return self
+
+  def with_file_output(self, outfile):
+    """Print the result to a file."""
+    self._options['output'] = 'file:outfile=%s' % outfile
+    return self
+
+  def with_timeline_output(self, timeline_file):
+    """Generate a timeline json file."""
+    self._options['output'] = 'timeline:outfile=%s' % timeline_file
+    return self
+
+  def order_by(self, attribute):
+    # pylint: disable=line-too-long
+    """Order the displayed profiler nodes based on a attribute.
+
+    Supported attribute includes micros, bytes, occurrence, params, etc.
+    https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/g3doc/options.md
+
+    Args:
+      attribute: An attribute the profiler node has.
+    Returns:
+      self
+    """
+    # pylint: enable=line-too-long
+    self._options['order_by'] = attribute
+    return self
+
+  def select(self, attributes):
+    # pylint: disable=line-too-long
+    """Select the attributes to display.
+
+    See https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/g3doc/options.md
+    for supported attributes.
+
+    Args:
+      attributes: A list of attribute the profiler node has.
+    Returns:
+      self
+    """
+    # pylint: enable=line-too-long
+    self._options['select'] = copy.copy(attributes)
+    return self
+
+  def with_step(self, step):
+    """Which profile step to use for profiling.
+
+    The 'step' here refers to the step defined by `Profiler.add_step()` API.
+
+    Args:
+      step: When multiple steps of profiles are available, select which step's
+         profile to use. If -1, use average of all available steps.
+    Returns:
+      self
+    """
+    self._options['step'] = step
+    return self
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler.py b/tensorflow/python/profiler/pprof_profiler.py
similarity index 100%
rename from tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler.py
rename to tensorflow/python/profiler/pprof_profiler.py
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler_test.py b/tensorflow/python/profiler/pprof_profiler_test.py
similarity index 98%
rename from tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler_test.py
rename to tensorflow/python/profiler/pprof_profiler_test.py
index 6487adf99204d7d2f22f47e937a6921c2a54e220..c2469f012d10cde582e7c2616e96134774616e46 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler_test.py
+++ b/tensorflow/python/profiler/pprof_profiler_test.py
@@ -21,13 +21,13 @@ from __future__ import print_function
 import gzip
 
 from proto import profile_pb2
-from tensorflow.contrib.tfprof.python.tools.tfprof import pprof_profiler
 from tensorflow.core.framework import step_stats_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
+from tensorflow.python.profiler import pprof_profiler
 
 
 class PprofProfilerTest(test.TestCase):
diff --git a/tensorflow/python/profiler/profiler.py b/tensorflow/python/profiler/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..130dcb5134d6f7e6eb43aebea803b366a5ce27d8
--- /dev/null
+++ b/tensorflow/python/profiler/profiler.py
@@ -0,0 +1,57 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""profiler python module provides APIs to profile TensorFlow models.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.core.profiler.tfprof_log_pb2 import OpLogProto
+from tensorflow.core.profiler.tfprof_output_pb2 import AdviceProto
+from tensorflow.core.profiler.tfprof_output_pb2 import GraphNodeProto
+from tensorflow.core.profiler.tfprof_output_pb2 import MultiGraphNodeProto
+
+from tensorflow.python.profiler.model_analyzer import advise
+from tensorflow.python.profiler.model_analyzer import profile
+from tensorflow.python.profiler.model_analyzer import Profiler
+from tensorflow.python.profiler.option_builder import ProfileOptionBuilder
+from tensorflow.python.profiler.tfprof_logger import write_op_log
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+
+_allowed_symbols = [
+    'Profiler',
+    'profile',
+    'ProfileOptionBuilder',
+    'advise',
+    'write_op_log',
+]
+
+_allowed_symbols.extend([
+    'GraphNodeProto',
+    'MultiGraphNodeProto',
+    'AdviceProto',
+    'OpLogProto',
+])
+
+remove_undocumented(__name__, _allowed_symbols, [
+    Profiler,
+    profile,
+    ProfileOptionBuilder,
+    advise,
+    write_op_log,
+])
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/profiler_test.py b/tensorflow/python/profiler/profiler_test.py
similarity index 74%
rename from tensorflow/contrib/tfprof/python/tools/tfprof/profiler_test.py
rename to tensorflow/python/profiler/profiler_test.py
index 5daaafd7c8a36db1f3b94f59e45f73de07455637..7d30c29264f071b29d8317e47e85635f13732990 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/profiler_test.py
+++ b/tensorflow/python/profiler/profiler_test.py
@@ -24,22 +24,25 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
+from tensorflow.python.profiler import option_builder
 
 # pylint: disable=g-bad-import-order
-from tensorflow.contrib.tfprof.python.tools.tfprof import model_analyzer
-from tensorflow.contrib.tfprof.python.tools.tfprof.internal import model_analyzer_testlib as lib
+from tensorflow.python.profiler import model_analyzer
+from tensorflow.python.profiler.internal import model_analyzer_testlib as lib
+
+builder = option_builder.ProfileOptionBuilder
 
 
 class ProfilerTest(test.TestCase):
 
   def testProfileBasic(self):
     ops.reset_default_graph()
-    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
-    opts['account_type_regexes'] = ['.*']
-    opts['select'] = ['params', 'float_ops', 'micros', 'bytes',
-                      'device', 'op_types', 'occurrence']
     outfile = os.path.join(test.get_temp_dir(), 'dump')
-    opts['output'] = 'file:outfile=' + outfile
+    opts = (builder(builder.trainable_variables_parameter())
+            .with_file_output(outfile)
+            .with_accounted_types(['.*'])
+            .select(['params', 'float_ops', 'micros', 'bytes',
+                     'device', 'op_types', 'occurrence']).build())
 
     # Test the output without run_meta.
     sess = session.Session()
@@ -51,8 +54,8 @@ class ProfilerTest(test.TestCase):
     with gfile.Open(outfile, 'r') as f:
       profiler_str = f.read()
 
-    model_analyzer.print_model_analysis(
-        sess.graph, tfprof_cmd='scope', tfprof_options=opts)
+    model_analyzer.profile(
+        sess.graph, cmd='scope', options=opts)
     with gfile.Open(outfile, 'r') as f:
       pma_str = f.read()
     self.assertEqual(pma_str, profiler_str)
@@ -69,18 +72,18 @@ class ProfilerTest(test.TestCase):
     with gfile.Open(outfile, 'r') as f:
       profiler_str = f.read()
 
-    model_analyzer.print_model_analysis(
-        sess.graph, tfprof_cmd='graph', run_meta=run_meta, tfprof_options=opts)
+    model_analyzer.profile(
+        sess.graph, cmd='graph', run_meta=run_meta, options=opts)
     with gfile.Open(outfile, 'r') as f:
       pma_str = f.read()
     self.assertEqual(pma_str, profiler_str)
 
-    profiler.profile_python_codes(opts)
+    profiler.profile_python(opts)
     with gfile.Open(outfile, 'r') as f:
       profiler_str = f.read()
 
-    model_analyzer.print_model_analysis(
-        sess.graph, tfprof_cmd='code', run_meta=run_meta, tfprof_options=opts)
+    model_analyzer.profile(
+        sess.graph, cmd='code', run_meta=run_meta, options=opts)
     with gfile.Open(outfile, 'r') as f:
       pma_str = f.read()
     self.assertEqual(pma_str, profiler_str)
@@ -89,25 +92,14 @@ class ProfilerTest(test.TestCase):
     with gfile.Open(outfile, 'r') as f:
       profiler_str = f.read()
 
-    model_analyzer.print_model_analysis(
-        sess.graph, tfprof_cmd='op', run_meta=run_meta, tfprof_options=opts)
+    model_analyzer.profile(
+        sess.graph, cmd='op', run_meta=run_meta, options=opts)
     with gfile.Open(outfile, 'r') as f:
       pma_str = f.read()
     self.assertEqual(pma_str, profiler_str)
 
-    # Test the output difference between multi-step profile and 1-step profile.
-    _ = sess.run(r,
-                 options=config_pb2.RunOptions(
-                     trace_level=config_pb2.RunOptions.FULL_TRACE),
-                 run_metadata=run_meta)
-
-    profiler.add_step(2, run_meta)
-    profiler.profile_name_scope(opts)
-    with gfile.Open(outfile, 'r') as f:
-      profiler_str = f.read()
-
-    model_analyzer.print_model_analysis(
-        sess.graph, tfprof_cmd='scope', run_meta=run_meta, tfprof_options=opts)
+    model_analyzer.profile(
+        sess.graph, cmd='scope', run_meta=run_meta, options=opts)
     with gfile.Open(outfile, 'r') as f:
       pma_str = f.read()
     self.assertNotEqual(pma_str, profiler_str)
@@ -118,18 +110,17 @@ class ProfilerTest(test.TestCase):
     with gfile.Open(outfile, 'r') as f:
       profiler_str = f.read()
 
-    model_analyzer.print_model_analysis(
-        sess.graph, tfprof_cmd='scope', run_meta=run_meta, tfprof_options=opts2)
+    model_analyzer.profile(
+        sess.graph, cmd='scope', run_meta=run_meta, options=opts2)
     with gfile.Open(outfile, 'r') as f:
       pma_str = f.read()
     self.assertEqual(pma_str, profiler_str)
 
   def testMultiStepProfile(self):
     ops.reset_default_graph()
-    opts = model_analyzer.PRINT_ALL_TIMING_MEMORY.copy()
-    opts['account_type_regexes'] = ['.*']
+    opts = builder.time_and_memory()
 
-    with session.Session() as sess, ops.device('/cpu:0'):
+    with session.Session() as sess:
       r1, r2, r3 = lib.BuildSplitableModel()
       sess.run(variables.global_variables_initializer())
 
@@ -179,8 +170,18 @@ class ProfilerTest(test.TestCase):
       self.assertEqual(lib.SearchTFProfNode(pb2, 'add'), None)
       self.assertGreater(lib.SearchTFProfNode(pb3, 'add').exec_micros, 0)
 
-      # TODO(xpan): Better test of advisor.
-      profiler.advise()
+      advice_pb = profiler.advise(model_analyzer.ALL_ADVICE)
+      self.assertTrue('AcceleratorUtilizationChecker' in advice_pb.checkers)
+      self.assertTrue('ExpensiveOperationChecker' in advice_pb.checkers)
+      self.assertTrue('OperationChecker' in advice_pb.checkers)
+
+      checker = advice_pb.checkers['AcceleratorUtilizationChecker']
+      if test.is_gpu_available():
+        self.assertGreater(len(checker.reports), 0)
+      else:
+        self.assertEqual(len(checker.reports), 0)
+      checker = advice_pb.checkers['ExpensiveOperationChecker']
+      self.assertGreater(len(checker.reports), 0)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py b/tensorflow/python/profiler/tfprof_logger.py
similarity index 92%
rename from tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py
rename to tensorflow/python/profiler/tfprof_logger.py
index 52febef26cd219e75d55af4359b317ba5a8309a9..e50c52884aa32a7dab33d3d18606f7cabef19b2a 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py
+++ b/tensorflow/python/profiler/tfprof_logger.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Logging tensorflow::tfprof::OpLog.
+"""Logging tensorflow::tfprof::OpLogProto.
 
-OpLog is used to add extra model information for offline analysis by tfprof.
+OpLogProto is used to add extra model information for offline analysis.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -24,10 +24,10 @@ import os
 import sys
 
 import six
+from tensorflow.core.profiler import tfprof_log_pb2
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.platform import gfile
-from tensorflow.tools.tfprof import tfprof_log_pb2
 
 TRAINABLE_VARIABLES = '_trainable_variables'
 REGISTERED_FLOP_STATS = 'flops'
@@ -131,15 +131,15 @@ def _merge_default_with_oplog(graph, op_log=None, run_meta=None,
 
   Args:
     graph: tf.Graph.
-    op_log: OpLog proto.
+    op_log: OpLogProto proto.
     run_meta: RunMetadata proto used to complete shape information.
     add_trace: Whether to add op trace information.
     add_trainable_var: Whether to assign tf.trainable_variables() op type
       '_trainable_variables'.
   Returns:
-    tmp_op_log: Merged OpLog proto.
+    tmp_op_log: Merged OpLogProto proto.
   """
-  tmp_op_log = tfprof_log_pb2.OpLog()
+  tmp_op_log = tfprof_log_pb2.OpLogProto()
   logged_ops = _get_logged_ops(
       graph, run_meta, add_trace=add_trace, add_trainable_var=add_trainable_var)
 
@@ -169,17 +169,18 @@ def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True):
     '_trainable_variables'.
     The API also logs 'flops' statistics for ops with op.RegisterStatistics()
     defined. flops calculation depends on Tensor shapes defined in 'graph',
-    which might not be complete, 'run_meta', if provided, completes the shape
+    which might not be complete. 'run_meta', if provided, completes the shape
     information with best effort.
 
   Args:
     graph: tf.Graph.
     log_dir: directory to write the log file.
-    op_log: (Optional) OpLog proto to be written. If not provided, an new
+    op_log: (Optional) OpLogProto proto to be written. If not provided, an new
         one is created.
     run_meta: (Optional) RunMetadata proto that helps flops computation using
         run time shape information.
-    add_trace: Whether to add op trace information. Used to support "code" view.
+    add_trace: Whether to add python code trace information.
+        Used to support "code" view.
   """
   op_log = _merge_default_with_oplog(graph, op_log, run_meta, add_trace)
 
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger_test.py b/tensorflow/python/profiler/tfprof_logger_test.py
similarity index 91%
rename from tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger_test.py
rename to tensorflow/python/profiler/tfprof_logger_test.py
index 87dfdc0fc1961526bcb63afdbe143fa0e762f134..141144f98776f3aa7c95b9ef743022aeca5084e1 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger_test.py
+++ b/tensorflow/python/profiler/tfprof_logger_test.py
@@ -17,13 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.copy_graph.python.util import copy_elements
-from tensorflow.contrib.tfprof.python.tools.tfprof import tfprof_logger
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -42,6 +37,8 @@ class TFProfLoggerTest(test.TestCase):
     b = constant_op.constant([[1, 2], [3, 4]])
     return math_ops.matmul(a, b)
 
+  # pylint: disable=pointless-string-statement
+  """# TODO(xpan): This this out of core so it doesn't depend on contrib.
   def testFillMissingShape(self):
     a, b, y = self._BuildSmallPlaceholderlModel()
     run_options = config_pb2.RunOptions(
@@ -76,6 +73,7 @@ class TFProfLoggerTest(test.TestCase):
     # run_metadata has special name for MatMul, hence failed to fill shape.
     tfprof_logger._fill_missing_graph_shape(graph2, run_metadata)
     self.assertEquals('<unknown>', str(y2.get_shape()))
+  """
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/saved_model/README.md b/tensorflow/python/saved_model/README.md
index 38203da5b671ae7765097d7c94e3d21de7d7486e..8213e52ce9c004c9b9c53b76e08a028508703d06 100644
--- a/tensorflow/python/saved_model/README.md
+++ b/tensorflow/python/saved_model/README.md
@@ -106,7 +106,7 @@ builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
 with tf.Session(graph=tf.Graph()) as sess:
   ...
   builder.add_meta_graph_and_variables(sess,
-                                       [tag_constants.TRAINING],
+                                       [tf.saved_model.tag_constants.TRAINING],
                                        signature_def_map=foo_signatures,
                                        assets_collection=foo_assets)
 ...
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index e6d71a48dfda413da204d8cb51f775b01c6d7c4b..5c988bf95a8034dfdad9cbe957fbce5d806ba4ab 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -287,16 +287,12 @@ class SavedModelBuilder(object):
     # The graph almost certainly previously contained at least one Saver, and
     # possibly several (e.g. one for loading a pretrained embedding, and another
     # for the model weights).  However, a *new* Saver was just created that
-    # includes all of the variables.  In the context of the SavedModel, this
-    # new Saver is the only one that needs to be retained. The associated
-    # checkpoint produced in add_meta_graph_and_variables() contains all of the
-    # variable values.  Thus, any preexisting Savers are redundant and useless
-    # at best, but worse may break downstream graph-processing tools, and can be
-    # confusing during debugging. It is therefore safe and wise to set
-    # `clear_extraneous_savers` to `True`, since it removes both the extraneous
-    # SaverDefs and their associated Save/Restore Ops from the graph.
-    meta_graph_def = saver.export_meta_graph(clear_devices=clear_devices,
-                                             clear_extraneous_savers=True)
+    # includes all of the variables.  Removing the preexisting ones was the
+    # motivation for the clear_extraneous_savers option, but it turns out that
+    # there are edge cases where that option breaks the graph.  Until that is
+    # resolved, we just leave the option set to False for now.
+    # TODO(soergel): Reinstate clear_extraneous_savers=True when possible.
+    meta_graph_def = saver.export_meta_graph(clear_devices=clear_devices)
 
     # Tag the meta graph def and add it to the SavedModel.
     self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
@@ -378,16 +374,12 @@ class SavedModelBuilder(object):
     # The graph almost certainly previously contained at least one Saver, and
     # possibly several (e.g. one for loading a pretrained embedding, and another
     # for the model weights).  However, a *new* Saver was just created that
-    # includes all of the variables.  In the context of the SavedModel, this
-    # new Saver is the only one that needs to be retained.  The associated
-    # checkpoint that was saved just above contains all of the variable values.
-    # Thus, any preexisting Savers are redundant and useless at best, but worse
-    # may break downstream graph-processing tools, and can be confusing during
-    # debugging.  It is therefore safe and wise to set `clear_extraneous_savers`
-    # to `True`, since it removes both the extraneous SaverDefs and their
-    # associated Save/Restore Ops from the graph.
-    meta_graph_def = saver.export_meta_graph(clear_devices=clear_devices,
-                                             clear_extraneous_savers=True)
+    # includes all of the variables.  Removing the preexisting ones was the
+    # motivation for the clear_extraneous_savers option, but it turns out that
+    # there are edge cases where that option breaks the graph.  Until that is
+    # resolved, we just leave the option set to False for now.
+    # TODO(soergel): Reinstate clear_extraneous_savers=True when possible.
+    meta_graph_def = saver.export_meta_graph(clear_devices=clear_devices)
 
     # Tag the meta graph def and add it to the SavedModel.
     self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index 32526521749d26c02e29f8bcda7b934faecfddfe..5ff954fd9f83989565e007cad3f0f66913e0a4dd 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -206,8 +206,11 @@ def load(sess, tags, export_dir, **saver_kwargs):
         break
 
     if not found_match:
-      raise RuntimeError("MetaGraphDef associated with tags " + str(tags).strip(
-          "[]") + " could not be found in SavedModel")
+      raise RuntimeError(
+          "MetaGraphDef associated with tags " + str(tags).strip("[]") +
+          " could not be found in SavedModel. To inspect available tag-sets in"
+          " the SavedModel, please use the SavedModel CLI: `saved_model_cli`"
+      )
 
     # Build a saver by importing the meta graph def to load.
     saver = tf_saver.import_meta_graph(meta_graph_def_to_load, **saver_kwargs)
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index 0eb9f49fed0f7397f081414e68118c2ea066bc25..5639e6855df02fba14cd3ddf800b2c7532e9d2fe 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -39,7 +39,6 @@ from tensorflow.python.saved_model import loader
 from tensorflow.python.saved_model import main_op
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import tag_constants
-from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.training import saver_test_utils
 from tensorflow.python.util import compat
 
@@ -208,6 +207,13 @@ class SavedModelTest(test.TestCase):
       self._init_and_validate_variable(sess, "v", 43)
       builder.add_meta_graph([tag_constants.SERVING])
 
+    # Graph that updates the single variable. SavedModel invoked to:
+    # - simply add the model (weights are not updated).
+    # - multiple tags (from predefined constants).
+    with self.test_session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 45)
+      builder.add_meta_graph([tag_constants.SERVING, tag_constants.GPU])
+
     # Graph that updates the single variable. SavedModel is invoked:
     # - to add the model (weights are not updated).
     # - multiple custom tags.
@@ -231,6 +237,13 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
+    # Restore the graph with multiple predefined tags whose variables were not
+    # saved.
+    with self.test_session(graph=ops.Graph()) as sess:
+      loader.load(sess, [tag_constants.SERVING, tag_constants.GPU], export_dir)
+      self.assertEqual(
+          42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
+
     # Restore the graph with multiple tags. Provide duplicate tags to test set
     # semantics.
     with self.test_session(graph=ops.Graph()) as sess:
@@ -810,66 +823,6 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
-  def testClearExtraneousSavers(self):
-    export_dir = os.path.join(test.get_temp_dir(),
-                              "test_clear_extraneous_savers")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
-
-    # Create a variable and a Saver.
-    with ops.Graph().as_default() as graph:
-      with session.Session(
-          target="",
-          config=config_pb2.ConfigProto(device_count={"CPU": 2})) as sess:
-        self._init_and_validate_variable(sess, "v", 42)
-
-        # Add two Savers, which should be removed in
-        # add_meta_graph_and_variables() in favor of the locally added one.
-        saver1 = tf_saver.Saver()
-        graph.add_to_collection(ops.GraphKeys.SAVERS, saver1)
-        saver2 = tf_saver.Saver()
-        graph.add_to_collection(ops.GraphKeys.SAVERS, saver2)
-
-        # Confirm there are two SaverDefs.
-        savers = graph.get_collection(ops.GraphKeys.SAVERS)
-        self.assertEqual(2, len(savers))
-
-        # Confirm there are two Save and two Restore ops.
-        save_op_names = set([x.name for x in graph.get_operations()
-                             if x.type == "SaveV2"])
-        self.assertSetEqual(set(["save/SaveV2", "save_1/SaveV2"]),
-                            save_op_names)
-
-        restore_op_names = set([x.name for x in graph.get_operations()
-                                if x.type == "RestoreV2"])
-        self.assertSetEqual(set(["save/RestoreV2", "save_1/RestoreV2"]),
-                            restore_op_names)
-
-        # The SavedModel builder adds its own Saver' for a total of three.
-        builder.add_meta_graph_and_variables(
-            sess, [tag_constants.TRAINING], clear_devices=True)
-
-    # Save the SavedModel to disk.
-    builder.save()
-
-    # Restore the graph.
-    with ops.Graph().as_default() as graph:
-      with self.test_session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.TRAINING], export_dir)
-        self.assertEqual(
-            42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
-
-        # Confirm that the reloaded graph has only one SaverDef.
-        savers = ops.get_collection(ops.GraphKeys.SAVERS)
-        self.assertEqual(1, len(savers))
-
-        # The reloaded graph should have exactly one Save and one Restore op.
-        save_op_names = set([x.name for x in graph.get_operations()
-                             if x.type == "SaveV2"])
-        self.assertSetEqual(set(["save_2/SaveV2"]), save_op_names)
-        restore_op_names = set([x.name for x in graph.get_operations()
-                                if x.type == "RestoreV2"])
-        self.assertSetEqual(set(["save_2/RestoreV2"]), restore_op_names)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/saved_model/tag_constants.py b/tensorflow/python/saved_model/tag_constants.py
index 4fb9645deacea6321625fd79eb2a1db6f8249db3..52868bdf99b4734a99d7b9dac301f00783402d77 100644
--- a/tensorflow/python/saved_model/tag_constants.py
+++ b/tensorflow/python/saved_model/tag_constants.py
@@ -28,9 +28,12 @@ SERVING = "serve"
 # Tag for the `training` graph.
 TRAINING = "train"
 
+# Tag for the `gpu` graph.
+GPU = "gpu"
 
 _allowed_symbols = [
     "SERVING",
-    "TRAINING"
+    "TRAINING",
+    "GPU"
 ]
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/saved_model/utils.py b/tensorflow/python/saved_model/utils.py
index 8e970e96ae06eed7f5c515bd74106e6278446029..8e750d8708a36a9dd406a4703b3a79645fc04e8f 100644
--- a/tensorflow/python/saved_model/utils.py
+++ b/tensorflow/python/saved_model/utils.py
@@ -22,8 +22,9 @@ from __future__ import print_function
 
 # pylint: disable=unused-import
 from tensorflow.python.saved_model.utils_impl import build_tensor_info
+from tensorflow.python.saved_model.utils_impl import get_tensor_from_tensor_info
 # pylint: enable=unused-import
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = ["build_tensor_info",]
+_allowed_symbols = ["build_tensor_info", "get_tensor_from_tensor_info"]
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/saved_model/utils_impl.py b/tensorflow/python/saved_model/utils_impl.py
index fcb6fc91b6bf8105a3d3b940b793e561ce932880..73ca8c9c1c6d8fddc8a9c7dbee56682999281c28 100644
--- a/tensorflow/python/saved_model/utils_impl.py
+++ b/tensorflow/python/saved_model/utils_impl.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 
 
 # TensorInfo helpers.
@@ -29,13 +31,53 @@ def build_tensor_info(tensor):
   """Utility function to build TensorInfo proto.
 
   Args:
-    tensor: Tensor whose name, dtype and shape are used to build the TensorInfo.
+    tensor: Tensor or SparseTensor whose name, dtype and shape are used to
+        build the TensorInfo. For SparseTensors, the names of the three
+        constitutent Tensors are used.
 
   Returns:
     A TensorInfo protocol buffer constructed based on the supplied argument.
   """
-  dtype_enum = dtypes.as_dtype(tensor.dtype).as_datatype_enum
-  return meta_graph_pb2.TensorInfo(
-      name=tensor.name,
-      dtype=dtype_enum,
+  tensor_info = meta_graph_pb2.TensorInfo(
+      dtype=dtypes.as_dtype(tensor.dtype).as_datatype_enum,
       tensor_shape=tensor.get_shape().as_proto())
+  if isinstance(tensor, sparse_tensor.SparseTensor):
+    tensor_info.coo_sparse.values_tensor_name = tensor.values.name
+    tensor_info.coo_sparse.indices_tensor_name = tensor.indices.name
+    tensor_info.coo_sparse.dense_shape_tensor_name = tensor.dense_shape.name
+  else:
+    tensor_info.name = tensor.name
+  return tensor_info
+
+
+def get_tensor_from_tensor_info(tensor_info, graph=None, import_scope=None):
+  """Returns the Tensor or SparseTensor described by a TensorInfo proto.
+
+  Args:
+    tensor_info: A TensorInfo proto describing a Tensor or SparseTensor.
+    graph: The tf.Graph in which tensors are looked up. If None, the
+        current default graph is used.
+    import_scope: If not None, names in `tensor_info` are prefixed with this
+        string before lookup.
+
+  Returns:
+    The Tensor or SparseTensor in `graph` described by `tensor_info`.
+
+  Raises:
+    KeyError: If `tensor_info` does not correspond to a tensor in `graph`.
+    ValueError: If `tensor_info` is malformed.
+  """
+  graph = graph if graph is not None else ops.get_default_graph()
+  def _get_tensor(name):
+    return graph.get_tensor_by_name(
+        ops.prepend_name_scope(name, import_scope=import_scope))
+  encoding = tensor_info.WhichOneof("encoding")
+  if encoding == "name":
+    return _get_tensor(tensor_info.name)
+  elif encoding == "coo_sparse":
+    return sparse_tensor.SparseTensor(
+        _get_tensor(tensor_info.coo_sparse.indices_tensor_name),
+        _get_tensor(tensor_info.coo_sparse.values_tensor_name),
+        _get_tensor(tensor_info.coo_sparse.dense_shape_tensor_name))
+  else:
+    raise ValueError("Invalid TensorInfo.encoding: %s" % encoding)
diff --git a/tensorflow/python/saved_model/utils_test.py b/tensorflow/python/saved_model/utils_test.py
index 39c87e48ce03c5274d376bd70886e0e5672c49c5..85e6757b26f80ffe07d9483ed3653c94b21553cb 100644
--- a/tensorflow/python/saved_model/utils_test.py
+++ b/tensorflow/python/saved_model/utils_test.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 from tensorflow.core.framework import types_pb2
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import utils
@@ -27,7 +29,7 @@ from tensorflow.python.saved_model import utils
 
 class UtilsTest(test.TestCase):
 
-  def testBuildTensorInfo(self):
+  def testBuildTensorInfoDense(self):
     x = array_ops.placeholder(dtypes.float32, 1, name="x")
     x_tensor_info = utils.build_tensor_info(x)
     self.assertEqual("x:0", x_tensor_info.name)
@@ -35,6 +37,77 @@ class UtilsTest(test.TestCase):
     self.assertEqual(1, len(x_tensor_info.tensor_shape.dim))
     self.assertEqual(1, x_tensor_info.tensor_shape.dim[0].size)
 
+  def testBuildTensorInfoSparse(self):
+    x = sparse_tensor.SparseTensor(indices=[[3, 3], [4, 4], [5, 5]],
+                                   values=[103.0, 104.0, 105.0],
+                                   dense_shape=[42, 69])
+    x_tensor_info = utils.build_tensor_info(x)
+    self.assertEqual(x.values.name,
+                     x_tensor_info.coo_sparse.values_tensor_name)
+    self.assertEqual(x.indices.name,
+                     x_tensor_info.coo_sparse.indices_tensor_name)
+    self.assertEqual(x.dense_shape.name,
+                     x_tensor_info.coo_sparse.dense_shape_tensor_name)
+    self.assertEqual(types_pb2.DT_FLOAT, x_tensor_info.dtype)
+    self.assertEqual(2, len(x_tensor_info.tensor_shape.dim))
+    self.assertEqual(42, x_tensor_info.tensor_shape.dim[0].size)
+    self.assertEqual(69, x_tensor_info.tensor_shape.dim[1].size)
+
+  def testGetTensorFromInfoDense(self):
+    expected = array_ops.placeholder(dtypes.float32, 1, name="x")
+    tensor_info = utils.build_tensor_info(expected)
+    actual = utils.get_tensor_from_tensor_info(tensor_info)
+    self.assertIsInstance(actual, ops.Tensor)
+    self.assertEqual(expected.name, actual.name)
+
+  def testGetTensorFromInfoSparse(self):
+    expected = array_ops.sparse_placeholder(dtypes.float32, name="x")
+    tensor_info = utils.build_tensor_info(expected)
+    actual = utils.get_tensor_from_tensor_info(tensor_info)
+    self.assertIsInstance(actual, sparse_tensor.SparseTensor)
+    self.assertEqual(expected.values.name, actual.values.name)
+    self.assertEqual(expected.indices.name, actual.indices.name)
+    self.assertEqual(expected.dense_shape.name, actual.dense_shape.name)
+
+  def testGetTensorFromInfoInOtherGraph(self):
+    with ops.Graph().as_default() as expected_graph:
+      expected = array_ops.placeholder(dtypes.float32, 1, name="right")
+      tensor_info = utils.build_tensor_info(expected)
+    with ops.Graph().as_default():  # Some other graph.
+      array_ops.placeholder(dtypes.float32, 1, name="other")
+    actual = utils.get_tensor_from_tensor_info(tensor_info,
+                                               graph=expected_graph)
+    self.assertIsInstance(actual, ops.Tensor)
+    self.assertIs(actual.graph, expected_graph)
+    self.assertEqual(expected.name, actual.name)
+
+  def testGetTensorFromInfoInScope(self):
+    # Build a TensorInfo with name "bar/x:0".
+    with ops.Graph().as_default():
+      with ops.name_scope("bar"):
+        unscoped = array_ops.placeholder(dtypes.float32, 1, name="x")
+        tensor_info = utils.build_tensor_info(unscoped)
+        self.assertEqual("bar/x:0", tensor_info.name)
+    # Build a graph with node "foo/bar/x:0", akin to importing into scope foo.
+    with ops.Graph().as_default():
+      with ops.name_scope("foo"):
+        with ops.name_scope("bar"):
+          expected = array_ops.placeholder(dtypes.float32, 1, name="x")
+      self.assertEqual("foo/bar/x:0", expected.name)
+      # Test that tensor is found by prepending the import scope.
+      actual = utils.get_tensor_from_tensor_info(tensor_info,
+                                                 import_scope="foo")
+      self.assertEqual(expected.name, actual.name)
+
+  def testGetTensorFromInfoRaisesErrors(self):
+    expected = array_ops.placeholder(dtypes.float32, 1, name="x")
+    tensor_info = utils.build_tensor_info(expected)
+    tensor_info.name = "blah:0"  # Nonexistant name.
+    with self.assertRaises(KeyError):
+      utils.get_tensor_from_tensor_info(tensor_info)
+    tensor_info.ClearField("name")  # Malformed (missing encoding).
+    with self.assertRaises(ValueError):
+      utils.get_tensor_from_tensor_info(tensor_info)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index 7ff01a51f3d4bf03db7da766da7465e15cff69ed..90afcc0a112dea884031fefe1504cce7a31c317a 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -28,6 +28,7 @@ See the @{$python/summary} guide.
 @@merge
 @@merge_all
 @@get_summary_description
+@@PluginAsset
 @@get_plugin_asset
 @@get_all_plugin_assets
 """
@@ -54,7 +55,6 @@ from tensorflow.python.ops import summary_op_util as _summary_op_util
 
 # exports tensor-related summaries
 # pylint: disable=unused-import
-from tensorflow.python.ops.summary_ops import _tensor_summary_v2
 from tensorflow.python.ops.summary_ops import tensor_summary
 # pylint: enable=unused-import
 
diff --git a/tensorflow/python/summary/text_summary.py b/tensorflow/python/summary/text_summary.py
index 52bc913b2ada672bc0b263d4f0646a98698d2895..b97c02666cf726024b30f804f4c351d86a2c1856 100644
--- a/tensorflow/python/summary/text_summary.py
+++ b/tensorflow/python/summary/text_summary.py
@@ -23,12 +23,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from collections import namedtuple
 import json
 
+from tensorflow.core.framework import summary_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops.summary_ops import tensor_summary
 from tensorflow.python.summary import plugin_asset
 
+PLUGIN_NAME = "text"
+
+# Contains event-related data specific to the text plugin.
+_TextPluginData = namedtuple("_TextPluginData", [])
+
 
 def text_summary(name, tensor, collections=None):
   """Summarizes textual data.
@@ -60,9 +67,16 @@ def text_summary(name, tensor, collections=None):
     raise ValueError("Expected tensor %s to have dtype string, got %s" %
                      (tensor.name, tensor.dtype))
 
-  t_summary = tensor_summary(name, tensor, collections=collections)
-  text_assets = plugin_asset.get_plugin_asset(TextSummaryPluginAsset)
-  text_assets.register_tensor(t_summary.op.name)
+  summary_metadata = summary_pb2.SummaryMetadata()
+  text_plugin_data = _TextPluginData()
+  data_dict = text_plugin_data._asdict()  # pylint: disable=protected-access
+  summary_metadata.plugin_data.add(
+      plugin_name=PLUGIN_NAME, content=json.dumps(data_dict))
+  t_summary = tensor_summary(
+      name=name,
+      tensor=tensor,
+      summary_metadata=summary_metadata,
+      collections=collections)
   return t_summary
 
 
diff --git a/tensorflow/python/summary/text_summary_test.py b/tensorflow/python/summary/text_summary_test.py
index 31009702ca41e1f0b5ff5742ae81cfc962d3061d..4d357918f6c2eb68fa396f05984ee50e06d2147f 100644
--- a/tensorflow/python/summary/text_summary_test.py
+++ b/tensorflow/python/summary/text_summary_test.py
@@ -17,7 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import ops as framework_ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import googletest
@@ -43,16 +42,11 @@ class TextPluginTest(test_util.TensorFlowTestCase):
       # The API accepts vectors.
       arr = array_ops.constant(["one", "two", "three"])
       summ = text_summary.text_summary("foo", arr)
-      self.assertEqual(summ.op.type, "TensorSummary")
+      self.assertEqual(summ.op.type, "TensorSummaryV2")
 
       # the API accepts scalars
       summ = text_summary.text_summary("foo", array_ops.constant("one"))
-      self.assertEqual(summ.op.type, "TensorSummary")
-
-  def testTextSummaryCollections(self):
-    text_summary.text_summary("bar", array_ops.constant("2"), collections=[])
-    summaries = framework_ops.get_collection(framework_ops.GraphKeys.SUMMARIES)
-    self.assertEqual(len(summaries), 0)
+      self.assertEqual(summ.op.type, "TensorSummaryV2")
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index 05f97fb28417dbac6e26c7767f34a2152baea8c0..8ce49d623d8a9bb5ffc30403798fd0c81ba1b25f 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -86,6 +86,14 @@ class SummaryToEventTransformer(object):
           meta_graph.create_meta_graph_def(graph_def=graph_def or
                                            maybe_graph_as_def))
 
+    # This set contains tags of Summary Values that have been encountered
+    # already. The motivation here is that the SummaryWriter only keeps the
+    # metadata property (which is a SummaryMetadata proto) of the first Summary
+    # Value encountered for each tag. The SummaryWriter strips away the
+    # SummaryMetadata for all subsequent Summary Values with tags seen
+    # previously. This saves space.
+    self._seen_summary_tags = set()
+
   def add_summary(self, summary, global_step=None):
     """Adds a `Summary` protocol buffer to the event file.
 
@@ -108,6 +116,24 @@ class SummaryToEventTransformer(object):
       summ = summary_pb2.Summary()
       summ.ParseFromString(summary)
       summary = summ
+
+    # We strip metadata from values with tags that we have seen before in order
+    # to save space - we just store the metadata on the first value with a
+    # specific tag.
+    for value in summary.value:
+      if not value.metadata:
+        continue
+
+      if value.tag in self._seen_summary_tags:
+        # This tag has been encountered before. Strip the metadata.
+        value.ClearField("metadata")
+        continue
+
+      # We encounter a value with a tag we have not encountered previously. And
+      # it has metadata. Remember to strip metadata from future values with this
+      # tag string.
+      self._seen_summary_tags.add(value.tag)
+
     event = event_pb2.Event(summary=summary)
     self._add_event(event, global_step)
 
diff --git a/tensorflow/python/summary/writer/writer_test.py b/tensorflow/python/summary/writer/writer_test.py
index 8c34eb82e35cba6db8716f797a39f56e778a74af..3d27b11cb9f2ff0e0ef43c18ac7e17ea5d7bb90e 100644
--- a/tensorflow/python/summary/writer/writer_test.py
+++ b/tensorflow/python/summary/writer/writer_test.py
@@ -317,6 +317,63 @@ class SummaryWriterTestCase(test.TestCase):
     # We should be done.
     self.assertRaises(StopIteration, lambda: next(rr))
 
+  def testPluginMetadataStrippedFromSubsequentEvents(self):
+    test_dir = self._CleanTestDir("basics")
+    sw = writer.FileWriter(test_dir)
+
+    sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
+
+    # We add 2 summaries with the same tags. They both have metadata. The writer
+    # should strip the metadata from the second one.
+    value = summary_pb2.Summary.Value(tag="foo", simple_value=10.0)
+    value.metadata.plugin_data.add(plugin_name="bar", content="... content ...")
+    sw.add_summary(summary_pb2.Summary(value=[value]), 10)
+    value = summary_pb2.Summary.Value(tag="foo", simple_value=10.0)
+    value.metadata.plugin_data.add(plugin_name="bar", content="... content ...")
+    sw.add_summary(summary_pb2.Summary(value=[value]), 10)
+
+    sw.close()
+    rr = self._EventsReader(test_dir)
+
+    # The first event should list the file_version.
+    ev = next(rr)
+    self._assertRecent(ev.wall_time)
+    self.assertEquals("brain.Event:2", ev.file_version)
+
+    # The next event should be the START message.
+    ev = next(rr)
+    self._assertRecent(ev.wall_time)
+    self.assertEquals(1, ev.step)
+    self.assertEquals(SessionLog.START, ev.session_log.status)
+
+    # This is the first event with tag foo. It should contain SummaryMetadata.
+    ev = next(rr)
+    self.assertProtoEquals("""
+      value {
+        tag: "foo"
+        simple_value: 10.0
+        metadata {
+          plugin_data {
+            plugin_name: "bar"
+            content: "... content ..."
+          }
+        }
+      }
+      """, ev.summary)
+
+    # This is the second event with tag foo. It should lack SummaryMetadata
+    # because the file writer should have stripped it.
+    ev = next(rr)
+    self.assertProtoEquals("""
+      value {
+        tag: "foo"
+        simple_value: 10.0
+      }
+      """, ev.summary)
+
+    # We should be done.
+    self.assertRaises(StopIteration, lambda: next(rr))
+
   def testFileWriterWithSuffix(self):
     test_dir = self._CleanTestDir("test_suffix")
     sw = writer.FileWriter(test_dir, filename_suffix="_test_suffix")
diff --git a/tensorflow/python/tensorflow.i b/tensorflow/python/tensorflow.i
index a9a0b7fffa808c519c85c02b5c85a017685c3236..0c8110ec60c83d16044d615c08c71fe9e305af8b 100644
--- a/tensorflow/python/tensorflow.i
+++ b/tensorflow/python/tensorflow.i
@@ -20,6 +20,7 @@ limitations under the License.
 %include "tensorflow/python/util/port.i"
 %include "tensorflow/python/util/py_checkpoint_reader.i"
 %include "tensorflow/python/util/stat_summarizer.i"
+%include "tensorflow/python/util/tfprof.i"
 
 %include "tensorflow/python/lib/core/py_func.i"
 
@@ -43,3 +44,4 @@ limitations under the License.
 
 %include "tensorflow/python/grappler/tf_optimizer.i"
 %include "tensorflow/python/grappler/cost_analyzer.i"
+%include "tensorflow/python/grappler/model_analyzer.i"
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index 1780d34b39431f7a38361184b44c7f805387332c..47b09f67b4a1c34fcf71df5ea667f916cd8fedf9 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -44,6 +44,20 @@ py_binary(
     ],
 )
 
+py_binary(
+    name = "import_pb_to_tensorboard",
+    srcs = ["import_pb_to_tensorboard.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:summary",
+    ],
+)
+
 py_test(
     name = "freeze_graph_test",
     size = "small",
diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py
index bd046a7fd099c71518e694c7a44c62616c960178..8069950925d41d64c4cb326dc6d67a6ed3d20c42 100644
--- a/tensorflow/python/tools/freeze_graph.py
+++ b/tensorflow/python/tools/freeze_graph.py
@@ -115,10 +115,15 @@ def freeze_graph_with_def_protos(
         output_node_names.split(","),
         variable_names_blacklist=variable_names_blacklist)
 
-  with gfile.GFile(output_graph, "wb") as f:
-    f.write(output_graph_def.SerializeToString())
+  # Write GraphDef to file if output path has been given.
+  if output_graph:
+    with gfile.GFile(output_graph, "wb") as f:
+      f.write(output_graph_def.SerializeToString())
+
   print("%d ops in the final graph." % len(output_graph_def.node))
 
+  return output_graph_def
+
 
 def _parse_input_graph_proto(input_graph, input_binary):
   """Parser input tensorflow graph into GraphDef proto."""
diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py
index 2bb055e978630bcb399e327ddc968961b4978bca..a8712fc37e631cd7c3ddb76b9ca21f78599d668c 100644
--- a/tensorflow/python/tools/import_pb_to_tensorboard.py
+++ b/tensorflow/python/tools/import_pb_to_tensorboard.py
@@ -18,10 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import argparse
+import sys
+
 from tensorflow.core.framework import graph_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
+from tensorflow.python.platform import app
 from tensorflow.python.platform import gfile
 from tensorflow.python.summary import summary
 
@@ -48,3 +52,25 @@ def import_to_tensorboard(model_dir, log_dir):
     pb_visual_writer.add_graph(sess.graph)
     print("Model Imported. Visualize by running: "
           "> tensorboard --logdir={}".format(log_dir))
+
+
+def main(unused_args):
+  import_to_tensorboard(FLAGS.model_dir, FLAGS.log_dir)
+
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser()
+  parser.register("type", "bool", lambda v: v.lower() == "true")
+  parser.add_argument(
+      "--model_dir",
+      type=str,
+      default="",
+      required=True,
+      help="The location of the protobuf (\'pb\') model to visualize.")
+  parser.add_argument(
+      "--log_dir",
+      type=str,
+      default="",
+      required=True,
+      help="The location for the Tensorboard log to begin visualization from.")
+  FLAGS, unparsed = parser.parse_known_args()
+  app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/tools/optimize_for_inference_lib.py b/tensorflow/python/tools/optimize_for_inference_lib.py
index 3e80f1ecd7a192fee1378265f34c24263e67e381..c2687bf557b03ff588fd369771077c92ba012a15 100644
--- a/tensorflow/python/tools/optimize_for_inference_lib.py
+++ b/tensorflow/python/tools/optimize_for_inference_lib.py
@@ -67,6 +67,25 @@ flags = flags_lib
 FLAGS = flags.FLAGS
 
 
+# Support folding two types of batch norm ops:
+# BatchNormWithGlobalNormalization and FusedBatchNorm.  The two types only
+# differ in input order and attribute names, so we've collected their
+# differences up front.
+INPUT_ORDER = {
+    # Order of inputs for BatchNormWithGlobalNormalization.
+    "BatchNormWithGlobalNormalization": [
+        "conv_op", "mean_op", "var_op", "beta_op", "gamma_op"
+    ],
+    # Order of inputs for FusedBatchNorm.
+    "FusedBatchNorm": ["conv_op", "gamma_op", "beta_op", "mean_op", "var_op"]
+}
+# Name of the attribute epsilon value is stored in.
+EPSILON_ATTR = {
+    "BatchNormWithGlobalNormalization": "variance_epsilon",
+    "FusedBatchNorm": "epsilon"
+}
+
+
 def optimize_for_inference(input_graph_def, input_node_names, output_node_names,
                            placeholder_type_enum):
   """Applies a series of inference optimizations on the input graph.
@@ -85,10 +104,9 @@ def optimize_for_inference(input_graph_def, input_node_names, output_node_names,
   """
   ensure_graph_is_valid(input_graph_def)
   optimized_graph_def = input_graph_def
-  optimized_graph_def = strip_unused_lib.strip_unused(optimized_graph_def,
-                                                      input_node_names,
-                                                      output_node_names,
-                                                      placeholder_type_enum)
+  optimized_graph_def = strip_unused_lib.strip_unused(
+      optimized_graph_def, input_node_names, output_node_names,
+      placeholder_type_enum)
   optimized_graph_def = graph_util.remove_training_nodes(
       optimized_graph_def, output_node_names)
   optimized_graph_def = fold_batch_norms(optimized_graph_def)
@@ -173,6 +191,13 @@ def values_from_const(node_def):
   return tensor_value
 
 
+# Whether to scale by gamma after normalization.
+def scale_after_normalization(node):
+  if node.op == "BatchNormWithGlobalNormalization":
+    return node.attr["scale_after_normalization"].b
+  return True
+
+
 def fold_batch_norms(input_graph_def):
   """Removes batch normalization ops by folding them into convolutions.
 
@@ -195,7 +220,6 @@ def fold_batch_norms(input_graph_def):
   Raises:
     ValueError: If the graph is badly formed with duplicate node names.
   """
-
   input_node_map = {}
   for node in input_graph_def.node:
     if node.name not in input_node_map.keys():
@@ -206,13 +230,14 @@ def fold_batch_norms(input_graph_def):
   nodes_to_skip = {}
   new_ops = []
   for node in input_graph_def.node:
-    if node.op != "BatchNormWithGlobalNormalization":
+    if node.op not in ("BatchNormWithGlobalNormalization", "FusedBatchNorm"):
       continue
 
-    conv_op = node_from_map(input_node_map, node.input[0])
+    conv_op = node_from_map(input_node_map,
+                            node.input[INPUT_ORDER[node.op].index("conv_op")])
     if conv_op.op != "Conv2D":
-      tf_logging.warning("Didn't find expected Conv2D input to '%s'" %
-                         node.name)
+      tf_logging.warning(
+          "Didn't find expected Conv2D input to '%s'" % node.name)
       continue
 
     weights_op = node_from_map(input_node_map, conv_op.input[1])
@@ -224,7 +249,8 @@ def fold_batch_norms(input_graph_def):
     weights = values_from_const(weights_op)
     channel_count = weights.shape[3]
 
-    mean_op = node_from_map(input_node_map, node.input[1])
+    mean_op = node_from_map(input_node_map,
+                            node.input[INPUT_ORDER[node.op].index("mean_op")])
     if mean_op.op != "Const":
       tf_logging.warning("Didn't find expected mean Constant input to '%s',"
                          " found %s instead. Maybe because freeze_graph wasn't"
@@ -237,7 +263,8 @@ def fold_batch_norms(input_graph_def):
                              (channel_count,)), node.name))
       continue
 
-    var_op = node_from_map(input_node_map, node.input[2])
+    var_op = node_from_map(input_node_map,
+                           node.input[INPUT_ORDER[node.op].index("var_op")])
     if var_op.op != "Const":
       tf_logging.warning("Didn't find expected var Constant input to '%s',"
                          " found %s instead. Maybe because freeze_graph wasn't"
@@ -250,7 +277,8 @@ def fold_batch_norms(input_graph_def):
                              (channel_count,)), node.name))
       continue
 
-    beta_op = node_from_map(input_node_map, node.input[3])
+    beta_op = node_from_map(input_node_map,
+                            node.input[INPUT_ORDER[node.op].index("beta_op")])
     if beta_op.op != "Const":
       tf_logging.warning("Didn't find expected beta Constant input to '%s',"
                          " found %s instead. Maybe because freeze_graph wasn't"
@@ -263,7 +291,8 @@ def fold_batch_norms(input_graph_def):
                              (channel_count,)), node.name))
       continue
 
-    gamma_op = node_from_map(input_node_map, node.input[4])
+    gamma_op = node_from_map(input_node_map,
+                             node.input[INPUT_ORDER[node.op].index("gamma_op")])
     if gamma_op.op != "Const":
       tf_logging.warning("Didn't find expected gamma Constant input to '%s',"
                          " found %s instead. Maybe because freeze_graph wasn't"
@@ -276,8 +305,7 @@ def fold_batch_norms(input_graph_def):
                              (channel_count,)), node.name))
       continue
 
-    variance_epsilon_value = node.attr["variance_epsilon"].f
-    scale_after_normalization = node.attr["scale_after_normalization"].b
+    variance_epsilon_value = node.attr[EPSILON_ATTR[node.op]].f
     nodes_to_skip[node.name] = True
     nodes_to_skip[weights_op.name] = True
     nodes_to_skip[mean_op.name] = True
@@ -286,7 +314,7 @@ def fold_batch_norms(input_graph_def):
     nodes_to_skip[gamma_op.name] = True
     nodes_to_skip[conv_op.name] = True
 
-    if scale_after_normalization:
+    if scale_after_normalization(node):
       scale_value = (
           (1.0 / np.vectorize(math.sqrt)(var_value + variance_epsilon_value)) *
           gamma_value)
@@ -346,6 +374,8 @@ def fuse_resize_and_conv(input_graph_def, output_node_names):
 
   Args:
     input_graph_def: A GraphDef containing a model.
+    output_node_names: A list of names of the nodes that produce the final
+      results.
 
   Returns:
     Modified graph with resize and pad ops merged.
@@ -428,8 +458,8 @@ def fuse_resize_and_conv(input_graph_def, output_node_names):
           resize_op.input[0], resize_op.input[1], mirror_paddings_name,
           conv_op.input[1]
       ])
-      fused_conv_op.attr["resize_align_corners"].CopyFrom(resize_op.attr[
-          "align_corners"])
+      fused_conv_op.attr["resize_align_corners"].CopyFrom(
+          resize_op.attr["align_corners"])
     else:
       fused_conv_op.input.extend(
           [mirror_pad_op.input[0], mirror_paddings_name, conv_op.input[1]])
diff --git a/tensorflow/python/tools/optimize_for_inference_test.py b/tensorflow/python/tools/optimize_for_inference_test.py
index 7428fa76b888f770614891ddc6b08a7581ec6785..447057cfe9fc3d7aa7bd78739ba8f1caee1ec757 100644
--- a/tensorflow/python/tools/optimize_for_inference_test.py
+++ b/tensorflow/python/tools/optimize_for_inference_test.py
@@ -170,6 +170,50 @@ class OptimizeForInferenceTest(test.TestCase):
     for node in optimized_graph_def.node:
       self.assertNotEqual("BatchNormWithGlobalNormalization", node.op)
 
+  def testFoldFusedBatchNorms(self):
+    with self.test_session() as sess:
+      inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
+      input_op = constant_op.constant(
+          np.array(inputs), shape=[1, 1, 6, 2], dtype=dtypes.float32)
+      weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4]
+      weights_op = constant_op.constant(
+          np.array(weights), shape=[1, 2, 2, 2], dtype=dtypes.float32)
+      conv_op = nn_ops.conv2d(
+          input_op, weights_op, [1, 1, 1, 1], padding="SAME", name="conv_op")
+      mean_op = constant_op.constant(
+          np.array([10, 20]), shape=[2], dtype=dtypes.float32)
+      variance_op = constant_op.constant(
+          np.array([0.25, 0.5]), shape=[2], dtype=dtypes.float32)
+      beta_op = constant_op.constant(
+          np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32)
+      gamma_op = constant_op.constant(
+          np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32)
+      ops.get_default_graph().graph_def_versions.producer = 9
+      gen_nn_ops._fused_batch_norm(
+          conv_op,
+          gamma_op,
+          beta_op,
+          mean_op,
+          variance_op,
+          0.00001,
+          is_training=False,
+          name="output")
+      original_graph_def = sess.graph_def
+      original_result = sess.run(["output:0"])
+    optimized_graph_def = optimize_for_inference_lib.fold_batch_norms(
+        original_graph_def)
+
+    with self.test_session() as sess:
+      _ = importer.import_graph_def(
+          optimized_graph_def, input_map={}, name="optimized")
+      optimized_result = sess.run(["optimized/output:0"])
+
+    self.assertAllClose(
+        original_result, optimized_result, rtol=1e-04, atol=1e-06)
+
+    for node in optimized_graph_def.node:
+      self.assertNotEqual("FusedBatchNorm", node.op)
+
   def testFuseResizePadAndConv(self):
     with self.test_session() as sess:
       inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
@@ -227,7 +271,8 @@ class OptimizeForInferenceTest(test.TestCase):
 
     for node in optimized_graph_def.node:
       self.assertNotEqual("Conv2D", node.op)
-      self.assertNotEqual("ResizeBilinear", node.op)
+      self.assertNotEqual("MirrorPad", node.op)
+      
 
   def testFusePadAndConv(self):
     with self.test_session() as sess:
@@ -255,7 +300,7 @@ class OptimizeForInferenceTest(test.TestCase):
 
     for node in optimized_graph_def.node:
       self.assertNotEqual("Conv2D", node.op)
-      self.assertNotEqual("MirrorPad", node.op)
+      self.assertNotEqual("ResizeBilinear", node.op)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/tools/print_selective_registration_header.py b/tensorflow/python/tools/print_selective_registration_header.py
index 62f00f446781557a9b67aa4737e6ef772356a31b..21d7de02040874fd5e0bb5ca8a647c011ad2be76 100644
--- a/tensorflow/python/tools/print_selective_registration_header.py
+++ b/tensorflow/python/tools/print_selective_registration_header.py
@@ -28,7 +28,7 @@ When compiling for Android:
     --copt="-DSUPPORT_SELECTIVE_REGISTRATION" \
     //tensorflow/contrib/android:libtensorflow_inference.so \
     --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
-    --config=android_arm
+    --crosstool_top=//external:android/crosstool --cpu=armeabi-v7a
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index e1be3055052aeeb1355bbb71d6232ab8d60cc974..9075a707a2e31548d7f2a9d23388086c9dd02f3b 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -255,6 +255,7 @@ def run_saved_model_with_feed_dict(saved_model_dir, tag_set, signature_def_key,
         SavedModel.
 
   Raises:
+    ValueError: When any of the input tensor keys is not valid.
     RuntimeError: An error when output file already exists and overwrite is not
     enabled.
   """
@@ -265,6 +266,15 @@ def run_saved_model_with_feed_dict(saved_model_dir, tag_set, signature_def_key,
   # uses tensor name.
   inputs_tensor_info = _get_inputs_tensor_info_from_meta_graph_def(
       meta_graph_def, signature_def_key)
+
+  # Check if input tensor keys are valid.
+  for input_key_name in input_tensor_key_feed_dict.keys():
+    if input_key_name not in inputs_tensor_info.keys():
+      raise ValueError(
+          '"%s" is not a valid input key. Please choose from %s, or use '
+          '--show option.' %
+          (input_key_name, '"' + '", "'.join(inputs_tensor_info.keys()) + '"'))
+
   inputs_feed_dict = {
       inputs_tensor_info[key].name: tensor
       for key, tensor in input_tensor_key_feed_dict.items()
diff --git a/tensorflow/python/tools/saved_model_cli_test.py b/tensorflow/python/tools/saved_model_cli_test.py
index 8f79c888ebd3c82affde5d17ff0c5db2232a6c46..a55cf168b23e8fc4efeb5175e3c01cad1a68fa57 100644
--- a/tensorflow/python/tools/saved_model_cli_test.py
+++ b/tensorflow/python/tools/saved_model_cli_test.py
@@ -391,6 +391,16 @@ Method name is: tensorflow/serving/predict"""
     y_expected = np.array([[2.5], [3.0]])
     self.assertAllClose(y_expected, y_actual)
 
+  def testRunCommandInvalidInputKeyError(self):
+    self.parser = saved_model_cli.create_parser()
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    args = self.parser.parse_args([
+        'run', '--dir', base_path, '--tag_set', 'serve', '--signature_def',
+        'regress_x2_to_y3', '--input_exprs', 'x2=np.ones((3,1))'
+    ])
+    with self.assertRaises(ValueError):
+      saved_model_cli.run(args)
+
   def testRunCommandOutputFileExistError(self):
     self.parser = saved_model_cli.create_parser()
     base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
diff --git a/tensorflow/python/training/adagrad_da.py b/tensorflow/python/training/adagrad_da.py
index 1aa513f13e1ce3d0ee7ae4c4d143119b511ace08..b3f9ea323c2bb4fd9ecee93863fbc7955b47a947 100644
--- a/tensorflow/python/training/adagrad_da.py
+++ b/tensorflow/python/training/adagrad_da.py
@@ -68,7 +68,7 @@ class AdagradDAOptimizer(optimizer.Optimizer):
       invalid.
     """
     if initial_gradient_squared_accumulator_value <= 0.0:
-      raise ValueError("initial_gradient_squared_accumulator_value must be"
+      raise ValueError("initial_gradient_squared_accumulator_value must be "
                        "positive: %s" %
                        initial_gradient_squared_accumulator_value)
     super(AdagradDAOptimizer, self).__init__(use_locking, name)
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 459c735ea3837bdcaf2312666dbce884fd5fe953..796402425a123d0063084f3f9886855789a40e10 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -31,7 +31,7 @@ from tensorflow.python.training import training_ops
 class AdamOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Adam algorithm.
 
-  See [Kingma et. al., 2014](http://arxiv.org/abs/1412.6980)
+  See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
   ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
   """
 
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index d52cf9a4367dd7728245cbe4fe35b47dd5c0dd25..ddf04e21e61ca629ecbacb844e7dd5fd65b689e7 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import six
 
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope as vs
@@ -27,7 +28,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver
-from tensorflow.python.training import training as train
+
 
 __all__ = [
     "load_checkpoint", "load_variable", "list_variables", "init_from_checkpoint"
@@ -55,7 +56,7 @@ def load_checkpoint(ckpt_dir_or_file):
   if filename is None:
     raise ValueError("Couldn't find 'checkpoint' file or checkpoints in "
                      "given directory %s" % ckpt_dir_or_file)
-  return train.NewCheckpointReader(filename)
+  return pywrap_tensorflow.NewCheckpointReader(filename)
 
 
 def load_variable(ckpt_dir_or_file, name):
diff --git a/tensorflow/python/training/ftrl.py b/tensorflow/python/training/ftrl.py
index 618f3baf089bdb11f8931d3f078983a7a3b9ac96..c64a1b3f799e776c7bbbbcfb691bdd97e4a34466 100644
--- a/tensorflow/python/training/ftrl.py
+++ b/tensorflow/python/training/ftrl.py
@@ -29,6 +29,9 @@ class FtrlOptimizer(optimizer.Optimizer):
 
   See this [paper](
   https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf).
+  This version has support for both online L2 (the L2 penalty given in the paper
+  above) and shrinkage-type L2 (which is the addition of an L2 penalty to the
+  loss function).
   """
 
   def __init__(self,
@@ -40,8 +43,9 @@ class FtrlOptimizer(optimizer.Optimizer):
                use_locking=False,
                name="Ftrl",
                accum_name=None,
-               linear_name=None):
-    """Construct a new FTRL optimizer.
+               linear_name=None,
+               l2_shrinkage_regularization_strength=0.0):
+    r"""Construct a new FTRL optimizer.
 
     Args:
       learning_rate: A float value or a constant float `Tensor`.
@@ -59,6 +63,19 @@ class FtrlOptimizer(optimizer.Optimizer):
         accumulator.  If not present, defaults to name.
       linear_name: The suffix for the variable that keeps the linear gradient
         accumulator.  If not present, defaults to name + "_1".
+      l2_shrinkage_regularization_strength: A float value, must be greater than
+        or equal to zero. This differs from L2 above in that the L2 above is a
+        stabilization penalty, whereas this L2 shrinkage is a magnitude penalty.
+        The FTRL formulation can be written as:
+        w_{t+1} = argmin_w(\hat{g}_{1:t}w + L1*||w||_1 + L2*||w||_2^2), where
+        \hat{g} = g + (2*L2_shrinkage*w), and g is the gradient of the loss
+        function w.r.t. the weights w.
+        Specifically, in the absence of L1 regularization, it is equivalent to
+        the following update rule:
+        w_{t+1} = w_t - lr_t / (1 + 2*L2*lr_t) * g_t -
+                  2*L2_shrinkage*lr_t / (1 + 2*L2*lr_t) * w_t
+        where lr_t is the learning rate at t.
+        When input is sparse shrinkage will only happen on the active weights.
 
     Raises:
       ValueError: If one of the arguments is invalid.
@@ -79,16 +96,23 @@ class FtrlOptimizer(optimizer.Optimizer):
       raise ValueError(
           "l2_regularization_strength %f needs to be positive or zero" %
           l2_regularization_strength)
+    if l2_shrinkage_regularization_strength < 0.0:
+      raise ValueError(
+          "l2_shrinkage_regularization_strength %f needs to be positive"
+          " or zero" % l2_shrinkage_regularization_strength)
 
     self._learning_rate = learning_rate
     self._learning_rate_power = learning_rate_power
     self._initial_accumulator_value = initial_accumulator_value
     self._l1_regularization_strength = l1_regularization_strength
     self._l2_regularization_strength = l2_regularization_strength
+    self._l2_shrinkage_regularization_strength = (
+        l2_shrinkage_regularization_strength)
     self._learning_rate_tensor = None
     self._learning_rate_power_tensor = None
     self._l1_regularization_strength_tensor = None
     self._l2_regularization_strength_tensor = None
+    self._l2_shrinkage_regularization_strength_tensor = None
     self._accum_name = accum_name
     self._linear_name = linear_name
 
@@ -108,69 +132,137 @@ class FtrlOptimizer(optimizer.Optimizer):
         self._l1_regularization_strength, name="l1_regularization_strength")
     self._l2_regularization_strength_tensor = ops.convert_to_tensor(
         self._l2_regularization_strength, name="l2_regularization_strength")
+    self._l2_shrinkage_regularization_strength_tensor = ops.convert_to_tensor(
+        self._l2_shrinkage_regularization_strength,
+        name="l2_shrinkage_regularization_strength")
     self._learning_rate_power_tensor = ops.convert_to_tensor(
         self._learning_rate_power, name="learning_rate_power")
 
   def _apply_dense(self, grad, var):
     accum = self.get_slot(var, "accum")
     linear = self.get_slot(var, "linear")
-    return training_ops.apply_ftrl(
-        var,
-        accum,
-        linear,
-        grad,
-        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
-        math_ops.cast(self._l1_regularization_strength_tensor,
-                      var.dtype.base_dtype),
-        math_ops.cast(self._l2_regularization_strength_tensor,
-                      var.dtype.base_dtype),
-        math_ops.cast(self._learning_rate_power_tensor, var.dtype.base_dtype),
-        use_locking=self._use_locking)
+    if self._l2_shrinkage_regularization_strength <= 0.0:
+      return training_ops.apply_ftrl(
+          var,
+          accum,
+          linear,
+          grad,
+          math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
+          math_ops.cast(self._l1_regularization_strength_tensor,
+                        var.dtype.base_dtype),
+          math_ops.cast(self._l2_regularization_strength_tensor,
+                        var.dtype.base_dtype),
+          math_ops.cast(self._learning_rate_power_tensor, var.dtype.base_dtype),
+          use_locking=self._use_locking)
+    else:
+      return training_ops.apply_ftrl_v2(
+          var,
+          accum,
+          linear,
+          grad,
+          math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
+          math_ops.cast(self._l1_regularization_strength_tensor,
+                        var.dtype.base_dtype),
+          math_ops.cast(self._l2_regularization_strength_tensor,
+                        var.dtype.base_dtype),
+          math_ops.cast(self._l2_shrinkage_regularization_strength_tensor,
+                        var.dtype.base_dtype),
+          math_ops.cast(self._learning_rate_power_tensor, var.dtype.base_dtype),
+          use_locking=self._use_locking)
 
   def _resource_apply_dense(self, grad, var):
     accum = self.get_slot(var, "accum")
     linear = self.get_slot(var, "linear")
-    return training_ops.resource_apply_ftrl(
-        var.handle,
-        accum.handle,
-        linear.handle,
-        grad,
-        math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype),
-        math_ops.cast(self._l1_regularization_strength_tensor,
-                      grad.dtype.base_dtype),
-        math_ops.cast(self._l2_regularization_strength_tensor,
-                      grad.dtype.base_dtype),
-        math_ops.cast(self._learning_rate_power_tensor, grad.dtype.base_dtype),
-        use_locking=self._use_locking)
+    if self._l2_shrinkage_regularization_strength <= 0.0:
+      return training_ops.resource_apply_ftrl(
+          var.handle,
+          accum.handle,
+          linear.handle,
+          grad,
+          math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
+          math_ops.cast(self._l1_regularization_strength_tensor,
+                        var.dtype.base_dtype),
+          math_ops.cast(self._l2_regularization_strength_tensor,
+                        var.dtype.base_dtype),
+          math_ops.cast(self._learning_rate_power_tensor, var.dtype.base_dtype),
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_apply_ftrl_v2(
+          var.handle,
+          accum.handle,
+          linear.handle,
+          grad,
+          math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
+          math_ops.cast(self._l1_regularization_strength_tensor,
+                        var.dtype.base_dtype),
+          math_ops.cast(self._l2_regularization_strength_tensor,
+                        var.dtype.base_dtype),
+          math_ops.cast(self._l2_shrinkage_regularization_strength_tensor,
+                        var.dtype.base_dtype),
+          math_ops.cast(self._learning_rate_power_tensor, var.dtype.base_dtype),
+          use_locking=self._use_locking)
 
   def _apply_sparse(self, grad, var):
     accum = self.get_slot(var, "accum")
     linear = self.get_slot(var, "linear")
-    return training_ops.sparse_apply_ftrl(
-        var,
-        accum,
-        linear,
-        grad.values,
-        grad.indices,
-        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
-        math_ops.cast(self._l1_regularization_strength_tensor,
-                      var.dtype.base_dtype),
-        math_ops.cast(self._l2_regularization_strength_tensor,
-                      var.dtype.base_dtype),
-        math_ops.cast(self._learning_rate_power_tensor, var.dtype.base_dtype),
-        use_locking=self._use_locking)
+    if self._l2_shrinkage_regularization_strength <= 0.0:
+      return training_ops.sparse_apply_ftrl(
+          var,
+          accum,
+          linear,
+          grad.values,
+          grad.indices,
+          math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
+          math_ops.cast(self._l1_regularization_strength_tensor,
+                        var.dtype.base_dtype),
+          math_ops.cast(self._l2_regularization_strength_tensor,
+                        var.dtype.base_dtype),
+          math_ops.cast(self._learning_rate_power_tensor, var.dtype.base_dtype),
+          use_locking=self._use_locking)
+    else:
+      return training_ops.sparse_apply_ftrl_v2(
+          var,
+          accum,
+          linear,
+          grad.values,
+          grad.indices,
+          math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
+          math_ops.cast(self._l1_regularization_strength_tensor,
+                        var.dtype.base_dtype),
+          math_ops.cast(self._l2_regularization_strength_tensor,
+                        var.dtype.base_dtype),
+          math_ops.cast(self._l2_shrinkage_regularization_strength_tensor,
+                        grad.dtype.base_dtype),
+          math_ops.cast(self._learning_rate_power_tensor, var.dtype.base_dtype),
+          use_locking=self._use_locking)
 
   def _resource_apply_sparse(self, grad, var, indices):
     accum = self.get_slot(var, "accum")
     linear = self.get_slot(var, "linear")
-    return training_ops.resource_sparse_apply_ftrl(
-        var.handle,
-        accum.handle,
-        linear.handle,
-        grad,
-        indices,
-        math_ops.cast(self._learning_rate_tensor, grad.dtype),
-        math_ops.cast(self._l1_regularization_strength_tensor, grad.dtype),
-        math_ops.cast(self._l2_regularization_strength_tensor, grad.dtype),
-        math_ops.cast(self._learning_rate_power_tensor, grad.dtype),
-        use_locking=self._use_locking)
+    if self._l2_shrinkage_regularization_strength <= 0.0:
+      return training_ops.resource_sparse_apply_ftrl(
+          var.handle,
+          accum.handle,
+          linear.handle,
+          grad,
+          indices,
+          math_ops.cast(self._learning_rate_tensor, grad.dtype),
+          math_ops.cast(self._l1_regularization_strength_tensor, grad.dtype),
+          math_ops.cast(self._l2_regularization_strength_tensor, grad.dtype),
+          math_ops.cast(self._learning_rate_power_tensor, grad.dtype),
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_sparse_apply_ftrl_v2(
+          var.handle,
+          accum.handle,
+          linear.handle,
+          grad,
+          indices,
+          math_ops.cast(self._learning_rate_tensor, grad.dtype),
+          math_ops.cast(self._l1_regularization_strength_tensor, grad.dtype),
+          math_ops.cast(self._l2_regularization_strength_tensor, grad.dtype),
+          math_ops.cast(self._l2_shrinkage_regularization_strength_tensor,
+                        grad.dtype),
+          math_ops.cast(self._learning_rate_power_tensor, grad.dtype),
+          use_locking=self._use_locking)
+
diff --git a/tensorflow/python/training/ftrl_test.py b/tensorflow/python/training/ftrl_test.py
index f4cf17f6f03129845f7611fcfe977e8528fa9aaa..775bdb3f60092b966edd182721211095f353d765 100644
--- a/tensorflow/python/training/ftrl_test.py
+++ b/tensorflow/python/training/ftrl_test.py
@@ -179,6 +179,43 @@ class FtrlOptimizerTest(test.TestCase):
         self.assertAllCloseAccordingToType(
             np.array([-0.02406147, -0.04830509]), v1_val)
 
+  def testFtrlWithL1_L2_L2Shrinkage(self):
+    """Test the new FTRL op with support for l2 shrinkage.
+
+    The addition of this parameter which places a constant pressure on weights
+    towards the origin causes the gradient descent trajectory to differ. The
+    weights will tend to have smaller magnitudes with this parameter set.
+    """
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.test_session() as sess:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([4.0, 3.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+        opt = ftrl.FtrlOptimizer(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=2.0,
+            l2_shrinkage_regularization_strength=0.1)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = sess.run([var0, var1])
+        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+        self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
+
+        # Run 10 steps FTRL
+        for _ in range(10):
+          update.run()
+
+        v0_val, v1_val = sess.run([var0, var1])
+        self.assertAllCloseAccordingToType(
+            np.array([-0.22078767, -0.41378114]), v0_val)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.02919818, -0.07343706]), v1_val)
+
   def applyOptimizer(self, opt, dtype, steps=5, is_sparse=False):
     if is_sparse:
       var0 = variables.Variable([[0.0], [0.0]], dtype=dtype)
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index 21183823c237d6b28cbe8bc167cc22009fdb4bb6..704017c244625e171a587789253fdb047cad0599 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -148,7 +148,7 @@ def input_producer(input_tensor,
   """
   with ops.name_scope(name, "input_producer", [input_tensor]):
     input_tensor = ops.convert_to_tensor(input_tensor, name="input_tensor")
-    element_shape = input_tensor.get_shape()[1:].merge_with(element_shape)
+    element_shape = input_tensor.shape[1:].merge_with(element_shape)
     if not element_shape.is_fully_defined():
       raise ValueError("Either `input_tensor` must have a fully defined shape "
                        "or `element_shape` must be specified")
@@ -168,7 +168,7 @@ def input_producer(input_tensor,
             q, [enq], cancel_op=cancel_op))
     if summary_name is not None:
       summary.scalar(summary_name,
-                     math_ops.cast(q.size(), dtypes.float32) * (1. / capacity))
+                     math_ops.to_float(q.size()) * (1. / capacity))
     return q
 
 
@@ -382,7 +382,7 @@ class _SparseMetaData(object):
 
 def _as_tensor_list(tensors):
   if isinstance(tensors, dict):
-    return [tensors[k] for k in sorted(tensors)]
+    return [tensors[k] for k in sorted(tensors, key=str)]
   else:
     return tensors
 
@@ -408,7 +408,7 @@ def _as_original_type(original_tensors, tensor_list):
       # was enqueued.  Make it a list again.  See b/28117485.
       tensor_list = [tensor_list]
     return {k: tensor_list[i]
-            for i, k in enumerate(sorted(original_tensors))}
+            for i, k in enumerate(sorted(original_tensors, key=str))}
   else:
     return tensor_list
 
@@ -465,7 +465,7 @@ def _store_sparse_tensors(tensor_list, enqueue_many, keep_input,
   def _sparse_meta_data(t, storing_op, map_op):
     if not isinstance(t, sparse_tensor.SparseTensor):
       return _SparseMetaData(False, None, None)
-    rank = t.dense_shape.get_shape().with_rank(1)[0]
+    rank = t.dense_shape.shape.with_rank(1)[0]
     if enqueue_many:
       rank -= 1
     # If a shared map_op was provided, use that. Otherwise use the name of
@@ -492,8 +492,15 @@ def _store_sparse_tensors(tensor_list, enqueue_many, keep_input,
           lambda: -1 * array_ops.ones(array_ops.shape(t)[0:1], dtypes.int64))
       out_tensor.set_shape([None])  # necessary when t.ndims is unknown
       return out_tensor
-    if keep_input.get_shape().ndims == 1:
-      t = sparse_ops.sparse_retain(t, keep_input)
+    def _sparse_values_to_keep(t, keep_input):
+      """Convert a per-row `keep_input` vector to a per-value one."""
+      # Get the rows of every value in the sparse Tensor.
+      row_values = array_ops.reshape(
+          t.indices, [array_ops.shape(t.indices)[0], -1])[:, 0]
+      # The value should be kept iff the row should be kept.
+      return array_ops.gather(keep_input, row_values)
+    if keep_input.shape.ndims == 1:
+      t = sparse_ops.sparse_retain(t, _sparse_values_to_keep(t, keep_input))
       store_f = lambda t, name, _: _store_many_sparse(t, shared_name=name)
     elif enqueue_many:
       store_f = _maybe_store_many_sparse
@@ -577,13 +584,13 @@ def _validate_join(tensor_list_list):
 def _validate_keep_input(keep_input, enqueue_many):
   """Validate `keep_input` argument to conditional batching functions."""
   keep_input = ops.convert_to_tensor(keep_input)
-  if keep_input.get_shape().ndims is None:
+  if keep_input.shape.ndims is None:
     raise ValueError(
         "`keep_input` dimensions must be known at graph construction.")
-  if not enqueue_many and keep_input.get_shape().ndims == 1:
+  if not enqueue_many and keep_input.shape.ndims == 1:
     raise ValueError(
         "`keep_input` cannot be a vector when `enqueue_many=False`.")
-  if keep_input.get_shape().ndims > 1:
+  if keep_input.shape.ndims > 1:
     raise ValueError("`keep_input` must be 0 or 1 dimensions.")
   return keep_input
 
@@ -632,18 +639,18 @@ def _shapes(tensor_list_list, shapes, enqueue_many):
 
     for tl in tensor_list_list:
       for i in xrange(len0):
-        if tl[i].get_shape().ndims is None:
+        if tl[i].shape.ndims is None:
           raise ValueError("Cannot infer Tensor's rank: %s" % tl[i])
 
     shapes = [_merge_shapes(
-        [tl[i].get_shape().as_list() for tl in tensor_list_list], enqueue_many)
+        [tl[i].shape.as_list() for tl in tensor_list_list], enqueue_many)
               for i in xrange(len0)]
   return shapes
 
 
 def _select_which_to_enqueue(tensor_list, keep_input):
   """Select which examples to enqueue based on vector `keep_input`."""
-  select_i = math_ops.cast(keep_input, dtypes.int32)
+  select_i = math_ops.to_int32(keep_input)
   tensor_list = [
       data_flow_ops.dynamic_partition(x, select_i, num_partitions=2)[1]
       for x in tensor_list]
@@ -656,7 +663,7 @@ def _enqueue_join(queue, tensor_list_list, enqueue_many, keep_input):
     enqueue_fn = queue.enqueue_many
   else:
     enqueue_fn = queue.enqueue
-  if keep_input.get_shape().ndims == 1:
+  if keep_input.shape.ndims == 1:
     enqueue_ops = [enqueue_fn(_select_which_to_enqueue(x, keep_input))
                    for x in tensor_list_list]
   else:
@@ -673,7 +680,7 @@ def _enqueue(queue, tensor_list, threads, enqueue_many, keep_input):
     enqueue_fn = queue.enqueue_many
   else:
     enqueue_fn = queue.enqueue
-  if keep_input.get_shape().ndims == 1:
+  if keep_input.shape.ndims == 1:
     enqueue_ops = [
         enqueue_fn(_select_which_to_enqueue(tensor_list, keep_input))] * threads
   else:
@@ -707,8 +714,7 @@ def _batch(tensors, batch_size, keep_input, num_threads=1, capacity=32,
         capacity=capacity, dtypes=types, shapes=shapes, shared_name=shared_name)
     _enqueue(queue, tensor_list, num_threads, enqueue_many, keep_input)
     summary.scalar("fraction_of_%d_full" % capacity,
-                   math_ops.cast(queue.size(), dtypes.float32) *
-                   (1. / capacity))
+                   math_ops.to_float(queue.size()) * (1. / capacity))
 
     if allow_smaller_final_batch:
       dequeued = queue.dequeue_up_to(batch_size, name=name)
@@ -742,8 +748,7 @@ def _batch_join(tensors_list, batch_size, keep_input, capacity=32,
         capacity=capacity, dtypes=types, shapes=shapes, shared_name=shared_name)
     _enqueue_join(queue, tensor_list_list, enqueue_many, keep_input)
     summary.scalar("fraction_of_%d_full" % capacity,
-                   math_ops.cast(queue.size(), dtypes.float32) *
-                   (1. / capacity))
+                   math_ops.to_float(queue.size()) * (1. / capacity))
 
     if allow_smaller_final_batch:
       dequeued = queue.dequeue_up_to(batch_size, name=name)
@@ -775,8 +780,8 @@ def _shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
         capacity=capacity, min_after_dequeue=min_after_dequeue, seed=seed,
         dtypes=types, shapes=shapes, shared_name=shared_name)
     _enqueue(queue, tensor_list, num_threads, enqueue_many, keep_input)
-    full = (math_ops.cast(math_ops.maximum(0, queue.size() - min_after_dequeue),
-                          dtypes.float32) *
+    full = (math_ops.to_float(
+        math_ops.maximum(0, queue.size() - min_after_dequeue)) *
             (1. / (capacity - min_after_dequeue)))
     # Note that name contains a '/' at the end so we intentionally do not place
     # a '/' after %s below.
@@ -812,8 +817,8 @@ def _shuffle_batch_join(tensors_list, batch_size, capacity,
         capacity=capacity, min_after_dequeue=min_after_dequeue, seed=seed,
         dtypes=types, shapes=shapes, shared_name=shared_name)
     _enqueue_join(queue, tensor_list_list, enqueue_many, keep_input)
-    full = (math_ops.cast(math_ops.maximum(0, queue.size() - min_after_dequeue),
-                          dtypes.float32) *
+    full = (math_ops.to_float(
+        math_ops.maximum(0, queue.size() - min_after_dequeue)) *
             (1. / (capacity - min_after_dequeue)))
     # Note that name contains a '/' at the end so we intentionally do not place
     # a '/' after %s below.
@@ -879,7 +884,7 @@ def batch(tensors, batch_size, num_threads=1, capacity=32,
   `batch_size` is returned when the queue is closed and there are not enough
   elements to fill the batch, otherwise the pending elements are discarded.
   In addition, all output tensors' static shapes, as accessed via the
-  `get_shape` method will have a first `Dimension` value of `None`, and
+  `shape` property will have a first `Dimension` value of `None`, and
   operations that depend on fixed batch_size would fail.
 
   Args:
@@ -1033,7 +1038,7 @@ def batch_join(tensors_list, batch_size, capacity=32, enqueue_many=False,
   `batch_size` is returned when the queue is closed and there are not enough
   elements to fill the batch, otherwise the pending elements are discarded.
   In addition, all output tensors' static shapes, as accessed via the
-  `get_shape` method will have a first `Dimension` value of `None`, and
+  `shape` property will have a first `Dimension` value of `None`, and
   operations that depend on fixed batch_size would fail.
 
   Args:
@@ -1088,8 +1093,8 @@ def maybe_batch_join(tensors_list, keep_input, batch_size, capacity=32,
       added to the queue or not.  If it is a scalar and evaluates `True`, then
       `tensors` are all added to the queue. If it is a vector and `enqueue_many`
       is `True`, then each example is added to the queue only if the
-      corresponding value in `keep_input` is `True`. This tensor essentially acts
-      as a filtering mechanism.
+      corresponding value in `keep_input` is `True`. This tensor essentially
+      acts as a filtering mechanism.
     batch_size: An integer. The new batch size pulled from the queue.
     capacity: An integer. The maximum number of elements in the queue.
     enqueue_many: Whether each tensor in `tensor_list_list` is a single
@@ -1178,7 +1183,7 @@ def shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
   `batch_size` is returned when the queue is closed and there are not enough
   elements to fill the batch, otherwise the pending elements are discarded.
   In addition, all output tensors' static shapes, as accessed via the
-  `get_shape` method will have a first `Dimension` value of `None`, and
+  `shape` property will have a first `Dimension` value of `None`, and
   operations that depend on fixed batch_size would fail.
 
   Args:
@@ -1239,8 +1244,8 @@ def maybe_shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
       added to the queue or not.  If it is a scalar and evaluates `True`, then
       `tensors` are all added to the queue. If it is a vector and `enqueue_many`
       is `True`, then each example is added to the queue only if the
-      corresponding value in `keep_input` is `True`. This tensor essentially acts
-      as a filtering mechanism.
+      corresponding value in `keep_input` is `True`. This tensor essentially
+      acts as a filtering mechanism.
     num_threads: The number of threads enqueuing `tensor_list`.
     seed: Seed for the random shuffling within the queue.
     enqueue_many: Whether each tensor in `tensor_list` is a single example.
@@ -1320,7 +1325,7 @@ def shuffle_batch_join(tensors_list, batch_size, capacity,
   `batch_size` is returned when the queue is closed and there are not enough
   elements to fill the batch, otherwise the pending elements are discarded.
   In addition, all output tensors' static shapes, as accessed via the
-  `get_shape` method will have a first `Dimension` value of `None`, and
+  `shape` property will have a first `Dimension` value of `None`, and
   operations that depend on fixed batch_size would fail.
 
   Args:
@@ -1381,8 +1386,8 @@ def maybe_shuffle_batch_join(tensors_list, batch_size, capacity,
       added to the queue or not.  If it is a scalar and evaluates `True`, then
       `tensors` are all added to the queue. If it is a vector and `enqueue_many`
       is `True`, then each example is added to the queue only if the
-      corresponding value in `keep_input` is `True`. This tensor essentially acts
-      as a filtering mechanism.
+      corresponding value in `keep_input` is `True`. This tensor essentially
+      acts as a filtering mechanism.
     seed: Seed for the random shuffling within the queue.
     enqueue_many: Whether each tensor in `tensor_list_list` is a single
       example.
diff --git a/tensorflow/python/training/input_test.py b/tensorflow/python/training/input_test.py
index 4f705a0a85e0cb3e6746f8ccb92aeea6dcb79180..3a25bfe34322385e6a1ab1b3da4a6ad17c3208c6 100644
--- a/tensorflow/python/training/input_test.py
+++ b/tensorflow/python/training/input_test.py
@@ -429,6 +429,13 @@ class DictHelperTest(test_lib.TestCase):
     d2 = inp._as_original_type(d, l)
     self.assertEquals(d, d2)
 
+  def testHeterogeneousKeysDictInputs(self):
+    d = {"z": 1, 1: 42, ("a", "b"): 100}
+    l = inp._as_tensor_list(d)
+    self.assertEquals([100, 42, 1], l)
+    d2 = inp._as_original_type(d, l)
+    self.assertEquals(d, d2)
+
 
 class BatchTest(test_lib.TestCase):
 
@@ -896,6 +903,29 @@ class BatchTest(test_lib.TestCase):
         [sparse], keep_input=[True, False], batch_size=2, enqueue_many=True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  def testMaybeBatchCorrectValues(self):
+    sparse_t = sparse_tensor.SparseTensor(
+        indices=[[0, 1], [0, 2], [1, 0], [1, 3]],
+        dense_shape=[2, 4],
+        values=[5, 4, 7, 2])
+    keep = constant_op.constant([True, False])
+    batched = inp.maybe_batch(
+        [sparse_t], keep_input=keep, batch_size=1, enqueue_many=True)
+
+    with self.test_session():
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(coord=coord)
+
+      batched_np = batched.eval()
+
+      coord.request_stop()
+      for thread in threads:
+        thread.join()
+
+    self.assertAllEqual([[0, 1], [0, 2]], batched_np.indices)
+    self.assertAllEqual([5, 4], batched_np.values)
+    self.assertAllEqual([1, 4], batched_np.dense_shape)
+
 
 class BatchJoinTest(test_lib.TestCase):
 
@@ -1450,6 +1480,29 @@ class BatchJoinTest(test_lib.TestCase):
         [[sparse]], keep_input=[True, False], batch_size=2, enqueue_many=True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  def testMaybeBatchCorrectValues(self):
+    sparse = sparse_tensor.SparseTensor(
+        indices=[[0, 1], [0, 2], [1, 0], [1, 3]],
+        dense_shape=[2, 4],
+        values=[5, 4, 7, 2])
+    keep = constant_op.constant([True, False])
+    batched = inp.maybe_batch_join(
+        [[sparse]], keep_input=keep, batch_size=1, enqueue_many=True)
+
+    with self.test_session():
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(coord=coord)
+
+      batched_np = batched.eval()
+
+      coord.request_stop()
+      for thread in threads:
+        thread.join()
+
+    self.assertAllEqual([[0, 1], [0, 2]], batched_np.indices)
+    self.assertAllEqual([5, 4], batched_np.values)
+    self.assertAllEqual([1, 4], batched_np.dense_shape)
+
 
 class ShuffleBatchTest(test_lib.TestCase):
 
diff --git a/tensorflow/python/training/momentum.py b/tensorflow/python/training/momentum.py
index ffd7c12c427aefc531cd785351993cea05a512e1..f34ff22f070af8eadbc4ad3f868b97adaa32f270 100644
--- a/tensorflow/python/training/momentum.py
+++ b/tensorflow/python/training/momentum.py
@@ -52,7 +52,7 @@ class MomentumOptimizer(optimizer.Optimizer):
       name: Optional name prefix for the operations created when applying
         gradients.  Defaults to "Momentum".
       use_nesterov: If `True` use Nesterov Momentum.
-        See [Sutskever et. al., 2013](
+        See [Sutskever et al., 2013](
         http://jmlr.org/proceedings/papers/v28/sutskever13.pdf)
 
     """
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index ff77470a8248d3d2a1869fc1d59df22aab01d721..9f71395c9650983849afe797b6e182b804bbea0d 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -44,6 +44,10 @@ from tensorflow.python.training import session_run_hook
 _PREEMPTION_ERRORS = (errors.AbortedError, errors.UnavailableError)
 
 
+# Value that indicates no value was provided.
+USE_DEFAULT = object()
+
+
 # TODO(touts): Share that with the Supervisor.
 class Scaffold(object):
   """Structure to create or gather pieces commonly needed to train a model.
@@ -88,7 +92,7 @@ class Scaffold(object):
 
   * `init_feed_dict`: A session feed dictionary that should be used when
      running the init op.
-  * `init_fn`: A callable to run run after the init op to perform additional
+  * `init_fn`: A callable to run after the init op to perform additional
     initializations.  The callable will be called as
     `init_fn(scaffold, session)`.
 
@@ -269,8 +273,8 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
                              hooks=None,
                              chief_only_hooks=None,
                              save_checkpoint_secs=600,
-                             save_summaries_steps=100,
-                             save_summaries_secs=None,
+                             save_summaries_steps=USE_DEFAULT,
+                             save_summaries_secs=USE_DEFAULT,
                              config=None,
                              stop_grace_period_secs=120,
                              log_step_count_steps=100):
@@ -301,11 +305,11 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
     save_summaries_steps: The frequency, in number of global steps, that the
       summaries are written to disk using a default summary saver. If both
       `save_summaries_steps` and `save_summaries_secs` are set to `None`, then
-      the default summary saver isn't used.
+      the default summary saver isn't used. Default 100.
     save_summaries_secs: The frequency, in secs, that the summaries are written
       to disk using a default summary saver.  If both `save_summaries_steps` and
       `save_summaries_secs` are set to `None`, then the default summary saver
-      isn't used.
+      isn't used. Default not enabled.
     config: an instance of `tf.ConfigProto` proto used to configure the session.
       It's the `config` argument of constructor of `tf.Session`.
     stop_grace_period_secs: Number of seconds given to threads to stop after
@@ -316,6 +320,14 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
   Returns:
     A `MonitoredSession` object.
   """
+  if save_summaries_steps == USE_DEFAULT and save_summaries_secs == USE_DEFAULT:
+    save_summaries_steps = 100
+    save_summaries_secs = None
+  elif save_summaries_secs == USE_DEFAULT:
+    save_summaries_secs = None
+  elif save_summaries_steps == USE_DEFAULT:
+    save_summaries_steps = None
+
   scaffold = scaffold or Scaffold()
   if not is_chief:
     session_creator = WorkerSessionCreator(
@@ -523,7 +535,7 @@ class _MonitoredSession(object):
     # __exit__ should return True to suppress an exception.
     return exception_type is None
 
-  class _CoordinatedSessionCreator(object):
+  class _CoordinatedSessionCreator(SessionCreator):
     """Factory for the _RecoverableSession."""
 
     def __init__(self, session_creator, hooks, stop_grace_period_secs):
@@ -563,7 +575,7 @@ class _MonitoredSession(object):
           ops.get_default_graph()._unsafe_unfinalize()  # pylint: disable=protected-access
 
   def _is_closed(self):
-    """Return True if the supervised session is closed.  For tests only.
+    """Return True if the monitored session is closed.  For tests only.
 
     Returns:
       A boolean.
@@ -714,7 +726,8 @@ class SingularMonitoredSession(_MonitoredSession):
                master='',
                config=None,
                checkpoint_dir=None,
-               stop_grace_period_secs=120):
+               stop_grace_period_secs=120,
+               checkpoint_filename_with_path=None):
     """Creates a SingularMonitoredSession.
 
     Args:
@@ -727,12 +740,15 @@ class SingularMonitoredSession(_MonitoredSession):
         variables.
       stop_grace_period_secs: Number of seconds given to threads to stop after
         `close()` has been called.
+      checkpoint_filename_with_path: A string. Optional path to a checkpoint
+        file from which to restore variables.
     """
     session_creator = ChiefSessionCreator(
         scaffold=scaffold,
         master=master,
         config=config,
-        checkpoint_dir=checkpoint_dir)
+        checkpoint_dir=checkpoint_dir,
+        checkpoint_filename_with_path=checkpoint_filename_with_path)
     super(SingularMonitoredSession, self).__init__(
         session_creator, hooks, should_recover=False,
         stop_grace_period_secs=stop_grace_period_secs)
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index b31027ca3cc23767ddba18ffe677d7c6b5e1fe83..4dbe820e5a237befeeaa40f595b5311d7f0b364d 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -51,8 +51,8 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
     variable: A Variable.
     value: A tensor with the same shape as 'variable'.
     decay: A float Tensor or float value.  The moving average decay.
-    zero_debias: A python bool. If true, assume the variable is 0-initialized and
-      unbias it, as in https://arxiv.org/abs/1412.6980. See docstring in
+    zero_debias: A python bool. If true, assume the variable is 0-initialized
+      and unbias it, as in https://arxiv.org/abs/1412.6980. See docstring in
       `_zero_debias` for more details.
     name: Optional name of the returned operation.
 
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 20d520fd7b5703332488cd513a179a3fbec06ded..d9304ff50a9ea9f8f257c47d4b6c2867bbc316d8 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -1547,9 +1547,14 @@ class Saver(object):
     Args:
       sess: A `Session` to use to restore the parameters.
       save_path: Path where parameters were previously saved.
+
+    Raises:
+      ValueError: If save_path is None.
     """
     if self._is_empty:
       return
+    if save_path is None:
+      raise ValueError("Can't load save_path when it is None.")
     logging.info("Restoring parameters from %s", save_path)
     sess.run(self.saver_def.restore_op_name,
              {self.saver_def.filename_tensor_name: save_path})
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index d17b7e93a108983f6d7d66c4d99ef5304afc8487..f351efd8433c1a4dca0854778d06ccb22ff2fa9f 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -1951,7 +1951,8 @@ class MetaGraphTest(test.TestCase):
                                                         logits=logit)
         adam.AdamOptimizer().minimize(cost, name="optimize")
       meta_graph_def = saver_module.export_meta_graph(clear_devices=True)
-      graph_io.write_graph(meta_graph_def, "/tmp", "meta_graph.pbtxt")
+      graph_io.write_graph(meta_graph_def, self.get_temp_dir(),
+                           "meta_graph.pbtxt")
 
     with session.Session(graph=ops_lib.Graph()) as sess:
       saver_module.import_meta_graph(meta_graph_def, import_scope="new_model")
diff --git a/tensorflow/python/training/session_manager_test.py b/tensorflow/python/training/session_manager_test.py
index 4dc1d5abb71f9d7b8d63da016876bcec84edd9eb..5879fd330adec58dde45f3da8ae16c9a297f3b24 100644
--- a/tensorflow/python/training/session_manager_test.py
+++ b/tensorflow/python/training/session_manager_test.py
@@ -448,8 +448,8 @@ class SessionManagerTest(test.TestCase):
           ready_op=variables.report_uninitialized_variables(),
           ready_for_local_init_op=None,
           local_init_op=w.initializer)
-    with self.assertRaisesRegexp(errors_impl.FailedPreconditionError,
-                                 "Attempting to use uninitialized value v"):
+    with self.assertRaisesRegexp(errors_impl.DeadlineExceededError,
+                                 "Session was not ready after waiting.*"):
       sm.wait_for_session("", max_wait_secs=3)
 
   def testPrepareSessionWithReadyForLocalInitOp(self):
@@ -460,14 +460,20 @@ class SessionManagerTest(test.TestCase):
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
           name="w")
+      x = variables.Variable(
+          3 * v,
+          trainable=False,
+          collections=[ops.GraphKeys.LOCAL_VARIABLES],
+          name="x")
       with self.test_session():
         self.assertEqual(False, variables.is_variable_initialized(v).eval())
         self.assertEqual(False, variables.is_variable_initialized(w).eval())
+        self.assertEqual(False, variables.is_variable_initialized(x).eval())
       sm2 = session_manager.SessionManager(
           ready_op=variables.report_uninitialized_variables(),
           ready_for_local_init_op=variables.report_uninitialized_variables(
               variables.global_variables()),
-          local_init_op=w.initializer)
+          local_init_op=[w.initializer, x.initializer])
       sess = sm2.prepare_session("", init_op=v.initializer)
       self.assertEqual(
           True,
@@ -477,8 +483,78 @@ class SessionManagerTest(test.TestCase):
           True,
           variables.is_variable_initialized(
               sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
+      self.assertEqual(
+          True,
+          variables.is_variable_initialized(
+              sess.graph.get_tensor_by_name("x:0")).eval(session=sess))
       self.assertEquals(1, sess.run(v))
       self.assertEquals(1, sess.run(w))
+      self.assertEquals(3, sess.run(x))
+
+  def testPrepareSessionWithPartialInitOp(self):
+    with ops.Graph().as_default():
+      v = variables.Variable(1, name="v")
+      w = variables.Variable(
+          v,
+          trainable=False,
+          collections=[ops.GraphKeys.LOCAL_VARIABLES],
+          name="w")
+      x = variables.Variable(
+          3 * v,
+          trainable=False,
+          collections=[ops.GraphKeys.LOCAL_VARIABLES],
+          name="x")
+      v_res = variables.Variable(1, name="v_res")
+      w_res = variables.Variable(
+          v_res,
+          trainable=False,
+          collections=[ops.GraphKeys.LOCAL_VARIABLES],
+          name="w_res")
+      x_res = variables.Variable(
+          3 * v_res,
+          trainable=False,
+          collections=[ops.GraphKeys.LOCAL_VARIABLES],
+          name="x_res")
+
+      with self.test_session():
+        self.assertEqual(False, variables.is_variable_initialized(v).eval())
+        self.assertEqual(False, variables.is_variable_initialized(w).eval())
+        self.assertEqual(False, variables.is_variable_initialized(x).eval())
+        self.assertEqual(False, variables.is_variable_initialized(v_res).eval())
+        self.assertEqual(False, variables.is_variable_initialized(w_res).eval())
+        self.assertEqual(False, variables.is_variable_initialized(x_res).eval())
+      sm2 = session_manager.SessionManager(local_init_op=[
+          w.initializer, x.initializer, w_res.initializer, x_res.initializer
+      ])
+      sess = sm2.prepare_session("", init_op=None)
+      self.assertEqual(
+          False,
+          variables.is_variable_initialized(
+              sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
+      self.assertEqual(
+          True,
+          variables.is_variable_initialized(
+              sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
+      self.assertEqual(
+          True,
+          variables.is_variable_initialized(
+              sess.graph.get_tensor_by_name("x:0")).eval(session=sess))
+      self.assertEquals(1, sess.run(w))
+      self.assertEquals(3, sess.run(x))
+      self.assertEqual(
+          False,
+          variables.is_variable_initialized(
+              sess.graph.get_tensor_by_name("v_res:0")).eval(session=sess))
+      self.assertEqual(
+          True,
+          variables.is_variable_initialized(
+              sess.graph.get_tensor_by_name("w_res:0")).eval(session=sess))
+      self.assertEqual(
+          True,
+          variables.is_variable_initialized(
+              sess.graph.get_tensor_by_name("x_res:0")).eval(session=sess))
+      self.assertEquals(1, sess.run(w_res))
+      self.assertEquals(3, sess.run(x_res))
 
   def testPrepareSessionDidNotInitLocalVariable(self):
     with ops.Graph().as_default():
@@ -493,8 +569,8 @@ class SessionManagerTest(test.TestCase):
         self.assertEqual(False, variables.is_variable_initialized(w).eval())
       sm2 = session_manager.SessionManager(
           ready_op=variables.report_uninitialized_variables())
-      with self.assertRaisesRegexp(RuntimeError,
-                                   "Init operations did not make model ready"):
+      with self.assertRaisesRegexp(
+          RuntimeError, "Init operations did not make model ready.*"):
         sm2.prepare_session("", init_op=v.initializer)
 
   def testPrepareSessionDidNotInitLocalVariableList(self):
@@ -550,8 +626,8 @@ class SessionManagerTest(test.TestCase):
           ready_op=variables.report_uninitialized_variables(),
           ready_for_local_init_op=None,
           local_init_op=w.initializer)
-    with self.assertRaisesRegexp(errors_impl.FailedPreconditionError,
-                                 "Attempting to use uninitialized value v"):
+    with self.assertRaisesRegexp(RuntimeError,
+                                 "Init operations did not make model ready.*"):
       sm2.prepare_session("", init_op=None)
 
 
diff --git a/tensorflow/python/training/session_run_hook.py b/tensorflow/python/training/session_run_hook.py
index 09da63eb68712bf10f9c09f3e15ae6170d366a79..dbeabd250e7d90ad5fe4e602cc790f0314e00bf3 100644
--- a/tensorflow/python/training/session_run_hook.py
+++ b/tensorflow/python/training/session_run_hook.py
@@ -42,23 +42,28 @@ For more specific needs, you can create custom hooks:
       print('Starting the session.')
       self.your_tensor = ...
 
-    def end(self, session):
-      print('Done with the session.')
+    def after_create_session(self, session, coord):
+      # When this is called, the graph is finalized and
+      # ops can no longer be added to the graph.
+      print('Session created.')
 
     def before_run(self, run_context):
-      print('before calling session.run)
+      print('Before calling session.run().')
       return SessionRunArgs(self.your_tensor)
 
-    def after_run(self, run_context, run_values)
+    def after_run(self, run_context, run_values):
       print('Done running one step. The value of my tensor: %s',
             run_values.results)
       if you-need-to-stop-loop:
         run_context.request_stop()
 
+    def end(self, session):
+      print('Done with the session.')
+
 To understand how hooks interact with calls to `MonitoredSession.run()`,
 look at following code:
-  with SupervisedSession(hooks=your_hooks, ...) as sess
-    while not sess.should_stop()
+  with MonitoredTrainingSession(hooks=your_hooks, ...) as sess:
+    while not sess.should_stop():
       sess.run(your_fetches)
 
 Above user code leads to following execution:
@@ -68,7 +73,7 @@ Above user code leads to following execution:
   while not stop is requested:
     call hooks.before_run()
     try:
-      results = sess.run(merged_fetches)
+      results = sess.run(merged_fetches, feed_dict=merged_feeds)
     except (errors.OutOfRangeError, StopIteration):
       break
     call hooks.after_run()
diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py
index 230ed1db6874da6bbb106f687da616cda1f896f9..cfdd03dc1505204b40e06e0c20f454397890eec8 100644
--- a/tensorflow/python/training/supervisor.py
+++ b/tensorflow/python/training/supervisor.py
@@ -147,14 +147,14 @@ class Supervisor(object):
   Example: Start a thread to print losses.  We want this thread to run
   every 60 seconds, so we launch it with `sv.loop()`.
 
-    ```python
-    ...
-    sv = Supervisor(logdir='/tmp/mydir')
-    with sv.managed_session(FLAGS.master) as sess:
-      sv.loop(60, print_loss, (sess, ))
-      while not sv.should_stop():
-        sess.run(my_train_op)
-    ```
+  ```python
+  ...
+  sv = Supervisor(logdir='/tmp/mydir')
+  with sv.managed_session(FLAGS.master) as sess:
+    sv.loop(60, print_loss, (sess, ))
+    while not sv.should_stop():
+      sess.run(my_train_op)
+  ```
 
   ##### Launching fewer services
 
@@ -166,22 +166,22 @@ class Supervisor(object):
 
   Example: Create summaries manually every 100 steps in the chief.
 
-    ```python
-    # Create a Supervisor with no automatic summaries.
-    sv = Supervisor(logdir='/tmp/mydir', is_chief=is_chief, summary_op=None)
-    # As summary_op was None, managed_session() does not start the
-    # summary thread.
-    with sv.managed_session(FLAGS.master) as sess:
-      for step in xrange(1000000):
-        if sv.should_stop():
-          break
-        if is_chief and step % 100 == 0:
-          # Create the summary every 100 chief steps.
-          sv.summary_computed(sess, sess.run(my_summary_op))
-        else:
-          # Train normally
-          sess.run(my_train_op)
-    ```
+  ```python
+  # Create a Supervisor with no automatic summaries.
+  sv = Supervisor(logdir='/tmp/mydir', is_chief=is_chief, summary_op=None)
+  # As summary_op was None, managed_session() does not start the
+  # summary thread.
+  with sv.managed_session(FLAGS.master) as sess:
+    for step in xrange(1000000):
+      if sv.should_stop():
+        break
+      if is_chief and step % 100 == 0:
+        # Create the summary every 100 chief steps.
+        sv.summary_computed(sess, sess.run(my_summary_op))
+      else:
+        # Train normally
+        sess.run(my_train_op)
+  ```
 
   ##### Custom model initialization
 
diff --git a/tensorflow/python/training/sync_replicas_optimizer_test.py b/tensorflow/python/training/sync_replicas_optimizer_test.py
index 15f938df8c337c70e0d7b59a6c85613efa2eeb3d..85e8a8a4bb1dba6f0c4e7e1059b07816d893347f 100644
--- a/tensorflow/python/training/sync_replicas_optimizer_test.py
+++ b/tensorflow/python/training/sync_replicas_optimizer_test.py
@@ -20,41 +20,15 @@ from __future__ import print_function
 
 import time
 
-import portpicker
-
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework.test_util import create_local_cluster
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
-from tensorflow.python.training import server_lib
 from tensorflow.python.training import training
 
 
-def create_local_cluster(num_workers, num_ps, protocol="grpc"):
-  """Create local GRPC servers and return them."""
-  worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
-  ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
-  cluster_dict = {
-      "worker": ["localhost:%s" % port for port in worker_ports],
-      "ps": ["localhost:%s" % port for port in ps_ports]
-  }
-  cs = server_lib.ClusterSpec(cluster_dict)
-
-  workers = [
-      server_lib.Server(
-          cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
-      for ix in range(num_workers)
-  ]
-  ps_servers = [
-      server_lib.Server(
-          cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
-      for ix in range(num_ps)
-  ]
-
-  return workers, ps_servers
-
-
 # Creates the workers and return their sessions, graphs, train_ops.
 def get_workers(num_workers, replicas_to_aggregate, workers):
   sessions = []
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index f4ac3c9758712182d2aee26a1a53c83e92e97b63..e2a7b28e2bc96ea57274f9e8d31a41f4ea857f1a 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -85,6 +85,10 @@ See the @{$python/train} guide.
 @@create_global_step
 @@assert_global_step
 @@write_graph
+@@load_checkpoint
+@@load_variable
+@@list_variables
+@@init_from_checkpoint
 """
 
 # Optimizers.
@@ -142,6 +146,11 @@ from tensorflow.python.training.basic_session_run_hooks import GlobalStepWaiterH
 from tensorflow.python.training.basic_session_run_hooks import FinalOpsHook
 from tensorflow.python.training.basic_session_run_hooks import FeedFnHook
 from tensorflow.python.training.basic_loops import basic_train_loop
+from tensorflow.python.training.checkpoint_utils import init_from_checkpoint
+from tensorflow.python.training.checkpoint_utils import list_variables
+from tensorflow.python.training.checkpoint_utils import load_checkpoint
+from tensorflow.python.training.checkpoint_utils import load_variable
+
 from tensorflow.python.training.device_setter import replica_device_setter
 from tensorflow.python.training.monitored_session import Scaffold
 from tensorflow.python.training.monitored_session import MonitoredTrainingSession
diff --git a/tensorflow/python/util/deprecation_test.py b/tensorflow/python/util/deprecation_test.py
index e2d9a594a35b93279d1a298dcaa1cec915bf7442..16d246c4df4c52e8f029cb1793726fb65407a2cd 100644
--- a/tensorflow/python/util/deprecation_test.py
+++ b/tensorflow/python/util/deprecation_test.py
@@ -608,7 +608,7 @@ class DeprecatedArgsTest(test.TestCase):
     self._assert_subset(set(["after " + date, instructions, "d2"]),
                         set(args2[1:]))
 
-    # Assert calls with the deprecated arguments dont log warnings if
+    # Assert calls with the deprecated arguments don't log warnings if
     # the value matches the 'ok_val'.
     mock_warning.reset_mock()
     self.assertEqual(3, _fn(1, None, 2, d2="my_ok_val"))
diff --git a/tensorflow/python/util/example_parser_configuration.py b/tensorflow/python/util/example_parser_configuration.py
index a3750851769a31466eebba5cfd5e665f4cbc4f9c..e3fdcf956e543c516335762a7c47e5547256a2a7 100644
--- a/tensorflow/python/util/example_parser_configuration.py
+++ b/tensorflow/python/util/example_parser_configuration.py
@@ -65,15 +65,15 @@ def extract_example_parser_configuration(parse_example_op, sess):
 
   # Fetch total_features key names and num_dense default values.
   if len(fetch_list) != (total_features + num_dense):
-    raise ValueError("len(fetch_list) does not match total features + num_dense"
-                     "(%d vs %d" % (len(fetch_list),
-                                    (total_features + num_dense)))
+    raise ValueError("len(fetch_list) does not match total features + "
+                     "num_dense (%d vs %d)" %
+                     (len(fetch_list), (total_features + num_dense)))
 
   fetched = sess.run(fetch_list)
 
   if len(fetched) != len(fetch_list):
-    raise ValueError("len(fetched) does not match len(fetch_list)"
-                     "(%d vs %d" % (len(fetched), len(fetch_list)))
+    raise ValueError("len(fetched) does not match len(fetch_list) "
+                     "(%d vs %d)" % (len(fetched), len(fetch_list)))
 
   # Fetch indices.
   sparse_keys_start = 0
diff --git a/tensorflow/python/util/kernel_registry.cc b/tensorflow/python/util/kernel_registry.cc
index d451bbace2c1cb9b3815b3ec788f69b181d98437..7d47692f6b5553c0e3d92b965a54d90eeb00bc13 100644
--- a/tensorflow/python/util/kernel_registry.cc
+++ b/tensorflow/python/util/kernel_registry.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/python/util/kernel_registry.h"
 
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index 01b3dfacd145ab6035625f5aa7ffbfa5c1fc80f5..f3f3887afc5ea3b3edf530a177cb828468843679 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -15,11 +15,13 @@
 
 """## Functions for working with arbitrarily nested sequences of elements.
 
-This module is used to perform any operations on nested structures. A nested
-structure is a Python sequence that contains non-sequence elements or other
-sequences. The utilities here assume (and do not check) that the nested
-structures form a 'tree', i.e. no references in the structure of the input of
-these functions should be recursive.
+This module can perform operations on nested structures. A nested structure is a
+Python sequence, tuple (including `namedtuple`), or dict that can contain
+further sequences, tuples, and dicts.
+
+The utilities here assume (and do not check) that the nested structures form a
+'tree', i.e., no references in the structure of the input of these functions
+should be recursive.
 
 Example structures: `((3, 4), 5, (6, 7, (9, 10), 8))`, `(np.array(0),
   (np.array([3, 4]), tf.constant([3, 4])))`
@@ -33,6 +35,7 @@ import collections as _collections
 
 import six as _six
 
+from tensorflow.python.platform import tf_logging as _tf_logging
 from tensorflow.python.util.all_util import remove_undocumented
 
 
@@ -40,16 +43,23 @@ def _sequence_like(instance, args):
   """Converts the sequence `args` to the same type as `instance`.
 
   Args:
-    instance: an instance of `tuple`, `list`, or a `namedtuple` class.
+    instance: an instance of `tuple`, `list`, `namedtuple`, `dict`, or
+        `collections.NamedDict`.
     args: elements to be converted to a sequence.
 
   Returns:
     `args` with the type of `instance`.
   """
-  if (isinstance(instance, tuple) and
-      hasattr(instance, "_fields") and
-      isinstance(instance._fields, _collections.Sequence) and
-      all(isinstance(f, _six.string_types) for f in instance._fields)):
+  if isinstance(instance, dict):
+    # For dictionaries with their values extracted, we always order the values
+    # by sorting the keys first (see note below). This code allows recreating
+    # e.g., `OrderedDict`s with their original key ordering.
+    result = dict(zip(sorted(_six.iterkeys(instance)), args))
+    return type(instance)((key, result[key]) for key in _six.iterkeys(instance))
+  elif (isinstance(instance, tuple) and
+        hasattr(instance, "_fields") and
+        isinstance(instance._fields, _collections.Sequence) and
+        all(isinstance(f, _six.string_types) for f in instance._fields)):
     # This is a namedtuple
     return type(instance)(*args)
   else:
@@ -57,8 +67,21 @@ def _sequence_like(instance, args):
     return type(instance)(args)
 
 
+def _yield_value(iterable):
+  if isinstance(iterable, dict):
+    # Iterate through dictionaries in a deterministic order. Note: we
+    # intentionally ignore the order in an `OrderedDict` because of the
+    # potential to introduce bugs if the user mixes ordered and plain dicts with
+    # the same keys. (This is based on experience.)
+    for key in sorted(_six.iterkeys(iterable)):
+      yield iterable[key]
+  else:
+    for value in iterable:
+      yield value
+
+
 def _yield_flat_nest(nest):
-  for n in nest:
+  for n in _yield_value(nest):
     if is_sequence(n):
       for ni in _yield_flat_nest(n):
         yield ni
@@ -66,6 +89,17 @@ def _yield_flat_nest(nest):
       yield n
 
 
+# Used by `_warn_once` to remember which warning messages have been given.
+_ALREADY_WARNED = {}
+
+
+def _warn_once(message):
+  """Logs a warning message, once per unique string."""
+  if message not in _ALREADY_WARNED:
+    _ALREADY_WARNED[message] = True
+    _tf_logging.warning(message)
+
+
 def is_sequence(seq):
   """Returns a true if its input is a collections.Sequence (except strings).
 
@@ -73,8 +107,14 @@ def is_sequence(seq):
     seq: an input sequence.
 
   Returns:
-    True if the sequence is a not a string and is a collections.Sequence.
+    True if the sequence is a not a string and is a collections.Sequence or a
+    dict.
   """
+  if isinstance(seq, dict):
+    return True
+  if isinstance(seq, set):
+    _warn_once("Sets are not currently considered sequences, but this may "
+               "change in the future, so consider avoiding using them.")
   return (isinstance(seq, _collections.Sequence)
           and not isinstance(seq, _six.string_types))
 
@@ -82,36 +122,55 @@ def is_sequence(seq):
 def flatten(nest):
   """Returns a flat sequence from a given nested structure.
 
-  If `nest` is not a sequence, this returns a single-element list: `[nest]`.
+  If `nest` is not a sequence, tuple, or dict, then returns a single-element
+  list: `[nest]`.
 
   Args:
-    nest: an arbitrarily nested structure or a scalar object.
-      Note, numpy arrays are considered scalars.
+    nest: an arbitrarily nested structure or a scalar object. Note, numpy
+        arrays are considered scalars.
 
   Returns:
     A Python list, the flattened version of the input.
   """
-  return list(_yield_flat_nest(nest)) if is_sequence(nest) else [nest]
+  if is_sequence(nest):
+    return list(_yield_flat_nest(nest))
+  else:
+    return [nest]
 
 
 def _recursive_assert_same_structure(nest1, nest2, check_types):
+  """Helper function for `assert_same_structure`."""
   is_sequence_nest1 = is_sequence(nest1)
   if is_sequence_nest1 != is_sequence(nest2):
     raise ValueError(
-        "The two structures don't have the same nested structure. "
-        "First structure: %s, second structure: %s." % (nest1, nest2))
+        "The two structures don't have the same nested structure.\n\n"
+        "First structure: %s\n\nSecond structure: %s." % (nest1, nest2))
 
-  if is_sequence_nest1:
+  if not is_sequence_nest1:
+    return  # finished checking
+
+  if check_types:
     type_nest1 = type(nest1)
     type_nest2 = type(nest2)
-    if check_types and type_nest1 != type_nest2:
+    if type_nest1 != type_nest2:
       raise TypeError(
           "The two structures don't have the same sequence type. First "
           "structure has type %s, while second structure has type %s."
           % (type_nest1, type_nest2))
 
-    for n1, n2 in zip(nest1, nest2):
-      _recursive_assert_same_structure(n1, n2, check_types)
+    if isinstance(nest1, dict):
+      keys1 = set(_six.iterkeys(nest1))
+      keys2 = set(_six.iterkeys(nest2))
+      if keys1 != keys2:
+        raise ValueError(
+            "The two dictionaries don't have the same set of keys. First "
+            "structure has keys {}, while second structure has keys {}."
+            .format(keys1, keys2))
+
+  nest1_as_sequence = [n for n in _yield_value(nest1)]
+  nest2_as_sequence = [n for n in _yield_value(nest2)]
+  for n1, n2 in zip(nest1_as_sequence, nest2_as_sequence):
+    _recursive_assert_same_structure(n1, n2, check_types)
 
 
 def assert_same_structure(nest1, nest2, check_types=True):
@@ -121,8 +180,9 @@ def assert_same_structure(nest1, nest2, check_types=True):
     nest1: an arbitrarily nested structure.
     nest2: an arbitrarily nested structure.
     check_types: if `True` (default) types of sequences are checked as
-      well. If set to `False`, for example a list and a tuple of objects will
-      look same if they have the same size.
+        well, including the keys of dictionaries. If set to `False`, for example
+        a list and a tuple of objects will look the same if they have the same
+        size.
 
   Raises:
     ValueError: If the two structures do not have the same number of elements or
@@ -134,8 +194,9 @@ def assert_same_structure(nest1, nest2, check_types=True):
   len_nest2 = len(flatten(nest2)) if is_sequence(nest2) else 1
   if len_nest1 != len_nest2:
     raise ValueError("The two structures don't have the same number of "
-                     "elements. First structure: %s, second structure: %s."
-                     % (nest1, nest2))
+                     "elements.\n\nFirst structure (%i elements): %s\n\n"
+                     "Second structure (%i elements): %s"
+                     % (len_nest1, nest1, len_nest2, nest2))
   _recursive_assert_same_structure(nest1, nest2, check_types)
 
 
@@ -196,10 +257,10 @@ def flatten_dict_items(dictionary):
 
 
 def _packed_nest_with_indices(structure, flat, index):
-  """Helper function for pack_nest_as.
+  """Helper function for pack_sequence_as.
 
   Args:
-    structure: Substructure (tuple of elements and/or tuples) to mimic
+    structure: Substructure (list / tuple / dict) to mimic.
     flat: Flattened values to output substructure for.
     index: Index at which to start reading from flat.
 
@@ -215,7 +276,7 @@ def _packed_nest_with_indices(structure, flat, index):
       (assuming indexing starts from `index`).
   """
   packed = []
-  for s in structure:
+  for s in _yield_value(structure):
     if is_sequence(s):
       new_index, child = _packed_nest_with_indices(s, flat, index)
       packed.append(_sequence_like(s, child))
@@ -233,8 +294,9 @@ def pack_sequence_as(structure, flat_sequence):
   in this case the return value is `flat_sequence[0]`.
 
   Args:
-    structure: tuple or list constructed of scalars and/or other tuples/lists,
-      or a scalar.  Note: numpy arrays are considered scalars.
+    structure: Nested structure, whose structure is given by nested lists,
+        tuples, and dicts. Note: numpy arrays and strings are considered
+        scalars.
     flat_sequence: flat sequence to pack.
 
   Returns:
@@ -272,9 +334,9 @@ def map_structure(func, *structure, **check_types_dict):
   and the return value will contain the results in the same structure.
 
   Args:
-    func: A callable that acceps as many arguments are there are structures.
+    func: A callable that accepts as many arguments as there are structures.
     *structure: scalar, or tuple or list of constructed scalars and/or other
-      tuples/lists, or scalars.  Note: numpy arrays are considered scalars.
+      tuples/lists, or scalars.  Note: numpy arrays are considered  as scalars.
     **check_types_dict: only valid keyword argument is `check_types`. If set to
       `True` (default) the types of iterables within the  structures have to be
       same (e.g. `map_structure(func, [1], (1,))` raises a `TypeError`
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index 8a17d990da216538172936eff34025cd83772df9..375e30e9534a90568ed7a476b8b8868d394450bc 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -65,11 +65,80 @@ class NestTest(test.TestCase):
     with self.assertRaises(ValueError):
       nest.pack_sequence_as([5, 6, [7, 8]], ["a", "b", "c"])
 
+  def testFlattenAndPack_withDicts(self):
+    # A nice messy mix of tuples, lists, dicts, and `OrderedDict`s.
+    named_tuple = collections.namedtuple("A", ("b", "c"))
+    mess = [
+        "z",
+        named_tuple(3, 4),
+        {
+            "c": [
+                1,
+                collections.OrderedDict([
+                    ("b", 3),
+                    ("a", 2),
+                ]),
+            ],
+            "b": 5
+        },
+        17
+    ]
+
+    flattened = nest.flatten(mess)
+    self.assertEqual(flattened, ["z", 3, 4, 5, 1, 2, 3, 17])
+
+    structure_of_mess = [
+        14,
+        named_tuple("a", True),
+        {
+            "c": [
+                0,
+                collections.OrderedDict([
+                    ("b", 9),
+                    ("a", 8),
+                ]),
+            ],
+            "b": 3
+        },
+        "hi everybody",
+    ]
+
+    unflattened = nest.pack_sequence_as(structure_of_mess, flattened)
+    self.assertEqual(unflattened, mess)
+
+    # Check also that the OrderedDict was created, with the correct key order.
+    unflattened_ordered_dict = unflattened[2]["c"][1]
+    self.assertIsInstance(unflattened_ordered_dict, collections.OrderedDict)
+    self.assertEqual(list(unflattened_ordered_dict.keys()), ["b", "a"])
+
+  def testFlatten_numpyIsNotFlattened(self):
+    structure = np.array([1, 2, 3])
+    flattened = nest.flatten(structure)
+    self.assertEqual(len(flattened), 1)
+
+  def testFlatten_stringIsNotFlattened(self):
+    structure = "lots of letters"
+    flattened = nest.flatten(structure)
+    self.assertEqual(len(flattened), 1)
+
+  def testPackSequenceAs_notIterableError(self):
+    with self.assertRaisesRegexp(TypeError,
+                                 "flat_sequence must be a sequence"):
+      nest.pack_sequence_as("hi", "bye")
+
+  def testPackSequenceAs_wrongLengthsError(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Structure had 2 elements, but flat_sequence had 3 elements."):
+      nest.pack_sequence_as(["hello", "world"],
+                            ["and", "goodbye", "again"])
+
   def testIsSequence(self):
     self.assertFalse(nest.is_sequence("1234"))
     self.assertTrue(nest.is_sequence([1, 3, [4, 5]]))
     self.assertTrue(nest.is_sequence(((7, 8), (5, 6))))
     self.assertTrue(nest.is_sequence([]))
+    self.assertTrue(nest.is_sequence({"a": 1, "b": 2}))
     self.assertFalse(nest.is_sequence(set([1, 2])))
     ones = array_ops.ones([2, 3])
     self.assertFalse(nest.is_sequence(ones))
@@ -103,22 +172,33 @@ class NestTest(test.TestCase):
     nest.assert_same_structure("abc", np.array([0, 1]))
     nest.assert_same_structure("abc", constant_op.constant([0, 1]))
 
-    with self.assertRaisesRegexp(ValueError,
-                                 "don't have the same number of elements"):
+    with self.assertRaisesRegexp(
+        ValueError,
+        ("don't have the same number of elements\\.\n\n"
+         "First structure \\(6 elements\\):.*?"
+         "\n\nSecond structure \\(2 elements\\):")):
       nest.assert_same_structure(structure1, structure_different_num_elements)
 
-    with self.assertRaisesRegexp(ValueError,
-                                 "don't have the same number of elements"):
+    with self.assertRaisesRegexp(
+        ValueError,
+        ("don't have the same number of elements\\.\n\n"
+         "First structure \\(2 elements\\):.*?"
+         "\n\nSecond structure \\(1 elements\\):")):
       nest.assert_same_structure([0, 1], np.array([0, 1]))
 
-    with self.assertRaisesRegexp(ValueError,
-                                 "don't have the same number of elements"):
+    with self.assertRaisesRegexp(
+        ValueError,
+        ("don't have the same number of elements\\.\n\n"
+         "First structure \\(1 elements\\):.*"
+         "\n\nSecond structure \\(2 elements\\):")):
       nest.assert_same_structure(0, [0, 1])
 
     self.assertRaises(TypeError, nest.assert_same_structure, (0, 1), [0, 1])
 
-    with self.assertRaisesRegexp(ValueError,
-                                 "don't have the same nested structure"):
+    with self.assertRaisesRegexp(
+        ValueError,
+        ("don't have the same nested structure\\.\n\n"
+         "First structure: .*?\n\nSecond structure: ")):
       nest.assert_same_structure(structure1, structure_different_nesting)
 
     named_type_0 = collections.namedtuple("named_0", ("a", "b"))
@@ -131,12 +211,16 @@ class NestTest(test.TestCase):
     self.assertRaises(TypeError, nest.assert_same_structure,
                       named_type_0(3, 4), named_type_1(3, 4))
 
-    with self.assertRaisesRegexp(ValueError,
-                                 "don't have the same nested structure"):
+    with self.assertRaisesRegexp(
+        ValueError,
+        ("don't have the same nested structure\\.\n\n"
+         "First structure: .*?\n\nSecond structure: ")):
       nest.assert_same_structure(named_type_0(3, 4), named_type_0([3], 4))
 
-    with self.assertRaisesRegexp(ValueError,
-                                 "don't have the same nested structure"):
+    with self.assertRaisesRegexp(
+        ValueError,
+        ("don't have the same nested structure\\.\n\n"
+         "First structure: .*?\n\nSecond structure: ")):
       nest.assert_same_structure([[3], 4], [3, [4]])
 
     structure1_list = [[[1, 2], 3], 4, [5, 6]]
@@ -146,6 +230,10 @@ class NestTest(test.TestCase):
     nest.assert_same_structure(structure1, structure2, check_types=False)
     nest.assert_same_structure(structure1, structure1_list, check_types=False)
 
+    with self.assertRaisesRegexp(ValueError,
+                                 "don't have the same set of keys"):
+      nest.assert_same_structure({"a": 1}, {"b": 1})
+
   def testMapStructure(self):
     structure1 = (((1, 2), 3), 4, (5, 6))
     structure2 = (((7, 8), 9), 10, (11, 12))
diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py
index 05c99856d27c1748c4411d03b0932db2755229cc..ab9e82a3cce660e60b0aa69c37228ff75f919f23 100644
--- a/tensorflow/python/util/tf_should_use.py
+++ b/tensorflow/python/util/tf_should_use.py
@@ -25,10 +25,15 @@ import types
 
 import six  # pylint: disable=unused-import
 
-from backports import weakref  # pylint: disable=g-bad-import-order
+# pylint: disable=g-bad-import-order,g-import-not-at-top
+try:
+  from weakref import finalize
+except ImportError:
+  from backports.weakref import finalize
 
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import tf_decorator
+# pylint: enable=g-bad-import-order,g-import-not-at-top
 
 
 class _RefInfoField(
@@ -107,7 +112,7 @@ def _add_should_use_warning(x, fatal_error=False):
       # garbage collected.  Can't add self as the args because the
       # loop will break garbage collection.  We keep track of
       # ourselves via python ids.
-      weakref.finalize(self, _deleted, self._tf_ref_id, fatal_error)
+      finalize(self, _deleted, self._tf_ref_id, fatal_error)
 
     # Not sure why this pylint warning is being used; this is not an
     # old class form.
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/internal/pywrap_tensorflow_print_model_analysis.i b/tensorflow/python/util/tfprof.i
similarity index 85%
rename from tensorflow/contrib/tfprof/python/tools/tfprof/internal/pywrap_tensorflow_print_model_analysis.i
rename to tensorflow/python/util/tfprof.i
index 40f29ae8a2c3e2989e65dd83787ab2c0300e4877..45105298e5f15f1ce19cc77bfbc47342a73cc8ae 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/internal/pywrap_tensorflow_print_model_analysis.i
+++ b/tensorflow/python/util/tfprof.i
@@ -17,8 +17,8 @@ limitations under the License.
 %include "tensorflow/python/platform/base.i"
 
 %{
-#include "tensorflow/tools/tfprof/internal/print_model_analysis.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/profiler/internal/print_model_analysis.h"
 
 using tensorflow::int64;
 %}
@@ -29,7 +29,11 @@ using tensorflow::int64;
   $1 = &temp;
 }
 %typemap(out) const string& {
+%#if PY_MAJOR_VERSION >= 3
+  $result = PyUnicode_FromStringAndSize($1->data(), $1->size());
+%#else
   $result = PyString_FromStringAndSize($1->data(), $1->size());
+%#endif
 }
 %apply const string & {string &};
 %apply const string & {string *};
@@ -43,8 +47,7 @@ using tensorflow::int64;
 %unignore tensorflow::tfprof::DeleteProfiler;
 %unignore tensorflow::tfprof::AddStep;
 %unignore tensorflow::tfprof::Profile;
-%unignore tensorflow::tfprof::Advise;
 
-%include "tensorflow/tools/tfprof/internal/print_model_analysis.h"
+%include "tensorflow/core/profiler/internal/print_model_analysis.h"
 
 %unignoreall
\ No newline at end of file
diff --git a/tensorflow/stream_executor/blas.cc b/tensorflow/stream_executor/blas.cc
index a59a1dda71f89e4c9a707ae457883c5c227ae782..da09d84921e2dd94942b3a62fe7366211c60aed1 100644
--- a/tensorflow/stream_executor/blas.cc
+++ b/tensorflow/stream_executor/blas.cc
@@ -67,6 +67,10 @@ string SideString(Side s) {
   }
 }
 
+// -- AlgorithmConfig
+
+string AlgorithmConfig::ToString() const { return port::StrCat(algorithm_); }
+
 string ComputationTypeString(ComputationType ty) {
   switch (ty) {
     case ComputationType::kF16:
diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h
index 07a0f7ccd618c955fb33985fb1e036c790d47bed..eb1b19c5d963d56c6175251a54e2ab5072a01760 100644
--- a/tensorflow/stream_executor/blas.h
+++ b/tensorflow/stream_executor/blas.h
@@ -44,7 +44,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 
 #include "tensorflow/stream_executor/lib/array_slice.h"
-#include "tensorflow/stream_executor/platform/port.h"
 
 namespace Eigen {
 struct half;
@@ -97,8 +96,9 @@ enum class ComputationType {
   kF16,         // 16-bit floating-point
   kF32,         // 32-bit floating-point
   kF64,         // 64-bit floating-point
+  kI32,         // 32-bit integer
   kComplexF32,  // Complex number comprised of two f32s.
-  kComplexF64   // Complex number comprised of two f64s.
+  kComplexF64,  // Complex number comprised of two f64s.
 };
 
 // Converts a ComputationType to a string.
@@ -107,6 +107,19 @@ string ComputationTypeString(ComputationType ty);
 // Opaque identifier for an "algorithm" used by a blas routine.  This functions
 // as a hint to the blas library.
 typedef int64 AlgorithmType;
+constexpr AlgorithmType kDefaultAlgorithm = -1;
+constexpr AlgorithmType kDefaultBlasGemm = -2;
+constexpr AlgorithmType kDefaultBlasGemv = -3;
+constexpr AlgorithmType kNoAlgorithm = -4;
+
+// blas uses -1 to represent the default algorithm. This happens to match up
+// with the CUBLAS_GEMM_DFALT constant, so cuda_blas.cc is using static_cast
+// to convert from AlgorithmType to cublasGemmAlgo_t, and uses a static_assert
+// to ensure that this assumption does not break.
+// If another blas implementation uses a different value for the default
+// algorithm, then it needs to convert kDefaultGemmAlgo to that value
+// (e.g. via a function called ToWhateverGemmAlgo).
+constexpr AlgorithmType kDefaultGemmAlgo = -1;
 
 // Describes the result of a performance experiment, usually timing the speed of
 // a particular AlgorithmType.
@@ -124,10 +137,28 @@ class ProfileResult {
 
  private:
   bool is_valid_ = false;
-  AlgorithmType algorithm_ = 0;
+  AlgorithmType algorithm_ = kDefaultAlgorithm;
   float elapsed_time_in_ms_ = std::numeric_limits<float>::max();
 };
 
+class AlgorithmConfig {
+ public:
+  AlgorithmConfig() : algorithm_(kDefaultAlgorithm) {}
+  explicit AlgorithmConfig(AlgorithmType algorithm) : algorithm_(algorithm) {}
+  AlgorithmType algorithm() const { return algorithm_; }
+  void set_algorithm(AlgorithmType val) { algorithm_ = val; }
+  bool operator==(const AlgorithmConfig &other) const {
+    return this->algorithm_ == other.algorithm_;
+  }
+  bool operator!=(const AlgorithmConfig &other) const {
+    return !(*this == other);
+  }
+  string ToString() const;
+
+ private:
+  AlgorithmType algorithm_;
+};
+
 // BLAS support interface -- this can be derived from a GPU executor when the
 // underlying platform has an BLAS library implementation available. See
 // StreamExecutor::AsBlas().
@@ -443,6 +474,29 @@ class BlasSupport {
                           std::complex<double> beta,
                           DeviceMemory<std::complex<double>> *y, int incy) = 0;
 
+  virtual bool DoBlasGemvWithProfiling(
+      Stream *stream, blas::Transpose trans, uint64 m, uint64 n, float alpha,
+      const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x,
+      int incx, float beta, DeviceMemory<float> *y, int incy,
+      ProfileResult *output_profile_result) = 0;
+  virtual bool DoBlasGemvWithProfiling(
+      Stream *stream, blas::Transpose trans, uint64 m, uint64 n, double alpha,
+      const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x,
+      int incx, double beta, DeviceMemory<double> *y, int incy,
+      ProfileResult *output_profile_result) = 0;
+  virtual bool DoBlasGemvWithProfiling(
+      Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
+      std::complex<float> alpha, const DeviceMemory<std::complex<float>> &a,
+      int lda, const DeviceMemory<std::complex<float>> &x, int incx,
+      std::complex<float> beta, DeviceMemory<std::complex<float>> *y, int incy,
+      ProfileResult *output_profile_result) = 0;
+  virtual bool DoBlasGemvWithProfiling(
+      Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
+      std::complex<double> alpha, const DeviceMemory<std::complex<double>> &a,
+      int lda, const DeviceMemory<std::complex<double>> &x, int incx,
+      std::complex<double> beta, DeviceMemory<std::complex<double>> *y,
+      int incy, ProfileResult *output_profile_result) = 0;
+
   // Performs a rank-1 update of a general matrix.
   //
   //     a <- alpha * x * y' + a,
@@ -925,8 +979,39 @@ class BlasSupport {
                           std::complex<double> beta,
                           DeviceMemory<std::complex<double>> *c, int ldc) = 0;
 
-  // Gets a list of supported algorithms for DoBlasGemmWithAlgorithm.  Note that
-  // any or all of these algorithms may still be
+  virtual bool DoBlasGemmWithProfiling(
+      Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+      uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a,
+      int lda, const DeviceMemory<Eigen::half> &b, int ldb, float beta,
+      DeviceMemory<Eigen::half> *c, int ldc,
+      ProfileResult *output_profile_result) = 0;
+  virtual bool DoBlasGemmWithProfiling(
+      Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+      uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
+      const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
+      int ldc, ProfileResult *output_profile_result) = 0;
+  virtual bool DoBlasGemmWithProfiling(
+      Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+      uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
+      const DeviceMemory<double> &b, int ldb, double beta,
+      DeviceMemory<double> *c, int ldc,
+      ProfileResult *output_profile_result) = 0;
+  virtual bool DoBlasGemmWithProfiling(
+      Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+      uint64 n, uint64 k, std::complex<float> alpha,
+      const DeviceMemory<std::complex<float>> &a, int lda,
+      const DeviceMemory<std::complex<float>> &b, int ldb,
+      std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+      ProfileResult *output_profile_result) = 0;
+  virtual bool DoBlasGemmWithProfiling(
+      Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+      uint64 n, uint64 k, std::complex<double> alpha,
+      const DeviceMemory<std::complex<double>> &a, int lda,
+      const DeviceMemory<std::complex<double>> &b, int ldb,
+      std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+      ProfileResult *output_profile_result) = 0;
+
+  // Gets a list of supported algorithms for DoBlasGemmWithAlgorithm.
   virtual bool GetBlasGemmAlgorithms(
       std::vector<AlgorithmType> *out_algorithms) = 0;
 
@@ -944,6 +1029,12 @@ class BlasSupport {
   // output_profile_result->is_valid().  This lets you use this function for
   // choosing the best algorithm among many (some of which may fail) without
   // creating a new Stream for each attempt.
+  virtual bool DoBlasGemmWithAlgorithm(
+      Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+      uint64 n, uint64 k, int alpha, const DeviceMemory<int8> &a, int lda,
+      const DeviceMemory<int8> &b, int ldb, int beta, DeviceMemory<int32> *c,
+      int ldc, ComputationType computation_type, AlgorithmType algorithm,
+      ProfileResult *output_profile_result) = 0;
   virtual bool DoBlasGemmWithAlgorithm(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
       uint64 n, uint64 k, const Eigen::half &alpha,
@@ -1457,6 +1548,28 @@ class BlasSupport {
                   const DeviceMemory<std::complex<double>> &x, int incx,       \
                   std::complex<double> beta,                                   \
                   DeviceMemory<std::complex<double>> *y, int incy) override;   \
+  bool DoBlasGemvWithProfiling(                                                \
+      Stream *stream, blas::Transpose trans, uint64 m, uint64 n, float alpha,  \
+      const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x,     \
+      int incx, float beta, DeviceMemory<float> *y, int incy,                  \
+      blas::ProfileResult *output_profile_result) override;                    \
+  bool DoBlasGemvWithProfiling(                                                \
+      Stream *stream, blas::Transpose trans, uint64 m, uint64 n, double alpha, \
+      const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x,   \
+      int incx, double beta, DeviceMemory<double> *y, int incy,                \
+      blas::ProfileResult *output_profile_result) override;                    \
+  bool DoBlasGemvWithProfiling(                                                \
+      Stream *stream, blas::Transpose trans, uint64 m, uint64 n,               \
+      std::complex<float> alpha, const DeviceMemory<std::complex<float>> &a,   \
+      int lda, const DeviceMemory<std::complex<float>> &x, int incx,           \
+      std::complex<float> beta, DeviceMemory<std::complex<float>> *y,          \
+      int incy, blas::ProfileResult *output_profile_result) override;          \
+  bool DoBlasGemvWithProfiling(                                                \
+      Stream *stream, blas::Transpose trans, uint64 m, uint64 n,               \
+      std::complex<double> alpha, const DeviceMemory<std::complex<double>> &a, \
+      int lda, const DeviceMemory<std::complex<double>> &x, int incx,          \
+      std::complex<double> beta, DeviceMemory<std::complex<double>> *y,        \
+      int incy, blas::ProfileResult *output_profile_result) override;          \
   bool DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha,              \
                  const DeviceMemory<float> &x, int incx,                       \
                  const DeviceMemory<float> &y, int incy,                       \
@@ -1735,8 +1848,48 @@ class BlasSupport {
                   const DeviceMemory<std::complex<double>> &b, int ldb,        \
                   std::complex<double> beta,                                   \
                   DeviceMemory<std::complex<double>> *c, int ldc) override;    \
+  bool DoBlasGemmWithProfiling(                                                \
+      Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
+      uint64 m, uint64 n, uint64 k, float alpha,                               \
+      const DeviceMemory<Eigen::half> &a, int lda,                             \
+      const DeviceMemory<Eigen::half> &b, int ldb, float beta,                 \
+      DeviceMemory<Eigen::half> *c, int ldc,                                   \
+      blas::ProfileResult *output_profile_result) override;                    \
+  bool DoBlasGemmWithProfiling(                                                \
+      Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
+      uint64 m, uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, \
+      int lda, const DeviceMemory<float> &b, int ldb, float beta,              \
+      DeviceMemory<float> *c, int ldc,                                         \
+      blas::ProfileResult *output_profile_result) override;                    \
+  bool DoBlasGemmWithProfiling(                                                \
+      Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
+      uint64 m, uint64 n, uint64 k, double alpha,                              \
+      const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,   \
+      int ldb, double beta, DeviceMemory<double> *c, int ldc,                  \
+      blas::ProfileResult *output_profile_result) override;                    \
+  bool DoBlasGemmWithProfiling(                                                \
+      Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
+      uint64 m, uint64 n, uint64 k, std::complex<float> alpha,                 \
+      const DeviceMemory<std::complex<float>> &a, int lda,                     \
+      const DeviceMemory<std::complex<float>> &b, int ldb,                     \
+      std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc, \
+      blas::ProfileResult *output_profile_result) override;                    \
+  bool DoBlasGemmWithProfiling(                                                \
+      Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
+      uint64 m, uint64 n, uint64 k, std::complex<double> alpha,                \
+      const DeviceMemory<std::complex<double>> &a, int lda,                    \
+      const DeviceMemory<std::complex<double>> &b, int ldb,                    \
+      std::complex<double> beta, DeviceMemory<std::complex<double>> *c,        \
+      int ldc, blas::ProfileResult *output_profile_result) override;           \
   bool GetBlasGemmAlgorithms(std::vector<blas::AlgorithmType> *out_algorithms) \
       override;                                                                \
+  bool DoBlasGemmWithAlgorithm(                                                \
+      Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
+      uint64 m, uint64 n, uint64 k, int alpha, const DeviceMemory<int8> &a,    \
+      int lda, const DeviceMemory<int8> &b, int ldb, int beta,                 \
+      DeviceMemory<int> *c, int ldc, blas::ComputationType computation_type,   \
+      blas::AlgorithmType algorithm,                                           \
+      blas::ProfileResult *output_profile_result) override;                    \
   bool DoBlasGemmWithAlgorithm(                                                \
       Stream *stream, blas::Transpose transa, blas::Transpose transb,          \
       uint64 m, uint64 n, uint64 k, const Eigen::half &alpha,                  \
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index 2c650afc7025084fd5b233ccf46cb60e2a9befd8..cb2b06d47cd8ccf82e9df81d63049915b9b47582 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -33,6 +33,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/cuda/cuda_blas.h"
 
+#include <assert.h>
 #include <complex>
 
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
@@ -483,6 +484,11 @@ struct CUDADataType<std::complex<double>> {
   static constexpr cudaDataType_t type = CUDA_C_64F;
 };
 
+template <>
+struct CUDADataType<int> {
+  static constexpr cudaDataType_t type = CUDA_R_32I;
+};
+
 template <>
 struct CUDADataType<int8> {
   static constexpr cudaDataType_t type = CUDA_R_8I;
@@ -511,6 +517,8 @@ cudaDataType_t CUDAComputationType(blas::ComputationType ty) {
       return CUDA_R_32F;
     case blas::ComputationType::kF64:
       return CUDA_R_64F;
+    case blas::ComputationType::kI32:
+      return CUDA_R_32I;
     case blas::ComputationType::kComplexF32:
       return CUDA_C_32F;
     case blas::ComputationType::kComplexF64:
@@ -1849,12 +1857,186 @@ bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
       CUDAComplex(CUDAMemoryMutable(c)), ldc);
 }
 
+bool CUDABlas::DoBlasGemvWithProfiling(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, float alpha,
+    const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x,
+    int incx, float beta, DeviceMemory<float> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
+                                     incx, beta, y, incy,
+                                     output_profile_result);
+}
+
+bool CUDABlas::DoBlasGemvWithProfiling(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, double alpha,
+    const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x,
+    int incx, double beta, DeviceMemory<double> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
+                                     incx, beta, y, incy,
+                                     output_profile_result);
+}
+
+bool CUDABlas::DoBlasGemvWithProfiling(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
+    std::complex<float> alpha, const DeviceMemory<std::complex<float>> &a,
+    int lda, const DeviceMemory<std::complex<float>> &x, int incx,
+    std::complex<float> beta, DeviceMemory<std::complex<float>> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
+                                     incx, beta, y, incy,
+                                     output_profile_result);
+}
+
+bool CUDABlas::DoBlasGemvWithProfiling(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
+    std::complex<double> alpha, const DeviceMemory<std::complex<double>> &a,
+    int lda, const DeviceMemory<std::complex<double>> &x, int incx,
+    std::complex<double> beta, DeviceMemory<std::complex<double>> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
+                                     incx, beta, y, incy,
+                                     output_profile_result);
+}
+
+bool CUDABlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a,
+    int lda, const DeviceMemory<Eigen::half> &b, int ldb, float beta,
+    DeviceMemory<Eigen::half> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+bool CUDABlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
+    const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
+    int ldc, blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+bool CUDABlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
+    const DeviceMemory<double> &b, int ldb, double beta,
+    DeviceMemory<double> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+bool CUDABlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<float> alpha,
+    const DeviceMemory<std::complex<float>> &a, int lda,
+    const DeviceMemory<std::complex<float>> &b, int ldb,
+    std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+bool CUDABlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<double> alpha,
+    const DeviceMemory<std::complex<double>> &a, int lda,
+    const DeviceMemory<std::complex<double>> &b, int ldb,
+    std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
 template <typename T>
+bool CUDABlas::DoBlasGemvWithProfilingImpl(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, const T &alpha,
+    const DeviceMemory<T> &a, int lda, const DeviceMemory<T> &x, int incx,
+    const T &beta, DeviceMemory<T> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  struct TimerDeleter {
+    void operator()(CUDATimer *t) {
+      t->Destroy();
+      delete t;
+    }
+  };
+  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  if (output_profile_result != nullptr) {
+    timer.reset(new CUDATimer(parent_));
+    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+      return false;
+    }
+  }
+
+  // Call blasGemm
+  bool result =
+      DoBlasGemv(stream, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
+
+  if (timer != nullptr && result) {
+    // CUDATimer will CHECK-fail if we Stop() it while the stream is in an error
+    // state.
+    if (!timer->Stop(AsCUDAStream(stream))) {
+      return false;
+    }
+    output_profile_result->set_is_valid(true);
+    output_profile_result->set_algorithm(blas::kDefaultBlasGemv);
+    output_profile_result->set_elapsed_time_in_ms(
+        timer->GetElapsedMilliseconds());
+  }
+  return result;
+}
+
+template <typename T, typename ParamType>
+bool CUDABlas::DoBlasGemmWithProfilingImpl(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
+    int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
+    DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result) {
+  struct TimerDeleter {
+    void operator()(CUDATimer *t) {
+      t->Destroy();
+      delete t;
+    }
+  };
+  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  if (output_profile_result != nullptr) {
+    timer.reset(new CUDATimer(parent_));
+    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+      return false;
+    }
+  }
+
+  // Call blasGemm
+  bool result = DoBlasGemm(stream, transa, transb, m, n, k, alpha, a, lda, b,
+                           ldb, beta, c, ldc);
+
+  if (timer != nullptr && result) {
+    // CUDATimer will CHECK-fail if we Stop() it while the stream is in an error
+    // state.
+    if (!timer->Stop(AsCUDAStream(stream))) {
+      return false;
+    }
+    output_profile_result->set_is_valid(true);
+    output_profile_result->set_algorithm(blas::kDefaultBlasGemm);
+    output_profile_result->set_elapsed_time_in_ms(
+        timer->GetElapsedMilliseconds());
+  }
+  return result;
+}
+
+template <typename InT, typename OutT, typename CompT>
 bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-    uint64 n, uint64 k, const T &alpha, const DeviceMemory<T> &a, int lda,
-    const DeviceMemory<T> &b, int ldb, const T &beta, DeviceMemory<T> *c,
-    int ldc, blas::ComputationType computation_type,
+    uint64 n, uint64 k, const CompT &alpha, const DeviceMemory<InT> &a, int lda,
+    const DeviceMemory<InT> &b, int ldb, const CompT &beta,
+    DeviceMemory<OutT> *c, int ldc, blas::ComputationType computation_type,
     blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
 // CUDA < version 8 and GPUs < sm_50 don't support cublasGemmEx.
 #if CUDA_VERSION < 8000
@@ -1881,12 +2063,15 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
     }
   }
 
-  cudaDataType_t data_type = CUDADataType<T>::type;
+  cudaDataType_t cuda_in_type = CUDADataType<InT>::type;
+  // Since we are converting 'algorithm' to cublasGemmAlgo_t by static_cast,
+  // we do the following compile-time check on the default value:
+  static_assert(blas::kDefaultGemmAlgo == CUBLAS_GEMM_DFALT, "");
   bool result = DoBlasInternalFailureOK(
       wrap::cublasGemmEx, stream, /* pointer_mode_host = */ true,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
-      CUDAMemory(a), data_type, lda, CUDAMemory(b), data_type, ldb, &beta,
-      CUDAMemoryMutable(c), data_type, ldc,
+      CUDAMemory(a), cuda_in_type, lda, CUDAMemory(b), cuda_in_type, ldb, &beta,
+      CUDAMemoryMutable(c), CUDADataType<OutT>::type, ldc,
       CUDAComputationType(computation_type),
       static_cast<cublasGemmAlgo_t>(algorithm));
 
@@ -1909,6 +2094,9 @@ bool CUDABlas::GetBlasGemmAlgorithms(
     std::vector<blas::AlgorithmType> *out_algorithms) {
 // cublasGemmAlgo_t (and the function that accepts this type, cublasGemmEx)
 // were first introduced in CUDA 8.
+// Note that when CUDA version and compute capability is not sufficient, we
+// still return the out_algorithms. Caller needs to make sure that in this case,
+// the returned vector is empty.
 #if CUDA_VERSION >= 8000
   for (cublasGemmAlgo_t algo :
        {CUBLAS_GEMM_DFALT, CUBLAS_GEMM_ALGO0, CUBLAS_GEMM_ALGO1,
@@ -1920,6 +2108,17 @@ bool CUDABlas::GetBlasGemmAlgorithms(
   return true;
 }
 
+bool CUDABlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, int alpha, const DeviceMemory<int8> &a, int lda,
+    const DeviceMemory<int8> &b, int ldb, int beta, DeviceMemory<int> *c,
+    int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithAlgorithmImpl(
+      stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+      computation_type, algorithm, output_profile_result);
+}
+
 bool CUDABlas::DoBlasGemmWithAlgorithm(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
     uint64 n, uint64 k, const Eigen::half &alpha,
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h
index 6a33cd746b30a93740abef58aff7950807dc30ca..80cda971173fe34658f3403f1354babbd02e6ff9 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.h
+++ b/tensorflow/stream_executor/cuda/cuda_blas.h
@@ -118,15 +118,30 @@ class CUDABlas : public blas::BlasSupport {
   // and we want to avoid pulling in a dependency on Eigen.  When we pass the
   // references to cublas, we essentially reinterpret_cast to __half, which is
   // safe because Eigen::half inherits from __half.
+  template <typename InT, typename OutT, typename CompT>
+  bool DoBlasGemmWithAlgorithmImpl(
+      Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+      uint64 n, uint64 k, const CompT &alpha, const DeviceMemory<InT> &a,
+      int lda, const DeviceMemory<InT> &b, int ldb, const CompT &beta,
+      DeviceMemory<OutT> *c, int ldc, blas::ComputationType computation_type,
+      blas::AlgorithmType algorithm,
+      blas::ProfileResult *output_profile_result);
+
+  // Helper function for implementing DoBlasGemmWithProfiling.
+  template <typename T, typename ParamType>
+  bool DoBlasGemmWithProfilingImpl(
+      Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+      uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
+      int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
+      DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result);
+
+  // Helper function for implementing DoBlasGemvWithProfiling.
   template <typename T>
-  bool DoBlasGemmWithAlgorithmImpl(Stream *stream, blas::Transpose transa,
-                                   blas::Transpose transb, uint64 m, uint64 n,
-                                   uint64 k, const T &alpha,
+  bool DoBlasGemvWithProfilingImpl(Stream *stream, blas::Transpose trans,
+                                   uint64 m, uint64 n, const T &alpha,
                                    const DeviceMemory<T> &a, int lda,
-                                   const DeviceMemory<T> &b, int ldb,
-                                   const T &beta, DeviceMemory<T> *c, int ldc,
-                                   blas::ComputationType computation_type,
-                                   blas::AlgorithmType algorithm,
+                                   const DeviceMemory<T> &x, int incx,
+                                   const T &beta, DeviceMemory<T> *y, int incy,
                                    blas::ProfileResult *output_profile_result);
 
   // mutex that guards the cuBLAS handle for this device.
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 34a683ab5753cde8224665ffcee1428f56803df7..b4d2589c7eea7113a3cbae19e772f9b87dba0e9d 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -728,6 +728,7 @@ class ScopedActivationDescriptor {
  public:
   ScopedActivationDescriptor(CUDAExecutor* parent,
                              dnn::ActivationMode activation_mode,
+                             cudnnNanPropagation_t nan_propagation,
                              double value_max)
       : parent_(parent), handle_(nullptr) {
     cudnnStatus_t status =
@@ -762,8 +763,6 @@ class ScopedActivationDescriptor {
                    << static_cast<int>(activation_mode);
     }
 
-    // Always propagate nans.
-    cudnnNanPropagation_t nan_propagation = CUDNN_PROPAGATE_NAN;
     status = wrap::cudnnSetActivationDescriptor(parent_, handle_, mode,
                                                 nan_propagation, relu_ceiling);
     if (status != CUDNN_STATUS_SUCCESS) {
@@ -1563,13 +1562,13 @@ bool CudnnSupport::DoRnnBackwardImpl(
     const DeviceMemory<T>& output_h_data,
     const CudnnRnnStateTensorDescriptor& output_c_desc,
     const DeviceMemory<T>& output_c_data,
-    const DeviceMemory<float>& output_backprop_data,
-    const DeviceMemory<float>& output_h_backprop_data,
-    const DeviceMemory<float>& output_c_backprop_data,
-    DeviceMemory<float>* input_backprop_data,
-    DeviceMemory<float>* input_h_backprop_data,
-    DeviceMemory<float>* input_c_backprop_data,
-    DeviceMemory<float>* params_backprop_data,
+    const DeviceMemory<T>& output_backprop_data,
+    const DeviceMemory<T>& output_h_backprop_data,
+    const DeviceMemory<T>& output_c_backprop_data,
+    DeviceMemory<T>* input_backprop_data,
+    DeviceMemory<T>* input_h_backprop_data,
+    DeviceMemory<T>* input_c_backprop_data,
+    DeviceMemory<T>* params_backprop_data,
     DeviceMemory<uint8>* reserve_space_data,
     ScratchAllocator* workspace_allocator) {
   // extract model parameters
@@ -1769,6 +1768,49 @@ bool CudnnSupport::DoRnnForward(
 #endif  // CUDNN_VERSION
 }
 
+bool CudnnSupport::DoRnnForward(
+    Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+    const dnn::RnnSequenceTensorDescriptor& input_desc,
+    const DeviceMemory<double>& input_data,
+    const dnn::RnnStateTensorDescriptor& input_h_desc,
+    const DeviceMemory<double>& input_h_data,
+    const dnn::RnnStateTensorDescriptor& input_c_desc,
+    const DeviceMemory<double>& input_c_data,
+    const DeviceMemory<double>& params,
+    const dnn::RnnSequenceTensorDescriptor& output_desc,
+    DeviceMemory<double>* output_data,
+    const dnn::RnnStateTensorDescriptor& output_h_desc,
+    DeviceMemory<double>* output_h_data,
+    const dnn::RnnStateTensorDescriptor& output_c_desc,
+    DeviceMemory<double>* output_c_data, bool is_training,
+    ScratchAllocator* reserve_space_allocator,
+    ScratchAllocator* workspace_allocator) {
+#if CUDNN_VERSION >= 5000
+  const CudnnRnnDescriptor& cudnn_rnn_desc =
+      static_cast<const CudnnRnnDescriptor&>(rnn_desc);
+  const CudnnRnnSequenceTensorDescriptor& cudnn_input_desc =
+      static_cast<const CudnnRnnSequenceTensorDescriptor&>(input_desc);
+  const CudnnRnnStateTensorDescriptor& cudnn_input_h_desc =
+      static_cast<const CudnnRnnStateTensorDescriptor&>(input_h_desc);
+  const CudnnRnnStateTensorDescriptor& cudnn_input_c_desc =
+      static_cast<const CudnnRnnStateTensorDescriptor&>(input_c_desc);
+  const CudnnRnnSequenceTensorDescriptor& cudnn_output_desc =
+      static_cast<const CudnnRnnSequenceTensorDescriptor&>(output_desc);
+  const CudnnRnnStateTensorDescriptor& cudnn_output_h_desc =
+      static_cast<const CudnnRnnStateTensorDescriptor&>(output_h_desc);
+  const CudnnRnnStateTensorDescriptor& cudnn_output_c_desc =
+      static_cast<const CudnnRnnStateTensorDescriptor&>(output_c_desc);
+
+  return DoRnnForwardImpl<double>(
+      stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
+      input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
+      output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
+      output_c_data, is_training, reserve_space_allocator, workspace_allocator);
+#else
+  return false;
+#endif  // CUDNN_VERSION
+}
+
 bool CudnnSupport::DoRnnBackward(
     Stream* stream, const dnn::RnnDescriptor& rnn_desc,
     const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -1821,6 +1863,59 @@ bool CudnnSupport::DoRnnBackward(
 #endif  // CUDNN_VERSION
 }
 
+bool CudnnSupport::DoRnnBackward(
+    Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+    const dnn::RnnSequenceTensorDescriptor& input_desc,
+    const DeviceMemory<double>& input_data,
+    const dnn::RnnStateTensorDescriptor& input_h_desc,
+    const DeviceMemory<double>& input_h_data,
+    const dnn::RnnStateTensorDescriptor& input_c_desc,
+    const DeviceMemory<double>& input_c_data,
+    const DeviceMemory<double>& params,
+    const dnn::RnnSequenceTensorDescriptor& output_desc,
+    const DeviceMemory<double>& output_data,
+    const dnn::RnnStateTensorDescriptor& output_h_desc,
+    const DeviceMemory<double>& output_h_data,
+    const dnn::RnnStateTensorDescriptor& output_c_desc,
+    const DeviceMemory<double>& output_c_data,
+    const DeviceMemory<double>& output_backprop_data,
+    const DeviceMemory<double>& output_h_backprop_data,
+    const DeviceMemory<double>& output_c_backprop_data,
+    DeviceMemory<double>* input_backprop_data,
+    DeviceMemory<double>* input_h_backprop_data,
+    DeviceMemory<double>* input_c_backprop_data,
+    DeviceMemory<double>* params_backprop_data,
+    DeviceMemory<uint8>* reserve_space_data,
+    ScratchAllocator* workspace_allocator) {
+#if CUDNN_VERSION >= 5000
+  const CudnnRnnDescriptor& cudnn_rnn_desc =
+      static_cast<const CudnnRnnDescriptor&>(rnn_desc);
+  const CudnnRnnSequenceTensorDescriptor& cudnn_input_desc =
+      static_cast<const CudnnRnnSequenceTensorDescriptor&>(input_desc);
+  const CudnnRnnStateTensorDescriptor& cudnn_input_h_desc =
+      static_cast<const CudnnRnnStateTensorDescriptor&>(input_h_desc);
+  const CudnnRnnStateTensorDescriptor& cudnn_input_c_desc =
+      static_cast<const CudnnRnnStateTensorDescriptor&>(input_c_desc);
+  const CudnnRnnSequenceTensorDescriptor& cudnn_output_desc =
+      static_cast<const CudnnRnnSequenceTensorDescriptor&>(output_desc);
+  const CudnnRnnStateTensorDescriptor& cudnn_output_h_desc =
+      static_cast<const CudnnRnnStateTensorDescriptor&>(output_h_desc);
+  const CudnnRnnStateTensorDescriptor& cudnn_output_c_desc =
+      static_cast<const CudnnRnnStateTensorDescriptor&>(output_c_desc);
+
+  return DoRnnBackwardImpl<double>(
+      stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
+      input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
+      output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
+      output_c_data, output_backprop_data, output_h_backprop_data,
+      output_c_backprop_data, input_backprop_data, input_h_backprop_data,
+      input_c_backprop_data, params_backprop_data, reserve_space_data,
+      workspace_allocator);
+#else
+  return false;
+#endif  // CUDNN_VERSION
+}
+
 template <class T>
 bool CudnnSupport::DoConvolveImpl(
     Stream* stream, int cudnn_type,  // Actually cudnnDataType_t.
@@ -1863,8 +1958,8 @@ bool CudnnSupport::DoConvolveImpl(
 
   if (algorithm_config.algorithm() == dnn::kDefaultAlgorithm) {
     // With the default algorithm, use Cudnn's heuristics.
-    auto get_algorithm = [&](bool specify_limit)
-        SHARED_LOCKS_REQUIRED(dnn_handle_mutex_) {
+    auto get_algorithm =
+        [&](bool specify_limit) SHARED_LOCKS_REQUIRED(dnn_handle_mutex_) {
           cudnnConvolutionFwdPreference_t preference =
               specify_limit ? CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
                             : CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
@@ -1892,7 +1987,6 @@ bool CudnnSupport::DoConvolveImpl(
         };
 
     algo = get_algorithm(/*specify_limit=*/scratch_allocator != nullptr);
-
     if (scratch_allocator != nullptr) {
       size_t size_in_bytes;
       status = wrap::cudnnGetConvolutionForwardWorkspaceSize(
@@ -1917,7 +2011,6 @@ bool CudnnSupport::DoConvolveImpl(
   } else {
     // An algorithm has been specified.
     algo = ToConvForwardAlgo(algorithm_config.algorithm());
-
     size_t size_in_bytes;
     status = wrap::cudnnGetConvolutionForwardWorkspaceSize(
         parent_, ToHandle(dnn_handle_), /*srcDesc=*/input_nd.handle(),
@@ -1954,12 +2047,9 @@ bool CudnnSupport::DoConvolveImpl(
       }
     }
   }
-
   const bool has_biases = (biases != nullptr);
   const bool supported_activation_mode =
-      (activation_mode == dnn::ActivationMode::kRelu6 ||
-       activation_mode == dnn::ActivationMode::kReluX ||
-       activation_mode == dnn::ActivationMode::kRelu);
+      (activation_mode == dnn::ActivationMode::kRelu);
 
   if (has_biases && !supported_activation_mode) {
     LOG(ERROR) << "cudnnConvolutionBiasActivationForward() only "
@@ -1967,11 +2057,11 @@ bool CudnnSupport::DoConvolveImpl(
     return false;
   }
 
-  if (has_biases && activation_mode != dnn::ActivationMode::kNone) {
+  if (has_biases && activation_mode == dnn::ActivationMode::kNone) {
     LOG(ERROR) << "To use cudnnConvolutionBiasActivationForward() "
                   "with a valid biases tensor, need to also provide "
                   "a valid activation mode (currently only supports "
-                  "kRelu6, kReluX, and kRelu).";
+                  "kRelu).";
     return false;
   }
 
@@ -2004,7 +2094,12 @@ bool CudnnSupport::DoConvolveImpl(
         .set_layout(dnn::DataLayout::kBatchYXDepth);
     ScopedTensorDescriptor bias_descriptor{
         parent_, bias_dimensions, static_cast<cudnnDataType_t>(cudnn_type)};
+    // CUDNN v6 only supports CUDNN_NOT_PROPAGATE_NAN as the reluNanOpt for
+    // activation descriptor. Note that this will change the nan propagation
+    // behavior from separate conv, bias, and relu (which by default is
+    // CUDNN_PROPAGATE_NAN.
     ScopedActivationDescriptor activation_desc{parent_, activation_mode,
+                                               CUDNN_NOT_PROPAGATE_NAN,
                                                output_descriptor.value_max()};
     status = wrap::cudnnConvolutionBiasActivationForward(
         parent_, ToHandle(dnn_handle_),
@@ -2013,7 +2108,7 @@ bool CudnnSupport::DoConvolveImpl(
         /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(),
         /*algo=*/algo, /*workSpace=*/scratch.opaque(),
         /*workSpaceSizeInBytes=*/scratch.size(), /*alpha2=*/&beta,
-        /*zDesc=*/output_nd.handle(), /*z=*/nullptr,
+        /*zDesc=*/output_nd.handle(), /*z=*/input_data.opaque(),
         /*biasDesc=*/bias_descriptor.handle(),
         /*bias=*/biases.opaque(), /*activationDesc=*/activation_desc.handle(),
         /*destDesc=*/output_nd.handle(), /*destData=*/output_data->opaque());
@@ -2059,8 +2154,9 @@ bool CudnnSupport::DoConvolveImpl(
 // env-var "TF_ENABLE_WINOGRAD_NONFUSED=0".
 // https://github.com/tensorflow/tensorflow/pull/4901
 // TODO(yangzihao): winograd_nonfused bug will only be fixed in cuDNNv7, for
-// cuDNN with smaller versions, we have added code to avoid using winograd
-// nonfused for certain input parameter set.
+// cuDNN with smaller version, we are setting the default flag to false due to
+// b/62635189. Need to root cause this and figure out a workaround or file a bug
+// against NVIDIA.
 template <bool DefaultFlag>
 class WinogradNonfused {
  public:
@@ -2102,7 +2198,7 @@ bool CudnnSupport::GetConvolveAlgorithms(
       // clang-format on
   });
 #if CUDNN_VERSION >= 5100
-  if (WinogradNonfused<true>::IsEnabled() && with_winograd_nonfused) {
+  if (WinogradNonfused<false>::IsEnabled() && with_winograd_nonfused) {
     out_algorithms->push_back(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED);
   }
 #endif
@@ -2124,7 +2220,7 @@ bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
       // clang-format on
   });
 #if CUDNN_VERSION >= 5100
-  if (WinogradNonfused<true>::IsEnabled() && with_winograd_nonfused) {
+  if (WinogradNonfused<false>::IsEnabled() && with_winograd_nonfused) {
     out_algorithms->push_back(
         CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED);
   }
@@ -2144,13 +2240,7 @@ bool CudnnSupport::GetConvolveBackwardFilterAlgorithms(
       // clang-format on
   });
 #if CUDNN_VERSION >= 5100
-#if CUDNN_VERSION >= 5110
-  static constexpr bool kDefaultFlagWinogradNonfused = true;
-#else
-  static constexpr bool kDefaultFlagWinogradNonfused = false;
-#endif
-  if (WinogradNonfused<kDefaultFlagWinogradNonfused>::IsEnabled() &&
-      with_winograd_nonfused) {
+  if (WinogradNonfused<false>::IsEnabled() && with_winograd_nonfused) {
     out_algorithms->push_back(
         // Based on cudnn.h, the following is not implemented.
         // CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD,
@@ -2432,17 +2522,16 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
                                      dnn::DataType input_type,
                                      const DeviceMemoryBase& input_data,
                                      const dnn::BatchDescriptor& output_desc,
-                                     dnn::DataType output_type,
+                                     dnn::DataType output_type, float scale,
                                      DeviceMemoryBase* output_data) {
   mutex_lock lock{dnn_handle_mutex_};
-  float alpha = 1.0f;
   float beta = 0.0f;
   ScopedTensorDescriptor input_tensor_desc(
       parent_, input_desc, ToCudnnDataType(input_type, input_desc.layout()));
   ScopedTensorDescriptor output_tensor_desc(
       parent_, output_desc, ToCudnnDataType(output_type, output_desc.layout()));
   cudnnStatus_t status = wrap::cudnnTransformTensor(
-      parent_, ToHandle(dnn_handle_), &alpha, input_tensor_desc.handle(),
+      parent_, ToHandle(dnn_handle_), &scale, input_tensor_desc.handle(),
       input_data.opaque(), &beta, output_tensor_desc.handle(),
       output_data->opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
@@ -3201,8 +3290,8 @@ bool CudnnSupport::DoActivate(Stream* stream,
   }
 
 #if CUDNN_VERSION >= 5000
-  ScopedActivationDescriptor activation_desc{parent_, activation_mode,
-                                             dimensions.value_max()};
+  ScopedActivationDescriptor activation_desc{
+      parent_, activation_mode, CUDNN_PROPAGATE_NAN, dimensions.value_max()};
 #else
   cudnnActivationMode_t mode;
   switch (activation_mode) {
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index cc37c8bb9f3baf782bc2921d27c683f40f20a1bf..b094cf76e94bfdbd2fbe89b8e7ff917145cd0fd5 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -79,6 +79,23 @@ class CudnnSupport : public dnn::DnnSupport {
                     ScratchAllocator* reserve_space_allocator,
                     ScratchAllocator* workspace_allocator) override;
 
+  bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+                    const dnn::RnnSequenceTensorDescriptor& input_desc,
+                    const DeviceMemory<double>& input_data,
+                    const dnn::RnnStateTensorDescriptor& input_h_desc,
+                    const DeviceMemory<double>& input_h_data,
+                    const dnn::RnnStateTensorDescriptor& input_c_desc,
+                    const DeviceMemory<double>& input_c_data,
+                    const DeviceMemory<double>& params,
+                    const dnn::RnnSequenceTensorDescriptor& output_desc,
+                    DeviceMemory<double>* output_data,
+                    const dnn::RnnStateTensorDescriptor& output_h_desc,
+                    DeviceMemory<double>* output_h_data,
+                    const dnn::RnnStateTensorDescriptor& output_c_desc,
+                    DeviceMemory<double>* output_c_data, bool is_training,
+                    ScratchAllocator* reserve_space_allocator,
+                    ScratchAllocator* workspace_allocator) override;
+
   bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                      const dnn::RnnSequenceTensorDescriptor& input_desc,
                      const DeviceMemory<float>& input_data,
@@ -103,6 +120,30 @@ class CudnnSupport : public dnn::DnnSupport {
                      DeviceMemory<uint8>* reserve_space_data,
                      ScratchAllocator* workspace_allocator) override;
 
+  bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+                     const dnn::RnnSequenceTensorDescriptor& input_desc,
+                     const DeviceMemory<double>& input_data,
+                     const dnn::RnnStateTensorDescriptor& input_h_desc,
+                     const DeviceMemory<double>& input_h_data,
+                     const dnn::RnnStateTensorDescriptor& input_c_desc,
+                     const DeviceMemory<double>& input_c_data,
+                     const DeviceMemory<double>& params,
+                     const dnn::RnnSequenceTensorDescriptor& output_desc,
+                     const DeviceMemory<double>& output_data,
+                     const dnn::RnnStateTensorDescriptor& output_h_desc,
+                     const DeviceMemory<double>& output_h_data,
+                     const dnn::RnnStateTensorDescriptor& output_c_desc,
+                     const DeviceMemory<double>& output_c_data,
+                     const DeviceMemory<double>& output_backprop_data,
+                     const DeviceMemory<double>& output_h_backprop_data,
+                     const DeviceMemory<double>& output_c_backprop_data,
+                     DeviceMemory<double>* input_backprop_data,
+                     DeviceMemory<double>* input_h_backprop_data,
+                     DeviceMemory<double>* input_c_backprop_data,
+                     DeviceMemory<double>* params_backprop_data,
+                     DeviceMemory<uint8>* reserve_space_data,
+                     ScratchAllocator* workspace_allocator) override;
+
   bool GetConvolveAlgorithms(
       bool with_winograd_nonfused,
       std::vector<dnn::AlgorithmType>* out_algorithms) override;
@@ -455,7 +496,7 @@ class CudnnSupport : public dnn::DnnSupport {
                          dnn::DataType input_type,
                          const DeviceMemoryBase& input_data,
                          const dnn::BatchDescriptor& output_desc,
-                         dnn::DataType output_type,
+                         dnn::DataType output_type, float scale,
                          DeviceMemoryBase* output_data) override;
 
  private:
@@ -597,13 +638,13 @@ class CudnnSupport : public dnn::DnnSupport {
                          const DeviceMemory<T>& output_h_data,
                          const CudnnRnnStateTensorDescriptor& output_c_desc,
                          const DeviceMemory<T>& output_c_data,
-                         const DeviceMemory<float>& output_backprop_data,
-                         const DeviceMemory<float>& output_h_backprop_data,
-                         const DeviceMemory<float>& output_c_backprop_data,
-                         DeviceMemory<float>* input_backprop_data,
-                         DeviceMemory<float>* input_h_backprop_data,
-                         DeviceMemory<float>* input_c_backprop_data,
-                         DeviceMemory<float>* params_backprop_data,
+                         const DeviceMemory<T>& output_backprop_data,
+                         const DeviceMemory<T>& output_h_backprop_data,
+                         const DeviceMemory<T>& output_c_backprop_data,
+                         DeviceMemory<T>* input_backprop_data,
+                         DeviceMemory<T>* input_h_backprop_data,
+                         DeviceMemory<T>* input_c_backprop_data,
+                         DeviceMemory<T>* params_backprop_data,
                          DeviceMemory<uint8>* reserve_space_data,
                          ScratchAllocator* workspace_allocator);
 
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
index 520a3598d71ec567be902558830eb3b3b58791d7..0f465bea7af35e6751a41bc67d0b085ec90c9a1e 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.cc
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -53,12 +53,20 @@ namespace wrap {
     }                                                            \
   } __name;
 
-#define CUFFT_ROUTINE_EACH(__macro)                                         \
-  __macro(cufftDestroy) __macro(cufftSetStream) __macro(cufftPlan1d)        \
-      __macro(cufftPlan2d) __macro(cufftPlan3d) __macro(cufftPlanMany)      \
-          __macro(cufftExecD2Z) __macro(cufftExecZ2D) __macro(cufftExecC2C) \
-              __macro(cufftExecC2R) __macro(cufftExecZ2Z)                   \
-                  __macro(cufftExecR2C)
+#define CUFFT_ROUTINE_EACH(__macro)                                            \
+  __macro(cufftDestroy) __macro(cufftSetStream) __macro(cufftPlan1d)           \
+      __macro(cufftPlan2d) __macro(cufftPlan3d) __macro(cufftPlanMany)         \
+          __macro(cufftExecD2Z) __macro(cufftExecZ2D) __macro(cufftExecC2C)    \
+              __macro(cufftExecC2R) __macro(cufftExecZ2Z)                      \
+                  __macro(cufftExecR2C) __macro(cufftCreate)                   \
+                      __macro(cufftSetAutoAllocation)                          \
+                          __macro(cufftSetWorkArea) __macro(cufftGetSize1d)    \
+                              __macro(cufftMakePlan1d) __macro(cufftGetSize2d) \
+                                  __macro(cufftMakePlan2d)                     \
+                                      __macro(cufftGetSize3d)                  \
+                                          __macro(cufftMakePlan3d)             \
+                                              __macro(cufftGetSizeMany)        \
+                                                  __macro(cufftMakePlanMany)
 
 CUFFT_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUFFT_WRAP)
 
@@ -100,40 +108,15 @@ bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) {
 
 }  // namespace
 
-CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, fft::Type type)
-    : parent_(parent), fft_type_(type) {
-  auto ret = wrap::cufftPlan1d(parent, &plan_, num_x, CUDAFftType(type),
-                               1 /* = batch */);
-  if (ret != CUFFT_SUCCESS) {
-    LOG(ERROR) << "failed to create cuFFT 1d plan:" << ret;
-  }
-}
-
-CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y,
-                         fft::Type type)
-    : parent_(parent), fft_type_(type) {
-  auto ret = wrap::cufftPlan2d(parent, &plan_, num_x, num_y, CUDAFftType(type));
-  if (ret != CUFFT_SUCCESS) {
-    LOG(ERROR) << "failed to create cuFFT 2d plan:" << ret;
+port::Status CUDAFftPlan::Initialize(
+    CUDAExecutor *parent, Stream *stream, int rank, uint64 *elem_count,
+    uint64 *input_embed, uint64 input_stride, uint64 input_distance,
+    uint64 *output_embed, uint64 output_stride, uint64 output_distance,
+    fft::Type type, int batch_count, ScratchAllocator *scratch_allocator) {
+  if (IsInitialized()) {
+    LOG(FATAL) << "Try to repeatedly initialize.";
   }
-}
-
-CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y,
-                         uint64 num_z, fft::Type type)
-    : parent_(parent), fft_type_(type) {
-  auto ret =
-      wrap::cufftPlan3d(parent, &plan_, num_x, num_y, num_z, CUDAFftType(type));
-  if (ret != CUFFT_SUCCESS) {
-    LOG(ERROR) << "failed to create cuFFT 3d plan:" << ret;
-  }
-}
-
-CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, int rank, uint64 *elem_count,
-                         uint64 *input_embed, uint64 input_stride,
-                         uint64 input_distance, uint64 *output_embed,
-                         uint64 output_stride, uint64 output_distance,
-                         fft::Type type, int batch_count)
-    : parent_(parent), fft_type_(type) {
+  is_initialized_ = true;
   int elem_count_[3], input_embed_[3], output_embed_[3];
   for (int i = 0; i < rank; ++i) {
     elem_count_[i] = elem_count[i];
@@ -144,55 +127,302 @@ CUDAFftPlan::CUDAFftPlan(CUDAExecutor *parent, int rank, uint64 *elem_count,
       output_embed_[i] = output_embed[i];
     }
   }
-  auto ret = wrap::cufftPlanMany(
-      parent, &plan_, rank, elem_count_, input_embed ? input_embed_ : nullptr,
-      input_stride, input_distance, output_embed ? output_embed_ : nullptr,
-      output_stride, output_distance, CUDAFftType(type), batch_count);
-  if (ret != CUFFT_SUCCESS) {
-    LOG(ERROR) << "failed to create cuFFT batched plan:" << ret;
+  parent_ = parent;
+  fft_type_ = type;
+  if (batch_count == 1 && input_embed == nullptr && output_embed == nullptr) {
+    cufftResult_t ret;
+    if (scratch_allocator == nullptr) {
+      switch (rank) {
+        case 1:
+          // cufftPlan1d
+          ret = wrap::cufftPlan1d(parent, &plan_, elem_count_[0],
+                                  CUDAFftType(type), 1 /* = batch */);
+          if (ret != CUFFT_SUCCESS) {
+            LOG(ERROR) << "failed to create cuFFT 1d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to create cuFFT 1d plan."};
+          }
+          return port::Status::OK();
+        case 2:
+          // cufftPlan2d
+          ret = wrap::cufftPlan2d(parent, &plan_, elem_count_[0],
+                                  elem_count_[1], CUDAFftType(type));
+          if (ret != CUFFT_SUCCESS) {
+            LOG(ERROR) << "failed to create cuFFT 2d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to create cuFFT 2d plan."};
+          }
+          return port::Status::OK();
+        case 3:
+          // cufftPlan3d
+          ret =
+              wrap::cufftPlan3d(parent, &plan_, elem_count_[0], elem_count_[1],
+                                elem_count_[2], CUDAFftType(type));
+          if (ret != CUFFT_SUCCESS) {
+            LOG(ERROR) << "failed to create cuFFT 3d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to create cuFFT 3d plan."};
+          }
+          return port::Status::OK();
+        default:
+          LOG(ERROR) << "Invalid rank value for cufftPlan. "
+                        "Requested 1, 2, or 3, given: "
+                     << rank;
+          return port::Status{port::error::INVALID_ARGUMENT,
+                              "cufftPlan only takes rank 1, 2, or 3."};
+      }
+    } else {
+      ret = wrap::cufftCreate(parent, &plan_);
+      if (ret != CUFFT_SUCCESS) {
+        LOG(ERROR) << "failed to create cuFFT plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to create cuFFT plan."};
+      }
+      ret = wrap::cufftSetAutoAllocation(parent, plan_, 0);
+      if (ret != CUFFT_SUCCESS) {
+        LOG(ERROR) << "failed to set auto allocation for cuFFT plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to set auto allocation for cuFFT plan."};
+      }
+      size_t size_in_bytes;
+      switch (rank) {
+        case 1:
+          ret = wrap::cufftMakePlan1d(parent, plan_, elem_count_[0],
+                                      CUDAFftType(type), /*batch=*/1,
+                                      &size_in_bytes);
+          if (ret != CUFFT_SUCCESS) {
+            LOG(ERROR) << "failed to make cuFFT 1d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to make cuFFT 1d plan."};
+          }
+          break;
+        case 2:
+          ret = wrap::cufftMakePlan2d(parent, plan_, elem_count_[0],
+                                      elem_count_[1], CUDAFftType(type),
+                                      &size_in_bytes);
+          if (ret != CUFFT_SUCCESS) {
+            LOG(ERROR) << "failed to make cuFFT 2d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to make cuFFT 2d plan."};
+          }
+          break;
+        case 3:
+          ret = wrap::cufftMakePlan3d(parent, plan_, elem_count_[0],
+                                      elem_count_[1], elem_count_[2],
+                                      CUDAFftType(type), &size_in_bytes);
+          if (ret != CUFFT_SUCCESS) {
+            LOG(ERROR) << "failed to make cuFFT 3d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to make cuFFT 3d plan."};
+          }
+          break;
+        default:
+          LOG(ERROR) << "Invalid rank value for cufftPlan. "
+                        "Requested 1, 2, or 3, given: "
+                     << rank;
+          return port::Status{port::error::INVALID_ARGUMENT,
+                              "cufftPlan only takes rank 1, 2, or 3."};
+      }
+      // TODO(yangzihao): refactor this code and the one with the same function
+      // in the batch mode.
+      if (size_in_bytes != 0) {
+        auto allocated =
+            scratch_allocator->AllocateBytes(stream, size_in_bytes);
+        if (!allocated.ok() || (scratch_ = allocated.ValueOrDie()) == nullptr) {
+          LOG(ERROR) << "failed to allocate work area.";
+          return allocated.status();
+        }
+      }
+      // Connect work area with allocated space.
+      ret = wrap::cufftSetWorkArea(parent, plan_, scratch_.opaque());
+      if (ret != CUFFT_SUCCESS) {
+        LOG(ERROR) << "failed to set work area for cuFFT plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to set work area for cuFFT plan."};
+      }
+      return port::Status::OK();
+    }
+  } else {
+    // For either multiple batches or rank higher than 3, use cufftPlanMany().
+    if (scratch_allocator == nullptr) {
+      auto ret = wrap::cufftPlanMany(
+          parent, &plan_, rank, elem_count_,
+          input_embed ? input_embed_ : nullptr, input_stride, input_distance,
+          output_embed ? output_embed_ : nullptr, output_stride,
+          output_distance, CUDAFftType(type), batch_count);
+      if (ret != CUFFT_SUCCESS) {
+        LOG(ERROR) << "failed to create cuFFT batched plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to create cuFFT bacthed plan."};
+      }
+    } else {
+      auto ret = wrap::cufftCreate(parent, &plan_);
+      if (ret != CUFFT_SUCCESS) {
+        LOG(ERROR) << "failed to create cuFFT batched plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to create cuFFT bacthed plan."};
+      }
+      ret = wrap::cufftSetAutoAllocation(parent, plan_, 0);
+      if (ret != CUFFT_SUCCESS) {
+        LOG(ERROR) << "failed to set auto allocation for cuFFT batched plan:"
+                   << ret;
+        return port::Status{
+            port::error::INTERNAL,
+            "Failed to set auto allocation for cuFFT bacthed plan."};
+      }
+      size_t size_in_bytes;
+      ret = wrap::cufftMakePlanMany(
+          parent, plan_, rank, elem_count_,
+          input_embed ? input_embed_ : nullptr, input_stride, input_distance,
+          output_embed ? output_embed_ : nullptr, output_stride,
+          output_distance, CUDAFftType(type), batch_count, &size_in_bytes);
+      if (ret != CUFFT_SUCCESS) {
+        LOG(ERROR) << "failed to make cuFFT batched plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to make cuFFT bacthed plan."};
+      }
+      if (size_in_bytes != 0) {
+        auto allocated =
+            scratch_allocator->AllocateBytes(stream, size_in_bytes);
+        if (!allocated.ok() || (scratch_ = allocated.ValueOrDie()) == nullptr) {
+          LOG(ERROR) << "failed to allocate work area.";
+          return allocated.status();
+        }
+      }
+      // Connect work area with allocated space.
+      ret = wrap::cufftSetWorkArea(parent, plan_, scratch_.opaque());
+      if (ret != CUFFT_SUCCESS) {
+        LOG(ERROR) << "failed to set work area for cuFFT batched plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to set work area for cuFFT bacthed plan."};
+      }
+    }
   }
+  return port::Status::OK();
+}
+
+port::Status CUDAFftPlan::Initialize(CUDAExecutor *parent, Stream *stream,
+                                     int rank, uint64 *elem_count,
+                                     fft::Type type,
+                                     ScratchAllocator *scratch_allocator) {
+  return Initialize(parent_, stream, rank, elem_count,
+                    /*input_embed=*/nullptr, /*input_stride=*/0,
+                    /*input_distance=*/0,
+                    /*output_embed=*/nullptr, /*output_stride=*/0,
+                    /*output_distance=*/0, type, 1, scratch_allocator);
 }
 
 CUDAFftPlan::~CUDAFftPlan() { wrap::cufftDestroy(parent_, plan_); }
 
 int CUDAFftPlan::GetFftDirection() const {
-  switch (fft_type_) {
-    case fft::Type::kC2CForward:
-    case fft::Type::kZ2ZForward:
-    case fft::Type::kR2C:
-    case fft::Type::kD2Z:
-      return CUFFT_FORWARD;
-    case fft::Type::kC2CInverse:
-    case fft::Type::kZ2ZInverse:
-    case fft::Type::kC2R:
-    case fft::Type::kZ2D:
-      return CUFFT_INVERSE;
-    default:
-      LOG(FATAL) << "Invalid value of fft::Type.";
+  if (!IsInitialized()) {
+    LOG(FATAL) << "Try to get fft direction before initialization.";
+  } else {
+    switch (fft_type_) {
+      case fft::Type::kC2CForward:
+      case fft::Type::kZ2ZForward:
+      case fft::Type::kR2C:
+      case fft::Type::kD2Z:
+        return CUFFT_FORWARD;
+      case fft::Type::kC2CInverse:
+      case fft::Type::kZ2ZInverse:
+      case fft::Type::kC2R:
+      case fft::Type::kZ2D:
+        return CUFFT_INVERSE;
+      default:
+        LOG(FATAL) << "Invalid value of fft::Type.";
+    }
   }
 }
 
 std::unique_ptr<fft::Plan> CUDAFft::Create1dPlan(Stream *stream, uint64 num_x,
                                                  fft::Type type,
                                                  bool in_place_fft) {
-  std::unique_ptr<fft::Plan> plan{new CUDAFftPlan(parent_, num_x, type)};
-  return plan;
+  std::unique_ptr<CUDAFftPlan> fft_plan_ptr{new CUDAFftPlan()};
+  uint64 elem_count[1] = {num_x};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, 1, elem_count, type, /*scratch_allocator=*/nullptr);
+  // TODO(yangzihao): In the future, send error msg back to TensorFlow
+  // so it can fail gracefully,
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize cufft 1d plan: "
+               << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> CUDAFft::Create1dPlanWithScratchAllocator(
+    Stream *stream, uint64 num_x, fft::Type type, bool in_place_fft,
+    ScratchAllocator *scratch_allocator) {
+  std::unique_ptr<CUDAFftPlan> fft_plan_ptr{new CUDAFftPlan()};
+  uint64 elem_count[1] = {num_x};
+  port::Status status = fft_plan_ptr->Initialize(parent_, stream, 1, elem_count,
+                                                 type, scratch_allocator);
+  if (!status.ok()) {
+    LOG(FATAL)
+        << "failed to initialize cufft 1d plan with customized allocator: "
+        << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
 }
 
 std::unique_ptr<fft::Plan> CUDAFft::Create2dPlan(Stream *stream, uint64 num_x,
                                                  uint64 num_y, fft::Type type,
                                                  bool in_place_fft) {
-  std::unique_ptr<fft::Plan> plan{new CUDAFftPlan(parent_, num_x, num_y, type)};
-  return plan;
+  std::unique_ptr<CUDAFftPlan> fft_plan_ptr{new CUDAFftPlan()};
+  uint64 elem_count[2] = {num_x, num_y};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, 1, elem_count, type, /*scratch_allocator=*/nullptr);
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize cufft 2d plan: "
+               << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> CUDAFft::Create2dPlanWithScratchAllocator(
+    Stream *stream, uint64 num_x, uint64 num_y, fft::Type type,
+    bool in_place_fft, ScratchAllocator *scratch_allocator) {
+  std::unique_ptr<CUDAFftPlan> fft_plan_ptr{new CUDAFftPlan()};
+  uint64 elem_count[2] = {num_x, num_y};
+  port::Status status = fft_plan_ptr->Initialize(parent_, stream, 2, elem_count,
+                                                 type, scratch_allocator);
+  if (!status.ok()) {
+    LOG(FATAL)
+        << "failed to initialize cufft 2d plan with customized allocator: "
+        << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
 }
 
 std::unique_ptr<fft::Plan> CUDAFft::Create3dPlan(Stream *stream, uint64 num_x,
                                                  uint64 num_y, uint64 num_z,
                                                  fft::Type type,
                                                  bool in_place_fft) {
-  std::unique_ptr<fft::Plan> plan{
-      new CUDAFftPlan(parent_, num_x, num_y, num_z, type)};
-  return plan;
+  std::unique_ptr<CUDAFftPlan> fft_plan_ptr{new CUDAFftPlan()};
+  uint64 elem_count[3] = {num_x, num_y, num_z};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, 3, elem_count, type, /*scratch_allocator=*/nullptr);
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize cufft 3d plan: "
+               << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> CUDAFft::Create3dPlanWithScratchAllocator(
+    Stream *stream, uint64 num_x, uint64 num_y, uint64 num_z, fft::Type type,
+    bool in_place_fft, ScratchAllocator *scratch_allocator) {
+  std::unique_ptr<CUDAFftPlan> fft_plan_ptr{new CUDAFftPlan()};
+  uint64 elem_count[3] = {num_x, num_y, num_z};
+  port::Status status = fft_plan_ptr->Initialize(parent_, stream, 3, elem_count,
+                                                 type, scratch_allocator);
+  if (!status.ok()) {
+    LOG(FATAL)
+        << "failed to initialize cufft 3d plan with customized allocator: "
+        << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
 }
 
 std::unique_ptr<fft::Plan> CUDAFft::CreateBatchedPlan(
@@ -200,10 +430,35 @@ std::unique_ptr<fft::Plan> CUDAFft::CreateBatchedPlan(
     uint64 input_stride, uint64 input_distance, uint64 *output_embed,
     uint64 output_stride, uint64 output_distance, fft::Type type,
     bool in_place_fft, int batch_count) {
-  std::unique_ptr<fft::Plan> plan{new CUDAFftPlan(
-      parent_, rank, elem_count, input_embed, input_stride, input_distance,
-      output_embed, output_stride, output_distance, type, batch_count)};
-  return plan;
+  std::unique_ptr<CUDAFftPlan> fft_plan_ptr{new CUDAFftPlan()};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, rank, elem_count, input_embed, input_stride,
+      input_distance, output_embed, output_stride, output_distance, type,
+      batch_count, /*scratch_allocator=*/nullptr);
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize batched cufft plan: "
+               << status.error_message();
+  }
+
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> CUDAFft::CreateBatchedPlanWithScratchAllocator(
+    Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed,
+    uint64 input_stride, uint64 input_distance, uint64 *output_embed,
+    uint64 output_stride, uint64 output_distance, fft::Type type,
+    bool in_place_fft, int batch_count, ScratchAllocator *scratch_allocator) {
+  std::unique_ptr<CUDAFftPlan> fft_plan_ptr{new CUDAFftPlan()};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, rank, elem_count, input_embed, input_stride,
+      input_distance, output_embed, output_stride, output_distance, type,
+      batch_count, scratch_allocator);
+  if (!status.ok()) {
+    LOG(FATAL)
+        << "failed to initialize batched cufft plan with customized allocator: "
+        << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
 }
 
 template <typename FuncT, typename InputT, typename OutputT>
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.h b/tensorflow/stream_executor/cuda/cuda_fft.h
index 95b3e8de63d572bfdc27124c17639b45e909000b..16102eb945a11d7083ebcfe29796b3fb5aa15a9c 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.h
+++ b/tensorflow/stream_executor/cuda/cuda_fft.h
@@ -20,10 +20,11 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_
 
+#include "cuda/include/cufft.h"
 #include "tensorflow/stream_executor/fft.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
-#include "cuda/include/cufft.h"
+#include "tensorflow/stream_executor/scratch_allocator.h"
 
 namespace perftools {
 namespace gputools {
@@ -37,30 +38,53 @@ class CUDAExecutor;
 // Opaque and unique indentifier for the cuFFT plugin.
 extern const PluginId kCuFftPlugin;
 
+// CUDAFftPlan uses deferred initialization. Only a single call of
+// Initialize() is allowed to properly create cufft plan and set member
+// variable is_initialized_ to true. Newly added interface that uses member
+// variables should first check is_initialized_ to make sure that the values of
+// member variables are valid.
 class CUDAFftPlan : public fft::Plan {
  public:
-  // Constructor creating 1d FFT plan.
-  CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, fft::Type type);
-  // Constructor creating 2d FFT plan.
-  CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y, fft::Type type);
-  // Constructor creating 3d FFT plan.
-  CUDAFftPlan(CUDAExecutor *parent, uint64 num_x, uint64 num_y, uint64 num_z,
-              fft::Type type);
-  // Constructor creating batched FFT plan.
-  CUDAFftPlan(CUDAExecutor *parent, int rank, uint64 *elem_count,
-              uint64 *input_embed, uint64 input_stride, uint64 input_distance,
-              uint64 *output_embed, uint64 output_stride,
-              uint64 output_distance, fft::Type type, int batch_count);
+  CUDAFftPlan()
+      : parent_(nullptr),
+        plan_(-1),
+        fft_type_(fft::Type::kInvalid),
+        scratch_(nullptr),
+        is_initialized_(false) {}
   ~CUDAFftPlan() override;
 
   // Get FFT direction in cuFFT based on FFT type.
   int GetFftDirection() const;
-  cufftHandle GetPlan() const { return plan_; }
+  cufftHandle GetPlan() const {
+    if (IsInitialized()) {
+      return plan_;
+    } else {
+      LOG(FATAL) << "Try to get cufftHandle value before initialization.";
+    }
+  }
+
+  // Initialize function for batched plan
+  port::Status Initialize(CUDAExecutor *parent, Stream *stream, int rank,
+                          uint64 *elem_count, uint64 *input_embed,
+                          uint64 input_stride, uint64 input_distance,
+                          uint64 *output_embed, uint64 output_stride,
+                          uint64 output_distance, fft::Type type,
+                          int batch_count, ScratchAllocator *scratch_allocator);
+
+  // Initialize function for 1d,2d, and 3d plan
+  port::Status Initialize(CUDAExecutor *parent, Stream *stream, int rank,
+                          uint64 *elem_count, fft::Type type,
+                          ScratchAllocator *scratch_allocator);
+
+ protected:
+  bool IsInitialized() const { return is_initialized_; }
 
  private:
   CUDAExecutor *parent_;
   cufftHandle plan_;
   fft::Type fft_type_;
+  DeviceMemory<uint8> scratch_;
+  bool is_initialized_;
 };
 
 // FFT support for CUDA platform via cuFFT library.
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 8c8ac8662d18ef295d3be4183c5f60cebaa884f8..f97deb72227f07d256301c6063198b714f2e8be1 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -1902,6 +1902,25 @@ class DnnSupport {
     return false;
   }
 
+  virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+                            const dnn::RnnSequenceTensorDescriptor& input_desc,
+                            const DeviceMemory<double>& input_data,
+                            const dnn::RnnStateTensorDescriptor& input_h_desc,
+                            const DeviceMemory<double>& input_h_data,
+                            const dnn::RnnStateTensorDescriptor& input_c_desc,
+                            const DeviceMemory<double>& input_c_data,
+                            const DeviceMemory<double>& params,
+                            const dnn::RnnSequenceTensorDescriptor& output_desc,
+                            DeviceMemory<double>* output_data,
+                            const dnn::RnnStateTensorDescriptor& output_h_desc,
+                            DeviceMemory<double>* output_h_data,
+                            const dnn::RnnStateTensorDescriptor& output_c_desc,
+                            DeviceMemory<double>* output_c_data,
+                            bool is_training,
+                            ScratchAllocator* reserve_space_allocator,
+                            ScratchAllocator* workspace_allocator) {
+    return false;
+  }
   // Enqueue a backward operation of the RNN model onto the stream.
   //
   // Arguments:
@@ -1970,6 +1989,33 @@ class DnnSupport {
     return false;
   }
 
+  virtual bool DoRnnBackward(
+      Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+      const dnn::RnnSequenceTensorDescriptor& input_desc,
+      const DeviceMemory<double>& input_data,
+      const dnn::RnnStateTensorDescriptor& input_h_desc,
+      const DeviceMemory<double>& input_h_data,
+      const dnn::RnnStateTensorDescriptor& input_c_desc,
+      const DeviceMemory<double>& input_c_data,
+      const DeviceMemory<double>& params,
+      const dnn::RnnSequenceTensorDescriptor& output_desc,
+      const DeviceMemory<double>& output_data,
+      const dnn::RnnStateTensorDescriptor& output_h_desc,
+      const DeviceMemory<double>& output_h_data,
+      const dnn::RnnStateTensorDescriptor& output_c_desc,
+      const DeviceMemory<double>& output_c_data,
+      const DeviceMemory<double>& output_backprop_data,
+      const DeviceMemory<double>& output_h_backprop_data,
+      const DeviceMemory<double>& output_c_backprop_data,
+      DeviceMemory<double>* input_backprop_data,
+      DeviceMemory<double>* input_h_backprop_data,
+      DeviceMemory<double>* input_c_backprop_data,
+      DeviceMemory<double>* params_backprop_data,
+      DeviceMemory<uint8>* reserve_space_data,
+      ScratchAllocator* workspace_allocator) {
+    return false;
+  }
+
   // Transforms a tensor into another tensor with a different layout and/or data
   // type.
   //
@@ -1980,13 +2026,14 @@ class DnnSupport {
   //  input_data: the device memory region that contains the input tensor.
   //  output_desc: specifies the shape and the data layout of the output tensor.
   //  output_type: the data type of the output tensor.
+  //  scale: an element-wise scaling factor to apply.
   //  output_data: the device memory region that contains the output tensor.
   virtual bool DoTransformTensor(Stream* stream,
                                  const dnn::BatchDescriptor& input_desc,
                                  dnn::DataType input_type,
                                  const DeviceMemoryBase& input_data,
                                  const dnn::BatchDescriptor& output_desc,
-                                 dnn::DataType output_type,
+                                 dnn::DataType output_type, float scale,
                                  DeviceMemoryBase* output_data) {
     return false;
   }
diff --git a/tensorflow/stream_executor/fft.h b/tensorflow/stream_executor/fft.h
index 6e921d142b603525391caea915492ab2a4df4f4d..98cd77e2062bef45dd46e73ac29782eb12591e64 100644
--- a/tensorflow/stream_executor/fft.h
+++ b/tensorflow/stream_executor/fft.h
@@ -54,12 +54,14 @@ namespace gputools {
 class Stream;
 template <typename ElemT>
 class DeviceMemory;
+class ScratchAllocator;
 
 namespace fft {
 
 // Specifies FFT input and output types, and the direction.
 // R, D, C, and Z stand for SP real, DP real, SP complex, and DP complex.
 enum class Type {
+  kInvalid,
   kC2CForward,
   kC2CInverse,
   kC2R,
@@ -103,6 +105,21 @@ class FftSupport {
                                              uint64 num_y, uint64 num_z,
                                              Type type, bool in_place_fft) = 0;
 
+  // Creates a 1d FFT plan with scratch allocator.
+  virtual std::unique_ptr<Plan> Create1dPlanWithScratchAllocator(
+      Stream *stream, uint64 num_x, Type type, bool in_place_fft,
+      ScratchAllocator *scratch_allocator) = 0;
+
+  // Creates a 2d FFT plan with scratch allocator.
+  virtual std::unique_ptr<Plan> Create2dPlanWithScratchAllocator(
+      Stream *stream, uint64 num_x, uint64 num_y, Type type, bool in_place_fft,
+      ScratchAllocator *scratch_allocator) = 0;
+
+  // Creates a 3d FFT plan with scratch allocator.
+  virtual std::unique_ptr<Plan> Create3dPlanWithScratchAllocator(
+      Stream *stream, uint64 num_x, uint64 num_y, uint64 num_z, Type type,
+      bool in_place_fft, ScratchAllocator *scratch_allocator) = 0;
+
   // Creates a batched FFT plan.
   //
   // stream:          The GPU stream in which the FFT runs.
@@ -126,6 +143,30 @@ class FftSupport {
       uint64 output_stride, uint64 output_distance, Type type,
       bool in_place_fft, int batch_count) = 0;
 
+  // Creates a batched FFT plan with scratch allocator.
+  //
+  // stream:          The GPU stream in which the FFT runs.
+  // rank:            Dimensionality of the transform (1, 2, or 3).
+  // elem_count:      Array of size rank, describing the size of each dimension.
+  // input_embed, output_embed:
+  //                  Pointer of size rank that indicates the storage dimensions
+  //                  of the input/output data in memory. If set to null_ptr all
+  //                  other advanced data layout parameters are ignored.
+  // input_stride:    Indicates the distance (number of elements; same below)
+  //                  between two successive input elements.
+  // input_distance:  Indicates the distance between the first element of two
+  //                  consecutive signals in a batch of the input data.
+  // output_stride:   Indicates the distance between two successive output
+  //                  elements.
+  // output_distance: Indicates the distance between the first element of two
+  //                  consecutive signals in a batch of the output data.
+  virtual std::unique_ptr<Plan> CreateBatchedPlanWithScratchAllocator(
+      Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed,
+      uint64 input_stride, uint64 input_distance, uint64 *output_embed,
+      uint64 output_stride, uint64 output_distance, Type type,
+      bool in_place_fft, int batch_count,
+      ScratchAllocator *scratch_allocator) = 0;
+
   // Computes complex-to-complex FFT in the transform direction as specified
   // by direction parameter.
   virtual bool DoFft(Stream *stream, Plan *plan,
@@ -161,38 +202,54 @@ class FftSupport {
 // Macro used to quickly declare overrides for abstract virtuals in the
 // fft::FftSupport base class. Assumes that it's emitted somewhere inside the
 // ::perftools::gputools namespace.
-#define TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES                \
-  std::unique_ptr<fft::Plan> Create1dPlan(Stream *stream, uint64 num_x,      \
-                                          fft::Type type, bool in_place_fft) \
-      override;                                                              \
-  std::unique_ptr<fft::Plan> Create2dPlan(Stream *stream, uint64 num_x,      \
-                                          uint64 num_y, fft::Type type,      \
-                                          bool in_place_fft) override;       \
-  std::unique_ptr<fft::Plan> Create3dPlan(                                   \
-      Stream *stream, uint64 num_x, uint64 num_y, uint64 num_z,              \
-      fft::Type type, bool in_place_fft) override;                           \
-  std::unique_ptr<fft::Plan> CreateBatchedPlan(                              \
-      Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed,     \
-      uint64 input_stride, uint64 input_distance, uint64 *output_embed,      \
-      uint64 output_stride, uint64 output_distance, fft::Type type,          \
-      bool in_place_fft, int batch_count) override;                          \
-  bool DoFft(Stream *stream, fft::Plan *plan,                                \
-             const DeviceMemory<std::complex<float>> &input,                 \
-             DeviceMemory<std::complex<float>> *output) override;            \
-  bool DoFft(Stream *stream, fft::Plan *plan,                                \
-             const DeviceMemory<std::complex<double>> &input,                \
-             DeviceMemory<std::complex<double>> *output) override;           \
-  bool DoFft(Stream *stream, fft::Plan *plan,                                \
-             const DeviceMemory<float> &input,                               \
-             DeviceMemory<std::complex<float>> *output) override;            \
-  bool DoFft(Stream *stream, fft::Plan *plan,                                \
-             const DeviceMemory<double> &input,                              \
-             DeviceMemory<std::complex<double>> *output) override;           \
-  bool DoFft(Stream *stream, fft::Plan *plan,                                \
-             const DeviceMemory<std::complex<float>> &input,                 \
-             DeviceMemory<float> *output) override;                          \
-  bool DoFft(Stream *stream, fft::Plan *plan,                                \
-             const DeviceMemory<std::complex<double>> &input,                \
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES                   \
+  std::unique_ptr<fft::Plan> Create1dPlan(Stream *stream, uint64 num_x,        \
+                                          fft::Type type, bool in_place_fft)   \
+      override;                                                                \
+  std::unique_ptr<fft::Plan> Create2dPlan(Stream *stream, uint64 num_x,        \
+                                          uint64 num_y, fft::Type type,        \
+                                          bool in_place_fft) override;         \
+  std::unique_ptr<fft::Plan> Create3dPlan(                                     \
+      Stream *stream, uint64 num_x, uint64 num_y, uint64 num_z,                \
+      fft::Type type, bool in_place_fft) override;                             \
+  std::unique_ptr<fft::Plan> Create1dPlanWithScratchAllocator(                 \
+      Stream *stream, uint64 num_x, fft::Type type, bool in_place_fft,         \
+      ScratchAllocator *scratch_allocator) override;                           \
+  std::unique_ptr<fft::Plan> Create2dPlanWithScratchAllocator(                 \
+      Stream *stream, uint64 num_x, uint64 num_y, fft::Type type,              \
+      bool in_place_fft, ScratchAllocator *scratch_allocator) override;        \
+  std::unique_ptr<fft::Plan> Create3dPlanWithScratchAllocator(                 \
+      Stream *stream, uint64 num_x, uint64 num_y, uint64 num_z,                \
+      fft::Type type, bool in_place_fft, ScratchAllocator *scratch_allocator)  \
+      override;                                                                \
+  std::unique_ptr<fft::Plan> CreateBatchedPlan(                                \
+      Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed,       \
+      uint64 input_stride, uint64 input_distance, uint64 *output_embed,        \
+      uint64 output_stride, uint64 output_distance, fft::Type type,            \
+      bool in_place_fft, int batch_count) override;                            \
+  std::unique_ptr<fft::Plan> CreateBatchedPlanWithScratchAllocator(            \
+      Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed,       \
+      uint64 input_stride, uint64 input_distance, uint64 *output_embed,        \
+      uint64 output_stride, uint64 output_distance, fft::Type type,            \
+      bool in_place_fft, int batch_count, ScratchAllocator *scratch_allocator) \
+      override;                                                                \
+  bool DoFft(Stream *stream, fft::Plan *plan,                                  \
+             const DeviceMemory<std::complex<float>> &input,                   \
+             DeviceMemory<std::complex<float>> *output) override;              \
+  bool DoFft(Stream *stream, fft::Plan *plan,                                  \
+             const DeviceMemory<std::complex<double>> &input,                  \
+             DeviceMemory<std::complex<double>> *output) override;             \
+  bool DoFft(Stream *stream, fft::Plan *plan,                                  \
+             const DeviceMemory<float> &input,                                 \
+             DeviceMemory<std::complex<float>> *output) override;              \
+  bool DoFft(Stream *stream, fft::Plan *plan,                                  \
+             const DeviceMemory<double> &input,                                \
+             DeviceMemory<std::complex<double>> *output) override;             \
+  bool DoFft(Stream *stream, fft::Plan *plan,                                  \
+             const DeviceMemory<std::complex<float>> &input,                   \
+             DeviceMemory<float> *output) override;                            \
+  bool DoFft(Stream *stream, fft::Plan *plan,                                  \
+             const DeviceMemory<std::complex<double>> &input,                  \
              DeviceMemory<double> *output) override;
 
 }  // namespace fft
diff --git a/tensorflow/stream_executor/lib/status_macros.h b/tensorflow/stream_executor/lib/status_macros.h
index 816132e85a62dd9bfa609d1932c1fc44b39034fd..ff8f4a71c8d70288348f4041c64c0534e6d91acd 100644
--- a/tensorflow/stream_executor/lib/status_macros.h
+++ b/tensorflow/stream_executor/lib/status_macros.h
@@ -40,7 +40,7 @@ limitations under the License.
   if (!__name.ok()) {                                  \
     return __name.status();                            \
   }                                                    \
-  __lhs = __name.ConsumeValueOrDie();
+  __lhs = std::move(__name.ValueOrDie());
 
 // Early-returns the status if it is in error; otherwise, assigns the
 // right-hand-side expression to the left-hand-side expression.
diff --git a/tensorflow/stream_executor/lib/statusor.h b/tensorflow/stream_executor/lib/statusor.h
index e06550009a9cefa2e52f6939a3da55bd5379930c..138738ecab54986fd7d5cd76839d59da55623b1f 100644
--- a/tensorflow/stream_executor/lib/statusor.h
+++ b/tensorflow/stream_executor/lib/statusor.h
@@ -49,7 +49,7 @@ limitations under the License.
 //
 //  StatusOr<std::unique_ptr<Foo>> result = FooFactory::MakeNewFoo(arg);
 //  if (result.ok()) {
-//    std::unique_ptr<Foo> foo = result.ConsumeValueOrDie();
+//    std::unique_ptr<Foo> foo = std::move(result.ValueOrDie());
 //    foo->DoSomethingCool();
 //  } else {
 //    LOG(ERROR) << result.status();
@@ -163,6 +163,7 @@ class StatusOr {
   // If you need to initialize a T object from the stored value,
   // ConsumeValueOrDie() may be more efficient.
   const T& ValueOrDie() const;
+  T& ValueOrDie();
 
   // Returns our current value, requires this->ok(). Use this if
   // you would otherwise want to say std::move(s.ValueOrDie()), for example
@@ -206,6 +207,12 @@ const T& StatusOr<T>::ValueOrDie() const {
   return value_;
 }
 
+template <typename T>
+T& StatusOr<T>::ValueOrDie() {
+  TF_CHECK_OK(status_);
+  return value_;
+}
+
 template <typename T>
 T StatusOr<T>::ConsumeValueOrDie() {
   TF_CHECK_OK(status_);
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index fff9accae3243879889015a09b8e0b56310f6214..c9b36ba7ab35df1229d04b2ef5f73edeaa2e3c1f 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -59,6 +59,10 @@ string ToVlogString(dnn::ActivationMode mode) {
   return dnn::ActivationModeString(mode);
 }
 
+string ToVlogString(const dnn::AlgorithmConfig &algo_config) {
+  return algo_config.ToString();
+}
+
 string ToVlogString(dnn::ElementwiseOperation op) {
   return dnn::ElementwiseOperationString(op);
 }
@@ -482,7 +486,8 @@ Stream &Stream::ThenConvolveWithAlgorithm(
   VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
             PARAM(filter_descriptor), PARAM(filter_data),
             PARAM(convolution_descriptor), PARAM(biases),
-            PARAM(activation_mode), PARAM(output_descriptor), PARAM(output));
+            PARAM(activation_mode), PARAM(output_descriptor), PARAM(output),
+            PARAM(algorithm_config));
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -515,7 +520,8 @@ Stream &Stream::ThenConvolveWithAlgorithm(
   VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
             PARAM(filter_descriptor), PARAM(filter_data),
             PARAM(convolution_descriptor), PARAM(biases),
-            PARAM(activation_mode), PARAM(output_descriptor), PARAM(output));
+            PARAM(activation_mode), PARAM(output_descriptor), PARAM(output),
+            PARAM(algorithm_config));
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -546,7 +552,7 @@ Stream &Stream::ThenConvolveWithAlgorithm(
   VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
             PARAM(filter_descriptor), PARAM(filter_data),
             PARAM(convolution_descriptor), PARAM(output_descriptor),
-            PARAM(output));
+            PARAM(output), PARAM(algorithm_config));
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -577,7 +583,7 @@ Stream &Stream::ThenConvolveWithAlgorithm(
   VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
             PARAM(filter_descriptor), PARAM(filter_data),
             PARAM(convolution_descriptor), PARAM(output_descriptor),
-            PARAM(output));
+            PARAM(output), PARAM(algorithm_config));
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -3452,6 +3458,184 @@ struct ThenBlasWithProfileImpl {
 };
 }  // anonymous namespace
 
+Stream &Stream::ThenBlasGemvWithProfiling(
+    blas::Transpose trans, uint64 m, uint64 n, float alpha,
+    const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x,
+    int incx, float beta, DeviceMemory<float> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a),
+            PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y),
+            PARAM(incy));
+
+  ThenBlasWithProfileImpl<
+      blas::Transpose, uint64, uint64, float, const DeviceMemory<float> &, int,
+      const DeviceMemory<float> &, int, float, DeviceMemory<float> *, int>
+      impl;
+  return impl(this, &blas::BlasSupport::DoBlasGemvWithProfiling, trans, m, n,
+              alpha, a, lda, x, incx, beta, y, incy, output_profile_result);
+}
+
+Stream &Stream::ThenBlasGemvWithProfiling(
+    blas::Transpose trans, uint64 m, uint64 n, double alpha,
+    const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x,
+    int incx, double beta, DeviceMemory<double> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a),
+            PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y),
+            PARAM(incy));
+
+  ThenBlasWithProfileImpl<blas::Transpose, uint64, uint64, double,
+                          const DeviceMemory<double> &, int,
+                          const DeviceMemory<double> &, int, double,
+                          DeviceMemory<double> *, int>
+      impl;
+  return impl(this, &blas::BlasSupport::DoBlasGemvWithProfiling, trans, m, n,
+              alpha, a, lda, x, incx, beta, y, incy, output_profile_result);
+}
+
+Stream &Stream::ThenBlasGemvWithProfiling(
+    blas::Transpose trans, uint64 m, uint64 n, std::complex<float> alpha,
+    const DeviceMemory<std::complex<float>> &a, int lda,
+    const DeviceMemory<std::complex<float>> &x, int incx,
+    std::complex<float> beta, DeviceMemory<std::complex<float>> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a),
+            PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y),
+            PARAM(incy));
+
+  ThenBlasWithProfileImpl<blas::Transpose, uint64, uint64, std::complex<float>,
+                          const DeviceMemory<std::complex<float>> &, int,
+                          const DeviceMemory<std::complex<float>> &, int,
+                          std::complex<float>,
+                          DeviceMemory<std::complex<float>> *, int>
+      impl;
+  return impl(this, &blas::BlasSupport::DoBlasGemvWithProfiling, trans, m, n,
+              alpha, a, lda, x, incx, beta, y, incy, output_profile_result);
+}
+
+Stream &Stream::ThenBlasGemvWithProfiling(
+    blas::Transpose trans, uint64 m, uint64 n, std::complex<double> alpha,
+    const DeviceMemory<std::complex<double>> &a, int lda,
+    const DeviceMemory<std::complex<double>> &x, int incx,
+    std::complex<double> beta, DeviceMemory<std::complex<double>> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  VLOG_CALL(PARAM(trans), PARAM(m), PARAM(n), PARAM(alpha), PARAM(a),
+            PARAM(lda), PARAM(x), PARAM(incx), PARAM(beta), PARAM(y),
+            PARAM(incy));
+
+  ThenBlasWithProfileImpl<blas::Transpose, uint64, uint64, std::complex<double>,
+                          const DeviceMemory<std::complex<double>> &, int,
+                          const DeviceMemory<std::complex<double>> &, int,
+                          std::complex<double>,
+                          DeviceMemory<std::complex<double>> *, int>
+      impl;
+  return impl(this, &blas::BlasSupport::DoBlasGemvWithProfiling, trans, m, n,
+              alpha, a, lda, x, incx, beta, y, incy, output_profile_result);
+}
+
+Stream &Stream::ThenBlasGemmWithProfiling(
+    blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+    uint64 k, float alpha, const DeviceMemory<Eigen::half> &a, int lda,
+    const DeviceMemory<Eigen::half> &b, int ldb, float beta,
+    DeviceMemory<Eigen::half> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
+            PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
+            PARAM(beta), PARAM(c), PARAM(ldc));
+
+  ThenBlasWithProfileImpl<blas::Transpose, blas::Transpose, uint64, uint64,
+                          uint64, float, const DeviceMemory<Eigen::half> &, int,
+                          const DeviceMemory<Eigen::half> &, int, float,
+                          DeviceMemory<Eigen::half> *, int>
+      impl;
+  return impl(this, &blas::BlasSupport::DoBlasGemmWithProfiling, transa, transb,
+              m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+              output_profile_result);
+}
+
+Stream &Stream::ThenBlasGemmWithProfiling(
+    blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+    uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
+    const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
+    int ldc, blas::ProfileResult *output_profile_result) {
+  VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
+            PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
+            PARAM(beta), PARAM(c), PARAM(ldc));
+
+  ThenBlasWithProfileImpl<blas::Transpose, blas::Transpose, uint64, uint64,
+                          uint64, float, const DeviceMemory<float> &, int,
+                          const DeviceMemory<float> &, int, float,
+                          DeviceMemory<float> *, int>
+      impl;
+  return impl(this, &blas::BlasSupport::DoBlasGemmWithProfiling, transa, transb,
+              m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+              output_profile_result);
+}
+
+Stream &Stream::ThenBlasGemmWithProfiling(
+    blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+    uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
+    const DeviceMemory<double> &b, int ldb, double beta,
+    DeviceMemory<double> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
+            PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
+            PARAM(beta), PARAM(c), PARAM(ldc));
+
+  ThenBlasWithProfileImpl<blas::Transpose, blas::Transpose, uint64, uint64,
+                          uint64, double, const DeviceMemory<double> &, int,
+                          const DeviceMemory<double> &, int, double,
+                          DeviceMemory<double> *, int>
+      impl;
+  return impl(this, &blas::BlasSupport::DoBlasGemmWithProfiling, transa, transb,
+              m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+              output_profile_result);
+}
+
+Stream &Stream::ThenBlasGemmWithProfiling(
+    blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+    uint64 k, std::complex<float> alpha,
+    const DeviceMemory<std::complex<float>> &a, int lda,
+    const DeviceMemory<std::complex<float>> &b, int ldb,
+    std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
+            PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
+            PARAM(beta), PARAM(c), PARAM(ldc));
+
+  ThenBlasWithProfileImpl<
+      blas::Transpose, blas::Transpose, uint64, uint64, uint64,
+      std::complex<float>, const DeviceMemory<std::complex<float>> &, int,
+      const DeviceMemory<std::complex<float>> &, int, std::complex<float>,
+      DeviceMemory<std::complex<float>> *, int>
+      impl;
+  return impl(this, &blas::BlasSupport::DoBlasGemmWithProfiling, transa, transb,
+              m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+              output_profile_result);
+}
+
+Stream &Stream::ThenBlasGemmWithProfiling(
+    blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+    uint64 k, std::complex<double> alpha,
+    const DeviceMemory<std::complex<double>> &a, int lda,
+    const DeviceMemory<std::complex<double>> &b, int ldb,
+    std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
+            PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
+            PARAM(beta), PARAM(c), PARAM(ldc));
+
+  ThenBlasWithProfileImpl<
+      blas::Transpose, blas::Transpose, uint64, uint64, uint64,
+      std::complex<double>, const DeviceMemory<std::complex<double>> &, int,
+      const DeviceMemory<std::complex<double>> &, int, std::complex<double>,
+      DeviceMemory<std::complex<double>> *, int>
+      impl;
+  return impl(this, &blas::BlasSupport::DoBlasGemmWithProfiling, transa, transb,
+              m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+              output_profile_result);
+}
+
 Stream &Stream::ThenBlasGemmWithAlgorithm(
     blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
     uint64 k, const Eigen::half &alpha, const DeviceMemory<Eigen::half> &a,
@@ -3476,6 +3660,27 @@ Stream &Stream::ThenBlasGemmWithAlgorithm(
               algorithm, output_profile_result);
 }
 
+Stream &Stream::ThenBlasGemmWithAlgorithm(
+    blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+    uint64 k, int alpha, const DeviceMemory<int8> &a, int lda,
+    const DeviceMemory<int8> &b, int ldb, int beta, DeviceMemory<int> *c,
+    int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+  VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
+            PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
+            PARAM(beta), PARAM(c), PARAM(ldc), PARAM(computation_type),
+            PARAM(algorithm));
+
+  ThenBlasWithProfileImpl<
+      blas::Transpose, blas::Transpose, uint64, uint64, uint64, int,
+      const DeviceMemory<int8> &, int, const DeviceMemory<int8> &, int, int,
+      DeviceMemory<int> *, int, blas::ComputationType, blas::AlgorithmType>
+      impl;
+  return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb,
+              m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, computation_type,
+              algorithm, output_profile_result);
+}
+
 Stream &Stream::ThenBlasGemmWithAlgorithm(
     blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
     uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
@@ -4361,6 +4566,39 @@ Stream &Stream::ThenRnnForward(
   return *this;
 }
 
+Stream &Stream::ThenRnnForward(
+    const dnn::RnnDescriptor &rnn_desc,
+    const dnn::RnnSequenceTensorDescriptor &input_desc,
+    const DeviceMemory<double> &input_data,
+    const dnn::RnnStateTensorDescriptor &input_h_desc,
+    const DeviceMemory<double> &input_h_data,
+    const dnn::RnnStateTensorDescriptor &input_c_desc,
+    const DeviceMemory<double> &input_c_data,
+    const DeviceMemory<double> &params,
+    const dnn::RnnSequenceTensorDescriptor &output_desc,
+    DeviceMemory<double> *output_data,
+    const dnn::RnnStateTensorDescriptor &output_h_desc,
+    DeviceMemory<double> *output_h_data,
+    const dnn::RnnStateTensorDescriptor &output_c_desc,
+    DeviceMemory<double> *output_c_data, bool is_training,
+    ScratchAllocator *reserve_space_allocator,
+    ScratchAllocator *workspace_allocator) {
+  // TODO(zhengxq): add VLOG PARAM calls.
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      CheckError(dnn->DoRnnForward(
+          this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
+          input_c_desc, input_c_data, params, output_desc, output_data,
+          output_h_desc, output_h_data, output_c_desc, output_c_data,
+          is_training, reserve_space_allocator, workspace_allocator));
+    } else {
+      SetError();
+      LOG(WARNING) << "Attempting to call ThenRnnForward without DNN support";
+    }
+  }
+  return *this;
+}
+
 Stream &Stream::ThenRnnBackward(
     const dnn::RnnDescriptor &rnn_desc,
     const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -4402,19 +4640,62 @@ Stream &Stream::ThenRnnBackward(
   return *this;
 }
 
+Stream &Stream::ThenRnnBackward(
+    const dnn::RnnDescriptor &rnn_desc,
+    const dnn::RnnSequenceTensorDescriptor &input_desc,
+    const DeviceMemory<double> &input_data,
+    const dnn::RnnStateTensorDescriptor &input_h_desc,
+    const DeviceMemory<double> &input_h_data,
+    const dnn::RnnStateTensorDescriptor &input_c_desc,
+    const DeviceMemory<double> &input_c_data,
+    const DeviceMemory<double> &params,
+    const dnn::RnnSequenceTensorDescriptor &output_desc,
+    const DeviceMemory<double> &output_data,
+    const dnn::RnnStateTensorDescriptor &output_h_desc,
+    const DeviceMemory<double> &output_h_data,
+    const dnn::RnnStateTensorDescriptor &output_c_desc,
+    const DeviceMemory<double> &output_c_data,
+    const DeviceMemory<double> &output_backprop_data,
+    const DeviceMemory<double> &output_h_backprop_data,
+    const DeviceMemory<double> &output_c_backprop_data,
+    DeviceMemory<double> *input_backprop_data,
+    DeviceMemory<double> *input_h_backprop_data,
+    DeviceMemory<double> *input_c_backprop_data,
+    DeviceMemory<double> *params_backprop_data,
+    DeviceMemory<uint8> *reserve_space_data,
+    ScratchAllocator *workspace_allocator) {
+  // TODO(zhengxq): add VLOG PARAM calls.
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      CheckError(dnn->DoRnnBackward(
+          this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
+          input_c_desc, input_c_data, params, output_desc, output_data,
+          output_h_desc, output_h_data, output_c_desc, output_c_data,
+          output_backprop_data, output_h_backprop_data, output_c_backprop_data,
+          input_backprop_data, input_h_backprop_data, input_c_backprop_data,
+          params_backprop_data, reserve_space_data, workspace_allocator));
+    } else {
+      SetError();
+      LOG(WARNING) << "Attempting to call ThenRnnBackward without DNN support";
+    }
+  }
+  return *this;
+}
+
 Stream &Stream::ThenTransformTensor(const dnn::BatchDescriptor &input_desc,
                                     dnn::DataType input_type,
                                     const DeviceMemoryBase &input_data,
                                     const dnn::BatchDescriptor &output_desc,
-                                    dnn::DataType output_type,
+                                    dnn::DataType output_type, float scale,
                                     DeviceMemoryBase *output_data) {
   VLOG_CALL(PARAM(input_desc), PARAM(input_type), PARAM(input_data),
-            PARAM(output_desc), PARAM(output_type), PARAM(output_data));
+            PARAM(output_desc), PARAM(output_type), PARAM(scale),
+            PARAM(output_data));
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
       CheckError(dnn->DoTransformTensor(this, input_desc, input_type,
                                         input_data, output_desc, output_type,
-                                        output_data));
+                                        scale, output_data));
     } else {
       SetErrorAndLogNoDnnSupport();
     }
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index b07d3021c924af64b54e57d3bc14a4786d2728a7..e2188738390822c82443df022bf23e367326fb7e 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -934,6 +934,31 @@ class Stream {
                        std::complex<double> beta,
                        DeviceMemory<std::complex<double>> *y, int incy);
 
+  Stream &ThenBlasGemvWithProfiling(blas::Transpose trans, uint64 m, uint64 n,
+                                    float alpha, const DeviceMemory<float> &a,
+                                    int lda, const DeviceMemory<float> &x,
+                                    int incx, float beta,
+                                    DeviceMemory<float> *y, int incy,
+                                    blas::ProfileResult *output_profile_result);
+  Stream &ThenBlasGemvWithProfiling(blas::Transpose trans, uint64 m, uint64 n,
+                                    double alpha, const DeviceMemory<double> &a,
+                                    int lda, const DeviceMemory<double> &x,
+                                    int incx, double beta,
+                                    DeviceMemory<double> *y, int incy,
+                                    blas::ProfileResult *output_profile_result);
+  Stream &ThenBlasGemvWithProfiling(
+      blas::Transpose trans, uint64 m, uint64 n, std::complex<float> alpha,
+      const DeviceMemory<std::complex<float>> &a, int lda,
+      const DeviceMemory<std::complex<float>> &x, int incx,
+      std::complex<float> beta, DeviceMemory<std::complex<float>> *y, int incy,
+      blas::ProfileResult *output_profile_result);
+  Stream &ThenBlasGemvWithProfiling(
+      blas::Transpose trans, uint64 m, uint64 n, std::complex<double> alpha,
+      const DeviceMemory<std::complex<double>> &a, int lda,
+      const DeviceMemory<std::complex<double>> &x, int incx,
+      std::complex<double> beta, DeviceMemory<std::complex<double>> *y,
+      int incy, blas::ProfileResult *output_profile_result);
+
   // See BlasSupport::DoBlasGer.
   Stream &ThenBlasGer(uint64 m, uint64 n, float alpha,
                       const DeviceMemory<float> &x, int incx,
@@ -1249,6 +1274,44 @@ class Stream {
                        std::complex<double> beta,
                        DeviceMemory<std::complex<double>> *c, int ldc);
 
+  Stream &ThenBlasGemmWithProfiling(blas::Transpose transa,
+                                    blas::Transpose transb, uint64 m, uint64 n,
+                                    uint64 k, float alpha,
+                                    const DeviceMemory<Eigen::half> &a, int lda,
+                                    const DeviceMemory<Eigen::half> &b, int ldb,
+                                    float beta, DeviceMemory<Eigen::half> *c,
+                                    int ldc,
+                                    blas::ProfileResult *output_profile_result);
+  Stream &ThenBlasGemmWithProfiling(blas::Transpose transa,
+                                    blas::Transpose transb, uint64 m, uint64 n,
+                                    uint64 k, float alpha,
+                                    const DeviceMemory<float> &a, int lda,
+                                    const DeviceMemory<float> &b, int ldb,
+                                    float beta, DeviceMemory<float> *c, int ldc,
+                                    blas::ProfileResult *output_profile_result);
+  Stream &ThenBlasGemmWithProfiling(blas::Transpose transa,
+                                    blas::Transpose transb, uint64 m, uint64 n,
+                                    uint64 k, double alpha,
+                                    const DeviceMemory<double> &a, int lda,
+                                    const DeviceMemory<double> &b, int ldb,
+                                    double beta, DeviceMemory<double> *c,
+                                    int ldc,
+                                    blas::ProfileResult *output_profile_result);
+  Stream &ThenBlasGemmWithProfiling(
+      blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+      uint64 k, std::complex<float> alpha,
+      const DeviceMemory<std::complex<float>> &a, int lda,
+      const DeviceMemory<std::complex<float>> &b, int ldb,
+      std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+      blas::ProfileResult *output_profile_result);
+  Stream &ThenBlasGemmWithProfiling(
+      blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+      uint64 k, std::complex<double> alpha,
+      const DeviceMemory<std::complex<double>> &a, int lda,
+      const DeviceMemory<std::complex<double>> &b, int ldb,
+      std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+      blas::ProfileResult *output_profile_result);
+
   // See BlasSupport::DoBlasGemmWithAlgorithm.
   Stream &ThenBlasGemmWithAlgorithm(
       blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
@@ -1257,6 +1320,15 @@ class Stream {
       const Eigen::half &beta, DeviceMemory<Eigen::half> *c, int ldc,
       blas::ComputationType computation_type, blas::AlgorithmType algorithm,
       blas::ProfileResult *output_profile_result);
+  Stream &ThenBlasGemmWithAlgorithm(blas::Transpose transa,
+                                    blas::Transpose transb, uint64 m, uint64 n,
+                                    uint64 k, int alpha,
+                                    const DeviceMemory<int8> &a, int lda,
+                                    const DeviceMemory<int8> &b, int ldb,
+                                    int beta, DeviceMemory<int> *c, int ldc,
+                                    blas::ComputationType computation_type,
+                                    blas::AlgorithmType algorithm,
+                                    blas::ProfileResult *output_profile_result);
   Stream &ThenBlasGemmWithAlgorithm(blas::Transpose transa,
                                     blas::Transpose transb, uint64 m, uint64 n,
                                     uint64 k, float alpha,
@@ -1623,6 +1695,23 @@ class Stream {
                          ScratchAllocator *reserve_space_allocator,
                          ScratchAllocator *workspace_allocator);
 
+  Stream &ThenRnnForward(const dnn::RnnDescriptor &rnn_desc,
+                         const dnn::RnnSequenceTensorDescriptor &input_desc,
+                         const DeviceMemory<double> &input_data,
+                         const dnn::RnnStateTensorDescriptor &input_h_desc,
+                         const DeviceMemory<double> &input_h_data,
+                         const dnn::RnnStateTensorDescriptor &input_c_desc,
+                         const DeviceMemory<double> &input_c_data,
+                         const DeviceMemory<double> &params,
+                         const dnn::RnnSequenceTensorDescriptor &output_desc,
+                         DeviceMemory<double> *output_data,
+                         const dnn::RnnStateTensorDescriptor &output_h_desc,
+                         DeviceMemory<double> *output_h_data,
+                         const dnn::RnnStateTensorDescriptor &output_c_desc,
+                         DeviceMemory<double> *output_c_data, bool is_training,
+                         ScratchAllocator *reserve_space_allocator,
+                         ScratchAllocator *workspace_allocator);
+
   // Enqueue a backward operation of the RNN model onto the stream.
   // See DnnSupport::DoRnnBackward for more details.
   Stream &ThenRnnBackward(const dnn::RnnDescriptor &rnn_desc,
@@ -1649,13 +1738,37 @@ class Stream {
                           DeviceMemory<uint8> *reserve_space_data,
                           ScratchAllocator *workspace_allocator);
 
+  Stream &ThenRnnBackward(const dnn::RnnDescriptor &rnn_desc,
+                          const dnn::RnnSequenceTensorDescriptor &input_desc,
+                          const DeviceMemory<double> &input_data,
+                          const dnn::RnnStateTensorDescriptor &input_h_desc,
+                          const DeviceMemory<double> &input_h_data,
+                          const dnn::RnnStateTensorDescriptor &input_c_desc,
+                          const DeviceMemory<double> &input_c_data,
+                          const DeviceMemory<double> &params,
+                          const dnn::RnnSequenceTensorDescriptor &output_desc,
+                          const DeviceMemory<double> &output_data,
+                          const dnn::RnnStateTensorDescriptor &output_h_desc,
+                          const DeviceMemory<double> &output_h_data,
+                          const dnn::RnnStateTensorDescriptor &output_c_desc,
+                          const DeviceMemory<double> &output_c_data,
+                          const DeviceMemory<double> &output_backprop_data,
+                          const DeviceMemory<double> &output_h_backprop_data,
+                          const DeviceMemory<double> &output_c_backprop_data,
+                          DeviceMemory<double> *input_backprop_data,
+                          DeviceMemory<double> *input_h_backprop_data,
+                          DeviceMemory<double> *input_c_backprop_data,
+                          DeviceMemory<double> *params_backprop_data,
+                          DeviceMemory<uint8> *reserve_space_data,
+                          ScratchAllocator *workspace_allocator);
+
   // Enqueue onto the stream a operation that transforms a tensor.
   // See DnnSupport::DoTransformTensor for more details.
   Stream &ThenTransformTensor(const dnn::BatchDescriptor &input_desc,
                               dnn::DataType input_type,
                               const DeviceMemoryBase &input_data,
                               const dnn::BatchDescriptor &output_desc,
-                              dnn::DataType output_type,
+                              dnn::DataType output_type, float scale,
                               DeviceMemoryBase *output_data);
 
   // The templated version of the above ThenTransformTensor. Useful when the
diff --git a/tensorflow/tensorboard/BUILD b/tensorflow/tensorboard/BUILD
deleted file mode 100644
index bbd4251731e5c8af3c0cda71a1f3bbf07ec07eb7..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/BUILD
+++ /dev/null
@@ -1,84 +0,0 @@
-# Description:
-# TensorBoard, a dashboard for investigating TensorFlow
-
-package(default_visibility = [":internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-package_group(
-    name = "internal",
-    packages = [
-        "//learning/brain/tensorboard/...",
-        "//learning/vis/...",
-        "//tensorflow/...",
-        "//tensorflow/tensorboard/...",
-    ],
-)
-
-py_binary(
-    name = "tensorboard",
-    srcs = ["main.py"],
-    data = [":assets"],
-    main = "main.py",
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/tensorboard/backend:application",
-        "//tensorflow/tensorboard/backend/event_processing:event_file_inspector",
-        "//tensorflow/tensorboard/plugins/audio:audio_plugin",
-        "//tensorflow/tensorboard/plugins/distributions:distributions_plugin",
-        "//tensorflow/tensorboard/plugins/graphs:graphs_plugin",
-        "//tensorflow/tensorboard/plugins/histograms:histograms_plugin",
-        "//tensorflow/tensorboard/plugins/images:images_plugin",
-        "//tensorflow/tensorboard/plugins/projector:projector_plugin",
-        "//tensorflow/tensorboard/plugins/scalars:scalars_plugin",
-        "//tensorflow/tensorboard/plugins/text:text_plugin",
-        "@org_pocoo_werkzeug//:werkzeug",
-    ],
-)
-
-py_library(
-    name = "expect_tensorflow_installed",
-    # This is a dummy rule used as a TensorFlow dependency in open-source.
-    # We expect TensorFlow to already be installed on the system, e.g. via
-    # `pip install tensorflow`
-)
-
-py_library(
-    name = "expect_numpy_installed",
-    # This is a dummy rule used as a numpy dependency in open-source.
-    # We expect numpy to already be installed on the system, e.g. via
-    # `pip install numpy`
-)
-
-filegroup(
-    name = "assets",
-    srcs = [
-        "TAG",
-        "//tensorflow/tensorboard/components:index.html",
-        "//tensorflow/tensorboard/components:trace_viewer_index.html",
-    ],
-)
-
-filegroup(
-    name = "ts_web_library_default_typings",
-    srcs = [
-        # Ordering probably matters.
-        "@com_microsoft_typescript//:lib.es6.d.ts",
-        "@io_angular_clutz//:src/resources/closure.lib.d.ts",
-        "//tensorflow/tensorboard/defs:clutz.d.ts",
-    ],
-    visibility = ["//visibility:public"],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**"],
-        exclude = [
-            "METADATA",
-            "OWNERS",
-            "tensorboard.google.bzl",
-        ],
-    ),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/DEVELOPMENT.md b/tensorflow/tensorboard/DEVELOPMENT.md
deleted file mode 100644
index 79d534a26c979804ab00d13d57c1e90a0705dff6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/DEVELOPMENT.md
+++ /dev/null
@@ -1,25 +0,0 @@
-# How to Develop TensorBoard
-
-## Launching a Development Instance
-
-Run the following to launch a demo of TensorBoard in raw sources mode:
-
-```sh
-bazel run third_party/tensorflow/tensorboard/components/tf_tensorboard:demo
-```
-
-Now you can navigate to <http://localhost:6006/demo/index.html> and play with
-the demo TensorBoard instance. This will have live source reloading.
-
-This demo TensorBoard will have a small amount of demo data generated by
-[generate_testdata.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/scripts/generate_testdata.py).
-You can use [serialize_tensorboard.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/scripts/serialize_tensorboard.py)
-to create a realistic demo directory from your own data files.
-
-## Launching TensorBoard Proper
-
-Running TensorBoard automatically asks Bazel to create a vulcanized HTML binary:
-
-```sh
-bazel run //tensorflow/tensorboard:tensorboard -- --logdir=/path/to/logs
-```
diff --git a/tensorflow/tensorboard/README.md b/tensorflow/tensorboard/README.md
deleted file mode 100644
index a9ab4d3bd2a85929436673981a0aa11e497a05db..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/README.md
+++ /dev/null
@@ -1,366 +0,0 @@
-# TensorBoard
-
-TensorBoard is a suite of web applications for inspecting and understanding your
-TensorFlow runs and graphs.
-
-This README gives an overview of key concepts in TensorBoard, as well as how to
-interpret the visualizations TensorBoard provides. For an in-depth example of
-using TensorBoard, see the tutorial: [TensorBoard: Visualizing
-Learning](https://www.tensorflow.org/get_started/summaries_and_tensorboard).
-For in-depth information on the Graph Visualizer, see this tutorial: [TensorBoard: Graph Visualization](https://www.tensorflow.org/get_started/graph_viz).
-
-You may also want to watch
-[this video tutorial](https://www.youtube.com/watch?v=eBbEDRsCmv4) that walks
-through setting up and using TensorBoard.
-
-# Usage
-
-Before running TensorBoard, make sure you have generated summary data in a log
-directory by creating a summary writer:
-
-``` python
-# sess.graph contains the graph definition; that enables the Graph Visualizer.
-
-file_writer = tf.summary.FileWriter('/path/to/logs', sess.graph)
-```
-
-For more details, see [the TensorBoard tutorial](https://www.tensorflow.org/get_started/summaries_and_tensorboard).
-Once you have event files, run TensorBoard and provide the log directory. If
-you're using a precompiled TensorFlow package (e.g. you installed via pip), run:
-
-```
-tensorboard --logdir=path/to/logs
-```
-
-Or, if you are building from source:
-
-```
-bazel build tensorflow/tensorboard:tensorboard
-./bazel-bin/tensorflow/tensorboard/tensorboard --logdir=path/to/logs
-```
-
-This should print that TensorBoard has started. Next, connect to
-http://localhost:6006.
-
-TensorBoard requires a `logdir` to read logs from. For info on configuring
-TensorBoard, run `tensorboard --help`.
-
-TensorBoard can be used in Google Chrome or Firefox. Other browsers might
-work, but there may be bugs or performance issues.
-
-# Key Concepts
-
-### Summary Ops: How TensorBoard gets data from TensorFlow
-
-The first step in using TensorBoard is acquiring data from your TensorFlow run.
-For this, you need [summary ops](https://www.tensorflow.org/api_docs/python/tf/summary).
-Summary ops are ops, like
-[`tf.matmul`](https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/matmul)
-or
-[`tf.nn.relu`](https://www.tensorflow.org/versions/master/api_docs/python/tf/nn/relu),
-which means they take in tensors, produce tensors, and are evaluated from within
-a TensorFlow graph. However, summary ops have a twist: the Tensors they produce
-contain serialized protobufs, which are written to disk and sent to TensorBoard.
-To visualize the summary data in TensorBoard, you should evaluate the summary
-op, retrieve the result, and then write that result to disk using a
-summary.FileWriter. A full explanation, with examples, is in [the
-tutorial](https://www.tensorflow.org/get_started/summaries_and_tensorboard).
-
-The supported summary ops include:
-* tf.summary.scalar
-* tf.summary.image
-* tf.summary.audio
-* tf.summary.text
-* tf.summary.histogram
-
-### Tags: Giving names to data
-
-When you make a summary op, you will also give it a `tag`. The tag is basically
-a name for the data recorded by that op, and will be used to organize the data
-in the frontend. The scalar and histogram dashboards organize data by tag, and
-group the tags into folders according to a directory/like/hierarchy. If you have
-a lot of tags, we recommend grouping them with slashes.
-
-### Event Files & LogDirs: How TensorBoard loads the data
-
-`summary.FileWriters` take summary data from TensorFlow, and then write them to a
-specified directory, known as the `logdir`. Specifically, the data is written to
-an append-only record dump that will have "tfevents" in the filename.
-TensorBoard reads data from a full directory, and organizes it into the history
-of a single TensorFlow execution.
-
-Why does it read the whole directory, rather than an individual file? You might
-have been using
-[supervisor.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/training/supervisor.py)
-to run your model, in which case if TensorFlow crashes, the supervisor will
-restart it from a checkpoint. When it restarts, it will start writing to a new
-events file, and TensorBoard will stitch the various event files together to
-produce a consistent history of what happened.
-
-### Runs: Comparing different executions of your model
-
-You may want to visually compare multiple executions of your model; for example,
-suppose you've changed the hyperparameters and want to see if it's converging
-faster. TensorBoard enables this through different "runs". When TensorBoard is
-passed a `logdir` at startup, it recursively walks the directory tree rooted at
-`logdir` looking for subdirectories that contain tfevents data. Every time it
-encounters such a subdirectory, it loads it as a new `run`, and the frontend
-will organize the data accordingly.
-
-For example, here is a well-organized TensorBoard log directory, with two runs,
-"run1" and "run2".
-
-```
-/some/path/mnist_experiments/
-/some/path/mnist_experiments/run1/
-/some/path/mnist_experiments/run1/events.out.tfevents.1456525581.name
-/some/path/mnist_experiments/run1/events.out.tfevents.1456525585.name
-/some/path/mnist_experiments/run2/
-/some/path/mnist_experiments/run2/events.out.tfevents.1456525385.name
-/tensorboard --logdir=/some/path/mnist_experiments
-```
-
-You may also pass a comma separated list of log directories, and TensorBoard
-will watch each directory. You can also assign names to individual log
-directories by putting a colon between the name and the path, as in
-
-```
-tensorboard --logdir=name1:/path/to/logs/1,name2:/path/to/logs/2
-```
-
-# The Visualizations
-
-### Scalar Dashboard
-
-TensorBoard's Scalar Dashboard visualizes scalar statistics that vary over time;
-for example, you might want to track the model's loss or learning rate. As
-described in *Key Concepts*, you can compare multiple runs, and the data is
-organized by tag. The line charts have the following interactions:
-
-* Clicking on the small blue icon in the lower-left corner of each chart will
-expand the chart
-
-* Dragging a rectangular region on the chart will zoom in
-
-* Double clicking on the chart will zoom out
-
-* Mousing over the chart will produce crosshairs, with data values recorded in
-the run-selector on the left.
-
-Additionally, you can create new folders to organize tags by writing regular
-expressions in the box in the top-left of the dashboard.
-
-### Histogram Dashboard
-
-The HistogramDashboard displays how the statistical distribution of a Tensor
-has varied over time. It visualizes data recorded via `tf.summary.histogram`.
-Each chart shows temporal "slices" of data, where each slice is a histogram of
-the tensor at a given step. It's organized with the oldest timestep in the back,
-and the most recent timestep in front. By changing the Histogram Mode from
-"offset" to "overlay", the perspective will rotate so that every histogram slice
-is rendered as a line and overlaid with one another.
-
-### Distribution Dashboard
-
-The Distribution Dashboard is another way of visualizing histogram data from
-`tf.summary.histogram`. It shows some high-level statistics on a distribution.
-Each line on the chart represents a percentile in the distribution over the
-data: for example, the bottom line shows how the minimum value has changed over
-time, and the line in the middle shows how the median has changed. Reading from
-top to bottom, the lines have the following meaning: `[maximum, 93%, 84%, 69%,
-50%, 31%, 16%, 7%, minimum]`
-
-These percentiles can also be viewed as standard deviation boundaries on a
-normal distribution: `[maximum, μ+1.5σ, μ+σ, μ+0.5σ, μ, μ-0.5σ, μ-σ, μ-1.5σ,
-minimum]` so that the colored regions, read from inside to outside, have widths
-`[σ, 2σ, 3σ]` respectively.
-
-
-### Image Dashboard
-
-The Image Dashboard can display pngs that were saved via a `tf.summary.image`.
-The dashboard is set up so that each row corresponds to a different tag, and
-each column corresponds to a run. Since the image dashboard supports arbitrary
-pngs, you can use this to embed custom visualizations (e.g. matplotlib
-scatterplots) into TensorBoard. This dashboard always shows you the latest image
-for each tag.
-
-### Audio Dashboard
-
-The Audio Dashboard can embed playable audio widgets for audio saved via a
-`tf.summary.audio`. The dashboard is set up so that each row corresponds to a
-different tag, and each column corresponds to a run. This dashboard always
-embeds the latest audio for each tag.
-
-### Graph Explorer
-
-The Graph Explorer can visualize a TensorBoard graph, enabling inspection of the
-TensorFlow model. To get best use of the graph visualizer, you should use name
-scopes to hierarchically group the ops in your graph - otherwise, the graph may
-be difficult to decipher. For more information, including examples, see [the
-graph visualizer tutorial](https://www.tensorflow.org/get_started/graph_viz).
-
-### Embedding Projector
-
-The Embedding Projector allows you to visualize high-dimensional data; for
-example, you may view your input data after it has been embedded in a high-
-dimensional space by your model. The embedding projector reads data from your
-model checkpoint file, and may be configured with additional metadata, like
-a vocabulary file or sprite images. For more details, see [the embedding
-projector tutorial](https://www.tensorflow.org/get_started/embedding_viz).
-
-### Text Dashboard
-
-The Text Dashboard displays text snippets saved via `tf.summary.text`. Markdown
-features including hyperlinks, lists, and tables are all supported.
-
-# Frequently Asked Questions
-
-### My TensorBoard isn't showing any data! What's wrong?
-
-The first thing to do is ensure that TensorBoard is properly loading data from
-the correct directory. Launch `tensorboard --logdir=DIRECTORY_PATH --debug` and
-look for output of the form
-
-`INFO:tensorflow:TensorBoard path_to_run is: {'DIRECTORY_PATH': None}`
-
-Verify that the DIRECTORY_PATH TensorBoard is looking at is the path you expect.
-(Note: There's a known issue where TensorBoard [does not handle paths starting
-in ~ properly](https://github.com/tensorflow/tensorflow/issues/1587)).
-
-If you're loading from the proper path, make sure that event files are present.
-TensorBoard will recursively walk its logdir, it's fine if the data is nested
-under a subdirectory. Try running the command:
-
-`find DIRECTORY_PATH | grep tfevents`
-
-If you have at least one result, then TensorBoard should be able to load data.
-
-Finally, let's make sure that the event files actually have data. Run
-tensorboard in inspector mode to inspect the contents of your event files.
-
-`tensorboard --inspect --logdir=DIRECTORY_PATH`
-
-If after running this procedure, it's still not working, please file an [issue
-on GitHub](https://github.com/tensorflow/tensorflow/issues). It will be much
-easier for us to debug it if you provide an event file that isn't working.
-
-### TensorBoard is showing only some of my data, or isn't properly updating!
-
-This issue usually comes about because of how TensorBoard iterates through the
-`tfevents` files: it progresses through the events file in timestamp order, and
-only reads one file at a time. Let's suppose we have files with timestamps `a`
-and `b`, where `a<b`. Once TensorBoard has read all the events in `a`, it will
-never return to it, because it assumes any new events are being written in the
-more recent file. This could cause an issue if, for example, you have two
-`FileWriters` simultaneously writing to the same directory. If you have
-multiple summary writers, each one should be writing to a separate directory.
-
-### Does TensorBoard support multiple or distributed summary writers?
-
-No. TensorBoard expects that only one events file will be written to at a time,
-and multiple summary writers means multiple events files. If you are running a
-distributed TensorFlow instance, we encourage you to designate a single worker
-as the "chief" that is responsible for all summary processing. See
-[supervisor.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/training/supervisor.py)
-for an example.
-
-### I'm seeing data overlapped on itself! What gives?
-
-If you are seeing data that seems to travel backwards through time and overlap
-with itself, there are a few possible explanations.
-
-* You may have multiple execution of TensorFlow that all wrote to the same log
-directory. Please have each TensorFlow run write to its own logdir.
-
-* You may have a have a bug in your code where the global_step variable (passed
-to `FileWriter.add_summary`) is being maintained incorrectly.
-
-* It may be that your TensorFlow job crashed, and was restarted from an earlier
-checkpoint. See *How to handle TensorFlow restarts*, below.
-
-As a workaround, try changing the x-axis display in TensorBoard from `steps` to
-`wall_time`. This will frequently clear up the issue.
-
-### How should I handle TensorFlow restarts?
-
-TensorFlow is designed with a mechanism for graceful recovery if a job crashes
-or is killed: TensorFlow can periodically write model checkpoint files, which
-enable you to restart TensorFlow without losing all your training progress.
-
-However, this can complicate things for TensorBoard; imagine that TensorFlow
-wrote a checkpoint at step `a`, and then continued running until step `b`, and
-then crashed and restarted at timestamp `a`. All of the events written between
-`a` and `b` were "orphaned" by the restart event and should be removed.
-
-To facilitate this, we have a `SessionLog` message in
-`tensorflow/core/util/event.proto` which can record `SessionStatus.START` as an
-event; like all events, it may have a `step` associated with it. If TensorBoard
-detects a `SessionStatus.START` event with step `a`, it will assume that every
-event with a step greater than `a` was orphaned, and it will discard those
-events. This behavior may be disabled with the flag
-`--purge_orphaned_data=false` (in versions after 0.7).
-
-### How can I export data from TensorBoard?
-
-The Scalar Dashboard supports exporting data; you can click the "enable
-download links" option in the left-hand bar. Then, each plot will provide
-download links for the data it contains.
-
-If you need access to the full dataset, you can read the event files that
-TensorBoard consumes by using the [`summary_iterator`](https://github.com/tensorflow/tensorflow/blob/e7f333b5f8b3c53b21d149d8d14c0cebbde431aa/tensorflow/python/summary/summary_iterator.py#L313)
-method.
-
-
-### Can I overlap multiple plots?
-
-Right now, you can overlap plots only if they are from different runs, and both
-have the same tag name.
-
-### Can I create scatterplots (or other custom plots)?
-
-This isn't yet possible. As a workaround, you could create your custom plot in
-your own code (e.g. matplotlib) and then write it into an `SummaryProto`
-(`core/framework/summary.proto`) and add it to your `FileWriter`. Then, your
-custom plot will appear in the TensorBoard image tab.
-
-### Is my data being downsampled? Am I really seeing all the data?
-
-TensorBoard uses [reservoir
-sampling](https://en.wikipedia.org/wiki/Reservoir_sampling) to downsample your
-data so that it can be loaded into RAM. You can modify the number of elements it
-will keep per tag in
-[tensorboard/backend/application.py](https://www.github.com/tensorflow/tensorflow/blob/r1.1/tensorflow/tensorboard/backend/application.py).
-See this [StackOverflow question](http://stackoverflow.com/questions/43702546/tensorboard-doesnt-show-all-data-points/)
-for some more information.
-
-### I get a network security popup every time I run TensorBoard on a mac!
-
-This is because by default, TensorBoard serves on host `0.0.0.0` which is
-publicly accessible. You can stop the popups by specifying `--host=localhost` at
-startup.
-
-### How can I develop TensorBoard?
-
-See [tensorflow/tensorboard/DEVELOPMENT.md](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/DEVELOPMENT.md).
-
-### I have a different issue that wasn't addressed here!
-
-First, try searching our [GitHub
-issues](https://github.com/tensorflow/tensorflow/issues) and [Stack
-Overflow](https://stackoverflow.com/questions/tagged/tensorboard). It may be
-that someone else has already had the same issue or question.
-
-If you have a bug, please [file a GitHub
-issue](https://github.com/tensorflow/tensorflow/issues). If the bug is related
-to your specific data (e.g. the events aren't loading properly), please do both
-of the following things to make it easier for us to debug and fix:
-
-- Run tensorboard in --inspect mode and copy paste the debug output.
-- Upload some events files that will reproduce the issue.
-
-If you have a feature request, please [file a GitHub
-issue](https://github.com/tensorflow/tensorflow/issues).
-
-General usage questions should go to [Stack
-Overflow](http://stackoverflow.com/questions/tagged/tensorflow).
diff --git a/tensorflow/tensorboard/TAG b/tensorflow/tensorboard/TAG
deleted file mode 100644
index fb1e7bc86996a80d4a16529b990adda1d3434c92..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/TAG
+++ /dev/null
@@ -1 +0,0 @@
-54
diff --git a/tensorflow/tensorboard/WORKSPACE b/tensorflow/tensorboard/WORKSPACE
deleted file mode 100644
index 1667478cab9dbf7458892de1ab544ca41ef83ad9..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/WORKSPACE
+++ /dev/null
@@ -1,21 +0,0 @@
-workspace(name = "org_tensorflow_tensorboard")
-
-http_archive(
-    name = "io_bazel_rules_closure",
-    sha256 = "bc41b80486413aaa551860fc37471dbc0666e1dbb5236fb6177cb83b0c105846",
-    strip_prefix = "rules_closure-dec425a4ff3faf09a56c85d082e4eed05d8ce38f",
-    urls = [
-        "http://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/dec425a4ff3faf09a56c85d082e4eed05d8ce38f.tar.gz",  # 2017-06-02
-        "https://github.com/bazelbuild/rules_closure/archive/dec425a4ff3faf09a56c85d082e4eed05d8ce38f.tar.gz",
-    ],
-)
-
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories")
-
-closure_repositories()
-
-load("//third_party:workspace.bzl", "tensorboard_workspace")
-
-# Please add all new dependencies in workspace.bzl.
-tensorboard_workspace()
diff --git a/tensorflow/tensorboard/backend/BUILD b/tensorflow/tensorboard/backend/BUILD
deleted file mode 100644
index c7bf0dfee58d9203356a13b3f8d071f5466670b6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/BUILD
+++ /dev/null
@@ -1,106 +0,0 @@
-# Description:
-# TensorBoard, a dashboard for investigating TensorFlow
-
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_library(
-    name = "http_util",
-    srcs = ["http_util.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":json_util",
-        "//tensorflow:tensorflow_py",
-        "@org_pocoo_werkzeug//:werkzeug",
-        "@six_archive//:six",
-    ],
-)
-
-py_test(
-    name = "http_util_test",
-    size = "small",
-    srcs = ["http_util_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":http_util",
-        "//tensorflow:tensorflow_py",
-        "@org_pocoo_werkzeug//:werkzeug",
-        "@six_archive//:six",
-    ],
-)
-
-py_library(
-    name = "json_util",
-    srcs = ["json_util.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_test(
-    name = "json_util_test",
-    size = "small",
-    srcs = ["json_util_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":json_util",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_library(
-    name = "application",
-    srcs = ["application.py"],
-    data = ["//tensorflow/tensorboard:assets"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":http_util",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/tensorboard/backend/event_processing:event_accumulator",
-        "//tensorflow/tensorboard/backend/event_processing:event_multiplexer",
-        "@org_pocoo_werkzeug//:werkzeug",
-        "@six_archive//:six",
-    ],
-)
-
-py_test(
-    name = "application_test",
-    size = "medium",
-    srcs = ["application_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["manual"],
-    deps = [
-        ":application",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/tensorboard",
-        "//tensorflow/tensorboard/backend/event_processing:event_multiplexer",
-        "//tensorflow/tensorboard/plugins:base_plugin",
-        "@org_pocoo_werkzeug//:werkzeug",
-    ],
-)
-
-py_library(
-    name = "process_graph",
-    srcs = ["process_graph.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        [
-            "*",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tensorboard/backend/application.py b/tensorflow/tensorboard/backend/application.py
deleted file mode 100644
index 3657eee38b0167f2de750a77081f5cf5f5056268..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/application.py
+++ /dev/null
@@ -1,373 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""TensorBoard WSGI Application Logic.
-
-TensorBoardApplication constructs TensorBoard as a WSGI application.
-It handles serving static assets, and implements TensorBoard data APIs.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import re
-import threading
-import time
-
-import six
-from six.moves.urllib import parse as urlparse
-import tensorflow as tf
-from werkzeug import wrappers
-
-from tensorflow.tensorboard.backend import http_util
-from tensorflow.tensorboard.backend.event_processing import event_accumulator
-from tensorflow.tensorboard.backend.event_processing import event_multiplexer
-
-
-DEFAULT_SIZE_GUIDANCE = {
-    event_accumulator.COMPRESSED_HISTOGRAMS: 500,
-    event_accumulator.IMAGES: 10,
-    event_accumulator.AUDIO: 10,
-    event_accumulator.SCALARS: 1000,
-    event_accumulator.HEALTH_PILLS: 100,
-    event_accumulator.HISTOGRAMS: 50,
-}
-
-DATA_PREFIX = '/data'
-LOGDIR_ROUTE = '/logdir'
-RUNS_ROUTE = '/runs'
-PLUGIN_PREFIX = '/plugin'
-PLUGINS_LISTING_ROUTE = '/plugins_listing'
-TAB_ROUTES = ['', '/events', '/images', '/audio', '/graphs', '/histograms']
-
-# Slashes in a plugin name could throw the router for a loop. An empty
-# name would be confusing, too. To be safe, let's restrict the valid
-# names as follows.
-_VALID_PLUGIN_RE = re.compile(r'^[A-Za-z0-9_.-]+$')
-
-
-def standard_tensorboard_wsgi(
-    logdir,
-    purge_orphaned_data,
-    reload_interval,
-    plugins):
-  """Construct a TensorBoardWSGIApp with standard plugins and multiplexer.
-
-  Args:
-    logdir: The path to the directory containing events files.
-    purge_orphaned_data: Whether to purge orphaned data.
-    reload_interval: The interval at which the backend reloads more data in
-        seconds.
-    plugins: A list of plugins for TensorBoard to initialize.
-
-  Returns:
-    The new TensorBoard WSGI application.
-  """
-  multiplexer = event_multiplexer.EventMultiplexer(
-      size_guidance=DEFAULT_SIZE_GUIDANCE,
-      purge_orphaned_data=purge_orphaned_data)
-
-  return TensorBoardWSGIApp(logdir, plugins, multiplexer, reload_interval)
-
-
-class TensorBoardWSGIApp(object):
-  """The TensorBoard application, conforming to WSGI spec."""
-
-  # How many samples to include in sampling API calls by default.
-  DEFAULT_SAMPLE_COUNT = 10
-
-  # NOTE TO MAINTAINERS: An accurate Content-Length MUST be specified on all
-  #                      responses using send_header.
-  protocol_version = 'HTTP/1.1'
-
-  def __init__(self, logdir, plugins, multiplexer, reload_interval):
-    """Constructs the TensorBoard application.
-
-    Args:
-      logdir: the logdir spec that describes where data will be loaded.
-        may be a directory, or comma,separated list of directories, or colons
-        can be used to provide named directories
-      plugins: List of plugins that extend tensorboard.plugins.BasePlugin
-      multiplexer: The EventMultiplexer with TensorBoard data to serve
-      reload_interval: How often (in seconds) to reload the Multiplexer
-
-    Returns:
-      A WSGI application that implements the TensorBoard backend.
-
-    Raises:
-      ValueError: If some plugin has no plugin_name
-      ValueError: If some plugin has an invalid plugin_name (plugin
-          names must only contain [A-Za-z0-9_.-])
-      ValueError: If two plugins have the same plugin_name
-      ValueError: If some plugin handles a route that does not start
-          with a slash
-    """
-    self._logdir = logdir
-    self._plugins = plugins
-    self._multiplexer = multiplexer
-    self.tag = get_tensorboard_tag()
-
-    path_to_run = parse_event_files_spec(self._logdir)
-    if reload_interval:
-      start_reloading_multiplexer(self._multiplexer, path_to_run,
-                                  reload_interval)
-    else:
-      reload_multiplexer(self._multiplexer, path_to_run)
-
-    self.data_applications = {
-        DATA_PREFIX + LOGDIR_ROUTE:
-            self._serve_logdir,
-        # TODO(chizeng): Delete this RPC once we have skylark rules that obviate
-        # the need for the frontend to determine which plugins are active.
-        DATA_PREFIX + PLUGINS_LISTING_ROUTE: self._serve_plugins_listing,
-        DATA_PREFIX + RUNS_ROUTE: self._serve_runs,
-    }
-
-    # Serve the routes from the registered plugins using their name as the route
-    # prefix. For example if plugin z has two routes /a and /b, they will be
-    # served as /data/plugin/z/a and /data/plugin/z/b.
-    plugin_names_encountered = set()
-    for plugin in self._plugins:
-      if plugin.plugin_name is None:
-        raise ValueError('Plugin %s has no plugin_name' % plugin)
-      if not _VALID_PLUGIN_RE.match(plugin.plugin_name):
-        raise ValueError('Plugin %s has invalid name %r' % (plugin,
-                                                            plugin.plugin_name))
-      if plugin.plugin_name in plugin_names_encountered:
-        raise ValueError('Duplicate plugins for name %s' % plugin.plugin_name)
-      plugin_names_encountered.add(plugin.plugin_name)
-
-      try:
-        plugin_apps = plugin.get_plugin_apps(self._multiplexer, self._logdir)
-      except Exception as e:  # pylint: disable=broad-except
-        tf.logging.warning('Plugin %s failed. Exception: %s',
-                           plugin.plugin_name, str(e))
-        continue
-      for route, app in plugin_apps.items():
-        if not route.startswith('/'):
-          raise ValueError('Plugin named %r handles invalid route %r: '
-                           'route does not start with a slash' %
-                           (plugin.plugin_name, route))
-        path = DATA_PREFIX + PLUGIN_PREFIX + '/' + plugin.plugin_name + route
-        self.data_applications[path] = app
-
-  def _path_is_safe(self, path):
-    """Check path is safe (stays within current directory).
-
-    This is for preventing directory-traversal attacks.
-
-    Args:
-      path: The path to check for safety.
-
-    Returns:
-      True if the given path stays within the current directory, and false
-      if it would escape to a higher directory. E.g. _path_is_safe('index.html')
-      returns true, but _path_is_safe('../../../etc/password') returns false.
-    """
-    base = os.path.abspath(os.curdir)
-    absolute_path = os.path.abspath(path)
-    prefix = os.path.commonprefix([base, absolute_path])
-    return prefix == base
-
-  @wrappers.Request.application
-  def _serve_logdir(self, request):
-    """Respond with a JSON object containing this TensorBoard's logdir."""
-    return http_util.Respond(
-        request, {'logdir': self._logdir}, 'application/json')
-
-  @wrappers.Request.application
-  def _serve_plugins_listing(self, request):
-    """Serves an object mapping plugin name to whether it is enabled.
-
-    Args:
-      request: The werkzeug.Request object.
-
-    Returns:
-      A werkzeug.Response object.
-    """
-    return http_util.Respond(
-        request,
-        {plugin.plugin_name: plugin.is_active() for plugin in self._plugins},
-        'application/json')
-
-  @wrappers.Request.application
-  def _serve_runs(self, request):
-    """WSGI app serving a JSON object about runs and tags.
-
-    Returns a mapping from runs to tagType to list of tags for that run.
-
-    Args:
-      request: A werkzeug request
-
-    Returns:
-      A werkzeug Response with the following content:
-      {runName: {firstEventTimestamp: 123456.789}}
-    """
-    run_names = sorted(self._multiplexer.Runs())  # Why `sorted`? See below.
-    def get_first_event_timestamp(run_name):
-      try:
-        return self._multiplexer.FirstEventTimestamp(run_name)
-      except ValueError:
-        tf.logging.warning('Unable to get first event timestamp for run %s',
-                           run_name)
-        # Put runs without a timestamp at the end. Their internal
-        # ordering would be nondeterministic, but Python's sorts are
-        # stable, so `sorted`ing the initial list above provides a
-        # deterministic ordering. Of course, we cannot guarantee that
-        # this will be append-only for new event-less runs.
-        return float('inf')
-    first_event_timestamps = {
-        run_name: get_first_event_timestamp(run_name)
-        for run_name in run_names
-    }
-    run_names.sort(key=first_event_timestamps.get)
-    return http_util.Respond(request, run_names, 'application/json')
-
-  @wrappers.Request.application
-  def _serve_index(self, request):
-    """Serves the index page (i.e., the tensorboard app itself)."""
-    contents = tf.resource_loader.load_resource(
-        'tensorboard/components/index.html')
-    return http_util.Respond(request, contents, 'text/html', expires=3600)
-
-  def __call__(self, environ, start_response):  # pylint: disable=invalid-name
-    """Central entry point for the TensorBoard application.
-
-    This method handles routing to sub-applications. It does simple routing
-    using regular expression matching.
-
-    This __call__ method conforms to the WSGI spec, so that instances of this
-    class are WSGI applications.
-
-    Args:
-      environ: See WSGI spec.
-      start_response: See WSGI spec.
-
-    Returns:
-      A werkzeug Response.
-    """
-    request = wrappers.Request(environ)
-    parsed_url = urlparse.urlparse(request.path)
-
-    # Remove a trailing slash, if present.
-    clean_path = parsed_url.path
-    if clean_path.endswith('/'):
-      clean_path = clean_path[:-1]
-    # pylint: disable=too-many-function-args
-    if clean_path in self.data_applications:
-      return self.data_applications[clean_path](environ, start_response)
-    elif clean_path in TAB_ROUTES:
-      return self._serve_index(environ, start_response)
-    else:
-      tf.logging.warning('path %s not found, sending 404', clean_path)
-      return http_util.Respond(request, 'Not found', 'text/plain', code=404)(
-          environ, start_response)
-    # pylint: enable=too-many-function-args
-
-
-def parse_event_files_spec(logdir):
-  """Parses `logdir` into a map from paths to run group names.
-
-  The events files flag format is a comma-separated list of path specifications.
-  A path specification either looks like 'group_name:/path/to/directory' or
-  '/path/to/directory'; in the latter case, the group is unnamed. Group names
-  cannot start with a forward slash: /foo:bar/baz will be interpreted as a
-  spec with no name and path '/foo:bar/baz'.
-
-  Globs are not supported.
-
-  Args:
-    logdir: A comma-separated list of run specifications.
-  Returns:
-    A dict mapping directory paths to names like {'/path/to/directory': 'name'}.
-    Groups without an explicit name are named after their path. If logdir is
-    None, returns an empty dict, which is helpful for testing things that don't
-    require any valid runs.
-  """
-  files = {}
-  if logdir is None:
-    return files
-  # Make sure keeping consistent with ParseURI in core/lib/io/path.cc
-  uri_pattern = re.compile('[a-zA-Z][0-9a-zA-Z.]*://.*')
-  for specification in logdir.split(','):
-    # Check if the spec contains group. A spec start with xyz:// is regarded as
-    # URI path spec instead of group spec. If the spec looks like /foo:bar/baz,
-    # then we assume it's a path with a colon.
-    if (uri_pattern.match(specification) is None and ':' in specification and
-        specification[0] != '/'):
-      # We split at most once so run_name:/path:with/a/colon will work.
-      run_name, _, path = specification.partition(':')
-    else:
-      run_name = None
-      path = specification
-    if uri_pattern.match(path) is None:
-      path = os.path.realpath(path)
-    files[path] = run_name
-  return files
-
-
-def reload_multiplexer(multiplexer, path_to_run):
-  """Loads all runs into the multiplexer.
-
-  Args:
-    multiplexer: The `EventMultiplexer` to add runs to and reload.
-    path_to_run: A dict mapping from paths to run names, where `None` as the run
-      name is interpreted as a run name equal to the path.
-  """
-  start = time.time()
-  tf.logging.info('TensorBoard reload process beginning')
-  for (path, name) in six.iteritems(path_to_run):
-    multiplexer.AddRunsFromDirectory(path, name)
-  tf.logging.info('TensorBoard reload process: Reload the whole Multiplexer')
-  multiplexer.Reload()
-  duration = time.time() - start
-  tf.logging.info('TensorBoard done reloading. Load took %0.3f secs', duration)
-
-
-def start_reloading_multiplexer(multiplexer, path_to_run, load_interval):
-  """Starts a thread to automatically reload the given multiplexer.
-
-  The thread will reload the multiplexer by calling `ReloadMultiplexer` every
-  `load_interval` seconds, starting immediately.
-
-  Args:
-    multiplexer: The `EventMultiplexer` to add runs to and reload.
-    path_to_run: A dict mapping from paths to run names, where `None` as the run
-      name is interpreted as a run name equal to the path.
-    load_interval: How many seconds to wait after one load before starting the
-      next load.
-
-  Returns:
-    A started `threading.Thread` that reloads the multiplexer.
-  """
-
-  # We don't call multiplexer.Reload() here because that would make
-  # AddRunsFromDirectory block until the runs have all loaded.
-  def _reload_forever():
-    while True:
-      reload_multiplexer(multiplexer, path_to_run)
-      time.sleep(load_interval)
-
-  thread = threading.Thread(target=_reload_forever)
-  thread.daemon = True
-  thread.start()
-  return thread
-
-
-def get_tensorboard_tag():
-  """Read the TensorBoard TAG number, and return it or an empty string."""
-  tag = tf.resource_loader.load_resource('tensorboard/TAG').strip()
-  return tag
diff --git a/tensorflow/tensorboard/backend/application_test.py b/tensorflow/tensorboard/backend/application_test.py
deleted file mode 100644
index fd63564e4e3212be66916d29b8246d523d513c8a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/application_test.py
+++ /dev/null
@@ -1,513 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Integration tests for TensorBoard.
-
-These tests start up a full-fledged TensorBoard server.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gzip
-import json
-import os
-import shutil
-import socket
-import tempfile
-import threading
-
-from six import BytesIO
-from six.moves import http_client
-import tensorflow as tf
-
-from werkzeug import serving
-
-from tensorflow.tensorboard import main as tensorboard
-from tensorflow.tensorboard.backend import application
-from tensorflow.tensorboard.backend.event_processing import event_multiplexer
-from tensorflow.tensorboard.plugins import base_plugin
-
-
-class FakePlugin(base_plugin.TBPlugin):
-  """A plugin with no functionality."""
-
-  def __init__(self, plugin_name, is_active_value, routes_mapping):
-    """Constructs a fake plugin.
-
-    Args:
-      plugin_name: The name of this plugin.
-      is_active_value: Whether the plugin is active.
-      routes_mapping: A dictionary mapping from route (string URL path) to the
-        method called when a user issues a request to that route.
-    """
-    self.plugin_name = plugin_name
-    self._is_active_value = is_active_value
-    self._routes_mapping = routes_mapping
-
-  def get_plugin_apps(self, multiplexer, logdir):
-    """Returns a mapping from routes to handlers offered by this plugin.
-
-    Args:
-      multiplexer: The event multiplexer.
-      logdir: The path to the directory containing logs.
-
-    Returns:
-      A dictionary mapping from routes to handlers offered by this plugin.
-    """
-    return self._routes_mapping
-
-  def is_active(self):
-    """Returns whether this plugin is active.
-
-    Returns:
-      A boolean. Whether this plugin is active.
-    """
-    return self._is_active_value
-
-
-class TensorboardServerTest(tf.test.TestCase):
-  _only_use_meta_graph = False  # Server data contains only a GraphDef
-
-  def setUp(self):
-    self.logdir = self.get_temp_dir()
-
-    self._GenerateTestData(run_name='run1')
-    self._multiplexer = event_multiplexer.EventMultiplexer(
-        size_guidance=application.DEFAULT_SIZE_GUIDANCE,
-        purge_orphaned_data=True)
-    plugins = [
-        FakePlugin(plugin_name='foo', is_active_value=True, routes_mapping={}),
-        FakePlugin(plugin_name='bar', is_active_value=False, routes_mapping={})
-    ]
-    app = application.TensorBoardWSGIApp(
-        self.logdir, plugins, self._multiplexer, reload_interval=0)
-    try:
-      self._server = serving.BaseWSGIServer('localhost', 0, app)
-      # 0 to pick an unused port.
-    except IOError:
-      # BaseWSGIServer has a preference for IPv4. If that didn't work, try again
-      # with an explicit IPv6 address.
-      self._server = serving.BaseWSGIServer('::1', 0, app)
-    self._server_thread = threading.Thread(target=self._server.serve_forever)
-    self._server_thread.daemon = True
-    self._server_thread.start()
-    self._connection = http_client.HTTPConnection(
-        'localhost', self._server.server_address[1])
-
-  def tearDown(self):
-    self._connection.close()
-    self._server.shutdown()
-    self._server.server_close()
-
-  def _get(self, path, headers=None):
-    """Perform a GET request for the given path."""
-    if headers is None:
-      headers = {}
-    self._connection.request('GET', path, None, headers)
-    return self._connection.getresponse()
-
-  def _getJson(self, path):
-    """Perform a GET request and decode the result as JSON."""
-    self._connection.request('GET', path)
-    response = self._connection.getresponse()
-    self.assertEqual(response.status, 200)
-    data = response.read()
-    if response.getheader('Content-Encoding') == 'gzip':
-      data = gzip.GzipFile('', 'rb', 9, BytesIO(data)).read()
-    return json.loads(data.decode('utf-8'))
-
-  def testBasicStartup(self):
-    """Start the server up and then shut it down immediately."""
-    pass
-
-  def testRequestMainPage(self):
-    """Navigate to the main page and verify that it returns a 200."""
-    response = self._get('/')
-    self.assertEqual(response.status, 200)
-
-  def testRequestNonexistentPage(self):
-    """Request a page that doesn't exist; it should 404."""
-    response = self._get('/asdf')
-    self.assertEqual(response.status, 404)
-
-  def testLogdir(self):
-    """Test the format of the data/logdir endpoint."""
-    parsed_object = self._getJson('/data/logdir')
-    self.assertEqual(parsed_object, {'logdir': self.logdir})
-
-  def testPluginsListing(self):
-    """Test the format of the data/plugins_listing endpoint."""
-    parsed_object = self._getJson('/data/plugins_listing')
-    # Plugin foo is active. Plugin bar is not.
-    self.assertEqual(parsed_object, {'foo': True, 'bar': False})
-
-  def testRuns(self):
-    """Test the format of the /data/runs endpoint."""
-    run_json = self._getJson('/data/runs')
-    self.assertEqual(run_json, ['run1'])
-
-  def testRunsAppendOnly(self):
-    """Test that new runs appear after old ones in /data/runs."""
-    # We use three runs: the 'run1' that we already created in our
-    # `setUp` method, plus runs with names lexicographically before and
-    # after it (so that just sorting by name doesn't have a chance of
-    # working).
-    fake_wall_times = {
-        'run1': 1234.0,
-        'avocado': 2345.0,
-        'zebra': 3456.0,
-        'mysterious': None,
-    }
-
-    stubs = tf.test.StubOutForTesting()
-    # pylint: disable=invalid-name
-    def FirstEventTimestamp_stub(multiplexer_self, run_name):
-      del multiplexer_self
-      matches = [candidate_name
-                 for candidate_name in fake_wall_times
-                 if run_name.endswith(candidate_name)]
-      self.assertEqual(len(matches), 1, '%s (%s)' % (matches, run_name))
-      wall_time = fake_wall_times[matches[0]]
-      if wall_time is None:
-        raise ValueError('No event timestamp could be found')
-      else:
-        return wall_time
-    # pylint: enable=invalid-name
-
-    stubs.SmartSet(self._multiplexer,
-                   'FirstEventTimestamp',
-                   FirstEventTimestamp_stub)
-
-    def add_run(run_name):
-      self._GenerateTestData(run_name)
-      self._multiplexer.AddRunsFromDirectory(self.logdir)
-      self._multiplexer.Reload()
-
-    # Add one run: it should come last.
-    add_run('avocado')
-    self.assertEqual(self._getJson('/data/runs'),
-                     ['run1', 'avocado'])
-
-    # Add another run: it should come last, too.
-    add_run('zebra')
-    self.assertEqual(self._getJson('/data/runs'),
-                     ['run1', 'avocado', 'zebra'])
-
-    # And maybe there's a run for which we somehow have no timestamp.
-    add_run('mysterious')
-    self.assertEqual(self._getJson('/data/runs'),
-                     ['run1', 'avocado', 'zebra', 'mysterious'])
-
-    stubs.UnsetAll()
-
-  def testApplicationPaths_getCached(self):
-    """Test the format of the /data/runs endpoint."""
-    for path in ('/',):  # TODO(jart): '/app.js' in open source
-      connection = http_client.HTTPConnection('localhost',
-                                              self._server.server_address[1])
-      connection.request('GET', path)
-      response = connection.getresponse()
-      self.assertEqual(response.status, 200, msg=path)
-      self.assertEqual(
-          response.getheader('Cache-Control'),
-          'private, max-age=3600',
-          msg=path)
-      connection.close()
-
-  def testDataPaths_disableAllCaching(self):
-    """Test the format of the /data/runs endpoint."""
-    for path in ('/data/runs', '/data/logdir'):
-      connection = http_client.HTTPConnection('localhost',
-                                              self._server.server_address[1])
-      connection.request('GET', path)
-      response = connection.getresponse()
-      self.assertEqual(response.status, 200, msg=path)
-      self.assertEqual(response.getheader('Expires'), '0', msg=path)
-      response.read()
-      connection.close()
-
-  def _GenerateTestData(self, run_name):
-    """Generates the test data directory.
-
-    The test data has a single run of the given name, containing:
-      - a graph definition and metagraph definition
-
-    Arguments:
-      run_name: the directory under self.logdir into which to write
-        events
-    """
-    run_path = os.path.join(self.logdir, run_name)
-    os.makedirs(run_path)
-
-    writer = tf.summary.FileWriter(run_path)
-
-    # Add a simple graph event.
-    graph_def = tf.GraphDef()
-    node1 = graph_def.node.add()
-    node1.name = 'a'
-    node2 = graph_def.node.add()
-    node2.name = 'b'
-    node2.attr['very_large_attr'].s = b'a' * 2048  # 2 KB attribute
-
-    meta_graph_def = tf.MetaGraphDef(graph_def=graph_def)
-
-    if self._only_use_meta_graph:
-      writer.add_meta_graph(meta_graph_def)
-    else:
-      writer.add_graph(graph_def)
-
-    writer.flush()
-    writer.close()
-
-
-class TensorboardServerPluginNameTest(tf.test.TestCase):
-
-  def _test(self, name, should_be_okay):
-    temp_dir = tempfile.mkdtemp(prefix=self.get_temp_dir())
-    self.addCleanup(shutil.rmtree, temp_dir)
-    multiplexer = event_multiplexer.EventMultiplexer(
-        size_guidance=application.DEFAULT_SIZE_GUIDANCE,
-        purge_orphaned_data=True)
-    plugins = [
-        FakePlugin(plugin_name='foo', is_active_value=True, routes_mapping={}),
-        FakePlugin(plugin_name=name, is_active_value=True, routes_mapping={}),
-        FakePlugin(plugin_name='bar', is_active_value=False, routes_mapping={})
-    ]
-    if should_be_okay:
-      application.TensorBoardWSGIApp(
-          temp_dir, plugins, multiplexer, reload_interval=0)
-    else:
-      with self.assertRaisesRegexp(ValueError, r'invalid name'):
-        application.TensorBoardWSGIApp(
-            temp_dir, plugins, multiplexer, reload_interval=0)
-
-  def testEmptyName(self):
-    self._test('', False)
-
-  def testNameWithSlashes(self):
-    self._test('scalars/data', False)
-
-  def testNameWithSpaces(self):
-    self._test('my favorite plugin', False)
-
-  def testSimpleName(self):
-    self._test('scalars', True)
-
-  def testComprehensiveName(self):
-    self._test('Scalar-Dashboard_3000.1', True)
-
-
-class TensorboardServerPluginRouteTest(tf.test.TestCase):
-
-  def _test(self, route, should_be_okay):
-    temp_dir = tempfile.mkdtemp(prefix=self.get_temp_dir())
-    self.addCleanup(shutil.rmtree, temp_dir)
-    multiplexer = event_multiplexer.EventMultiplexer(
-        size_guidance=application.DEFAULT_SIZE_GUIDANCE,
-        purge_orphaned_data=True)
-    plugins = [
-        FakePlugin(
-            plugin_name='foo',
-            is_active_value=True,
-            routes_mapping={route: lambda environ, start_response: None}),
-    ]
-    if should_be_okay:
-      application.TensorBoardWSGIApp(
-          temp_dir, plugins, multiplexer, reload_interval=0)
-    else:
-      with self.assertRaisesRegexp(ValueError, r'invalid route'):
-        application.TensorBoardWSGIApp(
-            temp_dir, plugins, multiplexer, reload_interval=0)
-
-  def testNormalRoute(self):
-    self._test('/runs', True)
-
-  def testEmptyRoute(self):
-    self._test('', False)
-
-  def testSlashlessRoute(self):
-    self._test('runaway', False)
-
-
-class TensorboardServerUsingMetagraphOnlyTest(TensorboardServerTest):
-  # Tests new ability to use only the MetaGraphDef
-  _only_use_meta_graph = True  # Server data contains only a MetaGraphDef
-
-
-class ParseEventFilesSpecTest(tf.test.TestCase):
-
-  def testRunName(self):
-    logdir = 'lol:/cat'
-    expected = {'/cat': 'lol'}
-    self.assertEqual(application.parse_event_files_spec(logdir), expected)
-
-  def testPathWithColonThatComesAfterASlash_isNotConsideredARunName(self):
-    logdir = '/lol:/cat'
-    expected = {'/lol:/cat': None}
-    self.assertEqual(application.parse_event_files_spec(logdir), expected)
-
-  def testMultipleDirectories(self):
-    logdir = '/a,/b'
-    expected = {'/a': None, '/b': None}
-    self.assertEqual(application.parse_event_files_spec(logdir), expected)
-
-  def testNormalizesPaths(self):
-    logdir = '/lol/.//cat/../cat'
-    expected = {'/lol/cat': None}
-    self.assertEqual(application.parse_event_files_spec(logdir), expected)
-
-  def testAbsolutifies(self):
-    logdir = 'lol/cat'
-    expected = {os.path.realpath('lol/cat'): None}
-    self.assertEqual(application.parse_event_files_spec(logdir), expected)
-
-  def testRespectsGCSPath(self):
-    logdir = 'gs://foo/path'
-    expected = {'gs://foo/path': None}
-    self.assertEqual(application.parse_event_files_spec(logdir), expected)
-
-  def testRespectsHDFSPath(self):
-    logdir = 'hdfs://foo/path'
-    expected = {'hdfs://foo/path': None}
-    self.assertEqual(application.parse_event_files_spec(logdir), expected)
-
-  def testDoesNotExpandUserInGCSPath(self):
-    logdir = 'gs://~/foo/path'
-    expected = {'gs://~/foo/path': None}
-    self.assertEqual(application.parse_event_files_spec(logdir), expected)
-
-  def testDoesNotNormalizeGCSPath(self):
-    logdir = 'gs://foo/./path//..'
-    expected = {'gs://foo/./path//..': None}
-    self.assertEqual(application.parse_event_files_spec(logdir), expected)
-
-  def testRunNameWithGCSPath(self):
-    logdir = 'lol:gs://foo/path'
-    expected = {'gs://foo/path': 'lol'}
-    self.assertEqual(application.parse_event_files_spec(logdir), expected)
-
-
-class TensorBoardAssetsTest(tf.test.TestCase):
-
-  def testTagFound(self):
-    tag = application.get_tensorboard_tag()
-    self.assertTrue(tag)
-    app = application.standard_tensorboard_wsgi('', True, 60, [])
-    self.assertEqual(app.tag, tag)
-
-
-class TensorBoardPluginsTest(tf.test.TestCase):
-
-  def testPluginsAdded(self):
-
-    def foo_handler():
-      pass
-
-    def bar_handler():
-      pass
-
-    plugins = [
-        FakePlugin(
-            plugin_name='foo',
-            is_active_value=True,
-            routes_mapping={'/foo_route': foo_handler}),
-        FakePlugin(
-            plugin_name='bar',
-            is_active_value=True,
-            routes_mapping={'/bar_route': bar_handler}),
-    ]
-
-    # The application should have added routes for both plugins.
-    app = application.standard_tensorboard_wsgi('', True, 60, plugins)
-
-    # The routes are prefixed with /data/plugin/[plugin name].
-    self.assertDictContainsSubset({
-        '/data/plugin/foo/foo_route': foo_handler,
-        '/data/plugin/bar/bar_route': bar_handler,
-    }, app.data_applications)
-
-
-class TensorboardSimpleServerConstructionTest(tf.test.TestCase):
-  """Tests that the default HTTP server is constructed without error.
-
-  Mostly useful for IPv4/IPv6 testing. This test should run with only IPv4, only
-  IPv6, and both IPv4 and IPv6 enabled.
-  """
-
-  class _StubApplication(object):
-    tag = ''
-
-  def testMakeServerBlankHost(self):
-    # Test that we can bind to all interfaces without throwing an error
-    server, url = tensorboard.make_simple_server(
-        self._StubApplication(),
-        host='',
-        port=0)  # Grab any available port
-    self.assertTrue(server)
-    self.assertTrue(url)
-
-  def testSpecifiedHost(self):
-    one_passed = False
-    try:
-      _, url = tensorboard.make_simple_server(
-          self._StubApplication(),
-          host='127.0.0.1',
-          port=0)
-      self.assertStartsWith(actual=url, expected_start='http://127.0.0.1:')
-      one_passed = True
-    except socket.error:
-      # IPv4 is not supported
-      pass
-    try:
-      _, url = tensorboard.make_simple_server(
-          self._StubApplication(),
-          host='::1',
-          port=0)
-      self.assertStartsWith(actual=url, expected_start='http://[::1]:')
-      one_passed = True
-    except socket.error:
-      # IPv6 is not supported
-      pass
-    self.assertTrue(one_passed)  # We expect either IPv4 or IPv6 to be supported
-
-
-class TensorBoardApplcationConstructionTest(tf.test.TestCase):
-
-  def testExceptions(self):
-    logdir = '/fake/foo'
-    multiplexer = event_multiplexer.EventMultiplexer()
-
-    # Fails if there is an unnamed plugin
-    with self.assertRaises(ValueError):
-      # This plugin lacks a name.
-      plugins = [
-          FakePlugin(plugin_name=None, is_active_value=True, routes_mapping={})
-      ]
-      application.TensorBoardWSGIApp(logdir, plugins, multiplexer, 0)
-
-    # Fails if there are two plugins with same name
-    with self.assertRaises(ValueError):
-      plugins = [
-          FakePlugin(
-              plugin_name='foo', is_active_value=True, routes_mapping={}),
-          FakePlugin(
-              plugin_name='foo', is_active_value=True, routes_mapping={}),
-      ]
-      application.TensorBoardWSGIApp(logdir, plugins, multiplexer, 0)
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/tensorboard/backend/event_processing/BUILD b/tensorflow/tensorboard/backend/event_processing/BUILD
deleted file mode 100644
index 9c9ca29be2d1bbebbccccc535ac4b0415db18d0b..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/event_processing/BUILD
+++ /dev/null
@@ -1,165 +0,0 @@
-# Description:
-# Event processing logic for TensorBoard
-
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_library(
-    name = "io_wrapper",
-    srcs = ["io_wrapper.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_library(
-    name = "directory_watcher",
-    srcs = ["directory_watcher.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":io_wrapper",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_test(
-    name = "directory_watcher_test",
-    size = "small",
-    srcs = ["directory_watcher_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":directory_watcher",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_library(
-    name = "reservoir",
-    srcs = ["reservoir.py"],
-    srcs_version = "PY2AND3",
-)
-
-py_test(
-    name = "reservoir_test",
-    size = "small",
-    srcs = ["reservoir_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":reservoir",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_library(
-    name = "event_file_loader",
-    srcs = ["event_file_loader.py"],
-    srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
-)
-
-py_test(
-    name = "event_file_loader_test",
-    size = "small",
-    srcs = ["event_file_loader_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":event_file_loader",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_library(
-    name = "event_accumulator",
-    srcs = ["event_accumulator.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":directory_watcher",
-        ":event_file_loader",
-        ":plugin_asset_util",
-        ":reservoir",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_test(
-    name = "event_accumulator_test",
-    size = "small",
-    srcs = ["event_accumulator_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":event_accumulator",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_library(
-    name = "event_multiplexer",
-    srcs = ["event_multiplexer.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":directory_watcher",
-        ":event_accumulator",
-        ":io_wrapper",
-        "//tensorflow:tensorflow_py",
-        "@six_archive//:six",
-    ],
-)
-
-py_test(
-    name = "event_multiplexer_test",
-    size = "small",
-    srcs = ["event_multiplexer_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":event_accumulator",
-        ":event_multiplexer",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_library(
-    name = "plugin_asset_util",
-    srcs = ["plugin_asset_util.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_library(
-    name = "event_file_inspector",
-    srcs = ["event_file_inspector.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":event_accumulator",
-        ":event_multiplexer",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_test(
-    name = "event_file_inspector_test",
-    size = "small",
-    srcs = ["event_file_inspector_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":event_file_inspector",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        [
-            "*",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tensorboard/backend/event_processing/directory_watcher.py b/tensorflow/tensorboard/backend/event_processing/directory_watcher.py
deleted file mode 100644
index 6be3049e906a88e7e056fc9e80738fdf66bdaccc..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/event_processing/directory_watcher.py
+++ /dev/null
@@ -1,254 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Contains the implementation for the DirectoryWatcher class."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import bisect
-
-import tensorflow as tf
-
-
-from tensorflow.tensorboard.backend.event_processing import io_wrapper
-
-
-class DirectoryWatcher(object):
-  """A DirectoryWatcher wraps a loader to load from a sequence of paths.
-
-  A loader reads a path and produces some kind of values as an iterator. A
-  DirectoryWatcher takes a directory, a factory for loaders, and optionally a
-  path filter and watches all the paths inside that directory.
-
-  This class is only valid under the assumption that only one path will be
-  written to by the data source at a time and that once the source stops writing
-  to a path, it will start writing to a new path that's lexicographically
-  greater and never come back. It uses some heuristics to check whether this is
-  true based on tracking changes to the files' sizes, but the check can have
-  false negatives. However, it should have no false positives.
-  """
-
-  def __init__(self, directory, loader_factory, path_filter=lambda x: True):
-    """Constructs a new DirectoryWatcher.
-
-    Args:
-      directory: The directory to load files from.
-      loader_factory: A factory for creating loaders. The factory should take a
-        path and return an object that has a Load method returning an
-        iterator that will yield all events that have not been yielded yet.
-      path_filter: If specified, only paths matching this filter are loaded.
-
-    Raises:
-      ValueError: If path_provider or loader_factory are None.
-    """
-    if directory is None:
-      raise ValueError('A directory is required')
-    if loader_factory is None:
-      raise ValueError('A loader factory is required')
-    self._directory = directory
-    self._path = None
-    self._loader_factory = loader_factory
-    self._loader = None
-    self._path_filter = path_filter
-    self._ooo_writes_detected = False
-    # The file size for each file at the time it was finalized.
-    self._finalized_sizes = {}
-
-  def Load(self):
-    """Loads new values.
-
-    The watcher will load from one path at a time; as soon as that path stops
-    yielding events, it will move on to the next path. We assume that old paths
-    are never modified after a newer path has been written. As a result, Load()
-    can be called multiple times in a row without losing events that have not
-    been yielded yet. In other words, we guarantee that every event will be
-    yielded exactly once.
-
-    Yields:
-      All values that have not been yielded yet.
-
-    Raises:
-      DirectoryDeletedError: If the directory has been permanently deleted
-        (as opposed to being temporarily unavailable).
-    """
-    try:
-      for event in self._LoadInternal():
-        yield event
-    except tf.errors.OpError:
-      if not tf.gfile.Exists(self._directory):
-        raise DirectoryDeletedError(
-            'Directory %s has been permanently deleted' % self._directory)
-
-  def _LoadInternal(self):
-    """Internal implementation of Load().
-
-    The only difference between this and Load() is that the latter will throw
-    DirectoryDeletedError on I/O errors if it thinks that the directory has been
-    permanently deleted.
-
-    Yields:
-      All values that have not been yielded yet.
-    """
-
-    # If the loader exists, check it for a value.
-    if not self._loader:
-      self._InitializeLoader()
-
-    while True:
-      # Yield all the new events in the path we're currently loading from.
-      for event in self._loader.Load():
-        yield event
-
-      next_path = self._GetNextPath()
-      if not next_path:
-        tf.logging.info('No path found after %s', self._path)
-        # Current path is empty and there are no new paths, so we're done.
-        return
-
-      # There's a new path, so check to make sure there weren't any events
-      # written between when we finished reading the current path and when we
-      # checked for the new one. The sequence of events might look something
-      # like this:
-      #
-      # 1. Event #1 written to path #1.
-      # 2. We check for events and yield event #1 from path #1
-      # 3. We check for events and see that there are no more events in path #1.
-      # 4. Event #2 is written to path #1.
-      # 5. Event #3 is written to path #2.
-      # 6. We check for a new path and see that path #2 exists.
-      #
-      # Without this loop, we would miss event #2. We're also guaranteed by the
-      # loader contract that no more events will be written to path #1 after
-      # events start being written to path #2, so we don't have to worry about
-      # that.
-      for event in self._loader.Load():
-        yield event
-
-      tf.logging.info('Directory watcher advancing from %s to %s', self._path,
-                      next_path)
-
-      # Advance to the next path and start over.
-      self._SetPath(next_path)
-
-  # The number of paths before the current one to check for out of order writes.
-  _OOO_WRITE_CHECK_COUNT = 20
-
-  def OutOfOrderWritesDetected(self):
-    """Returns whether any out-of-order writes have been detected.
-
-    Out-of-order writes are only checked as part of the Load() iterator. Once an
-    out-of-order write is detected, this function will always return true.
-
-    Note that out-of-order write detection is not performed on GCS paths, so
-    this function will always return false.
-
-    Returns:
-      Whether any out-of-order write has ever been detected by this watcher.
-
-    """
-    return self._ooo_writes_detected
-
-  def _InitializeLoader(self):
-    path = self._GetNextPath()
-    if path:
-      self._SetPath(path)
-    else:
-      raise StopIteration
-
-  def _SetPath(self, path):
-    """Sets the current path to watch for new events.
-
-    This also records the size of the old path, if any. If the size can't be
-    found, an error is logged.
-
-    Args:
-      path: The full path of the file to watch.
-    """
-    old_path = self._path
-    if old_path and not io_wrapper.IsGCSPath(old_path):
-      try:
-        # We're done with the path, so store its size.
-        size = tf.gfile.Stat(old_path).length
-        tf.logging.debug('Setting latest size of %s to %d', old_path, size)
-        self._finalized_sizes[old_path] = size
-      except tf.errors.OpError as e:
-        tf.logging.error('Unable to get size of %s: %s', old_path, e)
-
-    self._path = path
-    self._loader = self._loader_factory(path)
-
-  def _GetNextPath(self):
-    """Gets the next path to load from.
-
-    This function also does the checking for out-of-order writes as it iterates
-    through the paths.
-
-    Returns:
-      The next path to load events from, or None if there are no more paths.
-    """
-    paths = sorted(path
-                   for path in io_wrapper.ListDirectoryAbsolute(self._directory)
-                   if self._path_filter(path))
-    if not paths:
-      return None
-
-    if self._path is None:
-      return paths[0]
-
-    # Don't bother checking if the paths are GCS (which we can't check) or if
-    # we've already detected an OOO write.
-    if not io_wrapper.IsGCSPath(paths[0]) and not self._ooo_writes_detected:
-      # Check the previous _OOO_WRITE_CHECK_COUNT paths for out of order writes.
-      current_path_index = bisect.bisect_left(paths, self._path)
-      ooo_check_start = max(0, current_path_index - self._OOO_WRITE_CHECK_COUNT)
-      for path in paths[ooo_check_start:current_path_index]:
-        if self._HasOOOWrite(path):
-          self._ooo_writes_detected = True
-          break
-
-    next_paths = list(path
-                      for path in paths
-                      if self._path is None or path > self._path)
-    if next_paths:
-      return min(next_paths)
-    else:
-      return None
-
-  def _HasOOOWrite(self, path):
-    """Returns whether the path has had an out-of-order write."""
-    # Check the sizes of each path before the current one.
-    size = tf.gfile.Stat(path).length
-    old_size = self._finalized_sizes.get(path, None)
-    if size != old_size:
-      if old_size is None:
-        tf.logging.error('File %s created after file %s even though it\'s '
-                         'lexicographically earlier', path, self._path)
-      else:
-        tf.logging.error('File %s updated even though the current file is %s',
-                         path, self._path)
-      return True
-    else:
-      return False
-
-
-class DirectoryDeletedError(Exception):
-  """Thrown by Load() when the directory is *permanently* gone.
-
-  We distinguish this from temporary errors so that other code can decide to
-  drop all of our data only when a directory has been intentionally deleted,
-  as opposed to due to transient filesystem errors.
-  """
-  pass
diff --git a/tensorflow/tensorboard/backend/event_processing/directory_watcher_test.py b/tensorflow/tensorboard/backend/event_processing/directory_watcher_test.py
deleted file mode 100644
index d44f74a8a4314685e97834d551532855c056393f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/event_processing/directory_watcher_test.py
+++ /dev/null
@@ -1,208 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Tests for directory_watcher."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import shutil
-
-import tensorflow as tf
-
-from tensorflow.tensorboard.backend.event_processing import directory_watcher
-from tensorflow.tensorboard.backend.event_processing import io_wrapper
-
-
-class _ByteLoader(object):
-  """A loader that loads individual bytes from a file."""
-
-  def __init__(self, path):
-    self._f = open(path)
-    self.bytes_read = 0
-
-  def Load(self):
-    while True:
-      self._f.seek(self.bytes_read)
-      byte = self._f.read(1)
-      if byte:
-        self.bytes_read += 1
-        yield byte
-      else:
-        return
-
-
-class DirectoryWatcherTest(tf.test.TestCase):
-
-  def setUp(self):
-    # Put everything in a directory so it's easier to delete.
-    self._directory = os.path.join(self.get_temp_dir(), 'monitor_dir')
-    os.mkdir(self._directory)
-    self._watcher = directory_watcher.DirectoryWatcher(self._directory,
-                                                       _ByteLoader)
-    self.stubs = tf.test.StubOutForTesting()
-
-  def tearDown(self):
-    self.stubs.CleanUp()
-    try:
-      shutil.rmtree(self._directory)
-    except OSError:
-      # Some tests delete the directory.
-      pass
-
-  def _WriteToFile(self, filename, data):
-    path = os.path.join(self._directory, filename)
-    with open(path, 'a') as f:
-      f.write(data)
-
-  def _LoadAllEvents(self):
-    """Loads all events in the watcher."""
-    for _ in self._watcher.Load():
-      pass
-
-  def assertWatcherYields(self, values):
-    self.assertEqual(list(self._watcher.Load()), values)
-
-  def testRaisesWithBadArguments(self):
-    with self.assertRaises(ValueError):
-      directory_watcher.DirectoryWatcher(None, lambda x: None)
-    with self.assertRaises(ValueError):
-      directory_watcher.DirectoryWatcher('dir', None)
-
-  def testEmptyDirectory(self):
-    self.assertWatcherYields([])
-
-  def testSingleWrite(self):
-    self._WriteToFile('a', 'abc')
-    self.assertWatcherYields(['a', 'b', 'c'])
-    self.assertFalse(self._watcher.OutOfOrderWritesDetected())
-
-  def testMultipleWrites(self):
-    self._WriteToFile('a', 'abc')
-    self.assertWatcherYields(['a', 'b', 'c'])
-    self._WriteToFile('a', 'xyz')
-    self.assertWatcherYields(['x', 'y', 'z'])
-    self.assertFalse(self._watcher.OutOfOrderWritesDetected())
-
-  def testMultipleLoads(self):
-    self._WriteToFile('a', 'a')
-    self._watcher.Load()
-    self._watcher.Load()
-    self.assertWatcherYields(['a'])
-    self.assertFalse(self._watcher.OutOfOrderWritesDetected())
-
-  def testMultipleFilesAtOnce(self):
-    self._WriteToFile('b', 'b')
-    self._WriteToFile('a', 'a')
-    self.assertWatcherYields(['a', 'b'])
-    self.assertFalse(self._watcher.OutOfOrderWritesDetected())
-
-  def testFinishesLoadingFileWhenSwitchingToNewFile(self):
-    self._WriteToFile('a', 'a')
-    # Empty the iterator.
-    self.assertEquals(['a'], list(self._watcher.Load()))
-    self._WriteToFile('a', 'b')
-    self._WriteToFile('b', 'c')
-    # The watcher should finish its current file before starting a new one.
-    self.assertWatcherYields(['b', 'c'])
-    self.assertFalse(self._watcher.OutOfOrderWritesDetected())
-
-  def testIntermediateEmptyFiles(self):
-    self._WriteToFile('a', 'a')
-    self._WriteToFile('b', '')
-    self._WriteToFile('c', 'c')
-    self.assertWatcherYields(['a', 'c'])
-    self.assertFalse(self._watcher.OutOfOrderWritesDetected())
-
-  def testPathFilter(self):
-    self._watcher = directory_watcher.DirectoryWatcher(
-        self._directory, _ByteLoader,
-        lambda path: 'do_not_watch_me' not in path)
-
-    self._WriteToFile('a', 'a')
-    self._WriteToFile('do_not_watch_me', 'b')
-    self._WriteToFile('c', 'c')
-    self.assertWatcherYields(['a', 'c'])
-    self.assertFalse(self._watcher.OutOfOrderWritesDetected())
-
-  def testDetectsNewOldFiles(self):
-    self._WriteToFile('b', 'a')
-    self._LoadAllEvents()
-    self._WriteToFile('a', 'a')
-    self._LoadAllEvents()
-    self.assertTrue(self._watcher.OutOfOrderWritesDetected())
-
-  def testIgnoresNewerFiles(self):
-    self._WriteToFile('a', 'a')
-    self._LoadAllEvents()
-    self._WriteToFile('q', 'a')
-    self._LoadAllEvents()
-    self.assertFalse(self._watcher.OutOfOrderWritesDetected())
-
-  def testDetectsChangingOldFiles(self):
-    self._WriteToFile('a', 'a')
-    self._WriteToFile('b', 'a')
-    self._LoadAllEvents()
-    self._WriteToFile('a', 'c')
-    self._LoadAllEvents()
-    self.assertTrue(self._watcher.OutOfOrderWritesDetected())
-
-  def testDoesntCrashWhenFileIsDeleted(self):
-    self._WriteToFile('a', 'a')
-    self._LoadAllEvents()
-    os.remove(os.path.join(self._directory, 'a'))
-    self._WriteToFile('b', 'b')
-    self.assertWatcherYields(['b'])
-
-  def testRaisesRightErrorWhenDirectoryIsDeleted(self):
-    self._WriteToFile('a', 'a')
-    self._LoadAllEvents()
-    shutil.rmtree(self._directory)
-    with self.assertRaises(directory_watcher.DirectoryDeletedError):
-      self._LoadAllEvents()
-
-  def testDoesntRaiseDirectoryDeletedErrorIfOutageIsTransient(self):
-    self._WriteToFile('a', 'a')
-    self._LoadAllEvents()
-    shutil.rmtree(self._directory)
-
-    # Fake a single transient I/O error.
-    def FakeFactory(original):
-
-      def Fake(*args, **kwargs):
-        if FakeFactory.has_been_called:
-          original(*args, **kwargs)
-        else:
-          raise OSError('lp0 temporarily on fire')
-
-      return Fake
-
-    FakeFactory.has_been_called = False
-
-    for stub_name in ['ListDirectoryAbsolute', 'ListRecursively']:
-      self.stubs.Set(io_wrapper, stub_name,
-                     FakeFactory(getattr(io_wrapper, stub_name)))
-    for stub_name in ['IsDirectory', 'Exists', 'Stat']:
-      self.stubs.Set(tf.gfile, stub_name,
-                     FakeFactory(getattr(tf.gfile, stub_name)))
-
-    with self.assertRaises((IOError, OSError)):
-      self._LoadAllEvents()
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/tensorboard/backend/event_processing/event_accumulator.py b/tensorflow/tensorboard/backend/event_processing/event_accumulator.py
deleted file mode 100644
index 1562f0f8339a845449462084b5356df8dbf5429f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/event_processing/event_accumulator.py
+++ /dev/null
@@ -1,851 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Takes a generator of values, and accumulates them for a frontend."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import os
-import re
-import threading
-
-import numpy as np
-import tensorflow as tf
-
-from tensorflow.tensorboard.backend.event_processing import directory_watcher
-from tensorflow.tensorboard.backend.event_processing import event_file_loader
-from tensorflow.tensorboard.backend.event_processing import plugin_asset_util
-from tensorflow.tensorboard.backend.event_processing import reservoir
-
-namedtuple = collections.namedtuple
-ScalarEvent = namedtuple('ScalarEvent', ['wall_time', 'step', 'value'])
-
-HealthPillEvent = namedtuple('HealthPillEvent', [
-    'wall_time', 'step', 'device_name', 'node_name', 'output_slot', 'dtype',
-    'shape', 'value'])
-
-CompressedHistogramEvent = namedtuple('CompressedHistogramEvent',
-                                      ['wall_time', 'step',
-                                       'compressed_histogram_values'])
-
-CompressedHistogramValue = namedtuple('CompressedHistogramValue',
-                                      ['basis_point', 'value'])
-
-HistogramEvent = namedtuple('HistogramEvent',
-                            ['wall_time', 'step', 'histogram_value'])
-
-HistogramValue = namedtuple('HistogramValue', ['min', 'max', 'num', 'sum',
-                                               'sum_squares', 'bucket_limit',
-                                               'bucket'])
-
-ImageEvent = namedtuple('ImageEvent', ['wall_time', 'step',
-                                       'encoded_image_string', 'width',
-                                       'height'])
-
-AudioEvent = namedtuple('AudioEvent', ['wall_time', 'step',
-                                       'encoded_audio_string', 'content_type',
-                                       'sample_rate', 'length_frames'])
-
-TensorEvent = namedtuple('TensorEvent', ['wall_time', 'step', 'tensor_proto'])
-
-## Different types of summary events handled by the event_accumulator
-SUMMARY_TYPES = {
-    'simple_value': '_ProcessScalar',
-    'histo': '_ProcessHistogram',
-    'image': '_ProcessImage',
-    'audio': '_ProcessAudio',
-    'tensor': '_ProcessTensor',
-}
-
-## The tagTypes below are just arbitrary strings chosen to pass the type
-## information of the tag from the backend to the frontend
-COMPRESSED_HISTOGRAMS = 'distributions'
-HISTOGRAMS = 'histograms'
-IMAGES = 'images'
-AUDIO = 'audio'
-SCALARS = 'scalars'
-TENSORS = 'tensors'
-HEALTH_PILLS = 'health_pills'
-GRAPH = 'graph'
-META_GRAPH = 'meta_graph'
-RUN_METADATA = 'run_metadata'
-
-## Normal CDF for std_devs: (-Inf, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, Inf)
-## naturally gives bands around median of width 1 std dev, 2 std dev, 3 std dev,
-## and then the long tail.
-NORMAL_HISTOGRAM_BPS = (0, 668, 1587, 3085, 5000, 6915, 8413, 9332, 10000)
-
-DEFAULT_SIZE_GUIDANCE = {
-    COMPRESSED_HISTOGRAMS: 500,
-    IMAGES: 4,
-    AUDIO: 4,
-    SCALARS: 10000,
-    # We store this many health pills per op.
-    HEALTH_PILLS: 100,
-    HISTOGRAMS: 1,
-    TENSORS: 10,
-}
-
-STORE_EVERYTHING_SIZE_GUIDANCE = {
-    COMPRESSED_HISTOGRAMS: 0,
-    IMAGES: 0,
-    AUDIO: 0,
-    SCALARS: 0,
-    HEALTH_PILLS: 0,
-    HISTOGRAMS: 0,
-    TENSORS: 0,
-}
-
-# The tag that values containing health pills have. Health pill data is stored
-# in tensors. In order to distinguish health pill values from scalar values, we
-# rely on how health pill values have this special tag value.
-HEALTH_PILL_EVENT_TAG_PREFIX = '__health_pill__/'
-
-
-def IsTensorFlowEventsFile(path):
-  """Check the path name to see if it is probably a TF Events file.
-
-  Args:
-    path: A file path to check if it is an event file.
-
-  Raises:
-    ValueError: If the path is an empty string.
-
-  Returns:
-    If path is formatted like a TensorFlowEventsFile.
-  """
-  if not path:
-    raise ValueError('Path must be a nonempty string')
-  return 'tfevents' in tf.compat.as_str_any(os.path.basename(path))
-
-
-class EventAccumulator(object):
-  """An `EventAccumulator` takes an event generator, and accumulates the values.
-
-  The `EventAccumulator` is intended to provide a convenient Python interface
-  for loading Event data written during a TensorFlow run. TensorFlow writes out
-  `Event` protobuf objects, which have a timestamp and step number, and often
-  contain a `Summary`. Summaries can have different kinds of data like an image,
-  a scalar value, or a histogram. The Summaries also have a tag, which we use to
-  organize logically related data. The `EventAccumulator` supports retrieving
-  the `Event` and `Summary` data by its tag.
-
-  Calling `Tags()` gets a map from `tagType` (e.g. `'images'`,
-  `'compressedHistograms'`, `'scalars'`, etc) to the associated tags for those
-  data types. Then, various functional endpoints (eg
-  `Accumulator.Scalars(tag)`) allow for the retrieval of all data
-  associated with that tag.
-
-  The `Reload()` method synchronously loads all of the data written so far.
-
-  Histograms, audio, and images are very large, so storing all of them is not
-  recommended.
-  @@Tensors
-  """
-
-  def __init__(self,
-               path,
-               size_guidance=DEFAULT_SIZE_GUIDANCE,
-               compression_bps=NORMAL_HISTOGRAM_BPS,
-               purge_orphaned_data=True):
-    """Construct the `EventAccumulator`.
-
-    Args:
-      path: A file path to a directory containing tf events files, or a single
-        tf events file. The accumulator will load events from this path.
-      size_guidance: Information on how much data the EventAccumulator should
-        store in memory. The DEFAULT_SIZE_GUIDANCE tries not to store too much
-        so as to avoid OOMing the client. The size_guidance should be a map
-        from a `tagType` string to an integer representing the number of
-        items to keep per tag for items of that `tagType`. If the size is 0,
-        all events are stored.
-      compression_bps: Information on how the `EventAccumulator` should compress
-        histogram data for the `CompressedHistograms` tag (for details see
-        `ProcessCompressedHistogram`).
-      purge_orphaned_data: Whether to discard any events that were "orphaned" by
-        a TensorFlow restart.
-    """
-    sizes = {}
-    for key in DEFAULT_SIZE_GUIDANCE:
-      if key in size_guidance:
-        sizes[key] = size_guidance[key]
-      else:
-        sizes[key] = DEFAULT_SIZE_GUIDANCE[key]
-
-    self._first_event_timestamp = None
-    self._scalars = reservoir.Reservoir(size=sizes[SCALARS])
-
-    # Unlike the other reservoir, the reservoir for health pills is keyed by the
-    # name of the op instead of the tag. This lets us efficiently obtain the
-    # health pills per node.
-    self._health_pills = reservoir.Reservoir(size=sizes[HEALTH_PILLS])
-
-    self._graph = None
-    self._graph_from_metagraph = False
-    self._meta_graph = None
-    self._tagged_metadata = {}
-    self._histograms = reservoir.Reservoir(size=sizes[HISTOGRAMS])
-    self._compressed_histograms = reservoir.Reservoir(
-        size=sizes[COMPRESSED_HISTOGRAMS], always_keep_last=False)
-    self._images = reservoir.Reservoir(size=sizes[IMAGES])
-    self._audio = reservoir.Reservoir(size=sizes[AUDIO])
-    self._tensors = reservoir.Reservoir(size=sizes[TENSORS])
-
-    self._generator_mutex = threading.Lock()
-    self.path = path
-    self._generator = _GeneratorFromPath(path)
-
-    self._compression_bps = compression_bps
-    self.purge_orphaned_data = purge_orphaned_data
-
-    self.most_recent_step = -1
-    self.most_recent_wall_time = -1
-    self.file_version = None
-
-    # The attributes that get built up by the accumulator
-    self.accumulated_attrs = ('_scalars', '_histograms',
-                              '_compressed_histograms', '_images', '_audio')
-    self._tensor_summaries = {}
-
-  def Reload(self):
-    """Loads all events added since the last call to `Reload`.
-
-    If `Reload` was never called, loads all events in the file.
-
-    Returns:
-      The `EventAccumulator`.
-    """
-    with self._generator_mutex:
-      for event in self._generator.Load():
-        self._ProcessEvent(event)
-    return self
-
-  def PluginAssets(self, plugin_name):
-    """Return a list of all plugin assets for the given plugin.
-
-    Args:
-      plugin_name: The string name of a plugin to retrieve assets for.
-
-    Returns:
-      A list of string plugin asset names, or empty list if none are available.
-      If the plugin was not registered, an empty list is returned.
-    """
-    return plugin_asset_util.ListAssets(self.path, plugin_name)
-
-  def RetrievePluginAsset(self, plugin_name, asset_name):
-    """Return the contents of a given plugin asset.
-
-    Args:
-      plugin_name: The string name of a plugin.
-      asset_name: The string name of an asset.
-
-    Returns:
-      The string contents of the plugin asset.
-
-    Raises:
-      KeyError: If the asset is not available.
-    """
-    return plugin_asset_util.RetrieveAsset(self.path, plugin_name, asset_name)
-
-  def FirstEventTimestamp(self):
-    """Returns the timestamp in seconds of the first event.
-
-    If the first event has been loaded (either by this method or by `Reload`,
-    this returns immediately. Otherwise, it will load in the first event. Note
-    that this means that calling `Reload` will cause this to block until
-    `Reload` has finished.
-
-    Returns:
-      The timestamp in seconds of the first event that was loaded.
-
-    Raises:
-      ValueError: If no events have been loaded and there were no events found
-      on disk.
-    """
-    if self._first_event_timestamp is not None:
-      return self._first_event_timestamp
-    with self._generator_mutex:
-      try:
-        event = next(self._generator.Load())
-        self._ProcessEvent(event)
-        return self._first_event_timestamp
-
-      except StopIteration:
-        raise ValueError('No event timestamp could be found')
-
-  def _ProcessEvent(self, event):
-    """Called whenever an event is loaded."""
-    if self._first_event_timestamp is None:
-      self._first_event_timestamp = event.wall_time
-
-    if event.HasField('file_version'):
-      new_file_version = _ParseFileVersion(event.file_version)
-      if self.file_version and self.file_version != new_file_version:
-        ## This should not happen.
-        tf.logging.warn(('Found new file_version for event.proto. This will '
-                         'affect purging logic for TensorFlow restarts. '
-                         'Old: {0} New: {1}').format(self.file_version,
-                                                     new_file_version))
-      self.file_version = new_file_version
-
-    self._MaybePurgeOrphanedData(event)
-
-    ## Process the event.
-    # GraphDef and MetaGraphDef are handled in a special way:
-    # If no graph_def Event is available, but a meta_graph_def is, and it
-    # contains a graph_def, then use the meta_graph_def.graph_def as our graph.
-    # If a graph_def Event is available, always prefer it to the graph_def
-    # inside the meta_graph_def.
-    if event.HasField('graph_def'):
-      if self._graph is not None:
-        tf.logging.warn(
-            ('Found more than one graph event per run, or there was '
-             'a metagraph containing a graph_def, as well as one or '
-             'more graph events.  Overwriting the graph with the '
-             'newest event.'))
-      self._graph = event.graph_def
-      self._graph_from_metagraph = False
-    elif event.HasField('meta_graph_def'):
-      if self._meta_graph is not None:
-        tf.logging.warn(('Found more than one metagraph event per run. '
-                         'Overwriting the metagraph with the newest event.'))
-      self._meta_graph = event.meta_graph_def
-      if self._graph is None or self._graph_from_metagraph:
-        # We may have a graph_def in the metagraph.  If so, and no
-        # graph_def is directly available, use this one instead.
-        meta_graph = tf.MetaGraphDef()
-        meta_graph.ParseFromString(self._meta_graph)
-        if meta_graph.graph_def:
-          if self._graph is not None:
-            tf.logging.warn(
-                ('Found multiple metagraphs containing graph_defs,'
-                 'but did not find any graph events.  Overwriting the '
-                 'graph with the newest metagraph version.'))
-          self._graph_from_metagraph = True
-          self._graph = meta_graph.graph_def.SerializeToString()
-    elif event.HasField('tagged_run_metadata'):
-      tag = event.tagged_run_metadata.tag
-      if tag in self._tagged_metadata:
-        tf.logging.warn('Found more than one "run metadata" event with tag ' +
-                        tag + '. Overwriting it with the newest event.')
-      self._tagged_metadata[tag] = event.tagged_run_metadata.run_metadata
-    elif event.HasField('summary'):
-      for value in event.summary.value:
-        if (value.HasField('tensor') and
-            value.tag.startswith(HEALTH_PILL_EVENT_TAG_PREFIX)):
-          self._ProcessHealthPillSummary(value, event)
-        else:
-          for summary_type, summary_func in SUMMARY_TYPES.items():
-            if value.HasField(summary_type):
-              datum = getattr(value, summary_type)
-              tag = value.node_name if summary_type == 'tensor' else value.tag
-              getattr(self, summary_func)(tag, event.wall_time, event.step,
-                                          datum)
-
-  def _ProcessHealthPillSummary(self, value, event):
-    """Process summaries containing health pills.
-
-    These summaries are distinguished by the fact that they have a Tensor field
-    and have a special tag value.
-
-    This method emits ERROR-level messages to the logs if it encounters Tensor
-    summaries that it cannot process.
-
-    Args:
-      value: A tf.Summary.Value with a Tensor field.
-      event: The tf.Event containing that value.
-    """
-    elements = tf.make_ndarray(value.tensor)
-
-    # The node_name property of the value object is actually a watch key: a
-    # combination of node name, output slot, and a suffix. We capture the
-    # actual node name and the output slot with a regular expression.
-    match = re.match(r'^(.*):(\d+):DebugNumericSummary$', value.node_name)
-    if not match:
-      tf.logging.log_first_n(
-          tf.logging.ERROR,
-          'Unsupported watch key %s for health pills; skipping this sequence.',
-          1, value.node_name)
-      return
-
-    node_name = match.group(1)
-    output_slot = int(match.group(2))
-    device_name = value.tag[len(HEALTH_PILL_EVENT_TAG_PREFIX):]
-    self._ProcessHealthPill(event.wall_time, event.step, device_name, node_name,
-                            output_slot, elements)
-
-  def Tags(self):
-    """Return all tags found in the value stream.
-
-    Returns:
-      A `{tagType: ['list', 'of', 'tags']}` dictionary.
-    """
-    return {
-        IMAGES: self._images.Keys(),
-        AUDIO: self._audio.Keys(),
-        HISTOGRAMS: self._histograms.Keys(),
-        SCALARS: self._scalars.Keys(),
-        COMPRESSED_HISTOGRAMS: self._compressed_histograms.Keys(),
-        TENSORS: self._tensors.Keys(),
-        # Use a heuristic: if the metagraph is available, but
-        # graph is not, then we assume the metagraph contains the graph.
-        GRAPH: self._graph is not None,
-        META_GRAPH: self._meta_graph is not None,
-        RUN_METADATA: list(self._tagged_metadata.keys())
-    }
-
-  def Scalars(self, tag):
-    """Given a summary tag, return all associated `ScalarEvent`s.
-
-    Args:
-      tag: A string tag associated with the events.
-
-    Raises:
-      KeyError: If the tag is not found.
-
-    Returns:
-      An array of `ScalarEvent`s.
-    """
-    return self._scalars.Items(tag)
-
-  def HealthPills(self, node_name):
-    """Returns all health pill values for a certain node.
-
-    Args:
-      node_name: The name of the node to obtain health pills for.
-
-    Raises:
-      KeyError: If the node name is not found.
-
-    Returns:
-      An array of `HealthPillEvent`s.
-    """
-    return self._health_pills.Items(node_name)
-
-  def GetOpsWithHealthPills(self):
-    """Determines which ops have at least 1 health pill event.
-
-    Returns:
-      A list of names of ops with at least 1 health pill event.
-    """
-    return self._health_pills.Keys()
-
-  def Graph(self):
-    """Return the graph definition, if there is one.
-
-    If the graph is stored directly, return that.  If no graph is stored
-    directly but a metagraph is stored containing a graph, return that.
-
-    Raises:
-      ValueError: If there is no graph for this run.
-
-    Returns:
-      The `graph_def` proto.
-    """
-    graph = tf.GraphDef()
-    if self._graph is not None:
-      graph.ParseFromString(self._graph)
-      return graph
-    raise ValueError('There is no graph in this EventAccumulator')
-
-  def MetaGraph(self):
-    """Return the metagraph definition, if there is one.
-
-    Raises:
-      ValueError: If there is no metagraph for this run.
-
-    Returns:
-      The `meta_graph_def` proto.
-    """
-    if self._meta_graph is None:
-      raise ValueError('There is no metagraph in this EventAccumulator')
-    meta_graph = tf.MetaGraphDef()
-    meta_graph.ParseFromString(self._meta_graph)
-    return meta_graph
-
-  def RunMetadata(self, tag):
-    """Given a tag, return the associated session.run() metadata.
-
-    Args:
-      tag: A string tag associated with the event.
-
-    Raises:
-      ValueError: If the tag is not found.
-
-    Returns:
-      The metadata in form of `RunMetadata` proto.
-    """
-    if tag not in self._tagged_metadata:
-      raise ValueError('There is no run metadata with this tag name')
-
-    run_metadata = tf.RunMetadata()
-    run_metadata.ParseFromString(self._tagged_metadata[tag])
-    return run_metadata
-
-  def Histograms(self, tag):
-    """Given a summary tag, return all associated histograms.
-
-    Args:
-      tag: A string tag associated with the events.
-
-    Raises:
-      KeyError: If the tag is not found.
-
-    Returns:
-      An array of `HistogramEvent`s.
-    """
-    return self._histograms.Items(tag)
-
-  def CompressedHistograms(self, tag):
-    """Given a summary tag, return all associated compressed histograms.
-
-    Args:
-      tag: A string tag associated with the events.
-
-    Raises:
-      KeyError: If the tag is not found.
-
-    Returns:
-      An array of `CompressedHistogramEvent`s.
-    """
-    return self._compressed_histograms.Items(tag)
-
-  def Images(self, tag):
-    """Given a summary tag, return all associated images.
-
-    Args:
-      tag: A string tag associated with the events.
-
-    Raises:
-      KeyError: If the tag is not found.
-
-    Returns:
-      An array of `ImageEvent`s.
-    """
-    return self._images.Items(tag)
-
-  def Audio(self, tag):
-    """Given a summary tag, return all associated audio.
-
-    Args:
-      tag: A string tag associated with the events.
-
-    Raises:
-      KeyError: If the tag is not found.
-
-    Returns:
-      An array of `AudioEvent`s.
-    """
-    return self._audio.Items(tag)
-
-  def Tensors(self, tag):
-    """Given a summary tag, return all associated tensors.
-
-    Args:
-      tag: A string tag associated with the events.
-
-    Raises:
-      KeyError: If the tag is not found.
-
-    Returns:
-      An array of `TensorEvent`s.
-    """
-    return self._tensors.Items(tag)
-
-  def _MaybePurgeOrphanedData(self, event):
-    """Maybe purge orphaned data due to a TensorFlow crash.
-
-    When TensorFlow crashes at step T+O and restarts at step T, any events
-    written after step T are now "orphaned" and will be at best misleading if
-    they are included in TensorBoard.
-
-    This logic attempts to determine if there is orphaned data, and purge it
-    if it is found.
-
-    Args:
-      event: The event to use as a reference, to determine if a purge is needed.
-    """
-    if not self.purge_orphaned_data:
-      return
-    ## Check if the event happened after a crash, and purge expired tags.
-    if self.file_version and self.file_version >= 2:
-      ## If the file_version is recent enough, use the SessionLog enum
-      ## to check for restarts.
-      self._CheckForRestartAndMaybePurge(event)
-    else:
-      ## If there is no file version, default to old logic of checking for
-      ## out of order steps.
-      self._CheckForOutOfOrderStepAndMaybePurge(event)
-
-  def _CheckForRestartAndMaybePurge(self, event):
-    """Check and discard expired events using SessionLog.START.
-
-    Check for a SessionLog.START event and purge all previously seen events
-    with larger steps, because they are out of date. Because of supervisor
-    threading, it is possible that this logic will cause the first few event
-    messages to be discarded since supervisor threading does not guarantee
-    that the START message is deterministically written first.
-
-    This method is preferred over _CheckForOutOfOrderStepAndMaybePurge which
-    can inadvertently discard events due to supervisor threading.
-
-    Args:
-      event: The event to use as reference. If the event is a START event, all
-        previously seen events with a greater event.step will be purged.
-    """
-    if event.HasField(
-        'session_log') and event.session_log.status == tf.SessionLog.START:
-      self._Purge(event, by_tags=False)
-
-  def _CheckForOutOfOrderStepAndMaybePurge(self, event):
-    """Check for out-of-order event.step and discard expired events for tags.
-
-    Check if the event is out of order relative to the global most recent step.
-    If it is, purge outdated summaries for tags that the event contains.
-
-    Args:
-      event: The event to use as reference. If the event is out-of-order, all
-        events with the same tags, but with a greater event.step will be purged.
-    """
-    if event.step < self.most_recent_step and event.HasField('summary'):
-      self._Purge(event, by_tags=True)
-    else:
-      self.most_recent_step = event.step
-      self.most_recent_wall_time = event.wall_time
-
-  def _ConvertHistogramProtoToTuple(self, histo):
-    return HistogramValue(min=histo.min,
-                          max=histo.max,
-                          num=histo.num,
-                          sum=histo.sum,
-                          sum_squares=histo.sum_squares,
-                          bucket_limit=list(histo.bucket_limit),
-                          bucket=list(histo.bucket))
-
-  def _ProcessHistogram(self, tag, wall_time, step, histo):
-    """Processes a proto histogram by adding it to accumulated state."""
-    histo = self._ConvertHistogramProtoToTuple(histo)
-    histo_ev = HistogramEvent(wall_time, step, histo)
-    self._histograms.AddItem(tag, histo_ev)
-    self._compressed_histograms.AddItem(
-        tag, histo_ev, lambda x: _CompressHistogram(x, self._compression_bps))
-
-  def _ProcessImage(self, tag, wall_time, step, image):
-    """Processes an image by adding it to accumulated state."""
-    event = ImageEvent(wall_time=wall_time,
-                       step=step,
-                       encoded_image_string=image.encoded_image_string,
-                       width=image.width,
-                       height=image.height)
-    self._images.AddItem(tag, event)
-
-  def _ProcessAudio(self, tag, wall_time, step, audio):
-    """Processes a audio by adding it to accumulated state."""
-    event = AudioEvent(wall_time=wall_time,
-                       step=step,
-                       encoded_audio_string=audio.encoded_audio_string,
-                       content_type=audio.content_type,
-                       sample_rate=audio.sample_rate,
-                       length_frames=audio.length_frames)
-    self._audio.AddItem(tag, event)
-
-  def _ProcessScalar(self, tag, wall_time, step, scalar):
-    """Processes a simple value by adding it to accumulated state."""
-    sv = ScalarEvent(wall_time=wall_time, step=step, value=scalar)
-    self._scalars.AddItem(tag, sv)
-
-  def _ProcessTensor(self, tag, wall_time, step, tensor):
-    tv = TensorEvent(wall_time=wall_time, step=step, tensor_proto=tensor)
-    self._tensors.AddItem(tag, tv)
-
-  def _ProcessHealthPill(self, wall_time, step, device_name, node_name,
-                         output_slot, elements):
-    """Processes a health pill value by adding it to accumulated state.
-
-    Args:
-      wall_time: The time at which the health pill was created. Provided by the
-        debugger.
-      step: The step at which the health pill was created. Provided by the
-        debugger.
-      device_name: The name of the node's device.
-      node_name: The name of the node for this health pill.
-      output_slot: The output slot for this health pill.
-      elements: An ND array of 20 floats. The elements of the health pill.
-    """
-    # Key by the node name for fast retrieval of health pills by node name. The
-    # array is cast to a list so that it is JSON-able. The debugger data plugin
-    # serves a JSON response.
-    self._health_pills.AddItem(node_name,
-                               HealthPillEvent(
-                                   wall_time=wall_time,
-                                   step=step,
-                                   device_name=device_name,
-                                   node_name=node_name,
-                                   output_slot=output_slot,
-                                   dtype=repr(tf.as_dtype(elements[12])),
-                                   shape=list(elements[14:]),
-                                   value=list(elements)))
-
-  def _Purge(self, event, by_tags):
-    """Purge all events that have occurred after the given event.step.
-
-    If by_tags is True, purge all events that occurred after the given
-    event.step, but only for the tags that the event has. Non-sequential
-    event.steps suggest that a TensorFlow restart occurred, and we discard
-    the out-of-order events to display a consistent view in TensorBoard.
-
-    Discarding by tags is the safer method, when we are unsure whether a restart
-    has occurred, given that threading in supervisor can cause events of
-    different tags to arrive with unsynchronized step values.
-
-    If by_tags is False, then purge all events with event.step greater than the
-    given event.step. This can be used when we are certain that a TensorFlow
-    restart has occurred and these events can be discarded.
-
-    Args:
-      event: The event to use as reference for the purge. All events with
-        the same tags, but with a greater event.step will be purged.
-      by_tags: Bool to dictate whether to discard all out-of-order events or
-        only those that are associated with the given reference event.
-    """
-    ## Keep data in reservoirs that has a step less than event.step
-    _NotExpired = lambda x: x.step < event.step
-
-    if by_tags:
-
-      def _ExpiredPerTag(value):
-        return [getattr(self, x).FilterItems(_NotExpired, value.tag)
-                for x in self.accumulated_attrs]
-
-      expired_per_tags = [_ExpiredPerTag(value)
-                          for value in event.summary.value]
-      expired_per_type = [sum(x) for x in zip(*expired_per_tags)]
-    else:
-      expired_per_type = [getattr(self, x).FilterItems(_NotExpired)
-                          for x in self.accumulated_attrs]
-
-    if sum(expired_per_type) > 0:
-      purge_msg = _GetPurgeMessage(self.most_recent_step,
-                                   self.most_recent_wall_time, event.step,
-                                   event.wall_time, *expired_per_type)
-      tf.logging.warn(purge_msg)
-
-
-def _GetPurgeMessage(most_recent_step, most_recent_wall_time, event_step,
-                     event_wall_time, num_expired_scalars, num_expired_histos,
-                     num_expired_comp_histos, num_expired_images,
-                     num_expired_audio):
-  """Return the string message associated with TensorBoard purges."""
-  return ('Detected out of order event.step likely caused by '
-          'a TensorFlow restart. Purging expired events from Tensorboard'
-          ' display between the previous step: {} (timestamp: {}) and '
-          'current step: {} (timestamp: {}). Removing {} scalars, {} '
-          'histograms, {} compressed histograms, {} images, '
-          'and {} audio.').format(most_recent_step, most_recent_wall_time,
-                                  event_step, event_wall_time,
-                                  num_expired_scalars, num_expired_histos,
-                                  num_expired_comp_histos, num_expired_images,
-                                  num_expired_audio)
-
-
-def _GeneratorFromPath(path):
-  """Create an event generator for file or directory at given path string."""
-  if not path:
-    raise ValueError('path must be a valid string')
-  if IsTensorFlowEventsFile(path):
-    return event_file_loader.EventFileLoader(path)
-  else:
-    return directory_watcher.DirectoryWatcher(
-        path, event_file_loader.EventFileLoader, IsTensorFlowEventsFile)
-
-
-def _ParseFileVersion(file_version):
-  """Convert the string file_version in event.proto into a float.
-
-  Args:
-    file_version: String file_version from event.proto
-
-  Returns:
-    Version number as a float.
-  """
-  tokens = file_version.split('brain.Event:')
-  try:
-    return float(tokens[-1])
-  except ValueError:
-    ## This should never happen according to the definition of file_version
-    ## specified in event.proto.
-    tf.logging.warn(
-        ('Invalid event.proto file_version. Defaulting to use of '
-         'out-of-order event.step logic for purging expired events.'))
-    return -1
-
-
-def _CompressHistogram(histo_ev, bps):
-  """Creates fixed size histogram by adding compression to accumulated state.
-
-  This routine transforms a histogram at a particular step by linearly
-  interpolating its variable number of buckets to represent their cumulative
-  weight at a constant number of compression points. This significantly reduces
-  the size of the histogram and makes it suitable for a two-dimensional area
-  plot where the output of this routine constitutes the ranges for a single x
-  coordinate.
-
-  Args:
-    histo_ev: A HistogramEvent namedtuple.
-    bps: Compression points represented in basis points, 1/100ths of a percent.
-
-  Returns:
-    CompressedHistogramEvent namedtuple.
-  """
-  # See also: Histogram::Percentile() in core/lib/histogram/histogram.cc
-  histo = histo_ev.histogram_value
-  if not histo.num:
-    return CompressedHistogramEvent(
-        histo_ev.wall_time,
-        histo_ev.step,
-        [CompressedHistogramValue(b, 0.0) for b in bps])
-  bucket = np.array(histo.bucket)
-  weights = (bucket * bps[-1] / (bucket.sum() or 1.0)).cumsum()
-  values = []
-  j = 0
-  while j < len(bps):
-    i = np.searchsorted(weights, bps[j], side='right')
-    while i < len(weights):
-      cumsum = weights[i]
-      cumsum_prev = weights[i - 1] if i > 0 else 0.0
-      if cumsum == cumsum_prev:  # prevent remap divide by zero
-        i += 1
-        continue
-      if not i or not cumsum_prev:
-        lhs = histo.min
-      else:
-        lhs = max(histo.bucket_limit[i - 1], histo.min)
-      rhs = min(histo.bucket_limit[i], histo.max)
-      weight = _Remap(bps[j], cumsum_prev, cumsum, lhs, rhs)
-      values.append(CompressedHistogramValue(bps[j], weight))
-      j += 1
-      break
-    else:
-      break
-  while j < len(bps):
-    values.append(CompressedHistogramValue(bps[j], histo.max))
-    j += 1
-  return CompressedHistogramEvent(histo_ev.wall_time, histo_ev.step, values)
-
-
-def _Remap(x, x0, x1, y0, y1):
-  """Linearly map from [x0, x1] unto [y0, y1]."""
-  return y0 + (x - x0) * float(y1 - y0) / (x1 - x0)
diff --git a/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py b/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py
deleted file mode 100644
index 4ce766f4204a2b629d691f8a557de135be8b2ab7..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py
+++ /dev/null
@@ -1,976 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-import numpy as np
-import six
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-
-from tensorflow.tensorboard.backend.event_processing import event_accumulator as ea
-
-
-class _EventGenerator(object):
-  """Class that can add_events and then yield them back.
-
-  Satisfies the EventGenerator API required for the EventAccumulator.
-  Satisfies the EventWriter API required to create a SummaryWriter.
-
-  Has additional convenience methods for adding test events.
-  """
-
-  def __init__(self, testcase, zero_out_timestamps=False):
-    self._testcase = testcase
-    self.items = []
-    self.zero_out_timestamps = zero_out_timestamps
-
-  def Load(self):
-    while self.items:
-      yield self.items.pop(0)
-
-  def AddScalar(self, tag, wall_time=0, step=0, value=0):
-    event = tf.Event(
-        wall_time=wall_time,
-        step=step,
-        summary=tf.Summary(
-            value=[tf.Summary.Value(tag=tag, simple_value=value)]))
-    self.AddEvent(event)
-
-  def AddHealthPill(self, wall_time, step, device_name, op_name, output_slot,
-                    elements):
-    event = tf.Event(step=step, wall_time=wall_time)
-    value = event.summary.value.add(
-        tag=ea.HEALTH_PILL_EVENT_TAG_PREFIX + device_name,
-        node_name='%s:%d:DebugNumericSummary' % (op_name, output_slot))
-    value.tensor.tensor_shape.dim.add(size=len(elements))
-    value.tensor.dtype = 2  # DT_DOUBLE
-    value.tensor.tensor_content = np.array(elements, dtype=np.float64).tobytes()
-    self.AddEvent(event)
-
-  def AddHistogram(self,
-                   tag,
-                   wall_time=0,
-                   step=0,
-                   hmin=1,
-                   hmax=2,
-                   hnum=3,
-                   hsum=4,
-                   hsum_squares=5,
-                   hbucket_limit=None,
-                   hbucket=None):
-    histo = tf.HistogramProto(
-        min=hmin,
-        max=hmax,
-        num=hnum,
-        sum=hsum,
-        sum_squares=hsum_squares,
-        bucket_limit=hbucket_limit,
-        bucket=hbucket)
-    event = tf.Event(
-        wall_time=wall_time,
-        step=step,
-        summary=tf.Summary(value=[tf.Summary.Value(tag=tag, histo=histo)]))
-    self.AddEvent(event)
-
-  def AddImage(self,
-               tag,
-               wall_time=0,
-               step=0,
-               encoded_image_string=b'imgstr',
-               width=150,
-               height=100):
-    image = tf.Summary.Image(
-        encoded_image_string=encoded_image_string, width=width, height=height)
-    event = tf.Event(
-        wall_time=wall_time,
-        step=step,
-        summary=tf.Summary(value=[tf.Summary.Value(tag=tag, image=image)]))
-    self.AddEvent(event)
-
-  def AddAudio(self,
-               tag,
-               wall_time=0,
-               step=0,
-               encoded_audio_string=b'sndstr',
-               content_type='audio/wav',
-               sample_rate=44100,
-               length_frames=22050):
-    audio = tf.Summary.Audio(
-        encoded_audio_string=encoded_audio_string,
-        content_type=content_type,
-        sample_rate=sample_rate,
-        length_frames=length_frames)
-    event = tf.Event(
-        wall_time=wall_time,
-        step=step,
-        summary=tf.Summary(value=[tf.Summary.Value(tag=tag, audio=audio)]))
-    self.AddEvent(event)
-
-  def AddEvent(self, event):
-    if self.zero_out_timestamps:
-      event.wall_time = 0
-    self.items.append(event)
-
-  def add_event(self, event):  # pylint: disable=invalid-name
-    """Match the EventWriter API."""
-    self.AddEvent(event)
-
-  def get_logdir(self):  # pylint: disable=invalid-name
-    """Return a temp directory for asset writing."""
-    return self._testcase.get_temp_dir()
-
-
-class EventAccumulatorTest(tf.test.TestCase):
-
-  def assertTagsEqual(self, actual, expected):
-    """Utility method for checking the return value of the Tags() call.
-
-    It fills out the `expected` arg with the default (empty) values for every
-    tag type, so that the author needs only specify the non-empty values they
-    are interested in testing.
-
-    Args:
-      actual: The actual Accumulator tags response.
-      expected: The expected tags response (empty fields may be omitted)
-    """
-
-    empty_tags = {
-        ea.IMAGES: [],
-        ea.AUDIO: [],
-        ea.SCALARS: [],
-        ea.HISTOGRAMS: [],
-        ea.COMPRESSED_HISTOGRAMS: [],
-        ea.GRAPH: False,
-        ea.META_GRAPH: False,
-        ea.RUN_METADATA: [],
-        ea.TENSORS: [],
-    }
-
-    # Verifies that there are no unexpected keys in the actual response.
-    # If this line fails, likely you added a new tag type, and need to update
-    # the empty_tags dictionary above.
-    self.assertItemsEqual(actual.keys(), empty_tags.keys())
-
-    for key in actual:
-      expected_value = expected.get(key, empty_tags[key])
-      if isinstance(expected_value, list):
-        self.assertItemsEqual(actual[key], expected_value)
-      else:
-        self.assertEqual(actual[key], expected_value)
-
-
-class MockingEventAccumulatorTest(EventAccumulatorTest):
-
-  def setUp(self):
-    super(MockingEventAccumulatorTest, self).setUp()
-    self.stubs = tf.test.StubOutForTesting()
-    self._real_constructor = ea.EventAccumulator
-    self._real_generator = ea._GeneratorFromPath
-
-    def _FakeAccumulatorConstructor(generator, *args, **kwargs):
-      ea._GeneratorFromPath = lambda x: generator
-      return self._real_constructor(generator, *args, **kwargs)
-
-    ea.EventAccumulator = _FakeAccumulatorConstructor
-
-  def tearDown(self):
-    self.stubs.CleanUp()
-    ea.EventAccumulator = self._real_constructor
-    ea._GeneratorFromPath = self._real_generator
-
-  def testEmptyAccumulator(self):
-    gen = _EventGenerator(self)
-    x = ea.EventAccumulator(gen)
-    x.Reload()
-    self.assertTagsEqual(x.Tags(), {})
-
-  def testTags(self):
-    """Tags should be found in EventAccumulator after adding some events."""
-    gen = _EventGenerator(self)
-    gen.AddScalar('s1')
-    gen.AddScalar('s2')
-    gen.AddHistogram('hst1')
-    gen.AddHistogram('hst2')
-    gen.AddImage('im1')
-    gen.AddImage('im2')
-    gen.AddAudio('snd1')
-    gen.AddAudio('snd2')
-    acc = ea.EventAccumulator(gen)
-    acc.Reload()
-    self.assertTagsEqual(acc.Tags(), {
-        ea.IMAGES: ['im1', 'im2'],
-        ea.AUDIO: ['snd1', 'snd2'],
-        ea.SCALARS: ['s1', 's2'],
-        ea.HISTOGRAMS: ['hst1', 'hst2'],
-        ea.COMPRESSED_HISTOGRAMS: ['hst1', 'hst2'],
-    })
-
-  def testReload(self):
-    """EventAccumulator contains suitable tags after calling Reload."""
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen)
-    acc.Reload()
-    self.assertTagsEqual(acc.Tags(), {})
-    gen.AddScalar('s1')
-    gen.AddScalar('s2')
-    gen.AddHistogram('hst1')
-    gen.AddHistogram('hst2')
-    gen.AddImage('im1')
-    gen.AddImage('im2')
-    gen.AddAudio('snd1')
-    gen.AddAudio('snd2')
-    acc.Reload()
-    self.assertTagsEqual(acc.Tags(), {
-        ea.IMAGES: ['im1', 'im2'],
-        ea.AUDIO: ['snd1', 'snd2'],
-        ea.SCALARS: ['s1', 's2'],
-        ea.HISTOGRAMS: ['hst1', 'hst2'],
-        ea.COMPRESSED_HISTOGRAMS: ['hst1', 'hst2'],
-    })
-
-  def testScalars(self):
-    """Tests whether EventAccumulator contains scalars after adding them."""
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen)
-    s1 = ea.ScalarEvent(wall_time=1, step=10, value=32)
-    s2 = ea.ScalarEvent(wall_time=2, step=12, value=64)
-    gen.AddScalar('s1', wall_time=1, step=10, value=32)
-    gen.AddScalar('s2', wall_time=2, step=12, value=64)
-    acc.Reload()
-    self.assertEqual(acc.Scalars('s1'), [s1])
-    self.assertEqual(acc.Scalars('s2'), [s2])
-
-  def _compareHealthPills(self, expected_event, gotten_event):
-    """Compares 2 health pills.
-
-    Args:
-      expected_event: The expected HealthPillEvent.
-      gotten_event: The gotten HealthPillEvent.
-    """
-    self.assertEqual(expected_event.wall_time, gotten_event.wall_time)
-    self.assertEqual(expected_event.step, gotten_event.step)
-    self.assertEqual(expected_event.device_name, gotten_event.device_name)
-    self.assertEqual(expected_event.node_name, gotten_event.node_name)
-    self.assertEqual(expected_event.output_slot, gotten_event.output_slot)
-    self.assertEqual(len(expected_event.value), len(gotten_event.value))
-    for i, expected_value in enumerate(expected_event.value):
-      self.assertEqual(expected_value, gotten_event.value[i])
-
-  def testHealthPills(self):
-    """HealthPills should be properly inserted into EventAccumulator."""
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen)
-    health_pill_elements_1 = list(range(1, 13)) + [
-        float(1), 2.0, 1.0, 2.0, 0.0, 0.0, 0.0, 0.0]
-    gen.AddHealthPill(13371337, 41, '/job:localhost/replica:0/task:0/cpu:0',
-                      'Add', 0, health_pill_elements_1)
-    health_pill_elements_2 = list(range(42, 54)) + [
-        float(2), 2.0, 1.0, 2.0, 0.0, 0.0, 0.0, 0.0]
-    gen.AddHealthPill(13381338, 42, '/job:localhost/replica:0/task:0/gpu:0',
-                      'Add', 1, health_pill_elements_2)
-    acc.Reload()
-
-    # Retrieve the health pills for each node name.
-    gotten_events = acc.HealthPills('Add')
-    self.assertEquals(2, len(gotten_events))
-    self._compareHealthPills(
-        ea.HealthPillEvent(
-            wall_time=13371337,
-            step=41,
-            device_name='/job:localhost/replica:0/task:0/cpu:0',
-            node_name='Add',
-            output_slot=0,
-            dtype='tf.float32',
-            shape=[1, 2],
-            value=health_pill_elements_1), gotten_events[0])
-    self._compareHealthPills(
-        ea.HealthPillEvent(
-            wall_time=13381338,
-            device_name='/job:localhost/replica:0/task:0/gpu:0',
-            step=42,
-            node_name='Add',
-            output_slot=1,
-            dtype='tf.float64',
-            shape=[3, 4],
-            value=health_pill_elements_2), gotten_events[1])
-
-  def testGetOpsWithHealthPills(self):
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen)
-    health_pill_elements_1 = list(range(1, 13)) + [
-        float(1), 2.0, 1.0, 2.0, 0.0, 0.0, 0.0, 0.0]
-    gen.AddHealthPill(13371337, 41, '/job:localhost/replica:0/task:0/cpu:0',
-                      'Add', 0, health_pill_elements_1)
-    health_pill_elements_2 = list(range(42, 54)) + [
-        float(2), 2.0, 1.0, 2.0, 0.0, 0.0, 0.0, 0.0]
-    gen.AddHealthPill(13381338, 42, '/job:localhost/replica:0/task:0/cpu:0',
-                      'MatMul', 1, health_pill_elements_2)
-    acc.Reload()
-    self.assertItemsEqual(['Add', 'MatMul'], acc.GetOpsWithHealthPills())
-
-  def testHistograms(self):
-    """Tests whether histograms are inserted into EventAccumulator."""
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen)
-
-    val1 = ea.HistogramValue(
-        min=1,
-        max=2,
-        num=3,
-        sum=4,
-        sum_squares=5,
-        bucket_limit=[1, 2, 3],
-        bucket=[0, 3, 0])
-    val2 = ea.HistogramValue(
-        min=-2,
-        max=3,
-        num=4,
-        sum=5,
-        sum_squares=6,
-        bucket_limit=[2, 3, 4],
-        bucket=[1, 3, 0])
-
-    hst1 = ea.HistogramEvent(wall_time=1, step=10, histogram_value=val1)
-    hst2 = ea.HistogramEvent(wall_time=2, step=12, histogram_value=val2)
-    gen.AddHistogram(
-        'hst1',
-        wall_time=1,
-        step=10,
-        hmin=1,
-        hmax=2,
-        hnum=3,
-        hsum=4,
-        hsum_squares=5,
-        hbucket_limit=[1, 2, 3],
-        hbucket=[0, 3, 0])
-    gen.AddHistogram(
-        'hst2',
-        wall_time=2,
-        step=12,
-        hmin=-2,
-        hmax=3,
-        hnum=4,
-        hsum=5,
-        hsum_squares=6,
-        hbucket_limit=[2, 3, 4],
-        hbucket=[1, 3, 0])
-    acc.Reload()
-    self.assertEqual(acc.Histograms('hst1'), [hst1])
-    self.assertEqual(acc.Histograms('hst2'), [hst2])
-
-  def testCompressedHistograms(self):
-    """Tests compressed histograms inserted into EventAccumulator."""
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen, compression_bps=(0, 2500, 5000, 7500, 10000))
-
-    gen.AddHistogram(
-        'hst1',
-        wall_time=1,
-        step=10,
-        hmin=1,
-        hmax=2,
-        hnum=3,
-        hsum=4,
-        hsum_squares=5,
-        hbucket_limit=[1, 2, 3],
-        hbucket=[0, 3, 0])
-    gen.AddHistogram(
-        'hst2',
-        wall_time=2,
-        step=12,
-        hmin=-2,
-        hmax=3,
-        hnum=4,
-        hsum=5,
-        hsum_squares=6,
-        hbucket_limit=[2, 3, 4],
-        hbucket=[1, 3, 0])
-    acc.Reload()
-
-    # Create the expected values after compressing hst1
-    expected_vals1 = [
-        ea.CompressedHistogramValue(bp, val)
-        for bp, val in [(0, 1.0), (2500, 1.25), (5000, 1.5), (7500, 1.75
-                                                             ), (10000, 2.0)]
-    ]
-    expected_cmphst1 = ea.CompressedHistogramEvent(
-        wall_time=1, step=10, compressed_histogram_values=expected_vals1)
-    self.assertEqual(acc.CompressedHistograms('hst1'), [expected_cmphst1])
-
-    # Create the expected values after compressing hst2
-    expected_vals2 = [
-        ea.CompressedHistogramValue(bp, val)
-        for bp, val in [(0, -2),
-                        (2500, 2),
-                        (5000, 2 + 1 / 3),
-                        (7500, 2 + 2 / 3),
-                        (10000, 3)]
-    ]
-    expected_cmphst2 = ea.CompressedHistogramEvent(
-        wall_time=2, step=12, compressed_histogram_values=expected_vals2)
-    self.assertEqual(acc.CompressedHistograms('hst2'), [expected_cmphst2])
-
-  def testCompressedHistogramsWithEmptyHistogram(self):
-    """Tests that empty histograms compressed properly in EventAccumulator."""
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen, compression_bps=(0, 2500, 5000, 7500, 10000))
-
-    gen.AddHistogram(
-        'hst1',
-        wall_time=1,
-        step=10,
-        hmin=None,
-        hmax=None,
-        hnum=0,
-        hsum=0,
-        hsum_squares=0,
-        hbucket_limit=[1, 2, 3],
-        hbucket=[0, 0, 0])
-    acc.Reload()
-
-    # Create the expected values after compressing hst1
-    expected_vals1 = [
-        ea.CompressedHistogramValue(bp, val)
-        for bp, val in [(0, 0.0), (2500, 0), (5000, 0), (7500, 0), (10000, 0)]
-    ]
-    expected_cmphst1 = ea.CompressedHistogramEvent(
-        wall_time=1, step=10, compressed_histogram_values=expected_vals1)
-    self.assertEqual(acc.CompressedHistograms('hst1'), [expected_cmphst1])
-
-  def testCompressHistogram_uglyHistogram(self):
-    bps = (0, 668, 1587, 3085, 5000, 6915, 8413, 9332, 10000)
-    histogram_values = ea.HistogramValue(
-        min=0.0,
-        max=1.0,
-        num=960.0,
-        sum=64.0,
-        sum_squares=64.0,
-        bucket_limit=[
-            0.0, 1e-12, 0.917246389039776, 1.0089710279437536,
-            1.7976931348623157e+308
-        ],
-        bucket=[0.0, 896.0, 0.0, 64.0, 0.0])
-    histogram_event = ea.HistogramEvent(0, 0, histogram_values)
-    compressed_event = ea._CompressHistogram(histogram_event, bps)
-    vals = compressed_event.compressed_histogram_values
-    self.assertEquals(tuple(v.basis_point for v in vals), bps)
-    self.assertAlmostEqual(vals[0].value, 0.0)
-    self.assertAlmostEqual(vals[1].value, 7.157142857142856e-14)
-    self.assertAlmostEqual(vals[2].value, 1.7003571428571426e-13)
-    self.assertAlmostEqual(vals[3].value, 3.305357142857143e-13)
-    self.assertAlmostEqual(vals[4].value, 5.357142857142857e-13)
-    self.assertAlmostEqual(vals[5].value, 7.408928571428571e-13)
-    self.assertAlmostEqual(vals[6].value, 9.013928571428571e-13)
-    self.assertAlmostEqual(vals[7].value, 9.998571428571429e-13)
-    self.assertAlmostEqual(vals[8].value, 1.0)
-
-  def testImages(self):
-    """Tests 2 images inserted/accessed in EventAccumulator."""
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen)
-    im1 = ea.ImageEvent(
-        wall_time=1,
-        step=10,
-        encoded_image_string=b'big',
-        width=400,
-        height=300)
-    im2 = ea.ImageEvent(
-        wall_time=2,
-        step=12,
-        encoded_image_string=b'small',
-        width=40,
-        height=30)
-    gen.AddImage(
-        'im1',
-        wall_time=1,
-        step=10,
-        encoded_image_string=b'big',
-        width=400,
-        height=300)
-    gen.AddImage(
-        'im2',
-        wall_time=2,
-        step=12,
-        encoded_image_string=b'small',
-        width=40,
-        height=30)
-    acc.Reload()
-    self.assertEqual(acc.Images('im1'), [im1])
-    self.assertEqual(acc.Images('im2'), [im2])
-
-  def testAudio(self):
-    """Tests 2 audio events inserted/accessed in EventAccumulator."""
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen)
-    snd1 = ea.AudioEvent(
-        wall_time=1,
-        step=10,
-        encoded_audio_string=b'big',
-        content_type='audio/wav',
-        sample_rate=44100,
-        length_frames=441000)
-    snd2 = ea.AudioEvent(
-        wall_time=2,
-        step=12,
-        encoded_audio_string=b'small',
-        content_type='audio/wav',
-        sample_rate=44100,
-        length_frames=44100)
-    gen.AddAudio(
-        'snd1',
-        wall_time=1,
-        step=10,
-        encoded_audio_string=b'big',
-        content_type='audio/wav',
-        sample_rate=44100,
-        length_frames=441000)
-    gen.AddAudio(
-        'snd2',
-        wall_time=2,
-        step=12,
-        encoded_audio_string=b'small',
-        content_type='audio/wav',
-        sample_rate=44100,
-        length_frames=44100)
-    acc.Reload()
-    self.assertEqual(acc.Audio('snd1'), [snd1])
-    self.assertEqual(acc.Audio('snd2'), [snd2])
-
-  def testKeyError(self):
-    """KeyError should be raised when accessing non-existing keys."""
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen)
-    acc.Reload()
-    with self.assertRaises(KeyError):
-      acc.Scalars('s1')
-    with self.assertRaises(KeyError):
-      acc.Scalars('hst1')
-    with self.assertRaises(KeyError):
-      acc.Scalars('im1')
-    with self.assertRaises(KeyError):
-      acc.Histograms('s1')
-    with self.assertRaises(KeyError):
-      acc.Histograms('im1')
-    with self.assertRaises(KeyError):
-      acc.Images('s1')
-    with self.assertRaises(KeyError):
-      acc.Images('hst1')
-    with self.assertRaises(KeyError):
-      acc.Audio('s1')
-    with self.assertRaises(KeyError):
-      acc.Audio('hst1')
-
-  def testNonValueEvents(self):
-    """Non-value events in the generator don't cause early exits."""
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen)
-    gen.AddScalar('s1', wall_time=1, step=10, value=20)
-    gen.AddEvent(tf.Event(wall_time=2, step=20, file_version='nots2'))
-    gen.AddScalar('s3', wall_time=3, step=100, value=1)
-    gen.AddHistogram('hst1')
-    gen.AddImage('im1')
-    gen.AddAudio('snd1')
-
-    acc.Reload()
-    self.assertTagsEqual(acc.Tags(), {
-        ea.IMAGES: ['im1'],
-        ea.AUDIO: ['snd1'],
-        ea.SCALARS: ['s1', 's3'],
-        ea.HISTOGRAMS: ['hst1'],
-        ea.COMPRESSED_HISTOGRAMS: ['hst1'],
-    })
-
-  def testExpiredDataDiscardedAfterRestartForFileVersionLessThan2(self):
-    """Tests that events are discarded after a restart is detected.
-
-    If a step value is observed to be lower than what was previously seen,
-    this should force a discard of all previous items with the same tag
-    that are outdated.
-
-    Only file versions < 2 use this out-of-order discard logic. Later versions
-    discard events based on the step value of SessionLog.START.
-    """
-    warnings = []
-    self.stubs.Set(tf.logging, 'warn', warnings.append)
-
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen)
-
-    gen.AddEvent(tf.Event(wall_time=0, step=0, file_version='brain.Event:1'))
-    gen.AddScalar('s1', wall_time=1, step=100, value=20)
-    gen.AddScalar('s1', wall_time=1, step=200, value=20)
-    gen.AddScalar('s1', wall_time=1, step=300, value=20)
-    acc.Reload()
-    ## Check that number of items are what they should be
-    self.assertEqual([x.step for x in acc.Scalars('s1')], [100, 200, 300])
-
-    gen.AddScalar('s1', wall_time=1, step=101, value=20)
-    gen.AddScalar('s1', wall_time=1, step=201, value=20)
-    gen.AddScalar('s1', wall_time=1, step=301, value=20)
-    acc.Reload()
-    ## Check that we have discarded 200 and 300 from s1
-    self.assertEqual([x.step for x in acc.Scalars('s1')], [100, 101, 201, 301])
-
-  def testOrphanedDataNotDiscardedIfFlagUnset(self):
-    """Tests that events are not discarded if purge_orphaned_data is false.
-    """
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen, purge_orphaned_data=False)
-
-    gen.AddEvent(tf.Event(wall_time=0, step=0, file_version='brain.Event:1'))
-    gen.AddScalar('s1', wall_time=1, step=100, value=20)
-    gen.AddScalar('s1', wall_time=1, step=200, value=20)
-    gen.AddScalar('s1', wall_time=1, step=300, value=20)
-    acc.Reload()
-    ## Check that number of items are what they should be
-    self.assertEqual([x.step for x in acc.Scalars('s1')], [100, 200, 300])
-
-    gen.AddScalar('s1', wall_time=1, step=101, value=20)
-    gen.AddScalar('s1', wall_time=1, step=201, value=20)
-    gen.AddScalar('s1', wall_time=1, step=301, value=20)
-    acc.Reload()
-    ## Check that we have discarded 200 and 300 from s1
-    self.assertEqual([x.step for x in acc.Scalars('s1')],
-                     [100, 200, 300, 101, 201, 301])
-
-  def testEventsDiscardedPerTagAfterRestartForFileVersionLessThan2(self):
-    """Tests that event discards after restart, only affect the misordered tag.
-
-    If a step value is observed to be lower than what was previously seen,
-    this should force a discard of all previous items that are outdated, but
-    only for the out of order tag. Other tags should remain unaffected.
-
-    Only file versions < 2 use this out-of-order discard logic. Later versions
-    discard events based on the step value of SessionLog.START.
-    """
-    warnings = []
-    self.stubs.Set(tf.logging, 'warn', warnings.append)
-
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen)
-
-    gen.AddEvent(tf.Event(wall_time=0, step=0, file_version='brain.Event:1'))
-    gen.AddScalar('s1', wall_time=1, step=100, value=20)
-    gen.AddScalar('s1', wall_time=1, step=200, value=20)
-    gen.AddScalar('s1', wall_time=1, step=300, value=20)
-    gen.AddScalar('s1', wall_time=1, step=101, value=20)
-    gen.AddScalar('s1', wall_time=1, step=201, value=20)
-    gen.AddScalar('s1', wall_time=1, step=301, value=20)
-
-    gen.AddScalar('s2', wall_time=1, step=101, value=20)
-    gen.AddScalar('s2', wall_time=1, step=201, value=20)
-    gen.AddScalar('s2', wall_time=1, step=301, value=20)
-
-    acc.Reload()
-    ## Check that we have discarded 200 and 300
-    self.assertEqual([x.step for x in acc.Scalars('s1')], [100, 101, 201, 301])
-
-    ## Check that s1 discards do not affect s2
-    ## i.e. check that only events from the out of order tag are discarded
-    self.assertEqual([x.step for x in acc.Scalars('s2')], [101, 201, 301])
-
-  def testOnlySummaryEventsTriggerDiscards(self):
-    """Test that file version event does not trigger data purge."""
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen)
-    gen.AddScalar('s1', wall_time=1, step=100, value=20)
-    ev1 = tf.Event(wall_time=2, step=0, file_version='brain.Event:1')
-    graph_bytes = tf.GraphDef().SerializeToString()
-    ev2 = tf.Event(wall_time=3, step=0, graph_def=graph_bytes)
-    gen.AddEvent(ev1)
-    gen.AddEvent(ev2)
-    acc.Reload()
-    self.assertEqual([x.step for x in acc.Scalars('s1')], [100])
-
-  def testSessionLogStartMessageDiscardsExpiredEvents(self):
-    """Test that SessionLog.START message discards expired events.
-
-    This discard logic is preferred over the out-of-order step discard logic,
-    but this logic can only be used for event protos which have the SessionLog
-    enum, which was introduced to event.proto for file_version >= brain.Event:2.
-    """
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen)
-    gen.AddEvent(tf.Event(wall_time=0, step=1, file_version='brain.Event:2'))
-
-    gen.AddScalar('s1', wall_time=1, step=100, value=20)
-    gen.AddScalar('s1', wall_time=1, step=200, value=20)
-    gen.AddScalar('s1', wall_time=1, step=300, value=20)
-    gen.AddScalar('s1', wall_time=1, step=400, value=20)
-
-    gen.AddScalar('s2', wall_time=1, step=202, value=20)
-    gen.AddScalar('s2', wall_time=1, step=203, value=20)
-
-    slog = tf.SessionLog(status=tf.SessionLog.START)
-    gen.AddEvent(tf.Event(wall_time=2, step=201, session_log=slog))
-    acc.Reload()
-    self.assertEqual([x.step for x in acc.Scalars('s1')], [100, 200])
-    self.assertEqual([x.step for x in acc.Scalars('s2')], [])
-
-  def testFirstEventTimestamp(self):
-    """Test that FirstEventTimestamp() returns wall_time of the first event."""
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen)
-    gen.AddEvent(tf.Event(wall_time=10, step=20, file_version='brain.Event:2'))
-    gen.AddScalar('s1', wall_time=30, step=40, value=20)
-    self.assertEqual(acc.FirstEventTimestamp(), 10)
-
-  def testReloadPopulatesFirstEventTimestamp(self):
-    """Test that Reload() means FirstEventTimestamp() won't load events."""
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen)
-    gen.AddEvent(tf.Event(wall_time=1, step=2, file_version='brain.Event:2'))
-
-    acc.Reload()
-
-    def _Die(*args, **kwargs):  # pylint: disable=unused-argument
-      raise RuntimeError('Load() should not be called')
-
-    self.stubs.Set(gen, 'Load', _Die)
-    self.assertEqual(acc.FirstEventTimestamp(), 1)
-
-  def testFirstEventTimestampLoadsEvent(self):
-    """Test that FirstEventTimestamp() doesn't discard the loaded event."""
-    gen = _EventGenerator(self)
-    acc = ea.EventAccumulator(gen)
-    gen.AddEvent(tf.Event(wall_time=1, step=2, file_version='brain.Event:2'))
-
-    self.assertEqual(acc.FirstEventTimestamp(), 1)
-    acc.Reload()
-    self.assertEqual(acc.file_version, 2.0)
-
-  def testTFSummaryScalar(self):
-    """Verify processing of tf.summary.scalar."""
-    event_sink = _EventGenerator(self, zero_out_timestamps=True)
-    writer = tf.summary.FileWriter(self.get_temp_dir())
-    writer.event_writer = event_sink
-    with self.test_session() as sess:
-      ipt = tf.placeholder(tf.float32)
-      tf.summary.scalar('scalar1', ipt)
-      tf.summary.scalar('scalar2', ipt * ipt)
-      merged = tf.summary.merge_all()
-      writer.add_graph(sess.graph)
-      for i in xrange(10):
-        summ = sess.run(merged, feed_dict={ipt: i})
-        writer.add_summary(summ, global_step=i)
-
-    accumulator = ea.EventAccumulator(event_sink)
-    accumulator.Reload()
-
-    seq1 = [ea.ScalarEvent(wall_time=0, step=i, value=i) for i in xrange(10)]
-    seq2 = [
-        ea.ScalarEvent(
-            wall_time=0, step=i, value=i * i) for i in xrange(10)
-    ]
-
-    self.assertTagsEqual(accumulator.Tags(), {
-        ea.SCALARS: ['scalar1', 'scalar2'],
-        ea.GRAPH: True,
-        ea.META_GRAPH: False,
-    })
-
-    self.assertEqual(accumulator.Scalars('scalar1'), seq1)
-    self.assertEqual(accumulator.Scalars('scalar2'), seq2)
-    first_value = accumulator.Scalars('scalar1')[0].value
-    self.assertTrue(isinstance(first_value, float))
-
-  def testTFSummaryImage(self):
-    """Verify processing of tf.summary.image."""
-    event_sink = _EventGenerator(self, zero_out_timestamps=True)
-    writer = tf.summary.FileWriter(self.get_temp_dir())
-    writer.event_writer = event_sink
-    with self.test_session() as sess:
-      ipt = tf.ones([10, 4, 4, 3], tf.uint8)
-      # This is an interesting example, because the old tf.image_summary op
-      # would throw an error here, because it would be tag reuse.
-      # Using the tf node name instead allows argument re-use to the image
-      # summary.
-      with tf.name_scope('1'):
-        tf.summary.image('images', ipt, max_outputs=1)
-      with tf.name_scope('2'):
-        tf.summary.image('images', ipt, max_outputs=2)
-      with tf.name_scope('3'):
-        tf.summary.image('images', ipt, max_outputs=3)
-      merged = tf.summary.merge_all()
-      writer.add_graph(sess.graph)
-      for i in xrange(10):
-        summ = sess.run(merged)
-        writer.add_summary(summ, global_step=i)
-
-    accumulator = ea.EventAccumulator(event_sink)
-    accumulator.Reload()
-
-    tags = [
-        u'1/images/image', u'2/images/image/0', u'2/images/image/1',
-        u'3/images/image/0', u'3/images/image/1', u'3/images/image/2'
-    ]
-
-    self.assertTagsEqual(accumulator.Tags(), {
-        ea.IMAGES: tags,
-        ea.GRAPH: True,
-        ea.META_GRAPH: False,
-    })
-
-  def testTFSummaryTensor(self):
-    """Verify processing of tf.summary.tensor."""
-    event_sink = _EventGenerator(self, zero_out_timestamps=True)
-    writer = tf.summary.FileWriter(self.get_temp_dir())
-    writer.event_writer = event_sink
-    with self.test_session() as sess:
-      tf.summary.tensor_summary('scalar', tf.constant(1.0))
-      tf.summary.tensor_summary('vector', tf.constant([1.0, 2.0, 3.0]))
-      tf.summary.tensor_summary('string', tf.constant(six.b('foobar')))
-      merged = tf.summary.merge_all()
-      summ = sess.run(merged)
-      writer.add_summary(summ, 0)
-
-    accumulator = ea.EventAccumulator(event_sink)
-    accumulator.Reload()
-
-    self.assertTagsEqual(accumulator.Tags(), {
-        ea.TENSORS: ['scalar', 'vector', 'string'],
-    })
-
-    scalar_proto = accumulator.Tensors('scalar')[0].tensor_proto
-    scalar = tf.make_ndarray(scalar_proto)
-    vector_proto = accumulator.Tensors('vector')[0].tensor_proto
-    vector = tf.make_ndarray(vector_proto)
-    string_proto = accumulator.Tensors('string')[0].tensor_proto
-    string = tf.make_ndarray(string_proto)
-
-    self.assertTrue(np.array_equal(scalar, 1.0))
-    self.assertTrue(np.array_equal(vector, [1.0, 2.0, 3.0]))
-    self.assertTrue(np.array_equal(string, six.b('foobar')))
-
-
-class RealisticEventAccumulatorTest(EventAccumulatorTest):
-
-  def setUp(self):
-    super(RealisticEventAccumulatorTest, self).setUp()
-
-  def testScalarsRealistically(self):
-    """Test accumulator by writing values and then reading them."""
-
-    def FakeScalarSummary(tag, value):
-      value = tf.Summary.Value(tag=tag, simple_value=value)
-      summary = tf.Summary(value=[value])
-      return summary
-
-    directory = os.path.join(self.get_temp_dir(), 'values_dir')
-    if tf.gfile.IsDirectory(directory):
-      tf.gfile.DeleteRecursively(directory)
-    tf.gfile.MkDir(directory)
-
-    writer = tf.summary.FileWriter(directory, max_queue=100)
-
-    with tf.Graph().as_default() as graph:
-      _ = tf.constant([2.0, 1.0])
-    # Add a graph to the summary writer.
-    writer.add_graph(graph)
-    meta_graph_def = tf.train.export_meta_graph(graph_def=graph.as_graph_def(
-        add_shapes=True))
-    writer.add_meta_graph(meta_graph_def)
-
-    run_metadata = tf.RunMetadata()
-    device_stats = run_metadata.step_stats.dev_stats.add()
-    device_stats.device = 'test device'
-    writer.add_run_metadata(run_metadata, 'test run')
-
-    # Write a bunch of events using the writer.
-    for i in xrange(30):
-      summ_id = FakeScalarSummary('id', i)
-      summ_sq = FakeScalarSummary('sq', i * i)
-      writer.add_summary(summ_id, i * 5)
-      writer.add_summary(summ_sq, i * 5)
-    writer.flush()
-
-    # Verify that we can load those events properly
-    acc = ea.EventAccumulator(directory)
-    acc.Reload()
-    self.assertTagsEqual(acc.Tags(), {
-        ea.SCALARS: ['id', 'sq'],
-        ea.GRAPH: True,
-        ea.META_GRAPH: True,
-        ea.RUN_METADATA: ['test run'],
-    })
-    id_events = acc.Scalars('id')
-    sq_events = acc.Scalars('sq')
-    self.assertEqual(30, len(id_events))
-    self.assertEqual(30, len(sq_events))
-    for i in xrange(30):
-      self.assertEqual(i * 5, id_events[i].step)
-      self.assertEqual(i * 5, sq_events[i].step)
-      self.assertEqual(i, id_events[i].value)
-      self.assertEqual(i * i, sq_events[i].value)
-
-    # Write a few more events to test incremental reloading
-    for i in xrange(30, 40):
-      summ_id = FakeScalarSummary('id', i)
-      summ_sq = FakeScalarSummary('sq', i * i)
-      writer.add_summary(summ_id, i * 5)
-      writer.add_summary(summ_sq, i * 5)
-    writer.flush()
-
-    # Verify we can now see all of the data
-    acc.Reload()
-    id_events = acc.Scalars('id')
-    sq_events = acc.Scalars('sq')
-    self.assertEqual(40, len(id_events))
-    self.assertEqual(40, len(sq_events))
-    for i in xrange(40):
-      self.assertEqual(i * 5, id_events[i].step)
-      self.assertEqual(i * 5, sq_events[i].step)
-      self.assertEqual(i, id_events[i].value)
-      self.assertEqual(i * i, sq_events[i].value)
-    self.assertProtoEquals(graph.as_graph_def(add_shapes=True), acc.Graph())
-    self.assertProtoEquals(meta_graph_def, acc.MetaGraph())
-
-  def testGraphFromMetaGraphBecomesAvailable(self):
-    """Test accumulator by writing values and then reading them."""
-
-    directory = os.path.join(self.get_temp_dir(), 'metagraph_test_values_dir')
-    if tf.gfile.IsDirectory(directory):
-      tf.gfile.DeleteRecursively(directory)
-    tf.gfile.MkDir(directory)
-
-    writer = tf.summary.FileWriter(directory, max_queue=100)
-
-    with tf.Graph().as_default() as graph:
-      _ = tf.constant([2.0, 1.0])
-    # Add a graph to the summary writer.
-    meta_graph_def = tf.train.export_meta_graph(graph_def=graph.as_graph_def(
-        add_shapes=True))
-    writer.add_meta_graph(meta_graph_def)
-
-    writer.flush()
-
-    # Verify that we can load those events properly
-    acc = ea.EventAccumulator(directory)
-    acc.Reload()
-    self.assertTagsEqual(acc.Tags(), {
-        ea.GRAPH: True,
-        ea.META_GRAPH: True,
-    })
-    self.assertProtoEquals(graph.as_graph_def(add_shapes=True), acc.Graph())
-    self.assertProtoEquals(meta_graph_def, acc.MetaGraph())
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/tensorboard/backend/event_processing/event_file_inspector.py b/tensorflow/tensorboard/backend/event_processing/event_file_inspector.py
deleted file mode 100644
index e120dd2ab160dc4e8d79a905e59d450010aaff78..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/event_processing/event_file_inspector.py
+++ /dev/null
@@ -1,427 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Logic for TensorBoard inspector to help humans investigate event files.
-
-Example usages:
-tensorboard --inspect --event_file=myevents.out
-tensorboard --inspect --event_file=myevents.out --tag=loss
-tensorboard --inspect --logdir=mylogdir
-tensorboard --inspect --logdir=mylogdir --tag=loss
-
-
-This script runs over a logdir and creates an InspectionUnit for every
-subdirectory with event files. If running over an event file, it creates only
-one InspectionUnit. One block of output is printed to console for each
-InspectionUnit.
-
-The primary content of an InspectionUnit is the dict field_to_obs that maps
-fields (e.g. "scalar", "histogram", "session_log:start", etc.) to a list of
-Observations for the field. Observations correspond one-to-one with Events in an
-event file but contain less information because they only store what is
-necessary to generate the final console output.
-
-The final output is rendered to console by applying some aggregating function
-to the lists of Observations. Different functions are applied depending on the
-type of field. For instance, for "scalar" fields, the inspector shows aggregate
-statistics. For other fields like "session_log:start", all observed steps are
-printed in order to aid debugging.
-
-
-[1] Query a logdir or an event file for its logged tags and summary statistics
-using --logdir or --event_file.
-
-[[event_file]] contains these tags:
-histograms
-   binary/Sign/Activations
-   binary/nn_tanh/act/Activations
-   binary/nn_tanh/biases
-   binary/nn_tanh/biases:gradient
-   binary/nn_tanh/weights
-   binary/nn_tanh/weights:gradient
-images
-   input_images/image/0
-   input_images/image/1
-   input_images/image/2
-scalars
-   Learning Rate
-   Total Cost
-   Total Cost (raw)
-
-Debug output aggregated over all tags:
-graph
-   first_step           0
-   last_step            0
-   max_step             0
-   min_step             0
-   num_steps            1
-   outoforder_steps     []
-histograms
-   first_step           491
-   last_step            659823
-   max_step             659823
-   min_step             491
-   num_steps            993
-   outoforder_steps     []
-images -
-scalars
-   first_step           0
-   last_step            659823
-   max_step             659823
-   min_step             0
-   num_steps            1985
-   outoforder_steps     []
-sessionlog:checkpoint
-   first_step           7129
-   last_step            657167
-   max_step             657167
-   min_step             7129
-   num_steps            99
-   outoforder_steps     []
-sessionlog:start
-   outoforder_steps     []
-   steps                [0L]
-sessionlog:stop -
-
-
-[2] Drill down into a particular tag using --tag.
-
-Debug output for binary/Sign/Activations:
-histograms
-   first_step           491
-   last_step            659823
-   max_step             659823
-   min_step             491
-   num_steps            993
-   outoforder_steps     []
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import itertools
-import os
-
-import tensorflow as tf
-
-from tensorflow.tensorboard.backend.event_processing import event_accumulator
-from tensorflow.tensorboard.backend.event_processing import event_file_loader
-from tensorflow.tensorboard.backend.event_processing import event_multiplexer
-
-FLAGS = tf.flags.FLAGS
-
-
-# Map of field names within summary.proto to the user-facing names that this
-# script outputs.
-SUMMARY_TYPE_TO_FIELD = {'simple_value': 'scalars',
-                         'histo': 'histograms',
-                         'image': 'images',
-                         'audio': 'audio'}
-for summary_type in event_accumulator.SUMMARY_TYPES:
-  if summary_type not in SUMMARY_TYPE_TO_FIELD:
-    SUMMARY_TYPE_TO_FIELD[summary_type] = summary_type
-
-# Types of summaries that we may want to query for by tag.
-TAG_FIELDS = list(SUMMARY_TYPE_TO_FIELD.values())
-
-# Summaries that we want to see every instance of.
-LONG_FIELDS = ['sessionlog:start', 'sessionlog:stop']
-
-# Summaries that we only want an abridged digest of, since they would
-# take too much screen real estate otherwise.
-SHORT_FIELDS = ['graph', 'sessionlog:checkpoint'] + TAG_FIELDS
-
-# All summary types that we can inspect.
-TRACKED_FIELDS = SHORT_FIELDS + LONG_FIELDS
-
-# An `Observation` contains the data within each Event file that the inspector
-# cares about. The inspector accumulates Observations as it processes events.
-Observation = collections.namedtuple('Observation', ['step', 'wall_time',
-                                                     'tag'])
-
-# An InspectionUnit is created for each organizational structure in the event
-# files visible in the final terminal output. For instance, one InspectionUnit
-# is created for each subdirectory in logdir. When asked to inspect a single
-# event file, there may only be one InspectionUnit.
-
-# The InspectionUnit contains the `name` of the organizational unit that will be
-# printed to console, a `generator` that yields `Event` protos, and a mapping
-# from string fields to `Observations` that the inspector creates.
-InspectionUnit = collections.namedtuple('InspectionUnit', ['name', 'generator',
-                                                           'field_to_obs'])
-
-PRINT_SEPARATOR = '=' * 70 + '\n'
-
-
-def get_field_to_observations_map(generator, query_for_tag=''):
-  """Return a field to `Observations` dict for the event generator.
-
-  Args:
-    generator: A generator over event protos.
-    query_for_tag: A string that if specified, only create observations for
-      events with this tag name.
-
-  Returns:
-    A dict mapping keys in `TRACKED_FIELDS` to an `Observation` list.
-  """
-
-  def increment(stat, event, tag=''):
-    assert stat in TRACKED_FIELDS
-    field_to_obs[stat].append(Observation(step=event.step,
-                                          wall_time=event.wall_time,
-                                          tag=tag)._asdict())
-
-  field_to_obs = dict([(t, []) for t in TRACKED_FIELDS])
-
-  for event in generator:
-    ## Process the event
-    if event.HasField('graph_def') and (not query_for_tag):
-      increment('graph', event)
-    if event.HasField('session_log') and (not query_for_tag):
-      status = event.session_log.status
-      if status == tf.SessionLog.START:
-        increment('sessionlog:start', event)
-      elif status == tf.SessionLog.STOP:
-        increment('sessionlog:stop', event)
-      elif status == tf.SessionLog.CHECKPOINT:
-        increment('sessionlog:checkpoint', event)
-    elif event.HasField('summary'):
-      for value in event.summary.value:
-        if query_for_tag and value.tag != query_for_tag:
-          continue
-
-        for proto_name, display_name in SUMMARY_TYPE_TO_FIELD.items():
-          if value.HasField(proto_name):
-            increment(display_name, event, value.tag)
-  return field_to_obs
-
-
-def get_unique_tags(field_to_obs):
-  """Returns a dictionary of tags that a user could query over.
-
-  Args:
-    field_to_obs: Dict that maps string field to `Observation` list.
-
-  Returns:
-    A dict that maps keys in `TAG_FIELDS` to a list of string tags present in
-    the event files. If the dict does not have any observations of the type,
-    maps to an empty list so that we can render this to console.
-  """
-  return {field: sorted(set([x.get('tag', '') for x in observations]))
-          for field, observations in field_to_obs.items()
-          if field in TAG_FIELDS}
-
-
-def print_dict(d, show_missing=True):
-  """Prints a shallow dict to console.
-
-  Args:
-    d: Dict to print.
-    show_missing: Whether to show keys with empty values.
-  """
-  for k, v in sorted(d.items()):
-    if (not v) and show_missing:
-      # No instances of the key, so print missing symbol.
-      print('{} -'.format(k))
-    elif isinstance(v, list):
-      # Value is a list, so print each item of the list.
-      print(k)
-      for item in v:
-        print('   {}'.format(item))
-    elif isinstance(v, dict):
-      # Value is a dict, so print each (key, value) pair of the dict.
-      print(k)
-      for kk, vv in sorted(v.items()):
-        print('   {:<20} {}'.format(kk, vv))
-
-
-def get_dict_to_print(field_to_obs):
-  """Transform the field-to-obs mapping into a printable dictionary.
-
-  Args:
-    field_to_obs: Dict that maps string field to `Observation` list.
-
-  Returns:
-    A dict with the keys and values to print to console.
-  """
-
-  def compressed_steps(steps):
-    return {'num_steps': len(set(steps)),
-            'min_step': min(steps),
-            'max_step': max(steps),
-            'last_step': steps[-1],
-            'first_step': steps[0],
-            'outoforder_steps': get_out_of_order(steps)}
-
-  def full_steps(steps):
-    return {'steps': steps, 'outoforder_steps': get_out_of_order(steps)}
-
-  output = {}
-  for field, observations in field_to_obs.items():
-    if not observations:
-      output[field] = None
-      continue
-
-    steps = [x['step'] for x in observations]
-    if field in SHORT_FIELDS:
-      output[field] = compressed_steps(steps)
-    if field in LONG_FIELDS:
-      output[field] = full_steps(steps)
-
-  return output
-
-
-def get_out_of_order(list_of_numbers):
-  """Returns elements that break the monotonically non-decreasing trend.
-
-  This is used to find instances of global step values that are "out-of-order",
-  which may trigger TensorBoard event discarding logic.
-
-  Args:
-    list_of_numbers: A list of numbers.
-
-  Returns:
-    A list of tuples in which each tuple are two elements are adjacent, but the
-    second element is lower than the first.
-  """
-  # TODO(cassandrax): Consider changing this to only check for out-of-order
-  # steps within a particular tag.
-  result = []
-  for i in range(len(list_of_numbers)):
-    if i == 0:
-      continue
-    if list_of_numbers[i] < list_of_numbers[i - 1]:
-      result.append((list_of_numbers[i - 1], list_of_numbers[i]))
-  return result
-
-
-def generators_from_logdir(logdir):
-  """Returns a list of event generators for subdirectories with event files.
-
-  The number of generators returned should equal the number of directories
-  within logdir that contain event files. If only logdir contains event files,
-  returns a list of length one.
-
-  Args:
-    logdir: A log directory that contains event files.
-
-  Returns:
-    List of event generators for each subdirectory with event files.
-  """
-  subdirs = event_multiplexer.GetLogdirSubdirectories(logdir)
-  generators = [
-      itertools.chain(*[
-          generator_from_event_file(os.path.join(subdir, f))
-          for f in tf.gfile.ListDirectory(subdir)
-          if event_accumulator.IsTensorFlowEventsFile(os.path.join(subdir, f))
-      ]) for subdir in subdirs
-  ]
-  return generators
-
-
-def generator_from_event_file(event_file):
-  """Returns a generator that yields events from an event file."""
-  return event_file_loader.EventFileLoader(event_file).Load()
-
-
-def get_inspection_units(logdir='', event_file='', tag=''):
-  """Returns a list of InspectionUnit objects given either logdir or event_file.
-
-  If logdir is given, the number of InspectionUnits should equal the
-  number of directories or subdirectories that contain event files.
-
-  If event_file is given, the number of InspectionUnits should be 1.
-
-  Args:
-    logdir: A log directory that contains event files.
-    event_file: Or, a particular event file path.
-    tag: An optional tag name to query for.
-
-  Returns:
-    A list of InspectionUnit objects.
-  """
-  if logdir:
-    subdirs = event_multiplexer.GetLogdirSubdirectories(logdir)
-    inspection_units = []
-    for subdir in subdirs:
-      generator = itertools.chain(*[
-          generator_from_event_file(os.path.join(subdir, f))
-          for f in tf.gfile.ListDirectory(subdir)
-          if event_accumulator.IsTensorFlowEventsFile(os.path.join(subdir, f))
-      ])
-      inspection_units.append(InspectionUnit(
-          name=subdir,
-          generator=generator,
-          field_to_obs=get_field_to_observations_map(generator, tag)))
-    if inspection_units:
-      print('Found event files in:\n{}\n'.format('\n'.join(
-          [u.name for u in inspection_units])))
-    elif event_accumulator.IsTensorFlowEventsFile(logdir):
-      print(
-          'It seems that {} may be an event file instead of a logdir. If this '
-          'is the case, use --event_file instead of --logdir to pass '
-          'it in.'.format(logdir))
-    else:
-      print('No event files found within logdir {}'.format(logdir))
-    return inspection_units
-  elif event_file:
-    generator = generator_from_event_file(event_file)
-    return [InspectionUnit(
-        name=event_file,
-        generator=generator,
-        field_to_obs=get_field_to_observations_map(generator, tag))]
-
-
-def inspect(logdir='', event_file='', tag=''):
-  """Main function for inspector that prints out a digest of event files.
-
-  Args:
-    logdir: A log directory that contains event files.
-    event_file: Or, a particular event file path.
-    tag: An optional tag name to query for.
-
-  Raises:
-    ValueError: If neither logdir and event_file are given, or both are given.
-  """
-  if logdir and event_file:
-    raise ValueError(
-        'Must specify either --logdir or --event_file, but not both.')
-  if not (logdir or event_file):
-    raise ValueError('Must specify either --logdir or --event_file.')
-
-  print(PRINT_SEPARATOR +
-        'Processing event files... (this can take a few minutes)\n' +
-        PRINT_SEPARATOR)
-  inspection_units = get_inspection_units(logdir, event_file, tag)
-
-  for unit in inspection_units:
-    if tag:
-      print('Event statistics for tag {} in {}:'.format(tag, unit.name))
-    else:
-      # If the user is not inspecting a particular tag, also print the list of
-      # all available tags that they can query.
-      print('These tags are in {}:'.format(unit.name))
-      print_dict(get_unique_tags(unit.field_to_obs))
-      print(PRINT_SEPARATOR)
-      print('Event statistics for {}:'.format(unit.name))
-
-    print_dict(get_dict_to_print(unit.field_to_obs), show_missing=(not tag))
-    print(PRINT_SEPARATOR)
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tensorflow/tensorboard/backend/event_processing/event_file_inspector_test.py b/tensorflow/tensorboard/backend/event_processing/event_file_inspector_test.py
deleted file mode 100644
index 084043d5110fc5000af963a4315d810bf05fbd57..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/event_processing/event_file_inspector_test.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import shutil
-
-import tensorflow as tf
-
-from tensorflow.tensorboard.backend.event_processing import event_file_inspector as efi
-
-
-class EventFileInspectorTest(tf.test.TestCase):
-
-  def setUp(self):
-    self.logdir = os.path.join(self.get_temp_dir(), 'tfevents')
-    self._MakeDirectoryIfNotExists(self.logdir)
-
-  def tearDown(self):
-    shutil.rmtree(self.logdir)
-
-  def _MakeDirectoryIfNotExists(self, path):
-    if not os.path.exists(path):
-      os.mkdir(path)
-
-  def _WriteScalarSummaries(self, data, subdirs=('',)):
-    # Writes data to a tempfile in subdirs, and returns generator for the data.
-    # If subdirs is given, writes data identically to all subdirectories.
-    for subdir_ in subdirs:
-      subdir = os.path.join(self.logdir, subdir_)
-      self._MakeDirectoryIfNotExists(subdir)
-
-      sw = tf.summary.FileWriter(subdir)
-      for datum in data:
-        summary = tf.Summary()
-        if 'simple_value' in datum:
-          summary.value.add(tag=datum['tag'],
-                            simple_value=datum['simple_value'])
-          sw.add_summary(summary, global_step=datum['step'])
-        elif 'histo' in datum:
-          summary.value.add(tag=datum['tag'], histo=tf.HistogramProto())
-          sw.add_summary(summary, global_step=datum['step'])
-        elif 'session_log' in datum:
-          sw.add_session_log(datum['session_log'], global_step=datum['step'])
-      sw.close()
-
-  def testEmptyLogdir(self):
-    # Nothing was written to logdir
-    units = efi.get_inspection_units(self.logdir)
-    self.assertEqual([], units)
-
-  def testGetAvailableTags(self):
-    data = [{'tag': 'c', 'histo': 2, 'step': 10},
-            {'tag': 'c', 'histo': 2, 'step': 11},
-            {'tag': 'c', 'histo': 2, 'step': 9},
-            {'tag': 'b', 'simple_value': 2, 'step': 20},
-            {'tag': 'b', 'simple_value': 2, 'step': 15},
-            {'tag': 'a', 'simple_value': 2, 'step': 3}]
-    self._WriteScalarSummaries(data)
-    units = efi.get_inspection_units(self.logdir)
-    tags = efi.get_unique_tags(units[0].field_to_obs)
-    self.assertEqual(['a', 'b'], tags['scalars'])
-    self.assertEqual(['c'], tags['histograms'])
-
-  def testInspectAll(self):
-    data = [{'tag': 'c', 'histo': 2, 'step': 10},
-            {'tag': 'c', 'histo': 2, 'step': 11},
-            {'tag': 'c', 'histo': 2, 'step': 9},
-            {'tag': 'b', 'simple_value': 2, 'step': 20},
-            {'tag': 'b', 'simple_value': 2, 'step': 15},
-            {'tag': 'a', 'simple_value': 2, 'step': 3}]
-    self._WriteScalarSummaries(data)
-    units = efi.get_inspection_units(self.logdir)
-    printable = efi.get_dict_to_print(units[0].field_to_obs)
-    self.assertEqual(printable['histograms']['max_step'], 11)
-    self.assertEqual(printable['histograms']['min_step'], 9)
-    self.assertEqual(printable['histograms']['num_steps'], 3)
-    self.assertEqual(printable['histograms']['last_step'], 9)
-    self.assertEqual(printable['histograms']['first_step'], 10)
-    self.assertEqual(printable['histograms']['outoforder_steps'], [(11, 9)])
-
-    self.assertEqual(printable['scalars']['max_step'], 20)
-    self.assertEqual(printable['scalars']['min_step'], 3)
-    self.assertEqual(printable['scalars']['num_steps'], 3)
-    self.assertEqual(printable['scalars']['last_step'], 3)
-    self.assertEqual(printable['scalars']['first_step'], 20)
-    self.assertEqual(printable['scalars']['outoforder_steps'], [(20, 15),
-                                                                (15, 3)])
-
-  def testInspectTag(self):
-    data = [{'tag': 'c', 'histo': 2, 'step': 10},
-            {'tag': 'c', 'histo': 2, 'step': 11},
-            {'tag': 'c', 'histo': 2, 'step': 9},
-            {'tag': 'b', 'histo': 2, 'step': 20},
-            {'tag': 'b', 'simple_value': 2, 'step': 15},
-            {'tag': 'a', 'simple_value': 2, 'step': 3}]
-    self._WriteScalarSummaries(data)
-    units = efi.get_inspection_units(self.logdir, tag='c')
-    printable = efi.get_dict_to_print(units[0].field_to_obs)
-    self.assertEqual(printable['histograms']['max_step'], 11)
-    self.assertEqual(printable['histograms']['min_step'], 9)
-    self.assertEqual(printable['histograms']['num_steps'], 3)
-    self.assertEqual(printable['histograms']['last_step'], 9)
-    self.assertEqual(printable['histograms']['first_step'], 10)
-    self.assertEqual(printable['histograms']['outoforder_steps'], [(11, 9)])
-    self.assertEqual(printable['scalars'], None)
-
-  def testSessionLogSummaries(self):
-    data = [
-        {
-            'session_log': tf.SessionLog(status=tf.SessionLog.START),
-            'step': 0
-        },
-        {
-            'session_log': tf.SessionLog(status=tf.SessionLog.CHECKPOINT),
-            'step': 1
-        },
-        {
-            'session_log': tf.SessionLog(status=tf.SessionLog.CHECKPOINT),
-            'step': 2
-        },
-        {
-            'session_log': tf.SessionLog(status=tf.SessionLog.CHECKPOINT),
-            'step': 3
-        },
-        {
-            'session_log': tf.SessionLog(status=tf.SessionLog.STOP),
-            'step': 4
-        },
-        {
-            'session_log': tf.SessionLog(status=tf.SessionLog.START),
-            'step': 5
-        },
-        {
-            'session_log': tf.SessionLog(status=tf.SessionLog.STOP),
-            'step': 6
-        },
-    ]
-
-    self._WriteScalarSummaries(data)
-    units = efi.get_inspection_units(self.logdir)
-    self.assertEqual(1, len(units))
-    printable = efi.get_dict_to_print(units[0].field_to_obs)
-    self.assertEqual(printable['sessionlog:start']['steps'], [0, 5])
-    self.assertEqual(printable['sessionlog:stop']['steps'], [4, 6])
-    self.assertEqual(printable['sessionlog:checkpoint']['num_steps'], 3)
-
-  def testInspectAllWithNestedLogdirs(self):
-    data = [{'tag': 'c', 'simple_value': 2, 'step': 10},
-            {'tag': 'c', 'simple_value': 2, 'step': 11},
-            {'tag': 'c', 'simple_value': 2, 'step': 9},
-            {'tag': 'b', 'simple_value': 2, 'step': 20},
-            {'tag': 'b', 'simple_value': 2, 'step': 15},
-            {'tag': 'a', 'simple_value': 2, 'step': 3}]
-
-    subdirs = ['eval', 'train']
-    self._WriteScalarSummaries(data, subdirs=subdirs)
-    units = efi.get_inspection_units(self.logdir)
-    self.assertEqual(2, len(units))
-    directory_names = [os.path.join(self.logdir, name) for name in subdirs]
-    self.assertEqual(directory_names, sorted([unit.name for unit in units]))
-
-    for unit in units:
-      printable = efi.get_dict_to_print(unit.field_to_obs)['scalars']
-      self.assertEqual(printable['max_step'], 20)
-      self.assertEqual(printable['min_step'], 3)
-      self.assertEqual(printable['num_steps'], 6)
-      self.assertEqual(printable['last_step'], 3)
-      self.assertEqual(printable['first_step'], 10)
-      self.assertEqual(printable['outoforder_steps'], [(11, 9), (20, 15),
-                                                       (15, 3)])
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/tensorboard/backend/event_processing/event_file_loader.py b/tensorflow/tensorboard/backend/event_processing/event_file_loader.py
deleted file mode 100644
index 896142daaf4eed52fccaaa47745e6a3107bef75f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/event_processing/event_file_loader.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Functionality for loading events from a record file."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-
-class EventFileLoader(object):
-  """An EventLoader is an iterator that yields Event protos."""
-
-  def __init__(self, file_path):
-    if file_path is None:
-      raise ValueError('A file path is required')
-    file_path = tf.resource_loader.readahead_file_path(file_path)
-    tf.logging.debug('Opening a record reader pointing at %s', file_path)
-    with tf.errors.raise_exception_on_not_ok_status() as status:
-      self._reader = tf.pywrap_tensorflow.PyRecordReader_New(
-          tf.compat.as_bytes(file_path), 0, tf.compat.as_bytes(''), status)
-    # Store it for logging purposes.
-    self._file_path = file_path
-    if not self._reader:
-      raise IOError('Failed to open a record reader pointing to %s' % file_path)
-
-  def Load(self):
-    """Loads all new values from disk.
-
-    Calling Load multiple times in a row will not 'drop' events as long as the
-    return value is not iterated over.
-
-    Yields:
-      All values that were written to disk that have not been yielded yet.
-    """
-    while True:
-      try:
-        with tf.errors.raise_exception_on_not_ok_status() as status:
-          self._reader.GetNext(status)
-      except (tf.errors.DataLossError, tf.errors.OutOfRangeError):
-        # We ignore partial read exceptions, because a record may be truncated.
-        # PyRecordReader holds the offset prior to the failed read, so retrying
-        # will succeed.
-        break
-      event = tf.Event()
-      event.ParseFromString(self._reader.record())
-      yield event
-    tf.logging.debug('No more events in %s', self._file_path)
-
-
-def main(argv):
-  if len(argv) != 2:
-    print('Usage: event_file_loader <path-to-the-recordio-file>')
-    return 1
-  loader = EventFileLoader(argv[1])
-  for event in loader.Load():
-    print(event)
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tensorflow/tensorboard/backend/event_processing/event_file_loader_test.py b/tensorflow/tensorboard/backend/event_processing/event_file_loader_test.py
deleted file mode 100644
index 210a7bc52edf74ad40636c237d519f30ce4f4b56..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/event_processing/event_file_loader_test.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Tests for event_file_loader."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import tempfile
-
-import tensorflow as tf
-
-
-from tensorflow.tensorboard.backend.event_processing import event_file_loader
-
-
-class EventFileLoaderTest(tf.test.TestCase):
-  # A record containing a simple event.
-  RECORD = (b'\x18\x00\x00\x00\x00\x00\x00\x00\xa3\x7fK"\t\x00\x00\xc0%\xddu'
-            b'\xd5A\x1a\rbrain.Event:1\xec\xf32\x8d')
-
-  def _WriteToFile(self, filename, data):
-    with open(filename, 'ab') as f:
-      f.write(data)
-
-  def _LoaderForTestFile(self, filename):
-    return event_file_loader.EventFileLoader(
-        os.path.join(self.get_temp_dir(), filename))
-
-  def testEmptyEventFile(self):
-    filename = tempfile.NamedTemporaryFile(dir=self.get_temp_dir()).name
-    self._WriteToFile(filename, b'')
-    loader = self._LoaderForTestFile(filename)
-    self.assertEqual(len(list(loader.Load())), 0)
-
-  def testSingleWrite(self):
-    filename = tempfile.NamedTemporaryFile(dir=self.get_temp_dir()).name
-    self._WriteToFile(filename, EventFileLoaderTest.RECORD)
-    loader = self._LoaderForTestFile(filename)
-    events = list(loader.Load())
-    self.assertEqual(len(events), 1)
-    self.assertEqual(events[0].wall_time, 1440183447.0)
-    self.assertEqual(len(list(loader.Load())), 0)
-
-  def testMultipleWrites(self):
-    filename = tempfile.NamedTemporaryFile(dir=self.get_temp_dir()).name
-    self._WriteToFile(filename, EventFileLoaderTest.RECORD)
-    loader = self._LoaderForTestFile(filename)
-    self.assertEqual(len(list(loader.Load())), 1)
-    self._WriteToFile(filename, EventFileLoaderTest.RECORD)
-    self.assertEqual(len(list(loader.Load())), 1)
-
-  def testMultipleLoads(self):
-    filename = tempfile.NamedTemporaryFile(dir=self.get_temp_dir()).name
-    self._WriteToFile(filename, EventFileLoaderTest.RECORD)
-    loader = self._LoaderForTestFile(filename)
-    loader.Load()
-    loader.Load()
-    self.assertEqual(len(list(loader.Load())), 1)
-
-  def testMultipleWritesAtOnce(self):
-    filename = tempfile.NamedTemporaryFile(dir=self.get_temp_dir()).name
-    self._WriteToFile(filename, EventFileLoaderTest.RECORD)
-    self._WriteToFile(filename, EventFileLoaderTest.RECORD)
-    loader = self._LoaderForTestFile(filename)
-    self.assertEqual(len(list(loader.Load())), 2)
-
-  def testMultipleWritesWithBadWrite(self):
-    filename = tempfile.NamedTemporaryFile(dir=self.get_temp_dir()).name
-    self._WriteToFile(filename, EventFileLoaderTest.RECORD)
-    self._WriteToFile(filename, EventFileLoaderTest.RECORD)
-    # Test that we ignore partial record writes at the end of the file.
-    self._WriteToFile(filename, b'123')
-    loader = self._LoaderForTestFile(filename)
-    self.assertEqual(len(list(loader.Load())), 2)
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/tensorboard/backend/event_processing/event_multiplexer.py b/tensorflow/tensorboard/backend/event_processing/event_multiplexer.py
deleted file mode 100644
index e4b8814c929765e2a6e61c22299e413574c151d2..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/event_processing/event_multiplexer.py
+++ /dev/null
@@ -1,475 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Provides an interface for working with multiple event files."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import threading
-
-import six
-import tensorflow as tf
-
-from tensorflow.tensorboard.backend.event_processing import directory_watcher
-from tensorflow.tensorboard.backend.event_processing import event_accumulator
-from tensorflow.tensorboard.backend.event_processing import io_wrapper
-
-
-class EventMultiplexer(object):
-  """An `EventMultiplexer` manages access to multiple `EventAccumulator`s.
-
-  Each `EventAccumulator` is associated with a `run`, which is a self-contained
-  TensorFlow execution. The `EventMultiplexer` provides methods for extracting
-  information about events from multiple `run`s.
-
-  Example usage for loading specific runs from files:
-
-  ```python
-  x = EventMultiplexer({'run1': 'path/to/run1', 'run2': 'path/to/run2'})
-  x.Reload()
-  ```
-
-  Example usage for loading a directory where each subdirectory is a run
-
-  ```python
-  (eg:) /parent/directory/path/
-        /parent/directory/path/run1/
-        /parent/directory/path/run1/events.out.tfevents.1001
-        /parent/directory/path/run1/events.out.tfevents.1002
-
-        /parent/directory/path/run2/
-        /parent/directory/path/run2/events.out.tfevents.9232
-
-        /parent/directory/path/run3/
-        /parent/directory/path/run3/events.out.tfevents.9232
-  x = EventMultiplexer().AddRunsFromDirectory('/parent/directory/path')
-  (which is equivalent to:)
-  x = EventMultiplexer({'run1': '/parent/directory/path/run1', 'run2':...}
-  ```
-
-  If you would like to watch `/parent/directory/path`, wait for it to be created
-    (if necessary) and then periodically pick up new runs, use
-    `AutoloadingMultiplexer`
-  @@Tensors
-  """
-
-  def __init__(self,
-               run_path_map=None,
-               size_guidance=event_accumulator.DEFAULT_SIZE_GUIDANCE,
-               purge_orphaned_data=True):
-    """Constructor for the `EventMultiplexer`.
-
-    Args:
-      run_path_map: Dict `{run: path}` which specifies the
-        name of a run, and the path to find the associated events. If it is
-        None, then the EventMultiplexer initializes without any runs.
-      size_guidance: A dictionary mapping from `tagType` to the number of items
-        to store for each tag of that type. See
-        `event_accumulator.EventAccumulator` for details.
-      purge_orphaned_data: Whether to discard any events that were "orphaned" by
-        a TensorFlow restart.
-    """
-    tf.logging.info('Event Multiplexer initializing.')
-    self._accumulators_mutex = threading.Lock()
-    self._accumulators = {}
-    self._paths = {}
-    self._reload_called = False
-    self._size_guidance = size_guidance
-    self.purge_orphaned_data = purge_orphaned_data
-    if run_path_map is not None:
-      tf.logging.info('Event Multplexer doing initialization load for %s',
-                      run_path_map)
-      for (run, path) in six.iteritems(run_path_map):
-        self.AddRun(path, run)
-    tf.logging.info('Event Multiplexer done initializing')
-
-  def AddRun(self, path, name=None):
-    """Add a run to the multiplexer.
-
-    If the name is not specified, it is the same as the path.
-
-    If a run by that name exists, and we are already watching the right path,
-      do nothing. If we are watching a different path, replace the event
-      accumulator.
-
-    If `Reload` has been called, it will `Reload` the newly created
-    accumulators.
-
-    Args:
-      path: Path to the event files (or event directory) for given run.
-      name: Name of the run to add. If not provided, is set to path.
-
-    Returns:
-      The `EventMultiplexer`.
-    """
-    if name is None or name is '':
-      name = path
-    accumulator = None
-    with self._accumulators_mutex:
-      if name not in self._accumulators or self._paths[name] != path:
-        if name in self._paths and self._paths[name] != path:
-          # TODO(danmane) - Make it impossible to overwrite an old path with
-          # a new path (just give the new path a distinct name)
-          tf.logging.warning('Conflict for name %s: old path %s, new path %s',
-                             name, self._paths[name], path)
-        tf.logging.info('Constructing EventAccumulator for %s', path)
-        accumulator = event_accumulator.EventAccumulator(
-            path,
-            size_guidance=self._size_guidance,
-            purge_orphaned_data=self.purge_orphaned_data)
-        self._accumulators[name] = accumulator
-        self._paths[name] = path
-    if accumulator:
-      if self._reload_called:
-        accumulator.Reload()
-    return self
-
-  def AddRunsFromDirectory(self, path, name=None):
-    """Load runs from a directory; recursively walks subdirectories.
-
-    If path doesn't exist, no-op. This ensures that it is safe to call
-      `AddRunsFromDirectory` multiple times, even before the directory is made.
-
-    If path is a directory, load event files in the directory (if any exist) and
-      recursively call AddRunsFromDirectory on any subdirectories. This mean you
-      can call AddRunsFromDirectory at the root of a tree of event logs and
-      TensorBoard will load them all.
-
-    If the `EventMultiplexer` is already loaded this will cause
-    the newly created accumulators to `Reload()`.
-    Args:
-      path: A string path to a directory to load runs from.
-      name: Optionally, what name to apply to the runs. If name is provided
-        and the directory contains run subdirectories, the name of each subrun
-        is the concatenation of the parent name and the subdirectory name. If
-        name is provided and the directory contains event files, then a run
-        is added called "name" and with the events from the path.
-
-    Raises:
-      ValueError: If the path exists and isn't a directory.
-
-    Returns:
-      The `EventMultiplexer`.
-    """
-    tf.logging.info('Starting AddRunsFromDirectory: %s', path)
-    for subdir in GetLogdirSubdirectories(path):
-      tf.logging.info('Adding events from directory %s', subdir)
-      rpath = os.path.relpath(subdir, path)
-      subname = os.path.join(name, rpath) if name else rpath
-      self.AddRun(subdir, name=subname)
-    tf.logging.info('Done with AddRunsFromDirectory: %s', path)
-    return self
-
-  def Reload(self):
-    """Call `Reload` on every `EventAccumulator`."""
-    tf.logging.info('Beginning EventMultiplexer.Reload()')
-    self._reload_called = True
-    # Build a list so we're safe even if the list of accumulators is modified
-    # even while we're reloading.
-    with self._accumulators_mutex:
-      items = list(self._accumulators.items())
-
-    names_to_delete = set()
-    for name, accumulator in items:
-      try:
-        accumulator.Reload()
-      except (OSError, IOError) as e:
-        tf.logging.error("Unable to reload accumulator '%s': %s", name, e)
-      except directory_watcher.DirectoryDeletedError:
-        names_to_delete.add(name)
-
-    with self._accumulators_mutex:
-      for name in names_to_delete:
-        tf.logging.warning("Deleting accumulator '%s'", name)
-        del self._accumulators[name]
-    tf.logging.info('Finished with EventMultiplexer.Reload()')
-    return self
-
-  def PluginAssets(self, plugin_name):
-    """Get index of runs and assets for a given plugin.
-
-    Args:
-      plugin_name: Name of the plugin we are checking for.
-
-    Returns:
-      A dictionary that maps from run_name to a list of plugin
-        assets for that run.
-    """
-    with self._accumulators_mutex:
-      # To avoid nested locks, we construct a copy of the run-accumulator map
-      items = list(six.iteritems(self._accumulators))
-
-    return {run: accum.PluginAssets(plugin_name) for run, accum in items}
-
-  def RetrievePluginAsset(self, run, plugin_name, asset_name):
-    """Return the contents for a specific plugin asset from a run.
-
-    Args:
-      run: The string name of the run.
-      plugin_name: The string name of a plugin.
-      asset_name: The string name of an asset.
-
-    Returns:
-      The string contents of the plugin asset.
-
-    Raises:
-      KeyError: If the asset is not available.
-    """
-    accumulator = self._GetAccumulator(run)
-    return accumulator.RetrievePluginAsset(plugin_name, asset_name)
-
-  def FirstEventTimestamp(self, run):
-    """Return the timestamp of the first event of the given run.
-
-    This may perform I/O if no events have been loaded yet for the run.
-
-    Args:
-      run: A string name of the run for which the timestamp is retrieved.
-
-    Returns:
-      The wall_time of the first event of the run, which will typically be
-      seconds since the epoch.
-
-    Raises:
-      KeyError: If the run is not found.
-      ValueError: If the run has no events loaded and there are no events on
-        disk to load.
-    """
-    accumulator = self._GetAccumulator(run)
-    return accumulator.FirstEventTimestamp()
-
-  def Scalars(self, run, tag):
-    """Retrieve the scalar events associated with a run and tag.
-
-    Args:
-      run: A string name of the run for which values are retrieved.
-      tag: A string name of the tag for which values are retrieved.
-
-    Raises:
-      KeyError: If the run is not found, or the tag is not available for
-        the given run.
-
-    Returns:
-      An array of `event_accumulator.ScalarEvents`.
-    """
-    accumulator = self._GetAccumulator(run)
-    return accumulator.Scalars(tag)
-
-  def HealthPills(self, run, node_name):
-    """Retrieve the health pill events associated with a run and node name.
-
-    Args:
-      run: A string name of the run for which health pills are retrieved.
-      node_name: A string name of the node for which health pills are retrieved.
-
-    Raises:
-      KeyError: If the run is not found, or the node name is not available for
-        the given run.
-
-    Returns:
-      An array of `event_accumulator.HealthPillEvents`.
-    """
-    accumulator = self._GetAccumulator(run)
-    return accumulator.HealthPills(node_name)
-
-  def GetOpsWithHealthPills(self, run):
-    """Determines which ops have at least 1 health pill event for a given run.
-
-    Args:
-      run: The name of the run.
-
-    Raises:
-      KeyError: If the run is not found, or the node name is not available for
-        the given run.
-
-    Returns:
-      The list of names of ops with health pill events.
-    """
-    return self._GetAccumulator(run).GetOpsWithHealthPills()
-
-  def Graph(self, run):
-    """Retrieve the graph associated with the provided run.
-
-    Args:
-      run: A string name of a run to load the graph for.
-
-    Raises:
-      KeyError: If the run is not found.
-      ValueError: If the run does not have an associated graph.
-
-    Returns:
-      The `GraphDef` protobuf data structure.
-    """
-    accumulator = self._GetAccumulator(run)
-    return accumulator.Graph()
-
-  def MetaGraph(self, run):
-    """Retrieve the metagraph associated with the provided run.
-
-    Args:
-      run: A string name of a run to load the graph for.
-
-    Raises:
-      KeyError: If the run is not found.
-      ValueError: If the run does not have an associated graph.
-
-    Returns:
-      The `MetaGraphDef` protobuf data structure.
-    """
-    accumulator = self._GetAccumulator(run)
-    return accumulator.MetaGraph()
-
-  def RunMetadata(self, run, tag):
-    """Get the session.run() metadata associated with a TensorFlow run and tag.
-
-    Args:
-      run: A string name of a TensorFlow run.
-      tag: A string name of the tag associated with a particular session.run().
-
-    Raises:
-      KeyError: If the run is not found, or the tag is not available for the
-        given run.
-
-    Returns:
-      The metadata in the form of `RunMetadata` protobuf data structure.
-    """
-    accumulator = self._GetAccumulator(run)
-    return accumulator.RunMetadata(tag)
-
-  def Histograms(self, run, tag):
-    """Retrieve the histogram events associated with a run and tag.
-
-    Args:
-      run: A string name of the run for which values are retrieved.
-      tag: A string name of the tag for which values are retrieved.
-
-    Raises:
-      KeyError: If the run is not found, or the tag is not available for
-        the given run.
-
-    Returns:
-      An array of `event_accumulator.HistogramEvents`.
-    """
-    accumulator = self._GetAccumulator(run)
-    return accumulator.Histograms(tag)
-
-  def CompressedHistograms(self, run, tag):
-    """Retrieve the compressed histogram events associated with a run and tag.
-
-    Args:
-      run: A string name of the run for which values are retrieved.
-      tag: A string name of the tag for which values are retrieved.
-
-    Raises:
-      KeyError: If the run is not found, or the tag is not available for
-        the given run.
-
-    Returns:
-      An array of `event_accumulator.CompressedHistogramEvents`.
-    """
-    accumulator = self._GetAccumulator(run)
-    return accumulator.CompressedHistograms(tag)
-
-  def Images(self, run, tag):
-    """Retrieve the image events associated with a run and tag.
-
-    Args:
-      run: A string name of the run for which values are retrieved.
-      tag: A string name of the tag for which values are retrieved.
-
-    Raises:
-      KeyError: If the run is not found, or the tag is not available for
-        the given run.
-
-    Returns:
-      An array of `event_accumulator.ImageEvents`.
-    """
-    accumulator = self._GetAccumulator(run)
-    return accumulator.Images(tag)
-
-  def Audio(self, run, tag):
-    """Retrieve the audio events associated with a run and tag.
-
-    Args:
-      run: A string name of the run for which values are retrieved.
-      tag: A string name of the tag for which values are retrieved.
-
-    Raises:
-      KeyError: If the run is not found, or the tag is not available for
-        the given run.
-
-    Returns:
-      An array of `event_accumulator.AudioEvents`.
-    """
-    accumulator = self._GetAccumulator(run)
-    return accumulator.Audio(tag)
-
-  def Tensors(self, run, tag):
-    """Retrieve the tensor events associated with a run and tag.
-
-    Args:
-      run: A string name of the run for which values are retrieved.
-      tag: A string name of the tag for which values are retrieved.
-
-    Raises:
-      KeyError: If the run is not found, or the tag is not available for
-        the given run.
-
-    Returns:
-      An array of `event_accumulator.TensorEvent`s.
-    """
-    accumulator = self._GetAccumulator(run)
-    return accumulator.Tensors(tag)
-
-  def Runs(self):
-    """Return all the run names in the `EventMultiplexer`.
-
-    Returns:
-    ```
-      {runName: { images: [tag1, tag2, tag3],
-                  scalarValues: [tagA, tagB, tagC],
-                  histograms: [tagX, tagY, tagZ],
-                  compressedHistograms: [tagX, tagY, tagZ],
-                  graph: true, meta_graph: true}}
-    ```
-    """
-    with self._accumulators_mutex:
-      # To avoid nested locks, we construct a copy of the run-accumulator map
-      items = list(six.iteritems(self._accumulators))
-    return {run_name: accumulator.Tags() for run_name, accumulator in items}
-
-  def RunPaths(self):
-    """Returns a dict mapping run names to event file paths."""
-    return self._paths
-
-  def _GetAccumulator(self, run):
-    with self._accumulators_mutex:
-      return self._accumulators[run]
-
-
-def GetLogdirSubdirectories(path):
-  """Returns subdirectories with event files on path."""
-  if tf.gfile.Exists(path) and not tf.gfile.IsDirectory(path):
-    raise ValueError('GetLogdirSubdirectories: path exists and is not a '
-                     'directory, %s' % path)
-
-  # ListRecursively just yields nothing if the path doesn't exist.
-  return (
-      subdir
-      for (subdir, files) in io_wrapper.ListRecursively(path)
-      if list(filter(event_accumulator.IsTensorFlowEventsFile, files))
-  )
diff --git a/tensorflow/tensorboard/backend/event_processing/event_multiplexer_test.py b/tensorflow/tensorboard/backend/event_processing/event_multiplexer_test.py
deleted file mode 100644
index ea536dfaad671a6768e859d204a80577a8d56afb..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/event_processing/event_multiplexer_test.py
+++ /dev/null
@@ -1,360 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-import os
-import os.path
-import shutil
-
-import tensorflow as tf
-
-from tensorflow.tensorboard.backend.event_processing import event_accumulator
-from tensorflow.tensorboard.backend.event_processing import event_multiplexer
-
-
-def _AddEvents(path):
-  if not tf.gfile.IsDirectory(path):
-    tf.gfile.MakeDirs(path)
-  fpath = os.path.join(path, 'hypothetical.tfevents.out')
-  with tf.gfile.GFile(fpath, 'w') as f:
-    f.write('')
-    return fpath
-
-
-def _CreateCleanDirectory(path):
-  if tf.gfile.IsDirectory(path):
-    tf.gfile.DeleteRecursively(path)
-  tf.gfile.MkDir(path)
-
-
-class _FakeAccumulator(object):
-
-  def __init__(self, path, health_pill_mapping=None):
-    """Constructs a fake accumulator with some fake events.
-
-    Args:
-      path: The path for the run that this accumulator is for.
-      health_pill_mapping: An optional mapping from Op to health pill strings.
-    """
-    self._path = path
-    self.reload_called = False
-    self._node_names_to_health_pills = health_pill_mapping or {}
-
-  def Tags(self):
-    return {event_accumulator.IMAGES: ['im1', 'im2'],
-            event_accumulator.AUDIO: ['snd1', 'snd2'],
-            event_accumulator.HISTOGRAMS: ['hst1', 'hst2'],
-            event_accumulator.COMPRESSED_HISTOGRAMS: ['cmphst1', 'cmphst2'],
-            event_accumulator.SCALARS: ['sv1', 'sv2']}
-
-  def FirstEventTimestamp(self):
-    return 0
-
-  def _TagHelper(self, tag_name, enum):
-    if tag_name not in self.Tags()[enum]:
-      raise KeyError
-    return ['%s/%s' % (self._path, tag_name)]
-
-  def Scalars(self, tag_name):
-    return self._TagHelper(tag_name, event_accumulator.SCALARS)
-
-  def HealthPills(self, node_name):
-    if node_name not in self._node_names_to_health_pills:
-      raise KeyError
-    health_pills = self._node_names_to_health_pills[node_name]
-    return [self._path + '/' + health_pill for health_pill in health_pills]
-
-  def GetOpsWithHealthPills(self):
-    return self._node_names_to_health_pills.keys()
-
-  def Histograms(self, tag_name):
-    return self._TagHelper(tag_name, event_accumulator.HISTOGRAMS)
-
-  def CompressedHistograms(self, tag_name):
-    return self._TagHelper(tag_name, event_accumulator.COMPRESSED_HISTOGRAMS)
-
-  def Images(self, tag_name):
-    return self._TagHelper(tag_name, event_accumulator.IMAGES)
-
-  def Audio(self, tag_name):
-    return self._TagHelper(tag_name, event_accumulator.AUDIO)
-
-  def Tensors(self, tag_name):
-    return self._TagHelper(tag_name, event_accumulator.TENSORS)
-
-  def Reload(self):
-    self.reload_called = True
-
-
-def _GetFakeAccumulator(path,
-                        size_guidance=None,
-                        compression_bps=None,
-                        purge_orphaned_data=None,
-                        health_pill_mapping=None):
-  del size_guidance, compression_bps, purge_orphaned_data  # Unused.
-  return _FakeAccumulator(path, health_pill_mapping=health_pill_mapping)
-
-
-class EventMultiplexerTest(tf.test.TestCase):
-
-  def setUp(self):
-    super(EventMultiplexerTest, self).setUp()
-    self.stubs = tf.test.StubOutForTesting()
-
-    self.stubs.Set(event_accumulator, 'EventAccumulator', _GetFakeAccumulator)
-
-  def tearDown(self):
-    self.stubs.CleanUp()
-
-  def testEmptyLoader(self):
-    """Tests empty EventMultiplexer creation."""
-    x = event_multiplexer.EventMultiplexer()
-    self.assertEqual(x.Runs(), {})
-
-  def testRunNamesRespected(self):
-    """Tests two EventAccumulators inserted/accessed in EventMultiplexer."""
-    x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
-    self.assertItemsEqual(sorted(x.Runs().keys()), ['run1', 'run2'])
-    self.assertEqual(x._GetAccumulator('run1')._path, 'path1')
-    self.assertEqual(x._GetAccumulator('run2')._path, 'path2')
-
-  def testReload(self):
-    """EventAccumulators should Reload after EventMultiplexer call it."""
-    x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
-    self.assertFalse(x._GetAccumulator('run1').reload_called)
-    self.assertFalse(x._GetAccumulator('run2').reload_called)
-    x.Reload()
-    self.assertTrue(x._GetAccumulator('run1').reload_called)
-    self.assertTrue(x._GetAccumulator('run2').reload_called)
-
-  def testScalars(self):
-    """Tests Scalars function returns suitable values."""
-    x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
-
-    run1_actual = x.Scalars('run1', 'sv1')
-    run1_expected = ['path1/sv1']
-
-    self.assertEqual(run1_expected, run1_actual)
-
-  def testHealthPills(self):
-    """Tests HealthPills() returns events associated with run1/Add."""
-    self.stubs.Set(event_accumulator, 'EventAccumulator',
-                   functools.partial(
-                       _GetFakeAccumulator,
-                       health_pill_mapping={'Add': ['hp1', 'hp2']}))
-    x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
-    self.assertEqual(['path1/hp1', 'path1/hp2'], x.HealthPills('run1', 'Add'))
-
-  def testGetOpsWithHealthPillsWhenHealthPillsAreNotAvailable(self):
-    # The event accumulator lacks health pills for the run.
-    x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
-    self.assertItemsEqual([], x.GetOpsWithHealthPills('run1'))
-
-  def testGetOpsWithHealthPillsWhenHealthPillsAreAvailable(self):
-    # The event accumulator has health pills for the run.
-    self.stubs.Set(event_accumulator, 'EventAccumulator',
-                   functools.partial(
-                       _GetFakeAccumulator,
-                       health_pill_mapping={'Add': ['hp1', 'hp2']}))
-    x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
-    self.assertItemsEqual(['Add'], x.GetOpsWithHealthPills('run1'))
-
-  def testExceptions(self):
-    """KeyError should be raised when accessing non-existing keys."""
-    x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
-    with self.assertRaises(KeyError):
-      x.Scalars('sv1', 'xxx')
-
-  def testInitialization(self):
-    """Tests EventMultiplexer is created properly with its params."""
-    x = event_multiplexer.EventMultiplexer()
-    self.assertEqual(x.Runs(), {})
-    x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
-    self.assertItemsEqual(x.Runs(), ['run1', 'run2'])
-    self.assertEqual(x._GetAccumulator('run1')._path, 'path1')
-    self.assertEqual(x._GetAccumulator('run2')._path, 'path2')
-
-  def testAddRunsFromDirectory(self):
-    """Tests AddRunsFromDirectory function.
-
-    Tests the following scenarios:
-    - When the directory does not exist.
-    - When the directory is empty.
-    - When the directory has empty subdirectory.
-    - Contains proper EventAccumulators after adding events.
-    """
-    x = event_multiplexer.EventMultiplexer()
-    tmpdir = self.get_temp_dir()
-    join = os.path.join
-    fakedir = join(tmpdir, 'fake_accumulator_directory')
-    realdir = join(tmpdir, 'real_accumulator_directory')
-    self.assertEqual(x.Runs(), {})
-    x.AddRunsFromDirectory(fakedir)
-    self.assertEqual(x.Runs(), {}, 'loading fakedir had no effect')
-
-    _CreateCleanDirectory(realdir)
-    x.AddRunsFromDirectory(realdir)
-    self.assertEqual(x.Runs(), {}, 'loading empty directory had no effect')
-
-    path1 = join(realdir, 'path1')
-    tf.gfile.MkDir(path1)
-    x.AddRunsFromDirectory(realdir)
-    self.assertEqual(x.Runs(), {}, 'creating empty subdirectory had no effect')
-
-    _AddEvents(path1)
-    x.AddRunsFromDirectory(realdir)
-    self.assertItemsEqual(x.Runs(), ['path1'], 'loaded run: path1')
-    loader1 = x._GetAccumulator('path1')
-    self.assertEqual(loader1._path, path1, 'has the correct path')
-
-    path2 = join(realdir, 'path2')
-    _AddEvents(path2)
-    x.AddRunsFromDirectory(realdir)
-    self.assertItemsEqual(x.Runs(), ['path1', 'path2'])
-    self.assertEqual(
-        x._GetAccumulator('path1'), loader1, 'loader1 not regenerated')
-
-    path2_2 = join(path2, 'path2')
-    _AddEvents(path2_2)
-    x.AddRunsFromDirectory(realdir)
-    self.assertItemsEqual(x.Runs(), ['path1', 'path2', 'path2/path2'])
-    self.assertEqual(
-        x._GetAccumulator('path2/path2')._path, path2_2, 'loader2 path correct')
-
-  def testAddRunsFromDirectoryThatContainsEvents(self):
-    x = event_multiplexer.EventMultiplexer()
-    tmpdir = self.get_temp_dir()
-    join = os.path.join
-    realdir = join(tmpdir, 'event_containing_directory')
-
-    _CreateCleanDirectory(realdir)
-
-    self.assertEqual(x.Runs(), {})
-
-    _AddEvents(realdir)
-    x.AddRunsFromDirectory(realdir)
-    self.assertItemsEqual(x.Runs(), ['.'])
-
-    subdir = join(realdir, 'subdir')
-    _AddEvents(subdir)
-    x.AddRunsFromDirectory(realdir)
-    self.assertItemsEqual(x.Runs(), ['.', 'subdir'])
-
-  def testAddRunsFromDirectoryWithRunNames(self):
-    x = event_multiplexer.EventMultiplexer()
-    tmpdir = self.get_temp_dir()
-    join = os.path.join
-    realdir = join(tmpdir, 'event_containing_directory')
-
-    _CreateCleanDirectory(realdir)
-
-    self.assertEqual(x.Runs(), {})
-
-    _AddEvents(realdir)
-    x.AddRunsFromDirectory(realdir, 'foo')
-    self.assertItemsEqual(x.Runs(), ['foo/.'])
-
-    subdir = join(realdir, 'subdir')
-    _AddEvents(subdir)
-    x.AddRunsFromDirectory(realdir, 'foo')
-    self.assertItemsEqual(x.Runs(), ['foo/.', 'foo/subdir'])
-
-  def testAddRunsFromDirectoryWalksTree(self):
-    x = event_multiplexer.EventMultiplexer()
-    tmpdir = self.get_temp_dir()
-    join = os.path.join
-    realdir = join(tmpdir, 'event_containing_directory')
-
-    _CreateCleanDirectory(realdir)
-    _AddEvents(realdir)
-    sub = join(realdir, 'subdirectory')
-    sub1 = join(sub, '1')
-    sub2 = join(sub, '2')
-    sub1_1 = join(sub1, '1')
-    _AddEvents(sub1)
-    _AddEvents(sub2)
-    _AddEvents(sub1_1)
-    x.AddRunsFromDirectory(realdir)
-
-    self.assertItemsEqual(x.Runs(), ['.', 'subdirectory/1', 'subdirectory/2',
-                                     'subdirectory/1/1'])
-
-  def testAddRunsFromDirectoryThrowsException(self):
-    x = event_multiplexer.EventMultiplexer()
-    tmpdir = self.get_temp_dir()
-
-    filepath = _AddEvents(tmpdir)
-    with self.assertRaises(ValueError):
-      x.AddRunsFromDirectory(filepath)
-
-  def testAddRun(self):
-    x = event_multiplexer.EventMultiplexer()
-    x.AddRun('run1_path', 'run1')
-    run1 = x._GetAccumulator('run1')
-    self.assertEqual(sorted(x.Runs().keys()), ['run1'])
-    self.assertEqual(run1._path, 'run1_path')
-
-    x.AddRun('run1_path', 'run1')
-    self.assertEqual(run1, x._GetAccumulator('run1'), 'loader not recreated')
-
-    x.AddRun('run2_path', 'run1')
-    new_run1 = x._GetAccumulator('run1')
-    self.assertEqual(new_run1._path, 'run2_path')
-    self.assertNotEqual(run1, new_run1)
-
-    x.AddRun('runName3')
-    self.assertItemsEqual(sorted(x.Runs().keys()), ['run1', 'runName3'])
-    self.assertEqual(x._GetAccumulator('runName3')._path, 'runName3')
-
-  def testAddRunMaintainsLoading(self):
-    x = event_multiplexer.EventMultiplexer()
-    x.Reload()
-    x.AddRun('run1')
-    x.AddRun('run2')
-    self.assertTrue(x._GetAccumulator('run1').reload_called)
-    self.assertTrue(x._GetAccumulator('run2').reload_called)
-
-
-class EventMultiplexerWithRealAccumulatorTest(tf.test.TestCase):
-
-  def testDeletingDirectoryRemovesRun(self):
-    x = event_multiplexer.EventMultiplexer()
-    tmpdir = self.get_temp_dir()
-    join = os.path.join
-    run1_dir = join(tmpdir, 'run1')
-    run2_dir = join(tmpdir, 'run2')
-    run3_dir = join(tmpdir, 'run3')
-
-    for dirname in [run1_dir, run2_dir, run3_dir]:
-      _AddEvents(dirname)
-
-    x.AddRun(run1_dir, 'run1')
-    x.AddRun(run2_dir, 'run2')
-    x.AddRun(run3_dir, 'run3')
-
-    x.Reload()
-
-    # Delete the directory, then reload.
-    shutil.rmtree(run2_dir)
-    x.Reload()
-    self.assertNotIn('run2', x.Runs().keys())
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/tensorboard/backend/event_processing/io_wrapper.py b/tensorflow/tensorboard/backend/event_processing/io_wrapper.py
deleted file mode 100644
index c185f26a4fd4ce6a9467df42986670ec44b6c37d..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/event_processing/io_wrapper.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""IO helper functions."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-import tensorflow as tf
-
-
-def IsGCSPath(path):
-  return path.startswith("gs://")
-
-
-def ListDirectoryAbsolute(directory):
-  """Yields all files in the given directory. The paths are absolute."""
-  return (os.path.join(directory, path)
-          for path in tf.gfile.ListDirectory(directory))
-
-
-def ListRecursively(top):
-  """Walks a directory tree, yielding (dir_path, file_paths) tuples.
-
-  For each of `top` and its subdirectories, yields a tuple containing the path
-  to the directory and the path to each of the contained files.  Note that
-  unlike os.Walk()/tf.gfile.Walk(), this does not list subdirectories and the
-  file paths are all absolute.
-
-  If the directory does not exist, this yields nothing.
-
-  Args:
-    top: A path to a directory..
-  Yields:
-    A list of (dir_path, file_paths) tuples.
-  """
-  for dir_path, _, filenames in tf.gfile.Walk(top):
-    yield (dir_path, (os.path.join(dir_path, filename)
-                      for filename in filenames))
diff --git a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util.py b/tensorflow/tensorboard/backend/event_processing/plugin_asset_util.py
deleted file mode 100644
index 5fb7128424487543bd240e09d61d3294b32905b9..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Load plugin assets from disk."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os.path
-
-
-import tensorflow as tf
-
-
-_PLUGINS_DIR = "plugins"
-
-
-def _IsDirectory(parent, item):
-  """Helper that returns if parent/item is a directory."""
-  return tf.gfile.IsDirectory(os.path.join(parent, item))
-
-
-def PluginDirectory(logdir, plugin_name):
-  """Returns the plugin directory for plugin_name."""
-  return os.path.join(logdir, _PLUGINS_DIR, plugin_name)
-
-
-def ListPlugins(logdir):
-  """List all the plugins that have registered assets in logdir.
-
-  If the plugins_dir does not exist, it returns an empty list. This maintains
-  compatibility with old directories that have no plugins written.
-
-  Args:
-    logdir: A directory that was created by a TensorFlow events writer.
-
-  Returns:
-    a list of plugin names, as strings
-  """
-  plugins_dir = os.path.join(logdir, _PLUGINS_DIR)
-  if not tf.gfile.IsDirectory(plugins_dir):
-    return []
-  entries = tf.gfile.ListDirectory(plugins_dir)
-  return [x for x in entries if _IsDirectory(plugins_dir, x)]
-
-
-def ListAssets(logdir, plugin_name):
-  """List all the assets that are available for given plugin in a logdir.
-
-  Args:
-    logdir: A directory that was created by a TensorFlow summary.FileWriter.
-    plugin_name: A string name of a plugin to list assets for.
-
-  Returns:
-    A string list of available plugin assets. If the plugin subdirectory does
-    not exist (either because the logdir doesn't exist, or because the plugin
-    didn't register) an empty list is returned.
-  """
-  plugin_dir = PluginDirectory(logdir, plugin_name)
-  if not tf.gfile.IsDirectory(plugin_dir):
-    return []
-  entries = tf.gfile.ListDirectory(plugin_dir)
-  return [x for x in entries if not _IsDirectory(plugin_dir, x)]
-
-
-def RetrieveAsset(logdir, plugin_name, asset_name):
-  """Retrieve a particular plugin asset from a logdir.
-
-  Args:
-    logdir: A directory that was created by a TensorFlow summary.FileWriter.
-    plugin_name: The plugin we want an asset from.
-    asset_name: The name of the requested asset.
-
-  Returns:
-    string contents of the plugin asset.
-
-  Raises:
-    KeyError: if the asset does not exist.
-  """
-
-  asset_path = os.path.join(PluginDirectory(logdir, plugin_name), asset_name)
-  try:
-    with tf.gfile.Open(asset_path, "r") as f:
-      return f.read()
-  except tf.errors.NotFoundError:
-    raise KeyError("Asset path %s not found" % asset_path)
-  except tf.errors.OpError as e:
-    raise KeyError("Couldn't read asset path: %s, OpError %s" % (asset_path, e))
diff --git a/tensorflow/tensorboard/backend/event_processing/reservoir.py b/tensorflow/tensorboard/backend/event_processing/reservoir.py
deleted file mode 100644
index 0a1252e63520d732a1bbc10491e7aaefdf08ea23..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/event_processing/reservoir.py
+++ /dev/null
@@ -1,253 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""A key-value[] store that implements reservoir sampling on the values."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import random
-import threading
-
-
-class Reservoir(object):
-  """A map-to-arrays container, with deterministic Reservoir Sampling.
-
-  Items are added with an associated key. Items may be retrieved by key, and
-  a list of keys can also be retrieved. If size is not zero, then it dictates
-  the maximum number of items that will be stored with each key. Once there are
-  more items for a given key, they are replaced via reservoir sampling, such
-  that each item has an equal probability of being included in the sample.
-
-  Deterministic means that for any given seed and bucket size, the sequence of
-  values that are kept for any given tag will always be the same, and that this
-  is independent of any insertions on other tags. That is:
-
-  >>> separate_reservoir = reservoir.Reservoir(10)
-  >>> interleaved_reservoir = reservoir.Reservoir(10)
-  >>> for i in xrange(100):
-  >>>   separate_reservoir.AddItem('key1', i)
-  >>> for i in xrange(100):
-  >>>   separate_reservoir.AddItem('key2', i)
-  >>> for i in xrange(100):
-  >>>   interleaved_reservoir.AddItem('key1', i)
-  >>>   interleaved_reservoir.AddItem('key2', i)
-
-  separate_reservoir and interleaved_reservoir will be in identical states.
-
-  See: https://en.wikipedia.org/wiki/Reservoir_sampling
-
-  Adding items has amortized O(1) runtime.
-
-  """
-
-  def __init__(self, size, seed=0, always_keep_last=True):
-    """Creates a new reservoir.
-
-    Args:
-      size: The number of values to keep in the reservoir for each tag. If 0,
-        all values will be kept.
-      seed: The seed of the random number generator to use when sampling.
-        Different values for |seed| will produce different samples from the same
-        input items.
-      always_keep_last: Whether to always keep the latest seen item in the
-        end of the reservoir. Defaults to True.
-
-    Raises:
-      ValueError: If size is negative or not an integer.
-    """
-    if size < 0 or size != round(size):
-      raise ValueError('size must be nonegative integer, was %s' % size)
-    self._buckets = collections.defaultdict(
-        lambda: _ReservoirBucket(size, random.Random(seed), always_keep_last))
-    # _mutex guards the keys - creating new keys, retrieving by key, etc
-    # the internal items are guarded by the ReservoirBuckets' internal mutexes
-    self._mutex = threading.Lock()
-
-  def Keys(self):
-    """Return all the keys in the reservoir.
-
-    Returns:
-      ['list', 'of', 'keys'] in the Reservoir.
-    """
-    with self._mutex:
-      return list(self._buckets.keys())
-
-  def Items(self, key):
-    """Return items associated with given key.
-
-    Args:
-      key: The key for which we are finding associated items.
-
-    Raises:
-      KeyError: If the key is not found in the reservoir.
-
-    Returns:
-      [list, of, items] associated with that key.
-    """
-    with self._mutex:
-      if key not in self._buckets:
-        raise KeyError('Key %s was not found in Reservoir' % key)
-      bucket = self._buckets[key]
-    return bucket.Items()
-
-  def AddItem(self, key, item, f=lambda x: x):
-    """Add a new item to the Reservoir with the given tag.
-
-    If the reservoir has not yet reached full size, the new item is guaranteed
-    to be added. If the reservoir is full, then behavior depends on the
-    always_keep_last boolean.
-
-    If always_keep_last was set to true, the new item is guaranteed to be added
-    to the reservoir, and either the previous last item will be replaced, or
-    (with low probability) an older item will be replaced.
-
-    If always_keep_last was set to false, then the new item will replace an
-    old item with low probability.
-
-    If f is provided, it will be applied to transform item (lazily, iff item is
-      going to be included in the reservoir).
-
-    Args:
-      key: The key to store the item under.
-      item: The item to add to the reservoir.
-      f: An optional function to transform the item prior to addition.
-    """
-    with self._mutex:
-      bucket = self._buckets[key]
-    bucket.AddItem(item, f)
-
-  def FilterItems(self, filterFn, key=None):
-    """Filter items within a Reservoir, using a filtering function.
-
-    Args:
-      filterFn: A function that returns True for the items to be kept.
-      key: An optional bucket key to filter. If not specified, will filter all
-        all buckets.
-
-    Returns:
-      The number of items removed.
-    """
-    with self._mutex:
-      if key:
-        if key in self._buckets:
-          return self._buckets[key].FilterItems(filterFn)
-        else:
-          return 0
-      else:
-        return sum(bucket.FilterItems(filterFn)
-                   for bucket in self._buckets.values())
-
-
-class _ReservoirBucket(object):
-  """A container for items from a stream, that implements reservoir sampling.
-
-  It always stores the most recent item as its final item.
-  """
-
-  def __init__(self, _max_size, _random=None, always_keep_last=True):
-    """Create the _ReservoirBucket.
-
-    Args:
-      _max_size: The maximum size the reservoir bucket may grow to. If size is
-        zero, the bucket has unbounded size.
-      _random: The random number generator to use. If not specified, defaults to
-        random.Random(0).
-      always_keep_last: Whether the latest seen item should always be included
-        in the end of the bucket.
-
-    Raises:
-      ValueError: if the size is not a nonnegative integer.
-    """
-    if _max_size < 0 or _max_size != round(_max_size):
-      raise ValueError('_max_size must be nonegative int, was %s' % _max_size)
-    self.items = []
-    # This mutex protects the internal items, ensuring that calls to Items and
-    # AddItem are thread-safe
-    self._mutex = threading.Lock()
-    self._max_size = _max_size
-    self._num_items_seen = 0
-    if _random is not None:
-      self._random = _random
-    else:
-      self._random = random.Random(0)
-    self.always_keep_last = always_keep_last
-
-  def AddItem(self, item, f=lambda x: x):
-    """Add an item to the ReservoirBucket, replacing an old item if necessary.
-
-    The new item is guaranteed to be added to the bucket, and to be the last
-    element in the bucket. If the bucket has reached capacity, then an old item
-    will be replaced. With probability (_max_size/_num_items_seen) a random item
-    in the bucket will be popped out and the new item will be appended
-    to the end. With probability (1 - _max_size/_num_items_seen)
-    the last item in the bucket will be replaced.
-
-    Since the O(n) replacements occur with O(1/_num_items_seen) likelihood,
-    the amortized runtime is O(1).
-
-    Args:
-      item: The item to add to the bucket.
-      f: A function to transform item before addition, if it will be kept in
-        the reservoir.
-    """
-    with self._mutex:
-      if len(self.items) < self._max_size or self._max_size == 0:
-        self.items.append(f(item))
-      else:
-        r = self._random.randint(0, self._num_items_seen)
-        if r < self._max_size:
-          self.items.pop(r)
-          self.items.append(f(item))
-        elif self.always_keep_last:
-          self.items[-1] = f(item)
-      self._num_items_seen += 1
-
-  def FilterItems(self, filterFn):
-    """Filter items in a ReservoirBucket, using a filtering function.
-
-    Filtering items from the reservoir bucket must update the
-    internal state variable self._num_items_seen, which is used for determining
-    the rate of replacement in reservoir sampling. Ideally, self._num_items_seen
-    would contain the exact number of items that have ever seen by the
-    ReservoirBucket and satisfy filterFn. However, the ReservoirBucket does not
-    have access to all items seen -- it only has access to the subset of items
-    that have survived sampling (self.items). Therefore, we estimate
-    self._num_items_seen by scaling it by the same ratio as the ratio of items
-    not removed from self.items.
-
-    Args:
-      filterFn: A function that returns True for items to be kept.
-
-    Returns:
-      The number of items removed from the bucket.
-    """
-    with self._mutex:
-      size_before = len(self.items)
-      self.items = list(filter(filterFn, self.items))
-      size_diff = size_before - len(self.items)
-
-      # Estimate a correction the number of items seen
-      prop_remaining = len(self.items) / float(
-          size_before) if size_before > 0 else 0
-      self._num_items_seen = int(round(self._num_items_seen * prop_remaining))
-      return size_diff
-
-  def Items(self):
-    """Get all the items in the bucket."""
-    with self._mutex:
-      return list(self.items)
diff --git a/tensorflow/tensorboard/backend/event_processing/reservoir_test.py b/tensorflow/tensorboard/backend/event_processing/reservoir_test.py
deleted file mode 100644
index df4757e245453341f9ae9c1519b0d9f6c2f6b7f5..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/event_processing/reservoir_test.py
+++ /dev/null
@@ -1,279 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-
-from tensorflow.tensorboard.backend.event_processing import reservoir
-
-
-class ReservoirTest(tf.test.TestCase):
-
-  def testEmptyReservoir(self):
-    r = reservoir.Reservoir(1)
-    self.assertFalse(r.Keys())
-
-  def testRespectsSize(self):
-    r = reservoir.Reservoir(42)
-    self.assertEqual(r._buckets['meaning of life']._max_size, 42)
-
-  def testItemsAndKeys(self):
-    r = reservoir.Reservoir(42)
-    r.AddItem('foo', 4)
-    r.AddItem('bar', 9)
-    r.AddItem('foo', 19)
-    self.assertItemsEqual(r.Keys(), ['foo', 'bar'])
-    self.assertEqual(r.Items('foo'), [4, 19])
-    self.assertEqual(r.Items('bar'), [9])
-
-  def testExceptions(self):
-    with self.assertRaises(ValueError):
-      reservoir.Reservoir(-1)
-    with self.assertRaises(ValueError):
-      reservoir.Reservoir(13.3)
-
-    r = reservoir.Reservoir(12)
-    with self.assertRaises(KeyError):
-      r.Items('missing key')
-
-  def testDeterminism(self):
-    """Tests that the reservoir is deterministic."""
-    key = 'key'
-    r1 = reservoir.Reservoir(10)
-    r2 = reservoir.Reservoir(10)
-    for i in xrange(100):
-      r1.AddItem('key', i)
-      r2.AddItem('key', i)
-
-    self.assertEqual(r1.Items(key), r2.Items(key))
-
-  def testBucketDeterminism(self):
-    """Tests that reservoirs are deterministic at a bucket level.
-
-    This means that only the order elements are added within a bucket matters.
-    """
-    separate_reservoir = reservoir.Reservoir(10)
-    interleaved_reservoir = reservoir.Reservoir(10)
-    for i in xrange(100):
-      separate_reservoir.AddItem('key1', i)
-    for i in xrange(100):
-      separate_reservoir.AddItem('key2', i)
-    for i in xrange(100):
-      interleaved_reservoir.AddItem('key1', i)
-      interleaved_reservoir.AddItem('key2', i)
-
-    for key in ['key1', 'key2']:
-      self.assertEqual(
-          separate_reservoir.Items(key), interleaved_reservoir.Items(key))
-
-  def testUsesSeed(self):
-    """Tests that reservoirs with different seeds keep different samples."""
-    key = 'key'
-    r1 = reservoir.Reservoir(10, seed=0)
-    r2 = reservoir.Reservoir(10, seed=1)
-    for i in xrange(100):
-      r1.AddItem('key', i)
-      r2.AddItem('key', i)
-    self.assertNotEqual(r1.Items(key), r2.Items(key))
-
-  def testFilterItemsByKey(self):
-    r = reservoir.Reservoir(100, seed=0)
-    for i in xrange(10):
-      r.AddItem('key1', i)
-      r.AddItem('key2', i)
-
-    self.assertEqual(len(r.Items('key1')), 10)
-    self.assertEqual(len(r.Items('key2')), 10)
-
-    self.assertEqual(r.FilterItems(lambda x: x <= 7, 'key2'), 2)
-    self.assertEqual(len(r.Items('key2')), 8)
-    self.assertEqual(len(r.Items('key1')), 10)
-
-    self.assertEqual(r.FilterItems(lambda x: x <= 3, 'key1'), 6)
-    self.assertEqual(len(r.Items('key1')), 4)
-    self.assertEqual(len(r.Items('key2')), 8)
-
-
-class ReservoirBucketTest(tf.test.TestCase):
-
-  def testEmptyBucket(self):
-    b = reservoir._ReservoirBucket(1)
-    self.assertFalse(b.Items())
-
-  def testFillToSize(self):
-    b = reservoir._ReservoirBucket(100)
-    for i in xrange(100):
-      b.AddItem(i)
-    self.assertEqual(b.Items(), list(xrange(100)))
-    self.assertEqual(b._num_items_seen, 100)
-
-  def testDoesntOverfill(self):
-    b = reservoir._ReservoirBucket(10)
-    for i in xrange(1000):
-      b.AddItem(i)
-    self.assertEqual(len(b.Items()), 10)
-    self.assertEqual(b._num_items_seen, 1000)
-
-  def testMaintainsOrder(self):
-    b = reservoir._ReservoirBucket(100)
-    for i in xrange(10000):
-      b.AddItem(i)
-    items = b.Items()
-    prev = -1
-    for item in items:
-      self.assertTrue(item > prev)
-      prev = item
-
-  def testKeepsLatestItem(self):
-    b = reservoir._ReservoirBucket(5)
-    for i in xrange(100):
-      b.AddItem(i)
-      last = b.Items()[-1]
-      self.assertEqual(last, i)
-
-  def testSizeOneBucket(self):
-    b = reservoir._ReservoirBucket(1)
-    for i in xrange(20):
-      b.AddItem(i)
-      self.assertEqual(b.Items(), [i])
-    self.assertEqual(b._num_items_seen, 20)
-
-  def testSizeZeroBucket(self):
-    b = reservoir._ReservoirBucket(0)
-    for i in xrange(20):
-      b.AddItem(i)
-      self.assertEqual(b.Items(), list(range(i + 1)))
-    self.assertEqual(b._num_items_seen, 20)
-
-  def testSizeRequirement(self):
-    with self.assertRaises(ValueError):
-      reservoir._ReservoirBucket(-1)
-    with self.assertRaises(ValueError):
-      reservoir._ReservoirBucket(10.3)
-
-  def testRemovesItems(self):
-    b = reservoir._ReservoirBucket(100)
-    for i in xrange(10):
-      b.AddItem(i)
-    self.assertEqual(len(b.Items()), 10)
-    self.assertEqual(b._num_items_seen, 10)
-    self.assertEqual(b.FilterItems(lambda x: x <= 7), 2)
-    self.assertEqual(len(b.Items()), 8)
-    self.assertEqual(b._num_items_seen, 8)
-
-  def testRemovesItemsWhenItemsAreReplaced(self):
-    b = reservoir._ReservoirBucket(100)
-    for i in xrange(10000):
-      b.AddItem(i)
-    self.assertEqual(b._num_items_seen, 10000)
-
-    # Remove items
-    num_removed = b.FilterItems(lambda x: x <= 7)
-    self.assertGreater(num_removed, 92)
-    self.assertEqual([], [item for item in b.Items() if item > 7])
-    self.assertEqual(b._num_items_seen,
-                     int(round(10000 * (1 - float(num_removed) / 100))))
-
-  def testLazyFunctionEvaluationAndAlwaysKeepLast(self):
-
-    class FakeRandom(object):
-
-      def randint(self, a, b):  # pylint:disable=unused-argument
-        return 999
-
-    class Incrementer(object):
-
-      def __init__(self):
-        self.n = 0
-
-      def increment_and_double(self, x):
-        self.n += 1
-        return x * 2
-
-    # We've mocked the randomness generator, so that once it is full, the last
-    # item will never get durable reservoir inclusion. Since always_keep_last is
-    # false, the function should only get invoked 100 times while filling up
-    # the reservoir. This laziness property is an essential performance
-    # optimization.
-    b = reservoir._ReservoirBucket(100, FakeRandom(), always_keep_last=False)
-    incrementer = Incrementer()
-    for i in xrange(1000):
-      b.AddItem(i, incrementer.increment_and_double)
-    self.assertEqual(incrementer.n, 100)
-    self.assertEqual(b.Items(), [x * 2 for x in xrange(100)])
-
-    # This time, we will always keep the last item, meaning that the function
-    # should get invoked once for every item we add.
-    b = reservoir._ReservoirBucket(100, FakeRandom(), always_keep_last=True)
-    incrementer = Incrementer()
-
-    for i in xrange(1000):
-      b.AddItem(i, incrementer.increment_and_double)
-    self.assertEqual(incrementer.n, 1000)
-    self.assertEqual(b.Items(), [x * 2 for x in xrange(99)] + [999 * 2])
-
-
-class ReservoirBucketStatisticalDistributionTest(tf.test.TestCase):
-
-  def setUp(self):
-    self.total = 1000000
-    self.samples = 10000
-    self.n_buckets = 100
-    self.total_per_bucket = self.total // self.n_buckets
-    self.assertEqual(self.total % self.n_buckets, 0, 'total must be evenly '
-                     'divisible by the number of buckets')
-    self.assertTrue(self.total > self.samples, 'need to have more items '
-                    'than samples')
-
-  def AssertBinomialQuantity(self, measured):
-    p = 1.0 * self.n_buckets / self.samples
-    mean = p * self.samples
-    variance = p * (1 - p) * self.samples
-    error = measured - mean
-    # Given that the buckets were actually binomially distributed, this
-    # fails with probability ~2E-9
-    passed = error * error <= 36.0 * variance
-    self.assertTrue(passed, 'found a bucket with measured %d '
-                    'too far from expected %d' % (measured, mean))
-
-  def testBucketReservoirSamplingViaStatisticalProperties(self):
-    # Not related to a 'ReservoirBucket', but instead number of buckets we put
-    # samples into for testing the shape of the distribution
-    b = reservoir._ReservoirBucket(_max_size=self.samples)
-    # add one extra item because we always keep the most recent item, which
-    # would skew the distribution; we can just slice it off the end instead.
-    for i in xrange(self.total + 1):
-      b.AddItem(i)
-
-    divbins = [0] * self.n_buckets
-    modbins = [0] * self.n_buckets
-    # Slice off the last item when we iterate.
-    for item in b.Items()[0:-1]:
-      divbins[item // self.total_per_bucket] += 1
-      modbins[item % self.n_buckets] += 1
-
-    for bucket_index in xrange(self.n_buckets):
-      divbin = divbins[bucket_index]
-      modbin = modbins[bucket_index]
-      self.AssertBinomialQuantity(divbin)
-      self.AssertBinomialQuantity(modbin)
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/tensorboard/backend/http_util.py b/tensorflow/tensorboard/backend/http_util.py
deleted file mode 100644
index 81a06a5f14cfb41734154427ad33183db00dcada..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/http_util.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""TensorBoard HTTP utilities."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import gzip
-import json
-import re
-import time
-import wsgiref.handlers
-
-import six
-import tensorflow as tf
-from werkzeug import wrappers
-
-from tensorflow.tensorboard.backend import json_util
-
-
-_EXTRACT_MIMETYPE_PATTERN = re.compile(r'^[^;\s]*')
-_EXTRACT_CHARSET_PATTERN = re.compile(r'charset=([-_0-9A-Za-z]+)')
-
-# Allows *, gzip or x-gzip, but forbid gzip;q=0
-# https://tools.ietf.org/html/rfc7231#section-5.3.4
-_ALLOWS_GZIP_PATTERN = re.compile(
-    r'(?:^|,|\s)(?:(?:x-)?gzip|\*)(?!;q=0)(?:\s|,|$)')
-
-_TEXTUAL_MIMETYPES = set([
-    'application/javascript',
-    'application/json',
-    'application/json+protobuf',
-    'image/svg+xml',
-    'text/css',
-    'text/csv',
-    'text/html',
-    'text/plain',
-    'text/tab-separated-values',
-    'text/x-protobuf',
-])
-
-_JSON_MIMETYPES = set([
-    'application/json',
-    'application/json+protobuf',
-])
-
-
-def Respond(request,
-            content,
-            content_type,
-            code=200,
-            expires=0,
-            content_encoding=None,
-            encoding='utf-8'):
-  """Construct a werkzeug Response.
-
-  Responses are transmitted to the browser with compression if: a) the browser
-  supports it; b) it's sane to compress the content_type in question; and c)
-  the content isn't already compressed, as indicated by the content_encoding
-  parameter.
-
-  Browser and proxy caching is completely disabled by default. If the expires
-  parameter is greater than zero then the response will be able to be cached by
-  the browser for that many seconds; however, proxies are still forbidden from
-  caching so that developers can bypass the cache with Ctrl+Shift+R.
-
-  For textual content that isn't JSON, the encoding parameter is used as the
-  transmission charset which is automatically appended to the Content-Type
-  header. That is unless of course the content_type parameter contains a
-  charset parameter. If the two disagree, the characters in content will be
-  transcoded to the latter.
-
-  If content_type declares a JSON media type, then content MAY be a dict, list,
-  tuple, or set, in which case this function has an implicit composition with
-  json_util.Cleanse and json.dumps. The encoding parameter is used to decode
-  byte strings within the JSON object; therefore transmitting binary data
-  within JSON is not permitted. JSON is transmitted as ASCII unless the
-  content_type parameter explicitly defines a charset parameter, in which case
-  the serialized JSON bytes will use that instead of escape sequences.
-
-  Args:
-    request: A werkzeug Request object. Used mostly to check the
-      Accept-Encoding header.
-    content: Payload data as byte string, unicode string, or maybe JSON.
-    content_type: Media type and optionally an output charset.
-    code: Numeric HTTP status code to use.
-    expires: Second duration for browser caching.
-    content_encoding: Encoding if content is already encoded, e.g. 'gzip'.
-    encoding: Input charset if content parameter has byte strings.
-
-  Returns:
-    A werkzeug Response object (a WSGI application).
-  """
-
-  mimetype = _EXTRACT_MIMETYPE_PATTERN.search(content_type).group(0)
-  charset_match = _EXTRACT_CHARSET_PATTERN.search(content_type)
-  charset = charset_match.group(1) if charset_match else encoding
-  textual = charset_match or mimetype in _TEXTUAL_MIMETYPES
-  if mimetype in _JSON_MIMETYPES and (isinstance(content, dict) or
-                                      isinstance(content, list) or
-                                      isinstance(content, set) or
-                                      isinstance(content, tuple)):
-    content = json.dumps(json_util.Cleanse(content, encoding),
-                         ensure_ascii=not charset_match)
-  if charset != encoding:
-    content = tf.compat.as_text(content, encoding)
-  content = tf.compat.as_bytes(content, charset)
-  if textual and not charset_match and mimetype not in _JSON_MIMETYPES:
-    content_type += '; charset=' + charset
-  if (not content_encoding and textual and
-      _ALLOWS_GZIP_PATTERN.search(request.headers.get('Accept-Encoding', ''))):
-    out = six.BytesIO()
-    f = gzip.GzipFile(fileobj=out, mode='wb', compresslevel=3)
-    f.write(content)
-    f.close()
-    content = out.getvalue()
-    content_encoding = 'gzip'
-  if request.method == 'HEAD':
-    content = ''
-  headers = []
-
-  headers.append(('Content-Length', str(len(content))))
-  if content_encoding:
-    headers.append(('Content-Encoding', content_encoding))
-  if expires > 0:
-    e = wsgiref.handlers.format_date_time(time.time() + float(expires))
-    headers.append(('Expires', e))
-    headers.append(('Cache-Control', 'private, max-age=%d' % expires))
-  else:
-    headers.append(('Expires', '0'))
-    headers.append(('Cache-Control', 'no-cache, must-revalidate'))
-
-  return wrappers.Response(
-      response=content, status=code, headers=headers, content_type=content_type)
diff --git a/tensorflow/tensorboard/backend/http_util_test.py b/tensorflow/tensorboard/backend/http_util_test.py
deleted file mode 100644
index 6b0c8d3403b547f9d2e96c791e484769456a0512..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/http_util_test.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests HTTP utilities."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import gzip
-
-import six
-import tensorflow as tf
-from werkzeug import test as wtest
-from werkzeug import wrappers
-from tensorflow.tensorboard.backend import http_util
-
-
-class RespondTest(tf.test.TestCase):
-
-  def testHelloWorld(self):
-    q = wrappers.Request(wtest.EnvironBuilder().get_environ())
-    r = http_util.Respond(q, '<b>hello world</b>', 'text/html')
-    self.assertEqual(r.status_code, 200)
-    self.assertEqual(r.response[0], six.b('<b>hello world</b>'))
-
-  def testHeadRequest_doesNotWrite(self):
-    builder = wtest.EnvironBuilder(method='HEAD')
-    env = builder.get_environ()
-    request = wrappers.Request(env)
-    r = http_util.Respond(request, '<b>hello world</b>', 'text/html')
-    self.assertEqual(r.status_code, 200)
-    self.assertEqual(r.response[0], six.b(''))
-
-  def testPlainText_appendsUtf8ToContentType(self):
-    q = wrappers.Request(wtest.EnvironBuilder().get_environ())
-    r = http_util.Respond(q, 'hello', 'text/plain')
-    h = r.headers
-    self.assertEqual(h.get('Content-Type'), 'text/plain; charset=utf-8')
-
-  def testContentLength_isInBytes(self):
-    q = wrappers.Request(wtest.EnvironBuilder().get_environ())
-    r = http_util.Respond(q, '爱', 'text/plain')
-    self.assertEqual(r.headers.get('Content-Length'), '3')
-    q = wrappers.Request(wtest.EnvironBuilder().get_environ())
-    r = http_util.Respond(q, '爱'.encode('utf-8'), 'text/plain')
-    self.assertEqual(r.headers.get('Content-Length'), '3')
-
-  def testResponseCharsetTranscoding(self):
-    bean = '要依法治国是赞美那些谁是公义的和惩罚恶人。 - 韩非'
-
-    # input is unicode string, output is gbk string
-    q = wrappers.Request(wtest.EnvironBuilder().get_environ())
-    r = http_util.Respond(q, bean, 'text/plain; charset=gbk')
-    self.assertEqual(r.response[0], bean.encode('gbk'))
-
-    # input is utf-8 string, output is gbk string
-    q = wrappers.Request(wtest.EnvironBuilder().get_environ())
-    r = http_util.Respond(q, bean.encode('utf-8'), 'text/plain; charset=gbk')
-    self.assertEqual(r.response[0], bean.encode('gbk'))
-
-    # input is object with unicode strings, output is gbk json
-    q = wrappers.Request(wtest.EnvironBuilder().get_environ())
-    r = http_util.Respond(q, {'red': bean}, 'application/json; charset=gbk')
-    self.assertEqual(r.response[0], b'{"red": "' + bean.encode('gbk') + b'"}')
-
-    # input is object with utf-8 strings, output is gbk json
-    q = wrappers.Request(wtest.EnvironBuilder().get_environ())
-    r = http_util.Respond(
-        q, {'red': bean.encode('utf-8')}, 'application/json; charset=gbk')
-    self.assertEqual(r.response[0], b'{"red": "' + bean.encode('gbk') + b'"}')
-
-    # input is object with gbk strings, output is gbk json
-    q = wrappers.Request(wtest.EnvironBuilder().get_environ())
-    r = http_util.Respond(
-        q, {'red': bean.encode('gbk')},
-        'application/json; charset=gbk',
-        encoding='gbk')
-    self.assertEqual(r.response[0], b'{"red": "' + bean.encode('gbk') + b'"}')
-
-  def testAcceptGzip_compressesResponse(self):
-    fall_of_hyperion_canto1_stanza1 = '\n'.join([
-        'Fanatics have their dreams, wherewith they weave',
-        'A paradise for a sect; the savage too',
-        'From forth the loftiest fashion of his sleep',
-        'Guesses at Heaven; pity these have not',
-        'Trac\'d upon vellum or wild Indian leaf',
-        'The shadows of melodious utterance.',
-        'But bare of laurel they live, dream, and die;',
-        'For Poesy alone can tell her dreams,',
-        'With the fine spell of words alone can save',
-        'Imagination from the sable charm',
-        'And dumb enchantment. Who alive can say,',
-        '\'Thou art no Poet may\'st not tell thy dreams?\'',
-        'Since every man whose soul is not a clod',
-        'Hath visions, and would speak, if he had loved',
-        'And been well nurtured in his mother tongue.',
-        'Whether the dream now purpos\'d to rehearse',
-        'Be poet\'s or fanatic\'s will be known',
-        'When this warm scribe my hand is in the grave.',
-    ])
-
-    e1 = wtest.EnvironBuilder(headers={'Accept-Encoding': '*'}).get_environ()
-    any_encoding = wrappers.Request(e1)
-
-    r = http_util.Respond(
-        any_encoding, fall_of_hyperion_canto1_stanza1, 'text/plain')
-    self.assertEqual(r.headers.get('Content-Encoding'), 'gzip')
-
-    self.assertEqual(
-        _gunzip(r.response[0]), fall_of_hyperion_canto1_stanza1.encode('utf-8'))
-
-    e2 = wtest.EnvironBuilder(headers={'Accept-Encoding': 'gzip'}).get_environ()
-    gzip_encoding = wrappers.Request(e2)
-
-    r = http_util.Respond(
-        gzip_encoding, fall_of_hyperion_canto1_stanza1, 'text/plain')
-    self.assertEqual(r.headers.get('Content-Encoding'), 'gzip')
-    self.assertEqual(
-        _gunzip(r.response[0]), fall_of_hyperion_canto1_stanza1.encode('utf-8'))
-
-    r = http_util.Respond(
-        any_encoding, fall_of_hyperion_canto1_stanza1, 'image/png')
-    self.assertEqual(
-        r.response[0], fall_of_hyperion_canto1_stanza1.encode('utf-8'))
-
-  def testJson_getsAutoSerialized(self):
-    q = wrappers.Request(wtest.EnvironBuilder().get_environ())
-    r = http_util.Respond(q, [1, 2, 3], 'application/json')
-    self.assertEqual(r.response[0], b'[1, 2, 3]')
-
-  def testExpires_setsCruiseControl(self):
-    q = wrappers.Request(wtest.EnvironBuilder().get_environ())
-    r = http_util.Respond(q, '<b>hello world</b>', 'text/html', expires=60)
-    self.assertEqual(r.headers.get('Cache-Control'), 'private, max-age=60')
-
-
-def _gunzip(bs):
-  return gzip.GzipFile('', 'rb', 9, six.BytesIO(bs)).read()
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/tensorboard/backend/json_util.py b/tensorflow/tensorboard/backend/json_util.py
deleted file mode 100644
index ab8f34a2fb98c399352a454a34405a3b88ee91ac..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/json_util.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""A module providing a function for serializing JSON values with Infinity.
-
-Python provides no way to override how json.dumps serializes
-Infinity/-Infinity/NaN; if allow_nan is true, it encodes them as
-Infinity/-Infinity/NaN, in violation of the JSON spec and in violation of what
-JSON.parse accepts. If it's false, it throws a ValueError, Neither subclassing
-JSONEncoder nor passing a function in the |default| keyword argument overrides
-this.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-
-import tensorflow as tf
-
-
-_INFINITY = float('inf')
-_NEGATIVE_INFINITY = float('-inf')
-
-
-def Cleanse(obj, encoding='utf-8'):
-  """Makes Python object appropriate for JSON serialization.
-
-  - Replaces instances of Infinity/-Infinity/NaN with strings.
-  - Turns byte strings into unicode strings.
-  - Turns sets into sorted lists.
-  - Turns tuples into lists.
-
-  Args:
-    obj: Python data structure.
-    encoding: Charset used to decode byte strings.
-
-  Returns:
-    Unicode JSON data structure.
-  """
-  if isinstance(obj, int):
-    return obj
-  elif isinstance(obj, float):
-    if obj == _INFINITY:
-      return 'Infinity'
-    elif obj == _NEGATIVE_INFINITY:
-      return '-Infinity'
-    elif math.isnan(obj):
-      return 'NaN'
-    else:
-      return obj
-  elif isinstance(obj, bytes):
-    return tf.compat.as_text(obj, encoding)
-  elif isinstance(obj, list) or isinstance(obj, tuple):
-    return [Cleanse(i, encoding) for i in obj]
-  elif isinstance(obj, set):
-    return [Cleanse(i, encoding) for i in sorted(obj)]
-  elif isinstance(obj, dict):
-    return {Cleanse(k, encoding): Cleanse(v, encoding) for k, v in obj.items()}
-  else:
-    return obj
diff --git a/tensorflow/tensorboard/backend/json_util_test.py b/tensorflow/tensorboard/backend/json_util_test.py
deleted file mode 100644
index 22e815564e49aedc2b657fe4074ef0c32b131464..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/json_util_test.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-
-from tensorflow.tensorboard.backend import json_util
-
-_INFINITY = float('inf')
-
-
-class FloatWrapperTest(tf.test.TestCase):
-
-  def _assertWrapsAs(self, to_wrap, expected):
-    """Asserts that |to_wrap| becomes |expected| when wrapped."""
-    actual = json_util.Cleanse(to_wrap)
-    for a, e in zip(actual, expected):
-      self.assertEqual(e, a)
-
-  def testWrapsPrimitives(self):
-    self._assertWrapsAs(_INFINITY, 'Infinity')
-    self._assertWrapsAs(-_INFINITY, '-Infinity')
-    self._assertWrapsAs(float('nan'), 'NaN')
-
-  def testWrapsObjectValues(self):
-    self._assertWrapsAs({'x': _INFINITY}, {'x': 'Infinity'})
-
-  def testWrapsObjectKeys(self):
-    self._assertWrapsAs({_INFINITY: 'foo'}, {'Infinity': 'foo'})
-
-  def testWrapsInListsAndTuples(self):
-    self._assertWrapsAs([_INFINITY], ['Infinity'])
-    # map() returns a list even if the argument is a tuple.
-    self._assertWrapsAs((_INFINITY,), ['Infinity',])
-
-  def testWrapsRecursively(self):
-    self._assertWrapsAs({'x': [_INFINITY]}, {'x': ['Infinity']})
-
-  def testTuple_turnsIntoList(self):
-    self.assertEqual(json_util.Cleanse(('a', 'b')), ['a', 'b'])
-
-  def testSet_turnsIntoSortedList(self):
-    self.assertEqual(json_util.Cleanse(set(['b', 'a'])), ['a', 'b'])
-
-  def testByteString_turnsIntoUnicodeString(self):
-    self.assertEqual(json_util.Cleanse(b'\xc2\xa3'), u'\u00a3')  # is # sterling
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/tensorboard/backend/process_graph.py b/tensorflow/tensorboard/backend/process_graph.py
deleted file mode 100644
index 2b314d79cb16a6c1309dd036e96c6d3e2e765022..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/backend/process_graph.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Graph post-processing logic. Used by both TensorBoard and mldash."""
-
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-
-import tensorflow as tf
-
-
-def prepare_graph_for_ui(graph, limit_attr_size=1024,
-                         large_attrs_key='_too_large_attrs'):
-  """Prepares (modifies in-place) the graph to be served to the front-end.
-
-  For now, it supports filtering out attributes that are
-  too large to be shown in the graph UI.
-
-  Args:
-    graph: The GraphDef proto message.
-    limit_attr_size: Maximum allowed size in bytes, before the attribute
-        is considered large. Default is 1024 (1KB). Must be > 0 or None.
-        If None, there will be no filtering.
-    large_attrs_key: The attribute key that will be used for storing attributes
-        that are too large. Default is '_too_large_attrs'. Must be != None if
-        `limit_attr_size` is != None.
-
-  Raises:
-    ValueError: If `large_attrs_key is None` while `limit_attr_size != None`.
-    ValueError: If `limit_attr_size` is defined, but <= 0.
-  """
-  # Check input for validity.
-  if limit_attr_size is not None:
-    if large_attrs_key is None:
-      raise ValueError('large_attrs_key must be != None when limit_attr_size'
-                       '!= None.')
-
-    if limit_attr_size <= 0:
-      raise ValueError('limit_attr_size must be > 0, but is %d' %
-                       limit_attr_size)
-
-  # Filter only if a limit size is defined.
-  if limit_attr_size is not None:
-    for node in graph.node:
-      # Go through all the attributes and filter out ones bigger than the
-      # limit.
-      keys = list(node.attr.keys())
-      for key in keys:
-        size = node.attr[key].ByteSize()
-        if size > limit_attr_size or size < 0:
-          del node.attr[key]
-          # Add the attribute key to the list of "too large" attributes.
-          # This is used in the info card in the graph UI to show the user
-          # that some attributes are too large to be shown.
-          node.attr[large_attrs_key].list.s.append(tf.compat.as_bytes(key))
diff --git a/tensorflow/tensorboard/components/BUILD b/tensorflow/tensorboard/components/BUILD
deleted file mode 100644
index 2d7613dbfdcb26cb21b231db3dc363cc0daec802..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/BUILD
+++ /dev/null
@@ -1,47 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-load("//tensorflow/tensorboard/defs:vulcanize.bzl", "tensorboard_html_binary")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tensorboard",
-    srcs = [
-        "analytics.html",
-        "tensorboard.html",
-    ],
-    path = "/",
-    deps = ["//tensorflow/tensorboard/components/tf_tensorboard"],
-)
-
-tensorboard_html_binary(
-    name = "index",
-    input_path = "/tensorboard.html",
-    output_path = "/index.html",
-    deps = [":tensorboard"],
-)
-
-ts_web_library(
-    name = "trace_viewer",
-    srcs = [
-        "trace_viewer.html",
-    ],
-    path = "/",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_trace_viewer",
-    ],
-)
-
-tensorboard_html_binary(
-    name = "trace_viewer_index",
-    input_path = "/trace_viewer.html",
-    output_path = "/trace_viewer_index.html",
-    deps = [":trace_viewer"],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/analytics.html b/tensorflow/tensorboard/components/analytics.html
deleted file mode 100644
index d319f576fc1e58296f52e006e7dfc6dda9d191b4..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/analytics.html
+++ /dev/null
@@ -1,18 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<!-- TODO(jart): Give users the ability to opt-in to analytics. -->
diff --git a/tensorflow/tensorboard/components/tensorboard.html b/tensorflow/tensorboard/components/tensorboard.html
deleted file mode 100644
index afaf396614fb06e67630d41c8c566ca8374b5210..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tensorboard.html
+++ /dev/null
@@ -1,26 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<meta charset="utf-8">
-<title>TensorBoard</title>
-<link rel="shortcut icon" href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMQAAADECAMAAAD3eH5ZAAAABGdBTUEAALGPC/xhBQAAAAFzUkdCAK7OHOkAAAD/UExURfFlKfaELvFmKfNyK/67NvWALf68Nv69NvNxK/20NfyyNP22NfN0K/JrKvqhMv2zNf25Nf24Nf23NfeOL/yzNPyvNPJoKviWMPmeMfN1K/WBLfePL/FnKfeML/qlMvR7LPmcMfeLL/aJLvR5LPFoKfJuKvR3LP66NvywNPeNL/V/LfaILv21Nf26NfNzK/NvK/R6LPmaMfyxNPqfMvV+LfurM/iSMPmbMfJvKvmdMfumM/qiMvmZMfytNPJqKvysNPN2K/iYMPNwK/upM/JtKvJsKviVMPaHLvaGLvJpKvR8LPaKLvqkMvuqM/aFLvR4LPuoM/iTMPWDLfiRMPmYMXS0ngkAAALoSURBVHja7drnctpAFIbhFUISSKJ3MKYa0+y4xTW9937/15JkJhlTjhrSrHRmvuf/as6L0YLFCgEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMBJ6njenqspzgnPrsrGX9Zpi2tCrmnc6+dYNthVY5WpMmxQLWPdMsOuYVwzNj3ei2t3mQwaV43BJPDCS2NbJ5aEeuX/+9qcjQOtfFIkIkrvY2g4MVcmOBsFWbowKO/kNyj62gRpJcDaPBlxLr1B0zdG0C/8LzbJiJrshuvy1gzlA9+rD8mIkuyIJjFE3/dqnYwoSm7IUEPoD/wut8iIguSIDjlFxe/yfXL5vuSI21BTZLLhXoOILMO8Hxwa/L8bI0LfmUdhGowb2ZvT0e57pFNDgB06IlVyjmmIBl2T/nl9Rw6SD9GgSG/Q0uQkaW3XhmovKQ3eFQ4N2Uo9OQ1eFZsNerf7vP+rO4rhmY1Lg3vFVoP8+8BXg1sFnwbnCk4NThW8GuiKBDdkVVtTNFvNelVsNqTbyWnIOM2oeTRoyWvwmpJHg/ucXBrcJuXT4DwrpwZi2vy0VCx8YtXg/D2bU4OfiuQ3eFfE2KD4bfCqiLNB993gXsGlwa2CT4NzBacGIVQ6YsipQdh0xEdODUKjIxrSp88onZ8zbbFLg1DoiFO5BXvDGv2My9/JhUT8JUZTI0yDaNHLBzIbvqTDNYhUiVw/kdjQ1kM2CHFDPjKW+KzyRTF0g/ga9w9y+fANQpxvX8CU+Ny7FUWDeF3Y+g3lROIf4k0UDX9eCyvO531PyYhHga9zvPZJU5b73Y/eXj8Hv9D48n6HaF5LbcjRt8TZTtda5M1DfXnbkX1C0SHCFKzQB5Fe8op4GNGNHavvZESbVwT5r6W1xyuCPBY3Y9YgDqzknH/e3YfNzzuL30l0IebrZ5kKtuDIXt1n868ET6kf3/49tLvrCcZyF8Pu215dAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAcPIbNrBhOaBXucoAAAAASUVORK5CYII=">
-<link rel="import" href="tf-tensorboard/style.html">
-<link rel="import" href="tf-tensorboard/tf-tensorboard.html">
-<link rel="import" href="analytics.html">
-<body>
-<tf-tensorboard use-hash></tf-tensorboard>
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/BUILD b/tensorflow/tensorboard/components/tf_audio_dashboard/BUILD
deleted file mode 100644
index 3bc754063c75923fabe0d303323769b2b80f9d47..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/BUILD
+++ /dev/null
@@ -1,50 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_audio_dashboard",
-    srcs = [
-        "tf-audio-dashboard.html",
-        "tf-audio-grid.html",
-        "tf-audio-loader.html",
-    ],
-    path = "/tf-audio-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "@org_polymer_paper_icon_button",
-        "@org_polymer_paper_slider",
-        "@org_polymer_paper_spinner",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-ts_web_library(
-    name = "index",
-    srcs = [
-        "demo/index.html",
-        "index.html",
-    ],
-    path = "/tf-audio-dashboard",
-    deps = [
-        ":tf_audio_dashboard",
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "//tensorflow/tensorboard/demo:demo_data",
-        "@org_polymer_iron_component_page",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/demo/index.html b/tensorflow/tensorboard/components/tf_audio_dashboard/demo/index.html
deleted file mode 100644
index a1d7e968e8fcbb2a06c665685b086a04a126770c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/demo/index.html
+++ /dev/null
@@ -1,67 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-<link rel="import" href="../../paper-styles/typography.html">
-<link rel="import" href="../tf-audio-dashboard.html">
-
-<title>Audio Dashboard Demo</title>
-<style>
-  #container {
-    height: 300px;
-    width: 100%;
-  }
-
-  html, body {
-    margin: 0;
-    padding: 0;
-    font-family: "RobotoDraft","Roboto",sans-serif;
-  }
-
-</style>
-<demo-snippet>
-  <template>
-    <dom-module id="audio-dash-demo">
-      <template>
-        <tf-audio-dashboard id="demo" backend="[[backend]]"></tf-audio-dashboard>
-      </template>
-      <script>
-        import {Backend} from '../tf-backend/backend';
-        import {createRouter, setRouter} from '../tf-backend/router';
-
-        Polymer({
-          is: "audio-dash-demo",
-          properties: {
-            backend: {
-              type: Object,
-              value: function() {
-                return new Backend();
-              },
-            },
-          },
-          created: function() {
-            var router = createRouter("/data", true);
-            setRouter(router);
-          },
-        });
-      </script>
-    </dom-module>
-    <audio-dash-demo id="container"></audio-dash-demo>
-  </template>
-</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/index.html b/tensorflow/tensorboard/components/tf_audio_dashboard/index.html
deleted file mode 100644
index 157f16926580a24ac5ac38f9c1b8b4cd871ab71a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/index.html
+++ /dev/null
@@ -1,25 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<meta charset="utf-8">
-<title>tf-audio-dashboard</title>
-<meta name="viewport" content="width=device-width, initial-scale=1.0">
-<script src="../webcomponentsjs/webcomponents-lite.js"></script>
-<link rel="import" href="../iron-component-page/iron-component-page.html">
-<body>
-<iron-component-page src="tf-audio-dashboard.html"></iron-component-page>
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/test/BUILD b/tensorflow/tensorboard/components/tf_audio_dashboard/test/BUILD
deleted file mode 100644
index 3d50e5d2caaa02959206bebde7bb5a7ce1917b42..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/test/BUILD
+++ /dev/null
@@ -1,33 +0,0 @@
-package(
-    default_testonly = True,
-    default_visibility = ["//tensorflow/tensorboard:internal"],
-)
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "test",
-    srcs = [
-        "audioDashboardTests.ts",
-        "tests.html",
-    ],
-    path = "/tf-audio-dashboard/test",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_audio_dashboard",
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "//tensorflow/tensorboard/components/tf_imports:web_component_tester",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "//tensorflow/tensorboard/demo:demo_data",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    testonly = 0,
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/test/audioDashboardTests.ts b/tensorflow/tensorboard/components/tf_audio_dashboard/test/audioDashboardTests.ts
deleted file mode 100644
index 6ccd9bede666d2bb111977ae91314c46fccaf0b3..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/test/audioDashboardTests.ts
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import * as backend_backend from '../../tf-backend/backend';
-import {createRouter, setRouter} from '../../tf-backend/router';
-
-// TODO(dandelion): Fix me.
-declare function fixture(id: string): any;
-declare function stub(x, y: any): void;
-
-describe('audio dashboard tests', () => {
-  let audioDash;
-  let reloadCount = 0;
-  beforeEach(() => {
-    audioDash = fixture('testElementFixture');
-    const router = createRouter('/data', true);
-    setRouter(router);
-    const backend = new backend_backend.Backend();
-    audioDash.backend = backend;
-    stub('tf-audio-loader', {
-      reload: () => { reloadCount++; },
-    });
-  });
-
-  it('calling reload on dashboard reloads the audio-loaders', (done) => {
-    audioDash.backendReload().then(() => {
-      reloadCount = 0;
-      const loaders =
-          [].slice.call(audioDash.getElementsByTagName('tf-audio-loader'));
-      audioDash.frontendReload();
-      setTimeout(() => {
-        chai.assert.isTrue(reloadCount >= 2);
-        done();
-      });
-    });
-  });
-});
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/test/tests.html b/tensorflow/tensorboard/components/tf_audio_dashboard/test/tests.html
deleted file mode 100644
index 891e8bf0c29f5cca7a4654b49dde81997c6d27d5..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/test/tests.html
+++ /dev/null
@@ -1,38 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<script src="../../web-component-tester/browser.js"></script>
-<link rel="import" href="../../tf-imports/d3.html">
-<link rel="import" href="../tf-audio-dashboard.html">
-<style>
-  html, body {
-    margin: 0;
-    padding: 0;
-    height: 100%;
-    font-family: "RobotoDraft","Roboto",sans-serif;
-  }
-</style>
-
-<test-fixture id="testElementFixture">
-  <template>
-    <tf-audio-dashboard></tf-audio-dashboard>
-  </template>
-</test-fixture>
-
-<script src="audioDashboardTests.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-dashboard.html b/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-dashboard.html
deleted file mode 100644
index 7caea7130d00dd2b221ebe54fc4f8e7dc1450779..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-dashboard.html
+++ /dev/null
@@ -1,94 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-icon-button/paper-icon-button.html">
-<link rel="import" href="tf-audio-grid.html">
-<link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
-<link rel="import" href="../tf-backend/tf-backend.html">
-
-<!--
-tf-audio-dashboard displays a dashboard that loads audio from a TensorFlow run.
-
-@element tf-audio-dashboard
-@demo demo/index.html
--->
-<dom-module id="tf-audio-dashboard">
-  <template>
-    <div class="center">
-      <tf-no-data-warning
-        data-type="audio"
-        show-warning="[[dataNotFound]]"
-      ></tf-no-data-warning>
-      <tf-audio-grid
-        id="audioGrid"
-        run-to-audio="[[run2tag]]"
-        audio-generator="[[dataProvider]]"
-        tags="[[tags]]"
-        runs="[[runs]]"
-      ></tf-audio-grid>
-    </div>
-
-    <style>
-      .center {
-        height: 100%;
-        width: 100%;
-        -webkit-box-sizing: border-box;
-        -moz-box-sizing: border-box;
-        box-sizing: border-box;
-      }
-      :host {
-        height: 100%;
-        display: block;
-      }
-
-    </style>
-  </template>
-  <script>
-    import {DashboardBehavior} from "../tf-dashboard-common/dashboard-behavior";
-    import {ReloadBehavior} from "../tf-dashboard-common/reload-behavior";
-    import {BackendBehavior} from "../tf-backend/behavior";
-
-    Polymer({
-      is: "tf-audio-dashboard",
-      factoryImpl: function(backend) {
-        this.backend = backend;
-      },
-      properties: {
-        dataType: {
-          type: Object,
-          value: "audio",
-        },
-      },
-      behaviors: [
-        DashboardBehavior("audio"),
-        ReloadBehavior("tf-audio-loader"),
-        BackendBehavior,
-      ],
-      attached: function() {
-        this.async(function() {
-          this.fire("rendered");
-        });
-      },
-      _hasAudio: function(runToAudioChange) {
-        return _.values(runToAudioChange.base).some(function(arr) {
-          return arr.length > 0;
-        });
-      },
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-grid.html b/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-grid.html
deleted file mode 100644
index c71d8bdd4bf918ad3877d6ecae8394d131f007f8..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-grid.html
+++ /dev/null
@@ -1,183 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-styles/paper-styles.html">
-<link rel="import" href="tf-audio-loader.html">
-<link rel="import" href="../tf-imports/lodash.html">
-<link rel="import" href="../tf-dashboard-common/scrollbar-style.html">
-
-<!--
-tf-audio-grid creates a grid for examining audio data. The columns correspond
-to runs and the rows correspond to tags. Each cell is an audio clip.
-
-Structurally, it makes extensive use of flexbox for layout: it has a top-level
-columnar flexbox that contains the topRow (run names) and then a
-bottomContainer. The bottomContainer is another columnar flexbox which contains
-repeated audio-rows. Each audio-row is a row flexbox which contains a tag name
-cell, and then audio cells.
-
-In the future, we should improve on the layout by making the tag names and run names have fixed positions
-within the audio-grid, so that when you scroll you always have context (e.g. row and column names in a spreadsheet).
-For now, it just scrolls.
-
-The audio grid provides internal scroll bars (with styling) so that it can be dropped into
-a dashboard in a predictable fashion, even though the internal audio grid may be enormous.
-
-Room for future improvement:
-
-- Make it obvious when an audio didn't load due to the audio not existing.
-- Find some way to collapse sparse audio grids into denser ones (when sparsity
-is high)
-- Fix column/row names
-- Include hook for loading past audio (by step/timestamp? or index?)
-
-@element tf-audio-grid
--->
-<dom-module id="tf-audio-grid">
-  <template>
-    <style include="scrollbar-style"></style>
-    <div id="fullContainer" class="container scrollbar">
-      <div id="topRow" class="container">
-        <div class="noshrink" id="paddingCell"></div>
-        <template is="dom-if" if="[[_tagsExist(tags)]]">
-          <template
-            is="dom-repeat"
-            items="[[runs]]"
-            as="run"
-          >
-            <div class="run-name-cell noshrink">
-              <span>[[run]]</span>
-            </div>
-          </template>
-        </template>
-      </div>
-      <div id="bottomContainer" class="container">
-        <template
-          is="dom-repeat"
-          items="[[tags]]"
-          as="tag"
-        >
-          <div class="audio-row container noshrink">
-            <div class="tag-name-cell noshrink">
-              <span class="tag-name">[[tag]]</span>
-            </div>
-            <template
-              is="dom-repeat"
-              items="[[runs]]"
-              as="run"
-            >
-              <div class="audio-cell noshrink">
-                <template is="dom-if" if="[[_exists(run, tag, runToAudio.*)]]">
-                  <tf-audio-loader
-                    id="loader"
-                    run="[[run]]"
-                    tag="[[tag]]"
-                    audio-generator="[[audioGenerator]]"
-                  >
-                  </tf-audio-loader>
-                </template>
-              </div>
-            </template>
-          </div>
-        </template>
-      </div>
-    </div>
-    <style>
-      :host {
-        display: block;
-        height: 100%;
-        --audio-cell-min-height: 105px;
-      }
-      .container {
-        display: flex;
-        flex-wrap: nowrap;
-      }
-      #fullContainer {
-        width: 100%;
-        height: 100%;
-        flex-direction: column;
-        padding-top: 20px;
-        overflow: auto;
-        -webkit-box-sizing: border-box;
-        -moz-box-sizing: border-box;
-        box-sizing: border-box;
-      }
-      #topRow {
-        flex-direction: row;
-      }
-      #bottomContainer {
-        flex-direction: column;
-        height: 100%;
-        width: 100%;
-      }
-      .audio-row {
-        flex-direction: row;
-        padding-top: 5px;
-      }
-      .audio-cell {
-        background: #FAFAFA;
-        width: 300px;
-        min-height: var(--audio-cell-min-height);
-        border: 1px solid black;
-        margin-right: 3px;
-        padding: 10px;
-        box-sizing: border-box;
-      }
-      .tag-name-cell {
-        width: 300px;
-        height: var(--audio-cell-min-height);
-        display:flex;
-        flex-direction: column;
-        justify-content: center;
-      }
-      .tag-name {
-        word-wrap: break-word;
-        text-align: center;
-        white-space: nowrap;
-      }
-      .run-name-cell {
-        width: 300px;
-        text-align: center;
-        margin-right: 5px;
-      }
-      .noshrink {
-        flex-shrink: 0;
-      }
-      #paddingCell {
-        width: 300px;
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-audio-grid",
-      properties: {
-        runToAudio: Object,
-        tags: Array,
-        runs: Array,
-        audioGenerator: Function,
-      },
-      _tagsExist: function(tags) {
-        return tags && tags.length > 0;
-      },
-      _exists: function (run, tag) {
-        return this.runToAudio[run].indexOf(tag) !== -1;
-      },
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-loader.html b/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-loader.html
deleted file mode 100644
index 71539537d0e55efcc6c1e07ed76f79ec5699ecf4..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-loader.html
+++ /dev/null
@@ -1,237 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-slider/paper-slider.html">
-<link rel="import" href="../paper-spinner/paper-spinner-lite.html">
-<link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
-<link rel="import" href="../tf-imports/lodash.html">
-
-<!--
-tf-audio-loader loads an individual audio clip from the TensorBoard backend.
-
-Right now it always loads the most recent audio clip. We should add support in the
-future for loading older clips.
-
-@element tf-audio-loader
--->
-<dom-module id="tf-audio-loader">
-  <style>
-    :host {
-      display: block;
-      --step-slider-knob-color: #424242;
-    }
-
-    img {
-      width: 100%;
-      height: 100%;
-      image-rendering: pixelated;
-    }
-
-    .step-description {
-      font-size: 12px;
-    }
-
-    .step-value {
-      font-weight: bold;
-    }
-
-    #audio-loading-spinner {
-      width: 14px;
-      height: 14px;
-      vertical-align: text-bottom;
-      --paper-spinner-color: var(--tb-orange-strong)
-    }
-
-    #steps {
-      height: 15px;
-      margin: 0 0 0 -15px;
-      /* 31 comes from adding a padding of 15px from both sides of the paper-slider, subtracting
-       * 1px so that the slider width aligns with the image (the last slider marker takes up 1px),
-       * and adding 2px to account for a border of 1px on both sides of the image. 30 - 1 + 2. */
-      width: calc(100% + 31px);
-      --paper-slider-active-color: var(--step-slider-knob-color);
-      --paper-slider-knob-color: var(--step-slider-knob-color);
-      --paper-slider-pin-color: var(--step-slider-knob-color);
-      --paper-slider-knob-start-color: var(--step-slider-knob-color);
-      --paper-slider-knob-start-border-color: var(--step-slider-knob-color);
-      --paper-slider-pin-start-color: var(--step-slider-knob-color);
-    }
-
-    #individual-audio-container audio {
-      margin: 5px 0 0 -10px;
-      width: calc(100% + 20px);
-    }
-  </style>
-  <template>
-    <template is="dom-if" if="[[_metadatas]]">
-      <template is="dom-if" if="[[_hasAtLeastOneStep(_metadatas)]]">
-        <div class="step-description">
-          step
-          <span class="step-value">
-            [[_stepValue]]
-          </span><br>
-          <template is="dom-if" if="[[_stepWallTime]]">
-            [[_stepWallTime]]
-          </template>
-          <paper-spinner-lite active
-                              id="audio-loading-spinner"
-                              hidden$=[[!_isAudioLoading]]></paper-spinner-lite>
-        </div>
-      </template>
-      <template is="dom-if" if="[[_maxStepIndex]]">
-        <paper-slider
-            id="steps"
-            immediate-value="{{_stepIndex}}"
-            max="[[_maxStepIndex]]"
-            max-markers="[[_maxStepIndex]]"
-            snaps
-            step="1"
-            value="{{_stepIndex}}"></paper-slider>
-      </template>
-      <div id="individual-audio-container"></div>
-    </template>
-  </template>
-  <script>
-    "use strict";
-
-    Polymer({
-      is: "tf-audio-loader",
-      properties: {
-        run: String,
-        tag: String,
-        audioGenerator: Function,
-        // todo: document.
-        _metadatas: Array,
-        _stepIndex: Number,
-        _stepValue: {
-          type: Number,
-          computed: "_computeStepValue(_metadatas, _stepIndex)",
-          value: 0,
-        },
-        _stepWallTime: {
-          type: Number,
-          computed: "_computeStepWallTime(_metadatas, _stepIndex)",
-          value: 0,
-        },
-        _maxStepIndex: {
-          type: Number,
-          computed: "_computeMaxStepIndex(_metadatas)",
-          value: 0,
-        },
-        _isAudioLoading: Boolean,
-        // Used to identify stale requests for audio.
-        _audioRequestId: {
-          type: Number,
-          value: 1
-        },
-      },
-      observers: [
-        "_updateAudio(_metadatas, _stepIndex)",
-      ],
-      reload: function() {
-        this.audioGenerator(this.tag, this.run).then(function(metadatas) {
-          // Set the list of available metadata.
-          this.set("_metadatas", metadatas);
-
-          // Set the index to be the last one.
-          this.set("_stepIndex", this._maxStepIndex);
-        }.bind(this));
-      },
-      ready: function() {
-        // Need to test so that it will not error if it is constructed w/o
-        // all properties (so that it's possible to use stub to mock it out)
-        if (this.run != null && this.tag != null && this.audioGenerator != null) {
-          this.reload();
-        }
-      },
-      _updateAudio: function(metadatas, stepIndex) {
-        if (!metadatas || stepIndex >= metadatas.length) {
-          // No audio to show. The audio section should be hidden.
-          return;
-        }
-
-        // Load new audio.
-        const requestId = ++this._audioRequestId;
-        this.set("_isAudioLoading", true);
-
-        // Create a new audio element. Only replace the previous one once the new audio loads.
-        let audioElement = document.createElement("audio");
-        audioElement.setAttribute("controls", true);
-        audioElement.setAttribute("loop", "loop");
-        let canPlayHandler = function() {
-          if (requestId !== this._audioRequestId) {
-            // This request is no longer relevant.
-            return;
-          }
-
-          // Remove this event listener: "canplay" apparently fires in Chrome every time playing
-          // begins again on loop. So, if we create a new audio element every time that happens, we
-          // don't actually loop.
-          audioElement.removeEventListener("canplay", canPlayHandler);
-
-          let individualAudioContainer = this.$$("#individual-audio-container");
-          individualAudioContainer.innerHTML = "";
-          Polymer.dom(individualAudioContainer).appendChild(audioElement);
-          this.set("_isAudioLoading", false);
-        }.bind(this);
-        audioElement.addEventListener("canplay", canPlayHandler);
-        audioElement.addEventListener("error", function() {
-          if (requestId !== this._audioRequestId) {
-            // This request is no longer relevant.
-            return;
-          }
-
-          // The audio could not be loaded.
-          this.$$("#individual-audio-container").innerHTML = "";
-          this.set("_isAudioLoading", false);
-        }.bind(this));
-
-        // Initiate the request for new audio.
-        var sourceElement = document.createElement("source");
-        let metadata = metadatas[stepIndex];
-        sourceElement.setAttribute("src", metadata.url);
-        sourceElement.setAttribute("type", metadata.content_type);
-        audioElement.appendChild(sourceElement);
-      },
-      _computeStepValue: function(metadatas, stepIndex) {
-        if (!metadatas || stepIndex >= metadatas.length) {
-          // No audio to show. The audio section should be hidden.
-          return 0;
-        }
-        return metadatas[stepIndex].step;
-      },
-      _computeStepWallTime: function(metadatas, stepIndex) {
-        if (!metadatas || stepIndex >= metadatas.length) {
-          // No audio to show. The audio section should be hidden.
-          return 0;
-        }
-        return metadatas[stepIndex].wall_time.toString();
-      },
-      _computeMaxStepIndex: function(metadatas) {
-        if (!metadatas || metadatas.length === 0) {
-          // No audio to show. The audio section should be hidden.
-          return 0;
-        }
-        return metadatas.length - 1;
-      },
-      _hasAtLeastOneStep: function(metadatas) {
-        return metadatas && metadatas.length > 0;
-      },
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_backend/BUILD b/tensorflow/tensorboard/components/tf_backend/BUILD
deleted file mode 100644
index 50fc267dc4d4f0a54984e2cee7bbbdb208c32743..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/BUILD
+++ /dev/null
@@ -1,45 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_backend",
-    srcs = [
-        "backend.ts",
-        "behavior.ts",
-        "requestManager.ts",
-        "router.ts",
-        "runsStore.ts",
-        "tf-backend.html",
-        "urlPathHelpers.ts",
-    ],
-    path = "/tf-backend",
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "//tensorflow/tensorboard/components/tf_imports:plottable",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "//tensorflow/tensorboard/components/vz_sorting",
-    ],
-)
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [":tf_backend"],
-    destdir = "tf-backend",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports_google:lib",
-        "//tensorflow/tensorboard/components/vz_sorting:legacy",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_backend/backend.ts b/tensorflow/tensorboard/components/tf_backend/backend.ts
deleted file mode 100644
index 023414b6b75d5fc3bb46cb3c4832344731e2721a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/backend.ts
+++ /dev/null
@@ -1,608 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {compareTagNames} from '../vz-sorting/sorting';
-import {RequestManager} from './requestManager';
-import {getRouter} from './router';
-import {demoify, queryEncoder} from './urlPathHelpers';
-
-export interface RunEnumeration {
-  histograms: string[];
-  compressedHistogramTuples: string[];
-  scalars: string[];
-  images: string[];
-  audio: string[];
-  graph: boolean;
-  run_metadata: string[];
-}
-
-export interface LogdirResponse { logdir: string; }
-
-export interface RunsResponse { [runName: string]: RunEnumeration; }
-
-export type RunToTag = {
-  [run: string]: string[];
-};
-
-export interface Datum {
-  wall_time: Date;
-  step: number;
-}
-
-export type ScalarDatum = Datum & Scalar;
-export interface Scalar { scalar: number; }
-
-export interface Text { text: string; }
-export type TextDatum = Datum & Text;
-
-export type HistogramDatum = Datum & Histogram;
-export interface Histogram {
-  min: number;
-  max: number;
-  nItems?: number;
-  sum?: number;
-  sumSquares?: number;
-  bucketRightEdges: number[];
-  bucketCounts: number[];
-}
-
-export interface HistogramBin {
-  x: number;
-  dx: number;
-  y: number;
-}
-export type HistogramSeriesDatum = HistogramSeries & Datum;
-export interface HistogramSeries { bins: HistogramBin[]; }
-
-export type ImageDatum = Datum & Image;
-export interface Image {
-  width: number;
-  height: number;
-  url: string;
-}
-
-export type AudioDatum = Datum & Audio;
-export interface Audio {
-  content_type: string;
-  url: string;
-}
-
-// A health pill encapsulates an overview of tensor element values. The value
-// field is a list of 12 numbers that shed light on the status of the tensor.
-export interface HealthPill {
-  device_name: string;
-  node_name: string;
-  output_slot: number;
-  dtype: string;
-  shape: number[];
-  value: number[];
-}
-
-// When updating this type, keep it consistent with the HealthPill interface
-// in tf_graph_common/lib/scene/scene.ts.
-export type HealthPillDatum = Datum & HealthPill;
-// A health pill response is a mapping from node name to a list of health pill
-// data entries.
-export interface HealthPillsResponse { [key: string]: HealthPillDatum[]; }
-
-// An object that encapsulates an alert issued by the debugger. This alert is
-// sent by debugging libraries after bad values (NaN, +/- Inf) are encountered.
-export interface DebuggerNumericsAlertReport {
-  device_name: string;
-  tensor_name: string;
-  first_timestamp: number;
-  nan_event_count: number;
-  neg_inf_event_count: number;
-  pos_inf_event_count: number;
-}
-// A DebuggerNumericsAlertReportResponse contains alerts issued by the debugger
-// in ascending order of timestamp. This helps the user identify for instance
-// when bad values first appeared in the model.
-export type DebuggerNumericsAlertReportResponse = DebuggerNumericsAlertReport[];
-
-export const TYPES = [
-  'scalar', 'histogram', 'compressedHistogram', 'graph', 'image', 'audio',
-  'runMetadata', 'text'
-];
-/**
- * The Backend class provides a convenient and typed interface to the backend.
- *
- * It provides methods corresponding to the different data sources on the
- * TensorBoard backend. These methods return a promise containing the data
- * from the backend. This class does some post-processing on the data; for
- * example, converting data elements tuples into js objects so that they can
- * be accessed in a more convenient and clearly-documented fashion.
- */
-export class Backend {
-  public requestManager: RequestManager;
-
-  /**
-   * Construct a Backend instance.
-   * @param requestManager The RequestManager, overwritable so you may
-   * manually clear request queue, etc. Defaults to a new RequestManager.
-   */
-  constructor(requestManager?: RequestManager) {
-    this.requestManager = requestManager || new RequestManager();
-  }
-
-  /**
-   * Returns a promise for requesting the logdir string.
-   */
-  public logdir(): Promise<LogdirResponse> {
-    return this.requestManager.request(getRouter().logdir());
-  }
-
-  /**
-   * Returns a listing of all the available data in the TensorBoard backend.
-   */
-  public runs(): Promise<RunsResponse> {
-    return this.requestManager.request(getRouter().runs());
-  }
-
-  /**
-   * Return a promise showing the Run-to-Tag mapping for scalar data.
-   */
-  public scalarTags(): Promise<RunToTag> {
-    return this.requestManager.request(
-        getRouter().pluginRoute('scalars', '/tags'));
-  }
-
-  /**
-   * Return a promise showing the Run-to-Tag mapping for histogram data.
-   */
-  public histogramTags(): Promise<RunToTag> {
-    return this.requestManager.request(
-        getRouter().pluginRoute('histograms', '/tags'));
-  }
-
-  /**
-   * Return a promise showing the Run-to-Tag mapping for image data.
-   */
-  public imageTags(): Promise<RunToTag> {
-    return this.requestManager.request(
-        getRouter().pluginRoute('images', '/tags'));
-  }
-
-  /**
-   * Return a promise showing the Run-to-Tag mapping for audio data.
-   */
-  public audioTags(): Promise<RunToTag> {
-    return this.requestManager.request(
-        getRouter().pluginRoute('audio', '/tags'));
-  }
-
-  /**
-   * Return a promise showing the Run-to-Tag mapping for compressedHistogram
-   * data.
-   */
-  public compressedHistogramTags(): Promise<RunToTag> {
-    return this.requestManager.request(
-        getRouter().pluginRoute('distributions', '/tags'));
-  }
-
-  /**
-   * Returns a promise showing the Run-to-Tag mapping for profile data.
-   */
-  public profileTags(): Promise<RunToTag> {
-    let url = getRouter().pluginRoute('profile', '/tags');
-    if (getRouter().isDemoMode()) {
-      url += '.json';
-    }
-    return this.requestManager.request(url);
-  }
-
-  /**
-   * Return a promise showing list of runs that contain graphs.
-   */
-  public graphRuns(): Promise<string[]> {
-    return this.requestManager.request(
-        getRouter().pluginRoute('graphs', '/runs'));
-  }
-
-  /**
-   * Return a promise showing the Run-to-Tag mapping for run_metadata objects.
-   */
-  public runMetadataTags(): Promise<RunToTag> {
-    return this.requestManager.request(
-        getRouter().pluginRoute('graphs', '/run_metadata_tags'));
-  }
-
-
-  /**
-   * Returns a promise showing the Run-to-Tag mapping for text data.
-   */
-  public textRuns(): Promise<RunToTag> {
-    return this.requestManager.request(getRouter().textRuns());
-  }
-
-
-  /**
-   * Returns a promise containing TextDatums for given run and tag.
-   */
-  public text(tag: string, run: string): Promise<TextDatum[]> {
-    const url = getRouter().text(tag, run);
-    // tslint:disable-next-line:no-any it's convenient and harmless here
-    return this.requestManager.request(url).then(map((x: any) => {
-      x.wall_time = timeToDate(x.wall_time);
-      return x;
-    }));
-  }
-
-  /**
-   * Return a URL to fetch a graph (cf. method 'graph').
-   */
-  public graphUrl(run: string, limitAttrSize?: number, largeAttrsKey?: string):
-      string {
-    const demoMode = getRouter().isDemoMode();
-    const base = getRouter().pluginRoute('graphs', '/graph');
-    const optional = (p) => (p != null && !demoMode || undefined) && p;
-    const parameters = {
-      'run': run,
-      'limit_attr_size': optional(limitAttrSize),
-      'large_attrs_key': optional(largeAttrsKey),
-    };
-    const extension = demoMode ? '.pbtxt' : '';
-    return base + queryEncoder(parameters) + extension;
-  }
-
-  public graph(run: string, limitAttrSize?: number, largeAttrsKey?: string):
-      Promise<string> {
-    const url = this.graphUrl(run, limitAttrSize, largeAttrsKey);
-    return this.requestManager.request(url);
-  }
-
-  /**
-   * Return a promise containing ScalarDatums for given run and tag.
-   */
-  public scalar(tag: string, run: string): Promise<Array<ScalarDatum>> {
-    let p: Promise<TupleData<number>[]>;
-    const url = getRouter().pluginRunTagRoute('scalars', '/scalars')(tag, run);
-    p = this.requestManager.request(url);
-    return p.then(map(detupler(createScalar)));
-  }
-
-  /**
-   * Returns a promise for requesting the health pills for a list of nodes. This
-   * route is used by the debugger plugin.
-   */
-  public healthPills(nodeNames: string[], step?: number):
-      Promise<HealthPillsResponse> {
-    const postData = {
-      'node_names': JSON.stringify(nodeNames),
-
-      // Events files with debugger data fall under this special run.
-      'run': '__debugger_data__',
-    };
-    if (step !== undefined) {
-      // The user requested health pills for a specific step. This request
-      // might be slow since the backend reads events sequentially from disk.
-      postData['step'] = step;
-    }
-    return this.requestManager.request(getRouter().healthPills(), postData);
-  }
-
-  /**
-   * Returns a promise for alerts for bad values (detected by the debugger).
-   * This route is used by the debugger plugin.
-   */
-  public debuggerNumericsAlerts():
-      Promise<DebuggerNumericsAlertReportResponse> {
-    return this.requestManager.request(
-        getRouter().pluginRoute('debugger', '/numerics_alert_report'));
-  }
-
-  /**
-   * Return a promise containing HistogramDatums for given run and tag.
-   */
-  public histogram(tag: string, run: string):
-      Promise<Array<HistogramSeriesDatum>> {
-    let p: Promise<TupleData<HistogramTuple>[]>;
-    const url =
-        getRouter().pluginRunTagRoute('histograms', '/histograms')(tag, run);
-    p = this.requestManager.request(url);
-    return p.then(map(detupler(createHistogram))).then(function(histos) {
-      // Get the minimum and maximum values across all histograms so that the
-      // visualization is aligned for all timesteps.
-      const min = d3.min(histos, d => d.min);
-      const max = d3.max(histos, d => d.max);
-
-      return histos.map(function(histo, i) {
-        return {
-          wall_time: histo.wall_time,
-          step: histo.step,
-          bins: convertBins(histo, min, max)
-        };
-      });
-    });
-  }
-
-  /**
-   * Return a promise containing ImageDatums for given run and tag.
-   */
-  public image(tag: string, run: string): Promise<Array<ImageDatum>> {
-    const url = (getRouter().pluginRunTagRoute('images', '/images')(tag, run));
-    let p: Promise<ImageMetadata[]>;
-    p = this.requestManager.request(url);
-    return p.then(map(this.createImage.bind(this)));
-  }
-
-  /**
-   * Return a promise containing AudioDatums for given run and tag.
-   */
-  public audio(tag: string, run: string): Promise<Array<AudioDatum>> {
-    const url = (getRouter().pluginRunTagRoute('audio', '/audio')(tag, run));
-    let p: Promise<AudioMetadata[]>;
-    p = this.requestManager.request(url);
-    return p.then(map(this.createAudio.bind(this)));
-  }
-
-  /**
-   * Returns a promise containing profile data for given run and tag.
-   */
-  public profile(tag: string, run: string): Promise<string> {
-    let url = (getRouter().pluginRunTagRoute('profile', '/data')(tag, run));
-    if (getRouter().isDemoMode()) {
-      url += '.json';
-    }
-    return this.requestManager.request(url);
-  }
-
-  /**
-   * Returns the url for the RunMetadata for the given run/tag.
-   */
-  public runMetadataUrl(tag: string, run: string): string {
-    return getRouter().pluginRunTagRoute('graphs', '/run_metadata')(tag, run);
-  }
-
-  /**
-   * Returns a promise to load the string RunMetadata for given run/tag.
-   */
-  public runMetadata(tag: string, run: string): Promise<string> {
-    const url = this.runMetadataUrl(tag, run);
-    return this.requestManager.request(url);
-  }
-
-  /**
-   * Get compressedHistogram data.
-   * Unlike other methods, don't bother reprocessing this data into a nicer
-   * format. This is because we will deprecate this route.
-   */
-  private compressedHistogram(tag: string, run: string):
-      Promise<Array<Datum&CompressedHistogramTuple>> {
-    const url = (getRouter().pluginRunTagRoute(
-        'distributions', '/distributions')(tag, run));
-    let p: Promise<TupleData<CompressedHistogramTuple>[]>;
-    p = this.requestManager.request(url);
-    return p.then(map(detupler((x) => x)));
-  }
-
-  private createImage(x: ImageMetadata): Image&Datum {
-    const pluginRoute = getRouter().pluginRoute('images', '/individualImage');
-
-    let query = x.query;
-    if (pluginRoute.indexOf('?') > -1) {
-      // The route already has GET parameters. Append our parameters to them.
-      query = '&' + query;
-    } else {
-      // The route lacks GET parameters. We append them.
-      query = '?' + query;
-    }
-
-    if (getRouter().isDemoMode()) {
-      query = demoify(query);
-    }
-
-    let individualImageUrl = pluginRoute + query;
-    // Include wall_time just to disambiguate the URL and force the browser
-    // to reload the image when the URL changes. The backend doesn't care
-    // about the value.
-    individualImageUrl +=
-        getRouter().isDemoMode() ? '.png' : '&ts=' + x.wall_time;
-
-    return {
-      width: x.width,
-      height: x.height,
-      wall_time: timeToDate(x.wall_time),
-      step: x.step,
-      url: individualImageUrl,
-    };
-  }
-
-  private createAudio(x: AudioMetadata): Audio&Datum {
-    const pluginRoute = getRouter().pluginRoute('audio', '/individualAudio');
-
-    let query = x.query;
-    if (pluginRoute.indexOf('?') > -1) {
-      // The route already has GET parameters. Append our parameters to them.
-      query = '&' + query;
-    } else {
-      // The route lacks GET parameters. We append them.
-      query = '?' + query;
-    }
-
-    if (getRouter().isDemoMode()) {
-      query = demoify(query);
-    }
-
-    let individualAudioUrl = pluginRoute + query;
-    // Include wall_time just to disambiguate the URL and force the browser
-    // to reload the audio when the URL changes. The backend doesn't care
-    // about the value.
-    individualAudioUrl +=
-        getRouter().isDemoMode() ? '.wav' : '&ts=' + x.wall_time;
-
-    return {
-      content_type: x.content_type,
-      wall_time: timeToDate(x.wall_time),
-      step: x.step,
-      url: individualAudioUrl,
-    };
-  }
-}
-
-/** Given a RunToTag, return sorted array of all runs */
-export function getRuns(r: RunToTag): string[] {
-  return _.keys(r).sort(compareTagNames);
-}
-
-/** Given a RunToTag, return array of all tags (sorted + dedup'd) */
-export function getTags(r: RunToTag): string[] {
-  return _.union.apply(null, _.values(r)).sort(compareTagNames);
-}
-
-/**
- * Given a RunToTag and an array of runs, return every tag that appears for
- * at least one run.
- * Sorted, deduplicated.
- */
-export function filterTags(r: RunToTag, runs: string[]): string[] {
-  let result = [];
-  runs.forEach((x) => result = result.concat(r[x]));
-  return _.uniq(result).sort(compareTagNames);
-}
-
-function timeToDate(x: number): Date {
-  return new Date(x * 1000);
-};
-
-/**  Just a curryable map to make things cute and tidy. */
-function map<T, U>(f: (x: T) => U): (arr: T[]) => U[] {
-  return function(arr: T[]): U[] {
-    return arr.map(f);
-  };
-};
-
-/**
- * This is a higher order function that takes a function that transforms a
- * T into a G, and returns a function that takes TupleData<T>s and converts
- * them into the intersection of a G and a Datum.
- */
-function detupler<T, G>(xform: (x: T) => G): (t: TupleData<T>) => Datum & G {
-  return function(x: TupleData<T>): Datum & G {
-    // Create a G, assert it has type <G & Datum>
-    let obj = <G&Datum>xform(x[2]);
-    // ... patch in the properties of datum
-    obj.wall_time = timeToDate(x[0]);
-    obj.step = x[1];
-    return obj;
-  };
-};
-
-function createScalar(x: number): Scalar {
-  return {scalar: x};
-}
-
-function createHistogram(x: HistogramTuple): Histogram {
-  return {
-    min: x[0],
-    max: x[1],
-    nItems: x[2],
-    sum: x[3],
-    sumSquares: x[4],
-    bucketRightEdges: x[5],
-    bucketCounts: x[6],
-  };
-}
-
-/**
- * Takes histogram data as stored by tensorboard backend and converts it to
- * the standard d3 histogram data format to make it more compatible and easier
- * to visualize. When visualizing histograms, having the left edge and width
- * makes things quite a bit easier. The bins are also converted to have an
- * uniform width, what makes the visualization easier to understand.
- *
- * @param histogram A histogram from tensorboard backend.
- * @param min The leftmost edge. The binning will start on it.
- * @param max The rightmost edge. The binning will end on it.
- * @param numBins The number of bins of the converted data. The default of 30
- * is a sensible default, using more starts to get artifacts because the event
- * data is stored in buckets, and you start being able to see the aliased
- * borders between each bucket.
- * @return A histogram bin. Each bin has an x (left edge), a dx (width),
- *     and a y (count).
- *
- * If given rightedges are inclusive, then these left edges (x) are exclusive.
- */
-export function convertBins(
-    histogram: Histogram, min: number, max: number, numBins = 30) {
-  if (histogram.bucketRightEdges.length !== histogram.bucketCounts.length) {
-    throw(new Error('Edges and counts are of different lengths.'));
-  }
-
-  if (max === min) {
-    // Create bins even if all the data has a single value.
-    max = min * 1.1 + 1;
-    min = min / 1.1 - 1;
-  }
-  const binWidth = (max - min) / numBins;
-  let bucketLeft = min;  // Use the min as the starting point for the bins.
-  let bucketPos = 0;
-  return d3.range(min, max, binWidth).map((binLeft) => {
-    const binRight = binLeft + binWidth;
-
-    // Take the count of each existing bucket, multiply it by the proportion
-    // of overlap with the new bin, then sum and store as the count for the
-    // new bin. If no overlap, will add to zero, if 100% overlap, will include
-    // the full count into new bin.
-    let binY = 0;
-    while (bucketPos < histogram.bucketRightEdges.length) {
-      // Clip the right edge because right-most edge can be infinite-sized.
-      const bucketRight = Math.min(max, histogram.bucketRightEdges[bucketPos]);
-
-      const intersect =
-          Math.min(bucketRight, binRight) - Math.max(bucketLeft, binLeft);
-      const count = (intersect / (bucketRight - bucketLeft)) *
-          histogram.bucketCounts[bucketPos];
-
-      binY += intersect > 0 ? count : 0;
-
-      // If bucketRight is bigger than binRight, than this bin is finished and
-      // there is data for the next bin, so don't increment bucketPos.
-      if (bucketRight > binRight) {
-        break;
-      }
-      bucketLeft = Math.max(min, bucketRight);
-      bucketPos++;
-    }
-
-    return {x: binLeft, dx: binWidth, y: binY};
-  });
-}
-
-/**
- * The following interfaces (TupleData, HistogramTuple,
- * CompressedHistogramTuple, ImageMetadata, and AudioMetadata) describe how
- * the data is sent over from the backend.
- */
-type TupleData<T> = [number, number, T];  // wall_time, step
-
-// Min, Max, nItems, Sum, Sum_Squares, right edges of buckets, nItems in
-// buckets
-type HistogramTuple =
-    [number, number, number, number, number, number[], number[]];
-type CompressedHistogramTuple = [number, number][];  // percentile, value
-interface ImageMetadata {
-  width: number;
-  height: number;
-  wall_time: number;
-  step: number;
-  query: string;
-}
-interface AudioMetadata {
-  content_type: string;
-  wall_time: number;
-  step: number;
-  query: string;
-}
diff --git a/tensorflow/tensorboard/components/tf_backend/behavior.ts b/tensorflow/tensorboard/components/tf_backend/behavior.ts
deleted file mode 100644
index 8df791eface0681d6db78d88e89dda588de51458..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/behavior.ts
+++ /dev/null
@@ -1,148 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-import {getRuns, getTags, TYPES} from './backend';
-
-/** @polymerBehavior */
-export const BackendBehavior = {
-  properties: {
-    /** *** Required properties *** */
-    /** Data type. One of Backend.TYPES */
-    dataType: {
-      type: String,
-      observer: '_throwErrorOnUnrecognizedType',
-    },
-
-    /** Backend for data loading. */
-    backend: {
-      type: Object,
-    },
-
-    /** Should it automatically load when configured ready? Default true. */
-    autoLoad: {
-      type: Boolean,
-      value: true,
-    },
-
-    /** *** Component-provided properties *** */
-    /** Every tag available for data type (sorted, dedpulicated) */
-    tags: {
-      type: Array,
-      readOnly: true,
-      notify: true,
-    },
-
-    /** Every run available for data type (sorted) */
-    runs: {
-      type: Array,
-      readOnly: true,
-      notify: true,
-    },
-
-    /** Mapping from runs to tags for the data type */
-    run2tag: {
-      type: Object,
-      readOnly: true,
-      notify: true,
-    },
-
-    /** Promise provider for the data. Useful for passing to subcomponents */
-    dataProvider:
-        {type: Function, computed: '_getDataProvider(dataType, backend)'},
-
-    /** Has the dashboard loaded yet? */
-    loadState: {
-      type: String,
-      value: 'noload',  // [noload, pending, loaded, failure]
-      readOnly: true,
-    },
-
-    /**
-     * True if dashboard has loaded, and no tags were found.
-     * Persists through subsequent reloads (ie. still true while
-     * next load is pending) so warning won't flash away every reload
-     * when there is no data.
-     */
-    dataNotFound: {
-      type: Boolean,
-      value: false,
-      readOnly: true,
-    }
-
-  },
-  observers: ['_do_autoLoad(dataType, backend, autoLoad)'],
-  /**
-   * Reloading works in two steps:
-   * Backend reload, which gets metadata on available runs, tags, etc from
-   *   the backend.
-   * Frontend reload, which loads new data for each chart or visual display.
-   * Backend reload logic is provided by this behavior. The frontend reload
-   *   logic should be provided elsewhere, since it is component-specific.
-   * To keep things simple and consistent, we do the backend reload first,
-   *   and the frontend reload afterwards.
-   */
-  reload() {
-    return this.backendReload().then((x) => {
-      return this.frontendReload();
-    });
-  },
-  /**
-   * Load data from backend and then set run2tag, tags, runs, and loadState.
-   * Returns a promise that resolves/rejects when data is loaded.
-   */
-  backendReload() {
-    if (this.dataType == null) {
-      throw new Error('BackendBehavior: Need a dataType to reload.');
-    }
-    if (this.backend == null) {
-      throw new Error('BackendBehavior: Need a backend to reload.');
-    }
-    const runsRoute = (this.backend[this.dataType + 'Runs'] ||
-                       this.backend[this.dataType + 'Tags'])
-                          .bind(this.backend);
-    this._setLoadState('pending');
-    return runsRoute().then(
-        (x) => {
-          this._setLoadState('loaded');
-          if (_.isEqual(x, this.run2tag)) {
-            // If x and run2tag are equal, let's avoid updating everything
-            // since that can needlessly trigger run changes, reloads, etc
-            return x;
-          }
-          this._setRun2tag(x);
-          const tags = getTags(x);
-          this._setDataNotFound(tags.length === 0);
-          this._setTags(tags);
-          this._setRuns(getRuns(x));
-          return x;
-        },
-        (fail) => {
-          this._setLoadState('failure');
-          return fail;
-        });
-  },
-  _do_autoLoad(type, backend, autoLoad) {
-    if (autoLoad) {
-      this.reload();
-    }
-  },
-  _getDataProvider(dataType, backend) {
-    return this.backend[this.dataType].bind(this.backend);
-  },
-  _throwErrorOnUnrecognizedType(dataType) {
-    if (TYPES.indexOf(dataType) === -1) {
-      throw new Error('BackendBehavior: Unknown dataType ' + dataType);
-    }
-  },
-};
diff --git a/tensorflow/tensorboard/components/tf_backend/requestManager.ts b/tensorflow/tensorboard/components/tf_backend/requestManager.ts
deleted file mode 100644
index 0fa198416e81e9d6a31c3ac2ae24b5a9eb6f2a3d..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/requestManager.ts
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-interface ResolveReject {
-  resolve: Function;
-  reject: Function;
-}
-/**
- * Manages many fetch requests. Launches up to nSimultaneousRequests
- * simultaneously, and maintains a LIFO queue of requests to process when
- * more urls are requested than can be handled at once. The queue can be
- * cleared.
- *
- * When a request is made, a Promise is returned which resolves with the
- * parsed JSON result from the request.
- */
-export class RequestCancellationError extends Error {
-  public name = 'RequestCancellationError';
-}
-
-export class RequestNetworkError extends Error {
-  public name: string;
-  public req: XMLHttpRequest;
-  public url: string;
-
-  constructor(req: XMLHttpRequest, url) {
-    super();
-    this.message = `RequestNetworkError: ${req.status} at ${url}`;
-    this.name = 'RequestNetworkError';
-    this.req = req;
-    this.url = url;
-  }
-}
-
-export class RequestManager {
-  private _queue: ResolveReject[];
-  private _maxRetries: number;
-  private _nActiveRequests: number;
-  private _nSimultaneousRequests: number;
-
-  constructor(nSimultaneousRequests = 10, maxRetries = 3) {
-    this._queue = [];
-    this._nActiveRequests = 0;
-    this._nSimultaneousRequests = nSimultaneousRequests;
-    this._maxRetries = maxRetries;
-  }
-
-  /**
-   * Gives a promise that loads assets from given url (respects queuing). If
-   * postData is provided, this request will use POST, not GET. This is an
-   * object mapping POST keys to string values.
-   */
-  public request(url: string, postData?: {[key: string]: string}):
-      Promise<any> {
-    const promise =
-        new Promise((resolve, reject) => {
-          const resolver = {resolve: resolve, reject: reject};
-          this._queue.push(resolver);
-          this.launchRequests();
-        })
-            .then(() => {
-              return this.promiseWithRetries(url, this._maxRetries, postData);
-            })
-            .then(
-                (response) => {
-                  // Success - Let's free space for another active
-                  // request, and launch it
-                  this._nActiveRequests--;
-                  this.launchRequests();
-                  return response;
-                },
-                (rejection) => {
-                  if (rejection.name === 'RequestNetworkError') {
-                    // If we failed due to network error, we should
-                    // decrement
-                    // _nActiveRequests because this request was
-                    // active
-                    this._nActiveRequests--;
-                    this.launchRequests();
-                  }
-                  return Promise.reject(rejection);
-                });
-    return promise;
-  }
-
-  public clearQueue() {
-    while (this._queue.length > 0) {
-      this._queue.pop().reject(
-          new RequestCancellationError('Request cancelled by clearQueue'));
-    }
-  }
-
-  /* Return number of currently pending requests */
-  public activeRequests(): number {
-    return this._nActiveRequests;
-  }
-
-  /* Return total number of outstanding requests (includes queue) */
-  public outstandingRequests(): number {
-    return this._nActiveRequests + this._queue.length;
-  }
-
-  private launchRequests() {
-    while (this._nActiveRequests < this._nSimultaneousRequests &&
-           this._queue.length > 0) {
-      this._nActiveRequests++;
-      this._queue.pop().resolve();
-    }
-  }
-
-  /**
-   * Try to request a given URL using overwritable _promiseFromUrl method.
-   * If the request fails for any reason, we will retry up to maxRetries
-   * times. In practice, this will help us paper over transient network issues
-   * like '502 Bad Gateway'.
-   * By default, Chrome displays network errors in console, so
-   * the user will be able to tell when the requests are failing. I think this
-   * is a feature, if the request failures and retries are causing any
-   * pain to users, they can see it and file issues.
-   */
-  private promiseWithRetries(
-      url: string, maxRetries: number, postData?: {[key: string]: string}) {
-    var success = (x) => x;
-    var failure = (x) => {
-      if (maxRetries > 0) {
-        return this.promiseWithRetries(url, maxRetries - 1, postData);
-      } else {
-        return Promise.reject(x);
-      }
-    };
-    return this._promiseFromUrl(url, postData).then(success, failure);
-  }
-
-  /* Actually get promise from url using XMLHttpRequest */
-  protected _promiseFromUrl(url: string, postData?: {[key: string]: string}) {
-    return new Promise((resolve, reject) => {
-      let req = new XMLHttpRequest();
-      req.open(postData ? 'POST' : 'GET', url);
-
-      let formData;
-      if (postData) {
-        // We are to make a POST request.
-        formData = new FormData();
-        for (let postKey in postData) {
-          if (postKey) {
-            // The linter requires 'for in' loops to be filtered by an if
-            // condition.
-            formData.append(postKey, postData[postKey]);
-          }
-        }
-      }
-      req.onload = function() {
-        if (req.status === 200) {
-          resolve(JSON.parse(req.responseText));
-        } else {
-          reject(new RequestNetworkError(req, url));
-        }
-      };
-      req.onerror = function() {
-        reject(new RequestNetworkError(req, url));
-      };
-      req.send(formData);
-    });
-  }
-}
diff --git a/tensorflow/tensorboard/components/tf_backend/router.ts b/tensorflow/tensorboard/components/tf_backend/router.ts
deleted file mode 100644
index 598546004e1c4ce3ec7fedc1d8cc3ef273c4dfb2..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/router.ts
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {demoify, queryEncoder} from './urlPathHelpers'
-
-export type RunTagUrlFn = (tag: string, run: string) => string;
-
-export interface Router {
-  logdir: () => string;
-  runs: () => string;
-  isDemoMode: () => boolean;
-  textRuns: () => string;
-  text: RunTagUrlFn;
-  healthPills: () => string;
-  pluginRoute: (pluginName: string, route: string) => string;
-  pluginRunTagRoute: (pluginName: string, route: string) => RunTagUrlFn;
-}
-;
-
-/**
- * Create a router for communicating with the TensorBoard backend. You
- * can pass this to `setRouter` to make it the global router.
- *
- * @param dataDir {string} The base prefix for finding data on server.
- * @param demoMode {boolean} Whether to modify urls for filesystem demo usage.
- */
-export function createRouter(dataDir = 'data', demoMode = false): Router {
-  var clean = demoMode ? demoify : (x) => x;
-  if (dataDir[dataDir.length - 1] === '/') {
-    dataDir = dataDir.slice(0, dataDir.length - 1);
-  }
-  function standardRoute(route: string, demoExtension = '.json'):
-      ((tag: string, run: string) => string) {
-    return function(tag: string, run: string): string {
-      var url =
-          dataDir + '/' + route + clean(queryEncoder({tag: tag, run: run}));
-      if (demoMode) {
-        url += demoExtension;
-      }
-      return url;
-    };
-  }
-  function pluginRoute(pluginName: string, route: string): string {
-    return `${dataDir}/plugin/${pluginName}${route}`;
-  }
-  function pluginRunTagRoute(pluginName: string, route: string):
-      ((tag: string, run: string) => string) {
-    const base = pluginRoute(pluginName, route);
-    return (tag, run) => base + clean(queryEncoder({tag, run}));
-  }
-  return {
-    logdir: () => dataDir + '/logdir',
-    runs: () => dataDir + '/runs' + (demoMode ? '.json' : ''),
-    isDemoMode: () => demoMode,
-    healthPills: () => dataDir + '/plugin/debugger/health_pills',
-    textRuns: () => dataDir + '/plugin/text/runs' + (demoMode ? '.json' : ''),
-    text: standardRoute('plugin/text/text'),
-    pluginRoute,
-    pluginRunTagRoute,
-  };
-};
-
-let _router: Router = createRouter();
-
-/**
- * @return {Router} the global router
- */
-export function getRouter(): Router {
-  return _router;
-}
-
-/**
- * Set the global router, to be returned by future calls to `getRouter`.
- * You may wish to invoke this if you are running a demo server with a
- * custom path prefix, or if you have customized the TensorBoard backend
- * to use a different path.
- *
- * @param {Router} router the new global router
- */
-export function setRouter(router: Router): void {
-  if (router == null) {
-    throw new Error('Router required, but got: ' + router);
-  }
-  _router = router;
-}
diff --git a/tensorflow/tensorboard/components/tf_backend/runsStore.ts b/tensorflow/tensorboard/components/tf_backend/runsStore.ts
deleted file mode 100644
index bcaff994ce8b304250984098b5015b828251775c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/runsStore.ts
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-import {RequestManager} from './requestManager';
-import {getRouter} from './router';
-
-let runs: string[] = [];
-
-export type Listener = () => void;
-const listeners = new Set<Listener>();
-
-const requestManager = new RequestManager(1 /* simultaneous request */);
-
-/**
- * Register a listener (nullary function) to be called when new runs are
- * available.
- */
-export function addListener(listener: Listener): void {
-  listeners.add(listener);
-}
-
-/**
- * Remove a listener registered with `addListener`.
- */
-export function removeListener(listener: Listener): void {
-  listeners.delete(listener);
-}
-
-/**
- * Asynchronously load or reload the runs data. Listeners will be
- * invoked if this causes the runs data to change.
- *
- * @see addListener
- * @return {Promise<void>} a promise that resolves when the runs have
- * loaded
- */
-export function fetchRuns(): Promise<void> {
-  const url = getRouter().runs();
-  return requestManager.request(url).then(newRuns => {
-    if (!_.isEqual(runs, newRuns)) {
-      runs = newRuns;
-      listeners.forEach(listener => {
-        listener();
-      });
-    }
-  });
-}
-
-/**
- * Get the current list of runs. If no data is available, this will be
- * an empty array (i.e., there is no distinction between "no runs" and
- * "no runs yet").
- */
-export function getRuns(): string[] {
-  return runs.slice();
-}
diff --git a/tensorflow/tensorboard/components/tf_backend/test/BUILD b/tensorflow/tensorboard/components/tf_backend/test/BUILD
deleted file mode 100644
index da70f8a9daa54bbc3b36ad1f5bbd836ada4f2878..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/BUILD
+++ /dev/null
@@ -1,32 +0,0 @@
-package(
-    default_testonly = True,
-    default_visibility = ["//tensorflow/tensorboard:internal"],
-)
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "test",
-    srcs = [
-        "tests.html",
-        "backendTests.ts",
-        "behaviorTests.ts",
-        "requestManagerTests.ts",
-    ] + glob(["data/**"]),
-    path = "/tf-backend/test",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "//tensorflow/tensorboard/components/tf_imports:web_component_tester",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    testonly = 0,
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_backend/test/backendTests.ts b/tensorflow/tensorboard/components/tf_backend/test/backendTests.ts
deleted file mode 100644
index 029c83591256fd6a8b4d342431dbeb8657afe08e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/backendTests.ts
+++ /dev/null
@@ -1,294 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-import {Backend, convertBins, filterTags, getRuns, getTags, RunToTag, TYPES} from '../backend';
-import {RequestManager} from '../requestManager';
-import {createRouter, setRouter} from '../router';
-import {BAD_CHARACTERS, demoify, queryEncoder} from '../urlPathHelpers';
-
-describe('urlPathHelpers', () => {
-  it('demoify works as expected', () => {
-    const demoified = demoify(BAD_CHARACTERS);
-    let allClean = '';
-    for (let i = 0; i < BAD_CHARACTERS.length; i++) {
-      allClean += '_';
-    }
-    chai.assert.equal(demoified, allClean, 'cleaning the BAD_CHARACTERS works');
-    chai.assert.equal(demoify('foozod'), 'foozod', 'doesnt change safe string');
-    chai.assert.equal(demoify('foo zod (2)'), 'foo_zod__2_', 'simple case');
-  });
-
-  it('queryEncoder works with demoify on spaces and parens', () => {
-    const params = {foo: 'something with spaces and (parens)'};
-    const actual = demoify(queryEncoder(params));
-    const expected = '_foo_something_with_spaces_and__28parens_29';
-    chai.assert.equal(actual, expected);
-  });
-});
-
-function assertIsDatum(x) {
-  chai.assert.isNumber(x.step);
-  chai.assert.instanceOf(x.wall_time, Date);
-}
-
-describe('backend tests', () => {
-  let backend: Backend;
-  let rm: RequestManager;
-  const base = 'data';
-  const demoRouter = createRouter(base, /*demoMode=*/true);
-  beforeEach(() => {
-    // Construct a demo Backend (third param is true)
-    setRouter(demoRouter);
-    backend = new Backend();
-    rm = new RequestManager();
-  });
-
-  it('runs are loaded properly', (done) => {
-    const runsResponse = backend.runs();
-    const actualRuns = rm.request(demoRouter.runs());
-    Promise.all([runsResponse, actualRuns]).then((values) => {
-      chai.assert.deepEqual(values[0], values[1]);
-      done();
-    });
-  });
-
-  it('scalars are loaded properly', (done) => {
-    backend.scalar('cross_entropy (1)', 'run1').then((s) => {
-      // just check the data got reformatted properly
-      const aScalar = s[s.length - 1];
-      assertIsDatum(aScalar);
-      chai.assert.isNumber(aScalar.scalar);
-      // verify date conversion works
-      chai.assert.equal(aScalar.wall_time.valueOf(), 40000);
-      done();
-    });
-  });
-
-  it('histograms are loaded properly', (done) => {
-    backend.histogram('histo1', 'run1').then((histos) => {
-      const histo = histos[0];
-      assertIsDatum(histo);
-      chai.assert.instanceOf(histo.bins, Array);
-      done();
-    });
-  });
-
-  it('all registered types have handlers', () => {
-    TYPES.forEach((t: string) => {
-      chai.assert.isDefined(backend[t], t);
-      chai.assert.isDefined(backend[t + 'Runs'], t + 'Runs');
-    });
-  });
-
-  it('images are loaded properly', (done) => {
-    backend.image('im1', 'run1').then((images) => {
-      const image = images[0];
-      assertIsDatum(image);
-      chai.assert.isNumber(image.width);
-      chai.assert.isNumber(image.height);
-      done();
-    });
-  });
-
-  it('audio is loaded properly', (done) => {
-    backend.audio('audio1', 'run1').then((audioClips) => {
-      const audio = audioClips[0];
-      assertIsDatum(audio);
-      chai.assert.equal(audio.content_type, 'audio/wav');
-      done();
-    });
-  });
-
-  it('trailing slash removed from base route', () => {
-    const r = createRouter('foo/');
-    chai.assert.equal(r.runs(), 'foo/runs');
-  });
-
-  it('run helper methods work', (done) => {
-    const scalar = {run1: ['cross_entropy (1)'], fake_run_no_data: ['scalar2']};
-    const image = {run1: ['im1'], fake_run_no_data: ['im1', 'im2']};
-    const audio = {run1: ['audio1'], fake_run_no_data: ['audio1', 'audio2']};
-    const runMetadata = {run1: ['step99'], fake_run_no_data: ['step99']};
-    const graph = ['fake_run_no_data'];
-    let count = 0;
-    function next() {
-      count++;
-      if (count === 4) {
-        done();
-      }
-    }
-    backend.scalarTags().then((x) => {
-      chai.assert.deepEqual(x, scalar);
-      next();
-    });
-    backend.imageTags().then((x) => {
-      chai.assert.deepEqual(x, image);
-      next();
-    });
-    backend.audioTags().then((x) => {
-      chai.assert.deepEqual(x, audio);
-      next();
-    });
-    backend.runMetadataTags().then((x) => {
-      chai.assert.deepEqual(x, runMetadata);
-      next();
-    });
-    backend.graphRuns().then((x) => {
-      chai.assert.deepEqual(x, graph);
-      next();
-    });
-  });
-
-  it('runToTag helpers work', () => {
-    const r2t: RunToTag = {
-      run1: ['foo', 'bar', 'zod'],
-      run2: ['zod', 'zoink'],
-      a: ['foo', 'zod']
-    };
-    const empty1: RunToTag = {};
-    const empty2: RunToTag = {run1: [], run2: []};
-    chai.assert.deepEqual(getRuns(r2t), ['a', 'run1', 'run2']);
-    chai.assert.deepEqual(getTags(r2t), ['bar', 'foo', 'zod', 'zoink']);
-    chai.assert.deepEqual(filterTags(r2t, ['run1', 'run2']), getTags(r2t));
-    chai.assert.deepEqual(filterTags(r2t, ['run1']), ['bar', 'foo', 'zod']);
-    chai.assert.deepEqual(
-        filterTags(r2t, ['run2', 'a']), ['foo', 'zod', 'zoink']);
-
-    chai.assert.deepEqual(getRuns(empty1), []);
-    chai.assert.deepEqual(getTags(empty1), []);
-
-    chai.assert.deepEqual(getRuns(empty2), ['run1', 'run2']);
-    chai.assert.deepEqual(getTags(empty2), []);
-  });
-});
-
-describe('Verify that the histogram format conversion works.', () => {
-
-  function assertHistogramEquality(h1, h2) {
-    h1.forEach((b1, i) => {
-      const b2 = h2[i];
-      chai.assert.closeTo(b1.x, b2.x, 1e-10);
-      chai.assert.closeTo(b1.dx, b2.dx, 1e-10);
-      chai.assert.closeTo(b1.y, b2.y, 1e-10);
-    });
-  }
-
-  it('Throws and error if the inputs are of different lengths', () => {
-    chai.assert.throws(() => {
-      convertBins(
-          {bucketRightEdges: [0], bucketCounts: [1, 2], min: 1, max: 2}, 1, 2,
-          2);
-    }, 'Edges and counts are of different lengths.');
-  });
-
-  it('Handles data with no bins', () => {
-    chai.assert.deepEqual(
-        convertBins(
-            {bucketRightEdges: [], bucketCounts: [], min: 0, max: 0}, 0, 0, 0),
-        []);
-  });
-
-  it('Handles data with one bin', () => {
-    const counts = [1];
-    const rightEdges = [1.21e-12];
-    const histogram = [{x: 1.1e-12, dx: 1.21e-12 - 1.1e-12, y: 1}];
-    const newHistogram = convertBins(
-        {
-          bucketRightEdges: rightEdges,
-          bucketCounts: counts,
-          min: 1.1e-12,
-          max: 1.21e-12
-        },
-        1.1e-12, 1.21e-12, 1);
-    assertHistogramEquality(newHistogram, histogram);
-  });
-
-  it('Handles data with two bins.', () => {
-    const counts = [1, 2];
-    const rightEdges = [1.1e-12, 1.21e-12];
-    const histogram = [
-      {x: 1.0e-12, dx: 1.05e-13, y: 1.09090909090909},
-      {x: 1.105e-12, dx: 1.05e-13, y: 1.9090909090909}
-    ];
-    const newHistogram = convertBins(
-        {
-          bucketRightEdges: rightEdges,
-          bucketCounts: counts,
-          min: 1.0e-12,
-          max: 1.21e-12
-        },
-        1.0e-12, 1.21e-12, 2);
-    assertHistogramEquality(newHistogram, histogram);
-  });
-
-  it('Handles a domain that crosses zero, but doesn\'t include zero as ' +
-         'an edge.',
-     () => {
-       const counts = [1, 2];
-       const rightEdges = [-1.0e-12, 1.0e-12];
-       const histogram = [
-         {x: -1.1e-12, dx: 1.05e-12, y: 1.95},
-         {x: -0.5e-13, dx: 1.05e-12, y: 1.05}
-       ];
-       const newHistogram = convertBins(
-           {
-             bucketRightEdges: rightEdges,
-             bucketCounts: counts,
-             min: -1.1e-12,
-             max: 1.0e-12
-           },
-           -1.1e-12, 1.0e-12, 2);
-       assertHistogramEquality(newHistogram, histogram);
-     });
-
-  it('Handles a histogram of all zeros', () => {
-    const h = {
-      min: 0,
-      max: 0,
-      nItems: 51200,
-      sum: 0,
-      sumSquares: 0,
-      bucketRightEdges: [0, 1e-12, 1.7976931348623157e+308],
-      bucketCounts: [0, 51200, 0],
-      wall_time: '2017-01-25T02:30:11.257Z',
-      step: 0
-    };
-    const newHistogram = convertBins(h, 0, 0, 5);
-    const expectedHistogram = [
-      {x: -1, dx: 0.4, y: 0}, {x: -0.6, dx: 0.4, y: 0},
-      {x: -0.2, dx: 0.4, y: 51200}, {x: 0.2, dx: 0.4, y: 0},
-      {x: 0.6, dx: 0.4, y: 0}
-    ];
-    assertHistogramEquality(newHistogram, expectedHistogram);
-  });
-
-  it('Handles a right-most right edge that extends to very large number.',
-     () => {
-       const counts = [1, 2, 3];
-       const rightEdges = [0, 1.0e-12, 1.0e14];
-       const histogram = [
-         {x: -1.0e-12, dx: 0.7e-12, y: 0.7}, {x: -0.3e-12, dx: 0.7e-12, y: 1.1},
-         {x: 0.4e-12, dx: 0.7e-12, y: 4.2}
-       ];
-       const newHistogram = convertBins(
-           {
-             bucketRightEdges: rightEdges,
-             bucketCounts: counts,
-             min: -1.0e-12,
-             max: 1.1e-12
-           },
-           -1.0e-12, 1.1e-12, 3);
-       assertHistogramEquality(newHistogram, histogram);
-     });
-});
diff --git a/tensorflow/tensorboard/components/tf_backend/test/behaviorTests.ts b/tensorflow/tensorboard/components/tf_backend/test/behaviorTests.ts
deleted file mode 100644
index 6bf328140e21a74b2bc66d1b75e89f2d8c02be0e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/behaviorTests.ts
+++ /dev/null
@@ -1,165 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {Backend, getRuns, getTags, RunToTag} from '../backend'
-import {BackendBehavior} from '../behavior'
-
-declare function fixture(id: string): void;
-
-window.addEventListener('WebComponentsReady', function() {
-  Polymer({
-    is: 'test-element',
-    behaviors: [BackendBehavior],
-    frontendReload: function() {
-      // no-op
-    },
-  });
-});
-
-describe('data-behavior', function() {
-  let testElement;
-  let resolve;
-  let reject;
-  const fakeBackend = {
-    scalarTags() {
-      return new Promise((_resolve, _reject) => {
-        resolve = (x) => _resolve(x);
-        reject = (x) => _reject(x);
-      });
-    },
-    scalar(x) {
-      return this;
-    },
-  };
-  beforeEach(function() {
-    testElement = fixture('testElementFixture');
-    testElement.autoLoad = false;
-    testElement.backend = fakeBackend;
-    testElement.dataType = 'scalar';
-  });
-
-  it('load states work as expected', function(done) {
-    chai.assert.equal(testElement.loadState, 'noload');
-    var reloaded = testElement.reload();
-    chai.assert.equal(testElement.loadState, 'pending');
-    resolve();
-    reloaded
-        .then(function() {
-          chai.assert.equal(testElement.loadState, 'loaded');
-          var reloaded2 = testElement.reload();
-          chai.assert.equal(testElement.loadState, 'pending');
-          reject();
-          return reloaded2;
-        })
-        .then(function() {
-          chai.assert.equal(testElement.loadState, 'failure');
-          done();
-        });
-  });
-
-  it('data provider set appropriately', function() {
-    chai.assert.deepEqual(testElement.dataProvider(), testElement.backend);
-  });
-
-  it('loads data as expected', function(done) {
-    var r2t: RunToTag = {
-      run1: ['foo', 'bar', 'zod'],
-      run2: ['zoink', 'zow'],
-      run3: ['.'],
-    };
-    var tags = getTags(r2t);
-    var runs = getRuns(r2t);
-    testElement.backend = fakeBackend;
-    testElement.dataType = 'scalar';
-    testElement.reload().then(function(x) {
-      chai.assert.deepEqual(testElement.run2tag, r2t);
-      chai.assert.deepEqual(testElement.runs, runs);
-      chai.assert.deepEqual(testElement.tags, tags);
-      done();
-    });
-    resolve(r2t);
-  });
-
-  it('errors thrown on bad data types', function() {
-    testElement.backend = undefined;
-    chai.assert.throws(function() {
-      testElement.dataType = 'foo';
-    });
-    testElement.dataType = 'scalar';
-    testElement.dataType = 'graph';
-    testElement.dataType = 'histogram';
-  });
-
-  it('dataNotFound flag works', function(done) {
-    chai.assert.isFalse(testElement.dataNotFound, 'initially false');
-    var next = testElement.reload();
-    chai.assert.isFalse(testElement.dataNotFound, 'still false while pending');
-    resolve({foo: [], bar: []});
-    next.then(() => {
-      chai.assert.isTrue(testElement.dataNotFound, 'true on empty data');
-      var last = testElement.reload();
-      chai.assert.isTrue(testElement.dataNotFound, 'still true while pending');
-      resolve({foo: ['bar'], bar: ['zod']});
-      last.then(() => {
-        chai.assert.isFalse(
-            testElement.dataNotFound, 'false now that we have data');
-        done();
-      });
-    });
-  });
-
-  it('reloads as soon as setup, if autoReload is true', function(done) {
-    var r2t = {foo: [], bar: []};
-    var fakeBackend = {
-      scalarTags: () => Promise.resolve(r2t),
-      scalar: () => null,
-    };
-    testElement = fixture('testElementFixture');
-    testElement.dataType = 'scalar';
-    testElement.backend = fakeBackend;
-    setTimeout(() => {
-      chai.assert.equal(testElement.run2tag, r2t);
-      done();
-    });
-  });
-
-  it('doesn\'t mutate props if backend returns same data', function(done) {
-    var r2t_1 = {foo: ['1', '2'], bar: ['3', '4']};
-    var r2t_2 = {foo: ['1', '2'], bar: ['3', '4']};
-    var fakeBackend = {
-      scalarTags: () => Promise.resolve(r2t_1),
-      scalar: () => null,
-    };
-    testElement.backend = fakeBackend;
-    testElement.reload().then(() => {
-      fakeBackend.scalarTags = () => Promise.resolve(r2t_2);
-      var tags = testElement.tags;
-      testElement.reload().then(() => {
-        // shallow equality ensures it wasn't recomputed
-        chai.assert.equal(tags, testElement.tags, 'tags was not recomputed');
-        done();
-      });
-    });
-  });
-
-  // TODO(dandelion): Fix this test.
-  it('reload calls frontendReload', function(done) {
-    testElement.frontendReload = function() {
-      done();
-    };
-    testElement.reload();
-  });
-
-});
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/audio_run_run1_tag_audio1.json b/tensorflow/tensorboard/components/tf_backend/test/data/audio_run_run1_tag_audio1.json
deleted file mode 100644
index 21a00f198d65cbc06e5db0c2bd3d1f6eb5149149..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/data/audio_run_run1_tag_audio1.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"wall_time": 0, "step": 0, "query": "index=0&tag=audio1&run=run1", "content_type": "audio/wav"}]
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/compressedHistograms_run_run1_tag_histo1.json b/tensorflow/tensorboard/components/tf_backend/test/data/compressedHistograms_run_run1_tag_histo1.json
deleted file mode 100644
index 8b4c088392db36c3172df15f50f21587640faf0f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/data/compressedHistograms_run_run1_tag_histo1.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0, 0, [[0, -2.3150592308536755], [668, -2.0967547155036605], [1587, -1.4326244423655616], [3085, -0.8871306575801902], [5000, -0.09312398815580714], [6915, 0.2584093405812282], [8413, 0.8895470642005087], [9332, 1.3198979614453679], [10000, 1.6793308878855118]]], [100.0, 10, [[0, -1.3417572789138936], [668, -1.183563374619141], [1587, -0.48920418783271574], [3085, 0.29326906896076954], [5000, 0.56953784145381], [6915, 0.8684655583499333], [8413, 1.4133127368907181], [9332, 1.906140650457873], [10000, 2.135771998171255]]], [200.0, 20, [[0, -1.5066917525035333], [668, -1.3910909571770793], [1587, -0.902737218885874], [3085, -0.3807791904765027], [5000, 0.38900200905253046], [6915, 0.8209734209339482], [8413, 1.302385856695965], [9332, 1.9324626053521639], [10000, 2.957505317875451]]], [300.0, 30, [[0, -0.5430457051469562], [668, -0.4626161834245273], [1587, 0.21573949543027715], [3085, 0.37353741100174215], [5000, 0.6891407881591103], [6915, 1.0927156232630852], [8413, 1.2745337159550916], [9332, 1.4321116832891605], [10000, 2.1913774993059034]]], [400.0, 40, [[0, -0.3584790755077172], [668, -0.33301611509753215], [1587, -0.1089466072951948], [3085, 0.5792199847585249], [5000, 1.220854943811942], [6915, 1.759829438421432], [8413, 2.3072559906741614], [9332, 2.753036118353921], [10000, 3.0267252195784047]]]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/example.json b/tensorflow/tensorboard/components/tf_backend/test/data/example.json
deleted file mode 100644
index 8adc6fb896a873d0ade470cedf9363f77834c2f6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/data/example.json
+++ /dev/null
@@ -1,4 +0,0 @@
-{
-  "foo": 3,
-  "bar": "zoidberg"
-}
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/histograms_run_run1_tag_histo1.json b/tensorflow/tensorboard/components/tf_backend/test/data/histograms_run_run1_tag_histo1.json
deleted file mode 100644
index a5600a356e8277e58be3b2891c3e328d058b5d08..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/data/histograms_run_run1_tag_histo1.json
+++ /dev/null
@@ -1 +0,0 @@
-[[400.0, 40, [-0.3584790755077172, 3.0267252195784047, 20.0, 24.012225532303315, 48.29045006426564, [-0.35363819004775493, -0.29226296698161564, -0.19961953895336082, 0.3214892636797772, 0.5177616740489182, 0.56953784145381, 0.6264916255991911, 0.7580548669750213, 0.8338603536725235, 1.220854943811942, 1.3429404381931362, 1.47723448201245, 1.624957930213695, 1.7874537232350647, 1.9661990955585713, 2.379100905625872, 2.6170109961884593, 3.1665833053880363], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0]]]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/images_run_run1_tag_im1.json b/tensorflow/tensorboard/components/tf_backend/test/data/images_run_run1_tag_im1.json
deleted file mode 100644
index fd2a96b62fee897c4f16e13071bfe6a76d813b72..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/data/images_run_run1_tag_im1.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"wall_time": 0, "step": 0, "query": "index=0&tag=im1&run=run1", "width": 1, "height": 1}]
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/individualImage_index_0_tag_im1_run_run1.png b/tensorflow/tensorboard/components/tf_backend/test/data/individualImage_index_0_tag_im1_run_run1.png
deleted file mode 100644
index f191b280ce91e6cb8c387735c10ef9bc5da6c83b..0000000000000000000000000000000000000000
Binary files a/tensorflow/tensorboard/components/tf_backend/test/data/individualImage_index_0_tag_im1_run_run1.png and /dev/null differ
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/run_metadata_run_step99_tag_train.pbtxt b/tensorflow/tensorboard/components/tf_backend/test/data/run_metadata_run_step99_tag_train.pbtxt
deleted file mode 100644
index 07ce4fad5392db0ee24f2cc132393b8c3c36d3d9..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/data/run_metadata_run_step99_tag_train.pbtxt
+++ /dev/null
@@ -1,17 +0,0 @@
-step_stats {
-  dev_stats {
-    device: "/job:localhost/replica:0/task:0/cpu:0"
-    node_stats {
-      node_name: "_SOURCE"
-      all_start_micros: 1459365298611334
-      op_start_rel_micros: 29
-      op_end_rel_micros: 30
-      all_end_rel_micros: 52
-      memory {
-        allocator_name: "cpu"
-      }
-      timeline_label: "_SOURCE = NoOp()"
-      scheduled_micros: 1459365298611291
-    }
-  }
-}
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/runs.json b/tensorflow/tensorboard/components/tf_backend/test/data/runs.json
deleted file mode 100644
index 413ddb9ab34d8599bf59b2451d914b49073aa3e7..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/data/runs.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "run1": {
-    "images": [
-      "im1"
-    ],
-    "audio": [
-      "audio1"
-    ],
-    "scalars": [
-      "cross_entropy (1)"
-    ],
-    "histograms": [
-      "histo1"
-    ],
-    "compressedHistograms": [
-      "histo1"
-    ],
-    "run_metadata": [
-      "step99"
-    ],
-    "graph": false
-  },
-  "fake_run_no_data": {
-    "images": ["im1", "im2"],
-    "audio": ["audio1", "audio2"],
-    "scalars": ["scalar2"],
-    "histograms": ["histo1"],
-    "compressedHistograms": ["histo1"],
-    "run_metadata": ["step99"],
-    "graph": true
-  }
-}
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/scalars.json b/tensorflow/tensorboard/components/tf_backend/test/data/scalars.json
deleted file mode 100644
index bc9d3353d5fcaec6144b95e4f475a1b51d5a250c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/data/scalars.json
+++ /dev/null
@@ -1 +0,0 @@
-{"run1": {"cross_entropy (1)": [[0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]]}}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/scalars_run_run1_tag_cross_entropy__281_29.json b/tensorflow/tensorboard/components/tf_backend/test/data/scalars_run_run1_tag_cross_entropy__281_29.json
deleted file mode 100644
index 97b0062f0f0c39a0eb393d8599eb00eecfe866fa..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/data/scalars_run_run1_tag_cross_entropy__281_29.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_backend/test/requestManagerTests.ts b/tensorflow/tensorboard/components/tf_backend/test/requestManagerTests.ts
deleted file mode 100644
index 3800e6e40213a9d873b54f2e7514fb2a43b6340e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/requestManagerTests.ts
+++ /dev/null
@@ -1,294 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {RequestManager, RequestNetworkError} from '../requestManager';
-
-interface MockRequest {
-  resolve: Function;
-  reject: Function;
-  id: number;
-  url: string;
-}
-
-class MockedRequestManager extends RequestManager {
-  private resolvers: Function[];
-  private rejectors: Function[];
-  public requestsDispatched: number;
-  constructor(maxRequests = 10, maxRetries = 3) {
-    super(maxRequests, maxRetries);
-    this.resolvers = [];
-    this.rejectors = [];
-    this.requestsDispatched = 0;
-  }
-  protected _promiseFromUrl(url) {
-    return new Promise((resolve, reject) => {
-      const mockJSON = {
-        ok: true,
-        json() {
-          return url;
-        },
-        url,
-        status: 200,
-      };
-      const mockFailedRequest: any = {
-        ok: false,
-        url,
-        status: 502,
-      };
-      const mockFailure = new RequestNetworkError(mockFailedRequest, url);
-      this.resolvers.push(() => {
-        resolve(mockJSON);
-      });
-      this.rejectors.push(() => {
-        reject(mockFailure);
-      });
-      this.requestsDispatched++;
-    });
-  }
-  public resolveFakeRequest() {
-    this.resolvers.pop()();
-  }
-  public rejectFakeRequest() {
-    this.rejectors.pop()();
-  }
-  public dispatchAndResolve() {
-    // Wait for at least one request to be dispatched, then resolve it.
-    this.waitForDispatch(1).then(() => this.resolveFakeRequest());
-  }
-  public waitForDispatch(num) {
-    return waitForCondition(() => {
-      return this.requestsDispatched >= num;
-    });
-  }
-}
-
-/** Create a promise that returns when *check* returns true.
- * May cause a test timeout if check never becomes true.
- */
-
-function waitForCondition(check: () => boolean): Promise<any> {
-  return new Promise((resolve, reject) => {
-    const go = () => {
-      if (check()) {
-        resolve();
-      }
-      setTimeout(go, 2);
-    };
-    go();
-  });
-}
-
-describe('backend', () => {
-  describe('request manager', () => {
-    it('request loads JSON properly', (done) => {
-      const rm = new RequestManager();
-      const promise = rm.request('data/example.json');
-      promise.then(
-          (response) => {
-            chai.assert.deepEqual(response, {foo: 3, bar: 'zoidberg'});
-            done();
-          },
-          (reject) => {
-            throw new Error(reject);
-          });
-    });
-
-    it('rejects on bad url', (done) => {
-      const rm = new RequestManager(5, 0);
-      const badUrl = '_bad_url_which_doesnt_exist.json';
-      const promise = rm.request(badUrl);
-      promise.then(
-          (success) => {
-            done(new Error('the promise should have rejected'));
-          },
-          (reject: RequestNetworkError) => {
-            chai.assert.include(reject.message, '404');
-            chai.assert.include(reject.message, badUrl);
-            chai.assert.equal(reject.req.status, 404);
-            done();
-          });
-    });
-
-    it('can retry if requests fail', (done) => {
-      const rm = new MockedRequestManager(3, 5);
-      const r = rm.request('foo');
-      rm.waitForDispatch(1)
-          .then(() => {
-            rm.rejectFakeRequest();
-            return rm.waitForDispatch(2);
-          })
-          .then(() => rm.resolveFakeRequest());
-      r.then((success) => done());
-    });
-
-    it('retries at most maxRetries times', (done) => {
-      const MAX_RETRIES = 2;
-      const rm = new MockedRequestManager(3, MAX_RETRIES);
-      const r = rm.request('foo');
-      rm.waitForDispatch(1)
-          .then(() => {
-            rm.rejectFakeRequest();
-            return rm.waitForDispatch(2);
-          })
-          .then(() => {
-            rm.rejectFakeRequest();
-            return rm.waitForDispatch(3);
-          })
-          .then(() => {
-            rm.rejectFakeRequest();
-          });
-
-      r.then(
-          (success) => done(new Error('The request should have failed')),
-          (failure) => done());
-    });
-
-    it('requestManager only sends maxRequests requests at a time', (done) => {
-      const rm = new MockedRequestManager(3);
-      const r0 = rm.request('1');
-      const r1 = rm.request('2');
-      const r2 = rm.request('3');
-      const r3 = rm.request('4');
-      chai.assert.equal(rm.activeRequests(), 3, 'three requests are active');
-      chai.assert.equal(
-          rm.outstandingRequests(), 4, 'four requests are pending');
-      rm.waitForDispatch(3)
-          .then(() => {
-            chai.assert.equal(
-                rm.activeRequests(), 3, 'three requests are still active (1)');
-            chai.assert.equal(
-                rm.requestsDispatched, 3, 'three requests were dispatched');
-            rm.resolveFakeRequest();
-            return rm.waitForDispatch(4);
-          })
-          .then(() => {
-            chai.assert.equal(
-                rm.activeRequests(), 3, 'three requests are still active (2)');
-            chai.assert.equal(
-                rm.requestsDispatched, 4, 'four requests were dispatched');
-            chai.assert.equal(
-                rm.outstandingRequests(), 3, 'three requests are pending');
-            rm.resolveFakeRequest();
-            rm.resolveFakeRequest();
-            rm.resolveFakeRequest();
-            return r3;
-          })
-          .then(() => {
-            chai.assert.equal(rm.activeRequests(), 0, 'all requests finished');
-            chai.assert.equal(
-                rm.outstandingRequests(), 0, 'no requests pending');
-            done();
-          });
-    });
-
-    it('queue continues after failures', (done) => {
-      const rm = new MockedRequestManager(1, 0);
-      const r0 = rm.request('1');
-      const r1 = rm.request('2');
-      rm.waitForDispatch(1).then(() => {
-        rm.rejectFakeRequest();
-      });
-
-      r0.then(
-            (success) => done(new Error('r0 should have failed')),
-            (failure) => 'unused_argument')
-          .then(() => rm.resolveFakeRequest());
-
-      // When the first request rejects, it should decrement nActiveRequests
-      // and then launch remaining requests in queue (i.e. this one)
-      r1.then((success) => done(), (failure) => done(new Error(failure)));
-    });
-
-    it('queue is LIFO', (done) => {
-      /* This test is a bit tricky.
-       * We want to verify that the RequestManager queue has LIFO semantics.
-       * So we construct three requests off the bat: A, B, C.
-       * So LIFO semantics ensure these will resolve in order A, C, B.
-       * (Because the A request launches immediately when we create it, it's
-       * not in queue)
-       * Then after resolving A, C moves out of queue, and we create X.
-       * So expected final order is A, C, X, B.
-       * We verify this with an external var that counts how many requests were
-       * resolved.
-       */
-      const rm = new MockedRequestManager(1);
-      let nResolved = 0;
-      function assertResolutionOrder(expectedSpotInSequence) {
-        return () => {
-          nResolved++;
-          chai.assert.equal(expectedSpotInSequence, nResolved);
-        };
-      }
-
-      function launchThirdRequest() {
-        rm.request('started late but goes third')
-            .then(assertResolutionOrder(3))
-            .then(() => rm.dispatchAndResolve());
-      }
-
-      rm.request('first')
-          .then(
-              assertResolutionOrder(1))  // Assert that this one resolved first
-          .then(launchThirdRequest)
-          .then(() => rm.dispatchAndResolve());  // then trigger the next one
-
-      rm.request('this one goes fourth')  // created second, will go last
-          .then(assertResolutionOrder(
-              4))       // assert it was the fourth to get resolved
-          .then(done);  // finish the test
-
-      rm.request('second')
-          .then(assertResolutionOrder(2))
-          .then(() => rm.dispatchAndResolve());
-
-      rm.dispatchAndResolve();
-    });
-
-    it('requestManager can clear queue', (done) => {
-      const rm = new MockedRequestManager(1);
-      let requestsResolved = 0;
-      let requestsRejected = 0;
-      const success = () => requestsResolved++;
-      const failure = (err) => {
-        chai.assert.equal(err.name, 'RequestCancellationError');
-        requestsRejected++;
-      };
-      const finishTheTest = () => {
-        chai.assert.equal(rm.activeRequests(), 0, 'no requests still active');
-        chai.assert.equal(
-            rm.requestsDispatched, 1, 'only one req was ever dispatched');
-        chai.assert.equal(rm.outstandingRequests(), 0, 'no pending requests');
-        chai.assert.equal(requestsResolved, 1, 'one request got resolved');
-        chai.assert.equal(
-            requestsRejected, 4, 'four were cancelled and threw errors');
-        done();
-      };
-      rm.request('0').then(success, failure).then(finishTheTest);
-      rm.request('1').then(success, failure);
-      rm.request('2').then(success, failure);
-      rm.request('3').then(success, failure);
-      rm.request('4').then(success, failure);
-      chai.assert.equal(rm.activeRequests(), 1, 'one req is active');
-      rm.waitForDispatch(1).then(() => {
-        chai.assert.equal(rm.activeRequests(), 1, 'one req is active');
-        chai.assert.equal(rm.requestsDispatched, 1, 'one req was dispatched');
-        chai.assert.equal(rm.outstandingRequests(), 5, 'five reqs outstanding');
-        rm.clearQueue();
-        rm.resolveFakeRequest();
-        // resolving the first request triggers finishTheTest
-      });
-    });
-  });
-});
diff --git a/tensorflow/tensorboard/components/tf_backend/test/tests.html b/tensorflow/tensorboard/components/tf_backend/test/tests.html
deleted file mode 100644
index 58cb89a30b6a0c44de0de5faa1999fc2de56ca7c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/tests.html
+++ /dev/null
@@ -1,37 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-<head>
-  <meta charset="utf-8">
-  <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-  <script src="../../web-component-tester/browser.js"></script>
-  <link rel="import" href="../../polymer/polymer.html">
-  <link rel="import" href="../tf-backend.html">
-</head>
-<body>
-  <test-fixture id="testElementFixture">
-    <template>
-      <test-element id="test"></test-element>
-    </template>
-  </test-fixture>
-  <script src="backendTests.js"></script>
-  <script src="behaviorTests.js"></script>
-  <script src="requestManagerTests.js"></script>
-</body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_backend/tf-backend.html b/tensorflow/tensorboard/components/tf_backend/tf-backend.html
deleted file mode 100644
index c2a44b3b63f5598d1e467c8f72e59464fdfe635e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/tf-backend.html
+++ /dev/null
@@ -1,28 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-imports/lodash.html">
-<link rel="import" href="../tf-imports/d3.html">
-<link rel="import" href="../vz-sorting/vz-sorting.html">
-
-<script src="requestManager.js"></script>
-<script src="urlPathHelpers.js"></script>
-<script src="router.js"></script>
-<script src="runsStore.js"></script>
-<script src="backend.js"></script>
-<script src="behavior.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_backend/urlPathHelpers.ts b/tensorflow/tensorboard/components/tf_backend/urlPathHelpers.ts
deleted file mode 100644
index 62519dac5ca73b4b62880319dc81c80a188b337e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/urlPathHelpers.ts
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-export const BAD_CHARACTERS = '#%&{}\\/<>*? $!\'":@+`|=() ';
-/** Cleanup a url so that it can be loaded from a filesystem. */
-export function demoify(s) {
-  // for consistency with python's urllib.urlencode
-  s = s.replace(new RegExp('%20', 'g'), '+');
-  for (let i = 0; i < BAD_CHARACTERS.length; i++) {
-    const c = BAD_CHARACTERS[i];
-    s = s.replace(new RegExp('\\' + c, 'g'), '_');
-  }
-  return s;
-}
-
-export function queryEncoder(params?: any): string {
-  // It's important that the keys be sorted, so we always grab the right file
-  // if we are talking to the backend generated by serialze_tensorboard.py
-  if (params == null) {
-    return '';
-  }
-  const components = _.keys(params)
-                       .sort()
-                       .filter((k) => params[k] !== undefined)
-                       .map((k) => k + '=' + encodeURIComponent(params[k]));
-  const result = components.length ? '?' + components.join('&') : '';
-  // Replace parens for consistency with urllib.urlencode
-  return result.replace(/\(/g, '%28').replace(/\)/g, '%29');
-}
diff --git a/tensorflow/tensorboard/components/tf_color_scale/BUILD b/tensorflow/tensorboard/components/tf_color_scale/BUILD
deleted file mode 100644
index 730ab37d6f7635366e22a40f7fd2a93f9536f73f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_color_scale/BUILD
+++ /dev/null
@@ -1,39 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_color_scale",
-    srcs = [
-        "colorScale.ts",
-        "palettes.ts",
-        "tf-color-scale.html",
-    ],
-    path = "/tf-color-scale",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-    ],
-)
-
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"],
-    path = "/tf-color-scale",
-    deps = [
-        ":tf_color_scale",
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_button",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_color_scale/colorScale.ts b/tensorflow/tensorboard/components/tf_color_scale/colorScale.ts
deleted file mode 100644
index e20a65cdd84edccc9dea6e7c1955f21dcce75453..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_color_scale/colorScale.ts
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Example usage:
-// runs = ["train", "test", "test1", "test2"]
-// ccs = new ColorScale();
-// ccs.domain(runs);
-// ccs.getColor("train");
-// ccs.getColor("test1");
-
-import {palettes} from './palettes';
-
-export class ColorScale {
-  private identifiers = d3.map();
-
-  /**
-   * Creates a color scale with optional custom palette.
-   * @param {Array<string>} [palette=palettes.googleColorBlind] - The color
-   *     palette you want as an Array of hex strings.
-   */
-  constructor(
-      private readonly palette: string[] = palettes.googleColorBlindAssist) {}
-
-  /**
-   * Set the domain of strings.
-   * @param {Array<string>} strings - An array of possible strings to use as the
-   *     domain for your scale.
-   */
-  public domain(strings: string[]): this {
-    this.identifiers = d3.map();
-
-    // TODO(wchargin): Remove this call to `sort` once we have only a
-    // singleton ColorScale, linked directly to the RunsStore, which
-    // will always give sorted output.
-    strings = strings.slice();
-    strings.sort();
-
-    strings.forEach((s, i) => {
-      this.identifiers.set(s, this.palette[i % this.palette.length]);
-    });
-    return this;
-  }
-
-  /**
-   * Use the color scale to transform an element in the domain into a color.
-   * @param {string} The input string to map to a color.
-   * @return {string} The color corresponding to that input string.
-   * @throws Will error if input string is not in the scale's domain.
-   */
-  public scale(s: string): string {
-    if (!this.identifiers.has(s)) {
-      throw new Error('String was not in the domain.');
-    }
-    return this.identifiers.get(s) as string;
-  }
-}
-
-Polymer({
-  is: 'tf-color-scale',
-  properties: {
-    runs: {
-      type: Array,
-    },
-    outColorScale: {
-      type: Object,
-      readOnly: true,
-      notify: true,
-      value() {
-        return new ColorScale();
-      },
-    },
-  },
-  observers: ['updateColorScale(runs.*)'],
-  updateColorScale(runsChange) {
-    this.outColorScale.domain(this.runs);
-  },
-});
diff --git a/tensorflow/tensorboard/components/tf_color_scale/index.html b/tensorflow/tensorboard/components/tf_color_scale/index.html
deleted file mode 100644
index 81dfab098c6d86dfc6b666aa26d0d39f4ad3ae8e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_color_scale/index.html
+++ /dev/null
@@ -1,94 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<meta charset="utf-8">
-<meta name="viewport" content="width=device-width, initial-scale=1.0">
-<title>tf-color-scale demo</title>
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../iron-demo-helpers/demo-snippet.html">
-<link rel="import" href="../paper-button/paper-button.html">
-<link rel="import" href="../paper-styles/typography.html">
-<link rel="import" href="../tf-imports/d3.html">
-<link rel="import" href="tf-color-scale.html">
-
-<style> body {font-family: "Roboto";}</style>
-<demo-snippet>
-  <template>
-    <dom-module id="color-scale-demo">
-      <template>
-        <paper-button raised id="button" on-tap="change">Change Runs</paper-button>
-        <tf-color-scale runs="[[runs]]" out-color-scale="{{scale}}"></tf-color-scale>
-        <div class="container">
-          <template is="dom-repeat" items="[[runs]]">
-            <div class="row">
-              <div class="circle" style$=[[_makeStyle(item)]]></div>
-              <span>[[item]]</span>
-            </div>
-          </template>
-        </div>
-        <style>
-          .circle {
-            width: 20px;
-            height: 20px;
-            border-radius: 10px;
-            display: inline-block;
-          }
-          .row {
-            height: 35px;
-            width: 200px;
-            display: inline-block;
-          }
-          .container {
-            height: 200px;
-          }
-          #button {
-            margin: 20px;
-          }
-        </style>
-      </template>
-      <script>
-        let fellowship = ["aragorn", "legolas", "gimli", "frodo", 
-                          "gandalf", "boromir", "merry", "pippin", "sam"];
-        let gems = ["garnet", "amethyst", "pearl", "and steven!"];
-        let numbers = d3.range(30).map(function(x) {return x.toString();});
-        let examples = [numbers, fellowship, gems];
-        Polymer({
-          is: "color-scale-demo",
-          properties: {
-            runs: {
-              type: Array,
-              value: examples[0],
-            },
-            i: {
-              type: Number,
-              value: 0,
-            },
-          },
-          _makeStyle: function(item) {
-            return "background-color: " + this.scale.scale(item);
-          },
-          change: function() {
-            this.i = (this.i + 1) % 3;
-            this.runs = examples[this.i];
-          },
-        });
-      </script>
-    </dom-module>
-    <color-scale-demo id="demo"></color-scale-demo>
-  </template>
-</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_color_scale/palettes.ts b/tensorflow/tensorboard/components/tf_color_scale/palettes.ts
deleted file mode 100644
index ce42a115458eb3d15bb6c3ac72cf7407f5a30afc..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_color_scale/palettes.ts
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-export const palettes = {
-  googleStandard: [
-    '#db4437',  // google red 500
-    '#ff7043',  // deep orange 400
-    '#f4b400',  // google yellow 500
-    '#0f9d58',  // google green 500
-    '#00796b',  // teal 700
-    '#00acc1',  // cyan 600
-    '#4285f4',  // google blue 500
-    '#5c6bc0',  // indigo 400
-    '#ab47bc'   // purple 400
-  ],
-  googleCool: [
-    '#9e9d24',  // lime 800
-    '#0f9d58',  // google green 500
-    '#00796b',  // teal 700
-    '#00acc1',  // cyan 600
-    '#4285f4',  // google blue 500
-    '#5c6bc0',  // indigo 400
-    '#607d8b'   // blue gray 500
-  ],
-  googleWarm: [
-    '#795548',  // brown 500
-    '#ab47bc',  // purple 400
-    '#f06292',  // pink 300
-    '#c2185b',  // pink 700
-    '#db4437',  // google red 500
-    '#ff7043',  // deep orange 400
-    '#f4b400'   // google yellow 700
-  ],
-  googleColorBlindAssist: [
-    '#ff7043',  // orange
-    '#00ACC1',  // dark cyan
-    '#AB47BC',  // bright purple
-    '#2A56C6',  // dark blue
-    '#0b8043',  // green
-    '#F7CB4D',  // yellow
-    '#c0ca33',  // lime
-    '#5e35b1',  // purple
-    '#A52714',  // red
-  ],
-  // These palettes try to be better for color differentiation.
-  // https://personal.sron.nl/~pault/
-  colorBlindAssist1:
-      ['#4477aa', '#44aaaa', '#aaaa44', '#aa7744', '#aa4455', '#aa4488'],
-  colorBlindAssist2: [
-    '#88ccee', '#44aa99', '#117733', '#999933', '#ddcc77', '#cc6677', '#882255',
-    '#aa4499'
-  ],
-  colorBlindAssist3: [
-    '#332288', '#6699cc', '#88ccee', '#44aa99', '#117733', '#999933', '#ddcc77',
-    '#cc6677', '#aa4466', '#882255', '#661100', '#aa4499'
-  ],
-  // based on this palette: http://mkweb.bcgsc.ca/biovis2012/
-  colorBlindAssist4: [
-    '#FF6DB6', '#920000', '#924900', '#DBD100', '#24FF24', '#006DDB', '#490092'
-  ],
-  mldash: [
-    '#E47EAD', '#F4640D', '#FAA300', '#F5E636', '#00A077', '#0077B8', '#00B7ED'
-  ]
-};
diff --git a/tensorflow/tensorboard/components/tf_color_scale/test/BUILD b/tensorflow/tensorboard/components/tf_color_scale/test/BUILD
deleted file mode 100644
index 331783f3c767320a1eb2736279d8c2bfc67469e1..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_color_scale/test/BUILD
+++ /dev/null
@@ -1,30 +0,0 @@
-package(
-    default_testonly = True,
-    default_visibility = ["//tensorflow/tensorboard:internal"],
-)
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "test",
-    srcs = [
-        "colorScaleTests.ts",
-        "tests.html",
-    ],
-    path = "/tf-color-scale/test",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_color_scale",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "//tensorflow/tensorboard/components/tf_imports:web_component_tester",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    testonly = 0,
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_color_scale/test/colorScaleTests.ts b/tensorflow/tensorboard/components/tf_color_scale/test/colorScaleTests.ts
deleted file mode 100644
index 78824a772c3e6b68a4d1fa2f63b821b202bba0c8..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_color_scale/test/colorScaleTests.ts
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-let assert = chai.assert;
-
-import {ColorScale} from '../colorScale'
-
-describe('ColorScale', function() {
-  let ccs: ColorScale;
-
-  beforeEach(function() {
-    ccs = new ColorScale();
-  });
-
-  it('Returns consistent colors', function() {
-    ccs.domain(['train', 'eval', 'test']);
-    let trainColor = ccs.scale('train');
-    let trainColor2 = ccs.scale('train');
-    assert.equal(trainColor, trainColor2);
-  });
-
-  it('Returns consistent colors after new domain', function() {
-    ccs.domain(['train', 'eval']);
-    let trainColor = ccs.scale('train');
-    ccs.domain(['train', 'eval', 'test']);
-    let trainColor2 = ccs.scale('train');
-    assert.equal(trainColor, trainColor2);
-  });
-
-  it('Throws an error if string is not in the domain', function() {
-    ccs.domain(['red', 'yellow', 'green']);
-    assert.throws(function() {
-      ccs.scale('not in domain');
-    }, 'String was not in the domain.');
-  });
-});
diff --git a/tensorflow/tensorboard/components/tf_color_scale/test/tests.html b/tensorflow/tensorboard/components/tf_color_scale/test/tests.html
deleted file mode 100644
index 59c802d02bf954a4493499dd75e0e5f67d3dee91..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_color_scale/test/tests.html
+++ /dev/null
@@ -1,24 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<meta charset="utf-8">
-<script src="../../web-component-tester/browser.js"></script>
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../tf-color-scale.html">
-<body>
-<script src="colorScaleTests.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/BUILD b/tensorflow/tensorboard/components/tf_dashboard_common/BUILD
deleted file mode 100644
index 7471da3144a461ca2f62caf8087eae44741e0e18..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/BUILD
+++ /dev/null
@@ -1,107 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_dashboard_common",
-    srcs = [
-        "dashboard-behavior.ts",
-        "dashboard-style.html",
-        "reload-behavior.ts",
-        "run-color-style.html",
-        "scrollbar-style.html",
-        "tensorboard-color.html",
-        "tf-categorizer.html",
-        "tf-categorizer.ts",
-        "tf-chart-scaffold.html",
-        "tf-collapsable-pane.html",
-        "tf-dashboard.html",
-        "tf-dashboard-layout.html",
-        "tf-downloader.html",
-        "tf-multi-checkbox.html",
-        "tf-multi-checkbox.ts",
-        "tf-no-data-warning.html",
-        "tf-option-selector.html",
-        "tf-panes-helper.html",
-        "tf-regex-group.html",
-        "tf-regex-group.ts",
-        "tf-run-selector.html",
-        "tf-sidebar-helper.html",
-    ],
-    path = "/tf-dashboard-common",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "//tensorflow/tensorboard/components/tf_storage",
-        "//tensorflow/tensorboard/components/vz_sorting",
-        "@org_polymer_iron_ajax",
-        "@org_polymer_iron_collapse",
-        "@org_polymer_iron_icons",
-        "@org_polymer_paper_button",
-        "@org_polymer_paper_checkbox",
-        "@org_polymer_paper_dialog",
-        "@org_polymer_paper_dropdown_menu",
-        "@org_polymer_paper_icon_button",
-        "@org_polymer_paper_input",
-        "@org_polymer_paper_item",
-        "@org_polymer_paper_menu",
-        "@org_polymer_paper_slider",
-        "@org_polymer_paper_spinner",
-        "@org_polymer_paper_styles",
-        "@org_polymer_paper_toggle_button",
-    ],
-)
-
-ts_web_library(
-    name = "demo",
-    srcs = [
-        "tf-categorizer-demo.html",
-        "tf-collapsable-pane-demo.html",
-        "tf-multi-checkbox-demo.html",
-        "tf-regex-group-demo.html",
-    ],
-    path = "/tf-dashboard-common",
-    deps = [
-        ":tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_color_scale",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [":tf_dashboard_common"],
-    destdir = "tf-dashboard-common",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports_google:lib",
-        "//tensorflow/tensorboard/components/tf_storage:legacy",
-        "//tensorflow/tensorboard/components/vz_sorting:legacy",
-        "//third_party/javascript/polymer/v1/iron-ajax:lib",
-        "//third_party/javascript/polymer/v1/iron-collapse:lib",
-        "//third_party/javascript/polymer/v1/iron-icons:lib",
-        "//third_party/javascript/polymer/v1/paper-button:lib",
-        "//third_party/javascript/polymer/v1/paper-checkbox:lib",
-        "//third_party/javascript/polymer/v1/paper-dialog:lib",
-        "//third_party/javascript/polymer/v1/paper-dropdown-menu:lib",
-        "//third_party/javascript/polymer/v1/paper-icon-button:lib",
-        "//third_party/javascript/polymer/v1/paper-input:lib",
-        "//third_party/javascript/polymer/v1/paper-item:lib",
-        "//third_party/javascript/polymer/v1/paper-menu:lib",
-        "//third_party/javascript/polymer/v1/paper-slider:lib",
-        "//third_party/javascript/polymer/v1/paper-spinner:lib",
-        "//third_party/javascript/polymer/v1/paper-styles:lib",
-        "//third_party/javascript/polymer/v1/paper-toggle-button:lib",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/dashboard-behavior.ts b/tensorflow/tensorboard/components/tf_dashboard_common/dashboard-behavior.ts
deleted file mode 100644
index aa063c74220d0aeb5facdb4c009e68e905d5e58f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/dashboard-behavior.ts
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/**
- * A behavior that TensorBoard dashboards must implement. This behavior serves
- * the purpose of an interface.
- *
- * @polymerBehavior
- */
-export function DashboardBehavior(dashboardName) {
-  return {
-    properties: {
-      name: {
-        type: String,
-        value: dashboardName,
-        readOnly: true,
-      },
-    },
-    // This method is called when the dashboard reloads, either when the
-    // dashboard is first visited, periodically reloaded, or manually reloaded
-    // via the user clicking the button. Note that dashboard custom elements
-    // that use TF.Dashboard.ReloadBehavior already implement a reload method.
-    reload() {
-      throw Error(
-          'The ' + dashboardName + ' dashboard does not implement reload.');
-    },
-  };
-}
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/dashboard-style.html b/tensorflow/tensorboard/components/tf_dashboard_common/dashboard-style.html
deleted file mode 100644
index 6629e5bfc2284770da8559145c88e451ae063a77..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/dashboard-style.html
+++ /dev/null
@@ -1,53 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../paper-styles/paper-styles.html">
-<link rel="import" href="tensorboard-color.html">
-
-<dom-module id="dashboard-style">
-  <template>
-    <style>
-      .sidebar {
-        display: flex;
-        flex-direction: column;
-        height: 100%;
-        margin-right: 20px;
-      }
-
-      .sidebar-section {
-        border-top: solid 1px rgba(0, 0, 0, 0.12);
-        padding: 15px 0px 15px 30px;
-      }
-
-      .sidebar-section:first-child {
-        border: none;
-      }
-
-      .sidebar-section:last-child {
-        flex-grow: 1;
-        display: flex;
-      }
-
-      paper-checkbox {
-        --paper-checkbox-checked-color: var(--tb-ui-dark-accent);
-        --paper-checkbox-unchecked-color: var(--tb-ui-dark-accent);
-        font-size: 14px;
-        margin-top: 5px;
-      }
-    </style>
-  </template>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/reload-behavior.ts b/tensorflow/tensorboard/components/tf_dashboard_common/reload-behavior.ts
deleted file mode 100644
index 61fe0c07812c1407e29a8e36d984f70b6445f2ac..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/reload-behavior.ts
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/**
- * ReloadBehavior: A simple behavior for dashboards where the
- * frontendReload() function should find every child element with a
- * given tag name (e.g. "tf-line-chart" or "tf-image-loader")
- * and call a `reload` method on that child.
- * May later extend it so it has more sophisticated logic, e.g. reloading
- * only tags that are in view.
- *
- * @polymerBehavior
- */
-export function ReloadBehavior(tagName) {
-  return {
-    properties: {
-      reloadTag: {
-        type: String,
-        value: tagName,
-      },
-    },
-    frontendReload: function() {
-      var elements = this.getElementsByTagName(this.reloadTag);
-      Array.prototype.forEach.call(elements, function(x) {
-        x.reload();
-      });
-    },
-  };
-}
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/run-color-style.html b/tensorflow/tensorboard/components/tf_dashboard_common/run-color-style.html
deleted file mode 100644
index b15861694f57c1d801fe6d2c4cf3e5cb2410a611..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/run-color-style.html
+++ /dev/null
@@ -1,79 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../paper-styles/paper-styles.html">
-
-<dom-module id="run-color-style">
-  <template>
-    <style>
-    [color-class="light-blue"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-light-blue-500);
-      --paper-checkbox-checked-ink-color: var(--paper-light-blue-500);
-      --paper-checkbox-unchecked-color: var(--paper-light-blue-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-light-blue-900);
-    }
-    [color-class="red"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-red-500);
-      --paper-checkbox-checked-ink-color: var(--paper-red-500);
-      --paper-checkbox-unchecked-color: var(--paper-red-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-red-900);
-    }
-    [color-class="green"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-green-500);
-      --paper-checkbox-checked-ink-color: var(--paper-green-500);
-      --paper-checkbox-unchecked-color: var(--paper-green-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-green-900);
-    }
-    [color-class="purple"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-purple-500);
-      --paper-checkbox-checked-ink-color: var(--paper-purple-500);
-      --paper-checkbox-unchecked-color: var(--paper-purple-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-purple-900);
-    }
-    [color-class="teal"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-teal-500);
-      --paper-checkbox-checked-ink-color: var(--paper-teal-500);
-      --paper-checkbox-unchecked-color: var(--paper-teal-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-teal-900);
-    }
-    [color-class="pink"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-pink-500);
-      --paper-checkbox-checked-ink-color: var(--paper-pink-500);
-      --paper-checkbox-unchecked-color: var(--paper-pink-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-pink-900);
-    }
-    [color-class="orange"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-orange-500);
-      --paper-checkbox-checked-ink-color: var(--paper-orange-500);
-      --paper-checkbox-unchecked-color: var(--paper-orange-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-orange-900);
-    }
-    [color-class="brown"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-brown-500);
-      --paper-checkbox-checked-ink-color: var(--paper-brown-500);
-      --paper-checkbox-unchecked-color: var(--paper-brown-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-brown-900);
-    }
-    [color-class="indigo"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-indigo-500);
-      --paper-checkbox-checked-ink-color: var(--paper-indigo-500);
-      --paper-checkbox-unchecked-color: var(--paper-indigo-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-indigo-900);
-    }
-    </style>
-  </template>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/scrollbar-style.html b/tensorflow/tensorboard/components/tf_dashboard_common/scrollbar-style.html
deleted file mode 100644
index bfd61f66191df29521ecb3958f3bc9cccd57821e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/scrollbar-style.html
+++ /dev/null
@@ -1,46 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-styles/paper-styles.html">
-
-<dom-module id="scrollbar-style">
-  <template>
-    <style>
-      .scrollbar::-webkit-scrollbar-track
-      {
-        visibility: hidden;
-      }
-
-      .scrollbar::-webkit-scrollbar
-      {
-        width: 10px;
-      }
-
-      .scrollbar::-webkit-scrollbar-thumb
-      {
-        border-radius: 10px;
-        -webkit-box-shadow: inset 0 0 2px rgba(0,0,0,.3);
-        background-color: var(--paper-grey-500);
-        color: var(--paper-grey-900);
-      }
-      .scrollbar {
-        box-sizing: border-box;
-      }
-    </style>
-  </template>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/test/BUILD b/tensorflow/tensorboard/components/tf_dashboard_common/test/BUILD
deleted file mode 100644
index ef7a1562c65bd4eaf1a60a19cbefde4deb060ada..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/test/BUILD
+++ /dev/null
@@ -1,30 +0,0 @@
-package(
-    default_testonly = True,
-    default_visibility = ["//tensorflow/tensorboard:internal"],
-)
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "test",
-    srcs = [
-        "tests.html",
-        "tf-categorizer-tests.ts",
-    ],
-    path = "/tf-dashboard-common/test",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "//tensorflow/tensorboard/components/tf_imports:web_component_tester",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    testonly = 0,
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/test/tests.html b/tensorflow/tensorboard/components/tf_dashboard_common/test/tests.html
deleted file mode 100644
index c9ad14730f00f52c6d3268c3752e71e651f4111c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/test/tests.html
+++ /dev/null
@@ -1,24 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<meta charset="utf-8">
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<script src="../../web-component-tester/browser.js"></script>
-<link rel="import" href="../tf-categorizer.html">
-<body>
-<script src="tf-categorizer-tests.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/test/tf-categorizer-tests.ts b/tensorflow/tensorboard/components/tf_dashboard_common/test/tf-categorizer-tests.ts
deleted file mode 100644
index a786f39b4fb6f6c9560916e8ab863af8503780b9..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/test/tf-categorizer-tests.ts
+++ /dev/null
@@ -1,144 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import * as cat from '../tf-categorizer';
-
-let assert = chai.assert;
-
-describe('categorizer', () => {
-  describe('topLevelNamespaceCategorizer', () => {
-    it('returns empty array on empty tags', () => {
-      assert.lengthOf(cat.topLevelNamespaceCategorizer([]), 0);
-    });
-
-    it('handles a simple case', () => {
-      let simple = [
-        'foo1/bar', 'foo1/zod', 'foo2/bar', 'foo2/zod', 'gosh/lod/mar',
-        'gosh/lod/ned'
-      ];
-      let expected = [
-        {name: 'foo1', tags: ['foo1/bar', 'foo1/zod']},
-        {name: 'foo2', tags: ['foo2/bar', 'foo2/zod']},
-        {name: 'gosh', tags: ['gosh/lod/mar', 'gosh/lod/ned']},
-      ];
-      assert.deepEqual(cat.topLevelNamespaceCategorizer(simple), expected);
-    });
-
-    it('orders the categories', () => {
-      let test = ['e', 'f', 'g', 'a', 'b', 'c'];
-      let expected = [
-        {name: 'a', tags: ['a']},
-        {name: 'b', tags: ['b']},
-        {name: 'c', tags: ['c']},
-        {name: 'e', tags: ['e']},
-        {name: 'f', tags: ['f']},
-        {name: 'g', tags: ['g']},
-      ];
-      assert.deepEqual(cat.topLevelNamespaceCategorizer(test), expected);
-    });
-
-    it('handles cases where category names overlap node names', () => {
-      let test = ['a', 'a/a', 'a/b', 'a/c', 'b', 'b/a'];
-      const actual = cat.topLevelNamespaceCategorizer(test);
-      let expected = [
-        {name: 'a', tags: ['a', 'a/a', 'a/b', 'a/c']},
-        {name: 'b', tags: ['b', 'b/a']},
-      ];
-      assert.deepEqual(actual, expected);
-    });
-
-    it('handles singleton case', () => {
-      assert.deepEqual(
-          cat.topLevelNamespaceCategorizer(['a']), [{name: 'a', tags: ['a']}]);
-    });
-  });
-
-  describe('customCategorizer', () => {
-    function noFallbackCategorizer(tags: string[]): cat.Category[] {
-      return [];
-    }
-
-    function testCategorizer(
-        defs: string[], fallback: cat.Categorizer,
-        tags: string[]): cat.Category[] {
-      const catDefs = defs.map(cat.defineCategory);
-      return cat._categorizer(catDefs, fallback)(tags);
-    }
-
-    it('categorizes by regular expression', () => {
-      let defs = ['foo..', 'bar..'];
-      let tags = ['fooab', 'fooxa', 'barts', 'barms'];
-      const actual = testCategorizer(defs, noFallbackCategorizer, tags);
-      let expected = [
-        {name: 'foo..', tags: ['fooab', 'fooxa']},
-        {name: 'bar..', tags: ['barms', 'barts']},
-      ];
-      assert.deepEqual(actual, expected);
-    });
-
-    it('matches non-exclusively', () => {
-      let tags = ['abc', 'bar', 'zod'];
-      const actual =
-          testCategorizer(['...', 'bar'], noFallbackCategorizer, tags);
-      let expected = [
-        {name: '...', tags: ['abc', 'bar', 'zod']},
-        {name: 'bar', tags: ['bar']},
-      ];
-      assert.deepEqual(actual, expected);
-    });
-
-    it('creates categories for unmatched rules', () => {
-      const actual =
-          testCategorizer(['a', 'b', 'c'], noFallbackCategorizer, []);
-      let expected = [
-        {name: 'a', tags: []},
-        {name: 'b', tags: []},
-        {name: 'c', tags: []},
-      ];
-      assert.deepEqual(actual, expected);
-    });
-
-    it('category regexs work with special characters', () => {
-      let defs = ['^\\w+$', '^\\d+$', '^\\/..$'];
-      let tags = ['foo', '3243', '/xa'];
-      const actual = testCategorizer(defs, noFallbackCategorizer, tags);
-      let expected = [
-        {name: '^\\w+$', tags: ['3243', 'foo']},
-        {name: '^\\d+$', tags: ['3243']},
-        {name: '^\\/..$', tags: ['/xa']},
-      ];
-      assert.deepEqual(actual, expected);
-    });
-
-    it('category tags are sorted', () => {
-      let tags = ['a', 'z', 'c', 'd', 'e', 'x', 'f', 'y', 'g'];
-      let sorted = tags.slice().sort();
-      let expected = [{name: '.*', tags: sorted}];
-      const actual = testCategorizer(['.*'], noFallbackCategorizer, tags);
-      assert.deepEqual(actual, expected);
-    });
-
-    it('if nonexclusive: all tags passed to fallback', () => {
-      let passedToDefault = null;
-      function defaultCategorizer(tags: string[]): cat.Category[] {
-        passedToDefault = tags;
-        return [];
-      }
-      let tags = ['foo', 'bar', 'foo123'];
-      testCategorizer(['foo'], defaultCategorizer, tags);
-      assert.deepEqual(passedToDefault, tags);
-    });
-  });
-});
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-categorizer-demo.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-categorizer-demo.html
deleted file mode 100644
index 23babaaecc4d2fe1b31fa0e930a608a41c307f90..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-categorizer-demo.html
+++ /dev/null
@@ -1,106 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
- <head>
-  <link rel="import" href="tf-categorizer.html">
- </head>
- <body>
-  <style>
-  </style>
-  <dom-module id="x-demo">
-    <style>
-      .container {
-        width: 255px;
-        padding: 10px;
-        border: 1px solid var(--paper-indigo-900);
-        border-radius: 5px;
-        position: fixed;
-      }
-      :host {
-        margin: 0px;
-      }
-
-      .categories {
-        font-family: "RobotoDraft",Helvetica;
-        margin-left: 300px;
-        width: 500px;
-        border: 1px solid var(--paper-indigo-500);
-        border-radius: 5px;
-      }
-
-      .category {
-        background-color: var(--paper-indigo-50);
-        margin: 20px;
-        padding: 20px;
-        border-radius: 5px;
-      }
-
-      .cat-name {
-        font-size: 20px;
-      }
-
-      .tag {
-        border-radius: 5px;
-        padding: 5px;
-        margin: 5px;
-        background-color: var(--paper-indigo-900);
-        color: white;
-      }
-    </style>
-    <template>
-      <div class="container">
-        <tf-categorizer categories="{{categories}}" tags="[[tags]]" id="demo"></tf-categorizer>
-      </div>
-      <div class="categories">
-        <template is="dom-repeat" items="[[categories]]">
-          <div class="category">
-            <p class="cat-name">Category: <span>[[item.name]]</span></p>
-            <div class="tags-container layout horizontal wrap">
-              <template is="dom-repeat" items="[[item.tags]]">
-                <span class="tag layout vertical center-center">[[item]]</span>
-              </template>
-            </div>
-          </div>
-        </template>
-      </div>
-    </template>
-    <script>
-
-    function tagsGenerator() {
-      var tags = ["special1", "special2", "special3", "special4", "special5"];
-      ["l1", "l2", "l3", "l4", "l5"].forEach(function(l) {
-        ["foo", "bar", "baz", "boink", "zod", "specialx"].forEach(function(x) {
-          tags.push(l + "/" + x);
-        });
-      });
-      return tags;
-    }
-
-    Polymer({
-      is: "x-demo",
-      properties: {
-        tags: { type: Array, value: tagsGenerator },
-      },
-    });
-    </script>
-  </dom-module>
-
-  <x-demo id="demo"></x-demo>
- </body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-categorizer.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-categorizer.html
deleted file mode 100644
index f09eb03582d94e8755131874e89b8d6365c99ae6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-categorizer.html
+++ /dev/null
@@ -1,63 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-toggle-button/paper-toggle-button.html">
-<link rel="import" href="../tf-imports/d3.html">
-<link rel="import" href="../tf-imports/lodash.html">
-<link rel="import" href="../vz-sorting/vz-sorting.html">
-<link rel="import" href="tf-regex-group.html">
-<link rel="import" href="tensorboard-color.html">
-
-<!--
-`tf-categorizer` turns an array of tags into an array of categories
-
-The transformation from tags to categories is controlled by the user, through
-interacting with the categorizer widget.
-
-(See type signatures in categorizer.ts)
-
-Example:
-  <tf-categorizer tags="[[tags]]" categories="{{categories}}"></tf-categorizer>
-
-Public Properties:
-`tags` - Array of strings that are the tags to categorize. Should be one-way bound downward.
-`categories` - Array of Categorizer.Category objects that are generated by the Categorizer.
-  Are readOnly and notify: True. Expected to be one-way bound upward.
-
-The categorizer provides inputs for adding regular expression rules and toggling whether
-categories are exclusive.
--->
-<dom-module id="tf-categorizer">
-  <template>
-    <div class="inputs">
-      <tf-regex-group id="regexGroup" regexes="{{regexes}}"></tf-regex-group>
-    </div>
-    <style>
-      :host {
-        display: block;
-        padding-bottom: 5px;
-      }
-      paper-checkbox {
-        --paper-checkbox-checked-color: var(--paper-grey-600);
-        --paper-checkbox-unchecked-color: var(--paper-grey-600);
-        font-size: 14px;
-      }
-    </style>
-  </template>
-  <script src="tf-categorizer.js"></script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-categorizer.ts b/tensorflow/tensorboard/components/tf_dashboard_common/tf-categorizer.ts
deleted file mode 100644
index 0eaf852ff13e1a09f77bd5de53454bc54c622719..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-categorizer.ts
+++ /dev/null
@@ -1,189 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {compareTagNames} from '../vz-sorting/sorting';
-
-/**
- * This module contains methods that allow sorting tags into 'categories'.
- * A category contains a name and a list of tags.
- * The sorting strategy is defined by a 'CustomCategorization', which contains
- * 'categoryDefinitions' which are regex rules used to construct a category.
- * E.g. the regex rule 'xent' will create a category called 'xent' that
- * contains values whose tags match the regex.
- *
- * After custom categories are evaluated, the tags are sorted by a hardcoded
- * fallback categorizer, which may, for example, group tags into categories
- * based on their top namespace.
- */
-
-export interface Category {
-  // Categories that data is sorted into
-  name: string;
-  tags: string[];
-}
-
-export interface CustomCategorization {
-  // Defines a categorization strategy
-  categoryDefinitions: string[];
-  fallbackCategorizer: string;
-  /* {'TopLevelNamespaceCategorizer',
-      'LegacyUnderscoreCategorizer'} */
-}
-
-export interface Categorizer {
-  // Function that generates categories
-  (tags: string[]): Category[];
-}
-
-/* Canonical TensorFlow ops are namespaced using forward slashes.
- * This fallback categorizer categorizes by the top-level namespace.
- */
-export var topLevelNamespaceCategorizer: Categorizer = splitCategorizer(/\//);
-
-export function fallbackCategorizer(s: string): Categorizer {
-  switch (s) {
-    case 'TopLevelNamespaceCategorizer':
-      return topLevelNamespaceCategorizer;
-    default:
-      throw new Error('Unrecognized categorization strategy: ' + s);
-  }
-}
-
-/* An 'extractor' is a function that takes a tag name, and 'extracts' a
- * category name.
- * This function takes an extractor, and produces a categorizer.
- * Currently, it is just used for the fallbackCategorizer, but we may want to
- * refactor the general categorization logic to use the concept of extractors.
- */
-function extractorToCategorizer(extractor: (s: string) => string): Categorizer {
-  return (tags: string[]): Category[] => {
-    if (tags.length === 0) {
-      return [];
-    }
-
-    // Maps between top-level name and category. We use the mapping to avoid
-    // duplicating categories per run.
-    const categoryMapping: {[key: string]: Category} = {};
-
-    tags.forEach((t: string) => {
-      const topLevel = extractor(t);
-      if (!categoryMapping[topLevel]) {
-        const newCategory = {
-          name: topLevel,
-          tags: [],
-        };
-        categoryMapping[topLevel] = newCategory;
-      }
-
-      categoryMapping[topLevel].tags.push(t);
-    });
-
-    // Sort categories into alphabetical order.
-    const categories =
-        _.map(_.keys(categoryMapping).sort(), key => categoryMapping[key]);
-    _.forEach(categories, (category) => {
-      // Sort the tags within each category.
-      category.tags.sort(compareTagNames);
-    });
-    return categories;
-  };
-}
-
-function splitCategorizer(r: RegExp): Categorizer {
-  let extractor = (t: string) => {
-    return t.split(r)[0];
-  };
-  return extractorToCategorizer(extractor);
-}
-
-export interface CategoryDefinition {
-  name: string;
-  matches: (t: string) => boolean;
-}
-
-export function defineCategory(ruledef: string): CategoryDefinition {
-  let r = new RegExp(ruledef);
-  let f = function(tag: string): boolean {
-    return r.test(tag);
-  };
-  return {name: ruledef, matches: f};
-}
-
-export function _categorizer(
-    rules: CategoryDefinition[], fallback: Categorizer) {
-  return function(tags: string[]): Category[] {
-    let remaining: d3.Set = d3.set(tags);
-    let userSpecified = rules.map((def: CategoryDefinition) => {
-      let tags: string[] = [];
-      remaining.each((t: string) => {
-        if (def.matches(t)) {
-          tags.push(t);
-        }
-      });
-      let cat = {name: def.name, tags: tags.sort(compareTagNames)};
-      return cat;
-    });
-    let defaultCategories = fallback(remaining.values());
-    return userSpecified.concat(defaultCategories);
-  };
-}
-
-export function categorizer(s: CustomCategorization): Categorizer {
-  let rules = s.categoryDefinitions.map(defineCategory);
-  let fallback = fallbackCategorizer(s.fallbackCategorizer);
-  return _categorizer(rules, fallback);
-};
-
-Polymer({
-  is: 'tf-categorizer',
-  properties: {
-    regexes: {type: Array},
-    tags: {type: Array},
-    categoriesAreExclusive: {type: Boolean, value: true},
-    fallbackCategorizer: {
-      type: String,
-      value: 'TopLevelNamespaceCategorizer',
-    },
-    categorizer: {
-      type: Object,
-      computed:
-          'computeCategorization(regexes.*, categoriesAreExclusive, fallbackCategorizer)',
-    },
-    categories: {
-      type: Array,
-      value: function() {
-        return [];
-      },
-      notify: true,
-      readOnly: true
-    },
-  },
-  observers: ['recategorize(tags.*, categorizer)'],
-  computeCategorization: function(
-      regexes, categoriesAreExclusive, fallbackCategorizer) {
-    var categorizationStrategy = {
-      categoryDefinitions: regexes.base,
-      categoriesAreExclusive: categoriesAreExclusive,
-      fallbackCategorizer: fallbackCategorizer,
-    };
-    return categorizer(categorizationStrategy);
-  },
-  recategorize: function() {
-    this.debounce('tf-categorizer-recategorize', function() {
-      var categories = this.categorizer(this.tags);
-      this._setCategories(categories);
-    })
-  },
-});
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-chart-scaffold.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-chart-scaffold.html
deleted file mode 100644
index a39fb9462baf952688bef35372c2c3a70d1b1894..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-chart-scaffold.html
+++ /dev/null
@@ -1,152 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-
-<!--
-tf-chart-scaffold is responsible for providing data from TensorBoard to charts.
-It has the following settable properties:
-tag: (required, string) - the name of the tag to load for this chart
-visibleSeries: (required, string[]) - the names of the series the chart should
-    display.
-dataProvider: (required, VZ.ChartHelpers.DataFn) - function that takes (tag,
-    run) and returns a promise containing an array of VZ.ChartHelpers.Datum,
-    compatible with TF.Backend.Datum.
-
-It exposes the following methods:
-chart() - Returns the underlying chart element.
-reload() - Reloads the data and sends it to the underlying chart.
-
-This element should have a compatible chart plugin element as it's content. The
-plugin is required to implement two functions:
-- setVisibleSeries(names: string[]): a function that receives an array of series
-    names as the first parameter, responsible for changing the series currently
-    being displayed to only the series in this array.
-- setSeriesData(name: string, data: VZ.ChartHelpers.Datum[]): sets the data of
-    the series with the given name to the data given in the second parameter.
--->
-<dom-module id="tf-chart-scaffold">
-  <template>
-    <content></content>
-    <style>
-      :host {
-        -webkit-user-select: none;
-        -moz-user-select: none;
-        display: flex;
-        flex-direction: column;
-        flex-grow: 1;
-        flex-shrink: 1;
-        position: relative;
-      }
-    </style>
-  </template>
-  <script>
-    "use strict";
-
-    Polymer({
-      is: "tf-chart-scaffold",
-      properties: {
-        tag: String,
-        dataProvider: Function,
-        visibleSeries: Array,
-        _attached: {
-          type: Boolean,
-          value: false
-        },
-
-        // Storing the update ID of the previous request for data enables us to determine if a
-        // data response is outdated. We rely on an increasing ID instead of timestamp because
-        // successive updates often fire within the same millisecond.
-        _dataUpdateIdOfLastRequest: Number,
-        _nextAvailableDataUpdateId: {
-          type: Number,
-          value: 1,
-        },
-      },
-      observers: [
-        "reload(tag, dataProvider)",
-        "_changeSeries(visibleSeries.*)"
-      ],
-      ready: function() {
-        this.fire('ready');
-      },
-      attached: function() {
-        this._attached = true;
-        this._changeSeries();
-      },
-      detached: function() {
-        this._attached = false;
-      },
-      reload: function() {
-        if (!this._attached) {
-          return;
-        }
-        else if (!this.dataProvider) {
-          throw new Error('tf-chart-scaffold requires a dataProvider.');
-        }
-        else if (!this.tag) {
-          throw new Error('tf-chart-scaffold requires a tag.');
-        }
-
-        // TODO(chizeng): At this point, notify effective children that the previous data has been
-        // invalidated. For instance, the image dashboard may want to clear its images. Today, the
-        // chart scaffold only informs children when the new image URLs response finishes loading.
-
-        const dataUpdateId = this._nextAvailableDataUpdateId++;
-        this._dataUpdateIdOfLastRequest = dataUpdateId;
-
-        this.visibleSeries.forEach(function(name) {
-          this.dataProvider(this.tag, name).then(function(data) {
-            if (dataUpdateId != this._dataUpdateIdOfLastRequest) {
-              // This response is outdated. Ignore it.
-              // TODO(chizeng): Explore canceling an outdated request before we even receive its
-              // response. This involves creating hooks into the request manager and might introduce
-              // some complexity that may not be worth it; Tensorboard frankly does not seem
-              // bottlenecked by the network (It is often run in fast corp networks or locally.).
-              return;
-            }
-            this.chart().setSeriesData(name, data);
-          }.bind(this));
-        }.bind(this));
-      },
-      _changeSeries: function() {
-        if (!this._attached) {
-           return;
-        }
-        else if (!this.visibleSeries) {
-          throw new Error('tf-chart-scaffold requires a visibleSeries.');
-        }
-
-        this.chart().setVisibleSeries(this.visibleSeries);
-        this.reload();
-      },
-      chart: function() {
-        var children = this.getEffectiveChildren();
-        if (!children.length) {
-          throw new Error('tf-chart-scaffold has no children');
-        }
-
-        var child = children[0];
-        if (!child.setVisibleSeries || !child.setSeriesData) {
-          throw new Error("tf-chart-scaffold's content doesn't implement the " +
-              "required interface");
-        }
-        return child;
-      }
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-collapsable-pane-demo.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-collapsable-pane-demo.html
deleted file mode 100644
index efa990b11cfa45bc3396a65d33e3e07161dac80c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-collapsable-pane-demo.html
+++ /dev/null
@@ -1,31 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
- <head>
-   <link rel="import" href="tf-collapsable-pane.html">
-   
- </head>
- <body>
-  <style>
-  </style>
-  <tf-collapsable-pane name="foo">
-    <h1>This is content inside the pane.</h1>
-  </tf-collapsable-pane>
- </body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-collapsable-pane.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-collapsable-pane.html
deleted file mode 100644
index e82540127fa5c765cde178dcc1d17014854990d2..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-collapsable-pane.html
+++ /dev/null
@@ -1,109 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../iron-collapse/iron-collapse.html">
-
-<dom-module id="tf-collapsable-pane">
-  <template>
-    <button
-      class="heading"
-      on-tap="togglePane"
-      open-button$="[[opened]]"
-    >
-    <span class="name">[[name]]</span>
-    <span class="count">
-      <span>[[count]]</span>
-    </span>
-  </button>
-    <iron-collapse opened="[[opened]]">
-      <div class="content">
-        <template is="dom-if" if="[[opened]]" restamp="[[restamp]]">
-          <content></content>
-        </template>
-      </div>
-    </iron-collapse>
-    <style>
-      :host {
-        display: block;
-        margin: 0 5px 1px 10px;
-      }
-
-      :host:first-of-type {
-        margin-top: 20px;
-      }
-      
-      :host:last-of-type {
-        margin-bottom: 20px;
-      }
-
-      .heading {
-        background-color: white;
-        border: none;
-        cursor: pointer;
-        width: 100%;
-        font-size: 15px;
-        line-height: 1;
-        box-shadow: 0 1px 5px rgba(0,0,0,0.2);
-        padding: 10px 15px;
-      }
-
-      .content {
-        padding: 15px;
-        border: 1px solid #dedede;
-        border-top: none;
-        border-bottom-left-radius: 2px;
-        border-bottom-right-radius: 2px;
-        background: white;
-      }
-
-      [open-button] {
-        border-bottom-left-radius: 0px !important;
-        border-bottom-right-radius: 0px !important;
-      }
-
-      .name {
-        float: left;
-      }
-
-      .count {
-        float: right;
-        margin-right: 5px;
-        font-size: 12px;
-        color: var(--paper-grey-500);
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-collapsable-pane",
-      properties: {
-        opened: {type: Boolean, value: false},
-        restamp: {type: Boolean, value: true},
-        name: {type: String, observer: "hide"},
-        count: {type: Number},
-      },
-      hide: function() {
-        this.opened = false;
-      },
-      togglePane: function() {
-        this.opened = !this.opened;
-      }
-    });
-  </script>
-
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-dashboard-layout.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-dashboard-layout.html
deleted file mode 100644
index e0e8a2b52c38965b78e254cf1c6c0bf4b5c0d4b3..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-dashboard-layout.html
+++ /dev/null
@@ -1,67 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="scrollbar-style.html">
-<link rel="import" href="tensorboard-color.html">
-
-<!--
-Generic layout for a dashboard.
--->
-<dom-module id="tf-dashboard-layout">
-  <template>
-    <div id="sidebar">
-      <content select=".sidebar"></content>
-    </div>
-
-    <div id="center" class="scrollbar">
-      <content select=".center"></content>
-    </div>
-    <style include="scrollbar-style"></style>
-    <style>
-      #sidebar {
-        width: inherit;
-        height: 100%;
-        overflow: ellipsis;
-        flex-grow: 0;
-        flex-shrink: 0;
-      }
-
-      #center {
-        height: 100%;
-        overflow-y: auto;
-        flex-grow: 1;
-        flex-shrink: 1;
-      }
-
-      .tf-graph-dashboard #center {
-        background: white;
-      }
-
-      :host {
-        display: flex;
-        flex-direction: row;
-        height: 100%;
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-dashboard-layout",
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-dashboard.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-dashboard.html
deleted file mode 100644
index 9e2f6b9589b3648a07899758285d03bef2aa8a9f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-dashboard.html
+++ /dev/null
@@ -1,26 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="tf-dashboard-layout.html">
-<link rel="import" href="tensorboard-color.html">
-<link rel="import" href="dashboard-style.html">
-<link rel="import" href="tf-downloader.html">
-<link rel="import" href="tf-no-data-warning.html">
-
-<script src="dashboard-behavior.js"></script>
-<script src="reload-behavior.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-downloader.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-downloader.html
deleted file mode 100644
index 719142595984e2e529c2b569098efbe5258e6906..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-downloader.html
+++ /dev/null
@@ -1,99 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-dropdown-menu/paper-dropdown-menu.html">
-<link rel="import" href="../paper-menu/paper-menu.html">
-<link rel="import" href="../paper-item/paper-item.html">
-
-<dom-module id="tf-downloader">
-  <template>
-    <paper-dropdown-menu
-      no-label-float="true"
-      label="run to download"
-      selected-item-label="{{_run}}"
-    >
-      <paper-menu class="dropdown-content">
-        <template is="dom-repeat" items="[[runs]]">
-          <paper-item no-label-float=true>[[item]]</paper-item>
-        </template>
-      </paper-menu>
-    </paper-dropdown-menu>
-    <div class="center">
-      <span>
-        <a
-          download="[[_csvName(_run)]]"
-          href="[[_csvUrl(_run, urlFn)]]"
-          >CSV</a>
-        <a
-          download="[[_jsonName(_run)]]"
-          href="[[_jsonUrl(_run, urlFn)]]"
-          >JSON</a>
-      </span>
-    </div>
-    <style>
-      :host {
-        display: flex;
-        height: 32px;
-      }
-      .center {
-        display: flex;
-        align-self: center;
-      }
-      paper-dropdown-menu {
-        width: 100px;
-        --paper-input-container-label: {
-          font-size: 10px;
-        }
-        --paper-input-container-input: {
-          font-size: 10px;
-        }
-      }
-      a {
-        font-size: 10px;
-        border-radius: 3px;
-        border: 1px solid #EEE;
-      }
-      paper-input {
-        font-size: 22px;
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-downloader",
-      properties: {
-        _run: String,
-        runs: Array,
-        tag: String,
-        urlFn: Function,
-      },
-      _csvUrl: function(_run, urlFn) {
-        return urlFn(this.tag, _run) + "&format=csv";
-      },
-      _jsonUrl: function(_run, urlFn) {
-        return urlFn(this.tag, _run);
-      },
-      _csvName: function(_run) {
-        return "run_" + _run + ",tag_" + this.tag + ".csv";
-      },
-      _jsonName: function(_run) {
-        return "run-" + _run + "-tag-" + this.tag + ".json";
-      },
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-multi-checkbox-demo.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-multi-checkbox-demo.html
deleted file mode 100644
index d0f5aa6f27d7cf5351c5c50fc3be693ce1bd39d4..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-multi-checkbox-demo.html
+++ /dev/null
@@ -1,176 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-<head>
-<link rel="import" href="../tf-color-scale/tf-color-scale.html">
-<link rel="import" href="tf-multi-checkbox.html">
-
-</head>
-<body>
-<script>
-var seed = 1;
-function random() {
-  var x = Math.sin(seed++) * 10000;
-  return x - Math.floor(x);
-}
-</script>
-<style>
-</style>
-
-<dom-module id="mc-demo">
-  <template>
-    <tf-multi-checkbox
-      id="multiCheckbox"
-      names="[[names]]"
-      tooltips="[[_tooltips]]"
-      class-scale="[[classScale]]"
-      highlights="[[highlights]]"
-    ></tf-multi-checkbox>
-    <tf-color-scale
-      id="colorScale"
-      runs="[[names]]"
-      out-class-scale="{{classScale}}"
-    ></tf-color-scale>
-  <style>
-  </style>
-  </template>
-  <script>
-
-  function randomTooltip() {
-    var s = "";
-    while (random() < 0.8) {
-      s += String(10*random())[0];
-    }
-    return s;
-  }
-  Polymer({
-    is: "mc-demo",
-    properties: {
-      names: Array,
-      tooltips: Object,
-      autoGenerateTooltips: {value: true},
-      _tooltips: Object,
-      classScale: Function,
-      highlights: Array,
-    },
-    observers: [
-      'autogenerate(names, autoGenerateTooltips)',
-      'randomHighlights(names)'
-    ],
-    autogenerate: function(names, autoGenerateTooltips) {
-      if (autoGenerateTooltips) {
-        var tooltips = {};
-        names.forEach(function(n) {
-        if (random() > 0.5) {
-          tooltips[n] = randomTooltip();
-        }
-      });
-      this._tooltips = tooltips;
-      }
-    },
-    randomHighlights: function(names) {
-      var h = [];
-      names.forEach(function(n) {
-        if (random() > 0.6) {
-          h.push(n);
-        }
-      });
-      this.highlights = h;
-    }
-  });
-  </script>
-</dom-module>
-
-<dom-module id="x-demo">
-<style>
-.small {
-  width: 200px;
-  height: 500px;
-}
-.large {
-  width: 500px;
-  height: 900px;
-}
-html,body {
-  height: 100%;
-}
-mc-demo {
-  padding: 5px;
-  border: 1px solid var(--paper-red-500);
-  display: inline-block;
-}
-</style>
-<template>
-  <div class="demo-block">
-    <mc-demo id="demo1" class="small" names="[[long_names]]"></mc-demo>
-    <mc-demo class="small" names="[[many_names]]"></mc-demo>
-    <mc-demo class="small" names="[[many_long_names]]"></mc-demo>
-  </div>
-
-  <div class="demo-block">
-    <mc-demo class="large" names="[[long_names]]"></mc-demo>
-    <mc-demo class="large" names="[[many_names]]"></mc-demo>
-    <mc-demo class="large" names="[[many_long_names]]"></mc-demo>
-  </div>
-
-</template>
-<script>
-
-function long_names() {
-  return [
-    "foo_bar very long name with spaces",
-    "the quick brown fox jumped over the lazy dog",
-    "supercalifragilisticexpialodcious/bar/foo/zod/longer/longer",
-  ];
-}
-
-function many_names() {
-  var out = [];
-  for (var i=0; i<20; i++) {
-    out.push("foo_bar-" + i);
-    out.push("bar_zod_bing-" + i);
-    out.push("lol-" + i);
-  }
-  return out;
-}
-
-function many_long_names() {
-  var out = [];
-  for (var i=0; i<20; i++) {
-    out.push("foo_bar very very very long some spaces though-" + i);
-    out.push("bar_zod_bing_bas_womp_wub_wub_dub_wub_wub-" + i);
-    out.push("rightly_to_be_great_is_not_to_stir_without_great_argument_but_greatly_to_find_quarrel_in_a_straw_when_honors_at_the_stake-" + i);
-  }
-  return out;
-}
-
-Polymer({
-  is: "x-demo",
-  properties: {
-  long_names: {type: Array, value: long_names},
-  many_names: {type: Array, value: many_names},
-  many_long_names: {type: Array, value: many_long_names},
-},
-});
-</script>
-</dom-module>
-
-<x-demo id="demo"></x-demo>
-</body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-multi-checkbox.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-multi-checkbox.html
deleted file mode 100644
index fad4642963f75790ed520ce0330c12060a34e560..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-multi-checkbox.html
+++ /dev/null
@@ -1,160 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../iron-icons/iron-icons.html">
-<link rel="import" href="../paper-checkbox/paper-checkbox.html">
-<link rel="import" href="../paper-icon-button/paper-icon-button.html">
-<link rel="import" href="../paper-input/paper-input.html">
-<link rel="import" href="../tf-storage/tf-storage.html">
-<link rel="import" href="../tf-imports/lodash.html">
-<link rel="import" href="scrollbar-style.html">
-<link rel="import" href="run-color-style.html">
-
-<!--
-tf-multi-checkbox creates a list of checkboxes that can be used to toggle on or off
-a large number of values. Each checkbox displays a name, and may also have an
-associated tooltip value. Checkboxes can be highlighted, hidden, and re-ordered.
-
-tf-multi-checkbox assumes that the names may be very long compared to the width
-of the checkbox, and the number of names may also be very large, and works to
-handle these situations gracefully.
--->
-<dom-module id="tf-multi-checkbox">
-  <style include="scrollbar-style"></style>
-  <style include="run-color-style"></style>
-
-  <template>
-      <paper-input
-        id="runs-regex"
-        no-label-float
-        label="Write a regex to filter runs"
-        value="[[regexInput]]"
-        on-bind-value-changed="_debouncedRegexChange"
-      ></paper-input>
-    <div id="outer-container" class="scrollbar">
-      <template
-        is="dom-repeat"
-        items="[[namesMatchingRegex]]"
-      >
-        <div
-          class="run-row"
-        >
-          <div class="icon-container checkbox-container vertical-align-container">
-            <paper-checkbox
-              class="checkbox vertical-align-center"
-              name="[[item]]"
-              checked$="[[_isChecked(item, runSelectionState.*)]]"
-              on-change="_checkboxChange"
-            ></paper-checkbox>
-
-          </div>
-          <div class="icon-container isolator-container vertical-align-container">
-            <paper-icon-button
-              icon="radio-button-unchecked"
-              class="isolator vertical-align-center"
-              on-tap="_isolateRun"
-              name="[[item]]"
-            ></paper-icon-button>
-          </div>
-          <div class="item-label-container">
-            <span>[[item]]</span>
-          </div>
-        </div>
-      </template>
-    </div>
-  <style>
-    paper-input {
-      --paper-input-container-focus-color: var(--tb-orange-strong);
-      --paper-input-container-input: {
-        font-size: 14px;
-      };
-      --paper-input-container-label: {
-        font-size: 14px;
-      };
-    }
-    :host {
-      display: flex;
-      flex-direction: column;
-      height: 100%;
-    }
-    #outer-container {
-      overflow-y: auto;
-      overflow-x: hidden;
-      width: 100%;
-      height: 0; /* Quirk to make firefox add scrolling instead of expand div */
-      flex-grow: 1;
-      flex-shrink: 1;
-      word-wrap: break-word;
-    }
-    .run-row {
-      padding-top: 5px;
-      padding-bottom: 5px;
-      display: flex;
-      flex-direction: row;
-      font-size: 13px;
-    }
-    .icon-container {
-      flex-grow: 0;
-      flex-shrink: 0;
-      padding-left: 2px;
-    }
-    .checkbox {
-      padding-left: 2px;
-      width: 18px;
-      height: 18px;
-    }
-    .isolator {
-      width: 18px;
-      height: 18px;
-      padding: 0px;
-    }
-    .isolator-container {
-      padding-left: 6px;
-      padding-right: 3px;
-    }
-    .checkbox-container {
-      padding-left: 2px;
-    }
-    .item-label-container {
-      padding-left: 5px;
-      flex-grow: 1;
-      flex-shrink: 1;
-      width: 0px; /* hack to get the flex-grow to work properly */
-    }
-    .tooltip-value-container {
-      display: flex;
-      justify-content: center;
-      flex-grow: 0;
-      flex-shrink: 0;
-      text-align:right;
-      padding-left: 2px;
-    }
-    .vertical-align-container {
-      display: flex;
-      justify-content: center;
-    }
-    .vertical-align-container .vertical-align-center {
-      align-self: center;
-    }
-    .vertical-align-container .vertical-align-top {
-      align-self: start;
-    }
-  </style>
-  </template>
-  <script src="tf-multi-checkbox.js"></script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-multi-checkbox.ts b/tensorflow/tensorboard/components/tf_dashboard_common/tf-multi-checkbox.ts
deleted file mode 100644
index 4b38d82b14edee7c5da0dc5c19a95fbe6b42a5ab..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-multi-checkbox.ts
+++ /dev/null
@@ -1,205 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import * as storage from '../tf-storage/storage';
-
-Polymer({
-  is: 'tf-multi-checkbox',
-  properties: {
-    names: {
-      type: Array,
-      value: function() {
-        return [];
-      },
-    },  // All the runs in consideration
-    regexInput: {
-      type: String,
-      value: storage.getStringInitializer('regexInput', ''),
-      observer: '_regexInputObserver',
-    },  // Regex for filtering the runs
-    regex: {type: Object, computed: '_makeRegex(regexInput)'},
-    namesMatchingRegex: {
-      type: Array,
-      computed: 'computeNamesMatchingRegex(names.*, regex)'
-    },  // Runs that match the regex
-    runSelectionState: {
-      // if a run is explicitly enabled, True, if explicitly disabled, False.
-      // if undefined, default value (enable for first k runs, disable after).
-      type: Object,
-      value: storage.getObjectInitializer('runSelectionState', {}),
-      observer: '_storeRunToIsCheckedMapping',
-    },
-    // (Allows state to persist across regex filtering)
-    outSelected: {
-      type: Array,
-      notify: true,
-      computed: 'computeOutSelected(namesMatchingRegex.*, runSelectionState.*)'
-    },
-    colorScale: {
-      type: Object,
-      observer: 'synchronizeColors',
-    },  // map from run name to css class
-    maxRunsToEnableByDefault: {
-      // When TB first loads, if it has k or fewer runs, they are all enabled
-      // by default. If there are more, then they are all disabled.
-      type: Number,
-      value: 40,
-    },
-    _debouncedRegexChange: {
-      type: Object,
-      // Updating the regex can be slow, because it involves updating styles
-      // on a large number of Polymer paper-checkboxes. We don't want to do
-      // this while the user is typing, as it may make a bad, laggy UI.
-      // So we debounce the updates that come from user typing.
-      value: function() {
-        const _this = this;
-        var debounced = _.debounce(function(r) {
-          _this.regexInput = r;
-        }, 150, {leading: false});
-        return function() {
-          var r = this.$$('#runs-regex').value;
-          if (r == '') {
-            // If the user cleared the field, they may be done typing, so
-            // update more quickly.
-            this.async(function() {
-              _this.regexInput = r;
-            }, 30);
-          } else {
-            debounced(r);
-          };
-        };
-      },
-    },
-  },
-  listeners: {
-    'dom-change': 'synchronizeColors',
-  },
-  observers: [
-    '_setIsolatorIcon(runSelectionState, names)',
-  ],
-  _storeRunToIsCheckedMapping:
-      storage.getObjectObserver('runSelectionState', {}),
-  _makeRegex: function(regex) {
-    try {
-      return new RegExp(regex)
-    } catch (e) {
-      return null;
-    }
-  },
-  _setIsolatorIcon: function() {
-    var runMap = this.runSelectionState;
-    var numChecked = _.filter(_.values(runMap)).length;
-    var buttons =
-        Array.prototype.slice.call(this.querySelectorAll('.isolator'));
-
-    buttons.forEach(function(b) {
-      if (numChecked === 1 && runMap[b.name]) {
-        b.icon = 'radio-button-checked';
-      } else {
-        b.icon = 'radio-button-unchecked';
-      }
-    });
-  },
-  computeNamesMatchingRegex: function(__, ___) {
-    var regex = this.regex;
-    return this.names.filter(function(n) {
-      return regex == null || regex.test(n);
-    });
-  },
-  computeOutSelected: function(__, ___) {
-    var runSelectionState = this.runSelectionState;
-    var num = this.maxRunsToEnableByDefault;
-    var allEnabled = this.namesMatchingRegex.length <= num;
-    return this.namesMatchingRegex.filter(function(n, i) {
-      return runSelectionState[n] == null ? allEnabled : runSelectionState[n];
-    });
-  },
-  synchronizeColors: function(e) {
-    if (!this.colorScale) return;
-
-    this._setIsolatorIcon();
-
-    var checkboxes =
-        Array.prototype.slice.call(this.querySelectorAll('paper-checkbox'));
-    var scale = this.colorScale;
-    checkboxes.forEach(function(p) {
-      var color = scale.scale(p.name);
-      p.customStyle['--paper-checkbox-checked-color'] = color;
-      p.customStyle['--paper-checkbox-checked-ink-color'] = color;
-      p.customStyle['--paper-checkbox-unchecked-color'] = color;
-      p.customStyle['--paper-checkbox-unchecked-ink-color'] = color;
-    });
-    var buttons =
-        Array.prototype.slice.call(this.querySelectorAll('.isolator'));
-    buttons.forEach(function(p) {
-      var color = scale.scale(p.name);
-      p.style['color'] = color;
-    });
-    // The updateStyles call fails silently if the browser doesn't have focus,
-    // e.g. if TensorBoard was opened into a new tab that isn't visible.
-    // So we wait for requestAnimationFrame.
-    var _this = this;
-    window.requestAnimationFrame(function() {
-      _this.updateStyles();
-    });
-  },
-  _isolateRun: function(e) {
-    // If user clicks on the label for one run, enable it and disable all other
-    // runs.
-
-    var name = (Polymer.dom(e) as any).localTarget.name;
-    var selectionState = {};
-    this.names.forEach(function(n) {
-      selectionState[n] = n == name;
-    });
-    this.runSelectionState = selectionState;
-  },
-  _checkboxChange: function(e) {
-    var target = (Polymer.dom(e) as any).localTarget;
-    this.runSelectionState[target.name] = target.checked;
-    // n.b. notifyPath won't work because run names may have periods.
-    this.runSelectionState = _.clone(this.runSelectionState);
-  },
-  _isChecked: function(item, outSelectedChange) {
-    return this.outSelected.indexOf(item) != -1;
-  },
-  _regexInputObserver: storage.getStringObserver('regexInput', ''),
-  toggleAll: function() {
-    var _this = this;
-    var anyToggledOn = this.namesMatchingRegex.some(function(n) {
-      return _this.runSelectionState[n]
-    });
-
-
-    var runSelectionStateIsDefault =
-        Object.keys(this.runSelectionState).length == 0;
-
-    var defaultOff =
-        this.namesMatchingRegex.length > this.maxRunsToEnableByDefault;
-    // We have runs toggled either if some were explicitly toggled on, or if
-    // we are in the default state, and there are few enough that we default
-    // to toggling on.
-    anyToggledOn = anyToggledOn || runSelectionStateIsDefault && !defaultOff;
-
-    // If any are toggled on, we turn everything off. Or, if none are toggled
-    // on, we turn everything on.
-
-    var newRunsDisabled = {};
-    this.names.forEach(function(n) {
-      newRunsDisabled[n] = !anyToggledOn;
-    });
-    this.runSelectionState = newRunsDisabled;
-  },
-});
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-no-data-warning.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-no-data-warning.html
deleted file mode 100644
index c90efac1d6b58debc6a39ae4ffafaeb3fb093da1..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-no-data-warning.html
+++ /dev/null
@@ -1,129 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-
-<!--
-Display a warning when there is no data found.
--->
-<dom-module id="tf-no-data-warning">
-  <template>
-    <template is="dom-if" if="[[showWarning]]">
-      <div class="warning">
-        <template is="dom-if" if="[[_isGraph(dataType)]]">
-          <h3>
-            No graph definition files were found.
-          </h3>
-          <p>
-            To store a graph, create a
-            <code>tf.summary.FileWriter</code>
-            and pass the graph either via the constructor, or by calling its
-            <code>add_graph()</code> method.
-            You may want to check out the
-            <a href="https://www.tensorflow.org/get_started/graph_viz">
-              graph visualizer tutorial
-            </a>.
-          </p>
-        </template>
-        <template is="dom-if" if="[[_isProjector(dataType)]]">
-          <h3>
-            No checkpoint was found.
-          </h3>
-          <p>
-            Probable causes:
-            <ul>
-              <li>
-                No checkpoint has been saved yet. Please refresh the page periodically.
-              </li>
-              <li>
-                You are not saving any checkpoint. To save your model,
-                create a
-                <a href="https://www.tensorflow.org/api_docs/python/tf/train/Saver">
-                  <code>tf.train.Saver</code>
-                </a>
-                and save your model periodically
-                by calling <code>saver.save(session, LOG_DIR/model.ckpt, step)</code>.
-              </li>
-            </ul>
-          </p>
-        </template>
-        <template is="dom-if" if="[[_isOther(dataType)]]">
-          <h3>
-            No <span>[[dataType]]</span> data was found.
-          </h3>
-          <p>
-            Probable causes:
-            <ul>
-              <li>
-                You haven't written any <span>[[dataType]]</span> data
-                to your event files.
-              </li>
-              <li>
-                TensorBoard can't find your event files.
-              </li>
-            </ul>
-          </p>
-        </template>
-        <p>
-          If you're new to using TensorBoard, and want to find out how to add
-          data and set up your event files, check out the
-          <a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/README.md">
-            README
-          </a>
-          and perhaps the
-          <a href="https://www.tensorflow.org/get_started/summaries_and_tensorboard">
-            TensorBoard tutorial
-          </a>.
-        </p>
-
-        <p>
-          If you think TensorBoard is configured properly, please see the
-          <a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/README.md#my-tensorboard-isnt-showing-any-data-whats-wrong">
-            section of the README devoted to missing data problems
-          </a>
-          and consider filing an issue on GitHub.
-        </p>
-
-      </div>
-    </template>
-    <style>
-      .warning {
-        max-width: 540px;
-        margin: 80px auto 0 auto;
-      }
-    </style>
-  </template>
-
-  <script>
-    Polymer({
-      is: "tf-no-data-warning",
-      properties: {
-        dataType: String,
-        showWarning: Boolean
-      },
-      _isGraph: function(dataType) {
-        return dataType === "graph";
-      },
-      _isProjector: function(dataType) {
-        return dataType === "projector";
-      },
-      _isOther: function(dataType) {
-        return !this._isGraph(dataType) && !this._isProjector(dataType);
-      }
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-option-selector.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-option-selector.html
deleted file mode 100644
index 547a558ad0b5da9305d88d2d678302be1f928f8b..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-option-selector.html
+++ /dev/null
@@ -1,94 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="tensorboard-color.html">
-
-<!--
-tf-option-selector is a simple component that has buttons as content and
-provides a "selectedId" property that is one of the IDs of the buttons inside it.
--->
-<dom-module id="tf-option-selector">
-  <template>
-    <div id="wrap">
-      <h3>[[name]]</h3>
-      <div class="content-wrapper"><content></content></div>
-    </div>
-    <style>
-      .content-wrapper ::content > * {
-        width: 30%;
-        font-size: 13px;
-        background: none;
-        margin-top: 10px;
-        color: var(--tb-ui-dark-accent);
-      }
-
-      .content-wrapper ::content :first-of-type {
-        margin-left: 0;
-      }
-
-      .content-wrapper ::content .selected {
-        background-color: var(--tb-ui-dark-accent);
-        color: white!important;
-      }
-
-      h3 {
-        color: var(--paper-grey-800);
-        margin: 0;
-        font-weight: normal;
-        font-size: 14px;
-        margin-bottom: 5px;
-        display: block;
-        pointer-events: none;
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-option-selector",
-      properties: {
-        name: String,
-        selectedId: {
-          type: String,
-          notify: true,
-          observer: '_selectedIdChanged'
-        }
-      },
-      attached: function() {
-        this.async(function() {
-          this.getEffectiveChildren().forEach(function(node) {
-            this.listen(node, 'tap', '_selectTarget');
-          }.bind(this));
-        });
-      },
-      _selectTarget: function(e) {
-        this.selectedId = e.currentTarget.id;
-      },
-      _selectedIdChanged: function() {
-        var selected = this.queryEffectiveChildren('#' + this.selectedId);
-        if (!selected) {
-          return;
-        }
-
-        this.getEffectiveChildren().forEach(function(node) {
-          node.classList.remove("selected");
-        });
-        selected.classList.add("selected");
-      }
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-panes-helper.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-panes-helper.html
deleted file mode 100644
index 155259d3294bd1caf5cc59f91c56f304d12091a0..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-panes-helper.html
+++ /dev/null
@@ -1,352 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="tf-collapsable-pane.html">
-<link rel="import" href="tf-no-data-warning.html">
-<link rel="import" href="tf-chart-scaffold.html">
-
-<!--
-tf-panes-helper is a component that renders the contents of TensorBoard pages.
-It renders a tf-collapsable-pane for each category. Inside each category, the
-provided content template is rendered repeatedly for each tag within that
-category.
-
-This helper also incorporates an expand button and data download utility for
-each card.
-
-To use it, just specify a template inside tf-panes-helper that contains the
-code that will be replicated for each tag.
-
-<tf-panes-helper
-  categories="[[categories]]"
-  data-type="type"
-  data-provider="[[provider]]"
-  run2tag="[[run2tag]]"
-  selected-runs="[[selectedRuns]]"
-  >
-  <template>
-    <Code instantiated for each card>
-  </template>
-</tf-panes-helper>
-
-If you want for the template to be replicated for each tag and run, not only for
-each tag, you can set the repeatForRuns property to true.
-
-You can also set the showDownloadLinks property, which will show a menu with
-options to download JSON and CSV data. For this, you must also set the
-downloadLinkUrlFunction property to an appropriate value.
-
-@element tf-panes-helper
--->
-<dom-module id="tf-panes-helper">
-  <template>
-    <content></content> <!-- User template will be put here -->
-    <tf-no-data-warning
-      data-type="[[dataType]]"
-      show-warning="[[dataNotFound]]"
-      ></tf-no-data-warning>
-
-    <template is="dom-repeat" items="[[categories]]" as="category">
-      <tf-collapsable-pane
-        name="[[category.name]]"
-        count="[[_count(category.tags, selectedRuns.*)]]"
-        >
-        <div class="layout horizontal wrap">
-          <template is="dom-repeat" items="[[_categoryCards(category, selectedRuns.*, run2tag.*)]]">
-              <div class="card">
-                <div class="card-title-container" style="border-color: [[_titleBorderColor(item.run)]]">
-                  <div class="card-title" inner-h-t-m-l="[[_break(item.tag)]]"></div>
-                  <template is="dom-if" if="[[repeatForRuns]]">
-                    <div class="card-subtitle" title="[[item.run]]">[[item.run]]</div>
-                  </template>
-                </div>
-                <div class="card-content">
-                  <tf-chart-scaffold
-                    tag="[[item.tag]]"
-                    data-provider="[[dataProvider]]"
-                    visible-series="[[item.runs]]"
-                    on-ready="_instantiateTemplate"
-                    >
-                    <!-- Instantiated template will be put here -->
-                  </tf-chart-scaffold>
-                </div>
-                <div class="card-bottom-row">
-                  <paper-icon-button
-                    class="expand-button"
-                    icon="fullscreen"
-                    on-tap="_toggleExpanded"
-                    ></paper-icon-button>
-                  <template is="dom-if" if="[[showDownloadLinks]]">
-                    <tf-downloader
-                      runs="[[item.runs]]"
-                      tag="[[item.tag]]"
-                      url-fn="[[downloadLinkUrlFunction]]"
-                      >
-                    </tf-downloader>
-                  </template>
-                </div>
-              </div>
-          </template>
-        </div>
-      </tf-collapsable-pane>
-    </template>
-
-    <style>
-      .card {
-        height: var(--card-height, 200px);
-        width: var(--card-width, 300px);
-        display: flex;
-        flex-direction: column;
-        margin: 5px;
-        padding: var(--card-padding, 0 30px 35px 0);
-        -webkit-user-select: none;
-        -moz-user-select: none;
-        position: relative;
-      }
-
-      .card-expanded {
-        height: var(--card-expanded-height, 400px);
-        width: var(--card-expanded-width, 100%);
-      }
-
-      .card-title, .card-subtitle {
-        flex-grow: 0;
-        flex-shrink: 0;
-        font-size: 14px;
-        text-overflow: ellipsis;
-        overflow: hidden;
-      }
-
-      .card-subtitle {
-        font-size: 12px;
-      }
-
-      .card-content {
-        flex-grow: 1;
-        flex-shrink: 1;
-        display: flex;
-        margin-top: 10px;
-      }
-
-      .card-bottom-row {
-        position: absolute;
-        left: 0px;
-        bottom: 0px;
-        width: 100%;
-        display: flex;
-        flex-direction: row;
-        justify-content: space-between;
-        pointer-events: none;
-      }
-
-      .card-title-container {
-        border-left: 4px solid;
-        padding-left: 5px;
-      }
-
-      .expand-button {
-        color: #2196F3;
-        width: 32px;
-        height: 32px;
-        padding: 4px;
-        border-radius: 100%;
-        pointer-events: auto;
-        display: var(--show-expand-button, block);
-      }
-
-      .card-expanded .expand-button {
-        background: var(--tb-ui-light-accent);
-      }
-
-      tf-downloader {
-        margin-right: 30px;
-        pointer-events: auto;
-      }
-
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-panes-helper",
-      properties: {
-        /**
-         * Categories that separate the template instances. Each category will
-         * be given its own collapsible pane. The category must be an array of
-         * objects, each with a 'name' property and a 'tags' array of strings.
-         */
-        categories: Array,
-
-        /**
-         * Input of the colors that are used for the user's runs.
-         */
-        colorScale: Object,
-
-        /**
-         * The name of the data type that is used by this dashboard. This will
-         * be used to display what is missing when there is no data available.
-         */
-        dataType: String,
-
-        /**
-         * The function that requests and returns a promise with the data of the
-         * required type for the templates from the backend.
-         */
-        dataProvider: Object,
-
-        /**
-         * If false, instantiates one template for each tag and calls
-         * setVisibleSeries on the first element of the template with all valid
-         * runs the tag has. If true, instantiates one template for each run of
-         * each tag, and calls setVisibleSeries of the first element of the
-         * instantiated template with just the one run.
-         */
-        repeatForRuns: {
-          type: Boolean,
-          value: false
-        },
-
-        /**
-         * Map from runs to the valid tags that have them.
-         */
-        run2tag: Object,
-
-        /**
-         * Array with the runs that are selected by the user (i.e. valid to be
-         * displayed).
-         */
-        selectedRuns: Array,
-
-        /**
-         * If true, shows a menu with download links for the template data.
-         * If this is set to true, urlFn must also be provided.
-         */
-        showDownloadLinks: Boolean,
-
-        /**
-         * Function that returns the route to get data to download. Must be
-         * provided if showDownloadLinks is enabled.
-         */
-        downloadLinkUrlFunction: Function,
-        _contentTemplate: {
-          type: Object,
-          value: null
-        },
-        _stampedTemplates: {
-          type: Array,
-          value: function() { return [] }
-        }
-      },
-      behaviors: [
-        Polymer.Templatizer,
-      ],
-
-      /**
-       * Initializes the Polymer.Templatizer behavior with the template supplied
-       * by the user. With this, all calls to this.stamp() will produce an
-       * instance of the user template.
-       */
-      _initTemplatizer: function() {
-        if (!this._contentTemplate) {
-          // First template is used as the content.
-          this._contentTemplate = Polymer.dom(this).querySelector('template');
-          this.templatize(this._contentTemplate);
-        }
-      },
-
-      /**
-       * Called every time a tf-chart-scaffold is ready, stamps the user
-       * template inside the scaffold element (before it is attached) and
-       * stores the stamped template in an array to use for data binding
-       * (forwardParentProp/Path).
-       */
-      _instantiateTemplate: function(e) {
-        var scaffold = e.target;
-        this._initTemplatizer();
-        var instance = this.stamp();
-        this._stampedTemplates.push(instance);
-        Polymer.dom(scaffold).appendChild(instance.root);
-      },
-      _toggleExpanded: function(e) {
-        var currentTarget = Polymer.dom(e.currentTarget);
-        var card = currentTarget.node.closest('.card');
-        var scaffold = card.querySelector('tf-chart-scaffold');
-        card.classList.toggle('card-expanded');
-        scaffold.chart().redraw();
-      },
-      _count: function(tags) {
-        if (!this.repeatForRuns) {
-          return tags.length;
-        }
-
-        var targetTags = d3.set(tags);
-        var count = 0;
-        this.selectedRuns.forEach(function(r) {
-          this.run2tag[r].forEach(function(t) {
-            if (targetTags.has(t)) {
-              count++;
-            }
-          });
-        }.bind(this));
-        return count;
-      },
-      _categoryCards: function(category) {
-        var cards = [];
-        category.tags.forEach(function(tag) {
-          var runs = this.selectedRuns.filter(function(r) {
-            return this.run2tag[r] && this.run2tag[r].indexOf(tag) !== -1;
-          }.bind(this));
-
-          if (this.repeatForRuns) {
-            runs.forEach(function(run) {
-              cards.push({tag: tag, run: run, runs: [run]});
-            });
-          } else {
-            cards.push({tag: tag, runs: runs});
-          }
-        }.bind(this));
-
-        return cards;
-      },
-      _titleBorderColor: function(run) {
-        return this.repeatForRuns ? this.colorScale.scale(run) : 'white';
-      },
-
-      /*
-       * Polymer data binding forwarding functions. Check the
-       * Polymer.Templatizer documentation for more information.
-       */
-
-      _forwardParentProp: function(property, value) {
-        this._stampedTemplates.forEach(function(instance) {
-          instance[property] = value;
-        });
-      },
-      _forwardParentPath: function(path, value) {
-        this._stampedTemplates.forEach(function(instance) {
-          instance.notifyPath(path, value, true);
-        });
-      },
-      // TODO(renatoutsch): implement the instance forwarding for two-way data
-      // binding.
-      // Add breaks to input so it will wrap nicely
-      _break: function(ipt) {
-        return ipt.replace(/([\/_-])/g, "$1<wbr>")
-      },
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-regex-group-demo.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-regex-group-demo.html
deleted file mode 100644
index 3565fec17912437897ec6b3ec509d48fed10645a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-regex-group-demo.html
+++ /dev/null
@@ -1,45 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
- <head>
-   <link rel="import" href="tf-regex-group.html">
- </head>
- <body>
-  <style>
-  .container {
-    width: 255px;
-    padding: 10px;
-    border: 1px solid #3f51b5;
-    border-radius: 5px;
-  }
-  :host {
-    margin: 0px;
-  }
-  </style>
-  <template id="page-template" is="dom-bind">
-    <div class="container">
-      <tf-regex-group regexes="{{regexes}}" id="demo"></tf-regex-group>
-    </div>
-    <p> Regexes:</p>
-    <template is="dom-repeat" items="[[regexes]]">
-      <p>"<span>[[item]]</span>"</p>
-    </template>
-  </template>
- </body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-regex-group.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-regex-group.html
deleted file mode 100644
index c1d3cf06aeadc0aa6c30d24d82ae5aa9ae65a566..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-regex-group.html
+++ /dev/null
@@ -1,99 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-icon-button/paper-icon-button.html">
-<link rel="import" href="../iron-icons/iron-icons.html">
-<link rel="import" href="../paper-toggle-button/paper-toggle-button.html">
-<link rel="import" href="../paper-input/paper-input.html">
-<link rel="import" href="../tf-storage/tf-storage.html">
-
-<!--
-`tf-regex-group` provides an input component for a group of regular expressions.
-
-Example:
-  <tf-regex-group regexes="{{regexes}}"></tf-regex-group>
-
-It contains a series of regular expression input fields. From this, it computes
-`regexes', an array in which every element is either a string representing a
-valid, nonempty regular expression, or the value `null`
-
-Public Properties:
-`regexes` a readonly, notifying array of strings, where each string is a regex
-
-It maintains an invariant that the final regex should always be an empty string,
-so the user can easily add more regular expressions. It does this by adding
-a new empty regex when the final one is nonempty.
-
-Pressing "enter" moves focus to the next regex (or just blurs if there are no
-more regexes).
--->
-<dom-module id="tf-regex-group">
-  <template>
-    <div class="regex-list">
-      <template is="dom-repeat" items="{{rawRegexes}}">
-        <div class="regex-line">
-          <paper-input
-            id="text-input"
-            class="regex-input"
-            label="Write a regex to create a tag group"
-            no-label-float
-            value="{{item.regex}}"
-            invalid="[[!item.valid]]"
-            on-keyup="moveFocus"
-          ></paper-input>
-          <paper-icon-button
-            icon="close"
-            class="delete-button"
-            aria-label="Delete Regex"
-            tabindex="0"
-            on-tap="deleteRegex"
-          ></paper-icon-button>
-        </div>
-        <style>
-          .regex-input {
-            width: 250px;
-            display: inline-block;
-            margin-left: -3px;
-          }
-
-          .delete-button {
-            color: var(--paper-grey-700);
-            width: 40px;
-            height: 40px;
-            margin-right: -10px;
-          }
-
-          .regex-list {
-            margin-bottom: 10px;
-          }
-
-          paper-input {
-            --paper-input-container-focus-color: var(--tb-orange-strong);
-            --paper-input-container-input: {
-              font-size: 14px;
-            };
-            --paper-input-container-label: {
-              font-size: 14px;
-            };
-          }
-        </style>
-      </template>
-    </div>
-  </template>
-  <script src="tf-regex-group.js"></script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-regex-group.ts b/tensorflow/tensorboard/components/tf_dashboard_common/tf-regex-group.ts
deleted file mode 100644
index 92a0eb6a0b9d0738369ff89356e3c49336e2fb27..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-regex-group.ts
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import * as storage from '../tf-storage/storage';
-
-Polymer({
-  is: 'tf-regex-group',
-  properties: {
-    rawRegexes: {
-      type: Array,
-      value: storage.getObjectInitializer(
-          'rawRegexes', [{regex: '', valid: true}]),
-    },
-    regexes:
-        {type: Array, computed: 'usableRegexes(rawRegexes.*)', notify: true},
-  },
-  observers: [
-    'addNewRegexIfNeeded(rawRegexes.*)',
-    'checkValidity(rawRegexes.*)',
-    '_uriStoreRegexes(rawRegexes.*)',
-  ],
-  _uriStoreRegexes:
-      storage.getObjectObserver('rawRegexes', [{regex: '', valid: true}]),
-  checkValidity: function(x) {
-    var match = x.path.match(/rawRegexes\.(\d+)\.regex/);
-    if (match) {
-      var idx = match[1];
-      this.set('rawRegexes.' + idx + '.valid', this.isValid(x.value));
-    }
-  },
-  isValid: function(s) {
-    try {
-      new RegExp(s);
-      return true;
-    } catch (e) {
-      return false;
-    }
-  },
-  usableRegexes: function(regexes) {
-    var isValid = this.isValid;
-    return regexes.base
-        .filter(function(r) {
-          // Checking validity here (rather than using the data property)
-          // is necessary because otherwise we might send invalid regexes due
-          // to the fact that this function can call before the observer does
-          return r.regex !== '' && isValid(r.regex);
-        })
-        .map(function(r) {
-          return r.regex;
-        });
-  },
-  addNewRegexIfNeeded: function() {
-    var last = this.rawRegexes[this.rawRegexes.length - 1];
-    if (last.regex !== '') {
-      this.push('rawRegexes', {regex: '', valid: true});
-    }
-  },
-  deleteRegex: function(e) {
-    if (this.rawRegexes.length > 1) {
-      this.splice('rawRegexes', e.model.index, 1);
-    }
-  },
-  moveFocus: function(e) {
-    if (e.keyCode === 13) {
-      var idx = e.model.index;
-      var inputs = Polymer.dom(this.root).querySelectorAll('.regex-input');
-      if (idx < this.rawRegexes.length - 1) {
-        (inputs[idx + 1] as any).$.input.focus();
-      } else {
-        (document.activeElement as HTMLElement).blur();
-      }
-    }
-  }
-});
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-run-selector.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-run-selector.html
deleted file mode 100644
index e3d8a91fd0c2e64650ebbac0fcb6448ffadc9f52..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-run-selector.html
+++ /dev/null
@@ -1,188 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-button/paper-button.html">
-<link rel="import" href="../paper-dialog/paper-dialog.html">
-<link rel="import" href="tf-multi-checkbox.html">
-<link rel="import" href="scrollbar-style.html">
-
-<!--
-tf-run-selector creates a set of checkboxes to display which runs are selected.
-It also displays tooltips.
-
-Properties in:
-- runs: Array of strings representing the runs that may be selected
-- colorScale: a TF.ColorScale mapping run names to colors
-
-Properties out:
-- outSelected: The array of run names that are currently checked by the user.
-
--->
-<dom-module id="tf-run-selector">
-  <template>
-    <paper-dialog with-backdrop id="logdir-dialog">
-      <h2>logdir</h2>
-      <div inner-h-t-m-l="{{_breakString(logdir)}}"></div>
-    </paper-dialog>
-    <div id="top-text">
-      <h3 id="tooltip-help" class="tooltip-container">
-        Runs
-      </h3>
-    </div>
-    <tf-multi-checkbox
-      id="multiCheckbox"
-      names="[[runs]]"
-      out-selected="{{outSelected}}"
-      color-scale="[[colorScale]]"
-    ></tf-multi-checkbox>
-    <paper-button
-      class="x-button"
-      id="toggle-all"
-      on-tap="_toggleAll"
-    >
-    Toggle All Runs
-    </paper-button>
-    <template
-      is="dom-if"
-      if="[[logdir]]">
-      <div id="logdir">
-        <span id="clipped-logdir" inner-h-t-m-l="[[_clippedLogdir]]"></span><!--
-          We use HTML comments to remove spaces before the ellipsis.
-        --><template
-                     is="dom-if"
-                     if="[[_shouldShowExpandLogdirButton(logdir, _logdirClipLength)]]"><!--
-          --><a href="" on-click="_openLogdirDialog">…</a>
-        </template>
-      </div>
-    </template>
-    <style>
-      :host {
-        display: flex;
-        flex-direction: column;
-        padding-bottom: 10px;
-        box-sizing: border-box;
-      }
-      #top-text {
-        width: 100%;
-        flex-grow: 0;
-        flex-shrink: 0;
-        padding-right: 16px;
-        box-sizing: border-box;
-        color: var(--paper-grey-800);
-      }
-      tf-multi-checkbox {
-        display: flex;
-        flex-grow: 1;
-        flex-shrink: 1;
-      }
-      .x-button {
-        font-size: 13px;
-        background-color: var(--tb-ui-light-accent);
-        color: var(--tb-ui-dark-accent);
-      }
-      #tooltip-help {
-        color: var(--paper-grey-800);
-        margin: 0;
-        font-weight: normal;
-        font-size: 14px;
-        margin-bottom: 5px;
-      }
-      paper-button {
-        margin-left: 0;
-      }
-      #logdir {
-        color: var(--tb-ui-dark-accent);
-        font-size: 13px;
-        margin: 5px 0 0 0;
-        max-width: 288px;
-      }
-    </style>
-  </template>
-  <script>
-  Polymer({
-    is: "tf-run-selector",
-    properties: {
-      backend: Object,
-      outSelected: {type: Array, notify: true},
-      // runs: an array of strings, representing the run names that may be chosen
-      runs: Array,
-      colorScale: Object, // TF.ColorScale
-      logdir: {
-        type: String,
-        notify: true,
-      },
-      // This is the potentially clipped portion of the logdir we show at the bottom of the sidebar.
-      _clippedLogdir: {
-        type: String,
-      },
-      _logdirClipLength: {
-        type: Number,
-        value: 250,
-        readOnly: true,
-      },
-    },
-    observers: [
-      "_onBackendUpdate(backend)",
-      "_logdirSet(logdir)",
-    ],
-    _toggleAll: function() {
-      this.$.multiCheckbox.toggleAll();
-    },
-    // Break the string at natural points, including commas, equals, and slashes
-    _breakString: function(originalString) {
-      return originalString.replace(/([\/=\-_,])/g, "$1<wbr>");
-    },
-    _onBackendUpdate: function(backend) {
-      if (backend === undefined) {
-        return;
-      }
-
-      // When the backend is set, the selector can request the logdir.
-      backend.logdir().then(logdirObject => {
-        this.set('logdir', logdirObject.logdir);
-      }).catch(e => {
-        // Fetching the logdir failed. Prevent the exception from logging to
-        // console. The console already logs a 404 network event.
-      });
-    },
-    _logdirSet: function(logdir) {
-      if (logdir === undefined) {
-        // The logdir has not been set yet.
-        return;
-      }
-
-      var lineBrokenText;
-      if (logdir.length > this._logdirClipLength) {
-        // Clip the logdir to avoid blocking the runs selector. Let the user view a more full
-        // version of the logdir.
-        lineBrokenText = this._breakString(logdir.substring(0, this._logdirClipLength));
-      } else {
-        lineBrokenText = this._breakString(logdir);
-      }
-      this.set('_clippedLogdir', lineBrokenText);
-    },
-    _openLogdirDialog: function(event) {
-      event.preventDefault();
-      this.$$('#logdir-dialog').open();
-    },
-    _shouldShowExpandLogdirButton(logdir, _logdirClipLength) {
-      return logdir && logdir.length > _logdirClipLength;
-    },
-  });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-sidebar-helper.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-sidebar-helper.html
deleted file mode 100644
index 5eb8537040ccef6e8fa76f31c80b85dea795dfdd..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-sidebar-helper.html
+++ /dev/null
@@ -1,165 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-imports/lodash.html">
-<link rel="import" href="tf-categorizer.html">
-<link rel="import" href="tf-run-selector.html">
-
-<!--
-tf-sidebar-helper is a component that renders a sidebar for configuration
-components, like the tf-categorizer and the tf-run-selector. The component can
-also be extended with more options useful to the dashboards.
-
-To use it, create the tf-sidebar-helper with the required properties. To extend
-it with extra configuration components, add them to the element's component:
-
-<tf-sidebar-helper
-  backend: "[[backend]]",
-  categories: "{{outputCategories}}",
-  colorScale: "[[colorScale]]",
-  run2tag: "[[run2tag]]",
-  runs: "[[runs]]",
-  selectedRuns: "{{outSelectedRuns}}",
-  >
-  <div class="extend-first-section">
-    <my options>
-  </div>
-  <div class="sidebar-section">
-    <my options>
-  </div>
-  ...
-</tf-sidebar-helper>
-
-Elements inside the .extend-first-section div will be put on the first section
-of the sidebar, while the rest of the divs will be put after it and before the
-tf-run-selector.
-
-@element tf-sidebar-helper
--->
-<dom-module id="tf-sidebar-helper">
-  <template>
-    <div class="sidebar-section">
-      <tf-categorizer
-        id="categorizer"
-        tags="[[tags]]"
-        categories="{{categories}}"
-        ></tf-categorizer>
-      <content select=".extend-first-section"></content>
-    </div>
-    <content></content>
-    <div class="sidebar-section">
-      <tf-run-selector
-        id="runSelector"
-        backend="[[backend]]"
-        runs="[[runs]]"
-        color-scale="[[colorScale]]"
-        out-selected="{{selectedRuns}}"
-        ></tf-run-selector>
-    </div>
-    <style include="dashboard-style"></style>
-    <style>
-      :host {
-        display: flex;
-        flex-direction: column;
-        height: 100%;
-      }
-
-      #categorizer {
-        flex-shrink: 0;
-      }
-
-      #runSelector {
-        flex-shrink: 1;
-        flex-grow: 1;
-      }
-
-      .sidebar-section {
-        border-top: solid 1px rgba(0, 0, 0, 0.12);
-        padding: 20px 0px 20px 30px;
-      }
-
-      .sidebar-section:first-child {
-        border: none;
-      }
-
-      .sidebar-section:last-child {
-        flex-grow: 1;
-        display: flex;
-      }
-
-      paper-checkbox {
-        --paper-checkbox-checked-color: var(--tb-ui-dark-accent);
-        --paper-checkbox-unchecked-color: var(--tb-ui-dark-accent);
-        font-size: 14px;
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-sidebar-helper",
-      properties: {
-        /**
-         * The backend object used to issue requests.
-         */
-        backend: Object,
-
-        /**
-         * This is an output of the categories that the user selected to
-         * separate the different tags. Each category here should be given its
-         * own collapsible pane.
-         */
-        categories: {
-          type: Array,
-          notify: true,
-        },
-
-        /**
-         * Input of the colors that are used for the user's runs.
-         */
-        colorScale: Object,
-
-        /**
-         * Map from runs to the valid tags that have them.
-         */
-        run2tag: Object,
-
-        /**
-         * Input of all valid runs that can be selected by the user.
-         */
-        runs: Array,
-
-        /**
-         * Outputs an array with the runs that are selected by the user (i.e.
-         * valid to be displayed).
-         */
-        selectedRuns: {
-          type: Array,
-          notify: true,
-        },
-
-        tags: {
-          type: Array,
-          computed: "_getTags(run2tag.*)"
-        },
-      },
-      _getTags: function() {
-        return _.union.apply(null, _.values(this.run2tag));
-      },
-    })
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/BUILD b/tensorflow/tensorboard/components/tf_distribution_dashboard/BUILD
deleted file mode 100644
index 5ddd6ba5bb91328d55c0390abbe295f0b899f1b3..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_distribution_dashboard/BUILD
+++ /dev/null
@@ -1,41 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_distribution_dashboard",
-    srcs = ["tf-distribution-dashboard.html"],
-    path = "/tf-distribution-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_color_scale",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "//tensorflow/tensorboard/components/vz_distribution_chart",
-        "@org_polymer_iron_collapse",
-        "@org_polymer_paper_icon_button",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"] + glob(["data/**"]),
-    path = "/tf-distribution-dashboard",
-    deps = [
-        ":tf_distribution_dashboard",
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/data/compressedHistograms_run_run1_tag_histo1.json b/tensorflow/tensorboard/components/tf_distribution_dashboard/data/compressedHistograms_run_run1_tag_histo1.json
deleted file mode 100644
index a6765285b14c1c12692b5d9346b71a46e1b7d515..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_distribution_dashboard/data/compressedHistograms_run_run1_tag_histo1.json
+++ /dev/null
@@ -1,212 +0,0 @@
-[
-    [
-        0.0,
-        0,
-        [
-            [
-                0,
-                -2.3150592308536755
-            ],
-            [
-                668,
-                -2.0967547155036605
-            ],
-            [
-                1587,
-                -1.4326244423655616
-            ],
-            [
-                3085,
-                -0.8871306575801902
-            ],
-            [
-                5000,
-                -0.09312398815580714
-            ],
-            [
-                6915,
-                0.2584093405812282
-            ],
-            [
-                8413,
-                0.8895470642005087
-            ],
-            [
-                9332,
-                1.3198979614453679
-            ],
-            [
-                10000,
-                1.6793308878855118
-            ]
-        ]
-    ],
-    [
-        100.0,
-        10,
-        [
-            [
-                0,
-                -1.3417572789138936
-            ],
-            [
-                668,
-                -1.183563374619141
-            ],
-            [
-                1587,
-                -0.48920418783271574
-            ],
-            [
-                3085,
-                0.29326906896076954
-            ],
-            [
-                5000,
-                0.56953784145381
-            ],
-            [
-                6915,
-                0.8684655583499333
-            ],
-            [
-                8413,
-                1.4133127368907181
-            ],
-            [
-                9332,
-                1.906140650457873
-            ],
-            [
-                10000,
-                2.135771998171255
-            ]
-        ]
-    ],
-    [
-        200.0,
-        20,
-        [
-            [
-                0,
-                -1.5066917525035333
-            ],
-            [
-                668,
-                -1.3910909571770793
-            ],
-            [
-                1587,
-                -0.902737218885874
-            ],
-            [
-                3085,
-                -0.3807791904765027
-            ],
-            [
-                5000,
-                0.38900200905253046
-            ],
-            [
-                6915,
-                0.8209734209339482
-            ],
-            [
-                8413,
-                1.302385856695965
-            ],
-            [
-                9332,
-                1.9324626053521639
-            ],
-            [
-                10000,
-                2.957505317875451
-            ]
-        ]
-    ],
-    [
-        300.0,
-        30,
-        [
-            [
-                0,
-                -0.5430457051469562
-            ],
-            [
-                668,
-                -0.4626161834245273
-            ],
-            [
-                1587,
-                0.21573949543027715
-            ],
-            [
-                3085,
-                0.37353741100174215
-            ],
-            [
-                5000,
-                0.6891407881591103
-            ],
-            [
-                6915,
-                1.0927156232630852
-            ],
-            [
-                8413,
-                1.2745337159550916
-            ],
-            [
-                9332,
-                1.4321116832891605
-            ],
-            [
-                10000,
-                2.1913774993059034
-            ]
-        ]
-    ],
-    [
-        400.0,
-        40,
-        [
-            [
-                0,
-                -0.3584790755077172
-            ],
-            [
-                668,
-                -0.33301611509753215
-            ],
-            [
-                1587,
-                -0.1089466072951948
-            ],
-            [
-                3085,
-                0.5792199847585249
-            ],
-            [
-                5000,
-                1.220854943811942
-            ],
-            [
-                6915,
-                1.759829438421432
-            ],
-            [
-                8413,
-                2.3072559906741614
-            ],
-            [
-                9332,
-                2.753036118353921
-            ],
-            [
-                10000,
-                3.0267252195784047
-            ]
-        ]
-    ]
-]
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/data/compressedHistograms_run_run2_tag_histo1.json b/tensorflow/tensorboard/components/tf_distribution_dashboard/data/compressedHistograms_run_run2_tag_histo1.json
deleted file mode 100644
index 9e8a55b3f20739bb81cafd8314721c16fda09378..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_distribution_dashboard/data/compressedHistograms_run_run2_tag_histo1.json
+++ /dev/null
@@ -1,212 +0,0 @@
-[
-    [
-        0.0,
-        0,
-        [
-            [
-                0,
-                -3.6801669545044846
-            ],
-            [
-                668,
-                -3.192188140974744
-            ],
-            [
-                1587,
-                -2.3414678549368806
-            ],
-            [
-                3085,
-                -0.9632173471995873
-            ],
-            [
-                5000,
-                -0.3214892636797772
-            ],
-            [
-                6915,
-                0.11870794142185205
-            ],
-            [
-                8413,
-                0.8895470642005087
-            ],
-            [
-                9332,
-                1.183563374619141
-            ],
-            [
-                10000,
-                2.665663810418372
-            ]
-        ]
-    ],
-    [
-        100.0,
-        10,
-        [
-            [
-                0,
-                -3.564793583751807
-            ],
-            [
-                668,
-                -3.376844436865802
-            ],
-            [
-                1587,
-                -1.0366615731293798
-            ],
-            [
-                3085,
-                -0.27318696312672563
-            ],
-            [
-                5000,
-                0.9718642422053263
-            ],
-            [
-                6915,
-                2.5765662807928194
-            ],
-            [
-                8413,
-                3.1415385101545126
-            ],
-            [
-                9332,
-                4.085981768607621
-            ],
-            [
-                10000,
-                4.623079406808927
-            ]
-        ]
-    ],
-    [
-        200.0,
-        20,
-        [
-            [
-                0,
-                -2.235172510433281
-            ],
-            [
-                668,
-                -2.004569042815611
-            ],
-            [
-                1587,
-                -1.2015432383370985
-            ],
-            [
-                3085,
-                0.11835464933202625
-            ],
-            [
-                5000,
-                0.56953784145381
-            ],
-            [
-                6915,
-                1.202844810963146
-            ],
-            [
-                8413,
-                2.689066032283515
-            ],
-            [
-                9332,
-                2.8494015726499944
-            ],
-            [
-                10000,
-                3.481377676013788
-            ]
-        ]
-    ],
-    [
-        300.0,
-        30,
-        [
-            [
-                0,
-                -3.360113978269659
-            ],
-            [
-                668,
-                -2.8293185004961043
-            ],
-            [
-                1587,
-                -1.5992540502266783
-            ],
-            [
-                3085,
-                0.14393860259807117
-            ],
-            [
-                5000,
-                1.47723448201245
-            ],
-            [
-                6915,
-                1.9510057389110733
-            ],
-            [
-                8413,
-                2.833176104473626
-            ],
-            [
-                9332,
-                4.142405216576347
-            ],
-            [
-                10000,
-                4.706937777668589
-            ]
-        ]
-    ],
-    [
-        400.0,
-        40,
-        [
-            [
-                0,
-                -2.599286228987632
-            ],
-            [
-                668,
-                -2.240365897443259
-            ],
-            [
-                1587,
-                -1.5992540502266783
-            ],
-            [
-                3085,
-                -0.9101893288861387
-            ],
-            [
-                5000,
-                0.7580548669750213
-            ],
-            [
-                6915,
-                1.6009864433919474
-            ],
-            [
-                8413,
-                2.3504002974280036
-            ],
-            [
-                9332,
-                2.7907805263353733
-            ],
-            [
-                10000,
-                3.5098048900144323
-            ]
-        ]
-    ]
-]
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/data/compressedHistograms_run_run2_tag_histo2.json b/tensorflow/tensorboard/components/tf_distribution_dashboard/data/compressedHistograms_run_run2_tag_histo2.json
deleted file mode 100644
index 7c8836f6246306cbf162d4c1299d3eff075185b6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_distribution_dashboard/data/compressedHistograms_run_run2_tag_histo2.json
+++ /dev/null
@@ -1,212 +0,0 @@
-[
-    [
-        0.0,
-        0,
-        [
-            [
-                0,
-                -1.9291158122759586
-            ],
-            [
-                668,
-                -1.5970765333488954
-            ],
-            [
-                1587,
-                -1.0923120348519078
-            ],
-            [
-                3085,
-                -0.6688082872192093
-            ],
-            [
-                5000,
-                0.09312398815580714
-            ],
-            [
-                6915,
-                0.44532789251701854
-            ],
-            [
-                8413,
-                0.8238009655877649
-            ],
-            [
-                9332,
-                1.0357232383581656
-            ],
-            [
-                10000,
-                1.2741043689144438
-            ]
-        ]
-    ],
-    [
-        100.0,
-        10,
-        [
-            [
-                0,
-                -0.7780725642449806
-            ],
-            [
-                668,
-                -0.7138496178727424
-            ],
-            [
-                1587,
-                -0.5448932415735014
-            ],
-            [
-                3085,
-                -0.24370397454796228
-            ],
-            [
-                5000,
-                0.42790220995778355
-            ],
-            [
-                6915,
-                0.6191730643365096
-            ],
-            [
-                8413,
-                0.752059342118037
-            ],
-            [
-                9332,
-                1.0451472255274825
-            ],
-            [
-                10000,
-                2.5559479569222825
-            ]
-        ]
-    ],
-    [
-        200.0,
-        20,
-        [
-            [
-                0,
-                -1.3876904425996377
-            ],
-            [
-                668,
-                -1.1464188862638496
-            ],
-            [
-                1587,
-                -0.4049955219067526
-            ],
-            [
-                3085,
-                0.04721394862139682
-            ],
-            [
-                5000,
-                0.56953784145381
-            ],
-            [
-                6915,
-                1.3221859041483333
-            ],
-            [
-                8413,
-                1.6188495656305735
-            ],
-            [
-                9332,
-                1.7613953069723651
-            ],
-            [
-                10000,
-                2.3257482385477384
-            ]
-        ]
-    ],
-    [
-        300.0,
-        30,
-        [
-            [
-                0,
-                -1.600772629982185
-            ],
-            [
-                668,
-                -1.1548516185367033
-            ],
-            [
-                1587,
-                -0.260387173785447
-            ],
-            [
-                3085,
-                0.17416570914366614
-            ],
-            [
-                5000,
-                0.47069243095356195
-            ],
-            [
-                6915,
-                1.1559276581637614
-            ],
-            [
-                8413,
-                2.0474031182051404
-            ],
-            [
-                9332,
-                2.18821711651116
-            ],
-            [
-                10000,
-                2.2393193406467518
-            ]
-        ]
-    ],
-    [
-        400.0,
-        40,
-        [
-            [
-                0,
-                -0.8286852465281818
-            ],
-            [
-                668,
-                -0.7815041529866706
-            ],
-            [
-                1587,
-                -0.3334896444053469
-            ],
-            [
-                3085,
-                0.21085213041026643
-            ],
-            [
-                5000,
-                0.5177616740489182
-            ],
-            [
-                6915,
-                1.077122434649409
-            ],
-            [
-                8413,
-                1.5898009703967424
-            ],
-            [
-                9332,
-                1.8859097291499742
-            ],
-            [
-                10000,
-                2.0954239138728523
-            ]
-        ]
-    ]
-]
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/data/logdir b/tensorflow/tensorboard/components/tf_distribution_dashboard/data/logdir
deleted file mode 100644
index b6362b45d777266d6204b23884222a080f789f71..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_distribution_dashboard/data/logdir
+++ /dev/null
@@ -1 +0,0 @@
-{"logdir": "/foo/some/fake/logdir"}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/data/runs.json b/tensorflow/tensorboard/components/tf_distribution_dashboard/data/runs.json
deleted file mode 100644
index 739262a9fb62edcdd4d8010410a7713629a0d383..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_distribution_dashboard/data/runs.json
+++ /dev/null
@@ -1,4 +0,0 @@
-{
-  "run1": {"compressedHistograms": ["histo1"]},
-  "run2": {"compressedHistograms": ["histo2", "histo1"]}
-}
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/index.html b/tensorflow/tensorboard/components/tf_distribution_dashboard/index.html
deleted file mode 100644
index fe899a0ba8c459f1311e110e65f67e905bfc1990..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_distribution_dashboard/index.html
+++ /dev/null
@@ -1,69 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../iron-demo-helpers/demo-snippet.html">
-<link rel="import" href="../paper-styles/typography.html">
-<link rel="import" href="tf-distribution-dashboard.html">
-
-<title>Distribution Dashboard Demo</title>
-<style>
-  #container {
-    height: 800px;
-    width: 100%;
-    display: block;
-  }
-
-  html, body {
-    margin: 0;
-    padding: 0;
-    font-family: "RobotoDraft","Roboto",sans-serif;
-  }
-
-</style>
-<demo-snippet>
-  <template>
-    <dom-module id="distribution-dash-demo">
-      <template>
-        <tf-distribution-dashboard id="demo" backend="[[backend]]"></tf-distribution-dashboard>
-      </template>
-      <script>
-        import {Backend} from "../../tf-backend/backend";
-        import {createRouter, setRouter} from "../../tf-backend/router";
-
-        Polymer({
-          is: "distribution-dash-demo",
-          properties: {
-            backend: {
-              type: Object,
-              value: function() {
-                return new Backend();
-              },
-            },
-          },
-          created: function() {
-            var path = "data";
-            var router = createRouter(path, true);
-            setRouter(router);
-          },
-        });
-      </script>
-    </dom-module>
-    <distribution-dash-demo id="container"></distribution-dash-demo>
-  </template>
-</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/tf-distribution-dashboard.html b/tensorflow/tensorboard/components/tf_distribution_dashboard/tf-distribution-dashboard.html
deleted file mode 100644
index 76de74273f269c4b33dece0e4a3fc7b218043b93..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_distribution_dashboard/tf-distribution-dashboard.html
+++ /dev/null
@@ -1,131 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-backend/tf-backend.html">
-<link rel="import" href="../tf-color-scale/tf-color-scale.html">
-<link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
-<link rel="import" href="../tf-dashboard-common/tf-option-selector.html">
-<link rel="import" href="../tf-dashboard-common/tf-panes-helper.html">
-<link rel="import" href="../tf-dashboard-common/tf-sidebar-helper.html">
-<link rel="import" href="../tf-imports/lodash.html">
-<link rel="import" href="../vz-distribution-chart/vz-distribution-chart.html">
-<link rel="import" href="../iron-collapse/iron-collapse.html">
-<link rel="import" href="../paper-icon-button/paper-icon-button.html">
-
-<!--
-tf-distribution-dashboard is a complete frontend that loads runs from a backend,
-and creates chart panes that display data for those runs.
-
-It provides a x type selector and the normal tf-sidebar-helper options, by
-which the user can customize how data is organized and displayed.
-
-Each chart has a button that can toggle whether it is "expanded"; expanded
-charts are larger.
-
-Organizationally, the #plumbing div contains components that have no concrete
-manifestation and just effect data bindings or data loading. The .sidebar div
-contains shared controls provided by tf-sidebar-helper. The .center div
-contains vz-distribution-charts embedded inside tf-panes-helper's.
--->
-<dom-module id="tf-distribution-dashboard">
-  <template>
-    <div id="plumbing">
-      <tf-color-scale
-        id="colorScale"
-        runs="[[runs]]"
-        out-color-scale="{{_colorScale}}"
-      ></tf-color-scale>
-    </div>
-
-    <tf-dashboard-layout>
-      <div class="sidebar">
-        <tf-sidebar-helper
-          backend="[[backend]]"
-          categories="{{_categories}}"
-          color-scale="[[_colorScale]]"
-          run2tag="[[run2tag]]"
-          runs="[[runs]]"
-          selected-runs="{{_selectedRuns}}"
-          >
-        <div class="sidebar-section">
-          <tf-option-selector
-            id="xTypeSelector"
-            name="Horizontal Axis"
-            selected-id="{{_xType}}"
-            >
-            <paper-button id="step">step</paper-button>
-            <paper-button id="relative">relative</paper-button>
-            <paper-button id="wall_time">wall</paper-button>
-          </tf-option-selector>
-        </div>
-        </tf-sidebar-helper>
-      </div>
-
-      <div class="center">
-        <tf-panes-helper
-          categories="[[_categories]]"
-          color-scale="[[_colorScale]]"
-          data-type="[[dataType]]"
-          data-provider="[[dataProvider]]"
-          data-not-found="[[dataNotFound]]"
-          run2tag="[[run2tag]]"
-          selected-runs="[[_selectedRuns]]"
-          repeat-for-runs
-          >
-          <template>
-            <vz-distribution-chart
-              x-type="[[_xType]]"
-              color-scale="[[_colorScale]]"
-              ></vz-distribution-chart>
-          </template>
-        </tf-panes-helper>
-      </div>
-    </tf-dashboard-layout>
-
-    <style include="dashboard-style"></style>
-  </template>
-
-  <script>
-    import {DashboardBehavior} from "../tf-dashboard-common/dashboard-behavior";
-    import {ReloadBehavior} from "../tf-dashboard-common/reload-behavior";
-    import {BackendBehavior} from "../tf-backend/behavior";
-
-    Polymer({
-      is: "tf-distribution-dashboard",
-      factoryImpl: function(backend) {
-        this.backend = backend;
-      },
-      behaviors: [
-        DashboardBehavior("distributions"),
-        ReloadBehavior("tf-chart-scaffold"),
-        BackendBehavior,
-      ],
-      properties: {
-        backend: Object,
-        _xType: {
-          type: String,
-          value: "step"
-        },
-        dataType: {
-          type: Object,
-          value: "compressedHistogram",
-        },
-      },
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_globals/BUILD b/tensorflow/tensorboard/components/tf_globals/BUILD
deleted file mode 100644
index c5b0cfbaa558bee614da0c53c5d14c916352c339..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_globals/BUILD
+++ /dev/null
@@ -1,27 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_globals",
-    srcs = [
-        "globals.ts",
-        "tf-globals.html",
-    ],
-    path = "/tf-globals",
-)
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [":tf_globals"],
-    destdir = "tf-globals",
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_globals/globals.ts b/tensorflow/tensorboard/components/tf_globals/globals.ts
deleted file mode 100644
index fb6bb83b97f79651ded49f5a1cd3e6f76d9872a9..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_globals/globals.ts
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// The names of TensorBoard tabs.
-export const TABS = [
-  'scalars', 'images', 'audio', 'graphs', 'distributions', 'histograms',
-  'embeddings', 'text'
-];
-
-// If true, TensorBoard stores its hash in the URI state.
-// If false, tab switching in TensorBoard will not update location hash,
-// because hash updates interfere with wct_tests.
-let _useHash = false;
-
-export function setUseHash(shouldUseHash: boolean): void {
-  _useHash = shouldUseHash;
-}
-
-export function useHash(): boolean {
-  return _useHash;
-}
-
-let _fakeHash = '';
-
-export function setFakeHash(h: string) {
-  _fakeHash = h;
-}
-
-export function getFakeHash() {
-  return _fakeHash;
-}
diff --git a/tensorflow/tensorboard/components/tf_globals/tf-globals.html b/tensorflow/tensorboard/components/tf_globals/tf-globals.html
deleted file mode 100644
index efb8e92e0805c026fd7c44ae0e8c4931d21dff70..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_globals/tf-globals.html
+++ /dev/null
@@ -1,19 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<script src="globals.js"></script>
-
diff --git a/tensorflow/tensorboard/components/tf_graph/BUILD b/tensorflow/tensorboard/components/tf_graph/BUILD
deleted file mode 100644
index 4c0894f1925c9d716ca7825e7ed520b84bf32678..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph/BUILD
+++ /dev/null
@@ -1,56 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_graph",
-    srcs = [
-        "tf-graph.html",
-        "tf-graph-minimap.html",
-        "tf-graph-scene.html",
-    ],
-    path = "/tf-graph",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_graph_common",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "@org_polymer_iron_flex_layout",
-        "@org_polymer_iron_icons",
-        "@org_polymer_paper_button",
-        "@org_polymer_paper_dropdown_menu",
-        "@org_polymer_paper_input",
-        "@org_polymer_paper_menu",
-        "@org_polymer_paper_radio_group",
-        "@org_polymer_paper_toggle_button",
-        "@org_polymer_paper_tooltip",
-    ],
-)
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [":tf_graph"],
-    destdir = "tf-graph",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
-        "//tensorflow/tensorboard/components/tf_graph_common:legacy",
-        "//third_party/javascript/polymer/v1/iron-flex-layout:lib",
-        "//third_party/javascript/polymer/v1/iron-icons:lib",
-        "//third_party/javascript/polymer/v1/paper-button:lib",
-        "//third_party/javascript/polymer/v1/paper-dropdown-menu:lib",
-        "//third_party/javascript/polymer/v1/paper-input:lib",
-        "//third_party/javascript/polymer/v1/paper-menu:lib",
-        "//third_party/javascript/polymer/v1/paper-radio-group:lib",
-        "//third_party/javascript/polymer/v1/paper-toggle-button:lib",
-        "//third_party/javascript/polymer/v1/paper-tooltip:lib",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_graph/demo/BUILD b/tensorflow/tensorboard/components/tf_graph/demo/BUILD
deleted file mode 100644
index 02f3bf64bbc166902c346cf2874a54bacccd348a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph/demo/BUILD
+++ /dev/null
@@ -1,26 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-# bazel run //third_party/tensorflow/tensorboard/components/tf_graph/demo
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"] + glob(["data/**"]),
-    path = "/tf-graph/demo",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_graph",
-        "//tensorflow/tensorboard/components/tf_graph_common",
-        "//tensorflow/tensorboard/components/tf_graph_loader",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_graph/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph/demo/data/graph.pbtxt
deleted file mode 100644
index 30b206453469801d31b46856c29cdda78164f18f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph/demo/data/graph.pbtxt
+++ /dev/null
@@ -1,4606 +0,0 @@
-node {
-  name: "GradientDescent/learning_rate"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_3"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 0.1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 100
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000d\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/BroadcastGradientArgs"
-  op: "BroadcastGradientArgs"
-  input: "gradients/add_grad/Shape"
-  input: "gradients/add_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-        }
-        shape {
-          dim {
-            size: -1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 10
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/BroadcastGradientArgs"
-  op: "BroadcastGradientArgs"
-  input: "gradients/add_1_grad/Shape"
-  input: "gradients/add_1_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-        }
-        shape {
-          dim {
-            size: -1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_1_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: -1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_3_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Maximum/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Const_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Const"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Prod_1"
-  op: "Prod"
-  input: "gradients/Mean_grad/Shape_1"
-  input: "gradients/Mean_grad/Const_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Maximum"
-  op: "Maximum"
-  input: "gradients/Mean_grad/Prod_1"
-  input: "gradients/Mean_grad/Maximum/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Prod"
-  op: "Prod"
-  input: "gradients/Mean_grad/Shape"
-  input: "gradients/Mean_grad/Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/floordiv"
-  op: "FloorDiv"
-  input: "gradients/Mean_grad/Prod"
-  input: "gradients/Mean_grad/Maximum"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Cast"
-  op: "Cast"
-  input: "gradients/Mean_grad/floordiv"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "DstT"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "SrcT"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Tile/multiples"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Reshape/shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Const"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Fill"
-  op: "Fill"
-  input: "gradients/Shape"
-  input: "gradients/Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/Fill"
-  input: "gradients/Mean_grad/Reshape/shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Tile"
-  op: "Tile"
-  input: "gradients/Mean_grad/Reshape"
-  input: "gradients/Mean_grad/Tile/multiples"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tmultiples"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/truediv"
-  op: "RealDiv"
-  input: "gradients/Mean_grad/Tile"
-  input: "gradients/Mean_grad/Cast"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_3_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/Mean_grad/truediv"
-  input: "gradients/Reshape_3_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
-  op: "ExpandDims"
-  input: "gradients/Reshape_3_grad/Reshape"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tdim"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Const"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "Slice_2/begin"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "Sub_2/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "concat_1/axis"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "concat_1/values_0"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: -1
-      }
-    }
-  }
-}
-node {
-  name: "Slice_1/size"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Sub_1/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Shape_2"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "Rank_2"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 2
-      }
-    }
-  }
-}
-node {
-  name: "Sub_1"
-  op: "Sub"
-  input: "Rank_2"
-  input: "Sub_1/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Slice_1/begin"
-  op: "Pack"
-  input: "Sub_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 1
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: 0
-    }
-  }
-}
-node {
-  name: "Slice_1"
-  op: "Slice"
-  input: "Shape_2"
-  input: "Slice_1/begin"
-  input: "Slice_1/size"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "Index"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "concat_1"
-  op: "ConcatV2"
-  input: "concat_1/values_0"
-  input: "Slice_1"
-  input: "concat_1/axis"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 2
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "concat/axis"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "concat/values_0"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: -1
-      }
-    }
-  }
-}
-node {
-  name: "Slice/size"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Sub/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "Rank_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 2
-      }
-    }
-  }
-}
-node {
-  name: "Sub"
-  op: "Sub"
-  input: "Rank_1"
-  input: "Sub/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Slice/begin"
-  op: "Pack"
-  input: "Sub"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 1
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: 0
-    }
-  }
-}
-node {
-  name: "Slice"
-  op: "Slice"
-  input: "Shape_1"
-  input: "Slice/begin"
-  input: "Slice/size"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "Index"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "concat"
-  op: "ConcatV2"
-  input: "concat/values_0"
-  input: "Slice"
-  input: "concat/axis"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 2
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "Rank"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 2
-      }
-    }
-  }
-}
-node {
-  name: "Sub_2"
-  op: "Sub"
-  input: "Rank"
-  input: "Sub_2/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Slice_2/size"
-  op: "Pack"
-  input: "Sub_2"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 1
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: 0
-    }
-  }
-}
-node {
-  name: "Slice_2"
-  op: "Slice"
-  input: "Shape"
-  input: "Slice_2/begin"
-  input: "Slice_2/size"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "Index"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "logits_biases"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 10
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "logits_biases/read"
-  op: "Identity"
-  input: "logits_biases"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "logits_weights"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 100
-        }
-        dim {
-          size: 10
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "logits_weights/read"
-  op: "Identity"
-  input: "logits_weights"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "hidden_biases"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 100
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "hidden_biases/read"
-  op: "Identity"
-  input: "hidden_biases"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "hidden_weights"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 784
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 784
-        }
-        dim {
-          size: 100
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "hidden_weights/read"
-  op: "Identity"
-  input: "hidden_weights"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 784
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Reshape/shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\377\377\377\377"
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot/depth"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 10
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot/off_value"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot/on_value"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_1/random_shuffle_queue"
-  op: "RandomShuffleQueueV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "capacity"
-    value {
-      i: 20000
-    }
-  }
-  attr {
-    key: "component_types"
-    value {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "min_after_dequeue"
-    value {
-      i: 4000
-    }
-  }
-  attr {
-    key: "seed"
-    value {
-      i: 0
-    }
-  }
-  attr {
-    key: "seed2"
-    value {
-      i: 0
-    }
-  }
-  attr {
-    key: "shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 28
-          }
-          dim {
-            size: 28
-          }
-          dim {
-            size: 1
-          }
-        }
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
-  op: "QueueDequeueManyV2"
-  input: "mnist_dataset_train_1/random_shuffle_queue"
-  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          unknown_rank: true
-        }
-        shape {
-          unknown_rank: true
-        }
-      }
-    }
-  }
-  attr {
-    key: "component_types"
-    value {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    key: "timeout_ms"
-    value {
-      i: -1
-    }
-  }
-}
-node {
-  name: "Reshape"
-  op: "Reshape"
-  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
-  input: "Reshape/shape"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: -1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "MatMul"
-  op: "MatMul"
-  input: "Reshape"
-  input: "hidden_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "add"
-  op: "Add"
-  input: "MatMul"
-  input: "hidden_biases/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Relu"
-  op: "Relu"
-  input: "add"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "MatMul_1"
-  op: "MatMul"
-  input: "Relu"
-  input: "logits_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "add_1"
-  op: "Add"
-  input: "MatMul_1"
-  input: "logits_biases/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Reshape_1"
-  op: "Reshape"
-  input: "add_1"
-  input: "concat"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot"
-  op: "OneHot"
-  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
-  input: "mnist_dataset_train_2/one_hot/depth"
-  input: "mnist_dataset_train_2/one_hot/on_value"
-  input: "mnist_dataset_train_2/one_hot/off_value"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "TI"
-    value {
-      type: DT_INT64
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          unknown_rank: true
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: -1
-    }
-  }
-}
-node {
-  name: "Reshape_2"
-  op: "Reshape"
-  input: "mnist_dataset_train_2/one_hot"
-  input: "concat_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "SoftmaxCrossEntropyWithLogits"
-  op: "SoftmaxCrossEntropyWithLogits"
-  input: "Reshape_1"
-  input: "Reshape_2"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
-  op: "PreventGradient"
-  input: "SoftmaxCrossEntropyWithLogits:1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "message"
-    value {
-      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
-  op: "Mul"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_1_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
-  input: "gradients/Reshape_1_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Sum_1"
-  op: "Sum"
-  input: "gradients/Reshape_1_grad/Reshape"
-  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Reshape_1"
-  op: "Reshape"
-  input: "gradients/add_1_grad/Sum_1"
-  input: "gradients/add_1_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Sum"
-  op: "Sum"
-  input: "gradients/Reshape_1_grad/Reshape"
-  input: "gradients/add_1_grad/BroadcastGradientArgs"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/add_1_grad/Sum"
-  input: "gradients/add_1_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/add_1_grad/Reshape"
-  input: "^gradients/add_1_grad/Reshape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/add_1_grad/Reshape_1"
-  input: "^gradients/add_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_1_grad/Reshape_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "logits_biases"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/add_1_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/tuple/control_dependency"
-  op: "Identity"
-  input: "gradients/add_1_grad/Reshape"
-  input: "^gradients/add_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_1_grad/Reshape"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/MatMul_1"
-  op: "MatMul"
-  input: "Relu"
-  input: "gradients/add_1_grad/tuple/control_dependency"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: true
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/MatMul"
-  op: "MatMul"
-  input: "gradients/add_1_grad/tuple/control_dependency"
-  input: "logits_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: true
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/MatMul_1_grad/MatMul"
-  input: "^gradients/MatMul_1_grad/MatMul_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/MatMul_1_grad/MatMul_1"
-  input: "^gradients/MatMul_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "logits_weights"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/tuple/control_dependency"
-  op: "Identity"
-  input: "gradients/MatMul_1_grad/MatMul"
-  input: "^gradients/MatMul_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/MatMul_1_grad/MatMul"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Relu_grad/ReluGrad"
-  op: "ReluGrad"
-  input: "gradients/MatMul_1_grad/tuple/control_dependency"
-  input: "Relu"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Sum_1"
-  op: "Sum"
-  input: "gradients/Relu_grad/ReluGrad"
-  input: "gradients/add_grad/BroadcastGradientArgs:1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Reshape_1"
-  op: "Reshape"
-  input: "gradients/add_grad/Sum_1"
-  input: "gradients/add_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Sum"
-  op: "Sum"
-  input: "gradients/Relu_grad/ReluGrad"
-  input: "gradients/add_grad/BroadcastGradientArgs"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/add_grad/Sum"
-  input: "gradients/add_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/add_grad/Reshape"
-  input: "^gradients/add_grad/Reshape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/add_grad/Reshape_1"
-  input: "^gradients/add_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_grad/Reshape_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "hidden_biases"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/add_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/tuple/control_dependency"
-  op: "Identity"
-  input: "gradients/add_grad/Reshape"
-  input: "^gradients/add_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_grad/Reshape"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/MatMul_1"
-  op: "MatMul"
-  input: "Reshape"
-  input: "gradients/add_grad/tuple/control_dependency"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: true
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/MatMul"
-  op: "MatMul"
-  input: "gradients/add_grad/tuple/control_dependency"
-  input: "hidden_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 784
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: true
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/MatMul_grad/MatMul"
-  input: "^gradients/MatMul_grad/MatMul_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/MatMul_grad/MatMul_1"
-  input: "^gradients/MatMul_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/MatMul_grad/MatMul_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "hidden_weights"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/MatMul_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 784
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "GradientDescent"
-  op: "NoOp"
-  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
-  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
-  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
-  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_2"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "Reshape_3"
-  op: "Reshape"
-  input: "SoftmaxCrossEntropyWithLogits"
-  input: "Slice_2"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Mean"
-  op: "Mean"
-  input: "Reshape_3"
-  input: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "_send_Mean_0"
-  op: "_Send"
-  input: "Mean"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "client_terminated"
-    value {
-      b: true
-    }
-  }
-  attr {
-    key: "recv_device"
-    value {
-      s: "/job:localhost/replica:0/task:0/cpu:0"
-    }
-  }
-  attr {
-    key: "send_device"
-    value {
-      s: "/job:localhost/replica:0/task:0/cpu:0"
-    }
-  }
-  attr {
-    key: "send_device_incarnation"
-    value {
-      i: -5924635994370253548
-    }
-  }
-  attr {
-    key: "tensor_name"
-    value {
-      s: "Mean:0"
-    }
-  }
-}
-library {
-}
-versions {
-  producer: 21
-}
diff --git a/tensorflow/tensorboard/components/tf_graph/demo/index.html b/tensorflow/tensorboard/components/tf_graph/demo/index.html
deleted file mode 100644
index 52e2f0b9340950ed5f873cba17c8bbf2aee62e6a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph/demo/index.html
+++ /dev/null
@@ -1,92 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../tf-graph.html">
-<link rel="import" href="../../tf-graph-common/tf-graph-common.html">
-<link rel="import" href="../../tf-graph-loader/tf-graph-loader.html">
-<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-<title>TF Graph Demo</title>
-<style>
-  #demo-container {
-    border: 2px solid #808080;
-    width: 1000px;
-    height: 600px;
-  }
-</style>
-<demo-snippet>
-  <template>
-    <dom-module id="tf-graph-demo">
-      <template>
-        <!-- We first use the graph loader to load and parse a pbtxt file into a graph object. -->
-        <tf-graph-loader
-            id="loader"
-            datasets="[[_datasets]]"
-            selected-dataset="[[_selectedDataset]]"
-            out-graph="{{_graph}}">
-        </tf-graph-loader>
-
-        <!-- We color ops in the graph by XLA cluster. -->
-        <tf-graph id="graph" color-by="xla_cluster"></tf-graph>
-      </template>
-      <script>
-        "use strict";
-
-        Polymer({
-          is: "tf-graph-demo",
-          properties: {
-            // We tell the graph loader to load a specific pbtxt file.
-            _datasets: {
-              type: Array,
-              value: [{
-                "name": "Graph with XLA Clusters Specified",
-                "path": "data/graph.pbtxt"
-              }],
-            },
-            _selectedDataset: {
-              type: Number,
-              value: 0,
-            },
-
-            // This property will be updated by the graph loader.
-            _graph: {
-              type: Object,
-            },
-          },
-          observers: [
-            '_graphUpdated(_graph)',
-          ],
-          _graphUpdated: function(slimGraph) {
-            const tracker = tf.graph.util.getTracker(this.$.loader);
-            const hierarchyTracker = tf.graph.util.getSubtaskTracker(
-                tracker, 100, 'Namespace hierarchy');
-            const hierarchyOptions = {};
-            tf.graph.hierarchy.build(slimGraph, hierarchyOptions, hierarchyTracker).then(
-                function(graphHierarchy) {
-              // We have parsed and built the graph object from a pbtxt file. Render the graph.
-              this.$.graph.set('basicGraph', slimGraph);
-              this.$.graph.set('graphHierarchy', graphHierarchy);
-            }.bind(this));
-          },
-        });
-      </script>
-    </dom-module>
-    <div id='demo-container'>
-      <tf-graph-demo></tf-graph-demo>
-    </div>
-  </template>
-</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph/tf-graph-minimap.html b/tensorflow/tensorboard/components/tf_graph/tf-graph-minimap.html
deleted file mode 100644
index 5fc16c05207fd082336717a6da2563e9eafc3985..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph/tf-graph-minimap.html
+++ /dev/null
@@ -1,88 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<script src="../tf-graph-common/minimap.js"></script>
-
-<dom-module id="tf-graph-minimap">
-<template>
-<style>
-:host {
-  background-color:white;
-  transition: opacity .3s linear;
-  pointer-events: auto;
-}
-
-:host.hidden {
-  opacity: 0;
-  pointer-events: none;
-}
-
-canvas {
-  border: 1px solid #999;
-}
-
-rect {
-  fill: white;
-  stroke: #111111;
-  stroke-width: 1px;
-  fill-opacity: 0;
-  filter: url(#minimapDropShadow);
-  cursor: move;
-}
-
-svg {
-  position: absolute;
-}
-</style>
-<svg>
-  <defs>
-    <filter id="minimapDropShadow" x="-20%" y="-20%" width="150%" height="150%">
-      <feOffset result="offOut" in="SourceGraphic" dx="1" dy="1"></feOffset>
-      <feColorMatrix result="matrixOut" in="offOut" type="matrix" values="0.1 0 0 0 0 0 0.1 0 0 0 0 0 0.1 0 0 0 0 0 0.5 0"></feColorMatrix>
-      <feGaussianBlur result="blurOut" in="matrixOut" stdDeviation="2"></feGaussianBlur>
-      <feBlend in="SourceGraphic" in2="blurOut" mode="normal"></feBlend>
-    </filter>
-  </defs>
-  <rect></rect>
-</svg>
-<canvas class="first"></canvas>
-<!-- Additional canvas to use as buffer to avoid flickering between updates -->
-<canvas class="second"></canvas>
-<canvas class="download"></canvas>
-</template>
-<script>
-Polymer({
-  is: 'tf-graph-minimap',
-
-  /**
-   * Initializes the minimap and returns a minimap object to notify when
-   * things update.
-   *
-   * @param svg The main svg element.
-   * @param zoomG The svg group used for panning and zooming the main svg.
-   * @param mainZoom The main zoom behavior.
-   * @param maxWandH The maximum width/height for the minimap.
-   * @param labelPadding Padding in pixels due to the main graph labels.
-   */
-  init: function(svg, zoomG, mainZoom, maxWAndH, labelPadding) {
-    return new tf.scene.Minimap(svg, zoomG, mainZoom, this, maxWAndH,
-        labelPadding);
-  }
-});
-</script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_graph/tf-graph-scene.html b/tensorflow/tensorboard/components/tf_graph/tf-graph-scene.html
deleted file mode 100644
index fb2bc13f9a14eabc813b5339d513723ca991b497..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph/tf-graph-scene.html
+++ /dev/null
@@ -1,1081 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
-<link rel="import" href="tf-graph-minimap.html">
-
-<!--
-  A module that takes a render hierarchy as input and produces an SVG DOM using
-  dagre and d3.
--->
-<dom-module id="tf-graph-scene">
-<template>
-<style>
-:host {
-  display: flex;
-  width: 100%;
-  font-size: 20px;
-}
-
-::content #svg {
-  overflow: hidden;
-  flex: 1;
-  height: 100%;
-  width: 100%;
-}
-
-::content #hidden {
-  position: fixed;
-  top: 0px;
-  visibility: hidden;
-}
-
-/* --- Node and annotation-node for Metanode --- */
-
-::content .meta > .nodeshape > rect,
-::content .meta > .annotation-node > rect {
-  cursor: pointer;
-  fill: hsl(0, 0%, 70%);
-}
-
-::content .node.meta.highlighted > .nodeshape > rect,
-::content .node.meta.highlighted > .annotation-node > rect {
-  stroke-width: 2;
-}
-
-::content .annotation.meta.highlighted > .nodeshape > rect,
-::content .annotation.meta.highlighted > .annotation-node > rect {
-  stroke-width: 1;
-}
-
-::content .meta.selected > .nodeshape > rect,
-::content .meta.selected > .annotation-node > rect {
-  stroke: red;
-  stroke-width: 2;
-}
-
-::content .node.meta.selected.expanded > .nodeshape > rect,
-::content .node.meta.selected.expanded > .annotation-node > rect {
-  stroke: red;
-  stroke-width: 3;
-}
-
-::content .annotation.meta.selected > .nodeshape > rect,
-::content .annotation.meta.selected > .annotation-node > rect {
-  stroke: red;
-  stroke-width: 2;
-}
-
-::content .node.meta.selected.expanded.highlighted > .nodeshape > rect,
-::content .node.meta.selected.expanded.highlighted > .annotation-node > rect {
-  stroke: red;
-  stroke-width: 4;
-}
-
-::content .faded,
-::content .faded rect,
-::content .faded ellipse,
-::content .faded path,
-::content .faded use,
-::content #rectHatch line,
-::content #ellipseHatch line {
-  color: #e0d4b3 !important;
-  fill: white;
-  stroke: #e0d4b3 !important;
-}
-
-
-::content .faded path {
-  stroke-width: 1px !important;
-}
-
-::content .faded rect {
-  fill: url(#rectHatch) !important;
-}
-
-::content .faded ellipse,
-::content .faded use {
-  fill: url(#ellipseHatch) !important;
-}
-
-::content .faded text {
-  opacity: 0;
-}
-
-/* Rules used for input-tracing. */
-::content .input-highlight > * > rect,
-::content .input-highlight > * > ellipse,
-::content .input-highlight > * > use
-{
-  fill: white;
-  stroke: #ff9800 !important;
-}
-
-/*  - Faded non-input styling */
-::content .non-input > * > rect,
-::content .non-input > * > ellipse,
-::content .non-input > * > use,
-/* For Const nodes. */
-::content .non-input > * > .constant:not([class*="input-highlight"]) >
-  .annotation-node > ellipse,
-/* For styling of annotation nodes of non-input nodes. */
-::content .non-input > g > .annotation > .annotation-node > rect {
-  stroke: #e0d4b3 !important;
-  stroke-width: inherit;
-  stroke-dasharray: inherit;
-}
-
-
-::content .non-input path {
-  visibility: hidden;
-}
-
-::content .non-input > .nodeshape > rect,
-::content .non-input > .annotation-node > rect,
-/* For styling of annotation nodes of non-input nodes. */
-::content .non-input > g > .annotation > .annotation-node > rect
-{
-  fill: url(#rectHatch) !important;
-}
-
-::content .non-input ellipse,
-::content .non-input use {
-  fill: url(#ellipseHatch) !important;
-}
-
-::content .non-input > text {
-  opacity: 0;
-}
-
-::content .non-input .annotation > .annotation-edge {
-  marker-end: url(#annotation-arrowhead-faded);
-}
-
-::content .non-input .annotation > .annotation-edge.refline {
-  marker-start: url(#ref-annotation-arrowhead-faded);
-}
-
-/* Input edges. */
-::content .input-edge-highlight > text {
-  fill: black !important;
-}
-::content .input-edge-highlight > path,
-::content .input-highlight > .in-annotations > .annotation > .annotation-edge,
-::content .input-highlight-selected > .in-annotations > .annotation >
-.annotation-edge {
-  stroke: #999 !important;
-}
-
-/* Non-input edges. */
-::content .non-input-edge-highlight,
-::content .non-input > g > .annotation > path,
-/* Annotation styles (label and edges respectively). */
-::content .non-input > g >
-.annotation:not(.input-highlight):not(.input-highlight-selected) >
-.annotation-label
-/*.annotation-edge*/
-{
-  visibility: hidden;
-}
-
-/* --- Op Node --- */
-
-::content .op > .nodeshape > ellipse,
-::content .op > .annotation-node > ellipse {
-  cursor: pointer;
-  fill: #fff;
-  stroke: #ccc;
-}
-
-::content .op.selected > .nodeshape > ellipse,
-::content .op.selected > .annotation-node > ellipse {
-  stroke: red;
-  stroke-width: 2;
-}
-
-::content .op.highlighted > .nodeshape > ellipse,
-::content .op.highlighted > .annotation-node > ellipse {
-  stroke-width: 2;
-}
-
-/* --- Series Node --- */
-
-/* By default, don't show the series background <rect>. */
-::content .series > .nodeshape > rect {
-  fill: hsl(0, 0%, 70%);
-  fill-opacity: 0;
-  stroke-dasharray: 5, 5;
-  stroke-opacity: 0;
-  cursor: pointer;
-}
-
-/* Once expanded, show the series background <rect> and hide the <use>. */
-::content .series.expanded > .nodeshape > rect {
-  fill-opacity: 0.15;
-  stroke: hsl(0, 0%, 70%);
-  stroke-opacity: 1;
-}
-::content .series.expanded > .nodeshape > use {
-  visibility: hidden;
-}
-
-/**
- * TODO(jimbo): Simplify this by applying a stable class name to all <g>
- * elements that currently have either the nodeshape or annotation-node classes.
- */
-::content .series > .nodeshape > use ,
-::content .series > .annotation-node > use {
-  stroke: #ccc;
-}
-::content .series.highlighted > .nodeshape > use ,
-::content .series.highlighted > .annotation-node > use {
-  stroke-width: 2;
-}
-::content .series.selected > .nodeshape > use ,
-::content .series.selected > .annotation-node > use {
-  stroke: red;
-  stroke-width: 2;
-}
-
-::content .series.selected > .nodeshape > rect {
-  stroke: red;
-  stroke-width: 2;
-}
-
-::content .annotation.series.selected > .annotation-node > use {
-  stroke: red;
-  stroke-width: 2;
-}
-
-/* --- Bridge Node --- */
-::content .bridge > .nodeshape > rect {
-  stroke: #f0f;
-  opacity: 0.2;
-  display: none;
-}
-
-/* --- Structural Elements --- */
-::content .edge > path.edgeline.structural {
-  stroke: #f0f;
-  opacity: 0.2;
-  display: none;
-}
-
-/* Reference Edge */
-::content .edge > path.edgeline.referenceedge {
-  stroke: #FFB74D;
-  opacity: 1;
-}
-
-/* --- Series Nodes --- */
-
-/* Hide the rect for a series' annotation. */
-::content .series > .annotation-node > rect {
-  display: none;
-}
-
-/* --- Node label --- */
-
-
-::content .node > text.nodelabel {
-  cursor: pointer;
-  fill: #444;
-}
-
-::content .meta.expanded > text.nodelabel {
-  font-size: 9px;
-}
-
-::content .series > text.nodelabel {
-  font-size: 8px;
-}
-
-::content .op > text.nodelabel {
-  font-size: 6px;
-}
-
-::content .bridge > text.nodelabel {
-  display: none;
-}
-
-::content .node.meta.expanded > text.nodelabel{
-  cursor: normal;
-}
-
-::content .annotation.meta.highlighted > text.annotation-label {
-  fill: #50A3F7;
-}
-
-::content .annotation.meta.selected > text.annotation-label {
-  fill: #4285F4;
-}
-
-/* --- Annotation --- */
-
-/* only applied for annotations that are not summary or constant.
-(.summary, .constant gets overridden below) */
-::content .annotation > .annotation-node > * {
-  stroke-width: 0.5;
-  stroke-dasharray: 1, 1;
-}
-
-::content .annotation.summary > .annotation-node > *,
-::content .annotation.constant > .annotation-node > * {
-  stroke-width: 1;
-  stroke-dasharray: none;
-}
-
-::content .annotation > .annotation-edge {
-  fill: none;
-  stroke: #aaa;
-  stroke-width: 0.5;
-  marker-end: url(#annotation-arrowhead);
-}
-
-::content .faded .annotation > .annotation-edge {
-  marker-end: url(#annotation-arrowhead-faded);
-}
-
-::content .annotation > .annotation-edge.refline {
-  marker-start: url(#ref-annotation-arrowhead);
-}
-
-::content .faded .annotation > .annotation-edge.refline {
-  marker-start: url(#ref-annotation-arrowhead-faded);
-}
-
-::content .annotation > .annotation-control-edge {
-  stroke-dasharray: 1, 1;
-}
-
-::content #annotation-arrowhead {
-  fill: #aaa;
-}
-
-::content #annotation-arrowhead-faded {
-  fill: #e0d4b3;
-}
-
-::content #ref-annotation-arrowhead {
-  fill: #aaa;
-}
-
-::content #ref-annotation-arrowhead-faded {
-  fill: #e0d4b3;
-}
-
-::content .annotation > .annotation-label {
-  font-size: 5px;
-  cursor: pointer;
-}
-::content .annotation > .annotation-label.annotation-ellipsis {
-  cursor: default;
-}
-
-/* Hide annotations on expanded meta nodes since they're redundant. */
-::content .expanded > .in-annotations,
-::content .expanded > .out-annotations {
-  display: none;
-}
-
-/* --- Annotation: Constant --- */
-
-::content .constant > .annotation-node > ellipse {
-  cursor: pointer;
-  fill: white;
-  stroke: #848484;
-}
-
-::content .constant.selected > .annotation-node > ellipse {
-  fill: white;
-  stroke: red;
-}
-
-::content .constant.highlighted > .annotation-node > ellipse {
-  stroke-width: 1.5;
-}
-
-/* --- Annotation: Summary --- */
-
-::content .summary > .annotation-node > ellipse {
-  cursor: pointer;
-  fill: #DB4437;
-  stroke: #DB4437;
-}
-
-::content .summary.selected > .annotation-node > ellipse {
-  fill: #A52714;
-  stroke: #A52714;
-}
-
-::content .summary.highlighted > .annotation-node > ellipse {
-  stroke-width: 1.5;
-}
-
-/* --- Edge --- */
-
-::content .edge > path.edgeline {
-  fill: none;
-  stroke: #bbb;
-  stroke-linecap: round;
-  stroke-width: 0.75;
-}
-
-/* Labels showing tensor shapes on edges */
-::content .edge > text {
-  font-size: 3.5px;
-  fill: #666;
-}
-
-::content .dataflow-arrowhead {
-  fill: #bbb;
-}
-
-::content .reference-arrowhead {
-  fill: #FFB74D;
-}
-
-::content .edge .control-dep {
-  stroke-dasharray: 2, 2;
-}
-
-/* --- Group node expand/collapse button --- */
-
-/* Hides expand/collapse buttons when a node isn't expanded or highlighted. Using
-   incredibly small opacity so that the bounding box of the <g> parent still takes
-   this container into account even when it isn't visible */
-::content .node:not(.highlighted):not(.expanded) > .nodeshape > .buttoncontainer {
-  opacity: 0.01;
-}
-::content .node.highlighted > .nodeshape > .buttoncontainer {
-  cursor: pointer;
-}
-::content .buttoncircle {
-  fill: #E7811D;
-}
-::content .buttoncircle:hover {
-  fill: #B96717;
-}
-::content .expandbutton,
-::content .collapsebutton {
-  stroke: white;
-}
-/* Do not let the path elements in the button take pointer focus */
-::content .node > .nodeshape > .buttoncontainer > .expandbutton,
-::content .node > .nodeshape > .buttoncontainer > .collapsebutton {
-  pointer-events: none;
-}
-/* Only show the expand button when a node is collapsed and only show the
-   collapse button when a node is expanded. */
-::content .node.expanded > .nodeshape > .buttoncontainer > .expandbutton {
-  display: none;
-}
-::content .node:not(.expanded) > .nodeshape > .buttoncontainer > .collapsebutton {
-  display: none;
-}
-
-::content .health-pill-stats {
-  font-size: 4px;
-  text-anchor: middle;
-}
-
-::content .health-pill rect {
-  filter: url(#health-pill-shadow);
-  rx: 3;
-  ry: 3;
-}
-
-.titleContainer {
-  position: relative;
-  top: 20px;
-}
-
-.title {
-  position: absolute;
-}
-
-.auxTitle {
-  position: absolute;
-}
-
-#minimap {
-  position: absolute;
-  right: 20px;
-  bottom: 20px;
-}
-</style>
-<div class="titleContainer">
-  <div id="title" class="title">Main Graph</div>
-  <div id="auxTitle" class="auxTitle">Auxiliary Nodes</div>
-</div>
-<svg id="svg">
-  <defs>
-
-    <!-- Arrow heads for reference edge paths of different predefined sizes per color. -->
-    <path id="reference-arrowhead-path" d="M 0,0 L 10,5 L 0,10 C 3,7 3,3 0,0"/>
-    <marker class="reference-arrowhead" id="reference-arrowhead-small" viewBox="0 0 10 10" markerWidth="5" markerHeight="5"
-      refX="2" refY="5" orient="auto-start-reverse" markerUnits="userSpaceOnUse">
-      <use xlink:href="#reference-arrowhead-path" />
-    </marker>
-    <marker class="reference-arrowhead" id="reference-arrowhead-medium" viewBox="0 0 10 10" markerWidth="13" markerHeight="13"
-        refX="2" refY="5" orient="auto-start-reverse" markerUnits="userSpaceOnUse">
-      <use xlink:href="#reference-arrowhead-path" />
-    </marker>
-    <marker class="reference-arrowhead" id="reference-arrowhead-large" viewBox="0 0 10 10" markerWidth="16" markerHeight="16"
-        refX="2" refY="5" orient="auto-start-reverse" markerUnits="userSpaceOnUse">
-      <use xlink:href="#reference-arrowhead-path" />
-    </marker>
-    <marker class="reference-arrowhead" id="reference-arrowhead-xlarge" viewBox="0 0 10 10" markerWidth="20" markerHeight="20"
-        refX="2" refY="5" orient="auto-start-reverse" markerUnits="userSpaceOnUse">
-      <use xlink:href="#reference-arrowhead-path" />
-    </marker>
-
-    <!-- Arrow heads for dataflow edge paths of different predefined sizes per color. -->
-    <path id="dataflow-arrowhead-path" d="M 0,0 L 10,5 L 0,10 C 3,7 3,3 0,0"/>
-    <marker class="dataflow-arrowhead" id="dataflow-arrowhead-small" viewBox="0 0 10 10" markerWidth="5" markerHeight="5"
-      refX="2" refY="5" orient="auto-start-reverse" markerUnits="userSpaceOnUse">
-      <use xlink:href="#dataflow-arrowhead-path" />
-    </marker>
-    <marker class="dataflow-arrowhead" id="dataflow-arrowhead-medium" viewBox="0 0 10 10" markerWidth="13" markerHeight="13"
-        refX="2" refY="5" orient="auto-start-reverse" markerUnits="userSpaceOnUse">
-      <use xlink:href="#dataflow-arrowhead-path" />
-    </marker>
-    <marker class="dataflow-arrowhead" id="dataflow-arrowhead-large" viewBox="0 0 10 10" markerWidth="16" markerHeight="16"
-        refX="2" refY="5" orient="auto-start-reverse" markerUnits="userSpaceOnUse">
-      <use xlink:href="#dataflow-arrowhead-path" />
-    </marker>
-    <marker class="dataflow-arrowhead" id="dataflow-arrowhead-xlarge" viewBox="0 0 10 10" markerWidth="20" markerHeight="20"
-        refX="2" refY="5" orient="auto-start-reverse" markerUnits="userSpaceOnUse">
-      <use xlink:href="#dataflow-arrowhead-path" />
-    </marker>
-
-    <!-- Arrow head for annotation edge paths. -->
-    <marker id="annotation-arrowhead" markerWidth="5" markerHeight="5"
-      refX="5" refY="2.5" orient="auto">
-      <path d="M 0,0 L 5,2.5 L 0,5 L 0,0"/>
-    </marker>
-    <marker id="annotation-arrowhead-faded" markerWidth="5" markerHeight="5"
-      refX="5" refY="2.5" orient="auto">
-      <path d="M 0,0 L 5,2.5 L 0,5 L 0,0"/>
-    </marker>
-    <marker id="ref-annotation-arrowhead" markerWidth="5" markerHeight="5"
-      refX="0" refY="2.5" orient="auto">
-      <path d="M 5,0 L 0,2.5 L 5,5 L 5,0"/>
-    </marker>
-    <marker id="ref-annotation-arrowhead-faded" markerWidth="5" markerHeight="5"
-      refX="0" refY="2.5" orient="auto">
-      <path d="M 5,0 L 0,2.5 L 5,5 L 5,0"/>
-    </marker>
-    <!-- Template for an Op node ellipse. -->
-    <ellipse id="op-node-stamp"
-        rx="7.5" ry="3" stroke="inherit" fill="inherit" />
-    <!-- Template for an Op node annotation ellipse (smaller). -->
-    <ellipse id="op-node-annotation-stamp"
-        rx="5" ry="2" stroke="inherit" fill="inherit" />
-    <!-- Vertically stacked series of Op nodes when unexpanded. -->
-    <g id="op-series-vertical-stamp">
-      <use xlink:href="#op-node-stamp" x="8" y="9" />
-      <use xlink:href="#op-node-stamp" x="8" y="6" />
-      <use xlink:href="#op-node-stamp" x="8" y="3" />
-    </g>
-    <!-- Horizontally stacked series of Op nodes when unexpanded. -->
-    <g id="op-series-horizontal-stamp">
-      <use xlink:href="#op-node-stamp" x="16" y="4" />
-      <use xlink:href="#op-node-stamp" x="12" y="4" />
-      <use xlink:href="#op-node-stamp" x="8" y="4" />
-    </g>
-    <!-- Horizontally stacked series of Op nodes for annotation. -->
-    <g id="op-series-annotation-stamp">
-      <use xlink:href="#op-node-annotation-stamp" x="9" y="2" />
-      <use xlink:href="#op-node-annotation-stamp" x="7" y="2" />
-      <use xlink:href="#op-node-annotation-stamp" x="5" y="2" />
-    </g>
-    <svg id="summary-icon" fill="#848484" height="12" viewBox="0 0 24 24" width="12">
-      <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z" />
-    </svg>
-    <!--
-      Where the linearGradient for each node is stored. Used when coloring
-      by proportions of devices.
-    -->
-    <g id="linearGradients"></g>
-
-    <!-- Hatch patterns for faded out nodes. -->
-    <pattern id="rectHatch" patternTransform="rotate(45 0 0)" width="5" height="5" patternUnits="userSpaceOnUse">
-      <line x1="0" y1="0" x2="0" y2="5" style="stroke-width: 1"/>
-    </pattern>
-    <pattern id="ellipseHatch" patternTransform="rotate(45 0 0)" width="2" height="2" patternUnits="userSpaceOnUse">
-      <line x1="0" y1="0" x2="0" y2="2" style="stroke-width: 1"/>
-    </pattern>
-
-    <!-- A shadow for health pills. -->
-    <filter id="health-pill-shadow" x="-40%" y="-40%" width="180%" height="180%">
-      <feGaussianBlur in="SourceAlpha" stdDeviation="0.8"/>
-      <feOffset dx="0" dy="0" result="offsetblur"/>
-      <feFlood flood-color="#000000"/>
-      <feComposite in2="offsetblur" operator="in"/>
-      <feMerge>
-        <feMergeNode/>
-        <feMergeNode in="SourceGraphic"/>
-      </feMerge>
-    </filter>
-  </defs>
-  <!-- Make a large rectangle that fills the svg space so that
-  zoom events get captured on safari -->
-  <rect fill="white" width="10000" height="10000"></rect>
-  <g id="root"></g>
-</svg>
-<tf-graph-minimap id="minimap"></tf-graph-minimap>
-</template>
-<script>
-Polymer({
-  is: 'tf-graph-scene',
-  properties: {
-    renderHierarchy: Object,
-    name: String,
-    colorBy: String,
-
-    // For each render hierarchy, we only fit it to the viewport once (when the scene is attached to
-    // the DOM). We do not fit the hierarchy again (unless the user clicks the reset button). For
-    // instance, if the user enters a certain view in the graph, switches to another dashboard, and
-    // returns to the graph dashboard, the user expects the previous view. These properties enable
-    // that behavior.
-
-    /** Whether the scene has fit the current render hierarchy (to the viewport) at least once. */
-    _hasRenderHierarchyBeenFitOnce: Boolean,
-    /** Whether this scene element is currently attached to a parent element. */
-    _isAttached: Boolean,
-
-    /** @type {d3_zoom} d3 zoom object */
-    _zoom: Object,
-    highlightedNode: {
-      type: String,
-      observer: '_highlightedNodeChanged'
-    },
-    selectedNode: {
-      type: String,
-      observer: '_selectedNodeChanged'
-    },
-    /** Keeps track of if the graph has been zoomed/panned since loading */
-    _zoomed: {
-      type: Boolean,
-      observer: '_onZoomChanged',
-      value: false
-    },
-    /** Keeps track of the starting coordinates of a graph zoom/pan */
-    _zoomStartCoords: {
-      type: Object,
-      value: null
-    },
-    /** Keeps track of the current coordinates of a graph zoom/pan */
-    _zoomTransform: {
-      type: Object,
-      value: null
-    },
-    /** Maximum distance of a zoom event for it to be interpreted as a click */
-    _maxZoomDistanceForClick: {
-      type: Number,
-      value: 20
-    },
-    /**
-     * @type {d3.scale.ordinal}
-     * Scale mapping from template name to a number between 0 and N-1
-     * where N is the number of different template names. Used by
-     * tf.graph.scene.node when computing node color by structure.
-     */
-    templateIndex: Function,
-    /**
-     * @type {tf.scene.Minimap}
-     * A minimap object to notify for zoom events.
-     */
-    minimap: Object,
-    /*
-     * Dictionary for easily stylizing nodes when state changes.
-     * _nodeGroupIndex[nodeName] = d3_selection of the nodeGroup
-     */
-    _nodeGroupIndex: {
-      type: Object,
-      value: function() { return {}; }
-    },
-    /*
-     * Dictionary for easily stylizing annotation nodes when state changes.
-     * _annotationGroupIndex[nodeName][hostNodeName] =
-     *   d3_selection of the annotationGroup
-     */
-    _annotationGroupIndex: {
-      type: Object,
-      value: function() { return {}; }
-    },
-    /*
-     * Dictionary for easily stylizing edges when state changes.
-     * _edgeGroupIndex[edgeName] = d3_selection of the edgeGroup
-     */
-    _edgeGroupIndex: {
-      type: Object,
-      value: function() { return {}; }
-    },
-    /**
-     * Max font size for metanode label strings.
-     */
-    maxMetanodeLabelLengthFontSize: {
-      type: Number,
-      value: 9
-    },
-    /**
-     * Min font size for metanode label strings.
-     */
-    minMetanodeLabelLengthFontSize: {
-      type: Number,
-      value: 6
-    },
-    /**
-     * Metanode label strings longer than this are given smaller fonts.
-     */
-    maxMetanodeLabelLengthLargeFont: {
-      type: Number,
-      value: 11
-    },
-    /**
-     * Metanode label strings longer than this are truncated with ellipses.
-     */
-    maxMetanodeLabelLength: {
-      type: Number,
-      value: 18
-    },
-    progress: Object,
-    // A mapping between node name to the tf.graph.scene.HealthPill to render.
-    nodeNamesToHealthPills: Object,
-    // The step of health pills to show throughout the graph.
-    healthPillStepIndex: Number,
-  },
-  observers: [
-    '_colorByChanged(colorBy)',
-    '_renderHierarchyChanged(renderHierarchy)',
-    // Animation and fitting must come after the observer for the hierarchy changing because we must
-    // first build the render hierarchy.
-    '_animateAndFit(_isAttached, renderHierarchy)',
-    '_updateHealthPills(nodeNamesToHealthPills, healthPillStepIndex)',
-  ],
-  getNode: function(nodeName) {
-    return this.renderHierarchy.getRenderNodeByName(nodeName);
-  },
-  isNodeExpanded: function(node) {
-    return node.expanded;
-  },
-  setNodeExpanded: function(renderNode) {
-    this._build(this.renderHierarchy);
-    this._updateLabels(!this._zoomed);
-  },
-  /**
-   * Resets the state of the component. Called whenever the whole graph
-   * (dataset) changes.
-   */
-  _resetState: function() {
-    // Reset the state of the component.
-    this._nodeGroupIndex = {};
-    this._annotationGroupIndex = {};
-    this._edgeGroupIndex = {};
-    this._updateLabels(false);
-    // Remove all svg elements under the 'root' svg group.
-    d3.select(this.$.svg).select('#root').selectAll('*').remove();
-    // And the defs.
-    d3.select(this.$.svg).select('defs #linearGradients')
-        .selectAll('*').remove();
-  },
-  /** Main method for building the scene */
-  _build: function(renderHierarchy) {
-    this.templateIndex = renderHierarchy.hierarchy.getTemplateIndex();
-    tf.graph.util.time('tf-graph-scene (layout):', function() {
-      // layout the scene for this meta / series node
-      tf.graph.layout.layoutScene(renderHierarchy.root, this);
-    }.bind(this));
-
-    tf.graph.util.time('tf-graph-scene (build scene):', function() {
-      tf.graph.scene.buildGroup(d3.select(this.$.root), renderHierarchy.root, this);
-      tf.graph.scene.addGraphClickListener(this.$.svg, this);
-      tf.graph.scene.node.traceInputs(renderHierarchy);
-    }.bind(this));
-    // Update the minimap again when the graph is done animating.
-    setTimeout(function() {
-      this._updateHealthPills(this.nodeNamesToHealthPills, this.healthPillStepIndex);
-      this.minimap.update();
-    }.bind(this), tf.graph.layout.PARAMS.animation.duration);
-  },
-  ready: function() {
-    this._zoom = d3.zoom()
-      .on('end', function() {
-        if (this._zoomStartCoords) {
-          // Calculate the total distance dragged during the zoom event.
-          // If it is sufficiently small, then fire an event indicating
-          // that zooming has ended. Otherwise wait to fire the zoom end
-          // event, so that a mouse click registered as part of this zooming
-          // is ignored (as this mouse click was part of a zooming, and should
-          // not be used to indicate an actual click on the graph).
-          var dragDistance = Math.sqrt(
-            Math.pow(this._zoomStartCoords.x - this._zoomTransform.x, 2) +
-            Math.pow(this._zoomStartCoords.y - this._zoomTransform.y, 2));
-          if (dragDistance < this._maxZoomDistanceForClick) {
-            this._fireEnableClick();
-          } else {
-            setTimeout(this._fireEnableClick.bind(this), 50);
-          }
-        }
-        this._zoomStartCoords = null;
-      }.bind(this))
-      .on('zoom', function() {
-        // Store the coordinates of the zoom event.
-        this._zoomTransform = d3.event.transform;
-
-        // If this is the first zoom event after a zoom-end, then
-        // store the coordinates as the start coordinates as well,
-        // and fire an event to indicate that zooming has started.
-        // This doesn't use the zoomstart event, as d3 sends this
-        // event on mouse-down, even if there has been no dragging
-        // done to translate the graph around.
-        if (!this._zoomStartCoords) {
-          this._zoomStartCoords = this._zoomTransform;
-          this.fire('disable-click');
-        }
-        this._zoomed = true;
-        d3.select(this.$.root).attr('transform', d3.event.transform);
-        // Notify the minimap.
-        this.minimap.zoom(d3.event.transform);
-      }.bind(this));
-    d3.select(this.$.svg).call(this._zoom)
-      .on('dblclick.zoom', null);
-    d3.select(window).on('resize', function() {
-      // Notify the minimap that the user's window was resized.
-      // The minimap will figure out the new dimensions of the main svg
-      // and will use the existing translate and scale params.
-      this.minimap.zoom();
-    }.bind(this));
-    // Initialize the minimap.
-    this.minimap = this.$.minimap.init(this.$.svg, this.$.root, this._zoom,
-        tf.graph.layout.PARAMS.minimap.size,
-        tf.graph.layout.PARAMS.subscene.meta.labelHeight);
-  },
-  attached: function() {
-    this.set('_isAttached', true);
-  },
-  detached: function() {
-    this.set('_isAttached', false);
-  },
-  _renderHierarchyChanged: function(renderHierarchy) {
-    this._hasRenderHierarchyBeenFitOnce = false;
-    this._resetState();
-    this._build(renderHierarchy);
-  },
-  _animateAndFit: function(isAttached, renderHierarchy) {
-    if (this._hasRenderHierarchyBeenFitOnce || !isAttached) {
-      // Do not animate and fit if the scene has already fitted this render hierarchy once. Or if
-      // the graph dashboard is not attached (in which case the scene lacks DOM info for fitting).
-      return;
-    }
-
-    // Fit to screen after the graph is done animating.
-    setTimeout(this.fit.bind(this), tf.graph.layout.PARAMS.animation.duration);
-  },
-  _updateLabels: function(showLabels) {
-    var mainGraphTitleElement = this.getElementsByClassName('title')[0];
-    var titleStyle = mainGraphTitleElement.style;
-    var auxTitleStyle = this.getElementsByClassName('auxTitle')[0].style;
-    var core = d3.select("." + tf.graph.scene.Class.Scene.GROUP + ">." +
-      tf.graph.scene.Class.Scene.CORE).node();
-    // Only show labels if the graph is fully loaded.
-    if (showLabels && core && this.progress && this.progress.value === 100) {
-      var aux =
-        d3.select("." + tf.graph.scene.Class.Scene.GROUP + ">." +
-          tf.graph.scene.Class.Scene.INEXTRACT).node() ||
-        d3.select("." + tf.graph.scene.Class.Scene.GROUP + ">." +
-          tf.graph.scene.Class.Scene.OUTEXTRACT).node();
-      var coreX = core.getCTM().e;
-      var auxX = aux ? aux.getCTM().e : null;
-      titleStyle.display = 'inline';
-      titleStyle.left = coreX + 'px';
-      if (auxX !== null && auxX !== coreX) {
-        auxTitleStyle.display = 'inline';
-
-        // Make sure that the aux title is positioned rightwards enough so as to
-        // prevent overlap with the main graph title.
-        auxX = Math.max(
-            coreX + mainGraphTitleElement.getBoundingClientRect().width, auxX);
-
-        auxTitleStyle.left = auxX + 'px';
-      } else {
-        auxTitleStyle.display = 'none';
-      }
-    } else {
-      titleStyle.display='none';
-      auxTitleStyle.display = 'none';
-    }
-  },
-  /**
-    * Called whenever the user changed the 'color by' option in the
-    * UI controls.
-    */
-  _colorByChanged: function() {
-    if (this.renderHierarchy != null) {
-      // We iterate through each svg node and update its state.
-      _.each(this._nodeGroupIndex, function(nodeGroup, nodeName) {
-        this._updateNodeState(nodeName);
-      }, this);
-      // Notify also the minimap.
-      this.minimap.update();
-    }
-  },
-  fit: function() {
-    this._hasRenderHierarchyBeenFitOnce = true;
-    tf.graph.scene.fit(this.$.svg, this.$.root, this._zoom, function() {
-      this._zoomed = false;
-    }.bind(this));
-  },
-  isNodeSelected: function(n) {
-    return n === this.selectedNode;
-  },
-  isNodeHighlighted: function(n) {
-    return n === this.highlightedNode;
-  },
-  addAnnotationGroup: function(a, d, selection) {
-    var an = a.node.name;
-    this._annotationGroupIndex[an] = this._annotationGroupIndex[an] || {};
-    this._annotationGroupIndex[an][d.node.name] = selection;
-  },
-  getAnnotationGroupsIndex: function(a) {
-    return this._annotationGroupIndex[a];
-  },
-  removeAnnotationGroup: function(a, d) {
-    delete this._annotationGroupIndex[a.node.name][d.node.name];
-  },
-  addNodeGroup: function(n, selection) {
-    this._nodeGroupIndex[n] = selection;
-  },
-  getNodeGroup: function(n) {
-    return this._nodeGroupIndex[n];
-  },
-  removeNodeGroup: function(n) {
-    delete this._nodeGroupIndex[n];
-  },
-  addEdgeGroup: function(n, selection) {
-    this._edgeGroupIndex[n] = selection;
-  },
-  getEdgeGroup: function(e) {
-    return this._edgeGroupIndex[e];
-  },
-  _updateHealthPills: function(nodeNamesToHealthPills, healthPillStepIndex) {
-    tf.graph.scene.addHealthPills(
-        this.$.svg, nodeNamesToHealthPills, healthPillStepIndex);
-  },
-  /**
-   * Update node and annotation node of the given name.
-   * @param  {String} n node name
-   */
-  _updateNodeState: function(n) {
-    var node = this.getNode(n);
-    var nodeGroup = this.getNodeGroup(n);
-
-    if (nodeGroup) {
-      tf.graph.scene.node.stylize(nodeGroup, node, this);
-    }
-
-    var annotationGroupIndex = this.getAnnotationGroupsIndex(n);
-    _.each(annotationGroupIndex, function(aGroup, hostName) {
-      tf.graph.scene.node.stylize(aGroup, node, this,
-          tf.graph.scene.Class.Annotation.NODE);
-    }, this);
-  },
-
-  /**
-   * Handles new node selection. 1) Updates the selected-state of each node,
-   * 2) triggers input tracing.
-   * @param selectedNode {string} The name of the newly selected node.
-   * @param oldSelectedNode {string} The name of the previously selected node.
-   * @private
-   */
-  _selectedNodeChanged: function(selectedNode, oldSelectedNode) {
-    if (selectedNode === oldSelectedNode) {
-      return;
-    }
-
-    if (selectedNode) {
-      this._updateNodeState(selectedNode);
-    }
-    if (oldSelectedNode) {
-      this._updateNodeState(oldSelectedNode);
-    }
-
-    tf.graph.scene.node.traceInputs(this.renderHierarchy);
-
-    if (!selectedNode) {
-      return;
-    }
-
-
-    // Update the minimap to reflect the highlighted (selected) node.
-    this.minimap.update();
-    var node = this.renderHierarchy.hierarchy.node(selectedNode);
-    var nodeParents = [];
-    // Create list of all metanode parents of the selected node.
-    while (node.parentNode != null
-        && node.parentNode.name != tf.graph.ROOT_NAME) {
-      node = node.parentNode;
-      nodeParents.push(node.name);
-    }
-    // Ensure each parent metanode is built and expanded.
-    var topParentNodeToBeExpanded;
-    _.forEachRight(nodeParents, function(parentName) {
-      this.renderHierarchy.buildSubhierarchy(parentName);
-      var renderNode = this.renderHierarchy.getRenderNodeByName(parentName);
-      if (renderNode.node.isGroupNode && !renderNode.expanded) {
-        renderNode.expanded = true;
-        if (!topParentNodeToBeExpanded) {
-          topParentNodeToBeExpanded = renderNode;
-        }
-      }
-    }, this);
-    // If any expansion was needed to display this selected node, then
-    // inform the scene of the top-most expansion.
-    if (topParentNodeToBeExpanded) {
-      this.setNodeExpanded(topParentNodeToBeExpanded);
-      this._zoomed = true;
-    }
-
-    if (tf.graph.scene.panToNode(selectedNode, this.$.svg, this.$.root,
-        this._zoom)) {
-      this._zoomed = true;
-    }
-  },
-  _highlightedNodeChanged: function(highlightedNode, oldHighlightedNode) {
-    if (highlightedNode === oldHighlightedNode) {
-      return;
-    }
-
-    if (highlightedNode) {
-      this._updateNodeState(highlightedNode);
-    }
-    if (oldHighlightedNode) {
-      this._updateNodeState(oldHighlightedNode);
-    }
-  },
-  _onZoomChanged: function() {
-    this._updateLabels(!this._zoomed);
-  },
-  _fireEnableClick: function() {
-    this.fire('enable-click');
-  },
-});
-</script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_graph/tf-graph.html b/tensorflow/tensorboard/components/tf_graph/tf-graph.html
deleted file mode 100644
index efbf065a40ac80d3a45f6fe304841c98ed51a02b..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph/tf-graph.html
+++ /dev/null
@@ -1,316 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../iron-flex-layout/iron-flex-layout.html">
-<link rel="import" href="../iron-icons/iron-icons.html">
-<link rel="import" href="../paper-button/paper-button.html">
-<link rel="import" href="../paper-input/paper-input.html">
-<link rel="import" href="../paper-toggle-button/paper-toggle-button.html">
-<link rel="import" href="../tf-graph-common/tf-graph-common.html">
-<link rel="import" href="tf-graph-scene.html">
-
-<dom-module id="tf-graph">
-<template>
-<style>
-.container {
-  width: 100%;
-  height: 100%;
-  background: white;
-  box-shadow: 0 1px 5px rgba(0,0,0,0.2);
-}
-
-.vertical {
-  width:100%;
-  height:100%;
-  @apply(--layout-vertical);
-}
-
-.auto {
-  @apply(--layout-flex-auto);
-  @apply(--layout-vertical);
-}
-
-h2 {
-  text-align: center;
-}
-
-paper-button {
-  text-transform: none;
-}
-</style>
-<div class="container">
-  <div class="vertical">
-    <template is="dom-if" if="[[title]]">
-      <h2>[[title]]</h2>
-    </template>
-    <tf-graph-scene id="scene" class="auto"
-          render-hierarchy="[[renderHierarchy]]"
-          highlighted-node="[[_getVisible(highlightedNode)]]"
-          selected-node="{{selectedNode}}"
-          color-by="[[colorBy]]"
-          progress="[[progress]]"
-          node-names-to-health-pills="[[nodeNamesToHealthPills]]"
-          health-pill-step-index="{{healthPillStepIndex}}"
-    ></tf-graph-scene>
-  </div>
-</div>
-</template>
-</dom-module>
-
-<script>
-Polymer({
-
-  is: 'tf-graph',
-
-  properties: {
-    graphHierarchy: {
-      type: Object,
-      notify: true,
-      observer: '_graphChanged'
-    },
-    basicGraph: Object,
-    stats: Object,
-    devicesForStats: Object,
-    hierarchyParams: Object,
-    progress: {
-      type: Object,
-      notify: true,
-    },
-    title: String,
-    selectedNode: {
-      type: String,
-      notify: true,
-    },
-    highlightedNode: {
-      type: String,
-      notify: true
-    },
-    /** What to color the nodes by (compute time, memory, device etc.) */
-    colorBy: String,
-    colorByParams: {
-      type: Object,
-      notify: true,
-      readOnly: true, // Produces and doesn't consume.
-    },
-    renderHierarchy: {
-      type: Object,
-      readOnly: true,
-      notify: true,
-    },
-    _renderDepth: {
-      type: Number,
-      value: 1
-    },
-    _allowGraphSelect: {
-      type: Boolean,
-      value: true
-    },
-    // A mapping between node name to the tf.graph.scene.HealthPill to render.
-    nodeNamesToHealthPills: Object,
-    // The step of health pills to show throughout the graph.
-    healthPillStepIndex: Number,
-  },
-  observers: [
-    '_statsChanged(stats, devicesForStats)',
-    '_buildRenderHierarchy(graphHierarchy)'
-  ],
-  _statsChanged: function(stats, devicesForStats) {
-    if (this.graphHierarchy) {
-      if (stats && devicesForStats) {
-        tf.graph.joinStatsInfoWithGraph(this.basicGraph, stats, devicesForStats);
-        tf.graph.hierarchy.joinAndAggregateStats(this.graphHierarchy, stats);
-      }
-      // Recompute the rendering information.
-      this._buildRenderHierarchy(this.graphHierarchy);
-    }
-  },
-  _buildRenderHierarchy: function(graphHierarchy) {
-    tf.graph.util.time('new tf.graph.render.Hierarchy', function() {
-      if (graphHierarchy.root.type !== tf.graph.NodeType.META) {
-        // root must be metanode but sometimes Polymer's dom-if has not
-        // remove tf-graph element yet in <tf-node-info>
-        // and thus mistakenly pass non-metanode to this module.
-        return;
-      }
-      var renderGraph = new tf.graph.render.RenderGraphInfo(
-          graphHierarchy, !!this.stats /** displayingStats */);
-      // Producing the 'color by' parameters to be consumed
-      // by the tf-graph-controls panel. It contains information about the
-      // min and max values and their respective colors, as well as list
-      // of devices with their respective colors.
-
-      function getColorParamsFromScale(scale) {
-        return {
-          minValue: scale.domain()[0],
-          maxValue: scale.domain()[1],
-          startColor: scale.range()[0],
-          endColor: scale.range()[1]
-        };
-      }
-
-      this._setColorByParams({
-        compute_time: getColorParamsFromScale(renderGraph.computeTimeScale),
-        memory: getColorParamsFromScale(renderGraph.memoryUsageScale),
-        device: _.map(renderGraph.deviceColorMap.domain(),
-            function(deviceName) {
-          return {
-            device: deviceName,
-            color: renderGraph.deviceColorMap(deviceName)
-          };
-        }),
-        xla_cluster: _.map(renderGraph.xlaClusterColorMap.domain(),
-            function(xlaClusterName) {
-          return {
-            xla_cluster: xlaClusterName,
-            color: renderGraph.xlaClusterColorMap(xlaClusterName)
-          };
-        }),
-      });
-      this._setRenderHierarchy(renderGraph);
-      this.async(function() {
-        this.fire("rendered");
-      });
-    }.bind(this));
-  },
-  _getVisible: function(name) {
-    if (!name) {
-      return name;
-    }
-    return this.renderHierarchy.getNearestVisibleAncestor(name);
-  },
-  listeners: {
-    'graph-select': '_graphSelected',
-    'disable-click': '_disableClick',
-    'enable-click': '_enableClick',
-    // Nodes
-    'node-toggle-expand': '_nodeToggleExpand',
-    'node-select': '_nodeSelected',
-    'node-highlight': '_nodeHighlighted',
-    'node-unhighlight': '_nodeUnhighlighted',
-    'node-toggle-extract': '_nodeToggleExtract',
-    'node-toggle-seriesgroup': '_nodeToggleSeriesGroup',
-
-    // Annotations
-
-    /* Note: currently highlighting/selecting annotation node has the same
-      * behavior as highlighting/selecting actual node so we point to the same
-      * set of event listeners.  However, we might redesign this to be a bit
-      * different.
-      */
-    'annotation-select': '_nodeSelected',
-    'annotation-highlight': '_nodeHighlighted',
-    'annotation-unhighlight': '_nodeUnhighlighted',
-  },
-  _graphChanged: function() {
-    // When a new graph is loaded, fire this event so that there is no
-    // info-card being displayed for the previously-loaded graph.
-    this.fire('graph-select');
-  },
-  _graphSelected: function(event) {
-    // Graph selection is not allowed during an active zoom event, as the
-    // click seen during a zoom/pan is part of the zooming and does not
-    // indicate a user desire to click on a specific section of the graph.
-    if (this._allowGraphSelect) {
-      this.set('selectedNode', null);
-    }
-    // Reset this variable as a bug in d3 zoom behavior can cause zoomend
-    // callback not to be called if a right-click happens during a zoom event.
-    this._allowGraphSelect = true;
-  },
-  _disableClick: function(event) {
-    this._allowGraphSelect = false;
-  },
-  _enableClick: function(event) {
-    this._allowGraphSelect = true;
-  },
-  _nodeSelected: function(event) {
-    if (this._allowGraphSelect) {
-      this.set('selectedNode', event.detail.name);
-    }
-    // Reset this variable as a bug in d3 zoom behavior can cause zoomend
-    // callback not to be called if a right-click happens during a zoom event.
-    this._allowGraphSelect = true;
-  },
-  _nodeHighlighted: function(event) {
-    this.set('highlightedNode', event.detail.name);
-  },
-  _nodeUnhighlighted: function(event) {
-    this.set('highlightedNode', null);
-  },
-  _nodeToggleExpand: function(event) {
-    // Immediately select the node that is about to be expanded.
-    this._nodeSelected(event);
-
-    // Compute the sub-hierarchy scene.
-    var nodeName = event.detail.name;
-    var renderNode = this.renderHierarchy.getRenderNodeByName(nodeName);
-    // Op nodes are not expandable.
-    if (renderNode.node.type === tf.graph.NodeType.OP) {
-      return;
-    }
-    this.renderHierarchy.buildSubhierarchy(nodeName);
-    renderNode.expanded = !renderNode.expanded;
-
-    // Expand the node with some delay so that the user can immediately see
-    // the visual effect of selecting that node, before the expansion is
-    // done.
-    this.async(function() {
-      this.querySelector('#scene').setNodeExpanded(renderNode);
-    }, 75);
-  },
-  _nodeToggleExtract: function(event) {
-    // Toggle the include setting of the specified node appropriately.
-    var nodeName = event.detail.name;
-    var renderNode = this.renderHierarchy.getRenderNodeByName(nodeName);
-    if (renderNode.node.include == tf.graph.InclusionType.INCLUDE) {
-      renderNode.node.include = tf.graph.InclusionType.EXCLUDE;
-    } else if (renderNode.node.include == tf.graph.InclusionType.EXCLUDE) {
-      renderNode.node.include = tf.graph.InclusionType.INCLUDE;
-    } else {
-      renderNode.node.include =
-       this.renderHierarchy.isNodeAuxiliary(renderNode)
-          ? tf.graph.InclusionType.INCLUDE : tf.graph.InclusionType.EXCLUDE;
-    }
-
-    // Rebuild the render hierarchy.
-    this._buildRenderHierarchy(this.graphHierarchy);
-  },
-  _nodeToggleSeriesGroup: function(event) {
-    // Toggle the group setting of the specified node appropriately.
-    var nodeName = event.detail.name;
-    tf.graph.toggleNodeSeriesGroup(this.hierarchyParams.seriesMap, nodeName);
-
-    // Rebuild the render hierarchy with the updated series grouping map.
-    this.set('progress', {
-      value: 0,
-      msg: ''
-    });
-    var tracker = tf.graph.util.getTracker(this);
-    var hierarchyTracker = tf.graph.util.getSubtaskTracker(tracker, 100,
-          'Namespace hierarchy');
-    tf.graph.hierarchy.build(this.basicGraph, this.hierarchyParams, hierarchyTracker)
-    .then(function(graphHierarchy) {
-      this.set('graphHierarchy', graphHierarchy);
-      this._buildRenderHierarchy(this.graphHierarchy);
-    }.bind(this));
-  },
-  not: function(x) {
-    return !x;
-  }
-});
-</script>
diff --git a/tensorflow/tensorboard/components/tf_graph_app/BUILD b/tensorflow/tensorboard/components/tf_graph_app/BUILD
deleted file mode 100644
index d0b6d79640db0eab91f36e73a7fdab13f2fb5ce9..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_app/BUILD
+++ /dev/null
@@ -1,47 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_graph_app",
-    srcs = [
-        "index.html",
-        "tf-graph-app.html",
-    ],
-    path = "/tf-graph-app",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_graph_board",
-        "//tensorflow/tensorboard/components/tf_graph_controls",
-        "//tensorflow/tensorboard/components/tf_graph_loader",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "@org_polymer_iron_component_page",
-    ],
-)
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [":tf_graph_app"],
-    visibility = [
-        "//learning/brain/python/client/colab:__pkg__",
-        "//learning/vis/vz_elements/catalog:__pkg__",
-    ],
-    destdir = "tf-graph-app",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_graph_board:legacy",
-        "//tensorflow/tensorboard/components/tf_graph_controls:legacy",
-        "//tensorflow/tensorboard/components/tf_graph_loader:legacy",
-        "//third_party/javascript/polymer/v1/iron-component-page:lib",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-        "//third_party/javascript/polymer/v1/webcomponentsjs:lib",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_graph_app/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_app/demo/BUILD
deleted file mode 100644
index 0205e2fd92c2bc6d7be3c6701c34d0f30d323173..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_app/demo/BUILD
+++ /dev/null
@@ -1,23 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_app/demo
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"] + glob(["data/**"]),
-    path = "/tf-graph-app/demo",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_graph_app",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_graph_app/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_app/demo/data/graph.pbtxt
deleted file mode 100644
index 8b95b258df4806dcf84e3b4c1c14cd0434df8910..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_app/demo/data/graph.pbtxt
+++ /dev/null
@@ -1,90 +0,0 @@
-node {
-  name: "life"
-  op: "Const"
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 2
-      }
-    }
-  }
-}
-node {
-  name: "universe"
-  op: "Const"
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 40
-      }
-    }
-  }
-}
-node {
-  name: "everything"
-  op: "Const"
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "Add"
-  op: "Add"
-  input: "life"
-  input: "universe"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-}
-node {
-  name: "answer"
-  op: "Add"
-  input: "Add"
-  input: "everything"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-}
-versions {
-  producer: 10
-}
diff --git a/tensorflow/tensorboard/components/tf_graph_app/demo/index.html b/tensorflow/tensorboard/components/tf_graph_app/demo/index.html
deleted file mode 100644
index f71feea390a958b447e046e815cb36ec2152a1aa..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_app/demo/index.html
+++ /dev/null
@@ -1,34 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-<meta charset="utf-8">
-<meta name="viewport" content="width=device-width, initial-scale=1.0">
-<link rel="import" href="../tf-graph-app.html">
-<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-<style>
-  /** Make the graph app tall enough so the bottom legend does not overlap with the top. */
-  tf-graph-app, .container.tf-graph-app {
-    display: block;
-    height: 700px;
-  }
-</style>
-<h3>Answer to the Ultimate Question of Life, the Universe, and Everything</h3>
-<demo-snippet>
-  <template>
-    <tf-graph-app pbtxt-file-location="data/graph.pbtxt"></tf-graph-app>
-  </template>
-</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_app/index.html b/tensorflow/tensorboard/components/tf_graph_app/index.html
deleted file mode 100644
index c80fbf4f632696ba48c424599b7a84eeb77ecead..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_app/index.html
+++ /dev/null
@@ -1,30 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-  <head>
-    <title>vz-vega</title>
-    <meta charset="utf-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <script src="../webcomponentsjs/webcomponents-lite.js"></script>
-    <link rel="import" href="../iron-component-page/iron-component-page.html">
-  </head>
-  <body>
-    <iron-component-page src="tf-graph-app.html"></iron-component-page>
-  </body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_graph_app/tf-graph-app.html b/tensorflow/tensorboard/components/tf_graph_app/tf-graph-app.html
deleted file mode 100644
index 915b54a06a9efe5e2bcbd60edcd2021df3304ce3..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_app/tf-graph-app.html
+++ /dev/null
@@ -1,152 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-graph-board/tf-graph-board.html">
-<link rel="import" href="../tf-graph-loader/tf-graph-loader.html">
-<link rel="import" href="../tf-graph-controls/tf-graph-controls.html">
-
-<!--
-Stand alone element of tf-graph for embedding.
-
-The pbtxt format is the stringified version of the graphdef.
-
-    <tf-graph-app pbtxt="[[pbtxt]]"></tf-graph-app>
-
-    import tensorflow as tf
-    life = tf.constant(2, name='life')
-    universe = tf.constant(40, name='universe')
-    everything = tf.constant(0, name='everything')
-    lifeuniverse = tf.add(life, universe)
-    answer = tf.add(lifeuniverse, everything, name='answer')
-    open("graph.pbtxt", "w").write(str(tf.get_default_graph().as_graph_def()))
-
-@demo demo/index.html
--->
-<dom-module id="tf-graph-app">
-<template>
-<style>
-
-:host /deep/ {
-  font-family: 'Roboto', sans-serif;
-}
-
-.main {
-  position: absolute;
-  right: 0;
-  left: 250px;
-  height: 100%;
-}
-
-.side {
-  position: absolute;
-  left: 0;
-  width: 250px;
-  height: 100%;
-  border: 1px solid black;
-  box-sizing: border-box;
-}
-
-.all {
-  position: relative;
-  width: 100%;
-  height: 100%
-}
-
-.container {
-  height: 650px;
-}
-
-</style>
-<div class="container">
-  <div class="all">
-    <div class="side">
-      <tf-graph-controls
-          color-by-params="[[colorByParams]]"
-          stats="[[stats]]"
-          color-by="{{colorBy}}"
-          render-hierarchy="[[_renderHierarchy]]"
-      ></tf-graph-controls>
-      <tf-graph-loader id="loader"
-          out-graph-hierarchy="{{graphHierarchy}}"
-          out-graph="{{graph}}"
-          out-stats="{{stats}}"
-          progress="{{_progress}}"
-      ></tf-graph-loader>
-    </div>
-    <div class="main">
-      <tf-graph-board id="graphboard"
-          graph-hierarchy="[[graphHierarchy]]"
-          graph="[[graph]]"
-          stats="[[stats]]"
-          progress="[[_progress]]"
-          color-by="[[colorBy]]"
-          color-by-params="{{colorByParams}}"
-          render-hierarchy="{{_renderHierarchy}}"
-      ></tf-graph-board>
-    </div>
-  </div>
-</div>
-</template>
-</dom-module>
-
-<script>
-(function(){
-
-Polymer({
-  is: 'tf-graph-app',
-  properties: {
-    stats: Object,
-
-    // To use tf-graph-app, specify one of these 2 properties. Provide either
-    // 1. The path to a pbtxt file to load (pbtxtFileLocation). This option nicely makes the
-    //    progress bar include the time it takes to load the file across the network. The path could
-    //    be either a relative path or an absolute URL (of a resource that supports CORS).
-    // 2. The raw contents of a pbtxt file (pbtxt).
-    // Do not set both of these 2 properties.
-    pbtxtFileLocation: {
-      type: String,
-      observer: '_updateGraph',
-    },
-    pbtxt: {
-      type: String,
-      observer: '_updateGraph',
-    },
-
-    _renderHierarchy: Object,
-    _progress: Object,
-  },
-  _updateGraph: function() {
-    if (this.pbtxtFileLocation) {
-      // Fetch a pbtxt file. The fetching will be part of the loading sequence.
-      this.$.loader.datasets = [{
-        // Just name the dataset based on the file location.
-        "name": this.pbtxtFileLocation,
-        "path": this.pbtxtFileLocation,
-      }];
-      this.$.loader.set('selectedDataset', 0);
-    } else if (this.pbtxt) {
-      // Render the provided pbtxt.
-      var blob = new Blob([this.pbtxt]);
-
-      // TODO(chizeng): Find out why we call a private method here and do away with the call.
-      this.$.loader._parseAndConstructHierarchicalGraph(null, blob);
-    }
-  },
-});
-})();
-</script>
diff --git a/tensorflow/tensorboard/components/tf_graph_board/BUILD b/tensorflow/tensorboard/components/tf_graph_board/BUILD
deleted file mode 100644
index 866112e02122f979185f04ba55e0bddb4029fdb3..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_board/BUILD
+++ /dev/null
@@ -1,38 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_graph_board",
-    srcs = ["tf-graph-board.html"],
-    path = "/tf-graph-board",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_graph",
-        "//tensorflow/tensorboard/components/tf_graph_common",
-        "//tensorflow/tensorboard/components/tf_graph_info",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "@org_polymer_paper_progress",
-    ],
-)
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [":tf_graph_board"],
-    destdir = "tf-graph-board",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_graph:legacy",
-        "//tensorflow/tensorboard/components/tf_graph_common:legacy",
-        "//tensorflow/tensorboard/components/tf_graph_info:legacy",
-        "//third_party/javascript/polymer/v1/paper-progress:lib",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_graph_board/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_board/demo/BUILD
deleted file mode 100644
index 07e8d43dbeed95c1fb548c8c88867f53d5625cf4..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_board/demo/BUILD
+++ /dev/null
@@ -1,26 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_board/demo
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"] + glob(["data/**"]),
-    path = "/tf-graph-board/demo",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_graph_board",
-        "//tensorflow/tensorboard/components/tf_graph_common",
-        "//tensorflow/tensorboard/components/tf_graph_loader",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_graph_board/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_board/demo/data/graph.pbtxt
deleted file mode 100644
index 30b206453469801d31b46856c29cdda78164f18f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_board/demo/data/graph.pbtxt
+++ /dev/null
@@ -1,4606 +0,0 @@
-node {
-  name: "GradientDescent/learning_rate"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_3"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 0.1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 100
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000d\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/BroadcastGradientArgs"
-  op: "BroadcastGradientArgs"
-  input: "gradients/add_grad/Shape"
-  input: "gradients/add_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-        }
-        shape {
-          dim {
-            size: -1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 10
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/BroadcastGradientArgs"
-  op: "BroadcastGradientArgs"
-  input: "gradients/add_1_grad/Shape"
-  input: "gradients/add_1_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-        }
-        shape {
-          dim {
-            size: -1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_1_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: -1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_3_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Maximum/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Const_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Const"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Prod_1"
-  op: "Prod"
-  input: "gradients/Mean_grad/Shape_1"
-  input: "gradients/Mean_grad/Const_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Maximum"
-  op: "Maximum"
-  input: "gradients/Mean_grad/Prod_1"
-  input: "gradients/Mean_grad/Maximum/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Prod"
-  op: "Prod"
-  input: "gradients/Mean_grad/Shape"
-  input: "gradients/Mean_grad/Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/floordiv"
-  op: "FloorDiv"
-  input: "gradients/Mean_grad/Prod"
-  input: "gradients/Mean_grad/Maximum"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Cast"
-  op: "Cast"
-  input: "gradients/Mean_grad/floordiv"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "DstT"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "SrcT"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Tile/multiples"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Reshape/shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Const"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Fill"
-  op: "Fill"
-  input: "gradients/Shape"
-  input: "gradients/Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/Fill"
-  input: "gradients/Mean_grad/Reshape/shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Tile"
-  op: "Tile"
-  input: "gradients/Mean_grad/Reshape"
-  input: "gradients/Mean_grad/Tile/multiples"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tmultiples"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/truediv"
-  op: "RealDiv"
-  input: "gradients/Mean_grad/Tile"
-  input: "gradients/Mean_grad/Cast"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_3_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/Mean_grad/truediv"
-  input: "gradients/Reshape_3_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
-  op: "ExpandDims"
-  input: "gradients/Reshape_3_grad/Reshape"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tdim"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Const"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "Slice_2/begin"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "Sub_2/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "concat_1/axis"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "concat_1/values_0"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: -1
-      }
-    }
-  }
-}
-node {
-  name: "Slice_1/size"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Sub_1/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Shape_2"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "Rank_2"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 2
-      }
-    }
-  }
-}
-node {
-  name: "Sub_1"
-  op: "Sub"
-  input: "Rank_2"
-  input: "Sub_1/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Slice_1/begin"
-  op: "Pack"
-  input: "Sub_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 1
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: 0
-    }
-  }
-}
-node {
-  name: "Slice_1"
-  op: "Slice"
-  input: "Shape_2"
-  input: "Slice_1/begin"
-  input: "Slice_1/size"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "Index"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "concat_1"
-  op: "ConcatV2"
-  input: "concat_1/values_0"
-  input: "Slice_1"
-  input: "concat_1/axis"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 2
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "concat/axis"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "concat/values_0"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: -1
-      }
-    }
-  }
-}
-node {
-  name: "Slice/size"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Sub/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "Rank_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 2
-      }
-    }
-  }
-}
-node {
-  name: "Sub"
-  op: "Sub"
-  input: "Rank_1"
-  input: "Sub/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Slice/begin"
-  op: "Pack"
-  input: "Sub"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 1
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: 0
-    }
-  }
-}
-node {
-  name: "Slice"
-  op: "Slice"
-  input: "Shape_1"
-  input: "Slice/begin"
-  input: "Slice/size"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "Index"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "concat"
-  op: "ConcatV2"
-  input: "concat/values_0"
-  input: "Slice"
-  input: "concat/axis"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 2
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "Rank"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 2
-      }
-    }
-  }
-}
-node {
-  name: "Sub_2"
-  op: "Sub"
-  input: "Rank"
-  input: "Sub_2/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Slice_2/size"
-  op: "Pack"
-  input: "Sub_2"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 1
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: 0
-    }
-  }
-}
-node {
-  name: "Slice_2"
-  op: "Slice"
-  input: "Shape"
-  input: "Slice_2/begin"
-  input: "Slice_2/size"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "Index"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "logits_biases"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 10
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "logits_biases/read"
-  op: "Identity"
-  input: "logits_biases"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "logits_weights"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 100
-        }
-        dim {
-          size: 10
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "logits_weights/read"
-  op: "Identity"
-  input: "logits_weights"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "hidden_biases"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 100
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "hidden_biases/read"
-  op: "Identity"
-  input: "hidden_biases"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "hidden_weights"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 784
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 784
-        }
-        dim {
-          size: 100
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "hidden_weights/read"
-  op: "Identity"
-  input: "hidden_weights"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 784
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Reshape/shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\377\377\377\377"
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot/depth"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 10
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot/off_value"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot/on_value"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_1/random_shuffle_queue"
-  op: "RandomShuffleQueueV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "capacity"
-    value {
-      i: 20000
-    }
-  }
-  attr {
-    key: "component_types"
-    value {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "min_after_dequeue"
-    value {
-      i: 4000
-    }
-  }
-  attr {
-    key: "seed"
-    value {
-      i: 0
-    }
-  }
-  attr {
-    key: "seed2"
-    value {
-      i: 0
-    }
-  }
-  attr {
-    key: "shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 28
-          }
-          dim {
-            size: 28
-          }
-          dim {
-            size: 1
-          }
-        }
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
-  op: "QueueDequeueManyV2"
-  input: "mnist_dataset_train_1/random_shuffle_queue"
-  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          unknown_rank: true
-        }
-        shape {
-          unknown_rank: true
-        }
-      }
-    }
-  }
-  attr {
-    key: "component_types"
-    value {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    key: "timeout_ms"
-    value {
-      i: -1
-    }
-  }
-}
-node {
-  name: "Reshape"
-  op: "Reshape"
-  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
-  input: "Reshape/shape"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: -1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "MatMul"
-  op: "MatMul"
-  input: "Reshape"
-  input: "hidden_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "add"
-  op: "Add"
-  input: "MatMul"
-  input: "hidden_biases/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Relu"
-  op: "Relu"
-  input: "add"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "MatMul_1"
-  op: "MatMul"
-  input: "Relu"
-  input: "logits_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "add_1"
-  op: "Add"
-  input: "MatMul_1"
-  input: "logits_biases/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Reshape_1"
-  op: "Reshape"
-  input: "add_1"
-  input: "concat"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot"
-  op: "OneHot"
-  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
-  input: "mnist_dataset_train_2/one_hot/depth"
-  input: "mnist_dataset_train_2/one_hot/on_value"
-  input: "mnist_dataset_train_2/one_hot/off_value"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "TI"
-    value {
-      type: DT_INT64
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          unknown_rank: true
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: -1
-    }
-  }
-}
-node {
-  name: "Reshape_2"
-  op: "Reshape"
-  input: "mnist_dataset_train_2/one_hot"
-  input: "concat_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "SoftmaxCrossEntropyWithLogits"
-  op: "SoftmaxCrossEntropyWithLogits"
-  input: "Reshape_1"
-  input: "Reshape_2"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
-  op: "PreventGradient"
-  input: "SoftmaxCrossEntropyWithLogits:1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "message"
-    value {
-      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
-  op: "Mul"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_1_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
-  input: "gradients/Reshape_1_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Sum_1"
-  op: "Sum"
-  input: "gradients/Reshape_1_grad/Reshape"
-  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Reshape_1"
-  op: "Reshape"
-  input: "gradients/add_1_grad/Sum_1"
-  input: "gradients/add_1_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Sum"
-  op: "Sum"
-  input: "gradients/Reshape_1_grad/Reshape"
-  input: "gradients/add_1_grad/BroadcastGradientArgs"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/add_1_grad/Sum"
-  input: "gradients/add_1_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/add_1_grad/Reshape"
-  input: "^gradients/add_1_grad/Reshape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/add_1_grad/Reshape_1"
-  input: "^gradients/add_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_1_grad/Reshape_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "logits_biases"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/add_1_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/tuple/control_dependency"
-  op: "Identity"
-  input: "gradients/add_1_grad/Reshape"
-  input: "^gradients/add_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_1_grad/Reshape"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/MatMul_1"
-  op: "MatMul"
-  input: "Relu"
-  input: "gradients/add_1_grad/tuple/control_dependency"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: true
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/MatMul"
-  op: "MatMul"
-  input: "gradients/add_1_grad/tuple/control_dependency"
-  input: "logits_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: true
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/MatMul_1_grad/MatMul"
-  input: "^gradients/MatMul_1_grad/MatMul_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/MatMul_1_grad/MatMul_1"
-  input: "^gradients/MatMul_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "logits_weights"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/tuple/control_dependency"
-  op: "Identity"
-  input: "gradients/MatMul_1_grad/MatMul"
-  input: "^gradients/MatMul_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/MatMul_1_grad/MatMul"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Relu_grad/ReluGrad"
-  op: "ReluGrad"
-  input: "gradients/MatMul_1_grad/tuple/control_dependency"
-  input: "Relu"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Sum_1"
-  op: "Sum"
-  input: "gradients/Relu_grad/ReluGrad"
-  input: "gradients/add_grad/BroadcastGradientArgs:1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Reshape_1"
-  op: "Reshape"
-  input: "gradients/add_grad/Sum_1"
-  input: "gradients/add_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Sum"
-  op: "Sum"
-  input: "gradients/Relu_grad/ReluGrad"
-  input: "gradients/add_grad/BroadcastGradientArgs"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/add_grad/Sum"
-  input: "gradients/add_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/add_grad/Reshape"
-  input: "^gradients/add_grad/Reshape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/add_grad/Reshape_1"
-  input: "^gradients/add_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_grad/Reshape_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "hidden_biases"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/add_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/tuple/control_dependency"
-  op: "Identity"
-  input: "gradients/add_grad/Reshape"
-  input: "^gradients/add_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_grad/Reshape"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/MatMul_1"
-  op: "MatMul"
-  input: "Reshape"
-  input: "gradients/add_grad/tuple/control_dependency"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: true
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/MatMul"
-  op: "MatMul"
-  input: "gradients/add_grad/tuple/control_dependency"
-  input: "hidden_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 784
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: true
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/MatMul_grad/MatMul"
-  input: "^gradients/MatMul_grad/MatMul_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/MatMul_grad/MatMul_1"
-  input: "^gradients/MatMul_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/MatMul_grad/MatMul_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "hidden_weights"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/MatMul_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 784
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "GradientDescent"
-  op: "NoOp"
-  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
-  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
-  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
-  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_2"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "Reshape_3"
-  op: "Reshape"
-  input: "SoftmaxCrossEntropyWithLogits"
-  input: "Slice_2"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Mean"
-  op: "Mean"
-  input: "Reshape_3"
-  input: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "_send_Mean_0"
-  op: "_Send"
-  input: "Mean"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "client_terminated"
-    value {
-      b: true
-    }
-  }
-  attr {
-    key: "recv_device"
-    value {
-      s: "/job:localhost/replica:0/task:0/cpu:0"
-    }
-  }
-  attr {
-    key: "send_device"
-    value {
-      s: "/job:localhost/replica:0/task:0/cpu:0"
-    }
-  }
-  attr {
-    key: "send_device_incarnation"
-    value {
-      i: -5924635994370253548
-    }
-  }
-  attr {
-    key: "tensor_name"
-    value {
-      s: "Mean:0"
-    }
-  }
-}
-library {
-}
-versions {
-  producer: 21
-}
diff --git a/tensorflow/tensorboard/components/tf_graph_board/demo/index.html b/tensorflow/tensorboard/components/tf_graph_board/demo/index.html
deleted file mode 100644
index 2563e1595e9648fafea8d3632ece3af7732bf642..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_board/demo/index.html
+++ /dev/null
@@ -1,98 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../tf-graph-board.html">
-<link rel="import" href="../../tf-graph-common/tf-graph-common.html">
-<link rel="import" href="../../tf-graph-loader/tf-graph-loader.html">
-<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-<title>TF Graph Board Demo</title>
-<style>
-  #demo-container {
-    border: 2px solid #808080;
-    width: 1000px;
-    height: 600px;
-  }
-
-  /** Make the graph take up the entire height of the demo container. */
-  tf-graph-board-demo, #board, #board > div {
-    display: block;
-    height: 100%;
-  }
-</style>
-<demo-snippet>
-  <template>
-    <dom-module id="tf-graph-board-demo">
-      <template>
-        <!-- We first use the graph loader to load and parse a pbtxt file into a graph object. -->
-        <tf-graph-loader
-            id="loader"
-            datasets="[[_datasets]]"
-            selected-dataset="[[_selectedDataset]]"
-            out-graph="{{_graph}}">
-        </tf-graph-loader>
-
-        <!-- We color ops in the graph by XLA cluster. -->
-        <tf-graph-board id="board" color-by="xla_cluster"></tf-graph-board>
-      </template>
-      <script>
-        "use strict";
-
-        Polymer({
-          is: "tf-graph-board-demo",
-          properties: {
-            // We tell the graph loader to load a specific pbtxt file.
-            _datasets: {
-              type: Array,
-              value: [{
-                "name": "Graph with XLA Clusters Specified",
-                "path": "data/graph.pbtxt"
-              }],
-            },
-            _selectedDataset: {
-              type: Number,
-              value: 0,
-            },
-
-            // This property will be updated by the graph loader.
-            _graph: {
-              type: Object,
-            },
-          },
-          observers: [
-            '_graphUpdated(_graph)',
-          ],
-          _graphUpdated: function(slimGraph) {
-            const tracker = tf.graph.util.getTracker(this.$.loader);
-            const hierarchyTracker = tf.graph.util.getSubtaskTracker(
-                tracker, 100, 'Namespace hierarchy');
-            const hierarchyOptions = {};
-            tf.graph.hierarchy.build(slimGraph, hierarchyOptions, hierarchyTracker).then(
-                function(graphHierarchy) {
-              // We have parsed and built the graph object from a pbtxt file. Render the graph.
-              this.$.board.set('graph', slimGraph);
-              this.$.board.set('graphHierarchy', graphHierarchy);
-            }.bind(this));
-          },
-        });
-      </script>
-    </dom-module>
-    <div id='demo-container'>
-      <tf-graph-board-demo></tf-graph-board-demo>
-    </div>
-  </template>
-</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_board/tf-graph-board.html b/tensorflow/tensorboard/components/tf_graph_board/tf-graph-board.html
deleted file mode 100644
index 742bb63e045b513b87c7ca72cdb8802229bda5bc..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_board/tf-graph-board.html
+++ /dev/null
@@ -1,264 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-graph/tf-graph.html">
-<link rel="import" href="../tf-graph-common/tf-graph-common.html">
-<link rel="import" href="../tf-graph-info/tf-graph-info.html">
-<link rel="import" href="../paper-progress/paper-progress.html">
-
-<!--
-Element for putting tf-graph and tf-graph-info side by side.
-
-Example
-
-  <tf-graph-board graph=[[graph]]></tf-graph-board>
-
--->
-<dom-module id="tf-graph-board">
-<template>
-<style>
-::host {
-  display: block;
-}
-
-/deep/ .close {
-  position: absolute;
-  cursor: pointer;
-  left: 15px;
-  bottom: 15px;
-}
-
-.container {
-  width: 100%;
-  height: 100%;
-  opacity: 1;
-}
-
-.container.loading {
-  cursor: progress;
-  opacity: 0.1;
-}
-
-.container.loading.error {
-  cursor: auto;
-}
-
-#info {
-  position: absolute;
-  right: 5px;
-  top: 5px;
-  padding: 0px;
-  max-width: 380px;
-  min-width: 320px;
-  background-color: rgba(255,255,255,0.9);
-  @apply(--shadow-elevation-2dp);
-}
-
-#main {
-  width: 100%;
-  height: 100%;
-}
-
-#progress-bar {
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  justify-content: center;
-  width: 100%;
-  position: absolute;
-  top: 40px;
-  left: 0;
-  font-size: 13px;
-}
-
-#progress-msg {
-  width: 400px;
-  margin-bottom: 5px;
-}
-
-paper-progress {
-  width: 400px;
-  --paper-progress-height: 6px;
-  --paper-progress-active-color: #f3913e;
-}
-
-.context-menu {
-  position: absolute;
-  display: none;
-  background-color: #e2e2e2;
-  border-radius: 2px;
-  font-size: 14px;
-  min-width: 150px;
-  border: 1px solid #d4d4d4;
-}
-
-/deep/ .context-menu ul {
-  list-style-type: none;
-  margin: 0;
-  padding: 0;
-  cursor: default;
-}
-
-/deep/ .context-menu ul li {
-  padding: 4px 16px;
-}
-
-/deep/ .context-menu ul li:hover {
-  background-color: #f3913e;
-  color: white;
-}
-</style>
-<template is="dom-if" if="[[_isNotComplete(progress)]]">
-  <div id="progress-bar">
-    <div id="progress-msg">[[progress.msg]]</div>
-    <paper-progress value="[[progress.value]]"></paper-progress>
-  </div>
-</template>
-<div class$="[[_getContainerClass(progress)]]">
-  <div id="main">
-    <tf-graph id="graph"
-              graph-hierarchy="{{graphHierarchy}}"
-              basic-graph="[[graph]]"
-              hierarchy-params="[[hierarchyParams]]"
-              render-hierarchy="{{renderHierarchy}}"
-              devices-for-stats="[[devicesForStats]]"
-              stats="[[stats]]"
-              selected-node="{{selectedNode}}"
-              highlighted-node="{{_highlightedNode}}"
-              color-by="[[colorBy]]"
-              color-by-params="{{colorByParams}}"
-              progress="{{progress}}"
-              node-names-to-health-pills="[[nodeNamesToHealthPills]]"
-              health-pill-step-index="[[healthPillStepIndex]]"
-    ></tf-graph>
-  </div>
-  <div id="info">
-    <tf-graph-info id="graph-info"
-              title="selected"
-              graph-hierarchy="[[graphHierarchy]]"
-              render-hierarchy="[[renderHierarchy]]"
-              graph="[[graph]]"
-              selected-node="{{selectedNode}}"
-              selected-node-include="{{_selectedNodeInclude}}"
-              highlighted-node="{{_highlightedNode}}"
-              color-by="[[colorBy]]"
-              color-by-params="[[colorByParams]]"
-              debugger-data-enabled="[[debuggerDataEnabled]]"
-              are-health-pills-loading="[[areHealthPillsLoading]]"
-              debugger-numeric-alerts="[[debuggerNumericAlerts]]"
-              node-names-to-health-pills="[[nodeNamesToHealthPills]]"
-              all-steps-mode-enabled="{{allStepsModeEnabled}}"
-              specific-health-pill-step="{{specificHealthPillStep}}"
-              health-pill-step-index="{{healthPillStepIndex}}"
-    ></tf-graph-info>
-  </div>
-  <div class="context-menu"></div>
-</div>
-</template>
-</dom-module>
-
-<script>
-Polymer({
-  is: 'tf-graph-board',
-  properties: {
-    // Public API.
-    graphHierarchy: Object,
-    graph: Object,
-    stats: Object,
-    /**
-     * A number between 0 and 100 denoting the % of progress
-     * for the progress bar and the displayed message.
-     * @type {{value: number, msg: string}}
-     */
-    progress: Object,
-    colorBy: String,
-    colorByParams: {
-      type: Object,
-      notify: true
-    },
-    renderHierarchy: {
-      type: Object,
-      notify: true
-    },
-    // Whether debugger data is enabled for this instance of Tensorboard.
-    debuggerDataEnabled: Boolean,
-    // Whether health pills are currently being loaded.
-    areHealthPillsLoading: Boolean,
-    // An array of alerts (in chronological order) provided by debugging libraries on when bad
-    // values (NaN, +/- Inf) appear.
-    debuggerNumericAlerts: {
-      type: Array,
-      notify: true,
-    },
-    // A mapping between node name to the tf.graph.scene.HealthPill to render.
-    nodeNamesToHealthPills: Object,
-    // Whether the user can request health pills for individual steps from the server. This can be
-    // slow compared the default of showing sampled health pills.
-    allStepsModeEnabled: {
-      type: Boolean,
-      notify: true,
-      value: false,
-    },
-    // Relevant if allStepsModeEnabled. The specific step for which to fetch health pills from the
-    // server for.
-    specificHealthPillStep: {
-      type: Number,
-      notify: true,
-      value: 0,
-    },
-    // The step of health pills to show throughout the graph.
-    healthPillStepIndex: Number,
-    // Private API: Data routing between child components.
-    selectedNode: {
-      type: String,
-      notify: true,
-    },
-    // The enum value of the include property of the selected node.
-    _selectedNodeInclude: Number,
-    _highlightedNode: String
-  },
-  listeners: {
-    'node-toggle-extract': '_nodeToggleExtract'
-  },
-  observers: [
-    '_updateNodeInclude(selectedNode)'
-  ],
-  /** True if the progress is not complete yet (< 100 %). */
-  _isNotComplete: function(progress) {
-    return progress.value < 100;
-  },
-  _getContainerClass: function(progress) {
-    var result = 'container';
-    if (progress.error) {
-      result += ' error';
-    }
-    if (this._isNotComplete(progress)) {
-      result += ' loading';
-    }
-    return result;
-  },
-  _updateNodeInclude: function(nodeName) {
-    var node = this.graphHierarchy.node(nodeName);
-    this.set("_selectedNodeInclude",
-      node ? node.include : tf.graph.InclusionType.UNSPECIFIED);
-  },
-  _nodeToggleExtract: function() {
-    this._updateNodeInclude(this.selectedNode);
-  }
-});
-</script>
diff --git a/tensorflow/tensorboard/components/tf_graph_common/BUILD b/tensorflow/tensorboard/components/tf_graph_common/BUILD
deleted file mode 100644
index e4e57149f3c36e6d3dda4edb8cdf7614297f6729..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/BUILD
+++ /dev/null
@@ -1,54 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_graph_common",
-    srcs = [
-        "annotation.ts",
-        "colors.ts",
-        "common.ts",
-        "contextmenu.ts",
-        "edge.ts",
-        "externs.ts",
-        "graph.ts",
-        "hierarchy.ts",
-        "layout.ts",
-        "minimap.ts",
-        "node.ts",
-        "parser.ts",
-        "proto.ts",
-        "render.ts",
-        "scene.ts",
-        "template.ts",
-        "tf-graph-common.html",
-        "util.ts",
-    ],
-    path = "/tf-graph-common",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:dagre",
-        "//tensorflow/tensorboard/components/tf_imports:graphlib",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-    ],
-)
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [":tf_graph_common"],
-    destdir = "tf-graph-common",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports_google:lib",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_graph_common/annotation.ts b/tensorflow/tensorboard/components/tf_graph_common/annotation.ts
deleted file mode 100644
index bde382977858d7a3a3a69ea233c801c41ab7b4f0..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/annotation.ts
+++ /dev/null
@@ -1,235 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-module tf.graph.scene.annotation {
-  /**
-   * Populate a given annotation container group
-   *
-   *     <g class='{in|out}-annotations'></g>
-   *
-   * with annotation group of the following structure:
-   *
-   * <g class='annotation'>
-   *   <g class='annotation-node'>
-   *   <!--
-   *   Content here determined by Scene.node.buildGroup.
-   *   -->
-   *   </g>
-   * </g>
-   *
-   * @param container selection of the container.
-   * @param annotationData node.{in|out}Annotations
-   * @param d node to build group for.
-   * @param sceneElement <tf-graph-scene> polymer element.
-   * @return selection of appended objects
-   */
-  export function buildGroup(
-      container, annotationData: render.AnnotationList,
-      d: render.RenderNodeInfo, sceneElement) {
-    // Select all children and join with data.
-    let annotationGroups =
-        container
-            .selectAll(function() {
-              // using d3's selector function
-              // See https://github.com/mbostock/d3/releases/tag/v2.0.0
-              // (It's not listed in the d3 wiki.)
-              return this.childNodes;
-            })
-            .data(annotationData.list, d => { return d.node.name; });
-
-    annotationGroups.enter()
-        .append('g')
-        .attr('data-name', a => { return a.node.name; })
-        .each(function(a) {
-          let aGroup = d3.select(this);
-
-          // Add annotation to the index in the scene
-          sceneElement.addAnnotationGroup(a, d, aGroup);
-          // Append annotation edge
-          let edgeType = Class.Annotation.EDGE;
-          let metaedge = a.renderMetaedgeInfo && a.renderMetaedgeInfo.metaedge;
-          if (metaedge && !metaedge.numRegularEdges) {
-            edgeType += ' ' + Class.Annotation.CONTROL_EDGE;
-          }
-          // If any edges are reference edges, add the reference edge class.
-          if (metaedge && metaedge.numRefEdges) {
-            edgeType += ' ' + Class.Edge.REF_LINE;
-          }
-          edge.appendEdge(aGroup, a, sceneElement, edgeType);
-
-          if (a.annotationType !== render.AnnotationType.ELLIPSIS) {
-            addAnnotationLabelFromNode(aGroup, a);
-            buildShape(aGroup, a);
-          } else {
-            addAnnotationLabel(
-                aGroup, a.node.name, a, Class.Annotation.ELLIPSIS);
-          }
-        }).merge(annotationGroups)
-        .attr(
-            'class',
-            a => {
-              return Class.Annotation.GROUP + ' ' +
-                  annotationToClassName(a.annotationType) + ' ' +
-                  node.nodeClass(a);
-            })
-        .each(function(a) {
-          let aGroup = d3.select(this);
-          update(aGroup, d, a, sceneElement);
-          if (a.annotationType !== render.AnnotationType.ELLIPSIS) {
-            addInteraction(aGroup, d, a, sceneElement);
-          }
-        });
-
-    annotationGroups.exit()
-        .each(function(a) {
-          let aGroup = d3.select(this);
-
-          // Remove annotation from the index in the scene
-          sceneElement.removeAnnotationGroup(a, d, aGroup);
-        })
-        .remove();
-    return annotationGroups;
-};
-
-/**
- * Maps an annotation enum to a class name used in css rules.
- */
-function annotationToClassName(annotationType: render.AnnotationType) {
-  return (render.AnnotationType[annotationType] || '').toLowerCase() || null;
-}
-
-function buildShape(aGroup, a: render.Annotation) {
-  if (a.annotationType === render.AnnotationType.SUMMARY) {
-    let summary = selectOrCreateChild(aGroup, 'use');
-    summary
-      .attr('class', 'summary')
-      .attr('xlink:href', '#summary-icon')
-      .attr('cursor', 'pointer');
-  } else {
-    let shape = node.buildShape(aGroup, a, Class.Annotation.NODE);
-    // add title tag to get native tooltips
-    selectOrCreateChild(shape, 'title').text(a.node.name);
-  }
-}
-
-function addAnnotationLabelFromNode(aGroup, a: render.Annotation) {
-  let namePath = a.node.name.split('/');
-  let text = namePath[namePath.length - 1];
-  return addAnnotationLabel(aGroup, text, a, null);
-}
-
-function addAnnotationLabel(
-    aGroup, label: string, a: render.Annotation, additionalClassNames) {
-  let classNames = Class.Annotation.LABEL;
-  if (additionalClassNames) {
-    classNames += ' ' + additionalClassNames;
-  }
-  let txtElement = aGroup.append('text')
-                       .attr('class', classNames)
-                       .attr('dy', '.35em')
-                       .attr('text-anchor', a.isIn ? 'end' : 'start')
-                       .text(label);
-
-  return tf.graph.scene.node.enforceLabelWidth(txtElement, -1);
-}
-
-function addInteraction(selection, d: render.RenderNodeInfo,
-    annotation: render.Annotation, sceneElement) {
-  selection
-      .on('mouseover',
-          a => {
-            sceneElement.fire(
-                'annotation-highlight',
-                {name: a.node.name, hostName: d.node.name});
-          })
-      .on('mouseout',
-          a => {
-            sceneElement.fire(
-                'annotation-unhighlight',
-                {name: a.node.name, hostName: d.node.name});
-          })
-      .on('click', a => {
-        // Stop this event's propagation so that it isn't also considered a
-        // graph-select.
-        (<Event>d3.event).stopPropagation();
-        sceneElement.fire(
-            'annotation-select', {name: a.node.name, hostName: d.node.name});
-      });
-  if (annotation.annotationType !== render.AnnotationType.SUMMARY &&
-      annotation.annotationType !== render.AnnotationType.CONSTANT) {
-    selection.on(
-        'contextmenu', contextmenu.getMenu(
-                           node.getContextMenu(annotation.node, sceneElement)));
-  }
-};
-
-/**
- * Adjust annotation's position.
- *
- * @param aGroup selection of a 'g.annotation' element.
- * @param d Host node data.
- * @param a annotation node data.
- * @param sceneElement <tf-graph-scene> polymer element.
- */
-function update(aGroup, d: render.RenderNodeInfo, a: render.Annotation,
-    sceneElement) {
-  let cx = layout.computeCXPositionOfNodeShape(d);
-  // Annotations that point to embedded nodes (constants,summary)
-  // don't have a render information attached so we don't stylize these.
-  // Also we don't stylize ellipsis annotations (the string '... and X more').
-  if (a.renderNodeInfo &&
-      a.annotationType !== render.AnnotationType.ELLIPSIS) {
-    node.stylize(aGroup, a.renderNodeInfo, sceneElement,
-      Class.Annotation.NODE);
-  }
-
-  if (a.annotationType === render.AnnotationType.SUMMARY) {
-    // Update the width of the annotation to give space for the image.
-    a.width += 10;
-  }
-
-  // label position
-  aGroup.select('text.' + Class.Annotation.LABEL).transition()
-    .attr('x', cx + a.dx + (a.isIn ? -1 : 1) * (a.width / 2 + a.labelOffset))
-    .attr('y', d.y + a.dy);
-
-  // Some annotations (such as summary) are represented using a 12x12 image tag.
-  // Purposely omitted units (e.g. pixels) since the images are vector graphics.
-  // If there is an image, we adjust the location of the image to be vertically
-  // centered with the node and horizontally centered between the arrow and the
-  // text label.
-  aGroup.select('use.summary').transition()
-    .attr('x', cx + a.dx - 3)
-    .attr('y', d.y + a.dy - 6);
-
-  // Node position (only one of the shape selection will be non-empty.)
-  positionEllipse(
-      aGroup.select('.' + Class.Annotation.NODE + ' ellipse'), cx + a.dx,
-      d.y + a.dy, a.width, a.height);
-  positionRect(
-      aGroup.select('.' + Class.Annotation.NODE + ' rect'), cx + a.dx,
-      d.y + a.dy, a.width, a.height);
-  positionRect(
-      aGroup.select('.' + Class.Annotation.NODE + ' use'), cx + a.dx,
-      d.y + a.dy, a.width, a.height);
-
-  // Edge position
-  aGroup.select('path.' + Class.Annotation.EDGE).transition().attr('d', a => {
-    // map relative position to absolute position
-    let points = a.points.map(p => { return {x: p.dx + cx, y: p.dy + d.y}; });
-    return edge.interpolate(points);
-  });
-};
-
-} // close module
diff --git a/tensorflow/tensorboard/components/tf_graph_common/colors.ts b/tensorflow/tensorboard/components/tf_graph_common/colors.ts
deleted file mode 100644
index 40f91f7d2dbde23d20fe7f5f694994a4beb3b94f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/colors.ts
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-module tf {
-  /**
-   * Mapping from color palette name to color palette, which contains
-   * exact colors for multiple states of a single color palette.
-   */
-  export let COLORS = [
-    {
-      'name': 'Google Blue',
-      'color': '#4184f3',
-      'active': '#3a53c5',
-      'disabled': '#cad8fc'
-    },
-    {
-      'name': 'Google Red',
-      'color': '#db4437',
-      'active': '#8f2a0c',
-      'disabled': '#e8c6c1'
-    },
-    {
-      'name': 'Google Yellow',
-      'color': '#f4b400',
-      'active': '#db9200',
-      'disabled': '#f7e8b0'
-    },
-    {
-      'name': 'Google Green',
-      'color': '#0f9d58',
-      'active': '#488046',
-      'disabled': '#c2e1cc'
-    },
-    {
-      'name': 'Purple',
-      'color': '#aa46bb',
-      'active': '#5c1398',
-      'disabled': '#d7bce6'
-    },
-    {
-      'name': 'Teal',
-      'color': '#00abc0',
-      'active': '#47828e',
-      'disabled': '#c2eaf2'
-    },
-    {
-      'name': 'Deep Orange',
-      'color': '#ff6f42',
-      'active': '#ca4a06',
-      'disabled': '#f2cbba'
-    },
-    {
-      'name': 'Lime',
-      'color': '#9d9c23',
-      'active': '#7f771d',
-      'disabled': '#f1f4c2'
-    },
-    {
-      'name': 'Indigo',
-      'color': '#5b6abf',
-      'active': '#3e47a9',
-      'disabled': '#c5c8e8'
-    },
-    {
-      'name': 'Pink',
-      'color': '#ef6191',
-      'active': '#ca1c60',
-      'disabled': '#e9b9ce'
-    },
-    {
-      'name': 'Deep Teal',
-      'color': '#00786a',
-      'active': '#2b4f43',
-      'disabled': '#bededa'
-    },
-    {
-      'name': 'Deep Pink',
-      'color': '#c1175a',
-      'active': '#75084f',
-      'disabled': '#de8cae'
-    },
-    {
-      'name': 'Gray',
-      'color': '#9E9E9E',   // 500
-      'active': '#424242',  // 800
-      'disabled': 'F5F5F5'  // 100
-    }
-  ].reduce((m, c) => {
-    m[c.name] = c;
-    return m;
-  }, {});
-
-  /**
-   * Mapping from op category to color palette name
-   * e.g.,  OP_GROUP_COLORS['state_ops'] = 'Google Blue';
-   */
-  export let OP_GROUP_COLORS = [
-    {
-      color: 'Google Red',
-      groups: [
-        'gen_legacy_ops', 'legacy_ops', 'legacy_flogs_input',
-        'legacy_image_input', 'legacy_input_example_input',
-        'legacy_sequence_input', 'legacy_seti_input_input'
-      ]
-    },
-    {color: 'Deep Orange', groups: ['constant_ops']},
-    {color: 'Indigo', groups: ['state_ops']},
-    {color: 'Purple', groups: ['nn_ops', 'nn']},
-    {color: 'Google Green', groups: ['math_ops']},
-    {color: 'Lime', groups: ['array_ops']},
-    {color: 'Teal', groups: ['control_flow_ops', 'data_flow_ops']},
-    {color: 'Pink', groups: ['summary_ops']},
-    {color: 'Deep Pink', groups: ['io_ops']}
-  ].reduce((m, c) => {
-    c.groups.forEach(function(group) { m[group] = c.color; });
-    return m;
-  }, {});
-}
diff --git a/tensorflow/tensorboard/components/tf_graph_common/common.ts b/tensorflow/tensorboard/components/tf_graph_common/common.ts
deleted file mode 100644
index e7eac54e58fa50407c4a979b6eb6f2d22baf88af..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/common.ts
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/**
- * @fileoverview Common interfaces for the tensorflow graph visualizer.
- */
-
-module tf {
-  /**
-   * Tracks task progress. Each task being passed a progress tracker needs
-   * to call the below-defined methods to notify the caller about the gradual
-   * progress of the task.
-   */
-  export interface ProgressTracker {
-    updateProgress(incrementValue: number): void;
-    setMessage(msg: string): void;
-    reportError(msg: string, err: Error): void;
-  }
-} // close module tf
diff --git a/tensorflow/tensorboard/components/tf_graph_common/contextmenu.ts b/tensorflow/tensorboard/components/tf_graph_common/contextmenu.ts
deleted file mode 100644
index 8121cf9f6dab97347efa33e388ecc8f2fb4e9d38..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/contextmenu.ts
+++ /dev/null
@@ -1,75 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-module tf.graph.scene.contextmenu {
-
-/** Function that converts data to a title string. */
-export interface TitleFunction {
-  (data: any): string;
-}
-
-/** Function that takes action based on item clicked in the context menu. */
-export interface ActionFunction {
-  (elem: any, d: any, i: number): void;
-}
-
-/**
- * The interface for an item in the context menu
- */
-export interface ContextMenuItem {
-  title: TitleFunction;
-  action: ActionFunction;
-}
-
-/**
- * Returns the event listener, which can be used as an argument for the d3
- * selection.on function. Renders the context menu that is to be displayed
- * in response to the event.
- */
-export function getMenu(menu: ContextMenuItem[]) {
-  let menuSelection = d3.select('.context-menu');
-  // Close the menu when anything else is clicked.
-  d3.select('body').on(
-      'click.context', function() { menuSelection.style('display', 'none'); });
-
-  // Function called to populate the context menu.
-  return function(data, index: number): void {
-    // Position and display the menu.
-    let event = <MouseEvent>d3.event;
-    menuSelection
-      .style('display', 'block')
-      .style('left', (event.layerX + 1) + 'px')
-      .style('top', (event.layerY + 1) + 'px');
-
-    // Stop the event from propagating further.
-    event.preventDefault();
-    event.stopPropagation();
-
-    // Add provided items to the context menu.
-    menuSelection.html('');
-    let list = menuSelection.append('ul');
-    list.selectAll('li')
-        .data(menu)
-        .enter()
-        .append('li')
-        .html(function(d) { return d.title(data); })
-        .on('click', (d, i) => {
-          d.action(this, data, index);
-          menuSelection.style('display', 'none');
-        });
-  };
-};
-
-} // close module
diff --git a/tensorflow/tensorboard/components/tf_graph_common/edge.ts b/tensorflow/tensorboard/components/tf_graph_common/edge.ts
deleted file mode 100644
index 4a1182bb9fb459b31ab64c3ec4294195cc2dba7e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/edge.ts
+++ /dev/null
@@ -1,359 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-module tf.graph.scene.edge {
-
-/** Delimiter between dimensions when showing sizes of tensors. */
-const TENSOR_SHAPE_DELIM = '×';
-
-/** The minimum stroke width of an edge. */
-export const MIN_EDGE_WIDTH = 0.75;
-
-/** The maximum stroke width of an edge. */
-export const MAX_EDGE_WIDTH = 12;
-
-/** The exponent used in the power scale for edge thickness. */
-const EDGE_WIDTH_SCALE_EXPONENT = 0.3;
-
-/** The domain (min and max value) for the edge width. */
-const DOMAIN_EDGE_WIDTH_SCALE = [1, 5E6];
-
-export const EDGE_WIDTH_SCALE: d3.ScalePower<number, number> = d3.scalePow()
-      .exponent(EDGE_WIDTH_SCALE_EXPONENT)
-      .domain(DOMAIN_EDGE_WIDTH_SCALE)
-      .range([MIN_EDGE_WIDTH, MAX_EDGE_WIDTH])
-      .clamp(true);
-
-let arrowheadMap =
-    d3.scaleQuantize<String>().domain([MIN_EDGE_WIDTH, MAX_EDGE_WIDTH]).range([
-      'small', 'medium', 'large', 'xlarge'
-    ]);
-
-/** Minimum stroke width to put edge labels in the middle of edges */
-const CENTER_EDGE_LABEL_MIN_STROKE_WIDTH = 2.5;
-
-export type EdgeData = {v: string, w: string, label: render.RenderMetaedgeInfo};
-
-export function getEdgeKey(edgeObj: EdgeData) {
-  return edgeObj.v + EDGE_KEY_DELIM + edgeObj.w;
-}
-
-/**
- * Select or Create a 'g.edges' group to a given sceneGroup
- * and builds a number of 'g.edge' groups inside the group.
- *
- * Structure Pattern:
- *
- * <g class='edges'>
- *   <g class='edge'>
- *     <path class='edgeline'/>
- *   </g>
- *   ...
- * </g>
- *
- *
- * @param sceneGroup container
- * @param graph
- * @param sceneElement <tf-graph-scene> polymer element.
- * @return selection of the created nodeGroups
- */
-export function buildGroup(sceneGroup,
-    graph: graphlib.Graph<render.RenderNodeInfo, render.RenderMetaedgeInfo>,
-    sceneElement) {
-  let edges: EdgeData[] = [];
-  edges = _.reduce(graph.edges(), (edges, edgeObj) => {
-    let edgeLabel = graph.edge(edgeObj);
-    edges.push({
-      v: edgeObj.v,
-      w: edgeObj.w,
-      label: edgeLabel
-    });
-    return edges;
-  }, edges);
-
-  let container =
-      scene.selectOrCreateChild(sceneGroup, 'g', Class.Edge.CONTAINER);
-
-  // Select all children and join with data.
-  // (Note that all children of g.edges are g.edge)
-  let edgeGroups = (container as any).selectAll(function() {return this.childNodes;}).data(edges, getEdgeKey);
-
-  // Make edges a group to support rendering multiple lines for metaedge
-  edgeGroups.enter()
-      .append('g')
-      .attr('class', Class.Edge.GROUP)
-      .attr('data-edge', getEdgeKey)
-      .each(function(d: EdgeData) {
-        let edgeGroup = d3.select(this);
-        d.label.edgeGroup = edgeGroup;
-        // index node group for quick highlighting
-        sceneElement._edgeGroupIndex[getEdgeKey(d)] = edgeGroup;
-
-        // Add line during enter because we're assuming that type of line
-        // normally does not change.
-        appendEdge(edgeGroup, d, sceneElement);
-      })
-      .merge(edgeGroups)
-      .each(position)
-      .each(function(d) {
-    stylize(d3.select(this), d, sceneElement);
-  });
-
-  edgeGroups.exit()
-    .each(d => {
-      delete sceneElement._edgeGroupIndex[getEdgeKey(d)];
-    })
-    .remove();
-  return edgeGroups;
-};
-
-/**
- * Returns the label for the given base edge.
- * The label is the shape of the underlying tensor.
- */
-export function getLabelForBaseEdge(
-    baseEdge: BaseEdge, renderInfo: render.RenderGraphInfo): string {
-  let node = <OpNode>renderInfo.getNodeByName(baseEdge.v);
-  if (node.outputShapes == null || node.outputShapes.length === 0) {
-    return null;
-  }
-  let shape = node.outputShapes[baseEdge.outputTensorIndex];
-  if (shape == null) {
-    return null;
-  }
-  if (shape.length === 0) {
-    return 'scalar';
-  }
-  return shape.map(size => { return size === -1 ? '?' : size; })
-      .join(TENSOR_SHAPE_DELIM);
-}
-
-/**
- * Creates the label for the given metaedge. If the metaedge consists
- * of only 1 tensor, and it's shape is known, the label will contain that
- * shape. Otherwise, the label will say the number of tensors in the metaedge.
- */
-export function getLabelForEdge(metaedge: Metaedge,
-    renderInfo: render.RenderGraphInfo): string {
-  let isMultiEdge = metaedge.baseEdgeList.length > 1;
-  return isMultiEdge ?
-      metaedge.baseEdgeList.length + ' tensors' :
-      getLabelForBaseEdge(metaedge.baseEdgeList[0], renderInfo);
-}
-
-/**
- * Shortens the path enought such that the tip of the start/end marker will
- * point to the start/end of the path. The marker can be of arbitrary size.
- *
- * @param points Array of path control points.
- * @param marker D3 selection of the <marker> svg element.
- * @param isStart Is the marker a `start-marker`. If false, the marker is
- *     an `end-marker`.
- * @return The new array of control points.
- */
-function adjustPathPointsForMarker(points: render.Point[],
-    marker: d3.Selection<any, any, any, any>, isStart: boolean): render.Point[] {
-  let lineFunc = d3.line<render.Point>()
-    .x(d => d.x)
-    .y(d => d.y);
-  let path =
-      d3.select(document.createElementNS('http://www.w3.org/2000/svg', 'path'))
-          .attr('d', lineFunc(points));
-  let markerWidth = +marker.attr('markerWidth');
-  let viewBox = marker.attr('viewBox').split(' ').map(Number);
-  let viewBoxWidth = viewBox[2] - viewBox[0];
-  let refX = +marker.attr('refX');
-  let pathNode = <SVGPathElement> path.node();
-  if (isStart) {
-    // The edge flows downwards. Do not make the edge go the whole way, lest we
-    // clobber the arrowhead.
-    const fractionStickingOut = 1 - refX / viewBoxWidth;
-    const length = markerWidth * fractionStickingOut;
-    const point = pathNode.getPointAtLength(length);
-    // Figure out how many segments of the path we need to remove in order
-    // to shorten the path.
-    const segIndex = pathNode.getPathSegAtLength(length);
-    // Update the very first segment.
-    points[segIndex - 1] = {x: point.x, y: point.y};
-    // Ignore every point before segIndex - 1.
-    return points.slice(segIndex - 1);
-  } else {
-    // The edge flows upwards. Do not make the edge go the whole way, lest we
-    // clobber the arrowhead.
-    const fractionStickingOut = 1 - refX / viewBoxWidth;
-    const length =
-        pathNode.getTotalLength() - markerWidth * fractionStickingOut;
-    const point = pathNode.getPointAtLength(length);
-    // Figure out how many segments of the path we need to remove in order
-    // to shorten the path.
-    const segIndex = pathNode.getPathSegAtLength(length);
-    // Update the very last segment.
-    points[segIndex] = {x: point.x, y: point.y};
-    // Ignore every point after segIndex.
-    return points.slice(0, segIndex + 1);
-  }
-}
-
-/**
- * For a given d3 selection and data object, create a path to represent the
- * edge described in d.label.
- *
- * If d.label is defined, it will be a RenderMetaedgeInfo instance. It
- * will sometimes be undefined, for example for some Annotation edges for which
- * there is no underlying Metaedge in the hierarchical graph.
- */
-export function appendEdge(edgeGroup, d: EdgeData,
-    sceneElement: {renderHierarchy: render.RenderGraphInfo},
-    edgeClass?: string) {
-  let size = 1;
-  if (d.label != null && d.label.metaedge != null) {
-    // There is an underlying Metaedge.
-    size = d.label.metaedge.totalSize;
-  }
-  edgeClass = edgeClass || Class.Edge.LINE; // set default type
-
-  if (d.label && d.label.structural) {
-    edgeClass += ' ' + Class.Edge.STRUCTURAL;
-  }
-  if (d.label && d.label.metaedge && d.label.metaedge.numRefEdges) {
-    edgeClass += ' ' + Class.Edge.REFERENCE_EDGE;
-  }
-  // Give the path a unique id, which will be used to link
-  // the textPath (edge label) to this path.
-  let pathId = 'path_' + getEdgeKey(d);
-  let strokeWidth = sceneElement.renderHierarchy.edgeWidthScale(size);
-
-  let path = edgeGroup.append('path')
-                 .attr('id', pathId)
-                 .attr('class', edgeClass)
-                 .style('stroke-width', strokeWidth + 'px');
-
-  // Check if there is a reference edge and add an arrowhead of the right size.
-  if (d.label && d.label.metaedge) {
-    if (d.label.metaedge.numRefEdges) {
-      // We have a reference edge.
-      const markerId = `reference-arrowhead-${arrowheadMap(strokeWidth)}`;
-      path.style('marker-start', `url(#${markerId})`);
-      d.label.startMarkerId = markerId;
-    } else {
-      // We have a dataflow edge.
-      const markerId = `dataflow-arrowhead-${arrowheadMap(strokeWidth)}`;
-      path.style('marker-end', `url(#${markerId})`);
-      d.label.endMarkerId = markerId;
-    }
-  }
-
-  if (d.label == null || d.label.metaedge == null) {
-    // There is no associated metaedge, thus no text.
-    // This happens for annotation edges.
-    return;
-  }
-  let labelForEdge = getLabelForEdge(d.label.metaedge,
-      sceneElement.renderHierarchy);
-  if (labelForEdge == null) {
-    // We have no information to show on this edge.
-    return;
-  }
-
-  // Put edge label in the middle of edge only if the edge is thick enough.
-  let baseline = strokeWidth > CENTER_EDGE_LABEL_MIN_STROKE_WIDTH ?
-      'central' :
-      'text-after-edge';
-
-  edgeGroup.append('text')
-      .append('textPath')
-        .attr('xlink:href', '#' + pathId)
-        .attr('startOffset', '50%')
-        .attr('text-anchor', 'middle')
-        .attr('dominant-baseline', 'central')
-      .text(labelForEdge);
-};
-
-export let interpolate: d3.Line<{x: number, y: number}> = d3.line<{x: number, y: number}>()
-                             .curve(d3.curveBasis)
-                             .x((d) => { return d.x;})
-                             .y((d) => { return d.y;});
-
-/**
- * Returns a tween interpolator for the endpoint of an edge path.
- */
-function getEdgePathInterpolator(d: EdgeData, i: number, a: string) {
-  let renderMetaedgeInfo = <render.RenderMetaedgeInfo> d.label;
-  let adjoiningMetaedge = renderMetaedgeInfo.adjoiningMetaedge;
-  let points = renderMetaedgeInfo.points;
-
-  // Adjust the path so that start/end markers point to the end
-  // of the path.
-  if (d.label.startMarkerId) {
-    points = adjustPathPointsForMarker(
-        points, d3.select('#' + d.label.startMarkerId), true);
-  }
-  if (d.label.endMarkerId) {
-    points = adjustPathPointsForMarker(
-        points, d3.select('#' + d.label.endMarkerId), false);
-  }
-
-  if (!adjoiningMetaedge) {
-    return d3.interpolate(a, interpolate(points));
-  }
-
-  let renderPath = this;
-
-  // Get the adjoining path that matches the adjoining metaedge.
-  let adjoiningPath =
-    <SVGPathElement>((<HTMLElement>adjoiningMetaedge.edgeGroup.node())
-      .firstChild);
-
-  // Find the desired SVGPoint along the adjoining path, then convert those
-  // coordinates into the space of the renderPath using its Current
-  // Transformation Matrix (CTM).
-  let inbound = renderMetaedgeInfo.metaedge.inbound;
-
-  return function(t) {
-    let adjoiningPoint = adjoiningPath
-      .getPointAtLength(inbound ? adjoiningPath.getTotalLength() : 0)
-      .matrixTransform(adjoiningPath.getCTM())
-      .matrixTransform(renderPath.getCTM().inverse());
-
-    // Update the relevant point in the renderMetaedgeInfo's points list, then
-    // re-interpolate the path.
-    let index = inbound ? 0 : points.length - 1;
-    points[index].x = adjoiningPoint.x;
-    points[index].y = adjoiningPoint.y;
-    let dPath = interpolate(points);
-    return dPath;
-  };
-}
-
-function position(d) {
-  d3.select(this)
-      .select('path.' + Class.Edge.LINE)
-      .transition()
-      .attrTween('d', getEdgePathInterpolator as any);
-};
-
-/**
- * For a given d3 selection and data object, mark the edge as a control
- * dependency if it contains only control edges.
- *
- * d's label property will be a RenderMetaedgeInfo object.
- */
-function stylize(edgeGroup, d: EdgeData, stylize) {
-  edgeGroup.classed('faded', d.label.isFadedOut);
-  let metaedge = d.label.metaedge;
-  edgeGroup.select('path.' + Class.Edge.LINE)
-      .classed('control-dep', metaedge && !metaedge.numRegularEdges);
-};
-
-} // close module
diff --git a/tensorflow/tensorboard/components/tf_graph_common/externs.ts b/tensorflow/tensorboard/components/tf_graph_common/externs.ts
deleted file mode 100644
index 7c0d168a4298c30a3554c9079d6573a9b63a76f6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/externs.ts
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/**
- * @fileoverview Extern declarations for tensorflow graph visualizer.
- *     This file contains compiler stubs for external dependencies whos
- *     implementations are defined at runtime.
- */
-
-declare module graphlib {
-  interface GraphOptions {
-    name?: string;
-    /**
-     * Direction for rank nodes. Can be TB, BT, LR, or RL, where T = top,
-     * B = bottom, L = left, and R = right.
-     */
-    rankdir?: string;
-    type?: string|number;
-    /** Number of pixels between each rank in the layout. */
-    ranksep?: number;
-    /** Number of pixels that separate nodes horizontally in the layout. */
-    nodesep?: number;
-    /** Number of pixels that separate edges horizontally in the layout */
-    edgesep?: number;
-  }
-
-  export interface EdgeObject {
-    v: string;
-    w: string;
-    name?: string;
-  }
-
-  export class Graph<N, E> {
-    constructor(opt?: Object);
-    setNode(name: string, value?: N): void;
-    hasNode(name: string): boolean;
-    setEdge(fromName: string, toName: string, value?: E): void;
-    hasEdge(fromName: string, toName: string): boolean;
-    edge(fromName: string, toName: string): E;
-    edge(edgeObject: EdgeObject): E;
-    removeEdge(v: string, w: string): void;
-    nodes(): string[];
-    node(name: string): N;
-    removeNode(name: string): void;
-    setGraph(graphOptions: GraphOptions): void;
-    graph(): GraphOptions;
-    nodeCount(): number;
-    neighbors(name: string): string[];
-    successors(name: string): string[];
-    predecessors(name: string): string[];
-    edges(): EdgeObject[];
-    outEdges(name: string): E[];
-    inEdges(name: string): E[];
-    /**
-     * Returns those nodes in the graph that have no in-edges.
-     * Takes O(|V|) time.
-     */
-    sources(): string[];
-    /**
-     * Remove the node with the id v in the graph or do nothing if
-     * the node is not in the graph. If the node was removed this
-     * function also removes any incident edges. Returns the graph,
-     * allowing this to be chained with other functions. Takes O(|E|) time.
-     */
-    removeNode(name: string): Graph<N, E>;
-    setParent(name: string, parentName: string): void;
-  }
-}
-
-/**
- * Declaring dagre var used for dagre layout.
- */
-declare var dagre: {layout(graph: graphlib.Graph<any, any>): void;};
diff --git a/tensorflow/tensorboard/components/tf_graph_common/graph.ts b/tensorflow/tensorboard/components/tf_graph_common/graph.ts
deleted file mode 100644
index cbd7b14539a319915bc49e2ed95a8cbbcf6f88ea..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/graph.ts
+++ /dev/null
@@ -1,1257 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-module tf.graph {
-
-/** Delimiter used in node names to denote namespaces. */
-export const NAMESPACE_DELIM = '/';
-export const ROOT_NAME = '__root__';
-
-/** Attribute key used for storing attributes that are too large. */
-export const LARGE_ATTRS_KEY = '_too_large_attrs';
-/**
- * Maximum allowed size in bytes, before the attribute is considered large
- * and filtered out of the graph.
- */
-export const LIMIT_ATTR_SIZE = 1024;
-
-// Separator between the source and the destination name of the edge.
-export const EDGE_KEY_DELIM = '--';
-
-export enum GraphType {FULL, EMBEDDED, META, SERIES, CORE, SHADOW, BRIDGE,
-    EDGE};
-export enum NodeType {META, OP, SERIES, BRIDGE, ELLIPSIS};
-
-/** Indicates if a node is to be included in the main graph when rendered. */
-export enum InclusionType {INCLUDE, EXCLUDE, UNSPECIFIED};
-
-/** Indicates if a series is to be grouped in the graph when rendered. */
-export enum SeriesGroupingType {GROUP, UNGROUP};
-
-/** Attribute key reserved for the shapes of the output tensors. */
-const OUTPUT_SHAPES_KEY = '_output_shapes';
-
-/** Attribute key reserved for the XLA cluster that an op runs on. */
-const _XLA_CLUSTER_KEY = '_XlaCluster';
-
-/**
- * A BaseEdge is the label object (in the graphlib sense) for an edge in the
- * original, full graph produced after parsing. Subsequent graphs, like those
- * which belong to Metanodes, should not use BaseEdge objects, but instead
- * contain Metaedges (which in turn may contain any number of BaseEdges).
- */
-export interface BaseEdge extends graphlib.EdgeObject {
-  isControlDependency: boolean;
-  isReferenceEdge: boolean;
-  /** The index of the output tensor of the source node. */
-  outputTensorIndex: number;
-}
-
-/**
- * A SlimGraph is inspired by graphlib.Graph, but having only the functionality
- * that we need.
- */
-export class SlimGraph {
-  nodes: { [nodeName: string]: OpNode };
-  edges: BaseEdge[];
-
-  constructor() {
-    this.nodes = {};
-    this.edges = [];
-  }
-}
-
-export interface NormalizedInput {
-  name: string;
-  /** The index of the output tensor of the source node. */
-  outputTensorIndex: number;
-  isControlDependency: boolean;
-}
-
-export interface BuildParams {
-  enableEmbedding: boolean;
-  inEmbeddingTypes: string[];
-  outEmbeddingTypes: string[];
-  refEdges: { [inputEdge: string]: boolean };
-}
-
-/**
- * The most basic information about a node in the hierarchical graph.
- */
-export interface Node {
-  /** The name of the node, used frequently to look up nodes by name. */
-  name: string;
-  /** Which type of node this is. */
-  type: NodeType;
-  /**
-   * Whether this node is a type that may contain other nodes. Those types
-   * should extend from GroupNode.
-   *
-   * For an OpNode, isGroupNode will be false, even though it may have
-   * embeddings. These embedding Nodes will have their parentNode set to the
-   * OpNode. However, embeddings are later rendered as annotations, not as
-   * children to be made visible on expansion (like a Metanode or SeriesNode).
-   */
-  isGroupNode: boolean;
-  /**
-   * The number of nodes this node represents. For OpNodes, this will be 1, and
-   * for GroupNodes it will be a count of the total number of descendents it
-   * contains.
-   */
-  cardinality: number;
-  /**
-   * The Node which is this Node's parent. This is of type Node and not
-   * GroupNode because of embeddings, which will have a parent OpNode.
-   */
-  parentNode: Node;
-  /** Runtime execution stats for this node, if available */
-  stats: NodeStats;
-  /** If the node is to be included or excluded from the main graph when
-   *  rendered. Defaults to UNSPECIFIED, which means that the rendering
-   *  algorithm determines if it will be included or not. Then can be set to
-   *  INCLUDE or EXCLUDE manually by the user.
-   */
-  include: InclusionType;
-  /**
-   * Node attributes specify customizable visual aspects of a node and
-   * application-specific metadata associated with a node. The name
-   * 'nodeAttributes' is meant to avoid naming-conflicts with the 'attr' in
-   * subclasses of Node.
-   */
-  nodeAttributes: {[key: string]: any;};
-}
-
-export type TensorShape = number[];
-
-export interface OpNode extends Node {
-  op: string;
-  // The device on which the op ran. Null if it is unknown.
-  device: string;
-  attr: {key: string, value: any}[];
-  inputs: NormalizedInput[];
-  inEmbeddings: OpNode[];
-  outEmbeddings: OpNode[];
-  // The name of the SeriesNode that can contain this node in its series.
-  // If there is no such node, then this is null.
-  owningSeries: string;
-  /**
-   * Array of tensor shapes. Null if the number of output tensors is unknown,
-   * otherwise the length will equal the number of output tensors.
-   *
-   * Each tensor shape is an array of numbers, or null. Details:
-   * - null means unknown rank, and therefore entire shape is unknown.
-   * - [4, 2, 1] means rank-3 tensor of size 4x2x1.
-   * - [] means a scalar (rank-0 tensor).
-   * - [1] means rank-1 tensor of size 1 (not the same as scalar).
-   * - [5, -1, 3] means rank-3 tensor of shape is 5x?x3. The size
-   *       of the middle dimension is unknown (encoded as -1).
-   */
-  outputShapes: TensorShape[];
-  // The XLA Cluster on which the op ran. Null if it is unknown.
-  xlaCluster: string;
-}
-
-export interface BridgeNode extends Node {
-  /**
-   * Whether this bridge node represents edges coming into its parent node.
-   */
-  inbound: boolean;
-}
-
-/**
- * A node that is used when there are more than the maximum number of allowed
- * annotations hanging off of a node.  This node represents an ellipsis
- * annotation, indicating a number of additional annotations.
- */
-export interface EllipsisNode extends Node {
-  /**
-   * The number of nodes this ellipsis represents.
-   */
-  numMoreNodes: number;
-
-  /**
-   * Sets the number of nodes this ellipsis represents and changes the node
-   * name accordingly.
-   */
-  setNumMoreNodes(numNodes: number);
-}
-
-export interface GroupNode extends Node {
-  /**
-   * The metagraph contains nodes and metaedges between the immediate children
-   * of this group. The node label objects may be other GroupNodes (like
-   * SeriesNodes and Metanodes) or individual OpNodes. All edge label objects
-   * are Metaedges, each of which contains references to the original
-   * BaseEdge(s) from which it was created.
-   */
-  metagraph: graphlib.Graph<GroupNode|OpNode, Metaedge>;
-
-  /**
-   * The bridgegraph contains only edges which link immediate children of this
-   * group with nodes outside of the metagraph. As in the metagraph, all edge
-   * label objects are Metaedges which contain references to the original
-   * BaseEdge(s) that contribute to it.
-   *
-   * For a Metaedge in the bridgegraph, its external endpoint will be the same
-   * as the metagraph edge from which it came. This is most easily explained
-   * by example.
-   *
-   * Consider an original graph that contains a BaseEdge A/B/C->Z/Y/X.
-   *
-   *     +-------+    (BaseEdge)     +-------+
-   *     | A/B/C |>----------------->| Z/Y/X |
-   *     +-------+                   +-------+
-   *
-   * When we construct the Root's metagraph, it will contain nodes for A and Z,
-   * and a Metaedge A->Z. The A->Z Metaedge will contain the original BaseEdge
-   * A/B/C->Z/Y/X in its baseEdgeGraph. The Root's bridgegraph will always be
-   * empty.
-   *
-   *     +---+    (Root.metagraph edge)    +---+
-   *     | A |>--------------------------->| Z |
-   *     +---+                             +---+
-   *
-   * Now consider the Metanode A. Its metagraph will contain a Metanode for A/B
-   * and no edges. A's bridgegraph will have one Metaedge from A/B->Z, which
-   * was derived from the Root's Metaedge A->Z. That Metaedge will contain the
-   * original BaseEdge in its baseEdgeGraph.
-   *
-   *     +---------+
-   *     | A       |
-   *     |  +---+  |   (A.bridgegraph edge)    +---+
-   *     |  | B |>---------------------------->| Z |
-   *     |  +---+  |                           +---+
-   *     +---------+
-   *
-   * Finally, consider the Metanode A/B. Its metagraph will contain a Metanode
-   * for A/B/C and again no edges. A/B's bridgegraph will have one Metaedge
-   * from A/B/C->Z, which was derived from A's bridgegraph Metaedge A/B->Z.
-   * As before, the A/B/C->Z Metaedge will contain the original BaseEdge in its
-   * baseEdgeGraph.
-   *
-   *     +---------------+
-   *     | A             |
-   *     |  +---------+  |
-   *     |  | B       |  |
-   *     |  |  +---+  |  |   (A/B.bridgegraph edge)      +---+
-   *     |  |  | C |>----------------------------------->| Z |
-   *     |  |  +---+  |  |                               +---+
-   *     |  +---------+  |
-   *     +---------------+
-   *
-   * Likewise, under the Metanode Z and Z/Y, to compute the bridgegraph, we'll
-   * end up with Metaedges A->Z/Y and A->Z/Y/X respectively. So the original
-   * BaseEdge A/B/C->Z/Y/X becomes four different Metaedges in four different
-   * bridgegraphs:
-   *
-   *   + A/B->Z in GroupNode A's bridgegraph,
-   *   + A/B/C->Z in GroupNode A/B's bridgegraph,
-   *   + A->Z/Y in GroupNode Z's bridgegraph, and
-   *   + A->Z/Y/X in GroupNode Z/Y's bridgegraph.
-   *
-   * Considering any BaseEdge then, if N is the number of path segments in the
-   * source and M is the number of path segments in the destination, then the
-   * total number of bridgegraph edges you could create would be (N-1)(M-1).
-   *
-   * For this reason, it is computationally expensive to generate all the
-   * bridgegraphs for all the Metanodes, and instead they should be computed
-   * on demand as needed.
-   */
-  bridgegraph: graphlib.Graph<GroupNode|OpNode, Metaedge>;
-
-  /**
-   * Stores how many times each device name appears in its children
-   * op nodes. Used to color group nodes by devices.
-   */
-  deviceHistogram: {[device: string]: number};
-
-  /**
-   * Flag indicating whether this GroupNode's metagraph contains any edges that
-   * are not control edges. Used to quickly determine how to draw a collapsed
-   * series (vertically or horizontally).
-   */
-  hasNonControlEdges: boolean;
-}
-
-export interface Metanode extends GroupNode {
-  depth: number;
-  templateId: string;
-  opHistogram: {[op: string]: number};
-  getFirstChild(): GroupNode|OpNode;
-  getRootOp(): OpNode;
-  /** Return name of all leaves inside a metanode. */
-  leaves(): string[];
-}
-
-export interface SeriesNode extends GroupNode {
-  hasLoop: boolean;
-  prefix: string;
-  suffix: string;
-  clusterId: number;
-  ids: number[];
-  parent: string;
-}
-
-export class EllipsisNodeImpl implements EllipsisNode {
-  name: string;
-  numMoreNodes: number;
-  stats: NodeStats;
-  type: NodeType;
-  isGroupNode: boolean;
-  cardinality: number;
-  parentNode: Node;
-  include: InclusionType;
-  nodeAttributes: {[key: string]: any;};
-  /**
-   * Constructs a new ellipsis annotation node.
-   *
-   * @param numNodes The number of additional annotations this node represents.
-   */
-  constructor(numNodes: number) {
-    this.type = NodeType.ELLIPSIS;
-    this.isGroupNode = false;
-    this.cardinality = 1;
-    this.parentNode = null;
-    this.stats = null;
-    this.setNumMoreNodes(numNodes);
-    this.include = InclusionType.UNSPECIFIED;
-  }
-
-  setNumMoreNodes(numNodes: number) {
-    this.numMoreNodes = numNodes;
-    this.name = '... ' + numNodes + ' more';
-  }
-};
-
-/**
- * A label object for nodes in the full graph and leaf nodes in the render
- * graph.
- */
-export class OpNodeImpl implements OpNode {
-  name: string;
-  op: string;
-  device: string;
-  stats: NodeStats;
-  attr: {key: string, value: any}[];
-  inputs: NormalizedInput[];
-  type: NodeType;
-  isGroupNode: boolean;
-  cardinality: number;
-  inEmbeddings: OpNode[];
-  outEmbeddings: OpNode[];
-  parentNode: Node;
-  include: InclusionType;
-  owningSeries: string;
-  outputShapes: TensorShape[];
-  nodeAttributes: {[key: string]: any;};
-  xlaCluster: string;
-
-  /**
-   * Constructs a new Op node.
-   *
-   * @param rawNode The raw node.
-   */
-  constructor(rawNode: tf.graph.proto.NodeDef) {
-    this.op = rawNode.op;
-    this.name = rawNode.name;
-    this.device = rawNode.device;
-    this.attr = rawNode.attr;
-    // An array of normalized inputs that denote the incoming edges to
-    // the current node. Each input contains the normalized name of the
-    // source node, whether it has a number part and whether it is a
-    // control dependency.
-    this.inputs = normalizeInputs(rawNode.input);
-    this.outputShapes = extractOutputShapes(rawNode.attr);
-    this.xlaCluster = extractXlaCluster(rawNode.attr);
-    // additional properties
-    this.type = NodeType.OP;
-    this.isGroupNode = false;
-    this.cardinality = 1;
-    this.inEmbeddings = [];
-    this.outEmbeddings = [];
-    this.parentNode = null;
-    this.include = InclusionType.UNSPECIFIED;
-    this.owningSeries = null;
-  }
-};
-
-export function createMetanode(name: string, opt = {}): Metanode {
-  return new MetanodeImpl(name, opt);
-}
-
-/**
- * Joins the information from the stats file (memory, compute time) with the
- * graph information.
- */
-export function joinStatsInfoWithGraph(
-    graph: SlimGraph, stats: tf.graph.proto.StepStats,
-    devicesForStats?: {[device: string]: boolean}): void {
-  // Reset stats for each node.
-  _.each(graph.nodes, node => { node.stats = null; });
-
-  _.each(stats.dev_stats, devStats => {
-    // Ignore devices that are not selected.
-    if (devicesForStats && !devicesForStats[devStats.device]) {
-      return;
-    }
-    _.each(devStats.node_stats, nodeStats => {
-      // Lookup the node in the graph by its original name, e.g. A. If not
-      // found, lookup by the rewritten name A/(A) in case the name is both
-      // a namespace and a node name.
-      let nodeName = nodeStats.node_name in graph.nodes ? nodeStats.node_name :
-                                                          nodeStats.node_name +
-              NAMESPACE_DELIM + '(' + nodeStats.node_name + ')';
-
-      // Couldn't find a matching node.
-      if (!(nodeName in graph.nodes)) {
-        return;
-      }
-
-      // Compute the total bytes used.
-      let totalBytes = 0;
-      if (nodeStats.memory) {
-        _.each(nodeStats.memory, alloc => {
-        if (alloc.total_bytes) {
-            if (alloc.total_bytes > 0) {
-              totalBytes += Number(alloc.total_bytes);
-            } else {
-              /* tslint:disable */
-              console.log(
-                  'ignoring negative memory allocation for ' + nodeName);
-              /* tslint:enable */
-            }
-          }
-        });
-      }
-      let outputSize: number[][] = null;
-      if (nodeStats.output) {
-        outputSize = _.map(nodeStats.output, output => {
-          return _.map(output.tensor_description.shape.dim,
-              dim => Number(dim.size));
-        });
-      }
-      graph.nodes[nodeName].device = devStats.device;
-      if (graph.nodes[nodeName].stats == null) {
-        graph.nodes[nodeName].stats = new NodeStats(outputSize);
-      }
-      graph.nodes[nodeName].stats.addBytesAllocation(totalBytes);
-      if (nodeStats.all_end_rel_micros) {
-        if (nodeStats.all_end_rel_micros > 0) {
-          graph.nodes[nodeName].stats.addExecutionTime(
-              nodeStats.all_start_micros,
-              nodeStats.all_start_micros + nodeStats.all_end_rel_micros);
-        } else {
-          /* tslint:disable */
-          console.log('ignoring negative runtime for ' + nodeName);
-          /* tslint:enable */
-        }
-      }
-    });
-  });
-}
-
-/**
- * Execution stats for the node.
- */
-export class NodeStats {
-  constructor(outputSize: number[][]) { this.outputSize = outputSize; }
-
-  /**
-   * Add the start and end time for a particular kernel execution of this op.
-   * Ops can have multiple kernel executions within the same session run.
-   */
-  addExecutionTime(startTime: number, endTime: number) {
-    if (this.startTime != null) {
-      this.startTime = Math.min(this.startTime, startTime);
-    } else {
-      this.startTime = startTime;
-    }
-    if (this.endTime != null) {
-      this.endTime = Math.max(this.endTime, endTime);
-    } else {
-      this.endTime = endTime;
-    }
-  }
-
-  /**
-   * Add the bytes allocated for a particular kernel execution of this op.
-   * Ops can have multiple kernel executions within the same session run.
-   */
-  addBytesAllocation(totalBytes: number) {
-    if (this.totalBytes != null) {
-      this.totalBytes = Math.max(this.totalBytes, totalBytes);
-    } else {
-      this.totalBytes = totalBytes;
-    }
-  }
-
-  /**
-   * Absolute start time for the very first kernel execution of this op.
-   */
-  startTime: number;
-  /**
-   * Absolute end time for the very last kernel execution of this op.
-   */
-  endTime: number;
-  /**
-   * Total number of bytes used for the node. Sum of all children
-   * if it is a Group node.
-   */
-  totalBytes = 0;
-
-  /**
-   * The shape of each output tensors, if there are any.
-   * Empty if it is a Group node.
-   */
-  outputSize: number[][];
-
-  /**
-   * Combines the specified stats with the current stats.
-   * Modifies the current object. This method is used to
-   * compute aggregate stats for group nodes.
-   */
-  combine(stats: NodeStats): void {
-    if (stats.totalBytes != null) {
-      this.totalBytes += stats.totalBytes;
-    }
-    if (stats.getTotalMicros() != null) {
-      this.addExecutionTime(stats.startTime, stats.endTime);
-    }
-  }
-
-  /**
-   * Total number of compute time in microseconds used for the node.
-   * Sum of all children if it is a Group node. Null if it is unknown.
-   * This method can not be scaffolded under a getter attribute because
-   * ECMAScript 5 does not support getter attributes.
-   */
-  getTotalMicros(): number {
-    if (this.startTime == null || this.endTime == null) {
-      return null;
-    }
-    return this.endTime - this.startTime;
-  }
-}
-
-export class MetanodeImpl implements Metanode {
-  name: string;
-  stats: NodeStats;
-  type: NodeType;
-  depth: number;
-  isGroupNode: boolean;
-  cardinality: number;
-  metagraph: graphlib.Graph<GroupNode|OpNode, Metaedge>;
-  bridgegraph: graphlib.Graph<GroupNode|OpNode, Metaedge>;
-  templateId: string;
-  opHistogram: {[op: string]: number};
-  deviceHistogram: {[op: string]: number};
-  parentNode: Node;
-  hasNonControlEdges: boolean;
-  include: InclusionType;
-  nodeAttributes: {[key: string]: any;};
-
-  /** A label object for meta-nodes in the graph hierarchy */
-  constructor(name: string, opt = {}) {
-    this.name = name;
-    this.type = NodeType.META;
-    /** number of levels under this group */
-    this.depth = 1;
-    this.isGroupNode = true;
-    /** # of leaf nodes (including embedded ones) */
-    this.cardinality = 0;
-    /** graph contains metanodes, nodes, edges
-     * and metaedges for main items within this metanode
-     */
-    this.metagraph =
-      createGraph<GroupNode|OpNode, Metaedge>(name, GraphType.META, opt);
-    /** bridgegraph must be constructed lazily-see hierarchy.getBridgegraph() */
-    this.bridgegraph = null;
-    /**
-     * A dictionary that count ops type of nodes in this metanode
-     * (op type => count).
-     */
-    this.opHistogram = {};
-    this.deviceHistogram = {};
-    /** unique id for a metanode of similar subgraph */
-    this.templateId = null;
-    /** Metanode which contains this node, if any */
-    this.parentNode = null;
-    this.hasNonControlEdges = false;
-    this.include = InclusionType.UNSPECIFIED;
-  }
-
-  getFirstChild(): GroupNode|OpNode {
-    return this.metagraph.node(this.metagraph.nodes()[0]);
-  }
-
-  /**
-   * Returns the op node associated with the metanode.
-   * For example, if the metanode is 'sgd', the associated
-   * op node is sgd/(sgd).
-   */
-  getRootOp(): OpNode {
-    let nameSplit = this.name.split('/');
-    let rootOpName = this.name + '/(' + nameSplit[nameSplit.length - 1] + ')';
-    return <OpNode>this.metagraph.node(rootOpName);
-  }
-
-  /**
-   * Return an array of the names of all the leaves (non-GroupNodes) inside
-   * this metanode. This performs a breadth-first search of the tree, so
-   * immediate child leaves will appear earlier in the output array than
-   * descendant leaves.
-   */
-  leaves(): string[] {
-    let leaves = [];
-    let queue = [<Node> this];
-    let metagraph; // Defined here due to a limitation of ES6->5 compilation.
-    while (queue.length) {
-      let node = queue.shift();
-      if (node.isGroupNode) {
-        metagraph = (<GroupNode> node).metagraph;
-        _.each(metagraph.nodes(), name => queue.push(metagraph.node(name)));
-      } else {
-        leaves.push(node.name);
-      }
-    }
-    return leaves;
-  }
-};
-
-export interface Metaedge extends graphlib.EdgeObject {
-
-  /**
-   * Stores the original BaseEdges represented by this Metaedge.
-   */
-  baseEdgeList: BaseEdge[];
-
-  /**
-   * Whether this edge represents a relationship that is inbound (or outbound)
-   * to the object which contains this information. For example, in a Metanode's
-   * bridgegraph, each edge connects an immediate child to something outside
-   * the Metanode. If the destination of the edge is inside the Metanode, then
-   * its inbound property should be true. If the destination is outside the
-   * Metanode, then its inbound property should be false.
-   *
-   * The property is optional because not all edges can be described as
-   * inbound/outbound. For example, in a Metanode's metagraph, all of the edges
-   * connect immediate children of the Metanode. None should have an inbound
-   * property, or they should be null/undefined.
-   */
-  inbound?: boolean;
-
-  /**
-   * Number of regular edges (not control dependency edges).
-   */
-  numRegularEdges: number;
-
-  /**
-   * Number of control dependency edges.
-   */
-  numControlEdges: number;
-
-  /**
-   * Number of reference edges, which is an edge to an operation
-   * that takes a reference to its input and changes its value.
-   */
-  numRefEdges: number;
-
-  /**
-   * Total size (number of units) of all the tensors flowing through this edge.
-   */
-  totalSize: number;
-
-  addBaseEdge(edge: BaseEdge, h: hierarchy.Hierarchy): void;
-}
-
-export function createMetaedge(v: string, w: string): Metaedge {
-  return new MetaedgeImpl(v, w);
-}
-
-/**
- * A label object for edges between metanodes of subgraphs in the render graph.
- */
-export class MetaedgeImpl implements Metaedge {
-  v: string;
-  w: string;
-  baseEdgeList: BaseEdge[];
-  inbound: boolean;
-  numRegularEdges: number;
-  numControlEdges: number;
-  numRefEdges: number;
-  totalSize: number;
-
-  constructor(v: string, w: string) {
-    this.v = v;
-    this.w = w;
-    this.baseEdgeList = [];
-    this.inbound = null;
-    this.numRegularEdges = 0;
-    this.numControlEdges = 0;
-    this.numRefEdges = 0;
-    this.totalSize = 0;
-  }
-
-  addBaseEdge(edge: BaseEdge, h: hierarchy.Hierarchy): void {
-    this.baseEdgeList.push(edge);
-    if (edge.isControlDependency) {
-      this.numControlEdges += 1;
-    } else {
-      this.numRegularEdges += 1;
-    }
-    if (edge.isReferenceEdge) {
-      this.numRefEdges += 1;
-    }
-    // Compute the size of the tensor flowing through this
-    // base edge.
-    this.totalSize += MetaedgeImpl.computeSizeOfEdge(edge, h);
-    h.maxMetaEdgeSize = Math.max(h.maxMetaEdgeSize, this.totalSize);
-  }
-
-  private static computeSizeOfEdge(edge: BaseEdge, h: hierarchy.Hierarchy):
-      number {
-    let opNode = <OpNode> h.node(edge.v);
-    if (opNode.outputShapes == null) {
-      // No shape information. Assume a single number. This gives
-      // a lower bound for the total size.
-      return 1;
-    }
-    h.hasShapeInfo = true;
-    // Sum the sizes of all output tensors.
-    return _(opNode.outputShapes).map(shape => {
-      // If the shape is unknown, treat it as 1 when computing
-      // total size. This gives a lower bound for the total size.
-      if (shape == null) {
-        return 1;
-      }
-      // Multiply all shapes to get the total size of the tensor.
-      // E.g. The total size of [4, 2, 1] is 4 * 2 * 1.
-      return _(shape).reduce((accumulated, currSize) => {
-        // If this particular dimension is unknown, treat
-        // it as 1 when computing total size. This gives a lower bound
-        // for the total size.
-        if (currSize === -1) {
-          currSize = 1;
-        }
-        return accumulated * currSize;
-      }, 1);
-    }).sum();
-  }
-}
-
-export function createSeriesNode(prefix: string, suffix: string,
-    parent: string, clusterId: number, name: string): SeriesNode {
-  return new SeriesNodeImpl(prefix, suffix, parent, clusterId, name);
-}
-
-export function getSeriesNodeName(prefix: string, suffix: string,
-    parent: string, startId?: number, endId?: number): string {
-  let numRepresentation =
-      (typeof startId !== 'undefined' && typeof endId !== 'undefined') ?
-      '[' + startId + '-' + endId + ']' :
-      '#';
-  let pattern = prefix + numRepresentation + suffix;
-  return (parent ? parent + '/' : '') + pattern;
-}
-
-class SeriesNodeImpl implements SeriesNode {
-  name: string;
-  type: NodeType;
-  stats: NodeStats;
-  hasLoop: boolean;
-  prefix: string;
-  suffix: string;
-  clusterId: number;
-  ids: number[];
-  parent: string;
-  isGroupNode: boolean;
-  cardinality: number;
-  metagraph: graphlib.Graph<GroupNode|OpNode, Metaedge>;
-  bridgegraph: graphlib.Graph<GroupNode|OpNode, Metaedge>;
-  parentNode: Node;
-  deviceHistogram: {[op: string]: number};
-  hasNonControlEdges: boolean;
-  include: InclusionType;
-  nodeAttributes: {[key: string]: any;};
-
-  constructor(prefix: string, suffix: string, parent: string,
-      clusterId: number, name: string) {
-    this.name = name || getSeriesNodeName(prefix, suffix, parent);
-    this.type = NodeType.SERIES;
-    this.hasLoop = false;
-    this.prefix = prefix;
-    this.suffix = suffix;
-    this.clusterId = clusterId;
-    this.ids = [];
-    this.parent = parent;
-    this.isGroupNode = true;
-    this.cardinality = 0;
-    this.metagraph = createGraph<Metanode, Metaedge>(name, GraphType.SERIES);
-    // bridgegraph must be constructed lazily-see hierarchy.getBridgegraph()
-    this.bridgegraph = null;
-    this.parentNode = null;
-    this.deviceHistogram = {};
-    this.hasNonControlEdges = false;
-    this.include = InclusionType.UNSPECIFIED;
-  }
-}
-
-/**
- * Extracts the shapes of the output tensors from the attr property in the
- * node proto.
- */
-// tslint:disable-next-line:no-any
-function extractOutputShapes(attr: Array<{key: string, value: any}>):
-    TensorShape[] {
-  let result = null;
-  // We don't know anything about the output tensors.
-  if (!attr) {
-    return null;
-  }
-  for (let i = 0; i < attr.length; i++) {
-    let {key, value} = attr[i];
-    if (key === OUTPUT_SHAPES_KEY) {
-      if (!value.list.shape) {
-        // The OUTPUT_SHAPES_KEY lacks a value. We know nothing about the shape.
-        return null;
-      }
-
-      // Map all output tensors into array of numbers denoting their shape.
-      let result = value.list.shape.map(shape => {
-        if (shape.unknown_rank) {
-          // This output tensor is of unknown rank. We don't know if it is a
-          // scalar, or a tensor, or of what shape it is.
-          return null;
-        }
-        if (shape.dim == null ||
-            (shape.dim.length === 1 && shape.dim[0].size == null)) {
-          // This output tensor is a scalar.
-          return [];
-        }
-        // This output tensor has a known rank. Map each dimension size
-        // into a number.
-        return shape.dim.map(dim => {
-          // Size can be -1 if this particular dimension is unknown.
-          return dim.size;
-        });
-      });
-      // Since we already processed it, remove the entry from the attribute
-      // list (saves memory).
-      attr.splice(i, 1);
-      return result;
-    }
-  }
-  // We didn't find OUTPUT_SHAPES_KEY in attributes, so we don't know anything
-  // about the output tensors.
-  return null;
-}
-
-/**
- * Extracts the XLA Cluster that an op runs on from the attrs of the OpNode.
- * @param attr The attr property.
- * @return A string that is the name of the cluster. Or null if it could not be
- *     determined.
- */
-// tslint:disable-next-line:no-any
-function extractXlaCluster(attr: Array<{key: string, value: any}>): string|
-    null {
-  if (!attr) {
-    return null;
-  }
-
-  // Find the attribute for XLA cluster if there is one.
-  for (let i = 0; i < attr.length; i++) {
-    if (attr[i].key === _XLA_CLUSTER_KEY) {
-      return attr[i].value['s'] || null;
-    }
-  }
-  return null;
-}
-
-/**
- * Normalizes the inputs and extracts associated metadata:
- * 1) Inputs can contain a colon followed by a number at the end
- *    (e.g. inputName:1) and we remove this from the input name, and take note
- *    that the input was numbered.
- * 2) Control dependency inputs contain caret at the beginning and we
- *    remove this and annotate the edge as a control dependency.
- * @param inputs Array of unnormalized names of input nodes.
- */
-function normalizeInputs(inputs: string[]): NormalizedInput[] {
-  let normalizedInputs: NormalizedInput[] = [];
-  _.each(inputs, inputName => {
-    let start = inputName[0] === '^';
-    let colon = inputName.lastIndexOf(':');
-    let end = colon !== -1 &&
-      inputName.length - colon > 1 &&
-      !(/\D/).test(inputName.substring(colon + 1)) ?
-      colon : inputName.length;
-    let name = inputName.substring(start ? 1 : 0, end);
-    if (normalizedInputs.length === 0 ||
-      name !== normalizedInputs[normalizedInputs.length - 1].name) {
-      normalizedInputs.push({
-        name: name,
-        outputTensorIndex:
-            end === inputName.length ? 0 : Number(inputName.slice(colon + 1)),
-        isControlDependency: start
-      });
-    }
-  });
-  return normalizedInputs;
-}
-
-function addEdgeToGraph(
-    graph: SlimGraph, inputName: string, outputNode: OpNode,
-    input: NormalizedInput, params: BuildParams, index: number) {
-  // Don't allow loops in the graph.
-  if (inputName === outputNode.name) {
-    return;
-  }
-  // Check if this op type and input number corresponds to a
-  // reference edge using the refEdges dictionary in the params.
-  let isRefEdge = params.refEdges[outputNode.op + ' ' + index] === true;
-  graph.edges.push({
-    v: inputName,
-    w: outputNode.name,
-    outputTensorIndex: input.outputTensorIndex,
-    isControlDependency: input.isControlDependency,
-    isReferenceEdge: isRefEdge
-  });
-}
-
-export function build(
-    rawNodes: tf.graph.proto.NodeDef[], params: BuildParams,
-    tracker: ProgressTracker): Promise<SlimGraph|void> {
-  /**
-   * A dictionary that maps each in-embedding node name to the node
-   * object.
-   */
-  let inEmbedding: {[nodeName: string]: OpNode} = {};
-  /**
-   * A dictionary that maps each out-embedding node name to the node
-   * object.
-   */
-  let outEmbedding: {[nodeName: string]: OpNode} = {};
-  /**
-   * A dictionary that maps each node name to an array of the node's
-   * out-embedding node label objects.
-   */
-  let outEmbeddings: {[inputName: string]: OpNode[]} = {};
-  let isInEmbeddedPred = getEmbedPredicate(params.inEmbeddingTypes);
-  let isOutEmbeddedPred = getEmbedPredicate(params.outEmbeddingTypes);
-  let embeddingNodeNames: string[] = [];
-  /**
-   * A list of all the non-embedding node names which appear in the processed
-   * list of raw nodes. Here we pre-allocate enough room for all the rawNodes,
-   * even though there will some number of embeddings. The excess array length
-   * is spliced off later.
-   *
-   * Experimentation shows that around 30% of the array will go unused, and
-   * even for very large networks that amounts to less than 10k spaces.
-   */
-  let nodeNames = new Array<string>(rawNodes.length);
-
-  return tf.graph.util
-      .runAsyncTask(
-          'Normalizing names', 30,
-          () => {
-            let opNodes = new Array<OpNode>(rawNodes.length);
-            let index = 0;
-            _.each(rawNodes, rawNode => {
-              let opNode = new OpNodeImpl(rawNode);
-              if (isInEmbeddedPred(opNode)) {
-                embeddingNodeNames.push(opNode.name);
-                inEmbedding[opNode.name] = opNode;
-                return;
-              }
-
-              if (isOutEmbeddedPred(opNode)) {
-                embeddingNodeNames.push(opNode.name);
-                outEmbedding[opNode.name] = opNode;
-                _.each(opNode.inputs, input => {
-                  let inputName = input.name;
-                  outEmbeddings[inputName] = outEmbeddings[inputName] || [];
-                  outEmbeddings[inputName].push(opNode);
-                });
-                return;
-              }
-              // The node is not an embedding, so add it to the names and nodes
-              // lists.
-              opNodes[index] = opNode;
-              nodeNames[index] = opNode.name;
-              index++;
-            });
-            opNodes.splice(index);
-            nodeNames.splice(index);
-            return opNodes;
-          },
-          tracker)
-      .then((opNodes) => {
-        // Create the graph data structure from the graphlib library.
-        return tf.graph.util.runAsyncTask(
-            'Building the data structure', 70, () => {
-              let normalizedNameDict =
-                  mapStrictHierarchy(nodeNames, embeddingNodeNames);
-              let graph = new SlimGraph;
-
-              // Add the nodes to the graph.
-              _.each(opNodes, opNode => {
-                let normalizedName =
-                    normalizedNameDict[opNode.name] || opNode.name;
-                graph.nodes[normalizedName] = opNode;
-                // Check if the node has out-embeddings. If yes, add them to the
-                // node.
-                if (opNode.name in outEmbeddings) {
-                  opNode.outEmbeddings = outEmbeddings[opNode.name];
-                  // Normalize the names of the out-embeddings.
-                  _.each(opNode.outEmbeddings, node => {
-                    node.name = normalizedNameDict[node.name] || node.name;
-                  });
-                }
-                // Update the name of the node.
-                opNode.name = normalizedName;
-              });
-
-              // Visit each node's inputs to add the edges to the graph. If the
-              // input
-              // is an in-embedding, then add it to the node's in-embeddings
-              // instead.
-              _.each(opNodes, opNode => {
-                _.each(opNode.inputs, (input, i) => {
-                  let inputName = input.name;
-                  if (inputName in inEmbedding) {
-                    let inEmbedNode = inEmbedding[inputName];
-                    opNode.inEmbeddings.push(inEmbedNode);
-                    // Move the inputs of the in-embedding node into incoming
-                    // edges of
-                    // the main node. E.g. the control dependency of a constant
-                    // node
-                    // should be moved to the op node where the constant is
-                    // embedded.
-                    for (let embedInput of inEmbedNode.inputs) {
-                      addEdgeToGraph(
-                          graph, normalizedNameDict[embedInput.name] ||
-                              embedInput.name,
-                          opNode, embedInput, params, i);
-                    }
-                  } else if (inputName in outEmbedding) {
-                    // Move the inputs of the out-embedding node into inputs of
-                    // the main node where the out-embedding points to.
-                    let outEmbedNode = outEmbedding[inputName];
-                    for (let embedInput of outEmbedNode.inputs) {
-                      addEdgeToGraph(
-                          graph, normalizedNameDict[embedInput.name] ||
-                              embedInput.name,
-                          opNode, input, params, i);
-                    }
-                  } else {
-                    addEdgeToGraph(
-                        graph, normalizedNameDict[inputName] || inputName,
-                        opNode, input, params, i);
-                  }
-                });
-              });
-
-              // Normalize the names of in-embeddings.
-              _.each(inEmbedding, (node, name) => {
-                node.name = normalizedNameDict[node.name] || node.name;
-              });
-
-              return graph;
-            }, tracker);
-      });
-};
-
-/**
- * Create a new graphlib.Graph() instance with default parameters
- */
-export function createGraph<N, E>(name: string, type, opt = {}):
-    graphlib.Graph<N, E> {
-  let graph = new graphlib.Graph<N, E>(opt);
-  graph.setGraph({
-    name: name,
-    rankdir: 'BT',  // BT,TB,LR,RL
-    type: type
-  });
-  return graph;
-};
-
-/**
- * Create a predicate for checking whether a node should be embedded based on
- * the specified types.
- */
-function getEmbedPredicate(types: string[]) {
-  return function(node: OpNode) {
-    // check types
-    for (let i = 0; i < types.length; i++) {
-      let regExp = new RegExp(types[i]);
-      if (node.op.match(regExp)) { return true; }
-    }
-    return false;
-  };
-};
-
-/**
- * Returns a strict node name (name => name/(name)) to avoid conflicts
- * where the node name is also a namespace.
- */
-export function getStrictName(name: string): string {
-  let parts = name.split(NAMESPACE_DELIM);
-  return name + NAMESPACE_DELIM + '(' + parts[parts.length - 1] + ')';
-}
-
-/**
- * For each op node (embedding or non-embedding), rename it if there is a
- * non-embedding node under its namespace. For example, assume node name 'A'.
- * If there is a non-embedding node under its namespace (e.g. 'A/B'), 'A' will
- * be renamed to 'A/(A)'. Then the namespace 'A' will contain 2 nodes: '(A)'
- * and 'B'. If all the nodes under 'A' are embedding nodes (e.g. constant and
- * summary), keep 'A' as an Op node and don't create a namespace.
- *
- * @param nodeNames An array of regular (non-embedding) node names.
- * @param embeddingNodeNames An array of embedding node names.
- * @return Dictionary object mapping names that need to be renamed to
- *     new names.
- */
-function mapStrictHierarchy(nodeNames: string[],
-    embeddingNodeNames: string[]): {[oldName: string]: string} {
-  /** Dictionary that maps the old new to the new name */
-  let newNameDictionary: {[oldName: string]: string} = {};
-  /** Set used to store all namespaces. */
-  let namespaceSet: {[namespace: string]: boolean} = {};
-  // sort the nodes to make prefix check faster
-  nodeNames.sort();
-  // look for nodes with a prefix a,a/b -> a/(a),a/b
-  for (let i = 0; i < nodeNames.length - 1; ++i) {
-    let a = nodeNames[i];
-    // Get all the parent namespaces of the current node
-    // and add them in the namespace set.
-    _.each(getHierarchicalPath(a).slice(0, -1), ns => {
-      namespaceSet[ns] = true;
-    });
-    for (let j = i + 1; j < nodeNames.length; ++j) {
-      let b = nodeNames[j];
-      if (_.startsWith(b, a)) {
-        if (b.length > a.length && b.charAt(a.length) === NAMESPACE_DELIM) {
-          newNameDictionary[a] = getStrictName(a);
-          break;
-        }
-      } else {
-        break;
-      }
-    }
-  }
-  // Go through all the embedding node names and rename them in case they
-  // collide with namespaces.
-  _.each(embeddingNodeNames, embeddingName => {
-    if (embeddingName in namespaceSet) {
-      // Rename to follow strict hierarchy.
-      newNameDictionary[embeddingName] = getStrictName(embeddingName);
-    }
-  });
-  return newNameDictionary;
-};
-
-/**
- * Returns a list of the degrees of each node in the graph.
- */
-function degreeSequence(graph: graphlib.Graph<any, any>): number[] {
-  let degrees = graph.nodes().map(function(name) {
-    return graph.neighbors(name).length;
-  });
-  degrees.sort();
-  return degrees;
-};
-
-/**
- * Returns if the degree sequence of the two graphs is the same.
- */
-export function hasSimilarDegreeSequence(graph1: graphlib.Graph<any, any>,
-    graph2: graphlib.Graph<any, any>): boolean {
-  let dg1 = degreeSequence(graph1);
-  let dg2 = degreeSequence(graph2);
-
-  for (let i = 0; i < dg1.length; i++) {
-    if (dg1[i] !== dg2[i]) {
-      return false;
-    }
-  }
-  return true;
-};
-
-/**
- * Returns the hierarchical path of the current node, based on the node's name.
- * For example, if the name is 'a/b/c', the returned path is
- * ['a', 'a/b', 'a/b/c'].
- */
-export function getHierarchicalPath(name: string,
-  seriesNames?: { [name: string]: string }): string[] {
-  let path: string[] = [];
-  let i = name.indexOf(NAMESPACE_DELIM);
-  // Push all parent portions of the path.
-  while (i >= 0) {
-    path.push(name.substring(0, i));
-    i = name.indexOf(NAMESPACE_DELIM, i + 1);
-  }
-  // If the node's path is under a series, then add the series node name to the
-  // hierarchical path as the parent of the leaf.
-  if (seriesNames) {
-    let seriesName = seriesNames[name];
-    if (seriesName) {
-      path.push(seriesName);
-    }
-  }
-  // Push the leaf of the path.
-  path.push(name);
-  return path;
-};
-
-/**
- * Returns the string for the node inclusion toggle button, dependant
- * on the provided current InclusionType.
- */
-export function getIncludeNodeButtonString(include: InclusionType) {
-  if (include === tf.graph.InclusionType.EXCLUDE) {
-    return 'Add to main graph';
-  } else {
-    return 'Remove from main graph';
-  }
-};
-
-/**
- * Returns the string for the series node grouping toggle button, dependant
- * on the provided current SeriesGroupingType.
- */
-export function getGroupSeriesNodeButtonString(group: SeriesGroupingType) {
-  if (group === tf.graph.SeriesGroupingType.GROUP) {
-    return 'Ungroup this series of nodes';
-  } else {
-    return 'Group this series of nodes';
-  }
-};
-
-/**
- * Toggle the node series grouping option in the provided map, setting it
- * to ungroup if the series is not already in the map.
- */
-export function toggleNodeSeriesGroup(
-  map: { [name: string]: tf.graph.SeriesGroupingType }, name: string) {
-  if (!(name in map) || map[name] === tf.graph.SeriesGroupingType.GROUP) {
-    map[name] = tf.graph.SeriesGroupingType.UNGROUP;
-  } else {
-    map[name] = tf.graph.SeriesGroupingType.GROUP;
-  }
-};
-
-} // close module tf.graph
diff --git a/tensorflow/tensorboard/components/tf_graph_common/hierarchy.ts b/tensorflow/tensorboard/components/tf_graph_common/hierarchy.ts
deleted file mode 100644
index 889607ac5006bf75c698f7d121e1e0b6f9da6e8e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/hierarchy.ts
+++ /dev/null
@@ -1,807 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-/**
- * Package for the Graph Hierarchy for TensorFlow graph.
- */
-module tf.graph.hierarchy {
-
-/**
- * Class used as output for getPredecessors and getSuccessors methods
- */
-export interface Edges {
-  control: Metaedge[];
-  regular: Metaedge[];
-}
-
-export interface Hierarchy {
-  root: Metanode;
-  templates: {[templateId: string]: string[]};
-  /** List of all device names */
-  devices: string[];
-  /** List of all XLA cluster names */
-  xlaClusters: string[];
-  /** True if at least one tensor in the graph has shape information */
-  hasShapeInfo: boolean;
-  /** The maximum size across all meta edges. Used for scaling thickness. */
-  maxMetaEdgeSize: number;
-  getNodeMap(): {[nodeName: string]: GroupNode|OpNode};
-  node(name: string): GroupNode|OpNode;
-  setNode(name: string, node: GroupNode|OpNode): void;
-  getBridgegraph(nodeName: string): graphlib.Graph<GroupNode|OpNode, Metaedge>;
-  getPredecessors(nodeName: string): Edges;
-  getSuccessors(nodeName: string): Edges;
-  getTopologicalOrdering(nodeName: string): { [childName: string]: number };
-  getTemplateIndex(): (string) => number;
-}
-
-/**
- * Class for the Graph Hierarchy for TensorFlow graph.
- */
-class HierarchyImpl implements Hierarchy {
-  root: Metanode;
-  templates: {[templateId: string]: string[]};
-  private index: {[nodeName: string]: GroupNode|OpNode};
-  devices: string[];
-  xlaClusters: string[];
-  hasShapeInfo = false;
-  maxMetaEdgeSize = 1;
-  orderings: { [nodeName: string]: { [childName: string]: number } };
-
-  constructor() {
-    this.root = createMetanode(ROOT_NAME, {compound: true});
-    this.templates = null;
-    this.devices = null;
-    /**
-     * @type {Object} Dictionary object that maps node name to the node
-     * (could be op-node, metanode, or series-node)
-     */
-    this.index = {};
-    this.index[ROOT_NAME] = this.root;
-    this.orderings = {};
-  }
-
-  getNodeMap(): {[nodeName: string]: GroupNode|OpNode} {
-    return this.index;
-  }
-
-  node(name: string): GroupNode|OpNode {
-    return this.index[name];
-  }
-
-  setNode(name: string, node: GroupNode|OpNode): void {
-    this.index[name] = node;
-  }
-
-  /**
-   * Given the name of a node in this hierarchy, get its bridgegraph, creating
-   * it on the fly if necessary. If the node is not a GroupNode, then this
-   * method returns null. If the provided name does not map to a node in the
-   * hierarchy, an error will be thrown.
-   */
-  getBridgegraph(nodeName: string): graphlib.Graph<GroupNode|OpNode, Metaedge> {
-    let node = this.index[nodeName];
-    if (!node) {
-      throw Error('Could not find node in hierarchy: ' + nodeName);
-    }
-    if (!('metagraph' in node)) {
-      return null;
-    }
-    let groupNode = <GroupNode> node;
-    if (groupNode.bridgegraph) {
-      return groupNode.bridgegraph;
-    }
-    let bridgegraph = groupNode.bridgegraph =
-        createGraph<GroupNode|OpNode, Metaedge>(
-            'BRIDGEGRAPH', GraphType.BRIDGE);
-    if (!node.parentNode || !('metagraph' in node.parentNode)) {
-      return bridgegraph;
-    }
-
-    let parentNode = <GroupNode>node.parentNode;
-    let parentMetagraph = parentNode.metagraph;
-    let parentBridgegraph = this.getBridgegraph(parentNode.name);
-
-    // For each of the parent node's two Metaedge containing graphs, process
-    // each Metaedge involving this node.
-    _.each([parentMetagraph, parentBridgegraph], parentGraph => {
-      _(parentGraph.edges())
-        .filter(e => e.v === nodeName || e.w === nodeName)
-        .each(parentEdgeObj => {
-
-          let inbound = parentEdgeObj.w === nodeName;
-          let parentMetaedge = parentGraph.edge(parentEdgeObj);
-
-          // The parent's Metaedge represents some number of underlying
-          // BaseEdges from the original full graph. For each of those, we need
-          // to determine which immediate child is involved and make sure
-          // there's a Metaedge in the bridgegraph that covers it.
-          _.each(parentMetaedge.baseEdgeList, baseEdge => {
-
-            // Based on the direction, figure out which is the descendant node
-            // and which is the 'other' node (sibling of parent or ancestor).
-            let [descendantName, otherName] =
-              inbound ?
-                [baseEdge.w, parentEdgeObj.v] :
-                [baseEdge.v, parentEdgeObj.w];
-
-            // Determine the immediate child containing this descendant node.
-            let childName = this.getChildName(nodeName, descendantName);
-
-            // Look for an existing Metaedge in the bridgegraph (or create a
-            // new one) that covers the relationship between child and other.
-            let bridgeEdgeObj = <graphlib.EdgeObject> {
-              v: inbound ? otherName : childName,
-              w: inbound ? childName : otherName,
-            };
-            let bridgeMetaedge = bridgegraph.edge(bridgeEdgeObj);
-            if (!bridgeMetaedge) {
-              bridgeMetaedge = createMetaedge(bridgeEdgeObj.v, bridgeEdgeObj.w);
-              bridgeMetaedge.inbound = inbound;
-              bridgegraph.setEdge(bridgeEdgeObj.v, bridgeEdgeObj.w,
-                  bridgeMetaedge);
-            }
-
-            // Copy the BaseEdge from the parent's Metaedge into this
-            // bridgegraph Metaedge.
-            bridgeMetaedge.addBaseEdge(baseEdge, this);
-          });
-        })
-        .value(); // force lodash chain execution.
-    });
-
-    return bridgegraph;
-  }
-
-  /**
-   * Utility function for determining the name of the immediate child under a
-   * node for a given descendant path. If the descendant corresponds to no
-   * immediate child, an error is thrown.
-   */
-  getChildName(nodeName: string, descendantName: string): string {
-    // Walk up the hierarchy from the descendant to find the child.
-    let currentNode: Node = this.index[descendantName];
-    while (currentNode) {
-      if (currentNode.parentNode && currentNode.parentNode.name === nodeName) {
-        return currentNode.name;
-      }
-      currentNode = currentNode.parentNode;
-    }
-    throw Error(
-        'Could not find immediate child for descendant: ' + descendantName);
-  };
-
-  /** Given the name of a node, return its incoming metaedges. */
-  getPredecessors(nodeName: string): Edges {
-    let node = this.index[nodeName];
-    if (!node) {
-      throw Error('Could not find node with name: ' + nodeName);
-    }
-
-    let predecessors = this.getOneWayEdges(node, true);
-    // Add embedded predecessors, such as constants.
-    if (!node.isGroupNode) {
-      _.each((<OpNode>node).inEmbeddings, embeddedNode => {
-        _.each((<OpNode>node).inputs, input => {
-          if (input.name === embeddedNode.name) {
-            // Make a new metaedge holding the edge between the
-            // node and the in-embedding.
-            let metaedge = new MetaedgeImpl(embeddedNode.name, nodeName);
-            metaedge.addBaseEdge(
-                {
-                  isControlDependency: input.isControlDependency,
-                  outputTensorIndex: input.outputTensorIndex,
-                  isReferenceEdge: false,
-                  v: embeddedNode.name,
-                  w: nodeName
-                },
-                this);
-            predecessors.regular.push(metaedge);
-          }
-        });
-      });
-    }
-    return predecessors;
-  }
-
-  /**
-   * Given the name of a node, return its outgoing metaedges.
-   *
-   * This is the inverse of getPredecessors(). See that method's documentation
-   * for an in-depth example.
-   */
-  getSuccessors(nodeName: string): Edges {
-    let node = this.index[nodeName];
-    if (!node) {
-      throw Error('Could not find node with name: ' + nodeName);
-    }
-
-    let successors = this.getOneWayEdges(node, false);
-
-    // Add embedded successors, such as summaries.
-    if (!node.isGroupNode) {
-      _.each((<OpNode>node).outEmbeddings, embeddedNode => {
-        _.each(embeddedNode.inputs, input => {
-          if (input.name === nodeName) {
-            // Make a new metaedge holding the edge between the
-            // node and the out-embedding.
-            let metaedge = new MetaedgeImpl(nodeName, embeddedNode.name);
-            metaedge.addBaseEdge(
-                {
-                  isControlDependency: input.isControlDependency,
-                  outputTensorIndex: input.outputTensorIndex,
-                  isReferenceEdge: false,
-                  v: nodeName,
-                  w: embeddedNode.name
-                },
-                this);
-            successors.regular.push(metaedge);
-          }
-        });
-      });
-    }
-    return successors;
-  }
-
-  /** Helper method for getPredecessors and getSuccessors */
-  getOneWayEdges(node: GroupNode|OpNode, inEdges: boolean) {
-    let edges: Edges = {control: [], regular: []};
-    // A node with no parent cannot have any edges.
-    if (!node.parentNode || !node.parentNode.isGroupNode) {
-      return edges;
-    }
-    let parentNode = <GroupNode> node.parentNode;
-    let metagraph = parentNode.metagraph;
-    let bridgegraph = this.getBridgegraph(parentNode.name);
-    findEdgeTargetsInGraph(metagraph, node, inEdges, edges);
-    findEdgeTargetsInGraph(bridgegraph, node, inEdges, edges);
-    return edges;
-  }
-
-  /**
-   * For a given GroupNode, get or calculate an object which describes a
-   * topological ordering of child nodes within that GroupNode's metagraph.
-   *
-   * This ordering is used when rendering bridge control edges which are
-   * sometimes backwards relative to the dataflow.
-   *
-   * For example, say we have a graph with two edges A->B and A->C, and we're
-   * interested in the ordering under ROOT. In this case, any of the following
-   * would be legitimate return values:
-   *
-   *  - { 'A': 0, 'B': 1, 'C': 2 } -- most likely
-   *  - { 'A': 0, 'B': 2, 'C': 1 } -- less likely
-   *  - { 'A': 12, 'B': 100, 'C': 99 } -- unlikely, but still OK
-   *
-   * The algorithm does not guarantee that all numbers from 0-N (where N is
-   * the number of nodes) appear exactly once. Rather it guarantees that if
-   * there is a path between two nodes, the earlier one will have a lower
-   * number in the ordering hash.
-   *
-   * When generating the ordering, we ignore control Metaedges (those which
-   * represent only BaseEdges that have isControlDependency set to true).
-   *
-   * If there is no node with the specified name, an error is thrown. If the
-   * node with the specified name is not a group node, null is returned.
-   */
-  getTopologicalOrdering(nodeName: string): { [childName: string]: number } {
-    let node = this.index[nodeName];
-    if (!node) {
-      throw Error('Could not find node with name: ' + nodeName);
-    }
-    if (!node.isGroupNode) {
-      return null;
-    }
-    if (nodeName in this.orderings) {
-      return this.orderings[nodeName];
-    }
-
-    // Mapping of a child node names to lists of their successors.
-    let successors: { [childName: string]: string[] } = {};
-
-    // Set of node names which have appeared as a destination.
-    let destinations: { [childName: string]: boolean } = {};
-
-    let metagraph = (<GroupNode> node).metagraph;
-    _.each(metagraph.edges(), (e: graphlib.EdgeObject) => {
-      if (!metagraph.edge(e).numRegularEdges) {
-        return; // Skip control edges.
-      }
-
-      // Keep track of successors and destinations.
-      if (!(e.v in successors)) {
-        successors[e.v] = [];
-      }
-      successors[e.v].push(e.w);
-      destinations[e.w] = true;
-    });
-
-    // Seed the queue with true sources (those that are not destinations).
-    let queue: string[] =
-      _.difference(_.keys(successors), _.keys(destinations));
-
-    // Produce an ordering by traversing the graph breadth first.
-    let ordering = this.orderings[nodeName] = {};
-    let index = 0;
-    while (queue.length) {
-      let childName = queue.shift();
-      ordering[childName] = index++;
-      _.each(successors[childName], succName => queue.push(succName));
-      delete successors[childName]; // Prevent cycles from infinite looping.
-    }
-    return ordering;
-  }
-
-  /**
-   * Returns a d3 Ordinal function that can be used to look up the index of
-   * a node based on its template id.
-   */
-  getTemplateIndex(): (string) => number {
-    let templateNames = d3.keys(this.templates);
-    let templateIndex = d3.scaleOrdinal()
-        .domain(templateNames)
-        .range(d3.range(0, templateNames.length));
-    return (templateId: string) => <number>templateIndex(templateId);
-  }
-}
-
-/**
- * Internal utility function - given a graph (should be either a metagraph or a
- * bridgegraph) and a node which is known to be in that graph, determine
- * the other ends of edges that involve that node in the direction specified
- * by whether it's inbound.
- *
- * For example if you wanted to find the predecessors of a node, you'd call
- * this method for the parent's metagraph and bridgegraph, specifying inbound
- * as true (look at the source of inbound edges to the specified node).
- *
- * Discovered target names are appended to the targets array.
- */
-function findEdgeTargetsInGraph(
-    graph: graphlib.Graph<GroupNode|OpNode, Metaedge>,
-    node: Node, inbound: boolean, targets: Edges): void {
-  let edges = inbound ? graph.inEdges(node.name) : graph.outEdges(node.name);
-  _.each(edges, e => {
-    let metaedge = graph.edge(e);
-    let targetList =
-        metaedge.numRegularEdges ? targets.regular : targets.control;
-    targetList.push(metaedge);
-  });
-}
-
-export interface HierarchyParams {
-  verifyTemplate: boolean;
-  seriesNodeMinSize: number;
-  seriesMap: { [name: string]: tf.graph.SeriesGroupingType };
-}
-
-/**
- * @param graph The raw graph.
- * @param params Parameters used when building a hierarchy.
- */
-export function build(graph: tf.graph.SlimGraph, params: HierarchyParams,
-    tracker: ProgressTracker): Promise<Hierarchy|void> {
-  let h = new HierarchyImpl();
-  let seriesNames: { [name: string]: string } = {};
-  return tf.graph.util
-      .runAsyncTask(
-          'Adding nodes', 20,
-          () => {
-            // Get all the possible device and XLA cluster names.
-            let deviceNames = {};
-            let xlaClusterNames = {};
-            _.each(graph.nodes, (node, nodeName) => {
-              if (node.device) {
-                deviceNames[node.device] = true;
-              }
-
-              if (node.xlaCluster) {
-                xlaClusterNames[node.xlaCluster] = true;
-              }
-            });
-
-            h.devices = _.keys(deviceNames);
-            h.xlaClusters = _.keys(xlaClusterNames);
-
-            addNodes(h, graph);
-          },
-          tracker)
-      .then(() => {
-        return tf.graph.util.runAsyncTask('Detect series', 20, () => {
-          if (params.seriesNodeMinSize > 0) {
-            groupSeries(
-                h.root, h, seriesNames, params.seriesNodeMinSize,
-                params.seriesMap);
-          }
-        }, tracker);
-      })
-      .then(() => {
-        return tf.graph.util.runAsyncTask('Adding edges', 30, () => {
-          addEdges(h, graph, seriesNames);
-        }, tracker);
-      })
-      .then(() => {
-        return tf.graph.util.runAsyncTask(
-            'Finding similar subgraphs', 30, () => {
-              h.templates = template.detect(h, params.verifyTemplate);
-            }, tracker);
-      })
-      .then(() => {
-        return h;
-      });
-};
-
-export function joinAndAggregateStats(
-    h: Hierarchy, stats: tf.graph.proto.StepStats) {
-  // Get all the possible device names.
-  let deviceNames = {};
-  _.each(h.root.leaves(), nodeName => {
-    let leaf = <OpNode> h.node(nodeName);
-    if (leaf.device != null) {
-      deviceNames[leaf.device] = true;
-    }
-  });
-  h.devices = _.keys(deviceNames);
-
-  // Reset stats for each group node.
-  _.each(h.getNodeMap(), (node, nodeName) => {
-    if (node.isGroupNode) {
-      node.stats = new NodeStats(null);
-      (<GroupNode>node).deviceHistogram = {};
-    }
-  });
-
-  // Bubble-up the stats and device distribution from leaves to parents.
-  _.each(h.root.leaves(), nodeName => {
-    let leaf = <OpNode> h.node(nodeName);
-    let node = <GroupNode|OpNode> leaf;
-    while (node.parentNode != null) {
-      if (leaf.device != null) {
-        let deviceHistogram = (<GroupNode>node.parentNode).deviceHistogram;
-        deviceHistogram[leaf.device] = (deviceHistogram[leaf.device] || 0) + 1;
-      }
-      if (leaf.stats != null) {
-        node.parentNode.stats.combine(leaf.stats);
-      }
-      node = <GroupNode> node.parentNode;
-    }
-  });
-}
-
-/**
- * Creates the metanodes in the hierarchical graph and assigns parent-child
- * relationship between them.
- */
-function addNodes(h: Hierarchy, graph: SlimGraph) {
-  _.each(graph.nodes, (node, nodeName) => {
-    let path = getHierarchicalPath(node.name);
-    let parent: Metanode = h.root;
-
-    parent.depth = Math.max(path.length, parent.depth);
-
-    // Create parent metanodes for each depth. For example if the node name
-    // is 'a/b/c', then create metanodes 'a' and 'a/b', where 'a/b' is a child
-    // of a.
-    for (let i = 0; i < path.length; i++) {
-      parent.depth = Math.max(parent.depth, path.length - i);
-      parent.cardinality += node.cardinality;
-      parent.opHistogram[node.op] = (parent.opHistogram[node.op] || 0) + 1;
-      if (node.device != null) {
-        parent.deviceHistogram[node.device] =
-            (parent.deviceHistogram[node.device] || 0) + 1;
-      }
-      if (i === path.length - 1) { break; }
-      let name = path[i];
-      let child = <Metanode>h.node(name);
-      if (!child) {
-        child = createMetanode(name);
-        child.parentNode = parent;
-        h.setNode(name, child);
-        parent.metagraph.setNode(name, child);
-      }
-      parent = child;
-    }
-    // Assuming node name is 'a/b/c', assign the OpNode as a child of the
-    // metanode 'a/b'.
-    h.setNode(node.name, node);
-    node.parentNode = parent;
-    parent.metagraph.setNode(node.name, node);
-
-    // Add each of the in-embeddings and out-embeddings in the hierarchy.
-    _.each(node.inEmbeddings, function(embedding) {
-      h.setNode(embedding.name, embedding);
-      embedding.parentNode = node;
-    });
-    _.each(node.outEmbeddings, function(embedding) {
-      h.setNode(embedding.name, embedding);
-      embedding.parentNode = node;
-    });
-  });
-};
-
-/**
- * For each metanode in the hierarchical graph, this method adds:
- * the edges in the metagraph. These are edges between nodes
- * that share the same parent.
- */
-function addEdges(h: Hierarchy, graph: SlimGraph,
-    seriesNames: { [name: string]: string }) {
-
-  let nodeIndex = h.getNodeMap();
-
-  // Ancestor paths for the source and destination nodes of an edge. These are
-  // reused for each edge rather than allocating new ones. It's about 10% faster
-  // than allocating new ones on each pass through the loop.
-  let sourcePath: string[] = [];
-  let destPath: string[] = [];
-
-  // Insert the ancestor path for a node into the provided array, including the
-  // node itself. Return the index of the last node inserted (always ROOT).
-  let getPath = (node: Node, path: string[]): number => {
-    let i = 0;
-    while (node) {
-      path[i++] = node.name;
-      node = node.parentNode;
-    }
-    return i - 1;
-  };
-
-  _.each(graph.edges, baseEdge => {
-
-    // Get the hierarchical paths for the source and destination of the edge.
-    let sourceAncestorIndex = getPath(graph.nodes[baseEdge.v], sourcePath);
-    let destAncestorIndex = getPath(graph.nodes[baseEdge.w], destPath);
-
-    // If the hierarchical path cannot be found for either endpoint, then we
-    // cannot create the edge. This happens for example when a node has a
-    // control dependency on a summary node, which are embedded.
-    if (sourceAncestorIndex === -1 || destAncestorIndex === -1) {
-      return;
-    }
-
-    // Find the lowest shared ancestor between source and dest by looking for
-    // the highest nodes that differ between their ancestor paths.
-    while (sourcePath[sourceAncestorIndex] === destPath[destAncestorIndex]) {
-      sourceAncestorIndex--;
-      destAncestorIndex--;
-      if (sourceAncestorIndex < 0 || destAncestorIndex < 0) {
-        // This would only occur if the two nodes were the same (a cycle in the
-        // graph), or if one endpoint was a strict ancestor of the other. The
-        // latter shouldn't happen because we rename nodes which are both
-        // metanodes and op nodes. E.g. 'A/B' becomes 'A/B/(B)'.
-        throw Error('No difference found between ancestor paths.');
-      }
-    }
-
-    let sharedAncestorNode =
-      <GroupNode>nodeIndex[sourcePath[sourceAncestorIndex + 1]];
-    let sourceAncestorName = sourcePath[sourceAncestorIndex];
-    let destAncestorName = destPath[destAncestorIndex];
-
-    // Find or create the Metaedge which should contain this BaseEdge inside
-    // the shared ancestor.
-    let metaedge =
-      sharedAncestorNode.metagraph.edge(sourceAncestorName, destAncestorName);
-    if (!metaedge) {
-      metaedge = createMetaedge(sourceAncestorName, destAncestorName);
-      sharedAncestorNode.metagraph
-        .setEdge(sourceAncestorName, destAncestorName, metaedge);
-    }
-    if (!sharedAncestorNode.hasNonControlEdges &&
-        !baseEdge.isControlDependency) {
-      sharedAncestorNode.hasNonControlEdges = true;
-    }
-    metaedge.addBaseEdge(baseEdge, h);
-  });
-};
-
-/**
- * Using the hierarchy template information, detect series in the provided
- * metanode.  For each detected series, create a new SeriesNode
- * and remove series members from the metanode's metagraph and move them to
- * the new series node's metagraph.
- *
- * @param metanode
- * @param hierarchy
- * @param seriesNames Map of node names to their series they are contained in.
- *     This should be provided empty and is populated by this method.
- * @param threshold If the series has this many nodes or more, then group them
- *     into a series.
- * @param map Map of series names to their series grouping type, if one has
- *     been set.
- * @return A dictionary from node name to series node name that contains the
- *     node.
- */
-function groupSeries(metanode: Metanode, hierarchy: Hierarchy,
-    seriesNames: { [name: string]: string }, threshold: number,
-    map: { [name: string]: tf.graph.SeriesGroupingType }) {
-  let metagraph = metanode.metagraph;
-  _.each(metagraph.nodes(), n => {
-    let child = metagraph.node(n);
-    if (child.type === tf.graph.NodeType.META) {
-      groupSeries(<Metanode>child, hierarchy, seriesNames, threshold, map);
-    }
-  });
-
-  let clusters = clusterNodes(metagraph);
-  let seriesDict = detectSeries(clusters, metagraph);
-
-  // Add each series node to the graph and add its grouped children to its own
-  // metagraph.
-  _.each(seriesDict, function(seriesNode: SeriesNode, seriesName: string) {
-    let nodeMemberNames = seriesNode.metagraph.nodes();
-    _.each(nodeMemberNames, n => {
-      let child = <OpNode>metagraph.node(n);
-      if (!child.owningSeries) {
-        child.owningSeries = seriesName;
-      }
-    });
-    // If the series contains less than the threshold number of nodes and
-    // this series has not been adding to the series map, then set this
-    // series to be shown ungrouped in the map.
-    if (nodeMemberNames.length < threshold && !(seriesNode.name in map)) {
-      map[seriesNode.name] = tf.graph.SeriesGroupingType.UNGROUP;
-    }
-    // If the series is in the map as ungrouped then do not group the series.
-    if (seriesNode.name in map
-      && map[seriesNode.name] === tf.graph.SeriesGroupingType.UNGROUP) {
-      return;
-    }
-    hierarchy.setNode(seriesName, seriesNode); // add to the index
-    metagraph.setNode(seriesName, seriesNode);
-    _.each(nodeMemberNames, n => {
-      let child = <OpNode> metagraph.node(n);
-      seriesNode.metagraph.setNode(n, child);
-      seriesNode.parentNode = child.parentNode;
-      seriesNode.cardinality++;
-      if (child.device != null) {
-        seriesNode.deviceHistogram[child.device] =
-            (seriesNode.deviceHistogram[child.device] || 0) + 1;
-      }
-      child.parentNode = seriesNode;
-      seriesNames[n] = seriesName;
-      // Remove now-grouped node from its original parent's metagraph.
-      metagraph.removeNode(n);
-    });
-  });
-};
-
-/** cluster op-nodes with similar op */
-function clusterNodes(metagraph: graphlib.Graph<GroupNode|OpNode, Metaedge>):
-    {[clusterId: string]: string[]} {
-  let result: {[clusterId: string]: string[]} = {};
-  return  _.reduce(metagraph.nodes(),
-      (clusters: {[clusterId: string]: string[]}, n: string) => {
-    let child = metagraph.node(n);
-    if (child.type === NodeType.META) {
-      // skip metanodes
-      return clusters;
-    }
-    let template = (<OpNode>child).op;
-    if (template) {
-      clusters[template] = clusters[template] || [];
-      clusters[template].push(child.name);
-    }
-    return clusters;
-  }, result);
-}
-
-/**
- * For each cluster of op-nodes based op type, try to detect groupings.
- * Infer series name using by trying to find pattern '<number>' in the node
- * name.
- *
- * @param clusters Dictionary output from clusterNodes().
- * @param metagraph
- * @return A dictionary from series name => seriesNode
- */
-function detectSeries(clusters: {[clusterId: string]: string[]},
-     metagraph: graphlib.Graph<GroupNode|OpNode, Metaedge>):
-     {[seriesName: string]: SeriesNode} {
-  let seriesDict: {[seriesName: string]: SeriesNode} = {};
-  _.each(clusters, function(members, clusterId: string) {
-    if (members.length <= 1) { return; } // isolated clusters can't make series
-
-    /** @type {Object}  A dictionary mapping seriesName to seriesInfoArray,
-     * which is an array that contains objects with name, id, prefix, suffix,
-     * and parent properties.
-     */
-    let candidatesDict: {[seriesName: string]: SeriesNode[]} = {};
-
-    // Group all nodes that have the same name, with the exception of a
-    // number at the end of the name after an underscore, which is allowed to
-    // vary.
-    _.each(members, function(name: string) {
-      let isGroup = name.charAt(name.length - 1) === '*';
-      let namepath = name.split('/');
-      let leaf = namepath[namepath.length - 1];
-      let parent = namepath.slice(0, namepath.length - 1).join('/');
-      let matches = leaf.match(/^(\D*)_(\d+)$/);
-
-      let prefix;
-      let id;
-      let suffix = '';
-      if (matches) {         // if found '<number>' in the name, assign id.
-        prefix = matches[1]; // the front non-numeric characters
-        id = matches[2]; // the digits
-      } else {  // for node without '_<number>', make them zero-th items.
-        prefix = isGroup ? leaf.substr(0, leaf.length - 1) : leaf;
-        id = 0;
-        suffix = isGroup ? '*' : '';
-      }
-      let seriesName = getSeriesNodeName(prefix, suffix, parent);
-      candidatesDict[seriesName] = candidatesDict[seriesName] || [];
-      let seriesNode = createSeriesNode(prefix, suffix, parent, +id, name);
-      candidatesDict[seriesName].push(seriesNode);
-    });
-
-    // In each group of nodes, group nodes in bunches that have monotonically
-    // increasing numbers in their names.  Each of these bunches is a series.
-    _.each(candidatesDict, function(seriesInfoArray: SeriesNode[], seriesName) {
-      if (seriesInfoArray.length < 2) {
-        return;
-      }
-      seriesInfoArray.sort(function(a, b) {
-        return (+a.clusterId) - (+b.clusterId);
-      });
-
-      // Loop through the nodes sorted by its detected series number, grouping
-      // all nodes with monotonically-increasing series numbers.
-      let seriesNodes = [seriesInfoArray[0]];
-      for (let index = 1; index < seriesInfoArray.length; index++) {
-        let nextNode = seriesInfoArray[index];
-        if (nextNode.clusterId === seriesNodes[seriesNodes.length - 1].clusterId
-            + 1) {
-          seriesNodes.push(nextNode);
-          continue;
-        }
-        addSeriesToDict(seriesNodes, seriesDict, +clusterId, metagraph);
-        seriesNodes = [nextNode];
-      }
-      addSeriesToDict(seriesNodes, seriesDict, +clusterId, metagraph);
-    });
-  });
-  return seriesDict;
-}
-
-/**
- * Add a series to the provided dictionary mapping series names to series.
- *
- * @param seriesNodes the nodes in the series. Contains
- *     name, id, prefix, suffix and parent properties of the node.
- * @param seriesDict the dictionary of series
- * @param clusterId ID of the template of the nodes of the series
- * @param metagraph
- */
-function addSeriesToDict(seriesNodes: SeriesNode[],
-    seriesDict: {[seriesName: string]: SeriesNode},
-    clusterId: number,
-    metagraph: graphlib.Graph<GroupNode|OpNode, Metaedge>) {
-  if (seriesNodes.length > 1) {
-    let curSeriesName = getSeriesNodeName(
-      seriesNodes[0].prefix, seriesNodes[0].suffix,
-      seriesNodes[0].parent, seriesNodes[0].clusterId,
-      seriesNodes[seriesNodes.length - 1].clusterId);
-    let curSeriesNode = createSeriesNode(seriesNodes[0].prefix,
-      seriesNodes[0].suffix, seriesNodes[0].parent, clusterId,
-      curSeriesName);
-    _.each(seriesNodes, function(node) {
-      curSeriesNode.ids.push(node.clusterId);
-      curSeriesNode.metagraph.setNode(node.name, metagraph.node(node.name));
-    });
-    seriesDict[curSeriesName] = curSeriesNode;
-  }
-}
-
-} // close module tf.graph.hierarchy
diff --git a/tensorflow/tensorboard/components/tf_graph_common/layout.ts b/tensorflow/tensorboard/components/tf_graph_common/layout.ts
deleted file mode 100644
index 1019e4f2694d01e3bba71f8f91294cfe61d14a35..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/layout.ts
+++ /dev/null
@@ -1,760 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-module tf.graph.layout {
-
-/** Set of parameters that define the look and feel of the graph. */
-export const PARAMS = {
-  animation: {
-    /** Default duration for graph animations in ms. */
-    duration: 250
-  },
-  graph: {
-    /** Graph parameter for metanode. */
-    meta: {
-      /**
-       * Dagre's nodesep param - number of pixels that
-       * separate nodes horizontally in the layout.
-       *
-       * See https://github.com/cpettitt/dagre/wiki#configuring-the-layout
-       */
-      nodeSep: 5,
-      /**
-       * Dagre's ranksep param - number of pixels
-       * between each rank in the layout.
-       *
-       * See https://github.com/cpettitt/dagre/wiki#configuring-the-layout
-       */
-      rankSep: 25,
-      /**
-       * Dagre's edgesep param - number of pixels that separate
-       * edges horizontally in the layout.
-       */
-      edgeSep: 5,
-    },
-    /** Graph parameter for metanode. */
-    series: {
-      /**
-       * Dagre's nodesep param - number of pixels that
-       * separate nodes horizontally in the layout.
-       *
-       * See https://github.com/cpettitt/dagre/wiki#configuring-the-layout
-       */
-      nodeSep: 5,
-      /**
-       * Dagre's ranksep param - number of pixels
-       * between each rank in the layout.
-       *
-       * See https://github.com/cpettitt/dagre/wiki#configuring-the-layout
-       */
-      rankSep: 25,
-      /**
-       * Dagre's edgesep param - number of pixels that separate
-       * edges horizontally in the layout.
-       */
-      edgeSep: 5
-    },
-    /**
-     * Padding is used to correctly position the graph SVG inside of its parent
-     * element. The padding amounts are applied using an SVG transform of X and
-     * Y coordinates.
-     */
-    padding: {paddingTop: 40, paddingLeft: 20}
-  },
-  subscene: {
-    meta: {
-      paddingTop: 10,
-      paddingBottom: 10,
-      paddingLeft: 10,
-      paddingRight: 10,
-      /**
-       * Used to leave room for the label on top of the highest node in
-       * the core graph.
-       */
-      labelHeight: 20,
-      /** X-space between each extracted node and the core graph. */
-      extractXOffset: 15,
-      /** Y-space between each extracted node. */
-      extractYOffset: 20
-    },
-    series: {
-      paddingTop: 10,
-      paddingBottom: 10,
-      paddingLeft: 10,
-      paddingRight: 10,
-      labelHeight: 10
-    }
-  },
-  nodeSize: {
-    /** Size of meta nodes. */
-    meta: {
-      radius: 5,
-      width: 60,
-      maxLabelWidth: 52,
-      /** A scale for the node's height based on number of nodes inside */
-      // Hack - set this as an any type to avoid issues in exporting a type
-      // from an external module.
-      height: (d3 as any).scaleLinear().domain([1, 200]).range([15, 60]).clamp(true),
-      /** The radius of the circle denoting the expand button. */
-      expandButtonRadius: 3
-    },
-    /** Size of op nodes. */
-    op: {
-      width: 15,
-      height: 6,
-      radius: 3,  // for making annotation touching ellipse
-      labelOffset: -8,
-      maxLabelWidth: 30
-    },
-    /** Size of series nodes. */
-    series: {
-      expanded: {
-        // For expanded series nodes, width and height will be
-        // computed to account for the subscene.
-        radius: 10,
-        labelOffset: 0,
-      },
-      vertical: {
-        // When unexpanded, series whose underlying metagraphs contain
-        // one or more non-control edges will show as a vertical stack
-        // of ellipses.
-        width: 16,
-        height: 13,
-        labelOffset: -13,
-      },
-      horizontal: {
-        // When unexpanded, series whose underlying metagraphs contain
-        // no non-control edges will show as a horizontal stack of
-        // ellipses.
-        width: 24,
-        height: 8,
-        radius: 10,  // Forces annotations to center line.
-        labelOffset: -10,
-      },
-    },
-    /** Size of bridge nodes. */
-    bridge: {
-      // NOTE: bridge nodes will normally be invisible, but they must
-      // take up some space so that the layout step leaves room for
-      // their edges.
-      width: 20,
-      height: 20,
-      radius: 2,
-      labelOffset: 0
-    }
-  },
-  shortcutSize: {
-    /** Size of shortcuts for op nodes */
-    op: {width: 10, height: 4},
-    /** Size of shortcuts for meta nodes */
-    meta: {width: 12, height: 4, radius: 1},
-    /** Size of shortcuts for series nodes */
-    series: {
-      width: 14,
-      height: 4,
-    }
-  },
-  annotations: {
-    /** Maximum possible width of the bounding box for in annotations */
-    inboxWidth: 50,
-    /** Maximum possible width of the bounding box for out annotations */
-    outboxWidth: 50,
-    /** X-space between the shape and each annotation-node. */
-    xOffset: 10,
-    /** Y-space between each annotation-node. */
-    yOffset: 3,
-    /** X-space between each annotation-node and its label. */
-    labelOffset: 2,
-    /** Defines the max width for annotation label */
-    maxLabelWidth: 120
-  },
-  constant: {size: {width: 4, height: 4}},
-  series: {
-    /** Maximum number of repeated item for unexpanded series node. */
-    maxStackCount: 3,
-    /**
-     * Positioning offset ratio for collapsed stack
-     * of parallel series (series without edges between its members).
-     */
-    parallelStackOffsetRatio: 0.2,
-    /**
-     * Positioning offset ratio for collapsed stack
-     * of tower series (series with edges between its members).
-     */
-    towerStackOffsetRatio: 0.5
-  },
-  minimap: {
-    /** The maximum width/height the minimap can have. */
-    size: 150
-  }
-};
-
-/** Calculate layout for a scene of a group node. */
-export function layoutScene(renderNodeInfo: render.RenderGroupNodeInfo): void {
-  // Update layout, size, and annotations of its children nodes and edges.
-  if (renderNodeInfo.node.isGroupNode) {
-    layoutChildren(renderNodeInfo);
-  }
-
-  // Update position of its children nodes and edges
-  if (renderNodeInfo.node.type === NodeType.META) {
-    layoutMetanode(renderNodeInfo);
-  } else if (renderNodeInfo.node.type === NodeType.SERIES) {
-    layoutSeriesNode(renderNodeInfo);
-  }
-};
-
-/**
- * Updates the total width of an unexpanded node which includes the size of its
- * in and out annotations.
- */
-function updateTotalWidthOfNode(renderInfo: render.RenderNodeInfo): void {
-  renderInfo.inboxWidth = renderInfo.inAnnotations.list.length > 0 ?
-      PARAMS.annotations.inboxWidth : 0;
-  renderInfo.outboxWidth = renderInfo.outAnnotations.list.length > 0 ?
-      PARAMS.annotations.outboxWidth : 0;
-  // Assign the width of the core box (the main shape of the node).
-  renderInfo.coreBox.width = renderInfo.width;
-  renderInfo.coreBox.height = renderInfo.height;
-  // TODO(jimbo): Account for font width rather than using a magic number.
-  let labelLength = renderInfo.node.name.length -
-      renderInfo.node.name.lastIndexOf(NAMESPACE_DELIM) - 1;
-  let charWidth = 3; // 3 pixels per character.
-  // Compute the total width of the node.
-  renderInfo.width = Math.max(renderInfo.coreBox.width +
-      renderInfo.inboxWidth + renderInfo.outboxWidth,
-      labelLength * charWidth);
-
-}
-
-/**
- * Update layout, size, and annotations of its children nodes and edges.
- */
-function layoutChildren(renderNodeInfo: render.RenderGroupNodeInfo): void {
-  let children = renderNodeInfo.coreGraph.nodes().map(n => {
-    return renderNodeInfo.coreGraph.node(n);
-  }).concat(renderNodeInfo.isolatedInExtract,
-      renderNodeInfo.isolatedOutExtract);
-
-  _.each(children, childNodeInfo => {
-    // Set size of each child
-    switch (childNodeInfo.node.type) {
-      case NodeType.OP:
-        _.extend(childNodeInfo, PARAMS.nodeSize.op);
-        break;
-      case NodeType.BRIDGE:
-        _.extend(childNodeInfo, PARAMS.nodeSize.bridge);
-        break;
-      case NodeType.META:
-        if (!childNodeInfo.expanded) {
-          // Set fixed width and scalable height based on cardinality
-          _.extend(childNodeInfo, PARAMS.nodeSize.meta);
-          childNodeInfo.height =
-              PARAMS.nodeSize.meta.height(childNodeInfo.node.cardinality);
-        } else {
-          let childGroupNodeInfo =
-            <render.RenderGroupNodeInfo>childNodeInfo;
-          layoutScene(childGroupNodeInfo); // Recursively layout its subscene.
-        }
-        break;
-      case NodeType.SERIES:
-        if (childNodeInfo.expanded) {
-          _.extend(childNodeInfo, PARAMS.nodeSize.series.expanded);
-          let childGroupNodeInfo =
-            <render.RenderGroupNodeInfo>childNodeInfo;
-          layoutScene(childGroupNodeInfo); // Recursively layout its subscene.
-        } else {
-          let childGroupNodeInfo =
-            <render.RenderGroupNodeInfo>childNodeInfo;
-          let seriesParams =
-            childGroupNodeInfo.node.hasNonControlEdges ?
-              PARAMS.nodeSize.series.vertical :
-              PARAMS.nodeSize.series.horizontal;
-          _.extend(childNodeInfo, seriesParams);
-        }
-        break;
-      default:
-        throw Error('Unrecognized node type: ' + childNodeInfo.node.type);
-    }
-    // Compute total width of un-expanded nodes. Width of expanded nodes
-    // has already been computed.
-    if (!childNodeInfo.expanded) {
-      updateTotalWidthOfNode(childNodeInfo);
-    }
-    // Layout each child's annotations
-    layoutAnnotation(childNodeInfo);
-  });
-}
-
-/**
- * Calculate layout for a graph using dagre
- * @param graph the graph to be laid out
- * @param params layout parameters
- * @return width and height of the core graph
- */
-function dagreLayout(
-    graph: graphlib.Graph<render.RenderNodeInfo, render.RenderMetaedgeInfo>,
-    params): {height: number, width: number} {
-  _.extend(graph.graph(), {
-    nodesep: params.nodeSep,
-    ranksep: params.rankSep,
-    edgesep: params.edgeSep
-  });
-  let bridgeNodeNames = [];
-  let nonBridgeNodeNames = [];
-
-  // Split out nodes into bridge and non-bridge nodes, and calculate the total
-  // width we should use for bridge nodes.
-  _.each(graph.nodes(), nodeName => {
-    let nodeInfo = graph.node(nodeName);
-    if (nodeInfo.node.type === NodeType.BRIDGE) {
-      bridgeNodeNames.push(nodeName);
-    } else {
-      nonBridgeNodeNames.push(nodeName);
-    }
-  });
-
-  // If there are no non-bridge nodes, then the graph has zero size.
-  if (!nonBridgeNodeNames.length) {
-    return {
-      width: 0,
-      height: 0,
-    };
-  }
-  dagre.layout(graph);
-
-  // Calculate the true bounding box of the graph by iterating over nodes and
-  // edges rather than accepting dagre's word for it. In particular, we should
-  // ignore the extra-wide bridge nodes and bridge edges, and allow for
-  // annotation boxes and labels.
-  let minX = Infinity;
-  let minY = Infinity;
-  let maxX = -Infinity;
-  let maxY = -Infinity;
-  _.each(nonBridgeNodeNames, nodeName => {
-    let nodeInfo = graph.node(nodeName);
-    let w = 0.5 * nodeInfo.width;
-    let x1 = nodeInfo.x - w;
-    let x2 = nodeInfo.x + w;
-    minX = x1 < minX ? x1 : minX;
-    maxX = x2 > maxX ? x2 : maxX;
-    // TODO(jimbo): Account for the height of labels above op nodes here.
-    let h = 0.5 * nodeInfo.height;
-    let y1 = nodeInfo.y - h;
-    let y2 = nodeInfo.y + h;
-    minY = y1 < minY ? y1 : minY;
-    maxY = y2 > maxY ? y2 : maxY;
-  });
-  _.each(graph.edges(), edgeObj => {
-    let edgeInfo = graph.edge(edgeObj);
-    if (edgeInfo.structural) {
-      return; // Skip structural edges from min/max calculations.
-    }
-
-    // Since the node size passed to dagre includes the in and out
-    // annotations, the endpoints of the edge produced by dagre may not
-    // point to the actual node shape (rectangle, ellipse). We correct the
-    // end-points by finding the intersection of a line between the
-    // next-to-last (next-to-first) point and the destination (source)
-    // rectangle.
-    let sourceNode = graph.node(edgeInfo.metaedge.v);
-    let destNode = graph.node(edgeInfo.metaedge.w);
-
-    // Straight 3-points edges are special case, since they are curved after
-    // our default correction. To keep them straight, we remove the mid point
-    // and correct the first and the last point to be the center of the
-    // source and destination node respectively.
-    if (edgeInfo.points.length === 3 && isStraightLine(edgeInfo.points)) {
-      if (sourceNode != null) {
-        let cxSource = sourceNode.expanded ?
-            sourceNode.x : computeCXPositionOfNodeShape(sourceNode);
-        edgeInfo.points[0].x = cxSource;
-      }
-      if (destNode != null) {
-        let cxDest = destNode.expanded ?
-            destNode.x : computeCXPositionOfNodeShape(destNode);
-        edgeInfo.points[2].x = cxDest;
-      }
-      // Remove the middle point so the edge doesn't curve.
-      edgeInfo.points = [edgeInfo.points[0], edgeInfo.points[1]];
-    }
-    // Correct the destination endpoint of the edge.
-    let nextToLastPoint = edgeInfo.points[edgeInfo.points.length - 2];
-    // The destination node might be null if this is a bridge edge.
-    if (destNode != null) {
-      edgeInfo.points[edgeInfo.points.length - 1] =
-          intersectPointAndNode(nextToLastPoint, destNode);
-    }
-    // Correct the source endpoint of the edge.
-    let secondPoint = edgeInfo.points[1];
-    // The source might be null if this is a bridge edge.
-    if (sourceNode != null) {
-      edgeInfo.points[0] = intersectPointAndNode(secondPoint, sourceNode);
-    }
-
-    _.each(edgeInfo.points, (point: render.Point) => {
-        minX = point.x < minX ? point.x : minX;
-        maxX = point.x > maxX ? point.x : maxX;
-        minY = point.y < minY ? point.y : minY;
-        maxY = point.y > maxY ? point.y : maxY;
-      });
-  });
-
-  // Shift all nodes and edge points to account for the left-padding amount,
-  // and the invisible bridge nodes.
-  _.each(graph.nodes(), nodeName => {
-    let nodeInfo = graph.node(nodeName);
-    nodeInfo.x -= minX;
-    nodeInfo.y -= minY;
-  });
-  _.each(graph.edges(), edgeObj => {
-    _.each(graph.edge(edgeObj).points, (point: render.Point) => {
-        point.x -= minX;
-        point.y -= minY;
-      });
-  });
-
-  return {
-    width: maxX - minX,
-    height: maxY - minY
-  };
-}
-
-/** Layout a metanode. Only called for an expanded node. */
-function layoutMetanode(renderNodeInfo: render.RenderGroupNodeInfo): void {
-  // First, copy params specific to meta nodes onto this render info object.
-  let params = PARAMS.subscene.meta;
-  _.extend(renderNodeInfo, params);
-  // Invoke dagre.layout() on the core graph and record the bounding box
-  // dimensions.
-  _.extend(renderNodeInfo.coreBox,
-      dagreLayout(renderNodeInfo.coreGraph, PARAMS.graph.meta));
-
-  // Calculate the position of nodes in isolatedInExtract relative to the
-  // top-left corner of inExtractBox (the bounding box for all inExtract nodes)
-  // and calculate the size of the inExtractBox.
-  let maxInExtractWidth = _.max(renderNodeInfo.isolatedInExtract,
-      renderNode => renderNode.width).width;
-  renderNodeInfo.inExtractBox.width = maxInExtractWidth != null ?
-      maxInExtractWidth : 0;
-
-  renderNodeInfo.inExtractBox.height =
-    _.reduce(renderNodeInfo.isolatedInExtract, (height, child, i) => {
-      let yOffset = i > 0 ? params.extractYOffset : 0;
-      // use width/height here to avoid overlaps between extracts
-      child.x = 0;
-      child.y = height + yOffset + child.height / 2;
-      return height + yOffset + child.height;
-    }, 0);
-
-  // Calculate the position of nodes in isolatedOutExtract relative to the
-  // top-left corner of outExtractBox (the bounding box for all outExtract
-  // nodes) and calculate the size of the outExtractBox.
-  let maxOutExtractWidth = _.max(renderNodeInfo.isolatedOutExtract,
-      renderNode => renderNode.width).width;
-  renderNodeInfo.outExtractBox.width = maxOutExtractWidth != null ?
-      maxOutExtractWidth : 0;
-
-  renderNodeInfo.outExtractBox.height =
-    _.reduce(renderNodeInfo.isolatedOutExtract, (height, child, i) => {
-      let yOffset = i > 0 ? params.extractYOffset : 0;
-      // use width/height here to avoid overlaps between extracts
-      child.x = 0;
-      child.y = height + yOffset + child.height / 2;
-      return height + yOffset + child.height;
-    }, 0);
-
-  // Compute the total padding between the core graph, in-extract and
-  // out-extract boxes.
-  let numParts = 0;
-  if (renderNodeInfo.isolatedInExtract.length > 0) {
-    numParts++;
-  }
-  if (renderNodeInfo.isolatedOutExtract.length > 0) {
-    numParts++;
-  }
-  if (renderNodeInfo.coreGraph.nodeCount() > 0) {
-    numParts++;
-  }
-  let offset = PARAMS.subscene.meta.extractXOffset;
-  let padding = numParts <= 1 ? 0 : (numParts  <= 2 ? offset : 2 * offset);
-
-  // Add the in-extract and out-extract width to the core box width.
-  renderNodeInfo.coreBox.width += renderNodeInfo.inExtractBox.width +
-      renderNodeInfo.outExtractBox.width + padding;
-  renderNodeInfo.coreBox.height =
-    params.labelHeight +
-    Math.max(
-      renderNodeInfo.inExtractBox.height,
-      renderNodeInfo.coreBox.height,
-      renderNodeInfo.outExtractBox.height
-  );
-  // Determine the whole metanode's width (from left to right).
-  renderNodeInfo.width = renderNodeInfo.coreBox.width +
-      params.paddingLeft + params.paddingRight;
-
-  // Determine the whole metanode's height (from top to bottom).
-  renderNodeInfo.height =
-      renderNodeInfo.paddingTop +
-      renderNodeInfo.coreBox.height +
-      renderNodeInfo.paddingBottom;
-}
-
-/**
- * Calculate layout for series node's core graph. Only called for an expanded
- * series.
- */
-function layoutSeriesNode(node: render.RenderGroupNodeInfo): void {
-  let graph = node.coreGraph;
-
-  let params = PARAMS.subscene.series;
-  _.extend(node, params);
-
-  // Layout the core.
-  _.extend(node.coreBox, dagreLayout(node.coreGraph, PARAMS.graph.series));
-
-  _.each(graph.nodes(), nodeName => {
-    graph.node(nodeName).excluded = false;
-  });
-
-  // Series do not have in/outExtractBox so no need to include them here.
-  node.width = node.coreBox.width + params.paddingLeft + params.paddingRight;
-  node.height = node.coreBox.height + params.paddingTop + params.paddingBottom;
-}
-
-/**
- * Calculate layout for annotations of a given node.
- * This will modify positions of the given node and its annotations.
- *
- * @see tf.graph.render.Node and tf.graph.render.Annotation
- * for description of each property of each render node.
- *
- */
-function layoutAnnotation(renderNodeInfo: render.RenderNodeInfo): void {
-  // If the render node is an expanded metanode, then its annotations will not
-  // be visible and we should skip the annotation calculations.
-  if (renderNodeInfo.expanded) {
-    return;
-  }
-
-  let inAnnotations = renderNodeInfo.inAnnotations.list;
-  let outAnnotations = renderNodeInfo.outAnnotations.list;
-
-  // Calculate size for in-annotations
-  _.each(inAnnotations, a => sizeAnnotation(a));
-
-  // Calculate size for out-annotations
-  _.each(outAnnotations, a => sizeAnnotation(a));
-
-  let params = PARAMS.annotations;
-
-  // Calculate annotation node position (a.dx, a.dy)
-  // and total height for in-annotations
-  // After this chunk of code:
-  // inboxHeight = sum of annotation heights+ (annotation.length - 1 * yOffset)
-  let inboxHeight = _.reduce(inAnnotations,
-      (height, a, i) => {
-        let yOffset = i > 0 ? params.yOffset : 0;
-        a.dx = -(renderNodeInfo.coreBox.width + a.width) / 2 - params.xOffset;
-        a.dy = height + yOffset + a.height / 2;
-        return height + yOffset + a.height;
-      }, 0);
-
-  _.each(inAnnotations, a => {
-    a.dy -= inboxHeight / 2;
-
-    a.labelOffset = params.labelOffset;
-  });
-
-  // Calculate annotation node position (a.dx, a.dy)
-  // and total height for out-annotations
-  // After this chunk of code:
-  // outboxHeight = sum of annotation heights +
-  //                (annotation.length - 1 * yOffset)
-  let outboxHeight = _.reduce(outAnnotations,
-      (height, a, i) => {
-        let yOffset = i > 0 ? params.yOffset : 0;
-        a.dx = (renderNodeInfo.coreBox.width + a.width) / 2 + params.xOffset;
-        a.dy = height + yOffset + a.height / 2;
-        return height + yOffset + a.height;
-      }, 0);
-
-  _.each(outAnnotations, a => {
-    // adjust by (half of ) the total height
-    // so dy is relative to the host node's center.
-    a.dy -= outboxHeight / 2;
-
-    a.labelOffset = params.labelOffset;
-  });
-
-  // Creating scales for touch point between the in-annotation edges
-  // and their hosts.
-
-  let inTouchHeight =
-      Math.min(renderNodeInfo.height / 2 - renderNodeInfo.radius,
-          inboxHeight / 2);
-  inTouchHeight = inTouchHeight < 0 ? 0 : inTouchHeight;
-
-  let inY = d3.scaleLinear()
-    .domain([0, inAnnotations.length - 1])
-    .range([-inTouchHeight, inTouchHeight]);
-
-  // Calculate annotation edge position
-  _.each(inAnnotations, (a, i) => {
-    a.points = [
-      // The annotation node end
-      {
-        dx: a.dx + a.width / 2,
-        dy: a.dy
-      },
-
-      // The host node end
-      {
-        dx: - renderNodeInfo.coreBox.width / 2,
-        // only use scale if there are more than one,
-        // otherwise center it vertically
-        dy: inAnnotations.length > 1 ? inY(i) : 0
-      }
-    ];
-  });
-
-  // Creating scales for touch point between the out-annotation edges
-  // and their hosts.
-  let outTouchHeight =
-      Math.min(renderNodeInfo.height / 2 - renderNodeInfo.radius,
-          outboxHeight / 2);
-  outTouchHeight = outTouchHeight < 0 ? 0 : outTouchHeight;
-  let outY = d3.scaleLinear()
-    .domain([0, outAnnotations.length - 1])
-    .range([-outTouchHeight, outTouchHeight]);
-
-  _.each(outAnnotations, (a, i) => {
-    // Add point from the border of the annotation node
-    a.points = [
-      // The host node end
-      {
-        dx: renderNodeInfo.coreBox.width / 2,
-        // only use scale if there are more than one,
-        // otherwise center it vertically
-        dy: outAnnotations.length > 1 ? outY(i) : 0
-      },
-      // The annotation node end
-      {
-        dx: a.dx - a.width / 2,
-        dy: a.dy
-      }
-    ];
-  });
-
-  renderNodeInfo.height =
-      Math.max(renderNodeInfo.height, inboxHeight, outboxHeight);
-}
-
-/**
- * Set size of an annotation node.
- */
-function sizeAnnotation(a: render.Annotation): void {
-  switch (a.annotationType) {
-    case render.AnnotationType.CONSTANT:
-      _.extend(a, PARAMS.constant.size);
-      break;
-    case render.AnnotationType.SHORTCUT:
-      if (a.node.type === NodeType.OP) {
-        _.extend(a, PARAMS.shortcutSize.op);
-      } else if (a.node.type === NodeType.META) {
-        _.extend(a, PARAMS.shortcutSize.meta);
-      } else if (a.node.type === NodeType.SERIES) {
-        _.extend(a, PARAMS.shortcutSize.series);
-      } else {
-        throw Error('Invalid node type: ' + a.node.type);
-      }
-      break;
-    case render.AnnotationType.SUMMARY:
-      _.extend(a, PARAMS.constant.size);
-      break;
-  }
-}
-
-/**
- * Determines the center position of the node's shape. The position depends
- * on if the node has in and out-annotations.
- */
-export function computeCXPositionOfNodeShape(renderInfo: render.RenderNodeInfo):
-    number {
-  if (renderInfo.expanded) {
-    return renderInfo.x;
-  }
-  let dx = renderInfo.inAnnotations.list.length ? renderInfo.inboxWidth : 0;
-  return renderInfo.x - renderInfo.width / 2 + dx +
-      renderInfo.coreBox.width / 2;
-}
-
-/** Returns the angle (in degrees) between two points. */
-function angleBetweenTwoPoints(a: render.Point, b: render.Point): number {
-  let dx = b.x - a.x;
-  let dy = b.y - a.y;
-  return 180 * Math.atan(dy / dx) / Math.PI;
-}
-
-/**
- * Returns if a line going through the specified points is a straight line.
- */
-function isStraightLine(points: render.Point[]) {
-  let angle = angleBetweenTwoPoints(points[0], points[1]);
-  for (let i = 1; i < points.length - 1; i++) {
-    let newAngle = angleBetweenTwoPoints(points[i], points[i + 1]);
-    // Have a tolerance of 1 degree.
-    if (Math.abs(newAngle - angle) > 1) {
-      return false;
-    }
-    angle = newAngle;
-  }
-  return true;
-}
-
-/**
- * Returns the intersection of a line between the provided point
- * and the provided rectangle.
- */
-function intersectPointAndNode(
-    point: render.Point, node: render.RenderNodeInfo): render.Point {
-  // cx and cy are the center of the rectangle.
-  let cx = node.expanded ?
-     node.x : computeCXPositionOfNodeShape(node);
-  let cy = node.y;
-  // Calculate the slope
-  let dx = point.x - cx;
-  let dy = point.y - cy;
-  let w = node.expanded ? node.width : node.coreBox.width;
-  let h = node.expanded ? node.height : node.coreBox.height;
-  let deltaX, deltaY;
-  if (Math.abs(dy) * w / 2  > Math.abs(dx) * h / 2) {
-    // The intersection is above or below the rectangle.
-    if (dy < 0) {
-      h = -h;
-    }
-    deltaX = dy === 0 ? 0 : h / 2 * dx / dy;
-    deltaY = h / 2;
-  } else {
-    // The intersection is left or right of the rectangle.
-    if (dx < 0) {
-      w = -w;
-    }
-    deltaX = w / 2;
-    deltaY = dx === 0 ? 0 : w / 2 * dy / dx;
-  }
-  return {x: cx + deltaX, y: cy + deltaY};
-}
-
-} // close module
diff --git a/tensorflow/tensorboard/components/tf_graph_common/minimap.ts b/tensorflow/tensorboard/components/tf_graph_common/minimap.ts
deleted file mode 100644
index 8129df3a4268803d5105ce6a8e31755c9e40f470..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/minimap.ts
+++ /dev/null
@@ -1,328 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-module tf.scene {
-
-/** Show minimap when the viewpoint area is less than X% of the whole area. */
-const FRAC_VIEWPOINT_AREA: number = 0.8;
-
-export class Minimap {
-  /** The minimap container. */
-  private minimap: HTMLElement;
-  /** The canvas used for drawing the mini version of the svg. */
-  private canvas: HTMLCanvasElement;
-  /** A buffer canvas used for temporary drawing to avoid flickering. */
-  private canvasBuffer: HTMLCanvasElement;
-  private download: HTMLLinkElement;
-  private downloadCanvas: HTMLCanvasElement;
-
-  /** The minimap svg used for holding the viewpoint rectangle. */
-  private minimapSvg: SVGSVGElement;
-  /** The rectangle showing the current viewpoint. */
-  private viewpoint: SVGRectElement;
-  /**
-   * The scale factor for the minimap. The factor is determined automatically
-   * so that the minimap doesn't violate the maximum width/height specified
-   * in the constructor. The minimap maintains the same aspect ratio as the
-   * original svg.
-   */
-  private scaleMinimap: number;
-  /** The main svg element. */
-  private svg: SVGSVGElement;
-  /** The svg group used for panning and zooming the main svg. */
-  private zoomG: SVGGElement;
-  /** The zoom behavior of the main svg. */
-  private mainZoom: d3.ZoomBehavior<any, any>;
-  /** The maximum width and height for the minimap. */
-  private maxWandH: number;
-  /** The last translation vector used in the main svg. */
-  private translate: [number, number];
-  /** The last scaling factor used in the main svg. */
-  private scaleMain: number;
-  /** The coordinates of the viewpoint rectangle. */
-  private viewpointCoord: {x: number, y: number};
-  /** The current size of the minimap */
-  private minimapSize: {width: number, height: number};
-  /** Padding (px) due to the main labels of the graph. */
-  private labelPadding: number;
-  /**
-   * Constructs a new minimap.
-   *
-   * @param svg The main svg element.
-   * @param zoomG The svg group used for panning and zooming the main svg.
-   * @param mainZoom The main zoom behavior.
-   * @param minimap The minimap container.
-   * @param maxWandH The maximum width/height for the minimap.
-   * @param labelPadding Padding in pixels due to the main graph labels.
-   */
-  constructor(svg: SVGSVGElement, zoomG: SVGGElement,
-      mainZoom: d3.ZoomBehavior<any, any>, minimap: HTMLElement,
-      maxWandH: number, labelPadding: number) {
-    this.svg = svg;
-    this.labelPadding = labelPadding;
-    this.zoomG = zoomG;
-    this.mainZoom = mainZoom;
-    this.maxWandH = maxWandH;
-    let $minimap = d3.select(minimap);
-    // The minimap will have 2 main components: the canvas showing the content
-    // and an svg showing a rectangle of the currently zoomed/panned viewpoint.
-    let $minimapSvg = $minimap.select('svg');
-
-    // Make the viewpoint rectangle draggable.
-    let $viewpoint = $minimapSvg.select('rect');
-    let dragmove = (d) => {
-      this.viewpointCoord.x = (<DragEvent>d3.event).x;
-      this.viewpointCoord.y = (<DragEvent>d3.event).y;
-      this.updateViewpoint();
-    };
-    this.viewpointCoord = {x: 0, y: 0};
-    let drag = d3.drag().subject(Object).on('drag', dragmove);
-    $viewpoint.datum(this.viewpointCoord as any).call(drag);
-
-    // Make the minimap clickable.
-    $minimapSvg.on('click', () => {
-      if ((<Event>d3.event).defaultPrevented) {
-        // This click was part of a drag event, so suppress it.
-        return;
-      }
-      // Update the coordinates of the viewpoint.
-      let width = Number($viewpoint.attr('width'));
-      let height = Number($viewpoint.attr('height'));
-      let clickCoords = d3.mouse($minimapSvg.node() as any);
-      this.viewpointCoord.x = clickCoords[0] - width / 2;
-      this.viewpointCoord.y = clickCoords[1] - height / 2;
-      this.updateViewpoint();
-    });
-    this.viewpoint = <SVGRectElement>$viewpoint.node();
-    this.minimapSvg = <SVGSVGElement>$minimapSvg.node();
-    this.minimap = minimap;
-    this.canvas = <HTMLCanvasElement>$minimap.select('canvas.first').node();
-    this.canvasBuffer =
-        <HTMLCanvasElement>$minimap.select('canvas.second').node();
-    this.downloadCanvas =
-        <HTMLCanvasElement>$minimap.select('canvas.download').node();
-    d3.select(this.downloadCanvas).style('display', 'none');
-    this.update();
-  }
-
-  /**
-   * Updates the position and the size of the viewpoint rectangle.
-   * It also notifies the main svg about the new panned position.
-   */
-  private updateViewpoint(): void {
-    // Update the coordinates of the viewpoint rectangle.
-    d3.select(this.viewpoint)
-        .attr('x', this.viewpointCoord.x)
-        .attr('y', this.viewpointCoord.y);
-    // Update the translation vector of the main svg to reflect the
-    // new viewpoint.
-    let mainX = - this.viewpointCoord.x * this.scaleMain / this.scaleMinimap;
-    let mainY = - this.viewpointCoord.y * this.scaleMain / this.scaleMinimap;
-    d3.select(this.svg).call(
-        this.mainZoom.transform,
-        d3.zoomIdentity.translate(mainX, mainY).scale(this.scaleMain));
-  }
-
-  /**
-   * Redraws the minimap. Should be called whenever the main svg
-   * was updated (e.g. when a node was expanded).
-   */
-  update(): void {
-    let sceneSize = null;
-    try {
-      // Get the size of the entire scene.
-      sceneSize = this.zoomG.getBBox();
-      if (sceneSize.width === 0) {
-        // There is no scene anymore. We have been detached from the dom.
-        return;
-      }
-    } catch (e) {
-      // Firefox produced NS_ERROR_FAILURE if we have been
-      // detached from the dom.
-      return;
-    }
-    let $download = d3.select('#graphdownload');
-    this.download = <HTMLLinkElement>$download.node();
-    $download.on('click', d => {
-      this.download.href = this.downloadCanvas.toDataURL('image/png');
-    });
-
-    let $svg = d3.select(this.svg);
-    // Read all the style rules in the document and embed them into the svg.
-    // The svg needs to be self contained, i.e. all the style rules need to be
-    // embedded so the canvas output matches the origin.
-    let stylesText = '';
-    for (let k = 0; k < document.styleSheets.length; k++) {
-      try {
-        let cssRules = (<any>document.styleSheets[k]).cssRules ||
-          (<any>document.styleSheets[k]).rules;
-        if (cssRules == null) {
-          continue;
-        }
-        for (let i = 0; i < cssRules.length; i++) {
-          // Remove tf-* selectors from the styles.
-          stylesText +=
-              cssRules[i].cssText.replace(/ ?tf-[\w-]+ ?/g, '') + '\n';
-        }
-      } catch (e) {
-        if (e.name !== 'SecurityError') {
-          throw e;
-        }
-      }
-    }
-
-    // Temporarily add the css rules to the main svg.
-    let svgStyle = $svg.append('style');
-    svgStyle.text(stylesText);
-
-    // Temporarily remove the zoom/pan transform from the main svg since we
-    // want the minimap to show a zoomed-out and centered view.
-    let $zoomG = d3.select(this.zoomG);
-    let zoomTransform = $zoomG.attr('transform');
-    $zoomG.attr('transform', null);
-
-    // Since we add padding, account for that here.
-    sceneSize.height += this.labelPadding * 2;
-    sceneSize.width += this.labelPadding * 2;
-
-    // Temporarily assign an explicit width/height to the main svg, since
-    // it doesn't have one (uses flex-box), but we need it for the canvas
-    // to work.
-    $svg
-      .attr('width', sceneSize.width)
-      .attr('height', sceneSize.height);
-
-    // Since the content inside the svg changed (e.g. a node was expanded),
-    // the aspect ratio have also changed. Thus, we need to update the scale
-    // factor of the minimap. The scale factor is determined such that both
-    // the width and height of the minimap are <= maximum specified w/h.
-    this.scaleMinimap =
-        this.maxWandH / Math.max(sceneSize.width, sceneSize.height);
-
-    this.minimapSize = {
-      width: sceneSize.width * this.scaleMinimap,
-      height: sceneSize.height * this.scaleMinimap
-    };
-
-    // Update the size of the minimap's svg, the buffer canvas and the
-    // viewpoint rect.
-    d3.select(this.minimapSvg).attr(<any>this.minimapSize);
-    d3.select(this.canvasBuffer).attr(<any>this.minimapSize);
-
-    // Download canvas width and height are multiples of the style width and
-    // height in order to increase pixel density of the PNG for clarity.
-    d3.select(this.downloadCanvas).style(
-      <any>{ width: sceneSize.width, height: sceneSize.height });
-    d3.select(this.downloadCanvas).attr(
-      <any>{ width: sceneSize.width * 3, height: sceneSize.height * 3 });
-
-    if (this.translate != null && this.zoom != null) {
-      // Update the viewpoint rectangle shape since the aspect ratio of the
-      // map has changed.
-      requestAnimationFrame(() => this.zoom());
-    }
-
-    // Serialize the main svg to a string which will be used as the rendering
-    // content for the canvas.
-    let svgXml = (new XMLSerializer()).serializeToString(this.svg);
-
-    // Now that the svg is serialized for rendering, remove the temporarily
-    // assigned styles, explicit width and height and bring back the pan/zoom
-    // transform.
-    svgStyle.remove();
-    $svg.attr('width', null).attr('height', null);
-
-    $zoomG.attr('transform', zoomTransform);
-    let image = new Image();
-    image.onload = () => {
-      // Draw the svg content onto the buffer canvas.
-      let context = this.canvasBuffer.getContext('2d');
-      context.clearRect(0, 0, this.canvasBuffer.width,
-          this.canvasBuffer.height);
-      context.drawImage(image, 0, 0,
-        this.minimapSize.width, this.minimapSize.height);
-      requestAnimationFrame(() => {
-        // Hide the old canvas and show the new buffer canvas.
-        d3.select(this.canvasBuffer).style('display', null);
-        d3.select(this.canvas).style('display', 'none');
-        // Swap the two canvases.
-        [this.canvas, this.canvasBuffer] = [this.canvasBuffer, this.canvas];
-      });
-      let downloadContext = this.downloadCanvas.getContext('2d');
-      downloadContext.clearRect(0, 0, this.downloadCanvas.width,
-        this.downloadCanvas.height);
-      downloadContext.drawImage(image, 0, 0,
-        this.downloadCanvas.width, this.downloadCanvas.height);
-    };
-    image.onerror = () => {
-      let blob = new Blob([svgXml], {type: 'image/svg+xml;charset=utf-8'});
-      image.src = URL.createObjectURL(blob);
-    };
-    image.src =
-        'data:image/svg+xml;charset=utf-8,' + encodeURIComponent(svgXml);
-  }
-
-  /**
-   * Handles changes in zooming/panning. Should be called from the main svg
-   * to notify that a zoom/pan was performed and this minimap will update it's
-   * viewpoint rectangle.
-   *
-   * @param translate The translate vector, or none to use the last used one.
-   * @param scale The scaling factor, or none to use the last used one.
-   */
-  zoom(transform?: d3.ZoomTransform): void {
-    if (this.scaleMinimap == null) {
-      // Scene is not ready yet.
-      return;
-    }
-    // Update the new translate and scale params, only if specified.
-    if (transform) {
-      this.translate = [transform.x, transform.y];
-      this.scaleMain = transform.k;
-    }
-
-    // Update the location of the viewpoint rectangle.
-    let svgRect = this.svg.getBoundingClientRect();
-    let $viewpoint = d3.select(this.viewpoint);
-    this.viewpointCoord.x = -this.translate[0] * this.scaleMinimap /
-        this.scaleMain;
-    this.viewpointCoord.y = -this.translate[1] * this.scaleMinimap /
-        this.scaleMain;
-    let viewpointWidth = svgRect.width * this.scaleMinimap / this.scaleMain;
-    let viewpointHeight = svgRect.height * this.scaleMinimap / this.scaleMain;
-    $viewpoint
-      .attr('x', this.viewpointCoord.x)
-      .attr('y', this.viewpointCoord.y)
-      .attr('width', viewpointWidth)
-      .attr('height', viewpointHeight);
-    // Show/hide the minimap depending on the viewpoint area as fraction of the
-    // whole minimap.
-    let mapWidth = this.minimapSize.width;
-    let mapHeight = this.minimapSize.height;
-    let x = this.viewpointCoord.x;
-    let y = this.viewpointCoord.y;
-    let w = Math.min(Math.max(0, x + viewpointWidth), mapWidth) -
-        Math.min(Math.max(0, x), mapWidth);
-    let h = Math.min(Math.max(0, y + viewpointHeight), mapHeight) -
-        Math.min(Math.max(0, y), mapHeight);
-    let fracIntersect = (w * h) / (mapWidth * mapHeight);
-    if (fracIntersect < FRAC_VIEWPOINT_AREA) {
-      this.minimap.classList.remove('hidden');
-    } else {
-      this.minimap.classList.add('hidden');
-    }
-  }
-}
-
-} // close module tf.scene
diff --git a/tensorflow/tensorboard/components/tf_graph_common/node.ts b/tensorflow/tensorboard/components/tf_graph_common/node.ts
deleted file mode 100644
index f090a51fc4ec58e82220e6675872baba45861c61..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/node.ts
+++ /dev/null
@@ -1,1072 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-module tf.graph.scene.node {
-  import RenderNodeInfo = tf.graph.render.RenderNodeInfo;
-  /**
-   * Select or Create a 'g.nodes' group to a given sceneGroup
-   * and builds a number of 'g.node' groups inside the group.
-   *
-   * Structure Pattern:
-   *
-   * <g class='nodes'>
-   *   <g class='node'>
-   *     <g class='in-annotations'>
-   *       ...
-   *     </g>
-   *     <g class='out-annotations'>
-   *       ...
-   *     </g>
-   *     <g class='nodeshape'>
-   *      <!--
-   *      Content of the node shape should be for the node itself. For example a
-   *      Metanode would have a <rect> with rounded edges, an op would have an
-   *      <ellipse>. More complex nodes like series may contain multiple
-   *      elements which are conditionally visible based on whether the node is
-   *      expanded.
-   *      -->
-   *     </g>
-   *     <text class='label'>node name</text>
-   *     <g class='subscene'>
-   *       <!--
-   *       Content of  the subscene (only for metanode and series node).
-   *
-   *       Subscene is a svg group that contains content of the
-   *       metanode's metagraph that is recursively generated by Scene.build().
-   *
-   *       When the graph is expanded multiple times, a subscene can contain
-   *       nested subscenes inside.
-   *       -->
-   *     </g>
-   *   </g>
-   *   ...
-   * </g>
-   *
-   *
-   * @param sceneGroup selection of the container
-   * @param nodeData array of render node information to map
-   * @param sceneElement <tf-graph-scene> polymer element
-   * @return selection of the created nodeGroups
-   */
-  export function buildGroup(
-      sceneGroup, nodeData: render.RenderNodeInfo[], sceneElement) {
-    let container =
-        scene.selectOrCreateChild(sceneGroup, 'g', Class.Node.CONTAINER);
-    // Select all children and join with data.
-    // (Note that all children of g.nodes are g.node)
-    let nodeGroups =
-        (container as any).selectAll(function() {return this.childNodes;})
-            .data(nodeData, (d) => {
-              // make sure that we don't have to swap shape type
-              return d.node.name + ':' + d.node.type;
-            });
-
-    // ENTER
-    nodeGroups.enter()
-        .append('g')
-        .attr('data-name', d => { return d.node.name; })
-        .each(function(d) {
-          let nodeGroup = d3.select(this);
-          // index node group for quick stylizing
-          sceneElement.addNodeGroup(d.node.name, nodeGroup);
-        })
-        .merge(nodeGroups)
-        // ENTER + UPDATE
-        .attr('class', d => { return Class.Node.GROUP + ' ' + nodeClass(d); })
-        .each(function(d) {
-          let nodeGroup = d3.select(this);
-          // Add g.in-annotations (always add -- to keep layer order
-          // consistent.)
-          let inAnnotationBox =
-              scene.selectOrCreateChild(nodeGroup, 'g', Class.Annotation.INBOX);
-          annotation.buildGroup(
-              inAnnotationBox, d.inAnnotations, d, sceneElement);
-
-          // Add g.out-annotations  (always add -- to keep layer order
-          // consistent.)
-          let outAnnotationBox = scene.selectOrCreateChild(
-              nodeGroup, 'g', Class.Annotation.OUTBOX);
-          annotation.buildGroup(
-              outAnnotationBox, d.outAnnotations, d, sceneElement);
-
-          // Build .shape first (background of the node).
-          let shape = buildShape(nodeGroup, d, Class.Node.SHAPE);
-          if (d.node.isGroupNode) {
-            addButton(shape, d, sceneElement);
-          }
-          addInteraction(shape, d, sceneElement);
-
-          // Build subscene on the top.
-          subsceneBuild(nodeGroup, <render.RenderGroupNodeInfo>d, sceneElement);
-
-          // Build label last. Should be on top of everything else.
-          let label = labelBuild(nodeGroup, d, sceneElement);
-          // Do not add interaction to metanode labels as they live inside the
-          // metanode shape which already has the same interactions.
-          addInteraction(label, d, sceneElement, d.node.type === NodeType.META);
-
-          stylize(nodeGroup, d, sceneElement);
-          position(nodeGroup, d);
-        });
-
-    // EXIT
-    nodeGroups.exit()
-        .each(function(d) {
-          // remove all indices on remove
-          sceneElement.removeNodeGroup(d.node.name);
-
-          let nodeGroup = d3.select(this);
-          if (d.inAnnotations.list.length > 0) {
-            nodeGroup.select('.' + Class.Annotation.INBOX)
-                .selectAll('.' + Class.Annotation.GROUP)
-                .each(a => { sceneElement.removeAnnotationGroup(a, d); });
-          }
-          if (d.outAnnotations.list.length > 0) {
-            nodeGroup.select('.' + Class.Annotation.OUTBOX)
-                .selectAll('.' + Class.Annotation.GROUP)
-                .each(a => { sceneElement.removeAnnotationGroup(a, d); });
-          }
-        })
-        .remove();
-    return nodeGroups;
-};
-
-/**
- * Update or remove the subscene of a render group node depending on whether it
- * is a expanded. If the node is not a group node, this method has no effect.
- *
- * @param nodeGroup selection of the container
- * @param renderNodeInfo the render information for the node.
- * @param sceneElement <tf-graph-scene> polymer element.
- * @return Selection of the subscene group, or null if node group does not have
- *        a subscene. Op nodes, bridge nodes and unexpanded group nodes will
- *        not have a subscene.
- */
-function subsceneBuild(nodeGroup,
-    renderNodeInfo: render.RenderGroupNodeInfo, sceneElement) {
-  if (renderNodeInfo.node.isGroupNode) {
-    if (renderNodeInfo.expanded) {
-      // Recursively build the subscene.
-      return scene.buildGroup(nodeGroup, renderNodeInfo, sceneElement,
-        Class.Subscene.GROUP);
-    }
-    // Clean out existing subscene if the node is not expanded.
-    scene.selectChild(nodeGroup, 'g', Class.Subscene.GROUP).remove();
-  }
-  return null;
-};
-
-/**
- * Translate the subscene of the given node group
- */
-function subscenePosition(nodeGroup, d: render.RenderNodeInfo) {
-  let x0 = d.x - d.width / 2.0 + d.paddingLeft;
-  let y0 = d.y - d.height / 2.0 + d.paddingTop;
-
-  let subscene = scene.selectChild(nodeGroup, 'g', Class.Subscene.GROUP);
-  scene.translate(subscene, x0, y0);
-};
-
-/**
- * Add an expand/collapse button to a group node
- *
- * @param selection The group node selection.
- * @param d Info about the node being rendered.
- * @param sceneElement <tf-graph-scene> polymer element.
- */
-function addButton(selection, d: render.RenderNodeInfo, sceneElement) {
-  let group =
-      scene.selectOrCreateChild(selection, 'g', Class.Node.BUTTON_CONTAINER);
-  scene.selectOrCreateChild(group, 'circle', Class.Node.BUTTON_CIRCLE);
-  scene.selectOrCreateChild(group, 'path', Class.Node.EXPAND_BUTTON)
-      .attr('d', 'M0,-2.2 V2.2 M-2.2,0 H2.2');
-  scene.selectOrCreateChild(group, 'path', Class.Node.COLLAPSE_BUTTON)
-      .attr('d', 'M-2.2,0 H2.2');
-  (group as any).on('click', (d: any) => {
-    // Stop this event's propagation so that it isn't also considered a
-    // node-select.
-    (<Event>d3.event).stopPropagation();
-    sceneElement.fire('node-toggle-expand', {name: d.node.name});
-  });
-  scene.positionButton(group, d);
-};
-
-/**
- * Fire node-* events when the selection is interacted.
- *
- * @param disableInteraction When true, have the provided selection
- * ignore all pointer events. Used for text labels inside of metanodes, which
- * don't need interaction as their surrounding shape has interaction, and if
- * given interaction would cause conflicts with the expand/collapse button.
- */
-function addInteraction(selection, d: render.RenderNodeInfo,
-    sceneElement, disableInteraction?: boolean) {
-  if (disableInteraction) {
-    selection.attr('pointer-events', 'none');
-    return;
-  }
-
-  let contextMenuFunction = contextmenu.getMenu(
-    getContextMenu(d.node, sceneElement));
-  selection
-      .on('dblclick',
-          d => {
-            sceneElement.fire('node-toggle-expand', {name: d.node.name});
-          })
-      .on('mouseover',
-          d => {
-            // don't send mouseover over expanded group,
-            // otherwise it is causing too much glitches
-            if (sceneElement.isNodeExpanded(d)) {
-              return;
-            }
-
-            sceneElement.fire('node-highlight', {name: d.node.name});
-          })
-      .on('mouseout',
-          d => {
-            // don't send mouseover over expanded group,
-            // otherwise it is causing too much glitches
-            if (sceneElement.isNodeExpanded(d)) {
-              return;
-            }
-
-            sceneElement.fire('node-unhighlight', {name: d.node.name});
-          })
-      .on('click',
-          d => {
-            // Stop this event's propagation so that it isn't also considered
-            // a graph-select.
-            (<Event>d3.event).stopPropagation();
-            sceneElement.fire('node-select', {name: d.node.name});
-          })
-      .on('contextmenu', (d, i) => {
-        sceneElement.fire('node-select', {name: d.node.name});
-        contextMenuFunction.call(d, i);
-      });
-};
-
-/**
- * Returns the d3 context menu specification for the provided node.
- */
-export function getContextMenu(node: Node, sceneElement) {
-  let menu = [{
-    title: (d): string => {
-      return getIncludeNodeButtonString(node.include);
-    },
-    action: (elm, d, i) => {
-      sceneElement.fire('node-toggle-extract', {name: node.name});
-    }
-  }];
-  if (canBeInSeries(node)) {
-    menu.push({
-      title: d => { return getGroupSettingLabel(node); },
-      action: (elm, d, i) => {
-        sceneElement.fire(
-            'node-toggle-seriesgroup', {name: getSeriesName(node)});
-      }
-    });
-  }
-  return menu;
-}
-
-/** Returns if a node can be part of a grouped series */
-export function canBeInSeries(node: Node) {
-  return getSeriesName(node) !== null;
-}
-
-/**
- * Returns the name of the possible grouped series containing this node.
- * Returns null if the node cannot be part of a grouped series of nodes.
- */
-export function getSeriesName(node: Node) {
-  if (!node) {
-    return null;
-  }
-  if (node.type === NodeType.SERIES) {
-    return node.name;
-  }
-  if (node.type === NodeType.OP) {
-    let op = <OpNode>node;
-    return op.owningSeries;
-  }
-  return null;
-}
-
-/**
- * Returns the SeriesNode that represents the series that the provided node
- * is contained in (or itself if the provided node is itself a SeriesNode).
- * Returns null if the node is not rendered as part of a series.
- */
-function getContainingSeries(node: Node) {
-  let s: SeriesNode = null;
-  if (!node) {
-    return null;
-  } else if (node.type === NodeType.SERIES) {
-    s = <SeriesNode>node;
-  } else if (node.parentNode && node.parentNode.type === NodeType.SERIES) {
-    s = <SeriesNode>node.parentNode;
-  }
-  return s;
-}
-
-/**
- * Returns the label for a button to toggle the group setting of the provided
- * node.
- */
-export function getGroupSettingLabel(node: Node) {
-  return tf.graph.getGroupSeriesNodeButtonString(
-    getContainingSeries(node) !== null ? tf.graph.SeriesGroupingType.GROUP :
-     tf.graph.SeriesGroupingType.UNGROUP);
-}
-
-/**
- * Append svg text for label and assign data.
- * @param nodeGroup
- * @param renderNodeInfo The render node information for the label.
- * @param sceneElement <tf-graph-scene> polymer element.
- */
-function labelBuild(nodeGroup, renderNodeInfo: render.RenderNodeInfo,
-    sceneElement) {
-  let namePath = renderNodeInfo.node.name.split('/');
-  let text = namePath[namePath.length - 1];
-
-  // Truncate long labels for unexpanded Metanodes.
-  let useFontScale = renderNodeInfo.node.type === NodeType.META &&
-    !renderNodeInfo.expanded;
-
-  let label = scene.selectOrCreateChild(nodeGroup, 'text', Class.Node.LABEL);
-
-  // Make sure the label is visually on top among its siblings.
-  let labelNode = <HTMLElement> label.node();
-  labelNode.parentNode.appendChild(labelNode);
-
-  label.attr('dy', '.35em').attr('text-anchor', 'middle');
-  if (useFontScale) {
-    if (text.length > sceneElement.maxMetanodeLabelLength) {
-      text = text.substr(0, sceneElement.maxMetanodeLabelLength - 2) + '...';
-    }
-    let scale = getLabelFontScale(sceneElement);
-    label.attr('font-size', scale(text.length) + 'px');
-  }
-
-  let txtElement = <d3.Selection<any, any, any, any>>label.text(text);
-  enforceLabelWidth(txtElement, renderNodeInfo.node.type, renderNodeInfo);
-  return label;
-}
-/**
- * This function shortens text which would exceed the maximum pixel width of
- * a label.
- *
- * @param txtElementSelection The text element containing the label's text as d3
- * selection.
- * @param nodeType The type of the node the label belongs to. If the node is
- * an annotation, the value is -1. Label widths are defined in
- * layout.PARAMS.nodeSize.{meta|op|...}.maxLabelWidth for nodes and
- * layout.PARAMS.annotations.labelWidth for annotations.
- * @param renderNodeInfo The render information about the node, required to
- * determine whether META nodes are collapsed or expanded.
- */
-export function enforceLabelWidth(
-    txtElementSelection: d3.Selection<any, any, any, any>, nodeType: NodeType | number,
-    renderNodeInfo?: render.RenderNodeInfo): any {
-  // Get text element itself and its on-screen width.
-  let txtNode = <SVGTextElement>txtElementSelection.node();
-  let computedTxtLength = txtNode.getComputedTextLength();
-  let labelContent = txtNode.textContent;
-
-  // Get maximum length from settings.
-  let maxLength = null;
-  switch (nodeType) {
-    case NodeType.META:
-      if (renderNodeInfo && !renderNodeInfo.expanded) {  // Only trim text if
-        // node expanded.
-        maxLength = layout.PARAMS.nodeSize.meta.maxLabelWidth;
-      }
-      break;
-
-    case NodeType.OP:
-      maxLength = layout.PARAMS.nodeSize.op.maxLabelWidth;
-      break;
-
-    case -1:
-      maxLength = layout.PARAMS.annotations.maxLabelWidth;
-      break;
-
-    default:
-      break;
-  }
-
-  // Return if no max length provided for node type, or current label length is
-  // less than or equal to the provided length limit.
-  if (maxLength === null || computedTxtLength <= maxLength) {
-    return;
-  }
-
-  // Find the index of the character which exceeds the width.
-  // getSubStringLength performs far better than getComputedTextLength, and
-  // results in a 3x speed-up on average.
-  let index = 1;
-  while (txtNode.getSubStringLength(0, index) < maxLength) {
-    index++;
-  }
-
-  // Shorten the label starting at the string length known to be one
-  // character above max pixel length.
-  // When shortened the original label's substring is concatenated with
-  // '...', baseText contains the substring not including the '...'.
-  let baseText = <string>txtNode.textContent.substr(0, index);
-  do {
-    baseText = baseText.substr(0, baseText.length - 1);
-
-    // Recompute text length.
-    txtNode.textContent = baseText + '...';
-    computedTxtLength = txtNode.getComputedTextLength();
-  } while (computedTxtLength > maxLength && baseText.length > 0);
-
-  // Add tooltip with full name and return.
-  return txtElementSelection.append('title').text(labelContent);
-}
-
-/**
- * d3 scale used for sizing font of labels, used by labelBuild,
- * initialized once by getLabelFontScale.
- */
-let fontScale = null;
-function getLabelFontScale(sceneElement) {
-  if (!fontScale) {
-    fontScale = d3.scaleLinear()
-      .domain([sceneElement.maxMetanodeLabelLengthLargeFont,
-        sceneElement.maxMetanodeLabelLength])
-      .range([sceneElement.maxMetanodeLabelLengthFontSize,
-        sceneElement.minMetanodeLabelLengthFontSize]).clamp(true);
-  }
-  return fontScale;
-}
-
-/**
- * Set label position of a given node group
- */
-function labelPosition(nodeGroup, cx: number, cy: number,
-    yOffset: number) {
-  scene.selectChild(nodeGroup, 'text', Class.Node.LABEL)
-      .transition()
-      .attr('x', cx)
-      .attr('y', cy + yOffset);
-};
-
-/**
- * Select or append/insert shape for a node and assign renderNode
- * as the shape's data.
- *
- * @param nodeGroup
- * @param d Render node information.
- * @param nodeClass class for the element.
- * @return Selection of the shape.
- */
-export function buildShape(nodeGroup, d, nodeClass: string): d3.Selection<any, any, any, any> {
-  // Create a group to house the underlying visual elements.
-  let shapeGroup = scene.selectOrCreateChild(nodeGroup, 'g', nodeClass);
-  // TODO(jimbo): DOM structure should be templated in HTML somewhere, not JS.
-  switch (d.node.type) {
-    case NodeType.OP:
-      scene.selectOrCreateChild(shapeGroup, 'ellipse', Class.Node.COLOR_TARGET);
-      break;
-    case NodeType.SERIES:
-      // Choose the correct stamp to use to represent this series.
-      let stampType = 'annotation';
-      let groupNodeInfo = <render.RenderGroupNodeInfo>d;
-      if (groupNodeInfo.coreGraph) {
-        stampType =
-            groupNodeInfo.node.hasNonControlEdges ? 'vertical' : 'horizontal';
-      }
-      let classList = [Class.Node.COLOR_TARGET];
-      if (groupNodeInfo.isFadedOut) {
-        classList.push('faded-ellipse');
-      }
-      scene.selectOrCreateChild(shapeGroup, 'use', classList)
-          .attr('xlink:href', '#op-series-' + stampType + '-stamp');
-      scene.selectOrCreateChild(shapeGroup, 'rect', Class.Node.COLOR_TARGET)
-          .attr('rx', d.radius).attr('ry', d.radius);
-      break;
-    case NodeType.BRIDGE:
-      scene.selectOrCreateChild(shapeGroup, 'rect', Class.Node.COLOR_TARGET)
-          .attr('rx', d.radius).attr('ry', d.radius);
-      break;
-    case NodeType.META:
-      scene.selectOrCreateChild(shapeGroup, 'rect', Class.Node.COLOR_TARGET)
-          .attr('rx', d.radius).attr('ry', d.radius);
-      break;
-    default:
-      throw Error('Unrecognized node type: ' + d.node.type);
-  }
-  return shapeGroup;
-};
-
-export function nodeClass(d: render.RenderNodeInfo) {
-  switch (d.node.type) {
-    case NodeType.OP:
-      return Class.OPNODE;
-    case NodeType.META:
-      return Class.METANODE;
-    case NodeType.SERIES:
-      return Class.SERIESNODE;
-    case NodeType.BRIDGE:
-      return Class.BRIDGENODE;
-    case NodeType.ELLIPSIS:
-      return Class.ELLIPSISNODE;
-  };
-  throw Error('Unrecognized node type: ' + d.node.type);
-};
-
-/** Modify node and its subscene and its label's positional attributes */
-function position(nodeGroup, d: render.RenderNodeInfo) {
-  let shapeGroup = scene.selectChild(nodeGroup, 'g', Class.Node.SHAPE);
-  let cx = layout.computeCXPositionOfNodeShape(d);
-  switch (d.node.type) {
-    case NodeType.OP: {
-      // position shape
-      let shape = scene.selectChild(shapeGroup, 'ellipse');
-      scene.positionEllipse(shape, cx, d.y, d.coreBox.width, d.coreBox.height);
-      labelPosition(nodeGroup, cx, d.y, d.labelOffset);
-      break;
-    }
-    case NodeType.META: {
-      // position shape
-      let shape = scene.selectChild(shapeGroup, 'rect');
-      if (d.expanded) {
-        scene.positionRect(shape, d.x, d.y, d.width, d.height);
-        subscenePosition(nodeGroup, d);
-        // put label on top
-        labelPosition(nodeGroup, cx, d.y,
-          - d.height / 2 + d.labelHeight / 2);
-      } else {
-        scene.positionRect(shape, cx, d.y, d.coreBox.width, d.coreBox.height);
-        labelPosition(nodeGroup, cx, d.y, 0);
-      }
-      break;
-    }
-    case NodeType.SERIES: {
-      let shape = scene.selectChild(shapeGroup, 'use');
-      if (d.expanded) {
-        scene.positionRect(shape, d.x, d.y, d.width, d.height);
-        subscenePosition(nodeGroup, d);
-        // put label on top
-        labelPosition(nodeGroup, cx, d.y,
-          - d.height / 2 + d.labelHeight / 2);
-      } else {
-        scene.positionRect(shape, cx, d.y, d.coreBox.width, d.coreBox.height);
-        labelPosition(nodeGroup, cx, d.y, d.labelOffset);
-      }
-      break;
-    }
-    case NodeType.BRIDGE: {
-      // position shape
-      // NOTE: In reality, these will not be visible, but it helps to put them
-      // in the correct position for debugging purposes.
-      let shape = scene.selectChild(shapeGroup, 'rect');
-      scene.positionRect(shape, d.x, d.y, d.width, d.height);
-      break;
-    }
-    default: { throw Error('Unrecognized node type: ' + d.node.type); }
-  }
-};
-
-/** Enum specifying the options to color nodes by */
-export enum ColorBy {STRUCTURE, DEVICE, XLA_CLUSTER, COMPUTE_TIME, MEMORY}
-;
-
-/**
- * Returns the fill color for the node given its state and the 'color by'
- * option.
- */
-export function getFillForNode(templateIndex, colorBy,
-    renderInfo: render.RenderNodeInfo, isExpanded: boolean): string {
-  let colorParams = render.MetanodeColors;
-  switch (colorBy) {
-    case ColorBy.STRUCTURE:
-      if (renderInfo.node.type === NodeType.META) {
-        let tid = (<Metanode>renderInfo.node).templateId;
-        return tid === null ?
-          colorParams.UNKNOWN :
-          colorParams.STRUCTURE_PALETTE(templateIndex(tid), isExpanded);
-      } else if (renderInfo.node.type === NodeType.SERIES) {
-        // If expanded, we're showing the background rect, which we want to
-        // appear gray. Otherwise we're showing a stack of ellipses which we
-        // want to show white.
-        return isExpanded ? colorParams.EXPANDED_COLOR : 'white';
-      } else if (renderInfo.node.type === NodeType.BRIDGE) {
-        return renderInfo.structural ?
-            '#f0e' :
-            (<BridgeNode>renderInfo.node).inbound ? '#0ef' : '#fe0';
-      } else {
-        // Op nodes are white.
-        return 'white';
-      }
-    case ColorBy.DEVICE:
-      if (renderInfo.deviceColors == null) {
-        // Return the hue for unknown device.
-        return colorParams.UNKNOWN;
-      }
-      let id = renderInfo.node.name;
-      let escapedId = tf.graph.util.escapeQuerySelector(id);
-      let gradientDefs = d3.select('svg#svg defs #linearGradients');
-      let linearGradient = gradientDefs.select('linearGradient#' + escapedId);
-      // If the linear gradient is not there yet, create it.
-      if (linearGradient.size() === 0) {
-        linearGradient = gradientDefs.append('linearGradient').attr('id', id);
-        // Re-create the stops of the linear gradient.
-        linearGradient.selectAll('*').remove();
-        let cumulativeProportion = 0;
-        // For each device, create a stop using the proportion of that device.
-        _.each(renderInfo.deviceColors, d => {
-          let color = d.color;
-          linearGradient.append('stop')
-              .attr('offset', cumulativeProportion)
-              .attr('stop-color', color);
-          linearGradient.append('stop')
-              .attr('offset', cumulativeProportion + d.proportion)
-              .attr('stop-color', color);
-          cumulativeProportion += d.proportion;
-        });
-      }
-      return isExpanded ? colorParams.EXPANDED_COLOR : `url(#${escapedId})`;
-    case ColorBy.XLA_CLUSTER:
-      return isExpanded ? colorParams.EXPANDED_COLOR :
-                          renderInfo.xlaClusterColor || colorParams.UNKNOWN;
-    case ColorBy.COMPUTE_TIME:
-      return isExpanded ?
-        colorParams.EXPANDED_COLOR : renderInfo.computeTimeColor ||
-        colorParams.UNKNOWN;
-    case ColorBy.MEMORY:
-      return isExpanded ?
-        colorParams.EXPANDED_COLOR : renderInfo.memoryColor ||
-        colorParams.UNKNOWN;
-    default:
-      throw new Error('Unknown case to color nodes by');
-  }
-}
-
-/**
- * Modify node style by toggling class and assign attributes (only for things
- * that can't be done in css).
- */
-export function stylize(nodeGroup, renderInfo: render.RenderNodeInfo,
-    sceneElement, nodeClass?) {
-  nodeClass = nodeClass || Class.Node.SHAPE;
-  let isHighlighted = sceneElement.isNodeHighlighted(renderInfo.node.name);
-  let isSelected = sceneElement.isNodeSelected(renderInfo.node.name);
-  let isExtract = renderInfo.isInExtract || renderInfo.isOutExtract;
-  let isExpanded = renderInfo.expanded;
-  let isFadedOut = renderInfo.isFadedOut;
-  nodeGroup.classed('highlighted', isHighlighted);
-  nodeGroup.classed('selected', isSelected);
-  nodeGroup.classed('extract', isExtract);
-  nodeGroup.classed('expanded', isExpanded);
-  nodeGroup.classed('faded', isFadedOut);
-
-  // Main node always exists here and it will be reached before subscene,
-  // so d3 selection is fine here.
-  let node = nodeGroup.select('.' + nodeClass + ' .' + Class.Node.COLOR_TARGET);
-  let fillColor = getFillForNode(sceneElement.templateIndex,
-    ColorBy[sceneElement.colorBy.toUpperCase()],
-    renderInfo, isExpanded);
-  node.style('fill', fillColor);
-
-  // Choose outline to be darker version of node color if the node is a single
-  // color and is not selected.
-  node.style('stroke', isSelected ? null : getStrokeForFill(fillColor));
-};
-
-/**
- * Given a node's fill color/gradient, determine the stroke for the node.
- */
-export function getStrokeForFill(fill: string) {
-  // If node is colored by a gradient, then use a dark gray outline.
-  return fill.substring(0, 3) === 'url' ?
-      render.MetanodeColors.GRADIENT_OUTLINE :
-      d3.rgb(fill).darker().toString();
-}
-
-/**
- * Finds selected node and highlights all nodes which are providing direct
- * or indirect input to the node and all edges connecting these nodes
- * together and to the selected node.
- *
- * @param renderGraphInfo Information on the rendered state of the graph.
- */
-export function traceInputs(renderGraphInfo: tf.graph.render.RenderGraphInfo) {
-  // Reset all styling.
-  d3.selectAll('.input-highlight').classed('input-highlight', false);
-  d3.selectAll('.non-input').classed('non-input', false);
-  d3.selectAll('.input-parent').classed('input-parent', false);
-  d3.selectAll('.input-child').classed('input-child', false);
-  d3.selectAll('.input-edge-highlight').classed('input-edge-highlight', false);
-  d3.selectAll('.non-input-edge-highlight')
-      .classed('non-input-edge-highlight', false);
-  d3.selectAll('.input-highlight-selected')
-      .classed('input-highlight-selected', false);
-
-  // Extract currently selected node. Return if input tracing disabled or no
-  // node is selected.
-  const selectedNodeSelectorString = 'g.node.selected,g.op.selected';
-  const nodeSelection = d3.select(selectedNodeSelectorString);
-  let currentNode = undefined;
-  if (renderGraphInfo && renderGraphInfo.traceInputs &&
-      nodeSelection.nodes().length) {
-    currentNode = nodeSelection.nodes()[0];
-  } else {
-    return;
-  }
-  let nodeName = currentNode.getAttribute('data-name');
-  let opNodes = _getAllContainedOpNodes(nodeName, renderGraphInfo);
-  let allTracedNodes = {};
-  _.each(opNodes, function(nodeInstance) {
-    allTracedNodes =
-        traceAllInputsOfOpNode(renderGraphInfo, nodeInstance, allTracedNodes);
-  });
-
-  d3.selectAll(selectedNodeSelectorString)
-      // Remove the input-highlight from the selected node.
-      .classed('input-highlight', false)
-      // Add input-highlight-selected class to selected node, which allows
-      // treating the selected not as a special case of an input node.
-      .classed('input-highlight-selected', true);
-
-  // Highlight all parent nodes of each OpNode as input parent to allow
-  // specific highlighting.
-  let highlightedNodes = Object.keys(allTracedNodes);
-  let visibleNodes =
-      _findVisibleParentsFromOpNodes(renderGraphInfo, highlightedNodes);
-  _markParentsOfNodes(visibleNodes);
-
-  // Attach class to all non-input nodes and edges for styling.
-  d3.selectAll(
-        'g.node:not(.selected):not(.input-highlight)' +
-        ':not(.input-parent):not(.input-children)')
-      .classed('non-input', true)
-      .each(function(d: RenderNodeInfo) {
-        // Mark all nodes with the specified name as non-inputs. This
-        // results in Annotation nodes which are attached to inputs to be
-        // tagged as well.
-        let nodeName = d.node.name;
-        d3.selectAll(`[data-name="${nodeName}"]`).classed('non-input', true);
-      });
-  d3.selectAll('g.edge:not(.input-edge-highlight)')
-      .classed('non-input-edge-highlight', true);
-}
-
-/**
- * Recursively find all op nodes contained by the node identified by the
- * provided name.
- * @param nodeName The meta or op node of which the OpNode instances are
- * required.
- * @param renderGraphInfo The rendered graph information object.
- * @returns {Array} An array of OpNodeImpl instances.
- */
-export function _getAllContainedOpNodes(
-    nodeName: string, renderGraphInfo: tf.graph.render.RenderGraphInfo) {
-  let opNodes = [];
-
-  // Get current node.
-  let node = renderGraphInfo.getNodeByName(nodeName) as tf.graph.GroupNode |
-      tf.graph.OpNode;
-
-  // If node is already OpNode then return the node plus its input embeddings.
-  if (node instanceof tf.graph.OpNodeImpl) {
-    return [node].concat(node.inEmbeddings);
-  }
-
-  // Otherwise, make recursive call for each node contained by the GroupNode.
-  let childNodeNames = (node as tf.graph.GroupNode).metagraph.nodes();
-  _.each(childNodeNames, function(childNodeName) {
-    opNodes =
-        opNodes.concat(_getAllContainedOpNodes(childNodeName, renderGraphInfo));
-  });
-
-  return opNodes;
-}
-
-/**
- * When resolving inputs of a node the visible parent node of each input
- * node (i.e. the first parent which is rendered to the screen) needs to be
- * found, and since such a node may contain several input OpNodes a map
- * of the visible parent to all the input OpNodes it contains is provided by
- * opNodes.
- */
-interface VisibleParent {
-  visibleParent: Node;
-  opNodes: OpNode[];
-}
-
-export function traceAllInputsOfOpNode(
-    renderGraphInfo: tf.graph.render.RenderGraphInfo, startNode: OpNode,
-    allTracedNodes: Object) {
-  // To prevent infinite loops due to cyclical relationships and improving
-  // performance by tracing OpNode which is input to 2+ nodes only once.
-  if (allTracedNodes[startNode.name]) {
-    return allTracedNodes;
-  } else {
-    allTracedNodes[startNode.name] = true;
-  }
-  // Extract the inputs.
-  let inputs = startNode.inputs;
-  // Get visible parent.
-  let currentVisibleParent = getVisibleParent(renderGraphInfo, startNode);
-  // Mark as input node.
-  d3.select(`.node[data-name="${currentVisibleParent.name}"]`)
-      .classed('input-highlight', true);
-
-  // Find the visible parent of each input.
-  let visibleInputs = {};
-  _.each(inputs, function(nodeInstance) {
-    let resolvedNode = renderGraphInfo.getNodeByName(nodeInstance.name);
-    if (resolvedNode === undefined) {
-      // Node could not be found in rendered Hierarchy, which happens when
-      // tracing inputs of a SummaryNode.
-      return;
-    }
-    // Ensure node is resolved to OpNode if name collision with Metanode exists.
-    if (resolvedNode instanceof MetanodeImpl) {
-      let resolvedNodeName = tf.graph.getStrictName(resolvedNode.name);
-      resolvedNode = renderGraphInfo.getNodeByName(resolvedNodeName) as OpNode;
-    }
-
-    let visibleParent = getVisibleParent(renderGraphInfo, resolvedNode);
-
-    // Append OpNode to visible parent entry.
-    let visibleInputsEntry = visibleInputs[visibleParent.name];
-    if (visibleInputsEntry) {
-      visibleInputsEntry.opNodes.push(resolvedNode);
-    } else {  // Create new entry.
-      visibleInputs[visibleParent.name] = {
-        visibleParent: visibleParent,
-        opNodes: [resolvedNode]
-      } as VisibleParent;
-    }
-  });
-
-  // Find all parents of the start node.
-  let startNodeParents = {};
-  let indexedStartNodeParents = [currentVisibleParent];
-  startNodeParents[currentVisibleParent.name] = {
-    traced: false,
-    index: 0,
-    connectionEndpoints: []
-  };
-
-  let currentNode = currentVisibleParent as Node;
-  for (let index = 1; currentNode.name !== tf.graph.ROOT_NAME; index++) {
-    currentNode = currentNode.parentNode;
-    startNodeParents[currentNode.name] = {
-      traced: false,
-      index: index,
-      connectionEndpoints: []
-    };
-    indexedStartNodeParents[index] = currentNode;
-  }
-
-  // Find first mutual parent of each input node and highlight connection.
-  _.forOwn(visibleInputs, function(visibleParentInfo: VisibleParent, key) {
-    let nodeInstance = visibleParentInfo.visibleParent;
-    // Make recursive call for each input-OpNode contained by the visible
-    // parent.
-    _.each(visibleParentInfo.opNodes, function(opNode: OpNode) {
-      allTracedNodes =
-          traceAllInputsOfOpNode(renderGraphInfo, opNode, allTracedNodes);
-    });
-
-    if (nodeInstance.name !== currentVisibleParent.name) {
-      _createVisibleTrace(
-          nodeInstance, startNodeParents, indexedStartNodeParents);
-    }
-  });
-
-  return allTracedNodes;
-}
-
-/**
- * Colors the edges to connect the passed node to the start node. This is
- * done by:
- *
- * a) Finding the first (visible) common parent in the rendered
- * hierarchy.
- * NB: There are 2 types of connections:
- * 1) Direct connections between node A
- * and B, marked below as II,
- * 2) Connections from any node A to its parent, A'. Marked below as I and III.
- * For type 2 connection you need to know the inner-nested node, the
- * direct parent, and the ultimate destination of the connection.
- *
- *  A_parent      B_parent
- * +--------+    +---------+
- * |        |    |         |
- * |  +--+ I| II |III+--+  |
- * |  |A +---------->+B |  |
- * |  +--+  |    |   +--+  |
- * |        |    |         |
- * +--------+    +---------+
- *
- *
- * b) Highlighting the direct connection between the parents of A and B,
- * called A_parent and B_parent, s.t. A_parent and B_parent are children of the
- * mutual parent of A and B found in a), marked above as II.
- *
- * c) Highlighting the connection from A to A_parent and B to B_parent
- * (through all layers of parents between A and A_parent and B and B_parent,
- * respectively). Marked above as I and III.
- *
- * @param nodeInstance The instance of the node to use as destination node, B.
- * @param startNodeParents Map of startNodeParent names to information objects
- * about the parent.
- * @param indexedStartNodeParents An array of all parents of the start node.
- * This is required to find the child of the mutual parent which is a parent
- * of the start node.
- * @private
- */
-function _createVisibleTrace(
-    nodeInstance: Node, startNodeParents, indexedStartNodeParents: Node[]) {
-  let currentNode = nodeInstance;
-  let previousNode = nodeInstance;
-
-  // Ascend through parents until a mutual parent is found with the start
-  // node.
-  let destinationParentPairs = [];
-  while (!startNodeParents[currentNode.name]) {
-    if (previousNode.name !== currentNode.name) {
-      destinationParentPairs.push([previousNode, currentNode]);
-    }
-    previousNode = currentNode;
-    currentNode = currentNode.parentNode;
-  }
-
-  // Connection between nodes is drawn between the parents of each
-  // respective node, both of which share the mutual parent.
-  let startNodeIndex = startNodeParents[currentNode.name].index;
-  let startNodeName =
-      indexedStartNodeParents[Math.max(startNodeIndex - 1, 0)].name;
-
-  let startNodeTopParentName = startNodeName;
-  let targetNodeTopParentName = previousNode.name;
-
-  let endNodeName = previousNode.name;
-  d3.selectAll(`[data-edge="${endNodeName}--${startNodeName}"]`)
-      .classed('input-edge-highlight', true);
-
-  // Trace up the parents of the input.
-  _.each(destinationParentPairs, function(value) {
-    let inner = value[0];
-    let outer = value[1];
-    let edgeSelector = `[data-edge="${inner.name}--${startNodeTopParentName}` +
-        `~~${outer.name}~~OUT"]`;
-    d3.selectAll(edgeSelector).classed('input-edge-highlight', true);
-  });
-
-  // Trace up the parents of the start node.
-  for (let index = 1; index < startNodeIndex; index++) {
-    let inner = indexedStartNodeParents[index - 1];
-    let outer = indexedStartNodeParents[index];
-    let edgeSelector = `[data-edge="${targetNodeTopParentName}~~${outer.name}` +
-        `~~IN--${inner.name}"]`;
-    d3.selectAll(edgeSelector).classed('input-edge-highlight', true);
-  }
-}
-
-/**
- * Creates map { [name: string] -> Node } of all visible / rendered parents
- * of the nodes identified by the node names passed in.
- *
- * @param renderGraphInfo The information on the rendered graph.
- * @param nodeNames String array of node names.
- * @returns {[nodeName: string]: Node}
- * @private
- */
-function _findVisibleParentsFromOpNodes(renderGraphInfo, nodeNames: string[]) {
-  let visibleParents: {[nodeName: string]: Node} = {};
-  _.each(nodeNames, function(nodeName) {
-    let currentNode = renderGraphInfo.getNodeByName(nodeName);
-    let visibleParent = getVisibleParent(renderGraphInfo, currentNode);
-    visibleParents[visibleParent.name] = visibleParent;
-  });
-
-  return visibleParents;
-}
-
-/**
- * Traverse through the parents of all nodes in the list and mark each
- * encountered node as input-parent.
- * @param visibleNodes Map of input nodes, have to be visible/rendered when
- * called.
- * @private
- */
-function _markParentsOfNodes(visibleNodes: {[nodeName: string]: Node}) {
-  _.forOwn(visibleNodes, function(nodeInstance: Node) {
-    // Mark all parents of the node as input-parents.
-    let currentNode = nodeInstance;
-
-    while (currentNode.name !== tf.graph.ROOT_NAME) {
-      const renderedElementSelection =
-          d3.select(`.node[data-name="${currentNode.name}"]`);
-      // Only mark the element as a parent node to an input if it is not
-      // marked as input node itself.
-      if (renderedElementSelection.nodes().length &&
-          !renderedElementSelection.classed('input-highlight') &&
-          !renderedElementSelection.classed('selected') &&
-          // OpNode only parent if start node is embedded node, in which case
-          // the OpNode should be faded as well.
-          !renderedElementSelection.classed('op')) {
-        renderedElementSelection.classed('input-parent', true);
-      }
-      currentNode = currentNode.parentNode;
-    }
-  });
-}
-
-/**
- * Find the parent of the passed in op node which is expanded. This is done
- * by going through all parents until the parent's parent is expanded, thus
- * finding the first unexpanded parent which is rendered on the screen.
- * @param renderGraphInfo The graph info object used to gain access to the
- * render info of the parents.
- * @param currentNode The node whose parent is to be found.
- * @returns Node
- */
-export function getVisibleParent(
-    renderGraphInfo: tf.graph.render.RenderGraphInfo,
-    currentNode: tf.graph.Node) {
-  let found = false;
-  let currentParent = currentNode;
-
-  while (!found) {
-    // Get parent element, to extract name.
-    currentNode = currentParent;
-    currentParent = currentNode.parentNode;
-
-    if (currentParent === undefined) {
-      found = true;
-    } else {
-      let renderNode = renderGraphInfo.getRenderNodeByName(currentParent.name);
-      // Found if node is rendered on the screen (renderNode truthy), and
-      // the parent is either expanded (i.e. it is a metanode or seriesnode)
-      // or the parent is an OpNode in which case currentNode is an embedded
-      // node which has another OpNode as parent.
-      if (renderNode &&
-          (renderNode.expanded || currentParent instanceof graph.OpNodeImpl)) {
-        found = true;
-      }
-    }
-  }  // Close while loop.
-  return currentNode;
-}
-}  // Close module.
diff --git a/tensorflow/tensorboard/components/tf_graph_common/parser.ts b/tensorflow/tensorboard/components/tf_graph_common/parser.ts
deleted file mode 100644
index 04d879ef9108fafbf1e99bd43ac868bea11860f2..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/parser.ts
+++ /dev/null
@@ -1,284 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-module tf.graph.parser {
-
-/**
- * Parses a native js value, which can be either a string, boolean or number.
- *
- * @param value The value to be parsed.
- */
-function parseValue(value: string): string|number|boolean {
-  if (value === 'true') {
-    return true;
-  }
-  if (value === 'false') {
-    return false;
-  }
-  let firstChar = value[0];
-  if (firstChar === '"') {
-    return value.substring(1, value.length - 1);
-  }
-  let num = parseFloat(value);
-  return isNaN(num) ? value : num;
-}
-
-/**
- * Fetches a text file and returns a promise of the result.
- */
-export function fetchPbTxt(filepath: string): Promise<ArrayBuffer> {
-  return new Promise<ArrayBuffer>(function(resolve, reject) {
-    const request = new XMLHttpRequest();
-    request.open('GET', filepath);
-    request.responseType = 'arraybuffer';
-
-    request.onerror = () => reject(request.status);
-    request.onload = () => resolve(request.response);
-
-    request.send(null);
-  });
-}
-
-/**
- * Fetches the metadata file, parses it and returns a promise of the result.
- */
-export function fetchAndParseMetadata(path: string, tracker: ProgressTracker) {
-  return tf.graph.util
-      .runTask(
-          'Reading metadata pbtxt', 40,
-          () => {
-            if (path == null) {
-              return Promise.resolve(null);
-            }
-            return fetchPbTxt(path);
-          },
-          tracker)
-      .then((arrayBuffer: ArrayBuffer) => {
-        return tf.graph.util.runAsyncPromiseTask(
-            'Parsing metadata.pbtxt', 60, () => {
-              return arrayBuffer != null ? parseStatsPbTxt(arrayBuffer) :
-                                           Promise.resolve(null);
-            }, tracker);
-      });
-}
-
-/**
- * Fetches the graph file, parses it and returns a promise of the result. The
- * result will be undefined if the graph is empty.
- */
-export function fetchAndParseGraphData(path: string, pbTxtFile: Blob,
-    tracker: ProgressTracker) {
-  return tf.graph.util
-      .runTask(
-          'Reading graph pbtxt', 40,
-          () => {
-            if (pbTxtFile) {
-              return new Promise<ArrayBuffer>(function(resolve, reject) {
-                let fileReader = new FileReader();
-                fileReader.onload = () => resolve(fileReader.result);
-                fileReader.onerror = () => reject(fileReader.error);
-                fileReader.readAsArrayBuffer(pbTxtFile);
-              });
-            } else {
-              return fetchPbTxt(path);
-            }
-          },
-          tracker)
-      .then((arrayBuffer: ArrayBuffer) => {
-        return tf.graph.util.runTask('Parsing graph.pbtxt', 60, () => {
-          return parseGraphPbTxt(arrayBuffer);
-        }, tracker);
-      });
-}
-
-/**
- * Parse a file object in a streaming fashion line by line (or custom delim).
- * Can handle very large files.
- * @param input The file object as an array buffer.
- * @param callback The callback called on each line
- * @param chunkSize The size of each read chunk. (optional)
- * @param delim The delimiter used to split a line. (optional)
- * @returns A promise for when it is finished.
- */
-export function streamParse(
-    arrayBuffer: ArrayBuffer, callback: (string) => void,
-    chunkSize: number = 1000000, delim: string = '\n'): Promise<boolean> {
-  return new Promise<boolean>(function(resolve, reject) {
-    let offset = 0;
-    let bufferSize = arrayBuffer.byteLength - 1;
-    let data = '';
-
-    function readHandler(str) {
-      offset += chunkSize;
-      let parts = str.split(delim);
-      let first = data + parts[0];
-      if (parts.length === 1) {
-        data = first;
-        readChunk(offset, chunkSize);
-        return;
-      }
-      data = parts[parts.length - 1];
-      callback(first);
-      for (let i = 1; i < parts.length - 1; i++) {
-        callback(parts[i]);
-      }
-      if (offset >= bufferSize) {
-        if (data) {
-          callback(data);
-        }
-        resolve(true);
-        return;
-      }
-      readChunk(offset, chunkSize);
-    }
-
-    function readChunk(offset: number, size: number) {
-      const arrayBufferChunk = arrayBuffer.slice(offset, offset + size);
-
-      const blob = new Blob([arrayBufferChunk]);
-      const file = new FileReader();
-      file.onload = (e: any) => readHandler(e.target.result);
-      file.readAsText(blob);
-    }
-
-    readChunk(offset, chunkSize);
-  });
-}
-
-/**
- * Since proto-txt doesn't explicitly say whether an attribute is repeated
- * (an array) or not, we keep a hard-coded list of attributes that are known
- * to be repeated. This list is used in parsing time to convert repeated
- * attributes into arrays even when the attribute only shows up once in the
- * object.
- */
-const GRAPH_REPEATED_FIELDS: {[attrPath: string]: boolean} = {
-  'node': true,
-  'node.input': true,
-  'node.attr': true,
-  'node.attr.value.list.type': true,
-  'node.attr.value.shape.dim': true,
-  'node.attr.value.tensor.string_val': true,
-  'node.attr.value.tensor.tensor_shape.dim': true,
-  'node.attr.value.list.shape': true,
-  'node.attr.value.list.shape.dim': true,
-  'node.attr.value.list.s': true
-};
-
-const METADATA_REPEATED_FIELDS: {[attrPath: string]: boolean} = {
-  'step_stats.dev_stats': true,
-  'step_stats.dev_stats.node_stats': true,
-  'step_stats.dev_stats.node_stats.output': true,
-  'step_stats.dev_stats.node_stats.memory': true,
-  'step_stats.dev_stats.node_stats.output.tensor_description.shape.dim': true
-};
-
-/**
- * Parses an ArrayBuffer of a proto txt file into a raw Graph object.
- */
-export function parseGraphPbTxt(input: ArrayBuffer):
-    Promise<tf.graph.proto.NodeDef[]> {
-  return parsePbtxtFile(input, GRAPH_REPEATED_FIELDS).then(obj => obj['node']);
-}
-
-/**
- * Parses an ArrayBuffer of a proto txt file into a StepStats object.
- */
-export function parseStatsPbTxt(input: ArrayBuffer):
-    Promise<tf.graph.proto.StepStats> {
-  return parsePbtxtFile(input, METADATA_REPEATED_FIELDS)
-      .then(obj => obj['step_stats']);
-}
-
-/**
- * Parses a ArrayBuffer of a proto txt file into javascript object.
- *
- * @param input The ArrayBuffer or file object implementing slice.
- * @param repeatedFields Map (Set) of all the repeated fields, since you can't
- *   tell directly from the pbtxt if a field is repeated or not.
- * @returns The parsed object.
- */
-function parsePbtxtFile(
-    input: ArrayBuffer,
-    repeatedFields: {[attrPath: string]: boolean}): Promise<Object> {
-  let output: { [name: string]: any; } = {};
-  let stack = [];
-  let path: string[] = [];
-  let current: { [name: string]: any; } = output;
-
-  function splitNameAndValueInAttribute(line: string) {
-    let colonIndex = line.indexOf(':');
-    let name = line.substring(0, colonIndex).trim();
-    let value = parseValue(line.substring(colonIndex + 2).trim());
-    return {
-      name: name,
-      value: value
-    };
-  }
-
-  /**
-   * Adds a value, given the attribute name and the host object. If the
-   * attribute already exists, but is not an array, it will convert it to an
-   * array of values.
-   *
-   * @param obj The host object that holds the attribute.
-   * @param name The attribute name (key).
-   * @param value The attribute value.
-   * @param path A path that identifies the attribute. Used to check if
-   *     an attribute is an array or not.
-   */
-  function addAttribute(obj: Object, name: string,
-      value: Object|string|number|boolean, path: string[]): void {
-    // We treat 'node' specially since it is done so often.
-    let existingValue = obj[name];
-    if (existingValue == null) {
-      obj[name] = path.join('.') in repeatedFields ? [value] : value;
-    } else if (Array.isArray(existingValue)) {
-      existingValue.push(value);
-    } else {
-      obj[name] = [existingValue, value];
-    }
-  }
-
-  // Run through the file a line at a time.
-  return streamParse(input, function(line: string) {
-    if (!line) {
-      return;
-    }
-    line = line.trim();
-
-    switch (line[line.length - 1]) {
-      case '{':  // create new object
-        let name = line.substring(0, line.length - 2).trim();
-        let newValue: { [name: string]: any; } = {};
-        stack.push(current);
-        path.push(name);
-        addAttribute(current, name, newValue, path);
-        current = newValue;
-        break;
-      case '}':
-        current = stack.pop();
-        path.pop();
-        break;
-      default:
-        let x = splitNameAndValueInAttribute(line);
-        addAttribute(current, x.name, x.value, path.concat(x.name));
-        break;
-    }
-  }).then(function() {
-    return output;
-  });
-}
-
-} // Close module tf.graph.parser.
diff --git a/tensorflow/tensorboard/components/tf_graph_common/proto.ts b/tensorflow/tensorboard/components/tf_graph_common/proto.ts
deleted file mode 100644
index eda73e45c3b27f77d5fc5790f57fd97ae3518382..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/proto.ts
+++ /dev/null
@@ -1,143 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/**
- * @fileoverview Interfaces that parallel proto definitions in
- * third_party/tensorflow/core/framework/...
- *     graph.proto
- *     step_stats.proto
- * These should stay in sync.
- */
-module tf.graph.proto {
-  /**
-   * TensorFlow node definition as defined in the graph.proto file.
-   */
-  export interface NodeDef {
-    /** Name of the node */
-    name: string;
-    /** List of nodes that are inputs for this node. */
-    input: string[];
-    /** The name of the device where the computation will run. */
-    device: string;
-    /** The name of the operation associated with this node. */
-    op: string;
-    /** List of attributes that describe/modify the operation. */
-    attr: {key: string, value: Object}[];
-  }
-
-  /**
-   * Generic graph as defined in the graph_explorer.proto file.
-   */
-  export interface GenericGraph {
-    /** List of nodes in the graph */
-    node: GenericNode[];
-    /** List of nodes in the graph */
-    edge: GenericEdge[];
-    /** List of attributes that describe/modify the operation. */
-    attr: Array<{[key: string]: any}>;
-  }
-
-  /**
-   * GenericEdge corresponds to the Edge message in graph_explorer.proto.
-   */
-  export interface GenericEdge {
-    /** Name of the source node. */
-    source: string;
-    /** Name of the target node. */
-    target: string;
-    /** Attributes of the edge. */
-    edge_attr: Array<{[key: string]: any}>;
-  }
-
-  /**
-   * GenericNode corresponds to the Node message in graph_explorer.proto.
-   */
-  export interface GenericNode {
-    /** Name of the node */
-    name: string;
-    /** Attributes of a leaf node or leaf nodes within a metanode. */
-    node_attr: Array<{[key: string]: any}>;
-    /** Attributes of a metanode. */
-    metanode_attr: Array<{[key: string]: any}>;
-  }
-
-  /**
-   * TensorFlow stats file definition as defined in the stats proto file.
-   */
-  export interface StepStats {
-    dev_stats: {device: string, node_stats: NodeExecStats[]}[];
-  }
-
-  /**
-   * TensorFlow stats for a node as defined in the step_stats proto file.
-   */
-  export interface NodeExecStats {
-    node_name: string;
-    // The next 4 properties are currently stored as string in json
-    // and must be parsed.
-    all_start_micros: number;
-    op_start_rel_micros: number;
-    op_end_rel_micros: number;
-    all_end_rel_micros: number;
-    memory: {
-      allocator_name: string;
-      total_bytes: number;  // Stored as string in json and should be parsed.
-      peak_bytes: number;   // Stored as string in json and should be parsed.
-    }[];
-    /** Output sizes recorded for a single execution of a graph node */
-    output: NodeOutput[];
-    timeline_label: string;
-    scheduled_micros: string;
-    thread_id: string;
-  }
-
-  /**
-   * Description for the output tensor(s) of an operation in the graph as
-   * defined in the step_stats.proto file.
-   */
-  export interface NodeOutput {
-    slot: number;  // Stored as string in json and should be parsed.
-    tensor_description: {
-      /** Data type of tensor elements */
-      dtype: string;
-      /** Shape of the tensor */
-      shape: {
-        /**
-         * Dimensions of the tensor, such as [{name: 'input', size: 30},
-         * {name: 'output', size: 40}] for a 30 x 40 2D tensor.  The names
-         * are optional. The order of entries in 'dim' matters: It indicates
-         * the layout of the values in the tensor in-memory representation.
-         */
-        dim: {
-          /** Size of the tensor in that dimension */
-          size: number,  // Stored as string in json and should be parsed.
-          /** Optional name of the tensor dimension */
-          name?: string
-        }[];
-      };
-      /** Information about the size and allocator used for the data */
-      allocation_description: {
-        // The next 2 properties are stored as string in json and
-        // should be parsed.
-        /** Total number of bytes requested */
-        requested_bytes: number;
-        /** Total number of bytes allocated, if known */
-        allocated_bytes?: number;
-        /** Name of the allocator used */
-        allocator_name: string;
-      };
-    };
-  }
-}
diff --git a/tensorflow/tensorboard/components/tf_graph_common/render.ts b/tensorflow/tensorboard/components/tf_graph_common/render.ts
deleted file mode 100644
index 4f28af481d47336c26f0cfaa704e0b7230b7e11e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/render.ts
+++ /dev/null
@@ -1,1673 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-/**
- * Package for the Render Hierarchy for TensorFlow graph.
- */
-module tf.graph.render {
-
-export type Point = {x: number, y: number};
-
-/**
- * Color parameters for op nodes.
- */
-export let OpNodeColors = {DEFAULT_FILL: 'white', DEFAULT_STROKE: '#b2b2b2'};
-
-/**
- * Color parameters for node encoding.
- * @type {Object}
- */
-export let MetanodeColors = {
-  /**
-   * Default fill and stroke to use when no other information is available.
-   */
-  DEFAULT_FILL: '#d9d9d9',
-  DEFAULT_STROKE: '#a6a6a6',
-  SATURATION: 0.6,
-  LIGHTNESS: 0.85,
-  /**
-   * Neutral color to use when the node is expanded (used when coloring by
-   * compute time, memory and device).
-   */
-  EXPANDED_COLOR: '#f0f0f0',
-  /**
-   * Standard hue values for node color palette.
-   */
-  HUES: [220, 100, 180, 40, 20, 340, 260, 300, 140, 60],
-  STRUCTURE_PALETTE(id: number, lightened?: boolean) {
-    // The code below is a flexible way to computationally create a set
-    // of colors that go well together.
-    let hues = MetanodeColors.HUES;
-    let n = hues.length;
-    let hue = hues[id % n];
-    let m = Math.sin(hue * Math.PI / 360);
-    let sat = lightened ? 30 : 90 - 60 * m;
-    let light = lightened ? 95 : 80;
-    return d3.hsl(hue, .01 * sat, .01 * light).toString();
-  },
-  DEVICE_PALETTE(index: number): string {
-    return MetanodeColors.STRUCTURE_PALETTE(index);
-  },
-  XLA_CLUSTER_PALETTE(index: number): string {
-    return MetanodeColors.STRUCTURE_PALETTE(index);
-  },
-  UNKNOWN: '#eee',
-  GRADIENT_OUTLINE: '#888'
-};
-
-/**
- * Color parameters for op nodes.
- */
-export let SeriesNodeColors = {
-  DEFAULT_FILL: 'white',
-  DEFAULT_STROKE: '#b2b2b2'
-};
-
-/**
- * Parameters that affect how the graph is rendered on the screen.
- */
-const PARAMS = {
-  /**
-   * Whether to extract high degree nodes from the core part of the graph.
-   */
-  enableExtraction: true,
-  /**
-   * The minimum number of nodes for a graph to have in order for high in and
-   * out degree nodes to be extracted in auxiliary. The aim here is to prevent
-   * nodes from being extracted from small graphs.
-   */
-  minNodeCountForExtraction: 15,
-  /**
-   * The minimum in or out degree a node must have in order to be possibly
-   * extracted.
-   */
-  minDegreeForExtraction: 5,
-  /**
-   * Maximum number of control edges a node can have before they aren't
-   * displayed.
-   */
-  maxControlDegree: 4,
-  /**
-   * Maximum in (for outbound bridge paths) or out (for inbound bridge paths)
-   * degree of a node allowed for a bridge path to be rendered to it from a
-   * subhierarchy of nodes. Having a max prevents having too many nodes emanate
-   * from a subhierarchy and crowding up.
-   */
-  maxBridgePathDegree: 4,
-  /**
-   * Types patterns for predefined out-extract nodes, which are
-   * sink-like nodes that will be extracted from the main graph.
-   */
-  outExtractTypes: [
-    'NoOp'  // NoOps are sink-like used for managing control dependencies.
-  ],
-
-  /**
-   * Types patterns for predefined in-extract nodes, which are
-   * source-like nodes that will be extracted from the main graph.
-   */
-  inExtractTypes: [],
-
-  /**
-   * When removing edges from a high degree node, remove all of its edges if
-   * detachAllEdgesForHighDegree is true.  Otherwise remove all in-edges if
-   * the node has high in-degree, or all out-edges if the node has high
-   * out-degree.
-   */
-  detachAllEdgesForHighDegree: true,
-
-  /**
-   * After extracting high in/out degree nodes and predefined
-   * source-like/sink-like, extract isolated nodes to the side
-   * if this extractIsolatedNodesWithAnnotationsOnOneSide is true.
-   */
-  extractIsolatedNodesWithAnnotationsOnOneSide: true,
-
-  /**
-   * Whether to add bridge nodes and edges to the core when building the
-   * subhierarchy of an expanded metanode. See buildSubhierarchy().
-   */
-  enableBridgegraph: true,
-
-  /**
-   * 2 colors, for the minimum and maximum value respectively, whenever we
-   * have a gradient scale.
-   */
-  minMaxColors: ['#fff5f0', '#fb6a4a'],
-
-  /**
-   * Maximum number of annotations to be displayed on a node before an
-   * ellipsis is used.
-   */
-  maxAnnotations: 5
-};
-
-/**
- * Stores the rendering information, such as x and y coordinates,
- * for each node in the graph.
- */
-export class RenderGraphInfo {
-  hierarchy: hierarchy.Hierarchy;
-  private displayingStats: boolean;
-  private index: {[nodeName: string]: RenderNodeInfo};
-  private renderedOpNames: string[];
-  private deviceColorMap: d3.ScaleOrdinal<string, string>;
-  private xlaClusterColorMap: d3.ScaleOrdinal<string, string>;
-  private memoryUsageScale: d3.ScaleLinear<string, string>;
-  private computeTimeScale: d3.ScaleLinear<string, string>;
-  /** Scale for the thickness of edges when there is no shape information. */
-  edgeWidthScale:
-      d3.ScaleLinear<number, number> | d3.ScalePower<number, number>;
-  // Since the rendering information for each node is constructed lazily,
-  // upon node's expansion by the user, we keep a map between the node's name
-  // and whether the rendering information was already constructed for that
-  // node.
-  private hasSubhierarchy: {[nodeName: string]: boolean};
-  root: RenderGroupNodeInfo;
-  traceInputs: Boolean;
-
-  constructor(hierarchy: hierarchy.Hierarchy, displayingStats: boolean) {
-    this.hierarchy = hierarchy;
-    this.displayingStats = displayingStats;
-    this.index = {};
-    this.renderedOpNames = [];
-
-    this.computeScales();
-    // Maps node name to whether the rendering hierarchy was already
-    // constructed.
-    this.hasSubhierarchy = {};
-    this.root = new RenderGroupNodeInfo(hierarchy.root);
-    this.index[hierarchy.root.name] = this.root;
-    this.renderedOpNames.push(hierarchy.root.name);
-    this.buildSubhierarchy(hierarchy.root.name);
-    this.root.expanded = true;
-    this.traceInputs = false;
-  }
-
-  computeScales() {
-    this.deviceColorMap = d3.scaleOrdinal<string>()
-        .domain(this.hierarchy.devices)
-        .range(_.map(d3.range(this.hierarchy.devices.length),
-                     MetanodeColors.DEVICE_PALETTE));
-
-    this.xlaClusterColorMap =
-        d3.scaleOrdinal<string>()
-            .domain(this.hierarchy.xlaClusters)
-            .range(_.map(
-                d3.range(this.hierarchy.xlaClusters.length),
-                MetanodeColors.XLA_CLUSTER_PALETTE));
-
-    let topLevelGraph = this.hierarchy.root.metagraph;
-    // Find the maximum and minimum memory usage.
-    let memoryExtent = d3.extent(topLevelGraph.nodes(),
-        (nodeName, index) => {
-      let node = topLevelGraph.node(nodeName);
-      // Some ops don't have stats at all.
-      if (node.stats != null) {
-        return node.stats.totalBytes;
-      }
-    });
-    this.memoryUsageScale = d3.scaleLinear<string, string>()
-        .domain(memoryExtent)
-        .range(PARAMS.minMaxColors);
-
-    // Find also the minimum and maximum compute time.
-    let computeTimeExtent = d3.extent(topLevelGraph.nodes(),
-        (nodeName, index) => {
-      let node = topLevelGraph.node(nodeName);
-      // Some ops don't have stats at all.
-      if (node.stats != null) {
-        return node.stats.getTotalMicros();
-      }
-    });
-    this.computeTimeScale = d3.scaleLinear<string, string>()
-        .domain(computeTimeExtent)
-        .range(PARAMS.minMaxColors);
-
-    this.edgeWidthScale = this.hierarchy.hasShapeInfo ?
-      scene.edge.EDGE_WIDTH_SCALE :
-      d3.scaleLinear()
-        .domain([1, this.hierarchy.maxMetaEdgeSize])
-        .range([scene.edge.MIN_EDGE_WIDTH, scene.edge.MAX_EDGE_WIDTH]);
-  }
-
-  /**
-   * Get a previously created RenderNodeInfo by its node name.
-   */
-  getRenderNodeByName(nodeName: string): RenderNodeInfo {
-    return this.index[nodeName];
-  }
-
-  /**
-   * Get the underlying node in the hierarchical graph by its name.
-   */
-  getNodeByName(nodeName: string): Node {
-    return this.hierarchy.node(nodeName);
-  }
-
-  /**
-   * Get a previously created RenderNodeInfo for the specified node name,
-   * or create one if it hasn't been created yet.
-   */
-  getOrCreateRenderNodeByName(nodeName: string): RenderNodeInfo {
-    // Polymer may invoke this with null.
-    if (!nodeName) {
-      return null;
-    }
-
-    if (nodeName in this.index) {
-      return this.index[nodeName];
-    }
-
-    let node = this.hierarchy.node(nodeName);
-    // Exit early if the node does not exist in the hierarchy. This can happen
-    // when a graph is reloaded while the infocard points to a node not visible
-    // at the top-level.
-    if (!node) {
-      return null;
-    }
-    let renderInfo = node.isGroupNode ?
-        new RenderGroupNodeInfo(<GroupNode>node) :
-        new RenderNodeInfo(node);
-    this.index[nodeName] = renderInfo;
-    this.renderedOpNames.push(nodeName);
-
-    if (node.stats) {
-      renderInfo.memoryColor = this.memoryUsageScale(node.stats.totalBytes);
-      renderInfo.computeTimeColor =
-          this.computeTimeScale(node.stats.getTotalMicros());
-    }
-
-    if (!node.isGroupNode) {
-      let clusterName = (node as OpNode).xlaCluster;
-      if (clusterName) {
-        renderInfo.xlaClusterColor = this.xlaClusterColorMap(clusterName);
-      }
-    }
-
-    // We only fade nodes when we're displaying stats.
-    renderInfo.isFadedOut = this.displayingStats &&
-        !tf.graph.util.hasDisplayableNodeStats(node.stats);
-
-    if (node.isGroupNode) {
-      // Make a list of tuples (device, proportion), where proportion
-      // is the fraction of op nodes that have that device.
-      let pairs = _.pairs((<GroupNode>node).deviceHistogram);
-      if (pairs.length > 0) {
-        // Compute the total # of devices.
-        let numDevices = _.sum(pairs, _.last);
-        renderInfo.deviceColors = _.map(pairs, pair => ({
-              color: this.deviceColorMap(pair[0]),
-              // Normalize to a proportion of total # of devices.
-              proportion: pair[1] / numDevices
-            }));
-      }
-    } else {
-      let device = (<OpNode>renderInfo.node).device;
-      if (device) {
-        renderInfo.deviceColors = [{
-          color: this.deviceColorMap(device),
-          proportion: 1.0
-        }];
-      }
-    }
-
-    return this.index[nodeName];
-  }
-
-  /**
-   * Return the nearest ancestor node, including itself, that is visible
-   * in the visualization. This method is used so that we can select
-   * (highlight) a node that isn't drawn yet, by selecting (highlighting)
-   * its nearest ancestor that has been drawn.
-   */
-  getNearestVisibleAncestor(name: string): string {
-    let path = getHierarchicalPath(name);
-    for (let i = 0; i < path.length; i++) {
-      let nodeName = path[i];
-      // Op nodes have expanded set to false by default.
-      if (!this.getRenderNodeByName(nodeName).expanded) {
-        return nodeName;
-      }
-    }
-    // Fallthrough. If everything was expanded return the node.
-    return name;
-  }
-
-  // TODO(jimbo): Delete this an any code it touches (all deprecated).
-  setDepth(depth: number): void {
-    setGroupNodeDepth(this.root, +depth);
-  }
-
-  /**
-   * Returns true if the renderNode is an isolated node within its parent node.
-   */
-  isNodeAuxiliary(renderNode: RenderNodeInfo): boolean {
-    let parentNode = <RenderGroupNodeInfo>this.getRenderNodeByName(
-      renderNode.node.parentNode.name);
-    let found = _.find(parentNode.isolatedInExtract, node => {
-      return node.node.name === renderNode.node.name;
-    });
-    if (found) {
-      return true;
-    }
-    found = _.find(parentNode.isolatedOutExtract, node => {
-      return node.node.name === renderNode.node.name;
-    });
-    return !!found;
-  }
-
-  /**
-   * Returns a list of ops that have been rendered so far for this graph. More
-   * ops may later be rendered if the user expands nodes for instance. The list
-   * returned here can only stay the same size or grow on successive calls.
-   */
-  getNamesOfRenderedOps(): string[] {
-    return this.renderedOpNames;
-  }
-
-  buildSubhierarchy(nodeName: string): void {
-    // Terminate if the rendering hierarchy was already constructed
-    // for this node.
-    if (nodeName in this.hasSubhierarchy) {
-      return;
-    }
-
-    let renderNodeInfo = this.index[nodeName];
-
-    // If it is not a meta node or a series node, don't do anything.
-    if (renderNodeInfo.node.type !== NodeType.META &&
-        renderNodeInfo.node.type !== NodeType.SERIES) {
-      return;
-    }
-
-    // At this point we know the rendering information is about a group node.
-    let renderGroupNodeInfo = <RenderGroupNodeInfo> renderNodeInfo;
-    let metagraph = renderGroupNodeInfo.node.metagraph;
-    let coreGraph = renderGroupNodeInfo.coreGraph;
-
-    // Create render nodes to represent each child from the metagraph. Although
-    // these will initially be added to the coreGraph, they may later be
-    // extracted. Also, due to extraction, the coreGraph may contain disjoint
-    // groups between which there is no visible path (other than annotations).
-    _.each(metagraph.nodes(), childName => {
-
-      let childRenderInfo = this.getOrCreateRenderNodeByName(childName);
-      let childNode = childRenderInfo.node;
-
-      coreGraph.setNode(childName, childRenderInfo);
-
-      if (!childNode.isGroupNode) {
-        _.each((<OpNode>childNode).inEmbeddings, embedding => {
-          let renderMetaedgeInfo = new RenderMetaedgeInfo(null);
-          addInAnnotation(childRenderInfo, embedding, null, renderMetaedgeInfo,
-              AnnotationType.CONSTANT);
-          this.index[embedding.name] = new RenderNodeInfo(embedding);
-        });
-        _.each((<OpNode>childNode).outEmbeddings, embedding => {
-          let renderMetaedgeInfo = new RenderMetaedgeInfo(null);
-          addOutAnnotation(childRenderInfo, embedding, null, renderMetaedgeInfo,
-              AnnotationType.SUMMARY);
-          this.index[embedding.name] = new RenderNodeInfo(embedding);
-        });
-      }
-
-    });
-
-    // Add render metaedge info for edges in the metagraph.
-    _.each(metagraph.edges(), edgeObj => {
-      let metaedge = metagraph.edge(edgeObj);
-      let renderMetaedgeInfo = new RenderMetaedgeInfo(metaedge);
-      renderMetaedgeInfo.isFadedOut =
-          this.index[edgeObj.v].isFadedOut || this.index[edgeObj.w].isFadedOut;
-      coreGraph.setEdge(edgeObj.v, edgeObj.w, renderMetaedgeInfo);
-    });
-
-    if (PARAMS.enableExtraction &&
-        renderGroupNodeInfo.node.type === NodeType.META) {
-      extractHighDegrees(renderGroupNodeInfo);
-    }
-
-    // Record that we constructed the rendering hierarchy for this node, so we
-    // don't construct it another time.
-    this.hasSubhierarchy[nodeName] = true;
-
-    // Look up the parent node's render information and short circuit if none.
-    let parentNode = renderGroupNodeInfo.node.parentNode;
-    if (!parentNode) {
-      return;
-    }
-    let parentNodeInfo =
-      <RenderGroupNodeInfo> this.index[parentNode.name];
-
-    // Utility function for computing the name of a bridge node.
-    let getBridgeNodeName = (inbound, ...rest) =>
-        rest.concat([inbound ? 'IN' : 'OUT']).join('~~');
-
-    // Build out the bridgegraph.
-    let bridgegraph = this.hierarchy.getBridgegraph(nodeName);
-
-    // Look for popular nodes so we can make annotations instead of paths.
-    let otherCounts = {
-      // Counts of edges coming INTO other nodes by name (outgoing from self).
-      in: <{[nodeName: string]: number}> {},
-      // Counts of edges going OUT from other nodes by name (coming into self).
-      out: <{[nodeName: string]: number}> {},
-      // Counts of all control edges involving other nodes by name.
-      control: <{[nodeName: string]: number}> {},
-    };
-    _.each(bridgegraph.edges(), e => {
-      // An edge is inbound if its destination node is in the metagraph.
-      let inbound = !!metagraph.node(e.w);
-      let otherName = inbound ? e.v : e.w;
-      let metaedge = bridgegraph.edge(e);
-      if (!metaedge.numRegularEdges) {
-        otherCounts.control[otherName] =
-          (otherCounts.control[otherName] || 0) + 1;
-      } else if (inbound) {
-        otherCounts.out[otherName] = (otherCounts.out[otherName] || 0) + 1;
-      } else {
-        otherCounts.in[otherName] = (otherCounts.in[otherName] || 0) + 1;
-      }
-    });
-
-    // Add annotations and edges for bridgegraph relationships.
-    let hierarchyNodeMap = this.hierarchy.getNodeMap();
-    _.each(bridgegraph.edges(), bridgeEdgeObj => {
-      let bridgeMetaedge = bridgegraph.edge(bridgeEdgeObj);
-
-      // Determine whether this bridge edge is incoming by checking the
-      // metagraph for a node that matches the destination end.
-      let inbound = !!metagraph.node(bridgeEdgeObj.w);
-
-      // Based on the direction of the edge, one endpoint will be an immediate
-      // child of this renderNodeInfo, and the other endpoint will be a sibling
-      // of the parent (or an ancestor further up).
-      let [childName, otherName] =
-        inbound ?
-          [bridgeEdgeObj.w, bridgeEdgeObj.v] :
-          [bridgeEdgeObj.v, bridgeEdgeObj.w];
-
-      let childRenderInfo = this.index[childName];
-      let otherRenderInfo = this.index[otherName];
-      let otherNode =
-        otherRenderInfo ?
-          otherRenderInfo.node :
-          hierarchyNodeMap[otherName];
-
-      // Determine whether this edge is a control edge between nodes where
-      // either node is high-degree with respect to control edges. This will
-      // be a signal to show it as an annotation instead of a bridge edge.
-      let isHighDegreeControlEdge = !bridgeMetaedge.numRegularEdges &&
-        otherCounts.control[otherName] > PARAMS.maxControlDegree;
-
-      let [, childAnnotations] =
-        inbound ?
-          [renderNodeInfo.inAnnotations, childRenderInfo.inAnnotations] :
-          [renderNodeInfo.outAnnotations, childRenderInfo.outAnnotations];
-
-      // Don't render a bridge path if the other node has in or out degree above
-      // a threshold, lest bridge paths emanating out of a metagraph crowd up,
-      // as was the case for the Fatcat LSTM lstm_1 > lstm_1 metagraph.
-      let otherDegreeCount =
-          (inbound ? otherCounts.out : otherCounts.in)[otherName];
-      let isOtherHighDegree = otherDegreeCount > PARAMS.maxBridgePathDegree;
-
-      // The adjoining render metaedge info from the parent's coreGraph, if any.
-      // It will either be a Metaedge involving this node directly, if it
-      // previously came from a metagraph, or it'll be a Metaedge involving
-      // a previously created bridge node standing in for the other node.
-      let adjoiningMetaedge = null;
-
-      // We can only hope to render a bridge path if:
-      //  - bridgegraph paths are enabled,
-      //  - the other node is not too high-degree,
-      //  - the child is in the core (not extracted for being high-degree), and
-      //  - there's a path (in the traversal sense) between child and other.
-      let canDrawBridgePath = false;
-      if (PARAMS.enableBridgegraph &&
-          !isOtherHighDegree &&
-          !isHighDegreeControlEdge &&
-          childRenderInfo.isInCore()) {
-
-        // Utility function for finding an adjoining metaedge.
-        let findAdjoiningMetaedge = targetName => {
-          let adjoiningEdgeObj: graphlib.EdgeObject =
-            inbound ?
-              { v: targetName, w: nodeName } :
-              { v: nodeName, w: targetName };
-          return <RenderMetaedgeInfo>
-            parentNodeInfo.coreGraph.edge(adjoiningEdgeObj);
-        };
-
-        adjoiningMetaedge = findAdjoiningMetaedge(otherName);
-        if (!adjoiningMetaedge) {
-          adjoiningMetaedge = findAdjoiningMetaedge(
-              getBridgeNodeName(inbound, otherName, parentNode.name));
-        }
-
-        canDrawBridgePath = !!adjoiningMetaedge;
-      }
-
-      // Although dataflow edges are acyclic, control dependency edges may
-      // actually point 'backwards' in the graph. If this bridgeMetaedge is
-      // a control dependency, we need to determine whether it's backwards
-      // pointing so that we render it appropriately.
-      //
-      // For instance, say we're rendering a graph with nodes named A/B and Z/Y,
-      // and we're currently rendering the bridgegraph for A. Further, let's say
-      // that there was an original BaseEdge from A/B->Z/Y and a CONTROL EDGE
-      // from Z/Y=>A/B.
-      //
-      //     +----------------+
-      //     | A              |
-      //     |  +-----+       |         +------+
-      //     |  | B   |>----->|>------->| Z    |
-      //     |  |     |       |         |      |
-      //     |  |     |   *   |         |      |
-      //     |  |     |<=====<|<=======<|      |
-      //     |  +-----+       |         +------+
-      //     +----------------+
-      //
-      // When we render the subhierarchy for Metanode A, we'll come across a
-      // control-only Metaedge in the bridgegraph from Z=>A/B (*). The question
-      // is whether this edge is backwards.
-      //
-      // To answer that question, we follow the chain of adjoining metaedges
-      // until we reach the topmost one. In this case, that's the control-only
-      // Metaedge Z=>A in the ROOT's metagraph. We determine that this edge
-      // is backwards by looking at the topological ordering of ROOT's metagraph
-      // (which ignores control edges) and seeing that Z comes AFTER A.
-      //
-      // The property of being backwards is independent of whether the edge
-      // is inbound or outbound. In the preceding example, if we were building
-      // the subhierarchy for Z, we'd find bridge edge Z/Y=>A, walk to its
-      // topmost adjoining metaedge Z=>A and discover that it's backwards.
-      let backwards = false;
-      if (adjoiningMetaedge && !bridgeMetaedge.numRegularEdges) {
-        // Find the top-most adjoining render metaedge information, and the
-        // GroupNode whose metagraph must contain the associated metaedge.
-        let topAdjoiningMetaedge = adjoiningMetaedge;
-        let topGroupNode = parentNodeInfo.node;
-        while (topAdjoiningMetaedge.adjoiningMetaedge) {
-          topAdjoiningMetaedge = topAdjoiningMetaedge.adjoiningMetaedge;
-          topGroupNode = <GroupNode>topGroupNode.parentNode;
-        }
-
-        // Check against the topological ordering for the top node. The current
-        // bridge metaedge we're evaluating is backwards if its source comes
-        // after its destination.
-        let ordering = this.hierarchy.getTopologicalOrdering(topGroupNode.name);
-        let e = topAdjoiningMetaedge.metaedge;
-        backwards = ordering[e.v] > ordering[e.w];
-      }
-
-      // Render backwards control edges as annotations.
-      canDrawBridgePath = canDrawBridgePath && !backwards;
-
-      // If we can't make a bridge path for any reason, then we add an
-      // annotation instead.
-      if (!canDrawBridgePath) {
-        childAnnotations.push(new Annotation(
-            otherNode,
-            otherRenderInfo,
-            new RenderMetaedgeInfo(bridgeMetaedge),
-            AnnotationType.SHORTCUT,
-            inbound));
-        return;
-      }
-
-      // At this point, all conditions have been met for drawing a bridge path.
-
-      // Find or create the IN/OUT node representing otherNode.
-      let bridgeContainerName = getBridgeNodeName(inbound, nodeName);
-      let bridgeNodeName = getBridgeNodeName(inbound, otherName, nodeName);
-      let bridgeNodeRenderInfo = coreGraph.node(bridgeNodeName);
-      if (!bridgeNodeRenderInfo) {
-
-        // Find or create the directional container for the bridge node.
-        let bridgeContainerInfo = coreGraph.node(bridgeContainerName);
-        if (!bridgeContainerInfo) {
-          let bridgeContainerNode: BridgeNode = {
-            // Important node properties.
-            name: bridgeContainerName,
-            type: NodeType.BRIDGE,
-            // Unused node properties.
-            isGroupNode: false,
-            cardinality: 0,
-            parentNode: null,
-            stats: null,
-            include: InclusionType.UNSPECIFIED,
-            // BridgeNode properties.
-            inbound: inbound,
-            nodeAttributes: {},
-          };
-          bridgeContainerInfo =
-            new RenderNodeInfo(bridgeContainerNode);
-          this.index[bridgeContainerName] = bridgeContainerInfo;
-          coreGraph.setNode(bridgeContainerName, bridgeContainerInfo);
-        }
-
-        let bridgeNode: BridgeNode = {
-          // Important node properties.
-          name: bridgeNodeName,
-          type: NodeType.BRIDGE,
-          // Unimportant node properties.
-          isGroupNode: false,
-          cardinality: 1,
-          parentNode: null,
-          stats: null,
-          include: InclusionType.UNSPECIFIED,
-          // BridgeNode properties.
-          inbound: inbound,
-          nodeAttributes: {},
-        };
-        bridgeNodeRenderInfo = new RenderNodeInfo(bridgeNode);
-        this.index[bridgeNodeName] = bridgeNodeRenderInfo;
-        coreGraph.setNode(bridgeNodeName, bridgeNodeRenderInfo);
-
-        // Set bridgeNode to be a graphlib child of the container node.
-        coreGraph.setParent(bridgeNodeName, bridgeContainerName);
-        bridgeContainerInfo.node.cardinality++;
-      }
-
-      // Create and add a bridge render metaedge.
-      let bridgeRenderMetaedge =
-        new RenderMetaedgeInfo(bridgeMetaedge);
-      bridgeRenderMetaedge.adjoiningMetaedge = adjoiningMetaedge;
-      inbound ?
-        coreGraph.setEdge(bridgeNodeName, childName, bridgeRenderMetaedge) :
-        coreGraph.setEdge(childName, bridgeNodeName, bridgeRenderMetaedge);
-
-    }); // End _.each(bridgegraph.edges).
-
-    // For each bridge container (IN and/or OUT), add structural edges between
-    // terminal nodes and that container. A terminal node is one which has no
-    // non-bridge edges in the direction of the container.
-    //
-    // For example, consider a Metanode A which contains two child nodes A/B
-    // and A/C. Let's say it has one edge in the metagraph from A/B->A/C, and
-    // one edge in the bridgegraph from Z->A/C.
-    //
-    // At this point, we've added a container bridge node IN to house all
-    // incoming bridge nodes. We've also added a bridge node Z' (with parent IN)
-    // to A, and a bridge edge from Z'->C.
-    //
-    //     +----------------------+
-    //     | A          +---+     |
-    //     |    +------>| C |     |
-    //     |    |       +---+     |
-    //     |    |         ^       |
-    //     |    |         |       |
-    //     |    |    +----|----+  |
-    //     |    |    | IN |    |  |
-    //     |  +---+  |  +---+  |  |
-    //     |  | B |  |  | Z'|  |  |
-    //     |  +---+  |  +---+  |  |
-    //     |         +---------+  |
-    //     +----------------------+
-    //
-    // With no other help, dagre would lay out B and Z' on the same level,
-    // because both of them have no incoming edges. In other words, B is a
-    // terminal node in the INCOMING direction.
-    //
-    // But we want to force dagre to lay out Z' (and everything in IN) lower
-    // than all non-bridge nodes, so that there's enough room for the bridge
-    // edges after they've been adjusted to meet up with paths coming in from
-    // outside.
-    //
-    // To force Z' (and all other bridge nodes) to be lowest in the graph, we
-    // identify terminal nodes like B and give them structural edges to
-    // a new structural bridge node S which we add to IN.
-    //
-    //     +----------------------+
-    //     | A          +---+     |
-    //     |       +--->| C |     |
-    //     |       |    +---+     |
-    //     |     +---+    ^       |
-    //     |     | B |    |       |
-    //     |     +---+    |       |
-    //     |       ^      |       |
-    //     |       |      |       |
-    //     |  +----|------|----+  |
-    //     |  |IN  |      |    |  |
-    //     |  |  +---+  +---+  |  |
-    //     |  |  | S |  | Z'|  |  |
-    //     |  |  +---+  +---+  |  |
-    //     |  +----------------+  |
-    //     +----------------------+
-    //
-    // This ensures that dagre will lay out the bridge containers strictly at
-    // the ends of the graph. The structural edges will never be seen in the
-    // visualization except as a debugging aid.
-    _.each([true, false], inbound => {
-      let bridgeContainerName = getBridgeNodeName(inbound, nodeName);
-      let bridgeContainerInfo = coreGraph.node(bridgeContainerName);
-      if (!bridgeContainerInfo) {
-        return;
-      }
-      _.each(coreGraph.nodes(), childName => {
-        // Short-circuit if this child is a bridge node or it's not a terminal
-        // node in the direction we're interested in.
-        let childNodeInfo = coreGraph.node(childName);
-        if (childNodeInfo.node.type === NodeType.BRIDGE) {
-          return;
-        }
-        let isTerminal = inbound ?
-          !coreGraph.predecessors(childName).length :
-          !coreGraph.successors(childName).length;
-        if (!isTerminal) {
-          return;
-        }
-
-        // Find or create a bridge node in the container for all structural
-        // metaedges. It would have been nice to skip this step and simply
-        // set a metaedge between the terminal node and the container node, but
-        // in that case, something about the graph upsets dagre.layout()'s
-        // longestPath algorithm (was getting errors due to an undefined).
-        let structuralNodeName =
-            getBridgeNodeName(inbound, nodeName, 'STRUCTURAL_TARGET');
-        let structuralRenderInfo = coreGraph.node(structuralNodeName);
-        if (!structuralRenderInfo) {
-          let bridgeNode: BridgeNode = {
-            // Important Node properties.
-            name: structuralNodeName,
-            type: NodeType.BRIDGE,
-            // Unimportant Node properties.
-            isGroupNode: false,
-            cardinality: 1,
-            parentNode: null,
-            stats: null,
-            include: InclusionType.UNSPECIFIED,
-            // BridgeNode properties.
-            inbound: inbound,
-            nodeAttributes: {},
-          };
-          structuralRenderInfo = new RenderNodeInfo(bridgeNode);
-          structuralRenderInfo.structural = true;
-          this.index[structuralNodeName] = structuralRenderInfo;
-          coreGraph.setNode(structuralNodeName, structuralRenderInfo);
-          bridgeContainerInfo.node.cardinality++;
-          coreGraph.setParent(structuralNodeName, bridgeContainerName);
-        }
-
-        // Create the structural Metaedge and insert it.
-        let structuralMetaedgeInfo = new RenderMetaedgeInfo(null);
-        structuralMetaedgeInfo.structural = true;
-        structuralMetaedgeInfo.weight--; // Reduce weight for dagre layout.
-        inbound ?
-          coreGraph.setEdge(
-              structuralNodeName, childName, structuralMetaedgeInfo) :
-          coreGraph.setEdge(
-              childName, structuralNodeName, structuralMetaedgeInfo);
-      });
-    });
-  }
-}
-
-/**
- * A class for rendering annotation object which contains label
- * about the node embedded as annotation, type of annotation and the location
- * of both the annotation's node and edge.
- *
- * Annotation objects include embedded constants, embedded summary, and
- * edge shortcuts.
- */
-export class Annotation {
-  node: Node;
-  renderNodeInfo: RenderNodeInfo;
-  renderMetaedgeInfo: RenderMetaedgeInfo;
-  annotationType: AnnotationType;
-  /**
-   * Center position of annotation relative to the host
-   * node's center x.
-   */
-  dx: number;
-  /**
-   * Center position of annotation relative to the host
-   * node's center y.
-   */
-  dy: number;
-  width: number;
-  height: number;
-  /**
-   * The names of nodes on either side of this edge.
-   */
-  v: string;
-  w: string;
-  /**
-   * A flag whether it is an in-annotation (if true) or
-   * out-annotation  (if false).
-   */
-  isIn: boolean;
-  /** Label horizontal offset from the end of the node shape */
-  labelOffset: number;
-  /**
-   * Array of points for edges from the annotation to its host
-   * node. Each point contains the point location, relative to
-   * the host node's center.
-   */
-  points: {dx: number, dy: number}[];
-
-  /**
-   * Creates a new Annotation.
-   *
-   * @param node The underlying node this annotation points to.
-   * @param renderNodeInfo The render information for the underlying node
-   *     this annotation points to. This can be null if the annotation
-   *     denotes an embedding (constant, summary), in which case we
-   *     use the node property.
-   * @param renderMetaedgeInfo The render information for the edge associated
-   *     with the annotation.
-   * @param type The type of the annotation.
-   * @param isIn True if it is an in-annotation. False if it is an
-   *     out-annotation.
-   */
-  constructor(node: Node, renderNodeInfo: RenderNodeInfo,
-      renderMetaedgeInfo: RenderMetaedgeInfo, type: AnnotationType,
-      isIn: boolean) {
-    this.node = node;
-    this.renderNodeInfo = renderNodeInfo;
-    this.renderMetaedgeInfo = renderMetaedgeInfo;
-    this.annotationType = type;
-    // Properties specified by layout
-    this.dx = 0;
-    this.dy = 0;
-    this.width = 0;
-    this.height = 0;
-    // Properties needed for generating an ID for the edge's path element if
-    // this annotation is associated with a metaedge.
-    if (renderMetaedgeInfo && renderMetaedgeInfo.metaedge) {
-      this.v = renderMetaedgeInfo.metaedge.v;
-      this.w = renderMetaedgeInfo.metaedge.w;
-    }
-
-    this.isIn = isIn;
-    this.points = [];
-  }
-};
-
-export enum AnnotationType {SHORTCUT, CONSTANT, SUMMARY, ELLIPSIS};
-
-/**
- * Manages a list of annotations. Two will be used for each
- * RenderNodeInfo, one for in annotations and one for out annotations.
- */
-export class AnnotationList {
-  /**
-   * List of visually drawable annotations, may include an ellipses annotation
-   * if the number added exceeds the number specified by maxAnnotations.
-   */
-  list: Annotation[];
-
-  /**
-   * Set of nodes which have been added as annotations to this list, so we can
-   * prevent duplicates.
-   */
-  nodeNames: { [nodeName: string]: boolean };
-
-  constructor() {
-    this.list = [];
-    this.nodeNames = {};
-  }
-
-  /**
-   * Append an annotation to the list, or a stand-in ellipsis annotation instead
-   * if this would make it too many.
-   */
-  push(annotation: Annotation): void {
-    if (annotation.node.name in this.nodeNames) {
-      return; // Skip duplicate annotation.
-    }
-    this.nodeNames[annotation.node.name] = true;
-
-    if (this.list.length < PARAMS.maxAnnotations) {
-      this.list.push(annotation);
-      return;
-    }
-
-    let lastAnnotation = this.list[this.list.length - 1];
-    if (lastAnnotation.annotationType === AnnotationType.ELLIPSIS) {
-      let ellipsisNode = <EllipsisNode>lastAnnotation.node;
-      ellipsisNode.setNumMoreNodes(++ellipsisNode.numMoreNodes);
-      return;
-    }
-
-    let ellipsisNode = new tf.graph.EllipsisNodeImpl(1);
-    this.list.push(new Annotation(ellipsisNode,
-        new RenderNodeInfo(ellipsisNode), null,
-        AnnotationType.ELLIPSIS, annotation.isIn));
-  }
-}
-
-/**
- * Contains rendering information about a node in the hierarchical graph.
- */
-export class RenderNodeInfo {
-  /** Reference to the original underlying Node from the hierarchical graph. */
-  node: Node;
-  /** Whether the node is expanded or not. */
-  expanded: boolean;
-  /**
-   * List of rendering information about in-annotations like constants and
-   * shortcuts to high-degree nodes.
-   */
-  inAnnotations: AnnotationList;
-  /**
-   * List of rendering information about out-annotations (e.g. summary nodes)
-   */
-  outAnnotations: AnnotationList;
-
-  // --- Params specified by layout --- //
-
-  /** Center x position */
-  x: number;
-  /** Center y position */
-  y: number;
-  /**
-   * Total width of the node's shape, including in- and out-annotations. This
-   * property is used by dagre to layout the graph.
-   */
-  width: number;
-  /**
-   * Total height of the node's shape, including in- and out-annotations. This
-   * property is used by dagre to layout the graph.
-   */
-  height: number;
-  /**
-   * Size of the main box of the node, excluding in- and out-annotations. This
-   * property is used to draw the rectangle/ellipse shape denoting the node.
-   */
-  coreBox: {
-    width: number,
-    height: number,
-  };
-
-  /** Width of the bounding box for all in-annotations. */
-  inboxWidth: number;
-  /** Width of the bounding box for all out-annotations. */
-  outboxWidth: number;
-  /**
-   * Whether the node should be excluded from the scene.
-   * This is only used when there are too many items in a series so we only
-   * want to include top N ones.
-   */
-  // TODO(jimbo): Now that series rendering is non-recursive, remove this and
-  // all its uses from the code base.
-  excluded: boolean;
-
-  // --- Params used in drawing the bridge paths --- //
-
-  /**
-   * All bridge nodes are meant to be invisible, but whereas most represent a
-   * relationship from the underlying graph hierarchy, some exist solely for
-   * layout reasons. Specifically, those bridge nodes which have only structural
-   * rendering metaedges.
-   */
-  structural: boolean;
-
-  // --- Params for the size of the node box --- //
-
-  /** Label vertical offset from the center of node shape */
-  labelOffset: number;
-  /** Rectangle radius (for making rounded rectangle) */
-  radius: number;
-
-  // --- Params for expanded node --- //
-
-  /** Label height for expanded node. */
-  labelHeight: number;
-  // Paddings between inner subscene and the border of the expanded node.
-  paddingTop: number;
-  paddingLeft: number;
-  paddingRight: number;
-  paddingBottom: number;
-
-  /**
-   * Whether a node is extracted as source-like (having high out-degree or
-   * matching predefined in-extract pattern.)
-   */
-  isInExtract: boolean;
-  /**
-   * Whether a node is extracted as sink-like (having high in-degree or matching
-   * predefined out-extract pattern.)
-   */
-  isOutExtract: boolean;
-
-  /**
-   * List of (color, proportion) tuples based on the proportion of devices of
-   * its children. If this node is an op node, this list will have only one
-   * color with proportion 1.0.
-   */
-  deviceColors: Array<{color: string, proportion: number}>;
-
-  /**
-   * Color according to the XLA cluster of this node.
-   */
-  xlaClusterColor: string;
-
-  /**
-   * Color according to the memory usage of this node.
-   */
-  memoryColor: string;
-
-  /**
-   * Color according to the compute time of this node.
-   */
-  computeTimeColor: string;
-
-  /**
-   * Whether this node is faded out. Used when displaying stats.
-   */
-  isFadedOut: boolean;
-
-  constructor(node: Node) {
-    this.node = node;
-    this.expanded = false;
-    this.inAnnotations = new AnnotationList();
-    this.outAnnotations = new AnnotationList();
-    // Params specified by layout
-    this.x = 0;
-    this.y = 0;
-    this.width = 0;
-    this.height = 0;
-    this.inboxWidth = 0;
-    this.outboxWidth = 0;
-
-    this.excluded = false;
-
-    // Params for bridge paths.
-    this.structural = false;
-
-    // Params for node box.
-    this.labelOffset = 0;
-    this.radius = 0;
-
-    // Params for expanded node
-    this.labelHeight = 0;
-    this.paddingTop = 0;
-    this.paddingLeft = 0;
-    this.paddingRight = 0;
-    this.paddingBottom = 0;
-    this.isInExtract = false;
-    this.isOutExtract = false;
-    this.coreBox = {width: 0, height: 0};
-
-    // By default, we don't fade nodes out. Default to false for safety.
-    this.isFadedOut = false;
-  }
-
-  isInCore(): boolean {
-    return !this.isInExtract && !this.isOutExtract;
-  }
-}
-
-/**
- * Contains rendering information about a Metaedge from the underlying
- * hierarchical graph. It may be from either a metagraph or a bridgegraph.
- */
-export class RenderMetaedgeInfo {
-  /**
-   * Reference to the original underlying Metaedge from the hierarchical graph,
-   * if any. This will be null for the edges which connect OpNodes to their
-   * embeddings, for example.
-   */
-  metaedge: Metaedge;
-
-  /**
-   * Reference to the adjoining RenderMetaedgeInfo from the parent's
-   * coreGraph. This is used during layout to determine the point at which this
-   * edge should touch the node's bounding box. This property will be null for
-   * edges which terminate at a node on both ends (all non-bridge edges).
-   */
-  adjoiningMetaedge: RenderMetaedgeInfo;
-
-  /**
-   * Most of the time, a RenderMetaedgeInfo object represents a real
-   * edge between nodes in the underlying graph structure. But sometimes, an
-   * edge only exists for layout purposes. These structural edges are added
-   * during buildSubhierarchy() to force dagre.layout() to put bridge nodes
-   * at the ends of the flow.
-   * @see buildSubhierarchy()
-   */
-  structural: boolean;
-
-  /**
-   * Weight of the edge, used by dagre when deciding how important an edge is.
-   * Edges with higher weight are made shorter and straighter. The default
-   * dagre uses is 1.
-   */
-  weight: number;
-
-  /**
-   * X and Y coordinate pairs of the points in the path of the edge.
-   * @see tf.graph.node.subsceneAdjustPaths
-   */
-  points: Point[];
-
-  /**
-   * D3 selection of the group containing the path that displays this edge.
-   */
-  edgeGroup: d3.Selection<RenderMetaedgeInfo & any, any, any, any>;
-
-  /** Id of the <marker> used as a start-marker for the edge path. */
-  startMarkerId: string;
-
-  /** Id of the <marker> used as an end-marker for the edge path. */
-  endMarkerId: string;
-
-  /**
-   * Whether this edge is faded out. Used for fading out unused edges when
-   * displaying run statistics.
-   */
-  isFadedOut: boolean;
-
-  constructor(metaedge: Metaedge) {
-    this.metaedge = metaedge;
-    this.adjoiningMetaedge = null;
-    this.structural = false;
-    this.weight = 1;
-    this.isFadedOut = false;
-  }
-}
-
-function addInAnnotation(node: RenderNodeInfo, predecessor: Node,
-    predecessorRenderInfo: RenderNodeInfo,
-    edge: RenderMetaedgeInfo, type: AnnotationType): void {
-  let annotation = new Annotation(predecessor, predecessorRenderInfo, edge,
-      type, true);
-  node.inAnnotations.push(annotation);
-}
-
-function addOutAnnotation(node: RenderNodeInfo, successor: Node,
-    successorRenderInfo: RenderNodeInfo, edge: RenderMetaedgeInfo,
-    type: AnnotationType): void {
-  let annotation = new Annotation(successor, successorRenderInfo, edge,
-      type, false);
-  node.outAnnotations.push(annotation);
-}
-
-function setGraphDepth(graph: graphlib.Graph<RenderNodeInfo, any>,
-    depth: number) {
-  _.each(graph.nodes(), nodeName => {
-    let child = graph.node(nodeName);
-    child.expanded = depth > 1; // set all child of depth 1 to collapsed
-    if (depth > 0) {
-      switch (child.node.type) {
-        case NodeType.META:
-        case NodeType.SERIES:
-          setGroupNodeDepth(<RenderGroupNodeInfo>child, depth - 1);
-          break;
-        // Do nothing for leaf
-      }
-    }
-  });
-};
-
-export class RenderGroupNodeInfo extends RenderNodeInfo {
-  node: GroupNode;
-  /**
-   * The core graph is derived from the underlying node's metagraph, minus
-   * the extracted source-like and sink-like nodes.
-   */
-  coreGraph: graphlib.Graph<RenderNodeInfo, RenderMetaedgeInfo>;
-  /** Size of the bounding box for a metanode's isolated in-extract children. */
-  inExtractBox: {width: number, height: number};
-  /**
-   * Size of the bounding box for a metanode's isolated out-extract children.
-   */
-  outExtractBox: {width: number, height: number};
-  /** Array of isolated in-extract nodes. */
-  isolatedInExtract: RenderNodeInfo[];
-  /** Array of isolated out-extract nodes. */
-  isolatedOutExtract: RenderNodeInfo[];
-
-  constructor(groupNode: GroupNode) {
-    super(groupNode);
-    let metagraph = groupNode.metagraph;
-    let gl = metagraph.graph();
-    this.coreGraph =
-        createGraph<RenderNodeInfo, RenderMetaedgeInfo>(
-            gl.name, GraphType.CORE, { compound: true });
-    this.inExtractBox = {width: 0, height: 0};
-    this.outExtractBox = {width: 0, height: 0};
-    this.isolatedInExtract = [];
-    this.isolatedOutExtract = [];
-  }
-}
-
-function setGroupNodeDepth(renderInfo: RenderGroupNodeInfo,
-    depth: number): void {
-  if (renderInfo.coreGraph) {
-    setGraphDepth(renderInfo.coreGraph, depth);
-  }
-}
-
-/**
- * Remove an edge from the graph and add annotations to both ends of the edge.
- *
- * @param The core graph.
- * @param v Source name.
- * @param w Sink name.
- */
-function createShortcut(
-    graph: graphlib.Graph<RenderNodeInfo, RenderMetaedgeInfo>,
-    v: string, w: string) {
-  let src = graph.node(v);
-  let sink = graph.node(w);
-  let edge = graph.edge(v, w);
-
-  // If either of the nodes is explicitly included in the main graph and
-  // both nodes are in the main graph then do not create the shortcut
-  // and instead keep the real edge.
-  if ((src.node.include === InclusionType.INCLUDE ||
-       sink.node.include === InclusionType.INCLUDE) &&
-      src.node.include !== InclusionType.EXCLUDE &&
-      sink.node.include !== InclusionType.EXCLUDE) {
-    return;
-  }
-
-  // Add each annotation.
-  addOutAnnotation(src, sink.node, sink, edge, AnnotationType.SHORTCUT);
-  addInAnnotation(sink, src.node, src, edge, AnnotationType.SHORTCUT);
-
-  // Remove the edge from the core graph.
-  graph.removeEdge(v, w);
-}
-
-/**
- * Remove edges from a node, and set its isOutExtract property to true,
- * and remove the node and move it to isolatedOutExtract.
- *
- * If detachAllEdgesForHighDegree or forceDetach is true, extract all of its
- * edges. Otherwise, only extract all in-edges.
- */
-function makeOutExtract(renderNode: RenderGroupNodeInfo, n: string,
-    forceDetach?: boolean) {
-  let graph = renderNode.coreGraph;
-  let child = graph.node(n);
-  child.isOutExtract = true;
-
-  _.each(graph.predecessors(n), (p, index) => {
-    createShortcut(graph, p, n);
-  });
-
-  if (PARAMS.detachAllEdgesForHighDegree || forceDetach) {
-    _.each(graph.successors(n), (s, index) => {
-      createShortcut(graph, n, s);
-    });
-  }
-
-  // Remove the node from the core graph if it no longer has neighbors.
-  if (graph.neighbors(n).length === 0) {
-    child.node.include = InclusionType.EXCLUDE;
-    renderNode.isolatedOutExtract.push(child);
-    graph.removeNode(n);
-  }
-}
-
-/**
- * Remove edges from a node, set its isInExtract property to true,
- * and remove the node and move it to isolatedInExtract.
- *
- * If detachAllEdgesForHighDegree or forceDetach is true, extract all of its
- * edges. Otherwise, only remove all out-edges.
- */
-export function makeInExtract(renderNode: RenderGroupNodeInfo, n: string,
-    forceDetach?: boolean) {
-  let graph = renderNode.coreGraph;
-  let child = graph.node(n);
-  child.isInExtract = true;
-
-  _.each(graph.successors(n), (s, index) => {
-    createShortcut(graph, n, s);
-  });
-
-  if (PARAMS.detachAllEdgesForHighDegree || forceDetach) {
-    _.each(graph.predecessors(n), (p, index) => {
-      createShortcut(graph, p, n);
-    });
-  }
-
-  // Remove the node from the core graph if it no longer has neighbors.
-  if (graph.neighbors(n).length === 0) {
-    child.node.include = InclusionType.EXCLUDE;
-    renderNode.isolatedInExtract.push(child);
-    graph.removeNode(n);
-  }
-}
-
-/**
- * Check whether the node's type is a member of the given list of types.
- *
- * @param node Node.
- * @param types List of type to match.
- */
-function hasTypeIn(node: Node, types: string[]): boolean {
-  if (node.type === NodeType.OP) {
-    for (let i = 0; i < types.length; i++) {
-      if ((<OpNode>node).op === types[i]) { return true; }
-    }
-  } else if (node.type === NodeType.META) {
-    let rootOpNode = (<Metanode>node).getRootOp();
-    if (rootOpNode) {
-      for (let i = 0; i < types.length; i++) {
-        if (rootOpNode.op === types[i]) { return true; }
-      }
-    }
-  }
-  return false;
-}
-
-/** Move nodes that are specified to be excluded out of the core graph. */
-function extractSpecifiedNodes(renderNode: RenderGroupNodeInfo) {
-  let graph = renderNode.coreGraph;
-  _.each(graph.nodes(), n => {
-    let renderInfo = graph.node(n);
-    if (renderInfo.node.include === InclusionType.EXCLUDE) {
-      if (renderNode.coreGraph.outEdges(n).length >
-          renderNode.coreGraph.inEdges(n).length) {
-        makeOutExtract(renderNode, n, true);
-      } else {
-        makeInExtract(renderNode, n, true);
-      }
-    }
-  });
-}
-
-/** Remove edges from pre-defined out-extract patterns */
-function extractPredefinedSink(renderNode: RenderGroupNodeInfo) {
-  let graph = renderNode.coreGraph;
-  _.each(graph.nodes(), n => {
-    let renderInfo = graph.node(n);
-    if (renderInfo.node.include !== InclusionType.UNSPECIFIED) {
-      return;
-    }
-    if (hasTypeIn(renderInfo.node, PARAMS.outExtractTypes)) {
-      makeOutExtract(renderNode, n);
-    }
-  });
-}
-
-/** Remove edges from pre-defined in-extract patterns */
-function extractPredefinedSource(renderNode) {
-  let graph = renderNode.coreGraph;
-  _.each(graph.nodes(), n => {
-    let renderInfo = graph.node(n);
-    if (renderInfo.node.include !== InclusionType.UNSPECIFIED) {
-      return;
-    }
-    if (hasTypeIn(renderInfo.node, PARAMS.inExtractTypes)) {
-      makeInExtract(renderNode, n);
-    }
-  });
-}
-
-/** Extract nodes deemed to have either high in-degree or high out-degree. */
-function extractHighInOrOutDegree(renderNode: RenderGroupNodeInfo) {
-  let graph = renderNode.coreGraph;
-
-  // Create mappings from node to in and out degrees. Count the number of valid
-  // nodes along the way.
-  let nodeToInDegree = {};
-  let nodeToOutDegree = {};
-  let validNodeCount = 0;
-  _.each(graph.nodes(), currentNode => {
-    if (graph.node(currentNode).node.include !== InclusionType.UNSPECIFIED) {
-      // This node is not included in the first place.
-      return;
-    }
-
-    // Count the in and out degrees based on only regular edges, unless there
-    // are no regular edges, in which case use the number of control edges.
-    // This is done so that control edges don't affect if nodes are extracted
-    // from the core graph, unless the node is only used for control.
-    let inDegree =
-        _.reduce(graph.predecessors(currentNode), (inDegree, pred) => {
-          let metaedge = graph.edge(pred, currentNode).metaedge;
-          return inDegree + (metaedge.numRegularEdges ? 1 : 0);
-        }, 0);
-    if (inDegree === 0 && graph.predecessors(currentNode).length > 0) {
-      inDegree = graph.predecessors(currentNode).length;
-    }
-
-    let outDegree =
-        _.reduce(graph.successors(currentNode), (outDegree, succ) => {
-          let metaedge = graph.edge(currentNode, succ).metaedge;
-          return outDegree + (metaedge.numRegularEdges ? 1 : 0);
-        }, 0);
-    if (outDegree === 0 && graph.successors(currentNode).length > 0) {
-      outDegree = graph.successors(currentNode).length;
-    }
-
-    // Store the in and out degrees of this node to avoid recomputing.
-    nodeToInDegree[currentNode] = inDegree;
-    nodeToOutDegree[currentNode] = outDegree;
-    validNodeCount++;
-  });
-
-  if (validNodeCount < PARAMS.minNodeCountForExtraction) {
-    // This graph has few nodes. Do not extract any nodes.
-    return;
-  }
-
-  // We only extract if the node has a min in or out degree greater than this.
-  let minUpperBound = PARAMS.minDegreeForExtraction - 1;
-
-  // Mark for extraction nodes with in-degree > Q3 + (Q3 - Q1).
-  let q3Index = Math.round(validNodeCount * 0.75);
-  let q1Index = Math.round(validNodeCount * 0.25);
-  let sortedByInDegree = Object.keys(nodeToInDegree).sort((node0, node1) => {
-    return nodeToInDegree[node0] - nodeToInDegree[node1];
-  });
-  let inDegreeQ3 = nodeToInDegree[sortedByInDegree[q3Index]];
-  let inDegreeQ1 = nodeToInDegree[sortedByInDegree[q1Index]];
-  let inDegreeUpperBound = inDegreeQ3 + inDegreeQ3 - inDegreeQ1;
-  // Only extract if the upper bound is high enough.
-  inDegreeUpperBound = Math.max(inDegreeUpperBound, minUpperBound);
-  for (let i = validNodeCount - 1;
-       nodeToInDegree[sortedByInDegree[i]] > inDegreeUpperBound; i--) {
-    // Extract a high in-degree node.
-    makeInExtract(renderNode, sortedByInDegree[i]);
-  }
-
-  // Mark for extraction nodes with out-degree > Q3 + (Q3 - Q1) * 4.
-  let sortedByOutDegree = Object.keys(nodeToOutDegree).sort((node0, node1) => {
-    return nodeToOutDegree[node0] - nodeToOutDegree[node1];
-  });
-  let outDegreeQ3 = nodeToOutDegree[sortedByOutDegree[q3Index]];
-  let outDegreeQ1 = nodeToOutDegree[sortedByOutDegree[q1Index]];
-  // The upper bound for extracting out-degree nodes is higher than that for
-  // extracting in-degree ones (Note the "* 4") because, in practice, some
-  // graphs look worse with a smaller out-degree bound. For instance, a smaller
-  // out-degree bound removes the convolution nodes from cifar 10 train's graph.
-  let outDegreeUpperBound = outDegreeQ3 + (outDegreeQ3 - outDegreeQ1) * 4;
-  // Only extract if the upper bound is high enough.
-  outDegreeUpperBound = Math.max(outDegreeUpperBound, minUpperBound);
-  for (let i = validNodeCount - 1;
-       nodeToOutDegree[sortedByOutDegree[i]] > outDegreeUpperBound; i--) {
-    let node = graph.node(sortedByOutDegree[i]);
-    if (!node || node.isInExtract) {
-      // This node has already been extracted due to high in-degree. It might
-      // have been removed from the graph in general (during in-degree
-      // extraction) due to a lack of neighbors. Do not extract this node twice.
-      continue;
-    }
-
-    // Extract a high out-degree node that has not already been extracted.
-    makeOutExtract(renderNode, sortedByOutDegree[i]);
-  }
-}
-
-/** Remove control edges from nodes that have too many control edges */
-function removeControlEdges(renderNode: RenderGroupNodeInfo) {
-  let graph = renderNode.coreGraph;
-
-  // Collect control edges into a map by node name.
-  let map = <{[nodeName: string]: graphlib.EdgeObject[]}>{};
-  _.each(graph.edges(), e => {
-    if (!graph.edge(e).metaedge.numRegularEdges) {
-      (map[e.v] = map[e.v] || []).push(e);
-      (map[e.w] = map[e.w] || []).push(e);
-    }
-  });
-
-  // For each node with too many control edges, turn them into annotations.
-  _.each(map, (edges, nodeName) => {
-    if (edges.length > PARAMS.maxControlDegree) {
-      _.each(edges, e => createShortcut(graph, e.v, e.w));
-    }
-  });
-}
-
-/**
- * Given an integer, picks a hue that is far apart from other colors.
- * The formula for picking color that avoid collision is:
- *     hue = (color range * golden ratio * index) % color range
- */
-export function mapIndexToHue(id: number): number {
-  let GOLDEN_RATIO = 1.61803398875;
-  // Hue of 0 is reserved for the gray nodes.
-  let MIN_HUE = 1;
-  let MAX_HUE = 359;
-  let COLOR_RANGE = MAX_HUE - MIN_HUE;
-  return MIN_HUE + ((COLOR_RANGE * GOLDEN_RATIO * id) % COLOR_RANGE);
-};
-
-/**
- * Remove edges and add to annotation instead.
- *
- * For root node, consider predefined types for source and sink.
- * We do not extract predefined type from non-root so that Variables and the
- * sgd node (op type = 'NoOp') do not get extract from inside own group.
- *
- * The order of extraction is important here as swapping the order can totally
- * screw up the graph layout.
- *
- * @param {Render.Node} renderNode Node to manipulate.
- */
-function extractHighDegrees(renderNode: RenderGroupNodeInfo) {
-
-  extractSpecifiedNodes(renderNode);
-
-  if (PARAMS.outExtractTypes) {
-    extractPredefinedSink(renderNode);
-  }
-
-  // This has to come before extract high in-degree to protect the core part
-  // that takes many variables.
-  if (PARAMS.inExtractTypes) {
-    extractPredefinedSource(renderNode);
-  }
-
-  extractHighInOrOutDegree(renderNode);
-
-  if (PARAMS.maxControlDegree) {
-    removeControlEdges(renderNode);
-  }
-
-  // Extract isolated nodes, which can be
-  // (1) source-like and sink-like nodes that are not originally isolated but
-  //     become isolated after further removal.
-  // (2) isolated nodes with annotations on one-side.  These might be either
-  //     - nodes that originally have high out-degree but because we remove
-  //       high in-degree nodes first, they no longer have high in-degree when
-  //       we check.  (Detecting all high-degree before removing also leads to
-  //       another problem.)
-  //     - nodes that do not have high degree, but their neighbors are all
-  //       extracted, so it might make sense to extract them too.
-
-  let graph = renderNode.coreGraph;
-  _.each(graph.nodes(), n => {
-    let child = graph.node(n);
-    let degree = graph.neighbors(n).length;
-    if (child.node.include !== InclusionType.UNSPECIFIED) {
-      return;
-    }
-    if (degree === 0) {
-      let hasOutAnnotations = child.outAnnotations.list.length > 0;
-      let hasInAnnotations = child.inAnnotations.list.length > 0;
-
-      if (child.isInExtract) { // Is source-like.
-        // This case only happens if detachAllEdgesForHighDegree is false.
-        // (Otherwise all source-like nodes are all isolated already.)
-        renderNode.isolatedInExtract.push(child);
-        child.node.include = InclusionType.EXCLUDE;
-        graph.removeNode(n);
-      } else if (child.isOutExtract) { // Is sink-like.
-        // This case only happens if detachAllEdgesForHighDegree is false.
-        // // (Otherwise all sink-like nodes are all isolated already.)
-        renderNode.isolatedOutExtract.push(child);
-        child.node.include = InclusionType.EXCLUDE;
-        graph.removeNode(n);
-      } else if (PARAMS.extractIsolatedNodesWithAnnotationsOnOneSide) {
-        if (hasOutAnnotations && !hasInAnnotations) {
-          child.isInExtract = true; // for ones with high out-annotations
-          renderNode.isolatedInExtract.push(child);
-          child.node.include = InclusionType.EXCLUDE;
-          graph.removeNode(n);
-        } else if (hasInAnnotations && !hasOutAnnotations) {
-          child.isOutExtract = true; // for ones with high in-annotations
-          renderNode.isolatedOutExtract.push(child);
-          child.node.include = InclusionType.EXCLUDE;
-          graph.removeNode(n);
-        } else {
-          // if a low degree node has both in- & out- annotations, do nothing
-          // because it is unclear which side it should go to.
-        }
-      }
-    }
-  });
-}
-
-/**
- * Expands nodes in the graph until the desired node is visible.
- *
- * @param scene The scene polymer component.
- * @param renderHierarchy The render hierarchy.
- * @param tensorName The name of a tensor.
- * @return A string that is the name of the node representing the given tensor.
- *     Note that the original tensor name might differ from this returned node
- *     name. Specifically, for instance, the tensor name usually ends with an
- *     output slot index (such as :0), while the node name lacks that suffix.
- */
-export function expandUntilNodeIsShown(
-    scene, renderHierarchy, tensorName: string) {
-  const splitTensorName = tensorName.split('/');
-
-  // Graph names do not take into account the output slot. Strip it.
-  const lastNodeNameMatch =
-      splitTensorName[splitTensorName.length - 1].match(/(.*):\d+/);
-  if (lastNodeNameMatch.length === 2) {
-    splitTensorName[splitTensorName.length - 1] = lastNodeNameMatch[1];
-  }
-
-  let nodeName = splitTensorName[0];
-  let renderNode = renderHierarchy.getRenderNodeByName(nodeName);
-  for (let i = 1; i < splitTensorName.length; i++) {
-    // Op nodes are not expandable.
-    if (renderNode.node.type === tf.graph.NodeType.OP) {
-      break;
-    }
-    renderHierarchy.buildSubhierarchy(nodeName);
-    renderNode.expanded = true;
-    scene.setNodeExpanded(renderNode);
-    nodeName += '/' + splitTensorName[i];
-    renderNode = renderHierarchy.getRenderNodeByName(nodeName);
-  }
-
-  return renderNode.node.name;
-}
-
-} // close module tf.graph.render
diff --git a/tensorflow/tensorboard/components/tf_graph_common/scene.ts b/tensorflow/tensorboard/components/tf_graph_common/scene.ts
deleted file mode 100644
index 14d35efd9ffc176147ce8cadcec72a7be1784084..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/scene.ts
+++ /dev/null
@@ -1,735 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-module tf.graph.scene {
-  const svgNamespace = 'http://www.w3.org/2000/svg';
-
-  /** Enums element class of objects in the scene */
-  export let Class = {
-    Node: {
-      // <g> element that contains nodes.
-      CONTAINER: 'nodes',
-      // <g> element that contains detail about a node.
-      GROUP: 'node',
-      // <g> element that contains visual elements (like rect, ellipse).
-      SHAPE: 'nodeshape',
-      // <*> element(s) under SHAPE that should receive color updates.
-      COLOR_TARGET: 'nodecolortarget',
-      // <text> element showing the node's label.
-      LABEL: 'nodelabel',
-      // <g> element that contains all visuals for the expand/collapse
-      // button for expandable group nodes.
-      BUTTON_CONTAINER: 'buttoncontainer',
-      // <circle> element that surrounds expand/collapse buttons.
-      BUTTON_CIRCLE: 'buttoncircle',
-      // <path> element of the expand button.
-      EXPAND_BUTTON: 'expandbutton',
-      // <path> element of the collapse button.
-      COLLAPSE_BUTTON: 'collapsebutton'
-    },
-    Edge: {
-      CONTAINER: 'edges',
-      GROUP: 'edge',
-      LINE: 'edgeline',
-      REFERENCE_EDGE: 'referenceedge',
-      REF_LINE: 'refline',
-      STRUCTURAL: 'structural'
-    },
-    Annotation: {
-      OUTBOX: 'out-annotations',
-      INBOX: 'in-annotations',
-      GROUP: 'annotation',
-      NODE: 'annotation-node',
-      EDGE: 'annotation-edge',
-      CONTROL_EDGE: 'annotation-control-edge',
-      LABEL: 'annotation-label',
-      ELLIPSIS: 'annotation-ellipsis'
-    },
-    Scene: {
-      GROUP: 'scene',
-      CORE: 'core',
-      INEXTRACT: 'in-extract',
-      OUTEXTRACT: 'out-extract'
-    },
-    Subscene: {GROUP: 'subscene'},
-    OPNODE: 'op',
-    METANODE: 'meta',
-    SERIESNODE: 'series',
-    BRIDGENODE: 'bridge',
-    ELLIPSISNODE: 'ellipsis'
-  };
-
-  /**
-   * A health pill encapsulates an overview of tensor element values. The value
-   * field is a list of 12 numbers that shed light on the status of the tensor.
-   * Visualized in health pills are the 3rd through 8th (inclusive) numbers of
-   * health pill values. Those 6 numbers are counts of tensor elements that fall
-   * under -Inf, negative, 0, positive, +Inf, NaN (in that order).
-   *
-   * Please keep this interface consistent with HealthPillDatum within
-   * backend.ts.
-   */
-  export interface HealthPill {
-    device_name: string;
-    node_name: string;
-    output_slot: number;
-    dtype: string;
-    shape: number[];
-    value: number[];
-    wall_time: number;
-    step: number;
-  }
-
-  interface HealthPillNumericStats {
-    min: number;
-    max: number;
-    mean: number;
-    stddev: number;
-  }
-
-  /**
-   * Encapsulates how to render a single entry in a health pill. Each entry
-   * corresponds to a category of tensor element values.
-   */
-  export interface HealthPillEntry {
-    background_color: string;
-    label: string;
-  }
-  ;
-  export let healthPillEntries: HealthPillEntry[] = [
-    {
-      background_color: '#CC2F2C',
-      label: 'NaN',
-    },
-    {
-      background_color: '#FF8D00',
-      label: '-∞',
-    },
-    {
-      background_color: '#EAEAEA',
-      label: '-',
-    },
-    {
-      background_color: '#A5A5A5',
-      label: '0',
-    },
-    {
-      background_color: '#262626',
-      label: '+',
-    },
-    {
-      background_color: '#003ED4',
-      label: '+∞',
-    },
-  ];
-
-  /**
-   * Helper method for fitting the graph in the svg view.
-   *
-   * @param svg The main svg.
-   * @param zoomG The svg group used for panning and zooming.
-   * @param d3zoom The zoom behavior.
-   * @param callback Called when the fitting is done.
-   */
-  export function fit(svg, zoomG, d3zoom, callback) {
-    let svgRect = svg.getBoundingClientRect();
-    let sceneSize = null;
-    try {
-      sceneSize = zoomG.getBBox();
-      if (sceneSize.width === 0) {
-        // There is no scene anymore. We have been detached from the dom.
-        return;
-      }
-    } catch (e) {
-      // Firefox produced NS_ERROR_FAILURE if we have been
-      // detached from the dom.
-      return;
-    }
-    let scale = 0.9 *
-        Math.min(
-            svgRect.width / sceneSize.width, svgRect.height / sceneSize.height,
-            2);
-    let params = layout.PARAMS.graph;
-    const transform = d3.zoomIdentity
-        .scale(scale)
-        .translate(params.padding.paddingLeft, params.padding.paddingTop);
-
-    d3.select(svg)
-        .transition()
-        .duration(500)
-        .call(d3zoom.transform, transform)
-        .on('end.fitted', () => {
-          // Remove the listener for the zoomend event,
-          // so we don't get called at the end of regular zoom events,
-          // just those that fit the graph to screen.
-          d3zoom.on('end.fitted', null);
-          callback();
-        });
-};
-
-/**
- * Helper method for panning the graph to center on the provided node,
- * if the node is currently off-screen.
- *
- * @param nodeName The node to center the graph on
- * @param svg The root SVG element for the graph
- * @param zoomG The svg group used for panning and zooming.
- * @param d3zoom The zoom behavior.
- * @return True if the graph had to be panned to display the
- *            provided node.
- */
-export function panToNode(nodeName: String, svg, zoomG, d3zoom): boolean {
-  let node = <SVGAElement>d3
-                 .select('[data-name="' + nodeName + '"].' + Class.Node.GROUP)
-                 .node();
-  if (!node) {
-    return false;
-  }
-
-  // Check if the selected node is off-screen in either
-  // X or Y dimension in either direction.
-  let nodeBox = node.getBBox();
-  let nodeCtm = node.getScreenCTM();
-  let pointTL = svg.createSVGPoint();
-  let pointBR = svg.createSVGPoint();
-  pointTL.x = nodeBox.x;
-  pointTL.y = nodeBox.y;
-  pointBR.x = nodeBox.x + nodeBox.width;
-  pointBR.y = nodeBox.y + nodeBox.height;
-  pointTL = pointTL.matrixTransform(nodeCtm);
-  pointBR = pointBR.matrixTransform(nodeCtm);
-  let isOutsideOfBounds = (start, end, bound) => {
-    return end < 0 || start > bound;
-  };
-  let svgRect = svg.getBoundingClientRect();
-  if (isOutsideOfBounds(pointTL.x, pointBR.x, svgRect.width) ||
-      isOutsideOfBounds(pointTL.y, pointBR.y, svgRect.height)) {
-    // Determine the amount to translate the graph in both X and Y dimensions in
-    // order to center the selected node. This takes into account the position
-    // of the node, the size of the svg scene, the amount the scene has been
-    // scaled by through zooming, and any previous transforms already performed
-    // by this logic.
-    let centerX = (pointTL.x + pointBR.x) / 2;
-    let centerY = (pointTL.y + pointBR.y) / 2;
-    let dx = ((svgRect.width / 2) - centerX);
-    let dy = ((svgRect.height / 2) - centerY);
-
-    // We translate by this amount. We divide the X and Y translations by the
-    // scale to undo how translateBy scales the translations (in d3 v4).
-    const svgTransform = d3.zoomTransform(svg);
-    d3.select(svg).transition().duration(500).call(
-        d3zoom.translateBy, dx / svgTransform.k, dy / svgTransform.k);
-
-    return true;
-  }
-  return false;
-};
-
-/**
- * Given a container d3 selection, select a child svg element of a given tag
- * and class if exists or append / insert one otherwise.  If multiple children
- * matches the tag and class name, returns only the first one.
- *
- * @param container
- * @param tagName tag name.
- * @param className (optional) Class name or a list of class names.
- * @param before (optional) reference DOM node for insertion.
- * @return selection of the element
- */
-export function selectOrCreateChild(
-    container, tagName: string, className?: string | string[], before?): d3.Selection<any, any, any, any> {
-  let child = selectChild(container, tagName, className);
-  if (!child.empty()) {
-    return child;
-  }
-  let newElement =
-      document.createElementNS('http://www.w3.org/2000/svg', tagName);
-
-  if (className instanceof Array) {
-    for (let i = 0; i < className.length; i++) {
-      newElement.classList.add(className[i]);
-    }
-  } else {
-    newElement.classList.add(className);
-  }
-
-  if (before) { // if before exists, insert
-    container.node().insertBefore(newElement, before);
-  } else { // otherwise, append
-    container.node().appendChild(newElement);
-  }
-  return d3.select(newElement)
-           // need to bind data to emulate d3_selection.append
-           .datum(container.datum());
-};
-
-/**
- * Given a container d3 selection, select a child element of a given tag and
- * class. If multiple children matches the tag and class name, returns only
- * the first one.
- *
- * @param container
- * @param tagName tag name.
- * @param className (optional) Class name or list of class names.
- * @return selection of the element, or an empty selection
- */
-export function selectChild(
-    container, tagName: string, className?: string | string[]): d3.Selection<any, any, any, any> {
-  let children = container.node().childNodes;
-  for (let i = 0; i < children.length; i++) {
-    let child = children[i];
-    if (child.tagName === tagName) {
-      if (className instanceof Array) {
-        let hasAllClasses = true;
-        for (let j = 0; j < className.length; j++) {
-          hasAllClasses =
-              hasAllClasses && child.classList.contains(className[j]);
-        }
-        if (hasAllClasses) {
-          return d3.select(child);
-        }
-      } else if ((!className || child.classList.contains(className))) {
-        return d3.select(child);
-      }
-    }
-  }
-  return d3.select(null);
-};
-
-/**
- * Select or create a sceneGroup and build/update its nodes and edges.
- *
- * Structure Pattern:
- *
- * <g class='scene'>
- *   <g class='core'>
- *     <g class='edges'>
- *       ... stuff from tf.graph.scene.edges.build ...
- *     </g>
- *     <g class='nodes'>
- *       ... stuff from tf.graph.scene.nodes.build ...
- *     </g>
- *   </g>
- *   <g class='in-extract'>
- *     <g class='nodes'>
- *       ... stuff from tf.graph.scene.nodes.build ...
- *     </g>
- *   </g>
- *   <g class='out-extract'>
- *     <g class='nodes'>
- *       ... stuff from tf.graph.scene.nodes.build ...
- *     </g>
- *   </g>
- * </g>
- *
- * @param container D3 selection of the parent.
- * @param renderNode render node of a metanode or series node.
- * @param sceneElement <tf-graph-scene> polymer element.
- * @param sceneClass class attribute of the scene (default='scene').
- */
-export function buildGroup(container,
-    renderNode: render.RenderGroupNodeInfo,
-    sceneElement,
-    sceneClass: string): d3.Selection<any, any, any, any> {
-  sceneClass = sceneClass || Class.Scene.GROUP;
-  let isNewSceneGroup = selectChild(container, 'g', sceneClass).empty();
-  let sceneGroup = selectOrCreateChild(container, 'g', sceneClass);
-
-  // core
-  let coreGroup = selectOrCreateChild(sceneGroup, 'g', Class.Scene.CORE);
-  let coreNodes = _.reduce(renderNode.coreGraph.nodes(), (nodes, name) => {
-                    let node = renderNode.coreGraph.node(name);
-                    if (!node.excluded) {
-                      nodes.push(node);
-                    }
-                    return nodes;
-                  }, []);
-
-  if (renderNode.node.type === NodeType.SERIES) {
-    // For series, we want the first item on top, so reverse the array so
-    // the first item in the series becomes last item in the top, and thus
-    // is rendered on the top.
-    coreNodes.reverse();
-  }
-
-  // Create the layer of edges for this scene (paths).
-  edge.buildGroup(coreGroup, renderNode.coreGraph, sceneElement);
-
-  // Create the layer of nodes for this scene (ellipses, rects etc).
-  node.buildGroup(coreGroup, coreNodes, sceneElement);
-
-  // In-extract
-  if (renderNode.isolatedInExtract.length > 0) {
-    let inExtractGroup =
-        selectOrCreateChild(sceneGroup, 'g', Class.Scene.INEXTRACT);
-    node.buildGroup(inExtractGroup, renderNode.isolatedInExtract,
-        sceneElement);
-  } else {
-    selectChild(sceneGroup, 'g', Class.Scene.INEXTRACT).remove();
-  }
-
-  // Out-extract
-  if (renderNode.isolatedOutExtract.length > 0) {
-    let outExtractGroup =
-        selectOrCreateChild(sceneGroup, 'g', Class.Scene.OUTEXTRACT);
-    node.buildGroup(outExtractGroup, renderNode.isolatedOutExtract,
-        sceneElement);
-  } else {
-    selectChild(sceneGroup, 'g', Class.Scene.OUTEXTRACT).remove();
-  }
-
-  position(sceneGroup, renderNode);
-
-  // Fade in the scene group if it didn't already exist.
-  if (isNewSceneGroup) {
-    sceneGroup.attr('opacity', 0).transition().attr('opacity', 1);
-  }
-
-  return sceneGroup;
-};
-
-/**
- * Given a scene's svg group, set  g.in-extract, g.coreGraph, g.out-extract svg
- * groups' position relative to the scene.
- *
- * @param sceneGroup
- * @param renderNode render node of a metanode or series node.
- */
-function position(sceneGroup, renderNode: render.RenderGroupNodeInfo) {
-  // Translate scenes down by the label height so that when showing graphs in
-  // expanded metanodes, the graphs are below the labels.  Do not shift them
-  // down for series nodes as series nodes don't have labels inside of their
-  // bounding boxes.
-  let yTranslate = renderNode.node.type === NodeType.SERIES ?
-    0 : layout.PARAMS.subscene.meta.labelHeight;
-
-  // core
-  translate(selectChild(sceneGroup, 'g', Class.Scene.CORE), 0, yTranslate);
-
-  // in-extract
-  let hasInExtract = renderNode.isolatedInExtract.length > 0;
-  let hasOutExtract = renderNode.isolatedOutExtract.length > 0;
-
-  if (hasInExtract) {
-    let offset = layout.PARAMS.subscene.meta.extractXOffset;
-    let inExtractX = renderNode.coreBox.width -
-      renderNode.inExtractBox.width / 2 - renderNode.outExtractBox.width -
-          (hasOutExtract ? offset : 0);
-    translate(
-        selectChild(sceneGroup, 'g', Class.Scene.INEXTRACT), inExtractX,
-        yTranslate);
-  }
-
-  // out-extract
-  if (hasOutExtract) {
-    let outExtractX = renderNode.coreBox.width -
-      renderNode.outExtractBox.width / 2;
-    translate(
-        selectChild(sceneGroup, 'g', Class.Scene.OUTEXTRACT), outExtractX,
-        yTranslate);
-  }
-};
-
-/** Adds a click listener to a group that fires a graph-select event */
-export function addGraphClickListener(graphGroup, sceneElement) {
-  d3.select(graphGroup).on('click', () => {
-    sceneElement.fire('graph-select');
-  });
-};
-
-/** Helper for adding transform: translate(x0, y0) */
-export function translate(selection, x0: number, y0: number) {
-  // If it is already placed on the screen, make it a transition.
-  if (selection.attr('transform') != null) {
-    selection = selection.transition('position');
-  }
-  selection.attr('transform', 'translate(' + x0 + ',' + y0 + ')');
-};
-
-/**
- * Helper for setting position of a svg rect
- * @param rect rect to set position of.
- * @param cx Center x.
- * @param cy Center x.
- * @param width Width to set.
- * @param height Height to set.
- */
-export function positionRect(rect, cx: number, cy: number, width: number,
-    height: number) {
-  rect.transition()
-    .attr('x', cx - width / 2)
-    .attr('y', cy - height / 2)
-    .attr('width', width)
-    .attr('height', height);
-};
-
-/**
- * Helper for setting position of a svg expand/collapse button
- * @param button container group
- * @param renderNode the render node of the group node to position
- *        the button on.
- */
-export function positionButton(button, renderNode: render.RenderNodeInfo) {
-  let cx = layout.computeCXPositionOfNodeShape(renderNode);
-  // Position the button in the top-right corner of the group node,
-  // with space given the draw the button inside of the corner.
-  let width = renderNode.expanded ?
-      renderNode.width : renderNode.coreBox.width;
-  let height = renderNode.expanded ?
-      renderNode.height : renderNode.coreBox.height;
-  let x = cx + width / 2 - 6;
-  let y = renderNode.y - height / 2 + 6;
-  // For unexpanded series nodes, the button has special placement due
-  // to the unique visuals of this group node.
-  if (renderNode.node.type === NodeType.SERIES && !renderNode.expanded) {
-    x += 10;
-    y -= 2;
-  }
-  let translateStr = 'translate(' + x + ',' + y + ')';
-  button.selectAll('path').transition().attr('transform', translateStr);
-  button.select('circle').transition().attr(
-      {cx: x, cy: y, r: layout.PARAMS.nodeSize.meta.expandButtonRadius});
-};
-
-/**
- * Helper for setting position of a svg ellipse
- * @param ellipse ellipse to set position of.
- * @param cx Center x.
- * @param cy Center x.
- * @param width Width to set.
- * @param height Height to set.
- */
-export function positionEllipse(ellipse, cx: number, cy: number,
-    width: number, height: number) {
-  ellipse.transition()
-    .attr('cx', cx)
-    .attr('cy', cy)
-    .attr('rx', width / 2)
-    .attr('ry', height / 2);
-};
-
-/**
- * @param {number} stat A stat for a health pill (such as mean or variance).
- * @param {boolean} shouldRoundOnesDigit Whether to round this number to the
- *     ones digit. Useful for say int, uint, and bool output types.
- * @return {string} A human-friendly string representation of that stat.
- */
-export function humanizeHealthPillStat(stat, shouldRoundOnesDigit) {
-  if (shouldRoundOnesDigit) {
-    return stat.toFixed(0);
-  }
-
-  if (Math.abs(stat) >= 1) {
-    return stat.toFixed(1);
-  }
-  return stat.toExponential(1);
-}
-
-/**
- * Get text content describing a health pill.
- */
-function _getHealthPillTextContent(healthPill: HealthPill,
-                                   totalCount: number,
-                                   elementsBreakdown: number[],
-                                   numericStats: HealthPillNumericStats) {
-  let text = 'Device: ' + healthPill.device_name + '\n';
-  text += 'dtype: ' + healthPill.dtype + '\n';
-
-  let shapeStr = '(scalar)';
-  if (healthPill.shape.length > 0) {
-    shapeStr = '(' + healthPill.shape.join(',') + ')';
-  }
-  text += '\nshape: ' + shapeStr + '\n\n';
-
-  text += '#(elements): ' + totalCount + '\n';
-  const breakdownItems = [];
-  for (let i = 0; i < elementsBreakdown.length; i++) {
-    if (elementsBreakdown[i] > 0) {
-      breakdownItems.push(
-          '#(' + healthPillEntries[i].label + '): ' + elementsBreakdown[i]);
-    }
-  }
-  text += breakdownItems.join(', ') + '\n\n';
-
-  // In some cases (e.g., size-0 tensors; all elements are nan or inf) the
-  // min/max and mean/stddev stats are meaningless.
-  if (numericStats.max >= numericStats.min) {
-    text += 'min: ' + numericStats.min + ', max: ' + numericStats.max + '\n';
-    text += 'mean: ' + numericStats.mean + ', stddev: ' + numericStats.stddev;
-  }
-
-  return text;
-}
-
-/**
- * Renders a health pill for an op atop a node.
- */
-function _addHealthPill(
-    nodeGroupElement: SVGElement, healthPill: HealthPill,
-    nodeInfo: render.RenderNodeInfo) {
-  // Check if text already exists at location.
-  d3.select(nodeGroupElement.parentNode as any).selectAll('.health-pill').remove();
-
-  if (!nodeInfo || !healthPill) {
-    return;
-  }
-
-  let lastHealthPillData = healthPill.value;
-
-  // For now, we only visualize the 6 values that summarize counts of tensor
-  // elements of various categories: -Inf, negative, 0, positive, Inf, and NaN.
-  const lastHealthPillElementsBreakdown = lastHealthPillData.slice(2, 8);
-  let totalCount = lastHealthPillData[1];
-  const numericStats: HealthPillNumericStats = {
-      min: lastHealthPillData[8],
-      max: lastHealthPillData[9],
-      mean: lastHealthPillData[10],
-      stddev: Math.sqrt(lastHealthPillData[11])
-  };
-
-  let healthPillWidth = 60;
-  let healthPillHeight = 10;
-  if (nodeInfo.node.type === tf.graph.NodeType.OP) {
-    // Use a smaller health pill for op nodes (rendered as smaller ellipses).
-    healthPillWidth /= 2;
-    healthPillHeight /= 2;
-  }
-
-  let healthPillGroup = document.createElementNS(svgNamespace, 'g');
-  healthPillGroup.classList.add('health-pill');
-
-  // Define the gradient for the health pill.
-  let healthPillDefs = document.createElementNS(svgNamespace, 'defs');
-  healthPillGroup.appendChild(healthPillDefs);
-  let healthPillGradient =
-      document.createElementNS(svgNamespace, 'linearGradient');
-  const healthPillGradientId = 'health-pill-gradient';
-  healthPillGradient.setAttribute('id', healthPillGradientId);
-
-  let cumulativeCount = 0;
-  let previousOffset = '0%';
-  for (let i = 0; i < lastHealthPillElementsBreakdown.length; i++) {
-    if (!lastHealthPillElementsBreakdown[i]) {
-      // Exclude empty categories.
-      continue;
-    }
-    cumulativeCount += lastHealthPillElementsBreakdown[i];
-
-    // Create a color interval using 2 stop elements.
-    let stopElement0 = document.createElementNS(svgNamespace, 'stop');
-    stopElement0.setAttribute('offset', previousOffset);
-    stopElement0.setAttribute(
-        'stop-color', healthPillEntries[i].background_color);
-    healthPillGradient.appendChild(stopElement0);
-
-    let stopElement1 = document.createElementNS(svgNamespace, 'stop');
-    let percent = (cumulativeCount * 100 / totalCount) + '%';
-    stopElement1.setAttribute('offset', percent);
-    stopElement1.setAttribute(
-        'stop-color', healthPillEntries[i].background_color);
-    healthPillGradient.appendChild(stopElement1);
-    previousOffset = percent;
-  }
-  healthPillDefs.appendChild(healthPillGradient);
-
-  // Create the rectangle for the health pill.
-  let rect = document.createElementNS(svgNamespace, 'rect');
-  rect.setAttribute('fill', 'url(#' + healthPillGradientId + ')');
-  rect.setAttribute('width', String(healthPillWidth));
-  rect.setAttribute('height', String(healthPillHeight));
-  healthPillGroup.appendChild(rect);
-
-  // Show a title with specific counts on hover.
-  let titleSvg = document.createElementNS(svgNamespace, 'title');
-  titleSvg.textContent = _getHealthPillTextContent(
-      healthPill, totalCount, lastHealthPillElementsBreakdown, numericStats);
-  healthPillGroup.appendChild(titleSvg);
-  // TODO(cais): Make the tooltip content prettier.
-
-  // Center this health pill just right above the node for the op.
-  let healthPillX = nodeInfo.x - healthPillWidth / 2;
-  let healthPillY = nodeInfo.y - healthPillHeight - nodeInfo.height / 2 - 2;
-  if (nodeInfo.labelOffset < 0) {
-    // The label is positioned above the node. Do not occlude the label.
-    healthPillY += nodeInfo.labelOffset;
-  }
-
-  if (lastHealthPillElementsBreakdown[2] ||
-      lastHealthPillElementsBreakdown[3] ||
-      lastHealthPillElementsBreakdown[4]) {
-    // At least 1 "non-Inf and non-NaN" value exists (a -, 0, or + value). Show
-    // stats on tensor values.
-
-    // Determine if we should display the output range as integers.
-    let shouldRoundOnesDigit = false;
-    let node = nodeInfo.node as OpNode;
-    let attributes = node.attr;
-    if (attributes && attributes.length) {
-      // Find the attribute for output type if there is one.
-      for (let i = 0; i < attributes.length; i++) {
-        if (attributes[i].key === 'T') {
-          // Note whether the output type is an integer.
-          let outputType = attributes[i].value['type'];
-          shouldRoundOnesDigit =
-              outputType && /^DT_(BOOL|INT|UINT)/.test(outputType);
-          break;
-        }
-      }
-    }
-
-    let statsSvg = document.createElementNS(svgNamespace, 'text');
-    const minString = humanizeHealthPillStat(numericStats.min, shouldRoundOnesDigit);
-    const maxString = humanizeHealthPillStat(numericStats.max, shouldRoundOnesDigit);
-    if (totalCount > 1) {
-      statsSvg.textContent = minString + ' ~ ' + maxString;
-    } else {
-      statsSvg.textContent = minString;
-    }
-    statsSvg.classList.add('health-pill-stats');
-    statsSvg.setAttribute('x', String(healthPillWidth / 2));
-    statsSvg.setAttribute('y', '-2');
-    healthPillGroup.appendChild(statsSvg);
-  }
-
-  healthPillGroup.setAttribute(
-      'transform', 'translate(' + healthPillX + ', ' + healthPillY + ')');
-
-  Polymer.dom(nodeGroupElement.parentNode).appendChild(healthPillGroup);
-}
-
-/**
- * Adds health pills (which visualize tensor summaries) to a graph group.
- * @param svgRoot The root SVG element of the graph to add heath pills to.
- * @param nodeNamesToHealthPills An object mapping node name to health pill.
- * @param colors A list of colors to use.
- */
-export function addHealthPills(
-    svgRoot: SVGElement, nodeNamesToHealthPills: {[key: string]: HealthPill[]},
-    healthPillStepIndex: number) {
-  if (!nodeNamesToHealthPills) {
-    // No health pill information available.
-    return;
-  }
-
-  let svgRootSelection = d3.select(svgRoot);
-  svgRootSelection.selectAll('g.nodeshape')
-      .each(function(nodeInfo: render.RenderNodeInfo) {
-        // Only show health pill data for this node if it is available.
-        let healthPills = nodeNamesToHealthPills[nodeInfo.node.name];
-        let healthPill = healthPills ? healthPills[healthPillStepIndex] : null;
-        _addHealthPill((this as SVGElement), healthPill, nodeInfo);
-      });
-};
-
-} // close module
diff --git a/tensorflow/tensorboard/components/tf_graph_common/template.ts b/tensorflow/tensorboard/components/tf_graph_common/template.ts
deleted file mode 100644
index 7800d46029b7672c9c32debe36383be25e374c96..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/template.ts
+++ /dev/null
@@ -1,305 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-module tf.graph.template {
-
-/**
- * Detect repeating patterns of subgraphs.
- * Assign templateId to each subgraph if it belongs to a template.
- * Returns clusters of similar subgraphs .
- *
- * @param graph
- * @param verifyTemplate whether to run the template verification algorithm
- * @return a dict (template id => Array of node names)
- */
-export function detect(h, verifyTemplate): {[templateId: string]: string[]} {
-  // In any particular subgraph, there are either
-  // - leaf nodes (which do not have subgraph)
-  // - metanode nodes - some of them have only one member (singular metanode)
-  //                    and some have multiple members (non-singular metanode)
-
-  // First, generate a nearest neighbor hash of metanode nodes.
-  let nnGroups = clusterSimilarSubgraphs(h);
-
-  // For each metanode, compare its subgraph (starting from shallower groups)
-  // and assign template id.
-  let templates = groupTemplateAndAssignId(nnGroups, verifyTemplate);
-
-  // Sort the templates by minimum level in the graph at which they appear,
-  // as this leads to optimal setting of the colors of each template for
-  // maximum differentiation.
-  return <{[templateId: string]: string[]}>_(templates)
-      .pairs()
-      .sortBy(function(pair: {level: number, nodes: string[]}[]) {
-        return pair[1].level;
-      })
-      .map(function(pair: {level: number, nodes: string[]}[]) {
-        return [pair[0], pair[1].nodes];
-      })
-      .object()
-      .value();
-};
-
-/**
- * @return Unique string for a metanode based on depth, |V|, |E| and
- * op type histogram.
- */
-function getSignature(metanode) {
-  // depth=<number> |V|=<number> |E|=<number>
-  let props = _.map(
-                   {
-                     'depth': metanode.depth,
-                     '|V|': metanode.metagraph.nodes().length,
-                     '|E|': metanode.metagraph.edges().length
-                   },
-                   function(v, k) { return k + '=' + v; })
-                  .join(' ');
-
-  // optype1=count1,optype2=count2
-  let ops = _.map(metanode.opHistogram, function(count, op) {
-               return op + '=' + count;
-             }).join(',');
-
-  return props + ' [ops] ' + ops;
-}
-
-/**
- * Generate a nearest neighbor hash of metanodes
- * based on depth, |V|, |E|, and opHistogram of their subgraph
- * (excluding leaf nodes and singular metanodes).
- * @param graph The graph
- * @return Array of pairs of [signature,
- *   Object with min level of the template and an Array of tf.graph.Group]
- *   sort by ascending order of minimum depth at which metanode appears.
- */
-function clusterSimilarSubgraphs(h: hierarchy.Hierarchy) {
-  /** a dict from metanode.signature() => Array of tf.graph.Groups */
-  let hashDict = _(h.getNodeMap()).reduce(
-      (hash, node: OpNode|Metanode, name) => {
-    if (node.type !== NodeType.META) {
-        return hash;
-    }
-    let levelOfMetaNode = name.split('/').length - 1;
-    let signature = getSignature(node);
-    let templateInfo = hash[signature] ||
-      {nodes: [], level: levelOfMetaNode};
-    hash[signature] = templateInfo;
-    templateInfo.nodes.push(node);
-    if (templateInfo.level > levelOfMetaNode) {
-      templateInfo.level = levelOfMetaNode;
-    }
-    return hash;
-  }, {});
-
-  return _(hashDict)
-      .pairs()
-      // filter nn metanode with only one member
-      .filter(function(pair: {level: number, nodes: string[]}) {
-        return pair[1].nodes.length > 1;
-      })
-      .sortBy(function(pair: {level: number, nodes: string[]}) {
-        // sort by depth
-        // (all members in the same nnGroup has equal depth)
-        return pair[1].nodes[0].depth;
-      })
-      .value();
-}
-
-function groupTemplateAndAssignId(nnGroups, verifyTemplate) {
-  // For each metanode, compare its subgraph (starting from shallower groups)
-  // and assign template id.
-  let result: {[templateId: string]: {level: number, nodes: string[]}} = {};
-  return _.reduce(nnGroups, function(templates, nnGroupPair) {
-    let signature = nnGroupPair[0],
-      nnGroup = nnGroupPair[1].nodes,
-      clusters = [];
-
-    nnGroup.forEach(function(metanode) {
-      // check with each existing cluster
-      for (let i = 0; i < clusters.length; i++) {
-        let similar = !verifyTemplate ||
-                      isSimilarSubgraph(
-                        clusters[i].metanode.metagraph,
-                        metanode.metagraph
-                      );
-        // if similar, just add this metanode to the cluster
-        if (similar) {
-          // get template from the first one
-          metanode.templateId = clusters[i].metanode.templateId;
-          clusters[i].members.push(metanode.name);
-          return;
-        }
-      }
-      // otherwise create a new cluster with id 'signature [count] '
-      metanode.templateId = signature + '[' + clusters.length + ']';
-      clusters.push({
-        metanode: metanode,
-        members: [metanode.name]
-      });
-    });
-
-    clusters.forEach(function(c) {
-      templates[c.metanode.templateId] = {
-        level: nnGroupPair[1].level,
-        nodes: c.members
-      };
-    });
-    return templates;
-  }, result);
-}
-
-function sortNodes(names: string[],
-    graph: graphlib.Graph<Metanode|OpNode, Metaedge>, prefix: string) {
-  return _.sortByAll(names,
-    function(name) {
-      let node = graph.node(name);
-      return (<OpNode>node).op;
-    },
-    function(name) {
-      let node = graph.node(name);
-      return (<Metanode>node).templateId;
-    },
-    function(name) {
-      return graph.neighbors(name).length;
-    },
-    function(name) {
-      return graph.predecessors(name).length;
-    },
-    function(name) {
-      return graph.successors(name).length;
-    },
-    function(name) {
-      return name.substr(prefix.length);
-    });
-}
-
-function isSimilarSubgraph(g1: graphlib.Graph<any, any>,
-    g2: graphlib.Graph<any, any>) {
-  if (!tf.graph.hasSimilarDegreeSequence(g1, g2)) {
-      return false;
-  }
-
-  // if we want to skip, just return true here.
-  // return true;
-
-  // Verify sequence by running DFS
-  let g1prefix = g1.graph().name;
-  let g2prefix = g2.graph().name;
-
-  let visited1 = {};
-  let visited2 = {};
-  let stack = [];
-
-  /**
-   * push sources or successors into the stack
-   * if the visiting pattern has been similar.
-   */
-  function stackPushIfNotDifferent(n1, n2) {
-    let sub1 = n1.substr(g1prefix.length),
-      sub2 = n2.substr(g2prefix.length);
-
-    /* tslint:disable */
-    if (visited1[sub1] ^ visited2[sub1]) {
-      console.warn(
-          'different visit pattern', '[' + g1prefix + ']', sub1,
-          '[' + g2prefix + ']', sub2);
-      return true;
-    }
-    /* tslint:enable */
-    if (!visited1[sub1]) { // implied && !visited2[sub2]
-      visited1[sub1] = visited2[sub2] = true;
-      stack.push({n1: n1, n2: n2});
-    }
-
-    return false;
-  }
-
-  // check if have same # of sources then sort and push
-  let sources1 = g1.sources();
-  let sources2 = g2.sources();
-  if (sources1.length !== sources2.length) {
-    /* tslint:disable */
-    console.log('different source length');
-    /* tslint:enable */
-    return false;
-  }
-  sources1 = sortNodes(sources1, g1, g1prefix);
-  sources2 = sortNodes(sources2, g2, g2prefix);
-
-  for (let i = 0; i < sources1.length; i++) {
-    let different = stackPushIfNotDifferent(sources1[i], sources2[i]);
-    if (different) {
-        return false;
-    }
-  }
-
-  while (stack.length > 0) {
-    let cur = stack.pop();
-
-    // check node
-    let similar = isSimilarNode(g1.node(cur.n1), g2.node(cur.n2));
-    if (!similar) {
-        return false;
-    }
-
-    // check if have same # of successors then sort and push
-    let succ1 = g1.successors(cur.n1), succ2 = g2.successors(cur.n2);
-    if (succ1.length !== succ2.length) {
-      /* tslint:disable */
-      console.log('# of successors mismatch', succ1, succ2);
-      /* tslint:enable */
-      return false;
-    }
-    succ1 = sortNodes(succ1, g1, g1prefix);
-    succ2 = sortNodes(succ2, g2, g2prefix);
-
-    for (let j = 0; j < succ1.length; j++) {
-      let different = stackPushIfNotDifferent(succ1[j], succ2[j]);
-      if (different) {
-          return false;
-      }
-    }
-  }
-
-  return true;
-}
-
-/**
- * Returns if two nodes have identical structure.
- */
-function isSimilarNode(n1: OpNode|Metanode|SeriesNode,
-    n2: OpNode|Metanode|SeriesNode): boolean {
-  if (n1.type === NodeType.META) {
-    // compare metanode
-    let metanode1 = <Metanode> n1;
-    let metanode2 = <Metanode> n2;
-    return metanode1.templateId && metanode2.templateId &&
-        metanode1.templateId === metanode2.templateId;
-  } else if (n1.type === NodeType.OP && n2.type === NodeType.OP) {
-    // compare leaf node
-    return (<OpNode>n1).op === (<OpNode>n2).op;
-  } else if (n1.type === NodeType.SERIES && n2.type === NodeType.SERIES) {
-    // compare series node sizes and operations
-    // (only need to check one op as all op nodes are identical in series)
-    let sn1 = <SeriesNode> n1;
-    let sn2 = <SeriesNode> n2;
-    let seriesnode1Count = sn1.metagraph.nodeCount();
-    return (seriesnode1Count === sn2.metagraph.nodeCount() &&
-      (seriesnode1Count === 0 ||
-      ((<OpNode>sn1.metagraph.node(sn1.metagraph.nodes()[0])).op ===
-          (<OpNode>sn2.metagraph.node(sn2.metagraph.nodes()[0])).op)));
-  }
-  return false;
-}
-}
diff --git a/tensorflow/tensorboard/components/tf_graph_common/test/graph-test.ts b/tensorflow/tensorboard/components/tf_graph_common/test/graph-test.ts
deleted file mode 100644
index af3030197e0824aaa808a8ad5b77fadf0cc856f9..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/test/graph-test.ts
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-suite('graph', () => {
-  let assert = chai.assert;
-
-  test('graphlib exists', () => { assert.isTrue(graphlib != null); });
-
-  test('simple graph contruction', done => {
-    let pbtxt = tf.graph.test.util.stringToArrayBuffer(`
-      node {
-        name: "Q"
-        op: "Input"
-      }
-      node {
-        name: "W"
-        op: "Input"
-      }
-      node {
-        name: "X"
-        op: "MatMul"
-        input: "Q:2"
-        input: "W"
-      }`);
-    let statsPbtxt = tf.graph.test.util.stringToArrayBuffer(`step_stats {
-      dev_stats {
-        device: "cpu"
-        node_stats {
-          node_name: "Q"
-          all_start_micros: 10
-          all_end_rel_micros: 4
-        }
-        node_stats {
-          node_name: "Q"
-          all_start_micros: 12
-          all_end_rel_micros: 4
-        }
-      }
-    }`);
-
-    let buildParams: tf.graph.BuildParams = {
-      enableEmbedding: true,
-      inEmbeddingTypes: ['Const'],
-      outEmbeddingTypes: ['^[a-zA-Z]+Summary$'],
-      refEdges: {}
-    };
-    let dummyTracker =
-        tf.graph.util.getTracker({set: () => { return; }, progress: 0});
-    tf.graph.parser.parseGraphPbTxt(pbtxt).then(nodes => {
-      tf.graph.build(nodes, buildParams, dummyTracker)
-          .then((slimGraph: tf.graph.SlimGraph) => {
-            assert.isTrue(slimGraph.nodes['X'] != null);
-            assert.isTrue(slimGraph.nodes['W'] != null);
-            assert.isTrue(slimGraph.nodes['Q'] != null);
-
-            let firstInputOfX = slimGraph.nodes['X'].inputs[0];
-            assert.equal(firstInputOfX.name, 'Q');
-            assert.equal(firstInputOfX.outputTensorIndex, 2);
-
-            let secondInputOfX = slimGraph.nodes['X'].inputs[1];
-            assert.equal(secondInputOfX.name, 'W');
-            assert.equal(secondInputOfX.outputTensorIndex, 0);
-
-            tf.graph.parser.parseStatsPbTxt(statsPbtxt).then(stepStats => {
-              tf.graph.joinStatsInfoWithGraph(slimGraph, stepStats);
-              assert.equal(slimGraph.nodes['Q'].stats.getTotalMicros(), 6);
-              done();
-            });
-          });
-    });
-  });
-
-  test('health pill numbers round correctly', () => {
-    // Integers are rounded to the ones place.
-    assert.equal(tf.graph.scene.humanizeHealthPillStat(42.0, true), '42');
-
-    // Numbers with magnitude >= 1 are rounded to the tenths place.
-    assert.equal(tf.graph.scene.humanizeHealthPillStat(1, false), '1.0');
-    assert.equal(tf.graph.scene.humanizeHealthPillStat(42.42, false), '42.4');
-    assert.equal(tf.graph.scene.humanizeHealthPillStat(-42.42, false), '-42.4');
-
-    // Numbers with magnitude < 1 are written in scientific notation rounded to
-    // the tenths place.
-    assert.equal(tf.graph.scene.humanizeHealthPillStat(0, false), '0.0e+0');
-    assert.equal(tf.graph.scene.humanizeHealthPillStat(0.42, false), '4.2e-1');
-    assert.equal(
-        tf.graph.scene.humanizeHealthPillStat(-0.042, false), '-4.2e-2');
-  });
-
-  // TODO(bp): write tests.
-});
diff --git a/tensorflow/tensorboard/components/tf_graph_common/test/index.html b/tensorflow/tensorboard/components/tf_graph_common/test/index.html
deleted file mode 100644
index 7564167129d67d4f0e2d8f14de11f780ba262d67..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/test/index.html
+++ /dev/null
@@ -1,34 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-<head>
-  <meta charset="utf-8">
-  <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-  <script src="../../web-component-tester/browser.js"></script>
-  <link rel="import" href="../tf-graph-common.html">
-</head>
-<body>
-  <script src="parser-test.js"></script>
-  <script src="graph-test.js"></script>
-  <script src="hierarchy-test.js"></script>
-  <script src="layout-test.js"></script>
-  <script src="util-test.js"></script>
-  <script src="util.js"></script>
-</body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_graph_common/test/parser-test.ts b/tensorflow/tensorboard/components/tf_graph_common/test/parser-test.ts
deleted file mode 100644
index 7c73178c1ce34e327afe6847cc96ad3f5f702185..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/test/parser-test.ts
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-suite('parser', () => {
-  let assert = chai.assert;
-
-  test('simple pbtxt', done => {
-    let pbtxt = tf.graph.test.util.stringToArrayBuffer(`node {
-       name: "Q"
-       op: "Input"
-     }
-     node {
-       name: "W"
-       op: "Input"
-     }
-     node {
-       name: "X"
-       op: "MatMul"
-       input: "Q"
-       input: "W"
-     }`);
-    tf.graph.parser.parseGraphPbTxt(pbtxt).then(nodes => {
-      assert.isTrue(nodes != null && nodes.length === 3);
-
-      assert.equal('Q', nodes[0].name);
-      assert.equal('Input', nodes[0].op);
-
-      assert.equal('W', nodes[1].name);
-      assert.equal('Input', nodes[1].op);
-
-      assert.equal('X', nodes[2].name);
-      assert.equal('MatMul', nodes[2].op);
-      assert.equal('Q', nodes[2].input[0]);
-      assert.equal('W', nodes[2].input[1]);
-
-      done();
-    });
-  });
-
-  test('stats pbtxt parsing', done => {
-    let statsPbtxt = tf.graph.test.util.stringToArrayBuffer(`step_stats {
-      dev_stats {
-        device: "cpu"
-        node_stats {
-          node_name: "Q"
-          all_start_micros: 10
-          all_end_rel_micros: 4
-        }
-        node_stats {
-          node_name: "Q"
-          all_start_micros: 12
-          all_end_rel_micros: 4
-        }
-      }
-    }`);
-    tf.graph.parser.parseStatsPbTxt(statsPbtxt).then(stepStats => {
-      assert.equal(stepStats.dev_stats.length, 1);
-      assert.equal(stepStats.dev_stats[0].device, 'cpu');
-      assert.equal(stepStats.dev_stats[0].node_stats.length, 2);
-      assert.equal(stepStats.dev_stats[0].node_stats[0].all_start_micros, 10);
-      assert.equal(stepStats.dev_stats[0].node_stats[1].node_name, 'Q');
-      assert.equal(stepStats.dev_stats[0].node_stats[1].all_end_rel_micros, 4);
-      done();
-    });
-  });
-
-  test('d3 exists', () => { assert.isTrue(d3 != null); });
-
-  // TODO(nsthorat): write tests.
-
-});
diff --git a/tensorflow/tensorboard/components/tf_graph_common/test/util-test.ts b/tensorflow/tensorboard/components/tf_graph_common/test/util-test.ts
deleted file mode 100644
index 4535d24888f0777c5bdfa40bd537ac885604a8d7..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/test/util-test.ts
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-suite('util', () => {
-  let assert = chai.assert;
-
-  test('remove common prefix', () => {
-
-    // Empty array.
-    let result = tf.graph.util.removeCommonPrefix([]);
-    assert.deepEqual(result, []);
-
-    // No common prefix.
-    result = tf.graph.util.removeCommonPrefix(['a', 'b', 'c']);
-    assert.deepEqual(result, ['a', 'b', 'c']);
-
-    // One of the elements is empty string.
-    result = tf.graph.util.removeCommonPrefix(['a/b', '', 'a/c']);
-    assert.deepEqual(result, ['a/b', '', 'a/c']);
-
-    // Only one string.
-    result = tf.graph.util.removeCommonPrefix(['a/b/c']);
-    assert.deepEqual(result, ['a/b/c']);
-
-    // `q/w/` is the common prefix. Expect `q/w/` to be removed.
-    result = tf.graph.util.removeCommonPrefix(['q/w/a', 'q/w/b', 'q/w/c/f']);
-    assert.deepEqual(result, ['a', 'b', 'c/f']);
-
-    // `q/w/` is the common prefix and also an element. Expect nothing to be
-    // removed since the common prefix is also an element in the array.
-    result = tf.graph.util.removeCommonPrefix(['q/w/', 'q/w/b', 'q/w/c/f']);
-    assert.deepEqual(result, ['q/w/', 'q/w/b', 'q/w/c/f']);
-  });
-
-  test('query params', () => {
-    // Starts with question mark.
-    let queryParams = tf.graph.util.getQueryParams('?foo=1&bar=2');
-    assert.deepEqual(queryParams, {'foo': '1', 'bar': '2'});
-
-    // No question mark.
-    queryParams = tf.graph.util.getQueryParams('foo=1&bar=2');
-    assert.deepEqual(queryParams, {'foo': '1', 'bar': '2'});
-  });
-});
diff --git a/tensorflow/tensorboard/components/tf_graph_common/test/util.ts b/tensorflow/tensorboard/components/tf_graph_common/test/util.ts
deleted file mode 100644
index bc73b735ed2bd6335c8f72d8903a118897dd1738..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/test/util.ts
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-
-/* tslint:disable:no-namespace */
-module tf.graph.test.util {
-  /**
-   * Converts a utf-8 string to an ArrayBuffer.
-   */
-  export function stringToArrayBuffer(str): ArrayBuffer {
-    let buf = new ArrayBuffer(str.length);
-    let bufView = new Uint8Array(buf);
-    for (let i = 0, strLen = str.length; i < strLen; i++) {
-      bufView[i] = str.charCodeAt(i);
-    }
-    return buf;
-  }
-
-}  // module
diff --git a/tensorflow/tensorboard/components/tf_graph_common/tf-graph-common.html b/tensorflow/tensorboard/components/tf_graph_common/tf-graph-common.html
deleted file mode 100644
index a460072a38f3c0fcd868b70f8c2325320df95028..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/tf-graph-common.html
+++ /dev/null
@@ -1,38 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../tf-imports/d3.html">
-<link rel="import" href="../tf-imports/dagre.html">
-<link rel="import" href="../tf-imports/graphlib.html">
-<link rel="import" href="../tf-imports/lodash.html">
-
-<script src="colors.js"></script>
-<script src="common.js"></script>
-<script src="externs.js"></script>
-<script src="graph.js"></script>
-<script src="hierarchy.js"></script>
-<script src="layout.js"></script>
-<script src="parser.js"></script>
-<script src="proto.js"></script>
-<script src="render.js"></script>
-<script src="annotation.js"></script>
-<script src="contextmenu.js"></script>
-<script src="edge.js"></script>
-<script src="node.js"></script>
-<script src="scene.js"></script>
-<script src="template.js"></script>
-<script src="util.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_graph_common/util.ts b/tensorflow/tensorboard/components/tf_graph_common/util.ts
deleted file mode 100644
index 0b2df6545cc58b097a790fb94963f7eed0b56ba1..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_common/util.ts
+++ /dev/null
@@ -1,316 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/**
- * @fileoverview Utility functions for the tensorflow graph visualizer.
- */
-
-module tf.graph.util {
-  /**
-   * Recommended delay (ms) when running an expensive task asynchronously
-   * that gives enough time for the progress bar to update its UI.
-   */
-  const ASYNC_TASK_DELAY = 20;
-
-  export function time<T>(msg: string, task: () => T) {
-    let start = Date.now();
-    let result = task();
-    /* tslint:disable */
-    console.log(msg, ':', Date.now() - start, 'ms');
-    /* tslint:enable */
-    return result;
-  }
-
-  /**
-   * Creates a tracker that sets the progress property of the
-   * provided polymer component. The provided component must have
-   * a property called 'progress' that is not read-only. The progress
-   * property is an object with a numerical 'value' property and a
-   * string 'msg' property.
-   */
-  export function getTracker(polymerComponent: any) {
-    return {
-      setMessage: function(msg) {
-        polymerComponent.set(
-            'progress', {value: polymerComponent.progress.value, msg: msg});
-      },
-      updateProgress: function(value) {
-        polymerComponent.set('progress', {
-          value: polymerComponent.progress.value + value,
-          msg: polymerComponent.progress.msg
-        });
-      },
-      reportError: function(msg: string, err) {
-        // Log the stack trace in the console.
-        console.error(err.stack);
-        // And send a user-friendly message to the UI.
-        polymerComponent.set(
-            'progress',
-            {value: polymerComponent.progress.value, msg: msg, error: true});
-      },
-    };
-  }
-
-  /**
-   * Creates a tracker for a subtask given the parent tracker, the total
-   * progress
-   * of the subtask and the subtask message. The parent task should pass a
-   * subtracker to its subtasks. The subtask reports its own progress which
-   * becomes relative to the main task.
-   */
-  export function getSubtaskTracker(
-      parentTracker: ProgressTracker, impactOnTotalProgress: number,
-      subtaskMsg: string): ProgressTracker {
-    return {
-      setMessage: function(progressMsg) {
-        // The parent should show a concatenation of its message along with
-        // its subtask tracker message.
-        parentTracker.setMessage(subtaskMsg + ': ' + progressMsg);
-      },
-      updateProgress: function(incrementValue) {
-        // Update the parent progress relative to the child progress.
-        // For example, if the sub-task progresses by 30%, and the impact on the
-        // total progress is 50%, then the task progresses by 30% * 50% = 15%.
-        parentTracker.updateProgress(
-            incrementValue * impactOnTotalProgress / 100);
-      },
-      reportError: function(msg: string, err: Error) {
-        // The parent should show a concatenation of its message along with
-        // its subtask error message.
-        parentTracker.reportError(subtaskMsg + ': ' + msg, err);
-      }
-    };
-  }
-
-  /**
-   * Runs an expensive task and return the result.
-   */
-  export function runTask<T>(
-      msg: string, incProgressValue: number, task: () => T,
-      tracker: ProgressTracker): T {
-    // Update the progress message to say the current running task.
-    tracker.setMessage(msg);
-    // Run the expensive task with a delay that gives enough time for the
-    // UI to update.
-    try {
-      let result = tf.graph.util.time(msg, task);
-      // Update the progress value.
-      tracker.updateProgress(incProgressValue);
-      // Return the result to be used by other tasks.
-      return result;
-    } catch (e) {
-      // Errors that happen inside asynchronous tasks are
-      // reported to the tracker using a user-friendly message.
-      tracker.reportError('Failed ' + msg, e);
-    }
-  }
-
-  /**
-   * Runs an expensive task asynchronously and returns a promise of the result.
-   */
-  export function runAsyncTask<T>(
-      msg: string, incProgressValue: number, task: () => T,
-      tracker: ProgressTracker): Promise<T> {
-    return new Promise((resolve, reject) => {
-      // Update the progress message to say the current running task.
-      tracker.setMessage(msg);
-      // Run the expensive task with a delay that gives enough time for the
-      // UI to update.
-      setTimeout(function() {
-        try {
-          let result = tf.graph.util.time(msg, task);
-          // Update the progress value.
-          tracker.updateProgress(incProgressValue);
-          // Return the result to be used by other tasks.
-          resolve(result);
-        } catch (e) {
-          // Errors that happen inside asynchronous tasks are
-          // reported to the tracker using a user-friendly message.
-          tracker.reportError('Failed ' + msg, e);
-        }
-      }, ASYNC_TASK_DELAY);
-    });
-  }
-
-  /**
-   * Asynchronously runs an expensive task that returns a promise. Updates the
-   * tracker's progress after the promise resolves. Returns a new promise that
-   * resolves after the progress is updated.
-   */
-  export function runAsyncPromiseTask<T>(
-      msg: string, incProgressValue: number, task: () => Promise<T>,
-      tracker: ProgressTracker): Promise<T> {
-    return new Promise((resolve, reject) => {
-      let handleError = function(e) {
-        // Errors that happen inside asynchronous tasks are
-        // reported to the tracker using a user-friendly message.
-        tracker.reportError('Failed ' + msg, e);
-        reject(e);
-      };
-
-      // Update the progress message to say the current running task.
-      tracker.setMessage(msg);
-      // Run the expensive task with a delay that gives enough time for the
-      // UI to update.
-      setTimeout(function() {
-        try {
-          let start = Date.now();
-          task()
-              .then(function(value) {
-                /* tslint:disable */
-                console.log(msg, ':', Date.now() - start, 'ms');
-                // Update the progress value.
-                tracker.updateProgress(incProgressValue);
-                // Return the result to be used by other tasks.
-                resolve(value);
-              })
-              .catch(handleError);
-        } catch (e) {
-          handleError(e);
-        }
-      }, ASYNC_TASK_DELAY);
-    });
-  }
-
-  /**
-   * Returns a query selector with escaped special characters that are not
-   * allowed in a query selector.
-   */
-  export function escapeQuerySelector(querySelector: string): string {
-    return querySelector.replace(/([:.\[\],/\\\(\)])/g, '\\$1');
-  }
-
-  // For unit conversion.
-  export const MEMORY_UNITS = [
-    // Atomic unit.
-    {symbol: 'B'},
-    // numUnits specifies how many previous units this unit contains.
-    {symbol: 'KB', numUnits: 1024}, {symbol: 'MB', numUnits: 1024},
-    {symbol: 'GB', numUnits: 1024}, {symbol: 'TB', numUnits: 1024},
-    {symbol: 'PB', numUnits: 1024}
-  ];
-  export const TIME_UNITS = [
-    // Atomic unit. Finest granularity in TensorFlow stat collection.
-    {symbol: 'µs'},
-    // numUnits specifies how many previous units this unit contains.
-    {symbol: 'ms', numUnits: 1000}, {symbol: 's', numUnits: 1000},
-    {symbol: 'min', numUnits: 60}, {symbol: 'hr', numUnits: 60},
-    {symbol: 'days', numUnits: 24}
-  ];
-
-  /**
-   * Returns the human readable version of the unit.
-   * (e.g. 1.35 GB, 23 MB, 34 ms, 6.53 min etc).
-   */
-  export function convertUnitsToHumanReadable(value, units, unitIndex) {
-    unitIndex = unitIndex == null ? 0 : unitIndex;
-    if (unitIndex + 1 < units.length &&
-        value >= units[unitIndex + 1].numUnits) {
-      return tf.graph.util.convertUnitsToHumanReadable(
-          value / units[unitIndex + 1].numUnits, units, unitIndex + 1);
-    }
-    // toPrecision() has the tendency to return a number in scientific
-    // notation and (number - 0) brings it back to normal notation.
-    return (value.toPrecision(3) - 0) + ' ' + units[unitIndex].symbol;
-  }
-
-  export function hasDisplayableNodeStats(stats: NodeStats) {
-    if (stats &&
-        (stats.totalBytes > 0 || stats.getTotalMicros() > 0 ||
-         stats.outputSize)) {
-      return true;
-    }
-    return false;
-  }
-
-  /**
-   * Given a list of strings, it returns a new list of strings with the longest
-   * common prefix removed. If the common prefix is one of the strings in the
-   * list, it returns the original strings.
-   */
-  export function removeCommonPrefix(strings: string[]) {
-    if (strings.length < 2) {
-      return strings;
-    }
-
-    let index = 0;
-    let largestIndex = 0;
-    // Find the shortest name across all strings.
-    let minLength = _.min(_.map(strings, str => str.length));
-    while (true) {
-      index++;
-      let prefixes = _.map(strings, str => str.substring(0, index));
-      let allTheSame = prefixes.every((prefix, i) => {
-        return (i === 0 ? true : prefix === prefixes[i - 1]);
-      });
-      if (allTheSame) {
-        if (index >= minLength) {
-          // There is a string whose whole name is a prefix to other string.
-          // In this case, we return the original list of string.
-          return strings;
-        }
-        largestIndex = index;
-      } else {
-        break;
-      }
-    }
-    return _.map(strings, str => str.substring(largestIndex));
-  }
-
-  /**
-   * Given a queryString, aka ?foo=1&bar=2, return the object representation.
-   */
-  export function getQueryParams(queryString: string) {
-    if (queryString.charAt(0) === '?') {
-      queryString = queryString.slice(1);
-    }
-
-    let queryParams = _.chain(queryString.split('&'))
-                          .map((item) => {
-                            if (item) {
-                              return item.split('=');
-                            }
-                          })
-                          .compact()
-                          .value();
-
-    return _.object(queryParams);
-  }
-
-  /**
-   * Given a timestamp in microseconds, return a human-friendly string denoting
-   * how long ago the timestamp was.
-   */
-  export function computeHumanFriendlyTime(timeInMicroseconds: number) {
-    var timeDifferenceInMs =
-        +(new Date()) - +(new Date(timeInMicroseconds / 1e3));
-    if (timeDifferenceInMs < 30000) {
-      return 'just now';
-    } else if (timeDifferenceInMs < 60000) {
-      return Math.floor(timeDifferenceInMs / 1000) + ' seconds ago';
-    } else if (timeDifferenceInMs < 120000) {
-      return 'a minute ago';
-    } else if (timeDifferenceInMs < 3600000) {
-      return Math.floor(timeDifferenceInMs / 60000) + ' minutes ago';
-    } else if (Math.floor(timeDifferenceInMs / 3600000) == 1) {
-      return 'an hour ago';
-    } else if (timeDifferenceInMs < 86400000) {
-      return Math.floor(timeDifferenceInMs / 3600000) + ' hours ago';
-    } else if (timeDifferenceInMs < 172800000) {
-      return 'yesterday';
-    }
-    return Math.floor(timeDifferenceInMs / 86400000) + ' days ago';
-  }
-}
diff --git a/tensorflow/tensorboard/components/tf_graph_controls/BUILD b/tensorflow/tensorboard/components/tf_graph_controls/BUILD
deleted file mode 100644
index ecca2ba4cb5346ac57b073205565dbc899e5130c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_controls/BUILD
+++ /dev/null
@@ -1,46 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_graph_controls",
-    srcs = ["tf-graph-controls.html"],
-    path = "/tf-graph-controls",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_graph_common",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "@org_polymer_paper_button",
-        "@org_polymer_paper_dropdown_menu",
-        "@org_polymer_paper_menu",
-        "@org_polymer_paper_radio_group",
-        "@org_polymer_paper_toggle_button",
-        "@org_polymer_paper_tooltip",
-    ],
-)
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [":tf_graph_controls"],
-    destdir = "tf-graph-controls",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
-        "//tensorflow/tensorboard/components/tf_graph_common:legacy",
-        "//third_party/javascript/polymer/v1/paper-button:lib",
-        "//third_party/javascript/polymer/v1/paper-dropdown-menu:lib",
-        "//third_party/javascript/polymer/v1/paper-menu:lib",
-        "//third_party/javascript/polymer/v1/paper-radio-group:lib",
-        "//third_party/javascript/polymer/v1/paper-toggle-button:lib",
-        "//third_party/javascript/polymer/v1/paper-tooltip:lib",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_graph_controls/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_controls/demo/BUILD
deleted file mode 100644
index 0e1205421329d4710a3c19227e309f7e667d41cf..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_controls/demo/BUILD
+++ /dev/null
@@ -1,24 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_controls/demo
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"],
-    path = "/tf-graph-controls/demo",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_graph_controls",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_graph_controls/demo/index.html b/tensorflow/tensorboard/components/tf_graph_controls/demo/index.html
deleted file mode 100644
index 8b12641b28e328351bd7321c43959a91fba56dcc..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_controls/demo/index.html
+++ /dev/null
@@ -1,49 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../tf-graph-controls.html">
-<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-<title>TF Graph Controls Demo</title>
-<style>
-  #demo-container {
-    border: 2px solid #808080;
-    width: 1000px;
-    height: 700px;
-    position: relative;
-  }
-</style>
-<demo-snippet>
-  <template>
-    <dom-module id="tf-graph-controls-demo">
-      <template>
-        <tf-graph-controls
-            id="controls"
-            color-by="structure"
-        ></tf-graph-controls>
-      </template>
-      <script>
-        Polymer({
-          is: "tf-graph-controls-demo",
-        });
-      </script>
-    </dom-module>
-    <div id="demo-container">
-      <tf-graph-controls-demo></tf-graph-controls-demo>
-    </div>
-  </template>
-</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_controls/tf-graph-controls.html b/tensorflow/tensorboard/components/tf_graph_controls/tf-graph-controls.html
deleted file mode 100644
index 6d896357482dbc91aa26ddfd9e03f01f8276262e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_controls/tf-graph-controls.html
+++ /dev/null
@@ -1,919 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-menu/paper-menu.html">
-<link rel="import" href="../paper-dropdown-menu/paper-dropdown-menu.html">
-<link rel="import" href="../paper-radio-group/paper-radio-group.html">
-<link rel="import" href="../paper-tooltip/paper-tooltip.html">
-<link rel="import" href="../paper-toggle-button/paper-toggle-button.html">
-<link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
-<link rel="import" href="../tf-graph-common/tf-graph-common.html">
-
-<dom-module id="tf-graph-controls">
-<template>
-<style>
-:host {
-  font-size: 12px;
-  color: gray;
-  --paper-font-subhead: {
-    font-size: 14px;
-    color: gray;
-  };
-  --paper-dropdown-menu-icon: {
-    width: 15px;
-    height: 15px;
-  };
-  --paper-dropdown-menu-button: {
-    padding: 0;
-  };
-  --paper-dropdown-menu-input: {
-    padding: 0;
-  };
-  --paper-item-min-height: 30px;
-}
-
-paper-button[raised].keyboard-focus {
-  font-weight: normal;
-}
-
-.run-dropdown {
-  --paper-input-container: {
-    padding: 9px 0 0 25px;
-  };
-}
-
-.color-dropdown {
-  --paper-input-container: {
-    padding: 9px 0 0 13px;
-  };
-}
-
-table {
-  border-collapse: collapse;
-  border-spacing: 0;
-}
-
-table td {
-  padding: 0;
-  margin: 0;
-}
-
-.allcontrols {
-  width: 188px;
-  padding: 0 30px;
-}
-
-.legend-holder {
-  position: absolute;
-  bottom: 0;
-  padding-bottom: 10px;
-}
-
-paper-radio-button {
-  display: block;
-  padding: 5px;
-}
-svg.icon {
-  width: 60px;
-  height: 18px;
-}
-.icon ellipse {
-  rx: 10px;
-  ry: 5px;
-  stroke: #CCC;
-  stroke-width: 1px;
-  fill: #FFFFFF;
-  cy: 10px;
-}
-.icon rect {
-  height: 14px;
-  width: 35px;
-  rx: 5px;
-  ry: 5px;
-  stroke: #CCC;
-  stroke-width: 2px;
-  fill: #D9D9D9;
-}
-.domainValues {
-  margin-bottom: 10px;
-  width: 165px;
-}
-.domainStart {
-  float: left;
-}
-.domainEnd {
-  float: right;
-}
-.colorBox {
-  width: 20px;
-}
-
-.image-icon {
-  width: 24px;
-  height: 24px;
-}
-
-.help-icon {
-  height: 15px;
-  margin: 0;
-  padding: 0;
-}
-
-.gray {
-  color: #666;
-}
-
-.title {
-  font-size: 16px;
-  margin: 8px 5px 8px 0;
-  color: black;
-}
-.title small {
-  font-weight: normal;
-}
-.deviceList, .xlaClusterList {
-  max-height: 200px;
-  overflow-y: auto;
-}
-
-#file {
-  padding: 8px 0;
-}
-
-.color-legend-row {
-  clear: both;
-  height: 20px;
-  margin-top: 5px;
-  position: relative;
-}
-
-.color-legend-row svg {
-  position: absolute;
-  top: -1px;
-  width: 40px;
-}
-
-.color-legend-row span.color-legend-value {
-  margin-left: 60px;
-}
-
-#grey-rect {
-  fill: #eee;
-  stroke: #a6a6a6;
-}
-
-#faded-rect {
-  fill: url(#rectHatch);
-  stroke: var(--tb-graph-faded);
-}
-
-.button-text {
-  text-transform: none;
-  padding: 8px 18px 0 18px;
-  font-size: 14px
-}
-
-.upload-button {
-  width: 165px;
-  height: 25px;
-  text-transform: none;
-  margin-top: 4px;
-}
-
-.iconbutton {
-  padding: 2px;
-  width: 30px;
-  height: 30px;
-  color: var(--paper-orange-500);
-}
-
-.hidden-input {
-  height: 0px;
-  width: 0px;
-  overflow:hidden;
-}
-
-.allcontrols .control-holder {
-  display: flex;
-  clear: both;
-}
-
-.allcontrols .control-holder paper-radio-group {
-  margin-top: 5px;
-}
-
-span.counter {
-  font-size: 13px;
-  color: gray;
-}
-
-.runs paper-item {
-  --paper-item: {
-    white-space: nowrap;
-  }
-}
-
-table.control-holder {
-  border: 0;
-  border-collapse: collapse;
-}
-
-table.tf-graph-controls td.input-element-table-data {
-  padding: 0 0 0 20px;
-}
-
-/** Override inline styles that suppress pointer events for disabled buttons. Otherwise, the */
-/*  tooltips do not appear. */
-#color-by-radio-group paper-radio-button {
-  pointer-events: auto !important;
-}
-
-.legend-clarifier {
-  color: #266236;
-  cursor: help;
-  display: inline-block;
-  text-decoration: underline;
-}
-
-.legend-clarifier paper-tooltip {
-  width: 150px;
-}
-</style>
-<svg width="0" height="0">
-  <defs>
-    <g id="legend-rect">
-      <rect x="1" y="1" stroke-width="2px" height="14" width="35" rx="5" ry="5"></rect>
-    </g>
-    <g id="grey-rect">
-       <use xmlns:xlink="http://www.w3.org/1999/xlink"
-            xlink:href="#legend-rect"/>
-     </g>
-     <g id="faded-rect">
-       <use xmlns:xlink="http://www.w3.org/1999/xlink"
-            xlink:href="#legend-rect"/>
-     </g>
-  </defs>
-</svg>
-<div class="allcontrols">
-  <div class="control-holder">
-    <paper-icon-button icon="aspect-ratio" class="iconbutton" on-click="fit" alt="Fit to screen">
-    </paper-icon-button>
-    <paper-button class="button-text" on-click="fit">Fit to screen
-    </paper-button>
-  </div>
-  <div class="control-holder">
-    <paper-icon-button icon="file-download" class="iconbutton" on-click="download" alt="Download PNG">
-    </paper-icon-button>
-    <paper-button class="button-text" on-click="download">Download PNG
-    </paper-button>
-    <a href="#" id="graphdownload" class="title" download="graph.png">
-    </a>
-  </div>
-  <div class="control-holder runs">
-    <div class="title">Run <span class="counter">([[datasets.length]])</span></div>
-    <paper-dropdown-menu no-label-float no-animations noink class="run-dropdown">
-      <paper-menu id="select" class="dropdown-content" selected="{{selectedDataset}}">
-        <template is="dom-repeat" items="[[datasets]]">
-          <paper-item>[[item.name]]</paper-item>
-        </template>
-      </paper-menu>
-    </paper-dropdown-menu>
-  </div>
-  <template is="dom-if" if="[[showSessionRunsDropdown]]">
-    <div class="control-holder">
-      <div class="title">Session runs <span class="counter">([[_numSessionRuns(metadataTags)]])</span></div>
-      <paper-dropdown-menu no-label-float no-animations noink class="run-dropdown">
-        <paper-menu id="select" class="dropdown-content" selected="{{selectedMetadataTag}}">
-          <template is="dom-repeat" items="[[metadataTags]]">
-            <paper-item>[[item.tag]]</paper-item>
-          </template>
-          <paper-item>None</paper-item>
-        </paper-menu>
-      </paper-dropdown-menu>
-    </div>
-  </template>
-  <template is="dom-if" if="[[showUploadButton]]">
-    <div class="control-holder">
-      <div class="title">Upload</div>
-      <paper-button raised class="text-button upload-button"
-          on-click="_getFile">Choose File</paper-button>
-      <div class="hidden-input">
-        <input type="file" id="file" name="file" on-change="_updateFileInput" />
-      </div>
-    </div>
-  </template>
-  <table class="control-holder">
-    <tr>
-      <td class="title">Trace inputs</td>
-      <td class="input-element-table-data">
-        <paper-toggle-button id="trace-inputs"></paper-toggle-button>
-      </td>
-    </tr>
-    <template is="dom-if" if="[[healthPillsFeatureEnabled]]">
-      <tr>
-        <td class="title">Show health pills</td>
-        <td class="input-element-table-data">
-          <paper-toggle-button checked="{{healthPillsToggledOn}}"></paper-toggle-button>
-        </td>
-      </tr>
-    </template>
-  </table>
-  <div class="control-holder">
-    <div class="title">Color</div>
-    <paper-radio-group id="color-by-radio-group" selected="{{colorBy}}">
-      <paper-radio-button name="structure">Structure</paper-radio-button>
-
-      <paper-radio-button name="device">Device</paper-radio-button>
-
-      <paper-radio-button id="xla-cluster-radio-button"
-                          name="xla_cluster"
-                          disabled="[[!_xlaClustersProvided(renderHierarchy)]]">
-        XLA Cluster
-      </paper-radio-button>
-      <paper-tooltip animation-delay="0" for="xla-cluster-radio-button" position="right">
-        Coloring by XLA cluster is only enabled if at least 1 op specifies an XLA cluster.
-      </paper-tooltip>
-
-      <paper-radio-button id="compute-time-radio-button"
-                          name="compute_time"
-                          disabled="[[!stats]]">
-        Compute time
-      </paper-radio-button>
-      <paper-tooltip animation-delay="0" for="compute-time-radio-button" position="right">
-        Coloring by compute time is only enabled if the RunMetadata proto is passed to the
-        FileWriter when a specific session is run.
-      </paper-tooltip>
-
-      <paper-radio-button id="memory-radio-button"
-                          name="memory"
-                          disabled="[[!stats]]">
-        Memory
-      </paper-radio-button>
-      <paper-tooltip animation-delay="0" for="memory-radio-button" position="right">
-        Coloring by memory is only enabled if the RunMetadata proto is passed to the
-        FileWriter when a specific session is run.
-      </paper-tooltip>
-    </paper-radio-group>
-  </div>
-  <div>
-    <template is="dom-if" if="[[_isGradientColoring(stats, colorBy)]]">
-      <svg width="140" height="20" style="margin: 0 5px" class="color-text">
-        <defs>
-          <linearGradient id="linearGradient" x1="0%" y1="0%" x2="100%" y2="0%">
-            <stop class="start" offset="0%"
-                stop-color$="[[_currentGradientParams.startColor]]"/>
-            <stop class="end" offset="100%"
-                stop-color$="[[_currentGradientParams.endColor]]"/>
-          </linearGradient>
-        </defs>
-        <rect x="0" y="0" width="135" height="20" fill="url(#linearGradient)"
-            stroke="black" />
-      </svg>
-      <div class="domainValues color-text">
-        <div class="domainStart">[[_currentGradientParams.minValue]]</div>
-        <div class="domainEnd">[[_currentGradientParams.maxValue]]</div>
-      </div>
-      <br style="clear: both">
-      <div>Devices included in stats:</div>
-      <div class="deviceList">
-        <table>
-        <template is="dom-repeat" items="[[_getDevices(devicesForStats)]]">
-          <tr>
-            <td>
-              <input type="checkbox" value$="[[item.device]]" checked$="[[item.used]]" on-click="_deviceCheckboxClicked"/>
-            </td>
-            <td>
-              <div>
-                <span>[[item.suffix]]</span>
-                <template is="dom-if" if="[[item.ignoredMsg]]">
-                  <paper-icon-button icon="help" class="help-icon"></paper-icon-button>
-                  <paper-tooltip position="right" animation-delay="0">[[item.ignoredMsg]]</paper-tooltip>
-                </template>
-              </div>
-            </td>
-          </tr>
-        </template>
-        </table>
-      </div>
-    </template>
-    <template is="dom-if" if="[[_equals(colorBy, 'structure')]]">
-      <div class="color-text">
-        <div class="color-legend-row">
-          <div style="position: absolute;">
-            colors
-          </div>
-          <span class="color-legend-value">same substructure</span>
-        </div>
-        <div class="color-legend-row">
-          <svg>
-            <use xmlns:xlink="http://www.w3.org/1999/xlink"
-                 xlink:href="#grey-rect" x="0" y="0"/>
-          </svg>
-          <span class="color-legend-value">unique substructure</span>
-        </div>
-      </div>
-    </template>
-    <template is="dom-if" if="[[_equals(colorBy, 'device')]]">
-      <div class="color-text">
-        <div class="deviceList">
-          <table>
-          <template is="dom-repeat" items="[[colorByParams.device]]">
-            <tr>
-              <td style$="[[_getBackgroundColor(item.color)]]">
-                <div class="colorBox"></div>
-              </td>
-              <td>
-                <div>[[item.device]]</div>
-              </td>
-            </tr>
-          </template>
-          </table>
-        </div>
-        <br/>
-        <div class="color-legend-row">
-          <svg>
-            <use xmlns:xlink="http://www.w3.org/1999/xlink"
-                 xlink:href="#grey-rect" x="0" y="0"/>
-          </svg>
-          <span class="color-legend-value">unknown device</span>
-        </div>
-      </div>
-    </template>
-    <template is="dom-if" if="[[_equals(colorBy, 'xla_cluster')]]">
-      <div class="color-text">
-        <div class="xlaClusterList">
-          <table>
-          <template is="dom-repeat" items="[[colorByParams.xla_cluster]]">
-            <tr>
-              <td style$="[[_getBackgroundColor(item.color)]]">
-                <div class="colorBox"></div>
-              </td>
-              <td>
-                <div>[[item.xla_cluster]]</div>
-              </td>
-            </tr>
-          </template>
-          </table>
-        </div>
-        <br/>
-        <div class="color-legend-row">
-          <svg>
-            <use xmlns:xlink="http://www.w3.org/1999/xlink"
-                 xlink:href="#grey-rect" x="0" y="0"/>
-          </svg>
-          <span class="color-legend-value">unknown XLA cluster</span>
-        </div>
-      </div>
-    </template>
-    <template is="dom-if" if="[[_statsNotNull(stats)]]">
-      <div class="color-legend-row">
-        <svg>
-          <use xmlns:xlink="http://www.w3.org/1999/xlink"
-                xlink:href="#faded-rect" x="0" y="0"/>
-        </svg>
-        <span class="color-legend-value">unused substructure</span>
-      </div>
-    </template>
-  </div>
-  <!--
-    Due to limited vertical space on the left sidebar, hide the legend whenever
-    we show a list of devices to include in stats.
-  -->
-  <template is="dom-if" if="[[!_isGradientColoring(stats, colorBy)]]">
-    <div class="legend-holder">
-      <table>
-        <tr>
-          <td><div class="title">Graph</div></td>
-          <td>(* = expandable)</td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="icon">
-              <rect transform="translate(3, 1)" height="14" width="35"
-                  rx="5" ry="5"/>
-            </svg>
-          </td>
-          <td>
-            Namespace<span class="gray">*</span>
-            <div class="legend-clarifier">
-              <span>?</span>
-              <paper-tooltip animation-delay="0" position="right">
-                Encapsulates a set of nodes. Namespace is hierarchical and based on scope.
-              </paper-tooltip>
-            </div>
-          </td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="icon" preserveAspectRatio="xMinYMid meet"
-                viewBox="0 0 10 10">
-              <use xlink:href="#op-node-stamp" fill="white" stroke="#ccc" x="9.5"
-                y="6" />
-            </svg>
-          </td>
-          <td>
-            OpNode
-            <div class="legend-clarifier">
-              <span>?</span>
-              <paper-tooltip animation-delay="0" position="right">
-                Node that performs an operation. These nodes cannot expand.
-              </paper-tooltip>
-            </div>
-          </td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="icon" height="15px" preserveAspectRatio="xMinYMid meet"
-                viewBox="0 0 12 12">
-              <use xlink:href="#op-series-horizontal-stamp" fill="white"
-                  stroke="#ccc" x="2" y="2"/>
-            </svg>
-          </td>
-          <td>
-            Unconnected series<span class="gray">*</span>
-            <div class="legend-clarifier">
-              <span>?</span>
-              <paper-tooltip animation-delay="0" position="right">
-                Sequence of numbered nodes that are not connected to each other.
-              </paper-tooltip>
-            </div>
-          </td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="icon" height="15px"
-                preserveAspectRatio="xMinYMid meet" viewBox="0 0 15 15">
-              <use xlink:href="#op-series-vertical-stamp"
-                  fill="white" stroke="#ccc" x="2" y="2"/>
-            </svg>
-          </td>
-          <td>
-            Connected series<span class="gray">*</span>
-            <div class="legend-clarifier">
-              <span>?</span>
-              <paper-tooltip animation-delay="0" position="right">
-                Sequence of numbered nodes that are connected to each other.
-              </paper-tooltip>
-            </div>
-          </td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="icon">
-              <circle fill="white" stroke="#848484" cx="10" cy="10" r="5"/>
-            </svg>
-          </td>
-          <td>
-            Constant
-            <div class="legend-clarifier">
-              <span>?</span>
-              <paper-tooltip animation-delay="0" position="right">
-                Node that outputs a constant value.
-              </paper-tooltip>
-            </div>
-          </td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="image-icon" viewBox="0 0 12 12" width="24" height="24">
-              <use x="0" y="0" class="image-icon" xlink:href="#summary-icon"/>
-            </svg>
-          </td>
-          <td>
-            Summary
-            <div class="legend-clarifier">
-              <span>?</span>
-              <paper-tooltip animation-delay="0" position="right">
-                Node that collects data for visualization within TensorBoard.
-              </paper-tooltip>
-            </div>
-          </td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="icon" height="15px"
-                preserveAspectRatio="xMinYMid meet" viewBox="0 0 15 15">
-              <defs>
-                <marker id="dataflow-arrowhead-legend" fill="#bbb" markerWidth="10"
-                    markerHeight="10" refX="9" refY="5" orient="auto-start-reverse">
-                  <path d="M 0,0 L 10,5 L 0,10 C 3,7 3,3 0,0"/>
-                </marker>
-              </defs>
-              <path marker-end="url(#dataflow-arrowhead-legend)"
-                    stroke="#bbb" d="M2 9 l 29 0"
-                    stroke-linecap="round" />
-            </svg>
-          </td>
-          <td>
-            Dataflow edge
-            <div class="legend-clarifier">
-              <span>?</span>
-              <paper-tooltip animation-delay="0" position="right">
-                Edge showing the data flow between operations. Edges flow upwards unless arrowheads specify otherwise.
-              </paper-tooltip>
-            </div>
-          </td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="icon" height="15px"
-                preserveAspectRatio="xMinYMid meet" viewBox="0 0 15 15">
-              <path stroke="#bbb"
-                d="M2 9 l 29 0" stroke-linecap="round" stroke-dasharray="2, 2" />
-            </svg>
-          </td>
-          <td>
-            Control dependency edge
-            <div class="legend-clarifier">
-              <span>?</span>
-              <paper-tooltip animation-delay="0" position="right">
-                Edge showing the control dependency between operations.
-              </paper-tooltip>
-            </div>
-          </td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="icon" height="15px"
-                preserveAspectRatio="xMinYMid meet" viewBox="0 0 15 15">
-              <defs>
-                <marker id="reference-arrowhead-legend" fill="#FFB74D" markerWidth="10"
-                    markerHeight="10" refX="9" refY="5" orient="auto-start-reverse">
-                  <path d="M 0,0 L 10,5 L 0,10 C 3,7 3,3 0,0"/>
-                </marker>
-              </defs>
-              <path marker-end="url(#reference-arrowhead-legend)"
-                    stroke="#FFB74D" d="M2 9 l 29 0"
-                    stroke-linecap="round" />
-            </svg>
-          </td>
-          <td>
-            Reference edge
-            <div class="legend-clarifier">
-              <span>?</span>
-              <paper-tooltip animation-delay="0" position="right">
-                Edge showing that the outgoing operation node can mutate the incoming tensor.
-              </paper-tooltip>
-            </div>
-          </td>
-        </tr>
-      </table>
-    </div>
-  </template>
-  </div>
-</template>
-</dom-module>
-
-<script>
-(function() { // Private scope.
-/**
- * Stats from device names that match these regexes will be excluded by default.
- * The user can still turn on a device by selecting the checkbox in the device list.
- * See b/29089982 for context.
- */
-var DEVICE_NAMES_EXCLUDE = [
-  {
-    regex: /gpu:[0-9]+$/,
-    msg: 'Excluded by default since this is a CPU thread setting up GPU kernels.'
-  }
-];
-
-Polymer({
-  is: 'tf-graph-controls',
-  properties: {
-    // Public API.
-    stats: {
-      value: null,
-      type: Object,
-      observer: '_statsChanged'
-    },
-    devicesForStats: {
-      value: null,
-      type: Object,
-      notify: true,
-      readonly: true,
-    },
-    colorBy: {
-      type: String,
-      value: 'structure',
-      notify: true,
-      readonly: true
-    },
-    colorByParams: Object,
-    datasets: {
-      type: Array,
-      observer: '_datasetsChanged'
-    },
-    renderHierarchy: {
-      type: Object,
-      notify: true,
-    },
-    metadataTags: {
-      type: Array,
-      computed: '_getMetadataTags(selectedDataset, datasets)'
-    },
-    selectedDataset: {
-      type: Number,
-      notify: true,
-      value: 0,
-      observer: '_selectedDatasetChanged'
-    },
-    selectedFile: {
-      type: Object,
-      notify: true
-    },
-    selectedMetadataTag: {
-      type: Number,
-      notify: true,
-      value: -1
-    },
-    _currentGradientParams: {
-      type: Object,
-      computed: '_getCurrentGradientParams(colorByParams, colorBy)'
-    },
-    showSessionRunsDropdown: {
-      type: Boolean,
-      value: true
-    },
-    showUploadButton: {
-      type: Boolean,
-      value: true
-    },
-    // This stores whether the feature for showing health pills is enabled in the first place.
-    healthPillsFeatureEnabled: Boolean,
-    // This stores whether to show health pills. Only relevant if healthPillsFeatureEnabled. The
-    // user can toggle this value.
-    healthPillsToggledOn: {
-      type: Boolean,
-      notify: true,
-    },
-  },
-  listeners: {
-    'trace-inputs.change': '_traceInputToggleChanged'
-  },
-  _traceInputToggleChanged: function(event) {
-    // Flip the state of the trace inputs flag.
-    this.renderHierarchy.traceInputs = event.target.active;
-    tf.graph.scene.node.traceInputs(this.renderHierarchy);
-  },
-  _xlaClustersProvided: function(renderHierarchy) {
-    return renderHierarchy &&
-        renderHierarchy.hierarchy &&
-        renderHierarchy.hierarchy.xlaClusters.length > 0;
-  },
-  _statsChanged: function(stats) {
-    if (stats == null) {
-      return;
-    }
-    var devicesForStats = {};
-    var devices = _.each(stats.dev_stats, function(d) {
-      // Avoid device names that are ignored by default.
-      var exclude = _.some(DEVICE_NAMES_EXCLUDE, function(rule) {
-        return rule.regex.test(d.device);
-      });
-      if (!exclude) {
-        devicesForStats[d.device] = true;
-      }
-    });
-    this.set('devicesForStats', devicesForStats);
-  },
-  _getDevices: function(devicesForStats) {
-    var devices = _.map(this.stats.dev_stats, function(d) {
-      return d.device;
-    });
-    // Devices names can be long so we remove the longest common prefix
-    // before showing the devices in a list.
-    var suffixes = tf.graph.util.removeCommonPrefix(devices);
-    return _.map(devices, function(device, i) {
-      var ignoredMsg = null;
-      _.each(DEVICE_NAMES_EXCLUDE, function(rule) {
-        if (rule.regex.test(device)) {
-          ignoredMsg = rule.msg;
-        }
-      });
-      return {
-        device: device,
-        suffix: suffixes[i],
-        used: devicesForStats[device],
-        ignoredMsg: ignoredMsg
-      };
-    });
-  },
-  _deviceCheckboxClicked: function(checkbox) {
-    // Update the device map.
-    var devicesForStats = _.extend({}, this.devicesForStats);
-    var device = checkbox.target.value;
-    if (checkbox.target.checked) {
-      devicesForStats[device] = true;
-    } else {
-      delete devicesForStats[device];
-    }
-    this.set('devicesForStats', devicesForStats);
-  },
-  _numSessionRuns: function(metadataTags) {
-    return metadataTags != null ? metadataTags.length : 0;
-  },
-  _getBackgroundColor: function(color) {
-    return 'background-color:' + color;
-  },
-  fit: function() {
-    document.querySelector('#scene').fit();
-  },
-  _isGradientColoring: function(stats, colorBy) {
-    return ["compute_time", "memory"].indexOf(colorBy) !== -1
-        && stats != null;
-  },
-  _equals: function(a, b) {
-    return a === b;
-  },
-  _getCurrentGradientParams: function(colorByParams, colorBy) {
-    if (!this._isGradientColoring(this.stats, colorBy)) {
-      return;
-    }
-    var params = colorByParams[colorBy];
-    var minValue = params.minValue;
-    var maxValue = params.maxValue;
-    if (colorBy === 'memory') {
-      minValue = tf.graph.util.convertUnitsToHumanReadable(
-          minValue, tf.graph.util.MEMORY_UNITS);
-      maxValue = tf.graph.util.convertUnitsToHumanReadable(
-          maxValue, tf.graph.util.MEMORY_UNITS);
-    } else if (colorBy === 'compute_time') {
-      minValue = tf.graph.util.convertUnitsToHumanReadable(
-          minValue, tf.graph.util.TIME_UNITS);
-      maxValue = tf.graph.util.convertUnitsToHumanReadable(
-          maxValue, tf.graph.util.TIME_UNITS);
-    }
-    return {
-      minValue: minValue,
-      maxValue: maxValue,
-      startColor: params.startColor,
-      endColor: params.endColor
-    };
-  },
-  download: function() {
-    this.$.graphdownload.click();
-  },
-  _updateFileInput: function(e) {
-    var file = e.target.files[0];
-    if (!file) {
-      return;
-    }
-    this._setDownloadFilename(file.name);
-    this.set('selectedFile', e);
-  },
-  _datasetsChanged: function(newDatasets, oldDatasets) {
-    if (oldDatasets != null || this.selected == null) {
-      // Select the first dataset by default.
-      this.set('selectedDataset', 0);
-      this._setDownloadFilename(this.datasets[this.selectedDataset].path);
-    }
-  },
-  _getMetadataTags: function(selectedDataset, datasets) {
-    return this.datasets[selectedDataset].runMetadata;
-  },
-  _selectedDatasetChanged: function(newDataset, oldDataset) {
-    if (this.datasets) {
-      this.set('selectedMetadataTag', -1);
-      this.set('colorBy', 'structure');
-      this.$['trace-inputs'].active = false; // Set trace input to off-state.
-      this._setDownloadFilename(this.datasets[newDataset].path);
-    }
-  },
-  _getFile: function() {
-    this.$$("#file").click();
-  },
-  _setDownloadFilename: function(graphPath) {
-    // Strip off everything before the last "/" and strip off the file
-    // extension in order to get the name of the PNG for the graph.
-    var dotIndex = graphPath.lastIndexOf('.');
-    if (dotIndex) {
-      graphPath = graphPath.substring(0, dotIndex);
-    }
-    var slashIndex = graphPath.lastIndexOf('/');
-    if (slashIndex) {
-      graphPath = graphPath.substring(slashIndex + 1);
-    }
-    this.$.graphdownload.setAttribute('download', graphPath + '.png');
-  },
-  _statsNotNull: function(stats) {
-    return stats !== null;
-  },
-});
-})(); // Closing private scope.
-</script>
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard/BUILD b/tensorflow/tensorboard/components/tf_graph_dashboard/BUILD
deleted file mode 100644
index c69a7809035f9ee313bac9bb72b9a84c3cdf98db..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_dashboard/BUILD
+++ /dev/null
@@ -1,44 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_graph_dashboard",
-    srcs = ["tf-graph-dashboard.html"],
-    path = "/tf-graph-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_graph",
-        "//tensorflow/tensorboard/components/tf_graph_board",
-        "//tensorflow/tensorboard/components/tf_graph_controls",
-        "//tensorflow/tensorboard/components/tf_graph_loader",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "//tensorflow/tensorboard/components/vz_sorting",
-    ],
-)
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [":tf_graph_dashboard"],
-    destdir = "tf-graph-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend:legacy",
-        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
-        "//tensorflow/tensorboard/components/tf_graph:legacy",
-        "//tensorflow/tensorboard/components/tf_graph_board:legacy",
-        "//tensorflow/tensorboard/components/tf_graph_controls:legacy",
-        "//tensorflow/tensorboard/components/tf_graph_loader:legacy",
-        "//tensorflow/tensorboard/components/vz_sorting:legacy",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_dashboard/demo/BUILD
deleted file mode 100644
index 66a37b89785353f2cda41988f1c6182fc81f9dcf..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_dashboard/demo/BUILD
+++ /dev/null
@@ -1,24 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_dashboard/demo
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"] + glob(["data/**"]),
-    path = "/tf-graph-dashboard/demo",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_graph_dashboard",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard/demo/data/graph_run_run1.pbtxt b/tensorflow/tensorboard/components/tf_graph_dashboard/demo/data/graph_run_run1.pbtxt
deleted file mode 100644
index 30b206453469801d31b46856c29cdda78164f18f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_dashboard/demo/data/graph_run_run1.pbtxt
+++ /dev/null
@@ -1,4606 +0,0 @@
-node {
-  name: "GradientDescent/learning_rate"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_3"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 0.1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 100
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000d\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/BroadcastGradientArgs"
-  op: "BroadcastGradientArgs"
-  input: "gradients/add_grad/Shape"
-  input: "gradients/add_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-        }
-        shape {
-          dim {
-            size: -1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 10
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/BroadcastGradientArgs"
-  op: "BroadcastGradientArgs"
-  input: "gradients/add_1_grad/Shape"
-  input: "gradients/add_1_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-        }
-        shape {
-          dim {
-            size: -1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_1_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: -1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_3_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Maximum/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Const_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Const"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Prod_1"
-  op: "Prod"
-  input: "gradients/Mean_grad/Shape_1"
-  input: "gradients/Mean_grad/Const_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Maximum"
-  op: "Maximum"
-  input: "gradients/Mean_grad/Prod_1"
-  input: "gradients/Mean_grad/Maximum/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Prod"
-  op: "Prod"
-  input: "gradients/Mean_grad/Shape"
-  input: "gradients/Mean_grad/Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/floordiv"
-  op: "FloorDiv"
-  input: "gradients/Mean_grad/Prod"
-  input: "gradients/Mean_grad/Maximum"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Cast"
-  op: "Cast"
-  input: "gradients/Mean_grad/floordiv"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "DstT"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "SrcT"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Tile/multiples"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Reshape/shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Const"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Fill"
-  op: "Fill"
-  input: "gradients/Shape"
-  input: "gradients/Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/Fill"
-  input: "gradients/Mean_grad/Reshape/shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Tile"
-  op: "Tile"
-  input: "gradients/Mean_grad/Reshape"
-  input: "gradients/Mean_grad/Tile/multiples"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tmultiples"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/truediv"
-  op: "RealDiv"
-  input: "gradients/Mean_grad/Tile"
-  input: "gradients/Mean_grad/Cast"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_3_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/Mean_grad/truediv"
-  input: "gradients/Reshape_3_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
-  op: "ExpandDims"
-  input: "gradients/Reshape_3_grad/Reshape"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tdim"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Const"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "Slice_2/begin"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "Sub_2/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "concat_1/axis"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "concat_1/values_0"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: -1
-      }
-    }
-  }
-}
-node {
-  name: "Slice_1/size"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Sub_1/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Shape_2"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "Rank_2"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 2
-      }
-    }
-  }
-}
-node {
-  name: "Sub_1"
-  op: "Sub"
-  input: "Rank_2"
-  input: "Sub_1/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Slice_1/begin"
-  op: "Pack"
-  input: "Sub_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 1
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: 0
-    }
-  }
-}
-node {
-  name: "Slice_1"
-  op: "Slice"
-  input: "Shape_2"
-  input: "Slice_1/begin"
-  input: "Slice_1/size"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "Index"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "concat_1"
-  op: "ConcatV2"
-  input: "concat_1/values_0"
-  input: "Slice_1"
-  input: "concat_1/axis"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 2
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "concat/axis"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "concat/values_0"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: -1
-      }
-    }
-  }
-}
-node {
-  name: "Slice/size"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Sub/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "Rank_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 2
-      }
-    }
-  }
-}
-node {
-  name: "Sub"
-  op: "Sub"
-  input: "Rank_1"
-  input: "Sub/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Slice/begin"
-  op: "Pack"
-  input: "Sub"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 1
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: 0
-    }
-  }
-}
-node {
-  name: "Slice"
-  op: "Slice"
-  input: "Shape_1"
-  input: "Slice/begin"
-  input: "Slice/size"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "Index"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "concat"
-  op: "ConcatV2"
-  input: "concat/values_0"
-  input: "Slice"
-  input: "concat/axis"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 2
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "Rank"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 2
-      }
-    }
-  }
-}
-node {
-  name: "Sub_2"
-  op: "Sub"
-  input: "Rank"
-  input: "Sub_2/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Slice_2/size"
-  op: "Pack"
-  input: "Sub_2"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 1
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: 0
-    }
-  }
-}
-node {
-  name: "Slice_2"
-  op: "Slice"
-  input: "Shape"
-  input: "Slice_2/begin"
-  input: "Slice_2/size"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "Index"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "logits_biases"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 10
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "logits_biases/read"
-  op: "Identity"
-  input: "logits_biases"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "logits_weights"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 100
-        }
-        dim {
-          size: 10
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "logits_weights/read"
-  op: "Identity"
-  input: "logits_weights"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "hidden_biases"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 100
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "hidden_biases/read"
-  op: "Identity"
-  input: "hidden_biases"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "hidden_weights"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 784
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 784
-        }
-        dim {
-          size: 100
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "hidden_weights/read"
-  op: "Identity"
-  input: "hidden_weights"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 784
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Reshape/shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\377\377\377\377"
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot/depth"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 10
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot/off_value"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot/on_value"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_1/random_shuffle_queue"
-  op: "RandomShuffleQueueV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "capacity"
-    value {
-      i: 20000
-    }
-  }
-  attr {
-    key: "component_types"
-    value {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "min_after_dequeue"
-    value {
-      i: 4000
-    }
-  }
-  attr {
-    key: "seed"
-    value {
-      i: 0
-    }
-  }
-  attr {
-    key: "seed2"
-    value {
-      i: 0
-    }
-  }
-  attr {
-    key: "shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 28
-          }
-          dim {
-            size: 28
-          }
-          dim {
-            size: 1
-          }
-        }
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
-  op: "QueueDequeueManyV2"
-  input: "mnist_dataset_train_1/random_shuffle_queue"
-  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          unknown_rank: true
-        }
-        shape {
-          unknown_rank: true
-        }
-      }
-    }
-  }
-  attr {
-    key: "component_types"
-    value {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    key: "timeout_ms"
-    value {
-      i: -1
-    }
-  }
-}
-node {
-  name: "Reshape"
-  op: "Reshape"
-  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
-  input: "Reshape/shape"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: -1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "MatMul"
-  op: "MatMul"
-  input: "Reshape"
-  input: "hidden_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "add"
-  op: "Add"
-  input: "MatMul"
-  input: "hidden_biases/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Relu"
-  op: "Relu"
-  input: "add"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "MatMul_1"
-  op: "MatMul"
-  input: "Relu"
-  input: "logits_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "add_1"
-  op: "Add"
-  input: "MatMul_1"
-  input: "logits_biases/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Reshape_1"
-  op: "Reshape"
-  input: "add_1"
-  input: "concat"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot"
-  op: "OneHot"
-  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
-  input: "mnist_dataset_train_2/one_hot/depth"
-  input: "mnist_dataset_train_2/one_hot/on_value"
-  input: "mnist_dataset_train_2/one_hot/off_value"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "TI"
-    value {
-      type: DT_INT64
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          unknown_rank: true
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: -1
-    }
-  }
-}
-node {
-  name: "Reshape_2"
-  op: "Reshape"
-  input: "mnist_dataset_train_2/one_hot"
-  input: "concat_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "SoftmaxCrossEntropyWithLogits"
-  op: "SoftmaxCrossEntropyWithLogits"
-  input: "Reshape_1"
-  input: "Reshape_2"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
-  op: "PreventGradient"
-  input: "SoftmaxCrossEntropyWithLogits:1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "message"
-    value {
-      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
-  op: "Mul"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_1_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
-  input: "gradients/Reshape_1_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Sum_1"
-  op: "Sum"
-  input: "gradients/Reshape_1_grad/Reshape"
-  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Reshape_1"
-  op: "Reshape"
-  input: "gradients/add_1_grad/Sum_1"
-  input: "gradients/add_1_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Sum"
-  op: "Sum"
-  input: "gradients/Reshape_1_grad/Reshape"
-  input: "gradients/add_1_grad/BroadcastGradientArgs"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/add_1_grad/Sum"
-  input: "gradients/add_1_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/add_1_grad/Reshape"
-  input: "^gradients/add_1_grad/Reshape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/add_1_grad/Reshape_1"
-  input: "^gradients/add_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_1_grad/Reshape_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "logits_biases"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/add_1_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/tuple/control_dependency"
-  op: "Identity"
-  input: "gradients/add_1_grad/Reshape"
-  input: "^gradients/add_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_1_grad/Reshape"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/MatMul_1"
-  op: "MatMul"
-  input: "Relu"
-  input: "gradients/add_1_grad/tuple/control_dependency"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: true
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/MatMul"
-  op: "MatMul"
-  input: "gradients/add_1_grad/tuple/control_dependency"
-  input: "logits_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: true
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/MatMul_1_grad/MatMul"
-  input: "^gradients/MatMul_1_grad/MatMul_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/MatMul_1_grad/MatMul_1"
-  input: "^gradients/MatMul_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "logits_weights"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/tuple/control_dependency"
-  op: "Identity"
-  input: "gradients/MatMul_1_grad/MatMul"
-  input: "^gradients/MatMul_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/MatMul_1_grad/MatMul"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Relu_grad/ReluGrad"
-  op: "ReluGrad"
-  input: "gradients/MatMul_1_grad/tuple/control_dependency"
-  input: "Relu"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Sum_1"
-  op: "Sum"
-  input: "gradients/Relu_grad/ReluGrad"
-  input: "gradients/add_grad/BroadcastGradientArgs:1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Reshape_1"
-  op: "Reshape"
-  input: "gradients/add_grad/Sum_1"
-  input: "gradients/add_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Sum"
-  op: "Sum"
-  input: "gradients/Relu_grad/ReluGrad"
-  input: "gradients/add_grad/BroadcastGradientArgs"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/add_grad/Sum"
-  input: "gradients/add_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/add_grad/Reshape"
-  input: "^gradients/add_grad/Reshape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/add_grad/Reshape_1"
-  input: "^gradients/add_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_grad/Reshape_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "hidden_biases"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/add_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/tuple/control_dependency"
-  op: "Identity"
-  input: "gradients/add_grad/Reshape"
-  input: "^gradients/add_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_grad/Reshape"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/MatMul_1"
-  op: "MatMul"
-  input: "Reshape"
-  input: "gradients/add_grad/tuple/control_dependency"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: true
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/MatMul"
-  op: "MatMul"
-  input: "gradients/add_grad/tuple/control_dependency"
-  input: "hidden_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 784
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: true
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/MatMul_grad/MatMul"
-  input: "^gradients/MatMul_grad/MatMul_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/MatMul_grad/MatMul_1"
-  input: "^gradients/MatMul_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/MatMul_grad/MatMul_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "hidden_weights"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/MatMul_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 784
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "GradientDescent"
-  op: "NoOp"
-  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
-  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
-  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
-  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_2"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "Reshape_3"
-  op: "Reshape"
-  input: "SoftmaxCrossEntropyWithLogits"
-  input: "Slice_2"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Mean"
-  op: "Mean"
-  input: "Reshape_3"
-  input: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "_send_Mean_0"
-  op: "_Send"
-  input: "Mean"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "client_terminated"
-    value {
-      b: true
-    }
-  }
-  attr {
-    key: "recv_device"
-    value {
-      s: "/job:localhost/replica:0/task:0/cpu:0"
-    }
-  }
-  attr {
-    key: "send_device"
-    value {
-      s: "/job:localhost/replica:0/task:0/cpu:0"
-    }
-  }
-  attr {
-    key: "send_device_incarnation"
-    value {
-      i: -5924635994370253548
-    }
-  }
-  attr {
-    key: "tensor_name"
-    value {
-      s: "Mean:0"
-    }
-  }
-}
-library {
-}
-versions {
-  producer: 21
-}
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard/demo/data/runs.json b/tensorflow/tensorboard/components/tf_graph_dashboard/demo/data/runs.json
deleted file mode 100644
index 0429aa71f8271a291450f898e2a4b73da738b267..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_dashboard/demo/data/runs.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
-  "run1": {
-    "graph": true,
-    "scalars": ["foo/sin"]
-  }
-}
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard/demo/index.html b/tensorflow/tensorboard/components/tf_graph_dashboard/demo/index.html
deleted file mode 100644
index ae84c547b4802973d3e0eee098eda0fd2b8de194..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_dashboard/demo/index.html
+++ /dev/null
@@ -1,62 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-<link rel="import" href="../tf-graph-dashboard.html">
-<link rel="import" href="../../paper-styles/typography.html">
-
-<title>Graph Dashboard Demo</title>
-<style>
-  #demo-container {
-    display: block;
-    height: 900px;
-    position: relative;
-    width: 100%;
-  }
-</style>
-<demo-snippet>
-  <template>
-    <dom-module id="graph-dashboard-demo">
-      <template>
-        <tf-graph-dashboard backend="[[backend]]"></tf-graph-dashboard>
-      </template>
-      <script>
-        import {Backend} from "../../tf-backend/backend";
-        import {createRouter, setRouter} from "../../tf-backend/router";
-
-        Polymer({
-          is: "graph-dashboard-demo",
-          properties: {
-            backend: {
-              type: Object,
-              value: function() {
-                return new Backend();
-              },
-            },
-          },
-          created: function() {
-            var router = createRouter("data", true);
-            setRouter(router);
-          },
-        });
-      </script>
-    </dom-module>
-    <graph-dashboard-demo id="demo-container"></graph-dashboard-demo>
-  </template>
-</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard/tf-graph-dashboard.html b/tensorflow/tensorboard/components/tf_graph_dashboard/tf-graph-dashboard.html
deleted file mode 100644
index ba69882a232d764e747f02c8556e74c704964b7c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_dashboard/tf-graph-dashboard.html
+++ /dev/null
@@ -1,321 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-graph-loader/tf-graph-loader.html">
-<link rel="import" href="../tf-graph-board/tf-graph-board.html">
-<link rel="import" href="../tf-graph-controls/tf-graph-controls.html">
-<link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
-<link rel="import" href="../tf-backend/tf-backend.html">
-<link rel="import" href="../vz-sorting/vz-sorting.html">
-
-<!--
-tf-graph-dashboard displays a graph from a TensorFlow run.
-
-It has simple behavior: Creates a url-generator and run-generator
-to talk to the backend, and then passes the runsWithGraph (list of runs with
-associated graphs) along with the url generator into tf-graph-board for display.
-
-If there are multiple runs with graphs, the first run's graph is shown
-by default. The user can select a different run from a dropdown menu.
--->
-<dom-module id="tf-graph-dashboard">
-<template>
-<tf-no-data-warning
-  data-type="graph"
-  show-warning="[[_datasetsEmpty(_datasets)]]"
-></tf-no-data-warning>
-<template is="dom-if" if="[[!_datasetsEmpty(_datasets)]]">
-<tf-dashboard-layout>
-<div class="sidebar">
-  <tf-graph-controls id="controls"
-        devices-for-stats="{{_devicesForStats}}"
-        color-by-params="[[_colorByParams]]"
-        stats="[[_stats]]"
-        color-by="{{_colorBy}}"
-        datasets="[[_datasets]]"
-        render-hierarchy="[[_renderHierarchy]]"
-        selected-dataset="{{_selectedDataset}}"
-        selected-file="{{_selectedFile}}"
-        selected-metadata-tag="{{_selectedMetadataTag}}"
-        health-pills-feature-enabled="[[debuggerDataEnabled]]"
-        health-pills-toggled-on="{{healthPillsToggledOn}}"
-  ></tf-graph-controls>
-  <tf-graph-loader id="loader"
-        datasets="[[_datasets]]"
-        selected-dataset="[[_selectedDataset]]"
-        selected-metadata-tag="[[_selectedMetadataTag]]"
-        selected-file="[[_selectedFile]]"
-        out-graph-hierarchy="{{_graphHierarchy}}"
-        out-graph="{{_graph}}"
-        out-stats="{{_stats}}"
-        progress="{{_progress}}"
-out-hierarchy-params="{{_hierarchyParams}}"
-  ></tf-graph-loader>
-</div>
-<div class="center">
-    <tf-graph-board id="graphboard"
-        devices-for-stats="[[_devicesForStats]]"
-        color-by="[[_colorBy]]"
-        color-by-params="{{_colorByParams}}"
-        graph-hierarchy="[[_graphHierarchy]]"
-        graph="[[_graph]]"
-        hierarchy-params="[[_hierarchyParams]]"
-        progress="[[_progress]]"
-        debugger-data-enabled="[[debuggerDataEnabled]]"
-        are-health-pills-loading="[[_areHealthPillsLoading]]"
-        debugger-numeric-alerts="[[_debuggerNumericAlerts]]"
-        node-names-to-health-pills="[[_nodeNamesToHealthPills]]"
-        all-steps-mode-enabled="{{allStepsModeEnabled}}"
-        specific-health-pill-step="{{specificHealthPillStep}}"
-        health-pill-step-index="[[_healthPillStepIndex]]"
-        render-hierarchy="{{_renderHierarchy}}"
-        selected-node="{{_selectedNode}}"
-        stats="[[_stats]]"
-    ></tf-graph-board>
-</div>
-</tf-dashboard-layout>
-</template>
-<style>
-
-:host /deep/ {
-  font-family: 'Roboto', sans-serif;
-}
-
-.center {
-  position: relative;
-  height: 100%;
-}
-
-</style>
-</template>
-</dom-module>
-
-<script>
-import {DashboardBehavior} from "../tf-dashboard-common/dashboard-behavior";
-import {ReloadBehavior} from "../tf-dashboard-common/reload-behavior";
-import {BackendBehavior} from "../tf-backend/behavior";
-import {compareTagNames} from "../vz-sorting/sorting";
-
-Polymer({
-  is: 'tf-graph-dashboard',
-  factoryImpl: function(backend, debuggerDataEnabled) {
-    this.backend = backend;
-    this.debuggerDataEnabled = debuggerDataEnabled;
-  },
-  behaviors: [
-    DashboardBehavior("graphs"),
-    ReloadBehavior("tf-graph-dashboard"),
-    BackendBehavior,
-  ],
-  properties: {
-    _datasets: Object,
-    _renderHierarchy: {type: Object, observer: '_renderHierarchyChanged'},
-    backend: Object,
-    debuggerDataEnabled: Boolean,
-    allStepsModeEnabled: Boolean,
-    specificHealthPillStep: {type: Number, value: 0},
-    healthPillsToggledOn: {type: Boolean, value: true, observer: '_healthPillsToggledOnChanged'},
-    _selectedNode: Object,
-    _isAttached: Boolean,
-    // Whether this dashboard is initialized. This dashboard should only be initialized once.
-    _initialized: Boolean,
-    // Whether health pills are currently being loaded, in which case we may want to say show a
-    // spinner.
-    _areHealthPillsLoading: Boolean,
-    // An array of alerts (in chronological order) provided by debugging libraries on when bad
-    // values (NaN, +/- Inf) appear.
-    _debuggerNumericAlerts: {
-      type: Array,
-      value: [],
-      notify: true,
-    },
-    // Maps the names of nodes to an array of health pills (HealthPillDatums).
-    _nodeNamesToHealthPills: {
-      type: Object,
-      value: {},
-    },
-    _healthPillStepIndex: Number,
-    // A strictly increasing ID. Each request for health pills has a unique ID. This helps us
-    // identify stale requests.
-    _healthPillRequestId: {type: Number, value: 1},
-    // The setTimeout ID for the pending request for health pills at a specific step.
-    _healthPillStepRequestTimerId: Number,
-    // The request for health pills at a specific step (as opposed to all sampled health pills) may
-    // involve slow disk reads. Hence, we throttle to 1 of those requests every this many ms.
-    _healthPillStepRequestTimerDelay: {
-      type: Number,
-      value: 500,
-      readOnly: true,
-    },
-    runs: Array,
-  },
-  listeners: {
-    'node-toggle-expand': '_handleNodeToggleExpand',
-  },
-  observers: [
-    '_maybeFetchHealthPills(allStepsModeEnabled, specificHealthPillStep, _selectedNode)',
-    '_maybeInitializeDashboard(backend, _isAttached)',
-  ],
-  attached: function() {
-    this.set('_isAttached', true);
-  },
-  detached: function() {
-    this.set('_isAttached', false);
-  },
-  reload: function() {
-    this._maybeFetchHealthPills();
-  },
-  _shouldRequestHealthPills: function() {
-    // Do not load debugger data if the feature is disabled, if the user toggled off the feature,
-    // or if the graph itself has not loaded yet. We need the graph to load so that we know which
-    // nodes to request health pills for.
-    return this.debuggerDataEnabled &&
-        this.healthPillsToggledOn &&
-        this._renderHierarchy &&
-        !this._datasetsEmpty(this._datasets);
-  },
-  _maybeInitializeDashboard: function(backend, isAttached) {
-    if (this._initialized || !backend || !isAttached) {
-      // Either this dashboard is already initialized ... or we are not yet ready to initialize.
-      return;
-    }
-    if (typeof ga !== 'undefined' && ga != null) {
-      ga('send', {hitType: 'pageview', page: '/v/graph'});
-    }
-    // Set this to true so we only initialize once.
-    this._initialized = true;
-    Promise.all([backend.graphRuns(), backend.runMetadataTags()])
-      .then(function(result) {
-        var runsWithGraph = result[0].sort(compareTagNames);
-        var runToMetadata = result[1];
-        var datasets = _.map(runsWithGraph, function(runName) {
-          return {
-            name: runName,
-            path: backend.graphUrl(
-                runName, tf.graph.LIMIT_ATTR_SIZE, tf.graph.LARGE_ATTRS_KEY),
-            runMetadata: runToMetadata[runName] ? _.map(
-              runToMetadata[runName].sort(compareTagNames), function(tag) {
-                return {
-                  tag: tag,
-                  path: backend.runMetadataUrl(tag, runName)
-                };
-              }, this) : []
-          };
-        }, this);
-        this.set('_datasets', datasets);
-      }.bind(this));
-  },
-  _requestHealthPills: function() {
-    this.set('_areHealthPillsLoading', true);
-    var requestId = ++this._healthPillRequestId;
-
-    if (this._healthPillStepRequestTimerId !== null) {
-      // A request for health pills is already scheduled to be initiated. Clear it, and schedule a
-      // new request.
-      window.clearTimeout(this._healthPillStepRequestTimerId);
-      this._healthPillStepRequestTimerId = null;
-    }
-
-    if (this.allStepsModeEnabled) {
-      // This path may be slow. Schedule network requests to start some time later. If another
-      // request is scheduled in the mean time, drop this current request.
-      this._healthPillStepRequestTimerId = setTimeout(function() {
-        this._healthPillStepRequestTimerId = null;
-        this._initiateNetworkRequestForHealthPills(requestId);
-      }.bind(this), this._healthPillStepRequestTimerDelay);
-    } else {
-      // The user is fetching sampled steps. This path is fast, so no need to throttle. Directly
-      // fetch the health pills across the network.
-      this._initiateNetworkRequestForHealthPills(requestId);
-    }
-  },
-  // Initiates the network request for health pills. Do not directly call this method - network
-  // requests may be throttled. Instead, call _requestHealthPills, which uses this method.
-  _initiateNetworkRequestForHealthPills: function(requestId) {
-    if (this._healthPillRequestId !== requestId) {
-      // This possibly scheduled request was outdated before it was even sent across the network. Do
-      // not bother initiating it.
-      return;
-    }
-
-    var specificStep = this.allStepsModeEnabled ? this.specificHealthPillStep : undefined;
-
-    var healthPillsPromise = this.backend.healthPills(
-        this._renderHierarchy.getNamesOfRenderedOps(), specificStep);
-    var alertsPromise = this.backend.debuggerNumericsAlerts();
-
-    Promise.all([healthPillsPromise, alertsPromise]).then(
-        function(result) {
-      var healthPillsResult = result[0];
-      var alertsResult = result[1];
-
-      if (!this.healthPillsToggledOn) {
-        // The user has opted to hide health pills via the toggle button.
-        return;
-      }
-
-      if (requestId !== this._healthPillRequestId) {
-        // This response is no longer relevant.
-        return;
-      }
-
-      // Set the index for which step to show for the health pills. By default, show the last step.
-      // A precondition we assume (that Tensorboard's reservoir sampling guarantees) is that all
-      // node names should be mapped to the same number of steps.
-      for (var nodeName in healthPillsResult) {
-        this.set('_healthPillStepIndex', healthPillsResult[nodeName].length - 1);
-        break;
-      }
-
-      this.set('_debuggerNumericAlerts', alertsResult);
-      this.set('_nodeNamesToHealthPills', healthPillsResult);
-      this.set('_areHealthPillsLoading', false);
-      this.set('_healthPillStepRequestTimerId', null);
-    }.bind(this));
-  },
-  _datasetsEmpty: function(datasets) {
-    return !datasets || !datasets.length;
-  },
-  _renderHierarchyChanged: function(renderHierarchy) {
-    // Reload any data on the graph when the render hierarchy (which determines which nodes are
-    // rendered) changes.
-    this.reload();
-  },
-  _handleNodeToggleExpand: function() {
-    // Nodes were toggled. We may need to request health pills for more nodes.
-    this._maybeFetchHealthPills();
-  },
-  _healthPillsToggledOnChanged: function(healthPillsToggledOn) {
-    if (healthPillsToggledOn) {
-      // Load health pills.
-      this.reload();
-    } else {
-      // Remove all health pills by setting an empty mapping.
-      this.set('_nodeNamesToHealthPills', {});
-    }
-  },
-  // Fetch health pills for a specific step if applicable.
-  _maybeFetchHealthPills: function() {
-    if (!this._shouldRequestHealthPills()) {
-      return;
-    }
-
-    this._requestHealthPills();
-  },
-});
-</script>
diff --git a/tensorflow/tensorboard/components/tf_graph_debugger_data_card/BUILD b/tensorflow/tensorboard/components/tf_graph_debugger_data_card/BUILD
deleted file mode 100644
index c0d2bd5a46c8355b0555b66ae00a69cc5ed8d117..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_debugger_data_card/BUILD
+++ /dev/null
@@ -1,44 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_graph_debugger_data_card",
-    srcs = [
-        "tf-graph-debugger-data-card.html",
-    ],
-    path = "/tf-graph-debugger-data-card",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_graph_common",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "@org_polymer_paper_slider",
-        "@org_polymer_paper_spinner",
-    ],
-)
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [":tf_graph_debugger_data_card"],
-    destdir = "tf-graph-debugger-data-card",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
-        "//tensorflow/tensorboard/components/tf_graph_common:legacy",
-        "//third_party/javascript/polymer/v1/iron-collapse:lib",
-        "//third_party/javascript/polymer/v1/iron-list:lib",
-        "//third_party/javascript/polymer/v1/paper-icon-button:lib",
-        "//third_party/javascript/polymer/v1/paper-item:lib",
-        "//third_party/javascript/polymer/v1/paper-slider:lib",
-        "//third_party/javascript/polymer/v1/paper-spinner:lib",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_graph_debugger_data_card/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_debugger_data_card/demo/BUILD
deleted file mode 100644
index 66cb115618834ac001e387b417e2b37d3df9b1bc..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_debugger_data_card/demo/BUILD
+++ /dev/null
@@ -1,24 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_debugger_data_card/demo
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"] + glob(["data/**"]),
-    path = "/tf-graph-debugger-data-card/demo",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_graph_debugger_data_card",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_graph_debugger_data_card/demo/index.html b/tensorflow/tensorboard/components/tf_graph_debugger_data_card/demo/index.html
deleted file mode 100644
index 934e4f86a83728939465080d90aaca9db5c8b1ca..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_debugger_data_card/demo/index.html
+++ /dev/null
@@ -1,36 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../tf-graph-debugger-data-card.html">
-<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-<title>TF Graph Info Demo</title>
-<style>
-  #demo-container {
-    border: 2px solid #808080;
-    width: 1000px;
-    height: 600px;
-  }
-</style>
-<demo-snippet>
-  <template>
-    <div id='demo-container'>
-      <!-- This simple demo starts up a page with a health pill legend. -->
-      <tf-graph-debugger-data-card></tf-graph-debugger-data-card>
-    </div>
-  </template>
-</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_debugger_data_card/tf-graph-debugger-data-card.html b/tensorflow/tensorboard/components/tf_graph_debugger_data_card/tf-graph-debugger-data-card.html
deleted file mode 100644
index 6cc99a327cb3f78b462d928f45b6c5be0e19529a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_debugger_data_card/tf-graph-debugger-data-card.html
+++ /dev/null
@@ -1,560 +0,0 @@
-<!--
-@license
-Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-slider/paper-slider.html">
-<link rel="import" href="../paper-spinner/paper-spinner-lite.html">
-<link rel="import" href="../tf-graph-common/tf-graph-common.html">
-
-<dom-module id="tf-graph-debugger-data-card">
-<template>
-<style>
-:host {
-  font-size: 12px;
-  margin: 0;
-  padding: 0;
-  display: block;
-}
-
-h2 {
-  padding: 0;
-  text-align: center;
-  margin: 0;
-}
-
-.health-pill-legend {
-  padding: 15px;
-}
-
-.health-pill-legend h2 {
-  text-align: left;
-}
-
-.health-pill-entry {
-  margin: 10px 10px 10px 0;
-}
-
-.health-pill-entry .color-preview {
-  width: 26px;
-  height: 26px;
-  border-radius: 3px;
-  display: inline-block;
-  margin: 0 10px 0 0;
-}
-
-.health-pill-entry .color-label, .health-pill-entry .tensor-count {
-  color: #777;
-  display: inline-block;
-  height: 26px;
-  font-size: 22px;
-  line-height: 26px;
-  vertical-align: top;
-}
-
-.health-pill-entry .tensor-count {
-  float: right;
-}
-
-#health-pill-step-slider {
-  width: 100%;
-  margin: 0 0 0 -15px;
-  /* 31 comes from adding a padding of 15px from both sides of the paper-slider, subtracting
-   * 1px so that the slider width aligns with the image (the last slider marker takes up 1px),
-   * and adding 2px to account for a border of 1px on both sides of the image. 30 - 1 + 2.
-   * Apparently, the paper-slider lacks a mixin for those padding values. */
-  width: calc(100% + 31px);
-}
-
-#health-pills-loading-spinner {
-  width: 20px;
-  height: 20px;
-  vertical-align: top;
-}
-
-#health-pill-step-number-input {
-  text-align: center;
-  vertical-align: top;
-}
-
-#numeric-alerts-table-container {
-  max-height: 400px;
-  overflow-x: hidden;
-  overflow-y: auto;
-}
-
-#numeric-alerts-table {
-  text-align: left;
-}
-
-#numeric-alerts-table td {
-  vertical-align: top;
-}
-
-#numeric-alerts-table .first-offense-td {
-  display: inline-block;
-}
-
-.first-offense-td {
-  width: 80px;
-}
-
-.tensor-device-td {
-  max-width: 140px;
-  word-wrap : break-word;
-}
-
-.tensor-section-within-table {
-  color: #266236;
-  cursor: pointer;
-  opacity: 0.8;
-  text-decoration: underline;
-}
-
-.tensor-section-within-table:hover {
-  opacity: 1;
-}
-
-.device-section-within-table {
-  color: #666;
-}
-
-.mini-health-pill {
-  width: 130px;
-}
-
-.mini-health-pill > div {
-  height: 100%;
-  width: 60px;
-  border-radius: 3px;
-}
-
-#event-counts-th {
-  padding: 0 0 0 10px;
-}
-
-.negative-inf-mini-health-pill-section {
-  background: rgb(255, 141, 0);
-  width: 20px;
-}
-
-.positive-inf-mini-health-pill-section {
-  background: rgb(0, 62, 212);
-  width: 20px;
-}
-
-.nan-mini-health-pill-section {
-  background: rgb(204, 47, 44);
-  width: 20px;
-}
-
-.negative-inf-mini-health-pill-section,
-.positive-inf-mini-health-pill-section,
-.nan-mini-health-pill-section {
-  color: #fff;
-  display: inline-block;
-  height: 100%;
-  line-height: 20px;
-  margin: 0 0 0 10px;
-  text-align: center;
-}
-
-.no-numeric-alerts-notification {
-  margin: 0;
-}
-</style>
-<paper-material elevation="1" class="card health-pill-legend">
-  <div class="title">
-    Enable all (not just sampled) steps. Requires slow disk read.
-  </div>
-  <paper-toggle-button id="enableAllStepsModeToggle" checked="{{allStepsModeEnabled}}">
-  </paper-toggle-button>
-  <h2>
-    Step of Health Pills:
-    <template is="dom-if" if="[[allStepsModeEnabled]]">
-      <input type="number"
-             id="health-pill-step-number-input"
-             min="0"
-             max="[[_biggestStepEverSeen]]"
-             value="{{specificHealthPillStep::input}}">
-    </template>
-    <template is="dom-if" if="[[!allStepsModeEnabled]]">
-      [[_currentStepDisplayValue]]
-    </template>
-    <paper-spinner-lite active
-                        hidden$=[[!areHealthPillsLoading]]
-                        id="health-pills-loading-spinner"></paper-spinner-lite>
-  </h2>
-  <template is="dom-if" if="[[allStepsModeEnabled]]">
-    <paper-slider
-          id="health-pill-step-slider"
-          immediate-value="{{specificHealthPillStep}}"
-          max="[[_biggestStepEverSeen]]"
-          snaps
-          step="1"
-          value="{{specificHealthPillStep}}"></paper-slider>
-  </template>
-  <template is="dom-if" if="[[!allStepsModeEnabled]]">
-    <template is="dom-if" if="[[_maxStepIndex]]">
-      <paper-slider
-            id="health-pill-step-slider"
-            immediate-value="{{healthPillStepIndex}}"
-            max="[[_maxStepIndex]]"
-            snaps
-            step="1"
-            value="{{healthPillStepIndex}}"></paper-slider>
-    </template>
-  </template>
-  <h2>
-    Health Pill
-    <template is="dom-if" if="[[healthPillValuesForSelectedNode]]">
-      Counts for Selected Node
-    </template>
-    <template is="dom-if" if="[[!healthPillValuesForSelectedNode]]">
-      Legend
-    </template>
-  </h2>
-  <template is="dom-repeat" items="[[healthPillEntries]]">
-    <div class="health-pill-entry">
-      <div class="color-preview" style="background:[[item.background_color]]"></div>
-      <div class="color-label">[[item.label]]</div>
-      <div class="tensor-count">
-        [[_computeTensorCountString(healthPillValuesForSelectedNode, index)]]
-      </div>
-    </div>
-  </template>
-  <div hidden$="[[!_hasDebuggerNumericAlerts(debuggerNumericAlerts)]]">
-    <h2 id="numeric-alerts-header">Numeric Alerts</h2>
-    <p>
-      Alerts are sorted from top to bottom by increasing timestamp.
-    </p>
-    <div id='numeric-alerts-table-container'>
-      <table id="numeric-alerts-table">
-        <thead>
-          <tr>
-            <th>First Offense</th>
-            <th>Tensor (Device)</th>
-            <th id='event-counts-th'>Event Counts</th>
-          </tr>
-        </thead>
-        <tbody id="numeric-alerts-body">
-        </tbody>
-      </table>
-    </div>
-  </div>
-  <template is="dom-if" if="[[!_hasDebuggerNumericAlerts(debuggerNumericAlerts)]]">
-    <p class="no-numeric-alerts-notification">
-      No numeric alerts so far. That is likely good. Alerts indicate the presence of NaN
-      or (+/-) Infinity values, which may be concerning.
-    </p>
-  </template>
-</paper-material>
-</template>
-<script>
-"use strict";
-
-(function() {
-  Polymer({
-    is: 'tf-graph-debugger-data-card',
-
-    properties: {
-      renderHierarchy: Object,
-      debuggerNumericAlerts: {
-        type: Array,
-        notify: true,
-      },
-      nodeNamesToHealthPills: Object,
-      healthPillStepIndex: {
-        type: Number,
-        notify: true,
-      },
-      // Only relevant if we are in all steps mode, in which case the user may want to view health
-      // pills for a specific step.
-      specificHealthPillStep: {
-        type: Number,
-        value: 0,
-        notify: true,
-      },
-      // Two-ways
-      selectedNode: {
-        type: String,
-        notify: true
-      },
-      highlightedNode: {
-        type: String,
-        notify: true
-      },
-      // The enum value of the include property of the selected node.
-      selectedNodeInclude: {
-        type: Number,
-        notify: true
-      },
-      // Whether health pills are currently being loaded, in which case we show a spinner (and the
-      // current health pills shown might be out of date).
-      areHealthPillsLoading: Boolean,
-      healthPillEntries: {
-        type: Array,
-        value: tf.graph.scene.healthPillEntries,
-        readOnly: true,
-      },
-      healthPillValuesForSelectedNode: {
-        type: Array,
-        computed: '_computeHealthPillForNode(nodeNamesToHealthPills, healthPillStepIndex, selectedNode, allStepsModeEnabled, areHealthPillsLoading)',
-      },
-      // When all-steps mode is enabled, the user can request health pills for any step. In this
-      // mode, Tensorboard makes a request every time the user drags the slider to a different step.
-      allStepsModeEnabled: {
-        type: Boolean,
-        notify: true,
-      },
-      // The biggest step value ever seen. Used to determine what steps of health pills to let the
-      // user fetch in all steps mode.
-      _biggestStepEverSeen: {
-        type: Number,
-        computed: '_computeBiggestStepEverSeen(nodeNamesToHealthPills)',
-      },
-      _maxStepIndex: {
-        type: Number,
-        computed: '_computeMaxStepIndex(nodeNamesToHealthPills)',
-      },
-      _currentStepDisplayValue: {
-        type: String,
-        computed: '_computeCurrentStepDisplayValue(nodeNamesToHealthPills, healthPillStepIndex, allStepsModeEnabled, specificHealthPillStep, areHealthPillsLoading)',
-      },
-    },
-    observers: [
-      '_updateAlertsList(debuggerNumericAlerts)',
-    ],
-    ready: function() {
-      var mainContainer = document.getElementById('mainContainer');
-      var scrollbarContainer = document.querySelector('tf-dashboard-layout .scrollbar');
-      if (mainContainer && scrollbarContainer) {
-        // If this component is being used inside of TensorBoard's dashboard layout, it may easily
-        // cause the dashboard layout element to overflow, giving the user 2 scroll bars. Prevent
-        // that by hiding whatever content overflows - the user will have to expand the viewport to
-        // use this debugging card.
-        mainContainer.style.overflow = 'hidden';
-        scrollbarContainer.style.overflow = 'hidden';
-      }
-    },
-    _healthPillsAvailable: function(debuggerDataEnabled, nodeNamesToHealthPills) {
-      // So long as there is a mapping (even if empty) from node name to health pills, show the
-      // legend and slider. We do that because, even if no health pills exist at the current step,
-      // the user may desire to change steps, and the slider must show for the user to do that.
-      return debuggerDataEnabled && nodeNamesToHealthPills;
-    },
-    _computeTensorCountString: function(healthPillValuesForSelectedNode, valueIndex) {
-      if (!healthPillValuesForSelectedNode) {
-        // No health pill data is available.
-        return '';
-      }
-
-      return healthPillValuesForSelectedNode[valueIndex].toFixed(0);
-    },
-    _computeHealthPillForNode: function(
-        nodeNamesToHealthPills, healthPillStepIndex, selectedNode, allStepsModeEnabled, areHealthPillsLoading) {
-      if (areHealthPillsLoading) {
-        // Health pills are loading. Do not render data that is out of date.
-        return null;
-      }
-
-      if (!selectedNode) {
-        // No node is selected.
-        return null;
-      }
-
-      const healthPills = nodeNamesToHealthPills[selectedNode];
-      if (!healthPills) {
-        // This node lacks a health pill.
-        return null;
-      }
-
-      // If all steps mode is enabled, we use the first health pill in the list because the JSON
-      // response from the server is a mapping between node name and a list of 1 health pill.
-      const healthPill = healthPills[allStepsModeEnabled ? 0 : healthPillStepIndex];
-      if (!healthPill) {
-        // This node lacks a health pill at the current step.
-        return null;
-      }
-
-      // The health pill count values start at 2. Each health pill contains 6 values.
-      return healthPill.value.slice(2, 8);
-    },
-    _computeCurrentStepDisplayValue: function(
-        nodeNamesToHealthPills,
-        healthPillStepIndex,
-        allStepsModeEnabled,
-        specificHealthPillStep,
-        areHealthPillsLoading) {
-      if (allStepsModeEnabled) {
-        // The user seeks health pills for specific step from the server.
-        return specificHealthPillStep.toFixed(0);
-      }
-
-      if (areHealthPillsLoading) {
-        // The current step is undefined.
-        return 0;
-      }
-
-      for (let nodeName in nodeNamesToHealthPills) {
-        // All nodes have the same number of steps stored, so only examine 1 node. We cannot
-        // directly index into the nodeNamesToHealthPills object because we do not have a key.
-        // If all steps mode is enabled, we only have 1 step to show.
-        return nodeNamesToHealthPills[nodeName][healthPillStepIndex].step.toFixed(0);
-      }
-
-      // The current step could not be computed.
-      return 0;
-    },
-    _computeBiggestStepEverSeen: function(nodeNamesToHealthPills) {
-      for (let nodeName in nodeNamesToHealthPills) {
-        // All nodes have the same number of steps stored, so only examine 1 node.
-        // The index is 1 less than the count. Tensorboard backend logic guarantees that the length
-        // of the array will be greater than 1.
-        var healthPills = nodeNamesToHealthPills[nodeName];
-        return Math.max(this._biggestStepEverSeen, healthPills[healthPills.length - 1].step);
-      }
-
-      // No steps seen so far. Default to 0.
-      return this._biggestStepEverSeen || 0;
-    },
-    _computeMaxStepIndex: function(nodeNamesToHealthPills) {
-      for (let nodeName in nodeNamesToHealthPills) {
-        // All nodes have the same number of steps stored, so only examine 1 node.
-        // The index is 1 less than the count. Tensorboard backend logic guarantees that the length
-        // of the array will be greater than 1.
-        return nodeNamesToHealthPills[nodeName].length - 1;
-      }
-
-      // Return a falsy value. The slider should be hidden.
-      return 0;
-    },
-    _hasDebuggerNumericAlerts: function(debuggerNumericAlerts) {
-      return debuggerNumericAlerts && debuggerNumericAlerts.length;
-    },
-    _updateAlertsList: function(debuggerNumericAlerts) {
-      var alertBody = this.$$('#numeric-alerts-body');
-      if (!alertBody) {
-        return;
-      }
-
-      alertBody.innerHTML = '';
-
-      for (var i = 0; i < debuggerNumericAlerts.length; i++) {
-        var alert = debuggerNumericAlerts[i];
-        var tableRow = document.createElement('tr');
-
-        var timestampTd = document.createElement('td');
-        timestampTd.innerHTML = tf.graph.util.computeHumanFriendlyTime(alert.first_timestamp);
-        timestampTd.classList.add('first-offense-td');
-        tableRow.appendChild(timestampTd);
-
-        var tensorDeviceTd = document.createElement('td');
-        tensorDeviceTd.classList.add('tensor-device-td')
-
-        var tensorSection = document.createElement('div');
-        tensorSection.classList.add('tensor-section-within-table');
-        tensorSection.innerHTML = alert.tensor_name;
-        this._addOpExpansionListener(tensorSection, alert.tensor_name);
-        tensorDeviceTd.appendChild(tensorSection);
-
-        var deviceSection = document.createElement('div');
-        deviceSection.classList.add('device-section-within-table');
-        deviceSection.innerHTML = '(' + alert.device_name + ')';
-        tensorDeviceTd.appendChild(deviceSection);
-        tableRow.appendChild(tensorDeviceTd);
-
-        var miniHealthPill = document.createElement('div');
-        miniHealthPill.classList.add('mini-health-pill');
-
-        var miniHealthPillTd = document.createElement('td');
-        miniHealthPillTd.classList.add('mini-health-pill-td');
-        miniHealthPillTd.appendChild(miniHealthPill);
-        tableRow.appendChild(miniHealthPillTd);
-
-        if (alert.neg_inf_event_count) {
-          var negativeInfCountSection = document.createElement('div');
-          negativeInfCountSection.classList.add('negative-inf-mini-health-pill-section');
-          negativeInfCountSection.innerHTML = alert.neg_inf_event_count;
-          negativeInfCountSection.setAttribute(
-              'title', alert.neg_inf_event_count + ' events with -∞')
-          miniHealthPill.appendChild(negativeInfCountSection);
-        }
-
-        if (alert.pos_inf_event_count) {
-          var positiveInfCountSection = document.createElement('div');
-          positiveInfCountSection.classList.add('positive-inf-mini-health-pill-section');
-          positiveInfCountSection.innerHTML = alert.pos_inf_event_count;
-          positiveInfCountSection.setAttribute(
-              'title', alert.pos_inf_event_count + ' events with +∞')
-          miniHealthPill.appendChild(positiveInfCountSection);
-        }
-
-        if (alert.nan_event_count) {
-          var nanCountSection = document.createElement('div');
-          nanCountSection.classList.add('nan-mini-health-pill-section');
-          nanCountSection.innerHTML = alert.nan_event_count;
-          nanCountSection.setAttribute(
-              'title', alert.nan_event_count + ' events with NaN')
-          miniHealthPill.appendChild(nanCountSection);
-        }
-
-        Polymer.dom(alertBody).appendChild(tableRow);
-      }
-    },
-    // Adds a listener to an element, so that when that element is clicked, the tensor with
-    // tensorName expands.
-    _addOpExpansionListener: function(clickableElement, tensorName) {
-      clickableElement.addEventListener('click', () => {
-        // When the user clicks on a tensor name, expand all nodes until the user can see the
-        // associated node.
-        var nameOfNodeToSelect = tf.graph.render.expandUntilNodeIsShown(
-            document.getElementById('scene'), this.renderHierarchy, tensorName);
-
-        // Store the current scroll of the graph info card. Node selection alters that scroll, and
-        // we restore the scroll later.
-        var previousScrollFromBottom;
-        var graphInfoCard = document.querySelector('tf-graph-info#graph-info');
-        if (graphInfoCard) {
-          previousScrollFromBottom = graphInfoCard.scrollHeight - graphInfoCard.scrollTop;
-        }
-
-        // Update the selected node within graph logic.
-        var previousSelectedNode = this.selectedNode;
-        this.set('selectedNode', nameOfNodeToSelect);
-
-        // Scroll the graph info card back down if necessary so that user can see the alerts section
-        // again. Selecting the node causes the info card to scroll to the top, which may mean the
-        // user no longer sees the list of alerts.
-        var scrollToOriginalLocation = () => {
-          graphInfoCard.scrollTop = graphInfoCard.scrollHeight - previousScrollFromBottom;
-        };
-        if (graphInfoCard) {
-          // This component is used within an info card. Restore the original scroll.
-          if (previousSelectedNode) {
-            // The card for the selected node has already opened. Immediately restore the scroll.
-            scrollToOriginalLocation();
-          } else {
-            // Give some time for the DOM of the info card to be created before scrolling down.
-            window.setTimeout(scrollToOriginalLocation, 20);
-          }
-        }
-      });
-    },
-  });
-})();
-</script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_graph_info/BUILD b/tensorflow/tensorboard/components/tf_graph_info/BUILD
deleted file mode 100644
index 22e886d881ef7941198dc35ac4cfba92cb5cb0fa..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_info/BUILD
+++ /dev/null
@@ -1,53 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_graph_info",
-    srcs = [
-        "tf-graph-icon.html",
-        "tf-graph-info.html",
-        "tf-node-info.html",
-        "tf-node-list-item.html",
-    ],
-    path = "/tf-graph-info",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_graph_common",
-        "//tensorflow/tensorboard/components/tf_graph_debugger_data_card",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "@org_polymer_iron_collapse",
-        "@org_polymer_iron_list",
-        "@org_polymer_paper_icon_button",
-        "@org_polymer_paper_item",
-        "@org_polymer_paper_slider",
-        "@org_polymer_paper_spinner",
-    ],
-)
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [":tf_graph_info"],
-    destdir = "tf-graph-info",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
-        "//tensorflow/tensorboard/components/tf_graph_common:legacy",
-        "//tensorflow/tensorboard/components/tf_graph_debugger_data_card:legacy",
-        "//third_party/javascript/polymer/v1/iron-collapse:lib",
-        "//third_party/javascript/polymer/v1/iron-list:lib",
-        "//third_party/javascript/polymer/v1/paper-icon-button:lib",
-        "//third_party/javascript/polymer/v1/paper-item:lib",
-        "//third_party/javascript/polymer/v1/paper-slider:lib",
-        "//third_party/javascript/polymer/v1/paper-spinner:lib",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_graph_info/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_info/demo/BUILD
deleted file mode 100644
index 2f1f7bf276176707ec33d45a744e35f8af47fd81..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_info/demo/BUILD
+++ /dev/null
@@ -1,26 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_info/demo
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"] + glob(["data/**"]),
-    path = "/tf-graph-info/demo",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_graph_common",
-        "//tensorflow/tensorboard/components/tf_graph_info",
-        "//tensorflow/tensorboard/components/tf_graph_loader",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_graph_info/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_info/demo/data/graph.pbtxt
deleted file mode 100644
index 30b206453469801d31b46856c29cdda78164f18f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_info/demo/data/graph.pbtxt
+++ /dev/null
@@ -1,4606 +0,0 @@
-node {
-  name: "GradientDescent/learning_rate"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_3"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 0.1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 100
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000d\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/BroadcastGradientArgs"
-  op: "BroadcastGradientArgs"
-  input: "gradients/add_grad/Shape"
-  input: "gradients/add_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-        }
-        shape {
-          dim {
-            size: -1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 10
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/BroadcastGradientArgs"
-  op: "BroadcastGradientArgs"
-  input: "gradients/add_1_grad/Shape"
-  input: "gradients/add_1_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-        }
-        shape {
-          dim {
-            size: -1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_1_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: -1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_3_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Maximum/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Const_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Const"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Prod_1"
-  op: "Prod"
-  input: "gradients/Mean_grad/Shape_1"
-  input: "gradients/Mean_grad/Const_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Maximum"
-  op: "Maximum"
-  input: "gradients/Mean_grad/Prod_1"
-  input: "gradients/Mean_grad/Maximum/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Prod"
-  op: "Prod"
-  input: "gradients/Mean_grad/Shape"
-  input: "gradients/Mean_grad/Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/floordiv"
-  op: "FloorDiv"
-  input: "gradients/Mean_grad/Prod"
-  input: "gradients/Mean_grad/Maximum"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Cast"
-  op: "Cast"
-  input: "gradients/Mean_grad/floordiv"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "DstT"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "SrcT"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Tile/multiples"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Reshape/shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Const"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Fill"
-  op: "Fill"
-  input: "gradients/Shape"
-  input: "gradients/Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/Fill"
-  input: "gradients/Mean_grad/Reshape/shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Tile"
-  op: "Tile"
-  input: "gradients/Mean_grad/Reshape"
-  input: "gradients/Mean_grad/Tile/multiples"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tmultiples"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/truediv"
-  op: "RealDiv"
-  input: "gradients/Mean_grad/Tile"
-  input: "gradients/Mean_grad/Cast"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_3_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/Mean_grad/truediv"
-  input: "gradients/Reshape_3_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
-  op: "ExpandDims"
-  input: "gradients/Reshape_3_grad/Reshape"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tdim"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Const"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "Slice_2/begin"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "Sub_2/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "concat_1/axis"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "concat_1/values_0"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: -1
-      }
-    }
-  }
-}
-node {
-  name: "Slice_1/size"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Sub_1/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Shape_2"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "Rank_2"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 2
-      }
-    }
-  }
-}
-node {
-  name: "Sub_1"
-  op: "Sub"
-  input: "Rank_2"
-  input: "Sub_1/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Slice_1/begin"
-  op: "Pack"
-  input: "Sub_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 1
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: 0
-    }
-  }
-}
-node {
-  name: "Slice_1"
-  op: "Slice"
-  input: "Shape_2"
-  input: "Slice_1/begin"
-  input: "Slice_1/size"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "Index"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "concat_1"
-  op: "ConcatV2"
-  input: "concat_1/values_0"
-  input: "Slice_1"
-  input: "concat_1/axis"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 2
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "concat/axis"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "concat/values_0"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: -1
-      }
-    }
-  }
-}
-node {
-  name: "Slice/size"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Sub/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "Rank_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 2
-      }
-    }
-  }
-}
-node {
-  name: "Sub"
-  op: "Sub"
-  input: "Rank_1"
-  input: "Sub/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Slice/begin"
-  op: "Pack"
-  input: "Sub"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 1
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: 0
-    }
-  }
-}
-node {
-  name: "Slice"
-  op: "Slice"
-  input: "Shape_1"
-  input: "Slice/begin"
-  input: "Slice/size"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "Index"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "concat"
-  op: "ConcatV2"
-  input: "concat/values_0"
-  input: "Slice"
-  input: "concat/axis"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 2
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "Rank"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 2
-      }
-    }
-  }
-}
-node {
-  name: "Sub_2"
-  op: "Sub"
-  input: "Rank"
-  input: "Sub_2/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Slice_2/size"
-  op: "Pack"
-  input: "Sub_2"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 1
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: 0
-    }
-  }
-}
-node {
-  name: "Slice_2"
-  op: "Slice"
-  input: "Shape"
-  input: "Slice_2/begin"
-  input: "Slice_2/size"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "Index"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "logits_biases"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 10
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "logits_biases/read"
-  op: "Identity"
-  input: "logits_biases"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "logits_weights"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 100
-        }
-        dim {
-          size: 10
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "logits_weights/read"
-  op: "Identity"
-  input: "logits_weights"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "hidden_biases"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 100
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "hidden_biases/read"
-  op: "Identity"
-  input: "hidden_biases"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "hidden_weights"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 784
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 784
-        }
-        dim {
-          size: 100
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "hidden_weights/read"
-  op: "Identity"
-  input: "hidden_weights"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 784
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Reshape/shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\377\377\377\377"
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot/depth"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 10
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot/off_value"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot/on_value"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_1/random_shuffle_queue"
-  op: "RandomShuffleQueueV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "capacity"
-    value {
-      i: 20000
-    }
-  }
-  attr {
-    key: "component_types"
-    value {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "min_after_dequeue"
-    value {
-      i: 4000
-    }
-  }
-  attr {
-    key: "seed"
-    value {
-      i: 0
-    }
-  }
-  attr {
-    key: "seed2"
-    value {
-      i: 0
-    }
-  }
-  attr {
-    key: "shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 28
-          }
-          dim {
-            size: 28
-          }
-          dim {
-            size: 1
-          }
-        }
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
-  op: "QueueDequeueManyV2"
-  input: "mnist_dataset_train_1/random_shuffle_queue"
-  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          unknown_rank: true
-        }
-        shape {
-          unknown_rank: true
-        }
-      }
-    }
-  }
-  attr {
-    key: "component_types"
-    value {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    key: "timeout_ms"
-    value {
-      i: -1
-    }
-  }
-}
-node {
-  name: "Reshape"
-  op: "Reshape"
-  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
-  input: "Reshape/shape"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: -1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "MatMul"
-  op: "MatMul"
-  input: "Reshape"
-  input: "hidden_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "add"
-  op: "Add"
-  input: "MatMul"
-  input: "hidden_biases/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Relu"
-  op: "Relu"
-  input: "add"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "MatMul_1"
-  op: "MatMul"
-  input: "Relu"
-  input: "logits_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "add_1"
-  op: "Add"
-  input: "MatMul_1"
-  input: "logits_biases/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Reshape_1"
-  op: "Reshape"
-  input: "add_1"
-  input: "concat"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot"
-  op: "OneHot"
-  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
-  input: "mnist_dataset_train_2/one_hot/depth"
-  input: "mnist_dataset_train_2/one_hot/on_value"
-  input: "mnist_dataset_train_2/one_hot/off_value"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "TI"
-    value {
-      type: DT_INT64
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          unknown_rank: true
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: -1
-    }
-  }
-}
-node {
-  name: "Reshape_2"
-  op: "Reshape"
-  input: "mnist_dataset_train_2/one_hot"
-  input: "concat_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "SoftmaxCrossEntropyWithLogits"
-  op: "SoftmaxCrossEntropyWithLogits"
-  input: "Reshape_1"
-  input: "Reshape_2"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
-  op: "PreventGradient"
-  input: "SoftmaxCrossEntropyWithLogits:1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "message"
-    value {
-      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
-  op: "Mul"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_1_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
-  input: "gradients/Reshape_1_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Sum_1"
-  op: "Sum"
-  input: "gradients/Reshape_1_grad/Reshape"
-  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Reshape_1"
-  op: "Reshape"
-  input: "gradients/add_1_grad/Sum_1"
-  input: "gradients/add_1_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Sum"
-  op: "Sum"
-  input: "gradients/Reshape_1_grad/Reshape"
-  input: "gradients/add_1_grad/BroadcastGradientArgs"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/add_1_grad/Sum"
-  input: "gradients/add_1_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/add_1_grad/Reshape"
-  input: "^gradients/add_1_grad/Reshape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/add_1_grad/Reshape_1"
-  input: "^gradients/add_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_1_grad/Reshape_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "logits_biases"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/add_1_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/tuple/control_dependency"
-  op: "Identity"
-  input: "gradients/add_1_grad/Reshape"
-  input: "^gradients/add_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_1_grad/Reshape"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/MatMul_1"
-  op: "MatMul"
-  input: "Relu"
-  input: "gradients/add_1_grad/tuple/control_dependency"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: true
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/MatMul"
-  op: "MatMul"
-  input: "gradients/add_1_grad/tuple/control_dependency"
-  input: "logits_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: true
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/MatMul_1_grad/MatMul"
-  input: "^gradients/MatMul_1_grad/MatMul_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/MatMul_1_grad/MatMul_1"
-  input: "^gradients/MatMul_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "logits_weights"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/tuple/control_dependency"
-  op: "Identity"
-  input: "gradients/MatMul_1_grad/MatMul"
-  input: "^gradients/MatMul_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/MatMul_1_grad/MatMul"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Relu_grad/ReluGrad"
-  op: "ReluGrad"
-  input: "gradients/MatMul_1_grad/tuple/control_dependency"
-  input: "Relu"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Sum_1"
-  op: "Sum"
-  input: "gradients/Relu_grad/ReluGrad"
-  input: "gradients/add_grad/BroadcastGradientArgs:1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Reshape_1"
-  op: "Reshape"
-  input: "gradients/add_grad/Sum_1"
-  input: "gradients/add_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Sum"
-  op: "Sum"
-  input: "gradients/Relu_grad/ReluGrad"
-  input: "gradients/add_grad/BroadcastGradientArgs"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/add_grad/Sum"
-  input: "gradients/add_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/add_grad/Reshape"
-  input: "^gradients/add_grad/Reshape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/add_grad/Reshape_1"
-  input: "^gradients/add_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_grad/Reshape_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "hidden_biases"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/add_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/tuple/control_dependency"
-  op: "Identity"
-  input: "gradients/add_grad/Reshape"
-  input: "^gradients/add_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_grad/Reshape"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/MatMul_1"
-  op: "MatMul"
-  input: "Reshape"
-  input: "gradients/add_grad/tuple/control_dependency"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: true
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/MatMul"
-  op: "MatMul"
-  input: "gradients/add_grad/tuple/control_dependency"
-  input: "hidden_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 784
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: true
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/MatMul_grad/MatMul"
-  input: "^gradients/MatMul_grad/MatMul_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/MatMul_grad/MatMul_1"
-  input: "^gradients/MatMul_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/MatMul_grad/MatMul_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "hidden_weights"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/MatMul_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 784
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "GradientDescent"
-  op: "NoOp"
-  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
-  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
-  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
-  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_2"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "Reshape_3"
-  op: "Reshape"
-  input: "SoftmaxCrossEntropyWithLogits"
-  input: "Slice_2"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Mean"
-  op: "Mean"
-  input: "Reshape_3"
-  input: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "_send_Mean_0"
-  op: "_Send"
-  input: "Mean"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "client_terminated"
-    value {
-      b: true
-    }
-  }
-  attr {
-    key: "recv_device"
-    value {
-      s: "/job:localhost/replica:0/task:0/cpu:0"
-    }
-  }
-  attr {
-    key: "send_device"
-    value {
-      s: "/job:localhost/replica:0/task:0/cpu:0"
-    }
-  }
-  attr {
-    key: "send_device_incarnation"
-    value {
-      i: -5924635994370253548
-    }
-  }
-  attr {
-    key: "tensor_name"
-    value {
-      s: "Mean:0"
-    }
-  }
-}
-library {
-}
-versions {
-  producer: 21
-}
diff --git a/tensorflow/tensorboard/components/tf_graph_info/demo/index.html b/tensorflow/tensorboard/components/tf_graph_info/demo/index.html
deleted file mode 100644
index f7d2ef7ee5e56a870b1b49cfff3dd416953f3fa3..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_info/demo/index.html
+++ /dev/null
@@ -1,94 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../tf-graph-info.html">
-<link rel="import" href="../../tf-graph-common/tf-graph-common.html">
-<link rel="import" href="../../tf-graph-loader/tf-graph-loader.html">
-<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-<title>TF Graph Info Demo</title>
-<style>
-  #demo-container {
-    border: 2px solid #808080;
-    width: 1000px;
-    height: 600px;
-  }
-</style>
-<demo-snippet>
-  <template>
-    <dom-module id="tf-graph-info-demo">
-      <template>
-        <!-- We first use the graph loader to load and parse a pbtxt file into a graph object. -->
-        <tf-graph-loader
-            id="loader"
-            datasets="[[_datasets]]"
-            selected-dataset="[[_selectedDataset]]"
-            out-graph="{{_graph}}">
-        </tf-graph-loader>
-
-        <tf-graph-info id="info" title="selected"></tf-graph-info>
-      </template>
-      <script>
-        "use strict";
-
-        Polymer({
-          is: "tf-graph-info-demo",
-          properties: {
-            // We tell the graph loader to load a specific pbtxt file.
-            _datasets: {
-              type: Array,
-              value: [{
-                "name": "Graph with XLA Clusters Specified",
-                "path": "data/graph.pbtxt"
-              }],
-            },
-            _selectedDataset: {
-              type: Number,
-              value: 0,
-            },
-
-            // This property will be updated by the graph loader.
-            _graph: {
-              type: Object,
-            },
-          },
-          observers: [
-            '_graphUpdated(_graph)',
-          ],
-          _graphUpdated: function(slimGraph) {
-            const tracker = tf.graph.util.getTracker(this.$.loader);
-            const hierarchyTracker = tf.graph.util.getSubtaskTracker(
-                tracker, 100, 'Namespace hierarchy');
-            const hierarchyOptions = {};
-            tf.graph.hierarchy.build(slimGraph, hierarchyOptions, hierarchyTracker).then(
-                function(graphHierarchy) {
-              // We have parsed and built the graph object from a pbtxt file. Show info.
-              this.$.info.set('graph', slimGraph);
-              this.$.info.set('graphHierarchy', graphHierarchy);
-
-              // Select a node within that graph.
-              this.$.info.set('selectedNode', 'GradientDescent/learning_rate');
-            }.bind(this));
-          },
-        });
-      </script>
-    </dom-module>
-    <div id='demo-container'>
-      <tf-graph-info-demo></tf-graph-info-demo>
-    </div>
-  </template>
-</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_info/tf-graph-icon.html b/tensorflow/tensorboard/components/tf_graph_info/tf-graph-icon.html
deleted file mode 100644
index a3e9dc59c5abcb649d07362c1d60edf656c26d67..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_info/tf-graph-icon.html
+++ /dev/null
@@ -1,296 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
-
-<dom-module id="tf-graph-icon">
-  <style>
-    .faded-rect {
-      fill: url(#rectHatch);
-    }
-
-    .faded-ellipse {
-      fill: url(#ellipseHatch);
-    }
-
-    .faded-rect, .faded-ellipse, .faded-series {
-      stroke:   var(--tb-graph-faded) !important;
-    }
-  </style>
-  <template>
-    <template is="dom-if" if="[[_isType(node, type, 'OP')]]">
-      <template is="dom-if" if="[[_isConst(node, const)]]">
-        <svg height$="[[height]]"
-            preserveAspectRatio="xMinYMid meet" viewBox="0 0 10 10">
-          <circle cx="5" cy="5" r="3"
-              fill$="[[_getFill(_computedFill, 'OP')]]"
-              stroke$="[[_getStroke(_computedFill, 'OP')]]" />
-        </svg>
-      </template>
-      <template is="dom-if" if="[[_isSummary(node, summary)]]">
-        <svg width$="[[height]]" height$="[[height]]" viewBox="0 0 12 12">
-          <use x="0" y="0" xlink:href="#summary-icon" />
-        </svg>
-      </template>
-      <template is="dom-if" if="[[_isRegularOp(node, const, summary)]]">
-        <svg height$="[[height]]"
-            preserveAspectRatio="xMinYMid meet" viewBox="0 0 16 8">
-          <use xmlns:xlink="http://www.w3.org/1999/xlink"
-              xlink:href="#op-node-stamp"
-              fill$="[[_getFill(_computedFill, 'OP')]]"
-              stroke$="[[_getStroke(_computedFill, 'OP')]]"
-              class$="{{_fadedClass(renderInfo, 'ellipse')}}"
-              x="8" y="4" />
-        </svg>
-      </template>
-    </template>
-    <template is="dom-if" if="[[_isType(node, type, 'META')]]">
-      <svg height$="[[height]]"
-            preserveAspectRatio="xMinYMid meet" viewBox="0 0 37 16">
-        <rect x="1" y="1"
-            fill$="[[_getFill(_computedFill, 'META')]]"
-            stroke$="[[_getStroke(_computedFill, 'META')]]"
-            class$="{{_fadedClass(renderInfo, 'rect')}}"
-            stroke-width="2px"
-            height="14" width="35"
-            rx="5" ry="5"/>
-      </svg>
-    </template>
-    <template is="dom-if" if="[[_isType(node, type, 'SERIES')]]">
-      <template is="dom-if" if="[[_isVertical(node, vertical)]]">
-        <svg height$="[[height]]"
-            preserveAspectRatio="xMinYMid meet" viewBox="0 0 16 15">
-          <use xmlns:xlink="http://www.w3.org/1999/xlink"
-              xlink:href="#op-series-vertical-stamp"
-              fill$="[[_getFill(_computedFill, 'SERIES')]]"
-              stroke$="[[_getStroke(_computedFill, 'SERIES')]]"
-              class$="{{_fadedClass(renderInfo, 'series')}}"
-              x="0" y="2" />
-        </svg>
-      </template>
-      <template is="dom-if" if="[[!_isVertical(node, vertical)]]">
-        <svg height$="[[height]]"
-            preserveAspectRatio="xMinYMid meet" viewBox="0 0 24 10">
-          <use xmlns:xlink="http://www.w3.org/1999/xlink"
-              xlink:href="#op-series-horizontal-stamp"
-              fill$="[[_getFill(_computedFill, 'SERIES')]]"
-              stroke$="[[_getStroke(_computedFill, 'SERIES')]]"
-              class$="{{_fadedClass(renderInfo, 'series')}}"
-              x="0" y="1" />
-        </svg>
-      </template>
-    </template>
-  </template>
-
-  <script>
-    (function() {
-      Polymer({
-        is: 'tf-graph-icon',
-
-        properties: {
-          /**
-           * Node to represent with an icon. Optional, but if specified, its
-           * properties override those defined in the type, vertical, const and
-           * summary properties.
-           * @type {tf.graph.Node}
-           */
-          node: {
-            type: Object,
-            value: null
-          },
-
-          /**
-           * Render node information associated with this node. Optional. If
-           * specified, this is only used when computing the fill of the icon
-           * element.
-           * @type {tf.graph.render.RenderNodeInfo}
-           */
-          renderInfo: {
-            type: Object,
-            value: null
-          },
-
-          /**
-           * String indicating the type of coloring to use for this node, used
-           * only for determining the fill.
-           */
-          colorBy: {
-            type: Object,
-            value: "structural"
-          },
-
-          /**
-           * Function used by structural coloring algorithm to determine which
-           * color to use based on the template ID of the node. Optional.
-           */
-          templateIndex: {
-            type: Function,
-            value: null
-          },
-
-          /** Type of node to draw (ignored if node is set). */
-          type: {
-            type: String,
-            value: null
-          },
-
-          /** Direction for series (ignored for other types). */
-          vertical: {
-            type: Boolean,
-            value: false
-          },
-
-          /** Whether the op is Const (ignored for non-ops). */
-          const: {
-            type: Boolean,
-            value: false
-          },
-
-          /** Whether the op is a Summary (ignored for non-ops). */
-          summary: {
-            type: Boolean,
-            value: false
-          },
-
-          /**
-           * Fill for the icon, optional. If fill is specified and node is not
-           * specified, then this value will override the default for the
-           * element. However, if node is specified, this value will be ignored.
-           */
-          fill: {
-            type: String,
-            value: null
-          },
-
-          /** Height of the SVG element in pixels, used for scaling. */
-          height: {
-            type: Number,
-            value: 20
-          },
-
-          /** The computed fill for the node. **/
-          _computedFill: {
-            type: String,
-            computed:
-              "_getComputedFill(node, renderInfo, colorBy, templateIndex, fill)"
-          }
-
-        },
-
-        /**
-         * Get the computed fill value for the element.
-         */
-        _getComputedFill: function(inputNode, inputRenderInfo, inputColorBy,
-            inputTemplateIndex, inputFill) {
-          if (inputNode && inputRenderInfo &&
-              inputColorBy && inputTemplateIndex) {
-            var ns = tf.graph.scene.node;
-            var colorBy = ns.ColorBy[inputColorBy.toUpperCase()];
-            return ns.getFillForNode(inputTemplateIndex, colorBy,
-                inputRenderInfo, false);
-          }
-          return inputFill;
-        },
-
-        /**
-         * Get the fill value for the element, or if that's not possible, return
-         * the default fill value for the node type.
-         */
-        _getFill: function(inputComputedFill, inputNodeType) {
-          return inputComputedFill || ({
-            OP: tf.graph.render.OpNodeColors.DEFAULT_FILL,
-            META: tf.graph.render.MetanodeColors.DEFAULT_FILL,
-            SERIES: tf.graph.render.SeriesNodeColors.DEFAULT_FILL
-          })[inputNodeType];
-        },
-
-        /**
-         * Get the stroke value for the element, or if that's not possible,
-         * return the default stroke value for the node type.
-         */
-        _getStroke: function(inputComputedFill, inputNodeType) {
-          return inputComputedFill ?
-            tf.graph.scene.node.getStrokeForFill(inputComputedFill) :
-            ({
-              OP: tf.graph.render.OpNodeColors.DEFAULT_STROKE,
-              META: tf.graph.render.MetanodeColors.DEFAULT_STROKE,
-              SERIES: tf.graph.render.SeriesNodeColors.DEFAULT_STROKE
-            })[inputNodeType];
-        },
-
-        /**
-         * Test whether the specified node's type, or the literal type string,
-         * match a particular other type.
-         */
-        _isType: function(inputNode, inputType, targetType) {
-          if (inputNode) {
-            return tf.graph.NodeType[inputNode.type] === targetType;
-          }
-          return inputType === targetType;
-        },
-
-        /**
-         * Test whether the specified node should be represented as a vertical
-         * series. Defaults to the value of the vertical property if node is
-         * not specified.
-         */
-        _isVertical: function(inputNode, inputVertical) {
-          if (inputNode) {
-            return inputNode.hasNonControlEdges;
-          }
-          return !!inputVertical;
-        },
-
-        /**
-         * Test whether the specified node is a constant. Defaults to the value
-         * of the const property if node is not specified.
-         */
-        _isConst: function(inputNode, inputConst) {
-          if (inputNode) {
-            return inputNode.op === 'Const';
-          }
-          return !!inputConst;
-        },
-
-        /**
-         * Test whether the specified node is a summary. Defaults to the value
-         * of the summary property if node is not specified.
-         */
-        _isSummary: function(inputNode, inputSummary) {
-          if (inputNode) {
-            return this._isType(inputNode, null, 'OP') &&
-                inputNode.op.substr(-7) === 'Summary';
-          }
-          return !!inputSummary;
-        },
-
-        /**
-         * Test whether the op node is a regular non-summary non-const node.
-         */
-        _isRegularOp: function(inputNode, inputConst, inputSummary) {
-          return !this._isConst(inputNode, inputConst) &&
-              !this._isSummary(inputNode, inputSummary);
-        },
-
-        _fadedClass: function(itemRenderInfo, shape) {
-          return itemRenderInfo && itemRenderInfo.isFadedOut ? 'faded-' + shape : '';
-        }
-      });
-    })();
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_graph_info/tf-graph-info.html b/tensorflow/tensorboard/components/tf_graph_info/tf-graph-info.html
deleted file mode 100644
index bac25b67f77bf87fee1b40f7842fdf51acbfc3bf..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_info/tf-graph-info.html
+++ /dev/null
@@ -1,130 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-slider/paper-slider.html">
-<link rel="import" href="../paper-spinner/paper-spinner-lite.html">
-<link rel="import" href="../tf-graph-common/tf-graph-common.html">
-<link rel="import" href="../tf-graph-debugger-data-card/tf-graph-debugger-data-card.html">
-<link rel="import" href="tf-node-info.html">
-
-<dom-module id="tf-graph-info">
-<template>
-<style>
-:host {
-  font-size: 12px;
-  margin: 0;
-  padding: 0;
-  display: block;
-  max-height: 650px;
-  overflow-x: hidden;
-  overflow-y: auto;
-}
-
-h2 {
-  padding: 0;
-  text-align: center;
-  margin: 0;
-}
-</style>
-<template is="dom-if" if="{{selectedNode}}">
-  <paper-material elevation="1" class="card">
-    <tf-node-info graph-hierarchy="[[graphHierarchy]]"
-                  render-hierarchy="[[renderHierarchy]]"
-                  flat-graph="[[graph]]"
-                  node-name="[[selectedNode]]"
-                  node-include="[[selectedNodeInclude]]"
-                  highlighted-node="{{highlightedNode}}"
-                  color-by="[[colorBy]]">
-    </tf-node-info>
-  </paper-material>
-</template>
-<template is="dom-if" if="[[_healthPillsAvailable(debuggerDataEnabled, nodeNamesToHealthPills)]]">
-  <tf-graph-debugger-data-card render-hierarchy="[[renderHierarchy]]"
-                               debugger-numeric-alerts="[[debuggerNumericAlerts]]"
-                               node-names-to-health-pills="[[nodeNamesToHealthPills]]"
-                               render-hierarchy="[[renderHierarchy]]"
-                               selected-node="{{selectedNode}}"
-                               highlighted-node="{{highlightedNode}}"
-                               are-health-pills-loading="[[areHealthPillsLoading]]"
-                               all-steps-mode-enabled="{{allStepsModeEnabled}}"
-                               specific-health-pill-step="{{specificHealthPillStep}}"
-                               health-pill-step-index="{{healthPillStepIndex}}">
-  </tf-graph-debugger-data-card>
-</template>
-</template>
-<script>
-"use strict";
-
-(function() {
-  Polymer({
-    is: 'tf-graph-info',
-
-    properties: {
-      title: String,
-      graphHierarchy: Object,
-      graph: Object,
-      renderHierarchy: Object,
-      nodeNamesToHealthPills: Object,
-      healthPillStepIndex: {
-        type: Number,
-        notify: true,
-      },
-      colorBy: String,
-      // Two-ways
-      selectedNode: {
-        type: String,
-        notify: true
-      },
-      highlightedNode: {
-        type: String,
-        notify: true
-      },
-      // The enum value of the include property of the selected node.
-      selectedNodeInclude: {
-        type: Number,
-        notify: true
-      },
-      // Whether debugger data is enabled for this instance of Tensorboard.
-      debuggerDataEnabled: Boolean,
-      // Whether health pills are currently being loaded, in which case we show a spinner (and the
-      // current health pills shown might be out of date).
-    },
-    listeners: {
-      'node-list-item-click': '_nodeListItemClicked',
-      'node-list-item-mouseover': '_nodeListItemMouseover',
-      'node-list-item-mouseout': '_nodeListItemMouseout'
-    },
-    _nodeListItemClicked: function(event) {
-      this.selectedNode = event.detail.nodeName;
-    },
-    _nodeListItemMouseover: function(event) {
-      this.highlightedNode = event.detail.nodeName;
-    },
-    _nodeListItemMouseout: function() {
-      this.highlightedNode = null;
-    },
-    _healthPillsAvailable: function(debuggerDataEnabled, nodeNamesToHealthPills) {
-      // So long as there is a mapping (even if empty) from node name to health pills, show the
-      // legend and slider. We do that because, even if no health pills exist at the current step,
-      // the user may desire to change steps, and the slider must show for the user to do that.
-      return debuggerDataEnabled && nodeNamesToHealthPills;
-    },
-  });
-})();
-</script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_graph_info/tf-node-info.html b/tensorflow/tensorboard/components/tf_graph_info/tf-node-info.html
deleted file mode 100644
index 66a3034b5b2c845cf6071585532d73a1bf38d142..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_info/tf-node-info.html
+++ /dev/null
@@ -1,652 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../iron-collapse/iron-collapse.html">
-<link rel="import" href="../iron-list/iron-list.html">
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-icon-button/paper-icon-button.html">
-<link rel="import" href="../paper-item/paper-item.html">
-<link rel="import" href="../paper-item/paper-item-body.html">
-<link rel="import" href="../tf-graph-common/tf-graph-common.html">
-<link rel="import" href="tf-graph-icon.html">
-<link rel="import" href="tf-node-list-item.html">
-
-<dom-module id="tf-node-info">
-  <style>
-  .sub-list-group {
-    font-weight: 500;
-    font-size: 12pt;
-    padding-bottom: 8px;
-    width: 100%;
-  }
-
-  .sub-list {
-    max-height: 300px;
-    overflow-y: scroll;
-  }
-
-  .attr-left {
-    float: left;
-    width: 30%;
-    word-wrap: break-word;
-    color: #565656;
-    font-size: 11pt;
-    font-weight: 400;
-  }
-
-  .attr-right {
-    margin-left: 30%;
-    word-wrap: break-word;
-    color: #565656;
-    font-weight: 400;
-  }
-
-  .sub-list-table {
-    display: table;
-    width: 100%;
-  }
-
-  .sub-list-table-row {
-    display: table-row;
-  }
-
-  .sub-list-table-row .sub-list-table-cell:last-child {
-    text-align: right;
-  }
-
-  .sub-list-table-cell {
-    color: #565656;
-    display: table-cell;
-    font-size: 11pt;
-    font-weight: 400;
-    max-width: 200px;
-    padding: 0 4px;
-  }
-
-  paper-item {
-    padding: 0;
-    background: #e9e9e9;
-  }
-
-  paper-item-body[two-line] {
-    min-height: 0;
-    padding: 8px 12px 4px;
-  }
-
-  .expandedInfo {
-    padding: 8px 12px;
-  }
-
-  .controlDeps {
-    padding: 0 0 0 8px;
-  }
-
-  .node-name {
-    white-space: normal;
-    word-wrap: break-word;
-    font-size: 14pt;
-    font-weight: 500;
-  }
-
-  .node-icon {
-    float: right;
-  }
-
-  .subtitle {
-    font-size: 12pt;
-    color: #5e5e5e;
-  }
-
-  .controlLine {
-    font-size: 11pt;
-    font-weight: 400;
-  }
-
-  .toggle-button {
-    float: right;
-    max-height: 20px;
-    max-width: 20px;
-    padding: 0;
-  }
-
-  .control-toggle-button {
-    float: left;
-    max-height: 20px;
-    max-width: 20px;
-    padding: 0;
-  }
-
-  .toggle-include-group {
-    padding-top: 4px;
-  }
-
-  .toggle-include {
-    margin: 5px 6px;
-    text-transform: none;
-    padding: 4px 6px;
-    font-size: 10pt;
-    background-color: #fafafa;
-    color: #666;
-  }
-
-  .toggle-include:hover {
-    background-color: var(--google-yellow-100);
-  }
-
-  .non-control-list-item {
-    padding-left: 10px;
-  }
-  </style>
-  <template>
-    <paper-item>
-      <paper-item-body two-line>
-        <div>
-          <paper-icon-button
-            icon="{{_getToggleIcon(_expanded)}}"
-            on-click="_toggleExpanded"
-            class="toggle-button">
-          </paper-icon-button>
-          <div class="node-name" id="nodetitle"></div>
-        </div>
-        <div secondary>
-          <tf-graph-icon class="node-icon" node="[[_node]]"
-              render-info="[[_getRenderInfo(nodeName, renderHierarchy)]]"
-              color-by="[[colorBy]]"
-              template-index="[[_templateIndex]]"
-              ></tf-graph-icon>
-          <template is="dom-if" if="{{_node.op}}">
-            <div class="subtitle">
-              Operation:
-              <span>[[_node.op]]</span>
-            </div>
-          </template>
-          <template is="dom-if" if="{{_node.metagraph}}">
-            <div class="subtitle">
-              Subgraph:
-              <span>[[_node.cardinality]]</span> nodes
-            </div>
-          </template>
-        </div>
-      </paper-item-body>
-    </paper-item>
-    <iron-collapse opened="{{_expanded}}">
-    <template is="dom-if" if="{{_expanded}}" restamp="true">
-      <div class="expandedInfo">
-        <div class="sub-list-group attributes">
-          Attributes
-          (<span>[[_attributes.length]]</span>)
-          <iron-list class="sub-list" id ="attributesList"
-                    items="[[_attributes]]">
-            <template>
-              <div>
-                <div class="attr-left">[[item.key]]</div>
-                <div class="attr-right">[[item.value]]</div>
-              </div>
-            </template>
-          </iron-list>
-        </div>
-
-        <template is="dom-if" if="{{_device}}">
-          <div class="sub-list-group device">
-            <div class="attr-left">Device</div>
-            <div class="attr-right">[[_device]]</div>
-          </div>
-        </template>
-
-        <div class="sub-list-group predecessors">
-          Inputs
-          (<span>[[_totalPredecessors]]</span>)
-          <iron-list class="sub-list" id ="inputsList"
-                    items="[[_predecessors.regular]]">
-            <template>
-              <tf-node-list-item
-                  class="non-control-list-item"
-                  card-node="[[_node]]"
-                  item-node="[[item.node]]"
-                  edge-label="[[item.edgeLabel]]"
-                  item-render-info="[[item.renderInfo]]"
-                  name="[[item.name]]"
-                  item-type="predecessors"
-                  color-by="[[colorBy]]"
-                  template-index="[[_templateIndex]]">
-              </tf-node-list-item>
-            </template>
-          </iron-list>
-          <template is="dom-if" if="[[_predecessors.control.length]]">
-            <div class="controlDeps">
-              <div class="controlLine">
-                <paper-icon-button
-                  icon="{{_getToggleIcon(_openedControlPred)}}"
-                  on-click="_toggleControlPred"
-                  class="control-toggle-button">
-                </paper-icon-button>
-                Control dependencies
-              </div>
-              <iron-collapse opened="{{_openedControlPred}}" no-animation>
-                <template is="dom-if" if="{{_openedControlPred}}" restamp="true">
-                  <iron-list class="sub-list" items="[[_predecessors.control]]">
-                    <template>
-                      <tf-node-list-item
-                          card-node="[[_node]]"
-                          item-node="[[item.node]]"
-                          item-render-info="[[item.renderInfo]]"
-                          name="[[item.name]]"
-                          item-type="predecessors"
-                          color-by="[[colorBy]]"
-                          template-index="[[_templateIndex]]">
-                      </tf-node-list-item>
-                    </template>
-                  </iron-list>
-                </template>
-              </iron-collapse>
-            </div>
-          </template>
-        </div>
-
-        <div class="sub-list-group successors">
-          Outputs
-          (<span>[[_totalSuccessors]]</span>)
-          <iron-list class="sub-list" id ="outputsList"
-                    items="[[_successors.regular]]">
-            <template>
-              <tf-node-list-item
-                  class="non-control-list-item"
-                  card-node="[[_node]]"
-                  item-node="[[item.node]]"
-                  edge-label="[[item.edgeLabel]]"
-                  item-render-info="[[item.renderInfo]]"
-                  name="[[item.name]]"
-                  item-type="successor"
-                  color-by="[[colorBy]]"
-                  template-index="[[_templateIndex]]">
-              </tf-node-list-item>
-            </template>
-          </iron-list>
-          <template is="dom-if" if="[[_successors.control.length]]">
-            <div class="controlDeps">
-              <div class="controlLine">
-                <paper-icon-button
-                  icon="{{_getToggleIcon(_openedControlSucc)}}"
-                  on-click="_toggleControlSucc"
-                  class="control-toggle-button">
-                </paper-icon-button>
-                Control dependencies
-              </div>
-              <iron-collapse opened="{{_openedControlSucc}}" no-animation>
-                <template is="dom-if" if="{{_openedControlSucc}}" restamp="true">
-                  <iron-list class="sub-list" items="[[_successors.control]]">
-                    <template>
-                      <tf-node-list-item
-                          card-node="[[_node]]"
-                          item-node="[[item.node]]"
-                          item-render-info="[[item.renderInfo]]"
-                          name="[[item.name]]"
-                          item-type="successors"
-                          color-by="[[colorBy]]"
-                          template-index="[[_templateIndex]]">
-                      </tf-node-list-item>
-                    </template>
-                  </iron-list>
-                </template>
-              </iron-collapse>
-            </div>
-          </template>
-        </div>
-        <template is="dom-if" if="{{_hasDisplayableNodeStats}}">
-          <div class="sub-list-group node-stats">
-            Node Stats
-            <div class="sub-list-table">
-              <template is="dom-if" if="{{_nodeStats.totalBytes}}">
-                <div class="sub-list-table-row">
-                  <div class="sub-list-table-cell">Memory</div>
-                  <div class="sub-list-table-cell">[[_nodeStatsFormattedBytes]]</div>
-                </div>
-              </template>
-              <template is="dom-if" if="{{_getTotalMicros(_nodeStats)}}">
-                <div class="sub-list-table-row">
-                  <div class="sub-list-table-cell">Compute Time</div>
-                  <div class="sub-list-table-cell">[[_nodeStatsFormattedComputeTime]]</div>
-                </div>
-              </template>
-              <template is="dom-if" if="{{_nodeStats.outputSize}}">
-                <div class="sub-list-table-row">
-                  <div class="sub-list-table-cell">Tensor Output Sizes</div>
-                  <div class="sub-list-table-cell">
-                    <template is="dom-repeat" items="{{_nodeStatsFormattedOutputSizes}}">
-                      [[item]] <br/>
-                    </template>
-                  </div>
-                </div>
-              </template>
-            </div>
-          </div>
-        </template>
-        <div class="toggle-include-group">
-          <paper-button raised class="toggle-include" on-click="_toggleInclude">
-            <span>[[_auxButtonText]]</span>
-          </paper-button>
-        </div>
-        <template is="dom-if" if="{{_isInSeries(_node)}}">
-          <div class="toggle-include-group">
-            <paper-button raised class="toggle-include" on-click="_toggleGroup">
-              <span>[[_groupButtonText]]</span>
-            </paper-button>
-          </div>
-        </template>
-      </div>
-    </template>
-    </iron-collapse>
-  </template>
-
-  <script>
-    (function() {
-      Polymer({
-        is: 'tf-node-info',
-
-        properties: {
-          nodeName: String,
-          graphHierarchy: Object,
-          renderHierarchy: Object,
-          /** What to color the nodes by (compute time, memory, device etc.) */
-          colorBy: String,
-          _templateIndex: {
-            type: Function,
-            computed: '_getTemplateIndex(graphHierarchy)'
-          },
-          _node: {
-            type: Object,
-            computed: '_getNode(nodeName, graphHierarchy)',
-            observer: '_resetState'
-          },
-          _nodeStats: {
-            type: Object,
-            computed: '_getNodeStats(nodeName, graphHierarchy)',
-            observer: '_resetState'
-          },
-          _hasDisplayableNodeStats: {
-            type: Object,
-            computed: '_getHasDisplayableNodeStats(_nodeStats)',
-          },
-          _nodeStatsFormattedBytes: {
-            type: String,
-            computed: '_getNodeStatsFormattedBytes(_nodeStats)',
-          },
-          _nodeStatsFormattedComputeTime: {
-            type: String,
-            computed: '_getNodeStatsFormattedComputeTime(_nodeStats)',
-          },
-          _nodeStatsFormattedOutputSizes: {
-            type: Array,
-            computed: '_getNodeStatsFormattedOutputSizes(_nodeStats)',
-          },
-          // The enum value of the include property of the selected node.
-          nodeInclude: {
-            type: Number,
-            observer: '_nodeIncludeStateChanged'
-          },
-          _attributes: {
-            type: Array,
-            computed: '_getAttributes(_node)'
-          },
-          _device: {
-            type: String,
-            computed: '_getDevice(_node)'
-          },
-          _successors: {
-            type: Object,
-            computed: '_getSuccessors(_node, graphHierarchy)'
-          },
-          _predecessors: {
-            type: Object,
-            computed: '_getPredecessors(_node, graphHierarchy)'
-          },
-          _subnodes: {
-            type: Array,
-            computed: '_getSubnodes(_node)'
-          },
-          _expanded: {
-            type: Boolean,
-            value: true
-          },
-          _totalPredecessors: {
-            type: Number,
-            computed: '_getTotalPred(_predecessors)'
-          },
-          _totalSuccessors: {
-            type: Number,
-            computed: '_getTotalSucc(_successors)'
-          },
-          _openedControlPred: {
-            type: Boolean,
-            value: false
-          },
-          _openedControlSucc: {
-            type: Boolean,
-            value: false
-          },
-          _auxButtonText: String,
-          _groupButtonText: String
-        },
-        expandNode: function() {
-          this.fire('_node.expand', this.node);
-        },
-        _getTemplateIndex: function(graphHierarchy) {
-          return graphHierarchy.getTemplateIndex();
-        },
-        _getNode: function(nodeName, graphHierarchy) {
-          return graphHierarchy.node(nodeName);
-        },
-        _getNodeStats: function(nodeName, graphHierarchy) {
-          var node = this._getNode(nodeName, graphHierarchy);
-          if (node) {
-            return node.stats;
-          }
-          return null;
-        },
-        _getTotalMicros: function(stats) {
-          return stats.getTotalMicros();
-        },
-        _getHasDisplayableNodeStats: function(stats) {
-          return tf.graph.util.hasDisplayableNodeStats(stats);
-        },
-        _getNodeStatsFormattedBytes: function(stats) {
-          if (!stats || !stats.totalBytes) {
-            return;
-          }
-
-          return tf.graph.util.convertUnitsToHumanReadable(
-              stats.totalBytes, tf.graph.util.MEMORY_UNITS);
-        },
-        _getNodeStatsFormattedComputeTime: function(stats) {
-          if (!stats || !stats.getTotalMicros()) {
-            return;
-          }
-
-          return tf.graph.util.convertUnitsToHumanReadable(
-              stats.getTotalMicros(), tf.graph.util.TIME_UNITS);
-        },
-        _getNodeStatsFormattedOutputSizes: function(stats) {
-          if (!stats || !stats.outputSize || !stats.outputSize.length) {
-            return;
-          }
-
-          return _.map(stats.outputSize, function(shape) {
-            if (shape.length === 0) {
-              return "scalar";
-            }
-            return "[" + shape.join(", ") + "]";
-          });
-        },
-        _getPrintableHTMLNodeName: function(nodeName) {
-          // Insert an optional line break before each slash so that
-          // long node names wrap cleanly at path boundaries.
-          return (nodeName || '').replace(/\//g, '<wbr>/');
-        },
-        _getRenderInfo: function(nodeName, renderHierarchy) {
-          return this.renderHierarchy.getOrCreateRenderNodeByName(nodeName);
-        },
-        _getAttributes: function(node) {
-          this.async(this._resizeList.bind(this, "#attributesList"));
-          if (!node || !node.attr) {
-            return [];
-          }
-          var attrs = [];
-          _.each(node.attr, function(entry) {
-            // Unpack the "too large" attributes into separate attributes
-            // in the info card, with values "too large to show".
-            if (entry.key === tf.graph.LARGE_ATTRS_KEY) {
-              attrs = attrs.concat(entry.value.list.s.map(function(key) {
-                return {key: key, value: "Too large to show..."};
-              }));
-            } else {
-              attrs.push({
-                key: entry.key,
-                value: JSON.stringify(entry.value)
-              });
-            }
-          });
-          return attrs;
-        },
-        _getDevice: function(node) {
-          return node ? node.device : null;
-        },
-        _getSuccessors: function(node, hierarchy) {
-          this.async(this._resizeList.bind(this, "#inputsList"));
-          if (!node) {
-            return {regular: [], control: []}
-          }
-          return this._convertEdgeListToEdgeInfoList(
-            hierarchy.getSuccessors(node.name), false, node.isGroupNode);
-        },
-        _getPredecessors: function(node, hierarchy) {
-          this.async(this._resizeList.bind(this, "#outputsList"));
-          if (!node) {
-            return {regular: [], control: []}
-          }
-          return this._convertEdgeListToEdgeInfoList(
-            hierarchy.getPredecessors(node.name), true, node.isGroupNode);
-        },
-        _convertEdgeListToEdgeInfoList: function(list, isPredecessor, isGroupNode) {
-
-          /**
-           * Unpacks the metaedge into a list of base edge information
-           * that can be rendered.
-           */
-          var unpackMetaedge = function(metaedge) {
-            return _.map(metaedge.baseEdgeList, function(baseEdge) {
-              var name = isPredecessor ? baseEdge.v : baseEdge.w;
-              return {
-                name: name,
-                node: this._getNode(name, this.graphHierarchy),
-                edgeLabel: tf.graph.scene.edge.getLabelForBaseEdge(baseEdge,
-                    this.renderHierarchy),
-                renderInfo: this._getRenderInfo(name, this.renderHierarchy)
-              };
-            }, this);
-          }.bind(this);
-
-          /**
-           * Converts a list of metaedges to a list of edge information
-           * that can be rendered.
-           */
-          var toEdgeInfoList = function(edges) {
-            var edgeInfoList = [];
-            _.each(edges, function(metaedge) {
-              var name = isPredecessor ? metaedge.v : metaedge.w;
-              // Enumerate all the base edges if the node is an OpNode, or the
-              // metaedge has only 1 edge in it.
-              if (!isGroupNode || metaedge.baseEdgeList.length == 1) {
-                edgeInfoList = edgeInfoList.concat(unpackMetaedge(metaedge));
-              } else {
-                edgeInfoList.push({
-                  name: name,
-                  node: this._getNode(name, this.graphHierarchy),
-                  edgeLabel: tf.graph.scene.edge.getLabelForEdge(metaedge,
-                      this.renderHierarchy),
-                  renderInfo: this._getRenderInfo(name, this.renderHierarchy)
-                });
-              }
-            }, this);
-            return edgeInfoList;
-          }.bind(this);
-
-          return {
-            regular: toEdgeInfoList(list.regular),
-            control: toEdgeInfoList(list.control)
-          };
-        },
-        _getSubnodes: function(node) {
-          return node && node.metagraph ? node.metagraph.nodes() : null;
-        },
-        _getTotalPred: function(predecessors) {
-          return predecessors.regular.length + predecessors.control.length;
-        },
-        _getTotalSucc: function(successors) {
-          return successors.regular.length + successors.control.length;
-        },
-        _toggleControlPred: function() {
-          this._openedControlPred = !this._openedControlPred;
-        },
-        _toggleControlSucc: function() {
-          this._openedControlSucc = !this._openedControlSucc;
-        },
-        _toggleExpanded: function() {
-          this._expanded = !this._expanded;
-        },
-        _getToggleIcon: function(expanded) {
-          return expanded ? "expand-less" : "expand-more";
-        },
-        _resetState: function() {
-          this._openedControlPred = false;
-          this._openedControlSucc = false;
-
-          this.set("_groupButtonText",
-            tf.graph.scene.node.getGroupSettingLabel(this._node));
-
-          if (this._node) {
-            Polymer.dom(this.$.nodetitle).innerHTML =
-              this._getPrintableHTMLNodeName(this._node.name);
-          }
-        },
-        _resizeList: function(selector) {
-          var list = document.querySelector(selector);
-          if (list) {
-            list.fire('iron-resize');
-          }
-        },
-        _toggleInclude: function() {
-          var graphElem = document.querySelector("#graph");
-          graphElem.fire("node-toggle-extract", { name: this.nodeName });
-          var graphBoardElem = document.querySelector("#graphboard");
-          graphBoardElem.fire("node-toggle-extract");
-        },
-        _nodeIncludeStateChanged: function(include, oldInclude) {
-          this.set("_auxButtonText",
-            tf.graph.getIncludeNodeButtonString(include));
-        },
-        _toggleGroup: function() {
-          var graphElem = document.querySelector("#graph");
-          var seriesName = tf.graph.scene.node.getSeriesName(this._node);
-          graphElem.fire("node-toggle-seriesgroup", { name: seriesName });
-        },
-        _isInSeries: function(node) {
-          return tf.graph.scene.node.canBeInSeries(node);
-        }
-      });
-    })();
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_graph_info/tf-node-list-item.html b/tensorflow/tensorboard/components/tf_graph_info/tf-node-list-item.html
deleted file mode 100644
index c15478d126ccbb055a7bbb46f3a29c897321a648..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_info/tf-node-list-item.html
+++ /dev/null
@@ -1,138 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
-<link rel="import" href="tf-graph-icon.html">
-
-<dom-module id="tf-node-list-item">
-  <style>
-  #list-item {
-    width: 100%;
-    color: #565656;
-    font-size: 11pt;
-    font-weight: 400;
-    position: relative;
-    display: inline-block;
-  }
-
-  #list-item:hover {
-    background-color: var(--google-yellow-100);
-  }
-
-  .clickable {
-    cursor: pointer;
-  }
-
-  #list-item span {
-    margin-left: 40px;
-  }
-
-  #list-item.excluded span {
-    color: #999;
-  }
-
-  #list-item span.edge-label {
-    float: right;
-    font-size: 10px;
-    margin-left: 3px;
-    margin-right: 5px;
-  }
-
-  .node-icon {
-    position: absolute;
-    top: 1px;
-    left: 2px;
-  }
-
-  .faded span {
-    color: var(--tb-graph-faded);
-  }
-  </style>
-  <template>
-    <div id="list-item"
-         on-mouseover="_nodeListener"
-         on-mouseout="_nodeListener"
-         on-click="_nodeListener">
-      <div class$="{{_fadedClass(itemRenderInfo)}}">
-        <tf-graph-icon class="node-icon" height="12"
-            color-by="[[colorBy]]" color-by-params="[[colorByParams]]"
-            node="[[itemNode]]" render-info="[[itemRenderInfo]]"
-            template-index="[[templateIndex]]"></tf-graph-icon>
-        <span title$="[[name]]">[[name]]</span>
-        <span class="edge-label">[[edgeLabel]]</span>
-      </div>
-    </div>
-  </template>
-
-  <script>
-    (function() {
-      Polymer({
-        is: 'tf-node-list-item',
-
-        properties: {
-          /**
-           * The Node for the card itself, on which this item is being drawn.
-           * @type {tf.graph.Node}
-           */
-          cardNode: Object,
-          /**
-           * The Node for the item within the card, somehow related to cardNode.
-           * @type {tf.graph.Node}
-           */
-          itemNode: Object,
-          /** The edge label associated with this item. */
-          edgeLabel: String,
-          /**
-           * The render node information for the item node. Used by the graph
-           * icon in determining fill color.
-           */
-          itemRenderInfo: Object,
-          name: String,
-          itemType: {
-            type: String,
-            observer: '_itemTypeChanged'
-          },
-          colorBy: String,
-          colorByParams: Object,
-          templateIndex: Function
-        },
-
-        _itemTypeChanged: function() {
-          if (this.itemType !== 'subnode') {
-            this.$['list-item'].classList.add('clickable');
-          } else {
-            this.$['list-item'].classList.remove('clickable');
-          }
-        },
-
-        _nodeListener: function(event) {
-          // fire node.click/mouseover/mouseout
-          this.fire('node-list-item-' + event.type, {
-            cardNode: this.cardNode.name,
-            nodeName: this.name,
-            type: this.itemType
-          });
-        },
-
-        _fadedClass: function(itemRenderInfo) {
-          return itemRenderInfo && itemRenderInfo.isFadedOut ? 'faded' : '';
-        }
-      });
-    })();
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_graph_loader/BUILD b/tensorflow/tensorboard/components/tf_graph_loader/BUILD
deleted file mode 100644
index 41fbfb8ee853aeae8a58009391d6f0a889e03ae8..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_loader/BUILD
+++ /dev/null
@@ -1,32 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_graph_loader",
-    srcs = ["tf-graph-loader.html"],
-    path = "/tf-graph-loader",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_graph_common",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-    ],
-)
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [":tf_graph_loader"],
-    destdir = "tf-graph-loader",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_graph_common:legacy",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_graph_loader/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_loader/demo/BUILD
deleted file mode 100644
index f109a19163bb9f86e567495e34059d56863f30fc..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_loader/demo/BUILD
+++ /dev/null
@@ -1,24 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_loader/demo
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"] + glob(["data/**"]),
-    path = "/tf-graph-loader/demo",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_graph_loader",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_graph_loader/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_loader/demo/data/graph.pbtxt
deleted file mode 100644
index 30b206453469801d31b46856c29cdda78164f18f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_loader/demo/data/graph.pbtxt
+++ /dev/null
@@ -1,4606 +0,0 @@
-node {
-  name: "GradientDescent/learning_rate"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_3"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 0.1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 100
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000d\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/BroadcastGradientArgs"
-  op: "BroadcastGradientArgs"
-  input: "gradients/add_grad/Shape"
-  input: "gradients/add_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-        }
-        shape {
-          dim {
-            size: -1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 10
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/BroadcastGradientArgs"
-  op: "BroadcastGradientArgs"
-  input: "gradients/add_1_grad/Shape"
-  input: "gradients/add_1_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-        }
-        shape {
-          dim {
-            size: -1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_1_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: -1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_3_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Maximum/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Const_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Const"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Prod_1"
-  op: "Prod"
-  input: "gradients/Mean_grad/Shape_1"
-  input: "gradients/Mean_grad/Const_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Maximum"
-  op: "Maximum"
-  input: "gradients/Mean_grad/Prod_1"
-  input: "gradients/Mean_grad/Maximum/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Prod"
-  op: "Prod"
-  input: "gradients/Mean_grad/Shape"
-  input: "gradients/Mean_grad/Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/floordiv"
-  op: "FloorDiv"
-  input: "gradients/Mean_grad/Prod"
-  input: "gradients/Mean_grad/Maximum"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Cast"
-  op: "Cast"
-  input: "gradients/Mean_grad/floordiv"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "DstT"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "SrcT"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Tile/multiples"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Reshape/shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Const"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Fill"
-  op: "Fill"
-  input: "gradients/Shape"
-  input: "gradients/Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/Fill"
-  input: "gradients/Mean_grad/Reshape/shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/Tile"
-  op: "Tile"
-  input: "gradients/Mean_grad/Reshape"
-  input: "gradients/Mean_grad/Tile/multiples"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tmultiples"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Mean_grad/truediv"
-  op: "RealDiv"
-  input: "gradients/Mean_grad/Tile"
-  input: "gradients/Mean_grad/Cast"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_3_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/Mean_grad/truediv"
-  input: "gradients/Reshape_3_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
-  op: "ExpandDims"
-  input: "gradients/Reshape_3_grad/Reshape"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tdim"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Const"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "Slice_2/begin"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "Sub_2/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "concat_1/axis"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "concat_1/values_0"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: -1
-      }
-    }
-  }
-}
-node {
-  name: "Slice_1/size"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Sub_1/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Shape_2"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "Rank_2"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 2
-      }
-    }
-  }
-}
-node {
-  name: "Sub_1"
-  op: "Sub"
-  input: "Rank_2"
-  input: "Sub_1/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Slice_1/begin"
-  op: "Pack"
-  input: "Sub_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 1
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: 0
-    }
-  }
-}
-node {
-  name: "Slice_1"
-  op: "Slice"
-  input: "Shape_2"
-  input: "Slice_1/begin"
-  input: "Slice_1/size"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "Index"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "concat_1"
-  op: "ConcatV2"
-  input: "concat_1/values_0"
-  input: "Slice_1"
-  input: "concat_1/axis"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 2
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "concat/axis"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "concat/values_0"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: -1
-      }
-    }
-  }
-}
-node {
-  name: "Slice/size"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 1
-          }
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Sub/y"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "Shape_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "Rank_1"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 2
-      }
-    }
-  }
-}
-node {
-  name: "Sub"
-  op: "Sub"
-  input: "Rank_1"
-  input: "Sub/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Slice/begin"
-  op: "Pack"
-  input: "Sub"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 1
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: 0
-    }
-  }
-}
-node {
-  name: "Slice"
-  op: "Slice"
-  input: "Shape_1"
-  input: "Slice/begin"
-  input: "Slice/size"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "Index"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "concat"
-  op: "ConcatV2"
-  input: "concat/values_0"
-  input: "Slice"
-  input: "concat/axis"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 2
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\n\000\000\000"
-      }
-    }
-  }
-}
-node {
-  name: "Rank"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 2
-      }
-    }
-  }
-}
-node {
-  name: "Sub_2"
-  op: "Sub"
-  input: "Rank"
-  input: "Sub_2/y"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Slice_2/size"
-  op: "Pack"
-  input: "Sub_2"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "N"
-    value {
-      i: 1
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: 0
-    }
-  }
-}
-node {
-  name: "Slice_2"
-  op: "Slice"
-  input: "Shape"
-  input: "Slice_2/begin"
-  input: "Slice_2/size"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "Index"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "T"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "logits_biases"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 10
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "logits_biases/read"
-  op: "Identity"
-  input: "logits_biases"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "logits_weights"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 100
-        }
-        dim {
-          size: 10
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "logits_weights/read"
-  op: "Identity"
-  input: "logits_weights"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "hidden_biases"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 100
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "hidden_biases/read"
-  op: "Identity"
-  input: "hidden_biases"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "hidden_weights"
-  op: "VariableV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 784
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "shape"
-    value {
-      shape {
-        dim {
-          size: 784
-        }
-        dim {
-          size: 100
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "hidden_weights/read"
-  op: "Identity"
-  input: "hidden_weights"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 784
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Reshape/shape"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 2
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-          dim {
-            size: 2
-          }
-        }
-        tensor_content: "\310\000\000\000\377\377\377\377"
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot/depth"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 10
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot/off_value"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 0
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot/on_value"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-        }
-        float_val: 1
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
-  op: "Const"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "dtype"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "value"
-    value {
-      tensor {
-        dtype: DT_INT32
-        tensor_shape {
-        }
-        int_val: 200
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_1/random_shuffle_queue"
-  op: "RandomShuffleQueueV2"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "capacity"
-    value {
-      i: 20000
-    }
-  }
-  attr {
-    key: "component_types"
-    value {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    key: "container"
-    value {
-      s: ""
-    }
-  }
-  attr {
-    key: "min_after_dequeue"
-    value {
-      i: 4000
-    }
-  }
-  attr {
-    key: "seed"
-    value {
-      i: 0
-    }
-  }
-  attr {
-    key: "seed2"
-    value {
-      i: 0
-    }
-  }
-  attr {
-    key: "shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 28
-          }
-          dim {
-            size: 28
-          }
-          dim {
-            size: 1
-          }
-        }
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "shared_name"
-    value {
-      s: ""
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
-  op: "QueueDequeueManyV2"
-  input: "mnist_dataset_train_1/random_shuffle_queue"
-  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          unknown_rank: true
-        }
-        shape {
-          unknown_rank: true
-        }
-      }
-    }
-  }
-  attr {
-    key: "component_types"
-    value {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    key: "timeout_ms"
-    value {
-      i: -1
-    }
-  }
-}
-node {
-  name: "Reshape"
-  op: "Reshape"
-  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
-  input: "Reshape/shape"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: -1
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "MatMul"
-  op: "MatMul"
-  input: "Reshape"
-  input: "hidden_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "add"
-  op: "Add"
-  input: "MatMul"
-  input: "hidden_biases/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Relu"
-  op: "Relu"
-  input: "add"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "MatMul_1"
-  op: "MatMul"
-  input: "Relu"
-  input: "logits_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "add_1"
-  op: "Add"
-  input: "MatMul_1"
-  input: "logits_biases/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Reshape_1"
-  op: "Reshape"
-  input: "add_1"
-  input: "concat"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "mnist_dataset_train_2/one_hot"
-  op: "OneHot"
-  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
-  input: "mnist_dataset_train_2/one_hot/depth"
-  input: "mnist_dataset_train_2/one_hot/on_value"
-  input: "mnist_dataset_train_2/one_hot/off_value"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "TI"
-    value {
-      type: DT_INT64
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          unknown_rank: true
-        }
-      }
-    }
-  }
-  attr {
-    key: "axis"
-    value {
-      i: -1
-    }
-  }
-}
-node {
-  name: "Reshape_2"
-  op: "Reshape"
-  input: "mnist_dataset_train_2/one_hot"
-  input: "concat_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "SoftmaxCrossEntropyWithLogits"
-  op: "SoftmaxCrossEntropyWithLogits"
-  input: "Reshape_1"
-  input: "Reshape_2"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
-  op: "PreventGradient"
-  input: "SoftmaxCrossEntropyWithLogits:1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "message"
-    value {
-      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
-    }
-  }
-}
-node {
-  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
-  op: "Mul"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Reshape_1_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
-  input: "gradients/Reshape_1_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Sum_1"
-  op: "Sum"
-  input: "gradients/Reshape_1_grad/Reshape"
-  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Reshape_1"
-  op: "Reshape"
-  input: "gradients/add_1_grad/Sum_1"
-  input: "gradients/add_1_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Sum"
-  op: "Sum"
-  input: "gradients/Reshape_1_grad/Reshape"
-  input: "gradients/add_1_grad/BroadcastGradientArgs"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/add_1_grad/Sum"
-  input: "gradients/add_1_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/add_1_grad/Reshape"
-  input: "^gradients/add_1_grad/Reshape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/add_1_grad/Reshape_1"
-  input: "^gradients/add_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_1_grad/Reshape_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "logits_biases"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/add_1_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_1_grad/tuple/control_dependency"
-  op: "Identity"
-  input: "gradients/add_1_grad/Reshape"
-  input: "^gradients/add_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_1_grad/Reshape"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/MatMul_1"
-  op: "MatMul"
-  input: "Relu"
-  input: "gradients/add_1_grad/tuple/control_dependency"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: true
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/MatMul"
-  op: "MatMul"
-  input: "gradients/add_1_grad/tuple/control_dependency"
-  input: "logits_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: true
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/MatMul_1_grad/MatMul"
-  input: "^gradients/MatMul_1_grad/MatMul_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/MatMul_1_grad/MatMul_1"
-  input: "^gradients/MatMul_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "logits_weights"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@logits_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-          dim {
-            size: 10
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_1_grad/tuple/control_dependency"
-  op: "Identity"
-  input: "gradients/MatMul_1_grad/MatMul"
-  input: "^gradients/MatMul_1_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/MatMul_1_grad/MatMul"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/Relu_grad/ReluGrad"
-  op: "ReluGrad"
-  input: "gradients/MatMul_1_grad/tuple/control_dependency"
-  input: "Relu"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Sum_1"
-  op: "Sum"
-  input: "gradients/Relu_grad/ReluGrad"
-  input: "gradients/add_grad/BroadcastGradientArgs:1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Reshape_1"
-  op: "Reshape"
-  input: "gradients/add_grad/Sum_1"
-  input: "gradients/add_grad/Shape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Sum"
-  op: "Sum"
-  input: "gradients/Relu_grad/ReluGrad"
-  input: "gradients/add_grad/BroadcastGradientArgs"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/Reshape"
-  op: "Reshape"
-  input: "gradients/add_grad/Sum"
-  input: "gradients/add_grad/Shape"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/add_grad/Reshape"
-  input: "^gradients/add_grad/Reshape_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/add_grad/Reshape_1"
-  input: "^gradients/add_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_grad/Reshape_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "hidden_biases"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/add_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_biases"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/add_grad/tuple/control_dependency"
-  op: "Identity"
-  input: "gradients/add_grad/Reshape"
-  input: "^gradients/add_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/add_grad/Reshape"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/MatMul_1"
-  op: "MatMul"
-  input: "Reshape"
-  input: "gradients/add_grad/tuple/control_dependency"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: true
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/MatMul"
-  op: "MatMul"
-  input: "gradients/add_grad/tuple/control_dependency"
-  input: "hidden_weights/read"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-          dim {
-            size: 784
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "transpose_a"
-    value {
-      b: false
-    }
-  }
-  attr {
-    key: "transpose_b"
-    value {
-      b: true
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/tuple/group_deps"
-  op: "NoOp"
-  input: "^gradients/MatMul_grad/MatMul"
-  input: "^gradients/MatMul_grad/MatMul_1"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "gradients/MatMul_grad/tuple/control_dependency_1"
-  op: "Identity"
-  input: "gradients/MatMul_grad/MatMul_1"
-  input: "^gradients/MatMul_grad/tuple/group_deps"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@gradients/MatMul_grad/MatMul_1"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: -1
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
-  op: "ApplyGradientDescent"
-  input: "hidden_weights"
-  input: "GradientDescent/learning_rate"
-  input: "gradients/MatMul_grad/tuple/control_dependency_1"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "_class"
-    value {
-      list {
-        s: "loc:@hidden_weights"
-      }
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 784
-          }
-          dim {
-            size: 100
-          }
-        }
-      }
-    }
-  }
-  attr {
-    key: "use_locking"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "GradientDescent"
-  op: "NoOp"
-  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
-  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
-  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
-  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_2"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-      }
-    }
-  }
-}
-node {
-  name: "Reshape_3"
-  op: "Reshape"
-  input: "SoftmaxCrossEntropyWithLogits"
-  input: "Slice_2"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tshape"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-          dim {
-            size: 200
-          }
-        }
-      }
-    }
-  }
-}
-node {
-  name: "Mean"
-  op: "Mean"
-  input: "Reshape_3"
-  input: "Const"
-  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "Tidx"
-    value {
-      type: DT_INT32
-    }
-  }
-  attr {
-    key: "_XlaCluster"
-    value {
-      s: "cluster_1"
-    }
-  }
-  attr {
-    key: "_output_shapes"
-    value {
-      list {
-        shape {
-        }
-      }
-    }
-  }
-  attr {
-    key: "keep_dims"
-    value {
-      b: false
-    }
-  }
-}
-node {
-  name: "_send_Mean_0"
-  op: "_Send"
-  input: "Mean"
-  device: "/job:localhost/replica:0/task:0/cpu:0"
-  attr {
-    key: "T"
-    value {
-      type: DT_FLOAT
-    }
-  }
-  attr {
-    key: "client_terminated"
-    value {
-      b: true
-    }
-  }
-  attr {
-    key: "recv_device"
-    value {
-      s: "/job:localhost/replica:0/task:0/cpu:0"
-    }
-  }
-  attr {
-    key: "send_device"
-    value {
-      s: "/job:localhost/replica:0/task:0/cpu:0"
-    }
-  }
-  attr {
-    key: "send_device_incarnation"
-    value {
-      i: -5924635994370253548
-    }
-  }
-  attr {
-    key: "tensor_name"
-    value {
-      s: "Mean:0"
-    }
-  }
-}
-library {
-}
-versions {
-  producer: 21
-}
diff --git a/tensorflow/tensorboard/components/tf_graph_loader/demo/index.html b/tensorflow/tensorboard/components/tf_graph_loader/demo/index.html
deleted file mode 100644
index 2ffb2a1a59cba900252eec4169a93c4babbef094..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_loader/demo/index.html
+++ /dev/null
@@ -1,75 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../tf-graph-loader.html">
-<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-<title>TF Graph Loader Demo</title>
-<demo-snippet>
-  <template>
-    <dom-module id="tf-graph-loader-demo">
-      <template>
-        <tf-graph-loader
-            id="loader"
-            datasets="[[_datasets]]"
-            selected-dataset="[[_selectedDataset]]"
-            progress="{{_progress}}">
-        </tf-graph-loader>
-      </template>
-      <script>
-        "use strict";
-
-        Polymer({
-          is: "tf-graph-loader-demo",
-          properties: {
-            // We tell the graph loader to load a specific pbtxt file.
-            _datasets: {
-              type: Array,
-              value: [{
-                "name": "Graph with XLA Clusters Specified",
-                "path": "data/graph.pbtxt"
-              }],
-            },
-            _selectedDataset: {
-              type: Number,
-              value: 0,
-            },
-
-            // This property will be updated by the graph loader.
-            _progress: {
-              type: Object,
-            },
-          },
-          observers: [
-            '_progressUpdated(_progress)',
-          ],
-          _progressUpdated(progress) {
-            // console.log the progress.
-            console.log('Progress updated.', progress);
-
-            // The graph has loaded. console.log it.
-            if (progress.value == 100) {
-              console.log('graph', this.$.loader.outGraph);
-            }
-          },
-        });
-      </script>
-    </dom-module>
-    <!-- The graph loader lacks visual elements. -->
-    <tf-graph-loader-demo></tf-graph-loader-demo>
-  </template>
-</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_loader/test/index.html b/tensorflow/tensorboard/components/tf_graph_loader/test/index.html
deleted file mode 100644
index c8e2027f42aa25ef1c8e2d2c1f1aa68329181ebf..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_loader/test/index.html
+++ /dev/null
@@ -1,30 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-<head>
-  <meta charset="utf-8">
-  <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-  <script src="../../web-component-tester/browser.js"></script>
-  <link rel="import" href="../tf-graph-loader.html">
-</head>
-<body>
-  <tf-graph-loader id="loader"></tf-graph-loader>
-  <script src="loader.js"></script>
-</body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_graph_loader/test/loader.ts b/tensorflow/tensorboard/components/tf_graph_loader/test/loader.ts
deleted file mode 100644
index fcd9f7b5295756f863a6a72428862142cc716fb3..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_loader/test/loader.ts
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-suite('graph loader', () => {
-  let assert = chai.assert;
-
-  test('loader exists', () => {
-    assert.isTrue(document.getElementById('loader') != null);
-  });
-
-  // TODO(bp): write tests.
-
-});
diff --git a/tensorflow/tensorboard/components/tf_graph_loader/tf-graph-loader.html b/tensorflow/tensorboard/components/tf_graph_loader/tf-graph-loader.html
deleted file mode 100644
index 8d59cbd2aacf4295fbfe3bfa12013b47c2c39285..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_loader/tf-graph-loader.html
+++ /dev/null
@@ -1,184 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-graph-common/tf-graph-common.html">
-
-<!--
-An element which provides a filter parsing for pbtxt to graph output.
--->
-<dom-module id="tf-graph-loader">
-</dom-module>
-
-<script>
-Polymer({
-
-  is: 'tf-graph-loader',
-
-  properties: {
-    /**
-     * @type {value: number, msg: string}
-     *
-     * A number between 0 and 100 denoting the % of progress
-     * for the progress bar and the displayed message.
-     */
-    progress: {
-      type: Object,
-      notify: true,
-    },
-    datasets: Array,
-    selectedDataset: Number,
-    selectedFile: {
-      type: Object,
-      observer: '_selectedFileChanged'
-    },
-    outGraphHierarchy: {
-      type: Object,
-      readOnly: true, //readonly so outsider can't change this via binding
-      notify: true
-    },
-    outGraph: {
-      type: Object,
-      readOnly: true, //readonly so outsider can't change this via binding
-      notify: true
-    },
-    outHierarchyParams: {
-      type: Object,
-      readOnly: true,
-      notify: true
-    },
-    outStats: {
-      type: Object,
-      readOnly: true, // This property produces data.
-      notify: true
-    }
-  },
-  observers: [
-    '_selectedDatasetChanged(selectedDataset, datasets)',
-    '_readAndParseMetadata(selectedMetadataTag)'
-  ],
-  _readAndParseMetadata: function(metadataIndex) {
-    if (metadataIndex == -1 || this.datasets[this.selectedDataset] == null ||
-        this.datasets[this.selectedDataset].runMetadata == null ||
-        this.datasets[this.selectedDataset].runMetadata[metadataIndex] == null) {
-      this._setOutStats(null);
-      return;
-    }
-    var path = this.datasets[this.selectedDataset].runMetadata[metadataIndex].path;
-    // Reset the progress bar to 0.
-    this.set('progress', {
-      value: 0,
-      msg: ''
-    });
-    var tracker = tf.graph.util.getTracker(this);
-    tf.graph.parser.fetchAndParseMetadata(path, tracker)
-    .then(function(stats) {
-      this._setOutStats(stats);
-    }.bind(this));
-  },
-  _parseAndConstructHierarchicalGraph: function(path, pbTxtFile) {
-    // Reset the progress bar to 0.
-    this.set('progress', {
-      value: 0,
-      msg: ''
-    });
-    var tracker = tf.graph.util.getTracker(this);
-    var hierarchyParams = {
-      verifyTemplate: true,
-      // If a set of numbered op nodes has at least this number of nodes
-      // then group them into a series node.
-      seriesNodeMinSize: 5,
-      // A map of series node names to series grouping settings, to indicate
-      // if a series is to be rendered as grouped or ungrouped.
-      // Starts out empty which allows the renderer to decide which series
-      // are initially rendered grouped and which aren't.
-      seriesMap: {},
-    };
-    this._setOutHierarchyParams(hierarchyParams);
-    var dataTracker = tf.graph.util.getSubtaskTracker(tracker, 30, 'Data');
-    tf.graph.parser.fetchAndParseGraphData(path, pbTxtFile, dataTracker)
-    .then(function(graph) {
-      if (!graph) {
-        throw 'The graph is empty. Make sure that the graph is passed to the ' +
-            'SummaryWriter after the graph is defined.';
-      }
-
-      // Build the flat graph (consists only of Op nodes).
-
-      // This is the whitelist of inputs on op types that are considered
-      // reference edges. "Assign 0" indicates that the first input to
-      // an OpNode with operation type "Assign" is a reference edge.
-      var refEdges = {};
-      refEdges["Assign 0"] = true;
-      refEdges["AssignAdd 0"] = true;
-      refEdges["AssignSub 0"] = true;
-      refEdges["assign 0"] = true;
-      refEdges["assign_add 0"] = true;
-      refEdges["assign_sub 0"] = true;
-      refEdges["count_up_to 0"] = true;
-      refEdges["ScatterAdd 0"] = true;
-      refEdges["ScatterSub 0"] = true;
-      refEdges["ScatterUpdate 0"] = true;
-      refEdges["scatter_add 0"] = true;
-      refEdges["scatter_sub 0"] = true;
-      refEdges["scatter_update 0"] = true;
-      var buildParams = {
-        enableEmbedding: true,
-        inEmbeddingTypes: ['Const'],
-        outEmbeddingTypes: ['^[a-zA-Z]+Summary$'],
-        refEdges: refEdges
-      };
-      var graphTracker = tf.graph.util.getSubtaskTracker(tracker, 20, 'Graph');
-      return tf.graph.build(graph, buildParams, graphTracker);
-    })
-    .then(function(graph) {
-      this._setOutGraph(graph);
-      var hierarchyTracker = tf.graph.util.getSubtaskTracker(tracker, 50,
-          'Namespace hierarchy');
-      return tf.graph.hierarchy.build(graph, hierarchyParams, hierarchyTracker);
-    }.bind(this))
-    .then(function(graphHierarchy) {
-      // Update the properties which notify the parent with the
-      // graph hierarchy and whether the data has live stats or not.
-      this._setOutGraphHierarchy(graphHierarchy);
-    }.bind(this))
-    .catch(function(e) {
-      // Generic error catch, for errors that happened outside
-      // asynchronous tasks.
-      tracker.reportError("Graph visualization failed: " + e, e);
-    });
-  },
-  _selectedDatasetChanged: function(datasetIndex, datasets) {
-    this._parseAndConstructHierarchicalGraph(datasets[datasetIndex].path);
-  },
-  _selectedFileChanged: function(e) {
-    if (!e) {
-      return;
-    }
-    var file = e.target.files[0];
-    if (!file) {
-      return;
-    }
-
-    // Clear out the value of the file chooser. This ensures that if the user
-    // selects the same file, we'll re-read it.
-    e.target.value = '';
-
-    this._parseAndConstructHierarchicalGraph(null, file);
-  }
-});
-</script>
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/BUILD b/tensorflow/tensorboard/components/tf_histogram_dashboard/BUILD
deleted file mode 100644
index e510e4b467195aeaf5d38bb512dff56a22736c2d..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_histogram_dashboard/BUILD
+++ /dev/null
@@ -1,41 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_histogram_dashboard",
-    srcs = ["tf-histogram-dashboard.html"],
-    path = "/tf-histogram-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_color_scale",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "//tensorflow/tensorboard/components/vz_histogram_timeseries",
-        "@org_polymer_iron_collapse",
-        "@org_polymer_paper_icon_button",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"] + glob(["data/**"]),
-    path = "/tf-histogram-dashboard",
-    deps = [
-        ":tf_histogram_dashboard",
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/data/histograms_run_run1_tag_histo1.json b/tensorflow/tensorboard/components/tf_histogram_dashboard/data/histograms_run_run1_tag_histo1.json
deleted file mode 100644
index a5600a356e8277e58be3b2891c3e328d058b5d08..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_histogram_dashboard/data/histograms_run_run1_tag_histo1.json
+++ /dev/null
@@ -1 +0,0 @@
-[[400.0, 40, [-0.3584790755077172, 3.0267252195784047, 20.0, 24.012225532303315, 48.29045006426564, [-0.35363819004775493, -0.29226296698161564, -0.19961953895336082, 0.3214892636797772, 0.5177616740489182, 0.56953784145381, 0.6264916255991911, 0.7580548669750213, 0.8338603536725235, 1.220854943811942, 1.3429404381931362, 1.47723448201245, 1.624957930213695, 1.7874537232350647, 1.9661990955585713, 2.379100905625872, 2.6170109961884593, 3.1665833053880363], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0]]]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/data/histograms_run_run2_tag_histo1.json b/tensorflow/tensorboard/components/tf_histogram_dashboard/data/histograms_run_run2_tag_histo1.json
deleted file mode 100644
index 407c375d2fc710e70408a3238df3a6165e964e84..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_histogram_dashboard/data/histograms_run_run2_tag_histo1.json
+++ /dev/null
@@ -1 +0,0 @@
-[[400.0, 40, [-2.599286228987632, 3.5098048900144323, 20.0, 10.792285491200078, 66.66796979177158, [-2.379100905625872, -1.9661990955585713, -1.624957930213695, -1.47723448201245, -1.109868130738129, -1.0089710279437536, -0.42790220995778355, -0.2195814928486969, 0.47069243095356195, 0.7580548669750213, 0.917246389039776, 1.3429404381931362, 1.624957930213695, 1.7874537232350647, 2.1628190051144287, 2.6170109961884593, 2.8787120958073054, 3.8315657995195243], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0]]]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/data/histograms_run_run2_tag_histo2.json b/tensorflow/tensorboard/components/tf_histogram_dashboard/data/histograms_run_run2_tag_histo2.json
deleted file mode 100644
index 752b621ab032f24805574708e1659c7139a701a8..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_histogram_dashboard/data/histograms_run_run2_tag_histo2.json
+++ /dev/null
@@ -1 +0,0 @@
-[[400.0, 40, [-0.8286852465281818, 2.0954239138728523, 20.0, 13.546880465642861, 24.14836803774091, [-0.7580548669750213, -0.38900200905253046, -0.06996543062044111, 0.07696197368248522, 0.19961953895336082, 0.2656936063469233, 0.29226296698161564, 0.5177616740489182, 0.7580548669750213, 0.917246389039776, 1.109868130738129, 1.220854943811942, 1.624957930213695, 2.1628190051144287], [2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 1.0, 2.0, 3.0]]]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/data/logdir b/tensorflow/tensorboard/components/tf_histogram_dashboard/data/logdir
deleted file mode 100644
index b6362b45d777266d6204b23884222a080f789f71..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_histogram_dashboard/data/logdir
+++ /dev/null
@@ -1 +0,0 @@
-{"logdir": "/foo/some/fake/logdir"}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/data/runs.json b/tensorflow/tensorboard/components/tf_histogram_dashboard/data/runs.json
deleted file mode 100644
index cbe657af6b610f0cb3bd8b5f6ccc2b14f4e631e2..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_histogram_dashboard/data/runs.json
+++ /dev/null
@@ -1,4 +0,0 @@
-{
-	"run1": {"histograms": ["histo1"]}, 
-	"run2": {"histograms": ["histo2", "histo1"]}
-}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/index.html b/tensorflow/tensorboard/components/tf_histogram_dashboard/index.html
deleted file mode 100644
index 7f1e2f9ff899962275a1bdc633925a9307e8f061..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_histogram_dashboard/index.html
+++ /dev/null
@@ -1,67 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../iron-demo-helpers/demo-snippet.html">
-<link rel="import" href="../paper-styles/typography.html">
-<link rel="import" href="tf-histogram-dashboard.html">
-
-<title>Distribution Dashboard Demo</title>
-<style>
-  #container {
-    height: 800px;
-    display: block;
-  }
-
-  html, body {
-    margin: 0;
-    padding: 0;
-    font-family: "RobotoDraft","Roboto",sans-serif;
-  }
-
-</style>
-<demo-snippet>
-  <template>
-    <dom-module id="histogram-dash-demo">
-      <template>
-        <tf-histogram-dashboard id="demo" backend="[[backend]]"></tf-histogram-dashboard>
-      </template>
-      <script>
-        import {Backend} from "../../tf-backend/backend";
-        import {createRouter, setRouter} from "../../tf-backend/router";
-
-        Polymer({
-          is: "histogram-dash-demo",
-          properties: {
-            backend: {
-              type: Object,
-              value: function() {
-                return new Backend();
-              },
-            },
-          },
-          created: function() {
-            var router = createRouter("data", true);
-            setRouter(router);
-          },
-        });
-      </script>
-    </dom-module>
-    <histogram-dash-demo id="container"></histogram-dash-demo>
-  </template>
-</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/tf-histogram-dashboard.html b/tensorflow/tensorboard/components/tf_histogram_dashboard/tf-histogram-dashboard.html
deleted file mode 100644
index 1821ce3b6f36a49ab7b6be08cc21ef150e9b3251..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_histogram_dashboard/tf-histogram-dashboard.html
+++ /dev/null
@@ -1,167 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-backend/tf-backend.html">
-<link rel="import" href="../tf-color-scale/tf-color-scale.html">
-<link rel="import" href="../tf-dashboard-common/tf-categorizer.html">
-<link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
-<link rel="import" href="../tf-dashboard-common/tf-panes-helper.html">
-<link rel="import" href="../tf-dashboard-common/tf-option-selector.html">
-<link rel="import" href="../tf-dashboard-common/tf-sidebar-helper.html">
-<link rel="import" href="../tf-imports/lodash.html">
-<link rel="import" href="../vz-histogram-timeseries/vz-histogram-timeseries.html">
-<link rel="import" href="../iron-collapse/iron-collapse.html">
-<link rel="import" href="../paper-icon-button/paper-icon-button.html">
-
-<!--
-tf-histogram-dashboard is a complete frontend that loads runs from a backend,
-and creates chart panes that display data for those runs.
-
-It provides a mode and time property selector, together with the selectors
-provided by tf-sidebar-helper, by which the user can customize how data is
-organized and displayed.
-
-Each chart has a button that can toggle whether it is "selected"; selectedRuns
-charts are larger.
-
-Organizationally, the #plumbing div contains components that have no concrete
-manifestation and just effect data bindings or data loading. The .sidebar div
-contains shared controls provided by tf-sidebar-helper. The .center div
-contains vz-histogram-timeseries embedded inside tf-panes-helper's.
--->
-<dom-module id="tf-histogram-dashboard">
-  <template>
-    <div id="plumbing">
-      <tf-color-scale
-        id="colorScale"
-        runs="[[runs]]"
-        out-color-scale="{{_colorScale}}"
-      ></tf-color-scale>
-    </div>
-
-    <tf-dashboard-layout>
-      <div class="sidebar">
-        <tf-sidebar-helper
-          backend="[[backend]]"
-          categories="{{_categories}}"
-          color-scale="[[_colorScale]]"
-          run2tag="[[run2tag]]"
-          runs="[[runs]]"
-          selected-runs="{{_selectedRuns}}"
-          show-download-links="{{_showDownloadLinks}}"
-          >
-          <div class="sidebar-section">
-            <tf-option-selector
-              id="histogramModeSelector"
-              name="Histogram Mode"
-              selected-id="{{_histogramMode}}"
-              >
-              <paper-button id="overlay">overlay</paper-button>
-              <paper-button id="offset">offset</paper-button>
-            </tf-option-selector>
-          </div>
-          <div class="sidebar-section">
-            <tf-option-selector
-              id="timePropertySelector"
-              name="Offset Time Axis"
-              selected-id="{{_timeProperty}}"
-              >
-              <paper-button id="step">step</paper-button>
-              <paper-button id="relative">relative</paper-button>
-              <paper-button id="wall_time">wall</paper-button>
-            </tf-option-selector>
-          </tf-sidebar-helper>
-       </div>
-      </div>
-
-      <div class="center">
-        <tf-panes-helper
-          categories="[[_categories]]"
-          color-scale="[[_colorScale]]"
-          data-type="[[dataType]]"
-          data-provider="[[dataProvider]]"
-          data-not-found="[[dataNotFound]]"
-          run2tag="[[run2tag]]"
-          selected-runs="[[_selectedRuns]]"
-          repeat-for-runs
-          >
-          <template>
-            <vz-histogram-timeseries
-              time-property="[[_timeProperty]]"
-              mode="[[_histogramMode]]"
-              color-scale="[[_colorScaleFunction]]"
-              ></vz-histogram-timeseries>
-          </template>
-        </tf-panes-helper>
-      </div>
-    </tf-dashboard-layout>
-
-    <style include="dashboard-style"></style>
-    <style>
-      tf-panes-helper {
-        --card-expanded-height: 500px;
-        --card-expanded-width: 700px;
-      }
-    </style>
-  </template>
-
-  <script>
-    import {DashboardBehavior} from "../tf-dashboard-common/dashboard-behavior";
-    import {ReloadBehavior} from "../tf-dashboard-common/reload-behavior";
-    import {BackendBehavior} from "../tf-backend/behavior";
-
-    Polymer({
-      is: "tf-histogram-dashboard",
-      factoryImpl: function(backend) {
-        this.backend = backend;
-      },
-      behaviors: [
-        DashboardBehavior("histograms"),
-        ReloadBehavior("tf-chart-scaffold"),
-        BackendBehavior,
-      ],
-      properties: {
-        backend: Object,
-        dataType: {
-          type: String,
-          value: "histogram"
-        },
-        _histogramMode: {
-          type: String,
-          value: "offset"
-        },
-        _timeProperty: {
-          type: String,
-          value: "step"
-        },
-        _colorScaleFunction: {
-          type: Function,
-          computed: "_getColorScaleFunction(_colorScale)"
-        },
-      },
-      attached: function() {
-        this.async(function() {
-          this.fire("rendered");
-        });
-      },
-      _getColorScaleFunction: function() {
-        return this._colorScale.scale.bind(this._colorScale);
-      },
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/BUILD b/tensorflow/tensorboard/components/tf_image_dashboard/BUILD
deleted file mode 100644
index 1e2833f74c5e01997e1a1789d2efa84d75a89e06..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_image_dashboard/BUILD
+++ /dev/null
@@ -1,44 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_image_dashboard",
-    srcs = [
-        "tf-image-dashboard.html",
-        "tf-image-loader.html",
-    ],
-    path = "/tf-image-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_color_scale",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "@org_polymer_paper_dialog",
-        "@org_polymer_paper_icon_button",
-        "@org_polymer_paper_slider",
-        "@org_polymer_paper_spinner",
-    ],
-)
-
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"] + glob(["data/**"]),
-    path = "/tf-image-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_image_dashboard",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/data/images_run_run1_tag_im1_2Fimage_2F0.json b/tensorflow/tensorboard/components/tf_image_dashboard/data/images_run_run1_tag_im1_2Fimage_2F0.json
deleted file mode 100644
index 3dec43221348810a9447e385ea3d17e12ce58bcf..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_image_dashboard/data/images_run_run1_tag_im1_2Fimage_2F0.json
+++ /dev/null
@@ -1,9 +0,0 @@
-[
-  {
-    "wall_time":1459200389.088045,
-    "width":4,
-    "height":4,
-    "step":0,
-    "query":"tag=im1%2Fimage%2F0&index=0&run=run1"
-  }
-]
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/data/images_run_run1_tag_im2_2Fimage_2F0.json b/tensorflow/tensorboard/components/tf_image_dashboard/data/images_run_run1_tag_im2_2Fimage_2F0.json
deleted file mode 100644
index 16152b8626a3260227b4aad8deadf24306d8c4ba..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_image_dashboard/data/images_run_run1_tag_im2_2Fimage_2F0.json
+++ /dev/null
@@ -1,9 +0,0 @@
-[
-  {
-    "wall_time":1459200389.093653,
-    "width":4,
-    "height":4,
-    "step":0,
-    "query":"tag=im2%2Fimage%2F0&index=0&run=run1"
-  }
-]
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/data/images_run_run2_tag_im1_2Fimage_2F0.json b/tensorflow/tensorboard/components/tf_image_dashboard/data/images_run_run2_tag_im1_2Fimage_2F0.json
deleted file mode 100644
index a717b79c5def825bf7c7eec229e2f1a85971fc9b..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_image_dashboard/data/images_run_run2_tag_im1_2Fimage_2F0.json
+++ /dev/null
@@ -1,9 +0,0 @@
-[
-  {
-    "wall_time":1459200389.117463,
-    "width":4,
-    "height":4,
-    "step":0,
-    "query":"tag=im1%2Fimage%2F0&index=0&run=run2"
-  }
-]
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run1.png b/tensorflow/tensorboard/components/tf_image_dashboard/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run1.png
deleted file mode 100644
index 346fd0076be28b9338152c4d49a32fc5ed685e44..0000000000000000000000000000000000000000
Binary files a/tensorflow/tensorboard/components/tf_image_dashboard/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run1.png and /dev/null differ
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run2.png b/tensorflow/tensorboard/components/tf_image_dashboard/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run2.png
deleted file mode 100644
index 26d2d10acaf8511efeb03169853092d09252215b..0000000000000000000000000000000000000000
Binary files a/tensorflow/tensorboard/components/tf_image_dashboard/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run2.png and /dev/null differ
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/data/individualImage_tag_im2_2Fimage_2F0_index_0_run_run1.png b/tensorflow/tensorboard/components/tf_image_dashboard/data/individualImage_tag_im2_2Fimage_2F0_index_0_run_run1.png
deleted file mode 100644
index 6c4190629429e0929962c4f20bd1a1602620e4bd..0000000000000000000000000000000000000000
Binary files a/tensorflow/tensorboard/components/tf_image_dashboard/data/individualImage_tag_im2_2Fimage_2F0_index_0_run_run1.png and /dev/null differ
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/data/logdir b/tensorflow/tensorboard/components/tf_image_dashboard/data/logdir
deleted file mode 100644
index c7d82022cc061502c5991a22e72c214918a9f87b..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_image_dashboard/data/logdir
+++ /dev/null
@@ -1 +0,0 @@
-{"logdir": "/some/fake/logdir"}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/data/runs.json b/tensorflow/tensorboard/components/tf_image_dashboard/data/runs.json
deleted file mode 100644
index b75de5b6614a77e9f0e13ea6ab134f01413668ad..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_image_dashboard/data/runs.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-   "run1":{
-      "images":[
-         "im1/image/0",
-         "im2/image/0"
-      ]
-   },
-   "run2":{
-      "images":[
-         "im1/image/0"
-      ]
-   }
-}
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/index.html b/tensorflow/tensorboard/components/tf_image_dashboard/index.html
deleted file mode 100644
index 27a31d5ad50b99be1190caab887959781273a861..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_image_dashboard/index.html
+++ /dev/null
@@ -1,72 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-  <head>
-    <script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
-    <link rel="import" href="../iron-demo-helpers/demo-snippet.html">
-    <link rel="import" href="tf-image-dashboard.html">
-    <title>Image Dashboard Demo</title>
-    <style>
-      #container{
-        width: 1000px;
-        height: 800px;
-        border: 2px solid grey;
-      }
-      html,body {
-        height: 100%;
-      }
-    </style>
-  </head>
-  <body>
-    <demo-snippet>
-      <template>
-        <dom-module id="image-dash-demo">
-          <template>
-            <tf-image-dashboard id="demo" backend="[[backend]]">
-            </tf-image-dashboard>
-          </template>
-          <script>
-            import {Backend} from "../../tf-backend/backend";
-            import {createRouter, setRouter} from "../../tf-backend/router";
-
-            Polymer({
-              is: "image-dash-demo",
-              properties: {
-                backend: {
-                  type: Object,
-                  value: function() {
-                    return new Backend();
-                  },
-                },
-              },
-              created: function() {
-                var path = "data";
-                var router = createRouter(path, true);
-                setRouter(router);
-              },
-            });
-          </script>
-        </dom-module>
-        <div id="container">
-          <image-dash-demo></image-dash-demo>
-        </div>
-      </template>
-    </demo-snippet>
-  </body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/tf-image-dashboard.html b/tensorflow/tensorboard/components/tf_image_dashboard/tf-image-dashboard.html
deleted file mode 100644
index 5d46847eb8870e0d335aaaf5aae0145b1593f419..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_image_dashboard/tf-image-dashboard.html
+++ /dev/null
@@ -1,160 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-dialog/paper-dialog.html">
-<link rel="import" href="../paper-icon-button/paper-icon-button.html">
-<link rel="import" href="../tf-backend/tf-backend.html">
-<link rel="import" href="../tf-color-scale/tf-color-scale.html">
-<link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
-<link rel="import" href="../tf-dashboard-common/tf-panes-helper.html">
-<link rel="import" href="../tf-dashboard-common/tf-sidebar-helper.html">
-<link rel="import" href="tf-image-loader.html">
-
-<!--
-tf-image-dashboard displays a dashboard that loads images from a TensorFlow run.
--->
-<dom-module id="tf-image-dashboard">
-  <template>
-    <paper-dialog with-backdrop id="actual-image-size-dialog"></paper-dialog>
-    <div id="plumbing">
-      <tf-color-scale
-        id="colorScale"
-        runs="[[runs]]"
-        out-color-scale="{{_colorScale}}"
-        ></tf-color-scale>
-    </div>
-
-    <tf-dashboard-layout>
-      <div class="sidebar">
-        <tf-sidebar-helper
-          backend="[[backend]]"
-          categories="{{_categories}}"
-          color-scale="[[_colorScale]]"
-          run2tag="[[run2tag]]"
-          runs="[[runs]]"
-          selected-runs="{{_selectedRuns}}"
-          >
-        </tf-sidebar-helper>
-      </div>
-      <div class="center">
-        <tf-panes-helper
-          categories="[[_categories]]"
-          color-scale="[[_colorScale]]"
-          data-type="[[dataType]]"
-          data-provider="[[dataProvider]]"
-          data-not-found="[[dataNotFound]]"
-          run2tag="[[run2tag]]"
-          selected-runs="[[_selectedRuns]]"
-          repeat-for-runs
-          >
-          <template>
-            <tf-image-loader color-scale="[[_colorScale]]"></tf-image-loader>
-            <paper-icon-button
-              class="actual-size-button"
-              icon="aspect-ratio"
-              on-tap="_showActualSize"
-              title="Show the image at its true pixel size"
-              ></paper-icon-button>
-          </template>
-        </tf-panes-helper>
-      </div>
-    </tf-dashboard-layout>
-    <style include="dashboard-style"></style>
-    <style>
-      tf-panes-helper {
-        --card-width: 340px;
-        --card-height: auto;
-        --card-expanded-width: 700px;
-        --card-expanded-height: auto;
-      }
-
-      .actual-size-button {
-        background: #fff;
-        border-radius: 100%;
-        bottom: -35px;
-        color: #2196f3;
-        height: 32px;
-        left: 35px;
-        padding: 4px;
-        pointer-events: auto;
-        position: absolute;
-        width: 32px;
-      }
-
-      .actual-size-button-selected {
-        background: var(--tb-ui-light-accent);
-      }
-
-      #actual-image-size-dialog {
-        overflow: auto;
-      }
-    </style>
-  </template>
-  <script>
-    import {DashboardBehavior} from "../tf-dashboard-common/dashboard-behavior";
-    import {ReloadBehavior} from "../tf-dashboard-common/reload-behavior";
-    import {BackendBehavior} from "../tf-backend/behavior";
-
-    Polymer({
-      is: "tf-image-dashboard",
-      factoryImpl: function(backend) {
-        this.backend = backend;
-      },
-      properties: {
-        backend: Object,
-        dataType: {
-          type: String,
-          value: "image"
-        },
-      },
-      behaviors: [
-        DashboardBehavior("images"),
-        ReloadBehavior("tf-chart-scaffold"),
-        BackendBehavior,
-      ],
-      attached: function() {
-        this.async(function() {
-          this.fire("rendered");
-        });
-      },
-      _showActualSize: function(e) {
-        var currentTarget = Polymer.dom(e.currentTarget);
-        var card = currentTarget.node.closest('.card');
-
-        // Create a full-size copy of the image.
-        var newImage = card.querySelector('#img').cloneNode();
-        newImage.style.height = 'auto';
-        newImage.style.width = 'auto';
-        newImage.style.margin = 0;
-        newImage.style.padding = 0;
-        newImage.classList.add("actual-size-image");
-
-        // When the user clicks on the image, empty and close the dialog.
-        var dialog = this.$$('#actual-image-size-dialog');
-        newImage.addEventListener('click', function() {
-          dialog.close();
-        });
-
-        // Update dialog content. Show the dialog.
-        dialog.innerHTML = '';
-        dialog.appendChild(newImage);
-        dialog.open();
-      }
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/tf-image-loader.html b/tensorflow/tensorboard/components/tf_image_dashboard/tf-image-loader.html
deleted file mode 100644
index 41fb12eefa78e2c304d47ba8c8b099cfcde7bd83..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_image_dashboard/tf-image-loader.html
+++ /dev/null
@@ -1,234 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-slider/paper-slider.html">
-<link rel="import" href="../paper-spinner/paper-spinner-lite.html">
-<link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
-<link rel="import" href="../tf-imports/lodash.html">
-<link rel="import" href="../tf-imports/d3.html">
-
-<!--
-tf-image-loader loads an individual image from the TensorBoard backend.
-
-Right now it always loads the most recent image. We should add support in the
-future for loading older images.
--->
-<dom-module id="tf-image-loader">
-  <template>
-    <div id="image-annotation">
-      <template is="dom-if" if="[[_hasAtLeastOneStep]]">
-        step
-        <span class="step-value">
-          [[_stepValue]]
-        </span>
-        <template is="dom-if" if="[[_currentWallTime]]">
-          ([[_currentWallTime]])
-        </template>
-        <paper-spinner-lite active hidden$=[[!_isImageLoading]]></paper-spinner-lite>
-      </template>
-      <template is="dom-if" if="[[_hasMultipleSteps]]">
-        <paper-slider
-          id="steps"
-          immediate-value="{{_stepIndex}}"
-          max="[[_maxStepIndex]]"
-          max-markers="[[_maxStepIndex]]"
-          snaps
-          step="1"
-          value="{{_stepIndex}}"></paper-slider>
-      </template>
-    </div>
-
-    <div id="main-image-container"></div>
-
-    <style>
-      :host {
-        display: block;
-        width: 100%;
-        height: auto;
-        position: relative;
-        --step-slider-knob-color: #424242;
-      }
-
-      #image-annotation {
-        border-left: 4px solid;
-        padding-left: 5px;
-        font-size: 12px;
-        margin: -10px 0 10px 0;
-      }
-
-      #image-annotation .step-value {
-        font-weight: bold;
-      }
-
-      #image-annotation paper-spinner-lite {
-        width: 14px;
-        height: 14px;
-        vertical-align: text-bottom;
-        --paper-spinner-color: var(--tb-orange-strong)
-      }
-
-      #steps {
-        height: 15px;
-        margin: 0 0 0 -15px;
-        /* 31 comes from adding a padding of 15px from both sides of the paper-slider, subtracting
-         * 1px so that the slider width aligns with the image (the last slider marker takes up 1px),
-         * and adding 2px to account for a border of 1px on both sides of the image. 30 - 1 + 2. */
-        width: calc(100% + 31px);
-        --paper-slider-active-color: var(--step-slider-knob-color);
-        --paper-slider-knob-color: var(--step-slider-knob-color);
-        --paper-slider-pin-color: var(--step-slider-knob-color);
-        --paper-slider-knob-start-color: var(--step-slider-knob-color);
-        --paper-slider-knob-start-border-color: var(--step-slider-knob-color);
-        --paper-slider-pin-start-color: var(--step-slider-knob-color);
-      }
-
-      #main-image-container img {
-        border: 1px solid #f5f5f5;
-        image-rendering: -moz-crisp-edges;
-        image-rendering: pixelated;
-        display: block;
-        width: 100%;
-        height: auto;
-        max-height: 500px;
-      }
-    </style>
-  </template>
-  <script>
-    "use strict";
-
-    Polymer({
-      is: "tf-image-loader",
-      properties: {
-        colorScale: Object,
-        run: String,
-        // This is an array of Tensorboard Image&Datum objects (See backend.ts for details). The
-        // properties of objects in this array are
-        // {
-        //   width: number,
-        //   height: number,
-        //   wall_time: Date,
-        //   step: number,
-        //   url: string,
-        // }
-        _steps: {
-          type: Array,
-          value: [],
-          notify: true,
-        },
-        _stepIndex: {
-          type: Number,
-          notify: true,
-        },
-        _hasAtLeastOneStep: {
-          type: Boolean,
-          computed: "_computeHasAtLeastOneStep(_steps)",
-        },
-        _hasMultipleSteps: {
-          type: Boolean,
-          computed: "_computeHasMultipleSteps(_steps)",
-        },
-        _stepValue: {
-          type: Number,
-          computed: "_computeStepValue(_stepIndex)",
-        },
-        _currentWallTime: {
-          type: Number,
-          computed: "_computeCurrentWallTime(_stepIndex)",
-        },
-        _maxStepIndex: {
-          type: Number,
-          computed: "_computeMaxStepIndex(_steps)",
-        },
-        // We use a strictly increasing index to make sure that we don't settle on a stale image.
-        _currentImageLoadIndex: {
-          type: Number,
-          value: 1,
-        },
-        _isImageLoading: {
-          type: Boolean,
-          value: false,
-        },
-      },
-      observers: [
-        "_updateImageUrl(_steps, _stepIndex)",
-      ],
-      redraw: function() {
-        // Other dashboards logic requires a redraw method to be defined. redraw is called at
-        // various places such as when the image is expanded.
-        this.setSeriesData(this.run, this._steps);
-      },
-      setVisibleSeries: function(runs) {
-        // Do nothing.
-      },
-      setSeriesData: function(run, steps) {
-        this.set("run", run);
-        this.set("_steps", steps);
-        this.set("_stepIndex", steps.length - 1);
-
-        // Update the border color based on the run.
-        var color = this.colorScale.scale(run);
-        this.$$("#image-annotation").style.borderColor = color;
-      },
-      _updateImageUrl: function(steps, stepIndex) {
-        // We manually change the image URL (instead of binding to the image's src attribute)
-        // because we would like to manage what happens when the image starts to / finishes loading.
-        if (!steps.length) {
-          return;
-        }
-
-        let img = new Image();
-        img.id = "img"; // '#img' used to select the image in tf-image-dashboard.
-
-        const loadIndex = ++this._currentImageLoadIndex;
-        img.onload = img.onerror = (function() {
-          if (loadIndex != this._currentImageLoadIndex) {
-            // This load is no longer relevant.
-            return;
-          }
-
-          // The new image has finished loading. Remove the old image. Add the new one.
-          let mainImageContainer = this.$$("#main-image-container");
-          mainImageContainer.innerHTML = "";
-          Polymer.dom(mainImageContainer).appendChild(img);
-
-          // The image has finished loading (or has erred and failed to load).
-          this.set("_isImageLoading", false);
-        }).bind(this);
-
-        // Load the new image.
-        this.set("_isImageLoading", true);
-        img.src = steps[stepIndex].url;
-      },
-      _computeHasAtLeastOneStep: function(steps) {
-        return !!steps && steps.length > 0;
-      },
-      _computeHasMultipleSteps: function(steps) {
-        return !!steps && steps.length > 1;
-      },
-      _computeStepValue: function(stepIndex) {
-        return this._steps[stepIndex].step;
-      },
-      _computeCurrentWallTime: function(stepIndex) {
-        return this._steps[stepIndex].wall_time.toString();
-      },
-      _computeMaxStepIndex: function(steps) {
-        return steps.length - 1;
-      },
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_imports/BUILD b/tensorflow/tensorboard/components/tf_imports/BUILD
deleted file mode 100644
index 84b46bf0053c5819c186fd1e8684ae09848db831..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_imports/BUILD
+++ /dev/null
@@ -1,499 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:hacks.bzl", "tensorboard_typescript_bundle")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "webcomponentsjs",
-    srcs = ["@org_definitelytyped//:webcomponents.js.d.ts"],
-    path = "/webcomponentsjs",
-    visibility = ["//visibility:public"],
-    exports = ["@org_polymer_webcomponentsjs"],
-)
-
-ts_web_library(
-    name = "polymer",
-    srcs = ["@org_definitelytyped//:polymer.d.ts"],
-    path = "/polymer",
-    visibility = ["//visibility:public"],
-    exports = ["@org_polymer"],
-    deps = [":webcomponentsjs"],
-)
-
-ts_web_library(
-    name = "lodash",
-    srcs = [
-        "lodash.html",
-        "@org_definitelytyped//:lodash.d.ts",
-    ],
-    path = "/tf-imports",
-    visibility = ["//visibility:public"],
-    deps = ["@com_lodash"],
-)
-
-ts_web_library(
-    name = "threejs",
-    srcs = [
-        "threejs.html",
-        "@org_definitelytyped//:three.d.ts",
-        "@org_threejs//:OrbitControls.js",
-        "@org_threejs//:three.js",
-    ],
-    path = "/tf-imports",
-    visibility = ["//visibility:public"],
-)
-
-ts_web_library(
-    name = "numericjs",
-    srcs = [
-        "numericjs.html",
-        "@com_numericjs//:numeric.js",
-    ],
-    path = "/tf-imports",
-    visibility = ["//visibility:public"],
-)
-
-ts_web_library(
-    name = "weblas",
-    srcs = [
-        "weblas.html",
-        "@io_github_waylonflinn_weblas//:weblas.js",
-    ],
-    path = "/tf-imports",
-    visibility = ["//visibility:public"],
-)
-
-ts_web_library(
-    name = "graphlib",
-    srcs = [
-        "graphlib.html",
-        "@io_github_cpettitt_graphlib//:graphlib.core.js",
-    ],
-    path = "/tf-imports",
-    visibility = ["//visibility:public"],
-    deps = [":lodash"],
-)
-
-ts_web_library(
-    name = "dagre",
-    srcs = [
-        "dagre.html",
-        "@io_github_cpettitt_dagre//:dagre.core.js",
-    ],
-    path = "/tf-imports",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":graphlib",
-        ":lodash",
-    ],
-)
-
-ts_web_library(
-    name = "d3",
-    srcs = [
-        "d3.d.ts",
-        "d3.html",
-        "@org_d3js//:d3.min.js",
-    ],
-    path = "/tf-imports",
-    visibility = ["//visibility:public"],
-)
-
-ts_web_library(
-    name = "plottable",
-    srcs = [
-        "plottable.d.ts",
-        "plottable.html",
-    ],
-    path = "/tf-imports",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":d3",
-        ":plottable_js_css",
-    ],
-)
-
-ts_web_library(
-    name = "plottable_js_css",
-    srcs = [
-        "@com_palantir_plottable//:package/plottable.css",
-        "@com_palantir_plottable//:package/plottable.js",
-    ],
-    path = "/tf-imports",
-    strip_prefix = "package",
-    visibility = ["//visibility:private"],
-)
-
-ts_web_library(
-    name = "web_component_tester",
-    testonly = 1,
-    visibility = ["//visibility:public"],
-    exports = [
-        ":chai_typings",
-        ":mocha_typings",
-        ":sinon_typings",
-        "@org_npmjs_registry_web_component_tester",
-    ],
-)
-
-ts_web_library(
-    name = "chai_typings",
-    testonly = 1,
-    srcs = ["@org_definitelytyped//:chai.d.ts"],
-    path = "/chai",
-    visibility = ["//visibility:private"],
-)
-
-ts_web_library(
-    name = "mocha_typings",
-    testonly = 1,
-    srcs = ["@org_definitelytyped//:mocha.d.ts"],
-    path = "/mocha",
-    visibility = ["//visibility:private"],
-)
-
-ts_web_library(
-    name = "sinon_typings",
-    testonly = 1,
-    srcs = ["@org_definitelytyped//:sinon.d.ts"],
-    path = "/sinonjs",
-    visibility = ["//visibility:private"],
-)
-
-# Generate single TypeScript typings file for d3.js with no ES6 imports.
-#
-# The DefinitelyTyped definition of d3 v4 was written under the assumption that
-# we want to use d3 in a modularized way. We don't want to do that because its
-# import statements use NodeJS namespaces, and the Web Compiler only supports
-# W3C, ECMA, and IETF standards.
-tensorboard_typescript_bundle(
-    name = "d3_typings",
-    out = "d3.d.ts",
-    namespace_srcs = {"d3": [
-        "d3-transition.d.ts",
-        "@org_definitelytyped_types_d3_path//:index.d.ts",
-        "@org_definitelytyped_types_d3_time//:index.d.ts",
-        "@org_definitelytyped_types_d3_dsv//:index.d.ts",
-        "@org_definitelytyped_types_d3_color//:index.d.ts",
-        "@org_definitelytyped_types_d3_selection//:index.d.ts",
-        "@org_definitelytyped_types_d3_shape//:index.d.ts",
-        "@org_definitelytyped_types_d3_scale//:index.d.ts",
-        "@org_definitelytyped_types_d3_request//:index.d.ts",
-        "@org_definitelytyped_types_d3_interpolate//:index.d.ts",
-        "@org_definitelytyped_types_d3_drag//:index.d.ts",
-        "@org_definitelytyped_types_d3_brush//:index.d.ts",
-        "@org_definitelytyped_types_d3_axis//:index.d.ts",
-        "@org_definitelytyped_types_d3_zoom//:index.d.ts",
-        "@org_definitelytyped_types_d3_array//:index.d.ts",
-        "@org_definitelytyped_types_d3_chord//:index.d.ts",
-        "@org_definitelytyped_types_d3_collection//:index.d.ts",
-        "@org_definitelytyped_types_d3_dispatch//:index.d.ts",
-        "@org_definitelytyped_types_d3_ease//:index.d.ts",
-        "@org_definitelytyped_types_d3_force//:index.d.ts",
-        "@org_definitelytyped_types_d3_format//:index.d.ts",
-        "@org_definitelytyped_types_d3_hierarchy//:index.d.ts",
-        "@org_definitelytyped_types_d3_polygon//:index.d.ts",
-        "@org_definitelytyped_types_d3_quadtree//:index.d.ts",
-        "@org_definitelytyped_types_d3_queue//:index.d.ts",
-        "@org_definitelytyped_types_d3_random//:index.d.ts",
-        "@org_definitelytyped_types_d3_timer//:index.d.ts",
-        "@org_definitelytyped_types_d3_voronoi//:index.d.ts",
-    ]},
-)
-
-# It would be nice if Plottable released a .d.ts file for plottable.js like
-# they did for previous versions.
-tensorboard_typescript_bundle(
-    name = "plottable_typings",
-    out = "plottable.d.ts",
-    namespace_srcs = {
-        "Plottable": [
-            "@com_palantir_plottable//:package/build/src/core/dataset.d.ts",
-            "@com_palantir_plottable//:package/build/src/core/interfaces.d.ts",
-            "@com_palantir_plottable//:package/build/src/core/version.d.ts",
-        ],
-        "Plottable.Animators": [
-            "@com_palantir_plottable//:package/build/src/animators/animator.d.ts",
-            "@com_palantir_plottable//:package/build/src/animators/easingAnimator.d.ts",
-            "@com_palantir_plottable//:package/build/src/animators/nullAnimator.d.ts",
-        ],
-        "Plottable.Axes": [
-            "@com_palantir_plottable//:package/build/src/axes/axis.d.ts",
-            "@com_palantir_plottable//:package/build/src/axes/categoryAxis.d.ts",
-            "@com_palantir_plottable//:package/build/src/axes/numericAxis.d.ts",
-            "@com_palantir_plottable//:package/build/src/axes/timeAxis.d.ts",
-        ],
-        "Plottable.Components": [
-            "@com_palantir_plottable//:package/build/src/components/component.d.ts",
-            "@com_palantir_plottable//:package/build/src/components/componentContainer.d.ts",
-            "@com_palantir_plottable//:package/build/src/components/dragBoxLayer.d.ts",
-            "@com_palantir_plottable//:package/build/src/components/dragLineLayer.d.ts",
-            "@com_palantir_plottable//:package/build/src/components/gridlines.d.ts",
-            "@com_palantir_plottable//:package/build/src/components/group.d.ts",
-            "@com_palantir_plottable//:package/build/src/components/guideLineLayer.d.ts",
-            "@com_palantir_plottable//:package/build/src/components/interpolatedColorLegend.d.ts",
-            "@com_palantir_plottable//:package/build/src/components/label.d.ts",
-            "@com_palantir_plottable//:package/build/src/components/legend.d.ts",
-            "@com_palantir_plottable//:package/build/src/components/plotGroup.d.ts",
-            "@com_palantir_plottable//:package/build/src/components/selectionBoxLayer.d.ts",
-            "@com_palantir_plottable//:package/build/src/components/table.d.ts",
-            "@com_palantir_plottable//:package/build/src/components/xDragBoxLayer.d.ts",
-            "@com_palantir_plottable//:package/build/src/components/yDragBoxLayer.d.ts",
-        ],
-        "Plottable.Configs": [
-            "@com_palantir_plottable//:package/build/src/core/config.d.ts",
-        ],
-        "Plottable.Formatters": [
-            "@com_palantir_plottable//:package/build/src/core/formatters.d.ts",
-        ],
-        "Plottable.RenderController": [
-            "@com_palantir_plottable//:package/build/src/core/renderController.d.ts",
-        ],
-        "Plottable.RenderPolicies": [
-            "@com_palantir_plottable//:package/build/src/core/renderPolicy.d.ts",
-        ],
-        "Plottable.SymbolFactories": [
-            "@com_palantir_plottable//:package/build/src/core/symbolFactories.d.ts",
-        ],
-        "Plottable.Dispatchers": [
-            "@com_palantir_plottable//:package/build/src/dispatchers/dispatcher.d.ts",
-            "@com_palantir_plottable//:package/build/src/dispatchers/keyDispatcher.d.ts",
-            "@com_palantir_plottable//:package/build/src/dispatchers/mouseDispatcher.d.ts",
-            "@com_palantir_plottable//:package/build/src/dispatchers/touchDispatcher.d.ts",
-        ],
-        "Plottable.Drawers": [
-            "@com_palantir_plottable//:package/build/src/drawers/arcDrawer.d.ts",
-            "@com_palantir_plottable//:package/build/src/drawers/arcOutlineDrawer.d.ts",
-            "@com_palantir_plottable//:package/build/src/drawers/areaDrawer.d.ts",
-            "@com_palantir_plottable//:package/build/src/drawers/canvasBuffer.d.ts",
-            "@com_palantir_plottable//:package/build/src/drawers/canvasDrawer.d.ts",
-            "@com_palantir_plottable//:package/build/src/drawers/drawStep.d.ts",
-            "@com_palantir_plottable//:package/build/src/drawers/drawer.d.ts",
-            "@com_palantir_plottable//:package/build/src/drawers/lineDrawer.d.ts",
-            "@com_palantir_plottable//:package/build/src/drawers/rectangleDrawer.d.ts",
-            "@com_palantir_plottable//:package/build/src/drawers/segmentDrawer.d.ts",
-            "@com_palantir_plottable//:package/build/src/drawers/svgDrawer.d.ts",
-            "@com_palantir_plottable//:package/build/src/drawers/symbolDrawer.d.ts",
-        ],
-        "Plottable.Interactions": [
-            "@com_palantir_plottable//:package/build/src/interactions/clickInteraction.d.ts",
-            "@com_palantir_plottable//:package/build/src/interactions/dragInteraction.d.ts",
-            "@com_palantir_plottable//:package/build/src/interactions/interaction.d.ts",
-            "@com_palantir_plottable//:package/build/src/interactions/keyInteraction.d.ts",
-            "@com_palantir_plottable//:package/build/src/interactions/panZoomInteraction.d.ts",
-            "@com_palantir_plottable//:package/build/src/interactions/pointerInteraction.d.ts",
-        ],
-        "Plottable.Plots": [
-            "@com_palantir_plottable//:package/build/src/plots/areaPlot.d.ts",
-            "@com_palantir_plottable//:package/build/src/plots/barPlot.d.ts",
-            "@com_palantir_plottable//:package/build/src/plots/clusteredBarPlot.d.ts",
-            "@com_palantir_plottable//:package/build/src/plots/commons.d.ts",
-            "@com_palantir_plottable//:package/build/src/plots/linePlot.d.ts",
-            "@com_palantir_plottable//:package/build/src/plots/piePlot.d.ts",
-            "@com_palantir_plottable//:package/build/src/plots/plot.d.ts",
-            "@com_palantir_plottable//:package/build/src/plots/rectanglePlot.d.ts",
-            "@com_palantir_plottable//:package/build/src/plots/scatterPlot.d.ts",
-            "@com_palantir_plottable//:package/build/src/plots/segmentPlot.d.ts",
-            "@com_palantir_plottable//:package/build/src/plots/stackedAreaPlot.d.ts",
-            "@com_palantir_plottable//:package/build/src/plots/stackedBarPlot.d.ts",
-            "@com_palantir_plottable//:package/build/src/plots/waterfallPlot.d.ts",
-            "@com_palantir_plottable//:package/build/src/plots/xyPlot.d.ts",
-        ],
-        "Plottable.Scales": [
-            "@com_palantir_plottable//:package/build/src/scales/index.d.ts",
-            "@com_palantir_plottable//:package/build/src/scales/categoryScale.d.ts",
-            "@com_palantir_plottable//:package/build/src/scales/colorScale.d.ts",
-            "@com_palantir_plottable//:package/build/src/scales/interpolatedColorScale.d.ts",
-            "@com_palantir_plottable//:package/build/src/scales/linearScale.d.ts",
-            "@com_palantir_plottable//:package/build/src/scales/modifiedLogScale.d.ts",
-            "@com_palantir_plottable//:package/build/src/scales/quantitativeScale.d.ts",
-            "@com_palantir_plottable//:package/build/src/scales/scale.d.ts",
-            "@com_palantir_plottable//:package/build/src/scales/timeScale.d.ts",
-        ],
-        "Plottable.Scales.TickGenerators": [
-            "@com_palantir_plottable//:package/build/src/scales/tickGenerators.d.ts",
-        ],
-        "Plottable.Utils": [
-            "@com_palantir_plottable//:package/build/src/utils/addD3SelectionMulti.d.ts",
-            "@com_palantir_plottable//:package/build/src/utils/bucket.d.ts",
-            "@com_palantir_plottable//:package/build/src/utils/callbackSet.d.ts",
-            "@com_palantir_plottable//:package/build/src/utils/coerceD3.d.ts",
-            "@com_palantir_plottable//:package/build/src/utils/entityStore.d.ts",
-            "@com_palantir_plottable//:package/build/src/utils/makeEnum.d.ts",
-            "@com_palantir_plottable//:package/build/src/utils/map.d.ts",
-            "@com_palantir_plottable//:package/build/src/utils/set.d.ts",
-            "@com_palantir_plottable//:package/build/src/utils/transformAwareTranslator.d.ts",
-        ],
-        "Plottable.Utils.Array": [
-            "@com_palantir_plottable//:package/build/src/utils/arrayUtils.d.ts",
-        ],
-        "Plottable.Utils.Color": [
-            "@com_palantir_plottable//:package/build/src/utils/colorUtils.d.ts",
-        ],
-        "Plottable.Utils.DOM": [
-            "@com_palantir_plottable//:package/build/src/utils/domUtils.d.ts",
-        ],
-        "Plottable.Utils.Math": [
-            "@com_palantir_plottable//:package/build/src/utils/mathUtils.d.ts",
-        ],
-        "Plottable.Utils.Stacking": [
-            "@com_palantir_plottable//:package/build/src/utils/stackingUtils.d.ts",
-        ],
-        "Plottable.Utils.Window": [
-            "@com_palantir_plottable//:package/build/src/utils/windowUtils.d.ts",
-        ],
-    },
-    namespace_symbol_aliases = {
-        "Plottable.Animators": {
-            "AttributeToAppliedProjector": "Plottable.AttributeToAppliedProjector",
-            "SimpleSelection": "Plottable.SimpleSelection",
-        },
-        "Plottable.Axes": {
-            "Component": "Plottable.Components.Component",
-            "Formatter": "Plottable.Formatters.Formatter",
-            "Point": "Plottable.Point",
-            "QuantitativeScale": "Plottable.Scales.QuantitativeScale",
-            "Scale": "Plottable.Scales.Scale",
-            "Scales": "Plottable.Scales",
-            "SimpleSelection": "Plottable.SimpleSelection",
-            "SpaceRequest": "Plottable.SpaceRequest",
-        },
-        "Plottable.Components": {
-            "Bounds": "Plottable.Bounds",
-            "Formatter": "Plottable.Formatters.Formatter",
-            "IEntity": "Plottable.IEntity",
-            "Interactions": "Plottable.Interactions",
-            "Plots": "Plottable.Plots",
-            "Point": "Plottable.Point",
-            "QuantitativeScale": "Plottable.Scales.QuantitativeScale",
-            "Scales": "Plottable.Scales",
-            "SimpleSelection": "Plottable.SimpleSelection",
-            "SpaceRequest": "Plottable.SpaceRequest",
-            "SymbolFactory": "Plottable.SymbolFactories.SymbolFactory",
-        },
-        "Plottable.RenderController": {
-            "Component": "Plottable.Components.Component",
-            "RenderPolicies": "Plottable.RenderPolicies",
-        },
-        "Plottable.SymbolFactories": {
-            "d3Shape": "d3",
-        },
-        "Plottable.Dispatchers": {
-            "Component": "Plottable.Components.Component",
-            "Dispatchers": "Plottable.Dispatchers",
-            "Point": "Plottable.Point",
-        },
-        "Plottable.Drawers": {
-            "AttributeToAppliedProjector": "Plottable.AttributeToAppliedProjector",
-            "AttributeToProjector": "Plottable.AttributeToProjector",
-            "Dataset": "Plottable.Dataset",
-            "IAccessor": "Plottable.IAccessor",
-            "IAnimator": "Plottable.Animators.IAnimator",
-            "SimpleSelection": "Plottable.SimpleSelection",
-            "SymbolFactory": "Plottable.SymbolFactories.SymbolFactory",
-        },
-        "Plottable.Interactions": {
-            "Component": "Plottable.Components.Component",
-            "Point": "Plottable.Point",
-            "TransformableScale": "Plottable.Scales.TransformableScale",
-        },
-        "Plottable.Plots": {
-            "AppliedDrawStep": "Plottable.Drawers.AppliedDrawStep",
-            "AttributeToProjector": "Plottable.AttributeToProjector",
-            "Bounds": "Plottable.Bounds",
-            "Component": "Plottable.Components.Component",
-            "Dataset": "Plottable.Dataset",
-            "DrawStep": "Plottable.Drawers.DrawStep",
-            "Drawers": "Plottable.Drawers",
-            "Formatter": "Plottable.Formatters.Formatter",
-            "IAccessor": "Plottable.IAccessor",
-            "IAnimator": "Plottable.Animators.IAnimator",
-            "IDrawer": "Plottable.Drawers.IDrawer",
-            "IEntity": "Plottable.IEntity",
-            "IScaleCallback": "Plottable.Scales.IScaleCallback",
-            "Plots": "Plottable.Plots",
-            "Point": "Plottable.Point",
-            "Projector": "Plottable.Projector",
-            "ProxyDrawer": "Plottable.Drawers.ProxyDrawer",
-            "QuantitativeScale": "Plottable.Scales.QuantitativeScale",
-            "Range": "Plottable.Range",
-            "Scale": "Plottable.Scales.Scale",
-            "SimpleSelection": "Plottable.SimpleSelection",
-            "SymbolFactory": "Plottable.SymbolFactories.SymbolFactory",
-            "TransformableScale": "Plottable.Scales.TransformableScale",
-            "Utils": "Plottable.Utils",
-            "d3Shape": "d3",
-        },
-        "Plottable.Scales": {
-            "Dataset": "Plottable.Dataset",
-            "Scales": "Plottable.Scales",
-        },
-        "Plottable.Scales.TickGenerators": {
-            "QuantitativeScale": "Plottable.Scales.QuantitativeScale",
-        },
-        "Plottable.Utils": {
-            "Bounds": "Plottable.Bounds",
-            "Component": "Plottable.Components.Component",
-            "Dataset": "Plottable.Dataset",
-            "IAccessor": "Plottable.IAccessor",
-            "Point": "Plottable.Point",
-            "Range": "Plottable.Range",
-            "SimpleSelection": "Plottable.SimpleSelection",
-            "Utils": "Plottable.Utils",
-        },
-    },
-    namespace_symbol_aliases_public = {
-        "Plottable": {
-            "Axis": "Plottable.Axes.Axis",
-            "AxisOrientation": "Plottable.Axes.AxisOrientation",
-            "ClickCallback": "Plottable.Interactions.ClickCallback",
-            "Component": "Plottable.Components.Component",
-            "ComponentCallback": "Plottable.Components.ComponentCallback",
-            "ComponentContainer": "Plottable.Components.ComponentContainer",
-            "Dispatcher": "Plottable.Dispatchers.Dispatcher",
-            "DragBoxCallback": "Plottable.Components.DragBoxCallback",
-            "DragCallback": "Plottable.Interactions.DragCallback",
-            "EaseFn": "Plottable.Animators.EaseFn",
-            "EaseName": "Plottable.Animators.EaseName",
-            "Easing": "Plottable.Animators.Easing",
-            "Formatter": "Plottable.Formatters.Formatter",
-            "IAnimator": "Plottable.Animators.IAnimator",
-            "IDragLineCallback": "Plottable.Components.IDragLineCallback",
-            "IDrawer": "Plottable.Drawers.IDrawer",
-            "IResizeHandler": "Plottable.Components.IResizeHandler",
-            "IScaleCallback": "Plottable.Scales.IScaleCallback",
-            "Interaction": "Plottable.Interactions.Interaction",
-            "Key": "Plottable.Interactions.Key",
-            "KeyCallback": "Plottable.Interactions.KeyCallback",
-            "Null": "Plottable.Animators.Null",
-            "Plot": "Plottable.Plots.Plot",
-            "PointerCallback": "Plottable.Interactions.PointerCallback",
-            "ProxyDrawer": "Plottable.Drawers.ProxyDrawer",
-            "QuantitativeScale": "Plottable.Scales.QuantitativeScale",
-            "Renderer": "Plottable.Plots.Renderer",
-            "Scale": "Plottable.Scales.Scale",
-            "SymbolFactory": "Plottable.SymbolFactories.SymbolFactory",
-            "TimeInterval": "Plottable.Axes.TimeInterval",
-            "TransformableScale": "Plottable.Scales.TransformableScale",
-            "XAlignment": "Plottable.Components.XAlignment",
-            "XYPlot": "Plottable.Plots.XYPlot",
-            "YAlignment": "Plottable.Components.YAlignment",
-        },
-    },
-)
-
-# Removes the 'declare module' block inside this file, but keeps its content.
-genrule(
-    name = "kludge_d3_transition",
-    srcs = ["@org_definitelytyped_types_d3_transition//:index.d.ts"],
-    outs = ["d3-transition.d.ts"],
-    cmd = "sed '/^declare module/d' $< | awk '/^}$$/ && !p {p++;next}1' >$@",
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_imports/README.md b/tensorflow/tensorboard/components/tf_imports/README.md
deleted file mode 100644
index b1cabc61b9be000350c165690652ab906f5c1b53..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_imports/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-This file acts as import routers for third party javascript libraries,
-e.g. Plottable and D3.
diff --git a/tensorflow/tensorboard/components/tf_imports/d3.html b/tensorflow/tensorboard/components/tf_imports/d3.html
deleted file mode 100644
index 76ca302709ac0ae6aeef2f34be8f463483ad6099..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_imports/d3.html
+++ /dev/null
@@ -1,50 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<!--
-@license
-d3
-Copyright 2010-2017 Mike Bostock
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
-
-* Neither the name of the author nor the names of contributors may be used to
-  endorse or promote products derived from this software without specific prior
-  written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--->
-
-<script jscomp-nocompile src="d3.min.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports/dagre.html b/tensorflow/tensorboard/components/tf_imports/dagre.html
deleted file mode 100644
index b90dc58e3902482e5f674d2c551879a56afaaa4f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_imports/dagre.html
+++ /dev/null
@@ -1,45 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<!--
-@license
-Dagre
-Copyright (c) 2012-2014 Chris Pettitt
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
--->
-
-<link rel="import" href="lodash.html">
-<link rel="import" href="graphlib.html">
-
-<script jscomp-nocompile src="dagre.core.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports/lodash.html b/tensorflow/tensorboard/components/tf_imports/lodash.html
deleted file mode 100644
index 65ff6a4b032417c3f1cb3d0f8a2f7f1a7ba05339..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_imports/lodash.html
+++ /dev/null
@@ -1,18 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<script jscomp-nocompile src="../lodash/lodash.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports/numericjs.html b/tensorflow/tensorboard/components/tf_imports/numericjs.html
deleted file mode 100644
index 81fa94916885ace15fc3388a9f735d4c04ac537f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_imports/numericjs.html
+++ /dev/null
@@ -1,43 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<!--
-@license
-Numeric Javascript
-Copyright (C) 2011 by Sébastien Loisel
-Copyright (c) 2011 Alberto Santini <albertosantini@gmail.com>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
--->
-
-<script jscomp-suppress src="numeric.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports/plottable.html b/tensorflow/tensorboard/components/tf_imports/plottable.html
deleted file mode 100644
index 77ad544d5a08edb96ac284174bb79113e37b19fb..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_imports/plottable.html
+++ /dev/null
@@ -1,44 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<!--
-@license
-Plottable.js
-Copyright (c) 2014-2017 Palantir Technologies, Inc.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
--->
-
-<link rel="import" href="d3.html">
-<script jscomp-suppress src="plottable.js"></script>
-<link rel="stylesheet" href="plottable.css">
diff --git a/tensorflow/tensorboard/components/tf_imports/threejs.html b/tensorflow/tensorboard/components/tf_imports/threejs.html
deleted file mode 100644
index 7f4233b5713d903a6a72ba2bc5327ce1f29db612..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_imports/threejs.html
+++ /dev/null
@@ -1,43 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<!--
-@license
-three.js
-Copyright (c) 2010-2013 three.js authors
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
--->
-
-<script jscomp-suppress src="three.js"></script>
-<script jscomp-suppress src="OrbitControls.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports/weblas.html b/tensorflow/tensorboard/components/tf_imports/weblas.html
deleted file mode 100644
index c07020598fc2d4430641b557cee0ff130f56e94f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_imports/weblas.html
+++ /dev/null
@@ -1,42 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<!--
-@license
-weblas
-Copyright (c) 2015 Waylon Flinn
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
--->
-
-<script jscomp-nocompile src="weblas.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_option_selector/BUILD b/tensorflow/tensorboard/components/tf_option_selector/BUILD
deleted file mode 100644
index 3f7eed25cb1dfb6ff406844ee001ea40e84b9e73..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_option_selector/BUILD
+++ /dev/null
@@ -1,21 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_option_selector",
-    srcs = ["tf-option-selector.html"],
-    path = "/tf-option-selector",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_option_selector/tf-option-selector.html b/tensorflow/tensorboard/components/tf_option_selector/tf-option-selector.html
deleted file mode 100644
index d6fc9d6861ffc6c12098da224c5fea16997e6ff3..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_option_selector/tf-option-selector.html
+++ /dev/null
@@ -1,94 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
-
-<!--
-tf-option-selector is a simple component that has buttons as content and
-provides a "selectedId" property that is one of the IDs of the buttons inside it.
--->
-<dom-module id="tf-option-selector">
-  <template>
-    <div id="wrap">
-      <h3>[[name]]</h3>
-      <div class="content-wrapper"><content></content></div>
-    </div>
-    <style>
-      .content-wrapper ::content > * {
-        width: 30%;
-        font-size: 13px;
-        background: none;
-        margin-top: 10px;
-        color: var(--tb-ui-dark-accent);
-      }
-
-      .content-wrapper ::content :first-of-type {
-        margin-left: 0;
-      }
-
-      .content-wrapper ::content .selected {
-        background-color: var(--tb-ui-dark-accent);
-        color: white!important;
-      }
-
-      h3 {
-        color: var(--paper-grey-800);
-        margin: 0;
-        font-weight: normal;
-        font-size: 14px;
-        margin-bottom: 5px;
-        display: block;
-        pointer-events: none;
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-option-selector",
-      properties: {
-        name: String,
-        selectedId: {
-          type: String,
-          notify: true,
-          observer: '_selectedIdChanged'
-        }
-      },
-      attached: function() {
-        this.async(function() {
-          this.getEffectiveChildren().forEach(function(node) {
-            this.listen(node, 'tap', '_selectTarget');
-          }.bind(this));
-        });
-      },
-      _selectTarget: function(e) {
-        this.selectedId = e.currentTarget.id;
-      },
-      _selectedIdChanged: function() {
-        var selected = this.queryEffectiveChildren('#' + this.selectedId);
-        if (!selected) {
-          return;
-        }
-
-        this.getEffectiveChildren().forEach(function(node) {
-          node.classList.remove("selected");
-        });
-        selected.classList.add("selected");
-      }
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/BUILD b/tensorflow/tensorboard/components/tf_profile_dashboard/BUILD
deleted file mode 100644
index 5d04618a545fdf1bddcb64e61efbeb0fdfe7eb2f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_profile_dashboard/BUILD
+++ /dev/null
@@ -1,25 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_profile_dashboard",
-    srcs = [
-        "tf-profile-dashboard.html",
-    ],
-    path = "/tf-profile-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_graph_controls",
-        "@org_polymer",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/BUILD
deleted file mode 100644
index 3cc20ba352f54684dd6fa2a2f974361f41636125..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/BUILD
+++ /dev/null
@@ -1,24 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"] + glob(["data/**"]),
-    path = "/tf-profile-dashboard/demo",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_profile_dashboard",
-        "//tensorflow/tensorboard/components/tf_trace_viewer:demo",
-        "@org_polymer",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_webcomponentsjs",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/logdir b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/logdir
deleted file mode 100644
index ecaaa8ac758bdac82a5b164127339b9348289abf..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/logdir
+++ /dev/null
@@ -1 +0,0 @@
-{"logdir": "/some/fake/logdir"}
diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/data_run_bar_tag_trace_viewer.json b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/data_run_bar_tag_trace_viewer.json
deleted file mode 100644
index bc1a08b535f12b737c65382e0af98e1cacdf952b..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/data_run_bar_tag_trace_viewer.json
+++ /dev/null
@@ -1,27 +0,0 @@
-{
-  "traceEvents": [
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 826, "ph": "C",
-        "name": "counter", "args": {"value": 10}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 826, "ph": "B",
-        "name": "A long name that doesnt fit but is exceedingly informative",
-        "args": {"name_false": false, "value_true": true}},
-    {"cat": "PERF", "pid": 22630, "ts": 835, "ph": "I", "s": "p",
-        "name": "ProcessWideEvent1", "args": {}}
-  ],
-  "stackFrames": {
-    "1": {
-      "category": "m1",
-      "name": "main"
-    },
-    "7": {
-      "category": "m2",
-      "name": "frame7",
-      "parent": "1"
-    },
-    "8": {
-      "category": "m2",
-      "name": "frame8",
-      "parent": "1"
-    }
-  }
-}
diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/data_run_foo_tag_trace_viewer.json b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/data_run_foo_tag_trace_viewer.json
deleted file mode 100644
index e1d57394e3594b68f73d3e5be56753d32503d80b..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/data_run_foo_tag_trace_viewer.json
+++ /dev/null
@@ -1,105 +0,0 @@
-{
-  "traceEvents": [
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 826, "ph": "C",
-        "name": "counter", "args": {"value": 10}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 826, "ph": "B",
-        "name": "A long name that doesnt fit but is exceedingly informative",
-        "args": {"name_false": false, "value_true": true}},
-    {"cat": "PERF", "pid": 22630, "ts": 835, "ph": "I", "s": "p",
-        "name": "ProcessWideEvent1", "args": {}},
-
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 827, "ph": "B",
-        "name": "Asub with a name that wont fit", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 828, "ph": "E",
-        "name": "Asub", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 829, "ph": "B",
-        "name": "Asub", "args": {}},
-    {"cat": "PREF", "pid": 22630, "tid": 22630, "dur": 15, "ts": 820, "ph": "X",
-        "name": "Long X type", "args": {}, "sf": 7, "esf": 8},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 832, "ph": "E",
-        "name": "Asub", "args": {}},
-    {"cat": "PREF", "pid": 22630, "tid": 22630, "dur": 2, "ts": 818, "ph": "X",
-        "name": "X1", "args": {}},
-    {"cat": "PREF", "pid": 22630, "tid": 22630, "dur": 2, "ts": 818, "ph": "X",
-        "name": "X same ts and dur as X1", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 832, "ph": "C",
-        "name": "counter", "args": {"value": 1}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 833, "ph": "E",
-        "name": "", "args": {}},
-
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 835, "ph": "I",
-        "name": "ThreadLevelI1", "args": {}},
-
-    {"cat": "PERF", "ts": 880, "ph": "I", "s": "g", "name": "GlobalEvent1",
-        "args": {}},
-
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 837, "ph": "I",
-        "name": "ThreadLevelI2", "args": {}},
-
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 839, "ph": "C",
-        "name": "counter", "args": {"value": 5}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 840, "ph": "B",
-        "name": "A not as long a name", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 848, "ph": "E",
-        "name": "A not as long a name", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 848, "ph": "C",
-        "name": "counter", "args": {"value": 1}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 854, "ph": "C",
-        "name": "counter", "args": {"value": 10}},
-
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 850, "ph": "B",
-        "name": "B", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 854, "ph": "E",
-        "name": "B", "args": {}},
-
-    {"cat": "PERF", "pid": 22630, "tid": 22631, "ts": 827, "ph": "B",
-        "name": "A", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22631, "ts": 835, "ph": "I",
-        "name": "ThreadLevelImmediate Three", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22631, "ts": 845, "ph": "I",
-        "name": "ThreadLevelImmediate4", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22631, "ts": 854, "ph": "E",
-        "name": "A", "args": {}},
-
-    {"cat": "PREF", "pid": 22630, "tid": 22630, "ts": 860, "ph": "B",
-        "name": "B/E over X", "args": {}},
-    {"cat": "PREF", "pid": 22630, "tid": 22630, "dur": 10, "ts": 860, "ph": "X",
-        "name": "X", "args": {}},
-    {"cat": "PREF", "pid": 22630, "tid": 22630, "ts": 860, "ph": "B",
-        "name": "B/E under X", "args": {}},
-    {"cat": "PREF", "pid": 22630, "tid": 22630, "ts": 870, "ph": "E",
-        "name": "B/E under X", "args": {}},
-    {"cat": "PREF", "pid": 22630, "tid": 22630, "ts": 870, "ph": "E",
-        "name": "B/E over X", "args": {}},
-
-    {"cat": "SAMPLE", "pid": 22630, "tid": 22631, "ts": 870, "ph": "P",
-        "name": "SampleA", "args": {}},
-    {"cat": "SAMPLE", "pid": 22630, "tid": 22631, "ts": 875, "ph": "P",
-        "name": "SampleB", "args": {}},
-    {"cat": "SAMPLE", "pid": 22630, "tid": 22631, "ts": 878, "ph": "P",
-        "name": "SampleC", "args": {}, "sf": 8},
-
-    {"cat": "__metadata", "pid": 22630, "tid": 22630, "ts": 0, "ph": "M",
-        "name": "thread_name", "args": {"name": "threadA"}},
-    {"cat": "__metadata", "pid": 22630, "tid": 22631, "ts": 0, "ph": "M",
-        "name": "thread_name", "args": {"name": "threadB"}},
-    {"cat": "__metadata", "pid": 22630, "tid": 22632, "ts": 0, "ph": "M",
-        "name": "thread_name", "args": {"name": "threadC"}}
-  ],
-  "stackFrames": {
-    "1": {
-      "category": "m1",
-      "name": "main"
-    },
-    "7": {
-      "category": "m2",
-      "name": "frame7",
-      "parent": "1"
-    },
-    "8": {
-      "category": "m2",
-      "name": "frame8",
-      "parent": "1"
-    }
-  }
-}
diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/tags.json b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/tags.json
deleted file mode 100644
index 12ef5bf8b2e5f4240ee7018ecad6bb2b106fcf4c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/tags.json
+++ /dev/null
@@ -1 +0,0 @@
-{"foo": ["trace_viewer"], "bar": ["unsupported", "trace_viewer"]}
diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/index.html b/tensorflow/tensorboard/components/tf_profile_dashboard/demo/index.html
deleted file mode 100644
index 15064a54f8f563a8031146743bc3596c4c3e0391..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/index.html
+++ /dev/null
@@ -1,75 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-  <head>
-    <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-    <link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-    <link rel="import" href="../tf-profile-dashboard.html">
-    <title>Profile Dashboard Demo</title>
-    <style>
-      #container{
-        height: 800px;
-        border: 2px solid grey;
-      }
-      html, body {
-        margin: 0;
-        padding: 0;
-        height: 100%;
-        font-family: "RobotoDraft","Roboto",sans-serif;
-      }
-    </style>
-  </head>
-  <body>
-    <demo-snippet>
-      <template>
-        <dom-module id="profile-dash-demo">
-          <template>
-            <tf-profile-dashboard
-              backend="[[_backend]]",
-              trace-viewer-base-url="../../tf-trace-viewer/demo.html">
-            </tf-profile-dashboard>
-          </template>
-          <script>
-            import {Backend} from "../../tf-backend/backend";
-            import {createRouter, setRouter} from "../../tf-backend/router";
-
-            Polymer({
-              is: "profile-dash-demo",
-              properties: {
-                _backend: {
-                  type: Object,
-                  value: function() {
-                    return new Backend();
-                  },
-                },
-              },
-              created: function() {
-                var router = createRouter("data", true);
-                setRouter(router);
-              },
-            });
-          </script>
-        </dom-module>
-        <div id="container">
-          <profile-dash-demo></profile-dash-demo>
-        </div>
-      </template>
-    </demo-snippet>
-  </body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/tf-profile-dashboard.html b/tensorflow/tensorboard/components/tf_profile_dashboard/tf-profile-dashboard.html
deleted file mode 100644
index 4028f0e0f064b36583d6422b5306f99a4bc51555..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_profile_dashboard/tf-profile-dashboard.html
+++ /dev/null
@@ -1,222 +0,0 @@
-<!--
-@license
-Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-backend/tf-backend.html">
-<link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
-<link rel="import" href="../tf-graph-controls/tf-graph-controls.html">
-
-<!--
-tf-profile-dashboard displays profiling information for different tools.
-
-In the profile dashboard, a "run" is a profile run and "tag" is a tool name. A
-profile run can have multiple tools that present the performance profile as different visualizations
-(e.g. Catapult TraceViewer).
--->
-
-<dom-module id="tf-profile-dashboard">
-<template>
-<tf-dashboard-layout>
-<div class="sidebar">
-  <div class="allcontrols">
-    <div class="sidebar-section">
-      <div class="title">Runs <span class="counter">([[_datasets.length]])</span></div>
-      <paper-dropdown-menu no-label-float no-animations noink class="run-dropdown">
-        <paper-menu id="select" class="dropdown-content" selected="{{selectedDataset}}">
-          <template is="dom-repeat" items="[[_datasets]]">
-            <paper-item>[[item.name]]</paper-item>
-          </template>
-        </paper-menu>
-      </paper-dropdown-menu>
-    </div>
-    <div class="sidebar-section">
-      <div class="title">Tools <span class="counter">([[_activeTools.length]])</span></div>
-      <paper-dropdown-menu no-label-float no-animations noink class="run-dropdown">
-        <paper-menu id="select" class="dropdown-content" selected="{{selectedTool}}">
-          <template is="dom-repeat" items="[[_activeTools]]">
-            <paper-item>[[item]]</paper-item>
-          </template>
-          <paper-item>None</paper-item>
-        </paper-menu>
-      </paper-dropdown-menu>
-    </div>
-    <div class="sidebar-section"></div>
-  </div>
-</div>
-<div class="center">
-  <template is="dom-if" if="[[_toolIsTraceViewer(_currentTool)]]">
-    <iframe
-      id="tv_iframe"
-      height="100%"
-      width="100%"
-      src="[[_traceDataUrl]]">
-    </iframe>
-  </template>
-</div>
-</tf-dashboard-layout>
-<style include="dashboard-style"></style>
-
-<style>
-  .center {
-    position: relative;
-    height: 100%;
-  }
-  iframe {
-    position: absolute;
-    width: 100%;
-    height: 100%;
-    box-sizing: border-box;
-  }
-</style>
-
-</template>
-<script>
-  "use strict";
-
-  import {DashboardBehavior} from "../tf-dashboard-common/dashboard-behavior";
-  import {ReloadBehavior} from "../tf-dashboard-common/reload-behavior";
-  import {BackendBehavior} from "../tf-backend/behavior";
-  import {getRouter} from '../tf-backend/router';
-
-  Polymer({
-    is: "tf-profile-dashboard",
-    factoryImpl: function(backend) {
-      this.backend = backend;
-    },
-    behaviors: [
-      DashboardBehavior("profile"),
-      ReloadBehavior("tf-profile-dashboard"),
-      BackendBehavior,
-    ],
-    properties: {
-      backend: Object,
-      _isAttached: Boolean,
-      // Whether this dashboard is initialized. This dashboard should only be
-      // initialized once.
-      _initialized: Boolean,
-      // The endpoint that serves trace viewer app.
-      traceViewerBaseUrl: {
-        type: String,
-        value: function() {
-          return getRouter().pluginRoute("profile", "/trace_viewer");
-        },
-      },
-      // The URL for the trace data being display.
-      _traceDataUrl: {
-        type: String,
-        value: "",
-      },
-      _datasets: {
-        type: Array,
-        observer: '_datasetsChanged'
-      },
-      _activeTools: {
-        type: Array,
-        computed: '_getActiveTools(selectedDataset, _datasets)'
-      },
-      selectedDataset: {
-        type: Number,
-        notify: true,
-        value: 0,
-        observer: '_selectedDatasetChanged'
-      },
-      _currentTool: {
-        type: String,
-        computed: '_getCurrentToolName(selectedTool, _activeTools)',
-        notify: true,
-      },
-      selectedTool: {
-        type: Number,
-        notify: true,
-        value: 0,
-      },
-    },
-    reload: function() {
-    },
-    ready: function() {
-    },
-    observers: [
-      '_maybeInitializeDashboard(backend, _isAttached)',
-      '_maybeUpdateTool(_datasets, selectedDataset, _currentTool)',
-    ],
-    attached: function() {
-      this.set('_isAttached', true);
-    },
-    detached: function() {
-      this.set('_isAttached', false);
-    },
-    _maybeInitializeDashboard: function(backend, isAttached) {
-      if (this._initialized || !backend || !isAttached) {
-        // Either this dashboard is already initialized ... or we are not yet
-        // ready to initialize.
-        return;
-      }
-      // Set this to true so we only initialize once.
-      this._initialized = true;
-      backend.profileTags().then((runToTool) => {
-        var datasets = _.map(runToTool, function(tools, run) {
-          return {
-            name: run,
-            activeTools: tools,
-          };
-        }, this);
-        this.set('_datasets', datasets);
-      });
-    },
-    _maybeUpdateTool: function(datasets, selectedDataset, currentTool) {
-      if (currentTool == "trace_viewer") {
-        var trace_data_url = (getRouter().pluginRunTagRoute('profile', '/data')(
-            'trace_viewer', datasets[selectedDataset].name));
-        // Make the trace data url relative to the root.
-        if (trace_data_url[0] != '/') {
-          trace_data_url = '/' + trace_data_url;
-        }
-        this._traceDataUrl = this.traceViewerBaseUrl + "?trace_data_url=" +
-                            encodeURIComponent(trace_data_url);
-        return;
-      }
-    },
-    _datasetsChanged: function(newDatasets, oldDatasets) {
-      if (oldDatasets != null || this.selected == null) {
-        // Select the first dataset by default.
-        this.set('selectedDataset', 0);
-      }
-    },
-    _selectedDatasetChanged: function(newDataset, oldDataset) {
-      if (this._datasets) {
-        // Display the first tool by default when switching to another run.
-        this.set('selectedTool', 0);
-      }
-    },
-    _getCurrentToolName: function(selectedToolIndex, activeTools) {
-      if (selectedToolIndex >= 0 && selectedToolIndex < activeTools.length) {
-        return activeTools[selectedToolIndex];
-      }
-      return null;
-    },
-    _getActiveTools: function(selectedDataset, datasets) {
-      if (datasets && selectedDataset < datasets.length) {
-        return datasets[selectedDataset].activeTools;
-      }
-      return [];
-    },
-    _toolIsTraceViewer: function(toolName) {
-      return toolName == 'trace_viewer';
-    },
-  });
-</script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_runs_selector/BUILD b/tensorflow/tensorboard/components/tf_runs_selector/BUILD
deleted file mode 100644
index 30265c8d29494c82c2c0625f87bc02b125edb4fb..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_runs_selector/BUILD
+++ /dev/null
@@ -1,27 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_runs_selector",
-    srcs = [
-        "tf-runs-selector.html",
-    ],
-    path = "/tf-runs-selector",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "@org_polymer_paper_button",
-        "@org_polymer_paper_dialog",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_runs_selector/tf-runs-selector.html b/tensorflow/tensorboard/components/tf_runs_selector/tf-runs-selector.html
deleted file mode 100644
index 6964bb076de68e26fc61c20f17fbf5953a89a705..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_runs_selector/tf-runs-selector.html
+++ /dev/null
@@ -1,195 +0,0 @@
-<!--
-@license
-Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-button/paper-button.html">
-<link rel="import" href="../paper-dialog/paper-dialog.html">
-<link rel="import" href="../tf-dashboard-common/tf-multi-checkbox.html">
-<link rel="import" href="../tf-dashboard-common/scrollbar-style.html">
-
-<!--
-tf-runs-selector creates a set of checkboxes to display which runs are
-selected. It also displays tooltips.
-
-Properties in: none.
-Properties out:
-- selectedRuns: The array of run names that are currently checked by the user.
-
--->
-<dom-module id="tf-runs-selector">
-  <template>
-    <div id="plumbing">
-      <tf-color-scale
-        runs="[[runs]]"
-        out-color-scale="{{colorScale}}"
-      ></tf-color-scale>
-    </div>
-    <paper-dialog with-backdrop id="logdir-dialog">
-      <h2>logdir</h2>
-      <div inner-h-t-m-l="{{_breakString(logdir)}}"></div>
-    </paper-dialog>
-    <div id="top-text">
-      <h3 id="tooltip-help" class="tooltip-container">
-        Runs<!--
-        - TODO(wchargin): Remove the "new" notice when we remove the old
-        - selector. This is just so that we can tell them apart.
-        -->&#x2003;<span
-          style="text-transform:uppercase;color:red;font-size:smaller"
-        >new</span>
-      </h3>
-    </div>
-    <tf-multi-checkbox
-      id="multiCheckbox"
-      names="[[runs]]"
-      out-selected="{{selectedRuns}}"
-      color-scale="[[colorScale]]"
-    ></tf-multi-checkbox>
-    <paper-button
-      class="x-button"
-      id="toggle-all"
-      on-tap="_toggleAll"
-    >
-    Toggle All Runs
-    </paper-button>
-    <template
-      is="dom-if"
-      if="[[logdir]]">
-      <div id="logdir">
-        <span id="clipped-logdir" inner-h-t-m-l="[[_clippedLogdir]]"></span><!--
-          We use HTML comments to remove spaces before the ellipsis.
-        --><template
-                     is="dom-if"
-                     if="[[_shouldShowExpandLogdirButton(logdir, _logdirClipLength)]]"><!--
-          --><a href="" on-click="_openLogdirDialog">…</a>
-        </template>
-      </div>
-    </template>
-    <style>
-      :host {
-        display: flex;
-        flex-direction: column;
-        padding-bottom: 10px;
-        box-sizing: border-box;
-      }
-      #top-text {
-        width: 100%;
-        flex-grow: 0;
-        flex-shrink: 0;
-        padding-right: 16px;
-        box-sizing: border-box;
-        color: var(--paper-grey-800);
-      }
-      tf-multi-checkbox {
-        display: flex;
-        flex-grow: 1;
-        flex-shrink: 1;
-      }
-      .x-button {
-        font-size: 13px;
-        background-color: var(--tb-ui-light-accent);
-        color: var(--tb-ui-dark-accent);
-      }
-      #tooltip-help {
-        color: var(--paper-grey-800);
-        margin: 0;
-        font-weight: normal;
-        font-size: 14px;
-        margin-bottom: 5px;
-      }
-      paper-button {
-        margin-left: 0;
-      }
-      #logdir {
-        color: var(--tb-ui-dark-accent);
-        font-size: 13px;
-        margin: 5px 0 0 0;
-        max-width: 288px;
-      }
-    </style>
-  </template>
-  <script>
-  import {RequestManager} from "../tf-backend/requestManager";
-  import {getRouter} from "../tf-backend/router";
-  import * as RunsStore from "../tf-backend/runsStore";
-
-  var requestManager = new RequestManager();
-  Polymer({
-    is: "tf-runs-selector",
-    properties: {
-      selectedRuns: {type: Array, notify: true},
-      // runs: an array of strings, representing the run names that may be chosen
-      runs: Array,
-      logdir: {
-        type: String,
-        notify: true,
-      },
-      // This is the potentially clipped portion of the logdir we show at the bottom of the sidebar.
-      _clippedLogdir: {
-        type: String,
-        computed: '_getClippedLogdir(logdir, _logdirClipLength)',
-      },
-      _logdirClipLength: {
-        type: Number,
-        value: 250,
-        readOnly: true,
-      },
-    },
-    ready: function() {
-      var updateRuns = function() {
-        this.set("runs", RunsStore.getRuns());
-      }.bind(this);
-      RunsStore.addListener(updateRuns);
-      updateRuns();
-
-      requestManager.request(getRouter().logdir()).then(logdirObject => {
-        this.set('logdir', logdirObject.logdir);
-      }).catch(e => {
-        // Fetching the logdir failed. Prevent the exception from logging to
-        // console. The console already logs a 404 network event.
-      });
-    },
-    _toggleAll: function() {
-      this.$.multiCheckbox.toggleAll();
-    },
-    // Break the string at natural points, including commas, equals, and slashes
-    _breakString: function(originalString) {
-      return originalString.replace(/([\/=\-_,])/g, "$1<wbr>");
-    },
-    _getClippedLogdir: function(logdir, logdirClipLength) {
-      if (logdir === undefined) {
-        // The logdir has not been set yet.
-        return undefined;
-      }
-
-      if (logdir.length > logdirClipLength) {
-        // Clip the logdir to avoid blocking the runs selector. Let the user view a more full
-        // version of the logdir.
-        return this._breakString(logdir.substring(0, logdirClipLength));
-      } else {
-        return this._breakString(logdir);
-      }
-    },
-    _openLogdirDialog: function(event) {
-      event.preventDefault();
-      this.$$('#logdir-dialog').open();
-    },
-    _shouldShowExpandLogdirButton(logdir, _logdirClipLength) {
-      return logdir && logdir.length > _logdirClipLength;
-    },
-  });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/BUILD b/tensorflow/tensorboard/components/tf_scalar_dashboard/BUILD
deleted file mode 100644
index 7cc192b4640108f1eb29ccfd4f379980b3c34e70..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/BUILD
+++ /dev/null
@@ -1,38 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_scalar_dashboard",
-    srcs = [
-        "tf-scalar-dashboard.html",
-        "tf-smoothing-input.html",
-    ],
-    path = "/tf-scalar-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_color_scale",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "//tensorflow/tensorboard/components/tf_runs_selector",
-        "//tensorflow/tensorboard/components/vz_line_chart",
-        "@org_polymer_iron_collapse",
-        "@org_polymer_paper_checkbox",
-        "@org_polymer_paper_dropdown_menu",
-        "@org_polymer_paper_icon_button",
-        "@org_polymer_paper_input",
-        "@org_polymer_paper_item",
-        "@org_polymer_paper_menu",
-        "@org_polymer_paper_slider",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/BUILD
deleted file mode 100644
index 0e892b1aa30226ca5752c1141e79d46b75108ccc..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/BUILD
+++ /dev/null
@@ -1,27 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"],
-    path = "/tf-scalar-dashboard/demo",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "//tensorflow/tensorboard/components/tf_scalar_dashboard",
-        "//tensorflow/tensorboard/demo:demo_data",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/logdir b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/logdir
deleted file mode 100644
index b6362b45d777266d6204b23884222a080f789f71..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/logdir
+++ /dev/null
@@ -1 +0,0 @@
-{"logdir": "/foo/some/fake/logdir"}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/runs.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/runs.json
deleted file mode 100644
index d45f530763cb786777c5650eecd0ebaf91b9863f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/runs.json
+++ /dev/null
@@ -1,4 +0,0 @@
-{
-  "run1": {"scalars": ["foo/sin", "foo/cos", "foo/square", "bar/square"]},
-  "run2": {"scalars": ["foo/cos", "foo/square", "bar/square"]}
-}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars.json
deleted file mode 100644
index bc269395b68a35f7d4481fca05063e46c79c2859..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars.json
+++ /dev/null
@@ -1 +0,0 @@
-{"run2": {"foo/cos": [[0.0, 0, 2.0], [10.0, 1, 1.0806045532226562], [20.0, 2, -0.832293689250946], [30.0, 3, -1.979984998703003], [40.0, 4, -1.3072872161865234]], "bar/square": [[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]], "foo/square": [[0.0, 0, 0.0], [10.0, 1, 2.0], [20.0, 2, 8.0], [30.0, 3, 18.0], [40.0, 4, 32.0]]}, "run1": {"foo/sin": [[0.0, 0, 0.0], [10.0, 1, 0.8414709568023682], [20.0, 2, 0.9092974066734314], [30.0, 3, 0.14112000167369843], [40.0, 4, -0.756802499294281]], "foo/cos": [[0.0, 0, 1.0], [10.0, 1, 0.5403022766113281], [20.0, 2, -0.416146844625473], [30.0, 3, -0.9899924993515015], [40.0, 4, -0.6536436080932617]], "bar/square": [[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]], "foo/square": [[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]]}}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_bar_2Fsquare.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_bar_2Fsquare.json
deleted file mode 100644
index 6d584fb4a9e1cd0a6a56d3d87b7183f55ac52ba6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_bar_2Fsquare.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fcos.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fcos.json
deleted file mode 100644
index 025eaa16e93110da0c50ad03486786ee6e521700..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fcos.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0.0, 0, 1.0], [10.0, 1, 0.5403022766113281], [20.0, 2, -0.416146844625473], [30.0, 3, -0.9899924993515015], [40.0, 4, -0.6536436080932617]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fsin.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fsin.json
deleted file mode 100644
index eae69dd78f3b5aa75acec6b5daa08720fad9adba..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fsin.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0.0, 0, 0.0], [10.0, 1, 0.8414709568023682], [20.0, 2, 0.9092974066734314], [30.0, 3, 0.14112000167369843], [40.0, 4, -0.756802499294281]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fsquare.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fsquare.json
deleted file mode 100644
index 6d584fb4a9e1cd0a6a56d3d87b7183f55ac52ba6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fsquare.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_bar_2Fsquare.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_bar_2Fsquare.json
deleted file mode 100644
index 6d584fb4a9e1cd0a6a56d3d87b7183f55ac52ba6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_bar_2Fsquare.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_foo_2Fcos.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_foo_2Fcos.json
deleted file mode 100644
index dd3593f9d109e81bef5a10c732a9e08e60b3ef4f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_foo_2Fcos.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0.0, 0, 2.0], [10.0, 1, 1.0806045532226562], [20.0, 2, -0.832293689250946], [30.0, 3, -1.979984998703003], [40.0, 4, -1.3072872161865234]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_foo_2Fsquare.json b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_foo_2Fsquare.json
deleted file mode 100644
index 0ff9ef0551d0a3053ba16b502d0d6148057df660..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_foo_2Fsquare.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0.0, 0, 0.0], [10.0, 1, 2.0], [20.0, 2, 8.0], [30.0, 3, 18.0], [40.0, 4, 32.0]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/index.html b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/index.html
deleted file mode 100644
index 78f657b41040af9694308c14d773556fc5a79170..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/index.html
+++ /dev/null
@@ -1,70 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../../polymer/polymer.html">
-<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-<link rel="import" href="../tf-scalar-dashboard.html">
-<link rel="import" href="../../paper-styles/typography.html">
-<link rel="import" href="../../tf-backend/tf-backend.html">
-
-<title>Scalar Dashboard Demo</title>
-<style>
-  #container {
-    height: 900px;
-    width: 100%;
-    display: block;
-  }
-
-  html, body {
-    margin: 0;
-    padding: 0;
-    font-family: "RobotoDraft","Roboto",sans-serif;
-  }
-
-</style>
-<demo-snippet>
-  <template>
-    <dom-module id="scalar-dash-demo">
-      <template>
-        <tf-scalar-dashboard id="demo" backend="[[backend]]"></tf-scalar-dashboard>
-      </template>
-      <script>
-        import {Backend} from "../tf-backend/backend";
-        import {createRouter, setRouter} from "../tf-backend/router";
-
-        Polymer({
-          is: "scalar-dash-demo",
-          properties: {
-            backend: {
-              type: Object,
-              value: function() {
-                return new Backend();
-              },
-            },
-            created: function() {
-              var router = createRouter("/data", true);
-              setRouter(router);
-            },
-          },
-        });
-      </script>
-    </dom-module>
-    <scalar-dash-demo id="container"></scalar-dash-demo>
-  </template>
-</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/tf-scalar-dashboard.html b/tensorflow/tensorboard/components/tf_scalar_dashboard/tf-scalar-dashboard.html
deleted file mode 100644
index 848ed5292de1804a8276dccdcc5272dc821a6f8b..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/tf-scalar-dashboard.html
+++ /dev/null
@@ -1,293 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="tf-smoothing-input.html">
-<link rel="import" href="../tf-backend/tf-backend.html">
-<link rel="import" href="../tf-color-scale/tf-color-scale.html">
-<link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
-<link rel="import" href="../tf-dashboard-common/tf-option-selector.html">
-<link rel="import" href="../tf-dashboard-common/tf-panes-helper.html">
-<link rel="import" href="../tf-dashboard-common/tf-sidebar-helper.html">
-<link rel="import" href="../tf-runs-selector/tf-runs-selector.html">
-<link rel="import" href="../tf-imports/lodash.html">
-<link rel="import" href="../vz-line-chart/vz-line-chart.html">
-<link rel="import" href="../iron-collapse/iron-collapse.html">
-<link rel="import" href="../paper-dropdown-menu/paper-dropdown-menu.html">
-<link rel="import" href="../paper-icon-button/paper-icon-button.html">
-<link rel="import" href="../paper-menu/paper-menu.html">
-<link rel="import" href="../paper-item/paper-item.html">
-
-<!--
-tf-scalar-dashboard is a complete frontend that loads runs from a backend,
-and creates chart panes that display data for those runs.
-
-It provides a categorizer, run selector, and x type selector, by which the user
-can customize how data is organized and displayed.
-
-Each chart has a button that can toggle whether it is "expanded"; expanded
-charts are larger.
-
-Organizationally, the #plumbing div contains components that have no concrete
-manifestation and just effect data bindings or data loading. The .sidebar div
-contains shared controls provided by tf-sidebar-helper. The .center div
-contains vz-line-charts embedded inside tf-panes-helper's.
--->
-<dom-module id="tf-scalar-dashboard">
-  <template>
-    <div id="plumbing">
-      <tf-color-scale
-        id="colorScale"
-        runs="[[runs]]"
-        out-color-scale="{{_colorScale}}"
-      ></tf-color-scale>
-    </div>
-
-    <tf-dashboard-layout>
-      <div class="sidebar">
-        <div class="sidebar-section">
-          <tf-categorizer
-            id="categorizer"
-            tags="[[tags]]"
-            categories="{{_categories}}"
-          ></tf-categorizer>
-          <div class="line-item">
-            <paper-checkbox
-            id="download-option"
-            checked="{{_showDownloadLinks}}"
-            >Show data download links</paper-checkbox>
-          </div>
-          <div class="line-item">
-            <paper-checkbox
-            id="outliersCheckbox"
-            checked="{{_ignoreYOutliers}}"
-            >Ignore outliers in chart scaling</paper-checkbox>
-          </div>
-          <div id="tooltip-sorting">
-            <div id="tooltip-sorting-label">Tooltip sorting method:</div>
-            <paper-dropdown-menu
-              no-label-float
-              selected-item-label="{{_tooltipSortingMethod}}"
-              >
-              <paper-menu class="dropdown-content" selected="0">
-                <paper-item>default</paper-item>
-                <paper-item>descending</paper-item>
-                <paper-item>ascending</paper-item>
-                <paper-item>nearest</paper-item>
-              </paper-menu>
-            </paper-dropdown-menu>
-          </div>
-        </div>
-        <div class="sidebar-section">
-          <tf-smoothing-input
-            weight="{{_smoothingWeight}}"
-            step="0.001"
-            min="0"
-            max="1"
-            ></tf-smoothing-input>
-        </div>
-        <div class="sidebar-section">
-          <tf-option-selector
-            id="xTypeSelector"
-            name="Horizontal Axis"
-            selected-id="{{_xType}}"
-            >
-            <paper-button id="step">step</paper-button>
-            <paper-button id="relative">relative</paper-button>
-            <paper-button id="wall_time">wall</paper-button>
-          </tf-option-selector>
-        </div>
-        <div class="sidebar-section">
-          <tf-runs-selector
-            id="runs-selector"
-            selected-runs="{{_selectedRuns}}"
-          ></tf-runs-selector>
-        </div>
-      </div>
-      <div class="center">
-        <tf-panes-helper
-          categories="[[_categories]]"
-          color-scale="[[_colorScale]]"
-          data-type="[[dataType]]"
-          data-provider="[[dataProvider]]"
-          data-not-found="[[dataNotFound]]"
-          run2tag="[[run2tag]]"
-          selected-runs="[[_selectedRuns]]"
-          show-download-links="[[_showDownloadLinks]]"
-          download-link-url-function="[[scalarUrl]]"
-        >
-          <template>
-            <vz-line-chart
-              x-type="[[_xType]]"
-              color-scale="[[_colorScale]]"
-              smoothing-enabled="[[_smoothingEnabled]]"
-              smoothing-weight="[[_smoothingWeight]]"
-              tooltip-sorting-method="[[_tooltipSortingMethod]]"
-              ignore-y-outliers="[[_ignoreYOutliers]]"
-              ></vz-line-chart>
-            <paper-icon-button
-              class="log-button"
-              icon="line-weight"
-              on-tap="toggleLogScale"
-              title="Toggle y-axis log scale"
-              ></paper-icon-button>
-          </template>
-        </tf-panes-helper>
-      </div>
-    </tf-dashboard-layout>
-
-    <style include="dashboard-style"></style>
-    <style>
-      .log-button {
-        position: absolute;
-        left: 35px;
-        bottom: -35px;
-        color: #2196F3;
-        background: #fff;
-        width: 32px;
-        height: 32px;
-        padding: 4px;
-        border-radius: 100%;
-      }
-
-      .log-button-selected {
-        background: var(--tb-ui-light-accent);
-      }
-
-      #categorizer {
-        flex-shrink: 0;
-      }
-
-      #runs-selector {
-        flex-shrink: 1;
-        flex-grow: 1;
-      }
-
-      #tooltip-sorting {
-        display: flex;
-        font-size: 14px;
-        margin-top: 5px;
-      }
-
-      #tooltip-sorting-label {
-        margin-top: 13px;
-        margin-left: 28px;
-      }
-
-      #tooltip-sorting paper-dropdown-menu {
-        margin-left: 10px;
-        --paper-input-container-focus-color: var(--tb-orange-strong);
-        width: 105px;
-      }
-      .line-item {
-        display: block;
-        padding-top: 5px;
-      }
-
-      .sidebar-section {
-        border-top: solid 1px rgba(0, 0, 0, 0.12);
-        padding: 20px 0px 20px 30px;
-      }
-
-      /* TODO(wchargin): These styles also exist in dashboard-style, */
-      /* but don't apply due to the namespacing. :-( */
-      .sidebar {
-        display: flex;
-        flex-direction: column;
-        height: 100%;
-      }
-    </style>
-  </template>
-
-  <script>
-    import {DashboardBehavior} from "../tf-dashboard-common/dashboard-behavior";
-    import {ReloadBehavior} from "../tf-dashboard-common/reload-behavior";
-    import {BackendBehavior} from "../tf-backend/behavior";
-    import {getRouter} from "../tf-backend/router";
-    import * as storage from "../tf-storage/storage";
-
-    Polymer({
-      is: "tf-scalar-dashboard",
-      behaviors: [
-        DashboardBehavior("scalars"),
-        ReloadBehavior("tf-chart-scaffold"),
-        BackendBehavior,
-      ],
-      properties: {
-        backend: Object,
-        dataType: {
-          type: String,
-          value: "scalar"
-        },
-        scalarUrl: {
-          type: Function,
-          value: function() {
-            return getRouter().pluginRunTagRoute('scalars', '/scalars');
-          },
-        },
-        _showDownloadLinks: {
-          type: Boolean,
-          notify: true,
-          value: storage.getBooleanInitializer('_showDownloadLinks', false, true),
-          observer: '_showDownloadLinksObserver'
-        },
-        _smoothingWeight: {
-          type: Number,
-          notify: true,
-          value: storage.getNumberInitializer('_smoothingWeight', 0.6),
-          observer: '_smoothingWeightObserver'
-        },
-        _smoothingEnabled: {
-          type: Boolean,
-          computed: '_computeSmoothingEnabled(_smoothingWeight)'
-        },
-        _ignoreYOutliers: {
-          type: Boolean,
-          value: storage.getBooleanInitializer('_ignoreYOutliers', true, true),
-          observer: '_ignoreYOutliersObserver',
-        },
-        _xType: {
-          type: String,
-          value: "step"
-        },
-      },
-      attached: function() {
-        this.async(function() {
-          this.fire("rendered");
-        });
-      },
-      _showDownloadLinksObserver: storage.getBooleanObserver(
-          '_showDownloadLinks', /*default=*/ false, /*useLocalStorage=*/ true),
-      _smoothingWeightObserver: storage.getNumberObserver(
-          '_smoothingWeight', 0.6),
-      _ignoreYOutliersObserver: storage.getBooleanObserver(
-          '_ignoreYOutliers', /*default=*/ true, /*useLocalStorage=*/true),
-      _computeSmoothingEnabled: function(_smoothingWeight) {
-        return _smoothingWeight > 0;
-      },
-      toggleLogScale: function(e) {
-        var currentTarget = Polymer.dom(e.currentTarget);
-        var button = currentTarget.parentNode.querySelector('.log-button');
-        var chart = currentTarget.parentNode.querySelector('vz-line-chart');
-
-        button.classList.toggle("log-button-selected");
-        chart.yScaleType = chart.yScaleType === 'log' ? 'linear' : 'log';
-        chart.redraw();
-      },
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/tf-smoothing-input.html b/tensorflow/tensorboard/components/tf_scalar_dashboard/tf-smoothing-input.html
deleted file mode 100644
index a0760330001310e3afee0f060b563c11d063ab65..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/tf-smoothing-input.html
+++ /dev/null
@@ -1,138 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-slider/paper-slider.html">
-<link rel="import" href="../paper-input/paper-input.html">
-<link rel="import" href="../paper-checkbox/paper-checkbox.html">
-<link rel="import" href="../tf-imports/lodash.html">
-
-<!--
-tf-smoothing-input creates an input component for exponential smoothing.
--->
-<dom-module id="tf-smoothing-input">
-  <template>
-    <h3 class="title">Smoothing</h3>
-    <div class="smoothing-block">
-      <paper-slider
-        id="slider"
-        value="{{weight}}"
-        immediate-value="{{_immediateWeightNumberForPaperSlider}}"
-        type="number"
-        step="[[step]]"
-        min="[[min]]"
-        max="[[max]]"
-        ></paper-slider>
-      <paper-input
-        id="input"
-        label="weight"
-        no-label-float
-        value="{{_inputWeightStringForPaperInput}}"
-        type="number"
-        step="[[step]]"
-        min="[[min]]"
-        max="[[max]]"
-        ></paper-input>
-    </div>
-    <style>
-      .title {
-        color: var(--paper-grey-800);
-        margin: 0;
-        font-weight: normal;
-        font-size: 14px;
-        margin-bottom: 5px;
-      }
-
-      .smoothing-block {
-        display: flex;
-      }
-
-      paper-slider {
-        margin-left: 12px;
-        --paper-slider-knob-color: var(--tb-orange-strong);
-        --paper-slider-active-color: var(--tb-orange-strong);
-        flex-grow: 2;
-      }
-
-      paper-input {
-        --paper-input-container-focus-color: var(--tb-orange-strong);
-        --paper-input-container-input: {
-          font-size: 14px;
-        };
-        --paper-input-container-label: {
-          font-size: 14px;
-        };
-        width: 60px;
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-smoothing-input",
-
-      properties: {
-        step: Number,
-        max: Number,
-        min: Number,
-
-        weight: {
-          type: Number,
-          value: 0.6,
-          notify: true
-        },
-
-        _immediateWeightNumberForPaperSlider: {
-          type: Number,
-          notify: true,
-          observer: '_immediateWeightNumberForPaperSliderChanged'
-        },
-
-        // Paper input treats values as strings even if you specify them as
-        // numbers.
-        _inputWeightStringForPaperInput: {
-          type: String,
-          notify: true,
-          observer: '_inputWeightStringForPaperInputChanged'
-        }
-      },
-
-      _updateWeight: _.debounce(function(val) {
-        this.weight = val;
-      }, 250),
-
-      _immediateWeightNumberForPaperSliderChanged: function() {
-        this._inputWeightStringForPaperInput =
-            this._immediateWeightNumberForPaperSlider.toString();
-        this._updateWeight.call(this, this._immediateWeightNumberForPaperSlider);
-      },
-
-      _inputWeightStringForPaperInputChanged: function() {
-        if (+this._inputWeightStringForPaperInput < 0) {
-          this._inputWeightStringForPaperInput = '0';
-        }
-        else if (+this._inputWeightStringForPaperInput > 1) {
-          this._inputWeightStringForPaperInput = '1';
-        }
-
-        var d = +this._inputWeightStringForPaperInput;
-        if (!isNaN(d)) {
-          this._updateWeight.call(this, d);
-        }
-      }
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_storage/BUILD b/tensorflow/tensorboard/components/tf_storage/BUILD
deleted file mode 100644
index 197e0ae73d64897d192ce6e1581f2b3a6bbc5508..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_storage/BUILD
+++ /dev/null
@@ -1,36 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_storage",
-    srcs = [
-        "storage.ts",
-        "tf-storage.html",
-    ],
-    path = "/tf-storage",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_globals",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-    ],
-)
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [":tf_storage"],
-    destdir = "tf-storage",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_globals:legacy",
-        "//tensorflow/tensorboard/components/tf_imports_google:lib",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_storage/storage.ts b/tensorflow/tensorboard/components/tf_storage/storage.ts
deleted file mode 100644
index 873bc483a0732de632263f560fe3ee8e52b7ae26..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_storage/storage.ts
+++ /dev/null
@@ -1,400 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {getFakeHash, setFakeHash, TABS, useHash} from '../tf-globals/globals';
-
-
-/* tslint:disable:no-namespace variable-name */
-/**
- * The Storage Module provides storage for URL parameters, and an API for
- * getting and setting TensorBoard's stateful URI.
- *
- * It generates URI components like: events&runPrefix=train*
- * which TensorBoard uses after like localhost:8000/#events&runPrefix=train*
- * to store state in the URI.
- *
- * It also allows saving the values to localStorage for long-term persistence.
- */
-type StringDict = {[key: string]: string};
-
-/**
- * A key that users cannot use, since TensorBoard uses this to store info
- * about the active tab.
- */
-export let TAB = '__tab__';
-
-/**
- * The name of the property for users to set on a Polymer component
- * in order for its stored properties to be stored in the URI unambiguously.
- * (No need to set this if you want multiple instances of the component to
- * share URI state)
- *
- * Example:
- * <my-component disambiguator="0"></my-component>
- *
- * The disambiguator should be set to any unique value so that multiple
- * instances of the component can store properties in URI storage.
- *
- * Because it's hard to dereference this variable in HTML property bindings,
- * it is NOT safe to change the disambiguator string without find+replace
- * across the codebase.
- */
-export let DISAMBIGUATOR = 'disambiguator';
-
-/**
- * Return a string stored in URI or localStorage.
- * Undefined if not found.
- */
-export function getString(key: string, useLocalStorage: boolean): string {
-  if (useLocalStorage) {
-    return window.localStorage.getItem(key);
-  } else {
-    return _componentToDict(_readComponent())[key];
-  }
-}
-
-/**
- * Set a string in URI or localStorage.
- */
-export function setString(
-    key: string, value: string, useLocalStorage: boolean) {
-  if (useLocalStorage) {
-    window.localStorage.setItem(key, value);
-  } else {
-    const items = _componentToDict(_readComponent());
-    items[key] = value;
-    _writeComponent(_dictToComponent(items));
-  }
-}
-
-/**
- * Return a boolean stored in stored in URI or localStorage.
- * Undefined if not found.
- */
-export function getBoolean(key: string, useLocalStorage: boolean): boolean {
-  const item = getString(key, useLocalStorage);
-  return item === 'true' ? true : item === 'false' ? false : undefined;
-}
-
-/**
- * Store a boolean in URI or localStorage.
- */
-export function setBoolean(
-    key: string, value: boolean, useLocalStorage = false) {
-  setString(key, value.toString(), useLocalStorage);
-}
-
-/**
- * Return a number stored in stored in URI or localStorage.
- * Undefined if not found.
- */
-export function getNumber(key: string, useLocalStorage: boolean): number {
-  const item = getString(key, useLocalStorage);
-  return item === undefined ? undefined : +item;
-}
-
-/**
- * Store a number in URI or localStorage.
- */
-export function setNumber(
-    key: string, value: number, useLocalStorage: boolean) {
-  setString(key, '' + value, useLocalStorage);
-}
-
-/**
- * Return an object stored in stored in URI or localStorage.
- * Undefined if not found.
- */
-export function getObject(key: string, useLocalStorage: boolean): {} {
-  const item = getString(key, useLocalStorage);
-  return item === undefined ? undefined : JSON.parse(atob(item));
-}
-
-/**
- * Store an object in URI or localStorage.
- */
-export function setObject(key: string, value: {}, useLocalStorage: boolean) {
-  setString(key, btoa(JSON.stringify(value)), useLocalStorage);
-}
-
-/**
- * Get a unique storage name for a (Polymer component, propertyName) tuple.
- *
- * DISAMBIGUATOR must be set on the component, if other components use the
- * same propertyName.
- */
-export function getURIStorageName(
-    component: {}, propertyName: string): string {
-  const d = component[DISAMBIGUATOR];
-  const components = d == null ? [propertyName] : [d, propertyName];
-  return components.join('.');
-}
-
-/**
- * Return a function that:
- * (1) Initializes a Polymer boolean property with a default value, if its
- *     value is not already set
- * (2) Sets up listener that updates Polymer property on hash change.
- */
-export function getBooleanInitializer(
-    propertyName: string, defaultVal: boolean,
-    useLocalStorage = false): Function {
-  return _getInitializer(
-      getBoolean, propertyName, defaultVal, useLocalStorage);
-}
-
-/**
- * Return a function that:
- * (1) Initializes a Polymer string property with a default value, if its
- *     value is not already set
- * (2) Sets up listener that updates Polymer property on hash change.
- */
-export function getStringInitializer(
-    propertyName: string, defaultVal: string,
-    useLocalStorage = false): Function {
-  return _getInitializer(
-      getString, propertyName, defaultVal, useLocalStorage);
-}
-
-/**
- * Return a function that:
- * (1) Initializes a Polymer number property with a default value, if its
- *     value is not already set
- * (2) Sets up listener that updates Polymer property on hash change.
- */
-export function getNumberInitializer(
-    propertyName: string, defaultVal: number,
-    useLocalStorage = false): Function {
-  return _getInitializer(
-      getNumber, propertyName, defaultVal, useLocalStorage);
-}
-
-/**
- * Return a function that:
- * (1) Initializes a Polymer Object property with a default value, if its
- *     value is not already set
- * (2) Sets up listener that updates Polymer property on hash change.
- *
- * Generates a deep clone of the defaultVal to avoid mutation issues.
- */
-export function getObjectInitializer(
-    propertyName: string, defaultVal: {}, useLocalStorage = false): Function {
-  return _getInitializer(
-      getObject, propertyName, defaultVal, useLocalStorage);
-}
-
-/**
- * Return a function that updates URIStorage when a string property changes.
- */
-export function getBooleanObserver(
-    propertyName: string, defaultVal: boolean,
-    useLocalStorage = false): Function {
-  return _getObserver(
-      getBoolean, setBoolean, propertyName, defaultVal, useLocalStorage);
-}
-
-/**
- * Return a function that updates URIStorage when a string property changes.
- */
-export function getStringObserver(
-    propertyName: string, defaultVal: string,
-    useLocalStorage = false): Function {
-  return _getObserver(
-      getString, setString, propertyName, defaultVal, useLocalStorage);
-}
-
-/**
- * Return a function that updates URIStorage when a number property changes.
- */
-export function getNumberObserver(
-    propertyName: string, defaultVal: number,
-    useLocalStorage = false): Function {
-  return _getObserver(
-      getNumber, setNumber, propertyName, defaultVal, useLocalStorage);
-}
-
-/**
- * Return a function that updates URIStorage when an object property changes.
- * Generates a deep clone of the defaultVal to avoid mutation issues.
- */
-export function getObjectObserver(
-    propertyName: string, defaultVal: {}, useLocalStorage = false): Function {
-  const clone = _.cloneDeep(defaultVal);
-  return _getObserver(
-      getObject, setObject, propertyName, clone, useLocalStorage);
-}
-
-/**
- * Read component from URI (e.g. returns "events&runPrefix=train*").
- */
-function _readComponent(): string {
-  return useHash() ? window.location.hash.slice(1) : getFakeHash();
-}
-
-/**
- * Write component to URI.
- */
-function _writeComponent(component: string) {
-  if (useHash()) {
-    window.location.hash = component;
-  } else {
-    setFakeHash(component);
-  }
-}
-
-/**
- * Convert dictionary of strings into a URI Component.
- * All key value entries get added as key value pairs in the component,
- * with the exception of a key with the TAB value, which if present
- * gets prepended to the URI Component string for backwards compatibility
- * reasons.
- */
-function _dictToComponent(items: StringDict): string {
-  let component = '';
-
-  // Add the tab name e.g. 'events', 'images', 'histograms' as a prefix
-  // for backwards compatbility.
-  if (items[TAB] !== undefined) {
-    component += items[TAB];
-  }
-
-  // Join other strings with &key=value notation
-  const nonTab = _.pairs(items)
-                   .filter((pair) =>  pair[0] !== TAB)
-                   .map((pair) => {
-                     return encodeURIComponent(pair[0]) + '=' +
-                         encodeURIComponent(pair[1]);
-                   })
-                   .join('&');
-
-  return nonTab.length > 0 ? (component + '&' + nonTab) : component;
-}
-
-/**
- * Convert a URI Component into a dictionary of strings.
- * Component should consist of key-value pairs joined by a delimiter
- * with the exception of the tabName.
- * Returns dict consisting of all key-value pairs and
- * dict[TAB] = tabName
- */
-function _componentToDict(component: string): StringDict {
-  const items = {} as StringDict;
-
-  const tokens = component.split('&');
-  tokens.forEach((token) => {
-    const kv = token.split('=');
-    // Special backwards compatibility for URI components like #events
-    if (kv.length === 1 && _.contains(TABS, kv[0])) {
-      items[TAB] = kv[0];
-    } else if (kv.length === 2) {
-      items[decodeURIComponent(kv[0])] = decodeURIComponent(kv[1]);
-    }
-  });
-  return items;
-}
-
-/**
- * Return a function that:
- * (1) Initializes a Polymer property with a default value, if its
- *     value is not already set
- * (2) Sets up listener that updates Polymer property on hash change.
- */
-function _getInitializer<T>(
-    get: (name: string, useLocalStorage: boolean) => T, propertyName: string,
-    defaultVal: T, useLocalStorage): Function {
-  return function() {
-    const URIStorageName = getURIStorageName(this, propertyName);
-    // setComponentValue will be called every time the hash changes, and is
-    // responsible for ensuring that new state in the hash will be propagated
-    // to the component with that property.
-    // It is important that this function does not re-assign needlessly,
-    // to avoid Polymer observer churn.
-    const setComponentValue = () => {
-      const uriValue = get(URIStorageName, false);
-      const currentValue = this[propertyName];
-      // if uriValue is undefined, we will ensure that the property has the
-      // default value
-      if (uriValue === undefined) {
-        let valueToSet: T;
-        // if we are using localStorage, we will set the value to the value
-        // from localStorage. Then, the corresponding observer will proxy
-        // the localStorage value into URI storage.
-        // in this way, localStorage takes precedence over the default val
-        // but not over the URI value.
-        if (useLocalStorage) {
-          const useLocalStorageValue = get(URIStorageName, true);
-          valueToSet = useLocalStorageValue === undefined ?
-              defaultVal :
-              useLocalStorageValue;
-        } else {
-          valueToSet = defaultVal;
-        }
-        if (!_.isEqual(currentValue, valueToSet)) {
-          // If we don't have an explicit URI value, then we need to ensure
-          // the property value is equal to the default value.
-          // We will assign a clone rather than the canonical default, because
-          // the component receiving this property may mutate it, and we need
-          // to keep a pristine copy of the default.
-          this[propertyName] = _.clone(valueToSet);
-        }
-        // In this case, we have an explicit URI value, so we will ensure that
-        // the component has an equivalent value.
-      } else {
-        if (!_.isEqual(uriValue, currentValue)) {
-          this[propertyName] = uriValue;
-        }
-      }
-    };
-    // Set the value on the property.
-    setComponentValue();
-    // Update it when the hashchanges.
-    window.addEventListener('hashchange', setComponentValue);
-  };
-}
-
-/**
- * Return a function that updates URIStorage when a property changes.
- */
-function _getObserver<T>(
-    get: (name: string, useLocalStorage: boolean) => T,
-    set: (name: string, newVal: T, useLocalStorage: boolean) => void,
-    propertyName: string, defaultVal: T, useLocalStorage: boolean): Function {
-  return function() {
-    const URIStorageName = getURIStorageName(this, propertyName);
-    const newVal = this[propertyName];
-    // if this is a localStorage property, we always synchronize the value
-    // in localStorage to match the one currently in the URI.
-    if (useLocalStorage) {
-      set(URIStorageName, newVal, true);
-    }
-    if (!_.isEqual(newVal, get(URIStorageName, false))) {
-      if (_.isEqual(newVal, defaultVal)) {
-        _unsetFromURI(URIStorageName);
-      } else {
-        set(URIStorageName, newVal, false);
-      }
-    }
-  };
-}
-
-/**
- * Delete a key from the URI.
- */
-function _unsetFromURI(key) {
-  const items = _componentToDict(_readComponent());
-  delete items[key];
-  _writeComponent(_dictToComponent(items));
-}
-
diff --git a/tensorflow/tensorboard/components/tf_storage/test/BUILD b/tensorflow/tensorboard/components/tf_storage/test/BUILD
deleted file mode 100644
index 32399ba7cbeff3179736a3d4671eed104f6ef925..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_storage/test/BUILD
+++ /dev/null
@@ -1,30 +0,0 @@
-package(
-    default_testonly = True,
-    default_visibility = ["//tensorflow/tensorboard:internal"],
-)
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "test",
-    srcs = [
-        "storageTests.ts",
-        "tests.html",
-    ],
-    path = "/tf-storage/test",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "//tensorflow/tensorboard/components/tf_imports:web_component_tester",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "//tensorflow/tensorboard/components/tf_storage",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    testonly = 0,
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_storage/test/storageTests.ts b/tensorflow/tensorboard/components/tf_storage/test/storageTests.ts
deleted file mode 100644
index 82dc51f05dade857f1c9cbd09bb6b215e148977a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_storage/test/storageTests.ts
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-import {TAB, getString, getNumber, getObject, setString, setNumber, setObject} from '../storage';
-import {TABS} from '../../tf-globals/globals';
-
-/* tslint:disable:no-namespace */
-describe('URIStorage', () => {
-  it('get/setString', () => {
-    setString('key_a', 'hello', false);
-    setString('key_b', 'there', false);
-    chai.assert.equal('hello', getString('key_a', false));
-    chai.assert.equal('there', getString('key_b', false));
-    chai.assert.equal(null, getString('key_c', false));
-  });
-
-  it('get/setNumber', () => {
-    setNumber('key_a', 12, false);
-    setNumber('key_b', 3.4, false);
-    chai.assert.equal(12, getNumber('key_a', false));
-    chai.assert.equal(3.4, getNumber('key_b', false));
-    chai.assert.equal(null, getNumber('key_c', false));
-  });
-
-  it('get/setObject', () => {
-    const obj = {'foo': 2.3, 'bar': 'barstr'};
-    setObject('key_a', obj, false);
-    chai.assert.deepEqual(obj, getObject('key_a', false));
-  });
-
-  it('get/setWeirdValues', () => {
-    setNumber('key_a', NaN, false);
-    chai.assert.deepEqual(NaN, getNumber('key_a', false));
-
-    setNumber('key_a', +Infinity, false);
-    chai.assert.equal(+Infinity, getNumber('key_a', false));
-
-    setNumber('key_a', -Infinity, false);
-    chai.assert.equal(-Infinity, getNumber('key_a', false));
-
-    setNumber('key_a', 1 / 3, false);
-    chai.assert.equal(1 / 3, getNumber('key_a', false));
-
-    setNumber('key_a', -0, false);
-    chai.assert.equal(-0, getNumber('key_a', false));
-  });
-
-  it('set/getTab', () => {
-    setString(TAB, TABS[0], false);
-    chai.assert.equal(TABS[0], getString(TAB, false));
-  });
-});
-
diff --git a/tensorflow/tensorboard/components/tf_storage/test/tests.html b/tensorflow/tensorboard/components/tf_storage/test/tests.html
deleted file mode 100644
index 4668b119d24eb542553ee5474aa001f1fec7cebc..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_storage/test/tests.html
+++ /dev/null
@@ -1,25 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<meta charset="utf-8">
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<script src="../../web-component-tester/browser.js"></script>
-<link rel="import" href="../../polymer/polymer.html">
-<link rel="import" href="../tf-storage.html">
-<body>
-<script src="storageTests.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/BUILD b/tensorflow/tensorboard/components/tf_tensorboard/BUILD
deleted file mode 100644
index 95fb8b7a882672b623565f50596e91bb5a049533..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/BUILD
+++ /dev/null
@@ -1,65 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-load("//tensorflow/tensorboard/defs:vulcanize.bzl", "tensorboard_html_binary")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_tensorboard",
-    srcs = [
-        "autoReloadBehavior.ts",
-        "style.html",
-        "tf-tensorboard.html",
-    ],
-    path = "/tf-tensorboard",
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/tensorboard/components/tf_audio_dashboard",
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_distribution_dashboard",
-        "//tensorflow/tensorboard/components/tf_globals",
-        "//tensorflow/tensorboard/components/tf_graph_dashboard",
-        "//tensorflow/tensorboard/components/tf_histogram_dashboard",
-        "//tensorflow/tensorboard/components/tf_image_dashboard",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "//tensorflow/tensorboard/components/tf_scalar_dashboard",
-        "//tensorflow/tensorboard/components/tf_storage",
-        "//tensorflow/tensorboard/components/tf_text_dashboard",
-        "//tensorflow/tensorboard/components/vz_projector",
-        "@org_polymer_font_roboto",
-        "@org_polymer_iron_icons",
-        "@org_polymer_paper_button",
-        "@org_polymer_paper_checkbox",
-        "@org_polymer_paper_dialog",
-        "@org_polymer_paper_header_panel",
-        "@org_polymer_paper_icon_button",
-        "@org_polymer_paper_tabs",
-        "@org_polymer_paper_toolbar",
-    ],
-)
-
-ts_web_library(
-    name = "demo",
-    srcs = ["demo.html"],
-    path = "/tf-tensorboard",
-    deps = [
-        ":tf_tensorboard",
-        "//tensorflow/tensorboard/demo:demo_data",
-    ],
-)
-
-tensorboard_html_binary(
-    name = "devserver",
-    testonly = 1,
-    input_path = "/tf-tensorboard/demo.html",
-    output_path = "/index.html",
-    deps = [":demo"],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/autoReloadBehavior.ts b/tensorflow/tensorboard/components/tf_tensorboard/autoReloadBehavior.ts
deleted file mode 100644
index 54df16f5b5dd096ffeeb340db2dea792993acbca..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/autoReloadBehavior.ts
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-export var AUTORELOAD_LOCALSTORAGE_KEY = 'TF.TensorBoard.autoReloadEnabled';
-
-var getAutoReloadFromLocalStorage: () => boolean = () => {
-  var val = window.localStorage.getItem(AUTORELOAD_LOCALSTORAGE_KEY);
-  return val === 'true' || val == null;  // defaults to true
-};
-
-/**
- * @polymerBehavior
- */
-export var AutoReloadBehavior = {
-  properties: {
-    autoReloadEnabled: {
-      type: Boolean,
-      observer: '_autoReloadObserver',
-      value: getAutoReloadFromLocalStorage,
-    },
-    _autoReloadId: {
-      type: Number,
-    },
-    autoReloadIntervalSecs: {
-      type: Number,
-      value: 30,
-    },
-  },
-  detached: function() {
-    window.clearTimeout(this._autoReloadId);
-  },
-  _autoReloadObserver: function(autoReload) {
-    window.localStorage.setItem(AUTORELOAD_LOCALSTORAGE_KEY, autoReload);
-    if (autoReload) {
-      var _this = this;
-      this._autoReloadId = window.setTimeout(
-          this._doAutoReload.bind(this), this.autoReloadIntervalSecs * 1000);
-    } else {
-      window.clearTimeout(this._autoReloadId);
-    }
-  },
-  _doAutoReload: function() {
-    if (this.reload == null) {
-      throw new Error('AutoReloadBehavior requires a reload method');
-    }
-    this.reload();
-    this._autoReloadId = window.setTimeout(
-        this._doAutoReload.bind(this), this.autoReloadIntervalSecs * 1000);
-  }
-};
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/demo.html b/tensorflow/tensorboard/components/tf_tensorboard/demo.html
deleted file mode 100644
index f691f6211bcc516abeeb4d19a22a56bc294799b7..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/demo.html
+++ /dev/null
@@ -1,24 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<meta charset="utf-8">
-<title>TensorBoard Demo</title>
-<link rel="import" href="style.html">
-<link rel="import" href="tf-tensorboard.html">
-<body>
-<tf-tensorboard demo-dir="/data" use-hash></tf-tensorboard>
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/autoReloadTests.ts b/tensorflow/tensorboard/components/tf_tensorboard/test/autoReloadTests.ts
deleted file mode 100644
index b68fd8c94383fad9aa59400b6b387c8ccd9ecfba..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/test/autoReloadTests.ts
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {AUTORELOAD_LOCALSTORAGE_KEY, AutoReloadBehavior} from '../autoReloadBehavior';
-
-declare function fixture(id: string): void;
-
-window.HTMLImports.whenReady(() => {
-  Polymer({
-    is: 'autoreload-test-element',
-    behaviors: [AutoReloadBehavior],
-  });
-
-  describe('autoReload-behavior', function() {
-    let testElement;
-    const ls = window.localStorage;
-    const key = AUTORELOAD_LOCALSTORAGE_KEY;
-    let clock;
-    let callCount: number;
-
-    beforeEach(function() {
-      ls.setItem(key, 'false');  // start it turned off so we can mutate fns
-      testElement = fixture('autoReloadFixture');
-      callCount = 0;
-      testElement.reload = function() { callCount++; };
-    });
-
-    before(function() { clock = sinon.useFakeTimers(); });
-
-    after(function() { clock.restore(); });
-
-    it('reads and writes autoReload state from localStorage', function() {
-      ls.removeItem(key);
-      testElement = fixture('autoReloadFixture');
-      chai.assert.isTrue(
-          testElement.autoReloadEnabled, 'autoReload defaults to true');
-      chai.assert.equal(ls.getItem(key), 'true', 'autoReload setting saved');
-      testElement = fixture('autoReloadFixture');
-      chai.assert.isTrue(
-          testElement.autoReloadEnabled, 'read true from localStorage');
-      testElement.autoReloadEnabled = false;
-      chai.assert.equal(ls.getItem(key), 'false', 'autoReload setting saved');
-      testElement = fixture('autoReloadFixture');
-      chai.assert.isFalse(
-          testElement.autoReloadEnabled, 'read false setting properly');
-      testElement.autoReloadEnabled = true;
-      chai.assert.equal(ls.getItem(key), 'true', 'saved true setting');
-    });
-
-    it('reloads every interval secs when autoReloading', function() {
-      testElement.autoReloadIntervalSecs = 1;
-      testElement.autoReloadEnabled = true;
-      clock.tick(1000);
-      chai.assert.equal(callCount, 1, 'ticking clock triggered call');
-      clock.tick(20 * 1000);
-      chai.assert.equal(callCount, 21, 'ticking clock 20s triggered 20 calls');
-    });
-
-    it('can cancel pending autoReload', function() {
-      testElement.autoReloadIntervalSecs = 10;
-      testElement.autoReloadEnabled = true;
-      clock.tick(5 * 1000);
-      testElement.autoReloadEnabled = false;
-      clock.tick(20 * 1000);
-      chai.assert.equal(callCount, 0, 'callCount is 0');
-    });
-
-    it('throws an error in absence of reload method', function() {
-      testElement.reload = undefined;
-      testElement.autoReloadIntervalSecs = 1;
-      testElement.autoReloadEnabled = true;
-      chai.assert.throws(function() {
-        clock.tick(5000);
-      });
-    });
-  });
-});
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/e2eTests.html b/tensorflow/tensorboard/components/tf_tensorboard/test/e2eTests.html
deleted file mode 100644
index 5efc02ef98abbde1399db1f7d477b5b27593c7f3..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/test/e2eTests.html
+++ /dev/null
@@ -1,31 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-<head>
-  <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-  <script src="../../web-component-tester/browser.js"></script>
-  <link rel="import" href="../../tf-imports/d3.html">
-  <link rel="import" href="../tf-tensorboard.html">
-  <link rel="stylesheet" type="text/css" href="../../../lib/css/global.css">
-</head>
-<body>
-  <tf-tensorboard demo-dir="data/"></tf-tensorboard>
-  <script src="e2eTests.js"></script>
-</body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/e2eTests.ts b/tensorflow/tensorboard/components/tf_tensorboard/test/e2eTests.ts
deleted file mode 100644
index a00027963be163809f1c999fbe39e3b764aab0ee..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/test/e2eTests.ts
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {TABS} from '../../tf-globals/globals';
-
-describe('end-to-end test', () => {
-  window.HTMLImports.whenReady(() => {
-    let tb = d3.select('tf-tensorboard');
-    var tabs = (<any>tb.node()).$.tabs;
-
-    function testTab(tabIndex: number) {
-      it(`selecting ${TABS[tabIndex]} tab`, done => {
-        // Every dashboard emits a rendered event when it is done rendering.
-        tb.on('rendered', () => done());
-        tabs.set('selected', tabIndex);
-      });
-    }
-    // Listen for when the default tab has rendered and test other tabs after.
-    tb.on('rendered', () => {
-      // The default tab already rendered. Test everything else.
-      // If a bug happened while rendering the default tab, the test would
-      // have failed. Re-selecting the default tab and listening for
-      // "rendered" event won't work since the content is not re-stamped.
-      let selected = +tabs.get('selected');
-      for (let i = 0; i < TABS.length; i++) {
-        if (i !== selected) {
-          testTab(i);
-        }
-      }
-    });
-  });
-});
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/fastTabSwitch.html b/tensorflow/tensorboard/components/tf_tensorboard/test/fastTabSwitch.html
deleted file mode 100644
index 88bb6edc4828b4099e892ca323278580aaf6d15e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/test/fastTabSwitch.html
+++ /dev/null
@@ -1,31 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-<head>
-  <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-  <script src="../../web-component-tester/browser.js"></script>
-  <link rel="import" href="../../tf-imports/d3.html">
-  <link rel="import" href="../tf-tensorboard.html">
-  <link rel="stylesheet" type="text/css" href="../../../lib/css/global.css">
-</head>
-<body>
-  <tf-tensorboard demo-dir="data/"></tf-tensorboard>
-  <script src="fastTabSwitch.js"></script>
-</body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/fastTabSwitch.ts b/tensorflow/tensorboard/components/tf_tensorboard/test/fastTabSwitch.ts
deleted file mode 100644
index 905ed4ee4aa30cef37b7d46acecae252edf9ed4a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/test/fastTabSwitch.ts
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {TABS} from '../../tf-globals/globals';
-
-describe('fast tab switch', () => {
-  window.HTMLImports.whenReady(() => {
-    let tb = d3.select('tf-tensorboard');
-    // tslint:disable-next-line:no-any be quiet tsc
-    var tabs = (<any>tb.node()).$.tabs;
-
-    // This test will select the events tab. Once the events tab
-    // renders, will select the graph tab, and immediately select
-    // the images tab wihout waiting for the graph tab to finish
-    // rendering. Finally, it finishes when the images tab
-    // has rendered and no errors were thrown.
-    const eventsTabIndex = TABS.indexOf('events');
-    const imagesTabIndex = TABS.indexOf('images');
-    const graphTabIndex = TABS.indexOf('graphs');
-
-    // Listen for when the events tab rendered.
-    tb.on('rendered', () => {
-      it('switching to graph tab and immediately to images', done => {
-        // Select the graph tab.
-        tabs.set('selected', graphTabIndex);
-        // Interrupt graph rendering by immediately selecting the images tab
-        // and finish when the images tab has rendered.
-        tb.on('rendered', () => done());
-        tabs.set('selected', imagesTabIndex);
-      });
-    });
-    // Select the events tab.
-    tabs.set('selected', eventsTabIndex);
-  });
-});
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/index.html b/tensorflow/tensorboard/components/tf_tensorboard/test/index.html
deleted file mode 100644
index 8806f36fad91d26b39fe299246578aaf5a776c61..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/test/index.html
+++ /dev/null
@@ -1,35 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-<head>
-  <meta charset="utf-8">
-  <script src="../../web-component-tester/browser.js"></script>
-</head>
-<body>
-<script>
-// Run the tests for each main component in tensorboard.
-WCT.loadSuites([
-  'tensorboardTests.html',
-  // TODO: re-enable or remove. b/30163860
-  // 'e2eTests.html',
-  'fastTabSwitch.html'
-]);
-</script>
-</body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/tensorboardTests.html b/tensorflow/tensorboard/components/tf_tensorboard/test/tensorboardTests.html
deleted file mode 100644
index 2122cb79b16bc91a03a5765cfe9867c608752d0c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/test/tensorboardTests.html
+++ /dev/null
@@ -1,44 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-<head>
-  <link rel="import" href="../../polymer/polymer.html">
-  <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-  <script src="../../web-component-tester/browser.js"></script>
-  <link rel="import" href="../tf-tensorboard.html">
-  <link rel="stylesheet" type="text/css" href="../../../lib/css/global.css">
-</head>
-<body>
-  <test-fixture id="tensorboardFixture">
-    <template>
-      <tf-tensorboard>
-        <span id="inject-me">Injected content should be rendered by the element.</span>
-      </tf-tensorboard>
-    </template>
-  </test-fixture>
-
-  <test-fixture id="autoReloadFixture">
-    <template>
-      <autoreload-test-element></autoreload-test-element>
-    </template>
-  </test-fixture>
-  <script src="tensorboardTests.js"></script>
-  <script src="autoReloadTests.js"></script>
-</body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/tensorboardTests.ts b/tensorflow/tensorboard/components/tf_tensorboard/test/tensorboardTests.ts
deleted file mode 100644
index 06ff446f1869b34a592947839b45bd1fbd5de197..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/test/tensorboardTests.ts
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {TABS} from '../../tf-globals/globals';
-
-describe('tf-tensorboard tests', () => {
-  window.HTMLImports.whenReady(() => {
-    let tensorboard: any;
-    beforeEach(function() {
-      tensorboard = fixture('tensorboardFixture');
-      tensorboard.demoDir = 'data';
-      tensorboard.autoReloadEnabled = false;
-    });
-
-    it('specified tabs are correct', function(done) {
-      setTimeout(function() {
-        let tabs = tensorboard.$.tabs.getElementsByTagName('paper-tab');
-        let tabMode = Array.prototype.map.call(tabs, (x) => x.dataMode);
-        chai.assert.deepEqual(tabMode, TABS, 'mode is correct');
-        let tabText =
-            Array.prototype.map.call(tabs, (x) => x.innerText.toLowerCase());
-        chai.assert.deepEqual(tabText, TABS, 'text is correct');
-        done();
-      });
-    });
-
-    it('renders injected content', function() {
-      let injected = tensorboard.querySelector('#inject-me');
-      chai.assert.isNotNull(injected);
-    });
-
-    describe('reloading the selected dashboard', function() {
-      TABS.forEach((name, tabIndex) => {
-        // These tabs do not support reload mode.
-        if (name === 'graphs' || name === 'projections') {
-          return;
-        }
-        it(`${name}: calling reload reloads dashboard`, function(done) {
-          tensorboard.$.tabs.set('selected', tabIndex);
-          setTimeout(function() {
-            let called = false;
-            tensorboard.selectedDashboard().reload = function() {
-              called = true;
-            };
-            tensorboard.reload();
-            chai.assert.isFalse(
-                tensorboard.$$('#reload-button').disabled,
-                'reload button not disabled');
-            chai.assert.isTrue(called, `reload was called`);
-            done();
-          });
-        });
-      });
-    });
-
-    it('reload is disabled for graph dashboard', function(done) {
-      const idx = TABS.indexOf('graphs');
-      chai.assert.notEqual(idx, -1, 'graphs was found');
-      tensorboard.$.tabs.set('selected', idx);
-      setTimeout(
-          function() {  // async so that the queued tab change will happen
-            let called = false;
-            tensorboard.selectedDashboard().reload = function() {
-              called = true;
-            };
-            tensorboard.reload();
-            chai.assert.isTrue(
-                tensorboard.$$('#reload-button').disabled,
-                'reload button disabled');
-            chai.assert.isFalse(called, `reload was not called`);
-            done();
-          });
-    });
-
-    describe('top right global icons', function() {
-      it('Clicking the reload button will call reload', function() {
-        let called = false;
-        tensorboard.reload = function() { called = true; };
-        tensorboard.$$('#reload-button').click();
-        chai.assert.isTrue(called);
-      });
-
-      it('settings pane is hidden', function() {
-        chai.assert.equal(tensorboard.$.settings.style['display'], 'none');
-      });
-
-      it('settings icon button opens the settings pane', function(done) {
-        tensorboard.$$('#settings-button').click();
-        // This test is a little hacky since we depend on polymer's
-        // async behavior, which is difficult to predict.
-
-        // keep checking until the panel is visible. error with a timeout if it
-        // is broken.
-        function verify() {
-          if (tensorboard.$.settings.style['display'] !== 'none') {
-            done();
-          } else {
-            setTimeout(verify, 3);  // wait and see if it becomes true
-          }
-        }
-        verify();
-      });
-
-      it('Autoreload checkbox toggle works', function() {
-        let checkbox = tensorboard.$$('#auto-reload-checkbox');
-        chai.assert.equal(checkbox.checked, tensorboard.autoReloadEnabled);
-        let oldValue = checkbox.checked;
-        checkbox.click();
-        chai.assert.notEqual(oldValue, checkbox.checked);
-        chai.assert.equal(checkbox.checked, tensorboard.autoReloadEnabled);
-      });
-
-      it('Autoreload checkbox contains correct interval info', function() {
-        let checkbox = tensorboard.$$('#auto-reload-checkbox');
-        let timeInSeconds = tensorboard.autoReloadIntervalSecs + 's';
-        chai.assert.include(checkbox.innerText, timeInSeconds);
-      });
-    });
-  });
-});
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/tf-tensorboard.html b/tensorflow/tensorboard/components/tf_tensorboard/tf-tensorboard.html
deleted file mode 100644
index 26b742996aa928b40f441f87727b7b43e51cc5cd..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/tf-tensorboard.html
+++ /dev/null
@@ -1,361 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../iron-icons/iron-icons.html">
-<link rel="import" href="../paper-tabs/paper-tabs.html">
-<link rel="import" href="../paper-dialog/paper-dialog.html">
-<link rel="import" href="../paper-checkbox/paper-checkbox.html">
-<link rel="import" href="../paper-toolbar/paper-toolbar.html">
-<link rel="import" href="../paper-button/paper-button.html">
-<link rel="import" href="../paper-icon-button/paper-icon-button.html">
-<link rel="import" href="../paper-header-panel/paper-header-panel.html">
-<link rel="import" href="../tf-globals/tf-globals.html">
-<link rel="import" href="../tf-scalar-dashboard/tf-scalar-dashboard.html">
-<link rel="import" href="../tf-distribution-dashboard/tf-distribution-dashboard.html">
-<link rel="import" href="../tf-histogram-dashboard/tf-histogram-dashboard.html">
-<link rel="import" href="../tf-image-dashboard/tf-image-dashboard.html">
-<link rel="import" href="../tf-audio-dashboard/tf-audio-dashboard.html">
-<link rel="import" href="../tf-graph-dashboard/tf-graph-dashboard.html">
-<link rel="import" href="../tf-text-dashboard/tf-text-dashboard.html">
-<link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
-<link rel="import" href="../tf-backend/tf-backend.html">
-<link rel="import" href="../tf-storage/tf-storage.html">
-<link rel="import" href="../vz-projector/vz-projector-dashboard.html">
-
-<!--
-tf-tensorboard is the frontend entry point for TensorBoard.
-
-It implements a toolbar (via paper-header-panel and paper-toolbar) that
-allows the user to toggle between various dashboards.
--->
-<dom-module id="tf-tensorboard">
-  <template>
-    <paper-dialog with-backdrop id="settings">
-      <h2>Settings</h2>
-      <paper-checkbox id="auto-reload-checkbox" checked="{{autoReloadEnabled}}">
-        Reload data every <span>[[autoReloadIntervalSecs]]</span>s.
-      </paper-checkbox>
-    </paper-dialog>
-    <paper-header-panel>
-      <paper-toolbar id="toolbar">
-        <div id="toolbar-content">
-          <div class="toolbar-title">TensorBoard</div>
-          <paper-tabs selected="{{modeIndex}}" noink class="tabs" id="tabs">
-            <template is="dom-repeat" items="[[tabs]]">
-              <template is="dom-if" if="[[_isTabEnabled(item)]]">
-                <paper-tab data-mode="[[item]]">[[item]]</paper-tab>
-              </template>
-            </template>
-          </paper-tabs>
-          <div class="global-actions">
-            <paper-icon-button
-              icon="refresh"
-              on-tap="reload"
-              disabled$="[[_isReloadDisabled(mode)]]"
-              id="reload-button"
-            ></paper-icon-button>
-            <paper-icon-button
-              icon="settings"
-              on-tap="openSettings"
-              id="settings-button"
-            ></paper-icon-button>
-            <a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/README.md" tabindex="-1">
-              <paper-icon-button icon="help-outline"></paper-icon-button>
-            </a>
-          </div>
-        </div>
-      </paper-toolbar>
-
-      <div id="content" class="fit">
-        <content id="injected-overview"></content>
-
-        <template is="dom-if" if="[[_modeIsScalars(mode)]]">
-          <tf-scalar-dashboard
-            id="scalars"
-            backend="[[_backend]]"
-          ></tf-scalar-dashboard>
-        </template>
-
-        <template is="dom-if" if="[[_modeIsImages(mode)]]">
-          <tf-image-dashboard
-            id="images"
-            backend="[[_backend]]"
-          ></tf-image-dashboard>
-        </template>
-
-        <template is="dom-if" if="[[_modeIsAudio(mode)]]">
-          <tf-audio-dashboard
-            id="audio"
-            backend="[[_backend]]"
-          ></tf-audio-dashboard>
-        </template>
-
-        <template is="dom-if" if="[[_modeIsGraphs(mode)]]">
-          <tf-graph-dashboard
-            id="graphs"
-            backend="[[_backend]]"
-            debugger-data-enabled="[[_debuggerDataEnabled]]"
-          ></tf-graph-dashboard>
-        </template>
-
-        <template is="dom-if" if="[[_modeIsDistributions(mode)]]">
-          <tf-distribution-dashboard
-            id="distributions"
-            backend="[[_backend]]"
-          ></tf-distribution-dashboard>
-        </template>
-
-        <template is="dom-if" if="[[_modeIsHistograms(mode)]]">
-          <tf-histogram-dashboard
-            id="histograms"
-            backend="[[_backend]]"
-          ></tf-histogram-dashboard>
-        </template>
-
-        <template is="dom-if" if="[[_modeIsEmbeddings(mode)]]">
-          <vz-projector-dashboard
-            id="projector"
-            route-prefix="/data/plugin/projector">
-          </vz-projector-dashboard>
-        </template>
-
-        <template is="dom-if" if="[[_modeIsText(mode)]]">
-          <tf-text-dashboard
-            id="text"
-            backend="[[_backend]]">
-          </tf-text-dashboard>
-        </template>
-      </div>
-    </paper-header-panel>
-
-    <style>
-      :host {
-        height: 100%;
-        display: block;
-        background-color: var(--paper-grey-100);
-      }
-
-      #toolbar {
-        background-color: var(--tb-toolbar-background-color, --tb-orange-strong);
-        -webkit-font-smoothing: antialiased;
-      }
-
-      .toolbar-title {
-        font-size: 20px;
-        margin-left: 10px;
-        text-rendering: optimizeLegibility;
-        letter-spacing: -0.025em;
-        font-weight: 500;
-        flex-grow: 2;
-        display: var(--tb-toolbar-title-display, block);
-      }
-
-      .tabs {
-        flex-grow: 1;
-        text-transform: uppercase;
-        height: 100%;
-      }
-
-      paper-tabs {
-        --paper-tabs-selection-bar-color: white;
-      }
-
-      .global-actions {
-        flex-grow: 2;
-        display: inline-flex; /* Ensure that icons stay aligned */
-        justify-content: flex-end;
-        text-align: right;
-        color: white;
-      }
-
-      .global-actions a {
-        color: white;
-      }
-
-      #toolbar-content {
-        width: 100%;
-        height: 100%;
-        display: flex;
-        flex-direction: row;
-        justify-content: space-between;
-        align-items: center;
-      }
-
-      #content {
-        height: 100%;
-      }
-
-      [disabled] {
-        opacity: 0.2;
-        color: white;
-      }
-
-    </style>
-  </template>
-  <script src="autoReloadBehavior.js"></script>
-  <script>
-    import {AutoReloadBehavior} from "./autoReloadBehavior";
-    import {Backend} from "../tf-backend/backend";
-    import {TABS, setUseHash} from "../tf-globals/globals";
-    import {getString, setString, TAB} from "../tf-storage/storage";
-    import {setRouter, createRouter} from "../tf-backend/router";
-    import {fetchRuns} from "../tf-backend/runsStore";
-
-    Polymer({
-      is: "tf-tensorboard",
-      behaviors: [AutoReloadBehavior],
-      properties: {
-        router: {
-          type: Object,  // only to trigger an observer
-          observer: '_updateRouter',
-        },
-        _backend: {
-          type: Object,
-          computed: "_makeBackend(demoDir)",
-        },
-        _debuggerDataEnabled: {
-          type: Boolean,
-          value: function() {
-            // For now, Tensorboard only shows debugger data if the debugger_data GET param is set
-            // to enabled.
-            let match = window.location.href.match(/[&\?]debugger_data=enabled/);
-            return match && match.length == 1;
-          },
-        },
-        // Which tab is selected (scalars, graph, images etc).
-        mode: {
-          type: String,
-          computed: '_getModeFromIndex(modeIndex)',
-          notify: true,
-        },
-        tabs: {
-          type: Array,
-          readOnly: true,
-          value: TABS,
-        },
-        // If this is set to a string, TensorBoard will switch to "demo mode"
-        // and attempt to load serialized json data from that directory. You can
-        // generate conformant json using
-        // tensorboard/scripts/serialize_tensorboard.py
-        demoDir: {
-          type: String,
-          value: null,
-        },
-        // Set this to true to store state in URI hash. Should be true for all non-test purposes.
-        useHash: {
-          type: Boolean,
-          value: false,
-        },
-        disabledTabs: String,
-      },
-      _isTabEnabled: function(tab) {
-        if (this.disabledTabs != null &&
-            this.disabledTabs.split(',').indexOf(tab) >= 0) {
-          return false;
-        }
-        return true;
-      },
-      _getModeFromIndex: function(modeIndex) {
-        var mode = this.tabs[modeIndex];
-        setString(TAB, mode);
-        return mode;
-      },
-      _makeBackend: function(demoDir) {
-        // If the user has provided a router, we'll always use that.
-        // Otherwise, if the user has provided a demoDir, we'll use that
-        // to create a router.
-        if (demoDir != null && this.router == null) {
-          var router = createRouter(demoDir, true);
-          setRouter(router);
-        }
-        return new Backend();
-      },
-      _isReloadDisabled: function(mode) {
-        return !this._debuggerDataEnabled && this._modeIsGraphs(mode);
-      },
-      _modeIsScalars: function(mode) {
-        return mode === "scalars";
-      },
-      _modeIsImages: function(mode) {
-        return mode === "images";
-      },
-      _modeIsAudio: function(mode) {
-        return mode === "audio";
-      },
-      _modeIsGraphs: function(mode) {
-        return mode === "graphs";
-      },
-      _modeIsEmbeddings: function(mode) {
-        return mode === "embeddings";
-      },
-      _modeIsDistributions: function(mode) {
-        return mode === "distributions";
-      },
-      _modeIsHistograms: function(mode) {
-        return mode === "histograms";
-      },
-      _modeIsText: function(mode) {
-        return mode === "text";
-      },
-      selectedDashboard: function() {
-        var dashboard = this.$$("#" + this.mode);
-        if (dashboard == null) {
-          throw new Error(`Unable to find dashboard for mode: ${this.mode}`);
-        }
-        return dashboard;
-      },
-      ready: function() {
-        setUseHash(this.useHash);
-
-        this._getModeFromHash();
-        window.addEventListener('hashchange', function() {
-          this._getModeFromHash();
-        }.bind(this));
-        fetchRuns();
-      },
-      _getModeFromHash: function() {
-        var tabName = getString(TAB);
-        var modeIndex = this.tabs.indexOf(tabName);
-        if (modeIndex == -1 && this.modeIndex == null) {
-          // Select the first tab as default.
-          this.set('modeIndex', 0);
-        }
-        if (modeIndex != -1 && modeIndex != this.modeIndex) {
-          this.set('modeIndex', modeIndex);
-        }
-      },
-      _updateRouter: function(router) {
-        setRouter(router);
-      },
-      reload: function() {
-        if (this._modeIsEmbeddings(this.mode)) {
-          return;
-        }
-        if (!this._debuggerDataEnabled && this._modeIsGraphs(this.mode)) {
-          return;
-        }
-        fetchRuns().then(function() {
-          this.selectedDashboard().reload();
-        }.bind(this));
-      },
-      openSettings: function() {
-        this.$.settings.open();
-      },
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/BUILD b/tensorflow/tensorboard/components/tf_text_dashboard/BUILD
deleted file mode 100644
index bed551aedfc1c90ca2a32eb16565798930364de6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_text_dashboard/BUILD
+++ /dev/null
@@ -1,45 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_text_dashboard",
-    srcs = [
-        "tf-text-dashboard.html",
-        "tf-text-loader.html",
-    ],
-    path = "/tf-text-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_color_scale",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "@org_polymer_paper_dialog",
-        "@org_polymer_paper_icon_button",
-        "@org_polymer_paper_material",
-        "@org_polymer_paper_slider",
-        "@org_polymer_paper_spinner",
-    ],
-)
-
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"] + glob(["data/**"]),
-    path = "/tf-text-dashboard",
-    deps = [
-        ":tf_text_dashboard",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/data/logdir b/tensorflow/tensorboard/components/tf_text_dashboard/data/logdir
deleted file mode 100644
index c7d82022cc061502c5991a22e72c214918a9f87b..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_text_dashboard/data/logdir
+++ /dev/null
@@ -1 +0,0 @@
-{"logdir": "/some/fake/logdir"}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/data/runs.json b/tensorflow/tensorboard/components/tf_text_dashboard/data/runs.json
deleted file mode 100644
index aea7de5f91725ab58e9770b9b6fb60ad672fada0..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_text_dashboard/data/runs.json
+++ /dev/null
@@ -1 +0,0 @@
-{"fry": ["message", "markdown"], "leela": ["message"]}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/data/text_run_fry_tag_markdown.json b/tensorflow/tensorboard/components/tf_text_dashboard/data/text_run_fry_tag_markdown.json
deleted file mode 100644
index 94183ae13d1be1f25abf89572841be0db5d1dfe1..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_text_dashboard/data/text_run_fry_tag_markdown.json
+++ /dev/null
@@ -1,32 +0,0 @@
-[
-  {
-    "wall_time": 1489715207.593146,
-    "step": 0,
-    "text": "<p><em>Italics1</em> <em>Italics2</em> <strong>bold1</strong> <strong>bold2</strong></p>"
-  },
-  {
-    "wall_time": 1489715207.593801,
-    "step": 1,
-    "text": "<ol>\n<li>List item one.</li>\n<li>List item two.</li>\n<li>Sublist</li>\n<li>Sublist2</li>\n<li>List continues.</li>\n</ol>"
-  },
-  {
-    "wall_time": 1489715207.594842,
-    "step": 2,
-    "text": "<table>\n<thead>\n<tr>\n<th>An</th>\n<th>Example</th>\n<th>Table</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>A</td>\n<td>B</td>\n<td>C</td>\n</tr>\n<tr>\n<td>1</td>\n<td>2</td>\n<td>3</td>\n</tr>\n</tbody>\n</table>"
-  },
-  {
-    "wall_time": 1489715207.595761,
-    "step": 3,
-    "text": "<p>hello <a><em>you</em></a></p>"
-  },
-  {
-    "wall_time": 1489715207.595761,
-    "step": 4,
-    "text": "<p><a href=\"http://tensorflow.org\">TensorFlow</a></p>"
-  },
-  {
-    "wall_time": 1489715207.595761,
-    "step": 530234352,
-    "text": "&lt;script&gt;alert('xss')&lt;/script&gt;"
-  }
-]
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/data/text_run_fry_tag_message.json b/tensorflow/tensorboard/components/tf_text_dashboard/data/text_run_fry_tag_message.json
deleted file mode 100644
index e8cc006c0d0223795d646bf5245ae56e54329fa0..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_text_dashboard/data/text_run_fry_tag_message.json
+++ /dev/null
@@ -1,22 +0,0 @@
-[
-  {
-    "wall_time": 1489715207.593146,
-    "step": 0,
-    "text": "fry loves garnet"
-  },
-  {
-    "wall_time": 1489715207.593801,
-    "step": 1,
-    "text": "fry loves amethyst"
-  },
-  {
-    "wall_time": 1489715207.594842,
-    "step": 2,
-    "text": "fry loves pearl"
-  },
-  {
-    "wall_time": 1489715207.595761,
-    "step": 3,
-    "text": "fry loves steven"
-  }
-]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/data/text_run_leela_tag_message.json b/tensorflow/tensorboard/components/tf_text_dashboard/data/text_run_leela_tag_message.json
deleted file mode 100644
index 5a6d2598937b4e16b5420cac9423cfdd8b16ff48..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_text_dashboard/data/text_run_leela_tag_message.json
+++ /dev/null
@@ -1,22 +0,0 @@
-[
-  {
-    "step": 0,
-    "wall_time": 1489715207.607792,
-    "text": "leela loves garnet and feels strongly about various issues of the day including the two-cent titanium tax and whether nixon's head contributes to greenhouse gas emissions"
-  },
-  {
-    "step": 1,
-    "wall_time": 1489715207.609011,
-    "text": "leela loves amethyst"
-  },
-  {
-    "step": 2,
-    "wall_time": 1489715207.610028,
-    "text": "leela loves pearl"
-  },
-  {
-    "step": 3,
-    "wall_time": 1489715207.611142,
-    "text": "leela loves someverylongwordwithoutanybreaksorspacessowecanseehowthatishandledbythefrontend"
-  }
-]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/index.html b/tensorflow/tensorboard/components/tf_text_dashboard/index.html
deleted file mode 100644
index 55ec4d79cf9b7022a84df87767be97ef4e431854..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_text_dashboard/index.html
+++ /dev/null
@@ -1,74 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-  <head>
-    <script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
-    <link rel="import" href="../iron-demo-helpers/demo-snippet.html">
-    <link rel="import" href="tf-text-dashboard.html">
-    <title>text Dashboard Demo</title>
-    <style>
-      #container{
-        height: 800px;
-        border: 2px solid grey;
-      }
-      html, body {
-        margin: 0;
-        padding: 0;
-        height: 100%;
-        font-family: "RobotoDraft","Roboto",sans-serif;
-      }
-    </style>
-  </head>
-  <body>
-    <demo-snippet>
-      <template>
-        <dom-module id="text-dash-demo">
-          <template>
-            <tf-text-dashboard id="demo" backend="[[backend]]">
-            </tf-text-dashboard>
-          </template>
-          <script>
-            import * as backend_backend from '../tf-backend/backend';
-            import {createRouter, setRouter} from '../tf-backend/router';
-
-            Polymer({
-              is: "text-dash-demo",
-              properties: {
-                backend: {
-                  type: Object,
-                  value: function() {
-                    return new backend_backend.Backend();
-                  },
-                },
-              },
-              created: function() {
-                var path = "data";
-                var router = createRouter(path, true);
-                setRouter(router);
-              },
-            });
-          </script>
-        </dom-module>
-        <div id="container">
-          <text-dash-demo></text-dash-demo>
-        </div>
-      </template>
-    </demo-snippet>
-  </body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/tf-text-dashboard.html b/tensorflow/tensorboard/components/tf_text_dashboard/tf-text-dashboard.html
deleted file mode 100644
index 9b4fd3239c924278fe4d42e50e4b7fe84ae8f102..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_text_dashboard/tf-text-dashboard.html
+++ /dev/null
@@ -1,113 +0,0 @@
-<!--
-@license
-Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-dialog/paper-dialog.html">
-<link rel="import" href="../paper-icon-button/paper-icon-button.html">
-<link rel="import" href="../tf-backend/tf-backend.html">
-<link rel="import" href="../tf-color-scale/tf-color-scale.html">
-<link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
-<link rel="import" href="../tf-dashboard-common/tf-panes-helper.html">
-<link rel="import" href="../tf-dashboard-common/tf-sidebar-helper.html">
-<link rel="import" href="tf-text-loader.html">
-
-<!--
-tf-text-dashboard displays a dashboard that loads texts from a TensorFlow run.
--->
-<dom-module id="tf-text-dashboard">
-  <template>
-    <paper-dialog with-backdrop id="actual-text-size-dialog"></paper-dialog>
-    <div id="plumbing">
-      <tf-color-scale
-        id="colorScale"
-        runs="[[runs]]"
-        out-color-scale="{{_colorScale}}"
-        ></tf-color-scale>
-    </div>
-
-    <tf-dashboard-layout>
-      <div class="sidebar">
-        <tf-sidebar-helper
-          backend="[[backend]]"
-          categories="{{_categories}}"
-          color-scale="[[_colorScale]]"
-          run2tag="[[run2tag]]"
-          runs="[[runs]]"
-          selected-runs="{{_selectedRuns}}"
-          >
-        </tf-sidebar-helper>
-      </div>
-      <div class="center">
-        <tf-panes-helper
-          categories="[[_categories]]"
-          color-scale="[[_colorScale]]"
-          data-type="[[dataType]]"
-          data-provider="[[dataProvider]]"
-          data-not-found="[[dataNotFound]]"
-          run2tag="[[run2tag]]"
-          selected-runs="[[_selectedRuns]]"
-          repeat-for-runs
-          >
-          <template>
-            <tf-text-loader color-scale="[[_colorScale]]"></tf-text-loader>
-          </template>
-        </tf-panes-helper>
-      </div>
-    </tf-dashboard-layout>
-    <style include="dashboard-style"></style>
-    <style>
-      tf-panes-helper {
-        --card-width: 100%;
-        --card-height: auto;
-        --card-expanded-width: 100%;
-        --card-expanded-height: 1000px;
-        --card-padding: 0 5px 5px 5px;
-        --show-expand-button: none;
-      }
-
-    </style>
-  </template>
-  <script>
-    import {DashboardBehavior} from "../tf-dashboard-common/dashboard-behavior";
-    import {ReloadBehavior} from "../tf-dashboard-common/reload-behavior";
-    import {BackendBehavior} from "../tf-backend/behavior";
-
-    Polymer({
-      is: "tf-text-dashboard",
-      factoryImpl: function(backend) {
-        this.backend = backend;
-      },
-      properties: {
-        backend: Object,
-        dataType: {
-          type: String,
-          value: "text"
-        },
-      },
-      behaviors: [
-        DashboardBehavior("text"),
-        ReloadBehavior("tf-chart-scaffold"),
-        BackendBehavior,
-      ],
-      attached: function() {
-        this.async(function() {
-          this.fire("rendered");
-        });
-      },
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/tf-text-loader.html b/tensorflow/tensorboard/components/tf_text_dashboard/tf-text-loader.html
deleted file mode 100644
index 374e0478dd19d6cd667e293bb5acca487de2cad8..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_text_dashboard/tf-text-loader.html
+++ /dev/null
@@ -1,143 +0,0 @@
-<!--
-@license
-Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-material/paper-material.html">
-<link rel="import" href="../tf-dashboard-common/scrollbar-style.html">
-<link rel="import" href="../tf-imports/d3.html">
-
-<!--
-tf-text-loader displays markdown text data from the Text plugin.
--->
-
-<style>
-  tf-text-loader p {
-    margin: 0.3em 0;
-  }
-
-  tf-text-loader table {
-    border-collapse: collapse;
-  }
-
-  tf-text-loader table th {
-    font-weight: 600;
-  }
-
-  tf-text-loader table th,
-  tf-text-loader table td {
-    padding: 6px 13px;
-    border: 1px solid #dfe2e5;
-  }
-
-  tf-text-loader table tr {
-    background-color: #fff;
-    border-top: 1px solid #c6cbd1;
-  }
-
-</style>
-<dom-module id="tf-text-loader">
-
-  <!-- Set the innerHTML with the textual content, so we can render the
-   html generated by our markdown parser. Note this content is always
-   sanitized by the backend, so xss attacks are not possible.
-  -->
-  <template>
-    <style include="scrollbar-style"></style>
-    <paper-material elevation="1" id="outer" class="container scrollbar">
-      <template id="repeater" is="dom-repeat" items="[[_texts]]">
-      <paper-material elevation="1" class="step-container">
-        step <span class="step-value">[[_numfmt(item.step)]]</span>
-      </paper-material>
-      <paper-material elevation="1" inner-h-t-m-l="[[item.text]]" class="text">
-        </paper-material>
-      </template>
-    </paper-material>
-
-
-    <style>
-      #outer {
-        display: block;
-        overflow: auto;
-        max-height: 500px;
-        position: relative;
-        border-radius: 3px;
-        border: 2px solid black;
-      }
-      .text {
-        margin: 0 10px 10px 10px;
-        border-radius: 0 3px 3px 3px;
-        background-color: white;
-        padding: 5px;
-        word-break: break-word;
-      }
-      .step-container {
-        border-left: 1px solid #ccc;
-        border-right: 1px solid #ccc;
-        border-top: 1px solid #ccc;
-        border-radius: 3px 3px 0 0;
-        font-style: italic;
-        margin-top: 10px;
-        background-color: var(--tb-ui-light-accent);
-        display: inline-block;
-        margin-left: 9px;
-        padding: 3px;
-        font-size: 12px;
-      }
-
-    </style>
-
-  </template>
-  <script>
-    Polymer({
-      is: "tf-text-loader",
-      properties: {
-        colorScale: Object,
-        run: String,
-        // This is an array of Tensorboard Text&Datum objects (See backend.ts for details). The
-        // properties of objects in this array are
-        // {
-        //   wall_time: Date,
-        //   step: number,
-        //   text: string,
-        // }
-        // they are ordered from most recent to oldest
-        _texts: {
-          type: Array,
-          value: [],
-        },
-
-      },
-      redraw: function() {
-        // Other dashboards logic requires a redraw method to be defined.
-      },
-      setVisibleSeries: function(runs) {
-        // Do nothing.
-      },
-      setSeriesData: function(run, texts) {
-        this.set("run", run);
-        this.set("_texts", texts.reverse());
-
-        // Update the border color based on the run.
-        var color = this.colorScale.scale(run);
-        this.$$("#outer").style.borderColor = color;
-      },
-      _numfmt: function(n) {
-        return d3.format(",")(n);
-      }
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_trace_viewer/BUILD b/tensorflow/tensorboard/components/tf_trace_viewer/BUILD
deleted file mode 100644
index 9f582329f1d964d40f680cb825e89bfe41cf82e2..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_trace_viewer/BUILD
+++ /dev/null
@@ -1,30 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "tf_trace_viewer",
-    srcs = [
-        "tf-trace-viewer.html",
-        "@org_chromium_catapult_vulcanized_trace_viewer//:trace_viewer_full.html",
-    ],
-    path = "/tf-trace-viewer",
-)
-
-ts_web_library(
-    name = "demo",
-    srcs = ["demo.html"],
-    path = "/tf-trace-viewer",
-    deps = [
-        ":tf_trace_viewer",
-        "//tensorflow/tensorboard/components/tf_trace_viewer/data",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_trace_viewer/data/BUILD b/tensorflow/tensorboard/components/tf_trace_viewer/data/BUILD
deleted file mode 100644
index c295d38258f3cf2eae2d90bfa86a8e8a6201a5db..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_trace_viewer/data/BUILD
+++ /dev/null
@@ -1,17 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-web_library(
-    name = "data",
-    srcs = glob(["*.json"]),
-    path = "/tf-trace-viewer/data/plugin/profile",
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_trace_viewer/data/trace.json b/tensorflow/tensorboard/components/tf_trace_viewer/data/trace.json
deleted file mode 100644
index e1d57394e3594b68f73d3e5be56753d32503d80b..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_trace_viewer/data/trace.json
+++ /dev/null
@@ -1,105 +0,0 @@
-{
-  "traceEvents": [
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 826, "ph": "C",
-        "name": "counter", "args": {"value": 10}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 826, "ph": "B",
-        "name": "A long name that doesnt fit but is exceedingly informative",
-        "args": {"name_false": false, "value_true": true}},
-    {"cat": "PERF", "pid": 22630, "ts": 835, "ph": "I", "s": "p",
-        "name": "ProcessWideEvent1", "args": {}},
-
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 827, "ph": "B",
-        "name": "Asub with a name that wont fit", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 828, "ph": "E",
-        "name": "Asub", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 829, "ph": "B",
-        "name": "Asub", "args": {}},
-    {"cat": "PREF", "pid": 22630, "tid": 22630, "dur": 15, "ts": 820, "ph": "X",
-        "name": "Long X type", "args": {}, "sf": 7, "esf": 8},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 832, "ph": "E",
-        "name": "Asub", "args": {}},
-    {"cat": "PREF", "pid": 22630, "tid": 22630, "dur": 2, "ts": 818, "ph": "X",
-        "name": "X1", "args": {}},
-    {"cat": "PREF", "pid": 22630, "tid": 22630, "dur": 2, "ts": 818, "ph": "X",
-        "name": "X same ts and dur as X1", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 832, "ph": "C",
-        "name": "counter", "args": {"value": 1}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 833, "ph": "E",
-        "name": "", "args": {}},
-
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 835, "ph": "I",
-        "name": "ThreadLevelI1", "args": {}},
-
-    {"cat": "PERF", "ts": 880, "ph": "I", "s": "g", "name": "GlobalEvent1",
-        "args": {}},
-
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 837, "ph": "I",
-        "name": "ThreadLevelI2", "args": {}},
-
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 839, "ph": "C",
-        "name": "counter", "args": {"value": 5}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 840, "ph": "B",
-        "name": "A not as long a name", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 848, "ph": "E",
-        "name": "A not as long a name", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 848, "ph": "C",
-        "name": "counter", "args": {"value": 1}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 854, "ph": "C",
-        "name": "counter", "args": {"value": 10}},
-
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 850, "ph": "B",
-        "name": "B", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22630, "ts": 854, "ph": "E",
-        "name": "B", "args": {}},
-
-    {"cat": "PERF", "pid": 22630, "tid": 22631, "ts": 827, "ph": "B",
-        "name": "A", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22631, "ts": 835, "ph": "I",
-        "name": "ThreadLevelImmediate Three", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22631, "ts": 845, "ph": "I",
-        "name": "ThreadLevelImmediate4", "args": {}},
-    {"cat": "PERF", "pid": 22630, "tid": 22631, "ts": 854, "ph": "E",
-        "name": "A", "args": {}},
-
-    {"cat": "PREF", "pid": 22630, "tid": 22630, "ts": 860, "ph": "B",
-        "name": "B/E over X", "args": {}},
-    {"cat": "PREF", "pid": 22630, "tid": 22630, "dur": 10, "ts": 860, "ph": "X",
-        "name": "X", "args": {}},
-    {"cat": "PREF", "pid": 22630, "tid": 22630, "ts": 860, "ph": "B",
-        "name": "B/E under X", "args": {}},
-    {"cat": "PREF", "pid": 22630, "tid": 22630, "ts": 870, "ph": "E",
-        "name": "B/E under X", "args": {}},
-    {"cat": "PREF", "pid": 22630, "tid": 22630, "ts": 870, "ph": "E",
-        "name": "B/E over X", "args": {}},
-
-    {"cat": "SAMPLE", "pid": 22630, "tid": 22631, "ts": 870, "ph": "P",
-        "name": "SampleA", "args": {}},
-    {"cat": "SAMPLE", "pid": 22630, "tid": 22631, "ts": 875, "ph": "P",
-        "name": "SampleB", "args": {}},
-    {"cat": "SAMPLE", "pid": 22630, "tid": 22631, "ts": 878, "ph": "P",
-        "name": "SampleC", "args": {}, "sf": 8},
-
-    {"cat": "__metadata", "pid": 22630, "tid": 22630, "ts": 0, "ph": "M",
-        "name": "thread_name", "args": {"name": "threadA"}},
-    {"cat": "__metadata", "pid": 22630, "tid": 22631, "ts": 0, "ph": "M",
-        "name": "thread_name", "args": {"name": "threadB"}},
-    {"cat": "__metadata", "pid": 22630, "tid": 22632, "ts": 0, "ph": "M",
-        "name": "thread_name", "args": {"name": "threadC"}}
-  ],
-  "stackFrames": {
-    "1": {
-      "category": "m1",
-      "name": "main"
-    },
-    "7": {
-      "category": "m2",
-      "name": "frame7",
-      "parent": "1"
-    },
-    "8": {
-      "category": "m2",
-      "name": "frame8",
-      "parent": "1"
-    }
-  }
-}
diff --git a/tensorflow/tensorboard/components/tf_trace_viewer/demo.html b/tensorflow/tensorboard/components/tf_trace_viewer/demo.html
deleted file mode 100644
index dd0029e96794ad9b641685ee4667130d265579f0..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_trace_viewer/demo.html
+++ /dev/null
@@ -1,30 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="tf-trace-viewer.html">
-<title>Trace Viewer Demo</title>
-<style>
-  #container{
-    height: 800px;
-    border: 2px solid grey;
-  }
-</style>
-<div id="container">
-  <tf-trace-viewer trace-data-url="data/plugin/profile/trace.json">
-  </tf-trace-viewer>
-</div>
diff --git a/tensorflow/tensorboard/components/tf_trace_viewer/tf-trace-viewer.html b/tensorflow/tensorboard/components/tf_trace_viewer/tf-trace-viewer.html
deleted file mode 100644
index a7b0b2cd73008856872e2c88962633d3b7e7bebb..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_trace_viewer/tf-trace-viewer.html
+++ /dev/null
@@ -1,127 +0,0 @@
-<!--
-@license
-Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="trace_viewer_full.html">
-
-<!--
-tf-trace-viewer is the frontend entry point for Trace Viewer on TensorBoard.
-
-The server serves the trace viewer app at a separate endpoint. TensorBoard
-dashboard would integrate trace viewer app using iframe.
--->
-<script>
-  "use strict";
-
-  Polymer({
-    is: "tf-trace-viewer",
-    properties: {
-      // The URL of trace data. Provided by caller via URL parameter.
-      traceDataUrl: {
-        type: String,
-        value: null,
-      },
-      _traceData: {
-        type: Object,
-        observer: "_traceDataChanged"
-      },
-      _traceViewer: Object,
-      _traceContainer: Object,
-      _traceModel: Object,
-    },
-    ready: function() {
-      // Initiate the trace viewer app.
-      this._traceContainer = document.createElement("track-view-container");
-      this._traceContainer.id = "track_view_container";
-
-      this._traceViewer = document.createElement("tr-ui-timeline-view");
-      this._traceViewer.track_view_container = this._traceContainer;
-      this._traceViewer.appendChild(this._traceContainer);
-
-      this._traceViewer.id = 'trace-viewer';
-      this._traceViewer.globalMode = true;
-
-      Polymer.dom(this.root).appendChild(this._traceViewer);
-
-      // Retrieve the URL of trace data.
-      var queryString = window.location.href.split("?")[1];
-      if (queryString) {
-        var parts = queryString.split('&')
-        for (var i=0; i<parts.length; i++) {
-          var components = parts[i].split('=');
-          if (components[0] == "trace_data_url") {
-            this.traceDataUrl = decodeURIComponent(components[1]);
-            break;
-          }
-        }
-      }
-
-      this._loadTrace();
-    },
-    _loadTrace : function() {
-      if (!this.traceDataUrl) {
-        this._displayOverlay("Trace data URL is not provided.", "Trace Viewer");
-        return null;
-      }
-      // Send HTTP request to get the trace data.
-      var req = new XMLHttpRequest();
-      var is_binary = / [.] gz$ /.test(this.traceDataUrl) ||
-                      / [.] zip$ /.test(this.traceDataUrl);
-      req.overrideMimeType('text/plain; charset=x-user-defined');
-      req.open('GET', this.traceDataUrl, true);
-      if (is_binary) {
-        req.responseType = 'arraybuffer';
-      }
-
-      req.onreadystatechange = function(event) {
-        if (req.readyState !== 4) {
-          return;
-        }
-        window.setTimeout(function() {
-          if (req.status === 200) {
-            this.set("_traceData", is_binary ? req.response : req.responseText);
-          } else {
-            this._displayOverlay(req.status, "Failed to fetch data");
-          }
-        }.bind(this), 0);
-      }.bind(this);
-      req.send(null);
-    },
-    _traceDataChanged: function(data) {
-      if (!data) {
-        this._displayOverlay("Trace Viewer", "No trace to display...");
-        return;
-      }
-      // Feed the trace data into the trace viewer app.
-      this._traceModel = new tr.Model();
-      var i = new tr.importer.Import(this._traceModel);
-      var p = i.importTracesWithProgressDialog([data]);
-      p.then(() => {
-        this._traceViewer.model = this._traceModel;
-        this._traceViewer.viewTitle = "Trace View";
-      }).catch((err) => {
-        this._displayOverlay(
-            'Import error', tr.b.normalizeException(err).message);
-      });
-    },
-    _displayOverlay: function(title, content) {
-      var overlay = new tr.ui.b.Overlay();
-      overlay.textContent = content;
-      overlay.title = title;
-      overlay.visible = true;
-    },
-  });
-</script>
diff --git a/tensorflow/tensorboard/components/trace_viewer.html b/tensorflow/tensorboard/components/trace_viewer.html
deleted file mode 100644
index c9bcdc9e207d5be7a61a2a781657f476be3dc618..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/trace_viewer.html
+++ /dev/null
@@ -1,28 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<meta charset="utf-8">
-<title>Trace Viewer</title>
-<html>
-<head>
-  <link rel="import" href="tf-trace-viewer/tf-trace-viewer.html" jscomp-nocompile="true">
-  <body>
-    <tf-trace-viewer></tf-trace-viewer>
-  </body>
-</head>
-</html>
diff --git a/tensorflow/tensorboard/components/vz_distribution_chart/BUILD b/tensorflow/tensorboard/components/vz_distribution_chart/BUILD
deleted file mode 100644
index 6645805d0c023c97f84c0cec5d67490e084e64cd..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_distribution_chart/BUILD
+++ /dev/null
@@ -1,39 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "vz_distribution_chart",
-    srcs = [
-        "vz-distribution-chart.html",
-        "vz-distribution-chart.ts",
-    ],
-    path = "/vz-distribution-chart",
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "//tensorflow/tensorboard/components/tf_imports:plottable",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "//tensorflow/tensorboard/components/vz_line_chart",
-    ],
-)
-
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"],
-    path = "/vz-distribution-chart",
-    deps = [
-        ":vz_distribution_chart",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/vz_distribution_chart/index.html b/tensorflow/tensorboard/components/vz_distribution_chart/index.html
deleted file mode 100644
index 39db09354bd527fa90bb05f0d7656991b1d2383a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_distribution_chart/index.html
+++ /dev/null
@@ -1,61 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-  <head>
-    <meta charset="utf-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>vz-distribution chart demo</title>
-    <script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
-    <link rel="import" href="vz-distribution-chart.html">
-    <link rel="import" href="../iron-demo-helpers/demo-snippet.html">
-    <link rel="import" href="../paper-styles/typography.html">
-    <style type="text/css">
-      body {
-        font-family: "Roboto";
-      }
-
-      vz-distribution-chart {
-        height: 400px;
-      }
-    </style>
-  </head>
-  <body>
-    <h3>Simple distribution chart</h3>
-    <script>
-      var data = [[1716.20,0,[[0,0.10],[668,0.10],[1587,0.10],[3085,0.10],[5000,0.10],[6915,0.10],[8413,0.10],[9332,0.10],[10000,0.10]]],[1720.60,10,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.10],[6915,0.10],[8413,0.10],[9332,0.10],[10000,0.10]]],[1724.90,20,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.10],[6915,0.10],[8413,0.10],[9332,0.10],[10000,0.10]]],[1729.18,30,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.10],[6915,0.10],[8413,0.10],[9332,0.10],[10000,0.10]]],[1733.55,40,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.10],[6915,0.10],[8413,0.10],[9332,0.10],[10000,0.11]]],[1737.84,50,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.10],[6915,0.10],[8413,0.10],[9332,0.10],[10000,0.10]]],[1742.35,60,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.10],[6915,0.10],[8413,0.10],[9332,0.10],[10000,0.11]]],[1746.75,70,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.10],[6915,0.10],[8413,0.10],[9332,0.10],[10000,0.11]]],[1751.15,80,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.10],[6915,0.10],[8413,0.10],[9332,0.10],[10000,0.10]]],[1755.44,90,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.10],[6915,0.10],[8413,0.10],[9332,0.10],[10000,0.10]]],[1759.99,100,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.10],[6915,0.10],[8413,0.10],[9332,0.10],[10000,0.11]]],[1764.40,110,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.10],[6915,0.10],[8413,0.10],[9332,0.10],[10000,0.11]]],[1768.70,120,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.10],[6915,0.10],[8413,0.10],[9332,0.10],[10000,0.11]]],[1773.00,130,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.09],[6915,0.10],[8413,0.10],[9332,0.10],[10000,0.11]]],[1777.57,140,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.10],[6915,0.10],[8413,0.10],[9332,0.10],[10000,0.11]]],[1781.96,150,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.09],[6915,0.10],[8413,0.10],[9332,0.10],[10000,0.11]]],[1786.34,160,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.09],[6915,0.10],[8413,0.10],[9332,0.11],[10000,0.11]]],[1790.67,170,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.09],[6915,0.10],[8413,0.10],[9332,0.11],[10000,0.11]]],[1794.96,180,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.09],[6915,0.10],[8413,0.10],[9332,0.11],[10000,0.11]]],[1799.29,190,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.09],[6915,0.10],[8413,0.10],[9332,0.11],[10000,0.11]]],[1803.68,200,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.09],[6915,0.10],[8413,0.10],[9332,0.11],[10000,0.11]]],[1808.88,210,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.09],[6915,0.10],[8413,0.11],[9332,0.11],[10000,0.11]]],[1813.33,220,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.09],[6915,0.10],[8413,0.10],[9332,0.11],[10000,0.11]]],[1817.66,230,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.09],[6915,0.10],[8413,0.10],[9332,0.11],[10000,0.11]]],[1821.95,240,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.09],[6915,0.10],[8413,0.10],[9332,0.11],[10000,0.11]]],[1826.97,250,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.09],[6915,0.10],[8413,0.11],[9332,0.11],[10000,0.11]]],[1831.64,260,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.09],[6915,0.10],[8413,0.10],[9332,0.11],[10000,0.11]]],[1836.01,270,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.09],[6915,0.10],[8413,0.11],[9332,0.11],[10000,0.11]]],[1840.31,280,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.09],[6915,0.10],[8413,0.11],[9332,0.11],[10000,0.11]]],[1844.63,290,[[0,0.09],[668,0.09],[1587,0.09],[3085,0.09],[5000,0.09],[6915,0.10],[8413,0.11],[9332,0.11],[10000,0.11]]]];
-    </script>
-    <demo-snippet>
-      <template>
-        <vz-distribution-chart id="demo"></vz-distribution-chart>
-        <script>
-          var elem = document.querySelector('#demo');
-          elem.setVisibleSeries(['demo']);
-          // The data format is strange. We don't expect you to use this chart.
-          var xform = data.map(function(x) {
-            var out = x[2];
-            out.wall_time = x[0];
-            out.step = x[1];
-            return out;
-          });
-          elem.setSeriesData('demo', xform);
-        </script>
-      </template>
-    </demo-snippet>
-  </body>
-</html>
diff --git a/tensorflow/tensorboard/components/vz_distribution_chart/vz-distribution-chart.html b/tensorflow/tensorboard/components/vz_distribution_chart/vz-distribution-chart.html
deleted file mode 100644
index 1f1fdda91963cf7e90dd6d8df67aa3841390322c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_distribution_chart/vz-distribution-chart.html
+++ /dev/null
@@ -1,45 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-imports/plottable.html">
-<link rel="import" href="../vz-line-chart/vz-line-chart.html">
-
-<dom-module id="vz-distribution-chart">
-  <template>
-    <div id="chartdiv"></div>
-    <style>
-      :host {
-        -webkit-user-select: none;
-        -moz-user-select: none;
-        display: flex;
-        flex-direction: column;
-        flex-grow: 1;
-        flex-shrink: 1;
-        position: relative;
-      }
-      #chartdiv {
-        -webkit-user-select: none;
-        -moz-user-select: none;
-        flex-grow: 1;
-        flex-shrink: 1;
-      }
-
-    </style>
-  </template>
-  <script src="vz-distribution-chart.js"></script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_distribution_chart/vz-distribution-chart.ts b/tensorflow/tensorboard/components/vz_distribution_chart/vz-distribution-chart.ts
deleted file mode 100644
index f3911d301d99cdeb481140ca3381ec79903a69fe..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_distribution_chart/vz-distribution-chart.ts
+++ /dev/null
@@ -1,237 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import * as ChartHelpers from '../vz-line-chart/vz-chart-helpers';
-
-export class DistributionChart {
-  private run2datasets: {[run: string]: Plottable.Dataset};
-  protected runs: string[];
-
-  protected xAccessor: Plottable.IAccessor<number|Date>;
-  protected xScale: Plottable.QuantitativeScale<number|Date>;
-  protected yScale: Plottable.QuantitativeScale<number>;
-  protected gridlines: Plottable.Components.Gridlines;
-  protected center: Plottable.Components.Group;
-  protected xAxis: Plottable.Axes.Numeric|Plottable.Axes.Time;
-  protected yAxis: Plottable.Axes.Numeric;
-  protected xLabel: Plottable.Components.AxisLabel;
-  protected yLabel: Plottable.Components.AxisLabel;
-  protected outer: Plottable.Components.Table;
-  protected colorScale: Plottable.Scales.Color;
-  private plots: Plottable.XYPlot<number|Date, number>[];
-
-  private targetSVG: d3.Selection<any, any, any, any>;
-
-  constructor(xType: string, colorScale: Plottable.Scales.Color) {
-    this.run2datasets = {};
-    this.colorScale = colorScale;
-    this.buildChart(xType);
-  }
-
-  protected getDataset(run: string) {
-    if (this.run2datasets[run] === undefined) {
-      this.run2datasets[run] = new Plottable.Dataset([], {run: run});
-    }
-    return this.run2datasets[run];
-  }
-
-  protected buildChart(xType: string) {
-    if (this.outer) {
-      this.outer.destroy();
-    }
-    let xComponents = ChartHelpers.getXComponents(xType);
-    this.xAccessor = xComponents.accessor;
-    this.xScale = xComponents.scale;
-    this.xAxis = xComponents.axis;
-    this.xAxis.margin(0).tickLabelPadding(3);
-    this.yScale = new Plottable.Scales.Linear();
-    this.yAxis = new Plottable.Axes.Numeric(this.yScale, 'left');
-    let yFormatter = ChartHelpers.multiscaleFormatter(
-        ChartHelpers.Y_AXIS_FORMATTER_PRECISION);
-    this.yAxis.margin(0).tickLabelPadding(5).formatter(yFormatter);
-    this.yAxis.usesTextWidthApproximation(true);
-
-    let center = this.buildPlot(this.xAccessor, this.xScale, this.yScale);
-
-    this.gridlines =
-        new Plottable.Components.Gridlines(this.xScale, this.yScale);
-
-    this.center = new Plottable.Components.Group([this.gridlines, center]);
-    this.outer = new Plottable.Components.Table(
-        [[this.yAxis, this.center], [null, this.xAxis]]);
-  }
-
-  protected buildPlot(xAccessor, xScale, yScale): Plottable.Component {
-    let percents = [0, 228, 1587, 3085, 5000, 6915, 8413, 9772, 10000];
-    let opacities = _.range(percents.length - 1)
-                        .map((i) => (percents[i + 1] - percents[i]) / 2500);
-    let accessors = percents.map((p, i) => (datum) => datum[i][1]);
-    let median = 4;
-    let medianAccessor = accessors[median];
-
-    let plots = _.range(accessors.length - 1).map((i) => {
-      let p = new Plottable.Plots.Area<number|Date>();
-      p.x(xAccessor, xScale);
-
-      let y0 = i > median ? accessors[i] : accessors[i + 1];
-      let y = i > median ? accessors[i + 1] : accessors[i];
-      p.y(y, yScale);
-      p.y0(y0);
-      p.attr(
-          'fill',
-          (d: any, i: number, dataset: Plottable.Dataset) =>
-              this.colorScale.scale(dataset.metadata().run));
-      p.attr(
-          'stroke',
-          (d: any, i: number, dataset: Plottable.Dataset) =>
-              this.colorScale.scale(dataset.metadata().run));
-      p.attr('stroke-weight', (d: any, i: number, m: any) => '0.5px');
-      p.attr('stroke-opacity', () => opacities[i]);
-      p.attr('fill-opacity', () => opacities[i]);
-      return p;
-    });
-
-    let medianPlot = new Plottable.Plots.Line<number|Date>();
-    medianPlot.x(xAccessor, xScale);
-    medianPlot.y(medianAccessor, yScale);
-    medianPlot.attr(
-        'stroke', (d: any, i: number, m: any) => this.colorScale.scale(m.run));
-
-    this.plots = plots;
-    return new Plottable.Components.Group(plots);
-  }
-
-  public setVisibleSeries(runs: string[]) {
-    this.runs = runs;
-    let datasets = runs.map((r) => this.getDataset(r));
-    this.plots.forEach((p) => p.datasets(datasets));
-  }
-
-  /**
-   * Set the data of a series on the chart.
-   */
-  public setSeriesData(name: string, data: any) {
-    this.getDataset(name).data(data);
-  }
-
-  public renderTo(targetSVG: d3.Selection<any, any, any, any>) {
-    this.targetSVG = targetSVG;
-    this.outer.renderTo(targetSVG);
-  }
-
-  public redraw() {
-    this.outer.redraw();
-  }
-
-  protected destroy() {
-    this.outer.destroy();
-  }
-}
-
-
-Polymer({
-  is: 'vz-distribution-chart',
-  properties: {
-    /**
-     * Scale that maps series names to colors. The default colors are from
-     * d3.d3.schemeCategory10. Use this property to replace the default
-     * line colors with colors of your own choice.
-     * @type {Plottable.Scales.Color}
-     * @required
-     */
-    colorScale: {
-      type: Object,
-      value: function() {
-        return new Plottable.Scales.Color().range(d3.schemeCategory10);
-      }
-    },
-    /**
-     * The way to display the X values. Allows:
-     * - "step" - Linear scale using the  "step" property of the datum.
-     * - "wall_time" - Temporal scale using the "wall_time" property of the
-     * datum.
-     * - "relative" - Temporal scale using the "relative" property of the
-     * datum if it is present or calculating from "wall_time" if it isn't.
-     */
-    xType: {type: String, value: 'step'},
-    _attached: Boolean,
-    _chart: Object,
-    _visibleSeriesCache: {
-      type: Array,
-      value: function() {
-        return []
-      }
-    },
-    _seriesDataCache: {
-      type: Object,
-      value: function() {
-        return {}
-      }
-    },
-    _makeChartAsyncCallbackId: {type: Number, value: null}
-  },
-  observers: [
-    '_makeChart(xType, colorScale, _attached)',
-    '_reloadFromCache(_chart)',
-  ],
-  setVisibleSeries: function(names) {
-    this._visibleSeriesCache = names;
-    if (this._chart) {
-      this._chart.setVisibleSeries(names);
-      this.redraw();
-    }
-  },
-  setSeriesData: function(name, data) {
-    this._seriesDataCache[name] = data;
-    if (this._chart) {
-      this._chart.setSeriesData(name, data);
-    }
-  },
-  redraw: function() {
-    this._chart.redraw();
-  },
-  ready: function() {
-    this.scopeSubtree(this.$.chartdiv, true);
-  },
-  _makeChart: function(xType, colorScale, _attached) {
-    if (this._makeChartAsyncCallbackId === null) {
-      this.cancelAsync(this._makeChartAsyncCallbackId);
-    }
-
-    this._makeChartAsyncCallbackId = this.async(function() {
-      this._makeChartAsyncCallbackId = null;
-      if (!_attached) return;
-      if (this._chart) this._chart.destroy();
-      var chart = new DistributionChart(xType, colorScale);
-      var svg = d3.select(this.$.chartdiv);
-      chart.renderTo(svg);
-      this._chart = chart;
-    }, 350);
-  },
-  _reloadFromCache: function() {
-    if (this._chart) {
-      this._chart.setVisibleSeries(this._visibleSeriesCache);
-      this._visibleSeriesCache.forEach(function(name) {
-        this._chart.setSeriesData(name, this._seriesDataCache[name] || []);
-      }.bind(this));
-    }
-  },
-  attached: function() {
-    this._attached = true;
-  },
-  detached: function() {
-    this._attached = false;
-  }
-});
diff --git a/tensorflow/tensorboard/components/vz_histogram_timeseries/BUILD b/tensorflow/tensorboard/components/vz_histogram_timeseries/BUILD
deleted file mode 100644
index 6f6c8d94c3732d650d00f872b293609328eb1cf7..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_histogram_timeseries/BUILD
+++ /dev/null
@@ -1,46 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "vz_histogram_timeseries",
-    srcs = ["vz-histogram-timeseries.html"],
-    path = "/vz-histogram-timeseries",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-    ],
-)
-
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"],
-    path = "/vz-histogram-timeseries",
-    deps = [
-        ":vz_histogram_timeseries",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_button",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [":vz_histogram_timeseries"],
-    visibility = ["//learning/vis/vz_elements/catalog:__pkg__"],
-    destdir = "vz-histogram-timeseries",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports_google:lib",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/vz_histogram_timeseries/index.html b/tensorflow/tensorboard/components/vz_histogram_timeseries/index.html
deleted file mode 100644
index 42efa83eb07d2da9993bb410a50c9503df8c582c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_histogram_timeseries/index.html
+++ /dev/null
@@ -1,84 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-  <head>
-    <meta charset="utf-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>vz-histogram-timeseries demo</title>
-    <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-    <link rel="import" href="vz-histogram-timeseries.html">
-    <link rel="import" href="../iron-demo-helpers/demo-snippet.html">
-    <link rel="import" href="../paper-styles/typography.html">
-    <link rel="import" href="../paper-button/paper-button.html">
-    <style type="text/css">
-      body {
-        font-family: "Roboto";
-      }
-
-      vz-histogram-timeseries {
-        height: 300px;
-        width: 500px;
-      }
-    </style>
-  </head>
-  <body>
-    <h3>vz-histogram-timeseries mode</h3>
-    <demo-snippet>
-      <template>
-        <paper-button id="offsetButton">Offset</paper-button>
-        <paper-button id="overlayButton">Overlay</paper-button>
-        <vz-histogram-timeseries id="histo1"></vz-histogram-timeseries>
-        <script>
-          var histo1 = document.querySelector('#histo1'),
-            offsetButton = document.querySelector('#offsetButton'),
-            overlayButton = document.querySelector('#overlayButton');
-
-          histo1.setSeriesData('data', data);
-          offsetButton.addEventListener('click', function() { histo1.mode = 'offset' });
-          overlayButton.addEventListener('click', function() { histo1.mode = 'overlay' });
-        </script>
-      </template>
-    </demo-snippet>
-
-    <h3>vz-histogram-timeseries axis</h3>
-    <demo-snippet>
-      <template>
-        <paper-button id="stepButton">Step</paper-button>
-        <paper-button id="relativeButton">Relative</paper-button>
-        <paper-button id="wallTimeButton">Wall Time</paper-button>
-        <vz-histogram-timeseries id="histo2"></vz-histogram-timeseries>
-        <script>
-          var histo2 = document.querySelector('#histo2'),
-            stepButton = document.querySelector('#stepButton'),
-            relativeButton = document.querySelector('#relativeButton'),
-            wallTimeButton = document.querySelector('#wallTimeButton');
-
-          histo2.setSeriesData('data', data);
-          stepButton.addEventListener('click', function() { histo2.timeProperty = 'step' });
-          relativeButton.addEventListener('click', function() { histo2.timeProperty = 'relative' });
-          wallTimeButton.addEventListener('click', function() { histo2.timeProperty = 'wall_time' });
-        </script>
-      </template>
-    </demo-snippet>
-
-    <script>
-      var data = [{"wall_time":new Date("2016-06-24T04:13:11.455Z"),"step":28,"bins":[{"x":0.0537,"dx":0.0054,"y":0},{"x":0.0591,"dx":0.0054,"y":0},{"x":0.0645,"dx":0.0054,"y":0},{"x":0.0698,"dx":0.0054,"y":0},{"x":0.0752,"dx":0.0054,"y":3.2512},{"x":0.0805,"dx":0.0054,"y":19.1043},{"x":0.0859,"dx":0.0054,"y":66.4758},{"x":0.0913,"dx":0.0054,"y":89.9105},{"x":0.0966,"dx":0.0054,"y":102.4476},{"x":0.102,"dx":0.0054,"y":83.8924},{"x":0.1073,"dx":0.0054,"y":81.9883},{"x":0.1127,"dx":0.0054,"y":25.21},{"x":0.1181,"dx":0.0054,"y":25.21},{"x":0.1234,"dx":0.0054,"y":2.51},{"x":0.1288,"dx":0.0054,"y":0},{"x":0.1341,"dx":0.0054,"y":0},{"x":0.1395,"dx":0.0054,"y":0},{"x":0.1449,"dx":0.0054,"y":0},{"x":0.1502,"dx":0.0054,"y":0},{"x":0.1556,"dx":0.0054,"y":0}]},{"wall_time":new Date("2016-06-24T04:13:32.817Z"),"step":74,"bins":[{"x":0.0537,"dx":0.0054,"y":0},{"x":0.0591,"dx":0.0054,"y":0},{"x":0.0645,"dx":0.0054,"y":0},{"x":0.0698,"dx":0.0054,"y":0.7448},{"x":0.0752,"dx":0.0054,"y":14.6532},{"x":0.0805,"dx":0.0054,"y":30.3488},{"x":0.0859,"dx":0.0054,"y":59.5117},{"x":0.0913,"dx":0.0054,"y":75.1098},{"x":0.0966,"dx":0.0054,"y":83.4545},{"x":0.102,"dx":0.0054,"y":74.1291},{"x":0.1073,"dx":0.0054,"y":73.1618},{"x":0.1127,"dx":0.0054,"y":40.9067},{"x":0.1181,"dx":0.0054,"y":40.9067},{"x":0.1234,"dx":0.0054,"y":5.241},{"x":0.1288,"dx":0.0054,"y":1.2973},{"x":0.1341,"dx":0.0054,"y":0.5347},{"x":0.1395,"dx":0.0054,"y":0},{"x":0.1449,"dx":0.0054,"y":0},{"x":0.1502,"dx":0.0054,"y":0},{"x":0.1556,"dx":0.0054,"y":0}]},{"wall_time":new Date("2016-06-24T04:14:04.265Z"),"step":148,"bins":[{"x":0.0537,"dx":0.0054,"y":0},{"x":0.0591,"dx":0.0054,"y":0},{"x":0.0645,"dx":0.0054,"y":0},{"x":0.0698,"dx":0.0054,"y":1.4897},{"x":0.0752,"dx":0.0054,"y":19.5529},{"x":0.0805,"dx":0.0054,"y":36.5817},{"x":0.0859,"dx":0.0054,"y":63.3103},{"x":0.0913,"dx":0.0054,"y":68.1847},{"x":0.0966,"dx":0.0054,"y":70.7925},{"x":0.102,"dx":0.0054,"y":68.2578},{"x":0.1073,"dx":0.0054,"y":67.9546},{"x":0.1127,"dx":0.0054,"y":44.712},{"x":0.1181,"dx":0.0054,"y":44.712},{"x":0.1234,"dx":0.0054,"y":8.3454},{"x":0.1288,"dx":0.0054,"y":4.3242},{"x":0.1341,"dx":0.0054,"y":1.7822},{"x":0.1395,"dx":0.0054,"y":0},{"x":0.1449,"dx":0.0054,"y":0},{"x":0.1502,"dx":0.0054,"y":0},{"x":0.1556,"dx":0.0054,"y":0}]},{"wall_time":new Date("2016-06-24T04:14:17.166Z"),"step":172,"bins":[{"x":0.0537,"dx":0.0054,"y":0},{"x":0.0591,"dx":0.0054,"y":0},{"x":0.0645,"dx":0.0054,"y":0},{"x":0.0698,"dx":0.0054,"y":2.2345},{"x":0.0752,"dx":0.0054,"y":22.1303},{"x":0.0805,"dx":0.0054,"y":37.9432},{"x":0.0859,"dx":0.0054,"y":57.6124},{"x":0.0913,"dx":0.0054,"y":63.9491},{"x":0.0966,"dx":0.0054,"y":67.3392},{"x":0.102,"dx":0.0054,"y":70.352},{"x":0.1073,"dx":0.0054,"y":70.5635},{"x":0.1127,"dx":0.0054,"y":44.712},{"x":0.1181,"dx":0.0054,"y":44.712},{"x":0.1234,"dx":0.0054,"y":9.5135},{"x":0.1288,"dx":0.0054,"y":5.6214},{"x":0.1341,"dx":0.0054,"y":2.5479},{"x":0.1395,"dx":0.0054,"y":0.3931},{"x":0.1449,"dx":0.0054,"y":0.3758},{"x":0.1502,"dx":0.0054,"y":0},{"x":0.1556,"dx":0.0054,"y":0}]},{"wall_time":new Date("2016-06-24T04:14:30.696Z"),"step":202,"bins":[{"x":0.0537,"dx":0.0054,"y":0},{"x":0.0591,"dx":0.0054,"y":0},{"x":0.0645,"dx":0.0054,"y":0},{"x":0.0698,"dx":0.0054,"y":5.2139},{"x":0.0752,"dx":0.0054,"y":22.6865},{"x":0.0805,"dx":0.0054,"y":37.4077},{"x":0.0859,"dx":0.0054,"y":57.6124},{"x":0.0913,"dx":0.0054,"y":62.0743},{"x":0.0966,"dx":0.0054,"y":64.4614},{"x":0.102,"dx":0.0054,"y":69.1481},{"x":0.1073,"dx":0.0054,"y":69.5186},{"x":0.1127,"dx":0.0054,"y":44.2364},{"x":0.1181,"dx":0.0054,"y":44.2364},{"x":0.1234,"dx":0.0054,"y":11.4129},{"x":0.1288,"dx":0.0054,"y":7.7835},{"x":0.1341,"dx":0.0054,"y":3.439},{"x":0.1395,"dx":0.0054,"y":0.3931},{"x":0.1449,"dx":0.0054,"y":0.3758},{"x":0.1502,"dx":0.0054,"y":0},{"x":0.1556,"dx":0.0054,"y":0}]},{"wall_time":new Date("2016-06-24T04:14:39.713Z"),"step":224,"bins":[{"x":0.0537,"dx":0.0054,"y":0},{"x":0.0591,"dx":0.0054,"y":0},{"x":0.0645,"dx":0.0054,"y":0},{"x":0.0698,"dx":0.0054,"y":5.9588},{"x":0.0752,"dx":0.0054,"y":21.5483},{"x":0.0805,"dx":0.0054,"y":37.556},{"x":0.0859,"dx":0.0054,"y":65.2096},{"x":0.0913,"dx":0.0054,"y":61.7226},{"x":0.0966,"dx":0.0054,"y":59.857},{"x":0.102,"dx":0.0054,"y":65.4045},{"x":0.1073,"dx":0.0054,"y":65.8728},{"x":0.1127,"dx":0.0054,"y":46.6147},{"x":0.1181,"dx":0.0054,"y":46.6147},{"x":0.1234,"dx":0.0054,"y":12.0391},{"x":0.1288,"dx":0.0054,"y":8.2159},{"x":0.1341,"dx":0.0054,"y":3.3861},{"x":0.1395,"dx":0.0054,"y":0},{"x":0.1449,"dx":0.0054,"y":0},{"x":0.1502,"dx":0.0054,"y":0},{"x":0.1556,"dx":0.0054,"y":0}]},{"wall_time":new Date("2016-06-24T04:15:16.226Z"),"step":309,"bins":[{"x":0.0537,"dx":0.0054,"y":0},{"x":0.0591,"dx":0.0054,"y":0.134},{"x":0.0645,"dx":0.0054,"y":0.8427},{"x":0.0698,"dx":0.0054,"y":13.4305},{"x":0.0752,"dx":0.0054,"y":25.0288},{"x":0.0805,"dx":0.0054,"y":37.6034},{"x":0.0859,"dx":0.0054,"y":60.7779},{"x":0.0913,"dx":0.0054,"y":61.3028},{"x":0.0966,"dx":0.0054,"y":61.5837},{"x":0.102,"dx":0.0054,"y":57.9013},{"x":0.1073,"dx":0.0054,"y":57.5244},{"x":0.1127,"dx":0.0054,"y":46.6147},{"x":0.1181,"dx":0.0054,"y":46.6147},{"x":0.1234,"dx":0.0054,"y":13.9859},{"x":0.1288,"dx":0.0054,"y":10.378},{"x":0.1341,"dx":0.0054,"y":4.7394},{"x":0.1395,"dx":0.0054,"y":0.7862},{"x":0.1449,"dx":0.0054,"y":0.7516},{"x":0.1502,"dx":0.0054,"y":0},{"x":0.1556,"dx":0.0054,"y":0}]},{"wall_time":new Date("2016-06-24T04:15:47.102Z"),"step":377,"bins":[{"x":0.0537,"dx":0.0054,"y":0},{"x":0.0591,"dx":0.0054,"y":0.67},{"x":0.0645,"dx":0.0054,"y":4.2133},{"x":0.0698,"dx":0.0054,"y":17.9929},{"x":0.0752,"dx":0.0054,"y":30.2753},{"x":0.0805,"dx":0.0054,"y":39.6941},{"x":0.0859,"dx":0.0054,"y":51.2814},{"x":0.0913,"dx":0.0054,"y":57.993},{"x":0.0966,"dx":0.0054,"y":61.5837},{"x":0.102,"dx":0.0054,"y":56.9449},{"x":0.1073,"dx":0.0054,"y":56.4756},{"x":0.1127,"dx":0.0054,"y":44.712},{"x":0.1181,"dx":0.0054,"y":44.712},{"x":0.1234,"dx":0.0054,"y":15.3539},{"x":0.1288,"dx":0.0054,"y":12.1077},{"x":0.1341,"dx":0.0054,"y":5.2212},{"x":0.1395,"dx":0.0054,"y":0.3931},{"x":0.1449,"dx":0.0054,"y":0.3758},{"x":0.1502,"dx":0.0054,"y":0},{"x":0.1556,"dx":0.0054,"y":0}]},{"wall_time":new Date("2016-06-24T04:16:13.835Z"),"step":433,"bins":[{"x":0.0537,"dx":0.0054,"y":0},{"x":0.0591,"dx":0.0054,"y":0.9381},{"x":0.0645,"dx":0.0054,"y":5.8986},{"x":0.0698,"dx":0.0054,"y":14.3154},{"x":0.0752,"dx":0.0054,"y":32.7152},{"x":0.0805,"dx":0.0054,"y":43.8322},{"x":0.0859,"dx":0.0054,"y":50.6483},{"x":0.0913,"dx":0.0054,"y":56.6475},{"x":0.0966,"dx":0.0054,"y":59.857},{"x":0.102,"dx":0.0054,"y":59.1875},{"x":0.1073,"dx":0.0054,"y":59.0805},{"x":0.1127,"dx":0.0054,"y":43.285},{"x":0.1181,"dx":0.0054,"y":43.285},{"x":0.1234,"dx":0.0054,"y":14.0438},{"x":0.1288,"dx":0.0054,"y":10.8105},{"x":0.1341,"dx":0.0054,"y":4.6865},{"x":0.1395,"dx":0.0054,"y":0.3931},{"x":0.1449,"dx":0.0054,"y":0.3758},{"x":0.1502,"dx":0.0054,"y":0},{"x":0.1556,"dx":0.0054,"y":0}]},{"wall_time":new Date("2016-06-24T04:16:18.632Z"),"step":445,"bins":[{"x":0.0537,"dx":0.0054,"y":0.2205},{"x":0.0591,"dx":0.0054,"y":1.5836},{"x":0.0645,"dx":0.0054,"y":5.056},{"x":0.0698,"dx":0.0054,"y":18.0162},{"x":0.0752,"dx":0.0054,"y":32.1331},{"x":0.0805,"dx":0.0054,"y":41.2513},{"x":0.0859,"dx":0.0054,"y":48.7489},{"x":0.0913,"dx":0.0054,"y":57.1104},{"x":0.0966,"dx":0.0054,"y":61.5837},{"x":0.102,"dx":0.0054,"y":57.9013},{"x":0.1073,"dx":0.0054,"y":57.5178},{"x":0.1127,"dx":0.0054,"y":44.2364},{"x":0.1181,"dx":0.0054,"y":44.2364},{"x":0.1234,"dx":0.0054,"y":13.7491},{"x":0.1288,"dx":0.0054,"y":10.378},{"x":0.1341,"dx":0.0054,"y":4.5083},{"x":0.1395,"dx":0.0054,"y":0.3931},{"x":0.1449,"dx":0.0054,"y":0.3973},{"x":0.1502,"dx":0.0054,"y":0.4892},{"x":0.1556,"dx":0.0054,"y":0.4892}]},{"wall_time":new Date("2016-06-24T04:16:36.297Z"),"step":484,"bins":[{"x":0.0537,"dx":0.0054,"y":0.441},{"x":0.0591,"dx":0.0054,"y":3.3011},{"x":0.0645,"dx":0.0054,"y":10.9546},{"x":0.0698,"dx":0.0054,"y":13.7105},{"x":0.0752,"dx":0.0054,"y":32.9245},{"x":0.0805,"dx":0.0054,"y":43.1978},{"x":0.0859,"dx":0.0054,"y":45.5834},{"x":0.0913,"dx":0.0054,"y":55.2572},{"x":0.0966,"dx":0.0054,"y":60.4326},{"x":0.102,"dx":0.0054,"y":53.4983},{"x":0.1073,"dx":0.0054,"y":52.8206},{"x":0.1127,"dx":0.0054,"y":43.7607},{"x":0.1181,"dx":0.0054,"y":43.7607},{"x":0.1234,"dx":0.0054,"y":15.6486},{"x":0.1288,"dx":0.0054,"y":12.5401},{"x":0.1341,"dx":0.0054,"y":6.5549},{"x":0.1395,"dx":0.0054,"y":2.3586},{"x":0.1449,"dx":0.0054,"y":2.2763},{"x":0.1502,"dx":0.0054,"y":0.4892},{"x":0.1556,"dx":0.0054,"y":0.4892}]},{"wall_time":new Date("2016-06-24T04:16:41.053Z"),"step":496,"bins":[{"x":0.0537,"dx":0.0054,"y":0.8819},{"x":0.0591,"dx":0.0054,"y":4.5922},{"x":0.0645,"dx":0.0054,"y":9.2693},{"x":0.0698,"dx":0.0054,"y":14.4087},{"x":0.0752,"dx":0.0054,"y":31.3218},{"x":0.0805,"dx":0.0054,"y":41.9331},{"x":0.0859,"dx":0.0054,"y":49.3821},{"x":0.0913,"dx":0.0054,"y":53.5815},{"x":0.0966,"dx":0.0054,"y":55.8282},{"x":0.102,"dx":0.0054,"y":55.0152},{"x":0.1073,"dx":0.0054,"y":54.901},{"x":0.1127,"dx":0.0054,"y":41.3824},{"x":0.1181,"dx":0.0054,"y":41.3824},{"x":0.1234,"dx":0.0054,"y":18.1373},{"x":0.1288,"dx":0.0054,"y":15.567},{"x":0.1341,"dx":0.0054,"y":7.5713},{"x":0.1395,"dx":0.0054,"y":1.9655},{"x":0.1449,"dx":0.0054,"y":1.9005},{"x":0.1502,"dx":0.0054,"y":0.4892},{"x":0.1556,"dx":0.0054,"y":0.4892}]},{"wall_time":new Date("2016-06-24T04:16:41.186Z"),"step":497,"bins":[{"x":0.0537,"dx":0.0054,"y":0.8819},{"x":0.0591,"dx":0.0054,"y":4.4581},{"x":0.0645,"dx":0.0054,"y":8.4266},{"x":0.0698,"dx":0.0054,"y":15.1302},{"x":0.0752,"dx":0.0054,"y":31.577},{"x":0.0805,"dx":0.0054,"y":42.2256},{"x":0.0859,"dx":0.0054,"y":50.6483},{"x":0.0913,"dx":0.0054,"y":53.2729},{"x":0.0966,"dx":0.0054,"y":54.6771},{"x":0.102,"dx":0.0054,"y":53.4815},{"x":0.1073,"dx":0.0054,"y":53.3397},{"x":0.1127,"dx":0.0054,"y":42.8094},{"x":0.1181,"dx":0.0054,"y":42.8094},{"x":0.1234,"dx":0.0054,"y":18.6688},{"x":0.1288,"dx":0.0054,"y":15.9995},{"x":0.1341,"dx":0.0054,"y":7.5184},{"x":0.1395,"dx":0.0054,"y":1.5724},{"x":0.1449,"dx":0.0054,"y":1.5247},{"x":0.1502,"dx":0.0054,"y":0.4892},{"x":0.1556,"dx":0.0054,"y":0.4892}]},{"wall_time":new Date("2016-06-24T04:16:49.690Z"),"step":513,"bins":[{"x":0.0537,"dx":0.0054,"y":1.4385},{"x":0.0591,"dx":0.0054,"y":3.8126},{"x":0.0645,"dx":0.0054,"y":9.2693},{"x":0.0698,"dx":0.0054,"y":15.8984},{"x":0.0752,"dx":0.0054,"y":33.6899},{"x":0.0805,"dx":0.0054,"y":44.0753},{"x":0.0859,"dx":0.0054,"y":49.3821},{"x":0.0913,"dx":0.0054,"y":51.3318},{"x":0.0966,"dx":0.0054,"y":52.3749},{"x":0.102,"dx":0.0054,"y":57.1094},{"x":0.1073,"dx":0.0054,"y":57.5073},{"x":0.1127,"dx":0.0054,"y":40.4311},{"x":0.1181,"dx":0.0054,"y":40.4311},{"x":0.1234,"dx":0.0054,"y":18.0426},{"x":0.1288,"dx":0.0054,"y":15.567},{"x":0.1341,"dx":0.0054,"y":6.878},{"x":0.1395,"dx":0.0054,"y":0.7862},{"x":0.1449,"dx":0.0054,"y":0.7731},{"x":0.1502,"dx":0.0054,"y":0.4892},{"x":0.1556,"dx":0.0054,"y":0.4892}]},{"wall_time":new Date("2016-06-24T04:17:07.617Z"),"step":551,"bins":[{"x":0.0537,"dx":0.0054,"y":0.441},{"x":0.0591,"dx":0.0054,"y":3.4351},{"x":0.0645,"dx":0.0054,"y":11.7972},{"x":0.0698,"dx":0.0054,"y":21.1823},{"x":0.0752,"dx":0.0054,"y":30.8315},{"x":0.0805,"dx":0.0054,"y":38.4273},{"x":0.0859,"dx":0.0054,"y":48.1158},{"x":0.0913,"dx":0.0054,"y":49.7656},{"x":0.0966,"dx":0.0054,"y":50.6483},{"x":0.102,"dx":0.0054,"y":56.0044},{"x":0.1073,"dx":0.0054,"y":56.4716},{"x":0.1127,"dx":0.0054,"y":43.285},{"x":0.1181,"dx":0.0054,"y":43.285},{"x":0.1234,"dx":0.0054,"y":19.1055},{"x":0.1288,"dx":0.0054,"y":16.4319},{"x":0.1341,"dx":0.0054,"y":7.4655},{"x":0.1395,"dx":0.0054,"y":1.1793},{"x":0.1449,"dx":0.0054,"y":1.1489},{"x":0.1502,"dx":0.0054,"y":0.4892},{"x":0.1556,"dx":0.0054,"y":0.4892}]},{"wall_time":new Date("2016-06-24T04:17:17.407Z"),"step":577,"bins":[{"x":0.0537,"dx":0.0054,"y":0.6614},{"x":0.0591,"dx":0.0054,"y":4.3487},{"x":0.0645,"dx":0.0054,"y":12.6399},{"x":0.0698,"dx":0.0054,"y":15.2469},{"x":0.0752,"dx":0.0054,"y":32.5059},{"x":0.0805,"dx":0.0054,"y":42.1267},{"x":0.0859,"dx":0.0054,"y":45.5834},{"x":0.0913,"dx":0.0054,"y":48.883},{"x":0.0966,"dx":0.0054,"y":50.6483},{"x":0.102,"dx":0.0054,"y":58.3955},{"x":0.1073,"dx":0.0054,"y":59.0752},{"x":0.1127,"dx":0.0054,"y":41.3824},{"x":0.1181,"dx":0.0054,"y":41.3824},{"x":0.1234,"dx":0.0054,"y":19.3054},{"x":0.1288,"dx":0.0054,"y":16.8643},{"x":0.1341,"dx":0.0054,"y":7.8748},{"x":0.1395,"dx":0.0054,"y":1.5724},{"x":0.1449,"dx":0.0054,"y":1.5032},{"x":0.1502,"dx":0.0054,"y":0},{"x":0.1556,"dx":0.0054,"y":0}]},{"wall_time":new Date("2016-06-24T04:17:21.105Z"),"step":581,"bins":[{"x":0.0537,"dx":0.0054,"y":0.6614},{"x":0.0591,"dx":0.0054,"y":4.4827},{"x":0.0645,"dx":0.0054,"y":13.4826},{"x":0.0698,"dx":0.0054,"y":15.2702},{"x":0.0752,"dx":0.0054,"y":30.6481},{"x":0.0805,"dx":0.0054,"y":40.862},{"x":0.0859,"dx":0.0054,"y":49.3821},{"x":0.0913,"dx":0.0054,"y":48.3322},{"x":0.0966,"dx":0.0054,"y":47.7705},{"x":0.102,"dx":0.0054,"y":58.6263},{"x":0.1073,"dx":0.0054,"y":59.6023},{"x":0.1127,"dx":0.0054,"y":43.285},{"x":0.1181,"dx":0.0054,"y":43.285},{"x":0.1234,"dx":0.0054,"y":18.3268},{"x":0.1288,"dx":0.0054,"y":15.567},{"x":0.1341,"dx":0.0054,"y":7.3402},{"x":0.1395,"dx":0.0054,"y":1.5724},{"x":0.1449,"dx":0.0054,"y":1.5032},{"x":0.1502,"dx":0.0054,"y":0},{"x":0.1556,"dx":0.0054,"y":0}]},{"wall_time":new Date("2016-06-24T04:17:30.677Z"),"step":602,"bins":[{"x":0.0537,"dx":0.0054,"y":0.441},{"x":0.0591,"dx":0.0054,"y":3.5692},{"x":0.0645,"dx":0.0054,"y":12.6399},{"x":0.0698,"dx":0.0054,"y":16.7365},{"x":0.0752,"dx":0.0054,"y":31.1584},{"x":0.0805,"dx":0.0054,"y":40.4233},{"x":0.0859,"dx":0.0054,"y":47.4827},{"x":0.0913,"dx":0.0054,"y":46.9203},{"x":0.0966,"dx":0.0054,"y":46.6194},{"x":0.102,"dx":0.0054,"y":57.0926},{"x":0.1073,"dx":0.0054,"y":58.0436},{"x":0.1127,"dx":0.0054,"y":45.6633},{"x":0.1181,"dx":0.0054,"y":45.6633},{"x":0.1234,"dx":0.0054,"y":18.5636},{"x":0.1288,"dx":0.0054,"y":15.567},{"x":0.1341,"dx":0.0054,"y":7.8024},{"x":0.1395,"dx":0.0054,"y":2.3586},{"x":0.1449,"dx":0.0054,"y":2.2763},{"x":0.1502,"dx":0.0054,"y":0.4892},{"x":0.1556,"dx":0.0054,"y":0.4892}]},{"wall_time":new Date("2016-06-24T04:17:30.960Z"),"step":604,"bins":[{"x":0.0537,"dx":0.0054,"y":0.6614},{"x":0.0591,"dx":0.0054,"y":4.0807},{"x":0.0645,"dx":0.0054,"y":10.9546},{"x":0.0698,"dx":0.0054,"y":17.4347},{"x":0.0752,"dx":0.0054,"y":31.878},{"x":0.0805,"dx":0.0054,"y":40.8126},{"x":0.0859,"dx":0.0054,"y":46.8496},{"x":0.0913,"dx":0.0054,"y":46.6997},{"x":0.0966,"dx":0.0054,"y":46.6194},{"x":0.102,"dx":0.0054,"y":57.0926},{"x":0.1073,"dx":0.0054,"y":58.0423},{"x":0.1127,"dx":0.0054,"y":45.1877},{"x":0.1181,"dx":0.0054,"y":45.1877},{"x":0.1234,"dx":0.0054,"y":18.5162},{"x":0.1288,"dx":0.0054,"y":15.567},{"x":0.1341,"dx":0.0054,"y":8.0335},{"x":0.1395,"dx":0.0054,"y":2.7518},{"x":0.1449,"dx":0.0054,"y":2.6521},{"x":0.1502,"dx":0.0054,"y":0.4892},{"x":0.1556,"dx":0.0054,"y":0.4892}]},{"wall_time":new Date("2016-06-24T04:17:45.074Z"),"step":637,"bins":[{"x":0.0537,"dx":0.0054,"y":1.1024},{"x":0.0591,"dx":0.0054,"y":4.8356},{"x":0.0645,"dx":0.0054,"y":5.8986},{"x":0.0698,"dx":0.0054,"y":20.2741},{"x":0.0752,"dx":0.0054,"y":34.292},{"x":0.0805,"dx":0.0054,"y":41.5418},{"x":0.0859,"dx":0.0054,"y":43.051},{"x":0.0913,"dx":0.0054,"y":48.0004},{"x":0.0966,"dx":0.0054,"y":50.6483},{"x":0.102,"dx":0.0054,"y":56.0044},{"x":0.1073,"dx":0.0054,"y":56.4862},{"x":0.1127,"dx":0.0054,"y":48.5173},{"x":0.1181,"dx":0.0054,"y":48.5173},{"x":0.1234,"dx":0.0054,"y":15.7328},{"x":0.1288,"dx":0.0054,"y":12.1077},{"x":0.1341,"dx":0.0054,"y":6.6077},{"x":0.1395,"dx":0.0054,"y":2.7518},{"x":0.1449,"dx":0.0054,"y":2.6521},{"x":0.1502,"dx":0.0054,"y":0.4892},{"x":0.1556,"dx":0.0054,"y":0.4892}]},{"wall_time":new Date("2016-06-24T04:18:17.117Z"),"step":702,"bins":[{"x":0.0537,"dx":0.0054,"y":0.8819},{"x":0.0591,"dx":0.0054,"y":4.3241},{"x":0.0645,"dx":0.0054,"y":7.5839},{"x":0.0698,"dx":0.0054,"y":20.3208},{"x":0.0752,"dx":0.0054,"y":36.1498},{"x":0.0805,"dx":0.0054,"y":44.7076},{"x":0.0859,"dx":0.0054,"y":47.4827},{"x":0.0913,"dx":0.0054,"y":46.1704},{"x":0.0966,"dx":0.0054,"y":45.4683},{"x":0.102,"dx":0.0054,"y":50.2984},{"x":0.1073,"dx":0.0054,"y":50.748},{"x":0.1127,"dx":0.0054,"y":48.993},{"x":0.1181,"dx":0.0054,"y":48.993},{"x":0.1234,"dx":0.0054,"y":17.727},{"x":0.1288,"dx":0.0054,"y":14.2698},{"x":0.1341,"dx":0.0054,"y":7.961},{"x":0.1395,"dx":0.0054,"y":3.538},{"x":0.1449,"dx":0.0054,"y":3.4037},{"x":0.1502,"dx":0.0054,"y":0.4892},{"x":0.1556,"dx":0.0054,"y":0.4892}]},{"wall_time":new Date("2016-06-24T04:18:21.750Z"),"step":714,"bins":[{"x":0.0537,"dx":0.0054,"y":0.8819},{"x":0.0591,"dx":0.0054,"y":4.7262},{"x":0.0645,"dx":0.0054,"y":10.1119},{"x":0.0698,"dx":0.0054,"y":19.6459},{"x":0.0752,"dx":0.0054,"y":35.4302},{"x":0.0805,"dx":0.0054,"y":44.3183},{"x":0.0859,"dx":0.0054,"y":48.1158},{"x":0.0913,"dx":0.0054,"y":46.3911},{"x":0.0966,"dx":0.0054,"y":45.4683},{"x":0.102,"dx":0.0054,"y":50.2984},{"x":0.1073,"dx":0.0054,"y":50.7374},{"x":0.1127,"dx":0.0054,"y":45.1877},{"x":0.1181,"dx":0.0054,"y":45.1877},{"x":0.1234,"dx":0.0054,"y":19.6843},{"x":0.1288,"dx":0.0054,"y":16.8643},{"x":0.1341,"dx":0.0054,"y":9.0303},{"x":0.1395,"dx":0.0054,"y":3.538},{"x":0.1449,"dx":0.0054,"y":3.4037},{"x":0.1502,"dx":0.0054,"y":0.4892},{"x":0.1556,"dx":0.0054,"y":0.4892}]},{"wall_time":new Date("2016-06-24T04:18:40.691Z"),"step":758,"bins":[{"x":0.0537,"dx":0.0054,"y":1.2181},{"x":0.0591,"dx":0.0054,"y":3.4351},{"x":0.0645,"dx":0.0054,"y":11.7972},{"x":0.0698,"dx":0.0054,"y":19.6926},{"x":0.0752,"dx":0.0054,"y":34.0368},{"x":0.0805,"dx":0.0054,"y":43.0042},{"x":0.0859,"dx":0.0054,"y":49.3821},{"x":0.0913,"dx":0.0054,"y":44.9576},{"x":0.0966,"dx":0.0054,"y":42.5906},{"x":0.102,"dx":0.0054,"y":52.9203},{"x":0.1073,"dx":0.0054,"y":53.8615},{"x":0.1127,"dx":0.0054,"y":42.8094},{"x":0.1181,"dx":0.0054,"y":42.8094},{"x":0.1234,"dx":0.0054,"y":21.005},{"x":0.1288,"dx":0.0054,"y":18.594},{"x":0.1341,"dx":0.0054,"y":9.5121},{"x":0.1395,"dx":0.0054,"y":3.1449},{"x":0.1449,"dx":0.0054,"y":3.0495},{"x":0.1502,"dx":0.0054,"y":0.9785},{"x":0.1556,"dx":0.0054,"y":0.9785}]},{"wall_time":new Date("2016-06-24T04:18:44.566Z"),"step":762,"bins":[{"x":0.0537,"dx":0.0054,"y":1.2181},{"x":0.0591,"dx":0.0054,"y":3.5692},{"x":0.0645,"dx":0.0054,"y":12.6399},{"x":0.0698,"dx":0.0054,"y":18.9711},{"x":0.0752,"dx":0.0054,"y":34.7106},{"x":0.0805,"dx":0.0054,"y":43.929},{"x":0.0859,"dx":0.0054,"y":48.7489},{"x":0.0913,"dx":0.0054,"y":43.6121},{"x":0.0966,"dx":0.0054,"y":40.8639},{"x":0.102,"dx":0.0054,"y":53.25},{"x":0.1073,"dx":0.0054,"y":54.3819},{"x":0.1127,"dx":0.0054,"y":42.3337},{"x":0.1181,"dx":0.0054,"y":42.3337},{"x":0.1234,"dx":0.0054,"y":21.7363},{"x":0.1288,"dx":0.0054,"y":19.4588},{"x":0.1341,"dx":0.0054,"y":9.8685},{"x":0.1395,"dx":0.0054,"y":3.1449},{"x":0.1449,"dx":0.0054,"y":3.0495},{"x":0.1502,"dx":0.0054,"y":0.9785},{"x":0.1556,"dx":0.0054,"y":0.9785}]},{"wall_time":new Date("2016-06-24T04:18:48.972Z"),"step":771,"bins":[{"x":0.0537,"dx":0.0054,"y":1.4385},{"x":0.0591,"dx":0.0054,"y":4.4827},{"x":0.0645,"dx":0.0054,"y":13.4826},{"x":0.0698,"dx":0.0054,"y":17.5047},{"x":0.0752,"dx":0.0054,"y":34.2003},{"x":0.0805,"dx":0.0054,"y":43.3441},{"x":0.0859,"dx":0.0054,"y":46.2165},{"x":0.0913,"dx":0.0054,"y":43.1044},{"x":0.0966,"dx":0.0054,"y":41.4395},{"x":0.102,"dx":0.0054,"y":52.8213},{"x":0.1073,"dx":0.0054,"y":53.8641},{"x":0.1127,"dx":0.0054,"y":43.7607},{"x":0.1181,"dx":0.0054,"y":43.7607},{"x":0.1234,"dx":0.0054,"y":22.2678},{"x":0.1288,"dx":0.0054,"y":19.8912},{"x":0.1341,"dx":0.0054,"y":10.0467},{"x":0.1395,"dx":0.0054,"y":3.1449},{"x":0.1449,"dx":0.0054,"y":3.0495},{"x":0.1502,"dx":0.0054,"y":0.9785},{"x":0.1556,"dx":0.0054,"y":0.9785}]},{"wall_time":new Date("2016-06-24T04:19:18.102Z"),"step":836,"bins":[{"x":0.0537,"dx":0.0054,"y":1.8795},{"x":0.0591,"dx":0.0054,"y":6.4437},{"x":0.0645,"dx":0.0054,"y":16.0105},{"x":0.0698,"dx":0.0054,"y":18.3196},{"x":0.0752,"dx":0.0054,"y":30.2753},{"x":0.0805,"dx":0.0054,"y":39.2554},{"x":0.0859,"dx":0.0054,"y":49.3821},{"x":0.0913,"dx":0.0054,"y":44.5826},{"x":0.0966,"dx":0.0054,"y":42.015},{"x":0.102,"dx":0.0054,"y":53.349},{"x":0.1073,"dx":0.0054,"y":54.3793},{"x":0.1127,"dx":0.0054,"y":41.3824},{"x":0.1181,"dx":0.0054,"y":41.3824},{"x":0.1234,"dx":0.0054,"y":22.031},{"x":0.1288,"dx":0.0054,"y":19.8912},{"x":0.1341,"dx":0.0054,"y":10.0467},{"x":0.1395,"dx":0.0054,"y":3.1449},{"x":0.1449,"dx":0.0054,"y":3.071},{"x":0.1502,"dx":0.0054,"y":1.4677},{"x":0.1556,"dx":0.0054,"y":1.4677}]},{"wall_time":new Date("2016-06-24T04:19:26.881Z"),"step":854,"bins":[{"x":0.0537,"dx":0.0054,"y":1.8795},{"x":0.0591,"dx":0.0054,"y":6.3097},{"x":0.0645,"dx":0.0054,"y":15.1679},{"x":0.0698,"dx":0.0054,"y":19.0411},{"x":0.0752,"dx":0.0054,"y":29.1371},{"x":0.0805,"dx":0.0054,"y":37.795},{"x":0.0859,"dx":0.0054,"y":50.0152},{"x":0.0913,"dx":0.0054,"y":45.1782},{"x":0.0966,"dx":0.0054,"y":42.5906},{"x":0.102,"dx":0.0054,"y":51.0074},{"x":0.1073,"dx":0.0054,"y":51.781},{"x":0.1127,"dx":0.0054,"y":45.1877},{"x":0.1181,"dx":0.0054,"y":45.1877},{"x":0.1234,"dx":0.0054,"y":20.8524},{"x":0.1288,"dx":0.0054,"y":18.1616},{"x":0.1341,"dx":0.0054,"y":9.796},{"x":0.1395,"dx":0.0054,"y":3.9311},{"x":0.1449,"dx":0.0054,"y":3.8226},{"x":0.1502,"dx":0.0054,"y":1.4677},{"x":0.1556,"dx":0.0054,"y":1.4677}]},{"wall_time":new Date("2016-06-24T04:20:06.833Z"),"step":947,"bins":[{"x":0.0537,"dx":0.0054,"y":3.6542},{"x":0.0591,"dx":0.0054,"y":6.8212},{"x":0.0645,"dx":0.0054,"y":13.4826},{"x":0.0698,"dx":0.0054,"y":15.2702},{"x":0.0752,"dx":0.0054,"y":32.0414},{"x":0.0805,"dx":0.0054,"y":41.1524},{"x":0.0859,"dx":0.0054,"y":43.6841},{"x":0.0913,"dx":0.0054,"y":43.7216},{"x":0.0966,"dx":0.0054,"y":43.7417},{"x":0.102,"dx":0.0054,"y":46.8023},{"x":0.1073,"dx":0.0054,"y":47.089},{"x":0.1127,"dx":0.0054,"y":46.6147},{"x":0.1181,"dx":0.0054,"y":46.6147},{"x":0.1234,"dx":0.0054,"y":24.8881},{"x":0.1288,"dx":0.0054,"y":22.4857},{"x":0.1341,"dx":0.0054,"y":11.116},{"x":0.1395,"dx":0.0054,"y":3.1449},{"x":0.1449,"dx":0.0054,"y":3.0926},{"x":0.1502,"dx":0.0054,"y":1.9569},{"x":0.1556,"dx":0.0054,"y":1.9569}]},{"wall_time":new Date("2016-06-24T04:20:20.058Z"),"step":974,"bins":[{"x":0.0537,"dx":0.0054,"y":3.5385},{"x":0.0591,"dx":0.0054,"y":9.0258},{"x":0.0645,"dx":0.0054,"y":12.6399},{"x":0.0698,"dx":0.0054,"y":17.4814},{"x":0.0752,"dx":0.0054,"y":30.9491},{"x":0.0805,"dx":0.0054,"y":39.7415},{"x":0.0859,"dx":0.0054,"y":46.8496},{"x":0.0913,"dx":0.0054,"y":42.9501},{"x":0.0966,"dx":0.0054,"y":40.8639},{"x":0.102,"dx":0.0054,"y":51.8154},{"x":0.1073,"dx":0.0054,"y":52.8153},{"x":0.1127,"dx":0.0054,"y":41.8581},{"x":0.1181,"dx":0.0054,"y":41.8581},{"x":0.1234,"dx":0.0054,"y":22.8571},{"x":0.1288,"dx":0.0054,"y":20.7561},{"x":0.1341,"dx":0.0054,"y":11.0964},{"x":0.1395,"dx":0.0054,"y":4.3242},{"x":0.1449,"dx":0.0054,"y":4.22},{"x":0.1502,"dx":0.0054,"y":1.9569},{"x":0.1556,"dx":0.0054,"y":1.9569}]},{"wall_time":new Date("2016-06-24T04:20:29.637Z"),"step":999,"bins":[{"x":0.0537,"dx":0.0054,"y":5.2083},{"x":0.0591,"dx":0.0054,"y":6.5532},{"x":0.0645,"dx":0.0054,"y":11.7972},{"x":0.0698,"dx":0.0054,"y":18.9477},{"x":0.0752,"dx":0.0054,"y":30.5305},{"x":0.0805,"dx":0.0054,"y":39.1091},{"x":0.0859,"dx":0.0054,"y":48.7489},{"x":0.0913,"dx":0.0054,"y":39.4876},{"x":0.0966,"dx":0.0054,"y":34.5329},{"x":0.102,"dx":0.0054,"y":55.0967},{"x":0.1073,"dx":0.0054,"y":57},{"x":0.1127,"dx":0.0054,"y":45.6633},{"x":0.1181,"dx":0.0054,"y":45.6633},{"x":0.1234,"dx":0.0054,"y":18.5636},{"x":0.1288,"dx":0.0054,"y":15.567},{"x":0.1341,"dx":0.0054,"y":10.3444},{"x":0.1395,"dx":0.0054,"y":6.6828},{"x":0.1449,"dx":0.0054,"y":6.4532},{"x":0.1502,"dx":0.0054,"y":1.4677},{"x":0.1556,"dx":0.0054,"y":1.4677}]}];
-    </script>
-  </body>
-</html>
diff --git a/tensorflow/tensorboard/components/vz_histogram_timeseries/vz-histogram-timeseries.html b/tensorflow/tensorboard/components/vz_histogram_timeseries/vz-histogram-timeseries.html
deleted file mode 100644
index bdba230077d48d3602be2213ea0aa18f7d60a5b4..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_histogram_timeseries/vz-histogram-timeseries.html
+++ /dev/null
@@ -1,707 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-imports/d3.html">
-
-<!--
-vz-histogram-timeseries creates an element that draws beautiful histograms for
-displaying how data is distributed over time.
-
-This histogram supports changing the time axis type and different modes of
-visualization.
-
-@element vz-histogram-timeseries
-@demo demo/index.html
--->
-<dom-module id="vz-histogram-timeseries">
-    <template>
-      <div id="tooltip"><span></span></div>
-      <svg id="svg">
-        <g>
-          <g class="axis x"></g>
-          <g class="axis y"></g>
-          <g class="axis y slice"></g>
-          <g class="stage">
-            <rect class="background"></rect>
-          </g>
-          <g class="x-axis-hover"></g>
-          <g class="y-axis-hover"></g>
-          <g class="y-slice-axis-hover"></g>
-        </g>
-      </svg>
-
-      <style>
-        :host {
-          display: flex;
-          flex-direction: column;
-          flex-grow: 1;
-          flex-shrink: 1;
-          position: relative;
-        }
-
-        svg {
-          font-family: roboto, sans-serif;
-          overflow: visible;
-          display: block;
-          width: 100%;
-          flex-grow: 1;
-          flex-shrink: 1;
-        }
-
-        #tooltip {
-          position: absolute;
-          display: block;
-          opacity: 0;
-          font-weight: bold;
-          font-size: 11px;
-        }
-
-        .background {
-          fill-opacity: 0;
-          fill: red;
-        }
-
-        .histogram {
-          pointer-events: none;
-        }
-
-        .hover {
-          font-size: 9px;
-          dominant-baseline: middle;
-          opacity: 0;
-        }
-
-        .hover circle {
-          stroke: white;
-          stroke-opacity: 0.5;
-          stroke-width: 1px;
-        }
-
-        .hover text {
-          fill: black;
-          opacity: 0;
-        }
-
-        .hover.hover-closest circle {
-          fill: black!important;
-        }
-
-        .hover.hover-closest text {
-          opacity: 1;
-        }
-
-        .baseline {
-          stroke: black;
-          stroke-opacity: 0.1;
-        }
-
-        .outline {
-          fill: none;
-          stroke: white;
-          stroke-opacity: 0.5;
-        }
-
-        .outline.outline-hover {
-          stroke: black!important;
-          stroke-opacity: 1;
-        }
-
-        .x-axis-hover,
-        .y-axis-hover,
-        .y-slice-axis-hover {
-          pointer-events: none;
-        }
-
-        .x-axis-hover .label,
-        .y-axis-hover .label,
-        .y-slice-axis-hover .label {
-          opacity: 0;
-          font-weight: bold;
-          font-size: 11px;
-          text-anchor: end;
-        }
-
-        .x-axis-hover text {
-          text-anchor: middle;
-        }
-
-        .y-axis-hover text,
-        .y-slice-axis-hover text {
-          text-anchor: start;
-        }
-
-        .x-axis-hover line,
-        .y-axis-hover line,
-        .y-slice-axis-hover line {
-          stroke: black;
-        }
-
-        .x-axis-hover rect,
-        .y-axis-hover rect,
-        .y-slice-axis-hover rect {
-          fill: white;
-        }
-
-        .axis {
-          font-size: 11px;
-        }
-
-        .axis path.domain {
-          fill: none;
-        }
-
-        .axis .tick line {
-          stroke: #ddd;
-        }
-
-        .axis.slice {
-          opacity: 0;
-        }
-
-        .axis.slice .tick line {
-          stroke-dasharray: 2;
-        }
-
-        .small .axis text { display: none; }
-        .small .axis .tick:first-of-type text { display: block; }
-        .small .axis .tick:last-of-type text { display: block; }
-        .medium .axis text { display: none; }
-        .medium .axis .tick:nth-child(2n + 1) text { display: block; }
-        .large .axis text { display: none; }
-        .large .axis .tick:nth-child(2n + 1) text { display: block; }
-
-      </style>
-    </template>
-
-    <script>
-    Polymer({
-      is: "vz-histogram-timeseries",
-      properties: {
-        /**
-         * Defines which view mode is being used by the chart. Supported values
-         * are:
-         * - "offset" - Offset view of the data showing all timesteps.
-         * - "overlay" - Overlays all timesteps into one 2D view, with the
-         * brighter lines representing the newer timesteps.
-         */
-        mode: {
-          type: String,
-          value: "offset"
-        },
-
-        /*
-         * The name of the datum's property that contains the time values.
-         * Allows:
-         * - "step" - Linear scale using the "step" property of the datum.
-         * - "wall_time" - Temporal scale using the "wall_time" property of the
-         * datum.
-         * - "relative" - Temporal scale starting at 0 created by using
-         * the "wall_time" property of the datum.
-         */
-        timeProperty: {
-          type: String,
-          value: "step"
-        },
-
-        /**
-         * The name of the data's property that contains the bins.
-         */
-        bins: {
-          type: String,
-          value: "bins"
-        },
-
-        /**
-         * The name of the datum's property that contains the x values.
-         */
-        x: {
-          type: String,
-          value: "x"
-        },
-
-        /**
-         * The name of the datum's property that contains the bin width values.
-         */
-        dx: {
-          type: String,
-          value: "dx"
-        },
-
-        /**
-         * The name of the datum's property that contains the bin height.
-         */
-        y: {
-          type: String,
-          value: "y"
-        },
-
-        /**
-         * Scale that maps series names to colors. The default colors are from
-         * d3.schemeCategory10() scale. Use this property to replace the default
-         * line colors with colors of your own choice.
-         */
-        colorScale: {
-          type: Object,
-          value: function() {
-            return d3.scaleOrdinal(d3.schemeCategory10);
-          }
-        },
-
-        /**
-         * Duration of the transition between histogram modes.
-         */
-        modeTransitionDuration: {
-          type: Number,
-          value: 500
-        },
-
-        _attached: Boolean,
-        _name: { type: String, value: null },
-        _data: { type: Array, value: null },
-      },
-      observers: [
-        'redraw(timeProperty, _attached)',
-        '_modeRedraw(mode)'
-      ],
-      ready: function() {
-        // Polymer's way of scoping styles on nodes that d3 created
-        this.scopeSubtree(this.$.svg, true);
-      },
-      attached: function() {
-        this._attached = true;
-      },
-      detached: function() {
-        this._attached = false;
-      },
-      setVisibleSeries: function(names) {
-        // Do nothing.
-      },
-      setSeriesData: function(name, data) {
-        this._name = name;
-        this._data = data;
-        this.redraw();
-      },
-
-      /**
-       * Redraws the chart. This is only called if the chart is attached to the
-       * screen and if the chart has data.
-       */
-      redraw: function() {
-        this._draw(0);
-      },
-
-      _modeRedraw: function() {
-        this._draw(this.modeTransitionDuration);
-      },
-
-      _draw: function(duration) {
-        if (!this._attached || !this._data) {
-          return;
-        }
-
-        //
-        // Data verification
-        //
-        if (duration === undefined) throw(new Error("vz-histogram-timeseries _draw needs duration"));
-        if (this._data.length <= 0) throw(new Error("Not enough steps in the data"));
-        if (!this._data[0].hasOwnProperty(this.bins)) throw(new Error("No bins property of '" + this.bins + "' in data"));
-        if (this._data[0][this.bins].length <= 0) throw(new Error("Must have at least one bin in bins in data"));
-        if (!this._data[0][this.bins][0].hasOwnProperty(this.x)) throw(new Error("No x property '" + this.x + "' on bins data"));
-        if (!this._data[0][this.bins][0].hasOwnProperty(this.dx)) throw(new Error("No dx property '" + this.dx + "' on bins data"));
-        if (!this._data[0][this.bins][0].hasOwnProperty(this.y)) throw(new Error("No y property '" + this.y + "' on bins data"));
-
-        //
-        // Initialization
-        //
-        var timeProp = this.timeProperty;
-        var xProp = this.x;
-        var binsProp = this.bins;
-        var dxProp = this.dx;
-        var yProp = this.y;
-
-        var data = this._data;
-        var name = this._name;
-        var mode = this.mode;
-        var color = d3.hcl(this.colorScale(name));
-        var tooltip = d3.select(this.$.tooltip);
-
-        var xAccessor = function(d) { return d[xProp] };
-        var yAccessor = function(d) { return d[yProp] };
-        var dxAccessor = function(d) { return d[dxProp] };
-        var xRightAccessor = function(d) { return d[xProp] + d[dxProp] };
-        var timeAccessor = function(d) { return d[timeProp] };
-
-        if (timeProp === "relative") {
-          timeAccessor = function(d) { return d.wall_time - data[0].wall_time };
-        }
-
-        var brect = this.$.svg.getBoundingClientRect();
-        var outerWidth = brect.width,
-            outerHeight = brect.height;
-
-        var sliceHeight,
-            margin = {top: 5, right: 60, bottom: 20, left: 24};
-
-        if (mode === "offset") {
-          sliceHeight = outerHeight / 2.5;
-          margin.top = sliceHeight + 5;
-        } else {
-          sliceHeight = outerHeight - margin.top - margin.bottom;
-        }
-
-        var width = outerWidth - margin.left - margin.right,
-            height = outerHeight - margin.top - margin.bottom;
-
-        var leftMin = d3.min(data, xAccessor),
-            rightMax = d3.max(data, xRightAccessor);
-
-        //
-        // Text formatters
-        //
-        var format = d3.format(".3n");
-        var yAxisFormat = d3.format(".0f");
-
-        if (timeProp === "wall_time") {
-          yAxisFormat = d3.timeFormat("%m/%d %X");
-        } else if (timeProp === "relative") {
-          yAxisFormat = function(d) {
-            return d3.format(".1r")(d / 3.6e6) + 'h'; // Convert to hours.
-          };
-        }
-
-        //
-        // Calculate the extents
-        //
-        var xExtents = data.map(function(d, i) {
-          return [
-            d3.min(d[binsProp], xAccessor),
-            d3.max(d[binsProp], xRightAccessor)
-          ];
-        });
-        var yExtents = data.map(function(d) {
-          return d3.extent(d[binsProp], yAccessor);
-        });
-
-        //
-        // Scales and axis
-        //
-        var outlineCanvasSize = 500;
-
-        var extent = d3.extent(data, timeAccessor);
-
-        var yScale = (timeProp === "wall_time" ? d3.scaleTime() : d3.scaleLinear())
-            .domain(extent)
-            .range([0, (mode === "offset" ? height : 0)]);
-
-        var ySliceScale = d3.scaleLinear()
-            .domain([0, d3.max(data, function(d, i) { return yExtents[i][1]; })])
-            .range([sliceHeight, 0]);
-
-        var yLineScale = d3.scaleLinear()
-            .domain(ySliceScale.domain())
-            .range([outlineCanvasSize, 0]);
-
-        var xScale = d3.scaleLinear()
-            .domain([
-              d3.min(data, function(d, i) { return xExtents[i][0]; }),
-              d3.max(data, function(d, i) { return xExtents[i][1]; })
-            ])
-            .nice()
-            .range([0, width]);
-
-        var xLineScale = d3.scaleLinear()
-            .domain(xScale.domain())
-            .range([0, outlineCanvasSize]);
-
-        var outlineColor = d3.scaleLinear()
-            .domain(d3.extent(data, timeAccessor))
-            .range([color.darker(), color.brighter()])
-            .interpolate(d3.interpolateHcl);
-
-        var xAxis = d3.axisBottom(xScale).ticks(Math.max(2, width / 20));
-
-        var yAxis = d3.axisRight(yScale)
-            .ticks(Math.max(2, height / 15))
-            .tickFormat(yAxisFormat);
-
-
-
-        var ySliceAxis = d3.axisRight(ySliceScale)
-            .ticks(Math.max(2, height / 15))
-            .tickSize(width + 5)
-            .tickFormat(format);
-
-        var xBinCentroid = function(d) {
-          return d[xProp] + d[dxProp] / 2;
-        };
-
-        var linePath = d3.line()
-            .x(function(d) { return xLineScale(xBinCentroid(d)); })
-            .y(function(d) { return yLineScale(d[yProp]); });
-
-        var path = function(d) {
-          // Draw a line from 0 to the first point and from the last point to 0.
-          return 'M' + xLineScale(xBinCentroid(d[0])) + ',' + yLineScale(0) +
-              'L' + linePath(d).slice(1) +
-              "L" + xLineScale(xBinCentroid(d[d.length - 1])) + "," + yLineScale(0);
-        };
-
-        //
-        // Render
-        //
-        var svgNode = this.$.svg;
-
-        var svg = d3.select(svgNode)
-
-        var svgTransition = svg.transition().duration(duration);
-
-        var g = svg.select("g")
-            .classed("small", function() { return (width > 0 && width <= 150); })
-            .classed("medium", function() { return (width > 150 && width <= 300); })
-            .classed("large", function() { return (width > 300); })
-
-        var gTransition = svgTransition.select("g")
-            .attr("transform", "translate(" + margin.left + "," + margin.top + ")");
-
-        var bisect = d3.bisector(xRightAccessor).left;
-        var stage = g.select(".stage")
-            .on("mouseover", function() {
-              hoverUpdate.style("opacity", 1);
-              xAxisHoverUpdate.style("opacity", 1);
-              yAxisHoverUpdate.style("opacity", 1);
-              ySliceAxisHoverUpdate.style("opacity", 1);
-              tooltip.style("opacity", 1);
-            })
-            .on("mouseout", function() {
-              hoverUpdate.style("opacity", 0);
-              xAxisHoverUpdate.style("opacity", 0);
-              yAxisHoverUpdate.style("opacity", 0);
-              ySliceAxisHoverUpdate.style("opacity", 0);
-              hoverUpdate.classed("hover-closest", false);
-              outlineUpdate.classed("outline-hover", false);
-              tooltip.style("opacity", 0);
-            })
-            .on("mousemove", onMouseMove);
-
-        var background = stage.select(".background")
-            .attr("transform", "translate(" + -margin.left + "," + -margin.top + ")")
-            .attr("width", outerWidth)
-            .attr("height", outerHeight);
-
-        var histogram = stage.selectAll(".histogram").data(data),
-            histogramExit = histogram.exit().remove(),
-            histogramEnter = histogram.enter().append("g").attr("class", "histogram"),
-            histogramUpdate = histogramEnter.merge(histogram)
-                .sort(function(a, b) { return timeAccessor(a) - timeAccessor(b); }),
-            histogramTransition = gTransition.selectAll(".histogram")
-                .attr("transform", function(d) {
-                  return "translate(0, " +
-                    (mode === "offset" ? (yScale(timeAccessor(d)) - sliceHeight) : 0) + ")";
-                });
-
-        var baselineEnter = histogramEnter.append("line").attr("class", "baseline"),
-            baselineUpdate = histogramTransition.select(".baseline")
-                .style("stroke-opacity", function(d) { return (mode === "offset" ? 0.1 : 0); })
-                .attr("y1", sliceHeight)
-                .attr("y2", sliceHeight)
-                .attr("x2", width);
-
-        var outlineEnter = histogramEnter.append("path").attr("class", "outline"),
-            outlineUpdate = histogramUpdate.select(".outline")
-                .attr("vector-effect", "non-scaling-stroke")
-                .attr("d", function(d) { return path(d[binsProp]); })
-                .style("stroke-width", 1),
-            outlineTransition = histogramTransition.select(".outline")
-                .attr("transform", "scale(" + width / outlineCanvasSize + ", " +
-                      sliceHeight / outlineCanvasSize + ")")
-                .style("stroke", function(d) {
-                  return (mode === "offset" ? "white" : outlineColor(timeAccessor(d)));
-                })
-                .style("fill-opacity", function(d) { return (mode === "offset" ? 1 : 0); })
-                .style("fill", function(d) { return outlineColor(timeAccessor(d)); });
-
-        var hoverEnter = histogramEnter.append("g")
-                .attr("class", "hover")
-                .style("fill", function(d) { return outlineColor(timeAccessor(d)); }),
-            hoverUpdate = histogramUpdate.select(".hover");
-
-        hoverEnter.append("circle")
-            .attr("r", 2);
-
-        hoverEnter.append("text")
-            .style("display", "none")
-            .attr("dx", 4);
-
-        var xAxisHover = g.select(".x-axis-hover").selectAll(".label").data(["x"]),
-            xAxisHoverEnter = xAxisHover.enter().append("g").attr("class", "label"),
-            xAxisHoverUpdate = xAxisHover.merge(xAxisHoverEnter);
-
-        xAxisHoverEnter.append("rect")
-            .attr("x", -20)
-            .attr("y", 6)
-            .attr("width", 40)
-            .attr("height", 14)
-
-        xAxisHoverEnter.append("line")
-            .attr("x1", 0)
-            .attr("x2", 0)
-            .attr("y1", 0)
-            .attr("y2", 6);
-
-        xAxisHoverEnter.append("text")
-            .attr("dy", 18);
-
-        var yAxisHover = g.select(".y-axis-hover").selectAll(".label").data(["y"]),
-            yAxisHoverEnter = yAxisHover.enter().append("g").attr("class", "label"),
-            yAxisHoverUpdate = yAxisHover.merge(yAxisHoverEnter);
-
-        yAxisHoverEnter.append("rect")
-            .attr("x", 8)
-            .attr("y", -6)
-            .attr("width", 40)
-            .attr("height", 14)
-
-        yAxisHoverEnter.append("line")
-            .attr("x1", 0)
-            .attr("x2", 6)
-            .attr("y1", 0)
-            .attr("y2", 0);
-
-        yAxisHoverEnter.append("text")
-            .attr("dx", 8)
-            .attr("dy", 4);
-
-        var ySliceAxisHover = g.select(".y-slice-axis-hover").selectAll(".label").data(["y"]),
-            ySliceAxisHoverEnter = ySliceAxisHover.enter().append("g").attr("class", "label"),
-            ySliceAxisHoverUpdate = ySliceAxisHover.merge(ySliceAxisHoverEnter);
-
-        ySliceAxisHoverEnter.append("rect")
-            .attr("x", 8)
-            .attr("y", -6)
-            .attr("width", 40)
-            .attr("height", 14)
-
-        ySliceAxisHoverEnter.append("line")
-            .attr("x1", 0)
-            .attr("x2", 6)
-            .attr("y1", 0)
-            .attr("y2", 0);
-
-        ySliceAxisHoverEnter.append("text")
-            .attr("dx", 8)
-            .attr("dy", 4);
-
-        gTransition.select(".y.axis.slice")
-            .style("opacity", mode === "offset" ? 0 : 1)
-            .attr("transform", "translate(0, " + (mode === "offset" ? -sliceHeight : 0) + ")")
-            .call(ySliceAxis);
-
-        gTransition.select(".x.axis")
-            .attr("transform", "translate(0, " + height + ")")
-            .call(xAxis);
-
-        gTransition.select(".y.axis")
-            .style("opacity", mode === "offset" ? 1 : 0)
-            .attr("transform", "translate(" + width + ", " + (mode === "offset" ? 0 : height) + ")")
-            .call(yAxis);
-
-        gTransition.selectAll(".tick text")
-            .attr("fill", "#aaa");
-        gTransition.selectAll(".axis path.domain").attr("stroke", "none");
-
-
-        function onMouseMove() {
-          var m = d3.mouse(this),
-              v = xScale.invert(m[0]),
-              t = yScale.invert(m[1]);
-
-          function hoverXIndex(d) {
-            return Math.min(d[binsProp].length - 1, bisect(d[binsProp], v));
-          }
-          var closestSliceData;
-          var closestSliceDistance = Infinity;
-          var lastSliceData;
-          hoverUpdate
-            .attr("transform", function(d, i) {
-              var index = hoverXIndex(d);
-              lastSliceData = d;
-              var x = xScale(d[binsProp][index][xProp] + d[binsProp][index][dxProp] / 2);
-              var y = ySliceScale(d[binsProp][index][yProp]);
-              var globalY = (mode === "offset" ? yScale(timeAccessor(d)) - (sliceHeight - y) : y);
-              var dist = Math.abs(m[1] - globalY);
-              if (dist < closestSliceDistance) {
-                closestSliceDistance = dist;
-                closestSliceData = d;
-              }
-              return "translate(" + x + "," + y + ")";
-            });
-          hoverUpdate.select("text").text(function(d) {
-            var index = hoverXIndex(d);
-            return d[binsProp][index][yProp];
-          })
-          hoverUpdate.classed("hover-closest", function(d) { return d === closestSliceData; });
-          outlineUpdate.classed("outline-hover", function(d) { return d === closestSliceData; });
-
-          var index = hoverXIndex(lastSliceData);
-
-          xAxisHoverUpdate
-              .attr("transform", function(d) {
-                return "translate(" +
-                  xScale(lastSliceData[binsProp][index][xProp] +
-                         lastSliceData[binsProp][index][dxProp] / 2) + ", " +
-                  height + ")";
-              })
-            .select("text")
-              .text(function(d) { return format(lastSliceData[binsProp][index][xProp] +
-                                                lastSliceData[binsProp][index][dxProp] / 2); });
-
-          var fy = yAxis.tickFormat();
-          yAxisHoverUpdate
-              .attr("transform", function(d) {
-                return "translate(" + width + ", " +
-                  (mode === "offset" ? yScale(timeAccessor(closestSliceData)) : 0) + ")";
-              })
-              .style("display", mode === "offset" ? "" : "none")
-            .select("text")
-              .text(function(d) { return fy(timeAccessor(closestSliceData));});
-
-          var fsy = ySliceAxis.tickFormat();
-          ySliceAxisHoverUpdate
-              .attr("transform", function(d) {
-                return "translate(" + width + ", " +
-                  (mode === "offset" ? 0 : ySliceScale(closestSliceData[binsProp][index][yProp])) +
-                  ")";
-              })
-              .style("display", mode === "offset" ? "none" : "")
-            .select("text")
-              .text(function(d) { return fsy(closestSliceData[binsProp][index][yProp]); });
-
-          var svgMouse = d3.mouse(svgNode);
-          tooltip.style("transform", "translate(" + (svgMouse[0] + 15) + "px," +
-              (svgMouse[1] - 15) + "px)")
-            .select('span')
-            .text(mode === "offset" ?
-                fsy(closestSliceData[binsProp][index][yProp]) :
-                (timeProp === "step" ? "step " : "") +
-                fy(timeAccessor(closestSliceData)));
-        }
-      }
-    });
-    </script>
-
-  </dom-module>
diff --git a/tensorflow/tensorboard/components/vz_line_chart/BUILD b/tensorflow/tensorboard/components/vz_line_chart/BUILD
deleted file mode 100644
index 8bbf8a24d34fa79efce0c80ebe4cffa37c05ba41..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_line_chart/BUILD
+++ /dev/null
@@ -1,53 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "vz_line_chart",
-    srcs = [
-        "dragZoomInteraction.ts",
-        "vz-chart-helpers.ts",
-        "vz-line-chart.html",
-        "vz-line-chart.ts",
-    ],
-    path = "/vz-line-chart",
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "//tensorflow/tensorboard/components/tf_imports:plottable",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-    ],
-)
-
-ts_web_library(
-    name = "demo",
-    srcs = ["index.html"],
-    path = "/vz-line-chart",
-    deps = [
-        ":vz_line_chart",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [":vz_line_chart"],
-    visibility = ["//learning/vis/vz_elements/catalog:__pkg__"],
-    destdir = "vz-line-chart",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports_google:lib",
-        "//tensorflow/tensorboard/components/vz_sorting:legacy",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/vz_line_chart/dragZoomInteraction.ts b/tensorflow/tensorboard/components/vz_line_chart/dragZoomInteraction.ts
deleted file mode 100644
index c7f1f30e76bf48d6000d8fc318a2764760a52db7..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_line_chart/dragZoomInteraction.ts
+++ /dev/null
@@ -1,200 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-export class DragZoomLayer extends Plottable.Components.SelectionBoxLayer {
-  private _dragInteraction: Plottable.Interactions.Drag;
-  private _doubleClickInteraction: Plottable.Interactions.Click;
-  private isZoomed = false;
-  private easeFn: (t: number) => number = d3.easeCubicInOut;
-  private _animationTime = 750;
-  private onStart: Function;
-  private onEnd: Function;
-  private unzoomMethod: Function;
-
-  /**
-   * Constructs a SelectionBoxLayer with an attached DragInteraction and
-   * ClickInteraction. On drag, it triggers an animated zoom into the box
-   * that was dragged. On double click, it zooms back out to the original
-   * view, before any zooming.
-   * The zoom animation uses an easing function (default
-   * d3.ease('cubic-in-out')) and is customizable.
-   * Usage: Construct the selection box layer and attach x and y scales,
-   * and then add the layer over the plot you are zooming on using a
-   * Component Group.
-   * TODO(danmane) - merge this into Plottable
-   */
-  constructor(
-      xScale: Plottable.QuantitativeScale<number|{valueOf(): number}>,
-      yScale: Plottable.QuantitativeScale<number|{valueOf(): number}>,
-      unzoomMethod: Function) {
-    super();
-    this.xScale(xScale);
-    this.yScale(yScale);
-    this._dragInteraction = new Plottable.Interactions.Drag();
-    this._dragInteraction.attachTo(this);
-    this._doubleClickInteraction = new Plottable.Interactions.Click();
-    this._doubleClickInteraction.attachTo(this);
-    this.setupCallbacks();
-    this.unzoomMethod = unzoomMethod;
-  }
-
-  /**
-   * Register a method that calls when the DragZoom interaction starts.
-   */
-  public interactionStart(cb: Function) {
-    this.onStart = cb;
-  }
-
-  /**
-   * Register a method that calls when the DragZoom interaction ends.
-   */
-  public interactionEnd(cb: Function) {
-    this.onEnd = cb;
-  }
-
-  private setupCallbacks() {
-    let dragging = false;
-    this._dragInteraction.onDragStart((startPoint: Plottable.Point) => {
-      this.bounds({
-        topLeft: startPoint,
-        bottomRight: startPoint,
-      });
-      this.onStart();
-    });
-    this._dragInteraction.onDrag((startPoint, endPoint) => {
-      this.bounds({topLeft: startPoint, bottomRight: endPoint});
-      this.boxVisible(true);
-      dragging = true;
-    });
-    this._dragInteraction.onDragEnd((startPoint, endPoint) => {
-      this.boxVisible(false);
-      this.bounds({topLeft: startPoint, bottomRight: endPoint});
-      if (dragging) {
-        this.zoom();
-      } else {
-        this.onEnd();
-      }
-      dragging = false;
-    });
-
-    this._doubleClickInteraction.onDoubleClick(this.unzoom.bind(this));
-  }
-
-  /* Set the time (in ms) over which the zoom will interpolate.
-   * 0 implies no interpolation. (ie zoom is instant)
-   */
-  public animationTime(): number;
-  public animationTime(animationTime: number): DragZoomLayer;
-  public animationTime(animationTime?: number): any {
-    if (animationTime == null) {
-      return this._animationTime;
-    }
-    if (animationTime < 0) {
-      throw new Error('animationTime cannot be negative');
-    }
-    this._animationTime = animationTime;
-    return this;
-  }
-
-  /**
-   * Set the easing function, which determines how the zoom interpolates
-   * over time.
-   */
-  public ease(fn: (t: number) => number): DragZoomLayer {
-    if (typeof(fn) !== 'function') {
-      throw new Error('ease function must be a function');
-    }
-    if (fn(0) !== 0 || fn(1) !== 1) {
-      Plottable.Utils.Window.warn(
-          'Easing function does not maintain invariant ' +
-          'f(0)==0 && f(1)==1. Bad behavior may result.');
-    }
-    this.easeFn = fn;
-    return this;
-  }
-
-  // Zoom into extent of the selection box bounds
-  private zoom() {
-    let x0: number = this.xExtent()[0].valueOf();
-    let x1: number = this.xExtent()[1].valueOf();
-    let y0: number = this.yExtent()[1].valueOf();
-    let y1: number = this.yExtent()[0].valueOf();
-
-    if (x0 === x1 || y0 === y1) {
-      return;
-    }
-
-    if (!this.isZoomed) {
-      this.isZoomed = true;
-    }
-    this.interpolateZoom(x0, x1, y0, y1);
-  }
-
-  // Restore the scales to their state before any zoom
-  private unzoom() {
-    if (!this.isZoomed) {
-      return;
-    }
-    this.isZoomed = false;
-    let xScale = this.xScale() as any;
-    xScale._domainMin = null;
-    xScale._domainMax = null;
-    let xDomain = xScale._getExtent();
-    this.xScale().domain(xDomain);
-    this.unzoomMethod();
-  }
-
-  // If we are zooming, disable interactions, to avoid contention
-  private isZooming(isZooming: boolean) {
-    this._dragInteraction.enabled(!isZooming);
-    this._doubleClickInteraction.enabled(!isZooming);
-  }
-
-  private interpolateZoom(x0f: number, x1f: number, y0f: number, y1f: number) {
-    let x0s: number = this.xScale().domain()[0].valueOf();
-    let x1s: number = this.xScale().domain()[1].valueOf();
-    let y0s: number = this.yScale().domain()[0].valueOf();
-    let y1s: number = this.yScale().domain()[1].valueOf();
-
-    // Copy a ref to the ease fn, so that changing ease wont affect zooms in
-    // progress.
-    let ease = this.easeFn;
-    let interpolator = (a: number, b: number, p: number) =>
-        d3.interpolateNumber(a, b)(ease(p));
-
-    this.isZooming(true);
-    let start = Date.now();
-    let draw = () => {
-      let now = Date.now();
-      let passed = now - start;
-      let p = this._animationTime === 0 ?
-          1 :
-          Math.min(1, passed / this._animationTime);
-      let x0 = interpolator(x0s, x0f, p);
-      let x1 = interpolator(x1s, x1f, p);
-      let y0 = interpolator(y0s, y0f, p);
-      let y1 = interpolator(y1s, y1f, p);
-      this.xScale().domain([x0, x1]);
-      this.yScale().domain([y0, y1]);
-      if (p < 1) {
-        Plottable.Utils.DOM.requestAnimationFramePolyfill(draw);
-      } else {
-        this.onEnd();
-        this.isZooming(false);
-      }
-    };
-    draw();
-  }
-}
diff --git a/tensorflow/tensorboard/components/vz_line_chart/index.html b/tensorflow/tensorboard/components/vz_line_chart/index.html
deleted file mode 100644
index 856ab7d1efead922c3f2ff64dcaedd5de4ea6722..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_line_chart/index.html
+++ /dev/null
@@ -1,72 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-  <head>
-    <meta charset="utf-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>vz-line-chart demo</title>
-    <link rel="import" href="vz-line-chart.html">
-    <link rel="import" href="../iron-demo-helpers/demo-snippet.html">
-    <link rel="import" href="../paper-styles/typography.html">
-    <style type="text/css">
-      body {
-        font-family: "Roboto";
-      }
-
-      vz-line-chart {
-        height: 400px;
-      }
-    </style>
-  </head>
-  <body>
-    <h3>Simple line chart</h3>
-    <demo-snippet>
-      <template>
-        <vz-line-chart id="simpleline"></vz-line-chart>
-        <script>
-          var elem = document.querySelector('#simpleline');
-          elem.setVisibleSeries(['test', 'train']);
-          elem.setSeriesData('test', data.test);
-          elem.setSeriesData('train', data.train);
-        </script>
-      </template>
-    </demo-snippet>
-
-    <h3>Exponential Smoothing enabled</h3>
-    <demo-snippet>
-      <template>
-        <vz-line-chart id="smoothedline"></vz-line-chart>
-        <script>
-          var elem = document.querySelector('#smoothedline');
-          elem.smoothingEnabled = true;
-          elem.setVisibleSeries(['test', 'train']);
-          elem.setSeriesData('test', data.test);
-          elem.setSeriesData('train', data.train);
-        </script>
-      </template>
-    </demo-snippet>
-
-    <script>
-      var data = {
-        "test": [{"scalar":0.07039999961853027,"wall_time":new Date("2016-06-24T04:13:01.295Z"),"step":0},{"scalar":0.6891000270843506,"wall_time":new Date("2016-06-24T04:13:05.909Z"),"step":10},{"scalar":0.8208000063896179,"wall_time":new Date("2016-06-24T04:13:10.318Z"),"step":20},{"scalar":0.8554999828338623,"wall_time":new Date("2016-06-24T04:13:14.794Z"),"step":30},{"scalar":0.8776000142097473,"wall_time":new Date("2016-06-24T04:13:19.166Z"),"step":40},{"scalar":0.8848999738693237,"wall_time":new Date("2016-06-24T04:13:23.603Z"),"step":50},{"scalar":0.8906000256538391,"wall_time":new Date("2016-06-24T04:13:27.931Z"),"step":60},{"scalar":0.8853999972343445,"wall_time":new Date("2016-06-24T04:13:32.281Z"),"step":70},{"scalar":0.8848999738693237,"wall_time":new Date("2016-06-24T04:13:36.636Z"),"step":80},{"scalar":0.8985999822616577,"wall_time":new Date("2016-06-24T04:13:41.070Z"),"step":90},{"scalar":0.9057000279426575,"wall_time":new Date("2016-06-24T04:13:45.475Z"),"step":100},{"scalar":0.9136000275611877,"wall_time":new Date("2016-06-24T04:13:49.947Z"),"step":110},{"scalar":0.919700026512146,"wall_time":new Date("2016-06-24T04:13:54.384Z"),"step":120},{"scalar":0.9182000160217285,"wall_time":new Date("2016-06-24T04:13:58.790Z"),"step":130},{"scalar":0.9282000064849854,"wall_time":new Date("2016-06-24T04:14:03.180Z"),"step":140},{"scalar":0.9218999743461609,"wall_time":new Date("2016-06-24T04:14:08.060Z"),"step":150},{"scalar":0.9294000267982483,"wall_time":new Date("2016-06-24T04:14:12.520Z"),"step":160},{"scalar":0.9261000156402588,"wall_time":new Date("2016-06-24T04:14:16.893Z"),"step":170},{"scalar":0.9236999750137329,"wall_time":new Date("2016-06-24T04:14:21.653Z"),"step":180},{"scalar":0.925000011920929,"wall_time":new Date("2016-06-24T04:14:26.065Z"),"step":190},{"scalar":0.9319000244140625,"wall_time":new Date("2016-06-24T04:14:30.430Z"),"step":200},{"scalar":0.933899998664856,"wall_time":new Date("2016-06-24T04:14:34.815Z"),"step":210},{"scalar":0.9347000122070312,"wall_time":new Date("2016-06-24T04:14:39.179Z"),"step":220},{"scalar":0.9341999888420105,"wall_time":new Date("2016-06-24T04:14:43.562Z"),"step":230},{"scalar":0.933899998664856,"wall_time":new Date("2016-06-24T04:14:47.953Z"),"step":240},{"scalar":0.9254000186920166,"wall_time":new Date("2016-06-24T04:14:52.322Z"),"step":250},{"scalar":0.9383000135421753,"wall_time":new Date("2016-06-24T04:14:56.703Z"),"step":260},{"scalar":0.9391999840736389,"wall_time":new Date("2016-06-24T04:15:01.046Z"),"step":270},{"scalar":0.9336000084877014,"wall_time":new Date("2016-06-24T04:15:05.458Z"),"step":280},{"scalar":0.9404000043869019,"wall_time":new Date("2016-06-24T04:15:10.438Z"),"step":290},{"scalar":0.944100022315979,"wall_time":new Date("2016-06-24T04:15:15.026Z"),"step":300},{"scalar":0.9401000142097473,"wall_time":new Date("2016-06-24T04:15:19.417Z"),"step":310},{"scalar":0.9394999742507935,"wall_time":new Date("2016-06-24T04:15:23.985Z"),"step":320},{"scalar":0.9438999891281128,"wall_time":new Date("2016-06-24T04:15:28.418Z"),"step":330},{"scalar":0.9478999972343445,"wall_time":new Date("2016-06-24T04:15:32.844Z"),"step":340},{"scalar":0.9470999836921692,"wall_time":new Date("2016-06-24T04:15:37.359Z"),"step":350},{"scalar":0.9423999786376953,"wall_time":new Date("2016-06-24T04:15:41.803Z"),"step":360},{"scalar":0.9472000002861023,"wall_time":new Date("2016-06-24T04:15:46.167Z"),"step":370},{"scalar":0.9491999745368958,"wall_time":new Date("2016-06-24T04:15:50.558Z"),"step":380},{"scalar":0.9458000063896179,"wall_time":new Date("2016-06-24T04:15:54.942Z"),"step":390},{"scalar":0.9442999958992004,"wall_time":new Date("2016-06-24T04:15:59.343Z"),"step":400},{"scalar":0.946399986743927,"wall_time":new Date("2016-06-24T04:16:03.703Z"),"step":410},{"scalar":0.947700023651123,"wall_time":new Date("2016-06-24T04:16:08.102Z"),"step":420},{"scalar":0.9451000094413757,"wall_time":new Date("2016-06-24T04:16:13.379Z"),"step":430},{"scalar":0.9532999992370605,"wall_time":new Date("2016-06-24T04:16:17.962Z"),"step":440},{"scalar":0.9496999979019165,"wall_time":new Date("2016-06-24T04:16:22.320Z"),"step":450},{"scalar":0.9513000249862671,"wall_time":new Date("2016-06-24T04:16:26.712Z"),"step":460},{"scalar":0.9488999843597412,"wall_time":new Date("2016-06-24T04:16:31.099Z"),"step":470},{"scalar":0.9520000219345093,"wall_time":new Date("2016-06-24T04:16:35.760Z"),"step":480},{"scalar":0.9516000151634216,"wall_time":new Date("2016-06-24T04:16:40.239Z"),"step":490},{"scalar":0.9537000060081482,"wall_time":new Date("2016-06-24T04:16:44.620Z"),"step":500},{"scalar":0.9528999924659729,"wall_time":new Date("2016-06-24T04:16:49.273Z"),"step":510},{"scalar":0.9502999782562256,"wall_time":new Date("2016-06-24T04:16:53.640Z"),"step":520},{"scalar":0.9573000073432922,"wall_time":new Date("2016-06-24T04:16:58.612Z"),"step":530},{"scalar":0.9550999999046326,"wall_time":new Date("2016-06-24T04:17:03.089Z"),"step":540},{"scalar":0.9563000202178955,"wall_time":new Date("2016-06-24T04:17:07.481Z"),"step":550},{"scalar":0.9563000202178955,"wall_time":new Date("2016-06-24T04:17:11.866Z"),"step":560},{"scalar":0.9599000215530396,"wall_time":new Date("2016-06-24T04:17:16.456Z"),"step":570},{"scalar":0.9588000178337097,"wall_time":new Date("2016-06-24T04:17:20.983Z"),"step":580},{"scalar":0.9569000005722046,"wall_time":new Date("2016-06-24T04:17:25.996Z"),"step":590},{"scalar":0.9585999846458435,"wall_time":new Date("2016-06-24T04:17:30.417Z"),"step":600},{"scalar":0.9555000066757202,"wall_time":new Date("2016-06-24T04:17:35.164Z"),"step":610},{"scalar":0.9567999839782715,"wall_time":new Date("2016-06-24T04:17:39.714Z"),"step":620},{"scalar":0.9616000056266785,"wall_time":new Date("2016-06-24T04:17:44.105Z"),"step":630},{"scalar":0.9603999853134155,"wall_time":new Date("2016-06-24T04:17:48.826Z"),"step":640},{"scalar":0.9605000019073486,"wall_time":new Date("2016-06-24T04:17:53.419Z"),"step":650},{"scalar":0.9627000093460083,"wall_time":new Date("2016-06-24T04:17:58.026Z"),"step":660},{"scalar":0.9639999866485596,"wall_time":new Date("2016-06-24T04:18:02.698Z"),"step":670},{"scalar":0.9613999724388123,"wall_time":new Date("2016-06-24T04:18:07.960Z"),"step":680},{"scalar":0.9599000215530396,"wall_time":new Date("2016-06-24T04:18:12.458Z"),"step":690},{"scalar":0.9617999792098999,"wall_time":new Date("2016-06-24T04:18:16.835Z"),"step":700},{"scalar":0.9635000228881836,"wall_time":new Date("2016-06-24T04:18:21.232Z"),"step":710},{"scalar":0.9641000032424927,"wall_time":new Date("2016-06-24T04:18:25.888Z"),"step":720},{"scalar":0.9628000259399414,"wall_time":new Date("2016-06-24T04:18:30.372Z"),"step":730},{"scalar":0.9656000137329102,"wall_time":new Date("2016-06-24T04:18:34.751Z"),"step":740},{"scalar":0.9642000198364258,"wall_time":new Date("2016-06-24T04:18:39.368Z"),"step":750},{"scalar":0.9646000266075134,"wall_time":new Date("2016-06-24T04:18:44.267Z"),"step":760},{"scalar":0.9617000222206116,"wall_time":new Date("2016-06-24T04:18:48.829Z"),"step":770},{"scalar":0.9657999873161316,"wall_time":new Date("2016-06-24T04:18:53.466Z"),"step":780},{"scalar":0.9667999744415283,"wall_time":new Date("2016-06-24T04:18:57.893Z"),"step":790},{"scalar":0.967199981212616,"wall_time":new Date("2016-06-24T04:19:02.601Z"),"step":800},{"scalar":0.9646999835968018,"wall_time":new Date("2016-06-24T04:19:07.657Z"),"step":810},{"scalar":0.9670000076293945,"wall_time":new Date("2016-06-24T04:19:12.331Z"),"step":820},{"scalar":0.96670001745224,"wall_time":new Date("2016-06-24T04:19:17.223Z"),"step":830},{"scalar":0.9668999910354614,"wall_time":new Date("2016-06-24T04:19:21.980Z"),"step":840},{"scalar":0.965399980545044,"wall_time":new Date("2016-06-24T04:19:26.352Z"),"step":850},{"scalar":0.9671000242233276,"wall_time":new Date("2016-06-24T04:19:30.764Z"),"step":860},{"scalar":0.9671000242233276,"wall_time":new Date("2016-06-24T04:19:35.244Z"),"step":870},{"scalar":0.9642000198364258,"wall_time":new Date("2016-06-24T04:19:39.620Z"),"step":880},{"scalar":0.9666000008583069,"wall_time":new Date("2016-06-24T04:19:43.979Z"),"step":890},{"scalar":0.9664999842643738,"wall_time":new Date("2016-06-24T04:19:48.337Z"),"step":900},{"scalar":0.9678000211715698,"wall_time":new Date("2016-06-24T04:19:52.688Z"),"step":910},{"scalar":0.9678999781608582,"wall_time":new Date("2016-06-24T04:19:57.058Z"),"step":920},{"scalar":0.9674999713897705,"wall_time":new Date("2016-06-24T04:20:01.415Z"),"step":930},{"scalar":0.9684000015258789,"wall_time":new Date("2016-06-24T04:20:05.887Z"),"step":940},{"scalar":0.9672999978065491,"wall_time":new Date("2016-06-24T04:20:10.261Z"),"step":950},{"scalar":0.9696000218391418,"wall_time":new Date("2016-06-24T04:20:14.610Z"),"step":960},{"scalar":0.9706000089645386,"wall_time":new Date("2016-06-24T04:20:19.526Z"),"step":970},{"scalar":0.9688000082969666,"wall_time":new Date("2016-06-24T04:20:23.881Z"),"step":980},{"scalar":0.9699000120162964,"wall_time":new Date("2016-06-24T04:20:28.415Z"),"step":990,"name":"test","relative":0.1242}],
-        "train": [{"scalar":0.05999999865889549,"wall_time":new Date("2016-06-24T04:13:01.556Z"),"step":1},{"scalar":0.18000000715255737,"wall_time":new Date("2016-06-24T04:13:01.693Z"),"step":2},{"scalar":0.25,"wall_time":new Date("2016-06-24T04:13:01.833Z"),"step":3},{"scalar":0.28999999165534973,"wall_time":new Date("2016-06-24T04:13:01.964Z"),"step":4},{"scalar":0.3400000035762787,"wall_time":new Date("2016-06-24T04:13:02.109Z"),"step":5},{"scalar":0.5099999904632568,"wall_time":new Date("2016-06-24T04:13:02.249Z"),"step":6},{"scalar":0.550000011920929,"wall_time":new Date("2016-06-24T04:13:02.387Z"),"step":7},{"scalar":0.5600000023841858,"wall_time":new Date("2016-06-24T04:13:02.515Z"),"step":8},{"scalar":0.6700000166893005,"wall_time":new Date("2016-06-24T04:13:02.650Z"),"step":9},{"scalar":0.7599999904632568,"wall_time":new Date("2016-06-24T04:13:06.061Z"),"step":11},{"scalar":0.800000011920929,"wall_time":new Date("2016-06-24T04:13:06.202Z"),"step":12},{"scalar":0.7799999713897705,"wall_time":new Date("2016-06-24T04:13:06.343Z"),"step":13},{"scalar":0.7599999904632568,"wall_time":new Date("2016-06-24T04:13:06.480Z"),"step":14},{"scalar":0.75,"wall_time":new Date("2016-06-24T04:13:06.618Z"),"step":15},{"scalar":0.7900000214576721,"wall_time":new Date("2016-06-24T04:13:06.763Z"),"step":16},{"scalar":0.8500000238418579,"wall_time":new Date("2016-06-24T04:13:06.897Z"),"step":17},{"scalar":0.7900000214576721,"wall_time":new Date("2016-06-24T04:13:07.027Z"),"step":18},{"scalar":0.7799999713897705,"wall_time":new Date("2016-06-24T04:13:07.167Z"),"step":19},{"scalar":0.7599999904632568,"wall_time":new Date("2016-06-24T04:13:10.461Z"),"step":21},{"scalar":0.800000011920929,"wall_time":new Date("2016-06-24T04:13:10.602Z"),"step":22},{"scalar":0.7900000214576721,"wall_time":new Date("2016-06-24T04:13:10.748Z"),"step":23},{"scalar":0.8299999833106995,"wall_time":new Date("2016-06-24T04:13:10.894Z"),"step":24},{"scalar":0.6899999976158142,"wall_time":new Date("2016-06-24T04:13:11.034Z"),"step":25},{"scalar":0.8600000143051147,"wall_time":new Date("2016-06-24T04:13:11.173Z"),"step":26},{"scalar":0.8100000023841858,"wall_time":new Date("2016-06-24T04:13:11.315Z"),"step":27},{"scalar":0.800000011920929,"wall_time":new Date("2016-06-24T04:13:11.455Z"),"step":28},{"scalar":0.8399999737739563,"wall_time":new Date("2016-06-24T04:13:11.590Z"),"step":29},{"scalar":0.7900000214576721,"wall_time":new Date("2016-06-24T04:13:14.931Z"),"step":31},{"scalar":0.8199999928474426,"wall_time":new Date("2016-06-24T04:13:15.069Z"),"step":32},{"scalar":0.8500000238418579,"wall_time":new Date("2016-06-24T04:13:15.199Z"),"step":33},{"scalar":0.8799999952316284,"wall_time":new Date("2016-06-24T04:13:15.329Z"),"step":34},{"scalar":0.8199999928474426,"wall_time":new Date("2016-06-24T04:13:15.466Z"),"step":35},{"scalar":0.7699999809265137,"wall_time":new Date("2016-06-24T04:13:15.610Z"),"step":36},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:13:15.746Z"),"step":37},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:13:15.876Z"),"step":38},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:13:16.010Z"),"step":39},{"scalar":0.800000011920929,"wall_time":new Date("2016-06-24T04:13:19.296Z"),"step":41},{"scalar":0.800000011920929,"wall_time":new Date("2016-06-24T04:13:19.435Z"),"step":42},{"scalar":0.7799999713897705,"wall_time":new Date("2016-06-24T04:13:19.567Z"),"step":43},{"scalar":0.8500000238418579,"wall_time":new Date("2016-06-24T04:13:19.696Z"),"step":44},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:13:19.833Z"),"step":45},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:13:19.972Z"),"step":46},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:13:20.111Z"),"step":47},{"scalar":0.8600000143051147,"wall_time":new Date("2016-06-24T04:13:20.239Z"),"step":48},{"scalar":0.8600000143051147,"wall_time":new Date("2016-06-24T04:13:20.370Z"),"step":49},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:13:23.731Z"),"step":51},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:13:23.869Z"),"step":52},{"scalar":0.8600000143051147,"wall_time":new Date("2016-06-24T04:13:24.000Z"),"step":53},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:13:24.129Z"),"step":54},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:13:24.256Z"),"step":55},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:13:24.391Z"),"step":56},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:13:24.529Z"),"step":57},{"scalar":0.7900000214576721,"wall_time":new Date("2016-06-24T04:13:24.663Z"),"step":58},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:13:24.792Z"),"step":59},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:13:28.072Z"),"step":61},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:13:28.212Z"),"step":62},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:13:28.340Z"),"step":63},{"scalar":0.8600000143051147,"wall_time":new Date("2016-06-24T04:13:28.477Z"),"step":64},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:13:28.610Z"),"step":65},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:13:28.751Z"),"step":66},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:13:28.875Z"),"step":67},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:13:29.003Z"),"step":68},{"scalar":0.8600000143051147,"wall_time":new Date("2016-06-24T04:13:29.137Z"),"step":69},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:13:32.418Z"),"step":71},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:13:32.552Z"),"step":72},{"scalar":0.8399999737739563,"wall_time":new Date("2016-06-24T04:13:32.685Z"),"step":73},{"scalar":0.8100000023841858,"wall_time":new Date("2016-06-24T04:13:32.817Z"),"step":74},{"scalar":0.8500000238418579,"wall_time":new Date("2016-06-24T04:13:32.942Z"),"step":75},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:13:33.077Z"),"step":76},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:13:33.213Z"),"step":77},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:13:33.347Z"),"step":78},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:13:33.489Z"),"step":79},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:13:36.766Z"),"step":81},{"scalar":0.8399999737739563,"wall_time":new Date("2016-06-24T04:13:36.902Z"),"step":82},{"scalar":0.8199999928474426,"wall_time":new Date("2016-06-24T04:13:37.042Z"),"step":83},{"scalar":0.8500000238418579,"wall_time":new Date("2016-06-24T04:13:37.187Z"),"step":84},{"scalar":0.8600000143051147,"wall_time":new Date("2016-06-24T04:13:37.319Z"),"step":85},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:13:37.460Z"),"step":86},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:13:37.595Z"),"step":87},{"scalar":0.8399999737739563,"wall_time":new Date("2016-06-24T04:13:37.730Z"),"step":88},{"scalar":0.7400000095367432,"wall_time":new Date("2016-06-24T04:13:37.866Z"),"step":89},{"scalar":0.8500000238418579,"wall_time":new Date("2016-06-24T04:13:41.201Z"),"step":91},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:13:41.334Z"),"step":92},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:13:41.480Z"),"step":93},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:13:41.618Z"),"step":94},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:13:41.755Z"),"step":95},{"scalar":0.8399999737739563,"wall_time":new Date("2016-06-24T04:13:41.885Z"),"step":96},{"scalar":0.8500000238418579,"wall_time":new Date("2016-06-24T04:13:42.022Z"),"step":97},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:13:42.145Z"),"step":98},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:13:42.271Z"),"step":99},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:13:45.616Z"),"step":101},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:13:45.751Z"),"step":102},{"scalar":0.8600000143051147,"wall_time":new Date("2016-06-24T04:13:45.882Z"),"step":103},{"scalar":0.8299999833106995,"wall_time":new Date("2016-06-24T04:13:46.031Z"),"step":104},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:13:46.169Z"),"step":105},{"scalar":0.8600000143051147,"wall_time":new Date("2016-06-24T04:13:46.302Z"),"step":106},{"scalar":0.8299999833106995,"wall_time":new Date("2016-06-24T04:13:46.443Z"),"step":107},{"scalar":0.7799999713897705,"wall_time":new Date("2016-06-24T04:13:46.578Z"),"step":108},{"scalar":0.8600000143051147,"wall_time":new Date("2016-06-24T04:13:46.720Z"),"step":109},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:13:50.080Z"),"step":111},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:13:50.219Z"),"step":112},{"scalar":0.8600000143051147,"wall_time":new Date("2016-06-24T04:13:50.351Z"),"step":113},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:13:50.495Z"),"step":114},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:13:50.622Z"),"step":115},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:13:50.761Z"),"step":116},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:13:50.894Z"),"step":117},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:13:51.027Z"),"step":118},{"scalar":0.8299999833106995,"wall_time":new Date("2016-06-24T04:13:51.167Z"),"step":119},{"scalar":0.8500000238418579,"wall_time":new Date("2016-06-24T04:13:54.523Z"),"step":121},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:13:54.649Z"),"step":122},{"scalar":0.8600000143051147,"wall_time":new Date("2016-06-24T04:13:54.813Z"),"step":123},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:13:54.958Z"),"step":124},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:13:55.097Z"),"step":125},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:13:55.232Z"),"step":126},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:13:55.363Z"),"step":127},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:13:55.491Z"),"step":128},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:13:55.619Z"),"step":129},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:13:58.923Z"),"step":131},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:13:59.051Z"),"step":132},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:13:59.183Z"),"step":133},{"scalar":0.8799999952316284,"wall_time":new Date("2016-06-24T04:13:59.316Z"),"step":134},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:13:59.443Z"),"step":135},{"scalar":0.8799999952316284,"wall_time":new Date("2016-06-24T04:13:59.583Z"),"step":136},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:13:59.712Z"),"step":137},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:13:59.840Z"),"step":138},{"scalar":0.8600000143051147,"wall_time":new Date("2016-06-24T04:13:59.978Z"),"step":139},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:14:03.306Z"),"step":141},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:03.443Z"),"step":142},{"scalar":0.8799999952316284,"wall_time":new Date("2016-06-24T04:14:03.572Z"),"step":143},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:14:03.700Z"),"step":144},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:03.834Z"),"step":145},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:14:03.964Z"),"step":146},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:14:04.104Z"),"step":147},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:14:04.265Z"),"step":148},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:14:04.470Z"),"step":149},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:14:08.189Z"),"step":151},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:14:08.328Z"),"step":152},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:14:08.462Z"),"step":153},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:14:08.594Z"),"step":154},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:14:08.725Z"),"step":155},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:14:08.859Z"),"step":156},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:14:08.998Z"),"step":157},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:14:09.132Z"),"step":158},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:14:09.268Z"),"step":159},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:12.667Z"),"step":161},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:14:12.803Z"),"step":162},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:14:12.935Z"),"step":163},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:14:13.068Z"),"step":164},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:14:13.199Z"),"step":165},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:14:13.333Z"),"step":166},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:14:13.462Z"),"step":167},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:13.591Z"),"step":168},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:14:13.719Z"),"step":169},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:14:17.038Z"),"step":171},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:14:17.166Z"),"step":172},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:14:17.302Z"),"step":173},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:14:17.430Z"),"step":174},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:14:17.565Z"),"step":175},{"scalar":0.8600000143051147,"wall_time":new Date("2016-06-24T04:14:17.700Z"),"step":176},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:14:17.844Z"),"step":177},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:14:18.013Z"),"step":178},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:14:18.164Z"),"step":179},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:14:21.786Z"),"step":181},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:14:21.915Z"),"step":182},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:14:22.054Z"),"step":183},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:22.187Z"),"step":184},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:14:22.317Z"),"step":185},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:14:22.442Z"),"step":186},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:14:22.583Z"),"step":187},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:14:22.718Z"),"step":188},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:14:22.849Z"),"step":189},{"scalar":0.8600000143051147,"wall_time":new Date("2016-06-24T04:14:26.199Z"),"step":191},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:26.327Z"),"step":192},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:14:26.462Z"),"step":193},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:14:26.591Z"),"step":194},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:14:26.721Z"),"step":195},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:14:26.857Z"),"step":196},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:26.989Z"),"step":197},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:27.118Z"),"step":198},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:14:27.251Z"),"step":199},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:14:30.566Z"),"step":201},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:14:30.696Z"),"step":202},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:14:30.831Z"),"step":203},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:14:30.965Z"),"step":204},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:31.105Z"),"step":205},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:14:31.234Z"),"step":206},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:14:31.369Z"),"step":207},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:14:31.498Z"),"step":208},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:14:31.646Z"),"step":209},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:14:34.956Z"),"step":211},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:14:35.097Z"),"step":212},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:14:35.225Z"),"step":213},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:14:35.361Z"),"step":214},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:14:35.486Z"),"step":215},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:14:35.618Z"),"step":216},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:14:35.747Z"),"step":217},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:14:35.879Z"),"step":218},{"scalar":0.8799999952316284,"wall_time":new Date("2016-06-24T04:14:36.019Z"),"step":219},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:14:39.310Z"),"step":221},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:14:39.439Z"),"step":222},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:14:39.580Z"),"step":223},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:14:39.713Z"),"step":224},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:14:39.854Z"),"step":225},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:14:39.988Z"),"step":226},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:14:40.117Z"),"step":227},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:14:40.244Z"),"step":228},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:14:40.376Z"),"step":229},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:14:43.701Z"),"step":231},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:14:43.852Z"),"step":232},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:43.988Z"),"step":233},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:14:44.127Z"),"step":234},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:14:44.261Z"),"step":235},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:14:44.394Z"),"step":236},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:14:44.535Z"),"step":237},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:14:44.663Z"),"step":238},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:44.803Z"),"step":239},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:14:48.092Z"),"step":241},{"scalar":0.8500000238418579,"wall_time":new Date("2016-06-24T04:14:48.234Z"),"step":242},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:48.374Z"),"step":243},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:14:48.503Z"),"step":244},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:14:48.635Z"),"step":245},{"scalar":0.8600000143051147,"wall_time":new Date("2016-06-24T04:14:48.768Z"),"step":246},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:48.895Z"),"step":247},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:14:49.037Z"),"step":248},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:49.169Z"),"step":249},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:14:52.450Z"),"step":251},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:14:52.582Z"),"step":252},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:52.712Z"),"step":253},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:14:52.850Z"),"step":254},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:14:52.982Z"),"step":255},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:14:53.112Z"),"step":256},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:14:53.244Z"),"step":257},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:53.377Z"),"step":258},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:14:53.501Z"),"step":259},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:14:56.842Z"),"step":261},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:14:56.973Z"),"step":262},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:14:57.101Z"),"step":263},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:14:57.229Z"),"step":264},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:14:57.360Z"),"step":265},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:57.491Z"),"step":266},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:57.629Z"),"step":267},{"scalar":0.8799999952316284,"wall_time":new Date("2016-06-24T04:14:57.770Z"),"step":268},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:14:57.898Z"),"step":269},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:15:01.199Z"),"step":271},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:01.330Z"),"step":272},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:15:01.455Z"),"step":273},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:15:01.579Z"),"step":274},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:15:01.716Z"),"step":275},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:15:01.838Z"),"step":276},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:15:01.966Z"),"step":277},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:15:02.103Z"),"step":278},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:02.233Z"),"step":279},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:15:05.608Z"),"step":281},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:15:05.733Z"),"step":282},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:15:05.870Z"),"step":283},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:15:05.999Z"),"step":284},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:15:06.136Z"),"step":285},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:15:06.266Z"),"step":286},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:15:06.397Z"),"step":287},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:15:06.525Z"),"step":288},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:06.656Z"),"step":289},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:15:10.573Z"),"step":291},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:15:10.702Z"),"step":292},{"scalar":0.8500000238418579,"wall_time":new Date("2016-06-24T04:15:10.837Z"),"step":293},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:15:10.968Z"),"step":294},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:15:11.109Z"),"step":295},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:15:11.242Z"),"step":296},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:15:11.376Z"),"step":297},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:15:11.505Z"),"step":298},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:15:11.645Z"),"step":299},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:15:15.160Z"),"step":301},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:15:15.308Z"),"step":302},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:15:15.440Z"),"step":303},{"scalar":0.8500000238418579,"wall_time":new Date("2016-06-24T04:15:15.565Z"),"step":304},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:15:15.696Z"),"step":305},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:15:15.826Z"),"step":306},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:15:15.961Z"),"step":307},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:15:16.097Z"),"step":308},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:16.226Z"),"step":309},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:19.556Z"),"step":311},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:15:19.686Z"),"step":312},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:15:19.826Z"),"step":313},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:15:19.954Z"),"step":314},{"scalar":0.8799999952316284,"wall_time":new Date("2016-06-24T04:15:20.093Z"),"step":315},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:15:20.230Z"),"step":316},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:15:20.358Z"),"step":317},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:15:20.490Z"),"step":318},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:15:20.621Z"),"step":319},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:15:24.128Z"),"step":321},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:24.259Z"),"step":322},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:24.399Z"),"step":323},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:15:24.536Z"),"step":324},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:15:24.663Z"),"step":325},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:15:24.792Z"),"step":326},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:15:24.924Z"),"step":327},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:15:25.055Z"),"step":328},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:15:25.192Z"),"step":329},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:15:28.553Z"),"step":331},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:15:28.709Z"),"step":332},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:15:28.848Z"),"step":333},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:15:28.980Z"),"step":334},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:15:29.109Z"),"step":335},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:29.243Z"),"step":336},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:15:29.375Z"),"step":337},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:15:29.515Z"),"step":338},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:29.655Z"),"step":339},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:15:32.974Z"),"step":341},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:33.098Z"),"step":342},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:15:33.234Z"),"step":343},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:15:33.383Z"),"step":344},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:15:33.544Z"),"step":345},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:33.710Z"),"step":346},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:15:33.860Z"),"step":347},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:15:34.008Z"),"step":348},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:15:34.150Z"),"step":349},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:15:37.499Z"),"step":351},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:15:37.638Z"),"step":352},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:15:37.775Z"),"step":353},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:15:37.908Z"),"step":354},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:15:38.042Z"),"step":355},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:15:38.174Z"),"step":356},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:15:38.313Z"),"step":357},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:15:38.444Z"),"step":358},{"scalar":0.8799999952316284,"wall_time":new Date("2016-06-24T04:15:38.581Z"),"step":359},{"scalar":0.8600000143051147,"wall_time":new Date("2016-06-24T04:15:41.935Z"),"step":361},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:15:42.074Z"),"step":362},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:15:42.194Z"),"step":363},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:15:42.325Z"),"step":364},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:15:42.463Z"),"step":365},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:15:42.605Z"),"step":366},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:42.730Z"),"step":367},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:15:42.858Z"),"step":368},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:15:42.998Z"),"step":369},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:15:46.303Z"),"step":371},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:15:46.436Z"),"step":372},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:15:46.568Z"),"step":373},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:15:46.708Z"),"step":374},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:15:46.839Z"),"step":375},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:15:46.971Z"),"step":376},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:47.102Z"),"step":377},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:15:47.236Z"),"step":378},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:15:47.379Z"),"step":379},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:15:50.684Z"),"step":381},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:15:50.815Z"),"step":382},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:15:50.956Z"),"step":383},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:51.090Z"),"step":384},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:15:51.232Z"),"step":385},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:51.365Z"),"step":386},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:15:51.503Z"),"step":387},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:15:51.643Z"),"step":388},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:15:51.777Z"),"step":389},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:55.084Z"),"step":391},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:15:55.220Z"),"step":392},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:15:55.354Z"),"step":393},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:15:55.486Z"),"step":394},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:55.625Z"),"step":395},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:15:55.763Z"),"step":396},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:55.899Z"),"step":397},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:15:56.036Z"),"step":398},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:56.180Z"),"step":399},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:15:59.479Z"),"step":401},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:15:59.604Z"),"step":402},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:15:59.745Z"),"step":403},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:15:59.875Z"),"step":404},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:16:00.005Z"),"step":405},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:16:00.138Z"),"step":406},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:16:00.277Z"),"step":407},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:16:00.402Z"),"step":408},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:16:00.538Z"),"step":409},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:16:03.851Z"),"step":411},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:16:03.985Z"),"step":412},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:16:04.118Z"),"step":413},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:16:04.248Z"),"step":414},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:16:04.376Z"),"step":415},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:16:04.515Z"),"step":416},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:16:04.641Z"),"step":417},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:16:04.770Z"),"step":418},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:16:04.905Z"),"step":419},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:16:08.228Z"),"step":421},{"scalar":0.8799999952316284,"wall_time":new Date("2016-06-24T04:16:08.358Z"),"step":422},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:16:08.500Z"),"step":423},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:16:08.639Z"),"step":424},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:16:08.766Z"),"step":425},{"scalar":1,"wall_time":new Date("2016-06-24T04:16:08.903Z"),"step":426},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:16:09.042Z"),"step":427},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:16:09.172Z"),"step":428},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:16:09.310Z"),"step":429},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:16:13.517Z"),"step":431},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:16:13.682Z"),"step":432},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:16:13.835Z"),"step":433},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:16:13.975Z"),"step":434},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:16:14.131Z"),"step":435},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:16:14.280Z"),"step":436},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:16:14.434Z"),"step":437},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:16:14.584Z"),"step":438},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:16:14.730Z"),"step":439},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:16:18.093Z"),"step":441},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:16:18.230Z"),"step":442},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:16:18.359Z"),"step":443},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:16:18.501Z"),"step":444},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:16:18.632Z"),"step":445},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:16:18.765Z"),"step":446},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:16:18.894Z"),"step":447},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:16:19.032Z"),"step":448},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:16:19.160Z"),"step":449},{"scalar":0.8799999952316284,"wall_time":new Date("2016-06-24T04:16:22.445Z"),"step":451},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:16:22.584Z"),"step":452},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:16:22.711Z"),"step":453},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:16:22.858Z"),"step":454},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:16:23.006Z"),"step":455},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:16:23.138Z"),"step":456},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:16:23.274Z"),"step":457},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:16:23.417Z"),"step":458},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:16:23.543Z"),"step":459},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:16:26.832Z"),"step":461},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:16:26.962Z"),"step":462},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:16:27.093Z"),"step":463},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:16:27.230Z"),"step":464},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:16:27.371Z"),"step":465},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:16:27.495Z"),"step":466},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:16:27.620Z"),"step":467},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:16:27.759Z"),"step":468},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:16:27.885Z"),"step":469},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:16:31.227Z"),"step":471},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:16:31.354Z"),"step":472},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:16:31.484Z"),"step":473},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:16:31.629Z"),"step":474},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:16:31.764Z"),"step":475},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:16:31.898Z"),"step":476},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:16:32.032Z"),"step":477},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:16:32.157Z"),"step":478},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:16:32.288Z"),"step":479},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:16:35.896Z"),"step":481},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:16:36.024Z"),"step":482},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:16:36.163Z"),"step":483},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:16:36.297Z"),"step":484},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:16:36.428Z"),"step":485},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:16:36.561Z"),"step":486},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:16:36.695Z"),"step":487},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:16:36.828Z"),"step":488},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:16:36.959Z"),"step":489},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:16:40.381Z"),"step":491},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:16:40.508Z"),"step":492},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:16:40.647Z"),"step":493},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:16:40.782Z"),"step":494},{"scalar":0.8299999833106995,"wall_time":new Date("2016-06-24T04:16:40.920Z"),"step":495},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:16:41.053Z"),"step":496},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:16:41.186Z"),"step":497},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:16:41.311Z"),"step":498},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:16:41.458Z"),"step":499},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:16:44.767Z"),"step":501},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:16:44.902Z"),"step":502},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:16:45.039Z"),"step":503},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:16:45.170Z"),"step":504},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:16:45.306Z"),"step":505},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:16:45.440Z"),"step":506},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:16:45.584Z"),"step":507},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:16:45.718Z"),"step":508},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:16:45.853Z"),"step":509},{"scalar":1,"wall_time":new Date("2016-06-24T04:16:49.414Z"),"step":511},{"scalar":1,"wall_time":new Date("2016-06-24T04:16:49.556Z"),"step":512},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:16:49.690Z"),"step":513},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:16:49.821Z"),"step":514},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:16:49.950Z"),"step":515},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:16:50.080Z"),"step":516},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:16:50.209Z"),"step":517},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:16:50.344Z"),"step":518},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:16:50.476Z"),"step":519},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:16:53.771Z"),"step":521},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:16:53.903Z"),"step":522},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:16:54.039Z"),"step":523},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:16:54.178Z"),"step":524},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:16:54.303Z"),"step":525},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:16:54.430Z"),"step":526},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:16:54.553Z"),"step":527},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:16:54.687Z"),"step":528},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:16:54.811Z"),"step":529},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:16:58.763Z"),"step":531},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:16:58.904Z"),"step":532},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:16:59.050Z"),"step":533},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:16:59.178Z"),"step":534},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:16:59.314Z"),"step":535},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:16:59.446Z"),"step":536},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:16:59.585Z"),"step":537},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:16:59.712Z"),"step":538},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:16:59.844Z"),"step":539},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:17:03.217Z"),"step":541},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:17:03.350Z"),"step":542},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:17:03.480Z"),"step":543},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:17:03.612Z"),"step":544},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:17:03.742Z"),"step":545},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:17:03.889Z"),"step":546},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:04.032Z"),"step":547},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:04.171Z"),"step":548},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:04.306Z"),"step":549},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:17:07.617Z"),"step":551},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:17:07.751Z"),"step":552},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:07.887Z"),"step":553},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:17:08.024Z"),"step":554},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:17:08.164Z"),"step":555},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:17:08.296Z"),"step":556},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:17:08.433Z"),"step":557},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:17:08.568Z"),"step":558},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:17:08.692Z"),"step":559},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:12.001Z"),"step":561},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:12.156Z"),"step":562},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:17:12.290Z"),"step":563},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:17:12.464Z"),"step":564},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:17:12.604Z"),"step":565},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:12.741Z"),"step":566},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:17:12.882Z"),"step":567},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:17:13.029Z"),"step":568},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:17:13.164Z"),"step":569},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:16.594Z"),"step":571},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:17:16.730Z"),"step":572},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:17:16.871Z"),"step":573},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:17:17.004Z"),"step":574},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:17:17.140Z"),"step":575},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:17:17.276Z"),"step":576},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:17.407Z"),"step":577},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:17.543Z"),"step":578},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:17.671Z"),"step":579},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:21.105Z"),"step":581},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:21.234Z"),"step":582},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:17:21.367Z"),"step":583},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:17:21.506Z"),"step":584},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:17:21.631Z"),"step":585},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:17:21.761Z"),"step":586},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:17:21.899Z"),"step":587},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:17:22.036Z"),"step":588},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:22.175Z"),"step":589},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:17:26.130Z"),"step":591},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:26.271Z"),"step":592},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:26.402Z"),"step":593},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:26.537Z"),"step":594},{"scalar":1,"wall_time":new Date("2016-06-24T04:17:26.669Z"),"step":595},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:26.807Z"),"step":596},{"scalar":1,"wall_time":new Date("2016-06-24T04:17:26.935Z"),"step":597},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:17:27.072Z"),"step":598},{"scalar":1,"wall_time":new Date("2016-06-24T04:17:27.216Z"),"step":599},{"scalar":1,"wall_time":new Date("2016-06-24T04:17:30.550Z"),"step":601},{"scalar":1,"wall_time":new Date("2016-06-24T04:17:30.677Z"),"step":602},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:30.816Z"),"step":603},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:17:30.960Z"),"step":604},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:17:31.094Z"),"step":605},{"scalar":1,"wall_time":new Date("2016-06-24T04:17:31.243Z"),"step":606},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:17:31.383Z"),"step":607},{"scalar":0.8899999856948853,"wall_time":new Date("2016-06-24T04:17:31.525Z"),"step":608},{"scalar":1,"wall_time":new Date("2016-06-24T04:17:31.650Z"),"step":609},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:35.299Z"),"step":611},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:17:35.636Z"),"step":612},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:35.773Z"),"step":613},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:35.907Z"),"step":614},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:17:36.037Z"),"step":615},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:17:36.169Z"),"step":616},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:36.302Z"),"step":617},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:17:36.427Z"),"step":618},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:17:36.567Z"),"step":619},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:39.851Z"),"step":621},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:39.982Z"),"step":622},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:40.120Z"),"step":623},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:40.256Z"),"step":624},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:40.394Z"),"step":625},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:40.529Z"),"step":626},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:17:40.667Z"),"step":627},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:40.807Z"),"step":628},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:17:40.938Z"),"step":629},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:17:44.244Z"),"step":631},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:17:44.385Z"),"step":632},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:44.525Z"),"step":633},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:17:44.657Z"),"step":634},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:17:44.800Z"),"step":635},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:44.935Z"),"step":636},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:17:45.074Z"),"step":637},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:45.208Z"),"step":638},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:17:45.351Z"),"step":639},{"scalar":1,"wall_time":new Date("2016-06-24T04:17:48.960Z"),"step":641},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:17:49.090Z"),"step":642},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:49.219Z"),"step":643},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:49.361Z"),"step":644},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:17:49.502Z"),"step":645},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:17:49.634Z"),"step":646},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:49.770Z"),"step":647},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:17:49.908Z"),"step":648},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:50.033Z"),"step":649},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:53.566Z"),"step":651},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:17:53.699Z"),"step":652},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:17:53.829Z"),"step":653},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:17:53.958Z"),"step":654},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:17:54.102Z"),"step":655},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:54.244Z"),"step":656},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:54.377Z"),"step":657},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:54.518Z"),"step":658},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:54.649Z"),"step":659},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:58.186Z"),"step":661},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:17:58.343Z"),"step":662},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:58.479Z"),"step":663},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:17:58.620Z"),"step":664},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:58.749Z"),"step":665},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:17:58.876Z"),"step":666},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:59.023Z"),"step":667},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:17:59.157Z"),"step":668},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:17:59.297Z"),"step":669},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:02.835Z"),"step":671},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:18:02.987Z"),"step":672},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:03.126Z"),"step":673},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:18:03.266Z"),"step":674},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:18:03.399Z"),"step":675},{"scalar":0.8999999761581421,"wall_time":new Date("2016-06-24T04:18:03.545Z"),"step":676},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:03.696Z"),"step":677},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:03.841Z"),"step":678},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:18:03.974Z"),"step":679},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:08.117Z"),"step":681},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:08.271Z"),"step":682},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:08.410Z"),"step":683},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:08.557Z"),"step":684},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:18:08.696Z"),"step":685},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:08.848Z"),"step":686},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:08.987Z"),"step":687},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:09.119Z"),"step":688},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:09.261Z"),"step":689},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:12.588Z"),"step":691},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:12.719Z"),"step":692},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:12.854Z"),"step":693},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:12.986Z"),"step":694},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:13.118Z"),"step":695},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:13.253Z"),"step":696},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:13.392Z"),"step":697},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:18:13.524Z"),"step":698},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:13.659Z"),"step":699},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:16.970Z"),"step":701},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:17.117Z"),"step":702},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:17.250Z"),"step":703},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:17.382Z"),"step":704},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:17.517Z"),"step":705},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:17.653Z"),"step":706},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:17.782Z"),"step":707},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:18:17.915Z"),"step":708},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:18.056Z"),"step":709},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:21.361Z"),"step":711},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:21.478Z"),"step":712},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:21.610Z"),"step":713},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:21.750Z"),"step":714},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:18:21.880Z"),"step":715},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:22.018Z"),"step":716},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:18:22.147Z"),"step":717},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:18:22.283Z"),"step":718},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:22.410Z"),"step":719},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:18:26.058Z"),"step":721},{"scalar":1,"wall_time":new Date("2016-06-24T04:18:26.204Z"),"step":722},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:26.350Z"),"step":723},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:26.498Z"),"step":724},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:18:26.636Z"),"step":725},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:26.787Z"),"step":726},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:26.919Z"),"step":727},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:27.051Z"),"step":728},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:18:27.183Z"),"step":729},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:30.500Z"),"step":731},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:30.634Z"),"step":732},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:30.769Z"),"step":733},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:30.904Z"),"step":734},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:18:31.043Z"),"step":735},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:18:31.176Z"),"step":736},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:31.311Z"),"step":737},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:31.438Z"),"step":738},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:18:31.577Z"),"step":739},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:34.889Z"),"step":741},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:35.026Z"),"step":742},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:35.160Z"),"step":743},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:35.299Z"),"step":744},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:35.436Z"),"step":745},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:35.573Z"),"step":746},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:35.713Z"),"step":747},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:35.845Z"),"step":748},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:35.978Z"),"step":749},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:18:39.520Z"),"step":751},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:39.679Z"),"step":752},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:39.844Z"),"step":753},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:40.006Z"),"step":754},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:40.142Z"),"step":755},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:18:40.337Z"),"step":756},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:40.487Z"),"step":757},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:40.691Z"),"step":758},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:40.855Z"),"step":759},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:44.438Z"),"step":761},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:44.566Z"),"step":762},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:44.698Z"),"step":763},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:44.832Z"),"step":764},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:18:44.974Z"),"step":765},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:45.115Z"),"step":766},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:45.258Z"),"step":767},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:18:45.392Z"),"step":768},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:45.525Z"),"step":769},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:18:48.972Z"),"step":771},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:49.116Z"),"step":772},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:49.257Z"),"step":773},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:49.387Z"),"step":774},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:18:49.512Z"),"step":775},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:49.653Z"),"step":776},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:18:49.787Z"),"step":777},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:18:49.922Z"),"step":778},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:50.062Z"),"step":779},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:18:53.610Z"),"step":781},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:53.745Z"),"step":782},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:53.887Z"),"step":783},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:54.025Z"),"step":784},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:54.156Z"),"step":785},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:54.281Z"),"step":786},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:54.417Z"),"step":787},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:18:54.560Z"),"step":788},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:54.691Z"),"step":789},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:58.048Z"),"step":791},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:18:58.192Z"),"step":792},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:18:58.324Z"),"step":793},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:18:58.449Z"),"step":794},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:18:58.584Z"),"step":795},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:18:58.720Z"),"step":796},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:18:58.846Z"),"step":797},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:58.991Z"),"step":798},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:18:59.124Z"),"step":799},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:02.737Z"),"step":801},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:02.880Z"),"step":802},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:19:03.021Z"),"step":803},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:03.166Z"),"step":804},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:03.308Z"),"step":805},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:03.436Z"),"step":806},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:19:03.574Z"),"step":807},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:19:03.702Z"),"step":808},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:03.842Z"),"step":809},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:19:07.794Z"),"step":811},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:07.946Z"),"step":812},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:19:08.081Z"),"step":813},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:08.215Z"),"step":814},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:19:08.347Z"),"step":815},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:08.490Z"),"step":816},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:19:08.625Z"),"step":817},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:08.750Z"),"step":818},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:19:08.877Z"),"step":819},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:12.481Z"),"step":821},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:19:12.643Z"),"step":822},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:12.810Z"),"step":823},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:12.982Z"),"step":824},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:13.116Z"),"step":825},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:13.254Z"),"step":826},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:13.425Z"),"step":827},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:19:13.570Z"),"step":828},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:13.707Z"),"step":829},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:19:17.381Z"),"step":831},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:19:17.518Z"),"step":832},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:19:17.657Z"),"step":833},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:17.815Z"),"step":834},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:17.959Z"),"step":835},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:19:18.102Z"),"step":836},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:19:18.249Z"),"step":837},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:18.389Z"),"step":838},{"scalar":1,"wall_time":new Date("2016-06-24T04:19:18.519Z"),"step":839},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:19:22.125Z"),"step":841},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:19:22.257Z"),"step":842},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:22.393Z"),"step":843},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:22.524Z"),"step":844},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:22.657Z"),"step":845},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:22.791Z"),"step":846},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:19:22.926Z"),"step":847},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:23.060Z"),"step":848},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:19:23.192Z"),"step":849},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:19:26.487Z"),"step":851},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:26.617Z"),"step":852},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:26.744Z"),"step":853},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:19:26.881Z"),"step":854},{"scalar":0.8700000047683716,"wall_time":new Date("2016-06-24T04:19:27.016Z"),"step":855},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:27.157Z"),"step":856},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:27.288Z"),"step":857},{"scalar":1,"wall_time":new Date("2016-06-24T04:19:27.412Z"),"step":858},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:19:27.549Z"),"step":859},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:30.933Z"),"step":861},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:19:31.071Z"),"step":862},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:19:31.211Z"),"step":863},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:19:31.350Z"),"step":864},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:19:31.508Z"),"step":865},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:31.653Z"),"step":866},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:19:31.795Z"),"step":867},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:31.941Z"),"step":868},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:19:32.077Z"),"step":869},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:19:35.373Z"),"step":871},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:19:35.499Z"),"step":872},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:35.634Z"),"step":873},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:19:35.780Z"),"step":874},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:35.906Z"),"step":875},{"scalar":1,"wall_time":new Date("2016-06-24T04:19:36.045Z"),"step":876},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:36.176Z"),"step":877},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:36.319Z"),"step":878},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:36.449Z"),"step":879},{"scalar":1,"wall_time":new Date("2016-06-24T04:19:39.764Z"),"step":881},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:39.896Z"),"step":882},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:40.028Z"),"step":883},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:40.158Z"),"step":884},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:40.296Z"),"step":885},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:40.422Z"),"step":886},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:19:40.561Z"),"step":887},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:19:40.688Z"),"step":888},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:19:40.822Z"),"step":889},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:19:44.112Z"),"step":891},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:44.247Z"),"step":892},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:19:44.384Z"),"step":893},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:44.516Z"),"step":894},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:44.643Z"),"step":895},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:44.778Z"),"step":896},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:44.914Z"),"step":897},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:45.051Z"),"step":898},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:45.179Z"),"step":899},{"scalar":1,"wall_time":new Date("2016-06-24T04:19:48.465Z"),"step":901},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:48.593Z"),"step":902},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:48.728Z"),"step":903},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:19:48.866Z"),"step":904},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:48.998Z"),"step":905},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:49.130Z"),"step":906},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:49.256Z"),"step":907},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:19:49.393Z"),"step":908},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:19:49.521Z"),"step":909},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:52.825Z"),"step":911},{"scalar":1,"wall_time":new Date("2016-06-24T04:19:52.956Z"),"step":912},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:53.101Z"),"step":913},{"scalar":1,"wall_time":new Date("2016-06-24T04:19:53.235Z"),"step":914},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:53.368Z"),"step":915},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:53.500Z"),"step":916},{"scalar":0.9100000262260437,"wall_time":new Date("2016-06-24T04:19:53.623Z"),"step":917},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:53.768Z"),"step":918},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:19:53.899Z"),"step":919},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:19:57.197Z"),"step":921},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:57.333Z"),"step":922},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:57.467Z"),"step":923},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:19:57.594Z"),"step":924},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:19:57.731Z"),"step":925},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:19:57.866Z"),"step":926},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:19:57.991Z"),"step":927},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:19:58.128Z"),"step":928},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:19:58.260Z"),"step":929},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:01.545Z"),"step":931},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:01.681Z"),"step":932},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:01.814Z"),"step":933},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:20:01.950Z"),"step":934},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:20:02.111Z"),"step":935},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:20:02.255Z"),"step":936},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:20:02.399Z"),"step":937},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:20:02.542Z"),"step":938},{"scalar":1,"wall_time":new Date("2016-06-24T04:20:02.672Z"),"step":939},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:20:06.030Z"),"step":941},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:20:06.172Z"),"step":942},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:20:06.314Z"),"step":943},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:20:06.436Z"),"step":944},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:20:06.572Z"),"step":945},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:20:06.704Z"),"step":946},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:06.833Z"),"step":947},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:20:06.966Z"),"step":948},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:20:07.105Z"),"step":949},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:20:10.389Z"),"step":951},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:20:10.522Z"),"step":952},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:20:10.660Z"),"step":953},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:20:10.806Z"),"step":954},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:20:10.939Z"),"step":955},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:20:11.073Z"),"step":956},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:20:11.204Z"),"step":957},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:11.339Z"),"step":958},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:20:11.465Z"),"step":959},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:14.750Z"),"step":961},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:14.879Z"),"step":962},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:20:15.017Z"),"step":963},{"scalar":0.9200000166893005,"wall_time":new Date("2016-06-24T04:20:15.152Z"),"step":964},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:20:15.288Z"),"step":965},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:20:15.415Z"),"step":966},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:15.550Z"),"step":967},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:20:15.679Z"),"step":968},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:20:15.815Z"),"step":969},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:19.650Z"),"step":971},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:20:19.786Z"),"step":972},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:20:19.922Z"),"step":973},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:20.058Z"),"step":974},{"scalar":0.949999988079071,"wall_time":new Date("2016-06-24T04:20:20.198Z"),"step":975},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:20.326Z"),"step":976},{"scalar":0.9300000071525574,"wall_time":new Date("2016-06-24T04:20:20.461Z"),"step":977},{"scalar":1,"wall_time":new Date("2016-06-24T04:20:20.588Z"),"step":978},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:20:20.718Z"),"step":979},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:24.015Z"),"step":981},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:24.151Z"),"step":982},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:20:24.300Z"),"step":983},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:20:24.447Z"),"step":984},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:20:24.576Z"),"step":985},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:20:24.701Z"),"step":986},{"scalar":0.9399999976158142,"wall_time":new Date("2016-06-24T04:20:24.830Z"),"step":987},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:20:24.977Z"),"step":988},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:20:25.134Z"),"step":989},{"scalar":0.9599999785423279,"wall_time":new Date("2016-06-24T04:20:28.551Z"),"step":991},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:28.681Z"),"step":992},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:28.812Z"),"step":993},{"scalar":0.9900000095367432,"wall_time":new Date("2016-06-24T04:20:28.949Z"),"step":994},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:29.086Z"),"step":995},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:20:29.219Z"),"step":996},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:29.351Z"),"step":997},{"scalar":0.9700000286102295,"wall_time":new Date("2016-06-24T04:20:29.497Z"),"step":998},{"scalar":0.9800000190734863,"wall_time":new Date("2016-06-24T04:20:29.637Z"),"step":999,"name":"train","relative":0.12446694444444445}]
-      };
-    </script>
-  </body>
-</html>
diff --git a/tensorflow/tensorboard/components/vz_line_chart/vz-chart-helpers.ts b/tensorflow/tensorboard/components/vz_line_chart/vz-chart-helpers.ts
deleted file mode 100644
index fa89e06ada1f3faa7c197f79e34679f1086d7ec0..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_line_chart/vz-chart-helpers.ts
+++ /dev/null
@@ -1,219 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-export interface Datum {
-  wall_time: Date;
-  step: number;
-}
-
-export interface Scalar {
-  scalar: number;
-  smoothed: number;
-}
-
-export type ScalarDatum = Datum & Scalar;
-
-export type DataFn = (run: string, tag: string) => Promise<Array<Datum>>;
-
-export let Y_TOOLTIP_FORMATTER_PRECISION = 4;
-export let STEP_FORMATTER_PRECISION = 4;
-export let Y_AXIS_FORMATTER_PRECISION = 3;
-export let TOOLTIP_Y_PIXEL_OFFSET = 20;
-export let TOOLTIP_CIRCLE_SIZE = 4;
-export let NAN_SYMBOL_SIZE = 6;
-
-export interface Point {
-  x: number;  // pixel space
-  y: number;  // pixel space
-  datum: ScalarDatum;
-  dataset: Plottable.Dataset;
-}
-
-/* Create a formatter function that will switch between exponential and
- * regular display depending on the scale of the number being formatted,
- * and show `digits` significant digits.
- */
-export function multiscaleFormatter(digits: number): ((v: number) => string) {
-  return (v: number) => {
-    let absv = Math.abs(v);
-    if (absv < 1E-15) {
-      // Sometimes zero-like values get an annoying representation
-      absv = 0;
-    }
-    let f: (x: number) => string;
-    if (absv >= 1E4) {
-      f = d3.format('.' + digits + 'e');
-    } else if (absv > 0 && absv < 0.01) {
-      f = d3.format('.' + digits + 'e');
-    } else {
-      f = d3.format('.' + digits + 'g');
-    }
-    return f(v);
-  };
-}
-
-/* Compute an appropriate domain given an array of all the values that are
- * going to be displayed. If ignoreOutliers is true, it will ignore the
- * lowest 10% and highest 10% of the data when computing a domain.
- * It has n log n performance when ignoreOutliers is true, as it needs to
- * sort the data.
- */
-export function computeDomain(values: number[], ignoreOutliers: boolean) {
-  // Don't include infinities and NaNs in the domain computation.
-  values = values.filter(z => isFinite(z));
-
-  if (values.length === 0) {
-    return [-0.1, 1.1];
-  }
-  let a: number;
-  let b: number;
-  if (ignoreOutliers) {
-    let sorted = _.sortBy(values);
-    a = d3.quantile(sorted, 0.05);
-    b = d3.quantile(sorted, 0.95);
-  } else {
-    a = d3.min(values);
-    b = d3.max(values);
-  }
-
-  let padding: number;
-  let span = b - a;
-  if (span === 0) {
-    // If b===a, we would create an empty range. We instead select the range
-    // [0, 2*a] if a > 0, or [-2*a, 0] if a < 0, plus a little bit of
-    // extra padding on the top and bottom of the plot.
-    padding = Math.abs(a) * 1.1 + 1.1;
-  } else {
-    padding = span * 0.2;
-  }
-
-  let lower: number;
-  if (a >= 0 && a < span) {
-    // We include the intercept (y = 0) if doing so less than doubles the span
-    // of the y-axis. (We actually select a lower bound that's slightly less
-    // than 0 so that 0.00 will clearly be written on the lower edge of the
-    // chart. The label on the lowest tick is often filtered out.)
-    lower = -0.1 * b;
-  } else {
-    lower = a - padding;
-  }
-
-
-  let domain = [lower, b + padding];
-  domain = d3.scaleLinear().domain(domain).nice().domain();
-  return domain;
-}
-
-export function accessorize(key: string): Plottable.IAccessor<number> {
-  // tslint:disable-next-line:no-any be quiet tsc
-  return (d: any, index: number, dataset: Plottable.Dataset) => d[key];
-}
-
-export interface XComponents {
-  /* tslint:disable */
-  scale: Plottable.Scales.Linear|Plottable.Scales.Time,
-      axis: Plottable.Axes.Numeric|Plottable.Axes.Time,
-      accessor: Plottable.IAccessor<number|Date>,
-  /* tslint:enable */
-}
-
-export let stepFormatter =
-    Plottable.Formatters.siSuffix(STEP_FORMATTER_PRECISION);
-export function stepX(): XComponents {
-  let scale = new Plottable.Scales.Linear();
-  let axis = new Plottable.Axes.Numeric(scale, 'bottom');
-  axis.formatter(stepFormatter);
-  return {
-    scale: scale,
-    axis: axis,
-    accessor: (d: Datum) => d.step,
-  };
-}
-
-export let timeFormatter = Plottable.Formatters.time('%a %b %e, %H:%M:%S');
-
-export function wallX(): XComponents {
-  let scale = new Plottable.Scales.Time();
-  return {
-    scale: scale,
-    axis: new Plottable.Axes.Time(scale, 'bottom'),
-    accessor: (d: Datum) => d.wall_time,
-  };
-}
-export let relativeAccessor =
-    // tslint:disable-next-line:no-any be quiet tsc
-    (d: any, index: number, dataset: Plottable.Dataset) => {
-      // We may be rendering the final-point datum for scatterplot.
-      // If so, we will have already provided the 'relative' property
-      if (d.relative != null) {
-        return d.relative;
-      }
-      let data = dataset.data();
-      // I can't imagine how this function would be called when the data is
-      // empty (after all, it iterates over the data), but lets guard just
-      // to be safe.
-      let first = data.length > 0 ? +data[0].wall_time : 0;
-      return (+d.wall_time - first) / (60 * 60 * 1000);  // ms to hours
-    };
-
-export let relativeFormatter = (n: number) => {
-  // we will always show 2 units of precision, e.g days and hours, or
-  // minutes and seconds, but not hours and minutes and seconds
-  let ret = '';
-  let days = Math.floor(n / 24);
-  n -= (days * 24);
-  if (days) {
-    ret += days + 'd ';
-  }
-  let hours = Math.floor(n);
-  n -= hours;
-  n *= 60;
-  if (hours || days) {
-    ret += hours + 'h ';
-  }
-  let minutes = Math.floor(n);
-  n -= minutes;
-  n *= 60;
-  if (minutes || hours || days) {
-    ret += minutes + 'm ';
-  }
-  let seconds = Math.floor(n);
-  return ret + seconds + 's';
-};
-export function relativeX(): XComponents {
-  let scale = new Plottable.Scales.Linear();
-  return {
-    scale: scale,
-    axis: new Plottable.Axes.Numeric(scale, 'bottom'),
-    accessor: relativeAccessor,
-  };
-}
-
-// a very literal definition of NaN: true for NaN for a non-number type
-// or null, etc. False for Infinity or -Infinity
-export let isNaN = (x) => +x !== x;
-
-export function getXComponents(xType: string): XComponents {
-  switch (xType) {
-    case 'step':
-      return stepX();
-    case 'wall_time':
-      return wallX();
-    case 'relative':
-      return relativeX();
-    default:
-      throw new Error('invalid xType: ' + xType);
-  }
-}
diff --git a/tensorflow/tensorboard/components/vz_line_chart/vz-line-chart.html b/tensorflow/tensorboard/components/vz_line_chart/vz-line-chart.html
deleted file mode 100644
index 38e0d7cb8d81f9faeb013b803852deed18afd5bc..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_line_chart/vz-line-chart.html
+++ /dev/null
@@ -1,131 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-imports/d3.html">
-<link rel="import" href="../tf-imports/lodash.html">
-<link rel="import" href="../tf-imports/plottable.html">
-
-<!--
-vz-line-chart creates an element that draws a line chart for
-displaying event values.
-
-This line chart supports drawing multiple lines at the same time, with features
-such as different X scales (linear and temporal), tooltips and smoothing.
-
-@element vz-line-chart
-@demo demo/index.html
--->
-<dom-module id="vz-line-chart">
-  <template>
-    <div id="tooltip">
-      <table>
-        <thead>
-          <tr>
-            <th></th>
-            <th>Name</th>
-            <template is="dom-if" if="{{smoothingEnabled}}">
-              <th>Smoothed</th>
-            </template>
-            <th>Value</th>
-            <th>Step</th>
-            <th>Time</th>
-            <th>Relative</th>
-          </tr>
-        </thead>
-        <tbody>
-        </tbody>
-      </table>
-    </div>
-    <div id="chartdiv"></div>
-    <style>
-      :host {
-        -webkit-user-select: none;
-        -moz-user-select: none;
-        display: flex;
-        flex-direction: column;
-        flex-grow: 1;
-        flex-shrink: 1;
-        position: relative;
-        outline: none;
-      }
-      div {
-        -webkit-user-select: none;
-        -moz-user-select: none;
-        flex-grow: 1;
-        flex-shrink: 1;
-      }
-      td {
-        padding-left: 5px;
-        padding-right: 5px;
-        font-size: 13px;
-        opacity: 1;
-      }
-      #tooltip {
-        pointer-events: none;
-        position: absolute;
-        opacity: 0;
-        box-shadow: 0 1px 4px rgba(0, 0, 0, 0.3);
-        font-size: 14px;
-        background: rgba(0, 0, 0, 0.8);
-        color: white;
-        border-radius: 4px;
-        line-height: 1.4em;
-        padding: 8px;
-        z-index: 5;
-        cursor: none;
-        margin-top: 10px;
-      }
-      .swatch {
-        border-radius: 50%;
-        width: 14px;
-        height: 14px;
-        display: block;
-        border: 2px solid rgba(0,0,0,0);
-      }
-      .closest .swatch {
-        border: 2px solid white;
-      }
-      th {
-        padding-left: 5px;
-        padding-right: 5px;
-        text-align: left;
-      }
-      .distant td {
-        opacity: 0.8;
-      }
-
-      .distant td.swatch {
-        opacity: 1;
-      }
-
-      .ghost {
-        opacity: 0.2;
-        stroke-width: 1px;
-      }
-
-      #chartdiv line.guide-line {
-        stroke: #999;
-        stroke-width: 1.5px;
-      }
-
-    </style>
-  </template>
-  <script src="vz-chart-helpers.js"></script>
-  <script src="dragZoomInteraction.js"></script>
-  <script src="vz-line-chart.js"></script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_line_chart/vz-line-chart.ts b/tensorflow/tensorboard/components/vz_line_chart/vz-line-chart.ts
deleted file mode 100644
index 5da6190ea24b40adb79ad00e82f309fe0107a474..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_line_chart/vz-line-chart.ts
+++ /dev/null
@@ -1,773 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-/* tslint:disable:no-namespace variable-name */
-
-import {DragZoomLayer} from './dragZoomInteraction'
-import * as ChartHelpers from './vz-chart-helpers'
-
-Polymer({
-  is: 'vz-line-chart',
-  properties: {
-    /**
-     * Scale that maps series names to colors. The default colors are from
-     * d3.schemeCategory10. Use this property to replace the default line
-     * colors with colors of your own choice.
-     * @type {Plottable.Scales.Color}
-     * @required
-     */
-    colorScale: {
-      type: Object,
-      value: function() {
-        return new Plottable.Scales.Color().range(d3.schemeCategory10);
-      }
-    },
-
-    /**
-     * Whether smoothing is enabled or not. If true, smoothed lines will be
-     * plotted in the chart while the unsmoothed lines will be ghosted in
-     * the background.
-     *
-     * The smoothing algorithm is a simple moving average, which, given a
-     * point p and a window w, replaces p with a simple average of the
-     * points in the [p - floor(w/2), p + floor(w/2)] range.  If there
-     * aren't enough points to cover the entire window to the left, the
-     * window is reduced to fit exactly the amount of elements available.
-     * This means that the smoothed line will be less in and gradually
-     * become more smooth until the desired window is reached. However when
-     * there aren't enough points on the right, the line stops being
-     * rendered at all.
-     */
-    smoothingEnabled: {type: Boolean, value: false},
-
-    /**
-     * Weight (between 0.0 and 1.0) of the smoothing. This weight controls
-     * the window size, and a weight of 1.0 means using 50% of the entire
-     * dataset as the window, while a weight of 0.0 means using a window of
-     * 0 (and thus replacing each point with themselves).
-     *
-     * The growth between 0.0 and 1.0 is not linear though. Because
-     * changing the window from 0% to 30% of the dataset smooths the line a
-     * lot more than changing the window from 70% to 100%, an exponential
-     * function is used instead: http://i.imgur.com/bDrhEZU.png. This
-     * function increases the size of the window slowly at the beginning
-     * and gradually speeds up the growth, but 0.0 still means a window of
-     * 0 and 1.0 still means a window of the dataset's length.
-     */
-    smoothingWeight: {type: Number, value: 0.6},
-
-    /**
-     * The way to display the X values. Allows:
-     * - "step" - Linear scale using the  "step" property of the datum.
-     * - "wall_time" - Temporal scale using the "wall_time" property of the
-     * datum.
-     * - "relative" - Temporal scale using the "relative" property of the
-     * datum if it is present or calculating from "wall_time" if it isn't.
-     */
-    xType: {type: String, value: 'step'},
-
-    /**
-     * The scale for the y-axis. Allows:
-     * - "linear" - linear scale (Plottable.Scales.Linear)
-     * - "log" - modified-log scale (Plottable.Scales.ModifiedLog)
-     */
-    yScaleType: {type: String, value: 'linear'},
-
-    /**
-     * Whether to ignore outlier data when computing the yScale domain.
-     */
-
-    ignoreYOutliers: {
-      type: Boolean,
-      value: false,
-    },
-
-    /**
-     * Change how the tooltip is sorted. Allows:
-     * - "default" - Sort the tooltip by input order.
-     * - "ascending" - Sort the tooltip by ascending value.
-     * - "descending" - Sort the tooltip by descending value.
-     * - "nearest" - Sort the tooltip by closest to cursor.
-     */
-    tooltipSortingMethod: {type: String, value: 'default'},
-
-    /**
-     * Change how the tooltip is positioned. Allows:
-     * - "bottom" - Position the tooltip on the bottom of the chart.
-     * - "right" - Position the tooltip to the right of the chart.
-     */
-    tooltipPosition: {type: String, value: 'bottom'},
-
-    _attached: Boolean,
-    _chart: Object,
-    _visibleSeriesCache: {
-      type: Array,
-      value: function() {
-        return []
-      }
-    },
-    _seriesDataCache: {
-      type: Object,
-      value: function() {
-        return {}
-      }
-    },
-    _makeChartAsyncCallbackId: {type: Number, value: null}
-  },
-  observers: [
-    '_makeChart(xType, yScaleType, colorScale, _attached)',
-    '_reloadFromCache(_chart)',
-    '_smoothingChanged(smoothingEnabled, smoothingWeight, _chart)',
-    '_tooltipSortingMethodChanged(tooltipSortingMethod, _chart)',
-    '_tooltipPositionChanged(tooltipPosition, _chart)',
-    '_outliersChanged(ignoreYOutliers, _chart)'
-  ],
-
-  /**
-   * Sets the series that the chart displays. Series with other names will
-   * not be displayed.
-   *
-   * @param {Array<String>} names Array with the names of the series to
-   * display.
-   */
-  setVisibleSeries: function(names) {
-    this._visibleSeriesCache = names;
-    if (this._chart) {
-      this._chart.setVisibleSeries(names);
-      this.redraw();
-    }
-  },
-
-  /**
-   * Sets the data of one of the series. Note that to display this series
-   * its name must be in the setVisibleSeries() array.
-   *
-   * @param {string} name Name of the series.
-   * @param {Array<ChartHelpers.ScalarDatum>} data Data of the series. This is
-   * an array of objects with at least the following properties:
-   * - step: (Number) - index of the datum.
-   * - wall_time: (Date) - Date object with the datum's time.
-   * - scalar: (Number) - Value of the datum.
-   */
-  setSeriesData: function(name, data) {
-    this._seriesDataCache[name] = data;
-    if (this._chart) {
-      this._chart.setSeriesData(name, data);
-    }
-  },
-
-  /**
-   * Re-renders the chart. Useful if e.g. the container size changed.
-   */
-  redraw: function() {
-    this._chart.redraw();
-  },
-  attached: function() {
-    this._attached = true;
-  },
-  detached: function() {
-    this._attached = false;
-  },
-  ready: function() {
-    this.scopeSubtree(this.$.tooltip, true);
-    this.scopeSubtree(this.$.chartdiv, true);
-  },
-  _makeChart: function(xType, yScaleType, colorScale, _attached) {
-    if (this._makeChartAsyncCallbackId !== null) {
-      this.cancelAsync(this._makeChartAsyncCallbackId);
-      this._makeChartAsyncCallbackId = null;
-    }
-
-    this._makeChartAsyncCallbackId = this.async(function() {
-      this._makeChartAsyncCallbackId = null;
-      if (!this._attached) return;
-      if (this._chart) this._chart.destroy();
-      var tooltip = d3.select(this.$.tooltip);
-      var chart = new LineChart(xType, yScaleType, colorScale, tooltip);
-      var div = d3.select(this.$.chartdiv);
-      chart.renderTo(div);
-      this._chart = chart;
-    }, 350);
-  },
-  _reloadFromCache: function() {
-    if (this._chart) {
-      this._chart.setVisibleSeries(this._visibleSeriesCache);
-      this._visibleSeriesCache.forEach(function(name) {
-        this._chart.setSeriesData(name, this._seriesDataCache[name] || []);
-      }.bind(this));
-    }
-  },
-  _smoothingChanged: function() {
-    if (!this._chart) {
-      return;
-    }
-    if (this.smoothingEnabled) {
-      this._chart.smoothingUpdate(this.smoothingWeight);
-    } else {
-      this._chart.smoothingDisable();
-    }
-  },
-  _outliersChanged: function() {
-    if (!this._chart) {
-      return;
-    }
-    this._chart.ignoreYOutliers(this.ignoreYOutliers);
-  },
-  _tooltipSortingMethodChanged: function() {
-    if (this._chart) {
-      this._chart.setTooltipSortingMethod(this.tooltipSortingMethod);
-    }
-  },
-  _tooltipPositionChanged: function() {
-    if (this._chart) {
-      this._chart.setTooltipPosition(this.tooltipPosition);
-    }
-  }
-});
-
-class LineChart {
-  private name2datasets: {[name: string]: Plottable.Dataset};
-  private seriesNames: string[];
-
-  private xAccessor: Plottable.IAccessor<number|Date>;
-  private xScale: Plottable.QuantitativeScale<number|Date>;
-  private yScale: Plottable.QuantitativeScale<number>;
-  private gridlines: Plottable.Components.Gridlines;
-  private center: Plottable.Components.Group;
-  private xAxis: Plottable.Axes.Numeric|Plottable.Axes.Time;
-  private yAxis: Plottable.Axes.Numeric;
-  private outer: Plottable.Components.Table;
-  private colorScale: Plottable.Scales.Color;
-  private tooltip: d3.Selection<any, any, any, any>;
-  private dzl: DragZoomLayer;
-
-  private linePlot: Plottable.Plots.Line<number|Date>;
-  private smoothLinePlot: Plottable.Plots.Line<number|Date>;
-  private scatterPlot: Plottable.Plots.Scatter<number|Date, Number>;
-  private nanDisplay: Plottable.Plots.Scatter<number|Date, Number>;
-  private scalarAccessor: Plottable.IAccessor<number>;
-  private smoothedAccessor: Plottable.IAccessor<number>;
-  private lastPointsDataset: Plottable.Dataset;
-  private datasets: Plottable.Dataset[];
-  private onDatasetChanged: (dataset: Plottable.Dataset) => void;
-  private nanDataset: Plottable.Dataset;
-  private smoothingWeight: number;
-  private smoothingEnabled: Boolean;
-  private tooltipSortingMethod: string;
-  private tooltipPosition: string;
-  private _ignoreYOutliers: boolean;
-
-  private targetSVG: d3.Selection<any, any, any, any>;
-
-  constructor(
-      xType: string, yScaleType: string, colorScale: Plottable.Scales.Color,
-      tooltip: d3.Selection<any, any, any, any>) {
-    this.seriesNames = [];
-    this.name2datasets = {};
-    this.colorScale = colorScale;
-    this.tooltip = tooltip;
-    this.datasets = [];
-    this._ignoreYOutliers = false;
-    // lastPointDataset is a dataset that contains just the last point of
-    // every dataset we're currently drawing.
-    this.lastPointsDataset = new Plottable.Dataset();
-    this.nanDataset = new Plottable.Dataset();
-    // need to do a single bind, so we can deregister the callback from
-    // old Plottable.Datasets. (Deregistration is done by identity checks.)
-    this.onDatasetChanged = this._onDatasetChanged.bind(this);
-    this.buildChart(xType, yScaleType);
-  }
-
-  private buildChart(xType: string, yScaleType: string) {
-    if (this.outer) {
-      this.outer.destroy();
-    }
-    let xComponents = ChartHelpers.getXComponents(xType);
-    this.xAccessor = xComponents.accessor;
-    this.xScale = xComponents.scale;
-    this.xAxis = xComponents.axis;
-    this.xAxis.margin(0).tickLabelPadding(3);
-    this.yScale = LineChart.getYScaleFromType(yScaleType);
-    this.yAxis = new Plottable.Axes.Numeric(this.yScale, 'left');
-    let yFormatter = ChartHelpers.multiscaleFormatter(
-        ChartHelpers.Y_AXIS_FORMATTER_PRECISION);
-    this.yAxis.margin(0).tickLabelPadding(5).formatter(yFormatter);
-    this.yAxis.usesTextWidthApproximation(true);
-
-    this.dzl = new DragZoomLayer(
-        this.xScale, this.yScale, this.updateSpecialDatasets.bind(this));
-
-    let center = this.buildPlot(this.xAccessor, this.xScale, this.yScale);
-
-    this.gridlines =
-        new Plottable.Components.Gridlines(this.xScale, this.yScale);
-
-    let xZeroLine = new Plottable.Components.GuideLineLayer('horizontal');
-    xZeroLine.scale(this.yScale).value(0);
-    let yZeroLine = new Plottable.Components.GuideLineLayer('vertical');
-    yZeroLine.scale(this.xScale).value(0);
-
-    this.center = new Plottable.Components.Group(
-        [this.gridlines, xZeroLine, yZeroLine, center, this.dzl]);
-    this.outer = new Plottable.Components.Table(
-        [[this.yAxis, this.center], [null, this.xAxis]]);
-  }
-
-  private buildPlot(xAccessor, xScale, yScale): Plottable.Component {
-    this.scalarAccessor = (d: ChartHelpers.ScalarDatum) => d.scalar;
-    this.smoothedAccessor = (d: ChartHelpers.ScalarDatum) => d.smoothed;
-    let linePlot = new Plottable.Plots.Line<number|Date>();
-    linePlot.x(xAccessor, xScale);
-    linePlot.y(this.scalarAccessor, yScale);
-    linePlot.attr(
-        'stroke',
-        (d: ChartHelpers.Datum, i: number, dataset: Plottable.Dataset) =>
-            this.colorScale.scale(dataset.metadata().name));
-    this.linePlot = linePlot;
-    let group = this.setupTooltips(linePlot);
-
-    let smoothLinePlot = new Plottable.Plots.Line<number|Date>();
-    smoothLinePlot.x(xAccessor, xScale);
-    smoothLinePlot.y(this.smoothedAccessor, yScale);
-    smoothLinePlot.attr(
-        'stroke',
-        (d: ChartHelpers.Datum, i: number, dataset: Plottable.Dataset) =>
-            this.colorScale.scale(dataset.metadata().name));
-    this.smoothLinePlot = smoothLinePlot;
-
-    // The scatterPlot will display the last point for each dataset.
-    // This way, if there is only one datum for the series, it is still
-    // visible. We hide it when tooltips are active to keep things clean.
-    let scatterPlot = new Plottable.Plots.Scatter<number|Date, number>();
-    scatterPlot.x(xAccessor, xScale);
-    scatterPlot.y(this.scalarAccessor, yScale);
-    scatterPlot.attr('fill', (d: any) => this.colorScale.scale(d.name));
-    scatterPlot.attr('opacity', 1);
-    scatterPlot.size(ChartHelpers.TOOLTIP_CIRCLE_SIZE * 2);
-    scatterPlot.datasets([this.lastPointsDataset]);
-    this.scatterPlot = scatterPlot;
-
-    let nanDisplay = new Plottable.Plots.Scatter<number|Date, number>();
-    nanDisplay.x(xAccessor, xScale);
-    nanDisplay.y((x) => x.displayY, yScale);
-    nanDisplay.attr('fill', (d: any) => this.colorScale.scale(d.name));
-    nanDisplay.attr('opacity', 1);
-    nanDisplay.size(ChartHelpers.NAN_SYMBOL_SIZE * 2);
-    nanDisplay.datasets([this.nanDataset]);
-    nanDisplay.symbol(Plottable.SymbolFactories.triangle);
-    this.nanDisplay = nanDisplay;
-
-    return new Plottable.Components.Group(
-        [nanDisplay, scatterPlot, smoothLinePlot, group]);
-  }
-
-  /** Updates the chart when a dataset changes. Called every time the data of
-   * a dataset changes to update the charts.
-   */
-  private _onDatasetChanged(dataset: Plottable.Dataset) {
-    if (this.smoothingEnabled) {
-      this.resmoothDataset(dataset);
-    }
-    this.updateSpecialDatasets();
-  }
-
-  public ignoreYOutliers(ignoreYOutliers: boolean) {
-    if (ignoreYOutliers !== this._ignoreYOutliers) {
-      this._ignoreYOutliers = ignoreYOutliers;
-      this.updateSpecialDatasets();
-    }
-  }
-
-  private updateSpecialDatasets() {
-    if (this.smoothingEnabled) {
-      this.updateSpecialDatasetsWithAccessor(this.smoothedAccessor);
-    } else {
-      this.updateSpecialDatasetsWithAccessor(this.scalarAccessor);
-    }
-  }
-
-  /** Constructs special datasets. Each special dataset contains exceptional
-   * values from all of the regular datasets, e.g. last points in series, or
-   * NaN values. Those points will have a `name` and `relative` property added
-   * (since usually those are context in the surrounding dataset).
-   * The accessor will point to the correct data to access.
-   */
-  private updateSpecialDatasetsWithAccessor(accessor:
-                                                Plottable.IAccessor<number>) {
-    let lastPointsData =
-        this.datasets
-            .map((d) => {
-              let datum = null;
-              // filter out NaNs to ensure last point is a clean one
-              let nonNanData =
-                  d.data().filter((x) => !isNaN(accessor(x, -1, d)));
-              if (nonNanData.length > 0) {
-                let idx = nonNanData.length - 1;
-                datum = nonNanData[idx];
-                datum.name = d.metadata().name;
-                datum.relative = ChartHelpers.relativeAccessor(datum, -1, d);
-              }
-              return datum;
-            })
-            .filter((x) => x != null);
-    this.lastPointsDataset.data(lastPointsData);
-
-    // Take a dataset, return an array of NaN data points
-    // the NaN points will have a "displayY" property which is the
-    // y-value of a nearby point that was not NaN (0 if all points are NaN)
-    let datasetToNaNData = (d: Plottable.Dataset) => {
-      let displayY = null;
-      let data = d.data();
-      let i = 0;
-      while (i < data.length && displayY == null) {
-        if (!isNaN(accessor(data[i], -1, d))) {
-          displayY = accessor(data[i], -1, d);
-        }
-        i++;
-      }
-      if (displayY == null) {
-        displayY = 0;
-      }
-      let nanData = [];
-      for (i = 0; i < data.length; i++) {
-        if (!isNaN(accessor(data[i], -1, d))) {
-          displayY = accessor(data[i], -1, d);
-        } else {
-          data[i].name = d.metadata().name;
-          data[i].displayY = displayY;
-          data[i].relative = ChartHelpers.relativeAccessor(data[i], -1, d);
-          nanData.push(data[i]);
-        }
-      }
-      return nanData;
-    };
-    let nanData = _.flatten(this.datasets.map(datasetToNaNData));
-    this.nanDataset.data(nanData);
-
-    let datasetToValues: (d: Plottable.Dataset) => number[] = (d) => {
-      return d.data().map((x) => accessor(x, -1, d));
-    };
-    let vals = _.flatten(this.datasets.map(datasetToValues));
-    vals = vals.filter((x) => x === x && x !== Infinity && x !== -Infinity);
-    let domain = ChartHelpers.computeDomain(vals, this._ignoreYOutliers);
-    this.yScale.domain(domain);
-  }
-
-  private setupTooltips(plot: Plottable.XYPlot<number|Date, number>):
-      Plottable.Components.Group {
-    let pi = new Plottable.Interactions.Pointer();
-    pi.attachTo(plot);
-    // PointsComponent is a Plottable Component that will hold the little
-    // circles we draw over the closest data points
-    let pointsComponent = new Plottable.Component();
-    let group = new Plottable.Components.Group([plot, pointsComponent]);
-
-    let hideTooltips = () => {
-      this.tooltip.style('opacity', 0);
-      this.scatterPlot.attr('opacity', 1);
-      pointsComponent.content().selectAll('.point').remove();
-    };
-
-    let enabled = true;
-    let disableTooltips = () => {
-      enabled = false;
-      hideTooltips();
-    };
-    let enableTooltips = () => {
-      enabled = true;
-    };
-
-    this.dzl.interactionStart(disableTooltips);
-    this.dzl.interactionEnd(enableTooltips);
-
-    pi.onPointerMove((p: Plottable.Point) => {
-      if (!enabled) {
-        return;
-      }
-      let target: ChartHelpers.Point = {
-        x: p.x,
-        y: p.y,
-        datum: null,
-        dataset: null,
-      };
-
-
-      let bbox: SVGRect = (<any>this.gridlines.content().node()).getBBox();
-
-      // pts is the closets point to the tooltip for each dataset
-      let pts = plot.datasets()
-                    .map((dataset) => this.findClosestPoint(target, dataset))
-                    .filter(x => x != null);
-      let intersectsBBox = Plottable.Utils.DOM.intersectsBBox;
-      // We draw tooltips for points that are NaN, or are currently visible
-      let ptsForTooltips = pts.filter(
-          (p) => intersectsBBox(p.x, p.y, bbox) || isNaN(p.datum.scalar));
-      // Only draw little indicator circles for the non-NaN points
-      let ptsToCircle = ptsForTooltips.filter((p) => !isNaN(p.datum.scalar));
-
-      let ptsSelection: any =
-          pointsComponent.content().selectAll('.point').data(
-              ptsToCircle,
-              (p: ChartHelpers.Point) => p.dataset.metadata().name);
-      if (pts.length !== 0) {
-        ptsSelection.enter().append('circle').classed('point', true);
-        ptsSelection.attr('r', ChartHelpers.TOOLTIP_CIRCLE_SIZE)
-            .attr('cx', (p) => p.x)
-            .attr('cy', (p) => p.y)
-            .style('stroke', 'none')
-            .attr(
-                'fill',
-                (p) => this.colorScale.scale(p.dataset.metadata().name));
-        ptsSelection.exit().remove();
-        this.drawTooltips(ptsForTooltips, target);
-      } else {
-        hideTooltips();
-      }
-    });
-
-    pi.onPointerExit(hideTooltips);
-
-    return group;
-  }
-
-  private drawTooltips(
-      points: ChartHelpers.Point[], target: ChartHelpers.Point) {
-    // Formatters for value, step, and wall_time
-    this.scatterPlot.attr('opacity', 0);
-    let valueFormatter = ChartHelpers.multiscaleFormatter(
-        ChartHelpers.Y_TOOLTIP_FORMATTER_PRECISION);
-
-    let dist = (p: ChartHelpers.Point) =>
-        Math.pow(p.x - target.x, 2) + Math.pow(p.y - target.y, 2);
-    let closestDist = _.min(points.map(dist));
-
-    let valueSortMethod = this.scalarAccessor;
-    if (this.smoothingEnabled) {
-      valueSortMethod = this.smoothedAccessor;
-    }
-
-    if (this.tooltipSortingMethod === 'ascending') {
-      points = _.sortBy(points, (d) => valueSortMethod(d.datum, -1, d.dataset));
-    } else if (this.tooltipSortingMethod === 'descending') {
-      points = _.sortBy(points, (d) => valueSortMethod(d.datum, -1, d.dataset))
-                   .reverse();
-    } else if (this.tooltipSortingMethod === 'nearest') {
-      points = _.sortBy(points, dist);
-    } else {
-      // The 'default' sorting method maintains the order of names passed to
-      // setVisibleSeries(). However we reverse that order when defining the
-      // datasets. So we must call reverse again to restore the order.
-      points = points.slice(0).reverse();
-    }
-
-    let rows = this.tooltip.select('tbody')
-                   .html('')
-                   .selectAll('tr')
-                   .data(points)
-                   .enter()
-                   .append('tr');
-    // Grey out the point if any of the following are true:
-    // - The cursor is outside of the x-extent of the dataset
-    // - The point's y value is NaN
-    rows.classed('distant', (d) => {
-      let firstPoint = d.dataset.data()[0];
-      let lastPoint = _.last(d.dataset.data());
-      let firstX = this.xScale.scale(this.xAccessor(firstPoint, 0, d.dataset));
-      let lastX = this.xScale.scale(this.xAccessor(lastPoint, 0, d.dataset));
-      let s = this.smoothingEnabled ? d.datum.smoothed : d.datum.scalar;
-      return target.x < firstX || target.x > lastX || isNaN(s);
-    });
-    rows.classed('closest', (p) => dist(p) === closestDist);
-    // It is a bit hacky that we are manually applying the width to the swatch
-    // and the nowrap property to the text here. The reason is as follows:
-    // the style gets updated asynchronously by Polymer scopeSubtree observer.
-    // Which means we would get incorrect sizing information since the text
-    // would wrap by default. However, we need correct measurements so that
-    // we can stop the text from falling off the edge of the screen.
-    // therefore, we apply the size-critical styles directly.
-    rows.style('white-space', 'nowrap');
-    rows.append('td')
-        .append('span')
-        .classed('swatch', true)
-        .style(
-            'background-color',
-            (d) => this.colorScale.scale(d.dataset.metadata().name));
-    rows.append('td').text((d) => d.dataset.metadata().name);
-    if (this.smoothingEnabled) {
-      rows.append('td').text(
-          (d) => isNaN(d.datum.smoothed) ? 'NaN' :
-                                           valueFormatter(d.datum.smoothed));
-    }
-    rows.append('td').text(
-        (d) => isNaN(d.datum.scalar) ? 'NaN' : valueFormatter(d.datum.scalar));
-    rows.append('td').text((d) => ChartHelpers.stepFormatter(d.datum.step));
-    rows.append('td').text(
-        (d) => ChartHelpers.timeFormatter(d.datum.wall_time));
-    rows.append('td').text(
-        (d) => ChartHelpers.relativeFormatter(
-            ChartHelpers.relativeAccessor(d.datum, -1, d.dataset)));
-
-    // compute left position
-    let documentWidth = document.body.clientWidth;
-    let node: any = this.tooltip.node();
-    let parentRect = node.parentElement.getBoundingClientRect();
-    let nodeRect = node.getBoundingClientRect();
-    // prevent it from falling off the right side of the screen
-    let left = documentWidth - parentRect.left - nodeRect.width - 60, top = 0;
-
-    if (this.tooltipPosition === 'right') {
-      left = Math.min(parentRect.width, left);
-    } else {  // 'bottom'
-      left = Math.min(0, left);
-      top = parentRect.height + ChartHelpers.TOOLTIP_Y_PIXEL_OFFSET;
-    }
-
-    this.tooltip.style('transform', 'translate(' + left + 'px,' + top + 'px)');
-    this.tooltip.style('opacity', 1);
-  }
-
-  private findClosestPoint(
-      target: ChartHelpers.Point,
-      dataset: Plottable.Dataset): ChartHelpers.Point {
-    let points: ChartHelpers.Point[] = dataset.data().map((d, i) => {
-      let x = this.xAccessor(d, i, dataset);
-      let y = this.smoothingEnabled ? this.smoothedAccessor(d, i, dataset) :
-                                      this.scalarAccessor(d, i, dataset);
-      return {
-        x: this.xScale.scale(x),
-        y: this.yScale.scale(y),
-        datum: d,
-        dataset: dataset,
-      };
-    });
-    let idx: number =
-        _.sortedIndex(points, target, (p: ChartHelpers.Point) => p.x);
-    if (idx === points.length) {
-      return points[points.length - 1];
-    } else if (idx === 0) {
-      return points[0];
-    } else {
-      let prev = points[idx - 1];
-      let next = points[idx];
-      let prevDist = Math.abs(prev.x - target.x);
-      let nextDist = Math.abs(next.x - target.x);
-      return prevDist < nextDist ? prev : next;
-    }
-  }
-
-  private resmoothDataset(dataset: Plottable.Dataset) {
-    let data = dataset.data();
-    const smoothingWeight = this.smoothingWeight;
-    let last = data.length > 0 ? data[0].scalar : NaN;
-    data.forEach((d) => {
-      if (!_.isFinite(last)) {
-        d.smoothed = d.scalar;
-      } else {
-        // 1st-order IIR low-pass filter to attenuate the higher-
-        // frequency components of the time-series.
-        d.smoothed = last * smoothingWeight + (1 - smoothingWeight) * d.scalar;
-      }
-      last = d.smoothed;
-    });
-  }
-
-  private getDataset(name: string) {
-    if (this.name2datasets[name] === undefined) {
-      this.name2datasets[name] = new Plottable.Dataset([], {name: name});
-    }
-    return this.name2datasets[name];
-  }
-
-  static getYScaleFromType(yScaleType: string):
-      Plottable.QuantitativeScale<number> {
-    if (yScaleType === 'log') {
-      return new Plottable.Scales.ModifiedLog();
-    } else if (yScaleType === 'linear') {
-      return new Plottable.Scales.Linear();
-    } else {
-      throw new Error('Unrecognized yScale type ' + yScaleType);
-    }
-  }
-
-  /**
-   * Update the selected series on the chart.
-   */
-  public setVisibleSeries(names: string[]) {
-    names = names.sort();
-    this.seriesNames = names;
-
-    names.reverse();  // draw first series on top
-    this.datasets.forEach((d) => d.offUpdate(this.onDatasetChanged));
-    this.datasets = names.map((r) => this.getDataset(r));
-    this.datasets.forEach((d) => d.onUpdate(this.onDatasetChanged));
-    this.linePlot.datasets(this.datasets);
-
-    if (this.smoothingEnabled) {
-      this.smoothLinePlot.datasets(this.datasets);
-    }
-    this.updateSpecialDatasets();
-  }
-
-  /**
-   * Set the data of a series on the chart.
-   */
-  public setSeriesData(name: string, data: ChartHelpers.ScalarDatum[]) {
-    this.getDataset(name).data(data);
-  }
-
-  public smoothingUpdate(weight: number) {
-    this.smoothingWeight = weight;
-    this.datasets.forEach((d) => this.resmoothDataset(d));
-
-    if (!this.smoothingEnabled) {
-      this.linePlot.addClass('ghost');
-      this.scatterPlot.y(this.smoothedAccessor, this.yScale);
-      this.smoothingEnabled = true;
-      this.smoothLinePlot.datasets(this.datasets);
-    }
-
-    this.updateSpecialDatasetsWithAccessor(this.smoothedAccessor);
-  }
-
-  public smoothingDisable() {
-    if (this.smoothingEnabled) {
-      this.linePlot.removeClass('ghost');
-      this.scatterPlot.y(this.scalarAccessor, this.yScale);
-      this.smoothLinePlot.datasets([]);
-      this.smoothingEnabled = false;
-      this.updateSpecialDatasetsWithAccessor(this.scalarAccessor);
-    }
-  }
-
-  public setTooltipSortingMethod(method: string) {
-    this.tooltipSortingMethod = method;
-  }
-
-  public setTooltipPosition(position: string) {
-    this.tooltipPosition = position;
-  }
-
-  public renderTo(targetSVG: d3.Selection<any, any, any, any>) {
-    this.targetSVG = targetSVG;
-    this.outer.renderTo(targetSVG);
-  }
-
-  public redraw() {
-    this.outer.redraw();
-  }
-
-  public destroy() {
-    this.outer.destroy();
-  }
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/BUILD b/tensorflow/tensorboard/components/vz_projector/BUILD
deleted file mode 100644
index acc1312a944c4f2ce87789091ff032e05d9b0804..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/BUILD
+++ /dev/null
@@ -1,110 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "vz_projector",
-    srcs = [
-        "analyticsLogger.ts",
-        "bundle.html",
-        "data.ts",
-        "data-provider.ts",
-        "data-provider-demo.ts",
-        "data-provider-proto.ts",
-        "data-provider-server.ts",
-        "external.d.ts",
-        "knn.ts",
-        "label.ts",
-        "logging.ts",
-        "projectorEventContext.ts",
-        "projectorScatterPlotAdapter.ts",
-        "renderContext.ts",
-        "scatterPlot.ts",
-        "scatterPlotRectangleSelector.ts",
-        "scatterPlotVisualizer.ts",
-        "scatterPlotVisualizer3DLabels.ts",
-        "scatterPlotVisualizerCanvasLabels.ts",
-        "scatterPlotVisualizerPolylines.ts",
-        "scatterPlotVisualizerSprites.ts",
-        "styles.html",
-        "util.ts",
-        "vector.ts",
-        "vz-projector.html",
-        "vz-projector.ts",
-        "vz-projector-app.html",
-        "vz-projector-bookmark-panel.html",
-        "vz-projector-bookmark-panel.ts",
-        "vz-projector-colab.html",
-        "vz-projector-dashboard.html",
-        "vz-projector-data-panel.html",
-        "vz-projector-data-panel.ts",
-        "vz-projector-input.html",
-        "vz-projector-input.ts",
-        "vz-projector-inspector-panel.html",
-        "vz-projector-inspector-panel.ts",
-        "vz-projector-legend.html",
-        "vz-projector-legend.ts",
-        "vz-projector-metadata-card.html",
-        "vz-projector-metadata-card.ts",
-        "vz-projector-projections-panel.html",
-        "vz-projector-projections-panel.ts",
-        "vz-projector-util.ts",
-    ],
-    path = "/vz-projector",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":bh_tsne",
-        ":heap",
-        ":sptree",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:numericjs",
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "//tensorflow/tensorboard/components/tf_imports:threejs",
-        "//tensorflow/tensorboard/components/tf_imports:weblas",
-        "@org_polymer_iron_collapse",
-        "@org_polymer_iron_icons",
-        "@org_polymer_paper_button",
-        "@org_polymer_paper_checkbox",
-        "@org_polymer_paper_dialog",
-        "@org_polymer_paper_dialog_scrollable",
-        "@org_polymer_paper_dropdown_menu",
-        "@org_polymer_paper_icon_button",
-        "@org_polymer_paper_input",
-        "@org_polymer_paper_item",
-        "@org_polymer_paper_listbox",
-        "@org_polymer_paper_slider",
-        "@org_polymer_paper_spinner",
-        "@org_polymer_paper_styles",
-        "@org_polymer_paper_toast",
-        "@org_polymer_paper_toggle_button",
-        "@org_polymer_paper_tooltip",
-    ],
-)
-
-ts_web_library(
-    name = "heap",
-    srcs = ["heap.ts"],
-    path = "/vz-projector",
-)
-
-ts_web_library(
-    name = "sptree",
-    srcs = ["sptree.ts"],
-    path = "/vz-projector",
-)
-
-ts_web_library(
-    name = "bh_tsne",
-    srcs = ["bh_tsne.ts"],
-    path = "/vz-projector",
-    deps = [":sptree"],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/vz_projector/analyticsLogger.ts b/tensorflow/tensorboard/components/vz_projector/analyticsLogger.ts
deleted file mode 100644
index aa1f86927da7958d7968c029e39b6b35a714df75..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/analyticsLogger.ts
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-import {ProjectionType} from './data';
-
-export class AnalyticsLogger {
-  private eventLogging: boolean;
-  private pageViewLogging: boolean;
-
-  /**
-   * Constructs an event logger using Google Analytics. It assumes there is a
-   * Google Analytics script added to the page elsewhere. If there is no such
-   * script, the logger acts as a no-op.
-   *
-   * @param pageViewLogging Whether to log page views.
-   * @param eventLogging Whether to log user interaction.
-   */
-  constructor(pageViewLogging: boolean, eventLogging: boolean) {
-    if (typeof ga === 'undefined' || ga == null) {
-      this.eventLogging = false;
-      this.pageViewLogging = false;
-      return;
-    }
-    this.eventLogging = eventLogging;
-    this.pageViewLogging = pageViewLogging;
-  }
-
-  logPageView(pageTitle: string) {
-    if (this.pageViewLogging) {
-      // Always send a page view.
-      ga('send', {hitType: 'pageview', page: `/v/${pageTitle}`});
-    }
-  }
-
-  logProjectionChanged(projection: ProjectionType) {
-    if (this.eventLogging) {
-      ga('send', {
-        hitType: 'event',
-        eventCategory: 'Projection',
-        eventAction: 'click',
-        eventLabel: projection
-      });
-    }
-  }
-
-  logWebGLDisabled() {
-    if (this.eventLogging) {
-      ga('send', {
-        hitType: 'event',
-        eventCategory: 'Error',
-        eventAction: 'PageLoad',
-        eventLabel: 'WebGL_disabled'
-      });
-    }
-  }
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/bh_tsne.ts b/tensorflow/tensorboard/components/vz_projector/bh_tsne.ts
deleted file mode 100644
index 063d57ec401d196827ce978dc64d4121a9c5edb3..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/bh_tsne.ts
+++ /dev/null
@@ -1,473 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/**
- * This is a fork of the Karpathy's TSNE.js (original license below).
- * This fork implements Barnes-Hut approximation and runs in O(NlogN)
- * time, as opposed to the Karpathy's O(N^2) version.
- *
- * @author smilkov@google.com (Daniel Smilkov)
- */
-
-/**
- * @license
- * The MIT License (MIT)
- * Copyright (c) 2015 Andrej Karpathy
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-import {SPNode, SPTree} from './sptree';
-
-type AugmSPNode = SPNode&{numCells: number, yCell: number[], rCell: number};
-
-/**
- * Barnes-hut approximation level. Higher means more approximation and faster
- * results. Recommended value mentioned in the paper is 0.8.
- */
-const THETA = 0.8;
-
-const MIN_POSSIBLE_PROB = 1E-9;
-
-// Variables used for memorizing the second random number since running
-// gaussRandom() generates two random numbers at the cost of 1 atomic
-// computation. This optimization results in 2X speed-up of the generator.
-let return_v = false;
-let v_val = 0.0;
-
-/** Returns the square euclidean distance between two vectors. */
-export function dist2(a: number[], b: number[]): number {
-  if (a.length !== b.length) {
-    throw new Error('Vectors a and b must be of same length');
-  }
-
-  let result = 0;
-  for (let i = 0; i < a.length; ++i) {
-    let diff = a[i] - b[i];
-    result += diff * diff;
-  }
-  return result;
-}
-
-/** Returns the square euclidean distance between two 2D points. */
-export function dist2_2D(a: number[], b: number[]): number {
-  let dX = a[0] - b[0];
-  let dY = a[1] - b[1];
-  return dX * dX + dY * dY;
-}
-
-/** Returns the square euclidean distance between two 3D points. */
-export function dist2_3D(a: number[], b: number[]): number {
-  let dX = a[0] - b[0];
-  let dY = a[1] - b[1];
-  let dZ = a[2] - b[2];
-  return dX * dX + dY * dY + dZ * dZ;
-}
-
-function gaussRandom(rng: () => number): number {
-  if (return_v) {
-    return_v = false;
-    return v_val;
-  }
-  let u = 2 * rng() - 1;
-  let v = 2 * rng() - 1;
-  let r = u * u + v * v;
-  if (r === 0 || r > 1) {
-    return gaussRandom(rng);
-  }
-  let c = Math.sqrt(-2 * Math.log(r) / r);
-  v_val = v * c;  // cache this for next function call for efficiency
-  return_v = true;
-  return u * c;
-};
-
-// return random normal number
-function randn(rng: () => number, mu: number, std: number) {
-  return mu + gaussRandom(rng) * std;
-};
-
-// utilitity that creates contiguous vector of zeros of size n
-function zeros(n: number): Float64Array {
-  return new Float64Array(n);
-};
-
-// utility that returns a matrix filled with random numbers
-// generated by the provided generator.
-function randnMatrix(n: number, d: number, rng: () => number) {
-  let nd = n * d;
-  let x = zeros(nd);
-  for (let i = 0; i < nd; ++i) {
-    x[i] = randn(rng, 0.0, 1E-4);
-  }
-  return x;
-};
-
-// utility that returns a matrix filled with the provided value.
-function arrayofs(n: number, d: number, val: number) {
-  let x: number[][] = [];
-  for (let i = 0; i < n; ++i) {
-    x.push(d === 3 ? [val, val, val] : [val, val]);
-  }
-  return x;
-};
-
-// compute (p_{i|j} + p_{j|i})/(2n)
-function nearest2P(
-    nearest: {index: number, dist: number}[][], perplexity: number,
-    tol: number) {
-  let N = nearest.length;
-  let Htarget = Math.log(perplexity);  // target entropy of distribution
-  let P = zeros(N * N);                // temporary probability matrix
-  let K = nearest[0].length;
-  let pRow: number[] = new Array(K);  // pij[].
-
-  for (let i = 0; i < N; ++i) {
-    let neighbors = nearest[i];
-    let betaMin = -Infinity;
-    let betaMax = Infinity;
-    let beta = 1;  // initial value of precision
-    let maxTries = 50;
-
-    // perform binary search to find a suitable precision beta
-    // so that the entropy of the distribution is appropriate
-    let numTries = 0;
-    while (true) {
-      // compute entropy and kernel row with beta precision
-      let psum = 0.0;
-      for (let k = 0; k < neighbors.length; ++k) {
-        let neighbor = neighbors[k];
-        let pij = (i === neighbor.index) ? 0 : Math.exp(-neighbor.dist * beta);
-        pij = Math.max(pij, MIN_POSSIBLE_PROB);
-        pRow[k] = pij;
-        psum += pij;
-      }
-      // normalize p and compute entropy
-      let Hhere = 0.0;
-      for (let k = 0; k < pRow.length; ++k) {
-        pRow[k] /= psum;
-        let pij = pRow[k];
-        if (pij > 1E-7) {
-          Hhere -= pij * Math.log(pij);
-        };
-      }
-
-      // adjust beta based on result
-      if (Hhere > Htarget) {
-        // entropy was too high (distribution too diffuse)
-        // so we need to increase the precision for more peaky distribution
-        betaMin = beta;  // move up the bounds
-        if (betaMax === Infinity) {
-          beta = beta * 2;
-        } else {
-          beta = (beta + betaMax) / 2;
-        }
-
-      } else {
-        // converse case. make distrubtion less peaky
-        betaMax = beta;
-        if (betaMin === -Infinity) {
-          beta = beta / 2;
-        } else {
-          beta = (beta + betaMin) / 2;
-        }
-      }
-      numTries++;
-      // stopping conditions: too many tries or got a good precision
-      if (numTries >= maxTries || Math.abs(Hhere - Htarget) < tol) {
-        break;
-      }
-    }
-
-    // copy over the final prow to P at row i
-    for (let k = 0; k < pRow.length; ++k) {
-      let pij = pRow[k];
-      let j = neighbors[k].index;
-      P[i * N + j] = pij;
-    }
-  }  // end loop over examples i
-
-  // symmetrize P and normalize it to sum to 1 over all ij
-  let N2 = N * 2;
-  for (let i = 0; i < N; ++i) {
-    for (let j = i + 1; j < N; ++j) {
-      let i_j = i * N + j;
-      let j_i = j * N + i;
-      let value = (P[i_j] + P[j_i]) / N2;
-      P[i_j] = value;
-      P[j_i] = value;
-    }
-  }
-  return P;
-};
-
-// helper function
-function sign(x: number) {
-  return x > 0 ? 1 : x < 0 ? -1 : 0;
-}
-
-function computeForce_2d(
-    force: number[], mult: number, pointA: number[], pointB: number[]) {
-  force[0] += mult * (pointA[0] - pointB[0]);
-  force[1] += mult * (pointA[1] - pointB[1]);
-}
-
-function computeForce_3d(
-    force: number[], mult: number, pointA: number[], pointB: number[]) {
-  force[0] += mult * (pointA[0] - pointB[0]);
-  force[1] += mult * (pointA[1] - pointB[1]);
-  force[2] += mult * (pointA[2] - pointB[2]);
-}
-
-export interface TSNEOptions {
-  /** How many dimensions. */
-  dim: number;
-  /** Roughly how many neighbors each point influences. */
-  perplexity?: number;
-  /** Learning rate. */
-  epsilon?: number;
-  /** A random number generator. */
-  rng?: () => number;
-}
-
-export class TSNE {
-  private perplexity: number;
-  private epsilon: number;
-  /** Random generator */
-  private rng: () => number;
-  private iter = 0;
-  private Y: Float64Array;
-  private N: number;
-  private P: Float64Array;
-  private gains: number[][];
-  private ystep: number[][];
-  private nearest: {index: number, dist: number}[][];
-  private dim: number;
-  private dist2: (a: number[], b: number[]) => number;
-  private computeForce:
-      (force: number[], mult: number, pointA: number[],
-       pointB: number[]) => void;
-
-  constructor(opt: TSNEOptions) {
-    opt = opt || {dim: 2};
-    this.perplexity = opt.perplexity || 30;
-    this.epsilon = opt.epsilon || 10;
-    this.rng = opt.rng || Math.random;
-    this.dim = opt.dim;
-    if (opt.dim === 2) {
-      this.dist2 = dist2_2D;
-      this.computeForce = computeForce_2d;
-    } else if (opt.dim === 3) {
-      this.dist2 = dist2_3D;
-      this.computeForce = computeForce_3d;
-    } else {
-      throw new Error('Only 2D and 3D is supported');
-    }
-  }
-
-  // this function takes a fattened distance matrix and creates
-  // matrix P from them.
-  // D is assumed to be provided as an array of size N^2.
-  initDataDist(nearest: {index: number, dist: number}[][]) {
-    let N = nearest.length;
-    this.nearest = nearest;
-    this.P = nearest2P(nearest, this.perplexity, 1E-4);
-    this.N = N;
-    this.initSolution();  // refresh this
-  }
-
-  // (re)initializes the solution to random
-  initSolution() {
-    // generate random solution to t-SNE
-    this.Y = randnMatrix(this.N, this.dim, this.rng);  // the solution
-    this.gains = arrayofs(this.N, this.dim, 1.0);      // step gains
-    // to accelerate progress in unchanging directions
-    this.ystep = arrayofs(this.N, this.dim, 0.0);  // momentum accumulator
-    this.iter = 0;
-  }
-
-  // return pointer to current solution
-  getSolution() { return this.Y; }
-
-  // perform a single step of optimization to improve the embedding
-  step() {
-    this.iter += 1;
-    let N = this.N;
-
-    let grad = this.costGrad(this.Y);  // evaluate gradient
-
-    // perform gradient step
-    let ymean = this.dim === 3 ? [0, 0, 0] : [0, 0];
-    for (let i = 0; i < N; ++i) {
-      for (let d = 0; d < this.dim; ++d) {
-        let gid = grad[i][d];
-        let sid = this.ystep[i][d];
-        let gainid = this.gains[i][d];
-
-        // compute gain update
-        let newgain = sign(gid) === sign(sid) ? gainid * 0.8 : gainid + 0.2;
-        if (newgain < 0.01) {
-          newgain = 0.01;  // clamp
-        }
-        this.gains[i][d] = newgain;  // store for next turn
-
-        // compute momentum step direction
-        let momval = this.iter < 250 ? 0.5 : 0.8;
-        let newsid = momval * sid - this.epsilon * newgain * grad[i][d];
-        this.ystep[i][d] = newsid;  // remember the step we took
-
-        // step!
-        let i_d = i * this.dim + d;
-        this.Y[i_d] += newsid;
-        ymean[d] += this.Y[i_d];  // accumulate mean so that we
-                                  // can center later
-      }
-    }
-
-    // reproject Y to be zero mean
-    for (let i = 0; i < N; ++i) {
-      for (let d = 0; d < this.dim; ++d) {
-        this.Y[i * this.dim + d] -= ymean[d] / N;
-      }
-    }
-  }
-
-  // return cost and gradient, given an arrangement
-  costGrad(Y: Float64Array): number[][] {
-    let N = this.N;
-    let P = this.P;
-
-    // Trick that helps with local optima.
-    let alpha = this.iter < 100 ? 4 : 1;
-
-    // Make data for the SP tree.
-    let points: number[][] = new Array(N);  // (x, y)[]
-    for (let i = 0; i < N; ++i) {
-      let iTimesD = i * this.dim;
-      let row = new Array(this.dim);
-      for (let d = 0; d < this.dim; ++d) {
-        row[d] = Y[iTimesD + d];
-      }
-      points[i] = row;
-    }
-
-    // Make a tree.
-    let tree = new SPTree(points);
-    let root = tree.root as AugmSPNode;
-    // Annotate the tree.
-
-    let annotateTree =
-        (node: AugmSPNode): {numCells: number, yCell: number[]} => {
-          let numCells = 1;
-          if (node.children == null) {
-            // Update the current node and tell the parent.
-            node.numCells = numCells;
-            node.yCell = node.point;
-            return {numCells, yCell: node.yCell};
-          }
-          // node.point is a 2 or 3-dim number[], so slice() makes a copy.
-          let yCell = node.point.slice();
-          for (let i = 0; i < node.children.length; ++i) {
-            let child = node.children[i];
-            if (child == null) {
-              continue;
-            }
-            let result = annotateTree(child as AugmSPNode);
-            numCells += result.numCells;
-            for (let d = 0; d < this.dim; ++d) {
-              yCell[d] += result.yCell[d];
-            }
-          }
-          // Update the node and tell the parent.
-          node.numCells = numCells;
-          node.yCell = yCell.map(v => v / numCells);
-          return {numCells, yCell};
-        };
-
-    // Augment the tree with more info.
-    annotateTree(root);
-    tree.visit((node: AugmSPNode, low: number[], high: number[]) => {
-      node.rCell = high[0] - low[0];
-      return false;
-    });
-    // compute current Q distribution, unnormalized first
-    let grad: number[][] = [];
-    let Z = 0;
-    let forces: [number[], number[]][] = new Array(N);
-    for (let i = 0; i < N; ++i) {
-      let pointI = points[i];
-      // Compute the positive forces for the i-th node.
-      let Fpos = this.dim === 3 ? [0, 0, 0] : [0, 0];
-      let neighbors = this.nearest[i];
-      for (let k = 0; k < neighbors.length; ++k) {
-        let j = neighbors[k].index;
-        let pij = P[i * N + j];
-        let pointJ = points[j];
-        let squaredDistItoJ = this.dist2(pointI, pointJ);
-        let premult = pij / (1 + squaredDistItoJ);
-        this.computeForce(Fpos, premult, pointI, pointJ);
-      }
-      // Compute the negative forces for the i-th node.
-      let FnegZ = this.dim === 3 ? [0, 0, 0] : [0, 0];
-      tree.visit((node: AugmSPNode) => {
-        let squaredDistToCell = this.dist2(pointI, node.yCell);
-        // Squared distance from point i to cell.
-        if (node.children == null ||
-            (squaredDistToCell > 0 &&
-             node.rCell / Math.sqrt(squaredDistToCell) < THETA)) {
-          let qijZ = 1 / (1 + squaredDistToCell);
-          let dZ = node.numCells * qijZ;
-          Z += dZ;
-          dZ *= qijZ;
-          this.computeForce(FnegZ, dZ, pointI, node.yCell);
-          return true;
-        }
-        // Cell is too close to approximate.
-        let squaredDistToPoint = this.dist2(pointI, node.point);
-        let qijZ = 1 / (1 + squaredDistToPoint);
-        Z += qijZ;
-        qijZ *= qijZ;
-        this.computeForce(FnegZ, qijZ, pointI, node.point);
-        return false;
-      }, true);
-      forces[i] = [Fpos, FnegZ];
-    }
-    // Normalize the negative forces and compute the gradient.
-    const A = 4 * alpha;
-    const B = 4 / Z;
-    for (let i = 0; i < N; ++i) {
-      let [FPos, FNegZ] = forces[i];
-      let gsum = new Array(this.dim);
-      for (let d = 0; d < this.dim; ++d) {
-        gsum[d] = A * FPos[d] - B * FNegZ[d];
-      }
-      grad.push(gsum);
-    }
-    return grad;
-  }
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/bundle.html b/tensorflow/tensorboard/components/vz_projector/bundle.html
deleted file mode 100644
index f5a25230a0bb0a857751c88c9fac7057cc747c79..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/bundle.html
+++ /dev/null
@@ -1,48 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-imports/d3.html">
-<link rel="import" href="../tf-imports/numericjs.html">
-<link rel="import" href="../tf-imports/threejs.html">
-<link rel="import" href="../tf-imports/weblas.html">
-
-<script src="heap.js"></script>
-<script src="label.js"></script>
-<script src="sptree.js"></script>
-<script src="bh_tsne.js"></script>
-<script src="logging.js"></script>
-<script src="renderContext.js"></script>
-<script src="scatterPlotRectangleSelector.js"></script>
-<script src="analyticsLogger.js"></script>
-<script src="util.js"></script>
-<script src="vector.js"></script>
-<script src="knn.js"></script>
-<script src="data.js"></script>
-<script src="data-provider.js"></script>
-<script src="data-provider-demo.js"></script>
-<script src="data-provider-proto.js"></script>
-<script src="data-provider-server.js"></script>
-<script src="projectorEventContext.js"></script>
-<script src="scatterPlot.js"></script>
-<script src="scatterPlotVisualizer3DLabels.js"></script>
-<script src="scatterPlotVisualizerCanvasLabels.js"></script>
-<script src="scatterPlotVisualizerPolylines.js"></script>
-<script src="scatterPlotVisualizerSprites.js"></script>
-<script src="scatterPlotVisualizer.js"></script>
-<script src="projectorScatterPlotAdapter.js"></script>
-<script src="vz-projector-util.js"></script>
diff --git a/tensorflow/tensorboard/components/vz_projector/data-provider-demo.ts b/tensorflow/tensorboard/components/vz_projector/data-provider-demo.ts
deleted file mode 100644
index 1410a84a8e4ee844eab76eb0c7d55aec70b9774f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/data-provider-demo.ts
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {DataSet, SpriteAndMetadataInfo, State} from './data';
-import {ProjectorConfig, DataProvider, EmbeddingInfo, TENSORS_MSG_ID} from './data-provider';
-import * as dataProvider from './data-provider';
-import * as logging from './logging';
-
-const BYTES_EXTENSION = '.bytes';
-
-/** Data provider that loads data from a demo folder. */
-export class DemoDataProvider implements DataProvider {
-  private projectorConfigPath: string;
-  private projectorConfig: ProjectorConfig;
-
-  constructor(projectorConfigPath: string) {
-    this.projectorConfigPath = projectorConfigPath;
-  }
-
-  private getEmbeddingInfo(tensorName: string): EmbeddingInfo {
-    let embeddings = this.projectorConfig.embeddings;
-    for (let i = 0; i < embeddings.length; i++) {
-      let embedding = embeddings[i];
-      if (embedding.tensorName === tensorName) {
-        return embedding;
-      }
-    }
-    return null;
-  }
-
-  retrieveRuns(callback: (runs: string[]) => void): void {
-    callback(['Demo']);
-  }
-
-  retrieveProjectorConfig(run: string, callback: (d: ProjectorConfig) => void)
-      : void {
-    const msgId = logging.setModalMessage('Fetching projector config...');
-
-    const xhr = new XMLHttpRequest();
-    xhr.open('GET', this.projectorConfigPath);
-    xhr.onerror = (err) => {
-      let errorMessage = err.message;
-      // If the error is a valid XMLHttpResponse, it's possible this is a
-      // cross-origin error.
-      if (xhr.responseText != null) {
-        errorMessage = 'Cannot fetch projector config, possibly a ' +
-            'Cross-Origin request error.';
-      }
-      logging.setErrorMessage(errorMessage, 'fetching projector config');
-    };
-    xhr.onload = () => {
-      const projectorConfig = JSON.parse(xhr.responseText) as ProjectorConfig;
-      logging.setModalMessage(null, msgId);
-      this.projectorConfig = projectorConfig;
-      callback(projectorConfig);
-    };
-    xhr.send();
-  }
-
-  retrieveTensor(run: string, tensorName: string,
-      callback: (ds: DataSet) => void) {
-    let embedding = this.getEmbeddingInfo(tensorName);
-    let url = `${embedding.tensorPath}`;
-    if (embedding.tensorPath.substr(-1 * BYTES_EXTENSION.length) ===
-        BYTES_EXTENSION) {
-      dataProvider.retrieveTensorAsBytes(
-          this, this.getEmbeddingInfo(tensorName), run, tensorName, url,
-          callback);
-    } else {
-      logging.setModalMessage('Fetching tensors...', TENSORS_MSG_ID);
-      const request = new XMLHttpRequest();
-      request.open('GET', url);
-      request.responseType = 'arraybuffer';
-
-      request.onerror = () => {
-        logging.setErrorMessage(request.responseText, 'fetching tensors');
-      };
-      request.onload = () => {
-        dataProvider.parseTensors(request.response).then(points => {
-          callback(new DataSet(points));
-        });
-      };
-      request.send();
-    }
-  }
-
-  retrieveSpriteAndMetadata(run: string, tensorName: string,
-      callback: (r: SpriteAndMetadataInfo) => void) {
-    let embedding = this.getEmbeddingInfo(tensorName);
-    let spriteImagePath = null;
-    if (embedding.sprite && embedding.sprite.imagePath) {
-      spriteImagePath = embedding.sprite.imagePath;
-    }
-    dataProvider.retrieveSpriteAndMetadataInfo(
-        embedding.metadataPath, spriteImagePath, embedding.sprite, callback);
-  }
-
-  getBookmarks(
-      run: string, tensorName: string, callback: (r: State[]) => void) {
-    let embedding = this.getEmbeddingInfo(tensorName);
-    let msgId = logging.setModalMessage('Fetching bookmarks...');
-
-    const xhr = new XMLHttpRequest();
-    xhr.open('GET', embedding.bookmarksPath);
-    xhr.onerror = (err) => {
-      logging.setErrorMessage(xhr.responseText);
-    };
-    xhr.onload = () => {
-      const bookmarks = JSON.parse(xhr.responseText) as State[];
-      logging.setModalMessage(null, msgId);
-      callback(bookmarks);
-    };
-    xhr.send();
-  }
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/data-provider-proto.ts b/tensorflow/tensorboard/components/vz_projector/data-provider-proto.ts
deleted file mode 100644
index 67124a9232315a72bbce728878675d2053a9fd0a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/data-provider-proto.ts
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {DataPoint, DataProto, DataSet, SpriteAndMetadataInfo, PointMetadata, State} from './data';
-import {analyzeMetadata, ProjectorConfig, DataProvider} from './data-provider';
-
-
-export class ProtoDataProvider implements DataProvider {
-  private dataProto: DataProto;
-
-  constructor(dataProto: DataProto) {
-    this.dataProto = dataProto;
-  }
-
-  retrieveRuns(callback: (runs: string[]) => void): void {
-    callback(['proto']);
-  }
-
-  retrieveProjectorConfig(run: string, callback: (d: ProjectorConfig) => void) {
-    callback({
-      modelCheckpointPath: 'proto',
-      embeddings: [{
-        tensorName: 'proto',
-        tensorShape: this.dataProto.shape,
-        metadataPath: 'proto'
-      }]
-    });
-  }
-
-  retrieveTensor(run: string, tensorName: string,
-      callback: (ds: DataSet) => void) {
-    callback(this.flatArrayToDataset(this.dataProto.tensor));
-  }
-
-  retrieveSpriteAndMetadata(run: string, tensorName: string,
-      callback: (r: SpriteAndMetadataInfo) => void): void {
-    let columnNames = this.dataProto.metadata.columns.map(c => c.name);
-    let n = this.dataProto.shape[0];
-    let pointsMetadata: PointMetadata[] = new Array(n);
-    this.dataProto.metadata.columns.forEach(c => {
-      let values = c.numericValues || c.stringValues;
-      for (let i = 0; i < n; i++) {
-        pointsMetadata[i] = pointsMetadata[i] || {};
-        pointsMetadata[i][c.name] = values[i];
-      }
-    });
-    callback({
-      stats: analyzeMetadata(columnNames, pointsMetadata),
-      pointsInfo: pointsMetadata
-    });
-  }
-
-  getBookmarks(run: string, tensorName: string,
-      callback: (r: State[]) => void): void {
-    return callback([]);
-  }
-
-  private flatArrayToDataset(tensor: number[]): DataSet {
-    let points: DataPoint[] = [];
-    let n = this.dataProto.shape[0];
-    let d = this.dataProto.shape[1];
-    if (n * d !== tensor.length) {
-      throw 'The shape doesn\'t match the length of the flattened array';
-    }
-    for (let i = 0; i < n; i++) {
-      let offset = i * d;
-      points.push({
-        vector: new Float32Array(tensor.slice(offset, offset + d)),
-        metadata: {},
-        projections: null,
-        index: i
-      });
-    }
-    return new DataSet(points);
-  }
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/data-provider-server.ts b/tensorflow/tensorboard/components/vz_projector/data-provider-server.ts
deleted file mode 100644
index 02720ebf6a7cbbf68de64070fd9e9293dbc7300a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/data-provider-server.ts
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {DataSet, SpriteAndMetadataInfo, State} from './data';
-import * as dataProvider from './data-provider';
-import {DataProvider, EmbeddingInfo, ProjectorConfig} from './data-provider';
-import * as logging from './logging';
-
-// Limit for the number of data points we receive from the server.
-export const LIMIT_NUM_POINTS = 100000;
-
-/**
- * Data provider that loads data provided by a python server (usually backed
- * by a checkpoint file).
- */
-export class ServerDataProvider implements DataProvider {
-  private routePrefix: string;
-  private runProjectorConfigCache: {[run: string]: ProjectorConfig} = {};
-
-  constructor(routePrefix: string) {
-    this.routePrefix = routePrefix;
-  }
-
-  private getEmbeddingInfo(run: string, tensorName: string,
-      callback: (e: EmbeddingInfo) => void): void {
-    this.retrieveProjectorConfig(run, config => {
-      const embeddings = config.embeddings;
-      for (let i = 0; i < embeddings.length; i++) {
-        const embedding = embeddings[i];
-        if (embedding.tensorName === tensorName) {
-          callback(embedding);
-          return;
-        }
-      }
-      callback(null);
-    });
-  }
-
-  retrieveRuns(callback: (runs: string[]) => void): void {
-    const msgId = logging.setModalMessage('Fetching runs...');
-
-    const xhr = new XMLHttpRequest();
-    xhr.open('GET', `${this.routePrefix}/runs`);
-    xhr.onerror = (err) => {
-      logging.setErrorMessage(xhr.responseText, 'fetching runs');
-    };
-    xhr.onload = () => {
-      const runs = JSON.parse(xhr.responseText);
-      logging.setModalMessage(null, msgId);
-      callback(runs);
-    };
-    xhr.send();
-  }
-
-  retrieveProjectorConfig(run: string, callback: (d: ProjectorConfig) => void)
-      : void {
-    if (run in this.runProjectorConfigCache) {
-      callback(this.runProjectorConfigCache[run]);
-      return;
-    }
-
-    const msgId = logging.setModalMessage('Fetching projector config...');
-
-    const xhr = new XMLHttpRequest();
-    xhr.open('GET', `${this.routePrefix}/info?run=${run}`);
-    xhr.onerror = (err) => {
-      logging.setErrorMessage(xhr.responseText, 'fetching projector config');
-    };
-    xhr.onload = () => {
-      const config = JSON.parse(xhr.responseText) as ProjectorConfig;
-      logging.setModalMessage(null, msgId);
-      this.runProjectorConfigCache[run] = config;
-      callback(config);
-    };
-    xhr.send();
-  }
-
-  retrieveTensor(run: string, tensorName: string,
-      callback: (ds: DataSet) => void) {
-    this.getEmbeddingInfo(run, tensorName, embedding => {
-      dataProvider.retrieveTensorAsBytes(
-          this, embedding, run, tensorName,
-          `${this.routePrefix}/tensor?run=${run}&name=${tensorName}` +
-              `&num_rows=${LIMIT_NUM_POINTS}`,
-          callback);
-    });
-  }
-
-  retrieveSpriteAndMetadata(run: string, tensorName: string,
-      callback: (r: SpriteAndMetadataInfo) => void) {
-    this.getEmbeddingInfo(run, tensorName, embedding => {
-      let metadataPath = null;
-      if (embedding.metadataPath) {
-        metadataPath =
-            `${this.routePrefix}/metadata?` +
-            `run=${run}&name=${tensorName}&num_rows=${LIMIT_NUM_POINTS}`;
-      }
-      let spriteImagePath = null;
-      if (embedding.sprite && embedding.sprite.imagePath) {
-        spriteImagePath =
-            `${this.routePrefix}/sprite_image?run=${run}&name=${tensorName}`;
-      }
-      dataProvider.retrieveSpriteAndMetadataInfo(metadataPath, spriteImagePath,
-          embedding.sprite, callback);
-    });
-  }
-
-  getBookmarks(
-      run: string, tensorName: string, callback: (r: State[]) => void) {
-    const msgId = logging.setModalMessage('Fetching bookmarks...');
-
-    const xhr = new XMLHttpRequest();
-    xhr.open(
-        'GET', `${this.routePrefix}/bookmarks?run=${run}&name=${tensorName}`);
-    xhr.onerror = (err) => {
-      logging.setErrorMessage(xhr.responseText, 'fetching bookmarks');
-    };
-    xhr.onload = () => {
-      logging.setModalMessage(null, msgId);
-      const bookmarks = JSON.parse(xhr.responseText);
-      callback(bookmarks);
-    };
-    xhr.send();
-  }
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/data-provider.ts b/tensorflow/tensorboard/components/vz_projector/data-provider.ts
deleted file mode 100644
index c8eede798c670372e334e4a89e677162055d397e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/data-provider.ts
+++ /dev/null
@@ -1,429 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {ColumnStats, DataPoint, DataSet, SpriteAndMetadataInfo, PointMetadata, State} from './data';
-import * as logging from './logging';
-import {runAsyncTask} from './util';
-
-/** Maximum number of colors supported in the color map. */
-const NUM_COLORS_COLOR_MAP = 50;
-const MAX_SPRITE_IMAGE_SIZE_PX = 8192;
-
-export const METADATA_MSG_ID = 'metadata';
-export const TENSORS_MSG_ID = 'tensors';
-
-/** Matches the json format of `projector_config.proto` */
-export interface SpriteMetadata {
-  imagePath: string;
-  singleImageDim: [number, number];
-}
-
-/** Matches the json format of `projector_config.proto` */
-export interface EmbeddingInfo {
-  /** Name of the tensor. */
-  tensorName: string;
-  /** The shape of the tensor. */
-  tensorShape: [number, number];
-  /**
-   * The path to the tensors TSV file. If empty, it is assumed that the tensor
-   * is stored in the checkpoint file.
-   */
-  tensorPath?: string;
-  /** The path to the metadata file associated with the tensor. */
-  metadataPath?: string;
-  /** The path to the bookmarks file associated with the tensor. */
-  bookmarksPath?: string;
-  sprite?: SpriteMetadata;
-}
-
-/**
- * Matches the json format of `projector_config.proto`
- * This should be kept in sync with the code in vz-projector-data-panel which
- * holds a template for users to build a projector config JSON object from the
- * projector UI.
- */
-export interface ProjectorConfig {
-  embeddings: EmbeddingInfo[];
-  modelCheckpointPath?: string;
-}
-
-export type ServingMode = 'demo' | 'server' | 'proto';
-
-/** Interface between the data storage and the UI. */
-export interface DataProvider {
-  /** Returns a list of run names that have embedding config files. */
-  retrieveRuns(callback: (runs: string[]) => void): void;
-
-  /**
-   * Returns the projector configuration: number of tensors, their shapes,
-   * and their associated metadata files.
-   */
-  retrieveProjectorConfig(run: string,
-      callback: (d: ProjectorConfig) => void): void;
-
-  /** Fetches and returns the tensor with the specified name. */
-  retrieveTensor(run: string, tensorName: string,
-      callback: (ds: DataSet) => void);
-
-  /**
-   * Fetches the metadata for the specified tensor.
-   */
-  retrieveSpriteAndMetadata(run: string, tensorName: string,
-      callback: (r: SpriteAndMetadataInfo) => void): void;
-
-  getBookmarks(run: string, tensorName: string, callback: (r: State[]) => void):
-      void;
-}
-
-export function retrieveTensorAsBytes(
-    dp: DataProvider, embedding: EmbeddingInfo, run: string, tensorName: string,
-    tensorsPath: string, callback: (ds: DataSet) => void) {
-  // Get the tensor.
-  logging.setModalMessage('Fetching tensor values...', TENSORS_MSG_ID);
-  let xhr = new XMLHttpRequest();
-  xhr.open('GET', tensorsPath);
-  xhr.responseType = 'arraybuffer';
-  xhr.onprogress = (ev) => {
-    if (ev.lengthComputable) {
-      let percent = (ev.loaded * 100 / ev.total).toFixed(1);
-      logging.setModalMessage(
-          'Fetching tensor values: ' + percent + '%', TENSORS_MSG_ID);
-    }
-  };
-  xhr.onload = () => {
-    if (xhr.status !== 200) {
-      let msg = String.fromCharCode.apply(null, new Uint8Array(xhr.response));
-      logging.setErrorMessage(msg, 'fetching tensors');
-      return;
-    }
-    let data: Float32Array;
-    try {
-      data = new Float32Array(xhr.response);
-    } catch (e) {
-      logging.setErrorMessage(e, 'parsing tensor bytes');
-      return;
-    }
-
-    let dim = embedding.tensorShape[1];
-    let N = data.length / dim;
-    if (embedding.tensorShape[0] > N) {
-      logging.setWarningMessage(
-          `Showing the first ${N.toLocaleString()}` +
-          ` of ${embedding.tensorShape[0].toLocaleString()} data points`);
-    }
-    parseTensorsFromFloat32Array(data, dim).then(dataPoints => {
-      callback(new DataSet(dataPoints));
-    });
-  };
-  xhr.send();
-}
-
-export function parseRawTensors(
-    content: ArrayBuffer, callback: (ds: DataSet) => void) {
-  parseTensors(content).then(data => {
-    callback(new DataSet(data));
-  });
-}
-
-export function parseRawMetadata(
-    contents: ArrayBuffer, callback: (r: SpriteAndMetadataInfo) => void) {
-  parseMetadata(contents).then(result => callback(result));
-}
-
-/**
- * Parse an ArrayBuffer in a streaming fashion line by line (or custom delim).
- * Can handle very large files.
- *
- * @param content The array buffer.
- * @param callback The callback called on each line.
- * @param chunkSize The size of each read chunk, defaults to ~1MB. (optional)
- * @param delim The delimiter used to split a line, defaults to '\n'. (optional)
- * @returns A promise for when it is finished.
- */
-function streamParse(
-    content: ArrayBuffer, callback: (line: string) => void, chunkSize = 1000000,
-    delim = '\n'): Promise<void> {
-  return new Promise<void>((resolve, reject) => {
-    let offset = 0;
-    let bufferSize = content.byteLength - 1;
-    let data = '';
-
-    function readHandler(str) {
-      offset += chunkSize;
-      let parts = str.split(delim);
-      let first = data + parts[0];
-      if (parts.length === 1) {
-        data = first;
-        readChunk(offset, chunkSize);
-        return;
-      }
-      data = parts[parts.length - 1];
-      callback(first);
-      for (let i = 1; i < parts.length - 1; i++) {
-        callback(parts[i]);
-      }
-      if (offset >= bufferSize) {
-        if (data) {
-          callback(data);
-        }
-        resolve();
-        return;
-      }
-      readChunk(offset, chunkSize);
-    }
-
-    function readChunk(offset: number, size: number) {
-      const contentChunk = content.slice(offset, offset + size);
-
-      const blob = new Blob([contentChunk]);
-      const file = new FileReader();
-      file.onload = (e: any) => readHandler(e.target.result);
-      file.readAsText(blob);
-    }
-
-    readChunk(offset, chunkSize);
-  });
-}
-
-/** Parses a tsv text file. */
-export function parseTensors(
-    content: ArrayBuffer, valueDelim = '\t'): Promise<DataPoint[]> {
-  logging.setModalMessage('Parsing tensors...', TENSORS_MSG_ID);
-
-  return new Promise<DataPoint[]>((resolve, reject) => {
-    const data: DataPoint[] = [];
-    let numDim: number;
-
-    streamParse(content, (line: string) => {
-      line = line.trim();
-      if (line === '') {
-        return;
-      }
-      const row = line.split(valueDelim);
-      const dataPoint: DataPoint = {
-        metadata: {},
-        vector: null,
-        index: data.length,
-        projections: null,
-      };
-      // If the first label is not a number, take it as the label.
-      if (isNaN(row[0] as any) || numDim === row.length - 1) {
-        dataPoint.metadata['label'] = row[0];
-        dataPoint.vector = new Float32Array(row.slice(1).map(Number));
-      } else {
-        dataPoint.vector = new Float32Array(row.map(Number));
-      }
-      data.push(dataPoint);
-      if (numDim == null) {
-        numDim = dataPoint.vector.length;
-      }
-      if (numDim !== dataPoint.vector.length) {
-        logging.setModalMessage(
-            'Parsing failed. Vector dimensions do not match');
-        throw Error('Parsing failed');
-      }
-      if (numDim <= 1) {
-        logging.setModalMessage(
-            'Parsing failed. Found a vector with only one dimension?');
-        throw Error('Parsing failed');
-      }
-    }).then(() => {
-      logging.setModalMessage(null, TENSORS_MSG_ID);
-      resolve(data);
-    });
-  });
-}
-
-/** Parses a tsv text file. */
-export function parseTensorsFromFloat32Array(data: Float32Array,
-    dim: number): Promise<DataPoint[]> {
-  return runAsyncTask('Parsing tensors...', () => {
-    const N = data.length / dim;
-    const dataPoints: DataPoint[] = [];
-    let offset = 0;
-    for (let i = 0; i < N; ++i) {
-      dataPoints.push({
-        metadata: {},
-        vector: data.subarray(offset, offset + dim),
-        index: i,
-        projections: null,
-      });
-      offset += dim;
-    }
-    return dataPoints;
-  }, TENSORS_MSG_ID).then(dataPoints => {
-    logging.setModalMessage(null, TENSORS_MSG_ID);
-    return dataPoints;
-  });
-}
-
-export function analyzeMetadata(
-    columnNames, pointsMetadata: PointMetadata[]): ColumnStats[] {
-  const columnStats: ColumnStats[] = columnNames.map(name => {
-    return {
-      name: name,
-      isNumeric: true,
-      tooManyUniqueValues: false,
-      min: Number.POSITIVE_INFINITY,
-      max: Number.NEGATIVE_INFINITY
-    };
-  });
-
-  const mapOfValues: [{[value: string]: number}] =
-      columnNames.map(() => new Object());
-
-  pointsMetadata.forEach(metadata => {
-    columnNames.forEach((name: string, colIndex: number) => {
-      const stats = columnStats[colIndex];
-      const map = mapOfValues[colIndex];
-      const value = metadata[name];
-
-      // Skip missing values.
-      if (value == null) {
-        return;
-      }
-
-      if (!stats.tooManyUniqueValues) {
-        if (value in map) {
-          map[value]++;
-        } else {
-          map[value] = 1;
-        }
-        if (Object.keys(map).length > NUM_COLORS_COLOR_MAP) {
-          stats.tooManyUniqueValues = true;
-        }
-      }
-      if (isNaN(value as any)) {
-        stats.isNumeric = false;
-      } else {
-        metadata[name] = +value;
-        stats.min = Math.min(stats.min, +value);
-        stats.max = Math.max(stats.max, +value);
-      }
-    });
-  });
-  columnStats.forEach((stats, colIndex) => {
-    stats.uniqueEntries = Object.keys(mapOfValues[colIndex]).map(label => {
-      return {label, count: mapOfValues[colIndex][label]};
-    });
-  });
-  return columnStats;
-}
-
-export function parseMetadata(content: ArrayBuffer):
-    Promise<SpriteAndMetadataInfo> {
-  logging.setModalMessage('Parsing metadata...', METADATA_MSG_ID);
-
-  return new Promise<SpriteAndMetadataInfo>((resolve, reject) => {
-    let pointsMetadata: PointMetadata[] = [];
-    let hasHeader = false;
-    let lineNumber = 0;
-    let columnNames = ['label'];
-    streamParse(content, (line: string) => {
-      if (line.trim().length === 0) {
-        return;
-      }
-      if (lineNumber === 0) {
-        hasHeader = line.indexOf('\t') >= 0;
-
-        // If the first row doesn't contain metadata keys, we assume that the
-        // values are labels.
-        if (hasHeader) {
-          columnNames = line.split('\t');
-          lineNumber++;
-          return;
-        }
-      }
-
-      lineNumber++;
-
-      let rowValues = line.split('\t');
-      let metadata: PointMetadata = {};
-      pointsMetadata.push(metadata);
-      columnNames.forEach((name: string, colIndex: number) => {
-        let value = rowValues[colIndex];
-        // Normalize missing values.
-        value = (value === '' ? null : value);
-        metadata[name] = value;
-      });
-    }).then(() => {
-      logging.setModalMessage(null, METADATA_MSG_ID);
-      resolve({
-        stats: analyzeMetadata(columnNames, pointsMetadata),
-        pointsInfo: pointsMetadata
-      });
-    });
-  });
-}
-
-export function fetchImage(url: string): Promise<HTMLImageElement> {
-  return new Promise<HTMLImageElement>((resolve, reject) => {
-    let image = new Image();
-    image.onload = () => resolve(image);
-    image.onerror = (err) => reject(err);
-    image.crossOrigin = '';
-    image.src = url;
-  });
-}
-
-export function retrieveSpriteAndMetadataInfo(metadataPath: string,
-    spriteImagePath: string, spriteMetadata: SpriteMetadata,
-    callback: (r: SpriteAndMetadataInfo) => void) {
-  let metadataPromise: Promise<SpriteAndMetadataInfo> = Promise.resolve({});
-  if (metadataPath) {
-    metadataPromise = new Promise<SpriteAndMetadataInfo>((resolve, reject) => {
-      logging.setModalMessage('Fetching metadata...', METADATA_MSG_ID);
-
-      const request = new XMLHttpRequest();
-      request.open('GET', metadataPath);
-      request.responseType = 'arraybuffer';
-
-      request.onerror = () => {
-        logging.setErrorMessage(request.responseText, 'fetching metadata');
-        reject();
-      };
-      request.onload = () => {
-        resolve(parseMetadata(request.response));
-      };
-      request.send(null);
-    });
-  }
-  let spriteMsgId = null;
-  let spritesPromise: Promise<HTMLImageElement> = null;
-  if (spriteImagePath) {
-    spriteMsgId = logging.setModalMessage('Fetching sprite image...');
-    spritesPromise = fetchImage(spriteImagePath);
-  }
-
-  // Fetch the metadata and the image in parallel.
-  Promise.all([metadataPromise, spritesPromise]).then(values => {
-    if (spriteMsgId) {
-      logging.setModalMessage(null, spriteMsgId);
-    }
-    const [metadata, spriteImage] = values;
-
-    if (spriteImage && (spriteImage.height > MAX_SPRITE_IMAGE_SIZE_PX ||
-                        spriteImage.width > MAX_SPRITE_IMAGE_SIZE_PX)) {
-      logging.setModalMessage(
-          `Error: Sprite image of dimensions ${spriteImage.width}px x ` +
-          `${spriteImage.height}px exceeds maximum dimensions ` +
-          `${MAX_SPRITE_IMAGE_SIZE_PX}px x ${MAX_SPRITE_IMAGE_SIZE_PX}px`);
-    } else {
-      metadata.spriteImage = spriteImage;
-      metadata.spriteMetadata = spriteMetadata;
-      callback(metadata);
-    }
-  });
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/data.ts b/tensorflow/tensorboard/components/vz_projector/data.ts
deleted file mode 100644
index c4e81985fc84ee17d6daaa337b589a32cec8cfc6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/data.ts
+++ /dev/null
@@ -1,547 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {TSNE} from './bh_tsne';
-import {SpriteMetadata} from './data-provider';
-import * as knn from './knn';
-import * as logging from './logging';
-import * as scatterPlot from './scatterPlot';
-import * as util from './util';
-import * as vector from './vector';
-
-export type DistanceFunction = (a: number[], b: number[]) => number;
-export type ProjectionComponents3D = [string, string, string];
-
-export interface PointMetadata { [key: string]: number|string; }
-
-export interface DataProto {
-  shape: [number, number];
-  tensor: number[];
-  metadata: {
-    columns: Array<
-        {name: string; stringValues: string[]; numericValues: number[];}>;
-  };
-}
-
-/** Statistics for a metadata column. */
-export interface ColumnStats {
-  name: string;
-  isNumeric: boolean;
-  tooManyUniqueValues: boolean;
-  uniqueEntries?: Array<{label: string, count: number}>;
-  min: number;
-  max: number;
-}
-
-export interface SpriteAndMetadataInfo {
-  stats?: ColumnStats[];
-  pointsInfo?: PointMetadata[];
-  spriteImage?: HTMLImageElement;
-  spriteMetadata?: SpriteMetadata;
-}
-
-/** A single collection of points which make up a sequence through space. */
-export interface Sequence {
-  /** Indices into the DataPoints array in the Data object. */
-  pointIndices: number[];
-}
-
-export interface DataPoint {
-  /** The point in the original space. */
-  vector: Float32Array;
-
-  /*
-   * Metadata for each point. Each metadata is a set of key/value pairs
-   * where the value can be a string or a number.
-   */
-  metadata: PointMetadata;
-
-  /** index of the sequence, used for highlighting on click */
-  sequenceIndex?: number;
-
-  /** index in the original data source */
-  index: number;
-
-  /** This is where the calculated projections space are cached */
-  projections: {[key: string]: number};
-}
-
-const IS_FIREFOX = navigator.userAgent.toLowerCase().indexOf('firefox') >= 0;
-/** Controls whether nearest neighbors computation is done on the GPU or CPU. */
-const KNN_GPU_ENABLED = util.hasWebGLSupport() && !IS_FIREFOX;
-
-export const TSNE_SAMPLE_SIZE = 10000;
-export const PCA_SAMPLE_SIZE = 50000;
-/** Number of dimensions to sample when doing approximate PCA. */
-export const PCA_SAMPLE_DIM = 200;
-/** Number of pca components to compute. */
-const NUM_PCA_COMPONENTS = 10;
-/**
- * Reserved metadata attributes used for sequence information
- * NOTE: Use "__seq_next__" as "__next__" is deprecated.
- */
-const SEQUENCE_METADATA_ATTRS = ['__next__', '__seq_next__'];
-
-function getSequenceNextPointIndex(pointMetadata: PointMetadata): number|null {
-  let sequenceAttr = null;
-  for (let metadataAttr of SEQUENCE_METADATA_ATTRS) {
-    if (metadataAttr in pointMetadata && pointMetadata[metadataAttr] !== '') {
-      sequenceAttr = pointMetadata[metadataAttr];
-      break;
-    }
-  }
-  if (sequenceAttr == null) {
-    return null;
-  }
-  return +sequenceAttr;
-}
-
-/**
- * Dataset contains a DataPoints array that should be treated as immutable. This
- * acts as a working subset of the original data, with cached properties
- * from computationally expensive operations. Because creating a subset
- * requires normalizing and shifting the vector space, we make a copy of the
- * data so we can still always create new subsets based on the original data.
- */
-export class DataSet {
-  points: DataPoint[];
-  sequences: Sequence[];
-
-  shuffledDataIndices: number[] = [];
-
-  /**
-   * This keeps a list of all current projections so you can easily test to see
-   * if it's been calculated already.
-   */
-  projections: {[projection: string]: boolean} = {};
-  nearest: knn.NearestEntry[][];
-  nearestK: number;
-  tSNEIteration: number = 0;
-  tSNEShouldStop = true;
-  dim: [number, number] = [0, 0];
-  hasTSNERun: boolean = false;
-  spriteAndMetadataInfo: SpriteAndMetadataInfo;
-  fracVariancesExplained: number[];
-
-  private tsne: TSNE;
-
-  /** Creates a new Dataset */
-  constructor(
-      points: DataPoint[], spriteAndMetadataInfo?: SpriteAndMetadataInfo) {
-    this.points = points;
-    this.shuffledDataIndices = util.shuffle(util.range(this.points.length));
-    this.sequences = this.computeSequences(points);
-    this.dim = [this.points.length, this.points[0].vector.length];
-    this.spriteAndMetadataInfo = spriteAndMetadataInfo;
-  }
-
-  private computeSequences(points: DataPoint[]) {
-    // Keep a list of indices seen so we don't compute sequences for a given
-    // point twice.
-    let indicesSeen = new Int8Array(points.length);
-    // Compute sequences.
-    let indexToSequence: {[index: number]: Sequence} = {};
-    let sequences: Sequence[] = [];
-    for (let i = 0; i < points.length; i++) {
-      if (indicesSeen[i]) {
-        continue;
-      }
-      indicesSeen[i] = 1;
-
-      // Ignore points without a sequence attribute.
-      let next = getSequenceNextPointIndex(points[i].metadata);
-      if (next == null) {
-        continue;
-      }
-      if (next in indexToSequence) {
-        let existingSequence = indexToSequence[next];
-        // Pushing at the beginning of the array.
-        existingSequence.pointIndices.unshift(i);
-        indexToSequence[i] = existingSequence;
-        continue;
-      }
-      // The current point is pointing to a new/unseen sequence.
-      let newSequence: Sequence = {pointIndices: []};
-      indexToSequence[i] = newSequence;
-      sequences.push(newSequence);
-      let currentIndex = i;
-      while (points[currentIndex]) {
-        newSequence.pointIndices.push(currentIndex);
-        let next = getSequenceNextPointIndex(points[currentIndex].metadata);
-        if (next != null) {
-          indicesSeen[next] = 1;
-          currentIndex = next;
-        } else {
-          currentIndex = -1;
-        }
-      }
-    }
-    return sequences;
-  }
-
-  projectionCanBeRendered(projection: ProjectionType): boolean {
-    if (projection !== 'tsne') {
-      return true;
-    }
-    return this.tSNEIteration > 0;
-  }
-
-  /**
-   * Returns a new subset dataset by copying out data. We make a copy because
-   * we have to modify the vectors by normalizing them.
-   *
-   * @param subset Array of indices of points that we want in the subset.
-   *
-   * @return A subset of the original dataset.
-   */
-  getSubset(subset?: number[]): DataSet {
-    const pointsSubset = ((subset != null) && (subset.length > 0)) ?
-        subset.map(i => this.points[i]) :
-        this.points;
-    let points = pointsSubset.map(dp => {
-      return {
-        metadata: dp.metadata,
-        index: dp.index,
-        vector: dp.vector.slice(),
-        projections: {} as {[key: string]: number}
-      };
-    });
-    return new DataSet(points, this.spriteAndMetadataInfo);
-  }
-
-  /**
-   * Computes the centroid, shifts all points to that centroid,
-   * then makes them all unit norm.
-   */
-  normalize() {
-    // Compute the centroid of all data points.
-    let centroid = vector.centroid(this.points, a => a.vector);
-    if (centroid == null) {
-      throw Error('centroid should not be null');
-    }
-    // Shift all points by the centroid and make them unit norm.
-    for (let id = 0; id < this.points.length; ++id) {
-      let dataPoint = this.points[id];
-      dataPoint.vector = vector.sub(dataPoint.vector, centroid);
-      vector.unit(dataPoint.vector);
-    }
-  }
-
-  /** Projects the dataset onto a given vector and caches the result. */
-  projectLinear(dir: vector.Vector, label: string) {
-    this.projections[label] = true;
-    this.points.forEach(dataPoint => {
-      dataPoint.projections[label] = vector.dot(dataPoint.vector, dir);
-    });
-  }
-
-  /** Projects the dataset along the top 10 principal components. */
-  projectPCA(): Promise<void> {
-    if (this.projections['pca-0'] != null) {
-      return Promise.resolve<void>(null);
-    }
-    return util.runAsyncTask('Computing PCA...', () => {
-      // Approximate pca vectors by sampling the dimensions.
-      let dim = this.points[0].vector.length;
-      let vectors = this.shuffledDataIndices.map(i => this.points[i].vector);
-      if (dim > PCA_SAMPLE_DIM) {
-        vectors = vector.projectRandom(vectors, PCA_SAMPLE_DIM);
-      }
-      let sampledVectors = vectors.slice(0, PCA_SAMPLE_SIZE);
-
-      let sigma = numeric.div(
-          numeric.dot(numeric.transpose(sampledVectors), sampledVectors),
-          sampledVectors.length);
-      let svd = numeric.svd(sigma);
-
-      let variances: number[] = svd.S;
-      let totalVariance = 0;
-      for (let i = 0; i < variances.length; ++i) {
-        totalVariance += variances[i];
-      }
-      for (let i = 0; i < variances.length; ++i) {
-        variances[i] /= totalVariance;
-      }
-      this.fracVariancesExplained = variances;
-
-      let U: number[][] = svd.U;
-      let pcaVectors = vectors.map(vector => {
-        let newV = new Float32Array(NUM_PCA_COMPONENTS);
-        for (let newDim = 0; newDim < NUM_PCA_COMPONENTS; newDim++) {
-          let dot = 0;
-          for (let oldDim = 0; oldDim < vector.length; oldDim++) {
-            dot += vector[oldDim] * U[oldDim][newDim];
-          }
-          newV[newDim] = dot;
-        }
-        return newV;
-      });
-      for (let d = 0; d < NUM_PCA_COMPONENTS; d++) {
-        let label = 'pca-' + d;
-        this.projections[label] = true;
-        for (let i = 0; i < pcaVectors.length; i++) {
-          let pointIndex = this.shuffledDataIndices[i];
-          this.points[pointIndex].projections[label] = pcaVectors[i][d];
-        }
-      }
-    });
-  }
-
-  /** Runs tsne on the data. */
-  projectTSNE(
-      perplexity: number, learningRate: number, tsneDim: number,
-      stepCallback: (iter: number) => void) {
-    this.hasTSNERun = true;
-    let k = Math.floor(3 * perplexity);
-    let opt = {epsilon: learningRate, perplexity: perplexity, dim: tsneDim};
-    this.tsne = new TSNE(opt);
-    this.tSNEShouldStop = false;
-    this.tSNEIteration = 0;
-
-    let sampledIndices = this.shuffledDataIndices.slice(0, TSNE_SAMPLE_SIZE);
-    let step = () => {
-      if (this.tSNEShouldStop) {
-        stepCallback(null);
-        this.tsne = null;
-        return;
-      }
-      this.tsne.step();
-      let result = this.tsne.getSolution();
-      sampledIndices.forEach((index, i) => {
-        let dataPoint = this.points[index];
-
-        dataPoint.projections['tsne-0'] = result[i * tsneDim + 0];
-        dataPoint.projections['tsne-1'] = result[i * tsneDim + 1];
-        if (tsneDim === 3) {
-          dataPoint.projections['tsne-2'] = result[i * tsneDim + 2];
-        }
-      });
-      this.tSNEIteration++;
-      stepCallback(this.tSNEIteration);
-      requestAnimationFrame(step);
-    };
-
-    // Nearest neighbors calculations.
-    let knnComputation: Promise<knn.NearestEntry[][]>;
-
-    if (this.nearest != null && k === this.nearestK) {
-      // We found the nearest neighbors before and will reuse them.
-      knnComputation = Promise.resolve(this.nearest);
-    } else {
-      let sampledData = sampledIndices.map(i => this.points[i]);
-      this.nearestK = k;
-      knnComputation = KNN_GPU_ENABLED ?
-          knn.findKNNGPUCosine(sampledData, k, (d => d.vector)) :
-          knn.findKNN(
-              sampledData, k, (d => d.vector),
-              (a, b, limit) => vector.cosDistNorm(a, b));
-    }
-    knnComputation.then(nearest => {
-      this.nearest = nearest;
-      util.runAsyncTask('Initializing T-SNE...', () => {
-            this.tsne.initDataDist(this.nearest);
-          }).then(step);
-    });
-  }
-
-  /**
-   * Merges metadata to the dataset and returns whether it succeeded.
-   */
-  mergeMetadata(metadata: SpriteAndMetadataInfo): boolean {
-    if (metadata.pointsInfo.length !== this.points.length) {
-      let errorMessage = `Number of tensors (${this.points.length}) do not` +
-          ` match the number of lines in metadata` +
-          ` (${metadata.pointsInfo.length}).`;
-
-      if (metadata.stats.length === 1 &&
-          this.points.length + 1 === metadata.pointsInfo.length) {
-        // If there is only one column of metadata and the number of points is
-        // exactly one less than the number of metadata lines, this is due to an
-        // unnecessary header line in the metadata and we can show a meaningful
-        // error.
-        logging.setErrorMessage(
-            errorMessage + ' Single column metadata should not have a header ' +
-                'row.',
-            'merging metadata');
-        return false;
-      } else if (
-          metadata.stats.length > 1 &&
-          this.points.length - 1 === metadata.pointsInfo.length) {
-        // If there are multiple columns of metadata and the number of points is
-        // exactly one greater than the number of lines in the metadata, this
-        // means there is a missing metadata header.
-        logging.setErrorMessage(
-            errorMessage + ' Multi-column metadata should have a header ' +
-                'row with column labels.',
-            'merging metadata');
-        return false;
-      }
-
-      logging.setWarningMessage(errorMessage);
-    }
-    this.spriteAndMetadataInfo = metadata;
-    metadata.pointsInfo.slice(0, this.points.length)
-        .forEach((m, i) => this.points[i].metadata = m);
-    return true;
-  }
-
-  stopTSNE() {
-    this.tSNEShouldStop = true;
-  }
-
-  /**
-   * Finds the nearest neighbors of the query point using a
-   * user-specified distance metric.
-   */
-  findNeighbors(pointIndex: number, distFunc: DistanceFunction, numNN: number):
-      knn.NearestEntry[] {
-    // Find the nearest neighbors of a particular point.
-    let neighbors = knn.findKNNofPoint(
-        this.points, pointIndex, numNN, (d => d.vector), distFunc);
-    // TODO(smilkov): Figure out why we slice.
-    let result = neighbors.slice(0, numNN);
-    return result;
-  }
-
-  /**
-   * Search the dataset based on a metadata field.
-   */
-  query(query: string, inRegexMode: boolean, fieldName: string): number[] {
-    let predicate = util.getSearchPredicate(query, inRegexMode, fieldName);
-    let matches: number[] = [];
-    this.points.forEach((point, id) => {
-      if (predicate(point)) {
-        matches.push(id);
-      }
-    });
-    return matches;
-  }
-}
-
-export type ProjectionType = 'tsne' | 'pca' | 'custom';
-
-export class Projection {
-  constructor(
-      public projectionType: ProjectionType,
-      public projectionComponents: ProjectionComponents3D,
-      public dimensionality: number, public dataSet: DataSet) {}
-}
-
-export interface ColorOption {
-  name: string;
-  desc?: string;
-  map?: (value: string|number) => string;
-  /** List of items for the color map. Defined only for categorical map. */
-  items?: {label: string, count: number}[];
-  /** Threshold values and their colors. Defined for gradient color map. */
-  thresholds?: {value: number, color: string}[];
-  isSeparator?: boolean;
-  tooManyUniqueValues?: boolean;
-}
-
-/**
- * An interface that holds all the data for serializing the current state of
- * the world.
- */
-export class State {
-  /** A label identifying this state. */
-  label: string = '';
-
-  /** Whether this State is selected in the bookmarks pane. */
-  isSelected: boolean = false;
-
-  /** The selected projection tab. */
-  selectedProjection: ProjectionType;
-
-  /** Dimensions of the DataSet. */
-  dataSetDimensions: [number, number];
-
-  /** t-SNE parameters */
-  tSNEIteration: number = 0;
-  tSNEPerplexity: number = 0;
-  tSNELearningRate: number = 0;
-  tSNEis3d: boolean = true;
-
-  /** PCA projection component dimensions */
-  pcaComponentDimensions: number[] = [];
-
-  /** Custom projection parameters */
-  customSelectedSearchByMetadataOption: string;
-  customXLeftText: string;
-  customXLeftRegex: boolean;
-  customXRightText: string;
-  customXRightRegex: boolean;
-  customYUpText: string;
-  customYUpRegex: boolean;
-  customYDownText: string;
-  customYDownRegex: boolean;
-
-  /** The computed projections of the tensors. */
-  projections: Array<{[key: string]: number}> = [];
-
-  /** Filtered dataset indices. */
-  filteredPoints: number[];
-
-  /** The indices of selected points. */
-  selectedPoints: number[] = [];
-
-  /** Camera state (2d/3d, position, target, zoom, etc). */
-  cameraDef: scatterPlot.CameraDef;
-
-  /** Color by option. */
-  selectedColorOptionName: string;
-  forceCategoricalColoring: boolean;
-
-  /** Label by option. */
-  selectedLabelOption: string;
-}
-
-export function getProjectionComponents(
-    projection: ProjectionType,
-    components: (number|string)[]): ProjectionComponents3D {
-  if (components.length > 3) {
-    throw new RangeError('components length must be <= 3');
-  }
-  const projectionComponents: [string, string, string] = [null, null, null];
-  const prefix = (projection === 'custom') ? 'linear' : projection;
-  for (let i = 0; i < components.length; ++i) {
-    if (components[i] == null) {
-      continue;
-    }
-    projectionComponents[i] = prefix + '-' + components[i];
-  }
-  return projectionComponents;
-}
-
-export function stateGetAccessorDimensions(state: State): Array<number|string> {
-  let dimensions: Array<number|string>;
-  switch (state.selectedProjection) {
-    case 'pca':
-      dimensions = state.pcaComponentDimensions.slice();
-      break;
-    case 'tsne':
-      dimensions = [0, 1];
-      if (state.tSNEis3d) {
-        dimensions.push(2);
-      }
-      break;
-    case 'custom':
-      dimensions = ['x', 'y'];
-      break;
-    default:
-      throw new Error('Unexpected fallthrough');
-  }
-  return dimensions;
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/external.d.ts b/tensorflow/tensorboard/components/vz_projector/external.d.ts
deleted file mode 100644
index cbc1512c215c7d803b31465775563b8b5a09bee8..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/external.d.ts
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// TODO(smilkov): Split into weblas.d.ts and numeric.d.ts and write
-// typings for numeric.
-interface Tensor {
-  new(size: [number, number], data: Float32Array);
-  transfer(): Float32Array;
-  delete(): void;
-}
-
-interface Weblas {
-  sgemm(M: number, N: number, K: number, alpha: number,
-      A: Float32Array, B: Float32Array, beta: number, C: Float32Array):
-      Float32Array;
-  pipeline: {
-     Tensor: Tensor;
-     sgemm(alpha: number, A: Tensor, B: Tensor, beta: number,
-         C: Tensor): Tensor;
-  };
-  util: {
-    transpose(M: number, N: number, data: Float32Array): Tensor;
-  };
-
-}
-
-declare let numeric: any;
-declare let weblas: Weblas;
-
-interface AnalyticsEventType {
-  hitType: string;
-  page?: string;
-  eventCategory?: string;
-  eventAction?: string;
-  eventLabel?: string;
-  eventValue?: number;
-}
-
-declare let ga: (command: string, eventObj: AnalyticsEventType) => void;
\ No newline at end of file
diff --git a/tensorflow/tensorboard/components/vz_projector/heap.ts b/tensorflow/tensorboard/components/vz_projector/heap.ts
deleted file mode 100644
index ac3144e64930c810d69e69acf0bfe250bff1b5e7..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/heap.ts
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/** Min key heap. */
-export type HeapItem<T> = {
-  key: number,
-  value: T
-};
-
-/**
- * Min-heap data structure. Provides O(1) for peek, returning the smallest key.
- */
-// TODO(jart): Rename to Heap and use Comparator.
-export class MinHeap<T> {
-  private arr: HeapItem<T>[] = [];
-
-  /** Push an element with the provided key. */
-  push(key: number, value: T): void {
-    this.arr.push({key, value});
-    this.bubbleUp(this.arr.length - 1);
-  }
-
-  /** Pop the element with the smallest key. */
-  pop(): HeapItem<T> {
-    if (this.arr.length === 0) {
-      throw new Error('pop() called on empty binary heap');
-    }
-    let item = this.arr[0];
-    let last = this.arr.length - 1;
-    this.arr[0] = this.arr[last];
-    this.arr.pop();
-    if (last > 0) {
-      this.bubbleDown(0);
-    }
-    return item;
-  };
-
-  /** Returns, but doesn't remove the element with the smallest key */
-  peek(): HeapItem<T> { return this.arr[0]; }
-
-  /**
-   * Pops the element with the smallest key and at the same time
-   * adds the newly provided element. This is faster than calling
-   * pop() and push() separately.
-   */
-  popPush(key: number, value: T): HeapItem<T> {
-    if (this.arr.length === 0) {
-      throw new Error('pop() called on empty binary heap');
-    }
-    let item = this.arr[0];
-    this.arr[0] = {key, value};
-    if (this.arr.length > 0) {
-      this.bubbleDown(0);
-    }
-    return item;
-  }
-
-  /** Returns the number of elements in the heap. */
-  size(): number { return this.arr.length; }
-
-  /** Returns all the items in the heap. */
-  items(): HeapItem<T>[] { return this.arr; }
-
-  private swap(a: number, b: number) {
-    let temp = this.arr[a];
-    this.arr[a] = this.arr[b];
-    this.arr[b] = temp;
-  }
-
-  private bubbleDown(pos: number) {
-    let left = (pos << 1) + 1;
-    let right = left + 1;
-    let largest = pos;
-    if (left < this.arr.length && this.arr[left].key < this.arr[largest].key) {
-      largest = left;
-    }
-    if (right < this.arr.length &&
-        this.arr[right].key < this.arr[largest].key) {
-      largest = right;
-    }
-    if (largest !== pos) {
-      this.swap(largest, pos);
-      this.bubbleDown(largest);
-    }
-  }
-
-  private bubbleUp(pos: number) {
-    if (pos <= 0) {
-      return;
-    }
-    let parent = ((pos - 1) >> 1);
-    if (this.arr[pos].key < this.arr[parent].key) {
-      this.swap(pos, parent);
-      this.bubbleUp(parent);
-    }
-  }
-}
-
-/** List that keeps the K elements with the smallest keys. */
-export class KMin<T> {
-  private k: number;
-  private maxHeap = new MinHeap<T>();
-
-  /** Constructs a new k-min data structure with the provided k. */
-  constructor(k: number) { this.k = k; }
-
-  /** Adds an element to the list. */
-  add(key: number, value: T) {
-    if (this.maxHeap.size() < this.k) {
-      this.maxHeap.push(-key, value);
-      return;
-    }
-    let largest = this.maxHeap.peek();
-    // If the new element is smaller, replace the largest with the new element.
-    if (key < -largest.key) {
-      this.maxHeap.popPush(-key, value);
-    }
-  }
-
-  /** Returns the k items with the smallest keys. */
-  getMinKItems(): T[] {
-    let items = this.maxHeap.items();
-    items.sort((a, b) => b.key - a.key);
-    return items.map(a => a.value);
-  }
-
-  /** Returns the size of the list. */
-  getSize(): number { return this.maxHeap.size(); }
-
-  /** Returns the largest key in the list. */
-  getLargestKey(): number {
-    return this.maxHeap.size() === 0 ? null : -this.maxHeap.peek().key;
-  }
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/knn.ts b/tensorflow/tensorboard/components/vz_projector/knn.ts
deleted file mode 100644
index 906e077b5d7665bd4cd65a8d98d7063acade3ad0..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/knn.ts
+++ /dev/null
@@ -1,235 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {runAsyncTask} from './util';
-import * as logging from './logging';
-import {KMin} from './heap';
-import {Vector} from './vector';
-import * as vector from './vector';
-
-export type NearestEntry = {
-  index: number,
-  dist: number
-};
-
-/**
- * Optimal size for the height of the matrix when doing computation on the GPU
- * using WebGL. This was found experimentally.
- *
- * This also guarantees that for computing pair-wise distance for up to 10K
- * vectors, no more than 40MB will be allocated in the GPU. Without the
- * allocation limit, we can freeze the graphics of the whole OS.
- */
-const OPTIMAL_GPU_BLOCK_SIZE = 256;
-/** Id of message box used for knn gpu progress bar. */
-const KNN_GPU_MSG_ID = 'knn-gpu';
-
-/**
- * Returns the K nearest neighbors for each vector where the distance
- * computation is done on the GPU (WebGL) using cosine distance.
- *
- * @param dataPoints List of data points, where each data point holds an
- *   n-dimensional vector.
- * @param k Number of nearest neighbors to find.
- * @param accessor A method that returns the vector, given the data point.
- */
-export function findKNNGPUCosine<T>(
-    dataPoints: T[], k: number,
-    accessor: (dataPoint: T) => Float32Array): Promise<NearestEntry[][]> {
-  let N = dataPoints.length;
-  let dim = accessor(dataPoints[0]).length;
-
-  // The goal is to compute a large matrix multiplication A*A.T where A is of
-  // size NxD and A.T is its transpose. This results in a NxN matrix which
-  // could be too big to store on the GPU memory. To avoid memory overflow, we
-  // compute multiple A*partial_A.T where partial_A is of size BxD (B is much
-  // smaller than N). This results in storing only NxB size matrices on the GPU
-  // at a given time.
-
-  // A*A.T will give us NxN matrix holding the cosine distance between every
-  // pair of points, which we sort using KMin data structure to obtain the
-  // K nearest neighbors for each point.
-  let typedArray = vector.toTypedArray(dataPoints, accessor);
-  let bigMatrix = new weblas.pipeline.Tensor([N, dim], typedArray);
-  let nearest: NearestEntry[][] = new Array(N);
-  let numPieces = Math.ceil(N / OPTIMAL_GPU_BLOCK_SIZE);
-  let M = Math.floor(N / numPieces);
-  let modulo = N % numPieces;
-  let offset = 0;
-  let progress = 0;
-  let progressDiff = 1 / (2 * numPieces);
-  let piece = 0;
-
-  function step(resolve: (result: NearestEntry[][]) => void) {
-    let progressMsg =
-        'Finding nearest neighbors: ' + (progress * 100).toFixed() + '%';
-    runAsyncTask(progressMsg, () => {
-      let B = piece < modulo ? M + 1 : M;
-      let typedB = new Float32Array(B * dim);
-      for (let i = 0; i < B; ++i) {
-        let vector = accessor(dataPoints[offset + i]);
-        for (let d = 0; d < dim; ++d) {
-          typedB[i * dim + d] = vector[d];
-        }
-      }
-      let partialMatrix = new weblas.pipeline.Tensor([B, dim], typedB);
-      // Result is N x B matrix.
-      let result =
-          weblas.pipeline.sgemm(1, bigMatrix, partialMatrix, null, null);
-      let partial = result.transfer();
-      partialMatrix.delete();
-      result.delete();
-      progress += progressDiff;
-      for (let i = 0; i < B; i++) {
-        let kMin = new KMin<NearestEntry>(k);
-        let iReal = offset + i;
-        for (let j = 0; j < N; j++) {
-          if (j === iReal) {
-            continue;
-          }
-          let cosDist = 1 - partial[j * B + i];  // [j, i];
-          kMin.add(cosDist, {index: j, dist: cosDist});
-        }
-        nearest[iReal] = kMin.getMinKItems();
-      }
-      progress += progressDiff;
-      offset += B;
-      piece++;
-    }, KNN_GPU_MSG_ID).then(() => {
-      if (piece < numPieces) {
-        step(resolve);
-      } else {
-        logging.setModalMessage(null, KNN_GPU_MSG_ID);
-        bigMatrix.delete();
-        resolve(nearest);
-      }
-    }, error => {
-      // GPU failed. Reverting back to CPU.
-      logging.setModalMessage(null, KNN_GPU_MSG_ID);
-      let distFunc = (a, b, limit) => vector.cosDistNorm(a, b);
-      findKNN(dataPoints, k, accessor, distFunc).then(nearest => {
-        resolve(nearest);
-      });
-    });
-  }
-  return new Promise<NearestEntry[][]>(resolve => step(resolve));
-}
-
-/**
- * Returns the K nearest neighbors for each vector where the distance
- * computation is done on the CPU using a user-specified distance method.
- *
- * @param dataPoints List of data points, where each data point holds an
- *   n-dimensional vector.
- * @param k Number of nearest neighbors to find.
- * @param accessor A method that returns the vector, given the data point.
- * @param dist Method that takes two vectors and a limit, and computes the
- *   distance between two vectors, with the ability to stop early if the
- *   distance is above the limit.
- */
-export function findKNN<T>(
-    dataPoints: T[], k: number, accessor: (dataPoint: T) => Float32Array,
-    dist: (a: Vector, b: Vector, limit: number) =>
-        number): Promise<NearestEntry[][]> {
-  return runAsyncTask<NearestEntry[][]>('Finding nearest neighbors...', () => {
-    let N = dataPoints.length;
-    let nearest: NearestEntry[][] = new Array(N);
-    // Find the distances from node i.
-    let kMin: KMin<NearestEntry>[] = new Array(N);
-    for (let i = 0; i < N; i++) {
-      kMin[i] = new KMin<NearestEntry>(k);
-    }
-    for (let i = 0; i < N; i++) {
-      let a = accessor(dataPoints[i]);
-      let kMinA = kMin[i];
-      for (let j = i + 1; j < N; j++) {
-        let kMinB = kMin[j];
-        let limitI = kMinA.getSize() === k ?
-            kMinA.getLargestKey() || Number.MAX_VALUE :
-            Number.MAX_VALUE;
-        let limitJ = kMinB.getSize() === k ?
-            kMinB.getLargestKey() || Number.MAX_VALUE :
-            Number.MAX_VALUE;
-        let limit = Math.max(limitI, limitJ);
-        let dist2ItoJ = dist(a, accessor(dataPoints[j]), limit);
-        if (dist2ItoJ >= 0) {
-          kMinA.add(dist2ItoJ, {index: j, dist: dist2ItoJ});
-          kMinB.add(dist2ItoJ, {index: i, dist: dist2ItoJ});
-        }
-      }
-    }
-    for (let i = 0; i < N; i++) {
-      nearest[i] = kMin[i].getMinKItems();
-    }
-    return nearest;
-  });
-}
-
-/** Calculates the minimum distance between a search point and a rectangle. */
-function minDist(
-    point: [number, number], x1: number, y1: number, x2: number, y2: number) {
-  let x = point[0];
-  let y = point[1];
-  let dx1 = x - x1;
-  let dx2 = x - x2;
-  let dy1 = y - y1;
-  let dy2 = y - y2;
-
-  if (dx1 * dx2 <= 0) {    // x is between x1 and x2
-    if (dy1 * dy2 <= 0) {  // (x,y) is inside the rectangle
-      return 0;            // return 0 as point is in rect
-    }
-    return Math.min(Math.abs(dy1), Math.abs(dy2));
-  }
-  if (dy1 * dy2 <= 0) {  // y is between y1 and y2
-    // We know it is already inside the rectangle
-    return Math.min(Math.abs(dx1), Math.abs(dx2));
-  }
-  let corner: [number, number];
-  if (x > x2) {
-    // Upper-right vs lower-right.
-    corner = y > y2 ? [x2, y2] : [x2, y1];
-  } else {
-    // Upper-left vs lower-left.
-    corner = y > y2 ? [x1, y2] : [x1, y1];
-  }
-  return Math.sqrt(vector.dist22D([x, y], corner));
-}
-
-/**
- * Returns the nearest neighbors of a particular point.
- *
- * @param dataPoints List of data points.
- * @param pointIndex The index of the point we need the nearest neighbors of.
- * @param k Number of nearest neighbors to search for.
- * @param accessor Method that maps a data point => vector (array of numbers).
- * @param distance Method that takes two vectors and returns their distance.
- */
-export function findKNNofPoint<T>(
-    dataPoints: T[], pointIndex: number, k: number,
-    accessor: (dataPoint: T) => Float32Array,
-    distance: (a: Vector, b: Vector) => number) {
-  let kMin = new KMin<NearestEntry>(k);
-  let a = accessor(dataPoints[pointIndex]);
-  for (let i = 0; i < dataPoints.length; ++i) {
-    if (i === pointIndex) {
-      continue;
-    }
-    let b = accessor(dataPoints[i]);
-    let dist = distance(a, b);
-    kMin.add(dist, {index: i, dist: dist});
-  }
-  return kMin.getMinKItems();
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/label.ts b/tensorflow/tensorboard/components/vz_projector/label.ts
deleted file mode 100644
index 67987f06ea381731c114a632bf67214e86a46c81..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/label.ts
+++ /dev/null
@@ -1,151 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-export interface BoundingBox {
-  loX: number;
-  loY: number;
-  hiX: number;
-  hiY: number;
-}
-
-/**
- * Accelerates label placement by dividing the view into a uniform grid.
- * Labels only need to be tested for collision with other labels that overlap
- * the same grid cells. This is a fork of {@code amoeba.CollisionGrid}.
- */
-export class CollisionGrid {
-  private numHorizCells: number;
-  private numVertCells: number;
-  private grid: BoundingBox[][];
-  private bound: BoundingBox;
-  private cellWidth: number;
-  private cellHeight: number;
-
-  /**
-   * Constructs a new Collision grid.
-   *
-   * @param bound The bound of the grid. Labels out of bounds will be rejected.
-   * @param cellWidth Width of a cell in the grid.
-   * @param cellHeight Height of a cell in the grid.
-   */
-  constructor(bound: BoundingBox, cellWidth: number, cellHeight: number) {
-    /** The bound of the grid. Labels out of bounds will be rejected. */
-    this.bound = bound;
-
-    /** Width of a cell in the grid. */
-    this.cellWidth = cellWidth;
-
-    /** Height of a cell in the grid. */
-    this.cellHeight = cellHeight;
-
-    /** Number of grid cells along the x axis. */
-    this.numHorizCells = Math.ceil(this.boundWidth(bound) / cellWidth);
-
-    /** Number of grid cells along the y axis. */
-    this.numVertCells = Math.ceil(this.boundHeight(bound) / cellHeight);
-
-    /**
-     * The 2d grid (stored as a 1d array.) Each cell consists of an array of
-     * BoundingBoxes for objects that are in the cell.
-     */
-    this.grid = new Array(this.numHorizCells * this.numVertCells);
-  }
-
-  private boundWidth(bound: BoundingBox) { return bound.hiX - bound.loX; }
-
-  private boundHeight(bound: BoundingBox) { return bound.hiY - bound.loY; }
-
-  private boundsIntersect(a: BoundingBox, b: BoundingBox) {
-    return !(a.loX > b.hiX || a.loY > b.hiY || a.hiX < b.loX || a.hiY < b.loY);
-  }
-
-  /**
-   * Checks if a given bounding box has any conflicts in the grid and inserts it
-   * if none are found.
-   *
-   * @param bound The bound to insert.
-   * @param justTest If true, just test if it conflicts, without inserting.
-   * @return True if the bound was successfully inserted; false if it
-   *         could not be inserted due to a conflict.
-   */
-  insert(bound: BoundingBox, justTest = false): boolean {
-    // Reject if the label is out of bounds.
-    if ((bound.hiX < this.bound.loX) || (bound.loX > this.bound.hiX) ||
-        (bound.hiY < this.bound.loY) || (bound.loY > this.bound.hiY)) {
-      return false;
-    }
-
-    let minCellX = this.getCellX(bound.loX);
-    let maxCellX = this.getCellX(bound.hiX);
-    let minCellY = this.getCellY(bound.loY);
-    let maxCellY = this.getCellY(bound.hiY);
-
-    // Check all overlapped cells to verify that we can insert.
-    let baseIdx = minCellY * this.numHorizCells + minCellX;
-    let idx = baseIdx;
-    for (let j = minCellY; j <= maxCellY; j++) {
-      for (let i = minCellX; i <= maxCellX; i++) {
-        let cell = this.grid[idx++];
-        if (cell) {
-          for (let k = 0; k < cell.length; k++) {
-            if (this.boundsIntersect(bound, cell[k])) {
-              return false;
-            }
-          }
-        }
-      }
-      idx += this.numHorizCells - (maxCellX - minCellX + 1);
-    }
-
-    if (justTest) {
-      return true;
-    }
-
-    // Insert into the overlapped cells.
-    idx = baseIdx;
-    for (let j = minCellY; j <= maxCellY; j++) {
-      for (let i = minCellX; i <= maxCellX; i++) {
-        if (!this.grid[idx]) {
-          this.grid[idx] = [bound];
-        } else {
-          this.grid[idx].push(bound);
-        }
-        idx++;
-      }
-      idx += this.numHorizCells - (maxCellX - minCellX + 1);
-    }
-    return true;
-  }
-
-  /**
-   * Returns the x index of the grid cell where the given x coordinate falls.
-   *
-   * @param x the coordinate, in world space.
-   * @return the x index of the cell.
-   */
-  private getCellX(x: number) {
-    return Math.floor((x - this.bound.loX) / this.cellWidth);
-  };
-
-  /**
-   * Returns the y index of the grid cell where the given y coordinate falls.
-   *
-   * @param y the coordinate, in world space.
-   * @return the y index of the cell.
-   */
-  private getCellY(y: number) {
-    return Math.floor((y - this.bound.loY) / this.cellHeight);
-  };
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/logging.ts b/tensorflow/tensorboard/components/vz_projector/logging.ts
deleted file mode 100644
index 59f3720601236453134f0e4cdf0448b6cb72d644..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/logging.ts
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/** Duration in ms for showing warning messages to the user */
-const WARNING_DURATION_MS = 10000;
-
-let dom: HTMLElement = null;
-let msgId = 0;
-let numActiveMessages = 0;
-
-export function setDomContainer(domElement: HTMLElement) {
-  dom = domElement;
-}
-
-/**
- * Updates the user message with the provided id.
- *
- * @param msg The message shown to the user. If null, the message is removed.
- * @param id The id of an existing message. If no id is provided, a unique id
- *     is assigned.
- * @param title The title of the notification.
- * @param isErrorMsg If true, the message is error and the dialog will have a
- *                   close button.
- * @return The id of the message.
- */
-export function setModalMessage(
-    msg: string, id: string = null, title = null, isErrorMsg = false): string {
-  if (dom == null) {
-    console.warn('Can\'t show modal message before the dom is initialized');
-    return;
-  }
-  if (id == null) {
-    id = (msgId++).toString();
-  }
-  let dialog = dom.querySelector('#notification-dialog') as any;
-  dialog.querySelector('.close-button').style.display =
-      isErrorMsg ? null : 'none';
-  let spinner = dialog.querySelector('.progress');
-  spinner.style.display = isErrorMsg ? 'none' : null;
-  spinner.active = isErrorMsg ? null : true;
-  dialog.querySelector('#notification-title').innerHTML = title;
-  let msgsContainer = dialog.querySelector('#notify-msgs') as HTMLElement;
-  if (isErrorMsg) {
-    msgsContainer.innerHTML = '';
-  } else {
-    const errors = msgsContainer.querySelectorAll('.error');
-    for (let i = 0; i < errors.length; i++) {
-      msgsContainer.removeChild(errors[i]);
-    }
-  }
-  let divId = `notify-msg-${id}`;
-  let msgDiv = dialog.querySelector('#' + divId) as HTMLDivElement;
-  if (msgDiv == null) {
-    msgDiv = document.createElement('div');
-    msgDiv.className = 'notify-msg ' + (isErrorMsg ? 'error' : '');
-    msgDiv.id = divId;
-
-    msgsContainer.insertBefore(msgDiv, msgsContainer.firstChild);
-
-    if (!isErrorMsg) {
-      numActiveMessages++;
-    } else {
-      numActiveMessages = 0;
-    }
-  }
-  if (msg == null) {
-    numActiveMessages--;
-    if (numActiveMessages === 0) {
-      dialog.close();
-    }
-    msgDiv.remove();
-  } else {
-    msgDiv.innerText = msg;
-    dialog.open();
-  }
-  return id;
-}
-
-export function setErrorMessage(errMsg: string, task?: string) {
-  setModalMessage(errMsg, null, 'Error ' + (task != null ? task : ''), true);
-}
-
-/**
- * Shows a warning message to the user for a certain amount of time.
- */
-export function setWarningMessage(msg: string): void {
-  let toast = dom.querySelector('#toast') as any;
-  toast.text = msg;
-  toast.duration = WARNING_DURATION_MS;
-  toast.open();
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/projectorEventContext.ts b/tensorflow/tensorboard/components/vz_projector/projectorEventContext.ts
deleted file mode 100644
index 36f5c4c58411fba58b42035530a834fcdf8aa1a7..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/projectorEventContext.ts
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {DistanceFunction, Projection} from './data';
-import {NearestEntry} from './knn';
-
-export type HoverListener = (index: number) => void;
-export type SelectionChangedListener =
-    (selectedPointIndices: number[], neighborsOfFirstPoint: NearestEntry[]) =>
-        void;
-export type ProjectionChangedListener = (projection: Projection) => void;
-export type DistanceMetricChangedListener =
-    (distanceMetric: DistanceFunction) => void;
-export interface ProjectorEventContext {
-  /** Register a callback to be invoked when the mouse hovers over a point. */
-  registerHoverListener(listener: HoverListener);
-  /** Notify the hover system that a point is under the mouse. */
-  notifyHoverOverPoint(pointIndex: number);
-  /** Registers a callback to be invoked when the selection changes. */
-  registerSelectionChangedListener(listener: SelectionChangedListener);
-  /**
-   * Notify the selection system that a client has changed the selected point
-   * set.
-   */
-  notifySelectionChanged(newSelectedPointIndices: number[]);
-  /** Registers a callback to be invoked when the projection changes. */
-  registerProjectionChangedListener(listener: ProjectionChangedListener);
-  /** Notify listeners that a reprojection occurred. */
-  notifyProjectionChanged(projection: Projection);
-  registerDistanceMetricChangedListener(listener:
-                                            DistanceMetricChangedListener);
-  notifyDistanceMetricChanged(distMetric: DistanceFunction);
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/projectorScatterPlotAdapter.ts b/tensorflow/tensorboard/components/vz_projector/projectorScatterPlotAdapter.ts
deleted file mode 100644
index c0da9526598e02af26626b3c26d7fdc1acd84a2f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/projectorScatterPlotAdapter.ts
+++ /dev/null
@@ -1,711 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {DataSet, DistanceFunction, Projection, ProjectionComponents3D, State} from './data';
-import {NearestEntry} from './knn';
-import {ProjectorEventContext} from './projectorEventContext';
-import {LabelRenderParams} from './renderContext';
-import {ScatterPlot} from './scatterPlot';
-import {ScatterPlotVisualizer3DLabels} from './scatterPlotVisualizer3DLabels';
-import {ScatterPlotVisualizerCanvasLabels} from './scatterPlotVisualizerCanvasLabels';
-import {ScatterPlotVisualizerPolylines} from './scatterPlotVisualizerPolylines';
-import {ScatterPlotVisualizerSprites} from './scatterPlotVisualizerSprites';
-import * as vector from './vector';
-
-const LABEL_FONT_SIZE = 10;
-const LABEL_SCALE_DEFAULT = 1.0;
-const LABEL_SCALE_LARGE = 2;
-const LABEL_FILL_COLOR_SELECTED = 0x000000;
-const LABEL_FILL_COLOR_HOVER = 0x000000;
-const LABEL_FILL_COLOR_NEIGHBOR = 0x000000;
-const LABEL_STROKE_COLOR_SELECTED = 0xFFFFFF;
-const LABEL_STROKE_COLOR_HOVER = 0xFFFFFF;
-const LABEL_STROKE_COLOR_NEIGHBOR = 0xFFFFFF;
-
-const POINT_COLOR_UNSELECTED = 0xE3E3E3;
-const POINT_COLOR_NO_SELECTION = 0x7575D9;
-const POINT_COLOR_SELECTED = 0xFA6666;
-const POINT_COLOR_HOVER = 0x760B4F;
-
-const POINT_SCALE_DEFAULT = 1.0;
-const POINT_SCALE_SELECTED = 1.2;
-const POINT_SCALE_NEIGHBOR = 1.2;
-const POINT_SCALE_HOVER = 1.2;
-
-const LABELS_3D_COLOR_UNSELECTED = 0xFFFFFF;
-const LABELS_3D_COLOR_NO_SELECTION = 0xFFFFFF;
-
-const SPRITE_IMAGE_COLOR_UNSELECTED = 0xFFFFFF;
-const SPRITE_IMAGE_COLOR_NO_SELECTION = 0xFFFFFF;
-
-const POLYLINE_START_HUE = 60;
-const POLYLINE_END_HUE = 360;
-const POLYLINE_SATURATION = 1;
-const POLYLINE_LIGHTNESS = .3;
-
-const POLYLINE_DEFAULT_OPACITY = .2;
-const POLYLINE_DEFAULT_LINEWIDTH = 2;
-const POLYLINE_SELECTED_OPACITY = .9;
-const POLYLINE_SELECTED_LINEWIDTH = 3;
-const POLYLINE_DESELECTED_OPACITY = .05;
-
-const SCATTER_PLOT_CUBE_LENGTH = 2;
-
-/** Color scale for nearest neighbors. */
-const NN_COLOR_SCALE =
-    d3.scaleLinear<string, string>()
-        .domain([1, 0.7, 0.4])
-        .range(['hsl(285, 80%, 40%)', 'hsl(0, 80%, 65%)', 'hsl(40, 70%, 60%)'])
-        .clamp(true);
-
-/**
- * Interprets projector events and assembes the arrays and commands necessary
- * to use the ScatterPlot to render the current projected data set.
- */
-export class ProjectorScatterPlotAdapter {
-  public scatterPlot: ScatterPlot;
-  private projection: Projection;
-  private hoverPointIndex: number;
-  private selectedPointIndices: number[];
-  private neighborsOfFirstSelectedPoint: NearestEntry[];
-  private renderLabelsIn3D: boolean = false;
-  private labelPointAccessor: string;
-  private legendPointColorer: (ds: DataSet, index: number) => string;
-  private distanceMetric: DistanceFunction;
-
-  private spriteVisualizer: ScatterPlotVisualizerSprites;
-  private labels3DVisualizer: ScatterPlotVisualizer3DLabels;
-  private canvasLabelsVisualizer: ScatterPlotVisualizerCanvasLabels;
-  private polylineVisualizer: ScatterPlotVisualizerPolylines;
-
-  constructor(
-      private scatterPlotContainer: HTMLElement,
-      projectorEventContext: ProjectorEventContext) {
-    this.scatterPlot =
-        new ScatterPlot(scatterPlotContainer, projectorEventContext);
-    projectorEventContext.registerProjectionChangedListener(projection => {
-      this.projection = projection;
-      this.updateScatterPlotWithNewProjection(projection);
-    });
-    projectorEventContext.registerSelectionChangedListener(
-        (selectedPointIndices, neighbors) => {
-          this.selectedPointIndices = selectedPointIndices;
-          this.neighborsOfFirstSelectedPoint = neighbors;
-          this.updateScatterPlotPositions();
-          this.updateScatterPlotAttributes();
-          this.scatterPlot.render();
-        });
-    projectorEventContext.registerHoverListener(hoverPointIndex => {
-      this.hoverPointIndex = hoverPointIndex;
-      this.updateScatterPlotAttributes();
-      this.scatterPlot.render();
-    });
-    projectorEventContext.registerDistanceMetricChangedListener(
-        distanceMetric => {
-          this.distanceMetric = distanceMetric;
-          this.updateScatterPlotAttributes();
-          this.scatterPlot.render();
-        });
-    this.createVisualizers(false);
-  }
-
-  notifyProjectionPositionsUpdated() {
-    this.updateScatterPlotPositions();
-    this.scatterPlot.render();
-  }
-
-  setDataSet(dataSet: DataSet) {
-    if (this.projection != null) {
-      // TODO(nicholsonc): setDataSet needs to go away, the projection is the
-      // atomic unit of update.
-      this.projection.dataSet = dataSet;
-    }
-    if (this.polylineVisualizer != null) {
-      this.polylineVisualizer.setDataSet(dataSet);
-    }
-    if (this.labels3DVisualizer != null) {
-      this.labels3DVisualizer.setLabelStrings(
-          this.generate3DLabelsArray(dataSet, this.labelPointAccessor));
-    }
-    if (this.spriteVisualizer == null) {
-      return;
-    }
-    this.spriteVisualizer.clearSpriteAtlas();
-    if ((dataSet == null) || (dataSet.spriteAndMetadataInfo == null)) {
-      return;
-    }
-    const metadata = dataSet.spriteAndMetadataInfo;
-    if ((metadata.spriteImage == null) || (metadata.spriteMetadata == null)) {
-      return;
-    }
-    const n = dataSet.points.length;
-    const spriteIndices = new Float32Array(n);
-    for (let i = 0; i < n; ++i) {
-      spriteIndices[i] = dataSet.points[i].index;
-    }
-    this.spriteVisualizer.setSpriteAtlas(
-        metadata.spriteImage, metadata.spriteMetadata.singleImageDim,
-        spriteIndices);
-  }
-
-  set3DLabelMode(renderLabelsIn3D: boolean) {
-    this.renderLabelsIn3D = renderLabelsIn3D;
-    this.createVisualizers(renderLabelsIn3D);
-    this.updateScatterPlotAttributes();
-    this.scatterPlot.render();
-  }
-
-  setLegendPointColorer(
-      legendPointColorer: (ds: DataSet, index: number) => string) {
-    this.legendPointColorer = legendPointColorer;
-  }
-
-  setLabelPointAccessor(labelPointAccessor: string) {
-    this.labelPointAccessor = labelPointAccessor;
-    if (this.labels3DVisualizer != null) {
-      const ds = (this.projection == null) ? null : this.projection.dataSet;
-      this.labels3DVisualizer.setLabelStrings(
-          this.generate3DLabelsArray(ds, labelPointAccessor));
-    }
-  }
-
-  resize() {
-    this.scatterPlot.resize();
-  }
-
-  populateBookmarkFromUI(state: State) {
-    state.cameraDef = this.scatterPlot.getCameraDef();
-  }
-
-  restoreUIFromBookmark(state: State) {
-    this.scatterPlot.setCameraParametersForNextCameraCreation(
-        state.cameraDef, false);
-  }
-
-  updateScatterPlotPositions() {
-    const ds = (this.projection == null) ? null : this.projection.dataSet;
-    const projectionComponents =
-        (this.projection == null) ? null : this.projection.projectionComponents;
-    const newPositions =
-        this.generatePointPositionArray(ds, projectionComponents);
-    this.scatterPlot.setPointPositions(newPositions);
-  }
-
-  updateScatterPlotAttributes() {
-    if (this.projection == null) {
-      return;
-    }
-    const dataSet = this.projection.dataSet;
-    const selectedSet = this.selectedPointIndices;
-    const hoverIndex = this.hoverPointIndex;
-    const neighbors = this.neighborsOfFirstSelectedPoint;
-    const pointColorer = this.legendPointColorer;
-
-    const pointColors = this.generatePointColorArray(
-        dataSet, pointColorer, this.distanceMetric, selectedSet, neighbors,
-        hoverIndex, this.renderLabelsIn3D, this.getSpriteImageMode());
-    const pointScaleFactors = this.generatePointScaleFactorArray(
-        dataSet, selectedSet, neighbors, hoverIndex);
-    const labels = this.generateVisibleLabelRenderParams(
-        dataSet, selectedSet, neighbors, hoverIndex);
-    const polylineColors =
-        this.generateLineSegmentColorMap(dataSet, pointColorer);
-    const polylineOpacities =
-        this.generateLineSegmentOpacityArray(dataSet, selectedSet);
-    const polylineWidths =
-        this.generateLineSegmentWidthArray(dataSet, selectedSet);
-
-    this.scatterPlot.setPointColors(pointColors);
-    this.scatterPlot.setPointScaleFactors(pointScaleFactors);
-    this.scatterPlot.setLabels(labels);
-    this.scatterPlot.setPolylineColors(polylineColors);
-    this.scatterPlot.setPolylineOpacities(polylineOpacities);
-    this.scatterPlot.setPolylineWidths(polylineWidths);
-  }
-
-  render() {
-    this.scatterPlot.render();
-  }
-
-  generatePointPositionArray(
-      ds: DataSet, projectionComponents: ProjectionComponents3D): Float32Array {
-    if (ds == null) {
-      return null;
-    }
-
-    const xScaler = d3.scaleLinear();
-    const yScaler = d3.scaleLinear();
-    let zScaler = null;
-    {
-      // Determine max and min of each axis of our data.
-      const xExtent = d3.extent(
-          ds.points,
-          (p, i) => ds.points[i].projections[projectionComponents[0]]);
-      const yExtent = d3.extent(
-          ds.points,
-          (p, i) => ds.points[i].projections[projectionComponents[1]]);
-
-      const range =
-          [-SCATTER_PLOT_CUBE_LENGTH / 2, SCATTER_PLOT_CUBE_LENGTH / 2];
-
-      xScaler.domain(xExtent).range(range);
-      yScaler.domain(yExtent).range(range);
-
-      if (projectionComponents[2] != null) {
-        const zExtent = d3.extent(
-            ds.points,
-            (p, i) => ds.points[i].projections[projectionComponents[2]]);
-        zScaler = d3.scaleLinear();
-        zScaler.domain(zExtent).range(range);
-      }
-    }
-
-    const positions = new Float32Array(ds.points.length * 3);
-    let dst = 0;
-
-    ds.points.forEach((d, i) => {
-      positions[dst++] =
-          xScaler(ds.points[i].projections[projectionComponents[0]]);
-      positions[dst++] =
-          yScaler(ds.points[i].projections[projectionComponents[1]]);
-      positions[dst++] = 0.0;
-    });
-
-    if (zScaler) {
-      dst = 2;
-      ds.points.forEach((d, i) => {
-        positions[dst] =
-            zScaler(ds.points[i].projections[projectionComponents[2]]);
-        dst += 3;
-      });
-    }
-
-    return positions;
-  }
-
-  generateVisibleLabelRenderParams(
-      ds: DataSet, selectedPointIndices: number[],
-      neighborsOfFirstPoint: NearestEntry[],
-      hoverPointIndex: number): LabelRenderParams {
-    if (ds == null) {
-      return null;
-    }
-
-    const selectedPointCount =
-        (selectedPointIndices == null) ? 0 : selectedPointIndices.length;
-    const neighborCount =
-        (neighborsOfFirstPoint == null) ? 0 : neighborsOfFirstPoint.length;
-    const n = selectedPointCount + neighborCount +
-        ((hoverPointIndex != null) ? 1 : 0);
-
-    const visibleLabels = new Uint32Array(n);
-    const scale = new Float32Array(n);
-    const opacityFlags = new Int8Array(n);
-    const fillColors = new Uint8Array(n * 3);
-    const strokeColors = new Uint8Array(n * 3);
-    const labelStrings: string[] = [];
-
-    scale.fill(LABEL_SCALE_DEFAULT);
-    opacityFlags.fill(1);
-
-    let dst = 0;
-
-    if (hoverPointIndex != null) {
-      labelStrings.push(
-          this.getLabelText(ds, hoverPointIndex, this.labelPointAccessor));
-      visibleLabels[dst] = hoverPointIndex;
-      scale[dst] = LABEL_SCALE_LARGE;
-      opacityFlags[dst] = 0;
-      const fillRgb = styleRgbFromHexColor(LABEL_FILL_COLOR_HOVER);
-      packRgbIntoUint8Array(
-          fillColors, dst, fillRgb[0], fillRgb[1], fillRgb[2]);
-      const strokeRgb = styleRgbFromHexColor(LABEL_STROKE_COLOR_HOVER);
-      packRgbIntoUint8Array(
-          strokeColors, dst, strokeRgb[0], strokeRgb[1], strokeRgb[1]);
-      ++dst;
-    }
-
-    // Selected points
-    {
-      const n = selectedPointCount;
-      const fillRgb = styleRgbFromHexColor(LABEL_FILL_COLOR_SELECTED);
-      const strokeRgb = styleRgbFromHexColor(LABEL_STROKE_COLOR_SELECTED);
-      for (let i = 0; i < n; ++i) {
-        const labelIndex = selectedPointIndices[i];
-        labelStrings.push(
-            this.getLabelText(ds, labelIndex, this.labelPointAccessor));
-        visibleLabels[dst] = labelIndex;
-        scale[dst] = LABEL_SCALE_LARGE;
-        opacityFlags[dst] = (n === 1) ? 0 : 1;
-        packRgbIntoUint8Array(
-            fillColors, dst, fillRgb[0], fillRgb[1], fillRgb[2]);
-        packRgbIntoUint8Array(
-            strokeColors, dst, strokeRgb[0], strokeRgb[1], strokeRgb[2]);
-        ++dst;
-      }
-    }
-
-    // Neighbors
-    {
-      const n = neighborCount;
-      const fillRgb = styleRgbFromHexColor(LABEL_FILL_COLOR_NEIGHBOR);
-      const strokeRgb = styleRgbFromHexColor(LABEL_STROKE_COLOR_NEIGHBOR);
-      for (let i = 0; i < n; ++i) {
-        const labelIndex = neighborsOfFirstPoint[i].index;
-        labelStrings.push(
-            this.getLabelText(ds, labelIndex, this.labelPointAccessor));
-        visibleLabels[dst] = labelIndex;
-        packRgbIntoUint8Array(
-            fillColors, dst, fillRgb[0], fillRgb[1], fillRgb[2]);
-        packRgbIntoUint8Array(
-            strokeColors, dst, strokeRgb[0], strokeRgb[1], strokeRgb[2]);
-        ++dst;
-      }
-    }
-
-    return new LabelRenderParams(
-        new Float32Array(visibleLabels), labelStrings, scale, opacityFlags,
-        LABEL_FONT_SIZE, fillColors, strokeColors);
-  }
-
-  generatePointScaleFactorArray(
-      ds: DataSet, selectedPointIndices: number[],
-      neighborsOfFirstPoint: NearestEntry[],
-      hoverPointIndex: number): Float32Array {
-    if (ds == null) {
-      return new Float32Array(0);
-    }
-
-    const scale = new Float32Array(ds.points.length);
-    scale.fill(POINT_SCALE_DEFAULT);
-
-    const selectedPointCount =
-        (selectedPointIndices == null) ? 0 : selectedPointIndices.length;
-    const neighborCount =
-        (neighborsOfFirstPoint == null) ? 0 : neighborsOfFirstPoint.length;
-
-    // Scale up all selected points.
-    {
-      const n = selectedPointCount;
-      for (let i = 0; i < n; ++i) {
-        const p = selectedPointIndices[i];
-        scale[p] = POINT_SCALE_SELECTED;
-      }
-    }
-
-    // Scale up the neighbor points.
-    {
-      const n = neighborCount;
-      for (let i = 0; i < n; ++i) {
-        const p = neighborsOfFirstPoint[i].index;
-        scale[p] = POINT_SCALE_NEIGHBOR;
-      }
-    }
-
-    // Scale up the hover point.
-    if (hoverPointIndex != null) {
-      scale[hoverPointIndex] = POINT_SCALE_HOVER;
-    }
-
-    return scale;
-  }
-
-  generateLineSegmentColorMap(
-      ds: DataSet, legendPointColorer: (ds: DataSet, index: number) => string):
-      {[polylineIndex: number]: Float32Array} {
-    let polylineColorArrayMap: {[polylineIndex: number]: Float32Array} = {};
-    if (ds == null) {
-      return polylineColorArrayMap;
-    }
-
-    for (let i = 0; i < ds.sequences.length; i++) {
-      let sequence = ds.sequences[i];
-      let colors = new Float32Array(2 * (sequence.pointIndices.length - 1) * 3);
-      let colorIndex = 0;
-
-      if (legendPointColorer) {
-        for (let j = 0; j < sequence.pointIndices.length - 1; j++) {
-          const c1 =
-              new THREE.Color(legendPointColorer(ds, sequence.pointIndices[j]));
-          const c2 = new THREE.Color(
-              legendPointColorer(ds, sequence.pointIndices[j + 1]));
-          colors[colorIndex++] = c1.r;
-          colors[colorIndex++] = c1.g;
-          colors[colorIndex++] = c1.b;
-          colors[colorIndex++] = c2.r;
-          colors[colorIndex++] = c2.g;
-          colors[colorIndex++] = c2.b;
-        }
-      } else {
-        for (let j = 0; j < sequence.pointIndices.length - 1; j++) {
-          const c1 =
-              getDefaultPointInPolylineColor(j, sequence.pointIndices.length);
-          const c2 = getDefaultPointInPolylineColor(
-              j + 1, sequence.pointIndices.length);
-          colors[colorIndex++] = c1.r;
-          colors[colorIndex++] = c1.g;
-          colors[colorIndex++] = c1.b;
-          colors[colorIndex++] = c2.r;
-          colors[colorIndex++] = c2.g;
-          colors[colorIndex++] = c2.b;
-        }
-      }
-
-      polylineColorArrayMap[i] = colors;
-    }
-
-    return polylineColorArrayMap;
-  }
-
-  generateLineSegmentOpacityArray(ds: DataSet, selectedPoints: number[]):
-      Float32Array {
-    if (ds == null) {
-      return new Float32Array(0);
-    }
-    const opacities = new Float32Array(ds.sequences.length);
-    const selectedPointCount =
-        (selectedPoints == null) ? 0 : selectedPoints.length;
-    if (selectedPointCount > 0) {
-      opacities.fill(POLYLINE_DESELECTED_OPACITY);
-      const i = ds.points[selectedPoints[0]].sequenceIndex;
-      opacities[i] = POLYLINE_SELECTED_OPACITY;
-    } else {
-      opacities.fill(POLYLINE_DEFAULT_OPACITY);
-    }
-    return opacities;
-  }
-
-  generateLineSegmentWidthArray(ds: DataSet, selectedPoints: number[]):
-      Float32Array {
-    if (ds == null) {
-      return new Float32Array(0);
-    }
-    const widths = new Float32Array(ds.sequences.length);
-    widths.fill(POLYLINE_DEFAULT_LINEWIDTH);
-    const selectedPointCount =
-        (selectedPoints == null) ? 0 : selectedPoints.length;
-    if (selectedPointCount > 0) {
-      const i = ds.points[selectedPoints[0]].sequenceIndex;
-      widths[i] = POLYLINE_SELECTED_LINEWIDTH;
-    }
-    return widths;
-  }
-
-  generatePointColorArray(
-      ds: DataSet, legendPointColorer: (ds: DataSet, index: number) => string,
-      distFunc: DistanceFunction, selectedPointIndices: number[],
-      neighborsOfFirstPoint: NearestEntry[], hoverPointIndex: number,
-      label3dMode: boolean, spriteImageMode: boolean): Float32Array {
-    if (ds == null) {
-      return new Float32Array(0);
-    }
-
-    const selectedPointCount =
-        (selectedPointIndices == null) ? 0 : selectedPointIndices.length;
-    const neighborCount =
-        (neighborsOfFirstPoint == null) ? 0 : neighborsOfFirstPoint.length;
-    const colors = new Float32Array(ds.points.length * 3);
-
-    let unselectedColor = POINT_COLOR_UNSELECTED;
-    let noSelectionColor = POINT_COLOR_NO_SELECTION;
-
-    if (label3dMode) {
-      unselectedColor = LABELS_3D_COLOR_UNSELECTED;
-      noSelectionColor = LABELS_3D_COLOR_NO_SELECTION;
-    }
-
-    if (spriteImageMode) {
-      unselectedColor = SPRITE_IMAGE_COLOR_UNSELECTED;
-      noSelectionColor = SPRITE_IMAGE_COLOR_NO_SELECTION;
-    }
-
-    // Give all points the unselected color.
-    {
-      const n = ds.points.length;
-      let dst = 0;
-      if (selectedPointCount > 0) {
-        const c = new THREE.Color(unselectedColor);
-        for (let i = 0; i < n; ++i) {
-          colors[dst++] = c.r;
-          colors[dst++] = c.g;
-          colors[dst++] = c.b;
-        }
-      } else {
-        if (legendPointColorer != null) {
-          for (let i = 0; i < n; ++i) {
-            const c = new THREE.Color(legendPointColorer(ds, i));
-            colors[dst++] = c.r;
-            colors[dst++] = c.g;
-            colors[dst++] = c.b;
-          }
-        } else {
-          const c = new THREE.Color(noSelectionColor);
-          for (let i = 0; i < n; ++i) {
-            colors[dst++] = c.r;
-            colors[dst++] = c.g;
-            colors[dst++] = c.b;
-          }
-        }
-      }
-    }
-
-    // Color the selected points.
-    {
-      const n = selectedPointCount;
-      const c = new THREE.Color(POINT_COLOR_SELECTED);
-      for (let i = 0; i < n; ++i) {
-        let dst = selectedPointIndices[i] * 3;
-        colors[dst++] = c.r;
-        colors[dst++] = c.g;
-        colors[dst++] = c.b;
-      }
-    }
-
-    // Color the neighbors.
-    {
-      const n = neighborCount;
-      let minDist = n > 0 ? neighborsOfFirstPoint[0].dist : 0;
-      for (let i = 0; i < n; ++i) {
-        const c = new THREE.Color(
-            dist2color(distFunc, neighborsOfFirstPoint[i].dist, minDist));
-        let dst = neighborsOfFirstPoint[i].index * 3;
-        colors[dst++] = c.r;
-        colors[dst++] = c.g;
-        colors[dst++] = c.b;
-      }
-    }
-
-    // Color the hover point.
-    if (hoverPointIndex != null) {
-      const c = new THREE.Color(POINT_COLOR_HOVER);
-      let dst = hoverPointIndex * 3;
-      colors[dst++] = c.r;
-      colors[dst++] = c.g;
-      colors[dst++] = c.b;
-    }
-
-    return colors;
-  }
-
-  generate3DLabelsArray(ds: DataSet, accessor: string) {
-    if ((ds == null) || (accessor == null)) {
-      return null;
-    }
-    let labels: string[] = [];
-    const n = ds.points.length;
-    for (let i = 0; i < n; ++i) {
-      labels.push(this.getLabelText(ds, i, accessor));
-    }
-    return labels;
-  }
-
-  private getLabelText(ds: DataSet, i: number, accessor: string) {
-    return ds.points[i].metadata[accessor].toString();
-  }
-
-  private updateScatterPlotWithNewProjection(projection: Projection) {
-    if (projection == null) {
-      this.createVisualizers(this.renderLabelsIn3D);
-      this.scatterPlot.render();
-      return;
-    }
-    this.setDataSet(projection.dataSet);
-    this.scatterPlot.setDimensions(projection.dimensionality);
-    if (projection.dataSet.projectionCanBeRendered(projection.projectionType)) {
-      this.updateScatterPlotAttributes();
-      this.notifyProjectionPositionsUpdated();
-    }
-    this.scatterPlot.setCameraParametersForNextCameraCreation(null, false);
-  }
-
-  private createVisualizers(inLabels3DMode: boolean) {
-    const ds = (this.projection == null) ? null : this.projection.dataSet;
-    const scatterPlot = this.scatterPlot;
-    scatterPlot.removeAllVisualizers();
-    this.labels3DVisualizer = null;
-    this.canvasLabelsVisualizer = null;
-    this.spriteVisualizer = null;
-    this.polylineVisualizer = null;
-    if (inLabels3DMode) {
-      this.labels3DVisualizer = new ScatterPlotVisualizer3DLabels();
-      this.labels3DVisualizer.setLabelStrings(
-          this.generate3DLabelsArray(ds, this.labelPointAccessor));
-    } else {
-      this.spriteVisualizer = new ScatterPlotVisualizerSprites();
-      scatterPlot.addVisualizer(this.spriteVisualizer);
-      this.canvasLabelsVisualizer =
-          new ScatterPlotVisualizerCanvasLabels(this.scatterPlotContainer);
-    }
-    this.polylineVisualizer = new ScatterPlotVisualizerPolylines();
-    this.setDataSet(ds);
-    if (this.spriteVisualizer) {
-      scatterPlot.addVisualizer(this.spriteVisualizer);
-    }
-    if (this.labels3DVisualizer) {
-      scatterPlot.addVisualizer(this.labels3DVisualizer);
-    }
-    if (this.canvasLabelsVisualizer) {
-      scatterPlot.addVisualizer(this.canvasLabelsVisualizer);
-    }
-    scatterPlot.addVisualizer(this.polylineVisualizer);
-  }
-
-  private getSpriteImageMode(): boolean {
-    if (this.projection == null) {
-      return false;
-    }
-    const ds = this.projection.dataSet;
-    if ((ds == null) || (ds.spriteAndMetadataInfo == null)) {
-      return false;
-    }
-    return ds.spriteAndMetadataInfo.spriteImage != null;
-  }
-}
-
-function packRgbIntoUint8Array(
-    rgbArray: Uint8Array, labelIndex: number, r: number, g: number, b: number) {
-  rgbArray[labelIndex * 3] = r;
-  rgbArray[labelIndex * 3 + 1] = g;
-  rgbArray[labelIndex * 3 + 2] = b;
-}
-
-function styleRgbFromHexColor(hex: number): [number, number, number] {
-  const c = new THREE.Color(hex);
-  return [(c.r * 255) | 0, (c.g * 255) | 0, (c.b * 255) | 0];
-}
-
-function getDefaultPointInPolylineColor(
-    index: number, totalPoints: number): THREE.Color {
-  let hue = POLYLINE_START_HUE +
-      (POLYLINE_END_HUE - POLYLINE_START_HUE) * index / totalPoints;
-
-  let rgb = d3.hsl(hue, POLYLINE_SATURATION, POLYLINE_LIGHTNESS).rgb();
-  return new THREE.Color(rgb.r / 255, rgb.g / 255, rgb.b / 255);
-}
-
-/**
- * Normalizes the distance so it can be visually encoded with color.
- * The normalization depends on the distance metric (cosine vs euclidean).
- */
-export function normalizeDist(
-    distFunc: DistanceFunction, d: number, minDist: number): number {
-  return (distFunc === vector.dist) ? (minDist / d) : (1 - d);
-}
-
-/** Normalizes and encodes the provided distance with color. */
-export function dist2color(
-    distFunc: DistanceFunction, d: number, minDist: number): string {
-  return NN_COLOR_SCALE(normalizeDist(distFunc, d, minDist));
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/renderContext.ts b/tensorflow/tensorboard/components/vz_projector/renderContext.ts
deleted file mode 100644
index 8d5232a80480cf4f853372cf18bca0f29f3f36af..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/renderContext.ts
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http:www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/**
- * LabelRenderParams describes the set of points that should have labels
- * rendered next to them.
- */
-export class LabelRenderParams {
-  constructor(
-      public pointIndices: Float32Array, public labelStrings: string[],
-      public scaleFactors: Float32Array, public useSceneOpacityFlags: Int8Array,
-      public defaultFontSize: number, public fillColors: Uint8Array,
-      public strokeColors: Uint8Array) {}
-}
-
-/** Details about the camera projection being used to render the scene. */
-export enum CameraType {
-  Perspective,
-  Orthographic
-}
-
-/**
- * RenderContext contains all of the state required to color and render the data
- * set. ScatterPlot passes this to every attached visualizer as part of the
- * render callback.
- * TODO(nicholsonc): This should only contain the data that's changed between
- * each frame. Data like colors / scale factors / labels should be reapplied
- * only when they change.
- */
-export class RenderContext {
-  constructor(
-      public camera: THREE.Camera, public cameraType: CameraType,
-      public cameraTarget: THREE.Vector3, public screenWidth: number,
-      public screenHeight: number, public nearestCameraSpacePointZ: number,
-      public farthestCameraSpacePointZ: number, public backgroundColor: number,
-      public pointColors: Float32Array, public pointScaleFactors: Float32Array,
-      public labels: LabelRenderParams,
-      public polylineColors: {[polylineIndex: number]: Float32Array},
-      public polylineOpacities: Float32Array,
-      public polylineWidths: Float32Array) {}
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlot.ts b/tensorflow/tensorboard/components/vz_projector/scatterPlot.ts
deleted file mode 100644
index 283b608e836b934c81ae25891ac257b0ad7c1193..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/scatterPlot.ts
+++ /dev/null
@@ -1,723 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {ProjectorEventContext} from './projectorEventContext';
-import {CameraType, LabelRenderParams, RenderContext} from './renderContext';
-import {BoundingBox, ScatterPlotRectangleSelector} from './scatterPlotRectangleSelector';
-import {ScatterPlotVisualizer} from './scatterPlotVisualizer';
-import * as util from './util';
-import {Point2D, Point3D} from './vector';
-
-const BACKGROUND_COLOR = 0xffffff;
-
-/**
- * The length of the cube (diameter of the circumscribing sphere) where all the
- * points live.
- */
-const CUBE_LENGTH = 2;
-const MAX_ZOOM = 5 * CUBE_LENGTH;
-const MIN_ZOOM = 0.025 * CUBE_LENGTH;
-
-// Constants relating to the camera parameters.
-const PERSP_CAMERA_FOV_VERTICAL = 70;
-const PERSP_CAMERA_NEAR_CLIP_PLANE = 0.01;
-const PERSP_CAMERA_FAR_CLIP_PLANE = 100;
-const ORTHO_CAMERA_FRUSTUM_HALF_EXTENT = 1.2;
-
-// Key presses.
-const SHIFT_KEY = 16;
-const CTRL_KEY = 17;
-
-const START_CAMERA_POS_3D = new THREE.Vector3(0.45, 0.9, 1.6);
-const START_CAMERA_TARGET_3D = new THREE.Vector3(0, 0, 0);
-const START_CAMERA_POS_2D = new THREE.Vector3(0, 0, 4);
-const START_CAMERA_TARGET_2D = new THREE.Vector3(0, 0, 0);
-
-const ORBIT_MOUSE_ROTATION_SPEED = 1;
-const ORBIT_ANIMATION_ROTATION_CYCLE_IN_SECONDS = 7;
-
-export type OnCameraMoveListener =
-    (cameraPosition: THREE.Vector3, cameraTarget: THREE.Vector3) => void;
-
-/** Supported modes of interaction. */
-export enum MouseMode {
-  AREA_SELECT,
-  CAMERA_AND_CLICK_SELECT
-}
-
-/** Defines a camera, suitable for serialization. */
-export class CameraDef {
-  orthographic: boolean = false;
-  position: Point3D;
-  target: Point3D;
-  zoom: number;
-}
-
-/**
- * Maintains a three.js instantiation and context,
- * animation state, and all other logic that's
- * independent of how a 3D scatter plot is actually rendered. Also holds an
- * array of visualizers and dispatches application events to them.
- */
-export class ScatterPlot {
-  private visualizers: ScatterPlotVisualizer[] = [];
-
-  private onCameraMoveListeners: OnCameraMoveListener[] = [];
-
-  private height: number;
-  private width: number;
-
-  private mouseMode: MouseMode;
-  private backgroundColor: number = BACKGROUND_COLOR;
-
-  private dimensionality: number = 3;
-  private renderer: THREE.WebGLRenderer;
-
-  private scene: THREE.Scene;
-  private pickingTexture: THREE.WebGLRenderTarget;
-  private light: THREE.PointLight;
-
-  private cameraDef: CameraDef = null;
-  private camera: THREE.Camera;
-  private orbitAnimationOnNextCameraCreation: boolean = false;
-  private orbitCameraControls: any;
-  private orbitAnimationId: number;
-
-  private worldSpacePointPositions: Float32Array;
-  private pointColors: Float32Array;
-  private pointScaleFactors: Float32Array;
-  private labels: LabelRenderParams;
-  private polylineColors: {[polylineIndex: number]: Float32Array};
-  private polylineOpacities: Float32Array;
-  private polylineWidths: Float32Array;
-
-  private selecting = false;
-  private nearestPoint: number;
-  private mouseIsDown = false;
-  private isDragSequence = false;
-  private rectangleSelector: ScatterPlotRectangleSelector;
-
-  constructor(
-      private container: HTMLElement,
-      private projectorEventContext: ProjectorEventContext) {
-    this.getLayoutValues();
-
-    this.scene = new THREE.Scene();
-    this.renderer = new THREE.WebGLRenderer(
-        {alpha: true, premultipliedAlpha: false, antialias: false});
-    this.renderer.setClearColor(BACKGROUND_COLOR, 1);
-    this.container.appendChild(this.renderer.domElement);
-    this.light = new THREE.PointLight(0xFFECBF, 1, 0);
-    this.scene.add(this.light);
-
-    this.setDimensions(3);
-    this.recreateCamera(this.makeDefaultCameraDef(this.dimensionality));
-    this.renderer.render(this.scene, this.camera);
-
-    this.rectangleSelector = new ScatterPlotRectangleSelector(
-        this.container,
-        (boundingBox: BoundingBox) => this.selectBoundingBox(boundingBox));
-    this.addInteractionListeners();
-  }
-
-  private addInteractionListeners() {
-    this.container.addEventListener('mousemove', this.onMouseMove.bind(this));
-    this.container.addEventListener('mousedown', this.onMouseDown.bind(this));
-    this.container.addEventListener('mouseup', this.onMouseUp.bind(this));
-    this.container.addEventListener('click', this.onClick.bind(this));
-    window.addEventListener('keydown', this.onKeyDown.bind(this), false);
-    window.addEventListener('keyup', this.onKeyUp.bind(this), false);
-  }
-
-  private addCameraControlsEventListeners(cameraControls: any) {
-    // Start is called when the user stars interacting with
-    // controls.
-    cameraControls.addEventListener('start', () => {
-      this.stopOrbitAnimation();
-      this.onCameraMoveListeners.forEach(
-          l => l(this.camera.position, cameraControls.target));
-    });
-
-    // Change is called everytime the user interacts with the controls.
-    cameraControls.addEventListener('change', () => {
-      this.render();
-    });
-
-    // End is called when the user stops interacting with the
-    // controls (e.g. on mouse up, after dragging).
-    cameraControls.addEventListener('end', () => {});
-  }
-
-  private makeOrbitControls(
-      camera: THREE.Camera, cameraDef: CameraDef, cameraIs3D: boolean) {
-    if (this.orbitCameraControls != null) {
-      this.orbitCameraControls.dispose();
-    }
-    const occ =
-        new (THREE as any).OrbitControls(camera, this.renderer.domElement);
-    occ.target0 = new THREE.Vector3(
-        cameraDef.target[0], cameraDef.target[1], cameraDef.target[2]);
-    occ.position0 = new THREE.Vector3().copy(camera.position);
-    occ.zoom0 = cameraDef.zoom;
-    occ.enableRotate = cameraIs3D;
-    occ.autoRotate = false;
-    occ.rotateSpeed = ORBIT_MOUSE_ROTATION_SPEED;
-    if (cameraIs3D) {
-      occ.mouseButtons.ORBIT = THREE.MOUSE.LEFT;
-      occ.mouseButtons.PAN = THREE.MOUSE.RIGHT;
-    } else {
-      occ.mouseButtons.ORBIT = null;
-      occ.mouseButtons.PAN = THREE.MOUSE.LEFT;
-    }
-    occ.reset();
-
-    this.camera = camera;
-    this.orbitCameraControls = occ;
-    this.addCameraControlsEventListeners(this.orbitCameraControls);
-  }
-
-  private makeCamera3D(cameraDef: CameraDef, w: number, h: number) {
-    let camera: THREE.PerspectiveCamera;
-    {
-      const aspectRatio = w / h;
-      camera = new THREE.PerspectiveCamera(
-          PERSP_CAMERA_FOV_VERTICAL, aspectRatio, PERSP_CAMERA_NEAR_CLIP_PLANE,
-          PERSP_CAMERA_FAR_CLIP_PLANE);
-      camera.position.set(
-          cameraDef.position[0], cameraDef.position[1], cameraDef.position[2]);
-      const at = new THREE.Vector3(
-          cameraDef.target[0], cameraDef.target[1], cameraDef.target[2]);
-      camera.lookAt(at);
-      camera.zoom = cameraDef.zoom;
-      camera.updateProjectionMatrix();
-    }
-    this.camera = camera;
-    this.makeOrbitControls(camera, cameraDef, true);
-  }
-
-  private makeCamera2D(cameraDef: CameraDef, w: number, h: number) {
-    let camera: THREE.OrthographicCamera;
-    const target = new THREE.Vector3(
-        cameraDef.target[0], cameraDef.target[1], cameraDef.target[2]);
-    {
-      const aspectRatio = w / h;
-      let left = -ORTHO_CAMERA_FRUSTUM_HALF_EXTENT;
-      let right = ORTHO_CAMERA_FRUSTUM_HALF_EXTENT;
-      let bottom = -ORTHO_CAMERA_FRUSTUM_HALF_EXTENT;
-      let top = ORTHO_CAMERA_FRUSTUM_HALF_EXTENT;
-      // Scale up the larger of (w, h) to match the aspect ratio.
-      if (aspectRatio > 1) {
-        left *= aspectRatio;
-        right *= aspectRatio;
-      } else {
-        top /= aspectRatio;
-        bottom /= aspectRatio;
-      }
-      camera =
-          new THREE.OrthographicCamera(left, right, top, bottom, -1000, 1000);
-      camera.position.set(
-          cameraDef.position[0], cameraDef.position[1], cameraDef.position[2]);
-      camera.up = new THREE.Vector3(0, 1, 0);
-      camera.lookAt(target);
-      camera.zoom = cameraDef.zoom;
-      camera.updateProjectionMatrix();
-    }
-    this.camera = camera;
-    this.makeOrbitControls(camera, cameraDef, false);
-  }
-
-  private makeDefaultCameraDef(dimensionality: number): CameraDef {
-    const def = new CameraDef();
-    def.orthographic = (dimensionality === 2);
-    def.zoom = 1.0;
-    if (def.orthographic) {
-      def.position =
-          [START_CAMERA_POS_2D.x, START_CAMERA_POS_2D.y, START_CAMERA_POS_2D.z];
-      def.target = [
-        START_CAMERA_TARGET_2D.x, START_CAMERA_TARGET_2D.y,
-        START_CAMERA_TARGET_2D.z
-      ];
-    } else {
-      def.position =
-          [START_CAMERA_POS_3D.x, START_CAMERA_POS_3D.y, START_CAMERA_POS_3D.z];
-      def.target = [
-        START_CAMERA_TARGET_3D.x, START_CAMERA_TARGET_3D.y,
-        START_CAMERA_TARGET_3D.z
-      ];
-    }
-    return def;
-  }
-
-  /** Recreate the scatter plot camera from a definition structure. */
-  recreateCamera(cameraDef: CameraDef) {
-    if (cameraDef.orthographic) {
-      this.makeCamera2D(cameraDef, this.width, this.height);
-    } else {
-      this.makeCamera3D(cameraDef, this.width, this.height);
-    }
-    this.orbitCameraControls.minDistance = MIN_ZOOM;
-    this.orbitCameraControls.maxDistance = MAX_ZOOM;
-    this.orbitCameraControls.update();
-    if (this.orbitAnimationOnNextCameraCreation) {
-      this.startOrbitAnimation();
-    }
-  }
-
-  private onClick(e?: MouseEvent, notify = true) {
-    if (e && this.selecting) {
-      return;
-    }
-    // Only call event handlers if the click originated from the scatter plot.
-    if (!this.isDragSequence && notify) {
-      const selection = (this.nearestPoint != null) ? [this.nearestPoint] : [];
-      this.projectorEventContext.notifySelectionChanged(selection);
-    }
-    this.isDragSequence = false;
-    this.render();
-  }
-
-  private onMouseDown(e: MouseEvent) {
-    this.isDragSequence = false;
-    this.mouseIsDown = true;
-    if (this.selecting) {
-      this.orbitCameraControls.enabled = false;
-      this.rectangleSelector.onMouseDown(e.offsetX, e.offsetY);
-      this.setNearestPointToMouse(e);
-    } else if (
-        !e.ctrlKey && this.sceneIs3D() &&
-        this.orbitCameraControls.mouseButtons.ORBIT === THREE.MOUSE.RIGHT) {
-      // The user happened to press the ctrl key when the tab was active,
-      // unpressed the ctrl when the tab was inactive, and now he/she
-      // is back to the projector tab.
-      this.orbitCameraControls.mouseButtons.ORBIT = THREE.MOUSE.LEFT;
-      this.orbitCameraControls.mouseButtons.PAN = THREE.MOUSE.RIGHT;
-    } else if (
-        e.ctrlKey && this.sceneIs3D() &&
-        this.orbitCameraControls.mouseButtons.ORBIT === THREE.MOUSE.LEFT) {
-      // Similarly to the situation above.
-      this.orbitCameraControls.mouseButtons.ORBIT = THREE.MOUSE.RIGHT;
-      this.orbitCameraControls.mouseButtons.PAN = THREE.MOUSE.LEFT;
-    }
-  }
-
-  /** When we stop dragging/zooming, return to normal behavior. */
-  private onMouseUp(e: any) {
-    if (this.selecting) {
-      this.orbitCameraControls.enabled = true;
-      this.rectangleSelector.onMouseUp();
-      this.render();
-    }
-    this.mouseIsDown = false;
-  }
-
-  /**
-   * When the mouse moves, find the nearest point (if any) and send it to the
-   * hoverlisteners (usually called from embedding.ts)
-   */
-  private onMouseMove(e: MouseEvent) {
-    this.isDragSequence = this.mouseIsDown;
-    // Depending if we're selecting or just navigating, handle accordingly.
-    if (this.selecting && this.mouseIsDown) {
-      this.rectangleSelector.onMouseMove(e.offsetX, e.offsetY);
-      this.render();
-    } else if (!this.mouseIsDown) {
-      this.setNearestPointToMouse(e);
-      this.projectorEventContext.notifyHoverOverPoint(this.nearestPoint);
-    }
-  }
-
-  /** For using ctrl + left click as right click, and for circle select */
-  private onKeyDown(e: any) {
-    // If ctrl is pressed, use left click to orbit
-    if (e.keyCode === CTRL_KEY && this.sceneIs3D()) {
-      this.orbitCameraControls.mouseButtons.ORBIT = THREE.MOUSE.RIGHT;
-      this.orbitCameraControls.mouseButtons.PAN = THREE.MOUSE.LEFT;
-    }
-
-    // If shift is pressed, start selecting
-    if (e.keyCode === SHIFT_KEY) {
-      this.selecting = true;
-      this.container.style.cursor = 'crosshair';
-    }
-  }
-
-  /** For using ctrl + left click as right click, and for circle select */
-  private onKeyUp(e: any) {
-    if (e.keyCode === CTRL_KEY && this.sceneIs3D()) {
-      this.orbitCameraControls.mouseButtons.ORBIT = THREE.MOUSE.LEFT;
-      this.orbitCameraControls.mouseButtons.PAN = THREE.MOUSE.RIGHT;
-    }
-
-    // If shift is released, stop selecting
-    if (e.keyCode === SHIFT_KEY) {
-      this.selecting = (this.getMouseMode() === MouseMode.AREA_SELECT);
-      if (!this.selecting) {
-        this.container.style.cursor = 'default';
-      }
-      this.render();
-    }
-  }
-
-  /**
-   * Returns a list of indices of points in a bounding box from the picking
-   * texture.
-   * @param boundingBox The bounding box to select from.
-   */
-  private getPointIndicesFromPickingTexture(boundingBox: BoundingBox):
-      number[] {
-    if (this.worldSpacePointPositions == null) {
-      return null;
-    }
-    const pointCount = this.worldSpacePointPositions.length / 3;
-    const dpr = window.devicePixelRatio || 1;
-    const x = Math.floor(boundingBox.x * dpr);
-    const y = Math.floor(boundingBox.y * dpr);
-    const width = Math.floor(boundingBox.width * dpr);
-    const height = Math.floor(boundingBox.height * dpr);
-
-    // Create buffer for reading all of the pixels from the texture.
-    let pixelBuffer = new Uint8Array(width * height * 4);
-
-    // Read the pixels from the bounding box.
-    this.renderer.readRenderTargetPixels(
-        this.pickingTexture, x, this.pickingTexture.height - y, width, height,
-        pixelBuffer);
-
-    // Keep a flat list of each point and whether they are selected or not. This
-    // approach is more efficient than using an object keyed by the index.
-    let pointIndicesSelection =
-        new Uint8Array(this.worldSpacePointPositions.length);
-    for (let i = 0; i < width * height; i++) {
-      const id = (pixelBuffer[i * 4] << 16) | (pixelBuffer[i * 4 + 1] << 8) |
-          pixelBuffer[i * 4 + 2];
-      if (id !== 0xffffff && (id < pointCount)) {
-        pointIndicesSelection[id] = 1;
-      }
-    }
-    let pointIndices: number[] = [];
-    for (let i = 0; i < pointIndicesSelection.length; i++) {
-      if (pointIndicesSelection[i] === 1) {
-        pointIndices.push(i);
-      }
-    }
-
-    return pointIndices;
-  }
-
-
-  private selectBoundingBox(boundingBox: BoundingBox) {
-    let pointIndices = this.getPointIndicesFromPickingTexture(boundingBox);
-    this.projectorEventContext.notifySelectionChanged(pointIndices);
-  }
-
-  private setNearestPointToMouse(e: MouseEvent) {
-    if (this.pickingTexture == null) {
-      this.nearestPoint = null;
-      return;
-    }
-    const boundingBox:
-        BoundingBox = {x: e.offsetX, y: e.offsetY, width: 1, height: 1};
-    const pointIndices = this.getPointIndicesFromPickingTexture(boundingBox);
-    this.nearestPoint = (pointIndices != null) ? pointIndices[0] : null;
-  }
-
-  private getLayoutValues(): Point2D {
-    this.width = this.container.offsetWidth;
-    this.height = Math.max(1, this.container.offsetHeight);
-    return [this.width, this.height];
-  }
-
-  private sceneIs3D(): boolean {
-    return this.dimensionality === 3;
-  }
-
-  private remove3dAxisFromScene(): THREE.Object3D {
-    const axes = this.scene.getObjectByName('axes');
-    if (axes != null) {
-      this.scene.remove(axes);
-    }
-    return axes;
-  }
-
-  private add3dAxis() {
-    const axes = new THREE.AxisHelper();
-    axes.name = 'axes';
-    this.scene.add(axes);
-  }
-
-  /** Set 2d vs 3d mode. */
-  setDimensions(dimensionality: number) {
-    if ((dimensionality !== 2) && (dimensionality !== 3)) {
-      throw new RangeError('dimensionality must be 2 or 3');
-    }
-    this.dimensionality = dimensionality;
-
-    const def = this.cameraDef || this.makeDefaultCameraDef(dimensionality);
-    this.recreateCamera(def);
-
-    this.remove3dAxisFromScene();
-    if (dimensionality === 3) {
-      this.add3dAxis();
-    }
-  }
-
-  /** Gets the current camera information, suitable for serialization. */
-  getCameraDef(): CameraDef {
-    const def = new CameraDef();
-    const pos = this.camera.position;
-    const tgt = this.orbitCameraControls.target;
-    def.orthographic = !this.sceneIs3D();
-    def.position = [pos.x, pos.y, pos.z];
-    def.target = [tgt.x, tgt.y, tgt.z];
-    def.zoom = (this.camera as any).zoom;
-    return def;
-  }
-
-  /** Sets parameters for the next camera recreation. */
-  setCameraParametersForNextCameraCreation(
-      def: CameraDef, orbitAnimation: boolean) {
-    this.cameraDef = def;
-    this.orbitAnimationOnNextCameraCreation = orbitAnimation;
-  }
-
-  /** Gets the current camera position. */
-  getCameraPosition(): Point3D {
-    const currPos = this.camera.position;
-    return [currPos.x, currPos.y, currPos.z];
-  }
-
-  /** Gets the current camera target. */
-  getCameraTarget(): Point3D {
-    let currTarget = this.orbitCameraControls.target;
-    return [currTarget.x, currTarget.y, currTarget.z];
-  }
-
-  /** Sets up the camera from given position and target coordinates. */
-  setCameraPositionAndTarget(position: Point3D, target: Point3D) {
-    this.stopOrbitAnimation();
-    this.camera.position.set(position[0], position[1], position[2]);
-    this.orbitCameraControls.target.set(target[0], target[1], target[2]);
-    this.orbitCameraControls.update();
-    this.render();
-  }
-
-  /** Starts orbiting the camera around its current lookat target. */
-  startOrbitAnimation() {
-    if (!this.sceneIs3D()) {
-      return;
-    }
-    if (this.orbitAnimationId != null) {
-      this.stopOrbitAnimation();
-    }
-    this.orbitCameraControls.autoRotate = true;
-    this.orbitCameraControls.rotateSpeed =
-        ORBIT_ANIMATION_ROTATION_CYCLE_IN_SECONDS;
-    this.updateOrbitAnimation();
-  }
-
-  private updateOrbitAnimation() {
-    this.orbitCameraControls.update();
-    this.orbitAnimationId =
-        requestAnimationFrame(() => this.updateOrbitAnimation());
-  }
-
-  /** Stops the orbiting animation on the camera. */
-  stopOrbitAnimation() {
-    this.orbitCameraControls.autoRotate = false;
-    this.orbitCameraControls.rotateSpeed = ORBIT_MOUSE_ROTATION_SPEED;
-    if (this.orbitAnimationId != null) {
-      cancelAnimationFrame(this.orbitAnimationId);
-      this.orbitAnimationId = null;
-    }
-  }
-
-  /** Adds a visualizer to the set, will start dispatching events to it */
-  addVisualizer(visualizer: ScatterPlotVisualizer) {
-    if (this.scene) {
-      visualizer.setScene(this.scene);
-    }
-    visualizer.onResize(this.width, this.height);
-    visualizer.onPointPositionsChanged(this.worldSpacePointPositions);
-    this.visualizers.push(visualizer);
-  }
-
-  /** Removes all visualizers attached to this scatter plot. */
-  removeAllVisualizers() {
-    this.visualizers.forEach(v => v.dispose());
-    this.visualizers = [];
-  }
-
-  /** Update scatter plot with a new array of packed xyz point positions. */
-  setPointPositions(worldSpacePointPositions: Float32Array) {
-    this.worldSpacePointPositions = worldSpacePointPositions;
-    this.visualizers.forEach(
-        v => v.onPointPositionsChanged(worldSpacePointPositions));
-  }
-
-  render() {
-    {
-      const lightPos = this.camera.position.clone();
-      lightPos.x += 1;
-      lightPos.y += 1;
-      this.light.position.set(lightPos.x, lightPos.y, lightPos.z);
-    }
-
-    const cameraType = (this.camera instanceof THREE.PerspectiveCamera) ?
-        CameraType.Perspective :
-        CameraType.Orthographic;
-
-    let cameraSpacePointExtents: [number, number] = [0, 0];
-    if (this.worldSpacePointPositions != null) {
-      cameraSpacePointExtents = util.getNearFarPoints(
-          this.worldSpacePointPositions, this.camera.position,
-          this.orbitCameraControls.target);
-    }
-
-    const rc = new RenderContext(
-        this.camera, cameraType, this.orbitCameraControls.target, this.width,
-        this.height, cameraSpacePointExtents[0], cameraSpacePointExtents[1],
-        this.backgroundColor, this.pointColors, this.pointScaleFactors,
-        this.labels, this.polylineColors, this.polylineOpacities,
-        this.polylineWidths);
-
-    // Render first pass to picking target. This render fills pickingTexture
-    // with colors that are actually point ids, so that sampling the texture at
-    // the mouse's current x,y coordinates will reveal the data point that the
-    // mouse is over.
-    this.visualizers.forEach(v => v.onPickingRender(rc));
-
-    {
-      const axes = this.remove3dAxisFromScene();
-      this.renderer.render(this.scene, this.camera, this.pickingTexture);
-      if (axes != null) {
-        this.scene.add(axes);
-      }
-    }
-
-    // Render second pass to color buffer, to be displayed on the canvas.
-    this.visualizers.forEach(v => v.onRender(rc));
-
-    this.renderer.render(this.scene, this.camera);
-  }
-
-  setMouseMode(mouseMode: MouseMode) {
-    this.mouseMode = mouseMode;
-    if (mouseMode === MouseMode.AREA_SELECT) {
-      this.selecting = true;
-      this.container.style.cursor = 'crosshair';
-    } else {
-      this.selecting = false;
-      this.container.style.cursor = 'default';
-    }
-  }
-
-  /** Set the colors for every data point. (RGB triplets) */
-  setPointColors(colors: Float32Array) {
-    this.pointColors = colors;
-  }
-
-  /** Set the scale factors for every data point. (scalars) */
-  setPointScaleFactors(scaleFactors: Float32Array) {
-    this.pointScaleFactors = scaleFactors;
-  }
-
-  /** Set the labels to rendered */
-  setLabels(labels: LabelRenderParams) {
-    this.labels = labels;
-  }
-
-  /** Set the colors for every data polyline. (RGB triplets) */
-  setPolylineColors(colors: {[polylineIndex: number]: Float32Array}) {
-    this.polylineColors = colors;
-  }
-
-  setPolylineOpacities(opacities: Float32Array) {
-    this.polylineOpacities = opacities;
-  }
-
-  setPolylineWidths(widths: Float32Array) {
-    this.polylineWidths = widths;
-  }
-
-  getMouseMode(): MouseMode {
-    return this.mouseMode;
-  }
-
-  resetZoom() {
-    this.recreateCamera(this.makeDefaultCameraDef(this.dimensionality));
-    this.render();
-  }
-
-  setDayNightMode(isNight: boolean) {
-    const canvases = this.container.querySelectorAll('canvas');
-    const filterValue = isNight ? 'invert(100%)' : null;
-    for (let i = 0; i < canvases.length; i++) {
-      canvases[i].style.filter = filterValue;
-    }
-  }
-
-  resize(render = true) {
-    const [oldW, oldH] = [this.width, this.height];
-    const [newW, newH] = this.getLayoutValues();
-
-    if (this.dimensionality === 3) {
-      const camera = (this.camera as THREE.PerspectiveCamera);
-      camera.aspect = newW / newH;
-      camera.updateProjectionMatrix();
-    } else {
-      const camera = (this.camera as THREE.OrthographicCamera);
-      // Scale the ortho frustum by however much the window changed.
-      const scaleW = newW / oldW;
-      const scaleH = newH / oldH;
-      const newCamHalfWidth = ((camera.right - camera.left) * scaleW) / 2;
-      const newCamHalfHeight = ((camera.top - camera.bottom) * scaleH) / 2;
-      camera.top = newCamHalfHeight;
-      camera.bottom = -newCamHalfHeight;
-      camera.left = -newCamHalfWidth;
-      camera.right = newCamHalfWidth;
-      camera.updateProjectionMatrix();
-    }
-
-    // Accouting for retina displays.
-    const dpr = window.devicePixelRatio || 1;
-    this.renderer.setPixelRatio(dpr);
-    this.renderer.setSize(newW, newH);
-
-    // the picking texture needs to be exactly the same as the render texture.
-    {
-      const renderCanvasSize = this.renderer.getSize();
-      const pixelRatio = this.renderer.getPixelRatio();
-      this.pickingTexture = new THREE.WebGLRenderTarget(
-          renderCanvasSize.width * pixelRatio,
-          renderCanvasSize.height * pixelRatio);
-      this.pickingTexture.texture.minFilter = THREE.LinearFilter;
-    }
-
-    this.visualizers.forEach(v => v.onResize(newW, newH));
-
-    if (render) {
-      this.render();
-    };
-  }
-
-  onCameraMove(listener: OnCameraMoveListener) {
-    this.onCameraMoveListeners.push(listener);
-  }
-
-  clickOnPoint(pointIndex: number) {
-    this.nearestPoint = pointIndex;
-    this.onClick(null, false);
-  }
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlotRectangleSelector.ts b/tensorflow/tensorboard/components/vz_projector/scatterPlotRectangleSelector.ts
deleted file mode 100644
index a781877014edfcf40746c7bd72b4e8fc0cfd2e47..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/scatterPlotRectangleSelector.ts
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-const FILL = '#dddddd';
-const FILL_OPACITY = .2;
-const STROKE = '#aaaaaa';
-const STROKE_WIDTH = 2;
-const STROKE_DASHARRAY = '10 5';
-
-export interface BoundingBox {
-  // The bounding box (x, y) position refers to the bottom left corner of the
-  // rect.
-  x: number;
-  y: number;
-  width: number;
-  height: number;
-}
-
-/**
- * A class that manages and renders a data selection rectangle.
- */
-export class ScatterPlotRectangleSelector {
-  private svgElement: SVGElement;
-  private rectElement: SVGRectElement;
-
-  private isMouseDown: boolean;
-  private startCoordinates: [number, number];
-  private lastBoundingBox: BoundingBox;
-
-  private selectionCallback: (boundingBox: BoundingBox) => void;
-
-  /**
-   * @param container The container HTML element that the selection SVG rect
-   *     will be a child of.
-   * @param selectionCallback The callback that accepts a bounding box to be
-   *     called when selection changes. Currently, we only call the callback on
-   *     mouseUp.
-   */
-  constructor(
-      container: HTMLElement,
-      selectionCallback: (boundingBox: BoundingBox) => void) {
-    this.svgElement = container.querySelector('#selector') as SVGElement;
-    this.rectElement =
-        document.createElementNS('http://www.w3.org/2000/svg', 'rect');
-    this.rectElement.style.stroke = STROKE;
-    this.rectElement.style.strokeDasharray = STROKE_DASHARRAY;
-    this.rectElement.style.strokeWidth = '' + STROKE_WIDTH;
-    this.rectElement.style.fill = FILL;
-    this.rectElement.style.fillOpacity = '' + FILL_OPACITY;
-    this.svgElement.appendChild(this.rectElement);
-
-    this.selectionCallback = selectionCallback;
-    this.isMouseDown = false;
-  }
-
-  onMouseDown(offsetX: number, offsetY: number) {
-    this.isMouseDown = true;
-    this.rectElement.style.display = 'block';
-
-    this.startCoordinates = [offsetX, offsetY];
-    this.lastBoundingBox = {
-      x: this.startCoordinates[0],
-      y: this.startCoordinates[1],
-      width: 1,
-      height: 1
-    };
-  }
-
-  onMouseMove(offsetX: number, offsetY: number) {
-    if (!this.isMouseDown) {
-      return;
-    }
-
-    this.lastBoundingBox.x = Math.min(offsetX, this.startCoordinates[0]);
-    this.lastBoundingBox.y = Math.max(offsetY, this.startCoordinates[1]);
-    this.lastBoundingBox.width =
-        Math.max(offsetX, this.startCoordinates[0]) - this.lastBoundingBox.x;
-    this.lastBoundingBox.height =
-        this.lastBoundingBox.y - Math.min(offsetY, this.startCoordinates[1]);
-
-    this.rectElement.setAttribute('x', '' + this.lastBoundingBox.x);
-    this.rectElement.setAttribute(
-        'y', '' + (this.lastBoundingBox.y - this.lastBoundingBox.height));
-    this.rectElement.setAttribute('width', '' + this.lastBoundingBox.width);
-    this.rectElement.setAttribute('height', '' + this.lastBoundingBox.height);
-  }
-
-  onMouseUp() {
-    this.isMouseDown = false;
-    this.rectElement.style.display = 'none';
-    this.rectElement.setAttribute('width', '0');
-    this.rectElement.setAttribute('height', '0');
-    this.selectionCallback(this.lastBoundingBox);
-  }
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizer.ts b/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizer.ts
deleted file mode 100644
index b0974a205386c99aaef232e1c9c33dbf0e1c8481..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizer.ts
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {RenderContext} from './renderContext';
-
-/**
- * ScatterPlotVisualizer is an interface used by ScatterPlotContainer
- * to manage and aggregate any number of concurrent visualization behaviors.
- * To add a new visualization to the 3D scatter plot, create a new class that
- * implements this interface and attach it to the ScatterPlotContainer.
- */
-export interface ScatterPlotVisualizer {
-  /** Called to initialize the visualizer with the primary scene. */
-  setScene(scene: THREE.Scene);
-  /**
-   * Called when the main scatter plot tears down the visualizer. Remove all
-   * objects from the scene, and dispose any heavy resources.
-   */
-  dispose();
-  /**
-   * Called when the positions of the scatter plot points have changed.
-   */
-  onPointPositionsChanged(newWorldSpacePointPositions: Float32Array);
-  /**
-   * Called immediately before the main scatter plot performs a picking
-   * (selection) render. Set up render state for any geometry to use picking IDs
-   * instead of visual colors.
-   */
-  onPickingRender(renderContext: RenderContext);
-  /**
-   * Called immediately before the main scatter plot performs a color (visual)
-   * render. Set up render state, lights, etc here.
-   */
-  onRender(renderContext: RenderContext);
-  /**
-   * Called when the canvas size changes.
-   */
-  onResize(newWidth: number, newHeight: number);
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizer3DLabels.ts b/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizer3DLabels.ts
deleted file mode 100644
index 7820af0d48dd0e5876644d27ae6c3092134dbe62..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizer3DLabels.ts
+++ /dev/null
@@ -1,367 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {RenderContext} from './renderContext';
-import {ScatterPlotVisualizer} from './scatterPlotVisualizer';
-import * as util from './util';
-
-const FONT_SIZE = 80;
-const ONE_OVER_FONT_SIZE = 1 / FONT_SIZE;
-const LABEL_SCALE = 2.2;  // at 1:1 texel/pixel ratio
-const LABEL_COLOR = 'black';
-const LABEL_BACKGROUND = 'white';
-const MAX_CANVAS_DIMENSION = 8192;
-const NUM_GLYPHS = 256;
-const RGB_ELEMENTS_PER_ENTRY = 3;
-const XYZ_ELEMENTS_PER_ENTRY = 3;
-const UV_ELEMENTS_PER_ENTRY = 2;
-const VERTICES_PER_GLYPH = 2 * 3;  // 2 triangles, 3 verts per triangle
-
-/**
- * Each label is made up of triangles (two per letter.) Each vertex, then, is
- * the corner of one of these triangles (and thus the corner of a letter
- * rectangle.)
- * Each has the following attributes:
- *    posObj: The (x, y) position of the vertex within the label, where the
- *            bottom center of the word is positioned at (0, 0);
- *    position: The position of the label in worldspace.
- *    vUv: The (u, v) coordinates that index into the glyphs sheet (range 0, 1.)
- *    color: The color of the label (matches the corresponding point's color.)
- *    wordShown: Boolean. Whether or not the label is visible.
- */
-
-const VERTEX_SHADER = `
-    attribute vec2 posObj;
-    attribute vec3 color;
-    varying vec2 vUv;
-    varying vec3 vColor;
-
-    void main() {
-      vUv = uv;
-      vColor = color;
-
-      // Rotate label to face camera.
-
-      vec4 vRight = vec4(
-        modelViewMatrix[0][0], modelViewMatrix[1][0], modelViewMatrix[2][0], 0);
-
-      vec4 vUp = vec4(
-        modelViewMatrix[0][1], modelViewMatrix[1][1], modelViewMatrix[2][1], 0);
-
-      vec4 vAt = -vec4(
-        modelViewMatrix[0][2], modelViewMatrix[1][2], modelViewMatrix[2][2], 0);
-
-      mat4 pointToCamera = mat4(vRight, vUp, vAt, vec4(0, 0, 0, 1));
-
-      vec2 scaledPos = posObj * ${ONE_OVER_FONT_SIZE} * ${LABEL_SCALE};
-
-      vec4 posRotated = pointToCamera * vec4(scaledPos, 0, 1);
-      vec4 mvPosition = modelViewMatrix * (vec4(position, 0) + posRotated);
-      gl_Position = projectionMatrix * mvPosition;
-    }`;
-
-const FRAGMENT_SHADER = `
-    uniform sampler2D texture;
-    uniform bool picking;
-    varying vec2 vUv;
-    varying vec3 vColor;
-
-    void main() {
-      if (picking) {
-        gl_FragColor = vec4(vColor, 1.0);
-      } else {
-        vec4 fromTexture = texture2D(texture, vUv);
-        gl_FragColor = vec4(vColor, 1.0) * fromTexture;
-      }
-    }`;
-
-type GlyphTexture = {
-  texture: THREE.Texture; lengths: Float32Array; offsets: Float32Array;
-};
-
-/**
- * Renders the text labels as 3d geometry in the world.
- */
-export class ScatterPlotVisualizer3DLabels implements ScatterPlotVisualizer {
-  private scene: THREE.Scene;
-  private labelStrings: string[];
-  private geometry: THREE.BufferGeometry;
-  private worldSpacePointPositions: Float32Array;
-  private pickingColors: Float32Array;
-  private renderColors: Float32Array;
-  private material: THREE.ShaderMaterial;
-  private uniforms: Object;
-  private labelsMesh: THREE.Mesh;
-  private positions: THREE.BufferAttribute;
-  private totalVertexCount: number;
-  private labelVertexMap: number[][];
-  private glyphTexture: GlyphTexture;
-
-  private createGlyphTexture(): GlyphTexture {
-    let canvas = document.createElement('canvas');
-    canvas.width = MAX_CANVAS_DIMENSION;
-    canvas.height = FONT_SIZE;
-    let ctx = canvas.getContext('2d');
-    ctx.font = 'bold ' + FONT_SIZE * 0.75 + 'px roboto';
-    ctx.textBaseline = 'top';
-    ctx.fillStyle = LABEL_BACKGROUND;
-    ctx.rect(0, 0, canvas.width, canvas.height);
-    ctx.fill();
-    ctx.fillStyle = LABEL_COLOR;
-    let spaceOffset = ctx.measureText(' ').width;
-    // For each letter, store length, position at the encoded index.
-    let glyphLengths = new Float32Array(NUM_GLYPHS);
-    let glyphOffset = new Float32Array(NUM_GLYPHS);
-    let leftCoord = 0;
-    for (let i = 0; i < NUM_GLYPHS; i++) {
-      let text = ' ' + String.fromCharCode(i);
-      let textLength = ctx.measureText(text).width;
-      glyphLengths[i] = textLength - spaceOffset;
-      glyphOffset[i] = leftCoord;
-      ctx.fillText(text, leftCoord - spaceOffset, 0);
-      leftCoord += textLength;
-    }
-    const tex = util.createTexture(canvas);
-    return {texture: tex, lengths: glyphLengths, offsets: glyphOffset};
-  }
-
-  private processLabelVerts(pointCount: number) {
-    let numTotalLetters = 0;
-    this.labelVertexMap = [];
-    for (let i = 0; i < pointCount; i++) {
-      const label = this.labelStrings[i];
-      let vertsArray: number[] = [];
-      for (let j = 0; j < label.length; j++) {
-        for (let k = 0; k < VERTICES_PER_GLYPH; k++) {
-          vertsArray.push(numTotalLetters * VERTICES_PER_GLYPH + k);
-        }
-        numTotalLetters++;
-      }
-      this.labelVertexMap.push(vertsArray);
-    }
-    this.totalVertexCount = numTotalLetters * VERTICES_PER_GLYPH;
-  }
-
-  private createColorBuffers(pointCount: number) {
-    this.pickingColors =
-        new Float32Array(this.totalVertexCount * RGB_ELEMENTS_PER_ENTRY);
-    this.renderColors =
-        new Float32Array(this.totalVertexCount * RGB_ELEMENTS_PER_ENTRY);
-    for (let i = 0; i < pointCount; i++) {
-      let color = new THREE.Color(i);
-      this.labelVertexMap[i].forEach((j) => {
-        this.pickingColors[RGB_ELEMENTS_PER_ENTRY * j] = color.r;
-        this.pickingColors[RGB_ELEMENTS_PER_ENTRY * j + 1] = color.g;
-        this.pickingColors[RGB_ELEMENTS_PER_ENTRY * j + 2] = color.b;
-        this.renderColors[RGB_ELEMENTS_PER_ENTRY * j] = 1.0;
-        this.renderColors[RGB_ELEMENTS_PER_ENTRY * j + 1] = 1.0;
-        this.renderColors[RGB_ELEMENTS_PER_ENTRY * j + 2] = 1.0;
-      });
-    }
-  }
-
-  private createLabels() {
-    if ((this.labelStrings == null) ||
-        (this.worldSpacePointPositions == null)) {
-      return;
-    }
-    const pointCount =
-        this.worldSpacePointPositions.length / XYZ_ELEMENTS_PER_ENTRY;
-    if (pointCount !== this.labelStrings.length) {
-      return;
-    }
-    this.glyphTexture = this.createGlyphTexture();
-
-    this.uniforms = {
-      texture: {type: 't'},
-      picking: {type: 'bool'},
-    };
-
-    this.material = new THREE.ShaderMaterial({
-      uniforms: this.uniforms,
-      transparent: true,
-      vertexShader: VERTEX_SHADER,
-      fragmentShader: FRAGMENT_SHADER,
-    });
-
-    this.processLabelVerts(pointCount);
-    this.createColorBuffers(pointCount);
-
-    let positionArray =
-        new Float32Array(this.totalVertexCount * XYZ_ELEMENTS_PER_ENTRY);
-    this.positions =
-        new THREE.BufferAttribute(positionArray, XYZ_ELEMENTS_PER_ENTRY);
-
-    let posArray =
-        new Float32Array(this.totalVertexCount * XYZ_ELEMENTS_PER_ENTRY);
-    let uvArray =
-        new Float32Array(this.totalVertexCount * UV_ELEMENTS_PER_ENTRY);
-    let colorsArray =
-        new Float32Array(this.totalVertexCount * RGB_ELEMENTS_PER_ENTRY);
-    let positionObject = new THREE.BufferAttribute(posArray, 2);
-    let uv = new THREE.BufferAttribute(uvArray, UV_ELEMENTS_PER_ENTRY);
-    let colors = new THREE.BufferAttribute(colorsArray, RGB_ELEMENTS_PER_ENTRY);
-
-    this.geometry = new THREE.BufferGeometry();
-    this.geometry.addAttribute('posObj', positionObject);
-    this.geometry.addAttribute('position', this.positions);
-    this.geometry.addAttribute('uv', uv);
-    this.geometry.addAttribute('color', colors);
-
-    let lettersSoFar = 0;
-    for (let i = 0; i < pointCount; i++) {
-      const label = this.labelStrings[i];
-      let leftOffset = 0;
-      // Determine length of word in pixels.
-      for (let j = 0; j < label.length; j++) {
-        let letterCode = label.charCodeAt(j);
-        leftOffset += this.glyphTexture.lengths[letterCode];
-      }
-      leftOffset /= -2;  // centers text horizontally around the origin
-      for (let j = 0; j < label.length; j++) {
-        let letterCode = label.charCodeAt(j);
-        let letterWidth = this.glyphTexture.lengths[letterCode];
-        let scale = FONT_SIZE;
-        let right = (leftOffset + letterWidth) / scale;
-        let left = (leftOffset) / scale;
-        let top = FONT_SIZE / scale;
-
-        // First triangle
-        positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 0, left, 0);
-        positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 1, right, 0);
-        positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 2, left, top);
-
-        // Second triangle
-        positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 3, left, top);
-        positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 4, right, 0);
-        positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 5, right, top);
-
-        // Set UVs based on letter.
-        let uLeft = (this.glyphTexture.offsets[letterCode]);
-        let uRight = (this.glyphTexture.offsets[letterCode] + letterWidth);
-        // Scale so that uvs lie between 0 and 1 on the texture.
-        uLeft /= MAX_CANVAS_DIMENSION;
-        uRight /= MAX_CANVAS_DIMENSION;
-        let vTop = 1;
-        let vBottom = 0;
-        uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 0, uLeft, vTop);
-        uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 1, uRight, vTop);
-        uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 2, uLeft, vBottom);
-        uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 3, uLeft, vBottom);
-        uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 4, uRight, vTop);
-        uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 5, uRight, vBottom);
-
-        lettersSoFar++;
-        leftOffset += letterWidth;
-      }
-    }
-
-    for (let i = 0; i < pointCount; i++) {
-      const p = util.vector3FromPackedArray(this.worldSpacePointPositions, i);
-      this.labelVertexMap[i].forEach((j) => {
-        this.positions.setXYZ(j, p.x, p.y, p.z);
-      });
-    };
-
-    this.labelsMesh = new THREE.Mesh(this.geometry, this.material);
-    this.labelsMesh.frustumCulled = false;
-    this.scene.add(this.labelsMesh);
-  }
-
-  private colorLabels(pointColors: Float32Array) {
-    if (this.labelStrings == null || this.geometry == null ||
-        pointColors == null) {
-      return;
-    }
-
-    const colors = this.geometry.getAttribute('color') as THREE.BufferAttribute;
-    colors.array = this.renderColors;
-
-    const n = pointColors.length / XYZ_ELEMENTS_PER_ENTRY;
-    let src = 0;
-    for (let i = 0; i < n; ++i) {
-      const c = new THREE.Color(
-          pointColors[src], pointColors[src + 1], pointColors[src + 2]);
-      const m = this.labelVertexMap[i].length;
-      for (let j = 0; j < m; ++j) {
-        colors.setXYZ(this.labelVertexMap[i][j], c.r, c.g, c.b);
-      }
-      src += RGB_ELEMENTS_PER_ENTRY;
-    }
-    colors.needsUpdate = true;
-  }
-
-  setScene(scene: THREE.Scene) {
-    this.scene = scene;
-  }
-
-  dispose() {
-    if (this.labelsMesh) {
-      if (this.scene) {
-        this.scene.remove(this.labelsMesh);
-      }
-      this.labelsMesh = null;
-    }
-    if (this.geometry) {
-      this.geometry.dispose();
-      this.geometry = null;
-    }
-    if ((this.glyphTexture != null) && (this.glyphTexture.texture != null)) {
-      this.glyphTexture.texture.dispose();
-      this.glyphTexture.texture = null;
-    }
-  }
-
-  onPickingRender(rc: RenderContext) {
-    if (this.geometry == null) {
-      this.createLabels();
-    }
-    if (this.geometry == null) {
-      return;
-    }
-    this.material.uniforms.texture.value = this.glyphTexture.texture;
-    this.material.uniforms.picking.value = true;
-    const colors = this.geometry.getAttribute('color') as THREE.BufferAttribute;
-    colors.array = this.pickingColors;
-    colors.needsUpdate = true;
-  }
-
-  onRender(rc: RenderContext) {
-    if (this.geometry == null) {
-      this.createLabels();
-    }
-    if (this.geometry == null) {
-      return;
-    }
-    this.colorLabels(rc.pointColors);
-    this.material.uniforms.texture.value = this.glyphTexture.texture;
-    this.material.uniforms.picking.value = false;
-    const colors = this.geometry.getAttribute('color') as THREE.BufferAttribute;
-    colors.array = this.renderColors;
-    colors.needsUpdate = true;
-  }
-
-  onPointPositionsChanged(newPositions: Float32Array) {
-    this.worldSpacePointPositions = newPositions;
-    this.dispose();
-  }
-
-  setLabelStrings(labelStrings: string[]) {
-    this.labelStrings = labelStrings;
-    this.dispose();
-  }
-
-  onResize(newWidth: number, newHeight: number) {}
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerCanvasLabels.ts b/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerCanvasLabels.ts
deleted file mode 100644
index 2f3146d213c1269737fb56f332de12f384b1d949..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerCanvasLabels.ts
+++ /dev/null
@@ -1,186 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {BoundingBox, CollisionGrid} from './label';
-import {CameraType, RenderContext} from './renderContext';
-import {ScatterPlotVisualizer} from './scatterPlotVisualizer';
-import * as util from './util';
-
-const MAX_LABELS_ON_SCREEN = 10000;
-const LABEL_STROKE_WIDTH = 3;
-const LABEL_FILL_WIDTH = 6;
-
-/**
- * Creates and maintains a 2d canvas on top of the GL canvas. All labels, when
- * active, are rendered to the 2d canvas as part of the visible render pass.
- */
-export class ScatterPlotVisualizerCanvasLabels implements
-    ScatterPlotVisualizer {
-  private worldSpacePointPositions: Float32Array;
-  private gc: CanvasRenderingContext2D;
-  private canvas: HTMLCanvasElement;
-  private labelsActive: boolean = true;
-
-  constructor(container: HTMLElement) {
-    this.canvas = document.createElement('canvas');
-    container.appendChild(this.canvas);
-
-    this.gc = this.canvas.getContext('2d');
-    this.canvas.style.position = 'absolute';
-    this.canvas.style.left = '0';
-    this.canvas.style.top = '0';
-    this.canvas.style.pointerEvents = 'none';
-  }
-
-  private removeAllLabels() {
-    const pixelWidth = this.canvas.width * window.devicePixelRatio;
-    const pixelHeight = this.canvas.height * window.devicePixelRatio;
-    this.gc.clearRect(0, 0, pixelWidth, pixelHeight);
-  }
-
-  /** Render all of the non-overlapping visible labels to the canvas. */
-  private makeLabels(rc: RenderContext) {
-    if ((rc.labels == null) || (rc.labels.pointIndices.length === 0)) {
-      return;
-    }
-    if (this.worldSpacePointPositions == null) {
-      return;
-    }
-
-    const lrc = rc.labels;
-    const sceneIs3D: boolean = (rc.cameraType === CameraType.Perspective);
-    const labelHeight = parseInt(this.gc.font, 10);
-    const dpr = window.devicePixelRatio;
-
-    let grid: CollisionGrid;
-    {
-      const pixw = this.canvas.width * dpr;
-      const pixh = this.canvas.height * dpr;
-      const bb: BoundingBox = {loX: 0, hiX: pixw, loY: 0, hiY: pixh};
-      grid = new CollisionGrid(bb, pixw / 25, pixh / 50);
-    }
-
-    let opacityMap =
-        d3.scalePow()
-            .exponent(Math.E)
-            .domain([rc.farthestCameraSpacePointZ, rc.nearestCameraSpacePointZ])
-            .range([0.1, 1]);
-
-    const camPos = rc.camera.position;
-    const camToTarget = camPos.clone().sub(rc.cameraTarget);
-    let camToPoint = new THREE.Vector3();
-
-    this.gc.textBaseline = 'middle';
-    this.gc.miterLimit = 2;
-
-    // Have extra space between neighboring labels. Don't pack too tightly.
-    const labelMargin = 2;
-    // Shift the label to the right of the point circle.
-    const xShift = 4;
-
-    const n = Math.min(MAX_LABELS_ON_SCREEN, lrc.pointIndices.length);
-    for (let i = 0; i < n; ++i) {
-      let point: THREE.Vector3;
-      {
-        const pi = lrc.pointIndices[i];
-        point = util.vector3FromPackedArray(this.worldSpacePointPositions, pi);
-      }
-
-      // discard points that are behind the camera
-      camToPoint.copy(camPos).sub(point);
-      if (camToTarget.dot(camToPoint) < 0) {
-        continue;
-      }
-
-      let [x, y] = util.vector3DToScreenCoords(
-          rc.camera, rc.screenWidth, rc.screenHeight, point);
-      x += xShift;
-
-      // Computing the width of the font is expensive,
-      // so we assume width of 1 at first. Then, if the label doesn't
-      // conflict with other labels, we measure the actual width.
-      const textBoundingBox: BoundingBox = {
-        loX: x - labelMargin,
-        hiX: x + 1 + labelMargin,
-        loY: y - labelHeight / 2 - labelMargin,
-        hiY: y + labelHeight / 2 + labelMargin
-      };
-
-      if (grid.insert(textBoundingBox, true)) {
-        const text = lrc.labelStrings[i];
-        const fontSize = lrc.defaultFontSize * lrc.scaleFactors[i] * dpr;
-        this.gc.font = fontSize + 'px roboto';
-
-        // Now, check with properly computed width.
-        textBoundingBox.hiX += this.gc.measureText(text).width - 1;
-        if (grid.insert(textBoundingBox)) {
-          let opacity = 1;
-          if (sceneIs3D && (lrc.useSceneOpacityFlags[i] === 1)) {
-            opacity = opacityMap(camToPoint.length());
-          }
-          this.gc.fillStyle =
-              this.styleStringFromPackedRgba(lrc.fillColors, i, opacity);
-          this.gc.strokeStyle =
-              this.styleStringFromPackedRgba(lrc.strokeColors, i, opacity);
-          this.gc.lineWidth = LABEL_STROKE_WIDTH;
-          this.gc.strokeText(text, x, y);
-          this.gc.lineWidth = LABEL_FILL_WIDTH;
-          this.gc.fillText(text, x, y);
-        }
-      }
-    }
-  }
-
-  private styleStringFromPackedRgba(
-      packedRgbaArray: Uint8Array, colorIndex: number,
-      opacity: number): string {
-    const offset = colorIndex * 3;
-    const r = packedRgbaArray[offset];
-    const g = packedRgbaArray[offset + 1];
-    const b = packedRgbaArray[offset + 2];
-    return 'rgba(' + r + ',' + g + ',' + b + ',' + opacity + ')';
-  }
-
-  onResize(newWidth: number, newHeight: number) {
-    let dpr = window.devicePixelRatio;
-    this.canvas.width = newWidth * dpr;
-    this.canvas.height = newHeight * dpr;
-    this.canvas.style.width = newWidth + 'px';
-    this.canvas.style.height = newHeight + 'px';
-  }
-
-  dispose() {
-    this.removeAllLabels();
-    this.canvas = null;
-    this.gc = null;
-  }
-
-  onPointPositionsChanged(newPositions: Float32Array) {
-    this.worldSpacePointPositions = newPositions;
-    this.removeAllLabels();
-  }
-
-  onRender(rc: RenderContext) {
-    if (!this.labelsActive) {
-      return;
-    }
-
-    this.removeAllLabels();
-    this.makeLabels(rc);
-  }
-
-  setScene(scene: THREE.Scene) {}
-  onPickingRender(renderContext: RenderContext) {}
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerPolylines.ts b/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerPolylines.ts
deleted file mode 100644
index e6d4aeda28b9cd9288011a71967d67a79ffc0c42..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerPolylines.ts
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {DataSet} from './data';
-import {RenderContext} from './renderContext';
-import {ScatterPlotVisualizer} from './scatterPlotVisualizer';
-import * as util from './util';
-
-const RGB_NUM_ELEMENTS = 3;
-const XYZ_NUM_ELEMENTS = 3;
-
-/**
- * Renders polylines that connect multiple points in the dataset.
- */
-export class ScatterPlotVisualizerPolylines implements ScatterPlotVisualizer {
-  private dataSet: DataSet;
-  private scene: THREE.Scene;
-  private polylines: THREE.Line[];
-  private polylinePositionBuffer:
-      {[polylineIndex: number]: THREE.BufferAttribute} = {};
-  private polylineColorBuffer:
-      {[polylineIndex: number]: THREE.BufferAttribute} = {};
-
-  private updateSequenceIndicesInDataSet(ds: DataSet) {
-    for (let i = 0; i < ds.sequences.length; i++) {
-      const sequence = ds.sequences[i];
-      for (let j = 0; j < sequence.pointIndices.length - 1; j++) {
-        ds.points[sequence.pointIndices[j]].sequenceIndex = i;
-        ds.points[sequence.pointIndices[j + 1]].sequenceIndex = i;
-      }
-    }
-  }
-
-  private createPolylines(scene: THREE.Scene) {
-    if (!this.dataSet || !this.dataSet.sequences) {
-      return;
-    }
-
-    this.updateSequenceIndicesInDataSet(this.dataSet);
-    this.polylines = [];
-
-    for (let i = 0; i < this.dataSet.sequences.length; i++) {
-      const geometry = new THREE.BufferGeometry();
-      geometry.addAttribute('position', this.polylinePositionBuffer[i]);
-      geometry.addAttribute('color', this.polylineColorBuffer[i]);
-
-      const material = new THREE.LineBasicMaterial({
-        linewidth: 1,  // unused default, overwritten by width array.
-        opacity: 1.0,  // unused default, overwritten by opacity array.
-        transparent: true,
-        vertexColors: THREE.VertexColors
-      });
-
-      const polyline = new THREE.LineSegments(geometry, material);
-      polyline.frustumCulled = false;
-      this.polylines.push(polyline);
-      scene.add(polyline);
-    }
-  }
-
-  dispose() {
-    if (this.polylines == null) {
-      return;
-    }
-    for (let i = 0; i < this.polylines.length; i++) {
-      this.scene.remove(this.polylines[i]);
-      this.polylines[i].geometry.dispose();
-    }
-    this.polylines = null;
-    this.polylinePositionBuffer = {};
-    this.polylineColorBuffer = {};
-  }
-
-  setScene(scene: THREE.Scene) {
-    this.scene = scene;
-  }
-
-  setDataSet(dataSet: DataSet) {
-    this.dataSet = dataSet;
-  }
-
-  onPointPositionsChanged(newPositions: Float32Array) {
-    if ((newPositions == null) || (this.polylines != null)) {
-      this.dispose();
-    }
-    if ((newPositions == null) || (this.dataSet == null)) {
-      return;
-    }
-    // Set up the position buffer arrays for each polyline.
-    for (let i = 0; i < this.dataSet.sequences.length; i++) {
-      let sequence = this.dataSet.sequences[i];
-      const vertexCount = 2 * (sequence.pointIndices.length - 1);
-
-      let polylines = new Float32Array(vertexCount * XYZ_NUM_ELEMENTS);
-      this.polylinePositionBuffer[i] =
-          new THREE.BufferAttribute(polylines, XYZ_NUM_ELEMENTS);
-
-      let colors = new Float32Array(vertexCount * RGB_NUM_ELEMENTS);
-      this.polylineColorBuffer[i] =
-          new THREE.BufferAttribute(colors, RGB_NUM_ELEMENTS);
-    }
-    for (let i = 0; i < this.dataSet.sequences.length; i++) {
-      const sequence = this.dataSet.sequences[i];
-      let src = 0;
-      for (let j = 0; j < sequence.pointIndices.length - 1; j++) {
-        const p1Index = sequence.pointIndices[j];
-        const p2Index = sequence.pointIndices[j + 1];
-        const p1 = util.vector3FromPackedArray(newPositions, p1Index);
-        const p2 = util.vector3FromPackedArray(newPositions, p2Index);
-        this.polylinePositionBuffer[i].setXYZ(src, p1.x, p1.y, p1.z);
-        this.polylinePositionBuffer[i].setXYZ(src + 1, p2.x, p2.y, p2.z);
-        src += 2;
-      }
-      this.polylinePositionBuffer[i].needsUpdate = true;
-    }
-
-    if (this.polylines == null) {
-      this.createPolylines(this.scene);
-    }
-  }
-
-  onRender(renderContext: RenderContext) {
-    if (this.polylines == null) {
-      return;
-    }
-    for (let i = 0; i < this.polylines.length; i++) {
-      this.polylines[i].material.opacity = renderContext.polylineOpacities[i];
-      (this.polylines[i].material as THREE.LineBasicMaterial).linewidth =
-          renderContext.polylineWidths[i];
-      this.polylineColorBuffer[i].array = renderContext.polylineColors[i];
-      this.polylineColorBuffer[i].needsUpdate = true;
-    }
-  }
-
-  onPickingRender(renderContext: RenderContext) {}
-  onResize(newWidth: number, newHeight: number) {}
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerSprites.ts b/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerSprites.ts
deleted file mode 100644
index be9c1703c727f11381d7836de86dad1c1c294cc0..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerSprites.ts
+++ /dev/null
@@ -1,435 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {CameraType, RenderContext} from './renderContext';
-import {ScatterPlotVisualizer} from './scatterPlotVisualizer';
-import * as util from './util';
-
-const NUM_POINTS_FOG_THRESHOLD = 5000;
-const MIN_POINT_SIZE = 5.0;
-const IMAGE_SIZE = 30;
-
-// Constants relating to the indices of buffer arrays.
-const RGB_NUM_ELEMENTS = 3;
-const INDEX_NUM_ELEMENTS = 1;
-const XYZ_NUM_ELEMENTS = 3;
-
-const VERTEX_SHADER = `
-  // Index of the specific vertex (passed in as bufferAttribute), and the
-  // variable that will be used to pass it to the fragment shader.
-  attribute float spriteIndex;
-  attribute vec3 color;
-  attribute float scaleFactor;
-
-  varying vec2 xyIndex;
-  varying vec3 vColor;
-
-  uniform bool sizeAttenuation;
-  uniform float pointSize;
-  uniform float spritesPerRow;
-  uniform float spritesPerColumn;
-
-  void main() {
-    // Pass index and color values to fragment shader.
-    vColor = color;
-    xyIndex = vec2(mod(spriteIndex, spritesPerRow),
-              floor(spriteIndex / spritesPerColumn));
-
-    // Transform current vertex by modelViewMatrix (model world position and
-    // camera world position matrix).
-    vec4 cameraSpacePos = modelViewMatrix * vec4(position, 1.0);
-
-    // Project vertex in camera-space to screen coordinates using the camera's
-    // projection matrix.
-    gl_Position = projectionMatrix * cameraSpacePos;
-
-    // Create size attenuation (if we're in 3D mode) by making the size of
-    // each point inversly proportional to its distance to the camera.
-    float outputPointSize = pointSize;
-    if (sizeAttenuation) {
-      outputPointSize = -pointSize / cameraSpacePos.z;
-    }
-
-    gl_PointSize =
-      max(outputPointSize * scaleFactor, ${MIN_POINT_SIZE.toFixed(1)});
-  }`;
-
-const FRAGMENT_SHADER_POINT_TEST_CHUNK = `
-  bool point_in_unit_circle(vec2 spriteCoord) {
-    vec2 centerToP = spriteCoord - vec2(0.5, 0.5);
-    return dot(centerToP, centerToP) < (0.5 * 0.5);
-  }
-
-  bool point_in_unit_equilateral_triangle(vec2 spriteCoord) {
-    vec3 v0 = vec3(0, 1, 0);
-    vec3 v1 = vec3(0.5, 0, 0);
-    vec3 v2 = vec3(1, 1, 0);
-    vec3 p = vec3(spriteCoord, 0);
-    float p_in_v0_v1 = cross(v1 - v0, p - v0).z;
-    float p_in_v1_v2 = cross(v2 - v1, p - v1).z;
-    return (p_in_v0_v1 > 0.0) && (p_in_v1_v2 > 0.0);
-  }
-
-  bool point_in_unit_square(vec2 spriteCoord) {
-    return true;
-  }
-`;
-
-const FRAGMENT_SHADER = `
-  varying vec2 xyIndex;
-  varying vec3 vColor;
-
-  uniform sampler2D texture;
-  uniform float spritesPerRow;
-  uniform float spritesPerColumn;
-  uniform bool isImage;
-
-  ${THREE.ShaderChunk['common']}
-  ${THREE.ShaderChunk['fog_pars_fragment']}
-  ${FRAGMENT_SHADER_POINT_TEST_CHUNK}
-
-  void main() {
-    if (isImage) {
-      // Coordinates of the vertex within the entire sprite image.
-      vec2 coords =
-        (gl_PointCoord + xyIndex) / vec2(spritesPerRow, spritesPerColumn);
-      gl_FragColor = vec4(vColor, 1.0) * texture2D(texture, coords);
-    } else {
-      bool inside = point_in_unit_circle(gl_PointCoord);
-      if (!inside) {
-        discard;
-      }
-      gl_FragColor = vec4(vColor, 1);
-    }
-    ${THREE.ShaderChunk['fog_fragment']}
-  }`;
-
-const FRAGMENT_SHADER_PICKING = `
-  varying vec2 xyIndex;
-  varying vec3 vColor;
-  uniform bool isImage;
-
-  ${FRAGMENT_SHADER_POINT_TEST_CHUNK}
-
-  void main() {
-    xyIndex; // Silence 'unused variable' warning.
-    if (isImage) {
-      gl_FragColor = vec4(vColor, 1);
-    } else {
-      bool inside = point_in_unit_circle(gl_PointCoord);
-      if (!inside) {
-        discard;
-      }
-      gl_FragColor = vec4(vColor, 1);
-    }
-  }`;
-
-/**
- * Uses GL point sprites to render the dataset.
- */
-export class ScatterPlotVisualizerSprites implements ScatterPlotVisualizer {
-  private scene: THREE.Scene;
-  private fog: THREE.Fog;
-  private texture: THREE.Texture = null;
-  private standinTextureForPoints: THREE.Texture;
-  private spritesPerRow: number;
-  private spritesPerColumn: number;
-  private spriteDimensions: [number, number];
-  private spriteIndexBufferAttribute: THREE.BufferAttribute;
-  private renderMaterial: THREE.ShaderMaterial;
-  private pickingMaterial: THREE.ShaderMaterial;
-
-  private points: THREE.Points;
-  private worldSpacePointPositions: Float32Array;
-  private pickingColors: Float32Array;
-  private renderColors: Float32Array;
-
-  constructor() {
-    this.standinTextureForPoints =
-        util.createTexture(document.createElement('canvas'));
-    this.renderMaterial = this.createRenderMaterial(false);
-    this.pickingMaterial = this.createPickingMaterial(false);
-  }
-
-  private createTextureFromSpriteAtlas(
-      spriteAtlas: HTMLImageElement, spriteDimensions: [number, number],
-      spriteIndices: Float32Array) {
-    this.texture = util.createTexture(spriteAtlas);
-    this.spritesPerRow = spriteAtlas.width / spriteDimensions[0];
-    this.spritesPerColumn = spriteAtlas.height / spriteDimensions[1];
-    this.spriteDimensions = spriteDimensions;
-    this.spriteIndexBufferAttribute =
-        new THREE.BufferAttribute(spriteIndices, INDEX_NUM_ELEMENTS);
-
-    if (this.points != null) {
-      (this.points.geometry as THREE.BufferGeometry)
-          .addAttribute('spriteIndex', this.spriteIndexBufferAttribute);
-    }
-  }
-
-  private createUniforms(): any {
-    return {
-      texture: {type: 't'},
-      spritesPerRow: {type: 'f'},
-      spritesPerColumn: {type: 'f'},
-      fogColor: {type: 'c'},
-      fogNear: {type: 'f'},
-      fogFar: {type: 'f'},
-      isImage: {type: 'bool'},
-      sizeAttenuation: {type: 'bool'},
-      pointSize: {type: 'f'}
-    };
-  }
-
-  private createRenderMaterial(haveImage: boolean): THREE.ShaderMaterial {
-    const uniforms = this.createUniforms();
-    return new THREE.ShaderMaterial({
-      uniforms: uniforms,
-      vertexShader: VERTEX_SHADER,
-      fragmentShader: FRAGMENT_SHADER,
-      transparent: !haveImage,
-      depthTest: haveImage,
-      depthWrite: haveImage,
-      fog: true,
-      blending: THREE.MultiplyBlending,
-    });
-  }
-
-  private createPickingMaterial(haveImage: boolean): THREE.ShaderMaterial {
-    const uniforms = this.createUniforms();
-    return new THREE.ShaderMaterial({
-      uniforms: uniforms,
-      vertexShader: VERTEX_SHADER,
-      fragmentShader: FRAGMENT_SHADER_PICKING,
-      transparent: true,
-      depthTest: true,
-      depthWrite: true,
-      fog: false,
-      blending: THREE.NormalBlending,
-    });
-  }
-
-  /**
-   * Create points, set their locations and actually instantiate the
-   * geometry.
-   */
-  private createPointSprites(scene: THREE.Scene, positions: Float32Array) {
-    const pointCount =
-        (positions != null) ? (positions.length / XYZ_NUM_ELEMENTS) : 0;
-    const geometry = this.createGeometry(pointCount);
-
-    this.fog = new THREE.Fog(0xFFFFFF);  // unused value, gets overwritten.
-
-    this.points = new THREE.Points(geometry, this.renderMaterial);
-    this.points.frustumCulled = false;
-    if (this.spriteIndexBufferAttribute != null) {
-      (this.points.geometry as THREE.BufferGeometry)
-          .addAttribute('spriteIndex', this.spriteIndexBufferAttribute);
-    }
-    scene.add(this.points);
-  }
-
-  private calculatePointSize(sceneIs3D: boolean): number {
-    if (this.texture != null) {
-      return sceneIs3D ? IMAGE_SIZE : this.spriteDimensions[0];
-    }
-    const n = (this.worldSpacePointPositions != null) ?
-        (this.worldSpacePointPositions.length / XYZ_NUM_ELEMENTS) :
-        1;
-    const SCALE = 200;
-    const LOG_BASE = 8;
-    const DIVISOR = 1.5;
-    // Scale point size inverse-logarithmically to the number of points.
-    const pointSize = SCALE / Math.log(n) / Math.log(LOG_BASE);
-    return sceneIs3D ? pointSize : (pointSize / DIVISOR);
-  }
-
-  /**
-   * Set up buffer attributes to be used for the points/images.
-   */
-  private createGeometry(pointCount: number): THREE.BufferGeometry {
-    const n = pointCount;
-
-    // Fill pickingColors with each point's unique id as its color.
-    this.pickingColors = new Float32Array(n * RGB_NUM_ELEMENTS);
-    {
-      let dst = 0;
-      for (let i = 0; i < n; i++) {
-        const c = new THREE.Color(i);
-        this.pickingColors[dst++] = c.r;
-        this.pickingColors[dst++] = c.g;
-        this.pickingColors[dst++] = c.b;
-      }
-    }
-
-    const geometry = new THREE.BufferGeometry();
-    geometry.addAttribute(
-        'position', new THREE.BufferAttribute(null, XYZ_NUM_ELEMENTS));
-    geometry.addAttribute(
-        'color', new THREE.BufferAttribute(null, RGB_NUM_ELEMENTS));
-    geometry.addAttribute(
-        'scaleFactor', new THREE.BufferAttribute(null, INDEX_NUM_ELEMENTS));
-    return geometry;
-  }
-
-  private setFogDistances(
-      sceneIs3D: boolean, nearestPointZ: number, farthestPointZ: number) {
-    if (sceneIs3D) {
-      const n = this.worldSpacePointPositions.length / XYZ_NUM_ELEMENTS;
-      this.fog.near = nearestPointZ;
-      // If there are fewer points we want less fog. We do this
-      // by making the "far" value (that is, the distance from the camera to the
-      // far edge of the fog) proportional to the number of points.
-      let multiplier =
-          2 - Math.min(n, NUM_POINTS_FOG_THRESHOLD) / NUM_POINTS_FOG_THRESHOLD;
-      this.fog.far = farthestPointZ * multiplier;
-    } else {
-      this.fog.near = Infinity;
-      this.fog.far = Infinity;
-    }
-  }
-
-  dispose() {
-    this.disposeGeometry();
-    this.disposeTextureAtlas();
-  }
-
-  private disposeGeometry() {
-    if (this.points != null) {
-      this.scene.remove(this.points);
-      this.points.geometry.dispose();
-      this.points = null;
-      this.worldSpacePointPositions = null;
-    }
-  }
-
-  private disposeTextureAtlas() {
-    if (this.texture != null) {
-      this.texture.dispose();
-    }
-    this.texture = null;
-    this.renderMaterial = null;
-    this.pickingMaterial = null;
-  }
-
-  setScene(scene: THREE.Scene) {
-    this.scene = scene;
-  }
-
-  setSpriteAtlas(
-      spriteImage: HTMLImageElement, spriteDimensions: [number, number],
-      spriteIndices: Float32Array) {
-    this.disposeTextureAtlas();
-    this.createTextureFromSpriteAtlas(
-        spriteImage, spriteDimensions, spriteIndices);
-    this.renderMaterial = this.createRenderMaterial(true);
-    this.pickingMaterial = this.createPickingMaterial(true);
-  }
-
-  clearSpriteAtlas() {
-    this.disposeTextureAtlas();
-    this.renderMaterial = this.createRenderMaterial(false);
-    this.pickingMaterial = this.createPickingMaterial(false);
-  }
-
-  onPointPositionsChanged(newPositions: Float32Array) {
-    if ((newPositions == null) || (newPositions.length === 0)) {
-      this.dispose();
-      return;
-    }
-    if (this.points != null) {
-      if (this.worldSpacePointPositions.length !== newPositions.length) {
-        this.disposeGeometry();
-      }
-    }
-
-    this.worldSpacePointPositions = newPositions;
-
-    if (this.points == null) {
-      this.createPointSprites(this.scene, newPositions);
-    }
-
-    const positions = (this.points.geometry as THREE.BufferGeometry)
-                          .getAttribute('position') as THREE.BufferAttribute;
-    positions.array = newPositions;
-    positions.needsUpdate = true;
-  }
-
-  onPickingRender(rc: RenderContext) {
-    if (this.points == null) {
-      return;
-    }
-
-    const sceneIs3D: boolean = (rc.cameraType === CameraType.Perspective);
-
-    this.pickingMaterial.uniforms.spritesPerRow.value = this.spritesPerRow;
-    this.pickingMaterial.uniforms.spritesPerRow.value = this.spritesPerColumn;
-    this.pickingMaterial.uniforms.sizeAttenuation.value = sceneIs3D;
-    this.pickingMaterial.uniforms.pointSize.value =
-        this.calculatePointSize(sceneIs3D);
-    this.points.material = this.pickingMaterial;
-
-    let colors = (this.points.geometry as THREE.BufferGeometry)
-                     .getAttribute('color') as THREE.BufferAttribute;
-    colors.array = this.pickingColors;
-    colors.needsUpdate = true;
-
-    let scaleFactors =
-        (this.points.geometry as THREE.BufferGeometry)
-            .getAttribute('scaleFactor') as THREE.BufferAttribute;
-    scaleFactors.array = rc.pointScaleFactors;
-    scaleFactors.needsUpdate = true;
-  }
-
-  onRender(rc: RenderContext) {
-    if (!this.points) {
-      return;
-    }
-    const sceneIs3D: boolean = (rc.camera instanceof THREE.PerspectiveCamera);
-
-    this.setFogDistances(
-        sceneIs3D, rc.nearestCameraSpacePointZ, rc.farthestCameraSpacePointZ);
-
-    this.scene.fog = this.fog;
-    this.scene.fog.color = new THREE.Color(rc.backgroundColor);
-
-    this.renderMaterial.uniforms.fogColor.value = this.scene.fog.color;
-    this.renderMaterial.uniforms.fogNear.value = this.fog.near;
-    this.renderMaterial.uniforms.fogFar.value = this.fog.far;
-    this.renderMaterial.uniforms.spritesPerRow.value = this.spritesPerRow;
-    this.renderMaterial.uniforms.spritesPerColumn.value = this.spritesPerColumn;
-    this.renderMaterial.uniforms.isImage.value = (this.texture != null);
-    this.renderMaterial.uniforms.texture.value =
-        (this.texture != null) ? this.texture : this.standinTextureForPoints;
-    this.renderMaterial.uniforms.sizeAttenuation.value = sceneIs3D;
-    this.renderMaterial.uniforms.pointSize.value =
-        this.calculatePointSize(sceneIs3D);
-    this.points.material = this.renderMaterial;
-
-    let colors = (this.points.geometry as THREE.BufferGeometry)
-                     .getAttribute('color') as THREE.BufferAttribute;
-    this.renderColors = rc.pointColors;
-    colors.array = this.renderColors;
-    colors.needsUpdate = true;
-
-    let scaleFactors =
-        (this.points.geometry as THREE.BufferGeometry)
-            .getAttribute('scaleFactor') as THREE.BufferAttribute;
-    scaleFactors.array = rc.pointScaleFactors;
-    scaleFactors.needsUpdate = true;
-  }
-
-  onResize(newWidth: number, newHeight: number) {}
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/sptree.ts b/tensorflow/tensorboard/components/vz_projector/sptree.ts
deleted file mode 100644
index 991369a3352e9b3a778069194d270592ad5575c8..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/sptree.ts
+++ /dev/null
@@ -1,175 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/** N-dimensional point. Usually 2D or 3D. */
-export type Point = number[];
-
-export interface BBox {
-  center: Point;
-  halfDim: number;
-}
-
-/** A node in a space-partitioning tree. */
-export interface SPNode {
-  /** The children of this node. */
-  children?: SPNode[];
-  /** The bounding box of the region this node occupies. */
-  box: BBox;
-  /** One or more points this node has. */
-  point: Point;
-}
-
-/**
- * A Space-partitioning tree (https://en.wikipedia.org/wiki/Space_partitioning)
- * that recursively divides the space into regions of equal sizes. This data
- * structure can act both as a Quad tree and an Octree when the data is 2 or
- * 3 dimensional respectively. One usage is in t-SNE in order to do Barnes-Hut
- * approximation.
- */
-export class SPTree {
-  root: SPNode;
-
-  private masks: number[];
-  private dim: number;
-
-  /**
-   * Constructs a new tree with the provided data.
-   *
-   * @param data List of n-dimensional data points.
-   * @param capacity Number of data points to store in a single node.
-   */
-  constructor(data: Point[]) {
-    if (data.length < 1) {
-      throw new Error('There should be at least 1 data point');
-    }
-    // Make a bounding box based on the extent of the data.
-    this.dim = data[0].length;
-    // Each node has 2^d children, where d is the dimension of the space.
-    // Binary masks (e.g. 000, 001, ... 111 in 3D) are used to determine in
-    // which child (e.g. quadron in 2D) the new point is going to be assigned.
-    // For more details, see the insert() method and its comments.
-    this.masks = new Array(Math.pow(2, this.dim));
-    for (let d = 0; d < this.masks.length; ++d) {
-      this.masks[d] = (1 << d);
-    }
-    let min: Point = new Array(this.dim);
-    fillArray(min, Number.POSITIVE_INFINITY);
-    let max: Point = new Array(this.dim);
-    fillArray(max, Number.NEGATIVE_INFINITY);
-
-    for (let i = 0; i < data.length; ++i) {
-      // For each dim get the min and max.
-      // E.g. For 2-D, get the x_min, x_max, y_min, y_max.
-      for (let d = 0; d < this.dim; ++d) {
-        min[d] = Math.min(min[d], data[i][d]);
-        max[d] = Math.max(max[d], data[i][d]);
-      }
-    }
-    // Create a bounding box with the center of the largest span.
-    let center: Point = new Array(this.dim);
-    let halfDim = 0;
-    for (let d = 0; d < this.dim; ++d) {
-      let span = max[d] - min[d];
-      center[d] = min[d] + span / 2;
-      halfDim = Math.max(halfDim, span / 2);
-    }
-    this.root = {box: {center: center, halfDim: halfDim}, point: data[0]};
-    for (let i = 1; i < data.length; ++i) {
-      this.insert(this.root, data[i]);
-    }
-  }
-
-  /**
-   * Visits every node in the tree. Each node can store 1 or more points,
-   * depending on the node capacity provided in the constructor.
-   *
-   * @param accessor Method that takes the currently visited node, and the
-   * low and high point of the region that this node occupies. E.g. in 2D,
-   * the low and high points will be the lower-left corner and the upper-right
-   * corner.
-   */
-  visit(
-      accessor: (node: SPNode, lowPoint: Point, highPoint: Point) => boolean,
-      noBox = false) {
-    this.visitNode(this.root, accessor, noBox);
-  }
-
-  private visitNode(
-      node: SPNode,
-      accessor: (node: SPNode, lowPoint?: Point, highPoint?: Point) => boolean,
-      noBox: boolean) {
-    let skipChildren: boolean;
-    if (noBox) {
-      skipChildren = accessor(node);
-    } else {
-      let lowPoint = new Array(this.dim);
-      let highPoint = new Array(this.dim);
-      for (let d = 0; d < this.dim; ++d) {
-        lowPoint[d] = node.box.center[d] - node.box.halfDim;
-        highPoint[d] = node.box.center[d] + node.box.halfDim;
-      }
-      skipChildren = accessor(node, lowPoint, highPoint);
-    }
-    if (!node.children || skipChildren) {
-      return;
-    }
-    for (let i = 0; i < node.children.length; ++i) {
-      let child = node.children[i];
-      if (child) {
-        this.visitNode(child, accessor, noBox);
-      }
-    }
-  }
-
-  private insert(node: SPNode, p: Point) {
-    // Subdivide and then add the point to whichever node will accept it.
-    if (node.children == null) {
-      node.children = new Array(this.masks.length);
-    }
-
-    // Decide which child will get the new point by constructing a D-bits binary
-    // signature (D=3 for 3D) where the k-th bit is 1 if the point's k-th
-    // coordinate is greater than the node's k-th coordinate, 0 otherwise.
-    // Then the binary signature in decimal system gives us the index of the
-    // child where the new point should be.
-    let index = 0;
-    for (let d = 0; d < this.dim; ++d) {
-      if (p[d] > node.box.center[d]) {
-        index |= this.masks[d];
-      }
-    }
-    if (node.children[index] == null) {
-      this.makeChild(node, index, p);
-    } else {
-      this.insert(node.children[index], p);
-    }
-  }
-
-  private makeChild(node: SPNode, index: number, p: Point): void {
-    let oldC = node.box.center;
-    let h = node.box.halfDim / 2;
-    let newC: Point = new Array(this.dim);
-    for (let d = 0; d < this.dim; ++d) {
-      newC[d] = (index & (1 << d)) ? oldC[d] + h : oldC[d] - h;
-    }
-    node.children[index] = {box: {center: newC, halfDim: h}, point: p};
-  }
-}
-
-function fillArray<T>(arr: T[], value: T): void {
-  for (let i = 0; i < arr.length; ++i) {
-    arr[i] = value;
-  }
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/styles.html b/tensorflow/tensorboard/components/vz_projector/styles.html
deleted file mode 100644
index 32dc984b5d62512b9f46d0e38ca6ab413687dd24..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/styles.html
+++ /dev/null
@@ -1,185 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<dom-module id="vz-projector-styles">
-<template>
-<style>
-:host {
-  --paper-input-container-label: {
-    font-size: 14px;
-  };
-  --paper-input-container-input: {
-    font-size: 14px;
-  };
-  /* TODO: Figure out why this doesn't work */
-  --paper-dropdown-menu-input: {
-    font-size: 14px;
-  };
-}
-
-paper-button {
-  background: #e3e3e3;
-  margin-left: 0;
-  text-transform: none;
-}
-
-paper-dropdown-menu paper-item {
-  font-size: 13px;
-}
-
-paper-tooltip {
-  max-width: 200px;
-  --paper-tooltip: {
-    font-size: 12px;
-  };
-}
-
-paper-checkbox {
-  --paper-checkbox-checked-color: #880E4F;
-}
-
-paper-toggle-button {
-  --paper-toggle-button-checked-bar-color:  #880E4F;
-  --paper-toggle-button-checked-button-color:  #880E4F;
-  --paper-toggle-button-checked-ink-color: #880E4F;
-}
-
-paper-icon-button {
-  border-radius: 50%;
-}
-
-paper-icon-button[active] {
-  color: white;
-  background-color: #880E4F;
-}
-
-.slider {
-  display: flex;
-  align-items: center;
-  margin-bottom: 10px;
-  justify-content: space-between;
-}
-
-.slider span {
-  width: 35px;
-  text-align: right;
-}
-
-.slider label {
-  align-items: center;
-  display: flex;
-}
-
-.help-icon {
-  height: 15px;
-  left: 2px;
-  min-width: 15px;
-  min-height: 15px;
-  margin: 0;
-  padding: 0;
-  top: -2px;
-  width: 15px;
-}
-
-.ink-panel {
-  display: flex;
-  flex-direction: column;
-  font-size: 14px;
-}
-
-.ink-panel h4 {
-  border-bottom: 1px solid #ddd;
-  font-size: 14px;
-  font-weight: 500;
-  margin: 0;
-  margin-bottom: 10px;
-  padding-bottom: 5px;
-}
-
-.ink-panel-header {
-  border-bottom: 1px solid rgba(0, 0, 0, 0.1);
-  border-top: 1px solid rgba(0, 0, 0, 0.1);
-  height: 50px;
-}
-
-.ink-panel-content {
-  display: none;
-  height: 100%;
-}
-
-.ink-panel-content.active {
-  display: block;
-}
-
-.ink-panel-content h3 {
-  font-weight: 500;
-  font-size: 14px;
-  margin-top: 20px;
-  margin-bottom: 5px;
-  text-transform: uppercase;
-}
-
-.ink-panel-header h3 {
-  font-weight: 500;
-  font-size: 14px;
-  margin: 0;
-  padding: 0 24px;
-  text-transform: uppercase;
-}
-
-
-/* - Tabs */
-.ink-tab-group {
-  align-items: center;
-  box-sizing: border-box;
-  display: flex;
-  height: 100%;
-  justify-content: space-around;
-}
-
-.ink-tab-group .projection-tab {
-  color: rgba(0, 0, 0, 0.5);
-  cursor: pointer;
-  font-weight: 300;
-  line-height: 49px;
-  padding: 0 12px;
-  text-align: center;
-  text-transform: uppercase;
-}
-
-.ink-tab-group .projection-tab:hover {
-  color: black;
-}
-
-.ink-tab-group .projection-tab.active {
-  border-bottom: 2px solid black;
-  color: black;
-  font-weight: 500;
-}
-
-h4 {
-  margin: 30px 0 10px 0;
-}
-
-.dismiss-dialog-note {
-  margin-top: 25px;
-  font-size: 11px;
-  text-align: right;
-}
-</style>
-</template>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_projector/test/BUILD b/tensorflow/tensorboard/components/vz_projector/test/BUILD
deleted file mode 100644
index fc8659f06a3acb83e73c0ab753980be4d660b92b..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/test/BUILD
+++ /dev/null
@@ -1,36 +0,0 @@
-package(
-    default_testonly = True,
-    default_visibility = ["//tensorflow/tensorboard:internal"],
-)
-
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "test",
-    srcs = [
-        "assert.ts",
-        "data-provider_test.ts",
-        "data_test.ts",
-        "sptree_test.ts",
-        "tests.html",
-        "util_test.ts",
-        # "scatterPlotRectangleSelector_test.ts",
-        # "vz-projector-projections-panel_test.ts",
-    ],
-    path = "/vz-projector/test",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports:polymer",
-        "//tensorflow/tensorboard/components/tf_imports:web_component_tester",
-        "//tensorflow/tensorboard/components/tf_imports:webcomponentsjs",
-        "//tensorflow/tensorboard/components/vz_projector",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    testonly = 0,
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/vz_projector/test/data-provider_test.ts b/tensorflow/tensorboard/components/vz_projector/test/data-provider_test.ts
deleted file mode 100644
index 59a42ffbfd84d7a6731af504081b7f0c64d17592..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/test/data-provider_test.ts
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {DataPoint, SpriteAndMetadataInfo} from '../data';
-import * as data_provider from '../data-provider';
-
-/**
- * Converts a string to an ArrayBuffer.
- */
-function stringToArrayBuffer(str: string): Promise<ArrayBuffer> {
-  return new Promise<ArrayBuffer>((resolve, reject) => {
-    let blob = new Blob([str]);
-    let file = new FileReader();
-    file.onload = (e: any) => {
-      resolve(e.target.result);
-    };
-    file.readAsArrayBuffer(blob);
-  });
-}
-
-/**
- * Converts an data array to TSV format.
- */
-function dataToTsv(data: string[][]|number[][]) {
-  let lines = [];
-  for (let i = 0; i < data.length; i++) {
-    lines.push(data[i].join('\t'));
-  }
-  return lines.join('\n');
-}
-
-describe('parse tensors', () => {
-  it('parseTensors', (doneFn) => {
-    let tensors = [[1.0, 2.0], [2.0, 3.0]];
-    stringToArrayBuffer(dataToTsv(tensors))
-        .then((tensorsArrayBuffer: ArrayBuffer) => {
-          data_provider.parseTensors(tensorsArrayBuffer)
-              .then((data: DataPoint[]) => {
-                assert.equal(2, data.length);
-
-                assert.deepEqual(new Float32Array(tensors[0]), data[0].vector);
-                assert.equal(0, data[0].index);
-                assert.isNull(data[0].projections);
-
-                assert.deepEqual(new Float32Array(tensors[1]), data[1].vector);
-                assert.equal(1, data[1].index);
-                assert.isNull(data[1].projections);
-                doneFn();
-              });
-        });
-  });
-  it('parseMetadata', (doneFn) => {
-    let metadata = [['label', 'fakecol'], ['Г', '0'], ['label1', '1']];
-
-    stringToArrayBuffer(dataToTsv(metadata))
-        .then((metadataArrayBuffer: ArrayBuffer) => {
-          data_provider.parseMetadata(metadataArrayBuffer)
-              .then((spriteAndMetadataInfo: SpriteAndMetadataInfo) => {
-                assert.equal(2, spriteAndMetadataInfo.stats.length);
-                assert.equal(metadata[0][0],
-                             spriteAndMetadataInfo.stats[0].name);
-                assert.isFalse(spriteAndMetadataInfo.stats[0].isNumeric);
-                assert.isFalse(
-                    spriteAndMetadataInfo.stats[0].tooManyUniqueValues);
-                assert.equal(metadata[0][1],
-                             spriteAndMetadataInfo.stats[1].name);
-                assert.isTrue(spriteAndMetadataInfo.stats[1].isNumeric);
-                assert.isFalse(
-                    spriteAndMetadataInfo.stats[1].tooManyUniqueValues);
-
-                assert.equal(2, spriteAndMetadataInfo.pointsInfo.length);
-                assert.equal(metadata[1][0],
-                             spriteAndMetadataInfo.pointsInfo[0]['label']);
-                assert.equal(+metadata[1][1],
-                             spriteAndMetadataInfo.pointsInfo[0]['fakecol']);
-                assert.equal(metadata[2][0],
-                             spriteAndMetadataInfo.pointsInfo[1]['label']);
-                assert.equal(+metadata[2][1],
-                             spriteAndMetadataInfo.pointsInfo[1]['fakecol']);
-                doneFn();
-              });
-        });
-  });
-});
diff --git a/tensorflow/tensorboard/components/vz_projector/test/data_test.ts b/tensorflow/tensorboard/components/vz_projector/test/data_test.ts
deleted file mode 100644
index 5e47c091c5b5565ed084612b178201ee5ba19386..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/test/data_test.ts
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {DataPoint, DataSet, State, stateGetAccessorDimensions} from '../data';
-
-/**
- * Helper method that makes a list of points given an array of
- * sequence indexes.
- *
- * @param sequences The i-th entry holds the 'next' attribute for the i-th
- * point.
- */
-function makePointsWithSequences(
-    sequences: number[], nextAttr = '__seq_next__') {
-  let points: DataPoint[] = [];
-  sequences.forEach((t, i) => {
-    let metadata: {[key: string]: any} = {};
-    metadata[nextAttr] = t >= 0 ? t : null;
-    points.push({
-      vector: new Float32Array(0),
-      metadata: metadata,
-      projections: {},
-      index: i
-    });
-  });
-  return points;
-}
-
-describe('constructor_with_sequences', () => {
-  it('Simple forward pointing sequences, __seq_next__ metadata format', () => {
-    // The input is: 0->2, 1->None, 2->3, 3->None. This should return
-    // one sequence 0->2->3.
-    const points = makePointsWithSequences([2, -1, 3, -1]);
-    let dataset = new DataSet(points);
-    assert.equal(1, dataset.sequences.length);
-    assert.deepEqual([0, 2, 3], dataset.sequences[0].pointIndices);
-  });
-
-  it('Simple forward pointing sequences, __next__ metadata format', () => {
-    // The input is: 0->2, 1->None, 2->3, 3->None. This should return
-    // one sequence 0->2->3.
-    const points = makePointsWithSequences([2, -1, 3, -1], '__next__');
-    let dataset = new DataSet(points);
-    assert.equal(1, dataset.sequences.length);
-    assert.deepEqual([0, 2, 3], dataset.sequences[0].pointIndices);
-  });
-
-  it('No sequences', () => {
-    let points = makePointsWithSequences([-1, -1, -1, -1]);
-    let dataset = new DataSet(points);
-    assert.equal(0, dataset.sequences.length);
-  });
-
-  it('A sequence that goes backwards and forward in the array', () => {
-    // The input is: 0->2, 1->0, 2->nothing, 3->1. This should return
-    // one sequence 3->1->0->2.
-    let points = makePointsWithSequences([2, 0, -1, 1]);
-    let dataset = new DataSet(points);
-    assert.equal(1, dataset.sequences.length);
-    assert.deepEqual([3, 1, 0, 2], dataset.sequences[0].pointIndices);
-  });
-});
-
-describe('stateGetAccessorDimensions', () => {
-  it('returns [0, 1] for 2d t-SNE', () => {
-    const state = new State();
-    state.selectedProjection = 'tsne';
-    state.tSNEis3d = false;
-    assert.deepEqual([0, 1], stateGetAccessorDimensions(state));
-  });
-
-  it('returns [0, 1, 2] for 3d t-SNE', () => {
-    const state = new State();
-    state.selectedProjection = 'tsne';
-    state.tSNEis3d = true;
-    assert.deepEqual([0, 1, 2], stateGetAccessorDimensions(state));
-  });
-
-  it('returns pca component dimensions array for pca', () => {
-    const state = new State();
-    state.selectedProjection = 'pca';
-    state.pcaComponentDimensions = [13, 12, 11, 10];
-    assert.deepEqual(state.pcaComponentDimensions,
-                     stateGetAccessorDimensions(state));
-  });
-
-  it('returns ["x", "y"] for custom projections', () => {
-    const state = new State();
-    state.selectedProjection = 'custom';
-    assert.deepEqual(['x', 'y'], stateGetAccessorDimensions(state));
-  });
-});
diff --git a/tensorflow/tensorboard/components/vz_projector/test/scatterPlotRectangleSelector_test.ts b/tensorflow/tensorboard/components/vz_projector/test/scatterPlotRectangleSelector_test.ts
deleted file mode 100644
index 0ee6cf620df8bb082adf424a66548b832346597d..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/test/scatterPlotRectangleSelector_test.ts
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {BoundingBox, ScatterPlotRectangleSelector} from '../scatterPlotRectangleSelector';
-
-describe('selector callbacks make bounding box start bottom left', () => {
-  let containerElement: HTMLElement;
-  let selectionCallback: (boundingBox: BoundingBox) => void;
-  let selection: ScatterPlotRectangleSelector;
-
-  beforeEach(() => {
-    containerElement = document.createElement('div');
-    const selector = document.createElement('svg');
-    selector.id = 'selector';
-    containerElement.appendChild(selector);
-
-    selectionCallback = jasmine.createSpy('selectionCallback');
-    selection =
-        new ScatterPlotRectangleSelector(containerElement, selectionCallback);
-  });
-
-  it('Simple mouse event starting top left', () => {
-    selection.onMouseDown(0, 0);
-    selection.onMouseMove(10, 10);
-    selection.onMouseUp();
-
-    expect(selectionCallback)
-        .toHaveBeenCalledWith({x: 0, y: 10, width: 10, height: 10});
-  });
-
-  it('Simple mouse event starting bottom left', () => {
-    selection.onMouseDown(0, 10);
-    selection.onMouseMove(10, 0);
-    selection.onMouseUp();
-
-    expect(selectionCallback)
-        .toHaveBeenCalledWith({x: 0, y: 10, width: 10, height: 10});
-  });
-
-  it('Simple mouse event starting top right', () => {
-    selection.onMouseDown(10, 0);
-    selection.onMouseMove(0, 10);
-    selection.onMouseUp();
-
-    expect(selectionCallback)
-        .toHaveBeenCalledWith({x: 0, y: 10, width: 10, height: 10});
-  });
-
-  it('Simple mouse event starting bottom right', () => {
-    selection.onMouseDown(10, 10);
-    selection.onMouseMove(0, 0);
-    selection.onMouseUp();
-
-    expect(selectionCallback)
-        .toHaveBeenCalledWith({x: 0, y: 10, width: 10, height: 10});
-  });
-});
diff --git a/tensorflow/tensorboard/components/vz_projector/test/sptree_test.ts b/tensorflow/tensorboard/components/vz_projector/test/sptree_test.ts
deleted file mode 100644
index 7e340ea62f5d1146e11b8321f4668dc97d14e0c8..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/test/sptree_test.ts
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {SPTree} from '../sptree';
-
-it('simple 2D data', () => {
-  let data = [
-    [0, 1],
-    [1, 0],
-    [1, 1],
-    [0, 0],
-  ];
-  let tree = new SPTree(data);
-  // Check that each point is within the bound.
-  tree.visit((node, low, high) => {
-    assert.equal(low.length, 2);
-    assert.equal(high.length, 2);
-    let point = node.point;
-    assert.equal(point.length, 2);
-    // Each point should be in the node's bounding box.
-    assert.equal(
-        point[0] >= low[0] && point[0] <= high[0] && point[1] >= low[1] &&
-            point[1] <= high[1],
-        true);
-    return false;
-  });
-});
-
-it('simple 3D data', () => {
-  let data = [
-    [0, 1, 0],
-    [1, 0.4, 2],
-    [1, 1, 3],
-    [0, 0, 5],
-  ];
-  let tree = new SPTree(data);
-  // Check that each point is within the bound.
-  tree.visit((node, low, high) => {
-    assert.equal(low.length, 3);
-    assert.equal(high.length, 3);
-    let point = node.point;
-    assert.equal(point.length, 3);
-    // Each point should be in the node's bounding box.
-    assert.equal(
-        point[0] >= low[0] && point[0] <= high[0] && point[1] >= low[1] &&
-            point[1] <= high[1] && point[2] >= low[2] && point[2] <= high[2],
-        true);
-    return false;
-  });
-});
-
-it('Only visit root', () => {
-  let data = [
-    [0, 1, 0],
-    [1, 0.4, 2],
-    [1, 1, 3],
-    [0, 0, 5],
-  ];
-  let tree = new SPTree(data);
-  let numVisits = 0;
-  tree.visit((node, low, high) => {
-    numVisits++;
-    return true;
-  });
-  assert.equal(numVisits, 1);
-});
-
-it('Search in random data', () => {
-  let N = 10000;
-  let data = new Array(N);
-  for (let i = 0; i < N; i++) {
-    data[i] = [Math.random(), Math.random()];
-  }
-  let tree = new SPTree(data);
-  let numVisits = 0;
-  let query = data[Math.floor(Math.random() * N)];
-  let found = false;
-  tree.visit((node, low, high) => {
-    numVisits++;
-    if (node.point === query) {
-      found = true;
-      return true;
-    }
-    let outOfBounds = query[0] < low[0] || query[0] > high[0] ||
-        query[1] < low[1] || query[1] > high[1];
-    return outOfBounds;
-  });
-  assert.equal(found, true);
-  assert.isBelow(numVisits, N / 4);
-});
diff --git a/tensorflow/tensorboard/components/vz_projector/test/tests.html b/tensorflow/tensorboard/components/vz_projector/test/tests.html
deleted file mode 100644
index a6843d0d6b827ef71e9eddd1f4debd4e63478892..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/test/tests.html
+++ /dev/null
@@ -1,31 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<meta charset="utf-8">
-<script src="../../web-component-tester/browser.js"></script>
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../bundle.html">
-<body>
-<script src="assert.js"></script>
-<script src="sptree_test.js"></script>
-<script src="data_test.js"></script>
-<script src="data-provider_test.js"></script>
-<script src="util_test.js"></script>
-<!-- TODO(smilkov): Migrate these away from jasmine. -->
-<!-- <script src="scatterPlotRectangleSelector_test.js"></script>
-     <script src="vz-projector-projections-panel_test.js"></script> -->
diff --git a/tensorflow/tensorboard/components/vz_projector/test/util_test.ts b/tensorflow/tensorboard/components/vz_projector/test/util_test.ts
deleted file mode 100644
index c18db95eed706a3eacd09486fca2b67b5e01f595..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/test/util_test.ts
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-import * as util from '../util';
-
-describe('getURLParams', () => {
-  it('search query with valid param returns correct object', () => {
-    let urlParams = util.getURLParams('?config=http://google.com/');
-    assert.deepEqual({'config': 'http://google.com/'}, urlParams);
-  });
-
-  it('search query with multiple valid params returns correct object', () => {
-    let urlParams = util.getURLParams('?config=http://google.com/&foo=bar');
-    assert.deepEqual({'config': 'http://google.com/', 'foo': 'bar'}, urlParams);
-  });
-
-  it('search query with valid param with URL encoded characters', () => {
-    let urlParams = util.getURLParams('?config=http://google.com/%20search');
-    assert.deepEqual({'config': 'http://google.com/ search'}, urlParams);
-  });
-
-  it('search query with pound sign', () => {
-    let urlParams = util.getURLParams('?config=http://google.com/#foo');
-    assert.deepEqual({'config': 'http://google.com/'}, urlParams);
-  });
-
-  it('no search query returns empty object', () => {
-    let urlParams = util.getURLParams('');
-    assert.deepEqual({}, urlParams);
-  });
-});
diff --git a/tensorflow/tensorboard/components/vz_projector/test/vz-projector-projections-panel_test.ts b/tensorflow/tensorboard/components/vz_projector/test/vz-projector-projections-panel_test.ts
deleted file mode 100644
index 2bf0c6eb48f019e2467d7c9451748696bb6ed54d..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/test/vz-projector-projections-panel_test.ts
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-import {State} from '../data';
-import {ProjectionsPanel} from '../vz-projector-projections-panel';
-
-describe('restoreUIFromBookmark', () => {
-  let projectionsPanel: ProjectionsPanel;
-  beforeEach(() => {
-    projectionsPanel = document.createElement(ProjectionsPanel.prototype.is) as
-        ProjectionsPanel;
-
-    // Set up some of the UI so the elements are found in the production code.
-    const tsnePerplexityContainer = document.createElement('div');
-    tsnePerplexityContainer.className = 'tsne-perplexity';
-    const tsnePerplexity = document.createElement('span');
-    tsnePerplexityContainer.appendChild(tsnePerplexity);
-    projectionsPanel.appendChild(tsnePerplexityContainer);
-
-    const tsneLearningRateContainer = document.createElement('div');
-    tsneLearningRateContainer.className = 'tsne-learning-rate';
-    const tsneLearningRate = document.createElement('span');
-    tsneLearningRateContainer.appendChild(tsneLearningRate);
-    projectionsPanel.appendChild(tsneLearningRateContainer);
-  });
-
-  it('sets the pcaX/Y properties when setting 2D component values', () => {
-    spyOn(projectionsPanel, 'setZDropdownEnabled');
-
-    const s = new State();
-    s.pcaComponentDimensions = [0, 1];
-    projectionsPanel.restoreUIFromBookmark(s);
-
-    assert.equal(0, projectionsPanel.pcaX);
-    assert.equal(1, projectionsPanel.pcaY);
-
-    expect(projectionsPanel.setZDropdownEnabled).toHaveBeenCalledWith(false);
-  });
-
-  it('sets the pcaX/Y properties when setting 3D component values', () => {
-    spyOn(projectionsPanel, 'setZDropdownEnabled');
-
-    const s = new State();
-    s.pcaComponentDimensions = [0, 1, 2];
-    projectionsPanel.restoreUIFromBookmark(s);
-
-    assert.equal(0, projectionsPanel.pcaX);
-    assert.equal(1, projectionsPanel.pcaY);
-    assert.equal(2, projectionsPanel.pcaZ);
-
-    expect(projectionsPanel.setZDropdownEnabled).toHaveBeenCalledWith(true);
-  });
-});
-
-describe('populateBookmarkFromUI', () => {
-  let projectionsPanel: ProjectionsPanel;
-
-  beforeEach(() => {
-    projectionsPanel = document.createElement(ProjectionsPanel.prototype.is) as
-        ProjectionsPanel;
-
-    // Set up some of the UI so the elements are found in the production code.
-    const tsnePerplexityContainer = document.createElement('div');
-    tsnePerplexityContainer.className = 'tsne-perplexity';
-    const tsnePerplexity = document.createElement('span');
-    tsnePerplexityContainer.appendChild(tsnePerplexity);
-    projectionsPanel.appendChild(tsnePerplexityContainer);
-
-    const tsneLearningRateContainer = document.createElement('div');
-    tsneLearningRateContainer.className = 'tsne-learning-rate';
-    const tsneLearningRate = document.createElement('span');
-    tsneLearningRateContainer.appendChild(tsneLearningRate);
-    projectionsPanel.appendChild(tsneLearningRateContainer);
-  });
-
-  it('gets the PCA component UI values from a 2D PCA projection', () => {
-    projectionsPanel.pcaX = 0;
-    projectionsPanel.pcaY = 1;
-    projectionsPanel.pcaIs3d = false;
-
-    const s = new State();
-    projectionsPanel.populateBookmarkFromUI(s);
-    assert.deepEqual([0, 1], s.pcaComponentDimensions);
-  });
-
-  it('gets the PCA component UI values from a 3D PCA projection', () => {
-    projectionsPanel.pcaX = 0;
-    projectionsPanel.pcaY = 1;
-    projectionsPanel.pcaZ = 2;
-    projectionsPanel.pcaIs3d = true;
-
-    const s = new State();
-    projectionsPanel.populateBookmarkFromUI(s);
-    assert.deepEqual([0, 1, 2], s.pcaComponentDimensions);
-  });
-});
diff --git a/tensorflow/tensorboard/components/vz_projector/util.ts b/tensorflow/tensorboard/components/vz_projector/util.ts
deleted file mode 100644
index bd6df68b1a5965d7289db1eb8ecda528938908bb..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/util.ts
+++ /dev/null
@@ -1,252 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {DataPoint} from './data';
-import * as logging from './logging';
-import {Point2D} from './vector';
-
-/**
- * Delay for running expensive tasks, in milliseconds.
- * The duration was empirically found so that it leaves enough time for the
- * browser to update its UI state before starting an expensive UI-blocking task.
- */
-const TASK_DELAY_MS = 200;
-
-/** Shuffles the array in-place in O(n) time using Fisher-Yates algorithm. */
-export function shuffle<T>(array: T[]): T[] {
-  let m = array.length;
-  let t: T;
-  let i: number;
-
-  // While there remain elements to shuffle.
-  while (m) {
-    // Pick a remaining element
-    i = Math.floor(Math.random() * m--);
-    // And swap it with the current element.
-    t = array[m];
-    array[m] = array[i];
-    array[i] = t;
-  }
-  return array;
-}
-
-export function range(count: number): number[] {
-  const rangeOutput: number[] = [];
-  for (let i = 0; i < count; i++) {
-    rangeOutput.push(i);
-  }
-  return rangeOutput;
-}
-
-export function classed(
-    element: HTMLElement, className: string, enabled: boolean) {
-  const classNames = element.className.split(' ');
-  if (enabled) {
-    if (className in classNames) {
-      return;
-    } else {
-      classNames.push(className);
-    }
-  } else {
-    const index = classNames.indexOf(className);
-    if (index === -1) {
-      return;
-    }
-    classNames.splice(index, 1);
-  }
-  element.className = classNames.join(' ');
-}
-
-/** Projects a 3d point into screen space */
-export function vector3DToScreenCoords(
-    cam: THREE.Camera, w: number, h: number, v: THREE.Vector3): Point2D {
-  let dpr = window.devicePixelRatio;
-  let pv = new THREE.Vector3().copy(v).project(cam);
-
-  // The screen-space origin is at the middle of the screen, with +y up.
-  let coords: Point2D =
-      [((pv.x + 1) / 2 * w) * dpr, -((pv.y - 1) / 2 * h) * dpr];
-  return coords;
-}
-
-/** Loads 3 contiguous elements from a packed xyz array into a Vector3. */
-export function vector3FromPackedArray(
-    a: Float32Array, pointIndex: number): THREE.Vector3 {
-  const offset = pointIndex * 3;
-  return new THREE.Vector3(a[offset], a[offset + 1], a[offset + 2]);
-}
-
-/**
- * Gets the camera-space z coordinates of the nearest and farthest points.
- * Ignores points that are behind the camera.
- */
-export function getNearFarPoints(
-    worldSpacePoints: Float32Array, cameraPos: THREE.Vector3,
-    cameraTarget: THREE.Vector3): [number, number] {
-  let shortestDist: number = Infinity;
-  let furthestDist: number = 0;
-  const camToTarget = new THREE.Vector3().copy(cameraTarget).sub(cameraPos);
-  const camPlaneNormal = new THREE.Vector3().copy(camToTarget).normalize();
-  const n = worldSpacePoints.length / 3;
-  let src = 0;
-  let p = new THREE.Vector3();
-  let camToPoint = new THREE.Vector3();
-  for (let i = 0; i < n; i++) {
-    p.x = worldSpacePoints[src];
-    p.y = worldSpacePoints[src + 1];
-    p.z = worldSpacePoints[src + 2];
-    src += 3;
-
-    camToPoint.copy(p).sub(cameraPos);
-    const dist = camPlaneNormal.dot(camToPoint);
-    if (dist < 0) {
-      continue;
-    }
-    furthestDist = (dist > furthestDist) ? dist : furthestDist;
-    shortestDist = (dist < shortestDist) ? dist : shortestDist;
-  }
-  return [shortestDist, furthestDist];
-}
-
-/**
- * Generate a texture for the points/images and sets some initial params
- */
-export function createTexture(image: HTMLImageElement|
-                              HTMLCanvasElement): THREE.Texture {
-  let tex = new THREE.Texture(image);
-  tex.needsUpdate = true;
-  // Used if the texture isn't a power of 2.
-  tex.minFilter = THREE.LinearFilter;
-  tex.generateMipmaps = false;
-  tex.flipY = false;
-  return tex;
-}
-
-/**
- * Assert that the condition is satisfied; if not, log user-specified message
- * to the console.
- */
-export function assert(condition: boolean, message?: string) {
-  if (!condition) {
-    message = message || 'Assertion failed';
-    throw new Error(message);
-  }
-}
-
-export type SearchPredicate = (p: DataPoint) => boolean;
-
-export function getSearchPredicate(
-    query: string, inRegexMode: boolean, fieldName: string): SearchPredicate {
-  let predicate: SearchPredicate;
-  if (inRegexMode) {
-    let regExp = new RegExp(query, 'i');
-    predicate = p => regExp.test(p.metadata[fieldName].toString());
-  } else {
-    // Doing a case insensitive substring match.
-    query = query.toLowerCase();
-    predicate = p => {
-      let label = p.metadata[fieldName].toString().toLowerCase();
-      return label.indexOf(query) >= 0;
-    };
-  }
-  return predicate;
-}
-
-/**
- * Runs an expensive task asynchronously with some delay
- * so that it doesn't block the UI thread immediately.
- *
- * @param message The message to display to the user.
- * @param task The expensive task to run.
- * @param msgId Optional. ID of an existing message. If provided, will overwrite
- *     an existing message and won't automatically clear the message when the
- *     task is done.
- * @return The value returned by the task.
- */
-export function runAsyncTask<T>(
-    message: string, task: () => T, msgId: string = null): Promise<T> {
-  let autoClear = (msgId == null);
-  msgId = logging.setModalMessage(message, msgId);
-  return new Promise<T>((resolve, reject) => {
-    setTimeout(() => {
-      try {
-        let result = task();
-        // Clearing the old message.
-        if (autoClear) {
-          logging.setModalMessage(null, msgId);
-        }
-        resolve(result);
-      } catch (ex) {
-        reject(ex);
-      }
-      return true;
-    }, TASK_DELAY_MS);
-  });
-}
-
-
-/**
- * Parses the URL for query parameters, e.g. ?foo=1&bar=2 will return
- *   {'foo': '1', 'bar': '2'}.
- * @param url The URL to parse.
- * @return A map of queryParam key to its value.
- */
-export function getURLParams(url: string): {[key: string]: string} {
-  if (!url) {
-    return {};
-  }
-
-  let queryString = url.indexOf('?') !== -1 ? url.split('?')[1] : url;
-  if (queryString.indexOf('#')) {
-    queryString = queryString.split('#')[0];
-  }
-
-  const queryEntries = queryString.split('&');
-  let queryParams: {[key: string]: string} = {};
-  for (let i = 0; i < queryEntries.length; i++) {
-    let queryEntryComponents = queryEntries[i].split('=');
-    queryParams[queryEntryComponents[0].toLowerCase()] =
-        decodeURIComponent(queryEntryComponents[1]);
-  }
-  return queryParams;
-}
-
-/** List of substrings that auto generated tensors have in their name. */
-const SUBSTR_GEN_TENSORS = ['/Adagrad'];
-
-/** Returns true if the tensor was automatically generated by TF API calls. */
-export function tensorIsGenerated(tensorName: string): boolean {
-  for (let i = 0; i < SUBSTR_GEN_TENSORS.length; i++) {
-    if (tensorName.indexOf(SUBSTR_GEN_TENSORS[i]) >= 0) {
-      return true;
-    }
-  }
-  return false;
-}
-
-export function xor(cond1: boolean, cond2: boolean): boolean {
-  return (cond1 || cond2) && !(cond1 && cond2);
-}
-
-/** Checks to see if the browser supports webgl. */
-export function hasWebGLSupport(): boolean {
-  try {
-    let c = document.createElement('canvas');
-    let gl = c.getContext('webgl') || c.getContext('experimental-webgl');
-    return gl != null && typeof weblas !== 'undefined';
-  } catch (e) {
-    return false;
-  }
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/vector.ts b/tensorflow/tensorboard/components/vz_projector/vector.ts
deleted file mode 100644
index cab30483138d2610b020d74e24775f8c779e845a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vector.ts
+++ /dev/null
@@ -1,265 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {assert} from './util';
-
-/**
- * @fileoverview Useful vector utilities.
- */
-
-export type Vector = Float32Array | number[];
-export type Point2D = [number, number];
-export type Point3D = [number, number, number];
-
-/** Returns the dot product of two vectors. */
-export function dot(a: Vector, b: Vector): number {
-  assert(a.length === b.length, 'Vectors a and b must be of same length');
-  let result = 0;
-  for (let i = 0; i < a.length; ++i) {
-    result += a[i] * b[i];
-  }
-  return result;
-}
-
-/** Sums all the elements in the vector */
-export function sum(a: Vector): number {
-  let result = 0;
-  for (let i = 0; i < a.length; ++i) {
-    result += a[i];
-  }
-  return result;
-}
-
-/** Returns the sum of two vectors, i.e. a + b */
-export function add(a: Vector, b: Vector): Float32Array {
-  assert(a.length === b.length, 'Vectors a and b must be of same length');
-  let result = new Float32Array(a.length);
-  for (let i = 0; i < a.length; ++i) {
-    result[i] = a[i] + b[i];
-  }
-  return result;
-}
-
-/** Subtracts vector b from vector a, i.e. returns a - b */
-export function sub(a: Vector, b: Vector): Float32Array {
-  assert(a.length === b.length, 'Vectors a and b must be of same length');
-  let result = new Float32Array(a.length);
-  for (let i = 0; i < a.length; ++i) {
-    result[i] = a[i] - b[i];
-  }
-  return result;
-}
-
-/** Returns the square norm of the vector */
-export function norm2(a: Vector): number {
-  let result = 0;
-  for (let i = 0; i < a.length; ++i) {
-    result += a[i] * a[i];
-  }
-  return result;
-}
-
-/** Returns the euclidean distance between two vectors. */
-export function dist(a: Vector, b: Vector): number {
-  return Math.sqrt(dist2(a, b));
-}
-
-/** Returns the square euclidean distance between two vectors. */
-export function dist2(a: Vector, b: Vector): number {
-  assert(a.length === b.length, 'Vectors a and b must be of same length');
-  let result = 0;
-  for (let i = 0; i < a.length; ++i) {
-    let diff = a[i] - b[i];
-    result += diff * diff;
-  }
-  return result;
-}
-
-/** Returns the square euclidean distance between two 2D points. */
-export function dist2_2D(a: Vector, b: Vector): number {
-  let dX = a[0] - b[0];
-  let dY = a[1] - b[1];
-  return dX * dX + dY * dY;
-}
-
-/** Returns the square euclidean distance between two 3D points. */
-export function dist2_3D(a: Vector, b: Vector): number {
-  let dX = a[0] - b[0];
-  let dY = a[1] - b[1];
-  let dZ = a[2] - b[2];
-  return dX * dX + dY * dY + dZ * dZ;
-}
-
-/** Returns the euclidean distance between 2 3D points. */
-export function dist_3D(a: Vector, b: Vector): number {
-  return Math.sqrt(dist2_3D(a, b));
-}
-
-/**
- * Returns the square euclidean distance between two vectors, with an early
- * exit (returns -1) if the distance is >= to the provided limit.
- */
-export function dist2WithLimit(a: Vector, b: Vector, limit: number): number {
-  assert(a.length === b.length, 'Vectors a and b must be of same length');
-  let result = 0;
-  for (let i = 0; i < a.length; ++i) {
-    let diff = a[i] - b[i];
-    result += diff * diff;
-    if (result >= limit) {
-      return -1;
-    }
-  }
-  return result;
-}
-
-/** Returns the square euclidean distance between two 2D points. */
-export function dist22D(a: Point2D, b: Point2D): number {
-  let dX = a[0] - b[0];
-  let dY = a[1] - b[1];
-  return dX * dX + dY * dY;
-}
-
-/** Modifies the vector in-place to have unit norm. */
-export function unit(a: Vector): void {
-  let norm = Math.sqrt(norm2(a));
-  assert(norm >= 0, 'Norm of the vector must be > 0');
-  for (let i = 0; i < a.length; ++i) {
-    a[i] /= norm;
-  }
-}
-
-/**
- *  Projects the vectors to a lower dimension
- *
- * @param vectors Array of vectors to be projected.
- * @param newDim The resulting dimension of the vectors.
- */
-export function projectRandom(vectors: Float32Array[], newDim: number):
-    Float32Array[] {
-  let dim = vectors[0].length;
-  let N = vectors.length;
-  let newVectors: Float32Array[] = new Array(N);
-  for (let i = 0; i < N; ++i) {
-    newVectors[i] = new Float32Array(newDim);
-  }
-  // Make nDim projections.
-  for (let k = 0; k < newDim; ++k) {
-    let randomVector = rn(dim);
-    for (let i = 0; i < N; ++i) {
-      newVectors[i][k] = dot(vectors[i], randomVector);
-    }
-  }
-  return newVectors;
-}
-
-/**
- * Projects a vector onto a 2D plane specified by the two direction vectors.
- */
-export function project2d(a: Vector, dir1: Vector, dir2: Vector): Point2D {
-  return [dot(a, dir1), dot(a, dir2)];
-}
-
-/**
- * Computes the centroid of the data points. If the provided data points are not
- * vectors, an accessor function needs to be provided.
- */
-export function centroid<T>(dataPoints: T[], accessor?: (a: T) => Vector):
-    Vector {
-  if (dataPoints.length === 0) {
-    return null;
-  }
-  if (accessor == null) {
-    accessor = (a: T) => <any>a;
-  }
-  assert(dataPoints.length >= 0, '`vectors` must be of length >= 1');
-  let centroid = new Float32Array(accessor(dataPoints[0]).length);
-  for (let i = 0; i < dataPoints.length; ++i) {
-    let dataPoint = dataPoints[i];
-    let vector = accessor(dataPoint);
-    for (let j = 0; j < centroid.length; ++j) {
-      centroid[j] += vector[j];
-    }
-  }
-  for (let j = 0; j < centroid.length; ++j) {
-    centroid[j] /= dataPoints.length;
-  }
-  return centroid;
-}
-
-/**
- * Generates a vector of the specified size where each component is drawn from
- * a random (0, 1) gaussian distribution.
- */
-export function rn(size: number): Float32Array {
-  const normal = d3.randomNormal();
-  let result = new Float32Array(size);
-  for (let i = 0; i < size; ++i) {
-    result[i] = normal();
-  }
-  return result;
-}
-
-/**
- * Returns the cosine distance ([0, 2]) between two vectors
- * that have been normalized to unit norm.
- */
-export function cosDistNorm(a: Vector, b: Vector): number {
-  return 1 - dot(a, b);
-}
-
-/**
- * Returns the cosine distance ([0, 2]) between two vectors.
- */
-export function cosDist(a: Vector, b: Vector): number {
-  return 1 - cosSim(a, b);
-}
-
-/** Returns the cosine similarity ([-1, 1]) between two vectors. */
-export function cosSim(a: Vector, b: Vector): number {
-  return dot(a, b) / Math.sqrt(norm2(a) * norm2(b));
-}
-
-/**
- * Converts list of vectors (matrix) into a 1-dimensional
- * typed array with row-first order.
- */
-export function toTypedArray<T>(
-    dataPoints: T[], accessor: (dataPoint: T) => Float32Array): Float32Array {
-  let N = dataPoints.length;
-  let dim = accessor(dataPoints[0]).length;
-  let result = new Float32Array(N * dim);
-  for (let i = 0; i < N; ++i) {
-    let vector = accessor(dataPoints[i]);
-    for (let d = 0; d < dim; ++d) {
-      result[i * dim + d] = vector[d];
-    }
-  }
-  return result;
-}
-
-/**
- * Transposes an RxC matrix represented as a flat typed array
- * into a CxR matrix, again represented as a flat typed array.
- */
-export function transposeTypedArray(
-    r: number, c: number, typedArray: Float32Array) {
-  let result = new Float32Array(r * c);
-  for (let i = 0; i < r; ++i) {
-    for (let j = 0; j < c; ++j) {
-      result[j * r + i] = typedArray[i * c + j];
-    }
-  }
-  return result;
-}
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-app.html b/tensorflow/tensorboard/components/vz_projector/vz-projector-app.html
deleted file mode 100644
index e19f0364c441ba4e37836fde2230ca7830095f76..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-app.html
+++ /dev/null
@@ -1,105 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-icon-button/paper-icon-button.html">
-<link rel="import" href="../paper-tooltip/paper-tooltip.html">
-
-<link rel="import" href="vz-projector.html">
-<link rel="import" href="styles.html">
-
-<dom-module id="vz-projector-app">
-<template>
-<style include="vz-projector-styles"></style>
-<style>
-#appbar {
-  display: flex;
-  align-items: center;
-  justify-content: space-between;
-  padding: 0 24px;
-  height: 60px;
-  color: white;
-  background: #560731;
-}
-
-#appbar .logo {
-  font-size: 18px;
-  font-weight: 300;
-}
-
-.icons {
-  display: flex;
-}
-
-.icons a {
-  color: white;
-}
-
-vz-projector {
-  height: calc(100% - 60px);
-}
-
-#container {
-  height: 100%;
-}
-</style>
-
-<div id="container">
-  <div id="appbar">
-    <div>Embedding Projector</div>
-    <div class="icons">
-      <a title="Documentation" target="_blank" href="[[documentationLink]]">
-        <paper-icon-button icon="help-outline"></paper-icon-button>
-        <paper-tooltip position="bottom" animation-delay="0" fit-to-visible-bounds>
-          Open documentation
-        </paper-tooltip>
-      </a>
-      <a title="Report bug" target="_blank" href="[[bugReportLink]]">
-        <paper-icon-button icon="bug-report"></paper-icon-button>
-        <paper-tooltip position="bottom" animation-delay="0" fit-to-visible-bounds>
-          Report a bug
-        </paper-tooltip>
-      </a>
-    </div>
-  </div>
-  <vz-projector route-prefix="[[routePrefix]]"
-      serving-mode="[[servingMode]]"
-      projector-config-json-path="[[projectorConfigJsonPath]]"
-      page-view-logging="[[pageViewLogging]]"
-      event-logging="[[eventLogging]]">
-  </vz-projector>
-</div>
-</template>
-<!-- Google analytics -->
-<script jscomp-nocompile>
-  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
-  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
-  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
-  })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
-
-  ga('create', 'UA-46457317-5', 'auto');
-</script>
-<script>
-  Polymer({
-    is: 'vz-projector-app',
-    properties: {
-      pageViewLogging: {type: Boolean, value: false},
-      eventLogging: {type: Boolean, value: false}
-    }
-  });
-</script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-bookmark-panel.html b/tensorflow/tensorboard/components/vz_projector/vz-projector-bookmark-panel.html
deleted file mode 100644
index f3f3f59a9486ed7ecd4e27de49dc10ef7557aee7..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-bookmark-panel.html
+++ /dev/null
@@ -1,207 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../iron-collapse/iron-collapse.html">
-<link rel="import" href="../paper-input/paper-textarea.html">
-<link rel="import" href="../paper-icon-button/paper-icon-button.html">
-<link rel="import" href="../paper-tooltip/paper-tooltip.html">
-<link rel="import" href="styles.html">
-<link rel="import" href="bundle.html">
-
-<dom-module id="vz-projector-bookmark-panel">
-<template>
-<style include="vz-projector-styles"></style>
-<style>
-#title {
-  background-color: #fafafa;
-  color: black;
-  font-weight: 500;
-  left: 0;
-  line-height: 60px;
-  padding-left: 24px;
-  position: absolute;
-  width: 276px;
-}
-#bookmark-container {
-  background-color: #fafafa;
-}
-#icon-container {
-  line-height: 60px;
-  position: absolute;
-  right: 0;
-}
-#header {
-  border-top: 1px solid rgba(0, 0, 0, 0.1);
-  position: relative;
-}
-#panel {
-  background-color: #fafafa;
-  position: relative;
-  overflow-y: scroll;
-  top: 60px;
-  max-height: 50vh;
-}
-
-#save-container {
-  text-align: center;
-}
-
-.state-radio {
-  display: table-cell;
-  vertical-align: middle;
-  padding-top: 16px;
-}
-
-.state-label {
-  display: table-cell;
-  vertical-align: middle;
-  top: 14px;
-}
-
-.state-label-input {
-  width: 194px;
-}
-
-.state-clear {
-  display: table-cell;
-  vertical-align: middle;
-  padding-top: 20px;
-}
-#state-file {
-  display: none;
-}
-#no-bookmarks {
-  padding: 0 24px;
-}
-#action-buttons-container .add-icon-button {
-  background-color: #03a9f4;
-  color: white;
-  margin: 0 4px 4px auto;
-  right: 7px;
-  top: -4px;
-}
-.upload-download-icon-button {
-  padding: 0;
-}
-#action-buttons-container {
-  display: flex;
-  margin-left: 34px;
-  margin-top: 6px;
-}
-.ink-fab {
-  border-radius: 50%;
-  background: white;
-  box-shadow: 0 1px 3px rgba(0, 0, 0, 0.3);
-}
-paper-textarea {
-  --paper-input-container-input: {
-    font-size: 12px;
-  }
-  --paper-font-caption: {
-    display: none
-  }
-}
-</style>
-
-<!-- Bookmarking controls -->
-<div id="bookmark-container">
-  <div id="header">
-    <div id="title">
-      BOOKMARKS ([[savedStates.length]])
-      <paper-icon-button icon="help" class="help-icon"></paper-icon-button>
-      <paper-tooltip animation-delay="0" position="top" offset="0">
-        Open this drawer to save a set of views of the projection, including
-        selected points. A file containing the bookmarks can then be saved and
-        later loaded to view them.
-      </paper-tooltip>
-    </div>
-    <div id="icon-container">
-      <!-- Icons and event handlers are inverted because the tray expands upwards. -->
-      <paper-icon-button id="expand-more"
-          icon="expand-less"
-          on-tap="_expandMore"></paper-icon-button>
-      <paper-icon-button id="expand-less"
-          style="display: none"
-          icon="expand-more"
-          on-tap="_expandLess"></paper-icon-button>
-    </div>
-  </div>
-  <iron-collapse id="panel">
-    <!-- Saving state section -->
-    <div id="state-section">
-      <template is="dom-if" if="[[!savedStates.length]]">
-        <p id="no-bookmarks">
-            No bookmarks yet, upload a bookmarks file or add a new bookmark by clicking the "+" below.
-        </p>
-      </template>
-
-      <template is="dom-repeat" items="{{savedStates}}">
-        <div class="state-row">
-          <div class="state-radio">
-            <template is="dom-if" if="{{item.isSelected}}">
-              <paper-icon-button icon="radio-button-checked"></paper-icon-button>
-            </template>
-            <template is="dom-if" if="{{!item.isSelected}}">
-              <paper-icon-button
-                  icon="radio-button-unchecked"
-                  data-index$="{{index}}"
-                  on-tap="_radioButtonHandler"></paper-icon-button>
-            </template>
-          </div>
-          <div class="state-label">
-            <paper-textarea value="[[item.label]]"
-                class="state-label-input"
-                on-keyup="_labelChange"
-                data-index$="[[index]]"
-                autoresizing></paper-input>
-          </div>
-          <div class="state-clear">
-            <paper-icon-button
-                icon="clear"
-                data-index$="{{index}}"
-                on-tap="_clearButtonHandler"></paper-icon-button>
-          </div>
-        </div>
-      </template>
-
-      <div id="action-buttons-container">
-        <paper-icon-button
-            class="upload-download-icon-button"
-            icon="save"
-            title="Save bookmarks"
-            disabled="[[!hasStates]]"
-            on-tap="_downloadFile"></paper-icon-button>
-        <paper-icon-button
-            class="upload-download-icon-button"
-            icon="file-upload"
-            title="Load bookmarks"
-            on-tap="_uploadFile"></paper-icon-button>
-        <paper-icon-button
-            class="add-icon-button ink-fab"
-            icon="add"
-            title="Add bookmark"
-            on-tap="_addBookmark"></paper-icon-button>
-        <input type="file" id="state-file" name="state-file"/>
-      </div>
-    </div>
-  </iron-collapse>
-</div>
-
-</template>
-<script src="vz-projector-bookmark-panel.js"></script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-bookmark-panel.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-bookmark-panel.ts
deleted file mode 100644
index 53195fa47c05132102943d7052378ebf136973c8..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-bookmark-panel.ts
+++ /dev/null
@@ -1,283 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-import {State} from './data';
-import {DataProvider, EmbeddingInfo} from './data-provider';
-import * as logging from './logging';
-import {ProjectorEventContext} from './projectorEventContext';
-import {Projector} from './vz-projector';
-// tslint:disable-next-line:no-unused-variable
-import {PolymerElement, PolymerHTMLElement} from './vz-projector-util';
-
-// tslint:disable-next-line
-export let BookmarkPanelPolymer = PolymerElement({
-  is: 'vz-projector-bookmark-panel',
-  properties: {
-    savedStates: Object,
-    // Keep a separate polymer property because the savedStates doesn't change
-    // when adding and removing states.
-    hasStates: {type: Boolean, value: false},
-    selectedState: Number
-  }
-});
-
-export class BookmarkPanel extends BookmarkPanelPolymer {
-  private projector: Projector;
-
-  // A list containing all of the saved states.
-  private savedStates: State[];
-  private hasStates = false;
-  private selectedState: number;
-  private ignoreNextProjectionEvent: boolean;
-
-  private expandLessButton: HTMLButtonElement;
-  private expandMoreButton: HTMLButtonElement;
-
-  ready() {
-    this.savedStates = [];
-    this.setupUploadButton();
-    this.ignoreNextProjectionEvent = false;
-    this.expandLessButton =
-        this.querySelector('#expand-less') as HTMLButtonElement;
-    this.expandMoreButton =
-        this.querySelector('#expand-more') as HTMLButtonElement;
-  }
-
-  initialize(
-      projector: Projector, projectorEventContext: ProjectorEventContext) {
-    this.projector = projector;
-    projectorEventContext.registerProjectionChangedListener(() => {
-      if (this.ignoreNextProjectionEvent) {
-        this.ignoreNextProjectionEvent = false;
-      } else {
-        this.clearStateSelection();
-      }
-    });
-  }
-
-  setSelectedTensor(
-      run: string, tensorInfo: EmbeddingInfo, dataProvider: DataProvider) {
-    // Clear any existing bookmarks.
-    this.addStates(null);
-    if (tensorInfo && tensorInfo.bookmarksPath) {
-      // Get any bookmarks that may come when the projector starts up.
-      dataProvider.getBookmarks(run, tensorInfo.tensorName, bookmarks => {
-        this.addStates(bookmarks);
-        this._expandMore();
-      });
-    } else {
-      this._expandLess();
-    }
-  }
-
-  /** Handles a click on show bookmarks tray button. */
-  _expandMore() {
-    this.$.panel.show();
-    this.expandMoreButton.style.display = 'none';
-    this.expandLessButton.style.display = '';
-  }
-
-  /** Handles a click on hide bookmarks tray button. */
-  _expandLess() {
-    this.$.panel.hide();
-    this.expandMoreButton.style.display = '';
-    this.expandLessButton.style.display = 'none';
-  }
-
-  /** Handles a click on the add bookmark button. */
-  _addBookmark() {
-    let currentState = this.projector.getCurrentState();
-    currentState.label = 'State ' + this.savedStates.length;
-    currentState.isSelected = true;
-
-    this.selectedState = this.savedStates.length;
-
-    for (let i = 0; i < this.savedStates.length; i++) {
-      this.savedStates[i].isSelected = false;
-      // We have to call notifyPath so that polymer knows this element was
-      // updated.
-      this.notifyPath('savedStates.' + i + '.isSelected', false, false);
-    }
-
-    this.push('savedStates', currentState as any);
-    this.updateHasStates();
-  }
-
-  /** Handles a click on the download bookmarks button. */
-  _downloadFile() {
-    let serializedState = this.serializeAllSavedStates();
-    let blob = new Blob([serializedState], {type: 'text/plain'});
-    let textFile = window.URL.createObjectURL(blob);
-
-    // Force a download.
-    let a = document.createElement('a');
-    document.body.appendChild(a);
-    a.style.display = 'none';
-    a.href = textFile;
-    (a as any).download = 'state';
-    a.click();
-
-    document.body.removeChild(a);
-    window.URL.revokeObjectURL(textFile);
-  }
-
-  /** Handles a click on the upload bookmarks button. */
-  _uploadFile() {
-    let fileInput = this.dom.select('#state-file');
-    (fileInput.node() as HTMLInputElement).click();
-  }
-
-  private setupUploadButton() {
-    // Show and setup the load view button.
-    const fileInput = this.querySelector('#state-file') as HTMLInputElement;
-    fileInput.onchange = () => {
-      const file: File = fileInput.files[0];
-      // Clear out the value of the file chooser. This ensures that if the user
-      // selects the same file, we'll re-read it.
-      fileInput.value = '';
-      const fileReader = new FileReader();
-      fileReader.onload = (evt) => {
-        const str: string = fileReader.result;
-        const savedStates = JSON.parse(str);
-
-        // Verify the bookmarks match.
-        if (this.savedStatesValid(savedStates)) {
-          this.addStates(savedStates);
-          this.loadSavedState(0);
-        } else {
-          logging.setWarningMessage(
-              `Unable to load bookmarks: wrong dataset, expected dataset ` +
-              `with shape (${savedStates[0].dataSetDimensions}).`);
-        }
-      };
-      fileReader.readAsText(file);
-    };
-  }
-
-  addStates(savedStates?: State[]) {
-    if (savedStates == null) {
-      this.savedStates = [];
-    } else {
-      for (let i = 0; i < savedStates.length; i++) {
-        savedStates[i].isSelected = false;
-        this.push('savedStates', savedStates[i] as any);
-      }
-    }
-    this.updateHasStates();
-  }
-
-  /** Deselects any selected state selection. */
-  clearStateSelection() {
-    for (let i = 0; i < this.savedStates.length; i++) {
-      this.setSelectionState(i, false);
-    }
-  }
-
-  /** Handles a radio button click on a saved state. */
-  _radioButtonHandler(evt: Event) {
-    const index = this.getParentDataIndex(evt);
-    this.loadSavedState(index);
-    this.setSelectionState(index, true);
-  }
-
-  loadSavedState(index: number) {
-    for (let i = 0; i < this.savedStates.length; i++) {
-      if (this.savedStates[i].isSelected) {
-        this.setSelectionState(i, false);
-      } else if (index === i) {
-        this.setSelectionState(i, true);
-        this.ignoreNextProjectionEvent = true;
-        this.projector.loadState(this.savedStates[i]);
-      }
-    }
-  }
-
-  private setSelectionState(stateIndex: number, selected: boolean) {
-    this.savedStates[stateIndex].isSelected = selected;
-    const path = 'savedStates.' + stateIndex + '.isSelected';
-    this.notifyPath(path, selected, false);
-  }
-
-  /**
-   * Crawls up the DOM to find an ancestor with a data-index attribute. This is
-   * used to match events to their bookmark index.
-   */
-  private getParentDataIndex(evt: Event) {
-    for (let i = 0; i < (evt as any).path.length; i++) {
-      let dataIndex = (evt as any).path[i].getAttribute('data-index');
-      if (dataIndex != null) {
-        return +dataIndex;
-      }
-    }
-    return -1;
-  }
-
-  /** Handles a clear button click on a bookmark. */
-  _clearButtonHandler(evt: Event) {
-    let index = this.getParentDataIndex(evt);
-    this.splice('savedStates', index, 1);
-    this.updateHasStates();
-  }
-
-  /** Handles a label change event on a bookmark. */
-  _labelChange(evt: Event) {
-    let index = this.getParentDataIndex(evt);
-    this.savedStates[index].label = (evt.target as any).value;
-  }
-
-  /**
-   * Used to determine whether to select the radio button for a given bookmark.
-   */
-  _isSelectedState(index: number) {
-    return index === this.selectedState;
-  }
-  _isNotSelectedState(index: number) {
-    return index !== this.selectedState;
-  }
-
-  /**
-   * Gets all of the saved states as a serialized string.
-   */
-  serializeAllSavedStates(): string {
-    return JSON.stringify(this.savedStates);
-  }
-
-  /**
-   * Loads all of the serialized states and shows them in the list of
-   * viewable states.
-   */
-  loadSavedStates(serializedStates: string) {
-    this.savedStates = JSON.parse(serializedStates);
-    this.updateHasStates();
-  }
-
-  /**
-   * Updates the hasState polymer property.
-   */
-  private updateHasStates() {
-    this.hasStates = (this.savedStates.length !== 0);
-  }
-
-  /** Sanity checks a State array to ensure it matches the current dataset. */
-  private savedStatesValid(states: State[]): boolean {
-    for (let i = 0; i < states.length; i++) {
-      if (states[i].dataSetDimensions[0] !== this.projector.dataSet.dim[0] ||
-          states[i].dataSetDimensions[1] !== this.projector.dataSet.dim[1]) {
-        return false;
-      }
-    }
-    return true;
-  }
-}
-document.registerElement(BookmarkPanel.prototype.is, BookmarkPanel);
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-colab.html b/tensorflow/tensorboard/components/vz_projector/vz-projector-colab.html
deleted file mode 100644
index 2acb570b3c1f12e25b0ee64e24cf398a4ad2df21..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-colab.html
+++ /dev/null
@@ -1,32 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="vz-projector.html">
-<dom-module id="vz-projector-colab">
-<template>
-<vz-projector serving-mode="proto" data-proto="[[dataProto]]"></vz-projector>
-</template>
-<script>
-Polymer({
-  is: 'vz-projector-colab',
-  properties: {
-    dataProto: Object
-  }
-});
-</script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-dashboard.html b/tensorflow/tensorboard/components/vz_projector/vz-projector-dashboard.html
deleted file mode 100644
index 8223c503ecdd690481498cbf6a9faa37bccb7496..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-dashboard.html
+++ /dev/null
@@ -1,79 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
-<link rel="import" href="../tf-dashboard-common/tf-no-data-warning.html">
-<link rel="import" href="vz-projector.html">
-
-<dom-module id="vz-projector-dashboard">
-<template>
-  <tf-no-data-warning
-    data-type="projector"
-    show-warning="[[dataNotFound]]"
-  ></tf-no-data-warning>
-  <template is="dom-if" if="[[!dataNotFound]]">
-    <vz-projector
-      id="projector"
-      route-prefix="[[routePrefix]]"
-      serving-mode="server"
-      page-view-logging
-      event-logging
-    ></vz-projector>
-  </template>
-</template>
-<script>
-import {DashboardBehavior} from "../tf-dashboard-common/dashboard-behavior";
-
-Polymer({
-  is: 'vz-projector-dashboard',
-  factoryImpl: function(routePrefix) {
-    this.routePrefix = routePrefix;
-  },
-  properties: {
-    dataNotFound: Boolean,
-    routePrefix: String,
-    // Whether this dashboard is initialized. This dashboard should only be initialized once.
-    _initialized: Boolean,
-  },
-  behaviors: [
-    DashboardBehavior("embeddings"),
-  ],
-  reload: function() {
-    // Do not reload the embedding projector. Reloading could take a long time.
-  },
-  attached: function() {
-    if (this._initialized) {
-      return;
-    }
-    let xhr = new XMLHttpRequest();
-    xhr.open('GET', this.routePrefix + '/runs');
-    xhr.onload = () => {
-      // Set this to true so we only initialize once.
-      this._initialized = true;
-
-      let runs = JSON.parse(xhr.responseText);
-      this.set('dataNotFound', runs.length === 0);
-    };
-    xhr.onerror = () => {
-      this.set('dataNotFound', false);
-    };
-    xhr.send();
-  },
-});
-</script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.html b/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.html
deleted file mode 100644
index d8dfd6e978c489ff78f5ad15b44808e8da679f03..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.html
+++ /dev/null
@@ -1,402 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-button/paper-button.html">
-<link rel="import" href="../paper-dropdown-menu/paper-dropdown-menu.html">
-<link rel="import" href="../paper-input/paper-input.html">
-<link rel="import" href="../paper-input/paper-textarea.html">
-<link rel="import" href="../paper-item/paper-item.html">
-<link rel="import" href="../paper-listbox/paper-listbox.html">
-<link rel="import" href="../paper-icon-button/paper-icon-button.html">
-<link rel="import" href="../paper-checkbox/paper-checkbox.html">
-<link rel="import" href="../paper-dialog/paper-dialog.html">
-<link rel="import" href="../paper-dialog-scrollable/paper-dialog-scrollable.html">
-<link rel="import" href="../paper-tooltip/paper-tooltip.html">
-<link rel="import" href="../tf-imports/d3.html">
-<link rel="import" href="vz-projector-legend.html">
-<link rel="import" href="styles.html">
-<link rel="import" href="bundle.html">
-
-<dom-module id="vz-projector-data-panel">
-<template>
-<style include="vz-projector-styles"></style>
-<style>
-.container {
-  padding: 10px 20px 20px 20px;
-}
-
-input[type=file] {
-  display: none;
-}
-
-.file-name {
-  margin-right: 10px;
-}
-
-.dirs {
-  color: rgba(0, 0, 0, 0.7);
-  font-size: 12px;
-}
-
-.dirs table tr {
-  vertical-align: top;
-}
-
-.dirs table tr td {
-  padding-bottom: 10px;
-}
-
-paper-item {
-  --paper-item-disabled: {
-    border-bottom: 1px solid black;
-    justify-content: center;
-    font-size: 12px;
-    line-height: normal;
-    min-height: 0px;
-  };
-}
-
-.item-details {
-  margin-left: 5px;
-  color: gray;
-  font-size: 12px;
-}
-
-paper-dropdown-menu {
-  width: 100%;
-}
-
-paper-dropdown-menu paper-item {
-  justify-content: space-between;
-}
-
-.title {
-  align-items: center;
-  border-bottom: 1px solid rgba(0, 0, 0, 0.1);
-  color: black;
-  display: flex;
-  font-weight: 500;
-  height: 59px;
-  padding-left: 20px;
-}
-
-#normalize-data-checkbox {
-  margin: 10px 0;
-}
-
-#projector-config-template {
-  --paper-input-container-input: {
-    line-height: 13px;
-    font-family: monospace;
-    font-size: 12px;
-  };
-}
-
-#generate-share-url {
-  padding: 16px;
-  margin-left: 24px;
-}
-
-#projector-share-button-container {
-  margin: 10px 0;
-}
-
-.config-checkbox {
-  display: inline-block;
-  font-size: 11px;
-  margin-left: 10px;
-}
-
-.projector-config-options {
-  margin-top: 12px;
-}
-
-.projector-config-dialog-container {
-  padding: 24px;
-}
-
-.code {
-  background-color: #f7f7f7;
-  display: table;
-  font-family: monospace;
-  margin-top: 7px;
-  padding: 15px;
-}
-
-.delimiter {
-  color: #B71C1C;
-}
-
-.upload-step {
-  display: flex;
-  justify-content: space-between;
-  margin-bottom: 6px;
-}
-
-.upload-step paper-button {
-  margin-left: 30px;
-}
-
-.step-label {
-  color: rgb(38, 180, 226);
-}
-
-.scrollable-container {
-  margin-top: 0;
-  min-width: 400px;
-}
-
-#projectorConfigDialog p {
-  margin: 8px 0 8px;
-}
-
-.data-step {
-  margin-top: 40px;
-}
-
-.data-step-contents {
-  display: table;
-  width: 100%;
-}
-
-.data-step-contents-contents {
-  display: table-cell;
-  margin-top: 6px;
-}
-
-.data-step-contents-upload {
-  display: table-cell;
-  text-align: right;
-  vertical-align: bottom;
-}
-
-#demo-data-buttons-container {
-  display: none;
-}
-
-.colorby-container {
-  margin-bottom: 10px;
-}
-</style>
-<div class="title">DATA</div>
-<div class="container">
-  <!-- List of runs -->
-  <template is="dom-if" if="[[_hasChoices(runNames)]]">
-    <paper-dropdown-menu no-animations label="[[_getNumRunsLabel(runNames)]] found">
-      <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{selectedRun}}">
-        <template is="dom-repeat" items="[[runNames]]">
-          <paper-item value="[[item]]" label="[[item]]">
-            [[item]]
-          </paper-item>
-        </template>
-      </paper-listbox>
-    </paper-dropdown-menu>
-  </template>
-
-  <template is="dom-if" if="[[tensorNames]]">
-    <!-- List of tensors in checkpoint -->
-    <paper-dropdown-menu no-animations label="[[_getNumTensorsLabel(tensorNames)]] found">
-      <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{selectedTensor}}">
-        <template is="dom-repeat" items="[[tensorNames]]">
-          <paper-item value="[[item.name]]" label="[[item.name]]">
-            [[item.name]]
-            <span class="item-details">
-              [[item.shape.0]]x[[item.shape.1]]
-            </span>
-          </paper-item>
-        </template>
-      </paper-listbox>
-    </paper-dropdown-menu>
-  </template>
-  <!-- Label by -->
-  <template is="dom-if" if="[[_hasChoices(labelOptions)]]">
-    <paper-dropdown-menu no-animations label="Label by">
-      <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{selectedLabelOption}}">
-        <template is="dom-repeat" items="[[labelOptions]]">
-          <paper-item value="[[item]]" label="[[item]]">
-            [[item]]
-          </paper-item>
-        </template>
-      </paper-listbox>
-    </paper-dropdown-menu>
-  </template>
-
-  <!-- Color by -->
-  <div hidden$="[[!_hasChoices(colorOptions)]]" class="colorby-container">
-    <paper-dropdown-menu id="colorby" no-animations label="Color by">
-      <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{selectedColorOptionName}}">
-        <template is="dom-repeat" items="[[colorOptions]]">
-          <paper-item class$="[[getSeparatorClass(item.isSeparator)]]" value="[[item.name]]" label="[[item.name]]" disabled="[[item.isSeparator]]">
-            [[item.name]]
-            <span class="item-details">[[item.desc]]</span>
-          </paper-item>
-        </template>
-      </paper-listbox>
-    </paper-dropdown-menu>
-    <div hidden$="[[!showForceCategoricalColorsCheckbox]]">
-      <paper-checkbox id="force-categorical-checkbox"></paper-checkbox>
-      Use categorical coloring
-      <paper-icon-button icon="help" class="help-icon"></paper-icon-button>
-      <paper-tooltip position="bottom" animation-delay="0" fit-to-visible-bounds>
-        For metadata fields that have many unique values we use a gradient color map
-        by default. This checkbox allows you to force categorical coloring by a given
-        metadata field.
-      </paper-tooltip>
-    </div>
-    <template dom-if="[[colorLegendRenderInfo]]">
-      <vz-projector-legend render-info="[[colorLegendRenderInfo]]"></vz-projector-legend>
-    </template>
-  </div>
-  <paper-checkbox id="normalize-data-checkbox" checked="{{normalizeData}}">
-    Sphereize data
-    <paper-icon-button icon="help" class="help-icon"></paper-icon-button>
-    <paper-tooltip position="bottom" animation-delay="0" fit-to-visible-bounds>
-      The data is normalized by shifting each point by the centroid and making
-      it unit norm.
-    </paper-tooltip>
-  </paper-checkbox>
-  <p id="demo-data-buttons-container">
-    <span>
-      <paper-tooltip position="bottom" animation-delay="0" fit-to-visible-bounds>
-        Load data from your computer
-      </paper-tooltip>
-      <paper-button id="upload" class="ink-button" onclick="dataDialog.open()">Load data</paper-button>
-    </span>
-    <span id="publish-container">
-      <paper-tooltip position="bottom" animation-delay="0" fit-to-visible-bounds>
-        Publish your embedding visualization and data
-      </paper-tooltip>
-      <paper-button id="host-embedding" class="ink-button" onclick="projectorConfigDialog.open()">Publish</paper-button>
-    </span>
-  </p>
-  <div>
-    <paper-dialog id="dataDialog" with-backdrop>
-      <h2>Load data from your computer</h2>
-      <paper-dialog-scrollable class="scrollable-container">
-        <div class="data-step" id="upload-tensors-step-container">
-          <div class="upload-step">
-            <div>
-                <b><span class="step-label">Step 1:</span> Load a TSV file of vectors.</b>
-            </div>
-          </div>
-          <div class="data-step-contents">
-            <div class="data-step-contents-contents">
-              Example of 3 vectors with dimension 4:
-              <div class="code">
-                0.1<span class="delimiter">\t</span>0.2<span class="delimiter">\t</span>0.5<span class="delimiter">\t</span>0.9<br/>
-                0.2<span class="delimiter">\t</span>0.1<span class="delimiter">\t</span>5.0<span class="delimiter">\t</span>0.2<br/>
-                0.4<span class="delimiter">\t</span>0.1<span class="delimiter">\t</span>7.0<span class="delimiter">\t</span>0.8
-              </div>
-            </div>
-            <div class="data-step-contents-upload">
-              <paper-button id="upload-tensors" title="Choose a TSV tensor file">Choose file</paper-button>
-              <input type="file" id="file" name="file"/>
-            </div>
-          </div>
-        </div>
-        <div class="data-step">
-          <div class="upload-step">
-            <div>
-                <span class="step-label" id="upload-metadata-label"><b>Step 2</b> (optional):</span> <b>Load a TSV file of metadata.</b>
-            </div>
-          </div>
-          <div class="data-step-contents">
-            <div class="data-step-contents-contents">
-              Example of 3 data points and 2 columns.<br/>
-              <i>Note: If there is more than one column, the first row will be parsed as column labels.</i>
-              <div class="code">
-                <b>Pokémon<span class="delimiter">\t</span>Species</b><br/>
-                Wartortle<span class="delimiter">\t</span>Turtle<br/>
-                Venusaur<span class="delimiter">\t</span>Seed<br/>
-                Charmeleon<span class="delimiter">\t</span>Flame
-              </div>
-            </div>
-            <div class="data-step-contents-upload">
-              <paper-button id="upload-metadata" title="Choose a TSV metadata file" class="ink-button">Choose file</paper-button>
-              <input type="file" id="file-metadata" name="file-metadata"/>
-            </div>
-          </div>
-        </div>
-      </paper-dialog-scrollable>
-      <div class="dismiss-dialog-note">Click outside to dismiss.</div>
-    </paper-dialog>
-    <paper-dialog id="projectorConfigDialog" with-backdrop>
-      <h2>Publish your embedding visualization and data</h2>
-      <paper-dialog-scrollable class="scrollable-container">
-        <div>
-          <p>
-            If you'd like to share your visualization with the world, follow these simple steps.
-            See <a target=_blank href="https://www.tensorflow.org/get_started/embedding_viz">this tutorial</a> for more.
-          </p>
-          <h4><span class="step-label">Step 1:</span> Make data public</h4>
-          <p>
-            Host tensors, metadata, sprite image, and bookmarks TSV files <i>publicly</i> on the web.
-          </p>
-          <p>
-            One option is using a <a target=_blank href="https://gist.github.com/">github gist</a>.
-            If you choose this approach, make sure to link directly to the raw file.
-          </p>
-        </div>
-        <div>
-          <h4><span class="step-label">Step 2:</span> Projector config</h4>
-          <div class="projector-config-options">
-            <i>Optional:</i>
-            <div class="config-checkbox">
-              <paper-checkbox id="config-metadata-checkbox" checked>Metadata</paper-checkbox>
-            </div>
-            <div class="config-checkbox">
-              <paper-checkbox id="config-sprite-checkbox">Sprite</paper-checkbox>
-            </div>
-            <div class="config-checkbox">
-              <paper-checkbox id="config-bookmarks-checkbox">Bookmarks</paper-checkbox>
-            </div>
-          </div>
-        </div>
-        <paper-textarea id="projector-config-template" label="template_projector_config.json"></paper-textarea>
-        <div>
-          <h4><span class="step-label">Step 3:</span> Host projector config</h4>
-          After you have hosted the projector config JSON file you built above, paste the URL to the config below.
-        </div>
-        <paper-input id="projector-config-url" label="Path to projector config"></paper-input>
-        <paper-input id="projector-share-url" label="Your shareable URL" readonly></paper-input>
-        <div id="projector-share-button-container">
-          <a target=_blank id="projector-share-url-link">
-            <paper-button title="Test your shareable URL" class="ink-button">Test your shareable URL</paper-button>
-          </a>
-        </div>
-      </paper-dialog-scrollable>
-      <div class="dismiss-dialog-note">Click outside to dismiss.</div>
-    </paper-dialog>
-  </div>
-  <div class="dirs">
-    <table>
-      <tr>
-        <td>Checkpoint:</td>
-        <td><span id="checkpoint-file"></span></td>
-      </tr>
-      <tr>
-        <td>Metadata:</td>
-        <td><span id="metadata-file"></span></td>
-      </tr>
-    </table>
-  </div>
-</div>
-<!-- Closing global template -->
-</template>
-<script src="vz-projector-data-panel.js"></script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts
deleted file mode 100644
index a9b6f6c5a0679960cdf45c5c4f9aecc921cf08c5..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts
+++ /dev/null
@@ -1,496 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {ColorOption, ColumnStats, SpriteAndMetadataInfo} from './data';
-import {DataProvider, EmbeddingInfo, parseRawMetadata, parseRawTensors, ProjectorConfig} from './data-provider';
-import * as util from './util';
-import {Projector} from './vz-projector';
-import {ColorLegendRenderInfo, ColorLegendThreshold} from './vz-projector-legend';
-// tslint:disable-next-line:no-unused-variable
-import {PolymerElement, PolymerHTMLElement} from './vz-projector-util';
-
-export let DataPanelPolymer = PolymerElement({
-  is: 'vz-projector-data-panel',
-  properties: {
-    selectedTensor: {type: String, observer: '_selectedTensorChanged'},
-    selectedRun: {type: String, observer: '_selectedRunChanged'},
-    selectedColorOptionName: {
-      type: String,
-      notify: true,
-      observer: '_selectedColorOptionNameChanged'
-    },
-    selectedLabelOption:
-        {type: String, notify: true, observer: '_selectedLabelOptionChanged'},
-    normalizeData: Boolean,
-    showForceCategoricalColorsCheckbox: Boolean
-  }
-});
-
-export class DataPanel extends DataPanelPolymer {
-  selectedLabelOption: string;
-  selectedColorOptionName: string;
-  showForceCategoricalColorsCheckbox: boolean;
-
-  private normalizeData: boolean;
-  private labelOptions: string[];
-  private colorOptions: ColorOption[];
-  forceCategoricalColoring: boolean = false;
-
-  private selectedTensor: string;
-  private selectedRun: string;
-  private dataProvider: DataProvider;
-  private tensorNames: {name: string, shape: number[]}[];
-  private runNames: string[];
-  private projector: Projector;
-  private projectorConfig: ProjectorConfig;
-  private colorLegendRenderInfo: ColorLegendRenderInfo;
-  private spriteAndMetadata: SpriteAndMetadataInfo;
-  private metadataFile: string;
-
-  ready() {
-    this.normalizeData = true;
-  }
-
-  initialize(projector: Projector, dp: DataProvider) {
-    this.projector = projector;
-    this.dataProvider = dp;
-    this.setupUploadButtons();
-
-    // Tell the projector whenever the data normalization changes.
-    // Unknown why, but the polymer checkbox button stops working as soon as
-    // you do d3.select() on it.
-    this.querySelector('#normalize-data-checkbox')
-        .addEventListener('change', () => {
-          this.projector.setNormalizeData(this.normalizeData);
-        });
-
-    let forceCategoricalColoringCheckbox =
-        this.querySelector('#force-categorical-checkbox');
-    forceCategoricalColoringCheckbox.addEventListener('change', () => {
-      this.setForceCategoricalColoring(
-          (forceCategoricalColoringCheckbox as HTMLInputElement).checked);
-    });
-
-    // Get all the runs.
-    this.dataProvider.retrieveRuns(runs => {
-      this.runNames = runs;
-      // Choose the first run by default.
-      if (this.runNames.length > 0) {
-        this.selectedRun = runs[0];
-      }
-    });
-  }
-
-  setForceCategoricalColoring(forceCategoricalColoring: boolean) {
-    this.forceCategoricalColoring = forceCategoricalColoring;
-    (this.querySelector('#force-categorical-checkbox') as HTMLInputElement)
-        .checked = this.forceCategoricalColoring;
-
-    this.updateMetadataUI(this.spriteAndMetadata.stats, this.metadataFile);
-
-    // The selected color option name doesn't change when we switch to using
-    // categorical coloring for stats with too many unique values, so we
-    // manually call this polymer observer so that we update the UI.
-    this._selectedColorOptionNameChanged();
-  }
-
-  getSeparatorClass(isSeparator: boolean): string {
-    return isSeparator ? 'separator' : null;
-  }
-
-  metadataChanged(
-      spriteAndMetadata: SpriteAndMetadataInfo, metadataFile: string) {
-    this.spriteAndMetadata = spriteAndMetadata;
-    this.metadataFile = metadataFile;
-
-    this.updateMetadataUI(this.spriteAndMetadata.stats, this.metadataFile);
-    this.selectedColorOptionName = this.colorOptions[0].name;
-  }
-
-  private addWordBreaks(longString: string): string {
-    if (longString == null) {
-      return '';
-    }
-    return longString.replace(/([\/=-_,])/g, '$1<wbr>');
-  }
-
-  private updateMetadataUI(columnStats: ColumnStats[], metadataFile: string) {
-    const metadataFileElement =
-        this.querySelector('#metadata-file') as HTMLSpanElement;
-    metadataFileElement.innerHTML = this.addWordBreaks(metadataFile);
-    metadataFileElement.title = metadataFile;
-
-    // Label by options.
-    let labelIndex = -1;
-    this.labelOptions = columnStats.map((stats, i) => {
-      // Make the default label by the first non-numeric column.
-      if (!stats.isNumeric && labelIndex === -1) {
-        labelIndex = i;
-      }
-      return stats.name;
-    });
-    this.selectedLabelOption = this.labelOptions[Math.max(0, labelIndex)];
-
-    // Color by options.
-    const standardColorOption: ColorOption[] = [
-      {name: 'No color map'},
-      // TODO(smilkov): Implement this.
-      // {name: 'Distance of neighbors',
-      //    desc: 'How far is each point from its neighbors'}
-    ];
-    const metadataColorOption: ColorOption[] =
-        columnStats
-            .filter(stats => {
-              return !stats.tooManyUniqueValues || stats.isNumeric;
-            })
-            .map(stats => {
-              let map;
-              let items: {label: string, count: number}[];
-              let thresholds: ColorLegendThreshold[];
-              let isCategorical =
-                  this.forceCategoricalColoring || !stats.tooManyUniqueValues;
-              if (isCategorical) {
-                const scale = d3.scaleOrdinal(d3.schemeCategory20);
-                let range = scale.range();
-                // Re-order the range.
-                let newRange = range.map((color, i) => {
-                  let index = (i * 3) % range.length;
-                  return range[index];
-                });
-                items = stats.uniqueEntries;
-                scale.range(newRange).domain(items.map(x => x.label));
-                map = scale;
-              } else {
-                thresholds = [
-                  {color: '#ffffdd', value: stats.min},
-                  {color: '#1f2d86', value: stats.max}
-                ];
-                map = d3.scaleLinear<string, string>()
-                          .domain(thresholds.map(t => t.value))
-                          .range(thresholds.map(t => t.color));
-              }
-              let desc = !isCategorical ? 'gradient' :
-                                          stats.uniqueEntries.length +
-                      ((stats.uniqueEntries.length > 20) ? ' non-unique' : '') +
-                      ' colors';
-              return {
-                name: stats.name,
-                desc: desc,
-                map: map,
-                items: items,
-                thresholds: thresholds,
-                tooManyUniqueValues: stats.tooManyUniqueValues
-              };
-            });
-
-    if (metadataColorOption.length > 0) {
-      // Add a separator line between built-in color maps
-      // and those based on metadata columns.
-      standardColorOption.push({name: 'Metadata', isSeparator: true});
-    }
-    this.colorOptions = standardColorOption.concat(metadataColorOption);
-  }
-
-  setNormalizeData(normalizeData: boolean) {
-    this.normalizeData = normalizeData;
-  }
-
-  _selectedTensorChanged() {
-    this.projector.updateDataSet(null, null, null);
-    if (this.selectedTensor == null) {
-      return;
-    }
-    this.dataProvider.retrieveTensor(
-        this.selectedRun, this.selectedTensor, ds => {
-          let metadataFile =
-              this.getEmbeddingInfoByName(this.selectedTensor).metadataPath;
-          this.dataProvider.retrieveSpriteAndMetadata(
-              this.selectedRun, this.selectedTensor, metadata => {
-                this.projector.updateDataSet(ds, metadata, metadataFile);
-              });
-        });
-    this.projector.setSelectedTensor(
-        this.selectedRun, this.getEmbeddingInfoByName(this.selectedTensor));
-  }
-
-  _selectedRunChanged() {
-    this.dataProvider.retrieveProjectorConfig(this.selectedRun, info => {
-      this.projectorConfig = info;
-      let names =
-          this.projectorConfig.embeddings.map(e => e.tensorName)
-              .filter(name => {
-                let shape = this.getEmbeddingInfoByName(name).tensorShape;
-                return shape.length === 2 && shape[0] > 1 && shape[1] > 1;
-              })
-              .sort((a, b) => {
-                let embA = this.getEmbeddingInfoByName(a);
-                let embB = this.getEmbeddingInfoByName(b);
-
-                // Prefer tensors with metadata.
-                if (util.xor(!!embA.metadataPath, !!embB.metadataPath)) {
-                  return embA.metadataPath ? -1 : 1;
-                }
-
-                // Prefer non-generated tensors.
-                let isGenA = util.tensorIsGenerated(a);
-                let isGenB = util.tensorIsGenerated(b);
-                if (util.xor(isGenA, isGenB)) {
-                  return isGenB ? -1 : 1;
-                }
-
-                // Prefer bigger tensors.
-                let sizeA = embA.tensorShape[0];
-                let sizeB = embB.tensorShape[0];
-                if (sizeA !== sizeB) {
-                  return sizeB - sizeA;
-                }
-
-                // Sort alphabetically by tensor name.
-                return a <= b ? -1 : 1;
-              });
-      this.tensorNames = names.map(name => {
-        return {name, shape: this.getEmbeddingInfoByName(name).tensorShape};
-      });
-      const wordBreakablePath =
-          this.addWordBreaks(this.projectorConfig.modelCheckpointPath);
-      const checkpointFile =
-          this.querySelector('#checkpoint-file') as HTMLSpanElement;
-      checkpointFile.innerHTML = wordBreakablePath;
-      checkpointFile.title = this.projectorConfig.modelCheckpointPath;
-
-      // If in demo mode, let the order decide which tensor to load by default.
-      const defaultTensor = this.projector.servingMode === 'demo' ?
-          this.projectorConfig.embeddings[0].tensorName :
-          names[0];
-      if (this.selectedTensor === defaultTensor) {
-        // Explicitly call the observer. Polymer won't call it if the previous
-        // string matches the current string.
-        this._selectedTensorChanged();
-      } else {
-        this.selectedTensor = defaultTensor;
-      }
-    });
-  }
-
-  _selectedLabelOptionChanged() {
-    this.projector.setSelectedLabelOption(this.selectedLabelOption);
-  }
-
-  _selectedColorOptionNameChanged() {
-    let colorOption: ColorOption;
-    for (let i = 0; i < this.colorOptions.length; i++) {
-      if (this.colorOptions[i].name === this.selectedColorOptionName) {
-        colorOption = this.colorOptions[i];
-        break;
-      }
-    }
-    if (!colorOption) {
-      return;
-    }
-
-    this.showForceCategoricalColorsCheckbox = !!colorOption.tooManyUniqueValues;
-
-    if (colorOption.map == null) {
-      this.colorLegendRenderInfo = null;
-    } else if (colorOption.items) {
-      let items = colorOption.items.map(item => {
-        return {
-          color: colorOption.map(item.label),
-          label: item.label,
-          count: item.count
-        };
-      });
-      this.colorLegendRenderInfo = {items, thresholds: null};
-    } else {
-      this.colorLegendRenderInfo = {
-        items: null,
-        thresholds: colorOption.thresholds
-      };
-    }
-    this.projector.setSelectedColorOption(colorOption);
-  }
-
-  private tensorWasReadFromFile(rawContents: ArrayBuffer, fileName: string) {
-    parseRawTensors(rawContents, ds => {
-      const checkpointFile =
-          this.querySelector('#checkpoint-file') as HTMLSpanElement;
-      checkpointFile.innerText = fileName;
-      checkpointFile.title = fileName;
-      this.projector.updateDataSet(ds);
-    });
-  }
-
-  private metadataWasReadFromFile(rawContents: ArrayBuffer, fileName: string) {
-    parseRawMetadata(rawContents, metadata => {
-      this.projector.updateDataSet(this.projector.dataSet, metadata, fileName);
-    });
-  }
-
-  private getEmbeddingInfoByName(tensorName: string): EmbeddingInfo {
-    for (let i = 0; i < this.projectorConfig.embeddings.length; i++) {
-      const e = this.projectorConfig.embeddings[i];
-      if (e.tensorName === tensorName) {
-        return e;
-      }
-    }
-  }
-
-  private setupUploadButtons() {
-    // Show and setup the upload button.
-    const fileInput = this.querySelector('#file') as HTMLInputElement;
-    fileInput.onchange = () => {
-      const file: File = fileInput.files[0];
-      // Clear out the value of the file chooser. This ensures that if the user
-      // selects the same file, we'll re-read it.
-      fileInput.value = '';
-      const fileReader = new FileReader();
-      fileReader.onload = evt => {
-        const content: ArrayBuffer = fileReader.result;
-        this.tensorWasReadFromFile(content, file.name);
-      };
-      fileReader.readAsArrayBuffer(file);
-    };
-
-    const uploadButton =
-        this.querySelector('#upload-tensors') as HTMLButtonElement;
-    uploadButton.onclick = () => {
-      fileInput.click();
-    };
-
-    // Show and setup the upload metadata button.
-    const fileMetadataInput =
-        this.querySelector('#file-metadata') as HTMLInputElement;
-    fileMetadataInput.onchange = () => {
-      const file: File = fileMetadataInput.files[0];
-      // Clear out the value of the file chooser. This ensures that if the user
-      // selects the same file, we'll re-read it.
-      fileMetadataInput.value = '';
-      const fileReader = new FileReader();
-      fileReader.onload = evt => {
-        const contents: ArrayBuffer = fileReader.result;
-        this.metadataWasReadFromFile(contents, file.name);
-      };
-      fileReader.readAsArrayBuffer(file);
-    };
-
-    const uploadMetadataButton =
-        this.querySelector('#upload-metadata') as HTMLButtonElement;
-    uploadMetadataButton.onclick = () => {
-      fileMetadataInput.click();
-    };
-
-    if (this.projector.servingMode !== 'demo') {
-      (this.$$('#publish-container') as HTMLElement).style.display = 'none';
-      (this.$$('#upload-tensors-step-container') as HTMLElement).style.display =
-          'none';
-      (this.$$('#upload-metadata-label') as HTMLElement).style.display = 'none';
-    }
-
-    (this.$$('#demo-data-buttons-container') as HTMLElement).style.display =
-        'block';
-
-    // Fill out the projector config.
-    const projectorConfigTemplate =
-        this.$$('#projector-config-template') as HTMLTextAreaElement;
-    const projectorConfigTemplateJson: ProjectorConfig = {
-      embeddings: [{
-        tensorName: 'My tensor',
-        tensorShape: [1000, 50],
-        tensorPath: 'https://raw.githubusercontent.com/.../tensors.tsv',
-        metadataPath:
-            'https://raw.githubusercontent.com/.../optional.metadata.tsv',
-      }],
-    };
-    this.setProjectorConfigTemplateJson(
-        projectorConfigTemplate, projectorConfigTemplateJson);
-
-    // Set up optional field checkboxes.
-    const spriteFieldCheckbox =
-        this.$$('#config-sprite-checkbox') as HTMLInputElement;
-    spriteFieldCheckbox.onchange = () => {
-      if ((spriteFieldCheckbox as any).checked) {
-        projectorConfigTemplateJson.embeddings[0].sprite = {
-          imagePath: 'https://github.com/.../optional.sprite.png',
-          singleImageDim: [32, 32]
-        };
-      } else {
-        delete projectorConfigTemplateJson.embeddings[0].sprite;
-      }
-      this.setProjectorConfigTemplateJson(
-          projectorConfigTemplate, projectorConfigTemplateJson);
-    };
-    const bookmarksFieldCheckbox =
-        this.$$('#config-bookmarks-checkbox') as HTMLInputElement;
-    bookmarksFieldCheckbox.onchange = () => {
-      if ((bookmarksFieldCheckbox as any).checked) {
-        projectorConfigTemplateJson.embeddings[0].bookmarksPath =
-            'https://raw.githubusercontent.com/.../bookmarks.txt';
-      } else {
-        delete projectorConfigTemplateJson.embeddings[0].bookmarksPath;
-      }
-      this.setProjectorConfigTemplateJson(
-          projectorConfigTemplate, projectorConfigTemplateJson);
-    };
-    const metadataFieldCheckbox =
-        this.$$('#config-metadata-checkbox') as HTMLInputElement;
-    metadataFieldCheckbox.onchange = () => {
-      if ((metadataFieldCheckbox as HTMLInputElement).checked) {
-        projectorConfigTemplateJson.embeddings[0].metadataPath =
-            'https://raw.githubusercontent.com/.../optional.metadata.tsv';
-      } else {
-        delete projectorConfigTemplateJson.embeddings[0].metadataPath;
-      }
-      this.setProjectorConfigTemplateJson(
-          projectorConfigTemplate, projectorConfigTemplateJson);
-    };
-
-    // Update the link and the readonly shareable URL.
-    const projectorConfigUrlInput =
-        this.$$('#projector-config-url') as HTMLInputElement;
-    const projectorConfigDemoUrlInput = this.$$('#projector-share-url');
-    const projectorConfigDemoUrlLink = this.$$('#projector-share-url-link');
-    projectorConfigUrlInput.onchange = () => {
-      let projectorDemoUrl = location.protocol + '//' + location.host +
-          location.pathname +
-          '?config=' + (projectorConfigUrlInput as HTMLInputElement).value;
-
-      (projectorConfigDemoUrlInput as HTMLInputElement).value =
-          projectorDemoUrl;
-      (projectorConfigDemoUrlLink as HTMLLinkElement).href = projectorDemoUrl;
-    };
-  }
-
-  private setProjectorConfigTemplateJson(
-      projectorConfigTemplate: HTMLTextAreaElement, config: ProjectorConfig) {
-    projectorConfigTemplate.value =
-        JSON.stringify(config, null, /** replacer */ 2 /** white space */);
-  }
-
-  _getNumTensorsLabel(): string {
-    return this.tensorNames.length === 1 ? '1 tensor' :
-                                           this.tensorNames.length + ' tensors';
-  }
-
-  _getNumRunsLabel(): string {
-    return this.runNames.length === 1 ? '1 run' :
-                                        this.runNames.length + ' runs';
-  }
-
-  _hasChoices(choices: any[]): boolean {
-    return choices.length > 1;
-  }
-}
-
-document.registerElement(DataPanel.prototype.is, DataPanel);
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-input.html b/tensorflow/tensorboard/components/vz_projector/vz-projector-input.html
deleted file mode 100644
index 0d7bf7cdda653b7b6dc17210514f8a76fa3ab69a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-input.html
+++ /dev/null
@@ -1,66 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-input/paper-input.html">
-<link rel="import" href="../paper-button/paper-button.html">
-<link rel="import" href="../paper-tooltip/paper-tooltip.html">
-<link rel="import" href="styles.html">
-<link rel="import" href="bundle.html">
-
-<dom-module id="vz-projector-input">
-<template>
-<style include="vz-projector-styles"></style>
-<style>
-.info {
-  color: rgba(0, 0, 0, 0.5);
-  display: block;
-  font-size: 11px;
-}
-
-.toggle {
-  font-size: 12px;
-  height: 21px;
-  margin: 0px;
-  min-width: 0px;
-  min-height: 0px;
-  padding: 0;
-  width: 17px;
-}
-
-.toggle[active] {
-  background-color: #880E4F;
-  color: white;
-}
-</style>
-
-<paper-input label="[[label]]">
-  <div class="slash" prefix>/</div>
-  <div class="slash" suffix>/</div>
-  <div suffix>
-    <paper-button id="regex" toggles class="toggle">.*</paper-button>
-  </div>
-</paper-input>
-<paper-tooltip for="regex" position="bottom" animation-delay="0" fit-to-visible-bounds>
-  Enable/disable regex mode.
-</paper-tooltip>
-<span class="info">[[message]]</span>
-
-<!-- Closing global template -->
-</template>
-<script src="vz-projector-input.js"></script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-input.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-input.ts
deleted file mode 100644
index e11346d327ff7bc12e5b3c84f32c15a86cfec975..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-input.ts
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// tslint:disable-next-line:no-unused-variable
-import {PolymerElement, PolymerHTMLElement} from './vz-projector-util';
-
-// tslint:disable-next-line
-export let PolymerClass = PolymerElement(
-    {is: 'vz-projector-input', properties: {label: String, message: String}});
-
-export interface InputChangedListener {
-  (value: string, inRegexMode: boolean): void;
-}
-
-/** Input control with custom capabilities (e.g. regex). */
-export class ProjectorInput extends PolymerClass {
-  private textChangedListeners: InputChangedListener[];
-  private paperInput: HTMLInputElement;
-  private inRegexModeButton: HTMLButtonElement;
-  private inRegexMode: boolean;
-
-  /** Message that will be displayed at the bottom of the input control. */
-  message: string;
-
-  /** Subscribe to be called everytime the input changes. */
-  registerInputChangedListener(listener: InputChangedListener) {
-    this.textChangedListeners.push(listener);
-  }
-
-  ready() {
-    this.inRegexMode = false;
-    this.textChangedListeners = [];
-    this.paperInput = this.querySelector('paper-input') as HTMLInputElement;
-    this.inRegexModeButton =
-        this.querySelector('paper-button') as HTMLButtonElement;
-    this.paperInput.setAttribute('error-message', 'Invalid regex');
-
-    this.paperInput.addEventListener('input', () => {
-      this.onTextChanged();
-    });
-
-    this.paperInput.addEventListener('keydown', event => {
-      event.stopPropagation();
-    });
-
-    this.inRegexModeButton.addEventListener(
-        'click', () => this.onClickRegexModeButton());
-    this.updateRegexModeDisplaySlashes();
-    this.onTextChanged();
-  }
-
-  private onClickRegexModeButton() {
-    this.inRegexMode = (this.inRegexModeButton as any).active;
-    this.updateRegexModeDisplaySlashes();
-    this.onTextChanged();
-  }
-
-  private notifyInputChanged(value: string, inRegexMode: boolean) {
-    this.textChangedListeners.forEach(l => l(value, inRegexMode));
-  }
-
-  private onTextChanged() {
-    try {
-      if (this.inRegexMode) {
-        new RegExp(this.paperInput.value);
-      }
-    } catch (invalidRegexException) {
-      this.paperInput.setAttribute('invalid', 'true');
-      this.message = '';
-      this.notifyInputChanged(null, true);
-      return;
-    }
-    this.paperInput.removeAttribute('invalid');
-    this.notifyInputChanged(this.paperInput.value, this.inRegexMode);
-  }
-
-  private updateRegexModeDisplaySlashes() {
-    const slashes = this.paperInput.querySelectorAll('.slash');
-    const display = this.inRegexMode ? '' : 'none';
-
-    for (let i = 0; i < slashes.length; i++) {
-      (slashes[i] as HTMLDivElement).style.display = display;
-    }
-  }
-
-  getValue(): string {
-    return this.paperInput.value;
-  }
-
-  getInRegexMode(): boolean {
-    return this.inRegexMode;
-  }
-
-  set(value: string, inRegexMode: boolean) {
-    (this.inRegexModeButton as any).active = inRegexMode;
-    this.paperInput.value = value;
-    this.onClickRegexModeButton();
-  }
-}
-
-document.registerElement(ProjectorInput.prototype.is, ProjectorInput);
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.html b/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.html
deleted file mode 100644
index 1b81094776f8adaff83c189b8f3fed4b231320ac..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.html
+++ /dev/null
@@ -1,241 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../paper-slider/paper-slider.html">
-<link rel="import" href="vz-projector-input.html">
-<link rel="import" href="styles.html">
-<link rel="import" href="bundle.html">
-
-<dom-module id="vz-projector-inspector-panel">
-<style include="vz-projector-styles"></style>
-<style>
-:host {
-   display: flex;
-   flex-direction: column;
-   /* Account for the bookmark pane at the bottom */
-   height: calc(100% - 55px);
-}
-
-.container {
-  display: block;
-  padding: 10px 20px 0 20px;
-}
-
-.buttons {
-  display: flex;
-  height: 60px;
-}
-
-.button {
-  margin-right: 10px;
-  border: none;
-  border-radius: 7px;
-  font-size: 13px;
-  padding: 10px;
-  background: #e3e3e3;
-}
-
-.button:last-child {
-  margin-right: 0;
-}
-
-.nn {
-  display: flex;
-  flex-direction: column;
-}
-
-.nn > * {
-  padding: 0 20px;
-}
-
-.nn-list {
-  overflow-y: auto;
-}
-
-.nn-list .neighbor {
-  font-size: 12px;
-  margin-bottom: 8px;
-}
-
-.nn-list .label-and-value {
-  display: flex;
-  justify-content: space-between;
-}
-
-.label {
-  overflow: hidden;
-  text-overflow: ellipsis;
-  white-space: nowrap;
-}
-
-.nn-list .value {
-  color: #666;
-  float: right;
-  font-weight: 300;
-  margin-left: 8px;
-}
-
-.nn-list .bar {
-  position: relative;
-  border-top: 1px solid rgba(0, 0, 0, 0.15);
-  margin: 2px 0;
-}
-
-.nn-list .bar .fill {
-  position: absolute;
-  top: -1px;
-  border-top: 1px solid white;
-}
-
-.nn-list .tick {
-  position: absolute;
-  top: 0px;
-  height: 3px;
-  border-left: 1px solid rgba(0, 0, 0, 0.15);
-}
-
-.nn-list .neighbor-link:hover {
-  cursor: pointer;
-}
-
-.search-by {
-  display: flex;
-}
-
-.search-by vz-projector-input {
-  width: 100%;
-}
-
-.search-by paper-dropdown-menu {
-  margin-left: 10px;
-  width: 100px;
-}
-
-.distance .options {
-  float: right;
-}
-
-.options a {
-  color: #727272;
-  font-size: 13px;
-  margin-left: 12px;
-  text-decoration: none;
-}
-
-.options a.selected {
-  color: #009EFE;
-}
-
-.neighbors {
-  margin-bottom: 30px;
-}
-
-.neighbors-options {
-  margin-top: 6px;
-}
-
-.neighbors-options .option-label, .distance .option-label {
-  color: #727272;
-  margin-right: 2px;
-  width: auto;
-}
-
-.num-neighbors-container {
-  display: inline-block;
-}
-
-#nn-slider {
-  margin: 0 -12px 0 10px;
-}
-
-.euclidean {
-  margin-right: 10px;
-}
-
-.matches-list {
-  padding: 0 20px;
-}
-
-.matches-list .row {
-  border-bottom: 1px solid #ddd;
-  cursor: pointer;
-  display: flex;
-  font-size: 12px;
-  margin: 5px 0;
-  padding: 4px 0;
-}
-
-.results {
-  display: flex;
-  flex-direction: column;
-}
-</style>
-<template>
-<div class="container">
-  <div class="buttons">
-    <button class="button reset-filter">Show All Data</button>
-    <button class="button set-filter">Isolate selection</button>
-    <button class="button clear-selection">Clear selection</button>
-  </div>
-  <div class="search-by">
-    <vz-projector-input id="search-box" label="Search"></vz-projector-input>
-    <paper-dropdown-menu no-animations label="by">
-      <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{selectedMetadataField}}">
-        <template is="dom-repeat" items="[[metadataFields]]">
-          <paper-item value="[[item]]" label="[[item]]">
-            [[item]]
-          </paper-item>
-        </template>
-      </paper-listbox>
-    </paper-dropdown-menu>
-  </div>
-</div>
-<div class="results">
-  <div class="nn" style="display: none">
-    <div class="neighbors">
-      <div class="neighbors-options">
-        <div class="slider num-nn">
-          <span class="option-label">neighbors</span>
-          <paper-icon-button icon="help" class="help-icon"></paper-icon-button>
-          <paper-tooltip position="bottom" animation-delay="0" fit-to-visible-bounds>
-            The number of neighbors (in the original space) to show when clicking on a point.
-          </paper-tooltip>
-          <paper-slider id="nn-slider" pin min="5" max="1000" value="100"></paper-slider>
-          <span class="nn-count"></span>
-        </div>
-      </div>
-      <div class="distance">
-        <span class="option-label">distance</span>
-        <div class="options">
-          <a class="selected cosine" href="javascript:void(0);">COSINE</a>
-          <a class="euclidean" href="javascript:void(0);">EUCLIDEAN</a>
-        </div>
-      </div>
-    </div>
-    <p>Nearest points in the original space:
-    <div class="nn-list"></div>
-  </div>
-  <div class="matches-list" style="display: none">
-    <div class="list"></div>
-    <div class="limit-msg">Showing only the first 100 results...</div>
-  </div>
-</div>
-<!-- Closing global template -->
-</template>
-<script src="vz-projector-inspector-panel.js"></script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.ts
deleted file mode 100644
index 3ee2c2165f218f4a690b40569314611ebcf58fd1..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.ts
+++ /dev/null
@@ -1,337 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {DistanceFunction, SpriteAndMetadataInfo, State} from './data';
-import * as knn from './knn';
-import {ProjectorEventContext} from './projectorEventContext';
-import * as adapter from './projectorScatterPlotAdapter';
-import * as util from './util';
-import * as vector from './vector';
-import {Projector} from './vz-projector';
-import {ProjectorInput} from './vz-projector-input';
-// tslint:disable-next-line:no-unused-variable
-import {PolymerElement, PolymerHTMLElement} from './vz-projector-util';
-
-/** Limit the number of search results we show to the user. */
-const LIMIT_RESULTS = 100;
-
-// tslint:disable-next-line
-export let PolymerClass = PolymerElement({
-  is: 'vz-projector-inspector-panel',
-  properties: {selectedMetadataField: String, metadataFields: Array}
-});
-
-export class InspectorPanel extends PolymerClass {
-  distFunc: DistanceFunction;
-  numNN: number;
-
-  private projectorEventContext: ProjectorEventContext;
-
-  private selectedMetadataField: string;
-  private metadataFields: string[];
-  private projector: Projector;
-  private selectedPointIndices: number[];
-  private neighborsOfFirstPoint: knn.NearestEntry[];
-  private searchBox: ProjectorInput;
-
-  private resetFilterButton: HTMLButtonElement;
-  private setFilterButton: HTMLButtonElement;
-  private clearSelectionButton: HTMLButtonElement;
-  private limitMessage: HTMLDivElement;
-
-  ready() {
-    this.resetFilterButton =
-        this.querySelector('.reset-filter') as HTMLButtonElement;
-    this.setFilterButton =
-        this.querySelector('.set-filter') as HTMLButtonElement;
-    this.clearSelectionButton =
-        this.querySelector('.clear-selection') as HTMLButtonElement;
-    this.limitMessage = this.querySelector('.limit-msg') as HTMLDivElement;
-    this.searchBox = this.querySelector('#search-box') as ProjectorInput;
-    // https://www.polymer-project.org/1.0/docs/devguide/styling#scope-subtree
-    this.scopeSubtree(this, true);
-  }
-
-  initialize(
-      projector: Projector, projectorEventContext: ProjectorEventContext) {
-    this.projector = projector;
-    this.projectorEventContext = projectorEventContext;
-    this.setupUI(projector);
-    projectorEventContext.registerSelectionChangedListener(
-        (selection, neighbors) =>
-            this.updateInspectorPane(selection, neighbors));
-  }
-
-  /** Updates the nearest neighbors list in the inspector. */
-  private updateInspectorPane(
-      indices: number[], neighbors: knn.NearestEntry[]) {
-    this.neighborsOfFirstPoint = neighbors;
-    this.selectedPointIndices = indices;
-
-    this.updateFilterButtons(indices.length + neighbors.length);
-    this.updateNeighborsList(neighbors);
-    if (neighbors.length === 0) {
-      this.updateSearchResults(indices);
-    } else {
-      this.updateSearchResults([]);
-    }
-  }
-
-  private enableResetFilterButton(enabled: boolean) {
-    this.resetFilterButton.disabled = !enabled;
-  }
-
-  restoreUIFromBookmark(bookmark: State) {
-    this.enableResetFilterButton(bookmark.filteredPoints != null);
-  }
-
-  metadataChanged(spriteAndMetadata: SpriteAndMetadataInfo) {
-    let labelIndex = -1;
-    this.metadataFields = spriteAndMetadata.stats.map((stats, i) => {
-      if (!stats.isNumeric && labelIndex === -1) {
-        labelIndex = i;
-      }
-      return stats.name;
-    });
-    labelIndex = Math.max(0, labelIndex);
-    // Make the default label the first non-numeric column.
-    this.selectedMetadataField = spriteAndMetadata.stats[labelIndex].name;
-  }
-
-  datasetChanged() {
-    this.enableResetFilterButton(false);
-  }
-
-  private updateSearchResults(indices: number[]) {
-    const container = this.querySelector('.matches-list') as HTMLDivElement;
-    container.style.display = indices.length ? null : 'none';
-    const list = container.querySelector('.list') as HTMLDivElement;
-    list.innerHTML = '';
-    if (indices.length === 0) {
-      return;
-    }
-
-    this.limitMessage.style.display =
-        indices.length <= LIMIT_RESULTS ? 'none' : null;
-    indices = indices.slice(0, LIMIT_RESULTS);
-
-    for (let i = 0; i < indices.length; i++) {
-      const index = indices[i];
-
-      const row = document.createElement('div');
-      row.className = 'row';
-
-      const label = this.getLabelFromIndex(index);
-      const rowLink = document.createElement('a');
-      rowLink.className = 'label';
-      rowLink.title = label;
-      rowLink.innerText = label;
-
-      rowLink.onmouseenter = () => {
-        this.projectorEventContext.notifyHoverOverPoint(index);
-      };
-      rowLink.onmouseleave = () => {
-        this.projectorEventContext.notifyHoverOverPoint(null);
-      };
-      rowLink.onclick = () => {
-        this.projectorEventContext.notifySelectionChanged([index]);
-      };
-
-      row.appendChild(rowLink);
-      list.appendChild(row);
-    }
-  }
-
-  private getLabelFromIndex(pointIndex: number): string {
-    const point = this.projector.dataSet.points[pointIndex];
-    return point.metadata[this.selectedMetadataField].toString();
-  }
-
-  private updateNeighborsList(neighbors: knn.NearestEntry[]) {
-    const nnlist = this.querySelector('.nn-list') as HTMLDivElement;
-    nnlist.innerHTML = '';
-
-    (this.querySelector('.nn') as HTMLDivElement).style.display =
-        neighbors.length ? null : 'none';
-
-    if (neighbors.length === 0) {
-      return;
-    }
-
-    this.searchBox.message = '';
-    const minDist = neighbors.length > 0 ? neighbors[0].dist : 0;
-
-    for (let i = 0; i < neighbors.length; i++) {
-      const neighbor = neighbors[i];
-
-      const neighborElement = document.createElement('div');
-      neighborElement.className = 'neighbor';
-
-      const neighborElementLink = document.createElement('a');
-      neighborElementLink.className = 'neighbor-link';
-      neighborElementLink.title = this.getLabelFromIndex(neighbor.index);
-
-      const labelValueElement = document.createElement('div');
-      labelValueElement.className = 'label-and-value';
-
-      const labelElement = document.createElement('div');
-      labelElement.className = 'label';
-      labelElement.style.color =
-          adapter.dist2color(this.distFunc, neighbor.dist, minDist);
-      labelElement.innerText = this.getLabelFromIndex(neighbor.index);
-
-      const valueElement = document.createElement('div');
-      valueElement.className = 'value';
-      valueElement.innerText = neighbor.dist.toFixed(3);
-
-      labelValueElement.appendChild(labelElement);
-      labelValueElement.appendChild(valueElement);
-
-      const barElement = document.createElement('div');
-      barElement.className = 'bar';
-
-      const barFillElement = document.createElement('div');
-      barFillElement.className = 'fill';
-      barFillElement.style.borderTopColor =
-          adapter.dist2color(this.distFunc, neighbor.dist, minDist);
-      barFillElement.style.width =
-          adapter.normalizeDist(this.distFunc, neighbor.dist, minDist) * 100 +
-          '%';
-      barElement.appendChild(barFillElement);
-
-      for (let j = 1; j < 4; j++) {
-        const tickElement = document.createElement('div');
-        tickElement.className = 'tick';
-        tickElement.style.left = j * 100 / 4 + '%';
-        barElement.appendChild(tickElement);
-      }
-
-      neighborElementLink.appendChild(labelValueElement);
-      neighborElementLink.appendChild(barElement);
-      neighborElement.appendChild(neighborElementLink);
-      nnlist.appendChild(neighborElement);
-
-      neighborElementLink.onmouseenter = () => {
-        this.projectorEventContext.notifyHoverOverPoint(neighbor.index);
-      };
-      neighborElementLink.onmouseleave = () => {
-        this.projectorEventContext.notifyHoverOverPoint(null);
-      };
-      neighborElementLink.onclick = () => {
-        this.projectorEventContext.notifySelectionChanged([neighbor.index]);
-      };
-    }
-  }
-
-  private updateFilterButtons(numPoints: number) {
-    if (numPoints > 1) {
-      this.setFilterButton.innerText = `Isolate ${numPoints} points`;
-      this.setFilterButton.disabled = null;
-      this.clearSelectionButton.disabled = null;
-    } else {
-      this.setFilterButton.disabled = true;
-      this.clearSelectionButton.disabled = true;
-    }
-  }
-
-  private setupUI(projector: Projector) {
-    this.distFunc = vector.cosDist;
-    const eucDist =
-        this.querySelector('.distance a.euclidean') as HTMLLinkElement;
-    eucDist.onclick = () => {
-      const links = this.querySelectorAll('.distance a');
-      for (let i = 0; i < links.length; i++) {
-        util.classed(links[i] as HTMLElement, 'selected', false);
-      }
-      util.classed(eucDist as HTMLElement, 'selected', true);
-
-      this.distFunc = vector.dist;
-      this.projectorEventContext.notifyDistanceMetricChanged(this.distFunc);
-      const neighbors = projector.dataSet.findNeighbors(
-          this.selectedPointIndices[0], this.distFunc, this.numNN);
-      this.updateNeighborsList(neighbors);
-    };
-
-    const cosDist = this.querySelector('.distance a.cosine') as HTMLLinkElement;
-    cosDist.onclick = () => {
-      const links = this.querySelectorAll('.distance a');
-      for (let i = 0; i < links.length; i++) {
-        util.classed(links[i] as HTMLElement, 'selected', false);
-      }
-      util.classed(cosDist, 'selected', true);
-
-      this.distFunc = vector.cosDist;
-      this.projectorEventContext.notifyDistanceMetricChanged(this.distFunc);
-      const neighbors = projector.dataSet.findNeighbors(
-          this.selectedPointIndices[0], this.distFunc, this.numNN);
-      this.updateNeighborsList(neighbors);
-    };
-
-    // Called whenever the search text input changes.
-    const updateInput = (value: string, inRegexMode: boolean) => {
-      if (value == null || value.trim() === '') {
-        this.searchBox.message = '';
-        this.projectorEventContext.notifySelectionChanged([]);
-        return;
-      }
-      const indices = projector.dataSet.query(
-          value, inRegexMode, this.selectedMetadataField);
-      if (indices.length === 0) {
-        this.searchBox.message = '0 matches.';
-      } else {
-        this.searchBox.message = `${indices.length} matches.`;
-      }
-      this.projectorEventContext.notifySelectionChanged(indices);
-    };
-    this.searchBox.registerInputChangedListener((value, inRegexMode) => {
-      updateInput(value, inRegexMode);
-    });
-
-    // Nearest neighbors controls.
-    const numNNInput = this.$$('#nn-slider') as HTMLInputElement;
-    const updateNumNN = () => {
-      this.numNN = +numNNInput.value;
-      (this.querySelector('.num-nn .nn-count') as HTMLSpanElement).innerText =
-          '' + this.numNN;
-      if (this.selectedPointIndices != null) {
-        this.projectorEventContext.notifySelectionChanged(
-            [this.selectedPointIndices[0]]);
-      }
-    };
-    numNNInput.addEventListener('change', updateNumNN);
-    updateNumNN();
-
-    // Filtering dataset.
-    this.setFilterButton.onclick = () => {
-      const indices = this.selectedPointIndices.concat(
-          this.neighborsOfFirstPoint.map(n => n.index));
-      projector.filterDataset(indices);
-      this.enableResetFilterButton(true);
-      this.updateFilterButtons(0);
-    };
-
-    this.resetFilterButton.onclick = () => {
-      projector.resetFilterDataset();
-      this.enableResetFilterButton(false);
-    };
-
-    this.clearSelectionButton.onclick = () => {
-      projector.adjustSelectionAndHover([]);
-    };
-    this.enableResetFilterButton(false);
-  }
-}
-
-document.registerElement(InspectorPanel.prototype.is, InspectorPanel);
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-legend.html b/tensorflow/tensorboard/components/vz_projector/vz-projector-legend.html
deleted file mode 100644
index 4b98d8bded827c9777a2a9978bd5f597c1e60e41..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-legend.html
+++ /dev/null
@@ -1,78 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="styles.html">
-<link rel="import" href="bundle.html">
-
-<dom-module id='vz-projector-legend'>
-<template>
-<style include="vz-projector-styles"></style>
-<style>
-.item {
-  display: flex;
-  align-items: flex-start;
-  margin-bottom: 10px;
-}
-
-.shape {
-  width: 10px;
-  height: 10px;
-  margin-right: 10px;
-  margin-top: 5px;
-  border-radius: 50%;
-}
-
-.label {
-  flex-grow: 1;
-}
-
-.gradient {
-  width: 100%;
-  height: 10px;
-}
-
-.gradient-boundaries {
-  display: flex;
-  justify-content: space-between;
-}
-</style>
-
-<template is="dom-repeat" items="[[renderInfo.items]]">
-  <div class="item">
-    <div class="shape" style="background-color: [[item.color]];"></div>
-    <div class="label">[[item.label]]</div>
-    <div class="info" style="color: [[item.color]];">[[item.count]]</div>
-  </div>
-</template>
-
-<template is="dom-if" if="[[renderInfo.thresholds]]">
-  <svg class="gradient">
-    <defs>
-      <linearGradient id="gradient" x1="0%" y1="100%" x2="100%" y2="100%"></linearGradient>
-    </defs>
-    <rect height="10" style="fill: url('#gradient');"></rect>
-  </svg>
-  <div class="gradient-boundaries">
-    <div>[[renderInfo.thresholds.0.value]]</div>
-    <div>[[_getLastThreshold(renderInfo.thresholds)]]</div>
-  </div>
-</template>
-<!-- Closing global template -->
-</template>
-<script src="vz-projector-legend.js"></script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-legend.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-legend.ts
deleted file mode 100644
index 1c4ddf940dc06c1eb6c4a523d18c2da673707934..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-legend.ts
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// tslint:disable-next-line:no-unused-variable
-import {PolymerElement, PolymerHTMLElement} from './vz-projector-util';
-
-// tslint:disable-next-line
-export let LegendPolymer = PolymerElement({
-  is: 'vz-projector-legend',
-  properties: {renderInfo: {type: Object, observer: '_renderInfoChanged'}}
-});
-
-export interface ColorLegendRenderInfo {
-  // To be used for categorical map.
-  items: ColorLegendItem[];
-  // To be used for gradient map.
-  thresholds: ColorLegendThreshold[];
-}
-
-/** An item in the categorical color legend. */
-export interface ColorLegendItem {
-  color: string;
-  label: string;
-  count: number;
-}
-
-/** An item in the gradient color legend. */
-export interface ColorLegendThreshold {
-  color: string;
-  value: number;
-}
-
-export class Legend extends LegendPolymer {
-  renderInfo: ColorLegendRenderInfo;
-
-  _renderInfoChanged() {
-    if (this.renderInfo == null) {
-      return;
-    }
-    if (this.renderInfo.thresholds) {
-      // <linearGradient> is under dom-if so we should wait for it to be
-      // inserted in the dom tree using async().
-      this.async(() => this.setupLinearGradient());
-    }
-  }
-
-  _getLastThreshold(): number {
-    if (this.renderInfo == null || this.renderInfo.thresholds == null) {
-      return;
-    }
-    return this.renderInfo.thresholds[this.renderInfo.thresholds.length - 1]
-        .value;
-  }
-
-  private getOffset(value: number): string {
-    const min = this.renderInfo.thresholds[0].value;
-    const max =
-        this.renderInfo.thresholds[this.renderInfo.thresholds.length - 1].value;
-    return (100 * (value - min) / (max - min)).toFixed(2) + '%';
-  }
-
-  private setupLinearGradient() {
-    const linearGradient =
-        this.querySelector('#gradient') as SVGLinearGradientElement;
-
-    const width =
-        (this.querySelector('svg.gradient') as SVGElement).clientWidth;
-
-    // Set the svg <rect> to be the width of its <svg> parent.
-    (this.querySelector('svg.gradient rect') as SVGRectElement).style.width =
-        width + 'px';
-
-    // Remove all <stop> children from before.
-    linearGradient.innerHTML = '';
-
-    // Add a <stop> child in <linearGradient> for each gradient threshold.
-    this.renderInfo.thresholds.forEach(t => {
-      const stopElement =
-          document.createElementNS('http://www.w3.org/2000/svg', 'stop');
-      stopElement.setAttribute('offset', this.getOffset(t.value));
-      stopElement.setAttribute('stop-color', t.color);
-    });
-  }
-}
-
-document.registerElement(Legend.prototype.is, Legend);
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.html b/tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.html
deleted file mode 100644
index 4231a61ff3038955a76a1f261f1703a88eefddec..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.html
+++ /dev/null
@@ -1,99 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../iron-collapse/iron-collapse.html">
-<link rel="import" href="../paper-icon-button/paper-icon-button.html">
-<link rel="import" href="bundle.html">
-
-<dom-module id="vz-projector-metadata-card">
-<template>
-<style>
-#metadata-card {
-  background-color: rgba(255,255,255,0.9);
-  box-shadow: 0 2px 2px 0 rgba(0, 0, 0, 0.14),
-      0 1px 5px 0 rgba(0, 0, 0, 0.12), 0 3px 1px -2px rgba(0, 0, 0, 0.2);
-  width: 280px;
-}
-
-#header {
-  background: #e9e9e9;
-}
-
-#icon-container {
-  position: absolute;
-  right: 0;
-  top: 4px;
-}
-
-#metadata-label {
-  font-weight: 400;
-  font-size: 14px;
-  line-height: 24px;
-  padding: 12px 12px 8px;
-  width: 230px;
-}
-
-#metadata-table {
-  display: table;
-  padding: 8px 12px 4px;
-}
-
-.metadata-row {
-  display: table-row;
-}
-
-.metadata-key {
-  font-weight: bold;
-}
-
-.metadata-key, .metadata-value {
-  display: table-cell;
-  font-size: 12px;
-  padding: 3px 3px;
-}
-</style>
-
-<template is="dom-if" if="[[hasMetadata]]">
-  <div id="metadata-card">
-    <div id="icon-container">
-      <paper-icon-button id="expand-more"
-          style="display: none"
-          icon="expand-more"
-          on-tap="_expandMore"></paper-icon-button>
-      <paper-icon-button id="expand-less"
-          on-tap="_expandLess"
-          icon="expand-less"></paper-icon-button>
-    </div>
-    <div id="header">
-      <div id="metadata-label">[[label]]</div>
-    </div>
-    <iron-collapse id="metadata-container" opened>
-      <div id="metadata-table">
-        <template is="dom-repeat" items="[[metadata]]">
-          <div class="metadata-row">
-            <div class="metadata-key">[[item.key]]</div>
-            <div class="metadata-value">[[item.value]]</div>
-          </div>
-        </template>
-      </div>
-    </iron-collapse>
-  </div>
-</template>
-</template>
-<script src="vz-projector-metadata-card.js"></script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.ts
deleted file mode 100644
index 939300f3878e6c09551c77062a94a92d3cc07000..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.ts
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {PointMetadata} from './data';
-// tslint:disable-next-line:no-unused-variable
-import {PolymerElement, PolymerHTMLElement} from './vz-projector-util';
-
-// tslint:disable-next-line
-export let MetadataCardPolymer = PolymerElement({
-  is: 'vz-projector-metadata-card',
-  properties: {
-    hasMetadata: {type: Boolean, value: false},
-    metadata: {type: Array},
-    label: String
-  }
-});
-
-export class MetadataCard extends MetadataCardPolymer {
-  hasMetadata: boolean;
-  metadata: Array<{key: string, value: string}>;
-  label: string;
-
-  private labelOption: string;
-  private pointMetadata: PointMetadata;
-
-  private expandLessButton: HTMLButtonElement;
-  private expandMoreButton: HTMLButtonElement;
-
-  ready() {
-    this.expandLessButton =
-        this.querySelector('#expand-less') as HTMLButtonElement;
-    this.expandMoreButton =
-        this.querySelector('#expand-more') as HTMLButtonElement;
-  }
-  /** Handles a click on the expand more icon. */
-  _expandMore() {
-    (this.$$('#metadata-container') as any).toggle();
-
-    this.expandMoreButton.style.display = 'none';
-    this.expandLessButton.style.display = '';
-  }
-
-  /** Handles a click on the expand less icon. */
-  _expandLess() {
-    (this.$$('#metadata-container') as any).toggle();
-    this.expandMoreButton.style.display = '';
-    this.expandLessButton.style.display = 'none';
-  }
-
-  updateMetadata(pointMetadata?: PointMetadata) {
-    this.pointMetadata = pointMetadata;
-    this.hasMetadata = (pointMetadata != null);
-
-    if (pointMetadata) {
-      let metadata = [];
-      for (let metadataKey in pointMetadata) {
-        if (!pointMetadata.hasOwnProperty(metadataKey)) {
-          continue;
-        }
-        metadata.push({key: metadataKey, value: pointMetadata[metadataKey]});
-      }
-
-      this.metadata = metadata;
-      this.label = '' + this.pointMetadata[this.labelOption];
-    }
-  }
-
-  setLabelOption(labelOption: string) {
-    this.labelOption = labelOption;
-    if (this.pointMetadata) {
-      this.label = '' + this.pointMetadata[this.labelOption];
-    }
-  }
-}
-
-document.registerElement(MetadataCard.prototype.is, MetadataCard);
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.html b/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.html
deleted file mode 100644
index b82f3f520b5e62bb381f1a9c6ebd10c4a04d13cf..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.html
+++ /dev/null
@@ -1,316 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../iron-collapse/iron-collapse.html">
-<link rel="import" href="../paper-dropdown-menu/paper-dropdown-menu.html">
-<link rel="import" href="../paper-toggle-button/paper-toggle-button.html">
-<link rel="import" href="../paper-listbox/paper-listbox.html">
-<link rel="import" href="../paper-item/paper-item.html">
-<link rel="import" href="../paper-checkbox/paper-checkbox.html">
-<link rel="import" href="../iron-icons/iron-icons.html">
-<link rel="import" href="../iron-icons/image-icons.html">
-<link rel="import" href="../paper-icon-button/paper-icon-button.html">
-<link rel="import" href="../paper-tooltip/paper-tooltip.html">
-<link rel="import" href="../paper-input/paper-input.html">
-<link rel="import" href="../paper-button/paper-button.html">
-<link rel="import" href="../paper-slider/paper-slider.html">
-<link rel="import" href="styles.html">
-<link rel="import" href="bundle.html">
-
-<dom-module id="vz-projector-projections-panel">
-<template>
-<style include="vz-projector-styles"></style>
-<style>
-:host {
-  transition: height 0.2s;
-}
-
-.ink-button, ::shadow .ink-button {
-  border: none;
-  border-radius: 2px;
-  font-size: 13px;
-  padding: 10px;
-  min-width: 100px;
-  flex-shrink: 0;
-  background: #e3e3e3;
-}
-
-.ink-panel-buttons {
-  margin-bottom: 10px;
-}
-
-.two-way-toggle {
-  display: flex;
-  flex-direction: row;
-}
-
-.two-way-toggle span {
-  padding-right: 7px;
-}
-
-.has-border {
-  border: 1px solid rgba(0, 0, 0, 0.1);
-}
-
-.toggle {
-  min-width: 0px;
-  font-size: 12px;
-  width: 17px;
-  min-height: 0px;
-  height: 21px;
-  padding: 0;
-  margin: 0px;
-}
-
-.toggle[active] {
-  background-color: #880E4F;
-  color: white;
-}
-
-.two-columns {
-  display:flex;
-  justify-content: space-between;
-}
-
-.two-columns > :first-child {
-  margin-right: 15px;
-}
-
-.two-columns > div {
-  width: 50%;
-}
-
-.dropdown-item {
-  justify-content: space-between;
-  min-height: 35px;
-}
-
-#z-container {
-  display: flex;
-  align-items: center;
-  width: 50%;
-}
-
-#z-checkbox {
-  margin: 27px 0 0 5px;
-  width: 18px;
-}
-
-#z-dropdown {
-  flex-grow: 1;
-}
-
-.notice {
-  color: #880E4F;
-}
-
-.container {
-  padding: 20px;
-}
-
-.book-icon {
-  height: 20px;
-  color: rgba(0, 0, 0, 0.7);
-}
-
-.item-details {
-  color: gray;
-  font-size: 12px;
-  margin-left: 5px;
-}
-
-.pca-dropdown {
-  width: 100%;
-}
-
-.pca-dropdown paper-listbox {
-  width: 135px;
-}
-
-.dropdown-item.header {
-  border-bottom: 1px solid #aaa;
-  color: #333;
-  font-weight: bold;
-}
-
-#total-variance {
-  color: rgba(0, 0, 0, 0.7);
-}
-</style>
-<div id="main">
-  <div class="ink-panel-header">
-    <div class="ink-tab-group">
-
-      <div data-tab="tsne" id="tsne-tab" class="ink-tab projection-tab">t-SNE</div>
-      <paper-tooltip for="tsne-tab" position="bottom" animation-delay="0" fit-to-visible-bounds>
-        t-distributed stochastic neighbor embedding
-      </paper-tooltip>
-
-      <div data-tab="pca" id="pca-tab" class="ink-tab projection-tab">PCA</div>
-      <paper-tooltip for="pca-tab" position="bottom" animation-delay="0" fit-to-visible-bounds>
-        Principal component analysis
-      </paper-tooltip>
-
-      <div data-tab="custom" id="custom-tab" class="ink-tab projection-tab" title="Linear projection of two custom vectors">Custom</div>
-      <paper-tooltip for="custom-tab" position="bottom" animation-delay="0" fit-to-visible-bounds>
-        Search for two vectors upon which to project all points.
-      </paper-tooltip>
-
-    </div>
-  </div>
-  <div class="container">
-    <!-- TSNE Controls -->
-    <div data-panel="tsne" class="ink-panel-content">
-      <div class="slider">
-        <label>Dimension</label>
-        <div class="two-way-toggle">
-          <span>2D</span>
-          <paper-toggle-button id="tsne-toggle" checked="{{tSNEis3d}}">3D</paper-toggle-button>
-        </div>
-      </div>
-      <div class="slider tsne-perplexity">
-        <label>
-          Perplexity
-          <paper-icon-button icon="help" class="help-icon"></paper-icon-button>
-          <paper-tooltip position="right" animation-delay="0" fit-to-visible-bounds>
-            The most appropriate perplexity value depends on the density of the
-            data. Loosely speaking, a larger / denser dataset
-            requires a larger perplexity. Typical values for perplexity range
-            between 5 and 50.
-          </paper-tooltip>
-        </label>
-        <paper-slider id="perplexity-slider" pin min="2" max="100" value="30"></paper-slider>
-        <span></span>
-      </div>
-      <div class="slider tsne-learning-rate">
-        <label>
-          Learning rate
-          <paper-icon-button icon="help" class="help-icon"></paper-icon-button>
-          <paper-tooltip position="right" animation-delay="0" fit-to-visible-bounds>
-            The ideal learning rate often depends on the size of the data,
-            with smaller datasets requiring smaller learning rates.
-          </paper-tooltip>
-        </label>
-        <paper-slider id="learning-rate-slider" snaps min="-3" max="2" step="1"
-            value="1" max-markers="6">
-        </paper-slider>
-        <span></span>
-      </div>
-      <p>
-        <button class="run-tsne ink-button" title="Re-run t-SNE">Re-run</button>
-        <button class="stop-tsne ink-button" title="Stop t-SNE">Stop</button>
-      </p>
-      <p>Iteration: <span class="run-tsne-iter">0</span></p>
-      <p id="tsne-sampling" class="notice">
-        For fast results, the data will be sampled down to [[getTsneSampleSizeText()]] points.
-      </p>
-      <p>
-        <iron-icon icon="book" class="book-icon"></iron-icon>
-        <a target="_blank" href="http://distill.pub/2016/misread-tsne/">
-          How to use t-SNE effectively.
-        </a>
-      </p>
-    </div>
-    <!-- PCA Controls -->
-    <div data-panel="pca" class="ink-panel-content">
-      <div class="two-columns">
-        <div> <!-- Left column -->
-          <paper-dropdown-menu class="pca-dropdown" vertical-align="bottom" no-animations label="X">
-            <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{pcaX}}">
-              <paper-item disabled class="dropdown-item header">
-                  <div>#</div>
-                  <div>Variance (%)</div>
-              </paper-item>
-              <template is="dom-repeat" items="[[pcaComponents]]">
-                <paper-item class="dropdown-item" value="[[item.id]]"
-                            label="Component #[[item.componentNumber]]">
-                  <div>[[item.componentNumber]]</div>
-                  <div class="item-details">[[item.percVariance]]</div>
-                </paper-item>
-              </template>
-            </paper-listbox>
-          </paper-dropdown-menu>
-          <paper-dropdown-menu class="pca-dropdown" no-animations vertical-align="bottom" label="Z" disabled="[[!hasPcaZ]]" id="z-dropdown">
-            <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{pcaZ}}">
-              <paper-item disabled class="dropdown-item header">
-                  <div>#</div>
-                  <div>Variance (%)</div>
-              </paper-item>
-              <template is="dom-repeat" items="[[pcaComponents]]">
-                <paper-item class="dropdown-item" value="[[item.id]]"
-                            label="Component #[[item.componentNumber]]">
-                  <div>[[item.componentNumber]]</div>
-                  <div class="item-details">[[item.percVariance]]</div>
-                </paper-item>
-              </template>
-            </paper-listbox>
-          </paper-dropdown-menu>
-        </div>
-        <div> <!-- Right column -->
-          <paper-dropdown-menu class="pca-dropdown" vertical-align="bottom" no-animations label="Y">
-            <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{pcaY}}">
-              <paper-item disabled class="dropdown-item header">
-                  <div>#</div>
-                  <div>Variance (%)</div>
-              </paper-item>
-              <template is="dom-repeat" items="[[pcaComponents]]">
-                <paper-item class="dropdown-item" value="[[item.id]]"
-                            label="Component #[[item.componentNumber]]">
-                  <div>[[item.componentNumber]]</div>
-                  <div class="item-details">[[item.percVariance]]</div>
-                </paper-item>
-              </template>
-            </paper-listbox>
-          </paper-dropdown-menu>
-          <paper-checkbox id="z-checkbox" checked="{{pcaIs3d}}"></paper-checkbox>
-        </div>
-      </div>
-      <p id="pca-sampling" class="notice">
-        PCA is approximate.
-        <paper-icon-button icon="help" class="help-icon"></paper-icon-button>
-      </p>
-      <div id="total-variance">Total variance</div>
-      <paper-tooltip for="pca-sampling" position="top" animation-delay="0" fit-to-visible-bounds>
-        For fast results, the data was sampled to [[getPcaSampleSizeText()]] points and randomly projected down to [[getPcaSampledDimText()]] dimensions.
-      </paper-tooltip>
-    </div>
-    <!-- Custom Controls -->
-    <div data-panel="custom" class="ink-panel-content">
-      <paper-dropdown-menu style="width: 100%" no-animations label="Search by">
-        <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{customSelectedSearchByMetadataOption}}">
-          <template is="dom-repeat" items="[[searchByMetadataOptions]]">
-            <paper-item class="dropdown-item" value="[[item]]" label="[[item]]">
-              [[item]]
-            </paper-item>
-          </template>
-        </paper-listbox>
-      </paper-dropdown-menu>
-      <div class="two-columns">
-        <vz-projector-input id="xLeft" label="Left"></vz-projector-input>
-        <vz-projector-input id="xRight" label="Right"></vz-projector-input>
-      </div>
-      <div class="two-columns">
-        <vz-projector-input id="yUp" label="Up"></vz-projector-input>
-        <vz-projector-input id="yDown" label="Down"></vz-projector-input>
-      </div>
-    </div>
-  </div>
-</div>
-</template>
-<script src="vz-projector-projections-panel.js"></script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.ts
deleted file mode 100644
index 377c6c11ad5d19343682540bdadc3319b5d0ee3c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.ts
+++ /dev/null
@@ -1,589 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import * as data from './data';
-import {DataSet, Projection, ProjectionType, SpriteAndMetadataInfo, State} from './data';
-import * as util from './util';
-import * as vector from './vector';
-import {Vector} from './vector';
-import {Projector} from './vz-projector';
-import {ProjectorInput} from './vz-projector-input';
-// tslint:disable-next-line:no-unused-variable
-import {PolymerElement, PolymerHTMLElement} from './vz-projector-util';
-
-const NUM_PCA_COMPONENTS = 10;
-
-// tslint:disable-next-line
-export let ProjectionsPanelPolymer = PolymerElement({
-  is: 'vz-projector-projections-panel',
-  properties: {
-    pcaIs3d:
-        {type: Boolean, value: true, observer: '_pcaDimensionToggleObserver'},
-    tSNEis3d:
-        {type: Boolean, value: true, observer: '_tsneDimensionToggleObserver'},
-    // PCA projection.
-    pcaComponents: Array,
-    pcaX: {type: Number, value: 0, observer: 'showPCAIfEnabled'},
-    pcaY: {type: Number, value: 1, observer: 'showPCAIfEnabled'},
-    pcaZ: {type: Number, value: 2, observer: 'showPCAIfEnabled'},
-    // Custom projection.
-    customSelectedSearchByMetadataOption: {
-      type: String,
-      observer: '_customSelectedSearchByMetadataOptionChanged'
-    },
-  }
-});
-
-type InputControlName = 'xLeft'|'xRight'|'yUp'|'yDown';
-
-type CentroidResult = {
-  centroid?: Vector; numMatches?: number;
-};
-
-type Centroids = {
-  [key: string]: Vector; xLeft: Vector; xRight: Vector; yUp: Vector;
-  yDown: Vector;
-};
-
-/**
- * A polymer component which handles the projection tabs in the projector.
- */
-export class ProjectionsPanel extends ProjectionsPanelPolymer {
-  private projector: Projector;
-  private pcaComponents:
-      Array<{id: number, componentNumber: number, percVariance: string}>;
-  private currentProjection: ProjectionType;
-  private polymerChangesTriggerReprojection: boolean;
-  private dataSet: DataSet;
-  private originalDataSet: DataSet;
-  private dim: number;
-
-  /** T-SNE perplexity. Roughly how many neighbors each point influences. */
-  private perplexity: number;
-  /** T-SNE learning rate. */
-  private learningRate: number;
-
-  private searchByMetadataOptions: string[];
-
-  /** Centroids for custom projections. */
-  private centroidValues: any;
-  private centroids: Centroids;
-  /** The centroid across all points. */
-  private allCentroid: number[];
-
-  /** Polymer properties. */
-  // TODO(nsthorat): Move these to a separate view controller.
-  public tSNEis3d: boolean;
-  public pcaIs3d: boolean;
-  public pcaX: number;
-  public pcaY: number;
-  public pcaZ: number;
-  public customSelectedSearchByMetadataOption: string;
-
-  /** Polymer elements. */
-  private runTsneButton: HTMLButtonElement;
-  private stopTsneButton: HTMLButtonElement;
-  private perplexitySlider: HTMLInputElement;
-  private learningRateInput: HTMLInputElement;
-  private zDropdown: HTMLElement;
-  private iterationLabel: HTMLElement;
-
-  private customProjectionXLeftInput: ProjectorInput;
-  private customProjectionXRightInput: ProjectorInput;
-  private customProjectionYUpInput: ProjectorInput;
-  private customProjectionYDownInput: ProjectorInput;
-
-  initialize(projector: Projector) {
-    this.polymerChangesTriggerReprojection = true;
-    this.projector = projector;
-
-    // Set up TSNE projections.
-    this.perplexity = 30;
-    this.learningRate = 10;
-
-    // Setup Custom projections.
-    this.centroidValues = {xLeft: null, xRight: null, yUp: null, yDown: null};
-    this.clearCentroids();
-
-    this.setupUIControls();
-  }
-
-  ready() {
-    this.zDropdown = this.querySelector('#z-dropdown') as HTMLElement;
-    this.runTsneButton = this.querySelector('.run-tsne') as HTMLButtonElement;
-    this.stopTsneButton = this.querySelector('.stop-tsne') as HTMLButtonElement;
-    this.perplexitySlider =
-        this.querySelector('#perplexity-slider') as HTMLInputElement;
-    this.learningRateInput =
-        this.querySelector('#learning-rate-slider') as HTMLInputElement;
-    this.iterationLabel = this.querySelector('.run-tsne-iter') as HTMLElement;
-  }
-
-  disablePolymerChangesTriggerReprojection() {
-    this.polymerChangesTriggerReprojection = false;
-  }
-
-  enablePolymerChangesTriggerReprojection() {
-    this.polymerChangesTriggerReprojection = true;
-  }
-
-  private updateTSNEPerplexityFromSliderChange() {
-    if (this.perplexitySlider) {
-      this.perplexity = +this.perplexitySlider.value;
-    }
-    (this.querySelector('.tsne-perplexity span') as HTMLSpanElement).innerText =
-        '' + this.perplexity;
-  }
-
-  private updateTSNELearningRateFromUIChange() {
-    if (this.learningRateInput) {
-      this.learningRate = Math.pow(10, +this.learningRateInput.value);
-    }
-    (this.querySelector('.tsne-learning-rate span') as HTMLSpanElement)
-        .innerText = '' + this.learningRate;
-  }
-
-  private setupUIControls() {
-    {
-      const self = this;
-      const inkTabs = this.querySelectorAll('.ink-tab');
-      for (let i = 0; i < inkTabs.length; i++) {
-        inkTabs[i].addEventListener('click', function() {
-          let id = this.getAttribute('data-tab');
-          self.showTab(id);
-        });
-      }
-    }
-
-    this.runTsneButton.addEventListener('click', () => this.runTSNE());
-    this.stopTsneButton.addEventListener(
-        'click', () => this.dataSet.stopTSNE());
-
-    this.perplexitySlider.value = this.perplexity.toString();
-    this.perplexitySlider.addEventListener(
-        'change', () => this.updateTSNEPerplexityFromSliderChange());
-    this.updateTSNEPerplexityFromSliderChange();
-
-    this.learningRateInput.addEventListener(
-        'change', () => this.updateTSNELearningRateFromUIChange());
-    this.updateTSNELearningRateFromUIChange();
-
-    this.setupCustomProjectionInputFields();
-    // TODO: figure out why `--paper-input-container-input` css mixin didn't
-    // work.
-    const inputs =
-        this.querySelectorAll('paper-dropdown-menu paper-input input');
-    for (let i = 0; i < inputs.length; i++) {
-      (inputs[i] as HTMLElement).style.fontSize = '14px';
-    }
-  }
-
-  restoreUIFromBookmark(bookmark: State) {
-    this.disablePolymerChangesTriggerReprojection();
-
-    // PCA
-    this.pcaX = bookmark.pcaComponentDimensions[0];
-    this.pcaY = bookmark.pcaComponentDimensions[1];
-    if (bookmark.pcaComponentDimensions.length === 3) {
-      this.pcaZ = bookmark.pcaComponentDimensions[2];
-    }
-    this.pcaIs3d = (bookmark.pcaComponentDimensions.length === 3);
-
-    // t-SNE
-    if (this.perplexitySlider) {
-      this.perplexitySlider.value = bookmark.tSNEPerplexity.toString();
-    }
-    if (this.learningRateInput) {
-      this.learningRateInput.value = bookmark.tSNELearningRate.toString();
-    }
-    this.tSNEis3d = bookmark.tSNEis3d;
-
-    // custom
-    this.customSelectedSearchByMetadataOption =
-        bookmark.customSelectedSearchByMetadataOption;
-    if (this.customProjectionXLeftInput) {
-      this.customProjectionXLeftInput.set(
-          bookmark.customXLeftText, bookmark.customXLeftRegex);
-    }
-    if (this.customProjectionXRightInput) {
-      this.customProjectionXRightInput.set(
-          bookmark.customXRightText, bookmark.customXRightRegex);
-    }
-    if (this.customProjectionYUpInput) {
-      this.customProjectionYUpInput.set(
-          bookmark.customYUpText, bookmark.customYUpRegex);
-    }
-    if (this.customProjectionYDownInput) {
-      this.customProjectionYDownInput.set(
-          bookmark.customYDownText, bookmark.customYDownRegex);
-    }
-    this.computeAllCentroids();
-
-    this.setZDropdownEnabled(this.pcaIs3d);
-    this.updateTSNEPerplexityFromSliderChange();
-    this.updateTSNELearningRateFromUIChange();
-    if (this.iterationLabel) {
-      this.iterationLabel.innerText = bookmark.tSNEIteration.toString();
-    }
-    if (bookmark.selectedProjection != null) {
-      this.showTab(bookmark.selectedProjection);
-    }
-    this.enablePolymerChangesTriggerReprojection();
-  }
-
-  populateBookmarkFromUI(bookmark: State) {
-    this.disablePolymerChangesTriggerReprojection();
-
-    // PCA
-    bookmark.pcaComponentDimensions = [this.pcaX, this.pcaY];
-    if (this.pcaIs3d) {
-      bookmark.pcaComponentDimensions.push(this.pcaZ);
-    }
-
-    // t-SNE
-    if (this.perplexitySlider != null) {
-      bookmark.tSNEPerplexity = +this.perplexitySlider.value;
-    }
-    if (this.learningRateInput != null) {
-      bookmark.tSNELearningRate = +this.learningRateInput.value;
-    }
-    bookmark.tSNEis3d = this.tSNEis3d;
-
-    // custom
-    bookmark.customSelectedSearchByMetadataOption =
-        this.customSelectedSearchByMetadataOption;
-    if (this.customProjectionXLeftInput != null) {
-      bookmark.customXLeftText = this.customProjectionXLeftInput.getValue();
-      bookmark.customXLeftRegex =
-          this.customProjectionXLeftInput.getInRegexMode();
-    }
-    if (this.customProjectionXRightInput != null) {
-      bookmark.customXRightText = this.customProjectionXRightInput.getValue();
-      bookmark.customXRightRegex =
-          this.customProjectionXRightInput.getInRegexMode();
-    }
-    if (this.customProjectionYUpInput != null) {
-      bookmark.customYUpText = this.customProjectionYUpInput.getValue();
-      bookmark.customYUpRegex = this.customProjectionYUpInput.getInRegexMode();
-    }
-    if (this.customProjectionYDownInput != null) {
-      bookmark.customYDownText = this.customProjectionYDownInput.getValue();
-      bookmark.customYDownRegex =
-          this.customProjectionYDownInput.getInRegexMode();
-    }
-
-    this.enablePolymerChangesTriggerReprojection();
-  }
-
-  // This method is marked as public as it is used as the view method that
-  // abstracts DOM manipulation so we can stub it in a test.
-  // TODO(nsthorat): Move this to its own class as the glue between this class
-  // and the DOM.
-  setZDropdownEnabled(enabled: boolean) {
-    if (this.zDropdown) {
-      if (enabled) {
-        this.zDropdown.removeAttribute('disabled');
-      } else {
-        this.zDropdown.setAttribute('disabled', 'true');
-      }
-    }
-  }
-
-  dataSetUpdated(dataSet: DataSet, originalDataSet: DataSet, dim: number) {
-    this.dataSet = dataSet;
-    this.originalDataSet = originalDataSet;
-    this.dim = dim;
-    const pointCount = (dataSet == null) ? 0 : dataSet.points.length;
-    const perplexity = Math.max(5, Math.ceil(Math.sqrt(pointCount) / 4));
-    this.perplexitySlider.value = perplexity.toString();
-    this.updateTSNEPerplexityFromSliderChange();
-    this.clearCentroids();
-
-    (this.querySelector('#tsne-sampling') as HTMLElement).style.display =
-        pointCount > data.TSNE_SAMPLE_SIZE ? null : 'none';
-    const wasSampled =
-        (dataSet == null) ? false : (dataSet.dim[0] > data.PCA_SAMPLE_DIM ||
-                                     dataSet.dim[1] > data.PCA_SAMPLE_DIM);
-    (this.querySelector('#pca-sampling') as HTMLElement).style.display =
-        wasSampled ? null : 'none';
-    this.showTab('pca');
-  }
-
-  _pcaDimensionToggleObserver() {
-    this.setZDropdownEnabled(this.pcaIs3d);
-    this.beginProjection(this.currentProjection);
-  }
-
-  _tsneDimensionToggleObserver() {
-    this.beginProjection(this.currentProjection);
-  }
-
-  metadataChanged(spriteAndMetadata: SpriteAndMetadataInfo) {
-    // Project by options for custom projections.
-    let searchByMetadataIndex = -1;
-    this.searchByMetadataOptions = spriteAndMetadata.stats.map((stats, i) => {
-      // Make the default label by the first non-numeric column.
-      if (!stats.isNumeric && searchByMetadataIndex === -1) {
-        searchByMetadataIndex = i;
-      }
-      return stats.name;
-    });
-    this.customSelectedSearchByMetadataOption =
-        this.searchByMetadataOptions[Math.max(0, searchByMetadataIndex)];
-  }
-
-  public showTab(id: ProjectionType) {
-    this.currentProjection = id;
-
-    const tab =
-        this.querySelector('.ink-tab[data-tab="' + id + '"]') as HTMLElement;
-    const allTabs = this.querySelectorAll('.ink-tab');
-    for (let i = 0; i < allTabs.length; i++) {
-      util.classed(allTabs[i] as HTMLElement, 'active', false);
-    }
-
-    util.classed(tab, 'active', true);
-
-    const allTabContent = this.querySelectorAll('.ink-panel-content');
-    for (let i = 0; i < allTabContent.length; i++) {
-      util.classed(allTabContent[i] as HTMLElement, 'active', false);
-    }
-
-    util.classed(
-        this.querySelector('.ink-panel-content[data-panel="' + id + '"]') as
-            HTMLElement,
-        'active', true);
-
-    // guard for unit tests, where polymer isn't attached and $ doesn't exist.
-    if (this.$ != null) {
-      const main = this.$['main'];
-      // In order for the projections panel to animate its height, we need to
-      // set it explicitly.
-      requestAnimationFrame(() => {
-        this.style.height = main.clientHeight + 'px';
-      });
-    }
-
-    this.beginProjection(id);
-  }
-
-  private beginProjection(projection: ProjectionType) {
-    if (this.polymerChangesTriggerReprojection === false) {
-      return;
-    }
-    if (projection === 'pca') {
-      if (this.dataSet != null) {
-        this.dataSet.stopTSNE();
-      }
-      this.showPCA();
-    } else if (projection === 'tsne') {
-      this.showTSNE();
-    } else if (projection === 'custom') {
-      if (this.dataSet != null) {
-        this.dataSet.stopTSNE();
-      }
-      this.computeAllCentroids();
-      this.reprojectCustom();
-    }
-  }
-
-  private showTSNE() {
-    const dataSet = this.dataSet;
-    if (dataSet == null) {
-      return;
-    }
-    const accessors =
-        data.getProjectionComponents('tsne', [0, 1, this.tSNEis3d ? 2 : null]);
-    const dimensionality = this.tSNEis3d ? 3 : 2;
-    const projection =
-        new Projection('tsne', accessors, dimensionality, dataSet);
-    this.projector.setProjection(projection);
-
-    if (!this.dataSet.hasTSNERun) {
-      this.runTSNE();
-    } else {
-      this.projector.notifyProjectionPositionsUpdated();
-    }
-  }
-
-  private runTSNE() {
-    this.runTsneButton.disabled = true;
-    this.stopTsneButton.disabled = null;
-    this.dataSet.projectTSNE(
-        this.perplexity, this.learningRate, this.tSNEis3d ? 3 : 2,
-        (iteration: number) => {
-          if (iteration != null) {
-            this.iterationLabel.innerText = '' + iteration;
-            this.projector.notifyProjectionPositionsUpdated();
-          } else {
-            this.runTsneButton.disabled = null;
-            this.stopTsneButton.disabled = true;
-          }
-        });
-  }
-
-  // tslint:disable-next-line:no-unused-variable
-  private showPCAIfEnabled() {
-    if (this.polymerChangesTriggerReprojection) {
-      this.showPCA();
-    }
-  }
-
-  private updateTotalVarianceMessage() {
-    let variances = this.dataSet.fracVariancesExplained;
-    let totalVariance = variances[this.pcaX] + variances[this.pcaY];
-    let msg = 'Total variance described: ';
-    if (this.pcaIs3d) {
-      totalVariance += variances[this.pcaZ];
-    }
-    msg += (totalVariance * 100).toFixed(1) + '%.';
-    (this.querySelector('#total-variance') as HTMLElement).innerHTML = msg;
-  }
-
-  private showPCA() {
-    if (this.dataSet == null) {
-      return;
-    }
-    this.dataSet.projectPCA().then(() => {
-      // Polymer properties are 1-based.
-      const accessors = data.getProjectionComponents(
-          'pca', [this.pcaX, this.pcaY, this.pcaZ]);
-
-      const dimensionality = this.pcaIs3d ? 3 : 2;
-      const projection =
-          new Projection('pca', accessors, dimensionality, this.dataSet);
-      this.projector.setProjection(projection);
-      let numComponents = Math.min(NUM_PCA_COMPONENTS, this.dataSet.dim[1]);
-      this.updateTotalVarianceMessage();
-      this.pcaComponents = util.range(numComponents).map(i => {
-        let fracVariance = this.dataSet.fracVariancesExplained[i];
-        return {
-          id: i,
-          componentNumber: i + 1,
-          percVariance: (fracVariance * 100).toFixed(1)
-        };
-      });
-    });
-  }
-
-  private reprojectCustom() {
-    if (this.centroids == null || this.centroids.xLeft == null ||
-        this.centroids.xRight == null || this.centroids.yUp == null ||
-        this.centroids.yDown == null) {
-      return;
-    }
-    const xDir = vector.sub(this.centroids.xRight, this.centroids.xLeft);
-    this.dataSet.projectLinear(xDir, 'linear-x');
-
-    const yDir = vector.sub(this.centroids.yUp, this.centroids.yDown);
-    this.dataSet.projectLinear(yDir, 'linear-y');
-
-    const accessors = data.getProjectionComponents('custom', ['x', 'y']);
-    const projection = new Projection('custom', accessors, 2, this.dataSet);
-    this.projector.setProjection(projection);
-  }
-
-  clearCentroids(): void {
-    this.centroids = {xLeft: null, xRight: null, yUp: null, yDown: null};
-    this.allCentroid = null;
-  }
-
-  _customSelectedSearchByMetadataOptionChanged(newVal: string, oldVal: string) {
-    if (this.polymerChangesTriggerReprojection === false) {
-      return;
-    }
-    if (this.currentProjection === 'custom') {
-      this.computeAllCentroids();
-      this.reprojectCustom();
-    }
-  }
-
-  private setupCustomProjectionInputFields() {
-    this.customProjectionXLeftInput =
-        this.setupCustomProjectionInputField('xLeft');
-    this.customProjectionXRightInput =
-        this.setupCustomProjectionInputField('xRight');
-    this.customProjectionYUpInput = this.setupCustomProjectionInputField('yUp');
-    this.customProjectionYDownInput =
-        this.setupCustomProjectionInputField('yDown');
-  }
-
-  private computeAllCentroids() {
-    this.computeCentroid('xLeft');
-    this.computeCentroid('xRight');
-    this.computeCentroid('yUp');
-    this.computeCentroid('yDown');
-  }
-
-  private computeCentroid(name: InputControlName) {
-    const input = this.querySelector('#' + name) as ProjectorInput;
-    if (input == null) {
-      return;
-    }
-    const value = input.getValue();
-    if (value == null) {
-      return;
-    }
-    let inRegexMode = input.getInRegexMode();
-    let result = this.getCentroid(value, inRegexMode);
-    if (result.numMatches === 0) {
-      input.message = '0 matches. Using a random vector.';
-      result.centroid = vector.rn(this.dim);
-    } else {
-      input.message = `${result.numMatches} matches.`;
-    }
-    this.centroids[name] = result.centroid;
-    this.centroidValues[name] = value;
-  }
-
-  private setupCustomProjectionInputField(name: InputControlName):
-      ProjectorInput {
-    let input = this.querySelector('#' + name) as ProjectorInput;
-    input.registerInputChangedListener((input, inRegexMode) => {
-      if (this.polymerChangesTriggerReprojection) {
-        this.computeCentroid(name);
-        this.reprojectCustom();
-      }
-    });
-    return input;
-  }
-
-  private getCentroid(pattern: string, inRegexMode: boolean): CentroidResult {
-    if (pattern == null || pattern === '') {
-      return {numMatches: 0};
-    }
-    // Search by the original dataset since we often want to filter and project
-    // only the nearest neighbors of A onto B-C where B and C are not nearest
-    // neighbors of A.
-    let accessor = (i: number) => this.originalDataSet.points[i].vector;
-    let r = this.originalDataSet.query(
-        pattern, inRegexMode, this.customSelectedSearchByMetadataOption);
-    return {centroid: vector.centroid(r, accessor), numMatches: r.length};
-  }
-
-  getPcaSampledDimText() {
-    return data.PCA_SAMPLE_DIM.toLocaleString();
-  }
-
-  getPcaSampleSizeText() {
-    return data.PCA_SAMPLE_SIZE.toLocaleString();
-  }
-
-  getTsneSampleSizeText() {
-    return data.TSNE_SAMPLE_SIZE.toLocaleString();
-  }
-}
-
-document.registerElement(ProjectionsPanel.prototype.is, ProjectionsPanel);
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-util.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-util.ts
deleted file mode 100644
index 44062062a364b742e2de6467614e508d4e89d37a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-util.ts
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-export type Spec = {
-  is: string; properties?: {
-    [key: string]:
-        (Function |
-         {
-           type: Function, value?: any;
-           readonly?: boolean;
-           notify?: boolean;
-           observer?: string;
-         })
-  };
-  observers?: string[];
-};
-
-export function PolymerElement(spec: Spec) {
-  return Polymer.Class(spec as any) as{new (): PolymerHTMLElement};
-}
-
-export interface PolymerHTMLElement extends HTMLElement, polymer.Base {}
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector.html b/tensorflow/tensorboard/components/vz_projector/vz-projector.html
deleted file mode 100644
index 438ea9f4e978fa608eb0cabde35e9adf6f7e87fe..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector.html
+++ /dev/null
@@ -1,346 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../iron-collapse/iron-collapse.html">
-<link rel="import" href="../paper-toggle-button/paper-toggle-button.html">
-<link rel="import" href="../paper-listbox/paper-listbox.html">
-<link rel="import" href="../paper-item/paper-item.html">
-<link rel="import" href="../paper-checkbox/paper-checkbox.html">
-<link rel="import" href="../iron-icons/iron-icons.html">
-<link rel="import" href="../iron-icons/image-icons.html">
-<link rel="import" href="../paper-icon-button/paper-icon-button.html">
-<link rel="import" href="../paper-tooltip/paper-tooltip.html">
-<link rel="import" href="../paper-input/paper-input.html">
-<link rel="import" href="../paper-button/paper-button.html">
-<link rel="import" href="../paper-dialog/paper-dialog.html">
-<link rel="import" href="../paper-toast/paper-toast.html">
-<link rel="import" href="../paper-styles/typography.html">
-<link rel="import" href="../paper-spinner/paper-spinner-lite.html">
-<link rel="import" href="../paper-dialog-scrollable/paper-dialog-scrollable.html">
-<link rel="import" href="../tf-imports/threejs.html">
-
-<link rel="import" href="vz-projector-bookmark-panel.html">
-<link rel="import" href="vz-projector-data-panel.html">
-<link rel="import" href="vz-projector-inspector-panel.html">
-<link rel="import" href="vz-projector-input.html">
-<link rel="import" href="vz-projector-metadata-card.html">
-<link rel="import" href="vz-projector-projections-panel.html">
-<link rel="import" href="styles.html">
-<link rel="import" href="bundle.html">
-
-<dom-module id="vz-projector">
-<template>
-<style include="vz-projector-styles"></style>
-<style>
-:host {
-  display: flex;
-  width: 100%;
-  height: 100%;
-}
-
-#container {
-  display: flex;
-  width: 100%;
-  height: 100%;
-  overflow: hidden;
-}
-
-.hidden {
-  display: none !important;
-}
-
-/* Main */
-
-#main {
-  position: relative;
-  flex-grow: 2;
-}
-
-#main .stage {
-  position: relative;
-  flex-grow: 2;
-}
-
-#scatter {
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  bottom: 0;
-}
-
-#selector {
-  display: none;
-  height: 100%;
-  position: absolute;
-  width: 100%;
-}
-
-#left-pane {
-  display: flex;
-  flex-direction: column;
-  justify-content: space-between;
-  min-width: 312px;
-  width: 312px;
-  border-right: 1px solid rgba(0, 0, 0, 0.1);
-  background: #fafafa;
-}
-
-#right-pane {
-  border-left: 1px solid rgba(0, 0, 0, 0.1);
-  background: #fafafa;
-  display: flex;
-  height: 100%;
-  min-width: 300px;
-  width: 300px;
-}
-
-.file-name {
-  margin-right: 5px;
-}
-
-.control input[type=text]:focus {
-  outline: none;
-  border-bottom: 1px solid rgba(0, 0, 0, 1);
-}
-
-.control {
-  display: inline-block;
-  width: 45%;
-  vertical-align: top;
-  margin-right: 10px;
-  overflow-x: hidden;
-}
-
-.control.last {
-  margin-right: 0;
-}
-
-#notification-dialog {
-  width: 400px;
-  padding-bottom: 20px;
-}
-
-#notification-dialog paper-button {
-  background: none;
-  text-transform: uppercase;
-}
-
-#notification-dialog .progress {
-  --paper-spinner-color: #880E4F;
-  --paper-spinner-stroke-width: 2px;
-}
-
-#notify-msgs {
-  text-align: center;
-  display: block;
-}
-
-.notify-msg {
-  font-weight: 500;
-  margin: 0;
-  padding: 0;
-}
-
-.notify-msg.error {
-  text-align: left;
-}
-
-.brush .extent {
-  stroke: #fff;
-  fill-opacity: .125;
-  shape-rendering: crispEdges;
-}
-
-.origin text {
-  font-size: 12px;
-  font-weight: 500;
-}
-
-.origin line {
-  stroke: black;
-  stroke-opacity: 0.2;
-}
-
-/* Ink Framework */
-
-/* - Buttons */
-.ink-button, ::shadow .ink-button {
-  border: none;
-  border-radius: 2px;
-  font-size: 13px;
-  padding: 10px;
-  min-width: 100px;
-  flex-shrink: 0;
-  background: #e3e3e3;
-}
-
-.status-bar-panel {
-  display: flex;
-  align-items: center;
-}
-
-.status-bar-entry {
-  border-left: 1px solid rgba(0, 0, 0, 0.5);
-  margin-left: 5px;
-  padding-left: 5px;
-}
-
-/* - Menubar */
-
-.ink-panel-menubar {
-  align-items: center;
-  position: relative;
-  height: 60px;
-  border-bottom: solid 1px #eee;
-  padding: 0 24px;
-  display: flex;
-}
-
-.ink-panel-menubar .ink-fabs {
-  position: absolute;
-  right: 12px;
-  top: 40px;
-  z-index: 1;
-}
-
-#bookmark-panel {
-  bottom: 0;
-  position: absolute;
-  width: 300px;
-}
-#bookmark-panel-container {
-  bottom: 60px;
-  position: absolute;
-}
-
-.ink-fab {
-  margin-left: 8px;
-  border: 1px solid rgba(0, 0, 0, 0.02);
-  background: white;
-  box-shadow: 0 1px 3px rgba(0, 0, 0, 0.3);
-}
-
-#metadata-card {
-  position: absolute;
-  right: 5px;
-  top: 25px;
-}
-
-#help-3d-icon {
-  position: absolute;
-  top: 20px;
-  left: 20px;
-}
-
-#help3dDialog .main {
-  margin: 0;
-  padding: 20px;
-}
-
-#help3dDialog h3 {
-  margin-top: 20px;
-  margin-bottom: 5px;
-}
-
-#help3dDialog h3:first-child {
-  margin-top: 0;
-}
-
-#data-panel {
-  border-top: 1px solid rgba(0, 0, 0, 0.1);
-  overflow-y: auto;
-}
-
-#toast {
-  display: flex;
-  align-items: center;
-  --paper-toast-color: #eeff41;
-}
-</style>
-<paper-dialog id="notification-dialog" modal>
-  <h2 id="notification-title"></h2>
-  <paper-dialog-scrollable>
-    <div id="notify-msgs"></div>
-  </paper-dialog-scrollable>
-  <div style="text-align: center;"><paper-spinner-lite active class="progress"></paper-spinner-lite></div>
-  <div class="buttons">
-    <paper-button class="close-button" dialog-confirm autofocus>Close</paper-button>
-  </div>
-</paper-dialog>
-<div id="container">
-  <div id="left-pane" class="ink-panel">
-    <vz-projector-data-panel id="data-panel"></vz-projector-data-panel>
-    <vz-projector-projections-panel id="projections-panel"></vz-projector-projections-panel>
-  </div>
-  <div id="main" class="ink-panel">
-    <div class="ink-panel-menubar">
-      <paper-icon-button id="selectMode" alt="Bounding box selection" toggles icon="image:photo-size-select-small"></paper-icon-button>
-      <paper-tooltip for="selectMode" position="bottom" animation-delay="0" fit-to-visible-bounds>Bounding box selection</paper-tooltip>
-
-      <paper-icon-button id="nightDayMode" alt="Enable/disable night mode" toggles icon="image:brightness-2"></paper-icon-button>
-      <paper-tooltip for="nightDayMode" position="bottom" animation-delay="0" fit-to-visible-bounds>Enable/disable night mode</paper-tooltip>
-
-      <paper-icon-button id="labels3DMode" alt="Enable/disable 3D labels mode" toggles icon="font-download"></paper-icon-button>
-      <paper-tooltip for="labels3DMode" position="bottom" animation-delay="0" fit-to-visible-bounds>Enable/disable 3D labels mode</paper-tooltip>
-      <div class="status-bar-panel">
-        <div class="status-bar-entry">Points: <span class="numDataPoints">Loading...</span></div>
-        <div class="status-bar-entry">Dimension: <span class="dim">Loading...</span></div>
-        <div id="status-bar" class="status-bar-entry" style="display: none;"></div>
-      </div>
-      <div class="ink-fabs">
-        <paper-icon-button id="reset-zoom" class="ink-fab" alt="Reset zoom to fit all points" icon="home"></paper-icon-button>
-        <paper-tooltip for="reset-zoom" position="left" animation-delay="0">Reset zoom to fit all points</paper-tooltip>
-      </div>
-    </div>
-    <div class="stage">
-      <div id="scatter">
-        <svg id="selector"></svg>
-      </div>
-      <vz-projector-metadata-card id="metadata-card"></vz-projector-metadata-card>
-      <paper-icon-button raised onclick="help3dDialog.open()" icon="help-outline" id="help-3d-icon"></paper-icon-button>
-      <paper-tooltip animation-delay="0" for="help-3d-icon">Help with interaction controls.</paper-tooltip>
-      <paper-dialog id="help3dDialog" with-backdrop>
-        <div class="main" dialog-confirm autofocus>
-          <h3>3D controls</h3>
-            <b>Rotate</b> Mouse left click.<br/>
-            <b>Pan</b> Mouse right click.<br/>
-            <b>Zoom</b> Mouse wheel.<br/>
-            Holding <b>ctrl</b> reverses the mouse clicks.
-          <h3>2D controls</h3>
-            <b>Pan</b> Mouse left click.<br/>
-            <b>Zoom</b> Mouse wheel.
-          <div class="dismiss-dialog-note"> Click anywhere to dismiss.</div>
-        </div>
-      </paper-dialog>
-    </div>
-  </div>
-  <div id="right-pane" class="ink-panel">
-    <div class="ink-panel-content active">
-      <vz-projector-inspector-panel id="inspector-panel"></vz-projector-inspector-panel>
-    </div>
-    <div id="bookmark-panel-container">
-      <vz-projector-bookmark-panel id="bookmark-panel"></vz-projector-bookmark-panel>
-    </div>
-  </div>
-</div>
-<paper-toast id="toast" always-on-top></paper-toast>
-
-</template> <!-- global template -->
-<script src="vz-projector.js"></script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector.ts
deleted file mode 100644
index bf98a4d478599f7b859e893e7a17567f22fd5114..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector.ts
+++ /dev/null
@@ -1,570 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {AnalyticsLogger} from './analyticsLogger';
-import * as data from './data';
-import {ColorOption, ColumnStats, DataPoint, DataProto, DataSet, DistanceFunction, PointMetadata, Projection, SpriteAndMetadataInfo, State, stateGetAccessorDimensions} from './data';
-import {DataProvider, EmbeddingInfo, ServingMode} from './data-provider';
-import {DemoDataProvider} from './data-provider-demo';
-import {ProtoDataProvider} from './data-provider-proto';
-import {ServerDataProvider} from './data-provider-server';
-import * as knn from './knn';
-import * as logging from './logging';
-import {DistanceMetricChangedListener, HoverListener, ProjectionChangedListener, ProjectorEventContext, SelectionChangedListener} from './projectorEventContext';
-import {ProjectorScatterPlotAdapter} from './projectorScatterPlotAdapter';
-import {MouseMode} from './scatterPlot';
-import * as util from './util';
-import {BookmarkPanel} from './vz-projector-bookmark-panel';
-import {DataPanel} from './vz-projector-data-panel';
-import {InspectorPanel} from './vz-projector-inspector-panel';
-import {MetadataCard} from './vz-projector-metadata-card';
-import {ProjectionsPanel} from './vz-projector-projections-panel';
-// tslint:disable-next-line:no-unused-variable
-import {PolymerElement, PolymerHTMLElement} from './vz-projector-util';
-
-/**
- * The minimum number of dimensions the data should have to automatically
- * decide to normalize the data.
- */
-const THRESHOLD_DIM_NORMALIZE = 50;
-const POINT_COLOR_MISSING = 'black';
-
-export let ProjectorPolymer = PolymerElement({
-  is: 'vz-projector',
-  properties: {
-    routePrefix: String,
-    dataProto: {type: String, observer: '_dataProtoChanged'},
-    servingMode: String,
-    projectorConfigJsonPath: String,
-    pageViewLogging: Boolean,
-    eventLogging: Boolean
-  }
-});
-
-const INDEX_METADATA_FIELD = '__index__';
-
-export class Projector extends ProjectorPolymer implements
-    ProjectorEventContext {
-  // The working subset of the data source's original data set.
-  dataSet: DataSet;
-  servingMode: ServingMode;
-  // The path to the projector config JSON file for demo mode.
-  projectorConfigJsonPath: string;
-
-  private selectionChangedListeners: SelectionChangedListener[];
-  private hoverListeners: HoverListener[];
-  private projectionChangedListeners: ProjectionChangedListener[];
-  private distanceMetricChangedListeners: DistanceMetricChangedListener[];
-
-  private originalDataSet: DataSet;
-  private dataSetBeforeFilter: DataSet;
-  private projectorScatterPlotAdapter: ProjectorScatterPlotAdapter;
-  private dim: number;
-
-  private dataSetFilterIndices: number[];
-  private selectedPointIndices: number[];
-  private neighborsOfFirstPoint: knn.NearestEntry[];
-  private hoverPointIndex: number;
-
-  private dataProvider: DataProvider;
-  private inspectorPanel: InspectorPanel;
-
-  private selectedColorOption: ColorOption;
-  private selectedLabelOption: string;
-  private routePrefix: string;
-  private normalizeData: boolean;
-  private projection: Projection;
-
-  /** Polymer component panels */
-  private dataPanel: DataPanel;
-  private bookmarkPanel: BookmarkPanel;
-  private projectionsPanel: ProjectionsPanel;
-  private metadataCard: MetadataCard;
-
-  private statusBar: HTMLDivElement;
-  private analyticsLogger: AnalyticsLogger;
-  private eventLogging: boolean;
-  private pageViewLogging: boolean;
-
-  ready() {
-    logging.setDomContainer(this);
-
-    this.analyticsLogger =
-        new AnalyticsLogger(this.pageViewLogging, this.eventLogging);
-    this.analyticsLogger.logPageView('embeddings');
-
-    if (!util.hasWebGLSupport()) {
-      this.analyticsLogger.logWebGLDisabled();
-      logging.setErrorMessage(
-          'Your browser or device does not have WebGL enabled. Please enable ' +
-          'hardware acceleration, or use a browser that supports WebGL.');
-      return;
-    }
-
-    this.selectionChangedListeners = [];
-    this.hoverListeners = [];
-    this.projectionChangedListeners = [];
-    this.distanceMetricChangedListeners = [];
-    this.selectedPointIndices = [];
-    this.neighborsOfFirstPoint = [];
-
-    this.dataPanel = this.$['data-panel'] as DataPanel;
-    this.inspectorPanel = this.$['inspector-panel'] as InspectorPanel;
-    this.inspectorPanel.initialize(this, this as ProjectorEventContext);
-    this.projectionsPanel = this.$['projections-panel'] as ProjectionsPanel;
-    this.projectionsPanel.initialize(this);
-    this.bookmarkPanel = this.$['bookmark-panel'] as BookmarkPanel;
-    this.bookmarkPanel.initialize(this, this as ProjectorEventContext);
-    this.metadataCard = this.$['metadata-card'] as MetadataCard;
-    this.statusBar = this.querySelector('#status-bar') as HTMLDivElement;
-    this.scopeSubtree(this.$$('#notification-dialog'), true);
-    this.setupUIControls();
-    this.initializeDataProvider();
-  }
-
-  setSelectedLabelOption(labelOption: string) {
-    this.selectedLabelOption = labelOption;
-    this.metadataCard.setLabelOption(this.selectedLabelOption);
-    this.projectorScatterPlotAdapter.setLabelPointAccessor(labelOption);
-    this.projectorScatterPlotAdapter.updateScatterPlotAttributes();
-    this.projectorScatterPlotAdapter.render();
-  }
-
-  setSelectedColorOption(colorOption: ColorOption) {
-    this.selectedColorOption = colorOption;
-    this.projectorScatterPlotAdapter.setLegendPointColorer(
-        this.getLegendPointColorer(colorOption));
-    this.projectorScatterPlotAdapter.updateScatterPlotAttributes();
-    this.projectorScatterPlotAdapter.render();
-  }
-
-  setNormalizeData(normalizeData: boolean) {
-    this.normalizeData = normalizeData;
-    this.setCurrentDataSet(this.originalDataSet.getSubset());
-  }
-
-  updateDataSet(
-      ds: DataSet, spriteAndMetadata?: SpriteAndMetadataInfo,
-      metadataFile?: string) {
-    this.dataSetFilterIndices = null;
-    this.originalDataSet = ds;
-    if (ds != null) {
-      this.normalizeData =
-          this.originalDataSet.dim[1] >= THRESHOLD_DIM_NORMALIZE;
-      spriteAndMetadata = spriteAndMetadata || {};
-      if (spriteAndMetadata.pointsInfo == null) {
-        let [pointsInfo, stats] = this.makeDefaultPointsInfoAndStats(ds.points);
-        spriteAndMetadata.pointsInfo = pointsInfo;
-        spriteAndMetadata.stats = stats;
-      }
-      let metadataMergeSucceeded = ds.mergeMetadata(spriteAndMetadata);
-      if (!metadataMergeSucceeded) {
-        return;
-      }
-    }
-    if (this.projectorScatterPlotAdapter != null) {
-      if (ds == null) {
-        this.projectorScatterPlotAdapter.setLabelPointAccessor(null);
-        this.setProjection(null);
-      } else {
-        this.projectorScatterPlotAdapter.updateScatterPlotPositions();
-        this.projectorScatterPlotAdapter.updateScatterPlotAttributes();
-        this.projectorScatterPlotAdapter.resize();
-        this.projectorScatterPlotAdapter.render();
-      }
-    }
-    if (ds != null) {
-      this.dataPanel.setNormalizeData(this.normalizeData);
-      this.setCurrentDataSet(ds.getSubset());
-      this.projectorScatterPlotAdapter.setLabelPointAccessor(
-          this.selectedLabelOption);
-      this.inspectorPanel.datasetChanged();
-
-      this.inspectorPanel.metadataChanged(spriteAndMetadata);
-      this.projectionsPanel.metadataChanged(spriteAndMetadata);
-      this.dataPanel.metadataChanged(spriteAndMetadata, metadataFile);
-      // Set the container to a fixed height, otherwise in Colab the
-      // height can grow indefinitely.
-      const container = this.querySelector('#container') as HTMLDivElement;
-      container.style.height = container.clientHeight + 'px';
-    } else {
-      this.setCurrentDataSet(null);
-    }
-  }
-
-  setSelectedTensor(run: string, tensorInfo: EmbeddingInfo) {
-    this.bookmarkPanel.setSelectedTensor(run, tensorInfo, this.dataProvider);
-  }
-
-  /**
-   * Registers a listener to be called any time the selected point set changes.
-   */
-  registerSelectionChangedListener(listener: SelectionChangedListener) {
-    this.selectionChangedListeners.push(listener);
-  }
-
-  filterDataset(pointIndices: number[]) {
-    const selectionSize = this.selectedPointIndices.length;
-    if (this.dataSetBeforeFilter == null) {
-      this.dataSetBeforeFilter = this.dataSet;
-    }
-    this.setCurrentDataSet(this.dataSet.getSubset(pointIndices));
-    this.dataSetFilterIndices = pointIndices;
-    this.projectorScatterPlotAdapter.updateScatterPlotPositions();
-    this.projectorScatterPlotAdapter.updateScatterPlotAttributes();
-    this.adjustSelectionAndHover(util.range(selectionSize));
-  }
-
-  resetFilterDataset() {
-    const originalPointIndices = this.selectedPointIndices.map(
-        filteredIndex => this.dataSet.points[filteredIndex].index);
-    this.setCurrentDataSet(this.dataSetBeforeFilter);
-    if (this.projection != null) {
-      this.projection.dataSet = this.dataSetBeforeFilter;
-    }
-    this.dataSetBeforeFilter = null;
-    this.projectorScatterPlotAdapter.updateScatterPlotPositions();
-    this.projectorScatterPlotAdapter.updateScatterPlotAttributes();
-    this.dataSetFilterIndices = [];
-    this.adjustSelectionAndHover(originalPointIndices);
-  }
-
-  /**
-   * Used by clients to indicate that a selection has occurred.
-   */
-  notifySelectionChanged(newSelectedPointIndices: number[]) {
-    this.selectedPointIndices = newSelectedPointIndices;
-    let neighbors: knn.NearestEntry[] = [];
-
-    if (newSelectedPointIndices.length === 1) {
-      neighbors = this.dataSet.findNeighbors(
-          newSelectedPointIndices[0], this.inspectorPanel.distFunc,
-          this.inspectorPanel.numNN);
-      this.metadataCard.updateMetadata(
-          this.dataSet.points[newSelectedPointIndices[0]].metadata);
-    } else {
-      this.metadataCard.updateMetadata(null);
-    }
-
-    this.selectionChangedListeners.forEach(
-        l => l(this.selectedPointIndices, neighbors));
-  }
-
-  /**
-   * Registers a listener to be called any time the mouse hovers over a point.
-   */
-  registerHoverListener(listener: HoverListener) {
-    this.hoverListeners.push(listener);
-  }
-
-  /**
-   * Used by clients to indicate that a hover is occurring.
-   */
-  notifyHoverOverPoint(pointIndex: number) {
-    this.hoverListeners.forEach(l => l(pointIndex));
-  }
-
-  registerProjectionChangedListener(listener: ProjectionChangedListener) {
-    this.projectionChangedListeners.push(listener);
-  }
-
-  notifyProjectionChanged(projection: Projection) {
-    this.projectionChangedListeners.forEach(l => l(projection));
-  }
-
-  registerDistanceMetricChangedListener(l: DistanceMetricChangedListener) {
-    this.distanceMetricChangedListeners.push(l);
-  }
-
-  notifyDistanceMetricChanged(distMetric: DistanceFunction) {
-    this.distanceMetricChangedListeners.forEach(l => l(distMetric));
-  }
-
-  _dataProtoChanged(dataProtoString: string) {
-    let dataProto =
-        dataProtoString ? JSON.parse(dataProtoString) as DataProto : null;
-    this.initializeDataProvider(dataProto);
-  }
-
-  private makeDefaultPointsInfoAndStats(points: DataPoint[]):
-      [PointMetadata[], ColumnStats[]] {
-    let pointsInfo: PointMetadata[] = [];
-    points.forEach(p => {
-      let pointInfo: PointMetadata = {};
-      pointInfo[INDEX_METADATA_FIELD] = p.index;
-      pointsInfo.push(pointInfo);
-    });
-    let stats: ColumnStats[] = [{
-      name: INDEX_METADATA_FIELD,
-      isNumeric: false,
-      tooManyUniqueValues: true,
-      min: 0,
-      max: pointsInfo.length - 1
-    }];
-    return [pointsInfo, stats];
-  }
-
-  private initializeDataProvider(dataProto?: DataProto) {
-    if (this.servingMode === 'demo') {
-      let projectorConfigUrl: string;
-
-      // Only in demo mode do we allow the config being passed via URL.
-      let urlParams = util.getURLParams(window.location.search);
-      if ('config' in urlParams) {
-        projectorConfigUrl = urlParams['config'];
-      } else {
-        projectorConfigUrl = this.projectorConfigJsonPath;
-      }
-      this.dataProvider = new DemoDataProvider(projectorConfigUrl);
-    } else if (this.servingMode === 'server') {
-      if (!this.routePrefix) {
-        throw 'route-prefix is a required parameter';
-      }
-      this.dataProvider = new ServerDataProvider(this.routePrefix);
-    } else if (this.servingMode === 'proto' && dataProto != null) {
-      this.dataProvider = new ProtoDataProvider(dataProto);
-    }
-
-    this.dataPanel.initialize(this, this.dataProvider);
-  }
-
-  private getLegendPointColorer(colorOption: ColorOption):
-      (ds: DataSet, index: number) => string {
-    if ((colorOption == null) || (colorOption.map == null)) {
-      return null;
-    }
-    const colorer = (ds: DataSet, i: number) => {
-      let value = ds.points[i].metadata[this.selectedColorOption.name];
-      if (value == null) {
-        return POINT_COLOR_MISSING;
-      }
-      return colorOption.map(value);
-    };
-    return colorer;
-  }
-
-  private get3DLabelModeButton(): any {
-    return this.querySelector('#labels3DMode');
-  }
-
-  private get3DLabelMode(): boolean {
-    const label3DModeButton = this.get3DLabelModeButton();
-    return (label3DModeButton as any).active;
-  }
-
-  adjustSelectionAndHover(selectedPointIndices: number[], hoverIndex?: number) {
-    this.notifySelectionChanged(selectedPointIndices);
-    this.notifyHoverOverPoint(hoverIndex);
-    this.setMouseMode(MouseMode.CAMERA_AND_CLICK_SELECT);
-  }
-
-  private setMouseMode(mouseMode: MouseMode) {
-    let selectModeButton = this.querySelector('#selectMode');
-    (selectModeButton as any).active = (mouseMode === MouseMode.AREA_SELECT);
-    this.projectorScatterPlotAdapter.scatterPlot.setMouseMode(mouseMode);
-  }
-
-  private setCurrentDataSet(ds: DataSet) {
-    this.adjustSelectionAndHover([]);
-    if (this.dataSet != null) {
-      this.dataSet.stopTSNE();
-    }
-    if ((ds != null) && this.normalizeData) {
-      ds.normalize();
-    }
-    this.dim = (ds == null) ? 0 : ds.dim[1];
-    (this.querySelector('span.numDataPoints') as HTMLSpanElement).innerText =
-        (ds == null) ? '0' : '' + ds.dim[0];
-    (this.querySelector('span.dim') as HTMLSpanElement).innerText =
-        (ds == null) ? '0' : '' + ds.dim[1];
-
-    this.dataSet = ds;
-
-    this.projectionsPanel.dataSetUpdated(
-        this.dataSet, this.originalDataSet, this.dim);
-
-    this.projectorScatterPlotAdapter.setDataSet(this.dataSet);
-    this.projectorScatterPlotAdapter.scatterPlot
-        .setCameraParametersForNextCameraCreation(null, true);
-  }
-
-  private setupUIControls() {
-    // View controls
-    this.querySelector('#reset-zoom').addEventListener('click', () => {
-      this.projectorScatterPlotAdapter.scatterPlot.resetZoom();
-      this.projectorScatterPlotAdapter.scatterPlot.startOrbitAnimation();
-    });
-
-    let selectModeButton = this.querySelector('#selectMode');
-    selectModeButton.addEventListener('click', (event) => {
-      this.setMouseMode(
-          (selectModeButton as any).active ? MouseMode.AREA_SELECT :
-                                             MouseMode.CAMERA_AND_CLICK_SELECT);
-    });
-    let nightModeButton = this.querySelector('#nightDayMode');
-    nightModeButton.addEventListener('click', () => {
-      this.projectorScatterPlotAdapter.scatterPlot.setDayNightMode(
-          (nightModeButton as any).active);
-    });
-
-    const labels3DModeButton = this.get3DLabelModeButton();
-    labels3DModeButton.addEventListener('click', () => {
-      this.projectorScatterPlotAdapter.set3DLabelMode(this.get3DLabelMode());
-    });
-
-    window.addEventListener('resize', () => {
-      const container = this.querySelector('#container') as HTMLDivElement;
-      const parentHeight = (container.parentNode as HTMLElement).clientHeight;
-      container.style.height = parentHeight + 'px';
-      this.projectorScatterPlotAdapter.resize();
-    });
-
-    {
-      this.projectorScatterPlotAdapter = new ProjectorScatterPlotAdapter(
-          this.getScatterContainer(), this as ProjectorEventContext);
-      this.projectorScatterPlotAdapter.setLabelPointAccessor(
-          this.selectedLabelOption);
-    }
-
-    this.projectorScatterPlotAdapter.scatterPlot.onCameraMove(
-        (cameraPosition: THREE.Vector3, cameraTarget: THREE.Vector3) =>
-            this.bookmarkPanel.clearStateSelection());
-
-    this.registerHoverListener(
-        (hoverIndex: number) => this.onHover(hoverIndex));
-
-    this.registerSelectionChangedListener(
-        (selectedPointIndices: number[],
-         neighborsOfFirstPoint: knn.NearestEntry[]) =>
-            this.onSelectionChanged(
-                selectedPointIndices, neighborsOfFirstPoint));
-  }
-
-  private onHover(hoverIndex: number) {
-    this.hoverPointIndex = hoverIndex;
-    let hoverText = null;
-    if (hoverIndex != null) {
-      const point = this.dataSet.points[hoverIndex];
-      if (point.metadata[this.selectedLabelOption]) {
-        hoverText = point.metadata[this.selectedLabelOption].toString();
-      }
-    }
-    if (this.selectedPointIndices.length === 0) {
-      this.statusBar.style.display = hoverText ? null : 'none';
-      this.statusBar.innerText = hoverText;
-    }
-  }
-
-  private getScatterContainer(): HTMLDivElement {
-    return this.querySelector('#scatter') as HTMLDivElement;
-  }
-
-  private onSelectionChanged(
-      selectedPointIndices: number[],
-      neighborsOfFirstPoint: knn.NearestEntry[]) {
-    this.selectedPointIndices = selectedPointIndices;
-    this.neighborsOfFirstPoint = neighborsOfFirstPoint;
-    let totalNumPoints =
-        this.selectedPointIndices.length + neighborsOfFirstPoint.length;
-    this.statusBar.innerText = `Selected ${totalNumPoints} points`;
-    this.statusBar.style.display = totalNumPoints > 0 ? null : 'none';
-  }
-
-  setProjection(projection: Projection) {
-    this.projection = projection;
-    if (projection != null) {
-      this.analyticsLogger.logProjectionChanged(projection.projectionType);
-    }
-    this.notifyProjectionChanged(projection);
-  }
-
-  notifyProjectionPositionsUpdated() {
-    this.projectorScatterPlotAdapter.notifyProjectionPositionsUpdated();
-  }
-
-  /**
-   * Gets the current view of the embedding and saves it as a State object.
-   */
-  getCurrentState(): State {
-    const state = new State();
-
-    // Save the individual datapoint projections.
-    state.projections = [];
-    for (let i = 0; i < this.dataSet.points.length; i++) {
-      const point = this.dataSet.points[i];
-      const projections: {[key: string]: number} = {};
-      const keys = Object.keys(point.projections);
-      for (let j = 0; j < keys.length; ++j) {
-        projections[keys[j]] = point.projections[keys[j]];
-      }
-      state.projections.push(projections);
-    }
-    state.selectedProjection = this.projection.projectionType;
-    state.dataSetDimensions = this.dataSet.dim;
-    state.tSNEIteration = this.dataSet.tSNEIteration;
-    state.selectedPoints = this.selectedPointIndices;
-    state.filteredPoints = this.dataSetFilterIndices;
-    this.projectorScatterPlotAdapter.populateBookmarkFromUI(state);
-    state.selectedColorOptionName = this.dataPanel.selectedColorOptionName;
-    state.forceCategoricalColoring = this.dataPanel.forceCategoricalColoring;
-    state.selectedLabelOption = this.selectedLabelOption;
-    this.projectionsPanel.populateBookmarkFromUI(state);
-    return state;
-  }
-
-  /** Loads a State object into the world. */
-  loadState(state: State) {
-    this.setProjection(null);
-    {
-      this.projectionsPanel.disablePolymerChangesTriggerReprojection();
-      if (this.dataSetBeforeFilter != null) {
-        this.resetFilterDataset();
-      }
-      if (state.filteredPoints != null) {
-        this.filterDataset(state.filteredPoints);
-      }
-      this.projectionsPanel.enablePolymerChangesTriggerReprojection();
-    }
-    for (let i = 0; i < state.projections.length; i++) {
-      const point = this.dataSet.points[i];
-      const projection = state.projections[i];
-      const keys = Object.keys(projection);
-      for (let j = 0; j < keys.length; ++j) {
-        point.projections[keys[j]] = projection[keys[j]];
-      }
-    }
-    this.dataSet.hasTSNERun = (state.selectedProjection === 'tsne');
-    this.dataSet.tSNEIteration = state.tSNEIteration;
-    this.projectionsPanel.restoreUIFromBookmark(state);
-    this.inspectorPanel.restoreUIFromBookmark(state);
-    this.dataPanel.selectedColorOptionName = state.selectedColorOptionName;
-    this.dataPanel.setForceCategoricalColoring(
-        !!state.forceCategoricalColoring);
-    this.selectedLabelOption = state.selectedLabelOption;
-    this.projectorScatterPlotAdapter.restoreUIFromBookmark(state);
-    {
-      const dimensions = stateGetAccessorDimensions(state);
-      const components =
-          data.getProjectionComponents(state.selectedProjection, dimensions);
-      const projection = new Projection(
-          state.selectedProjection, components, dimensions.length,
-          this.dataSet);
-      this.setProjection(projection);
-    }
-    this.notifySelectionChanged(state.selectedPoints);
-  }
-}
-
-document.registerElement(Projector.prototype.is, Projector);
diff --git a/tensorflow/tensorboard/components/vz_sorting/BUILD b/tensorflow/tensorboard/components/vz_sorting/BUILD
deleted file mode 100644
index e06b8ae19790490e73d3ceb552ea03d9f304e68d..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_sorting/BUILD
+++ /dev/null
@@ -1,28 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "tensorboard_webcomponent_library")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "vz_sorting",
-    srcs = [
-        "sorting.ts",
-        "vz-sorting.html",
-    ],
-    path = "/vz-sorting",
-    visibility = ["//visibility:public"],
-)
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [":vz_sorting"],
-    destdir = "vz-sorting",
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/vz_sorting/sorting.ts b/tensorflow/tensorboard/components/vz_sorting/sorting.ts
deleted file mode 100644
index 061184d24bf30623e05834269b32acf745a56299..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_sorting/sorting.ts
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/**
- * Compares tag names asciinumerically broken into components.
- *
- * <p>This is the comparison function used for sorting most string values in
- * TensorBoard. Unlike the standard asciibetical comparator, this function
- * knows that 'a10b' > 'a2b'. Fixed point and engineering notation are
- * supported. This function also splits the input by slash and underscore to
- * perform array comparison. Therefore it knows that 'a/a' < 'a+/a' even
- * though '+' < '/' in the ASCII table.
- */
-export function compareTagNames(a, b: string): number {
-  let ai = 0;
-  let bi = 0;
-  while (true) {
-    if (ai === a.length) {
-      return bi === b.length ? 0 : -1;
-    }
-    if (bi === b.length) {
-      return 1;
-    }
-    if (isDigit(a[ai]) && isDigit(b[bi])) {
-      const ais = ai;
-      const bis = bi;
-      ai = consumeNumber(a, ai + 1);
-      bi = consumeNumber(b, bi + 1);
-      const an = parseFloat(a.slice(ais, ai));
-      const bn = parseFloat(b.slice(bis, bi));
-      if (an < bn) {
-        return -1;
-      }
-      if (an > bn) {
-        return 1;
-      }
-      continue;
-    }
-    if (isBreak(a[ai])) {
-      if (!isBreak(b[bi])) {
-        return -1;
-      }
-    } else if (isBreak(b[bi])) {
-      return 1;
-    } else if (a[ai] < b[bi]) {
-      return -1;
-    } else if (a[ai] > b[bi]) {
-      return 1;
-    }
-    ai++;
-    bi++;
-  }
-}
-
-function consumeNumber(s: string, i: number): number {
-  enum State { NATURAL, REAL, EXPONENT_SIGN, EXPONENT }
-  let state = State.NATURAL;
-  for (; i < s.length; i++) {
-    if (state === State.NATURAL) {
-      if (s[i] === '.') {
-        state = State.REAL;
-      } else if (s[i] === 'e' || s[i] === 'E') {
-        state = State.EXPONENT_SIGN;
-      } else if (!isDigit(s[i])) {
-        break;
-      }
-    } else if (state === State.REAL) {
-      if (s[i] === 'e' || s[i] === 'E') {
-        state = State.EXPONENT_SIGN;
-      } else if (!isDigit(s[i])) {
-        break;
-      }
-    } else if (state === State.EXPONENT_SIGN) {
-      if (isDigit(s[i]) || s[i] === '+' || s[i] === '-') {
-        state = State.EXPONENT;
-      } else {
-        break;
-      }
-    } else if (state === State.EXPONENT) {
-      if (!isDigit(s[i])) {
-        break;
-      }
-    }
-  }
-  return i;
-}
-
-function isDigit(c: string): boolean {
-  return '0' <= c && c <= '9';
-}
-
-function isBreak(c: string): boolean {
-  // TODO(jart): Remove underscore when people stop using it like a slash.
-  return c === '/' || c === '_' || isDigit(c);
-}
diff --git a/tensorflow/tensorboard/components/vz_sorting/test/BUILD b/tensorflow/tensorboard/components/vz_sorting/test/BUILD
deleted file mode 100644
index 929e80d37282387823ea4a93874a112710269cc1..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_sorting/test/BUILD
+++ /dev/null
@@ -1,37 +0,0 @@
-package(
-    default_testonly = True,
-    default_visibility = ["//tensorflow/tensorboard:internal"],
-)
-
-load("//tensorflow/tensorboard/defs:vulcanize.bzl", "tensorboard_html_binary")
-load("//tensorflow/tensorboard/defs:web.bzl", "ts_web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-ts_web_library(
-    name = "test",
-    srcs = [
-        "sortingTests.ts",
-        "tests.html",
-    ],
-    path = "/vz-sorting/test",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports:web_component_tester",
-        "//tensorflow/tensorboard/components/vz_sorting",
-    ],
-)
-
-tensorboard_html_binary(
-    name = "devserver",
-    compilation_level = "WHITESPACE_ONLY",
-    input_path = "/vz-sorting/test/tests.html",
-    output_path = "/vz-sorting/test/tests.html",
-    deps = [":test"],
-)
-
-filegroup(
-    name = "all_files",
-    testonly = 0,
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/vz_sorting/test/sortingTests.ts b/tensorflow/tensorboard/components/vz_sorting/test/sortingTests.ts
deleted file mode 100644
index 510685cb4b5e42ca19e56acef6b1f87347811c99..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_sorting/test/sortingTests.ts
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-import {compareTagNames} from '../sorting';
-
-describe('compareTagNames', () => {
-
-  const assert = chai.assert;
-  const sortTagNames = (a) => a.sort(compareTagNames);
-
-  it('is asciibetical', () => {
-    assert.deepEqual(sortTagNames(['a', 'b']), ['a', 'b']);
-    assert.deepEqual(sortTagNames(['a', 'B']), ['B', 'a']);
-  });
-
-  it('sorts integer portions', () => {
-    assert.deepEqual(['03', '1'].sort(), ['03', '1']);
-    assert.deepEqual(sortTagNames(['03', '1']), ['1', '03']);
-    assert.deepEqual(sortTagNames(['a03', 'a1']), ['a1', 'a03']);
-    assert.deepEqual(sortTagNames(['a03', 'b1']), ['a03', 'b1']);
-    assert.deepEqual(sortTagNames(['x0a03', 'x0a1']), ['x0a1', 'x0a03']);
-    assert.deepEqual(sortTagNames(['a/b/03', 'a/b/1']), ['a/b/1', 'a/b/03']);
-  });
-
-  it('sorts fixed point numbers', () => {
-    assert.deepEqual(sortTagNames(['a0.1', 'a0.01']), ['a0.01', 'a0.1']);
-  });
-
-  it('sorts engineering notation', () => {
-    assert.deepEqual(sortTagNames(['a1e9', 'a9e8']), ['a9e8', 'a1e9']);
-    assert.deepEqual(sortTagNames(['a1e+9', 'a9e+8']), ['a9e+8', 'a1e+9']);
-    assert.deepEqual(sortTagNames(['a1e+5', 'a9e-6']), ['a9e-6', 'a1e+5']);
-    assert.deepEqual(sortTagNames(['a1.0e9', 'a9.0e8']), ['a9.0e8', 'a1.0e9']);
-    assert.deepEqual(
-        sortTagNames(['a1.0e+9', 'a9.0e+8']), ['a9.0e+8', 'a1.0e+9']);
-  });
-
-  it('is componentized by slash', () => {
-    assert.deepEqual(['a+/a', 'a/a', 'ab/a'].sort(), ['a+/a', 'a/a', 'ab/a']);
-    assert.deepEqual(
-        sortTagNames(['a+/a', 'a/a', 'ab/a']), ['a/a', 'a+/a', 'ab/a']);
-  });
-
-  it('is componentized by underscore', () => {
-    assert.deepEqual(
-        sortTagNames(['a+_a', 'a_a', 'ab_a']), ['a_a', 'a+_a', 'ab_a']);
-    assert.deepEqual(
-        sortTagNames(['a+/a', 'a_a', 'ab_a']), ['a_a', 'a+/a', 'ab_a']);
-  });
-
-  it('is componentized by number boundaries', () => {
-    assert.deepEqual(
-        sortTagNames(['a+0a', 'a0a', 'ab0a']), ['a0a', 'a+0a', 'ab0a']);
-  });
-
-  it('empty comes first', () => {
-    assert.deepEqual(sortTagNames(['a', '//', '/', '']), ['', '/', '//', 'a']);
-  });
-
-  it('decimal parsed correctly', () => {
-    assert.deepEqual(sortTagNames(['0.2', '0.03']), ['0.03', '0.2']);
-    assert.deepEqual(sortTagNames(['0..2', '0..03']), ['0..2', '0..03']);
-    assert.deepEqual(sortTagNames(['.2', '.03']), ['.2', '.03']);
-  });
-});
diff --git a/tensorflow/tensorboard/components/vz_sorting/test/tests.html b/tensorflow/tensorboard/components/vz_sorting/test/tests.html
deleted file mode 100644
index f92c608cdb125ec7e6d6b538d089f2779732ce6a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_sorting/test/tests.html
+++ /dev/null
@@ -1,23 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<meta charset="utf-8">
-<script vulcanize-noinline src="../../web-component-tester/browser.js"></script>
-<link rel="import" href="../vz-sorting.html">
-<body>
-<script src="sortingTests.js"></script>
diff --git a/tensorflow/tensorboard/components/vz_sorting/vz-sorting.html b/tensorflow/tensorboard/components/vz_sorting/vz-sorting.html
deleted file mode 100644
index 5ff6f311589d2ef1c65dbfb052d255390c36991f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_sorting/vz-sorting.html
+++ /dev/null
@@ -1,18 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<script src="sorting.js"></script>
diff --git a/tensorflow/tensorboard/defs/BUILD b/tensorflow/tensorboard/defs/BUILD
deleted file mode 100644
index 92a2af34048deaf6da07a7b14aa42e4cd8202958..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/defs/BUILD
+++ /dev/null
@@ -1,14 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-filegroup(
-    name = "ts_web_library_default_typings",
-    srcs = [
-        # Ordering probably matters.
-        "@com_microsoft_typescript//:lib.es6.d.ts",
-        "@io_angular_clutz//:src/resources/closure.lib.d.ts",
-        "clutz.d.ts",
-    ],
-    visibility = ["//visibility:public"],
-)
diff --git a/tensorflow/tensorboard/defs/clutz.d.ts b/tensorflow/tensorboard/defs/clutz.d.ts
deleted file mode 100644
index 47cf307d2619a4a84f631dceb03b393cd04aa0d6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/defs/clutz.d.ts
+++ /dev/null
@@ -1,19 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// tslint:disable
-declare namespace ಠ_ಠ.clutz {
-  interface IteratorIterable<T> extends Iterator<T>, Iterable<T> {}
-  interface IIterableResult<T> extends IteratorResult<T> {}
-}
diff --git a/tensorflow/tensorboard/defs/hacks.bzl b/tensorflow/tensorboard/defs/hacks.bzl
deleted file mode 100644
index f1d4be790612ac912dc1b1a2298f8bc8dd99dee6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/defs/hacks.bzl
+++ /dev/null
@@ -1,80 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# TODO(jart): Merge this file into defs.bzl once that file is sync unified.
-
-def tensorboard_typescript_bundle(
-    name,
-    out,
-    namespace_srcs,
-    namespace_symbol_aliases={},
-    namespace_symbol_aliases_public={},
-    **kwargs):
-  """Rolls TypeScript ES6 modules into one vanilla source file without imports.
-
-  This is a genrule wrapper that concatenates TypeScripts sources inside
-  namespace blocks while removing ^import lines. Because the sources themselves
-  are not parsed, the structure of the modules must be passed to this macro as
-  a Skylark data structure.
-
-  Args:
-    name: Name of this build rule target.
-    out: Path of outputted TypeScript source file.
-    namespace_srcs: Multimap of namespace strings to build file targets. The
-        ordering of the dictionary and nested lists does not matter when
-        generating a typings file, but *does* matter when generating a source
-        file.
-    namespace_symbol_aliases: Map of namespace strings where each value is a
-        map of symbol names to fully qualified symbol names.
-    namespace_symbol_aliases_public: Same as namespace_symbol_aliases but the
-        symbol will be visible to other namespaces.
-  """
-  cmd = ["(", "echo // GENERATED BY TENSORBOARD_TYPESCRIPT_BUNDLE"]
-  inputs = set()
-  for namespace, srcs in namespace_srcs.items():
-    cmd.append("echo")
-    if out[-5:] == ".d.ts":
-      cmd.append("echo 'declare namespace %s {'" % namespace)
-    elif out[-3:] == ".ts":
-      cmd.append("echo 'module %s {'" % namespace)
-    else:
-      fail("'out' must end with .ts or .d.ts: " + out)
-    for symbol, canon in namespace_symbol_aliases.get(namespace, {}).items():
-      cmd.append("echo 'import %s = %s;'" % (symbol, canon))
-    for symbol, canon in namespace_symbol_aliases_public.get(namespace,
-                                                             {}).items():
-      cmd.append("echo 'export import %s = %s;'" % (symbol, canon))
-    inputs += srcs
-    for src in srcs:
-      cmd.append("for f in $(locations %s); do" % src)
-      cmd.append("  echo")
-      cmd.append("  echo /////////////////////////////////////////////////////")
-      cmd.append("  echo // " + namespace)
-      cmd.append("  echo // $$f")
-      cmd.append("  echo /////////////////////////////////////////////////////")
-      cmd.append("  echo")
-      cmd.append("  sed 's!^import !// import !' $$f \\")
-      cmd.append("    | sed 's!^export declare !export !' \\")
-      cmd.append("    | sed '/^export .* from /d' \\")
-      cmd.append("    | sed '/^export {.*};$$/d'")
-      cmd.append("done")
-    cmd.append("echo '}'")
-  cmd.append(") >$@")
-  native.genrule(
-      name = name,
-      srcs = list(inputs),
-      outs = [out],
-      cmd = "\n".join(cmd),
-      **kwargs
-  )
diff --git a/tensorflow/tensorboard/defs/protos.bzl b/tensorflow/tensorboard/defs/protos.bzl
deleted file mode 100644
index 6d1982e098d9c549a3f6387035c6877d0b798ab7..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/defs/protos.bzl
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the 'License');
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@protobuf//:protobuf.bzl", "py_proto_library")
-
-def tb_proto_library(name, srcs = [], visibility = []):
-  py_proto_library(
-    name = name + "_py",
-    srcs = srcs,
-    srcs_version = "PY2AND3",
-    deps = ["@protobuf//:protobuf_python"],
-    protoc = "@protobuf//:protoc",
-    visibility = visibility,
-    default_runtime = "@protobuf//:protobuf_python",
-    testonly = 0,
-  )
\ No newline at end of file
diff --git a/tensorflow/tensorboard/defs/vulcanize.bzl b/tensorflow/tensorboard/defs/vulcanize.bzl
deleted file mode 100644
index 6ff49a35ed73f0a8a5fb7ce5b3544e0807e1c0bc..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/defs/vulcanize.bzl
+++ /dev/null
@@ -1,125 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "legacy_js")
-load("@io_bazel_rules_closure//closure/private:defs.bzl", "collect_js", "unfurl", "long_path")
-load("//tensorflow/tensorboard/defs:web.bzl", "web_aspect")
-
-def _tensorboard_html_binary(ctx):
-  deps = unfurl(ctx.attr.deps, provider="webfiles")
-  manifests = set(order="topological")
-  files = set()
-  webpaths = set()
-  for dep in deps:
-    manifests += dep.webfiles.manifests
-    webpaths += dep.webfiles.webpaths
-    files += dep.data_runfiles.files
-  webpaths += [ctx.attr.output_path]
-  closure_js_library=collect_js(
-      ctx, unfurl(ctx.attr.deps, provider="closure_js_library"))
-
-  # vulcanize
-  jslibs = depset(ctx.files._jslibs) + closure_js_library.srcs
-  ctx.action(
-      inputs=list(manifests | files | jslibs),
-      outputs=[ctx.outputs.html],
-      executable=ctx.executable._Vulcanize,
-      arguments=([ctx.attr.compilation_level,
-                  "true" if ctx.attr.testonly else "false",
-                  ctx.attr.input_path,
-                  ctx.attr.output_path,
-                  ctx.outputs.html.path] +
-                 [f.path for f in jslibs] +
-                 [f.path for f in manifests]),
-      progress_message="Vulcanizing %s" % ctx.attr.input_path)
-
-  # webfiles manifest
-  manifest_srcs = [struct(path=ctx.outputs.html.path,
-                          longpath=long_path(ctx, ctx.outputs.html),
-                          webpath=ctx.attr.output_path)]
-  manifest = ctx.new_file(ctx.configuration.bin_dir,
-                          "%s.pbtxt" % ctx.label.name)
-  ctx.file_action(
-      output=manifest,
-      content=struct(
-          label=str(ctx.label),
-          src=manifest_srcs).to_proto())
-  manifests += [manifest]
-
-  # webfiles server
-  params = struct(
-      label=str(ctx.label),
-      bind="[::]:6006",
-      manifest=[long_path(ctx, man) for man in manifests],
-      external_asset=[struct(webpath=k, path=v)
-                      for k, v in ctx.attr.external_assets.items()])
-  params_file = ctx.new_file(ctx.configuration.bin_dir,
-                             "%s_server_params.pbtxt" % ctx.label.name)
-  ctx.file_action(output=params_file, content=params.to_proto())
-  ctx.file_action(
-      executable=True,
-      output=ctx.outputs.executable,
-      content="#!/bin/sh\nexec %s %s" % (
-          ctx.executable._WebfilesServer.short_path,
-          long_path(ctx, params_file)))
-
-  transitive_runfiles = depset()
-  transitive_runfiles += ctx.attr._WebfilesServer.data_runfiles.files
-  for dep in deps:
-    transitive_runfiles += dep.data_runfiles.files
-  return struct(
-      files=depset([ctx.outputs.html]),
-      webfiles=struct(
-          manifest=manifest,
-          manifests=manifests,
-          webpaths=webpaths,
-          dummy=ctx.outputs.html),
-      runfiles=ctx.runfiles(
-          files=ctx.files.data + [manifest,
-                                  params_file,
-                                  ctx.outputs.html,
-                                  ctx.outputs.executable],
-          transitive_files=transitive_runfiles))
-
-tensorboard_html_binary = rule(
-    implementation=_tensorboard_html_binary,
-    executable=True,
-    attrs={
-        "compilation_level": attr.string(default="ADVANCED"),
-        "input_path": attr.string(mandatory=True),
-        "output_path": attr.string(mandatory=True),
-        "data": attr.label_list(cfg="data", allow_files=True),
-        "deps": attr.label_list(
-            aspects=[
-                web_aspect,
-                legacy_js,
-            ],
-            mandatory=True),
-        "external_assets": attr.string_dict(default={"/_/runfiles": "."}),
-        "_jslibs": attr.label(
-            default=Label("//tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize:jslibs"),
-            allow_files=True),
-        "_Vulcanize": attr.label(
-            default=Label("//tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize:Vulcanize"),
-            executable=True,
-            cfg="host"),
-        "_WebfilesServer": attr.label(
-            default=Label(
-                "@io_bazel_rules_closure//java/io/bazel/rules/closure/webfiles/server:WebfilesServer"),
-            executable=True,
-            cfg="host"),
-    },
-    outputs={
-        "html": "%{name}.html",
-    })
diff --git a/tensorflow/tensorboard/defs/web.bzl b/tensorflow/tensorboard/defs/web.bzl
deleted file mode 100644
index 103942b0a25d2706b1af445383689dca02407d91..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/defs/web.bzl
+++ /dev/null
@@ -1,419 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Same as web_library but supports TypeScript."""
-
-load("//tensorflow/tensorboard/defs:defs.bzl", "legacy_js")
-
-load("//third_party:clutz.bzl",
-     "CLUTZ_ATTRIBUTES",
-     "CLUTZ_OUTPUTS",
-     "clutz_aspect",
-     "extract_dts_from_closure_libraries")
-
-load("@io_bazel_rules_closure//closure/private:defs.bzl",
-     "CLOSURE_LIBRARY_BASE_ATTR",
-     "CLOSURE_LIBRARY_DEPS_ATTR",
-     "collect_js",
-     "collect_runfiles",
-     "convert_path_to_es6_module_name",
-     "create_argfile",
-     "difference",
-     "long_path",
-     "unfurl")
-
-_ASPECT_SLURP_FILE_TYPE = FileType([
-    ".html", ".js", ".css", ".gss", ".png", ".jpg", ".gif", ".ico", ".svg"])
-
-_CLOSURE_WORKER = attr.label(
-    default=Label("@io_bazel_rules_closure//java/io/bazel/rules/closure:ClosureWorker"),
-    executable=True,
-    cfg="host")
-
-def _ts_web_library(ctx):
-  if not ctx.attr.srcs:
-    if ctx.attr.deps:
-      fail("deps can not be set when srcs is not")
-    if not ctx.attr.exports:
-      fail("exports must be set if srcs is not")
-  if ctx.attr.path:
-    if not ctx.attr.path.startswith("/"):
-      fail("webpath must start with /")
-    if ctx.attr.path != "/" and ctx.attr.path.endswith("/"):
-      fail("webpath must not end with / unless it is /")
-    if "//" in ctx.attr.path:
-      fail("webpath must not have //")
-  elif ctx.attr.srcs:
-    fail("path must be set when srcs is set")
-  if "*" in ctx.attr.suppress and len(ctx.attr.suppress) != 1:
-    fail("when \"*\" is suppressed no other items should be present")
-
-  # process what came before
-  deps = unfurl(ctx.attr.deps, provider="webfiles")
-  webpaths = depset()
-  ts_typings = depset(ctx.files._default_typings)
-  ts_typings_paths = depset(
-      [long_path(ctx, f) for f in ctx.files._default_typings])
-  ts_typings_execroots = depset()
-  aspect_runfiles = depset()
-  for dep in deps:
-    webpaths += dep.webfiles.webpaths
-    if hasattr(dep.webfiles, "ts_typings"):
-      ts_typings += dep.webfiles.ts_typings
-    if hasattr(dep.webfiles, "ts_typings_paths"):
-      ts_typings_paths += dep.webfiles.ts_typings_paths
-    if hasattr(dep.webfiles, "ts_typings_execroots"):
-      ts_typings_execroots += dep.webfiles.ts_typings_execroots
-    if hasattr(dep.webfiles, "aspect_runfiles"):
-      aspect_runfiles += dep.webfiles.aspect_runfiles
-
-  # process what comes now
-  manifest_srcs = []
-  new_webpaths = []
-  ts_inputs = depset()
-  ts_outputs = []
-  ts_files = list(ts_typings_paths)
-  new_typings = []
-  new_typings_paths = []
-  new_typings_execroot = struct(inputs=[])
-  execroot = struct(
-      inputs=[(long_path(ctx, f), f.path) for f in ctx.files._default_typings],
-      outputs=[],
-      program=[ctx.executable._tsc.path, "-p"])
-  web_srcs = []
-  path = ctx.attr.path
-  strip = _get_strip(ctx)
-  for src in ctx.files.srcs:
-    suffix = _get_path_relative_to_package(src)
-    if strip:
-      if not suffix.startswith(strip):
-        fail("Relative src path not start with '%s': %s" % (strip, suffix))
-      suffix = suffix[len(strip):]
-    webpath = "%s/%s" % ("" if path == "/" else path, suffix)
-    _add_webpath(ctx, src, webpath, webpaths, new_webpaths, manifest_srcs)
-    if suffix.endswith(".d.ts"):
-      web_srcs.append(src)
-      entry = (webpath[1:], src.path)
-      new_typings.append(src)
-      new_typings_paths.append(entry[0])
-      new_typings_execroot.inputs.append(entry)
-      ts_inputs += [src]
-      ts_files.append(entry[0])
-      execroot.inputs.append(entry)
-    elif suffix.endswith(".ts"):
-      noext = suffix[:-3]
-      js = ctx.new_file(ctx.bin_dir, "%s.js" % noext)
-      dts = ctx.new_file(ctx.bin_dir, "%s.d.ts" % noext)
-      webpath_js = webpath[:-3] + ".js"
-      webpath_dts = webpath[:-3] + ".d.ts"
-      _add_webpath(ctx, js, webpath_js, webpaths, new_webpaths, manifest_srcs)
-      _add_webpath(ctx, dts, webpath_dts, webpaths, new_webpaths, manifest_srcs)
-      ts_inputs += [src]
-      ts_outputs.append(js)
-      ts_outputs.append(dts)
-      web_srcs.append(dts)
-      web_srcs.append(js)
-      ts_files.append(webpath[1:])
-      execroot.inputs.append((webpath[1:], src.path))
-      execroot.outputs.append((webpath_js[1:], js.path))
-      execroot.outputs.append((webpath_dts[1:], dts.path))
-      new_typings.append(dts)
-      new_typings_paths.append(webpath_dts[1:])
-      new_typings_execroot.inputs.append((webpath_dts[1:], dts.path))
-    else:
-      web_srcs.append(src)
-
-  # get typings for closure code
-  clutz_dts = extract_dts_from_closure_libraries(ctx)
-  if clutz_dts:
-    entry = (long_path(ctx, clutz_dts), clutz_dts.path)
-    ts_inputs += [clutz_dts]
-    ts_files.append(entry[0])
-    execroot.inputs.append(entry)
-
-  # compile typescript
-  workspace = ""
-  if ctx.label.workspace_root:
-    workspace = "/" + ctx.label.workspace_root
-  if execroot.outputs:
-    ts_config = _new_file(ctx, "-tsc.json")
-    execroot.inputs.append(("tsconfig.json", ts_config.path))
-    ctx.file_action(
-        output=ts_config,
-        content=struct(
-            compilerOptions=struct(
-                baseUrl=".",
-                declaration=True,
-                inlineSourceMap=True,
-                inlineSources=True,
-                module="es6",
-                moduleResolution="node",
-                noResolve=True,
-                target="es5",
-            ),
-            files=ts_files,
-        ).to_json())
-    er_config = _new_file(ctx, "-tsc-execroot.json")
-    ctx.file_action(output=er_config, content=execroot.to_json())
-    ts_inputs += collect_runfiles([ctx.attr._tsc])
-    ts_inputs += ctx.files._tsc
-    ts_inputs += ts_typings
-    ts_inputs += ts_typings_execroots
-    ts_inputs += [ts_config, er_config]
-    ctx.action(
-        inputs=list(ts_inputs),
-        outputs=ts_outputs,
-        executable=ctx.executable._execrooter,
-        arguments=[er_config.path] + [f.path for f in ts_typings_execroots],
-        progress_message="Compiling %d TypeScript files %s" % (
-            len(ts_files), ctx.label))
-
-  # perform strict dependency checking
-  manifest = _make_manifest(ctx, manifest_srcs)
-  webpaths += new_webpaths
-  dummy, manifests = _run_webfiles_validator(ctx, web_srcs, deps, manifest)
-  web_srcs.append(dummy)
-
-  # define development web server that only applies to this transitive closure
-  params = struct(
-      label=str(ctx.label),
-      bind="[::]:6006",
-      manifest=[long_path(ctx, man) for man in manifests],
-      external_asset=[struct(webpath=k, path=v)
-                      for k, v in ctx.attr.external_assets.items()])
-  params_file = _new_file(ctx, "-params.pbtxt")
-  ctx.file_action(output=params_file, content=params.to_proto())
-  ctx.file_action(
-      executable=True,
-      output=ctx.outputs.executable,
-      content="#!/bin/sh\nexec %s %s" % (
-          ctx.executable._WebfilesServer.short_path,
-          long_path(ctx, params_file)))
-
-  if new_typings:
-    er_config = _new_file(ctx, "-typings-execroot.json")
-    ctx.file_action(output=er_config, content=new_typings_execroot.to_json())
-    ts_typings += new_typings
-    ts_typings_paths += new_typings_paths
-    ts_typings_execroots += [er_config]
-  else:
-    ts_typings = depset()
-    ts_typings_paths = depset()
-    ts_typings_execroots = depset()
-
-  # export data to parent rules
-  return struct(
-      files=depset(web_srcs + [dummy]),
-      exports=unfurl(ctx.attr.exports),
-      webfiles=struct(
-          manifest=manifest,
-          manifests=manifests,
-          webpaths=webpaths,
-          dummy=dummy,
-          ts_typings=ts_typings,
-          ts_typings_paths=ts_typings_paths,
-          ts_typings_execroots=ts_typings_execroots),
-      closure_js_library=collect_js(
-          ctx, unfurl(ctx.attr.deps, provider="closure_js_library")),
-      runfiles=ctx.runfiles(
-          files=ctx.files.srcs + ctx.files.data + ts_outputs + [
-              manifest,
-              params_file,
-              ctx.outputs.executable,
-              dummy],
-          transitive_files=(collect_runfiles([ctx.attr._WebfilesServer]) |
-                            collect_runfiles(deps) |
-                            collect_runfiles(ctx.attr.data) |
-                            aspect_runfiles)))
-
-def _web_aspect_impl(target, ctx):
-  if hasattr(target, "webfiles"):
-    return struct()
-  srcs = []
-  deps = []
-  if hasattr(ctx.rule.files, "srcs"):
-    srcs.extend(_ASPECT_SLURP_FILE_TYPE.filter(ctx.rule.files.srcs))
-  for attr in ("deps", "sticky_deps", "module_deps"):
-    value = getattr(ctx.rule.attr, attr, None)
-    if value:
-      deps.extend(value)
-  deps = unfurl(deps, provider="webfiles")
-  webpaths = depset()
-  aspect_runfiles = depset(srcs)
-  for dep in deps:
-    webpaths += dep.webfiles.webpaths
-    if hasattr(dep.webfiles, "aspect_runfiles"):
-      aspect_runfiles += dep.webfiles.aspect_runfiles
-  manifest_srcs = []
-  new_webpaths = []
-  for src in srcs:
-    webpath = "/" + long_path(ctx, src)
-    _add_webpath(ctx, src, webpath, webpaths, new_webpaths, manifest_srcs)
-  webpaths += new_webpaths
-  manifest = _make_manifest(ctx, manifest_srcs)
-  dummy, manifests = _run_webfiles_validator(ctx, srcs, deps, manifest)
-  aspect_runfiles += [dummy, manifest]
-  return struct(
-      webfiles=struct(
-          manifest=manifest,
-          manifests=manifests,
-          webpaths=webpaths,
-          dummy=dummy,
-          aspect_runfiles=aspect_runfiles))
-
-def _make_manifest(ctx, src_list):
-  manifest = _new_file(ctx, "-webfiles.pbtxt")
-  ctx.file_action(
-      output=manifest,
-      content=struct(
-          label=str(ctx.label),
-          src=src_list).to_proto())
-  return manifest
-
-def _run_webfiles_validator(ctx, srcs, deps, manifest):
-  dummy = _new_file(ctx, "-webfiles.ignoreme")
-  manifests = depset(order="topological")
-  for dep in deps:
-    manifests += dep.webfiles.manifests
-  if srcs:
-    args = ["WebfilesValidator",
-            "--dummy", dummy.path,
-            "--target", manifest.path]
-    if hasattr(ctx, "attr") and hasattr(ctx.attr, "suppress"):
-      for category in ctx.attr.suppress:
-        args.append("--suppress")
-        args.append(category)
-    inputs = [manifest]
-    inputs.extend(srcs)
-    direct_manifests = depset()
-    for dep in deps:
-      inputs.append(dep.webfiles.dummy)
-      for f in dep.files:
-        inputs.append(f)
-      direct_manifests += [dep.webfiles.manifest]
-      inputs.append(dep.webfiles.manifest)
-      args.append("--direct_dep")
-      args.append(dep.webfiles.manifest.path)
-    for man in difference(manifests, direct_manifests):
-      inputs.append(man)
-      args.append("--transitive_dep")
-      args.append(man.path)
-    argfile = _new_file(ctx, "-webfiles-checker-args.txt")
-    ctx.file_action(output=argfile, content="\n".join(args))
-    inputs.append(argfile)
-    ctx.action(
-        inputs=inputs,
-        outputs=[dummy],
-        executable=(getattr(ctx.executable, "_ClosureWorker", None) or
-                    getattr(ctx.executable, "_ClosureWorkerAspect", None)),
-        arguments=["@@" + argfile.path],
-        mnemonic="Closure",
-        execution_requirements={"supports-workers": "1"},
-        progress_message="Checking webfiles %s" % ctx.label)
-  else:
-    ctx.file_action(output=dummy, content="BOO!")
-  manifests += [manifest]
-  return dummy, manifests
-
-def _new_file(ctx, suffix):
-  return ctx.new_file(ctx.bin_dir, "%s%s" % (ctx.label.name, suffix))
-
-def _add_webpath(ctx, src, webpath, webpaths, new_webpaths, manifest_srcs):
-  if webpath in new_webpaths:
-    _fail(ctx, "multiple srcs within %s define the webpath %s " % (
-        ctx.label, webpath))
-  if webpath in webpaths:
-    _fail(ctx, "webpath %s was defined by %s when already defined by deps" % (
-        webpath, ctx.label))
-  new_webpaths.append(webpath)
-  manifest_srcs.append(struct(
-      path=src.path,
-      longpath=long_path(ctx, src),
-      webpath=webpath))
-
-def _fail(ctx, message):
-  if ctx.attr.suppress == ["*"]:
-    print(message)
-  else:
-    fail(message)
-
-def _get_path_relative_to_package(artifact):
-  """Returns file path relative to the package that declared it."""
-  path = artifact.path
-  for prefix in (artifact.root.path,
-                 artifact.owner.workspace_root if artifact.owner else '',
-                 artifact.owner.package if artifact.owner else ''):
-    if prefix:
-      prefix = prefix + "/"
-      if not path.startswith(prefix):
-        fail("Path %s doesn't start with %s" % (path, prefix))
-      path = path[len(prefix):]
-  return path
-
-def _get_strip(ctx):
-  strip = ctx.attr.strip_prefix
-  if strip:
-    if strip.startswith("/"):
-      _fail(ctx, "strip_prefix should not end with /")
-      strip = strip[1:]
-    if strip.endswith("/"):
-      _fail(ctx, "strip_prefix should not end with /")
-    else:
-      strip += "/"
-  return strip
-
-web_aspect = aspect(
-    implementation=_web_aspect_impl,
-    attr_aspects=["deps", "sticky_deps", "module_deps"],
-    attrs={"_ClosureWorkerAspect": _CLOSURE_WORKER})
-
-ts_web_library = rule(
-    implementation=_ts_web_library,
-    executable=True,
-    attrs=CLUTZ_ATTRIBUTES + {
-        "path": attr.string(),
-        "srcs": attr.label_list(allow_files=True),
-        "deps": attr.label_list(
-            aspects=[
-                web_aspect,
-                clutz_aspect,
-                legacy_js,
-            ]),
-        "exports": attr.label_list(),
-        "data": attr.label_list(cfg="data", allow_files=True),
-        "suppress": attr.string_list(),
-        "strip_prefix": attr.string(),
-        "external_assets": attr.string_dict(default={"/_/runfiles": "."}),
-        "clutz_entry_points": attr.string_list(),
-        "_execrooter": attr.label(
-            default=Label("//tensorflow/tensorboard/scripts:execrooter"),
-            executable=True,
-            cfg="host"),
-        "_tsc": attr.label(
-            default=Label("@com_microsoft_typescript//:tsc"),
-            allow_files=True,
-            executable=True,
-            cfg="host"),
-        "_default_typings": attr.label(
-            default=Label("//tensorflow/tensorboard:ts_web_library_default_typings"),
-            allow_files=True),
-        "_WebfilesServer": attr.label(
-            default=Label("@io_bazel_rules_closure//java/io/bazel/rules/closure/webfiles/server:WebfilesServer"),
-            executable=True,
-            cfg="host"),
-        "_ClosureWorker": _CLOSURE_WORKER,
-        "_closure_library_base": CLOSURE_LIBRARY_BASE_ATTR,
-        "_closure_library_deps": CLOSURE_LIBRARY_DEPS_ATTR,
-    },
-    outputs=CLUTZ_OUTPUTS)
diff --git a/tensorflow/tensorboard/defs/zipper.bzl b/tensorflow/tensorboard/defs/zipper.bzl
deleted file mode 100644
index e98309ec9a5d5185ac48e235ceb10d0d3f0e153d..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/defs/zipper.bzl
+++ /dev/null
@@ -1,54 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@io_bazel_rules_closure//closure/private:defs.bzl", "unfurl", "long_path")
-
-def _tensorboard_zip_file(ctx):
-  deps = unfurl(ctx.attr.deps, provider="webfiles")
-  manifests = set(order="link")
-  files = set()
-  webpaths = set()
-  for dep in deps:
-    manifests += dep.webfiles.manifests
-    webpaths += dep.webfiles.webpaths
-    files += dep.data_runfiles.files
-  ctx.action(
-      inputs=list(manifests + files),
-      outputs=[ctx.outputs.zip],
-      executable=ctx.executable._Zipper,
-      arguments=([ctx.outputs.zip.path] +
-                 [m.path for m in manifests]),
-      progress_message="Zipping %d files" % len(webpaths))
-  transitive_runfiles = set()
-  for dep in deps:
-    transitive_runfiles += dep.data_runfiles.files
-  return struct(
-      files=set([ctx.outputs.zip]),
-      runfiles=ctx.runfiles(
-          files=ctx.files.data + [ctx.outputs.zip],
-          transitive_files=transitive_runfiles))
-
-tensorboard_zip_file = rule(
-    implementation=_tensorboard_zip_file,
-    attrs={
-        "data": attr.label_list(cfg="data", allow_files=True),
-        "deps": attr.label_list(providers=["webfiles"], mandatory=True),
-        "_Zipper": attr.label(
-            default=Label("//tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize:Zipper"),
-            executable=True,
-            cfg="host"),
-    },
-    outputs={
-        "zip": "%{name}.zip",
-    })
diff --git a/tensorflow/tensorboard/demo/BUILD b/tensorflow/tensorboard/demo/BUILD
deleted file mode 100644
index b253572ec556314356dee4911eeb755e6da18950..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/BUILD
+++ /dev/null
@@ -1,20 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
-
-licenses(["notice"])  # Apache 2.0
-
-# THIS PACKAGE HAS MOVED
-# See tensorflow/tensorboard/components/tf_tensorboard:demo
-
-web_library(
-    name = "demo_data",
-    srcs = glob(["data/**"]),
-    path = "/",
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/demo/data/audio_run_run1_tag_au1_2Faudio_2F0.json b/tensorflow/tensorboard/demo/data/audio_run_run1_tag_au1_2Faudio_2F0.json
deleted file mode 100644
index 7dfe32c7112c61bcacf896de2d906bc06a9c952f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/audio_run_run1_tag_au1_2Faudio_2F0.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"query": "index=0&tag=au1%2Faudio%2F0&run=run1", "step": 0, "wall_time": 1461795049.203407, "content_type": "audio/wav"}]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/audio_run_run2_tag_au2_2Faudio_2F0.json b/tensorflow/tensorboard/demo/data/audio_run_run2_tag_au2_2Faudio_2F0.json
deleted file mode 100644
index 13f9c2de4265d08a3b3635360d380c018f7aed7b..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/audio_run_run2_tag_au2_2Faudio_2F0.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"query": "index=0&tag=au2%2Faudio%2F0&run=run2", "step": 0, "wall_time": 1461795049.212815, "content_type": "audio/wav"}]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/compressedHistograms_run_run1_tag_histo1.json b/tensorflow/tensorboard/demo/data/compressedHistograms_run_run1_tag_histo1.json
deleted file mode 100644
index 6ae6fbf880e61bb8f7dfe3ed0a32dcba3e5d40cd..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/compressedHistograms_run_run1_tag_histo1.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0.0, 0, [[0, -2.3150592308536755], [668, -2.0967547155036605], [1587, -1.4326244423655616], [3085, -0.8871306575801902], [5000, -0.09312398815580714], [6915, 0.2584093405812282], [8413, 0.8895470642005087], [9332, 1.3198979614453679], [10000, 1.6793308878855118]]], [100.0, 10, [[0, -1.3417572789138936], [668, -1.183563374619141], [1587, -0.48920418783271574], [3085, 0.29326906896076954], [5000, 0.56953784145381], [6915, 0.8684655583499333], [8413, 1.4133127368907181], [9332, 1.906140650457873], [10000, 2.135771998171255]]], [200.0, 20, [[0, -1.5066917525035333], [668, -1.3910909571770793], [1587, -0.902737218885874], [3085, -0.3807791904765027], [5000, 0.38900200905253046], [6915, 0.8209734209339482], [8413, 1.302385856695965], [9332, 1.9324626053521639], [10000, 2.957505317875451]]], [300.0, 30, [[0, -0.5430457051469562], [668, -0.4626161834245273], [1587, 0.21573949543027715], [3085, 0.37353741100174215], [5000, 0.6891407881591103], [6915, 1.0927156232630852], [8413, 1.2745337159550916], [9332, 1.4321116832891605], [10000, 2.1913774993059034]]], [400.0, 40, [[0, -0.3584790755077172], [668, -0.33301611509753215], [1587, -0.1089466072951948], [3085, 0.5792199847585249], [5000, 1.220854943811942], [6915, 1.759829438421432], [8413, 2.3072559906741614], [9332, 2.753036118353921], [10000, 3.0267252195784047]]]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/compressedHistograms_run_run2_tag_histo1.json b/tensorflow/tensorboard/demo/data/compressedHistograms_run_run2_tag_histo1.json
deleted file mode 100644
index 3ad520c5687cdec798b401d3740814de75d39bc8..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/compressedHistograms_run_run2_tag_histo1.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0.0, 0, [[0, -3.6801669545044846], [668, -3.192188140974744], [1587, -2.3414678549368806], [3085, -0.9632173471995873], [5000, -0.3214892636797772], [6915, 0.11870794142185205], [8413, 0.8895470642005087], [9332, 1.183563374619141], [10000, 2.665663810418372]]], [100.0, 10, [[0, -3.564793583751807], [668, -3.376844436865802], [1587, -1.0366615731293798], [3085, -0.27318696312672563], [5000, 0.9718642422053263], [6915, 2.5765662807928194], [8413, 3.1415385101545126], [9332, 4.085981768607621], [10000, 4.623079406808927]]], [200.0, 20, [[0, -2.235172510433281], [668, -2.004569042815611], [1587, -1.2015432383370985], [3085, 0.11835464933202625], [5000, 0.56953784145381], [6915, 1.202844810963146], [8413, 2.689066032283515], [9332, 2.8494015726499944], [10000, 3.481377676013788]]], [300.0, 30, [[0, -3.360113978269659], [668, -2.8293185004961043], [1587, -1.5992540502266783], [3085, 0.14393860259807117], [5000, 1.47723448201245], [6915, 1.9510057389110733], [8413, 2.833176104473626], [9332, 4.142405216576347], [10000, 4.706937777668589]]], [400.0, 40, [[0, -2.599286228987632], [668, -2.240365897443259], [1587, -1.5992540502266783], [3085, -0.9101893288861387], [5000, 0.7580548669750213], [6915, 1.6009864433919474], [8413, 2.3504002974280036], [9332, 2.7907805263353733], [10000, 3.5098048900144323]]]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/compressedHistograms_run_run2_tag_histo2.json b/tensorflow/tensorboard/demo/data/compressedHistograms_run_run2_tag_histo2.json
deleted file mode 100644
index a3802ba2365adadb2453809fdf77d07ee5ef9b1f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/compressedHistograms_run_run2_tag_histo2.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0.0, 0, [[0, -1.9291158122759586], [668, -1.5970765333488954], [1587, -1.0923120348519078], [3085, -0.6688082872192093], [5000, 0.09312398815580714], [6915, 0.44532789251701854], [8413, 0.8238009655877649], [9332, 1.0357232383581656], [10000, 1.2741043689144438]]], [100.0, 10, [[0, -0.7780725642449806], [668, -0.7138496178727424], [1587, -0.5448932415735014], [3085, -0.24370397454796228], [5000, 0.42790220995778355], [6915, 0.6191730643365096], [8413, 0.752059342118037], [9332, 1.0451472255274825], [10000, 2.5559479569222825]]], [200.0, 20, [[0, -1.3876904425996377], [668, -1.1464188862638496], [1587, -0.4049955219067526], [3085, 0.04721394862139682], [5000, 0.56953784145381], [6915, 1.3221859041483333], [8413, 1.6188495656305735], [9332, 1.7613953069723651], [10000, 2.3257482385477384]]], [300.0, 30, [[0, -1.600772629982185], [668, -1.1548516185367033], [1587, -0.260387173785447], [3085, 0.17416570914366614], [5000, 0.47069243095356195], [6915, 1.1559276581637614], [8413, 2.0474031182051404], [9332, 2.18821711651116], [10000, 2.2393193406467518]]], [400.0, 40, [[0, -0.8286852465281818], [668, -0.7815041529866706], [1587, -0.3334896444053469], [3085, 0.21085213041026643], [5000, 0.5177616740489182], [6915, 1.077122434649409], [8413, 1.5898009703967424], [9332, 1.8859097291499742], [10000, 2.0954239138728523]]]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/graph_run_run1.pbtxt b/tensorflow/tensorboard/demo/data/graph_run_run1.pbtxt
deleted file mode 100644
index 2a6af3284086b4d797ebf3598bffe286d74baddf..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/graph_run_run1.pbtxt
+++ /dev/null
@@ -1,9 +0,0 @@
-node {
-  name: "a"
-  op: "matmul"
-}
-node {
-  name: "b"
-  op: "matmul"
-  input: "a:0"
-}
diff --git a/tensorflow/tensorboard/demo/data/graph_run_run2.pbtxt b/tensorflow/tensorboard/demo/data/graph_run_run2.pbtxt
deleted file mode 100644
index a5a4d65d5c61a7cf1c208b48f841a38a03847d60..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/graph_run_run2.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-node {
-  name: "a"
-  op: "matmul"
-}
-node {
-  name: "b"
-  op: "matmul"
-  input: "a:0"
-}
-node {
-  name: "c"
-  op: "matmul"
-  input: "a:0"
-  input: "b:0"
-}
diff --git a/tensorflow/tensorboard/demo/data/histograms_run_run1_tag_histo1.json b/tensorflow/tensorboard/demo/data/histograms_run_run1_tag_histo1.json
deleted file mode 100644
index a5600a356e8277e58be3b2891c3e328d058b5d08..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/histograms_run_run1_tag_histo1.json
+++ /dev/null
@@ -1 +0,0 @@
-[[400.0, 40, [-0.3584790755077172, 3.0267252195784047, 20.0, 24.012225532303315, 48.29045006426564, [-0.35363819004775493, -0.29226296698161564, -0.19961953895336082, 0.3214892636797772, 0.5177616740489182, 0.56953784145381, 0.6264916255991911, 0.7580548669750213, 0.8338603536725235, 1.220854943811942, 1.3429404381931362, 1.47723448201245, 1.624957930213695, 1.7874537232350647, 1.9661990955585713, 2.379100905625872, 2.6170109961884593, 3.1665833053880363], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0]]]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/histograms_run_run2_tag_histo1.json b/tensorflow/tensorboard/demo/data/histograms_run_run2_tag_histo1.json
deleted file mode 100644
index 407c375d2fc710e70408a3238df3a6165e964e84..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/histograms_run_run2_tag_histo1.json
+++ /dev/null
@@ -1 +0,0 @@
-[[400.0, 40, [-2.599286228987632, 3.5098048900144323, 20.0, 10.792285491200078, 66.66796979177158, [-2.379100905625872, -1.9661990955585713, -1.624957930213695, -1.47723448201245, -1.109868130738129, -1.0089710279437536, -0.42790220995778355, -0.2195814928486969, 0.47069243095356195, 0.7580548669750213, 0.917246389039776, 1.3429404381931362, 1.624957930213695, 1.7874537232350647, 2.1628190051144287, 2.6170109961884593, 2.8787120958073054, 3.8315657995195243], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0]]]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/histograms_run_run2_tag_histo2.json b/tensorflow/tensorboard/demo/data/histograms_run_run2_tag_histo2.json
deleted file mode 100644
index 752b621ab032f24805574708e1659c7139a701a8..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/histograms_run_run2_tag_histo2.json
+++ /dev/null
@@ -1 +0,0 @@
-[[400.0, 40, [-0.8286852465281818, 2.0954239138728523, 20.0, 13.546880465642861, 24.14836803774091, [-0.7580548669750213, -0.38900200905253046, -0.06996543062044111, 0.07696197368248522, 0.19961953895336082, 0.2656936063469233, 0.29226296698161564, 0.5177616740489182, 0.7580548669750213, 0.917246389039776, 1.109868130738129, 1.220854943811942, 1.624957930213695, 2.1628190051144287], [2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 1.0, 2.0, 3.0]]]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/images_run_run1_tag_im1_2Fimage_2F0.json b/tensorflow/tensorboard/demo/data/images_run_run1_tag_im1_2Fimage_2F0.json
deleted file mode 100644
index 814b4193c638749620e86ac21b86c48747f18f4c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/images_run_run1_tag_im1_2Fimage_2F0.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"wall_time": 1459200389.088045, "width": 4, "height": 4, "step": 0, "query": "tag=im1%2Fimage%2F0&index=0&run=run1"}]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/images_run_run1_tag_im2_2Fimage_2F0.json b/tensorflow/tensorboard/demo/data/images_run_run1_tag_im2_2Fimage_2F0.json
deleted file mode 100644
index 0c2bdcfc79cb32433ac987752851ef6dd351b058..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/images_run_run1_tag_im2_2Fimage_2F0.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"wall_time": 1459200389.093653, "width": 4, "height": 4, "step": 0, "query": "tag=im2%2Fimage%2F0&index=0&run=run1"}]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/images_run_run2_tag_im1_2Fimage_2F0.json b/tensorflow/tensorboard/demo/data/images_run_run2_tag_im1_2Fimage_2F0.json
deleted file mode 100644
index 3160aae366d904d5be5be22d60ca1b345a9d5172..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/images_run_run2_tag_im1_2Fimage_2F0.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"wall_time": 1459200389.117463, "width": 4, "height": 4, "step": 0, "query": "tag=im1%2Fimage%2F0&index=0&run=run2"}]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/individualAudio_index_0_tag_au1_2Faudio_2F0_run_run1.wav b/tensorflow/tensorboard/demo/data/individualAudio_index_0_tag_au1_2Faudio_2F0_run_run1.wav
deleted file mode 100644
index f1d24adc0cef5a734e07e8899b9abf8ae26fa228..0000000000000000000000000000000000000000
Binary files a/tensorflow/tensorboard/demo/data/individualAudio_index_0_tag_au1_2Faudio_2F0_run_run1.wav and /dev/null differ
diff --git a/tensorflow/tensorboard/demo/data/individualAudio_index_0_tag_au2_2Faudio_2F0_run_run2.wav b/tensorflow/tensorboard/demo/data/individualAudio_index_0_tag_au2_2Faudio_2F0_run_run2.wav
deleted file mode 100644
index 006c84338f7313a225830f121bcd95f457de1708..0000000000000000000000000000000000000000
Binary files a/tensorflow/tensorboard/demo/data/individualAudio_index_0_tag_au2_2Faudio_2F0_run_run2.wav and /dev/null differ
diff --git a/tensorflow/tensorboard/demo/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run1.png b/tensorflow/tensorboard/demo/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run1.png
deleted file mode 100644
index 346fd0076be28b9338152c4d49a32fc5ed685e44..0000000000000000000000000000000000000000
Binary files a/tensorflow/tensorboard/demo/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run1.png and /dev/null differ
diff --git a/tensorflow/tensorboard/demo/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run2.png b/tensorflow/tensorboard/demo/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run2.png
deleted file mode 100644
index 26d2d10acaf8511efeb03169853092d09252215b..0000000000000000000000000000000000000000
Binary files a/tensorflow/tensorboard/demo/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run2.png and /dev/null differ
diff --git a/tensorflow/tensorboard/demo/data/individualImage_tag_im2_2Fimage_2F0_index_0_run_run1.png b/tensorflow/tensorboard/demo/data/individualImage_tag_im2_2Fimage_2F0_index_0_run_run1.png
deleted file mode 100644
index 6c4190629429e0929962c4f20bd1a1602620e4bd..0000000000000000000000000000000000000000
Binary files a/tensorflow/tensorboard/demo/data/individualImage_tag_im2_2Fimage_2F0_index_0_run_run1.png and /dev/null differ
diff --git a/tensorflow/tensorboard/demo/data/logdir b/tensorflow/tensorboard/demo/data/logdir
deleted file mode 100644
index b6362b45d777266d6204b23884222a080f789f71..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/logdir
+++ /dev/null
@@ -1 +0,0 @@
-{"logdir": "/foo/some/fake/logdir"}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/runs.json b/tensorflow/tensorboard/demo/data/runs.json
deleted file mode 100644
index e09039054299cdc3e3453c620761e1ed6e0c0169..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/runs.json
+++ /dev/null
@@ -1 +0,0 @@
-{"run1": {"scalars": ["foo/sin", "foo/cos", "foo/square", "bar/square"], "run_metadata": [], "compressedHistograms": ["histo1"], "images": ["im1/image/0", "im2/image/0"], "histograms": ["histo1"], "graph": true, "audio": ["au1/audio/0"]}, "run2": {"scalars": ["foo/cos", "foo/square", "bar/square"], "run_metadata": [], "compressedHistograms": ["histo2", "histo1"], "images": ["im1/image/0"], "histograms": ["histo2", "histo1"], "graph": true, "audio": ["au2/audio/0"]}}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/scalars.json b/tensorflow/tensorboard/demo/data/scalars.json
deleted file mode 100644
index bc269395b68a35f7d4481fca05063e46c79c2859..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/scalars.json
+++ /dev/null
@@ -1 +0,0 @@
-{"run2": {"foo/cos": [[0.0, 0, 2.0], [10.0, 1, 1.0806045532226562], [20.0, 2, -0.832293689250946], [30.0, 3, -1.979984998703003], [40.0, 4, -1.3072872161865234]], "bar/square": [[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]], "foo/square": [[0.0, 0, 0.0], [10.0, 1, 2.0], [20.0, 2, 8.0], [30.0, 3, 18.0], [40.0, 4, 32.0]]}, "run1": {"foo/sin": [[0.0, 0, 0.0], [10.0, 1, 0.8414709568023682], [20.0, 2, 0.9092974066734314], [30.0, 3, 0.14112000167369843], [40.0, 4, -0.756802499294281]], "foo/cos": [[0.0, 0, 1.0], [10.0, 1, 0.5403022766113281], [20.0, 2, -0.416146844625473], [30.0, 3, -0.9899924993515015], [40.0, 4, -0.6536436080932617]], "bar/square": [[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]], "foo/square": [[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]]}}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/scalars_run_run1_tag_bar_2Fsquare.json b/tensorflow/tensorboard/demo/data/scalars_run_run1_tag_bar_2Fsquare.json
deleted file mode 100644
index 6d584fb4a9e1cd0a6a56d3d87b7183f55ac52ba6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/scalars_run_run1_tag_bar_2Fsquare.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/scalars_run_run1_tag_foo_2Fcos.json b/tensorflow/tensorboard/demo/data/scalars_run_run1_tag_foo_2Fcos.json
deleted file mode 100644
index 025eaa16e93110da0c50ad03486786ee6e521700..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/scalars_run_run1_tag_foo_2Fcos.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0.0, 0, 1.0], [10.0, 1, 0.5403022766113281], [20.0, 2, -0.416146844625473], [30.0, 3, -0.9899924993515015], [40.0, 4, -0.6536436080932617]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/scalars_run_run1_tag_foo_2Fsin.json b/tensorflow/tensorboard/demo/data/scalars_run_run1_tag_foo_2Fsin.json
deleted file mode 100644
index eae69dd78f3b5aa75acec6b5daa08720fad9adba..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/scalars_run_run1_tag_foo_2Fsin.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0.0, 0, 0.0], [10.0, 1, 0.8414709568023682], [20.0, 2, 0.9092974066734314], [30.0, 3, 0.14112000167369843], [40.0, 4, -0.756802499294281]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/scalars_run_run1_tag_foo_2Fsquare.json b/tensorflow/tensorboard/demo/data/scalars_run_run1_tag_foo_2Fsquare.json
deleted file mode 100644
index 6d584fb4a9e1cd0a6a56d3d87b7183f55ac52ba6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/scalars_run_run1_tag_foo_2Fsquare.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/scalars_run_run2_tag_bar_2Fsquare.json b/tensorflow/tensorboard/demo/data/scalars_run_run2_tag_bar_2Fsquare.json
deleted file mode 100644
index 6d584fb4a9e1cd0a6a56d3d87b7183f55ac52ba6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/scalars_run_run2_tag_bar_2Fsquare.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0.0, 0, 0.0], [10.0, 1, 1.0], [20.0, 2, 4.0], [30.0, 3, 9.0], [40.0, 4, 16.0]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/scalars_run_run2_tag_foo_2Fcos.json b/tensorflow/tensorboard/demo/data/scalars_run_run2_tag_foo_2Fcos.json
deleted file mode 100644
index dd3593f9d109e81bef5a10c732a9e08e60b3ef4f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/scalars_run_run2_tag_foo_2Fcos.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0.0, 0, 2.0], [10.0, 1, 1.0806045532226562], [20.0, 2, -0.832293689250946], [30.0, 3, -1.979984998703003], [40.0, 4, -1.3072872161865234]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/data/scalars_run_run2_tag_foo_2Fsquare.json b/tensorflow/tensorboard/demo/data/scalars_run_run2_tag_foo_2Fsquare.json
deleted file mode 100644
index 0ff9ef0551d0a3053ba16b502d0d6148057df660..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/data/scalars_run_run2_tag_foo_2Fsquare.json
+++ /dev/null
@@ -1 +0,0 @@
-[[0.0, 0, 0.0], [10.0, 1, 2.0], [20.0, 2, 8.0], [30.0, 3, 18.0], [40.0, 4, 32.0]]
\ No newline at end of file
diff --git a/tensorflow/tensorboard/http_api.md b/tensorflow/tensorboard/http_api.md
deleted file mode 100644
index c2885daf93c29b5c39b68619d26623c666e28627..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/http_api.md
+++ /dev/null
@@ -1,402 +0,0 @@
-# Tensorboard client-server HTTP API
-
-## Runs, Tags, and Tag Types
-
-TensorBoard data is organized around the concept of a `run`, which represents
-all the related data thrown off by a single execution of TensorFlow, a `tag`,
-which groups values of data that come from the same source within a TensorFlow
-run, and `tag types`, which are our way of distinguishing different types of
-data that have fundamentally different representations and should be processed
-on different code paths. For example, a "train" run may have a `scalars`
-tag that represents the learning rate, another `scalars` tag that
-represents the value of the objective function, a `histograms` tag that reveals
-information on weights in a particular layer over time, and an `images` tag that
-shows input images flowing into the system. The "eval" run might have an
-entirely different set of tag names, or some duplicated tag names.
-
-The currently supported tag types are `scalars`, `images`, `audio`,
-`histograms`, `graph` and `run_metadata`. Each tag type corresponds to a route
-(documented below) for retrieving tag data of that type.
-
-All of the data provided comes from TensorFlow events files ('\*.tfevents\*'),
-which are written using the SummaryWriter class
-(tensorflow/python/training/summary_writer.py), and the data is generated by
-summary ops (tensorflow/python/ops/summary_ops.py). The `scalars` come from the
-`ScalarSummary` op, the `histograms` from the `HistogramSummary`, the `audio`
-from the `AudioSummary`, and the `images` from `ImageSummary`. The tag type
-`graph` is special in that it is not a collection of tags of that type, but a
-boolean denoting if there is a graph definition associated with the run. The tag
-is provided to the summary op (usually as a constant).
-
-## `data/logdir`
-
-Returns a JSON object with a key "logdir" that maps to the `logdir` argument
-(string) with which Tensorboard started up. Example:
-`{logdir: '/foo/logdir/argument'}`
-
-The `logdir` argument is the path of the directory that contains events files.
-
-## `data/plugins_listing`
-
-Returns a dict mapping from plugin name to a boolean indicating whether the
-plugin is active. A plugin might be inactive, for instance, if it lacks relevant
-data. Every plugin has a key. This route helps the frontend avoid issuing
-requests to an inactive plugin - the routes of an inactive plugin do not work.
-
-## `data/runs`
-
-Returns an array containing the names of all the runs known to the
-TensorBoard backend at this time. Each entry is a string corresponding
-to a single run.
-
-We guarantee that as new runs are created in the log directory, they
-will always appear at the end of the list returned by this route. That
-is, the order of runs is persistent, and the result of this route is an
-&ldquo;append-only&rdquo; list.
-
-Example response:
-
-    ["train_run", "eval"]
-
-## `/data/plugin/scalars/tags`
-
-Returns a dictionary mapping from `run_name` (quoted string) to arrays of
-`tag_name` (quoted string), where each array contains the names of all
-scalar tags present in the corresponding run. Here is an example:
-
-    {
-      "train_run": ["xent", "loss", "learning_rate"],
-      "eval": ["precision", "recall"]
-    }
-
-Note that runs without any scalar tags are included as keys with value the
-empty array.
-
-## `/data/plugin/scalars/scalars?run=foo&tag=bar`
-
-Returns an array of event_accumulator.SimpleValueEvents ([wall_time, step,
-value]) for the given run and tag. wall_time is seconds since epoch.
-
-Example:
-
-    [
-      [1443856985.705543, 1448, 0.7461960315704346],  # wall_time, step, value
-      [1443857105.704628, 3438, 0.5427092909812927],
-      [1443857225.705133, 5417, 0.5457325577735901],
-      ...
-    ]
-
-If the format parameter is set to 'csv', the response will instead be in CSV
-format:
-
-    Wall time,step,value
-    1443856985.705543,1448,0.7461960315704346
-    1443857105.704628,3438,0.5427092909812927
-    1443857225.705133,5417,0.5457325577735901
-
-## `/data/plugin/histograms/tags`
-
-Returns a dictionary mapping from `run_name` (quoted string) to arrays of
-`tag_name` (quoted string), where each array contains the names of all
-histogram tags present in the corresponding run. Here is an example:
-
-    {
-      "train_run": ["foo_histogram", "bar_histogram"],
-      "eval": ["foo_histogram", "bar_histogram"]
-    }
-
-Note that runs without any histogram tags are included as keys with
-value the empty array.
-
-## `/data/plugin/histograms/histograms?run=foo&tag=bar`
-
-Returns an array of event_accumulator.HistogramEvents ([wall_time, step,
-HistogramValue]) for the given run and tag. A HistogramValue is [min, max, num,
-sum, sum_squares, bucket_limit, bucket]. wall_time is seconds since epoch.
-
-Annotated Example: (note - real data is higher precision)
-
-    [
-      [
-        1443871386.185149, # wall_time
-        235166,            # step
-        [
-          -0.66,           # minimum value
-          0.44,            # maximum value
-          8.0,             # number of items in the histogram
-          -0.80,           # sum of items in the histogram
-          0.73,            # sum of squares of items in the histogram
-          [-0.68, -0.62, -0.292, -0.26, -0.11, -0.10, -0.08, -0.07, -0.05,
-          -0.0525, -0.0434, -0.039, -0.029, -0.026, 0.42, 0.47, 1.8e+308],
-                          # the right edge of each bucket
-        [0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
-          1.0, 0.0]        # the number of elements within each bucket
-        ]
-      ]
-    ]
-
-## `/data/plugin/distributions/tags`
-
-Returns a dictionary mapping from `run_name` (quoted string) to arrays of
-`tag_name` (quoted string), where each array contains the names of all
-distribution tags present in the corresponding run. Here is an example:
-
-    {
-      "train_run": ["foo_histogram", "bar_histogram"],
-      "eval": ["foo_histogram", "bar_histogram"]
-    }
-
-Note that runs without any distribution tags are included as keys with
-value the empty array.
-
-## `/data/plugin/distributions/distributions?run=foo&tag=bar`
-
-Returns an array of event_accumulator.CompressedHistogramEvents ([wall_time,
-step, CompressedHistogramValues]) for the given run and tag.
-
-CompressedHistogramValues is a list of namedtuples with each tuple specifying
-a basis point (bps) as well as an interpolated value of the histogram value
-at that basis point. A basis point is 1/100 of a percent.
-
-The current compression strategy is to choose basis points that correspond to
-the median and bands of 1SD, 2SD, and 3SDs around the median. Note that the
-current compression strategy does not work well for representing multimodal
-data -- this is something that will be improved in a later iteration.
-
-Annotated Example: (note - real data is higher precision)
-
-    [
-      [
-        1441154832.580509,   # wall_time
-        5,                   # step
-        [ [0, -3.67],        # CompressedHistogramValue for 0th percentile
-          [2500, -4.19],     # CompressedHistogramValue for 25th percentile
-          [5000, 6.29],
-          [7500, 1.64],
-          [10000, 3.67]
-        ]
-      ],
-      ...
-    ]
-
-## `/data/plugin/images/images?run=foo&tag=bar`
-
-Gets a sample of ImageMetadatas for the given run and tag.
-
-Returns an array of objects containing information about available images,
-crucially including the query parameter that may be used to retrieve that image.
-(See /data/plugin/images/individualImage for details.)
-
-For example:
-
-      {
-        "width": 28,                 # width in pixels
-        "height": 28,                # height in pixels
-        "wall_time": 1440210599.246, # time in seconds since epoch
-        "step": 63702821,            # number of steps that have passed
-        "query": "index=0&tagname=input%2Fimage%2F2&run=train"
-                                     # param for /individualImage
-      }
-
-## `/data/plugin/images/individualImage?{{query}}`
-
-Retrieves an individual image. The image query should not be generated by the
-frontend, but instead acquired from calling the /images route (the image
-metadata objects contain the query to use). The response is the image itself
-with mime-type 'image/png'.
-
-Note that the query is not guaranteed to always refer to the same image even
-within a single run, as images may be removed from the sampling reservoir and
-replaced with other images. (See Notes for details on the reservoir sampling.)
-
-An example call to this route would look like this:
-/data/plugin/images/individualImage?index=0&tagname=input%2Fimage%2F2&run=train
-
-## `/data/plugin/images/tags`
-
-Returns a dictionary mapping from `run_name` (quoted string) to arrays of
-`tag_name` (quoted string), where each array contains the names of all image
-tags present in the corresponding run. Here is an example:
-
-    {
-      "train": ["foo_image", "bar_image"],
-      "eval": ["foo_image", "bar_image"]
-    }
-
-Note that runs without any image tags are included as keys with value the empty
-array.
-
-## `/data/plugin/audio/audio?run=foo&tag=bar`
-
-Gets a sample of AudioMetadatas for the given run and tag.
-
-Returns an array of objects containing information about available audio,
-crucially including the query parameter that may be used to retrieve that audio.
-(See /data/plugin/audio/individualAudio for details.)
-
-For example:
-
-      {
-        "wall_time": 1440210599.246, # time in seconds since epoch
-        "step": 63702821,            # number of steps that have passed
-        "content_type": "audio/wav"  # the MIME-type of the audio
-        "query": "index=0&tagname=input%2Faudio%2F2&run=train"
-                                     # param for /individualAudio
-      }
-
-## `/data/plugin/audio/individualAudio?{{query}}`
-
-Retrieves an individual audio clip. The audio query should not be generated by
-the frontend, but instead acquired from calling the /audio route (the audio
-metadata objects contain the query to use). The response is the audio itself
-with an appropriate Content-Type header set.
-
-Note that the query is not guaranteed to always refer to the same clip even
-within a single run, as audio may be removed from the sampling reservoir and
-replaced with other clips. (See Notes for details on the reservoir sampling.)
-
-An example call to this route would look like this:
-/individualAudio?index=0&tagname=input%2Faudio%2F2&run=train
-
-## `/data/plugin/audio/tags`
-
-Returns a dictionary mapping from `run_name` (quoted string) to arrays of
-`tag_name` (quoted string), where each array contains the names of all audio
-tags present in the corresponding run. Here is an example:
-
-    {
-      "train": ["foo_audio", "bar_audio"],
-      "eval": ["foo_audio", "bar_audio"],
-    }
-
-Note that runs without any audio tags are included as keys with value the empty
-array.
-
-## `/data/plugin/graphs/runs`
-
-Returns a list of runs that have associated graphs.
-
-For example:
-
-    ["train"]
-
-## `/data/plugin/graphs/graph?run=foo&limit_attr_size=1024&large_attrs_key=key`
-
-Returns the graph definition for the given run in pbtxt format. The
-graph is composed of a list of nodes, where each node is a specific
-TensorFlow operation which takes as inputs other nodes (operations).
-
-The query parameters `limit_attr_size` and `large_attrs_key` are optional.
-
-`limit_attr_size` specifies the maximum allowed size in bytes, before the
-attribute is considered large and filtered out of the graph. If specified,
-it must be an int and > 0. If not specified, no filtering is applied.
-
-`large_attrs_key` is the attribute key that will be used for storing
-attributes that are too large. The value of this key (list of strings)
-should be used by the client in order to determine which attributes
-have been filtered. Must be specified if `limit_attr_size` is specified.
-
-For the query
-
-    /data/plugin/graphs/graph?run=foo&limit_attr_size=1024&large_attrs_key=_too_large,
-
-here is an example pbtxt response of a graph with 3 nodes, where the second
-node had two large attributes "a" and "b" that were filtered out (size > 1024):
-
-    node {
-      op: "Input"
-      name: "A"
-    }
-    node {
-      op: "Input"
-      name: "B"
-      attr {
-        key: "small_attr"
-        value: {
-          s: "some string"
-        }
-      }
-      attr {
-        key: "_too_large"
-        value {
-          list {
-            s: "a"
-            s: "b"
-          }
-        }
-      }
-    }
-    node {
-      op: "MatMul"
-      name: "C"
-      input: "A"
-      input: "B"
-    }
-
-Prior to filtering, the original node "B" had the following content:
-
-    node {
-      op: "Input"
-      name: "B"
-      attr {
-        key: "small_attr"
-        value: {
-          s: "some string"
-        }
-      }
-      attr {
-        key: "a"
-        value { Very large object... }
-      }
-      attr {
-        key: "b"
-        value { Very large object... }
-      }
-    }
-
-## `/data/run_metadata?run=foo&tag=bar`
-
-Given a run and tag, returns the metadata of a particular
-`session.run()` as a gzipped, pbtxt serialized [`RunMetadata`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/protobuf/config.proto)
-proto. For example:
-
-    step_stats {
-      dev_stats {
-        device: "/job:localhost/replica:0/task:0/cpu:0"
-        node_stats {
-          node_name: "_SOURCE"
-          all_start_micros: 1458337695775395
-          op_start_rel_micros: 11
-          op_end_rel_micros: 12
-          all_end_rel_micros: 38
-          memory {
-            allocator_name: "cpu"
-          }
-          timeline_label: "_SOURCE = NoOp()"
-          scheduled_micros: 1458337695775363
-        }
-      }
-    }
-
-## Notes
-
-All returned values, histograms, audio, and images are returned in the order
-they were written by TensorFlow (which should correspond to increasing
-`wall_time` order, but may not necessarily correspond to increasing step count
-if the process had to restart from a previous checkpoint).
-
-The returned values may be downsampled using reservoir sampling, which is
-configurable by the TensorBoard server. When downsampling occurs, the server
-guarantees that different tags will all sample at the same sequence of indices,
-so that if there are two tags `A` and `B` which are related so that `A[i] ~
-B[i]` for all `i`, then `D(A)[i] ~ D(B)[i]` for all `i`, where `D` represents
-the downsampling operation.
-
-The reservoir sampling puts an upper bound on the number of items that will be
-returned for a given run-tag combination, and guarantees that all items are
-equally likely to be in the final sample (ie it is a uniform distribution over
-the values), with the proviso that the most recent individual item is always
-included in the sample.
-
-The reservoir sizes are configurable on a per-tag type basis.
diff --git a/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/BUILD b/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/BUILD
deleted file mode 100644
index f1f7746ff846e549f3473412470bbff3970a7741..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/BUILD
+++ /dev/null
@@ -1,56 +0,0 @@
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-java_binary(
-    name = "Vulcanize",
-    srcs = ["Vulcanize.java"],
-    jvm_flags = [
-        "-Xss20m",  # JSCompiler needs big stacks for recursive parsing
-        "-XX:+UseParallelGC",  # Best GC when app isn't latency sensitive
-        "-Djava.util.logging.SimpleFormatter.format='%1$$tY-%1$$tm-%1$$td %1$$tH:%1$$tM:%1$$tS.%1$$tL %4$$-6s %5$$s%6$$s%n'",  # Less log spam
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        "@com_google_guava",
-        "@com_google_protobuf_java",
-        "@io_bazel_rules_closure//closure/compiler",
-        "@io_bazel_rules_closure//java/io/bazel/rules/closure:webpath",
-        "@io_bazel_rules_closure//java/io/bazel/rules/closure/webfiles:build_info_java_proto",
-        "@io_bazel_rules_closure//java/org/jsoup/nodes",
-        "@org_jsoup",
-    ],
-)
-
-java_binary(
-    name = "Zipper",
-    srcs = ["Zipper.java"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "@com_google_guava",
-        "@com_google_protobuf_java",
-        "@io_bazel_rules_closure//java/io/bazel/rules/closure/webfiles",
-        "@io_bazel_rules_closure//java/io/bazel/rules/closure/webfiles:build_info_java_proto",
-    ],
-)
-
-# These JS files are always taken into consideration by the Closure Compiler
-# when vulcanizing, per vulcanize.bzl.
-filegroup(
-    name = "jslibs",
-    srcs = [
-        # Ordering probably matters
-        "@com_google_javascript_closure_compiler_externs",
-        "@com_google_javascript_closure_compiler_externs_polymer",
-        "externs.js",
-        "@com_google_javascript_closure_library//:closure/goog/base.js",
-        "@com_google_javascript_closure_library//:closure/goog/deps.js",
-    ],
-    visibility = ["//visibility:public"],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/Vulcanize.java b/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/Vulcanize.java
deleted file mode 100644
index 533907dd64dd84107d46dd7411235c4ff8aaa755..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/Vulcanize.java
+++ /dev/null
@@ -1,546 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package org.tensorflow.tensorboard.vulcanize;
-
-import static com.google.common.base.Preconditions.checkNotNull;
-import static com.google.common.base.Verify.verify;
-import static com.google.common.base.Verify.verifyNotNull;
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-import com.google.common.base.CharMatcher;
-import com.google.common.base.Joiner;
-import com.google.common.base.Optional;
-import com.google.common.base.Splitter;
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.ImmutableMultimap;
-import com.google.common.collect.ImmutableSet;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Multimap;
-import com.google.javascript.jscomp.CheckLevel;
-import com.google.javascript.jscomp.CompilationLevel;
-import com.google.javascript.jscomp.Compiler;
-import com.google.javascript.jscomp.CompilerOptions;
-import com.google.javascript.jscomp.DiagnosticGroup;
-import com.google.javascript.jscomp.DiagnosticGroups;
-import com.google.javascript.jscomp.DiagnosticType;
-import com.google.javascript.jscomp.JSError;
-import com.google.javascript.jscomp.ModuleIdentifier;
-import com.google.javascript.jscomp.PropertyRenamingPolicy;
-import com.google.javascript.jscomp.Result;
-import com.google.javascript.jscomp.SourceFile;
-import com.google.javascript.jscomp.WarningsGuard;
-import com.google.protobuf.TextFormat;
-import io.bazel.rules.closure.Webpath;
-import io.bazel.rules.closure.webfiles.BuildInfo.Webfiles;
-import io.bazel.rules.closure.webfiles.BuildInfo.WebfilesSource;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.nio.file.StandardOpenOption;
-import java.util.ArrayDeque;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Deque;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Attribute;
-import org.jsoup.nodes.Comment;
-import org.jsoup.nodes.DataNode;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.nodes.Html5Printer;
-import org.jsoup.nodes.Node;
-import org.jsoup.nodes.TextNode;
-import org.jsoup.parser.Parser;
-import org.jsoup.parser.Tag;
-
-/** Simple one-off solution for TensorBoard vulcanization. */
-public final class Vulcanize {
-
-  private static final Pattern IGNORE_PATHS_PATTERN =
-      Pattern.compile("/(?:polymer|marked-element)/.*");
-
-  private static final ImmutableSet<String> EXTRA_JSDOC_TAGS =
-      ImmutableSet.of("attribute", "hero", "group", "required");
-
-  private static final Pattern WEBPATH_PATTERN = Pattern.compile("//~~WEBPATH~~([^\n]+)");
-
-  private static final Parser parser = Parser.htmlParser();
-  private static final Map<Webpath, Path> webfiles = new HashMap<>();
-  private static final Set<Webpath> alreadyInlined = new HashSet<>();
-  private static final Set<String> legalese = new HashSet<>();
-  private static final List<String> licenses = new ArrayList<>();
-  private static final List<Webpath> stack = new ArrayList<>();
-  private static final List<SourceFile> externs = new ArrayList<>();
-  private static final List<SourceFile> sourcesFromJsLibraries = new ArrayList<>();
-  private static final Map<Webpath, String> sourcesFromScriptTags = new LinkedHashMap<>();
-  private static final Map<Webpath, Node> sourceTags = new LinkedHashMap<>();
-  private static final Multimap<Webpath, String> suppressions = HashMultimap.create();
-  private static CompilationLevel compilationLevel;
-  private static Webpath outputPath;
-  private static Node firstCompiledScript;
-  private static Node licenseComment;
-  private static int insideDemoSnippet;
-  private static boolean testOnly;
-
-  public static void main(String[] args) throws IOException {
-    compilationLevel = CompilationLevel.fromString(args[0]);
-    testOnly = args[1].equals("true");
-    Webpath inputPath = Webpath.get(args[2]);
-    outputPath = Webpath.get(args[3]);
-    Path output = Paths.get(args[4]);
-    for (int i = 5; i < args.length; i++) {
-      if (args[i].endsWith(".js")) {
-        String code = new String(Files.readAllBytes(Paths.get(args[i])), UTF_8);
-        SourceFile sourceFile = SourceFile.fromCode(args[i], code);
-        if (code.contains("@externs")) {
-          externs.add(sourceFile);
-        } else {
-          sourcesFromJsLibraries.add(sourceFile);
-        }
-        continue;
-      }
-      if (!args[i].endsWith(".pbtxt")) {
-        continue;
-      }
-      Webfiles manifest = loadWebfilesPbtxt(Paths.get(args[i]));
-      for (WebfilesSource src : manifest.getSrcList()) {
-        webfiles.put(Webpath.get(src.getWebpath()), Paths.get(src.getPath()));
-      }
-    }
-    stack.add(inputPath);
-    Document document = parse(Files.readAllBytes(webfiles.get(inputPath)));
-    transform(document);
-    compile();
-    if (licenseComment != null) {
-      licenseComment.attr("comment", String.format("\n%s\n", Joiner.on("\n\n").join(licenses)));
-    }
-    Files.write(
-        output,
-        Html5Printer.stringify(document).getBytes(UTF_8),
-        StandardOpenOption.WRITE,
-        StandardOpenOption.CREATE,
-        StandardOpenOption.TRUNCATE_EXISTING);
-  }
-
-  private static void transform(Node root) throws IOException {
-    Node node = checkNotNull(root);
-    Node newNode;
-    while (true) {
-      newNode = enterNode(node);
-      if (node.equals(root)) {
-        root = newNode;
-      }
-      node = newNode;
-      if (node.childNodeSize() > 0) {
-        node = node.childNode(0);
-      } else {
-        while (true) {
-          newNode = leaveNode(node);
-          if (node.equals(root)) {
-            root = newNode;
-          }
-          node = newNode;
-          if (node.equals(root)) {
-            return;
-          }
-          Node next = node.nextSibling();
-          if (next == null) {
-            if (node.parentNode() == null) {
-              return;
-            }
-            node = verifyNotNull(node.parentNode(), "unexpected root: %s", node);
-          } else {
-            node = next;
-            break;
-          }
-        }
-      }
-    }
-  }
-
-  private static Node enterNode(Node node) throws IOException {
-    if (node.nodeName().equals("demo-snippet")) {
-      insideDemoSnippet++;
-    }
-    if (insideDemoSnippet > 0) {
-      return node;
-    }
-    if (node instanceof Element) {
-      if (!getAttrTransitive(node, "vulcanize-noinline").isPresent()) {
-        if (node.nodeName().equals("link") && node.attr("rel").equals("import")) {
-          // Inline HTML.
-          node = visitHtmlImport(node);
-        } else if (node.nodeName().equals("script")
-            && !shouldIgnoreUri(node.attr("src"))
-            && !node.hasAttr("jscomp-ignore")) {
-          node = visitScript(node);
-        } else if (node.nodeName().equals("link")
-            && node.attr("rel").equals("stylesheet")
-            && !node.attr("href").isEmpty()
-            && !shouldIgnoreUri(node.attr("href"))) {
-          node = visitStylesheet(node);
-        }
-      }
-      rootifyAttribute(node, "href");
-      rootifyAttribute(node, "src");
-      rootifyAttribute(node, "action");
-      rootifyAttribute(node, "assetpath");
-    } else if (node instanceof Comment) {
-      String text = ((Comment) node).getData();
-      if (text.contains("@license")) {
-        handleLicense(text);
-        if (licenseComment == null) {
-          licenseComment = node;
-        } else {
-          node = replaceNode(node, new TextNode("", node.baseUri()));
-        }
-      } else {
-        node = replaceNode(node, new TextNode("", node.baseUri()));
-      }
-    }
-    return node;
-  }
-
-  private static Node leaveNode(Node node) {
-    if (node instanceof Document) {
-      stack.remove(stack.size() - 1);
-    } else if (node.nodeName().equals("demo-snippet")) {
-      insideDemoSnippet--;
-    }
-    return node;
-  }
-
-  private static Node visitHtmlImport(Node node) throws IOException {
-    Webpath href = me().lookup(Webpath.get(node.attr("href")));
-    if (alreadyInlined.add(href)) {
-      stack.add(href);
-      Document subdocument = parse(Files.readAllBytes(getWebfile(href)));
-      for (Attribute attr : node.attributes()) {
-        subdocument.attr(attr.getKey(), attr.getValue());
-      }
-      return replaceNode(node, subdocument);
-    } else {
-      return replaceNode(node, new TextNode("", node.baseUri()));
-    }
-  }
-
-  private static Node visitScript(Node node) throws IOException {
-    Webpath path;
-    String script;
-    if (node.attr("src").isEmpty()) {
-      path = makeSyntheticName(".js");
-      script = getInlineScriptFromNode(node);
-    } else {
-      path = me().lookup(Webpath.get(node.attr("src")));
-      script = new String(Files.readAllBytes(getWebfile(path)), UTF_8);
-    }
-    if (node.attr("src").endsWith(".min.js")
-        || getAttrTransitive(node, "jscomp-nocompile").isPresent()) {
-      Node newScript =
-          new Element(Tag.valueOf("script"), node.baseUri(), node.attributes())
-              .appendChild(new DataNode(script, node.baseUri()))
-              .removeAttr("src")
-              .removeAttr("jscomp-nocompile");
-      if (firstCompiledScript != null) {
-        firstCompiledScript.before(newScript);
-        return replaceNode(node, new TextNode("", node.baseUri()));
-      } else {
-        return replaceNode(node, newScript);
-      }
-    } else {
-      if (firstCompiledScript == null) {
-        firstCompiledScript = node;
-      }
-      sourcesFromScriptTags.put(path, script);
-      sourceTags.put(path, node);
-      Optional<String> suppress = getAttrTransitive(node, "jscomp-suppress");
-      if (suppress.isPresent()) {
-        if (suppress.get().isEmpty()) {
-          suppressions.put(path, "*");
-        } else {
-          suppressions.putAll(path, Splitter.on(' ').split(suppress.get()));
-        }
-      }
-      return node;
-    }
-  }
-
-  private static Node visitStylesheet(Node node) throws IOException {
-    Webpath href = me().lookup(Webpath.get(node.attr("href")));
-    return replaceNode(
-        node,
-        new Element(Tag.valueOf("style"), node.baseUri(), node.attributes())
-            .appendChild(
-                new DataNode(
-                    new String(Files.readAllBytes(getWebfile(href)), UTF_8), node.baseUri()))
-            .removeAttr("rel")
-            .removeAttr("href"));
-  }
-
-  private static Optional<String> getAttrTransitive(Node node, String attr) {
-    while (node != null) {
-      if (node.hasAttr(attr)) {
-        return Optional.of(node.attr(attr));
-      }
-      node = node.parent();
-    }
-    return Optional.absent();
-  }
-
-  private static Node replaceNode(Node oldNode, Node newNode) {
-    oldNode.replaceWith(newNode);
-    return newNode;
-  }
-
-  private static Path getWebfile(Webpath path) {
-    return verifyNotNull(webfiles.get(path), "Bad ref: %s -> %s", me(), path);
-  }
-
-  private static void compile() {
-    if (sourcesFromScriptTags.isEmpty()) {
-      return;
-    }
-
-    CompilerOptions options = new CompilerOptions();
-    compilationLevel.setOptionsForCompilationLevel(options);
-
-    // Nice options.
-    options.setColorizeErrorOutput(true);
-    options.setContinueAfterErrors(true);
-    options.setLanguageIn(CompilerOptions.LanguageMode.ECMASCRIPT_2016);
-    options.setLanguageOut(CompilerOptions.LanguageMode.ECMASCRIPT5);
-    options.setGenerateExports(true);
-    options.setStrictModeInput(false);
-    options.setExtraAnnotationNames(EXTRA_JSDOC_TAGS);
-
-    // So we can chop JS binary back up into the original script tags.
-    options.setPrintInputDelimiter(true);
-    options.setInputDelimiter("//~~WEBPATH~~%name%");
-
-    // Optimizations that are too advanced for us right now.
-    options.setPropertyRenaming(PropertyRenamingPolicy.OFF);
-    options.setCheckGlobalThisLevel(CheckLevel.OFF);
-    options.setRemoveUnusedPrototypeProperties(false);
-    options.setRemoveUnusedPrototypePropertiesInExterns(false);
-    options.setRemoveUnusedClassProperties(false);
-
-    // Dependency management.
-    options.setClosurePass(true);
-    options.setManageClosureDependencies(true);
-    options.getDependencyOptions().setDependencyPruning(true);
-    options.getDependencyOptions().setDependencySorting(true);
-    options.getDependencyOptions().setMoocherDropping(false);
-    options.getDependencyOptions()
-        .setEntryPoints(
-            sourceTags
-                .keySet()
-                .stream()
-                .map(Webpath::toString)
-                .map(ModuleIdentifier::forFile)
-                .collect(Collectors.toList()));
-
-    // Polymer pass.
-    options.setPolymerVersion(1);
-
-    // Debug flags.
-    if (testOnly) {
-      options.setPrettyPrint(true);
-      options.setGeneratePseudoNames(true);
-      options.setExportTestFunctions(true);
-    }
-
-    // Don't print warnings from <script jscomp-suppress="group1 group2" ...> tags.
-    ImmutableMultimap<DiagnosticType, String> diagnosticGroups = initDiagnosticGroups();
-    options.addWarningsGuard(
-        new WarningsGuard() {
-          @Override
-          public CheckLevel level(JSError error) {
-            if (error.sourceName == null) {
-              return null;
-            }
-            if (error.sourceName.startsWith("javascript/externs")
-                || error.sourceName.contains("com_google_javascript_closure_compiler_externs")) {
-              // TODO(jart): Figure out why these "mismatch of the removeEventListener property on
-              //             type" warnings are showing up.
-              //             https://github.com/google/closure-compiler/pull/1959
-              return CheckLevel.OFF;
-            }
-            if (IGNORE_PATHS_PATTERN.matcher(error.sourceName).matches()) {
-              return CheckLevel.OFF;
-            }
-            if (error.sourceName.startsWith("/tf-graph")
-                && error.getType().key.equals("JSC_VAR_MULTIPLY_DECLARED_ERROR")) {
-              return CheckLevel.OFF; // TODO(jart): Remove when tf-graph is ES6 modules.
-            }
-            if (error.getType().key.equals("JSC_POLYMER_UNQUALIFIED_BEHAVIOR")
-                || error.getType().key.equals("JSC_POLYMER_UNANNOTATED_BEHAVIOR")) {
-              return CheckLevel.OFF; // TODO(jart): What is wrong with this thing?
-            }
-            Collection<String> codes = suppressions.get(Webpath.get(error.sourceName));
-            if (codes.contains("*") || codes.contains(error.getType().key)) {
-              return CheckLevel.OFF;
-            }
-            for (String group : diagnosticGroups.get(error.getType())) {
-              if (codes.contains(group)) {
-                return CheckLevel.OFF;
-              }
-            }
-            return null;
-          }
-        });
-
-    // Get reverse topological script tags and their web paths, which js_library stuff first.
-    List<SourceFile> sauce = Lists.newArrayList(sourcesFromJsLibraries);
-    for (Map.Entry<Webpath, String> source : sourcesFromScriptTags.entrySet()) {
-      sauce.add(SourceFile.fromCode(source.getKey().toString(), source.getValue()));
-    }
-
-    // Compile everything into a single script.
-    Compiler compiler = new Compiler();
-    compiler.disableThreads();
-    Result result = compiler.compile(externs, sauce, options);
-    if (!result.success) {
-      System.exit(1);
-    }
-    String jsBlob = compiler.toSource();
-
-    // Split apart the JS blob and put it back in the original <script> locations.
-    Deque<Map.Entry<Webpath, Node>> tags = new ArrayDeque<>();
-    tags.addAll(sourceTags.entrySet());
-    Matcher matcher = WEBPATH_PATTERN.matcher(jsBlob);
-    verify(matcher.find(), "Nothing found in compiled JS blob!");
-    Webpath path = Webpath.get(matcher.group(1));
-    int start = 0;
-    while (matcher.find()) {
-      if (sourceTags.containsKey(path)) {
-        swapScript(tags, path, jsBlob.substring(start, matcher.start()));
-        start = matcher.start();
-      }
-      path = Webpath.get(matcher.group(1));
-    }
-    swapScript(tags, path, jsBlob.substring(start));
-    verify(tags.isEmpty(), "<script> wasn't compiled: %s", tags);
-  }
-
-  private static void swapScript(
-      Deque<Map.Entry<Webpath, Node>> tags, Webpath path, String script) {
-    verify(!tags.isEmpty(), "jscomp compiled %s after last <script>?!", path);
-    Webpath want = tags.getFirst().getKey();
-    verify(path.equals(want), "<script> tag for %s should come before %s", path, want);
-    Node tag = tags.removeFirst().getValue();
-    tag.replaceWith(
-        new Element(Tag.valueOf("script"), tag.baseUri())
-            .appendChild(new DataNode(script, tag.baseUri())));
-  }
-
-  private static void handleLicense(String text) {
-    if (legalese.add(CharMatcher.whitespace().removeFrom(text))) {
-      licenses.add(CharMatcher.anyOf("\r\n").trimFrom(text));
-    }
-  }
-
-  private static Webpath me() {
-    return Iterables.getLast(stack);
-  }
-
-  private static Webpath makeSyntheticName(String extension) {
-    String me = me().toString();
-    Webpath result = Webpath.get(me + extension);
-    int n = 2;
-    while (sourcesFromScriptTags.containsKey(result)) {
-      result = Webpath.get(String.format("%s-%d%s", me, n++, extension));
-    }
-    return result;
-  }
-
-  private static void rootifyAttribute(Node node, String attribute) {
-    String value = node.attr(attribute);
-    if (value.isEmpty()) {
-      return;
-    }
-    Webpath uri = Webpath.get(value);
-    if (webfiles.containsKey(uri)) {
-      node.attr(attribute, outputPath.getParent().relativize(uri).toString());
-    }
-  }
-
-  private static String getInlineScriptFromNode(Node node) {
-    StringBuilder sb = new StringBuilder();
-    for (Node child : node.childNodes()) {
-      if (child instanceof DataNode) {
-        sb.append(((DataNode) child).getWholeData());
-      }
-    }
-    return sb.toString();
-  }
-
-  private static Document parse(byte[] bytes) {
-    return parse(new ByteArrayInputStream(bytes));
-  }
-
-  private static Document parse(InputStream input) {
-    Document document;
-    try {
-      document = Jsoup.parse(input, null, "", parser);
-    } catch (IOException e) {
-      throw new AssertionError("I/O error when parsing byte array D:", e);
-    }
-    document.outputSettings().indentAmount(0);
-    document.outputSettings().prettyPrint(false);
-    return document;
-  }
-
-  private static Webfiles loadWebfilesPbtxt(Path path) throws IOException {
-    verify(path.toString().endsWith(".pbtxt"), "Not a pbtxt file: %s", path);
-    Webfiles.Builder build = Webfiles.newBuilder();
-    TextFormat.getParser().merge(new String(Files.readAllBytes(path), UTF_8), build);
-    return build.build();
-  }
-
-  private static boolean shouldIgnoreUri(String uri) {
-    return uri.startsWith("#")
-        || uri.endsWith("/")
-        || uri.contains("//")
-        || uri.startsWith("data:")
-        || uri.startsWith("javascript:")
-        // The following are intended to filter out URLs with Polymer variables.
-        || (uri.contains("[[") && uri.contains("]]"))
-        || (uri.contains("{{") && uri.contains("}}"));
-  }
-
-  private static ImmutableMultimap<DiagnosticType, String> initDiagnosticGroups() {
-    DiagnosticGroups groups = new DiagnosticGroups();
-    Multimap<DiagnosticType, String> builder = HashMultimap.create();
-    for (Map.Entry<String, DiagnosticGroup> group : groups.getRegisteredGroups().entrySet()) {
-      for (DiagnosticType type : group.getValue().getTypes()) {
-        builder.put(type, group.getKey());
-      }
-    }
-    return ImmutableMultimap.copyOf(builder);
-  }
-}
diff --git a/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/Zipper.java b/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/Zipper.java
deleted file mode 100644
index 31b3aa195e1ebfb46d9556833d20964c63df1dbe..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/Zipper.java
+++ /dev/null
@@ -1,73 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package org.tensorflow.tensorboard.vulcanize;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-import com.google.protobuf.TextFormat;
-import io.bazel.rules.closure.webfiles.BuildInfo.WebfileInfo;
-import io.bazel.rules.closure.webfiles.BuildInfo.Webfiles;
-import io.bazel.rules.closure.webfiles.BuildInfo.WebfilesSource;
-import io.bazel.rules.closure.webfiles.WebfilesWriter;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.nio.file.StandardOpenOption;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.zip.Deflater;
-
-/**
- * Simple one-off solution for TensorBoard zipping of web_library rules.
- *
- * <p>This is intended to collect static assets for production web server deployment. The paths of
- * files inside the zip will be web paths, with the prefix slash removed. These files will be
- * topologically ordered, i.e. web files higher up in the build tree come first.
- */
-public final class Zipper {
-
-  public static void main(String[] args) throws IOException {
-    Set<String> alreadyZipped = new HashSet<>();
-    try (WebfilesWriter writer =
-        new WebfilesWriter(
-            Files.newByteChannel(
-                Paths.get(args[0]),
-                StandardOpenOption.WRITE,
-                StandardOpenOption.CREATE,
-                StandardOpenOption.TRUNCATE_EXISTING),
-            Deflater.BEST_SPEED)) {
-      for (int i = 1; i < args.length; i++) {
-        Webfiles manifest = loadWebfilesPbtxt(Paths.get(args[i]));
-        for (WebfilesSource src : manifest.getSrcList()) {
-          if (!alreadyZipped.add(src.getWebpath())) {
-            continue;
-          }
-          try (InputStream input = Files.newInputStream(Paths.get(src.getPath()))) {
-            writer.writeWebfile(
-                WebfileInfo.newBuilder().setWebpath(src.getWebpath()).build(), input);
-          }
-        }
-      }
-    }
-  }
-
-  private static Webfiles loadWebfilesPbtxt(Path path) throws IOException {
-    Webfiles.Builder build = Webfiles.newBuilder();
-    TextFormat.getParser().merge(new String(Files.readAllBytes(path), UTF_8), build);
-    return build.build();
-  }
-}
diff --git a/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/externs.js b/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/externs.js
deleted file mode 100644
index 2e56562c1c4ea332f53f09ebdd821dd9d8c96f38..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/externs.js
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/**
- * @fileoverview Miscellaneous JSCompiler externs needed for TensorBoard.
- * @externs
- */
-
-/** @type {!Object} */ var _;
-/** @type {!Object} */ var d3;
-/** @type {!Object} */ var dagre;
-/** @type {!Object} */ var weblas;
-/** @type {!Object} */ var graphlib;
-/** @type {!Object} */ var Plottable;
-/** @type {!Object} */ var GroupEffect;
-/** @type {!Function|undefined} */ var ga;
-/** @type {!Function|undefined} */ var KeyframeEffect;
-
-/**
- * Some weird webcomponents-lite.js thing.
- * @type {!Function|undefined}
- */
-var wrap;
-
-/**
- * Some weird webcomponents-lite.js thing.
- * @type {!Function|undefined}
- */
-window.wrap;
-
-var HTMLImports;
-
-/**
- * @param {function()} callback
- * @param {!HTMLDocument=} opt_doc
- */
-HTMLImports.whenReady = function(callback, opt_doc) {};
diff --git a/tensorflow/tensorboard/main.py b/tensorflow/tensorboard/main.py
deleted file mode 100644
index 3665d02ff55beaceea1b3f44ab76c015b9785651..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/main.py
+++ /dev/null
@@ -1,222 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Serve TensorFlow summary data to a web frontend.
-
-This is a simple web server to proxy data from the event_loader to the web, and
-serve static web files.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import logging as base_logging
-import os
-import socket
-import sys
-
-import tensorflow as tf
-from werkzeug import serving
-
-
-from tensorflow.tensorboard.backend import application
-from tensorflow.tensorboard.backend.event_processing import event_file_inspector as efi
-from tensorflow.tensorboard.plugins.audio import audio_plugin
-from tensorflow.tensorboard.plugins.distributions import distributions_plugin
-from tensorflow.tensorboard.plugins.graphs import graphs_plugin
-from tensorflow.tensorboard.plugins.histograms import histograms_plugin
-from tensorflow.tensorboard.plugins.images import images_plugin
-from tensorflow.tensorboard.plugins.projector import projector_plugin
-from tensorflow.tensorboard.plugins.scalars import scalars_plugin
-from tensorflow.tensorboard.plugins.text import text_plugin
-
-# TensorBoard flags
-
-tf.flags.DEFINE_string('logdir', '', """logdir specifies the directory where
-TensorBoard will look to find TensorFlow event files that it can display.
-TensorBoard will recursively walk the directory structure rooted at logdir,
-looking for .*tfevents.* files.
-
-You may also pass a comma separated list of log directories, and TensorBoard
-will watch each directory. You can also assign names to individual log
-directories by putting a colon between the name and the path, as in
-
-tensorboard --logdir=name1:/path/to/logs/1,name2:/path/to/logs/2
-""")
-
-tf.flags.DEFINE_string(
-    'host', '', 'What host to listen to. Defaults to '
-    'serving on all interfaces, set to 127.0.0.1 (localhost) to'
-    'disable remote access (also quiets security warnings).')
-
-tf.flags.DEFINE_integer('port', 6006, 'What port to serve TensorBoard on.')
-
-tf.flags.DEFINE_boolean(
-    'purge_orphaned_data', True, 'Whether to purge data that '
-    'may have been orphaned due to TensorBoard restarts. '
-    'Disabling purge_orphaned_data can be used to debug data '
-    'disappearance.')
-
-tf.flags.DEFINE_integer('reload_interval', 5,
-                        'How often the backend should load '
-                        'more data.')
-
-# Inspect Mode flags
-
-tf.flags.DEFINE_boolean('inspect', False, """Use this flag to print out a digest
-of your event files to the command line, when no data is shown on TensorBoard or
-the data shown looks weird.
-
-Example usages:
-tensorboard --inspect --event_file=myevents.out
-tensorboard --inspect --event_file=myevents.out --tag=loss
-tensorboard --inspect --logdir=mylogdir
-tensorboard --inspect --logdir=mylogdir --tag=loss
-
-See tensorflow/python/summary/event_file_inspector.py for more info and
-detailed usage.
-""")
-tf.flags.DEFINE_string(
-    'tag', '',
-    'The particular tag to query for. Only used if --inspect is present')
-tf.flags.DEFINE_string(
-    'event_file', '',
-    'The particular event file to query for. Only used if --inspect is present '
-    'and --logdir is not specified.')
-
-FLAGS = tf.flags.FLAGS
-
-
-def create_tb_app(plugins):
-  """Read the flags, and create a TensorBoard WSGI application.
-
-  Args:
-    plugins: A list of plugins for TensorBoard to initialize.
-
-  Raises:
-    ValueError: if a logdir is not specified.
-
-  Returns:
-    A new TensorBoard WSGI application.
-  """
-  if not FLAGS.logdir:
-    raise ValueError('A logdir must be specified. Run `tensorboard --help` for '
-                     'details and examples.')
-
-  logdir = os.path.expanduser(FLAGS.logdir)
-  return application.standard_tensorboard_wsgi(
-      logdir=logdir,
-      purge_orphaned_data=FLAGS.purge_orphaned_data,
-      reload_interval=FLAGS.reload_interval,
-      plugins=plugins)
-
-
-def make_simple_server(tb_app, host, port):
-  """Create an HTTP server for TensorBoard.
-
-  Args:
-    tb_app: The TensorBoard WSGI application to create a server for.
-    host: Indicates the interfaces to bind to ('::' or '0.0.0.0' for all
-        interfaces, '::1' or '127.0.0.1' for localhost). A blank value ('')
-        indicates protocol-agnostic all interfaces.
-    port: The port to bind to (0 indicates an unused port selected by the
-        operating system).
-  Returns:
-    A tuple of (server, url):
-      server: An HTTP server object configured to host TensorBoard.
-      url: A best guess at a URL where TensorBoard will be accessible once the
-        server has been started.
-  Raises:
-    socket.error: If a server could not be constructed with the host and port
-      specified. Also logs an error message.
-  """
-  # Mute the werkzeug logging.
-  base_logging.getLogger('werkzeug').setLevel(base_logging.WARNING)
-
-  try:
-    if host:
-      # The user gave us an explicit host
-      server = serving.make_server(host, port, tb_app, threaded=True)
-      if ':' in host and not host.startswith('['):
-        # Display IPv6 addresses as [::1]:80 rather than ::1:80
-        final_host = '[{}]'.format(host)
-      else:
-        final_host = host
-    else:
-      # We've promised to bind to all interfaces on this host. However, we're
-      # not sure whether that means IPv4 or IPv6 interfaces.
-      try:
-        # First try passing in a blank host (meaning all interfaces). This,
-        # unfortunately, defaults to IPv4 even if no IPv4 interface is available
-        # (yielding a socket.error).
-        server = serving.make_server(host, port, tb_app, threaded=True)
-      except socket.error:
-        # If a blank host didn't work, we explicitly request IPv6 interfaces.
-        server = serving.make_server('::', port, tb_app, threaded=True)
-      final_host = socket.gethostname()
-    server.daemon_threads = True
-  except socket.error as socket_error:
-    if port == 0:
-      msg = 'TensorBoard unable to find any open port'
-    else:
-      msg = (
-          'TensorBoard attempted to bind to port %d, but it was already in use'
-          % FLAGS.port)
-    tf.logging.error(msg)
-    print(msg)
-    raise socket_error
-
-  final_port = server.socket.getsockname()[1]
-  tensorboard_url = 'http://%s:%d' % (final_host, final_port)
-  return server, tensorboard_url
-
-
-def run_simple_server(tb_app):
-  """Run a TensorBoard HTTP server, and print some messages to the console."""
-  try:
-    server, url = make_simple_server(tb_app, FLAGS.host, FLAGS.port)
-  except socket.error:
-    # An error message was already logged
-    exit(-1)
-  msg = 'Starting TensorBoard %s at %s' % (tb_app.tag, url)
-  print(msg)
-  tf.logging.info(msg)
-  print('(Press CTRL+C to quit)')
-  sys.stdout.flush()
-
-  server.serve_forever()
-
-
-def main(unused_argv=None):
-  if FLAGS.inspect:
-    tf.logging.info('Not bringing up TensorBoard, but inspecting event files.')
-    event_file = os.path.expanduser(FLAGS.event_file)
-    efi.inspect(FLAGS.logdir, event_file, FLAGS.tag)
-    return 0
-  else:
-    plugins = [
-        scalars_plugin.ScalarsPlugin(),
-        images_plugin.ImagesPlugin(),
-        audio_plugin.AudioPlugin(),
-        graphs_plugin.GraphsPlugin(),
-        distributions_plugin.DistributionsPlugin(),
-        histograms_plugin.HistogramsPlugin(),
-        projector_plugin.ProjectorPlugin(),
-        text_plugin.TextPlugin(),
-    ]
-    tb = create_tb_app(plugins)
-    run_simple_server(tb)
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tensorflow/tensorboard/plugins/BUILD b/tensorflow/tensorboard/plugins/BUILD
deleted file mode 100644
index 7d3a96c0e1543f5398a0ff1120e8c636abfad369..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/BUILD
+++ /dev/null
@@ -1,20 +0,0 @@
-# Description:
-# A plugin system for TensorBoard
-
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_library(
-    name = "base_plugin",
-    srcs = ["base_plugin.py"],
-    srcs_version = "PY2AND3",
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    visibility = ["//tensorflow:__pkg__"],
-)
diff --git a/tensorflow/tensorboard/plugins/audio/BUILD b/tensorflow/tensorboard/plugins/audio/BUILD
deleted file mode 100644
index 372aa3067c9584d32c0c62f1b9325bc3fd2751e7..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/audio/BUILD
+++ /dev/null
@@ -1,56 +0,0 @@
-# Description:
-# TensorBoard plugin for audio
-
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_library(
-    name = "audio_plugin",
-    srcs = ["audio_plugin.py"],
-    srcs_version = "PY2AND3",
-    visibility = [
-        "//tensorflow/tensorboard:internal",
-    ],
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/tensorboard/backend:http_util",
-        "//tensorflow/tensorboard/backend/event_processing:event_accumulator",
-        "//tensorflow/tensorboard/plugins:base_plugin",
-        "@org_pocoo_werkzeug//:werkzeug",
-        "@six_archive//:six",
-    ],
-)
-
-py_test(
-    name = "audio_plugin_test",
-    size = "small",
-    srcs = ["audio_plugin_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":audio_plugin",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/tensorboard/backend:application",
-        "//tensorflow/tensorboard/backend/event_processing:event_multiplexer",
-        "@org_pocoo_werkzeug//:werkzeug",
-        "@six_archive//:six",
-    ],
-)
-
-py_binary(
-    name = "audio_demo",
-    srcs = ["audio_demo.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "@six_archive//:six",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    visibility = ["//tensorflow:__pkg__"],
-)
diff --git a/tensorflow/tensorboard/plugins/audio/audio_demo.py b/tensorflow/tensorboard/plugins/audio/audio_demo.py
deleted file mode 100644
index b89310d3a8aa8c894546ab1168382986efe21fb9..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/audio/audio_demo.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Sample data exhibiting audio summaries, via a waveform generator."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import os.path
-
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-
-
-FLAGS = tf.flags.FLAGS
-
-tf.flags.DEFINE_string('logdir', '/tmp/audio_demo',
-                       'Directory into which to write TensorBoard data.')
-
-tf.flags.DEFINE_integer('steps', 500,
-                        'Number of frequencies of each waveform to generate.')
-
-# Parameters for the audio output.
-tf.flags.DEFINE_integer('sample_rate', 44100, 'Sample rate, in Hz.')
-tf.flags.DEFINE_float('duration', 2.0, 'Duration of each waveform, in s.')
-
-
-def _samples():
-  """Compute how many samples should be included in each waveform."""
-  return int(FLAGS.sample_rate * FLAGS.duration)
-
-
-def run(logdir, run_name, wave_name, wave_constructor):
-  """Generate wave data of the given form.
-
-  The provided function `wave_constructor` should accept a scalar tensor
-  of type float32, representing the frequency (in Hz) at which to
-  construct a wave, and return a tensor of shape [1, _samples(), `n`]
-  representing audio data (for some number of channels `n`).
-
-  Waves will be generated at frequencies ranging from A4 to A5.
-
-  Arguments:
-    logdir: the top-level directory into which to write summary data
-    run_name: the name of this run; will be created as a subdirectory
-      under logdir
-    wave_name: the name of the wave being generated
-    wave_constructor: see above
-  """
-  tf.reset_default_graph()
-  tf.set_random_seed(0)
-
-  # On each step `i`, we'll set this placeholder to `i`. This allows us
-  # to know "what time it is" at each step.
-  step_placeholder = tf.placeholder(tf.float32, shape=[])
-
-  # We want to linearly interpolate a frequency between A4 (440 Hz) and
-  # A5 (880 Hz).
-  f_min = 440.0
-  f_max = 880.0
-  t = step_placeholder / (FLAGS.steps - 1)
-  frequency = f_min * (1.0 - t) + f_max * t
-
-  # Let's log this frequency, just so that we can make sure that it's as
-  # expected.
-  tf.summary.scalar('frequency', frequency)
-
-  # Now, we pass this to the wave constructor to get our waveform. Doing
-  # so within a name scope means that any summaries that the wave
-  # constructor produces will be namespaced.
-  with tf.name_scope(wave_name):
-    waveform = wave_constructor(frequency)
-
-  # Here's the crucial piece: we interpret this result as audio.
-  tf.summary.audio('waveform', waveform, FLAGS.sample_rate)
-
-  # Now, we can collect up all the summaries and begin the run.
-  summ = tf.summary.merge_all()
-
-  sess = tf.Session()
-  writer = tf.summary.FileWriter(os.path.join(logdir, run_name))
-  writer.add_graph(sess.graph)
-  sess.run(tf.global_variables_initializer())
-  for step in xrange(FLAGS.steps):
-    s = sess.run(summ, feed_dict={step_placeholder: float(step)})
-    writer.add_summary(s, global_step=step)
-  writer.close()
-
-
-# Now, let's take a look at the kinds of waves that we can generate.
-
-
-def sine_wave(frequency):
-  """Emit a sine wave at the given frequency."""
-  xs = tf.reshape(tf.range(_samples(), dtype=tf.float32), [1, _samples(), 1])
-  ts = xs / FLAGS.sample_rate
-  return tf.sin(2 * math.pi * frequency * ts)
-
-
-def square_wave(frequency):
-  """Emit a square wave at the given frequency."""
-  # The square is just the sign of the sine!
-  return tf.sign(sine_wave(frequency))
-
-
-def triangle_wave(frequency):
-  """Emit a triangle wave at the given frequency."""
-  xs = tf.reshape(tf.range(_samples(), dtype=tf.float32), [1, _samples(), 1])
-  ts = xs / FLAGS.sample_rate
-  #
-  # A triangle wave looks like this:
-  #
-  #      /\      /\
-  #     /  \    /  \
-  #         \  /    \  /
-  #          \/      \/
-  #
-  # If we look at just half a period (the first four slashes in the
-  # diagram above), we can see that it looks like a transformed absolute
-  # value function.
-  #
-  # Let's start by computing the times relative to the start of each
-  # half-wave pulse (each individual "mountain" or "valley", of which
-  # there are four in the above diagram).
-  half_pulse_index = ts * (frequency * 2)
-  half_pulse_angle = half_pulse_index % 1.0  # in [0, 1]
-  #
-  # Now, we can see that each positive half-pulse ("mountain") has
-  # amplitude given by A(z) = 0.5 - abs(z - 0.5), and then normalized:
-  absolute_amplitude = (0.5 - tf.abs(half_pulse_angle - 0.5)) / 0.5
-  #
-  # But every other half-pulse is negative, so we should invert these.
-  half_pulse_parity = tf.sign(1 - (half_pulse_index % 2.0))
-  amplitude = half_pulse_parity * absolute_amplitude
-  #
-  # This is precisely the desired result, so we're done!
-  return amplitude
-
-
-# If we want to get fancy, we can use our above waves as primitives to
-# build more interesting waves.
-
-
-def bisine_wave(frequency):
-  """Emit two sine waves, in stereo at different octaves."""
-  #
-  # We can first our existing sine generator to generate two different
-  # waves.
-  f_hi = frequency
-  f_lo = frequency / 2.0
-  with tf.name_scope('hi'):
-    sine_hi = sine_wave(f_hi)
-  with tf.name_scope('lo'):
-    sine_lo = sine_wave(f_lo)
-  #
-  # Now, we have two tensors of shape [1, _samples(), 1]. By concatenating
-  # them along axis 2, we get a tensor of shape [1, _samples(), 2]---a
-  # stereo waveform.
-  return tf.concat([sine_lo, sine_hi], axis=2)
-
-
-def bisine_wahwah_wave(frequency):
-  """Emit two sine waves with balance oscillating left and right."""
-  #
-  # This is clearly intended to build on the bisine wave defined above,
-  # so we can start by generating that.
-  waves_a = bisine_wave(frequency)
-  #
-  # Then, by reversing axis 2, we swap the stereo channels. By mixing
-  # this with `waves_a`, we'll be able to create the desired effect.
-  waves_b = tf.reverse(waves_a, axis=[2])
-  #
-  # Let's have the balance oscillate from left to right four times.
-  iterations = 4
-  #
-  # Now, we compute the balance for each sample: `ts` has values
-  # in [0, 1] that indicate how much we should use `waves_a`.
-  xs = tf.reshape(tf.range(_samples(), dtype=tf.float32), [1, _samples(), 1])
-  thetas = xs / _samples() * iterations
-  ts = (tf.sin(math.pi * 2 * thetas) + 1) / 2
-  #
-  # Finally, we can mix the two together, and we're done.
-  return ts * waves_a + (1.0 - ts) * waves_b
-
-
-def run_all(logdir, verbose=False):
-  """Generate waves of the shapes defined above.
-
-  Arguments:
-    logdir: the directory into which to store all the runs' data
-    verbose: if true, print out each run's name as it begins
-  """
-  waves = [sine_wave, square_wave, triangle_wave,
-           bisine_wave, bisine_wahwah_wave]
-  for (i, wave_constructor) in enumerate(waves):
-    wave_name = wave_constructor.__name__
-    run_name = 'wave:%02d,%s' % (i + 1, wave_name)
-    if verbose:
-      print('--- Running: %s' % run_name)
-    run(logdir, run_name, wave_name, wave_constructor)
-
-
-def main(unused_argv):
-  print('Saving output to %s.' % FLAGS.logdir)
-  run_all(FLAGS.logdir, verbose=True)
-  print('Done. Output saved to %s.' % FLAGS.logdir)
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tensorflow/tensorboard/plugins/audio/audio_plugin.py b/tensorflow/tensorboard/plugins/audio/audio_plugin.py
deleted file mode 100644
index ee63b67637d980f3047056a5d9528f38fa604ef2..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/audio/audio_plugin.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""The TensorBoard Audio plugin."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from six.moves import urllib
-from werkzeug import wrappers
-
-from tensorflow.tensorboard.backend import http_util
-from tensorflow.tensorboard.backend.event_processing import event_accumulator
-from tensorflow.tensorboard.plugins import base_plugin
-
-_PLUGIN_PREFIX_ROUTE = event_accumulator.AUDIO
-
-
-class AudioPlugin(base_plugin.TBPlugin):
-  """Audio Plugin for TensorBoard."""
-
-  plugin_name = _PLUGIN_PREFIX_ROUTE
-
-  def get_plugin_apps(self, multiplexer, unused_logdir):
-    self._multiplexer = multiplexer
-    return {
-        '/audio': self._serve_audio_metadata,
-        '/individualAudio': self._serve_individual_audio,
-        '/tags': self._serve_tags,
-    }
-
-  def is_active(self):
-    """The audio plugin is active iff any run has at least one relevant tag."""
-    return any(self.index_impl().values())
-
-  def _index_impl(self):
-    return {
-        run_name: run_data[event_accumulator.AUDIO]
-        for (run_name, run_data) in self._multiplexer.Runs().items()
-        if event_accumulator.AUDIO in run_data
-    }
-
-  @wrappers.Request.application
-  def _serve_audio_metadata(self, request):
-    """Given a tag and list of runs, serve a list of metadata for audio.
-
-    Note that the audio themselves are not sent; instead, we respond with URLs
-    to the audio. The frontend should treat these URLs as opaque and should not
-    try to parse information about them or generate them itself, as the format
-    may change.
-
-    Args:
-      request: A werkzeug.wrappers.Request object.
-
-    Returns:
-      A werkzeug.Response application.
-    """
-    tag = request.args.get('tag')
-    run = request.args.get('run')
-
-    audio_list = self._multiplexer.Audio(run, tag)
-    response = self._audio_response_for_run(audio_list, run, tag)
-    return http_util.Respond(request, response, 'application/json')
-
-  def _audio_response_for_run(self, run_audio, run, tag):
-    """Builds a JSON-serializable object with information about run_audio.
-
-    Args:
-      run_audio: A list of event_accumulator.AudioValueEvent objects.
-      run: The name of the run.
-      tag: The name of the tag the audio entries all belong to.
-
-    Returns:
-      A list of dictionaries containing the wall time, step, URL, width, and
-      height for each audio entry.
-    """
-    response = []
-    for index, run_audio_clip in enumerate(run_audio):
-      response.append({
-          'wall_time': run_audio_clip.wall_time,
-          'step': run_audio_clip.step,
-          'content_type': run_audio_clip.content_type,
-          'query': self._query_for_individual_audio(run, tag, index)
-      })
-    return response
-
-  def _query_for_individual_audio(self, run, tag, index):
-    """Builds a URL for accessing the specified audio.
-
-    This should be kept in sync with _serve_audio_metadata. Note that the URL is
-    *not* guaranteed to always return the same audio, since audio may be
-    unloaded from the reservoir as new audio entries come in.
-
-    Args:
-      run: The name of the run.
-      tag: The tag.
-      index: The index of the audio entry. Negative values are OK.
-
-    Returns:
-      A string representation of a URL that will load the index-th sampled audio
-      in the given run with the given tag.
-    """
-    query_string = urllib.parse.urlencode({
-        'run': run,
-        'tag': tag,
-        'index': index
-    })
-    return query_string
-
-  @wrappers.Request.application
-  def _serve_individual_audio(self, request):
-    """Serves an individual audio entry."""
-    tag = request.args.get('tag')
-    run = request.args.get('run')
-    index = int(request.args.get('index'))
-    audio = self._multiplexer.Audio(run, tag)[index]
-    return http_util.Respond(
-        request, audio.encoded_audio_string, audio.content_type)
-
-  @wrappers.Request.application
-  def _serve_tags(self, request):
-    index = self._index_impl()
-    return http_util.Respond(request, index, 'application/json')
diff --git a/tensorflow/tensorboard/plugins/audio/audio_plugin_test.py b/tensorflow/tensorboard/plugins/audio/audio_plugin_test.py
deleted file mode 100644
index 961691086e1d9f613897440d6a49ed1f40febf74..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/audio/audio_plugin_test.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests the Tensorboard audio plugin."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import json
-import os
-import shutil
-import tempfile
-
-import numpy
-from six.moves import urllib
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-from werkzeug import test as werkzeug_test
-from werkzeug import wrappers
-
-from tensorflow.tensorboard.backend import application
-from tensorflow.tensorboard.backend.event_processing import event_multiplexer
-from tensorflow.tensorboard.plugins.audio import audio_plugin
-
-
-class AudioPluginTest(tf.test.TestCase):
-
-  def setUp(self):
-    self.log_dir = tempfile.mkdtemp()
-
-    # We use numpy.random to generate audio. We seed to avoid non-determinism
-    # in this test.
-    numpy.random.seed(42)
-
-    # Create audio summaries for run foo.
-    tf.reset_default_graph()
-    sess = tf.Session()
-    placeholder = tf.placeholder(tf.float32)
-    tf.summary.audio(name="baz", tensor=placeholder, sample_rate=44100)
-    merged_summary_op = tf.summary.merge_all()
-    foo_directory = os.path.join(self.log_dir, "foo")
-    writer = tf.summary.FileWriter(foo_directory)
-    writer.add_graph(sess.graph)
-    for step in xrange(2):
-      # The floats (sample data) range from -1 to 1.
-      writer.add_summary(sess.run(merged_summary_op, feed_dict={
-          placeholder: numpy.random.rand(42, 22050) * 2 - 1
-      }), global_step=step)
-    writer.close()
-
-    # Create audio summaries for run bar.
-    tf.reset_default_graph()
-    sess = tf.Session()
-    placeholder = tf.placeholder(tf.float32)
-    tf.summary.audio(name="quux", tensor=placeholder, sample_rate=44100)
-    merged_summary_op = tf.summary.merge_all()
-    bar_directory = os.path.join(self.log_dir, "bar")
-    writer = tf.summary.FileWriter(bar_directory)
-    writer.add_graph(sess.graph)
-    for step in xrange(2):
-      # The floats (sample data) range from -1 to 1.
-      writer.add_summary(sess.run(merged_summary_op, feed_dict={
-          placeholder: numpy.random.rand(42, 11025) * 2 - 1
-      }), global_step=step)
-    writer.close()
-
-    # Start a server with the plugin.
-    multiplexer = event_multiplexer.EventMultiplexer({
-        "foo": foo_directory,
-        "bar": bar_directory,
-    })
-    plugin = audio_plugin.AudioPlugin()
-    wsgi_app = application.TensorBoardWSGIApp(
-        self.log_dir, [plugin], multiplexer, reload_interval=0)
-    self.server = werkzeug_test.Client(wsgi_app, wrappers.BaseResponse)
-    self.routes = plugin.get_plugin_apps(multiplexer, self.log_dir)
-
-  def tearDown(self):
-    shutil.rmtree(self.log_dir, ignore_errors=True)
-
-  def _DeserializeResponse(self, byte_content):
-    """Deserializes byte content that is a JSON encoding.
-
-    Args:
-      byte_content: The byte content of a response.
-
-    Returns:
-      The deserialized python object decoded from JSON.
-    """
-    return json.loads(byte_content.decode("utf-8"))
-
-  def testRoutesProvided(self):
-    """Tests that the plugin offers the correct routes."""
-    self.assertIsInstance(self.routes["/audio"], collections.Callable)
-    self.assertIsInstance(self.routes["/individualAudio"], collections.Callable)
-    self.assertIsInstance(self.routes["/tags"], collections.Callable)
-
-  def testAudioRoute(self):
-    """Tests that the /audio routes returns with the correct data."""
-    response = self.server.get(
-        "/data/plugin/audio/audio?run=foo&tag=baz/audio/0")
-    self.assertEqual(200, response.status_code)
-
-    # Verify that the correct entries are returned.
-    entries = self._DeserializeResponse(response.get_data())
-    self.assertEqual(2, len(entries))
-
-    # Verify that the 1st entry is correct.
-    entry = entries[0]
-    self.assertEqual(0, entry["step"])
-    parsed_query = urllib.parse.parse_qs(entry["query"])
-    self.assertListEqual(["0"], parsed_query["index"])
-    self.assertListEqual(["foo"], parsed_query["run"])
-    self.assertListEqual(["baz/audio/0"], parsed_query["tag"])
-
-    # Verify that the 2nd entry is correct.
-    entry = entries[1]
-    self.assertEqual(1, entry["step"])
-    parsed_query = urllib.parse.parse_qs(entry["query"])
-    self.assertListEqual(["1"], parsed_query["index"])
-    self.assertListEqual(["foo"], parsed_query["run"])
-    self.assertListEqual(["baz/audio/0"], parsed_query["tag"])
-
-  def testIndividualAudioRoute(self):
-    """Tests fetching an individual audio."""
-    response = self.server.get(
-        "/data/plugin/audio/individualAudio?run=bar&tag=quux/audio/0&index=0")
-    self.assertEqual(200, response.status_code)
-    self.assertEqual("audio/wav", response.headers.get("content-type"))
-
-  def testRunsRoute(self):
-    """Tests that the /runs route offers the correct run to tag mapping."""
-    response = self.server.get("/data/plugin/audio/tags")
-    self.assertEqual(200, response.status_code)
-    run_to_tags = self._DeserializeResponse(response.get_data())
-    self.assertItemsEqual(("foo", "bar"), run_to_tags.keys())
-    self.assertItemsEqual(
-        ["baz/audio/0", "baz/audio/1", "baz/audio/2"], run_to_tags["foo"])
-    self.assertItemsEqual(
-        ["quux/audio/0", "quux/audio/1", "quux/audio/2"], run_to_tags["bar"])
-
-
-if __name__ == "__main__":
-  tf.test.main()
diff --git a/tensorflow/tensorboard/plugins/base_plugin.py b/tensorflow/tensorboard/plugins/base_plugin.py
deleted file mode 100644
index 01ab06e26d3d0c52c761d573ff64de739e33c363..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/base_plugin.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""TensorBoard Plugin abstract base class.
-
-Every plugin in TensorBoard must extend and implement the abstract methods of
-this base class.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from abc import ABCMeta
-from abc import abstractmethod
-
-
-class TBPlugin(object):
-  """TensorBoard plugin interface. Every plugin must extend from this class."""
-  __metaclass__ = ABCMeta
-
-  # The plugin_name will also be a prefix in the http handlers generated by
-  # the plugin, e.g. `data/plugins/$PLUGIN_NAME/$HANDLER`
-  # The plugin name must be unique for each registered plugin, or
-  # a ValueError will be thrown when the application is constructed. The
-  # plugin name must only contain characters among [A-Za-z0-9_.-], and
-  # must be nonempty, or a ValueError will similarly be thrown.
-  plugin_name = None
-
-  @abstractmethod
-  def get_plugin_apps(self, multiplexer, logdir):
-    """Returns a set of WSGI applications that the plugin implements.
-
-    Each application gets registered with the tensorboard app and is served
-    under a prefix path that includes the name of the plugin.
-
-    Args:
-      multiplexer: The event_multiplexer with underlying TB data.
-      logdir: The logging directory TensorBoard was started with.
-
-    Returns:
-      A dict mapping route paths to WSGI applications. Each route path
-      should include a leading slash.
-    """
-    raise NotImplementedError()
-
-  @abstractmethod
-  def is_active(self):
-    """Determines whether this plugin is active.
-
-    A plugin may not be active for instance if it lacks relevant data. If a
-    plugin is inactive, the frontend may avoid issuing requests to its routes.
-
-    Returns:
-      A boolean value. Whether this plugin is active.
-    """
-    raise NotImplementedError()
diff --git a/tensorflow/tensorboard/plugins/distributions/BUILD b/tensorflow/tensorboard/plugins/distributions/BUILD
deleted file mode 100644
index 3ce765020e0c5f508e0510a9d12e3323fea313d2..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/distributions/BUILD
+++ /dev/null
@@ -1,48 +0,0 @@
-# Description:
-# TensorBoard plugin for distributions
-
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-## Distributions Plugin ##
-py_library(
-    name = "distributions_plugin",
-    srcs = ["distributions_plugin.py"],
-    srcs_version = "PY2AND3",
-    visibility = [
-        "//tensorflow/tensorboard:internal",
-    ],
-    deps = [
-        "//tensorflow/tensorboard/backend:http_util",
-        "//tensorflow/tensorboard/backend/event_processing:event_accumulator",
-        "//tensorflow/tensorboard/plugins:base_plugin",
-        "@org_pocoo_werkzeug//:werkzeug",
-        "@six_archive//:six",
-    ],
-)
-
-py_test(
-    name = "distributions_plugin_test",
-    size = "small",
-    srcs = ["distributions_plugin_test.py"],
-    main = "distributions_plugin_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
-        ":distributions_plugin",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/tensorboard/backend:application",
-        "//tensorflow/tensorboard/backend/event_processing:event_accumulator",
-        "//tensorflow/tensorboard/backend/event_processing:event_multiplexer",
-        "@org_pocoo_werkzeug//:werkzeug",
-        "@six_archive//:six",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    visibility = ["//tensorflow:__pkg__"],
-)
diff --git a/tensorflow/tensorboard/plugins/distributions/distributions_plugin.py b/tensorflow/tensorboard/plugins/distributions/distributions_plugin.py
deleted file mode 100644
index 4bb9dfaf54592698df455e57625500ba30ff95a7..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/distributions/distributions_plugin.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""The TensorBoard Distributions (a.k.a. compressed histograms) plugin."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from werkzeug import wrappers
-
-from tensorflow.tensorboard.backend import http_util
-from tensorflow.tensorboard.backend.event_processing import event_accumulator
-from tensorflow.tensorboard.plugins import base_plugin
-
-_PLUGIN_PREFIX_ROUTE = event_accumulator.COMPRESSED_HISTOGRAMS
-
-
-class DistributionsPlugin(base_plugin.TBPlugin):
-  """Distributions Plugin for TensorBoard."""
-
-  plugin_name = _PLUGIN_PREFIX_ROUTE
-
-  def get_plugin_apps(self, multiplexer, unused_logdir):
-    self._multiplexer = multiplexer
-    return {
-        '/distributions': self.distributions_route,
-        '/tags': self.tags_route,
-    }
-
-  def is_active(self):
-    """This plugin is active iff any run has at least one relevant tag."""
-    return any(self.index_impl().values())
-
-  def index_impl(self):
-    return {
-        run_name: run_data[event_accumulator.COMPRESSED_HISTOGRAMS]
-        for (run_name, run_data) in self._multiplexer.Runs().items()
-        if event_accumulator.COMPRESSED_HISTOGRAMS in run_data
-    }
-
-  def distributions_impl(self, tag, run):
-    """Result of the form `(body, mime_type)`."""
-    values = self._multiplexer.CompressedHistograms(run, tag)
-    return (values, 'application/json')
-
-  @wrappers.Request.application
-  def tags_route(self, request):
-    index = self.index_impl()
-    return http_util.Respond(request, index, 'application/json')
-
-  @wrappers.Request.application
-  def distributions_route(self, request):
-    """Given a tag and single run, return array of compressed histograms."""
-    tag = request.args.get('tag')
-    run = request.args.get('run')
-    (body, mime_type) = self.distributions_impl(tag, run)
-    return http_util.Respond(request, body, mime_type)
diff --git a/tensorflow/tensorboard/plugins/distributions/distributions_plugin_test.py b/tensorflow/tensorboard/plugins/distributions/distributions_plugin_test.py
deleted file mode 100644
index b5aae6dea796a932f16db91a68a401a988c975ea..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/distributions/distributions_plugin_test.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Integration tests for the Distributions Plugin."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os.path
-
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-
-from tensorflow.tensorboard.backend.event_processing import event_accumulator
-from tensorflow.tensorboard.backend.event_processing import event_multiplexer
-from tensorflow.tensorboard.plugins.distributions import distributions_plugin
-
-
-class DistributionsPluginTest(tf.test.TestCase):
-
-  _STEPS = 99
-
-  _DISTRIBUTION_TAG = 'my-favorite-distribution'
-  _SCALAR_TAG = 'my-boring-scalars'
-
-  _RUN_WITH_DISTRIBUTION = '_RUN_WITH_DISTRIBUTION'
-  _RUN_WITH_SCALARS = '_RUN_WITH_SCALARS'
-
-  def set_up_with_runs(self, run_names):
-    self.logdir = self.get_temp_dir()
-    for run_name in run_names:
-      self.generate_run(run_name)
-    multiplexer = event_multiplexer.EventMultiplexer(size_guidance={
-        # don't truncate my test data, please
-        event_accumulator.COMPRESSED_HISTOGRAMS:
-            self._STEPS,
-    })
-    multiplexer.AddRunsFromDirectory(self.logdir)
-    multiplexer.Reload()
-    self.plugin = distributions_plugin.DistributionsPlugin()
-    self.apps = self.plugin.get_plugin_apps(multiplexer, None)
-
-  def generate_run(self, run_name):
-    if run_name == self._RUN_WITH_DISTRIBUTION:
-      (use_distributions, use_scalars) = (True, False)
-    elif run_name == self._RUN_WITH_SCALARS:
-      (use_distributions, use_scalars) = (False, True)
-    else:
-      assert False, 'Invalid run name: %r' % run_name
-    tf.reset_default_graph()
-    sess = tf.Session()
-    placeholder = tf.placeholder(tf.float32, shape=[3])
-    if use_distributions:
-      tf.summary.histogram(self._DISTRIBUTION_TAG, placeholder)
-    if use_scalars:
-      tf.summary.scalar(self._SCALAR_TAG, tf.reduce_mean(placeholder))
-    summ = tf.summary.merge_all()
-
-    subdir = os.path.join(self.logdir, run_name)
-    writer = tf.summary.FileWriter(subdir)
-    writer.add_graph(sess.graph)
-    for step in xrange(self._STEPS):
-      feed_dict = {placeholder: [1 + step, 2 + step, 3 + step]}
-      s = sess.run(summ, feed_dict=feed_dict)
-      writer.add_summary(s, global_step=step)
-    writer.close()
-
-  def test_index(self):
-    self.set_up_with_runs([self._RUN_WITH_DISTRIBUTION,
-                           self._RUN_WITH_SCALARS])
-    self.assertEqual({
-        self._RUN_WITH_DISTRIBUTION: [self._DISTRIBUTION_TAG],
-        self._RUN_WITH_SCALARS: [],
-    }, self.plugin.index_impl())
-
-  def _test_distributions_json(self, run_name, should_have_distributions):
-    self.set_up_with_runs([self._RUN_WITH_DISTRIBUTION,
-                           self._RUN_WITH_SCALARS])
-    if should_have_distributions:
-      (data, mime_type) = self.plugin.distributions_impl(
-          self._DISTRIBUTION_TAG, run_name)
-      self.assertEqual('application/json', mime_type)
-      self.assertEqual(len(data), self._STEPS)
-      for i in xrange(self._STEPS):
-        self.assertEqual(i, data[i].step)
-    else:
-      with self.assertRaises(KeyError):
-        self.plugin.distributions_impl(
-            self._DISTRIBUTION_TAG, run_name)
-
-  def test_distributions_json_with_scalars(self):
-    self._test_distributions_json(self._RUN_WITH_DISTRIBUTION, True)
-
-  def test_distributions_json_with_histogram(self):
-    self._test_distributions_json(self._RUN_WITH_SCALARS, False)
-
-  def test_active_with_distribution(self):
-    self.set_up_with_runs([self._RUN_WITH_DISTRIBUTION])
-    self.assertTrue(self.plugin.is_active())
-
-  def test_active_with_scalars(self):
-    self.set_up_with_runs([self._RUN_WITH_SCALARS])
-    self.assertFalse(self.plugin.is_active())
-
-  def test_active_with_both(self):
-    self.set_up_with_runs([self._RUN_WITH_DISTRIBUTION,
-                           self._RUN_WITH_SCALARS])
-    self.assertTrue(self.plugin.is_active())
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/tensorboard/plugins/graphs/BUILD b/tensorflow/tensorboard/plugins/graphs/BUILD
deleted file mode 100644
index 9ec659202ac11712ba7b7f8831c2b8a5ee83114e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/graphs/BUILD
+++ /dev/null
@@ -1,49 +0,0 @@
-# Description:
-# TensorBoard plugin for graphs
-
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-## Graphs Plugin ##
-py_library(
-    name = "graphs_plugin",
-    srcs = ["graphs_plugin.py"],
-    srcs_version = "PY2AND3",
-    visibility = [
-        "//tensorflow/tensorboard:internal",
-    ],
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/tensorboard/backend:http_util",
-        "//tensorflow/tensorboard/backend:process_graph",
-        "//tensorflow/tensorboard/backend/event_processing:event_accumulator",
-        "//tensorflow/tensorboard/plugins:base_plugin",
-        "@org_pocoo_werkzeug//:werkzeug",
-        "@six_archive//:six",
-    ],
-)
-
-py_test(
-    name = "graphs_plugin_test",
-    size = "small",
-    srcs = ["graphs_plugin_test.py"],
-    main = "graphs_plugin_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
-        ":graphs_plugin",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/tensorboard/backend:application",
-        "//tensorflow/tensorboard/backend/event_processing:event_multiplexer",
-        "@org_pocoo_werkzeug//:werkzeug",
-        "@six_archive//:six",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    visibility = ["//tensorflow:__pkg__"],
-)
diff --git a/tensorflow/tensorboard/plugins/graphs/graphs_plugin.py b/tensorflow/tensorboard/plugins/graphs/graphs_plugin.py
deleted file mode 100644
index 7fdbf9903dbf8c7e689f87053b5085a55ec1698b..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/graphs/graphs_plugin.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""The TensorBoard Graphs plugin."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from werkzeug import wrappers
-
-from tensorflow.tensorboard.backend import http_util
-from tensorflow.tensorboard.backend import process_graph
-from tensorflow.tensorboard.backend.event_processing import event_accumulator
-from tensorflow.tensorboard.plugins import base_plugin
-
-_PLUGIN_PREFIX_ROUTE = 'graphs'
-
-
-class GraphsPlugin(base_plugin.TBPlugin):
-  """Graphs Plugin for TensorBoard."""
-
-  plugin_name = _PLUGIN_PREFIX_ROUTE
-
-  def get_plugin_apps(self, multiplexer, unused_logdir):
-    self._multiplexer = multiplexer
-    return {
-        '/graph': self.graph_route,
-        '/runs': self.runs_route,
-        '/run_metadata': self.run_metadata_route,
-        '/run_metadata_tags': self.run_metadata_tags_route,
-    }
-
-  def is_active(self):
-    """The graphs plugin is active iff any run has a graph."""
-    return bool(self.index_impl())
-
-  def index_impl(self):
-    """Returns a list of all runs that have a graph."""
-    return [run_name
-            for (run_name, run_data) in self._multiplexer.Runs().items()
-            if run_data.get(event_accumulator.GRAPH)]
-
-  def run_metadata_index_impl(self):
-    """Returns a run-to-tag mapping for metadata."""
-    return {
-        run_name: run_data[event_accumulator.RUN_METADATA]
-        for (run_name, run_data) in self._multiplexer.Runs().items()
-        if event_accumulator.RUN_METADATA in run_data
-    }
-
-  def graph_impl(self, run, limit_attr_size=None, large_attrs_key=None):
-    """Result of the form `(body, mime_type)`, or `None` if no graph exists."""
-    try:
-      graph = self._multiplexer.Graph(run)
-    except ValueError:
-      return None
-    # This next line might raise a ValueError if the limit parameters
-    # are invalid (size is negative, size present but key absent, etc.).
-    process_graph.prepare_graph_for_ui(graph, limit_attr_size, large_attrs_key)
-    return (str(graph), 'text/x-protobuf')  # pbtxt
-
-  def run_metadata_impl(self, run, tag):
-    """Result of the form `(body, mime_type)`, or `None` if no data exists."""
-    try:
-      run_metadata = self._multiplexer.RunMetadata(run, tag)
-    except ValueError:
-      return None
-    return (str(run_metadata), 'text/x-protobuf')  # pbtxt
-
-  @wrappers.Request.application
-  def runs_route(self, request):
-    index = self.index_impl()
-    return http_util.Respond(request, index, 'application/json')
-
-  @wrappers.Request.application
-  def run_metadata_tags_route(self, request):
-    index = self.run_metadata_index_impl()
-    return http_util.Respond(request, index, 'application/json')
-
-  @wrappers.Request.application
-  def graph_route(self, request):
-    """Given a single run, return the graph definition in protobuf format."""
-    run = request.args.get('run')
-    if run is None:
-      return http_util.Respond(
-          request, 'query parameter "run" is required', 'text/plain', 400)
-
-    limit_attr_size = request.args.get('limit_attr_size', None)
-    if limit_attr_size is not None:
-      try:
-        limit_attr_size = int(limit_attr_size)
-      except ValueError:
-        return http_util.Respond(
-            request, 'query parameter `limit_attr_size` must be an integer',
-            'text/plain', 400)
-
-    large_attrs_key = request.args.get('large_attrs_key', None)
-
-    try:
-      result = self.graph_impl(run, limit_attr_size, large_attrs_key)
-    except ValueError as e:
-      return http_util.Respond(request, e.message, 'text/plain', code=400)
-    else:
-      if result is not None:
-        (body, mime_type) = result  # pylint: disable=unpacking-non-sequence
-        return http_util.Respond(request, body, mime_type)
-      else:
-        return http_util.Respond(request, '404 Not Found', 'text/plain',
-                                 code=404)
-
-  @wrappers.Request.application
-  def run_metadata_route(self, request):
-    """Given a tag and a run, return the session.run() metadata."""
-    tag = request.args.get('tag')
-    run = request.args.get('run')
-    if tag is None:
-      return http_util.Respond(
-          request, 'query parameter "tag" is required', 'text/plain', 400)
-    if run is None:
-      return http_util.Respond(
-          request, 'query parameter "run" is required', 'text/plain', 400)
-    result = self.run_metadata_impl(run, tag)
-    if result is not None:
-      (body, mime_type) = result  # pylint: disable=unpacking-non-sequence
-      return http_util.Respond(request, body, mime_type)
-    else:
-      return http_util.Respond(request, '404 Not Found', 'text/plain',
-                               code=404)
diff --git a/tensorflow/tensorboard/plugins/graphs/graphs_plugin_test.py b/tensorflow/tensorboard/plugins/graphs/graphs_plugin_test.py
deleted file mode 100644
index db4d0cb1b3c27574e12bca36c9925771699664a6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/graphs/graphs_plugin_test.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Integration tests for the Graphs Plugin."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import os.path
-
-import tensorflow as tf
-
-from google.protobuf import text_format
-from tensorflow.tensorboard.backend.event_processing import event_multiplexer
-from tensorflow.tensorboard.plugins.graphs import graphs_plugin
-
-
-class GraphsPluginTest(tf.test.TestCase):
-
-  _RUN_WITH_GRAPH = '_RUN_WITH_GRAPH'
-  _RUN_WITHOUT_GRAPH = '_RUN_WITHOUT_GRAPH'
-
-  _METADATA_TAG = 'secret-stats'
-  _MESSAGE_PREFIX_LENGTH_LOWER_BOUND = 1024
-
-  def generate_run(self, run_name, include_graph):
-    """Create a run with a text summary, metadata, and optionally a graph."""
-    tf.reset_default_graph()
-    k1 = tf.constant(math.pi, name='k1')
-    k2 = tf.constant(math.e, name='k2')
-    result = (k1 ** k2) - k1
-    expected = tf.constant(20.0, name='expected')
-    error = tf.abs(result - expected, name='error')
-    message_prefix_value = 'error ' * 1000
-    true_length = len(message_prefix_value)
-    assert true_length > self._MESSAGE_PREFIX_LENGTH_LOWER_BOUND, true_length
-    message_prefix = tf.constant(message_prefix_value, name='message_prefix')
-    error_message = tf.string_join([message_prefix,
-                                    tf.as_string(error, name='error_string')],
-                                   name='error_message')
-    summary_message = tf.summary.text('summary_message', error_message)
-
-    sess = tf.Session()
-    writer = tf.summary.FileWriter(os.path.join(self.logdir, run_name))
-    if include_graph:
-      writer.add_graph(sess.graph)
-    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
-    run_metadata = tf.RunMetadata()
-    s = sess.run(summary_message, options=options, run_metadata=run_metadata)
-    writer.add_summary(s)
-    writer.add_run_metadata(run_metadata, self._METADATA_TAG)
-    writer.close()
-
-  def set_up_with_runs(self, with_graph=True, without_graph=True):
-    self.logdir = self.get_temp_dir()
-    if with_graph:
-      self.generate_run(self._RUN_WITH_GRAPH, include_graph=True)
-    if without_graph:
-      self.generate_run(self._RUN_WITHOUT_GRAPH, include_graph=False)
-    multiplexer = event_multiplexer.EventMultiplexer()
-    multiplexer.AddRunsFromDirectory(self.logdir)
-    multiplexer.Reload()
-    self.plugin = graphs_plugin.GraphsPlugin()
-    self.plugin.get_plugin_apps(multiplexer, None)
-
-  def test_index(self):
-    self.set_up_with_runs()
-    self.assertItemsEqual([self._RUN_WITH_GRAPH], self.plugin.index_impl())
-
-  def test_run_metadata_index(self):
-    self.set_up_with_runs()
-    self.assertDictEqual({
-        self._RUN_WITH_GRAPH: [self._METADATA_TAG],
-        self._RUN_WITHOUT_GRAPH: [self._METADATA_TAG],
-    }, self.plugin.run_metadata_index_impl())
-
-  def _get_graph(self, *args, **kwargs):
-    """Set up runs, then fetch and return the graph as a proto."""
-    self.set_up_with_runs()
-    (graph_pbtxt, mime_type) = self.plugin.graph_impl(
-        self._RUN_WITH_GRAPH, *args, **kwargs)
-    self.assertEqual(mime_type, 'text/x-protobuf')
-    return text_format.Parse(graph_pbtxt, tf.GraphDef())
-
-  def test_graph_simple(self):
-    graph = self._get_graph()
-    node_names = set(node.name for node in graph.node)
-    self.assertEqual({'k1', 'k2', 'pow', 'sub', 'expected', 'sub_1', 'error',
-                      'message_prefix', 'error_string', 'error_message',
-                      'summary_message'},
-                     node_names)
-
-  def test_graph_large_attrs(self):
-    key = 'o---;;-;'
-    graph = self._get_graph(
-        limit_attr_size=self._MESSAGE_PREFIX_LENGTH_LOWER_BOUND,
-        large_attrs_key=key)
-    large_attrs = {
-        node.name: list(node.attr[key].list.s)
-        for node in graph.node
-        if key in node.attr
-    }
-    self.assertEqual({'message_prefix': [b'value']},
-                     large_attrs)
-
-  def test_run_metadata(self):
-    self.set_up_with_runs()
-    (metadata_pbtxt, mime_type) = self.plugin.run_metadata_impl(
-        self._RUN_WITH_GRAPH, self._METADATA_TAG)
-    self.assertEqual(mime_type, 'text/x-protobuf')
-    text_format.Parse(metadata_pbtxt, tf.RunMetadata())
-    # If it parses, we're happy.
-
-  def test_is_active_with_graph(self):
-    self.set_up_with_runs(with_graph=True, without_graph=False)
-    self.assertTrue(self.plugin.is_active())
-
-  def test_is_active_without_graph(self):
-    self.set_up_with_runs(with_graph=False, without_graph=True)
-    self.assertFalse(self.plugin.is_active())
-
-  def test_is_active_with_both(self):
-    self.set_up_with_runs(with_graph=True, without_graph=True)
-    self.assertTrue(self.plugin.is_active())
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/tensorboard/plugins/histograms/BUILD b/tensorflow/tensorboard/plugins/histograms/BUILD
deleted file mode 100644
index 110a93d20b94623aa65ca31216a601759970b692..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/histograms/BUILD
+++ /dev/null
@@ -1,48 +0,0 @@
-# Description:
-# TensorBoard plugin for histograms
-
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-## Histograms Plugin ##
-py_library(
-    name = "histograms_plugin",
-    srcs = ["histograms_plugin.py"],
-    srcs_version = "PY2AND3",
-    visibility = [
-        "//tensorflow/tensorboard:internal",
-    ],
-    deps = [
-        "//tensorflow/tensorboard/backend:http_util",
-        "//tensorflow/tensorboard/backend/event_processing:event_accumulator",
-        "//tensorflow/tensorboard/plugins:base_plugin",
-        "@org_pocoo_werkzeug//:werkzeug",
-        "@six_archive//:six",
-    ],
-)
-
-py_test(
-    name = "histograms_plugin_test",
-    size = "small",
-    srcs = ["histograms_plugin_test.py"],
-    main = "histograms_plugin_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
-        ":histograms_plugin",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/tensorboard/backend:application",
-        "//tensorflow/tensorboard/backend/event_processing:event_accumulator",
-        "//tensorflow/tensorboard/backend/event_processing:event_multiplexer",
-        "@org_pocoo_werkzeug//:werkzeug",
-        "@six_archive//:six",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    visibility = ["//tensorflow:__pkg__"],
-)
diff --git a/tensorflow/tensorboard/plugins/histograms/histograms_plugin.py b/tensorflow/tensorboard/plugins/histograms/histograms_plugin.py
deleted file mode 100644
index 7a0e005e8b40333b6dd2c363f5fe129e1a366658..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/histograms/histograms_plugin.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""The TensorBoard Histograms plugin."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from werkzeug import wrappers
-
-from tensorflow.tensorboard.backend import http_util
-from tensorflow.tensorboard.backend.event_processing import event_accumulator
-from tensorflow.tensorboard.plugins import base_plugin
-
-_PLUGIN_PREFIX_ROUTE = event_accumulator.HISTOGRAMS
-
-
-class HistogramsPlugin(base_plugin.TBPlugin):
-  """Histograms Plugin for TensorBoard."""
-
-  plugin_name = _PLUGIN_PREFIX_ROUTE
-
-  def get_plugin_apps(self, multiplexer, unused_logdir):
-    self._multiplexer = multiplexer
-    return {
-        '/histograms': self.histograms_route,
-        '/tags': self.tags_route,
-    }
-
-  def is_active(self):
-    """This plugin is active iff any run has at least one histograms tag."""
-    return any(self.index_impl().values())
-
-  def index_impl(self):
-    return {
-        run_name: run_data[event_accumulator.HISTOGRAMS]
-        for (run_name, run_data) in self._multiplexer.Runs().items()
-        if event_accumulator.HISTOGRAMS in run_data
-    }
-
-  def histograms_impl(self, tag, run):
-    """Result of the form `(body, mime_type)`."""
-    values = self._multiplexer.Histograms(run, tag)
-    return (values, 'application/json')
-
-  @wrappers.Request.application
-  def tags_route(self, request):
-    index = self.index_impl()
-    return http_util.Respond(request, index, 'application/json')
-
-  @wrappers.Request.application
-  def histograms_route(self, request):
-    """Given a tag and single run, return array of histogram values."""
-    tag = request.args.get('tag')
-    run = request.args.get('run')
-    (body, mime_type) = self.histograms_impl(tag, run)
-    return http_util.Respond(request, body, mime_type)
diff --git a/tensorflow/tensorboard/plugins/histograms/histograms_plugin_test.py b/tensorflow/tensorboard/plugins/histograms/histograms_plugin_test.py
deleted file mode 100644
index ee895d9ba7b0cdff5007f3c286e7a3762530e161..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/histograms/histograms_plugin_test.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Integration tests for the Histograms Plugin."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os.path
-
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-
-from tensorflow.tensorboard.backend.event_processing import event_accumulator
-from tensorflow.tensorboard.backend.event_processing import event_multiplexer
-from tensorflow.tensorboard.plugins.histograms import histograms_plugin
-
-
-class HistogramsPluginTest(tf.test.TestCase):
-
-  _STEPS = 99
-
-  _HISTOGRAM_TAG = 'my-favorite-histogram'
-  _SCALAR_TAG = 'my-boring-scalars'
-
-  _RUN_WITH_HISTOGRAM = '_RUN_WITH_HISTOGRAM'
-  _RUN_WITH_SCALARS = '_RUN_WITH_SCALARS'
-
-  def set_up_with_runs(self, run_names):
-    self.logdir = self.get_temp_dir()
-    for run_name in run_names:
-      self.generate_run(run_name)
-    multiplexer = event_multiplexer.EventMultiplexer(size_guidance={
-        # don't truncate my test data, please
-        event_accumulator.HISTOGRAMS:
-            self._STEPS,
-    })
-    multiplexer.AddRunsFromDirectory(self.logdir)
-    multiplexer.Reload()
-    self.plugin = histograms_plugin.HistogramsPlugin()
-    self.apps = self.plugin.get_plugin_apps(multiplexer, None)
-
-  def generate_run(self, run_name):
-    if run_name == self._RUN_WITH_HISTOGRAM:
-      (use_histogram, use_scalars) = (True, False)
-    elif run_name == self._RUN_WITH_SCALARS:
-      (use_histogram, use_scalars) = (False, True)
-    else:
-      assert False, 'Invalid run name: %r' % run_name
-    tf.reset_default_graph()
-    sess = tf.Session()
-    placeholder = tf.placeholder(tf.float32, shape=[3])
-    if use_histogram:
-      tf.summary.histogram(self._HISTOGRAM_TAG, placeholder)
-    if use_scalars:
-      tf.summary.scalar(self._SCALAR_TAG, tf.reduce_mean(placeholder))
-    summ = tf.summary.merge_all()
-
-    subdir = os.path.join(self.logdir, run_name)
-    writer = tf.summary.FileWriter(subdir)
-    writer.add_graph(sess.graph)
-    for step in xrange(self._STEPS):
-      feed_dict = {placeholder: [1 + step, 2 + step, 3 + step]}
-      s = sess.run(summ, feed_dict=feed_dict)
-      writer.add_summary(s, global_step=step)
-    writer.close()
-
-  def test_index(self):
-    self.set_up_with_runs([self._RUN_WITH_HISTOGRAM, self._RUN_WITH_SCALARS])
-    self.assertEqual({
-        self._RUN_WITH_HISTOGRAM: [self._HISTOGRAM_TAG],
-        self._RUN_WITH_SCALARS: [],
-    }, self.plugin.index_impl())
-
-  def _test_histograms(self, run_name, should_have_histogram):
-    self.set_up_with_runs([self._RUN_WITH_HISTOGRAM, self._RUN_WITH_SCALARS])
-    if should_have_histogram:
-      (data, mime_type) = self.plugin.histograms_impl(self._HISTOGRAM_TAG,
-                                                      run_name)
-      self.assertEqual('application/json', mime_type)
-      self.assertEqual(len(data), self._STEPS)
-      for i in xrange(self._STEPS):
-        frame = data[i]
-        self.assertEqual(i, frame.step)
-        self.assertEqual(1 + i, frame.histogram_value.min)
-        self.assertEqual(3 + i, frame.histogram_value.max)
-        self.assertAlmostEqual(
-            3,  # three items across all buckets
-            sum(frame.histogram_value.bucket))
-    else:
-      with self.assertRaises(KeyError):
-        self.plugin.histograms_impl(self._HISTOGRAM_TAG, run_name)
-
-  def test_histograms_with_scalars(self):
-    self._test_histograms(self._RUN_WITH_HISTOGRAM, True)
-
-  def test_histograms_with_histogram(self):
-    self._test_histograms(self._RUN_WITH_SCALARS, False)
-
-  def test_active_with_histogram(self):
-    self.set_up_with_runs([self._RUN_WITH_HISTOGRAM])
-    self.assertTrue(self.plugin.is_active())
-
-  def test_active_with_scalars(self):
-    self.set_up_with_runs([self._RUN_WITH_SCALARS])
-    self.assertFalse(self.plugin.is_active())
-
-  def test_active_with_both(self):
-    self.set_up_with_runs([self._RUN_WITH_HISTOGRAM, self._RUN_WITH_SCALARS])
-    self.assertTrue(self.plugin.is_active())
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/tensorboard/plugins/images/BUILD b/tensorflow/tensorboard/plugins/images/BUILD
deleted file mode 100644
index e9f88c4114dc74443b9a1a1d73a915d60b5da4be..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/images/BUILD
+++ /dev/null
@@ -1,46 +0,0 @@
-# Description:
-# TensorBoard plugin for images
-
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_library(
-    name = "images_plugin",
-    srcs = ["images_plugin.py"],
-    srcs_version = "PY2AND3",
-    visibility = [
-        "//tensorflow/tensorboard:internal",
-    ],
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/tensorboard/backend:http_util",
-        "//tensorflow/tensorboard/backend/event_processing:event_accumulator",
-        "//tensorflow/tensorboard/plugins:base_plugin",
-        "@org_pocoo_werkzeug//:werkzeug",
-        "@six_archive//:six",
-    ],
-)
-
-py_test(
-    name = "images_plugin_test",
-    size = "small",
-    srcs = ["images_plugin_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":images_plugin",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/tensorboard/backend:application",
-        "//tensorflow/tensorboard/backend/event_processing:event_multiplexer",
-        "@org_pocoo_werkzeug//:werkzeug",
-        "@six_archive//:six",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    visibility = ["//tensorflow:__pkg__"],
-)
diff --git a/tensorflow/tensorboard/plugins/images/images_plugin.py b/tensorflow/tensorboard/plugins/images/images_plugin.py
deleted file mode 100644
index 99704c36afe8552dfe1e5cb589b37babb8b2d627..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/images/images_plugin.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""The TensorBoard Images plugin."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import imghdr
-
-from six.moves import urllib
-from werkzeug import wrappers
-
-from tensorflow.tensorboard.backend import http_util
-from tensorflow.tensorboard.backend.event_processing import event_accumulator
-from tensorflow.tensorboard.plugins import base_plugin
-
-_PLUGIN_PREFIX_ROUTE = event_accumulator.IMAGES
-
-_IMGHDR_TO_MIMETYPE = {
-    'bmp': 'image/bmp',
-    'gif': 'image/gif',
-    'jpeg': 'image/jpeg',
-    'png': 'image/png'
-}
-
-_DEFAULT_IMAGE_MIMETYPE = 'application/octet-stream'
-
-
-class ImagesPlugin(base_plugin.TBPlugin):
-  """Images Plugin for TensorBoard."""
-
-  plugin_name = _PLUGIN_PREFIX_ROUTE
-
-  def get_plugin_apps(self, multiplexer, unused_logdir):
-    self._multiplexer = multiplexer
-    return {
-        '/images': self._serve_image_metadata,
-        '/individualImage': self._serve_individual_image,
-        '/tags': self._serve_tags,
-    }
-
-  def is_active(self):
-    """The images plugin is active iff any run has at least one relevant tag."""
-    return any(self.index_impl().values())
-
-  def _index_impl(self):
-    return {
-        run_name: run_data[event_accumulator.IMAGES]
-        for (run_name, run_data) in self._multiplexer.Runs().items()
-        if event_accumulator.IMAGES in run_data
-    }
-
-  @wrappers.Request.application
-  def _serve_image_metadata(self, request):
-    """Given a tag and list of runs, serve a list of metadata for images.
-
-    Note that the images themselves are not sent; instead, we respond with URLs
-    to the images. The frontend should treat these URLs as opaque and should not
-    try to parse information about them or generate them itself, as the format
-    may change.
-
-    Args:
-      request: A werkzeug.wrappers.Request object.
-
-    Returns:
-      A werkzeug.Response application.
-    """
-    tag = request.args.get('tag')
-    run = request.args.get('run')
-
-    images = self._multiplexer.Images(run, tag)
-    response = self._image_response_for_run(images, run, tag)
-    return http_util.Respond(request, response, 'application/json')
-
-  def _image_response_for_run(self, run_images, run, tag):
-    """Builds a JSON-serializable object with information about run_images.
-
-    Args:
-      run_images: A list of event_accumulator.ImageValueEvent objects.
-      run: The name of the run.
-      tag: The name of the tag the images all belong to.
-
-    Returns:
-      A list of dictionaries containing the wall time, step, URL, width, and
-      height for each image.
-    """
-    response = []
-    for index, run_image in enumerate(run_images):
-      response.append({
-          'wall_time': run_image.wall_time,
-          'step': run_image.step,
-          # We include the size so that the frontend can add that to the <img>
-          # tag so that the page layout doesn't change when the image loads.
-          'width': run_image.width,
-          'height': run_image.height,
-          'query': self._query_for_individual_image(run, tag, index)
-      })
-    return response
-
-  def _query_for_individual_image(self, run, tag, index):
-    """Builds a URL for accessing the specified image.
-
-    This should be kept in sync with _serve_image_metadata. Note that the URL is
-    *not* guaranteed to always return the same image, since images may be
-    unloaded from the reservoir as new images come in.
-
-    Args:
-      run: The name of the run.
-      tag: The tag.
-      index: The index of the image. Negative values are OK.
-
-    Returns:
-      A string representation of a URL that will load the index-th sampled image
-      in the given run with the given tag.
-    """
-    query_string = urllib.parse.urlencode({
-        'run': run,
-        'tag': tag,
-        'index': index
-    })
-    return query_string
-
-  @wrappers.Request.application
-  def _serve_individual_image(self, request):
-    """Serves an individual image."""
-    tag = request.args.get('tag')
-    run = request.args.get('run')
-    index = int(request.args.get('index'))
-    image = self._multiplexer.Images(run, tag)[index]
-    image_type = imghdr.what(None, image.encoded_image_string)
-    content_type = _IMGHDR_TO_MIMETYPE.get(image_type, _DEFAULT_IMAGE_MIMETYPE)
-    return http_util.Respond(request, image.encoded_image_string, content_type)
-
-  @wrappers.Request.application
-  def _serve_tags(self, request):
-    index = self._index_impl()
-    return http_util.Respond(request, index, 'application/json')
diff --git a/tensorflow/tensorboard/plugins/images/images_plugin_test.py b/tensorflow/tensorboard/plugins/images/images_plugin_test.py
deleted file mode 100644
index cbace2b5f078eff6f3dbaaaad9f78ec2ee357ab6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/images/images_plugin_test.py
+++ /dev/null
@@ -1,159 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests the Tensorboard images plugin."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import json
-import os
-import shutil
-import tempfile
-
-import numpy
-from six.moves import urllib
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-from werkzeug import test as werkzeug_test
-from werkzeug import wrappers
-
-from tensorflow.tensorboard.backend import application
-from tensorflow.tensorboard.backend.event_processing import event_multiplexer
-from tensorflow.tensorboard.plugins.images import images_plugin
-
-
-class ImagesPluginTest(tf.test.TestCase):
-
-  def setUp(self):
-    self.log_dir = tempfile.mkdtemp()
-
-    # We use numpy.random to generate images. We seed to avoid non-determinism
-    # in this test.
-    numpy.random.seed(42)
-
-    # Create image summaries for run foo.
-    tf.reset_default_graph()
-    sess = tf.Session()
-    placeholder = tf.placeholder(tf.uint8)
-    tf.summary.image(name="baz", tensor=placeholder)
-    merged_summary_op = tf.summary.merge_all()
-    foo_directory = os.path.join(self.log_dir, "foo")
-    writer = tf.summary.FileWriter(foo_directory)
-    writer.add_graph(sess.graph)
-    for step in xrange(2):
-      writer.add_summary(sess.run(merged_summary_op, feed_dict={
-          placeholder: (numpy.random.rand(1, 16, 42, 3) * 255).astype(
-              numpy.uint8)
-      }), global_step=step)
-    writer.close()
-
-    # Create image summaries for run bar.
-    tf.reset_default_graph()
-    sess = tf.Session()
-    placeholder = tf.placeholder(tf.uint8)
-    tf.summary.image(name="quux", tensor=placeholder)
-    merged_summary_op = tf.summary.merge_all()
-    bar_directory = os.path.join(self.log_dir, "bar")
-    writer = tf.summary.FileWriter(bar_directory)
-    writer.add_graph(sess.graph)
-    for step in xrange(2):
-      writer.add_summary(sess.run(merged_summary_op, feed_dict={
-          placeholder: (numpy.random.rand(1, 6, 8, 3) * 255).astype(
-              numpy.uint8)
-      }), global_step=step)
-    writer.close()
-
-    # Start a server with the plugin.
-    multiplexer = event_multiplexer.EventMultiplexer({
-        "foo": foo_directory,
-        "bar": bar_directory,
-    })
-    plugin = images_plugin.ImagesPlugin()
-    wsgi_app = application.TensorBoardWSGIApp(
-        self.log_dir, [plugin], multiplexer, reload_interval=0)
-    self.server = werkzeug_test.Client(wsgi_app, wrappers.BaseResponse)
-    self.routes = plugin.get_plugin_apps(multiplexer, self.log_dir)
-
-  def tearDown(self):
-    shutil.rmtree(self.log_dir, ignore_errors=True)
-
-  def _DeserializeResponse(self, byte_content):
-    """Deserializes byte content that is a JSON encoding.
-
-    Args:
-      byte_content: The byte content of a response.
-
-    Returns:
-      The deserialized python object decoded from JSON.
-    """
-    return json.loads(byte_content.decode("utf-8"))
-
-  def testRoutesProvided(self):
-    """Tests that the plugin offers the correct routes."""
-    self.assertIsInstance(self.routes["/images"], collections.Callable)
-    self.assertIsInstance(self.routes["/individualImage"], collections.Callable)
-    self.assertIsInstance(self.routes["/tags"], collections.Callable)
-
-  def testImagesRoute(self):
-    """Tests that the /images routes returns with the correct data."""
-    response = self.server.get(
-        "/data/plugin/images/images?run=foo&tag=baz/image/0")
-    self.assertEqual(200, response.status_code)
-
-    # Verify that the correct entries are returned.
-    entries = self._DeserializeResponse(response.get_data())
-    self.assertEqual(2, len(entries))
-
-    # Verify that the 1st entry is correct.
-    entry = entries[0]
-    self.assertEqual(42, entry["width"])
-    self.assertEqual(16, entry["height"])
-    self.assertEqual(0, entry["step"])
-    parsed_query = urllib.parse.parse_qs(entry["query"])
-    self.assertListEqual(["0"], parsed_query["index"])
-    self.assertListEqual(["foo"], parsed_query["run"])
-    self.assertListEqual(["baz/image/0"], parsed_query["tag"])
-
-    # Verify that the 2nd entry is correct.
-    entry = entries[1]
-    self.assertEqual(42, entry["width"])
-    self.assertEqual(16, entry["height"])
-    self.assertEqual(1, entry["step"])
-    parsed_query = urllib.parse.parse_qs(entry["query"])
-    self.assertListEqual(["1"], parsed_query["index"])
-    self.assertListEqual(["foo"], parsed_query["run"])
-    self.assertListEqual(["baz/image/0"], parsed_query["tag"])
-
-  def testIndividualImageRoute(self):
-    """Tests fetching an individual image."""
-    response = self.server.get(
-        "/data/plugin/images/individualImage?run=bar&tag=quux/image/0&index=0")
-    self.assertEqual(200, response.status_code)
-    self.assertEqual("image/png", response.headers.get("content-type"))
-
-  def testRunsRoute(self):
-    """Tests that the /runs route offers the correct run to tag mapping."""
-    response = self.server.get("/data/plugin/images/tags")
-    self.assertEqual(200, response.status_code)
-    self.assertDictEqual({
-        "foo": ["baz/image/0"],
-        "bar": ["quux/image/0"]
-    }, self._DeserializeResponse(response.get_data()))
-
-
-if __name__ == "__main__":
-  tf.test.main()
diff --git a/tensorflow/tensorboard/plugins/projector/BUILD b/tensorflow/tensorboard/plugins/projector/BUILD
deleted file mode 100644
index 9f9a18b2f26103e96f68fc3470a09600f5c05109..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/projector/BUILD
+++ /dev/null
@@ -1,58 +0,0 @@
-# Embedding Projector plugin.
-
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load("//tensorflow/tensorboard/defs:protos.bzl", "tb_proto_library")
-
-py_library(
-    name = "projector_plugin",
-    srcs = ["projector_plugin.py"],
-    srcs_version = "PY2AND3",
-    visibility = [
-        "//learning/vis/projector:__subpackages__",
-        "//tensorflow/tensorboard:internal",
-    ],
-    deps = [
-        ":protos_all_py",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/tensorboard/backend:http_util",
-        "//tensorflow/tensorboard/plugins:base_plugin",
-        "//third_party/py/numpy",
-        "@org_pocoo_werkzeug//:werkzeug",
-    ],
-)
-
-py_test(
-    name = "projector_plugin_test",
-    size = "small",
-    srcs = ["projector_plugin_test.py"],
-    main = "projector_plugin_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
-        ":projector_plugin",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/tensorboard/backend:application",
-        "//tensorflow/tensorboard/backend/event_processing:event_multiplexer",
-        "//third_party/py/numpy",
-        "@org_pocoo_werkzeug//:werkzeug",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    visibility = ["//tensorflow:__pkg__"],
-)
-
-tb_proto_library(
-    name = "protos_all",
-    srcs = glob(["*.proto"]),
-    visibility = [
-        "//tensorflow:internal",
-        "//tensorflow/tensorboard:internal",
-    ],
-)
diff --git a/tensorflow/tensorboard/plugins/projector/projector_plugin.py b/tensorflow/tensorboard/plugins/projector/projector_plugin.py
deleted file mode 100644
index 9a3a305d53a012c73b008f4f2238290bd2e78ab8..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/projector/projector_plugin.py
+++ /dev/null
@@ -1,640 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""The Embedding Projector plugin."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import imghdr
-import math
-import os
-import numpy as np
-import tensorflow as tf
-from werkzeug import wrappers
-
-from google.protobuf import json_format
-from google.protobuf import text_format
-from tensorflow.tensorboard.backend.http_util import Respond
-from tensorflow.tensorboard.plugins.base_plugin import TBPlugin
-from tensorflow.tensorboard.plugins.projector import projector_config_pb2
-
-# The prefix of routes provided by this plugin.
-_PLUGIN_PREFIX_ROUTE = 'projector'
-
-# FYI - the PROJECTOR_FILENAME is hardcoded in the visualize_embeddings
-# method in tf.contrib.tensorboard.plugins.projector module.
-# TODO(dandelion): Fix duplication when we find a permanent home for the
-# projector module.
-PROJECTOR_FILENAME = 'projector_config.pbtxt'
-_PLUGIN_NAME = 'org_tensorflow_tensorboard_projector'
-_PLUGINS_DIR = 'plugins'
-
-# Number of tensors in the LRU cache.
-_TENSOR_CACHE_CAPACITY = 1
-
-# HTTP routes.
-CONFIG_ROUTE = '/info'
-TENSOR_ROUTE = '/tensor'
-METADATA_ROUTE = '/metadata'
-RUNS_ROUTE = '/runs'
-BOOKMARKS_ROUTE = '/bookmarks'
-SPRITE_IMAGE_ROUTE = '/sprite_image'
-
-_IMGHDR_TO_MIMETYPE = {
-    'bmp': 'image/bmp',
-    'gif': 'image/gif',
-    'jpeg': 'image/jpeg',
-    'png': 'image/png'
-}
-_DEFAULT_IMAGE_MIMETYPE = 'application/octet-stream'
-
-
-class LRUCache(object):
-  """LRU cache. Used for storing the last used tensor."""
-
-  def __init__(self, size):
-    if size < 1:
-      raise ValueError('The cache size must be >=1')
-    self._size = size
-    self._dict = collections.OrderedDict()
-
-  def get(self, key):
-    try:
-      value = self._dict.pop(key)
-      self._dict[key] = value
-      return value
-    except KeyError:
-      return None
-
-  def set(self, key, value):
-    if value is None:
-      raise ValueError('value must be != None')
-    try:
-      self._dict.pop(key)
-    except KeyError:
-      if len(self._dict) >= self._size:
-        self._dict.popitem(last=False)
-    self._dict[key] = value
-
-
-class EmbeddingMetadata(object):
-  """Metadata container for an embedding.
-
-  The metadata holds different columns with values used for visualization
-  (color by, label by) in the "Embeddings" tab in TensorBoard.
-  """
-
-  def __init__(self, num_points):
-    """Constructs a metadata for an embedding of the specified size.
-
-    Args:
-      num_points: Number of points in the embedding.
-    """
-    self.num_points = num_points
-    self.column_names = []
-    self.name_to_values = {}
-
-  def add_column(self, column_name, column_values):
-    """Adds a named column of metadata values.
-
-    Args:
-      column_name: Name of the column.
-      column_values: 1D array/list/iterable holding the column values. Must be
-          of length `num_points`. The i-th value corresponds to the i-th point.
-
-    Raises:
-      ValueError: If `column_values` is not 1D array, or of length `num_points`,
-          or the `name` is already used.
-    """
-    # Sanity checks.
-    if isinstance(column_values, list) and isinstance(column_values[0], list):
-      raise ValueError('"column_values" must be a flat list, but we detected '
-                       'that its first entry is a list')
-
-    if isinstance(column_values, np.ndarray) and column_values.ndim != 1:
-      raise ValueError('"column_values" should be of rank 1, '
-                       'but is of rank %d' % column_values.ndim)
-    if len(column_values) != self.num_points:
-      raise ValueError('"column_values" should be of length %d, but is of '
-                       'length %d' % (self.num_points, len(column_values)))
-    if column_name in self.name_to_values:
-      raise ValueError('The column name "%s" is already used' % column_name)
-
-    self.column_names.append(column_name)
-    self.name_to_values[column_name] = column_values
-
-
-def _read_tensor_tsv_file(fpath):
-  with tf.gfile.GFile(fpath, 'r') as f:
-    tensor = []
-    for line in f:
-      if line:
-        tensor.append(list(map(float, line.rstrip('\n').split('\t'))))
-  return np.array(tensor, dtype='float32')
-
-
-def _assets_dir_to_logdir(assets_dir):
-  sub_path = os.path.sep + _PLUGINS_DIR + os.path.sep
-  if sub_path in assets_dir:
-    two_parents_up = os.pardir + os.path.sep + os.pardir
-    return os.path.abspath(os.path.join(assets_dir, two_parents_up))
-  return assets_dir
-
-
-def _latest_checkpoints_changed(configs, run_path_pairs):
-  """Returns true if the latest checkpoint has changed in any of the runs."""
-  for run_name, assets_dir in run_path_pairs:
-    if run_name not in configs:
-      config = projector_config_pb2.ProjectorConfig()
-      config_fpath = os.path.join(assets_dir, PROJECTOR_FILENAME)
-      if tf.gfile.Exists(config_fpath):
-        with tf.gfile.GFile(config_fpath, 'r') as f:
-          file_content = f.read()
-        text_format.Merge(file_content, config)
-    else:
-      config = configs[run_name]
-
-    # See if you can find a checkpoint file in the logdir.
-    logdir = _assets_dir_to_logdir(assets_dir)
-    ckpt_path = _find_latest_checkpoint(logdir)
-    if not ckpt_path:
-      continue
-    if config.model_checkpoint_path != ckpt_path:
-      return True
-  return False
-
-
-def _parse_positive_int_param(request, param_name):
-  """Parses and asserts a positive (>0) integer query parameter.
-
-  Args:
-    request: The Werkzeug Request object
-    param_name: Name of the parameter.
-
-  Returns:
-    Param, or None, or -1 if parameter is not a positive integer.
-  """
-  param = request.args.get(param_name)
-  if not param:
-    return None
-  try:
-    param = int(param)
-    if param <= 0:
-      raise ValueError()
-    return param
-  except ValueError:
-    return -1
-
-
-def _rel_to_abs_asset_path(fpath, config_fpath):
-  fpath = os.path.expanduser(fpath)
-  if not os.path.isabs(fpath):
-    return os.path.join(os.path.dirname(config_fpath), fpath)
-  return fpath
-
-
-class ProjectorPlugin(TBPlugin):
-  """Embedding projector."""
-
-  plugin_name = _PLUGIN_PREFIX_ROUTE
-
-  def __init__(self):
-    self._handlers = None
-    self.readers = {}
-    self.run_paths = None
-    self.logdir = None
-    self._configs = None
-    self.old_num_run_paths = None
-    self.multiplexer = None
-    self.tensor_cache = LRUCache(_TENSOR_CACHE_CAPACITY)
-
-  def get_plugin_apps(self, multiplexer, logdir):
-    self.multiplexer = multiplexer
-    self.run_paths = multiplexer.RunPaths()
-    self.logdir = logdir
-    self._handlers = {
-        RUNS_ROUTE: self._serve_runs,
-        CONFIG_ROUTE: self._serve_config,
-        TENSOR_ROUTE: self._serve_tensor,
-        METADATA_ROUTE: self._serve_metadata,
-        BOOKMARKS_ROUTE: self._serve_bookmarks,
-        SPRITE_IMAGE_ROUTE: self._serve_sprite_image
-    }
-    return self._handlers
-
-  def is_active(self):
-    """Determines whether this plugin is active.
-
-    This plugin is only active if any run has an embedding.
-
-    Returns:
-      A boolean. Whether this plugin is active.
-    """
-    return bool(self.configs)
-
-  @property
-  def configs(self):
-    """Returns a map of run paths to `ProjectorConfig` protos."""
-    run_path_pairs = list(self.run_paths.items())
-    self._append_plugin_asset_directories(run_path_pairs)
-    # If there are no summary event files, the projector should still work,
-    # treating the `logdir` as the model checkpoint directory.
-    if not run_path_pairs:
-      run_path_pairs.append(('.', self.logdir))
-    if (self._run_paths_changed() or
-        _latest_checkpoints_changed(self._configs, run_path_pairs)):
-      self.readers = {}
-      self._configs, self.config_fpaths = self._read_latest_config_files(
-          run_path_pairs)
-      self._augment_configs_with_checkpoint_info()
-    return self._configs
-
-  def _run_paths_changed(self):
-    num_run_paths = len(list(self.run_paths.keys()))
-    if num_run_paths != self.old_num_run_paths:
-      self.old_num_run_paths = num_run_paths
-      return True
-    return False
-
-  def _augment_configs_with_checkpoint_info(self):
-    for run, config in self._configs.items():
-      for embedding in config.embeddings:
-        # Normalize the name of the embeddings.
-        if embedding.tensor_name.endswith(':0'):
-          embedding.tensor_name = embedding.tensor_name[:-2]
-        # Find the size of embeddings associated with a tensors file.
-        if embedding.tensor_path and not embedding.tensor_shape:
-          fpath = _rel_to_abs_asset_path(embedding.tensor_path,
-                                         self.config_fpaths[run])
-          tensor = self.tensor_cache.get(embedding.tensor_name)
-          if tensor is None:
-            tensor = _read_tensor_tsv_file(fpath)
-            self.tensor_cache.set(embedding.tensor_name, tensor)
-          embedding.tensor_shape.extend([len(tensor), len(tensor[0])])
-
-      reader = self._get_reader_for_run(run)
-      if not reader:
-        continue
-      # Augment the configuration with the tensors in the checkpoint file.
-      special_embedding = None
-      if config.embeddings and not config.embeddings[0].tensor_name:
-        special_embedding = config.embeddings[0]
-        config.embeddings.remove(special_embedding)
-      var_map = reader.get_variable_to_shape_map()
-      for tensor_name, tensor_shape in var_map.items():
-        if len(tensor_shape) != 2:
-          continue
-        embedding = self._get_embedding(tensor_name, config)
-        if not embedding:
-          embedding = config.embeddings.add()
-          embedding.tensor_name = tensor_name
-          if special_embedding:
-            embedding.metadata_path = special_embedding.metadata_path
-            embedding.bookmarks_path = special_embedding.bookmarks_path
-        if not embedding.tensor_shape:
-          embedding.tensor_shape.extend(tensor_shape)
-
-    # Remove configs that do not have any valid (2D) tensors.
-    runs_to_remove = []
-    for run, config in self._configs.items():
-      if not config.embeddings:
-        runs_to_remove.append(run)
-    for run in runs_to_remove:
-      del self._configs[run]
-      del self.config_fpaths[run]
-
-  def _read_latest_config_files(self, run_path_pairs):
-    """Reads and returns the projector config files in every run directory."""
-    configs = {}
-    config_fpaths = {}
-    for run_name, assets_dir in run_path_pairs:
-      config = projector_config_pb2.ProjectorConfig()
-      config_fpath = os.path.join(assets_dir, PROJECTOR_FILENAME)
-      if tf.gfile.Exists(config_fpath):
-        with tf.gfile.GFile(config_fpath, 'r') as f:
-          file_content = f.read()
-        text_format.Merge(file_content, config)
-      has_tensor_files = False
-      for embedding in config.embeddings:
-        if embedding.tensor_path:
-          if not embedding.tensor_name:
-            embedding.tensor_name = os.path.basename(embedding.tensor_path)
-          has_tensor_files = True
-          break
-
-      if not config.model_checkpoint_path:
-        # See if you can find a checkpoint file in the logdir.
-        logdir = _assets_dir_to_logdir(assets_dir)
-        ckpt_path = _find_latest_checkpoint(logdir)
-        if not ckpt_path and not has_tensor_files:
-          continue
-        if ckpt_path:
-          config.model_checkpoint_path = ckpt_path
-
-      # Sanity check for the checkpoint file.
-      if (config.model_checkpoint_path and
-          not tf.train.checkpoint_exists(config.model_checkpoint_path)):
-        tf.logging.warning('Checkpoint file "%s" not found',
-                           config.model_checkpoint_path)
-        continue
-      configs[run_name] = config
-      config_fpaths[run_name] = config_fpath
-    return configs, config_fpaths
-
-  def _get_reader_for_run(self, run):
-    if run in self.readers:
-      return self.readers[run]
-
-    config = self._configs[run]
-    reader = None
-    if config.model_checkpoint_path:
-      try:
-        reader = tf.pywrap_tensorflow.NewCheckpointReader(
-            config.model_checkpoint_path)
-      except Exception:  # pylint: disable=broad-except
-        tf.logging.warning('Failed reading "%s"', config.model_checkpoint_path)
-    self.readers[run] = reader
-    return reader
-
-  def _get_metadata_file_for_tensor(self, tensor_name, config):
-    embedding_info = self._get_embedding(tensor_name, config)
-    if embedding_info:
-      return embedding_info.metadata_path
-    return None
-
-  def _get_bookmarks_file_for_tensor(self, tensor_name, config):
-    embedding_info = self._get_embedding(tensor_name, config)
-    if embedding_info:
-      return embedding_info.bookmarks_path
-    return None
-
-  def _canonical_tensor_name(self, tensor_name):
-    if ':' not in tensor_name:
-      return tensor_name + ':0'
-    else:
-      return tensor_name
-
-  def _get_embedding(self, tensor_name, config):
-    if not config.embeddings:
-      return None
-    for info in config.embeddings:
-      if (self._canonical_tensor_name(info.tensor_name) ==
-          self._canonical_tensor_name(tensor_name)):
-        return info
-    return None
-
-  def _append_plugin_asset_directories(self, run_path_pairs):
-    for run, assets in self.multiplexer.PluginAssets(_PLUGIN_NAME).items():
-      if PROJECTOR_FILENAME not in assets:
-        continue
-      assets_dir = os.path.join(self.run_paths[run], _PLUGINS_DIR, _PLUGIN_NAME)
-      assets_path_pair = (run, os.path.abspath(assets_dir))
-      run_path_pairs.append(assets_path_pair)
-
-  @wrappers.Request.application
-  def _serve_runs(self, request):
-    """Returns a list of runs that have embeddings."""
-    return Respond(request, list(self.configs.keys()), 'application/json')
-
-  @wrappers.Request.application
-  def _serve_config(self, request):
-    run = request.args.get('run')
-    if run is None:
-      return Respond(request, 'query parameter "run" is required', 'text/plain',
-                     400)
-    if run not in self.configs:
-      return Respond(request, 'Unknown run: "%s"' % run, 'text/plain', 400)
-
-    config = self.configs[run]
-    return Respond(request,
-                   json_format.MessageToJson(config), 'application/json')
-
-  @wrappers.Request.application
-  def _serve_metadata(self, request):
-    run = request.args.get('run')
-    if run is None:
-      return Respond(request, 'query parameter "run" is required', 'text/plain',
-                     400)
-
-    name = request.args.get('name')
-    if name is None:
-      return Respond(request, 'query parameter "name" is required',
-                     'text/plain', 400)
-
-    num_rows = _parse_positive_int_param(request, 'num_rows')
-    if num_rows == -1:
-      return Respond(request, 'query parameter num_rows must be integer > 0',
-                     'text/plain', 400)
-
-    if run not in self.configs:
-      return Respond(request, 'Unknown run: "%s"' % run, 'text/plain', 400)
-
-    config = self.configs[run]
-    fpath = self._get_metadata_file_for_tensor(name, config)
-    if not fpath:
-      return Respond(
-          request,
-          'No metadata file found for tensor "%s" in the config file "%s"' %
-          (name, self.config_fpaths[run]), 'text/plain', 400)
-    fpath = _rel_to_abs_asset_path(fpath, self.config_fpaths[run])
-    if not tf.gfile.Exists(fpath) or tf.gfile.IsDirectory(fpath):
-      return Respond(request, '"%s" not found, or is not a file' % fpath,
-                     'text/plain', 400)
-
-    num_header_rows = 0
-    with tf.gfile.GFile(fpath, 'r') as f:
-      lines = []
-      # Stream reading the file with early break in case the file doesn't fit in
-      # memory.
-      for line in f:
-        lines.append(line)
-        if len(lines) == 1 and '\t' in lines[0]:
-          num_header_rows = 1
-        if num_rows and len(lines) >= num_rows + num_header_rows:
-          break
-    return Respond(request, ''.join(lines), 'text/plain')
-
-  @wrappers.Request.application
-  def _serve_tensor(self, request):
-    run = request.args.get('run')
-    if run is None:
-      return Respond(request, 'query parameter "run" is required', 'text/plain',
-                     400)
-
-    name = request.args.get('name')
-    if name is None:
-      return Respond(request, 'query parameter "name" is required',
-                     'text/plain', 400)
-
-    num_rows = _parse_positive_int_param(request, 'num_rows')
-    if num_rows == -1:
-      return Respond(request, 'query parameter num_rows must be integer > 0',
-                     'text/plain', 400)
-
-    if run not in self.configs:
-      return Respond(request, 'Unknown run: "%s"' % run, 'text/plain', 400)
-
-    config = self.configs[run]
-
-    tensor = self.tensor_cache.get(name)
-    if tensor is None:
-      # See if there is a tensor file in the config.
-      embedding = self._get_embedding(name, config)
-
-      if embedding and embedding.tensor_path:
-        fpath = _rel_to_abs_asset_path(embedding.tensor_path,
-                                       self.config_fpaths[run])
-        if not tf.gfile.Exists(fpath):
-          return Respond(request,
-                         'Tensor file "%s" does not exist' % fpath,
-                         'text/plain', 400)
-        tensor = _read_tensor_tsv_file(fpath)
-      else:
-        reader = self._get_reader_for_run(run)
-        if not reader or not reader.has_tensor(name):
-          return Respond(request,
-                         'Tensor "%s" not found in checkpoint dir "%s"' %
-                         (name, config.model_checkpoint_path), 'text/plain',
-                         400)
-        try:
-          tensor = reader.get_tensor(name)
-        except tf.errors.InvalidArgumentError as e:
-          return Respond(request, str(e), 'text/plain', 400)
-
-      self.tensor_cache.set(name, tensor)
-
-    if num_rows:
-      tensor = tensor[:num_rows]
-    if tensor.dtype != 'float32':
-      tensor = tensor.astype(dtype='float32', copy=False)
-    data_bytes = tensor.tobytes()
-    return Respond(request, data_bytes, 'application/octet-stream')
-
-  @wrappers.Request.application
-  def _serve_bookmarks(self, request):
-    run = request.args.get('run')
-    if not run:
-      return Respond(request, 'query parameter "run" is required', 'text/plain',
-                     400)
-
-    name = request.args.get('name')
-    if name is None:
-      return Respond(request, 'query parameter "name" is required',
-                     'text/plain', 400)
-
-    if run not in self.configs:
-      return Respond(request, 'Unknown run: "%s"' % run, 'text/plain', 400)
-
-    config = self.configs[run]
-    fpath = self._get_bookmarks_file_for_tensor(name, config)
-    if not fpath:
-      return Respond(
-          request,
-          'No bookmarks file found for tensor "%s" in the config file "%s"' %
-          (name, self.config_fpaths[run]), 'text/plain', 400)
-    fpath = _rel_to_abs_asset_path(fpath, self.config_fpaths[run])
-    if not tf.gfile.Exists(fpath) or tf.gfile.IsDirectory(fpath):
-      return Respond(request, '"%s" not found, or is not a file' % fpath,
-                     'text/plain', 400)
-
-    bookmarks_json = None
-    with tf.gfile.GFile(fpath, 'rb') as f:
-      bookmarks_json = f.read()
-    return Respond(request, bookmarks_json, 'application/json')
-
-  @wrappers.Request.application
-  def _serve_sprite_image(self, request):
-    run = request.args.get('run')
-    if not run:
-      return Respond(request, 'query parameter "run" is required', 'text/plain',
-                     400)
-
-    name = request.args.get('name')
-    if name is None:
-      return Respond(request, 'query parameter "name" is required',
-                     'text/plain', 400)
-
-    if run not in self.configs:
-      return Respond(request, 'Unknown run: "%s"' % run, 'text/plain', 400)
-
-    config = self.configs[run]
-    embedding_info = self._get_embedding(name, config)
-
-    if not embedding_info or not embedding_info.sprite.image_path:
-      return Respond(
-          request,
-          'No sprite image file found for tensor "%s" in the config file "%s"' %
-          (name, self.config_fpaths[run]), 'text/plain', 400)
-
-    fpath = os.path.expanduser(embedding_info.sprite.image_path)
-    fpath = _rel_to_abs_asset_path(fpath, self.config_fpaths[run])
-    if not tf.gfile.Exists(fpath) or tf.gfile.IsDirectory(fpath):
-      return Respond(request, '"%s" does not exist or is directory' % fpath,
-                     'text/plain', 400)
-    f = tf.gfile.GFile(fpath, 'rb')
-    encoded_image_string = f.read()
-    f.close()
-    image_type = imghdr.what(None, encoded_image_string)
-    mime_type = _IMGHDR_TO_MIMETYPE.get(image_type, _DEFAULT_IMAGE_MIMETYPE)
-    return Respond(request, encoded_image_string, mime_type)
-
-
-def _find_latest_checkpoint(dir_path):
-  try:
-    ckpt_path = tf.train.latest_checkpoint(dir_path)
-    if not ckpt_path:
-      # Check the parent directory.
-      ckpt_path = tf.train.latest_checkpoint(os.path.join(dir_path, os.pardir))
-    return ckpt_path
-  except tf.errors.NotFoundError:
-    return None
-
-
-def _make_sprite_image(thumbnails, thumbnail_dim):
-  """Constructs a sprite image from thumbnails and returns the png bytes."""
-  if len(thumbnails) < 1:
-    raise ValueError('The length of "thumbnails" must be >= 1')
-
-  if isinstance(thumbnails, np.ndarray) and thumbnails.ndim != 4:
-    raise ValueError('"thumbnails" should be of rank 4, '
-                     'but is of rank %d' % thumbnails.ndim)
-  if isinstance(thumbnails, list):
-    if not isinstance(thumbnails[0], np.ndarray) or thumbnails[0].ndim != 3:
-      raise ValueError('Each element of "thumbnails" must be a 3D `ndarray`')
-    thumbnails = np.array(thumbnails)
-
-  with tf.Graph().as_default():
-    s = tf.Session()
-    resized_images = tf.image.resize_images(thumbnails, thumbnail_dim).eval(
-        session=s)
-    images_per_row = int(math.ceil(math.sqrt(len(thumbnails))))
-    thumb_height = thumbnail_dim[0]
-    thumb_width = thumbnail_dim[1]
-    master_height = images_per_row * thumb_height
-    master_width = images_per_row * thumb_width
-    num_channels = thumbnails.shape[3]
-    master = np.zeros([master_height, master_width, num_channels])
-    for idx, image in enumerate(resized_images):
-      left_idx = idx % images_per_row
-      top_idx = int(math.floor(idx / images_per_row))
-      left_start = left_idx * thumb_width
-      left_end = left_start + thumb_width
-      top_start = top_idx * thumb_height
-      top_end = top_start + thumb_height
-      master[top_start:top_end, left_start:left_end, :] = image
-
-    return tf.image.encode_png(master).eval(session=s)
diff --git a/tensorflow/tensorboard/plugins/projector/projector_plugin_test.py b/tensorflow/tensorboard/plugins/projector/projector_plugin_test.py
deleted file mode 100644
index 06cf2c3d0d4af6776880399f51c1811ae312a1d1..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/projector/projector_plugin_test.py
+++ /dev/null
@@ -1,348 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Integration tests for the Embedding Projector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gzip
-import io
-import json
-import os
-import numpy as np
-import tensorflow as tf
-
-from werkzeug import test as werkzeug_test
-from werkzeug import wrappers
-
-from google.protobuf import text_format
-
-from tensorflow.tensorboard.backend import application
-from tensorflow.tensorboard.backend.event_processing import event_multiplexer
-from tensorflow.tensorboard.plugins.projector import projector_config_pb2
-from tensorflow.tensorboard.plugins.projector import projector_plugin
-
-
-class ProjectorAppTest(tf.test.TestCase):
-
-  def setUp(self):
-    self.log_dir = self.get_temp_dir()
-
-  def testRunsWithValidCheckpoint(self):
-    self._GenerateProjectorTestData()
-    self._SetupWSGIApp()
-    run_json = self._GetJson('/data/plugin/projector/runs')
-    self.assertTrue(run_json)
-
-  def testRunsWithNoCheckpoint(self):
-    self._SetupWSGIApp()
-    run_json = self._GetJson('/data/plugin/projector/runs')
-    self.assertEqual(run_json, [])
-
-  def testRunsWithInvalidModelCheckpointPath(self):
-    checkpoint_file = os.path.join(self.log_dir, 'checkpoint')
-    f = open(checkpoint_file, 'w')
-    f.write('model_checkpoint_path: "does_not_exist"\n')
-    f.write('all_model_checkpoint_paths: "does_not_exist"\n')
-    f.close()
-    self._SetupWSGIApp()
-
-    run_json = self._GetJson('/data/plugin/projector/runs')
-    self.assertEqual(run_json, [])
-
-  def testRunsWithInvalidModelCheckpointPathInConfig(self):
-    config_path = os.path.join(self.log_dir, 'projector_config.pbtxt')
-    config = projector_config_pb2.ProjectorConfig()
-    config.model_checkpoint_path = 'does_not_exist'
-    embedding = config.embeddings.add()
-    embedding.tensor_name = 'var1'
-    with tf.gfile.GFile(config_path, 'w') as f:
-      f.write(text_format.MessageToString(config))
-    self._SetupWSGIApp()
-
-    run_json = self._GetJson('/data/plugin/projector/runs')
-    self.assertEqual(run_json, [])
-
-  def testInfoWithValidCheckpointNoEventsData(self):
-    self._GenerateProjectorTestData()
-    self._SetupWSGIApp()
-
-    info_json = self._GetJson('/data/plugin/projector/info?run=.')
-    self.assertItemsEqual(info_json['embeddings'], [{
-        'tensorShape': [1, 2],
-        'tensorName': 'var1',
-        'bookmarksPath': 'bookmarks.json'
-    }, {
-        'tensorShape': [10, 10],
-        'tensorName': 'var2'
-    }, {
-        'tensorShape': [100, 100],
-        'tensorName': 'var3'
-    }])
-
-  def testInfoWithValidCheckpointAndEventsData(self):
-    self._GenerateProjectorTestData()
-    self._GenerateEventsData()
-    self._SetupWSGIApp()
-
-    run_json = self._GetJson('/data/plugin/projector/runs')
-    self.assertTrue(run_json)
-    run = run_json[0]
-    info_json = self._GetJson('/data/plugin/projector/info?run=%s' % run)
-    self.assertItemsEqual(info_json['embeddings'], [{
-        'tensorShape': [1, 2],
-        'tensorName': 'var1',
-        'bookmarksPath': 'bookmarks.json'
-    }, {
-        'tensorShape': [10, 10],
-        'tensorName': 'var2'
-    }, {
-        'tensorShape': [100, 100],
-        'tensorName': 'var3'
-    }])
-
-  def testTensorWithValidCheckpoint(self):
-    self._GenerateProjectorTestData()
-    self._SetupWSGIApp()
-
-    url = '/data/plugin/projector/tensor?run=.&name=var1'
-    tensor_bytes = self._Get(url).data
-    expected_tensor = np.array([[6, 6]], dtype=np.float32)
-    self._AssertTensorResponse(tensor_bytes, expected_tensor)
-
-  def testBookmarksRequestMissingRunAndName(self):
-    self._GenerateProjectorTestData()
-    self._SetupWSGIApp()
-
-    url = '/data/plugin/projector/bookmarks'
-    self.assertEqual(self._Get(url).status_code, 400)
-
-  def testBookmarksRequestMissingName(self):
-    self._GenerateProjectorTestData()
-    self._SetupWSGIApp()
-
-    url = '/data/plugin/projector/bookmarks?run=.'
-    self.assertEqual(self._Get(url).status_code, 400)
-
-  def testBookmarksRequestMissingRun(self):
-    self._GenerateProjectorTestData()
-    self._SetupWSGIApp()
-
-    url = '/data/plugin/projector/bookmarks?name=var1'
-    self.assertEqual(self._Get(url).status_code, 400)
-
-  def testBookmarksUnknownRun(self):
-    self._GenerateProjectorTestData()
-    self._SetupWSGIApp()
-
-    url = '/data/plugin/projector/bookmarks?run=unknown&name=var1'
-    self.assertEqual(self._Get(url).status_code, 400)
-
-  def testBookmarksUnknownName(self):
-    self._GenerateProjectorTestData()
-    self._SetupWSGIApp()
-
-    url = '/data/plugin/projector/bookmarks?run=.&name=unknown'
-    self.assertEqual(self._Get(url).status_code, 400)
-
-  def testBookmarks(self):
-    self._GenerateProjectorTestData()
-    self._SetupWSGIApp()
-
-    url = '/data/plugin/projector/bookmarks?run=.&name=var1'
-    bookmark = self._GetJson(url)
-    self.assertEqual(bookmark, {'a': 'b'})
-
-  def testEndpointsNoAssets(self):
-    g = tf.Graph()
-
-    fw = tf.summary.FileWriter(self.log_dir, graph=g)
-    fw.close()
-
-    self._SetupWSGIApp()
-    run_json = self._GetJson('/data/plugin/projector/runs')
-    self.assertEqual(run_json, [])
-
-  def _AssertTensorResponse(self, tensor_bytes, expected_tensor):
-    tensor = np.reshape(np.fromstring(tensor_bytes, dtype=np.float32),
-                        expected_tensor.shape)
-    self.assertTrue(np.array_equal(tensor, expected_tensor))
-
-  def testPluginIsActive(self):
-    self._GenerateProjectorTestData()
-    self._SetupWSGIApp()
-
-    # Embedding data is available.
-    self.assertTrue(self.plugin.is_active())
-
-  def testPluginIsNotActive(self):
-    self._SetupWSGIApp()
-
-    # Embedding data is not available.
-    self.assertFalse(self.plugin.is_active())
-
-  def _SetupWSGIApp(self):
-    multiplexer = event_multiplexer.EventMultiplexer(
-        size_guidance=application.DEFAULT_SIZE_GUIDANCE,
-        purge_orphaned_data=True)
-    self.plugin = projector_plugin.ProjectorPlugin()
-    wsgi_app = application.TensorBoardWSGIApp(
-        self.log_dir, [self.plugin], multiplexer, reload_interval=0)
-    self.server = werkzeug_test.Client(wsgi_app, wrappers.BaseResponse)
-
-  def _Get(self, path):
-    return self.server.get(path)
-
-  def _GetJson(self, path):
-    response = self.server.get(path)
-    data = response.data
-    if response.headers.get('Content-Encoding') == 'gzip':
-      data = gzip.GzipFile('', 'rb', 9, io.BytesIO(data)).read()
-    return json.loads(data.decode('utf-8'))
-
-  def _GenerateEventsData(self):
-    fw = tf.summary.FileWriter(self.log_dir)
-    event = tf.Event(
-        wall_time=1,
-        step=1,
-        summary=tf.Summary(value=[tf.Summary.Value(tag='s1', simple_value=0)]))
-    fw.add_event(event)
-    fw.close()
-
-  def _GenerateProjectorTestData(self):
-    config_path = os.path.join(self.log_dir, 'projector_config.pbtxt')
-    config = projector_config_pb2.ProjectorConfig()
-    embedding = config.embeddings.add()
-    # Add an embedding by its canonical tensor name.
-    embedding.tensor_name = 'var1:0'
-
-    with tf.gfile.GFile(os.path.join(self.log_dir, 'bookmarks.json'), 'w') as f:
-      f.write('{"a": "b"}')
-    embedding.bookmarks_path = 'bookmarks.json'
-
-    config_pbtxt = text_format.MessageToString(config)
-    with tf.gfile.GFile(config_path, 'w') as f:
-      f.write(config_pbtxt)
-
-    # Write a checkpoint with some dummy variables.
-    with tf.Graph().as_default():
-      sess = tf.Session()
-      checkpoint_path = os.path.join(self.log_dir, 'model')
-      tf.get_variable('var1', [1, 2], initializer=tf.constant_initializer(6.0))
-      tf.get_variable('var2', [10, 10])
-      tf.get_variable('var3', [100, 100])
-      sess.run(tf.global_variables_initializer())
-      saver = tf.train.Saver(write_version=tf.train.SaverDef.V1)
-      saver.save(sess, checkpoint_path)
-
-
-class MetadataColumnsTest(tf.test.TestCase):
-
-  def testLengthDoesNotMatch(self):
-    metadata = projector_plugin.EmbeddingMetadata(10)
-
-    with self.assertRaises(ValueError):
-      metadata.add_column('Labels', [''] * 11)
-
-  def testValuesNot1D(self):
-    metadata = projector_plugin.EmbeddingMetadata(3)
-    values = np.array([[1, 2, 3]])
-
-    with self.assertRaises(ValueError):
-      metadata.add_column('Labels', values)
-
-  def testMultipleColumnsRetrieval(self):
-    metadata = projector_plugin.EmbeddingMetadata(3)
-    metadata.add_column('Sizes', [1, 2, 3])
-    metadata.add_column('Labels', ['a', 'b', 'c'])
-    self.assertEqual(metadata.column_names, ['Sizes', 'Labels'])
-    self.assertEqual(metadata.name_to_values['Labels'], ['a', 'b', 'c'])
-    self.assertEqual(metadata.name_to_values['Sizes'], [1, 2, 3])
-
-  def testValuesAreListofLists(self):
-    metadata = projector_plugin.EmbeddingMetadata(3)
-    values = [[1, 2, 3], [4, 5, 6]]
-    with self.assertRaises(ValueError):
-      metadata.add_column('Labels', values)
-
-  def testStringListRetrieval(self):
-    metadata = projector_plugin.EmbeddingMetadata(3)
-    metadata.add_column('Labels', ['a', 'B', 'c'])
-    self.assertEqual(metadata.name_to_values['Labels'], ['a', 'B', 'c'])
-    self.assertEqual(metadata.column_names, ['Labels'])
-
-  def testNumericListRetrieval(self):
-    metadata = projector_plugin.EmbeddingMetadata(3)
-    metadata.add_column('Labels', [1, 2, 3])
-    self.assertEqual(metadata.name_to_values['Labels'], [1, 2, 3])
-
-  def testNumericNdArrayRetrieval(self):
-    metadata = projector_plugin.EmbeddingMetadata(3)
-    metadata.add_column('Labels', np.array([1, 2, 3]))
-    self.assertEqual(metadata.name_to_values['Labels'].tolist(), [1, 2, 3])
-
-  def testStringNdArrayRetrieval(self):
-    metadata = projector_plugin.EmbeddingMetadata(2)
-    metadata.add_column('Labels', np.array(['a', 'b']))
-    self.assertEqual(metadata.name_to_values['Labels'].tolist(), ['a', 'b'])
-
-  def testDuplicateColumnName(self):
-    metadata = projector_plugin.EmbeddingMetadata(2)
-    metadata.add_column('Labels', np.array(['a', 'b']))
-    with self.assertRaises(ValueError):
-      metadata.add_column('Labels', np.array(['a', 'b']))
-
-
-class LRUCacheTest(tf.test.TestCase):
-
-  def testInvalidSize(self):
-    with self.assertRaises(ValueError):
-      projector_plugin.LRUCache(0)
-
-  def testSimpleGetAndSet(self):
-    cache = projector_plugin.LRUCache(1)
-    value = cache.get('a')
-    self.assertIsNone(value)
-    cache.set('a', 10)
-    self.assertEqual(cache.get('a'), 10)
-
-  def testErrorsWhenSettingNoneAsValue(self):
-    cache = projector_plugin.LRUCache(1)
-    with self.assertRaises(ValueError):
-      cache.set('a', None)
-
-  def testLRUReplacementPolicy(self):
-    cache = projector_plugin.LRUCache(2)
-    cache.set('a', 1)
-    cache.set('b', 2)
-    cache.set('c', 3)
-    self.assertIsNone(cache.get('a'))
-    self.assertEqual(cache.get('b'), 2)
-    self.assertEqual(cache.get('c'), 3)
-
-    # Make 'b' the most recently used.
-    cache.get('b')
-    cache.set('d', 4)
-
-    # Make sure 'c' got replaced with 'd'.
-    self.assertIsNone(cache.get('c'))
-    self.assertEqual(cache.get('b'), 2)
-    self.assertEqual(cache.get('d'), 4)
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/tensorboard/plugins/scalars/BUILD b/tensorflow/tensorboard/plugins/scalars/BUILD
deleted file mode 100644
index 1e95c7d701b4b3db9370b67e971c6241940a97ab..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/scalars/BUILD
+++ /dev/null
@@ -1,58 +0,0 @@
-# Description:
-# TensorBoard plugin for scalars
-
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-## Scalars Plugin ##
-py_library(
-    name = "scalars_plugin",
-    srcs = ["scalars_plugin.py"],
-    srcs_version = "PY2AND3",
-    visibility = [
-        "//tensorflow/tensorboard:internal",
-    ],
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/tensorboard/backend:http_util",
-        "//tensorflow/tensorboard/backend/event_processing:event_accumulator",
-        "//tensorflow/tensorboard/plugins:base_plugin",
-        "@org_pocoo_werkzeug//:werkzeug",
-        "@six_archive//:six",
-    ],
-)
-
-py_test(
-    name = "scalars_plugin_test",
-    size = "small",
-    srcs = ["scalars_plugin_test.py"],
-    main = "scalars_plugin_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
-        ":scalars_plugin",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/tensorboard/backend:application",
-        "//tensorflow/tensorboard/backend/event_processing:event_multiplexer",
-        "@org_pocoo_werkzeug//:werkzeug",
-        "@six_archive//:six",
-    ],
-)
-
-py_binary(
-    name = "scalars_demo",
-    srcs = ["scalars_demo.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "@six_archive//:six",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    visibility = ["//tensorflow:__pkg__"],
-)
diff --git a/tensorflow/tensorboard/plugins/scalars/scalars_demo.py b/tensorflow/tensorboard/plugins/scalars/scalars_demo.py
deleted file mode 100644
index f3195fd849b3ca1ab4a763a22eb02a191d4073e0..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/scalars/scalars_demo.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Sample data exhibiting scalar summaries, via a temperature simulation."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os.path
-
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-
-# Directory into which to write tensorboard data.
-LOGDIR = '/tmp/scalars_demo'
-
-# Duration of the simulation.
-STEPS = 1000
-
-
-def run(logdir, run_name,
-        initial_temperature, ambient_temperature, heat_coefficient):
-  """Run a temperature simulation.
-
-  This will simulate an object at temperature `initial_temperature`
-  sitting at rest in a large room at temperature `ambient_temperature`.
-  The object has some intrinsic `heat_coefficient`, which indicates
-  how much thermal conductivity it has: for instance, metals have high
-  thermal conductivity, while the thermal conductivity of water is low.
-
-  Over time, the object's temperature will adjust to match the
-  temperature of its environment. We'll track the object's temperature,
-  how far it is from the room's temperature, and how much it changes at
-  each time step.
-
-  Arguments:
-    logdir: the top-level directory into which to write summary data
-    run_name: the name of this run; will be created as a subdirectory
-      under logdir
-    initial_temperature: float; the object's initial temperature
-    ambient_temperature: float; the temperature of the enclosing room
-    heat_coefficient: float; a measure of the object's thermal
-      conductivity
-  """
-  tf.reset_default_graph()
-  tf.set_random_seed(0)
-
-  with tf.name_scope('temperature'):
-    # Create a mutable variable to hold the object's temperature, and
-    # create a scalar summary to track its value over time. The name of
-    # the summary will appear as "temperature/current" due to the
-    # name-scope above.
-    temperature = tf.Variable(tf.constant(initial_temperature),
-                              name='temperature')
-    tf.summary.scalar('current', temperature)
-
-    # Compute how much the object's temperature differs from that of its
-    # environment, and track this, too: likewise, as
-    # "temperature/difference_to_ambient".
-    ambient_difference = temperature - ambient_temperature
-    tf.summary.scalar('difference_to_ambient', ambient_difference)
-
-  # Newton suggested that the rate of change of the temperature of an
-  # object is directly proportional to this `ambient_difference` above,
-  # where the proportionality constant is what we called the heat
-  # coefficient. But in real life, not everything is quite so clean, so
-  # we'll add in some noise. (The value of 50 is arbitrary, chosen to
-  # make the data look somewhat interesting. :-) )
-  noise = 50 * tf.random_normal([])
-  delta = -heat_coefficient * (ambient_difference + noise)
-  tf.summary.scalar('delta', delta)
-
-  # Now, augment the current temperature by this delta that we computed.
-  update_step = temperature.assign_add(delta)
-
-  # Collect all the scalars that we want to keep track of.
-  summ = tf.summary.merge_all()
-
-  sess = tf.Session()
-  writer = tf.summary.FileWriter(os.path.join(logdir, run_name))
-  writer.add_graph(sess.graph)
-  sess.run(tf.global_variables_initializer())
-  for step in xrange(STEPS):
-    # By asking TensorFlow to compute the update step, we force it to
-    # change the value of the temperature variable. We don't actually
-    # care about this value, so we discard it; instead, we grab the
-    # summary data computed along the way.
-    (s, _) = sess.run([summ, update_step])
-    writer.add_summary(s, global_step=step)
-  writer.close()
-
-
-def run_all(logdir, verbose=False):
-  """Run simulations on a reasonable set of parameters.
-
-  Arguments:
-    logdir: the directory into which to store all the runs' data
-    verbose: if true, print out each run's name as it begins
-  """
-  for initial_temperature in [270.0, 310.0, 350.0]:
-    for final_temperature in [270.0, 310.0, 350.0]:
-      for heat_coefficient in [0.001, 0.005]:
-        run_name = 'temperature:t0=%g,tA=%g,kH=%g' % (
-            initial_temperature, final_temperature, heat_coefficient)
-        if verbose:
-          print('--- Running: %s' % run_name)
-        run(logdir, run_name,
-            initial_temperature, final_temperature, heat_coefficient)
-
-
-def main(unused_argv):
-  print('Saving output to %s.' % LOGDIR)
-  run_all(LOGDIR, verbose=True)
-  print('Done. Output saved to %s.' % LOGDIR)
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tensorflow/tensorboard/plugins/scalars/scalars_plugin.py b/tensorflow/tensorboard/plugins/scalars/scalars_plugin.py
deleted file mode 100644
index ec4bf1089c10b8274fbd4c615da6eed6f41cbad1..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/scalars/scalars_plugin.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""The TensorBoard Scalars plugin."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import csv
-
-from six import StringIO
-from werkzeug import wrappers
-
-from tensorflow.tensorboard.backend import http_util
-from tensorflow.tensorboard.backend.event_processing import event_accumulator
-from tensorflow.tensorboard.plugins import base_plugin
-
-_PLUGIN_PREFIX_ROUTE = event_accumulator.SCALARS
-
-
-class OutputFormat(object):
-  """An enum used to list the valid output formats for API calls."""
-  JSON = 'json'
-  CSV = 'csv'
-
-
-class ScalarsPlugin(base_plugin.TBPlugin):
-  """Scalars Plugin for TensorBoard."""
-
-  plugin_name = _PLUGIN_PREFIX_ROUTE
-
-  def get_plugin_apps(self, multiplexer, unused_logdir):
-    self._multiplexer = multiplexer
-    return {
-        '/scalars': self.scalars_route,
-        '/tags': self.tags_route,
-    }
-
-  def is_active(self):
-    """The scalars plugin is active iff any run has at least one scalar tag."""
-    return any(self.index_impl().values())
-
-  def index_impl(self):
-    return {
-        run_name: run_data[event_accumulator.SCALARS]
-        for (run_name, run_data) in self._multiplexer.Runs().items()
-        if event_accumulator.SCALARS in run_data
-    }
-
-  def scalars_impl(self, tag, run, output_format):
-    """Result of the form `(body, mime_type)`."""
-    values = self._multiplexer.Scalars(run, tag)
-    if output_format == OutputFormat.CSV:
-      string_io = StringIO()
-      writer = csv.writer(string_io)
-      writer.writerow(['Wall time', 'Step', 'Value'])
-      writer.writerows(values)
-      return (string_io.getvalue(), 'text/csv')
-    else:
-      return (values, 'application/json')
-
-  @wrappers.Request.application
-  def tags_route(self, request):
-    index = self.index_impl()
-    return http_util.Respond(request, index, 'application/json')
-
-  @wrappers.Request.application
-  def scalars_route(self, request):
-    """Given a tag and single run, return array of ScalarEvents."""
-    # TODO(cassandrax): return HTTP status code for malformed requests
-    tag = request.args.get('tag')
-    run = request.args.get('run')
-    output_format = request.args.get('format')
-    (body, mime_type) = self.scalars_impl(tag, run, output_format)
-    return http_util.Respond(request, body, mime_type)
diff --git a/tensorflow/tensorboard/plugins/scalars/scalars_plugin_test.py b/tensorflow/tensorboard/plugins/scalars/scalars_plugin_test.py
deleted file mode 100644
index fb6ed0bc3f254ee331838ce04842961dcf87bb30..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/scalars/scalars_plugin_test.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Integration tests for the Scalars Plugin."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import csv
-import os.path
-
-from six import StringIO
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-
-from tensorflow.tensorboard.backend.event_processing import event_multiplexer
-from tensorflow.tensorboard.plugins.scalars import scalars_plugin
-
-
-class ScalarsPluginTest(tf.test.TestCase):
-
-  _STEPS = 99
-
-  _SCALAR_TAG = 'simple-values'
-  _HISTOGRAM_TAG = 'complicated-values'
-
-  _RUN_WITH_SCALARS = '_RUN_WITH_SCALARS'
-  _RUN_WITH_HISTOGRAM = '_RUN_WITH_HISTOGRAM'
-
-  def set_up_with_runs(self, run_names):
-    self.logdir = self.get_temp_dir()
-    for run_name in run_names:
-      self.generate_run(run_name)
-    multiplexer = event_multiplexer.EventMultiplexer()
-    multiplexer.AddRunsFromDirectory(self.logdir)
-    multiplexer.Reload()
-    self.plugin = scalars_plugin.ScalarsPlugin()
-    self.apps = self.plugin.get_plugin_apps(multiplexer, None)
-
-  def generate_run(self, run_name):
-    if run_name == self._RUN_WITH_SCALARS:
-      (use_scalars, use_histogram) = (True, False)
-    elif run_name == self._RUN_WITH_HISTOGRAM:
-      (use_scalars, use_histogram) = (False, True)
-    else:
-      assert False, 'Invalid run name: %r' % run_name
-    tf.reset_default_graph()
-    sess = tf.Session()
-    if use_scalars:
-      scalar_placeholder = tf.placeholder(tf.int64)
-      tf.summary.scalar(self._SCALAR_TAG, scalar_placeholder)
-    if use_histogram:
-      histogram_placeholder = tf.placeholder(tf.float32, shape=[3])
-      tf.summary.histogram(self._HISTOGRAM_TAG, histogram_placeholder)
-    summ = tf.summary.merge_all()
-
-    subdir = os.path.join(self.logdir, run_name)
-    writer = tf.summary.FileWriter(subdir)
-    writer.add_graph(sess.graph)
-    for step in xrange(self._STEPS):
-      feed_dict = {}
-      if use_scalars:
-        feed_dict[scalar_placeholder] = int((43**step) % 47)
-      if use_histogram:
-        feed_dict[histogram_placeholder] = [1 + step, 2 + step, 3 + step]
-      s = sess.run(summ, feed_dict=feed_dict)
-      writer.add_summary(s, global_step=step)
-    writer.close()
-
-  def test_index(self):
-    self.set_up_with_runs([self._RUN_WITH_SCALARS, self._RUN_WITH_HISTOGRAM])
-    self.assertEqual({
-        self._RUN_WITH_SCALARS: [self._SCALAR_TAG],
-        self._RUN_WITH_HISTOGRAM: [],
-    }, self.plugin.index_impl())
-
-  def _test_scalars_json(self, run_name, should_have_scalars):
-    self.set_up_with_runs([self._RUN_WITH_SCALARS, self._RUN_WITH_HISTOGRAM])
-    if should_have_scalars:
-      (data, mime_type) = self.plugin.scalars_impl(
-          self._SCALAR_TAG, run_name, scalars_plugin.OutputFormat.JSON)
-      self.assertEqual('application/json', mime_type)
-      self.assertEqual(len(data), self._STEPS)
-    else:
-      with self.assertRaises(KeyError):
-        self.plugin.scalars_impl(self._SCALAR_TAG, run_name,
-                                 scalars_plugin.OutputFormat.JSON)
-
-  def _test_scalars_csv(self, run_name, should_have_scalars):
-    self.set_up_with_runs([self._RUN_WITH_SCALARS, self._RUN_WITH_HISTOGRAM])
-    if should_have_scalars:
-      (data, mime_type) = self.plugin.scalars_impl(
-          self._SCALAR_TAG, run_name, scalars_plugin.OutputFormat.CSV)
-      self.assertEqual('text/csv', mime_type)
-      s = StringIO(data)
-      reader = csv.reader(s)
-      self.assertEqual(['Wall time', 'Step', 'Value'], next(reader))
-      self.assertEqual(len(list(reader)), self._STEPS)
-    else:
-      with self.assertRaises(KeyError):
-        self.plugin.scalars_impl(self._SCALAR_TAG, run_name,
-                                 scalars_plugin.OutputFormat.CSV)
-
-  def test_scalars_json_with_scalars(self):
-    self._test_scalars_json(self._RUN_WITH_SCALARS, True)
-
-  def test_scalars_json_with_histogram(self):
-    self._test_scalars_json(self._RUN_WITH_HISTOGRAM, False)
-
-  def test_scalars_csv_with_scalars(self):
-    self._test_scalars_csv(self._RUN_WITH_SCALARS, True)
-
-  def test_scalars_csv_with_histogram(self):
-    self._test_scalars_csv(self._RUN_WITH_HISTOGRAM, False)
-
-  def test_active_with_scalars(self):
-    self.set_up_with_runs([self._RUN_WITH_SCALARS])
-    self.assertTrue(self.plugin.is_active())
-
-  def test_active_with_histogram(self):
-    self.set_up_with_runs([self._RUN_WITH_HISTOGRAM])
-    self.assertFalse(self.plugin.is_active())
-
-  def test_active_with_both(self):
-    self.set_up_with_runs([self._RUN_WITH_SCALARS, self._RUN_WITH_HISTOGRAM])
-    self.assertTrue(self.plugin.is_active())
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/tensorboard/plugins/text/BUILD b/tensorflow/tensorboard/plugins/text/BUILD
deleted file mode 100644
index 8d30a01c017d675348fa0c8c08a3bf5d826b7fbf..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/text/BUILD
+++ /dev/null
@@ -1,49 +0,0 @@
-# Description:
-# TensorBoard plugin for the Text
-
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-## Text Plugin ##
-py_library(
-    name = "text_plugin",
-    srcs = ["text_plugin.py"],
-    srcs_version = "PY2AND3",
-    visibility = [
-        "//tensorflow/tensorboard:internal",
-    ],
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/tensorboard/backend:http_util",
-        "//tensorflow/tensorboard/plugins:base_plugin",
-        "@org_mozilla_bleach",
-        "@org_pocoo_werkzeug//:werkzeug",
-        "@org_pythonhosted_markdown",
-        "@six_archive//:six",
-    ],
-)
-
-py_test(
-    name = "text_plugin_test",
-    size = "small",
-    srcs = ["text_plugin_test.py"],
-    main = "text_plugin_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
-        ":text_plugin",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/tensorboard/backend:application",
-        "//tensorflow/tensorboard/backend/event_processing:event_multiplexer",
-        "@org_pocoo_werkzeug//:werkzeug",
-        "@six_archive//:six",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    visibility = ["//tensorflow:__pkg__"],
-)
diff --git a/tensorflow/tensorboard/plugins/text/text_plugin.py b/tensorflow/tensorboard/plugins/text/text_plugin.py
deleted file mode 100644
index d0040e20be465bc3dd61792b5019103a290b802e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/text/text_plugin.py
+++ /dev/null
@@ -1,304 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""The TensorBoard Text plugin."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import json
-import textwrap
-
-# pylint: disable=g-bad-import-order
-# Necessary for an internal test with special behavior for numpy.
-import numpy as np
-# pylint: enable=g-bad-import-order
-
-import bleach
-# pylint: disable=g-bad-import-order
-# Google-only: import markdown_freewisdom
-import markdown
-import six
-# pylint: enable=g-bad-import-order
-import tensorflow as tf
-from werkzeug import wrappers
-
-from tensorflow.tensorboard.backend import http_util
-from tensorflow.tensorboard.plugins import base_plugin
-
-# The prefix of routes provided by this plugin.
-_PLUGIN_PREFIX_ROUTE = 'text'
-
-# HTTP routes
-RUNS_ROUTE = '/runs'
-TEXT_ROUTE = '/text'
-
-ALLOWED_TAGS = [
-    'ul',
-    'ol',
-    'li',
-    'p',
-    'pre',
-    'code',
-    'blockquote',
-    'h1',
-    'h2',
-    'h3',
-    'h4',
-    'h5',
-    'h6',
-    'hr',
-    'br',
-    'strong',
-    'em',
-    'a',
-    'img',
-    'table',
-    'thead',
-    'tbody',
-    'td',
-    'tr',
-    'th',
-]
-
-ALLOWED_ATTRIBUTES = {'a': ['href', 'title'], 'img': ['src', 'title', 'alt']}
-
-WARNING_TEMPLATE = textwrap.dedent("""\
-  **Warning:** This text summary contained data of dimensionality %d, but only \
-  2d tables are supported. Showing a 2d slice of the data instead.""")
-
-
-def markdown_and_sanitize(markdown_string):
-  """Takes a markdown string and converts it into sanitized html.
-
-  It uses the table extension; while that's not a part of standard
-  markdown, it is sure to be useful for TensorBoard users.
-
-  The sanitizer uses the allowed_tags and attributes specified above. Mostly,
-  we ensure that our standard use cases like tables and links are supported.
-
-  Args:
-    markdown_string: Markdown string to sanitize
-
-  Returns:
-    a string containing sanitized html for input markdown
-  """
-  # Convert to utf-8 whenever we have a binary input.
-  if isinstance(markdown_string, six.binary_type):
-    markdown_string = markdown_string.decode('utf-8')
-
-  string_html = markdown.markdown(
-      markdown_string, extensions=['markdown.extensions.tables'])
-  string_sanitized = bleach.clean(
-      string_html, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES)
-  return string_sanitized
-
-
-def make_table_row(contents, tag='td'):
-  """Given an iterable of string contents, make a table row.
-
-  Args:
-    contents: An iterable yielding strings.
-    tag: The tag to place contents in. Defaults to 'td', you might want 'th'.
-
-  Returns:
-    A string containing the content strings, organized into a table row.
-
-  Example: make_table_row(['one', 'two', 'three']) == '''
-  <tr>
-  <td>one</td>
-  <td>two</td>
-  <td>three</td>
-  </tr>'''
-  """
-  columns = ('<%s>%s</%s>\n' % (tag, s, tag) for s in contents)
-  return '<tr>\n' + ''.join(columns) + '</tr>\n'
-
-
-def make_table(contents, headers=None):
-  """Given a numpy ndarray of strings, concatenate them into a html table.
-
-  Args:
-    contents: A np.ndarray of strings. May be 1d or 2d. In the 1d case, the
-      table is laid out vertically (i.e. row-major).
-    headers: A np.ndarray or list of string header names for the table.
-
-  Returns:
-    A string containing all of the content strings, organized into a table.
-
-  Raises:
-    ValueError: If contents is not a np.ndarray.
-    ValueError: If contents is not 1d or 2d.
-    ValueError: If contents is empty.
-    ValueError: If headers is present and not a list, tuple, or ndarray.
-    ValueError: If headers is not 1d.
-    ValueError: If number of elements in headers does not correspond to number
-      of columns in contents.
-  """
-  if not isinstance(contents, np.ndarray):
-    raise ValueError('make_table contents must be a numpy ndarray')
-
-  if contents.ndim not in [1, 2]:
-    raise ValueError('make_table requires a 1d or 2d numpy array, was %dd' %
-                     contents.ndim)
-
-  if headers:
-    if isinstance(headers, list) or isinstance(headers, tuple):
-      headers = np.array(headers)
-    if not isinstance(headers, np.ndarray):
-      raise ValueError('Could not convert headers %s into np.ndarray' % headers)
-    if headers.ndim != 1:
-      raise ValueError('Headers must be 1d, is %dd' % headers.ndim)
-    expected_n_columns = contents.shape[1] if contents.ndim == 2 else 1
-    if headers.shape[0] != expected_n_columns:
-      raise ValueError('Number of headers %d must match number of columns %d' %
-                       (headers.shape[0], expected_n_columns))
-    header = '<thead>\n%s</thead>\n' % make_table_row(headers, tag='th')
-  else:
-    header = ''
-
-  n_rows = contents.shape[0]
-  if contents.ndim == 1:
-    # If it's a vector, we need to wrap each element in a new list, otherwise
-    # we would turn the string itself into a row (see test code)
-    rows = (make_table_row([contents[i]]) for i in range(n_rows))
-  else:
-    rows = (make_table_row(contents[i, :]) for i in range(n_rows))
-
-  return '<table>\n%s<tbody>\n%s</tbody>\n</table>' % (header, ''.join(rows))
-
-
-def reduce_to_2d(arr):
-  """Given a np.npdarray with nDims > 2, reduce it to 2d.
-
-  It does this by selecting the zeroth coordinate for every dimension greater
-  than two.
-
-  Args:
-    arr: a numpy ndarray of dimension at least 2.
-
-  Returns:
-    A two-dimensional subarray from the input array.
-
-  Raises:
-    ValueError: If the argument is not a numpy ndarray, or the dimensionality
-      is too low.
-  """
-  if not isinstance(arr, np.ndarray):
-    raise ValueError('reduce_to_2d requires a numpy.ndarray')
-
-  ndims = len(arr.shape)
-  if ndims < 2:
-    raise ValueError('reduce_to_2d requires an array of dimensionality >=2')
-  # slice(None) is equivalent to `:`, so we take arr[0,0,...0,:,:]
-  slices = ([0] * (ndims - 2)) + [slice(None), slice(None)]
-  return arr[slices]
-
-
-def text_array_to_html(text_arr):
-  """Take a numpy.ndarray containing strings, and convert it into html.
-
-  If the ndarray contains a single scalar string, that string is converted to
-  html via our sanitized markdown parser. If it contains an array of strings,
-  the strings are individually converted to html and then composed into a table
-  using make_table. If the array contains dimensionality greater than 2,
-  all but two of the dimensions are removed, and a warning message is prefixed
-  to the table.
-
-  Args:
-    text_arr: A numpy.ndarray containing strings.
-
-  Returns:
-    The array converted to html.
-  """
-  if not text_arr.shape:
-    # It is a scalar. No need to put it in a table, just apply markdown
-    return markdown_and_sanitize(text_arr.astype(np.dtype(str)).tostring())
-  warning = ''
-  if len(text_arr.shape) > 2:
-    warning = markdown_and_sanitize(WARNING_TEMPLATE % len(text_arr.shape))
-    text_arr = reduce_to_2d(text_arr)
-
-  html_arr = [markdown_and_sanitize(x) for x in text_arr.reshape(-1)]
-  html_arr = np.array(html_arr).reshape(text_arr.shape)
-
-  return warning + make_table(html_arr)
-
-
-def process_string_tensor_event(event):
-  """Convert a TensorEvent into a JSON-compatible response."""
-  string_arr = tf.make_ndarray(event.tensor_proto)
-  html = text_array_to_html(string_arr)
-  return {
-      'wall_time': event.wall_time,
-      'step': event.step,
-      'text': html,
-  }
-
-
-class TextPlugin(base_plugin.TBPlugin):
-  """Text Plugin for TensorBoard."""
-
-  plugin_name = _PLUGIN_PREFIX_ROUTE
-
-  def index_impl(self):
-    run_to_series = {}
-    name = 'tensorboard_text'
-    run_to_assets = self.multiplexer.PluginAssets(name)
-
-    for run, assets in run_to_assets.items():
-      if 'tensors.json' in assets:
-        tensors_json = self.multiplexer.RetrievePluginAsset(
-            run, name, 'tensors.json')
-        tensors = json.loads(tensors_json)
-        run_to_series[run] = tensors
-    return run_to_series
-
-  @wrappers.Request.application
-  def runs_route(self, request):
-    index = self.index_impl()
-    return http_util.Respond(request, index, 'application/json')
-
-  def text_impl(self, run, tag):
-    try:
-      text_events = self.multiplexer.Tensors(run, tag)
-    except KeyError:
-      text_events = []
-    responses = [process_string_tensor_event(ev) for ev in text_events]
-    return responses
-
-  @wrappers.Request.application
-  def text_route(self, request):
-    run = request.args.get('run')
-    tag = request.args.get('tag')
-    response = self.text_impl(run, tag)
-    return http_util.Respond(request, response, 'application/json')
-
-  def get_plugin_apps(self, multiplexer, unused_logdir):
-    self.multiplexer = multiplexer
-    return {
-        RUNS_ROUTE: self.runs_route,
-        TEXT_ROUTE: self.text_route,
-    }
-
-  def is_active(self):
-    """Determines whether this plugin is active.
-
-    This plugin is only active if TensorBoard sampled any text summaries.
-
-    Returns:
-      Whether this plugin is active.
-    """
-    return bool(self.index_impl())
diff --git a/tensorflow/tensorboard/plugins/text/text_plugin_test.py b/tensorflow/tensorboard/plugins/text/text_plugin_test.py
deleted file mode 100644
index 5c4f6bfd33304db6211ae1b869fde47f90ffd453..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/text/text_plugin_test.py
+++ /dev/null
@@ -1,408 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Integration tests for the Text Plugin."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import textwrap
-import numpy as np
-import tensorflow as tf
-
-from tensorflow.tensorboard.backend.event_processing import event_multiplexer
-from tensorflow.tensorboard.plugins.text import text_plugin
-
-GEMS = ['garnet', 'amethyst', 'pearl', 'steven']
-
-
-class TextPluginTest(tf.test.TestCase):
-
-  def setUp(self):
-    self.logdir = self.get_temp_dir()
-    self.generate_testdata()
-    multiplexer = event_multiplexer.EventMultiplexer()
-    multiplexer.AddRunsFromDirectory(self.logdir)
-    multiplexer.Reload()
-    self.plugin = text_plugin.TextPlugin()
-    self.apps = self.plugin.get_plugin_apps(multiplexer, None)
-
-  def assertConverted(self, actual, expected):
-    expected_html = text_plugin.markdown_and_sanitize(expected)
-    self.assertEqual(actual, expected_html)
-
-  def generate_testdata(self):
-    tf.reset_default_graph()
-    sess = tf.Session()
-    placeholder = tf.placeholder(tf.string)
-    summary_tensor = tf.summary.text('message', placeholder)
-
-    vector_summary = tf.summary.text('vector', placeholder)
-
-    run_names = ['fry', 'leela']
-    for run_name in run_names:
-      subdir = os.path.join(self.logdir, run_name)
-      writer = tf.summary.FileWriter(subdir)
-      writer.add_graph(sess.graph)
-
-      step = 0
-      for gem in GEMS:
-        message = run_name + ' *loves* ' + gem
-        feed_dict = {placeholder: message}
-        summ = sess.run(summary_tensor, feed_dict=feed_dict)
-        writer.add_summary(summ, global_step=step)
-        step += 1
-
-      vector_message = ['one', 'two', 'three', 'four']
-      summ = sess.run(vector_summary, feed_dict={placeholder: vector_message})
-      writer.add_summary(summ)
-      writer.close()
-
-  def testIndex(self):
-    index = self.plugin.index_impl()
-    self.assertEqual(index, {
-        'fry': ['message', 'vector'],
-        'leela': ['message', 'vector'],
-    })
-
-  def testText(self):
-    fry = self.plugin.text_impl('fry', 'message')
-    leela = self.plugin.text_impl('leela', 'message')
-    self.assertEqual(len(fry), 4)
-    self.assertEqual(len(leela), 4)
-    for i in range(4):
-      self.assertEqual(fry[i]['step'], i)
-      self.assertConverted(fry[i]['text'], 'fry *loves* ' + GEMS[i])
-      self.assertEqual(leela[i]['step'], i)
-      self.assertConverted(leela[i]['text'], 'leela *loves* ' + GEMS[i])
-
-    table = self.plugin.text_impl('fry', 'vector')[0]['text']
-    self.assertEqual(table,
-                     textwrap.dedent("""\
-      <table>
-      <tbody>
-      <tr>
-      <td><p>one</p></td>
-      </tr>
-      <tr>
-      <td><p>two</p></td>
-      </tr>
-      <tr>
-      <td><p>three</p></td>
-      </tr>
-      <tr>
-      <td><p>four</p></td>
-      </tr>
-      </tbody>
-      </table>"""))
-
-  def assertTextConverted(self, actual, expected):
-    self.assertEqual(text_plugin.markdown_and_sanitize(actual), expected)
-
-  def testMarkdownConversion(self):
-    emphasis = '*Italics1* _Italics2_ **bold1** __bold2__'
-    emphasis_converted = ('<p><em>Italics1</em> <em>Italics2</em> '
-                          '<strong>bold1</strong> <strong>bold2</strong></p>')
-
-    self.assertEqual(
-        text_plugin.markdown_and_sanitize(emphasis), emphasis_converted)
-
-    md_list = textwrap.dedent("""\
-    1. List item one.
-    2. List item two.
-      * Sublist
-      * Sublist2
-    1. List continues.
-    """)
-    md_list_converted = textwrap.dedent("""\
-    <ol>
-    <li>List item one.</li>
-    <li>List item two.</li>
-    <li>Sublist</li>
-    <li>Sublist2</li>
-    <li>List continues.</li>
-    </ol>""")
-    self.assertEqual(
-        text_plugin.markdown_and_sanitize(md_list), md_list_converted)
-
-    link = '[TensorFlow](http://tensorflow.org)'
-    link_converted = '<p><a href="http://tensorflow.org">TensorFlow</a></p>'
-    self.assertEqual(text_plugin.markdown_and_sanitize(link), link_converted)
-
-    table = textwrap.dedent("""\
-    An | Example | Table
-    --- | --- | ---
-    A | B | C
-    1 | 2 | 3
-    """)
-
-    table_converted = textwrap.dedent("""\
-    <table>
-    <thead>
-    <tr>
-    <th>An</th>
-    <th>Example</th>
-    <th>Table</th>
-    </tr>
-    </thead>
-    <tbody>
-    <tr>
-    <td>A</td>
-    <td>B</td>
-    <td>C</td>
-    </tr>
-    <tr>
-    <td>1</td>
-    <td>2</td>
-    <td>3</td>
-    </tr>
-    </tbody>
-    </table>""")
-
-    self.assertEqual(text_plugin.markdown_and_sanitize(table), table_converted)
-
-  def testSanitization(self):
-    dangerous = "<script>alert('xss')</script>"
-    sanitized = "&lt;script&gt;alert('xss')&lt;/script&gt;"
-    self.assertEqual(text_plugin.markdown_and_sanitize(dangerous), sanitized)
-
-    dangerous = textwrap.dedent("""\
-    hello <a name='n'
-    href='javascript:alert('xss')'>*you*</a>""")
-    sanitized = '<p>hello <a><em>you</em></a></p>'
-    self.assertEqual(text_plugin.markdown_and_sanitize(dangerous), sanitized)
-
-  def testTableGeneration(self):
-    array2d = np.array([['one', 'two'], ['three', 'four']])
-    expected_table = textwrap.dedent("""\
-    <table>
-    <tbody>
-    <tr>
-    <td>one</td>
-    <td>two</td>
-    </tr>
-    <tr>
-    <td>three</td>
-    <td>four</td>
-    </tr>
-    </tbody>
-    </table>""")
-    self.assertEqual(text_plugin.make_table(array2d), expected_table)
-
-    expected_table_with_headers = textwrap.dedent("""\
-    <table>
-    <thead>
-    <tr>
-    <th>c1</th>
-    <th>c2</th>
-    </tr>
-    </thead>
-    <tbody>
-    <tr>
-    <td>one</td>
-    <td>two</td>
-    </tr>
-    <tr>
-    <td>three</td>
-    <td>four</td>
-    </tr>
-    </tbody>
-    </table>""")
-
-    actual_with_headers = text_plugin.make_table(array2d, headers=['c1', 'c2'])
-    self.assertEqual(actual_with_headers, expected_table_with_headers)
-
-    array_1d = np.array(['one', 'two', 'three', 'four', 'five'])
-    expected_1d = textwrap.dedent("""\
-    <table>
-    <tbody>
-    <tr>
-    <td>one</td>
-    </tr>
-    <tr>
-    <td>two</td>
-    </tr>
-    <tr>
-    <td>three</td>
-    </tr>
-    <tr>
-    <td>four</td>
-    </tr>
-    <tr>
-    <td>five</td>
-    </tr>
-    </tbody>
-    </table>""")
-    self.assertEqual(text_plugin.make_table(array_1d), expected_1d)
-
-    expected_1d_with_headers = textwrap.dedent("""\
-    <table>
-    <thead>
-    <tr>
-    <th>X</th>
-    </tr>
-    </thead>
-    <tbody>
-    <tr>
-    <td>one</td>
-    </tr>
-    <tr>
-    <td>two</td>
-    </tr>
-    <tr>
-    <td>three</td>
-    </tr>
-    <tr>
-    <td>four</td>
-    </tr>
-    <tr>
-    <td>five</td>
-    </tr>
-    </tbody>
-    </table>""")
-    actual_1d_with_headers = text_plugin.make_table(array_1d, headers=['X'])
-    self.assertEqual(actual_1d_with_headers, expected_1d_with_headers)
-
-  def testMakeTableExceptions(self):
-    # Verify that contents is being type-checked and shape-checked.
-    with self.assertRaises(ValueError):
-      text_plugin.make_table([])
-
-    with self.assertRaises(ValueError):
-      text_plugin.make_table('foo')
-
-    with self.assertRaises(ValueError):
-      invalid_shape = np.full((3, 3, 3), 'nope', dtype=np.dtype('S3'))
-      text_plugin.make_table(invalid_shape)
-
-    # Test headers exceptions in 2d array case.
-    test_array = np.full((3, 3), 'foo', dtype=np.dtype('S3'))
-    with self.assertRaises(ValueError):
-      # Headers is wrong type.
-      text_plugin.make_table(test_array, headers='foo')
-    with self.assertRaises(ValueError):
-      # Too many headers.
-      text_plugin.make_table(test_array, headers=['foo', 'bar', 'zod', 'zoink'])
-    with self.assertRaises(ValueError):
-      # headers is 2d
-      text_plugin.make_table(test_array, headers=test_array)
-
-    # Also make sure the column counting logic works in the 1d array case.
-    test_array = np.array(['foo', 'bar', 'zod'])
-    with self.assertRaises(ValueError):
-      # Too many headers.
-      text_plugin.make_table(test_array, headers=test_array)
-
-  def test_reduce_to_2d(self):
-
-    def make_range_array(dim):
-      """Produce an incrementally increasing multidimensional array.
-
-      Args:
-        dim: the number of dimensions for the array
-
-      Returns:
-        An array of increasing integer elements, with dim dimensions and size
-        two in each dimension.
-
-      Example: rangeArray(2) results in [[0,1],[2,3]].
-      """
-      return np.array(range(2**dim)).reshape([2] * dim)
-
-    for i in range(2, 5):
-      actual = text_plugin.reduce_to_2d(make_range_array(i))
-      expected = make_range_array(2)
-      np.testing.assert_array_equal(actual, expected)
-
-  def test_text_array_to_html(self):
-
-    convert = text_plugin.text_array_to_html
-    scalar = np.array('foo')
-    scalar_expected = '<p>foo</p>'
-    self.assertEqual(convert(scalar), scalar_expected)
-
-    vector = np.array(['foo', 'bar'])
-    vector_expected = textwrap.dedent("""\
-      <table>
-      <tbody>
-      <tr>
-      <td><p>foo</p></td>
-      </tr>
-      <tr>
-      <td><p>bar</p></td>
-      </tr>
-      </tbody>
-      </table>""")
-    self.assertEqual(convert(vector), vector_expected)
-
-    d2 = np.array([['foo', 'bar'], ['zoink', 'zod']])
-    d2_expected = textwrap.dedent("""\
-      <table>
-      <tbody>
-      <tr>
-      <td><p>foo</p></td>
-      <td><p>bar</p></td>
-      </tr>
-      <tr>
-      <td><p>zoink</p></td>
-      <td><p>zod</p></td>
-      </tr>
-      </tbody>
-      </table>""")
-    self.assertEqual(convert(d2), d2_expected)
-
-    d3 = np.array([[['foo', 'bar'], ['zoink', 'zod']], [['FOO', 'BAR'],
-                                                        ['ZOINK', 'ZOD']]])
-
-    warning = text_plugin.markdown_and_sanitize(text_plugin.WARNING_TEMPLATE %
-                                                3)
-    d3_expected = warning + textwrap.dedent("""\
-      <table>
-      <tbody>
-      <tr>
-      <td><p>foo</p></td>
-      <td><p>bar</p></td>
-      </tr>
-      <tr>
-      <td><p>zoink</p></td>
-      <td><p>zod</p></td>
-      </tr>
-      </tbody>
-      </table>""")
-    self.assertEqual(convert(d3), d3_expected)
-
-  def testPluginIsActive(self):
-    plugin = text_plugin.TextPlugin()
-    multiplexer = event_multiplexer.EventMultiplexer()
-    plugin.get_plugin_apps(event_multiplexer.EventMultiplexer(), None)
-
-    # The plugin is inactive because text summaries are not available.
-    self.assertFalse(plugin.is_active())
-
-    multiplexer.AddRunsFromDirectory(self.logdir)
-    multiplexer.Reload()
-
-    # The plugin is active because text summaries are available.
-    self.assertTrue(self.plugin.is_active())
-
-  def testUnicode(self):
-    self.assertConverted(u'<p>Iñtërnâtiônàlizætiøn⚡💩</p>',
-                         'Iñtërnâtiônàlizætiøn⚡💩')
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/tensorboard/scripts/BUILD b/tensorflow/tensorboard/scripts/BUILD
deleted file mode 100644
index 05425ee61d05e3a0e540106a8c313205562b347c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/scripts/BUILD
+++ /dev/null
@@ -1,32 +0,0 @@
-# Description:
-# Some useful scripts that are bundled with TensorBoard.
-
-package(default_visibility = ["//tensorflow/tensorboard:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_binary(
-    name = "generate_testdata",
-    srcs = ["generate_testdata.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-    ],
-)
-
-py_binary(
-    name = "execrooter",
-    srcs = ["execrooter.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["*"]),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tensorboard/scripts/execrooter.py b/tensorflow/tensorboard/scripts/execrooter.py
deleted file mode 100644
index 65569b9151258dc692ec45223a4f9118ea803126..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/scripts/execrooter.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Utility for running programs in a symlinked execroot."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import json
-import os
-import shutil
-import subprocess
-import sys
-import tempfile
-
-
-def run(inputs, program, outputs):
-  """Creates temp symlink tree, runs program, and copies back outputs.
-
-  Args:
-    inputs: List of fake paths to real paths, which are used for symlink tree.
-    program: List containing real path of program and its arguments. The
-        execroot directory will be appended as the last argument.
-    outputs: List of fake outputted paths to copy back to real paths.
-  Returns:
-    0 if succeeded or nonzero if failed.
-  """
-  root = tempfile.mkdtemp()
-  try:
-    cwd = os.getcwd()
-    for fake, real in inputs:
-      parent = os.path.join(root, os.path.dirname(fake))
-      if not os.path.exists(parent):
-        os.makedirs(parent)
-      os.symlink(os.path.join(cwd, real), os.path.join(root, fake))
-    if subprocess.call(program + [root]) != 0:
-      return 1
-    for fake, real in outputs:
-      shutil.copyfile(os.path.join(root, fake), real)
-    return 0
-  finally:
-    shutil.rmtree(root)
-
-
-def main(args):
-  """Invokes run function using a JSON file config.
-
-  Args:
-    args: CLI args, which can be a JSON file containing an object whose
-        attributes are the parameters to the run function. If multiple JSON
-        files are passed, their contents are concatenated.
-  Returns:
-    0 if succeeded or nonzero if failed.
-  Raises:
-    Exception: If input data is missing.
-  """
-  if not args:
-    raise Exception('Please specify at least one JSON config path')
-  inputs = []
-  program = []
-  outputs = []
-  for arg in args:
-    with open(arg) as fd:
-      config = json.load(fd)
-    inputs.extend(config.get('inputs', []))
-    program.extend(config.get('program', []))
-    outputs.extend(config.get('outputs', []))
-  if not program:
-    raise Exception('Please specify a program')
-  return run(inputs, program, outputs)
-
-
-if __name__ == '__main__':
-  sys.exit(main(sys.argv[1:]))
diff --git a/tensorflow/tensorboard/scripts/generate_testdata.py b/tensorflow/tensorboard/scripts/generate_testdata.py
deleted file mode 100644
index f191d16a82dc9f771ea4f1d42a510625c157d119..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/scripts/generate_testdata.py
+++ /dev/null
@@ -1,225 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Generate some standard test data for debugging TensorBoard.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import bisect
-import math
-import os
-import os.path
-import random
-import shutil
-
-import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-
-
-tf.flags.DEFINE_string("target", None, """The directory where serialized data
-will be written""")
-
-tf.flags.DEFINE_boolean("overwrite", False, """Whether to remove and overwrite
-TARGET if it already exists.""")
-
-FLAGS = tf.flags.FLAGS
-
-# Hardcode a start time and reseed so script always generates the same data.
-_start_time = 0
-random.seed(0)
-
-
-def _MakeHistogramBuckets():
-  v = 1E-12
-  buckets = []
-  neg_buckets = []
-  while v < 1E20:
-    buckets.append(v)
-    neg_buckets.append(-v)
-    v *= 1.1
-  # Should include DBL_MAX, but won't bother for test data.
-  return neg_buckets[::-1] + [0] + buckets
-
-
-def _MakeHistogram(values):
-  """Convert values into a histogram proto using logic from histogram.cc."""
-  limits = _MakeHistogramBuckets()
-  counts = [0] * len(limits)
-  for v in values:
-    idx = bisect.bisect_left(limits, v)
-    counts[idx] += 1
-
-  limit_counts = [(limits[i], counts[i]) for i in xrange(len(limits))
-                  if counts[i]]
-  bucket_limit = [lc[0] for lc in limit_counts]
-  bucket = [lc[1] for lc in limit_counts]
-  sum_sq = sum(v * v for v in values)
-  return tf.HistogramProto(
-      min=min(values),
-      max=max(values),
-      num=len(values),
-      sum=sum(values),
-      sum_squares=sum_sq,
-      bucket_limit=bucket_limit,
-      bucket=bucket)
-
-
-def WriteScalarSeries(writer, tag, f, n=5):
-  """Write a series of scalar events to writer, using f to create values."""
-  step = 0
-  wall_time = _start_time
-  for i in xrange(n):
-    v = f(i)
-    value = tf.Summary.Value(tag=tag, simple_value=v)
-    summary = tf.Summary(value=[value])
-    event = tf.Event(wall_time=wall_time, step=step, summary=summary)
-    writer.add_event(event)
-    step += 1
-    wall_time += 10
-
-
-def WriteHistogramSeries(writer, tag, mu_sigma_tuples, n=20):
-  """Write a sequence of normally distributed histograms to writer."""
-  step = 0
-  wall_time = _start_time
-  for [mean, stddev] in mu_sigma_tuples:
-    data = [random.normalvariate(mean, stddev) for _ in xrange(n)]
-    histo = _MakeHistogram(data)
-    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=histo)])
-    event = tf.Event(wall_time=wall_time, step=step, summary=summary)
-    writer.add_event(event)
-    step += 10
-    wall_time += 100
-
-
-def WriteImageSeries(writer, tag, n_images=1):
-  """Write a few dummy images to writer."""
-  step = 0
-  session = tf.Session()
-  p = tf.placeholder("uint8", (1, 4, 4, 3))
-  s = tf.summary.image(tag, p)
-  for _ in xrange(n_images):
-    im = np.random.random_integers(0, 255, (1, 4, 4, 3))
-    summ = session.run(s, feed_dict={p: im})
-    writer.add_summary(summ, step)
-    step += 20
-  session.close()
-
-
-def WriteAudioSeries(writer, tag, n_audio=1):
-  """Write a few dummy audio clips to writer."""
-  step = 0
-  session = tf.Session()
-
-  min_frequency_hz = 440
-  max_frequency_hz = 880
-  sample_rate = 4000
-  duration_frames = sample_rate // 2  # 0.5 seconds.
-  frequencies_per_run = 1
-  num_channels = 2
-
-  p = tf.placeholder("float32", (frequencies_per_run, duration_frames,
-                                 num_channels))
-  s = tf.summary.audio(tag, p, sample_rate)
-
-  for _ in xrange(n_audio):
-    # Generate a different frequency for each channel to show stereo works.
-    frequencies = np.random.random_integers(
-        min_frequency_hz,
-        max_frequency_hz,
-        size=(frequencies_per_run, num_channels))
-    tiled_frequencies = np.tile(frequencies, (1, duration_frames))
-    tiled_increments = np.tile(
-        np.arange(0, duration_frames),
-        (num_channels, 1)).T.reshape(1, duration_frames * num_channels)
-    tones = np.sin(2.0 * np.pi * tiled_frequencies * tiled_increments /
-                   sample_rate)
-    tones = tones.reshape(frequencies_per_run, duration_frames, num_channels)
-
-    summ = session.run(s, feed_dict={p: tones})
-    writer.add_summary(summ, step)
-    step += 20
-  session.close()
-
-
-def GenerateTestData(path):
-  """Generates the test data directory."""
-  run1_path = os.path.join(path, "run1")
-  os.makedirs(run1_path)
-  writer1 = tf.summary.FileWriter(run1_path)
-  WriteScalarSeries(writer1, "foo/square", lambda x: x * x)
-  WriteScalarSeries(writer1, "bar/square", lambda x: x * x)
-  WriteScalarSeries(writer1, "foo/sin", math.sin)
-  WriteScalarSeries(writer1, "foo/cos", math.cos)
-  WriteHistogramSeries(writer1, "histo1", [[0, 1], [0.3, 1], [0.5, 1], [0.7, 1],
-                                           [1, 1]])
-  WriteImageSeries(writer1, "im1")
-  WriteImageSeries(writer1, "im2")
-  WriteAudioSeries(writer1, "au1")
-
-  run2_path = os.path.join(path, "run2")
-  os.makedirs(run2_path)
-  writer2 = tf.summary.FileWriter(run2_path)
-  WriteScalarSeries(writer2, "foo/square", lambda x: x * x * 2)
-  WriteScalarSeries(writer2, "bar/square", lambda x: x * x * 3)
-  WriteScalarSeries(writer2, "foo/cos", lambda x: math.cos(x) * 2)
-  WriteHistogramSeries(writer2, "histo1", [[0, 2], [0.3, 2], [0.5, 2], [0.7, 2],
-                                           [1, 2]])
-  WriteHistogramSeries(writer2, "histo2", [[0, 1], [0.3, 1], [0.5, 1], [0.7, 1],
-                                           [1, 1]])
-  WriteImageSeries(writer2, "im1")
-  WriteAudioSeries(writer2, "au2")
-
-  graph_def = tf.GraphDef()
-  node1 = graph_def.node.add()
-  node1.name = "a"
-  node1.op = "matmul"
-  node2 = graph_def.node.add()
-  node2.name = "b"
-  node2.op = "matmul"
-  node2.input.extend(["a:0"])
-
-  writer1.add_graph(graph_def)
-  node3 = graph_def.node.add()
-  node3.name = "c"
-  node3.op = "matmul"
-  node3.input.extend(["a:0", "b:0"])
-  writer2.add_graph(graph_def)
-  writer1.close()
-  writer2.close()
-
-
-def main(unused_argv=None):
-  target = FLAGS.target
-  if not target:
-    print("The --target flag is required.")
-    return -1
-  if os.path.exists(target):
-    if FLAGS.overwrite:
-      if os.path.isdir(target):
-        shutil.rmtree(target)
-      else:
-        os.remove(target)
-    else:
-      print("Refusing to overwrite target %s without --overwrite" % target)
-      return -2
-  GenerateTestData(target)
-
-
-if __name__ == "__main__":
-  tf.app.run()
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index d01342827dc26a80ac0d7f829c4e093afcf76abb..d31b9a6e3141b1a30055e3c7cc94c8aec9633675 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -72,6 +72,13 @@ def if_android_arm64(a):
   })
 
 
+def if_android_mips(a):
+  return select({
+      clean_dep("//tensorflow:android_mips"): a,
+      "//conditions:default": [],
+  })
+
+
 def if_not_android(a):
   return select({
       clean_dep("//tensorflow:android"): [],
@@ -79,6 +86,14 @@ def if_not_android(a):
   })
 
 
+def if_not_android_mips_and_mips64(a):
+  return select({
+      clean_dep("//tensorflow:android_mips"): [],
+      clean_dep("//tensorflow:android_mips64"): [],
+      "//conditions:default": a,
+  })
+
+
 def if_android(a):
   return select({
       clean_dep("//tensorflow:android"): a,
@@ -117,11 +132,9 @@ def if_not_windows(a):
   })
 
 
-def if_x86(a):
+def if_linux_x86_64(a):
   return select({
       clean_dep("//tensorflow:linux_x86_64"): a,
-      clean_dep("//tensorflow:windows"): a,
-      clean_dep("//tensorflow:windows_msvc"): a,
       "//conditions:default": [],
   })
 
@@ -138,17 +151,21 @@ WIN_COPTS = [
     "/DTF_COMPILE_LIBRARY",
     "/DEIGEN_HAS_C99_MATH",
     "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
+    "/DEIGEN_AVOID_STL_ARRAY",
+    "/Iexternal/gemmlowp",
+    "/wd4018", # -Wno-sign-compare
+    "/U_HAS_EXCEPTIONS", "/D_HAS_EXCEPTIONS=1", "/EHsc", # -fno-exceptions
 ]
 
 # LINT.IfChange
 def tf_copts():
-  return ([
+  return (if_not_windows([
       "-DEIGEN_AVOID_STL_ARRAY",
       "-Iexternal/gemmlowp",
       "-Wno-sign-compare",
       "-fno-exceptions",
-  ] + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1", "-fopenmp",]) + if_android_arm(
-      ["-mfpu=neon"]) + if_x86(["-msse3"]) + select({
+  ]) + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1", "-fopenmp",]) + if_android_arm(
+      ["-mfpu=neon"]) + if_linux_x86_64(["-msse3"]) + select({
           clean_dep("//tensorflow:android"): [
               "-std=c++11",
               "-DTF_LEAN_BINARY",
@@ -167,7 +184,7 @@ def tf_opts_nortti_if_android():
       "-fno-rtti",
       "-DGOOGLE_PROTOBUF_NO_RTTI",
       "-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER",
-  ]) + if_android_x86(["-msse4.1"])
+  ])
 
 
 # LINT.ThenChange(//tensorflow/contrib/android/cmake/CMakeLists.txt)
@@ -851,7 +868,7 @@ def cc_header_only_library(name, deps=[], **kwargs):
 
 def tf_custom_op_library_additional_deps():
   return [
-      "@protobuf//:protobuf_headers",
+      "@protobuf_archive//:protobuf_headers",
       clean_dep("//third_party/eigen3"),
       clean_dep("//tensorflow/core:framework_headers_lib"),
   ]
@@ -1021,9 +1038,9 @@ def tf_py_wrap_cc(name,
   native.cc_binary(
       name=cc_library_name,
       srcs=[module_name + ".cc"],
-      copts=(copts + [
+      copts=(copts + if_not_windows([
           "-Wno-self-assign", "-Wno-sign-compare", "-Wno-write-strings"
-      ] + tf_extension_copts()),
+      ]) + tf_extension_copts()),
       linkopts=tf_extension_linkopts() + extra_linkopts,
       linkstatic=1,
       linkshared=1,
diff --git a/tensorflow/tools/api/golden/tensorflow.-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/tensorflow.-f-i-f-o-queue.pbtxt
index 72cc53244768ad515c0ce33b937a2eae3a9fd98a..a095616c00cfe8fb64413e2078ae1589a423d2f4 100644
--- a/tensorflow/tools/api/golden/tensorflow.-f-i-f-o-queue.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-f-i-f-o-queue.pbtxt
@@ -55,6 +55,10 @@ tf_class {
     name: "from_list"
     argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "size"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.-fixed-length-record-reader.pbtxt b/tensorflow/tools/api/golden/tensorflow.-fixed-length-record-reader.pbtxt
index 5c77b3dd5cca6c7741764e6b4bcea82ef30a47fd..260c796fd65b90020eb2b8191645ffdb2402a4a4 100644
--- a/tensorflow/tools/api/golden/tensorflow.-fixed-length-record-reader.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-fixed-length-record-reader.pbtxt
@@ -13,7 +13,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'record_bytes\', \'header_bytes\', \'footer_bytes\', \'hop_bytes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'record_bytes\', \'header_bytes\', \'footer_bytes\', \'hop_bytes\', \'name\', \'encoding\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "num_records_produced"
diff --git a/tensorflow/tools/api/golden/tensorflow.-interactive-session.pbtxt b/tensorflow/tools/api/golden/tensorflow.-interactive-session.pbtxt
index f5b0bae58d0d11d1fb0b83e3996a038f6254ccdc..0a3b81bf829f48e88e9c48ce26cdbb4207101a16 100644
--- a/tensorflow/tools/api/golden/tensorflow.-interactive-session.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-interactive-session.pbtxt
@@ -34,7 +34,7 @@ tf_class {
   }
   member_method {
     name: "make_callable"
-    argspec: "args=[\'self\', \'fetches\', \'feed_list\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'fetches\', \'feed_list\', \'accept_options\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
   }
   member_method {
     name: "partial_run"
diff --git a/tensorflow/tools/api/golden/tensorflow.-l-m-d-b-reader.pbtxt b/tensorflow/tools/api/golden/tensorflow.-l-m-d-b-reader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9b7e9bbca82858ca99e67d70cf93583ca75972f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-l-m-d-b-reader.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.LMDBReader"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.io_ops.LMDBReader\'>"
+  is_instance: "<class \'tensorflow.python.ops.io_ops.ReaderBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "reader_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "supports_serialize"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_records_produced"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "num_work_units_completed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "read"
+    argspec: "args=[\'self\', \'queue\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "read_up_to"
+    argspec: "args=[\'self\', \'queue\', \'num_records\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "reset"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "restore_state"
+    argspec: "args=[\'self\', \'state\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "serialize_state"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-padding-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/tensorflow.-padding-f-i-f-o-queue.pbtxt
index 1bfe723ce754830efeebd7644871ff29f9809423..8fed133561544b91abfc64577e63a7088b43a007 100644
--- a/tensorflow/tools/api/golden/tensorflow.-padding-f-i-f-o-queue.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-padding-f-i-f-o-queue.pbtxt
@@ -55,6 +55,10 @@ tf_class {
     name: "from_list"
     argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "size"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.-priority-queue.pbtxt b/tensorflow/tools/api/golden/tensorflow.-priority-queue.pbtxt
index dbe25f3a5b9ecc1596c77862396c684b6ddb9c5f..ebb017e81bc29e062d804fbe9f50c62f7b615dab 100644
--- a/tensorflow/tools/api/golden/tensorflow.-priority-queue.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-priority-queue.pbtxt
@@ -55,6 +55,10 @@ tf_class {
     name: "from_list"
     argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "size"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.-queue-base.pbtxt b/tensorflow/tools/api/golden/tensorflow.-queue-base.pbtxt
index 9263d73a51161e9df083992528400b57302832d2..761f90989f316611d42580ee911e24bb3d0d2fec 100644
--- a/tensorflow/tools/api/golden/tensorflow.-queue-base.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-queue-base.pbtxt
@@ -54,6 +54,10 @@ tf_class {
     name: "from_list"
     argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "size"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.-random-shuffle-queue.pbtxt b/tensorflow/tools/api/golden/tensorflow.-random-shuffle-queue.pbtxt
index ec783ffe5a01d66965d6370ec1bc6c83178b5a8c..f3ca84139311bc05478e3dce876b53f7b9dec883 100644
--- a/tensorflow/tools/api/golden/tensorflow.-random-shuffle-queue.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-random-shuffle-queue.pbtxt
@@ -55,6 +55,10 @@ tf_class {
     name: "from_list"
     argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "size"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.-session.pbtxt b/tensorflow/tools/api/golden/tensorflow.-session.pbtxt
index 173cd1963e5e8c088556e8530b65ac1bdee99dc3..1d6b037f9c3540653a8fb18b6508f74b01da66ab 100644
--- a/tensorflow/tools/api/golden/tensorflow.-session.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-session.pbtxt
@@ -34,7 +34,7 @@ tf_class {
   }
   member_method {
     name: "make_callable"
-    argspec: "args=[\'self\', \'fetches\', \'feed_list\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'fetches\', \'feed_list\', \'accept_options\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
   }
   member_method {
     name: "partial_run"
diff --git a/tensorflow/tools/api/golden/tensorflow.-auto-parallel-options.pbtxt b/tensorflow/tools/api/golden/tensorflow.-summary-metadata.-plugin-data.pbtxt
similarity index 87%
rename from tensorflow/tools/api/golden/tensorflow.-auto-parallel-options.pbtxt
rename to tensorflow/tools/api/golden/tensorflow.-summary-metadata.-plugin-data.pbtxt
index c8f3e8fb154c5a1a2bb61759d9241d7e79fe884e..067f02ce8cbb1a1f6e65758f37bb1d36927fad98 100644
--- a/tensorflow/tools/api/golden/tensorflow.-auto-parallel-options.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-summary-metadata.-plugin-data.pbtxt
@@ -1,21 +1,21 @@
-path: "tensorflow.AutoParallelOptions"
+path: "tensorflow.SummaryMetadata.PluginData"
 tf_class {
-  is_instance: "<class \'tensorflow.core.protobuf.rewriter_config_pb2.AutoParallelOptions\'>"
+  is_instance: "<class \'tensorflow.core.framework.summary_pb2.PluginData\'>"
   is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
   member {
-    name: "DESCRIPTOR"
-    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+    name: "CONTENT_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
   }
   member {
-    name: "ENABLE_FIELD_NUMBER"
-    mtype: "<type \'int\'>"
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
   }
   member {
     name: "Extensions"
     mtype: "<type \'getset_descriptor\'>"
   }
   member {
-    name: "NUM_REPLICAS_FIELD_NUMBER"
+    name: "PLUGIN_NAME_FIELD_NUMBER"
     mtype: "<type \'int\'>"
   }
   member_method {
diff --git a/tensorflow/tools/api/golden/tensorflow.-rewriter-config.pbtxt b/tensorflow/tools/api/golden/tensorflow.-summary-metadata.pbtxt
similarity index 65%
rename from tensorflow/tools/api/golden/tensorflow.-rewriter-config.pbtxt
rename to tensorflow/tools/api/golden/tensorflow.-summary-metadata.pbtxt
index 34d2e1761280de8079f82bef02b7dc2cc5ace442..b9156521ccbee25486113a82ddec1053f8b32e3b 100644
--- a/tensorflow/tools/api/golden/tensorflow.-rewriter-config.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-summary-metadata.pbtxt
@@ -1,21 +1,13 @@
-path: "tensorflow.RewriterConfig"
+path: "tensorflow.SummaryMetadata"
 tf_class {
-  is_instance: "<class \'tensorflow.core.protobuf.rewriter_config_pb2.RewriterConfig\'>"
+  is_instance: "<class \'tensorflow.core.framework.summary_pb2.SummaryMetadata\'>"
   is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
-  member {
-    name: "AUTO_PARALLEL_FIELD_NUMBER"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "CONSTANT_FOLDING_FIELD_NUMBER"
-    mtype: "<type \'int\'>"
-  }
   member {
     name: "DESCRIPTOR"
     mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
   }
   member {
-    name: "DISABLE_MODEL_PRUNING_FIELD_NUMBER"
+    name: "DISPLAY_NAME_FIELD_NUMBER"
     mtype: "<type \'int\'>"
   }
   member {
@@ -23,27 +15,15 @@ tf_class {
     mtype: "<type \'getset_descriptor\'>"
   }
   member {
-    name: "MANUAL"
+    name: "PLUGIN_DATA_FIELD_NUMBER"
     mtype: "<type \'int\'>"
   }
   member {
-    name: "MEMORY_OPTIMIZATION_FIELD_NUMBER"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "MemOptType"
-    mtype: "<class \'google.protobuf.internal.enum_type_wrapper.EnumTypeWrapper\'>"
-  }
-  member {
-    name: "NO_MEM_OPT"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "OPTIMIZERS_FIELD_NUMBER"
-    mtype: "<type \'int\'>"
+    name: "PluginData"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
   member {
-    name: "OPTIMIZE_TENSOR_LAYOUT_FIELD_NUMBER"
+    name: "SUMMARY_DESCRIPTION_FIELD_NUMBER"
     mtype: "<type \'int\'>"
   }
   member_method {
diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-shape.pbtxt b/tensorflow/tools/api/golden/tensorflow.-tensor-shape.pbtxt
index d5b9cb8f5ed3cf088f5bd27809ff98f00801217d..8e3598fb2470b327e6e3601969f055d4907f614a 100644
--- a/tensorflow/tools/api/golden/tensorflow.-tensor-shape.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-tensor-shape.pbtxt
@@ -54,6 +54,10 @@ tf_class {
     name: "merge_with"
     argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "most_specific_compatible_shape"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "num_elements"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.bitwise.pbtxt b/tensorflow/tools/api/golden/tensorflow.bitwise.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1e4d333cc0bb0bb33fb4cc8d76badd30c8babaa4
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.bitwise.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.bitwise"
+tf_module {
+  member_method {
+    name: "bitwise_and"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "bitwise_or"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "bitwise_xor"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "invert"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cfe09345acccc410ad3041a965901134440e3c77
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt
@@ -0,0 +1,135 @@
+path: "tensorflow.distributions.Bernoulli"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.distributions.bernoulli.Bernoulli\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "allow_nan_stats"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "event_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "logits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameters"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "probs"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reparameterization_type"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "validate_args"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'logits\', \'probs\', \'dtype\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int32\'>\", \'False\', \'True\', \'Bernoulli\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
+  }
+  member_method {
+    name: "copy"
+    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
+  }
+  member_method {
+    name: "covariance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
+  }
+  member_method {
+    name: "entropy"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
+  }
+  member_method {
+    name: "event_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
+  }
+  member_method {
+    name: "is_scalar_batch"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
+  }
+  member_method {
+    name: "is_scalar_event"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
+  }
+  member_method {
+    name: "log_cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
+  }
+  member_method {
+    name: "log_prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
+  }
+  member_method {
+    name: "log_survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
+  }
+  member_method {
+    name: "mean"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
+  }
+  member_method {
+    name: "mode"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
+  }
+  member_method {
+    name: "param_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
+  }
+  member_method {
+    name: "param_static_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
+  }
+  member_method {
+    name: "quantile"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
+  }
+  member_method {
+    name: "sample"
+    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
+  }
+  member_method {
+    name: "stddev"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
+  }
+  member_method {
+    name: "survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
+  }
+  member_method {
+    name: "variance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2e6578bae1604f69e4697bb4668dd69d94bd68b5
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt
@@ -0,0 +1,139 @@
+path: "tensorflow.distributions.Beta"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.distributions.beta.Beta\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "allow_nan_stats"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "concentration0"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "concentration1"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "event_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameters"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reparameterization_type"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "total_concentration"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "validate_args"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'concentration1\', \'concentration0\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'True\', \'Beta\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
+  }
+  member_method {
+    name: "copy"
+    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
+  }
+  member_method {
+    name: "covariance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
+  }
+  member_method {
+    name: "entropy"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
+  }
+  member_method {
+    name: "event_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
+  }
+  member_method {
+    name: "is_scalar_batch"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
+  }
+  member_method {
+    name: "is_scalar_event"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
+  }
+  member_method {
+    name: "log_cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
+  }
+  member_method {
+    name: "log_prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
+  }
+  member_method {
+    name: "log_survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
+  }
+  member_method {
+    name: "mean"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
+  }
+  member_method {
+    name: "mode"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
+  }
+  member_method {
+    name: "param_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
+  }
+  member_method {
+    name: "param_static_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
+  }
+  member_method {
+    name: "quantile"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
+  }
+  member_method {
+    name: "sample"
+    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
+  }
+  member_method {
+    name: "stddev"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
+  }
+  member_method {
+    name: "survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
+  }
+  member_method {
+    name: "variance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d42b0e82e4fab3e30d3ebf1b8bea8b44bb61ea0f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt
@@ -0,0 +1,139 @@
+path: "tensorflow.distributions.Categorical"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.distributions.categorical.Categorical\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "allow_nan_stats"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "event_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "event_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "logits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameters"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "probs"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reparameterization_type"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "validate_args"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'logits\', \'probs\', \'dtype\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int32\'>\", \'False\', \'True\', \'Categorical\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
+  }
+  member_method {
+    name: "copy"
+    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
+  }
+  member_method {
+    name: "covariance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
+  }
+  member_method {
+    name: "entropy"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
+  }
+  member_method {
+    name: "event_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
+  }
+  member_method {
+    name: "is_scalar_batch"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
+  }
+  member_method {
+    name: "is_scalar_event"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
+  }
+  member_method {
+    name: "log_cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
+  }
+  member_method {
+    name: "log_prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
+  }
+  member_method {
+    name: "log_survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
+  }
+  member_method {
+    name: "mean"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
+  }
+  member_method {
+    name: "mode"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
+  }
+  member_method {
+    name: "param_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
+  }
+  member_method {
+    name: "param_static_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
+  }
+  member_method {
+    name: "quantile"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
+  }
+  member_method {
+    name: "sample"
+    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
+  }
+  member_method {
+    name: "stddev"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
+  }
+  member_method {
+    name: "survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
+  }
+  member_method {
+    name: "variance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..710164743e851f0bb5c31ebe78b260b623e87378
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt
@@ -0,0 +1,139 @@
+path: "tensorflow.distributions.DirichletMultinomial"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.distributions.dirichlet_multinomial.DirichletMultinomial\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "allow_nan_stats"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "concentration"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "event_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameters"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reparameterization_type"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "total_concentration"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "total_count"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "validate_args"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'total_count\', \'concentration\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'DirichletMultinomial\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
+  }
+  member_method {
+    name: "copy"
+    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
+  }
+  member_method {
+    name: "covariance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
+  }
+  member_method {
+    name: "entropy"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
+  }
+  member_method {
+    name: "event_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
+  }
+  member_method {
+    name: "is_scalar_batch"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
+  }
+  member_method {
+    name: "is_scalar_event"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
+  }
+  member_method {
+    name: "log_cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
+  }
+  member_method {
+    name: "log_prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
+  }
+  member_method {
+    name: "log_survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
+  }
+  member_method {
+    name: "mean"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
+  }
+  member_method {
+    name: "mode"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
+  }
+  member_method {
+    name: "param_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
+  }
+  member_method {
+    name: "param_static_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
+  }
+  member_method {
+    name: "quantile"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
+  }
+  member_method {
+    name: "sample"
+    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
+  }
+  member_method {
+    name: "stddev"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
+  }
+  member_method {
+    name: "survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
+  }
+  member_method {
+    name: "variance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6cc361672ed8da313e1bebc41fbf093e019d38ad
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt
@@ -0,0 +1,135 @@
+path: "tensorflow.distributions.Dirichlet"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.distributions.dirichlet.Dirichlet\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "allow_nan_stats"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "concentration"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "event_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameters"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reparameterization_type"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "total_concentration"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "validate_args"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'concentration\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Dirichlet\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
+  }
+  member_method {
+    name: "copy"
+    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
+  }
+  member_method {
+    name: "covariance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
+  }
+  member_method {
+    name: "entropy"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
+  }
+  member_method {
+    name: "event_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
+  }
+  member_method {
+    name: "is_scalar_batch"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
+  }
+  member_method {
+    name: "is_scalar_event"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
+  }
+  member_method {
+    name: "log_cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
+  }
+  member_method {
+    name: "log_prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
+  }
+  member_method {
+    name: "log_survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
+  }
+  member_method {
+    name: "mean"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
+  }
+  member_method {
+    name: "mode"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
+  }
+  member_method {
+    name: "param_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
+  }
+  member_method {
+    name: "param_static_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
+  }
+  member_method {
+    name: "quantile"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
+  }
+  member_method {
+    name: "sample"
+    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
+  }
+  member_method {
+    name: "stddev"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
+  }
+  member_method {
+    name: "survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
+  }
+  member_method {
+    name: "variance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..40ad07d1be4bdea9585eb276debb1fdf3dfff583
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt
@@ -0,0 +1,126 @@
+path: "tensorflow.distributions.Distribution"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "allow_nan_stats"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "event_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameters"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reparameterization_type"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "validate_args"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'reparameterization_type\', \'validate_args\', \'allow_nan_stats\', \'parameters\', \'graph_parents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
+  }
+  member_method {
+    name: "copy"
+    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
+  }
+  member_method {
+    name: "covariance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
+  }
+  member_method {
+    name: "entropy"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
+  }
+  member_method {
+    name: "event_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
+  }
+  member_method {
+    name: "is_scalar_batch"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
+  }
+  member_method {
+    name: "is_scalar_event"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
+  }
+  member_method {
+    name: "log_cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
+  }
+  member_method {
+    name: "log_prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
+  }
+  member_method {
+    name: "log_survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
+  }
+  member_method {
+    name: "mean"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
+  }
+  member_method {
+    name: "mode"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
+  }
+  member_method {
+    name: "param_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
+  }
+  member_method {
+    name: "param_static_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
+  }
+  member_method {
+    name: "quantile"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
+  }
+  member_method {
+    name: "sample"
+    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
+  }
+  member_method {
+    name: "stddev"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
+  }
+  member_method {
+    name: "survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
+  }
+  member_method {
+    name: "variance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8f34d25fea873827997ecd9df10cf1b3bfd0e56b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt
@@ -0,0 +1,136 @@
+path: "tensorflow.distributions.Exponential"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.distributions.exponential.Exponential\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.gamma.Gamma\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "allow_nan_stats"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "concentration"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "event_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameters"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rate"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reparameterization_type"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "validate_args"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'rate\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Exponential\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
+  }
+  member_method {
+    name: "copy"
+    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
+  }
+  member_method {
+    name: "covariance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
+  }
+  member_method {
+    name: "entropy"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
+  }
+  member_method {
+    name: "event_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
+  }
+  member_method {
+    name: "is_scalar_batch"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
+  }
+  member_method {
+    name: "is_scalar_event"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
+  }
+  member_method {
+    name: "log_cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
+  }
+  member_method {
+    name: "log_prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
+  }
+  member_method {
+    name: "log_survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
+  }
+  member_method {
+    name: "mean"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
+  }
+  member_method {
+    name: "mode"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
+  }
+  member_method {
+    name: "param_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
+  }
+  member_method {
+    name: "param_static_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
+  }
+  member_method {
+    name: "quantile"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
+  }
+  member_method {
+    name: "sample"
+    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
+  }
+  member_method {
+    name: "stddev"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
+  }
+  member_method {
+    name: "survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
+  }
+  member_method {
+    name: "variance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ae88fba3b4fd176641cc17c916181cc9a6a12c6
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt
@@ -0,0 +1,135 @@
+path: "tensorflow.distributions.Gamma"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.distributions.gamma.Gamma\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "allow_nan_stats"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "concentration"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "event_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameters"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rate"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reparameterization_type"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "validate_args"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'concentration\', \'rate\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Gamma\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
+  }
+  member_method {
+    name: "copy"
+    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
+  }
+  member_method {
+    name: "covariance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
+  }
+  member_method {
+    name: "entropy"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
+  }
+  member_method {
+    name: "event_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
+  }
+  member_method {
+    name: "is_scalar_batch"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
+  }
+  member_method {
+    name: "is_scalar_event"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
+  }
+  member_method {
+    name: "log_cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
+  }
+  member_method {
+    name: "log_prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
+  }
+  member_method {
+    name: "log_survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
+  }
+  member_method {
+    name: "mean"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
+  }
+  member_method {
+    name: "mode"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
+  }
+  member_method {
+    name: "param_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
+  }
+  member_method {
+    name: "param_static_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
+  }
+  member_method {
+    name: "quantile"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
+  }
+  member_method {
+    name: "sample"
+    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
+  }
+  member_method {
+    name: "stddev"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
+  }
+  member_method {
+    name: "survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
+  }
+  member_method {
+    name: "variance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e7cd595e946cb91f162a2a1af8753e44cdfbc0e1
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt
@@ -0,0 +1,135 @@
+path: "tensorflow.distributions.Laplace"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.distributions.laplace.Laplace\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "allow_nan_stats"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "event_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "loc"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameters"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reparameterization_type"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scale"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "validate_args"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'loc\', \'scale\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Laplace\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
+  }
+  member_method {
+    name: "copy"
+    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
+  }
+  member_method {
+    name: "covariance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
+  }
+  member_method {
+    name: "entropy"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
+  }
+  member_method {
+    name: "event_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
+  }
+  member_method {
+    name: "is_scalar_batch"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
+  }
+  member_method {
+    name: "is_scalar_event"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
+  }
+  member_method {
+    name: "log_cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
+  }
+  member_method {
+    name: "log_prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
+  }
+  member_method {
+    name: "log_survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
+  }
+  member_method {
+    name: "mean"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
+  }
+  member_method {
+    name: "mode"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
+  }
+  member_method {
+    name: "param_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
+  }
+  member_method {
+    name: "param_static_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
+  }
+  member_method {
+    name: "quantile"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
+  }
+  member_method {
+    name: "sample"
+    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
+  }
+  member_method {
+    name: "stddev"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
+  }
+  member_method {
+    name: "survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
+  }
+  member_method {
+    name: "variance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7a4a16ff836a485e65cb6e061e27b92907cb4a63
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt
@@ -0,0 +1,139 @@
+path: "tensorflow.distributions.Multinomial"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.distributions.multinomial.Multinomial\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "allow_nan_stats"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "event_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "logits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameters"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "probs"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reparameterization_type"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "total_count"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "validate_args"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'total_count\', \'logits\', \'probs\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'True\', \'Multinomial\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
+  }
+  member_method {
+    name: "copy"
+    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
+  }
+  member_method {
+    name: "covariance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
+  }
+  member_method {
+    name: "entropy"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
+  }
+  member_method {
+    name: "event_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
+  }
+  member_method {
+    name: "is_scalar_batch"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
+  }
+  member_method {
+    name: "is_scalar_event"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
+  }
+  member_method {
+    name: "log_cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
+  }
+  member_method {
+    name: "log_prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
+  }
+  member_method {
+    name: "log_survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
+  }
+  member_method {
+    name: "mean"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
+  }
+  member_method {
+    name: "mode"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
+  }
+  member_method {
+    name: "param_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
+  }
+  member_method {
+    name: "param_static_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
+  }
+  member_method {
+    name: "quantile"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
+  }
+  member_method {
+    name: "sample"
+    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
+  }
+  member_method {
+    name: "stddev"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
+  }
+  member_method {
+    name: "survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
+  }
+  member_method {
+    name: "variance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..14c8c34cc2d8efacec706bdb894d9f069d5e7033
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt
@@ -0,0 +1,135 @@
+path: "tensorflow.distributions.Normal"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.distributions.normal.Normal\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "allow_nan_stats"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "event_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "loc"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameters"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reparameterization_type"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scale"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "validate_args"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'loc\', \'scale\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Normal\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
+  }
+  member_method {
+    name: "copy"
+    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
+  }
+  member_method {
+    name: "covariance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
+  }
+  member_method {
+    name: "entropy"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
+  }
+  member_method {
+    name: "event_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
+  }
+  member_method {
+    name: "is_scalar_batch"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
+  }
+  member_method {
+    name: "is_scalar_event"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
+  }
+  member_method {
+    name: "log_cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
+  }
+  member_method {
+    name: "log_prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
+  }
+  member_method {
+    name: "log_survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
+  }
+  member_method {
+    name: "mean"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
+  }
+  member_method {
+    name: "mode"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
+  }
+  member_method {
+    name: "param_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
+  }
+  member_method {
+    name: "param_static_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
+  }
+  member_method {
+    name: "quantile"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
+  }
+  member_method {
+    name: "sample"
+    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
+  }
+  member_method {
+    name: "stddev"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
+  }
+  member_method {
+    name: "survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
+  }
+  member_method {
+    name: "variance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-register-k-l.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-register-k-l.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e3db443c2bdaa70f7651126a30caf2062a3c6f67
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-register-k-l.pbtxt
@@ -0,0 +1,9 @@
+path: "tensorflow.distributions.RegisterKL"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.distributions.kullback_leibler.RegisterKL\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dist_cls_a\', \'dist_cls_b\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-reparameterization-type.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-reparameterization-type.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..02e8d576ddd00aa21005fa39cd323a92392bf75a
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-reparameterization-type.pbtxt
@@ -0,0 +1,9 @@
+path: "tensorflow.distributions.ReparameterizationType"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.ReparameterizationType\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'rep_type\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30db6d3f35c1c8ea7bbc376a20093302dd373bd9
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt
@@ -0,0 +1,139 @@
+path: "tensorflow.distributions.StudentT"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.distributions.student_t.StudentT\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "allow_nan_stats"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "df"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "event_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "loc"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameters"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reparameterization_type"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scale"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "validate_args"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'df\', \'loc\', \'scale\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'StudentT\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
+  }
+  member_method {
+    name: "copy"
+    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
+  }
+  member_method {
+    name: "covariance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
+  }
+  member_method {
+    name: "entropy"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
+  }
+  member_method {
+    name: "event_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
+  }
+  member_method {
+    name: "is_scalar_batch"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
+  }
+  member_method {
+    name: "is_scalar_event"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
+  }
+  member_method {
+    name: "log_cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
+  }
+  member_method {
+    name: "log_prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
+  }
+  member_method {
+    name: "log_survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
+  }
+  member_method {
+    name: "mean"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
+  }
+  member_method {
+    name: "mode"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
+  }
+  member_method {
+    name: "param_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
+  }
+  member_method {
+    name: "param_static_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
+  }
+  member_method {
+    name: "quantile"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
+  }
+  member_method {
+    name: "sample"
+    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
+  }
+  member_method {
+    name: "stddev"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
+  }
+  member_method {
+    name: "survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
+  }
+  member_method {
+    name: "variance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..46cbdf225f68e879fd18ef4a07048746a9a71b08
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt
@@ -0,0 +1,139 @@
+path: "tensorflow.distributions.Uniform"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.distributions.uniform.Uniform\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "allow_nan_stats"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "event_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "high"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "low"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameters"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reparameterization_type"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "validate_args"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'low\', \'high\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'False\', \'True\', \'Uniform\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
+  }
+  member_method {
+    name: "copy"
+    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
+  }
+  member_method {
+    name: "covariance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
+  }
+  member_method {
+    name: "entropy"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
+  }
+  member_method {
+    name: "event_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
+  }
+  member_method {
+    name: "is_scalar_batch"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
+  }
+  member_method {
+    name: "is_scalar_event"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
+  }
+  member_method {
+    name: "log_cdf"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
+  }
+  member_method {
+    name: "log_prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
+  }
+  member_method {
+    name: "log_survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
+  }
+  member_method {
+    name: "mean"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
+  }
+  member_method {
+    name: "mode"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
+  }
+  member_method {
+    name: "param_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
+  }
+  member_method {
+    name: "param_static_shapes"
+    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "prob"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
+  }
+  member_method {
+    name: "quantile"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
+  }
+  member_method {
+    name: "range"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range\'], "
+  }
+  member_method {
+    name: "sample"
+    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
+  }
+  member_method {
+    name: "stddev"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
+  }
+  member_method {
+    name: "survival_function"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
+  }
+  member_method {
+    name: "variance"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-bijector.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-bijector.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..11565bd3e4178202fa82e2e079d1035190dbd6ec
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-bijector.pbtxt
@@ -0,0 +1,65 @@
+path: "tensorflow.distributions.bijectors.Bijector"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.distributions.bijector_impl.Bijector\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "event_ndims"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_constant_jacobian"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "validate_args"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'event_ndims\', \'graph_parents\', \'is_constant_jacobian\', \'validate_args\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "forward"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'forward\'], "
+  }
+  member_method {
+    name: "forward_event_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "forward_event_shape_tensor"
+    argspec: "args=[\'self\', \'input_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'forward_event_shape_tensor\'], "
+  }
+  member_method {
+    name: "forward_log_det_jacobian"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'forward_log_det_jacobian\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "inverse_event_shape"
+    argspec: "args=[\'self\', \'output_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "inverse_event_shape_tensor"
+    argspec: "args=[\'self\', \'output_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse_event_shape_tensor\'], "
+  }
+  member_method {
+    name: "inverse_log_det_jacobian"
+    argspec: "args=[\'self\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse_log_det_jacobian\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-identity.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-identity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1e5fe624eb838e188594d03b656c12890db344a1
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.-identity.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.distributions.bijectors.Identity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.distributions.identity_bijector.Identity\'>"
+  is_instance: "<class \'tensorflow.python.ops.distributions.bijector_impl.Bijector\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "event_ndims"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_constant_jacobian"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "validate_args"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'validate_args\', \'event_ndims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'identity\'], "
+  }
+  member_method {
+    name: "forward"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'forward\'], "
+  }
+  member_method {
+    name: "forward_event_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "forward_event_shape_tensor"
+    argspec: "args=[\'self\', \'input_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'forward_event_shape_tensor\'], "
+  }
+  member_method {
+    name: "forward_log_det_jacobian"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'forward_log_det_jacobian\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "inverse_event_shape"
+    argspec: "args=[\'self\', \'output_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "inverse_event_shape_tensor"
+    argspec: "args=[\'self\', \'output_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse_event_shape_tensor\'], "
+  }
+  member_method {
+    name: "inverse_log_det_jacobian"
+    argspec: "args=[\'self\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse_log_det_jacobian\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1d0144f36ec332740889dc8caa5add8f41960d92
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.bijectors.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.distributions.bijectors"
+tf_module {
+  member {
+    name: "Bijector"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
+  member {
+    name: "Identity"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2fba7c506ed9d2490e7c19c1746d3f4e9645424f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.pbtxt
@@ -0,0 +1,79 @@
+path: "tensorflow.distributions"
+tf_module {
+  member {
+    name: "Bernoulli"
+    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
+  }
+  member {
+    name: "Beta"
+    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
+  }
+  member {
+    name: "Categorical"
+    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
+  }
+  member {
+    name: "Dirichlet"
+    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
+  }
+  member {
+    name: "DirichletMultinomial"
+    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
+  }
+  member {
+    name: "Distribution"
+    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
+  }
+  member {
+    name: "Exponential"
+    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
+  }
+  member {
+    name: "FULLY_REPARAMETERIZED"
+    mtype: "<class \'tensorflow.python.ops.distributions.distribution.ReparameterizationType\'>"
+  }
+  member {
+    name: "Gamma"
+    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
+  }
+  member {
+    name: "Laplace"
+    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
+  }
+  member {
+    name: "Multinomial"
+    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
+  }
+  member {
+    name: "NOT_REPARAMETERIZED"
+    mtype: "<class \'tensorflow.python.ops.distributions.distribution.ReparameterizationType\'>"
+  }
+  member {
+    name: "Normal"
+    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
+  }
+  member {
+    name: "RegisterKL"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ReparameterizationType"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StudentT"
+    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
+  }
+  member {
+    name: "Uniform"
+    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
+  }
+  member {
+    name: "bijectors"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'distribution_a\', \'distribution_b\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3a6f770153013dc925dc1b65a38ec59202c4b0b2
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.estimator.DNNClassifier"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.dnn.DNNClassifier\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..83e53d3960477b8170664c03ee30f588f87454b9
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.estimator.DNNLinearCombinedClassifier"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifier\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'input_layer_partitioner\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'2\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..17f30a04fbfe7ffe464e7d107f8a9d9a27140188
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.estimator.DNNLinearCombinedRegressor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedRegressor\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'label_dimension\', \'weight_column\', \'input_layer_partitioner\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'1\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..edd68f0bb9ac8654dbc53e090d812de37a168515
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.estimator.DNNRegressor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.dnn.DNNRegressor\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt
index 5dbfe2172640916803204a4c8f2c5e250bc982d7..6608d21d44c219acbf0265bee368a5a007eebc92 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt
@@ -7,6 +7,10 @@ tf_class {
     name: "eval_metric_ops"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "evaluation_hooks"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "export_outputs"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3715dd5ec76284004efb24b0b6316d1eec87a589
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.estimator.LinearClassifier"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.linear.LinearClassifier\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ccb4abf675f3c05a14990a5ae0da3068fc0d8a47
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.estimator.LinearRegressor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.linear.LinearRegressor\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
index d69c475a313075a5b165dba9a80e30cf8212657d..801260c4507803345c4c84852fd83832b752ac12 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
@@ -22,6 +22,10 @@ tf_class {
     name: "keep_checkpoint_max"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "log_step_count_steps"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "master"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
index 0d5dc73271dbc972c9177a6274f1632862f93ef0..07b04810b5c6d2eda3c3dce5ad4c35592158b085 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
@@ -1,5 +1,21 @@
 path: "tensorflow.estimator"
 tf_module {
+  member {
+    name: "DNNClassifier"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "DNNLinearCombinedClassifier"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "DNNLinearCombinedRegressor"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "DNNRegressor"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Estimator"
     mtype: "<type \'type\'>"
@@ -8,6 +24,14 @@ tf_module {
     name: "EstimatorSpec"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearClassifier"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LinearRegressor"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ModeKeys"
     mtype: "<type \'type\'>"
@@ -24,4 +48,12 @@ tf_module {
     name: "inputs"
     mtype: "<type \'module\'>"
   }
+  member_method {
+    name: "classifier_parse_example_spec"
+    argspec: "args=[\'feature_columns\', \'label_key\', \'label_dtype\', \'label_default\', \'weight_column\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "regressor_parse_example_spec"
+    argspec: "args=[\'feature_columns\', \'label_key\', \'label_dtype\', \'label_default\', \'label_dimension\', \'weight_column\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\', \'1\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
index 4c633a850f8e069135f122292bac019e2646aa61..2a57a845cdcb92d2c3e5d87e06d4e03886696be1 100644
--- a/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
@@ -18,7 +18,7 @@ tf_module {
   }
   member_method {
     name: "categorical_column_with_vocabulary_list"
-    argspec: "args=[\'key\', \'vocabulary_list\', \'dtype\', \'default_value\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\'], "
+    argspec: "args=[\'key\', \'vocabulary_list\', \'dtype\', \'default_value\', \'num_oov_buckets\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\', \'0\'], "
   }
   member_method {
     name: "crossed_column"
diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
index 93257c84a1f4ecd078923a2434d4ce48355e13ab..8f7790f2996d795ab7681c93d32909e01250725c 100644
--- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
@@ -96,10 +96,6 @@ tf_module {
     name: "non_max_suppression"
     argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "non_max_suppression_v2"
-    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "pad_to_bounding_box"
     argspec: "args=[\'image\', \'offset_height\', \'offset_width\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
index 9f817beafd9251f1cd2a5d7a59f286d302948dc4..3beb95d25c15996b5ceb9c5005373498614bf944 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
@@ -256,6 +256,10 @@ tf_module {
     name: "sampled_softmax_loss"
     argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'partition_strategy\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'True\', \'mod\', \'sampled_softmax_loss\'], "
   }
+  member_method {
+    name: "selu"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "separable_conv2d"
     argspec: "args=[\'input\', \'depthwise_filter\', \'pointwise_filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index b21d9a8ee3378c2a94636c69b8cbf089e8f04cad..a75e9e808025025b20b9c109e4b040c3b8f97fb7 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -54,7 +54,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'cell\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'cell\', \'residual_fn\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index 91abff6e13097faf3e24b85c7bd4ab8a02a303a8..314449bb7353fcf503973aa8847ac2a5c086304b 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -8,10 +8,6 @@ tf_module {
     name: "AttrValue"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "AutoParallelOptions"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "COMPILER_VERSION"
     mtype: "<type \'str\'>"
@@ -112,6 +108,10 @@ tf_module {
     name: "InteractiveSession"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LMDBReader"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LogMessage"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -168,10 +168,6 @@ tf_module {
     name: "RegisterGradient"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "RewriterConfig"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "RunMetadata"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -208,6 +204,10 @@ tf_module {
     name: "Summary"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
+  member {
+    name: "SummaryMetadata"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
   member {
     name: "TFRecordReader"
     mtype: "<type \'type\'>"
@@ -260,6 +260,10 @@ tf_module {
     name: "bfloat16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
+  member {
+    name: "bitwise"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "bool"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -284,6 +288,10 @@ tf_module {
     name: "contrib"
     mtype: "<class \'tensorflow.python.util.lazy_loader.LazyLoader\'>"
   }
+  member {
+    name: "distributions"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "double"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -380,6 +388,10 @@ tf_module {
     name: "orthogonal_initializer"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "profiler"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "python_io"
     mtype: "<type \'module\'>"
@@ -476,6 +488,14 @@ tf_module {
     name: "user_ops"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "variance_scaling_initializer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "variant"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
   member {
     name: "zeros_initializer"
     mtype: "<type \'type\'>"
@@ -508,6 +528,10 @@ tf_module {
     name: "acos"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "acosh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "add"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -530,19 +554,19 @@ tf_module {
   }
   member_method {
     name: "arg_max"
-    argspec: "args=[\'input\', \'dimension\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'dimension\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "arg_min"
-    argspec: "args=[\'input\', \'dimension\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'dimension\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "argmax"
-    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
   }
   member_method {
     name: "argmin"
-    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
   }
   member_method {
     name: "as_dtype"
@@ -556,6 +580,10 @@ tf_module {
     name: "asin"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "asinh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "assert_equal"
     argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
@@ -648,6 +676,10 @@ tf_module {
     name: "atan2"
     argspec: "args=[\'y\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "atanh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "batch_to_space"
     argspec: "args=[\'input\', \'crops\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -720,6 +752,10 @@ tf_module {
     name: "clip_by_value"
     argspec: "args=[\'t\', \'clip_value_min\', \'clip_value_max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "colocate_with"
+    argspec: "args=[\'op\', \'ignore_existing\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
   member_method {
     name: "complex"
     argspec: "args=[\'real\', \'imag\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -974,7 +1010,7 @@ tf_module {
   }
   member_method {
     name: "gather"
-    argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'name\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0\'], "
   }
   member_method {
     name: "gather_nd"
@@ -1032,6 +1068,14 @@ tf_module {
     name: "global_variables_initializer"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "glorot_normal_initializer"
+    argspec: "args=[\'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "glorot_uniform_initializer"
+    argspec: "args=[\'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\"], "
+  }
   member_method {
     name: "gradients"
     argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'False\', \'None\'], "
@@ -1346,7 +1390,7 @@ tf_module {
   }
   member_method {
     name: "pad"
-    argspec: "args=[\'tensor\', \'paddings\', \'mode\', \'name\'], varargs=None, keywords=None, defaults=[\'CONSTANT\', \'None\'], "
+    argspec: "args=[\'tensor\', \'paddings\', \'mode\', \'name\', \'constant_values\'], varargs=None, keywords=None, defaults=[\'CONSTANT\', \'None\', \'0\'], "
   }
   member_method {
     name: "parallel_stack"
@@ -1704,6 +1748,14 @@ tf_module {
     name: "sparse_placeholder"
     argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "sparse_reduce_max"
+    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "sparse_reduce_max_sparse"
+    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "sparse_reduce_sum"
     argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
@@ -1740,6 +1792,10 @@ tf_module {
     name: "sparse_segment_sum"
     argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "sparse_slice"
+    argspec: "args=[\'sp_input\', \'start\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "sparse_softmax"
     argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.-checker.pbtxt b/tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.-checker.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bd5c36f390add9cfb31642b80a792d65d59bb3e8
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.-checker.pbtxt
@@ -0,0 +1,80 @@
+path: "tensorflow.profiler.AdviceProto.Checker"
+tf_class {
+  is_instance: "<class \'tensorflow.core.profiler.tfprof_output_pb2.Checker\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "REPORTS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt b/tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7c8c68e155c99da4f0c1c1ba2c944719c42c12c7
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt
@@ -0,0 +1,84 @@
+path: "tensorflow.profiler.AdviceProto.CheckersEntry"
+tf_class {
+  is_instance: "<class \'tensorflow.core.profiler.tfprof_output_pb2.CheckersEntry\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "KEY_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "VALUE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.pbtxt b/tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1b789f4fc92ed63fc72f3ecfe6be80a99eb3427f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.profiler.-advice-proto.pbtxt
@@ -0,0 +1,88 @@
+path: "tensorflow.profiler.AdviceProto"
+tf_class {
+  is_instance: "<class \'tensorflow.core.profiler.tfprof_output_pb2.AdviceProto\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "CHECKERS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Checker"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "CheckersEntry"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt b/tensorflow/tools/api/golden/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f0b9605bee1c7cf2f0154f65c475aac49c411f76
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt
@@ -0,0 +1,84 @@
+path: "tensorflow.profiler.GraphNodeProto.InputShapesEntry"
+tf_class {
+  is_instance: "<class \'tensorflow.core.profiler.tfprof_output_pb2.InputShapesEntry\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "KEY_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "VALUE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-graph-node-proto.pbtxt b/tensorflow/tools/api/golden/tensorflow.profiler.-graph-node-proto.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f3bb71354e52fa79516696e6d5a58efeb2a46c18
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.profiler.-graph-node-proto.pbtxt
@@ -0,0 +1,164 @@
+path: "tensorflow.profiler.GraphNodeProto"
+tf_class {
+  is_instance: "<class \'tensorflow.core.profiler.tfprof_output_pb2.GraphNodeProto\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "ACCELERATOR_EXEC_MICROS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "CHILDREN_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "CPU_EXEC_MICROS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "DEVICES_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "EXEC_MICROS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "FLOAT_OPS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "INPUT_SHAPES_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "InputShapesEntry"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "NAME_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "PARAMETERS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "REQUESTED_BYTES_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "RUN_COUNT_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SHAPES_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TENSOR_VALUE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TOTAL_ACCELERATOR_EXEC_MICROS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TOTAL_CPU_EXEC_MICROS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TOTAL_DEFINITION_COUNT_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TOTAL_EXEC_MICROS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TOTAL_FLOAT_OPS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TOTAL_PARAMETERS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TOTAL_REQUESTED_BYTES_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TOTAL_RUN_COUNT_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-multi-graph-node-proto.pbtxt b/tensorflow/tools/api/golden/tensorflow.profiler.-multi-graph-node-proto.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b88a11b2c3aabbb6f1e2dc401627cb49eeff7e4
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.profiler.-multi-graph-node-proto.pbtxt
@@ -0,0 +1,136 @@
+path: "tensorflow.profiler.MultiGraphNodeProto"
+tf_class {
+  is_instance: "<class \'tensorflow.core.profiler.tfprof_output_pb2.MultiGraphNodeProto\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "ACCELERATOR_EXEC_MICROS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "CHILDREN_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "CPU_EXEC_MICROS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "EXEC_MICROS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "FLOAT_OPS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GRAPH_NODES_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NAME_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "PARAMETERS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "REQUESTED_BYTES_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TOTAL_ACCELERATOR_EXEC_MICROS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TOTAL_CPU_EXEC_MICROS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TOTAL_EXEC_MICROS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TOTAL_FLOAT_OPS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TOTAL_PARAMETERS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TOTAL_REQUESTED_BYTES_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-op-log-proto.pbtxt b/tensorflow/tools/api/golden/tensorflow.profiler.-op-log-proto.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c5f9c78c9e85ac4265125790b3f8b29fd0fc6b12
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.profiler.-op-log-proto.pbtxt
@@ -0,0 +1,80 @@
+path: "tensorflow.profiler.OpLogProto"
+tf_class {
+  is_instance: "<class \'tensorflow.core.profiler.tfprof_log_pb2.OpLogProto\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "LOG_ENTRIES_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-profile-option-builder.pbtxt b/tensorflow/tools/api/golden/tensorflow.profiler.-profile-option-builder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..347187a890208eb5b78bb0d1a7040efbdeb3bd3f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.profiler.-profile-option-builder.pbtxt
@@ -0,0 +1,89 @@
+path: "tensorflow.profiler.ProfileOptionBuilder"
+tf_class {
+  is_instance: "<class \'tensorflow.python.profiler.option_builder.ProfileOptionBuilder\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "account_displayed_op_only"
+    argspec: "args=[\'self\', \'is_true\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "float_operation"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "order_by"
+    argspec: "args=[\'self\', \'attribute\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "select"
+    argspec: "args=[\'self\', \'attributes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "time_and_memory"
+    argspec: "args=[\'min_micros\', \'min_bytes\'], varargs=None, keywords=None, defaults=[\'1\', \'1\'], "
+  }
+  member_method {
+    name: "trainable_variables_parameter"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_accounted_types"
+    argspec: "args=[\'self\', \'account_type_regexes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_empty_output"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_file_output"
+    argspec: "args=[\'self\', \'outfile\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_max_depth"
+    argspec: "args=[\'self\', \'max_depth\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_min_execution_time"
+    argspec: "args=[\'self\', \'min_micros\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_min_float_operations"
+    argspec: "args=[\'self\', \'min_float_ops\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_min_memory"
+    argspec: "args=[\'self\', \'min_bytes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_min_occurrence"
+    argspec: "args=[\'self\', \'min_occurrence\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_min_parameters"
+    argspec: "args=[\'self\', \'min_params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_node_names"
+    argspec: "args=[\'self\', \'start_name_regexes\', \'show_name_regexes\', \'hide_name_regexes\', \'trim_name_regexes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "with_stdout_output"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_step"
+    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_timeline_output"
+    argspec: "args=[\'self\', \'timeline_file\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-profiler.pbtxt b/tensorflow/tools/api/golden/tensorflow.profiler.-profiler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0fb363aca48031e13487d716a0375973f93b3dc8
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.profiler.-profiler.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.profiler.Profiler"
+tf_class {
+  is_instance: "<class \'tensorflow.python.profiler.model_analyzer.Profiler\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'graph\', \'op_log\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_step"
+    argspec: "args=[\'self\', \'step\', \'run_meta\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "advise"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "profile_graph"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "profile_name_scope"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "profile_operations"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "profile_python"
+    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.pbtxt b/tensorflow/tools/api/golden/tensorflow.profiler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26b25ee3d47241dbf351018f2aacbda12ff33492
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.profiler.pbtxt
@@ -0,0 +1,39 @@
+path: "tensorflow.profiler"
+tf_module {
+  member {
+    name: "AdviceProto"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "GraphNodeProto"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "MultiGraphNodeProto"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "OpLogProto"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "ProfileOptionBuilder"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Profiler"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "advise"
+    argspec: "args=[\'graph\', \'run_meta\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'0\'], "
+  }
+  member_method {
+    name: "profile"
+    argspec: "args=[\'graph\', \'run_meta\', \'op_log\', \'cmd\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'scope\', \'0\'], "
+  }
+  member_method {
+    name: "write_op_log"
+    argspec: "args=[\'graph\', \'log_dir\', \'op_log\', \'run_meta\', \'add_trace\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt b/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt
index af0c11ca14d4f38547a49ac511ee13e15847eb33..31775de2d12bcd2f214f5a04be7a92f49c594fde 100644
--- a/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt
@@ -10,6 +10,10 @@ tf_class {
     name: "close"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "flush"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "write"
     argspec: "args=[\'self\', \'record\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt
index 7c24b7ad3cf38cfd949959d078e5d70838d0b2d9..35e49ee9f4a6ee5b4da2b034ece1c1e3b2136254 100644
--- a/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.saved_model.tag_constants"
 tf_module {
+  member {
+    name: "GPU"
+    mtype: "<type \'str\'>"
+  }
   member {
     name: "SERVING"
     mtype: "<type \'str\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.utils.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.utils.pbtxt
index bc150e56a36ca22479cdd6a0563466ef6275e143..d95c94668250e1de236462ccdcb134245eebf092 100644
--- a/tensorflow/tools/api/golden/tensorflow.saved_model.utils.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.utils.pbtxt
@@ -4,4 +4,8 @@ tf_module {
     name: "build_tensor_info"
     argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_tensor_from_tensor_info"
+    argspec: "args=[\'tensor_info\', \'graph\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.pbtxt
index 19d822e61bffb3621b966147519c90d425521e87..326e077d396bc5e3463bba3818f4757127ee0370 100644
--- a/tensorflow/tools/api/golden/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.summary.pbtxt
@@ -58,7 +58,7 @@ tf_module {
   }
   member_method {
     name: "tensor_summary"
-    argspec: "args=[\'name\', \'tensor\', \'summary_description\', \'collections\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'name\', \'tensor\', \'summary_description\', \'collections\', \'summary_metadata\', \'family\', \'display_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "text"
diff --git a/tensorflow/tools/api/golden/tensorflow.test.pbtxt b/tensorflow/tools/api/golden/tensorflow.test.pbtxt
index 2a88f26ed02c7e2690c37180f76b965d7ffa87e0..6237207821ab18c8eb3e6148875e29e2e2fad773 100644
--- a/tensorflow/tools/api/golden/tensorflow.test.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.test.pbtxt
@@ -30,7 +30,7 @@ tf_module {
   }
   member_method {
     name: "create_local_cluster"
-    argspec: "args=[\'num_workers\', \'num_ps\', \'protocol\'], varargs=None, keywords=None, defaults=[\'grpc\'], "
+    argspec: "args=[\'num_workers\', \'num_ps\', \'protocol\', \'worker_config\', \'ps_config\'], varargs=None, keywords=None, defaults=[\'grpc\', \'None\', \'None\'], "
   }
   member_method {
     name: "get_temp_dir"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt
index 2dc11df57b60b15a797b1866743b27ea1068624e..5cff6087ef533f6674d6d7f1e0a8be425c16f2ad 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt
@@ -17,7 +17,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\', \'accum_name\', \'linear_name\'], varargs=None, keywords=None, defaults=[\'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'False\', \'Ftrl\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\', \'accum_name\', \'linear_name\', \'l2_shrinkage_regularization_strength\'], varargs=None, keywords=None, defaults=[\'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'False\', \'Ftrl\', \'None\', \'None\', \'0.0\'], "
   }
   member_method {
     name: "apply_gradients"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.pbtxt
index 62bfdab40bb83c634e101388ecb69da1233c60f9..7caf837cc385dbd64611a58de2c25d4de221a911 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.pbtxt
@@ -9,7 +9,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'hooks\', \'scaffold\', \'master\', \'config\', \'checkpoint_dir\', \'stop_grace_period_secs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\', \'None\', \'None\', \'120\'], "
+    argspec: "args=[\'self\', \'hooks\', \'scaffold\', \'master\', \'config\', \'checkpoint_dir\', \'stop_grace_period_secs\', \'checkpoint_filename_with_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\', \'None\', \'None\', \'120\', \'None\'], "
   }
   member_method {
     name: "close"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.pbtxt
index 58fd5760c11d29f063c0f7f66ea0a11d39a08a1e..89c299ae994bcd4f6ceb6daa632f985247d3db7f 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.pbtxt
@@ -230,7 +230,7 @@ tf_module {
   }
   member_method {
     name: "MonitoredTrainingSession"
-    argspec: "args=[\'master\', \'is_chief\', \'checkpoint_dir\', \'scaffold\', \'hooks\', \'chief_only_hooks\', \'save_checkpoint_secs\', \'save_summaries_steps\', \'save_summaries_secs\', \'config\', \'stop_grace_period_secs\', \'log_step_count_steps\'], varargs=None, keywords=None, defaults=[\'\', \'True\', \'None\', \'None\', \'None\', \'None\', \'600\', \'100\', \'None\', \'None\', \'120\', \'100\'], "
+    argspec: "args=[\'master\', \'is_chief\', \'checkpoint_dir\', \'scaffold\', \'hooks\', \'chief_only_hooks\', \'save_checkpoint_secs\', \'save_summaries_steps\', \'save_summaries_secs\', \'config\', \'stop_grace_period_secs\', \'log_step_count_steps\'], varargs=None, keywords=None, defaults=[\'\', \'True\', \'None\', \'None\', \'None\', \'None\', \'600\', \'<object object instance>\', \'<object object instance>\', \'None\', \'120\', \'100\'], "
   }
   member_method {
     name: "NewCheckpointReader"
@@ -304,6 +304,10 @@ tf_module {
     name: "import_meta_graph"
     argspec: "args=[\'meta_graph_or_file\', \'clear_devices\', \'import_scope\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'None\'], "
   }
+  member_method {
+    name: "init_from_checkpoint"
+    argspec: "args=[\'ckpt_dir_or_file\', \'assignment_map\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "input_producer"
     argspec: "args=[\'input_tensor\', \'element_shape\', \'num_epochs\', \'shuffle\', \'seed\', \'capacity\', \'shared_name\', \'summary_name\', \'name\', \'cancel_op\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'None\', \'32\', \'None\', \'None\', \'None\', \'None\'], "
@@ -320,6 +324,18 @@ tf_module {
     name: "limit_epochs"
     argspec: "args=[\'tensor\', \'num_epochs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "list_variables"
+    argspec: "args=[\'ckpt_dir_or_file\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load_checkpoint"
+    argspec: "args=[\'ckpt_dir_or_file\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load_variable"
+    argspec: "args=[\'ckpt_dir_or_file\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "match_filenames_once"
     argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.variance_scaling_initializer.pbtxt b/tensorflow/tools/api/golden/tensorflow.variance_scaling_initializer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a58398d645e8397dc8e61a6e0241710c3e34218f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.variance_scaling_initializer.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.variance_scaling_initializer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'scale\', \'mode\', \'distribution\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'fan_in\', \'normal\', \'None\', \"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/lib/BUILD b/tensorflow/tools/api/lib/BUILD
index cdfa0e7be524e3bb4ec039ac19bea72747afb58c..2d3b838957d60ffb5e827c6b43100d217cc5739e 100644
--- a/tensorflow/tools/api/lib/BUILD
+++ b/tensorflow/tools/api/lib/BUILD
@@ -22,7 +22,8 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":api_objects_proto_py",
-        "//tensorflow/tools/common:traverse",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:util",
     ],
 )
 
diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD
index 8421d8fce28611f6049847f6fbca5538475b59af..e9aeeb385586e3abd129d9a475d89545efaca45b 100644
--- a/tensorflow/tools/api/tests/BUILD
+++ b/tensorflow/tools/api/tests/BUILD
@@ -23,11 +23,12 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:lib",
         "//tensorflow/python:platform",
         "//tensorflow/tools/api/lib:python_object_to_proto_visitor",
         "//tensorflow/tools/common:public_api",
         "//tensorflow/tools/common:traverse",
-        "@protobuf//:protobuf_python",
     ],
 )
 
diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc
index dfad11adf0b971748cbc64f9b86fd6cb2c7cdd37..38c1bd3fb59da57984052f504684ce4102ee76d9 100644
--- a/tensorflow/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/tools/benchmark/benchmark_model.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -137,8 +138,21 @@ Status GetOutputShapes(const std::vector<InputLayerInfo>& inputs,
   std::vector<std::pair<string, tensorflow::Tensor> > input_tensors;
   CreateTensorsFromInputInfo(inputs, &input_tensors);
   std::vector<tensorflow::Tensor> output_tensors;
-  std::vector<string> output_tensor_names(wanted_shapes.begin(),
-                                          wanted_shapes.end());
+  std::vector<string> output_tensor_names;
+  for (const string& wanted_shape : wanted_shapes) {
+    bool is_input = false;
+    for (const std::pair<string, tensorflow::Tensor>& input_tensor :
+         input_tensors) {
+      if (input_tensor.first == wanted_shape) {
+        (*node_shapes)[wanted_shape] = input_tensor.second.shape();
+        is_input = true;
+        break;
+      }
+    }
+    if (!is_input) {
+      output_tensor_names.push_back(wanted_shape);
+    }
+  }
   TF_RETURN_IF_ERROR(
       session->Run(input_tensors, output_tensor_names, {}, &output_tensors));
   CHECK_EQ(output_tensors.size(), output_tensor_names.size());
@@ -155,7 +169,8 @@ Status CalculateFlops(const GraphDef& graph,
                       Session* session, int64* total_flops,
                       std::unordered_map<string, int64>* flops_by_op) {
   std::unordered_set<string> floppable_ops = {
-      "Conv2D", "MatMul", "QuantizedConv2D", "QuantizedMatMul"};
+      "Conv2D", "MatMul", "QuantizedConv2D", "QuantizedMatMul",
+      "DepthwiseConv2dNative"};
 
   std::set<string> wanted_shapes;
   for (const NodeDef& node : graph.node()) {
@@ -200,6 +215,13 @@ Status CalculateFlops(const GraphDef& graph,
         }
         int64 output_count = output_shape.num_elements();
         current_flops = k * output_count * 2;
+      } else if (node.op() == "DepthwiseConv2dNative") {
+        const TensorShape& filter_shape = found_shapes[node.input(1)];
+        const TensorShape& output_shape = found_shapes[node.name()];
+        int64 filter_height = filter_shape.dim_size(0);
+        int64 filter_width = filter_shape.dim_size(1);
+        int64 output_count = output_shape.num_elements();
+        current_flops = output_count * filter_height * filter_width * 2;
       }
       (*flops_by_op)[node.op()] += current_flops;
       *total_flops += current_flops;
diff --git a/tensorflow/tools/ci_build/Dockerfile.tensorboard b/tensorflow/tools/ci_build/Dockerfile.tensorboard
deleted file mode 100644
index 9795872e2c4907908c288f8901d0a007f8d1dcaa..0000000000000000000000000000000000000000
--- a/tensorflow/tools/ci_build/Dockerfile.tensorboard
+++ /dev/null
@@ -1,11 +0,0 @@
-FROM ubuntu:14.04
-
-MAINTAINER Jan Prach <jendap@google.com>
-
-# Copy and run the install scripts.
-COPY install/*.sh /install/
-RUN /install/install_bootstrap_deb_packages.sh
-RUN add-apt-repository -y ppa:openjdk-r/ppa && \
-    add-apt-repository -y ppa:george-edison55/cmake-3.x
-RUN /install/install_deb_packages.sh
-RUN /install/install_tensorboard_packages.sh
diff --git a/tensorflow/tools/ci_build/builds/builds_common.sh b/tensorflow/tools/ci_build/builds/builds_common.sh
index fd9a14bd698d183f14a65079d043e839319a435c..e3b58d038a713b0f8171b3e1803e4329cceda7c9 100644
--- a/tensorflow/tools/ci_build/builds/builds_common.sh
+++ b/tensorflow/tools/ci_build/builds/builds_common.sh
@@ -16,6 +16,10 @@
 #
 # Common Bash functions used by build scripts
 
+COLOR_NC='\033[0m'
+COLOR_LIGHT_GRAY='\033[0;37m'
+COLOR_GREEN='\033[0;32m'
+COLOR_RED='\033[0;31m'
 
 die() {
   # Print a message and exit with code 1.
diff --git a/tensorflow/tools/ci_build/builds/configured b/tensorflow/tools/ci_build/builds/configured
index 25cb51ea7ccfb300d064f9a1a313bed57212832b..563e07e3afb2544d9dfde777860c9f3919a8d2ee 100755
--- a/tensorflow/tools/ci_build/builds/configured
+++ b/tensorflow/tools/ci_build/builds/configured
@@ -56,7 +56,7 @@ else
 fi
 
 pushd "${CI_TENSORFLOW_SUBMODULE_PATH:-.}"
-yes "" | ./configure
+$PYTHON_BIN_PATH configure.py
 popd
 
 # Gather and print build information
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 85c712d3c6db353574fda40363d58dc328259430..112dab3a7332bbe6446517843ecfe7ff9a526d0f 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -23,7 +23,7 @@
 #
 # When executing the Python unit tests, the script obeys the shell
 # variables: TF_BUILD_BAZEL_CLEAN, TF_BUILD_INSTALL_EXTRA_PIP_PACKAGES,
-# NO_TEST_ON_INSTALL
+# NO_TEST_ON_INSTALL, PIP_TEST_ROOT
 #
 # TF_BUILD_BAZEL_CLEAN, if set to any non-empty and non-0 value, directs the
 # script to perform bazel clean prior to main build and test steps.
@@ -41,6 +41,9 @@
 # If NO_TEST_TFDBG_BINARIES has any non-empty and non-0 value, the testing of
 # TensorFlow Debugger (tfdbg) binaries and examples will be skipped.
 #
+# If PIP_TEST_ROOT has a non-empty and a non-0 value, the whl files will be
+# placed in that directory.
+#
 # Any flags not listed in the usage above will be passed directly to Bazel.
 #
 # If the --test_tutorials flag is set, it will cause the script to run the
@@ -70,6 +73,9 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 source "${SCRIPT_DIR}/builds_common.sh"
 
 
+SKIP_RETURN_CODE=112
+
+
 # Get the command line arguments
 CONTAINER_TYPE=$( echo "$1" | tr '[:upper:]' '[:lower:]' )
 shift
@@ -162,7 +168,10 @@ echo "Python binary path to be used in PIP install: ${PYTHON_BIN_PATH} "\
 "(Major.Minor version: ${PY_MAJOR_MINOR_VER})"
 
 # Build PIP Wheel file
-PIP_TEST_ROOT="pip_test"
+# Set default pip file folder unless specified by env variable
+if [ -z "$PIP_TEST_ROOT" ]; then
+  PIP_TEST_ROOT="pip_test"
+fi
 PIP_WHL_DIR="${PIP_TEST_ROOT}/whl"
 PIP_WHL_DIR=$(realpath ${PIP_WHL_DIR})  # Get absolute path
 rm -rf ${PIP_WHL_DIR} && mkdir -p ${PIP_WHL_DIR}
@@ -236,106 +245,301 @@ if [[ $(uname) == "Linux" ]]; then
   fi
 fi
 
-# Perform installation
-echo "Installing pip whl file: ${WHL_PATH}"
 
-# Create virtualenv directory for install test
-VENV_DIR="${PIP_TEST_ROOT}/venv"
+create_activate_virtualenv_and_install_tensorflow() {
+  # Create and activate a virtualenv; then install tensorflow pip package in it.
+  #
+  # Usage:
+  #   create_activate_virtualenv_and_install_tensorflow [--clean] \
+  #       <VIRTUALENV_DIR> <TF_WHEEL_PATH>
+  #
+  # Arguments:
+  #   --clean: Create a clean virtualenv, i.e., without --system-site-packages.
+  #   VIRTUALENV_DIR: virtualenv directory to be created.
+  #   TF_WHEEL_PATH: Path to the tensorflow wheel file to be installed in the
+  #     virtualenv.
+
+  VIRTUALENV_FLAGS="--system-site-packages"
+  if [[ "$1" == "--clean" ]]; then
+    VIRTUALENV_FLAGS=""
+    shift
+  fi
+
+  VIRTUALENV_DIR="$1"
+  TF_WHEEL_PATH="$2"
+  if [[ -d "${VIRTUALENV_DIR}" ]]; then
+    if rm -rf "${VIRTUALENV_DIR}"
+    then
+      echo "Removed existing virtualenv directory: ${VIRTUALENV_DIR}"
+    else
+      die "Failed to remove existing virtualenv directory: ${VIRTUALENV_DIR}"
+    fi
+  fi
 
-if [[ -d "${VENV_DIR}" ]]; then
-  if rm -rf "${VENV_DIR}"
+  if mkdir -p "${VIRTUALENV_DIR}"
   then
-    echo "Removed existing virtualenv directory: ${VENV_DIR}"
+    echo "Created virtualenv directory: ${VIRTUALENV_DIR}"
   else
-    die "Failed to remove existing virtualenv directory: ${VENV_DIR}"
+    die "FAILED to create virtualenv directory: ${VIRTUALENV_DIR}"
   fi
-fi
 
-if mkdir -p ${VENV_DIR}
-then
-  echo "Created virtualenv directory: ${VENV_DIR}"
-else
-  die "FAILED to create virtualenv directory: ${VENV_DIR}"
-fi
-
-# Verify that virtualenv exists
-if [[ -z $(which virtualenv) ]]; then
-  die "FAILED: virtualenv not available on path"
-fi
+  # Verify that virtualenv exists
+  if [[ -z $(which virtualenv) ]]; then
+    die "FAILED: virtualenv not available on path"
+  fi
 
-virtualenv --system-site-packages -p "${PYTHON_BIN_PATH}" "${VENV_DIR}" || \
+  virtualenv ${VIRTUALENV_FLAGS} \
+    -p "${PYTHON_BIN_PATH}" "${VIRTUALENV_DIR}" || \
     die "FAILED: Unable to create virtualenv"
 
-source "${VENV_DIR}/bin/activate" || \
-    die "FAILED: Unable to activate virtualenv"
-
+  source "${VIRTUALENV_DIR}/bin/activate" || \
+    die "FAILED: Unable to activate virtualenv in ${VIRTUALENV_DIR}"
 
-# Install the pip file in virtual env (plus missing dependencies)
+  # Install the pip file in virtual env.
 
-# Upgrade pip so it supports tags such as cp27mu, manylinux1 etc.
-echo "Upgrade pip in virtualenv"
-pip install --upgrade pip==8.1.2
+  # Upgrade pip so it supports tags such as cp27mu, manylinux1 etc.
+  echo "Upgrade pip in virtualenv"
+  pip install --upgrade pip==8.1.2
 
-# Force tensorflow reinstallation. Otherwise it may not get installed from
-# last build if it had the same version number as previous build.
-PIP_FLAGS="--upgrade --force-reinstall"
-pip install -v ${PIP_FLAGS} ${WHL_PATH} || \
+  # Force tensorflow reinstallation. Otherwise it may not get installed from
+  # last build if it had the same version number as previous build.
+  PIP_FLAGS="--upgrade --force-reinstall"
+  pip install -v ${PIP_FLAGS} ${WHL_PATH} || \
     die "pip install (forcing to reinstall tensorflow) FAILED"
-echo "Successfully installed pip package ${WHL_PATH}"
-
-# Install extra pip packages required by the test-on-install
-for PACKAGE in ${INSTALL_EXTRA_PIP_PACKAGES}; do
-  echo "Installing extra pip package required by test-on-install: ${PACKAGE}"
-
-  pip install ${PACKAGE} || \
-      die "pip install ${PACKAGE} FAILED"
-done
-
-if [[ -n "${NO_TEST_ON_INSTALL}" ]] &&
-   [[ "${NO_TEST_ON_INSTALL}" != "0" ]]; then
-  echo "NO_TEST_ON_INSTALL=${NO_TEST_ON_INSTALL}:"
-  echo "  Skipping ALL Python unit tests on install"
-else
-  # Call run_pip_tests.sh to perform test-on-install
-  "${SCRIPT_DIR}/run_pip_tests.sh" --virtualenv ${GPU_FLAG} ${MAC_FLAG} ||
-      die "PIP tests-on-install FAILED"
-fi
+  echo "Successfully installed pip package ${TF_WHEEL_PATH}"
+}
 
-# Test user ops
-if [[ "${DO_TEST_USER_OPS}" == "1" ]]; then
-  "${SCRIPT_DIR}/test_user_ops.sh" --virtualenv ${GPU_FLAG} || \
-      die "PIP user-op tests-on-install FAILED"
-fi
+################################################################################
+# Smoke test of tensorflow install in clean virtualenv
+################################################################################
+do_clean_virtualenv_smoke_test() {
+  if [[ -n "${NO_TEST_ON_INSTALL}" ]] &&
+       [[ "${NO_TEST_ON_INSTALL}" != "0" ]]; then
+    echo "NO_TEST_ON_INSTALL=${NO_TEST_ON_INSTALL}:"
+    echo "  Skipping smoke test of tensorflow install in clean virtualenv"
+    return ${SKIP_RETURN_CODE}
+  fi
 
-# Test TensorFlow Debugger (tfdbg) examples.
-if [[ "${DO_TEST_TFDBG_BINARIES}" == "1" ]]; then
-  echo
-  echo "Testing TensorFlow Debugger (tfdbg) binaries"
-  echo
+  CLEAN_VENV_DIR="${PIP_TEST_ROOT}/venv_clean"
+  create_activate_virtualenv_and_install_tensorflow --clean \
+    "${CLEAN_VENV_DIR}" "${WHL_PATH}"
 
   # cd to a temporary directory to avoid picking up Python files in the source
   # tree.
   TMP_DIR=$(mktemp -d)
   pushd "${TMP_DIR}"
+  if [[ $(python -c "import tensorflow as tf; print(tf.Session().run(tf.constant(42)))") == 42 ]];
+  then
+    echo "Smoke test of tensorflow install in clean virtualenv PASSED."
+  else
+    echo "Smoke test of tensroflow install in clean virtualenv FAILED."
+    return 1
+  fi
 
-  "${SCRIPT_DIR}/../../../python/debug/examples/examples_test.sh" \
-      --virtualenv || \
-      die "PIP tests-on-install of tfdbg binaries FAILED"
+  deactivate
+  if [[ $? != 0 ]]; then
+    echo "FAILED: Unable to deactivate virtualenv from ${CLEAN_VENV_DIR}"
+    return 1
+  fi
 
   popd
-fi
+  rm -rf "${TMP_DIR}" "${CLEAN_VENV_DIR}"
+}
 
-# Optional: Run the tutorial tests
-if [[ "${DO_TEST_TUTORIALS}" == "1" ]]; then
-  "${SCRIPT_DIR}/test_tutorials.sh" --virtualenv || \
-      die "PIP tutorial tests-on-install FAILED"
-fi
+################################################################################
+# Perform installation of tensorflow in "non-clean" virtualenv and tests against
+# the install.
+################################################################################
+do_virtualenv_pip_test() {
+  # Create virtualenv directory for install test
+  VENV_DIR="${PIP_TEST_ROOT}/venv"
+  create_activate_virtualenv_and_install_tensorflow \
+    "${CLEAN_VENV_DIR}" "${WHL_PATH}"
+
+  # Install extra pip packages required by the test-on-install
+  for PACKAGE in ${INSTALL_EXTRA_PIP_PACKAGES}; do
+    echo "Installing extra pip package required by test-on-install: ${PACKAGE}"
+
+    pip install ${PACKAGE}
+    if [[ $? != 0 ]]; then
+      echo "pip install ${PACKAGE} FAILED"
+      return 1
+    fi
+  done
 
-# Optional: Run integration tests
-if [[ "${DO_INTEGRATION_TESTS}" == "1" ]]; then
-  "${SCRIPT_DIR}/integration_tests.sh" --virtualenv || \
-      die "Integration tests on install FAILED"
-fi
+  if [[ -n "${NO_TEST_ON_INSTALL}" ]] &&
+     [[ "${NO_TEST_ON_INSTALL}" != "0" ]]; then
+    echo "NO_TEST_ON_INSTALL=${NO_TEST_ON_INSTALL}:"
+    echo "  Skipping ALL Python unit tests on install"
+    return ${SKIP_RETURN_CODE}
+  else
+    # Call run_pip_tests.sh to perform test-on-install
+    "${SCRIPT_DIR}/run_pip_tests.sh" --virtualenv ${GPU_FLAG} ${MAC_FLAG}
+    if [[ $? != 0 ]]; then
+      echo "PIP tests-on-install FAILED"
+      return 1
+    fi
+  fi
+}
 
-deactivate || \
-    die "FAILED: Unable to deactivate virtualenv"
+################################################################################
+# Run tests tagged with oss_serial against the virtualenv install.
+################################################################################
+do_virtualenv_oss_serial_pip_test() {
+  if [[ -n "${NO_TEST_ON_INSTALL}" ]] &&
+     [[ "${NO_TEST_ON_INSTALL}" != "0" ]]; then
+    echo "NO_TEST_ON_INSTALL=${NO_TEST_ON_INSTALL}:"
+    echo "  Skipping Python unit tests on install tagged with oss_serial"
+    return ${SKIP_RETURN_CODE}
+  else
+    # Call run_pip_tests.sh to perform test-on-install
+    "${SCRIPT_DIR}/run_pip_tests.sh" \
+      --virtualenv ${GPU_FLAG} ${MAC_FLAG} --oss_serial
+    if [[ $? != 0 ]]; then
+      echo "PIP tests-on-install (oss_serial) FAILED"
+      return 1
+    fi
+  fi
+}
+
+################################################################################
+# Test user ops (optional).
+################################################################################
+do_test_user_ops() {
+  if [[ "${DO_TEST_USER_OPS}" == "1" ]]; then
+    "${SCRIPT_DIR}/test_user_ops.sh" --virtualenv ${GPU_FLAG}
+    if [[ $? != 0 ]]; then
+      echo "PIP user-op tests-on-install FAILED"
+      return 1
+    fi
+  else
+    echo "Skipping user-op test-on-install due to DO_TEST_USER_OPS = ${DO_TEST_USER_OPS}"
+    return ${SKIP_RETURN_CODE}
+  fi
+}
+
+################################################################################
+# Test TensorFlow Debugger (tfdbg) binaries (optional).
+################################################################################
+do_test_tfdbg_binaries() {
+  if [[ "${DO_TEST_TFDBG_BINARIES}" == "1" ]]; then
+    # cd to a temporary directory to avoid picking up Python files in the source
+    # tree.
+    TMP_DIR=$(mktemp -d)
+    pushd "${TMP_DIR}"
+
+    "${SCRIPT_DIR}/../../../python/debug/examples/examples_test.sh" \
+      --virtualenv
+    if  [[ $? != 0 ]]; then
+      echo "PIP tests-on-install of tfdbg binaries FAILED"
+      return 1
+    fi
+    popd
+  else
+    echo "Skipping test of tfdbg binaries due to DO_TEST_TFDBG_BINARIES = ${DO_TEST_TFDBG_BINARIES}"
+    return ${SKIP_RETURN_CODE}
+  fi
+}
+
+################################################################################
+# Test tutorials (optional).
+################################################################################
+do_test_tutorials() {
+  if [[ "${DO_TEST_TUTORIALS}" == "1" ]]; then
+    "${SCRIPT_DIR}/test_tutorials.sh" --virtualenv
+    if [[ $? != 0 ]]; then
+      echo "PIP tutorial tests-on-install FAILED"
+      return 1
+    fi
+  else
+    echo "Skipping tutorial tests-on-install due to DO_TEST_TUTORIALS = ${DO_TEST_TUTORIALS}"
+    return ${SKIP_RETURN_CODE}
+  fi
+}
+
+################################################################################
+# Integration test for ffmpeg (optional).
+################################################################################
+do_ffmpeg_integration_test() {
+  # Optional: Run integration tests
+  if [[ "${DO_INTEGRATION_TESTS}" == "1" ]]; then
+    "${SCRIPT_DIR}/integration_tests.sh" --virtualenv
+    if [[ $? != 0 ]]; then
+      echo "Integration tests on install FAILED"
+      return 1
+    fi
+  else
+    echo "Skipping ffmpeg integration due to DO_INTEGRATION_TESTS = ${DO_INTEGRATION_TESTS}"
+    return ${SKIP_RETURN_CODE}
+  fi
+}
+
+
+# List of all PIP test tasks and their descriptions.
+PIP_TASKS=("do_clean_virtualenv_smoke_test" "do_virtualenv_pip_test" "do_virtualenv_oss_serial_pip_test" "do_test_user_ops" "do_test_tfdbg_binaries" "do_test_tutorials" "do_ffmpeg_integration_test")
+PIP_TASKS_DESC=("Smoke test of pip install in clean virtualenv" "PIP tests in virtualenv" "PIP test in virtualenv (tag: oss_serial)" "User ops test" "TensorFlow Debugger (tfdbg) binaries test" "Tutorials test" "ffmpeg integration test")
+
+
+# Execute all the PIP test steps.
+COUNTER=0
+FAIL_COUNTER=0
+PASS_COUNTER=0
+SKIP_COUNTER=0
+while [[ ${COUNTER} -lt "${#PIP_TASKS[@]}" ]]; do
+  INDEX=COUNTER
+  ((INDEX++))
+
+  echo ""
+  echo "=== PIP test step ${INDEX} of ${#PIP_TASKS[@]}: "\
+"${PIP_TASKS[COUNTER]} (${PIP_TASKS_DESC[COUNTER]}) ==="
+  echo ""
+
+  ${PIP_TASKS[COUNTER]}
+  RESULT=$?
+
+  if [[ ${RESULT} == ${SKIP_RETURN_CODE} ]]; then
+    ((SKIP_COUNTER++))
+  elif [[ ${RESULT} != "0" ]]; then
+    ((FAIL_COUNTER++))
+  else
+    ((PASS_COUNTER++))
+  fi
+
+  STEP_EXIT_CODES+=(${RESULT})
+
+  echo ""
+  ((COUNTER++))
+done
+
+deactivate || die "FAILED: Unable to deactivate virtualenv from ${VENV_DIR}"
+
+
+# Print summary of build results
+COUNTER=0
+echo "==== Summary of PIP test results ===="
+while [[ ${COUNTER} -lt "${#PIP_TASKS[@]}" ]]; do
+  INDEX=COUNTER
+  ((INDEX++))
+
+  echo "${INDEX}. ${PIP_TASKS[COUNTER]}: ${PIP_TASKS_DESC[COUNTER]}"
+  if [[ ${STEP_EXIT_CODES[COUNTER]} == ${SKIP_RETURN_CODE} ]]; then
+    printf "  ${COLOR_LIGHT_GRAY}SKIP${COLOR_NC}\n"
+  elif [[ ${STEP_EXIT_CODES[COUNTER]} == "0" ]]; then
+    printf "  ${COLOR_GREEN}PASS${COLOR_NC}\n"
+  else
+    printf "  ${COLOR_RED}FAIL${COLOR_NC}\n"
+  fi
+
+  ((COUNTER++))
+done
+
+echo
+echo "${SKIP_COUNTER} skipped; ${FAIL_COUNTER} failed; ${PASS_COUNTER} passed."
+
+echo
+if [[ ${FAIL_COUNTER} == "0" ]]; then
+  printf "PIP test ${COLOR_GREEN}PASSED${COLOR_NC}\n"
+else
+  printf "PIP test ${COLOR_RED}FAILED${COLOR_NC}\n"
+  exit 1
+fi
diff --git a/tensorflow/tools/ci_build/builds/run_pip_tests.sh b/tensorflow/tools/ci_build/builds/run_pip_tests.sh
index 8e364f7ffb7c857762f30d85b2edcdb34c16c45e..9a6890401b7ab3dd54c50ddf41c539e1c6de4032 100755
--- a/tensorflow/tools/ci_build/builds/run_pip_tests.sh
+++ b/tensorflow/tools/ci_build/builds/run_pip_tests.sh
@@ -18,7 +18,7 @@
 # Run the python unit tests from the source code on the pip installation.
 #
 # Usage:
-#   run_pip_tests.sh [--virtualenv] [--gpu] [--mac]
+#   run_pip_tests.sh [--virtualenv] [--gpu] [--mac] [--oss_serial]
 #
 # If the flag --virtualenv is set, the script will use "python" as the Python
 # binary path. Otherwise, it will use tools/python_bin_path.sh to determine
@@ -30,6 +30,10 @@
 # The --mac flag informs the script that this is running on mac. Mac does not
 # have flock, so we should skip using parallel_gpu_execute on mac.
 #
+# The --oss_serial flag lets the script run only the py tests with the
+# oss_serial tag, in a serial fashion, i.e., using the bazel flag
+# --local_test_jobs=1
+#
 #   TF_BUILD_APPEND_ARGUMENTS:
 #                      Additional command line arguments for the bazel,
 #                      pip.sh or android.sh command
@@ -42,6 +46,7 @@ source "${SCRIPT_DIR}/builds_common.sh"
 IS_VIRTUALENV=0
 IS_GPU=0
 IS_MAC=0
+IS_OSS_SERIAL=0
 while true; do
   if [[ "$1" == "--virtualenv" ]]; then
     IS_VIRTUALENV=1
@@ -49,6 +54,8 @@ while true; do
     IS_GPU=1
   elif [[ "$1" == "--mac" ]]; then
     IS_MAC=1
+  elif [[ "$1" == "--oss_serial" ]]; then
+    IS_OSS_SERIAL=1
   fi
   shift
 
@@ -69,10 +76,19 @@ ln -s $(pwd)/tensorflow ${PIP_TEST_ROOT}/tensorflow
 
 # Do not run tests with "no_pip" tag. If running GPU tests, also do not run
 # tests with no_pip_gpu tag.
-PIP_TEST_FILTER_TAG="-no_pip"
+PIP_TEST_FILTER_TAG="-no_pip,-no_oss"
+if [[ ${IS_OSS_SERIAL} == "1" ]]; then
+  PIP_TEST_FILTER_TAG="${PIP_TEST_FILTER_TAG},oss_serial"
+else
+  PIP_TEST_FILTER_TAG="${PIP_TEST_FILTER_TAG},-oss_serial"
+fi
+
 if [[ ${IS_GPU} == "1" ]]; then
   PIP_TEST_FILTER_TAG="-no_pip_gpu,${PIP_TEST_FILTER_TAG}"
 fi
+if [[ ${IS_MAC} == "1" ]]; then
+  PIP_TEST_FILTER_TAG="-nomac,${PIP_TEST_FILTER_TAG}"
+fi
 
 # Bazel flags we need for all tests:
 #     define=no_tensorflow_py_deps=true, to skip all test dependencies.
@@ -104,7 +120,7 @@ else
 fi
 
 export TF_NEED_CUDA=$IS_GPU
-yes "" | ./configure
+${PYTHON_BIN_PATH} configure.py
 
 # Figure out how many concurrent tests we can run and do run the tests.
 BAZEL_PARALLEL_TEST_FLAGS=""
@@ -126,6 +142,10 @@ else
   fi
 fi
 
+if [[ ${IS_OSS_SERIAL} == 1 ]]; then
+  BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=1"
+fi
+
 # Actually run the tests.
 bazel test ${BAZEL_FLAGS} ${BAZEL_PARALLEL_TEST_FLAGS} -- \
     ${BAZEL_TEST_TARGETS}
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 1cf87d7c7c09613d2a7f265e5cc1b54a3e2ae47e..13cfaad57e76028eaa7484aea334e8ed260b83b6 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -95,6 +95,10 @@
 #
 # This script can be used by Jenkins parameterized / matrix builds.
 
+# TODO(jhseu): Temporary for the gRPC pull request due to the
+# protobuf -> protobuf_archive rename. Remove later.
+TF_BUILD_BAZEL_CLEAN=1
+
 # Helper function: Convert to lower case
 to_lower () {
   echo "$1" | tr '[:upper:]' '[:lower:]'
@@ -358,7 +362,7 @@ if [[ "${TF_BUILD_APPEND_ARGUMENTS}" == *"--test_tag_filters="* ]]; then
     fi
   done
 else
-  EXTRA_ARGS="${TF_BUILD_APPEND_ARGUMENTS} --test_tag_filters=-benchmark-test"
+  EXTRA_ARGS="${TF_BUILD_APPEND_ARGUMENTS} --test_tag_filters=-no_oss,-oss_serial,-benchmark-test"
   if [[ ${IS_MAC} == "1" ]]; then
     EXTRA_ARGS="${EXTRA_ARGS},-nomac"
   fi
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index ddc95f690021feefe6725fada3385677aac09a98..68e826ccd5576100380c727a710482db1d5433f7 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -24,6 +24,7 @@
 
 # Current script directory
 SCRIPT_DIR=$( cd ${0%/*} && pwd -P )
+source "${SCRIPT_DIR}/builds/builds_common.sh"
 
 # Helper functions
 die() {
@@ -418,9 +419,25 @@ do_pip_smoke_test() {
     "The pip smoke test failed."
 }
 
+do_code_link_check() {
+  tensorflow/tools/ci_build/code_link_check.sh
+}
+
+do_check_load_py_test() {
+  BUILD_CMD="bazel build //tensorflow/tools/pip_package:check_load_py_test"
+  ${BUILD_CMD}
+  cmd_status \
+    "check_load_py_test failed to build."
+
+  BUILD_CMD="bazel-bin/tensorflow/tools/pip_package/check_load_py_test"
+  ${BUILD_CMD}
+  cmd_status \
+    "check_load_py_test failed."
+}
+
 # Supply all sanity step commands and descriptions
-SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test")
-SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package")
+SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check")
+SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links")
 
 INCREMENTAL_FLAG=""
 
@@ -463,7 +480,7 @@ while [[ ${COUNTER} -lt "${#SANITY_STEPS[@]}" ]]; do
     ((PASS_COUNTER++))
   fi
 
-  IFS=" " read -r -a STEP_EXIT_CODES <<< "${RESULT}"
+  STEP_EXIT_CODES+=(${RESULT})
 
   echo ""
   ((COUNTER++))
@@ -478,20 +495,21 @@ while [[ ${COUNTER} -lt "${#SANITY_STEPS[@]}" ]]; do
 
   echo "${INDEX}. ${SANITY_STEPS[COUNTER]}: ${SANITY_STEPS_DESC[COUNTER]}"
   if [[ ${STEP_EXIT_CODES[COUNTER]} == "0" ]]; then
-    echo "  PASS"
+    printf "  ${COLOR_GREEN}PASS${COLOR_NC}\n"
   else
-    echo "  FAIL"
+    printf "  ${COLOR_RED}FAIL${COLOR_NC}\n"
   fi
 
   ((COUNTER++))
 done
 
-echo ""
+echo
 echo "${FAIL_COUNTER} failed; ${PASS_COUNTER} passed."
 
-echo ""
+echo
 if [[ ${FAIL_COUNTER} == "0" ]]; then
-  echo "Sanity checks PASSED"
+  printf "Sanity checks ${COLOR_GREEN}PASSED${COLOR_NC}\n"
 else
-  die "Sanity checks FAILED"
+  printf "Sanity checks ${COLOR_RED}FAILED${COLOR_NC}\n"
+  exit 1
 fi
diff --git a/tensorflow/tools/ci_build/code_link_check.sh b/tensorflow/tools/ci_build/code_link_check.sh
new file mode 100755
index 0000000000000000000000000000000000000000..09130482cc9969a1c9e63fe73e183b631f53e0de
--- /dev/null
+++ b/tensorflow/tools/ci_build/code_link_check.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# please run this at root directory of tensorflow
+success=1
+
+for i in `grep -onI https://www.tensorflow.org/code/\[a-zA-Z0-9/._-\]\* -r tensorflow`
+do
+  filename=`echo $i|awk -F: '{print $1}'`
+  linenumber=`echo $i|awk -F: '{print $2}'`
+  target=`echo $i|awk -F: '{print $4}'|tail -c +27`
+
+  # skip files in tensorflow/models
+  if [[ $target == tensorflow_models/* ]] ; then
+    continue
+  fi
+
+  if [ ! -f $target ] && [ ! -d $target ]; then
+    success=0
+    echo Broken link $target at line $linenumber of file $filename
+  fi
+done
+
+if [ $success == 0 ]; then
+  echo Code link check fails.
+  exit 1
+fi
+
+echo Code link check success.
diff --git a/tensorflow/tools/ci_build/install/install_buildifier.sh b/tensorflow/tools/ci_build/install/install_buildifier.sh
index b2dfcf8db7605a08ed9554784b8de5cecac86af7..967c62bac03a124dca885b2118e0777204afa24d 100755
--- a/tensorflow/tools/ci_build/install/install_buildifier.sh
+++ b/tensorflow/tools/ci_build/install/install_buildifier.sh
@@ -15,14 +15,12 @@
 # ==============================================================================
 
 set -e
-BUILDIFIER_DIR="buildifier"
-mkdir ${BUILDIFIER_DIR}
-curl -Ls https://github.com/bazelbuild/buildifier/archive/0.4.5.tar.gz | \
-    tar -C "${BUILDIFIER_DIR}" --strip-components=1 -xz
-pushd ${BUILDIFIER_DIR}
+# Download buildifier.
+wget https://github.com/bazelbuild/buildtools/releases/download/0.4.5/buildifier
+chmod +x buildifier
+sudo mv buildifier /usr/local/bin/.
 
-bazel build buildifier:buildifier --spawn_strategy=standalone --genrule_strategy=standalone
-sudo cp bazel-bin/buildifier/buildifier /usr/local/bin/
-
-popd
-rm -rf ${BUILDIFIER_DIR}
+# Download buildozer.
+wget https://github.com/bazelbuild/buildtools/releases/download/0.4.5/buildozer
+chmod +x buildozer
+sudo mv buildozer /usr/local/bin/.
diff --git a/tensorflow/tools/ci_build/install/install_golang.sh b/tensorflow/tools/ci_build/install/install_golang.sh
index fef203b869704155e9c3b226bbef4af63e2e706c..88bc2960e347c9a0fb26b04863d359598edcce10 100755
--- a/tensorflow/tools/ci_build/install/install_golang.sh
+++ b/tensorflow/tools/ci_build/install/install_golang.sh
@@ -16,7 +16,7 @@
 
 set -ex
 
-GOLANG_URL="https://storage.googleapis.com/golang/go1.7.5.linux-amd64.tar.gz"
+GOLANG_URL="https://storage.googleapis.com/golang/go1.8.3.linux-amd64.tar.gz"
 
 sudo mkdir -p /usr/local
 wget -q -O - "${GOLANG_URL}" | sudo tar -C /usr/local -xz
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 8768852dc7e847e29a6089f963d33bfb137675d7..44fc21df9458c0880d1972603c93e1590e2b0643 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -44,8 +44,8 @@ pip2 install --upgrade markdown==2.6.8
 pip3 install --upgrade markdown==2.6.8
 
 # Install protobuf.
-pip2 install --upgrade protobuf==3.2.0
-pip3 install --upgrade protobuf==3.2.0
+pip2 install --upgrade protobuf==3.3.0
+pip3 install --upgrade protobuf==3.3.0
 
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
@@ -86,5 +86,6 @@ pip2 install mock
 pip2 install portpicker
 pip3 install portpicker
 
-pip2 install backports.weakref==1.0rc1
-pip3 install backports.weakref==1.0rc1
+# TensorFlow Serving integration tests require the following:
+pip2 install grpcio
+pip3 install grpcio
diff --git a/tensorflow/tools/ci_build/install/install_proto3.sh b/tensorflow/tools/ci_build/install/install_proto3.sh
index 773c89b70bbe64f7923645ea5c3c532e52d02c2d..7934002b2c982cd10216016f8614b70b77b58e29 100755
--- a/tensorflow/tools/ci_build/install/install_proto3.sh
+++ b/tensorflow/tools/ci_build/install/install_proto3.sh
@@ -17,9 +17,9 @@
 # Install protobuf3.
 
 # Select protobuf version.
-PROTOBUF_VERSION="3.2.0"
+PROTOBUF_VERSION="3.3.0"
 protobuf_ver_flat=$(echo $PROTOBUF_VERSION | sed 's/\.//g' | sed 's/^0*//g')
-local_protobuf_ver=$(protoc --version | awk '{print $2}')
+local_protobuf_ver=$(protoc --version)
 local_protobuf_ver_flat=$(echo $local_protobuf_ver | sed 's/\.//g' | sed 's/^0*//g')
 if [[ -z $local_protobuf_ver_flat ]]; then
   local_protobuf_ver_flat=0
@@ -30,7 +30,7 @@ if (( $local_protobuf_ver_flat < $protobuf_ver_flat )); then
   PROTOBUF_ZIP=$(basename "${PROTOBUF_URL}")
   UNZIP_DEST="google-protobuf"
 
-  wget -q "${PROTOBUF_URL}"
+  wget "${PROTOBUF_URL}"
   unzip "${PROTOBUF_ZIP}" -d "${UNZIP_DEST}"
   cp "${UNZIP_DEST}/bin/protoc" /usr/local/bin/
 
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index edfc4e3a98f7c613fd7db80f8426514ad09f4f72..706d414746408d0bc918fb7408985561dac70d7c 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -64,7 +64,7 @@ set -e
 pip3.5 install --upgrade six==1.10.0
 
 # Install protobuf.
-pip3.5 install --upgrade protobuf==3.2.0
+pip3.5 install --upgrade protobuf==3.3.0
 
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
@@ -90,5 +90,4 @@ pip3.5 install portpicker
 
 pip3.5 install werkzeug
 
-pip3.5 install backports.weakref==1.0rc1
-
+pip3.5 install grpcio
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh
index 467e4ab7e53ebd1c6985bcc908c9efdda10cef17..ca840796543a055d58359449d43944720635f0c4 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh
@@ -30,10 +30,10 @@ export TF_NEED_HDFS=0
 export TF_NEED_CUDA=0
 # Only running cc tests, python version does not matter.
 export PYTHON_BIN_PATH=`which python`
-yes "" | ./configure
+$PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
-bazel test --test_tag_filters=-gpu,-benchmark-test --test_lang_filters=cc -k \
+bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test --test_lang_filters=cc -k \
     --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \
     --test_output=errors -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh
index e2bbc0e8c0be0d1069eb85364ba8a137b950cb3a..5c82c9efafa14e8491d50a02f35c0498a8f9ef79 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh
@@ -29,10 +29,10 @@ export TF_NEED_GCP=0
 export TF_NEED_HDFS=0
 export TF_NEED_CUDA=0
 export PYTHON_BIN_PATH=`which python2`
-yes "" | ./configure
+$PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
-bazel test --test_tag_filters=-gpu,-benchmark-test --test_lang_filters=py -k \
+bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test --test_lang_filters=py -k \
     --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \
     --test_output=errors -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
index a03cab0cca5c375e668a2adeae64c48ac2b217a0..7155636a53fa9333945fdec0f0582c745db8ba17 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
@@ -29,10 +29,10 @@ export TF_NEED_GCP=0
 export TF_NEED_HDFS=0
 export TF_NEED_CUDA=0
 export PYTHON_BIN_PATH=`which python3`
-yes "" | ./configure
+$PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
-bazel test --test_tag_filters=-gpu,-benchmark-test -k \
+bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test -k \
     --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
     --test_output=errors -- \
     //tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh
index 32de5cea200d4a43e5885364a9aeeafd2fa51af6..218d2a899135401d6fcc79677dd5ab3703034919 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh
@@ -29,10 +29,10 @@ export TF_NEED_GCP=0
 export TF_NEED_HDFS=0
 export TF_NEED_CUDA=0
 export PYTHON_BIN_PATH=`which python3`
-yes "" | ./configure
+$PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
-bazel test --test_tag_filters=-gpu,-benchmark-test --test_lang_filters=py -k \
+bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test --test_lang_filters=py -k \
     --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \
     --test_output=errors -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
index 6acc26213835c0d2924f9ee0a31a80790bf5d75e..dff72c25bf7a6d9b0593391fbc090fd8a8ab537f 100755
--- a/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
+++ b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
@@ -32,10 +32,10 @@ export PYTHON_BIN_PATH=`which python3`
 export TF_NEED_CUDA=1
 export TF_CUDA_COMPUTE_CAPABILITIES=3.7
 
-yes "" | ./configure
+$PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
-bazel test --config=cuda --test_tag_filters=-no_gpu,-benchmark-test -k \
+bazel test --config=cuda --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test -k \
     --test_lang_filters=cc --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
     --build_tests_only --test_output=errors --local_test_jobs=8 \
     --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute -- \
diff --git a/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
index e73fe046c967b0bb3db6eb5b109516c0d207a1e4..a36a8445afdebea15cf1fcf3c73d15ef4200a090 100755
--- a/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
+++ b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
@@ -32,10 +32,10 @@ export PYTHON_BIN_PATH=`which python3`
 export TF_NEED_CUDA=1
 export TF_CUDA_COMPUTE_CAPABILITIES=3.7
 
-yes "" | ./configure
+$PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
-bazel test --config=cuda --test_tag_filters=-no_gpu,-benchmark-test -k \
+bazel test --config=cuda --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test -k \
     --test_lang_filters=py --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
     --build_tests_only --test_output=errors --local_test_jobs=8 \
     --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute -- \
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
index e5f4a22f7ade7eb5c260a7a486cd5d3fa75d5859..0ee894e2c44e8115148612191c949f6f4b0d42ba 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
@@ -30,11 +30,10 @@ export TF_NEED_GCP=0
 export TF_NEED_HDFS=0
 export TF_NEED_CUDA=0
 export PYTHON_BIN_PATH=$(which python2)
-yes "" | ./configure
+$PYTHON_BIN_PATH configure.py
 which bazel
-bazel test --test_tag_filters=-gpu,-benchmark-test,-nomac \
+bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac \
     --test_timeout 300,450,1200,3600 \
     --test_size_filters=small,medium \
     --jobs=${N_JOBS} --build_tests_only --test_output=errors -k -- \
-    //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/... \
-    -//tensorflow/tensorboard/...
+    //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/protobuf/protobuf_optimized_pip.sh b/tensorflow/tools/ci_build/protobuf/protobuf_optimized_pip.sh
index 59ba71f5df77fd967e3699bce628adc49c7893ee..3e31aa1ce106531a32d0d8860de87a9aa490ae0c 100755
--- a/tensorflow/tools/ci_build/protobuf/protobuf_optimized_pip.sh
+++ b/tensorflow/tools/ci_build/protobuf/protobuf_optimized_pip.sh
@@ -14,7 +14,7 @@
 # limitations under the License.
 # ==============================================================================
 
-PROTOBUF_VERSION="3.2.0"
+PROTOBUF_VERSION="3.3.1"
 PYTHON_BIN=${PYTHON_BIN:-python}
 DIR=${PWD}/protobuf
 
diff --git a/tensorflow/tools/ci_build/update_version.sh b/tensorflow/tools/ci_build/update_version.sh
index 682f5329f58fffa5f2030c7e33db14bd3e343165..b707ee338a2786ce3946d9e3d34da311b9f512f5 100755
--- a/tensorflow/tools/ci_build/update_version.sh
+++ b/tensorflow/tools/ci_build/update_version.sh
@@ -130,12 +130,6 @@ if [[ ${OLD_MAJOR} != ${MAJOR} ]] || [[ ${OLD_MINOR} != ${MINOR} ]]; then
   echo "Detected Major.Minor change. "\
 "Updating pattern ${OLD_R_MAJOR_MINOR} to ${R_MAJOR_MINOR} in additional files"
 
-  # Update tensorflow/tensorboard/README.md
-  TENSORBOARD_README_MD="${TF_SRC_DIR}/tensorboard/README.md"
-  check_existence file "${TENSORBOARD_README_MD}"
-  sed -i -r -e "s/${OLD_R_MAJOR_MINOR}/${R_MAJOR_MINOR}/g" \
-      "${TENSORBOARD_README_MD}"
-
   # Update dockerfiles
   DEVEL_DOCKERFILE="${TF_SRC_DIR}/tools/docker/Dockerfile.devel"
   check_existence file "${DEVEL_DOCKERFILE}"
diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index 8853dc53b17b5b5f1dda096817c67723fdbefcc4..05392c27248f6603f61d59358887867cd9816550 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -56,3 +56,7 @@ export PATH="/c/tools/cuda/bin:$PATH"
 
 # Set the common build options on Windows
 export BUILD_OPTS='--copt=-w --host_copt=-w --verbose_failures --experimental_ui'
+
+# Build TF with wrapper-less CROSSTOOL
+# TODO(pcloudy): Remove this after wrapper-less CROSSTOOL becomes default
+export NO_MSVC_WRAPPER=1
diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
index cc157c33f501c8da1e17656c05232458a8c6aaac..7cb81c20f02edc4593591cba1be2bfd2074751e4 100644
--- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
@@ -60,7 +60,7 @@ reinstall_tensorflow_pip ${PIP_NAME}
 
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
-# GPU tests are very flaky when running concurently, so set local_test_jobs=1
+# GPU tests are very flaky when running concurrently, so set local_test_jobs=1
 bazel test -c opt --config=win-cuda $BUILD_OPTS -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_windows_gpu \
diff --git a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
index 1106413071393be8cd60c88887bffa7ef673dc08..4a2f954dc957a4ba357437247646bf0c323f4e0c 100755
--- a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
+++ b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
@@ -33,8 +33,9 @@ export TF_NEED_CUDA=1
 export TF_ENABLE_XLA=1
 export TF_CUDA_COMPUTE_CAPABILITIES=3.7
 
-yes "" | ./configure
+$PYTHON_BIN_PATH configure.py
 
+bazel clean
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --config=cuda --test_tag_filters=-no_gpu,-benchmark-test -k \
     --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
diff --git a/tensorflow/tools/common/BUILD b/tensorflow/tools/common/BUILD
index f92edd0dd8863fa7a3a6ad764a895370d48a5958..8a8667957ae4acb97356d4a141edd422509b48c7 100644
--- a/tensorflow/tools/common/BUILD
+++ b/tensorflow/tools/common/BUILD
@@ -15,6 +15,7 @@ py_library(
     name = "public_api",
     srcs = ["public_api.py"],
     srcs_version = "PY2AND3",
+    deps = ["//tensorflow/python:util"],
 )
 
 py_test(
@@ -32,6 +33,7 @@ py_library(
     name = "traverse",
     srcs = ["traverse.py"],
     srcs_version = "PY2AND3",
+    deps = ["//tensorflow/python:util"],
 )
 
 py_test(
diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD
index fb40cf0833f08fc142aec18fe8940ce836453906..19959ea6d260d5aded5a3f37850025f6722d82ee 100644
--- a/tensorflow/tools/compatibility/BUILD
+++ b/tensorflow/tools/compatibility/BUILD
@@ -24,7 +24,9 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         "tf_upgrade",
-        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "@six_archive//:six",
     ],
 )
 
diff --git a/tensorflow/tools/dist_test/python/mnist_replica.py b/tensorflow/tools/dist_test/python/mnist_replica.py
index f7dbfea7fb0f3463cd708cde8762eb28b69b05a1..e40ecb43f9a00bee7309895969ff65e48b95b4e9 100644
--- a/tensorflow/tools/dist_test/python/mnist_replica.py
+++ b/tensorflow/tools/dist_test/python/mnist_replica.py
@@ -17,7 +17,7 @@
 
 A simple softmax model with one hidden layer is defined. The parameters
 (weights and biases) are located on one parameter server (ps), while the ops
-are executed on two worker nodes by default. The TF sessions also run on the 
+are executed on two worker nodes by default. The TF sessions also run on the
 worker node.
 Multiple invocations of this script can be done in parallel, with different
 values for --task_index. There should be exactly one invocation with
@@ -123,9 +123,7 @@ def main(unused_argv):
 
   is_chief = (FLAGS.task_index == 0)
   if FLAGS.num_gpus > 0:
-    if FLAGS.num_gpus < num_workers:
-      raise ValueError("number of gpus is less than number of workers")
-    # Avoid gpu allocation conflict: now allocate task_num -> #gpu 
+    # Avoid gpu allocation conflict: now allocate task_num -> #gpu
     # for each worker in the corresponding machine
     gpu = (FLAGS.task_index % FLAGS.num_gpus)
     worker_device = "/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu)
diff --git a/tensorflow/tools/dist_test/scripts/BUILD b/tensorflow/tools/dist_test/scripts/BUILD
index c329f0bbe8779fe300e601a1f41d6c123688815a..ce2fa5c743ece40eae10b30f4b2626a9cfada147 100644
--- a/tensorflow/tools/dist_test/scripts/BUILD
+++ b/tensorflow/tools/dist_test/scripts/BUILD
@@ -17,6 +17,6 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":k8s_tensorflow_lib",
-        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:platform_test",
     ],
 )
diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
index 5b3f1f936a48bb448b712152c57c095226efea8e..07a972400df46f59c2d24b7b8e99bd690659b83a 100644
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@@ -24,14 +24,15 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
     rm get-pip.py
 
 RUN pip --no-cache-dir install \
+        Pillow \
+        h5py \
         ipykernel \
         jupyter \
         matplotlib \
         numpy \
+        pandas \
         scipy \
         sklearn \
-        pandas \
-        Pillow \
         && \
     python -m ipykernel.kernelspec
 
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 38a67f80aae5c6ad66639c24059ac50a3c6f3220..1b97c0d10830f92118bf6b597558c107a0182a92 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -72,7 +72,7 @@ RUN mkdir /bazel && \
 
 RUN git clone https://github.com/tensorflow/tensorflow.git && \
     cd tensorflow && \
-    git checkout r1.2
+    git checkout r1.3
 WORKDIR /tensorflow
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index d0a038a9db61c97643678d9fbca8974df0f84c8f..80b45ae70473ccfbf8869d846a080d15dfcfd905 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -19,6 +19,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         zlib1g-dev \
         openjdk-8-jdk \
         openjdk-8-jre-headless \
+        wget \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
@@ -72,7 +73,7 @@ RUN mkdir /bazel && \
 
 RUN git clone https://github.com/tensorflow/tensorflow.git && \
     cd tensorflow && \
-    git checkout r1.2
+    git checkout r1.3
 WORKDIR /tensorflow
 
 # Configure the build for our CUDA configuration.
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 3ba1e963f92a0fd7294a36288785545962f40146..da83a300580b660bd2cea890eff8acc8a96103b2 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -24,14 +24,15 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
     rm get-pip.py
 
 RUN pip --no-cache-dir install \
+        Pillow \
+        h5py \
         ipykernel \
         jupyter \
         matplotlib \
         numpy \
+        pandas \
         scipy \
         sklearn \
-        pandas \
-        Pillow \
         && \
     python -m ipykernel.kernelspec
 
diff --git a/tensorflow/tools/docker/README.md b/tensorflow/tools/docker/README.md
index 3e45ae362c71021ec1931c59acd1c38fbfac8fc6..3780bde2beeac389437627b012d95be7aa9dbbd2 100644
--- a/tensorflow/tools/docker/README.md
+++ b/tensorflow/tools/docker/README.md
@@ -54,6 +54,30 @@ for additional containers, such as release candidates or nightly builds.
 
 ## Rebuilding the containers
 
-Just pick the dockerfile corresponding to the container you want to build, and run
+Building TensorFlow Docker containers should be done through the
+[parameterized_docker_build.sh](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/parameterized_docker_build.sh)
+script. The raw Dockerfiles should not be used directly as they contain strings
+to be replaced by the script during the build.
 
-    $ docker build --pull -t $USER/tensorflow-suffix -f Dockerfile.suffix .
+To use the script, specify the container type (`CPU` vs. `GPU`), the desired
+Python version (`PYTHON2` vs. `PYTHON3`) and whether the developer Docker image
+is to be built (`NO` vs. `YES`). In addition, you need to specify the central
+location from where the pip package of TensorFlow will be downloaded.
+
+For example, to build a CPU-only non-developer Docker image for Python 2, using
+TensorFlow's nightly pip package:
+
+``` bash
+export TF_DOCKER_BUILD_IS_DEVEL=NO
+export TF_DOCKER_BUILD_TYPE=CPU
+export TF_DOCKER_BUILD_PYTHON_VERSION=PYTHON2
+
+export NIGHTLY_VERSION="1.head"
+export TF_DOCKER_BUILD_CENTRAL_PIP=$(echo ${TF_DOCKER_BUILD_PYTHON_VERSION} | sed s^PYTHON2^http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=${TF_DOCKER_BUILD_PYTHON_VERSION},label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-${NIGHTLY_VERSION}-cp27-cp27mu-manylinux1_x86_64.whl^ | sed s^PYTHON3^http://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-${NIGHTLY_VERSION}-cp35-cp35m-manylinux1_x86_64.whl^)
+
+tensorflow/tools/docker/parameterized_docker_build.sh
+```
+
+If successful, the image will be tagged as `${USER}/tensorflow:latest` by default.
+
+Rebuilding GPU images requires [nvidia-docker](https://github.com/NVIDIA/nvidia-docker).
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index 8e27b133c2fa33a8f6366b0f94a596cf1ca7c1a2..8f10bc9e0ca3c947b8ca75663444309088e0513e 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -37,6 +37,7 @@ py_library(
     srcs = ["parser.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
+    deps = ["@com_github_andreif_codegen"],
 )
 
 py_test(
@@ -44,7 +45,6 @@ py_test(
     size = "small",
     srcs = ["parser_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["manual"],
     deps = [
         ":parser",
         "//tensorflow/python:platform_test",
@@ -78,13 +78,10 @@ py_test(
     size = "small",
     srcs = ["generate_lib_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["manual"],
     deps = [
         ":generate_lib",
         ":parser",
-        "//tensorflow:tensorflow_py",
         "//tensorflow/python:platform_test",
-        "//tensorflow/python/debug:debug_py",
     ],
 )
 
@@ -105,7 +102,12 @@ py_test(
     srcs = ["build_docs_test.py"],
     data = ["//tensorflow:docs_src"],
     srcs_version = "PY2AND3",
-    tags = ["manual"],
+    tags = [
+        # No reason to run sanitizers for this test.
+        "noasan",
+        "nomsan",
+        "notsan",
+    ],
     deps = [
         ":generate_lib",
         "//tensorflow:tensorflow_py",
diff --git a/tensorflow/tools/docs/build_docs_test.py b/tensorflow/tools/docs/build_docs_test.py
index d28dd93b9a8d5eb19af414622c1d1b22516f9c1c..ae293f6576456ecdbb8a4b1ee4e8e4f40482ad94 100644
--- a/tensorflow/tools/docs/build_docs_test.py
+++ b/tensorflow/tools/docs/build_docs_test.py
@@ -19,6 +19,8 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import sys
+import textwrap
 
 import tensorflow as tf
 from tensorflow.python import debug as tf_debug
@@ -29,19 +31,40 @@ from tensorflow.tools.docs import generate_lib
 
 class Flags(object):
   resource_root = resource_loader.get_root_dir_with_all_resources()
-  src_dir = os.path.join(resource_root, 'third_party/tensorflow/docs_src')
-  base_dir = os.path.join(resource_root, 'third_party/tensorflow/')
+  src_dir = os.path.join(resource_root, 'tensorflow/docs_src')
+  base_dir = os.path.join(resource_root, 'tensorflow/')
   output_dir = googletest.GetTempDir()
 
 
 class BuildDocsTest(googletest.TestCase):
 
   def testBuildDocs(self):
+    if sys.version_info >= (3, 0):
+      print('Warning: Doc generation is not supported from python3.')
+      return
+
     doc_generator = generate_lib.DocGenerator()
 
     doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)])
 
-    status = doc_generator.build(Flags())
+    try:
+      status = doc_generator.build(Flags())
+    except RuntimeError as e:
+      if not e.args[0].startswith('Modules nested too deep'):
+        raise
+
+      msg = textwrap.dedent("""\
+          %s
+
+          ****************************************************************
+          If this test fails here, you have most likely introduced an
+          unsealed module. Make sure to use `remove_undocumented` or similar
+          utilities to avoid leaking symbols. See above for more information
+          on the exact point of failure.
+          ****************************************************************
+          """ % e.args[0])
+
+      raise RuntimeError(msg)
 
     if status:
       self.fail('Found %s Errors!' % status)
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index 99872e1d8446ab84bcf77caeb86003d86db85e52..bbeb3921d7b75a9d06d99e0131e1886af3849f2a 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import argparse
 import os
+import sys
 
 import six
 
@@ -90,6 +91,7 @@ def write_docs(output_dir, parser_config, yaml_toc):
 
   # Parse and write Markdown pages, resolving cross-links (@{symbol}).
   for full_name, py_object in six.iteritems(parser_config.index):
+    parser_config.reference_resolver.current_doc_full_name = full_name
 
     if full_name in parser_config.duplicate_of:
       continue
@@ -181,7 +183,7 @@ def add_dict_to_dict(add_from, add_to):
 
 # Exclude some libaries in contrib from the documentation altogether.
 def _get_default_private_map():
-  return {}
+  return {'tf.test': ['mock']}
 
 
 # Exclude members of some libaries.
@@ -390,6 +392,9 @@ def _other_docs(src_dir, output_dir, reference_resolver):
         print('Skipping excluded file %s...' % base_name)
         continue
       full_in_path = os.path.join(dirpath, base_name)
+
+      reference_resolver.current_doc_full_name = full_in_path
+
       suffix = os.path.relpath(path=full_in_path, start=src_dir)
       full_out_path = os.path.join(output_dir, suffix)
       if not base_name.endswith('.md'):
@@ -415,6 +420,8 @@ class DocGenerator(object):
   """Main entry point for generating docs."""
 
   def __init__(self):
+    if sys.version_info >= (3, 0):
+      sys.exit('Doc generation is not supported from python3.')
     self.argument_parser = argparse.ArgumentParser()
     self._py_modules = None
     self._private_map = _get_default_private_map()
@@ -442,7 +449,7 @@ class DocGenerator(object):
         '--base_dir',
         type=str,
         default=default_base_dir,
-        help='Base directory to to strip from file names referenced in docs.')
+        help='Base directory to strip from file names referenced in docs.')
 
   def parse_known_args(self):
     flags, _ = self.argument_parser.parse_known_args()
@@ -505,7 +512,6 @@ class DocGenerator(object):
     write_docs(output_dir, parser_config, yaml_toc=self.yaml_toc)
     _other_docs(flags.src_dir, flags.output_dir, reference_resolver)
 
-    if parser.all_errors:
-      print('Errors during processing:\n  ' + '\n  '.join(parser.all_errors))
-      return 1
-    return 0
+    parser_config.reference_resolver.log_errors()
+
+    return parser_config.reference_resolver.num_errors()
diff --git a/tensorflow/tools/docs/generate_lib_test.py b/tensorflow/tools/docs/generate_lib_test.py
index 6e5deb6a36ed7d7d8b51f28e7ed3d9a680fce13b..1ceaf31f1c3b83e2c2cb3c0d2022ce98781aed4b 100644
--- a/tensorflow/tools/docs/generate_lib_test.py
+++ b/tensorflow/tools/docs/generate_lib_test.py
@@ -21,9 +21,6 @@ from __future__ import print_function
 import os
 import sys
 
-import tensorflow as tf
-
-from tensorflow.python import debug as tf_debug
 from tensorflow.python.platform import googletest
 from tensorflow.tools.docs import generate_lib
 from tensorflow.tools.docs import parser
@@ -54,23 +51,10 @@ class DummyVisitor(object):
 
 class GenerateTest(googletest.TestCase):
 
-  def test_extraction(self):
-    py_modules = [('tf', tf), ('tfdbg', tf_debug)]
-
-    try:
-      generate_lib.extract(py_modules,
-                           generate_lib._get_default_private_map(),
-                           generate_lib._get_default_do_not_descend_map())
-    except RuntimeError:
-      print('*****************************************************************')
-      print('If this test fails, you have most likely introduced an unsealed')
-      print('module. Make sure to use remove_undocumented or similar utilities')
-      print('to avoid leaking symbols. See below for more information on the')
-      print('failure.')
-      print('*****************************************************************')
-      raise
-
   def test_write(self):
+    if sys.version_info >= (3, 0):
+      self.skipTest('Warning: Doc generation is not supported from python3.')
+
     module = sys.modules[__name__]
 
     index = {
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index 7ae1d2abd9af813d29e527f447b6ce21c8e72b82..563e5be814ce227279b4f55e6050ff902de54487 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -24,6 +24,7 @@ import functools
 import json
 import os
 import re
+import sys
 
 import codegen
 import six
@@ -35,13 +36,36 @@ from tensorflow.python.util import tf_inspect
 # A regular expression capturing a python indentifier.
 IDENTIFIER_RE = '[a-zA-Z_][a-zA-Z0-9_]*'
 
-# Log of all reported errors
-all_errors = []
 
+class _Errors(object):
+  """A collection of errors."""
 
-def log_error(s):
-  all_errors.append(s)
-  print('ERROR:', s)
+  def __init__(self):
+    self._errors = []
+
+  def log_all(self):
+    """Log all the collected errors to the standard error."""
+    template = 'ERROR:\n    output file name: %s\n    %s\n\n'
+
+    for full_name, message in self._errors:
+      print(template % (full_name, message), file=sys.stderr)
+
+  def append(self, full_name, message):
+    """Add an error to the collection.
+
+    Args:
+      full_name: The path to the file in which the error occurred.
+      message: The message to display with the error.
+    """
+    self._errors.append((full_name, message))
+
+  def __len__(self):
+    return len(self._errors)
+
+  def __eq__(self, other):
+    if not isinstance(other, _Errors):
+      return False
+    return self._errors == other._errors  # pylint: disable=protected-access
 
 
 def documentation_path(full_name):
@@ -107,6 +131,18 @@ class ReferenceResolver(object):
     self._all_names = set(is_class.keys())
     self._py_module_names = py_module_names
 
+    self.current_doc_full_name = None
+    self._errors = _Errors()
+
+  def add_error(self, message):
+    self._errors.append(self.current_doc_full_name, message)
+
+  def log_errors(self):
+    self._errors.log_all()
+
+  def num_errors(self):
+    return len(self._errors)
+
   @classmethod
   def from_visitor(cls, visitor, doc_index, **kwargs):
     """A factory function for building a ReferenceResolver from a visitor.
@@ -153,7 +189,8 @@ class ReferenceResolver(object):
     for key, value in self.__dict__.items():
       # Drop these two fields. `_doc_index` is not serializable. `_all_names` is
       # generated by the constructor.
-      if key in ('_doc_index', '_all_names'):
+      if key in ('_doc_index', '_all_names',
+                 '_errors', 'current_doc_full_name'):
         continue
 
       # Strip off any leading underscores on field names as these are not
@@ -186,10 +223,10 @@ class ReferenceResolver(object):
     Returns:
       `string`, with "@{symbol}" references replaced by Markdown links.
     """
-    return re.sub(SYMBOL_REFERENCE_RE,
-                  lambda match: self._one_ref(match.group(1),  # pylint: disable=g-long-lambda
-                                              relative_path_to_root),
-                  string)
+    def one_ref(match):
+      return self._one_ref(match, relative_path_to_root)
+
+    return re.sub(SYMBOL_REFERENCE_RE, one_ref, string)
 
   def python_link(self, link_text, ref_full_name, relative_path_to_root,
                   code_ref=True):
@@ -250,9 +287,8 @@ class ReferenceResolver(object):
 
     # Check whether this link exists
     if master_name not in self._all_names:
-      # TODO(josh11b): Make error reporting more uniform.
-      print('ERROR: Cannot make link to %s (original: %s): Not in index.' %
-            (master_name, ref_full_name))
+      message = 'Cannot make link to "%s": Not in index.' % master_name
+      self.add_error(message)
       return 'BROKEN_LINK'
 
     # If this is a member of a class, link to the class page with an anchor.
@@ -270,8 +306,10 @@ class ReferenceResolver(object):
 
     return os.path.join(relative_path_to_root, ref_path)
 
-  def _one_ref(self, string, relative_path_to_root):
+  def _one_ref(self, match, relative_path_to_root):
     """Return a link for a single "@{symbol}" reference."""
+    string = match.group(1)
+
     # Look for link text after $.
     dollar = string.rfind('$')
     if dollar > 0:  # Ignore $ in first character
@@ -303,8 +341,8 @@ class ReferenceResolver(object):
                                 code_ref=not manual_link_text)
 
     # Error!
-    log_error('Did not understand "@{%s}"' % string)
-    return 'ERROR:%s' % string
+    self.add_error('Did not understand "%s"' % match.group(0))
+    return 'BROKEN_LINK'
 
   def _doc_link(self, string, link_text, manual_link_text,
                 relative_path_to_root):
@@ -330,7 +368,7 @@ class ReferenceResolver(object):
   def _doc_missing(self, string, unused_hash_tag, link_text,
                    unused_manual_link_text, unused_relative_path_to_root):
     """Generate an error for unrecognized @{$...} references."""
-    log_error('Handle doc reference "@{$%s}"' % string)
+    self.add_error('Unknown Document "%s"' % string)
     return link_text
 
   def _cc_link(self, string, link_text, unused_manual_link_text,
@@ -348,7 +386,7 @@ class ReferenceResolver(object):
     elif string == 'tensorflow::ops::Const':
       ret = 'namespace/tensorflow/ops.md#const'
     else:
-      log_error('Handle C++ reference "@{%s}"' % string)
+      self.add_error('C++ reference not understood: "%s"' % string)
       return 'TODO_C++:%s' % string
     # relative_path_to_root gets you to api_docs/python, we go from there
     # to api_docs/cc, and then add ret.
@@ -469,7 +507,7 @@ def _parse_function_details(docstring):
   pairs = list(_gen_pairs(parts[1:]))
 
   function_details = []
-  item_re = re.compile(r'^  (\w+):', re.MULTILINE)
+  item_re = re.compile(r'^  (\*?\*?\w+):', re.MULTILINE)
 
   for keyword, content in pairs:
     content = item_re.split(content)
diff --git a/tensorflow/tools/docs/parser_test.py b/tensorflow/tools/docs/parser_test.py
index 3e02160130f1959484472ecc77e8b2e883294a1e..862f0acfa90fbc8ea7f5054b745c684783f1ff5a 100644
--- a/tensorflow/tools/docs/parser_test.py
+++ b/tensorflow/tools/docs/parser_test.py
@@ -491,13 +491,13 @@ Returns:
 
 class TestParseFunctionDetails(googletest.TestCase):
 
-  def testParseFunctionDetails(self):
+  def test_parse_function_details(self):
     docstring, function_details = parser._parse_function_details(RELU_DOC)
 
     self.assertEqual(len(function_details), 2)
     args = function_details[0]
     self.assertEqual(args.keyword, 'Args')
-    self.assertEmpty(args.header)
+    self.assertEqual(len(args.header), 0)
     self.assertEqual(len(args.items), 2)
     self.assertEqual(args.items[0][0], 'features')
     self.assertEqual(args.items[1][0], 'name')
@@ -515,5 +515,60 @@ class TestParseFunctionDetails(googletest.TestCase):
         docstring + ''.join(str(detail) for detail in function_details))
 
 
+class TestGenerateSignature(googletest.TestCase):
+
+  def test_known_object(self):
+    if sys.version_info >= (3, 0):
+      print('Warning: Doc generation is not supported from python3.')
+      return
+
+    known_object = object()
+    reverse_index = {id(known_object): 'location.of.object.in.api'}
+
+    def example_fun(arg=known_object):  # pylint: disable=unused-argument
+      pass
+
+    sig = parser._generate_signature(example_fun, reverse_index)
+    self.assertEqual(sig, ['arg=location.of.object.in.api'])
+
+  def test_literals(self):
+    if sys.version_info >= (3, 0):
+      print('Warning: Doc generation is not supported from python3.')
+      return
+
+    def example_fun(a=5, b=5.0, c=None, d=True, e='hello', f=(1, (2, 3))):  # pylint: disable=g-bad-name, unused-argument
+      pass
+
+    sig = parser._generate_signature(example_fun, reverse_index={})
+    self.assertEqual(
+        sig, ['a=5', 'b=5.0', 'c=None', 'd=True', "e='hello'", 'f=(1, (2, 3))'])
+
+  def test_dotted_name(self):
+    if sys.version_info >= (3, 0):
+      print('Warning: Doc generation is not supported from python3.')
+      return
+
+    # pylint: disable=g-bad-name
+    class a(object):
+
+      class b(object):
+
+        class c(object):
+
+          class d(object):
+
+            def __init__(self, *args):
+              pass
+    # pylint: enable=g-bad-name
+
+    e = {'f': 1}
+
+    def example_fun(arg1=a.b.c.d, arg2=a.b.c.d(1, 2), arg3=e['f']):  # pylint: disable=unused-argument
+      pass
+
+    sig = parser._generate_signature(example_fun, reverse_index={})
+    self.assertEqual(sig, ['arg1=a.b.c.d', 'arg2=a.b.c.d(1, 2)', "arg3=e['f']"])
+
+
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index fa2cf15cb16ec3396089b5f52ce8718fd05f94a0..cad0567b9e9acb586203cf105b40df9c0094bc61 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -26,14 +26,12 @@ cc_library(
     copts = tf_copts(),
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:tensorflow",
     ],
 )
 
@@ -44,11 +42,46 @@ tf_cc_test(
     deps = [
         ":transform_utils",
         "//tensorflow/cc:cc_ops",
-        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+cc_library(
+    name = "file_utils",
+    srcs = [
+        "file_utils.cc",
+    ],
+    hdrs = [
+        "file_utils.h",
+    ],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+    ],
+)
+
+tf_cc_test(
+    name = "file_utils_test",
+    size = "small",
+    srcs = ["file_utils_test.cc"],
+    deps = [
+        ":file_utils",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -60,6 +93,7 @@ cc_library(
     srcs = [
         "add_default_attributes.cc",
         "backports.cc",
+        "fake_quantize_training.cc",
         "fold_batch_norms.cc",
         "fold_constants_lib.cc",
         "fold_old_batch_norms.cc",
@@ -109,6 +143,7 @@ tf_cc_test(
     srcs = [
         "add_default_attributes_test.cc",
         "backports_test.cc",
+        "fake_quantize_training_test.cc",
         "fold_batch_norms_test.cc",
         "fold_constants_test.cc",
         "fold_old_batch_norms_test.cc",
@@ -152,6 +187,7 @@ cc_library(
     copts = tf_copts(),
     visibility = ["//visibility:public"],
     deps = [
+        ":file_utils",
         ":transform_utils",
         ":transforms_lib",
         "//tensorflow/core:framework_internal",
@@ -213,6 +249,7 @@ cc_library(
     copts = tf_copts(),
     visibility = ["//visibility:public"],
     deps = [
+        ":file_utils",
         ":transform_utils",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
@@ -238,9 +275,9 @@ cc_binary(
     linkstatic = 1,
     visibility = ["//visibility:public"],
     deps = [
+        ":file_utils",
         ":transform_utils",
         "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
     ],
@@ -250,7 +287,12 @@ py_library(
     name = "transform_graph_py",
     srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
-    deps = ["//tensorflow/python:pywrap_tensorflow"],
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:util",
+    ],
 )
 
 tf_py_test(
diff --git a/tensorflow/tools/graph_transforms/README.md b/tensorflow/tools/graph_transforms/README.md
index b4274e67df315aaa094413ff2576e7c58bd610ff..66e0ba60ebcb994eb20910b8db4bb96cbcf9e319 100644
--- a/tensorflow/tools/graph_transforms/README.md
+++ b/tensorflow/tools/graph_transforms/README.md
@@ -578,10 +578,14 @@ eight-bit form.
 
 ### quantize_weights
 
-Args: None \
+Args:
+
+*   minimum_size: Tensors with fewer elements than this won't be quantized
+(defaults to 1024)
+
 Prerequisites: None
 
-Converts any large (more than 15 element) float Const op into an eight-bit
+Converts any large (more than minimum_size) float Const op into an eight-bit
 equivalent, followed by a float conversion op so that the result is usable by
 subsequent nodes. This is mostly useful for [shrinking file
 sizes](#shrinking-file-size), but also helps with the more advanced
@@ -760,7 +764,7 @@ heart, all of the transforms take in a valid GraphDef, make some changes, and
 output a new GraphDef. Each GraphDef is just a list of NodeDefs, each defining
 one node in the graph and its connections. You can find more information on the
 format at [this guide to TensorFlow model
-files](https://www.tensorflow.org/versions/master/how_tos/tool_developers/index.html),
+files](https://www.tensorflow.org/versions/master/extend/tool_developers/index.html),
 but for a simple example take a look at
 [tensorflow/tools/graph_transforms/rename_op.cc](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/graph_transforms/rename_op.cc),
 which implements the [rename_op](#rename_op) transform:
diff --git a/tensorflow/tools/graph_transforms/compare_graphs.cc b/tensorflow/tools/graph_transforms/compare_graphs.cc
index 8fce16337f7a875835c6f5e5aeaf19a6627a3a13..28a80a885f86fed1f0f30d0ecdc87c9dbb7ba27c 100644
--- a/tensorflow/tools/graph_transforms/compare_graphs.cc
+++ b/tensorflow/tools/graph_transforms/compare_graphs.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/core/util/equal_graph_def.h"
+#include "tensorflow/tools/graph_transforms/file_utils.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/fake_quantize_training.cc b/tensorflow/tools/graph_transforms/fake_quantize_training.cc
new file mode 100644
index 0000000000000000000000000000000000000000..321de47db1f1e5b305c91378917dab14f9912748
--- /dev/null
+++ b/tensorflow/tools/graph_transforms/fake_quantize_training.cc
@@ -0,0 +1,50 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/graph/quantize_training.h"
+#include "tensorflow/tools/graph_transforms/transform_utils.h"
+
+namespace tensorflow {
+namespace graph_transforms {
+
+// Rewrites the GraphDef for quantized training.
+// Rewrites the forward pass to include the precision loss with quantization so
+// the model can learn to deal with such loss and achieve better accuracy when
+// it is quantized later for inference.
+// Quantization range information is collected in FakeQuantizeWithMinMaxVars
+// ops.
+//
+// TODO(suharshs): Provide instructions on converting the resulting graph for
+// inference.
+// TODO(suharshs): Implement this using the GTT rather than calling the old
+// prototype function.
+Status FakeQuantizeTraining(const GraphDef& input_graph_def,
+                            const TransformFuncContext& context,
+                            GraphDef* output_graph_def) {
+  // TODO(suharshs): Make num_bits a parameter.
+  const int32 num_bits = 8;
+  // TODO(suharshs): Make quantization op a parameter?
+  const string quant_op_type = "FakeQuantWithMinMaxVars";
+
+  return DoQuantizeTrainingOnGraphDef(input_graph_def, num_bits, quant_op_type,
+                                      output_graph_def);
+}
+
+REGISTER_GRAPH_TRANSFORM("fake_quantize_training", FakeQuantizeTraining);
+
+}  // namespace graph_transforms
+}  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/fake_quantize_training_test.cc b/tensorflow/tools/graph_transforms/fake_quantize_training_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3ea7f512c6760c2e7d7b5870f17df9361e2488f6
--- /dev/null
+++ b/tensorflow/tools/graph_transforms/fake_quantize_training_test.cc
@@ -0,0 +1,63 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/math_ops.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/tools/graph_transforms/transform_utils.h"
+
+namespace tensorflow {
+namespace graph_transforms {
+
+// Declare here, so we don't need a public header.
+Status FakeQuantizeTraining(const GraphDef& input_graph_def,
+                            const TransformFuncContext& context,
+                            GraphDef* output_graph_def);
+
+class FakeQuantizeTrainingTest : public ::testing::Test {};
+
+// For now, since the fake_quantize_training transform just calls the
+// quantize_training rewrite from tensorflow/core/graph/quantize_training.h,
+// we just test that the graph has been changed by the transform.
+// TODO(suharshs): Once we implement the fake_quantize_training transform
+// using the GTT, write proper tests of the transform here.
+TEST_F(FakeQuantizeTrainingTest, TransformOccurred) {
+  auto root = tensorflow::Scope::NewRootScope();
+  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+
+  Tensor a_data(DT_FLOAT, TensorShape());
+  test::FillIota<float>(&a_data, 1.0f);
+  Output a_const = Const(root.WithOpName("a"), Input::Initializer(a_data));
+
+  Tensor b_data(DT_FLOAT, TensorShape());
+  test::FillIota<float>(&b_data, 1.0f);
+  Output b_const = Const(root.WithOpName("b"), Input::Initializer(b_data));
+
+  Output matmul = MatMul(root.WithOpName("matmul"), a_const, b_const);
+  GraphDef graph_def;
+  TF_ASSERT_OK(root.ToGraphDef(&graph_def));
+
+  GraphDef result;
+  TransformFuncContext context;
+  TF_ASSERT_OK(FakeQuantizeTraining(graph_def, context, &result));
+
+  // Test that the transformation resulted in a graph with more nodes.
+  EXPECT_GT(result.node_size(), graph_def.node_size());
+}
+
+}  // namespace graph_transforms
+}  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/file_utils.cc b/tensorflow/tools/graph_transforms/file_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5649c971982bd7a3db2f856f4219c8f6cc1aa811
--- /dev/null
+++ b/tensorflow/tools/graph_transforms/file_utils.cc
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/tools/graph_transforms/file_utils.h"
+
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+namespace graph_transforms {
+
+Status LoadTextOrBinaryGraphFile(const string& file_name, GraphDef* graph_def) {
+  string file_data;
+  Status load_file_status =
+      ReadFileToString(Env::Default(), file_name, &file_data);
+  if (!load_file_status.ok()) {
+    errors::AppendToMessage(&load_file_status, " (for file ", file_name, ")");
+    return load_file_status;
+  }
+  // Try to load in binary format first, and then try ascii if that fails.
+  Status load_status = ReadBinaryProto(Env::Default(), file_name, graph_def);
+  if (!load_status.ok()) {
+    if (protobuf::TextFormat::ParseFromString(file_data, graph_def)) {
+      load_status = Status::OK();
+    } else {
+      errors::AppendToMessage(&load_status,
+                              " (both text and binary parsing failed for file ",
+                              file_name, ")");
+    }
+  }
+  return load_status;
+}
+
+}  // namespace graph_transforms
+}  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/file_utils.h b/tensorflow/tools/graph_transforms/file_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..4737e95abcec3694d426e0c3c3a7112c2c5b6bd1
--- /dev/null
+++ b/tensorflow/tools/graph_transforms/file_utils.h
@@ -0,0 +1,32 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FILE_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FILE_UTILS_H_
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace graph_transforms {
+
+// First tries to load the file as a text protobuf, if that fails tries to parse
+// it as a binary protobuf, and returns an error if both fail.
+Status LoadTextOrBinaryGraphFile(const string& file_name, GraphDef* graph_def);
+
+}  // namespace graph_transforms
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FILE_UTILS_H_
diff --git a/tensorflow/tools/graph_transforms/file_utils_test.cc b/tensorflow/tools/graph_transforms/file_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8c898ba0f1683d21e69e5be2fa6a8ab60bb10e31
--- /dev/null
+++ b/tensorflow/tools/graph_transforms/file_utils_test.cc
@@ -0,0 +1,83 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/tools/graph_transforms/file_utils.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/image_ops.h"
+#include "tensorflow/cc/ops/nn_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/util/equal_graph_def.h"
+
+namespace tensorflow {
+namespace graph_transforms {
+
+class FileUtilsTest : public ::testing::Test {
+ protected:
+  void TestLoadTextOrBinaryGraphFile() {
+    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+    const int width = 10;
+
+    auto root = tensorflow::Scope::NewRootScope();
+    Tensor a_data(DT_FLOAT, TensorShape({width}));
+    test::FillIota<float>(&a_data, 1.0f);
+    Output a_const = Const(root.WithOpName("a"), Input::Initializer(a_data));
+    GraphDef graph_def;
+    TF_ASSERT_OK(root.ToGraphDef(&graph_def));
+
+    const string text_file =
+        io::JoinPath(testing::TmpDir(), "text_graph.pbtxt");
+    TF_ASSERT_OK(WriteTextProto(Env::Default(), text_file, graph_def));
+
+    const string binary_file =
+        io::JoinPath(testing::TmpDir(), "binary_graph.pb");
+    TF_ASSERT_OK(WriteBinaryProto(Env::Default(), binary_file, graph_def));
+
+    const string bogus_file = io::JoinPath(testing::TmpDir(), "bogus_graph.pb");
+    TF_ASSERT_OK(
+        WriteStringToFile(Env::Default(), bogus_file, "Not a !{ proto..."));
+
+    GraphDef text_graph_def;
+    TF_EXPECT_OK(LoadTextOrBinaryGraphFile(text_file, &text_graph_def));
+    string text_diff;
+    EXPECT_TRUE(EqualGraphDef(text_graph_def, graph_def, &text_diff))
+        << text_diff;
+
+    GraphDef binary_graph_def;
+    TF_EXPECT_OK(LoadTextOrBinaryGraphFile(binary_file, &binary_graph_def));
+    string binary_diff;
+    EXPECT_TRUE(EqualGraphDef(binary_graph_def, graph_def, &binary_diff))
+        << binary_diff;
+
+    GraphDef no_graph_def;
+    EXPECT_FALSE(
+        LoadTextOrBinaryGraphFile("____non_existent_file_____", &no_graph_def)
+            .ok());
+
+    GraphDef bogus_graph_def;
+    EXPECT_FALSE(LoadTextOrBinaryGraphFile(bogus_file, &bogus_graph_def).ok());
+  }
+};
+
+TEST_F(FileUtilsTest, TestLoadTextOrBinaryGraphFile) {
+  TestLoadTextOrBinaryGraphFile();
+}
+
+}  // namespace graph_transforms
+}  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/fold_constants_lib.cc b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
index 79472ae554998ceae1eef73577c5b289a857690a..f97e4854183d24000717716ade5a9177a11ead5f 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_lib.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
@@ -152,9 +152,19 @@ Status FoldConstants(const GraphDef& input_graph_def,
       &input_graph, context.input_names, context.output_names, {},
       device_attributes, false /* use_function_convention */, &metadata));
   bool was_mutated;
-  TF_RETURN_IF_ERROR(ConstantFold(ConstantFoldingOptions(), nullptr,
-                                  Env::Default(), nullptr, &input_graph,
-                                  &was_mutated));
+  // Exclude specified nodes from constant folding.
+  ConstantFoldingOptions cf_opts;
+  if (context.params.count("exclude_op") > 0) {
+    const auto& excluded_nodes = context.params.at("exclude_op");
+    const std::set<string> excluded_nodes_set(excluded_nodes.begin(),
+                                              excluded_nodes.end());
+    cf_opts.consider = [excluded_nodes_set](const Node* n) {
+      return excluded_nodes_set.find(n->op_def().name()) ==
+             excluded_nodes_set.end();
+    };
+  }
+  TF_RETURN_IF_ERROR(ConstantFold(cf_opts, nullptr, Env::Default(), nullptr,
+                                  &input_graph, &was_mutated));
   GraphDef folded_graph_def;
   input_graph.ToGraphDef(&folded_graph_def);
   GraphDef send_recvs_replaced;
diff --git a/tensorflow/tools/graph_transforms/fold_constants_test.cc b/tensorflow/tools/graph_transforms/fold_constants_test.cc
index 902f92952a6405ad6eed3f61364f6e127bfda8cb..14e2c01c7c2a5032860992d9a4956816cce1bed0 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_test.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <utility>
+
 #include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/image_ops.h"
 #include "tensorflow/cc/ops/nn_ops.h"
@@ -69,11 +71,46 @@ class ConstantFoldingTest : public ::testing::Test {
     test::FillIota<float>(&placeholder_tensor, 1.0f);
     TestConstantFolding(graph_def,
                         {{"placeholder_expect_remains", placeholder_tensor}},
-                        {"output_expect_remains"});
+                        {}, {"output_expect_remains"});
+  }
+
+  void TestOpExclusionAdd() {
+    auto root = tensorflow::Scope::NewRootScope();
+    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+
+    const int width = 100;
+
+    Tensor a_data(DT_FLOAT, TensorShape({width}));
+    test::FillIota<float>(&a_data, 1.0f);
+    Output a_const =
+        Const(root.WithOpName("a_expect_remains"), Input::Initializer(a_data));
+
+    Tensor b_data(DT_FLOAT, TensorShape({width}));
+    test::FillIota<float>(&b_data, 1.0f);
+    Output b_const =
+        Const(root.WithOpName("b_expect_remains"), Input::Initializer(b_data));
+
+    Output add = Add(root.WithOpName("add_expect_remains"), a_const, b_const);
+
+    Output placeholder =
+        Placeholder(root.WithOpName("placeholder_expect_remains"), DT_FLOAT);
+
+    Output mul =
+        Mul(root.WithOpName("output_expect_remains"), add, placeholder);
+
+    GraphDef graph_def;
+    TF_ASSERT_OK(root.ToGraphDef(&graph_def));
+
+    Tensor placeholder_tensor(DT_FLOAT, TensorShape({width}));
+    test::FillIota<float>(&placeholder_tensor, 1.0f);
+    TestConstantFolding(graph_def,
+                        {{"placeholder_expect_remains", placeholder_tensor}},
+                        {"Add"}, {"output_expect_remains"});
   }
 
   void TestConstantFolding(const GraphDef& graph_def,
                            std::vector<std::pair<string, Tensor> > inputs,
+                           std::vector<string> excluded_ops,
                            const std::vector<string>& outputs) {
     std::unique_ptr<tensorflow::Session> unfolded_session(
         tensorflow::NewSession(tensorflow::SessionOptions()));
@@ -87,6 +124,7 @@ class ConstantFoldingTest : public ::testing::Test {
       context.input_names.push_back(input.first);
     }
     context.output_names = outputs;
+    context.params["exclude_op"] = std::move(excluded_ops);
     TF_ASSERT_OK(
         graph_transforms::FoldConstants(graph_def, context, &folded_graph_def));
 
@@ -203,6 +241,8 @@ class ConstantFoldingTest : public ::testing::Test {
 
 TEST_F(ConstantFoldingTest, TestSimpleAdd) { TestSimpleAdd(); }
 
+TEST_F(ConstantFoldingTest, TestOpExclusionAdd) { TestOpExclusionAdd(); }
+
 TEST_F(ConstantFoldingTest, TestReplaceSendRecvs) { TestReplaceSendRecvs(); }
 
 TEST_F(ConstantFoldingTest, TestRemoveUnusedNodes) { TestRemoveUnusedNodes(); }
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index 066727614c8a24329d9d2f45d9dfe946a51b322b..0978c336b49ce8cc72d9fc35af551a7f15ee697f 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -54,7 +54,7 @@ Status FoldOldBatchNorms(const GraphDef& input_graph_def,
     GraphDef replaced_graph_def;
     TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes(
         current_graph_def,  // clang-format off
-      {"BatchNormWithGlobalNormalization",    // batch_norm_node
+      {"BatchNormWithGlobalNormalization|FusedBatchNorm",    // batch_norm_node
         {
           {"Conv2D",                          // conv_node
             {
@@ -74,19 +74,33 @@ Status FoldOldBatchNorms(const GraphDef& input_graph_def,
                             std::vector<NodeDef>* new_nodes) {
           // Find all the nodes we expect in the subgraph.
           const NodeDef& batch_norm_node = match.node;
-          CHECK_EQ("BatchNormWithGlobalNormalization", batch_norm_node.op());
+          // BatchNormWithGlobalNormalization and FusedBatchNorm ops only differ
+          // by input order and attribute names.
+          CHECK(batch_norm_node.op() == "BatchNormWithGlobalNormalization" ||
+                batch_norm_node.op() == "FusedBatchNorm");
+          const bool is_fused = batch_norm_node.op() == "FusedBatchNorm";
+          const int mean_idx = is_fused ? 3 : 1;
+          const int var_idx = is_fused ? 4 : 2;
+          const int beta_idx = is_fused ? 2 : 3;
+          const int gamma_idx = is_fused ? 1 : 4;
+          const string epsilon_attr = is_fused ? "epsilon" : "variance_epsilon";
+          // FusedBatchNorm always scales after normalization.
+          const bool scale_after_normalization =
+              is_fused ||
+              batch_norm_node.attr().at("scale_after_normalization").b();
+
           const NodeDef& conv_node = match.inputs[0].node;
           CHECK_EQ("Conv2D", conv_node.op());
           const NodeDef& input_node = match.inputs[0].inputs[0].node;
           const NodeDef& weights_node = match.inputs[0].inputs[1].node;
           CHECK_EQ("Const", weights_node.op());
-          const NodeDef& mean_node = match.inputs[1].node;
+          const NodeDef& mean_node = match.inputs[mean_idx].node;
           CHECK_EQ("Const", mean_node.op());
-          const NodeDef& variance_node = match.inputs[2].node;
+          const NodeDef& variance_node = match.inputs[var_idx].node;
           CHECK_EQ("Const", variance_node.op());
-          const NodeDef& beta_node = match.inputs[3].node;
+          const NodeDef& beta_node = match.inputs[beta_idx].node;
           CHECK_EQ("Const", beta_node.op());
-          const NodeDef& gamma_node = match.inputs[4].node;
+          const NodeDef& gamma_node = match.inputs[gamma_idx].node;
           CHECK_EQ("Const", gamma_node.op());
 
           // We have a set of vectors that we want to combine into a vector of
@@ -98,9 +112,7 @@ Status FoldOldBatchNorms(const GraphDef& input_graph_def,
           Tensor beta = GetNodeTensorAttr(beta_node, "value");
           Tensor gamma = GetNodeTensorAttr(gamma_node, "value");
           const float variance_epsilon =
-              batch_norm_node.attr().at("variance_epsilon").f();
-          const bool scale_after_normalization =
-              batch_norm_node.attr().at("scale_after_normalization").b();
+              batch_norm_node.attr().at(epsilon_attr).f();
 
           // Make sure all the inputs really are vectors, with as many entries
           // as there are columns in the weights.
@@ -119,16 +131,17 @@ Status FoldOldBatchNorms(const GraphDef& input_graph_def,
               scale_values[i] =
                   (1.0f / sqrtf(variance.flat<float>()(i) + variance_epsilon)) *
                   gamma.flat<float>()(i);
-              offset_values[i] = 0.0f;
             }
           } else {
             for (int i = 0; i < weights_cols; ++i) {
               scale_values[i] =
                   (1.0f / sqrtf(variance.flat<float>()(i) + variance_epsilon));
-              offset_values[i] = (-mean.flat<float>()(i) * scale_values[i]) +
-                                 beta.flat<float>()(i);
             }
           }
+          for (int i = 0; i < weights_cols; ++i) {
+            offset_values[i] = (-mean.flat<float>()(i) * scale_values[i]) +
+                               beta.flat<float>()(i);
+          }
 
           // Multiply the original weights by the scale vector.
           auto weights_matrix = weights.flat_inner_dims<float>();
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
index 1c4958d83c935e4b298b54461e820b15608d7b8e..3be9110b475f97087be18118d2ba0c52d6388c03 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/sendrecv_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -118,11 +119,92 @@ class FoldOldBatchNormsTest : public ::testing::Test {
       EXPECT_NE("BatchNormWithGlobalNormalization", node.op());
     }
   }
+
+  void TestFoldFusedBatchNorms() {
+    auto root = tensorflow::Scope::NewRootScope();
+    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+
+    Tensor input_data(DT_FLOAT, TensorShape({1, 1, 6, 2}));
+    test::FillValues<float>(
+        &input_data, {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f, -1.0f, -4.0f, -2.0f,
+                      -5.0f, -3.0f, -6.0f});
+    Output input_op =
+        Const(root.WithOpName("input_op"), Input::Initializer(input_data));
+
+    Tensor weights_data(DT_FLOAT, TensorShape({1, 2, 2, 2}));
+    test::FillValues<float>(&weights_data,
+                            {1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f});
+    Output weights_op =
+        Const(root.WithOpName("weights_op"), Input::Initializer(weights_data));
+
+    Output conv_op = Conv2D(root.WithOpName("conv_op"), input_op, weights_op,
+                            {1, 1, 1, 1}, "VALID");
+
+    Tensor mean_data(DT_FLOAT, TensorShape({2}));
+    test::FillValues<float>(&mean_data, {10.0f, 20.0f});
+    Output mean_op =
+        Const(root.WithOpName("mean_op"), Input::Initializer(mean_data));
+
+    Tensor variance_data(DT_FLOAT, TensorShape({2}));
+    test::FillValues<float>(&variance_data, {0.25f, 0.5f});
+    Output variance_op = Const(root.WithOpName("variance_op"),
+                               Input::Initializer(variance_data));
+
+    Tensor beta_data(DT_FLOAT, TensorShape({2}));
+    test::FillValues<float>(&beta_data, {0.1f, 0.6f});
+    Output beta_op =
+        Const(root.WithOpName("beta_op"), Input::Initializer(beta_data));
+
+    Tensor gamma_data(DT_FLOAT, TensorShape({2}));
+    test::FillValues<float>(&gamma_data, {1.0f, 2.0f});
+    Output gamma_op =
+        Const(root.WithOpName("gamma_op"), Input::Initializer(gamma_data));
+
+    GraphDef original_graph_def;
+    TF_ASSERT_OK(root.ToGraphDef(&original_graph_def));
+
+    NodeDef batch_norm_node;
+    batch_norm_node.set_op("FusedBatchNorm");
+    batch_norm_node.set_name("output");
+    AddNodeInput("conv_op", &batch_norm_node);
+    AddNodeInput("gamma_op", &batch_norm_node);
+    AddNodeInput("beta_op", &batch_norm_node);
+    AddNodeInput("mean_op", &batch_norm_node);
+    AddNodeInput("variance_op", &batch_norm_node);
+    SetNodeAttr("T", DT_FLOAT, &batch_norm_node);
+    SetNodeAttr("epsilon", 0.00001f, &batch_norm_node);
+    SetNodeAttr("is_training", false, &batch_norm_node);
+    *(original_graph_def.mutable_node()->Add()) = batch_norm_node;
+
+    std::unique_ptr<Session> original_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(original_session->Create(original_graph_def));
+    std::vector<Tensor> original_outputs;
+    TF_ASSERT_OK(original_session->Run({}, {"output"}, {}, &original_outputs));
+
+    GraphDef fused_graph_def;
+    TF_ASSERT_OK(FoldOldBatchNorms(original_graph_def, {{}, {"output"}},
+                                   &fused_graph_def));
+
+    std::unique_ptr<Session> fused_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(fused_session->Create(fused_graph_def));
+    std::vector<Tensor> fused_outputs;
+    TF_ASSERT_OK(fused_session->Run({}, {"output"}, {}, &fused_outputs));
+
+    test::ExpectTensorNear<float>(original_outputs[0], fused_outputs[0], 1e-5);
+
+    for (const NodeDef& node : fused_graph_def.node()) {
+      EXPECT_NE("FusedBatchNorm", node.op());
+    }
+  }
 };
 
 TEST_F(FoldOldBatchNormsTest, TestFoldOldBatchNorms) {
   TestFoldOldBatchNorms();
 }
 
+TEST_F(FoldOldBatchNormsTest, TestFoldFusedBatchNorms) {
+  TestFoldFusedBatchNorms();
+}
+
 }  // namespace graph_transforms
 }  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/quantize_nodes.cc b/tensorflow/tools/graph_transforms/quantize_nodes.cc
index da064377ac3f2919e0d0421099d9407a35518e22..2b85e7e83c6f3e2c8d0840f0b9eb0b4992a8b113 100644
--- a/tensorflow/tools/graph_transforms/quantize_nodes.cc
+++ b/tensorflow/tools/graph_transforms/quantize_nodes.cc
@@ -119,6 +119,13 @@ const std::vector<QuantizedOpInfo>& GetQuantizedOpList() {
        DT_QUINT8,
        {},
        QuantizedOpInfo::CONTIGUOUS_MIN_MAX},
+      {"ResizeBilinear",
+       {"align_corners"},
+       {{"T", DT_QUINT8}},
+       DT_QUINT8,
+       DT_QUINT8,
+       {1},
+       QuantizedOpInfo::CONTIGUOUS_MIN_MAX},
       {"Relu6",
        {},
        {{"Tinput", DT_QUINT8}},
diff --git a/tensorflow/tools/graph_transforms/quantize_nodes_test.cc b/tensorflow/tools/graph_transforms/quantize_nodes_test.cc
index d02655f3f9cb5093a9c542e90aef2f8069e6e1dd..eca263a1ae0dbfad51565b1d3d0d26b066704fc8 100644
--- a/tensorflow/tools/graph_transforms/quantize_nodes_test.cc
+++ b/tensorflow/tools/graph_transforms/quantize_nodes_test.cc
@@ -106,8 +106,8 @@ class QuantizeNodesTest : public ::testing::Test {
     // Reshape is not included here because it can be added as part of the
     // quantization process.
     const std::set<string> quantizable_ops = {
-        "Add",  "BiasAdd", "Concat",  "Conv2D",  "MatMul",
-        "Relu", "Relu6",   "AvgPool", "MaxPool", "Mul"};
+        "Add",   "BiasAdd",        "Concat",  "Conv2D",  "MatMul", "Relu",
+        "Relu6", "ResizeBilinear", "AvgPool", "MaxPool", "Mul"};
     for (const NodeDef& node : quantized_graph_def.node()) {
       EXPECT_EQ(0, quantizable_ops.count(node.op()))
           << "Found quantizable node " << node.op() << " for node named "
@@ -652,6 +652,33 @@ class QuantizeNodesTest : public ::testing::Test {
     EXPECT_EQ("requantize_op", node_map.at("final_dequantize")->input(0));
   }
 
+  void TestQuantizeResizeBilinear() {
+    auto root = tensorflow::Scope::NewRootScope();
+    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+
+    Tensor size_tensor(DT_INT32, TensorShape({2}));
+    test::FillValues<int32>(&size_tensor, {256, 256});
+
+    Output constant_op = Const(root.WithOpName("size_tensor_op"),
+                               Input::Initializer(size_tensor));
+
+    Output placeholder_op =
+        Placeholder(root.WithOpName("placeholder_op"), DT_FLOAT);
+
+    Output resize_bilinear_op = ResizeBilinear(
+        root.WithOpName("resize_bilinear_op"), placeholder_op, constant_op);
+
+    GraphDef float_graph_def;
+    TF_ASSERT_OK(root.ToGraphDef(&float_graph_def));
+
+    Tensor input_tensor(DT_FLOAT, {1, 128, 128, 3});
+    test::FillFn<float>(&input_tensor, [](int) { return 100.0f; });
+
+    TestQuantizedVersusFloatGraph(float_graph_def,
+                                  {{"placeholder_op", input_tensor}},
+                                  {"resize_bilinear_op"});
+  }
+
   void TestRemoveRedundantQuantizationWithMultipleOutputs() {
     auto root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
@@ -1446,6 +1473,10 @@ TEST_F(QuantizeNodesTest, TestQuantizeAvgPool) { TestQuantizeAvgPool(); }
 
 TEST_F(QuantizeNodesTest, TestQuantizeReshape) { TestQuantizeReshape(); }
 
+TEST_F(QuantizeNodesTest, TestQuantizeResizeBilinear) {
+  TestQuantizeResizeBilinear();
+}
+
 TEST_F(QuantizeNodesTest, TestRemoveRedundantQuantization) {
   TestRemoveRedundantQuantization();
 }
diff --git a/tensorflow/tools/graph_transforms/quantize_weights.cc b/tensorflow/tools/graph_transforms/quantize_weights.cc
index 66d800f0da1f49a2026a71927d6910e18e87f2f5..cccae8a992a64b0f49798eda71513a2fe62ad656 100644
--- a/tensorflow/tools/graph_transforms/quantize_weights.cc
+++ b/tensorflow/tools/graph_transforms/quantize_weights.cc
@@ -35,11 +35,15 @@ namespace graph_transforms {
 Status QuantizeWeights(const GraphDef& input_graph_def,
                        const TransformFuncContext& context,
                        GraphDef* output_graph_def) {
+  int32 minimum_size;
+  TF_RETURN_IF_ERROR(
+      context.GetOneInt32Parameter("minimum_size", 1024, &minimum_size));
   TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes(
       input_graph_def, {"Const"},
-      [](const NodeMatch& match, const std::set<string>& input_nodes,
-         const std::set<string>& output_nodes,
-         std::vector<NodeDef>* new_nodes) {
+      [minimum_size](const NodeMatch& match,
+                     const std::set<string>& input_nodes,
+                     const std::set<string>& output_nodes,
+                     std::vector<NodeDef>* new_nodes) {
         const NodeDef& old_const_node = match.node;
         if (!old_const_node.attr().count("dtype")) {
           return errors::InvalidArgument("No 'dtype' attribute for Const node ",
@@ -58,7 +62,7 @@ Status QuantizeWeights(const GraphDef& input_graph_def,
         const size_t num_elements = old_tensor.NumElements();
         // If this isn't a float constant, or it's too small, then reuse the
         // same node with no changes.
-        if ((old_dtype != DT_FLOAT) || (num_elements < 16)) {
+        if ((old_dtype != DT_FLOAT) || (num_elements < minimum_size)) {
           new_nodes->push_back(old_const_node);
           return Status::OK();
         }
diff --git a/tensorflow/tools/graph_transforms/quantize_weights_test.cc b/tensorflow/tools/graph_transforms/quantize_weights_test.cc
index 63c5b5a64d915e99f929e83650ac3d1dd432c6af..e1828831db19e9b449239b08e12e6e78c473552f 100644
--- a/tensorflow/tools/graph_transforms/quantize_weights_test.cc
+++ b/tensorflow/tools/graph_transforms/quantize_weights_test.cc
@@ -70,9 +70,12 @@ class QuantizeWeightsTest : public ::testing::Test {
                    0.3f, 0.4f, 1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f},
                   &original_graph_def);
 
+    TransformFuncContext context;
+    context.output_names = {"output"};
+    context.params["minimum_size"] = {"16"};
     GraphDef quantized_graph_def;
-    TF_ASSERT_OK(QuantizeWeights(original_graph_def, {{}, {"output"}},
-                                 &quantized_graph_def));
+    TF_ASSERT_OK(
+        QuantizeWeights(original_graph_def, context, &quantized_graph_def));
 
     // Verify the structure of the quantized graph.
     std::map<string, const NodeDef*> node_lookup;
@@ -122,9 +125,12 @@ TEST_F(QuantizeWeightsTest, RangesAlwaysIncludeZero) {
                  0.1f, 0.2f, 0.3f, 0.4f, 1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f,
                  0.3f, 0.4f, 1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f},
                 &original_graph_def);
+  TransformFuncContext context;
+  context.output_names = {"output"};
+  context.params["minimum_size"] = {"16"};
   GraphDef quantized_graph_def;
-  TF_ASSERT_OK(QuantizeWeights(original_graph_def, {{}, {"output"}},
-                               &quantized_graph_def));
+  TF_ASSERT_OK(
+      QuantizeWeights(original_graph_def, context, &quantized_graph_def));
 
   std::map<string, const NodeDef*> node_lookup;
   MapNamesToNodes(quantized_graph_def, &node_lookup);
diff --git a/tensorflow/tools/graph_transforms/rename_attribute_test.cc b/tensorflow/tools/graph_transforms/rename_attribute_test.cc
index a0a33e9fc090acea176333ec840e2e6f438ca998..31619d82ad998a48dde7a3c73fba12a16a0360c2 100644
--- a/tensorflow/tools/graph_transforms/rename_attribute_test.cc
+++ b/tensorflow/tools/graph_transforms/rename_attribute_test.cc
@@ -43,17 +43,17 @@ class RenameAttributeTest : public ::testing::Test {
     mul_node1->set_op("Mul");
     mul_node1->add_input("add_node2");
     mul_node1->add_input("add_node3");
-    AddNodeAttr<int32>("foo", 23, mul_node1);
-    AddNodeAttr<string>("bar", "something", mul_node1);
+    AddNodeAttr("foo", 23, mul_node1);
+    AddNodeAttr("bar", "something", mul_node1);
 
     NodeDef* add_node2 = graph_def.add_node();
     add_node2->set_name("add_node2");
     add_node2->set_op("Add");
     add_node2->add_input("const_node1");
     add_node2->add_input("const_node2");
-    AddNodeAttr<int32>("foo", 46, add_node2);
-    AddNodeAttr<int32>("bob", 23, add_node2);
-    AddNodeAttr<string>("bar", "something else", add_node2);
+    AddNodeAttr("foo", 46, add_node2);
+    AddNodeAttr("bob", 23, add_node2);
+    AddNodeAttr("bar", "something else", add_node2);
 
     NodeDef* add_node3 = graph_def.add_node();
     add_node3->set_name("add_node3");
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc
index c441a089ced86e3ad779ae782eeec2e7e59e1e22..937d8c09ff78b0bf8e668bcef978f8e8e4120fdb 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc
@@ -78,13 +78,14 @@ void CreateConstNode(const Tensor& tensor, const string& name,
   node_def->set_name(name);
   SetNodeTensorAttr<float>("value", tensor, node_def);
 }
-}  // namespace
 
-Status SparsifyGather(const GraphDef& input_graph_def,
-                      const TransformFuncContext& context,
-                      GraphDef* output_graph_def) {
+Status SparsifyGatherInternal(const GraphDef& input_graph_def,
+                              const TransformFuncContext& context,
+                              const OpTypePattern& pattern,
+                              GraphDef* output_graph_def) {
   GraphDef current_graph_def = input_graph_def;
   bool any_match_found = false;
+
   // The subgraphs may have overlapping components, therefore GraphMatcher
   // doesn't return all subgraphs in one round -- this has to be multi-round
   // update.
@@ -94,17 +95,7 @@ Status SparsifyGather(const GraphDef& input_graph_def,
     std::vector<string> init_table_node_names;
 
     TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes(
-        current_graph_def,  // clang-format off
-      {"Gather",
-        {
-          {"Identity",
-            {
-              {"Const"}
-            }
-          },
-          {"*"},
-        }
-      },  // clang-format on
+        current_graph_def, pattern,
         [&any_match_found, &init_table_node_names](
             const NodeMatch& match, const std::set<string>& input_nodes,
             const std::set<string>& output_nodes,
@@ -143,6 +134,33 @@ Status SparsifyGather(const GraphDef& input_graph_def,
           //    c. a `default_val` arg, valued at 0
           // clang-format on
           const NodeDef& gather_node = match.node;
+
+          // GatherV2 adds an "axis" parameter. sparsify_gather only supports
+          // axis 0 gathers.
+          if (gather_node.op() == "GatherV2") {
+            // Per the OpTypePattern, the 3rd input to Gather must be a Const.
+            const NodeDef& axis_node = match.inputs[2].node;
+
+            Tensor axis_t;
+            TF_RETURN_IF_ERROR(GetNodeAttr(axis_node, "value", &axis_t));
+            int64 axis = 0;
+            if (axis_t.dtype() == DT_INT32) {
+              axis = axis_t.scalar<int32>()();
+            } else if (axis_t.dtype() == DT_INT64) {
+              axis = axis_t.scalar<int64>()();
+            } else {
+              return tensorflow::errors::FailedPrecondition(
+                  "Gather axis was not int32 or int64.");
+            }
+
+            if (axis != 0) {
+              return tensorflow::errors::FailedPrecondition(
+                  "Transform only applicable to subgraph with GatherV2 over "
+                  "axis 0. Found axis ",
+                  axis, ".");
+            }
+          }
+
           const NodeDef& const_node = match.inputs[0].inputs[0].node;
 
           DataType data_type;
@@ -269,6 +287,45 @@ Status SparsifyGather(const GraphDef& input_graph_def,
   *output_graph_def = current_graph_def;
   return Status::OK();
 }
+}  // namespace
+
+Status SparsifyGather(const GraphDef& input_graph_def,
+                      const TransformFuncContext& context,
+                      GraphDef* output_graph_def) {
+  // clang-format off
+  const OpTypePattern gather_pattern =
+    {"Gather",
+     {
+       {"Identity",
+        {
+          {"Const"}
+        }
+       },
+       {"*"},
+     }
+    };
+  const OpTypePattern gather_v2_pattern =
+    {"GatherV2",
+      {
+        {"Identity",
+          {
+            {"Const"}
+          }
+        },
+        {"*"},
+        // GatherV2's axis must be constant.
+        {"Const"},
+      }
+    };
+  // clang-format on
+
+  GraphDef temp_output;
+  TF_RETURN_IF_ERROR(SparsifyGatherInternal(input_graph_def, context,
+                                            gather_pattern, &temp_output));
+
+  return SparsifyGatherInternal(temp_output, context, gather_v2_pattern,
+                                output_graph_def);
+}
 
 REGISTER_GRAPH_TRANSFORM("sparsify_gather", SparsifyGather);
 
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
index 8d353d34763a7f362e4e3164b5a93c504bee6fbe..c999212d6931fda940f4fad6c2b199c0e82c37aa 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
@@ -33,19 +34,33 @@ Status SparsifyGather(const GraphDef& input_graph_def,
 
 class SparsifyGatherTest : public ::testing::Test {
  protected:
-  NodeDef* CreateNode(const string& name, const string& op,
+  NodeDef* CreateNode(const StringPiece name, const StringPiece op,
                       const std::vector<NodeDef*>& inputs,
                       GraphDef* graph_def) {
     NodeDef* node_def = graph_def->add_node();
-    node_def->set_name(name);
-    node_def->set_op(op);
+    node_def->set_name(name.ToString());
+    node_def->set_op(op.ToString());
     std::for_each(inputs.begin(), inputs.end(), [&node_def](NodeDef* input) {
       node_def->add_input(input->name());
     });
     return node_def;
   }
 
-  void TestSinglePartitionConst() {
+  void MakeGather(StringPiece name, bool gather_v2, NodeDef* params,
+                  NodeDef* indices, GraphDef* graph_def) {
+    if (gather_v2) {
+      NodeDef* axis_node =
+          CreateNode(strings::StrCat(name, "_axis"), "Const", {}, graph_def);
+      Tensor axis_t(DT_INT32, TensorShape({}));
+      axis_t.scalar<int32>()() = 0;
+      SetNodeTensorAttr<int32>("value", axis_t, axis_node);
+      CreateNode(name, "GatherV2", {params, indices, axis_node}, graph_def);
+    } else {
+      CreateNode(name, "Gather", {params, indices}, graph_def);
+    }
+  }
+
+  void TestSinglePartitionConst(bool gather_v2) {
     GraphDef graph_def;
 
     // Build the graph.
@@ -59,7 +74,7 @@ class SparsifyGatherTest : public ::testing::Test {
 
     NodeDef* identity_node =
         CreateNode("const/read", "Identity", {const_node}, &graph_def);
-    CreateNode("gather", "Gather", {identity_node, input_node}, &graph_def);
+    MakeGather("gather", gather_v2, identity_node, input_node, &graph_def);
     CreateNode("group_deps", "NoOp", {}, &graph_def);
 
     // Run the op.
@@ -151,7 +166,7 @@ class SparsifyGatherTest : public ::testing::Test {
               node_lookup.at("group_deps")->input().end());
   }
 
-  void TestMultiPartitionConst() {
+  void TestMultiPartitionConst(bool gather_v2) {
     // The 'ids' node is served input for two 'Gather's.
     GraphDef graph_def;
 
@@ -177,8 +192,8 @@ class SparsifyGatherTest : public ::testing::Test {
         CreateNode("const1/read", "Identity", {const_node1}, &graph_def);
     NodeDef* identity_node2 =
         CreateNode("const2/read", "Identity", {const_node2}, &graph_def);
-    CreateNode("gather1", "Gather", {identity_node1, input_node}, &graph_def);
-    CreateNode("gather2", "Gather", {identity_node2, input_node}, &graph_def);
+    MakeGather("gather1", gather_v2, identity_node1, input_node, &graph_def);
+    MakeGather("gather2", gather_v2, identity_node2, input_node, &graph_def);
 
     // Run the op.
     GraphDef result;
@@ -341,11 +356,13 @@ class SparsifyGatherTest : public ::testing::Test {
 };
 
 TEST_F(SparsifyGatherTest, TestSinglePartitionConst) {
-  TestSinglePartitionConst();
+  TestSinglePartitionConst(false);
+  TestSinglePartitionConst(true);
 }
 
 TEST_F(SparsifyGatherTest, TestMultiPartitionConst) {
-  TestMultiPartitionConst();
+  TestMultiPartitionConst(false);
+  TestMultiPartitionConst(true);
 }
 
 }  // namespace graph_transforms
diff --git a/tensorflow/tools/graph_transforms/strip_unused_nodes_test.cc b/tensorflow/tools/graph_transforms/strip_unused_nodes_test.cc
index 4eb074998f71e8c1ff51ea64463ff35660bcedca..c0107014e2cf115aeafe78ca879c0cb169cb335b 100644
--- a/tensorflow/tools/graph_transforms/strip_unused_nodes_test.cc
+++ b/tensorflow/tools/graph_transforms/strip_unused_nodes_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/nn_ops.h"
 #include "tensorflow/cc/ops/sendrecv_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/tools/graph_transforms/summarize_graph_main.cc b/tensorflow/tools/graph_transforms/summarize_graph_main.cc
index 91670f54d49d057bbb5ff894247c79538877ef5f..6c404c8061e199ca37c2d97eefd4fdb235c6b49a 100644
--- a/tensorflow/tools/graph_transforms/summarize_graph_main.cc
+++ b/tensorflow/tools/graph_transforms/summarize_graph_main.cc
@@ -23,13 +23,16 @@ limitations under the License.
 // bazel-bin/tensorflow/tools/graph_transforms/summarize_graph \
 // --in_graph=my_graph.pb
 
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/tools/graph_transforms/file_utils.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
@@ -81,11 +84,17 @@ void PrintBenchmarkUsage(const std::vector<const NodeDef*>& placeholders,
         shape = PartialTensorShape(shape_proto);
       }
     }
-    sizes.reserve(shape.dims());
-    for (int i = 0; i < shape.dims(); ++i) {
-      sizes.push_back(shape.dim_size(i));
+    string sizes_string;
+    if (shape.dims() == -1) {
+      // Unknown shapes can have -1 for dims, so leave these blank.
+      sizes_string = "";
+    } else {
+      sizes.reserve(shape.dims());
+      for (int i = 0; i < shape.dims(); ++i) {
+        sizes.push_back(shape.dim_size(i));
+      }
+      sizes_string = str_util::Join(sizes, ",");
     }
-    string sizes_string = str_util::Join(sizes, ",");
     input_layer_shapes.push_back(sizes_string);
   }
   std::vector<string> output_layers;
@@ -116,7 +125,17 @@ Status PrintStructure(const GraphDef& graph) {
   TF_RETURN_IF_ERROR(SortByExecutionOrder(graph, &sorted_graph));
   for (const NodeDef& node : sorted_graph.node()) {
     std::cout << node.name() << " (" << node.op() << "): ["
-              << str_util::Join(node.input(), ", ") << "]" << std::endl;
+              << str_util::Join(node.input(), ", ") << "]";
+    if (node.op() == "Const") {
+      Tensor tensor;
+      if (node.attr().count("value") &&
+          tensor.FromProto(node.attr().at("value").tensor())) {
+        std::cout << ", value=" << tensor.DebugString();
+      } else {
+        LOG(WARNING) << "Decoding Tensor failed for node" << node.name();
+      }
+    }
+    std::cout << std::endl;
   }
   return Status::OK();
 }
diff --git a/tensorflow/tools/graph_transforms/transform_graph.cc b/tensorflow/tools/graph_transforms/transform_graph.cc
index e7694104cbded7581529904565a0d13f1a39efba..28387c2b48c06ecffd2afa0705a8dea5bc368460 100644
--- a/tensorflow/tools/graph_transforms/transform_graph.cc
+++ b/tensorflow/tools/graph_transforms/transform_graph.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/tools/graph_transforms/file_utils.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/transform_utils.cc b/tensorflow/tools/graph_transforms/transform_utils.cc
index 0ef517acc5bb6518a54501ea271c21439789da42..bd1e4c90c06f76bbac608940ab792b02e68890d4 100644
--- a/tensorflow/tools/graph_transforms/transform_utils.cc
+++ b/tensorflow/tools/graph_transforms/transform_utils.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/public/session.h"
 
 namespace tensorflow {
 namespace graph_transforms {
@@ -587,28 +586,6 @@ Status GetInOutTypes(const NodeDef& node_def, DataTypeVector* inputs,
   return Status::OK();
 }
 
-Status LoadTextOrBinaryGraphFile(const string& file_name, GraphDef* graph_def) {
-  string file_data;
-  Status load_file_status =
-      ReadFileToString(Env::Default(), file_name, &file_data);
-  if (!load_file_status.ok()) {
-    errors::AppendToMessage(&load_file_status, " (for file ", file_name, ")");
-    return load_file_status;
-  }
-  // Try to load in binary format first, and then try ascii if that fails.
-  Status load_status = ReadBinaryProto(Env::Default(), file_name, graph_def);
-  if (!load_status.ok()) {
-    if (protobuf::TextFormat::ParseFromString(file_data, graph_def)) {
-      load_status = Status::OK();
-    } else {
-      errors::AppendToMessage(&load_status,
-                              " (both text and binary parsing failed for file ",
-                              file_name, ")");
-    }
-  }
-  return load_status;
-}
-
 int TransformFuncContext::CountParameters(const string& name) const {
   if (params.count(name)) {
     return params.at(name).size();
diff --git a/tensorflow/tools/graph_transforms/transform_utils.h b/tensorflow/tools/graph_transforms/transform_utils.h
index 6ed549a9589af2ff287aa199b2cfb113e40bf871..c0fb4924123ca6637ccc18043aab8d9829a298eb 100644
--- a/tensorflow/tools/graph_transforms/transform_utils.h
+++ b/tensorflow/tools/graph_transforms/transform_utils.h
@@ -20,10 +20,12 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
@@ -107,8 +109,8 @@ void FilterGraphDef(const GraphDef& input_graph_def,
                     std::function<bool(const NodeDef&)> selector,
                     GraphDef* output_graph_def);
 
-// Creates a copy of the input graph, with all occurrences of the attributes with
-// the names in the argument removed from the node defs.
+// Creates a copy of the input graph, with all occurrences of the attributes
+// with the names in the argument removed from the node defs.
 void RemoveAttributes(const GraphDef& input_graph_def,
                       const std::vector<string>& attributes,
                       GraphDef* output_graph_def);
@@ -131,10 +133,6 @@ Status IsGraphValid(const GraphDef& graph_def);
 Status GetInOutTypes(const NodeDef& node_def, DataTypeVector* inputs,
                      DataTypeVector* outputs);
 
-// First tries to load the file as a text protobuf, if that fails tries to parse
-// it as a binary protobuf, and returns an error if both fail.
-Status LoadTextOrBinaryGraphFile(const string& file_name, GraphDef* graph);
-
 // This is used to spot particular subgraphs in a larger model. To use it,
 // create a pattern like:
 // OpTypePattern pattern({"Conv2D", {{"ResizeBilinear", {{"MirrorPad"}}}}});
diff --git a/tensorflow/tools/graph_transforms/transform_utils_test.cc b/tensorflow/tools/graph_transforms/transform_utils_test.cc
index d068254b35fd7331f79934139586d3f8d7cd0aff..b5bc2d75fd2726ff5d10026039c07cff7ede2797 100644
--- a/tensorflow/tools/graph_transforms/transform_utils_test.cc
+++ b/tensorflow/tools/graph_transforms/transform_utils_test.cc
@@ -23,8 +23,6 @@ limitations under the License.
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
-#include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
 namespace graph_transforms {
@@ -1066,50 +1064,6 @@ class TransformUtilsTest : public ::testing::Test {
     TF_EXPECT_OK(context.GetOneBoolParameter("not_present", true, &value));
     EXPECT_TRUE(value);
   }
-
-  void TestLoadTextOrBinaryGraphFile() {
-    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
-    const int width = 10;
-
-    auto root = tensorflow::Scope::NewRootScope();
-    Tensor a_data(DT_FLOAT, TensorShape({width}));
-    test::FillIota<float>(&a_data, 1.0f);
-    Output a_const = Const(root.WithOpName("a"), Input::Initializer(a_data));
-    GraphDef graph_def;
-    TF_ASSERT_OK(root.ToGraphDef(&graph_def));
-
-    const string text_file =
-        io::JoinPath(testing::TmpDir(), "text_graph.pbtxt");
-    TF_ASSERT_OK(WriteTextProto(Env::Default(), text_file, graph_def));
-
-    const string binary_file =
-        io::JoinPath(testing::TmpDir(), "binary_graph.pb");
-    TF_ASSERT_OK(WriteBinaryProto(Env::Default(), binary_file, graph_def));
-
-    const string bogus_file = io::JoinPath(testing::TmpDir(), "bogus_graph.pb");
-    TF_ASSERT_OK(
-        WriteStringToFile(Env::Default(), bogus_file, "Not a !{ proto..."));
-
-    GraphDef text_graph_def;
-    TF_EXPECT_OK(LoadTextOrBinaryGraphFile(text_file, &text_graph_def));
-    string text_diff;
-    EXPECT_TRUE(EqualGraphDef(text_graph_def, graph_def, &text_diff))
-        << text_diff;
-
-    GraphDef binary_graph_def;
-    TF_EXPECT_OK(LoadTextOrBinaryGraphFile(binary_file, &binary_graph_def));
-    string binary_diff;
-    EXPECT_TRUE(EqualGraphDef(binary_graph_def, graph_def, &binary_diff))
-        << binary_diff;
-
-    GraphDef no_graph_def;
-    EXPECT_FALSE(
-        LoadTextOrBinaryGraphFile("____non_existent_file_____", &no_graph_def)
-            .ok());
-
-    GraphDef bogus_graph_def;
-    EXPECT_FALSE(LoadTextOrBinaryGraphFile(bogus_file, &bogus_graph_def).ok());
-  }
 };
 
 TEST_F(TransformUtilsTest, TestMapNamesToNodes) { TestMapNamesToNodes(); }
@@ -1206,9 +1160,5 @@ TEST_F(TransformUtilsTest, TestGetOneBoolParameter) {
   TestGetOneBoolParameter();
 }
 
-TEST_F(TransformUtilsTest, TestLoadTextOrBinaryGraphFile) {
-  TestLoadTextOrBinaryGraphFile();
-}
-
 }  // namespace graph_transforms
 }  // namespace tensorflow
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 51ba3b7a0be143a0186269678d508f4f0e95c55c..536437df2b6d1b9a16a9b4d1e218ab6bd01a14e2 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -4,6 +4,7 @@
 package(default_visibility = ["//visibility:private"])
 
 load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_tar")
+load("//third_party/mkl:build_defs.bzl", "if_mkl")
 
 genrule(
     name = "libtensorflow_proto",
@@ -87,6 +88,7 @@ genrule(
         "//third_party/fft2d:LICENSE",
         "@boringssl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
+        "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
@@ -100,10 +102,13 @@ genrule(
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@png_archive//:LICENSE",
-        "@protobuf//:LICENSE",
+        "@protobuf_archive//:LICENSE",
         "@snappy//:COPYING",
         "@zlib_archive//:zlib.h",
-    ],
+    ] + if_mkl([
+        "//third_party/mkl:LICENSE",
+        "@mkl//:LICENSE",
+    ]),
     outs = ["include/tensorflow/c/LICENSE"],
     cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
     tools = [":concat_licenses.sh"],
@@ -117,6 +122,7 @@ genrule(
         "//third_party/fft2d:LICENSE",
         "@boringssl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
+        "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
@@ -130,10 +136,13 @@ genrule(
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@png_archive//:LICENSE",
-        "@protobuf//:LICENSE",
+        "@protobuf_archive//:LICENSE",
         "@snappy//:COPYING",
         "@zlib_archive//:zlib.h",
-    ],
+    ] + if_mkl([
+        "//third_party/mkl:LICENSE",
+        "@mkl//:LICENSE",
+    ]),
     outs = ["include/tensorflow/jni/LICENSE"],
     cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
     tools = [":concat_licenses.sh"],
diff --git a/tensorflow/tools/mlpbtxt/BUILD b/tensorflow/tools/mlpbtxt/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..fc63e9a0b73fd92c63cde5d60bdb9b984922f820
--- /dev/null
+++ b/tensorflow/tools/mlpbtxt/BUILD
@@ -0,0 +1,44 @@
+# Description:
+# This package provides binaries that convert between multi-line and standard
+# pbtxt (text-serialization of protocol message) files.
+
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files([
+    "LICENSE",
+    "placeholder.txt",
+])
+
+cc_binary(
+    name = "tomlpbtxt",
+    srcs = ["tomlpbtxt.cc"],
+    deps = [
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:op_gen_lib",
+    ],
+)
+
+cc_binary(
+    name = "frommlpbtxt",
+    srcs = ["frommlpbtxt.cc"],
+    deps = [
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:op_gen_lib",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/tools/mlpbtxt/frommlpbtxt.cc b/tensorflow/tools/mlpbtxt/frommlpbtxt.cc
new file mode 100644
index 0000000000000000000000000000000000000000..643924b318d3fec850ebd6c8275a2eab4884a644
--- /dev/null
+++ b/tensorflow/tools/mlpbtxt/frommlpbtxt.cc
@@ -0,0 +1,70 @@
+/* Copyright 2017 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdio.h>
+
+#include "tensorflow/core/framework/op_gen_lib.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+namespace {
+
+int Run(int argc, char** argv) {
+  string FLAGS_in = "";
+  string FLAGS_out = "";
+
+  std::vector<Flag> flag_list = {
+      Flag("in", &FLAGS_in, "Input multi-line proto text (.mlpbtxt) file name"),
+      Flag("out", &FLAGS_out, "Output proto text (.pbtxt) file name")};
+
+  // Parse the command-line.
+  const string usage = Flags::Usage(argv[0], flag_list);
+  const bool parse_ok = Flags::Parse(&argc, argv, flag_list);
+  if (argc != 1 || !parse_ok) {
+    printf("%s", usage.c_str());
+    return 2;
+  }
+
+  port::InitMain(argv[0], &argc, &argv);
+
+  // Read the input file --in.
+  string in_contents;
+  Status s = ReadFileToString(Env::Default(), FLAGS_in, &in_contents);
+  if (!s.ok()) {
+    printf("Error reading file %s: %s\n", FLAGS_in.c_str(),
+           s.ToString().c_str());
+    return 1;
+  }
+
+  // Write the output file --out.
+  const string out_contents = PBTxtFromMultiline(in_contents);
+  s = WriteStringToFile(Env::Default(), FLAGS_out, out_contents);
+  if (!s.ok()) {
+    printf("Error writing file %s: %s\n", FLAGS_out.c_str(),
+           s.ToString().c_str());
+    return 1;
+  }
+
+  return 0;
+}
+
+}  // namespace
+}  // namespace tensorflow
+
+int main(int argc, char** argv) { return tensorflow::Run(argc, argv); }
diff --git a/tensorflow/tools/mlpbtxt/tomlpbtxt.cc b/tensorflow/tools/mlpbtxt/tomlpbtxt.cc
new file mode 100644
index 0000000000000000000000000000000000000000..469be49ed3c966c671f1f45619d0a8d88fe519f1
--- /dev/null
+++ b/tensorflow/tools/mlpbtxt/tomlpbtxt.cc
@@ -0,0 +1,81 @@
+/* Copyright 2017 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdio.h>
+
+#include "tensorflow/core/framework/op_gen_lib.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+namespace {
+
+int Run(int argc, char** argv) {
+  string FLAGS_in = "";
+  string FLAGS_out = "";
+  string FLAGS_fields = "description";
+
+  std::vector<Flag> flag_list = {
+      Flag("in", &FLAGS_in, "Input proto text (.pbtxt) file name"),
+      Flag("out", &FLAGS_out,
+           "Output multi-line proto text (.mlpbtxt) file name"),
+      Flag("fields", &FLAGS_fields, "Comma-separated list of field names")};
+
+  // Parse the command-line.
+  const string usage = Flags::Usage(argv[0], flag_list);
+  const bool parse_ok = Flags::Parse(&argc, argv, flag_list);
+  if (argc != 1 || !parse_ok) {
+    printf("%s", usage.c_str());
+    return 2;
+  }
+
+  // Parse the --fields option.
+  std::vector<string> fields =
+      str_util::Split(FLAGS_fields, ',', str_util::SkipEmpty());
+  if (fields.empty()) {
+    printf("--fields must be non-empty.\n%s", usage.c_str());
+    return 2;
+  }
+
+  port::InitMain(argv[0], &argc, &argv);
+
+  // Read the input file --in.
+  string in_contents;
+  Status s = ReadFileToString(Env::Default(), FLAGS_in, &in_contents);
+  if (!s.ok()) {
+    printf("Error reading file %s: %s\n", FLAGS_in.c_str(),
+           s.ToString().c_str());
+    return 1;
+  }
+
+  // Write the output file --out.
+  const string out_contents = PBTxtToMultiline(in_contents, fields);
+  s = WriteStringToFile(Env::Default(), FLAGS_out, out_contents);
+  if (!s.ok()) {
+    printf("Error writing file %s: %s\n", FLAGS_out.c_str(),
+           s.ToString().c_str());
+    return 1;
+  }
+
+  return 0;
+}
+
+}  // namespace
+}  // namespace tensorflow
+
+int main(int argc, char** argv) { return tensorflow::Run(argc, argv); }
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 798338d787551769d94afd9f774a23655a640086..4cd42d79c0600466684e87fcb3d8fd79f8f600c9 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -55,6 +55,15 @@ py_test(
     ],
 )
 
+py_binary(
+    name = "check_load_py_test",
+    srcs = ["check_load_py_test.py"],
+    data = [
+        "//tensorflow:all_opensource_files",
+    ],
+    srcs_version = "PY2AND3",
+)
+
 # On Windows, python binary is a zip file of runfiles tree.
 # Add everything to its data dependency for generating a runfiles tree
 # for building the pip package on Windows.
@@ -73,12 +82,9 @@ py_binary(
         "//tensorflow/python:util_example_parser_configuration",
         "//tensorflow/python/debug:debug_pip",
         "//tensorflow/python/saved_model",
+        "//tensorflow/python:spectral_ops_test_util",
         "//tensorflow/python/tools:tools_pip",
         # These targets don't build on Windows yet. Exclude them for now.
-        # rules_closure currently doesn't build on Windows due to
-        # https://github.com/bazelbuild/rules_closure/pull/206
-        # Since tensorboard dependes on rules_closure, exclude tensorboard until it's fixed.
-        # "//tensorflow/tensorboard",
         # "//tensorflow/contrib/ndlstm",
         # "//tensorflow/contrib/slim",
         # "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
@@ -99,6 +105,7 @@ filegroup(
         "//third_party/hadoop:LICENSE.txt",
         "@boringssl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
+        "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
@@ -112,17 +119,17 @@ filegroup(
         "@libxsmm_archive//:LICENSE",
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
-        "@nanopb_git//:LICENSE.txt",
-        "@org_html5lib//:LICENSE",
-        "@org_mozilla_bleach//:LICENSE",
-        "@org_pocoo_werkzeug//:LICENSE",
-        "@org_pythonhosted_markdown//:LICENSE.md",
+        "@grpc//third_party/nanopb:LICENSE.txt",
         "@png_archive//:LICENSE",
-        "@protobuf//:LICENSE",
+        "@protobuf_archive//:LICENSE",
         "@six_archive//:LICENSE",
         "@snappy//:COPYING",
         "@zlib_archive//:zlib.h",
-    ] + if_not_windows([
+        "@org_python_pypi_backports_weakref//:LICENSE",
+    ] + if_mkl([
+        "//third_party/mkl:LICENSE",
+        "@mkl//:LICENSE",
+    ]) + if_not_windows([
         "@nccl_archive//:LICENSE.txt",
     ]) + tf_additional_license_deps(),
 )
@@ -141,11 +148,14 @@ sh_binary(
             ":included_headers",
             ":simple_console",
             "//tensorflow:tensorflow_py",
+            "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
+            "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
             "//tensorflow/contrib/graph_editor:graph_editor_pip",
             "//tensorflow/contrib/keras:keras",
             "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
             "//tensorflow/contrib/ndlstm:ndlstm",
             "//tensorflow/contrib/nn:nn_py",
+            "//tensorflow/contrib/predictor:predictor_pip",
             "//tensorflow/contrib/session_bundle:session_bundle_pip",
             "//tensorflow/contrib/signal:signal_py",
             "//tensorflow/contrib/slim:slim",
@@ -154,14 +164,19 @@ sh_binary(
             "//tensorflow/contrib/specs:specs",
             "//tensorflow/contrib/tensor_forest:init_py",
             "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
+            "//tensorflow/contrib/timeseries:timeseries_pip",
+            "//tensorflow/contrib/tpu:tpu_estimator",
+            "//tensorflow/contrib/tpu:tpu_helper_library",
+            "//tensorflow/contrib/tpu:tpu_py",
             "//tensorflow/examples/tutorials/mnist:package",
             "//tensorflow/python:distributed_framework_test_lib",
             "//tensorflow/python:meta_graph_testdata",
             "//tensorflow/python:util_example_parser_configuration",
             "//tensorflow/python/debug:debug_pip",
             "//tensorflow/python/saved_model:saved_model",
+            "//tensorflow/python:spectral_ops_test_util",
             "//tensorflow/python/tools:tools_pip",
-            "//tensorflow/tensorboard",
+            "//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
         ],
     }) + if_mkl(["//third_party/mkl:intel_binary_blob"]),
 )
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index 8b9a6b3de05c6639a85e1d0437cfa2639d142b92..ff7db52cb0e1ea48498e06f8c808373a1bfd2dce 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -17,6 +17,10 @@
 
 set -e
 
+function real_path() {
+  [[ $1 = /* ]] && echo "$1" || echo "$PWD/${1#./}"
+}
+
 function cp_external() {
   local src_dir=$1
   local dest_dir=$2
@@ -41,7 +45,7 @@ function main() {
     exit 1
   fi
 
-  DEST=$1
+  DEST=$(real_path $1)
   TMPDIR=$(mktemp -d -t tmp.XXXXXXXXXX)
 
   GPU_FLAG=""
@@ -79,23 +83,6 @@ function main() {
       bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles \
       "${TMPDIR}/external"
     RUNFILES=bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles/org_tensorflow
-  elif [ ! -d bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow ]; then
-    # Really old (0.2.1-) runfiles, without workspace name.
-    cp -R \
-      bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/tensorflow \
-      "${TMPDIR}"
-    mkdir "${TMPDIR}/external"
-    cp_external \
-      bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/external \
-      "${TMPDIR}/external"
-    RUNFILES=bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles
-    # Copy MKL libs over so they can be loaded at runtime
-    if [ -d bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/_solib_k8/_U_S_Sthird_Uparty_Smkl_Cintel_Ubinary_Ublob___Uthird_Uparty_Smkl ]; then
-      mkdir "${TMPDIR}/_solib_k8"
-  		cp -R \
-  			bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/_solib_k8/_U_S_Sthird_Uparty_Smkl_Cintel_Ubinary_Ublob___Uthird_Uparty_Smkl \
-        "${TMPDIR}/_solib_k8"
-    fi
   else
     if [ -d bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/external ]; then
       # Old-style runfiles structure (--legacy_external_runfiles).
@@ -107,11 +94,13 @@ function main() {
         bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/external \
         "${TMPDIR}/external"
       # Copy MKL libs over so they can be loaded at runtime
-      if [ -d bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/_solib_k8/_U_S_Sthird_Uparty_Smkl_Cintel_Ubinary_Ublob___Uthird_Uparty_Smkl ]; then
-        mkdir "${TMPDIR}/_solib_k8"
-        cp -R \
-          bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/_solib_k8/_U_S_Sthird_Uparty_Smkl_Cintel_Ubinary_Ublob___Uthird_Uparty_Smkl \
-          "${TMPDIR}/_solib_k8"
+      so_lib_dir="bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/_solib_k8"
+      if [ -d ${so_lib_dir} ]; then
+        mkl_so_dir=$(ls ${so_lib_dir} | grep mkl)
+        if [ $? -eq 0 ]; then
+          mkdir "${TMPDIR}/_solib_k8"
+          cp -R ${so_lib_dir}/${mkl_so_dir} "${TMPDIR}/_solib_k8"
+        fi
       fi
     else
       # New-style runfiles structure (--nolegacy_external_runfiles).
@@ -124,11 +113,13 @@ function main() {
         bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles \
         "${TMPDIR}/external"
       # Copy MKL libs over so they can be loaded at runtime
-      if [ -d bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/_solib_k8/_U_S_Sthird_Uparty_Smkl_Cintel_Ubinary_Ublob___Uthird_Uparty_Smkl ]; then
-        mkdir "${TMPDIR}/_solib_k8"
-    		cp -R \
-    			bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/_solib_k8/_U_S_Sthird_Uparty_Smkl_Cintel_Ubinary_Ublob___Uthird_Uparty_Smkl \
-          "${TMPDIR}/_solib_k8"
+      so_lib_dir="bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/_solib_k8"
+      if [ -d ${so_lib_dir} ]; then
+        mkl_so_dir=$(ls ${so_lib_dir} | grep mkl)
+        if [ $? -eq 0 ]; then
+          mkdir "${TMPDIR}/_solib_k8"
+          cp -R ${so_lib_dir}/${mkl_so_dir} "${TMPDIR}/_solib_k8"
+        fi
       fi
     fi
     RUNFILES=bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow
@@ -139,7 +130,7 @@ function main() {
   mkdir -p ${TMPDIR}/google
   mkdir -p ${TMPDIR}/third_party
   pushd ${RUNFILES%org_tensorflow}
-  for header in $(find protobuf -name \*.h); do
+  for header in $(find protobuf_archive -name \*.h); do
     mkdir -p "${TMPDIR}/google/$(dirname ${header})"
     cp "$header" "${TMPDIR}/google/$(dirname ${header})/"
   done
diff --git a/tensorflow/tools/pip_package/check_load_py_test.py b/tensorflow/tools/pip_package/check_load_py_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a132a8de3ea596a2fbd8e661308f1603778666b
--- /dev/null
+++ b/tensorflow/tools/pip_package/check_load_py_test.py
@@ -0,0 +1,89 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests to check that py_test are properly loaded in BUILD files."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import subprocess
+
+
+def check_output_despite_error(args):
+  """Get output of args from command line, even if there are errors.
+
+  Args:
+    args: a list of command line args.
+
+  Returns:
+    output as string.
+  """
+  try:
+    output = subprocess.check_output(args, stderr=subprocess.STDOUT)
+  except subprocess.CalledProcessError as e:
+    output = e.output
+  return output.strip()
+
+
+def main():
+  # Get all py_test target, note bazel query result will also include
+  # cuda_py_test etc.
+  try:
+    targets = subprocess.check_output(
+        'bazel query "kind(py_test, //tensorflow/contrib/... + '
+        '//tensorflow/python/... - '
+        '//tensorflow/contrib/tensorboard/...)"',
+        shell=True).strip()
+  except subprocess.CalledProcessError as e:
+    targets = e.output
+
+  # Only keep py_test targets, and filter out targets with 'no_pip' tag.
+  valid_targets = []
+  for target in targets.split('\n'):
+    kind = check_output_despite_error(['buildozer', 'print kind', target])
+    if kind == 'py_test':
+      tags = check_output_despite_error(['buildozer', 'print tags', target])
+      if 'no_pip' not in tags:
+        valid_targets.append(target)
+
+  # Get all BUILD files for all valid targets.
+  build_files = set()
+  for target in valid_targets:
+    build_files.add(os.path.join(target[2:].split(':')[0], 'BUILD'))
+
+  # Check if BUILD files load py_test.
+  files_missing_load = []
+  for build_file in build_files:
+    updated_build_file = subprocess.check_output(
+        'buildozer -stdout "new_load //tensorflow:tensorflow.bzl py_test" ' +
+        build_file,
+        shell=True)
+    with open(build_file, 'r') as f:
+      if f.read() != updated_build_file:
+        files_missing_load.append(build_file)
+
+  if files_missing_load:
+    raise RuntimeError('The following files are missing %s:\n %s' % (
+        'load("//tensorflow:tensorflow.bzl", "py_test").\nThis load statement'
+        ' is needed because otherwise pip tests will try to use their '
+        'dependencies, which are not visible to them.',
+        '\n'.join(files_missing_load)))
+  else:
+    print('TEST PASSED.')
+
+
+if __name__ == '__main__':
+  main()
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index dec08157c2c3edf9e632227fb54a50abf3b1b49d..83909d83ae4c45404419745ef7982649e7f416f5 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -60,6 +60,12 @@ BLACKLIST = [
     "//tensorflow/contrib/framework:checkpoint_ops_testdata",
     "//tensorflow/contrib/bayesflow:reinforce_simple_example",
     "//tensorflow/contrib/bayesflow:examples/reinforce_simple/reinforce_simple_example.py",  # pylint:disable=line-too-long
+    "//tensorflow/contrib/timeseries/examples:predict",
+    "//tensorflow/contrib/timeseries/examples:multivariate",
+    "//tensorflow/contrib/timeseries/examples:known_anomaly",
+    "//tensorflow/contrib/timeseries/examples:data/period_trend.csv",  # pylint:disable=line-too-long
+    "//tensorflow/contrib/timeseries/python/timeseries:test_utils",
+    "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:test_utils",  # pylint:disable=line-too-long
 ]
 
 
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 3bbc9000429a0c871470dc575e5ff8718a622378..0b0ee4c857224d239776e7a77b051221fc0d0a3b 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,17 +29,13 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.2.0-rc2'
+_VERSION = '1.3.0-rc0'
 
 REQUIRED_PACKAGES = [
     'numpy >= 1.11.0',
     'six >= 1.10.0',
-    'protobuf >= 3.2.0',
-    'werkzeug >= 0.11.10',
-    'html5lib == 0.9999999',  # identical to 1.0b8
-    'markdown == 2.2.0',
-    'bleach == 1.5.0',
-    'backports.weakref == 1.0rc1',
+    'protobuf >= 3.3.0',
+    'tensorflow-tensorboard',
 ]
 
 project_name = 'tensorflow'
@@ -57,9 +53,12 @@ else:
   # mock comes with unittest.mock for python3, need to install for python2
   REQUIRED_PACKAGES.append('mock >= 2.0.0')
 
+# weakref.finalize was introduced in Python 3.4
+if sys.version_info < (3, 4):
+  REQUIRED_PACKAGES.append('backports.weakref >= 1.0rc1')
+
 # pylint: disable=line-too-long
 CONSOLE_SCRIPTS = [
-    'tensorboard = tensorflow.tensorboard.tensorboard:main',
     'saved_model_cli = tensorflow.python.tools.saved_model_cli:main',
 ]
 # pylint: enable=line-too-long
@@ -114,7 +113,7 @@ class InstallHeaders(Command):
     install_dir = os.path.join(self.install_dir, os.path.dirname(header))
     # Get rid of some extra intervening directories so we can have fewer
     # directories for -I
-    install_dir = re.sub('/google/protobuf/src', '', install_dir)
+    install_dir = re.sub('/google/protobuf_archive/src', '', install_dir)
 
     # Copy eigen code into tensorflow/include.
     # A symlink would do, but the wheel file that gets created ignores
@@ -165,7 +164,7 @@ else:
 
 headers = (list(find_files('*.h', 'tensorflow/core')) +
            list(find_files('*.h', 'tensorflow/stream_executor')) +
-           list(find_files('*.h', 'google/protobuf/src')) +
+           list(find_files('*.h', 'google/protobuf_archive/src')) +
            list(find_files('*', 'third_party/eigen3')) +
            list(find_files('*', 'external/eigen_archive')))
 
@@ -191,8 +190,6 @@ setup(
     package_data={
         'tensorflow': [
             EXTENSION_NAME,
-            'tensorboard/components/index.html',
-            'tensorboard/TAG',
         ] + matches,
     },
     zip_safe=False,
diff --git a/tensorflow/tools/quantization/BUILD b/tensorflow/tools/quantization/BUILD
index cb41185219c56f9a0d834a2e4b5b71c57b46810a..e99ad06a06294c4d037b76ea9450e51bd795e79d 100644
--- a/tensorflow/tools/quantization/BUILD
+++ b/tensorflow/tools/quantization/BUILD
@@ -13,7 +13,20 @@ py_library(
     name = "quantize_graph_lib",
     srcs = ["quantize_graph.py"],
     srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:graph_util",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:session",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
+        "//third_party/py/numpy",
+    ],
 )
 
 py_binary(
@@ -27,18 +40,17 @@ py_binary(
         "//tensorflow/python:client",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:graph_util",
         "//tensorflow/python:platform",
+        "//tensorflow/python:tensor_util",
         "//third_party/py/numpy",
-        "@six_archive//:six",
     ],
 )
 
 py_test(
     name = "quantize_graph_test",
     size = "small",
-    srcs = [
-        "quantize_graph_test.py",
-    ],
+    srcs = ["quantize_graph_test.py"],
     srcs_version = "PY2AND3",
     tags = ["nomsan"],  # http://b/32242946
     deps = [
@@ -48,6 +60,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:graph_util",
         "//tensorflow/python:platform",
         "//third_party/py/numpy",
     ],
@@ -55,12 +68,13 @@ py_test(
 
 py_binary(
     name = "graph_to_dot",
-    srcs = [
-        "graph_to_dot.py",
-    ],
+    srcs = ["graph_to_dot.py"],
     main = "graph_to_dot.py",
     srcs_version = "PY2AND3",
-    deps = ["//tensorflow/python:platform"],
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:platform",
+    ],
 )
 
 filegroup(
diff --git a/tensorflow/tools/test/BUILD b/tensorflow/tools/test/BUILD
index 9367bcd4a3457d7387ee8dc17a4d19043fa8c9a2..28d651e9106b29058824c06b160df2b9b5781757 100644
--- a/tensorflow/tools/test/BUILD
+++ b/tensorflow/tools/test/BUILD
@@ -22,6 +22,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
         "//tensorflow/python:errors",
         "//tensorflow/python:platform",
@@ -46,6 +47,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":system_info_lib",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:platform",
     ],
 )
@@ -54,8 +56,10 @@ py_binary(
     name = "run_and_gather_logs",
     srcs = ["run_and_gather_logs.py"],
     srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
     deps = [
         ":run_and_gather_logs_lib",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:platform",
     ],
diff --git a/tensorflow/tools/test/performance.bzl b/tensorflow/tools/test/performance.bzl
index 2956c6dde74ff38a7f000d6b6b595beaa397fa76..64fff844a70d439306c9bcf7f21d5a6047fa428a 100644
--- a/tensorflow/tools/test/performance.bzl
+++ b/tensorflow/tools/test/performance.bzl
@@ -28,7 +28,7 @@ def tf_cc_logged_benchmark(
       name = name,
       tags = all_tags,
       size = "large",
-      srcs = ["//tensorflow/tools/test:run_and_gather_logs.py"],
+      srcs = ["//tensorflow/tools/test:run_and_gather_logs"],
       args = [
           "--name=//%s:%s" % (PACKAGE_NAME, name),
           "--test_name=" + target,
diff --git a/tensorflow/tools/test/run_and_gather_logs_lib.py b/tensorflow/tools/test/run_and_gather_logs_lib.py
index 570e09f1659526198a20db8cb87971a51f353d2b..c798dd5de7532d87387da598a1e7332370e41bed 100644
--- a/tensorflow/tools/test/run_and_gather_logs_lib.py
+++ b/tensorflow/tools/test/run_and_gather_logs_lib.py
@@ -135,7 +135,7 @@ def run_and_gather_logs(name, test_name, test_args,
   gpu_config = gpu_info_lib.gather_gpu_devices()
   if gpu_config:
     gpu_name = gpu_config[0].model
-    gpu_short_name_match = re.search(r"Tesla [KP][4,8]0", gpu_name)
+    gpu_short_name_match = re.search(r"Tesla (K40|K80|P100)", gpu_name)
     if gpu_short_name_match:
       gpu_short_name = gpu_short_name_match.group(0)
       test_adjusted_name = name + "|" + gpu_short_name.replace(" ", "_")
diff --git a/tensorflow/tools/test/upload_test_benchmarks.py b/tensorflow/tools/test/upload_test_benchmarks.py
index 829333e05629946fc5627d37301883d70572b1be..77cc9f75f7725438918f681833d58e9ecb4a2f70 100644
--- a/tensorflow/tools/test/upload_test_benchmarks.py
+++ b/tensorflow/tools/test/upload_test_benchmarks.py
@@ -162,7 +162,7 @@ def upload_benchmark_data(client, data):
   t_val.update({
       "test": test_name,
       "start": start_time,
-      "info": unicode(test_result)
+      "info": unicode(data)
   })
   batch.append(t_val)
 
diff --git a/tensorflow/tools/test/upload_test_benchmarks_index.yaml b/tensorflow/tools/test/upload_test_benchmarks_index.yaml
index 8cd33a1da60cad1c1a0e21998b4eefc81babfd8e..ec7f76f6663b3e586b4b63e92eb576740cd445f9 100644
--- a/tensorflow/tools/test/upload_test_benchmarks_index.yaml
+++ b/tensorflow/tools/test/upload_test_benchmarks_index.yaml
@@ -27,7 +27,7 @@ indexes:
   properties:
   - name: test
   - name: start
-    direction: asc
+    direction: desc
 
 # Index to access a specific (test, entry, start) Entity, and also to be able to
 # fetch a range of (start, timing) graph values for a given (test, entry) pair
diff --git a/tensorflow/tools/tfprof/README.md b/tensorflow/tools/tfprof/README.md
deleted file mode 100644
index 54f3cd62f283de853bb1b14e61c96c81f77702b5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/tfprof/README.md
+++ /dev/null
@@ -1,122 +0,0 @@
-# tfprof: TensorFlow Profiler and Beyond
-
-### Features
-
-* Profile model architectures
-  * parameters, tensor shapes, float operations, device placement, etc.
-* Profile model performance
-  * execution time, memory consumption
-  * Profile multiple steps.
-* Auto detect and advise. (Experimental)
-
-### Interfaces
-
-* Python API
-* Command Line
-* Visualization
-* C++ API (Not public, contact us if needed.)
-
-### Views and Options
-
-tfprof provides 4 different views to organize the statistics.
-
-    *  code view: operations are grouped by Python codes that generate them.
-    *  op view: operations are grouped by operation type (E.g. MatMul, Conv2D).
-    *  scope view: operations are organized based on name scope hierarchies.
-    *  graph view: operations are organized based on input/output.
-
-tfprof provides options to help user select, filter and order statistics.
-See [Options](g3doc/options.md) for detail instructions.
-
-```
--max_depth                  10
--min_bytes                  0
--min_micros                 0
--min_params                 0
--min_float_ops              0
--min_occurrence             0
--step                       -1
--order_by                   name
--account_type_regexes       .*
--start_name_regexes         .*
--trim_name_regexes
--show_name_regexes          .*
--hide_name_regexes
--account_displayed_op_only  false
--select                     params
--output                     stdout:
-```
-
-### Tutorials
-
-*  [Python API](g3doc/python_api.md)
-*  [Command Line Interface](g3doc/command_line.md)
-*  [Profile Time](g3doc/profile_time.md)
-*  [Profile Memory](g3doc/profile_memory.md)
-*  [Profile Model Architecture](g3doc/profile_model_architecture.md)
-*  [Auto Detect and Advise](g3doc/advise.md)
-*  [Options](g3doc/options.md)
-
-## Demo
-
-### Attribute TensorFlow graph running time to your Python codes.
-```shell
-tfprof> code -max_depth 1000 -show_name_regexes .*model_analyzer.*py.* -select micros -account_type_regexes .* -order_by micros
-_TFProfRoot (0us/22.44ms)
-  model_analyzer_test.py:149:run_filename_as_m...:none (0us/22.44ms)
-    model_analyzer_test.py:33:_run_code_in_main:none (0us/22.44ms)
-      model_analyzer_test.py:208:<module>:test.main() (0us/22.44ms)
-        model_analyzer_test.py:132:testComplexCodeView:x = lib.BuildFull... (0us/22.44ms)
-          model_analyzer_testlib.py:63:BuildFullModel:return sgd_op.min... (0us/21.83ms)
-          model_analyzer_testlib.py:58:BuildFullModel:cell, array_ops.c... (0us/333us)
-          model_analyzer_testlib.py:54:BuildFullModel:seq.append(array_... (0us/254us)
-            model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0us/134us)
-            model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0us/40us)
-            ...
-          model_analyzer_testlib.py:61:BuildFullModel:loss = nn_ops.l2_... (0us/28us)
-          model_analyzer_testlib.py:60:BuildFullModel:target = array_op... (0us/0us)
-        model_analyzer_test.py:134:testComplexCodeView:sess.run(variable... (0us/0us)
-```
-
-### Show your model variables and the number of parameters.
-```
-tfprof> scope -account_type_regexes VariableV2 -max_depth 4 -select params
-_TFProfRoot (--/930.58k params)
-  global_step (1/1 params)
-  init/init_conv/DW (3x3x3x16, 432/864 params)
-  pool_logit/DW (64x10, 640/1.28k params)
-    pool_logit/DW/Momentum (64x10, 640/640 params)
-  pool_logit/biases (10, 10/20 params)
-    pool_logit/biases/Momentum (10, 10/10 params)
-  unit_last/final_bn/beta (64, 64/128 params)
-  unit_last/final_bn/gamma (64, 64/128 params)
-  unit_last/final_bn/moving_mean (64, 64/64 params)
-  unit_last/final_bn/moving_variance (64, 64/64 params)
-```
-
-### Show the most expensive operation types.
-```
-tfprof> op -select micros,bytes,occurrence -order_by micros
-SoftmaxCrossEntropyWithLogits      36.58MB (100.00%, 0.05%),      1.37sec (100.00%, 23.56%),         30
-MatMul                        2720.57MB (99.95%, 3.66%),      988.90ms (76.44%, 17.05%),       3450
-ConcatV2                       741.37MB (96.29%, 1.00%),       421.44ms (59.38%, 7.27%),       6098
-Mul                           3957.24MB (95.29%, 5.33%),       418.90ms (52.12%, 7.22%),       9427
-Add                            740.05MB (89.96%, 1.00%),       335.26ms (44.89%, 5.78%),       2180
-Sub                             32.46MB (88.97%, 0.04%),       216.44ms (39.11%, 3.73%),       4372
-AddN                           733.21MB (88.92%, 0.99%),       208.46ms (35.38%, 3.59%),       5481
-Slice                          708.07MB (87.94%, 0.95%),       205.27ms (31.78%, 3.54%),       7277
-Fill                           954.27MB (86.98%, 1.28%),       154.50ms (28.24%, 2.66%),       9686
-Select                         312.33MB (85.70%, 0.42%),       123.04ms (25.58%, 2.12%),       5746
-Sigmoid                        152.57MB (85.28%, 0.21%),        96.66ms (23.46%, 1.67%),       2970
-```
-
-### Visualize time and memory.
-<left>
-[CodeTimeline](g3doc/graph_timeline.png)
-</left>
-
-### Teams
-
-* Xin Pan (xpan@google.com, github: panyx0718)
-* Jon Shlens
-* Yao Zhang
diff --git a/tensorflow/tools/tfprof/g3doc/advise.md b/tensorflow/tools/tfprof/g3doc/advise.md
deleted file mode 100644
index 3bce6270ff8368fb57d183c6f4c6a88f5dd5bc07..0000000000000000000000000000000000000000
--- a/tensorflow/tools/tfprof/g3doc/advise.md
+++ /dev/null
@@ -1,44 +0,0 @@
-## Auto Detect and Advise
-
-tfprof analyzes profiles and generates advises for common issues.
-
-### Run Advise.
-```python
-# First create a profiler. See profiler tutorials for more details.
-profiler = model_analyzer.Profiler(sess.graph)
-run_meta = config_pb2.RunMetadata()
-_ = sess.run(r1,
-             options=config_pb2.RunOptions(
-                 trace_level=config_pb2.RunOptions.FULL_TRACE),
-             run_metadata=run_meta)
-profiler.add_step(1, run_meta)
-
-# Start advise.
-profiler.advise()
-```
-
-### Checker
-
-There is no magic behind advise mode. tfprof builds the profiles first, then
-it runs through a list of `Checkers`, each one responsible for checking one
-area with the profile and report issues. A `Checker` is like a plugin.
-
-For example:
-
-####JobChecker (Not Available OSS)
-* Checking RecvTensor RPC latency and bandwidth.
-* Checking CPU/Memory utilization of the job.
-
-####AcceleratorUtilization Checker
-* Checks what percentage of time the accelerator spends on computation.
-
-####Operation Checker
-* Check whether the operation runs with optimal options.
-* Checks if there is a better implementation to replace the current operation.
-
-####Contribute Your Checker
-
-Follow examples of accelerator_utilization_checker.h
-
-
-
diff --git a/tensorflow/tools/tfprof/g3doc/options.md b/tensorflow/tools/tfprof/g3doc/options.md
deleted file mode 100644
index 78c72bf5eddab24dc6e967adf8ef5c4a82c0b98f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/tfprof/g3doc/options.md
+++ /dev/null
@@ -1,86 +0,0 @@
-##Options
-
-###Overview
-
-For all tfprof views, the statistics are processed with the following procedures
-
-1) An in-memory data structure is used represent the view.
-
-2) `-account_type_regexes` is used to first select the operations that match
-   the specified operation types. An operation has its default type
-   (e.g. MatMul, Conv2D). `tfprof` also considers device as operation type.
-   User can also define customized operation type. Hence, an operation has
-   multiple types. Operations with matched
-   types are selected for display and their statistics are aggregated
-   by the in-memory data structure.
-
-3) Various `-xxx_name_regexes`,  `-min_xxx`, `-max_depth` etc options are then
-   applied to further filter based on names and values.
-   It's no limited operation name. In code view,
-   it's the code trace. In op view, it's the operation type name. Different
-   from `-account_type_regexes`, Statistics are used even if a name is not displayed.
-   For example, in code view, a callee might be hidden, but its statistics is
-   still aggregated by it's caller. `-account_displayed_op_only`, however,
-   breaks the rule and only use statistics of displayed names.
-
-4) Finally, the filtered data structure is displayed in a format depending
-   on the `-output` option.
-
-####Option Semantics In Different View
-options usually have the same semantics in different views. However, some
-can vary. For example `-max_depth` in scope view means the depth of
-name scope <b>tree</b>. In op view, it means the length of operation <b>list</b>.
-In graph view, in means the number of hops in the <b>graph</b>.
-
-
-###Docs
-
-`-max_depth`: Show ops that are at most this number of hops from starting op in the tree/graph structure.
-
-`-min_bytes`: Show ops that request at least this number of bytes.
-
-`-min_micros`: Show ops that spend at least this number of microseconds to run.
-
-`-min_params`: Show ops that contains at least this number of parameters.
-
-`-min_float_ops`: Show ops that contain at least this number of float operations. Only available if an op has op.RegisterStatistics() defined and OpLog is provided
-
-`-min_occurrence`: Show ops that appear at least this number of times. Only available in "op" view.
-
-`-step`: Show the stats of the this step when multiple steps of RunMetadata were added. By default, show the average of all steps."
-
-`-order_by`: Order the results by [name|depth|bytes|micros|params|float_ops|occurrence]
-
-`-account_type_regexes`: Account and display the ops whose types match one of the type regexes specified. tfprof allow user to define extra op types for ops through tensorflow.tfprof.OpLog proto. regexes are comma-sperated.
-
-`-start_name_regexes`: Show ops starting from the ops that matches the regexes, recursively. regexes are comma-separated.
-
-`-trim_name_regexes`: Hide ops starting from the ops that matches the regexes, recursively, regexes are comma-seprated.
-
-`-show_name_regexes`: Show ops that match the regexes. regexes are comma-seprated.
-
-`-hide_name_regexes`: Hide ops that match the regexes. regexes are comma-seprated.
-
-Notes: For each op, `-account_type_regexes` is first evaluated, only ops with
-types matching the specified regexes are accounted and selected for displayed.
-`-start/trim/show/hide_name_regexes` are used to further filter ops for display.
-`-start_name_regexes` is evaluated first to search the starting ops to display.
-Descendants of starting ops are then evaluated against `-show/hide_name_regexes`
-to make display decision. If an op matches trim_name_regexes, all its
-descendants are hidden. Ops statistics are *accounted even if they are hidden*
-as long as they match the `-account_xxx` options.
-
-`-account_displayed_op_only`: If True, only account the statistics of ops eventually displayed. If False, account all op statistics matching -account_type_regexes recursively.
-
-`-select`: Comma-separated list of metrics to show:
-[bytes|micros|params|float_ops|occurrence|tensor_value|device|op_types|input_shapes].
-
-`-output`: Output results as stdout, file or timeline.
-The format is ```output_type:key=value,key=value```.
-For example: ```-output timeline:outfile=<filename>```.
-
-```shell
-timeline: key=outfile, value=<filename>.
-stdout: none.
-file: key=outfile, value=<filename>.
-```
diff --git a/tensorflow/tools/tfprof/internal/advisor/checker.h b/tensorflow/tools/tfprof/internal/advisor/checker.h
deleted file mode 100644
index b8b057be5b1d6410acfb3e8607693e303a6f963c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/tfprof/internal/advisor/checker.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_ADVISOR_CHECKER_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_ADVISOR_CHECKER_H_
-
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_stats.h"
-
-namespace tensorflow {
-namespace tfprof {
-
-static const char* const kLevel[] = {
-    "NOTE",     // Good to know.
-    "SUGGEST",  // Might get better.
-    "WARN",     // Please do it for better.
-};
-
-class Checker {
- public:
-  virtual ~Checker(){};
-
-  virtual string name() = 0;
-
-  std::vector<string> Run(const TFStats* stats) { return Check(stats); }
-
- protected:
-  // Returns a vector of string, each one being an advice.
-  virtual std::vector<string> Check(const TFStats* stats) = 0;
-};
-}  // namespace tfprof
-}  // namespace tensorflow
-
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_ADVISOR_CHECKER_H_
diff --git a/tensorflow/tools/tfprof/internal/advisor/tfprof_advisor.h b/tensorflow/tools/tfprof/internal/advisor/tfprof_advisor.h
deleted file mode 100644
index 856f51545921283799a87e053c96a19d0ee4387d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/tfprof/internal/advisor/tfprof_advisor.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
-
-#include "tensorflow/tools/tfprof/internal/advisor/accelerator_utilization_checker.h"
-#include "tensorflow/tools/tfprof/internal/advisor/checker.h"
-#include "tensorflow/tools/tfprof/internal/advisor/internal_checker_runner.h"
-#include "tensorflow/tools/tfprof/internal/advisor/operation_checker.h"
-
-namespace tensorflow {
-namespace tfprof {
-
-// The Advisor runs a list of Checkers, each checks a specific area.
-class Advisor {
- public:
-  Advisor(const TFStats* stats) : stats_(stats) {}
-
-  std::map<string, std::vector<string>> Advise() {
-    // Note: Release a checker's memory ASAP.
-    std::map<string, std::vector<string>> reports = RunInternalCheckers(stats_);
-    // TODO(xpan): Think of a way to turn off/on specific checkers.
-    AcceleratorUtilizationChecker au_checker;
-    reports[au_checker.name()] = au_checker.Run(stats_);
-    OperationChecker op_checker;
-    reports[op_checker.name()] = op_checker.Run(stats_);
-
-    for (const auto& checker_r : reports) {
-      fprintf(stdout, "%s reports:\n", checker_r.first.c_str());
-      for (const auto& r : checker_r.second) {
-        fprintf(stdout, "%s\n", r.c_str());
-      }
-    }
-    fflush(stdout);
-    return reports;
-  }
-
- private:
-  const TFStats* stats_;
-};
-
-}  // namespace tfprof
-}  // namespace tensorflow
-
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show_test.cc b/tensorflow/tools/tfprof/internal/tfprof_show_test.cc
deleted file mode 100644
index 498477de0a00f828b07a6a955e05722d6a79d433..0000000000000000000000000000000000000000
--- a/tensorflow/tools/tfprof/internal/tfprof_show_test.cc
+++ /dev/null
@@ -1,118 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/tools/tfprof/internal/tfprof_stats.h"
-
-#include <utility>
-
-#include "tensorflow/c/checkpoint_reader.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/protobuf/config.pb.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
-#include "tensorflow/tools/tfprof/tfprof_log.pb.h"
-#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
-
-namespace tensorflow {
-namespace tfprof {
-class TFProfShowTest : public ::testing::Test {
- protected:
-  TFProfShowTest() {
-    string graph_path =
-        io::JoinPath(testing::TensorFlowSrcRoot(),
-                     "tools/tfprof/internal/testdata/graph.pbtxt");
-    std::unique_ptr<tensorflow::GraphDef> graph_pb(new tensorflow::GraphDef());
-    TF_CHECK_OK(ReadGraphDef(Env::Default(), graph_path, graph_pb.get()));
-
-    std::unique_ptr<tensorflow::RunMetadata> run_meta_pb(
-        new tensorflow::RunMetadata());
-    string run_meta_path =
-        io::JoinPath(testing::TensorFlowSrcRoot(),
-                     "tools/tfprof/internal/testdata/run_meta");
-    TF_CHECK_OK(
-        ReadBinaryProto(Env::Default(), run_meta_path, run_meta_pb.get()));
-
-    std::unique_ptr<OpLog> op_log_pb(new OpLog());
-    string op_log_path =
-        io::JoinPath(testing::TensorFlowSrcRoot(),
-                     "tools/tfprof/internal/testdata/tfprof_log");
-    TF_CHECK_OK(ReadBinaryProto(Env::Default(), op_log_path, op_log_pb.get()));
-
-    string ckpt_path = io::JoinPath(testing::TensorFlowSrcRoot(),
-                                    "tools/tfprof/internal/testdata/ckpt");
-    TF_Status* status = TF_NewStatus();
-    std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader(
-        new checkpoint::CheckpointReader(ckpt_path, status));
-    CHECK(TF_GetCode(status) == TF_OK);
-    TF_DeleteStatus(status);
-
-    tf_stats_.reset(new TFStats(std::move(graph_pb), std::move(run_meta_pb),
-                                std::move(op_log_pb), std::move(ckpt_reader)));
-  }
-
-  std::unique_ptr<TFStats> tf_stats_;
-};
-
-TEST_F(TFProfShowTest, DumpScopeMode) {
-  string dump_file = io::JoinPath(testing::TmpDir(), "dump");
-  Options opts(5, 0, 0, 0, 0, 0, -1, "name",
-               {"VariableV2"},  // accout_type_regexes
-               {".*"}, {""}, {".*"}, {""}, false,
-               {"params", "bytes", "micros", "float_ops"}, "file",
-               {{"outfile", dump_file}});
-  tf_stats_->ShowGraphNode("scope", opts);
-
-  string dump_str;
-  TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file, &dump_str));
-  EXPECT_EQ(
-      "node name | # parameters | # float_ops | output bytes | execution "
-      "time\n_TFProfRoot (--/370 params, --/0 flops, --/1.48KB, --/5us)\n  "
-      "conv2d (--/140 params, --/0 flops, --/560B, --/2us)\n    conv2d/bias "
-      "(5, 5/5 params, 0/0 flops, 20B/20B, 1us/1us)\n    conv2d/kernel "
-      "(3x3x3x5, 135/135 params, 0/0 flops, 540B/540B, 1us/1us)\n  conv2d_1 "
-      "(--/230 params, --/0 flops, --/920B, --/3us)\n    conv2d_1/bias (5, 5/5 "
-      "params, 0/0 flops, 20B/20B, 1us/1us)\n    conv2d_1/kernel (3x3x5x5, "
-      "225/225 params, 0/0 flops, 900B/900B, 2us/2us)\n",
-      dump_str);
-}
-
-TEST_F(TFProfShowTest, DumpOpMode) {
-  string dump_file = io::JoinPath(testing::TmpDir(), "dump");
-  Options opts(
-      5, 0, 0, 0, 0, 4, -1, "params", {".*"},  // accout_type_regexes
-      {".*"}, {""}, {".*"}, {""}, false,
-      {"params", "bytes", "micros", "float_ops", "occurrence", "input_shapes"},
-      "file", {{"outfile", dump_file}});
-  tf_stats_->ShowMultiGraphNode("op", opts);
-
-  string dump_str;
-  TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file, &dump_str));
-  EXPECT_EQ(
-      "nodename|outputbytes|executiontime|#parameters|#float_ops|opoccurrence|"
-      "inputshapes\nVariableV21.48KB(100.00%,17.10%),5us(100.00%,5.15%),"
-      "370params(100.00%,100.00%),0float_ops(100.00%,0.00%),4\n\ninput_type:\t("
-      "*4)\texec_time:5us\n\nAssign0B(0.00%,0.00%),0us(94.85%,0.00%),0params(0."
-      "00%,0.00%),0float_ops(100.00%,0.00%),8\n\ninput_type:0:unknown,\t1:"
-      "unknown\t(*8)\texec_time:0us\n\nConst1.54KB(58.87%,17.74%),1us(80.41%,1."
-      "03%),0params(0.00%,0.00%),0float_ops(98.49%,0.00%),24\n\ninput_type:\t(*"
-      "24)\texec_time:1us\n\n",
-      StringReplace(dump_str, " ", ""));
-}
-}  // namespace tfprof
-}  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc b/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc
deleted file mode 100644
index a1e500f94929ce4b04c2ea2aabb4b6e13acd2202..0000000000000000000000000000000000000000
--- a/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc
+++ /dev/null
@@ -1,247 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/tools/tfprof/internal/tfprof_stats.h"
-
-#include <utility>
-
-#include "tensorflow/c/checkpoint_reader.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/protobuf.h"
-#include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/protobuf/config.pb.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
-#include "tensorflow/tools/tfprof/tfprof_log.pb.h"
-#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
-
-namespace tensorflow {
-namespace tfprof {
-class TFProfStatsTest : public ::testing::Test {
- protected:
-  TFProfStatsTest() {
-    string graph_path =
-        io::JoinPath(testing::TensorFlowSrcRoot(),
-                     "tools/tfprof/internal/testdata/graph.pbtxt");
-    std::unique_ptr<tensorflow::GraphDef> graph_pb(new tensorflow::GraphDef());
-    TF_CHECK_OK(ReadGraphDef(Env::Default(), graph_path, graph_pb.get()));
-
-    std::unique_ptr<tensorflow::RunMetadata> run_meta_pb(
-        new tensorflow::RunMetadata());
-    string run_meta_path =
-        io::JoinPath(testing::TensorFlowSrcRoot(),
-                     "tools/tfprof/internal/testdata/run_meta");
-    TF_CHECK_OK(
-        ReadBinaryProto(Env::Default(), run_meta_path, run_meta_pb.get()));
-
-    std::unique_ptr<OpLog> op_log_pb(new OpLog());
-    string op_log_path =
-        io::JoinPath(testing::TensorFlowSrcRoot(),
-                     "tools/tfprof/internal/testdata/tfprof_log");
-    TF_CHECK_OK(ReadBinaryProto(Env::Default(), op_log_path, op_log_pb.get()));
-
-    string ckpt_path = io::JoinPath(testing::TensorFlowSrcRoot(),
-                                    "tools/tfprof/internal/testdata/ckpt");
-    TF_Status* status = TF_NewStatus();
-    std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader(
-        new checkpoint::CheckpointReader(ckpt_path, status));
-    CHECK(TF_GetCode(status) == TF_OK);
-    TF_DeleteStatus(status);
-
-    tf_stats_.reset(new TFStats(std::move(graph_pb), std::move(run_meta_pb),
-                                std::move(op_log_pb), std::move(ckpt_reader)));
-  }
-
-  std::unique_ptr<TFStats> tf_stats_;
-};
-
-TEST_F(TFProfStatsTest, CustomOpType) {
-  Options opts(3, 0, 0, 0, 0, 0, -1, "name",
-               {kTrainableVarType},  // accout_type_regexes
-               {".*"}, {""}, {".*"}, {""}, false,
-               {"params", "bytes", "micros", "float_ops"}, "", {});
-  const TFGraphNodeProto& root = tf_stats_->ShowGraphNode("scope", opts);
-
-  TFGraphNodeProto expected;
-  CHECK(protobuf::TextFormat::ParseFromString(
-      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
-      "0\ntotal_exec_micros: 5\ntotal_requested_bytes: 1480\ntotal_parameters: "
-      "370\nchildren {\n  name: \"conv2d\"\n  exec_micros: 0\n  "
-      "requested_bytes: 0\n  total_exec_micros: 2\n  total_requested_bytes: "
-      "560\n  total_parameters: 140\n  children {\n    name: \"conv2d/bias\"\n "
-      "   exec_micros: 1\n    requested_bytes: 20\n    parameters: 5\n    "
-      "total_exec_micros: 1\n    total_requested_bytes: 20\n    "
-      "total_parameters: 5\n    devices: "
-      "\"/job:localhost/replica:0/task:0/cpu:0\"\n    float_ops: 0\n    "
-      "total_float_ops: 0\n  }\n  children {\n    name: \"conv2d/kernel\"\n    "
-      "exec_micros: 1\n    requested_bytes: 540\n    parameters: 135\n    "
-      "total_exec_micros: 1\n    total_requested_bytes: 540\n    "
-      "total_parameters: 135\n    devices: "
-      "\"/job:localhost/replica:0/task:0/cpu:0\"\n    float_ops: 0\n    "
-      "total_float_ops: 0\n  }\n  float_ops: 0\n  total_float_ops: "
-      "0\n}\nchildren {\n  name: \"conv2d_1\"\n  exec_micros: 0\n  "
-      "requested_bytes: 0\n  total_exec_micros: 3\n  total_requested_bytes: "
-      "920\n  total_parameters: 230\n  children {\n    name: "
-      "\"conv2d_1/bias\"\n    exec_micros: 1\n    requested_bytes: 20\n    "
-      "parameters: 5\n    total_exec_micros: 1\n    total_requested_bytes: "
-      "20\n    total_parameters: 5\n    devices: "
-      "\"/job:localhost/replica:0/task:0/cpu:0\"\n    float_ops: 0\n    "
-      "total_float_ops: 0\n  }\n  children {\n    name: \"conv2d_1/kernel\"\n  "
-      "  exec_micros: 2\n    requested_bytes: 900\n    parameters: 225\n    "
-      "total_exec_micros: 2\n    total_requested_bytes: 900\n    "
-      "total_parameters: 225\n    devices: "
-      "\"/job:localhost/replica:0/task:0/cpu:0\"\n    float_ops: 0\n    "
-      "total_float_ops: 0\n  }\n  float_ops: 0\n  total_float_ops: "
-      "0\n}\nfloat_ops: 0\ntotal_float_ops: 0\n",
-      &expected));
-  EXPECT_EQ(expected.DebugString(), root.DebugString());
-}
-
-TEST_F(TFProfStatsTest, CheckPointOpType) {
-  Options opts(3, 0, 0, 0, 0, 0, -1, "name",
-               {kCkptVarType},  // accout_type_regexes
-               {".*"}, {""}, {".*"}, {""}, false,
-               {"params", "bytes", "micros", "float_ops"}, "", {});
-  const TFGraphNodeProto& root = tf_stats_->ShowGraphNode("scope", opts);
-
-  TFGraphNodeProto expected;
-  CHECK(protobuf::TextFormat::ParseFromString(
-      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
-      "0\ntotal_exec_micros: 5\ntotal_requested_bytes: 1480\ntotal_parameters: "
-      "370\nchildren {\n  name: \"conv2d\"\n  exec_micros: 0\n  "
-      "requested_bytes: 0\n  total_exec_micros: 2\n  total_requested_bytes: "
-      "560\n  total_parameters: 140\n  children {\n    name: \"conv2d/bias\"\n "
-      "   exec_micros: 1\n    requested_bytes: 20\n    parameters: 5\n    "
-      "total_exec_micros: 1\n    total_requested_bytes: 20\n    "
-      "total_parameters: 5\n    devices: "
-      "\"/job:localhost/replica:0/task:0/cpu:0\"\n    float_ops: 0\n    "
-      "total_float_ops: 0\n  }\n  children {\n    name: \"conv2d/kernel\"\n    "
-      "exec_micros: 1\n    requested_bytes: 540\n    parameters: 135\n    "
-      "total_exec_micros: 1\n    total_requested_bytes: 540\n    "
-      "total_parameters: 135\n    devices: "
-      "\"/job:localhost/replica:0/task:0/cpu:0\"\n    float_ops: 0\n    "
-      "total_float_ops: 0\n  }\n  float_ops: 0\n  total_float_ops: "
-      "0\n}\nchildren {\n  name: \"conv2d_1\"\n  exec_micros: 0\n  "
-      "requested_bytes: 0\n  total_exec_micros: 3\n  total_requested_bytes: "
-      "920\n  total_parameters: 230\n  children {\n    name: "
-      "\"conv2d_1/bias\"\n    exec_micros: 1\n    requested_bytes: 20\n    "
-      "parameters: 5\n    total_exec_micros: 1\n    total_requested_bytes: "
-      "20\n    total_parameters: 5\n    devices: "
-      "\"/job:localhost/replica:0/task:0/cpu:0\"\n    float_ops: 0\n    "
-      "total_float_ops: 0\n  }\n  children {\n    name: \"conv2d_1/kernel\"\n  "
-      "  exec_micros: 2\n    requested_bytes: 900\n    parameters: 225\n    "
-      "total_exec_micros: 2\n    total_requested_bytes: 900\n    "
-      "total_parameters: 225\n    devices: "
-      "\"/job:localhost/replica:0/task:0/cpu:0\"\n    float_ops: 0\n    "
-      "total_float_ops: 0\n  }\n  float_ops: 0\n  total_float_ops: "
-      "0\n}\nfloat_ops: 0\ntotal_float_ops: 0\n",
-      &expected));
-  EXPECT_EQ(expected.DebugString(), root.DebugString());
-}
-
-TEST_F(TFProfStatsTest, TestGraph) {
-  Options opts(100, 0, 10000, 0, 0, 0, -1, "name", {".*"},
-               {"cost.*"},  // start_name_regexes
-               {""}, {".*"}, {""}, false,
-               {"params", "bytes", "micros", "float_ops"}, "", {});
-  const TFGraphNodeProto& root = tf_stats_->ShowGraphNode("graph", opts);
-
-  TFGraphNodeProto expected;
-  CHECK(protobuf::TextFormat::ParseFromString(
-      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: 0\n"
-      "total_exec_micros: 97\ntotal_requested_bytes: "
-      "8656\ntotal_parameters: 370\nfloat_ops: "
-      "0\ntotal_float_ops: 34360\n",
-      &expected));
-  EXPECT_EQ(expected.DebugString(), root.DebugString());
-}
-
-TEST_F(TFProfStatsTest, TestFloatOps) {
-  Options opts(10, 0, 0, 0, 1, 0, -1, "name", {".*"}, {".*"}, {""}, {".*"},
-               {""}, false, {"float_ops"}, "", {});
-  const TFGraphNodeProto& root = tf_stats_->ShowGraphNode("scope", opts);
-
-  TFGraphNodeProto expected;
-  CHECK(protobuf::TextFormat::ParseFromString(
-      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
-      "0\ntotal_exec_micros: 97\ntotal_requested_bytes: "
-      "8656\ntotal_parameters: 370\nchildren {\n  name: \"conv2d/BiasAdd\"\n  "
-      "exec_micros: 12\n  requested_bytes: 1440\n  total_exec_micros: 12\n  "
-      "total_requested_bytes: 1440\n  total_parameters: 0\n  devices: "
-      "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 360\n  "
-      "total_float_ops: 360\n  input_shapes {\n    key: 0\n    value {\n      "
-      "unknown_rank: true\n    }\n  }\n  input_shapes {\n    key: 1\n    value "
-      "{\n      unknown_rank: true\n    }\n  }\n}\nchildren {\n  name: "
-      "\"conv2d/convolution\"\n  exec_micros: 60\n  requested_bytes: 1440\n  "
-      "total_exec_micros: 60\n  total_requested_bytes: 1440\n  "
-      "total_parameters: 0\n  devices: "
-      "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 19440\n  "
-      "total_float_ops: 19440\n  input_shapes {\n    key: 0\n    value {\n     "
-      " unknown_rank: true\n    }\n  }\n  input_shapes {\n    key: 1\n    "
-      "value {\n      unknown_rank: true\n    }\n  }\n}\nchildren {\n  name: "
-      "\"conv2d_2/BiasAdd\"\n  exec_micros: 2\n  requested_bytes: 640\n  "
-      "total_exec_micros: 2\n  total_requested_bytes: 640\n  total_parameters: "
-      "0\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
-      "160\n  total_float_ops: 160\n  input_shapes {\n    key: 0\n    value "
-      "{\n      unknown_rank: true\n    }\n  }\n  input_shapes {\n    key: 1\n "
-      "   value {\n      unknown_rank: true\n    }\n  }\n}\nchildren {\n  "
-      "name: \"conv2d_2/convolution\"\n  exec_micros: 13\n  requested_bytes: "
-      "640\n  total_exec_micros: 13\n  total_requested_bytes: 640\n  "
-      "total_parameters: 0\n  devices: "
-      "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 14400\n  "
-      "total_float_ops: 14400\n  input_shapes {\n    key: 0\n    value {\n     "
-      " unknown_rank: true\n    }\n  }\n  input_shapes {\n    key: 1\n    "
-      "value {\n      unknown_rank: true\n    }\n  }\n}\nfloat_ops: "
-      "0\ntotal_float_ops: 34360\n",
-      &expected));
-  EXPECT_EQ(expected.DebugString(), root.DebugString());
-}
-
-TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
-  Options opts(100, 0, 0, 0, 0, 0, -1, "name", {".*"}, {".*"}, {""},
-               {"unit_2_1.*DW"},  // show_name_regexes.
-               {""}, true,        // account_displayed_op_only.
-               {"params"}, "", {});
-  const TFGraphNodeProto& root = tf_stats_->ShowGraphNode("scope", opts);
-
-  TFGraphNodeProto expected;
-  CHECK(protobuf::TextFormat::ParseFromString(
-      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
-      "0\ntotal_exec_micros: 0\ntotal_requested_bytes: 0\ntotal_parameters: "
-      "0\nfloat_ops: 0\ntotal_float_ops: 0\n",
-      &expected));
-  EXPECT_EQ(expected.DebugString(), root.DebugString());
-}
-
-TEST_F(TFProfStatsTest, TestShowTensorValue) {
-  Options opts(10, 0, 0, 0, 0, 0, -1, "name", {".*"}, {".*"}, {""},
-               {"unit_1_0.*gamma"}, {""}, false,
-               {"tensor_value"},  // Show tensor value from checkpoint.
-               "", {});
-  const TFGraphNodeProto& root = tf_stats_->ShowGraphNode("scope", opts);
-  TFGraphNodeProto expected;
-  CHECK(protobuf::TextFormat::ParseFromString(
-      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
-      "0\ntotal_exec_micros: 97\ntotal_requested_bytes: "
-      "8656\ntotal_parameters: 370\nfloat_ops: 0\ntotal_float_ops: 34360\n",
-      &expected));
-  EXPECT_EQ(expected.DebugString(), root.DebugString());
-}
-
-}  // namespace tfprof
-}  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/tfprof_main.cc b/tensorflow/tools/tfprof/tfprof_main.cc
deleted file mode 100644
index ae02b526347474e1aa738ee1a84cfabaeb7d723c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/tfprof/tfprof_main.cc
+++ /dev/null
@@ -1,286 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <memory>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "linenoise.h"
-#include "tensorflow/c/c_api.h"
-#include "tensorflow/c/checkpoint_reader.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/init_main.h"
-#include "tensorflow/core/platform/protobuf.h"
-#include "tensorflow/core/protobuf/config.pb.h"
-#include "tensorflow/core/util/command_line_flags.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_stats.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
-#include "tensorflow/tools/tfprof/tfprof_log.pb.h"
-
-using tensorflow::str_util::Split;
-
-void completion(const char* buf, linenoiseCompletions* lc) {
-  tensorflow::string buf_str = buf;
-  if (buf_str.find(" ") == buf_str.npos) {
-    for (const char* opt : tensorflow::tfprof::kCmds) {
-      if (tensorflow::string(opt).find(buf_str) == 0) {
-        linenoiseAddCompletion(lc, opt);
-      }
-    }
-    return;
-  }
-
-  tensorflow::string prefix;
-  int last_dash = buf_str.find_last_of(' ');
-  if (last_dash != tensorflow::string::npos) {
-    prefix = buf_str.substr(0, last_dash + 1);
-    buf_str = buf_str.substr(last_dash + 1, tensorflow::kint32max);
-  }
-  for (const char* opt : tensorflow::tfprof::kOptions) {
-    if (tensorflow::string(opt).find(buf_str) == 0) {
-      linenoiseAddCompletion(lc, (prefix + opt).c_str());
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  tensorflow::string FLAGS_graph_path = "";
-  tensorflow::string FLAGS_run_meta_path = "";
-  tensorflow::string FLAGS_op_log_path = "";
-  tensorflow::string FLAGS_checkpoint_path = "";
-  tensorflow::int32 FLAGS_max_depth = 10;
-  tensorflow::int64 FLAGS_min_bytes = 0;
-  tensorflow::int64 FLAGS_min_micros = 0;
-  tensorflow::int64 FLAGS_min_params = 0;
-  tensorflow::int64 FLAGS_min_float_ops = 0;
-  tensorflow::int64 FLAGS_min_occurrence = 0;
-  tensorflow::int64 FLAGS_step = -1;
-  tensorflow::string FLAGS_order_by = "name";
-  tensorflow::string FLAGS_account_type_regexes = ".*";
-  tensorflow::string FLAGS_start_name_regexes = ".*";
-  tensorflow::string FLAGS_trim_name_regexes = "";
-  tensorflow::string FLAGS_show_name_regexes = ".*";
-  tensorflow::string FLAGS_hide_name_regexes;
-  bool FLAGS_account_displayed_op_only = false;
-  tensorflow::string FLAGS_select = "params";
-  tensorflow::string FLAGS_output = "";
-  for (int i = 0; i < argc; i++) {
-    fprintf(stderr, "%s\n", argv[i]);
-  }
-
-  std::vector<tensorflow::Flag> flag_list = {
-      tensorflow::Flag("graph_path", &FLAGS_graph_path,
-                       "GraphDef proto text file name"),
-      tensorflow::Flag("run_meta_path", &FLAGS_run_meta_path,
-                       "Comma-separated list of RunMetadata proto binary "
-                       "files. Each file is given step number 0,1,2,etc"),
-      tensorflow::Flag("op_log_path", &FLAGS_op_log_path,
-                       "tensorflow::tfprof::OpLog proto binary file name"),
-      tensorflow::Flag("checkpoint_path", &FLAGS_checkpoint_path,
-                       "TensorFlow Checkpoint file name"),
-      tensorflow::Flag("max_depth", &FLAGS_max_depth, "max depth"),
-      tensorflow::Flag("min_bytes", &FLAGS_min_bytes, "min_bytes"),
-      tensorflow::Flag("min_micros", &FLAGS_min_micros, "min micros"),
-      tensorflow::Flag("min_params", &FLAGS_min_params, "min params"),
-      tensorflow::Flag("min_float_ops", &FLAGS_min_float_ops, "min float ops"),
-      tensorflow::Flag("min_occurrence", &FLAGS_min_occurrence,
-                       "min occurrence"),
-      tensorflow::Flag("step", &FLAGS_step,
-                       "The stats of which step to use. By default average"),
-      tensorflow::Flag("order_by", &FLAGS_order_by, "order by"),
-      tensorflow::Flag("account_type_regexes", &FLAGS_start_name_regexes,
-                       "start name regexes"),
-      tensorflow::Flag("trim_name_regexes", &FLAGS_trim_name_regexes,
-                       "trim name regexes"),
-      tensorflow::Flag("show_name_regexes", &FLAGS_show_name_regexes,
-                       "show name regexes"),
-      tensorflow::Flag("hide_name_regexes", &FLAGS_hide_name_regexes,
-                       "hide name regexes"),
-      tensorflow::Flag("account_displayed_op_only",
-                       &FLAGS_account_displayed_op_only,
-                       "account displayed op only"),
-      tensorflow::Flag("select", &FLAGS_select, "select"),
-      tensorflow::Flag("output", &FLAGS_output, "output"),
-  };
-  tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
-  bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
-  if (!parse_ok) {
-    printf("%s", usage.c_str());
-    return (2);
-  }
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
-
-  fprintf(stderr, "%s\n", FLAGS_graph_path.c_str());
-
-  std::vector<tensorflow::string> account_type_regexes =
-      Split(FLAGS_account_type_regexes, ',', tensorflow::str_util::SkipEmpty());
-  std::vector<tensorflow::string> start_name_regexes =
-      Split(FLAGS_start_name_regexes, ',', tensorflow::str_util::SkipEmpty());
-  std::vector<tensorflow::string> trim_name_regexes =
-      Split(FLAGS_trim_name_regexes, ',', tensorflow::str_util::SkipEmpty());
-  std::vector<tensorflow::string> show_name_regexes =
-      Split(FLAGS_show_name_regexes, ',', tensorflow::str_util::SkipEmpty());
-  std::vector<tensorflow::string> hide_name_regexes =
-      Split(FLAGS_hide_name_regexes, ',', tensorflow::str_util::SkipEmpty());
-  std::vector<tensorflow::string> select =
-      Split(FLAGS_select, ',', tensorflow::str_util::SkipEmpty());
-
-  tensorflow::string output_type;
-  std::map<tensorflow::string, tensorflow::string> output_options;
-  tensorflow::Status s = tensorflow::tfprof::ParseOutput(
-      FLAGS_output, &output_type, &output_options);
-  CHECK(s.ok()) << s.ToString();
-
-  tensorflow::string cmd = "";
-  if (argc == 1 && FLAGS_graph_path.empty()) {
-    printf("1) go/tfprof: Tutorial.\n");
-    printf("2) tfprof help: Detail help information.\n");
-    printf(
-        "3) tfprof --graph_path <GraphDef proto text file>: "
-        "Profiling model structure, tensor shape and # parameters.\n");
-    printf(
-        "4) tfprof --graph_path <GraphDef proto text file> \\\n"
-        "          --run_meta_path <RunMetadata proto binary file> \\\n"
-        "          --op_log_path <tensorflow::tfprof::OpLog proto binary file> "
-        "\\\n"
-        "          --checkpoint_path <TensorFlow Checkpoint file>: "
-        "Profiling everything!\n");
-    return 0;
-  } else if (argc > 1) {
-    if (tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[5]) {
-      tensorflow::tfprof::PrintHelp();
-      return 0;
-    }
-    if (tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[0] ||
-        tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[1] ||
-        tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[2] ||
-        tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[3]) {
-      cmd = argv[1];
-    }
-  }
-
-  printf("Reading Files...\n");
-  std::unique_ptr<tensorflow::GraphDef> graph(new tensorflow::GraphDef());
-  TF_CHECK_OK(tensorflow::tfprof::ReadGraphDef(tensorflow::Env::Default(),
-                                               FLAGS_graph_path, graph.get()));
-
-  std::unique_ptr<tensorflow::tfprof::OpLog> op_log(
-      new tensorflow::tfprof::OpLog());
-  if (!FLAGS_op_log_path.empty()) {
-    tensorflow::string op_log_str;
-    s = tensorflow::ReadFileToString(tensorflow::Env::Default(),
-                                     FLAGS_op_log_path, &op_log_str);
-    if (!s.ok()) {
-      fprintf(stderr, "Failed to read op_log_path: %s\n", s.ToString().c_str());
-      return 1;
-    }
-    if (!tensorflow::ParseProtoUnlimited(op_log.get(), op_log_str)) {
-      fprintf(stderr, "Failed to parse op_log_path\n");
-      return 1;
-    }
-  }
-
-  std::unique_ptr<tensorflow::checkpoint::CheckpointReader> ckpt_reader;
-  TF_Status* status = TF_NewStatus();
-  if (!FLAGS_checkpoint_path.empty()) {
-    ckpt_reader.reset(new tensorflow::checkpoint::CheckpointReader(
-        FLAGS_checkpoint_path, status));
-    if (TF_GetCode(status) != TF_OK) {
-      fprintf(stderr, "%s\n", TF_Message(status));
-      TF_DeleteStatus(status);
-      return 1;
-    }
-    TF_DeleteStatus(status);
-  }
-
-  tensorflow::tfprof::TFStats tf_stat(
-      std::move(graph), nullptr, std::move(op_log), std::move(ckpt_reader));
-
-  std::vector<tensorflow::string> run_meta_files =
-      Split(FLAGS_run_meta_path, ',', tensorflow::str_util::SkipEmpty());
-  for (int i = 0; i < run_meta_files.size(); ++i) {
-    std::unique_ptr<tensorflow::RunMetadata> run_meta(
-        new tensorflow::RunMetadata());
-    s = ReadBinaryProto(tensorflow::Env::Default(), run_meta_files[i],
-                        run_meta.get());
-    if (!s.ok()) {
-      fprintf(stderr, "Failed to read run_meta_path %s. Status: %s\n",
-              run_meta_files[i].c_str(), s.ToString().c_str());
-      return 1;
-    }
-    tf_stat.ParseRunMeta(i, std::move(run_meta));
-  }
-
-  tensorflow::tfprof::Options opts(
-      FLAGS_max_depth, FLAGS_min_bytes, FLAGS_min_micros, FLAGS_min_params,
-      FLAGS_min_float_ops, FLAGS_min_occurrence, FLAGS_step, FLAGS_order_by,
-      account_type_regexes, start_name_regexes, trim_name_regexes,
-      show_name_regexes, hide_name_regexes, FLAGS_account_displayed_op_only,
-      select, output_type, output_options);
-
-  if (cmd == tensorflow::tfprof::kCmds[2] ||
-      cmd == tensorflow::tfprof::kCmds[3]) {
-    tf_stat.ShowMultiGraphNode(cmd, opts);
-    return 0;
-  } else if (cmd == tensorflow::tfprof::kCmds[0] ||
-             cmd == tensorflow::tfprof::kCmds[1]) {
-    tf_stat.ShowGraphNode(cmd, opts);
-    return 0;
-  }
-
-  linenoiseSetCompletionCallback(completion);
-  linenoiseHistoryLoad(".tfprof_history.txt");
-
-  for (char* line = nullptr; (line = linenoise("tfprof> ")) != nullptr;) {
-    tensorflow::string line_s = line;
-    free(line);
-
-    if (line_s.empty()) {
-      printf("%s", opts.ToString().c_str());
-      continue;
-    }
-    linenoiseHistoryAdd(line_s.c_str());
-    linenoiseHistorySave(".tfprof_history.txt");
-
-    tensorflow::tfprof::Options new_opts = opts;
-    tensorflow::Status s =
-        tensorflow::tfprof::ParseCmdLine(line_s, &cmd, &new_opts);
-    if (!s.ok()) {
-      fprintf(stderr, "E: %s\n", s.ToString().c_str());
-      continue;
-    }
-    if (cmd == tensorflow::tfprof::kCmds[4]) {
-      opts = new_opts;
-    } else if (cmd == tensorflow::tfprof::kCmds[5]) {
-      tensorflow::tfprof::PrintHelp();
-    } else if (cmd == tensorflow::tfprof::kCmds[2] ||
-               cmd == tensorflow::tfprof::kCmds[3]) {
-      tf_stat.ShowMultiGraphNode(cmd, new_opts);
-    } else if (cmd == tensorflow::tfprof::kCmds[0] ||
-               cmd == tensorflow::tfprof::kCmds[1]) {
-      tf_stat.ShowGraphNode(cmd, new_opts);
-    }
-  }
-  return 0;
-}
diff --git a/tensorflow/tools/tfprof/tfprof_options.proto b/tensorflow/tools/tfprof/tfprof_options.proto
deleted file mode 100644
index 27eafb1ca9c27a8f03324bf95b31715014d5d95b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/tfprof/tfprof_options.proto
+++ /dev/null
@@ -1,26 +0,0 @@
-syntax = "proto2";
-
-package tensorflow.tfprof;
-
-// Refers to tfprof_options.h/cc for documentation.
-// Only used to pass tfprof options from Python to C++.
-message OptionsProto {
-  optional int64 max_depth = 1;
-  optional int64 min_bytes = 2;
-  optional int64 min_micros = 3;
-  optional int64 min_params = 4;
-  optional int64 min_float_ops = 5;
-  optional int64 min_occurrence = 17;
-  optional int64 step = 18 [default = -1];
-
-  optional string order_by = 7;
-  repeated string account_type_regexes = 8;
-  repeated string start_name_regexes = 9;
-  repeated string trim_name_regexes = 10;
-  repeated string show_name_regexes = 11;
-  repeated string hide_name_regexes = 12;
-  optional bool account_displayed_op_only = 13;
-  repeated string select = 14;
-  optional string output = 15;
-  optional string dump_to_file = 16;
-}
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index de5525b0f208821594e06ca9cf3b029838afbdab..c2f42ba0c5ab2f50d768a1438ec23a1585bcd4d2 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -2,15 +2,13 @@
 
 load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
 load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
-load("@io_bazel_rules_closure//closure/private:java_import_external.bzl", "java_import_external")
+load("//third_party/mkl:build_defs.bzl", "mkl_repository")
+load("@io_bazel_rules_closure//closure/private:java_import_external.bzl",
+     "java_import_external")
 load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
-load("@io_bazel_rules_closure//closure:defs.bzl", "web_library_external")
 load("//third_party/py:python_configure.bzl", "python_configure")
-
-load("//third_party:polymer.bzl", "tensorboard_polymer_workspace")
-load("//third_party:python.bzl", "tensorboard_python_workspace")
-load("//third_party:js.bzl", "tensorboard_js_workspace")
-load("//third_party:typings.bzl", "tensorboard_typings_workspace")
+load("//third_party/toolchains/cpus/arm:arm_compiler_configure.bzl",
+     "arm_compiler_configure")
 
 
 def _is_windows(repository_ctx):
@@ -88,7 +86,6 @@ temp_workaround_http_archive = repository_rule(
     },
 )
 
-
 # Executes specified command with arguments and calls 'fail' if it exited with
 # non-zero code
 def _execute_and_check_ret_code(repo_ctx, cmd_and_args):
@@ -146,16 +143,29 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   cuda_configure(name="local_config_cuda")
   sycl_configure(name="local_config_sycl")
   python_configure(name="local_config_python")
+
+  # Point //external/local_config_arm_compiler to //external/arm_compiler
+  arm_compiler_configure(
+      name="local_config_arm_compiler",
+      remote_config_repo="../arm_compiler",
+      build_file = str(Label("//third_party/toolchains/cpus/arm:BUILD")))
+
+  mkl_repository(
+      name = "mkl",
+      urls = [
+          "http://mirror.bazel.build/github.com/01org/mkl-dnn/releases/download/v0.7/mklml_lnx_2018.0.20170425.tgz",
+          "https://github.com/01org/mkl-dnn/releases/download/v0.7/mklml_lnx_2018.0.20170425.tgz",
+      ],
+      sha256 = "3cc2501fb209e1fd0960a5f61c919438f9619c68a644dcebf0fdf69b07460c57",
+      strip_prefix = "mklml_lnx_2018.0.20170425",
+      build_file = str(Label("//third_party/mkl:mkl.BUILD")),
+      repository = tf_repo_name,
+  )
+
   if path_prefix:
     print("path_prefix was specified to tf_workspace but is no longer used " +
           "and will be removed in the future.")
 
-  # TODO(dandelion): Take these out when TB exits TF
-  tensorboard_polymer_workspace()
-  tensorboard_python_workspace()
-  tensorboard_typings_workspace()
-  tensorboard_js_workspace()
-
   native.new_http_archive(
       name = "eigen_archive",
       urls = [
@@ -167,6 +177,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:eigen.BUILD")),
   )
 
+  native.new_http_archive(
+    name = "arm_compiler",
+    build_file = str(Label("//:arm_compiler.BUILD")),
+    sha256 = "970285762565c7890c6c087d262b0a18286e7d0384f13a37786d8521773bc969",
+    strip_prefix = "tools-0e906ebc527eab1cdbf7adabff5b474da9562e9f/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf",
+    urls = [
+        "http://mirror.bazel.build/github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz",
+        "https://github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz",
+    ],
+  )
+
   native.new_http_archive(
       name = "libxsmm_archive",
       urls = [
@@ -217,11 +238,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   native.new_http_archive(
       name = "farmhash_archive",
       urls = [
-          "http://mirror.bazel.build/github.com/google/farmhash/archive/92e897b282426729f4724d91a637596c7e2fe28f.zip",
-          "https://github.com/google/farmhash/archive/92e897b282426729f4724d91a637596c7e2fe28f.zip",
+          "http://mirror.bazel.build/github.com/google/farmhash/archive/23eecfbe7e84ebf2e229bd02248f431c36e12f1a.zip",
+          "https://github.com/google/farmhash/archive/23eecfbe7e84ebf2e229bd02248f431c36e12f1a.zip",
       ],
-      sha256 = "4c626d1f306bda2c6804ab955892f803f5245f4dcaecb4979dc08b091256da54",
-      strip_prefix = "farmhash-92e897b282426729f4724d91a637596c7e2fe28f",
+      sha256 = "55215f8cd3ddbe9781f6fe5cc228731d6dcc8301b6191c6d420034c3fff1cb8d",
+      strip_prefix = "farmhash-23eecfbe7e84ebf2e229bd02248f431c36e12f1a",
       build_file = str(Label("//third_party:farmhash.BUILD")),
   )
 
@@ -291,26 +312,59 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       name = "six_archive",
       urls = [
           "http://mirror.bazel.build/pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
-          "http://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
+          "https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
       ],
       sha256 = "105f8d68616f8248e24bf0e9372ef04d3cc10104f1980f54d57b2ce73a5ad56a",
       strip_prefix = "six-1.10.0",
       build_file = str(Label("//third_party:six.BUILD")),
   )
 
+  native.new_http_archive(
+      name = "org_python_pypi_backports_weakref",
+      urls = [
+          "http://mirror.bazel.build/pypi.python.org/packages/bc/cc/3cdb0a02e7e96f6c70bd971bc8a90b8463fda83e264fa9c5c1c98ceabd81/backports.weakref-1.0rc1.tar.gz",
+          "https://pypi.python.org/packages/bc/cc/3cdb0a02e7e96f6c70bd971bc8a90b8463fda83e264fa9c5c1c98ceabd81/backports.weakref-1.0rc1.tar.gz",
+      ],
+      sha256 = "8813bf712a66b3d8b85dc289e1104ed220f1878cf981e2fe756dfaabe9a82892",
+      strip_prefix = "backports.weakref-1.0rc1/src",
+      build_file = str(Label("//third_party:backports_weakref.BUILD")),
+  )
+
+  native.new_http_archive(
+      name = "com_github_andreif_codegen",
+      urls = [
+          "http://mirror.bazel.build/github.com/andreif/codegen/archive/1.0.tar.gz",
+          "https://github.com/andreif/codegen/archive/1.0.tar.gz",
+      ],
+      sha256 = "2dadd04a2802de27e0fe5a19b76538f6da9d39ff244036afa00c1bba754de5ee",
+      strip_prefix = "codegen-1.0",
+      build_file = str(Label("//third_party:codegen.BUILD")),
+  )
+
+  filegroup_external(
+      name = "org_python_license",
+      licenses = ["notice"],  # Python 2.0
+      sha256_urls = {
+          "b5556e921715ddb9242c076cae3963f483aa47266c5e37ea4c187f77cc79501c": [
+              "http://mirror.bazel.build/docs.python.org/2.7/_sources/license.txt",
+              "https://docs.python.org/2.7/_sources/license.txt",
+          ],
+      },
+  )
+
   native.bind(
       name = "six",
       actual = "@six_archive//:six",
   )
 
   patched_http_archive(
-      name = "protobuf",
+      name = "protobuf_archive",
       urls = [
-          "http://mirror.bazel.build/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
-          "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+          "https://github.com/google/protobuf/archive/0b059a3d8a8f8aa40dde7bea55edca4ec5dfea66.tar.gz",
+          "http://mirror.bazel.build/github.com/google/protobuf/archive/0b059a3d8a8f8aa40dde7bea55edca4ec5dfea66.tar.gz",
       ],
-      sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
-      strip_prefix = "protobuf-2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a",
+      sha256 = "6d43b9d223ce09e5d4ce8b0060cb8a7513577a35a64c7e3dad10f0703bf3ad93",
+      strip_prefix = "protobuf-0b059a3d8a8f8aa40dde7bea55edca4ec5dfea66",
       # TODO: remove patching when tensorflow stops linking same protos into
       #       multiple shared libraries loaded in runtime by python.
       #       This patch fixes a runtime crash when tensorflow is compiled
@@ -318,27 +372,32 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       patch_file = str(Label("//third_party/protobuf:add_noinlines.patch")),
   )
 
+  native.bind(
+      name = "protobuf",
+      actual = "@protobuf_archive//:protobuf",
+  )
+
   # We need to import the protobuf library under the names com_google_protobuf
   # and com_google_protobuf_cc to enable proto_library support in bazel.
   # Unfortunately there is no way to alias http_archives at the moment.
   native.http_archive(
       name = "com_google_protobuf",
       urls = [
-          "http://mirror.bazel.build/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
-          "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+          "https://github.com/google/protobuf/archive/0b059a3d8a8f8aa40dde7bea55edca4ec5dfea66.tar.gz",
+          "http://mirror.bazel.build/github.com/google/protobuf/archive/0b059a3d8a8f8aa40dde7bea55edca4ec5dfea66.tar.gz",
       ],
-      sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
-      strip_prefix = "protobuf-2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a",
+      sha256 = "6d43b9d223ce09e5d4ce8b0060cb8a7513577a35a64c7e3dad10f0703bf3ad93",
+      strip_prefix = "protobuf-0b059a3d8a8f8aa40dde7bea55edca4ec5dfea66",
   )
 
   native.http_archive(
       name = "com_google_protobuf_cc",
       urls = [
-          "http://mirror.bazel.build/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
-          "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+          "https://github.com/google/protobuf/archive/0b059a3d8a8f8aa40dde7bea55edca4ec5dfea66.tar.gz",
+          "http://mirror.bazel.build/github.com/google/protobuf/archive/0b059a3d8a8f8aa40dde7bea55edca4ec5dfea66.tar.gz",
       ],
-      sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
-      strip_prefix = "protobuf-2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a",
+      sha256 = "6d43b9d223ce09e5d4ce8b0060cb8a7513577a35a64c7e3dad10f0703bf3ad93",
+      strip_prefix = "protobuf-0b059a3d8a8f8aa40dde7bea55edca4ec5dfea66",
   )
 
   native.new_http_archive(
@@ -416,23 +475,30 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   # to point to the protobuf's compiler library.
   native.bind(
       name = "protobuf_clib",
-      actual = "@protobuf//:protoc_lib",
+      actual = "@protobuf_archive//:protoc_lib",
   )
 
   native.bind(
-      name = "protobuf_compiler",
-      actual = "@protobuf//:protoc_lib",
+      name = "libssl",
+      actual = "@boringssl//:ssl",
   )
 
-  native.new_http_archive(
+  # gRPC has includes directly from their third_party path for nanopb, so we
+  # must depend on their version of it.
+  native.bind(
+      name = "nanopb",
+      actual = "@grpc//third_party/nanopb:nanopb",
+  )
+
+  patched_http_archive(
       name = "grpc",
       urls = [
-          "http://mirror.bazel.build/github.com/grpc/grpc/archive/d7ff4ff40071d2b486a052183e3e9f9382afb745.tar.gz",
-          "https://github.com/grpc/grpc/archive/d7ff4ff40071d2b486a052183e3e9f9382afb745.tar.gz",
+          "http://mirror.bazel.build/github.com/grpc/grpc/archive/781fd6f6ea03645a520cd5c675da67ab61f87e4b.tar.gz",
+          "https://github.com/grpc/grpc/archive/781fd6f6ea03645a520cd5c675da67ab61f87e4b.tar.gz",
       ],
-      sha256 = "a15f352436ab92c521b1ac11e729e155ace38d0856380cf25048c5d1d9ba8e31",
-      strip_prefix = "grpc-d7ff4ff40071d2b486a052183e3e9f9382afb745",
-      build_file = str(Label("//third_party:grpc.BUILD")),
+      sha256 = "2004635e6a078acfac8ffa71738397796be4f8fb72f572cc44ecee5d99511d9f",
+      strip_prefix = "grpc-781fd6f6ea03645a520cd5c675da67ab61f87e4b",
+      patch_file = str(Label("//third_party/grpc:grpc.patch")),
   )
 
   # protobuf expects //external:grpc_cpp_plugin to point to grpc's
@@ -463,11 +529,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   temp_workaround_http_archive(
       name = "llvm",
       urls = [
-          "http://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/e156d99231a7735d06a97b5b83de70bf4ce4f034.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/e156d99231a7735d06a97b5b83de70bf4ce4f034.tar.gz",
+          "http://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/4d98985c94c36b9eb4396c91fe0a72a0c5f707b2.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/4d98985c94c36b9eb4396c91fe0a72a0c5f707b2.tar.gz",
       ],
-      sha256 = "72e34e2411a06d4200a2688ee83832805fbef23a12ea481f31c2b8866fde007a",
-      strip_prefix = "llvm-e156d99231a7735d06a97b5b83de70bf4ce4f034",
+      sha256 = "1a085c995522fa19900568c03eb595b425df53842c7f281e3ab79aaa04affffa",
+      strip_prefix = "llvm-4d98985c94c36b9eb4396c91fe0a72a0c5f707b2",
       build_file = str(Label("//third_party/llvm:llvm.BUILD")),
       repository = tf_repo_name,
   )
@@ -499,7 +565,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       actual = "@jsoncpp_git//:jsoncpp",
   )
 
-  native.http_archive(
+  patched_http_archive(
       name = "boringssl",
       urls = [
           "http://mirror.bazel.build/github.com/google/boringssl/archive/bbcaa15b0647816b9a1a9b9e0d209cd6712f0105.tar.gz",
@@ -507,22 +573,9 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "025264d6e9a7ad371f2f66d17a28b6627de0c9592dc2eb54afd062f68f1f9aa3",
       strip_prefix = "boringssl-bbcaa15b0647816b9a1a9b9e0d209cd6712f0105",
-  )
-
-  native.new_http_archive(
-      name = "nanopb_git",
-      urls = [
-          "http://mirror.bazel.build/github.com/nanopb/nanopb/archive/1251fa1065afc0d62f635e0f63fec8276e14e13c.tar.gz",
-          "https://github.com/nanopb/nanopb/archive/1251fa1065afc0d62f635e0f63fec8276e14e13c.tar.gz",
-      ],
-      sha256 = "ab1455c8edff855f4f55b68480991559e51c11e7dab060bbab7cffb12dd3af33",
-      strip_prefix = "nanopb-1251fa1065afc0d62f635e0f63fec8276e14e13c",
-      build_file = str(Label("//third_party:nanopb.BUILD")),
-  )
 
-  native.bind(
-      name = "nanopb",
-      actual = "@nanopb_git//:nanopb",
+      # Add patch to boringssl code to support s390x
+      patch_file = str(Label("//third_party/boringssl:add_boringssl_s390x.patch")),
   )
 
   native.new_http_archive(
@@ -623,3 +676,28 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:pprof.BUILD")),
   )
 
+  native.new_http_archive(
+      name = "cub_archive",
+      urls = [
+          "http://mirror.bazel.build/github.com/NVlabs/cub/archive/1.6.4.zip",
+          "https://github.com/NVlabs/cub/archive/1.6.4.zip",
+      ],
+      sha256 = "966d0c4f41e2bdc81aebf9ccfbf0baffaac5a74f00b826b06f4dee79b2bb8cee",
+      strip_prefix = "cub-1.6.4",
+      build_file = str(Label("//third_party:cub.BUILD")),
+  )
+
+  native.bind(
+      name = "cub",
+      actual = "@cub_archive//:cub",
+  )
+
+  native.http_archive(
+      name = "bazel_toolchains",
+      urls = [
+          "http://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/bccee4855c049d34bac481083b4c68e2fab8cc50.tar.gz",
+          "https://github.com/bazelbuild/bazel-toolchains/archive/bccee4855c049d34bac481083b4c68e2fab8cc50.tar.gz",
+      ],
+      sha256 = "3903fd93b96b42067e00b7973a2c16c34e761ad7a0b55e1557d408f352849e41",
+      strip_prefix = "bazel-toolchains-bccee4855c049d34bac481083b4c68e2fab8cc50",
+  )
diff --git a/third_party/backports_weakref.BUILD b/third_party/backports_weakref.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..0adfc5f05419e736b6af01252674e6fb11e6b8d7
--- /dev/null
+++ b/third_party/backports_weakref.BUILD
@@ -0,0 +1,22 @@
+# Description:
+#   Backport of new features in Python's weakref module.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Python 2.0
+
+py_library(
+    name = "org_python_pypi_backports_weakref",
+    srcs = [
+        "backports/__init__.py",
+        "backports/weakref.py",
+    ],
+    srcs_version = "PY2AND3",
+)
+
+genrule(
+    name = "license",
+    srcs = ["@org_python_license"],
+    outs = ["LICENSE"],
+    cmd = "cp $< $@",
+)
diff --git a/third_party/bleach.BUILD b/third_party/bleach.BUILD
deleted file mode 100644
index 1bf75b84a769642d74b9fdef78708eaffceb113e..0000000000000000000000000000000000000000
--- a/third_party/bleach.BUILD
+++ /dev/null
@@ -1,20 +0,0 @@
-# Description:
-#   Build file for Bleach.
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_library(
-    name = "org_mozilla_bleach",
-    srcs = [
-        "bleach/__init__.py",
-        "bleach/callbacks.py",
-        "bleach/encoding.py",
-        "bleach/sanitizer.py",
-        "bleach/version.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = ["@org_html5lib"],
-)
diff --git a/tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/data_run_bar_tag_unsupported.json b/third_party/boringssl/BUILD
similarity index 100%
rename from tensorflow/tensorboard/components/tf_profile_dashboard/demo/data/plugin/profile/data_run_bar_tag_unsupported.json
rename to third_party/boringssl/BUILD
diff --git a/third_party/boringssl/add_boringssl_s390x.patch b/third_party/boringssl/add_boringssl_s390x.patch
new file mode 100644
index 0000000000000000000000000000000000000000..0b41a4aa96831540bb55c69337bac1ed7b7cd651
--- /dev/null
+++ b/third_party/boringssl/add_boringssl_s390x.patch
@@ -0,0 +1,13 @@
+diff --git a/src/include/openssl/base.h b/src/include/openssl/base.h
+index 7a3adfb..88012ad 100644
+--- a/src/include/openssl/base.h
++++ b/src/include/openssl/base.h
+@@ -94,6 +94,8 @@ extern "C" {
+ #elif defined(__pnacl__)
+ #define OPENSSL_32_BIT
+ #define OPENSSL_PNACL
++#elif defined(__s390x__)
++#define OPENSSL_64_BIT
+ #else
+ #error "Unknown target CPU"
+ #endif
diff --git a/third_party/clutz.BUILD b/third_party/clutz.BUILD
deleted file mode 100644
index 593b70366a3a0908b91120ce5351fe7c2c0159b3..0000000000000000000000000000000000000000
--- a/third_party/clutz.BUILD
+++ /dev/null
@@ -1,44 +0,0 @@
-# Description:
-#   Build tool for making TypeScript .d.ts files from Closure JavaScript.
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # MIT
-
-exports_files([
-    "LICENSE",
-    "src/resources/closure.lib.d.ts",
-])
-
-JVM_FLAGS = [
-    "-Xss20m",  # JSCompiler needs big stacks for recursive parsing
-    "-XX:+UseParallelGC",  # Best GC when app isn't latency sensitive
-]
-
-java_binary(
-    name = "clutz",
-    srcs = glob(["src/main/java/com/google/javascript/clutz/**/*.java"]),
-    jvm_flags = JVM_FLAGS,
-    main_class = "com.google.javascript.clutz.DeclarationGenerator",
-    deps = [
-        "@args4j",
-        "@com_google_code_findbugs_jsr305",
-        "@com_google_code_gson",
-        "@com_google_guava",
-        "@com_google_javascript_closure_compiler",
-    ],
-)
-
-java_binary(
-    name = "gents",
-    srcs = glob(["src/main/java/com/google/javascript/gents/**/*.java"]),
-    jvm_flags = JVM_FLAGS,
-    main_class = "com.google.javascript.gents.TypeScriptGenerator",
-    deps = [
-        "@args4j",
-        "@com_google_code_findbugs_jsr305",
-        "@com_google_code_gson",
-        "@com_google_guava",
-        "@com_google_javascript_closure_compiler",
-    ],
-)
diff --git a/third_party/clutz.bzl b/third_party/clutz.bzl
deleted file mode 100644
index f273c78c794c637f96af52c1c1aa96b31acc5a24..0000000000000000000000000000000000000000
--- a/third_party/clutz.bzl
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Build definitions for TypeScript from Closure JavaScript libraries."""
-
-load("@io_bazel_rules_closure//closure/private:defs.bzl",
-     "JS_FILE_TYPE",
-     "collect_js",
-     "unfurl")
-
-CLUTZ_ATTRIBUTES = {
-    "_clutz": attr.label(
-        default=Label("@io_angular_clutz//:clutz"),
-        executable=True,
-        cfg="host"),
-    "_clutz_externs": attr.label(
-        default=Label("@com_google_javascript_closure_compiler_externs"),
-        allow_files=True),
-}
-
-def extract_dts_from_closure_libraries(ctx):
-  """Extracts type definitions from closure dependencies.
-
-  This just generates one big .d.ts file for all transitive Closure sources,
-  and does not pass it down. That means each rule has to duplicate the effort,
-  but on the other hand allows transitive dependencies on shared rules without
-  causing duplicate definition errors.
-
-  Args:
-      ctx: A Skylark context.
-  Returns:
-      The generated Clutz typings file, or None if there were no JS deps.
-  """
-  deps = unfurl(ctx.attr.deps, provider="closure_js_library")
-  js = collect_js(ctx, deps)
-  if not js.srcs:
-    return None
-  js_typings = ctx.new_file(ctx.bin_dir, "%s-js-typings.d.ts" % ctx.label.name)
-  srcs = depset(JS_FILE_TYPE.filter(ctx.files._clutz_externs)) + js.srcs
-  args = ["-o", js_typings.path]
-  for src in srcs:
-    args.append(src.path)
-  if getattr(ctx.attr, "clutz_entry_points", None):
-    args.append("--closure_entry_points")
-    args.extend(ctx.attr.clutz_entry_points)
-  ctx.action(
-      inputs=list(srcs),
-      outputs=[js_typings],
-      executable=ctx.executable._clutz,
-      arguments=args,
-      mnemonic="Clutz",
-      progress_message="Running Clutz on %d JS files %s" % (
-          len(srcs), ctx.label))
-  return js_typings
-
-################################################################################
-# The following definitions are for API compatibility with internal clutz.bzl
-
-CLUTZ_OUTPUTS = {}
-
-def _clutz_aspect_impl(target, ctx):
-  return struct()
-
-clutz_aspect = aspect(
-    implementation=_clutz_aspect_impl,
-    attr_aspects=["exports"])
diff --git a/third_party/codegen.BUILD b/third_party/codegen.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..df436c81635a71421a67fa8d8c84eb8dfcc97d7b
--- /dev/null
+++ b/third_party/codegen.BUILD
@@ -0,0 +1,16 @@
+# -*- mode: python; -*-
+#
+# Description:
+#   Extension to ast that allow ast -> python code generation.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # New BSD
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "com_github_andreif_codegen",
+    srcs = glob(["codegen.py"]),
+    srcs_version = "PY2AND3",
+)
diff --git a/third_party/cub.BUILD b/third_party/cub.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..29159c9dad3d32121ce05278821e41b39f3f2a20
--- /dev/null
+++ b/third_party/cub.BUILD
@@ -0,0 +1,26 @@
+# Description: CUB library which is a set of primitives for GPU programming.
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # BSD
+
+exports_files(["LICENSE.TXT"])
+
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "if_cuda")
+
+filegroup(
+    name = "cub_header_files",
+    srcs = glob([
+        "cub/**",
+    ]),
+)
+
+cc_library(
+    name = "cub",
+    hdrs = if_cuda([":cub_header_files"]),
+    deps = [
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
diff --git a/third_party/gif.BUILD b/third_party/gif.BUILD
index ad6821af3ccb5b3b15151427c99db4280a6905bf..21c5c11a44dd7bdcb3bbea839c751fc9e6b7e8e0 100644
--- a/third_party/gif.BUILD
+++ b/third_party/gif.BUILD
@@ -20,6 +20,15 @@ cc_library(
         "lib/quantize.c",
     ],
     hdrs = ["lib/gif_lib.h"],
+    defines = select({
+        #"@%ws%//tensorflow:android": [
+        ":android": [
+            "S_IREAD=S_IRUSR",
+            "S_IWRITE=S_IWUSR",
+            "S_IEXEC=S_IXUSR",
+        ],
+        "//conditions:default": [],
+    }),  
     includes = ["lib/."],
     visibility = ["//visibility:public"],
     deps = select({
@@ -54,3 +63,10 @@ config_setting(
         "cpu": "x64_windows",
     },
 )
+
+config_setting(
+    name = "android",
+    values = {"crosstool_top": "//external:android/crosstool"},
+)
+
+
diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl
index 242439daf456d6fd31a140e5d2c56d3e89900652..2558f46fd55c35b5089cc0119f2654f598e5128a 100755
--- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl
+++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl
@@ -213,7 +213,7 @@ def InvokeNvcc(argv, log=False):
            ' --compiler-options "' + host_compiler_options + '"' +
            ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH +
            ' -I .' +
-           ' -x cu ' + includes + ' ' + srcs + ' -M -o ' + depfile)
+           ' -x cu ' + opt + includes + ' ' + srcs + ' -M -o ' + depfile)
     if log: Log(cmd)
     exit_status = os.system(cmd)
     if exit_status != 0:
diff --git a/third_party/gpus/crosstool/remote.BUILD.tpl b/third_party/gpus/crosstool/remote.BUILD.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..b2316331db257a39086bdd5ca02b5ca6848cebcb
--- /dev/null
+++ b/third_party/gpus/crosstool/remote.BUILD.tpl
@@ -0,0 +1,10 @@
+# Description:
+#   Template for crosstool Build file to use a pre-generated config.
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+alias(
+    name = "toolchain",
+    actual = "%{remote_cuda_repo}:toolchain",
+)
diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl
index f7610dd7a99e3c65ac494d23f0a408d4391680c0..b752734a08a1ac7a60582ebd7e60ec3c1564f353 100644
--- a/third_party/gpus/cuda/BUILD.tpl
+++ b/third_party/gpus/cuda/BUILD.tpl
@@ -40,20 +40,23 @@ config_setting(
 cc_library(
     name = "cuda_headers",
     hdrs = [
-        "cuda_config.h",
+        "cuda/cuda_config.h",
         %{cuda_headers}
     ],
     includes = [
         ".",
-        "include",
+        "cuda/include",
     ],
     visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cudart_static",
-    srcs = ["lib/%{cudart_static_lib}"],
-    includes = ["include"],
+    srcs = ["cuda/lib/%{cudart_static_lib}"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
     linkopts = select({
         ":freebsd": [],
         "//conditions:default": ["-ldl"],
@@ -66,95 +69,120 @@ cc_library(
 
 cc_library(
     name = "cuda_driver",
-    srcs = ["lib/%{cuda_driver_lib}"],
-    includes = ["include"],
+    srcs = ["cuda/lib/%{cuda_driver_lib}"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
     visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cudart",
-    srcs = ["lib/%{cudart_lib}"],
-    data = ["lib/%{cudart_lib}"],
-    includes = ["include"],
+    srcs = ["cuda/lib/%{cudart_lib}"],
+    data = ["cuda/lib/%{cudart_lib}"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
     linkstatic = 1,
     visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cublas",
-    srcs = ["lib/%{cublas_lib}"],
-    data = ["lib/%{cublas_lib}"],
-    includes = ["include"],
+    srcs = ["cuda/lib/%{cublas_lib}"],
+    data = ["cuda/lib/%{cublas_lib}"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
     linkstatic = 1,
     visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cusolver",
-    srcs = ["lib/%{cusolver_lib}"],
-    data = ["lib/%{cusolver_lib}"],
-    includes = ["include"],
-    linkstatic = 1,
+    srcs = ["cuda/lib/%{cusolver_lib}"],
+    data = ["cuda/lib/%{cusolver_lib}"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
     linkopts = ["-lgomp"],
+    linkstatic = 1,
     visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cudnn",
-    srcs = ["lib/%{cudnn_lib}"],
-    data = ["lib/%{cudnn_lib}"],
-    includes = ["include"],
+    srcs = ["cuda/lib/%{cudnn_lib}"],
+    data = ["cuda/lib/%{cudnn_lib}"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
     linkstatic = 1,
     visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cufft",
-    srcs = ["lib/%{cufft_lib}"],
-    data = ["lib/%{cufft_lib}"],
-    includes = ["include"],
+    srcs = ["cuda/lib/%{cufft_lib}"],
+    data = ["cuda/lib/%{cufft_lib}"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
     linkstatic = 1,
     visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "curand",
-    srcs = ["lib/%{curand_lib}"],
-    data = ["lib/%{curand_lib}"],
-    includes = ["include"],
+    srcs = ["cuda/lib/%{curand_lib}"],
+    data = ["cuda/lib/%{curand_lib}"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
     linkstatic = 1,
     visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cuda",
+    visibility = ["//visibility:public"],
     deps = [
+        ":cublas",
         ":cuda_headers",
         ":cudart",
-        ":cublas",
         ":cudnn",
         ":cufft",
         ":curand",
     ],
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cupti_headers",
     hdrs = [
-        "cuda_config.h",
+        "cuda/cuda_config.h",
         ":cuda-extras",
     ],
     includes = [
         ".",
-        "extras/CUPTI/include/",
+        "cuda/extras/CUPTI/include/",
     ],
     visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "cupti_dsos",
-    data = ["lib/%{cupti_lib}"],
+    data = ["cuda/lib/%{cupti_lib}"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
     visibility = ["//visibility:public"],
 )
 
diff --git a/third_party/gpus/cuda/remote.BUILD.tpl b/third_party/gpus/cuda/remote.BUILD.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..d88d512b90c352e6a301ed6efe8266d8dd6bf744
--- /dev/null
+++ b/third_party/gpus/cuda/remote.BUILD.tpl
@@ -0,0 +1,105 @@
+# Description:
+#   Template for cuda Build file to use a pre-generated config.
+licenses(["restricted"])  # MPL2, portions GPL v3, LGPL v3, BSD-like
+
+package(default_visibility = ["//visibility:public"])
+
+config_setting(
+    name = "using_nvcc",
+    values = {
+        "define": "using_cuda_nvcc=true",
+    },
+)
+
+config_setting(
+    name = "using_clang",
+    values = {
+        "define": "using_cuda_clang=true",
+    },
+)
+
+# Equivalent to using_clang && -c opt.
+config_setting(
+    name = "using_clang_opt",
+    values = {
+        "define": "using_cuda_clang=true",
+        "compilation_mode": "opt",
+    },
+)
+
+config_setting(
+    name = "darwin",
+    values = {"cpu": "darwin"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "freebsd",
+    values = {"cpu": "freebsd"},
+    visibility = ["//visibility:public"],
+)
+
+alias(
+    name = "cuda_headers",
+    actual = "%{remote_cuda_repo}cuda:cuda_headers",
+)
+
+alias(
+    name = "cudart_static",
+    actual = "%{remote_cuda_repo}cuda:cudart_static",
+)
+
+alias(
+    name = "cuda_driver",
+    actual = "%{remote_cuda_repo}cuda:cuda_driver",
+)
+
+alias(
+    name = "cudart",
+    actual = "%{remote_cuda_repo}cuda:cudart",
+)
+
+alias(
+    name = "cublas",
+    actual = "%{remote_cuda_repo}cuda:cublas",
+)
+
+alias(
+    name = "cusolver",
+    actual = "%{remote_cuda_repo}cuda:cusolver",
+)
+
+alias(
+    name = "cudnn",
+    actual = "%{remote_cuda_repo}cuda:cudnn",
+)
+
+alias(
+    name = "cufft",
+    actual = "%{remote_cuda_repo}cuda:cufft",
+)
+
+alias(
+    name = "curand",
+    actual = "%{remote_cuda_repo}cuda:curand",
+)
+
+alias(
+    name = "cuda",
+    actual = "%{remote_cuda_repo}cuda:cuda",
+)
+
+alias(
+    name = "cupti_headers",
+    actual = "%{remote_cuda_repo}cuda:cupti_headers",
+)
+
+alias(
+    name = "cupti_dsos",
+    actual = "%{remote_cuda_repo}cuda:cupti_dsos",
+)
+
+alias(
+    name = "libdevice_root",
+    actual = "%{remote_cuda_repo}cuda:libdevice_root",
+)
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 61932a8e6d1a699392c4de73ee36ed681d9eda94..4dd3169d418797fbda656d33c53e3f147b38725d 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -26,6 +26,7 @@ _TF_CUDA_VERSION = "TF_CUDA_VERSION"
 _TF_CUDNN_VERSION = "TF_CUDNN_VERSION"
 _CUDNN_INSTALL_PATH = "CUDNN_INSTALL_PATH"
 _TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES"
+_TF_CUDA_CONFIG_REPO = "TF_CUDA_CONFIG_REPO"
 
 _DEFAULT_CUDA_VERSION = ""
 _DEFAULT_CUDNN_VERSION = ""
@@ -739,19 +740,19 @@ def _create_dummy_repository(repository_ctx):
 
   # Create dummy files for the CUDA toolkit since they are still required by
   # tensorflow/core/platform/default/build_config:cuda.
-  repository_ctx.file("cuda/include/cuda.h", "")
-  repository_ctx.file("cuda/include/cublas.h", "")
-  repository_ctx.file("cuda/include/cudnn.h", "")
-  repository_ctx.file("cuda/extras/CUPTI/include/cupti.h", "")
-  repository_ctx.file("cuda/lib/%s" % _lib_name("cuda", cpu_value))
-  repository_ctx.file("cuda/lib/%s" % _lib_name("cudart", cpu_value))
-  repository_ctx.file("cuda/lib/%s" % _lib_name("cudart_static", cpu_value))
-  repository_ctx.file("cuda/lib/%s" % _lib_name("cublas", cpu_value))
-  repository_ctx.file("cuda/lib/%s" % _lib_name("cusolver", cpu_value))
-  repository_ctx.file("cuda/lib/%s" % _lib_name("cudnn", cpu_value))
-  repository_ctx.file("cuda/lib/%s" % _lib_name("curand", cpu_value))
-  repository_ctx.file("cuda/lib/%s" % _lib_name("cufft", cpu_value))
-  repository_ctx.file("cuda/lib/%s" % _lib_name("cupti", cpu_value))
+  repository_ctx.file("cuda/cuda/include/cuda.h", "")
+  repository_ctx.file("cuda/cuda/include/cublas.h", "")
+  repository_ctx.file("cuda/cuda/include/cudnn.h", "")
+  repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h", "")
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cuda", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudart", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudart_static", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cublas", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cusolver", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudnn", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("curand", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cufft", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cupti", cpu_value))
 
   # Set up cuda_config.h, which is used by
   # tensorflow/stream_executor/dso_loader.cc.
@@ -763,7 +764,7 @@ def _create_dummy_repository(repository_ctx):
                "CudaVersion(\"%s\")" % c
                for c in _DEFAULT_CUDA_COMPUTE_CAPABILITIES]),
            "%{cuda_toolkit_path}": _DEFAULT_CUDA_TOOLKIT_PATH,
-       })
+       }, "cuda/cuda/cuda_config.h")
 
   # If cuda_configure is not configured to build with GPU support, and the user
   # attempts to build with --config=cuda, add a dummy build rule to intercept
@@ -820,6 +821,13 @@ def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
     dest_files = files.replace(src_dir, '').splitlines()
     src_files = files.splitlines()
   command = []
+  if not _is_windows(repository_ctx):
+    # We clear folders that might have been generated previously to avoid
+    # undesired inclusions
+    command.append('if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi')
+    command.append('if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi')
+    command.append('if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi')
+    command.append('if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi')
   outs = []
   for i in range(len(dest_files)):
     if dest_files[i] != "":
@@ -829,7 +837,7 @@ def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
       # On Windows, symlink is not supported, so we just copy all the files.
       cmd = 'cp -f' if _is_windows(repository_ctx) else 'ln -s'
       command.append(cmd + ' "%s" "%s"' % (src_files[i] , dest))
-      outs.append('      "' + dest_dir + dest_files[i] + '",')
+      outs.append('        "' + dest_dir + dest_files[i] + '",')
   genrule = _genrule(src_dir, genrule_name, " && ".join(command),
                      "\n".join(outs))
   return genrule
@@ -846,11 +854,11 @@ def _genrule(src_dir, genrule_name, command, outs):
       genrule_name + '",\n' +
       '    outs = [\n' +
       outs +
-      '    ],\n' +
+      '\n    ],\n' +
       '    cmd = """\n' +
       command +
-      '    """,\n' +
-      ')\n\n'
+      '\n   """,\n' +
+      ')\n'
   )
 
 
@@ -883,15 +891,16 @@ def _use_cuda_clang(repository_ctx):
     return enable_cuda == "1"
   return False
 
-def _compute_cuda_extra_copts(repository_ctx, cuda_config):
+def _compute_cuda_extra_copts(repository_ctx, compute_capabilities):
   if _use_cuda_clang(repository_ctx):
-    capability_flags = ["--cuda-gpu-arch=sm_" + cap.replace(".", "") for cap in cuda_config.compute_capabilities]
+    capability_flags = ["--cuda-gpu-arch=sm_" +
+        cap.replace(".", "") for cap in compute_capabilities]
   else:
     # Capabilities are handled in the "crosstool_wrapper_driver_is_not_gcc" for nvcc
     capability_flags = []
   return str(capability_flags)
 
-def _create_cuda_repository(repository_ctx):
+def _create_local_cuda_repository(repository_ctx):
   """Creates the repository containing files set up to build with CUDA."""
   cuda_config = _get_cuda_config(repository_ctx)
 
@@ -904,19 +913,19 @@ def _create_cuda_repository(repository_ctx):
   cuda_toolkit_path = cuda_config.cuda_toolkit_path
   cuda_include_path = cuda_toolkit_path + "/include"
   genrules = [_symlink_genrule_for_dir(repository_ctx,
-      cuda_include_path, "include", "cuda-include")]
+      cuda_include_path, "cuda/include", "cuda-include")]
   genrules.append(_symlink_genrule_for_dir(repository_ctx,
-      cuda_toolkit_path + "/nvvm", "nvvm", "cuda-nvvm"))
+      cuda_toolkit_path + "/nvvm", "cuda/nvvm", "cuda-nvvm"))
   genrules.append(_symlink_genrule_for_dir(repository_ctx,
       cuda_toolkit_path + "/extras/CUPTI/include",
-      "extras/CUPTI/include", "cuda-extras"))
+      "cuda/extras/CUPTI/include", "cuda-extras"))
 
   cuda_libs = _find_libs(repository_ctx, cuda_config)
   cuda_lib_src = []
   cuda_lib_dest = []
   for lib in cuda_libs.values():
     cuda_lib_src.append(lib.path)
-    cuda_lib_dest.append("lib/" + lib.file_name)
+    cuda_lib_dest.append("cuda/lib/" + lib.file_name)
   genrules.append(_symlink_genrule_for_dir(repository_ctx, None, "", "cuda-lib",
                                        cuda_lib_src, cuda_lib_dest))
 
@@ -925,8 +934,9 @@ def _create_cuda_repository(repository_ctx):
   included_files = _read_dir(repository_ctx, cuda_include_path).replace(
       cuda_include_path, '').splitlines()
   if '/cudnn.h' not in included_files:
-    genrules.append(_symlink_genrule_for_dir(repository_ctx, None, "include/",
-        "cudnn-include", [cudnn_header_dir + "/cudnn.h"], ["cudnn.h"]))
+    genrules.append(_symlink_genrule_for_dir(repository_ctx, None,
+        "cuda/include/", "cudnn-include", [cudnn_header_dir + "/cudnn.h"],
+        ["cudnn.h"]))
   else:
     genrules.append(
             'filegroup(\n' +
@@ -939,7 +949,8 @@ def _create_cuda_repository(repository_ctx):
   _tpl(repository_ctx, "cuda:build_defs.bzl",
        {
            "%{cuda_is_configured}": "True",
-           "%{cuda_extra_copts}": _compute_cuda_extra_copts(repository_ctx, cuda_config),
+           "%{cuda_extra_copts}": _compute_cuda_extra_copts(
+               repository_ctx, cuda_config.compute_capabilities),
 
        })
   _tpl(repository_ctx, "cuda:BUILD",
@@ -997,16 +1008,35 @@ def _create_cuda_repository(repository_ctx):
                ["CudaVersion(\"%s\")" % c
                 for c in cuda_config.compute_capabilities]),
                "%{cuda_toolkit_path}": cuda_config.cuda_toolkit_path,
-       })
+       }, "cuda/cuda/cuda_config.h")
+
+def _create_remote_cuda_repository(repository_ctx, remote_config_repo):
+  """Creates pointers to a remotely configured repo set up to build with CUDA."""
+  _tpl(repository_ctx, "cuda:build_defs.bzl",
+       {
+           "%{cuda_is_configured}": "True",
+           "%{cuda_extra_copts}": _compute_cuda_extra_copts(
+               repository_ctx, _compute_capabilities(repository_ctx)),
 
+       })
+  _tpl(repository_ctx, "cuda:remote.BUILD",
+       {
+           "%{remote_cuda_repo}": remote_config_repo,
+       }, "cuda/BUILD")
+  _tpl(repository_ctx, "crosstool:remote.BUILD", {
+           "%{remote_cuda_repo}": remote_config_repo,
+       }, "crosstool/BUILD")
 
 def _cuda_autoconf_impl(repository_ctx):
   """Implementation of the cuda_autoconf repository rule."""
   if not _enable_cuda(repository_ctx):
     _create_dummy_repository(repository_ctx)
   else:
-    _create_cuda_repository(repository_ctx)
-
+    if _TF_CUDA_CONFIG_REPO in repository_ctx.os.environ:
+      _create_remote_cuda_repository(repository_ctx,
+          repository_ctx.os.environ[_TF_CUDA_CONFIG_REPO])
+    else:
+      _create_local_cuda_repository(repository_ctx)
 
 
 cuda_configure = repository_rule(
@@ -1019,6 +1049,7 @@ cuda_configure = repository_rule(
         _TF_CUDA_VERSION,
         _TF_CUDNN_VERSION,
         _TF_CUDA_COMPUTE_CAPABILITIES,
+        _TF_CUDA_CONFIG_REPO,
     ],
 )
 
diff --git a/third_party/grpc.BUILD b/third_party/grpc.BUILD
deleted file mode 100644
index b79259618f2f06c941b5a8e3427dd0d5a0fe1e40..0000000000000000000000000000000000000000
--- a/third_party/grpc.BUILD
+++ /dev/null
@@ -1,2478 +0,0 @@
-# NOTE(mrry): This file is an edited version of the following file:
-# https://raw.githubusercontent.com/grpc/grpc/d7ff4ff40071d2b486a052183e3e9f9382afb745/BUILD
-# ...with small modifications to fix the build rules for :grpc++_unsecure.
-#
-# TODO(mrry): Upstream these fixes back to the gRPC repository.
-# TODO(jart): Fix nanopb's BUILD file. Fix grpc BUILD file.
-
-# GRPC Bazel BUILD file.
-# This currently builds C, C++ and Objective-C code.
-# This file has been automatically generated from a template file.
-# Please look at the templates directory instead.
-# This file can be regenerated from the template by running
-# tools/buildgen/generate_projects.sh
-
-# Copyright 2015, Google Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-#     * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following disclaimer
-# in the documentation and/or other materials provided with the
-# distribution.
-#     * Neither the name of Google Inc. nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-licenses(["notice"])  # 3-clause BSD
-
-package(default_visibility = ["//visibility:public"])
-
-exports_files(["LICENSE"])
-
-genrule(
-    name = "pb_h",
-    outs = ["third_party/nanopb/pb.h"],
-    cmd = "echo '#include <pb.h>' >$@",
-    visibility = ["//visibility:private"],
-)
-
-genrule(
-    name = "pb_decode_h",
-    outs = ["third_party/nanopb/pb_decode.h"],
-    cmd = "echo '#include <pb_decode.h>' >$@",
-    visibility = ["//visibility:private"],
-)
-
-genrule(
-    name = "pb_encode_h",
-    outs = ["third_party/nanopb/pb_encode.h"],
-    cmd = "echo '#include <pb_encode.h>' >$@",
-    visibility = ["//visibility:private"],
-)
-
-cc_library(
-    name = "gpr",
-    srcs = [
-        "src/core/lib/profiling/basic_timers.c",
-        "src/core/lib/profiling/stap_timers.c",
-        "src/core/lib/profiling/timers.h",
-        "src/core/lib/support/alloc.c",
-        "src/core/lib/support/avl.c",
-        "src/core/lib/support/backoff.c",
-        "src/core/lib/support/backoff.h",
-        "src/core/lib/support/block_annotate.h",
-        "src/core/lib/support/cmdline.c",
-        "src/core/lib/support/cpu_iphone.c",
-        "src/core/lib/support/cpu_linux.c",
-        "src/core/lib/support/cpu_posix.c",
-        "src/core/lib/support/cpu_windows.c",
-        "src/core/lib/support/env.h",
-        "src/core/lib/support/env_linux.c",
-        "src/core/lib/support/env_posix.c",
-        "src/core/lib/support/env_windows.c",
-        "src/core/lib/support/histogram.c",
-        "src/core/lib/support/host_port.c",
-        "src/core/lib/support/log.c",
-        "src/core/lib/support/log_android.c",
-        "src/core/lib/support/log_linux.c",
-        "src/core/lib/support/log_posix.c",
-        "src/core/lib/support/log_windows.c",
-        "src/core/lib/support/murmur_hash.c",
-        "src/core/lib/support/murmur_hash.h",
-        "src/core/lib/support/slice.c",
-        "src/core/lib/support/slice_buffer.c",
-        "src/core/lib/support/stack_lockfree.c",
-        "src/core/lib/support/stack_lockfree.h",
-        "src/core/lib/support/string.c",
-        "src/core/lib/support/string.h",
-        "src/core/lib/support/string_posix.c",
-        "src/core/lib/support/string_util_windows.c",
-        "src/core/lib/support/string_windows.c",
-        "src/core/lib/support/string_windows.h",
-        "src/core/lib/support/subprocess_posix.c",
-        "src/core/lib/support/subprocess_windows.c",
-        "src/core/lib/support/sync.c",
-        "src/core/lib/support/sync_posix.c",
-        "src/core/lib/support/sync_windows.c",
-        "src/core/lib/support/thd.c",
-        "src/core/lib/support/thd_internal.h",
-        "src/core/lib/support/thd_posix.c",
-        "src/core/lib/support/thd_windows.c",
-        "src/core/lib/support/time.c",
-        "src/core/lib/support/time_posix.c",
-        "src/core/lib/support/time_precise.c",
-        "src/core/lib/support/time_precise.h",
-        "src/core/lib/support/time_windows.c",
-        "src/core/lib/support/tls_pthread.c",
-        "src/core/lib/support/tmpfile.h",
-        "src/core/lib/support/tmpfile_msys.c",
-        "src/core/lib/support/tmpfile_posix.c",
-        "src/core/lib/support/tmpfile_windows.c",
-        "src/core/lib/support/wrap_memcpy.c",
-    ],
-    hdrs = [
-        "include/grpc/impl/codegen/alloc.h",
-        "include/grpc/impl/codegen/atm.h",
-        "include/grpc/impl/codegen/atm_gcc_atomic.h",
-        "include/grpc/impl/codegen/atm_gcc_sync.h",
-        "include/grpc/impl/codegen/atm_windows.h",
-        "include/grpc/impl/codegen/log.h",
-        "include/grpc/impl/codegen/port_platform.h",
-        "include/grpc/impl/codegen/slice.h",
-        "include/grpc/impl/codegen/slice_buffer.h",
-        "include/grpc/impl/codegen/sync.h",
-        "include/grpc/impl/codegen/sync_generic.h",
-        "include/grpc/impl/codegen/sync_posix.h",
-        "include/grpc/impl/codegen/sync_windows.h",
-        "include/grpc/impl/codegen/time.h",
-        "include/grpc/support/alloc.h",
-        "include/grpc/support/atm.h",
-        "include/grpc/support/atm_gcc_atomic.h",
-        "include/grpc/support/atm_gcc_sync.h",
-        "include/grpc/support/atm_windows.h",
-        "include/grpc/support/avl.h",
-        "include/grpc/support/cmdline.h",
-        "include/grpc/support/cpu.h",
-        "include/grpc/support/histogram.h",
-        "include/grpc/support/host_port.h",
-        "include/grpc/support/log.h",
-        "include/grpc/support/log_windows.h",
-        "include/grpc/support/port_platform.h",
-        "include/grpc/support/slice.h",
-        "include/grpc/support/slice_buffer.h",
-        "include/grpc/support/string_util.h",
-        "include/grpc/support/subprocess.h",
-        "include/grpc/support/sync.h",
-        "include/grpc/support/sync_generic.h",
-        "include/grpc/support/sync_posix.h",
-        "include/grpc/support/sync_windows.h",
-        "include/grpc/support/thd.h",
-        "include/grpc/support/time.h",
-        "include/grpc/support/tls.h",
-        "include/grpc/support/tls_gcc.h",
-        "include/grpc/support/tls_msvc.h",
-        "include/grpc/support/tls_pthread.h",
-        "include/grpc/support/useful.h",
-    ],
-    includes = [
-        ".",
-        "include",
-    ],
-    linkopts = ["-lpthread"],
-)
-
-cc_library(
-    name = "grpc",
-    srcs = [
-        "src/core/ext/census/aggregation.h",
-        "src/core/ext/census/census_interface.h",
-        "src/core/ext/census/census_rpc_stats.h",
-        "src/core/ext/census/context.c",
-        "src/core/ext/census/gen/census.pb.c",
-        "src/core/ext/census/gen/census.pb.h",
-        "src/core/ext/census/grpc_context.c",
-        "src/core/ext/census/grpc_filter.c",
-        "src/core/ext/census/grpc_filter.h",
-        "src/core/ext/census/grpc_plugin.c",
-        "src/core/ext/census/initialize.c",
-        "src/core/ext/census/mlog.c",
-        "src/core/ext/census/mlog.h",
-        "src/core/ext/census/operation.c",
-        "src/core/ext/census/placeholders.c",
-        "src/core/ext/census/rpc_metric_id.h",
-        "src/core/ext/census/tracing.c",
-        "src/core/ext/client_config/channel_connectivity.c",
-        "src/core/ext/client_config/client_channel.c",
-        "src/core/ext/client_config/client_channel.h",
-        "src/core/ext/client_config/client_channel_factory.c",
-        "src/core/ext/client_config/client_channel_factory.h",
-        "src/core/ext/client_config/client_config.c",
-        "src/core/ext/client_config/client_config.h",
-        "src/core/ext/client_config/client_config_plugin.c",
-        "src/core/ext/client_config/connector.c",
-        "src/core/ext/client_config/connector.h",
-        "src/core/ext/client_config/default_initial_connect_string.c",
-        "src/core/ext/client_config/initial_connect_string.c",
-        "src/core/ext/client_config/initial_connect_string.h",
-        "src/core/ext/client_config/lb_policy.c",
-        "src/core/ext/client_config/lb_policy.h",
-        "src/core/ext/client_config/lb_policy_factory.c",
-        "src/core/ext/client_config/lb_policy_factory.h",
-        "src/core/ext/client_config/lb_policy_registry.c",
-        "src/core/ext/client_config/lb_policy_registry.h",
-        "src/core/ext/client_config/parse_address.c",
-        "src/core/ext/client_config/parse_address.h",
-        "src/core/ext/client_config/resolver.c",
-        "src/core/ext/client_config/resolver.h",
-        "src/core/ext/client_config/resolver_factory.c",
-        "src/core/ext/client_config/resolver_factory.h",
-        "src/core/ext/client_config/resolver_registry.c",
-        "src/core/ext/client_config/resolver_registry.h",
-        "src/core/ext/client_config/subchannel.c",
-        "src/core/ext/client_config/subchannel.h",
-        "src/core/ext/client_config/subchannel_call_holder.c",
-        "src/core/ext/client_config/subchannel_call_holder.h",
-        "src/core/ext/client_config/subchannel_index.c",
-        "src/core/ext/client_config/subchannel_index.h",
-        "src/core/ext/client_config/uri_parser.c",
-        "src/core/ext/client_config/uri_parser.h",
-        "src/core/ext/lb_policy/grpclb/load_balancer_api.c",
-        "src/core/ext/lb_policy/grpclb/load_balancer_api.h",
-        "src/core/ext/lb_policy/grpclb/proto/grpc/lb/v1/load_balancer.pb.c",
-        "src/core/ext/lb_policy/grpclb/proto/grpc/lb/v1/load_balancer.pb.h",
-        "src/core/ext/lb_policy/pick_first/pick_first.c",
-        "src/core/ext/lb_policy/round_robin/round_robin.c",
-        "src/core/ext/load_reporting/load_reporting.c",
-        "src/core/ext/load_reporting/load_reporting.h",
-        "src/core/ext/load_reporting/load_reporting_filter.c",
-        "src/core/ext/load_reporting/load_reporting_filter.h",
-        "src/core/ext/resolver/dns/native/dns_resolver.c",
-        "src/core/ext/resolver/sockaddr/sockaddr_resolver.c",
-        "src/core/ext/transport/chttp2/alpn/alpn.c",
-        "src/core/ext/transport/chttp2/alpn/alpn.h",
-        "src/core/ext/transport/chttp2/client/insecure/channel_create.c",
-        "src/core/ext/transport/chttp2/client/insecure/channel_create_posix.c",
-        "src/core/ext/transport/chttp2/client/secure/secure_channel_create.c",
-        "src/core/ext/transport/chttp2/server/insecure/server_chttp2.c",
-        "src/core/ext/transport/chttp2/server/insecure/server_chttp2_posix.c",
-        "src/core/ext/transport/chttp2/server/secure/server_secure_chttp2.c",
-        "src/core/ext/transport/chttp2/transport/bin_decoder.c",
-        "src/core/ext/transport/chttp2/transport/bin_decoder.h",
-        "src/core/ext/transport/chttp2/transport/bin_encoder.c",
-        "src/core/ext/transport/chttp2/transport/bin_encoder.h",
-        "src/core/ext/transport/chttp2/transport/chttp2_plugin.c",
-        "src/core/ext/transport/chttp2/transport/chttp2_transport.c",
-        "src/core/ext/transport/chttp2/transport/chttp2_transport.h",
-        "src/core/ext/transport/chttp2/transport/frame.h",
-        "src/core/ext/transport/chttp2/transport/frame_data.c",
-        "src/core/ext/transport/chttp2/transport/frame_data.h",
-        "src/core/ext/transport/chttp2/transport/frame_goaway.c",
-        "src/core/ext/transport/chttp2/transport/frame_goaway.h",
-        "src/core/ext/transport/chttp2/transport/frame_ping.c",
-        "src/core/ext/transport/chttp2/transport/frame_ping.h",
-        "src/core/ext/transport/chttp2/transport/frame_rst_stream.c",
-        "src/core/ext/transport/chttp2/transport/frame_rst_stream.h",
-        "src/core/ext/transport/chttp2/transport/frame_settings.c",
-        "src/core/ext/transport/chttp2/transport/frame_settings.h",
-        "src/core/ext/transport/chttp2/transport/frame_window_update.c",
-        "src/core/ext/transport/chttp2/transport/frame_window_update.h",
-        "src/core/ext/transport/chttp2/transport/hpack_encoder.c",
-        "src/core/ext/transport/chttp2/transport/hpack_encoder.h",
-        "src/core/ext/transport/chttp2/transport/hpack_parser.c",
-        "src/core/ext/transport/chttp2/transport/hpack_parser.h",
-        "src/core/ext/transport/chttp2/transport/hpack_table.c",
-        "src/core/ext/transport/chttp2/transport/hpack_table.h",
-        "src/core/ext/transport/chttp2/transport/http2_errors.h",
-        "src/core/ext/transport/chttp2/transport/huffsyms.c",
-        "src/core/ext/transport/chttp2/transport/huffsyms.h",
-        "src/core/ext/transport/chttp2/transport/incoming_metadata.c",
-        "src/core/ext/transport/chttp2/transport/incoming_metadata.h",
-        "src/core/ext/transport/chttp2/transport/internal.h",
-        "src/core/ext/transport/chttp2/transport/parsing.c",
-        "src/core/ext/transport/chttp2/transport/status_conversion.c",
-        "src/core/ext/transport/chttp2/transport/status_conversion.h",
-        "src/core/ext/transport/chttp2/transport/stream_lists.c",
-        "src/core/ext/transport/chttp2/transport/stream_map.c",
-        "src/core/ext/transport/chttp2/transport/stream_map.h",
-        "src/core/ext/transport/chttp2/transport/timeout_encoding.c",
-        "src/core/ext/transport/chttp2/transport/timeout_encoding.h",
-        "src/core/ext/transport/chttp2/transport/varint.c",
-        "src/core/ext/transport/chttp2/transport/varint.h",
-        "src/core/ext/transport/chttp2/transport/writing.c",
-        "src/core/lib/channel/channel_args.c",
-        "src/core/lib/channel/channel_args.h",
-        "src/core/lib/channel/channel_stack.c",
-        "src/core/lib/channel/channel_stack.h",
-        "src/core/lib/channel/channel_stack_builder.c",
-        "src/core/lib/channel/channel_stack_builder.h",
-        "src/core/lib/channel/compress_filter.c",
-        "src/core/lib/channel/compress_filter.h",
-        "src/core/lib/channel/connected_channel.c",
-        "src/core/lib/channel/connected_channel.h",
-        "src/core/lib/channel/context.h",
-        "src/core/lib/channel/http_client_filter.c",
-        "src/core/lib/channel/http_client_filter.h",
-        "src/core/lib/channel/http_server_filter.c",
-        "src/core/lib/channel/http_server_filter.h",
-        "src/core/lib/compression/algorithm_metadata.h",
-        "src/core/lib/compression/compression.c",
-        "src/core/lib/compression/message_compress.c",
-        "src/core/lib/compression/message_compress.h",
-        "src/core/lib/debug/trace.c",
-        "src/core/lib/debug/trace.h",
-        "src/core/lib/http/format_request.c",
-        "src/core/lib/http/format_request.h",
-        "src/core/lib/http/httpcli.c",
-        "src/core/lib/http/httpcli.h",
-        "src/core/lib/http/httpcli_security_connector.c",
-        "src/core/lib/http/parser.c",
-        "src/core/lib/http/parser.h",
-        "src/core/lib/iomgr/closure.c",
-        "src/core/lib/iomgr/closure.h",
-        "src/core/lib/iomgr/endpoint.c",
-        "src/core/lib/iomgr/endpoint.h",
-        "src/core/lib/iomgr/endpoint_pair.h",
-        "src/core/lib/iomgr/endpoint_pair_posix.c",
-        "src/core/lib/iomgr/endpoint_pair_windows.c",
-        "src/core/lib/iomgr/error.c",
-        "src/core/lib/iomgr/error.h",
-        "src/core/lib/iomgr/ev_epoll_linux.c",
-        "src/core/lib/iomgr/ev_epoll_linux.h",
-        "src/core/lib/iomgr/ev_poll_and_epoll_posix.c",
-        "src/core/lib/iomgr/ev_poll_and_epoll_posix.h",
-        "src/core/lib/iomgr/ev_poll_posix.c",
-        "src/core/lib/iomgr/ev_poll_posix.h",
-        "src/core/lib/iomgr/ev_posix.c",
-        "src/core/lib/iomgr/ev_posix.h",
-        "src/core/lib/iomgr/exec_ctx.c",
-        "src/core/lib/iomgr/exec_ctx.h",
-        "src/core/lib/iomgr/executor.c",
-        "src/core/lib/iomgr/executor.h",
-        "src/core/lib/iomgr/iocp_windows.c",
-        "src/core/lib/iomgr/iocp_windows.h",
-        "src/core/lib/iomgr/iomgr.c",
-        "src/core/lib/iomgr/iomgr.h",
-        "src/core/lib/iomgr/iomgr_internal.h",
-        "src/core/lib/iomgr/iomgr_posix.c",
-        "src/core/lib/iomgr/iomgr_posix.h",
-        "src/core/lib/iomgr/iomgr_windows.c",
-        "src/core/lib/iomgr/load_file.c",
-        "src/core/lib/iomgr/load_file.h",
-        "src/core/lib/iomgr/network_status_tracker.c",
-        "src/core/lib/iomgr/network_status_tracker.h",
-        "src/core/lib/iomgr/polling_entity.c",
-        "src/core/lib/iomgr/polling_entity.h",
-        "src/core/lib/iomgr/pollset.h",
-        "src/core/lib/iomgr/pollset_set.h",
-        "src/core/lib/iomgr/pollset_set_windows.c",
-        "src/core/lib/iomgr/pollset_set_windows.h",
-        "src/core/lib/iomgr/pollset_windows.c",
-        "src/core/lib/iomgr/pollset_windows.h",
-        "src/core/lib/iomgr/resolve_address.h",
-        "src/core/lib/iomgr/resolve_address_posix.c",
-        "src/core/lib/iomgr/resolve_address_windows.c",
-        "src/core/lib/iomgr/sockaddr.h",
-        "src/core/lib/iomgr/sockaddr_posix.h",
-        "src/core/lib/iomgr/sockaddr_utils.c",
-        "src/core/lib/iomgr/sockaddr_utils.h",
-        "src/core/lib/iomgr/sockaddr_windows.h",
-        "src/core/lib/iomgr/socket_utils_common_posix.c",
-        "src/core/lib/iomgr/socket_utils_linux.c",
-        "src/core/lib/iomgr/socket_utils_posix.c",
-        "src/core/lib/iomgr/socket_utils_posix.h",
-        "src/core/lib/iomgr/socket_windows.c",
-        "src/core/lib/iomgr/socket_windows.h",
-        "src/core/lib/iomgr/tcp_client.h",
-        "src/core/lib/iomgr/tcp_client_posix.c",
-        "src/core/lib/iomgr/tcp_client_windows.c",
-        "src/core/lib/iomgr/tcp_posix.c",
-        "src/core/lib/iomgr/tcp_posix.h",
-        "src/core/lib/iomgr/tcp_server.h",
-        "src/core/lib/iomgr/tcp_server_posix.c",
-        "src/core/lib/iomgr/tcp_server_windows.c",
-        "src/core/lib/iomgr/tcp_windows.c",
-        "src/core/lib/iomgr/tcp_windows.h",
-        "src/core/lib/iomgr/time_averaged_stats.c",
-        "src/core/lib/iomgr/time_averaged_stats.h",
-        "src/core/lib/iomgr/timer.c",
-        "src/core/lib/iomgr/timer.h",
-        "src/core/lib/iomgr/timer_heap.c",
-        "src/core/lib/iomgr/timer_heap.h",
-        "src/core/lib/iomgr/udp_server.c",
-        "src/core/lib/iomgr/udp_server.h",
-        "src/core/lib/iomgr/unix_sockets_posix.c",
-        "src/core/lib/iomgr/unix_sockets_posix.h",
-        "src/core/lib/iomgr/unix_sockets_posix_noop.c",
-        "src/core/lib/iomgr/wakeup_fd_eventfd.c",
-        "src/core/lib/iomgr/wakeup_fd_nospecial.c",
-        "src/core/lib/iomgr/wakeup_fd_pipe.c",
-        "src/core/lib/iomgr/wakeup_fd_pipe.h",
-        "src/core/lib/iomgr/wakeup_fd_posix.c",
-        "src/core/lib/iomgr/wakeup_fd_posix.h",
-        "src/core/lib/iomgr/workqueue.h",
-        "src/core/lib/iomgr/workqueue_posix.c",
-        "src/core/lib/iomgr/workqueue_posix.h",
-        "src/core/lib/iomgr/workqueue_windows.c",
-        "src/core/lib/iomgr/workqueue_windows.h",
-        "src/core/lib/json/json.c",
-        "src/core/lib/json/json.h",
-        "src/core/lib/json/json_common.h",
-        "src/core/lib/json/json_reader.c",
-        "src/core/lib/json/json_reader.h",
-        "src/core/lib/json/json_string.c",
-        "src/core/lib/json/json_writer.c",
-        "src/core/lib/json/json_writer.h",
-        "src/core/lib/security/context/security_context.c",
-        "src/core/lib/security/context/security_context.h",
-        "src/core/lib/security/credentials/composite/composite_credentials.c",
-        "src/core/lib/security/credentials/composite/composite_credentials.h",
-        "src/core/lib/security/credentials/credentials.c",
-        "src/core/lib/security/credentials/credentials.h",
-        "src/core/lib/security/credentials/credentials_metadata.c",
-        "src/core/lib/security/credentials/fake/fake_credentials.c",
-        "src/core/lib/security/credentials/fake/fake_credentials.h",
-        "src/core/lib/security/credentials/google_default/credentials_posix.c",
-        "src/core/lib/security/credentials/google_default/credentials_windows.c",
-        "src/core/lib/security/credentials/google_default/google_default_credentials.c",
-        "src/core/lib/security/credentials/google_default/google_default_credentials.h",
-        "src/core/lib/security/credentials/iam/iam_credentials.c",
-        "src/core/lib/security/credentials/iam/iam_credentials.h",
-        "src/core/lib/security/credentials/jwt/json_token.c",
-        "src/core/lib/security/credentials/jwt/json_token.h",
-        "src/core/lib/security/credentials/jwt/jwt_credentials.c",
-        "src/core/lib/security/credentials/jwt/jwt_credentials.h",
-        "src/core/lib/security/credentials/jwt/jwt_verifier.c",
-        "src/core/lib/security/credentials/jwt/jwt_verifier.h",
-        "src/core/lib/security/credentials/oauth2/oauth2_credentials.c",
-        "src/core/lib/security/credentials/oauth2/oauth2_credentials.h",
-        "src/core/lib/security/credentials/plugin/plugin_credentials.c",
-        "src/core/lib/security/credentials/plugin/plugin_credentials.h",
-        "src/core/lib/security/credentials/ssl/ssl_credentials.c",
-        "src/core/lib/security/credentials/ssl/ssl_credentials.h",
-        "src/core/lib/security/transport/auth_filters.h",
-        "src/core/lib/security/transport/client_auth_filter.c",
-        "src/core/lib/security/transport/handshake.c",
-        "src/core/lib/security/transport/handshake.h",
-        "src/core/lib/security/transport/secure_endpoint.c",
-        "src/core/lib/security/transport/secure_endpoint.h",
-        "src/core/lib/security/transport/security_connector.c",
-        "src/core/lib/security/transport/security_connector.h",
-        "src/core/lib/security/transport/server_auth_filter.c",
-        "src/core/lib/security/transport/tsi_error.c",
-        "src/core/lib/security/transport/tsi_error.h",
-        "src/core/lib/security/util/b64.c",
-        "src/core/lib/security/util/b64.h",
-        "src/core/lib/security/util/json_util.c",
-        "src/core/lib/security/util/json_util.h",
-        "src/core/lib/surface/alarm.c",
-        "src/core/lib/surface/api_trace.c",
-        "src/core/lib/surface/api_trace.h",
-        "src/core/lib/surface/byte_buffer.c",
-        "src/core/lib/surface/byte_buffer_reader.c",
-        "src/core/lib/surface/call.c",
-        "src/core/lib/surface/call.h",
-        "src/core/lib/surface/call_details.c",
-        "src/core/lib/surface/call_log_batch.c",
-        "src/core/lib/surface/call_test_only.h",
-        "src/core/lib/surface/channel.c",
-        "src/core/lib/surface/channel.h",
-        "src/core/lib/surface/channel_init.c",
-        "src/core/lib/surface/channel_init.h",
-        "src/core/lib/surface/channel_ping.c",
-        "src/core/lib/surface/channel_stack_type.c",
-        "src/core/lib/surface/channel_stack_type.h",
-        "src/core/lib/surface/completion_queue.c",
-        "src/core/lib/surface/completion_queue.h",
-        "src/core/lib/surface/event_string.c",
-        "src/core/lib/surface/event_string.h",
-        "src/core/lib/surface/init.c",
-        "src/core/lib/surface/init.h",
-        "src/core/lib/surface/init_secure.c",
-        "src/core/lib/surface/lame_client.c",
-        "src/core/lib/surface/lame_client.h",
-        "src/core/lib/surface/metadata_array.c",
-        "src/core/lib/surface/server.c",
-        "src/core/lib/surface/server.h",
-        "src/core/lib/surface/validate_metadata.c",
-        "src/core/lib/surface/version.c",
-        "src/core/lib/transport/byte_stream.c",
-        "src/core/lib/transport/byte_stream.h",
-        "src/core/lib/transport/connectivity_state.c",
-        "src/core/lib/transport/connectivity_state.h",
-        "src/core/lib/transport/metadata.c",
-        "src/core/lib/transport/metadata.h",
-        "src/core/lib/transport/metadata_batch.c",
-        "src/core/lib/transport/metadata_batch.h",
-        "src/core/lib/transport/static_metadata.c",
-        "src/core/lib/transport/static_metadata.h",
-        "src/core/lib/transport/transport.c",
-        "src/core/lib/transport/transport.h",
-        "src/core/lib/transport/transport_impl.h",
-        "src/core/lib/transport/transport_op_string.c",
-        "src/core/lib/tsi/fake_transport_security.c",
-        "src/core/lib/tsi/fake_transport_security.h",
-        "src/core/lib/tsi/ssl_transport_security.c",
-        "src/core/lib/tsi/ssl_transport_security.h",
-        "src/core/lib/tsi/ssl_types.h",
-        "src/core/lib/tsi/transport_security.c",
-        "src/core/lib/tsi/transport_security.h",
-        "src/core/lib/tsi/transport_security_interface.h",
-        "src/core/plugin_registry/grpc_plugin_registry.c",
-        "third_party/nanopb/pb.h",
-        "third_party/nanopb/pb_decode.h",
-        "third_party/nanopb/pb_encode.h",
-    ],
-    hdrs = [
-        "include/grpc/byte_buffer.h",
-        "include/grpc/byte_buffer_reader.h",
-        "include/grpc/census.h",
-        "include/grpc/compression.h",
-        "include/grpc/grpc.h",
-        "include/grpc/grpc_posix.h",
-        "include/grpc/grpc_security.h",
-        "include/grpc/grpc_security_constants.h",
-        "include/grpc/impl/codegen/alloc.h",
-        "include/grpc/impl/codegen/atm.h",
-        "include/grpc/impl/codegen/atm_gcc_atomic.h",
-        "include/grpc/impl/codegen/atm_gcc_sync.h",
-        "include/grpc/impl/codegen/atm_windows.h",
-        "include/grpc/impl/codegen/byte_buffer.h",
-        "include/grpc/impl/codegen/byte_buffer_reader.h",
-        "include/grpc/impl/codegen/compression_types.h",
-        "include/grpc/impl/codegen/connectivity_state.h",
-        "include/grpc/impl/codegen/grpc_types.h",
-        "include/grpc/impl/codegen/log.h",
-        "include/grpc/impl/codegen/port_platform.h",
-        "include/grpc/impl/codegen/propagation_bits.h",
-        "include/grpc/impl/codegen/slice.h",
-        "include/grpc/impl/codegen/slice_buffer.h",
-        "include/grpc/impl/codegen/status.h",
-        "include/grpc/impl/codegen/sync.h",
-        "include/grpc/impl/codegen/sync_generic.h",
-        "include/grpc/impl/codegen/sync_posix.h",
-        "include/grpc/impl/codegen/sync_windows.h",
-        "include/grpc/impl/codegen/time.h",
-        "include/grpc/status.h",
-    ],
-    copts = [
-        "-std=gnu99",
-    ],
-    includes = [
-        ".",
-        "include",
-    ],
-    deps = [
-        ":gpr",
-        "//external:libssl",
-        "//external:nanopb",
-        "//external:zlib",
-    ],
-)
-
-cc_library(
-    name = "grpc_cronet",
-    srcs = [
-        "src/core/ext/client_config/channel_connectivity.c",
-        "src/core/ext/client_config/client_channel.c",
-        "src/core/ext/client_config/client_channel.h",
-        "src/core/ext/client_config/client_channel_factory.c",
-        "src/core/ext/client_config/client_channel_factory.h",
-        "src/core/ext/client_config/client_config.c",
-        "src/core/ext/client_config/client_config.h",
-        "src/core/ext/client_config/client_config_plugin.c",
-        "src/core/ext/client_config/connector.c",
-        "src/core/ext/client_config/connector.h",
-        "src/core/ext/client_config/default_initial_connect_string.c",
-        "src/core/ext/client_config/initial_connect_string.c",
-        "src/core/ext/client_config/initial_connect_string.h",
-        "src/core/ext/client_config/lb_policy.c",
-        "src/core/ext/client_config/lb_policy.h",
-        "src/core/ext/client_config/lb_policy_factory.c",
-        "src/core/ext/client_config/lb_policy_factory.h",
-        "src/core/ext/client_config/lb_policy_registry.c",
-        "src/core/ext/client_config/lb_policy_registry.h",
-        "src/core/ext/client_config/parse_address.c",
-        "src/core/ext/client_config/parse_address.h",
-        "src/core/ext/client_config/resolver.c",
-        "src/core/ext/client_config/resolver.h",
-        "src/core/ext/client_config/resolver_factory.c",
-        "src/core/ext/client_config/resolver_factory.h",
-        "src/core/ext/client_config/resolver_registry.c",
-        "src/core/ext/client_config/resolver_registry.h",
-        "src/core/ext/client_config/subchannel.c",
-        "src/core/ext/client_config/subchannel.h",
-        "src/core/ext/client_config/subchannel_call_holder.c",
-        "src/core/ext/client_config/subchannel_call_holder.h",
-        "src/core/ext/client_config/subchannel_index.c",
-        "src/core/ext/client_config/subchannel_index.h",
-        "src/core/ext/client_config/uri_parser.c",
-        "src/core/ext/client_config/uri_parser.h",
-        "src/core/ext/transport/chttp2/alpn/alpn.c",
-        "src/core/ext/transport/chttp2/alpn/alpn.h",
-        "src/core/ext/transport/chttp2/client/secure/secure_channel_create.c",
-        "src/core/ext/transport/chttp2/transport/bin_decoder.c",
-        "src/core/ext/transport/chttp2/transport/bin_decoder.h",
-        "src/core/ext/transport/chttp2/transport/bin_encoder.c",
-        "src/core/ext/transport/chttp2/transport/bin_encoder.h",
-        "src/core/ext/transport/chttp2/transport/chttp2_plugin.c",
-        "src/core/ext/transport/chttp2/transport/chttp2_transport.c",
-        "src/core/ext/transport/chttp2/transport/chttp2_transport.h",
-        "src/core/ext/transport/chttp2/transport/frame.h",
-        "src/core/ext/transport/chttp2/transport/frame_data.c",
-        "src/core/ext/transport/chttp2/transport/frame_data.h",
-        "src/core/ext/transport/chttp2/transport/frame_goaway.c",
-        "src/core/ext/transport/chttp2/transport/frame_goaway.h",
-        "src/core/ext/transport/chttp2/transport/frame_ping.c",
-        "src/core/ext/transport/chttp2/transport/frame_ping.h",
-        "src/core/ext/transport/chttp2/transport/frame_rst_stream.c",
-        "src/core/ext/transport/chttp2/transport/frame_rst_stream.h",
-        "src/core/ext/transport/chttp2/transport/frame_settings.c",
-        "src/core/ext/transport/chttp2/transport/frame_settings.h",
-        "src/core/ext/transport/chttp2/transport/frame_window_update.c",
-        "src/core/ext/transport/chttp2/transport/frame_window_update.h",
-        "src/core/ext/transport/chttp2/transport/hpack_encoder.c",
-        "src/core/ext/transport/chttp2/transport/hpack_encoder.h",
-        "src/core/ext/transport/chttp2/transport/hpack_parser.c",
-        "src/core/ext/transport/chttp2/transport/hpack_parser.h",
-        "src/core/ext/transport/chttp2/transport/hpack_table.c",
-        "src/core/ext/transport/chttp2/transport/hpack_table.h",
-        "src/core/ext/transport/chttp2/transport/http2_errors.h",
-        "src/core/ext/transport/chttp2/transport/huffsyms.c",
-        "src/core/ext/transport/chttp2/transport/huffsyms.h",
-        "src/core/ext/transport/chttp2/transport/incoming_metadata.c",
-        "src/core/ext/transport/chttp2/transport/incoming_metadata.h",
-        "src/core/ext/transport/chttp2/transport/internal.h",
-        "src/core/ext/transport/chttp2/transport/parsing.c",
-        "src/core/ext/transport/chttp2/transport/status_conversion.c",
-        "src/core/ext/transport/chttp2/transport/status_conversion.h",
-        "src/core/ext/transport/chttp2/transport/stream_lists.c",
-        "src/core/ext/transport/chttp2/transport/stream_map.c",
-        "src/core/ext/transport/chttp2/transport/stream_map.h",
-        "src/core/ext/transport/chttp2/transport/timeout_encoding.c",
-        "src/core/ext/transport/chttp2/transport/timeout_encoding.h",
-        "src/core/ext/transport/chttp2/transport/varint.c",
-        "src/core/ext/transport/chttp2/transport/varint.h",
-        "src/core/ext/transport/chttp2/transport/writing.c",
-        "src/core/ext/transport/cronet/client/secure/cronet_channel_create.c",
-        "src/core/ext/transport/cronet/transport/cronet_api_dummy.c",
-        "src/core/ext/transport/cronet/transport/cronet_transport.c",
-        "src/core/lib/channel/channel_args.c",
-        "src/core/lib/channel/channel_args.h",
-        "src/core/lib/channel/channel_stack.c",
-        "src/core/lib/channel/channel_stack.h",
-        "src/core/lib/channel/channel_stack_builder.c",
-        "src/core/lib/channel/channel_stack_builder.h",
-        "src/core/lib/channel/compress_filter.c",
-        "src/core/lib/channel/compress_filter.h",
-        "src/core/lib/channel/connected_channel.c",
-        "src/core/lib/channel/connected_channel.h",
-        "src/core/lib/channel/context.h",
-        "src/core/lib/channel/http_client_filter.c",
-        "src/core/lib/channel/http_client_filter.h",
-        "src/core/lib/channel/http_server_filter.c",
-        "src/core/lib/channel/http_server_filter.h",
-        "src/core/lib/compression/algorithm_metadata.h",
-        "src/core/lib/compression/compression.c",
-        "src/core/lib/compression/message_compress.c",
-        "src/core/lib/compression/message_compress.h",
-        "src/core/lib/debug/trace.c",
-        "src/core/lib/debug/trace.h",
-        "src/core/lib/http/format_request.c",
-        "src/core/lib/http/format_request.h",
-        "src/core/lib/http/httpcli.c",
-        "src/core/lib/http/httpcli.h",
-        "src/core/lib/http/httpcli_security_connector.c",
-        "src/core/lib/http/parser.c",
-        "src/core/lib/http/parser.h",
-        "src/core/lib/iomgr/closure.c",
-        "src/core/lib/iomgr/closure.h",
-        "src/core/lib/iomgr/endpoint.c",
-        "src/core/lib/iomgr/endpoint.h",
-        "src/core/lib/iomgr/endpoint_pair.h",
-        "src/core/lib/iomgr/endpoint_pair_posix.c",
-        "src/core/lib/iomgr/endpoint_pair_windows.c",
-        "src/core/lib/iomgr/error.c",
-        "src/core/lib/iomgr/error.h",
-        "src/core/lib/iomgr/ev_epoll_linux.c",
-        "src/core/lib/iomgr/ev_epoll_linux.h",
-        "src/core/lib/iomgr/ev_poll_and_epoll_posix.c",
-        "src/core/lib/iomgr/ev_poll_and_epoll_posix.h",
-        "src/core/lib/iomgr/ev_poll_posix.c",
-        "src/core/lib/iomgr/ev_poll_posix.h",
-        "src/core/lib/iomgr/ev_posix.c",
-        "src/core/lib/iomgr/ev_posix.h",
-        "src/core/lib/iomgr/exec_ctx.c",
-        "src/core/lib/iomgr/exec_ctx.h",
-        "src/core/lib/iomgr/executor.c",
-        "src/core/lib/iomgr/executor.h",
-        "src/core/lib/iomgr/iocp_windows.c",
-        "src/core/lib/iomgr/iocp_windows.h",
-        "src/core/lib/iomgr/iomgr.c",
-        "src/core/lib/iomgr/iomgr.h",
-        "src/core/lib/iomgr/iomgr_internal.h",
-        "src/core/lib/iomgr/iomgr_posix.c",
-        "src/core/lib/iomgr/iomgr_posix.h",
-        "src/core/lib/iomgr/iomgr_windows.c",
-        "src/core/lib/iomgr/load_file.c",
-        "src/core/lib/iomgr/load_file.h",
-        "src/core/lib/iomgr/network_status_tracker.c",
-        "src/core/lib/iomgr/network_status_tracker.h",
-        "src/core/lib/iomgr/polling_entity.c",
-        "src/core/lib/iomgr/polling_entity.h",
-        "src/core/lib/iomgr/pollset.h",
-        "src/core/lib/iomgr/pollset_set.h",
-        "src/core/lib/iomgr/pollset_set_windows.c",
-        "src/core/lib/iomgr/pollset_set_windows.h",
-        "src/core/lib/iomgr/pollset_windows.c",
-        "src/core/lib/iomgr/pollset_windows.h",
-        "src/core/lib/iomgr/resolve_address.h",
-        "src/core/lib/iomgr/resolve_address_posix.c",
-        "src/core/lib/iomgr/resolve_address_windows.c",
-        "src/core/lib/iomgr/sockaddr.h",
-        "src/core/lib/iomgr/sockaddr_posix.h",
-        "src/core/lib/iomgr/sockaddr_utils.c",
-        "src/core/lib/iomgr/sockaddr_utils.h",
-        "src/core/lib/iomgr/sockaddr_windows.h",
-        "src/core/lib/iomgr/socket_utils_common_posix.c",
-        "src/core/lib/iomgr/socket_utils_linux.c",
-        "src/core/lib/iomgr/socket_utils_posix.c",
-        "src/core/lib/iomgr/socket_utils_posix.h",
-        "src/core/lib/iomgr/socket_windows.c",
-        "src/core/lib/iomgr/socket_windows.h",
-        "src/core/lib/iomgr/tcp_client.h",
-        "src/core/lib/iomgr/tcp_client_posix.c",
-        "src/core/lib/iomgr/tcp_client_windows.c",
-        "src/core/lib/iomgr/tcp_posix.c",
-        "src/core/lib/iomgr/tcp_posix.h",
-        "src/core/lib/iomgr/tcp_server.h",
-        "src/core/lib/iomgr/tcp_server_posix.c",
-        "src/core/lib/iomgr/tcp_server_windows.c",
-        "src/core/lib/iomgr/tcp_windows.c",
-        "src/core/lib/iomgr/tcp_windows.h",
-        "src/core/lib/iomgr/time_averaged_stats.c",
-        "src/core/lib/iomgr/time_averaged_stats.h",
-        "src/core/lib/iomgr/timer.c",
-        "src/core/lib/iomgr/timer.h",
-        "src/core/lib/iomgr/timer_heap.c",
-        "src/core/lib/iomgr/timer_heap.h",
-        "src/core/lib/iomgr/udp_server.c",
-        "src/core/lib/iomgr/udp_server.h",
-        "src/core/lib/iomgr/unix_sockets_posix.c",
-        "src/core/lib/iomgr/unix_sockets_posix.h",
-        "src/core/lib/iomgr/unix_sockets_posix_noop.c",
-        "src/core/lib/iomgr/wakeup_fd_eventfd.c",
-        "src/core/lib/iomgr/wakeup_fd_nospecial.c",
-        "src/core/lib/iomgr/wakeup_fd_pipe.c",
-        "src/core/lib/iomgr/wakeup_fd_pipe.h",
-        "src/core/lib/iomgr/wakeup_fd_posix.c",
-        "src/core/lib/iomgr/wakeup_fd_posix.h",
-        "src/core/lib/iomgr/workqueue.h",
-        "src/core/lib/iomgr/workqueue_posix.c",
-        "src/core/lib/iomgr/workqueue_posix.h",
-        "src/core/lib/iomgr/workqueue_windows.c",
-        "src/core/lib/iomgr/workqueue_windows.h",
-        "src/core/lib/json/json.c",
-        "src/core/lib/json/json.h",
-        "src/core/lib/json/json_common.h",
-        "src/core/lib/json/json_reader.c",
-        "src/core/lib/json/json_reader.h",
-        "src/core/lib/json/json_string.c",
-        "src/core/lib/json/json_writer.c",
-        "src/core/lib/json/json_writer.h",
-        "src/core/lib/security/context/security_context.c",
-        "src/core/lib/security/context/security_context.h",
-        "src/core/lib/security/credentials/composite/composite_credentials.c",
-        "src/core/lib/security/credentials/composite/composite_credentials.h",
-        "src/core/lib/security/credentials/credentials.c",
-        "src/core/lib/security/credentials/credentials.h",
-        "src/core/lib/security/credentials/credentials_metadata.c",
-        "src/core/lib/security/credentials/fake/fake_credentials.c",
-        "src/core/lib/security/credentials/fake/fake_credentials.h",
-        "src/core/lib/security/credentials/google_default/credentials_posix.c",
-        "src/core/lib/security/credentials/google_default/credentials_windows.c",
-        "src/core/lib/security/credentials/google_default/google_default_credentials.c",
-        "src/core/lib/security/credentials/google_default/google_default_credentials.h",
-        "src/core/lib/security/credentials/iam/iam_credentials.c",
-        "src/core/lib/security/credentials/iam/iam_credentials.h",
-        "src/core/lib/security/credentials/jwt/json_token.c",
-        "src/core/lib/security/credentials/jwt/json_token.h",
-        "src/core/lib/security/credentials/jwt/jwt_credentials.c",
-        "src/core/lib/security/credentials/jwt/jwt_credentials.h",
-        "src/core/lib/security/credentials/jwt/jwt_verifier.c",
-        "src/core/lib/security/credentials/jwt/jwt_verifier.h",
-        "src/core/lib/security/credentials/oauth2/oauth2_credentials.c",
-        "src/core/lib/security/credentials/oauth2/oauth2_credentials.h",
-        "src/core/lib/security/credentials/plugin/plugin_credentials.c",
-        "src/core/lib/security/credentials/plugin/plugin_credentials.h",
-        "src/core/lib/security/credentials/ssl/ssl_credentials.c",
-        "src/core/lib/security/credentials/ssl/ssl_credentials.h",
-        "src/core/lib/security/transport/auth_filters.h",
-        "src/core/lib/security/transport/client_auth_filter.c",
-        "src/core/lib/security/transport/handshake.c",
-        "src/core/lib/security/transport/handshake.h",
-        "src/core/lib/security/transport/secure_endpoint.c",
-        "src/core/lib/security/transport/secure_endpoint.h",
-        "src/core/lib/security/transport/security_connector.c",
-        "src/core/lib/security/transport/security_connector.h",
-        "src/core/lib/security/transport/server_auth_filter.c",
-        "src/core/lib/security/transport/tsi_error.c",
-        "src/core/lib/security/transport/tsi_error.h",
-        "src/core/lib/security/util/b64.c",
-        "src/core/lib/security/util/b64.h",
-        "src/core/lib/security/util/json_util.c",
-        "src/core/lib/security/util/json_util.h",
-        "src/core/lib/surface/alarm.c",
-        "src/core/lib/surface/api_trace.c",
-        "src/core/lib/surface/api_trace.h",
-        "src/core/lib/surface/byte_buffer.c",
-        "src/core/lib/surface/byte_buffer_reader.c",
-        "src/core/lib/surface/call.c",
-        "src/core/lib/surface/call.h",
-        "src/core/lib/surface/call_details.c",
-        "src/core/lib/surface/call_log_batch.c",
-        "src/core/lib/surface/call_test_only.h",
-        "src/core/lib/surface/channel.c",
-        "src/core/lib/surface/channel.h",
-        "src/core/lib/surface/channel_init.c",
-        "src/core/lib/surface/channel_init.h",
-        "src/core/lib/surface/channel_ping.c",
-        "src/core/lib/surface/channel_stack_type.c",
-        "src/core/lib/surface/channel_stack_type.h",
-        "src/core/lib/surface/completion_queue.c",
-        "src/core/lib/surface/completion_queue.h",
-        "src/core/lib/surface/event_string.c",
-        "src/core/lib/surface/event_string.h",
-        "src/core/lib/surface/init.c",
-        "src/core/lib/surface/init.h",
-        "src/core/lib/surface/init_secure.c",
-        "src/core/lib/surface/lame_client.c",
-        "src/core/lib/surface/lame_client.h",
-        "src/core/lib/surface/metadata_array.c",
-        "src/core/lib/surface/server.c",
-        "src/core/lib/surface/server.h",
-        "src/core/lib/surface/validate_metadata.c",
-        "src/core/lib/surface/version.c",
-        "src/core/lib/transport/byte_stream.c",
-        "src/core/lib/transport/byte_stream.h",
-        "src/core/lib/transport/connectivity_state.c",
-        "src/core/lib/transport/connectivity_state.h",
-        "src/core/lib/transport/metadata.c",
-        "src/core/lib/transport/metadata.h",
-        "src/core/lib/transport/metadata_batch.c",
-        "src/core/lib/transport/metadata_batch.h",
-        "src/core/lib/transport/static_metadata.c",
-        "src/core/lib/transport/static_metadata.h",
-        "src/core/lib/transport/transport.c",
-        "src/core/lib/transport/transport.h",
-        "src/core/lib/transport/transport_impl.h",
-        "src/core/lib/transport/transport_op_string.c",
-        "src/core/lib/tsi/fake_transport_security.c",
-        "src/core/lib/tsi/fake_transport_security.h",
-        "src/core/lib/tsi/ssl_transport_security.c",
-        "src/core/lib/tsi/ssl_transport_security.h",
-        "src/core/lib/tsi/ssl_types.h",
-        "src/core/lib/tsi/transport_security.c",
-        "src/core/lib/tsi/transport_security.h",
-        "src/core/lib/tsi/transport_security_interface.h",
-        "src/core/plugin_registry/grpc_cronet_plugin_registry.c",
-        "third_party/nanopb/pb.h",
-        "third_party/nanopb/pb_decode.h",
-        "third_party/nanopb/pb_encode.h",
-        "third_party/objective_c/Cronet/cronet_c_for_grpc.h",
-    ],
-    hdrs = [
-        "include/grpc/byte_buffer.h",
-        "include/grpc/byte_buffer_reader.h",
-        "include/grpc/compression.h",
-        "include/grpc/grpc.h",
-        "include/grpc/grpc_cronet.h",
-        "include/grpc/grpc_posix.h",
-        "include/grpc/grpc_security.h",
-        "include/grpc/grpc_security_constants.h",
-        "include/grpc/impl/codegen/alloc.h",
-        "include/grpc/impl/codegen/atm.h",
-        "include/grpc/impl/codegen/atm_gcc_atomic.h",
-        "include/grpc/impl/codegen/atm_gcc_sync.h",
-        "include/grpc/impl/codegen/atm_windows.h",
-        "include/grpc/impl/codegen/byte_buffer.h",
-        "include/grpc/impl/codegen/byte_buffer_reader.h",
-        "include/grpc/impl/codegen/compression_types.h",
-        "include/grpc/impl/codegen/connectivity_state.h",
-        "include/grpc/impl/codegen/grpc_types.h",
-        "include/grpc/impl/codegen/log.h",
-        "include/grpc/impl/codegen/port_platform.h",
-        "include/grpc/impl/codegen/propagation_bits.h",
-        "include/grpc/impl/codegen/slice.h",
-        "include/grpc/impl/codegen/slice_buffer.h",
-        "include/grpc/impl/codegen/status.h",
-        "include/grpc/impl/codegen/sync.h",
-        "include/grpc/impl/codegen/sync_generic.h",
-        "include/grpc/impl/codegen/sync_posix.h",
-        "include/grpc/impl/codegen/sync_windows.h",
-        "include/grpc/impl/codegen/time.h",
-        "include/grpc/status.h",
-    ],
-    includes = [
-        ".",
-        "include",
-    ],
-    deps = [
-        ":gpr",
-        "//external:libssl",
-    ],
-)
-
-cc_library(
-    name = "grpc_unsecure",
-    srcs = [
-        "src/core/ext/census/aggregation.h",
-        "src/core/ext/census/census_interface.h",
-        "src/core/ext/census/census_rpc_stats.h",
-        "src/core/ext/census/context.c",
-        "src/core/ext/census/gen/census.pb.c",
-        "src/core/ext/census/gen/census.pb.h",
-        "src/core/ext/census/grpc_context.c",
-        "src/core/ext/census/grpc_filter.c",
-        "src/core/ext/census/grpc_filter.h",
-        "src/core/ext/census/grpc_plugin.c",
-        "src/core/ext/census/initialize.c",
-        "src/core/ext/census/mlog.c",
-        "src/core/ext/census/mlog.h",
-        "src/core/ext/census/operation.c",
-        "src/core/ext/census/placeholders.c",
-        "src/core/ext/census/rpc_metric_id.h",
-        "src/core/ext/census/tracing.c",
-        "src/core/ext/client_config/channel_connectivity.c",
-        "src/core/ext/client_config/client_channel.c",
-        "src/core/ext/client_config/client_channel.h",
-        "src/core/ext/client_config/client_channel_factory.c",
-        "src/core/ext/client_config/client_channel_factory.h",
-        "src/core/ext/client_config/client_config.c",
-        "src/core/ext/client_config/client_config.h",
-        "src/core/ext/client_config/client_config_plugin.c",
-        "src/core/ext/client_config/connector.c",
-        "src/core/ext/client_config/connector.h",
-        "src/core/ext/client_config/default_initial_connect_string.c",
-        "src/core/ext/client_config/initial_connect_string.c",
-        "src/core/ext/client_config/initial_connect_string.h",
-        "src/core/ext/client_config/lb_policy.c",
-        "src/core/ext/client_config/lb_policy.h",
-        "src/core/ext/client_config/lb_policy_factory.c",
-        "src/core/ext/client_config/lb_policy_factory.h",
-        "src/core/ext/client_config/lb_policy_registry.c",
-        "src/core/ext/client_config/lb_policy_registry.h",
-        "src/core/ext/client_config/parse_address.c",
-        "src/core/ext/client_config/parse_address.h",
-        "src/core/ext/client_config/resolver.c",
-        "src/core/ext/client_config/resolver.h",
-        "src/core/ext/client_config/resolver_factory.c",
-        "src/core/ext/client_config/resolver_factory.h",
-        "src/core/ext/client_config/resolver_registry.c",
-        "src/core/ext/client_config/resolver_registry.h",
-        "src/core/ext/client_config/subchannel.c",
-        "src/core/ext/client_config/subchannel.h",
-        "src/core/ext/client_config/subchannel_call_holder.c",
-        "src/core/ext/client_config/subchannel_call_holder.h",
-        "src/core/ext/client_config/subchannel_index.c",
-        "src/core/ext/client_config/subchannel_index.h",
-        "src/core/ext/client_config/uri_parser.c",
-        "src/core/ext/client_config/uri_parser.h",
-        "src/core/ext/lb_policy/grpclb/load_balancer_api.c",
-        "src/core/ext/lb_policy/grpclb/load_balancer_api.h",
-        "src/core/ext/lb_policy/grpclb/proto/grpc/lb/v1/load_balancer.pb.c",
-        "src/core/ext/lb_policy/grpclb/proto/grpc/lb/v1/load_balancer.pb.h",
-        "src/core/ext/lb_policy/pick_first/pick_first.c",
-        "src/core/ext/lb_policy/round_robin/round_robin.c",
-        "src/core/ext/load_reporting/load_reporting.c",
-        "src/core/ext/load_reporting/load_reporting.h",
-        "src/core/ext/load_reporting/load_reporting_filter.c",
-        "src/core/ext/load_reporting/load_reporting_filter.h",
-        "src/core/ext/resolver/dns/native/dns_resolver.c",
-        "src/core/ext/resolver/sockaddr/sockaddr_resolver.c",
-        "src/core/ext/transport/chttp2/alpn/alpn.c",
-        "src/core/ext/transport/chttp2/alpn/alpn.h",
-        "src/core/ext/transport/chttp2/client/insecure/channel_create.c",
-        "src/core/ext/transport/chttp2/client/insecure/channel_create_posix.c",
-        "src/core/ext/transport/chttp2/server/insecure/server_chttp2.c",
-        "src/core/ext/transport/chttp2/server/insecure/server_chttp2_posix.c",
-        "src/core/ext/transport/chttp2/transport/bin_decoder.c",
-        "src/core/ext/transport/chttp2/transport/bin_decoder.h",
-        "src/core/ext/transport/chttp2/transport/bin_encoder.c",
-        "src/core/ext/transport/chttp2/transport/bin_encoder.h",
-        "src/core/ext/transport/chttp2/transport/chttp2_plugin.c",
-        "src/core/ext/transport/chttp2/transport/chttp2_transport.c",
-        "src/core/ext/transport/chttp2/transport/chttp2_transport.h",
-        "src/core/ext/transport/chttp2/transport/frame.h",
-        "src/core/ext/transport/chttp2/transport/frame_data.c",
-        "src/core/ext/transport/chttp2/transport/frame_data.h",
-        "src/core/ext/transport/chttp2/transport/frame_goaway.c",
-        "src/core/ext/transport/chttp2/transport/frame_goaway.h",
-        "src/core/ext/transport/chttp2/transport/frame_ping.c",
-        "src/core/ext/transport/chttp2/transport/frame_ping.h",
-        "src/core/ext/transport/chttp2/transport/frame_rst_stream.c",
-        "src/core/ext/transport/chttp2/transport/frame_rst_stream.h",
-        "src/core/ext/transport/chttp2/transport/frame_settings.c",
-        "src/core/ext/transport/chttp2/transport/frame_settings.h",
-        "src/core/ext/transport/chttp2/transport/frame_window_update.c",
-        "src/core/ext/transport/chttp2/transport/frame_window_update.h",
-        "src/core/ext/transport/chttp2/transport/hpack_encoder.c",
-        "src/core/ext/transport/chttp2/transport/hpack_encoder.h",
-        "src/core/ext/transport/chttp2/transport/hpack_parser.c",
-        "src/core/ext/transport/chttp2/transport/hpack_parser.h",
-        "src/core/ext/transport/chttp2/transport/hpack_table.c",
-        "src/core/ext/transport/chttp2/transport/hpack_table.h",
-        "src/core/ext/transport/chttp2/transport/http2_errors.h",
-        "src/core/ext/transport/chttp2/transport/huffsyms.c",
-        "src/core/ext/transport/chttp2/transport/huffsyms.h",
-        "src/core/ext/transport/chttp2/transport/incoming_metadata.c",
-        "src/core/ext/transport/chttp2/transport/incoming_metadata.h",
-        "src/core/ext/transport/chttp2/transport/internal.h",
-        "src/core/ext/transport/chttp2/transport/parsing.c",
-        "src/core/ext/transport/chttp2/transport/status_conversion.c",
-        "src/core/ext/transport/chttp2/transport/status_conversion.h",
-        "src/core/ext/transport/chttp2/transport/stream_lists.c",
-        "src/core/ext/transport/chttp2/transport/stream_map.c",
-        "src/core/ext/transport/chttp2/transport/stream_map.h",
-        "src/core/ext/transport/chttp2/transport/timeout_encoding.c",
-        "src/core/ext/transport/chttp2/transport/timeout_encoding.h",
-        "src/core/ext/transport/chttp2/transport/varint.c",
-        "src/core/ext/transport/chttp2/transport/varint.h",
-        "src/core/ext/transport/chttp2/transport/writing.c",
-        "src/core/lib/channel/channel_args.c",
-        "src/core/lib/channel/channel_args.h",
-        "src/core/lib/channel/channel_stack.c",
-        "src/core/lib/channel/channel_stack.h",
-        "src/core/lib/channel/channel_stack_builder.c",
-        "src/core/lib/channel/channel_stack_builder.h",
-        "src/core/lib/channel/compress_filter.c",
-        "src/core/lib/channel/compress_filter.h",
-        "src/core/lib/channel/connected_channel.c",
-        "src/core/lib/channel/connected_channel.h",
-        "src/core/lib/channel/context.h",
-        "src/core/lib/channel/http_client_filter.c",
-        "src/core/lib/channel/http_client_filter.h",
-        "src/core/lib/channel/http_server_filter.c",
-        "src/core/lib/channel/http_server_filter.h",
-        "src/core/lib/compression/algorithm_metadata.h",
-        "src/core/lib/compression/compression.c",
-        "src/core/lib/compression/message_compress.c",
-        "src/core/lib/compression/message_compress.h",
-        "src/core/lib/debug/trace.c",
-        "src/core/lib/debug/trace.h",
-        "src/core/lib/http/format_request.c",
-        "src/core/lib/http/format_request.h",
-        "src/core/lib/http/httpcli.c",
-        "src/core/lib/http/httpcli.h",
-        "src/core/lib/http/parser.c",
-        "src/core/lib/http/parser.h",
-        "src/core/lib/iomgr/closure.c",
-        "src/core/lib/iomgr/closure.h",
-        "src/core/lib/iomgr/endpoint.c",
-        "src/core/lib/iomgr/endpoint.h",
-        "src/core/lib/iomgr/endpoint_pair.h",
-        "src/core/lib/iomgr/endpoint_pair_posix.c",
-        "src/core/lib/iomgr/endpoint_pair_windows.c",
-        "src/core/lib/iomgr/error.c",
-        "src/core/lib/iomgr/error.h",
-        "src/core/lib/iomgr/ev_epoll_linux.c",
-        "src/core/lib/iomgr/ev_epoll_linux.h",
-        "src/core/lib/iomgr/ev_poll_and_epoll_posix.c",
-        "src/core/lib/iomgr/ev_poll_and_epoll_posix.h",
-        "src/core/lib/iomgr/ev_poll_posix.c",
-        "src/core/lib/iomgr/ev_poll_posix.h",
-        "src/core/lib/iomgr/ev_posix.c",
-        "src/core/lib/iomgr/ev_posix.h",
-        "src/core/lib/iomgr/exec_ctx.c",
-        "src/core/lib/iomgr/exec_ctx.h",
-        "src/core/lib/iomgr/executor.c",
-        "src/core/lib/iomgr/executor.h",
-        "src/core/lib/iomgr/iocp_windows.c",
-        "src/core/lib/iomgr/iocp_windows.h",
-        "src/core/lib/iomgr/iomgr.c",
-        "src/core/lib/iomgr/iomgr.h",
-        "src/core/lib/iomgr/iomgr_internal.h",
-        "src/core/lib/iomgr/iomgr_posix.c",
-        "src/core/lib/iomgr/iomgr_posix.h",
-        "src/core/lib/iomgr/iomgr_windows.c",
-        "src/core/lib/iomgr/load_file.c",
-        "src/core/lib/iomgr/load_file.h",
-        "src/core/lib/iomgr/network_status_tracker.c",
-        "src/core/lib/iomgr/network_status_tracker.h",
-        "src/core/lib/iomgr/polling_entity.c",
-        "src/core/lib/iomgr/polling_entity.h",
-        "src/core/lib/iomgr/pollset.h",
-        "src/core/lib/iomgr/pollset_set.h",
-        "src/core/lib/iomgr/pollset_set_windows.c",
-        "src/core/lib/iomgr/pollset_set_windows.h",
-        "src/core/lib/iomgr/pollset_windows.c",
-        "src/core/lib/iomgr/pollset_windows.h",
-        "src/core/lib/iomgr/resolve_address.h",
-        "src/core/lib/iomgr/resolve_address_posix.c",
-        "src/core/lib/iomgr/resolve_address_windows.c",
-        "src/core/lib/iomgr/sockaddr.h",
-        "src/core/lib/iomgr/sockaddr_posix.h",
-        "src/core/lib/iomgr/sockaddr_utils.c",
-        "src/core/lib/iomgr/sockaddr_utils.h",
-        "src/core/lib/iomgr/sockaddr_windows.h",
-        "src/core/lib/iomgr/socket_utils_common_posix.c",
-        "src/core/lib/iomgr/socket_utils_linux.c",
-        "src/core/lib/iomgr/socket_utils_posix.c",
-        "src/core/lib/iomgr/socket_utils_posix.h",
-        "src/core/lib/iomgr/socket_windows.c",
-        "src/core/lib/iomgr/socket_windows.h",
-        "src/core/lib/iomgr/tcp_client.h",
-        "src/core/lib/iomgr/tcp_client_posix.c",
-        "src/core/lib/iomgr/tcp_client_windows.c",
-        "src/core/lib/iomgr/tcp_posix.c",
-        "src/core/lib/iomgr/tcp_posix.h",
-        "src/core/lib/iomgr/tcp_server.h",
-        "src/core/lib/iomgr/tcp_server_posix.c",
-        "src/core/lib/iomgr/tcp_server_windows.c",
-        "src/core/lib/iomgr/tcp_windows.c",
-        "src/core/lib/iomgr/tcp_windows.h",
-        "src/core/lib/iomgr/time_averaged_stats.c",
-        "src/core/lib/iomgr/time_averaged_stats.h",
-        "src/core/lib/iomgr/timer.c",
-        "src/core/lib/iomgr/timer.h",
-        "src/core/lib/iomgr/timer_heap.c",
-        "src/core/lib/iomgr/timer_heap.h",
-        "src/core/lib/iomgr/udp_server.c",
-        "src/core/lib/iomgr/udp_server.h",
-        "src/core/lib/iomgr/unix_sockets_posix.c",
-        "src/core/lib/iomgr/unix_sockets_posix.h",
-        "src/core/lib/iomgr/unix_sockets_posix_noop.c",
-        "src/core/lib/iomgr/wakeup_fd_eventfd.c",
-        "src/core/lib/iomgr/wakeup_fd_nospecial.c",
-        "src/core/lib/iomgr/wakeup_fd_pipe.c",
-        "src/core/lib/iomgr/wakeup_fd_pipe.h",
-        "src/core/lib/iomgr/wakeup_fd_posix.c",
-        "src/core/lib/iomgr/wakeup_fd_posix.h",
-        "src/core/lib/iomgr/workqueue.h",
-        "src/core/lib/iomgr/workqueue_posix.c",
-        "src/core/lib/iomgr/workqueue_posix.h",
-        "src/core/lib/iomgr/workqueue_windows.c",
-        "src/core/lib/iomgr/workqueue_windows.h",
-        "src/core/lib/json/json.c",
-        "src/core/lib/json/json.h",
-        "src/core/lib/json/json_common.h",
-        "src/core/lib/json/json_reader.c",
-        "src/core/lib/json/json_reader.h",
-        "src/core/lib/json/json_string.c",
-        "src/core/lib/json/json_writer.c",
-        "src/core/lib/json/json_writer.h",
-        "src/core/lib/surface/alarm.c",
-        "src/core/lib/surface/api_trace.c",
-        "src/core/lib/surface/api_trace.h",
-        "src/core/lib/surface/byte_buffer.c",
-        "src/core/lib/surface/byte_buffer_reader.c",
-        "src/core/lib/surface/call.c",
-        "src/core/lib/surface/call.h",
-        "src/core/lib/surface/call_details.c",
-        "src/core/lib/surface/call_log_batch.c",
-        "src/core/lib/surface/call_test_only.h",
-        "src/core/lib/surface/channel.c",
-        "src/core/lib/surface/channel.h",
-        "src/core/lib/surface/channel_init.c",
-        "src/core/lib/surface/channel_init.h",
-        "src/core/lib/surface/channel_ping.c",
-        "src/core/lib/surface/channel_stack_type.c",
-        "src/core/lib/surface/channel_stack_type.h",
-        "src/core/lib/surface/completion_queue.c",
-        "src/core/lib/surface/completion_queue.h",
-        "src/core/lib/surface/event_string.c",
-        "src/core/lib/surface/event_string.h",
-        "src/core/lib/surface/init.c",
-        "src/core/lib/surface/init.h",
-        "src/core/lib/surface/init_unsecure.c",
-        "src/core/lib/surface/lame_client.c",
-        "src/core/lib/surface/lame_client.h",
-        "src/core/lib/surface/metadata_array.c",
-        "src/core/lib/surface/server.c",
-        "src/core/lib/surface/server.h",
-        "src/core/lib/surface/validate_metadata.c",
-        "src/core/lib/surface/version.c",
-        "src/core/lib/transport/byte_stream.c",
-        "src/core/lib/transport/byte_stream.h",
-        "src/core/lib/transport/connectivity_state.c",
-        "src/core/lib/transport/connectivity_state.h",
-        "src/core/lib/transport/metadata.c",
-        "src/core/lib/transport/metadata.h",
-        "src/core/lib/transport/metadata_batch.c",
-        "src/core/lib/transport/metadata_batch.h",
-        "src/core/lib/transport/static_metadata.c",
-        "src/core/lib/transport/static_metadata.h",
-        "src/core/lib/transport/transport.c",
-        "src/core/lib/transport/transport.h",
-        "src/core/lib/transport/transport_impl.h",
-        "src/core/lib/transport/transport_op_string.c",
-        "src/core/plugin_registry/grpc_unsecure_plugin_registry.c",
-        "third_party/nanopb/pb.h",
-        "third_party/nanopb/pb_decode.h",
-        "third_party/nanopb/pb_encode.h",
-    ],
-    hdrs = [
-        "include/grpc/byte_buffer.h",
-        "include/grpc/byte_buffer_reader.h",
-        "include/grpc/census.h",
-        "include/grpc/compression.h",
-        "include/grpc/grpc.h",
-        "include/grpc/grpc_posix.h",
-        "include/grpc/grpc_security_constants.h",
-        "include/grpc/impl/codegen/alloc.h",
-        "include/grpc/impl/codegen/atm.h",
-        "include/grpc/impl/codegen/atm_gcc_atomic.h",
-        "include/grpc/impl/codegen/atm_gcc_sync.h",
-        "include/grpc/impl/codegen/atm_windows.h",
-        "include/grpc/impl/codegen/byte_buffer.h",
-        "include/grpc/impl/codegen/byte_buffer_reader.h",
-        "include/grpc/impl/codegen/compression_types.h",
-        "include/grpc/impl/codegen/connectivity_state.h",
-        "include/grpc/impl/codegen/grpc_types.h",
-        "include/grpc/impl/codegen/log.h",
-        "include/grpc/impl/codegen/port_platform.h",
-        "include/grpc/impl/codegen/propagation_bits.h",
-        "include/grpc/impl/codegen/slice.h",
-        "include/grpc/impl/codegen/slice_buffer.h",
-        "include/grpc/impl/codegen/status.h",
-        "include/grpc/impl/codegen/sync.h",
-        "include/grpc/impl/codegen/sync_generic.h",
-        "include/grpc/impl/codegen/sync_posix.h",
-        "include/grpc/impl/codegen/sync_windows.h",
-        "include/grpc/impl/codegen/time.h",
-        "include/grpc/status.h",
-    ],
-    copts = [
-        "-std=gnu99",
-    ],
-    includes = [
-        ".",
-        "include",
-    ],
-    deps = [
-        ":gpr",
-        "//external:nanopb",
-        "//external:zlib",
-    ],
-)
-
-cc_library(
-    name = "grpc++",
-    srcs = [
-        "include/grpc++/impl/codegen/core_codegen.h",
-        "src/core/lib/channel/channel_args.c",
-        "src/core/lib/channel/channel_args.h",
-        "src/core/lib/channel/channel_stack.c",
-        "src/core/lib/channel/channel_stack.h",
-        "src/core/lib/channel/channel_stack_builder.c",
-        "src/core/lib/channel/channel_stack_builder.h",
-        "src/core/lib/channel/compress_filter.c",
-        "src/core/lib/channel/compress_filter.h",
-        "src/core/lib/channel/connected_channel.c",
-        "src/core/lib/channel/connected_channel.h",
-        "src/core/lib/channel/context.h",
-        "src/core/lib/channel/http_client_filter.c",
-        "src/core/lib/channel/http_client_filter.h",
-        "src/core/lib/channel/http_server_filter.c",
-        "src/core/lib/channel/http_server_filter.h",
-        "src/core/lib/compression/algorithm_metadata.h",
-        "src/core/lib/compression/compression.c",
-        "src/core/lib/compression/message_compress.c",
-        "src/core/lib/compression/message_compress.h",
-        "src/core/lib/debug/trace.c",
-        "src/core/lib/debug/trace.h",
-        "src/core/lib/http/format_request.c",
-        "src/core/lib/http/format_request.h",
-        "src/core/lib/http/httpcli.c",
-        "src/core/lib/http/httpcli.h",
-        "src/core/lib/http/parser.c",
-        "src/core/lib/http/parser.h",
-        "src/core/lib/iomgr/closure.c",
-        "src/core/lib/iomgr/closure.h",
-        "src/core/lib/iomgr/endpoint.c",
-        "src/core/lib/iomgr/endpoint.h",
-        "src/core/lib/iomgr/endpoint_pair.h",
-        "src/core/lib/iomgr/endpoint_pair_posix.c",
-        "src/core/lib/iomgr/endpoint_pair_windows.c",
-        "src/core/lib/iomgr/error.c",
-        "src/core/lib/iomgr/error.h",
-        "src/core/lib/iomgr/ev_epoll_linux.c",
-        "src/core/lib/iomgr/ev_epoll_linux.h",
-        "src/core/lib/iomgr/ev_poll_and_epoll_posix.c",
-        "src/core/lib/iomgr/ev_poll_and_epoll_posix.h",
-        "src/core/lib/iomgr/ev_poll_posix.c",
-        "src/core/lib/iomgr/ev_poll_posix.h",
-        "src/core/lib/iomgr/ev_posix.c",
-        "src/core/lib/iomgr/ev_posix.h",
-        "src/core/lib/iomgr/exec_ctx.c",
-        "src/core/lib/iomgr/exec_ctx.h",
-        "src/core/lib/iomgr/executor.c",
-        "src/core/lib/iomgr/executor.h",
-        "src/core/lib/iomgr/iocp_windows.c",
-        "src/core/lib/iomgr/iocp_windows.h",
-        "src/core/lib/iomgr/iomgr.c",
-        "src/core/lib/iomgr/iomgr.h",
-        "src/core/lib/iomgr/iomgr_internal.h",
-        "src/core/lib/iomgr/iomgr_posix.c",
-        "src/core/lib/iomgr/iomgr_posix.h",
-        "src/core/lib/iomgr/iomgr_windows.c",
-        "src/core/lib/iomgr/load_file.c",
-        "src/core/lib/iomgr/load_file.h",
-        "src/core/lib/iomgr/network_status_tracker.c",
-        "src/core/lib/iomgr/network_status_tracker.h",
-        "src/core/lib/iomgr/polling_entity.c",
-        "src/core/lib/iomgr/polling_entity.h",
-        "src/core/lib/iomgr/pollset.h",
-        "src/core/lib/iomgr/pollset_set.h",
-        "src/core/lib/iomgr/pollset_set_windows.c",
-        "src/core/lib/iomgr/pollset_set_windows.h",
-        "src/core/lib/iomgr/pollset_windows.c",
-        "src/core/lib/iomgr/pollset_windows.h",
-        "src/core/lib/iomgr/resolve_address.h",
-        "src/core/lib/iomgr/resolve_address_posix.c",
-        "src/core/lib/iomgr/resolve_address_windows.c",
-        "src/core/lib/iomgr/sockaddr.h",
-        "src/core/lib/iomgr/sockaddr_posix.h",
-        "src/core/lib/iomgr/sockaddr_utils.c",
-        "src/core/lib/iomgr/sockaddr_utils.h",
-        "src/core/lib/iomgr/sockaddr_windows.h",
-        "src/core/lib/iomgr/socket_utils_common_posix.c",
-        "src/core/lib/iomgr/socket_utils_linux.c",
-        "src/core/lib/iomgr/socket_utils_posix.c",
-        "src/core/lib/iomgr/socket_utils_posix.h",
-        "src/core/lib/iomgr/socket_windows.c",
-        "src/core/lib/iomgr/socket_windows.h",
-        "src/core/lib/iomgr/tcp_client.h",
-        "src/core/lib/iomgr/tcp_client_posix.c",
-        "src/core/lib/iomgr/tcp_client_windows.c",
-        "src/core/lib/iomgr/tcp_posix.c",
-        "src/core/lib/iomgr/tcp_posix.h",
-        "src/core/lib/iomgr/tcp_server.h",
-        "src/core/lib/iomgr/tcp_server_posix.c",
-        "src/core/lib/iomgr/tcp_server_windows.c",
-        "src/core/lib/iomgr/tcp_windows.c",
-        "src/core/lib/iomgr/tcp_windows.h",
-        "src/core/lib/iomgr/time_averaged_stats.c",
-        "src/core/lib/iomgr/time_averaged_stats.h",
-        "src/core/lib/iomgr/timer.c",
-        "src/core/lib/iomgr/timer.h",
-        "src/core/lib/iomgr/timer_heap.c",
-        "src/core/lib/iomgr/timer_heap.h",
-        "src/core/lib/iomgr/udp_server.c",
-        "src/core/lib/iomgr/udp_server.h",
-        "src/core/lib/iomgr/unix_sockets_posix.c",
-        "src/core/lib/iomgr/unix_sockets_posix.h",
-        "src/core/lib/iomgr/unix_sockets_posix_noop.c",
-        "src/core/lib/iomgr/wakeup_fd_eventfd.c",
-        "src/core/lib/iomgr/wakeup_fd_nospecial.c",
-        "src/core/lib/iomgr/wakeup_fd_pipe.c",
-        "src/core/lib/iomgr/wakeup_fd_pipe.h",
-        "src/core/lib/iomgr/wakeup_fd_posix.c",
-        "src/core/lib/iomgr/wakeup_fd_posix.h",
-        "src/core/lib/iomgr/workqueue.h",
-        "src/core/lib/iomgr/workqueue_posix.c",
-        "src/core/lib/iomgr/workqueue_posix.h",
-        "src/core/lib/iomgr/workqueue_windows.c",
-        "src/core/lib/iomgr/workqueue_windows.h",
-        "src/core/lib/json/json.c",
-        "src/core/lib/json/json.h",
-        "src/core/lib/json/json_common.h",
-        "src/core/lib/json/json_reader.c",
-        "src/core/lib/json/json_reader.h",
-        "src/core/lib/json/json_string.c",
-        "src/core/lib/json/json_writer.c",
-        "src/core/lib/json/json_writer.h",
-        "src/core/lib/surface/alarm.c",
-        "src/core/lib/surface/api_trace.c",
-        "src/core/lib/surface/api_trace.h",
-        "src/core/lib/surface/byte_buffer.c",
-        "src/core/lib/surface/byte_buffer_reader.c",
-        "src/core/lib/surface/call.c",
-        "src/core/lib/surface/call.h",
-        "src/core/lib/surface/call_details.c",
-        "src/core/lib/surface/call_log_batch.c",
-        "src/core/lib/surface/call_test_only.h",
-        "src/core/lib/surface/channel.c",
-        "src/core/lib/surface/channel.h",
-        "src/core/lib/surface/channel_init.c",
-        "src/core/lib/surface/channel_init.h",
-        "src/core/lib/surface/channel_ping.c",
-        "src/core/lib/surface/channel_stack_type.c",
-        "src/core/lib/surface/channel_stack_type.h",
-        "src/core/lib/surface/completion_queue.c",
-        "src/core/lib/surface/completion_queue.h",
-        "src/core/lib/surface/event_string.c",
-        "src/core/lib/surface/event_string.h",
-        "src/core/lib/surface/init.h",
-        "src/core/lib/surface/lame_client.c",
-        "src/core/lib/surface/lame_client.h",
-        "src/core/lib/surface/metadata_array.c",
-        "src/core/lib/surface/server.c",
-        "src/core/lib/surface/server.h",
-        "src/core/lib/surface/validate_metadata.c",
-        "src/core/lib/surface/version.c",
-        "src/core/lib/transport/byte_stream.c",
-        "src/core/lib/transport/byte_stream.h",
-        "src/core/lib/transport/connectivity_state.c",
-        "src/core/lib/transport/connectivity_state.h",
-        "src/core/lib/transport/metadata.c",
-        "src/core/lib/transport/metadata.h",
-        "src/core/lib/transport/metadata_batch.c",
-        "src/core/lib/transport/metadata_batch.h",
-        "src/core/lib/transport/static_metadata.c",
-        "src/core/lib/transport/static_metadata.h",
-        "src/core/lib/transport/transport.c",
-        "src/core/lib/transport/transport.h",
-        "src/core/lib/transport/transport_impl.h",
-        "src/core/lib/transport/transport_op_string.c",
-        "src/cpp/client/channel.cc",
-        "src/cpp/client/client_context.cc",
-        "src/cpp/client/create_channel.cc",
-        "src/cpp/client/create_channel_internal.cc",
-        "src/cpp/client/create_channel_internal.h",
-        "src/cpp/client/create_channel_posix.cc",
-        "src/cpp/client/credentials.cc",
-        "src/cpp/client/generic_stub.cc",
-        "src/cpp/client/insecure_credentials.cc",
-        "src/cpp/client/secure_credentials.cc",
-        "src/cpp/client/secure_credentials.h",
-        "src/cpp/codegen/codegen_init.cc",
-        "src/cpp/common/auth_property_iterator.cc",
-        "src/cpp/common/channel_arguments.cc",
-        "src/cpp/common/completion_queue.cc",
-        "src/cpp/common/core_codegen.cc",
-        "src/cpp/common/rpc_method.cc",
-        "src/cpp/common/secure_auth_context.cc",
-        "src/cpp/common/secure_auth_context.h",
-        "src/cpp/common/secure_channel_arguments.cc",
-        "src/cpp/common/secure_create_auth_context.cc",
-        "src/cpp/server/async_generic_service.cc",
-        "src/cpp/server/create_default_thread_pool.cc",
-        "src/cpp/server/dynamic_thread_pool.cc",
-        "src/cpp/server/dynamic_thread_pool.h",
-        "src/cpp/server/insecure_server_credentials.cc",
-        "src/cpp/server/secure_server_credentials.cc",
-        "src/cpp/server/secure_server_credentials.h",
-        "src/cpp/server/server.cc",
-        "src/cpp/server/server_builder.cc",
-        "src/cpp/server/server_context.cc",
-        "src/cpp/server/server_credentials.cc",
-        "src/cpp/server/server_posix.cc",
-        "src/cpp/server/thread_pool_interface.h",
-        "src/cpp/util/byte_buffer.cc",
-        "src/cpp/util/slice.cc",
-        "src/cpp/util/status.cc",
-        "src/cpp/util/string_ref.cc",
-        "src/cpp/util/time.cc",
-    ],
-    hdrs = [
-        "include/grpc++/alarm.h",
-        "include/grpc++/channel.h",
-        "include/grpc++/client_context.h",
-        "include/grpc++/completion_queue.h",
-        "include/grpc++/create_channel.h",
-        "include/grpc++/create_channel_posix.h",
-        "include/grpc++/generic/async_generic_service.h",
-        "include/grpc++/generic/generic_stub.h",
-        "include/grpc++/grpc++.h",
-        "include/grpc++/impl/call.h",
-        "include/grpc++/impl/client_unary_call.h",
-        "include/grpc++/impl/codegen/async_stream.h",
-        "include/grpc++/impl/codegen/async_unary_call.h",
-        "include/grpc++/impl/codegen/call.h",
-        "include/grpc++/impl/codegen/call_hook.h",
-        "include/grpc++/impl/codegen/channel_interface.h",
-        "include/grpc++/impl/codegen/client_context.h",
-        "include/grpc++/impl/codegen/client_unary_call.h",
-        "include/grpc++/impl/codegen/completion_queue.h",
-        "include/grpc++/impl/codegen/completion_queue_tag.h",
-        "include/grpc++/impl/codegen/config.h",
-        "include/grpc++/impl/codegen/core_codegen.h",
-        "include/grpc++/impl/codegen/core_codegen_interface.h",
-        "include/grpc++/impl/codegen/create_auth_context.h",
-        "include/grpc++/impl/codegen/grpc_library.h",
-        "include/grpc++/impl/codegen/method_handler_impl.h",
-        "include/grpc++/impl/codegen/proto_utils.h",
-        "include/grpc++/impl/codegen/rpc_method.h",
-        "include/grpc++/impl/codegen/rpc_service_method.h",
-        "include/grpc++/impl/codegen/security/auth_context.h",
-        "include/grpc++/impl/codegen/serialization_traits.h",
-        "include/grpc++/impl/codegen/server_context.h",
-        "include/grpc++/impl/codegen/server_interface.h",
-        "include/grpc++/impl/codegen/service_type.h",
-        "include/grpc++/impl/codegen/status.h",
-        "include/grpc++/impl/codegen/status_code_enum.h",
-        "include/grpc++/impl/codegen/string_ref.h",
-        "include/grpc++/impl/codegen/stub_options.h",
-        "include/grpc++/impl/codegen/sync.h",
-        "include/grpc++/impl/codegen/sync_cxx11.h",
-        "include/grpc++/impl/codegen/sync_no_cxx11.h",
-        "include/grpc++/impl/codegen/sync_stream.h",
-        "include/grpc++/impl/codegen/time.h",
-        "include/grpc++/impl/grpc_library.h",
-        "include/grpc++/impl/method_handler_impl.h",
-        "include/grpc++/impl/rpc_method.h",
-        "include/grpc++/impl/rpc_service_method.h",
-        "include/grpc++/impl/serialization_traits.h",
-        "include/grpc++/impl/server_builder_option.h",
-        "include/grpc++/impl/server_builder_plugin.h",
-        "include/grpc++/impl/server_initializer.h",
-        "include/grpc++/impl/service_type.h",
-        "include/grpc++/impl/sync.h",
-        "include/grpc++/impl/sync_cxx11.h",
-        "include/grpc++/impl/sync_no_cxx11.h",
-        "include/grpc++/impl/thd.h",
-        "include/grpc++/impl/thd_cxx11.h",
-        "include/grpc++/impl/thd_no_cxx11.h",
-        "include/grpc++/security/auth_context.h",
-        "include/grpc++/security/auth_metadata_processor.h",
-        "include/grpc++/security/credentials.h",
-        "include/grpc++/security/server_credentials.h",
-        "include/grpc++/server.h",
-        "include/grpc++/server_builder.h",
-        "include/grpc++/server_context.h",
-        "include/grpc++/server_posix.h",
-        "include/grpc++/support/async_stream.h",
-        "include/grpc++/support/async_unary_call.h",
-        "include/grpc++/support/byte_buffer.h",
-        "include/grpc++/support/channel_arguments.h",
-        "include/grpc++/support/config.h",
-        "include/grpc++/support/slice.h",
-        "include/grpc++/support/status.h",
-        "include/grpc++/support/status_code_enum.h",
-        "include/grpc++/support/string_ref.h",
-        "include/grpc++/support/stub_options.h",
-        "include/grpc++/support/sync_stream.h",
-        "include/grpc++/support/time.h",
-        "include/grpc/byte_buffer.h",
-        "include/grpc/byte_buffer_reader.h",
-        "include/grpc/compression.h",
-        "include/grpc/grpc.h",
-        "include/grpc/grpc_posix.h",
-        "include/grpc/grpc_security_constants.h",
-        "include/grpc/impl/codegen/alloc.h",
-        "include/grpc/impl/codegen/atm.h",
-        "include/grpc/impl/codegen/atm_gcc_atomic.h",
-        "include/grpc/impl/codegen/atm_gcc_sync.h",
-        "include/grpc/impl/codegen/atm_windows.h",
-        "include/grpc/impl/codegen/byte_buffer.h",
-        "include/grpc/impl/codegen/byte_buffer_reader.h",
-        "include/grpc/impl/codegen/compression_types.h",
-        "include/grpc/impl/codegen/connectivity_state.h",
-        "include/grpc/impl/codegen/grpc_types.h",
-        "include/grpc/impl/codegen/log.h",
-        "include/grpc/impl/codegen/port_platform.h",
-        "include/grpc/impl/codegen/propagation_bits.h",
-        "include/grpc/impl/codegen/slice.h",
-        "include/grpc/impl/codegen/slice_buffer.h",
-        "include/grpc/impl/codegen/status.h",
-        "include/grpc/impl/codegen/sync.h",
-        "include/grpc/impl/codegen/sync_generic.h",
-        "include/grpc/impl/codegen/sync_posix.h",
-        "include/grpc/impl/codegen/sync_windows.h",
-        "include/grpc/impl/codegen/time.h",
-        "include/grpc/status.h",
-    ],
-    copts = [
-        "-std=gnu99",
-    ],
-    includes = [
-        ".",
-        "include",
-    ],
-    deps = [
-        ":gpr",
-        ":grpc",
-        "//external:libssl",
-        "//external:protobuf_clib",
-    ],
-)
-
-cc_library(
-    name = "grpc++_reflection",
-    srcs = [
-        "src/cpp/ext/proto_server_reflection.cc",
-        "src/cpp/ext/proto_server_reflection.h",
-        "src/cpp/ext/proto_server_reflection_plugin.cc",
-        "src/cpp/ext/reflection.grpc.pb.cc",
-        "src/cpp/ext/reflection.pb.cc",
-    ],
-    hdrs = [
-        "include/grpc++/ext/proto_server_reflection_plugin.h",
-        "include/grpc++/ext/reflection.grpc.pb.h",
-        "include/grpc++/ext/reflection.pb.h",
-        "include/grpc++/impl/codegen/async_stream.h",
-        "include/grpc++/impl/codegen/async_unary_call.h",
-        "include/grpc++/impl/codegen/call.h",
-        "include/grpc++/impl/codegen/call_hook.h",
-        "include/grpc++/impl/codegen/channel_interface.h",
-        "include/grpc++/impl/codegen/client_context.h",
-        "include/grpc++/impl/codegen/client_unary_call.h",
-        "include/grpc++/impl/codegen/completion_queue.h",
-        "include/grpc++/impl/codegen/completion_queue_tag.h",
-        "include/grpc++/impl/codegen/config.h",
-        "include/grpc++/impl/codegen/config_protobuf.h",
-        "include/grpc++/impl/codegen/core_codegen_interface.h",
-        "include/grpc++/impl/codegen/create_auth_context.h",
-        "include/grpc++/impl/codegen/grpc_library.h",
-        "include/grpc++/impl/codegen/method_handler_impl.h",
-        "include/grpc++/impl/codegen/proto_utils.h",
-        "include/grpc++/impl/codegen/rpc_method.h",
-        "include/grpc++/impl/codegen/rpc_service_method.h",
-        "include/grpc++/impl/codegen/security/auth_context.h",
-        "include/grpc++/impl/codegen/serialization_traits.h",
-        "include/grpc++/impl/codegen/server_context.h",
-        "include/grpc++/impl/codegen/server_interface.h",
-        "include/grpc++/impl/codegen/service_type.h",
-        "include/grpc++/impl/codegen/status.h",
-        "include/grpc++/impl/codegen/status_code_enum.h",
-        "include/grpc++/impl/codegen/string_ref.h",
-        "include/grpc++/impl/codegen/stub_options.h",
-        "include/grpc++/impl/codegen/sync.h",
-        "include/grpc++/impl/codegen/sync_cxx11.h",
-        "include/grpc++/impl/codegen/sync_no_cxx11.h",
-        "include/grpc++/impl/codegen/sync_stream.h",
-        "include/grpc++/impl/codegen/time.h",
-        "include/grpc/impl/codegen/alloc.h",
-        "include/grpc/impl/codegen/atm.h",
-        "include/grpc/impl/codegen/atm_gcc_atomic.h",
-        "include/grpc/impl/codegen/atm_gcc_sync.h",
-        "include/grpc/impl/codegen/atm_windows.h",
-        "include/grpc/impl/codegen/byte_buffer.h",
-        "include/grpc/impl/codegen/byte_buffer_reader.h",
-        "include/grpc/impl/codegen/compression_types.h",
-        "include/grpc/impl/codegen/connectivity_state.h",
-        "include/grpc/impl/codegen/grpc_types.h",
-        "include/grpc/impl/codegen/log.h",
-        "include/grpc/impl/codegen/port_platform.h",
-        "include/grpc/impl/codegen/propagation_bits.h",
-        "include/grpc/impl/codegen/slice.h",
-        "include/grpc/impl/codegen/slice_buffer.h",
-        "include/grpc/impl/codegen/status.h",
-        "include/grpc/impl/codegen/sync.h",
-        "include/grpc/impl/codegen/sync_generic.h",
-        "include/grpc/impl/codegen/sync_posix.h",
-        "include/grpc/impl/codegen/sync_windows.h",
-        "include/grpc/impl/codegen/time.h",
-    ],
-    includes = [
-        ".",
-        "include",
-    ],
-    deps = [
-        ":grpc++",
-    ],
-)
-
-cc_library(
-    name = "grpc++_unsecure",
-    srcs = [
-        "src/cpp/client/channel.cc",
-        "src/cpp/client/client_context.cc",
-        "src/cpp/client/create_channel.cc",
-        "src/cpp/client/create_channel_internal.cc",
-        "src/cpp/client/create_channel_internal.h",
-        "src/cpp/client/create_channel_posix.cc",
-        "src/cpp/client/credentials.cc",
-        "src/cpp/client/generic_stub.cc",
-        "src/cpp/client/insecure_credentials.cc",
-        "src/cpp/codegen/codegen_init.cc",
-        "src/cpp/common/channel_arguments.cc",
-        "src/cpp/common/completion_queue.cc",
-        "src/cpp/common/core_codegen.cc",
-        "src/cpp/common/insecure_create_auth_context.cc",
-        "src/cpp/common/rpc_method.cc",
-        "src/cpp/server/async_generic_service.cc",
-        "src/cpp/server/create_default_thread_pool.cc",
-        "src/cpp/server/dynamic_thread_pool.cc",
-        "src/cpp/server/dynamic_thread_pool.h",
-        "src/cpp/server/insecure_server_credentials.cc",
-        "src/cpp/server/server.cc",
-        "src/cpp/server/server_builder.cc",
-        "src/cpp/server/server_context.cc",
-        "src/cpp/server/server_credentials.cc",
-        "src/cpp/server/server_posix.cc",
-        "src/cpp/server/thread_pool_interface.h",
-        "src/cpp/util/byte_buffer.cc",
-        "src/cpp/util/slice.cc",
-        "src/cpp/util/status.cc",
-        "src/cpp/util/string_ref.cc",
-        "src/cpp/util/time.cc",
-    ],
-    hdrs = [
-        "include/grpc++/alarm.h",
-        "include/grpc++/channel.h",
-        "include/grpc++/client_context.h",
-        "include/grpc++/completion_queue.h",
-        "include/grpc++/create_channel.h",
-        "include/grpc++/create_channel_posix.h",
-        "include/grpc++/generic/async_generic_service.h",
-        "include/grpc++/generic/generic_stub.h",
-        "include/grpc++/grpc++.h",
-        "include/grpc++/impl/call.h",
-        "include/grpc++/impl/client_unary_call.h",
-        "include/grpc++/impl/codegen/async_stream.h",
-        "include/grpc++/impl/codegen/async_unary_call.h",
-        "include/grpc++/impl/codegen/call.h",
-        "include/grpc++/impl/codegen/call_hook.h",
-        "include/grpc++/impl/codegen/channel_interface.h",
-        "include/grpc++/impl/codegen/client_context.h",
-        "include/grpc++/impl/codegen/client_unary_call.h",
-        "include/grpc++/impl/codegen/completion_queue.h",
-        "include/grpc++/impl/codegen/completion_queue_tag.h",
-        "include/grpc++/impl/codegen/config.h",
-        "include/grpc++/impl/codegen/config_protobuf.h",
-        "include/grpc++/impl/codegen/core_codegen.h",
-        "include/grpc++/impl/codegen/core_codegen_interface.h",
-        "include/grpc++/impl/codegen/create_auth_context.h",
-        "include/grpc++/impl/codegen/grpc_library.h",
-        "include/grpc++/impl/codegen/method_handler_impl.h",
-        "include/grpc++/impl/codegen/proto_utils.h",
-        "include/grpc++/impl/codegen/rpc_method.h",
-        "include/grpc++/impl/codegen/rpc_service_method.h",
-        "include/grpc++/impl/codegen/security/auth_context.h",
-        "include/grpc++/impl/codegen/serialization_traits.h",
-        "include/grpc++/impl/codegen/server_context.h",
-        "include/grpc++/impl/codegen/server_interface.h",
-        "include/grpc++/impl/codegen/service_type.h",
-        "include/grpc++/impl/codegen/status.h",
-        "include/grpc++/impl/codegen/status_code_enum.h",
-        "include/grpc++/impl/codegen/string_ref.h",
-        "include/grpc++/impl/codegen/stub_options.h",
-        "include/grpc++/impl/codegen/sync.h",
-        "include/grpc++/impl/codegen/sync_cxx11.h",
-        "include/grpc++/impl/codegen/sync_no_cxx11.h",
-        "include/grpc++/impl/codegen/sync_stream.h",
-        "include/grpc++/impl/codegen/time.h",
-        "include/grpc++/impl/grpc_library.h",
-        "include/grpc++/impl/method_handler_impl.h",
-        "include/grpc++/impl/rpc_method.h",
-        "include/grpc++/impl/rpc_service_method.h",
-        "include/grpc++/impl/serialization_traits.h",
-        "include/grpc++/impl/server_builder_option.h",
-        "include/grpc++/impl/server_builder_plugin.h",
-        "include/grpc++/impl/server_initializer.h",
-        "include/grpc++/impl/service_type.h",
-        "include/grpc++/impl/sync.h",
-        "include/grpc++/impl/sync_cxx11.h",
-        "include/grpc++/impl/sync_no_cxx11.h",
-        "include/grpc++/impl/thd.h",
-        "include/grpc++/impl/thd_cxx11.h",
-        "include/grpc++/impl/thd_no_cxx11.h",
-        "include/grpc++/security/auth_context.h",
-        "include/grpc++/security/auth_metadata_processor.h",
-        "include/grpc++/security/credentials.h",
-        "include/grpc++/security/server_credentials.h",
-        "include/grpc++/server.h",
-        "include/grpc++/server_builder.h",
-        "include/grpc++/server_context.h",
-        "include/grpc++/server_posix.h",
-        "include/grpc++/support/async_stream.h",
-        "include/grpc++/support/async_unary_call.h",
-        "include/grpc++/support/byte_buffer.h",
-        "include/grpc++/support/channel_arguments.h",
-        "include/grpc++/support/config.h",
-        "include/grpc++/support/slice.h",
-        "include/grpc++/support/status.h",
-        "include/grpc++/support/status_code_enum.h",
-        "include/grpc++/support/string_ref.h",
-        "include/grpc++/support/stub_options.h",
-        "include/grpc++/support/sync_stream.h",
-        "include/grpc++/support/time.h",
-    ],
-    includes = [
-        ".",
-        "include",
-    ],
-    linkopts = ["-lpthread"],
-    deps = [
-        ":gpr",
-        ":grpc_unsecure",
-        "//external:protobuf_clib",
-    ],
-)
-
-cc_library(
-    name = "grpc_plugin_support",
-    srcs = [
-        "src/compiler/config.h",
-        "src/compiler/cpp_generator.cc",
-        "src/compiler/cpp_generator.h",
-        "src/compiler/cpp_generator_helpers.h",
-        "src/compiler/csharp_generator.cc",
-        "src/compiler/csharp_generator.h",
-        "src/compiler/csharp_generator_helpers.h",
-        "src/compiler/generator_helpers.h",
-        "src/compiler/node_generator.cc",
-        "src/compiler/node_generator.h",
-        "src/compiler/node_generator_helpers.h",
-        "src/compiler/objective_c_generator.cc",
-        "src/compiler/objective_c_generator.h",
-        "src/compiler/objective_c_generator_helpers.h",
-        "src/compiler/python_generator.cc",
-        "src/compiler/python_generator.h",
-        "src/compiler/ruby_generator.cc",
-        "src/compiler/ruby_generator.h",
-        "src/compiler/ruby_generator_helpers-inl.h",
-        "src/compiler/ruby_generator_map-inl.h",
-        "src/compiler/ruby_generator_string-inl.h",
-    ],
-    hdrs = [
-        "include/grpc++/impl/codegen/config_protobuf.h",
-    ],
-    includes = [
-        ".",
-        "include",
-    ],
-    deps = [
-        "//external:protobuf_compiler",
-    ],
-)
-
-cc_library(
-    name = "grpc_csharp_ext",
-    srcs = [
-        "src/csharp/ext/grpc_csharp_ext.c",
-    ],
-    hdrs = [
-    ],
-    includes = [
-        ".",
-        "include",
-    ],
-    deps = [
-        ":gpr",
-        ":grpc",
-    ],
-)
-
-objc_library(
-    name = "gpr_objc",
-    srcs = [
-        "src/core/lib/profiling/basic_timers.c",
-        "src/core/lib/profiling/stap_timers.c",
-        "src/core/lib/support/alloc.c",
-        "src/core/lib/support/avl.c",
-        "src/core/lib/support/backoff.c",
-        "src/core/lib/support/cmdline.c",
-        "src/core/lib/support/cpu_iphone.c",
-        "src/core/lib/support/cpu_linux.c",
-        "src/core/lib/support/cpu_posix.c",
-        "src/core/lib/support/cpu_windows.c",
-        "src/core/lib/support/env_linux.c",
-        "src/core/lib/support/env_posix.c",
-        "src/core/lib/support/env_windows.c",
-        "src/core/lib/support/histogram.c",
-        "src/core/lib/support/host_port.c",
-        "src/core/lib/support/log.c",
-        "src/core/lib/support/log_android.c",
-        "src/core/lib/support/log_linux.c",
-        "src/core/lib/support/log_posix.c",
-        "src/core/lib/support/log_windows.c",
-        "src/core/lib/support/murmur_hash.c",
-        "src/core/lib/support/slice.c",
-        "src/core/lib/support/slice_buffer.c",
-        "src/core/lib/support/stack_lockfree.c",
-        "src/core/lib/support/string.c",
-        "src/core/lib/support/string_posix.c",
-        "src/core/lib/support/string_util_windows.c",
-        "src/core/lib/support/string_windows.c",
-        "src/core/lib/support/subprocess_posix.c",
-        "src/core/lib/support/subprocess_windows.c",
-        "src/core/lib/support/sync.c",
-        "src/core/lib/support/sync_posix.c",
-        "src/core/lib/support/sync_windows.c",
-        "src/core/lib/support/thd.c",
-        "src/core/lib/support/thd_posix.c",
-        "src/core/lib/support/thd_windows.c",
-        "src/core/lib/support/time.c",
-        "src/core/lib/support/time_posix.c",
-        "src/core/lib/support/time_precise.c",
-        "src/core/lib/support/time_windows.c",
-        "src/core/lib/support/tls_pthread.c",
-        "src/core/lib/support/tmpfile_msys.c",
-        "src/core/lib/support/tmpfile_posix.c",
-        "src/core/lib/support/tmpfile_windows.c",
-        "src/core/lib/support/wrap_memcpy.c",
-    ],
-    hdrs = [
-        "include/grpc/impl/codegen/alloc.h",
-        "include/grpc/impl/codegen/atm.h",
-        "include/grpc/impl/codegen/atm_gcc_atomic.h",
-        "include/grpc/impl/codegen/atm_gcc_sync.h",
-        "include/grpc/impl/codegen/atm_windows.h",
-        "include/grpc/impl/codegen/log.h",
-        "include/grpc/impl/codegen/port_platform.h",
-        "include/grpc/impl/codegen/slice.h",
-        "include/grpc/impl/codegen/slice_buffer.h",
-        "include/grpc/impl/codegen/sync.h",
-        "include/grpc/impl/codegen/sync_generic.h",
-        "include/grpc/impl/codegen/sync_posix.h",
-        "include/grpc/impl/codegen/sync_windows.h",
-        "include/grpc/impl/codegen/time.h",
-        "include/grpc/support/alloc.h",
-        "include/grpc/support/atm.h",
-        "include/grpc/support/atm_gcc_atomic.h",
-        "include/grpc/support/atm_gcc_sync.h",
-        "include/grpc/support/atm_windows.h",
-        "include/grpc/support/avl.h",
-        "include/grpc/support/cmdline.h",
-        "include/grpc/support/cpu.h",
-        "include/grpc/support/histogram.h",
-        "include/grpc/support/host_port.h",
-        "include/grpc/support/log.h",
-        "include/grpc/support/log_windows.h",
-        "include/grpc/support/port_platform.h",
-        "include/grpc/support/slice.h",
-        "include/grpc/support/slice_buffer.h",
-        "include/grpc/support/string_util.h",
-        "include/grpc/support/subprocess.h",
-        "include/grpc/support/sync.h",
-        "include/grpc/support/sync_generic.h",
-        "include/grpc/support/sync_posix.h",
-        "include/grpc/support/sync_windows.h",
-        "include/grpc/support/thd.h",
-        "include/grpc/support/time.h",
-        "include/grpc/support/tls.h",
-        "include/grpc/support/tls_gcc.h",
-        "include/grpc/support/tls_msvc.h",
-        "include/grpc/support/tls_pthread.h",
-        "include/grpc/support/useful.h",
-        "src/core/lib/profiling/timers.h",
-        "src/core/lib/support/backoff.h",
-        "src/core/lib/support/block_annotate.h",
-        "src/core/lib/support/env.h",
-        "src/core/lib/support/murmur_hash.h",
-        "src/core/lib/support/stack_lockfree.h",
-        "src/core/lib/support/string.h",
-        "src/core/lib/support/string_windows.h",
-        "src/core/lib/support/thd_internal.h",
-        "src/core/lib/support/time_precise.h",
-        "src/core/lib/support/tmpfile.h",
-    ],
-    includes = [
-        ".",
-        "include",
-    ],
-    deps = [
-    ],
-)
-
-objc_library(
-    name = "grpc_objc",
-    srcs = [
-        "src/core/ext/census/context.c",
-        "src/core/ext/census/gen/census.pb.c",
-        "src/core/ext/census/grpc_context.c",
-        "src/core/ext/census/grpc_filter.c",
-        "src/core/ext/census/grpc_plugin.c",
-        "src/core/ext/census/initialize.c",
-        "src/core/ext/census/mlog.c",
-        "src/core/ext/census/operation.c",
-        "src/core/ext/census/placeholders.c",
-        "src/core/ext/census/tracing.c",
-        "src/core/ext/client_config/channel_connectivity.c",
-        "src/core/ext/client_config/client_channel.c",
-        "src/core/ext/client_config/client_channel_factory.c",
-        "src/core/ext/client_config/client_config.c",
-        "src/core/ext/client_config/client_config_plugin.c",
-        "src/core/ext/client_config/connector.c",
-        "src/core/ext/client_config/default_initial_connect_string.c",
-        "src/core/ext/client_config/initial_connect_string.c",
-        "src/core/ext/client_config/lb_policy.c",
-        "src/core/ext/client_config/lb_policy_factory.c",
-        "src/core/ext/client_config/lb_policy_registry.c",
-        "src/core/ext/client_config/parse_address.c",
-        "src/core/ext/client_config/resolver.c",
-        "src/core/ext/client_config/resolver_factory.c",
-        "src/core/ext/client_config/resolver_registry.c",
-        "src/core/ext/client_config/subchannel.c",
-        "src/core/ext/client_config/subchannel_call_holder.c",
-        "src/core/ext/client_config/subchannel_index.c",
-        "src/core/ext/client_config/uri_parser.c",
-        "src/core/ext/lb_policy/grpclb/load_balancer_api.c",
-        "src/core/ext/lb_policy/grpclb/proto/grpc/lb/v1/load_balancer.pb.c",
-        "src/core/ext/lb_policy/pick_first/pick_first.c",
-        "src/core/ext/lb_policy/round_robin/round_robin.c",
-        "src/core/ext/load_reporting/load_reporting.c",
-        "src/core/ext/load_reporting/load_reporting_filter.c",
-        "src/core/ext/resolver/dns/native/dns_resolver.c",
-        "src/core/ext/resolver/sockaddr/sockaddr_resolver.c",
-        "src/core/ext/transport/chttp2/alpn/alpn.c",
-        "src/core/ext/transport/chttp2/client/insecure/channel_create.c",
-        "src/core/ext/transport/chttp2/client/insecure/channel_create_posix.c",
-        "src/core/ext/transport/chttp2/client/secure/secure_channel_create.c",
-        "src/core/ext/transport/chttp2/server/insecure/server_chttp2.c",
-        "src/core/ext/transport/chttp2/server/insecure/server_chttp2_posix.c",
-        "src/core/ext/transport/chttp2/server/secure/server_secure_chttp2.c",
-        "src/core/ext/transport/chttp2/transport/bin_decoder.c",
-        "src/core/ext/transport/chttp2/transport/bin_encoder.c",
-        "src/core/ext/transport/chttp2/transport/chttp2_plugin.c",
-        "src/core/ext/transport/chttp2/transport/chttp2_transport.c",
-        "src/core/ext/transport/chttp2/transport/frame_data.c",
-        "src/core/ext/transport/chttp2/transport/frame_goaway.c",
-        "src/core/ext/transport/chttp2/transport/frame_ping.c",
-        "src/core/ext/transport/chttp2/transport/frame_rst_stream.c",
-        "src/core/ext/transport/chttp2/transport/frame_settings.c",
-        "src/core/ext/transport/chttp2/transport/frame_window_update.c",
-        "src/core/ext/transport/chttp2/transport/hpack_encoder.c",
-        "src/core/ext/transport/chttp2/transport/hpack_parser.c",
-        "src/core/ext/transport/chttp2/transport/hpack_table.c",
-        "src/core/ext/transport/chttp2/transport/huffsyms.c",
-        "src/core/ext/transport/chttp2/transport/incoming_metadata.c",
-        "src/core/ext/transport/chttp2/transport/parsing.c",
-        "src/core/ext/transport/chttp2/transport/status_conversion.c",
-        "src/core/ext/transport/chttp2/transport/stream_lists.c",
-        "src/core/ext/transport/chttp2/transport/stream_map.c",
-        "src/core/ext/transport/chttp2/transport/timeout_encoding.c",
-        "src/core/ext/transport/chttp2/transport/varint.c",
-        "src/core/ext/transport/chttp2/transport/writing.c",
-        "src/core/lib/channel/channel_args.c",
-        "src/core/lib/channel/channel_stack.c",
-        "src/core/lib/channel/channel_stack_builder.c",
-        "src/core/lib/channel/compress_filter.c",
-        "src/core/lib/channel/connected_channel.c",
-        "src/core/lib/channel/http_client_filter.c",
-        "src/core/lib/channel/http_server_filter.c",
-        "src/core/lib/compression/compression.c",
-        "src/core/lib/compression/message_compress.c",
-        "src/core/lib/debug/trace.c",
-        "src/core/lib/http/format_request.c",
-        "src/core/lib/http/httpcli.c",
-        "src/core/lib/http/httpcli_security_connector.c",
-        "src/core/lib/http/parser.c",
-        "src/core/lib/iomgr/closure.c",
-        "src/core/lib/iomgr/endpoint.c",
-        "src/core/lib/iomgr/endpoint_pair_posix.c",
-        "src/core/lib/iomgr/endpoint_pair_windows.c",
-        "src/core/lib/iomgr/error.c",
-        "src/core/lib/iomgr/ev_epoll_linux.c",
-        "src/core/lib/iomgr/ev_poll_and_epoll_posix.c",
-        "src/core/lib/iomgr/ev_poll_posix.c",
-        "src/core/lib/iomgr/ev_posix.c",
-        "src/core/lib/iomgr/exec_ctx.c",
-        "src/core/lib/iomgr/executor.c",
-        "src/core/lib/iomgr/iocp_windows.c",
-        "src/core/lib/iomgr/iomgr.c",
-        "src/core/lib/iomgr/iomgr_posix.c",
-        "src/core/lib/iomgr/iomgr_windows.c",
-        "src/core/lib/iomgr/load_file.c",
-        "src/core/lib/iomgr/network_status_tracker.c",
-        "src/core/lib/iomgr/polling_entity.c",
-        "src/core/lib/iomgr/pollset_set_windows.c",
-        "src/core/lib/iomgr/pollset_windows.c",
-        "src/core/lib/iomgr/resolve_address_posix.c",
-        "src/core/lib/iomgr/resolve_address_windows.c",
-        "src/core/lib/iomgr/sockaddr_utils.c",
-        "src/core/lib/iomgr/socket_utils_common_posix.c",
-        "src/core/lib/iomgr/socket_utils_linux.c",
-        "src/core/lib/iomgr/socket_utils_posix.c",
-        "src/core/lib/iomgr/socket_windows.c",
-        "src/core/lib/iomgr/tcp_client_posix.c",
-        "src/core/lib/iomgr/tcp_client_windows.c",
-        "src/core/lib/iomgr/tcp_posix.c",
-        "src/core/lib/iomgr/tcp_server_posix.c",
-        "src/core/lib/iomgr/tcp_server_windows.c",
-        "src/core/lib/iomgr/tcp_windows.c",
-        "src/core/lib/iomgr/time_averaged_stats.c",
-        "src/core/lib/iomgr/timer.c",
-        "src/core/lib/iomgr/timer_heap.c",
-        "src/core/lib/iomgr/udp_server.c",
-        "src/core/lib/iomgr/unix_sockets_posix.c",
-        "src/core/lib/iomgr/unix_sockets_posix_noop.c",
-        "src/core/lib/iomgr/wakeup_fd_eventfd.c",
-        "src/core/lib/iomgr/wakeup_fd_nospecial.c",
-        "src/core/lib/iomgr/wakeup_fd_pipe.c",
-        "src/core/lib/iomgr/wakeup_fd_posix.c",
-        "src/core/lib/iomgr/workqueue_posix.c",
-        "src/core/lib/iomgr/workqueue_windows.c",
-        "src/core/lib/json/json.c",
-        "src/core/lib/json/json_reader.c",
-        "src/core/lib/json/json_string.c",
-        "src/core/lib/json/json_writer.c",
-        "src/core/lib/security/context/security_context.c",
-        "src/core/lib/security/credentials/composite/composite_credentials.c",
-        "src/core/lib/security/credentials/credentials.c",
-        "src/core/lib/security/credentials/credentials_metadata.c",
-        "src/core/lib/security/credentials/fake/fake_credentials.c",
-        "src/core/lib/security/credentials/google_default/credentials_posix.c",
-        "src/core/lib/security/credentials/google_default/credentials_windows.c",
-        "src/core/lib/security/credentials/google_default/google_default_credentials.c",
-        "src/core/lib/security/credentials/iam/iam_credentials.c",
-        "src/core/lib/security/credentials/jwt/json_token.c",
-        "src/core/lib/security/credentials/jwt/jwt_credentials.c",
-        "src/core/lib/security/credentials/jwt/jwt_verifier.c",
-        "src/core/lib/security/credentials/oauth2/oauth2_credentials.c",
-        "src/core/lib/security/credentials/plugin/plugin_credentials.c",
-        "src/core/lib/security/credentials/ssl/ssl_credentials.c",
-        "src/core/lib/security/transport/client_auth_filter.c",
-        "src/core/lib/security/transport/handshake.c",
-        "src/core/lib/security/transport/secure_endpoint.c",
-        "src/core/lib/security/transport/security_connector.c",
-        "src/core/lib/security/transport/server_auth_filter.c",
-        "src/core/lib/security/transport/tsi_error.c",
-        "src/core/lib/security/util/b64.c",
-        "src/core/lib/security/util/json_util.c",
-        "src/core/lib/surface/alarm.c",
-        "src/core/lib/surface/api_trace.c",
-        "src/core/lib/surface/byte_buffer.c",
-        "src/core/lib/surface/byte_buffer_reader.c",
-        "src/core/lib/surface/call.c",
-        "src/core/lib/surface/call_details.c",
-        "src/core/lib/surface/call_log_batch.c",
-        "src/core/lib/surface/channel.c",
-        "src/core/lib/surface/channel_init.c",
-        "src/core/lib/surface/channel_ping.c",
-        "src/core/lib/surface/channel_stack_type.c",
-        "src/core/lib/surface/completion_queue.c",
-        "src/core/lib/surface/event_string.c",
-        "src/core/lib/surface/init.c",
-        "src/core/lib/surface/init_secure.c",
-        "src/core/lib/surface/lame_client.c",
-        "src/core/lib/surface/metadata_array.c",
-        "src/core/lib/surface/server.c",
-        "src/core/lib/surface/validate_metadata.c",
-        "src/core/lib/surface/version.c",
-        "src/core/lib/transport/byte_stream.c",
-        "src/core/lib/transport/connectivity_state.c",
-        "src/core/lib/transport/metadata.c",
-        "src/core/lib/transport/metadata_batch.c",
-        "src/core/lib/transport/static_metadata.c",
-        "src/core/lib/transport/transport.c",
-        "src/core/lib/transport/transport_op_string.c",
-        "src/core/lib/tsi/fake_transport_security.c",
-        "src/core/lib/tsi/ssl_transport_security.c",
-        "src/core/lib/tsi/transport_security.c",
-        "src/core/plugin_registry/grpc_plugin_registry.c",
-    ],
-    hdrs = [
-        "include/grpc/byte_buffer.h",
-        "include/grpc/byte_buffer_reader.h",
-        "include/grpc/census.h",
-        "include/grpc/compression.h",
-        "include/grpc/grpc.h",
-        "include/grpc/grpc_posix.h",
-        "include/grpc/grpc_security.h",
-        "include/grpc/grpc_security_constants.h",
-        "include/grpc/impl/codegen/alloc.h",
-        "include/grpc/impl/codegen/atm.h",
-        "include/grpc/impl/codegen/atm_gcc_atomic.h",
-        "include/grpc/impl/codegen/atm_gcc_sync.h",
-        "include/grpc/impl/codegen/atm_windows.h",
-        "include/grpc/impl/codegen/byte_buffer.h",
-        "include/grpc/impl/codegen/byte_buffer_reader.h",
-        "include/grpc/impl/codegen/compression_types.h",
-        "include/grpc/impl/codegen/connectivity_state.h",
-        "include/grpc/impl/codegen/grpc_types.h",
-        "include/grpc/impl/codegen/log.h",
-        "include/grpc/impl/codegen/port_platform.h",
-        "include/grpc/impl/codegen/propagation_bits.h",
-        "include/grpc/impl/codegen/slice.h",
-        "include/grpc/impl/codegen/slice_buffer.h",
-        "include/grpc/impl/codegen/status.h",
-        "include/grpc/impl/codegen/sync.h",
-        "include/grpc/impl/codegen/sync_generic.h",
-        "include/grpc/impl/codegen/sync_posix.h",
-        "include/grpc/impl/codegen/sync_windows.h",
-        "include/grpc/impl/codegen/time.h",
-        "include/grpc/status.h",
-        "src/core/ext/census/aggregation.h",
-        "src/core/ext/census/census_interface.h",
-        "src/core/ext/census/census_rpc_stats.h",
-        "src/core/ext/census/gen/census.pb.h",
-        "src/core/ext/census/grpc_filter.h",
-        "src/core/ext/census/mlog.h",
-        "src/core/ext/census/rpc_metric_id.h",
-        "src/core/ext/client_config/client_channel.h",
-        "src/core/ext/client_config/client_channel_factory.h",
-        "src/core/ext/client_config/client_config.h",
-        "src/core/ext/client_config/connector.h",
-        "src/core/ext/client_config/initial_connect_string.h",
-        "src/core/ext/client_config/lb_policy.h",
-        "src/core/ext/client_config/lb_policy_factory.h",
-        "src/core/ext/client_config/lb_policy_registry.h",
-        "src/core/ext/client_config/parse_address.h",
-        "src/core/ext/client_config/resolver.h",
-        "src/core/ext/client_config/resolver_factory.h",
-        "src/core/ext/client_config/resolver_registry.h",
-        "src/core/ext/client_config/subchannel.h",
-        "src/core/ext/client_config/subchannel_call_holder.h",
-        "src/core/ext/client_config/subchannel_index.h",
-        "src/core/ext/client_config/uri_parser.h",
-        "src/core/ext/lb_policy/grpclb/load_balancer_api.h",
-        "src/core/ext/lb_policy/grpclb/proto/grpc/lb/v1/load_balancer.pb.h",
-        "src/core/ext/load_reporting/load_reporting.h",
-        "src/core/ext/load_reporting/load_reporting_filter.h",
-        "src/core/ext/transport/chttp2/alpn/alpn.h",
-        "src/core/ext/transport/chttp2/transport/bin_decoder.h",
-        "src/core/ext/transport/chttp2/transport/bin_encoder.h",
-        "src/core/ext/transport/chttp2/transport/chttp2_transport.h",
-        "src/core/ext/transport/chttp2/transport/frame.h",
-        "src/core/ext/transport/chttp2/transport/frame_data.h",
-        "src/core/ext/transport/chttp2/transport/frame_goaway.h",
-        "src/core/ext/transport/chttp2/transport/frame_ping.h",
-        "src/core/ext/transport/chttp2/transport/frame_rst_stream.h",
-        "src/core/ext/transport/chttp2/transport/frame_settings.h",
-        "src/core/ext/transport/chttp2/transport/frame_window_update.h",
-        "src/core/ext/transport/chttp2/transport/hpack_encoder.h",
-        "src/core/ext/transport/chttp2/transport/hpack_parser.h",
-        "src/core/ext/transport/chttp2/transport/hpack_table.h",
-        "src/core/ext/transport/chttp2/transport/http2_errors.h",
-        "src/core/ext/transport/chttp2/transport/huffsyms.h",
-        "src/core/ext/transport/chttp2/transport/incoming_metadata.h",
-        "src/core/ext/transport/chttp2/transport/internal.h",
-        "src/core/ext/transport/chttp2/transport/status_conversion.h",
-        "src/core/ext/transport/chttp2/transport/stream_map.h",
-        "src/core/ext/transport/chttp2/transport/timeout_encoding.h",
-        "src/core/ext/transport/chttp2/transport/varint.h",
-        "src/core/lib/channel/channel_args.h",
-        "src/core/lib/channel/channel_stack.h",
-        "src/core/lib/channel/channel_stack_builder.h",
-        "src/core/lib/channel/compress_filter.h",
-        "src/core/lib/channel/connected_channel.h",
-        "src/core/lib/channel/context.h",
-        "src/core/lib/channel/http_client_filter.h",
-        "src/core/lib/channel/http_server_filter.h",
-        "src/core/lib/compression/algorithm_metadata.h",
-        "src/core/lib/compression/message_compress.h",
-        "src/core/lib/debug/trace.h",
-        "src/core/lib/http/format_request.h",
-        "src/core/lib/http/httpcli.h",
-        "src/core/lib/http/parser.h",
-        "src/core/lib/iomgr/closure.h",
-        "src/core/lib/iomgr/endpoint.h",
-        "src/core/lib/iomgr/endpoint_pair.h",
-        "src/core/lib/iomgr/error.h",
-        "src/core/lib/iomgr/ev_epoll_linux.h",
-        "src/core/lib/iomgr/ev_poll_and_epoll_posix.h",
-        "src/core/lib/iomgr/ev_poll_posix.h",
-        "src/core/lib/iomgr/ev_posix.h",
-        "src/core/lib/iomgr/exec_ctx.h",
-        "src/core/lib/iomgr/executor.h",
-        "src/core/lib/iomgr/iocp_windows.h",
-        "src/core/lib/iomgr/iomgr.h",
-        "src/core/lib/iomgr/iomgr_internal.h",
-        "src/core/lib/iomgr/iomgr_posix.h",
-        "src/core/lib/iomgr/load_file.h",
-        "src/core/lib/iomgr/network_status_tracker.h",
-        "src/core/lib/iomgr/polling_entity.h",
-        "src/core/lib/iomgr/pollset.h",
-        "src/core/lib/iomgr/pollset_set.h",
-        "src/core/lib/iomgr/pollset_set_windows.h",
-        "src/core/lib/iomgr/pollset_windows.h",
-        "src/core/lib/iomgr/resolve_address.h",
-        "src/core/lib/iomgr/sockaddr.h",
-        "src/core/lib/iomgr/sockaddr_posix.h",
-        "src/core/lib/iomgr/sockaddr_utils.h",
-        "src/core/lib/iomgr/sockaddr_windows.h",
-        "src/core/lib/iomgr/socket_utils_posix.h",
-        "src/core/lib/iomgr/socket_windows.h",
-        "src/core/lib/iomgr/tcp_client.h",
-        "src/core/lib/iomgr/tcp_posix.h",
-        "src/core/lib/iomgr/tcp_server.h",
-        "src/core/lib/iomgr/tcp_windows.h",
-        "src/core/lib/iomgr/time_averaged_stats.h",
-        "src/core/lib/iomgr/timer.h",
-        "src/core/lib/iomgr/timer_heap.h",
-        "src/core/lib/iomgr/udp_server.h",
-        "src/core/lib/iomgr/unix_sockets_posix.h",
-        "src/core/lib/iomgr/wakeup_fd_pipe.h",
-        "src/core/lib/iomgr/wakeup_fd_posix.h",
-        "src/core/lib/iomgr/workqueue.h",
-        "src/core/lib/iomgr/workqueue_posix.h",
-        "src/core/lib/iomgr/workqueue_windows.h",
-        "src/core/lib/json/json.h",
-        "src/core/lib/json/json_common.h",
-        "src/core/lib/json/json_reader.h",
-        "src/core/lib/json/json_writer.h",
-        "src/core/lib/security/context/security_context.h",
-        "src/core/lib/security/credentials/composite/composite_credentials.h",
-        "src/core/lib/security/credentials/credentials.h",
-        "src/core/lib/security/credentials/fake/fake_credentials.h",
-        "src/core/lib/security/credentials/google_default/google_default_credentials.h",
-        "src/core/lib/security/credentials/iam/iam_credentials.h",
-        "src/core/lib/security/credentials/jwt/json_token.h",
-        "src/core/lib/security/credentials/jwt/jwt_credentials.h",
-        "src/core/lib/security/credentials/jwt/jwt_verifier.h",
-        "src/core/lib/security/credentials/oauth2/oauth2_credentials.h",
-        "src/core/lib/security/credentials/plugin/plugin_credentials.h",
-        "src/core/lib/security/credentials/ssl/ssl_credentials.h",
-        "src/core/lib/security/transport/auth_filters.h",
-        "src/core/lib/security/transport/handshake.h",
-        "src/core/lib/security/transport/secure_endpoint.h",
-        "src/core/lib/security/transport/security_connector.h",
-        "src/core/lib/security/transport/tsi_error.h",
-        "src/core/lib/security/util/b64.h",
-        "src/core/lib/security/util/json_util.h",
-        "src/core/lib/surface/api_trace.h",
-        "src/core/lib/surface/call.h",
-        "src/core/lib/surface/call_test_only.h",
-        "src/core/lib/surface/channel.h",
-        "src/core/lib/surface/channel_init.h",
-        "src/core/lib/surface/channel_stack_type.h",
-        "src/core/lib/surface/completion_queue.h",
-        "src/core/lib/surface/event_string.h",
-        "src/core/lib/surface/init.h",
-        "src/core/lib/surface/lame_client.h",
-        "src/core/lib/surface/server.h",
-        "src/core/lib/transport/byte_stream.h",
-        "src/core/lib/transport/connectivity_state.h",
-        "src/core/lib/transport/metadata.h",
-        "src/core/lib/transport/metadata_batch.h",
-        "src/core/lib/transport/static_metadata.h",
-        "src/core/lib/transport/transport.h",
-        "src/core/lib/transport/transport_impl.h",
-        "src/core/lib/tsi/fake_transport_security.h",
-        "src/core/lib/tsi/ssl_transport_security.h",
-        "src/core/lib/tsi/ssl_types.h",
-        "src/core/lib/tsi/transport_security.h",
-        "src/core/lib/tsi/transport_security_interface.h",
-        "third_party/nanopb/pb.h",
-        "third_party/nanopb/pb_decode.h",
-        "third_party/nanopb/pb_encode.h",
-    ],
-    includes = [
-        ".",
-        "include",
-    ],
-    sdk_dylibs = ["libz"],
-    deps = [
-        ":gpr_objc",
-        "//external:libssl_objc",
-        "//external:nanopb",
-    ],
-)
-
-cc_binary(
-    name = "grpc_cpp_plugin",
-    srcs = [
-        "src/compiler/cpp_plugin.cc",
-    ],
-    deps = [
-        ":grpc_plugin_support",
-        "//external:protobuf_compiler",
-    ],
-)
-
-cc_binary(
-    name = "grpc_csharp_plugin",
-    srcs = [
-        "src/compiler/csharp_plugin.cc",
-    ],
-    deps = [
-        ":grpc_plugin_support",
-        "//external:protobuf_compiler",
-    ],
-)
-
-cc_binary(
-    name = "grpc_node_plugin",
-    srcs = [
-        "src/compiler/node_plugin.cc",
-    ],
-    deps = [
-        ":grpc_plugin_support",
-        "//external:protobuf_compiler",
-    ],
-)
-
-cc_binary(
-    name = "grpc_objective_c_plugin",
-    srcs = [
-        "src/compiler/objective_c_plugin.cc",
-    ],
-    deps = [
-        ":grpc_plugin_support",
-        "//external:protobuf_compiler",
-    ],
-)
-
-cc_binary(
-    name = "grpc_python_plugin",
-    srcs = [
-        "src/compiler/python_plugin.cc",
-    ],
-    deps = [
-        ":grpc_plugin_support",
-        "//external:protobuf_compiler",
-    ],
-)
-
-cc_binary(
-    name = "grpc_ruby_plugin",
-    srcs = [
-        "src/compiler/ruby_plugin.cc",
-    ],
-    deps = [
-        ":grpc_plugin_support",
-        "//external:protobuf_compiler",
-    ],
-)
-
-objc_path = "src/objective-c"
-
-rx_library_path = objc_path + "/RxLibrary"
-
-objc_library(
-    name = "rx_library",
-    srcs = glob([
-        rx_library_path + "/*.m",
-        rx_library_path + "/transformations/*.m",
-    ]),
-    hdrs = glob([
-        rx_library_path + "/*.h",
-        rx_library_path + "/transformations/*.h",
-    ]),
-    includes = [objc_path],
-    deps = [
-        ":rx_library_private",
-    ],
-)
-
-objc_library(
-    name = "rx_library_private",
-    srcs = glob([rx_library_path + "/private/*.m"]),
-    hdrs = glob([rx_library_path + "/private/*.h"]),
-    visibility = ["//visibility:private"],
-)
-
-objc_client_path = objc_path + "/GRPCClient"
-
-objc_library(
-    name = "grpc_client",
-    srcs = glob([
-        objc_client_path + "/*.m",
-        objc_client_path + "/private/*.m",
-    ]),
-    hdrs = glob([
-        objc_client_path + "/*.h",
-        objc_client_path + "/private/*.h",
-    ]),
-    bundles = [":gRPCCertificates"],
-    includes = [objc_path],
-    deps = [
-        ":grpc_objc",
-        ":rx_library",
-    ],
-)
-
-objc_bundle_library(
-    # The choice of name is signicant here, since it determines the bundle name.
-    name = "gRPCCertificates",
-    resources = ["etc/roots.pem"],
-)
-
-proto_objc_rpc_path = objc_path + "/ProtoRPC"
-
-objc_library(
-    name = "proto_objc_rpc",
-    srcs = glob([
-        proto_objc_rpc_path + "/*.m",
-    ]),
-    hdrs = glob([
-        proto_objc_rpc_path + "/*.h",
-    ]),
-    includes = [objc_path],
-    deps = [
-        ":grpc_client",
-        ":rx_library",
-        "//external:protobuf_objc",
-    ],
-)
diff --git a/tensorflow/tensorboard/scripts/__init__.py b/third_party/grpc/BUILD
similarity index 100%
rename from tensorflow/tensorboard/scripts/__init__.py
rename to third_party/grpc/BUILD
diff --git a/third_party/grpc/grpc.patch b/third_party/grpc/grpc.patch
new file mode 100644
index 0000000000000000000000000000000000000000..6e5b4b02fba2c4c98c82a1366f090dc985bbcda0
--- /dev/null
+++ b/third_party/grpc/grpc.patch
@@ -0,0 +1,76 @@
+diff --git a/bazel/grpc_build_system.bzl b/bazel/grpc_build_system.bzl
+index f793cae56d..0295adb8ab 100644
+--- a/bazel/grpc_build_system.bzl
++++ b/bazel/grpc_build_system.bzl
+@@ -80,7 +80,7 @@ def grpc_cc_test(name, srcs = [], deps = [], external_deps = [], args = [], data
+     linkopts = ["-pthread"],
+   )
+ 
+-def grpc_cc_binary(name, srcs = [], deps = [], external_deps = [], args = [], data = [], language = "C++", testonly = False, linkshared = False):
++def grpc_cc_binary(name, srcs = [], deps = [], external_deps = [], args = [], data = [], language = "C++", testonly = False, linkshared = False, linkopts = []):
+   copts = []
+   if language.upper() == "C":
+     copts = ["-std=c99"]
+@@ -93,7 +93,7 @@ def grpc_cc_binary(name, srcs = [], deps = [], external_deps = [], args = [], da
+     linkshared = linkshared,
+     deps = deps + ["//external:" + dep for dep in external_deps],
+     copts = copts,
+-    linkopts = ["-pthread"],
++    linkopts = ["-pthread"] + linkopts,
+   )
+ 
+ def grpc_generate_one_off_targets():
+diff --git a/src/core/plugin_registry/grpc_unsecure_plugin_registry.c b/src/core/plugin_registry/grpc_unsecure_plugin_registry.c
+index 7eb599d81a..4cc2e30af4 100644
+--- a/src/core/plugin_registry/grpc_unsecure_plugin_registry.c
++++ b/src/core/plugin_registry/grpc_unsecure_plugin_registry.c
+@@ -28,18 +28,12 @@ extern void grpc_client_channel_init(void);
+ extern void grpc_client_channel_shutdown(void);
+ extern void grpc_inproc_plugin_init(void);
+ extern void grpc_inproc_plugin_shutdown(void);
+-extern void grpc_resolver_dns_ares_init(void);
+-extern void grpc_resolver_dns_ares_shutdown(void);
+ extern void grpc_resolver_dns_native_init(void);
+ extern void grpc_resolver_dns_native_shutdown(void);
+ extern void grpc_resolver_sockaddr_init(void);
+ extern void grpc_resolver_sockaddr_shutdown(void);
+-extern void grpc_resolver_fake_init(void);
+-extern void grpc_resolver_fake_shutdown(void);
+ extern void grpc_load_reporting_plugin_init(void);
+ extern void grpc_load_reporting_plugin_shutdown(void);
+-extern void grpc_lb_policy_grpclb_init(void);
+-extern void grpc_lb_policy_grpclb_shutdown(void);
+ extern void grpc_lb_policy_pick_first_init(void);
+ extern void grpc_lb_policy_pick_first_shutdown(void);
+ extern void grpc_lb_policy_round_robin_init(void);
+@@ -64,18 +58,12 @@ void grpc_register_built_in_plugins(void) {
+                        grpc_client_channel_shutdown);
+   grpc_register_plugin(grpc_inproc_plugin_init,
+                        grpc_inproc_plugin_shutdown);
+-  grpc_register_plugin(grpc_resolver_dns_ares_init,
+-                       grpc_resolver_dns_ares_shutdown);
+   grpc_register_plugin(grpc_resolver_dns_native_init,
+                        grpc_resolver_dns_native_shutdown);
+   grpc_register_plugin(grpc_resolver_sockaddr_init,
+                        grpc_resolver_sockaddr_shutdown);
+-  grpc_register_plugin(grpc_resolver_fake_init,
+-                       grpc_resolver_fake_shutdown);
+   grpc_register_plugin(grpc_load_reporting_plugin_init,
+                        grpc_load_reporting_plugin_shutdown);
+-  grpc_register_plugin(grpc_lb_policy_grpclb_init,
+-                       grpc_lb_policy_grpclb_shutdown);
+   grpc_register_plugin(grpc_lb_policy_pick_first_init,
+                        grpc_lb_policy_pick_first_shutdown);
+   grpc_register_plugin(grpc_lb_policy_round_robin_init,
+diff --git a/test/cpp/util/BUILD b/test/cpp/util/BUILD
+index 33240f6f69..d2e1f67f06 100644
+--- a/test/cpp/util/BUILD
++++ b/test/cpp/util/BUILD
+@@ -29,6 +29,7 @@ package(
+ grpc_cc_binary(
+     name = "testso.so",
+     srcs = [],
++    linkopts = ['-Wl,--no-undefined'],
+     linkshared = 1,
+     deps = ["//:grpc++_unsecure"],
+ )
diff --git a/third_party/html5lib.BUILD b/third_party/html5lib.BUILD
deleted file mode 100644
index 63aac14f1559a86f626a5d99db973111f86f92ae..0000000000000000000000000000000000000000
--- a/third_party/html5lib.BUILD
+++ /dev/null
@@ -1,17 +0,0 @@
-# Description:
-# Import of html5lib library.
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # BSD-like notice-style license, see LICENSE file
-
-exports_files(["LICENSE"])
-
-py_library(
-    name = "org_html5lib",
-    srcs = glob(["html5lib/**/*.py"]),
-    srcs_version = "PY2AND3",
-    deps = [
-        "@six_archive//:six",
-    ],
-)
diff --git a/third_party/js.bzl b/third_party/js.bzl
deleted file mode 100644
index 2d2339c95e5b537ae9ba0ebe8044808ebe411a36..0000000000000000000000000000000000000000
--- a/third_party/js.bzl
+++ /dev/null
@@ -1,420 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the 'License');
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# TensorBoard external JS dependencies (both infrastructure and frontend libs)
-load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
-load("@io_bazel_rules_closure//closure:defs.bzl", "web_library_external")
-
-
-  ##############################################################################
-  # TensorBoard Build Tools
-def tensorboard_js_workspace():
-  filegroup_external(
-      name = "org_nodejs",
-      # MIT with portions licensed:
-      # - MIT
-      # - Old MIT
-      # - 2-Clause-BSD
-      # - 3-Clause-BSD
-      # - ISC
-      # - Unicode
-      # - zlib
-      # - Artistic 2.0
-      licenses = ["notice"],
-      sha256_urls_extract_macos = {
-          "47109a00cac344d80296c195451bb5eee7c21727fcef1594384ddfe1f852957a": [
-              "http://mirror.bazel.build/nodejs.org/dist/v4.3.2/node-v4.3.2-darwin-x64.tar.xz",
-              "http://nodejs.org/dist/v4.3.2/node-v4.3.2-darwin-x64.tar.xz",
-          ],
-      },
-      sha256_urls_windows = {
-          "3d4cfca9dcec556a077a2324bf5bd165ea3e6e64a2bfd7fc6e7a1f0dc4eb552b": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/nodejs/node/v4.3.2/LICENSE",
-              "https://raw.githubusercontent.com/nodejs/node/v4.3.2/LICENSE",
-          ],
-          "606c44c42d17866c017c50c0afadad411d9492ac4281d2431b937f881911614e": [
-              "http://mirror.bazel.build/nodejs.org/dist/v4.3.2/win-x64/node.exe",
-              "http://nodejs.org/dist/v4.3.2/win-x64/node.exe",
-          ],
-          "451a40570099a95488d6438f175813629e0430f87f23c8659bc18dc42494820a": [
-              "http://mirror.bazel.build/nodejs.org/dist/v4.3.2/win-x64/node.lib",
-              "http://nodejs.org/dist/v4.3.2/win-x64/node.lib",
-          ],
-      },
-      sha256_urls_extract = {
-          "4350d0431b49697517c6cca5d66adf5f74eb9101c52f52ae959fa94225822d44": [
-              "http://mirror.bazel.build/nodejs.org/dist/v4.3.2/node-v4.3.2-linux-x64.tar.xz",
-              "http://nodejs.org/dist/v4.3.2/node-v4.3.2-linux-x64.tar.xz",
-          ],
-      },
-      strip_prefix = {
-          "node-v4.3.2-darwin-x64.tar.xz": "node-v4.3.2-darwin-x64",
-          "node-v4.3.2-linux-x64.tar.xz": "node-v4.3.2-linux-x64",
-      },
-      executable = [
-          "node",
-          "node.exe",
-      ],
-  )
-  
-  filegroup_external(
-      name = "com_microsoft_typescript",
-      licenses = ["notice"],  # Apache 2.0
-      sha256_urls = {
-          "a7d00bfd54525bc694b6e32f64c7ebcf5e6b7ae3657be5cc12767bce74654a47": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/Microsoft/TypeScript/v2.3.1/LICENSE.txt",
-              "https://raw.githubusercontent.com/Microsoft/TypeScript/v2.3.1/LICENSE.txt",
-          ],
-          "8465342c318f9c4cf0a29b109fa63ee3742dd4dc7080d05d9fd8f604814d04cf": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/Microsoft/TypeScript/v2.3.1/lib/tsc.js",
-              "https://raw.githubusercontent.com/Microsoft/TypeScript/v2.3.1/lib/tsc.js",
-          ],
-          "a67e36da3029d232e4e938e61a0a3302f516d71e7100d54dbf5362ad8618e994": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/Microsoft/TypeScript/v2.3.1/lib/lib.es6.d.ts",
-              "https://raw.githubusercontent.com/Microsoft/TypeScript/v2.3.1/lib/lib.es6.d.ts",
-          ],
-      },
-      extra_build_file_content = "\n".join([
-          "sh_binary(",
-          "    name = \"tsc\",",
-          "    srcs = [\"tsc.sh\"],",
-          "    data = [",
-          "        \"tsc.js\",",
-          "        \"@org_nodejs\",",
-          "    ],",
-          ")",
-          "",
-          "genrule(",
-          "    name = \"tsc_sh\",",
-          "    outs = [\"tsc.sh\"],",
-          "    cmd = \"cat >$@ <<'EOF'\\n\" +",
-          "          \"#!/bin/bash\\n\" +",
-          "          \"NODE=external/org_nodejs/bin/node\\n\" +",
-          "          \"if [[ -e external/org_nodejs/node.exe ]]; then\\n\" +",
-          "          \"  NODE=external/org_nodejs/node.exe\\n\" +",
-          "          \"fi\\n\" +",
-          "          \"exec $${NODE} external/com_microsoft_typescript/tsc.js \\\"$$@\\\"\\n\" +",
-          "          \"EOF\",",
-          "    executable = True,",
-          ")",
-      ]),
-  )
-
-
-  native.new_http_archive(
-      name = "io_angular_clutz",
-      build_file = "//third_party:clutz.BUILD",
-      sha256 = "2981de41d1ff4774b544423da9a2cd8beb3be649e95aef2ef2fd83957300b3fe",
-      strip_prefix = "clutz-b0db5ade9bb535d387f05292316c422790c9848e",
-      urls = [
-          "http://mirror.bazel.build/github.com/angular/clutz/archive/b0db5ade9bb535d387f05292316c422790c9848e.tar.gz",  # 2017-05-22
-          "https://github.com/angular/clutz/archive/b0db5ade9bb535d387f05292316c422790c9848e.tar.gz",
-      ],
-  )
-
-  filegroup_external(
-      name = "com_google_javascript_closure_compiler_externs",
-      licenses = ["notice"],  # Apache 2.0
-      sha256_urls_extract = {
-          "0f515a6ebfa138490b3c5ea9f3591ea1a7e4a930d3074f18b3eca86084ad9b66": [
-              "http://mirror.bazel.build/github.com/google/closure-compiler/archive/b37e6000001b0a6bf4c0be49024ebda14a8711d9.tar.gz",  # 2017-06-02
-              "https://github.com/google/closure-compiler/archive/b37e6000001b0a6bf4c0be49024ebda14a8711d9.tar.gz",
-          ],
-      },
-      strip_prefix = {"b37e6000001b0a6bf4c0be49024ebda14a8711d9.tar.gz": "closure-compiler-b37e6000001b0a6bf4c0be49024ebda14a8711d9/externs"},
-  )
-
-  filegroup_external(
-      name = "com_google_javascript_closure_compiler_externs_polymer",
-      licenses = ["notice"],  # Apache 2.0
-      sha256_urls = {
-          "23baad9a200a717a821c6df504c84d3a893d7ea9102b14876eb80097e3b94292": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/google/closure-compiler/0e8dc5597a295ee259e3fecd98d6535dc621232f/contrib/externs/polymer-1.0.js",  # 2017-05-27
-              "https://raw.githubusercontent.com/google/closure-compiler/0e8dc5597a295ee259e3fecd98d6535dc621232f/contrib/externs/polymer-1.0.js",
-          ],
-      },
-  )
-
-  filegroup_external(
-      name = "org_threejs",
-      # no @license header
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "7aff264bd84c90bed3c72a4dc31db8c19151853c6df6980f52b01d3e9872c82d": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/build/three.js",
-              "https://raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/build/three.js",
-          ],
-          "0e98ded15bb7fe398a655667e76b39909d36c0973a8950d01c62f65f93161c27": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/examples/js/controls/OrbitControls.js",
-              "https://raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/examples/js/controls/OrbitControls.js",
-          ],
-      },
-  )
-  
-  ##############################################################################
-  # TensorBoard JavaScript Production Dependencies
-  web_library_external(
-      name = "com_lodash",
-      licenses = ["notice"],  # MIT
-      sha256 = "0e88207e5f90af4ce8790d6e1e7d09d2702d81bce0bafdc253d18c0a5bf7661e",
-      urls = [
-          "http://mirror.bazel.build/github.com/lodash/lodash/archive/3.10.1.tar.gz",
-          "https://github.com/lodash/lodash/archive/3.10.1.tar.gz",
-      ],
-      strip_prefix = "lodash-3.10.1",
-      path = "/lodash",
-      srcs = ["lodash.js"],
-  )
-
-  filegroup_external(
-      name = "com_numericjs",
-      # no @license header
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "0e94aada97f12dee6118064add9170484c55022f5d53206ee4407143cd36ddcd": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/sloisel/numeric/v1.2.6/license.txt",
-              "https://raw.githubusercontent.com/sloisel/numeric/v1.2.6/license.txt",
-          ],
-          "dfaca3b8485bee735788cc6eebca82ea25719adc1fb8911c7799c6bd5a95df3b": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/sloisel/numeric/v1.2.6/src/numeric.js",
-              "https://raw.githubusercontent.com/sloisel/numeric/v1.2.6/src/numeric.js",
-          ],
-      },
-  )
-
-  filegroup_external(
-      name = "com_palantir_plottable",
-      # no @license header
-      licenses = ["notice"],  # MIT
-      sha256_urls_extract = {
-          # Plottable doesn't have a release tarball on GitHub. Using the
-          # sources directly from git also requires running Node tooling
-          # beforehand to generate files. NPM is the only place to get it.
-          "e3159beb279391c47433789f22b32bac88488cfcad6c0b6ec8605ce6b0081b0d": [
-              "http://mirror.bazel.build/registry.npmjs.org/plottable/-/plottable-3.1.0.tgz",
-              "https://registry.npmjs.org/plottable/-/plottable-3.1.0.tgz",
-          ],
-      },
-  )
-
-  filegroup_external(
-      name = "io_github_cpettitt_dagre",
-      # no @license header
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "6a349742a6cb219d5a2fc8d0844f6d89a6efc62e20c664450d884fc7ff2d6015": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/cpettitt/dagre/v0.7.4/LICENSE",
-              "https://raw.githubusercontent.com/cpettitt/dagre/v0.7.4/LICENSE",
-          ],
-          "7323829ddd77924a69e2b1235ded3eac30acd990da0f037e0fbd3c8e9035b50d": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/cpettitt/dagre/v0.7.4/dist/dagre.core.js",
-              "https://raw.githubusercontent.com/cpettitt/dagre/v0.7.4/dist/dagre.core.js",
-          ],
-      },
-  )
-
-  filegroup_external(
-      name = "io_github_cpettitt_graphlib",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "6a349742a6cb219d5a2fc8d0844f6d89a6efc62e20c664450d884fc7ff2d6015": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/cpettitt/graphlib/v1.0.7/LICENSE",
-              "https://raw.githubusercontent.com/cpettitt/graphlib/v1.0.7/LICENSE",
-          ],
-          "772045d412b1513b549be991c2e1846c38019429d43974efcae943fbe83489bf": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/cpettitt/graphlib/v1.0.7/dist/graphlib.core.js",
-              "https://raw.githubusercontent.com/cpettitt/graphlib/v1.0.7/dist/graphlib.core.js",
-          ],
-      },
-  )
-
-  filegroup_external(
-      name = "io_github_waylonflinn_weblas",
-      # no @license header
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "633f2861a9a862b9cd7967e841e14dd3527912f209d6563595774fa31e3d84cb": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/waylonflinn/weblas/v0.9.0/LICENSES",
-              "https://raw.githubusercontent.com/waylonflinn/weblas/v0.9.0/LICENSE",
-          ],
-          "f138fce57f673ca8a633f4aee5ae5b6fcb6ad0de59069a42a74e996fd04d8fcc": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/waylonflinn/weblas/v0.9.0/dist/weblas.js",
-              "https://raw.githubusercontent.com/waylonflinn/weblas/v0.9.0/dist/weblas.js",
-          ],
-      },
-  )
-
-  filegroup_external(
-      name = "org_d3js",
-      # no @license header
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256_urls_extract = {
-          "b5fac5b296bc196e6aa7b59f9e33986fc44d23d59a0e211705187be9e35b943d": [
-              "http://mirror.bazel.build/github.com/d3/d3/releases/download/v4.8.0/d3.zip",
-              "https://github.com/d3/d3/releases/download/v4.8.0/d3.zip",
-          ],
-      },
-      # TODO(jart): Use srcs=["d3.js"] instead of this once supported.
-      generated_rule_name = "all_files",
-      extra_build_file_content = "\n".join([
-          "filegroup(",
-          "    name = \"org_d3js\",",
-          "    srcs = [\"d3.js\"],",
-          ")",
-      ]),
-  )
-
-  filegroup_external(
-      name = "org_chromium_catapult_vulcanized_trace_viewer",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256_urls = {
-          "f0df289ba9d03d857ad1c2f5918861376b1510b71588ffc60eff5c7a7bfedb09": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/catapult-project/catapult/2f7ee994984f3ebd3dd3dc3e05777bf180ec2ee8/LICENSE",
-              "https://raw.githubusercontent.com/catapult-project/catapult/2f7ee994984f3ebd3dd3dc3e05777bf180ec2ee8/LICENSE",
-          ],
-          "9e99e79439ea5a1471bd4dd325bd6733e133bcb3da4df4b878ed6d2aec7c8d86": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/catapult-project/catapult/2f7ee994984f3ebd3dd3dc3e05777bf180ec2ee8/trace_viewer_full.html",
-              "https://raw.githubusercontent.com/catapult-project/catapult/2f7ee994984f3ebd3dd3dc3e05777bf180ec2ee8/trace_viewer_full.html"
-          ],
-      },
-  )
-
-  ##############################################################################
-  # TensorBoard Testing Dependencies
-  web_library_external(
-      name = "org_npmjs_registry_accessibility_developer_tools",
-      licenses = ["notice"],  # Apache License 2.0
-      sha256 = "1d6a72f401c9d53f68238c617dd43a05cd85ca5aa2e676a5b3c352711448e093",
-      urls = [
-          "http://mirror.bazel.build/registry.npmjs.org/accessibility-developer-tools/-/accessibility-developer-tools-2.10.0.tgz",
-          "https://registry.npmjs.org/accessibility-developer-tools/-/accessibility-developer-tools-2.10.0.tgz",
-      ],
-      strip_prefix = "package",
-      path = "/accessibility-developer-tools",
-      suppress = ["strictDependencies"],
-  )
-
-  web_library_external(
-      name = "org_npmjs_registry_async",
-      licenses = ["notice"],  # MIT
-      sha256 = "08655255ae810bf4d1cb1642df57658fcce823776d3ba8f4b46f4bbff6c87ece",
-      urls = [
-          "http://mirror.bazel.build/registry.npmjs.org/async/-/async-1.5.0.tgz",
-          "https://registry.npmjs.org/async/-/async-1.5.0.tgz",
-      ],
-      strip_prefix = "package",
-      path = "/async",
-  )
-
-  web_library_external(
-      name = "org_npmjs_registry_chai",
-      licenses = ["notice"],  # MIT
-      sha256 = "aca8137bed5bb295bd7173325b7ad604cd2aeb341d739232b4f9f0b26745be90",
-      urls = [
-          "http://mirror.bazel.build/registry.npmjs.org/chai/-/chai-3.5.0.tgz",
-          "https://registry.npmjs.org/chai/-/chai-3.5.0.tgz",
-      ],
-      strip_prefix = "package",
-      path = "/chai",
-  )
-
-  web_library_external(
-      name = "org_npmjs_registry_mocha",
-      licenses = ["notice"],  # MIT
-      sha256 = "13ef37a071196a2fba680799b906555d3f0ab61e80a7e8f73f93e77914590dd4",
-      urls = [
-          "http://mirror.bazel.build/registry.npmjs.org/mocha/-/mocha-2.5.3.tgz",
-          "https://registry.npmjs.org/mocha/-/mocha-2.5.3.tgz",
-      ],
-      suppress = ["strictDependencies"],
-      strip_prefix = "package",
-      path = "/mocha",
-  )
-
-  web_library_external(
-      name = "org_npmjs_registry_sinon",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "49edb057695fc9019aae992bf7e677a07de7c6ce2bf9f9facde4a245045d1532",
-      urls = [
-          "http://mirror.bazel.build/registry.npmjs.org/sinon/-/sinon-1.17.4.tgz",
-          "https://registry.npmjs.org/sinon/-/sinon-1.17.4.tgz",
-      ],
-      strip_prefix = "package/lib",
-      path = "/sinonjs",
-  )
-
-  web_library_external(
-      name = "org_npmjs_registry_sinon_chai",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "b85fc56f713832960b56fe9269ee4bb2cd41edd2ceb130b0936e5bdbed5dea63",
-      urls = [
-          "http://mirror.bazel.build/registry.npmjs.org/sinon-chai/-/sinon-chai-2.8.0.tgz",
-          "https://registry.npmjs.org/sinon-chai/-/sinon-chai-2.8.0.tgz",
-      ],
-      strip_prefix = "package",
-      path = "/sinon-chai",
-  )
-
-  web_library_external(
-      name = "org_npmjs_registry_stacky",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "c659e60f7957d9d80c23a7aacc4d71b19c6421a08f91174c0062de369595acae",
-      urls = [
-          "http://mirror.bazel.build/registry.npmjs.org/stacky/-/stacky-1.3.1.tgz",
-          "https://registry.npmjs.org/stacky/-/stacky-1.3.1.tgz",
-      ],
-      strip_prefix = "package",
-      path = "/stacky",
-  )
-
-  web_library_external(
-      name = "org_npmjs_registry_web_component_tester",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "9d4ebd4945df8a936916d4d32b7f280f2a3afa35f79e7ca8ad3ed0a42770c537",
-      urls = [
-          "http://mirror.bazel.build/registry.npmjs.org/web-component-tester/-/web-component-tester-4.3.6.tgz",
-          "https://registry.npmjs.org/web-component-tester/-/web-component-tester-4.3.6.tgz",
-      ],
-      strip_prefix = "package",
-      path = "/web-component-tester",
-      suppress = [
-          "absolutePaths",
-          "strictDependencies",
-      ],
-      deps = [
-          "@com_lodash",
-          "@org_npmjs_registry_accessibility_developer_tools",
-          "@org_npmjs_registry_async",
-          "@org_npmjs_registry_chai",
-          "@org_npmjs_registry_mocha",
-          "@org_npmjs_registry_sinon",
-          "@org_npmjs_registry_sinon_chai",
-          "@org_npmjs_registry_stacky",
-          "@org_polymer_test_fixture",
-      ],
-  )
-
-  web_library_external(
-      name = "org_polymer_test_fixture",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "59d6cfb1187733b71275becfea181fe0aa1f734df5ff77f5850c806bbbf9a0d9",
-      strip_prefix = "test-fixture-2.0.1",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/test-fixture/archive/v2.0.1.tar.gz",
-          "https://github.com/PolymerElements/test-fixture/archive/v2.0.1.tar.gz",
-      ],
-      path = "/test-fixture",
-      exclude = ["test/**"],
-  )
-
diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD
index 32266997a7e54c09525a60a48d2ad330941e2668..2d96406d27047ab9d518a79584cfae8b43c9feb4 100644
--- a/third_party/llvm/llvm.BUILD
+++ b/third_party/llvm/llvm.BUILD
@@ -257,6 +257,16 @@ cc_library(
     includes = ["include"],
 )
 
+# A creator of an empty file include/llvm/Support/VCSRevision.h.
+# This is usually populated by the upstream build infrastructure, but in this
+# case we leave it blank. See upstream revision r300160.
+genrule(
+    name = "vcs_revision_gen",
+    srcs = [],
+    outs = ["include/llvm/Support/VCSRevision.h"],
+    cmd = "echo '' > \"$@\"",
+)
+
 # Rules that apply the LLVM tblgen tool.
 gentbl(
     name = "intrinsics_gen",
@@ -376,6 +386,7 @@ llvm_target_list = [
         "tbl_outs": [
             ("-gen-register-bank", "lib/Target/ARM/ARMGenRegisterBank.inc"),
             ("-gen-register-info", "lib/Target/ARM/ARMGenRegisterInfo.inc"),
+            ("-gen-searchable-tables", "lib/Target/ARM/ARMGenSystemRegister.inc"),
             ("-gen-instr-info", "lib/Target/ARM/ARMGenInstrInfo.inc"),
             ("-gen-emitter", "lib/Target/ARM/ARMGenMCCodeEmitter.inc"),
             ("-gen-pseudo-lowering", "lib/Target/ARM/ARMGenMCPseudoLowering.inc"),
@@ -453,6 +464,7 @@ llvm_target_list = [
             "include/llvm/IR/Intrinsics*.td",
             "include/llvm/TableGen/*.td",
             "include/llvm/Target/*.td",
+            "include/llvm/Target/GlobalISel/*.td",
         ]),
     )
     for target in llvm_target_list
@@ -868,6 +880,7 @@ cc_library(
     deps = [
         ":arm_desc",
         ":arm_info",
+        ":arm_utils",
         ":config",
         ":mc",
         ":mc_parser",
@@ -886,12 +899,14 @@ cc_library(
         "include/llvm/Target/ARM/InstPrinter/*.h",
         "include/llvm/Target/ARM/InstPrinter/*.def",
         "include/llvm/Target/ARM/InstPrinter/*.inc",
+        "lib/Target/ARM/*.h",
         "lib/Target/ARM/InstPrinter/*.h",
     ]),
     copts = ["-Iexternal/llvm/lib/Target/ARM"],
     deps = [
         ":arm_info",
         ":arm_target_gen",
+        ":arm_utils",
         ":config",
         ":mc",
         ":support",
@@ -917,6 +932,7 @@ cc_library(
         ":arm_asm_printer",
         ":arm_desc",
         ":arm_info",
+        ":arm_utils",
         ":asm_printer",
         ":code_gen",
         ":config",
@@ -1005,6 +1021,29 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "arm_utils",
+    srcs = glob([
+        "lib/Target/ARM/Utils/*.c",
+        "lib/Target/ARM/Utils/*.cpp",
+        "lib/Target/ARM/Utils/*.inc",
+        "lib/Target/ARM/MCTargetDesc/*.h",
+    ]),
+    hdrs = glob([
+        "include/llvm/Target/ARM/Utils/*.h",
+        "include/llvm/Target/ARM/Utils/*.def",
+        "include/llvm/Target/ARM/Utils/*.inc",
+        "lib/Target/ARM/Utils/*.h",
+    ]),
+    copts = ["-Iexternal/llvm/lib/Target/ARM"],
+    deps = [
+        ":arm_target_gen",
+        ":config",
+        ":mc",
+        ":support",
+    ],
+)
+
 cc_library(
     name = "asm_parser",
     srcs = glob([
@@ -1067,6 +1106,8 @@ cc_library(
         "include/llvm/BinaryFormat/*.h",
         "include/llvm/BinaryFormat/*.def",
         "include/llvm/BinaryFormat/*.inc",
+        "include/llvm/BinaryFormat/ELFRelocs/*.def",
+        "include/llvm/BinaryFormat/WasmRelocs/*.def",
     ]),
     deps = [
         ":config",
@@ -1116,6 +1157,7 @@ cc_library(
         ":config",
         ":core",
         ":mc",
+        ":object",
         ":support",
     ],
 )
@@ -1165,6 +1207,7 @@ cc_library(
         "lib/IR/*.h",
     ]),
     hdrs = glob([
+        "include/llvm/Analysis/*.def",
         "include/llvm/IR/*.h",
         "include/llvm/IR/*.def",
         "include/llvm/IR/*.inc",
@@ -1194,6 +1237,7 @@ cc_library(
         "include/llvm/DebugInfo/CodeView/*.inc",
     ]),
     deps = [
+        ":binary_format",
         ":config",
         ":debug_info_msf",
         ":support",
@@ -1426,6 +1470,7 @@ cc_library(
         "include/llvm/MC/*.inc",
     ]),
     deps = [
+        ":binary_format",
         ":config",
         ":debug_info_code_view",
         ":support",
@@ -1921,6 +1966,8 @@ cc_library(
         "lib/Support/Unix/*.h",
         "include/llvm-c/*.h",
         "include/llvm/CodeGen/MachineValueType.h",
+        "include/llvm/BinaryFormat/COFF.h",
+        "include/llvm/BinaryFormat/MachO.h",
         "lib/Support/*.h",
     ]),
     hdrs = glob([
@@ -1931,7 +1978,9 @@ cc_library(
         "include/llvm/Support/ELFRelocs/*.def",
         "include/llvm/Support/WasmRelocs/*.def",
     ]) + [
+        "include/llvm/BinaryFormat/MachO.def",
         "include/llvm/Support/DataTypes.h",
+        "include/llvm/Support/VCSRevision.h",
         "include/llvm/ExecutionEngine/ObjectMemoryBuffer.h",
     ],
     deps = [
@@ -1975,6 +2024,8 @@ cc_library(
         "lib/Target/*.h",
     ]),
     hdrs = glob([
+        "include/llvm/CodeGen/*.h",
+        "include/llvm/CodeGen/*.def",
         "include/llvm/Target/*.h",
         "include/llvm/Target/*.def",
         "include/llvm/Target/*.inc",
diff --git a/third_party/lmdb.BUILD b/third_party/lmdb.BUILD
index 7c6e3dc3f0531f7e2dc3c4ad782a6a02a6b4e514..9b3e1d97c83b44bba97e5513ae41c1511cf33ce7 100644
--- a/third_party/lmdb.BUILD
+++ b/third_party/lmdb.BUILD
@@ -19,8 +19,8 @@ cc_library(
         "-w",
     ],
     linkopts = select({
-        ":windows": ["-Wl,advapi32.lib"],  # InitializeSecurityDescriptor, SetSecurityDescriptorDacl
-        ":windows_msvc": ["-Wl,advapi32.lib"],
+        ":windows": ["-DEFAULTLIB:advapi32.lib"],  # InitializeSecurityDescriptor, SetSecurityDescriptorDacl
+        ":windows_msvc": ["-DEFAULTLIB:advapi32.lib"],
         "//conditions:default": ["-lpthread"],
     }),
     visibility = ["//visibility:public"],
diff --git a/third_party/markdown.BUILD b/third_party/markdown.BUILD
deleted file mode 100644
index fa3e85d5304083ed0de521c93c5ea1df1f477349..0000000000000000000000000000000000000000
--- a/third_party/markdown.BUILD
+++ /dev/null
@@ -1,15 +0,0 @@
-# Description:
-#   Markdown processor
-
-package(default_visibility = ["//visibility:public"])
-
-# This software says they use a BSD license.
-licenses(["notice"])
-
-exports_files(["LICENSE.md"])
-
-py_library(
-    name = "org_pythonhosted_markdown",
-    srcs = glob(["markdown/**/*.py"]),
-    srcs_version = "PY2AND3",
-)
diff --git a/third_party/mkl/BUILD b/third_party/mkl/BUILD
index 8c86766effa97a08f6089194a5d9202da0e003b3..b27d341404c4ee1ca1e87ff3b9f427ec52eba739 100644
--- a/third_party/mkl/BUILD
+++ b/third_party/mkl/BUILD
@@ -1,4 +1,6 @@
-licenses(["restricted"])  # MPL2, portions GPL v3, LGPL v3, BSD-like     TODO
+licenses(["notice"])  # 3-Clause BSD
+
+exports_files(["LICENSE"])
 
 config_setting(
     name = "using_mkl",
@@ -16,10 +18,9 @@ load(
 cc_library(
     name = "intel_binary_blob",
     srcs = if_mkl([
-        "libdl.so.2",
-        "libmklml_intel.so",
-        "libiomp5.so",
+        "@mkl//:libmklml_intel.so",
+        "@mkl//:libiomp5.so",
     ]),
-    includes = ["."],
     visibility = ["//visibility:public"],
+    deps = ["@mkl//:mkl_headers"],
 )
diff --git a/third_party/mkl/LICENSE b/third_party/mkl/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..9c8f3ea0871e0bfe81da0fa6e7c1d7d156dc380e
--- /dev/null
+++ b/third_party/mkl/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
\ No newline at end of file
diff --git a/third_party/mkl/build_defs.bzl b/third_party/mkl/build_defs.bzl
index 9a28b312c2de68481df774875330fd570f336ad3..533c0766c71a18e614f2f101a4e74b7f35fd26c3 100644
--- a/third_party/mkl/build_defs.bzl
+++ b/third_party/mkl/build_defs.bzl
@@ -1,4 +1,16 @@
-# Macros for building MKL code.
+# -*- Python -*-
+"""Skylark macros for MKL.
+if_mkl is a conditional to check if MKL is enabled or not.
+
+mkl_repository is a repository rule for creating MKL repository rule that can
+be pointed to either a local folder, or download it from the internet.
+mkl_repository depends on the following environment variables:
+  * `TF_MKL_ROOT`: The root folder where a copy of libmkl is located.
+"""
+
+
+_TF_MKL_ROOT = "TF_MKL_ROOT"
+
 
 def if_mkl(if_true, if_false = []):
     """Shorthand for select()'ing on whether we're building with MKL.
@@ -11,3 +23,46 @@ def if_mkl(if_true, if_false = []):
         "//third_party/mkl:using_mkl": if_true,
         "//conditions:default": if_false
     })
+
+
+def _enable_local_mkl(repository_ctx):
+  return _TF_MKL_ROOT in repository_ctx.os.environ
+
+
+def _mkl_autoconf_impl(repository_ctx):
+  """Implementation of the local_mkl_autoconf repository rule."""
+
+  if _enable_local_mkl(repository_ctx):
+    # Symlink lib and include local folders.
+    mkl_root = repository_ctx.os.environ[_TF_MKL_ROOT]
+    mkl_lib_path = "%s/lib" % mkl_root
+    repository_ctx.symlink(mkl_lib_path, "lib")
+    mkl_include_path = "%s/include" % mkl_root
+    repository_ctx.symlink(mkl_include_path, "include")
+    mkl_license_path = "%s/license.txt" % mkl_root
+    repository_ctx.symlink(mkl_license_path, "license.txt")
+  else:
+    # setup remote mkl repository.
+    repository_ctx.download_and_extract(
+        repository_ctx.attr.urls,
+        sha256=repository_ctx.attr.sha256,
+        stripPrefix=repository_ctx.attr.strip_prefix,
+    )
+
+  # Also setup BUILD file.
+  repository_ctx.symlink(repository_ctx.attr.build_file, "BUILD")
+
+
+mkl_repository = repository_rule(
+    implementation = _mkl_autoconf_impl,
+    environ = [
+        _TF_MKL_ROOT,
+    ],
+    attrs = {
+        "build_file": attr.label(),
+        "repository": attr.string(),
+        "urls": attr.string_list(default = []),
+        "sha256": attr.string(default = ""),
+        "strip_prefix": attr.string(default = ""),
+    },
+)
diff --git a/third_party/mkl/mkl.BUILD b/third_party/mkl/mkl.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..8db97232e156b46091b379b0771239f55d6ea5ad
--- /dev/null
+++ b/third_party/mkl/mkl.BUILD
@@ -0,0 +1,30 @@
+licenses(["notice"])  # 3-Clause BSD
+
+exports_files(["license.txt"])
+
+filegroup(
+    name = "LICENSE",
+    srcs = [
+        "license.txt",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "mkl_headers",
+    srcs = glob(["include/*"]),
+    includes = ["include"],
+    visibility = ["//visibility:public"],
+)
+
+filegroup(
+    name = "libmklml_intel.so",
+    srcs = ["lib/libmklml_intel.so"],
+    visibility = ["//visibility:public"],
+)
+
+filegroup(
+    name = "libiomp5.so",
+    srcs = ["lib/libiomp5.so"],
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/nanopb.BUILD b/third_party/nanopb.BUILD
deleted file mode 100644
index d21866911b862f0d4adf76c3a07e2732128a6102..0000000000000000000000000000000000000000
--- a/third_party/nanopb.BUILD
+++ /dev/null
@@ -1,23 +0,0 @@
-# Description:
-#   Nanopb, a tiny ANSI C protobuf implementation for use on embedded devices.
-
-licenses(["notice"])  # zlib license
-
-exports_files(["LICENSE.txt"])
-
-cc_library(
-    name = "nanopb",
-    srcs = [
-        "pb_common.c",
-        "pb_decode.c",
-        "pb_encode.c",
-    ],
-    hdrs = [
-        "pb.h",
-        "pb_common.h",
-        "pb_decode.h",
-        "pb_encode.h",
-    ],
-    includes = ["."],
-    visibility = ["//visibility:public"],
-)
diff --git a/third_party/polymer.bzl b/third_party/polymer.bzl
deleted file mode 100644
index bd6e05803cf39192092fb20015c7abe520e8903e..0000000000000000000000000000000000000000
--- a/third_party/polymer.bzl
+++ /dev/null
@@ -1,1335 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the 'License');
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# TensorBoard Polymer Dependencies
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "web_library_external")
-
-def tensorboard_polymer_workspace():
-  web_library_external(
-      name = "org_polymer_font_roboto",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "fae51429b56a4a4c15f1f0c23b733c7095940cc9c04c275fa7adb3bf055b23b3",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/font-roboto/archive/v1.0.1.tar.gz",
-          "https://github.com/PolymerElements/font-roboto/archive/v1.0.1.tar.gz",
-      ],
-      strip_prefix = "font-roboto-1.0.1",
-      path = "/font-roboto",
-      srcs = ["roboto.html"],
-  )
-  
-  web_library_external(
-      name = "org_polymer_hydrolysis",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "703b50f6b00f9e0546b5a3451da57bb20f77a166e27e4967923b9e835bab9b80",
-      urls = [
-          "http://mirror.bazel.build/github.com/Polymer/polymer-analyzer/archive/v1.19.3.tar.gz",
-          "https://github.com/Polymer/polymer-analyzer/archive/v1.19.3.tar.gz",
-      ],
-      strip_prefix = "polymer-analyzer-1.19.3",
-      path = "/hydrolysis",
-      srcs = [
-          "hydrolysis-analyzer.html",
-          "hydrolysis.html",
-          "hydrolysis.js",
-      ],
-      deps = ["@org_polymer"],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_a11y_announcer",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "6bce143db7a374a68535ec8b861a5f30e81f2f1e4ee36a55bda2a891f6fd2818",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-a11y-announcer/archive/v1.0.5.tar.gz",
-          "https://github.com/PolymerElements/iron-a11y-announcer/archive/v1.0.5.tar.gz",
-      ],
-      strip_prefix = "iron-a11y-announcer-1.0.5",
-      path = "/iron-a11y-announcer",
-      srcs = ["iron-a11y-announcer.html"],
-      deps = ["@org_polymer"],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_a11y_keys_behavior",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "6823efc47a83208fd51d39c5a1d3eb0c0bebc705df1ce01310509da22a13ebd2",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-a11y-keys-behavior/archive/v1.1.8.tar.gz",
-          "https://github.com/PolymerElements/iron-a11y-keys-behavior/archive/v1.1.8.tar.gz",
-      ],
-      strip_prefix = "iron-a11y-keys-behavior-1.1.8",
-      path = "/iron-a11y-keys-behavior",
-      srcs = ["iron-a11y-keys-behavior.html"],
-      deps = ["@org_polymer"],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_ajax",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "9162d8af4611e911ac3ebbfc08bb7038ac04f6e79a9287b1476fe36ad6770bc5",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-ajax/archive/v1.2.0.tar.gz",
-          "https://github.com/PolymerElements/iron-ajax/archive/v1.2.0.tar.gz",
-      ],
-      strip_prefix = "iron-ajax-1.2.0",
-      path = "/iron-ajax",
-      srcs = [
-          "iron-ajax.html",
-          "iron-request.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_promise_polyfill",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_autogrow_textarea",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "50bbb901d2c8f87462e3552e3d671a552faa12c37c485e548d7a234ebffbc427",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-autogrow-textarea/archive/v1.0.12.tar.gz",
-          "https://github.com/PolymerElements/iron-autogrow-textarea/archive/v1.0.12.tar.gz",
-      ],
-      strip_prefix = "iron-autogrow-textarea-1.0.12",
-      path = "/iron-autogrow-textarea",
-      srcs = ["iron-autogrow-textarea.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_behaviors",
-          "@org_polymer_iron_flex_layout",
-          "@org_polymer_iron_form_element_behavior",
-          "@org_polymer_iron_validatable_behavior",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_behaviors",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "a1e8d4b7a13f3d36beba9c2a6b186ed33a53e6af2e79f98c1fcc7e85e7b53f89",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-behaviors/archive/v1.0.17.tar.gz",
-          "https://github.com/PolymerElements/iron-behaviors/archive/v1.0.17.tar.gz",
-      ],
-      strip_prefix = "iron-behaviors-1.0.17",
-      path = "/iron-behaviors",
-      srcs = [
-          "iron-button-state.html",
-          "iron-control-state.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_a11y_keys_behavior",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_checked_element_behavior",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "539a0e1c4df0bc702d3bd342388e4e56c77ec4c2066cce69e41426a69f92e8bd",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-checked-element-behavior/archive/v1.0.4.tar.gz",
-          "https://github.com/PolymerElements/iron-checked-element-behavior/archive/v1.0.4.tar.gz",
-      ],
-      strip_prefix = "iron-checked-element-behavior-1.0.4",
-      path = "/iron-checked-element-behavior",
-      srcs = ["iron-checked-element-behavior.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_form_element_behavior",
-          "@org_polymer_iron_validatable_behavior",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_component_page",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "3636e8b9a1f229fc33b5aad3933bd02a9825f66e679a0be31855d7c8245c4b4b",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-component-page/archive/v1.1.4.tar.gz",
-          "https://github.com/PolymerElements/iron-component-page/archive/v1.1.4.tar.gz",
-      ],
-      strip_prefix = "iron-component-page-1.1.4",
-      path = "/iron-component-page",
-      srcs = ["iron-component-page.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_hydrolysis",
-          "@org_polymer_iron_ajax",
-          "@org_polymer_iron_doc_viewer",
-          "@org_polymer_iron_flex_layout",
-          "@org_polymer_iron_icons",
-          "@org_polymer_iron_selector",
-          "@org_polymer_paper_header_panel",
-          "@org_polymer_paper_styles",
-          "@org_polymer_paper_toolbar",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_collapse",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "275808994a609a2f9923e2dd2db1957945ab141ba840eadc33f19e1f406d600e",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-collapse/archive/v1.0.8.tar.gz",
-          "https://github.com/PolymerElements/iron-collapse/archive/v1.0.8.tar.gz",
-      ],
-      strip_prefix = "iron-collapse-1.0.8",
-      path = "/iron-collapse",
-      srcs = ["iron-collapse.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_resizable_behavior",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_demo_helpers",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "aa7458492a6ac3d1f6344640a4c2ab07bce64e7ad0422b83b5d665707598cce6",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-demo-helpers/archive/v1.1.0.tar.gz",
-          "https://github.com/PolymerElements/iron-demo-helpers/archive/v1.1.0.tar.gz",
-      ],
-      strip_prefix = "iron-demo-helpers-1.1.0",
-      path = "/iron-demo-helpers",
-      srcs = [
-          "demo-pages-shared-styles.html",
-          "demo-snippet.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_flex_layout",
-          "@org_polymer_iron_icons",
-          "@org_polymer_marked_element",
-          "@org_polymer_paper_icon_button",
-          "@org_polymer_paper_styles",
-          "@org_polymer_prism_element",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_doc_viewer",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "f0e9dfbbcd94d7e88ce82cb61e615406ace63c185fee9396f7f182206ca5cc9a",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-doc-viewer/archive/v1.0.12.tar.gz",
-          "https://github.com/PolymerElements/iron-doc-viewer/archive/v1.0.12.tar.gz",
-      ],
-      strip_prefix = "iron-doc-viewer-1.0.12",
-      path = "/iron-doc-viewer",
-      srcs = [
-          "iron-doc-property-styles.html",
-          "iron-doc-property.html",
-          "iron-doc-viewer-styles.html",
-          "iron-doc-viewer.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_marked_element",
-          "@org_polymer_paper_button",
-          "@org_polymer_paper_styles",
-          "@org_polymer_prism_element",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_dropdown",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "f7e4a31d096d10d8af1920397695cb17f3eb1cbe5e5ff91a861dabfcc085f376",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-dropdown/archive/v1.4.0.tar.gz",
-          "https://github.com/PolymerElements/iron-dropdown/archive/v1.4.0.tar.gz",
-      ],
-      strip_prefix = "iron-dropdown-1.4.0",
-      path = "/iron-dropdown",
-      srcs = [
-          "iron-dropdown.html",
-          "iron-dropdown-scroll-manager.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_a11y_keys_behavior",
-          "@org_polymer_iron_behaviors",
-          "@org_polymer_iron_overlay_behavior",
-          "@org_polymer_iron_resizable_behavior",
-          "@org_polymer_neon_animation",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_fit_behavior",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "10132a2ea309a37c4c07b8fead71f64abc588ee6107931e34680f5f36dd8291e",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-fit-behavior/archive/v1.2.5.tar.gz",
-          "https://github.com/PolymerElements/iron-fit-behavior/archive/v1.2.5.tar.gz",
-      ],
-      strip_prefix = "iron-fit-behavior-1.2.5",
-      path = "/iron-fit-behavior",
-      srcs = ["iron-fit-behavior.html"],
-      deps = ["@org_polymer"],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_flex_layout",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "79287f6ca1c2d4e003f68b88fe19d03a1b6a0011e2b4cae579fe4d1474163a2e",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-flex-layout/archive/v1.3.0.tar.gz",
-          "https://github.com/PolymerElements/iron-flex-layout/archive/v1.3.0.tar.gz",
-      ],
-      strip_prefix = "iron-flex-layout-1.3.0",
-      path = "/iron-flex-layout",
-      srcs = [
-          "classes/iron-flex-layout.html",
-          "classes/iron-shadow-flex-layout.html",
-          "iron-flex-layout.html",
-          "iron-flex-layout-classes.html",
-      ],
-      deps = ["@org_polymer"],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_form_element_behavior",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "1dd9371c638e5bc2ecba8a64074aa680dfb8712198e9612f9ed24d387efc8f26",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-form-element-behavior/archive/v1.0.6.tar.gz",
-          "https://github.com/PolymerElements/iron-form-element-behavior/archive/v1.0.6.tar.gz",
-      ],
-      strip_prefix = "iron-form-element-behavior-1.0.6",
-      path = "/iron-form-element-behavior",
-      srcs = ["iron-form-element-behavior.html"],
-      deps = ["@org_polymer"],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_icon",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "9ed58a69159a02c07a6050d242e6d4e585a29f3245b8c8c390cfd52ddb786dc4",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-icon/archive/v1.0.11.tar.gz",
-          "https://github.com/PolymerElements/iron-icon/archive/v1.0.11.tar.gz",
-      ],
-      strip_prefix = "iron-icon-1.0.11",
-      path = "/iron-icon",
-      srcs = ["iron-icon.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_flex_layout",
-          "@org_polymer_iron_meta",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_icons",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "3b18542c147c7923dc3a36b1a51984a73255d610f297d43c9aaccc52859bd0d0",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-icons/archive/v1.1.3.tar.gz",
-          "https://github.com/PolymerElements/iron-icons/archive/v1.1.3.tar.gz",
-      ],
-      strip_prefix = "iron-icons-1.1.3",
-      path = "/iron-icons",
-      srcs = [
-          "av-icons.html",
-          "communication-icons.html",
-          "device-icons.html",
-          "editor-icons.html",
-          "hardware-icons.html",
-          "image-icons.html",
-          "iron-icons.html",
-          "maps-icons.html",
-          "notification-icons.html",
-          "places-icons.html",
-          "social-icons.html",
-      ],
-      deps = [
-          "@org_polymer_iron_icon",
-          "@org_polymer_iron_iconset_svg",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_iconset_svg",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "7e3925b7e63a7d22524c4b43ce16ab80d06a576649644783643c11a003284368",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-iconset-svg/archive/v1.1.0.tar.gz",
-          "https://github.com/PolymerElements/iron-iconset-svg/archive/v1.1.0.tar.gz",
-      ],
-      strip_prefix = "iron-iconset-svg-1.1.0",
-      path = "/iron-iconset-svg",
-      srcs = ["iron-iconset-svg.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_meta",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_input",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "c505101ead08ab25526b1f49baecc8c28b4221b92a65e7334c783bdc81553c36",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-input/archive/1.0.10.tar.gz",
-          "https://github.com/PolymerElements/iron-input/archive/1.0.10.tar.gz",
-      ],
-      strip_prefix = "iron-input-1.0.10",
-      path = "/iron-input",
-      srcs = ["iron-input.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_a11y_announcer",
-          "@org_polymer_iron_validatable_behavior",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_list",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "72a6530b9f0ad5557f5d287845792a0ada74d8b159198e27f940e226313dc116",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-list/archive/v1.3.9.tar.gz",
-          "https://github.com/PolymerElements/iron-list/archive/v1.3.9.tar.gz",
-      ],
-      strip_prefix = "iron-list-1.3.9",
-      path = "/iron-list",
-      srcs = ["iron-list.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_a11y_keys_behavior",
-          "@org_polymer_iron_resizable_behavior",
-          "@org_polymer_iron_scroll_target_behavior",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_menu_behavior",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "ad27889343bc9a709258b073f69abc028bb1ffd3fdb975cd2d3939f7f5d7bb6c",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-menu-behavior/archive/v1.1.10.tar.gz",
-          "https://github.com/PolymerElements/iron-menu-behavior/archive/v1.1.10.tar.gz",
-      ],
-      strip_prefix = "iron-menu-behavior-1.1.10",
-      path = "/iron-menu-behavior",
-      srcs = [
-          "iron-menu-behavior.html",
-          "iron-menubar-behavior.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_a11y_keys_behavior",
-          "@org_polymer_iron_selector",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_meta",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "fb05e6031bae6b4effe5f15d44b3f548d5807f9e3b3aa2442ba17cf4b8b84361",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-meta/archive/v1.1.1.tar.gz",
-          "https://github.com/PolymerElements/iron-meta/archive/v1.1.1.tar.gz",
-      ],
-      strip_prefix = "iron-meta-1.1.1",
-      path = "/iron-meta",
-      srcs = ["iron-meta.html"],
-      deps = ["@org_polymer"],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_overlay_behavior",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "3df5b54ff2e0510c87a2aff8c9d730d3fe83d3d11277cc1a49fa29b549acb46c",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-overlay-behavior/archive/v1.10.1.tar.gz",
-          "https://github.com/PolymerElements/iron-overlay-behavior/archive/v1.10.1.tar.gz",
-      ],
-      strip_prefix = "iron-overlay-behavior-1.10.1",
-      path = "/iron-overlay-behavior",
-      srcs = [
-          "iron-focusables-helper.html",
-          "iron-overlay-backdrop.html",
-          "iron-overlay-behavior.html",
-          "iron-overlay-manager.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_a11y_keys_behavior",
-          "@org_polymer_iron_fit_behavior",
-          "@org_polymer_iron_resizable_behavior",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_range_behavior",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "b2f2b6d52284542330bd30b586e217926eb0adec5e13934a3cef557717c22dc2",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-range-behavior/archive/v1.0.4.tar.gz",
-          "https://github.com/PolymerElements/iron-range-behavior/archive/v1.0.4.tar.gz",
-      ],
-      strip_prefix = "iron-range-behavior-1.0.4",
-      path = "/iron-range-behavior",
-      srcs = ["iron-range-behavior.html"],
-      deps = ["@org_polymer"],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_resizable_behavior",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "a87a78ee9223c2f6afae7fc94a3ff91cbce6f7e2a7ed3f2979af7945c9281616",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-resizable-behavior/archive/v1.0.3.tar.gz",
-          "https://github.com/PolymerElements/iron-resizable-behavior/archive/v1.0.3.tar.gz",
-      ],
-      strip_prefix = "iron-resizable-behavior-1.0.3",
-      path = "/iron-resizable-behavior",
-      srcs = ["iron-resizable-behavior.html"],
-      deps = ["@org_polymer"],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_scroll_target_behavior",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "d0de0c804b1ec91d814754144afd9da1cdb082690de88bd5e47fd5f41990746f",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-scroll-target-behavior/archive/v1.0.3.tar.gz",
-          "https://github.com/PolymerElements/iron-scroll-target-behavior/archive/v1.0.3.tar.gz",
-      ],
-      strip_prefix = "iron-scroll-target-behavior-1.0.3",
-      path = "/iron-scroll-target-behavior",
-      srcs = ["iron-scroll-target-behavior.html"],
-      deps = ["@org_polymer"],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_selector",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "ba28a47443bad3b744611c9d7a79fb21dbdf2e35edc5ef8f812e2dcd72b16747",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-selector/archive/v1.5.2.tar.gz",
-          "https://github.com/PolymerElements/iron-selector/archive/v1.5.2.tar.gz",
-      ],
-      strip_prefix = "iron-selector-1.5.2",
-      path = "/iron-selector",
-      srcs = [
-          "iron-multi-selectable.html",
-          "iron-selectable.html",
-          "iron-selection.html",
-          "iron-selector.html",
-      ],
-      deps = ["@org_polymer"],
-  )
-  
-  web_library_external(
-      name = "org_polymer_iron_validatable_behavior",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "aef4901e68043824f36104799269573dd345ffaac494186e466fdc79c06fdb63",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/iron-validatable-behavior/archive/v1.1.1.tar.gz",
-          "https://github.com/PolymerElements/iron-validatable-behavior/archive/v1.1.1.tar.gz",
-      ],
-      strip_prefix = "iron-validatable-behavior-1.1.1",
-      path = "/iron-validatable-behavior",
-      srcs = ["iron-validatable-behavior.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_meta",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_marked",
-      licenses = ["notice"],  # MIT
-      sha256 = "93d30bd593736ca440938d77808b7ef5972da0f3fcfe4ae63ae7b4ce117da2cb",
-      urls = [
-          "http://mirror.bazel.build/github.com/chjj/marked/archive/v0.3.2.zip",
-          "https://github.com/chjj/marked/archive/v0.3.2.zip",
-      ],
-      strip_prefix = "marked-0.3.2",
-      path = "/marked",
-      srcs = ["lib/marked.js"],
-  )
-  
-  web_library_external(
-      name = "org_polymer_marked_element",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "7547616df95f8b903757e6afbabfcdba5322c2bcec3f17c726b8bba5adf4bc5f",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/marked-element/archive/v1.1.3.tar.gz",
-          "https://github.com/PolymerElements/marked-element/archive/v1.1.3.tar.gz",
-      ],
-      strip_prefix = "marked-element-1.1.3",
-      path = "/marked-element",
-      srcs = [
-          "marked-element.html",
-          "marked-import.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_marked",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_neon_animation",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "8800c314a76b2da190a2b203259c1091f6d38e0057ed37c2a3d0b734980fa9a5",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/neon-animation/archive/v1.2.2.tar.gz",
-          "https://github.com/PolymerElements/neon-animation/archive/v1.2.2.tar.gz",
-      ],
-      strip_prefix = "neon-animation-1.2.2",
-      path = "/neon-animation",
-      srcs = [
-          "animations/cascaded-animation.html",
-          "animations/fade-in-animation.html",
-          "animations/fade-out-animation.html",
-          "animations/hero-animation.html",
-          "animations/opaque-animation.html",
-          "animations/reverse-ripple-animation.html",
-          "animations/ripple-animation.html",
-          "animations/scale-down-animation.html",
-          "animations/scale-up-animation.html",
-          "animations/slide-down-animation.html",
-          "animations/slide-from-bottom-animation.html",
-          "animations/slide-from-left-animation.html",
-          "animations/slide-from-right-animation.html",
-          "animations/slide-from-top-animation.html",
-          "animations/slide-left-animation.html",
-          "animations/slide-right-animation.html",
-          "animations/slide-up-animation.html",
-          "animations/transform-animation.html",
-          "neon-animatable.html",
-          "neon-animatable-behavior.html",
-          "neon-animated-pages.html",
-          "neon-animation.html",
-          "neon-animation-behavior.html",
-          "neon-animation-runner-behavior.html",
-          "neon-animations.html",
-          "neon-shared-element-animatable-behavior.html",
-          "neon-shared-element-animation-behavior.html",
-          "web-animations.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_meta",
-          "@org_polymer_iron_resizable_behavior",
-          "@org_polymer_iron_selector",
-          "@org_polymer_web_animations_js",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_behaviors",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "7cfcb9082ef9909da262df6b5c120bc62dbeaff278cb563e8fc60465ddd387e5",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-behaviors/archive/v1.0.12.tar.gz",
-          "https://github.com/PolymerElements/paper-behaviors/archive/v1.0.12.tar.gz",
-      ],
-      strip_prefix = "paper-behaviors-1.0.12",
-      path = "/paper-behaviors",
-      srcs = [
-          "paper-button-behavior.html",
-          "paper-checked-element-behavior.html",
-          "paper-inky-focus-behavior.html",
-          "paper-ripple-behavior.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_behaviors",
-          "@org_polymer_iron_checked_element_behavior",
-          "@org_polymer_paper_ripple",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_button",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "896c0a7e34bfcce63fc23c63e105ed9c4d62fa3a6385b7161e1e5cd4058820a6",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-button/archive/v1.0.11.tar.gz",
-          "https://github.com/PolymerElements/paper-button/archive/v1.0.11.tar.gz",
-      ],
-      strip_prefix = "paper-button-1.0.11",
-      path = "/paper-button",
-      srcs = ["paper-button.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_flex_layout",
-          "@org_polymer_paper_behaviors",
-          "@org_polymer_paper_material",
-          "@org_polymer_paper_ripple",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_checkbox",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "6828a6954a048b1230fbd2606faffbae950ba1d042175b96ec50ae355786a166",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-checkbox/archive/v1.4.0.tar.gz",
-          "https://github.com/PolymerElements/paper-checkbox/archive/v1.4.0.tar.gz",
-      ],
-      strip_prefix = "paper-checkbox-1.4.0",
-      path = "/paper-checkbox",
-      srcs = ["paper-checkbox.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_paper_behaviors",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_dialog",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "c6a9709e7f528d03dcd574503c18b72d4751ca30017346d16e6a791d37ed9259",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-dialog/archive/v1.0.4.tar.gz",
-          "https://github.com/PolymerElements/paper-dialog/archive/v1.0.4.tar.gz",
-      ],
-      strip_prefix = "paper-dialog-1.0.4",
-      path = "/paper-dialog",
-      srcs = ["paper-dialog.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_neon_animation",
-          "@org_polymer_paper_dialog_behavior",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_dialog_behavior",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "a7e0e27ce63554bc14f384cf94bcfa24da8dc5f5120dfd565f45e166261aee40",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-dialog-behavior/archive/v1.2.5.tar.gz",
-          "https://github.com/PolymerElements/paper-dialog-behavior/archive/v1.2.5.tar.gz",
-      ],
-      strip_prefix = "paper-dialog-behavior-1.2.5",
-      path = "/paper-dialog-behavior",
-      srcs = [
-          "paper-dialog-behavior.html",
-          "paper-dialog-common.css",
-          "paper-dialog-shared-styles.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_flex_layout",
-          "@org_polymer_iron_overlay_behavior",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_dialog_scrollable",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "a2e69283e7674f782c44d811387a0f8da2d01fac0172743d1add65e253e6b5ff",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-dialog-scrollable/archive/1.1.5.tar.gz",
-          "https://github.com/PolymerElements/paper-dialog-scrollable/archive/1.1.5.tar.gz",
-      ],
-      strip_prefix = "paper-dialog-scrollable-1.1.5",
-      path = "/paper-dialog-scrollable",
-      srcs = ["paper-dialog-scrollable.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_flex_layout",
-          "@org_polymer_paper_dialog_behavior",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_dropdown_menu",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "9d88f654ec03ee9be211df9e69bede9e8a22b51bf1dbcc63b79762e4256d81ad",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-dropdown-menu/archive/v1.4.0.tar.gz",
-          "https://github.com/PolymerElements/paper-dropdown-menu/archive/v1.4.0.tar.gz",
-      ],
-      strip_prefix = "paper-dropdown-menu-1.4.0",
-      path = "/paper-dropdown-menu",
-      srcs = [
-          "paper-dropdown-menu.html",
-          "paper-dropdown-menu-icons.html",
-          "paper-dropdown-menu-light.html",
-          "paper-dropdown-menu-shared-styles.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_a11y_keys_behavior",
-          "@org_polymer_iron_behaviors",
-          "@org_polymer_iron_form_element_behavior",
-          "@org_polymer_iron_icon",
-          "@org_polymer_iron_iconset_svg",
-          "@org_polymer_iron_validatable_behavior",
-          "@org_polymer_paper_behaviors",
-          "@org_polymer_paper_input",
-          "@org_polymer_paper_menu_button",
-          "@org_polymer_paper_ripple",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_header_panel",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "0db4bd8a4bf6f20dcd0dffb4f907b31c93a8647c9c021344239cf30b40b87075",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-header-panel/archive/v1.1.4.tar.gz",
-          "https://github.com/PolymerElements/paper-header-panel/archive/v1.1.4.tar.gz",
-      ],
-      strip_prefix = "paper-header-panel-1.1.4",
-      path = "/paper-header-panel",
-      srcs = ["paper-header-panel.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_flex_layout",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_icon_button",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "9cba5bcfd6aeb4c41581c1392c678cf2278d360e9d122f4d9db54a9ebb404496",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-icon-button/archive/v1.1.3.tar.gz",
-          "https://github.com/PolymerElements/paper-icon-button/archive/v1.1.3.tar.gz",
-      ],
-      strip_prefix = "paper-icon-button-1.1.3",
-      path = "/paper-icon-button",
-      srcs = [
-          "paper-icon-button.html",
-          "paper-icon-button-light.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_icon",
-          "@org_polymer_paper_behaviors",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_input",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "17c3dea9bb1c2026cc61324696c6c774214a0dc37686b91ca214a6af550994db",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-input/archive/v1.1.18.tar.gz",
-          "https://github.com/PolymerElements/paper-input/archive/v1.1.18.tar.gz",
-      ],
-      strip_prefix = "paper-input-1.1.18",
-      path = "/paper-input",
-      srcs = [
-          "paper-input.html",
-          "paper-input-addon-behavior.html",
-          "paper-input-behavior.html",
-          "paper-input-char-counter.html",
-          "paper-input-container.html",
-          "paper-input-error.html",
-          "paper-textarea.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_a11y_keys_behavior",
-          "@org_polymer_iron_autogrow_textarea",
-          "@org_polymer_iron_behaviors",
-          "@org_polymer_iron_flex_layout",
-          "@org_polymer_iron_form_element_behavior",
-          "@org_polymer_iron_input",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_item",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "12ee0dcb61b0d5721c5988571f6974d7b2211e97724f4195893fbcc9058cdac8",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-item/archive/v1.1.4.tar.gz",
-          "https://github.com/PolymerElements/paper-item/archive/v1.1.4.tar.gz",
-      ],
-      strip_prefix = "paper-item-1.1.4",
-      path = "/paper-item",
-      srcs = [
-          "paper-icon-item.html",
-          "paper-item.html",
-          "paper-item-behavior.html",
-          "paper-item-body.html",
-          "paper-item-shared-styles.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_behaviors",
-          "@org_polymer_iron_flex_layout",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_listbox",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "3cb35f4fe9a3f15185a9e91711dba8f27e9291c8cd371ebf1be21b8f1d5f65fb",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-listbox/archive/v1.1.2.tar.gz",
-          "https://github.com/PolymerElements/paper-listbox/archive/v1.1.2.tar.gz",
-      ],
-      strip_prefix = "paper-listbox-1.1.2",
-      path = "/paper-listbox",
-      srcs = ["paper-listbox.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_menu_behavior",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_material",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "09f6c8bd6ddbea2be541dc86306efe41cdfb31bec0b69d35a5dc29772bbc8506",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-material/archive/v1.0.6.tar.gz",
-          "https://github.com/PolymerElements/paper-material/archive/v1.0.6.tar.gz",
-      ],
-      strip_prefix = "paper-material-1.0.6",
-      path = "/paper-material",
-      srcs = [
-          "paper-material.html",
-          "paper-material-shared-styles.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_menu",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "a3cee220926e315f7412236b3628288774694447c0da4428345f36d0f127ba3b",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-menu/archive/v1.2.2.tar.gz",
-          "https://github.com/PolymerElements/paper-menu/archive/v1.2.2.tar.gz",
-      ],
-      strip_prefix = "paper-menu-1.2.2",
-      path = "/paper-menu",
-      srcs = [
-          "paper-menu.html",
-          "paper-menu-shared-styles.html",
-          "paper-submenu.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_behaviors",
-          "@org_polymer_iron_collapse",
-          "@org_polymer_iron_flex_layout",
-          "@org_polymer_iron_menu_behavior",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_menu_button",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "be3290c288a2bd4f9887213db22c75add99cc29ff4d088100c0bc4eb0e57997b",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-menu-button/archive/v1.5.1.tar.gz",
-          "https://github.com/PolymerElements/paper-menu-button/archive/v1.5.1.tar.gz",
-      ],
-      strip_prefix = "paper-menu-button-1.5.1",
-      path = "/paper-menu-button",
-      srcs = [
-          "paper-menu-button.html",
-          "paper-menu-button-animations.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_a11y_keys_behavior",
-          "@org_polymer_iron_behaviors",
-          "@org_polymer_iron_dropdown",
-          "@org_polymer_neon_animation",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_progress",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "2b6776b2f023c1f344feea17ba29b58d879e46f8ed43b7256495054b5183fff6",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-progress/archive/v1.0.9.tar.gz",
-          "https://github.com/PolymerElements/paper-progress/archive/v1.0.9.tar.gz",
-      ],
-      strip_prefix = "paper-progress-1.0.9",
-      path = "/paper-progress",
-      srcs = ["paper-progress.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_flex_layout",
-          "@org_polymer_iron_range_behavior",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_radio_button",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "6e911d0c308aa388136b3af79d1bdcbe5a1f4159cbc79d71efb4ff3b6c0b4e91",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-radio-button/archive/v1.1.2.tar.gz",
-          "https://github.com/PolymerElements/paper-radio-button/archive/v1.1.2.tar.gz",
-      ],
-      strip_prefix = "paper-radio-button-1.1.2",
-      path = "/paper-radio-button",
-      srcs = ["paper-radio-button.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_paper_behaviors",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_radio_group",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "7885ad1f81e9dcc03dcea4139b54a201ff55c18543770cd44f94530046c9e163",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-radio-group/archive/v1.0.9.tar.gz",
-          "https://github.com/PolymerElements/paper-radio-group/archive/v1.0.9.tar.gz",
-      ],
-      strip_prefix = "paper-radio-group-1.0.9",
-      path = "/paper-radio-group",
-      srcs = ["paper-radio-group.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_a11y_keys_behavior",
-          "@org_polymer_iron_selector",
-          "@org_polymer_paper_radio_button",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_ripple",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "ba76bfb1c737260a8a103d3ca97faa1f7c3288c7db9b2519f401b7a782147c09",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-ripple/archive/v1.0.5.tar.gz",
-          "https://github.com/PolymerElements/paper-ripple/archive/v1.0.5.tar.gz",
-      ],
-      strip_prefix = "paper-ripple-1.0.5",
-      path = "/paper-ripple",
-      srcs = ["paper-ripple.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_a11y_keys_behavior",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_slider",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "08e7c541dbf5d2e959208810bfc03188e82ced87e4d30d325172967f67962c3c",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-slider/archive/v1.0.10.tar.gz",
-          "https://github.com/PolymerElements/paper-slider/archive/v1.0.10.tar.gz",
-      ],
-      strip_prefix = "paper-slider-1.0.10",
-      path = "/paper-slider",
-      srcs = ["paper-slider.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_a11y_keys_behavior",
-          "@org_polymer_iron_flex_layout",
-          "@org_polymer_iron_form_element_behavior",
-          "@org_polymer_iron_range_behavior",
-          "@org_polymer_paper_behaviors",
-          "@org_polymer_paper_input",
-          "@org_polymer_paper_progress",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_spinner",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "6a752907fab7899cbeed15b478e7b9299047c15fbf9d1561d6eb4d204bdbd178",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-spinner/archive/v1.1.1.tar.gz",
-          "https://github.com/PolymerElements/paper-spinner/archive/v1.1.1.tar.gz",
-      ],
-      strip_prefix = "paper-spinner-1.1.1",
-      path = "/paper-spinner",
-      srcs = [
-          "paper-spinner.html", "paper-spinner-behavior.html",
-          "paper-spinner-lite.html", "paper-spinner-styles.html"
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_flex_layout",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_styles",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "6d26b0a4c286402098853dc7388f6b22f30dfb7a74e47b34992ac03380144bb2",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-styles/archive/v1.1.4.tar.gz",
-          "https://github.com/PolymerElements/paper-styles/archive/v1.1.4.tar.gz",
-      ],
-      strip_prefix = "paper-styles-1.1.4",
-      path = "/paper-styles",
-      srcs = [
-          "classes/global.html",
-          "classes/shadow.html",
-          "classes/shadow-layout.html",
-          "classes/typography.html",
-          "color.html",
-          "default-theme.html",
-          "demo.css",
-          "demo-pages.html",
-          "paper-styles.html",
-          "paper-styles-classes.html",
-          "shadow.html",
-          "typography.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_font_roboto",
-          "@org_polymer_iron_flex_layout",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_tabs",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "c23b6a5221db35e5b1ed3eb8e8696b952572563e285adaec96aba1e3134db825",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-tabs/archive/v1.7.0.tar.gz",
-          "https://github.com/PolymerElements/paper-tabs/archive/v1.7.0.tar.gz",
-      ],
-      strip_prefix = "paper-tabs-1.7.0",
-      path = "/paper-tabs",
-      srcs = [
-          "paper-tab.html",
-          "paper-tabs.html",
-          "paper-tabs-icons.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_behaviors",
-          "@org_polymer_iron_flex_layout",
-          "@org_polymer_iron_icon",
-          "@org_polymer_iron_iconset_svg",
-          "@org_polymer_iron_menu_behavior",
-          "@org_polymer_iron_resizable_behavior",
-          "@org_polymer_paper_behaviors",
-          "@org_polymer_paper_icon_button",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_toast",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "55f623712ed1f2bae6d6fadc522a2458e083ccd44cc0a907672547e7b10758a9",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-toast/archive/v1.3.0.tar.gz",
-          "https://github.com/PolymerElements/paper-toast/archive/v1.3.0.tar.gz",
-      ],
-      strip_prefix = "paper-toast-1.3.0",
-      path = "/paper-toast",
-      srcs = ["paper-toast.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_a11y_announcer",
-          "@org_polymer_iron_overlay_behavior",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_toggle_button",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "4aa7cf0396fa2994a8bc2ac6e8428f48b07b945bb7c41bd52041ef5827b45de3",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-toggle-button/archive/v1.2.0.tar.gz",
-          "https://github.com/PolymerElements/paper-toggle-button/archive/v1.2.0.tar.gz",
-      ],
-      strip_prefix = "paper-toggle-button-1.2.0",
-      path = "/paper-toggle-button",
-      srcs = ["paper-toggle-button.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_flex_layout",
-          "@org_polymer_paper_behaviors",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_toolbar",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "dbddffc0654d9fb5fb48843087eebe16bf7a134902495a664c96c11bf8a2c63d",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-toolbar/archive/v1.1.4.tar.gz",
-          "https://github.com/PolymerElements/paper-toolbar/archive/v1.1.4.tar.gz",
-      ],
-      strip_prefix = "paper-toolbar-1.1.4",
-      path = "/paper-toolbar",
-      srcs = ["paper-toolbar.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_iron_flex_layout",
-          "@org_polymer_paper_styles",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_paper_tooltip",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "4c6667acf01f73da14c3cbc0aa574bf14280304567987ee0314534328377d2ad",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/paper-tooltip/archive/v1.1.2.tar.gz",
-          "https://github.com/PolymerElements/paper-tooltip/archive/v1.1.2.tar.gz",
-      ],
-      strip_prefix = "paper-tooltip-1.1.2",
-      path = "/paper-tooltip",
-      srcs = ["paper-tooltip.html"],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_neon_animation",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "07a9e62ffb52193da3af09adda2fbac5cc690439978520e2d03e783863f65f91",
-      strip_prefix = "polymer-1.7.0",
-      urls = [
-          "http://mirror.bazel.build/github.com/polymer/polymer/archive/v1.7.0.tar.gz",
-          "https://github.com/polymer/polymer/archive/v1.7.0.tar.gz",
-      ],
-      path = "/polymer",
-      srcs = [
-          "polymer.html",
-          "polymer-micro.html",
-          "polymer-mini.html",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_prism",
-      licenses = ["notice"],  # MIT
-      sha256 = "e06eb54f2a80e6b3cd0bd4d59f900423bcaee53fc03998a056df63740c684683",
-      urls = [
-          "http://mirror.bazel.build/github.com/PrismJS/prism/archive/abee2b7587f1925e57777044270e2a1860810994.tar.gz",
-          "https://github.com/PrismJS/prism/archive/abee2b7587f1925e57777044270e2a1860810994.tar.gz",
-      ],
-      strip_prefix = "prism-abee2b7587f1925e57777044270e2a1860810994",
-      path = "/prism",
-      srcs = [
-          "prism.js",
-          "themes/prism.css",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_prism_element",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "ad70bf9cd5bbdf525d465e1b0658867ab4022193eb9c74087a839044b46312b4",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerElements/prism-element/archive/1.0.4.tar.gz",
-          "https://github.com/PolymerElements/prism-element/archive/1.0.4.tar.gz",
-      ],
-      strip_prefix = "prism-element-1.0.4",
-      path = "/prism-element",
-      srcs = [
-          "prism-highlighter.html",
-          "prism-import.html",
-      ],
-      deps = [
-          "@org_polymer",
-          "@org_polymer_prism",
-      ],
-  )
-  
-  web_library_external(
-      name = "org_polymer_promise_polyfill",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "4495450e5d884c3e16b537b43afead7f84d17c7dc061bcfcbf440eac083e4ef5",
-      strip_prefix = "promise-polyfill-1.0.0",
-      urls = [
-          "http://mirror.bazel.build/github.com/PolymerLabs/promise-polyfill/archive/v1.0.0.tar.gz",
-          "https://github.com/PolymerLabs/promise-polyfill/archive/v1.0.0.tar.gz",
-      ],
-      path = "/promise-polyfill",
-      srcs = [
-          "Promise.js",
-          "Promise-Statics.js",
-          "promise-polyfill.html",
-          "promise-polyfill-lite.html"
-      ],
-      deps = ["@org_polymer"],
-  )
-  
-  web_library_external(
-      name = "org_polymer_web_animations_js",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "f8bd760cbdeba131f6790bd5abe170bcbf7b1755ff58ed16d0b82fa8a7f34a7f",
-      urls = [
-          "http://mirror.bazel.build/github.com/web-animations/web-animations-js/archive/2.2.1.tar.gz",
-          "https://github.com/web-animations/web-animations-js/archive/2.2.1.tar.gz",
-      ],
-      strip_prefix = "web-animations-js-2.2.1",
-      path = "/web-animations-js",
-      srcs = ["web-animations-next-lite.min.js"],
-  )
-  
-  web_library_external(
-      name = "org_polymer_webcomponentsjs",
-      licenses = ["notice"],  # BSD-3-Clause
-      sha256 = "138c43306ee0a6d699ddca9b3c6b0f4982974ea8b7bdad291ea7276c72301df9",
-      urls = [
-          "http://mirror.bazel.build/github.com/webcomponents/webcomponentsjs/archive/v0.7.22.tar.gz",
-          "https://github.com/webcomponents/webcomponentsjs/archive/v0.7.22.tar.gz",
-      ],
-      strip_prefix = "webcomponentsjs-0.7.22",
-      path = "/webcomponentsjs",
-      srcs = [
-          "CustomElements.js",
-          "CustomElements.min.js",
-          "HTMLImports.js",
-          "HTMLImports.min.js",
-          "MutationObserver.js",
-          "MutationObserver.min.js",
-          "ShadowDOM.js",
-          "ShadowDOM.min.js",
-          "webcomponents.js",
-          "webcomponents.min.js",
-          "webcomponents-lite.js",
-          "webcomponents-lite.min.js",
-      ],
-  )
diff --git a/third_party/pprof.BUILD b/third_party/pprof.BUILD
index edd52095949cfdeff5cde3a1c696fe419b01a016..8bd5bacaf12e00101fbabdcd04c40a27d2a900b8 100644
--- a/third_party/pprof.BUILD
+++ b/third_party/pprof.BUILD
@@ -4,15 +4,15 @@ package(
 
 licenses(["notice"])  # MIT
 
-load("@protobuf//:protobuf.bzl", "py_proto_library")
+load("@protobuf_archive//:protobuf.bzl", "py_proto_library")
 
 exports_files(["pprof/LICENSE"])
 
 py_proto_library(
     name = "pprof_proto_py",
     srcs = ["proto/profile.proto"],
-    default_runtime = "@protobuf//:protobuf_python",
-    protoc = "@protobuf//:protoc",
+    default_runtime = "@protobuf_archive//:protobuf_python",
+    protoc = "@protobuf_archive//:protoc",
     srcs_version = "PY2AND3",
-    deps = ["@protobuf//:protobuf_python"],
+    deps = ["@protobuf_archive//:protobuf_python"],
 )
diff --git a/third_party/py/BUILD.tpl b/third_party/py/BUILD.tpl
index 1ee9c071adb2d9f4aec84b92277c5067f153b666..de06ad5f27e7c08aade4a8f51ab60ba52d012b7b 100644
--- a/third_party/py/BUILD.tpl
+++ b/third_party/py/BUILD.tpl
@@ -5,7 +5,17 @@ package(default_visibility = ["//visibility:public"])
 cc_library(
     name = "python_headers",
     hdrs = [":python_include"],
+    data = select({
+        ":windows": [":python_import_lib"],
+        "//conditions:default": [],
+    }),
     includes = ["python_include"],
+    linkopts = select({
+        # TODO(pcloudy): Ideally, this should just go into deps after resolving
+        # https://github.com/bazelbuild/bazel/issues/3237,
+        ":windows": ["$(locations :python_import_lib)"],
+        "//conditions:default": [],
+    }),
 )
 
 cc_library(
@@ -21,5 +31,5 @@ config_setting(
 )
 
 %{PYTHON_INCLUDE_GENRULE}
-
 %{NUMPY_INCLUDE_GENRULE}
+%{PYTHON_IMPORT_LIB_GENRULE}
diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl
index b4a98af7b6e7742ba99829e6b5e7ce13224cb217..bbc07905fc7f92a26d0aebade66a20209dc3e766 100644
--- a/third_party/py/python_configure.bzl
+++ b/third_party/py/python_configure.bzl
@@ -9,10 +9,9 @@
   * `PYTHON_LIB_PATH`: Location of python libraries.
 """
 
-_NUMPY_INCLUDE_PATH = "NUMPY_INCLUDE_PATH"
 _PYTHON_BIN_PATH = "PYTHON_BIN_PATH"
-_PYTHON_INCLUDE_PATH = "PYTHON_INCLUDE_PATH"
 _PYTHON_LIB_PATH = "PYTHON_LIB_PATH"
+_TF_PYTHON_CONFIG_REPO = "TF_PYTHON_CONFIG_REPO"
 
 
 def _tpl(repository_ctx, tpl, substitutions={}, out=None):
@@ -116,11 +115,11 @@ def _genrule(src_dir, genrule_name, command, outs):
       genrule_name + '",\n' +
       '    outs = [\n' +
       outs +
-      '    ],\n' +
+      '\n    ],\n' +
       '    cmd = """\n' +
       command +
-      '    """,\n' +
-      ')\n\n'
+      '\n   """,\n' +
+      ')\n'
   )
 
 
@@ -132,15 +131,20 @@ def _norm_path(path):
   return path
 
 
-def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name):
+def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
+    src_files = [], dest_files = []):
   """Returns a genrule to symlink(or copy if on Windows) a set of files.
+
+  If src_dir is passed, files will be read from the given directory; otherwise
+  we assume files are in src_files and dest_files
   """
-  src_dir = _norm_path(src_dir)
-  dest_dir = _norm_path(dest_dir)
-  files = _read_dir(repository_ctx, src_dir)
-  # Create a list with the src_dir stripped to use for outputs.
-  dest_files = files.replace(src_dir, '').splitlines()
-  src_files = files.splitlines()
+  if src_dir != None:
+    src_dir = _norm_path(src_dir)
+    dest_dir = _norm_path(dest_dir)
+    files = _read_dir(repository_ctx, src_dir)
+    # Create a list with the src_dir stripped to use for outputs.
+    dest_files = files.replace(src_dir, '').splitlines()
+    src_files = files.splitlines()
   command = []
   outs = []
   for i in range(len(dest_files)):
@@ -151,12 +155,27 @@ def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name):
       # On Windows, symlink is not supported, so we just copy all the files.
       cmd = 'cp -f' if _is_windows(repository_ctx) else 'ln -s'
       command.append(cmd + ' "%s" "%s"' % (src_files[i] , dest))
-      outs.append('      "' + dest_dir + dest_files[i] + '",')
+      outs.append('        "' + dest_dir + dest_files[i] + '",')
   genrule = _genrule(src_dir, genrule_name, " && ".join(command),
                      "\n".join(outs))
   return genrule
 
 
+def _get_python_bin(repository_ctx):
+  """Gets the python bin path."""
+  python_bin = _get_env_var(repository_ctx, _PYTHON_BIN_PATH,
+                            None, False)
+  if python_bin != None:
+    return python_bin
+  python_bin_path = repository_ctx.which("python")
+  if python_bin_path != None:
+    return str(python_bin_path)
+  path = _get_env_var(repository_ctx, "PATH")
+  _python_configure_fail("Cannot find python in PATH, please make sure " +
+      "python is installed and add its directory in PATH, or set the " +
+      "environment variable PYTHON_BIN_PATH.\nPATH=%s" % (path))
+
+
 def _get_python_lib(repository_ctx, python_bin):
   """Gets the python lib path."""
   print_lib = ("<<END\n" +
@@ -216,11 +235,23 @@ def _get_python_include(repository_ctx, python_bin):
                      'print(sysconfig.get_python_inc())'],
                     error_msg="Problem getting python include path.",
                     error_details=("Is the Python binary path set up right? " +
-                                   "(See ./configure or BAZEL_BIN_PATH.) " +
+                                   "(See ./configure or PYTHON_BIN_PATH.) " +
                                    "Is distutils installed?"))
   return result.stdout.splitlines()[0]
 
 
+def _get_python_import_lib_name(repository_ctx, python_bin):
+  """Get Python import library name (pythonXY.lib) on Windows."""
+  result = _execute(repository_ctx,
+                    [python_bin, "-c",
+                     'import sys;' +
+                     'print("python" + str(sys.version_info[0]) + str(sys.version_info[1]) + ".lib")'],
+                    error_msg="Problem getting python import library.",
+                    error_details=("Is the Python binary path set up right? " +
+                                   "(See ./configure or PYTHON_BIN_PATH.) "))
+  return result.stdout.splitlines()[0]
+
+
 def _get_numpy_include(repository_ctx, python_bin):
   """Gets the numpy include path."""
   return _execute(repository_ctx,
@@ -234,79 +265,57 @@ def _get_numpy_include(repository_ctx, python_bin):
 
 def _create_local_python_repository(repository_ctx):
   """Creates the repository containing files set up to build with Python."""
-  python_include = None
-  numpy_include = None
-  empty_config = False
-  # If local checks were requested, the python and numpy include will be auto
-  # detected on the host config (using _PYTHON_BIN_PATH).
-  if repository_ctx.attr.local_checks:
-    # TODO(nlopezgi): The default argument here is a workaround until
-    #                 bazelbuild/bazel#3057 is resolved.
-    python_bin = _get_env_var(repository_ctx, _PYTHON_BIN_PATH,
-                              "/usr/bin/python")
-    _check_python_bin(repository_ctx, python_bin)
-    python_lib = _get_env_var(repository_ctx, _PYTHON_LIB_PATH,
+  python_bin = _get_python_bin(repository_ctx)
+  _check_python_bin(repository_ctx, python_bin)
+  python_lib = _get_env_var(repository_ctx, _PYTHON_LIB_PATH,
                               _get_python_lib(repository_ctx, python_bin))
-    _check_python_lib(repository_ctx, python_lib)
-    python_include = _get_python_include(repository_ctx, python_bin)
-    numpy_include = _get_numpy_include(repository_ctx, python_bin) + '/numpy'
-  else:
-    # Otherwise, we assume user provides all paths (via ENV or attrs)
-    python_include = _get_env_var(repository_ctx, _PYTHON_INCLUDE_PATH,
-                                  repository_ctx.attr.python_include)
-    numpy_include = _get_env_var(repository_ctx, _NUMPY_INCLUDE_PATH,
-                                 repository_ctx.attr.numpy_include) + '/numpy'
-  if empty_config:
-    _tpl(repository_ctx, "BUILD", {
-        "%{PYTHON_INCLUDE_GENRULE}": ('filegroup(\n' +
-                                      '    name = "python_include",\n' +
-                                      '    srcs = [],\n' +
-                                      ')\n'),
-        "%{NUMPY_INCLUDE_GENRULE}": ('filegroup(\n' +
-                                      '    name = "numpy_include",\n' +
-                                      '    srcs = [],\n' +
-                                      ')\n'),
-    })
-  else:
-    python_include_rule = _symlink_genrule_for_dir(
-        repository_ctx, python_include, 'python_include', 'python_include')
-    numpy_include_rule = _symlink_genrule_for_dir(
-        repository_ctx, numpy_include, 'numpy_include/numpy', 'numpy_include')
-    _tpl(repository_ctx, "BUILD", {
-        "%{PYTHON_INCLUDE_GENRULE}": python_include_rule,
-        "%{NUMPY_INCLUDE_GENRULE}": numpy_include_rule,
-    })
-
-
-def _create_remote_python_repository(repository_ctx):
+  _check_python_lib(repository_ctx, python_lib)
+  python_include = _get_python_include(repository_ctx, python_bin)
+  numpy_include = _get_numpy_include(repository_ctx, python_bin) + '/numpy'
+  python_include_rule = _symlink_genrule_for_dir(
+      repository_ctx, python_include, 'python_include', 'python_include')
+  python_import_lib_genrule = ""
+  # To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
+  # See https://docs.python.org/3/extending/windows.html
+  if _is_windows(repository_ctx):
+    python_include = _norm_path(python_include)
+    python_import_lib_name = _get_python_import_lib_name(repository_ctx, python_bin)
+    python_import_lib_src = python_include.rsplit('/', 1)[0] + "/libs/" + python_import_lib_name
+    python_import_lib_genrule = _symlink_genrule_for_dir(
+      repository_ctx, None, '', 'python_import_lib',
+      [python_import_lib_src], [python_import_lib_name])
+  numpy_include_rule = _symlink_genrule_for_dir(
+      repository_ctx, numpy_include, 'numpy_include/numpy', 'numpy_include')
+  _tpl(repository_ctx, "BUILD", {
+      "%{PYTHON_INCLUDE_GENRULE}": python_include_rule,
+      "%{PYTHON_IMPORT_LIB_GENRULE}": python_import_lib_genrule,
+      "%{NUMPY_INCLUDE_GENRULE}": numpy_include_rule,
+  })
+
+
+def _create_remote_python_repository(repository_ctx, remote_config_repo):
   """Creates pointers to a remotely configured repo set up to build with Python.
   """
   _tpl(repository_ctx, "remote.BUILD", {
-      "%{REMOTE_PYTHON_REPO}": repository_ctx.attr.remote_config_repo,
+      "%{REMOTE_PYTHON_REPO}": remote_config_repo,
   }, "BUILD")
 
 
 def _python_autoconf_impl(repository_ctx):
   """Implementation of the python_autoconf repository rule."""
-  if repository_ctx.attr.remote_config_repo != "":
-    _create_remote_python_repository(repository_ctx)
+  if _TF_PYTHON_CONFIG_REPO in repository_ctx.os.environ:
+      _create_remote_python_repository(repository_ctx,
+          repository_ctx.os.environ[_TF_PYTHON_CONFIG_REPO])
   else:
     _create_local_python_repository(repository_ctx)
 
 
 python_configure = repository_rule(
     implementation = _python_autoconf_impl,
-    attrs = {
-        "local_checks": attr.bool(mandatory = False, default = True),
-        "python_include": attr.string(mandatory = False),
-        "numpy_include": attr.string(mandatory = False),
-        "remote_config_repo": attr.string(mandatory = False, default =""),
-    },
     environ = [
         _PYTHON_BIN_PATH,
-        _PYTHON_INCLUDE_PATH,
         _PYTHON_LIB_PATH,
-        _NUMPY_INCLUDE_PATH,
+        _TF_PYTHON_CONFIG_REPO,
     ],
 )
 """Detects and configures the local Python.
diff --git a/third_party/python.bzl b/third_party/python.bzl
deleted file mode 100644
index 25c2ae3e780869a49a2d6fbe5612852dcef5b2f4..0000000000000000000000000000000000000000
--- a/third_party/python.bzl
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the 'License');
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# TensorBoard external dependencies that are used on the python side.
-# Protobuf and six were deliberately left in the top-level workspace, as they
-# are used in TensorFlow as well.
-
-def tensorboard_python_workspace():
-  native.new_http_archive(
-      name = "org_pythonhosted_markdown",
-      urls = [
-          "http://mirror.bazel.build/pypi.python.org/packages/1d/25/3f6d2cb31ec42ca5bd3bfbea99b63892b735d76e26f20dd2dcc34ffe4f0d/Markdown-2.6.8.tar.gz",
-          "https://pypi.python.org/packages/1d/25/3f6d2cb31ec42ca5bd3bfbea99b63892b735d76e26f20dd2dcc34ffe4f0d/Markdown-2.6.8.tar.gz",
-      ],
-      strip_prefix = "Markdown-2.6.8",
-      sha256 = "0ac8a81e658167da95d063a9279c9c1b2699f37c7c4153256a458b3a43860e33",
-      build_file = str(Label("//third_party:markdown.BUILD")),
-  )
-
-  native.new_http_archive(
-      name = "org_html5lib",
-      urls = [
-          "http://mirror.bazel.build/github.com/html5lib/html5lib-python/archive/0.9999999.tar.gz",
-          "https://github.com/html5lib/html5lib-python/archive/0.9999999.tar.gz",  # identical to 1.0b8
-      ],
-      sha256 = "184257f98539159a433e2a2197309657ae1283b4c44dbd9c87b2f02ff36adce8",
-      strip_prefix = "html5lib-python-0.9999999",
-      build_file = str(Label("//third_party:html5lib.BUILD")),
-  )
-
-  native.new_http_archive(
-      name = "org_mozilla_bleach",
-      urls = [
-          "http://mirror.bazel.build/github.com/mozilla/bleach/archive/v1.5.tar.gz",
-          "https://github.com/mozilla/bleach/archive/v1.5.tar.gz",
-      ],
-      strip_prefix = "bleach-1.5",
-      sha256 = "0d68713d02ba4148c417ab1637dd819333d96929a34401d0233947bec0881ad8",
-      build_file = str(Label("//third_party:bleach.BUILD")),
-  )
-  
-  native.new_http_archive(
-      name = "org_pocoo_werkzeug",
-      urls = [
-          "http://mirror.bazel.build/pypi.python.org/packages/b7/7f/44d3cfe5a12ba002b253f6985a4477edfa66da53787a2a838a40f6415263/Werkzeug-0.11.10.tar.gz",
-          "https://pypi.python.org/packages/b7/7f/44d3cfe5a12ba002b253f6985a4477edfa66da53787a2a838a40f6415263/Werkzeug-0.11.10.tar.gz",
-      ],
-      strip_prefix = "Werkzeug-0.11.10",
-      sha256 = "cc64dafbacc716cdd42503cf6c44cb5a35576443d82f29f6829e5c49264aeeee",
-      build_file = str(Label("//third_party:werkzeug.BUILD")),
-  )
\ No newline at end of file
diff --git a/third_party/snappy.BUILD b/third_party/snappy.BUILD
index 120028dc52a172d1d4476b42c0db6893a4c9f306..9c00b7068a802a361effab207409138c79addde7 100644
--- a/third_party/snappy.BUILD
+++ b/third_party/snappy.BUILD
@@ -4,6 +4,18 @@ licenses(["notice"])  # BSD 3-Clause
 
 exports_files(["COPYING"])
 
+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "windows_msvc",
+    values = {"cpu": "x64_windows_msvc"},
+    visibility = ["//visibility:public"],
+)
+
 cc_library(
     name = "snappy",
     srcs = [
@@ -19,10 +31,14 @@ cc_library(
         "snappy-stubs-public.h",
     ],
     hdrs = ["snappy.h"],
-    copts = [
-        "-Wno-shift-negative-value",
-        "-Wno-implicit-function-declaration",
-    ],
+    copts = select({
+        ":windows": [],
+        ":windows_msvc": [],
+        "//conditions:default": [
+            "-Wno-shift-negative-value",
+            "-Wno-implicit-function-declaration",
+        ],
+    }),
 )
 
 genrule(
diff --git a/third_party/sycl/crosstool/computecpp.tpl b/third_party/sycl/crosstool/computecpp.tpl
index 94c5e6aaad0cdb6fee56c8cd95656edeae29aa5c..c699eabb6f3b2153c5f73ad30957ffff4f58447e 100755
--- a/third_party/sycl/crosstool/computecpp.tpl
+++ b/third_party/sycl/crosstool/computecpp.tpl
@@ -15,7 +15,7 @@ COMPUTECPP_INCLUDE = COMPUTECPP_ROOT + 'include'
 
 def main():
   remove_flags = ('-Wl,--no-undefined', '-Wno-unused-but-set-variable', '-Wignored-attributes')
-    # remove -fsamotoze-coverage from string with g++
+  # remove -fsanitize-coverage from string with g++
   if 'g++' in CPU_CXX_COMPILER:
     remove_flags += ('-fsanitize-coverage',)
   compiler_flags = [flag for flag in sys.argv[1:] if not flag.startswith(remove_flags)]
@@ -43,7 +43,7 @@ def main():
 
   # create a blacklist of folders that will be skipped when compiling with ComputeCpp
   skip_extensions = [".cu.cc"]
-  skip_folders = ["tensorflow/compiler", "tensorflow/docs_src", "tensorflow/tensorboard", "third_party", "external", "hexagon"]
+  skip_folders = ["tensorflow/compiler", "tensorflow/docs_src", "third_party", "external", "hexagon"]
   skip_folders = [(folder + '/') for folder in skip_folders]
   # if compiling external project skip computecpp
   if any(compiled_file_name.endswith(_ext) for _ext in skip_extensions) or any(_folder in output_file_name for _folder in skip_folders):
@@ -73,13 +73,13 @@ def main():
   bc_out = filename + '.sycl'
 
   # strip asan for the device
-  computecpp_device_compiler_flags = ['-sycl-compress-name', '-Wno-unused-variable',
+  computecpp_device_compiler_flags = ['-sycl-compress-name', '-Wno-unused-variable', '-Wno-c++11-narrowing',
                                       '-I', COMPUTECPP_INCLUDE, '-isystem', COMPUTECPP_INCLUDE,
                                       '-std=c++11', '-sycl', '-emit-llvm', '-no-serial-memop',
                                       '-Xclang', '-cl-denorms-are-zero', '-Xclang', '-cl-fp32-correctly-rounded-divide-sqrt']
   # disable flags enabling SIMD instructions
   computecpp_device_compiler_flags += [flag for flag in compiler_flags if \
-    not any(x in flag.lower() for x in ('-fsanitize', '=native', '=core2', 'msse', 'vectorize', 'mavx', 'mmmx', 'm3dnow', 'fma'))]
+    not any(x in flag.lower() for x in ('-fsanitize', '-fno-canonical-system-headers', '=native', '=core2', 'msse', 'vectorize', 'mavx', 'mmmx', 'm3dnow', 'fma'))]
 
   x = call([COMPUTECPP_DRIVER] + computecpp_device_compiler_flags)
   if x == 0:
diff --git a/third_party/sycl/sycl/BUILD.tpl b/third_party/sycl/sycl/BUILD.tpl
index c66a9f007df05e0ef734fb8f8e76ea4f72fdb6a1..6cad190630062576262c977df6e84168e413bb99 100755
--- a/third_party/sycl/sycl/BUILD.tpl
+++ b/third_party/sycl/sycl/BUILD.tpl
@@ -33,7 +33,7 @@ cc_library(
         sycl_library_path("ComputeCpp")
     ],
     includes = ["include/"],
-    linkstatic = 1,
+    linkstatic = 0,
 )
 
 cc_library(
diff --git a/third_party/toolchains/cpus/BUILD b/third_party/toolchains/cpus/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..45ec9f8c870ee4c0b648cb9d535b3011bf20930c
--- /dev/null
+++ b/third_party/toolchains/cpus/BUILD
@@ -0,0 +1,82 @@
+# A build file to configure cc toolchain for CPU build used with Bazel remote
+# execution service
+# DO NOT EDIT: automatically generated BUILD file
+
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "malloc",
+)
+
+cc_library(
+    name = "stl",
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],
+)
+
+filegroup(
+    name = "cc_wrapper",
+    srcs = ["cc_wrapper.sh"],
+)
+
+# This is the entry point for --crosstool_top.  Toolchains are found
+# by lopping off the name of --crosstool_top and searching for
+# the "${CPU}" entry in the toolchains attribute.
+cc_toolchain_suite(
+    name = "toolchain",
+    toolchains = {
+        "k8|clang": ":cc-compiler-k8",
+        "armeabi-v7a|compiler": ":cc-compiler-armeabi-v7a",
+        "ios_x86_64|compiler": ":cc-compiler-ios_x86_64",
+    },
+)
+
+cc_toolchain(
+    name = "cc-compiler-k8",
+    all_files = ":empty",
+    compiler_files = ":empty",
+    cpu = "k8",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":empty",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+)
+
+# Android tooling requires a default toolchain for the armeabi-v7a cpu.
+cc_toolchain(
+    name = "cc-compiler-armeabi-v7a",
+    all_files = ":empty",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":empty",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+)
+
+# ios crosstool configuration requires a default toolchain for the
+# ios_x86_64 cpu.
+cc_toolchain(
+    name = "cc-compiler-ios_x86_64",
+    all_files = ":empty",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":empty",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+)
diff --git a/third_party/toolchains/cpus/CROSSTOOL b/third_party/toolchains/cpus/CROSSTOOL
new file mode 100644
index 0000000000000000000000000000000000000000..66039c2135be9d79ec85036975c5411e26ea4769
--- /dev/null
+++ b/third_party/toolchains/cpus/CROSSTOOL
@@ -0,0 +1,918 @@
+# A crosstool configuration for CPU build used with Bazel remote
+# execution service
+# DO NOT EDIT: automatically generated file
+
+major_version: "local"
+minor_version: ""
+default_target_cpu: "same_as_host"
+
+default_toolchain {
+  cpu: "k8"
+  toolchain_identifier: "linux_gnu_x86"
+}
+
+default_toolchain {
+  cpu: "armeabi-v7a"
+  toolchain_identifier: "stub_armeabi-v7a"
+}
+
+default_toolchain {
+  cpu: "x64_windows_msvc"
+  toolchain_identifier: "msvc_x64"
+}
+
+default_toolchain {
+  cpu: "x64_windows_msys"
+  toolchain_identifier: "msys_x64"
+}
+
+default_toolchain {
+  cpu: "s390x"
+  toolchain_identifier: "linux_gnu_x86"
+}
+
+default_toolchain {
+  cpu: "ios_x86_64"
+  toolchain_identifier: "ios_x86_64"
+}
+
+# Android tooling requires a default toolchain for the armeabi-v7a cpu.
+toolchain {
+  abi_version: "armeabi-v7a"
+  abi_libc_version: "armeabi-v7a"
+  builtin_sysroot: ""
+  compiler: "compiler"
+  host_system_name: "armeabi-v7a"
+  needsPic: true
+  supports_gold_linker: false
+  supports_incremental_linker: false
+  supports_fission: false
+  supports_interface_shared_objects: false
+  supports_normalizing_ar: false
+  supports_start_end_lib: false
+  target_libc: "armeabi-v7a"
+  target_cpu: "armeabi-v7a"
+  target_system_name: "armeabi-v7a"
+  toolchain_identifier: "stub_armeabi-v7a"
+
+  tool_path { name: "ar" path: "/bin/false" }
+  tool_path { name: "compat-ld" path: "/bin/false" }
+  tool_path { name: "cpp" path: "/bin/false" }
+  tool_path { name: "dwp" path: "/bin/false" }
+  tool_path { name: "gcc" path: "/bin/false" }
+  tool_path { name: "gcov" path: "/bin/false" }
+  tool_path { name: "ld" path: "/bin/false" }
+
+  tool_path { name: "nm" path: "/bin/false" }
+  tool_path { name: "objcopy" path: "/bin/false" }
+  tool_path { name: "objdump" path: "/bin/false" }
+  tool_path { name: "strip" path: "/bin/false" }
+  linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+  toolchain_identifier: "ios_x86_64"
+  host_system_name: "x86_64-apple-macosx"
+  target_system_name: "x86_64-apple-ios"
+  target_cpu: "ios_x86_64"
+  target_libc: "ios"
+  compiler: "compiler"
+  abi_version: "local"
+  abi_libc_version: "local"
+  supports_gold_linker: false
+  supports_incremental_linker: false
+  supports_fission: false
+  supports_interface_shared_objects: false
+  supports_normalizing_ar: false
+  supports_start_end_lib: false
+
+  tool_path { name: "ar" path: "/bin/false" }
+  tool_path { name: "compat-ld" path: "/bin/false" }
+  tool_path { name: "cpp" path: "/bin/false" }
+  tool_path { name: "dwp" path: "/bin/false" }
+  tool_path { name: "gcc" path: "/bin/false" }
+  tool_path { name: "gcov" path: "/bin/false" }
+  tool_path { name: "ld" path: "/bin/false" }
+
+  tool_path { name: "nm" path: "/bin/false" }
+  tool_path { name: "objcopy" path: "/bin/false" }
+  tool_path { name: "objdump" path: "/bin/false" }
+  tool_path { name: "strip" path: "/bin/false" }
+  linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+  toolchain_identifier: "linux_gnu_x86"
+  abi_version: "clang"
+  abi_libc_version: "glibc_2.19"
+  builtin_sysroot: ""
+  compiler: "clang"
+  host_system_name: "i686-unknown-linux-gnu"
+  needsPic: true
+  supports_gold_linker: true
+  supports_incremental_linker: false
+  supports_fission: false
+  supports_interface_shared_objects: false
+  supports_normalizing_ar: false
+  supports_start_end_lib: true
+  target_libc: "glibc_2.19"
+  target_cpu: "k8"
+  target_system_name: "x86_64-unknown-linux-gnu"
+  cxx_flag: "-std=c++0x"
+  linker_flag: "-lstdc++"
+  linker_flag: "-lm"
+  linker_flag: "-fuse-ld=gold"
+  linker_flag: "-B/usr/local/bin"
+  linker_flag: "-B/usr/bin"
+  cxx_builtin_include_directory: "/usr/include/c++/4.9"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/4.9"
+  cxx_builtin_include_directory: "/usr/include/c++/4.9/backward"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/local/lib/clang/5.0.0/include"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  objcopy_embed_flag: "-I"
+  objcopy_embed_flag: "binary"
+  unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
+  unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
+  compiler_flag: "-U_FORTIFY_SOURCE"
+  compiler_flag: "-fstack-protector"
+  compiler_flag: "-Wall"
+  compiler_flag: "-B/usr/local/bin"
+  compiler_flag: "-B/usr/bin"
+  compiler_flag: "-fcolor-diagnostics"
+  compiler_flag: "-fno-omit-frame-pointer"
+  tool_path {name: "ld" path: "/usr/bin/ld" }
+  tool_path {name: "cpp" path: "/usr/bin/cpp" }
+  tool_path {name: "dwp" path: "/usr/bin/dwp" }
+  tool_path {name: "gcov" path: "/usr/bin/gcov" }
+  tool_path {name: "nm" path: "/usr/bin/nm" }
+  tool_path {name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path {name: "objdump" path: "/usr/bin/objdump" }
+  tool_path {name: "strip" path: "/usr/bin/strip" }
+  tool_path {name: "gcc" path: "/usr/local/bin/clang" }
+  tool_path {name: "ar" path: "/usr/bin/ar" }
+
+  compilation_mode_flags {
+    mode: DBG
+    compiler_flag: "-g"
+  }
+  compilation_mode_flags {
+    mode: OPT
+    compiler_flag: "-g0"
+    compiler_flag: "-O2"
+    compiler_flag: "-D_FORTIFY_SOURCE=1"
+    compiler_flag: "-DNDEBUG"
+    compiler_flag: "-ffunction-sections"
+    compiler_flag: "-fdata-sections"
+    linker_flag: "-Wl,--gc-sections"
+  }
+  linking_mode_flags { mode: DYNAMIC }
+
+
+    feature {
+      name: 'coverage'
+      provides: 'profile'
+      flag_set {
+        action: 'preprocess-assemble'
+        action: 'c-compile'
+        action: 'c++-compile'
+        action: 'c++-header-parsing'
+        action: 'c++-header-preprocessing'
+        action: 'c++-module-compile'
+        flag_group {
+          flag: '-fprofile-arcs'
+          flag: '-ftest-coverage'
+      }
+
+
+
+      }
+      flag_set {
+        action: 'c++-link-interface-dynamic-library'
+        action: 'c++-link-dynamic-library'
+        action: 'c++-link-executable'
+        flag_group {
+          flag: '-lgcov'
+      }
+      }
+    }
+}
+
+toolchain {
+  toolchain_identifier: "msvc_x64"
+  host_system_name: "local"
+  target_system_name: "local"
+
+  abi_version: "local"
+  abi_libc_version: "local"
+  target_cpu: "x64_windows"
+  compiler: "cl"
+  target_libc: "msvcrt140"
+  default_python_version: "python2.7"
+
+
+
+  tool_path {
+    name: "ar"
+    path: ""
+  }
+  tool_path {
+    name: "cpp"
+    path: ""
+  }
+  tool_path {
+    name: "gcc"
+    path: ""
+  }
+  tool_path {
+    name: "gcov"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "ld"
+    path: ""
+  }
+  tool_path {
+    name: "nm"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objcopy"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objdump"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "strip"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  supports_gold_linker: false
+  supports_start_end_lib: false
+  supports_interface_shared_objects: false
+  supports_incremental_linker: false
+  supports_normalizing_ar: true
+  needsPic: false
+
+  # TODO(pcloudy): Review those flags below, they should be defined by cl.exe
+  compiler_flag: "/DOS_WINDOWS=OS_WINDOWS"
+  compiler_flag: "/DCOMPILER_MSVC"
+
+  # Don't pollute with GDI macros in windows.h.
+  compiler_flag: "/DNOGDI"
+  # Don't define min/max macros in windows.h.
+  compiler_flag: "/DNOMINMAX"
+  compiler_flag: "/DPRAGMA_SUPPORTED"
+  # Platform defines.
+  compiler_flag: "/D_WIN32_WINNT=0x0600"
+  # Turn off warning messages.
+  compiler_flag: "/D_CRT_SECURE_NO_DEPRECATE"
+  compiler_flag: "/D_CRT_SECURE_NO_WARNINGS"
+  compiler_flag: "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS"
+  # Use math constants (M_PI, etc.) from the math library
+  compiler_flag: "/D_USE_MATH_DEFINES"
+
+  # Useful options to have on for compilation.
+  # Increase the capacity of object files to 2^32 sections.
+  compiler_flag: "/bigobj"
+  # Allocate 500MB for precomputed headers.
+  compiler_flag: "/Zm500"
+  # Use unsigned char by default.
+  compiler_flag: "/J"
+  # Use function level linking.
+  compiler_flag: "/Gy"
+  # Use string pooling.
+  compiler_flag: "/GF"
+  # Catch both asynchronous (structured) and synchronous (C++) exceptions.
+  compiler_flag: "/EHsc"
+
+  # Globally disabled warnings.
+  # Don't warn about elements of array being be default initialized.
+  compiler_flag: "/wd4351"
+  # Don't warn about no matching delete found.
+  compiler_flag: "/wd4291"
+  # Don't warn about diamond inheritance patterns.
+  compiler_flag: "/wd4250"
+  # Don't warn about insecure functions (e.g. non _s functions).
+  compiler_flag: "/wd4996"
+
+  linker_flag: "/MACHINE:X64"
+
+  linker_flag: "/SUBSYSTEM:CONSOLE"
+
+  feature {
+    name: "no_legacy_features"
+  }
+
+  # Suppress startup banner.
+  feature {
+    name: "nologo"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-static-library"
+      action: "c++-link-alwayslink-static-library"
+      action: "c++-link-pic-static-library"
+      action: "c++-link-alwayslink-pic-static-library"
+      flag_group {
+        flag: "/nologo"
+      }
+    }
+  }
+
+  feature {
+    name: "msvc_env"
+    env_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-static-library"
+      action: "c++-link-alwayslink-static-library"
+      action: "c++-link-pic-static-library"
+      action: "c++-link-alwayslink-pic-static-library"
+      env_entry {
+        key: "PATH"
+        value: ""
+      }
+      env_entry {
+        key: "INCLUDE"
+        value: ""
+      }
+      env_entry {
+        key: "LIB"
+        value: ""
+      }
+      env_entry {
+        key: "TMP"
+        value: ""
+      }
+    }
+  }
+
+  feature {
+    name: "use_linker"
+    env_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      env_entry {
+        key: "USE_LINKER"
+        value: "1"
+      }
+    }
+  }
+
+  feature {
+    name: 'include_paths'
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-header-preprocessing'
+      action: 'c++-module-compile'
+      flag_group {
+        iterate_over: 'quote_include_paths'
+        flag: '/I%{quote_include_paths}'
+      }
+      flag_group {
+        iterate_over: 'include_paths'
+        flag: '/I%{include_paths}'
+      }
+      flag_group {
+        iterate_over: 'system_include_paths'
+        flag: '/I%{system_include_paths}'
+      }
+    }
+  }
+
+  # Stop adding any flag for dotD file, Bazel knows how to parse the output of /showIncludes option
+  # TODO(bazel-team): Remove this empty feature. https://github.com/bazelbuild/bazel/issues/2868
+  feature {
+    name: 'dependency_file'
+  }
+
+  # Tell Bazel to parse the output of /showIncludes
+  feature {
+    name: 'parse_showincludes'
+    flag_set {
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-module-compile'
+      action: 'c++-header-preprocessing'
+      action: 'c++-header-parsing'
+      flag_group {
+        flag: "/showIncludes"
+      }
+    }
+  }
+
+  # Stop passing -frandom-seed option
+  feature {
+    name: 'random_seed'
+  }
+
+  # This feature is just for enabling flag_set in action_config for -c and -o options during the transitional period
+  feature {
+    name: 'compile_action_flags_in_flag_set'
+  }
+
+  action_config {
+    config_name: 'c-compile'
+    action_name: 'c-compile'
+    tool {
+      tool_path: ''
+    }
+    flag_set {
+      flag_group {
+        flag: '/c'
+        flag: '%{source_file}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'output_object_file'
+      flag_group {
+        flag: '/Fo%{output_object_file}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'output_assembly_file'
+      flag_group {
+        flag: '/Fa%{output_assembly_file}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'output_preprocess_file'
+      flag_group {
+        flag: '/P'
+        flag: '/Fi%{output_preprocess_file}'
+      }
+    }
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+  }
+
+  action_config {
+    config_name: 'c++-compile'
+    action_name: 'c++-compile'
+    tool {
+      tool_path: ''
+    }
+    flag_set {
+      flag_group {
+        flag: '/c'
+        flag: '%{source_file}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'output_object_file'
+      flag_group {
+        flag: '/Fo%{output_object_file}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'output_assembly_file'
+      flag_group {
+        flag: '/Fa%{output_assembly_file}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'output_preprocess_file'
+      flag_group {
+        flag: '/P'
+        flag: '/Fi%{output_preprocess_file}'
+      }
+    }
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+  }
+
+  action_config {
+    config_name: 'c++-link-executable'
+    action_name: 'c++-link-executable'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'use_linker'
+  }
+
+  action_config {
+    config_name: 'c++-link-dynamic-library'
+    action_name: 'c++-link-dynamic-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'shared_flag'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'has_configured_linker_path'
+    implies: 'legacy_link_flags'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'use_linker'
+  }
+
+  action_config {
+    config_name: 'c++-link-static-library'
+    action_name: 'c++-link-static-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'archiver_flags'
+    implies: 'input_param_flags'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+  }
+
+  action_config {
+    config_name: 'c++-link-alwayslink-static-library'
+    action_name: 'c++-link-alwayslink-static-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'archiver_flags'
+    implies: 'input_param_flags'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+  }
+
+  # TODO(pcloudy): The following action_config is listed in MANDATORY_LINK_TARGET_TYPES.
+  # But do we really need them on Windows?
+  action_config {
+    config_name: 'c++-link-pic-static-library'
+    action_name: 'c++-link-pic-static-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'archiver_flags'
+    implies: 'input_param_flags'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+  }
+
+  action_config {
+    config_name: 'c++-link-alwayslink-pic-static-library'
+    action_name: 'c++-link-alwayslink-pic-static-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'archiver_flags'
+    implies: 'input_param_flags'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+  }
+
+  action_config {
+    config_name: 'c++-link-interface-dynamic-library'
+    action_name: 'c++-link-interface-dynamic-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+  }
+
+  feature {
+    name: 'generate_pdb_file'
+    requires: {
+      feature: 'dbg'
+    }
+    requires: {
+      feature: 'fastbuild'
+    }
+  }
+
+  feature {
+    name: 'has_configured_linker_path'
+  }
+
+  feature {
+    name: 'shared_flag'
+    flag_set {
+      action: 'c++-link-dynamic-library'
+      flag_group {
+        flag: '/DLL'
+      }
+    }
+  }
+
+  feature {
+    name: 'linkstamps'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      expand_if_all_available: 'linkstamp_paths'
+      flag_group {
+        iterate_over: 'linkstamp_paths'
+        flag: '%{linkstamp_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: 'output_execpath_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'archiver_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-static-library'
+      action: 'c++-link-alwayslink-static-library'
+      action: 'c++-link-pic-static-library'
+      action: 'c++-link-alwayslink-pic-static-library'
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'input_param_flags'
+    flag_set {
+      expand_if_all_available: 'library_search_directories'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      flag_group {
+        iterate_over: 'library_search_directories'
+        flag: "-L%{library_search_directories}"
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libopts'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      flag_group {
+        iterate_over: 'libopts'
+        flag: '%{libopts}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libraries_to_link'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: 'c++-link-static-library'
+      action: 'c++-link-alwayslink-static-library'
+      action: 'c++-link-pic-static-library'
+      action: 'c++-link-alwayslink-pic-static-library'
+      flag_group {
+        iterate_over: 'libraries_to_link'
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file_group'
+          }
+          iterate_over: 'libraries_to_link.object_files'
+          flag_group {
+            flag: '%{libraries_to_link.object_files}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'interface_library'
+          }
+          flag_group {
+            expand_if_false: 'libraries_to_link.is_whole_archive'
+            flag: '%{libraries_to_link.name}'
+          }
+          flag_group {
+            expand_if_true: 'libraries_to_link.is_whole_archive'
+            flag: '/WHOLEARCHIVE:%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'static_library'
+          }
+          flag_group {
+            expand_if_false: 'libraries_to_link.is_whole_archive'
+            flag: '%{libraries_to_link.name}'
+          }
+          flag_group {
+            expand_if_true: 'libraries_to_link.is_whole_archive'
+            flag: '/WHOLEARCHIVE:%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'dynamic_library'
+          }
+          flag_group {
+            expand_if_false: 'libraries_to_link.is_whole_archive'
+            flag: '%{libraries_to_link.name}'
+          }
+          flag_group {
+            expand_if_true: 'libraries_to_link.is_whole_archive'
+            flag: '/WHOLEARCHIVE:%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'versioned_dynamic_library'
+          }
+          flag_group {
+            expand_if_false: 'libraries_to_link.is_whole_archive'
+            flag: '%{libraries_to_link.name}'
+          }
+          flag_group {
+            expand_if_true: 'libraries_to_link.is_whole_archive'
+            flag: '/WHOLEARCHIVE:%{libraries_to_link.name}'
+          }
+        }
+      }
+    }
+  }
+
+  feature {
+    name: 'legacy_link_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      flag_group {
+        iterate_over: 'legacy_link_flags'
+        flag: '%{legacy_link_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'linker_param_file'
+    flag_set {
+      expand_if_all_available: 'linker_param_file'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: 'c++-link-static-library'
+      action: 'c++-link-alwayslink-static-library'
+      action: 'c++-link-pic-static-library'
+      action: 'c++-link-alwayslink-pic-static-library'
+      flag_group {
+        flag: '@%{linker_param_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'link_crt_library'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        # The flag is filled by cc_configure.
+        # The default option is /MT, set USE_DYNAMIC_CRT=1 to change it to /MD
+        flag: ""
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      flag_group {
+      # The flag is filled by cc_configure.
+        # The default value is libcmt.lib, set USE_DYNAMIC_CRT=1 to change it to msvcrt.lib
+        flag: "/DEFAULTLIB:"
+      }
+    }
+  }
+
+  feature {
+    name: 'link_crt_debug_library'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        # The flag is filled by cc_configure.
+        # The default option is /MTd, set USE_DYNAMIC_CRT=1 to change it to /MDd
+        flag: ""
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      flag_group {
+        # The flag is filled by cc_configure.
+        # The default value is libcmtd.lib, set USE_DYNAMIC_CRT=1 to change it to msvcrtd.lib
+        flag: "/DEFAULTLIB:"
+      }
+    }
+  }
+
+  feature {
+    name: 'dbg'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+        # This will signal the wrapper that we are doing a debug build, which sets
+        # some internal state of the toolchain wrapper. It is intentionally a "-"
+        # flag to make this very obvious.
+        flag: "-g"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      flag_group {
+        flag: "/DEBUG:FULL"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'link_crt_debug_library'
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'fastbuild'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      flag_group {
+        flag: "/DEBUG:FASTLINK"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'link_crt_library'
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'opt'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/O2"
+      }
+    }
+    implies: 'link_crt_library'
+  }
+
+
+
+}
diff --git a/third_party/toolchains/cpus/arm/BUILD b/third_party/toolchains/cpus/arm/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..10c7867c233e9ffb865101eef081a38a493fd4d6
--- /dev/null
+++ b/third_party/toolchains/cpus/arm/BUILD
@@ -0,0 +1,49 @@
+package(default_visibility = ["//visibility:public"])
+
+cc_toolchain_suite(
+    name = "toolchain",
+    toolchains = {
+        "armeabi|compiler": ":cc-compiler-armeabi",
+        "local|compiler": ":cc-compiler-local",
+    },
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],
+)
+
+filegroup(
+    name = "arm_linux_all_files",
+    srcs = [
+        "@arm_compiler//:compiler_pieces",
+    ],
+)
+
+cc_toolchain(
+    name = "cc-compiler-local",
+    all_files = ":empty",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":empty",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+)
+
+cc_toolchain(
+    name = "cc-compiler-armeabi",
+    all_files = ":arm_linux_all_files",
+    compiler_files = ":arm_linux_all_files",
+    cpu = "armeabi",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":arm_linux_all_files",
+    objcopy_files = "arm_linux_all_files",
+    static_runtime_libs = [":empty"],
+    strip_files = "arm_linux_all_files",
+    supports_param_files = 1,
+)
diff --git a/third_party/toolchains/cpus/arm/CROSSTOOL.tpl b/third_party/toolchains/cpus/arm/CROSSTOOL.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..6753476c153b9b624436ed0af436ed0841864632
--- /dev/null
+++ b/third_party/toolchains/cpus/arm/CROSSTOOL.tpl
@@ -0,0 +1,874 @@
+major_version: "local"
+minor_version: ""
+default_target_cpu: "same_as_host"
+
+default_toolchain {
+  cpu: "k8"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "piii"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "darwin"
+  toolchain_identifier: "local_darwin"
+}
+default_toolchain {
+  cpu: "freebsd"
+  toolchain_identifier: "local_freebsd"
+}
+default_toolchain {
+  cpu: "armeabi"
+  toolchain_identifier: "arm-linux-gnueabihf"
+}
+default_toolchain {
+  cpu: "arm"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "x64_windows"
+  toolchain_identifier: "local_windows_msys64"
+}
+default_toolchain {
+  cpu: "x64_windows_msvc"
+  toolchain_identifier: "vc_14_0_x64"
+}
+default_toolchain {
+  cpu: "s390x"
+  toolchain_identifier: "local_linux"
+}
+
+toolchain {
+  abi_version: "armeabi"
+  abi_libc_version: "armeabi"
+  builtin_sysroot: ""
+  compiler: "compiler"
+  host_system_name: "armeabi"
+  needsPic: true
+  supports_gold_linker: false
+  supports_incremental_linker: false
+  supports_fission: false
+  supports_interface_shared_objects: false
+  supports_normalizing_ar: false
+  supports_start_end_lib: false
+  target_libc: "armeabi"
+  target_cpu: "armeabi"
+  target_system_name: "armeabi"
+  toolchain_identifier: "arm-linux-gnueabihf"
+
+  tool_path { name: "ar" path: "%{ARM_COMPILER_PATH}%/bin/arm-linux-gnueabihf-ar" }
+  tool_path { name: "compat-ld" path: "/bin/false" }
+  tool_path { name: "cpp" path: "%{ARM_COMPILER_PATH}%/bin/arm-linux-gnueabihf-cpp" }
+  tool_path { name: "dwp" path: "%{ARM_COMPILER_PATH}%/bin/arm-linux-gnueabihf-dwp" }
+  tool_path { name: "gcc" path: "%{ARM_COMPILER_PATH}%/bin/arm-linux-gnueabihf-gcc" }
+  tool_path { name: "gcov" path: "%{ARM_COMPILER_PATH}%/bin/arm-linux-gnueabihf-gcov" }
+  tool_path { name: "ld" path: "%{ARM_COMPILER_PATH}%/bin/arm-linux-gnueabihf-ld" }
+
+  tool_path { name: "nm" path: "%{ARM_COMPILER_PATH}%/bin/arm-linux-gnueabihf-nm" }
+  tool_path { name: "objcopy" path: "%{ARM_COMPILER_PATH}%/bin/arm-linux-gnueabihf-objcopy" }
+  tool_path { name: "objdump" path: "%{ARM_COMPILER_PATH}%/bin/arm-linux-gnueabihf-objdump" }
+  tool_path { name: "strip" path: "%{ARM_COMPILER_PATH}%/bin/arm-linux-gnueabihf-strip" }
+
+  cxx_builtin_include_directory: "%{ARM_COMPILER_PATH}%/arm-linux-gnueabihf/include/c++/4.9.3/"
+  cxx_builtin_include_directory: "%{ARM_COMPILER_PATH}%/arm-linux-gnueabihf/sysroot/usr/include/"
+  cxx_builtin_include_directory: "%{ARM_COMPILER_PATH}%/arm-linux-gnueabihf/libc/usr/include/"
+  cxx_builtin_include_directory: "%{ARM_COMPILER_PATH}%/lib/gcc/arm-linux-gnueabihf/4.9.3/include"
+  cxx_builtin_include_directory: "%{ARM_COMPILER_PATH}%/lib/gcc/arm-linux-gnueabihf/4.9.3/include-fixed"
+  cxx_builtin_include_directory: "%{ARM_COMPILER_PATH}%/local_include"
+  cxx_builtin_include_directory: "/usr/include"
+
+  cxx_flag: "-std=c++11"	
+  # The cxx_builtin_include_directory directives don't seem to be adding these, so
+  # explicitly set them as flags. There's a query to the Bazel team outstanding about
+  # why this is necessary.
+  cxx_flag: "-isystem"
+  cxx_flag: "/usr/include/arm-linux-gnueabihf"
+  cxx_flag: "-isystem"
+  cxx_flag: "/usr/include/python2.7"
+  cxx_flag: "-isystem"
+  cxx_flag: "/usr/include/"
+  linker_flag: "-lstdc++"
+
+  unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
+  unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
+
+  unfiltered_cxx_flag: "-no-canonical-prefixes"
+  unfiltered_cxx_flag: "-fno-canonical-system-headers"
+
+  compiler_flag: "-U_FORTIFY_SOURCE"
+  compiler_flag: "-D_FORTIFY_SOURCE=1"
+  compiler_flag: "-fstack-protector"
+  compiler_flag: "-DRASPBERRY_PI"  # To differentiate from mobile builds.
+  linker_flag: "-Wl,-z,relro,-z,now"
+
+  linker_flag: "-no-canonical-prefixes"
+  linker_flag: "-pass-exit-codes"
+
+  linker_flag: "-Wl,--build-id=md5"
+  linker_flag: "-Wl,--hash-style=gnu"
+
+  compilation_mode_flags {
+    mode: DBG
+    # Enable debug symbols.
+    compiler_flag: "-g"
+  }
+  compilation_mode_flags {
+    mode: OPT
+
+    # No debug symbols.
+    # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt or
+    # even generally? However, that can't happen here, as it requires special
+    # handling in Bazel.
+    compiler_flag: "-g0"
+
+    # Conservative choice for -O
+    # -O3 can increase binary size and even slow down the resulting binaries.
+    # Profile first and / or use FDO if you need better performance than this.
+    compiler_flag: "-O2"
+
+    # Disable assertions
+    compiler_flag: "-DNDEBUG"
+
+    # Removal of unused code and data at link time (can this increase binary size in some cases?).
+    compiler_flag: "-ffunction-sections"
+    compiler_flag: "-fdata-sections"
+    linker_flag: "-Wl,--gc-sections"
+  }
+  linking_mode_flags { mode: DYNAMIC }
+
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  builtin_sysroot: ""
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  supports_gold_linker: false
+  supports_incremental_linker: false
+  supports_fission: false
+  supports_interface_shared_objects: false
+  supports_normalizing_ar: false
+  supports_start_end_lib: false
+  target_libc: "local"
+  target_cpu: "local"
+  target_system_name: "local"
+  toolchain_identifier: "local_linux"
+
+  tool_path { name: "ar" path: "/usr/bin/ar" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcc" path: "/usr/bin/gcc" }
+  cxx_flag: "-std=c++0x"
+  linker_flag: "-lstdc++"
+  linker_flag: "-B/usr/bin/"
+
+  # TODO(bazel-team): In theory, the path here ought to exactly match the path
+  # used by gcc. That works because bazel currently doesn't track files at
+  # absolute locations and has no remote execution, yet. However, this will need
+  # to be fixed, maybe with auto-detection?
+  cxx_builtin_include_directory: "/usr/lib/gcc/"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/include"
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+
+  # C(++) compiles invoke the compiler (as that is the one knowing where
+  # to find libraries), but we provide LD so other rules can invoke the linker.
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  objcopy_embed_flag: "-I"
+  objcopy_embed_flag: "binary"
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Anticipated future default.
+  unfiltered_cxx_flag: "-no-canonical-prefixes"
+  unfiltered_cxx_flag: "-fno-canonical-system-headers"
+
+  # Make C++ compilation deterministic. Use linkstamping instead of these
+  # compiler symbols.
+  unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
+  unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
+
+  # Security hardening on by default.
+  # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+  # We need to undef it before redefining it as some distributions now have
+  # it enabled by default.
+  compiler_flag: "-U_FORTIFY_SOURCE"
+  compiler_flag: "-D_FORTIFY_SOURCE=1"
+  compiler_flag: "-fstack-protector"
+  linker_flag: "-Wl,-z,relro,-z,now"
+
+  # Enable coloring even if there's no attached terminal. Bazel removes the
+  # escape sequences if --nocolor is specified. This isn't supported by gcc
+  # on Ubuntu 14.04.
+  # compiler_flag: "-fcolor-diagnostics"
+
+  # All warnings are enabled. Maybe enable -Werror as well?
+  compiler_flag: "-Wall"
+  # Enable a few more warnings that aren't part of -Wall.
+  compiler_flag: "-Wunused-but-set-parameter"
+  # But disable some that are problematic.
+  compiler_flag: "-Wno-free-nonheap-object" # has false positives
+
+  # Keep stack frames for debugging, even in opt mode.
+  compiler_flag: "-fno-omit-frame-pointer"
+
+  # Anticipated future default.
+  linker_flag: "-no-canonical-prefixes"
+  # Have gcc return the exit code from ld.
+  linker_flag: "-pass-exit-codes"
+  # Stamp the binary with a unique identifier.
+  linker_flag: "-Wl,--build-id=md5"
+  linker_flag: "-Wl,--hash-style=gnu"
+  # Gold linker only? Can we enable this by default?
+  # linker_flag: "-Wl,--warn-execstack"
+  # linker_flag: "-Wl,--detect-odr-violations"
+
+  compilation_mode_flags {
+    mode: DBG
+    # Enable debug symbols.
+    compiler_flag: "-g"
+  }
+  compilation_mode_flags {
+    mode: OPT
+
+    # No debug symbols.
+    # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt or
+    # even generally? However, that can't happen here, as it requires special
+    # handling in Bazel.
+    compiler_flag: "-g0"
+
+    # Conservative choice for -O
+    # -O3 can increase binary size and even slow down the resulting binaries.
+    # Profile first and / or use FDO if you need better performance than this.
+    compiler_flag: "-O2"
+
+    # Disable assertions
+    compiler_flag: "-DNDEBUG"
+
+    # Removal of unused code and data at link time (can this increase binary size in some cases?).
+    compiler_flag: "-ffunction-sections"
+    compiler_flag: "-fdata-sections"
+    linker_flag: "-Wl,--gc-sections"
+  }
+  linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  builtin_sysroot: ""
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  target_libc: "macosx"
+  target_cpu: "darwin"
+  target_system_name: "local"
+  toolchain_identifier: "local_darwin"
+
+  tool_path { name: "ar" path: "/usr/bin/libtool" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcc" path: "osx_cc_wrapper.sh" }
+  cxx_flag: "-std=c++0x"
+  ar_flag: "-static"
+  ar_flag: "-s"
+  ar_flag: "-o"
+  linker_flag: "-lstdc++"
+  linker_flag: "-undefined"
+  linker_flag: "dynamic_lookup"
+  linker_flag: "-headerpad_max_install_names"
+  # TODO(ulfjack): This is wrong on so many levels. Figure out a way to auto-detect the proper
+  # setting from the local compiler, and also how to make incremental builds correct.
+  cxx_builtin_include_directory: "/"
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  objcopy_embed_flag: "-I"
+  objcopy_embed_flag: "binary"
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Anticipated future default.
+  unfiltered_cxx_flag: "-no-canonical-prefixes"
+
+  # Make C++ compilation deterministic. Use linkstamping instead of these
+  # compiler symbols.
+  unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
+  unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
+
+  # Security hardening on by default.
+  # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+  compiler_flag: "-D_FORTIFY_SOURCE=1"
+  compiler_flag: "-fstack-protector"
+
+  # Enable coloring even if there's no attached terminal. Bazel removes the
+  # escape sequences if --nocolor is specified.
+  compiler_flag: "-fcolor-diagnostics"
+
+  # All warnings are enabled. Maybe enable -Werror as well?
+  compiler_flag: "-Wall"
+  # Enable a few more warnings that aren't part of -Wall.
+  compiler_flag: "-Wthread-safety"
+  compiler_flag: "-Wself-assign"
+
+  # Keep stack frames for debugging, even in opt mode.
+  compiler_flag: "-fno-omit-frame-pointer"
+
+  # Anticipated future default.
+  linker_flag: "-no-canonical-prefixes"
+
+  compilation_mode_flags {
+    mode: DBG
+    # Enable debug symbols.
+    compiler_flag: "-g"
+  }
+  compilation_mode_flags {
+    mode: OPT
+    # No debug symbols.
+    # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt or even generally?
+    # However, that can't happen here, as it requires special handling in Bazel.
+    compiler_flag: "-g0"
+
+    # Conservative choice for -O
+    # -O3 can increase binary size and even slow down the resulting binaries.
+    # Profile first and / or use FDO if you need better performance than this.
+    compiler_flag: "-O2"
+
+    # Disable assertions
+    compiler_flag: "-DNDEBUG"
+
+    # Removal of unused code and data at link time (can this increase binary size in some cases?).
+    compiler_flag: "-ffunction-sections"
+    compiler_flag: "-fdata-sections"
+  }
+  linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  builtin_sysroot: ""
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  supports_gold_linker: false
+  supports_incremental_linker: false
+  supports_fission: false
+  supports_interface_shared_objects: false
+  supports_normalizing_ar: false
+  supports_start_end_lib: false
+  target_libc: "local"
+  target_cpu: "freebsd"
+  target_system_name: "local"
+  toolchain_identifier: "local_freebsd"
+
+  tool_path { name: "ar" path: "/usr/bin/ar" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcc" path: "/usr/bin/clang" }
+  cxx_flag: "-std=c++0x"
+  linker_flag: "-lstdc++"
+  linker_flag: "-B/usr/bin/"
+
+  # TODO(bazel-team): In theory, the path here ought to exactly match the path
+  # used by gcc. That works because bazel currently doesn't track files at
+  # absolute locations and has no remote execution, yet. However, this will need
+  # to be fixed, maybe with auto-detection?
+  cxx_builtin_include_directory: "/usr/lib/clang"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/include"
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+
+  # C(++) compiles invoke the compiler (as that is the one knowing where
+  # to find libraries), but we provide LD so other rules can invoke the linker.
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  objcopy_embed_flag: "-I"
+  objcopy_embed_flag: "binary"
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Anticipated future default.
+  unfiltered_cxx_flag: "-no-canonical-prefixes"
+
+  # Make C++ compilation deterministic. Use linkstamping instead of these
+  # compiler symbols.
+  unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
+  unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
+
+  # Security hardening on by default.
+  # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+  # We need to undef it before redefining it as some distributions now have
+  # it enabled by default.
+  compiler_flag: "-U_FORTIFY_SOURCE"
+  compiler_flag: "-D_FORTIFY_SOURCE=1"
+  compiler_flag: "-fstack-protector"
+  linker_flag: "-Wl,-z,relro,-z,now"
+
+  # Enable coloring even if there's no attached terminal. Bazel removes the
+  # escape sequences if --nocolor is specified. This isn't supported by gcc
+  # on Ubuntu 14.04.
+  # compiler_flag: "-fcolor-diagnostics"
+
+  # All warnings are enabled. Maybe enable -Werror as well?
+  compiler_flag: "-Wall"
+  # Enable a few more warnings that aren't part of -Wall.
+  #compiler_flag: "-Wunused-but-set-parameter"
+  # But disable some that are problematic.
+  #compiler_flag: "-Wno-free-nonheap-object" # has false positives
+
+  # Keep stack frames for debugging, even in opt mode.
+  compiler_flag: "-fno-omit-frame-pointer"
+
+  # Anticipated future default.
+  linker_flag: "-no-canonical-prefixes"
+  # Have gcc return the exit code from ld.
+  #linker_flag: "-pass-exit-codes"
+  # Stamp the binary with a unique identifier.
+  #linker_flag: "-Wl,--build-id=md5"
+  linker_flag: "-Wl,--hash-style=gnu"
+  # Gold linker only? Can we enable this by default?
+  # linker_flag: "-Wl,--warn-execstack"
+  # linker_flag: "-Wl,--detect-odr-violations"
+
+  compilation_mode_flags {
+    mode: DBG
+    # Enable debug symbols.
+    compiler_flag: "-g"
+  }
+  compilation_mode_flags {
+    mode: OPT
+
+    # No debug symbols.
+    # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt or
+    # even generally? However, that can't happen here, as it requires special
+    # handling in Bazel.
+    compiler_flag: "-g0"
+
+    # Conservative choice for -O
+    # -O3 can increase binary size and even slow down the resulting binaries.
+    # Profile first and / or use FDO if you need better performance than this.
+    compiler_flag: "-O2"
+
+    # Disable assertions
+    compiler_flag: "-DNDEBUG"
+
+    # Removal of unused code and data at link time (can this increase binary size in some cases?).
+    compiler_flag: "-ffunction-sections"
+    compiler_flag: "-fdata-sections"
+    linker_flag: "-Wl,--gc-sections"
+  }
+  linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  builtin_sysroot: ""
+  compiler: "windows_mingw"
+  host_system_name: "local"
+  needsPic: false
+  target_libc: "local"
+  target_cpu: "x64_windows"
+  target_system_name: "local"
+  toolchain_identifier: "local_windows_mingw"
+
+  tool_path { name: "ar" path: "C:/mingw/bin/ar" }
+  tool_path { name: "compat-ld" path: "C:/mingw/bin/ld" }
+  tool_path { name: "cpp" path: "C:/mingw/bin/cpp" }
+  tool_path { name: "dwp" path: "C:/mingw/bin/dwp" }
+  tool_path { name: "gcc" path: "C:/mingw/bin/gcc" }
+  cxx_flag: "-std=c++0x"
+  # TODO(bazel-team): In theory, the path here ought to exactly match the path
+  # used by gcc. That works because bazel currently doesn't track files at
+  # absolute locations and has no remote execution, yet. However, this will need
+  # to be fixed, maybe with auto-detection?
+  cxx_builtin_include_directory: "C:/mingw/include"
+  cxx_builtin_include_directory: "C:/mingw/lib/gcc"
+  tool_path { name: "gcov" path: "C:/mingw/bin/gcov" }
+  tool_path { name: "ld" path: "C:/mingw/bin/ld" }
+  tool_path { name: "nm" path: "C:/mingw/bin/nm" }
+  tool_path { name: "objcopy" path: "C:/mingw/bin/objcopy" }
+  objcopy_embed_flag: "-I"
+  objcopy_embed_flag: "binary"
+  tool_path { name: "objdump" path: "C:/mingw/bin/objdump" }
+  tool_path { name: "strip" path: "C:/mingw/bin/strip" }
+  linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  builtin_sysroot: ""
+  compiler: "windows_msys64_mingw64"
+  host_system_name: "local"
+  needsPic: false
+  target_libc: "local"
+  target_cpu: "x64_windows"
+  target_system_name: "local"
+  toolchain_identifier: "local_windows_msys64_mingw64"
+
+  tool_path { name: "ar" path: "C:/tools/msys64/mingw64/bin/ar" }
+  tool_path { name: "compat-ld" path: "C:/tools/msys64/mingw64/bin/ld" }
+  tool_path { name: "cpp" path: "C:/tools/msys64/mingw64/bin/cpp" }
+  tool_path { name: "dwp" path: "C:/tools/msys64/mingw64/bin/dwp" }
+  tool_path { name: "gcc" path: "C:/tools/msys64/mingw64/bin/gcc" }
+  cxx_flag: "-std=c++0x"
+  # TODO(bazel-team): In theory, the path here ought to exactly match the path
+  # used by gcc. That works because bazel currently doesn't track files at
+  # absolute locations and has no remote execution, yet. However, this will need
+  # to be fixed, maybe with auto-detection?
+  cxx_builtin_include_directory: "C:/tools/msys64/mingw64/x86_64-w64-mingw32/include"
+  tool_path { name: "gcov" path: "C:/tools/msys64/mingw64/bin/gcov" }
+  tool_path { name: "ld" path: "C:/tools/msys64/mingw64/bin/ld" }
+  tool_path { name: "nm" path: "C:/tools/msys64/mingw64/bin/nm" }
+  tool_path { name: "objcopy" path: "C:/tools/msys64/mingw64/bin/objcopy" }
+  objcopy_embed_flag: "-I"
+  objcopy_embed_flag: "binary"
+  tool_path { name: "objdump" path: "C:/tools/msys64/mingw64/bin/objdump" }
+  tool_path { name: "strip" path: "C:/tools/msys64/mingw64/bin/strip" }
+  linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  builtin_sysroot: ""
+  compiler: "windows_clang"
+  host_system_name: "local"
+  needsPic: false
+  target_libc: "local"
+  target_cpu: "x64_windows"
+  target_system_name: "local"
+  toolchain_identifier: "local_windows_clang"
+
+  tool_path { name: "ar" path: "C:/mingw/bin/ar" }
+  tool_path { name: "compat-ld" path: "C:/Program Files (x86)/LLVM/bin/ld" }
+  tool_path { name: "cpp" path: "C:/Program Files (x86)/LLVM/bin/cpp" }
+  tool_path { name: "dwp" path: "C:/Program Files (x86)/LLVM/bin/dwp" }
+  tool_path { name: "gcc" path: "C:/Program Files (x86)/LLVM/bin/clang" }
+  cxx_flag: "-std=c++0x"
+  # TODO(bazel-team): In theory, the path here ought to exactly match the path
+  # used by gcc. That works because bazel currently doesn't track files at
+  # absolute locations and has no remote execution, yet. However, this will need
+  # to be fixed, maybe with auto-detection?
+  cxx_builtin_include_directory: "/usr/lib/gcc/"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/include"
+  tool_path { name: "gcov" path: "C:/Program Files (x86)/LLVM/bin/gcov" }
+  tool_path { name: "ld" path: "C:/Program Files (x86)/LLVM/bin/ld" }
+  tool_path { name: "nm" path: "C:/Program Files (x86)/LLVM/bin/nm" }
+  tool_path { name: "objcopy" path: "C:/Program Files (x86)/LLVM/bin/objcopy" }
+  objcopy_embed_flag: "-I"
+  objcopy_embed_flag: "binary"
+  tool_path { name: "objdump" path: "C:/Program Files (x86)/LLVM/bin/objdump" }
+  tool_path { name: "strip" path: "C:/Program Files (x86)/LLVM/bin/strip" }
+  linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+   abi_version: "local"
+   abi_libc_version: "local"
+   builtin_sysroot: ""
+   compiler: "windows_msys64"
+   host_system_name: "local"
+   needsPic: false
+   target_libc: "local"
+   target_cpu: "x64_windows"
+   target_system_name: "local"
+   toolchain_identifier: "local_windows_msys64"
+
+   tool_path { name: "ar" path: "C:/tools/msys64/usr/bin/ar" }
+   tool_path { name: "compat-ld" path: "C:/tools/msys64/usr/bin/ld" }
+   tool_path { name: "cpp" path: "C:/tools/msys64/usr/bin/cpp" }
+   tool_path { name: "dwp" path: "C:/tools/msys64/usr/bin/dwp" }
+   # Use gcc instead of g++ so that C will compile correctly.
+   tool_path { name: "gcc" path: "C:/tools/msys64/usr/bin/gcc" }
+   cxx_flag: "-std=gnu++0x"
+   linker_flag: "-lstdc++"
+   # TODO(bazel-team): In theory, the path here ought to exactly match the path
+   # used by gcc. That works because bazel currently doesn't track files at
+   # absolute locations and has no remote execution, yet. However, this will need
+   # to be fixed, maybe with auto-detection?
+   cxx_builtin_include_directory: "C:/tools/msys64/"
+   cxx_builtin_include_directory: "/usr/"
+   tool_path { name: "gcov" path: "C:/tools/msys64/usr/bin/gcov" }
+   tool_path { name: "ld" path: "C:/tools/msys64/usr/bin/ld" }
+   tool_path { name: "nm" path: "C:/tools/msys64/usr/bin/nm" }
+   tool_path { name: "objcopy" path: "C:/tools/msys64/usr/bin/objcopy" }
+   objcopy_embed_flag: "-I"
+   objcopy_embed_flag: "binary"
+   tool_path { name: "objdump" path: "C:/tools/msys64/usr/bin/objdump" }
+   tool_path { name: "strip" path: "C:/tools/msys64/usr/bin/strip" }
+   linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+  toolchain_identifier: "vc_14_0_x64"
+  host_system_name: "local"
+  target_system_name: "local"
+
+  abi_version: "local"
+  abi_libc_version: "local"
+  target_cpu: "x64_windows_msvc"
+  compiler: "cl"
+  target_libc: "msvcrt140"
+  default_python_version: "python2.7"
+  cxx_builtin_include_directory: "C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/INCLUDE"
+  cxx_builtin_include_directory: "C:/Program Files (x86)/Windows Kits/10/include/"
+  cxx_builtin_include_directory: "C:/Program Files (x86)/Windows Kits/8.1/include/"
+  cxx_builtin_include_directory: "C:/Program Files (x86)/GnuWin32/include/"
+  cxx_builtin_include_directory: "C:/python_27_amd64/files/include"
+  tool_path {
+    name: "ar"
+    path: "wrapper/bin/msvc_link.bat"
+  }
+  tool_path {
+    name: "cpp"
+    path: "wrapper/bin/msvc_cl.bat"
+  }
+  tool_path {
+    name: "gcc"
+    path: "wrapper/bin/msvc_cl.bat"
+  }
+  tool_path {
+    name: "gcov"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "ld"
+    path: "wrapper/bin/msvc_link.bat"
+  }
+  tool_path {
+    name: "nm"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objcopy"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objdump"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "strip"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  supports_gold_linker: false
+  supports_start_end_lib: false
+  supports_interface_shared_objects: false
+  supports_incremental_linker: false
+  supports_normalizing_ar: true
+  needsPic: false
+
+  compiler_flag: "-m64"
+  compiler_flag: "/D__inline__=__inline"
+  # TODO(pcloudy): Review those flags below, they should be defined by cl.exe
+  compiler_flag: "/DOS_WINDOWS=OS_WINDOWS"
+  compiler_flag: "/DCOMPILER_MSVC"
+
+  # Don't pollute with GDI macros in windows.h.
+  compiler_flag: "/DNOGDI"
+  # Don't define min/max macros in windows.h.
+  compiler_flag: "/DNOMINMAX"
+  compiler_flag: "/DPRAGMA_SUPPORTED"
+  # Platform defines.
+  compiler_flag: "/D_WIN32_WINNT=0x0600"
+  # Turn off warning messages.
+  compiler_flag: "/D_CRT_SECURE_NO_DEPRECATE"
+  compiler_flag: "/D_CRT_SECURE_NO_WARNINGS"
+  compiler_flag: "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS"
+  # Use math constants (M_PI, etc.) from the math library
+  compiler_flag: "/D_USE_MATH_DEFINES"
+
+  # Useful options to have on for compilation.
+  # Suppress startup banner.
+  compiler_flag: "/nologo"
+  # Increase the capacity of object files to 2^32 sections.
+  compiler_flag: "/bigobj"
+  # Allocate 500MB for precomputed headers.
+  compiler_flag: "/Zm500"
+  # Use unsigned char by default.
+  compiler_flag: "/J"
+  # Use function level linking.
+  compiler_flag: "/Gy"
+  # Use string pooling.
+  compiler_flag: "/GF"
+  # Warning level 3 (could possibly go to 4 in the future).
+  compiler_flag: "/W3"
+  # Catch both asynchronous (structured) and synchronous (C++) exceptions.
+  compiler_flag: "/EHsc"
+
+  # Globally disabled warnings.
+  # Don't warn about elements of array being be default initialized.
+  compiler_flag: "/wd4351"
+  # Don't warn about no matching delete found.
+  compiler_flag: "/wd4291"
+  # Don't warn about diamond inheritance patterns.
+  compiler_flag: "/wd4250"
+  # Don't warn about insecure functions (e.g. non _s functions).
+  compiler_flag: "/wd4996"
+
+  linker_flag: "-m64"
+
+  feature {
+    name: 'include_paths'
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-header-preprocessing'
+      action: 'c++-module-compile'
+      flag_group {
+        flag: '/I%{quote_include_paths}'
+      }
+      flag_group {
+        flag: '/I%{include_paths}'
+      }
+      flag_group {
+        flag: '/I%{system_include_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: 'dependency_file'
+    flag_set {
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-module-compile'
+      action: 'c++-header-preprocessing'
+      action: 'c++-header-parsing'
+      expand_if_all_available: 'dependency_file'
+      flag_group {
+        flag: '/DEPENDENCY_FILE'
+        flag: '%{dependency_file}'
+      }
+    }
+  }
+
+  # Stop passing -frandom-seed option
+  feature {
+    name: 'random_seed'
+  }
+
+  # This feature is just for enabling flag_set in action_config for -c and -o options during the transitional period
+  feature {
+    name: 'compile_action_flags_in_flag_set'
+  }
+
+  action_config {
+    config_name: 'c-compile'
+    action_name: 'c-compile'
+    tool {
+      tool_path: 'wrapper/bin/msvc_cl.bat'
+    }
+    flag_set {
+      flag_group {
+        flag: '/c'
+        flag: '%{source_file}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'output_object_file'
+      flag_group {
+        flag: '/Fo%{output_object_file}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'output_assembly_file'
+      flag_group {
+        flag: '/Fa%{output_assembly_file}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'output_preprocess_file'
+      flag_group {
+        flag: '/P'
+        flag: '/Fi%{output_preprocess_file}'
+      }
+    }
+  }
+
+  action_config {
+    config_name: 'c++-compile'
+    action_name: 'c++-compile'
+    tool {
+      tool_path: 'wrapper/bin/msvc_cl.bat'
+    }
+    flag_set {
+      flag_group {
+        flag: '/c'
+        flag: '%{source_file}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'output_object_file'
+      flag_group {
+        flag: '/Fo%{output_object_file}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'output_assembly_file'
+      flag_group {
+        flag: '/Fa%{output_assembly_file}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'output_preprocess_file'
+      flag_group {
+        flag: '/P'
+        flag: '/Fi%{output_preprocess_file}'
+      }
+    }
+  }
+
+  compilation_mode_flags {
+    mode: DBG
+    compiler_flag: "/DDEBUG=1"
+    # This will signal the wrapper that we are doing a debug build, which sets
+    # some internal state of the toolchain wrapper. It is intentionally a "-"
+    # flag to make this very obvious.
+    compiler_flag: "-g"
+    compiler_flag: "/Od"
+    compiler_flag: "-Xcompilation-mode=dbg"
+  }
+
+  compilation_mode_flags {
+    mode: FASTBUILD
+    compiler_flag: "/DNDEBUG"
+    compiler_flag: "/Od"
+    compiler_flag: "-Xcompilation-mode=fastbuild"
+  }
+
+  compilation_mode_flags {
+    mode: OPT
+    compiler_flag: "/DNDEBUG"
+    compiler_flag: "/O2"
+    compiler_flag: "-Xcompilation-mode=opt"
+  }
+}
diff --git a/third_party/toolchains/cpus/arm/arm_compiler_configure.bzl b/third_party/toolchains/cpus/arm/arm_compiler_configure.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..5eb3b7bb1c646781ab7d4869b230a5c07354f72d
--- /dev/null
+++ b/third_party/toolchains/cpus/arm/arm_compiler_configure.bzl
@@ -0,0 +1,27 @@
+# -*- Python -*-
+"""Repository rule for arm compiler autoconfiguration."""
+
+def _tpl(repository_ctx, tpl, substitutions={}, out=None):
+  if not out:
+    out = tpl
+  repository_ctx.template(
+      out,
+      Label("//third_party/toolchains/cpus/arm:%s.tpl" % tpl),
+      substitutions)
+
+
+def _arm_compiler_configure_impl(repository_ctx):
+  _tpl(repository_ctx, "CROSSTOOL", {
+      "%{ARM_COMPILER_PATH}%": str(repository_ctx.path(
+          repository_ctx.attr.remote_config_repo)),
+  })
+  repository_ctx.symlink(repository_ctx.attr.build_file, "BUILD")
+
+
+arm_compiler_configure = repository_rule(
+    implementation = _arm_compiler_configure_impl,
+    attrs = {
+        "remote_config_repo": attr.string(mandatory = False, default =""),
+        "build_file": attr.label(),
+    },
+)
diff --git a/third_party/toolchains/cpus/arm/build_raspberry_pi.sh b/third_party/toolchains/cpus/arm/build_raspberry_pi.sh
new file mode 100755
index 0000000000000000000000000000000000000000..a5ad0fce0e4309bdf7d95bedc52e9aeffb2849ea
--- /dev/null
+++ b/third_party/toolchains/cpus/arm/build_raspberry_pi.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+set -e
+
+# By default this builds packages for the Pi Two and Three only, since the NEON support
+# this allows makes calculations many times faster. To support the Pi One or Zero, pass
+# PI_ONE as the first argument to the script, for example:
+# third_party/toolchains/cpus/arm/build_raspberry_pi.sh PI_ONE
+#
+# To install the cross-compilation support for Python this script needs on Ubuntu Trusty, run
+# something like these steps, after backing up your original /etc/apt/sources.list file:
+#
+# dpkg --add-architecture armhf
+# echo 'deb [arch=armhf] http://ports.ubuntu.com/ trusty main restricted universe multiverse' >> /etc/apt/sources.list.d/armhf.list
+# echo 'deb [arch=armhf] http://ports.ubuntu.com/ trusty-updates main restricted universe multiverse' >> /etc/apt/sources.list.d/armhf.list
+# echo 'deb [arch=armhf] http://ports.ubuntu.com/ trusty-security main restricted universe multiverse' >> /etc/apt/sources.list.d/armhf.list
+# echo 'deb [arch=armhf] http://ports.ubuntu.com/ trusty-backports main restricted universe multiverse' >> /etc/apt/sources.list.d/armhf.list
+# sed -i 's#deb http://archive.ubuntu.com/ubuntu/#deb [arch=amd64] http://archive.ubuntu.com/ubuntu/#g' /etc/apt/sources.list
+# apt-get update
+# apt-get install -y libpython-all-dev:armhf
+#
+# Make sure you have an up to date version of the Bazel build tool installed too.
+
+yes '' | ./configure
+
+if [[ $1 == "PI_ONE" ]]; then
+  PI_COPTS="--copt=-march=armv6 --copt=-mfpu=vfp"
+  echo "Building for the Pi One/Zero, with no NEON support"
+else
+  PI_COPTS='--copt=-march=armv7-a --copt=-mfpu=neon-vfpv4
+  --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
+  --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
+  --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8'
+  echo "Building for the Pi Two/Three, with NEON acceleration"
+fi
+
+bazel build -c opt ${PI_COPTS} \
+  --copt=-funsafe-math-optimizations --copt=-ftree-vectorize \
+  --copt=-fomit-frame-pointer --cpu=armeabi \
+  --crosstool_top=@local_config_arm_compiler//:toolchain \
+  --verbose_failures \
+  //tensorflow/tools/benchmark:benchmark_model \
+  //tensorflow/tools/pip_package:build_pip_package
+
+TMPDIR=$(mktemp -d -t tmp.XXXXXXXXXX)
+echo "Final outputs will go to ${TMPDIR}"
+
+# Build a universal wheel.
+BDIST_OPTS="--universal" \
+  bazel-bin/tensorflow/tools/pip_package/build_pip_package "${TMPDIR}"
+
+OLD_FN=$(ls "${TMPDIR}" | grep \.whl)
+SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-any.whl/; print'
+NEW_FN=$(echo "${OLD_FN}" | perl -ne "${SUB}")
+mv "${TMPDIR}/${OLD_FN}" "${TMPDIR}/${NEW_FN}"
+cp bazel-bin/tensorflow/tools/benchmark/benchmark_model "${TMPDIR}"
+
+echo "Output can be found here:"
+find "${TMPDIR}"
diff --git a/third_party/toolchains/cpus/py/BUILD b/third_party/toolchains/cpus/py/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c175742cbfe918e55035e89b7454596acd43307e
--- /dev/null
+++ b/third_party/toolchains/cpus/py/BUILD
@@ -0,0 +1,197 @@
+# A build file to configure python remote repository used with Bazel remote
+# execution service
+# DO NOT EDIT: automatically generated BUILD file
+
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "python_headers",
+    hdrs = [":python_include"],
+    data = select({
+        ":windows": [":python_import_lib"],
+        "//conditions:default": [],
+    }),
+    includes = ["python_include"],
+    linkopts = select({
+        # TODO(pcloudy): Ideally, this should just go into deps after resolving
+        # https://github.com/bazelbuild/bazel/issues/3237,
+        ":windows": ["$(locations :python_import_lib)"],
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "numpy_headers",
+    hdrs = [":numpy_include"],
+    includes = ["numpy_include"],
+)
+
+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows"},
+    visibility = ["//visibility:public"],
+)
+
+genrule(
+    name = "python_include",
+    outs = [
+        "python_include/code.h",
+        "python_include/dtoa.h",
+        "python_include/tupleobject.h",
+        "python_include/object.h",
+        "python_include/ast.h",
+        "python_include/pymacconfig.h",
+        "python_include/errcode.h",
+        "python_include/frameobject.h",
+        "python_include/pgenheaders.h",
+        "python_include/cellobject.h",
+        "python_include/intobject.h",
+        "python_include/pythread.h",
+        "python_include/cStringIO.h",
+        "python_include/boolobject.h",
+        "python_include/modsupport.h",
+        "python_include/import.h",
+        "python_include/pymath.h",
+        "python_include/node.h",
+        "python_include/funcobject.h",
+        "python_include/eval.h",
+        "python_include/longintrepr.h",
+        "python_include/floatobject.h",
+        "python_include/rangeobject.h",
+        "python_include/pyfpe.h",
+        "python_include/pystrcmp.h",
+        "python_include/dictobject.h",
+        "python_include/pyarena.h",
+        "python_include/objimpl.h",
+        "python_include/bitset.h",
+        "python_include/memoryobject.h",
+        "python_include/bytearrayobject.h",
+        "python_include/pydebug.h",
+        "python_include/pyerrors.h",
+        "python_include/weakrefobject.h",
+        "python_include/grammar.h",
+        "python_include/symtable.h",
+        "python_include/longobject.h",
+        "python_include/structmember.h",
+        "python_include/enumobject.h",
+        "python_include/classobject.h",
+        "python_include/unicodeobject.h",
+        "python_include/sliceobject.h",
+        "python_include/pystrtod.h",
+        "python_include/genobject.h",
+        "python_include/pymactoolbox.h",
+        "python_include/compile.h",
+        "python_include/pyexpat.h",
+        "python_include/asdl.h",
+        "python_include/codecs.h",
+        "python_include/pyctype.h",
+        "python_include/sysmodule.h",
+        "python_include/methodobject.h",
+        "python_include/graminit.h",
+        "python_include/cobject.h",
+        "python_include/intrcheck.h",
+        "python_include/pyport.h",
+        "python_include/warnings.h",
+        "python_include/osdefs.h",
+        "python_include/fileobject.h",
+        "python_include/stringobject.h",
+        "python_include/timefuncs.h",
+        "python_include/traceback.h",
+        "python_include/ceval.h",
+        "python_include/bytes_methods.h",
+        "python_include/pyconfig.h",
+        "python_include/Python.h",
+        "python_include/moduleobject.h",
+        "python_include/pystate.h",
+        "python_include/descrobject.h",
+        "python_include/ucnhash.h",
+        "python_include/pygetopt.h",
+        "python_include/pymem.h",
+        "python_include/complexobject.h",
+        "python_include/structseq.h",
+        "python_include/datetime.h",
+        "python_include/pythonrun.h",
+        "python_include/numpy/oldnumeric.h",
+        "python_include/numpy/npy_1_7_deprecated_api.h",
+        "python_include/numpy/ufunc_api.txt",
+        "python_include/numpy/multiarray_api.txt",
+        "python_include/numpy/halffloat.h",
+        "python_include/numpy/npy_common.h",
+        "python_include/numpy/utils.h",
+        "python_include/numpy/npy_interrupt.h",
+        "python_include/numpy/npy_endian.h",
+        "python_include/numpy/__ufunc_api.h",
+        "python_include/numpy/_neighborhood_iterator_imp.h",
+        "python_include/numpy/ufuncobject.h",
+        "python_include/numpy/ndarraytypes.h",
+        "python_include/numpy/npy_math.h",
+        "python_include/numpy/noprefix.h",
+        "python_include/numpy/npy_3kcompat.h",
+        "python_include/numpy/arrayscalars.h",
+        "python_include/numpy/npy_os.h",
+        "python_include/numpy/ndarrayobject.h",
+        "python_include/numpy/npy_no_deprecated_api.h",
+        "python_include/numpy/arrayobject.h",
+        "python_include/numpy/_numpyconfig.h",
+        "python_include/numpy/__multiarray_api.h",
+        "python_include/numpy/npy_cpu.h",
+        "python_include/numpy/old_defines.h",
+        "python_include/numpy/numpyconfig.h",
+        "python_include/pycapsule.h",
+        "python_include/setobject.h",
+        "python_include/listobject.h",
+        "python_include/bytesobject.h",
+        "python_include/pgen.h",
+        "python_include/patchlevel.h",
+        "python_include/opcode.h",
+        "python_include/parsetok.h",
+        "python_include/marshal.h",
+        "python_include/token.h",
+        "python_include/iterobject.h",
+        "python_include/abstract.h",
+        "python_include/py_curses.h",
+        "python_include/metagrammar.h",
+        "python_include/bufferobject.h",
+        "python_include/Python-ast.h",
+    ],
+    cmd = """
+cp "/usr/include/python2.7/code.h" "$(@D)/python_include/code.h" && cp "/usr/include/python2.7/dtoa.h" "$(@D)/python_include/dtoa.h" && cp "/usr/include/python2.7/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp "/usr/include/python2.7/object.h" "$(@D)/python_include/object.h" && cp "/usr/include/python2.7/ast.h" "$(@D)/python_include/ast.h" && cp "/usr/include/python2.7/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp "/usr/include/python2.7/errcode.h" "$(@D)/python_include/errcode.h" && cp "/usr/include/python2.7/frameobject.h" "$(@D)/python_include/frameobject.h" && cp "/usr/include/python2.7/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp "/usr/include/python2.7/cellobject.h" "$(@D)/python_include/cellobject.h" && cp "/usr/include/python2.7/intobject.h" "$(@D)/python_include/intobject.h" && cp "/usr/include/python2.7/pythread.h" "$(@D)/python_include/pythread.h" && cp "/usr/include/python2.7/cStringIO.h" "$(@D)/python_include/cStringIO.h" && cp "/usr/include/python2.7/boolobject.h" "$(@D)/python_include/boolobject.h" && cp "/usr/include/python2.7/modsupport.h" "$(@D)/python_include/modsupport.h" && cp "/usr/include/python2.7/import.h" "$(@D)/python_include/import.h" && cp "/usr/include/python2.7/pymath.h" "$(@D)/python_include/pymath.h" && cp "/usr/include/python2.7/node.h" "$(@D)/python_include/node.h" && cp "/usr/include/python2.7/funcobject.h" "$(@D)/python_include/funcobject.h" && cp "/usr/include/python2.7/eval.h" "$(@D)/python_include/eval.h" && cp "/usr/include/python2.7/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp "/usr/include/python2.7/floatobject.h" "$(@D)/python_include/floatobject.h" && cp "/usr/include/python2.7/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp "/usr/include/python2.7/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp "/usr/include/python2.7/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp "/usr/include/python2.7/dictobject.h" "$(@D)/python_include/dictobject.h" && cp "/usr/include/python2.7/pyarena.h" "$(@D)/python_include/pyarena.h" && cp "/usr/include/python2.7/objimpl.h" "$(@D)/python_include/objimpl.h" && cp "/usr/include/python2.7/bitset.h" "$(@D)/python_include/bitset.h" && cp "/usr/include/python2.7/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp "/usr/include/python2.7/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp "/usr/include/python2.7/pydebug.h" "$(@D)/python_include/pydebug.h" && cp "/usr/include/python2.7/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp "/usr/include/python2.7/weakrefobject.h" "$(@D)/python_include/weakrefobject.h" && cp "/usr/include/python2.7/grammar.h" "$(@D)/python_include/grammar.h" && cp "/usr/include/python2.7/symtable.h" "$(@D)/python_include/symtable.h" && cp "/usr/include/python2.7/longobject.h" "$(@D)/python_include/longobject.h" && cp "/usr/include/python2.7/structmember.h" "$(@D)/python_include/structmember.h" && cp "/usr/include/python2.7/enumobject.h" "$(@D)/python_include/enumobject.h" && cp "/usr/include/python2.7/classobject.h" "$(@D)/python_include/classobject.h" && cp "/usr/include/python2.7/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp "/usr/include/python2.7/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp "/usr/include/python2.7/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp "/usr/include/python2.7/genobject.h" "$(@D)/python_include/genobject.h" && cp "/usr/include/python2.7/pymactoolbox.h" "$(@D)/python_include/pymactoolbox.h" && cp "/usr/include/python2.7/compile.h" "$(@D)/python_include/compile.h" && cp "/usr/include/python2.7/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp "/usr/include/python2.7/asdl.h" "$(@D)/python_include/asdl.h" && cp "/usr/include/python2.7/codecs.h" "$(@D)/python_include/codecs.h" && cp "/usr/include/python2.7/pyctype.h" "$(@D)/python_include/pyctype.h" && cp "/usr/include/python2.7/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp "/usr/include/python2.7/methodobject.h" "$(@D)/python_include/methodobject.h" && cp "/usr/include/python2.7/graminit.h" "$(@D)/python_include/graminit.h" && cp "/usr/include/python2.7/cobject.h" "$(@D)/python_include/cobject.h" && cp "/usr/include/python2.7/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp "/usr/include/python2.7/pyport.h" "$(@D)/python_include/pyport.h" && cp "/usr/include/python2.7/warnings.h" "$(@D)/python_include/warnings.h" && cp "/usr/include/python2.7/osdefs.h" "$(@D)/python_include/osdefs.h" && cp "/usr/include/python2.7/fileobject.h" "$(@D)/python_include/fileobject.h" && cp "/usr/include/python2.7/stringobject.h" "$(@D)/python_include/stringobject.h" && cp "/usr/include/python2.7/timefuncs.h" "$(@D)/python_include/timefuncs.h" && cp "/usr/include/python2.7/traceback.h" "$(@D)/python_include/traceback.h" && cp "/usr/include/python2.7/ceval.h" "$(@D)/python_include/ceval.h" && cp "/usr/include/python2.7/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp "/usr/include/python2.7/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp "/usr/include/python2.7/Python.h" "$(@D)/python_include/Python.h" && cp "/usr/include/python2.7/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp "/usr/include/python2.7/pystate.h" "$(@D)/python_include/pystate.h" && cp "/usr/include/python2.7/descrobject.h" "$(@D)/python_include/descrobject.h" && cp "/usr/include/python2.7/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp "/usr/include/python2.7/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp "/usr/include/python2.7/pymem.h" "$(@D)/python_include/pymem.h" && cp "/usr/include/python2.7/complexobject.h" "$(@D)/python_include/complexobject.h" && cp "/usr/include/python2.7/structseq.h" "$(@D)/python_include/structseq.h" && cp "/usr/include/python2.7/datetime.h" "$(@D)/python_include/datetime.h" && cp "/usr/include/python2.7/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp "/usr/include/python2.7/numpy/oldnumeric.h" "$(@D)/python_include/numpy/oldnumeric.h" && cp "/usr/include/python2.7/numpy/npy_1_7_deprecated_api.h" "$(@D)/python_include/numpy/npy_1_7_deprecated_api.h" && cp "/usr/include/python2.7/numpy/ufunc_api.txt" "$(@D)/python_include/numpy/ufunc_api.txt" && cp "/usr/include/python2.7/numpy/multiarray_api.txt" "$(@D)/python_include/numpy/multiarray_api.txt" && cp "/usr/include/python2.7/numpy/halffloat.h" "$(@D)/python_include/numpy/halffloat.h" && cp "/usr/include/python2.7/numpy/npy_common.h" "$(@D)/python_include/numpy/npy_common.h" && cp "/usr/include/python2.7/numpy/utils.h" "$(@D)/python_include/numpy/utils.h" && cp "/usr/include/python2.7/numpy/npy_interrupt.h" "$(@D)/python_include/numpy/npy_interrupt.h" && cp "/usr/include/python2.7/numpy/npy_endian.h" "$(@D)/python_include/numpy/npy_endian.h" && cp "/usr/include/python2.7/numpy/__ufunc_api.h" "$(@D)/python_include/numpy/__ufunc_api.h" && cp "/usr/include/python2.7/numpy/_neighborhood_iterator_imp.h" "$(@D)/python_include/numpy/_neighborhood_iterator_imp.h" && cp "/usr/include/python2.7/numpy/ufuncobject.h" "$(@D)/python_include/numpy/ufuncobject.h" && cp "/usr/include/python2.7/numpy/ndarraytypes.h" "$(@D)/python_include/numpy/ndarraytypes.h" && cp "/usr/include/python2.7/numpy/npy_math.h" "$(@D)/python_include/numpy/npy_math.h" && cp "/usr/include/python2.7/numpy/noprefix.h" "$(@D)/python_include/numpy/noprefix.h" && cp "/usr/include/python2.7/numpy/npy_3kcompat.h" "$(@D)/python_include/numpy/npy_3kcompat.h" && cp "/usr/include/python2.7/numpy/arrayscalars.h" "$(@D)/python_include/numpy/arrayscalars.h" && cp "/usr/include/python2.7/numpy/npy_os.h" "$(@D)/python_include/numpy/npy_os.h" && cp "/usr/include/python2.7/numpy/ndarrayobject.h" "$(@D)/python_include/numpy/ndarrayobject.h" && cp "/usr/include/python2.7/numpy/npy_no_deprecated_api.h" "$(@D)/python_include/numpy/npy_no_deprecated_api.h" && cp "/usr/include/python2.7/numpy/arrayobject.h" "$(@D)/python_include/numpy/arrayobject.h" && cp "/usr/include/python2.7/numpy/_numpyconfig.h" "$(@D)/python_include/numpy/_numpyconfig.h" && cp "/usr/include/python2.7/numpy/__multiarray_api.h" "$(@D)/python_include/numpy/__multiarray_api.h" && cp "/usr/include/python2.7/numpy/npy_cpu.h" "$(@D)/python_include/numpy/npy_cpu.h" && cp "/usr/include/python2.7/numpy/old_defines.h" "$(@D)/python_include/numpy/old_defines.h" && cp "/usr/include/python2.7/numpy/numpyconfig.h" "$(@D)/python_include/numpy/numpyconfig.h" && cp "/usr/include/python2.7/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp "/usr/include/python2.7/setobject.h" "$(@D)/python_include/setobject.h" && cp "/usr/include/python2.7/listobject.h" "$(@D)/python_include/listobject.h" && cp "/usr/include/python2.7/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp "/usr/include/python2.7/pgen.h" "$(@D)/python_include/pgen.h" && cp "/usr/include/python2.7/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp "/usr/include/python2.7/opcode.h" "$(@D)/python_include/opcode.h" && cp "/usr/include/python2.7/parsetok.h" "$(@D)/python_include/parsetok.h" && cp "/usr/include/python2.7/marshal.h" "$(@D)/python_include/marshal.h" && cp "/usr/include/python2.7/token.h" "$(@D)/python_include/token.h" && cp "/usr/include/python2.7/iterobject.h" "$(@D)/python_include/iterobject.h" && cp "/usr/include/python2.7/abstract.h" "$(@D)/python_include/abstract.h" && cp "/usr/include/python2.7/py_curses.h" "$(@D)/python_include/py_curses.h" && cp "/usr/include/python2.7/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp "/usr/include/python2.7/bufferobject.h" "$(@D)/python_include/bufferobject.h" && cp "/usr/include/python2.7/Python-ast.h" "$(@D)/python_include/Python-ast.h"
+   """,
+)
+
+genrule(
+    name = "numpy_include",
+    outs = [
+        "numpy_include/numpy/oldnumeric.h",
+        "numpy_include/numpy/npy_1_7_deprecated_api.h",
+        "numpy_include/numpy/ufunc_api.txt",
+        "numpy_include/numpy/multiarray_api.txt",
+        "numpy_include/numpy/halffloat.h",
+        "numpy_include/numpy/npy_common.h",
+        "numpy_include/numpy/utils.h",
+        "numpy_include/numpy/npy_interrupt.h",
+        "numpy_include/numpy/npy_endian.h",
+        "numpy_include/numpy/__ufunc_api.h",
+        "numpy_include/numpy/_neighborhood_iterator_imp.h",
+        "numpy_include/numpy/ufuncobject.h",
+        "numpy_include/numpy/ndarraytypes.h",
+        "numpy_include/numpy/npy_math.h",
+        "numpy_include/numpy/noprefix.h",
+        "numpy_include/numpy/npy_3kcompat.h",
+        "numpy_include/numpy/arrayscalars.h",
+        "numpy_include/numpy/npy_os.h",
+        "numpy_include/numpy/ndarrayobject.h",
+        "numpy_include/numpy/npy_no_deprecated_api.h",
+        "numpy_include/numpy/arrayobject.h",
+        "numpy_include/numpy/_numpyconfig.h",
+        "numpy_include/numpy/__multiarray_api.h",
+        "numpy_include/numpy/npy_cpu.h",
+        "numpy_include/numpy/old_defines.h",
+        "numpy_include/numpy/numpyconfig.h",
+    ],
+    cmd = """
+cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h"
+   """,
+)
diff --git a/third_party/toolchains/gpus/crosstool/BUILD b/third_party/toolchains/gpus/crosstool/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a8c6b0f0291363f3a7576a70e78b3428fb984957
--- /dev/null
+++ b/third_party/toolchains/gpus/crosstool/BUILD
@@ -0,0 +1,52 @@
+# A build file to configure cc toolchain for GPU build used with Bazel remote
+# execution service
+# DO NOT EDIT: automatically generated BUILD file
+
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+cc_toolchain_suite(
+    name = "toolchain",
+    toolchains = {
+        "local|compiler": ":cc-compiler-local",
+        "darwin|compiler": ":cc-compiler-darwin",
+    },
+)
+
+cc_toolchain(
+    name = "cc-compiler-local",
+    all_files = ":empty",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":empty",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    # To support linker flags that need to go to the start of command line
+    # we need the toolchain to support parameter files. Parameter files are
+    # last on the command line and contain all shared libraries to link, so all
+    # regular options will be left of them.
+    supports_param_files = 1,
+)
+
+cc_toolchain(
+    name = "cc-compiler-darwin",
+    all_files = ":empty",
+    compiler_files = ":empty",
+    cpu = "darwin",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":empty",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 0,
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],
+)
diff --git a/third_party/toolchains/gpus/crosstool/CROSSTOOL b/third_party/toolchains/gpus/crosstool/CROSSTOOL
new file mode 100644
index 0000000000000000000000000000000000000000..224b8912f6d743ad78b0ce835fdb8aa30e5e1309
--- /dev/null
+++ b/third_party/toolchains/gpus/crosstool/CROSSTOOL
@@ -0,0 +1,302 @@
+# A crosstool configuration for GPU build used with Bazel remote
+# execution service
+# DO NOT EDIT: automatically generated file
+
+major_version: "local"
+minor_version: ""
+default_target_cpu: "same_as_host"
+
+default_toolchain {
+  cpu: "k8"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "piii"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "arm"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "darwin"
+  toolchain_identifier: "local_darwin"
+}
+default_toolchain {
+  cpu: "ppc"
+  toolchain_identifier: "local_linux"
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  target_libc: "local"
+  target_cpu: "local"
+  target_system_name: "local"
+  toolchain_identifier: "local_linux"
+
+  feature {
+    name: "c++11"
+    flag_set {
+      action: "c++-compile"
+      flag_group {
+        flag: "-std=c++11"
+      }
+    }
+  }
+
+  feature {
+    name: "stdlib"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      flag_group {
+        flag: "-lstdc++"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        flag: "-Wno-builtin-macro-redefined"
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  feature {
+    name: "alwayslink"
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-Wl,-no-as-needed"
+      }
+    }
+  }
+
+  # This feature will be enabled for builds that support pic by bazel.
+  feature {
+    name: "pic"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        expand_if_all_available: "pic"
+        flag: "-fPIC"
+      }
+      flag_group {
+        expand_if_none_available: "pic"
+        flag: "-fPIE"
+      }
+    }
+  }
+
+  # Security hardening on by default.
+  feature {
+    name: "hardening"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+        # We need to undef it before redefining it as some distributions now
+        # have it enabled by default.
+        flag: "-U_FORTIFY_SOURCE"
+        flag: "-D_FORTIFY_SOURCE=1"
+        flag: "-fstack-protector"
+      }
+    }
+    flag_set {
+      action: "c++-link-dynamic-library"
+      flag_group {
+        flag: "-Wl,-z,relro,-z,now"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-pie"
+        flag: "-Wl,-z,relro,-z,now"
+      }
+    }
+  }
+
+  feature {
+    name: "warnings"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # All warnings are enabled. Maybe enable -Werror as well?
+        flag: "-Wall"
+        # TODO(ngiraldo): Some parts of the codebase set -Werror and hit this 
+        # warning, so switch it off for now.
+        flag: "-Wno-invalid-partial-specialization"
+      }
+    }
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  feature {
+    name: "frame-pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-fno-omit-frame-pointer"
+      }
+    }
+  }
+
+  feature {
+    name: "build-id"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      flag_group {
+        # Stamp the binary with a unique identifier.
+        flag: "-Wl,--build-id=md5"
+        flag: "-Wl,--hash-style=gnu"
+      }
+    }
+  }
+
+  feature {
+    name: "no-canonical-prefixes"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      flag_group {
+        flag:"-no-canonical-prefixes"
+      }
+    }
+  }
+
+  feature {
+    name: "disable-assertions"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "linker-bin-path"
+
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      flag_group {
+        flag: "-B/usr/bin/"
+      }
+    }
+  }
+
+  feature {
+    name: "common"
+    implies: "stdlib"
+    implies: "c++11"
+    implies: "determinism"
+    implies: "alwayslink"
+    implies: "hardening"
+    implies: "warnings"
+    implies: "frame-pointer"
+    implies: "build-id"
+    implies: "no-canonical-prefixes"
+    implies: "linker-bin-path"
+  }
+
+  feature {
+    name: "opt"
+    implies: "common"
+    implies: "disable-assertions"
+
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # No debug symbols.
+        # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt
+        # or even generally? However, that can't happen here, as it requires
+        # special handling in Bazel.
+        flag: "-g0"
+
+        # Conservative choice for -O
+        # -O3 can increase binary size and even slow down the resulting binaries.
+        # Profile first and / or use FDO if you need better performance than this.
+        flag: "-O2"
+
+        # Removal of unused code and data at link time (can this increase binary size in some cases?).
+        flag: "-ffunction-sections"
+        flag: "-fdata-sections"
+      }
+    }
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-Wl,--gc-sections"
+      }
+    }
+  }
+
+  feature {
+    name: "fastbuild"
+    implies: "common"
+  }
+
+  feature {
+    name: "dbg"
+    implies: "common"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-g"
+      }
+    }
+  }
+
+  # Set clang as a C/C++ compiler.
+  tool_path { name: "gcc" path: "/usr/local/bin/clang" }
+
+  # Use the default system toolchain for everything else.
+  tool_path { name: "ar" path: "/usr/bin/ar" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Enabled dynamic linking.
+  linking_mode_flags { mode: DYNAMIC }
+
+  cxx_builtin_include_directory: "/usr/include/c++/5.4.0"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/5.4.0"
+  cxx_builtin_include_directory: "/usr/include/c++/5.4.0/backward"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/local/lib/clang/5.0.0/include"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+}
diff --git a/third_party/toolchains/gpus/cuda/BUILD b/third_party/toolchains/gpus/cuda/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..36be86cd1021188eccf2f8d16e17c97531a9e09a
--- /dev/null
+++ b/third_party/toolchains/gpus/cuda/BUILD
@@ -0,0 +1,1362 @@
+# A build file to configure cuda remote repository used with Bazel remote
+# execution service
+# DO NOT EDIT: automatically generated BUILD file
+
+licenses(["restricted"])  # MPL2, portions GPL v3, LGPL v3, BSD-like
+
+package(default_visibility = ["//visibility:public"])
+
+config_setting(
+    name = "using_nvcc",
+    values = {
+        "define": "using_cuda_nvcc=true",
+    },
+)
+
+config_setting(
+    name = "using_clang",
+    values = {
+        "define": "using_cuda_clang=true",
+    },
+)
+
+# Equivalent to using_clang && -c opt.
+config_setting(
+    name = "using_clang_opt",
+    values = {
+        "define": "using_cuda_clang=true",
+        "compilation_mode": "opt",
+    },
+)
+
+config_setting(
+    name = "darwin",
+    values = {"cpu": "darwin"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "freebsd",
+    values = {"cpu": "freebsd"},
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cuda_headers",
+    hdrs = [
+        "cuda/cuda_config.h",
+        ":cuda-include",
+        ":cudnn-include",
+    ],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cudart_static",
+    srcs = ["cuda/lib/libcudart_static.a"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkopts = select({
+        ":freebsd": [],
+        "//conditions:default": ["-ldl"],
+    }) + [
+        "-lpthread",
+        "-lrt",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cuda_driver",
+    srcs = ["cuda/lib/libcuda.so"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cudart",
+    srcs = ["cuda/lib/libcudart.so.8.0"],
+    data = ["cuda/lib/libcudart.so.8.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cublas",
+    srcs = ["cuda/lib/libcublas.so.8.0"],
+    data = ["cuda/lib/libcublas.so.8.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cusolver",
+    srcs = ["cuda/lib/libcusolver.so.8.0"],
+    data = ["cuda/lib/libcusolver.so.8.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkopts = ["-lgomp"],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cudnn",
+    srcs = ["cuda/lib/libcudnn.so.6"],
+    data = ["cuda/lib/libcudnn.so.6"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cufft",
+    srcs = ["cuda/lib/libcufft.so.8.0"],
+    data = ["cuda/lib/libcufft.so.8.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "curand",
+    srcs = ["cuda/lib/libcurand.so.8.0"],
+    data = ["cuda/lib/libcurand.so.8.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cuda",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cublas",
+        ":cuda_headers",
+        ":cudart",
+        ":cudnn",
+        ":cufft",
+        ":curand",
+    ],
+)
+
+cc_library(
+    name = "cupti_headers",
+    hdrs = [
+        "cuda/cuda_config.h",
+        ":cuda-extras",
+    ],
+    includes = [
+        ".",
+        "cuda/extras/CUPTI/include/",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cupti_dsos",
+    data = ["cuda/lib/libcupti.so.8.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "libdevice_root",
+    data = [":cuda-nvvm"],
+    visibility = ["//visibility:public"],
+)
+
+genrule(
+    name = "cuda-include",
+    outs = [
+        "cuda/include/math_functions.hpp",
+        "cuda/include/cufft.h",
+        "cuda/include/nvgraph.h",
+        "cuda/include/curand_normal.h",
+        "cuda/include/curand_uniform.h",
+        "cuda/include/nppi_data_exchange_and_initialization.h",
+        "cuda/include/cuda_gl_interop.h",
+        "cuda/include/nppi_compression_functions.h",
+        "cuda/include/npp.h",
+        "cuda/include/cuda.h",
+        "cuda/include/nppi_statistics_functions.h",
+        "cuda/include/vector_functions.hpp",
+        "cuda/include/sm_32_intrinsics.hpp",
+        "cuda/include/sm_32_intrinsics.h",
+        "cuda/include/curand_discrete.h",
+        "cuda/include/cuda_runtime.h",
+        "cuda/include/cufftXt.h",
+        "cuda/include/sm_61_intrinsics.h",
+        "cuda/include/texture_fetch_functions.h",
+        "cuda/include/curand_mrg32k3a.h",
+        "cuda/include/host_defines.h",
+        "cuda/include/common_functions.h",
+        "cuda/include/nppi_support_functions.h",
+        "cuda/include/nppi_linear_transforms.h",
+        "cuda/include/device_double_functions.hpp",
+        "cuda/include/math_constants.h",
+        "cuda/include/nvToolsExtSync.h",
+        "cuda/include/npps_initialization.h",
+        "cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h",
+        "cuda/include/texture_indirect_functions.hpp",
+        "cuda/include/cudaProfiler.h",
+        "cuda/include/npps_filtering_functions.h",
+        "cuda/include/cusparse_v2.h",
+        "cuda/include/nppi.h",
+        "cuda/include/surface_indirect_functions.h",
+        "cuda/include/sm_30_intrinsics.h",
+        "cuda/include/device_double_functions.h",
+        "cuda/include/sm_35_intrinsics.h",
+        "cuda/include/cusolverSp.h",
+        "cuda/include/library_types.h",
+        "cuda/include/surface_indirect_functions.hpp",
+        "cuda/include/cudalibxt.h",
+        "cuda/include/channel_descriptor.h",
+        "cuda/include/device_functions_decls.h",
+        "cuda/include/curand_kernel.h",
+        "cuda/include/curand_mtgp32_host.h",
+        "cuda/include/nvToolsExtCuda.h",
+        "cuda/include/nvToolsExt.h",
+        "cuda/include/cuComplex.h",
+        "cuda/include/sm_32_atomic_functions.h",
+        "cuda/include/texture_indirect_functions.h",
+        "cuda/include/sm_32_atomic_functions.hpp",
+        "cuda/include/sm_20_intrinsics.hpp",
+        "cuda/include/device_launch_parameters.h",
+        "cuda/include/curand_mtgp32.h",
+        "cuda/include/texture_fetch_functions.hpp",
+        "cuda/include/cuda_occupancy.h",
+        "cuda/include/CL/opencl.h",
+        "cuda/include/CL/cl_platform.h",
+        "cuda/include/CL/cl_egl.h",
+        "cuda/include/CL/cl_gl.h",
+        "cuda/include/CL/cl.h",
+        "cuda/include/CL/cl_gl_ext.h",
+        "cuda/include/CL/cl_ext.h",
+        "cuda/include/CL/cl.hpp",
+        "cuda/include/host_config.h",
+        "cuda/include/cuda_surface_types.h",
+        "cuda/include/math_functions.h",
+        "cuda/include/nvToolsExtMeta.h",
+        "cuda/include/sm_20_atomic_functions.hpp",
+        "cuda/include/device_functions.h",
+        "cuda/include/device_types.h",
+        "cuda/include/npps_conversion_functions.h",
+        "cuda/include/curand_precalc.h",
+        "cuda/include/cusolverRf.h",
+        "cuda/include/sm_60_atomic_functions.hpp",
+        "cuda/include/cuviddec.h",
+        "cuda/include/curand_discrete2.h",
+        "cuda/include/device_functions.hpp",
+        "cuda/include/thrust/transform_scan.h",
+        "cuda/include/thrust/system_error.h",
+        "cuda/include/thrust/device_malloc.h",
+        "cuda/include/thrust/partition.h",
+        "cuda/include/thrust/unique.h",
+        "cuda/include/thrust/device_delete.h",
+        "cuda/include/thrust/execution_policy.h",
+        "cuda/include/thrust/adjacent_difference.h",
+        "cuda/include/thrust/sequence.h",
+        "cuda/include/thrust/merge.h",
+        "cuda/include/thrust/device_new.h",
+        "cuda/include/thrust/transform_reduce.h",
+        "cuda/include/thrust/device_vector.h",
+        "cuda/include/thrust/gather.h",
+        "cuda/include/thrust/sort.h",
+        "cuda/include/thrust/scan.h",
+        "cuda/include/thrust/detail/temporary_array.h",
+        "cuda/include/thrust/detail/util/align.h",
+        "cuda/include/thrust/detail/util/blocking.h",
+        "cuda/include/thrust/detail/transform.inl",
+        "cuda/include/thrust/detail/device_vector.inl",
+        "cuda/include/thrust/detail/binary_search.inl",
+        "cuda/include/thrust/detail/overlapped_copy.h",
+        "cuda/include/thrust/detail/vector_base.inl",
+        "cuda/include/thrust/detail/device_reference.inl",
+        "cuda/include/thrust/detail/functional/actor.h",
+        "cuda/include/thrust/detail/functional/value.h",
+        "cuda/include/thrust/detail/functional/operators.h",
+        "cuda/include/thrust/detail/functional/operators/logical_operators.h",
+        "cuda/include/thrust/detail/functional/operators/relational_operators.h",
+        "cuda/include/thrust/detail/functional/operators/assignment_operator.h",
+        "cuda/include/thrust/detail/functional/operators/bitwise_operators.h",
+        "cuda/include/thrust/detail/functional/operators/operator_adaptors.h",
+        "cuda/include/thrust/detail/functional/operators/arithmetic_operators.h",
+        "cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h",
+        "cuda/include/thrust/detail/functional/argument.h",
+        "cuda/include/thrust/detail/functional/placeholder.h",
+        "cuda/include/thrust/detail/functional/actor.inl",
+        "cuda/include/thrust/detail/functional/composite.h",
+        "cuda/include/thrust/detail/static_map.h",
+        "cuda/include/thrust/detail/type_traits/has_nested_type.h",
+        "cuda/include/thrust/detail/type_traits/is_call_possible.h",
+        "cuda/include/thrust/detail/type_traits/function_traits.h",
+        "cuda/include/thrust/detail/type_traits/pointer_traits.h",
+        "cuda/include/thrust/detail/type_traits/has_member_function.h",
+        "cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h",
+        "cuda/include/thrust/detail/type_traits/minimum_type.h",
+        "cuda/include/thrust/detail/type_traits/has_trivial_assign.h",
+        "cuda/include/thrust/detail/type_traits/is_metafunction_defined.h",
+        "cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h",
+        "cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h",
+        "cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h",
+        "cuda/include/thrust/detail/reference.h",
+        "cuda/include/thrust/detail/inner_product.inl",
+        "cuda/include/thrust/detail/use_default.h",
+        "cuda/include/thrust/detail/sequence.inl",
+        "cuda/include/thrust/detail/sort.inl",
+        "cuda/include/thrust/detail/equal.inl",
+        "cuda/include/thrust/detail/execution_policy.h",
+        "cuda/include/thrust/detail/integer_traits.h",
+        "cuda/include/thrust/detail/type_traits.h",
+        "cuda/include/thrust/detail/reverse.inl",
+        "cuda/include/thrust/detail/tabulate.inl",
+        "cuda/include/thrust/detail/unique.inl",
+        "cuda/include/thrust/detail/scatter.inl",
+        "cuda/include/thrust/detail/set_operations.inl",
+        "cuda/include/thrust/detail/device_malloc.inl",
+        "cuda/include/thrust/detail/copy_if.inl",
+        "cuda/include/thrust/detail/fill.inl",
+        "cuda/include/thrust/detail/temporary_array.inl",
+        "cuda/include/thrust/detail/transform_scan.inl",
+        "cuda/include/thrust/detail/minmax.h",
+        "cuda/include/thrust/detail/swap.inl",
+        "cuda/include/thrust/detail/pointer.inl",
+        "cuda/include/thrust/detail/transform_reduce.inl",
+        "cuda/include/thrust/detail/config.h",
+        "cuda/include/thrust/detail/distance.inl",
+        "cuda/include/thrust/detail/pair.inl",
+        "cuda/include/thrust/detail/allocator/temporary_allocator.h",
+        "cuda/include/thrust/detail/allocator/tagged_allocator.h",
+        "cuda/include/thrust/detail/allocator/destroy_range.inl",
+        "cuda/include/thrust/detail/allocator/destroy_range.h",
+        "cuda/include/thrust/detail/allocator/no_throw_allocator.h",
+        "cuda/include/thrust/detail/allocator/default_construct_range.inl",
+        "cuda/include/thrust/detail/allocator/fill_construct_range.inl",
+        "cuda/include/thrust/detail/allocator/tagged_allocator.inl",
+        "cuda/include/thrust/detail/allocator/malloc_allocator.h",
+        "cuda/include/thrust/detail/allocator/allocator_traits.h",
+        "cuda/include/thrust/detail/allocator/copy_construct_range.h",
+        "cuda/include/thrust/detail/allocator/allocator_traits.inl",
+        "cuda/include/thrust/detail/allocator/default_construct_range.h",
+        "cuda/include/thrust/detail/allocator/copy_construct_range.inl",
+        "cuda/include/thrust/detail/allocator/malloc_allocator.inl",
+        "cuda/include/thrust/detail/allocator/temporary_allocator.inl",
+        "cuda/include/thrust/detail/allocator/fill_construct_range.h",
+        "cuda/include/thrust/detail/temporary_buffer.h",
+        "cuda/include/thrust/detail/reduce.inl",
+        "cuda/include/thrust/detail/device_new.inl",
+        "cuda/include/thrust/detail/pointer.h",
+        "cuda/include/thrust/detail/for_each.inl",
+        "cuda/include/thrust/detail/generate.inl",
+        "cuda/include/thrust/detail/dispatch/is_trivial_copy.h",
+        "cuda/include/thrust/detail/adjacent_difference.inl",
+        "cuda/include/thrust/detail/tuple_meta_transform.h",
+        "cuda/include/thrust/detail/functional.inl",
+        "cuda/include/thrust/detail/remove.inl",
+        "cuda/include/thrust/detail/tuple_transform.h",
+        "cuda/include/thrust/detail/merge.inl",
+        "cuda/include/thrust/detail/extrema.inl",
+        "cuda/include/thrust/detail/trivial_sequence.h",
+        "cuda/include/thrust/detail/vector_base.h",
+        "cuda/include/thrust/detail/count.inl",
+        "cuda/include/thrust/detail/uninitialized_copy.inl",
+        "cuda/include/thrust/detail/function.h",
+        "cuda/include/thrust/detail/swap_ranges.inl",
+        "cuda/include/thrust/detail/device_delete.inl",
+        "cuda/include/thrust/detail/static_assert.h",
+        "cuda/include/thrust/detail/logical.inl",
+        "cuda/include/thrust/detail/seq.h",
+        "cuda/include/thrust/detail/mpl/math.h",
+        "cuda/include/thrust/detail/mismatch.inl",
+        "cuda/include/thrust/detail/internal_functional.h",
+        "cuda/include/thrust/detail/get_iterator_value.h",
+        "cuda/include/thrust/detail/copy.inl",
+        "cuda/include/thrust/detail/copy.h",
+        "cuda/include/thrust/detail/complex/catrigf.h",
+        "cuda/include/thrust/detail/complex/cpowf.h",
+        "cuda/include/thrust/detail/complex/csqrtf.h",
+        "cuda/include/thrust/detail/complex/ccoshf.h",
+        "cuda/include/thrust/detail/complex/csinhf.h",
+        "cuda/include/thrust/detail/complex/clogf.h",
+        "cuda/include/thrust/detail/complex/ccosh.h",
+        "cuda/include/thrust/detail/complex/arithmetic.h",
+        "cuda/include/thrust/detail/complex/csqrt.h",
+        "cuda/include/thrust/detail/complex/cpow.h",
+        "cuda/include/thrust/detail/complex/complex.inl",
+        "cuda/include/thrust/detail/complex/math_private.h",
+        "cuda/include/thrust/detail/complex/c99math.h",
+        "cuda/include/thrust/detail/complex/cproj.h",
+        "cuda/include/thrust/detail/complex/catrig.h",
+        "cuda/include/thrust/detail/complex/ctanhf.h",
+        "cuda/include/thrust/detail/complex/cexpf.h",
+        "cuda/include/thrust/detail/complex/csinh.h",
+        "cuda/include/thrust/detail/complex/stream.h",
+        "cuda/include/thrust/detail/complex/ctanh.h",
+        "cuda/include/thrust/detail/complex/cexp.h",
+        "cuda/include/thrust/detail/complex/clog.h",
+        "cuda/include/thrust/detail/range/head_flags.h",
+        "cuda/include/thrust/detail/range/tail_flags.h",
+        "cuda/include/thrust/detail/execute_with_allocator.h",
+        "cuda/include/thrust/detail/integer_math.h",
+        "cuda/include/thrust/detail/swap.h",
+        "cuda/include/thrust/detail/uninitialized_fill.inl",
+        "cuda/include/thrust/detail/scan.inl",
+        "cuda/include/thrust/detail/gather.inl",
+        "cuda/include/thrust/detail/reference_forward_declaration.h",
+        "cuda/include/thrust/detail/numeric_traits.h",
+        "cuda/include/thrust/detail/reference.inl",
+        "cuda/include/thrust/detail/cstdint.h",
+        "cuda/include/thrust/detail/device_free.inl",
+        "cuda/include/thrust/detail/copy_if.h",
+        "cuda/include/thrust/detail/partition.inl",
+        "cuda/include/thrust/detail/find.inl",
+        "cuda/include/thrust/detail/config/forceinline.h",
+        "cuda/include/thrust/detail/config/debug.h",
+        "cuda/include/thrust/detail/config/config.h",
+        "cuda/include/thrust/detail/config/host_device.h",
+        "cuda/include/thrust/detail/config/host_system.h",
+        "cuda/include/thrust/detail/config/compiler.h",
+        "cuda/include/thrust/detail/config/device_system.h",
+        "cuda/include/thrust/detail/config/compiler_fence.h",
+        "cuda/include/thrust/detail/config/exec_check_disable.h",
+        "cuda/include/thrust/detail/config/simple_defines.h",
+        "cuda/include/thrust/detail/config/global_workarounds.h",
+        "cuda/include/thrust/detail/replace.inl",
+        "cuda/include/thrust/detail/device_ptr.inl",
+        "cuda/include/thrust/detail/tuple.inl",
+        "cuda/include/thrust/detail/malloc_and_free.h",
+        "cuda/include/thrust/detail/host_vector.inl",
+        "cuda/include/thrust/detail/raw_pointer_cast.h",
+        "cuda/include/thrust/detail/advance.inl",
+        "cuda/include/thrust/detail/contiguous_storage.h",
+        "cuda/include/thrust/detail/raw_reference_cast.h",
+        "cuda/include/thrust/detail/contiguous_storage.inl",
+        "cuda/include/thrust/reverse.h",
+        "cuda/include/thrust/device_malloc_allocator.h",
+        "cuda/include/thrust/scatter.h",
+        "cuda/include/thrust/pair.h",
+        "cuda/include/thrust/advance.h",
+        "cuda/include/thrust/find.h",
+        "cuda/include/thrust/device_ptr.h",
+        "cuda/include/thrust/generate.h",
+        "cuda/include/thrust/uninitialized_fill.h",
+        "cuda/include/thrust/system/system_error.h",
+        "cuda/include/thrust/system/detail/bad_alloc.h",
+        "cuda/include/thrust/system/detail/adl/transform_scan.h",
+        "cuda/include/thrust/system/detail/adl/unique_by_key.h",
+        "cuda/include/thrust/system/detail/adl/partition.h",
+        "cuda/include/thrust/system/detail/adl/unique.h",
+        "cuda/include/thrust/system/detail/adl/adjacent_difference.h",
+        "cuda/include/thrust/system/detail/adl/sequence.h",
+        "cuda/include/thrust/system/detail/adl/merge.h",
+        "cuda/include/thrust/system/detail/adl/transform_reduce.h",
+        "cuda/include/thrust/system/detail/adl/gather.h",
+        "cuda/include/thrust/system/detail/adl/sort.h",
+        "cuda/include/thrust/system/detail/adl/scan.h",
+        "cuda/include/thrust/system/detail/adl/temporary_buffer.h",
+        "cuda/include/thrust/system/detail/adl/scan_by_key.h",
+        "cuda/include/thrust/system/detail/adl/reverse.h",
+        "cuda/include/thrust/system/detail/adl/assign_value.h",
+        "cuda/include/thrust/system/detail/adl/scatter.h",
+        "cuda/include/thrust/system/detail/adl/find.h",
+        "cuda/include/thrust/system/detail/adl/generate.h",
+        "cuda/include/thrust/system/detail/adl/uninitialized_fill.h",
+        "cuda/include/thrust/system/detail/adl/remove.h",
+        "cuda/include/thrust/system/detail/adl/tabulate.h",
+        "cuda/include/thrust/system/detail/adl/for_each.h",
+        "cuda/include/thrust/system/detail/adl/reduce_by_key.h",
+        "cuda/include/thrust/system/detail/adl/reduce.h",
+        "cuda/include/thrust/system/detail/adl/equal.h",
+        "cuda/include/thrust/system/detail/adl/copy.h",
+        "cuda/include/thrust/system/detail/adl/swap_ranges.h",
+        "cuda/include/thrust/system/detail/adl/uninitialized_copy.h",
+        "cuda/include/thrust/system/detail/adl/binary_search.h",
+        "cuda/include/thrust/system/detail/adl/set_operations.h",
+        "cuda/include/thrust/system/detail/adl/mismatch.h",
+        "cuda/include/thrust/system/detail/adl/extrema.h",
+        "cuda/include/thrust/system/detail/adl/count.h",
+        "cuda/include/thrust/system/detail/adl/replace.h",
+        "cuda/include/thrust/system/detail/adl/get_value.h",
+        "cuda/include/thrust/system/detail/adl/inner_product.h",
+        "cuda/include/thrust/system/detail/adl/copy_if.h",
+        "cuda/include/thrust/system/detail/adl/logical.h",
+        "cuda/include/thrust/system/detail/adl/iter_swap.h",
+        "cuda/include/thrust/system/detail/adl/malloc_and_free.h",
+        "cuda/include/thrust/system/detail/adl/fill.h",
+        "cuda/include/thrust/system/detail/adl/transform.h",
+        "cuda/include/thrust/system/detail/errno.h",
+        "cuda/include/thrust/system/detail/error_category.inl",
+        "cuda/include/thrust/system/detail/sequential/transform_scan.h",
+        "cuda/include/thrust/system/detail/sequential/unique_by_key.h",
+        "cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h",
+        "cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl",
+        "cuda/include/thrust/system/detail/sequential/stable_merge_sort.h",
+        "cuda/include/thrust/system/detail/sequential/sort.inl",
+        "cuda/include/thrust/system/detail/sequential/partition.h",
+        "cuda/include/thrust/system/detail/sequential/unique.h",
+        "cuda/include/thrust/system/detail/sequential/execution_policy.h",
+        "cuda/include/thrust/system/detail/sequential/adjacent_difference.h",
+        "cuda/include/thrust/system/detail/sequential/sequence.h",
+        "cuda/include/thrust/system/detail/sequential/merge.h",
+        "cuda/include/thrust/system/detail/sequential/transform_reduce.h",
+        "cuda/include/thrust/system/detail/sequential/gather.h",
+        "cuda/include/thrust/system/detail/sequential/sort.h",
+        "cuda/include/thrust/system/detail/sequential/copy_backward.h",
+        "cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl",
+        "cuda/include/thrust/system/detail/sequential/scan.h",
+        "cuda/include/thrust/system/detail/sequential/temporary_buffer.h",
+        "cuda/include/thrust/system/detail/sequential/scan_by_key.h",
+        "cuda/include/thrust/system/detail/sequential/reverse.h",
+        "cuda/include/thrust/system/detail/sequential/assign_value.h",
+        "cuda/include/thrust/system/detail/sequential/scatter.h",
+        "cuda/include/thrust/system/detail/sequential/find.h",
+        "cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl",
+        "cuda/include/thrust/system/detail/sequential/merge.inl",
+        "cuda/include/thrust/system/detail/sequential/generate.h",
+        "cuda/include/thrust/system/detail/sequential/uninitialized_fill.h",
+        "cuda/include/thrust/system/detail/sequential/general_copy.h",
+        "cuda/include/thrust/system/detail/sequential/insertion_sort.h",
+        "cuda/include/thrust/system/detail/sequential/remove.h",
+        "cuda/include/thrust/system/detail/sequential/tabulate.h",
+        "cuda/include/thrust/system/detail/sequential/for_each.h",
+        "cuda/include/thrust/system/detail/sequential/reduce_by_key.h",
+        "cuda/include/thrust/system/detail/sequential/reduce.h",
+        "cuda/include/thrust/system/detail/sequential/equal.h",
+        "cuda/include/thrust/system/detail/sequential/stable_radix_sort.h",
+        "cuda/include/thrust/system/detail/sequential/copy.inl",
+        "cuda/include/thrust/system/detail/sequential/copy.h",
+        "cuda/include/thrust/system/detail/sequential/swap_ranges.h",
+        "cuda/include/thrust/system/detail/sequential/uninitialized_copy.h",
+        "cuda/include/thrust/system/detail/sequential/binary_search.h",
+        "cuda/include/thrust/system/detail/sequential/set_operations.h",
+        "cuda/include/thrust/system/detail/sequential/mismatch.h",
+        "cuda/include/thrust/system/detail/sequential/extrema.h",
+        "cuda/include/thrust/system/detail/sequential/count.h",
+        "cuda/include/thrust/system/detail/sequential/trivial_copy.h",
+        "cuda/include/thrust/system/detail/sequential/replace.h",
+        "cuda/include/thrust/system/detail/sequential/get_value.h",
+        "cuda/include/thrust/system/detail/sequential/inner_product.h",
+        "cuda/include/thrust/system/detail/sequential/copy_if.h",
+        "cuda/include/thrust/system/detail/sequential/logical.h",
+        "cuda/include/thrust/system/detail/sequential/iter_swap.h",
+        "cuda/include/thrust/system/detail/sequential/malloc_and_free.h",
+        "cuda/include/thrust/system/detail/sequential/fill.h",
+        "cuda/include/thrust/system/detail/sequential/transform.h",
+        "cuda/include/thrust/system/detail/error_condition.inl",
+        "cuda/include/thrust/system/detail/internal/decompose.h",
+        "cuda/include/thrust/system/detail/error_code.inl",
+        "cuda/include/thrust/system/detail/generic/transform_scan.h",
+        "cuda/include/thrust/system/detail/generic/memory.inl",
+        "cuda/include/thrust/system/detail/generic/transform.inl",
+        "cuda/include/thrust/system/detail/generic/binary_search.inl",
+        "cuda/include/thrust/system/detail/generic/scan_by_key.inl",
+        "cuda/include/thrust/system/detail/generic/unique_by_key.h",
+        "cuda/include/thrust/system/detail/generic/inner_product.inl",
+        "cuda/include/thrust/system/detail/generic/select_system.h",
+        "cuda/include/thrust/system/detail/generic/sequence.inl",
+        "cuda/include/thrust/system/detail/generic/sort.inl",
+        "cuda/include/thrust/system/detail/generic/equal.inl",
+        "cuda/include/thrust/system/detail/generic/partition.h",
+        "cuda/include/thrust/system/detail/generic/unique.h",
+        "cuda/include/thrust/system/detail/generic/adjacent_difference.h",
+        "cuda/include/thrust/system/detail/generic/tag.h",
+        "cuda/include/thrust/system/detail/generic/unique_by_key.inl",
+        "cuda/include/thrust/system/detail/generic/sequence.h",
+        "cuda/include/thrust/system/detail/generic/type_traits.h",
+        "cuda/include/thrust/system/detail/generic/merge.h",
+        "cuda/include/thrust/system/detail/generic/reverse.inl",
+        "cuda/include/thrust/system/detail/generic/tabulate.inl",
+        "cuda/include/thrust/system/detail/generic/unique.inl",
+        "cuda/include/thrust/system/detail/generic/scatter.inl",
+        "cuda/include/thrust/system/detail/generic/set_operations.inl",
+        "cuda/include/thrust/system/detail/generic/copy_if.inl",
+        "cuda/include/thrust/system/detail/generic/transform_reduce.h",
+        "cuda/include/thrust/system/detail/generic/transform_scan.inl",
+        "cuda/include/thrust/system/detail/generic/gather.h",
+        "cuda/include/thrust/system/detail/generic/reduce_by_key.inl",
+        "cuda/include/thrust/system/detail/generic/transform_reduce.inl",
+        "cuda/include/thrust/system/detail/generic/sort.h",
+        "cuda/include/thrust/system/detail/generic/distance.inl",
+        "cuda/include/thrust/system/detail/generic/scan.h",
+        "cuda/include/thrust/system/detail/generic/temporary_buffer.h",
+        "cuda/include/thrust/system/detail/generic/reduce.inl",
+        "cuda/include/thrust/system/detail/generic/scan_by_key.h",
+        "cuda/include/thrust/system/detail/generic/reverse.h",
+        "cuda/include/thrust/system/detail/generic/temporary_buffer.inl",
+        "cuda/include/thrust/system/detail/generic/scatter.h",
+        "cuda/include/thrust/system/detail/generic/generate.inl",
+        "cuda/include/thrust/system/detail/generic/adjacent_difference.inl",
+        "cuda/include/thrust/system/detail/generic/remove.inl",
+        "cuda/include/thrust/system/detail/generic/advance.h",
+        "cuda/include/thrust/system/detail/generic/find.h",
+        "cuda/include/thrust/system/detail/generic/merge.inl",
+        "cuda/include/thrust/system/detail/generic/scalar/binary_search.inl",
+        "cuda/include/thrust/system/detail/generic/scalar/binary_search.h",
+        "cuda/include/thrust/system/detail/generic/extrema.inl",
+        "cuda/include/thrust/system/detail/generic/generate.h",
+        "cuda/include/thrust/system/detail/generic/uninitialized_fill.h",
+        "cuda/include/thrust/system/detail/generic/count.inl",
+        "cuda/include/thrust/system/detail/generic/remove.h",
+        "cuda/include/thrust/system/detail/generic/uninitialized_copy.inl",
+        "cuda/include/thrust/system/detail/generic/tabulate.h",
+        "cuda/include/thrust/system/detail/generic/for_each.h",
+        "cuda/include/thrust/system/detail/generic/distance.h",
+        "cuda/include/thrust/system/detail/generic/swap_ranges.inl",
+        "cuda/include/thrust/system/detail/generic/reduce_by_key.h",
+        "cuda/include/thrust/system/detail/generic/reduce.h",
+        "cuda/include/thrust/system/detail/generic/equal.h",
+        "cuda/include/thrust/system/detail/generic/mismatch.inl",
+        "cuda/include/thrust/system/detail/generic/copy.inl",
+        "cuda/include/thrust/system/detail/generic/copy.h",
+        "cuda/include/thrust/system/detail/generic/swap_ranges.h",
+        "cuda/include/thrust/system/detail/generic/uninitialized_copy.h",
+        "cuda/include/thrust/system/detail/generic/binary_search.h",
+        "cuda/include/thrust/system/detail/generic/set_operations.h",
+        "cuda/include/thrust/system/detail/generic/uninitialized_fill.inl",
+        "cuda/include/thrust/system/detail/generic/mismatch.h",
+        "cuda/include/thrust/system/detail/generic/scan.inl",
+        "cuda/include/thrust/system/detail/generic/gather.inl",
+        "cuda/include/thrust/system/detail/generic/extrema.h",
+        "cuda/include/thrust/system/detail/generic/count.h",
+        "cuda/include/thrust/system/detail/generic/replace.h",
+        "cuda/include/thrust/system/detail/generic/inner_product.h",
+        "cuda/include/thrust/system/detail/generic/copy_if.h",
+        "cuda/include/thrust/system/detail/generic/logical.h",
+        "cuda/include/thrust/system/detail/generic/partition.inl",
+        "cuda/include/thrust/system/detail/generic/memory.h",
+        "cuda/include/thrust/system/detail/generic/find.inl",
+        "cuda/include/thrust/system/detail/generic/replace.inl",
+        "cuda/include/thrust/system/detail/generic/advance.inl",
+        "cuda/include/thrust/system/detail/generic/fill.h",
+        "cuda/include/thrust/system/detail/generic/transform.h",
+        "cuda/include/thrust/system/detail/system_error.inl",
+        "cuda/include/thrust/system/omp/execution_policy.h",
+        "cuda/include/thrust/system/omp/vector.h",
+        "cuda/include/thrust/system/omp/detail/transform_scan.h",
+        "cuda/include/thrust/system/omp/detail/memory.inl",
+        "cuda/include/thrust/system/omp/detail/reduce_intervals.inl",
+        "cuda/include/thrust/system/omp/detail/unique_by_key.h",
+        "cuda/include/thrust/system/omp/detail/sort.inl",
+        "cuda/include/thrust/system/omp/detail/partition.h",
+        "cuda/include/thrust/system/omp/detail/unique.h",
+        "cuda/include/thrust/system/omp/detail/execution_policy.h",
+        "cuda/include/thrust/system/omp/detail/adjacent_difference.h",
+        "cuda/include/thrust/system/omp/detail/unique_by_key.inl",
+        "cuda/include/thrust/system/omp/detail/sequence.h",
+        "cuda/include/thrust/system/omp/detail/merge.h",
+        "cuda/include/thrust/system/omp/detail/unique.inl",
+        "cuda/include/thrust/system/omp/detail/copy_if.inl",
+        "cuda/include/thrust/system/omp/detail/transform_reduce.h",
+        "cuda/include/thrust/system/omp/detail/gather.h",
+        "cuda/include/thrust/system/omp/detail/reduce_by_key.inl",
+        "cuda/include/thrust/system/omp/detail/sort.h",
+        "cuda/include/thrust/system/omp/detail/scan.h",
+        "cuda/include/thrust/system/omp/detail/temporary_buffer.h",
+        "cuda/include/thrust/system/omp/detail/default_decomposition.h",
+        "cuda/include/thrust/system/omp/detail/reduce.inl",
+        "cuda/include/thrust/system/omp/detail/scan_by_key.h",
+        "cuda/include/thrust/system/omp/detail/reverse.h",
+        "cuda/include/thrust/system/omp/detail/assign_value.h",
+        "cuda/include/thrust/system/omp/detail/scatter.h",
+        "cuda/include/thrust/system/omp/detail/for_each.inl",
+        "cuda/include/thrust/system/omp/detail/default_decomposition.inl",
+        "cuda/include/thrust/system/omp/detail/remove.inl",
+        "cuda/include/thrust/system/omp/detail/vector.inl",
+        "cuda/include/thrust/system/omp/detail/find.h",
+        "cuda/include/thrust/system/omp/detail/generate.h",
+        "cuda/include/thrust/system/omp/detail/uninitialized_fill.h",
+        "cuda/include/thrust/system/omp/detail/remove.h",
+        "cuda/include/thrust/system/omp/detail/tabulate.h",
+        "cuda/include/thrust/system/omp/detail/for_each.h",
+        "cuda/include/thrust/system/omp/detail/reduce_by_key.h",
+        "cuda/include/thrust/system/omp/detail/reduce.h",
+        "cuda/include/thrust/system/omp/detail/equal.h",
+        "cuda/include/thrust/system/omp/detail/copy.inl",
+        "cuda/include/thrust/system/omp/detail/copy.h",
+        "cuda/include/thrust/system/omp/detail/swap_ranges.h",
+        "cuda/include/thrust/system/omp/detail/uninitialized_copy.h",
+        "cuda/include/thrust/system/omp/detail/binary_search.h",
+        "cuda/include/thrust/system/omp/detail/set_operations.h",
+        "cuda/include/thrust/system/omp/detail/mismatch.h",
+        "cuda/include/thrust/system/omp/detail/extrema.h",
+        "cuda/include/thrust/system/omp/detail/count.h",
+        "cuda/include/thrust/system/omp/detail/replace.h",
+        "cuda/include/thrust/system/omp/detail/get_value.h",
+        "cuda/include/thrust/system/omp/detail/inner_product.h",
+        "cuda/include/thrust/system/omp/detail/copy_if.h",
+        "cuda/include/thrust/system/omp/detail/logical.h",
+        "cuda/include/thrust/system/omp/detail/partition.inl",
+        "cuda/include/thrust/system/omp/detail/iter_swap.h",
+        "cuda/include/thrust/system/omp/detail/par.h",
+        "cuda/include/thrust/system/omp/detail/reduce_intervals.h",
+        "cuda/include/thrust/system/omp/detail/malloc_and_free.h",
+        "cuda/include/thrust/system/omp/detail/fill.h",
+        "cuda/include/thrust/system/omp/detail/transform.h",
+        "cuda/include/thrust/system/omp/memory.h",
+        "cuda/include/thrust/system/tbb/execution_policy.h",
+        "cuda/include/thrust/system/tbb/vector.h",
+        "cuda/include/thrust/system/tbb/detail/transform_scan.h",
+        "cuda/include/thrust/system/tbb/detail/memory.inl",
+        "cuda/include/thrust/system/tbb/detail/unique_by_key.h",
+        "cuda/include/thrust/system/tbb/detail/sort.inl",
+        "cuda/include/thrust/system/tbb/detail/partition.h",
+        "cuda/include/thrust/system/tbb/detail/unique.h",
+        "cuda/include/thrust/system/tbb/detail/execution_policy.h",
+        "cuda/include/thrust/system/tbb/detail/adjacent_difference.h",
+        "cuda/include/thrust/system/tbb/detail/unique_by_key.inl",
+        "cuda/include/thrust/system/tbb/detail/sequence.h",
+        "cuda/include/thrust/system/tbb/detail/merge.h",
+        "cuda/include/thrust/system/tbb/detail/unique.inl",
+        "cuda/include/thrust/system/tbb/detail/copy_if.inl",
+        "cuda/include/thrust/system/tbb/detail/transform_reduce.h",
+        "cuda/include/thrust/system/tbb/detail/gather.h",
+        "cuda/include/thrust/system/tbb/detail/reduce_by_key.inl",
+        "cuda/include/thrust/system/tbb/detail/sort.h",
+        "cuda/include/thrust/system/tbb/detail/scan.h",
+        "cuda/include/thrust/system/tbb/detail/temporary_buffer.h",
+        "cuda/include/thrust/system/tbb/detail/reduce.inl",
+        "cuda/include/thrust/system/tbb/detail/scan_by_key.h",
+        "cuda/include/thrust/system/tbb/detail/reverse.h",
+        "cuda/include/thrust/system/tbb/detail/assign_value.h",
+        "cuda/include/thrust/system/tbb/detail/scatter.h",
+        "cuda/include/thrust/system/tbb/detail/for_each.inl",
+        "cuda/include/thrust/system/tbb/detail/remove.inl",
+        "cuda/include/thrust/system/tbb/detail/vector.inl",
+        "cuda/include/thrust/system/tbb/detail/find.h",
+        "cuda/include/thrust/system/tbb/detail/merge.inl",
+        "cuda/include/thrust/system/tbb/detail/generate.h",
+        "cuda/include/thrust/system/tbb/detail/uninitialized_fill.h",
+        "cuda/include/thrust/system/tbb/detail/remove.h",
+        "cuda/include/thrust/system/tbb/detail/tabulate.h",
+        "cuda/include/thrust/system/tbb/detail/for_each.h",
+        "cuda/include/thrust/system/tbb/detail/reduce_by_key.h",
+        "cuda/include/thrust/system/tbb/detail/reduce.h",
+        "cuda/include/thrust/system/tbb/detail/equal.h",
+        "cuda/include/thrust/system/tbb/detail/copy.inl",
+        "cuda/include/thrust/system/tbb/detail/copy.h",
+        "cuda/include/thrust/system/tbb/detail/swap_ranges.h",
+        "cuda/include/thrust/system/tbb/detail/uninitialized_copy.h",
+        "cuda/include/thrust/system/tbb/detail/binary_search.h",
+        "cuda/include/thrust/system/tbb/detail/set_operations.h",
+        "cuda/include/thrust/system/tbb/detail/mismatch.h",
+        "cuda/include/thrust/system/tbb/detail/scan.inl",
+        "cuda/include/thrust/system/tbb/detail/extrema.h",
+        "cuda/include/thrust/system/tbb/detail/count.h",
+        "cuda/include/thrust/system/tbb/detail/replace.h",
+        "cuda/include/thrust/system/tbb/detail/get_value.h",
+        "cuda/include/thrust/system/tbb/detail/inner_product.h",
+        "cuda/include/thrust/system/tbb/detail/copy_if.h",
+        "cuda/include/thrust/system/tbb/detail/logical.h",
+        "cuda/include/thrust/system/tbb/detail/partition.inl",
+        "cuda/include/thrust/system/tbb/detail/iter_swap.h",
+        "cuda/include/thrust/system/tbb/detail/par.h",
+        "cuda/include/thrust/system/tbb/detail/reduce_intervals.h",
+        "cuda/include/thrust/system/tbb/detail/malloc_and_free.h",
+        "cuda/include/thrust/system/tbb/detail/fill.h",
+        "cuda/include/thrust/system/tbb/detail/transform.h",
+        "cuda/include/thrust/system/tbb/memory.h",
+        "cuda/include/thrust/system/error_code.h",
+        "cuda/include/thrust/system/cpp/execution_policy.h",
+        "cuda/include/thrust/system/cpp/vector.h",
+        "cuda/include/thrust/system/cpp/detail/transform_scan.h",
+        "cuda/include/thrust/system/cpp/detail/memory.inl",
+        "cuda/include/thrust/system/cpp/detail/unique_by_key.h",
+        "cuda/include/thrust/system/cpp/detail/partition.h",
+        "cuda/include/thrust/system/cpp/detail/unique.h",
+        "cuda/include/thrust/system/cpp/detail/execution_policy.h",
+        "cuda/include/thrust/system/cpp/detail/adjacent_difference.h",
+        "cuda/include/thrust/system/cpp/detail/sequence.h",
+        "cuda/include/thrust/system/cpp/detail/merge.h",
+        "cuda/include/thrust/system/cpp/detail/transform_reduce.h",
+        "cuda/include/thrust/system/cpp/detail/gather.h",
+        "cuda/include/thrust/system/cpp/detail/sort.h",
+        "cuda/include/thrust/system/cpp/detail/scan.h",
+        "cuda/include/thrust/system/cpp/detail/temporary_buffer.h",
+        "cuda/include/thrust/system/cpp/detail/scan_by_key.h",
+        "cuda/include/thrust/system/cpp/detail/reverse.h",
+        "cuda/include/thrust/system/cpp/detail/assign_value.h",
+        "cuda/include/thrust/system/cpp/detail/scatter.h",
+        "cuda/include/thrust/system/cpp/detail/vector.inl",
+        "cuda/include/thrust/system/cpp/detail/find.h",
+        "cuda/include/thrust/system/cpp/detail/generate.h",
+        "cuda/include/thrust/system/cpp/detail/uninitialized_fill.h",
+        "cuda/include/thrust/system/cpp/detail/remove.h",
+        "cuda/include/thrust/system/cpp/detail/tabulate.h",
+        "cuda/include/thrust/system/cpp/detail/for_each.h",
+        "cuda/include/thrust/system/cpp/detail/reduce_by_key.h",
+        "cuda/include/thrust/system/cpp/detail/reduce.h",
+        "cuda/include/thrust/system/cpp/detail/equal.h",
+        "cuda/include/thrust/system/cpp/detail/copy.h",
+        "cuda/include/thrust/system/cpp/detail/swap_ranges.h",
+        "cuda/include/thrust/system/cpp/detail/uninitialized_copy.h",
+        "cuda/include/thrust/system/cpp/detail/binary_search.h",
+        "cuda/include/thrust/system/cpp/detail/set_operations.h",
+        "cuda/include/thrust/system/cpp/detail/mismatch.h",
+        "cuda/include/thrust/system/cpp/detail/extrema.h",
+        "cuda/include/thrust/system/cpp/detail/count.h",
+        "cuda/include/thrust/system/cpp/detail/replace.h",
+        "cuda/include/thrust/system/cpp/detail/get_value.h",
+        "cuda/include/thrust/system/cpp/detail/inner_product.h",
+        "cuda/include/thrust/system/cpp/detail/copy_if.h",
+        "cuda/include/thrust/system/cpp/detail/logical.h",
+        "cuda/include/thrust/system/cpp/detail/iter_swap.h",
+        "cuda/include/thrust/system/cpp/detail/par.h",
+        "cuda/include/thrust/system/cpp/detail/malloc_and_free.h",
+        "cuda/include/thrust/system/cpp/detail/fill.h",
+        "cuda/include/thrust/system/cpp/detail/transform.h",
+        "cuda/include/thrust/system/cpp/memory.h",
+        "cuda/include/thrust/system/cuda/execution_policy.h",
+        "cuda/include/thrust/system/cuda/vector.h",
+        "cuda/include/thrust/system/cuda/error.h",
+        "cuda/include/thrust/system/cuda/detail/copy_device_to_device.h",
+        "cuda/include/thrust/system/cuda/detail/transform_scan.h",
+        "cuda/include/thrust/system/cuda/detail/memory.inl",
+        "cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_device.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/device_rle_dispatch.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/device_histogram_dispatch.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/device_reduce_by_key_dispatch.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/device_scan_dispatch.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/device_select_dispatch.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/device_reduce_dispatch.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/device_radix_sort_dispatch.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_range/block_range_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_range/block_range_histo.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_range/block_range_scan.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_range/block_range_radix_sort_downsweep.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_range/block_range_radix_sort_upsweep.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_satomic.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_sort.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_gatomic.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_range/block_range_select.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_range/block_scan_prefix_operators.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_range/block_range_reduce_by_key.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_radix_sort_upsweep.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_histogram_sweep.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_rle_sweep.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_select_sweep.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_scan_sweep.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_reduce_sweep.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_satomic_sweep.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_sort_sweep.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_gatomic_sweep.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_radix_sort_downsweep.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_reduce_by_key_sweep.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_scan_prefix_operators.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_type.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/host/spinlock.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/cub.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_shift.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh",
+        "cuda/include/thrust/system/cuda/detail/reduce_intervals.inl",
+        "cuda/include/thrust/system/cuda/detail/copy_cross_system.inl",
+        "cuda/include/thrust/system/cuda/detail/unique_by_key.h",
+        "cuda/include/thrust/system/cuda/detail/bulk.h",
+        "cuda/include/thrust/system/cuda/detail/sort.inl",
+        "cuda/include/thrust/system/cuda/detail/partition.h",
+        "cuda/include/thrust/system/cuda/detail/unique.h",
+        "cuda/include/thrust/system/cuda/detail/execution_policy.h",
+        "cuda/include/thrust/system/cuda/detail/cuda_launch_config.h",
+        "cuda/include/thrust/system/cuda/detail/cub.h",
+        "cuda/include/thrust/system/cuda/detail/adjacent_difference.h",
+        "cuda/include/thrust/system/cuda/detail/sequence.h",
+        "cuda/include/thrust/system/cuda/detail/merge.h",
+        "cuda/include/thrust/system/cuda/detail/set_symmetric_difference.inl",
+        "cuda/include/thrust/system/cuda/detail/copy_if.inl",
+        "cuda/include/thrust/system/cuda/detail/transform_reduce.h",
+        "cuda/include/thrust/system/cuda/detail/error.inl",
+        "cuda/include/thrust/system/cuda/detail/gather.h",
+        "cuda/include/thrust/system/cuda/detail/reduce_by_key.inl",
+        "cuda/include/thrust/system/cuda/detail/sort.h",
+        "cuda/include/thrust/system/cuda/detail/synchronize.h",
+        "cuda/include/thrust/system/cuda/detail/scan.h",
+        "cuda/include/thrust/system/cuda/detail/temporary_indirect_permutation.h",
+        "cuda/include/thrust/system/cuda/detail/extern_shared_ptr.h",
+        "cuda/include/thrust/system/cuda/detail/detail/set_operation.inl",
+        "cuda/include/thrust/system/cuda/detail/detail/balanced_path.h",
+        "cuda/include/thrust/system/cuda/detail/detail/virtualized_smem_closure.h",
+        "cuda/include/thrust/system/cuda/detail/detail/stable_primitive_sort.h",
+        "cuda/include/thrust/system/cuda/detail/detail/set_operation.h",
+        "cuda/include/thrust/system/cuda/detail/detail/stable_primitive_sort.inl",
+        "cuda/include/thrust/system/cuda/detail/detail/stable_merge_sort.h",
+        "cuda/include/thrust/system/cuda/detail/detail/launch_closure.inl",
+        "cuda/include/thrust/system/cuda/detail/detail/merge.h",
+        "cuda/include/thrust/system/cuda/detail/detail/alignment.h",
+        "cuda/include/thrust/system/cuda/detail/detail/stable_radix_sort.inl",
+        "cuda/include/thrust/system/cuda/detail/detail/stable_sort_each.h",
+        "cuda/include/thrust/system/cuda/detail/detail/launch_calculator.inl",
+        "cuda/include/thrust/system/cuda/detail/detail/stable_merge_sort.inl",
+        "cuda/include/thrust/system/cuda/detail/detail/launch_closure.h",
+        "cuda/include/thrust/system/cuda/detail/detail/stable_radix_sort.h",
+        "cuda/include/thrust/system/cuda/detail/detail/uninitialized.h",
+        "cuda/include/thrust/system/cuda/detail/detail/cached_temporary_allocator.h",
+        "cuda/include/thrust/system/cuda/detail/detail/launch_calculator.h",
+        "cuda/include/thrust/system/cuda/detail/detail/stable_sort_each.inl",
+        "cuda/include/thrust/system/cuda/detail/temporary_buffer.h",
+        "cuda/include/thrust/system/cuda/detail/default_decomposition.h",
+        "cuda/include/thrust/system/cuda/detail/reduce.inl",
+        "cuda/include/thrust/system/cuda/detail/scan_by_key.h",
+        "cuda/include/thrust/system/cuda/detail/reverse.h",
+        "cuda/include/thrust/system/cuda/detail/assign_value.h",
+        "cuda/include/thrust/system/cuda/detail/scatter.h",
+        "cuda/include/thrust/system/cuda/detail/reduce_intervals.hpp",
+        "cuda/include/thrust/system/cuda/detail/for_each.inl",
+        "cuda/include/thrust/system/cuda/detail/default_decomposition.inl",
+        "cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h",
+        "cuda/include/thrust/system/cuda/detail/adjacent_difference.inl",
+        "cuda/include/thrust/system/cuda/detail/vector.inl",
+        "cuda/include/thrust/system/cuda/detail/throw_on_error.h",
+        "cuda/include/thrust/system/cuda/detail/find.h",
+        "cuda/include/thrust/system/cuda/detail/terminate.h",
+        "cuda/include/thrust/system/cuda/detail/merge.inl",
+        "cuda/include/thrust/system/cuda/detail/trivial_copy.inl",
+        "cuda/include/thrust/system/cuda/detail/generate.h",
+        "cuda/include/thrust/system/cuda/detail/execute_on_stream.h",
+        "cuda/include/thrust/system/cuda/detail/uninitialized_fill.h",
+        "cuda/include/thrust/system/cuda/detail/remove.h",
+        "cuda/include/thrust/system/cuda/detail/tabulate.h",
+        "cuda/include/thrust/system/cuda/detail/for_each.h",
+        "cuda/include/thrust/system/cuda/detail/reduce_by_key.h",
+        "cuda/include/thrust/system/cuda/detail/decomposition.h",
+        "cuda/include/thrust/system/cuda/detail/reduce.h",
+        "cuda/include/thrust/system/cuda/detail/equal.h",
+        "cuda/include/thrust/system/cuda/detail/runtime_introspection.h",
+        "cuda/include/thrust/system/cuda/detail/copy.inl",
+        "cuda/include/thrust/system/cuda/detail/copy.h",
+        "cuda/include/thrust/system/cuda/detail/swap_ranges.h",
+        "cuda/include/thrust/system/cuda/detail/uninitialized_copy.h",
+        "cuda/include/thrust/system/cuda/detail/binary_search.h",
+        "cuda/include/thrust/system/cuda/detail/runtime_introspection.inl",
+        "cuda/include/thrust/system/cuda/detail/set_operations.h",
+        "cuda/include/thrust/system/cuda/detail/mismatch.h",
+        "cuda/include/thrust/system/cuda/detail/scan.inl",
+        "cuda/include/thrust/system/cuda/detail/synchronize.inl",
+        "cuda/include/thrust/system/cuda/detail/extrema.h",
+        "cuda/include/thrust/system/cuda/detail/set_union.inl",
+        "cuda/include/thrust/system/cuda/detail/set_intersection.inl",
+        "cuda/include/thrust/system/cuda/detail/count.h",
+        "cuda/include/thrust/system/cuda/detail/trivial_copy.h",
+        "cuda/include/thrust/system/cuda/detail/copy_device_to_device.inl",
+        "cuda/include/thrust/system/cuda/detail/replace.h",
+        "cuda/include/thrust/system/cuda/detail/bulk/malloc.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/algorithm.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/config.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/closure.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/tail_flags.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/terminate.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/alignment.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/guarded_cuda_runtime_api.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/choose_sizes.inl",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/tuple_meta_transform.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/cuda_task.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/head_flags.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/synchronize.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/throw_on_error.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/parameter_ptr.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/cuda_launcher.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/triple_chevron_launcher.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/runtime_introspection.inl",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/cuda_launch_config.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/runtime_introspection.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/async.inl",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/tuple_transform.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/pointer_traits.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/apply_from_tuple.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/detail/is_contiguous_iterator.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/iterator.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/choose_sizes.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/algorithm/copy.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/algorithm/merge.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/algorithm/accumulate.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/algorithm/scan.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/algorithm/detail/stable_merge_sort.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/algorithm/gather.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/algorithm/sort.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/algorithm/reduce.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/algorithm/scatter.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/algorithm/adjacent_difference.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/algorithm/reduce_by_key.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/algorithm/for_each.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/bulk.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/execution_policy.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/iterator/strided_iterator.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/uninitialized.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/async.hpp",
+        "cuda/include/thrust/system/cuda/detail/bulk/future.hpp",
+        "cuda/include/thrust/system/cuda/detail/guarded_driver_types.h",
+        "cuda/include/thrust/system/cuda/detail/get_value.h",
+        "cuda/include/thrust/system/cuda/detail/inner_product.h",
+        "cuda/include/thrust/system/cuda/detail/copy_if.h",
+        "cuda/include/thrust/system/cuda/detail/logical.h",
+        "cuda/include/thrust/system/cuda/detail/iter_swap.h",
+        "cuda/include/thrust/system/cuda/detail/block/merge.h",
+        "cuda/include/thrust/system/cuda/detail/block/inclusive_scan.h",
+        "cuda/include/thrust/system/cuda/detail/block/merge.inl",
+        "cuda/include/thrust/system/cuda/detail/block/merging_sort.h",
+        "cuda/include/thrust/system/cuda/detail/block/exclusive_scan.h",
+        "cuda/include/thrust/system/cuda/detail/block/reduce.h",
+        "cuda/include/thrust/system/cuda/detail/block/copy.h",
+        "cuda/include/thrust/system/cuda/detail/block/odd_even_sort.h",
+        "cuda/include/thrust/system/cuda/detail/par.h",
+        "cuda/include/thrust/system/cuda/detail/copy_cross_system.h",
+        "cuda/include/thrust/system/cuda/detail/reduce_intervals.h",
+        "cuda/include/thrust/system/cuda/detail/malloc_and_free.h",
+        "cuda/include/thrust/system/cuda/detail/fill.h",
+        "cuda/include/thrust/system/cuda/detail/set_difference.inl",
+        "cuda/include/thrust/system/cuda/detail/transform.h",
+        "cuda/include/thrust/system/cuda/experimental/pinned_allocator.h",
+        "cuda/include/thrust/system/cuda/memory.h",
+        "cuda/include/thrust/remove.h",
+        "cuda/include/thrust/tabulate.h",
+        "cuda/include/thrust/for_each.h",
+        "cuda/include/thrust/distance.h",
+        "cuda/include/thrust/reduce.h",
+        "cuda/include/thrust/equal.h",
+        "cuda/include/thrust/complex.h",
+        "cuda/include/thrust/device_allocator.h",
+        "cuda/include/thrust/copy.h",
+        "cuda/include/thrust/uninitialized_copy.h",
+        "cuda/include/thrust/device_reference.h",
+        "cuda/include/thrust/binary_search.h",
+        "cuda/include/thrust/set_operations.h",
+        "cuda/include/thrust/swap.h",
+        "cuda/include/thrust/mismatch.h",
+        "cuda/include/thrust/extrema.h",
+        "cuda/include/thrust/count.h",
+        "cuda/include/thrust/device_free.h",
+        "cuda/include/thrust/random/discard_block_engine.h",
+        "cuda/include/thrust/random/normal_distribution.h",
+        "cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h",
+        "cuda/include/thrust/random/detail/subtract_with_carry_engine.inl",
+        "cuda/include/thrust/random/detail/xor_combine_engine_max.h",
+        "cuda/include/thrust/random/detail/linear_congruential_engine_discard.h",
+        "cuda/include/thrust/random/detail/uniform_int_distribution.inl",
+        "cuda/include/thrust/random/detail/discard_block_engine.inl",
+        "cuda/include/thrust/random/detail/uniform_real_distribution.inl",
+        "cuda/include/thrust/random/detail/random_core_access.h",
+        "cuda/include/thrust/random/detail/mod.h",
+        "cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl",
+        "cuda/include/thrust/random/detail/linear_congruential_engine.inl",
+        "cuda/include/thrust/random/detail/xor_combine_engine.inl",
+        "cuda/include/thrust/random/detail/normal_distribution.inl",
+        "cuda/include/thrust/random/detail/normal_distribution_base.h",
+        "cuda/include/thrust/random/uniform_int_distribution.h",
+        "cuda/include/thrust/random/linear_feedback_shift_engine.h",
+        "cuda/include/thrust/random/xor_combine_engine.h",
+        "cuda/include/thrust/random/subtract_with_carry_engine.h",
+        "cuda/include/thrust/random/linear_congruential_engine.h",
+        "cuda/include/thrust/random/uniform_real_distribution.h",
+        "cuda/include/thrust/functional.h",
+        "cuda/include/thrust/replace.h",
+        "cuda/include/thrust/device_new_allocator.h",
+        "cuda/include/thrust/host_vector.h",
+        "cuda/include/thrust/version.h",
+        "cuda/include/thrust/inner_product.h",
+        "cuda/include/thrust/iterator/iterator_traits.h",
+        "cuda/include/thrust/iterator/discard_iterator.h",
+        "cuda/include/thrust/iterator/retag.h",
+        "cuda/include/thrust/iterator/permutation_iterator.h",
+        "cuda/include/thrust/iterator/transform_iterator.h",
+        "cuda/include/thrust/iterator/detail/reverse_iterator.inl",
+        "cuda/include/thrust/iterator/detail/zip_iterator.inl",
+        "cuda/include/thrust/iterator/detail/counting_iterator.inl",
+        "cuda/include/thrust/iterator/detail/distance_from_result.h",
+        "cuda/include/thrust/iterator/detail/host_system_tag.h",
+        "cuda/include/thrust/iterator/detail/iterator_traversal_tags.h",
+        "cuda/include/thrust/iterator/detail/retag.h",
+        "cuda/include/thrust/iterator/detail/tagged_iterator.h",
+        "cuda/include/thrust/iterator/detail/iterator_traits.inl",
+        "cuda/include/thrust/iterator/detail/minimum_category.h",
+        "cuda/include/thrust/iterator/detail/discard_iterator_base.h",
+        "cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h",
+        "cuda/include/thrust/iterator/detail/zip_iterator_base.h",
+        "cuda/include/thrust/iterator/detail/normal_iterator.h",
+        "cuda/include/thrust/iterator/detail/join_iterator.h",
+        "cuda/include/thrust/iterator/detail/device_system_tag.h",
+        "cuda/include/thrust/iterator/detail/universal_categories.h",
+        "cuda/include/thrust/iterator/detail/reverse_iterator_base.h",
+        "cuda/include/thrust/iterator/detail/minimum_system.h",
+        "cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h",
+        "cuda/include/thrust/iterator/detail/is_iterator_category.h",
+        "cuda/include/thrust/iterator/detail/permutation_iterator_base.h",
+        "cuda/include/thrust/iterator/detail/any_assign.h",
+        "cuda/include/thrust/iterator/detail/any_system_tag.h",
+        "cuda/include/thrust/iterator/detail/is_trivial_iterator.h",
+        "cuda/include/thrust/iterator/detail/iterator_category_to_system.h",
+        "cuda/include/thrust/iterator/detail/iterator_adaptor_base.h",
+        "cuda/include/thrust/iterator/detail/constant_iterator_base.h",
+        "cuda/include/thrust/iterator/detail/transform_iterator.inl",
+        "cuda/include/thrust/iterator/detail/iterator_facade_category.h",
+        "cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h",
+        "cuda/include/thrust/iterator/constant_iterator.h",
+        "cuda/include/thrust/iterator/counting_iterator.h",
+        "cuda/include/thrust/iterator/iterator_adaptor.h",
+        "cuda/include/thrust/iterator/iterator_facade.h",
+        "cuda/include/thrust/iterator/iterator_categories.h",
+        "cuda/include/thrust/iterator/reverse_iterator.h",
+        "cuda/include/thrust/iterator/zip_iterator.h",
+        "cuda/include/thrust/logical.h",
+        "cuda/include/thrust/tuple.h",
+        "cuda/include/thrust/memory.h",
+        "cuda/include/thrust/random.h",
+        "cuda/include/thrust/fill.h",
+        "cuda/include/thrust/transform.h",
+        "cuda/include/texture_types.h",
+        "cuda/include/nppversion.h",
+        "cuda/include/cuda_texture_types.h",
+        "cuda/include/fatbinary.h",
+        "cuda/include/cublasXt.h",
+        "cuda/include/cuda_fp16.h",
+        "cuda/include/vector_functions.h",
+        "cuda/include/cusparse.h",
+        "cuda/include/nppi_filtering_functions.h",
+        "cuda/include/nppi_morphological_operations.h",
+        "cuda/include/sobol_direction_vectors.h",
+        "cuda/include/nvblas.h",
+        "cuda/include/curand_mtgp32dc_p_11213.h",
+        "cuda/include/nvcuvid.h",
+        "cuda/include/cuda_runtime_api.h",
+        "cuda/include/curand_mtgp32_kernel.h",
+        "cuda/include/cublas_v2.h",
+        "cuda/include/builtin_types.h",
+        "cuda/include/nppi_geometry_transforms.h",
+        "cuda/include/npps_support_functions.h",
+        "cuda/include/cufftw.h",
+        "cuda/include/cuda_device_runtime_api.h",
+        "cuda/include/sm_30_intrinsics.hpp",
+        "cuda/include/vector_types.h",
+        "cuda/include/sm_35_atomic_functions.h",
+        "cuda/include/sm_20_intrinsics.h",
+        "cuda/include/driver_types.h",
+        "cuda/include/nvToolsExtCudaRt.h",
+        "cuda/include/curand_globals.h",
+        "cuda/include/device_atomic_functions.h",
+        "cuda/include/surface_types.h",
+        "cuda/include/nvrtc.h",
+        "cuda/include/nppdefs.h",
+        "cuda/include/sm_60_atomic_functions.h",
+        "cuda/include/driver_functions.h",
+        "cuda/include/cusolver_common.h",
+        "cuda/include/cublas.h",
+        "cuda/include/curand_lognormal.h",
+        "cuda/include/device_atomic_functions.hpp",
+        "cuda/include/crt/device_runtime.h",
+        "cuda/include/crt/storage_class.h",
+        "cuda/include/crt/func_macro.h",
+        "cuda/include/crt/host_runtime.h",
+        "cuda/include/nppi_arithmetic_and_logical_operations.h",
+        "cuda/include/npps_arithmetic_and_logical_operations.h",
+        "cuda/include/nppi_computer_vision.h",
+        "cuda/include/surface_functions.hpp",
+        "cuda/include/surface_functions.h",
+        "cuda/include/curand_normal_static.h",
+        "cuda/include/curand.h",
+        "cuda/include/math_functions_dbl_ptx3.h",
+        "cuda/include/curand_philox4x32_x.h",
+        "cuda/include/nppi_threshold_and_compare_operations.h",
+        "cuda/include/nvml.h",
+        "cuda/include/npps.h",
+        "cuda/include/cuda_vdpau_interop.h",
+        "cuda/include/sm_61_intrinsics.hpp",
+        "cuda/include/cublas_api.h",
+        "cuda/include/nppi_color_conversion.h",
+        "cuda/include/math_functions_dbl_ptx3.hpp",
+        "cuda/include/nppcore.h",
+        "cuda/include/cudaGL.h",
+        "cuda/include/fatBinaryCtl.h",
+        "cuda/include/npps_statistics_functions.h",
+        "cuda/include/cudaVDPAU.h",
+        "cuda/include/curand_poisson.h",
+        "cuda/include/cusolverDn.h",
+        "cuda/include/cuda_profiler_api.h",
+        "cuda/include/sm_20_atomic_functions.h",
+        "cuda/include/nvfunctional",
+    ],
+    cmd = """
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-8.0/include/math_functions.hpp" "$(@D)/cuda/include/math_functions.hpp" && cp "/usr/local/cuda-8.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp "/usr/local/cuda-8.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp "/usr/local/cuda-8.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp "/usr/local/cuda-8.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp "/usr/local/cuda-8.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp "/usr/local/cuda-8.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp "/usr/local/cuda-8.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp "/usr/local/cuda-8.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp "/usr/local/cuda-8.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp "/usr/local/cuda-8.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp "/usr/local/cuda-8.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp "/usr/local/cuda-8.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp "/usr/local/cuda-8.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp "/usr/local/cuda-8.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp "/usr/local/cuda-8.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp "/usr/local/cuda-8.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp "/usr/local/cuda-8.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp "/usr/local/cuda-8.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp "/usr/local/cuda-8.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp "/usr/local/cuda-8.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp "/usr/local/cuda-8.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp "/usr/local/cuda-8.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp "/usr/local/cuda-8.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp "/usr/local/cuda-8.0/include/device_double_functions.hpp" "$(@D)/cuda/include/device_double_functions.hpp" && cp "/usr/local/cuda-8.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp "/usr/local/cuda-8.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp "/usr/local/cuda-8.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp "/usr/local/cuda-8.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp "/usr/local/cuda-8.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp "/usr/local/cuda-8.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp "/usr/local/cuda-8.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp "/usr/local/cuda-8.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp "/usr/local/cuda-8.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp "/usr/local/cuda-8.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp "/usr/local/cuda-8.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp "/usr/local/cuda-8.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp "/usr/local/cuda-8.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp "/usr/local/cuda-8.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp "/usr/local/cuda-8.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp "/usr/local/cuda-8.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp "/usr/local/cuda-8.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp "/usr/local/cuda-8.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp "/usr/local/cuda-8.0/include/device_functions_decls.h" "$(@D)/cuda/include/device_functions_decls.h" && cp "/usr/local/cuda-8.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp "/usr/local/cuda-8.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp "/usr/local/cuda-8.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp "/usr/local/cuda-8.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp "/usr/local/cuda-8.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp "/usr/local/cuda-8.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp "/usr/local/cuda-8.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp "/usr/local/cuda-8.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp "/usr/local/cuda-8.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp "/usr/local/cuda-8.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp "/usr/local/cuda-8.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp "/usr/local/cuda-8.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp "/usr/local/cuda-8.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp "/usr/local/cuda-8.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp "/usr/local/cuda-8.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp "/usr/local/cuda-8.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp "/usr/local/cuda-8.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp "/usr/local/cuda-8.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp "/usr/local/cuda-8.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp "/usr/local/cuda-8.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp "/usr/local/cuda-8.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp "/usr/local/cuda-8.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp "/usr/local/cuda-8.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp "/usr/local/cuda-8.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp "/usr/local/cuda-8.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp "/usr/local/cuda-8.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp "/usr/local/cuda-8.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp "/usr/local/cuda-8.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp "/usr/local/cuda-8.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp "/usr/local/cuda-8.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp "/usr/local/cuda-8.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp "/usr/local/cuda-8.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp "/usr/local/cuda-8.0/include/cuviddec.h" "$(@D)/cuda/include/cuviddec.h" && cp "/usr/local/cuda-8.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp "/usr/local/cuda-8.0/include/device_functions.hpp" "$(@D)/cuda/include/device_functions.hpp" && cp "/usr/local/cuda-8.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp "/usr/local/cuda-8.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp "/usr/local/cuda-8.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp "/usr/local/cuda-8.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp "/usr/local/cuda-8.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp "/usr/local/cuda-8.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp "/usr/local/cuda-8.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp "/usr/local/cuda-8.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp "/usr/local/cuda-8.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp "/usr/local/cuda-8.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp "/usr/local/cuda-8.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp "/usr/local/cuda-8.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp "/usr/local/cuda-8.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp "/usr/local/cuda-8.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp "/usr/local/cuda-8.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp "/usr/local/cuda-8.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/cpowf.h" "$(@D)/cuda/include/thrust/detail/complex/cpowf.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp "/usr/local/cuda-8.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp "/usr/local/cuda-8.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp "/usr/local/cuda-8.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp "/usr/local/cuda-8.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp "/usr/local/cuda-8.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp "/usr/local/cuda-8.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp "/usr/local/cuda-8.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp "/usr/local/cuda-8.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp "/usr/local/cuda-8.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp "/usr/local/cuda-8.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp "/usr/local/cuda-8.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp "/usr/local/cuda-8.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp "/usr/local/cuda-8.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp "/usr/local/cuda-8.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp "/usr/local/cuda-8.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp "/usr/local/cuda-8.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_device_to_device.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/device/dispatch/device_rle_dispatch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/device_rle_dispatch.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/device/dispatch/device_histogram_dispatch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/device_histogram_dispatch.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/device/dispatch/device_reduce_by_key_dispatch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/device_reduce_by_key_dispatch.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/device/dispatch/device_scan_dispatch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/device_scan_dispatch.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/device/dispatch/device_select_dispatch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/device_select_dispatch.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/device/dispatch/device_reduce_dispatch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/device_reduce_dispatch.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/device/dispatch/device_radix_sort_dispatch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/device_radix_sort_dispatch.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_range/block_range_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_range/block_range_reduce.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_range/block_range_histo.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_range/block_range_histo.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_range/block_range_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_range/block_range_scan.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_range/block_range_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_range/block_range_radix_sort_downsweep.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_range/block_range_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_range/block_range_radix_sort_upsweep.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_satomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_satomic.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_sort.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_gatomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_gatomic.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_range/block_range_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_range/block_range_select.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_range/block_scan_prefix_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_range/block_scan_prefix_operators.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_range/block_range_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_range/block_range_reduce_by_key.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_sweep/block_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_radix_sort_upsweep.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_sweep/block_histogram_sweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_histogram_sweep.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_sweep/block_rle_sweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_rle_sweep.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_sweep/block_select_sweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_select_sweep.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_sweep/block_scan_sweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_scan_sweep.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_sweep/block_reduce_sweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_reduce_sweep.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_satomic_sweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_satomic_sweep.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_sort_sweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_sort_sweep.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_gatomic_sweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_gatomic_sweep.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_sweep/block_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_radix_sort_downsweep.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_sweep/block_reduce_by_key_sweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_reduce_by_key_sweep.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block_sweep/block_scan_prefix_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block_sweep/block_scan_prefix_operators.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/host/spinlock.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/spinlock.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/block_shift.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shift.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_intervals.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/copy_cross_system.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_cross_system.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk.h" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/sort.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cuda_launch_config.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cuda_launch_config.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/cub.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cub.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/set_symmetric_difference.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/set_symmetric_difference.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/synchronize.h" "$(@D)/cuda/include/thrust/system/cuda/detail/synchronize.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/temporary_indirect_permutation.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_indirect_permutation.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/extern_shared_ptr.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extern_shared_ptr.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/set_operation.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/set_operation.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/balanced_path.h" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/balanced_path.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/virtualized_smem_closure.h" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/virtualized_smem_closure.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/stable_primitive_sort.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/set_operation.h" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/set_operation.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/stable_primitive_sort.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/stable_merge_sort.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/launch_closure.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/launch_closure.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/merge.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/alignment.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/stable_radix_sort.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/stable_sort_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/stable_sort_each.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/launch_calculator.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/launch_calculator.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/stable_merge_sort.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/launch_closure.h" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/launch_closure.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/stable_radix_sort.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/uninitialized.h" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/uninitialized.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/cached_temporary_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/cached_temporary_allocator.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/launch_calculator.h" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/launch_calculator.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/detail/stable_sort_each.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/detail/stable_sort_each.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/default_decomposition.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/reduce_intervals.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_intervals.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/default_decomposition.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/throw_on_error.h" "$(@D)/cuda/include/thrust/system/cuda/detail/throw_on_error.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/merge.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/trivial_copy.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/trivial_copy.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/execute_on_stream.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execute_on_stream.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/decomposition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/decomposition.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/runtime_introspection.h" "$(@D)/cuda/include/thrust/system/cuda/detail/runtime_introspection.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/copy.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/runtime_introspection.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/runtime_introspection.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/scan.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/synchronize.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/synchronize.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/set_union.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/set_union.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/set_intersection.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/set_intersection.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/trivial_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/trivial_copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/copy_device_to_device.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_device_to_device.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/malloc.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/malloc.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/algorithm.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/algorithm.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/config.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/config.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/closure.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/closure.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/tail_flags.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/tail_flags.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/terminate.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/terminate.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/alignment.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/alignment.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/guarded_cuda_runtime_api.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/guarded_cuda_runtime_api.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/choose_sizes.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/choose_sizes.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/tuple_meta_transform.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/tuple_meta_transform.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/cuda_task.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/cuda_task.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/head_flags.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/head_flags.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/synchronize.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/synchronize.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/throw_on_error.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/throw_on_error.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/parameter_ptr.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/parameter_ptr.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/cuda_launcher.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/cuda_launcher.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/triple_chevron_launcher.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/triple_chevron_launcher.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/runtime_introspection.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/runtime_introspection.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/cuda_launch_config.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/cuda_launch_config.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/runtime_introspection.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/cuda_launcher/runtime_introspection.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/async.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/async.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/tuple_transform.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/tuple_transform.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/pointer_traits.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/pointer_traits.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/apply_from_tuple.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/apply_from_tuple.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/detail/is_contiguous_iterator.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/detail/is_contiguous_iterator.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/iterator.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/iterator.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/choose_sizes.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/choose_sizes.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/algorithm/copy.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/algorithm/copy.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/algorithm/merge.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/algorithm/merge.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/algorithm/accumulate.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/algorithm/accumulate.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/algorithm/scan.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/algorithm/scan.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/algorithm/detail/stable_merge_sort.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/algorithm/detail/stable_merge_sort.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/algorithm/gather.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/algorithm/gather.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/algorithm/sort.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/algorithm/sort.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/algorithm/reduce.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/algorithm/reduce.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/algorithm/scatter.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/algorithm/scatter.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/algorithm/adjacent_difference.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/algorithm/adjacent_difference.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/algorithm/reduce_by_key.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/algorithm/reduce_by_key.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/algorithm/for_each.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/algorithm/for_each.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/bulk.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/bulk.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/execution_policy.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/execution_policy.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/iterator/strided_iterator.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/iterator/strided_iterator.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/uninitialized.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/uninitialized.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/async.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/async.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/bulk/future.hpp" "$(@D)/cuda/include/thrust/system/cuda/detail/bulk/future.hpp" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/block/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/block/merge.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/block/inclusive_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/block/inclusive_scan.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/block/merge.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/block/merge.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/block/merging_sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/block/merging_sort.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/block/exclusive_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/block/exclusive_scan.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/block/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/block/reduce.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/block/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/block/copy.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/block/odd_even_sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/block/odd_even_sort.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_cross_system.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_intervals.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/set_difference.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/set_difference.inl" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp "/usr/local/cuda-8.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp "/usr/local/cuda-8.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp "/usr/local/cuda-8.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp "/usr/local/cuda-8.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp "/usr/local/cuda-8.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp "/usr/local/cuda-8.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp "/usr/local/cuda-8.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp "/usr/local/cuda-8.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp "/usr/local/cuda-8.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp "/usr/local/cuda-8.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp "/usr/local/cuda-8.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp "/usr/local/cuda-8.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp "/usr/local/cuda-8.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp "/usr/local/cuda-8.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp "/usr/local/cuda-8.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp "/usr/local/cuda-8.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp "/usr/local/cuda-8.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp "/usr/local/cuda-8.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp "/usr/local/cuda-8.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp "/usr/local/cuda-8.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp "/usr/local/cuda-8.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp "/usr/local/cuda-8.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp "/usr/local/cuda-8.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp "/usr/local/cuda-8.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp "/usr/local/cuda-8.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp "/usr/local/cuda-8.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp "/usr/local/cuda-8.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp "/usr/local/cuda-8.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp "/usr/local/cuda-8.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp "/usr/local/cuda-8.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp "/usr/local/cuda-8.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp "/usr/local/cuda-8.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp "/usr/local/cuda-8.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp "/usr/local/cuda-8.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp "/usr/local/cuda-8.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp "/usr/local/cuda-8.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp "/usr/local/cuda-8.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp "/usr/local/cuda-8.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp "/usr/local/cuda-8.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp "/usr/local/cuda-8.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp "/usr/local/cuda-8.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp "/usr/local/cuda-8.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp "/usr/local/cuda-8.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp "/usr/local/cuda-8.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp "/usr/local/cuda-8.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp "/usr/local/cuda-8.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp "/usr/local/cuda-8.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp "/usr/local/cuda-8.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp "/usr/local/cuda-8.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp "/usr/local/cuda-8.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp "/usr/local/cuda-8.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp "/usr/local/cuda-8.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp "/usr/local/cuda-8.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp "/usr/local/cuda-8.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp "/usr/local/cuda-8.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp "/usr/local/cuda-8.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp "/usr/local/cuda-8.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp "/usr/local/cuda-8.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp "/usr/local/cuda-8.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp "/usr/local/cuda-8.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp "/usr/local/cuda-8.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp "/usr/local/cuda-8.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp "/usr/local/cuda-8.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp "/usr/local/cuda-8.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp "/usr/local/cuda-8.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp "/usr/local/cuda-8.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp "/usr/local/cuda-8.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp "/usr/local/cuda-8.0/include/nvcuvid.h" "$(@D)/cuda/include/nvcuvid.h" && cp "/usr/local/cuda-8.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp "/usr/local/cuda-8.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp "/usr/local/cuda-8.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp "/usr/local/cuda-8.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp "/usr/local/cuda-8.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp "/usr/local/cuda-8.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp "/usr/local/cuda-8.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp "/usr/local/cuda-8.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp "/usr/local/cuda-8.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp "/usr/local/cuda-8.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h" && cp "/usr/local/cuda-8.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp "/usr/local/cuda-8.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp "/usr/local/cuda-8.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp "/usr/local/cuda-8.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp "/usr/local/cuda-8.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp "/usr/local/cuda-8.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp "/usr/local/cuda-8.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp "/usr/local/cuda-8.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp "/usr/local/cuda-8.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp "/usr/local/cuda-8.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp "/usr/local/cuda-8.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp "/usr/local/cuda-8.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp "/usr/local/cuda-8.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp "/usr/local/cuda-8.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp "/usr/local/cuda-8.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp "/usr/local/cuda-8.0/include/crt/device_runtime.h" "$(@D)/cuda/include/crt/device_runtime.h" && cp "/usr/local/cuda-8.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp "/usr/local/cuda-8.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp "/usr/local/cuda-8.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp "/usr/local/cuda-8.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-8.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-8.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp "/usr/local/cuda-8.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp "/usr/local/cuda-8.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp "/usr/local/cuda-8.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp "/usr/local/cuda-8.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp "/usr/local/cuda-8.0/include/math_functions_dbl_ptx3.h" "$(@D)/cuda/include/math_functions_dbl_ptx3.h" && cp "/usr/local/cuda-8.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp "/usr/local/cuda-8.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp "/usr/local/cuda-8.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp "/usr/local/cuda-8.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp "/usr/local/cuda-8.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp "/usr/local/cuda-8.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp "/usr/local/cuda-8.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp "/usr/local/cuda-8.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp "/usr/local/cuda-8.0/include/math_functions_dbl_ptx3.hpp" "$(@D)/cuda/include/math_functions_dbl_ptx3.hpp" && cp "/usr/local/cuda-8.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp "/usr/local/cuda-8.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp "/usr/local/cuda-8.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp "/usr/local/cuda-8.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp "/usr/local/cuda-8.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp "/usr/local/cuda-8.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp "/usr/local/cuda-8.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp "/usr/local/cuda-8.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp "/usr/local/cuda-8.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp "/usr/local/cuda-8.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional"
+   """,
+)
+
+genrule(
+    name = "cuda-nvvm",
+    outs = [
+        "cuda/nvvm/bin/cicc",
+        "cuda/nvvm/libdevice/libdevice.compute_50.10.bc",
+        "cuda/nvvm/libdevice/libdevice.compute_30.10.bc",
+        "cuda/nvvm/libdevice/libdevice.compute_20.10.bc",
+        "cuda/nvvm/libdevice/libdevice.compute_35.10.bc",
+        "cuda/nvvm/lib64/libnvvm.so.3",
+        "cuda/nvvm/lib64/libnvvm.so",
+        "cuda/nvvm/lib64/libnvvm.so.3.1.0",
+        "cuda/nvvm/include/nvvm.h",
+        "cuda/nvvm/libnvvm-samples/ptxgen/README.txt",
+        "cuda/nvvm/libnvvm-samples/ptxgen/ptxgen.c",
+        "cuda/nvvm/libnvvm-samples/ptxgen/CMakeLists.txt",
+        "cuda/nvvm/libnvvm-samples/build.bat",
+        "cuda/nvvm/libnvvm-samples/cuda-c-linking/README.txt",
+        "cuda/nvvm/libnvvm-samples/cuda-c-linking/math-funcs.cu",
+        "cuda/nvvm/libnvvm-samples/cuda-c-linking/CMakeLists.txt",
+        "cuda/nvvm/libnvvm-samples/cuda-c-linking/cuda-c-linking.cpp",
+        "cuda/nvvm/libnvvm-samples/README.txt",
+        "cuda/nvvm/libnvvm-samples/simple/simple.c",
+        "cuda/nvvm/libnvvm-samples/simple/simple-gpu.ll",
+        "cuda/nvvm/libnvvm-samples/simple/README.txt",
+        "cuda/nvvm/libnvvm-samples/simple/simple-gpu64.ll",
+        "cuda/nvvm/libnvvm-samples/simple/CMakeLists.txt",
+        "cuda/nvvm/libnvvm-samples/common/include/DDSWriter.h",
+        "cuda/nvvm/libnvvm-samples/common/include/drvapi_error_string.h",
+        "cuda/nvvm/libnvvm-samples/build.sh",
+        "cuda/nvvm/libnvvm-samples/CMakeLists.txt",
+    ],
+    cmd = """
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-8.0/nvvm/bin/cicc" "$(@D)/cuda/nvvm/bin/cicc" && cp "/usr/local/cuda-8.0/nvvm/libdevice/libdevice.compute_50.10.bc" "$(@D)/cuda/nvvm/libdevice/libdevice.compute_50.10.bc" && cp "/usr/local/cuda-8.0/nvvm/libdevice/libdevice.compute_30.10.bc" "$(@D)/cuda/nvvm/libdevice/libdevice.compute_30.10.bc" && cp "/usr/local/cuda-8.0/nvvm/libdevice/libdevice.compute_20.10.bc" "$(@D)/cuda/nvvm/libdevice/libdevice.compute_20.10.bc" && cp "/usr/local/cuda-8.0/nvvm/libdevice/libdevice.compute_35.10.bc" "$(@D)/cuda/nvvm/libdevice/libdevice.compute_35.10.bc" && cp "/usr/local/cuda-8.0/nvvm/lib64/libnvvm.so.3" "$(@D)/cuda/nvvm/lib64/libnvvm.so.3" && cp "/usr/local/cuda-8.0/nvvm/lib64/libnvvm.so" "$(@D)/cuda/nvvm/lib64/libnvvm.so" && cp "/usr/local/cuda-8.0/nvvm/lib64/libnvvm.so.3.1.0" "$(@D)/cuda/nvvm/lib64/libnvvm.so.3.1.0" && cp "/usr/local/cuda-8.0/nvvm/include/nvvm.h" "$(@D)/cuda/nvvm/include/nvvm.h" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/ptxgen/README.txt" "$(@D)/cuda/nvvm/libnvvm-samples/ptxgen/README.txt" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/ptxgen/ptxgen.c" "$(@D)/cuda/nvvm/libnvvm-samples/ptxgen/ptxgen.c" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/ptxgen/CMakeLists.txt" "$(@D)/cuda/nvvm/libnvvm-samples/ptxgen/CMakeLists.txt" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/build.bat" "$(@D)/cuda/nvvm/libnvvm-samples/build.bat" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/cuda-c-linking/README.txt" "$(@D)/cuda/nvvm/libnvvm-samples/cuda-c-linking/README.txt" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/cuda-c-linking/math-funcs.cu" "$(@D)/cuda/nvvm/libnvvm-samples/cuda-c-linking/math-funcs.cu" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/cuda-c-linking/CMakeLists.txt" "$(@D)/cuda/nvvm/libnvvm-samples/cuda-c-linking/CMakeLists.txt" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/cuda-c-linking/cuda-c-linking.cpp" "$(@D)/cuda/nvvm/libnvvm-samples/cuda-c-linking/cuda-c-linking.cpp" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/README.txt" "$(@D)/cuda/nvvm/libnvvm-samples/README.txt" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/simple/simple.c" "$(@D)/cuda/nvvm/libnvvm-samples/simple/simple.c" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/simple/simple-gpu.ll" "$(@D)/cuda/nvvm/libnvvm-samples/simple/simple-gpu.ll" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/simple/README.txt" "$(@D)/cuda/nvvm/libnvvm-samples/simple/README.txt" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/simple/simple-gpu64.ll" "$(@D)/cuda/nvvm/libnvvm-samples/simple/simple-gpu64.ll" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/simple/CMakeLists.txt" "$(@D)/cuda/nvvm/libnvvm-samples/simple/CMakeLists.txt" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/common/include/DDSWriter.h" "$(@D)/cuda/nvvm/libnvvm-samples/common/include/DDSWriter.h" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/common/include/drvapi_error_string.h" "$(@D)/cuda/nvvm/libnvvm-samples/common/include/drvapi_error_string.h" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/build.sh" "$(@D)/cuda/nvvm/libnvvm-samples/build.sh" && cp "/usr/local/cuda-8.0/nvvm/libnvvm-samples/CMakeLists.txt" "$(@D)/cuda/nvvm/libnvvm-samples/CMakeLists.txt"
+   """,
+)
+
+genrule(
+    name = "cuda-extras",
+    outs = [
+        "cuda/extras/CUPTI/include/cupti_result.h",
+        "cuda/extras/CUPTI/include/cupti_events.h",
+        "cuda/extras/CUPTI/include/openacc/cupti_openacc.h",
+        "cuda/extras/CUPTI/include/cupti_version.h",
+        "cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h",
+        "cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h",
+        "cuda/extras/CUPTI/include/cupti_activity.h",
+        "cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h",
+        "cuda/extras/CUPTI/include/generated_cuda_meta.h",
+        "cuda/extras/CUPTI/include/cupti_nvtx_cbid.h",
+        "cuda/extras/CUPTI/include/cuda_stdint.h",
+        "cuda/extras/CUPTI/include/generated_cudaGL_meta.h",
+        "cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h",
+        "cuda/extras/CUPTI/include/cupti_metrics.h",
+        "cuda/extras/CUPTI/include/cupti_callbacks.h",
+        "cuda/extras/CUPTI/include/cupti_runtime_cbid.h",
+        "cuda/extras/CUPTI/include/cupti.h",
+        "cuda/extras/CUPTI/include/GL/glut.h",
+        "cuda/extras/CUPTI/include/GL/glu.h",
+        "cuda/extras/CUPTI/include/GL/glxext.h",
+        "cuda/extras/CUPTI/include/GL/wglext.h",
+        "cuda/extras/CUPTI/include/GL/glx.h",
+        "cuda/extras/CUPTI/include/GL/glext.h",
+        "cuda/extras/CUPTI/include/GL/wglew.h",
+        "cuda/extras/CUPTI/include/GL/gl.h",
+        "cuda/extras/CUPTI/include/GL/glew.h",
+        "cuda/extras/CUPTI/include/cupti_driver_cbid.h",
+        "cuda/extras/CUPTI/include/generated_nvtx_meta.h",
+    ],
+    cmd = """
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-8.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp "/usr/local/cuda-8.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h"
+   """,
+)
+
+genrule(
+    name = "cuda-lib",
+    outs = [
+        "cuda/lib/libcuda.so",
+        "cuda/lib/libcudart.so.8.0",
+        "cuda/lib/libcudart_static.a",
+        "cuda/lib/libcublas.so.8.0",
+        "cuda/lib/libcusolver.so.8.0",
+        "cuda/lib/libcurand.so.8.0",
+        "cuda/lib/libcufft.so.8.0",
+        "cuda/lib/libcudnn.so.6",
+        "cuda/lib/libcupti.so.8.0",
+    ],
+    cmd = """
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-8.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp "/usr/local/cuda-8.0/targets/x86_64-linux/lib/libcudart.so.8.0.61" "$(@D)/cuda/lib/libcudart.so.8.0" && cp "/usr/local/cuda-8.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp "/usr/local/cuda-8.0/targets/x86_64-linux/lib/libcublas.so.8.0.71" "$(@D)/cuda/lib/libcublas.so.8.0" && cp "/usr/local/cuda-8.0/targets/x86_64-linux/lib/libcusolver.so.8.0.61" "$(@D)/cuda/lib/libcusolver.so.8.0" && cp "/usr/local/cuda-8.0/targets/x86_64-linux/lib/libcurand.so.8.0.61" "$(@D)/cuda/lib/libcurand.so.8.0" && cp "/usr/local/cuda-8.0/targets/x86_64-linux/lib/libcufft.so.8.0.61" "$(@D)/cuda/lib/libcufft.so.8.0" && cp "/usr/lib/x86_64-linux-gnu/libcudnn.so.6.0.21" "$(@D)/cuda/lib/libcudnn.so.6" && cp "/usr/local/cuda-8.0/extras/CUPTI/lib64/libcupti.so.8.0.61" "$(@D)/cuda/lib/libcupti.so.8.0"
+   """,
+)
+
+genrule(
+    name = "cudnn-include",
+    outs = [
+        "cuda/include/cudnn.h",
+    ],
+    cmd = """
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/include/cudnn.h" "$(@D)/cudnn.h"
+   """,
+)
diff --git a/third_party/toolchains/gpus/cuda/build_defs.bzl b/third_party/toolchains/gpus/cuda/build_defs.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..badaf4301934cb6c87cfecbacf0b3bdfff443fe4
--- /dev/null
+++ b/third_party/toolchains/gpus/cuda/build_defs.bzl
@@ -0,0 +1,37 @@
+# Macros for building CUDA code used with Bazel remote
+# execution service.
+# DO NOT EDIT: automatically generated file
+
+def if_cuda(if_true, if_false = []):
+    """Shorthand for select()'ing on whether we're building with CUDA.
+
+    Returns a select statement which evaluates to if_true if we're building
+    with CUDA enabled.  Otherwise, the select statement evaluates to if_false.
+
+    """
+    return select({
+        "@local_config_cuda//cuda:using_nvcc": if_true,
+        "@local_config_cuda//cuda:using_clang": if_true,
+        "//conditions:default": if_false
+    })
+
+
+def cuda_default_copts():
+    """Default options for all CUDA compilations."""
+    return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + ["--cuda-gpu-arch=sm_30"])
+
+
+def cuda_is_configured():
+    """Returns true if CUDA was enabled during the configure process."""
+    return True
+
+def if_cuda_is_configured(x):
+    """Tests if the CUDA was enabled during the configure process.
+
+    Unlike if_cuda(), this does not require that we are building with
+    --config=cuda. Used to allow non-CUDA code to depend on CUDA libraries.
+    """
+    if cuda_is_configured():
+      return x
+    return []
+
diff --git a/tensorflow/tensorboard/components/tf_graph_common/test/layout-test.ts b/third_party/toolchains/gpus/cuda/cuda/cuda_config.h
similarity index 56%
rename from tensorflow/tensorboard/components/tf_graph_common/test/layout-test.ts
rename to third_party/toolchains/gpus/cuda/cuda/cuda_config.h
index b4884413c9d4f0b2e3d61d283736174f6549819b..f6662274cc0a31073adbd9a976a42af93f200cfd 100644
--- a/tensorflow/tensorboard/components/tf_graph_common/test/layout-test.ts
+++ b/third_party/toolchains/gpus/cuda/cuda/cuda_config.h
@@ -1,23 +1,27 @@
 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 
-Licensed under the Apache License, Version 2.0 (the 'License');
+Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
     http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
+distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-suite('layout', () => {
-  let assert = chai.assert;
+// DO NOT EDIT: automatically generated file
+#ifndef CUDA_CUDA_CONFIG_H_
+#define CUDA_CUDA_CONFIG_H_
 
-  test('dagre exists', () => { assert.isTrue(dagre != null); });
+#define TF_CUDA_CAPABILITIES CudaVersion("3.0")
 
-  // TODO(bp): write tests.
+#define TF_CUDA_VERSION "8.0"
+#define TF_CUDNN_VERSION "5"
 
-});
+#define TF_CUDA_TOOLKIT_PATH "/usr/local/cuda-8.0"
+
+#endif  // CUDA_CUDA_CONFIG_H_
diff --git a/third_party/typings.bzl b/third_party/typings.bzl
deleted file mode 100644
index d0c9eddbb3f52803310caed8775840b5af8fbbfa..0000000000000000000000000000000000000000
--- a/third_party/typings.bzl
+++ /dev/null
@@ -1,365 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the 'License');
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# TensorBoard typing dependencies
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
-
-def tensorboard_typings_workspace():
-  filegroup_external(
-      name = "org_definitelytyped",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "b7da645f6e5555feb7aeede73775da0023ce2257df9c8e76c9159266035a9c0d": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/chai/chai.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/chai/chai.d.ts",
-          ],
-          "177293828c7a206bf2a7f725753d51396d38668311aa37c96445f91bbf8128a7": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/6e2f2280ef16ef277049d0ce8583af167d586c59/d3/d3.d.ts",  # v3
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/6e2f2280ef16ef277049d0ce8583af167d586c59/d3/d3.d.ts",  # v3
-          ],
-          "e4cd3d5de0eb3bc7b1063b50d336764a0ac82a658b39b5cf90511f489ffdee60": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/efd40e67ff323f7147651bdbef03c03ead7b1675/lodash/lodash.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/efd40e67ff323f7147651bdbef03c03ead7b1675/lodash/lodash.d.ts",
-          ],
-          "695a03dd2ccb238161d97160b239ab841562710e5c4e42886aefd4ace2ce152e": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/mocha/mocha.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/mocha/mocha.d.ts",
-          ],
-          "513ccd9ee1c708881120eeacd56788fc3b3da8e5c6172b20324cebbe858803fe": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/708609e0764daeb5eb64104af7aca50c520c4e6e/sinon/sinon.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/708609e0764daeb5eb64104af7aca50c520c4e6e/sinon/sinon.d.ts",
-          ],
-          "44eba36339bd1c0792072b7b204ee926fe5ffe1e9e2da916e67ac55548e3668a": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/a872802c0c84ba98ff207d5e673a1fa867c67fd6/polymer/polymer.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/a872802c0c84ba98ff207d5e673a1fa867c67fd6/polymer/polymer.d.ts",
-          ],
-          "9453c3e6bae824e90758c3b38975c1ed77e6abd79bf513bcb08368fcdb14898e": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/f5407eba29c04fb8387c86df27512bd055b195d2/threejs/three.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/f5407eba29c04fb8387c86df27512bd055b195d2/threejs/three.d.ts",
-          ],
-          "691756a6eb455f340c9e834de0d49fff269e7b8c1799c2454465dcd6a4435b80": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/46719185c564694c5583c4b7ad94dbb786ecad46/webcomponents.js/webcomponents.js.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/46719185c564694c5583c4b7ad94dbb786ecad46/webcomponents.js/webcomponents.js.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_array",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "61e7abb7b1f01fbcb0cab8cf39003392f422566209edd681fbd070eaa84ca000": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-array/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-array/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_axis",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "95f75c8dcc89850b2e72581d96a7b5f46ea4ac852f828893f141f14a597421f9": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-axis/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-axis/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_brush",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "a2738e693ce8a8640c2d29001e77582c9c361fd23bda44db471629866b60ada7": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-brush/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-brush/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_chord",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "c54d24756eb6d744b31e538ad9bab3a75f6d54e2288b29cc72338d4a057d3e83": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-chord/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-chord/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_collection",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "f987667167b1d2970911247e325eb1c37ca0823646f81ccec837ae59039822f7": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-collection/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-collection/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_color",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "9580c81f38ddcce7be0ac9bd3d0d083adebc34e17441709f90b9e4dcd1c19a56": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-color/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-color/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_dispatch",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "169f80b4cceca8e2e9ed384d81a5db0624cc01a26451dfb5a7e0cec6ea9cfb06": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-dispatch/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-dispatch/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_drag",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "08d35d139dde58c2722be98d718d01204fd6167d310f09b379e832f3c741489d": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-drag/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-drag/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_dsv",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "62594d00cf9e4bb895339c8e56f64330e202a5eb2a0fa580a1f6e6336f2c93ce": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-dsv/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-dsv/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_ease",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "d1cf8f99b7bf758c2ba3c0a4ce553e151d4d9b4cf45a6e8bd0edec7ce90f725b": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-ease/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-ease/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_force",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "288421e2008668d2076a4684657dd3d29b992832ef02c552981eb94a91042553": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-force/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-force/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_format",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "b42cb17e580c1fd0b64d478f7bd80ca806efaefda24426a833cf1f30a7275bca": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-format/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-format/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_hierarchy",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "a5683f5835d8716c6b89c075235078438cfab5897023ed720bfa492e244e969e": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-hierarchy/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-hierarchy/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_interpolate",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "590a71b741323ac3139b333ec8b743e24717fdd5b32bcff48ee521162a9dfe1c": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-interpolate/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-interpolate/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_path",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "96f35ba041bcaa265e2b373ee675177410d44d31c980e4f7fbeefd4bcba15b00": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-path/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-path/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_polygon",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "ce453451e8105cac6a4f4a4263ca2142ebb4bf442e342f470a81da691f220fcb": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-polygon/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-polygon/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_quadtree",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "238e278f1be5d6985a19800800cffee80f81199f71d848e3bbc288d1791a6f90": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-quadtree/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-quadtree/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_queue",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "e6ae19aad83495475653578de64fb9d6bf9764eda6c84d70f7935ec84bcc482e": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-queue/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-queue/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_random",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "d31b92ed86c23ec0a4776f99fa81ff033c95b96c8304d8aa9baf3b94af779aa8": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-random/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-random/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_request",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "44bb7b07d977028e6567540a3303b06fc9b33fb0960bc75c520e0733c840d89f": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-request/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-request/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_scale",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "02ce7c644ba34bd1abb84da2e832f248b048b6a23812be4365bd837f186c9f1f": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-scale/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-scale/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_selection",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "699043ddb28dfa5e46d87bc6a24cfc6d604237f298259d3fb3c7066e05e8c86e": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-selection/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-selection/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_shape",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "62668a7aaaf6232762b544f9f89c0f557ca7cfb0cd343a358dda7ecbe26f5739": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-shape/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-shape/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_time",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "0502490ce682fd9265fb1d5d693ce6cd82e3b05e5f5ee3433731266ecb03d5fc": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-time/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-time/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_timer",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "6f191f9aea704aa64b1defa40dfdff1447a6e6bb815feff1660f894500a9c94d": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-timer/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-timer/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_transition",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "a0a7c0c9bfb5c7d6d9d22a8d16b4484b66d13f2ed226954037546cb3da4098ba": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-transition/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-transition/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_voronoi",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "c6bd5f229f915151d0ef678fe50b1aa6a62334ea0a8c6fc0effbac9f7032efc7": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-voronoi/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-voronoi/index.d.ts",
-          ],
-      },
-  )
-  
-  filegroup_external(
-      name = "org_definitelytyped_types_d3_zoom",
-      licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "a25dc17fbd304cf7a0e5e7bbb8339c930d464eb40c4d6e5f839ce9c0191f4110": [
-              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-zoom/index.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-zoom/index.d.ts",
-          ],
-      },
-  )
diff --git a/third_party/werkzeug.BUILD b/third_party/werkzeug.BUILD
deleted file mode 100644
index 72a1402030d150c21b5d43261a4d5e2c0f1bce91..0000000000000000000000000000000000000000
--- a/third_party/werkzeug.BUILD
+++ /dev/null
@@ -1,14 +0,0 @@
-# Description:
-#   Werkzeug provides utilities for making WSGI applications
-
-licenses(["notice"])  # BSD 3-Clause
-
-exports_files(["LICENSE"])
-
-# Note: this library includes test code. Consider creating a testonly target.
-py_library(
-    name = "werkzeug",
-    srcs = glob(["werkzeug/*.py"]),
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-)
diff --git a/third_party/zlib.BUILD b/third_party/zlib.BUILD
index 279e6395b03a3c86b4b3fe25958ebafa4cb75062..85096688914a1598ef1d51b71721d860398947cb 100644
--- a/third_party/zlib.BUILD
+++ b/third_party/zlib.BUILD
@@ -2,6 +2,18 @@ package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # BSD/MIT-like license (for zlib)
 
+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "windows_msvc",
+    values = {"cpu": "x64_windows_msvc"},
+    visibility = ["//visibility:public"],
+)
+
 cc_library(
     name = "zlib",
     srcs = [
@@ -32,9 +44,13 @@ cc_library(
         "zutil.h",
     ],
     hdrs = ["zlib.h"],
-    copts = [
-        "-Wno-shift-negative-value",
-        "-Wno-implicit-function-declaration",
-    ],
+    copts = select({
+        ":windows": [],
+        ":windows_msvc": [],
+        "//conditions:default": [
+            "-Wno-shift-negative-value",
+            "-Wno-implicit-function-declaration",
+        ],
+    }),
     includes = ["."],
 )
diff --git a/tools/bazel.rc b/tools/bazel.rc
index e67a290cf40ca7f688dfdb03210786c8c85abe48..414ddf2e475da051cad4a4534a3a0ca955229997 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -11,6 +11,9 @@ build:mkl --define=using_mkl=true
 build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain
 build:sycl --define=using_sycl=true
 
+build:sycl_nodouble --crosstool_top=@local_config_sycl//crosstool:toolchain
+build:sycl_nodouble --define=using_sycl=true --cxxopt -DTENSORFLOW_SYCL_NO_DOUBLE
+
 build:sycl_asan --crosstool_top=@local_config_sycl//crosstool:toolchain
 build:sycl_asan --define=using_sycl=true --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -fsanitize=address